diff --git a/.clang-format b/.clang-format
new file mode 100644
index 0000000000..d4a27a7e44
--- /dev/null
+++ b/.clang-format
@@ -0,0 +1,126 @@
+---
+# General options
+Language: Cpp
+Standard: c++17
+DisableFormat: false
+
+AccessModifierOffset: -4
+AlignAfterOpenBracket: AlwaysBreak
+AlignConsecutiveAssignments: false
+AlignConsecutiveDeclarations: false
+AlignConsecutiveMacros: false
+AlignEscapedNewlines: Right
+AlignOperands: false
+AlignTrailingComments: false
+AllowAllArgumentsOnNextLine: false
+AllowAllConstructorInitializersOnNextLine: false
+AllowAllParametersOfDeclarationOnNextLine: false
+AllowShortBlocksOnASingleLine: Never
+AllowShortCaseLabelsOnASingleLine: false
+AllowShortFunctionsOnASingleLine: None
+AllowShortIfStatementsOnASingleLine: Never
+AllowShortLambdasOnASingleLine: All
+AllowShortLoopsOnASingleLine: false
+AlwaysBreakAfterReturnType: None
+AlwaysBreakBeforeMultilineStrings: false
+AlwaysBreakTemplateDeclarations: Yes
+BinPackArguments: false
+BinPackParameters: false
+BreakBeforeBinaryOperators: All
+BreakBeforeBraces: Allman
+BreakBeforeTernaryOperators: true
+BreakConstructorInitializers: BeforeComma
+BreakInheritanceList: BeforeComma
+BreakStringLiterals: true
+ColumnLimit: 119
+CommentPragmas:  '^ COMMENT pragma:'
+CompactNamespaces: false
+ConstructorInitializerAllOnOneLineOrOnePerLine: true
+ConstructorInitializerIndentWidth: 4
+ContinuationIndentWidth: 4
+Cpp11BracedListStyle: true
+DeriveLineEnding: true
+DerivePointerAlignment: false
+ExperimentalAutoDetectBinPacking: false
+FixNamespaceComments: true
+IncludeBlocks: Regroup  # include groups are defined by IncludeCategories below
+IncludeIsMainRegex: '(Test)?$'
+IncludeIsMainSourceRegex: ''
+IndentCaseLabels: false
+IndentGotoLabels: true
+IndentPPDirectives: AfterHash
+IndentWidth: 4
+IndentWrappedFunctionNames: false
+KeepEmptyLinesAtTheStartOfBlocks: false
+MacroBlockBegin: ''
+MacroBlockEnd:   ''
+MaxEmptyLinesToKeep: 2
+NamespaceIndentation: All
+PenaltyBreakAssignment: 2
+PenaltyBreakBeforeFirstCallParameter: 19
+PenaltyBreakComment: 300
+PenaltyBreakFirstLessLess: 120
+PenaltyBreakString: 1000
+PenaltyBreakTemplateDeclaration: 10
+PenaltyExcessCharacter: 1000000  # by far the largest penalty in this file
+PenaltyReturnTypeOnItsOwnLine: 1000
+PointerAlignment: Left
+ReflowComments: true
+SortIncludes: false  # NOTE(review): confirm Regroup/IncludeCategories still take effect with sorting off
+SortUsingDeclarations: true
+SpaceAfterCStyleCast: true
+SpaceAfterLogicalNot: false
+SpaceAfterTemplateKeyword: false
+SpaceBeforeAssignmentOperators: true
+SpaceBeforeCpp11BracedList: false
+SpaceBeforeCtorInitializerColon: true
+SpaceBeforeInheritanceColon: true
+SpaceBeforeParens: Never
+SpaceBeforeRangeBasedForLoopColon: true
+SpaceInEmptyBlock: false
+SpaceInEmptyParentheses: false
+SpacesBeforeTrailingComments: 1
+SpacesInAngles:  false
+SpacesInConditionalStatement: false
+SpacesInContainerLiterals: false
+SpacesInCStyleCastParentheses: false
+SpacesInParentheses: false
+SpacesInSquareBrackets: false
+SpaceBeforeSquareBrackets: false
+TabWidth: 4
+UseCRLF: false
+UseTab: Never
+
+# Project specific options
+IncludeCategories:
+  # Local headers (in "") above all else
+  - Regex: '"([A-Za-z0-9.\/_-])+"'
+    Priority: 1
+  # <pmacc/foo.hpp>
+  - Regex: '<pmacc/([A-Za-z0-9.\/_-])+>'
+    Priority: 2
+  # <cupla/foo.hpp>
+  - Regex: '<cupla/([A-Za-z0-9.\/_-])+>'
+    Priority: 3
+  # <alpaka/foo.hpp>
+  - Regex: '<alpaka/([A-Za-z0-9.\/_-])+>'
+    Priority: 4
+  # <boost/foo.hpp>
+  - Regex: '<boost/([A-Za-z0-9.\/_-])+>'
+    Priority: 5
+  # C++ standard library headers (no '.' in the name) follow the project and boost groups
+  - Regex: '<([A-Za-z0-9\/_-])+>'
+    Priority: 6
+  # Includes that made it this far are third-party headers and will be placed
+  # in the last group (note: '-' is kept last in each class so it stays literal)
+  - Regex: '<([A-Za-z0-9.\/_-])+>'
+    Priority: 7
+
+# Future options - not supported in clang-format 11
+# AlignConsecutiveBitFields: false
+# AllowShortEnumsOnASingleLine: false
+# BitFieldColonSpacing: Both
+# IndentCaseBlocks: true
+# IndentExternBlock: AfterExternBlock
+# OperandAlignmentStyle: Align
+...
diff --git a/.gitignore b/.gitignore
index 0bc62f425b..3d9c18fcf4 100644
--- a/.gitignore
+++ b/.gitignore
@@ -11,6 +11,9 @@
 # Visual Studio Code configuration files
 .vscode
 
+# JetBrains project files
+.idea/
+
 # python byte code
 *.pyc
 
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
new file mode 100644
index 0000000000..7f8cb77a21
--- /dev/null
+++ b/.gitlab-ci.yml
@@ -0,0 +1,69 @@
+
+stages:  # every job in this file assigns itself to one of these stage names
+  - validate
+  - generate
+  - compile
+
+.base_generate-reduced-matrix:  # template job, pulled in via `extends` by generate-reduced-matrix
+  stage: generate
+  script:
+    - apt update
+    - apt install -y python3-pip
+    - pip3 install allpairspy
+    - $CI_PROJECT_DIR/share/ci/git_merge.sh
+    - $CI_PROJECT_DIR/share/ci/generate_reduced_matrix.sh -n ${TEST_TUPLE_NUM_ELEM} > compile.yml
+    - cat compile.yml
+  artifacts:
+    paths:
+      - compile.yml  # consumed as the pipeline definition by compile-reduced-matrix
+
+# pull request validation:
+#   - check PR destination
+#   - check python code style: flake8, pyflakes
+#   - rebase the PR to the destination branch
+#   - check C++ code style
+pull-request-validation:
+  stage: validate
+  image: ubuntu:focal
+  script:
+    - apt update
+    - apt install -y -q git curl wget python3 python3-pip
+    # Check the PR destination branch first, then run git_merge.sh against it
+    - $CI_PROJECT_DIR/test/correctBranchPR
+    - source $CI_PROJECT_DIR/share/ci/git_merge.sh
+    - pip3 install -U flake8 pyflakes
+    # Test Python Files for PEP8 conformance
+    - flake8 --exclude=thirdParty .
+    # Warnings, unused code, etc.
+    - pyflakes .
+    # Install clang-format-11 from the apt.llvm.org repository for focal
+    - apt install -y -q gnupg2
+    - wget -O - https://apt.llvm.org/llvm-snapshot.gpg.key | apt-key add -
+    - echo "deb http://apt.llvm.org/focal/ llvm-toolchain-focal-11 main" | tee -a /etc/apt/sources.list
+    - apt update
+    - apt install -y clang-format-11
+    # Check C++ code style
+    - source $CI_PROJECT_DIR/share/ci/check_cpp_code_style.sh
+  tags:
+    - x86_64
+
+# generate reduced test matrix
+# required variables (space separated lists):
+#   PIC_INPUTS - path to examples relative to share/picongpu
+#                e.g.
+#                    "examples" starts one gitlab job per directory in `examples/*`
+#                    "examples/" compile all directories in `examples/*` within one gitlab job
+#                    "examples/KelvinHelmholtz" compile all cases within one gitlab job
+generate-reduced-matrix:
+  variables:
+    PIC_INPUTS: "examples tests benchmarks"
+    TEST_TUPLE_NUM_ELEM: 1  # passed as `-n` to generate_reduced_matrix.sh by the base job
+  extends: ".base_generate-reduced-matrix"
+
+compile-reduced-matrix:
+  stage: compile
+  trigger:
+    include:
+      - artifact: compile.yml  # written by generate_reduced_matrix.sh in the generate stage
+        job: generate-reduced-matrix
+    strategy: depend
diff --git a/.travis.yml b/.travis.yml
deleted file mode 100644
index a9545f476a..0000000000
--- a/.travis.yml
+++ /dev/null
@@ -1,180 +0,0 @@
-language: none
-sudo: false
-dist: trusty
-
-cache:
-  apt: true
-  directories:
-    - $HOME/.cache/spack
-    - $HOME/.cache/cmake-3.11.4
-  pip: true
-
-addons:
-  apt:
-    sources:
-      - ubuntu-toolchain-r-test
-    packages:
-      - g++-4.9
-      - gfortran-4.9  # spack OpenMPI dependency
-      - environment-modules
-      - openmpi-bin
-      - libopenmpi-dev
-      # clang 5.0.0 is pre-installed
-      # - clang-tidy-3.9
-
-env:
-  global:
-    - SPACK_ROOT: $HOME/.cache/spack
-    - PATH: $PATH:$HOME/.cache/spack/bin
-    - CXXFLAGS: "-std=c++11"
-
-install:
-  #############################################################################
-  # PMacc CPU-only dependencies                                               #
-  #############################################################################
-  - SPACK_FOUND=$(which spack >/dev/null && { echo 0; } || { echo 1; })
-  - if [ $SPACK_FOUND -ne 0 ]; then
-      mkdir -p $SPACK_ROOT &&
-      git clone --depth 50 https://github.com/spack/spack.git $SPACK_ROOT &&
-      echo -e "config:""\n  build_jobs:"" 2" > $SPACK_ROOT/etc/spack/config.yaml &&
-      echo -e "packages:""\n  cmake:""\n    version:"" [3.11.4]""\n    paths:""\n      cmake@3.11.4:"" /home/travis/.cache/cmake-3.11.4""\n    buildable:"" False" > $SPACK_ROOT/etc/spack/packages.yaml;
-    fi
-  - spack compiler add
-  # required dependencies - CMake 3.11.4
-  - if [ "$TRAVIS_OS_NAME" == "linux" ]; then
-      if [ ! -f $HOME/.cache/cmake-3.11.4/bin/cmake ]; then
-        wget -O cmake.sh https://cmake.org/files/v3.11/cmake-3.11.4-Linux-x86_64.sh &&
-        sh cmake.sh --skip-license --exclude-subdir --prefix=$HOME/.cache/cmake-3.11.4 &&
-        rm cmake.sh;
-      fi;
-    elif [ "$TRAVIS_OS_NAME" == "osx" ]; then
-      if [ ! -d /Applications/CMake.app/Contents/ ]; then
-        curl -L -s -o cmake.dmg https://cmake.org/files/v3.11/cmake-3.11.4-Darwin-x86_64.dmg &&
-        yes | hdiutil mount cmake.dmg &&
-        sudo cp -R "/Volumes/cmake-3.11.4-Darwin-x86_64/CMake.app" /Applications &&
-        hdiutil detach /dev/disk1s1 &&
-        rm cmake.dmg;
-      fi;
-    fi
-  - travis_wait spack install
-      cmake
-      $COMPILERSPEC
-  # required dependencies - Boost 1.65.1
-  - travis_wait spack install
-      boost@1.65.1~date_time~graph~iostreams~locale~log~random~thread~timer~wave
-      $COMPILERSPEC
-  - spack clean -a
-  - source /etc/profile &&
-    source $SPACK_ROOT/share/spack/setup-env.sh
-  - spack load cmake
-  - spack load boost $COMPILERSPEC
-
-jobs:
-  fast_finish: true
-  include:
-    - stage: 'Target Branch'
-      install: skip
-      script:
-        #############################################################################
-        # Disallow PRs to `ComputationalRadiationPhysics/picongpu` branch `master`  #
-        # if not an other mainline branch such as `dev` or `release-...`            #
-        #############################################################################
-        - . test/correctBranchPR
-    - &style-python
-      stage: 'Style'
-      language: python
-      python: "2.7"
-      install: pip install -U flake8
-      script:
-        #############################################################################
-        # Test Python Files for PEP8 conformance                                    #
-        #############################################################################
-        - flake8 --exclude=thirdParty .
-    - <<: *style-python
-      python: "3.6"
-    - install: skip
-      language: cpp
-      script:
-        #############################################################################
-        # Conformance with Alpaka: Do not write __global__ CUDA kernels directly    #
-        #############################################################################
-        - test/hasCudaGlobalKeyword include/pmacc
-        - test/hasCudaGlobalKeyword share/pmacc/examples
-        - test/hasCudaGlobalKeyword include/picongpu
-        - test/hasCudaGlobalKeyword share/picongpu/examples
-
-        #############################################################################
-        # Disallow end-of-line (EOL) white spaces                                   #
-        #############################################################################
-        - test/hasEOLwhiteSpace
-
-        #############################################################################
-        # Disallow TABs, use white spaces                                           #
-        #############################################################################
-        - test/hasTabs
-
-        #############################################################################
-        # Disallow non-ASCII in source files and scripts                            #
-        #############################################################################
-        - test/hasNonASCII
-
-        #############################################################################
-        # Disallow spaces before pre-compiler macros                                #
-        #############################################################################
-        - test/hasSpaceBeforePrecompiler
-
-        #############################################################################
-        # Enforce angle brackets <...> for includes of external library files       #
-        #############################################################################
-        - test/hasExtLibIncludeBrackets include boost
-        - test/hasExtLibIncludeBrackets include alpaka
-        - test/hasExtLibIncludeBrackets include cupla
-        - test/hasExtLibIncludeBrackets include splash
-        - test/hasExtLibIncludeBrackets include mallocMC
-        - test/hasExtLibIncludeBrackets include/picongpu pmacc
-        - test/hasExtLibIncludeBrackets share/picongpu/examples pmacc
-        - test/hasExtLibIncludeBrackets share/picongpu/examples boost
-        - test/hasExtLibIncludeBrackets share/picongpu/examples alpaka
-        - test/hasExtLibIncludeBrackets share/picongpu/examples cupla
-        - test/hasExtLibIncludeBrackets share/picongpu/examples splash
-        - test/hasExtLibIncludeBrackets share/picongpu/examples mallocMC
-        - test/hasExtLibIncludeBrackets share/pmacc/examples pmacc
-    - &static-code-python
-      stage: 'Static Code Analysis'
-      language: python
-      python: "2.7"
-      install: pip install -U pyflakes
-      script:
-        #############################################################################
-        # Warnings, unused code, etc.                                               #
-        #############################################################################
-        - pyflakes .
-    - <<: *static-code-python
-      python: "3.6"
-    - &test-cpp-unit
-      stage: 'C++ Unit Tests'
-      language: cpp
-      env: [ COMPILERSPEC='%gcc@4.9.4' ]
-      before_install:
-        - export CXX=g++-4.9
-        - export CC=gcc-4.9
-        - export FC=gfortran-4.9
-      script:
-        - $CXX --version
-        - $CC --version
-        - $FC --version
-        #############################################################################
-        # PMacc CPU-only tests                                                      #
-        #############################################################################
-        - mkdir -p $HOME/build
-        - cd $HOME/build
-        - cmake $TRAVIS_BUILD_DIR/include/pmacc
-                -DALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE=ON
-        - make -j 2
-        # - make test  # reduce memory and RT costs first
-    - <<: *test-cpp-unit
-      env: [ COMPILERSPEC='%clang@5.0.0' ]
-      before_install:
-        - export CXX=clang++
-        - export CC=clang
-        - export FC=gfortran-4.9
diff --git a/INSTALL.rst b/INSTALL.rst
index 58e9b5f200..1b9f583c23 100644
--- a/INSTALL.rst
+++ b/INSTALL.rst
@@ -31,27 +31,28 @@ Mandatory
 
 gcc
 """
-- 4.9 - 7 (if you want to build for Nvidia GPUs, supported compilers depend on your current `CUDA version <https://gist.github.com/ax3l/9489132>`_)
+- 5.5 - 10.0 (if you want to build for Nvidia GPUs, supported compilers depend on your current `CUDA version <https://gist.github.com/ax3l/9489132>`_)
 
-  - CUDA 9.2 - 10.0: Use gcc 4.9 - 7
-  - CUDA 10.1/10.2: Use gcc 4.9 - 8
+  - CUDA 9.2 - 10.0: Use gcc 5.5 - 7
+  - CUDA 10.1/10.2: Use gcc 5.5 - 8
+  - CUDA 11.x: Use gcc 5.5 - 10.0
 - *note:* be sure to build all libraries/dependencies with the *same* gcc version; GCC 5 or newer is recommended
 - *Debian/Ubuntu:*
   
-  - ``sudo apt-get install gcc-5.3 g++-5.3 build-essential``
-  - ``sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-5.3 60 --slave /usr/bin/g++ g++ /usr/bin/g++-5.3``
+  - ``sudo apt-get install gcc-5 g++-5 build-essential``
+  - ``sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-5 60 --slave /usr/bin/g++ g++ /usr/bin/g++-5``
 - *Arch Linux:*
   
   - ``sudo pacman --sync base-devel``
   - if the installed version of **gcc** is too new, `compile an older gcc <https://gist.github.com/slizzered/a9dc4e13cb1c7fffec53>`_
 - *Spack:*
   
-  - ``spack install gcc@5.3.0``
-  - make it the default in your `packages.yaml <http://spack.readthedocs.io/en/latest/getting_started.html#compiler-configuration>`_ or *suffix* `all following <http://spack.readthedocs.io/en/latest/features.html#simple-package-installation>`_ ``spack install`` commands with a *space* and ``%gcc@5.3.0``
+  - ``spack install gcc@5.5.0``
+  - make it the default in your `packages.yaml <http://spack.readthedocs.io/en/latest/getting_started.html#compiler-configuration>`_ or *suffix* `all following <http://spack.readthedocs.io/en/latest/features.html#simple-package-installation>`_ ``spack install`` commands with a *space* and ``%gcc@5.5.0``
 
 CMake
 """""
-- 3.11.4 or higher
+- 3.15.0 or higher
 - *Debian/Ubuntu:* ``sudo apt-get install cmake file cmake-curses-gui``
 - *Arch Linux:* ``sudo pacman --sync cmake``
 - *Spack:* ``spack install cmake``
@@ -325,6 +326,38 @@ ADIOS
   - ``export ADIOS_ROOT=$HOME/lib/adios``
   - ``export LD_LIBRARY_PATH=$ADIOS_ROOT/lib:$LD_LIBRARY_PATH``
 
+openPMD API
+"""""""""""
+- 0.12.0+ (bare minimum) / 0.13.0+ (for streaming IO)
+- *Spack*: ``spack install openpmd-api``
+- For usage in PIConGPU, the openPMD API must have been built either with support for ADIOS2 or HDF5 (or both).
+  When building the openPMD API from source (described below), these dependencies must be built and installed first.
+
+  - For ADIOS2, CMake build instructions can be found in the `official documentation <https://adios2.readthedocs.io/en/latest/setting_up/setting_up.html>`_.
+    The default configuration should generally be sufficient, the ``CMAKE_INSTALL_PREFIX`` should be set to a fitting location.
+  - For HDF5, CMake build instructions can be found in the `official documentation <https://support.hdfgroup.org/HDF5/release/cmakebuild.html>`_.
+    The parameters ``-DHDF5_BUILD_CPP_LIB=OFF -DHDF5_ENABLE_PARALLEL=ON`` are required, the ``CMAKE_INSTALL_PREFIX`` should be set to a fitting location.
+- *from source:*
+
+  - ``mkdir -p ~/src ~/lib``
+  - ``cd ~/src``
+  - ``git clone https://github.com/openPMD/openPMD-api.git``
+  - ``cd openPMD-api``
+  - ``mkdir build && cd build``
+  - ``cmake .. -DopenPMD_USE_MPI=ON -DCMAKE_INSTALL_PREFIX=~/lib/openPMD-api``
+    Optionally, specify the parameters ``-DopenPMD_USE_ADIOS2=ON -DopenPMD_USE_HDF5=ON``. Otherwise, these parameters are set to ``ON`` automatically if CMake detects the dependencies on your system.
+  - ``make -j $(nproc) install``
+- *environment:* (assumes install from source in ``$HOME/lib/openPMD-api``)
+
+  - ``export CMAKE_PREFIX_PATH="$HOME/lib/openPMD-api:$CMAKE_PREFIX_PATH"``
+  - ``export LD_LIBRARY_PATH="$HOME/lib/openPMD-api/lib:$LD_LIBRARY_PATH"``
+- If PIConGPU is built with openPMD output enabled, the JSON library
+  nlohmann_json will automatically be used, found in the ``thirdParty/``
+  directory.
+  By setting the CMake parameter ``PIC_nlohmann_json_PROVIDER=extern``, CMake
+  can be instructed to search for an installation of nlohmann_json externally.
+  Refer to LICENSE.md for further information.
+
 ISAAC
 """""
 - 1.4.0+
diff --git a/LICENSE.md b/LICENSE.md
index 188597079f..589c477222 100644
--- a/LICENSE.md
+++ b/LICENSE.md
@@ -59,14 +59,6 @@ of an easier install of `PIConGPU`. Contributions to these parts of the
 repository should *not* be made in the `thirdParty/` directory but in
 *their according repositories* (that we import).
 
- - `thirdParty/alpaka`:
-   alpaka is a header-only C++11 abstraction library for parallel
-   kernel development on accelerator hardware. It provides a single-source,
-   performance portable programming model for PIConGPU and PMacc.
-   Please visit
-     https://github.com/ComputationalRadiationPhysics/alpaka
-   for further details and contributions.
-
  - `thirdParty/mallocMC`:
    mallocMC is a fast memory allocator for many core accelerators and was
    originally forked from the `ScatterAlloc` project.
@@ -100,3 +92,11 @@ repository should *not* be made in the `thirdParty/` directory but in
    Please visit
      https://github.com/ComputationalRadiationPhysics/cupla
    for further details and contributions.
+
+ - `thirdParty/nlohmann_json`:
+   nlohmann_json is a modern C++ library for working with JSON data, developed
+   by Niels Lohmann, published under the MIT License.
+   Please refer to the file `thirdParty/nlohmann_json/LICENSE.MIT` for license
+   information.
+   Please visit https://github.com/nlohmann/json for further details
+   and contributions.
diff --git a/README.md b/README.md
index 2346f3f5b5..667cd9360e 100644
--- a/README.md
+++ b/README.md
@@ -1,12 +1,12 @@
 PIConGPU - Particle-in-Cell Simulations for the Exascale Era
 ============================================================
 
-[![Code Status master](https://img.shields.io/travis/ComputationalRadiationPhysics/picongpu/master.svg?label=master)](https://travis-ci.org/ComputationalRadiationPhysics/picongpu/branches)
-[![Code Status dev](https://img.shields.io/travis/ComputationalRadiationPhysics/picongpu/dev.svg?label=dev)](https://travis-ci.org/ComputationalRadiationPhysics/picongpu/branches)
+[![Code Status master](https://gitlab.com/hzdr/crp/picongpu/badges/master/pipeline.svg?key_text=master)](https://gitlab.com/hzdr/crp/picongpu/pipelines/master/latest)
+[![Code Status dev](https://gitlab.com/hzdr/crp/picongpu/badges/dev/pipeline.svg?key_text=dev)](https://gitlab.com/hzdr/crp/picongpu/pipelines/dev/latest)
 [![Documentation Status](https://readthedocs.org/projects/picongpu/badge/?version=latest)](http://picongpu.readthedocs.io)
 [![Doxygen](https://img.shields.io/badge/API-Doxygen-blue.svg)](http://computationalradiationphysics.github.io/picongpu)
 [![GitHub commits since last release](https://img.shields.io/github/commits-since/ComputationalRadiationPhysics/picongpu/latest/dev.svg)](https://github.com/ComputationalRadiationPhysics/picongpu/compare/master...dev)
-[![Language](https://img.shields.io/badge/language-C%2B%2B11-orange.svg)](https://isocpp.org/)
+[![Language](https://img.shields.io/badge/language-C%2B%2B14-orange.svg)](https://isocpp.org/)
 [![License PIConGPU](https://img.shields.io/badge/license-GPLv3-blue.svg?label=PIConGPU)](https://www.gnu.org/licenses/gpl-3.0.html)
 [![License PMacc](https://img.shields.io/badge/license-LGPLv3-blue.svg?label=PMacc)](https://www.gnu.org/licenses/lgpl-3.0.html)
 
@@ -38,7 +38,7 @@ Its features for the electro-magnetic PIC algorithm include:
   [*Esirkepov*](http://dx.doi.org/10.1016/S0010-4655%2800%2900228-9)
   and *ZigZag*
 - macro-particle form factors ranging from NGP (0th order), CIC (1st),
-  TSC (2nd), PSQ (3rd) to P4S (4th)
+  TSC (2nd), PQS (3rd) to PCS (4th)
 
 and the electro-magnetic PIC algorithm is further self-consistently coupled to:
 - classical radiation reaction
diff --git a/USAGE.rst b/USAGE.rst
index 11564e857d..b1d3376fc2 100644
--- a/USAGE.rst
+++ b/USAGE.rst
@@ -2,7 +2,14 @@
 
 .. seealso::
 
-   You need to have an :ref:`environment loaded <install-profile>` (``source $HOME/picongpu.profile``) that provides all :ref:`PIConGPU dependencies <install-dependencies>` to complete this chapter.
+   You need to have an :ref:`environment loaded <install-profile>` (``source $HOME/picongpu.profile`` when installing from source or ``spack load picongpu`` when using spack) that provides all :ref:`PIConGPU dependencies <install-dependencies>` to complete this chapter.
+
+.. warning::
+
+   PIConGPU source code is portable and can be compiled on all major operating systems.
+   However, helper tools like ``pic-create`` and ``pic-build`` described in this section rely on Linux utilities and thus are not expected to work on other platforms out-of-the-box.
+   Note that building and using PIConGPU on other operating systems is still possible but has to be done manually or with custom tools.
+   This case is not covered in the documentation, but we can assist users with it when needed.
 
 Basics
 ======
@@ -101,16 +108,16 @@ tbg
 The ``tbg`` tool is explained in detail :ref:`in its own section <usage-tbg>`.
 Its primary purpose is to abstract the options in runtime ``.cfg`` files from the technical details on how to run on various supercomputers.
 
-For example, if you want to run on the HPC System `"Hypnos" at HZDR <https://www.hzdr.de/db/Cms?pOid=12231>`_, your ``tbg`` submit command would just change to:
+For example, if you want to run on the HPC System `"Hemera" at HZDR <https://www.hzdr.de/db/Cms?pOid=12231>`_, your ``tbg`` submit command would just change to:
 
 .. code-block:: bash
    :emphasize-lines: 2
 
    # request 1 GPU from the PBS batch system and run on the queue "k20"
-   tbg -s qsub -c etc/picongpu/1.cfg -t etc/picongpu/hypnos-hzdr/k20.tpl $SCRATCH/runs/lwfa_002
+   tbg -s sbatch -c etc/picongpu/1.cfg -t etc/picongpu/hemera-hzdr/k20.tpl $SCRATCH/runs/lwfa_002
 
    # run again, this time on 16 GPUs
-   tbg -s qsub -c etc/picongpu/16.cfg -t etc/picongpu/hypnos-hzdr/k20.tpl $SCRATCH/runs/lwfa_003
+   tbg -s sbatch -c etc/picongpu/16.cfg -t etc/picongpu/hemera-hzdr/k20.tpl $SCRATCH/runs/lwfa_003
 
 Note that we can use the same ``1.cfg`` file, your input set is *portable*.
 
diff --git a/bin/cuda_memtest.sh b/bin/cuda_memtest.sh
index 6f432dc192..be022f4551 100755
--- a/bin/cuda_memtest.sh
+++ b/bin/cuda_memtest.sh
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 #
-# Copyright 2013-2020 Rene Widera
+# Copyright 2013-2021 Rene Widera
 #
 # This file is part of PIConGPU.
 #
diff --git a/bin/egetopt b/bin/egetopt
index 57f76f1279..f421563c0a 100755
--- a/bin/egetopt
+++ b/bin/egetopt
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 #
-# Copyright 2014-2020 Rene Widera
+# Copyright 2014-2021 Rene Widera
 #
 # This file is part of PIConGPU.
 #
diff --git a/bin/pic-build b/bin/pic-build
index 84d42c3771..bfde3856d0 100755
--- a/bin/pic-build
+++ b/bin/pic-build
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 #
-# Copyright 2017-2020 Axel Huebl
+# Copyright 2017-2021 Axel Huebl
 #
 # This file is part of PIConGPU.
 #
diff --git a/bin/pic-compile b/bin/pic-compile
index ce4370a9da..0090006cc7 100755
--- a/bin/pic-compile
+++ b/bin/pic-compile
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 #
-# Copyright 2013-2020 Axel Huebl, Rene Widera
+# Copyright 2013-2021 Axel Huebl, Rene Widera
 #
 # This file is part of PIConGPU.
 #
@@ -131,7 +131,7 @@ do
             $picongpu_prefix/buildsystem/CompileSuite/compileSet.sh \
                 "$example_name" "$testFlagNr" "$globalCMakeOptions" \
                 "$tmpRun_path" "$buildDir" "$examples_path" \
-                "$quiet_run" | tee $buildDir"/compile.log" || exit $?
+                "$quiet_run"  &> $buildDir"/compile.log"
         fi
 
         testFlagNr=$(( testFlagNr + 1 ))
diff --git a/bin/pic-configure b/bin/pic-configure
index ae8a33497c..00d14b0174 100755
--- a/bin/pic-configure
+++ b/bin/pic-configure
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 #
-# Copyright 2013-2020 Axel Huebl, Rene Widera
+# Copyright 2013-2021 Axel Huebl, Rene Widera
 #
 # This file is part of PIConGPU.
 #
@@ -39,8 +39,8 @@ help()
     echo "                       (default is <inputDirectory>)"
     echo "-b | --backend       - set compute backend and optionally the architecture"
     echo "                       syntax: backend[:architecture]"
-    echo "                       supported backends: cuda, omp2b, serial, tbb, threads"
-    echo "                       (e.g.: \"cuda:20;35;37;52;60\" or \"omp2b:native\" or \"omp2b\")"
+    echo "                       supported backends: cuda, hip, omp2b, serial, tbb, threads"
+    echo "                       (e.g.: \"cuda:35;37;52;60\" or \"omp2b:native\" or \"omp2b\")"
     echo "                       default: \"cuda\" if not set via environment variable PIC_BACKEND"
     echo "                       note: architecture names are compiler dependent"
     echo "-c | --cmake         - overwrite options for cmake"
@@ -64,25 +64,30 @@ get_backend_flags()
             result+=" -DALPAKA_CUDA_ARCH=\"${backend_cfg[1]}\""
         fi
     elif [ "${backend_cfg[0]}" == "omp2b" ] ; then
-        result+=" -DALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE=ON"
+        result+=" -DALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE=ON -DCUPLA_STREAM_ASYNC_ENABLE=OFF"
         if [ $num_options -eq 2 ] ; then
             result+=" -DPMACC_CPU_ARCH=\"${backend_cfg[1]}\""
         fi
     elif [ "${backend_cfg[0]}" == "serial" ] ; then
-        result+=" -DALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLE=ON"
+        result+=" -DALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLE=ON -DCUPLA_STREAM_ASYNC_ENABLE=OFF"
         if [ $num_options -eq 2 ] ; then
             result+=" -DPMACC_CPU_ARCH=\"${backend_cfg[1]}\""
         fi
     elif [ "${backend_cfg[0]}" == "tbb" ] ; then
-        result+=" -DALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLE=ON"
+        result+=" -DALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLE=ON -DCUPLA_STREAM_ASYNC_ENABLE=OFF"
         if [ $num_options -eq 2 ] ; then
             result+=" -DPMACC_CPU_ARCH=\"${backend_cfg[1]}\""
         fi
     elif [ "${backend_cfg[0]}" == "threads" ] ; then
-        result+=" -DALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLE=ON"
+        result+=" -DALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLE=ON -DCUPLA_STREAM_ASYNC_ENABLE=OFF"
         if [ $num_options -eq 2 ] ; then
             result+=" -DPMACC_CPU_ARCH=\"${backend_cfg[1]}\""
         fi
+    elif [ "${backend_cfg[0]}" == "hip" ] ; then
+        result+=" -DALPAKA_ACC_GPU_HIP_ENABLE=ON -DALPAKA_ACC_GPU_HIP_ONLY_MODE=ON"
+        if [ $num_options -eq 2 ] ; then
+            result+=" -DALPAKA_HIP_ARCH=\"${backend_cfg[1]}\""
+        fi
     else
         echo "unsupported backend given '$1'" >&2
         exit 1
diff --git a/bin/pic-create b/bin/pic-create
index f700107975..42cf13a0b5 100755
--- a/bin/pic-create
+++ b/bin/pic-create
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 #
-# Copyright 2013-2020 Axel Huebl, Rene Widera
+# Copyright 2013-2021 Axel Huebl, Rene Widera
 #
 # This file is part of PIConGPU.
 #
@@ -36,7 +36,7 @@ help()
     echo ""
     echo "usage: pic-create [OPTION] [src_dir] dest_dir"
     echo "If no src_dir is set picongpu a default case is cloned"
-    echo "If src_dir is not in the currrent directory, pic-create will"
+    echo "If src_dir is not in the current directory, pic-create will"
     echo 'look for it in $PIC_EXAMPLES'
     echo ""
     echo "-f | --force         - merge data if destination already exists"
@@ -73,7 +73,7 @@ done
 cmake_path="$*"
 
 if [ $# -eq 0 ] || [ $# -gt 2 ] ; then
-    echo "Missing destination directory or to many directories were given." >&2
+    echo "Missing destination directory or too many directories were given." >&2
     exit
 fi
 
diff --git a/bin/pic-edit b/bin/pic-edit
index 2eedc5b466..c35fe78dac 100755
--- a/bin/pic-edit
+++ b/bin/pic-edit
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 #
-# Copyright 2017-2020 Axel Huebl
+# Copyright 2017-2021 Axel Huebl
 #
 # This file is part of PIConGPU.
 #
diff --git a/bin/tbg b/bin/tbg
index c4a51db88f..4c061f57d6 100755
--- a/bin/tbg
+++ b/bin/tbg
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 #
-# Copyright 2013-2020 Axel Huebl, Rene Widera, Richard Pausch
+# Copyright 2013-2021 Axel Huebl, Rene Widera, Richard Pausch
 #
 # This file is part of PIConGPU.
 #
@@ -272,7 +272,12 @@ help()
 }
 
 #!/usr/bin/env bash
-initCall="$0 $*"
+#Check whether the provided path is relative or absolute and
+#convert it to an abolute path
+script=$0
+
+absScriptPath=$(realpath $0)
+initCall="cd $(pwd); $absScriptPath $*"
 projectPath="."
 
 pathToegetopt=$(which egetopt 2>/dev/null)
diff --git a/buildsystem/CompileSuite/autoTests/config.sh b/buildsystem/CompileSuite/autoTests/config.sh
index eb1e2025a1..9bd6cadad2 100755
--- a/buildsystem/CompileSuite/autoTests/config.sh
+++ b/buildsystem/CompileSuite/autoTests/config.sh
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 #
-# Copyright 2013-2020 Axel Huebl
+# Copyright 2013-2021 Axel Huebl
 #
 # This file is part of PIConGPU.
 #
diff --git a/buildsystem/CompileSuite/autoTests/get_work.sh b/buildsystem/CompileSuite/autoTests/get_work.sh
index b0164fc2da..b4faa82ce4 100755
--- a/buildsystem/CompileSuite/autoTests/get_work.sh
+++ b/buildsystem/CompileSuite/autoTests/get_work.sh
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 #
-# Copyright 2013-2020 Axel Huebl
+# Copyright 2013-2021 Axel Huebl
 #
 # This file is part of PIConGPU.
 #
diff --git a/buildsystem/CompileSuite/autoTests/new_commits.sh b/buildsystem/CompileSuite/autoTests/new_commits.sh
index ff429f7d48..5a846ac844 100755
--- a/buildsystem/CompileSuite/autoTests/new_commits.sh
+++ b/buildsystem/CompileSuite/autoTests/new_commits.sh
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 #
-# Copyright 2013-2020 Axel Huebl
+# Copyright 2013-2021 Axel Huebl
 #
 # This file is part of PIConGPU.
 #
@@ -94,7 +94,7 @@ touch "$thisDir"runGuard
             #export PIC_COMPILE_SUITE_CMAKE="-DPIC_ENABLE_PNG=OFF -DALPAKA_CUDA_ARCH=35"
             export PIC_BACKEND="cuda"
             . /etc/profile
-            module load gcc/5.1.0 boost/1.65.1 cmake/3.11.4 cuda/9.2.148 openmpi/3.0.4
+            module load gcc/5.5.0 boost/1.65.1 cmake/3.15.0 cuda/9.2.148 openmpi/3.0.4
             module load libSplash/1.7.0 adios/1.13.1
             module load pngwriter/0.7.0 zlib/1.2.11
             module load libjpeg-turbo/1.5.1 icet/2.1.1 jansson/2.9 isaac/1.4.0
diff --git a/buildsystem/CompileSuite/autoTests/report.sh b/buildsystem/CompileSuite/autoTests/report.sh
index 6f67c91026..018d946096 100755
--- a/buildsystem/CompileSuite/autoTests/report.sh
+++ b/buildsystem/CompileSuite/autoTests/report.sh
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 #
-# Copyright 2013-2020 Axel Huebl
+# Copyright 2013-2021 Axel Huebl
 #
 # This file is part of PIConGPU.
 #
diff --git a/buildsystem/CompileSuite/color.sh b/buildsystem/CompileSuite/color.sh
index 86650a4641..52038ad590 100755
--- a/buildsystem/CompileSuite/color.sh
+++ b/buildsystem/CompileSuite/color.sh
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 #
-# Copyright 2013-2020 Axel Huebl, Rene Widera
+# Copyright 2013-2021 Axel Huebl, Rene Widera
 #
 # This file is part of PIConGPU.
 #
diff --git a/buildsystem/CompileSuite/compileSet.sh b/buildsystem/CompileSuite/compileSet.sh
index 4c13e365d9..94bee74ce7 100755
--- a/buildsystem/CompileSuite/compileSet.sh
+++ b/buildsystem/CompileSuite/compileSet.sh
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 #
-# Copyright 2013-2020 Axel Huebl
+# Copyright 2013-2021 Axel Huebl
 #
 # This file is part of PIConGPU.
 #
diff --git a/buildsystem/CompileSuite/exec_helper.sh b/buildsystem/CompileSuite/exec_helper.sh
index 905609a83d..f731c8982c 100755
--- a/buildsystem/CompileSuite/exec_helper.sh
+++ b/buildsystem/CompileSuite/exec_helper.sh
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 #
-# Copyright 2013-2020 Axel Huebl
+# Copyright 2013-2021 Axel Huebl
 #
 # This file is part of PIConGPU.
 #
diff --git a/buildsystem/CompileSuite/help.sh b/buildsystem/CompileSuite/help.sh
index 582c11b6e6..4fe8cb19ff 100755
--- a/buildsystem/CompileSuite/help.sh
+++ b/buildsystem/CompileSuite/help.sh
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 #
-# Copyright 2013-2020 Axel Huebl
+# Copyright 2013-2021 Axel Huebl
 #
 # This file is part of PIConGPU.
 #
diff --git a/buildsystem/CompileSuite/options.sh b/buildsystem/CompileSuite/options.sh
index 9a67952cd6..f847e3eb0e 100755
--- a/buildsystem/CompileSuite/options.sh
+++ b/buildsystem/CompileSuite/options.sh
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 #
-# Copyright 2013-2020 Axel Huebl
+# Copyright 2013-2021 Axel Huebl
 #
 # This file is part of PIConGPU.
 #
diff --git a/buildsystem/CompileSuite/path.sh b/buildsystem/CompileSuite/path.sh
index 9e107302fd..507e86ab39 100755
--- a/buildsystem/CompileSuite/path.sh
+++ b/buildsystem/CompileSuite/path.sh
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 #
-# Copyright 2013-2020 Axel Huebl
+# Copyright 2013-2021 Axel Huebl
 #
 # This file is part of PIConGPU.
 #
diff --git a/docs/COMMIT.md b/docs/COMMIT.md
index 7e14e74a9a..6b4490b929 100644
--- a/docs/COMMIT.md
+++ b/docs/COMMIT.md
@@ -6,7 +6,30 @@ We agree on the following simple rules to make our lives easier :)
 - Stick to the **style** below for **commit messages**
 - **Commit compiling patches** for the *main* branches (`master` and `dev`),
   you can be less strict for (unshared) *topic branches*
-
+- Commits should be formatted with clang-format-11
+
+Format Code
+-----------
+
+- Install *ClangFormat 11*
+- To format all files in your working copy, you can run this command in bash from the root folder of PIConGPU:
+  ```bash
+  find include/ share/picongpu/ share/pmacc -iname "*.def" \
+  -o -iname "*.h" -o -iname "*.cpp" -o -iname "*.cu" \
+  -o -iname "*.hpp" -o -iname "*.tpp" -o -iname "*.kernel" \
+  -o -iname "*.loader" -o -iname "*.param" -o -iname "*.unitless" \
+  | xargs clang-format-11 -i
+  ```
+ 
+Instead of using the bash command above you can use *Git* together with *ClangFormat* to format your patched code only. 
+Before applying this command, you must extend your local git configuration **once** with all file endings used in *PIConGPU*:
+
+```
+git config --local clangFormat.extensions def,h,cpp,cu,hpp,tpp,kernel,loader,param,unitless
+```
+
+For only formatting lines you added using `git add`, call `git clang-format-11` before you create a commit.
+Please be aware that un-staged changes will not be formatted.
 
 Commit Messages
 ---------------
diff --git a/docs/TBG_macros.cfg b/docs/TBG_macros.cfg
index 7b985f3d44..d830c1a593 100644
--- a/docs/TBG_macros.cfg
+++ b/docs/TBG_macros.cfg
@@ -1,4 +1,5 @@
-# Copyright 2014-2020 Felix Schmitt, Axel Huebl, Richard Pausch, Heiko Burau
+# Copyright 2014-2021 Felix Schmitt, Axel Huebl, Richard Pausch, Heiko Burau,
+#                     Franz Poeschel
 #
 # This file is part of PIConGPU.
 #
@@ -22,6 +23,7 @@
 ## These variables basically wrap PIConGPU command line flags.
 ## To see all flags available for your PIConGPU binary, run
 ## picongpu --help. The avalable flags depend on your configuration flags.
+## Note that this is not meant to be a complete and functioning .cfg file.
 ##
 ## Flags that target a specific species e.g. electrons (--e_png) or ions
 ## (--i_png) must only be used if the respective species is activated (configure flags).
@@ -123,7 +125,8 @@ TBG_stopWindow="--stopWindow 1337"
 #--<species>_radiation.end     Time step to stop calculating the radiation
 #--<species>_radiation.radPerGPU     If flag is set, each GPU stores its own spectra without summing the entire simulation area
 #--<species>_radiation.folderRadPerGPU     Folder where the GPU specific spectras are stored
-#--e_<species>_radiation.compression    If flag is set, the hdf5 output will be compressed.
+#--<species>_radiation.compression    If flag is set, the hdf5 output will be compressed.
+#--<species>_radiation.numJobs     Number of independent jobs used for the radiation calculation.
 TBG_radiation="--<species>_radiation.period 1 --<species>_radiation.dump 2 --<species>_radiation.totalRadiation \
                --<species>_radiation.lastRadiation --<species>_radiation.start 2800 --<species>_radiation.end 3000"
 
@@ -132,6 +135,26 @@ TBG_radiation="--<species>_radiation.period 1 --<species>_radiation.dump 2 --<sp
 #--<species>_transRad.period   Gives the number of time steps between which the radiation should be calculated.
 TBG_transRad="--<species>_transRad.period 1000"
 
+# The following flags are available for the xrayScattering plugin.
+# For a full description, see the plugins section in the online documentation.
+#--<species>_xrayScattering.period    Period at which the plugin is enabled.
+#--<species>_xrayScattering.outputPeriod    Period at which the accumulated amplitude is written to the output file.
+#--<species>_xrayScattering.qx_max    Upper bound of reciprocal space range in qx direction.
+#--<species>_xrayScattering.qy_max    Upper bound of reciprocal space range in qy direction.
+#--<species>_xrayScattering.qx_min    Lower bound of reciprocal space range in qx direction.
+#--<species>_xrayScattering.qy_min    Lower bound of reciprocal space range in qy direction.
+#--<species>_xrayScattering.n_qx    Number of scattering vectors needed to be calculated in qx direction.
+#--<species>_xrayScattering.n_qy    Number of scattering vectors needed to be calculated in qy direction.
+#--<species>_xrayScattering.file    Output file name. Default is `<species>_xrayScatteringOutput`.
+#--<species>_xrayScattering.ext    `openPMD` filename extension. This controls the backend picked by the `openPMD` API. Default is `bp` for adios backend.
+#--<species>_xrayScattering.compression    Backend-specific `openPMD` compression method (e.g. zlib).
+#--<species>_xrayScattering.memoryLayout    Possible values: `mirror` and `split`. Output can be mirrored on all Host+Device pairs or uniformly split, in chunks, over all nodes.
+TBG_<species>_xrayScattering="--<species>_xrayScattering.period 1 --<species>_xrayScattering.outputPeriod 10 \
+                    --<species>_xrayScattering.n_qx 512 --<species>_xrayScattering.n_qy 512 \
+                    --<species>_xrayScattering.qx_min 0 --<species>_xrayScattering.qx_max 1 \
+                    --<species>_xrayScattering.qy_min -1 --<species>_xrayScattering.qy_max 1 \
+                    --<species>_xrayScattering.memoryLayout split"
+
 # Create 2D images in PNG format every .period steps.
 # The slice plane is defined using .axis [yx,yz] and .slicePoint (offset from origin
 # as a float within [0.0,1.0].
@@ -143,6 +166,10 @@ TBG_<species>_pngYX="--<species>_png.period 10 --<species>_png.axis yx --<specie
 # Enable macro particle merging
 TBG_<species>_merger="--<species>_merger.period 100 --<species>_merger.minParticlesToMerge 8 --<species>_merger.posSpreadThreshold 0.2 --<species>_merger.absMomSpreadThreshold 0.01"
 
+# Enable probabilistic version of particle merging
+TBG_<species>_randomizedMerger="--<species>_randomizedMerger.period 100 --<species>_randomizedMerger.maxParticlesToMerge 8                  \
+                                --<species>_randomizedMerger.ratioDeletedParticles 0.9 --<species>_randomizedMerger.posSpreadThreshold 0.01 \
+                                --<species>_randomizedMerger.momSpreadThreshold  0.0005"
 
 # Notification period of position plugin (single-particle debugging)
 TBG_<species>_pos_dbg="--<species>_position.period 1"
@@ -155,7 +182,6 @@ TBG_<species>_histogram="--<species>_energyHistogram.period 500 --<species>_ener
 
 
 # Calculate a 2D phase space
-# - requires parallel libSplash for HDF5 output
 # - momentum range in m_<species> c
 TBG_<species>_PSxpx="--<species>_phaseSpace.period 10 --<species>_phaseSpace.filter all --<species>_phaseSpace.space x --<species>_phaseSpace.momentum px --<species>_phaseSpace.min -1.0 --<species>_phaseSpace.max 1.0"
 TBG_<species>_PSxpz="--<species>_phaseSpace.period 10 --<species>_phaseSpace.filter all --<species>_phaseSpace.space x --<species>_phaseSpace.momentum pz --<species>_phaseSpace.min -1.0 --<species>_phaseSpace.max 1.0"
@@ -181,12 +207,11 @@ TBG_macroCount="--<species>_macroParticlesCount.period 100"
 # Count makro particles of a species per super cell
 TBG_countPerSuper="--<species>_macroParticlesPerSuperCell.period 100 --<species>_macroParticlesPerSuperCell.period 100"
 
-# Dump simulation data (fields and particles) to HDF5 files using libSplash.
-# Data selected in .source is dumped every .period steps to the fileset .file.
-TBG_hdf5="--hdf5.period 100 --hdf5.file simData --hdf5.source 'species_all,fields_all'"
 
 # Dump simulation data (fields and particles) to ADIOS files.
 # Data is dumped every .period steps to the fileset .file.
+# Warning: we do not recommend using the ADIOS plugin for output,
+# but the openPMD plugin with ADIOS (2) backend instead, see TBG_openPMD below.
 TBG_adios="--adios.period 100 --adios.file simData --adios.source 'species_all,fields_all'"
 # see 'adios_config -m', e.g., for on-the-fly zlib compression
 #     (compile ADIOS with --with-zlib=<ZLIB_ROOT>)
@@ -205,13 +230,29 @@ TBG_adios="--adios.period 100 --adios.file simData --adios.source 'species_all,f
 # select data sources for the dump
 #   --adios.source <comma_separated_list_of_data_sources>
 
+# Dump simulation data (fields and particles) via the openPMD API.
+# Data is dumped every .period steps to the fileset .file.
+TBG_openPMD="--openPMD.period 100   \
+             --openPMD.file simOutput \
+             --openPMD.ext bp \
+             --openPMD.json '{ \"adios2\": { \"engine\": { \"type\": \"file\", \"parameters\": { \"BufferGrowthFactor\": \"1.2\", \"InitialBufferSize\": \"2GB\" } } } }'"
+# Further control over the backends used in the openPMD plugins is available
+# through the mechanisms exposed by the openPMD API:
+# * environment variables
+# * JSON-formatted configuration string
+# Further information on both can be found in the official documentation
+# https://openpmd-api.readthedocs.io
+# Notice that specifying compression settings via --openPMD.compression
+# is considered legacy and backend-specific settings via the JSON string are
+# preferred if available for a backend.
+
 # Create a checkpoint that is restartable every --checkpoint.period steps
 #   http://git.io/PToFYg
 TBG_checkpoint="--checkpoint.period 1000"
-# Select the backend for the checkpoint, available are hdf5 and adios
-#    --checkpoint.backend adios
-#                         hdf5
-# Available backend options are exactly as in --adios.* and --hdf5.* and can be set
+# Select the backend for the checkpoint, available are openPMD and adios
+#    --checkpoint.backend openPMD
+#                         adios
+# Available backend options are exactly as in --openPMD.* and --adios.* and can be set
 # via:
 #   --checkpoint.<IO-backend>.* <value>
 # e.g.:
@@ -223,8 +264,8 @@ TBG_checkpoint="--checkpoint.period 1000"
 # Restart the simulation from checkpoint created using TBG_checkpoint
 TBG_restart="--checkpoint.restart"
 # Select the backend for the restart (must fit the created checkpoint)
-#    --checkpoint.restart.backend adios
-#                                 hdf5
+#    --checkpoint.restart.backend openPMD
+#                                 adios
 # By default, the last checkpoint is restarted if not specified via
 #   --checkpoint.restart.step 1000
 # To restart in a new run directory point to the old run where to start from
diff --git a/docs/propose_changelog.py b/docs/propose_changelog.py
index dc2861abb2..47f466bfd8 100755
--- a/docs/propose_changelog.py
+++ b/docs/propose_changelog.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 #
-# Copyright 2017-2020 Axel Huebl
+# Copyright 2017-2021 Axel Huebl
 #
 # License: GPLv3+
 #
diff --git a/docs/source/conf.py b/docs/source/conf.py
index 533349d9da..dc0ba8723a 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -105,9 +105,9 @@
 # built documents.
 #
 # The short X.Y version.
-version = u'0.5.0'
+version = u'0.6.0'
 # The full version, including alpha/beta/rc tags.
-release = u'0.5.0'
+release = u'0.6.0-dev'
 
 # The language for content autogenerated by Sphinx. Refer to documentation
 # for a list of supported languages.
diff --git a/docs/source/dev/picongpu.rst b/docs/source/dev/picongpu.rst
index 37e281fb8d..f058abe670 100644
--- a/docs/source/dev/picongpu.rst
+++ b/docs/source/dev/picongpu.rst
@@ -3,10 +3,10 @@ Important PIConGPU Classes
 
 This is very, very small selection of classes of interest to get you started.
 
-MySimulation
-------------
+Simulation
+----------
 
-.. doxygenclass:: picongpu::MySimulation
+.. doxygenclass:: picongpu::Simulation
    :project: PIConGPU
    :members:
    :undoc-members:
diff --git a/docs/source/dev/styleguide.rst b/docs/source/dev/styleguide.rst
index a5e5126a44..a53d2ca6e4 100644
--- a/docs/source/dev/styleguide.rst
+++ b/docs/source/dev/styleguide.rst
@@ -15,12 +15,77 @@ Source Style
 For contributions, *an ideal patch blends in the existing coding style around it* without being noticed as an addition when applied.
 Nevertheless, please make sure *new files* follow the styles linked above as strict as possible from the beginning.
 
-Unfortunately, we currently do not have tools available to auto-format all aspects of our style guidelines.
-Since we want to focus on the content of your contribution, we try to cover as much as possible by automated tests which you always have to pass.
-Nevertheless, we will not enforce the still uncovered, *non-semantic aspects* of style in a *pedantic* way until we find a way to automate it fully.
+clang-format-11 should be used to format the code.
+There are different ways to format the code.
 
-(That also means that we do not encourage manual style-only changes of our existing code base, since both you and us have better things to do than adding newlines and spaces manually.
-Doxygen and documentation additions are always welcome!)
+Format All Files
+^^^^^^^^^^^^^^^^
+
+To format all files in your working copy, you can run this command in bash from the root folder of PIConGPU:
+
+.. code-block:: bash
+
+   find include/ share/picongpu/ share/pmacc -iname "*.def" \
+     -o -iname "*.h" -o -iname "*.cpp" -o -iname "*.cu" \
+     -o -iname "*.hpp" -o -iname "*.tpp" -o -iname "*.kernel" \
+     -o -iname "*.loader" -o -iname "*.param" -o -iname "*.unitless" \
+     | xargs clang-format-11 -i
+
+Format Only Changes, Using Git
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Instead of using the bash command above you can use *Git* together with *ClangFormat* to format your patched code only.
+
+    *ClangFormat* is an external tool for code formatting that can be called by *Git* on changed files only and
+    is part of clang tools.
+
+Before applying this command, you must extend your local git configuration **once** with all file endings used in *PIConGPU*:
+
+.. code-block:: bash
+
+   git config --local clangFormat.extensions def,h,cpp,cu,hpp,tpp,kernel,loader,param,unitless
+
+After installing, or on a cluster loading the module (see introduction), clangFormat can be called by git on all **staged files** using the command:
+
+.. code-block:: bash
+
+   git clangFormat
+
+.. warning::
+
+    The binary for *ClangFormat* is called `clang-format` on some operating systems.
+    If *clangFormat* is not recognized, try *clang-format* instead, in addition please check that `clang-format --version` returns version `11.X.X` in this case.
+
+The typical workflow using git clangFormat is the following:
+
+1. make your patch
+
+2. stage the changed files in git
+
+.. code-block:: bash
+
+    git add <files you changed>/ -A
+
+3. format them according to guidelines
+
+.. code-block:: bash
+
+    git clangFormat
+
+4. stage the now changed (formatted) files again
+
+.. code-block:: bash
+
+    git add <files you changed>
+
+5. commit changed files
+
+.. code-block:: bash
+
+    git commit -m <commit message>
+
+Please be aware that un-staged changes will not be formatted.
+Formatting all changes of the previous commit can be achieved by executing the command `git clang-format-11 HEAD~1`.
 
 License Header
 --------------
diff --git a/docs/source/index.rst b/docs/source/index.rst
index e9b35d10f0..e52983fdfb 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -80,6 +80,8 @@ Models
    :hidden:
 
    models/pic
+   models/AOFDTD
+   models/shapes
    models/LL_RR
    models/field_ionization
    models/collisional_ionization
diff --git a/docs/source/install/libraryDependencies.dot b/docs/source/install/libraryDependencies.dot
index 1b81068feb..10faa26002 100644
--- a/docs/source/install/libraryDependencies.dot
+++ b/docs/source/install/libraryDependencies.dot
@@ -35,6 +35,19 @@ digraph PIConGPU {
     hdf5 -> adios [style=dashed label="optional"];
     fileSystem -> adios [style=dashed label="extra hints"];
 
+    adios2 [label="ADIOS2"];
+    mpi -> adios2;
+    zlib -> adios2;
+    hdf5 -> adios2 [style=dashed label="optional"];
+    compiler -> libfabric;
+    libfabric -> adios2[style=dashed label="RDMA staging"];
+
+    openpmd [label="openPMD API"];
+    adios2 -> openpmd;
+    hdf5 -> openpmd;
+    nlohmann_json [label="nlohmann_json\n(header-only)\n(internally shipped\nby default)"];
+    nlohmann_json -> openpmd[style=dashed];
+
     libpng -> PNGwriter;
 
     trace [label="VampirTrace / Score-P"];
diff --git a/docs/source/install/profile.rst b/docs/source/install/profile.rst
index f48bc50ce7..77b2b9baa8 100644
--- a/docs/source/install/profile.rst
+++ b/docs/source/install/profile.rst
@@ -59,37 +59,6 @@ Queue: k80 (8x NVIDIA K80 12GB)
 .. literalinclude:: profiles/hemera-hzdr/k80_picongpu.profile.example
    :language: bash
 
-Hypnos (HZDR)
--------------
-
-**System overview:** `link (internal) <https://www.hzdr.de/db/Cms?pOid=29813>`_
-
-**User guide:** `link (internal) <http://hypnos3/wiki>`_
-
-**Production directory:** ``/bigdata/hplsim/`` with ``external/``, ``scratch/``, ``development/`` and ``production/``
-
-For these profiles to work, you need to download the :ref:`PIConGPU source code <install-dependencies-picongpu>` manually.
-
-Queue: laser (AMD Opteron 6276 CPUs)
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-.. literalinclude:: profiles/hypnos-hzdr/laser_picongpu.profile.example
-   :language: bash
-
-Hydra (HZDR)
--------------
-
-**System overview:** `link (internal) <https://www.hzdr.de/db/Cms?pOid=29813>`_
-
-**User guide:** `link (internal) <http://hypnos3/wiki>`_
-
-**Production directory:** ``/bigdata/hplsim/`` with ``external/``, ``scratch/``, ``development/`` and ``production/``
-
-For this profile to work, you need to download the :ref:`PIConGPU source code <install-dependencies-picongpu>` manually.
-
-.. literalinclude:: profiles/hydra-hzdr/default_picongpu.profile.example
-   :language: bash
-
 Summit (ORNL)
 -------------
 
diff --git a/docs/source/models/AOFDTD.rst b/docs/source/models/AOFDTD.rst
new file mode 100644
index 0000000000..19bc977d70
--- /dev/null
+++ b/docs/source/models/AOFDTD.rst
@@ -0,0 +1,376 @@
+.. _model-AOFDTD:
+
+Finite-Difference Time-Domain Method
+====================================
+
+.. sectionauthor:: Klaus Steiniger
+
+
+For the discretization of Maxwell's equations on a mesh in PIConGPU, only the equations
+
+.. math::
+
+   \frac{1}{c^2}\frac{\partial}{\partial t}\vec E &= \nabla \times \vec B - \mu_0 \vec J
+
+   \frac{\partial}{\partial t}\vec B &= - \nabla \times \vec E
+
+are solved.
+This becomes possible, first, by correctly solving Gauss's law
+:math:`\nabla \cdot \vec{E} = \frac{1}{\varepsilon_0}\sum_s \rho_s` using
+Esirkepov's current deposition method [Esirkepov2001]_ (or variants thereof) which solve the discretized continuity
+equation exactly.
+Second, by assuming that the initially given electric and magnetic field satisfy Gauss' laws.
+Starting simulations in an initially charge free and magnetic-divergence-free space, i.e.
+
+.. math::
+
+   \nabla \cdot \vec E &= 0
+
+   \nabla \cdot \vec B &= 0
+
+is standard.
+
+
+Discretization on a staggered mesh
+----------------------------------
+In the Finite-Difference Time-Domain method, the above Maxwell's equations are discretized by replacing the partial space and
+time derivatives with centered finite differences.
+For example, the partial space derivative along :math:`x` of a scalar field :math:`u` at position
+:math:`(i,j,k)` and time step :math:`n` becomes
+
+.. math::
+
+   \partial_x u(i\Delta x,j\Delta y,k\Delta z,n\Delta t) = \frac{u_{i+1/2,j,k}^n - u_{i-1/2,j,k}^n}{\Delta x}
+
+and the temporal derivative becomes
+
+.. math::
+
+   \partial_t u(i\Delta x,j\Delta y,k\Delta z,n\Delta t) = \frac{u_{i,j,k}^{n+1/2} - u_{i,j,k}^{n-1/2}}{\Delta t},
+
+when replacing with the lowest order central differences.
+Note, with this leapfrog discretization or staggering, derivatives of field quantities are calculated at positions
+between positions where the field quantities are known.
+
+The above discretization uses one neighbor to each side from the point where the derivative is calculated yielding a
+second order accurate approximation of the derivative.
+Using more neighbors for the approximation of the spatial derivative is possible in PIConGPU and reduces the
+discretization error.
+Which is to say that the order of the method is increased.
+The error order scales with twice the number of neighbors :math:`M` used to approximate the derivative.
+The arbitrary order finite difference of order :math:`2M` reads
+
+.. math::
+
+   \partial_x u(i\Delta x,j\Delta y,k\Delta z,n\Delta t) &=  \sum\limits_{l=1/2}^{M-1/2}
+      \left[ g^{2M}_l \frac{u_{i + l, j, k}^n - u_{i - l, j, k}^n}{\Delta x} \right]\,\mathrm{, where}
+
+   g^{2M}_l &= \frac{(-1)^{l-1/2}}{2l^2} \frac{((2M-1)!!)^2}{(2M -1 - 2l)!! (2M -1 + 2l)!!}
+
+with :math:`l=-M+1/2, -M+1+1/2, ..., -1/2, 1/2, ..., M-1/2` [Ghrist2000]_.
+A recurrence relation for the weights exists,
+
+.. math::
+
+   g^{2M}_l &= (-1)\frac{(l-1)^2}{l^2} \frac{(2M+1-2l)}{(2M-1+2l)} g^{2M}_{l-1}
+
+   g^{2M}_\frac{1}{2} &= \frac{16^{1-M}}{M} \left( \frac{(2M-1)!}{\left[(M-1)!\right]^2} \right)^2
+
+
+
+Maxwell's equations on the mesh
+-------------------------------
+When discretizing on the mesh with centered finite differences, the spatial positions of field components need to be
+chosen such that a field component, whose **temporal derivative** is
+calculated on the left hand side of a Maxwell equation, is spatially positioned between the two field components whose
+**spatial derivative** is evaluated on the right hand side of the respective Maxwell equation.
+In this way, the spatial point where a left hand side temporal derivative of a field is evaluated lies exactly at the
+position where the spatial derivative of the right hand side fields is calculated.
+The following image visualizes the arrangement of field components in PIConGPU.
+
+.. image:: media/Yee-cell.png
+   :width: 400
+   :alt: Yee cell in PIConGPU
+
+Component-wise and using second order finite differences for the derivative approximation, Maxwell's equations read in
+PIConGPU
+
+.. math::
+
+   \frac{E_x\rvert_{i+1/2, j, k}^{n+1} - E_x\rvert_{i+1/2, j, k}^{n}}{c^2 \Delta t} =&
+    \frac{B_z\rvert_{i+1/2, j+1/2, k}^{n+1/2} - B_z\rvert_{i+1/2, j-1/2, k}^{n+1/2}}{\Delta y}
+
+   & - \frac{B_y\rvert_{i+1/2, j, k+1/2}^{n+1/2} - B_y\rvert_{i+1/2, j, k-1/2}^{n+1/2}}{\Delta z}
+    - \mu_0 J_x\rvert_{i+1/2, j, k}^{n+1/2}
+
+   \frac{E_y\rvert_{i, j+1/2, k}^{n+1} - E_y\rvert_{i, j+1/2, k}^{n}}{c^2 \Delta t} =&
+    \frac{B_x\rvert_{i, j+1/2, k+1/2}^{n+1/2} - B_x\rvert_{i, j+1/2, k-1/2}^{n+1/2}}{\Delta z}
+
+   & - \frac{B_z\rvert_{i+1/2, j+1/2, k}^{n+1/2} - B_z\rvert_{i-1/2, j+1/2, k}^{n+1/2}}{\Delta x}
+    - \mu_0 J_y\rvert_{i, j+1/2, k}^{n+1/2}
+
+   \frac{E_z\rvert_{i, j, k+1/2}^{n+1} - E_z\rvert_{i, j, k+1/2}^{n}}{c^2 \Delta t} =&
+    \frac{B_y\rvert_{i+1/2, j, k+1/2}^{n+1/2} - B_y\rvert_{i-1/2, j, k+1/2}^{n+1/2}}{\Delta x}
+
+   & - \frac{B_x\rvert_{i, j+1/2, k+1/2}^{n+1/2} - B_x\rvert_{i, j-1/2, k+1/2}^{n+1/2}}{\Delta y}
+    - \mu_0 J_z\rvert_{i, j, k+1/2}^{n+1/2}
+
+   \frac{B_x\rvert_{i, j+1/2, k+1/2}^{n+3/2} - B_x\rvert_{i, j+1/2, k+1/2}^{n+1/2}}{\Delta t} =&
+    \frac{E_y\rvert_{i, j+1/2, k+1}^{n+1} - E_y\rvert_{i, j+1/2, k}^{n+1}}{\Delta z}
+    - \frac{E_z\rvert_{i, j+1, k+1/2}^{n+1} - E_z\rvert_{i, j, k+1/2}^{n+1}}{\Delta y}
+
+   \frac{B_y\rvert_{i+1/2, j, k+1/2}^{n+3/2} - B_y\rvert_{i+1/2, j, k+1/2}^{n+1/2}}{\Delta t} =&
+    \frac{E_z\rvert_{i+1, j, k+1/2}^{n+1} - E_z\rvert_{i, j, k+1/2}^{n+1}}{\Delta x}
+    - \frac{E_x\rvert_{i+1/2, j, k+1}^{n+1} - E_x\rvert_{i+1/2, j, k}^{n+1}}{\Delta z}
+
+   \frac{B_z\rvert_{i+1/2, j+1/2, k}^{n+3/2} - B_z\rvert_{i+1/2, j+1/2, k}^{n+1/2}}{\Delta t} =&
+    \frac{E_x\rvert_{i+1/2, j+1, k}^{n+1} - E_x\rvert_{i+1/2, j, k}^{n+1}}{\Delta y}
+    - \frac{E_y\rvert_{i+1, j+1/2, k}^{n+1} - E_y\rvert_{i, j+1/2, k}^{n+1}}{\Delta x}
+
+As can be seen from these equations, the components of the source current are located at the respective components of
+the electric field.
+Following Gauss's law, the charge density is located at the cell corner.
+
+Using Esirkepov's notation for the discretized differential operators,
+
+.. math::
+
+   \nabla^+ u_{i,j,k} &= \left( \frac{u_{i+1,j,k} - u_{i,j,k}}{\Delta x},
+                                \frac{u_{i,j+1,k} - u_{i,j,k}}{\Delta y},
+                                \frac{u_{i,j,k+1} - u_{i,j,k}}{\Delta z}
+                         \right)
+
+   \nabla^- u_{i,j,k} &= \left( \frac{u_{i,j,k} - u_{i-1,j,k}}{\Delta x},
+                                \frac{u_{i,j,k} - u_{i,j-1,k}}{\Delta y},
+                                \frac{u_{i,j,k} - u_{i,j,k-1}}{\Delta z}
+                         \right)\,,
+
+the shorthand notation for the discretized Maxwell equations in PIConGPU reads
+
+.. math::
+
+   \frac{\vec E\rvert^{n+1} - \vec E\rvert^{n}}{c^2 \Delta t} &=
+       \nabla^- \times \vec B\rvert^{n+1/2} - \mu_0 \vec J\rvert^{n+1/2}
+
+   \frac{\vec B\rvert^{n+3/2} - \vec B\rvert^{n+1/2}}{\Delta t} &= - \nabla^+ \times \vec E\rvert^{n+1}
+
+   \nabla^- \cdot \vec E\rvert^{n+1} &= \frac{1}{\varepsilon_0} \rho\rvert^{n+1}
+
+   \nabla^+ \cdot \vec B\rvert^{n+3/2} &= 0\,,
+
+with initial conditions
+
+.. math::
+
+   \nabla^- \cdot \vec E &= 0
+
+   \nabla^+ \cdot \vec B &= 0\,.
+
+The components :math:`E_x\rvert_{1/2, 0, 0}=E_y\rvert_{0, 1/2, 0}=E_z\rvert_{0, 0, 1/2}
+=B_x\rvert_{I, J+1/2, K+1/2}=B_y\rvert_{I+1/2, J, K+1/2}=B_z\rvert_{I+1/2, J+1/2, K}=0` for all times when using
+absorbing boundary conditions.
+Here, :math:`I,J,K` are the maximum values of :math:`i,j,k` defining the total mesh size.
+
+Note, in PIConGPU the :math:`\vec B`-field update is split in two updates of half the time step, e.g.
+
+.. math::
+
+   \frac{B_x\rvert_{i, j+1/2, k+1/2}^{n+1} - B_x\rvert_{i, j+1/2, k+1/2}^{n+1/2}}{\Delta t / 2} =
+    \frac{E_y\rvert_{i, j+1/2, k+1}^{n+1} - E_y\rvert_{i, j+1/2, k}^{n+1}}{\Delta z}
+    - \frac{E_z\rvert_{i, j+1, k+1/2}^{n+1} - E_z\rvert_{i, j, k+1/2}^{n+1}}{\Delta y}
+
+and
+
+.. math::
+
+   \frac{B_x\rvert_{i, j+1/2, k+1/2}^{n+3/2} - B_x\rvert_{i, j+1/2, k+1/2}^{n+1}}{\Delta t / 2} =
+    \frac{E_y\rvert_{i, j+1/2, k+1}^{n+1} - E_y\rvert_{i, j+1/2, k}^{n+1}}{\Delta z}
+    - \frac{E_z\rvert_{i, j+1, k+1/2}^{n+1} - E_z\rvert_{i, j, k+1/2}^{n+1}}{\Delta y}
+
+for the :math:`B_x` component, where the second half of the update is performed at the beginning of the next time step
+such that the electric and magnetic field are known at equal time in the particle pusher and at the end of a time step.
+
+
+Dispersion relation of light waves on a mesh
+--------------------------------------------
+The dispersion relation of a wave relates its oscillation period in time :math:`T` to its oscillation wavelength
+:math:`\lambda`, i.e. its angular frequency :math:`\omega = \frac{2\pi}{T}` to wave vector
+:math:`\vec k = \frac{2\pi}{\lambda} \vec e_k`.
+For an electromagnetic wave in vacuum,
+
+.. math::
+
+   \left[ \frac{\omega}{c} \right]^2 = k_x^2 + k_y^2 + k_z^2\,.
+
+However, on a 2D mesh, with arbitrary order finite differences for the spatial derivatives, the dispersion relation
+becomes
+
+.. math::
+
+   \left[ \frac{1}{c\Delta t} \sin\left(\frac{\omega \Delta t}{2} \right) \right]^2 =
+  \sum\limits_{l=1/2}^{M - 1/2} \sum\limits_{p=1/2}^{M - 1/2} g_l^{2M} g_p^{2M}
+    \left\lbrace
+      \frac{\sin(\tilde k_x l \Delta x)\sin(\tilde k_x p \Delta x)}{\Delta x^2}
+      + \frac{\sin(\tilde k_y l \Delta y)\sin(\tilde k_y p \Delta y)}{\Delta y^2}
+    \right\rbrace\,,
+
+
+where :math:`\tilde k_x` and :math:`\tilde k_y` are the wave vector components on the mesh in :math:`x` and :math:`y`
+direction, respectively.
+As is obvious from the relation, the numerical wave vector will be different from the real world wave vector for a given
+frequency :math:`\omega` due to discretization.
+
+
+Dispersion Relation for Yee's Method
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+Yee's Method [Yee1966]_ uses second order finite differences for the approximation of spatial derivatives.
+The corresponding dispersion relation reads
+
+.. math::
+
+   \left[ \frac{1}{c\Delta t} \sin\left(\frac{\omega \Delta t}{2}\right) \right]^2 =
+  \left[
+      \frac{1}{\Delta x} \sin\left(\frac{\tilde k_x \Delta x}{2}\right)
+  \right]^2
+      + \left[
+      \frac{1}{\Delta y} \sin\left(\frac{\tilde k_y \Delta y}{2}\right)
+  \right]^2\,.
+
+Solving for a wave's numerical frequency :math:`\omega` in dependence on its wave vector
+:math:`\vec{\tilde k} = (\tilde k\cos\phi, \tilde k\sin\phi)`, where the angle :math:`\phi` is enclosed by the mesh's
+:math:`x`-axis and the wave's propagation direction,
+
+.. math::
+
+   \omega = \frac{2}{\Delta t} \arcsin \xi\,\text{, where } \xi_\mathrm{max} = c\Delta t
+     \sqrt{ \frac{1}{\Delta x^2} + \frac{1}{\Delta y^2} + \frac{1}{\Delta z^2}} \quad \text{(in 3D)}
+
+reveals two important properties of the field solver.
+(The 2D version is obtained by letting :math:`\Delta z \rightarrow \infty`.)
+
+First, the field solver only operates stably within the range :math:`\xi_\mathrm{max} \leq 1`.
+This gives the *Courant-Friedrichs-Lewy* stability condition relating time step to mesh spacing
+
+.. math::
+
+   c\Delta t < \frac{1}{\sqrt{ \frac{1}{\Delta x^2} + \frac{1}{\Delta y^2} + \frac{1}{\Delta z^2} }} \quad \text{(in 3D)}
+
+Typically, :math:`\xi_\mathrm{max} = 0.995` is chosen.
+Outside this stability region, the frequency :math:`\omega` corresponding to a certain wave vector becomes imaginary,
+meaning that wave amplitudes can be nonphysically exponentially amplified [Taflove2005]_.
+
+Second, there exists a purely numerical anisotropy in a wave's phase velocity :math:`\tilde v_p = \omega / \tilde k`
+(speed of electromagnetic wave propagation) depending on its propagation direction :math:`\phi`, as depicted in the following figure
+
+.. image:: media/dispersion-relation_Yee.png
+   :width: 400
+   :alt: Velocity anisotropy for Yee
+
+assuming square cells :math:`\Delta x = \Delta y = \Delta` and where :math:`S=c\Delta t / \Delta`,
+:math:`N_\lambda=\lambda/\Delta`.
+That is, for the chosen sampling of three samples per wavelength :math:`\lambda`, the phase velocities along a cell
+edge and a cell diagonal differ by approximately 20%.
+The velocity error is largest for propagation along the edge.
+The phase velocity error can be significantly reduced by increasing the sampling, as visualized in the following figure
+by the scaling of the velocity error with wavelength sampling for propagation along the cell edge
+
+.. image:: media/dispersion-relation_Yee_sampling.png
+   :width: 400
+   :alt: Dispersion for Yee
+
+Another conclusion from this figure is that a short-pulse laser with a large bandwidth will suffer from severe
+dispersion if the sampling is bad.
+In the extreme case where a wavelength is not even sampled twice on the mesh, its field is exponentially damped
+[Taflove2005]_.
+
+Given that most simulations employ short-pulse lasers propagating along the :math:`y`-axis and featuring a large bandwidth,
+the resolution of the laser wavelength should be a lot better than in the example, e.g. :math:`N_\lambda=24`, to reduce
+errors due to numerical dispersion.
+
+Note, the reduced phase velocity of light can further cause the emission of numerical Cherenkov radiation by fast charged
+particles in the simulation [Lehe2012]_.
+The largest emitted wavelength equals the wavelength whose phase velocity is as slow as the particle's velocity, provided
+it is resolved at least twice on the mesh.
+
+
+Dispersion Relation for Arbitrary Order Finite Differences
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+Using higher order finite differences for the approximation of spatial derivatives significantly improves the
+dispersion properties of the solver.
+Most notably, the velocity anisotropy reduces and the dependence of phase velocity on sampling reduces, too.
+Yet higher order solvers still feature dispersion.
+As shown in the following picture, its effect is, however, not reduction of phase velocity but increase of phase velocity
+beyond the physical vacuum speed of light.
+But this can be tweaked by reducing the time step relative to the limit set by the stability criterion.
+
+.. image:: media/dispersion-relation_AOFDTD_3.png
+   :width: 400
+   :alt: Velocity anisotropy for AOFDTD
+
+.. image:: media/dispersion-relation_AOFDTD_sampling.png
+   :width: 400
+   :alt: Dispersion for AOFDTD
+
+Note, it is generally not a good idea to reduce the time step in Yee's method significantly below the stability
+criterion as this increases the absolute phase velocity error.
+See the following figure,
+
+.. image:: media/dispersion-relation_AOFDTD_Courant-factor.png
+   :width: 400
+   :alt: Scaling of velocity error with Courant factor for diagonal propagation
+
+from which the optimum Courant factor :math:`S=c\Delta t / \Delta` can be read off for a 2D, square mesh, too.
+
+An important conclusion from the above figures showing velocity error over sampling is that
+a higher order solver, with a larger mesh spacing and a smaller time step than given by the above stability limit,
+produces physically more accurate results than the standard Yee solver operating with smaller mesh spacing and a
+time step close to the stability limit.
+
+That is, it can be beneficial not only in terms of **physical accuracy**, but also in terms of **memory complexity**
+and **time to solution**, to choose a higher order solver with lower spatial resolution and increased time sampling
+relative to the stability limit.
+Memory complexity scales with number of cells :math:`N_\mathrm{cells}` required to sample a given volume
+:math:`N_\mathrm{cells}^d`, where :math:`d=2,3` is the dimension of the simulation domain,
+which decreases for larger cells.
+Time to solution scales with the time step and this can be larger with solvers of higher order compared to the Yee solver
+with comparable dispersion properties (which requires a smaller cell size than the arbitrary order solver)
+since the time step is limited by the stability condition which scales with cell size.
+Since the cell size can be larger for arbitrary order solvers, the respective time step limit given by the stability
+condition is larger and operating with a time step ten times smaller than the limit might still result in a larger
+step than that of the comparable Yee solver.
+Finally, physical accuracy is increased by the reduction of the impact of dispersion effects.
+
+
+Usage
+-----
+The field solver can be chosen and configured in :ref:`fieldSolver.param <usage-params-core>`.
+
+
+References
+----------
+.. [Esirkepov2001]
+        T.Zh. Esirkepov,
+        *Exact charge conservation scheme for particle-in-cell simulation with an arbitrary form-factor*,
+        Computer Physics Communications 135.2 (2001): 144-153,
+        https://doi.org/10.1016/S0010-4655(00)00228-9
+
+.. [Ghrist2000]
+        M. Ghrist,
+        *High-Order Finite Difference Methods for Wave Equations*,
+        PhD thesis (2000),
+        Department of Applied Mathematics, University of Colorado
+
+.. [Lehe2012]
+        R. Lehe et al.
+        *Numerical growth of emittance in simulations of laser-wakefield acceleration*,
+        Physical Review Special Topics-Accelerators and Beams 16.2 (2013): 021301.
+
+.. [Taflove2005]
+        A. Taflove
+        *Computational electrodynamics: the finite-difference time-domain method*
+        Artech house (2005)
+
+.. [Yee1966]
+        K.S. Yee,
+        *Numerical solution of initial boundary value problems involving Maxwell's equations in isotropic media*,
+        IEEE Trans. Antennas Propagat. 14, 302-307 (1966)
diff --git a/docs/source/models/field_ionization.rst b/docs/source/models/field_ionization.rst
index 74657e5ba1..ac52a93eb5 100644
--- a/docs/source/models/field_ionization.rst
+++ b/docs/source/models/field_ionization.rst
@@ -56,6 +56,24 @@ Overview: Implemented Models
 
     Models marked with "(R&D)" are under *research and development* and should be used with care.
 
+Ionization Current
+------------------
+
+In order to conserve energy, PIConGPU supports an ionization current to decrease the electric field according to the amount of energy lost to field ionization processes.
+The current for a single ion is
+
+.. math::
+
+    \mathbf{J}_\mathrm{ion} = E_\mathrm{ion} \frac{\mathbf{E}}{|\mathbf{E}|^2 \Delta t V_\mathrm{cell}}
+
+It is assigned to the grid according to the macroparticle shape.
+:math:`E_\mathrm{ion}` is the energy required to ionize the atom/ion, :math:`\mathbf{E}` is the electric field at the particle position and :math:`V_\mathrm{cell}` is the cell volume.
+This formula makes the assumption that the ejection energy of the electron is zero.
+See [Mulser]_.
+The ionization current is accessible in :ref:`speciesDefinition.param <usage-params-core>`. To activate ionization current, set the second template parameter of the ionization model to ``particles::ionization::current::EnergyConservation``.
+By default the ionization current is deactivated.
+
+
 Usage
 -----
 
@@ -200,3 +218,9 @@ References
         *Atomic Screening Constant from SCF Functions. II. Atoms with 37 to 86 Electrons*,
         The Journal of Chemical Physics 47, 1300-1307 (1967)
         https://dx.doi.org/10.1063/1.1712084
+
+.. [Mulser]
+        P. Mulser et al.
+        *Modeling field ionization in an energy conserving form and resulting nonstandard fluid dynamics*,
+        Physics of Plasmas 5, 4466 (1998)
+        https://doi.org/10.1063/1.873184
diff --git a/docs/source/models/field_ionization_charge_state_prediction.py b/docs/source/models/field_ionization_charge_state_prediction.py
index fb32207e9a..7e572948e6 100644
--- a/docs/source/models/field_ionization_charge_state_prediction.py
+++ b/docs/source/models/field_ionization_charge_state_prediction.py
@@ -1,7 +1,7 @@
 """Ionization prediction module and example.
 
 This file is part of the PIConGPU.
-Copyright 2019-2020 PIConGPU contributors
+Copyright 2019-2021 PIConGPU contributors
 Authors: Marco Garten
 License: GPLv3+
 """
diff --git a/docs/source/models/media/Yee-cell.png b/docs/source/models/media/Yee-cell.png
new file mode 100644
index 0000000000..fc63f4a9c8
Binary files /dev/null and b/docs/source/models/media/Yee-cell.png differ
diff --git a/docs/source/models/media/Yee-cell.svg b/docs/source/models/media/Yee-cell.svg
new file mode 100644
index 0000000000..9e5fb4d8fd
--- /dev/null
+++ b/docs/source/models/media/Yee-cell.svg
@@ -0,0 +1,922 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!-- Created with Inkscape (http://www.inkscape.org/) -->
+
+<svg
+   xmlns:dc="http://purl.org/dc/elements/1.1/"
+   xmlns:cc="http://creativecommons.org/ns#"
+   xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+   xmlns:svg="http://www.w3.org/2000/svg"
+   xmlns="http://www.w3.org/2000/svg"
+   xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
+   xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
+   width="229.10875mm"
+   height="144.87747mm"
+   viewBox="0 0 229.10875 144.87747"
+   version="1.1"
+   id="svg8"
+   inkscape:version="0.92.3 (2405546, 2018-03-11)"
+   sodipodi:docname="Yee-cell.svg"
+   inkscape:export-filename="/home/steinigk/HEMERA/src/devpic/docs/source/models/Yee-cell.png"
+   inkscape:export-xdpi="96"
+   inkscape:export-ydpi="96">
+  <defs
+     id="defs2" />
+  <sodipodi:namedview
+     id="base"
+     pagecolor="#ffffff"
+     bordercolor="#666666"
+     borderopacity="1.0"
+     inkscape:pageopacity="0.0"
+     inkscape:pageshadow="2"
+     inkscape:zoom="3.6459443"
+     inkscape:cx="417.9622"
+     inkscape:cy="306.80375"
+     inkscape:document-units="mm"
+     inkscape:current-layer="layer1"
+     showgrid="true"
+     inkscape:object-paths="true"
+     inkscape:snap-grids="true"
+     inkscape:snap-intersection-paths="true"
+     inkscape:window-width="2560"
+     inkscape:window-height="1362"
+     inkscape:window-x="1920"
+     inkscape:window-y="24"
+     inkscape:window-maximized="1"
+     inkscape:snap-bbox="true"
+     inkscape:bbox-paths="true"
+     inkscape:bbox-nodes="true"
+     inkscape:snap-nodes="true"
+     fit-margin-top="0"
+     fit-margin-left="0"
+     fit-margin-right="0"
+     fit-margin-bottom="0"
+     inkscape:snap-to-guides="false">
+    <inkscape:grid
+       type="xygrid"
+       id="grid10"
+       originx="18.520832"
+       originy="-139.54961" />
+  </sodipodi:namedview>
+  <metadata
+     id="metadata5">
+    <rdf:RDF>
+      <cc:Work
+         rdf:about="">
+        <dc:format>image/svg+xml</dc:format>
+        <dc:type
+           rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
+        <dc:title />
+      </cc:Work>
+    </rdf:RDF>
+  </metadata>
+  <g
+     inkscape:label="Layer 1"
+     inkscape:groupmode="layer"
+     id="layer1"
+     transform="translate(18.520832,-12.572913)">
+    <path
+       style="fill:none;stroke:#000000;stroke-width:0.70555556;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
+       d="M 71.437499,21.833333 H 201.08333 V 98.5625 H 71.4375 Z"
+       id="path22"
+       inkscape:connector-curvature="0"
+       sodipodi:nodetypes="ccccc" />
+    <path
+       style="fill:none;stroke:#000000;stroke-width:0.70555556;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
+       d="m 26.458333,66.8125 v 76.72917 H 156.10417 V 66.812497 Z"
+       id="path14"
+       inkscape:connector-curvature="0"
+       sodipodi:nodetypes="ccccc" />
+    <path
+       style="fill:none;stroke:#000000;stroke-width:0.70555556;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
+       d="M 26.458333,66.8125 71.437499,21.833333"
+       id="path16"
+       inkscape:connector-curvature="0" />
+    <path
+       inkscape:connector-curvature="0"
+       id="path18"
+       d="M 156.10417,66.8125 201.08333,21.833333"
+       style="fill:none;stroke:#000000;stroke-width:0.70555556;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" />
+    <path
+       style="fill:none;stroke:#000000;stroke-width:0.70555556;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
+       d="M 156.10417,143.54167 201.08333,98.5625"
+       id="path20"
+       inkscape:connector-curvature="0" />
+    <path
+       inkscape:connector-curvature="0"
+       id="path24"
+       d="M 26.458333,143.54167 71.437499,98.5625"
+       style="fill:none;stroke:#000000;stroke-width:0.70555556;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" />
+    <g
+       transform="translate(-66.469139,-8.2672484)"
+       id="g1119">
+      <rect
+         y="52.104664"
+         x="137.48793"
+         height="9.8319569"
+         width="9.5045872"
+         id="rect1111"
+         style="fill:#ffffff;fill-opacity:0.76470588;stroke:none;stroke-width:0.72912222" />
+      <text
+         id="text1117"
+         y="59.819954"
+         x="136.44923"
+         style="font-style:normal;font-weight:normal;font-size:10.58333302px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#ff0000;fill-opacity:1;stroke:none;stroke-width:0.26458332"
+         xml:space="preserve"><tspan
+           style="fill:#ff0000;stroke-width:0.26458332"
+           y="59.819954"
+           x="136.44923"
+           id="tspan1115"
+           sodipodi:role="line">E<tspan
+   id="tspan1113"
+   style="font-size:64.99999762%;baseline-shift:sub;fill:#ff0000">x</tspan></tspan></text>
+    </g>
+    <g
+       id="g991"
+       transform="translate(15.741055,8.7687409)">
+      <rect
+         y="90.624702"
+         x="28.872112"
+         height="10.741307"
+         width="41.57132"
+         id="rect986"
+         style="opacity:1;fill:#ffffff;fill-opacity:0.76470588;stroke:none;stroke-width:0.26458332;stroke-miterlimit:4;stroke-dasharray:0.79374999, 0.79374999;stroke-dashoffset:0;stroke-opacity:1" />
+      <text
+         xml:space="preserve"
+         style="font-style:normal;font-weight:normal;font-size:10.58333302px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.26458332"
+         x="28.406536"
+         y="99.006912"
+         id="text389"><tspan
+           sodipodi:role="line"
+           id="tspan387"
+           x="28.406536"
+           y="99.006912"
+           style="stroke-width:0.26458332">(i,j,k+1)</tspan></text>
+    </g>
+    <path
+       inkscape:connector-curvature="0"
+       id="path34"
+       d="m 48.947915,44.32291 v 76.72918 H 178.59376 V 44.32291 Z"
+       style="fill:none;stroke:#999999;stroke-width:0.35277778;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
+       sodipodi:nodetypes="ccccc" />
+    <g
+       transform="translate(-59.662657,20.150707)"
+       id="g1043">
+      <rect
+         style="fill:#ffffff;fill-opacity:0.76470588;stroke:none;stroke-width:0.78120512"
+         id="rect1035"
+         width="9.5247412"
+         height="11.26288"
+         x="188.51474"
+         y="74.972206" />
+      <text
+         xml:space="preserve"
+         style="font-style:normal;font-weight:normal;font-size:10.58333302px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#ff0000;fill-opacity:1;stroke:none;stroke-width:0.26458332"
+         x="187.47604"
+         y="82.6875"
+         id="text1041"><tspan
+           sodipodi:role="line"
+           id="tspan1039"
+           x="187.47604"
+           y="82.6875"
+           style="fill:#ff0000;stroke-width:0.26458332">E<tspan
+   style="font-size:64.99999762%;baseline-shift:sub"
+   id="tspan1037">y</tspan></tspan></text>
+    </g>
+    <path
+       style="fill:none;stroke:#999999;stroke-width:0.35277778;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
+       d="M 26.458078,105.17707 H 156.10417 L 201.08333,60.197917 H 71.437337 Z"
+       id="path36"
+       inkscape:connector-curvature="0"
+       sodipodi:nodetypes="ccccc" />
+    <path
+       style="fill:none;stroke:#999999;stroke-width:0.35277778;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
+       d="M 91.281226,143.54167 136.26025,98.5625 V 21.833333 L 91.281226,66.8125 Z"
+       id="path38"
+       inkscape:connector-curvature="0" />
+    <path
+       style="fill:none;stroke:#999999;stroke-width:0.35277778;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
+       d="m 113.77073,121.05209 1e-5,-76.729181"
+       id="path40"
+       inkscape:connector-curvature="0" />
+    <path
+       style="fill:none;stroke:#999999;stroke-width:0.35277778;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
+       d="M 136.26025,60.197917 91.281226,105.17707"
+       id="path42"
+       inkscape:connector-curvature="0" />
+    <path
+       style="fill:none;stroke:#999999;stroke-width:0.35277778;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
+       d="M 178.59375,82.687492 48.947913,82.687286"
+       id="path44"
+       inkscape:connector-curvature="0"
+       sodipodi:nodetypes="cc" />
+    <text
+       xml:space="preserve"
+       style="font-style:normal;font-weight:normal;font-size:10.58333302px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.26458332"
+       x="12.848831"
+       y="154.12494"
+       id="text48"><tspan
+         sodipodi:role="line"
+         id="tspan46"
+         x="12.848831"
+         y="154.12494"
+         style="stroke-width:0.26458332">(i,j,k)</tspan></text>
+    <g
+       id="g129"
+       transform="translate(1e-6,-6.657552e-6)">
+      <path
+         inkscape:connector-curvature="0"
+         id="path91"
+         d="m -15.875,123.69792 -1.322916,5.29167 1.322916,-1.32292 1.322917,1.32292 z"
+         style="fill:#000000;stroke:#000000;stroke-width:0.35277778;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" />
+      <path
+         inkscape:connector-curvature="0"
+         id="path93"
+         d="m -15.875,127.66667 v 15.875"
+         style="fill:none;stroke:#000000;stroke-width:0.70555556;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" />
+      <path
+         style="fill:none;stroke:#000000;stroke-width:0.70555556;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
+         d="M -1.6752462e-8,143.54167 H -15.875"
+         id="path95"
+         inkscape:connector-curvature="0" />
+      <path
+         style="fill:#000000;stroke:#000000;stroke-width:0.35277778;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
+         d="M 3.96875,143.54167 -1.32292,142.21876 -1.6752462e-8,143.54167 -1.32292,144.86459 Z"
+         id="path99"
+         inkscape:connector-curvature="0" />
+      <path
+         inkscape:connector-curvature="0"
+         id="path101"
+         d="M -7.7770033,135.44367 -12.454223,138.25 h 1.87089 v 1.87089 z"
+         style="fill:#000000;stroke:#000000;stroke-width:0.35277778;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" />
+      <path
+         sodipodi:nodetypes="cc"
+         inkscape:connector-curvature="0"
+         id="path103"
+         d="m -10.26234,137.92901 -5.61266,5.61266"
+         style="fill:none;stroke:#000000;stroke-width:0.70555556;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" />
+      <text
+         id="text107"
+         y="122.87109"
+         x="-18.825724"
+         style="font-style:normal;font-weight:normal;font-size:10.58333302px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.26458332"
+         xml:space="preserve"><tspan
+           style="stroke-width:0.26458332"
+           y="122.87109"
+           x="-18.825724"
+           id="tspan105"
+           sodipodi:role="line">x</tspan></text>
+      <text
+         id="text111"
+         y="145.33484"
+         x="6.0476193"
+         style="font-style:normal;font-weight:normal;font-size:10.58333302px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.26458332"
+         xml:space="preserve"><tspan
+           style="stroke-width:0.26458332"
+           y="145.33484"
+           x="6.0476193"
+           id="tspan109"
+           sodipodi:role="line">y</tspan></text>
+      <text
+         id="text115"
+         y="135.60417"
+         x="-7.0693359"
+         style="font-style:normal;font-weight:normal;font-size:10.58333302px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.26458332"
+         xml:space="preserve"><tspan
+           style="stroke-width:0.26458332"
+           y="135.60417"
+           x="-7.0693359"
+           id="tspan113"
+           sodipodi:role="line">z</tspan></text>
+    </g>
+    <g
+       transform="rotate(45,105.91929,-102.30664)"
+       id="g135">
+      <path
+         inkscape:connector-curvature="0"
+         id="path131"
+         d="m 223.57291,87.979167 -1.32291,3.96875 1.32291,-1.322917 1.32292,1.322917 z"
+         style="fill:#ff0000;stroke:#ff0000;stroke-width:0.35277778;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" />
+      <path
+         inkscape:connector-curvature="0"
+         id="path133"
+         d="m 223.57291,90.625 v 5.291667"
+         style="fill:none;stroke:#ff0000;stroke-width:1.05833328;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" />
+    </g>
+    <g
+       transform="translate(-109.80208,25.135413)"
+       id="g153">
+      <path
+         inkscape:connector-curvature="0"
+         id="path149"
+         d="m 223.57291,87.979167 -1.32291,3.96875 1.32291,-1.322917 1.32292,1.322917 z"
+         style="fill:#0000ff;stroke:#0000ff;stroke-width:0.35277778;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" />
+      <path
+         inkscape:connector-curvature="0"
+         id="path151"
+         d="m 223.57291,90.625 v 5.291667"
+         style="fill:none;stroke:#0000ff;stroke-width:1.05833328;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" />
+    </g>
+    <g
+       id="g5027"
+       transform="translate(64.011009,17.504893)">
+      <rect
+         y="88.995102"
+         x="27.270241"
+         height="9.8319607"
+         width="9.5319757"
+         id="rect4931"
+         style="fill:#ffffff;fill-opacity:0.76470588;stroke:none;stroke-width:0.73017216" />
+      <text
+         id="text157"
+         y="96.710396"
+         x="26.231544"
+         style="font-style:normal;font-weight:normal;font-size:10.58333302px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#0000ff;fill-opacity:1;stroke:none;stroke-width:0.26458332"
+         xml:space="preserve"><tspan
+           style="fill:#0000ff;stroke-width:0.26458332"
+           y="96.710396"
+           x="26.231544"
+           id="tspan155"
+           sodipodi:role="line">B<tspan
+   id="tspan159"
+   style="font-size:64.99999762%;baseline-shift:sub;fill:#0000ff">z</tspan></tspan></text>
+    </g>
+    <g
+       id="g4991"
+       transform="translate(-59.209049,20.339692)">
+      <rect
+         y="58.379055"
+         x="100.21947"
+         height="11.262877"
+         width="10.082847"
+         id="rect4937"
+         style="fill:#ffffff;fill-opacity:0.76470588;stroke:none;stroke-width:0.80376667" />
+      <text
+         xml:space="preserve"
+         style="font-style:normal;font-weight:normal;font-size:10.58333302px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#0000ff;fill-opacity:1;stroke:none;stroke-width:0.26458332"
+         x="99.180771"
+         y="66.094345"
+         id="text183"><tspan
+           sodipodi:role="line"
+           id="tspan181"
+           x="99.180771"
+           y="66.094345"
+           style="fill:#0000ff;stroke-width:0.26458332">B<tspan
+   id="tspan185"
+   style="font-size:64.99999762%;baseline-shift:sub">y</tspan></tspan></text>
+    </g>
+    <g
+       id="g203"
+       transform="rotate(90,133.61458,53.583336)">
+      <path
+         style="fill:#ff0000;stroke:#ff0000;stroke-width:0.35277778;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
+         d="m 223.57291,87.979167 -1.32291,3.96875 1.32291,-1.322917 1.32292,1.322917 z"
+         id="path199"
+         inkscape:connector-curvature="0" />
+      <path
+         style="fill:none;stroke:#ff0000;stroke-width:1.05833328;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
+         d="m 223.57291,90.625 v 5.291667"
+         id="path201"
+         inkscape:connector-curvature="0" />
+    </g>
+    <g
+       id="g4967"
+       transform="translate(-97.233489,71.215291)">
+      <rect
+         y="74.972206"
+         x="188.51474"
+         height="11.26288"
+         width="9.5247412"
+         id="rect4911"
+         style="fill:#ffffff;fill-opacity:0.76470588;stroke:none;stroke-width:0.78120512" />
+      <text
+         id="text221"
+         y="82.6875"
+         x="187.47604"
+         style="font-style:normal;font-weight:normal;font-size:10.58333302px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#ff0000;fill-opacity:1;stroke:none;stroke-width:0.26458332"
+         xml:space="preserve"><tspan
+           style="fill:#ff0000;stroke-width:0.26458332"
+           y="82.6875"
+           x="187.47604"
+           id="tspan219"
+           sodipodi:role="line">E<tspan
+   id="tspan223"
+   style="font-size:64.99999762%;baseline-shift:sub">y</tspan></tspan></text>
+    </g>
+    <g
+       id="g4985"
+       transform="translate(-111.0296,54.395333)">
+      <text
+         xml:space="preserve"
+         style="font-style:normal;font-weight:normal;font-size:10.58333302px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#ff0000;fill-opacity:1;stroke:none;stroke-width:0.26458332"
+         x="136.44923"
+         y="59.819954"
+         id="text271"><tspan
+           sodipodi:role="line"
+           id="tspan269"
+           x="136.44923"
+           y="59.819954"
+           style="fill:#ff0000;stroke-width:0.26458332">E<tspan
+   style="font-size:64.99999762%;baseline-shift:sub;fill:#ff0000"
+   id="tspan267">x</tspan></tspan></text>
+    </g>
+    <g
+       id="g315"
+       transform="translate(-197.11458,9.260413)">
+      <path
+         style="fill:#ff0000;stroke:#ff0000;stroke-width:0.35277778;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
+         d="m 223.57291,87.979167 -1.32291,3.96875 1.32291,-1.322917 1.32292,1.322917 z"
+         id="path311"
+         inkscape:connector-curvature="0" />
+      <path
+         style="fill:none;stroke:#ff0000;stroke-width:1.05833328;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
+         d="m 223.57291,90.625 v 5.291667"
+         id="path313"
+         inkscape:connector-curvature="0" />
+    </g>
+    <g
+       id="g5003"
+       transform="translate(-65.786152,17.731663)">
+      <rect
+         style="fill:#ffffff;fill-opacity:0.76470588;stroke:none;stroke-width:0.70847356"
+         id="rect4921"
+         width="8.9738703"
+         height="9.8319588"
+         x="114.73407"
+         y="104.64333" />
+      <text
+         xml:space="preserve"
+         style="font-style:normal;font-weight:normal;font-size:10.58333302px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#ff0000;fill-opacity:1;stroke:none;stroke-width:0.26458332"
+         x="113.69537"
+         y="112.35863"
+         id="text329"><tspan
+           sodipodi:role="line"
+           id="tspan327"
+           x="113.69537"
+           y="112.35863"
+           style="fill:#ff0000;stroke-width:0.26458332">E<tspan
+   style="font-size:64.99999762%;baseline-shift:sub"
+   id="tspan325">z</tspan></tspan></text>
+    </g>
+    <g
+       id="g5021"
+       transform="translate(63.708586,9.0382199)">
+      <rect
+         style="fill:#ffffff;fill-opacity:0.76470588;stroke:none;stroke-width:0.75022393"
+         id="rect4929"
+         width="10.062693"
+         height="9.8319607"
+         x="50.062248"
+         y="113.33678" />
+      <text
+         id="text341"
+         y="121.05207"
+         x="49.023552"
+         style="font-style:normal;font-weight:normal;font-size:10.58333302px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#0000ff;fill-opacity:1;stroke:none;stroke-width:0.26458332"
+         xml:space="preserve"><tspan
+           style="fill:#0000ff;stroke-width:0.26458332"
+           y="121.05207"
+           x="49.023552"
+           id="tspan339"
+           sodipodi:role="line">B<tspan
+   style="font-size:64.99999762%;baseline-shift:sub"
+   id="tspan337">x</tspan></tspan></text>
+    </g>
+    <text
+       id="text385"
+       y="154.56941"
+       x="135.56279"
+       style="font-style:normal;font-weight:normal;font-size:10.58333302px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.26458332"
+       xml:space="preserve"><tspan
+         style="stroke-width:0.26458332"
+         y="154.56941"
+         x="135.56279"
+         id="tspan383"
+         sodipodi:role="line">(i,j+1,k)</tspan></text>
+    <g
+       transform="rotate(90,171.97917,15.218752)"
+       id="g1001">
+      <path
+         inkscape:connector-curvature="0"
+         id="path997"
+         d="m 223.57291,87.979167 -1.32291,3.96875 1.32291,-1.322917 1.32292,1.322917 z"
+         style="fill:#ff0000;stroke:#ff0000;stroke-width:0.35277778;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" />
+      <path
+         inkscape:connector-curvature="0"
+         id="path999"
+         d="m 223.57291,90.625 v 5.291667"
+         style="fill:none;stroke:#ff0000;stroke-width:1.05833328;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" />
+    </g>
+    <g
+       id="g1007"
+       transform="rotate(90,216.95833,15.218752)">
+      <path
+         style="fill:#ff0000;stroke:#ff0000;stroke-width:0.35277778;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
+         d="m 223.57291,87.979167 -1.32291,3.96875 1.32291,-1.322917 1.32292,1.322917 z"
+         id="path1003"
+         inkscape:connector-curvature="0" />
+      <path
+         style="fill:none;stroke:#ff0000;stroke-width:1.05833328;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
+         d="m 223.57291,90.625 v 5.291667"
+         id="path1005"
+         inkscape:connector-curvature="0" />
+    </g>
+    <g
+       transform="rotate(90,178.59375,53.583334)"
+       id="g1013">
+      <path
+         inkscape:connector-curvature="0"
+         id="path1009"
+         d="m 223.57291,87.979167 -1.32291,3.96875 1.32291,-1.322917 1.32292,1.322917 z"
+         style="fill:#ff0000;stroke:#ff0000;stroke-width:0.35277778;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" />
+      <path
+         inkscape:connector-curvature="0"
+         id="path1011"
+         d="m 223.57291,90.625 v 5.291667"
+         style="fill:none;stroke:#ff0000;stroke-width:1.05833328;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" />
+    </g>
+    <g
+       transform="translate(-89.295989,-17.420126)"
+       id="g1023">
+      <rect
+         style="fill:#ffffff;fill-opacity:0.76470588;stroke:none;stroke-width:0.78120512"
+         id="rect1015"
+         width="9.5247412"
+         height="11.26288"
+         x="188.51474"
+         y="74.972206" />
+      <text
+         xml:space="preserve"
+         style="font-style:normal;font-weight:normal;font-size:10.58333302px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#ff0000;fill-opacity:1;stroke:none;stroke-width:0.26458332"
+         x="187.47604"
+         y="82.6875"
+         id="text1021"><tspan
+           sodipodi:role="line"
+           id="tspan1019"
+           x="187.47604"
+           y="82.6875"
+           style="fill:#ff0000;stroke-width:0.26458332">E<tspan
+   style="font-size:64.99999762%;baseline-shift:sub"
+   id="tspan1017">y</tspan></tspan></text>
+    </g>
+    <g
+       id="g1033"
+       transform="translate(-44.05224,-62.399293)">
+      <rect
+         y="74.972206"
+         x="188.51474"
+         height="11.26288"
+         width="9.5247412"
+         id="rect1025"
+         style="fill:#ffffff;fill-opacity:0.76470588;stroke:none;stroke-width:0.78120512" />
+      <text
+         id="text1031"
+         y="82.6875"
+         x="187.47604"
+         style="font-style:normal;font-weight:normal;font-size:10.58333302px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#ff0000;fill-opacity:1;stroke:none;stroke-width:0.26458332"
+         xml:space="preserve"><tspan
+           style="fill:#ff0000;stroke-width:0.26458332"
+           y="82.6875"
+           x="187.47604"
+           id="tspan1029"
+           sodipodi:role="line">E<tspan
+   id="tspan1027"
+   style="font-size:64.99999762%;baseline-shift:sub">y</tspan></tspan></text>
+    </g>
+    <g
+       id="g1049"
+       transform="rotate(45,198.53958,-140.67122)">
+      <path
+         style="fill:#ff0000;stroke:#ff0000;stroke-width:0.35277778;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
+         d="m 223.57291,87.979167 -1.32291,3.96875 1.32291,-1.322917 1.32292,1.322917 z"
+         id="path1045"
+         inkscape:connector-curvature="0" />
+      <path
+         style="fill:none;stroke:#ff0000;stroke-width:1.05833328;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
+         d="m 223.57291,90.625 v 5.291667"
+         id="path1047"
+         inkscape:connector-curvature="0" />
+    </g>
+    <g
+       transform="rotate(45,263.3625,15.825142)"
+       id="g1055">
+      <path
+         inkscape:connector-curvature="0"
+         id="path1051"
+         d="m 223.57291,87.979167 -1.32291,3.96875 1.32291,-1.322917 1.32292,1.322917 z"
+         style="fill:#ff0000;stroke:#ff0000;stroke-width:0.35277778;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" />
+      <path
+         inkscape:connector-curvature="0"
+         id="path1053"
+         d="m 223.57291,90.625 v 5.291667"
+         style="fill:none;stroke:#ff0000;stroke-width:1.05833328;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" />
+    </g>
+    <g
+       id="g1061"
+       transform="rotate(45,170.7422,54.189726)">
+      <path
+         style="fill:#ff0000;stroke:#ff0000;stroke-width:0.35277778;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
+         d="m 223.57291,87.979167 -1.32291,3.96875 1.32291,-1.322917 1.32292,1.322917 z"
+         id="path1057"
+         inkscape:connector-curvature="0" />
+      <path
+         style="fill:none;stroke:#ff0000;stroke-width:1.05833328;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
+         d="m 223.57291,90.625 v 5.291667"
+         id="path1059"
+         inkscape:connector-curvature="0" />
+    </g>
+    <g
+       transform="translate(-75.046569,-64.28917)"
+       id="g1071">
+      <rect
+         y="104.64333"
+         x="114.73407"
+         height="9.8319588"
+         width="8.9738703"
+         id="rect1063"
+         style="fill:#ffffff;fill-opacity:0.76470588;stroke:none;stroke-width:0.70847356" />
+      <text
+         id="text1069"
+         y="112.35863"
+         x="113.69537"
+         style="font-style:normal;font-weight:normal;font-size:10.58333302px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#ff0000;fill-opacity:1;stroke:none;stroke-width:0.26458332"
+         xml:space="preserve"><tspan
+           style="fill:#ff0000;stroke-width:0.26458332"
+           y="112.35863"
+           x="113.69537"
+           id="tspan1067"
+           sodipodi:role="line">E<tspan
+   id="tspan1065"
+   style="font-size:64.99999762%;baseline-shift:sub">z</tspan></tspan></text>
+    </g>
+    <g
+       id="g1081"
+       transform="translate(54.599265,-64.28917)">
+      <rect
+         style="fill:#ffffff;fill-opacity:0.76470588;stroke:none;stroke-width:0.70847356"
+         id="rect1073"
+         width="8.9738703"
+         height="9.8319588"
+         x="114.73407"
+         y="104.64333" />
+      <text
+         xml:space="preserve"
+         style="font-style:normal;font-weight:normal;font-size:10.58333302px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#ff0000;fill-opacity:1;stroke:none;stroke-width:0.26458332"
+         x="113.69537"
+         y="112.35863"
+         id="text1079"><tspan
+           sodipodi:role="line"
+           id="tspan1077"
+           x="113.69537"
+           y="112.35863"
+           style="fill:#ff0000;stroke-width:0.26458332">E<tspan
+   style="font-size:64.99999762%;baseline-shift:sub"
+   id="tspan1075">z</tspan></tspan></text>
+    </g>
+    <g
+       transform="translate(63.859681,17.731663)"
+       id="g1091">
+      <rect
+         y="104.64333"
+         x="114.73407"
+         height="9.8319588"
+         width="8.9738703"
+         id="rect1083"
+         style="fill:#ffffff;fill-opacity:0.76470588;stroke:none;stroke-width:0.70847356" />
+      <text
+         id="text1089"
+         y="112.35863"
+         x="113.69537"
+         style="font-style:normal;font-weight:normal;font-size:10.58333302px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#ff0000;fill-opacity:1;stroke:none;stroke-width:0.26458332"
+         xml:space="preserve"><tspan
+           style="fill:#ff0000;stroke-width:0.26458332"
+           y="112.35863"
+           x="113.69537"
+           id="tspan1087"
+           sodipodi:role="line">E<tspan
+   id="tspan1085"
+   style="font-size:64.99999762%;baseline-shift:sub">z</tspan></tspan></text>
+    </g>
+    <g
+       transform="translate(-152.13541,-35.718754)"
+       id="g1097">
+      <path
+         inkscape:connector-curvature="0"
+         id="path1093"
+         d="m 223.57291,87.979167 -1.32291,3.96875 1.32291,-1.322917 1.32292,1.322917 z"
+         style="fill:#ff0000;stroke:#ff0000;stroke-width:0.35277778;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" />
+      <path
+         inkscape:connector-curvature="0"
+         id="path1095"
+         d="m 223.57291,90.625 v 5.291667"
+         style="fill:none;stroke:#ff0000;stroke-width:1.05833328;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" />
+    </g>
+    <g
+       id="g1103"
+       transform="translate(-22.489576,-35.718754)">
+      <path
+         style="fill:#ff0000;stroke:#ff0000;stroke-width:0.35277778;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
+         d="m 223.57291,87.979167 -1.32291,3.96875 1.32291,-1.322917 1.32292,1.322917 z"
+         id="path1099"
+         inkscape:connector-curvature="0" />
+      <path
+         style="fill:none;stroke:#ff0000;stroke-width:1.05833328;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
+         d="m 223.57291,90.625 v 5.291667"
+         id="path1101"
+         inkscape:connector-curvature="0" />
+    </g>
+    <g
+       transform="translate(-67.468742,9.260413)"
+       id="g1109">
+      <path
+         inkscape:connector-curvature="0"
+         id="path1105"
+         d="m 223.57291,87.979167 -1.32291,3.96875 1.32291,-1.322917 1.32292,1.322917 z"
+         style="fill:#ff0000;stroke:#ff0000;stroke-width:0.35277778;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" />
+      <path
+         inkscape:connector-curvature="0"
+         id="path1107"
+         d="m 223.57291,90.625 v 5.291667"
+         style="fill:none;stroke:#ff0000;stroke-width:1.05833328;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" />
+    </g>
+    <g
+       id="g1129"
+       transform="translate(63.595404,9.4161672)">
+      <text
+         xml:space="preserve"
+         style="font-style:normal;font-weight:normal;font-size:10.58333302px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#ff0000;fill-opacity:1;stroke:none;stroke-width:0.26458332"
+         x="136.44923"
+         y="59.819954"
+         id="text1127"><tspan
+           sodipodi:role="line"
+           id="tspan1125"
+           x="136.44923"
+           y="59.819954"
+           style="fill:#ff0000;stroke-width:0.26458332">E<tspan
+   style="font-size:64.99999762%;baseline-shift:sub;fill:#ff0000"
+   id="tspan1123">x</tspan></tspan></text>
+    </g>
+    <g
+       transform="translate(18.616237,54.395333)"
+       id="g1139">
+      <text
+         id="text1137"
+         y="59.819954"
+         x="136.44923"
+         style="font-style:normal;font-weight:normal;font-size:10.58333302px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#ff0000;fill-opacity:1;stroke:none;stroke-width:0.26458332"
+         xml:space="preserve"><tspan
+           style="fill:#ff0000;stroke-width:0.26458332"
+           y="59.819954"
+           x="136.44923"
+           id="tspan1135"
+           sodipodi:role="line">E<tspan
+   id="tspan1133"
+   style="font-size:64.99999762%;baseline-shift:sub;fill:#ff0000">x</tspan></tspan></text>
+    </g>
+    <g
+       id="g1145"
+       transform="translate(-109.80208,-51.593754)">
+      <path
+         style="fill:#0000ff;stroke:#0000ff;stroke-width:0.35277778;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
+         d="m 223.57291,87.979167 -1.32291,3.96875 1.32291,-1.322917 1.32292,1.322917 z"
+         id="path1141"
+         inkscape:connector-curvature="0" />
+      <path
+         style="fill:none;stroke:#0000ff;stroke-width:1.05833328;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
+         d="m 223.57291,90.625 v 5.291667"
+         id="path1143"
+         inkscape:connector-curvature="0" />
+    </g>
+    <g
+       transform="translate(63.708586,-85.756479)"
+       id="g1155">
+      <text
+         xml:space="preserve"
+         style="font-style:normal;font-weight:normal;font-size:10.58333302px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#0000ff;fill-opacity:1;stroke:none;stroke-width:0.26458332"
+         x="49.023552"
+         y="121.05207"
+         id="text1153"><tspan
+           sodipodi:role="line"
+           id="tspan1151"
+           x="49.023552"
+           y="121.05207"
+           style="fill:#0000ff;stroke-width:0.26458332">B<tspan
+   id="tspan1149"
+   style="font-size:64.99999762%;baseline-shift:sub">x</tspan></tspan></text>
+    </g>
+    <g
+       id="g347"
+       transform="rotate(90,142.875,1.9895857)">
+      <path
+         style="fill:#0000ff;stroke:#0000ff;stroke-width:0.35277778;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
+         d="m 223.57291,87.979167 -1.32291,3.96875 1.32291,-1.322917 1.32292,1.322917 z"
+         id="path343"
+         inkscape:connector-curvature="0" />
+      <path
+         style="fill:none;stroke:#0000ff;stroke-width:1.05833328;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
+         d="m 223.57291,90.625 v 5.291667"
+         id="path345"
+         inkscape:connector-curvature="0" />
+    </g>
+    <g
+       transform="rotate(90,207.69792,66.812502)"
+       id="g1161">
+      <path
+         inkscape:connector-curvature="0"
+         id="path1157"
+         d="m 223.57291,87.979167 -1.32291,3.96875 1.32291,-1.322917 1.32292,1.322917 z"
+         style="fill:#0000ff;stroke:#0000ff;stroke-width:0.35277778;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" />
+      <path
+         inkscape:connector-curvature="0"
+         id="path1159"
+         d="m 223.57291,90.625 v 5.291667"
+         style="fill:none;stroke:#0000ff;stroke-width:1.05833328;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" />
+    </g>
+    <g
+       transform="translate(87.634701,20.339692)"
+       id="g1171">
+      <rect
+         style="fill:#ffffff;fill-opacity:0.76470588;stroke:none;stroke-width:0.80376667"
+         id="rect1163"
+         width="10.082847"
+         height="11.262877"
+         x="100.21947"
+         y="58.379055" />
+      <text
+         id="text1169"
+         y="66.094345"
+         x="99.180771"
+         style="font-style:normal;font-weight:normal;font-size:10.58333302px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#0000ff;fill-opacity:1;stroke:none;stroke-width:0.26458332"
+         xml:space="preserve"><tspan
+           style="fill:#0000ff;stroke-width:0.26458332"
+           y="66.094345"
+           x="99.180771"
+           id="tspan1167"
+           sodipodi:role="line">B<tspan
+   style="font-size:64.99999762%;baseline-shift:sub"
+   id="tspan1165">y</tspan></tspan></text>
+    </g>
+    <g
+       id="g289"
+       transform="rotate(45,146.24877,-59.143285)">
+      <path
+         style="fill:#0000ff;stroke:#0000ff;stroke-width:0.35277778;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
+         d="m 223.57291,87.979167 -1.32291,3.96875 1.32291,-1.322917 1.32292,1.322917 z"
+         id="path285"
+         inkscape:connector-curvature="0" />
+      <path
+         style="fill:none;stroke:#0000ff;stroke-width:1.05833328;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
+         d="m 223.57291,90.625 v 5.291667"
+         id="path287"
+         inkscape:connector-curvature="0" />
+    </g>
+    <g
+       transform="translate(114.57792,-42.893971)"
+       id="g1187">
+      <rect
+         style="fill:#ffffff;fill-opacity:0.76470588;stroke:none;stroke-width:0.73017216"
+         id="rect1179"
+         width="9.5319757"
+         height="9.8319607"
+         x="27.270241"
+         y="88.995102" />
+      <text
+         xml:space="preserve"
+         style="font-style:normal;font-weight:normal;font-size:10.58333302px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#0000ff;fill-opacity:1;stroke:none;stroke-width:0.26458332"
+         x="26.231544"
+         y="96.710396"
+         id="text1185"><tspan
+           sodipodi:role="line"
+           id="tspan1183"
+           x="26.231544"
+           y="96.710396"
+           style="fill:#0000ff;stroke-width:0.26458332">B<tspan
+   style="font-size:64.99999762%;baseline-shift:sub;fill:#0000ff"
+   id="tspan1181">z</tspan></tspan></text>
+    </g>
+    <g
+       transform="rotate(45,223.03301,-27.338211)"
+       id="g1177">
+      <path
+         inkscape:connector-curvature="0"
+         id="path1173"
+         d="m 223.57291,87.979167 -1.32291,3.96875 1.32291,-1.322917 1.32292,1.322917 z"
+         style="fill:#0000ff;stroke:#0000ff;stroke-width:0.35277778;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" />
+      <path
+         inkscape:connector-curvature="0"
+         id="path1175"
+         d="m 223.57291,90.625 v 5.291667"
+         style="fill:none;stroke:#0000ff;stroke-width:1.05833328;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" />
+    </g>
+    <text
+       id="text1191"
+       y="69.902748"
+       x="-16.784504"
+       style="font-style:normal;font-weight:normal;font-size:10.58333302px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.26458332"
+       xml:space="preserve"><tspan
+         style="stroke-width:0.26458332"
+         y="69.902748"
+         x="-16.784504"
+         id="tspan1189"
+         sodipodi:role="line">(i+1,j,k)</tspan></text>
+  </g>
+</svg>
diff --git a/docs/source/models/media/dispersion-relation_AOFDTD_3.png b/docs/source/models/media/dispersion-relation_AOFDTD_3.png
new file mode 100644
index 0000000000..4eaf9793f0
Binary files /dev/null and b/docs/source/models/media/dispersion-relation_AOFDTD_3.png differ
diff --git a/docs/source/models/media/dispersion-relation_AOFDTD_Courant-factor.png b/docs/source/models/media/dispersion-relation_AOFDTD_Courant-factor.png
new file mode 100644
index 0000000000..53ab333781
Binary files /dev/null and b/docs/source/models/media/dispersion-relation_AOFDTD_Courant-factor.png differ
diff --git a/docs/source/models/media/dispersion-relation_AOFDTD_sampling.png b/docs/source/models/media/dispersion-relation_AOFDTD_sampling.png
new file mode 100644
index 0000000000..663e9508a0
Binary files /dev/null and b/docs/source/models/media/dispersion-relation_AOFDTD_sampling.png differ
diff --git a/docs/source/models/media/dispersion-relation_Yee.png b/docs/source/models/media/dispersion-relation_Yee.png
new file mode 100755
index 0000000000..f66ac4b09f
Binary files /dev/null and b/docs/source/models/media/dispersion-relation_Yee.png differ
diff --git a/docs/source/models/media/dispersion-relation_Yee_sampling.png b/docs/source/models/media/dispersion-relation_Yee_sampling.png
new file mode 100644
index 0000000000..72db9c3e91
Binary files /dev/null and b/docs/source/models/media/dispersion-relation_Yee_sampling.png differ
diff --git a/docs/source/models/pic.rst b/docs/source/models/pic.rst
index 6b94b986d1..7792d6fdbe 100644
--- a/docs/source/models/pic.rst
+++ b/docs/source/models/pic.rst
@@ -3,7 +3,7 @@
 The Particle-in-Cell Algorithm
 ==============================
 
-.. sectionauthor:: Axel Huebl
+.. sectionauthor:: Axel Huebl, Klaus Steiniger
 
 Please also refer to the textbooks [BirdsallLangdon]_, [HockneyEastwood]_, our :ref:`latest paper on PIConGPU <usage-reference>` and the works in [Huebl2014]_ and [Huebl2019]_ .
 
@@ -13,15 +13,15 @@ System of Equations
 .. math::
 
    \nabla \cdot \mathbf{E} &= \frac{1}{\varepsilon_0}\sum_s \rho_s
-   
+
    \nabla \cdot \mathbf{B} &= 0
-   
+
    \nabla \times \mathbf{E} &= -\frac{\partial \mathbf{B}} {\partial t}
-   
+
    \nabla \times \mathbf{B} &= \mu_0\left(\sum_s \mathbf{J}_s + \varepsilon_0 \frac{\partial \mathbf{E}} {\partial t} \right)
-   
+
 for multiple particle species :math:`s`.
-:math:`\mathbf{E}(t)` represents the electic, :math:`\mathbf{B}(t)` the magnetic, :math:`\rho_s` the charge density and :math:`\mathbf{J}_s(t)` the current density field.
+:math:`\mathbf{E}(t)` represents the electric, :math:`\mathbf{B}(t)` the magnetic, :math:`\rho_s` the charge density and :math:`\mathbf{J}_s(t)` the current density field.
 
 Except for normalization of constants, PIConGPU implements the governing equations in SI units.
 
@@ -61,22 +61,15 @@ Electro-Magnetic PIC Method
 ---------------------------
 
 **Fields** such as :math:`\mathbf{E}(t), \mathbf{B}(t)` and :math:`\mathbf{J}(t)` are discretized on a regular mesh in Eulerian frame of reference (see [EulerLagrangeFrameOfReference]_).
+See :ref:`section Finite-Difference Time-Domain Method <model-AOFDTD>` for a description of how Maxwell's equations are discretized on a mesh in PIConGPU.
 
 The distribution of **Particles** is described by the distribution function :math:`f_s(\mathbf{x},\mathbf{v},t)`.
-This distribution function is sampled by *markers* (commonly referred to as *macro-particles*).
+This distribution function is sampled by *markers*, which are commonly referred to as *macroparticles*.
+These markers represent blobs of incompressible phase fluid moving in phase space.
 The temporal evolution of the distribution function is simulated by advancing the markers over time according to the Vlasov--Maxwell--Equation in Lagrangian frame (see eq. :eq:`VlasovMaxwell` and [EulerLagrangeFrameOfReference]_).
-
-Markers carry a spatial shape of order :math:`n` and a delta-distribution in momentum space.
-In most cases, these shapes are implemented as B-splines and are pre-integrated to *assignment functions* :math:`S` of the form:
-
-.. math::
-
-   S^0(x) = \big\{ \substack{1 \qquad \text{if}~0 \le x \lt 1\\ 0 \qquad \text{else}}
-
-   S^n(x) = \left(S^{n-1} * S^0\right)(x) = \int_{x-1}^x S^{n-1}(\xi) d\xi
-
-PIConGPU implements these up to order :math:`n=4`.
-The three dimensional marker shape is a multiplicative union of B-splines :math:`S^n(x,y,z) = S^n(x) S^n(y) S^n(z)`.
+A marker has a finite size and a velocity, such that it can be regarded as a cloud of particles, whose center of mass is the marker's position and whose mean velocity is the marker's velocity.
+The cloud shape :math:`S^n(x)` of order :math:`n` of a marker describes its charge density distribution.
+See :ref:`section Hierarchy of Charge Assignment Schemes <model-shapes>` for a list of available marker shapes in PIConGPU.
 
 References
 ----------
diff --git a/docs/source/models/shapes.rst b/docs/source/models/shapes.rst
new file mode 100644
index 0000000000..8210020f0e
--- /dev/null
+++ b/docs/source/models/shapes.rst
@@ -0,0 +1,63 @@
+.. _model-shapes:
+
+Hierarchy of Charge Assignment Schemes
+======================================
+
+.. sectionauthor:: Klaus Steiniger
+
+In PIConGPU, the cloud shapes :math:`S^n(x)` are pre-integrated to *assignment functions* :math:`W^n(x)`.
+
+.. math::
+   W^n(x) = \Pi(x) \ast S^n(x) = \int\limits_{-\infty}^{+\infty} \Pi(x^\prime) S^n(x^\prime - x) dx^\prime\,, \text{ where }
+   \Pi(x) = \left\{\begin{array}{ll}
+            0 & |x| \gt \frac{1}{2} \\
+            \frac{1}{2} & |x| = \frac{1}{2} \\
+            1 & |x| \lt \frac{1}{2}
+      \end{array}\right.
+
+is the top-hat function and :math:`\ast` the convolution.
+
+Evaluating an assignment function at a mesh point directly provides the fraction of the marker's charge that is assigned to that point.
+
+The assignment functions are implemented as B-splines.
+The zeroth order assignment function :math:`W^0` is the top-hat function :math:`\Pi`.
+It represents charge assignment to the nearest mesh point only, resulting in a stepwise charge density distribution.
+Therefore, it should not be used.
+The assignment function of order :math:`n` is generated by convolution of the assignment function of order :math:`n-1` with the top-hat function:
+
+.. math::
+   W^n(x) = W^{n-1}(x) \ast \Pi(x) = \int\limits_{-\infty}^{+\infty} W^{n-1}(x^\prime) \Pi(x^\prime - x) dx^\prime\,.
+
+The three-dimensional assignment function is the product of one-dimensional B-splines, :math:`W^n(x,y,z) = W^n(x) W^n(y) W^n(z)`.
+
+PIConGPU implements these up to order :math:`n=4`.
+The naming scheme follows [HockneyEastwood]_, tab. 5-1, p. 144, where the name of a scheme
+is defined by the visual form of its cloud shape :math:`S`.
+
+
+.. table::
+    :widths: auto
+    :name: assignment_schemes_hierarchy
+
+    +---------------------------------------+-------+----------------------------+
+    | Scheme                                | Order | Assignment function        |
+    +=======================================+=======+============================+
+    | NGP (nearest-grid-point)              | 0     | stepwise                   |
+    +---------------------------------------+-------+----------------------------+
+    | CIC (cloud-in-cell)                   | 1     | piecewise linear spline    |
+    +---------------------------------------+-------+----------------------------+
+    | TSC (triangular shaped cloud)         | 2     | piecewise quadratic spline |
+    +---------------------------------------+-------+----------------------------+
+    | PQS (piecewise quadratic cloud shape) | 3     | piecewise cubic spline     |
+    +---------------------------------------+-------+----------------------------+
+    | PCS (piecewise cubic cloud shape)     | 4     | piecewise quartic spline   |
+    +---------------------------------------+-------+----------------------------+
+
+References
+----------
+
+.. [HockneyEastwood]
+        R.W. Hockney, J.W. Eastwood.
+        *Computer Simulation Using Particles*,
+        CRC Press (1988),
+        ISBN 0-85274-392-0
diff --git a/docs/source/prgpatterns/lockstep.rst b/docs/source/prgpatterns/lockstep.rst
index 8b97a818fd..d1f4ed4473 100644
--- a/docs/source/prgpatterns/lockstep.rst
+++ b/docs/source/prgpatterns/lockstep.rst
@@ -45,7 +45,7 @@ Collective Loop
     // `frame` is a list which must be traversed collectively
     while( frame.isValid() )
     {
-        uint32_t const workerIdx = threadIdx.x;
+        uint32_t const workerIdx = cupla::threadIdx( acc ).x;
         using ParticleDomCfg = IdxConfig<
             frameSize,
             numWorker
@@ -67,7 +67,7 @@ Non-Collective Loop
 
 .. code-block:: cpp
 
-    uint32_t const workerIdx = threadIdx.x;
+    uint32_t const workerIdx = cupla::threadIdx( acc ).x;
     using ParticleDomCfg = IdxConfig<
         frameSize,
         numWorkers
@@ -91,7 +91,7 @@ Create a Context Variable
 
 .. code-block:: cpp
 
-    uint32_t const workerIdx = threadIdx.x;
+    uint32_t const workerIdx = cupla::threadIdx( acc ).x;
     using ParticleDomCfg = IdxConfig<
         frameSize,
         numWorkers
@@ -128,7 +128,7 @@ Using a Master Worker
         bool
     );
 
-    uint32_t const workerIdx = threadIdx.x;
+    uint32_t const workerIdx = cupla::threadIdx( acc ).x;
     ForEachIdx<
         IdxConfig<
             1,
@@ -150,4 +150,4 @@ Using a Master Worker
     /* important: synchronize now, in case upcoming operations (with
      * other workers) access that manipulated shared memory section
      */
-    __syncthreads();
+    cupla::__syncthreads( acc );
diff --git a/docs/source/usage/param/core.rst b/docs/source/usage/param/core.rst
index 126fde70f0..97d57ef8db 100644
--- a/docs/source/usage/param/core.rst
+++ b/docs/source/usage/param/core.rst
@@ -106,6 +106,8 @@ species.param
    :path: include/picongpu/param/species.param
    :no-link:
 
+:ref:`Current solver details <usage-params-core-currentdeposition>`.
+
 speciesDefinition.param
 ^^^^^^^^^^^^^^^^^^^^^^^
 
diff --git a/docs/source/usage/param/particles/current.rst b/docs/source/usage/param/particles/current.rst
new file mode 100644
index 0000000000..c957dc143a
--- /dev/null
+++ b/docs/source/usage/param/particles/current.rst
@@ -0,0 +1,62 @@
+.. _usage-params-core-currentdeposition:
+
+Current Deposition
+""""""""""""""""""
+
+The current solver can be set in :ref:`species.param <usage-params-core>` or directly per species in :ref:`speciesDefinition.param <usage-params-core>`.
+
+.. _usage-params-core-particles-currentsolver:
+
+Current Solver
+''''''''''''''
+
+Esirkepov
+~~~~~~~~~
+
+.. doxygenstruct:: picongpu::currentSolver::Esirkepov
+   :project: PIConGPU
+
+EmZ
+~~~
+
+.. doxygenstruct:: picongpu::currentSolver::EmZ
+   :project: PIConGPU
+
+VillaBune
+~~~~~~~~~
+
+.. doxygenstruct:: picongpu::currentSolver::VillaBune
+   :project: PIConGPU
+
+EsirkepovNative
+~~~~~~~~~~~~~~~
+
+.. doxygenstruct:: picongpu::currentSolver::EsirkepovNative
+   :project: PIConGPU
+
+
+.. _usage-params-core-particles-depositionstrategy:
+
+Deposition Strategy
+'''''''''''''''''''
+
+A current solver supports a strategy to change how the algorithm behaves on different compute architectures.
+The strategy is optional, but it can affect performance.
+
+StridedCachedSupercells
+~~~~~~~~~~~~~~~~~~~~~~~
+
+.. doxygenstruct:: picongpu::currentSolver::strategy::StridedCachedSupercells
+   :project: PIConGPU
+
+CachedSupercells
+~~~~~~~~~~~~~~~~
+
+.. doxygenstruct:: picongpu::currentSolver::strategy::CachedSupercells
+   :project: PIConGPU
+
+NonCachedSupercells
+~~~~~~~~~~~~~~~~~~~
+
+.. doxygenstruct:: picongpu::currentSolver::strategy::NonCachedSupercells
+   :project: PIConGPU
diff --git a/docs/source/usage/param/plugins.rst b/docs/source/usage/param/plugins.rst
index 289f5b9960..0eb0365a64 100644
--- a/docs/source/usage/param/plugins.rst
+++ b/docs/source/usage/param/plugins.rst
@@ -68,7 +68,7 @@ pngColorScales.param
    :no-link:
 
 transitionRadiation.param
-^^^^^^^^^^^^^^^^^^^^
+^^^^^^^^^^^^^^^^^^^^^^^^^
 
 .. doxygenfile:: transitionRadiation.param
    :project: PIConGPU
diff --git a/docs/source/usage/plugins.rst b/docs/source/usage/plugins.rst
index 86c1be2a66..2ed481b84d 100644
--- a/docs/source/usage/plugins.rst
+++ b/docs/source/usage/plugins.rst
@@ -7,6 +7,7 @@ Plugins
 Plugin name                                                                          short description
 ==================================================================================== =================================================================================
 :ref:`ADIOS <usage-plugins-ADIOS>` [#f2]_ [#f7]_                                     stores simulation data as openPMD flavoured ADIOS files [Huebl2017]_
+:ref:`openPMD <usage-plugins-openPMD>` [#f2]_ [#f7]_                                 outputs simulation data via the openPMD API
 :ref:`energy histogram <usage-plugins-energyHistogram>` [#f7]_                       energy histograms for electrons and ions
 :ref:`charge conservation <usage-plugins-chargeConservation>` [#f6]_                 maximum difference between electron charge density and div E
 :ref:`checkpoint <usage-plugins-checkpoint>` [#f2]_                                  stores the primary data of the simulation for restarts.
@@ -14,7 +15,6 @@ Plugin name
 :ref:`count per supercell <usage-plugins-countPerSupercell>` [#f3]_                  count macro particles *per supercell*
 :ref:`energy fields <usage-plugins-energyFields>`                                    electromagnetic field energy per time step
 :ref:`energy particles <usage-plugins-energyParticles>` [#f7]_                       kinetic and total energies summed over all electrons and/or ions
-:ref:`HDF5 <usage-plugins-HDF5>` [#f2]_ [#f7]_                                       stores simulation data as openPMD flavoured HDF5 files [Huebl2017]_
 :ref:`ISAAC <usage-plugins-ISAAC>`                                                   interactive 3D live visualization [Matthes2016]_
 :ref:`intensity <usage-plugins-intensity>` [#f1]_ [#f5]_ [#f6]_                      maximum and integrated electric field along the y-direction
 :ref:`particle calorimeter <usage-plugins-particleCalorimeter>` [#f3]_ [#f4]_ [#f7]_ spatially resolved, particle energy detector in infinite distance
@@ -28,6 +28,7 @@ Plugin name
 :ref:`slice field printer <usage-plugins-sliceFieldPrinter>` [#f5]_                  print out a slice of the electric and/or magnetic and/or current field
 :ref:`sum currents <usage-plugins-sumCurrents>`                                      compute the total current summed over all cells
 :ref:`transitionRadiation <usage-plugins-transitionRadiation>`                       compute emitted electromagnetic spectra
+:ref:`xrayScattering <usage-plugins-xrayScattering>`                                 compute SAXS scattering amplitude ( based on `FieldTmp` species density )
 ==================================================================================== =================================================================================
 
 .. rubric:: Footnotes
@@ -123,7 +124,7 @@ If you would like to help in developing those classes for a plugin of your choic
         `DOI:10.1016/j.nima.2013.10.073 <https://doi.org/10.1016/j.nima.2013.10.073>`_
 
 .. [Pausch2018]
-        R. Pausch, A. Debus, A. Huebl, U. Schramma, K. Steiniger, R. Widera, and M. Bussmann.
+        R. Pausch, A. Debus, A. Huebl, U. Schramm, K. Steiniger, R. Widera, and M. Bussmann.
         *Quantitatively consistent computation of coherent and incoherent radiation in particle-in-cell codes - a general form factor formalism for macro-particles*,
         Nuclear Instruments and Methods in Physics Research Section A: Accelerators, Spectrometers, Detectors and Associated Equipment 909, pp. 419-422 (2018)
         `arXiv:1802.03972 <https://arxiv.org/abs/1802.03972>`_, `DOI:10.1016/j.nima.2018.02.020 <https://doi.org/10.1016/j.nima.2018.02.020>`_
diff --git a/docs/source/usage/plugins/chargeConservation.rst b/docs/source/usage/plugins/chargeConservation.rst
index 4ba55db43b..cf5f4b698c 100644
--- a/docs/source/usage/plugins/chargeConservation.rst
+++ b/docs/source/usage/plugins/chargeConservation.rst
@@ -10,7 +10,6 @@ The maximum deviation value multiplied by the cell's volume is printed.
 .. attention::
 
    This plugin assumes a Yee-like divergence E stencil!
-   Do not use it together with other field solvers like *directional splitting* (for the *Lehe* solver it is still correct).
 
 .cfg file
 ^^^^^^^^^
diff --git a/docs/source/usage/plugins/checkpoint.rst b/docs/source/usage/plugins/checkpoint.rst
index b5567bd8c7..d5a4a1be07 100644
--- a/docs/source/usage/plugins/checkpoint.rst
+++ b/docs/source/usage/plugins/checkpoint.rst
@@ -21,18 +21,18 @@ What is the format of the created files?
 
 We write our fields and particles in an open markup called :ref:`openPMD <pp-openPMD>`.
 
-For further details, see the according sections in :ref:`HDF5 <usage-plugins-HDF5>` and :ref:`ADIOS <usage-plugins-ADIOS>`.
+For further details, see the according sections in :ref:`the openPMD API <usage-plugins-openPMD>` and :ref:`ADIOS <usage-plugins-ADIOS>`.
 
 External Dependencies
 ^^^^^^^^^^^^^^^^^^^^^
 
-The plugin is available as soon as the :ref:`libSplash (HDF5) or ADIOS libraries <install-dependencies>` are compiled in.
+The plugin is available as soon as the :ref:`openPMD API or ADIOS libraries <install-dependencies>` are compiled in.
 
 .cfg file
 ^^^^^^^^^
 
 You can use ``--checkpoint.period`` to specify the output period of the created checkpoints.
-Note that this plugin will only be available if libSplash (HDF5) or ADIOS is found during compile configuration.
+Note that this plugin will only be available if the openPMD API, libSplash (HDF5) or ADIOS is found during compile configuration.
 
 ============================================= ======================================================================================
 PIConGPU command line option                  Description
@@ -59,9 +59,9 @@ PIConGPU command line option                  Description
 ``--checkpoint.<IO-backend>.*``               Additional options to control the IO-backend
 ============================================= ======================================================================================
 
-Depending on the available external dependencies (see above), the options for the ``<IO-backend>`` are:
+Depending on the available external dependencies (see above), the options for the ``<IO-backend>`` are:
 
-* :ref:`hdf5 <usage-plugins-HDF5>`
+* :ref:`openPMD <usage-plugins-openPMD>`
 * :ref:`adios <usage-plugins-ADIOS>` (keep in mind the :ref:`note on meta-files <usage-plugins-ADIOS-meta>` for restarts)
 
 Interacting Manually with Checkpoint Data
diff --git a/docs/source/usage/plugins/energyHistogram.rst b/docs/source/usage/plugins/energyHistogram.rst
index 5e1b56b60b..5d816250d4 100644
--- a/docs/source/usage/plugins/energyHistogram.rst
+++ b/docs/source/usage/plugins/energyHistogram.rst
@@ -104,13 +104,13 @@ You can quickly load and interact with the data in Python with:
    eh_data.get_times(species='e')
 
    # load data for a given iteration
-   counts, bins_keV = eh_data.get('e', species_filter='all', iteration=2000)
-
-   # load data for a given time
-   counts, bins_keV = eh_data.get('e', species_filter='all', time=1.3900e-14)
+   counts, bins_keV, _, _ = eh_data.get(species='e', species_filter='all', iteration=2000)
 
    # get data for multiple iterations
-   d, bins, iteration, dt = eh_data.get(species='e', iteration=[200, 400, 8000])
+   counts, bins_keV, iteration, dt = eh_data.get(species='e', iteration=[200, 400, 8000])
+
+   # load data for a given time
+   counts, bins_keV, iteration, dt = eh_data.get(species='e', species_filter='all', time=1.3900e-14)
 
 
 Matplotlib Visualizer
diff --git a/docs/source/usage/plugins/hdf5.rst b/docs/source/usage/plugins/hdf5.rst
deleted file mode 100644
index 2d06dc3af4..0000000000
--- a/docs/source/usage/plugins/hdf5.rst
+++ /dev/null
@@ -1,102 +0,0 @@
-.. _usage-plugins-HDF5:
-
-HDF5
-----
-
-Stores simulation data such as fields and particles along with domain information,
-conversion units etc. as `HDF5 <http://www.hdfgroup.org/HDF5/>`_ files [Huebl2017]_ .
-It uses `libSplash <https://github.com/ComputationalRadiationPhysics/libSplash>`_ for writing HDF5 data. 
-It is used for post-simulation analysis and for **restarts** of the simulation after a crash or an intended stop. 
-
-What is the format of the created HDF5 files?
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-We write our fields and particles in an open markup called **openPMD**.
-You can investigate your files via a large collection of `tools and frameworks <https://github.com/openPMD/openPMD-projects>`_ or use the native HDF5 bindings of your `favorite programming language <https://en.wikipedia.org/wiki/Hierarchical_Data_Format#Interfaces>`_.
-
-**Resources for a quick-start:**
-
-* `online tutorial <http://www.openPMD.org>`_
-* `example files <https://github.com/openPMD/openPMD-example-datasets>`_
-* `written standard <https://github.com/openPMD/openPMD-standard>`_ of the openPMD standard
-* `list of projects <https://github.com/openPMD/openPMD-projects>`_ supporting openPMD files
-
-External Dependencies
-^^^^^^^^^^^^^^^^^^^^^
-
-The plugin is available as soon as the :ref:`libSplash and HDF5 libraries <install-dependencies>` are compiled in.
-
-.param file
-^^^^^^^^^^^
-
-The corresponding ``.param`` file is :ref:`fileOutput.param <usage-params-plugins>`.
-
-One can e.g. disable the output of particles by setting:
-
-.. code-block:: cpp
-
-   /* output all species */
-   using FileOutputParticles = VectorAllSpecies;
-   /* disable */
-   using FileOutputParticles = MakeSeq_t< >;
-
-.cfg file
-^^^^^^^^^
-
-You can use ``--hdf5.period`` and ``--hdf5.file`` to specify the output period and path and name of the created fileset.
-For example, ``--hdf5.period 128 --hdf5.file simData --hdf5.source 'species_all'`` will write only the particle species data to files of the form ``simData_0.h5``, ``simData_128.h5`` in the default simulation output directory every 128 steps.
-Note that this plugin will only be available if libSplash and HDF5 is found during compile configuration.
-
-============================ ====================================================================
-PIConGPU command line option Description
-============================ ====================================================================
-``--hdf5.period``            Period after which simulation data should be stored on disk.
-``--hdf5.file``              Relative or absolute fileset prefix for simulation data.
-                             If relative, files are stored under ``simOutput/``.
-``--hdf5.source``            Select data sources to dump. Default is ``species_all,fields_all``, 
-                             which dumps all fields and particle species.
-============================ ====================================================================
-
-.. note::
-
-   This plugin is a multi plugin. 
-   Command line parameter can be used multiple times to create e.g. dumps with different dumping period.
-   In the case where a optional parameter with a default value is explicitly defined the parameter will be always passed to the instance of the multi plugin where the parameter is not set.
-   e.g. 
-
-   .. code-block:: bash
-
-      --hdf5.period 128 --hdf5.file simData1 
-      --hdf5.period 1000 --hdf5.file simData2 --hdf5.source 'species_all'
-
-   creates two plugins:
- 
-   #. dump **all species data** each 128th time step.
-   #. dump **all fields and species data** (this is the default) data each 1000th time step.
-
-Memory Complexity
-^^^^^^^^^^^^^^^^^
-
-Accelerator
-"""""""""""
-
-no extra allocations.
-
-Host
-""""
-
-During I/O, each complete particle species is allocated one after an other.
-
-Additional Tools
-^^^^^^^^^^^^^^^^
-
-See our :ref:`openPMD <pp-openPMD>` chapter.
-
-References
-^^^^^^^^^^
-
-.. [Huebl2017]
-        A. Huebl, R. Widera, F. Schmitt, A. Matthes, N. Podhorszki, J.Y. Choi, S. Klasky, and M. Bussmann.
-        *On the Scalability of Data Reduction Techniques in Current and Upcoming HPC Systems from an Application Perspective.*
-        ISC High Performance Workshops 2017, LNCS 10524, pp. 15-29 (2017),
-        `arXiv:1706.00522 <https://arxiv.org/abs/1706.00522>`_, `DOI:10.1007/978-3-319-67630-2_2 <https://doi.org/10.1007/978-3-319-67630-2_2>`_
diff --git a/docs/source/usage/plugins/openPMD.cfg b/docs/source/usage/plugins/openPMD.cfg
new file mode 100644
index 0000000000..f584925c70
--- /dev/null
+++ b/docs/source/usage/plugins/openPMD.cfg
@@ -0,0 +1,21 @@
+TBG_openPMD="--openPMD.period 100   \
+             --openPMD.file simOutput \
+             --openPMD.ext bp \
+             --openPMD.json '{ \
+                 \"adios2\": { \
+                   \"dataset\": { \
+                     \"operators\": [ \
+                       { \
+                         \"type\": \"bzip2\" \
+                       } \
+                     ] \
+                   }, \
+                   \"engine\": { \
+                     \"type\": \"file\", \
+                     \"parameters\": { \
+                       \"BufferGrowthFactor\": \"1.2\", \
+                       \"InitialBufferSize\": \"2GB\" \
+                     } \
+                   } \
+                 } \
+               }'"
\ No newline at end of file
diff --git a/docs/source/usage/plugins/openPMD.rst b/docs/source/usage/plugins/openPMD.rst
new file mode 100644
index 0000000000..45edf8f0ee
--- /dev/null
+++ b/docs/source/usage/plugins/openPMD.rst
@@ -0,0 +1,152 @@
+.. _usage-plugins-openPMD:
+
+openPMD
+-------
+
+Stores simulation data such as fields and particles according to the `openPMD standard <https://github.com/openPMD/openPMD-standard>`_ using the `openPMD API <https://openpmd-api.readthedocs.io>`_.
+
+External Dependencies
+^^^^^^^^^^^^^^^^^^^^^
+
+The plugin is available as soon as the :ref:`openPMD API <install-dependencies>` is compiled in.
+If the openPMD API is found in version 0.13.0 or greater, PIConGPU will support streaming IO via openPMD.
+
+.param file
+^^^^^^^^^^^
+
+The corresponding ``.param`` file is :ref:`fileOutput.param <usage-params-plugins>`.
+
+One can e.g. disable the output of particles by setting:
+
+.. code-block:: cpp
+
+   /* output all species */
+   using FileOutputParticles = VectorAllSpecies;
+   /* disable */
+   using FileOutputParticles = MakeSeq_t< >;
+
+.cfg file
+^^^^^^^^^
+
+You can use ``--openPMD.period`` to specify the output period.
+The base filename is specified via ``--openPMD.file``.
+The openPMD API will parse the file name to decide the chosen backend and iteration layout:
+
+* The filename extension will determine the backend.
+* The openPMD API will either create one file encompassing all iterations (group-based iteration layout) or one file per iteration (file-based iteration layout).
+  The filename will be searched for a pattern describing how to derive a concrete iteration's filename.
+  If no such pattern is found, the group-based iteration layout will be chosen.
+  Please refer to the documentation of the openPMD API for further information.
+
+In order to set defaults for these values, two further options control the filename:
+
+* ``--openPMD.ext`` sets the filename extension.
+  Possible extensions include ``bp`` for the ADIOS backends (default), ``h5`` for HDF5 and ``sst`` for Streaming via ADIOS2/SST.
+  If the openPMD API has been built with support for the ADIOS1 and ADIOS2 backends, ADIOS2 will take precedence over ADIOS1.
+  This behavior can be overridden by setting the environment variable ``OPENPMD_BP_BACKEND=ADIOS1``.
+* ``--openPMD.infix`` sets the filename pattern that controls the iteration layout, default is ``_%06T`` for a six-digit number specifying the iteration.
+  Leave empty to pick group-based iteration layout.
+  Since passing an empty string may be tricky in some workflows, specifying ``--openPMD.infix=NULL`` is also possible.
+
+  Note that streaming IO requires group-based iteration layout in openPMD, i.e. ``--openPMD.infix=NULL`` is mandatory.
+  If PIConGPU detects a streaming backend (e.g. by ``--openPMD.ext=sst``), it will automatically set ``--openPMD.infix=NULL``, overriding the user's choice.
+  Note however that the ADIOS2 backend can also be selected via ``--openPMD.json`` and via environment variables which PIConGPU does not check.
+  It is hence recommended to set ``--openPMD.infix=NULL`` explicitly.
+
+For example, ``--openPMD.period 128 --openPMD.file simData --openPMD.source 'species_all'`` will write only the particle species data to files of the form ``simData_000000.bp``, ``simData_000128.bp`` in the default simulation output directory every 128 steps.
+Note that this plugin will only be available if the openPMD API is found during compile configuration.
+
+openPMD backend-specific settings may be controlled via two mechanisms:
+
+* Environment variables.
+  Please refer to the backends' documentations for information on environment variables understood by the backends.
+* Backend-specific runtime parameters may be set via JSON in the openPMD API.
+  PIConGPU exposes this via the command line option ``--openPMD.json``.
+  Please refer to the openPMD API's documentation for further information.
+
+The JSON parameter may be passed directly as a string, or by filename.
+The latter case is distinguished by prepending the filename with an at-sign ``@``.
+Specifying a JSON-formatted string from within a ``.cfg`` file can be tricky due to colliding escape mechanisms.
+An example for a well-escaped JSON string as part of a ``.cfg`` file is found below.
+
+.. literalinclude:: openPMD.cfg
+
+PIConGPU further defines an **extended format for JSON options** that may alternatively be used in order to pass dataset-specific configurations.
+For each backend ``<backend>``, the backend-specific dataset configuration found under ``config["<backend>"]["dataset"]`` may take the form of a JSON list of patterns: ``[<pattern_1>, <pattern_2>, …]``.
+
+Each such pattern ``<pattern_i>`` is a JSON object with key ``cfg`` and optional key ``select``: ``{"select": <pattern>, "cfg": <cfg>}``.
+
+In here, ``<pattern>`` is a regex or a list of regexes, as used by POSIX ``grep -E``.
+``<cfg>`` is a configuration that will be forwarded as-is to openPMD.
+
+The single patterns will be processed in top-down manner, selecting the first matching pattern found in the list.
+The regexes will be matched against the openPMD dataset path within the iteration (e.g. ``E/x`` or ``particles/.*/position/.*``), considering full matches only.
+
+The **default configuration** is specified by omitting the ``select`` key.
+Specifying more than one default is an error.
+If no pattern matches a dataset, the default configuration is chosen if specified, or an empty JSON object ``{}`` otherwise.
+
+A full example:
+
+.. literalinclude:: openPMD_extended_config.json
+
+Two data preparation strategies are available for downloading particle data off compute devices.
+
+* Set ``--openPMD.dataPreparationStrategy doubleBuffer`` for use of the strategy that has been optimized for use with ADIOS-based backends.
+  The alias ``--openPMD.dataPreparationStrategy adios`` may be used.
+  This strategy requires at least 2x the GPU main memory on the host side.
+  This is the default.
+* Set ``--openPMD.dataPreparationStrategy mappedMemory`` for use of the strategy that has been optimized for use with HDF5-based backends.
+  This strategy has a small host-side memory footprint (<< GPU main memory).
+  The alias ``--openPMD.dataPreparationStrategy hdf5`` may be used.
+
+===================================== ====================================================================================================================================================
+PIConGPU command line option          description
+===================================== ====================================================================================================================================================
+``--openPMD.period``                  Period after which simulation data should be stored on disk.
+``--openPMD.source``                  Select data sources to dump. Default is ``species_all,fields_all``, which dumps all fields and particle species.
+``--openPMD.compression``             Legacy parameter to set data transform compression method to be used for ADIOS1 backend until it implements setting compression from JSON config.
+``--openPMD.file``                    Relative or absolute openPMD file prefix for simulation data. If relative, files are stored under ``simOutput``. 
+``--openPMD.ext``                     openPMD filename extension (this controls the backend picked by the openPMD API).
+``--openPMD.infix``                   openPMD filename infix (use to pick file- or group-based layout in openPMD). Set to NULL to keep empty (e.g. to pick group-based iteration layout).
+``--openPMD.json``                    Set backend-specific parameters for openPMD backends in JSON format.
+``--openPMD.dataPreparationStrategy`` Strategy for preparation of particle data ('doubleBuffer' or 'mappedMemory'). Aliases 'adios' and 'hdf5' may be used respectively.
+===================================== ====================================================================================================================================================
+
+.. note::
+
+   This plugin is a multi plugin. 
+   Command line parameter can be used multiple times to create e.g. dumps with different dumping period.
+   In the case where an optional parameter with a default value is explicitly defined, the parameter will always be passed to the instance of the multi plugin where the parameter is not set.
+   e.g.
+
+   .. code-block:: bash
+
+      --openPMD.period 128 --openPMD.file simData1 --openPMD.source 'species_all' 
+      --openPMD.period 1000 --openPMD.file simData2 --openPMD.source 'fields_all' --openPMD.ext h5
+
+   creates two plugins:
+
+   #. dump all species data each 128th time step, use HDF5 backend.
+   #. dump all field data each 1000th time step, use the default ADIOS backend.
+
+Memory Complexity
+^^^^^^^^^^^^^^^^^
+
+Accelerator
+"""""""""""
+
+no extra allocations.
+
+Host
+""""
+
+As soon as the openPMD plugin is compiled in, one extra ``mallocMC`` heap for the particle buffer is permanently reserved.
+During I/O, particle attributes are allocated one after another.
+Using ``--openPMD.dataPreparationStrategy doubleBuffer`` (default) will require at least 2x the GPU memory on the host side.
+For a smaller host side memory footprint (<< GPU main memory) pick ``--openPMD.dataPreparationStrategy mappedMemory``.
+
+Additional Tools
+^^^^^^^^^^^^^^^^
+
+See our :ref:`openPMD <pp-openPMD>` chapter.
diff --git a/docs/source/usage/plugins/openPMD_extended_config.json b/docs/source/usage/plugins/openPMD_extended_config.json
new file mode 100644
index 0000000000..e2dad6900b
--- /dev/null
+++ b/docs/source/usage/plugins/openPMD_extended_config.json
@@ -0,0 +1,35 @@
+{
+  "adios2": {
+    "engine": {
+      "usesteps": true,
+      "parameters": {
+        "InitialBufferSize": "2Gb",
+        "Profile": "On"
+      }
+    },
+    "dataset": [
+      {
+        "cfg": {
+          "operators": [
+            {
+              "type": "blosc",
+              "parameters": {
+                "clevel": "1",
+                "doshuffle": "BLOSC_BITSHUFFLE"
+              }
+            }
+          ]
+        }
+      },
+      {
+        "select": [
+          ".*positionOffset.*",
+          ".*particlePatches.*"
+        ],
+        "cfg": {
+          "operators": []
+        }
+      }
+    ]
+  }
+}
diff --git a/docs/source/usage/plugins/particleMergerProbabilistic.rst b/docs/source/usage/plugins/particleMergerProbabilistic.rst
new file mode 100644
index 0000000000..e070e168e8
--- /dev/null
+++ b/docs/source/usage/plugins/particleMergerProbabilistic.rst
@@ -0,0 +1,69 @@
+.. _usage-plugins-particleMergerProbabilistic:
+
+Particle Merger Probabilistic Version
+-------------------------------------
+
+Merges macro particles that are close in phase space to reduce computational load.
+This is a probabilistic variant of the Voronoi-based merging algorithm. The base Voronoi algorithm
+and the probabilistic version differ in their parameters: instead of thresholds on the spread in position and momentum,
+the probabilistic version uses a ratio of deleted particles.
+
+
+.param file
+^^^^^^^^^^^
+
+There is currently one compile-time parameter in :ref:`particleMerging.param <usage-params-plugins>`:
+
+===================== ====================================================================================
+Compile-Time Option   Description
+===================== ====================================================================================
+``MAX_VORONOI_CELLS`` Maximum number of active Voronoi cells per supercell.
+                      If the number of active Voronoi cells reaches this limit merging events are dropped.
+===================== ====================================================================================
+
+.cfg file
+^^^^^^^^^
+
+====================================================== ========================================================================================================================
+PIConGPU command line option                           Description
+====================================================== ========================================================================================================================
+``--<species>_randomizedMerger.period``                The output periodicity of the plugin. A value of ``100`` would mean an output at simulation time step *0, 100, 200, ...*.
+											 
+``--<species>_randomizedMerger.ratioDeletedParticles`` The ratio of particles to delete. The parameter has to be in the range *[0:1]*.
+
+``--<species>_randomizedMerger.maxParticlesToMerge``   Maximum number of macroparticles that can be merged into a single macroparticle.
+
+``--<species>_randomizedMerger.posSpreadThreshold``    Below this threshold of spread in position macroparticles can be merged [unit: cell edge length].
+
+``--<species>_randomizedMerger.momSpreadThreshold``    Below this absolute threshold of spread in momentum macroparticles can be merged [unit: :math:`m_{e-} \cdot c`].
+====================================================== ========================================================================================================================
+
+Memory Complexity
+^^^^^^^^^^^^^^^^^
+
+Accelerator
+"""""""""""
+
+no extra allocations, but requires an extra particle attribute per species, ``voronoiCellId``.
+
+Host
+""""
+
+no extra allocations.
+
+Known Limitations
+^^^^^^^^^^^^^^^^^
+
+- this plugin is only available with the CUDA backend
+- this plugin might take a significant amount of time due to not being fully parallelized.
+
+Reference
+^^^^^^^^^
+
+The particle merger implements a macro particle merging algorithm based on:
+
+Luu, P. T., Tueckmantel, T., & Pukhov, A. (2016).
+Voronoi particle merging algorithm for PIC codes.
+Computer Physics Communications, 202, 165-174.
+
+There is a slight deviation from the paper in determining the next subdivision. The implementation always tries to subdivide a Voronoi cell by positions first; momenta are only checked in case the spreads in the positions satisfy the threshold.
diff --git a/docs/source/usage/plugins/phaseSpace.rst b/docs/source/usage/plugins/phaseSpace.rst
index 63a73c2a32..e3fc40f725 100644
--- a/docs/source/usage/plugins/phaseSpace.rst
+++ b/docs/source/usage/plugins/phaseSpace.rst
@@ -8,7 +8,7 @@ This plugin creates a 2D phase space image for a user-given spatial and momentum
 External Dependencies
 ^^^^^^^^^^^^^^^^^^^^^
 
-The plugin is available as soon as the :ref:`libSplash and HDF5 libraries <install-dependencies>` are compiled in.
+The plugin is available as soon as the :ref:`openPMD API <install-dependencies>` is compiled in.
 
 .cfg file
 ^^^^^^^^^
@@ -19,13 +19,13 @@ Example for *y-pz* phase space for the *electron* species (``.cfg`` file macro):
 
    # Calculate a 2D phase space
    # - momentum range in m_e c
-   TGB_ePSypz="--e_phaseSpace.period 10 --e_phaseSpace.filter all --e_phaseSpace.space y --e_phaseSpace.momentum pz --e_phaseSpace.min -1.0 --e_phaseSpace.max 1.0"
+   TBG_ePSypz="--e_phaseSpace.period 10 --e_phaseSpace.filter all --e_phaseSpace.space y --e_phaseSpace.momentum pz --e_phaseSpace.min -1.0 --e_phaseSpace.max 1.0 --e_phaseSpace.ext h5"
 
 
 The distinct options are (assuming a species ``e`` for electrons):
 
 ====================================== ======================================================== ============================
-Option                                 Usage                                     Unit
+Option                                 Usage                                                    Unit
 ====================================== ======================================================== ============================
 ``--e_phaseSpace.period <N>``          calculate each N steps                                   *none*
 ``--e_phaseSpace.filter``              Use filtered particles. Available filters are set up in  *none*
@@ -34,6 +34,7 @@ Option                                 Usage
 ``--e_phaseSpace.momentum <px/py/pz>`` momentum coordinate of the 2D phase space                *none*
 ``--e_phaseSpace.min <ValL>``          minimum of the momentum range                            :math:`m_\mathrm{species} c`
 ``--e_phaseSpace.max <ValR>``          maximum of the momentum range                            :math:`m_\mathrm{species} c`
+``--e_phaseSpace.ext <ext>``           filename extension for openPMD backend                   *none*
 ====================================== ======================================================== ============================
 
 Memory Complexity
@@ -52,11 +53,23 @@ negligible.
 Output
 ^^^^^^
 
-The 2D histograms are stored in ``.hdf5`` files in the ``simOutput/phaseSpace/`` directory.
+The 2D histograms are stored in the ``simOutput/phaseSpace/`` directory, by default in ``.h5`` files.
 A file is created per species, phasespace selection and time step.
 
 Values are given as *charge density* per phase space bin.
-In order to scale to a simpler *charge of particles* per :math:`\mathrm{d}r_i` and :math:`\mathrm{d}p_i` -bin multiply by the cell volume ``dV``.
+In order to scale to a simpler *charge of particles* per :math:`\mathrm{d}r_i` and :math:`\mathrm{d}p_i` -bin multiply by the cell volume ``dV`` (written as an attribute of the openPMD Mesh).
+
+The output writes a number of non-standard custom openPMD attributes:
+
+* ``p_min`` and ``p_max``: The lower and upper bounds for the momentum axis, respectively.
+* ``dr``: The spacing of the spatial axis in PIConGPU units.
+* ``dV``: The volume of a phase space cell. Relates to ``dr`` via ``dV = dp * dr`` where ``dp`` would be the grid spacing along the momentum axis.
+* ``dr_unit``: The SI scaling for the spatial axis. Use this instead of ``gridUnitSI``.
+* ``p_unit``: The SI scaling for the momentum axis. Use this instead of ``gridUnitSI``.
+* ``globalDomainOffset``, ``globalDomainSize`` and ``globalDomainAxisLabels``: Information on the global domain.
+* ``totalDomainOffset``, ``totalDomainSize`` and ``totalDomainAxisLabels``: Information on the total domain.
+  Please consult the `PIConGPU wiki <https://github.com/ComputationalRadiationPhysics/picongpu/wiki/PIConGPU-domain-definitions>`_ for explanations on the meaning of global and total domain.
+* ``sim_unit``: SI scaling for the charge density values. Alias for ``unitSI``.
 
 Analysis Tools
 ^^^^^^^^^^^^^^
@@ -223,7 +236,8 @@ Known Limitations
 - charge deposition uses the counter shape for now (would need one more write to neighbors to evaluate it correctly according to the shape)
 - the user has to define the momentum range in advance
 - the resolution is fixed to ``1024 bins`` in momentum and the number of cells in the selected spatial dimension
-- this plugin does not yet use :ref:`openPMD markup <pp-openPMD>`.
+- While the openPMD standard `has already been updated <https://github.com/openPMD/openPMD-standard/pull/193>`_ to support phase space data, the openPMD API does not yet implement this part.
+  The openPMD attributes ``gridUnitSI`` and ``gridUnitDimension`` can hence not be written correctly yet and should be ignored in favor of the custom attributes written by this plugin.
 
 References
 ^^^^^^^^^^
diff --git a/docs/source/usage/plugins/radiation.rst b/docs/source/usage/plugins/radiation.rst
index 002035a3b4..a47cc8580e 100644
--- a/docs/source/usage/plugins/radiation.rst
+++ b/docs/source/usage/plugins/radiation.rst
@@ -287,6 +287,11 @@ Command line option                       Description
 ``--<species>_radiation.folderRadPerGPU`` Name of the folder, where the GPU specific spectra are stored.
                                           Default: ``radPerGPU``
 ``--<species>_radiation.compression``     If set, the hdf5 output is compressed.
+``--<species>_radiation.numJobs``         Number of independent jobs used for the radiation calculation.
+                                          This option is used to increase the utilization of the device by producing more independent work.
+                                          This option enables accumulation of data in parallel into multiple temporary arrays, thereby increasing the utilization of
+                                          the device by increasing the memory footprint.
+                                          Default: ``2``
 ========================================= ==============================================================================================================================
 
 Memory Complexity
@@ -295,7 +300,8 @@ Memory Complexity
 Accelerator
 """""""""""
 
-each energy bin times each coordinate bin allocates one counter (``float_X``) permanently and on each accelerator.
+Locally, ``numJobs`` times the number of frequencies ``N_omega`` times the number of directions ``N_theta`` result elements are permanently allocated.
+Each result element (amplitude) is a double precision complex number.
 
 Host
 """"
diff --git a/docs/source/usage/plugins/transitionRadiation.rst b/docs/source/usage/plugins/transitionRadiation.rst
index c092af03b6..f76c449571 100644
--- a/docs/source/usage/plugins/transitionRadiation.rst
+++ b/docs/source/usage/plugins/transitionRadiation.rst
@@ -1,7 +1,7 @@
 .. _usage-plugins-transitionRadiation:
 
 Transition Radiation
----------
+--------------------
 
 The spectrally resolved far field radiation created by electrons passing through a metal foil.
 
diff --git a/docs/source/usage/plugins/xrayScattering.rst b/docs/source/usage/plugins/xrayScattering.rst
new file mode 100644
index 0000000000..4806290aa6
--- /dev/null
+++ b/docs/source/usage/plugins/xrayScattering.rst
@@ -0,0 +1,142 @@
+.. _usage-plugins-xrayScattering:
+
+xrayScattering
+--------------
+
+This plugin calculates Small Angle X-ray Scattering (SAXS) patterns from electron density.
+(It uses a density `FieldTmp` as an intermediate step rather than the macro particle distribution directly.)
+This is a species specific plugin and it has to be run separately for each scattering species.
+Since the plugin output is the scattered complex amplitude, contributions from different species can be coherently summed later on. 
+
+.. math::
+
+   \Phi({\vec q}) &= \frac{r_e}{d}  \int_{t} \mathrm{d}t \int_{V} \mathrm{d}V \phi({\vec r}, t) n({\vec r}, t) \\
+   I &= \left| \Phi \right|^2
+
+
+============================== ================================================================================
+Variable                       Meaning
+============================== ================================================================================
+:math:`\Phi`                   Scattered amplitude
+:math:`\vec q`                 Scattering vector with :math:`|{\vec q}| = \frac{4 \pi \sin \theta}{\lambda}`
+:math:`\theta`                 Scattering angle. :math:`2\theta` is the angle between the incoming and the scattered k-vectors.
+:math:`\lambda`                Probing beam wavelength
+:math:`n`                      Electron density
+:math:`\phi`                   Incoming wave amplitude
+:math:`I`                      Scattering intensity
+:math:`d`                      Screen distance
+:math:`r_e`                    Classical electron radius
+
+============================== ================================================================================
+
+
+For free electrons, the density :math:`n` is just their number density; for ions, it is the bound electron density of the species.
+This plugin automatically switches to the bound electron density for species that have the `boundElectrons` property.
+
+The volume integral is realized by a discrete sum over the simulation cells and the temporal integration reduces to accumulating the amplitude over simulation time steps.
+
+.. note::
+    This calculation is based on the kinematic model of scattering. Multiple scattering CAN NOT be handled in this model.
+
+.param file
+^^^^^^^^^^^
+
+The `xrayScattering.param` file sets the x-ray beam alignment as well as its temporal and transverse envelope.
+
+.. note::
+    At the moment the translation (to the side center + offset) is not working correctly.
+    For that reason, the envelopes and the offset can't be set in the ``.param`` file yet.
+    The probe is always a plane wave.
+    Beam rotation works.
+
+The alignment settings define a beam coordinate system with :math:`\hat{z}  = \hat{k}` and :math:`\hat{x}`, :math:`\hat{y}` perpendicular to the x-ray propagation direction.
+It is always a right-handed system. It is oriented in such a way that for propagation parallel to the PIC x- or y-axis (`Side`: `X`, `XR`, `Y` or `YR`) :math:`\hat{x}_{\text{beam}} = - \hat{z}_{\text{PIC}}` holds and if :math:`{\vec k }` is parallel to the PIC z-axis (`Side`: `Z` or `ZR`), :math:`\hat{x}_{\text{beam}} = - \hat{y}_{\text{PIC}}` holds.
+The orientation can then be fine-tuned with the `RotationParam` setting.
+
+.. TODO: Figures showing the beam coordinate system orientation in the PIC system.
+.. TODO: Add other parameters after the coordinate transform has been fixed and the settings have been moved back to the .param file.
+
+=================  ===============================================================================================================================
+  Setting                      Description 
+=================  ===============================================================================================================================
+``ProbingSide``    The side from which the x-ray is propagated.
+                   Set `X`, `Y` or `Z` for propagation along one of the PIC coordinate system axes;
+                   `XR`, `YR` or `ZR` for propagation in an opposite direction.
+
+``RotationParam``  Rotation of the beam axis, :math:`z_{\text{beam}}`, from the default orientation (perpendicular to the simulation box side).
+                   Set the beam yaw and pitch angles in radians.
+=================  ===============================================================================================================================
+
+.. TODO: Add BEAM_OFFSET in between after the coordinate transform has been fixed.
+
+The coordinate transformation from the PIC system to the beam system is performed in the following order:
+rotation to one of the default orientations (``ProbingSide`` setting), then an additional rotation (``RotationParam`` setting). This has to be taken into account when defining the experimental setup.
+
+
+.cfg file
+^^^^^^^^^
+
+For a specific (charged) species ``<species>`` e.g. ``e``, the scattering can be computed by the following commands.
+
+============================================ ============================================================================================================================================
+Command line option                          Description
+============================================ ============================================================================================================================================
+``--<species>_xrayScattering.period``        Period at which the plugin is enabled (PIC period syntax). Only the intensity from these steps is accumulated.
+                                             Default is `0`, which means that the scattering intensity is never calculated and the plugin is therefore off.
+
+``--<species>_xrayScattering.outputPeriod``  Period at which the accumulated amplitude is written to the output file (PIC period syntax). Usually set close to the x-ray coherence time.
+
+``--<species>_xrayScattering.qx_max``        Upper bound of reciprocal space range in qx direction. The unit is :math:`Å^{-1}`. Default is `5`.
+
+``--<species>_xrayScattering.qy_max``        Upper bound of reciprocal space range in qy direction. The unit is :math:`Å^{-1}`. Default is `5`.
+
+``--<species>_xrayScattering.qx_min``        Lower bound of reciprocal space range in qx direction. The unit is :math:`Å^{-1}`. Default is `-5`.
+
+``--<species>_xrayScattering.qy_min``        Lower bound of reciprocal space range in qy direction. The unit is :math:`Å^{-1}`. Default is `-5`.
+
+``--<species>_xrayScattering.n_qx``          Number of scattering vectors needed to be calculated in qx direction. Default is `100`.
+
+``--<species>_xrayScattering.n_qy``          Number of scattering vectors needed to be calculated in qy direction. Default is `100`.
+
+``--<species>_xrayScattering.file``          Output file name. Default is `<species>_xrayScatteringOutput`.
+
+``--<species>_xrayScattering.ext``           `openPMD` filename extension. This controls the backend picked by the `openPMD` API. Default is `bp` for adios backend.
+
+``--<species>_xrayScattering.compression``   Backend-specific `openPMD` compression method (e.g. zlib).
+
+``--<species>_xrayScattering.memoryLayout``  Possible values: `mirror` and `split`. Output can be mirrored on all Host+Device pairs or uniformly split, in chunks, over all nodes.
+                                             Use split when the output array is too big to store the complete computed q-space on one device.
+                                             For small output grids the `mirror` setting could turn out to be more efficient.
+============================================ ============================================================================================================================================
+
+
+Output
+^^^^^^
+
+``<species>_xrayScatteringOutput.<backend-specific extension>``
+
+Output file in the `openPMD` standard. An example of how to access your data with the Python reader:
+
+.. code-block:: python
+
+    from picongpu.plugins.data import XrayScatteringData
+
+    simulation_path = '...' # dir containing simOutput, input, ..,
+    # Read output from the 0th step, for electrons, hdf5 backend.
+    saxsData = XrayScatteringData( simulation_path, 'e', 'h5' )
+    amplitude = saxsData.get(iteration=0) * saxsData.get_unit()
+    del saxsData
+
+When you don't want to use the Python reader, keep in mind that:
+ * All iterations are saved in a single file
+ * The mesh containing the output is called `'amplitude'`
+ * This mesh has 2 components,  `'x'` is the real part and `'y'` is the imaginary part.
+
+.. note::
+    The amplitude is not zeroed on ``outputPeriod`` so one has to subtract the output from the iteration one period before and then calculate :math:`\left|\Phi\right|^2` and sum it with the intensities from other coherence periods.
+
+
+References
+^^^^^^^^^^
+
+- [1] Kluge, T., Rödel, C., Rödel, M., Pelka, A., McBride, E. E., Fletcher, L. B., … Cowan, T. E. (2017). Nanometer-scale characterization of laser-driven compression, shocks, and phase transitions, by x-ray scattering using free electron lasers. Physics of Plasmas, 24(10). https://doi.org/10.1063/1.5008289
diff --git a/docs/source/usage/tbg.rst b/docs/source/usage/tbg.rst
index 6578a0e59f..a4762ca6d2 100644
--- a/docs/source/usage/tbg.rst
+++ b/docs/source/usage/tbg.rst
@@ -58,15 +58,6 @@ Slurm is a modern batch system, e.g. installed on the Taurus cluster at TU Dresd
 .. include:: ../install/profiles/taurus-tud/Slurm_Tutorial.rst
    :start-line: 3
 
-PBS
-"""
-
-PBS (for *Portable Batch System*) is a widely distributed batch system that comes in several implementations (open, professional, etc.).
-It is used, e.g. on Hypnos at HZDR.
-
-.. include:: ../install/profiles/hypnos-hzdr/PBS_Tutorial.rst
-   :start-line: 3
-
 LSF
 """
 
diff --git a/docs/source/usage/workflows/memoryPerDevice.py b/docs/source/usage/workflows/memoryPerDevice.py
index e4ffdc7c80..190fa3047f 100755
--- a/docs/source/usage/workflows/memoryPerDevice.py
+++ b/docs/source/usage/workflows/memoryPerDevice.py
@@ -3,8 +3,8 @@
 """
 This file is part of PIConGPU.
 
-Copyright 2018-2020 PIConGPU contributors
-Authors: Marco Garten, Paweł Ordyna
+Copyright 2018-2021 PIConGPU contributors
+Authors: Marco Garten, Pawel Ordyna
 License: GPLv3+
 """
 
diff --git a/docs/source/usage/workflows/probeParticles.rst b/docs/source/usage/workflows/probeParticles.rst
index 98a01d99c0..f771e36fb1 100644
--- a/docs/source/usage/workflows/probeParticles.rst
+++ b/docs/source/usage/workflows/probeParticles.rst
@@ -16,7 +16,7 @@ Self-consistently interacting particles are usually called :ref:`tracer particle
 Workflow
 """"""""
 
-* ``speciesDefinition.param``: create a species specifically for probes and add ``fieldE`` and ``fieldB`` attributes to it for storing interpolated fields
+* ``speciesDefinition.param``: create a species specifically for probes and add ``probeE`` and ``probeB`` attributes to it for storing interpolated fields
 
 .. code-block:: cpp
 
diff --git a/etc/picongpu/aris-grnet/gpu.tpl b/etc/picongpu/aris-grnet/gpu.tpl
index 440a2080b8..413dccbb0f 100644
--- a/etc/picongpu/aris-grnet/gpu.tpl
+++ b/etc/picongpu/aris-grnet/gpu.tpl
@@ -1,5 +1,5 @@
 #!/usr/bin/env bash
-# Copyright 2013-2020 Axel Huebl, Richard Pausch, Rene Widera, Sergei Bastrakov,
+# Copyright 2013-2021 Axel Huebl, Richard Pausch, Rene Widera, Sergei Bastrakov,
 #                     Jian Fuh Ong
 #
 # This file is part of PIConGPU.
@@ -102,7 +102,7 @@ if [ -f !TBG_dstPath/input/bin/cuda_memtest ] && [ !TBG_numHostedGPUPerNode -eq
   # Run CUDA memtest to check GPU's health
   srun -n !TBG_tasks !TBG_dstPath/input/bin/cuda_memtest.sh
 else
-  echo "no binary 'cuda_memtest' available or compute node is not exclusively allocated, skip GPU memory test" >&2
+  echo "Note: GPU memory test was skipped as no binary 'cuda_memtest' available or compute node is not exclusively allocated. This does not affect PIConGPU, starting it now" >&2
 fi
 
 if [ $? -eq 0 ] ; then
diff --git a/etc/picongpu/bash/mpiexec.tpl b/etc/picongpu/bash/mpiexec.tpl
index 04f509cb57..8c8e774c77 100644
--- a/etc/picongpu/bash/mpiexec.tpl
+++ b/etc/picongpu/bash/mpiexec.tpl
@@ -1,5 +1,5 @@
 #!/usr/bin/env bash
-# Copyright 2013-2020 Axel Huebl, Anton Helm, Rene Widera
+# Copyright 2013-2021 Axel Huebl, Anton Helm, Rene Widera
 #
 # This file is part of PIConGPU.
 #
@@ -53,7 +53,7 @@ export OMPI_MCA_io=^ompio
 if [ -f !TBG_dstPath/input/bin/cuda_memtest ] ; then
   mpiexec -am !TBG_dstPath/tbg/openib.conf --mca mpi_leave_pinned 0 -npernode !TBG_gpusPerNode -n !TBG_tasks !TBG_dstPath/input/bin/cuda_memtest.sh
 else
-  echo "no binary 'cuda_memtest' available, skip GPU memory test" >&2
+  echo "Note: GPU memory test was skipped as no binary 'cuda_memtest' available. This does not affect PIConGPU, starting it now" >&2
 fi
 
 if [ $? -eq 0 ] ; then
diff --git a/etc/picongpu/bash/mpirun.tpl b/etc/picongpu/bash/mpirun.tpl
index fb6e760cd1..ef21800811 100644
--- a/etc/picongpu/bash/mpirun.tpl
+++ b/etc/picongpu/bash/mpirun.tpl
@@ -1,5 +1,5 @@
 #!/usr/bin/env bash
-# Copyright 2013-2020 Axel Huebl, Anton Helm, Rene Widera
+# Copyright 2013-2021 Axel Huebl, Anton Helm, Rene Widera
 #
 # This file is part of PIConGPU.
 #
@@ -53,7 +53,7 @@ export OMPI_MCA_io=^ompio
 if [ -f !TBG_dstPath/input/bin/cuda_memtest ] ; then
   mpirun -am !TBG_dstPath/tbg/openib.conf --mca mpi_leave_pinned 0 -npernode !TBG_gpusPerNode -n !TBG_tasks !TBG_dstPath/input/bin/cuda_memtest.sh
 else
-  echo "no binary 'cuda_memtest' available, skip GPU memory test" >&2
+  echo "Note: GPU memory test was skipped as no binary 'cuda_memtest' available. This does not affect PIConGPU, starting it now" >&2
 fi
 
 if [ $? -eq 0 ] ; then
diff --git a/etc/picongpu/cori-nersc/knl.tpl b/etc/picongpu/cori-nersc/knl.tpl
index 9d8c908836..47a975d8b4 100644
--- a/etc/picongpu/cori-nersc/knl.tpl
+++ b/etc/picongpu/cori-nersc/knl.tpl
@@ -1,5 +1,5 @@
 #!/usr/bin/env bash
-# Copyright 2013-2020 Axel Huebl, Richard Pausch, Alexander Matthes
+# Copyright 2013-2021 Axel Huebl, Richard Pausch, Alexander Matthes
 #
 # This file is part of PIConGPU.
 #
diff --git a/etc/picongpu/cori-nersc/knl_picongpu.profile.example b/etc/picongpu/cori-nersc/knl_picongpu.profile.example
index 5cf1d64aae..b2fb1c0e95 100644
--- a/etc/picongpu/cori-nersc/knl_picongpu.profile.example
+++ b/etc/picongpu/cori-nersc/knl_picongpu.profile.example
@@ -21,7 +21,7 @@ export proj="<yourProject>"
 #
 module swap craype-haswell craype-mic-knl
 module swap PrgEnv-intel PrgEnv-gnu  # GCC 8.2.0
-module load cmake/3.14.4
+module load cmake/3.15.0
 module load boost/1.70.0
 
 # Other Software ##############################################################
diff --git a/etc/picongpu/cpuNumaStarter.sh b/etc/picongpu/cpuNumaStarter.sh
index 62504ebf52..aecaa01160 100755
--- a/etc/picongpu/cpuNumaStarter.sh
+++ b/etc/picongpu/cpuNumaStarter.sh
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 #
-# Copyright 2017-2020 Rene Widera, Alexander Matthes
+# Copyright 2017-2021 Rene Widera, Alexander Matthes
 #
 # This file is part of PIConGPU.
 #
diff --git a/etc/picongpu/cuda.filter b/etc/picongpu/cuda.filter
index ed8c25e0ec..0663ae5f77 100644
--- a/etc/picongpu/cuda.filter
+++ b/etc/picongpu/cuda.filter
@@ -1,7 +1,7 @@
 std::* -- 0
 *boost::* -- 0
 pmacc::Environment* -- 0
-pmacc::algorithms::* -- 0
+pmacc::* -- 0
 *Event* -- 0
 *MPI_Test* -- 0
 *new* -- 0
diff --git a/etc/picongpu/davide-cineca/gpu.tpl b/etc/picongpu/davide-cineca/gpu.tpl
index f4c0f50ee6..0d01e1954f 100644
--- a/etc/picongpu/davide-cineca/gpu.tpl
+++ b/etc/picongpu/davide-cineca/gpu.tpl
@@ -1,5 +1,5 @@
 #!/usr/bin/env bash
-# Copyright 2013-2020 Axel Huebl, Richard Pausch, Rene Widera
+# Copyright 2013-2021 Axel Huebl, Richard Pausch, Rene Widera
 #
 # This file is part of PIConGPU.
 #
@@ -104,7 +104,7 @@ if [ -f !TBG_dstPath/input/bin/cuda_memtest ] && [ !TBG_numHostedGPUPerNode -eq
   # Run CUDA memtest to check GPU's health
   srun --cpu-bind=sockets !TBG_dstPath/input/bin/cuda_memtest.sh
 else
-  echo "no binary 'cuda_memtest' available or compute node is not exclusively allocated, skip GPU memory test" >&2
+  echo "Note: GPU memory test was skipped as no binary 'cuda_memtest' available or compute node is not exclusively allocated. This does not affect PIConGPU, starting it now" >&2
 fi
 
 if [ $? -eq 0 ] ; then
diff --git a/etc/picongpu/davide-cineca/gpu_picongpu.profile.example b/etc/picongpu/davide-cineca/gpu_picongpu.profile.example
index 21b041a260..e926cc2979 100644
--- a/etc/picongpu/davide-cineca/gpu_picongpu.profile.example
+++ b/etc/picongpu/davide-cineca/gpu_picongpu.profile.example
@@ -21,7 +21,7 @@ export proj=$(groups | awk '{print $2}')
 #
 module purge
 module load gnu/6.4.0
-module load cmake/3.11.4
+module load cmake/3.15.0
 module load cuda/9.2.88
 module load openmpi/3.1.0--gnu--6.4.0
 module load boost/1.68.0--openmpi--3.1.0--gnu--6.4.0
diff --git a/etc/picongpu/davinci-rice/picongpu.tpl b/etc/picongpu/davinci-rice/picongpu.tpl
index b4e316b2bd..045dc3b70b 100644
--- a/etc/picongpu/davinci-rice/picongpu.tpl
+++ b/etc/picongpu/davinci-rice/picongpu.tpl
@@ -1,5 +1,5 @@
 #!/usr/bin/env bash
-# Copyright 2013-2020 Axel Huebl, Rene Widera, Richard Pausch
+# Copyright 2013-2021 Axel Huebl, Rene Widera, Richard Pausch
 #
 # This file is part of PIConGPU.
 #
@@ -81,7 +81,7 @@ export OMPI_MCA_io=^ompio
 if [ -f !TBG_dstPath/input/bin/cuda_memtest ] ; then
   mpirun -n TBG_tasks --display-map -am tbg/openib.conf --mca mpi_leave_pinned 0 !TBG_dstPath/input/bin/cuda_memtest.sh
 else
-  echo "no binary 'cuda_memtest' available, skip GPU memory test" >&2
+  echo "Note: GPU memory test was skipped as no binary 'cuda_memtest' available. This does not affect PIConGPU, starting it now" >&2
 fi
 
 if [ $? -eq 0 ] ; then
diff --git a/etc/picongpu/draco-mpcdf/general.tpl b/etc/picongpu/draco-mpcdf/general.tpl
index a29d4e16bf..a6bbf6a52d 100644
--- a/etc/picongpu/draco-mpcdf/general.tpl
+++ b/etc/picongpu/draco-mpcdf/general.tpl
@@ -1,5 +1,5 @@
 #!/usr/bin/env bash
-# Copyright 2013-2020 Axel Huebl, Richard Pausch, Rene Widera
+# Copyright 2013-2021 Axel Huebl, Richard Pausch, Rene Widera
 #
 # This file is part of PIConGPU.
 #
diff --git a/etc/picongpu/draco-mpcdf/picongpu.profile.example b/etc/picongpu/draco-mpcdf/picongpu.profile.example
index 5c02c7fcf2..7717c94f5d 100644
--- a/etc/picongpu/draco-mpcdf/picongpu.profile.example
+++ b/etc/picongpu/draco-mpcdf/picongpu.profile.example
@@ -19,7 +19,7 @@ module purge
 
 module load git/2.14
 module load gcc/6.3
-module load cmake/3.11.4
+module load cmake/3.15.0
 module load boost/gcc/1.64
 module load impi/2017.3
 module load hdf5-mpi/gcc/1.8.18
diff --git a/etc/picongpu/hemera-hzdr/defq.tpl b/etc/picongpu/hemera-hzdr/defq.tpl
index 238c720050..9a7ddbbf14 100644
--- a/etc/picongpu/hemera-hzdr/defq.tpl
+++ b/etc/picongpu/hemera-hzdr/defq.tpl
@@ -1,5 +1,5 @@
 #!/usr/bin/env bash
-# Copyright 2013-2020 Axel Huebl, Richard Pausch, Rene Widera
+# Copyright 2013-2021 Axel Huebl, Richard Pausch, Rene Widera
 #
 # This file is part of PIConGPU.
 #
diff --git a/etc/picongpu/hemera-hzdr/defq_picongpu.profile.example b/etc/picongpu/hemera-hzdr/defq_picongpu.profile.example
index 42a6d4bc70..acb0a4f702 100644
--- a/etc/picongpu/hemera-hzdr/defq_picongpu.profile.example
+++ b/etc/picongpu/hemera-hzdr/defq_picongpu.profile.example
@@ -16,6 +16,7 @@ export MY_NAME="$(whoami) <$MY_MAIL>"
 # General modules #############################################################
 #
 module purge
+module load git
 module load gcc/7.3.0
 module load cmake/3.15.2
 module load openmpi/2.1.2
@@ -29,6 +30,7 @@ module load c-blosc/1.14.4
 module load adios/1.13.1
 module load hdf5-parallel/1.8.20
 module load libsplash/1.7.0
+module load python/3.6.5
 
 module load libpng/1.6.35
 module load pngwriter/0.7.0
diff --git a/etc/picongpu/hemera-hzdr/fwkt_v100.tpl b/etc/picongpu/hemera-hzdr/fwkt_v100.tpl
index 5bc8341d9b..413442fa4e 100644
--- a/etc/picongpu/hemera-hzdr/fwkt_v100.tpl
+++ b/etc/picongpu/hemera-hzdr/fwkt_v100.tpl
@@ -1,5 +1,5 @@
 #!/usr/bin/env bash
-# Copyright 2013-2020 Axel Huebl, Richard Pausch, Rene Widera,
+# Copyright 2013-2021 Axel Huebl, Richard Pausch, Rene Widera,
 #                     Marco Garten, Alexander Debus
 #
 # This file is part of PIConGPU.
@@ -104,7 +104,7 @@ if [ -f !TBG_dstPath/input/bin/cuda_memtest ] && [ !TBG_numHostedGPUPerNode -eq
   # Run CUDA memtest to check GPU's health
   mpiexec !TBG_dstPath/input/bin/cuda_memtest.sh
 else
-  echo "no binary 'cuda_memtest' available or compute node is not exclusively allocated, skip GPU memory test" >&2
+  echo "Note: GPU memory test was skipped as no binary 'cuda_memtest' available or compute node is not exclusively allocated. This does not affect PIConGPU, starting it now" >&2
 fi
 
 if [ $? -eq 0 ] ; then
diff --git a/etc/picongpu/hemera-hzdr/fwkt_v100_picongpu.profile.example b/etc/picongpu/hemera-hzdr/fwkt_v100_picongpu.profile.example
index 12f0932eca..da4dae273f 100644
--- a/etc/picongpu/hemera-hzdr/fwkt_v100_picongpu.profile.example
+++ b/etc/picongpu/hemera-hzdr/fwkt_v100_picongpu.profile.example
@@ -16,10 +16,11 @@ export MY_NAME="$(whoami) <$MY_MAIL>"
 # General modules #############################################################
 #
 module purge
+module load git
 module load gcc/7.3.0
 module load cmake/3.15.2
-module load cuda/10.0
-module load openmpi/2.1.2-cuda100
+module load cuda/10.2
+module load openmpi/2.1.2-cuda102
 module load boost/1.68.0
 
 # Other Software ##############################################################
@@ -27,9 +28,12 @@ module load boost/1.68.0
 module load zlib/1.2.11
 module load c-blosc/1.14.4
 
-module load adios/1.13.1-cuda100
-module load hdf5-parallel/1.8.20-cuda100
-module load libsplash/1.7.0-cuda100
+module load hdf5-parallel/1.8.20-cuda102
+module load libsplash/1.7.0-cuda102
+module load python/3.6.5
+module load adios/1.13.1-cuda102
+module load adios2/2.6.0-cuda102
+module load openpmd/0.12.0-cuda102
 
 module load libpng/1.6.35
 module load pngwriter/0.7.0
diff --git a/etc/picongpu/hemera-hzdr/gpu.tpl b/etc/picongpu/hemera-hzdr/gpu.tpl
index 90c9b2d12c..ff7487d18b 100644
--- a/etc/picongpu/hemera-hzdr/gpu.tpl
+++ b/etc/picongpu/hemera-hzdr/gpu.tpl
@@ -1,5 +1,5 @@
 #!/usr/bin/env bash
-# Copyright 2013-2020 Axel Huebl, Richard Pausch, Rene Widera, Marco Garten
+# Copyright 2013-2021 Axel Huebl, Richard Pausch, Rene Widera, Marco Garten
 #
 # This file is part of PIConGPU.
 #
@@ -101,7 +101,7 @@ if [ -f !TBG_dstPath/input/bin/cuda_memtest ] && [ !TBG_numHostedGPUPerNode -eq
   # Run CUDA memtest to check GPU's health
   mpiexec !TBG_dstPath/input/bin/cuda_memtest.sh
 else
-  echo "no binary 'cuda_memtest' available or compute node is not exclusively allocated, skip GPU memory test" >&2
+  echo "Note: GPU memory test was skipped as no binary 'cuda_memtest' available or compute node is not exclusively allocated. This does not affect PIConGPU, starting it now" >&2
 fi
 
 if [ $? -eq 0 ] ; then
diff --git a/etc/picongpu/hemera-hzdr/gpu_picongpu.profile.example b/etc/picongpu/hemera-hzdr/gpu_picongpu.profile.example
index 42a7de084d..2e5cb5d869 100644
--- a/etc/picongpu/hemera-hzdr/gpu_picongpu.profile.example
+++ b/etc/picongpu/hemera-hzdr/gpu_picongpu.profile.example
@@ -16,10 +16,11 @@ export MY_NAME="$(whoami) <$MY_MAIL>"
 # General modules #############################################################
 #
 module purge
+module load git
 module load gcc/7.3.0
 module load cmake/3.15.2
-module load cuda/10.0
-module load openmpi/2.1.2-cuda100
+module load cuda/10.2
+module load openmpi/2.1.2-cuda102
 module load boost/1.68.0
 
 # Other Software ##############################################################
@@ -27,9 +28,12 @@ module load boost/1.68.0
 module load zlib/1.2.11
 module load c-blosc/1.14.4
 
-module load adios/1.13.1-cuda100
-module load hdf5-parallel/1.8.20-cuda100
-module load libsplash/1.7.0-cuda100
+module load hdf5-parallel/1.8.20-cuda102
+module load libsplash/1.7.0-cuda102
+module load python/3.6.5
+module load adios/1.13.1-cuda102
+module load adios2/2.6.0-cuda102
+module load openpmd/0.12.0-cuda102
 
 module load libpng/1.6.35
 module load pngwriter/0.7.0
diff --git a/etc/picongpu/hemera-hzdr/k20.tpl b/etc/picongpu/hemera-hzdr/k20.tpl
index b8992555f0..44dc0a6970 100644
--- a/etc/picongpu/hemera-hzdr/k20.tpl
+++ b/etc/picongpu/hemera-hzdr/k20.tpl
@@ -1,5 +1,5 @@
 #!/usr/bin/env bash
-# Copyright 2013-2020 Axel Huebl, Anton Helm, Richard Pausch, Rene Widera,
+# Copyright 2013-2021 Axel Huebl, Anton Helm, Richard Pausch, Rene Widera,
 #                     Marco Garten
 #
 # This file is part of PIConGPU.
@@ -104,7 +104,7 @@ if [ -f !TBG_dstPath/input/bin/cuda_memtest ] && [ !TBG_numHostedGPUPerNode -eq
   # Run CUDA memtest to check GPU's health
   mpiexec !TBG_dstPath/input/bin/cuda_memtest.sh
 else
-  echo "no binary 'cuda_memtest' available or compute node is not exclusively allocated, skip GPU memory test" >&2
+  echo "Note: GPU memory test was skipped as no binary 'cuda_memtest' available or compute node is not exclusively allocated. This does not affect PIConGPU, starting it now" >&2
 fi
 
 if [ $? -eq 0 ] ; then
diff --git a/etc/picongpu/hemera-hzdr/k20_picongpu.profile.example b/etc/picongpu/hemera-hzdr/k20_picongpu.profile.example
index 25afced6be..2162dad95c 100644
--- a/etc/picongpu/hemera-hzdr/k20_picongpu.profile.example
+++ b/etc/picongpu/hemera-hzdr/k20_picongpu.profile.example
@@ -16,10 +16,11 @@ export MY_NAME="$(whoami) <$MY_MAIL>"
 # General modules #############################################################
 #
 module purge
+module load git
 module load gcc/7.3.0
 module load cmake/3.15.2
-module load cuda/10.0
-module load openmpi/2.1.2-cuda100
+module load cuda/10.2
+module load openmpi/2.1.2-cuda102
 module load boost/1.68.0
 
 # Other Software ##############################################################
@@ -27,9 +28,12 @@ module load boost/1.68.0
 module load zlib/1.2.11
 module load c-blosc/1.14.4
 
-module load adios/1.13.1-cuda100
-module load hdf5-parallel/1.8.20-cuda100
-module load libsplash/1.7.0-cuda100
+module load hdf5-parallel/1.8.20-cuda102
+module load libsplash/1.7.0-cuda102
+module load python/3.6.5
+module load adios/1.13.1-cuda102
+module load adios2/2.6.0-cuda102
+module load openpmd/0.12.0-cuda102
 
 module load libpng/1.6.35
 module load pngwriter/0.7.0
diff --git a/etc/picongpu/hemera-hzdr/k20_restart.tpl b/etc/picongpu/hemera-hzdr/k20_restart.tpl
index 52b9701b07..48e4ae642b 100644
--- a/etc/picongpu/hemera-hzdr/k20_restart.tpl
+++ b/etc/picongpu/hemera-hzdr/k20_restart.tpl
@@ -1,5 +1,5 @@
 #!/usr/bin/env bash
-# Copyright 2013-2020 Axel Huebl, Anton Helm, Rene Widera, Richard Pausch,
+# Copyright 2013-2021 Axel Huebl, Anton Helm, Rene Widera, Richard Pausch,
 #                     Bifeng Lei, Marco Garten
 #
 # This file is part of PIConGPU.
@@ -167,7 +167,7 @@ export OMPI_MCA_io=^ompio
 if [ -f !TBG_dstPath/input/bin/cuda_memtest ] && [ !TBG_numHostedGPUPerNode -eq !TBG_gpusPerNode ] ; then
   mpiexec !TBG_dstPath/input/bin/cuda_memtest.sh
 else
-  echo "no binary 'cuda_memtest' available or compute node is not exclusively allocated, skip GPU memory test" >&2
+  echo "Note: GPU memory test was skipped as no binary 'cuda_memtest' available or compute node is not exclusively allocated. This does not affect PIConGPU, starting it now" >&2
 fi
 
 if [ $? -eq 0 ] ; then
diff --git a/etc/picongpu/hemera-hzdr/k80.tpl b/etc/picongpu/hemera-hzdr/k80.tpl
index 3cfc81ea4d..b20d60192e 100644
--- a/etc/picongpu/hemera-hzdr/k80.tpl
+++ b/etc/picongpu/hemera-hzdr/k80.tpl
@@ -1,5 +1,5 @@
 #!/usr/bin/env bash
-# Copyright 2013-2020 Axel Huebl, Anton Helm, Richard Pausch, Rene Widera,
+# Copyright 2013-2021 Axel Huebl, Anton Helm, Richard Pausch, Rene Widera,
 #                     Marco Garten
 #
 # This file is part of PIConGPU.
@@ -104,7 +104,7 @@ if [ -f !TBG_dstPath/input/bin/cuda_memtest ] && [ !TBG_numHostedGPUPerNode -eq
   # Run CUDA memtest to check GPU's health
   mpiexec !TBG_dstPath/input/bin/cuda_memtest.sh
 else
-  echo "no binary 'cuda_memtest' available or compute node is not exclusively allocated, skip GPU memory test" >&2
+  echo "Note: GPU memory test was skipped as no binary 'cuda_memtest' available or compute node is not exclusively allocated. This does not affect PIConGPU, starting it now" >&2
 fi
 
 if [ $? -eq 0 ] ; then
diff --git a/etc/picongpu/hemera-hzdr/k80_picongpu.profile.example b/etc/picongpu/hemera-hzdr/k80_picongpu.profile.example
index 87262ff55a..03e08aa848 100644
--- a/etc/picongpu/hemera-hzdr/k80_picongpu.profile.example
+++ b/etc/picongpu/hemera-hzdr/k80_picongpu.profile.example
@@ -16,10 +16,11 @@ export MY_NAME="$(whoami) <$MY_MAIL>"
 # General modules #############################################################
 #
 module purge
+module load git
 module load gcc/7.3.0
 module load cmake/3.15.2
-module load cuda/10.0
-module load openmpi/2.1.2-cuda100
+module load cuda/10.2
+module load openmpi/2.1.2-cuda102
 module load boost/1.68.0
 
 # Other Software ##############################################################
@@ -27,9 +28,12 @@ module load boost/1.68.0
 module load zlib/1.2.11
 module load c-blosc/1.14.4
 
-module load adios/1.13.1-cuda100
-module load hdf5-parallel/1.8.20-cuda100
-module load libsplash/1.7.0-cuda100
+module load hdf5-parallel/1.8.20-cuda102
+module load libsplash/1.7.0-cuda102
+module load python/3.6.5
+module load adios/1.13.1-cuda102
+module load adios2/2.6.0-cuda102
+module load openpmd/0.12.0-cuda102
 
 module load libpng/1.6.35
 module load pngwriter/0.7.0
diff --git a/etc/picongpu/hemera-hzdr/k80_restart.tpl b/etc/picongpu/hemera-hzdr/k80_restart.tpl
index d65f9e9730..8ed316c572 100644
--- a/etc/picongpu/hemera-hzdr/k80_restart.tpl
+++ b/etc/picongpu/hemera-hzdr/k80_restart.tpl
@@ -1,5 +1,5 @@
 #!/usr/bin/env bash
-# Copyright 2013-2020 Axel Huebl, Anton Helm, Rene Widera, Richard Pausch,
+# Copyright 2013-2021 Axel Huebl, Anton Helm, Rene Widera, Richard Pausch,
 #                     Bifeng Lei, Marco Garten
 #
 # This file is part of PIConGPU.
@@ -167,7 +167,7 @@ export OMPI_MCA_io=^ompio
 if [ -f !TBG_dstPath/input/bin/cuda_memtest ] && [ !TBG_numHostedGPUPerNode -eq !TBG_gpusPerNode ] ; then
   mpiexec !TBG_dstPath/input/bin/cuda_memtest.sh
 else
-  echo "no binary 'cuda_memtest' available or compute node is not exclusively allocated, skip GPU memory test" >&2
+  echo "Note: GPU memory test was skipped as no binary 'cuda_memtest' available or compute node is not exclusively allocated. This does not affect PIConGPU, starting it now" >&2
 fi
 
 if [ $? -eq 0 ] ; then
diff --git a/etc/picongpu/hydra-hzdr/default.tpl b/etc/picongpu/hydra-hzdr/default.tpl
deleted file mode 100644
index 9f9105f844..0000000000
--- a/etc/picongpu/hydra-hzdr/default.tpl
+++ /dev/null
@@ -1,90 +0,0 @@
-#!/usr/bin/env bash
-# Copyright 2013-2020 Axel Huebl, Anton Helm, Rene Widera
-#
-# This file is part of PIConGPU.
-#
-# PIConGPU is free software: you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation, either version 3 of the License, or
-# (at your option) any later version.
-#
-# PIConGPU is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with PIConGPU.
-# If not, see <http://www.gnu.org/licenses/>.
-#
-
-
-# PIConGPU batch script for hydra PBS batch system
-
-#PBS -q !TBG_queue
-#PBS -l walltime=!TBG_wallTime
-# Sets batch job's name
-#PBS -N !TBG_jobName
-#PBS -l nodes=!TBG_nodes:ppn=!TBG_coresPerNode
-#PBS -l mem=!TBG_globalMainMemStr
-#PBS -m !TBG_mailSettings -M !TBG_mailAddress
-#PBS -d !TBG_dstPath
-#PBS -n
-
-#PBS -o stdout
-#PBS -e stderr
-
-
-## calculation are done by tbg ##
-.TBG_queue="default"
-
-# settings that can be controlled by environment variables before submit
-.TBG_mailSettings=${MY_MAILNOTIFY:-"n"}
-.TBG_mailAddress=${MY_MAIL:-"someone@example.com"}
-.TBG_author=${MY_NAME:+--author \"${MY_NAME}\"}
-.TBG_profile=${PIC_PROFILE:-"~/picongpu.profile"}
-
-# 2 packages per node if we need more than 2 ranks else same count as TBG_tasks
-.TBG_gpusPerNode=`if [ $TBG_tasks -gt 2 ] ; then echo 2; else echo $TBG_tasks; fi`
-
-#number of cores per parallel node / default is 2 cores per gpu on 'default' queue
-.TBG_coresPerNode="$(( TBG_gpusPerNode * 16 ))"
-
-# use ceil to caculate nodes
-.TBG_nodes="$(( ( TBG_tasks + TBG_gpusPerNode -1 ) / TBG_gpusPerNode))"
-# main memory used for the job
-.TBG_globalMainMem=$(( TBG_nodes * 256 ))
-.TBG_globalMainMemStr="!TBG_globalMainMem"GB
-## end calculations ##
-
-echo 'Running program...'
-
-cd !TBG_dstPath
-
-export MODULES_NO_OUTPUT=1
-source !TBG_profile
-if [ $? -ne 0 ] ; then
-  echo "Error: PIConGPU environment profile under \"!TBG_profile\" not found!"
-  exit 1
-fi
-unset MODULES_NO_OUTPUT
-
-#set user rights to u=rwx;g=r-x;o=---
-umask 0027
-
-mkdir simOutput 2> /dev/null
-cd simOutput
-
-#wait that all nodes see ouput folder
-sleep 1
-
-# The OMPIO backend in OpenMPI up to 3.1.3 and 4.0.0 is broken, use the
-# fallback ROMIO backend instead.
-#   see bug https://github.com/open-mpi/ompi/issues/6285
-export OMPI_MCA_io=^ompio
-
-if [ $? -eq 0 ] ; then
-  mpiexec --prefix $MPIHOME -x LIBRARY_PATH -tag-output --bind-to none --display-map -am !TBG_dstPath/tbg/openib.conf --mca mpi_leave_pinned 0 -npernode !TBG_gpusPerNode -n !TBG_tasks !TBG_dstPath/tbg/cpuNumaStarter.sh !TBG_dstPath/input/bin/picongpu !TBG_author !TBG_programParams | tee output
-fi
-
-mpiexec --prefix $MPIHOME -x LIBRARY_PATH -npernode !TBG_gpusPerNode -n !TBG_tasks /usr/bin/env bash -c "killall -9 picongpu 2>/dev/null || true"
diff --git a/etc/picongpu/hydra-hzdr/default_picongpu.profile.example b/etc/picongpu/hydra-hzdr/default_picongpu.profile.example
deleted file mode 100644
index 9e10352310..0000000000
--- a/etc/picongpu/hydra-hzdr/default_picongpu.profile.example
+++ /dev/null
@@ -1,71 +0,0 @@
-# Name and Path of this Script ############################### (DO NOT change!)
-export PIC_PROFILE=$(cd $(dirname $BASH_SOURCE) && pwd)"/"$(basename $BASH_SOURCE)
-
-# User Information ################################# (edit the following lines)
-#   - automatically add your name and contact to output file meta data
-#   - send me mails on batch system job (b)egin, (e)nd, (a)bortion or (n)o mail
-export MY_MAILNOTIFY="n"
-export MY_MAIL="someone@example.com"
-export MY_NAME="$(whoami) <$MY_MAIL>"
-
-# Text Editor for Tools ###################################### (edit this line)
-#   - examples: "nano", "vim", "emacs -nw", "vi" or without terminal: "gedit"
-#export EDITOR="nano"
-
-# Modules #####################################################################
-#
-if [ -f /etc/profile.modules ]
-then
-        . /etc/profile.modules
-        module purge
-#       export MODULES_NO_OUTPUT=1
-
-        # Core Dependencies
-        module load gcc/5.3.0
-        module load cmake/3.13.4
-        module load boost/1.65.1
-        module load openmpi/1.8.6
-        module load numactl
-
-        # Plugins (optional)
-        module load pngwriter/0.7.0
-        module load hdf5-parallel/1.8.15 libsplash/1.7.0
-
-        # either use libSplash or ADIOS for file I/O
-        #module load adios/1.13.1
-
-        # Debug Tools
-        #module load gdb
-        #module load valgrind/3.8.1
-
-#       unset MODULES_NO_OUTPUT
-fi
-
-# Environment #################################################################
-#
-alias getNode='qsub -I -q default -lwalltime=00:30:00 -lnodes=1:ppn=32'
-
-export PICSRC=/home/$(whoami)/src/picongpu
-export PIC_EXAMPLES=$PICSRC/share/picongpu/examples
-export PIC_BACKEND="omp2b:ivybridge"
-
-export PATH=$PATH:$PICSRC
-export PATH=$PATH:$PICSRC/bin
-export PATH=$PATH:$PICSRC/src/splash2txt/build
-export PATH=$PATH:$PICSRC/src/tools/bin
-
-export PYTHONPATH=$PICSRC/src/tools/lib/python:$PYTHONPATH
-
-# "tbg" default options #######################################################
-#   - PBS/Torque (qsub)
-#   - "default" queue
-export TBG_SUBMIT="qsub"
-export TBG_TPLFILE="etc/picongpu/hydra-hzdr/default.tpl"
-
-# Load autocompletion for PIConGPU commands
-BASH_COMP_FILE=$PICSRC/bin/picongpu-completion.bash
-if [ -f $BASH_COMP_FILE ] ; then
-    source $BASH_COMP_FILE
-else
-    echo "bash completion file '$BASH_COMP_FILE' not found." >&2
-fi
diff --git a/etc/picongpu/hypnos-hzdr/PBS_Tutorial.rst b/etc/picongpu/hypnos-hzdr/PBS_Tutorial.rst
deleted file mode 100644
index 837a50e406..0000000000
--- a/etc/picongpu/hypnos-hzdr/PBS_Tutorial.rst
+++ /dev/null
@@ -1,45 +0,0 @@
-PBS examples
-============
-
-Job Submission
-''''''''''''''
-
-PIConGPU job submission on the *Hypnos* cluster at *HZDR*:
-
-* ``tbg -s qsub -c etc/picongpu/0008gpus.cfg -t etc/picongpu/hypnos-hzdr/k20.tpl /bigdata/hplsim/<...>/test-001``
-
-Where ``<...>`` is one of:
-
-* ``external/$(whoami)``
-* internal:
-
-  * ``scratch/$(whoami)``
-  * ``development/$(whoami)``
-  * ``production/<project name>``
-
-Job Control
-'''''''''''
-
-* interactive job:
-
-  * ``qsub -I -q k20 -lwalltime=12:00:00 -lnodes=1:ppn=8``
-
-* details for my jobs:
-
-  * ``qstat -f 12345`` all details for job with <job id> ``12345``
-  * ``qstat -u $(whoami)`` all jobs under my user name
-
-* details for queues:
-
-  * ``qstat -a queueName`` show all jobs in a queue
-  * ``pbs_free -l`` compact view on free and busy nodes
-  * ``pbsnodes`` list all nodes and their detailed state (free, busy/job-exclusive, offline)
-
-* communicate with job:
-
-  * ``qdel <job id>`` abort job
-  * ``qsig -s <signal number> <job id>`` send signal or signal name to job
-  * ``qalter -lwalltime=12:00:00 <job id>`` change the walltime of a job
-  * ``qalter -Wdepend=afterany:54321 12345`` only start job ``12345`` after job with id ``54321`` has finished
-  * ``qhold <job id>`` prevent the job from starting
-  * ``qrls <job id>`` release the job to be eligible for run (after it was set on hold)
diff --git a/etc/picongpu/hypnos-hzdr/fermi.tpl b/etc/picongpu/hypnos-hzdr/fermi.tpl
deleted file mode 100644
index 107a30678d..0000000000
--- a/etc/picongpu/hypnos-hzdr/fermi.tpl
+++ /dev/null
@@ -1,96 +0,0 @@
-#!/usr/bin/env bash
-# Copyright 2013-2020 Axel Huebl, Anton Helm, Rene Widera
-#
-# This file is part of PIConGPU.
-#
-# PIConGPU is free software: you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation, either version 3 of the License, or
-# (at your option) any later version.
-#
-# PIConGPU is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with PIConGPU.
-# If not, see <http://www.gnu.org/licenses/>.
-#
-
-
-# PIConGPU batch script for hypnos PBS batch system
-
-#PBS -q !TBG_queue
-#PBS -l walltime=!TBG_wallTime
-# Sets batch job's name
-#PBS -N !TBG_jobName
-#PBS -l nodes=!TBG_nodes:ppn=!TBG_coresPerNode
-#PBS -m !TBG_mailSettings -M !TBG_mailAddress
-#PBS -d !TBG_dstPath
-
-#PBS -o stdout
-#PBS -e stderr
-
-
-## calculation are done by tbg ##
-# Tesla C2070 queue on kepler018 & kepler019
-.TBG_queue="k20f"
-
-# settings that can be controlled by environment variables before submit
-.TBG_mailSettings=${MY_MAILNOTIFY:-"n"}
-.TBG_mailAddress=${MY_MAIL:-"someone@example.com"}
-.TBG_author=${MY_NAME:+--author \"${MY_NAME}\"}
-.TBG_profile=${PIC_PROFILE:-"~/picongpu.profile"}
-
-# number of available/hosted GPUs per node in the system
-.TBG_numHostedGPUPerNode=4
-
-# required GPUs per node for the current job
-.TBG_gpusPerNode=`if [ $TBG_tasks -gt $TBG_numHostedGPUPerNode ] ; then echo $TBG_numHostedGPUPerNode; else echo $TBG_tasks; fi`
-
-#number of cores per parallel node / default is 2 cores per gpu on k20 queue
-.TBG_coresPerNode="$(( TBG_gpusPerNode * 2 ))"
-
-# use ceil to caculate nodes
-.TBG_nodes="$(( ( TBG_tasks + TBG_gpusPerNode -1 ) / TBG_gpusPerNode))"
-## end calculations ##
-
-echo 'Running program...'
-
-cd !TBG_dstPath
-
-export MODULES_NO_OUTPUT=1
-source !TBG_profile
-if [ $? -ne 0 ] ; then
-  echo "Error: PIConGPU environment profile under \"!TBG_profile\" not found!"
-  exit 1
-fi
-unset MODULES_NO_OUTPUT
-
-#set user rights to u=rwx;g=r-x;o=---
-umask 0027
-
-mkdir simOutput 2> /dev/null
-cd simOutput
-
-#wait that all nodes see ouput folder
-sleep 1
-
-# The OMPIO backend in OpenMPI up to 3.1.3 and 4.0.0 is broken, use the
-# fallback ROMIO backend instead.
-#   see bug https://github.com/open-mpi/ompi/issues/6285
-export OMPI_MCA_io=^ompio
-
-# test if cuda_memtest binary is available and we have the node exclusive
-if [ -f !TBG_dstPath/input/bin/cuda_memtest ] && [ !TBG_numHostedGPUPerNode -eq !TBG_gpusPerNode ] ; then
-  mpiexec --prefix $MPIHOME -tag-output --display-map -x LIBRARY_PATH -am !TBG_dstPath/tbg/openib.conf --mca mpi_leave_pinned 0 -npernode !TBG_gpusPerNode -n !TBG_tasks !TBG_dstPath/input/bin/cuda_memtest.sh
-else
-  echo "no binary 'cuda_memtest' available or compute node is not exclusively allocated, skip GPU memory test" >&2
-fi
-
-if [ $? -eq 0 ] ; then
-  mpiexec --prefix $MPIHOME -x LIBRARY_PATH -tag-output --display-map -am !TBG_dstPath/tbg/openib.conf --mca mpi_leave_pinned 0 -npernode !TBG_gpusPerNode -n !TBG_tasks !TBG_dstPath/input/bin/picongpu !TBG_author !TBG_programParams | tee output
-fi
-
-mpiexec --prefix $MPIHOME -x LIBRARY_PATH -npernode !TBG_gpusPerNode -n !TBG_tasks /usr/bin/env bash -c "killall -9 picongpu 2>/dev/null || true"
diff --git a/etc/picongpu/hypnos-hzdr/laser.tpl b/etc/picongpu/hypnos-hzdr/laser.tpl
deleted file mode 100644
index 7c1c312a72..0000000000
--- a/etc/picongpu/hypnos-hzdr/laser.tpl
+++ /dev/null
@@ -1,85 +0,0 @@
-#!/usr/bin/env bash
-# Copyright 2013-2020 Axel Huebl, Anton Helm, Rene Widera
-#
-# This file is part of PIConGPU.
-#
-# PIConGPU is free software: you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation, either version 3 of the License, or
-# (at your option) any later version.
-#
-# PIConGPU is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with PIConGPU.
-# If not, see <http://www.gnu.org/licenses/>.
-#
-
-
-# PIConGPU batch script for hypnos PBS batch system
-
-#PBS -q !TBG_queue
-#PBS -l walltime=!TBG_wallTime
-# Sets batch job's name
-#PBS -N !TBG_jobName
-#PBS -l nodes=!TBG_nodes:ppn=!TBG_coresPerNode
-#PBS -m !TBG_mailSettings -M !TBG_mailAddress
-#PBS -d !TBG_dstPath
-
-#PBS -o stdout
-#PBS -e stderr
-
-
-## calculation are done by tbg ##
-.TBG_queue="laser"
-
-# settings that can be controlled by environment variables before submit
-.TBG_mailSettings=${MY_MAILNOTIFY:-"n"}
-.TBG_mailAddress=${MY_MAIL:-"someone@example.com"}
-.TBG_author=${MY_NAME:+--author \"${MY_NAME}\"}
-.TBG_profile=${PIC_PROFILE:-"~/picongpu.profile"}
-
-# 8 packages per node if we need more than 8 ranks else same count as TBG_tasks
-.TBG_gpusPerNode=`if [ $TBG_tasks -gt 8 ] ; then echo 8; else echo $TBG_tasks; fi`
-
-#number of cores per parallel node / default is 2 cores per gpu on k20 queue
-.TBG_coresPerNode="$(( TBG_gpusPerNode * 8 ))"
-
-# use ceil to caculate nodes
-.TBG_nodes="$(( ( TBG_tasks + TBG_gpusPerNode -1 ) / TBG_gpusPerNode))"
-## end calculations ##
-
-echo 'Running program...'
-
-cd !TBG_dstPath
-
-export MODULES_NO_OUTPUT=1
-source !TBG_profile
-if [ $? -ne 0 ] ; then
-  echo "Error: PIConGPU environment profile under \"!TBG_profile\" not found!"
-  exit 1
-fi
-unset MODULES_NO_OUTPUT
-
-#set user rights to u=rwx;g=r-x;o=---
-umask 0027
-
-mkdir simOutput 2> /dev/null
-cd simOutput
-
-#wait that all nodes see ouput folder
-sleep 1
-
-# The OMPIO backend in OpenMPI up to 3.1.3 and 4.0.0 is broken, use the
-# fallback ROMIO backend instead.
-#   see bug https://github.com/open-mpi/ompi/issues/6285
-export OMPI_MCA_io=^ompio
-
-if [ $? -eq 0 ] ; then
-  mpiexec --prefix $MPIHOME -x LIBRARY_PATH -tag-output --display-map -am !TBG_dstPath/tbg/openib.conf --mca mpi_leave_pinned 0 -npernode !TBG_gpusPerNode -n !TBG_tasks !TBG_dstPath/tbg/cpuNumaStarter.sh !TBG_dstPath/input/bin/picongpu !TBG_author !TBG_programParams | tee output
-fi
-
-mpiexec --prefix $MPIHOME -x LIBRARY_PATH -npernode !TBG_gpusPerNode -n !TBG_tasks /usr/bin/env bash -c "killall -9 picongpu 2>/dev/null || true"
diff --git a/etc/picongpu/hypnos-hzdr/laser_picongpu.profile.example b/etc/picongpu/hypnos-hzdr/laser_picongpu.profile.example
deleted file mode 100644
index 3752a60e84..0000000000
--- a/etc/picongpu/hypnos-hzdr/laser_picongpu.profile.example
+++ /dev/null
@@ -1,72 +0,0 @@
-# Name and Path of this Script ############################### (DO NOT change!)
-export PIC_PROFILE=$(cd $(dirname $BASH_SOURCE) && pwd)"/"$(basename $BASH_SOURCE)
-
-# User Information ################################# (edit the following lines)
-#   - automatically add your name and contact to output file meta data
-#   - send me mails on batch system job (b)egin, (e)nd, (a)bortion or (n)o mail
-export MY_MAILNOTIFY="n"
-export MY_MAIL="someone@example.com"
-export MY_NAME="$(whoami) <$MY_MAIL>"
-
-# Text Editor for Tools ###################################### (edit this line)
-#   - examples: "nano", "vim", "emacs -nw", "vi" or without terminal: "gedit"
-#export EDITOR="nano"
-
-# Modules #####################################################################
-#
-if [ -f /etc/profile.modules ]
-then
-        . /etc/profile.modules
-        module purge
-#       export MODULES_NO_OUTPUT=1
-
-        # Core Dependencies
-        module load gcc/5.3.0
-        module load cmake/3.13.4
-        module load boost/1.65.1
-        module load openmpi/1.8.6
-        module load numactl
-
-        # Plugins (optional)
-        module load zlib/1.2.8
-        module load pngwriter/0.7.0
-        module load hdf5-parallel/1.8.15 libsplash/1.7.0
-
-        # either use libSplash or ADIOS for file I/O
-        #module load adios/1.13.1
-
-        # Debug Tools
-        #module load gdb
-        #module load valgrind/3.8.1
-
-#       unset MODULES_NO_OUTPUT
-fi
-
-# Environment #################################################################
-#
-alias getNode='qsub -I -q laser -lwalltime=00:30:00 -lnodes=1:ppn=64'
-
-export PICSRC=/home/$(whoami)/src/picongpu
-export PIC_EXAMPLES=$PICSRC/share/picongpu/examples
-export PIC_BACKEND="omp2b:bdver1"
-
-export PATH=$PATH:$PICSRC
-export PATH=$PATH:$PICSRC/bin
-export PATH=$PATH:$PICSRC/src/splash2txt/build
-export PATH=$PATH:$PICSRC/src/tools/bin
-
-export PYTHONPATH=$PICSRC/lib/python:$PYTHONPATH
-
-# "tbg" default options #######################################################
-#   - PBS/Torque (qsub)
-#   - "laser" queue
-export TBG_SUBMIT="qsub"
-export TBG_TPLFILE="etc/picongpu/hypnos-hzdr/laser.tpl"
-
-# Load autocompletion for PIConGPU commands
-BASH_COMP_FILE=$PICSRC/bin/picongpu-completion.bash
-if [ -f $BASH_COMP_FILE ] ; then
-    source $BASH_COMP_FILE
-else
-    echo "bash completion file '$BASH_COMP_FILE' not found." >&2
-fi
diff --git a/etc/picongpu/jureca-jsc/batch.tpl b/etc/picongpu/jureca-jsc/batch.tpl
index e9e111fa5c..e7b9daadcf 100644
--- a/etc/picongpu/jureca-jsc/batch.tpl
+++ b/etc/picongpu/jureca-jsc/batch.tpl
@@ -1,5 +1,5 @@
 #!/usr/bin/env bash
-# Copyright 2013-2020 Axel Huebl, Richard Pausch, Rene Widera, Sergei Bastrakov
+# Copyright 2013-2021 Axel Huebl, Richard Pausch, Rene Widera, Sergei Bastrakov
 #
 # This file is part of PIConGPU.
 #
diff --git a/etc/picongpu/jureca-jsc/batch_picongpu.profile.example b/etc/picongpu/jureca-jsc/batch_picongpu.profile.example
index 9d46ab1898..5ea730e87e 100644
--- a/etc/picongpu/jureca-jsc/batch_picongpu.profile.example
+++ b/etc/picongpu/jureca-jsc/batch_picongpu.profile.example
@@ -24,7 +24,7 @@ jutil env activate -p $proj
 #
 module purge
 module load Intel/2019.0.117-GCC-7.3.0
-module load CMake/3.13.0
+module load CMake/3.15.0
 module load IntelMPI/2018.4.274
 module load Python/3.6.6
 module load Boost/1.68.0-Python-3.6.6
diff --git a/etc/picongpu/jureca-jsc/booster.tpl b/etc/picongpu/jureca-jsc/booster.tpl
index 5261491b6f..dc4fa6255e 100644
--- a/etc/picongpu/jureca-jsc/booster.tpl
+++ b/etc/picongpu/jureca-jsc/booster.tpl
@@ -1,5 +1,5 @@
 #!/usr/bin/env bash
-# Copyright 2013-2020 Axel Huebl, Richard Pausch, Rene Widera, Sergei Bastrakov
+# Copyright 2013-2021 Axel Huebl, Richard Pausch, Rene Widera, Sergei Bastrakov
 #
 # This file is part of PIConGPU.
 #
diff --git a/etc/picongpu/jureca-jsc/booster_picongpu.profile.example b/etc/picongpu/jureca-jsc/booster_picongpu.profile.example
index 20a5b0b51b..806c8fe123 100644
--- a/etc/picongpu/jureca-jsc/booster_picongpu.profile.example
+++ b/etc/picongpu/jureca-jsc/booster_picongpu.profile.example
@@ -25,7 +25,7 @@ jutil env activate -p $proj
 module purge
 module load Architecture/KNL
 module load Intel/2019.0.117-GCC-7.3.0
-module load CMake/3.12.3
+module load CMake/3.15.0
 module load IntelMPI/2018.4.274
 module load Python/3.6.6
 module load Boost/1.68.0-Python-3.6.6
diff --git a/etc/picongpu/jureca-jsc/gpus.tpl b/etc/picongpu/jureca-jsc/gpus.tpl
index a6bda9a5e2..8f3550bdbe 100644
--- a/etc/picongpu/jureca-jsc/gpus.tpl
+++ b/etc/picongpu/jureca-jsc/gpus.tpl
@@ -1,5 +1,5 @@
 #!/usr/bin/env bash
-# Copyright 2013-2020 Axel Huebl, Richard Pausch, Rene Widera, Sergei Bastrakov
+# Copyright 2013-2021 Axel Huebl, Richard Pausch, Rene Widera, Sergei Bastrakov
 #
 # This file is part of PIConGPU.
 #
@@ -93,7 +93,7 @@ if [ -f !TBG_dstPath/input/bin/cuda_memtest ] && [ !TBG_numHostedDevicesPerNode
   # Run CUDA memtest to check GPU's health
   srun --cpu_bind=sockets !TBG_dstPath/input/bin/cuda_memtest.sh
 else
-  echo "no binary 'cuda_memtest' available or compute node is not exclusively allocated, skip GPU memory test" >&2
+  echo "Note: GPU memory test was skipped as no binary 'cuda_memtest' available or compute node is not exclusively allocated. This does not affect PIConGPU, starting it now" >&2
 fi
 
 if [ $? -eq 0 ] ; then
diff --git a/etc/picongpu/jureca-jsc/gpus_picongpu.profile.example b/etc/picongpu/jureca-jsc/gpus_picongpu.profile.example
index 387fded36b..25b30b7f5d 100644
--- a/etc/picongpu/jureca-jsc/gpus_picongpu.profile.example
+++ b/etc/picongpu/jureca-jsc/gpus_picongpu.profile.example
@@ -25,7 +25,7 @@ jutil env activate -p $proj
 module purge
 module load GCC/7.3.0
 module load CUDA/9.2.88
-module load CMake/3.13.0
+module load CMake/3.15.0
 module load MVAPICH2/2.3-GDR
 module load Python/3.6.6
 
diff --git a/etc/picongpu/juwels-jsc/batch.tpl b/etc/picongpu/juwels-jsc/batch.tpl
index 2840946edc..64dd07439f 100644
--- a/etc/picongpu/juwels-jsc/batch.tpl
+++ b/etc/picongpu/juwels-jsc/batch.tpl
@@ -1,5 +1,5 @@
 #!/usr/bin/env bash
-# Copyright 2013-2020 Axel Huebl, Richard Pausch, Rene Widera, Sergei Bastrakov
+# Copyright 2013-2021 Axel Huebl, Richard Pausch, Rene Widera, Sergei Bastrakov
 #
 # This file is part of PIConGPU.
 #
@@ -32,7 +32,7 @@
 #SBATCH --mem=!TBG_memPerNode
 #SBATCH --mail-type=!TBG_mailSettings
 #SBATCH --mail-user=!TBG_mailAddress
-#SBATCH --workdir=!TBG_dstPath
+#SBATCH --chdir=!TBG_dstPath
 
 #SBATCH -o stdout
 #SBATCH -e stderr
diff --git a/etc/picongpu/juwels-jsc/batch_picongpu.profile.example b/etc/picongpu/juwels-jsc/batch_picongpu.profile.example
index a6b0260093..87331071d6 100644
--- a/etc/picongpu/juwels-jsc/batch_picongpu.profile.example
+++ b/etc/picongpu/juwels-jsc/batch_picongpu.profile.example
@@ -10,31 +10,41 @@ export MY_MAIL="someone@example.com"
 export MY_NAME="$(whoami) <$MY_MAIL>"
 
 # Project Information ######################################## (edit this line)
-#   - project account for computing time
-export proj=$(groups | awk '{print $4}')
-
+#   - project and account for allocation
+#
+#   `jutil user projects` will return a table of project associations.
+#   Each row contains: project,unixgroup,PI-uid,project-type,budget-accounts
+#   We need the first and last entry.
+#   Here: select the last available project.
+#   Alternative: Set proj, account manually
+export proj=$( jutil user projects --noheader | awk '{print $1}' | tail -n 1 )
+export account=$(jutil user projects -n | awk '{print $NF}' | tail -n 1)
 # Text Editor for Tools ###################################### (edit this line)
 #   - examples: "nano", "vim", "emacs -nw", "vi" or without terminal: "gedit"
 #export EDITOR="nano"
-
 # Set up environment, including $SCRATCH and $PROJECT
-jutil env activate -p $proj
+# Handle the case where no budget account is set ("-"): activate without --budget.
+if [ "$account" = "-" ]; then
+    jutil env activate --project $proj;
+else
+    jutil env activate --project $proj --budget $account
+fi
+
 
 # General modules #############################################################
 #
 module purge
-module load Intel/2019.0.117-GCC-7.3.0
-module load CMake/3.13.0
-module load IntelMPI/2018.4.274
-module load Python/3.6.6
-module load Boost/1.68.0-Python-3.6.6
+module load Intel/2020.2.254-GCC-9.3.0
+module load CMake/3.18.0
+module load IntelMPI/2019.8.254
+module load Python/3.8.5
+
+module load Boost/1.73.0
 
 # Other Software ##############################################################
 #
-module load zlib/.1.2.11
-module load HDF5/1.10.1
-module load libpng/.1.6.35
-export CMAKE_PREFIX_PATH=$EBROOTZLIB:$EBROOTLIBPNG:$CMAKE_PREFIX_PATH
+module load HDF5/1.10.6
+#export CMAKE_PREFIX_PATH=$EBROOTZLIB:$EBROOTLIBPNG:$CMAKE_PREFIX_PATH
 
 PARTITION_LIB=$PROJECT/lib_batch
 LIBSPLASH_ROOT=$PARTITION_LIB/libSplash
@@ -86,7 +96,7 @@ function getNode() {
     fi
     echo "Hint: please use 'srun --cpu_bind=sockets <COMMAND>' for launching multiple processes in the interactive mode"
     export OMP_NUM_THREADS=48
-    salloc --time=1:00:00 --nodes=$numNodes --ntasks-per-node=2 --mem=94000 -A $proj -p batch bash
+    salloc --time=1:00:00 --nodes=$numNodes --ntasks-per-node=2 --mem=94000 -A $account -p batch bash
 }
 
 # allocate an interactive shell for one hour
@@ -104,7 +114,7 @@ function getDevice() {
     fi
     echo "Hint: please use 'srun --cpu_bind=sockets <COMMAND>' for launching multiple processes in the interactive mode"
     export OMP_NUM_THREADS=48
-    salloc --time=1:00:00 --ntasks-per-node=$(($numDevices)) --mem=94000 -A $proj -p batch bash
+    salloc --time=1:00:00 --ntasks-per-node=$(($numDevices)) --mem=94000 -A $account -p batch bash
 }
 
 # Load autocompletion for PIConGPU commands
diff --git a/etc/picongpu/juwels-jsc/gpus.tpl b/etc/picongpu/juwels-jsc/gpus.tpl
index 6d8c717931..34a69e7414 100644
--- a/etc/picongpu/juwels-jsc/gpus.tpl
+++ b/etc/picongpu/juwels-jsc/gpus.tpl
@@ -1,5 +1,5 @@
 #!/usr/bin/env bash
-# Copyright 2013-2020 Axel Huebl, Richard Pausch, Rene Widera, Sergei Bastrakov
+# Copyright 2013-2021 Axel Huebl, Richard Pausch, Rene Widera, Sergei Bastrakov
 #
 # This file is part of PIConGPU.
 #
@@ -34,7 +34,7 @@
 #SBATCH --gres=gpu:!TBG_devicesPerNode
 #SBATCH --mail-type=!TBG_mailSettings
 #SBATCH --mail-user=!TBG_mailAddress
-#SBATCH --workdir=!TBG_dstPath
+#SBATCH --chdir=!TBG_dstPath
 
 #SBATCH -o stdout
 #SBATCH -e stderr
@@ -93,7 +93,7 @@ if [ -f !TBG_dstPath/input/bin/cuda_memtest ] && [ !TBG_numHostedDevicesPerNode
   # Run CUDA memtest to check GPU's health
   srun --cpu_bind=sockets !TBG_dstPath/input/bin/cuda_memtest.sh
 else
-  echo "no binary 'cuda_memtest' available or compute node is not exclusively allocated, skip GPU memory test" >&2
+  echo "Note: GPU memory test was skipped as no binary 'cuda_memtest' available or compute node is not exclusively allocated. This does not affect PIConGPU, starting it now" >&2
 fi
 
 if [ $? -eq 0 ] ; then
diff --git a/etc/picongpu/juwels-jsc/gpus_picongpu.profile.example b/etc/picongpu/juwels-jsc/gpus_picongpu.profile.example
index dd9470b2d8..508ba55ab9 100644
--- a/etc/picongpu/juwels-jsc/gpus_picongpu.profile.example
+++ b/etc/picongpu/juwels-jsc/gpus_picongpu.profile.example
@@ -10,44 +10,46 @@ export MY_MAIL="someone@example.com"
 export MY_NAME="$(whoami) <$MY_MAIL>"
 
 # Project Information ######################################## (edit this line)
-#   - project account for computing time
-export proj=$(groups | awk '{print $4}')
+#   - project and account for allocation
+#   jutil user projects will return a table of project associations.
+#   Each row contains: project,unixgroup,PI-uid,project-type,budget-accounts
+#   We need the first and last entry.
+#   Here: select the last available project.
+export proj=$( jutil user projects --noheader | awk '{print $1}' | tail -n 1 )
+export account=$(jutil user projects -n | awk '{print $NF}' | tail -n 1)
 
 # Text Editor for Tools ###################################### (edit this line)
 #   - examples: "nano", "vim", "emacs -nw", "vi" or without terminal: "gedit"
 #export EDITOR="nano"
 
 # Set up environment, including $SCRATCH and $PROJECT
-jutil env activate -p $proj
+# Handle a case where the budgeting account is not set.
+if [ "$account" = "-" ]; then
+    jutil env activate --project $proj;
+else
+    jutil env activate --project $proj --budget $account
+fi
 
 # General modules #############################################################
 #
 module purge
-module load GCC/7.3.0
-module load CUDA/9.2.88
-module load CMake/3.13.0
-module load MVAPICH2/2.3-GDR
-module load Python/3.6.6
+module load GCC/9.3.0
+module load CUDA/11.0
+module load CMake/3.18.0
+module load ParaStationMPI/5.4.7-1
+module load mpi-settings/CUDA
+module load Python/3.8.5
+
+module load Boost/1.74.0
+module load HDF5/1.10.6
+# necessary for evaluations (NumPy, SciPy, Matplotlib, SymPy, Pandas, IPython)
+module load SciPy-Stack/2020-Python-3.8.5
 
 # Other Software ##############################################################
 #
-module load zlib/.1.2.11
-module load libpng/.1.6.35
-export CMAKE_PREFIX_PATH=$EBROOTZLIB:$EBROOTLIBPNG:$CMAKE_PREFIX_PATH
-
-# This is required for Boost to have correct dynamic library dependencies
-module load ICU/61.1
-export LD_LIBRARY_PATH=$EBROOTICU/lib:$LD_LIBRARY_PATH
-
+# Manually installed libraries are stored in PARTITION_LIB
 PARTITION_LIB=$PROJECT/lib_gpus
-BOOST_ROOT=$PARTITION_LIB/boost
-export CMAKE_PREFIX_PATH=$BOOST_ROOT:$CMAKE_PREFIX_PATH
-export LD_LIBRARY_PATH=$BOOST_ROOT/lib:$LD_LIBRARY_PATH
 
-HDF5_ROOT=$PARTITION_LIB/hdf5
-export PATH=$HDF5_ROOT/bin:$PATH
-export CMAKE_PREFIX_PATH=$HDF5_ROOT:$CMAKE_PREFIX_PATH
-export LD_LIBRARY_PATH=$HDF5_ROOT/lib:$LD_LIBRARY_PATH
 
 LIBSPLASH_ROOT=$PARTITION_LIB/libSplash
 PNGWRITER_ROOT=$PARTITION_LIB/pngwriter
@@ -62,11 +64,8 @@ export PATH=$ADIOS_ROOT/bin:$PATH
 export CMAKE_PREFIX_PATH=$ADIOS_ROOT:$CMAKE_PREFIX_PATH
 
 
-export LD_LIBRARY_PATH=$EBROOTICU/lib:$LD_LIBRARY_PATH
-
 # Environment #################################################################
 #
-#export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$BOOST_LIB
 
 export PICSRC=$HOME/src/picongpu
 export PIC_EXAMPLES=$PICSRC/share/picongpu/examples
@@ -97,7 +96,7 @@ function getNode() {
         return 1
     fi
     echo "Hint: please use 'srun --cpu_bind=sockets <COMMAND>' for launching multiple processes in the interactive mode"
-    salloc --time=1:00:00 --nodes=$numNodes --ntasks-per-node=4 --gres=gpu:4 --mem=180000 -A $proj -p gpus bash
+    salloc --time=1:00:00 --nodes=$numNodes --ntasks-per-node=4 --gres=gpu:4 --mem=180000 -A $account -p gpus bash
 }
 
 # allocate an interactive shell for one hour
@@ -114,7 +113,7 @@ function getDevice() {
         fi
     fi
     echo "Hint: please use 'srun --cpu_bind=sockets <COMMAND>' for launching multiple processes in the interactive mode"
-    salloc --time=1:00:00 --ntasks-per-node=$(($numDevices)) --gres=gpu:4 --mem=180000 -A $proj -p gpus bash
+    salloc --time=1:00:00 --ntasks-per-node=$(($numDevices)) --gres=gpu:4 --mem=180000 -A $account -p gpus bash
 }
 
 # Load autocompletion for PIConGPU commands
diff --git a/etc/picongpu/lawrencium-lbnl/fermi.tpl b/etc/picongpu/lawrencium-lbnl/fermi.tpl
index 796562c1b2..d7be1d54d4 100644
--- a/etc/picongpu/lawrencium-lbnl/fermi.tpl
+++ b/etc/picongpu/lawrencium-lbnl/fermi.tpl
@@ -1,5 +1,5 @@
 #!/usr/bin/env bash
-# Copyright 2013-2020 Axel Huebl
+# Copyright 2013-2021 Axel Huebl
 #
 # This file is part of PIConGPU.
 #
@@ -106,7 +106,7 @@ if [ -f !TBG_dstPath/input/bin/cuda_memtest ] ; then
   # Run CUDA memtest to check GPU's health
   mpirun !TBG_dstPath/input/bin/cuda_memtest.sh
 else
-  echo "no binary 'cuda_memtest' available, skip GPU memory test" >&2
+  echo "Note: GPU memory test was skipped as no binary 'cuda_memtest' available. This does not affect PIConGPU, starting it now" >&2
 fi
 
 if [ $? -eq 0 ] ; then
diff --git a/etc/picongpu/lawrencium-lbnl/k20.tpl b/etc/picongpu/lawrencium-lbnl/k20.tpl
index 76518fbd1a..c2e7737ef5 100644
--- a/etc/picongpu/lawrencium-lbnl/k20.tpl
+++ b/etc/picongpu/lawrencium-lbnl/k20.tpl
@@ -1,5 +1,5 @@
 #!/usr/bin/env bash
-# Copyright 2013-2020 Axel Huebl
+# Copyright 2013-2021 Axel Huebl
 #
 # This file is part of PIConGPU.
 #
@@ -104,7 +104,7 @@ if [ -f !TBG_dstPath/input/bin/cuda_memtest ] ; then
   # Run CUDA memtest to check GPU's health
   mpirun !TBG_dstPath/input/bin/cuda_memtest.sh
 else
-  echo "no binary 'cuda_memtest' available, skip GPU memory test" >&2
+  echo "Note: GPU memory test was skipped as no binary 'cuda_memtest' available. This does not affect PIConGPU, starting it now" >&2
 fi
 
 if [ $? -eq 0 ] ; then
diff --git a/etc/picongpu/pizdaint-cscs/large.tpl b/etc/picongpu/pizdaint-cscs/large.tpl
index 41c87ffa7b..3bf5a81d26 100644
--- a/etc/picongpu/pizdaint-cscs/large.tpl
+++ b/etc/picongpu/pizdaint-cscs/large.tpl
@@ -1,5 +1,5 @@
 #!/bin/bash -l
-# Copyright 2013-2020 Axel Huebl, Richard Pausch, Rene Widera
+# Copyright 2013-2021 Axel Huebl, Richard Pausch, Rene Widera
 #
 # This file is part of PIConGPU.
 #
@@ -82,7 +82,7 @@ ln -s ../stdout output
 if [ -f !TBG_dstPath/input/bin/cuda_memtest ] ; then
   srun  -n !TBG_tasks !TBG_dstPath/input/bin/cuda_memtest.sh
 else
-  echo "no binary 'cuda_memtest' available, skip GPU memory test" >&2
+  echo "Note: GPU memory test was skipped as no binary 'cuda_memtest' available. This does not affect PIConGPU, starting it now" >&2
 fi
 
 if [ $? -eq 0 ] ; then
diff --git a/etc/picongpu/pizdaint-cscs/normal.tpl b/etc/picongpu/pizdaint-cscs/normal.tpl
index 5de12231bf..2743f2b3c5 100644
--- a/etc/picongpu/pizdaint-cscs/normal.tpl
+++ b/etc/picongpu/pizdaint-cscs/normal.tpl
@@ -1,5 +1,5 @@
 #!/bin/bash -l
-# Copyright 2013-2020 Axel Huebl, Richard Pausch, Rene Widera
+# Copyright 2013-2021 Axel Huebl, Richard Pausch, Rene Widera
 #
 # This file is part of PIConGPU.
 #
@@ -87,7 +87,7 @@ export PMI_NO_PREINITIALIZE=1
 if [ -f !TBG_dstPath/input/bin/cuda_memtest ] ; then
   srun  -n !TBG_tasks !TBG_dstPath/input/bin/cuda_memtest.sh
 else
-  echo "no binary 'cuda_memtest' available, skip GPU memory test" >&2
+  echo "Note: GPU memory test was skipped as no binary 'cuda_memtest' available. This does not affect PIConGPU, starting it now" >&2
 fi
 
 if [ $? -eq 0 ] ; then
diff --git a/etc/picongpu/pizdaint-cscs/picongpu.profile.example b/etc/picongpu/pizdaint-cscs/picongpu.profile.example
index 09a10ca0de..42def63ec6 100644
--- a/etc/picongpu/pizdaint-cscs/picongpu.profile.example
+++ b/etc/picongpu/pizdaint-cscs/picongpu.profile.example
@@ -40,7 +40,7 @@ export CXX=$(which CC)
 export CRAY_CPU_TARGET=x86-64
 
 # Libraries ###################################################################
-module load CMake/3.11.4
+module load CMake/3.15.0
 
 module load cray-mpich/7.6.0
 module load cray-hdf5-parallel/1.10.0.3
diff --git a/etc/picongpu/submitAction.sh b/etc/picongpu/submitAction.sh
index 064e343abc..fc2fa46bd9 100755
--- a/etc/picongpu/submitAction.sh
+++ b/etc/picongpu/submitAction.sh
@@ -1,5 +1,5 @@
 #!/usr/bin/env bash
-# Copyright 2013-2020 Axel Huebl, Rene Widera, Felix Schmitt
+# Copyright 2013-2021 Axel Huebl, Rene Widera, Felix Schmitt
 #
 # This file is part of PIConGPU.
 #
diff --git a/etc/picongpu/summit-ornl/gpu_batch.tpl b/etc/picongpu/summit-ornl/gpu_batch.tpl
index 95bc165ca6..7558e058b4 100644
--- a/etc/picongpu/summit-ornl/gpu_batch.tpl
+++ b/etc/picongpu/summit-ornl/gpu_batch.tpl
@@ -1,5 +1,5 @@
 #!/usr/bin/env bash
-# Copyright 2019-2020 Axel Huebl, Rene Widera
+# Copyright 2019-2021 Axel Huebl, Rene Widera
 #
 # This file is part of PIConGPU.
 #
@@ -86,6 +86,6 @@ cd simOutput
 
 #if [ $? -eq 0 ] ; then
 export OMP_NUM_THREADS=!TBG_coresPerGPU
-jsrun --nrs !TBG_tasks --tasks_per_rs 1 --cpu_per_rs !TBG_coresPerGPU --gpu_per_rs 1 --latency_priority GPU-CPU --bind rs --smpiargs="-gpu" !TBG_dstPath/input/bin/picongpu !TBG_author !TBG_programParams | tee output
+jsrun --nrs !TBG_tasks --tasks_per_rs 1 --cpu_per_rs !TBG_coresPerGPU --gpu_per_rs 1 --latency_priority GPU-CPU --bind rs --smpiargs="-gpu" !TBG_dstPath/input/bin/picongpu --mpiDirect !TBG_author !TBG_programParams | tee output
 # note: instead of the PIConGPU binary, one can also debug starting "js_task_info | sort"
 #fi
diff --git a/etc/picongpu/summit-ornl/gpu_picongpu.profile.example b/etc/picongpu/summit-ornl/gpu_picongpu.profile.example
index 6661b76b12..c3284054f1 100644
--- a/etc/picongpu/summit-ornl/gpu_picongpu.profile.example
+++ b/etc/picongpu/summit-ornl/gpu_picongpu.profile.example
@@ -18,20 +18,23 @@ export proj=<yourProject>
 #export EDITOR="nano"
 
 # basic environment ###########################################################
-module load gcc/6.4.0
+module load gcc/8.1.1
 
 export CC=$(which gcc)
 export CXX=$(which g++)
 
 # required tools and libs
 module load git
-module load cmake/3.14.2
-module load cuda/10.1.168
+module load cmake/3.18.2
+module load cuda/10.1.243
 module load boost/1.66.0
 
 # plugins (optional) ##########################################################
-module load hdf5/1.10.3
-module load adios/1.13.1-py2 c-blosc zfp sz lz4
+module load ums
+module load ums-aph114
+module load hdf5/1.10.4
+module load adios/1.13.1-py2 c-blosc zfp/0.5.5 sz lz4
+module load openpmd-api/0.12.0
 
 # optionally download libSplash and compile it yourself from
 #   https://github.com/ComputationalRadiationPhysics/libSplash/
diff --git a/etc/picongpu/taurus-tud/V100.tpl b/etc/picongpu/taurus-tud/V100.tpl
index 8f6dbbf922..821e3b60ca 100644
--- a/etc/picongpu/taurus-tud/V100.tpl
+++ b/etc/picongpu/taurus-tud/V100.tpl
@@ -1,5 +1,5 @@
 #!/usr/bin/env bash
-# Copyright 2013-2020 Axel Huebl, Richard Pausch, Alexander Debus, Klaus Steiniger
+# Copyright 2013-2021 Axel Huebl, Richard Pausch, Alexander Debus, Klaus Steiniger
 #
 # This file is part of PIConGPU.
 #
@@ -107,7 +107,7 @@ if [ -f !TBG_dstPath/input/bin/cuda_memtest ] ; then
   # Run CUDA memtest to check GPU's health
   srun -K1 !TBG_dstPath/input/bin/cuda_memtest.sh
 else
-  echo "no binary 'cuda_memtest' available, skip GPU memory test" >&2
+  echo "Note: GPU memory test was skipped as no binary 'cuda_memtest' available. This does not affect PIConGPU, starting it now" >&2
 fi
 
 if [ $? -eq 0 ] ; then
diff --git a/etc/picongpu/taurus-tud/V100_picongpu.profile.example b/etc/picongpu/taurus-tud/V100_picongpu.profile.example
index c7f71d5df2..2175755b40 100644
--- a/etc/picongpu/taurus-tud/V100_picongpu.profile.example
+++ b/etc/picongpu/taurus-tud/V100_picongpu.profile.example
@@ -19,7 +19,7 @@ module switch modenv/ml
 
 # load CUDA/9.2.88-GCC-7.3.0-2.30, also loads GCC/7.3.0-2.30, zlib, OpenMPI and others
 module load fosscuda/2018b
-module load CMake/3.11.4-GCCcore-7.3.0
+module load CMake/3.15.0-GCCcore-7.3.0
 module load libpng/1.6.34-GCCcore-7.3.0
 
 printf "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@\n"
diff --git a/etc/picongpu/taurus-tud/V100_restart.tpl b/etc/picongpu/taurus-tud/V100_restart.tpl
index 8e34ff0d1b..168932b76d 100644
--- a/etc/picongpu/taurus-tud/V100_restart.tpl
+++ b/etc/picongpu/taurus-tud/V100_restart.tpl
@@ -1,5 +1,5 @@
 #!/usr/bin/env bash
-# Copyright 2013-2020 Axel Huebl, Richard Pausch, Alexander Debus, Klaus Steiniger
+# Copyright 2013-2021 Axel Huebl, Richard Pausch, Alexander Debus, Klaus Steiniger
 #
 # This file is part of PIConGPU.
 #
@@ -175,7 +175,7 @@ if [ -f !TBG_dstPath/input/bin/cuda_memtest ] ; then
   # Run CUDA memtest to check GPU's health
   mpiexec -hostfile ../machinefile.txt !TBG_dstPath/input/bin/cuda_memtest.sh
 else
-  echo "no binary 'cuda_memtest' available, skip GPU memory test" >&2
+  echo "Note: GPU memory test was skipped as no binary 'cuda_memtest' available. This does not affect PIConGPU, starting it now" >&2
 fi
 
 if [ $? -eq 0 ] ; then
diff --git a/etc/picongpu/taurus-tud/k20x.tpl b/etc/picongpu/taurus-tud/k20x.tpl
index d7be22efe9..79dee88d89 100644
--- a/etc/picongpu/taurus-tud/k20x.tpl
+++ b/etc/picongpu/taurus-tud/k20x.tpl
@@ -1,5 +1,5 @@
 #!/usr/bin/env bash
-# Copyright 2013-2020 Axel Huebl, Richard Pausch
+# Copyright 2013-2021 Axel Huebl, Richard Pausch
 #
 # This file is part of PIConGPU.
 #
@@ -97,7 +97,7 @@ if [ -f !TBG_dstPath/input/bin/cuda_memtest ] ; then
   # Run CUDA memtest to check GPU's health
   srun -K1 !TBG_dstPath/input/bin/cuda_memtest.sh
 else
-  echo "no binary 'cuda_memtest' available, skip GPU memory test" >&2
+  echo "Note: GPU memory test was skipped as no binary 'cuda_memtest' available. This does not affect PIConGPU, starting it now" >&2
 fi
 
 if [ $? -eq 0 ] ; then
diff --git a/etc/picongpu/taurus-tud/k20x_picongpu.profile.example b/etc/picongpu/taurus-tud/k20x_picongpu.profile.example
index 37437867a9..ab3c69a9ba 100644
--- a/etc/picongpu/taurus-tud/k20x_picongpu.profile.example
+++ b/etc/picongpu/taurus-tud/k20x_picongpu.profile.example
@@ -22,7 +22,7 @@ export proj=$(groups | awk '{print $1}')
 module load modenv/scs5
 module load foss/2018a
 module load GCC/6.4.0-2.28
-module load CMake/3.11.4-GCCcore-6.4.0
+module load CMake/3.15.0-GCCcore-6.4.0
 module load CUDA/9.2.88  # gcc <= 7, intel 15-17
 module load OpenMPI/2.1.2-GCC-6.4.0-2.28
 
diff --git a/etc/picongpu/taurus-tud/k80.tpl b/etc/picongpu/taurus-tud/k80.tpl
index 279bdc9e43..777b920c23 100644
--- a/etc/picongpu/taurus-tud/k80.tpl
+++ b/etc/picongpu/taurus-tud/k80.tpl
@@ -1,5 +1,5 @@
 #!/usr/bin/env bash
-# Copyright 2013-2020 Axel Huebl, Richard Pausch
+# Copyright 2013-2021 Axel Huebl, Richard Pausch
 #
 # This file is part of PIConGPU.
 #
@@ -97,7 +97,7 @@ if [ -f !TBG_dstPath/input/bin/cuda_memtest ] ; then
   # Run CUDA memtest to check GPU's health
   srun -K1 !TBG_dstPath/input/bin/cuda_memtest.sh
 else
-  echo "no binary 'cuda_memtest' available, skip GPU memory test" >&2
+  echo "Note: GPU memory test was skipped as no binary 'cuda_memtest' available. This does not affect PIConGPU, starting it now" >&2
 fi
 
 if [ $? -eq 0 ] ; then
diff --git a/etc/picongpu/taurus-tud/k80_picongpu.profile.example b/etc/picongpu/taurus-tud/k80_picongpu.profile.example
index b7fc7d1c63..11dc9799aa 100644
--- a/etc/picongpu/taurus-tud/k80_picongpu.profile.example
+++ b/etc/picongpu/taurus-tud/k80_picongpu.profile.example
@@ -22,7 +22,7 @@ export proj=$(groups | awk '{print $1}')
 module load modenv/scs5
 module load foss/2018a
 module load GCC/6.4.0-2.28
-module load CMake/3.11.4-GCCcore-6.4.0
+module load CMake/3.15.0-GCCcore-6.4.0
 module load CUDA/9.2.88  # gcc <= 7, intel 15-17
 module load OpenMPI/2.1.2-GCC-6.4.0-2.28
 
diff --git a/etc/picongpu/taurus-tud/knl.tpl b/etc/picongpu/taurus-tud/knl.tpl
index a49a6742d1..7bc35023e8 100644
--- a/etc/picongpu/taurus-tud/knl.tpl
+++ b/etc/picongpu/taurus-tud/knl.tpl
@@ -1,5 +1,5 @@
 #!/usr/bin/env bash
-# Copyright 2013-2020 Axel Huebl, Richard Pausch, Alexander Matthes
+# Copyright 2013-2021 Axel Huebl, Richard Pausch, Alexander Matthes
 #
 # This file is part of PIConGPU.
 #
diff --git a/etc/picongpu/taurus-tud/knl_picongpu.profile.example b/etc/picongpu/taurus-tud/knl_picongpu.profile.example
index 4ff09580c2..53ca4ff717 100644
--- a/etc/picongpu/taurus-tud/knl_picongpu.profile.example
+++ b/etc/picongpu/taurus-tud/knl_picongpu.profile.example
@@ -22,7 +22,7 @@ export proj=$(groups | awk '{print $1}')
 module load modenv/scs5
 module load iimpi/2018a
 module load git/2.18.0-GCCcore-6.4.0
-module load CMake/3.11.4-GCCcore-7.3.0
+module load CMake/3.15.0-GCCcore-7.3.0
 module load Boost/1.66.0-intel-2018a
 module load HDF5/1.10.1-intel-2018a
 module load libpng/1.6.34-GCCcore-7.3.0
diff --git a/include/mpiInfo/CMakeLists.txt b/include/mpiInfo/CMakeLists.txt
index 43478f36a3..d77db5915a 100644
--- a/include/mpiInfo/CMakeLists.txt
+++ b/include/mpiInfo/CMakeLists.txt
@@ -1,5 +1,5 @@
 #
-# Copyright 2013-2020 Axel Huebl, Rene Widera, Felix Schmitt
+# Copyright 2013-2021 Axel Huebl, Rene Widera, Felix Schmitt
 #
 # This file is part of mpiInfo.
 #
@@ -22,7 +22,7 @@
 # Required cmake version
 ################################################################################
 
-cmake_minimum_required(VERSION 3.11.4)
+cmake_minimum_required(VERSION 3.15.0)
 
 
 ################################################################################
@@ -61,10 +61,10 @@ endif()
 # Language Flags
 ###############################################################################
 
-# enforce C++11
+# enforce C++14
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
 set(CMAKE_CXX_EXTENSIONS OFF)
-set(CMAKE_CXX_STANDARD 11)
+set(CMAKE_CXX_STANDARD 14)
 
 
 ################################################################################
diff --git a/include/mpiInfo/mpiInfo.cpp b/include/mpiInfo/mpiInfo.cpp
index e2836a3b38..53a6bff6c6 100644
--- a/include/mpiInfo/mpiInfo.cpp
+++ b/include/mpiInfo/mpiInfo.cpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020  Rene Widera
+/* Copyright 2013-2021  Rene Widera
  *
  * This file is part of mpiInfo.
  *
@@ -27,7 +27,15 @@
 #include <boost/program_options/variables_map.hpp>
 
 
-#define MPI_CHECK(cmd) {int error = cmd; if(error!=MPI_SUCCESS){printf("<%s>:%i ",__FILE__,__LINE__); throw std::runtime_error(std::string("[MPI] Error"));}}
+#define MPI_CHECK(cmd)                                                                                                \
+    {                                                                                                                 \
+        int error = cmd;                                                                                              \
+        if(error != MPI_SUCCESS)                                                                                      \
+        {                                                                                                             \
+            printf("<%s>:%i ", __FILE__, __LINE__);                                                                   \
+            throw std::runtime_error(std::string("[MPI] Error"));                                                     \
+        }                                                                                                             \
+    }
 
 namespace po = boost::program_options;
 
@@ -44,15 +52,12 @@ enum
  * name like p1223(Pid=1233) is than p1223
  * in some MPI implementation /mpich) the hostname is unique
  */
-void cleanHostname( char* name )
+void cleanHostname(char* name)
 {
-    for ( int i = 0; i < MPI_MAX_PROCESSOR_NAME; ++i )
+    for(int i = 0; i < MPI_MAX_PROCESSOR_NAME; ++i)
     {
-        if ( !( name[i] >= 'A' && name[i] <= 'Z' ) &&
-             !( name[i] >= 'a' && name[i] <= 'z' ) &&
-             !( name[i] >= '0' && name[i] <= '9' ) &&
-             !( name[i] == '_' ) &&
-             !( name[i] == '-' ) )
+        if(!(name[i] >= 'A' && name[i] <= 'Z') && !(name[i] >= 'a' && name[i] <= 'z')
+           && !(name[i] >= '0' && name[i] <= '9') && !(name[i] == '_') && !(name[i] == '-'))
         {
             name[i] = 0;
             return;
@@ -68,7 +73,7 @@ void cleanHostname( char* name )
  * from the master.
  *
  */
-int getHostRank( )
+int getHostRank()
 {
     char hostname[MPI_MAX_PROCESSOR_NAME];
     int length;
@@ -77,99 +82,102 @@ int getHostRank( )
     int totalnodes;
     int myrank;
 
-    MPI_CHECK( MPI_Get_processor_name( hostname, &length ) );
-    cleanHostname( hostname );
+    MPI_CHECK(MPI_Get_processor_name(hostname, &length));
+    cleanHostname(hostname);
     hostname[length++] = '\0';
 
-    //int totalnodes;
+    // int totalnodes;
 
-    MPI_CHECK( MPI_Comm_size( MPI_COMM_WORLD, &totalnodes ) );
-    MPI_CHECK( MPI_Comm_rank( MPI_COMM_WORLD, &myrank ) );
+    MPI_CHECK(MPI_Comm_size(MPI_COMM_WORLD, &totalnodes));
+    MPI_CHECK(MPI_Comm_rank(MPI_COMM_WORLD, &myrank));
 
-    if ( myrank == 0 )
+    if(myrank == 0)
     {
-
         std::map<std::string, int> hosts;
         hosts[hostname] = 0;
         hostRank = 0;
-        for ( int rank = 1; rank < totalnodes; ++rank )
+        for(int rank = 1; rank < totalnodes; ++rank)
         {
-
-            MPI_CHECK( MPI_Recv( hostname, MPI_MAX_PROCESSOR_NAME, MPI_CHAR, rank, gridHostnameTag, MPI_COMM_WORLD, MPI_STATUS_IGNORE ) );
-
-            //printf("Hostname: %s\n", hostname);
+            MPI_CHECK(MPI_Recv(
+                hostname,
+                MPI_MAX_PROCESSOR_NAME,
+                MPI_CHAR,
+                rank,
+                gridHostnameTag,
+                MPI_COMM_WORLD,
+                MPI_STATUS_IGNORE));
+
+            // printf("Hostname: %s\n", hostname);
             int hostrank = 0;
-            if ( hosts.count( hostname ) > 0 ) hostrank = hosts[hostname] + 1;
+            if(hosts.count(hostname) > 0)
+                hostrank = hosts[hostname] + 1;
 
-            MPI_CHECK( MPI_Send( &hostrank, 1, MPI_INT, rank, gridHostRankTag, MPI_COMM_WORLD ) );
+            MPI_CHECK(MPI_Send(&hostrank, 1, MPI_INT, rank, gridHostRankTag, MPI_COMM_WORLD));
 
             hosts[hostname] = hostrank;
-
-
         }
-
     }
     else
     {
-        MPI_CHECK( MPI_Send( hostname, length, MPI_CHAR, 0, gridHostnameTag, MPI_COMM_WORLD ) );
+        MPI_CHECK(MPI_Send(hostname, length, MPI_CHAR, 0, gridHostnameTag, MPI_COMM_WORLD));
 
-        MPI_CHECK( MPI_Recv( &hostRank, 1, MPI_INT, 0, gridHostRankTag, MPI_COMM_WORLD, MPI_STATUS_IGNORE ) );
+        MPI_CHECK(MPI_Recv(&hostRank, 1, MPI_INT, 0, gridHostRankTag, MPI_COMM_WORLD, MPI_STATUS_IGNORE));
         // if(hostRank!=0) hostRank--; //!\todo fix mpi hostrank start with 1
     }
 
     return hostRank;
 }
 
-int getMyRank( )
+int getMyRank()
 {
     int myrank;
-    MPI_CHECK( MPI_Comm_rank( MPI_COMM_WORLD, &myrank ) );
+    MPI_CHECK(MPI_Comm_rank(MPI_COMM_WORLD, &myrank));
     return myrank;
 }
 
-int getTotalRanks( )
+int getTotalRanks()
 {
     int totalnodes;
-    MPI_CHECK( MPI_Comm_size( MPI_COMM_WORLD, &totalnodes ) );
+    MPI_CHECK(MPI_Comm_size(MPI_COMM_WORLD, &totalnodes));
     return totalnodes;
 }
 
-int main( int argc, char** argv )
+int main(int argc, char** argv)
 {
     bool localRank = false;
     bool myRank = false;
     bool totalRank = false;
 
-    po::options_description desc( "Allowed options" );
-    desc.add_options( )
-        ( "help,h", "produce help message" )
-        ( "mpi_host_rank", po::value<bool > ( &localRank )->zero_tokens( ), "get local mpi rank" )
-        ( "mpi_rank", po::value<bool > ( &myRank )->zero_tokens( ), "get mpi rank" )
-        ( "mpi_size", po::value<bool > ( &totalRank )->zero_tokens( ), "get count of mpi ranks" );
+    po::options_description desc("Allowed options");
+    desc.add_options()(
+        "help,h",
+        "produce help message")("mpi_host_rank", po::value<bool>(&localRank)->zero_tokens(), "get local mpi rank")(
+        "mpi_rank",
+        po::value<bool>(&myRank)->zero_tokens(),
+        "get mpi rank")("mpi_size", po::value<bool>(&totalRank)->zero_tokens(), "get count of mpi ranks");
 
     // parse command line options and config file and store values in vm
     po::variables_map vm;
-    po::store( boost::program_options::parse_command_line( argc, argv, desc ), vm );
-    po::notify( vm );
+    po::store(boost::program_options::parse_command_line(argc, argv, desc), vm);
+    po::notify(vm);
 
     // print help message and quit simulation
-    if ( vm.count( "help" ) )
+    if(vm.count("help"))
     {
         std::cerr << desc << "\n";
-        return false;
+        return 0;
     }
 
-    MPI_CHECK( MPI_Init( &argc, &argv ) );
-    if ( localRank )
-        std::cout << "mpi_host_rank: " << getHostRank( ) << std::endl;
-    if ( myRank )
-        std::cout << "mpi_rank: " << getMyRank( ) << std::endl;
-    if ( totalRank )
-        std::cout << "mpi_size: " << getTotalRanks( ) << std::endl;
+    MPI_CHECK(MPI_Init(&argc, &argv));
+    if(localRank)
+        std::cout << "mpi_host_rank: " << getHostRank() << std::endl;
+    if(myRank)
+        std::cout << "mpi_rank: " << getMyRank() << std::endl;
+    if(totalRank)
+        std::cout << "mpi_size: " << getTotalRanks() << std::endl;
 
 
-    MPI_CHECK( MPI_Finalize( ) );
+    MPI_CHECK(MPI_Finalize());
 
     return 0;
 }
-
diff --git a/include/picongpu/ArgsParser.cpp b/include/picongpu/ArgsParser.cpp
index 766c7290cd..a497b08aba 100644
--- a/include/picongpu/ArgsParser.cpp
+++ b/include/picongpu/ArgsParser.cpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Felix Schmitt, Rene Widera,
+/* Copyright 2013-2021 Axel Huebl, Felix Schmitt, Rene Widera,
  *                     Benjamin Worpitz
  *
  * This file is part of PIConGPU.
@@ -34,74 +34,66 @@
 
 namespace picongpu
 {
-
-namespace
-{
-
-    /** Report deprecated parameters
-     *
-     * This function is meant to handle cases when some parameters are changed
-     * but the old versions temporarily kept for backward compatibility and
-     * deprecated. Notably, this applies to compile-time parameters getting a
-     * run-time version. Hence it deliberately ignores incapsulation and code
-     * duplication and simply has a hardcoded set of cases.
-     */
-    void reportDeprecated( boost::program_options::variables_map const & vm )
+    namespace
     {
-        using pmacc::log;
-        using Level = PIConGPUVerbose::PHYSICS;
-
-        /* Moving window: a new run-time parameter 'windowMovePoint' to replace
-         * compile-time 'movePoint' variable
+        /** Report deprecated parameters
+         *
+         * This function is meant to handle cases when some parameters are changed
+         * but the old versions are temporarily kept for backward compatibility and
+         * deprecated. Notably, this applies to compile-time parameters getting a
+         * run-time version. Hence it deliberately ignores encapsulation and code
+         * duplication and simply has a hardcoded set of cases.
          */
-        bool isMovingWindowEnabled = !vm[ "moving" ].empty();
-        if( isMovingWindowEnabled )
+        void reportDeprecated(boost::program_options::variables_map const& vm)
         {
-            bool isWindowMovePointSet = !vm[ "windowMovePoint" ].defaulted( );
-            if( !isWindowMovePointSet )
-                log< Level >(
-                    "Warning: Compile-time variable 'movePoint' in grid.param "
-                    "is deprecated. It is currently still required for "
-                    "building purposes. Please keep the variable in your "
-                    "grid.param, but for future compatibility set this value "
-                    "using the 'windowMovePoint' parameter in your .cfg file. "
-                    "The value of movePoint is the default for windowMovePoint, "
-                    "setting the latter explicitly will override this."
-                );
+            using pmacc::log;
+            using Level = PIConGPUVerbose::PHYSICS;
+
+            /* Moving window: a new run-time parameter 'windowMovePoint' to replace
+             * compile-time 'movePoint' variable
+             */
+            bool isMovingWindowEnabled = !vm["moving"].empty();
+            if(isMovingWindowEnabled)
+            {
+                bool isWindowMovePointSet = !vm["windowMovePoint"].defaulted();
+                if(!isWindowMovePointSet)
+                    log<Level>("Warning: Compile-time variable 'movePoint' in grid.param "
+                               "is deprecated. It is currently still required for "
+                               "building purposes. Please keep the variable in your "
+                               "grid.param, but for future compatibility set this value "
+                               "using the 'windowMovePoint' parameter in your .cfg file. "
+                               "The value of movePoint is the default for windowMovePoint, "
+                               "setting the latter explicitly will override this.");
+            }
         }
-    }
 
-} // anonymous namespace
+    } // anonymous namespace
 
-    ArgsParser::ArgsParser( )
+    ArgsParser::ArgsParser()
     {
-
     }
 
-    ArgsParser::ArgsParser( ArgsParser& )
+    ArgsParser::ArgsParser(ArgsParser&)
     {
-
     }
 
-    template <class T>
-    bool from_string( T& t,
-                      const std::string& s,
-                      std::ios_base& ( *f )( std::ios_base& ) )
+    template<class T>
+    bool from_string(T& t, const std::string& s, std::ios_base& (*f)(std::ios_base&) )
     {
-        std::istringstream iss( s );
-        if ( ( iss >> f >> t ).fail( ) )
-            throw std::invalid_argument( "convertion invalid!" );
+        std::istringstream iss(s);
+        if((iss >> f >> t).fail())
+            throw std::invalid_argument("convertion invalid!");
 
         return true;
     }
 
-    ArgsParser& ArgsParser::getInstance( )
+    ArgsParser& ArgsParser::getInstance()
     {
         static ArgsParser instance;
         return instance;
     }
 
-    ArgsParser::Status ArgsParser::parse( int argc, char** argv )
+    ArgsParser::Status ArgsParser::parse(int argc, char** argv)
     {
         namespace po = boost::program_options;
 
@@ -111,67 +103,66 @@ namespace
             std::stringstream desc_stream;
             desc_stream << "Usage picongpu [-d dx=1 dy=1 dz=1] -g width height depth [options]" << std::endl;
 
-            po::options_description desc( desc_stream.str( ) );
+            po::options_description desc(desc_stream.str());
 
             std::vector<std::string> config_files;
 
             // add possible options
-            desc.add_options()
-                    ( "help,h", "print help message and exit" )
-                    ( "validate", "validate command line parameters and exit" )
-                    ( "version,v", "print version information and exit" )
-                    ( "config,c", po::value<std::vector<std::string> > ( &config_files )->multitoken( ), "Config file(s)" )
-                    ;
+            desc.add_options()("help,h", "print help message and exit")(
+                "validate",
+                "validate command line parameters and exit")("version,v", "print version information and exit")(
+                "config,c",
+                po::value<std::vector<std::string>>(&config_files)->multitoken(),
+                "Config file(s)");
 
             // add all options from plugins
-            for ( std::list<po::options_description>::iterator iter = options.begin( );
-                  iter != options.end( ); ++iter )
-                desc.add( *iter );
+            for(std::list<po::options_description>::iterator iter = options.begin(); iter != options.end(); ++iter)
+                desc.add(*iter);
 
             // parse command line options and config file and store values in vm
             po::variables_map vm;
-            //log<picLog::SIMULATION_STATE > ("parsing command line");
-            po::store( po::parse_command_line( argc, argv, desc ), vm );
+            // log<picLog::SIMULATION_STATE > ("parsing command line");
+            po::store(po::parse_command_line(argc, argv, desc), vm);
 
-            if ( vm.count( "config" ) )
+            if(vm.count("config"))
             {
-                std::vector<std::string> conf_files = vm["config"].as<std::vector<std::string> >( );
+                std::vector<std::string> conf_files = vm["config"].as<std::vector<std::string>>();
 
-                for ( std::vector<std::string>::const_iterator iter = conf_files.begin( );
-                      iter != conf_files.end( ); ++iter )
+                for(std::vector<std::string>::const_iterator iter = conf_files.begin(); iter != conf_files.end();
+                    ++iter)
                 {
-                    //log<picLog::SIMULATION_STATE > ("parsing config file '%1%'") % (*iter);
-                    std::ifstream config_file_stream( iter->c_str( ) );
-                    po::store( po::parse_config_file( config_file_stream, desc ), vm );
+                    // log<picLog::SIMULATION_STATE > ("parsing config file '%1%'") % (*iter);
+                    std::ifstream config_file_stream(iter->c_str());
+                    po::store(po::parse_config_file(config_file_stream, desc), vm);
                 }
             }
 
-            po::notify( vm );
+            po::notify(vm);
 
             // print help message and quit simulation
-            if ( vm.count( "help" ) )
+            if(vm.count("help"))
             {
                 std::cout << desc << "\n";
                 return Status::successExit;
             }
             // print versions of dependent software
-            if ( vm.count( "version" ) )
+            if(vm.count("version"))
             {
-                void( getSoftwareVersions( std::cout ) );
+                void(getSoftwareVersions(std::cout));
                 return Status::successExit;
             }
             // no parameters set: required parameters (e.g., -g) will be missing
             // -> obvious wrong usage
             // -> print help and exit with error code
-            if ( argc == 1 ) // argc[0] is always the program name
+            if(argc == 1) // argv[0] is always the program name
             {
                 std::cerr << desc << "\n";
                 return Status::error;
             }
 
-            reportDeprecated( vm );
+            reportDeprecated(vm);
 
-            if ( vm.count( "validate" ) )
+            if(vm.count("validate"))
             {
                 /* if we reach this part of code the parameters are valid
                  * and the option `validate` is set.
@@ -179,7 +170,7 @@ namespace
                 return Status::successExit;
             }
         }
-        catch ( const po::error& e )
+        catch(const po::error& e)
         {
             std::cerr << e.what() << std::endl;
             return Status::error;
@@ -188,4 +179,4 @@ namespace
         return Status::success;
     }
 
-}
+} // namespace picongpu
diff --git a/include/picongpu/ArgsParser.hpp b/include/picongpu/ArgsParser.hpp
index 4f39455e59..9f4808069f 100644
--- a/include/picongpu/ArgsParser.hpp
+++ b/include/picongpu/ArgsParser.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Felix Schmitt, Rene Widera,
+/* Copyright 2013-2021 Axel Huebl, Felix Schmitt, Rene Widera,
  *                     Benjamin Worpitz
  *
  * This file is part of PIConGPU.
@@ -41,7 +41,6 @@ namespace picongpu
     class ArgsParser
     {
     public:
-
         //! Parsing status
         enum Status
         {
@@ -69,7 +68,7 @@ namespace picongpu
          * @param argv command line arguments
          * @return parsing status
          */
-        Status parse(int argc, char **argv);
+        Status parse(int argc, char** argv);
 
     private:
         /**
@@ -82,4 +81,4 @@ namespace picongpu
         std::list<po::options_description> options;
     };
 
-}
+} // namespace picongpu
diff --git a/include/picongpu/CMakeLists.txt b/include/picongpu/CMakeLists.txt
index 618658b450..f0147788a8 100644
--- a/include/picongpu/CMakeLists.txt
+++ b/include/picongpu/CMakeLists.txt
@@ -1,5 +1,6 @@
-# Copyright 2013-2020 Axel Huebl, Benjamin Schneider, Felix Schmitt, Heiko Burau,
-#                     Rene Widera, Alexander Grund, Alexander Matthes
+# Copyright 2013-2021 Axel Huebl, Benjamin Schneider, Felix Schmitt, Heiko Burau,
+#                     Rene Widera, Alexander Grund, Alexander Matthes,
+#                     Franz Poeschel, Richard Pausch
 #
 # This file is part of PIConGPU.
 #
@@ -22,7 +23,7 @@
 # Required cmake version
 ################################################################################
 
-cmake_minimum_required(VERSION 3.11.4)
+cmake_minimum_required(VERSION 3.15.0)
 
 
 ################################################################################
@@ -42,6 +43,7 @@ list(APPEND CMAKE_PREFIX_PATH "$ENV{CUDA_ROOT}")
 list(APPEND CMAKE_PREFIX_PATH "$ENV{BOOST_ROOT}")
 list(APPEND CMAKE_PREFIX_PATH "$ENV{HDF5_ROOT}")
 list(APPEND CMAKE_PREFIX_PATH "$ENV{ADIOS_ROOT}")
+list(APPEND CMAKE_PREFIX_PATH "$ENV{OPENPMD_ROOT}")
 # Add from environment after specific env vars
 list(APPEND CMAKE_PREFIX_PATH "$ENV{CMAKE_PREFIX_PATH}")
 
@@ -62,10 +64,10 @@ endif()
 # Language Flags
 ###############################################################################
 
-# enforce C++11
+# enforce C++14
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
 set(CMAKE_CXX_EXTENSIONS OFF)
-set(CMAKE_CXX_STANDARD 11)
+set(CMAKE_CXX_STANDARD 14)
 
 
 ################################################################################
@@ -118,27 +120,13 @@ set(LIBS ${LIBS} ${CMAKE_THREAD_LIBS_INIT})
 # Find OpenMP
 ################################################################################
 
-find_package(OpenMP)
-if(OPENMP_FOUND)
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
-endif()
-
-
-################################################################################
-# Find mallocMC
-################################################################################
-if(ALPAKA_ACC_GPU_CUDA_ENABLE)
-    find_package(mallocMC 2.3.0 QUIET)
-
-    if(NOT mallocMC_FOUND)
-        message(STATUS "Using mallocMC from thirdParty/ directory")
-        set(MALLOCMC_ROOT "${PIConGPUapp_SOURCE_DIR}/../../thirdParty/mallocMC")
-        find_package(mallocMC 2.3.0 REQUIRED)
-    endif(NOT mallocMC_FOUND)
-
-    include_directories(SYSTEM ${mallocMC_INCLUDE_DIRS})
-    add_definitions(${mallocMC_DEFINITIONS})
-    set(LIBS ${LIBS} ${mallocMC_LIBRARIES})
+if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang" AND ALPAKA_ACC_GPU_CUDA_ENABLE AND ALPAKA_CUDA_COMPILER MATCHES "clang")
+    message(WARNING "OpenMP host side acceleration is disabled: CUDA compilation with clang is not supporting OpenMP.")
+else()
+    find_package(OpenMP)
+    if(OPENMP_FOUND)
+        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
+    endif()
 endif()
 
 
@@ -233,21 +221,6 @@ set(PIC_VERBOSE "1" CACHE STRING
 add_definitions(-DPIC_VERBOSE_LVL=${PIC_VERBOSE})
 
 
-################################################################################
-# ADIOS
-################################################################################
-
-# find adios installation
-#   set(ADIOS_USE_STATIC_LIBS ON) # force static linking
-find_package(ADIOS 1.13.1)
-
-if(ADIOS_FOUND)
-    add_definitions(-DENABLE_ADIOS=1)
-    include_directories(SYSTEM ${ADIOS_INCLUDE_DIRS})
-    set(LIBS ${LIBS} ${ADIOS_LIBRARIES})
-endif(ADIOS_FOUND)
-
-
 ################################################################################
 # Additional defines for PIConGPU outputs
 ################################################################################
@@ -293,6 +266,66 @@ elseif(MSVC)
 endif()
 
 
+################################################################################
+# openPMD
+################################################################################
+
+# find openPMD installation
+find_package(openPMD 0.12.0 CONFIG COMPONENTS MPI)
+
+if(openPMD_FOUND)
+    if(openPMD_HAVE_ADIOS2 OR openPMD_HAVE_HDF5)
+        message(STATUS "Found openPMD: ${openPMD_DIR}")
+        add_definitions(-DENABLE_OPENPMD=1)
+
+        # none of these should appear in cmake-gui, so make them internal
+        set(JSON_BuildTests OFF CACHE INTERNAL "")
+        set(JSON_MultipleHeaders OFF CACHE INTERNAL "")
+        set(JSON_ImplicitConversions OFF CACHE INTERNAL "")
+        set(JSON_Install OFF CACHE INTERNAL "")  # only used PRIVATE
+
+        # allow to use externally installed nlohmann_json
+        set(
+            PIC_nlohmann_json_PROVIDER "intern" CACHE
+            STRING "Use internally shipped or external nlohmann_json library.")
+        set_property(
+            CACHE PIC_nlohmann_json_PROVIDER
+            PROPERTY STRINGS "intern;extern")
+        mark_as_advanced(PIC_nlohmann_json_PROVIDER)
+        # Pass the variable name: if() dereferences it, an empty value stays valid.
+        if(PIC_nlohmann_json_PROVIDER STREQUAL "intern")
+            add_subdirectory("${PIConGPUapp_SOURCE_DIR}/../../thirdParty/nlohmann_json"
+                "${CMAKE_CURRENT_BINARY_DIR}/build_nlohmann_json")
+        else()
+            find_package(nlohmann_json 3.9.1 CONFIG REQUIRED)
+            message(STATUS "nlohmann-json: Found version '${nlohmann_json_VERSION}'")
+        endif()
+        set(LIBS ${LIBS} openPMD::openPMD)
+    else()
+        message(STATUS "Found openPMD at ${openPMD_DIR}, but PIConGPU requires"
+                       " availability of either its ADIOS2 or HDF5 backend - "
+                       "NOT BUILDING the openPMD plugin")
+    endif()
+else(openPMD_FOUND)
+    message(STATUS "Could NOT find openPMD - set openPMD_DIR or check your CMAKE_PREFIX_PATH")
+endif(openPMD_FOUND)
+
+
+################################################################################
+# ADIOS
+################################################################################
+
+# find adios installation
+#   set(ADIOS_USE_STATIC_LIBS ON) # force static linking
+find_package(ADIOS 1.13.1)
+
+if(ADIOS_FOUND)
+    add_definitions(-DENABLE_ADIOS=1)
+    include_directories(SYSTEM ${ADIOS_INCLUDE_DIRS})
+    set(LIBS ${LIBS} ${ADIOS_LIBRARIES})
+endif(ADIOS_FOUND)
+
+
 ################################################################################
 # libSplash (+ hdf5 due to required headers)
 ################################################################################
@@ -368,6 +401,21 @@ else(ISAAC_FOUND)
     endif()
 endif(ISAAC_FOUND)
 
+################################################################################
+# PIConGPU Workarounds
+################################################################################
+
+set(PIC_COMPUTE_CURRENT_THREAD_LIMITER_DEFAULT OFF)
+if(ALPAKA_ACC_GPU_HIP_ENABLE)
+    set(PIC_COMPUTE_CURRENT_THREAD_LIMITER_DEFAULT ON)
+endif()
+option(PIC_COMPUTE_CURRENT_THREAD_LIMITER "Compute current results with HIP alpaka backend are wrong when more threads than particles in a frame will be used (possible compiler BUG).\
+ ON means the number of threads will be limited to number of particles in a frame." ${PIC_COMPUTE_CURRENT_THREAD_LIMITER_DEFAULT})
+
+if(PIC_COMPUTE_CURRENT_THREAD_LIMITER)
+    add_definitions(-DPIC_COMPUTE_CURRENT_THREAD_LIMITER=1)
+endif()
+
 ################################################################################
 # Check if PIC_EXTENSION_PATH is relative or absolute
 ################################################################################
@@ -416,6 +464,16 @@ cupla_add_executable(picongpu
 
 target_link_libraries(picongpu PUBLIC ${LIBS} picongpu-hostonly)
 
+if(TARGET nlohmann_json::nlohmann_json)
+    # The target is created only together with the openPMD plugin setup. Its headers
+    # are added as SYSTEM includes to quiet their warnings (doesn't work for nvcc??).
+    target_include_directories(
+        picongpu-hostonly
+        SYSTEM PRIVATE
+        $<TARGET_PROPERTY:nlohmann_json::nlohmann_json,INTERFACE_INCLUDE_DIRECTORIES>)
+    target_link_libraries(picongpu-hostonly PRIVATE nlohmann_json::nlohmann_json)
+endif()
+
 
 ################################################################################
 # Clang-Tidy (3.9+) Target for CI
@@ -456,7 +514,7 @@ if(${CLANG_TIDY_RETURN} EQUAL 0)
             # -checks='-*,modernize-use-using'
             # -fix # -fix-errors
             --
-            -std=c++11
+            -std=c++14
             ${OpenMP_CXX_FLAGS}
             ${ALL_INCLUDES_STR}
             ${ALL_DEFINES_STR}
diff --git a/include/picongpu/_defaultParam.loader b/include/picongpu/_defaultParam.loader
index 0f6362d47f..ab50dd3769 100644
--- a/include/picongpu/_defaultParam.loader
+++ b/include/picongpu/_defaultParam.loader
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Heiko Burau, Rene Widera, Marco Garten, Finn-Ole Carstens
+/* Copyright 2013-2021 Axel Huebl, Heiko Burau, Rene Widera, Marco Garten, Finn-Ole Carstens
  *
  * This file is part of PIConGPU.
  *
@@ -26,12 +26,12 @@
 #pragma once
 
 #include "picongpu/param/dimension.param"
-#if( PMACC_CUDA_ENABLED == 1 )
-#   include "picongpu/param/mallocMC.param"
+#include "picongpu/param/precision.param"
+#if(BOOST_LANG_CUDA || BOOST_COMP_HIP)
+#    include "picongpu/param/mallocMC.param"
 #endif
 #include "picongpu/param/memory.param"
 #include "picongpu/param/random.param"
-#include "picongpu/param/precision.param"
 #include "picongpu/param/physicalConstants.param"
 #include "picongpu/param/flylite.param"
 #include "picongpu/param/speciesConstants.param"
@@ -46,8 +46,8 @@
 #include "picongpu/param/pml.param"
 #include "picongpu/param/unit.param"
 #include "picongpu/param/particleFilters.param"
-#if( PMACC_CUDA_ENABLED == 1 )
-#   include "picongpu/param/bremsstrahlung.param"
+#if(PMACC_CUDA_ENABLED == 1)
+#    include "picongpu/param/bremsstrahlung.param"
 #endif
 #include "picongpu/param/radiation.param"
 #include "picongpu/param/transitionRadiation.param"
@@ -65,3 +65,6 @@
 #include "picongpu/param/isaac.param"
 #include "picongpu/param/radiationObserver.param"
 #include "picongpu/param/particleMerger.param"
+#if(ENABLE_OPENPMD == 1)
+#    include "picongpu/param/xrayScattering.param"
+#endif
diff --git a/include/picongpu/_defaultUnitless.loader b/include/picongpu/_defaultUnitless.loader
index 4a2679af5a..24cf2a081d 100644
--- a/include/picongpu/_defaultUnitless.loader
+++ b/include/picongpu/_defaultUnitless.loader
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Heiko Burau, Rene Widera, Felix Schmitt,
+/* Copyright 2013-2021 Axel Huebl, Heiko Burau, Rene Widera, Felix Schmitt,
  *                     Marco Garten, Finn-Ole Carstens
  *
  * This file is part of PIConGPU.
@@ -39,7 +39,7 @@
 #include "picongpu/unitless/speciesInitialization.unitless"
 #include "picongpu/unitless/fieldBackground.unitless"
 #include "picongpu/unitless/synchrotronPhotons.unitless"
-#if( PMACC_CUDA_ENABLED == 1 )
+#if(PMACC_CUDA_ENABLED == 1)
 #    include "picongpu/unitless/bremsstrahlung.unitless"
 #endif
 
diff --git a/include/picongpu/algorithms/AssignedTrilinearInterpolation.hpp b/include/picongpu/algorithms/AssignedTrilinearInterpolation.hpp
index 80eb489cb8..0b7e34abe2 100644
--- a/include/picongpu/algorithms/AssignedTrilinearInterpolation.hpp
+++ b/include/picongpu/algorithms/AssignedTrilinearInterpolation.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Heiko Burau, Rene Widera
+/* Copyright 2013-2021 Axel Huebl, Heiko Burau, Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -32,28 +32,21 @@ namespace picongpu
 
 namespace pmacc
 {
-namespace result_of
-{
-
-    template< typename T_Cursor >
-    struct Functor<
-        picongpu::AssignedTrilinearInterpolation,
-        T_Cursor
-    >
+    namespace result_of
     {
-        using type =
-            typename boost::remove_reference< typename T_Cursor::type >::type;
-    };
+        template<typename T_Cursor>
+        struct Functor<picongpu::AssignedTrilinearInterpolation, T_Cursor>
+        {
+            using type = typename boost::remove_reference<typename T_Cursor::type>::type;
+        };
 
-} // result_of
-} // pmacc
+    } // namespace result_of
+} // namespace pmacc
 
 namespace picongpu
 {
-
     struct AssignedTrilinearInterpolation
     {
-
         /** Does a 3D trilinear field-to-point interpolation for
          * arbitrary assignment function and arbitrary field_value types.
          *
@@ -67,95 +60,57 @@ namespace picongpu
          *
          * interpolate on grid points in range [T_begin;T_end]
          */
-        template<
-            typename T_AssignmentFunction,
-            int T_begin,
-            int T_end,
-            typename T_Cursor
-        >
-        HDINLINE static
-        auto
-        interpolate(
-            const T_Cursor& cursor,
-            const float3_X & pos
-        )
-        -> typename ::pmacc::result_of::Functor<
-            AssignedTrilinearInterpolation,
-            T_Cursor
-        >::type
+        template<typename T_AssignmentFunction, int T_begin, int T_end, typename T_Cursor>
+        HDINLINE static auto interpolate(const T_Cursor& cursor, const float3_X& pos) ->
+            typename ::pmacc::result_of::Functor<AssignedTrilinearInterpolation, T_Cursor>::type
         {
-            using type = typename ::pmacc::result_of::Functor<
-                AssignedTrilinearInterpolation,
-                T_Cursor
-            >::type;
+            using type = typename ::pmacc::result_of::Functor<AssignedTrilinearInterpolation, T_Cursor>::type;
 
-            type result_z = type( 0.0 );
-            for( int z = T_begin; z <= T_end; ++z )
+            type result_z = type(0.0);
+            for(int z = T_begin; z <= T_end; ++z)
             {
-                type result_y = type( 0.0 );
-                for( int y = T_begin; y <= T_end; ++y )
+                type result_y = type(0.0);
+                for(int y = T_begin; y <= T_end; ++y)
                 {
-                    type result_x = type( 0.0 );
-                    for( int x = T_begin; x <= T_end; ++x )
+                    type result_x = type(0.0);
+                    for(int x = T_begin; x <= T_end; ++x)
                         /* a form factor is the "amount of particle" that is affected by this cell
                          * so we have to sum over: cell_value * form_factor
                          */
-                        result_x += *cursor( x, y, z ) * T_AssignmentFunction()( float_X( x ) - pos.x() );
+                        result_x += *cursor(x, y, z) * T_AssignmentFunction()(float_X(x) - pos.x());
 
-                    result_y += result_x * T_AssignmentFunction()( float_X( y ) - pos.y() );
+                    result_y += result_x * T_AssignmentFunction()(float_X(y) - pos.y());
                 }
 
-                result_z += result_y * T_AssignmentFunction()( float_X( z ) - pos.z() );
+                result_z += result_y * T_AssignmentFunction()(float_X(z) - pos.z());
             }
             return result_z;
         }
 
         /** Implementation for 2D position*/
-        template<
-            class T_AssignmentFunction,
-            int T_begin,
-            int T_end,
-            class T_Cursor
-        >
-        HDINLINE static
-        auto
-        interpolate(
-            T_Cursor const & cursor,
-            float2_X const & pos
-        )
-        -> typename ::pmacc::result_of::Functor<
-            AssignedTrilinearInterpolation,
-            T_Cursor
-        >::type
+        template<class T_AssignmentFunction, int T_begin, int T_end, class T_Cursor>
+        HDINLINE static auto interpolate(T_Cursor const& cursor, float2_X const& pos) ->
+            typename ::pmacc::result_of::Functor<AssignedTrilinearInterpolation, T_Cursor>::type
         {
-            using type = typename ::pmacc::result_of::Functor<
-                AssignedTrilinearInterpolation,
-                T_Cursor
-            >::type;
+            using type = typename ::pmacc::result_of::Functor<AssignedTrilinearInterpolation, T_Cursor>::type;
 
-            type result_y = type( 0.0 );
-            for( int y = T_begin; y <= T_end; ++y )
+            type result_y = type(0.0);
+            for(int y = T_begin; y <= T_end; ++y)
             {
-                type result_x = type( 0.0 );
-                for( int x = T_begin; x <= T_end; ++x )
-                    //a form factor is the "amount of particle" that is affected by this cell
-                    //so we have to sum over: cell_value * form_factor
-                    result_x += *cursor(x, y ) * T_AssignmentFunction()( float_X( x ) - pos.x() );
+                type result_x = type(0.0);
+                for(int x = T_begin; x <= T_end; ++x)
+                    // a form factor is the "amount of particle" that is affected by this cell
+                    // so we have to sum over: cell_value * form_factor
+                    result_x += *cursor(x, y) * T_AssignmentFunction()(float_X(x) - pos.x());
 
-                result_y += result_x * T_AssignmentFunction()( float_X( y ) - pos.y() );
+                result_y += result_x * T_AssignmentFunction()(float_X(y) - pos.y());
             }
             return result_y;
         }
 
-        static
-        auto
-        getStringProperties()
-        -> pmacc::traits::StringProperty
+        static auto getStringProperties() -> pmacc::traits::StringProperty
         {
-            pmacc::traits::StringProperty propList(
-                "name",
-                "uniform"
-            );
+            pmacc::traits::StringProperty propList("name", "uniform");
             return propList;
         }
     };
diff --git a/include/picongpu/algorithms/DifferenceToLower.def b/include/picongpu/algorithms/DifferenceToLower.def
deleted file mode 100644
index 6a24895ea9..0000000000
--- a/include/picongpu/algorithms/DifferenceToLower.def
+++ /dev/null
@@ -1,37 +0,0 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera, Axel Huebl
- *
- * This file is part of PIConGPU.
- *
- * PIConGPU is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * PIConGPU is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with PIConGPU.
- * If not, see <http://www.gnu.org/licenses/>.
- */
-
-#pragma once
-
-#include <pmacc/math/Vector.hpp>
-
-
-namespace picongpu
-{
-
-    /** calculate difference to lower value
-     *
-     * @tparam T_Dim number of dimensions of the accessed memory
-     *
-     * Zero is returned if `GetDifference` is called for a direction greater or equal to T_Dim.
-     */
-    template< uint32_t T_Dim >
-    struct DifferenceToLower;
-
-} // namespace picongpu
diff --git a/include/picongpu/algorithms/DifferenceToLower.hpp b/include/picongpu/algorithms/DifferenceToLower.hpp
deleted file mode 100644
index 9eef409382..0000000000
--- a/include/picongpu/algorithms/DifferenceToLower.hpp
+++ /dev/null
@@ -1,106 +0,0 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera, Axel Huebl
- *
- * This file is part of PIConGPU.
- *
- * PIConGPU is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * PIConGPU is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with PIConGPU.
- * If not, see <http://www.gnu.org/licenses/>.
- */
-
-#pragma once
-
-#include "picongpu/simulation_defines.hpp"
-#include "picongpu/algorithms/DifferenceToLower.def"
-
-#include <pmacc/math/Vector.hpp>
-
-
-namespace picongpu
-{
-
-    template< uint32_t T_dim >
-    struct DifferenceToLower
-    {
-        static constexpr uint32_t dim = T_dim;
-
-
-        using OffsetOrigin = typename pmacc::math::CT::make_Int<
-            dim,
-            1
-        >::type;
-        using OffsetEnd = typename pmacc::math::CT::make_Int<
-            dim,
-            0
-        >::type;
-
-        /** calculate the difference for a given direction
-         *
-         * @tparam T_direction direction for the difference operation
-         * @tparam T_isLesserThanDim not needed/ this is calculated by the compiler
-         */
-        template<
-            uint32_t T_direction,
-            bool T_isLesserThanDim = ( T_direction < dim )
-        >
-        struct GetDifference
-        {
-            static constexpr uint32_t direction = T_direction;
-
-            HDINLINE GetDifference( )
-            {
-            }
-
-            /** get difference to lower value
-             * @return difference divided by cell size of the given direction
-             */
-            template< typename Memory >
-            HDINLINE typename Memory::ValueType operator()( Memory const & mem ) const
-            {
-                // defaults to (0, 0, 0) in 3D
-                DataSpace< dim > const indexIdentity;
-                // e.g., (0, -1, 0) for d/dy in 3D
-                DataSpace< dim > indexLower;
-                indexLower[ direction ] = -1;
-
-                return ( mem( indexIdentity ) - mem( indexLower ) ) /
-                    cellSize[ direction ];
-            }
-        };
-
-        /** special case for `direction >= simulation dimensions`
-         *
-         *  difference = d/dx = 0
-         */
-        template< uint32_t T_direction >
-        struct GetDifference<
-            T_direction,
-            false
-        >
-        {
-
-            HDINLINE GetDifference( )
-            {
-            }
-
-            /** @return always a zeroed value
-             */
-            template< typename Memory >
-            HDINLINE typename Memory::ValueType operator()( Memory const & mem ) const
-            {
-                return Memory::ValueType::create( 0.0 );
-            }
-        };
-
-    };
-
-} // namespace picongpu
diff --git a/include/picongpu/algorithms/DifferenceToUpper.def b/include/picongpu/algorithms/DifferenceToUpper.def
deleted file mode 100644
index 971a5971b0..0000000000
--- a/include/picongpu/algorithms/DifferenceToUpper.def
+++ /dev/null
@@ -1,37 +0,0 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera, Axel Huebl
- *
- * This file is part of PIConGPU.
- *
- * PIConGPU is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * PIConGPU is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with PIConGPU.
- * If not, see <http://www.gnu.org/licenses/>.
- */
-
-#pragma once
-
-#include <pmacc/math/Vector.hpp>
-
-
-namespace picongpu
-{
-
-    /** calculate difference to upper value
-     *
-     * @tparam T_Dim number of dimensions of the accessed memory
-     *
-     * Zero is returned if `GetDifference` is called for a direction greater or equal to T_Dim.
-     */
-    template< uint32_t T_Dim >
-    struct DifferenceToUpper;
-
-} // namespace picongpu
diff --git a/include/picongpu/algorithms/DifferenceToUpper.hpp b/include/picongpu/algorithms/DifferenceToUpper.hpp
deleted file mode 100644
index f2bc7b49d9..0000000000
--- a/include/picongpu/algorithms/DifferenceToUpper.hpp
+++ /dev/null
@@ -1,104 +0,0 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera, Axel Huebl
- *
- * This file is part of PIConGPU.
- *
- * PIConGPU is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * PIConGPU is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with PIConGPU.
- * If not, see <http://www.gnu.org/licenses/>.
- */
-
-#pragma once
-
-#include "picongpu/simulation_defines.hpp"
-#include "picongpu/algorithms/DifferenceToUpper.def"
-
-#include <pmacc/math/Vector.hpp>
-
-
-namespace picongpu
-{
-
-    template< uint32_t T_dim >
-    struct DifferenceToUpper
-    {
-        static constexpr uint32_t dim = T_dim;
-
-        using OffsetOrigin = typename pmacc::math::CT::make_Int<
-            dim,
-            0
-        >::type;
-        using OffsetEnd = typename pmacc::math::CT::make_Int<
-            dim,
-            1
-        >::type;
-
-        /** calculate the difference for a given direction
-         *
-         * @tparam T_direction direction for the difference operation
-         * @tparam T_isLesserThanDim not needed/ this is calculated by the compiler
-         */
-        template<
-            uint32_t T_direction,
-            bool T_isLesserThanDim = ( T_direction < dim )
-        >
-        struct GetDifference
-        {
-            static constexpr uint32_t direction = T_direction;
-
-            HDINLINE GetDifference( )
-            {
-            }
-
-            /** get difference to lower value
-             * @return difference divided by cell size of the given direction
-             */
-            template< typename Memory >
-            HDINLINE typename Memory::ValueType operator()( Memory const & mem ) const
-            {
-                // defaults to (0, 0, 0) in 3D
-                DataSpace< dim > const indexIdentity;
-                // e.g., (0, 1, 0) for d/dy in 3D
-                DataSpace< dim > indexUpper;
-                indexUpper[ direction ] = 1;
-
-                return ( mem( indexUpper ) - mem( indexIdentity ) ) /
-                    cellSize[ direction ];
-            }
-        };
-
-        /** special case for `direction >= simulation dimensions`
-         *
-         *  difference = d/dx = 0
-         */
-        template< uint32_t T_direction >
-        struct GetDifference<
-            T_direction,
-            false
-        >
-        {
-            HDINLINE GetDifference( )
-            {
-            }
-
-            /** @return always a zeroed value
-             */
-            template< typename Memory >
-            HDINLINE typename Memory::ValueType operator()( Memory const & mem) const
-            {
-                return Memory::ValueType::create( 0.0 );
-            }
-        };
-
-    };
-
-} // namespace picongpu
diff --git a/include/picongpu/algorithms/FieldToParticleInterpolation.hpp b/include/picongpu/algorithms/FieldToParticleInterpolation.hpp
index dd70e3438d..b696b4153b 100644
--- a/include/picongpu/algorithms/FieldToParticleInterpolation.hpp
+++ b/include/picongpu/algorithms/FieldToParticleInterpolation.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Heiko Burau, Rene Widera
+/* Copyright 2013-2021 Axel Huebl, Heiko Burau, Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -18,7 +18,6 @@
  */
 
 
-
 #pragma once
 
 #include "picongpu/simulation_defines.hpp"
@@ -29,101 +28,89 @@
 
 namespace picongpu
 {
-
-/** interpolate field which are defined on a grid to a point inside of a grid
- *
- * interpolate around a point from -AssignmentFunction::support/2 to
- * (AssignmentFunction::support+1)/2
- *
- * \tparam GridShiftMethod functor which shift coordinate system that al value are
- * located on corner
- * \tparam AssignmentFunction AssignmentFunction which is used for interpolation
- * \tparam InterpolationMethod functor for interpolation method
- */
-template<class T_Shape, class InterpolationMethod>
-struct FieldToParticleInterpolation
-{
-    using AssignmentFunction = typename T_Shape::ChargeAssignmentOnSupport;
-    static constexpr int supp = AssignmentFunction::support;
-
-    static constexpr int lowerMargin = supp / 2 ;
-    static constexpr int upperMargin = (supp + 1) / 2;
-    using LowerMargin = typename pmacc::math::CT::make_Int<simDim,lowerMargin>::type;
-    using UpperMargin = typename pmacc::math::CT::make_Int<simDim,upperMargin>::type;
-
-    PMACC_CASSERT_MSG(
-        __FieldToParticleInterpolation_supercell_is_too_small_for_stencil,
-        pmacc::math::CT::min<
-            typename pmacc::math::CT::mul<
-                SuperCellSize,
-                GuardSize
-            >::type
-        >::type::value >= lowerMargin &&
-        pmacc::math::CT::min<
-            typename pmacc::math::CT::mul<
-                SuperCellSize,
-                GuardSize
-            >::type
-        >::type::value >= upperMargin
-    );
-
-    /*(supp + 1) % 2 is 1 for even supports else 0*/
-    static constexpr int begin = -supp / 2 + (supp + 1) % 2;
-    static constexpr int end = begin+supp-1;
-
-
-    template<class Cursor, class VecVector>
-    HDINLINE typename Cursor::ValueType operator()(Cursor field,
-                                                   const floatD_X& particlePos,
-                                                   const VecVector& fieldPos)
+    /** interpolate field which are defined on a grid to a point inside of a grid
+     *
+     * interpolate around a point from -AssignmentFunction::support/2 to
+     * (AssignmentFunction::support+1)/2
+     *
+     * \tparam T_Shape particle shape; its `ChargeAssignmentOnSupport` is the
+     * assignment function which is used for interpolation
+     * \tparam InterpolationMethod functor for interpolation method, must provide
+     * a static `interpolate<AssignmentFunction, begin, end>(cursor, pos)`
+     */
+    template<class T_Shape, class InterpolationMethod>
+    struct FieldToParticleInterpolation
     {
-        /**\brief:
-         * The following calls seperate the vector interpolation into
-         * independent scalar interpolations.
-         */
-        using Supports = typename pmacc::math::CT::make_Int<simDim,supp>::type;
-
-        typename Cursor::ValueType result;
-        for(uint32_t i = 0; i < Cursor::ValueType::dim; i++)
+        using AssignmentFunction = typename T_Shape::ChargeAssignmentOnSupport;
+        static constexpr int supp = AssignmentFunction::support;
+
+        static constexpr int lowerMargin = supp / 2;
+        static constexpr int upperMargin = (supp + 1) / 2;
+        using LowerMargin = typename pmacc::math::CT::make_Int<simDim, lowerMargin>::type;
+        using UpperMargin = typename pmacc::math::CT::make_Int<simDim, upperMargin>::type;
+
+        PMACC_CASSERT_MSG(
+            __FieldToParticleInterpolation_supercell_is_too_small_for_stencil,
+            pmacc::math::CT::min<typename pmacc::math::CT::mul<SuperCellSize, GuardSize>::type>::type::value
+                    >= lowerMargin
+                && pmacc::math::CT::min<typename pmacc::math::CT::mul<SuperCellSize, GuardSize>::type>::type::value
+                    >= upperMargin);
+
+        /*(supp + 1) % 2 is 1 for even supports else 0*/
+        static constexpr int begin = -supp / 2 + (supp + 1) % 2;
+        static constexpr int end = begin + supp - 1;
+
+
+        template<class Cursor, class VecVector>
+        HDINLINE typename Cursor::ValueType operator()(
+            Cursor field,
+            const floatD_X& particlePos,
+            const VecVector& fieldPos)
         {
-            auto fieldComponent = pmacc::cursor::make_FunctorCursor(
-                field,
-                pmacc::algorithm::functor::GetComponent<float_X>(i)
-            );
-            floatD_X particlePosShifted = particlePos;
-            ShiftCoordinateSystem<Supports>()(fieldComponent, particlePosShifted, fieldPos[i]);
-            result[i] = InterpolationMethod::template interpolate<AssignmentFunction, begin, end > (fieldComponent, particlePosShifted);
+            /**\brief:
+             * The following calls separate the vector interpolation into
+             * independent scalar interpolations.
+             */
+            using Supports = typename pmacc::math::CT::make_Int<simDim, supp>::type;
+
+            typename Cursor::ValueType result;
+            for(uint32_t i = 0; i < Cursor::ValueType::dim; i++)
+            {
+                auto fieldComponent
+                    = pmacc::cursor::make_FunctorCursor(field, pmacc::algorithm::functor::GetComponent<float_X>(i));
+                floatD_X particlePosShifted = particlePos;
+                ShiftCoordinateSystem<Supports>()(fieldComponent, particlePosShifted, fieldPos[i]);
+                result[i] = InterpolationMethod::template interpolate<AssignmentFunction, begin, end>(
+                    fieldComponent,
+                    particlePosShifted);
+            }
+
+            return result;
         }
 
-        return result;
-    }
+        static pmacc::traits::StringProperty getStringProperties()
+        {
+            GetStringProperties<InterpolationMethod> propList;
+            return propList;
+        }
+    };
 
-    static pmacc::traits::StringProperty getStringProperties()
+    namespace traits
     {
-        GetStringProperties<InterpolationMethod> propList;
-        return propList;
-    }
-
-};
-
-namespace traits
-{
-
-/*Get margin of a solver
- * class must define a LowerMargin and UpperMargin
- */
-template<class AssignMethod, class InterpolationMethod>
-struct GetMargin<picongpu::FieldToParticleInterpolation<AssignMethod, InterpolationMethod> >
-{
-private:
-    using Interpolation = picongpu::FieldToParticleInterpolation<AssignMethod, InterpolationMethod>;
-public:
-    using LowerMargin = typename Interpolation::LowerMargin;
-    using UpperMargin = typename Interpolation::UpperMargin;
-};
-
-} //namespace traits
+        /*Get margin of a solver
+         * class must define a LowerMargin and UpperMargin
+         */
+        template<class AssignMethod, class InterpolationMethod>
+        struct GetMargin<picongpu::FieldToParticleInterpolation<AssignMethod, InterpolationMethod>>
+        {
+        private:
+            using Interpolation = picongpu::FieldToParticleInterpolation<AssignMethod, InterpolationMethod>;
 
-} //namespace picongpu
+        public:
+            using LowerMargin = typename Interpolation::LowerMargin;
+            using UpperMargin = typename Interpolation::UpperMargin;
+        };
 
+    } // namespace traits
 
+} // namespace picongpu
diff --git a/include/picongpu/algorithms/FieldToParticleInterpolationNative.hpp b/include/picongpu/algorithms/FieldToParticleInterpolationNative.hpp
index 47bfc3c85b..71fb7d676f 100644
--- a/include/picongpu/algorithms/FieldToParticleInterpolationNative.hpp
+++ b/include/picongpu/algorithms/FieldToParticleInterpolationNative.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Heiko Burau, Rene Widera
+/* Copyright 2013-2021 Axel Huebl, Heiko Burau, Rene Widera, Klaus Steiniger
  *
  * This file is part of PIConGPU.
  *
@@ -18,7 +18,6 @@
  */
 
 
-
 #pragma once
 
 #include "picongpu/simulation_defines.hpp"
@@ -30,76 +29,89 @@
 
 namespace picongpu
 {
-
-/** interpolate field which are defined on a grid to a point inside of a grid
- *
- * interpolate around of a point from -AssignmentFunction::support/2 to
- * (AssignmentFunction::support+1)/2
- *
- * \tparam GridShiftMethod functor which shift coordinate system that al value are
- * located on corner
- * \tparam AssignmentFunction AssignmentFunction which is used for interpolation
- * \tparam InterpolationMethod functor for interpolation method
- */
-template<class T_Shape, class InterpolationMethod>
-struct FieldToParticleInterpolationNative
-{
-    using AssignmentFunction = typename T_Shape::ChargeAssignment;
-    static constexpr int supp = AssignmentFunction::support;
-
-    static constexpr int lowerMargin = supp / 2;
-    static constexpr int upperMargin = (supp + 1) / 2;
-    using LowerMargin = typename pmacc::math::CT::make_Int<simDim,lowerMargin>::type;
-    using UpperMargin = typename pmacc::math::CT::make_Int<simDim,upperMargin>::type;
-
-    template<class Cursor, class VecVector_ >
-    HDINLINE float3_X operator()(Cursor field, const floatD_X& particlePos,
-                                 const VecVector_ & fieldPos)
+    /** interpolate a field which is defined on a grid to a point inside of the grid
+     *
+     * interpolate around a point from -AssignmentFunction::support/2 to
+     * (AssignmentFunction::support+1)/2
+     *
+     * \tparam GridShiftMethod functor which shifts the coordinate system so that all values are
+     * located on the corner
+     * \tparam AssignmentFunction AssignmentFunction which is used for interpolation
+     * \tparam InterpolationMethod functor for interpolation method
+     */
+    template<class T_Shape, class InterpolationMethod>
+    struct FieldToParticleInterpolationNative
     {
-        /**\brief:
-         * The following three calls seperate the vector interpolation into three
-         * independent scalar interpolations. In each call the coordinate system
-         * is turned so that E_scalar does the interpolation for the z-component.
+        using AssignmentFunction = typename T_Shape::ChargeAssignment;
+        static constexpr int supp = AssignmentFunction::support;
+
+        static constexpr int lowerMargin = supp / 2;
+        static constexpr int upperMargin = (supp + 1) / 2;
+        using LowerMargin = typename pmacc::math::CT::make_Int<simDim, lowerMargin>::type;
+        using UpperMargin = typename pmacc::math::CT::make_Int<simDim, upperMargin>::type;
+
+        template<class Cursor, class VecVector_>
+        HDINLINE float3_X operator()(Cursor field, const floatD_X& particlePos, const VecVector_& fieldPos)
+        {
+            /**\brief:
+             * The following three calls separate the vector interpolation into three
+             * independent scalar interpolations. In each call the coordinate system
+             * is turned so that E_scalar does the interpolation for the z-component.
+             */
+
+            auto field_x
+                = pmacc::cursor::make_FunctorCursor(field, pmacc::algorithm::functor::GetComponent<float_X>(0));
+            floatD_X pos_tmp(particlePos);
+            ShiftCoordinateSystemNative<supp>()(field_x, pos_tmp, fieldPos.x());
+            float_X result_x
+                = InterpolationMethod::template interpolate<AssignmentFunction, -lowerMargin, upperMargin>(
+                    field_x,
+                    pos_tmp);
+
+            auto field_y
+                = pmacc::cursor::make_FunctorCursor(field, pmacc::algorithm::functor::GetComponent<float_X>(1));
+            pos_tmp = particlePos;
+            ShiftCoordinateSystemNative<supp>()(field_y, pos_tmp, fieldPos.y());
+            float_X result_y
+                = InterpolationMethod::template interpolate<AssignmentFunction, -lowerMargin, upperMargin>(
+                    field_y,
+                    pos_tmp);
+
+            auto field_z
+                = pmacc::cursor::make_FunctorCursor(field, pmacc::algorithm::functor::GetComponent<float_X>(2));
+            pos_tmp = particlePos;
+            ShiftCoordinateSystemNative<supp>()(field_z, pos_tmp, fieldPos.z());
+            float_X result_z
+                = InterpolationMethod::template interpolate<AssignmentFunction, -lowerMargin, upperMargin>(
+                    field_z,
+                    pos_tmp);
+
+            return float3_X(result_x, result_y, result_z);
+        }
+
+        static pmacc::traits::StringProperty getStringProperties()
+        {
+            GetStringProperties<InterpolationMethod> propList;
+            return propList;
+        }
+    };
+
+    namespace traits
+    {
+        /*Get margin of a solver
+         * class must define a LowerMargin and UpperMargin
          */
+        template<class AssignMethod, class InterpolationMethod>
+        struct GetMargin<picongpu::FieldToParticleInterpolationNative<AssignMethod, InterpolationMethod>>
+        {
+        private:
+            using Interpolation = picongpu::FieldToParticleInterpolationNative<AssignMethod, InterpolationMethod>;
 
-        auto field_x = pmacc::cursor::make_FunctorCursor(field, pmacc::algorithm::functor::GetComponent<float_X>(0));
-        floatD_X pos_tmp(particlePos);
-        ShiftCoordinateSystemNative<supp>()(field_x, pos_tmp, fieldPos.x());
-        float_X result_x = InterpolationMethod::template interpolate<AssignmentFunction, -lowerMargin, upperMargin > (field_x, pos_tmp);
-
-        auto field_y = pmacc::cursor::make_FunctorCursor(field, pmacc::algorithm::functor::GetComponent<float_X>(1));
-        pos_tmp = particlePos;
-        ShiftCoordinateSystemNative<supp>()(field_y, pos_tmp, fieldPos.y());
-        float_X result_y = InterpolationMethod::template interpolate<AssignmentFunction, -lowerMargin, upperMargin > (field_y, pos_tmp);
-
-        auto field_z = pmacc::cursor::make_FunctorCursor(field, pmacc::algorithm::functor::GetComponent<float_X>(2));
-        pos_tmp = particlePos;
-        ShiftCoordinateSystemNative<supp>()(field_z, pos_tmp, fieldPos.z());
-        float_X result_z = InterpolationMethod::template interpolate<AssignmentFunction, -lowerMargin, upperMargin > (field_z, pos_tmp);
-
-        return float3_X(result_x, result_y, result_z);
-    }
-
-};
-
-namespace traits
-{
-
-/*Get margin of a solver
- * class must define a LowerMargin and UpperMargin
- */
-template<class AssignMethod, class InterpolationMethod>
-struct GetMargin<picongpu::FieldToParticleInterpolationNative<AssignMethod, InterpolationMethod> >
-{
-private:
-    using Interpolation = picongpu::FieldToParticleInterpolationNative< AssignMethod, InterpolationMethod>;
-public:
-    using LowerMargin = typename Interpolation::LowerMargin;
-    using UpperMargin = typename Interpolation::UpperMargin;
-};
-
-} //namespace traits
-
-} //namespace picongpu
+        public:
+            using LowerMargin = typename Interpolation::LowerMargin;
+            using UpperMargin = typename Interpolation::UpperMargin;
+        };
 
+    } // namespace traits
 
+} // namespace picongpu
diff --git a/include/picongpu/algorithms/Gamma.def b/include/picongpu/algorithms/Gamma.def
index 6ea120c120..ce4ee276e0 100644
--- a/include/picongpu/algorithms/Gamma.def
+++ b/include/picongpu/algorithms/Gamma.def
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Heiko Burau, Rene Widera
+/* Copyright 2013-2021 Axel Huebl, Heiko Burau, Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -24,12 +24,11 @@
 
 namespace picongpu
 {
-
     /** calculate the gamma of a particle
      *
      * @tparam T_PrecisionType precision in which the calculation is performed
      */
-    template< typename T_PrecisionType = float_X >
+    template<typename T_PrecisionType = float_X>
     struct Gamma
     {
         using valueType = T_PrecisionType;
@@ -43,17 +42,8 @@ namespace picongpu
          * @param mass particle mass
          * @return particle gamma
          */
-        template<
-            typename T_MomType,
-             typename T_MassType
-        >
-        HDINLINE
-        valueType
-        operator()(
-            T_MomType const & mom,
-            T_MassType const mass
-        ) const;
-
+        template<typename T_MomType, typename T_MassType>
+        HDINLINE valueType operator()(T_MomType const& mom, T_MassType const mass) const;
     };
 
     /** calculate the gamma of a particle
@@ -66,19 +56,10 @@ namespace picongpu
      * @param mass particle mass
      * @return particle gamma
      */
-    template<
-        typename T_PrecisionType,
-        typename T_MomType,
-        typename T_MassType
-    >
-    HDINLINE
-    T_PrecisionType
-    gamma( T_MomType const & mom, T_MassType const mass )
+    template<typename T_PrecisionType, typename T_MomType, typename T_MassType>
+    HDINLINE T_PrecisionType gamma(T_MomType const& mom, T_MassType const mass)
     {
-        return Gamma< T_PrecisionType >{}(
-            mom,
-            mass
-        );
-    };
+        return Gamma<T_PrecisionType>{}(mom, mass);
+    }
 
 } // namespace picongpu
diff --git a/include/picongpu/algorithms/Gamma.hpp b/include/picongpu/algorithms/Gamma.hpp
index f5b0cf2a3f..0665952af0 100644
--- a/include/picongpu/algorithms/Gamma.hpp
+++ b/include/picongpu/algorithms/Gamma.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Heiko Burau, Rene Widera
+/* Copyright 2013-2021 Axel Huebl, Heiko Burau, Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -25,30 +25,18 @@
 
 namespace picongpu
 {
-
-    template< typename T_PrecisionType >
-    template<
-        typename T_MomType,
-        typename T_MassType
-    >
-    HDINLINE
-    T_PrecisionType
-    Gamma< T_PrecisionType >::operator()(
-        T_MomType const & mom,
-        T_MassType const mass
-    ) const
+    template<typename T_PrecisionType>
+    template<typename T_MomType, typename T_MassType>
+    HDINLINE T_PrecisionType Gamma<T_PrecisionType>::operator()(T_MomType const& mom, T_MassType const mass) const
     {
         using namespace pmacc;
 
-        valueType const fMom2 = math::abs2( precisionCast< valueType >( mom ) );
+        valueType const fMom2 = pmacc::math::abs2(precisionCast<valueType>(mom));
         constexpr valueType c2 = SPEED_OF_LIGHT * SPEED_OF_LIGHT;
 
-        valueType const m2_c2_reci = valueType( 1.0 ) /
-            precisionCast<valueType >( mass * mass * c2 );
+        valueType const m2_c2_reci = valueType(1.0) / precisionCast<valueType>(mass * mass * c2);
 
-        return math::sqrt(
-            precisionCast<valueType >( valueType( 1.0 ) + fMom2 * m2_c2_reci )
-        );
+        return math::sqrt(precisionCast<valueType>(valueType(1.0) + fMom2 * m2_c2_reci));
     }
 
 } // namespace picongpu
diff --git a/include/picongpu/algorithms/KinEnergy.hpp b/include/picongpu/algorithms/KinEnergy.hpp
index 5f2c17b9e0..929f12a151 100644
--- a/include/picongpu/algorithms/KinEnergy.hpp
+++ b/include/picongpu/algorithms/KinEnergy.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2017-2020 Axel Huebl, Heiko Burau, Rene Widera
+/* Copyright 2017-2021 Axel Huebl, Heiko Burau, Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -25,48 +25,47 @@
 
 namespace picongpu
 {
+    using namespace pmacc;
 
-using namespace pmacc;
+    /** Computes the kinetic energy of a particle given its momentum and mass.
+     *
+     * The mass may be zero.
+     *
+     * For massive particles with low energy (gamma < GAMMA_THRESH) the non-relativistic
+     * kinetic energy expression is used in order to avoid rounding errors.
+     *
+     */
+    template<typename T_PrecisionType = float_X>
+    struct KinEnergy
+    {
+        using ValueType = T_PrecisionType;
 
-/** Computes the kinetic energy of a particle given its momentum and mass.
- *
- * The mass may be zero.
- *
- * For massive particle with low energy the non-relativistic
- * kinetic energy expression is used in order to avoid bad roundings.
- *
- */
-template< typename T_PrecisionType = float_X >
-struct KinEnergy
-{
-    using ValueType = T_PrecisionType;
+        template<typename MomType, typename MassType>
+        HDINLINE ValueType operator()(MomType const& mom, MassType const& mass)
+        {
+            if(mass == MassType(0.0))
+                return SPEED_OF_LIGHT * math::abs(precisionCast<ValueType>(mom));
 
-    template< typename MomType, typename MassType >
-    HDINLINE ValueType operator()( MomType const & mom, MassType const & mass )
-    {
-        if( mass == MassType( 0.0 ) )
-            return SPEED_OF_LIGHT * math::abs( precisionCast< ValueType >( mom ) );
+            /* if mass is non-zero then gamma is well defined */
+            const ValueType gamma = Gamma<ValueType>()(mom, mass);
 
-        /* if mass is non-zero then gamma is well defined */
-        const ValueType gamma = Gamma< ValueType >()( mom, mass );
+            ValueType kinEnergy;
 
-        ValueType kinEnergy;
+            if(gamma < GAMMA_THRESH)
+            {
+                const ValueType mom2 = pmacc::math::abs2(precisionCast<ValueType>(mom));
+                /* non relativistic kinetic energy expression */
+                kinEnergy = mom2 / (ValueType(2.0) * mass);
+            }
+            else
+            {
+                constexpr ValueType c2 = SPEED_OF_LIGHT * SPEED_OF_LIGHT;
+                /* kinetic energy for particles: E = (gamma - 1) * m * c^2 */
+                kinEnergy = (gamma - ValueType(1.0)) * mass * c2;
+            }
 
-        if( gamma < GAMMA_THRESH )
-        {
-            const ValueType mom2 = math::abs2( precisionCast< ValueType >( mom ) );
-            /* non relativistic kinetic energy expression */
-            kinEnergy = mom2 / ( ValueType( 2.0 ) * mass );
+            return kinEnergy;
         }
-        else
-        {
-            constexpr ValueType c2 = SPEED_OF_LIGHT * SPEED_OF_LIGHT;
-            /* kinetic energy for particles: E = (gamma - 1) * m * c^2 */
-            kinEnergy = ( gamma - ValueType( 1.0 ) ) * mass * c2;
-        }
-
-        return kinEnergy;
-    }
-};
+    };
 
-}
+} // namespace picongpu
diff --git a/include/picongpu/algorithms/LinearInterpolateWithUpper.hpp b/include/picongpu/algorithms/LinearInterpolateWithUpper.hpp
index ac2d716e72..e341fb7f13 100644
--- a/include/picongpu/algorithms/LinearInterpolateWithUpper.hpp
+++ b/include/picongpu/algorithms/LinearInterpolateWithUpper.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2015-2020 Heiko Burau, Rene Widera
+/* Copyright 2015-2021 Heiko Burau, Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -18,7 +18,6 @@
  */
 
 
-
 #pragma once
 
 #include <pmacc/types.hpp>
@@ -26,60 +25,57 @@
 
 namespace picongpu
 {
-
-/** Calculate linear interpolation to upper cell value
- *
- * @tparam T_Dim for how many dimensions does this operator interpolate
- *
- * If `GetDifference` is called for a direction greater or equal T_Dim,
- * a zeroed value is returned (assumes symmetry in those directions).
- */
-template<uint32_t T_Dim>
-struct LinearInterpolateWithUpper
-{
-    static constexpr uint32_t dim = T_Dim;
-
-    using OffsetOrigin = typename pmacc::math::CT::make_Int<dim, 0>::type;
-    using OffsetEnd = typename pmacc::math::CT::make_Int<dim, 1>::type;
-
-    /** calculate the linear interpolation for a given direction
+    /** Calculate linear interpolation to upper cell value
+     *
+     * @tparam T_Dim for how many dimensions does this operator interpolate
      *
-     * @tparam T_direction direction for the interpolation operation
-     * @tparam T_isLesserThanDim not needed/ this is calculated by the compiler
+     * If `GetInterpolatedValue` is called for a direction greater or equal T_Dim,
+     * the dereferenced input value is returned unchanged (identity, no interpolation in those directions).
      */
-    template<uint32_t T_direction, bool T_isLesserThanDim = (T_direction < dim)>
-    struct GetInterpolatedValue
+    template<uint32_t T_Dim>
+    struct LinearInterpolateWithUpper
     {
-        static constexpr uint32_t direction = T_direction;
+        static constexpr uint32_t dim = T_Dim;
 
-        /** get interpolated value
-         * @return interpolated value
+        using OffsetOrigin = typename pmacc::math::CT::make_Int<dim, 0>::type;
+        using OffsetEnd = typename pmacc::math::CT::make_Int<dim, 1>::type;
+
+        /** calculate the linear interpolation for a given direction
+         *
+         * @tparam T_direction direction for the interpolation operation
+         * @tparam T_isLesserThanDim do not set explicitly; defaults to (T_direction < dim)
          */
-        template<class Memory >
-        HDINLINE typename Memory::ValueType operator()(const Memory& mem) const
+        template<uint32_t T_direction, bool T_isLesserThanDim = (T_direction < dim)>
+        struct GetInterpolatedValue
         {
-            const DataSpace<dim> indexIdentity; /* defaults to (0, 0, 0) in 3D */
-            DataSpace<dim> indexUpper; /* e.g., (0, 1, 0) for direction y in 3D */
-            indexUpper[direction] = 1;
+            static constexpr uint32_t direction = T_direction;
 
-            return ( mem(indexUpper) + mem(indexIdentity)) * Memory::ValueType::create(0.5);
-        }
-    };
+            /** get interpolated value
+             * @return interpolated value
+             */
+            template<class Memory>
+            HDINLINE typename Memory::ValueType operator()(const Memory& mem) const
+            {
+                const DataSpace<dim> indexIdentity; /* defaults to (0, 0, 0) in 3D */
+                DataSpace<dim> indexUpper; /* e.g., (0, 1, 0) for direction y in 3D */
+                indexUpper[direction] = 1;
 
-    /** special case for `direction >= simulation dimensions`*/
-    template<uint32_t T_direction>
-    struct GetInterpolatedValue<T_direction, false>
-    {
+                return (mem(indexUpper) + mem(indexIdentity)) * Memory::ValueType::create(0.5);
+            }
+        };
 
-        /** @return always identity
-         */
-        template<class Memory >
-        HDINLINE typename Memory::ValueType operator()(const Memory& mem) const
+        /** special case for `direction >= simulation dimensions`*/
+        template<uint32_t T_direction>
+        struct GetInterpolatedValue<T_direction, false>
         {
-            return *mem;
-        }
+            /** @return always identity
+             */
+            template<class Memory>
+            HDINLINE typename Memory::ValueType operator()(const Memory& mem) const
+            {
+                return *mem;
+            }
+        };
     };
 
-};
-
-} //namespace picongpu
+} // namespace picongpu
diff --git a/include/picongpu/algorithms/Set.hpp b/include/picongpu/algorithms/Set.hpp
index 6816f89213..a243cb4849 100644
--- a/include/picongpu/algorithms/Set.hpp
+++ b/include/picongpu/algorithms/Set.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera
+/* Copyright 2013-2021 Heiko Burau, Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -23,31 +23,22 @@
 
 namespace picongpu
 {
-using namespace pmacc;
+    using namespace pmacc;
 
-template<typename Type_>
-struct Set
-{
-
-    HDINLINE Set(Type_ defaultValue) : value(defaultValue)
+    template<typename Type_>
+    struct Set
     {
-
-    }
-
-    template<
-        typename Dst,
-        typename T_Acc
-    >
-    HDINLINE void operator()(
-        T_Acc const &,
-        Dst & dst
-    ) const
-    {
-        dst = value;
-    }
-
-private:
-    PMACC_ALIGN(value, const Type_);
-};
-}
-
+        HDINLINE Set(Type_ defaultValue) : value(defaultValue)
+        {
+        }
+
+        template<typename Dst, typename T_Acc>
+        HDINLINE void operator()(T_Acc const&, Dst& dst) const
+        {
+            dst = value;
+        }
+
+    private:
+        PMACC_ALIGN(value, const Type_);
+    };
+} // namespace picongpu
diff --git a/include/picongpu/algorithms/ShiftCoordinateSystem.hpp b/include/picongpu/algorithms/ShiftCoordinateSystem.hpp
index 1ef0fc0db0..7db476e292 100644
--- a/include/picongpu/algorithms/ShiftCoordinateSystem.hpp
+++ b/include/picongpu/algorithms/ShiftCoordinateSystem.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Heiko Burau, Rene Widera, Richard Pausch
+/* Copyright 2013-2021 Axel Huebl, Heiko Burau, Rene Widera, Richard Pausch
  *
  * This file is part of PIConGPU.
  *
@@ -18,7 +18,6 @@
  */
 
 
-
 #pragma once
 
 #include <pmacc/types.hpp>
@@ -31,116 +30,109 @@
 
 namespace picongpu
 {
+    /** calculate offset to move coordinate system in an easy to use system
+     *
+     * There are two cases:
+     *  - system with even shape and odd shape
+     *  - for more see documentation of the implementation
+     */
+    template<bool T_isEvenShape>
+    struct GetOffsetToStaticShapeSystem;
 
-/** calculate offset to move coordinate system in an easy to use system
- *
- * There are two cases:
- *  - system with even shape and odd shape
- *  - for more see documentation of the implementation
- */
-template<bool T_isEvenShape>
-struct GetOffsetToStaticShapeSystem;
-
-template<typename T_Component, typename T_Supports>
-struct AssignToDim
-{
-
-    template<typename T_Type, typename T_Vector, typename T_FieldType>
-    HDINLINE void
-    operator()(T_Type& cursor, T_Vector& pos, const T_FieldType& fieldPos)
+    template<typename T_Component, typename T_Supports>
+    struct AssignToDim
     {
-        const uint32_t dim = T_Vector::dim;
-        using ValueType = typename T_Vector::type;
+        template<typename T_Type, typename T_Vector, typename T_FieldType>
+        HDINLINE void operator()(T_Type& cursor, T_Vector& pos, const T_FieldType& fieldPos)
+        {
+            const uint32_t dim = T_Vector::dim;
+            using ValueType = typename T_Vector::type;
 
-        using Supports = T_Supports;
-        using Component = T_Component;
+            using Supports = T_Supports;
+            using Component = T_Component;
 
-        const uint32_t component = Component::x::value;
-        const uint32_t support = Supports::template at<component>::type::value;
-        const bool isEven = (support % 2) == 0;
+            const uint32_t component = Component::x::value;
+            const uint32_t support = Supports::template at<component>::type::value;
+            const bool isEven = (support % 2) == 0;
 
 
-        const ValueType v_pos = pos[component] - fieldPos[component];
-        DataSpace< dim > intShift;
-        intShift[component] = GetOffsetToStaticShapeSystem <isEven>()(v_pos);
-        cursor = cursor(intShift);
-        pos[component] = v_pos - ValueType(intShift[component]);
-    }
-};
-
-/** shift to new coordinate system
- *
- * @tparam T_supports CT::Vector with support
- */
-template<typename T_supports>
-struct ShiftCoordinateSystem
-{
+            const ValueType v_pos = pos[component] - fieldPos[component];
+            DataSpace<dim> intShift;
+            intShift[component] = GetOffsetToStaticShapeSystem<isEven>()(v_pos);
+            cursor = cursor(intShift);
+            pos[component] = v_pos - ValueType(intShift[component]);
+        }
+    };
 
     /** shift to new coordinate system
      *
-     * shift cursor and vector to new coordinate system
-     * @param[in,out] cursor cursor to memory
-     * @param[in,out] vector short vector with coordinates in old system
-     *                        - defined for [0.0;1.0) per dimension
-     * @param fieldPos vector with relative coordinates for shift ( value range [0.0;0.5] )
-     *
-     * After this coordinate shift vector has well defined ranges per dimension,
-     * for each defined fieldPos:
-     *
-     * - Even Support: vector is always [0.0;1.0)
-     * - Odd Support: vector is always [-0.5;0.5)
+     * @tparam T_supports CT::Vector with support
      */
-    template<typename T_Cursor, typename T_Vector, typename T_FieldType >
-    HDINLINE void operator()(T_Cursor& cursor, T_Vector& vector, const T_FieldType & fieldPos)
+    template<typename T_supports>
+    struct ShiftCoordinateSystem
     {
-        /** \todo check if a static assert on
-         *  "T_Cursor::dim" == T_Vector::dim ==  T_FieldType::dim is possible
-         *  and does not waste registers */
-        const uint32_t dim = T_Vector::dim;
-
-        using Size = boost::mpl::vector1 < boost::mpl::range_c<uint32_t, 0, dim > >;
-        using CombiTypes = typename AllCombinations<Size>::type;
-
-        meta::ForEach<CombiTypes, AssignToDim<bmpl::_1, T_supports> > shift;
-        shift(cursor, vector, fieldPos);
-
-    }
-};
-
-
-/** Offset calculation for even support
- *
- * @param pos position of the particle relative to the grid
- *            - defined for [-0.5;1.0)
- * @return offset for the old system ( new system = old_system - offset)
- */
-template<>
-struct GetOffsetToStaticShapeSystem<true>
-{
-
-    template<typename T_Type>
-    HDINLINE int operator()(const T_Type& pos)
+        /** shift to new coordinate system
+         *
+         * shift cursor and vector to new coordinate system
+         * @param[in,out] cursor cursor to memory
+         * @param[in,out] vector short vector with coordinates in old system
+         *                        - defined for [0.0;1.0) per dimension
+         * @param fieldPos vector with relative coordinates for shift ( value range [0.0;0.5] )
+         *
+         * After this coordinate shift vector has well defined ranges per dimension,
+         * for each defined fieldPos:
+         *
+         * - Even Support: vector is always [0.0;1.0)
+         * - Odd Support: vector is always [-0.5;0.5)
+         */
+        template<typename T_Cursor, typename T_Vector, typename T_FieldType>
+        HDINLINE void operator()(T_Cursor& cursor, T_Vector& vector, const T_FieldType& fieldPos)
+        {
+            /** \todo check if a static assert on
+             *  "T_Cursor::dim" == T_Vector::dim ==  T_FieldType::dim is possible
+             *  and does not waste registers */
+            const uint32_t dim = T_Vector::dim;
+
+            using Size = boost::mpl::vector1<boost::mpl::range_c<uint32_t, 0, dim>>;
+            using CombiTypes = typename AllCombinations<Size>::type;
+
+            meta::ForEach<CombiTypes, AssignToDim<bmpl::_1, T_supports>> shift;
+            shift(cursor, vector, fieldPos);
+        }
+    };
+
+
+    /** Offset calculation for even support
+     *
+     * @param pos position of the particle relative to the grid
+     *            - defined for [-0.5;1.0)
+     * @return offset for the old system ( new system = old_system - offset)
+     */
+    template<>
+    struct GetOffsetToStaticShapeSystem<true>
     {
-        return math::float2int_rd(pos);
-    }
-};
+        template<typename T_Type>
+        HDINLINE int operator()(const T_Type& pos)
+        {
+            return pmacc::math::float2int_rd(pos);
+        }
+    };
 
 
-/** Offset calculation for odd support
- *
- * @param pos position of the particle relative to the grid
- *            - defined for [-0.5;1.0)
- * @return offset for the old system ( new system = old_system - offset)
- */
-template<>
-struct GetOffsetToStaticShapeSystem<false>
-{
-
-    template<typename T_Type>
-    HDINLINE int operator()(const T_Type& pos)
+    /** Offset calculation for odd support
+     *
+     * @param pos position of the particle relative to the grid
+     *            - defined for [-0.5;1.0)
+     * @return offset for the old system ( new system = old_system - offset)
+     */
+    template<>
+    struct GetOffsetToStaticShapeSystem<false>
     {
-        return pos >= T_Type(0.5) ? 1 : 0;
-    }
-};
+        template<typename T_Type>
+        HDINLINE int operator()(const T_Type& pos)
+        {
+            return pos >= T_Type(0.5) ? 1 : 0;
+        }
+    };
 
 } // namespace picongpu
diff --git a/include/picongpu/algorithms/ShiftCoordinateSystemNative.hpp b/include/picongpu/algorithms/ShiftCoordinateSystemNative.hpp
index d33c961dd8..46da69bdd9 100644
--- a/include/picongpu/algorithms/ShiftCoordinateSystemNative.hpp
+++ b/include/picongpu/algorithms/ShiftCoordinateSystemNative.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Heiko Burau, Rene Widera
+/* Copyright 2013-2021 Axel Huebl, Heiko Burau, Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -18,7 +18,6 @@
  */
 
 
-
 #pragma once
 
 #include <pmacc/types.hpp>
@@ -26,24 +25,22 @@
 
 namespace picongpu
 {
-
-template<uint32_t T_support>
-struct ShiftCoordinateSystemNative
-{
-
-    /**shift to new coordinat system
-     *
-     * shift cursor and vector to new coordinate system
-     * @param curser curser to memory
-     * @param vector short vector with coordinates in old system
-     * @param fieldPos vector with relative coordinates for shift ( value range [0.0;0.5] )
-     */
-    template<typename Cursor, typename Vector >
-    HDINLINE void operator()(Cursor& cursor, Vector& vector, const floatD_X & fieldPos)
+    template<uint32_t T_support>
+    struct ShiftCoordinateSystemNative
     {
-        for (uint32_t i = 0; i < simDim; ++i)
-            vector[i] -= fieldPos[i];
-    }
-};
+        /** shift to new coordinate system
+         *
+         * shift vector to new coordinate system by subtracting fieldPos (the cursor is left unchanged)
+         * @param cursor cursor to memory
+         * @param vector short vector with coordinates in old system
+         * @param fieldPos vector with relative coordinates for shift ( value range [0.0;0.5] )
+         */
+        template<typename Cursor, typename Vector>
+        HDINLINE void operator()(Cursor& cursor, Vector& vector, const floatD_X& fieldPos)
+        {
+            for(uint32_t i = 0; i < simDim; ++i)
+                vector[i] -= fieldPos[i];
+        }
+    };
 
 } // namespace picongpu
diff --git a/include/picongpu/algorithms/Velocity.hpp b/include/picongpu/algorithms/Velocity.hpp
index 2b6c97d651..90fc9f0b67 100644
--- a/include/picongpu/algorithms/Velocity.hpp
+++ b/include/picongpu/algorithms/Velocity.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera
+/* Copyright 2013-2021 Heiko Burau, Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -21,21 +21,18 @@
 
 namespace picongpu
 {
-
     using namespace pmacc;
 
     struct Velocity
     {
-
-        template<typename MomType, typename MassType >
-                HDINLINE MomType operator()(const MomType mom, const MassType mass0)
+        template<typename MomType, typename MassType>
+        HDINLINE MomType operator()(const MomType mom, const MassType mass0)
         {
             const float_X rc2 = MUE0_EPS0;
-            const float_X m0_2 = mass0*mass0;
-            const float_X fMom2 = math::abs2(mom);
+            const float_X m0_2 = mass0 * mass0;
+            const float_X fMom2 = pmacc::math::abs2(mom);
             float_X t = math::rsqrt(precisionCast<sqrt_X>(m0_2 + fMom2 * rc2));
             return t * mom;
         }
     };
-}
-
+} // namespace picongpu
diff --git a/include/picongpu/debug/PIConGPUVerbose.hpp b/include/picongpu/debug/PIConGPUVerbose.hpp
index b2d0026055..d7a6b774be 100644
--- a/include/picongpu/debug/PIConGPUVerbose.hpp
+++ b/include/picongpu/debug/PIConGPUVerbose.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Rene Widera
+/* Copyright 2013-2021 Axel Huebl, Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -24,26 +24,29 @@
 
 namespace picongpu
 {
-
 #ifndef PIC_VERBOSE_LVL
-#define PIC_VERBOSE_LVL 0
+#    define PIC_VERBOSE_LVL 0
 #endif
 
-/*create verbose class*/
-DEFINE_VERBOSE_CLASS(PIConGPUVerbose)
-(
-    /* define log lvl for later use
-     * e.g. log<pmaccLogLvl::NOTHING>("TEXT");*/
-    DEFINE_LOGLVL(0,NOTHING);
-    DEFINE_LOGLVL(1,PHYSICS);
-    DEFINE_LOGLVL(2,DOMAINS);
-    DEFINE_LOGLVL(4,CRITICAL);
-    DEFINE_LOGLVL(8,MEMORY);
-    DEFINE_LOGLVL(16,SIMULATION_STATE);
-    DEFINE_LOGLVL(32,INPUT_OUTPUT);
-)
-/*set default verbose lvl (integer number)*/
-(NOTHING::lvl|PIC_VERBOSE_LVL);
+    /* Keep one DEFINE_LOGLVL per line: clang-format does not understand this macro
+     * invocation and would otherwise pack several log levels onto a single line. */
+    // clang-format off
+    /*create verbose class*/
+    DEFINE_VERBOSE_CLASS(PIConGPUVerbose)
+    (
+        /* define log lvl for later use
+         * e.g. log<pmaccLogLvl::NOTHING>("TEXT");*/
+        DEFINE_LOGLVL(0, NOTHING);
+        DEFINE_LOGLVL(1, PHYSICS);
+        DEFINE_LOGLVL(2, DOMAINS);
+        DEFINE_LOGLVL(4, CRITICAL);
+        DEFINE_LOGLVL(8, MEMORY);
+        DEFINE_LOGLVL(16, SIMULATION_STATE);
+        DEFINE_LOGLVL(32, INPUT_OUTPUT);
+    )
+    /*set default verbose lvl (integer number)*/
+    (NOTHING::lvl | PIC_VERBOSE_LVL);
+    // clang-format on
 
 
 } /* namespace picongpu */
diff --git a/include/picongpu/extensionParam.loader b/include/picongpu/extensionParam.loader
index be8434d2cb..a3daaabf88 100644
--- a/include/picongpu/extensionParam.loader
+++ b/include/picongpu/extensionParam.loader
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Rene Widera
+/* Copyright 2013-2021 Rene Widera
  *
  * This file is part of PIConGPU.
  *
diff --git a/include/picongpu/extensionUnitless.loader b/include/picongpu/extensionUnitless.loader
index be8434d2cb..a3daaabf88 100644
--- a/include/picongpu/extensionUnitless.loader
+++ b/include/picongpu/extensionUnitless.loader
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Rene Widera
+/* Copyright 2013-2021 Rene Widera
  *
  * This file is part of PIConGPU.
  *
diff --git a/include/picongpu/fields/CellType.hpp b/include/picongpu/fields/CellType.hpp
index 5515976b15..4e1e3fffa3 100644
--- a/include/picongpu/fields/CellType.hpp
+++ b/include/picongpu/fields/CellType.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera, Axel Huebl, Sergei Bastrakov
+/* Copyright 2013-2021 Heiko Burau, Rene Widera, Axel Huebl, Sergei Bastrakov
  *
  * This file is part of PIConGPU.
  *
@@ -21,15 +21,15 @@
 
 #include "picongpu/simulation_defines.hpp"
 #include "picongpu/fields/MaxwellSolver/Solvers.hpp"
+#include "picongpu/traits/GetCellType.hpp"
 
 
 namespace picongpu
 {
-namespace fields
-{
-
-    //! Alias for a cell type used by the field solver
-    using CellType = Solver::CellType;
+    namespace fields
+    {
+        //! Alias for a cell type used by the field solver
+        using CellType = traits::GetCellType<Solver>::type;
 
-} // namespace fields
+    } // namespace fields
 } // namespace picongpu
diff --git a/include/picongpu/fields/EMFieldBase.hpp b/include/picongpu/fields/EMFieldBase.hpp
index 89b5dfc538..b750673910 100644
--- a/include/picongpu/fields/EMFieldBase.hpp
+++ b/include/picongpu/fields/EMFieldBase.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Heiko Burau, Rene Widera, Richard Pausch,
+/* Copyright 2013-2021 Axel Huebl, Heiko Burau, Rene Widera, Richard Pausch,
  *                     Benjamin Worpitz, Sergei Bastrakov
  *
  * This file is part of PIConGPU.
@@ -41,96 +41,91 @@
 
 namespace picongpu
 {
-namespace fields
-{
-
-    /** Base class for implementation inheritance in classes for the
-     *  electromagnetic fields
-     *
-     * Stores field values on host and device and provides data synchronization
-     * between them.
-     *
-     * Implements interfaces defined by SimulationFieldHelper< MappingDesc > and
-     * ISimulationData.
-     */
-    class EMFieldBase :
-        public SimulationFieldHelper< MappingDesc >,
-        public ISimulationData
+    namespace fields
     {
-    public:
-
-        //! Type of each field value
-        using ValueType = float3_X;
-
-        //! Number of components of ValueType, for serialization
-        static constexpr int numComponents = ValueType::dim;
-
-        //! Type of host-device buffer for field values
-        using Buffer = pmacc::GridBuffer< ValueType, simDim >;
-
-        //! Type of data box for field values on host and device
-        using DataBoxType = pmacc::DataBox< PitchedBox< ValueType, simDim > >;
-
-        //! Size of supercell
-        using SuperCellSize = MappingDesc::SuperCellSize;
-
-        /** Create a field
-         *
-         * @tparam T_tag communication tag value
-         *
-         * @param cellDescription mapping for kernels
-         * @param id unique id
-         * @param tag helper parameter for T_tag deduction
-         */
-        template< CommunicationTag T_tag >
-        HINLINE EMFieldBase(
-            MappingDesc const & cellDescription,
-            pmacc::SimulationDataId const & id,
-            std::integral_constant< CommunicationTag, T_tag > tag
-        );
-
-        //! Get a reference to the host-device buffer for the field values
-        HINLINE Buffer & getGridBuffer( );
-
-        //! Get the grid layout
-        HINLINE GridLayout< simDim > getGridLayout( );
-
-        //! Get the host data box for the field values
-        HINLINE DataBoxType getHostDataBox( );
-
-        //! Get the device data box for the field values
-        HINLINE DataBoxType getDeviceDataBox( );
-
-        /** Start asynchronous communication of field values
+        /** Base class for implementation inheritance in classes for the
+         *  electromagnetic fields
          *
-         * @param serialEvent event to depend on
-         */
-        HINLINE EventTask asyncCommunication( EventTask serialEvent );
-
-        /** Reset the host-device buffer for field values
+         * Stores field values on host and device and provides data synchronization
+         * between them.
          *
-         * @param currentStep index of time iteration
+         * Implements interfaces defined by SimulationFieldHelper< MappingDesc > and
+         * ISimulationData.
          */
-        HINLINE void reset( uint32_t currentStep ) override;
-
-        //! Synchronize device data with host data
-        HINLINE void syncToDevice( ) override;
-
-        //! Synchronize host data with device data
-        HINLINE void synchronize( ) override;
-
-        //! Get id
-        HINLINE SimulationDataId getUniqueId( ) override;
-
-    private:
-
-        //! Host-device buffer for field values
-        std::unique_ptr< Buffer > buffer;
-
-        //! Unique id
-        pmacc::SimulationDataId id;
-
-    };
-
-} // namespace fields
+        class EMFieldBase
+            : public SimulationFieldHelper<MappingDesc>
+            , public ISimulationData
+        {
+        public:
+            //! Type of each field value
+            using ValueType = float3_X;
+
+            //! Number of components of ValueType, for serialization
+            static constexpr int numComponents = ValueType::dim;
+
+            //! Type of host-device buffer for field values
+            using Buffer = pmacc::GridBuffer<ValueType, simDim>;
+
+            //! Type of data box for field values on host and device
+            using DataBoxType = pmacc::DataBox<PitchedBox<ValueType, simDim>>;
+
+            //! Size of supercell
+            using SuperCellSize = MappingDesc::SuperCellSize;
+
+            /** Create a field
+             *
+             * @tparam T_tag communication tag value
+             *
+             * @param cellDescription mapping for kernels
+             * @param id unique id
+             * @param tag helper parameter for T_tag deduction
+             */
+            template<CommunicationTag T_tag>
+            HINLINE EMFieldBase(
+                MappingDesc const& cellDescription,
+                pmacc::SimulationDataId const& id,
+                std::integral_constant<CommunicationTag, T_tag> tag);
+
+            //! Get a reference to the host-device buffer for the field values
+            HINLINE Buffer& getGridBuffer();
+
+            //! Get the grid layout
+            HINLINE GridLayout<simDim> getGridLayout();
+
+            //! Get the host data box for the field values
+            HINLINE DataBoxType getHostDataBox();
+
+            //! Get the device data box for the field values
+            HINLINE DataBoxType getDeviceDataBox();
+
+            /** Start asynchronous communication of field values
+             *
+             * @param serialEvent event to depend on
+             */
+            HINLINE EventTask asyncCommunication(EventTask serialEvent);
+
+            /** Reset the host-device buffer for field values
+             *
+             * @param currentStep index of time iteration
+             */
+            HINLINE void reset(uint32_t currentStep) override;
+
+            //! Synchronize device data with host data
+            HINLINE void syncToDevice() override;
+
+            //! Synchronize host data with device data
+            HINLINE void synchronize() override;
+
+            //! Get id
+            HINLINE SimulationDataId getUniqueId() override;
+
+        private:
+            //! Host-device buffer for field values
+            std::unique_ptr<Buffer> buffer;
+
+            //! Unique id
+            pmacc::SimulationDataId id;
+        };
+
+    } // namespace fields
 } // namespace picongpu
diff --git a/include/picongpu/fields/EMFieldBase.tpp b/include/picongpu/fields/EMFieldBase.tpp
index 1371951691..6160caeed1 100644
--- a/include/picongpu/fields/EMFieldBase.tpp
+++ b/include/picongpu/fields/EMFieldBase.tpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Heiko Burau, Rene Widera, Felix Schmitt,
+/* Copyright 2013-2021 Axel Huebl, Heiko Burau, Rene Widera, Felix Schmitt,
  *                     Richard Pausch, Benjamin Worpitz, Sergei Bastrakov
  *
  * This file is part of PIConGPU.
@@ -35,7 +35,6 @@
 #include <pmacc/mappings/kernel/ExchangeMapping.hpp>
 #include <pmacc/math/Vector.hpp>
 #include <pmacc/memory/buffers/GridBuffer.hpp>
-#include <pmacc/memory/MakeUnique.hpp>
 #include <pmacc/particles/traits/FilterByFlag.hpp>
 
 #include <boost/mpl/accumulate.hpp>
@@ -46,135 +45,117 @@
 
 namespace picongpu
 {
-namespace fields
-{
-
-    template< CommunicationTag T_tag >
-    EMFieldBase::EMFieldBase(
-        MappingDesc const & cellDescription,
-        pmacc::SimulationDataId const & id,
-        std::integral_constant< CommunicationTag, T_tag >
-    ) :
-        SimulationFieldHelper< MappingDesc >( cellDescription ),
-        id( id )
+    namespace fields
     {
-        buffer = pmacc::memory::makeUnique< Buffer >(
-            cellDescription.getGridLayout( )
-        );
-
-        using VectorSpeciesWithInterpolation = typename pmacc::particles::traits::FilterByFlag
-        <
-            VectorAllSpecies,
-            interpolation<>
-        >::type;
-        using LowerMarginInterpolation = bmpl::accumulate<
-            VectorSpeciesWithInterpolation,
-            typename pmacc::math::CT::make_Int<simDim, 0>::type,
-            pmacc::math::CT::max<bmpl::_1, GetLowerMargin< GetInterpolation<bmpl::_2> > >
-        >::type;
-        using UpperMarginInterpolation = bmpl::accumulate<
-            VectorSpeciesWithInterpolation,
-            typename pmacc::math::CT::make_Int<simDim, 0>::type,
-            pmacc::math::CT::max<bmpl::_1, GetUpperMargin< GetInterpolation<bmpl::_2> > >
-        >::type;
-
-        /* Calculate the maximum Neighbors we need from MAX(ParticleShape, FieldSolver) */
-        using LowerMarginSolver = typename GetMargin<fields::Solver, T_tag >::LowerMargin;
-        using LowerMarginInterpolationAndSolver = typename pmacc::math::CT::max<
-            LowerMarginInterpolation,
-            LowerMarginSolver
-        >::type;
-        using UpperMarginSolver = typename GetMargin<fields::Solver, T_tag >::UpperMargin;
-        using UpperMarginInterpolationAndSolver = typename pmacc::math::CT::max<
-            UpperMarginInterpolation,
-            UpperMarginSolver
-        >::type;
-
-        /* Calculate upper and lower margin for pusher
-           (currently all pusher use the interpolation of the species)
-           and find maximum margin
-        */
-        using VectorSpeciesWithPusherAndInterpolation = typename pmacc::particles::traits::FilterByFlag
-        <
-            VectorSpeciesWithInterpolation,
-            particlePusher<>
-        >::type;
-        using LowerMargin = typename bmpl::accumulate<
-            VectorSpeciesWithPusherAndInterpolation,
-            LowerMarginInterpolationAndSolver,
-            pmacc::math::CT::max<bmpl::_1, GetLowerMarginPusher<bmpl::_2> >
-        >::type;
-
-        using UpperMargin = typename bmpl::accumulate<
-            VectorSpeciesWithPusherAndInterpolation,
-            UpperMarginInterpolationAndSolver,
-            pmacc::math::CT::max<bmpl::_1, GetUpperMarginPusher<bmpl::_2> >
-        >::type;
-
-        const DataSpace< simDim > originGuard( LowerMargin( ).toRT( ) );
-        const DataSpace< simDim > endGuard( UpperMargin( ).toRT( ) );
-
-        /*go over all directions*/
-        for ( uint32_t i = 1; i < NumberOfExchanges<simDim>::value; ++i )
+        template<CommunicationTag T_tag>
+        EMFieldBase::EMFieldBase(
+            MappingDesc const& cellDescription,
+            pmacc::SimulationDataId const& id,
+            std::integral_constant<CommunicationTag, T_tag>)
+            : SimulationFieldHelper<MappingDesc>(cellDescription)
+            , id(id)
         {
-            DataSpace<simDim> relativeMask = Mask::getRelativeDirections<simDim > ( i );
-            /* guarding cells depend on direction
-             * for negative direction use originGuard else endGuard (relative direction ZERO is ignored)
-             * don't switch end and origin because this is a read buffer and no send buffer
-             */
-            DataSpace<simDim> guardingCells;
-            for ( uint32_t d = 0; d < simDim; ++d )
-                guardingCells[d] = ( relativeMask[d] == -1 ? originGuard[d] : endGuard[d] );
-            buffer->addExchange( GUARD, i, guardingCells, T_tag );
+            buffer = std::make_unique<Buffer>(cellDescription.getGridLayout());
+
+            using VectorSpeciesWithInterpolation =
+                typename pmacc::particles::traits::FilterByFlag<VectorAllSpecies, interpolation<>>::type;
+            using LowerMarginInterpolation = bmpl::accumulate<
+                VectorSpeciesWithInterpolation,
+                typename pmacc::math::CT::make_Int<simDim, 0>::type,
+                pmacc::math::CT::max<bmpl::_1, GetLowerMargin<GetInterpolation<bmpl::_2>>>>::type;
+            using UpperMarginInterpolation = bmpl::accumulate<
+                VectorSpeciesWithInterpolation,
+                typename pmacc::math::CT::make_Int<simDim, 0>::type,
+                pmacc::math::CT::max<bmpl::_1, GetUpperMargin<GetInterpolation<bmpl::_2>>>>::type;
+
+            /* Calculate the maximum Neighbors we need from MAX(ParticleShape, FieldSolver) */
+            using LowerMarginSolver = typename GetMargin<fields::Solver, T_tag>::LowerMargin;
+            using LowerMarginInterpolationAndSolver =
+                typename pmacc::math::CT::max<LowerMarginInterpolation, LowerMarginSolver>::type;
+            using UpperMarginSolver = typename GetMargin<fields::Solver, T_tag>::UpperMargin;
+            using UpperMarginInterpolationAndSolver =
+                typename pmacc::math::CT::max<UpperMarginInterpolation, UpperMarginSolver>::type;
+
+            /* Calculate upper and lower margin for pusher
+               (currently all pushers use the interpolation of the species)
+               and find maximum margin
+            */
+            using VectorSpeciesWithPusherAndInterpolation = typename pmacc::particles::traits::
+                FilterByFlag<VectorSpeciesWithInterpolation, particlePusher<>>::type;
+            using LowerMargin = typename bmpl::accumulate<
+                VectorSpeciesWithPusherAndInterpolation,
+                LowerMarginInterpolationAndSolver,
+                pmacc::math::CT::max<bmpl::_1, GetLowerMarginPusher<bmpl::_2>>>::type;
+
+            using UpperMargin = typename bmpl::accumulate<
+                VectorSpeciesWithPusherAndInterpolation,
+                UpperMarginInterpolationAndSolver,
+                pmacc::math::CT::max<bmpl::_1, GetUpperMarginPusher<bmpl::_2>>>::type;
+
+            const DataSpace<simDim> originGuard(LowerMargin().toRT());
+            const DataSpace<simDim> endGuard(UpperMargin().toRT());
+
+            /*go over all directions*/
+            for(uint32_t i = 1; i < NumberOfExchanges<simDim>::value; ++i)
+            {
+                DataSpace<simDim> relativeMask = Mask::getRelativeDirections<simDim>(i);
+                /* guarding cells depend on direction
+                 * for negative direction use originGuard else endGuard (relative direction ZERO is ignored)
+                 * don't switch end and origin because this is a read buffer and no send buffer
+                 */
+                DataSpace<simDim> guardingCells;
+                for(uint32_t d = 0; d < simDim; ++d)
+                    guardingCells[d] = (relativeMask[d] == -1 ? originGuard[d] : endGuard[d]);
+                buffer->addExchange(GUARD, i, guardingCells, T_tag);
+            }
         }
-    }
 
-    EMFieldBase::Buffer & EMFieldBase::getGridBuffer( )
-    {
-        return *buffer;
-    }
+        EMFieldBase::Buffer& EMFieldBase::getGridBuffer()
+        {
+            return *buffer;
+        }
 
-    GridLayout< simDim > EMFieldBase::getGridLayout( )
-    {
-        return cellDescription.getGridLayout( );
-    }
+        GridLayout<simDim> EMFieldBase::getGridLayout()
+        {
+            return cellDescription.getGridLayout();
+        }
 
-    EMFieldBase::DataBoxType EMFieldBase::getHostDataBox( )
-    {
-        return buffer->getHostBuffer( ).getDataBox( );
-    }
+        EMFieldBase::DataBoxType EMFieldBase::getHostDataBox()
+        {
+            return buffer->getHostBuffer().getDataBox();
+        }
 
-    EMFieldBase::DataBoxType EMFieldBase::getDeviceDataBox( )
-    {
-        return buffer->getDeviceBuffer( ).getDataBox( );
-    }
+        EMFieldBase::DataBoxType EMFieldBase::getDeviceDataBox()
+        {
+            return buffer->getDeviceBuffer().getDataBox();
+        }
 
-    EventTask EMFieldBase::asyncCommunication( EventTask serialEvent )
-    {
-        EventTask eB = buffer->asyncCommunication( serialEvent );
-        return eB;
-    }
+        EventTask EMFieldBase::asyncCommunication(EventTask serialEvent)
+        {
+            EventTask eB = buffer->asyncCommunication(serialEvent);
+            return eB;
+        }
 
-    void EMFieldBase::reset( uint32_t )
-    {
-        buffer->getHostBuffer( ).reset( true );
-        buffer->getDeviceBuffer( ).reset( false );
-    }
+        void EMFieldBase::reset(uint32_t)
+        {
+            buffer->getHostBuffer().reset(true);
+            buffer->getDeviceBuffer().reset(false);
+        }
 
-    void EMFieldBase::syncToDevice( )
-    {
-        buffer->hostToDevice( );
-    }
+        void EMFieldBase::syncToDevice()
+        {
+            buffer->hostToDevice();
+        }
 
-    void EMFieldBase::synchronize( )
-    {
-        buffer->deviceToHost( );
-    }
+        void EMFieldBase::synchronize()
+        {
+            buffer->deviceToHost();
+        }
 
-    pmacc::SimulationDataId EMFieldBase::getUniqueId( )
-    {
-        return id;
-    }
+        pmacc::SimulationDataId EMFieldBase::getUniqueId()
+        {
+            return id;
+        }
 
-} // namespace fields
+    } // namespace fields
 } // namespace picongpu
diff --git a/include/picongpu/fields/FieldB.hpp b/include/picongpu/fields/FieldB.hpp
index b9662b44f3..f484123bca 100644
--- a/include/picongpu/fields/FieldB.hpp
+++ b/include/picongpu/fields/FieldB.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Heiko Burau, Rene Widera, Richard Pausch,
+/* Copyright 2013-2021 Axel Huebl, Heiko Burau, Rene Widera, Richard Pausch,
  *                     Benjamin Worpitz, Sergei Bastrakov
  *
  * This file is part of PIConGPU.
@@ -32,7 +32,6 @@
 
 namespace picongpu
 {
-
     /** Representation of the magnetic field
      *
      * Stores field values on host and device and provides data synchronization
@@ -44,18 +43,17 @@ namespace picongpu
     class FieldB : public fields::EMFieldBase
     {
     public:
-
         /** Create a field
          *
          * @param cellDescription mapping for kernels
          */
-        HINLINE FieldB( MappingDesc const & cellDescription );
+        HINLINE FieldB(MappingDesc const& cellDescription);
 
         //! Unit type of field components
-        using UnitValueType = promoteType< float_64, ValueType >::type;
+        using UnitValueType = promoteType<float_64, ValueType>::type;
 
         //! Get units of field components
-        HDINLINE static UnitValueType getUnit( );
+        HDINLINE static UnitValueType getUnit();
 
         /** Get unit representation as powers of the 7 base measures
          *
@@ -64,11 +62,10 @@ namespace picongpu
          *  thermodynamic temperature theta, amount of substance N,
          *  luminous intensity J)
          */
-        HINLINE static std::vector< float_64 > getUnitDimension( );
+        HINLINE static std::vector<float_64> getUnitDimension();
 
         //! Get text name
-        HINLINE static std::string getName( );
-
+        HINLINE static std::string getName();
     };
 
 } // namespace picongpu
diff --git a/include/picongpu/fields/FieldB.tpp b/include/picongpu/fields/FieldB.tpp
index c02bda48e9..faa0226289 100644
--- a/include/picongpu/fields/FieldB.tpp
+++ b/include/picongpu/fields/FieldB.tpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Heiko Burau, Rene Widera, Felix Schmitt,
+/* Copyright 2013-2021 Axel Huebl, Heiko Burau, Rene Widera, Felix Schmitt,
  *                     Richard Pausch, Benjamin Worpitz, Sergei Bastrakov
  *
  * This file is part of PIConGPU.
@@ -32,36 +32,31 @@
 
 namespace picongpu
 {
-
-    FieldB::FieldB( MappingDesc const & cellDescription ) :
-        fields::EMFieldBase(
-            cellDescription,
-            getName( ),
-            std::integral_constant< CommunicationTag, FIELD_B >{ }
-        )
+    FieldB::FieldB(MappingDesc const& cellDescription)
+        : fields::EMFieldBase(cellDescription, getName(), std::integral_constant<CommunicationTag, FIELD_B>{})
     {
     }
 
-    HDINLINE FieldB::UnitValueType FieldB::getUnit( )
+    HDINLINE FieldB::UnitValueType FieldB::getUnit()
     {
-        return UnitValueType{ UNIT_BFIELD, UNIT_BFIELD, UNIT_BFIELD };
+        return UnitValueType{UNIT_BFIELD, UNIT_BFIELD, UNIT_BFIELD};
     }
 
-    std::vector< float_64 > FieldB::getUnitDimension( )
+    std::vector<float_64> FieldB::getUnitDimension()
     {
         /* B is in Tesla : kg / (A * s^2)
          *   -> M * T^-2 * I^-1
          */
-        std::vector< float_64 > unitDimension( 7, 0.0 );
-        unitDimension.at( SIBaseUnits::mass ) =  1.0;
-        unitDimension.at( SIBaseUnits::time ) = -2.0;
-        unitDimension.at( SIBaseUnits::electricCurrent ) = -1.0;
+        std::vector<float_64> unitDimension(7, 0.0);
+        unitDimension.at(SIBaseUnits::mass) = 1.0;
+        unitDimension.at(SIBaseUnits::time) = -2.0;
+        unitDimension.at(SIBaseUnits::electricCurrent) = -1.0;
         return unitDimension;
     }
 
-    std::string FieldB::getName( )
+    std::string FieldB::getName()
     {
         return "B";
     }
 
-} //namespace picongpu
+} // namespace picongpu
diff --git a/include/picongpu/fields/FieldE.hpp b/include/picongpu/fields/FieldE.hpp
index 56a32035f5..f162d910d1 100644
--- a/include/picongpu/fields/FieldE.hpp
+++ b/include/picongpu/fields/FieldE.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Heiko Burau, Rene Widera, Richard Pausch,
+/* Copyright 2013-2021 Axel Huebl, Heiko Burau, Rene Widera, Richard Pausch,
  *                     Benjamin Worpitz, Sergei Bastrakov
  *
  * This file is part of PIConGPU.
@@ -32,7 +32,6 @@
 
 namespace picongpu
 {
-
     /** Representation of the electric field
      *
      * Stores field values on host and device and provides data synchronization
@@ -44,18 +43,17 @@ namespace picongpu
     class FieldE : public fields::EMFieldBase
     {
     public:
-
         /** Create a field
          *
          * @param cellDescription mapping for kernels
          */
-        HINLINE FieldE( MappingDesc const & cellDescription );
+        HINLINE FieldE(MappingDesc const& cellDescription);
 
         //! Unit type of field components
-        using UnitValueType = promoteType< float_64, ValueType >::type;
+        using UnitValueType = promoteType<float_64, ValueType>::type;
 
         //! Get units of field components
-        HDINLINE static UnitValueType getUnit( );
+        HDINLINE static UnitValueType getUnit();
 
         /** Get unit representation as powers of the 7 base measures
          *
@@ -64,11 +62,10 @@ namespace picongpu
          *  thermodynamic temperature theta, amount of substance N,
          *  luminous intensity J)
          */
-        HINLINE static std::vector< float_64 > getUnitDimension( );
+        HINLINE static std::vector<float_64> getUnitDimension();
 
         //! Get text name
-        HINLINE static std::string getName( );
-
+        HINLINE static std::string getName();
     };
 
 } // namespace picongpu
diff --git a/include/picongpu/fields/FieldE.tpp b/include/picongpu/fields/FieldE.tpp
index c9cab400e4..074da2796f 100644
--- a/include/picongpu/fields/FieldE.tpp
+++ b/include/picongpu/fields/FieldE.tpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Heiko Burau, Rene Widera, Felix Schmitt,
+/* Copyright 2013-2021 Axel Huebl, Heiko Burau, Rene Widera, Felix Schmitt,
  *                     Richard Pausch, Benjamin Worpitz, Sergei Bastrakov
  *
  * This file is part of PIConGPU.
@@ -32,35 +32,30 @@
 
 namespace picongpu
 {
-
-    FieldE::FieldE( MappingDesc const & cellDescription ) :
-        fields::EMFieldBase(
-            cellDescription,
-            getName( ),
-            std::integral_constant< CommunicationTag, FIELD_E >{ }
-        )
+    FieldE::FieldE(MappingDesc const& cellDescription)
+        : fields::EMFieldBase(cellDescription, getName(), std::integral_constant<CommunicationTag, FIELD_E>{})
     {
     }
 
-    HDINLINE FieldE::UnitValueType FieldE::getUnit( )
+    HDINLINE FieldE::UnitValueType FieldE::getUnit()
     {
-        return UnitValueType{ UNIT_EFIELD, UNIT_EFIELD, UNIT_EFIELD };
+        return UnitValueType{UNIT_EFIELD, UNIT_EFIELD, UNIT_EFIELD};
     }
 
-    std::vector< float_64 > FieldE::getUnitDimension( )
+    std::vector<float_64> FieldE::getUnitDimension()
     {
         /* E is in volts per meters: V / m = kg * m / (A * s^3)
          *   -> L * M * T^-3 * I^-1
          */
-        std::vector< float_64 > unitDimension( 7, 0.0 );
-        unitDimension.at( SIBaseUnits::length ) =  1.0;
-        unitDimension.at( SIBaseUnits::mass )   =  1.0;
-        unitDimension.at( SIBaseUnits::time )   = -3.0;
-        unitDimension.at( SIBaseUnits::electricCurrent ) = -1.0;
+        std::vector<float_64> unitDimension(7, 0.0);
+        unitDimension.at(SIBaseUnits::length) = 1.0;
+        unitDimension.at(SIBaseUnits::mass) = 1.0;
+        unitDimension.at(SIBaseUnits::time) = -3.0;
+        unitDimension.at(SIBaseUnits::electricCurrent) = -1.0;
         return unitDimension;
     }
 
-    std::string FieldE::getName( )
+    std::string FieldE::getName()
     {
         return "E";
     }
diff --git a/include/picongpu/fields/FieldJ.hpp b/include/picongpu/fields/FieldJ.hpp
index acb50d2610..0865845907 100644
--- a/include/picongpu/fields/FieldJ.hpp
+++ b/include/picongpu/fields/FieldJ.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Heiko Burau, Rene Widera, Richard Pausch,
+/* Copyright 2013-2021 Axel Huebl, Heiko Burau, Rene Widera, Richard Pausch,
  *                     Benjamin Worpitz
  *
  * This file is part of PIConGPU.
@@ -42,7 +42,6 @@
 
 namespace picongpu
 {
-
     /** Representation of the current density field
      *
      * Stores field values on host and device and provides data synchronization
@@ -51,10 +50,11 @@ namespace picongpu
      * Implements interfaces defined by SimulationFieldHelper< MappingDesc > and
      * ISimulationData.
      */
-    class FieldJ : public SimulationFieldHelper<MappingDesc>, public ISimulationData
+    class FieldJ
+        : public SimulationFieldHelper<MappingDesc>
+        , public ISimulationData
     {
     public:
-
         //! Type of each field value
         using ValueType = float3_X;
 
@@ -65,19 +65,19 @@ namespace picongpu
         using UnitValueType = promoteType<float_64, ValueType>::type;
 
         //! Type of data box for field values on host and device
-        using DataBoxType = DataBox<PitchedBox<ValueType, simDim> >;
+        using DataBoxType = DataBox<PitchedBox<ValueType, simDim>>;
 
         /** Create a field
          *
          * @param cellDescription mapping for kernels
          */
-        HINLINE FieldJ(MappingDesc const & cellDescription);
+        HINLINE FieldJ(MappingDesc const& cellDescription);
 
         //! Destroy a field
         HINLINE virtual ~FieldJ() = default;
 
         //! Get a reference to the host-device buffer for the field values
-        HINLINE GridBuffer<ValueType, simDim> &getGridBuffer();
+        HINLINE GridBuffer<ValueType, simDim>& getGridBuffer();
 
         //! Get the grid layout
         HINLINE GridLayout<simDim> getGridLayout();
@@ -149,17 +149,17 @@ namespace picongpu
          * @param currentStep index of time iteration
          */
         template<uint32_t T_area, class T_Species>
-        HINLINE void computeCurrent(T_Species & species, uint32_t currentStep);
+        HINLINE void computeCurrent(T_Species& species, uint32_t currentStep);
 
         /** Smooth current density and add it to the electric field
          *
          * @tparam T_area area to operate on
-         * @tparam T_CurrentInterpolation current interpolation type
+         * @tparam T_CurrentInterpolationFunctor current interpolation functor type
          *
-         * @param myCurrentInterpolation current interpolation
+         * @param myCurrentInterpolationFunctor current interpolation functor
          */
-        template<uint32_t T_area, class T_CurrentInterpolation>
-        HINLINE void addCurrentToEMF( T_CurrentInterpolation& myCurrentInterpolation );
+        template<uint32_t T_area, class T_CurrentInterpolationFunctor>
+        HINLINE void addCurrentToEMF(T_CurrentInterpolationFunctor myCurrentInterpolationFunctor);
 
         /** Bash field in a direction.
          *
@@ -176,13 +176,11 @@ namespace picongpu
         HINLINE void insertField(uint32_t exchangeType);
 
     private:
-
         //! Host-device buffer for current density values
         GridBuffer<ValueType, simDim> buffer;
 
         //! Buffer for receiving near-boundary values
-        std::unique_ptr< GridBuffer<ValueType, simDim> > fieldJrecv;
-
+        std::unique_ptr<GridBuffer<ValueType, simDim>> fieldJrecv;
     };
 
 } // namespace picongpu
diff --git a/include/picongpu/fields/FieldJ.kernel b/include/picongpu/fields/FieldJ.kernel
index 5418068275..7590e441ab 100644
--- a/include/picongpu/fields/FieldJ.kernel
+++ b/include/picongpu/fields/FieldJ.kernel
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Heiko Burau, Rene Widera, Marco Garten,
+/* Copyright 2013-2021 Axel Huebl, Heiko Burau, Rene Widera, Marco Garten,
  *                     Benjamin Worpitz
  *
  * This file is part of PIConGPU.
@@ -20,419 +20,275 @@
 
 #pragma once
 
-#include <pmacc/types.hpp>
-#include <pmacc/particles/frame_types.hpp>
-
 #include "picongpu/simulation_defines.hpp"
-
-#include "FieldJ.hpp"
-#include <pmacc/particles/memory/boxes/ParticlesBox.hpp>
-
-
+#include "picongpu/fields/currentDeposition/Strategy.def"
+#include "picongpu/fields/currentDeposition/Cache.hpp"
+#include "picongpu/fields/FieldJ.hpp"
 #include "picongpu/algorithms/Velocity.hpp"
 
 #include <pmacc/memory/boxes/CachedBox.hpp>
 #include <pmacc/dimensions/DataSpaceOperations.hpp>
-#include <pmacc/nvidia/functors/Add.hpp>
 #include <pmacc/mappings/threads/ThreadCollective.hpp>
-#include "picongpu/algorithms/Set.hpp"
 #include <pmacc/mappings/threads/ForEachIdx.hpp>
 #include <pmacc/mappings/threads/IdxConfig.hpp>
 #include <pmacc/memory/CtxArray.hpp>
 #include <pmacc/particles/frame_types.hpp>
+#include <pmacc/particles/memory/boxes/ParticlesBox.hpp>
+#include <pmacc/types.hpp>
+
+#include <type_traits>
+#include <utility>
 
 namespace picongpu
 {
+    namespace currentSolver
+    {
+        /** compute current
+         *
+         * @tparam T_numWorkers number of workers
+         * @tparam T_BlockDescription current field domain description needed for the
+         *                            collective stencil
+         */
+        template<uint32_t T_numWorkers, typename T_BlockDescription>
+        struct KernelComputeCurrent
+        {
+            /** scatter particle current of particles located in a supercell
+             *
+             * The current for the supercell including the guards is cached in shared memory
+             * and scattered at the end of the functor to the global memory.
+             *
+             * @tparam JBox pmacc::DataBox, particle current box type
+             * @tparam ParBox pmacc::ParticlesBox, particle box type
+             * @tparam Mapping mapper functor type
+             * @tparam FrameSolver frame solver functor type
+             * @tparam T_Acc alpaka accelerator type
+             *
+             * @param acc alpaka accelerator
+             * @param fieldJ field with particle current
+             * @param boxPar particle memory
+             * @param frameSolver functor to calculate the current for a frame
+             * @param mapper functor to map a block to a supercell
+             */
+            template<typename JBox, typename ParBox, typename FrameSolver, typename Mapping, typename T_Acc>
+            DINLINE void operator()(
+                T_Acc const& acc,
+                JBox fieldJ,
+                ParBox boxPar,
+                FrameSolver frameSolver,
+                Mapping mapper) const
+            {
+                using namespace mappings::threads;
 
-using namespace pmacc;
+                using FrameType = typename ParBox::FrameType;
+                using FramePtr = typename ParBox::FramePtr;
+                using SuperCellSize = typename Mapping::SuperCellSize;
 
-using J_DataBox = FieldJ::DataBoxType;
+                /** @todo numParticlesPerFrame should be max number of particles within a frame
+                 * and not a magic number derived from SuperCellSize
+                 */
+                constexpr uint32_t numParticlesPerFrame = pmacc::math::CT::volume<SuperCellSize>::type::value;
+                constexpr uint32_t numWorkers = T_numWorkers;
 
-/** compute current
- *
- * @tparam T_numWorkers number of workers
- * @tparam T_BlockDescription current field domain description needed for the
- *                            collective stencil
- */
-template<
-    uint32_t T_numWorkers,
-    typename T_BlockDescription
->
-struct KernelComputeCurrent
-{
-    /** scatter particle current of particles located in a supercell
-     *
-     * The current for the supercell including the guards is cached in shared memory
-     * and scattered at the end of the functor to the global memory.
-     *
-     * @tparam JBox pmacc::DataBox, particle current box type
-     * @tparam ParBox pmacc::ParticlesBox, particle box type
-     * @tparam Mapping mapper functor type
-     * @tparam FrameSolver frame solver functor type
-     * @param T_Acc alpaka accelerator type
-     *
-     * @param alpaka accelerator
-     * @param fieldJ field with particle current
-     * @param boxPar particle memory
-     * @param frameSolver functor to calculate the current for a frame
-     * @param mapper functor to map a block to a supercell
-     */
-    template<
-        typename JBox,
-        typename ParBox,
-        typename FrameSolver,
-        typename Mapping,
-        typename T_Acc
-    >
-    DINLINE void operator()(
-        T_Acc const & acc,
-        JBox fieldJ,
-        ParBox boxPar,
-        FrameSolver frameSolver,
-        Mapping mapper
-    ) const
-    {
-        using namespace mappings::threads;
+                /* We work with virtual CUDA blocks if we have more workers than particles per frame.
+                 * Each virtual CUDA block works on one frame; with 2 blocks, each block processes
+                 * every second frame until all frames are processed.
+                 */
+                constexpr uint32_t numVirtualBlocks = (numWorkers + numParticlesPerFrame - 1u) / numParticlesPerFrame;
 
-        using FrameType = typename ParBox::FrameType;
-        using FramePtr = typename ParBox::FramePtr;
-        using SuperCellSize = typename Mapping::SuperCellSize;
 
-        /** @todo numParticlesPerFrame should be max number of particles within a frame
-         * and not a magic number derived from SuperCellSize
-         */
-        constexpr uint32_t numParticlesPerFrame = pmacc::math::CT::volume< SuperCellSize >::type::value;
-        constexpr uint32_t numWorkers = T_numWorkers;
+                const DataSpace<simDim> block(mapper.getSuperCellIndex(DataSpace<simDim>(cupla::blockIdx(acc))));
+                uint32_t const workerIdx = cupla::threadIdx(acc).x;
 
-        /* We work with virtual CUDA blocks if we have more workers than particles.
-         * Each virtual CUDA block is working on a frame, if we have 2 blocks each block processes
-         * every second frame until all frames are processed.
-         */
-        constexpr uint32_t numVirtualBlocks = ( numWorkers + numParticlesPerFrame - 1u ) / numParticlesPerFrame;
-
-
-        const DataSpace< simDim > block(
-            mapper.getSuperCellIndex(
-                DataSpace< simDim >( blockIdx )
-            )
-        );
-        uint32_t const workerIdx = threadIdx.x;
-
-        using VirtualWorkerDomCfg = IdxConfig<
-            numParticlesPerFrame * numVirtualBlocks,
-            numWorkers
-        >;
-
-        /* each virtual worker is part of one virtual block */
-        memory::CtxArray<
-            uint32_t,
-            VirtualWorkerDomCfg
-        >
-        virtualBlockIdCtx(
-            workerIdx,
-            [&](
-                uint32_t const linearIdx,
-                uint32_t const
-            )
-            {
-                return linearIdx / numParticlesPerFrame;
-            }
-        );
-
-        /* linear virtual worker index in the virtual block*/
-        memory::CtxArray<
-            uint32_t,
-            VirtualWorkerDomCfg
-        >
-        virtualLinearIdCtx(
-            workerIdx,
-            [&](
-                uint32_t const linearIdx,
-                uint32_t const idx
-            )
-            {
-                /* map virtualLinearIdCtx to the range [0;numParticlesPerFrame) */
-                return linearIdx - ( virtualBlockIdCtx[ idx ] * numParticlesPerFrame );
-            }
-        );
-
-        /* each virtual worker stores the currently used frame */
-        memory::CtxArray<
-            FramePtr,
-            VirtualWorkerDomCfg
-        > frameCtx;
-
-        memory::CtxArray<
-            lcellId_t,
-            VirtualWorkerDomCfg
-        > particlesInSuperCellCtx( 0u );
-
-        /* loop over all virtual workers */
-        ForEachIdx< VirtualWorkerDomCfg > forEachVirtualWorker( workerIdx );
-
-        forEachVirtualWorker(
-            [&](
-                uint32_t const,
-                uint32_t const idx
-            )
-            {
-                frameCtx[ idx ] = boxPar.getLastFrame( block );
-                if( frameCtx[ idx ].isValid() && virtualBlockIdCtx[ idx ] == 0u )
-                    particlesInSuperCellCtx[ idx ] = boxPar.getSuperCell( block ).getSizeLastFrame();
+                using VirtualWorkerDomCfg = IdxConfig<numParticlesPerFrame * numVirtualBlocks, numWorkers>;
 
-                /* select N-th (N=virtualBlockId) frame from the end of the list */
-                for( uint32_t i = 1; i <= virtualBlockIdCtx[ idx ] && frameCtx[ idx ].isValid(); ++i )
-                {
-                    particlesInSuperCellCtx[ idx ] = numParticlesPerFrame;
-                    frameCtx[ idx ] = boxPar.getPreviousFrame( frameCtx[ idx ] );
-                }
-            }
-        );
+                /* each virtual worker is part of one virtual block */
+                memory::CtxArray<uint32_t, VirtualWorkerDomCfg> virtualBlockIdCtx(
+                    workerIdx,
+                    [&](uint32_t const linearIdx, uint32_t const) { return linearIdx / numParticlesPerFrame; });
 
-        /* this memory is used by all virtual blocks */
-        auto cachedJ = CachedBox::create<
-            0u,
-            typename JBox::ValueType
-        >(
-            acc,
-            T_BlockDescription()
-        );
+                /* linear virtual worker index within its virtual block */
+                memory::CtxArray<uint32_t, VirtualWorkerDomCfg> virtualLinearIdCtx(
+                    workerIdx,
+                    [&](uint32_t const linearIdx, uint32_t const idx) {
+                        /* map virtualLinearIdCtx to the range [0;numParticlesPerFrame) */
+                        return linearIdx - (virtualBlockIdCtx[idx] * numParticlesPerFrame);
+                    });
 
-        Set< typename JBox::ValueType > set( float3_X::create( 0.0 ) );
-        ThreadCollective<
-            T_BlockDescription,
-            numWorkers
-        > collectiveSet( workerIdx );
+                /* each virtual worker stores the currently used frame */
+                memory::CtxArray<FramePtr, VirtualWorkerDomCfg> frameCtx;
 
-        /* initialize shared memory with zeros */
-        collectiveSet( acc, set, cachedJ );
+                memory::CtxArray<lcellId_t, VirtualWorkerDomCfg> particlesInSuperCellCtx(0u);
 
-        __syncthreads();
+                /* loop over all virtual workers */
+                ForEachIdx<VirtualWorkerDomCfg> forEachVirtualWorker(workerIdx);
 
-        while( true )
-        {
-            bool isOneFrameValid = false;
-            forEachVirtualWorker(
-                [&](
-                    uint32_t const,
-                    uint32_t const idx
-                )
-                {
-                    isOneFrameValid = isOneFrameValid || frameCtx[ idx ].isValid();
-                }
-            );
-
-            if( !isOneFrameValid )
-                break;
+                forEachVirtualWorker([&](uint32_t const, uint32_t const idx) {
+                    frameCtx[idx] = boxPar.getLastFrame(block);
+                    if(frameCtx[idx].isValid() && virtualBlockIdCtx[idx] == 0u)
+                        particlesInSuperCellCtx[idx] = boxPar.getSuperCell(block).getSizeLastFrame();
 
-            forEachVirtualWorker(
-                [&](
-                    uint32_t const,
-                    uint32_t const idx
-                )
-                {
-                    /* this test is only important for the last frame
-                     * if the frame is not the last one then: `particlesInSuperCell == numParticlesPerFrame`
-                     */
-                    if(
-                        frameCtx[ idx ].isValid() &&
-                        virtualLinearIdCtx[ idx ] < particlesInSuperCellCtx[ idx ]
-                    )
+                    /* select N-th (N=virtualBlockId) frame from the end of the list */
+                    for(uint32_t i = 1; i <= virtualBlockIdCtx[idx] && frameCtx[idx].isValid(); ++i)
                     {
-                        frameSolver(
-                            acc,
-                            *frameCtx[ idx ],
-                            virtualLinearIdCtx[ idx ],
-                            cachedJ
-                        );
+                        particlesInSuperCellCtx[idx] = numParticlesPerFrame;
+                        frameCtx[idx] = boxPar.getPreviousFrame(frameCtx[idx]);
                     }
-                }
-            );
+                });
+
+                DataSpace<simDim> const blockCell = block * SuperCellSize::toRT();
+                using Strategy = currentSolver::traits::GetStrategy_t<FrameSolver>;
 
-            forEachVirtualWorker(
-                [&](
-                    uint32_t const,
-                    uint32_t const idx
-                )
+                /* this memory is used by all virtual blocks */
+                auto cachedJ = detail::Cache<Strategy>::template create<numWorkers, T_BlockDescription>(
+                    acc,
+                    fieldJ.shift(blockCell),
+                    workerIdx);
+
+                cupla::__syncthreads(acc);
+
+                while(true)
                 {
-                    if( frameCtx[ idx ].isValid() )
-                    {
-                        particlesInSuperCellCtx[ idx ] = numParticlesPerFrame;
-                        for( int i = 0; i < numVirtualBlocks && frameCtx[ idx ].isValid(); ++i )
+                    bool isOneFrameValid = false;
+                    forEachVirtualWorker([&](uint32_t const, uint32_t const idx) {
+                        isOneFrameValid = isOneFrameValid || frameCtx[idx].isValid();
+                    });
+
+                    if(!isOneFrameValid)
+                        break;
+
+                    forEachVirtualWorker([&](uint32_t const, uint32_t const idx) {
+                        /* this test is only important for the last frame
+                         * if the frame is not the last one then: `particlesInSuperCell == numParticlesPerFrame`
+                         */
+                        if(frameCtx[idx].isValid() && virtualLinearIdCtx[idx] < particlesInSuperCellCtx[idx])
                         {
-                            frameCtx[ idx ] = boxPar.getPreviousFrame( frameCtx[ idx ] );
+                            frameSolver(acc, *frameCtx[idx], virtualLinearIdCtx[idx], cachedJ);
                         }
-                    }
+                    });
+
+                    forEachVirtualWorker([&](uint32_t const, uint32_t const idx) {
+                        if(frameCtx[idx].isValid())
+                        {
+                            particlesInSuperCellCtx[idx] = numParticlesPerFrame;
+                            for(int i = 0; i < numVirtualBlocks && frameCtx[idx].isValid(); ++i)
+                            {
+                                frameCtx[idx] = boxPar.getPreviousFrame(frameCtx[idx]);
+                            }
+                        }
+                    });
                 }
-            );
-        }
-
-        /* we wait that all workers finish the loop */
-        __syncthreads();
-
-        nvidia::functors::Add add;
-        DataSpace< simDim > const blockCell = block * SuperCellSize::toRT();
-        ThreadCollective<
-            T_BlockDescription,
-            numWorkers
-        > collectiveAdd( workerIdx );
-        auto fieldJBlock = fieldJ.shift( blockCell );
-
-        /* write scatter results back to the global memory */
-        collectiveAdd(
-            acc,
-            add,
-            fieldJBlock,
-            cachedJ
-        );
-    }
-};
-
-template<class ParticleAlgo, class Velocity, class TVec>
-struct ComputeCurrentPerFrame
-{
 
-    HDINLINE ComputeCurrentPerFrame(const float_X deltaTime) :
-    m_deltaTime(deltaTime)
-    {
-    }
-
-    template<
-        typename FrameType,
-        typename BoxJ,
-        typename T_Acc
-    >
-    DINLINE void operator()(
-        T_Acc const & acc,
-        FrameType& frame,
-        const int localIdx,
-        BoxJ & jBox
-    )
-    {
+                /* wait until all workers have finished the loop */
+                cupla::__syncthreads(acc);
 
-        auto particle = frame[localIdx];
-        const float_X weighting = particle[weighting_];
-        const floatD_X pos = particle[position_];
-        const int particleCellIdx = particle[localCellIdx_];
-        const float_X charge = attribute::getCharge(weighting,particle);
-        const DataSpace<simDim> localCell(DataSpaceOperations<simDim>::template map<TVec > (particleCellIdx));
-
-        Velocity velocity;
-        const float3_X vel = velocity(
-                                      particle[momentum_],
-                                      attribute::getMass(weighting,particle));
-        auto fieldJShiftToParticle = jBox.shift(localCell);
-        ParticleAlgo perParticle;
-        perParticle(
-            acc,
-            fieldJShiftToParticle,
-            pos,
-            vel,
-            charge,
-            m_deltaTime
-        );
-    }
-
-private:
-    PMACC_ALIGN(m_deltaTime, const float_32);
-};
-
-/** add current to electric and magnetic field
- *
- * @tparam T_numWorkers number of workers
- */
-template<
-    uint32_t T_numWorkers
->
-struct KernelAddCurrentToEMF
-{
-    template<
-        typename T_CurrentInterpolation,
-        typename T_Mapping,
-        typename T_Acc
-    >
-    DINLINE void operator()(
-        T_Acc const & acc,
-        typename FieldE::DataBoxType fieldE,
-        typename FieldB::DataBoxType fieldB,
-        J_DataBox fieldJ,
-        T_CurrentInterpolation currentInterpolation,
-        T_Mapping mapper
-    ) const
-    {
-        using namespace mappings::threads;
-
-        /* Caching of fieldJ */
-        typedef SuperCellDescription<
-            SuperCellSize,
-            typename T_CurrentInterpolation::LowerMargin,
-            typename T_CurrentInterpolation::UpperMargin
-        > BlockArea;
-
-        constexpr uint32_t cellsPerSuperCell = pmacc::math::CT::volume< SuperCellSize >::type::value;
-        constexpr uint32_t numWorkers = T_numWorkers;
-
-        uint32_t const workerIdx = threadIdx.x;
-
-        auto cachedJ = CachedBox::create<
-            0,
-            typename J_DataBox::ValueType
-        >(
-            acc,
-            BlockArea( )
-        );
-
-        nvidia::functors::Assign assign;
-        DataSpace< simDim > const block(
-            mapper.getSuperCellIndex( DataSpace< simDim >( blockIdx ) )
-        );
-        DataSpace< simDim > const blockCell = block * MappingDesc::SuperCellSize::toRT();
-
-
-        auto fieldJBlock = fieldJ.shift(blockCell);
-
-        ThreadCollective<
-            BlockArea,
-            numWorkers
-        > collective( workerIdx );
-
-        collective(
-            acc,
-            assign,
-            cachedJ,
-            fieldJBlock
-        );
-
-        __syncthreads( );
-
-        ForEachIdx<
-            IdxConfig<
-                cellsPerSuperCell,
-                numWorkers
-            >
-        >{ workerIdx }(
-            [&](
-                uint32_t const linearIdx,
-                uint32_t const
-            )
+                /* flush the cached current for this supercell back into fieldJ */
+                detail::Cache<Strategy>::template flush<numWorkers, T_BlockDescription>(
+                    acc,
+                    fieldJ.shift(blockCell),
+                    cachedJ,
+                    workerIdx);
+            }
+        };
+
+        template<typename T_ParticleAlgo, typename Velocity, typename TVec>
+        struct ComputePerFrame
+        {
+            using ParticleAlgo = T_ParticleAlgo;
+
+            HDINLINE ComputePerFrame(const float_X deltaTime) : m_deltaTime(deltaTime)
+            {
+            }
+
+            template<typename FrameType, typename BoxJ, typename T_Acc>
+            DINLINE void operator()(T_Acc const& acc, FrameType& frame, const int localIdx, BoxJ& jBox)
+            {
+                auto particle = frame[localIdx];
+                const float_X weighting = particle[weighting_];
+                const floatD_X pos = particle[position_];
+                const int particleCellIdx = particle[localCellIdx_];
+                const float_X charge = attribute::getCharge(weighting, particle);
+                const DataSpace<simDim> localCell(DataSpaceOperations<simDim>::template map<TVec>(particleCellIdx));
+
+                Velocity velocity;
+                const float3_X vel = velocity(particle[momentum_], attribute::getMass(weighting, particle));
+                auto fieldJShiftToParticle = jBox.shift(localCell);
+                ParticleAlgo perParticle;
+                perParticle(acc, fieldJShiftToParticle, pos, vel, charge, m_deltaTime);
+            }
+
+        private:
+            PMACC_ALIGN(m_deltaTime, const float_32);
+        };
+
+        namespace traits
+        {
+            template<typename ParticleAlgo, typename Velocity, typename TVec>
+            struct GetStrategy<ComputePerFrame<ParticleAlgo, Velocity, TVec>>
+            {
+                using type = GetStrategy_t<ParticleAlgo>;
+            };
+        } // namespace traits
+
+        /** add current to electric and magnetic field
+         *
+         * @tparam T_numWorkers number of workers
+         */
+        template<uint32_t T_numWorkers>
+        struct KernelAddCurrentToEMF
+        {
+            template<typename T_CurrentInterpolationFunctor, typename T_Mapping, typename T_Acc>
+            DINLINE void operator()(
+                T_Acc const& acc,
+                typename FieldE::DataBoxType fieldE,
+                typename FieldB::DataBoxType fieldB,
+                typename FieldJ::DataBoxType fieldJ,
+                T_CurrentInterpolationFunctor currentInterpolationFunctor,
+                T_Mapping mapper) const
             {
-                /* cell index within the superCell */
-                DataSpace< simDim > const cellIdx =
-                    DataSpaceOperations< simDim >::template map< SuperCellSize >( linearIdx );
-                DataSpace< simDim > const cell( blockCell + cellIdx );
-
-                // Amperes Law:
-                //   Change of the dE = - j / EPS0 * dt
-                //                        j = current density (= current per area)
-                //                          = fieldJ
-                currentInterpolation(
-                    fieldE.shift( cell ),
-                    fieldB.shift( cell ),
-                    cachedJ.shift( cellIdx )
-                );
+                using namespace mappings::threads;
+
+                /* Caching of fieldJ */
+                typedef SuperCellDescription<
+                    SuperCellSize,
+                    typename T_CurrentInterpolationFunctor::LowerMargin,
+                    typename T_CurrentInterpolationFunctor::UpperMargin>
+                    BlockArea;
+
+                constexpr uint32_t cellsPerSuperCell = pmacc::math::CT::volume<SuperCellSize>::type::value;
+                constexpr uint32_t numWorkers = T_numWorkers;
+
+                uint32_t const workerIdx = cupla::threadIdx(acc).x;
+
+                auto cachedJ = CachedBox::create<0, typename FieldJ::DataBoxType::ValueType>(acc, BlockArea());
+
+                nvidia::functors::Assign assign;
+                DataSpace<simDim> const block(mapper.getSuperCellIndex(DataSpace<simDim>(cupla::blockIdx(acc))));
+                DataSpace<simDim> const blockCell = block * MappingDesc::SuperCellSize::toRT();
+
+
+                auto fieldJBlock = fieldJ.shift(blockCell);
+
+                ThreadCollective<BlockArea, numWorkers> collective(workerIdx);
+
+                collective(acc, assign, cachedJ, fieldJBlock);
+
+                cupla::__syncthreads(acc);
+
+                ForEachIdx<IdxConfig<cellsPerSuperCell, numWorkers>>{workerIdx}(
+                    [&](uint32_t const linearIdx, uint32_t const) {
+                        /* cell index within the superCell */
+                        DataSpace<simDim> const cellIdx
+                            = DataSpaceOperations<simDim>::template map<SuperCellSize>(linearIdx);
+                        DataSpace<simDim> const cell(blockCell + cellIdx);
+
+                        // Ampere's law:
+                        //   Change of the dE = - j / EPS0 * dt
+                        //                        j = current density (= current per area)
+                        //                          = fieldJ
+                        currentInterpolationFunctor(fieldE.shift(cell), fieldB.shift(cell), cachedJ.shift(cellIdx));
+                    });
             }
-        );
-    }
-};
+        };
 
+    } // namespace currentSolver
 } // namespace picongpu
diff --git a/include/picongpu/fields/FieldJ.tpp b/include/picongpu/fields/FieldJ.tpp
index 1ea6c35c60..9eea174353 100644
--- a/include/picongpu/fields/FieldJ.tpp
+++ b/include/picongpu/fields/FieldJ.tpp
@@ -1,5 +1,5 @@
-/* Copyright 2013-2020 Axel Huebl, Heiko Burau, Rene Widera, Felix Schmitt,
- *                     Richard Pausch, Benjamin Worpitz
+/* Copyright 2013-2021 Axel Huebl, Heiko Burau, Rene Widera, Felix Schmitt,
+ *                     Richard Pausch, Benjamin Worpitz, Sergei Bastrakov
  *
  * This file is part of PIConGPU.
  *
@@ -23,22 +23,20 @@
 #include "picongpu/simulation_defines.hpp"
 #include "picongpu/fields/FieldJ.hpp"
 #include "picongpu/fields/FieldJ.kernel"
-
+#include "picongpu/fields/currentInterpolation/CurrentInterpolation.hpp"
+#include "picongpu/fields/currentDeposition/Deposit.hpp"
+#include "picongpu/particles/traits/GetCurrentSolver.hpp"
+#include "picongpu/traits/GetMargin.hpp"
+#include "picongpu/traits/SIBaseUnits.hpp"
 
 #include <pmacc/particles/memory/boxes/ParticlesBox.hpp>
-
 #include <pmacc/Environment.hpp>
 #include <pmacc/mappings/kernel/AreaMapping.hpp>
-#include <pmacc/mappings/kernel/StrideMapping.hpp>
 #include <pmacc/fields/tasks/FieldFactory.hpp>
 #include <pmacc/math/Vector.hpp>
-#include <pmacc/memory/MakeUnique.hpp>
 #include <pmacc/fields/operations/CopyGuardToExchange.hpp>
 #include <pmacc/fields/operations/AddExchangeToBorder.hpp>
-#include "picongpu/particles/traits/GetCurrentSolver.hpp"
-#include "picongpu/traits/GetMargin.hpp"
 #include <pmacc/traits/Resolve.hpp>
-#include "picongpu/traits/SIBaseUnits.hpp"
 #include <pmacc/traits/GetNumWorkers.hpp>
 
 #include <boost/mpl/accumulate.hpp>
@@ -49,295 +47,234 @@
 
 namespace picongpu
 {
+    using namespace pmacc;
 
-using namespace pmacc;
-
-FieldJ::FieldJ( MappingDesc const & cellDescription ) :
-    SimulationFieldHelper<MappingDesc>( cellDescription ),
-    buffer( cellDescription.getGridLayout( ) ),
-    fieldJrecv( nullptr )
-{
-    const DataSpace<simDim> coreBorderSize = cellDescription.getGridLayout( ).getDataSpaceWithoutGuarding( );
-
-    /* cell margins the current might spread to due to particle shapes */
-    using AllSpeciesWithCurrent = typename pmacc::particles::traits::FilterByFlag<
-        VectorAllSpecies,
-        current<>
-    >::type;
-
-    using LowerMarginShapes = bmpl::accumulate<
-        AllSpeciesWithCurrent,
-        typename pmacc::math::CT::make_Int<simDim, 0>::type,
-        pmacc::math::CT::max<bmpl::_1, GetLowerMargin< GetCurrentSolver<bmpl::_2> > >
-        >::type;
-
-    using UpperMarginShapes = bmpl::accumulate<
-        AllSpeciesWithCurrent,
-        typename pmacc::math::CT::make_Int<simDim, 0>::type,
-        pmacc::math::CT::max<bmpl::_1, GetUpperMargin< GetCurrentSolver<bmpl::_2> > >
-        >::type;
-
-    /* margins are always positive, also for lower margins
-     * additional current interpolations and current filters on FieldJ might
-     * spread the dependencies on neighboring cells
-     *   -> use max(shape,filter) */
-    using LowerMargin = pmacc::math::CT::max<
-        LowerMarginShapes,
-        GetMargin<typename fields::Solver::CurrentInterpolation>::LowerMargin
-        >::type;
-
-    using UpperMargin = pmacc::math::CT::max<
-        UpperMarginShapes,
-        GetMargin<typename fields::Solver::CurrentInterpolation>::UpperMargin
-        >::type;
-
-    const DataSpace<simDim> originGuard( LowerMargin( ).toRT( ) );
-    const DataSpace<simDim> endGuard( UpperMargin( ).toRT( ) );
-
-    /*go over all directions*/
-    for ( uint32_t i = 1; i < NumberOfExchanges<simDim>::value; ++i )
+    FieldJ::FieldJ(MappingDesc const& cellDescription)
+        : SimulationFieldHelper<MappingDesc>(cellDescription)
+        , buffer(cellDescription.getGridLayout())
+        , fieldJrecv(nullptr)
     {
-        DataSpace<simDim> relativMask = Mask::getRelativeDirections<simDim > ( i );
-        /*guarding cells depend on direction
-         */
-        DataSpace<simDim> guardingCells;
-        for ( uint32_t d = 0; d < simDim; ++d )
+        const DataSpace<simDim> coreBorderSize = cellDescription.getGridLayout().getDataSpaceWithoutGuarding();
+
+        /* cell margins the current might spread to due to particle shapes */
+        using AllSpeciesWithCurrent =
+            typename pmacc::particles::traits::FilterByFlag<VectorAllSpecies, current<>>::type;
+
+        using LowerMarginShapes = bmpl::accumulate<
+            AllSpeciesWithCurrent,
+            typename pmacc::math::CT::make_Int<simDim, 0>::type,
+            pmacc::math::CT::max<bmpl::_1, GetLowerMargin<GetCurrentSolver<bmpl::_2>>>>::type;
+
+        using UpperMarginShapes = bmpl::accumulate<
+            AllSpeciesWithCurrent,
+            typename pmacc::math::CT::make_Int<simDim, 0>::type,
+            pmacc::math::CT::max<bmpl::_1, GetUpperMargin<GetCurrentSolver<bmpl::_2>>>>::type;
+
+        /* margins are always positive, also for lower margins
+         * additional current interpolations and current filters on FieldJ might
+         * spread the dependencies on neighboring cells
+         *   -> use max(shape,filter) */
+        auto const& interpolation = currentInterpolation::CurrentInterpolationInfo::get();
+        auto const interpolationLowerMargin = interpolation.getLowerMargin();
+        auto const interpolationUpperMargin = interpolation.getUpperMargin();
+        auto const originGuard = pmacc::math::max(LowerMarginShapes::toRT(), interpolationLowerMargin);
+        auto const endGuard = pmacc::math::max(UpperMarginShapes::toRT(), interpolationUpperMargin);
+
+        /*go over all directions*/
+        for(uint32_t i = 1; i < NumberOfExchanges<simDim>::value; ++i)
         {
-            /*originGuard and endGuard are switch because we send data
-             * e.g. from left I get endGuardingCells and from right I originGuardingCells
+            DataSpace<simDim> relativMask = Mask::getRelativeDirections<simDim>(i);
+            /*guarding cells depend on direction
              */
-            switch ( relativMask[d] )
+            DataSpace<simDim> guardingCells;
+            for(uint32_t d = 0; d < simDim; ++d)
             {
-                // receive from negativ side to positiv (end) guarding cells
-            case -1: guardingCells[d] = endGuard[d];
-                break;
-                // receive from positiv side to negativ (origin) guarding cells
-            case 1: guardingCells[d] = originGuard[d];
-                break;
-            case 0: guardingCells[d] = coreBorderSize[d];
-                break;
-            };
+                /*originGuard and endGuard are switched because we send data
+                 * e.g. from left I get endGuardingCells and from right I get originGuardingCells
+                 */
+                switch(relativMask[d])
+                {
+                    // receive from negative side to positive (end) guarding cells
+                case -1:
+                    guardingCells[d] = endGuard[d];
+                    break;
+                    // receive from positive side to negative (origin) guarding cells
+                case 1:
+                    guardingCells[d] = originGuard[d];
+                    break;
+                case 0:
+                    guardingCells[d] = coreBorderSize[d];
+                    break;
+                };
+            }
+            buffer.addExchangeBuffer(i, guardingCells, FIELD_J);
+        }
+
+        /* Receive border values in own guard for "receive" communication pattern - necessary for current
+         * interpolation/filter */
+        const DataSpace<simDim> originRecvGuard = interpolationLowerMargin;
+        const DataSpace<simDim> endRecvGuard = interpolationUpperMargin;
+        if(originRecvGuard != DataSpace<simDim>::create(0) || endRecvGuard != DataSpace<simDim>::create(0))
+        {
+            fieldJrecv = std::make_unique<GridBuffer<ValueType, simDim>>(
+                buffer.getDeviceBuffer(),
+                cellDescription.getGridLayout());
 
+            /*go over all directions*/
+            for(uint32_t i = 1; i < NumberOfExchanges<simDim>::value; ++i)
+            {
+                DataSpace<simDim> relativMask = Mask::getRelativeDirections<simDim>(i);
+                /* guarding cells depend on direction
+                 * for negative direction use originGuard else endGuard (relative direction ZERO is ignored)
+                 * don't switch end and origin because this is a read buffer and no send buffer
+                 */
+                DataSpace<simDim> guardingCells;
+                for(uint32_t d = 0; d < simDim; ++d)
+                    guardingCells[d] = (relativMask[d] == -1 ? originRecvGuard[d] : endRecvGuard[d]);
+                fieldJrecv->addExchange(GUARD, i, guardingCells, FIELD_JRECV);
+            }
         }
-        // std::cout << "ex " << i << " x=" << guardingCells[0] << " y=" << guardingCells[1] << " z=" << guardingCells[2] << std::endl;
-        buffer.addExchangeBuffer( i, guardingCells, FIELD_J );
     }
 
-    /* Receive border values in own guard for "receive" communication pattern - necessary for current interpolation/filter */
-    const DataSpace<simDim> originRecvGuard( GetMargin<typename fields::Solver::CurrentInterpolation>::LowerMargin( ).toRT( ) );
-    const DataSpace<simDim> endRecvGuard( GetMargin<typename fields::Solver::CurrentInterpolation>::UpperMargin( ).toRT( ) );
-    if( originRecvGuard != DataSpace<simDim>::create(0) ||
-        endRecvGuard != DataSpace<simDim>::create(0) )
+    GridBuffer<FieldJ::ValueType, simDim>& FieldJ::getGridBuffer()
     {
-        fieldJrecv = pmacc::memory::makeUnique< GridBuffer<ValueType, simDim > >(
-            buffer.getDeviceBuffer(),
-            cellDescription.getGridLayout( )
-        );
+        return buffer;
+    }
 
-        /*go over all directions*/
-        for ( uint32_t i = 1; i < NumberOfExchanges<simDim>::value; ++i )
+    GridLayout<simDim> FieldJ::getGridLayout()
+    {
+        return cellDescription.getGridLayout();
+    }
+
+    EventTask FieldJ::asyncCommunication(EventTask serialEvent)
+    {
+        EventTask ret;
+        __startTransaction(serialEvent);
+        FieldFactory::getInstance().createTaskFieldReceiveAndInsert(*this);
+        ret = __endTransaction();
+
+        __startTransaction(serialEvent);
+        FieldFactory::getInstance().createTaskFieldSend(*this);
+        ret += __endTransaction();
+
+        if(fieldJrecv != nullptr)
         {
-            DataSpace<simDim> relativMask = Mask::getRelativeDirections<simDim > ( i );
-            /* guarding cells depend on direction
-             * for negative direction use originGuard else endGuard (relative direction ZERO is ignored)
-             * don't switch end and origin because this is a read buffer and no send buffer
-             */
-            DataSpace<simDim> guardingCells;
-            for ( uint32_t d = 0; d < simDim; ++d )
-                guardingCells[d] = ( relativMask[d] == -1 ? originRecvGuard[d] : endRecvGuard[d] );
-            fieldJrecv->addExchange( GUARD, i, guardingCells, FIELD_JRECV );
+            EventTask eJ = fieldJrecv->asyncCommunication(ret);
+            return eJ;
         }
+        else
+            return ret;
     }
-}
 
-GridBuffer<FieldJ::ValueType, simDim> &FieldJ::getGridBuffer( )
-{
-    return buffer;
-}
+    void FieldJ::reset(uint32_t)
+    {
+    }
 
-GridLayout<simDim> FieldJ::getGridLayout( )
-{
-    return cellDescription.getGridLayout( );
-}
+    void FieldJ::synchronize()
+    {
+        buffer.deviceToHost();
+    }
 
-EventTask FieldJ::asyncCommunication( EventTask serialEvent )
-{
-    EventTask ret;
-    __startTransaction( serialEvent );
-    FieldFactory::getInstance( ).createTaskFieldReceiveAndInsert( *this );
-    ret = __endTransaction( );
+    SimulationDataId FieldJ::getUniqueId()
+    {
+        return getName();
+    }
 
-    __startTransaction( serialEvent );
-    FieldFactory::getInstance( ).createTaskFieldSend( *this );
-    ret += __endTransaction( );
+    HDINLINE
+    FieldJ::UnitValueType FieldJ::getUnit()
+    {
+        const float_64 UNIT_CURRENT = UNIT_CHARGE / UNIT_TIME / (UNIT_LENGTH * UNIT_LENGTH);
+        return UnitValueType(UNIT_CURRENT, UNIT_CURRENT, UNIT_CURRENT);
+    }
 
-    if( fieldJrecv != nullptr )
+    HINLINE
+    std::vector<float_64> FieldJ::getUnitDimension()
     {
-        EventTask eJ = fieldJrecv->asyncCommunication( ret );
-        return eJ;
+        /* L, M, T, I, theta, N, J
+         *
+         * J is in A/m^2
+         *   -> L^-2 * I
+         */
+        std::vector<float_64> unitDimension(7, 0.0);
+        unitDimension.at(SIBaseUnits::length) = -2.0;
+        unitDimension.at(SIBaseUnits::electricCurrent) = 1.0;
+
+        return unitDimension;
     }
-    else
-        return ret;
-}
 
-void FieldJ::reset( uint32_t )
-{
-}
+    std::string FieldJ::getName()
+    {
+        return "J";
+    }
 
-void FieldJ::synchronize( )
-{
-    buffer.deviceToHost( );
-}
+    void FieldJ::assign(ValueType value)
+    {
+        buffer.getDeviceBuffer().setValue(value);
+        // fieldJ.reset(false);
+    }
 
-SimulationDataId FieldJ::getUniqueId( )
-{
-    return getName( );
-}
+    template<uint32_t T_area, class T_Species>
+    void FieldJ::computeCurrent(T_Species& species, uint32_t)
+    {
+        using FrameType = typename T_Species::FrameType;
+        typedef typename pmacc::traits::Resolve<typename GetFlagType<FrameType, current<>>::type>::type
+            ParticleCurrentSolver;
 
-HDINLINE
-FieldJ::UnitValueType
-FieldJ::getUnit( )
-{
-    const float_64 UNIT_CURRENT = UNIT_CHARGE / UNIT_TIME / ( UNIT_LENGTH * UNIT_LENGTH );
-    return UnitValueType( UNIT_CURRENT, UNIT_CURRENT, UNIT_CURRENT );
-}
+        using FrameSolver
+            = currentSolver::ComputePerFrame<ParticleCurrentSolver, Velocity, MappingDesc::SuperCellSize>;
 
-HINLINE
-std::vector<float_64>
-FieldJ::getUnitDimension( )
-{
-    /* L, M, T, I, theta, N, J
-    *
-    * J is in A/m^2
-    *   -> L^-2 * I
-    */
-    std::vector<float_64> unitDimension( 7, 0.0 );
-    unitDimension.at(SIBaseUnits::length) = -2.0;
-    unitDimension.at(SIBaseUnits::electricCurrent) =  1.0;
-
-    return unitDimension;
-}
-
-std::string
-FieldJ::getName( )
-{
-    return "J";
-}
+        typedef SuperCellDescription<
+            typename MappingDesc::SuperCellSize,
+            typename GetMargin<ParticleCurrentSolver>::LowerMargin,
+            typename GetMargin<ParticleCurrentSolver>::UpperMargin>
+            BlockArea;
 
-void FieldJ::assign( ValueType value )
-{
-    buffer.getDeviceBuffer( ).setValue( value );
-    //fieldJ.reset(false);
-}
+        using Strategy = currentSolver::traits::GetStrategy_t<FrameSolver>;
 
-template<uint32_t T_area, class T_Species>
-void FieldJ::computeCurrent( T_Species & species, uint32_t )
-{
-    /* tuning parameter to use more workers than cells in a supercell
-    * valid domain: 1 <= workerMultiplier
-    */
-    const int workerMultiplier = 2;
-
-    using FrameType = typename T_Species::FrameType;
-    typedef typename pmacc::traits::Resolve<
-        typename GetFlagType<FrameType, current<> >::type
-    >::type ParticleCurrentSolver;
-
-    typedef ComputeCurrentPerFrame<ParticleCurrentSolver, Velocity, MappingDesc::SuperCellSize> FrameSolver;
-
-    typedef SuperCellDescription<
-        typename MappingDesc::SuperCellSize,
-        typename GetMargin<ParticleCurrentSolver>::LowerMargin,
-        typename GetMargin<ParticleCurrentSolver>::UpperMargin
-    > BlockArea;
-
-    /* The needed stride for the stride mapper depends on the stencil width.
-    * If the upper and lower margin of the stencil fits into one supercell
-    * a double checker board (stride 2) is needed.
-    * The round up sum of margins is the number of supercells to skip.
-    */
-    using MarginPerDim = typename pmacc::math::CT::add<
-        typename GetMargin<ParticleCurrentSolver>::LowerMargin,
-        typename GetMargin<ParticleCurrentSolver>::UpperMargin
-    >::type;
-    using MaxMargin = typename pmacc::math::CT::max< MarginPerDim >::type;
-    using SuperCellMinSize = typename pmacc::math::CT::min< SuperCellSize >::type;
-
-    /* number of supercells which must be skipped to avoid overlapping areas
-    * between different blocks in the kernel
-    */
-    constexpr uint32_t skipSuperCells = ( MaxMargin::value + SuperCellMinSize::value - 1u ) / SuperCellMinSize::value;
-    StrideMapping<
-        T_area,
-        skipSuperCells + 1u, // stride 1u means each supercell is used
-        MappingDesc
-    > mapper( cellDescription );
-
-    typename T_Species::ParticlesBoxType pBox = species.getDeviceParticlesBox( );
-    FieldJ::DataBoxType jBox = buffer.getDeviceBuffer( ).getDataBox( );
-    FrameSolver solver( DELTA_T );
-
-    constexpr uint32_t numWorkers = pmacc::traits::GetNumWorkers<
-        pmacc::math::CT::volume< SuperCellSize >::type::value * workerMultiplier
-    >::value;
-
-    do
+        constexpr uint32_t numWorkers = pmacc::traits::GetNumWorkers<
+            pmacc::math::CT::volume<SuperCellSize>::type::value * Strategy::workerMultiplier>::value;
+
+        auto const depositionKernel = currentSolver::KernelComputeCurrent<numWorkers, BlockArea>{};
+
+        typename T_Species::ParticlesBoxType pBox = species.getDeviceParticlesBox();
+        FieldJ::DataBoxType jBox = buffer.getDeviceBuffer().getDataBox();
+        FrameSolver solver(DELTA_T);
+
+        auto const deposit = currentSolver::Deposit<Strategy>{};
+        deposit.template execute<T_area, numWorkers>(cellDescription, depositionKernel, solver, jBox, pBox);
+    }
+
+    template<uint32_t T_area, class T_CurrentInterpolationFunctor>
+    void FieldJ::addCurrentToEMF(T_CurrentInterpolationFunctor myCurrentInterpolationFunctor)
     {
-        PMACC_KERNEL( KernelComputeCurrent< numWorkers, BlockArea >{} )
-            ( mapper.getGridDim( ), numWorkers )
-            ( jBox,
-                pBox, solver, mapper );
+        DataConnector& dc = Environment<>::get().DataConnector();
+        auto fieldE = dc.get<FieldE>(FieldE::getName(), true);
+        auto fieldB = dc.get<FieldB>(FieldB::getName(), true);
+
+        AreaMapping<T_area, MappingDesc> mapper(cellDescription);
+
+        constexpr uint32_t numWorkers
+            = pmacc::traits::GetNumWorkers<pmacc::math::CT::volume<SuperCellSize>::type::value>::value;
+
+        PMACC_KERNEL(currentSolver::KernelAddCurrentToEMF<numWorkers>{})
+        (mapper.getGridDim(), numWorkers)(
+            fieldE->getDeviceDataBox(),
+            fieldB->getDeviceDataBox(),
+            buffer.getDeviceBuffer().getDataBox(),
+            myCurrentInterpolationFunctor,
+            mapper);
+        dc.releaseData(FieldE::getName());
+        dc.releaseData(FieldB::getName());
     }
-    while ( mapper.next( ) );
 
-}
+    void FieldJ::bashField(uint32_t exchangeType)
+    {
+        pmacc::fields::operations::CopyGuardToExchange{}(buffer, SuperCellSize{}, exchangeType);
+    }
 
-template<uint32_t T_area, class T_CurrentInterpolation>
-void FieldJ::addCurrentToEMF( T_CurrentInterpolation& myCurrentInterpolation )
-{
-    DataConnector &dc = Environment<>::get().DataConnector();
-    auto fieldE = dc.get< FieldE >( FieldE::getName(), true );
-    auto fieldB = dc.get< FieldB >( FieldB::getName(), true );
-
-    AreaMapping<
-        T_area,
-        MappingDesc
-    > mapper(cellDescription);
-
-    constexpr uint32_t numWorkers = pmacc::traits::GetNumWorkers<
-        pmacc::math::CT::volume< SuperCellSize >::type::value
-    >::value;
-
-    PMACC_KERNEL( KernelAddCurrentToEMF< numWorkers >{} )(
-        mapper.getGridDim(),
-        numWorkers
-        )(
-            fieldE->getDeviceDataBox( ),
-            fieldB->getDeviceDataBox( ),
-            buffer.getDeviceBuffer( ).getDataBox( ),
-            myCurrentInterpolation,
-            mapper
-            );
-    dc.releaseData( FieldE::getName() );
-    dc.releaseData( FieldB::getName() );
-}
-
-void FieldJ::bashField( uint32_t exchangeType )
-{
-    pmacc::fields::operations::CopyGuardToExchange{ }(
-        buffer,
-        SuperCellSize{ },
-        exchangeType
-    );
-}
-
-void FieldJ::insertField( uint32_t exchangeType )
-{
-    pmacc::fields::operations::AddExchangeToBorder{ }(
-        buffer,
-        SuperCellSize{ },
-        exchangeType
-    );
-}
+    void FieldJ::insertField(uint32_t exchangeType)
+    {
+        pmacc::fields::operations::AddExchangeToBorder{}(buffer, SuperCellSize{}, exchangeType);
+    }
 
 } // namespace picongpu
diff --git a/include/picongpu/fields/FieldTmp.hpp b/include/picongpu/fields/FieldTmp.hpp
index a3385c352e..812abaa53c 100644
--- a/include/picongpu/fields/FieldTmp.hpp
+++ b/include/picongpu/fields/FieldTmp.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Rene Widera, Richard Pausch,
+/* Copyright 2013-2021 Axel Huebl, Rene Widera, Richard Pausch,
  *                     Benjamin Worpitz
  *
  * This file is part of PIConGPU.
@@ -38,7 +38,6 @@
 
 namespace picongpu
 {
-
     /** Representation of the temporary scalar field for plugins and temporary
      *  particle data mapped to grid (charge density, energy density, etc.)
      *
@@ -48,12 +47,11 @@ namespace picongpu
      * Implements interfaces defined by SimulationFieldHelper< MappingDesc > and
      * ISimulationData.
      */
-    class FieldTmp :
-        public SimulationFieldHelper<MappingDesc>,
-        public ISimulationData
+    class FieldTmp
+        : public SimulationFieldHelper<MappingDesc>
+        , public ISimulationData
     {
     public:
-
         //! Type of each field value
         using ValueType = float1_X;
 
@@ -64,32 +62,29 @@ namespace picongpu
         using SuperCellSize = MappingDesc::SuperCellSize;
 
         //! Type of data box for field values on host and device
-        using DataBoxType = DataBox<PitchedBox<ValueType, simDim> >;
+        using DataBoxType = DataBox<PitchedBox<ValueType, simDim>>;
 
         /** Create a field
          *
          * @param cellDescription mapping for kernels
          * @param slotId index of the temporary field
          */
-        HINLINE FieldTmp(
-            MappingDesc const & cellDescription,
-            uint32_t slotId
-        );
+        HINLINE FieldTmp(MappingDesc const& cellDescription, uint32_t slotId);
 
         //! Destroy a field
-        virtual ~FieldTmp( ) = default;
+        virtual ~FieldTmp() = default;
 
         //! Get a reference to the host-device buffer for the field values
-        HINLINE GridBuffer<ValueType, simDim>& getGridBuffer( );
+        HINLINE GridBuffer<ValueType, simDim>& getGridBuffer();
 
         //! Get the grid layout
-        HINLINE GridLayout<simDim> getGridLayout( );
+        HINLINE GridLayout<simDim> getGridLayout();
 
         //! Get the host data box for the field values
-        HINLINE DataBoxType getHostDataBox( );
+        HINLINE DataBoxType getHostDataBox();
 
         //! Get the device data box for the field values
-        HINLINE DataBoxType getDeviceDataBox( );
+        HINLINE DataBoxType getDeviceDataBox();
 
         /** Start asynchronous send of field values
          *
@@ -99,31 +94,31 @@ namespace picongpu
          *
          * @param serialEvent event to depend on
          */
-        HINLINE virtual EventTask asyncCommunication( EventTask serialEvent );
+        HINLINE virtual EventTask asyncCommunication(EventTask serialEvent);
 
         /** Reset the host-device buffer for field values
          *
          * @param currentStep index of time iteration
          */
-        HINLINE void reset( uint32_t currentStep ) override;
+        HINLINE void reset(uint32_t currentStep) override;
 
         //! Synchronize device data with host data
-        HINLINE void syncToDevice( ) override;
+        HINLINE void syncToDevice() override;
 
         //! Synchronize host data with device data
-        HINLINE void synchronize( ) override;
+        HINLINE void synchronize() override;
 
         /** Get id
          *
          * @param slotId index of the temporary field
          */
-        HINLINE static SimulationDataId getUniqueId( uint32_t slotId );
+        HINLINE static SimulationDataId getUniqueId(uint32_t slotId);
 
         //! Get id
         HINLINE SimulationDataId getUniqueId() override;
 
         //! Get unit of field components
-        template< class FrameSolver >
+        template<class FrameSolver>
         HDINLINE static UnitValueType getUnit();
 
         /** Get unit representation as powers of the 7 base measures
@@ -133,7 +128,7 @@ namespace picongpu
          *  thermodynamic temperature theta, amount of substance N,
          *  luminous intensity J)
          */
-        template< class FrameSolver >
+        template<class FrameSolver>
         HINLINE static std::vector<float_64> getUnitDimension();
 
         //! Get mapping for kernels
@@ -151,7 +146,7 @@ namespace picongpu
          * This method can be called before or after asyncCommunication without
          * explicit handling to avoid race conditions between both methods.
          */
-        HINLINE EventTask asyncCommunicationGather( EventTask serialEvent );
+        HINLINE EventTask asyncCommunicationGather(EventTask serialEvent);
 
         /** Compute current density created by a species in an area
          *
@@ -169,21 +164,20 @@ namespace picongpu
          *
          * @param exchangeType exchange type
          */
-        HINLINE void bashField( uint32_t exchangeType );
+        HINLINE void bashField(uint32_t exchangeType);
 
         /** Insert all particles which are in device exchange buffer
          *
          * @param exchangeType exchange type
          */
-        HINLINE void insertField( uint32_t exchangeType );
+        HINLINE void insertField(uint32_t exchangeType);
 
     private:
-
         //! Host-device buffer for current density values
-        std::unique_ptr< GridBuffer<ValueType, simDim> > fieldTmp;
+        std::unique_ptr<GridBuffer<ValueType, simDim>> fieldTmp;
 
         //! Buffer for receiving near-boundary values
-        std::unique_ptr< GridBuffer<ValueType, simDim> > fieldTmpRecv;
+        std::unique_ptr<GridBuffer<ValueType, simDim>> fieldTmpRecv;
 
         //! Index of the temporary field
         uint32_t m_slotId;
@@ -195,7 +189,6 @@ namespace picongpu
         //! Tags for communication
         uint32_t m_commTagScatter;
         uint32_t m_commTagGather;
-
     };
 
 } // namespace picongpu
diff --git a/include/picongpu/fields/FieldTmp.kernel b/include/picongpu/fields/FieldTmp.kernel
index 5c6b887996..e22184b319 100644
--- a/include/picongpu/fields/FieldTmp.kernel
+++ b/include/picongpu/fields/FieldTmp.kernel
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Rene Widera, Marco Garten
+/* Copyright 2013-2021 Axel Huebl, Rene Widera, Marco Garten
  *
  * This file is part of PIConGPU.
  *
@@ -48,10 +48,7 @@ namespace picongpu
      * @tparam T_numWorkers number of workers
      * @tparam T_BlockDescription stance area description of the user functor
      */
-    template<
-        uint32_t T_numWorkers,
-        typename T_BlockDescription
-    >
+    template<uint32_t T_numWorkers, typename T_BlockDescription>
     struct KernelComputeSupercells
     {
         /** derive species property
@@ -66,106 +63,63 @@ namespace picongpu
          * @param frameSolver functor to calculate the current for a frame
          * @param mapper functor to map a block to a supercell
          */
-        template<
-            typename T_TmpBox,
-            typename T_ParBox,
-            typename T_FrameSolver,
-            typename T_Mapping,
-            typename T_Acc
-        >
+        template<typename T_TmpBox, typename T_ParBox, typename T_FrameSolver, typename T_Mapping, typename T_Acc>
         DINLINE void operator()(
-            T_Acc const & acc,
+            T_Acc const& acc,
             T_TmpBox fieldTmp,
             T_ParBox boxPar,
             T_FrameSolver frameSolver,
-            T_Mapping mapper
-        ) const
+            T_Mapping mapper) const
         {
             using namespace mappings::threads;
 
             using FramePtr = typename T_ParBox::FramePtr;
             using SuperCellSize = typename T_BlockDescription::SuperCellSize;
 
-            constexpr uint32_t cellsPerSuperCell = pmacc::math::CT::volume< SuperCellSize >::type::value;
+            constexpr uint32_t cellsPerSuperCell = pmacc::math::CT::volume<SuperCellSize>::type::value;
             constexpr uint32_t numWorkers = T_numWorkers;
 
-            uint32_t const workerIdx = threadIdx.x;
+            uint32_t const workerIdx = cupla::threadIdx(acc).x;
 
-            DataSpace< simDim > const block( mapper.getSuperCellIndex( DataSpace< simDim > ( blockIdx ) ) );
+            DataSpace<simDim> const block(mapper.getSuperCellIndex(DataSpace<simDim>(cupla::blockIdx(acc))));
 
             FramePtr frame;
             lcellId_t particlesInSuperCell;
 
-            frame = boxPar.getLastFrame( block );
-            particlesInSuperCell = boxPar.getSuperCell( block ).getSizeLastFrame( );
-
-            if( !frame.isValid() )
-                return; //end kernel if we have no frames
-
-            auto cachedVal = CachedBox::create <
-                0,
-                typename T_TmpBox::ValueType
-            > (
-                acc,
-                T_BlockDescription{ }
-            );
-            Set< typename T_TmpBox::ValueType > set( float_X( 0.0 ) );
-
-            ThreadCollective<
-                T_BlockDescription,
-                numWorkers
-            > collective( workerIdx );
-            collective(
-                acc,
-                set,
-                cachedVal
-            );
-
-            __syncthreads( );
-
-            while( frame.isValid() )
+            frame = boxPar.getLastFrame(block);
+            particlesInSuperCell = boxPar.getSuperCell(block).getSizeLastFrame();
+
+            if(!frame.isValid())
+                return; // end kernel if we have no frames
+
+            auto cachedVal = CachedBox::create<0, typename T_TmpBox::ValueType>(acc, T_BlockDescription{});
+            Set<typename T_TmpBox::ValueType> set(float_X(0.0));
+
+            ThreadCollective<T_BlockDescription, numWorkers> collective(workerIdx);
+            collective(acc, set, cachedVal);
+
+            cupla::__syncthreads(acc);
+
+            while(frame.isValid())
             {
-                ForEachIdx<
-                    IdxConfig<
-                        cellsPerSuperCell,
-                        numWorkers
-                    >
-                >{ workerIdx }(
-                    [&](
-                        uint32_t const linearIdx,
-                        uint32_t const
-                    )
-                    {
-                        if( linearIdx < particlesInSuperCell )
+                ForEachIdx<IdxConfig<cellsPerSuperCell, numWorkers>>{workerIdx}(
+                    [&](uint32_t const linearIdx, uint32_t const) {
+                        if(linearIdx < particlesInSuperCell)
                         {
-                            frameSolver(
-                                acc,
-                                *frame,
-                                linearIdx,
-                                SuperCellSize::toRT(),
-                                cachedVal
-                            );
+                            frameSolver(acc, *frame, linearIdx, SuperCellSize::toRT(), cachedVal);
                         }
-                    }
-                );
+                    });
 
-                frame = boxPar.getPreviousFrame( frame );
+                frame = boxPar.getPreviousFrame(frame);
                 particlesInSuperCell = cellsPerSuperCell;
-
-
             }
 
-            __syncthreads( );
+            cupla::__syncthreads(acc);
 
             nvidia::functors::Add add;
-            DataSpace< simDim > const blockCell = block * SuperCellSize::toRT( );
-            auto fieldTmpBlock = fieldTmp.shift( blockCell );
-            collective(
-                acc,
-                add,
-                fieldTmpBlock,
-                cachedVal
-            );
+            DataSpace<simDim> const blockCell = block * SuperCellSize::toRT();
+            auto fieldTmpBlock = fieldTmp.shift(blockCell);
+            collective(acc, add, fieldTmpBlock, cachedVal);
         }
     };
 
diff --git a/include/picongpu/fields/FieldTmp.tpp b/include/picongpu/fields/FieldTmp.tpp
index a6f3d718d6..e274f7281b 100644
--- a/include/picongpu/fields/FieldTmp.tpp
+++ b/include/picongpu/fields/FieldTmp.tpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Heiko Burau, Rene Widera, Felix Schmitt,
+/* Copyright 2013-2021 Axel Huebl, Heiko Burau, Rene Widera, Felix Schmitt,
  *                     Richard Pausch, Benjamin Worpitz
  *
  * This file is part of PIConGPU.
@@ -27,7 +27,6 @@
 #include "picongpu/particles/traits/GetInterpolation.hpp"
 
 #include <pmacc/memory/buffers/GridBuffer.hpp>
-#include <pmacc/memory/MakeUnique.hpp>
 #include <pmacc/mappings/simulation/GridController.hpp>
 #include <pmacc/dataManagement/DataConnector.hpp>
 #include <pmacc/mappings/kernel/AreaMapping.hpp>
@@ -50,29 +49,23 @@ namespace picongpu
 {
     using namespace pmacc;
 
-    FieldTmp::FieldTmp(
-        MappingDesc const & cellDescription,
-        uint32_t slotId
-    ) :
-        SimulationFieldHelper<MappingDesc>( cellDescription ),
-        m_slotId( slotId )
+    FieldTmp::FieldTmp(MappingDesc const& cellDescription, uint32_t slotId)
+        : SimulationFieldHelper<MappingDesc>(cellDescription)
+        , m_slotId(slotId)
     {
         /* Since this class is instantiated for each temporary field slot,
          * use getNextId( ) directly to get unique tags for each instance.
          * Add SPECIES_FIRSTTAG to avoid collisions with the tags for
          * other fields.
          */
-        m_commTagScatter = pmacc::traits::getNextId( ) + SPECIES_FIRSTTAG;
-        m_commTagGather = pmacc::traits::getNextId( ) + SPECIES_FIRSTTAG;
+        m_commTagScatter = pmacc::traits::getNextId() + SPECIES_FIRSTTAG;
+        m_commTagGather = pmacc::traits::getNextId() + SPECIES_FIRSTTAG;
 
-        using Buffer = GridBuffer< ValueType, simDim >;
-        fieldTmp = memory::makeUnique< Buffer >( cellDescription.getGridLayout( ) );
+        using Buffer = GridBuffer<ValueType, simDim>;
+        fieldTmp = std::make_unique<Buffer>(cellDescription.getGridLayout());
 
-        if( fieldTmpSupportGatherCommunication )
-            fieldTmpRecv = memory::makeUnique< Buffer >(
-                fieldTmp->getDeviceBuffer(),
-                cellDescription.getGridLayout( )
-            );
+        if(fieldTmpSupportGatherCommunication)
+            fieldTmpRecv = std::make_unique<Buffer>(fieldTmp->getDeviceBuffer(), cellDescription.getGridLayout());
 
         /** \todo The exchange has to be resetted and set again regarding the
          *  temporary "Fill-"Functor we want to use.
@@ -80,39 +73,29 @@ namespace picongpu
          *  Problem: buffers don't allow "bigger" exchange during run time.
          *           so let's stay with the maximum guards.
          */
-        const DataSpace<simDim> coreBorderSize = cellDescription.getGridLayout( ).getDataSpaceWithoutGuarding( );
+        const DataSpace<simDim> coreBorderSize = cellDescription.getGridLayout().getDataSpaceWithoutGuarding();
 
-        typedef typename pmacc::particles::traits::FilterByFlag
-        <
-            VectorAllSpecies,
-            interpolation<>
-        >::type VectorSpeciesWithInterpolation;
+        typedef typename pmacc::particles::traits::FilterByFlag<VectorAllSpecies, interpolation<>>::type
+            VectorSpeciesWithInterpolation;
 
         /* ------------------ lower margin  ----------------------------------*/
         typedef bmpl::accumulate<
             VectorSpeciesWithInterpolation,
             typename pmacc::math::CT::make_Int<simDim, 0>::type,
-            pmacc::math::CT::max<bmpl::_1, GetLowerMargin< GetInterpolation<bmpl::_2> > >
-        >::type SpeciesLowerMargin;
+            pmacc::math::CT::max<bmpl::_1, GetLowerMargin<GetInterpolation<bmpl::_2>>>>::type SpeciesLowerMargin;
 
         typedef bmpl::accumulate<
             FieldTmpSolvers,
             typename pmacc::math::CT::make_Int<simDim, 0>::type,
-            pmacc::math::CT::max<bmpl::_1, GetLowerMargin< bmpl::_2 > >
-        >::type FieldTmpLowerMargin;
+            pmacc::math::CT::max<bmpl::_1, GetLowerMargin<bmpl::_2>>>::type FieldTmpLowerMargin;
 
-        typedef pmacc::math::CT::max<
-            SpeciesLowerMargin,
-            FieldTmpLowerMargin>::type SpeciesFieldTmpLowerMargin;
+        typedef pmacc::math::CT::max<SpeciesLowerMargin, FieldTmpLowerMargin>::type SpeciesFieldTmpLowerMargin;
 
         typedef pmacc::math::CT::max<
             GetMargin<fields::Solver, FIELD_B>::LowerMargin,
-            GetMargin<fields::Solver, FIELD_E>::LowerMargin>::type
-            FieldSolverLowerMargin;
+            GetMargin<fields::Solver, FIELD_E>::LowerMargin>::type FieldSolverLowerMargin;
 
-        typedef pmacc::math::CT::max<
-            SpeciesFieldTmpLowerMargin,
-            FieldSolverLowerMargin>::type LowerMargin;
+        typedef pmacc::math::CT::max<SpeciesFieldTmpLowerMargin, FieldSolverLowerMargin>::type LowerMargin;
 
 
         /* ------------------ upper margin  -----------------------------------*/
@@ -120,216 +103,185 @@ namespace picongpu
         typedef bmpl::accumulate<
             VectorSpeciesWithInterpolation,
             typename pmacc::math::CT::make_Int<simDim, 0>::type,
-            pmacc::math::CT::max<bmpl::_1, GetUpperMargin< GetInterpolation<bmpl::_2> > >
-        >::type SpeciesUpperMargin;
+            pmacc::math::CT::max<bmpl::_1, GetUpperMargin<GetInterpolation<bmpl::_2>>>>::type SpeciesUpperMargin;
 
         typedef bmpl::accumulate<
             FieldTmpSolvers,
             typename pmacc::math::CT::make_Int<simDim, 0>::type,
-            pmacc::math::CT::max<bmpl::_1, GetUpperMargin< bmpl::_2 > >
-        >::type FieldTmpUpperMargin;
+            pmacc::math::CT::max<bmpl::_1, GetUpperMargin<bmpl::_2>>>::type FieldTmpUpperMargin;
 
-        typedef pmacc::math::CT::max<
-            SpeciesUpperMargin,
-            FieldTmpUpperMargin>::type SpeciesFieldTmpUpperMargin;
+        typedef pmacc::math::CT::max<SpeciesUpperMargin, FieldTmpUpperMargin>::type SpeciesFieldTmpUpperMargin;
 
         typedef pmacc::math::CT::max<
             GetMargin<fields::Solver, FIELD_B>::UpperMargin,
-            GetMargin<fields::Solver, FIELD_E>::UpperMargin>::type
-            FieldSolverUpperMargin;
+            GetMargin<fields::Solver, FIELD_E>::UpperMargin>::type FieldSolverUpperMargin;
 
-        typedef pmacc::math::CT::max<
-            SpeciesFieldTmpUpperMargin,
-            FieldSolverUpperMargin>::type UpperMargin;
+        typedef pmacc::math::CT::max<SpeciesFieldTmpUpperMargin, FieldSolverUpperMargin>::type UpperMargin;
 
-        const DataSpace<simDim> originGuard( LowerMargin( ).toRT( ) );
-        const DataSpace<simDim> endGuard( UpperMargin( ).toRT( ) );
+        const DataSpace<simDim> originGuard(LowerMargin().toRT());
+        const DataSpace<simDim> endGuard(UpperMargin().toRT());
 
         /*go over all directions*/
-        for( uint32_t i = 1; i < NumberOfExchanges<simDim>::value; ++i )
+        for(uint32_t i = 1; i < NumberOfExchanges<simDim>::value; ++i)
         {
-            DataSpace<simDim> relativMask = Mask::getRelativeDirections<simDim > ( i );
+            DataSpace<simDim> relativMask = Mask::getRelativeDirections<simDim>(i);
             /*guarding cells depend on direction
              */
             DataSpace<simDim> guardingCells;
-            for( uint32_t d = 0; d < simDim; ++d )
+            for(uint32_t d = 0; d < simDim; ++d)
             {
                 /*originGuard and endGuard are switch because we send data
                  * e.g. from left I get endGuardingCells and from right I originGuardingCells
                  */
-                switch( relativMask[d] )
+                switch(relativMask[d])
                 {
-                        // receive from negativ side to positiv (end) guarding cells
-                    case -1: guardingCells[d] = endGuard[d];
-                        break;
-                        // receive from positiv side to negativ (origin) guarding cells
-                    case 1: guardingCells[d] = originGuard[d];
-                        break;
-                    case 0: guardingCells[d] = coreBorderSize[d];
-                        break;
+                    // receive from negative side to positive (end) guarding cells
+                case -1:
+                    guardingCells[d] = endGuard[d];
+                    break;
+                    // receive from positive side to negative (origin) guarding cells
+                case 1:
+                    guardingCells[d] = originGuard[d];
+                    break;
+                case 0:
+                    guardingCells[d] = coreBorderSize[d];
+                    break;
                 };
-
             }
 
-            fieldTmp->addExchangeBuffer( i, guardingCells, m_commTagScatter );
+            fieldTmp->addExchangeBuffer(i, guardingCells, m_commTagScatter);
 
-            if( fieldTmpRecv )
+            if(fieldTmpRecv)
             {
                 /* guarding cells depend on direction
                  * for negative direction use originGuard else endGuard (relative direction ZERO is ignored)
                  * don't switch end and origin because this is a read buffer and not send buffer
                  */
-                for ( uint32_t d = 0; d < simDim; ++d )
-                    guardingCells[d] = ( relativMask[d] == -1 ? originGuard[d] : endGuard[d] );
-                fieldTmpRecv->addExchange( GUARD, i, guardingCells, m_commTagGather );
+                for(uint32_t d = 0; d < simDim; ++d)
+                    guardingCells[d] = (relativMask[d] == -1 ? originGuard[d] : endGuard[d]);
+                fieldTmpRecv->addExchange(GUARD, i, guardingCells, m_commTagGather);
             }
         }
-
     }
 
     template<uint32_t AREA, class FrameSolver, class ParticlesClass>
-    void FieldTmp::computeValue( ParticlesClass& parClass, uint32_t )
+    void FieldTmp::computeValue(ParticlesClass& parClass, uint32_t)
     {
         typedef SuperCellDescription<
             typename MappingDesc::SuperCellSize,
             typename FrameSolver::LowerMargin,
-            typename FrameSolver::UpperMargin
-            > BlockArea;
+            typename FrameSolver::UpperMargin>
+            BlockArea;
 
-        StrideMapping<AREA, 3, MappingDesc> mapper( cellDescription );
-        typename ParticlesClass::ParticlesBoxType pBox = parClass.getDeviceParticlesBox( );
-        FieldTmp::DataBoxType tmpBox = this->fieldTmp->getDeviceBuffer( ).getDataBox( );
+        StrideMapping<AREA, 3, MappingDesc> mapper(cellDescription);
+        typename ParticlesClass::ParticlesBoxType pBox = parClass.getDeviceParticlesBox();
+        FieldTmp::DataBoxType tmpBox = this->fieldTmp->getDeviceBuffer().getDataBox();
         FrameSolver solver;
-        constexpr uint32_t numWorkers = pmacc::traits::GetNumWorkers<
-            pmacc::math::CT::volume< SuperCellSize >::type::value
-        >::value;
+        constexpr uint32_t numWorkers
+            = pmacc::traits::GetNumWorkers<pmacc::math::CT::volume<SuperCellSize>::type::value>::value;
 
         do
         {
-            PMACC_KERNEL( KernelComputeSupercells<
-                numWorkers,
-                BlockArea
-            >{ } )(
-                mapper.getGridDim( ),
-                numWorkers
-            )(
-                tmpBox,
-                pBox,
-                solver,
-                mapper
-            );
-        } while( mapper.next( ) );
+            PMACC_KERNEL(KernelComputeSupercells<numWorkers, BlockArea>{})
+            (mapper.getGridDim(), numWorkers)(tmpBox, pBox, solver, mapper);
+        } while(mapper.next());
     }
 
 
-    SimulationDataId
-    FieldTmp::getUniqueId( uint32_t slotId )
+    SimulationDataId FieldTmp::getUniqueId(uint32_t slotId)
     {
-        return getName() + std::to_string( slotId );
+        return getName() + std::to_string(slotId);
     }
 
-    SimulationDataId
-    FieldTmp::getUniqueId()
+    SimulationDataId FieldTmp::getUniqueId()
     {
-        return getUniqueId( m_slotId );
+        return getUniqueId(m_slotId);
     }
 
-    void FieldTmp::synchronize( )
+    void FieldTmp::synchronize()
     {
-        fieldTmp->deviceToHost( );
+        fieldTmp->deviceToHost();
     }
 
-    void FieldTmp::syncToDevice( )
+    void FieldTmp::syncToDevice()
     {
-        fieldTmp->hostToDevice( );
+        fieldTmp->hostToDevice();
     }
 
-    EventTask FieldTmp::asyncCommunication( EventTask serialEvent )
+    EventTask FieldTmp::asyncCommunication(EventTask serialEvent)
     {
         EventTask ret;
-        __startTransaction( serialEvent + m_gatherEv + m_scatterEv );
-        FieldFactory::getInstance( ).createTaskFieldReceiveAndInsert( *this );
-        ret = __endTransaction( );
+        __startTransaction(serialEvent + m_gatherEv + m_scatterEv);
+        FieldFactory::getInstance().createTaskFieldReceiveAndInsert(*this);
+        ret = __endTransaction();
 
-        __startTransaction( serialEvent + m_gatherEv + m_scatterEv);
-        FieldFactory::getInstance( ).createTaskFieldSend( *this );
-        ret += __endTransaction( );
+        __startTransaction(serialEvent + m_gatherEv + m_scatterEv);
+        FieldFactory::getInstance().createTaskFieldSend(*this);
+        ret += __endTransaction();
         m_scatterEv = ret;
         return ret;
     }
 
-    EventTask FieldTmp::asyncCommunicationGather( EventTask serialEvent )
+    EventTask FieldTmp::asyncCommunicationGather(EventTask serialEvent)
     {
         PMACC_VERIFY_MSG(
             fieldTmpSupportGatherCommunication == true,
-            "fieldTmpSupportGatherCommunication in memory.param must be set to true"
-        );
+            "fieldTmpSupportGatherCommunication in memory.param must be set to true");
 
-        if( fieldTmpRecv != nullptr )
-            m_gatherEv = fieldTmpRecv->asyncCommunication( serialEvent + m_scatterEv + m_gatherEv );
+        if(fieldTmpRecv != nullptr)
+            m_gatherEv = fieldTmpRecv->asyncCommunication(serialEvent + m_scatterEv + m_gatherEv);
         return m_gatherEv;
     }
 
-    void FieldTmp::bashField( uint32_t exchangeType )
+    void FieldTmp::bashField(uint32_t exchangeType)
     {
-        pmacc::fields::operations::CopyGuardToExchange{ }(
-            *fieldTmp,
-            SuperCellSize{ },
-            exchangeType
-        );
+        pmacc::fields::operations::CopyGuardToExchange{}(*fieldTmp, SuperCellSize{}, exchangeType);
     }
 
-    void FieldTmp::insertField( uint32_t exchangeType )
+    void FieldTmp::insertField(uint32_t exchangeType)
     {
-        pmacc::fields::operations::AddExchangeToBorder{ }(
-            *fieldTmp,
-            SuperCellSize{ },
-            exchangeType
-        );
+        pmacc::fields::operations::AddExchangeToBorder{}(*fieldTmp, SuperCellSize{}, exchangeType);
     }
 
-    FieldTmp::DataBoxType FieldTmp::getDeviceDataBox( )
+    FieldTmp::DataBoxType FieldTmp::getDeviceDataBox()
     {
-        return fieldTmp->getDeviceBuffer( ).getDataBox( );
+        return fieldTmp->getDeviceBuffer().getDataBox();
     }
 
-    FieldTmp::DataBoxType FieldTmp::getHostDataBox( )
+    FieldTmp::DataBoxType FieldTmp::getHostDataBox()
     {
-        return fieldTmp->getHostBuffer( ).getDataBox( );
+        return fieldTmp->getHostBuffer().getDataBox();
     }
 
-    GridBuffer<typename FieldTmp::ValueType, simDim> &FieldTmp::getGridBuffer( )
+    GridBuffer<typename FieldTmp::ValueType, simDim>& FieldTmp::getGridBuffer()
     {
         return *fieldTmp;
     }
 
-    GridLayout< simDim> FieldTmp::getGridLayout( )
+    GridLayout<simDim> FieldTmp::getGridLayout()
     {
-        return cellDescription.getGridLayout( );
+        return cellDescription.getGridLayout();
     }
 
-    void FieldTmp::reset( uint32_t )
+    void FieldTmp::reset(uint32_t)
     {
-        fieldTmp->getHostBuffer( ).reset( true );
-        fieldTmp->getDeviceBuffer( ).reset( false );
+        fieldTmp->getHostBuffer().reset(true);
+        fieldTmp->getDeviceBuffer().reset(false);
     }
 
-    template<class FrameSolver >
-    HDINLINE FieldTmp::UnitValueType
-    FieldTmp::getUnit( )
+    template<class FrameSolver>
+    HDINLINE FieldTmp::UnitValueType FieldTmp::getUnit()
     {
         return FrameSolver().getUnit();
     }
 
-    template<class FrameSolver >
-    HINLINE std::vector<float_64>
-    FieldTmp::getUnitDimension( )
+    template<class FrameSolver>
+    HINLINE std::vector<float_64> FieldTmp::getUnitDimension()
     {
         return FrameSolver().getUnitDimension();
     }
 
-    std::string
-    FieldTmp::getName( )
+    std::string FieldTmp::getName()
     {
         return "FieldTmp";
     }
diff --git a/include/picongpu/fields/Fields.def b/include/picongpu/fields/Fields.def
index 8a6be8cd29..0996a0ca55 100644
--- a/include/picongpu/fields/Fields.def
+++ b/include/picongpu/fields/Fields.def
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Rene Widera, Axel Huebl
+/* Copyright 2013-2021 Rene Widera, Axel Huebl
  *
  * This file is part of PIConGPU.
  *
@@ -21,7 +21,6 @@
 
 namespace picongpu
 {
-
     /** Define which operation is used to fill up FieldTmp
      *
      * This is better than use of boost::mtl::pair because
diff --git a/include/picongpu/fields/Fields.hpp b/include/picongpu/fields/Fields.hpp
index 38b0d4a6b0..961684c8bc 100644
--- a/include/picongpu/fields/Fields.hpp
+++ b/include/picongpu/fields/Fields.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2014-2020 Rene Widera
+/* Copyright 2014-2021 Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -18,7 +18,6 @@
  */
 
 
-
 #pragma once
 
 #include "picongpu/fields/FieldB.hpp"
diff --git a/include/picongpu/fields/Fields.tpp b/include/picongpu/fields/Fields.tpp
index 7c7953de39..3f2040d993 100644
--- a/include/picongpu/fields/Fields.tpp
+++ b/include/picongpu/fields/Fields.tpp
@@ -1,4 +1,4 @@
-/* Copyright 2014-2020 Rene Widera
+/* Copyright 2014-2021 Rene Widera
  *
  * This file is part of PIConGPU.
  *
diff --git a/include/picongpu/fields/LaserPhysics.def b/include/picongpu/fields/LaserPhysics.def
index 45f4b6d0fa..29ae1ad365 100644
--- a/include/picongpu/fields/LaserPhysics.def
+++ b/include/picongpu/fields/LaserPhysics.def
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Rene Widera
+/* Copyright 2013-2021 Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -22,4 +22,4 @@
 namespace picongpu
 {
     struct LaserPhysics;
-}//namespace picongpu
+} // namespace picongpu
diff --git a/include/picongpu/fields/LaserPhysics.hpp b/include/picongpu/fields/LaserPhysics.hpp
index 62a68d8fc3..6ffc5570d9 100644
--- a/include/picongpu/fields/LaserPhysics.hpp
+++ b/include/picongpu/fields/LaserPhysics.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Heiko Burau, Rene Widera, Richard Pausch
+/* Copyright 2013-2021 Axel Huebl, Heiko Burau, Rene Widera, Richard Pausch
  *
  * This file is part of PIConGPU.
  *
@@ -35,185 +35,150 @@
 
 namespace picongpu
 {
-namespace fields
-{
-    /** compute the electric field of the laser
-     *
-     * @tparam T_numWorkers number of workers
-     * @tparam T_LaserPlaneSizeInSuperCell number of cells per dimension which
-     *  initialize the laser (size must be less or equal than the supercell size)
-     */
-    template<
-        uint32_t T_numWorkers,
-        typename T_LaserPlaneSizeInSuperCell
-    >
-    struct KernelLaser
+    namespace fields
     {
-        template<
-            typename T_Acc,
-            typename T_LaserFunctor
-        >
-        DINLINE void operator()(
-            T_Acc const & acc,
-            T_LaserFunctor laserFunctor
-        ) const
+        /** compute the electric field of the laser
+         *
+         * @tparam T_numWorkers number of workers
+         * @tparam T_LaserPlaneSizeInSuperCell number of cells per dimension which
+         *  initialize the laser (size must be less than or equal to the supercell size)
+         */
+        template<uint32_t T_numWorkers, typename T_LaserPlaneSizeInSuperCell>
+        struct KernelLaser
         {
-            using LaserPlaneSizeInSuperCell = T_LaserPlaneSizeInSuperCell;
-            using LaserFunctor = T_LaserFunctor;
+            template<typename T_Acc, typename T_LaserFunctor>
+            DINLINE void operator()(T_Acc const& acc, T_LaserFunctor laserFunctor) const
+            {
+                using LaserPlaneSizeInSuperCell = T_LaserPlaneSizeInSuperCell;
+                using LaserFunctor = T_LaserFunctor;
 
-            PMACC_CASSERT_MSG(
-                __LaserPlaneSizeInSuperCell_y_must_be_less_or_equal_than_SuperCellSize_y,
-                LaserPlaneSizeInSuperCell::y::value <= SuperCellSize::y::value
-            );
+                PMACC_CASSERT_MSG(
+                    __LaserPlaneSizeInSuperCell_y_must_be_less_or_equal_than_SuperCellSize_y,
+                    LaserPlaneSizeInSuperCell::y::value <= SuperCellSize::y::value);
 
-            constexpr uint32_t planeSize = pmacc::math::CT::volume< LaserPlaneSizeInSuperCell >::type::value;
-            PMACC_CONSTEXPR_CAPTURE uint32_t numWorkers = T_numWorkers;
+                constexpr uint32_t planeSize = pmacc::math::CT::volume<LaserPlaneSizeInSuperCell>::type::value;
+                PMACC_CONSTEXPR_CAPTURE uint32_t numWorkers = T_numWorkers;
 
-            const uint32_t workerIdx = threadIdx.x;
+                const uint32_t workerIdx = cupla::threadIdx(acc).x;
 
-            // offset of the superCell (in cells, without any guards) to the origin of the local domain
+                // offset of the superCell (in cells, without any guards) to the origin of the local domain
 
-            DataSpace< simDim > localSuperCellOffset = DataSpace< simDim >( blockIdx );
+                DataSpace<simDim> localSuperCellOffset = DataSpace<simDim>(cupla::blockIdx(acc));
 
-            // add not handled supercells from LaserFunctor::Unitless::initPlaneY
-            localSuperCellOffset.y() += LaserFunctor::Unitless::initPlaneY / SuperCellSize::y::value;
+                // add not handled supercells from LaserFunctor::Unitless::initPlaneY
+                localSuperCellOffset.y() += LaserFunctor::Unitless::initPlaneY / SuperCellSize::y::value;
 
-            uint32_t cellOffsetInSuperCellFromInitPlaneY = LaserFunctor::Unitless::initPlaneY % SuperCellSize::y::value;
+                uint32_t cellOffsetInSuperCellFromInitPlaneY
+                    = LaserFunctor::Unitless::initPlaneY % SuperCellSize::y::value;
 
-            mappings::threads::ForEachIdx<
-                mappings::threads::IdxConfig<
-                    planeSize,
-                    numWorkers
-                >
-            > { workerIdx }(
-                [&](
-                    uint32_t const linearIdx,
-                    uint32_t const
-                )
-                {
-                   auto accLaserFunctor = laserFunctor(
-                       acc,
-                       localSuperCellOffset,
-                       mappings::threads::WorkerCfg< numWorkers >{ workerIdx }
-                    );
+                mappings::threads::ForEachIdx<mappings::threads::IdxConfig<planeSize, numWorkers>>{
+                    workerIdx}([&](uint32_t const linearIdx, uint32_t const) {
+                    auto accLaserFunctor
+                        = laserFunctor(acc, localSuperCellOffset, mappings::threads::WorkerCfg<numWorkers>{workerIdx});
 
                     /* cell index within the superCell */
-                    DataSpace< simDim > cellIdxInSuperCell = DataSpaceOperations< simDim >::template map< LaserPlaneSizeInSuperCell >( linearIdx );
+                    DataSpace<simDim> cellIdxInSuperCell
+                        = DataSpaceOperations<simDim>::template map<LaserPlaneSizeInSuperCell>(linearIdx);
                     cellIdxInSuperCell.y() += cellOffsetInSuperCellFromInitPlaneY;
 
-                    accLaserFunctor( acc, cellIdxInSuperCell );
-                }
-            );
-        }
-    };
+                    accLaserFunctor(acc, cellIdxInSuperCell);
+                });
+            }
+        };
 
-    /** Laser init in a single xz plane */
-    struct LaserPhysics
-    {
-        void operator()(uint32_t currentStep) const
+        /** Laser init in a single xz plane */
+        struct LaserPhysics
         {
-            /* The laser can be initialized in the plane of the first cell or
-             * any later x-z plane inside the simulation. Initializing the
-             * laser in planes inside the simulation corresponds to an
-             * evaluation of the field at negatively shifted time.
-             */
-            constexpr float_X laserTimeShift = laserProfiles::Selected::Unitless::initPlaneY * CELL_HEIGHT / SPEED_OF_LIGHT;
-
-            const uint32_t numSlides = MovingWindow::getInstance().getSlideCounter(currentStep);
-
-            /* Disable laser if
-             * - init time of laser is over or
-             * - we have periodic boundaries in Y direction or
-             * - we already performed a slide
-             */
-            bool const laserNone = ( laserProfiles::Selected::Unitless::INIT_TIME == float_X(0.0) );
-            bool const laserInitTimeOver =
-                ( ( currentStep * DELTA_T  - laserTimeShift ) >= laserProfiles::Selected::Unitless::INIT_TIME );
-            bool const topBoundariesArePeriodic =
-                ( Environment<simDim>::get().GridController().getCommunicationMask( ).isSet( TOP ) );
-            bool const boxHasSlided = ( numSlides != 0 );
-
-            bool const disableLaser =
-                laserNone ||
-                laserInitTimeOver ||
-                topBoundariesArePeriodic ||
-                boxHasSlided;
-            if( !disableLaser )
+            void operator()(uint32_t currentStep) const
             {
-                PMACC_VERIFY_MSG(
-                    laserProfiles::Selected::Unitless::initPlaneY < static_cast<uint32_t>( Environment<simDim>::get().SubGrid().getLocalDomain().size.y() ),
-                    "initPlaneY must be located in the top GPU"
-                );
-
-                // laser is disabled e.g. laserNone
-                constexpr bool isLaserDisabled = laserProfiles::Selected::Unitless::INIT_TIME == 0.0_X;
-                constexpr bool isLaserInitInFirstCell = laserProfiles::Selected::Unitless::initPlaneY == 0;
-                // X + 1 is a workaround to avoid warning: pointless comparison of unsigned integer with zero
-                constexpr bool isInitPlaneYOutsideOfAbsorber =
-                    laserProfiles::Selected::Unitless::initPlaneY + 1 > absorber::numCells[1][0] + 1;
-                PMACC_CASSERT_MSG(
-                    __initPlaneY_needs_to_be_greater_than_the_top_absorber_cells_or_zero,
-                    isLaserDisabled || isLaserInitInFirstCell || isInitPlaneYOutsideOfAbsorber
-                );
-
-                /* Calculate how many neighbors to the left we have
-                 * to initialize the laser in the E-Field
-                 *
-                 * Example: Yee needs one neighbor to perform dB = curlE
-                 *            -> initialize in y=0 plane
-                 *          A second order solver could need 2 neighbors left:
-                 *            -> initialize in y=0 and y=1 plane
-                 *
-                 * Question: Why do other codes initialize the B-Field instead?
-                 * Answer:   Because our fields are defined on the lower cell side
-                 *           (C-Style ftw). Therefore, our curls (for example Yee)
-                 *           are shifted nabla+ <-> nabla- compared to Fortran codes
-                 *           (in other words: curlLeft <-> curlRight)
-                 *           for E and B.
-                 *           For this reason, we have to initialize E instead of B.
-                 *
-                 * Problem: that's still not our case. For example our Yee does a
-                 *          dE = curlLeft(B) - therefor, we should init B, too.
-                 *
-                 *
-                 *  @todo: might also lack temporal offset since our formulas are E(x,z,t) instead of E(x,y,z,t)
-                 *  `const int max_y_neighbors = Get<fields::Solver::OffsetOrigin_E, 1 >::value;`
-                 *
-                 * @todo Right now, the phase could be wrong ( == is cloned)
-                 *       @see LaserPhysics.hpp
-                 *
-                 * @todo What about the B-Field in the second plane?
-                 *
+                /* The laser can be initialized in the plane of the first cell or
+                 * any later x-z plane inside the simulation. Initializing the
+                 * laser in planes inside the simulation corresponds to an
+                 * evaluation of the field at negatively shifted time.
                  */
-                constexpr int laserInitCellsInY = 1;
+                constexpr float_X laserTimeShift
+                    = laserProfiles::Selected::Unitless::initPlaneY * CELL_HEIGHT / SPEED_OF_LIGHT;
 
-                using LaserPlaneSizeInSuperCells = typename pmacc::math::CT::AssignIfInRange<
+                const uint32_t numSlides = MovingWindow::getInstance().getSlideCounter(currentStep);
+
+                /* Disable laser if
+                 * - init time of laser is over or
+                 * - we have periodic boundaries in Y direction or
+                 * - we already performed a slide
+                 */
+                bool const laserNone = (laserProfiles::Selected::Unitless::INIT_TIME == float_X(0.0));
+                bool const laserInitTimeOver
+                    = ((currentStep * DELTA_T - laserTimeShift) >= laserProfiles::Selected::Unitless::INIT_TIME);
+                bool const topBoundariesArePeriodic
+                    = (Environment<simDim>::get().GridController().getCommunicationMask().isSet(TOP));
+                bool const boxHasSlided = (numSlides != 0);
+
+                bool const disableLaser = laserNone || laserInitTimeOver || topBoundariesArePeriodic || boxHasSlided;
+                if(!disableLaser)
+                {
+                    PMACC_VERIFY_MSG(
+                        laserProfiles::Selected::Unitless::initPlaneY
+                            < static_cast<uint32_t>(Environment<simDim>::get().SubGrid().getLocalDomain().size.y()),
+                        "initPlaneY must be located in the top GPU");
+
+                    // laser is disabled e.g. laserNone
+                    constexpr bool isLaserDisabled = laserProfiles::Selected::Unitless::INIT_TIME == 0.0_X;
+                    constexpr bool isLaserInitInFirstCell = laserProfiles::Selected::Unitless::initPlaneY == 0;
+                    // X + 1 is a workaround to avoid warning: pointless comparison of unsigned integer with zero
+                    constexpr bool isInitPlaneYOutsideOfAbsorber
+                        = laserProfiles::Selected::Unitless::initPlaneY + 1 > absorber::numCells[1][0] + 1;
+                    PMACC_CASSERT_MSG(
+                        __initPlaneY_needs_to_be_greater_than_the_top_absorber_cells_or_zero,
+                        isLaserDisabled || isLaserInitInFirstCell || isInitPlaneYOutsideOfAbsorber);
+
+                    /* Calculate how many neighbors to the left we have
+                     * to initialize the laser in the E-Field
+                     *
+                     * Example: Yee needs one neighbor to perform dB = curlE
+                     *            -> initialize in y=0 plane
+                     *          A second order solver could need 2 neighbors left:
+                     *            -> initialize in y=0 and y=1 plane
+                     *
+                     * Question: Why do other codes initialize the B-Field instead?
+                     * Answer:   Because our fields are defined on the lower cell side
+                     *           (C-Style ftw). Therefore, our curls (for example Yee)
+                     *           are shifted nabla+ <-> nabla- compared to Fortran codes
+                     *           (in other words: curlLeft <-> curlRight)
+                     *           for E and B.
+                     *           For this reason, we have to initialize E instead of B.
+                     *
+                     * Problem: that's still not our case. For example our Yee does a
+                     *          dE = curlLeft(B) - therefore, we should init B, too.
+                     *
+                     *
+                     *  @todo: might also lack temporal offset since our formulas are E(x,z,t) instead of E(x,y,z,t)
+                     *  `const int max_y_neighbors = Get<fields::Solver::OffsetOrigin_E, 1 >::value;`
+                     *
+                     * @todo Right now, the phase could be wrong ( == is cloned)
+                     *       @see LaserPhysics.hpp
+                     *
+                     * @todo What about the B-Field in the second plane?
+                     *
+                     */
+                    constexpr int laserInitCellsInY = 1;
+
+                    using LaserPlaneSizeInSuperCells = typename pmacc::math::CT::AssignIfInRange<
                         typename SuperCellSize::vector_type,
-                        bmpl::integral_c< uint32_t, 1 >, /* y direction */
-                        bmpl::integral_c< int, laserInitCellsInY >
-                >::type;
-
-                DataSpace< simDim > gridBlocks = Environment< simDim >::get().SubGrid().getLocalDomain().size / SuperCellSize::toRT();
-                // use the one supercell in y to initialize the laser plane
-                gridBlocks.y() = 1;
-
-                constexpr uint32_t numWorkers = pmacc::traits::GetNumWorkers<
-                    pmacc::math::CT::volume< LaserPlaneSizeInSuperCells >::type::value
-                >::value;
-
-                PMACC_KERNEL(
-                    KernelLaser<
-                        numWorkers,
-                        LaserPlaneSizeInSuperCells
-                    >{}
-                )(
-                    gridBlocks,
-                    numWorkers
-                )(
-                    laserProfiles::Selected( currentStep )
-                );
+                        bmpl::integral_c<uint32_t, 1>, /* y direction */
+                        bmpl::integral_c<int, laserInitCellsInY>>::type;
+
+                    DataSpace<simDim> gridBlocks
+                        = Environment<simDim>::get().SubGrid().getLocalDomain().size / SuperCellSize::toRT();
+                    // use the one supercell in y to initialize the laser plane
+                    gridBlocks.y() = 1;
+
+                    constexpr uint32_t numWorkers = pmacc::traits::GetNumWorkers<
+                        pmacc::math::CT::volume<LaserPlaneSizeInSuperCells>::type::value>::value;
+
+                    PMACC_KERNEL(KernelLaser<numWorkers, LaserPlaneSizeInSuperCells>{})
+                    (gridBlocks, numWorkers)(laserProfiles::Selected(currentStep));
+                }
             }
-        }
-    };
-} // namespace fields
+        };
+    } // namespace fields
 } // namespace picongpu
diff --git a/include/picongpu/fields/MaxwellSolver/ArbitraryOrderFDTD/ArbitraryOrderFDTD.def b/include/picongpu/fields/MaxwellSolver/ArbitraryOrderFDTD/ArbitraryOrderFDTD.def
new file mode 100644
index 0000000000..8f56074612
--- /dev/null
+++ b/include/picongpu/fields/MaxwellSolver/ArbitraryOrderFDTD/ArbitraryOrderFDTD.def
@@ -0,0 +1,82 @@
+/* Copyright 2020-2021 Klaus Steiniger, Sergei Bastrakov
+ *
+ * This file is part of PIConGPU.
+ *
+ * PIConGPU is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * PIConGPU is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with PIConGPU.
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "picongpu/fields/currentInterpolation/CurrentInterpolation.def"
+#include "picongpu/fields/differentiation/Curl.def"
+#include "picongpu/fields/MaxwellSolver/ArbitraryOrderFDTD/Derivative.def"
+#include "picongpu/fields/MaxwellSolver/Yee/Yee.def"
+
+#include <cstdint>
+
+
+namespace picongpu
+{
+    namespace fields
+    {
+        namespace maxwellSolver
+        {
+            namespace aoFDTD
+            {
+                /** Curl to be applied for the E-field
+                 *
+                 * @tparam T_neighbors Number of neighbors used to calculate the derivative from finite differences.
+                 *                     Same number of neighbors is used along all directions.
+                 *                     Order of derivative approximation is 2 * T_neighbors.
+                 */
+                template<uint32_t T_neighbors>
+                using CurlE = differentiation::Curl<Forward<T_neighbors>>;
+
+                /** Curl to be applied for the B-field
+                 *
+                 * @tparam T_neighbors Number of neighbors used to calculate the derivative from finite differences.
+                 *                     Same number of neighbors is used along all directions.
+                 *                     Order of derivative approximation is 2 * T_neighbors.
+                 */
+                template<uint32_t T_neighbors>
+                using CurlB = differentiation::Curl<Backward<T_neighbors>>;
+
+            } // namespace aoFDTD
+
+            /** Finite difference field solver of chosen order.
+             *
+             * References: M Ghrist
+             *             High-Order Finite Difference Methods for Wave Equations
+             *             PhD thesis (2000)
+             *             Department of Applied Mathematics, University of Colorado
+             *
+             *             H Vincenti et al
+             *             doi:10.1016/j.cpc.2015.11.009
+             *
+             * @tparam T_neighbors Number of neighbors used to calculate the derivative from finite differences.
+             *                     Same number of neighbors is used along all directions.
+             *                     Order of derivative approximation is 2 * T_neighbors.
+             */
+            template<uint32_t T_neighbors, typename T_CurrentInterpolation = currentInterpolation::None>
+            using ArbitraryOrderFDTD = ::picongpu::fields::maxwellSolver::
+                Yee<T_CurrentInterpolation, aoFDTD::CurlE<T_neighbors>, aoFDTD::CurlB<T_neighbors>>;
+
+            /* We need no definition of margins, because the Yee solver uses its curl
+             * classes to define margins
+             */
+
+        } // namespace maxwellSolver
+    } // namespace fields
+} // namespace picongpu
diff --git a/include/picongpu/fields/MaxwellSolver/ArbitraryOrderFDTD/ArbitraryOrderFDTD.hpp b/include/picongpu/fields/MaxwellSolver/ArbitraryOrderFDTD/ArbitraryOrderFDTD.hpp
new file mode 100644
index 0000000000..bce5f341eb
--- /dev/null
+++ b/include/picongpu/fields/MaxwellSolver/ArbitraryOrderFDTD/ArbitraryOrderFDTD.hpp
@@ -0,0 +1,49 @@
+/* Copyright 2020-2021 Klaus Steiniger, Sergei Bastrakov
+ *
+ * This file is part of PIConGPU.
+ *
+ * PIConGPU is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * PIConGPU is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with PIConGPU.
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "picongpu/simulation_defines.hpp"
+#include "picongpu/fields/MaxwellSolver/ArbitraryOrderFDTD/ArbitraryOrderFDTD.def"
+#include "picongpu/fields/differentiation/Curl.hpp"
+#include "picongpu/fields/MaxwellSolver/ArbitraryOrderFDTD/Derivative.hpp"
+
+#include <cstdint>
+#include <string>
+
+
+namespace pmacc
+{
+    namespace traits
+    {
+        template<uint32_t T_neighbors, typename T_CurrentInterpolation>
+        struct StringProperties<
+            ::picongpu::fields::maxwellSolver::ArbitraryOrderFDTD<T_neighbors, T_CurrentInterpolation>>
+        {
+            static StringProperty get()
+            {
+                pmacc::traits::StringProperty propList("name", "other");
+                auto const order = std::to_string(2u * T_neighbors); // approximation order is 2 * T_neighbors
+                propList["param"] = std::string("Arbitrary order FDTD, order ") + order;
+                return propList;
+            }
+        };
+
+    } // namespace traits
+} // namespace pmacc
diff --git a/include/picongpu/fields/MaxwellSolver/ArbitraryOrderFDTD/Derivative.def b/include/picongpu/fields/MaxwellSolver/ArbitraryOrderFDTD/Derivative.def
new file mode 100644
index 0000000000..4e5fdb60a1
--- /dev/null
+++ b/include/picongpu/fields/MaxwellSolver/ArbitraryOrderFDTD/Derivative.def
@@ -0,0 +1,56 @@
+/* Copyright 2020-2021 Klaus Steiniger, Sergei Bastrakov
+ *
+ * This file is part of PIConGPU.
+ *
+ * PIConGPU is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * PIConGPU is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with PIConGPU.
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include <cstdint>
+
+
+namespace picongpu
+{
+    namespace fields
+    {
+        namespace maxwellSolver
+        {
+            namespace aoFDTD
+            {
+                /** Tag for forward derivative used in CurlE
+                 *  of the ArbitraryOrderFDTD solver
+                 *
+                 * @tparam T_neighbors Number of neighbors used to calculate
+                 *                     the spatial derivatives with finite differences.
+                 *                     Order of approximation is 2 * T_neighbors.
+                 */
+                template<uint32_t T_neighbors>
+                struct Forward;
+
+                /** Tag for backward derivative used in CurlB
+                 *  of the ArbitraryOrderFDTD solver
+                 *
+                 * @tparam T_neighbors Number of neighbors used to calculate
+                 *                     the spatial derivatives with finite differences.
+                 *                     Order of approximation is 2 * T_neighbors.
+                 */
+                template<uint32_t T_neighbors>
+                struct Backward;
+
+            } // namespace aoFDTD
+        } // namespace maxwellSolver
+    } // namespace fields
+} // namespace picongpu
diff --git a/include/picongpu/fields/MaxwellSolver/ArbitraryOrderFDTD/Derivative.hpp b/include/picongpu/fields/MaxwellSolver/ArbitraryOrderFDTD/Derivative.hpp
new file mode 100644
index 0000000000..56b3b4670d
--- /dev/null
+++ b/include/picongpu/fields/MaxwellSolver/ArbitraryOrderFDTD/Derivative.hpp
@@ -0,0 +1,194 @@
+/* Copyright 2020-2021 Klaus Steiniger, Sergei Bastrakov
+ *
+ * This file is part of PIConGPU.
+ *
+ * PIConGPU is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * PIConGPU is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with PIConGPU.
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "picongpu/simulation_defines.hpp"
+#include "picongpu/fields/MaxwellSolver/ArbitraryOrderFDTD/Derivative.def"
+#include "picongpu/fields/differentiation/Traits.hpp"
+#include <picongpu/fields/MaxwellSolver/ArbitraryOrderFDTD/Weights.hpp>
+
+#include <pmacc/math/Vector.hpp>
+#include <pmacc/meta/accessors/Identity.hpp>
+
+#include <cstdint>
+
+
+namespace picongpu
+{
+    namespace fields
+    {
+        namespace maxwellSolver
+        {
+            namespace aoFDTD
+            {
+                namespace detail
+                {
+                    /** Abstraction of the arbitrary-order finite-difference time domain
+                     *  derivative functor.
+                     *
+                     * @tparam T_lowerNeighbors Number of neighbors required in negative
+                     *                          direction to calculate field derivative
+                     *
+                     *
+                     * @tparam T_upperNeighbors Number of neighbors required in positive
+                     *                          direction to calculate field derivative
+                     *
+                     * @tparam T_neighbors Number of neighbors used to calculate
+                     *                     the derivative from finite differences.
+                     *                     Order of derivative approximation is
+                     *                     2 * T_neighbors
+                     *
+                     * @tparam T_direction Direction to take derivative in, 0 = x, 1 = y, 2 = z
+                     */
+                    template<
+                        uint32_t T_lowerNeighbors,
+                        uint32_t T_upperNeighbors,
+                        uint32_t T_neighbors,
+                        uint32_t T_direction>
+                    struct GeneralAofdtdDerivative
+                    {
+                        //! Lower margin
+                        using LowerMargin = typename pmacc::math::CT::mul<
+                            typename pmacc::math::CT::make_Int<simDim, T_lowerNeighbors>::type,
+                            typename pmacc::math::CT::make_BasisVector<simDim, T_direction, int>::type>::type;
+
+                        //! Upper margin
+                        using UpperMargin = typename pmacc::math::CT::mul<
+                            typename pmacc::math::CT::make_Int<simDim, T_upperNeighbors>::type,
+                            typename pmacc::math::CT::make_BasisVector<simDim, T_direction, int>::type>::type;
+
+                        /** Return derivative value at the given point
+                         *
+                         * @tparam T_DataBox data box type with field data
+                         * @param data position in the data box to compute derivative at
+                         */
+                        template<typename T_DataBox>
+                        HDINLINE typename T_DataBox::ValueType operator()(T_DataBox const& data) const
+                        {
+                            // Define shorthand type to access DataBox
+                            using IndexType = pmacc::DataSpace<simDim>;
+
+                            // Define index vectors for data access
+                            auto lowerIndex = IndexType{}; // Vector initialized with zeros
+                            auto upperIndex = IndexType{};
+
+                            // lowerIndex: 0 if ( Forward ) else -1
+                            lowerIndex[T_direction]
+                                = static_cast<int32_t>(T_upperNeighbors) - static_cast<int32_t>(T_neighbors);
+                            // upperIndex: 1 if ( Forward ) else 0
+                            upperIndex[T_direction]
+                                = static_cast<int32_t>(T_neighbors) - static_cast<int32_t>(T_lowerNeighbors);
+
+                            AOFDTDWeights<T_neighbors> const weights{};
+
+                            // shortest distance finite difference as initial value
+                            auto finiteDifference = weights[0] * (data(upperIndex) - data(lowerIndex));
+
+                            // Compute next finite differences according to order
+                            for(uint32_t l = 1u; l < T_neighbors; ++l)
+                            {
+                                lowerIndex[T_direction] -= 1;
+                                upperIndex[T_direction] += 1;
+
+                                finiteDifference += weights[l] * (data(upperIndex) - data(lowerIndex));
+                            }
+
+                            return finiteDifference / cellSize[T_direction];
+                        }
+                    };
+                } // namespace detail
+
+
+                /**@{*/
+                /** Functors for forward and backward derivative along the given direction used in ArbitraryOrderFDTD
+                 * solver
+                 *
+                 * Compute an approximation of the derivative of a field f by a finite difference of
+                 * order 2 * T_neighbors, where T_neighbors is the number of neighbors
+                 * used to calculate the finite difference.
+                 *
+                 * These finite difference approximations for the forward and backward derivative are computed on a
+                 * staggered grid. That is, the forward derivative will be known at a position i+1/2, if the field f is
+                 * known at 2 * T_neighbors grid nodes i - T_neighbors + 1, i - T_neighbors + 2, ..., i + T_neighbors.
+                 * The backward derivative will be known at a position i-1/2, if the field f is known
+                 * at 2 * T_neighbors grid nodes i - T_neighbors, i - T_neighbors + 1, ..., i + T_neighbors - 1.
+                 *
+                 * The finite difference calculation can be expressed as a sum of finite differences where the
+                 * distance of field components used in individual finite differences computations increases, e.g.
+                 *     D_x f(i+1/2) = sum_{l=0}^{T_neighbors-1} g_l^{2T_neighbors} * ( f(i+1+l) - f(i-l) ) / dx,
+                 * for the forward derivative and where D_x is the derivative operator along x, dx the grid spacing
+                 * along x, and g_l^{2T_neighbors} weightings for the finite differences of different distance l from
+                 * the point i of computation.
+                 *
+                 * @tparam T_neighbors Number of neighbors used to calculate
+                 *                     the derivative from finite differences.
+                 *                     Order of derivative approximation is
+                 *                     2 * T_neighbors
+                 *
+                 * @tparam T_direction direction to take derivative in, 0 = x, 1 = y, 2 = z
+                 */
+                template<uint32_t T_neighbors, uint32_t T_direction>
+                using ForwardDerivativeFunctor
+                    = detail::GeneralAofdtdDerivative<T_neighbors - 1, T_neighbors, T_neighbors, T_direction>;
+
+
+                template<uint32_t T_neighbors, uint32_t T_direction>
+                using BackwardDerivativeFunctor
+                    = detail::GeneralAofdtdDerivative<T_neighbors, T_neighbors - 1, T_neighbors, T_direction>;
+                /**@}*/
+
+            } // namespace aoFDTD
+        } // namespace maxwellSolver
+
+        namespace differentiation
+        {
+            namespace traits
+            {
+                /**@{*/
+                /** DerivativeFunctor type trait specialization for Forward and Backward derivative in
+                 *  ArbitraryOrderFDTD solver
+                 *
+                 * @tparam T_neighbors Number of neighbors used to calculate
+                 *                      the derivative from finite differences.
+                 *                      Order of derivative approximation is
+                 *                      2 * T_neighbors
+                 *
+                 * @tparam T_direction direction to take derivative in, 0 = x, 1 = y, 2 = z
+                 */
+                template<uint32_t T_neighbors, uint32_t T_direction>
+                struct DerivativeFunctor<maxwellSolver::aoFDTD::Forward<T_neighbors>, T_direction>
+                    : pmacc::meta::accessors::Identity<
+                          maxwellSolver::aoFDTD::ForwardDerivativeFunctor<T_neighbors, T_direction>>
+                {
+                };
+
+
+                template<uint32_t T_neighbors, uint32_t T_direction>
+                struct DerivativeFunctor<maxwellSolver::aoFDTD::Backward<T_neighbors>, T_direction>
+                    : pmacc::meta::accessors::Identity<
+                          maxwellSolver::aoFDTD::BackwardDerivativeFunctor<T_neighbors, T_direction>>
+                {
+                };
+                /**@}*/
+
+            } // namespace traits
+        } // namespace differentiation
+    } // namespace fields
+} // namespace picongpu
diff --git a/include/picongpu/fields/MaxwellSolver/ArbitraryOrderFDTD/Weights.hpp b/include/picongpu/fields/MaxwellSolver/ArbitraryOrderFDTD/Weights.hpp
new file mode 100644
index 0000000000..281d592a48
--- /dev/null
+++ b/include/picongpu/fields/MaxwellSolver/ArbitraryOrderFDTD/Weights.hpp
@@ -0,0 +1,83 @@
+/* Copyright 2020-2021 Klaus Steiniger, Sergei Bastrakov
+ *
+ * This file is part of PIConGPU.
+ *
+ * PIConGPU is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * PIConGPU is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with PIConGPU.
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include <cstdint>
+#include <cmath>
+#include "picongpu/plugins/radiation/utilities.hpp"
+
+
+namespace picongpu
+{
+    namespace fields
+    {
+        namespace maxwellSolver
+        {
+            namespace aoFDTD
+            {
+                /** Compute weights of the finite differences used by the derivative functors
+                 *  of the ArbitraryOrderFDTD solver
+                 *
+                 * @tparam T_neighbors Number of neighbors used to calculate
+                 *                     the derivative from finite differences.
+                 *                     Order of derivative approximation is
+                 *                     2 * T_neighbors
+                 */
+                template<uint32_t T_neighbors>
+                struct AOFDTDWeights
+                {
+                    static_assert(T_neighbors > 0u, "AOFDTDWeights requires at least one neighbor");
+
+                    HDINLINE constexpr AOFDTDWeights()
+                    {
+                        namespace powSpace = ::picongpu::plugins::radiation::util;
+                        /* ratio = (2N)! / (2^(2N) * (N!)^2) with N = T_neighbors, accumulated as
+                         * prod_{k=1}^{N} (2k - 1) / (2k). Unlike an integer factorial, which exceeds
+                         * the uint32_t range from 13! on (i.e. for N >= 7), this cannot overflow.
+                         */
+                        float_X ratio = 1.0_X;
+                        for(uint32_t k = 1u; k <= T_neighbors; ++k)
+                            ratio = ratio * float_X(2u * k - 1u) / float_X(2u * k);
+
+                        // Set initial value
+                        weights[0] = 4.0_X * T_neighbors * powSpace::pow(ratio, 2);
+
+                        // Compute all other values from the preceding weight
+                        for(uint32_t l = 1u; l < T_neighbors; ++l)
+                        {
+                            weights[l] = -1.0_X * powSpace::pow(float_X(l) - 0.5_X, 2) * (T_neighbors - l)
+                                / float_X(T_neighbors + l) / float_X(powSpace::pow(float_X(l) + 0.5_X, 2))
+                                * weights[l - 1];
+                        }
+                    }
+
+                    HDINLINE constexpr float_X operator[](uint32_t const l) const
+                    {
+                        PMACC_ASSERT_MSG(l < T_neighbors, "NUMBER_OF_COEFFICIENTS_IS_LIMITED_BY_NUMBER_OF_NEIGHBORS");
+                        return weights[l];
+                    }
+
+                private:
+                    float_X weights[T_neighbors];
+                };
+            } // namespace aoFDTD
+        } // namespace maxwellSolver
+    } // namespace fields
+} // namespace picongpu
diff --git a/include/picongpu/fields/MaxwellSolver/ArbitraryOrderFDTDPML/ArbitraryOrderFDTDPML.def b/include/picongpu/fields/MaxwellSolver/ArbitraryOrderFDTDPML/ArbitraryOrderFDTDPML.def
new file mode 100644
index 0000000000..bdcfd14944
--- /dev/null
+++ b/include/picongpu/fields/MaxwellSolver/ArbitraryOrderFDTDPML/ArbitraryOrderFDTDPML.def
@@ -0,0 +1,57 @@
+/* Copyright 2020-2021 Klaus Steiniger
+ *
+ * This file is part of PIConGPU.
+ *
+ * PIConGPU is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * PIConGPU is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with PIConGPU.
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "picongpu/fields/currentInterpolation/CurrentInterpolation.def"
+#include "picongpu/fields/MaxwellSolver/ArbitraryOrderFDTD/ArbitraryOrderFDTD.def"
+#include "picongpu/fields/MaxwellSolver/YeePML/YeePML.def"
+
+#include <cstdint>
+
+
+namespace picongpu
+{
+    namespace fields
+    {
+        namespace maxwellSolver
+        {
+            /**
+             * Finite difference field solver of chosen order with
+             * Perfectly Matched Layer Absorbing Boundary Conditions (PML).
+             *
+             * References: M Ghrist
+             *             High-Order Finite Difference Methods for Wave Equations
+             *             PhD thesis (2000)
+             *             Department of Applied Mathematics, University of Colorado
+             *
+             *             H Vincenti et al
+             *             doi:10.1016/j.cpc.2015.11.009
+             *
+             * @tparam T_neighbors Number of neighbors used to calculate the derivative from finite differences.
+             *                     Same number of neighbors is used along all directions.
+             *                     Order of derivative approximation is 2 * T_neighbors.
+             */
+            template<uint32_t T_neighbors, typename T_CurrentInterpolation = currentInterpolation::None>
+            using ArbitraryOrderFDTDPML = ::picongpu::fields::maxwellSolver::
+                YeePML<T_CurrentInterpolation, aoFDTD::CurlE<T_neighbors>, aoFDTD::CurlB<T_neighbors>>;
+
+        } // namespace maxwellSolver
+    } // namespace fields
+} // namespace picongpu
diff --git a/include/picongpu/fields/MaxwellSolver/ArbitraryOrderFDTDPML/ArbitraryOrderFDTDPML.hpp b/include/picongpu/fields/MaxwellSolver/ArbitraryOrderFDTDPML/ArbitraryOrderFDTDPML.hpp
new file mode 100644
index 0000000000..0416c07230
--- /dev/null
+++ b/include/picongpu/fields/MaxwellSolver/ArbitraryOrderFDTDPML/ArbitraryOrderFDTDPML.hpp
@@ -0,0 +1,47 @@
+/* Copyright 2020-2021 Klaus Steiniger
+ *
+ * This file is part of PIConGPU.
+ *
+ * PIConGPU is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * PIConGPU is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with PIConGPU.
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "picongpu/simulation_defines.hpp"
+#include "picongpu/fields/MaxwellSolver/ArbitraryOrderFDTD/ArbitraryOrderFDTD.hpp"
+#include "picongpu/fields/MaxwellSolver/ArbitraryOrderFDTDPML/ArbitraryOrderFDTDPML.def"
+
+#include <cstdint>
+
+
+namespace pmacc
+{
+    namespace traits
+    {
+        template<uint32_t T_neighbors, typename T_CurrentInterpolation>
+        struct StringProperties<
+            ::picongpu::fields::maxwellSolver::ArbitraryOrderFDTDPML<T_neighbors, T_CurrentInterpolation>>
+        {
+            static StringProperty get()
+            {
+                pmacc::traits::StringProperty propList("name", "other");
+                auto const order = std::to_string(2u * T_neighbors); // approximation order is 2 * T_neighbors
+                propList["param"] = std::string("Arbitrary order FDTD with PML, order ") + order;
+                return propList;
+            }
+        };
+
+    } // namespace traits
+} // namespace pmacc
diff --git a/include/picongpu/fields/MaxwellSolver/DirSplitting/DirSplitting.def b/include/picongpu/fields/MaxwellSolver/DirSplitting/DirSplitting.def
deleted file mode 100644
index 89065c17de..0000000000
--- a/include/picongpu/fields/MaxwellSolver/DirSplitting/DirSplitting.def
+++ /dev/null
@@ -1,70 +0,0 @@
-/* Copyright 2013-2020 Heiko Burau
- *
- * This file is part of PIConGPU.
- *
- * PIConGPU is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * PIConGPU is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with PIConGPU.
- * If not, see <http://www.gnu.org/licenses/>.
- */
-
-#pragma once
-
-#include "picongpu/simulation_defines.hpp"
-#include "picongpu/fields/currentInterpolation/CurrentInterpolation.def"
-
-namespace picongpu
-{
-namespace fields
-{
-namespace maxwellSolver
-{
-
-    template< typename T_CurrentInterpolation = currentInterpolation::NoneDS >
-    class DirSplitting;
-
-} // namespace maxwellSolver
-} // namespace fields
-
-namespace traits
-{
-
-    template< typename T_CurrentInterpolation >
-    struct GetMargin<
-        picongpu::fields::maxwellSolver::DirSplitting< T_CurrentInterpolation >,
-        picongpu::FIELD_B
-    >
-    {
-        using LowerMargin = pmacc::math::CT::Int <
-            1,
-            1,
-            1
-        >;
-        using UpperMargin = LowerMargin;
-    };
-
-    template< typename T_CurrentInterpolation >
-    struct GetMargin<
-        picongpu::fields::maxwellSolver::DirSplitting< T_CurrentInterpolation >,
-        picongpu::FIELD_E
-    >
-    {
-        using LowerMargin = pmacc::math::CT::Int <
-            1,
-            1,
-            1
-        >;
-        using UpperMargin = LowerMargin;
-    };
-
-} //namespace traits
-} // picongpu
diff --git a/include/picongpu/fields/MaxwellSolver/DirSplitting/DirSplitting.hpp b/include/picongpu/fields/MaxwellSolver/DirSplitting/DirSplitting.hpp
deleted file mode 100644
index d990871d4d..0000000000
--- a/include/picongpu/fields/MaxwellSolver/DirSplitting/DirSplitting.hpp
+++ /dev/null
@@ -1,197 +0,0 @@
-/* Copyright 2013-2020 Axel Huebl, Heiko Burau, Rene Widera
- *
- * This file is part of PIConGPU.
- *
- * PIConGPU is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * PIConGPU is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with PIConGPU.
- * If not, see <http://www.gnu.org/licenses/>.
- */
-
-#pragma once
-
-#include "picongpu/fields/MaxwellSolver/DirSplitting/DirSplitting.def"
-#include "picongpu/simulation_defines.hpp"
-#include "picongpu/fields/MaxwellSolver/DirSplitting/DirSplitting.kernel"
-#include "picongpu/fields/FieldB.hpp"
-#include "picongpu/fields/FieldE.hpp"
-#include "picongpu/fields/cellType/Centered.hpp"
-#include "picongpu/fields/LaserPhysics.hpp"
-
-#include <pmacc/cuSTL/algorithm/kernel/ForeachBlock.hpp>
-#include <pmacc/cuSTL/cursor/NestedCursor.hpp>
-#include <pmacc/math/Vector.hpp>
-#include <pmacc/math/vector/Int.hpp>
-#include <pmacc/math/vector/TwistComponents.hpp>
-#include <pmacc/math/vector/compile-time/TwistComponents.hpp>
-#include <pmacc/dataManagement/DataConnector.hpp>
-
-
-namespace picongpu
-{
-namespace fields
-{
-namespace maxwellSolver
-{
-namespace dirSplitting
-{
-    /** Check Directional Splitting grid and time conditions
-     *
-     * This is a workaround that the condition check is only
-     * triggered if the current used solver is `DirSplitting`
-     */
-    template<typename T_UsedSolver, typename T_Dummy = void>
-    struct ConditionCheck
-    {
-    };
-
-    template<typename T_CurrentInterpolation, typename T_Dummy>
-    struct ConditionCheck<DirSplitting< T_CurrentInterpolation >, T_Dummy>
-    {
-        /* Directional Splitting conditions:
-         *
-         * using SI units to avoid round off errors
-         *
-         * The compiler is allowed to evaluate an expression those not depends on a template parameter
-         * even if the class is never instantiated. In that case static assert is always
-         * evaluated (e.g. with clang), this results in an error if the condition is false.
-         * http://www.boost.org/doc/libs/1_60_0/doc/html/boost_staticassert.html
-         *
-         * A workaround is to add a template dependency to the expression.
-         * `sizeof(ANY_TYPE) != 0` is always true and defers the evaluation.
-         */
-        PMACC_CASSERT_MSG(DirectionSplitting_Set_dX_equal_dt_times_c____check_your_grid_param_file,
-                          (SI::SPEED_OF_LIGHT_SI * SI::DELTA_T_SI) == SI::CELL_WIDTH_SI &&
-                          (sizeof(T_Dummy) != 0));
-        PMACC_CASSERT_MSG(DirectionSplitting_use_cubic_cells____check_your_grid_param_file,
-                          SI::CELL_HEIGHT_SI == SI::CELL_WIDTH_SI &&
-                          (sizeof(T_Dummy) != 0));
-#if (SIMDIM == DIM3)
-        PMACC_CASSERT_MSG(DirectionSplitting_use_cubic_cells____check_your_grid_param_file,
-                          SI::CELL_DEPTH_SI == SI::CELL_WIDTH_SI &&
-                          (sizeof(T_Dummy) != 0));
-#endif
-    };
-} // namespace dirSplitting
-
-    template< typename T_CurrentInterpolation >
-    class DirSplitting: private dirSplitting::ConditionCheck< DirSplitting< T_CurrentInterpolation > >
-    {
-    private:
-        template<typename OrientationTwist,typename CursorE, typename CursorB, typename GridSize>
-        void propagate(CursorE cursorE, CursorB cursorB, GridSize gridSize) const
-        {
-            using namespace cursor::tools;
-            using namespace pmacc::math;
-
-            auto gridSizeTwisted = twistComponents<OrientationTwist>(gridSize);
-
-            /* twist components of the supercell */
-            using BlockDim = typename CT::TwistComponents<SuperCellSize, OrientationTwist>::type;
-
-            algorithm::kernel::ForeachBlock<BlockDim> foreach;
-            foreach(zone::SphericZone<3>(pmacc::math::Size_t<3>(BlockDim::x::value, gridSizeTwisted.y(), gridSizeTwisted.z())),
-                    cursor::make_NestedCursor(twistVectorFieldAxes<OrientationTwist>(cursorE)),
-                    cursor::make_NestedCursor(twistVectorFieldAxes<OrientationTwist>(cursorB)),
-                    DirSplittingKernel<BlockDim>((int)gridSizeTwisted.x()));
-        }
-    public:
-
-        using CellType = cellType::Centered;
-        using CurrentInterpolation = T_CurrentInterpolation;
-
-        DirSplitting(MappingDesc) {}
-
-        void update_beforeCurrent(uint32_t currentStep) const
-        {
-            using GuardDim = SuperCellSize;
-
-            DataConnector &dc = Environment<>::get().DataConnector();
-
-            auto fieldE = dc.get< FieldE >( FieldE::getName(), true );
-            auto fieldB = dc.get< FieldB >( FieldB::getName(), true );
-
-            auto fieldE_coreBorder =
-                fieldE->getGridBuffer().getDeviceBuffer().
-                       cartBuffer().view(GuardDim().toRT(),
-                                         -GuardDim().toRT());
-            auto fieldB_coreBorder =
-                fieldB->getGridBuffer().getDeviceBuffer().
-                cartBuffer().view(GuardDim().toRT(),
-                                  -GuardDim().toRT());
-
-            using namespace cursor::tools;
-            using namespace pmacc::math;
-
-            pmacc::math::Size_t<3> gridSize = fieldE_coreBorder.size();
-
-
-            using Orientation_X = pmacc::math::CT::Int<0,1,2>;
-            propagate<Orientation_X>(
-                      fieldE_coreBorder.origin(),
-                      fieldB_coreBorder.origin(),
-                      gridSize);
-
-            __setTransactionEvent(fieldE->asyncCommunication(__getTransactionEvent()));
-            __setTransactionEvent(fieldB->asyncCommunication(__getTransactionEvent()));
-
-            using Orientation_Y = pmacc::math::CT::Int<1,2,0>;
-            propagate<Orientation_Y>(
-                      fieldE_coreBorder.origin(),
-                      fieldB_coreBorder.origin(),
-                      gridSize);
-
-            __setTransactionEvent(fieldE->asyncCommunication(__getTransactionEvent()));
-            __setTransactionEvent(fieldB->asyncCommunication(__getTransactionEvent()));
-
-            using Orientation_Z = pmacc::math::CT::Int<2,0,1>;
-            propagate<Orientation_Z>(
-                      fieldE_coreBorder.origin(),
-                      fieldB_coreBorder.origin(),
-                      gridSize);
-
-            if (laserProfiles::Selected::INIT_TIME > float_X(0.0))
-                LaserPhysics{}(currentStep);
-
-            __setTransactionEvent(fieldE->asyncCommunication(__getTransactionEvent()));
-            __setTransactionEvent(fieldB->asyncCommunication(__getTransactionEvent()));
-
-            dc.releaseData( FieldE::getName() );
-            dc.releaseData( FieldB::getName() );
-        }
-
-        void update_afterCurrent(uint32_t) const
-        {
-            DataConnector &dc = Environment<>::get().DataConnector();
-
-            auto fieldE = dc.get< FieldE >( FieldE::getName(), true );
-            auto fieldB = dc.get< FieldB >( FieldB::getName(), true );
-
-            EventTask eRfieldE = fieldE->asyncCommunication(__getTransactionEvent());
-            EventTask eRfieldB = fieldB->asyncCommunication(__getTransactionEvent());
-            __setTransactionEvent(eRfieldE);
-            __setTransactionEvent(eRfieldB);
-
-            dc.releaseData( FieldE::getName() );
-            dc.releaseData( FieldB::getName() );
-        }
-
-        static pmacc::traits::StringProperty getStringProperties()
-        {
-            pmacc::traits::StringProperty propList( "name", "DS" );
-            return propList;
-        }
-    };
-
-} // namespace maxwellSolver
-} // namespace fields
-} // namespace picongpu
diff --git a/include/picongpu/fields/MaxwellSolver/DirSplitting/DirSplitting.kernel b/include/picongpu/fields/MaxwellSolver/DirSplitting/DirSplitting.kernel
deleted file mode 100644
index 86d73ad14d..0000000000
--- a/include/picongpu/fields/MaxwellSolver/DirSplitting/DirSplitting.kernel
+++ /dev/null
@@ -1,131 +0,0 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera
- *
- * This file is part of PIConGPU.
- *
- * PIConGPU is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * PIConGPU is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with PIConGPU.
- * If not, see <http://www.gnu.org/licenses/>.
- */
-
-#pragma once
-
-#include <pmacc/types.hpp>
-#include <pmacc/math/vector/Float.hpp>
-#include <pmacc/math/Vector.hpp>
-#include <pmacc/cuSTL/container/compile-time/SharedBuffer.hpp>
-#include <pmacc/cuSTL/algorithm/cudaBlock/Foreach.hpp>
-#include <pmacc/cuSTL/cursor/tools/twistVectorFieldAxes.hpp>
-#include <pmacc/nvidia/functors/Assign.hpp>
-
-
-namespace picongpu
-{
-namespace fields
-{
-namespace maxwellSolver
-{
-
-template<typename BlockDim>
-struct DirSplittingKernel
-{
-    using result_type = void;
-
-    PMACC_ALIGN(m_totalLength,int);
-    DirSplittingKernel(int totalLength) : m_totalLength(totalLength) {}
-
-    template<typename CursorE, typename CursorB >
-    DINLINE void propagate(CursorE cursorE, CursorB cursorB) const
-    {
-        float_X a_plus = (*cursorB(-1, 0, 0)).z() + (*cursorE(-1, 0, 0)).y();
-        float_X a_minus = (*cursorB(1, 0, 0)).z() - (*cursorE(1, 0, 0)).y();
-        float_X a_prime_plus = (*cursorB(-1, 0, 0)).y() - (*cursorE(-1, 0, 0)).z();
-        float_X a_prime_minus = (*cursorB(1, 0, 0)).y() + (*cursorE(1, 0, 0)).z();
-
-        __syncthreads();
-
-        (*cursorB).z() = float_X(0.5) * (a_plus + a_minus);
-        (*cursorE).y() = float_X(0.5) * (a_plus - a_minus);
-        (*cursorB).y() = float_X(0.5) * (a_prime_plus + a_prime_minus);
-        (*cursorE).z() = float_X(0.5) * (a_prime_minus - a_prime_plus);
-
-        __syncthreads();
-    }
-
-    template<
-        typename CursorE,
-        typename CursorB,
-        typename T_Acc
-    >
-    DINLINE void operator()(
-        T_Acc const & acc,
-        CursorE globalE,
-        CursorB globalB
-    ) const
-    {
-        //\todo: optimize cache size
-        typedef typename pmacc::math::CT::add<
-            typename BlockDim::vector_type,
-            typename pmacc::math::CT::Int < 2, 0, 0 > ::vector_type>::type CacheSize;
-
-        typedef container::CT::SharedBuffer<float3_X, CacheSize, 0 > CacheE;
-        typedef container::CT::SharedBuffer<float3_X, CacheSize, 1 > CacheB;
-        CacheE cacheE( acc );
-        CacheB cacheB( acc );
-
-        float3_X fieldE_old;
-        float3_X fieldB_old;
-        int threadPos_x = threadIdx.x;
-
-        //!@todo remove this explicit index calculation, this is a workaround during the lockstep refactoring
-        int linearThreadIdx = threadIdx.z * BlockDim::x::value * BlockDim::y::value +
-            threadIdx.y * BlockDim::x::value +
-            threadIdx.x;
-        algorithm::cudaBlock::Foreach<BlockDim> foreach(linearThreadIdx);
-
-        for (int x_offset = 0; x_offset < this->m_totalLength; x_offset += BlockDim::x::value)
-        {
-            foreach(acc, typename CacheE::Zone(), cacheE.origin(), globalE(-1 + x_offset, 0, 0), pmacc::nvidia::functors::Assign{});
-            foreach(acc, typename CacheB::Zone(), cacheB.origin(), globalB(-1 + x_offset, 0, 0), pmacc::nvidia::functors::Assign{});
-            __syncthreads();
-
-            auto cursorE = cacheE.origin()(1, 0, 0)(threadPos_x, threadIdx.y, threadIdx.z);
-            auto cursorB = cacheB.origin()(1, 0, 0)(threadPos_x, threadIdx.y, threadIdx.z);
-
-            if(threadPos_x == BlockDim::x::value - 1)
-            {
-                fieldE_old = *cursorE;
-                fieldB_old = *cursorB;
-            }
-            if(threadPos_x == 0 && x_offset > 0)
-            {
-                *cursorE(-1,0,0) = fieldE_old;
-                *cursorB(-1,0,0) = fieldB_old;
-            }
-
-            propagate(cursorE, cursorB);
-
-            typedef zone::CT::SphericZone<BlockDim> BlockZone;
-            foreach(acc, BlockZone(), globalE(x_offset, 0, 0), cacheE.origin()(1, 0, 0), pmacc::nvidia::functors::Assign{});
-            foreach(acc, BlockZone(), globalB(x_offset, 0, 0), cacheB.origin()(1, 0, 0), pmacc::nvidia::functors::Assign{});
-
-            __syncthreads();
-
-            threadPos_x = BlockDim::x::value - 1 - threadPos_x;
-        }
-    }
-
-};
-
-} // namespace maxwellSolver
-} // namespace fields
-} // namespace picongpu
diff --git a/include/picongpu/fields/MaxwellSolver/Lehe/Curl.def b/include/picongpu/fields/MaxwellSolver/Lehe/Curl.def
deleted file mode 100644
index d051a838fc..0000000000
--- a/include/picongpu/fields/MaxwellSolver/Lehe/Curl.def
+++ /dev/null
@@ -1,44 +0,0 @@
-/* Copyright 2013-2020 Axel Huebl, Heiko Burau, Rene Widera, Remi Lehe
- *
- * This file is part of PIConGPU.
- *
- * PIConGPU is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * PIConGPU is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with PIConGPU.
- * If not, see <http://www.gnu.org/licenses/>.
- */
-
-#pragma once
-
-#include <pmacc/types.hpp>
-#include <pmacc/math/Vector.hpp>
-
-
-namespace picongpu
-{
-namespace fields
-{
-namespace maxwellSolver
-{
-namespace lehe
-{
-
-    class CherenkovFreeDirection_X{ };
-    class CherenkovFreeDirection_Y{ };
-
-    template< class Direction >
-    struct CurlE;
-
-} // namespace lehe
-} // namespace maxwellSolver
-} // namespace fields
-} // namespace picongpu
diff --git a/include/picongpu/fields/MaxwellSolver/Lehe/Curl.hpp b/include/picongpu/fields/MaxwellSolver/Lehe/Curl.hpp
deleted file mode 100644
index 59e4d7b042..0000000000
--- a/include/picongpu/fields/MaxwellSolver/Lehe/Curl.hpp
+++ /dev/null
@@ -1,339 +0,0 @@
-/* Copyright 2013-2020 Axel Huebl, Heiko Burau, Rene Widera, Remi Lehe
- *
- * This file is part of PIConGPU.
- *
- * PIConGPU is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * PIConGPU is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with PIConGPU.
- * If not, see <http://www.gnu.org/licenses/>.
- */
-
-#pragma once
-
-#include "picongpu/fields/MaxwellSolver/Lehe/Curl.def"
-
-#include <pmacc/algorithms/math/defines/pi.hpp>
-#include <pmacc/types.hpp>
-#include <pmacc/math/Vector.hpp>
-
-
-namespace picongpu
-{
-namespace fields
-{
-namespace maxwellSolver
-{
-namespace lehe
-{
-    template< >
-    struct CurlE< CherenkovFreeDirection_X >
-    {
-        typedef pmacc::math::CT::Int< 1, 1, 1 > LowerMargin;
-        typedef pmacc::math::CT::Int< 2, 2, 2 > UpperMargin;
-
-        float_X mySin;
-
-        HINLINE CurlE( )
-        {
-            mySin = float_X(
-                math::sin(
-                    pmacc::algorithms::math::Pi< float_64 >::halfValue *
-                    float_64( SPEED_OF_LIGHT ) *
-                    float_64( DELTA_T ) / float_64( CELL_WIDTH )
-                )
-            );
-        }
-
-        template<class Memory >
-        HDINLINE typename Memory::ValueType operator( )(const Memory & mem ) const
-        {
-            /* Distinguished direction where the numerical Cherenkov Radiation
-             * of moving particles is suppressed.
-             */
-            constexpr float_X isDir_x = float_X( 1.0 );
-            constexpr float_X isDir_y = float_X( 0.0 );
-            constexpr float_X isDir_z = float_X( 0.0 );
-
-            constexpr float_X isNotDir_x = float_X( 1.0 ) - isDir_x;
-            constexpr float_X isNotDir_y = float_X( 1.0 ) - isDir_y;
-            constexpr float_X isNotDir_z = float_X( 1.0 ) - isDir_z;
-
-            constexpr float_X dx2 = CELL_WIDTH * CELL_WIDTH;
-            constexpr float_X dy2 = CELL_HEIGHT * CELL_HEIGHT;
-            constexpr float_X dz2 = CELL_DEPTH * CELL_DEPTH;
-            constexpr float_X dt2 = DELTA_T * DELTA_T;
-            constexpr float_X c2 = SPEED_OF_LIGHT * SPEED_OF_LIGHT;
-
-            constexpr float_X reci_dx = float_X( 1.0 ) / CELL_WIDTH;
-            constexpr float_X reci_dy = float_X( 1.0 ) / CELL_HEIGHT;
-            constexpr float_X reci_dz = float_X( 1.0 ) / CELL_DEPTH;
-
-            constexpr float_X beta_xy = float_X( 0.125 ) * dx2 / dy2 * isDir_x
-                + float_X( 0.125 ) * isNotDir_x * isDir_y;
-            constexpr float_X beta_xz = float_X( 0.125 ) * dx2 / dz2 * isDir_x
-                + float_X( 0.125 ) * isNotDir_x * isDir_z;
-
-            constexpr float_X beta_yx = float_X( 0.125 ) * dy2 / dx2 * isDir_y
-                + float_X( 0.125 ) * isNotDir_y * isDir_x;
-            constexpr float_X beta_yz = float_X( 0.125 ) * dy2 / dz2 * isDir_y
-                + float_X( 0.125 ) * isNotDir_y * isDir_z;
-
-            constexpr float_X beta_zx = float_X( 0.125 ) * dz2 / dx2 * isDir_z
-                + float_X( 0.125 ) * isNotDir_z * isDir_x;
-            constexpr float_X beta_zy = float_X( 0.125 ) * dz2 / dy2 * isDir_z
-                + float_X( 0.125 ) * isNotDir_z * isDir_y;
-
-            constexpr float_X d_dir = CELL_WIDTH * isDir_x
-                + CELL_HEIGHT * isDir_y
-                + CELL_DEPTH * isDir_z;
-            constexpr float_X d_dir2 = d_dir * d_dir;
-
-            // delta_x0 == delta_x
-            // delta_dir0 == delta_dir
-            const float_X delta_dir0 = float_X( 0.25 ) *
-                ( float_X( 1.0 ) - d_dir2 / ( c2 * dt2 ) * mySin * mySin );
-
-            const float_X alpha_x = float_X( 1.0 )
-                - float_X( 2.0 ) * beta_xy * isNotDir_x * isDir_y
-                - float_X( 2.0 ) * beta_xz * isNotDir_x * isDir_z
-                - float_X( 2.0 ) * beta_xy * isDir_x
-                - float_X( 2.0 ) * beta_xz * isDir_x
-                - float_X( 3.0 ) * delta_dir0 * isDir_x;
-
-            const float_X alpha_y = float_X( 1.0 )
-                - float_X( 2.0 ) * beta_yx * isNotDir_y * isDir_x
-                - float_X( 2.0 ) * beta_yz * isNotDir_y * isDir_z
-                - float_X( 2.0 ) * beta_yx * isDir_y
-                - float_X( 2.0 ) * beta_yz * isDir_y
-                - float_X( 3.0 ) * delta_dir0 * isDir_y;
-
-            const float_X alpha_z = float_X( 1.0 )
-                - float_X( 2.0 ) * beta_zx * isNotDir_z * isDir_x
-                - float_X( 2.0 ) * beta_zy * isNotDir_z * isDir_y
-                - float_X( 2.0 ) * beta_zx * isDir_z
-                - float_X( 2.0 ) * beta_zy * isDir_z
-                - float_X( 3.0 ) * delta_dir0 * isDir_z;
-
-
-            const float_X curl_x
-                = (
-                    alpha_y * ( mem[0][0][0].z( ) - mem[0][-1][0].z( ) )
-                    + beta_yx * ( mem[1][0][0].z( ) - mem[1][-1][0].z( ) )
-                    + beta_yx * ( mem[-1][0][0].z( ) - mem[-1][-1][0].z( ) )
-                    ) * reci_dy
-                - (
-                    alpha_z * ( mem[0][0][0].y( ) - mem[0][0][-1].y( ) )
-                    + beta_zx * ( mem[1][0][0].y( ) - mem[1][0][-1].y( ) )
-                    + beta_zx * ( mem[-1][0][0].y( ) - mem[-1][0][-1].y( ) )
-                    ) * reci_dz;
-
-
-            const float_X curl_y
-                = (
-                    alpha_z * ( mem[0][0][0].x( ) - mem[0][0][-1].x( ) )
-                    + beta_zx * ( mem[1][0][0].x( ) - mem[1][0][-1].x( ) )
-                    + beta_zx * ( mem[-1][0][0].x( ) - mem[-1][0][-1].x( ) )
-                    ) * reci_dz
-                - (
-                    alpha_x * ( mem[0][0][0].z( ) - mem[-1][0][0].z( ) )
-                    + delta_dir0 * ( mem[1][0][0].z( ) - mem[-2][0][0].z( ) )
-                    + beta_xy * ( mem[0][1][0].z( ) - mem[-1][1][0].z( ) )
-                    + beta_xy * ( mem[0][-1][0].z( ) - mem[-1][-1][0].z( ) )
-                    + beta_xz * ( mem[0][0][1].z( ) - mem[-1][0][1].z( ) )
-                    + beta_xz * ( mem[0][0][-1].z( ) - mem[-1][0][-1].z( ) )
-                    ) * reci_dx;
-
-
-            const float_X curl_z
-                = (
-                    alpha_x * ( mem[0][0][0].y( ) - mem[-1][0][0].y( ) )
-                    + delta_dir0 * ( mem[1][0][0].y( ) - mem[-2][0][0].y( ) )
-                    + beta_xy * ( mem[0][1][0].y( ) - mem[-1][1][0].y( ) )
-                    + beta_xy * ( mem[0][-1][0].y( ) - mem[-1][-1][0].y( ) )
-                    + beta_xz * ( mem[0][0][1].y( ) - mem[-1][0][1].y( ) )
-                    + beta_xz * ( mem[0][0][-1].y( ) - mem[-1][0][-1].y( ) )
-                    ) * reci_dx
-                - (
-                    alpha_y * ( mem[0][0][0].x( ) - mem[0][-1][0].x( ) )
-                    + beta_yx * ( mem[1][0][0].x( ) - mem[1][-1][0].x( ) )
-                    + beta_yx * ( mem[-1][0][0].x( ) - mem[-1][-1][0].x( ) )
-                    ) * reci_dy;
-
-            return float3_X( curl_x, curl_y, curl_z );
-
-            //return float3_X(diff(mem, 1).z() - diff(mem, 2).y(),
-            //                diff(mem, 2).x() - diff(mem, 0).z(),
-            //                diff(mem, 0).y() - diff(mem, 1).x());
-        }
-    };
-
-
-    template< >
-    struct CurlE< CherenkovFreeDirection_Y >
-    {
-        typedef pmacc::math::CT::Int< 1, 1, 1 > LowerMargin;
-        typedef pmacc::math::CT::Int< 2, 2, 2 > UpperMargin;
-
-        float_X mySin;
-
-        HINLINE CurlE( )
-        {
-            mySin = float_X(
-                math::sin(
-                    pmacc::algorithms::math::Pi< float_64 >::halfValue *
-                    float_64( SPEED_OF_LIGHT ) *
-                    float_64( DELTA_T ) / float_64( CELL_HEIGHT )
-                )
-            );
-        }
-
-        template<class Memory >
-        HDINLINE typename Memory::ValueType operator( )(const Memory & mem ) const
-        {
-            /* Distinguished direction where the numerical Cherenkov Radiation
-             * of moving particles is suppressed.
-             */
-            constexpr float_X isDir_x = float_X( 0.0 );
-            constexpr float_X isDir_y = float_X( 1.0 );
-            constexpr float_X isDir_z = float_X( 0.0 );
-
-            constexpr float_X isNotDir_x = float_X( 1.0 ) - isDir_x;
-            constexpr float_X isNotDir_y = float_X( 1.0 ) - isDir_y;
-            constexpr float_X isNotDir_z = float_X( 1.0 ) - isDir_z;
-
-            constexpr float_X dx2 = CELL_WIDTH * CELL_WIDTH;
-            constexpr float_X dy2 = CELL_HEIGHT * CELL_HEIGHT;
-            constexpr float_X dz2 = CELL_DEPTH * CELL_DEPTH;
-            constexpr float_X dt2 = DELTA_T * DELTA_T;
-            constexpr float_X c2 = SPEED_OF_LIGHT * SPEED_OF_LIGHT;
-
-            constexpr float_X reci_dx = float_X( 1.0 ) / CELL_WIDTH;
-            constexpr float_X reci_dy = float_X( 1.0 ) / CELL_HEIGHT;
-            constexpr float_X reci_dz = float_X( 1.0 ) / CELL_DEPTH;
-
-            /** Naming of the coefficients
-             *  1st letter: direction of differentiation
-             *  2nd letter: direction of averaging
-             */
-            constexpr float_X beta_xy = float_X( 0.125 ) * dx2 / dy2 * isDir_x
-                + float_X( 0.125 ) * isNotDir_x * isDir_y;
-            constexpr float_X beta_xz = float_X( 0.125 ) * dx2 / dz2 * isDir_x
-                + float_X( 0.125 ) * isNotDir_x * isDir_z;
-
-            constexpr float_X beta_yx = float_X( 0.125 ) * dy2 / dx2 * isDir_y
-                + float_X( 0.125 ) * isNotDir_y * isDir_x;
-            constexpr float_X beta_yz = float_X( 0.125 ) * dy2 / dz2 * isDir_y
-                + float_X( 0.125 ) * isNotDir_y * isDir_z;
-
-            constexpr float_X beta_zx = float_X( 0.125 ) * dz2 / dx2 * isDir_z
-                + float_X( 0.125 ) * isNotDir_z * isDir_x;
-            constexpr float_X beta_zy = float_X( 0.125 ) * dz2 / dy2 * isDir_z
-                + float_X( 0.125 ) * isNotDir_z * isDir_y;
-
-            constexpr float_X d_dir = CELL_WIDTH * isDir_x
-                + CELL_HEIGHT * isDir_y
-                + CELL_DEPTH * isDir_z;
-            constexpr float_X d_dir2 = d_dir * d_dir;
-
-            // delta_y0 == delta_y
-            // delta_dir0 == delta_dir
-            const float_X delta_dir0 = float_X( 0.25 ) *
-                ( float_X( 1.0 ) - d_dir2 / ( c2 * dt2 ) * mySin * mySin );
-
-            const float_X alpha_x = float_X( 1.0 )
-                - float_X( 2.0 ) * beta_xy * isNotDir_x * isDir_y
-                - float_X( 2.0 ) * beta_xz * isNotDir_x * isDir_z
-                - float_X( 2.0 ) * beta_xy * isDir_x
-                - float_X( 2.0 ) * beta_xz * isDir_x
-                - float_X( 3.0 ) * delta_dir0 * isDir_x;
-
-            const float_X alpha_y = float_X( 1.0 )
-                - float_X( 2.0 ) * beta_yx * isNotDir_y * isDir_x
-                - float_X( 2.0 ) * beta_yz * isNotDir_y * isDir_z
-                - float_X( 2.0 ) * beta_yx * isDir_y
-                - float_X( 2.0 ) * beta_yz * isDir_y
-                - float_X( 3.0 ) * delta_dir0 * isDir_y;
-
-            const float_X alpha_z = float_X( 1.0 )
-                - float_X( 2.0 ) * beta_zx * isNotDir_z * isDir_x
-                - float_X( 2.0 ) * beta_zy * isNotDir_z * isDir_y
-                - float_X( 2.0 ) * beta_zx * isDir_z
-                - float_X( 2.0 ) * beta_zy * isDir_z
-                - float_X( 3.0 ) * delta_dir0 * isDir_z;
-
-            // Typedef an accessor to access mem[z][y][x]
-            // in (x,y,z) order :)
-            typedef DataSpace<DIM3> Space;
-
-            const float_X curl_x
-                = (
-                    alpha_y * ( mem(Space(0,0,0)*(-1)).z( ) - mem(Space(0,-1,0)*(-1)).z( ) )
-                    + beta_yz * ( mem(Space(0,0,1)*(-1)).z( ) - mem(Space(0,-1,1)*(-1)).z( ) )
-                    + beta_yz * ( mem(Space(0,0,-1)*(-1)).z( ) - mem(Space(0,-1,-1)*(-1)).z( ) )
-                    + beta_yx * ( mem(Space(1,0,0)*(-1)).z( ) - mem(Space(1,-1,0)*(-1)).z( ) )
-                    + beta_yx * ( mem(Space(-1,0,0)*(-1)).z( ) - mem(Space(-1,-1,0)*(-1)).z( ) )
-                    + delta_dir0 * ( mem(Space(0,1,0)*(-1)).z( ) - mem(Space(0,-2,0)*(-1)).z( ) )
-                    ) * reci_dy
-                - (
-                    alpha_z * ( mem(Space(0,0,0)*(-1)).y( ) - mem(Space(0,0,-1)*(-1)).y( ) )
-                    + beta_zx * ( mem(Space(1,0,0)*(-1)).y( ) - mem(Space(1,0,-1)*(-1)).y( ) )
-                    + beta_zx * ( mem(Space(-1,0,0)*(-1)).y( ) - mem(Space(-1,0,-1)*(-1)).y( ) )
-                    + beta_zy * ( mem(Space(0,1,0)*(-1)).y( ) - mem(Space(0,1,-1)*(-1)).y( ) )
-                    + beta_zy * ( mem(Space(0,-1,0)*(-1)).y( ) - mem(Space(0,-1,-1)*(-1)).y( ) )
-                    ) * reci_dz;
-
-
-            const float_X curl_y
-                = (
-                    alpha_z * ( mem(Space(0,0,0)*(-1)).x( ) - mem(Space(0,0,-1)*(-1)).x( ) )
-                    + beta_zx * ( mem(Space(1,0,0)*(-1)).x( ) - mem(Space(1,0,-1)*(-1)).x( ) )
-                    + beta_zx * ( mem(Space(-1,0,0)*(-1)).x( ) - mem(Space(-1,0,-1)*(-1)).x( ) )
-                    + beta_zy * ( mem(Space(0,1,0)*(-1)).x( ) - mem(Space(0,1,-1)*(-1)).x( ) )
-                    + beta_zy * ( mem(Space(0,-1,0)*(-1)).x( ) - mem(Space(0,-1,-1)*(-1)).x( ) )
-                    ) * reci_dz
-                - (
-                    alpha_x * ( mem(Space(0,0,0)*(-1)).z( ) - mem(Space(-1,0,0)*(-1)).z( ) )
-                    + beta_xy * ( mem(Space(0,1,0)*(-1)).z( ) - mem(Space(-1,1,0)*(-1)).z( ) )
-                    + beta_xy * ( mem(Space(0,-1,0)*(-1)).z( ) - mem(Space(-1,-1,0)*(-1)).z( ) )
-                    + beta_xz * ( mem(Space(0,0,1)*(-1)).z( ) - mem(Space(-1,0,1)*(-1)).z( ) )
-                    + beta_xz * ( mem(Space(0,0,-1)*(-1)).z( ) - mem(Space(-1,0,-1)*(-1)).z( ) )
-                    ) * reci_dx;
-
-
-            const float_X curl_z
-                = (
-                    alpha_x * ( mem(Space(0,0,0)*(-1)).y( ) - mem(Space(-1,0,0)*(-1)).y( ) )
-                    + beta_xy * ( mem(Space(0,1,0)*(-1)).y( ) - mem(Space(-1,1,0)*(-1)).y( ) )
-                    + beta_xy * ( mem(Space(0,-1,0)*(-1)).y( ) - mem(Space(-1,-1,0)*(-1)).y( ) )
-                    + beta_xz * ( mem(Space(0,0,1)*(-1)).y( ) - mem(Space(-1,0,1)*(-1)).y( ) )
-                    + beta_xz * ( mem(Space(0,0,-1)*(-1)).y( ) - mem(Space(-1,0,-1)*(-1)).y( ) )
-                    ) * reci_dx
-                - (
-                    alpha_y * ( mem(Space(0,0,0)*(-1)).x( ) - mem(Space(0,-1,0)*(-1)).x( ) )
-                    + beta_yz * ( mem(Space(0,0,1)*(-1)).x( ) - mem(Space(0,-1,1)*(-1)).x( ) )
-                    + beta_yz * ( mem(Space(0,0,-1)*(-1)).x( ) - mem(Space(0,-1,-1)*(-1)).x( ) )
-                    + beta_yx * ( mem(Space(1,0,0)*(-1)).x( ) - mem(Space(1,-1,0)*(-1)).x( ) )
-                    + beta_yx * ( mem(Space(-1,0,0)*(-1)).x( ) - mem(Space(-1,-1,0)*(-1)).x( ) )
-                    + delta_dir0 * ( mem(Space(0,1,0)*(-1)).x( ) - mem(Space(0,-2,0)*(-1)).x( ) )
-                    ) * reci_dy;
-
-            return float3_X( -curl_x, -curl_y, -curl_z );
-
-            //return float3_X(diff(mem, 1).z() - diff(mem, 2).y(),
-            //                diff(mem, 2).x() - diff(mem, 0).z(),
-            //                diff(mem, 0).y() - diff(mem, 1).x());
-        }
-    };
-} // namespace lehe
-} // namespace maxwellSolver
-} // namespace fields
-} // namespace picongpu
diff --git a/include/picongpu/fields/MaxwellSolver/Lehe/Derivative.def b/include/picongpu/fields/MaxwellSolver/Lehe/Derivative.def
new file mode 100644
index 0000000000..095242d043
--- /dev/null
+++ b/include/picongpu/fields/MaxwellSolver/Lehe/Derivative.def
@@ -0,0 +1,49 @@
+/* Copyright 2013-2021 Axel Huebl, Heiko Burau, Rene Widera, Remi Lehe,
+ *                     Sergei Bastrakov
+ *
+ * This file is part of PIConGPU.
+ *
+ * PIConGPU is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * PIConGPU is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with PIConGPU.
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include <cstdint>
+
+
+namespace picongpu
+{
+    namespace fields
+    {
+        namespace maxwellSolver
+        {
+            namespace lehe
+            {
+                /** Tag for derivative used in the Lehe solver
+                 *
+                 * Implements eq. (6) in R. Lehe et al
+                 *     Phys. Rev. ST Accel. Beams 16, 021301 (2013)
+                 *
+                 * @tparam T_cherenkovFreeDirection direction to remove numerical Cherenkov
+                 *                                  radiation in, 0 = x, 1 = y, 2 = z
+                 *                                  (unrelated to differentiating direction)
+                 */
+                template<uint32_t T_cherenkovFreeDirection>
+                struct Derivative;
+
+            } // namespace lehe
+        } // namespace maxwellSolver
+    } // namespace fields
+} // namespace picongpu
diff --git a/include/picongpu/fields/MaxwellSolver/Lehe/Derivative.hpp b/include/picongpu/fields/MaxwellSolver/Lehe/Derivative.hpp
new file mode 100644
index 0000000000..c4cab1f0dc
--- /dev/null
+++ b/include/picongpu/fields/MaxwellSolver/Lehe/Derivative.hpp
@@ -0,0 +1,249 @@
+/* Copyright 2013-2021 Axel Huebl, Heiko Burau, Rene Widera, Remi Lehe,
+ *                     Sergei Bastrakov
+ *
+ * This file is part of PIConGPU.
+ *
+ * PIConGPU is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * PIConGPU is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with PIConGPU.
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "picongpu/fields/differentiation/Derivative.hpp"
+#include "picongpu/fields/differentiation/ForwardDerivative.hpp"
+#include "picongpu/fields/differentiation/Traits.hpp"
+#include "picongpu/fields/MaxwellSolver/Lehe/Derivative.def"
+#include "picongpu/traits/GetMargin.hpp"
+
+#include <pmacc/algorithms/math/defines/pi.hpp>
+#include <pmacc/dimensions/DataSpace.hpp>
+#include <pmacc/meta/accessors/Identity.hpp>
+#include <pmacc/math/Vector.hpp>
+#include <pmacc/types.hpp>
+
+
+namespace picongpu
+{
+    namespace fields
+    {
+        namespace maxwellSolver
+        {
+            namespace lehe
+            {
+                /** Functor for derivative used in the Lehe solver
+                 *
+                 * Implements eq. (6) in R. Lehe et al
+                 *     Phys. Rev. ST Accel. Beams 16, 021301 (2013)
+                 * This derivative can only be applied for the E field.
+                 *
+                 * @tparam T_cherenkovFreeDirection direction to remove numerical Cherenkov
+                 *                                  radiation in, 0 = x, 1 = y, 2 = z
+                 * @tparam T_direction direction to take derivative in, 0 = x, 1 = y, 2 = z
+                 */
+                template<uint32_t T_cherenkovFreeDirection, uint32_t T_direction>
+                struct DerivativeFunctor;
+
+                /** Functor for derivative along the Cherenkov free direction
+                 *
+                 * Implements eq. (6) in R. Lehe et al
+                 *     Phys. Rev. ST Accel. Beams 16, 021301 (2013)
+                 *
+                 * @tparam T_direction Cherenkov free direction and derivative direction,
+                 *                     0 = x, 1 = y, 2 = z
+                 */
+                template<uint32_t T_direction>
+                struct DerivativeFunctor<T_direction, T_direction>
+                {
+                private:
+                    //! Internally used derivative functor
+                    using InternalDerivativeFunctor
+                        = differentiation::DerivativeFunctor<differentiation::Forward, T_direction>;
+
+                public:
+                    /** Lower margin: we move by 1 along each direction and
+                     *  apply InternalDerivativeFunctor, add those up
+                     */
+                    using LowerMargin = typename pmacc::math::CT::add<
+                        typename pmacc::math::CT::make_Int<simDim, 1>::type,
+                        typename GetLowerMargin<InternalDerivativeFunctor>::type>::type;
+
+                    /** Upper margin: we move by 1 along each direction and
+                     *  effectively apply InternalDerivativeFunctor (for T_direction not
+                     *  literally, but structurally), add those up
+                     */
+                    using UpperMargin = typename pmacc::math::CT::add<
+                        typename pmacc::math::CT::make_Int<simDim, 1>::type,
+                        typename GetUpperMargin<InternalDerivativeFunctor>::type>::type;
+
+                    //! Create a functor
+                    HDINLINE DerivativeFunctor()
+                    {
+                        // differentiate along dir0; dir1 and dir2 are the other two directions
+                        constexpr uint32_t dir0 = T_direction;
+                        constexpr uint32_t dir1 = (dir0 + 1) % 3;
+                        constexpr uint32_t dir2 = (dir0 + 2) % 3;
+
+                        float_64 const stepRatio = cellSize[dir0] / (SPEED_OF_LIGHT * DELTA_T);
+                        float_64 const coeff = stepRatio
+                            * math::sin(pmacc::math::Pi<float_64>::halfValue * float_64(SPEED_OF_LIGHT)
+                                        * float_64(DELTA_T) / float_64(cellSize[dir0]));
+                        delta = static_cast<float_X>(0.25 * (1.0 - coeff * coeff));
+                        // for 2D the betas corresponding to z are 0
+                        float_64 const stepRatio1 = dir1 < simDim ? cellSize[dir0] / cellSize[dir1] : 0.0;
+                        float_64 const stepRatio2 = dir2 < simDim ? cellSize[dir0] / cellSize[dir2] : 0.0;
+                        float_64 const betaDir1 = 0.125 * stepRatio1 * stepRatio1;
+                        float_64 const betaDir2 = 0.125 * stepRatio2 * stepRatio2;
+                        alpha = static_cast<float_X>(1.0 - 2.0 * betaDir1 - 2.0 * betaDir2 - 3.0 * delta);
+                    }
+
+                    /** Return derivative value at the given point
+                     *
+                     * @tparam T_DataBox data box type with field data
+                     * @param data position in the data box to compute derivative at
+                     */
+                    template<typename T_DataBox>
+                    HDINLINE typename T_DataBox::ValueType operator()(T_DataBox const& data) const
+                    {
+                        // differentiate along dir0; dir1 and dir2 are the other two directions
+                        constexpr uint32_t dir0 = T_direction;
+                        constexpr uint32_t dir1 = (dir0 + 1) % 3;
+                        constexpr uint32_t dir2 = (dir0 + 2) % 3;
+
+                        // cellSize is not constexpr currently, so make our own constexpr array
+                        constexpr float_X step[3] = {CELL_WIDTH, CELL_HEIGHT, CELL_DEPTH};
+
+                        /* beta_xy and beta_xz from eq. (11), generic for any T_direction;
+                         * for 2D the betas corresponding to z are 0
+                         */
+                        constexpr float_X stepRatio1 = dir1 < simDim ? step[dir0] / step[dir1] : 0.0_X;
+                        constexpr float_X stepRatio2 = dir2 < simDim ? step[dir0] / step[dir2] : 0.0_X;
+                        constexpr float_X betaDir1 = 0.125_X * stepRatio1 * stepRatio1;
+                        constexpr float_X betaDir2 = 0.125_X * stepRatio2 * stepRatio2;
+
+                        // finite-difference expression from eq. (6), generic for any T_direction
+                        using Index = pmacc::DataSpace<simDim>;
+                        auto const secondUpperIndexDir0 = 2 * pmacc::math::basisVector<Index, dir0>();
+                        auto const lowerIndexDir0 = -pmacc::math::basisVector<Index, dir0>();
+                        auto const upperNeighborDir1 = pmacc::math::basisVector<Index, dir1>();
+                        auto const upperNeighborDir2 = pmacc::math::basisVector<Index, dir2>();
+                        InternalDerivativeFunctor forwardDerivative
+                            = differentiation::makeDerivativeFunctor<differentiation::Forward, T_direction>();
+                        return alpha * forwardDerivative(data)
+                            + betaDir1 * forwardDerivative(data.shift(upperNeighborDir1))
+                            + betaDir1 * forwardDerivative(data.shift(-upperNeighborDir1))
+                            + betaDir2 * forwardDerivative(data.shift(upperNeighborDir2))
+                            + betaDir2 * forwardDerivative(data.shift(-upperNeighborDir2))
+                            + delta * (data(secondUpperIndexDir0) - data(lowerIndexDir0)) / step[T_direction];
+                    }
+
+                private:
+                    //! alpha_x from eq. (7), generic for any T_direction
+                    float_X alpha;
+
+                    //! delta_x0 from eq. (10), generic for any T_direction
+                    float_X delta;
+                };
+
+                /** Functor for derivative not along the Cherenkov free direction
+                 *
+                 * Implements eq. (6) in R. Lehe et al
+                 *     Phys. Rev. ST Accel. Beams 16, 021301 (2013)
+                 * Implementation is separated as a few terms vanish in this case
+                 *
+                 * @tparam T_cherenkovFreeDirection direction to remove numerical Cherenkov
+                 *                                  radiation in, 0 = x, 1 = y, 2 = z
+                 * @tparam T_direction direction to take derivative in, not equal to
+                 *                     T_cherenkovFreeDirection, 0 = x, 1 = y, 2 = z
+                 */
+                template<uint32_t T_cherenkovFreeDirection, uint32_t T_direction>
+                struct DerivativeFunctor
+                {
+                    PMACC_CASSERT_MSG(
+                        _lehe_solver_cherenkov_free_direction_z_is_not_supported_for_2d,
+                        T_cherenkovFreeDirection < simDim);
+
+                    PMACC_CASSERT_MSG(
+                        _internal_error_wrong_lehe_derivative_functor_specialization,
+                        T_cherenkovFreeDirection != T_direction);
+
+                private:
+                    //! Internally used derivative functor
+                    using InternalDerivativeFunctor
+                        = differentiation::DerivativeFunctor<differentiation::Forward, T_direction>;
+
+                public:
+                    /** Lower margin: we move by 1 along T_cherenkovFreeDirection and
+                     *  apply InternalDerivativeFunctor, add those up
+                     */
+                    using LowerMargin = typename pmacc::math::CT::add<
+                        typename pmacc::math::CT::make_BasisVector<simDim, T_cherenkovFreeDirection, int>::type,
+                        typename GetLowerMargin<InternalDerivativeFunctor>::type>::type;
+
+                    /** Upper margin: we move by 1 along T_cherenkovFreeDirection and
+                     *  apply InternalDerivativeFunctor, add those up
+                     */
+                    using UpperMargin = typename pmacc::math::CT::add<
+                        typename pmacc::math::CT::make_BasisVector<simDim, T_cherenkovFreeDirection, int>::type,
+                        typename GetUpperMargin<InternalDerivativeFunctor>::type>::type;
+
+                    /** Return derivative value at the given point
+                     *
+                     * @tparam T_DataBox data box type with field data
+                     * @param data position in the data box to compute derivative at
+                     */
+                    template<typename T_DataBox>
+                    HDINLINE typename T_DataBox::ValueType operator()(T_DataBox const& data) const
+                    {
+                        /* To obtain the following scheme, consider eq. (6) for x direction
+                         * being Cherenkov-free and taking derivatives along y, z.
+                         * Then in eq. (11) delta_y = delta_z = 0, beta_yz = beta_zy = 0,
+                         * so only 3 terms are left in the derivative expression.
+                         * It is implemented generically for any T_cherenkovFreeDirection
+                         * and T_direction that are not equal to one another
+                         */
+                        constexpr float_X beta = 0.125_X;
+                        constexpr float_X alpha = 1.0_X - 2.0_X * beta;
+                        InternalDerivativeFunctor forwardDerivative
+                            = differentiation::makeDerivativeFunctor<differentiation::Forward, T_direction>();
+                        auto const upperNeighbor
+                            = pmacc::math::basisVector<pmacc::DataSpace<simDim>, T_cherenkovFreeDirection>();
+                        return alpha * forwardDerivative(data) + beta * forwardDerivative(data.shift(upperNeighbor))
+                            + beta * forwardDerivative(data.shift(-upperNeighbor));
+                    }
+                };
+
+            } // namespace lehe
+        } // namespace maxwellSolver
+
+        namespace differentiation
+        {
+            namespace traits
+            {
+                /** Functor type trait specialization for the Lehe solver derivative
+                 *
+                 * @tparam T_cherenkovFreeDirection direction to remove numerical Cherenkov
+                 *                                  radiation in, 0 = x, 1 = y, 2 = z
+                 * @tparam T_direction direction to take derivative in, 0 = x, 1 = y, 2 = z
+                 */
+                template<uint32_t T_cherenkovFreeDirection, uint32_t T_direction>
+                struct DerivativeFunctor<maxwellSolver::lehe::Derivative<T_cherenkovFreeDirection>, T_direction>
+                    : pmacc::meta::accessors::Identity<
+                          maxwellSolver::lehe::DerivativeFunctor<T_cherenkovFreeDirection, T_direction>>
+                {
+                };
+
+            } // namespace traits
+        } // namespace differentiation
+    } // namespace fields
+} // namespace picongpu
diff --git a/include/picongpu/fields/MaxwellSolver/Lehe/Lehe.def b/include/picongpu/fields/MaxwellSolver/Lehe/Lehe.def
index 65520fdd3d..80e3b049c7 100644
--- a/include/picongpu/fields/MaxwellSolver/Lehe/Lehe.def
+++ b/include/picongpu/fields/MaxwellSolver/Lehe/Lehe.def
@@ -1,4 +1,5 @@
-/* Copyright 2013-2020 Axel Huebl, Remi Lehe
+/* Copyright 2013-2021 Axel Huebl, Heiko Burau, Rene Widera, Remi Lehe,
+ *                     Sergei Bastrakov
  *
  * This file is part of PIConGPU.
  *
@@ -19,39 +20,63 @@
 
 #pragma once
 
-#include "picongpu/fields/MaxwellSolver/Lehe/Curl.def"
-#include "picongpu/fields/MaxwellSolver/Yee/Yee.def"
 #include "picongpu/fields/currentInterpolation/CurrentInterpolation.def"
+#include "picongpu/fields/differentiation/Curl.def"
+#include "picongpu/fields/MaxwellSolver/Lehe/Derivative.def"
+#include "picongpu/fields/MaxwellSolver/Yee/Yee.def"
+
+#include <cstdint>
 
 
 namespace picongpu
 {
-namespace fields
-{
-namespace maxwellSolver
-{
+    namespace fields
+    {
+        namespace maxwellSolver
+        {
+            namespace lehe
+            {
+                /* Note: Directions are kept as global names for compatibility with the
+                 * previously used solver interface
+                 */
+
+                //! Remove numerical Cherenkov radiation along x
+                constexpr uint32_t CherenkovFreeDirection_X = 0;
+
+                //! Remove numerical Cherenkov radiation along y
+                constexpr uint32_t CherenkovFreeDirection_Y = 1;
+
+                //! Remove numerical Cherenkov radiation along z
+                constexpr uint32_t CherenkovFreeDirection_Z = 2;
+
+                /** Curl to be applied for the E field
+                 *
+                 * @tparam T_cherenkovFreeDirection direction to remove numerical Cherenkov
+                 *                                  radiation in, 0 = x, 1 = y, 2 = z
+                 */
+                template<uint32_t T_cherenkovFreeDirection>
+                using CurlE = differentiation::Curl<Derivative<T_cherenkovFreeDirection>>;
+
+            } // namespace lehe
+
+            /** modified Yee solver
+             *
+             * Reference: R. Lehe et al
+             *            Phys. Rev. ST Accel. Beams 16, 021301 (2013)
+             *
+             * @tparam T_cherenkovFreeDir the direction (axis) which should be free of Cherenkov radiation
+             *                            0 = x, 1 = y, 2 = z
+             */
+            template<
+                typename T_CurrentInterpolation = currentInterpolation::None,
+                uint32_t T_cherenkovFreeDir = lehe::CherenkovFreeDirection_Y>
+            using Lehe
+                = ::picongpu::fields::maxwellSolver::Yee<T_CurrentInterpolation, lehe::CurlE<T_cherenkovFreeDir>>;
+
+            /* We need no definition of margins, because the Yee solver uses its curl
+             * classes to define margins
+             */
 
-    /** modified Yee solver
-     *
-     * Reference: R. Lehe et al
-     *            Phys. Rev. ST Accel. Beams 16, 021301 (2013)
-     *
-     * @tparam T_CherenkovFreeDir the direction (axis) which should be free of cherenkov radiation
-     *                            valid types: lehe::CherenkovFreeDirection_Y, lehe::CherenkovFreeDirection_Y
-     */
-    template<
-        typename T_CurrentInterpolation = currentInterpolation::None,
-        typename T_CherenkovFreeDir = lehe::CherenkovFreeDirection_Y
-    >
-    using Lehe = ::picongpu::fields::maxwellSolver::Yee<
-        T_CurrentInterpolation,
-        lehe::CurlE< T_CherenkovFreeDir >
-    >;
-
-    /* we need no definition of margins, because the YeeSolver uses its curl
-     * classes to define margins
-     */
-
-} // namespace maxwellSolver
-} // namespace fields
+        } // namespace maxwellSolver
+    } // namespace fields
 } // namespace picongpu
diff --git a/include/picongpu/fields/MaxwellSolver/Lehe/Lehe.hpp b/include/picongpu/fields/MaxwellSolver/Lehe/Lehe.hpp
index 7134f433c7..47f18c1c80 100644
--- a/include/picongpu/fields/MaxwellSolver/Lehe/Lehe.hpp
+++ b/include/picongpu/fields/MaxwellSolver/Lehe/Lehe.hpp
@@ -1,4 +1,5 @@
-/* Copyright 2013-2020 Axel Huebl, Heiko Burau, Rene Widera, Remi Lehe
+/* Copyright 2013-2021 Axel Huebl, Heiko Burau, Rene Widera, Remi Lehe,
+ *                     Sergei Bastrakov
  *
  * This file is part of PIConGPU.
  *
@@ -17,40 +18,31 @@
  * If not, see <http://www.gnu.org/licenses/>.
  */
 
-
-
 #pragma once
 
-#include "picongpu/fields/MaxwellSolver/Lehe/Lehe.def"
-#include "picongpu/fields/MaxwellSolver/Lehe/Curl.hpp"
 #include "picongpu/simulation_defines.hpp"
+#include "picongpu/fields/MaxwellSolver/Lehe/Lehe.def"
+#include "picongpu/fields/MaxwellSolver/Lehe/Derivative.hpp"
+
+#include <cstdint>
+
 
 namespace pmacc
 {
-namespace traits
-{
-    template<
-        typename T_CurrentInterpolation,
-        typename T_CherenkovFreeDir
-    >
-    struct StringProperties<
-        ::picongpu::fields::maxwellSolver::Lehe<
-            T_CurrentInterpolation,
-            T_CherenkovFreeDir
-        >
-    >
+    namespace traits
     {
-        static StringProperty get()
+        template<typename T_CurrentInterpolation, uint32_t T_cherenkovFreeDir>
+        struct StringProperties<::picongpu::fields::maxwellSolver::Lehe<T_CurrentInterpolation, T_cherenkovFreeDir>>
         {
-            auto propList =
-                ::picongpu::fields::maxwellSolver::Lehe<
-                    T_CurrentInterpolation,
-                    T_CherenkovFreeDir
-                >::getStringProperties();
-            // overwrite the name of the yee solver (inherit all other properties)
-            propList["name"].value = "Lehe";
-            return propList;
-        }
-    };
-} // namespace traits
+            static StringProperty get()
+            {
+                auto propList = ::picongpu::fields::maxwellSolver::Lehe<T_CurrentInterpolation, T_cherenkovFreeDir>::
+                    getStringProperties();
+                // overwrite the name of the Yee solver (inherit all other properties)
+                propList["name"].value = "Lehe";
+                return propList;
+            }
+        };
+
+    } // namespace traits
 } // namespace pmacc
diff --git a/include/picongpu/fields/MaxwellSolver/LehePML/LehePML.def b/include/picongpu/fields/MaxwellSolver/LehePML/LehePML.def
new file mode 100644
index 0000000000..10410b3390
--- /dev/null
+++ b/include/picongpu/fields/MaxwellSolver/LehePML/LehePML.def
@@ -0,0 +1,56 @@
+/* Copyright 2013-2021 Axel Huebl, Heiko Burau, Rene Widera, Remi Lehe,
+ *                     Sergei Bastrakov
+ *
+ * This file is part of PIConGPU.
+ *
+ * PIConGPU is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * PIConGPU is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with PIConGPU.
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "picongpu/fields/currentInterpolation/CurrentInterpolation.def"
+#include "picongpu/fields/MaxwellSolver/Lehe/Lehe.def"
+#include "picongpu/fields/MaxwellSolver/YeePML/YeePML.def"
+
+#include <cstdint>
+
+
+namespace picongpu
+{
+    namespace fields
+    {
+        namespace maxwellSolver
+        {
+            /** modified Yee solver with PML absorber
+             *
+             * Reference: R. Lehe et al
+             *            Phys. Rev. ST Accel. Beams 16, 021301 (2013)
+             *
+             * @tparam T_cherenkovFreeDir the direction (axis) which should be free of Cherenkov radiation
+             *                            0 = x, 1 = y, 2 = z
+             */
+            template<
+                typename T_CurrentInterpolation = currentInterpolation::None,
+                uint32_t T_cherenkovFreeDir = lehe::CherenkovFreeDirection_Y>
+            using LehePML
+                = ::picongpu::fields::maxwellSolver::YeePML<T_CurrentInterpolation, lehe::CurlE<T_cherenkovFreeDir>>;
+
+            /* We need no definition of margins, because the YeePML solver uses its curl
+             * classes to define margins
+             */
+
+        } // namespace maxwellSolver
+    } // namespace fields
+} // namespace picongpu
diff --git a/include/picongpu/fields/MaxwellSolver/LehePML/LehePML.hpp b/include/picongpu/fields/MaxwellSolver/LehePML/LehePML.hpp
new file mode 100644
index 0000000000..dcb55b674b
--- /dev/null
+++ b/include/picongpu/fields/MaxwellSolver/LehePML/LehePML.hpp
@@ -0,0 +1,48 @@
+/* Copyright 2013-2021 Axel Huebl, Heiko Burau, Rene Widera, Remi Lehe,
+ *                     Sergei Bastrakov
+ *
+ * This file is part of PIConGPU.
+ *
+ * PIConGPU is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * PIConGPU is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with PIConGPU.
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "picongpu/simulation_defines.hpp"
+#include "picongpu/fields/MaxwellSolver/Lehe/Lehe.hpp"
+#include "picongpu/fields/MaxwellSolver/LehePML/LehePML.def"
+
+#include <cstdint>
+
+
+namespace pmacc
+{
+    namespace traits
+    {
+        template<typename T_CurrentInterpolation, uint32_t T_cherenkovFreeDir>
+        struct StringProperties<::picongpu::fields::maxwellSolver::LehePML<T_CurrentInterpolation, T_cherenkovFreeDir>>
+        {
+            static StringProperty get()
+            {
+                auto propList = ::picongpu::fields::maxwellSolver::
+                    LehePML<T_CurrentInterpolation, T_cherenkovFreeDir>::getStringProperties();
+                // overwrite the name of the solver (inherit all other properties)
+                propList["name"].value = "Lehe";
+                return propList;
+            }
+        };
+
+    } // namespace traits
+} // namespace pmacc
diff --git a/include/picongpu/fields/MaxwellSolver/None/None.def b/include/picongpu/fields/MaxwellSolver/None/None.def
index 6c7402cb82..0036d3f63c 100644
--- a/include/picongpu/fields/MaxwellSolver/None/None.def
+++ b/include/picongpu/fields/MaxwellSolver/None/None.def
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Heiko Burau, Rene Widera
+/* Copyright 2013-2021 Axel Huebl, Heiko Burau, Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -25,45 +25,31 @@
 
 namespace picongpu
 {
-namespace fields
-{
-namespace maxwellSolver
-{
-
-    template< typename T_CurrentInterpolation = currentInterpolation::None >
-    class None;
-
-} // namespace maxwellSolver
-} // namespace fields
-
-namespace traits
-{
-
-    template< typename T_CurrentInterpolation >
-    struct GetMargin<
-        picongpu::fields::maxwellSolver::None< T_CurrentInterpolation >,
-        FIELD_B
-    >
+    namespace fields
     {
-        using LowerMargin = typename pmacc::math::CT::make_Int<
-            simDim,
-            0
-        >::type;
-        using UpperMargin = LowerMargin;
-    };
+        namespace maxwellSolver
+        {
+            template<typename T_CurrentInterpolation = currentInterpolation::None>
+            class None;
 
-    template< typename T_CurrentInterpolation >
-    struct GetMargin<
-        picongpu::fields::maxwellSolver::None< T_CurrentInterpolation >,
-        FIELD_E
-    >
-    {
-        using LowerMargin = typename pmacc::math::CT::make_Int<
-            simDim,
-            0
-        >::type;
-        using UpperMargin = LowerMargin;
-    };
+        } // namespace maxwellSolver
+    } // namespace fields
 
-} // namespace traits
+    namespace traits
+    {
+        template<typename T_CurrentInterpolation>
+        struct GetMargin<picongpu::fields::maxwellSolver::None<T_CurrentInterpolation>, FIELD_B>
+        {
+            using LowerMargin = typename pmacc::math::CT::make_Int<simDim, 0>::type;
+            using UpperMargin = LowerMargin;
+        };
+
+        template<typename T_CurrentInterpolation>
+        struct GetMargin<picongpu::fields::maxwellSolver::None<T_CurrentInterpolation>, FIELD_E>
+        {
+            using LowerMargin = typename pmacc::math::CT::make_Int<simDim, 0>::type;
+            using UpperMargin = LowerMargin;
+        };
+
+    } // namespace traits
 } // namespace picongpu
diff --git a/include/picongpu/fields/MaxwellSolver/None/None.hpp b/include/picongpu/fields/MaxwellSolver/None/None.hpp
index 4335a5341f..b3384cec95 100644
--- a/include/picongpu/fields/MaxwellSolver/None/None.hpp
+++ b/include/picongpu/fields/MaxwellSolver/None/None.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Heiko Burau, Rene Widera
+/* Copyright 2013-2021 Axel Huebl, Heiko Burau, Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -28,66 +28,66 @@
 
 namespace picongpu
 {
-namespace fields
-{
-namespace maxwellSolver
-{
-namespace none
-{
-    /** Check Yee grid and time conditions
-     *
-     * This is a workaround that the condition check is only
-     * triggered if the current used solver is `NoSolver`
-     */
-    template<typename T_UsedSolver, typename T_Dummy = void>
-    struct ConditionCheck
-    {
-    };
-
-    template<typename T_CurrentInterpolation, typename T_Dummy>
-    struct ConditionCheck<
-        None< T_CurrentInterpolation > ,
-        T_Dummy
-    >
+    namespace fields
     {
-        /* Courant-Friedrichs-Levy-Condition for Yee Field Solver: */
-        PMACC_CASSERT_MSG(Courant_Friedrichs_Levy_condition_failure____check_your_grid_param_file,
-            (SPEED_OF_LIGHT*SPEED_OF_LIGHT*DELTA_T*DELTA_T*INV_CELL2_SUM)<=1.0);
-    };
-} // namespace none
-
-    template< typename T_CurrentInterpolation >
-    class None : private none::ConditionCheck< None< T_CurrentInterpolation> >
-    {
-    private:
-        typedef MappingDesc::SuperCellSize SuperCellSize;
-
-    public:
-        using CellType = cellType::Yee;
-        using CurrentInterpolation = T_CurrentInterpolation;
-
-        None(MappingDesc)
+        namespace maxwellSolver
         {
-
-        }
-
-        void update_beforeCurrent(uint32_t)
-        {
-
-        }
-
-        void update_afterCurrent(uint32_t)
-        {
-
-        }
-
-        static pmacc::traits::StringProperty getStringProperties()
-        {
-            pmacc::traits::StringProperty propList( "name", "none" );
-            return propList;
-        }
-    };
-
-} // namespace maxwellSolver
-} // namespace fields
+            namespace none
+            {
+                /** Check Yee grid and time conditions
+                 *
+                 * This is a workaround so that the condition check is only
+                 * triggered if the currently used solver is `None`
+                 */
+                template<typename T_UsedSolver, typename T_Dummy = void>
+                struct ConditionCheck
+                {
+                };
+
+                template<typename T_CurrentInterpolation, typename T_Dummy>
+                struct ConditionCheck<None<T_CurrentInterpolation>, T_Dummy>
+                {
+                    /* Courant-Friedrichs-Levy-Condition for Yee Field Solver:
+                     *
+                     * A workaround is to add a template dependency to the expression.
+                     * `sizeof(ANY_TYPE*) != 0` is always true and defers the evaluation.
+                     */
+                    PMACC_CASSERT_MSG(
+                        Courant_Friedrichs_Levy_condition_failure____check_your_grid_param_file,
+                        (SPEED_OF_LIGHT * SPEED_OF_LIGHT * DELTA_T * DELTA_T * INV_CELL2_SUM) <= 1.0
+                            && sizeof(T_Dummy*) != 0);
+                };
+            } // namespace none
+
+            template<typename T_CurrentInterpolation>
+            class None : private none::ConditionCheck<None<T_CurrentInterpolation>>
+            {
+            private:
+                typedef MappingDesc::SuperCellSize SuperCellSize;
+
+            public:
+                using CellType = cellType::Yee;
+                using CurrentInterpolation = T_CurrentInterpolation;
+
+                None(MappingDesc)
+                {
+                }
+
+                void update_beforeCurrent(uint32_t)
+                {
+                }
+
+                void update_afterCurrent(uint32_t)
+                {
+                }
+
+                static pmacc::traits::StringProperty getStringProperties()
+                {
+                    pmacc::traits::StringProperty propList("name", "none");
+                    return propList;
+                }
+            };
+
+        } // namespace maxwellSolver
+    } // namespace fields
 } // namespace picongpu
diff --git a/include/picongpu/fields/MaxwellSolver/Solvers.def b/include/picongpu/fields/MaxwellSolver/Solvers.def
index 815871126c..1f37df1af2 100644
--- a/include/picongpu/fields/MaxwellSolver/Solvers.def
+++ b/include/picongpu/fields/MaxwellSolver/Solvers.def
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Rene Widera
+/* Copyright 2013-2021 Axel Huebl, Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -17,16 +17,12 @@
  * If not, see <http://www.gnu.org/licenses/>.
  */
 
-
-
 #pragma once
 
 #include "picongpu/fields/MaxwellSolver/None/None.def"
 #include "picongpu/fields/MaxwellSolver/Yee/Yee.def"
 #include "picongpu/fields/MaxwellSolver/YeePML/YeePML.def"
-#if (SIMDIM==3)
 #include "picongpu/fields/MaxwellSolver/Lehe/Lehe.def"
-#if( PMACC_CUDA_ENABLED == 1 )
-#   include "picongpu/fields/MaxwellSolver/DirSplitting/DirSplitting.def"
-#endif
-#endif
+#include "picongpu/fields/MaxwellSolver/LehePML/LehePML.def"
+#include "picongpu/fields/MaxwellSolver/ArbitraryOrderFDTD/ArbitraryOrderFDTD.def"
+#include "picongpu/fields/MaxwellSolver/ArbitraryOrderFDTDPML/ArbitraryOrderFDTDPML.def"
diff --git a/include/picongpu/fields/MaxwellSolver/Solvers.hpp b/include/picongpu/fields/MaxwellSolver/Solvers.hpp
index 8994dbf417..4b1608c390 100644
--- a/include/picongpu/fields/MaxwellSolver/Solvers.hpp
+++ b/include/picongpu/fields/MaxwellSolver/Solvers.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Rene Widera
+/* Copyright 2013-2021 Axel Huebl, Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -17,16 +17,12 @@
  * If not, see <http://www.gnu.org/licenses/>.
  */
 
-
-
 #pragma once
 
 #include "picongpu/fields/MaxwellSolver/None/None.hpp"
 #include "picongpu/fields/MaxwellSolver/Yee/Yee.hpp"
 #include "picongpu/fields/MaxwellSolver/YeePML/YeePML.hpp"
-#if (SIMDIM==3)
 #include "picongpu/fields/MaxwellSolver/Lehe/Lehe.hpp"
-#if( PMACC_CUDA_ENABLED == 1 )
-#   include "picongpu/fields/MaxwellSolver/DirSplitting/DirSplitting.hpp"
-#endif
-#endif
+#include "picongpu/fields/MaxwellSolver/LehePML/LehePML.hpp"
+#include "picongpu/fields/MaxwellSolver/ArbitraryOrderFDTD/ArbitraryOrderFDTD.hpp"
+#include "picongpu/fields/MaxwellSolver/ArbitraryOrderFDTDPML/ArbitraryOrderFDTDPML.hpp"
diff --git a/include/picongpu/fields/MaxwellSolver/Yee/Curl.def b/include/picongpu/fields/MaxwellSolver/Yee/Curl.def
deleted file mode 100644
index f5bca11df4..0000000000
--- a/include/picongpu/fields/MaxwellSolver/Yee/Curl.def
+++ /dev/null
@@ -1,46 +0,0 @@
-/* Copyright 2013-2020 Axel Huebl, Heiko Burau, Rene Widera
- *
- * This file is part of PIConGPU.
- *
- * PIConGPU is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * PIConGPU is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with PIConGPU.
- * If not, see <http://www.gnu.org/licenses/>.
- */
-
-#pragma once
-
-#include "picongpu/algorithms/DifferenceToUpper.def"
-#include "picongpu/algorithms/DifferenceToLower.def"
-
-#include <pmacc/types.hpp>
-
-
-namespace picongpu
-{
-namespace fields
-{
-namespace maxwellSolver
-{
-namespace yee
-{
-
-    template< typename Difference >
-    struct Curl;
-
-    using CurlLeft = Curl< DifferenceToLower< simDim > >;
-    using CurlRight = Curl< DifferenceToUpper< simDim > >;
-
-} // namespace yee
-} // namespace maxwellSolver
-} // namespace fields
-} // namespace picongpu
diff --git a/include/picongpu/fields/MaxwellSolver/Yee/Curl.hpp b/include/picongpu/fields/MaxwellSolver/Yee/Curl.hpp
deleted file mode 100644
index 5023c37f58..0000000000
--- a/include/picongpu/fields/MaxwellSolver/Yee/Curl.hpp
+++ /dev/null
@@ -1,63 +0,0 @@
-/* Copyright 2013-2020 Axel Huebl, Heiko Burau, Rene Widera
- *
- * This file is part of PIConGPU.
- *
- * PIConGPU is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * PIConGPU is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with PIConGPU.
- * If not, see <http://www.gnu.org/licenses/>.
- */
-
-#pragma once
-
-#include "picongpu/algorithms/DifferenceToUpper.hpp"
-#include "picongpu/algorithms/DifferenceToLower.hpp"
-#include "picongpu/fields/MaxwellSolver/Yee/Curl.def"
-
-#include <pmacc/types.hpp>
-
-
-namespace picongpu
-{
-namespace fields
-{
-namespace maxwellSolver
-{
-namespace yee
-{
-
-    template< typename T_Difference >
-    struct Curl
-    {
-        using Difference = T_Difference;
-        using LowerMargin = typename Difference::OffsetOrigin;
-        using UpperMargin = typename Difference::OffsetEnd;
-
-        template<class Memory >
-        HDINLINE typename Memory::ValueType operator()( Memory const & mem ) const
-        {
-            const typename Difference::template GetDifference< 0 > Dx;
-            const typename Difference::template GetDifference< 1 > Dy;
-            const typename Difference::template GetDifference< 2 > Dz;
-
-            return float3_X(
-                Dy( mem ).z() - Dz( mem ).y(),
-                Dz( mem ).x() - Dx( mem ).z(),
-                Dx( mem ).y() - Dy( mem ).x()
-            );
-        }
-    };
-
-} // namespace yee
-} // namespace maxwellSolver
-} // namespace fields
-} // namespace picongpu
diff --git a/include/picongpu/fields/MaxwellSolver/Yee/Yee.def b/include/picongpu/fields/MaxwellSolver/Yee/Yee.def
index 0274616cca..f9442aba06 100644
--- a/include/picongpu/fields/MaxwellSolver/Yee/Yee.def
+++ b/include/picongpu/fields/MaxwellSolver/Yee/Yee.def
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Heiko Burau, Rene Widera
+/* Copyright 2013-2021 Axel Huebl, Heiko Burau, Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -19,64 +19,30 @@
 
 #pragma once
 
-#include "picongpu/fields/MaxwellSolver/Yee/Curl.def"
 #include "picongpu/fields/currentInterpolation/CurrentInterpolation.def"
+#include "picongpu/fields/differentiation/Curl.def"
+#include "picongpu/fields/differentiation/Derivative.def"
 
 
 namespace picongpu
 {
-namespace fields
-{
-namespace maxwellSolver
-{
-
-    template<
-        typename T_CurrentInterpolation = currentInterpolation::None,
-        typename CurlE = yee::CurlRight,
-        typename CurlB = yee::CurlLeft
-    >
-    class Yee;
-
-} // namespace maxwellSolver
-} // namespace fields
-
-namespace traits
-{
-
-    template<
-        typename T_CurrentInterpolation,
-        class CurlE,
-        class CurlB
-    >
-    struct GetMargin<
-        picongpu::fields::maxwellSolver::Yee<
-            T_CurrentInterpolation,
-            CurlE,
-            CurlB
-        >, FIELD_B
-    >
+    namespace fields
     {
-        using LowerMargin = typename CurlB::LowerMargin;
-        using UpperMargin = typename CurlB::UpperMargin;
-    };
-
-    template<
-        typename T_CurrentInterpolation,
-        class CurlE,
-        class CurlB
-    >
-    struct GetMargin<
-        picongpu::fields::maxwellSolver::Yee<
-            T_CurrentInterpolation,
-            CurlE,
-            CurlB
-        >,
-        FIELD_E
-    >
-    {
-        using LowerMargin = typename CurlE::LowerMargin;
-        using UpperMargin = typename CurlE::UpperMargin;
-    };
-
-} //namespace traits
+        namespace maxwellSolver
+        {
+            namespace yee
+            {
+                using CurlLeft = differentiation::Curl<differentiation::Backward>; // default CurlB of Yee
+                using CurlRight = differentiation::Curl<differentiation::Forward>; // default CurlE of Yee
+
+            } // namespace yee
+
+            template<
+                typename T_CurrentInterpolation = currentInterpolation::None,
+                typename CurlE = yee::CurlRight,
+                typename CurlB = yee::CurlLeft>
+            class Yee;
+
+        } // namespace maxwellSolver
+    } // namespace fields
 } // namespace picongpu
diff --git a/include/picongpu/fields/MaxwellSolver/Yee/Yee.hpp b/include/picongpu/fields/MaxwellSolver/Yee/Yee.hpp
index cf09534251..699454f1c8 100644
--- a/include/picongpu/fields/MaxwellSolver/Yee/Yee.hpp
+++ b/include/picongpu/fields/MaxwellSolver/Yee/Yee.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Heiko Burau, Rene Widera, Benjamin Worpitz
+/* Copyright 2013-2021 Axel Huebl, Heiko Burau, Rene Widera, Benjamin Worpitz
  *
  * This file is part of PIConGPU.
  *
@@ -21,13 +21,14 @@
 
 #include "picongpu/simulation_defines.hpp"
 #include "picongpu/fields/MaxwellSolver/Yee/Yee.def"
-#include "picongpu/fields/MaxwellSolver/Yee/Curl.hpp"
 #include "picongpu/fields/absorber/ExponentialDamping.hpp"
 #include "picongpu/fields/FieldE.hpp"
 #include "picongpu/fields/FieldB.hpp"
 #include "picongpu/fields/MaxwellSolver/Yee/Yee.kernel"
 #include "picongpu/fields/cellType/Yee.hpp"
 #include "picongpu/fields/LaserPhysics.hpp"
+#include "picongpu/fields/differentiation/Curl.hpp"
+#include "picongpu/traits/GetMargin.hpp"
 
 #include <pmacc/nvidia/functors/Assign.hpp>
 #include <pmacc/mappings/threads/ThreadCollective.hpp>
@@ -37,135 +38,135 @@
 
 namespace picongpu
 {
-namespace fields
-{
-namespace maxwellSolver
-{
-
-    template<
-        typename T_CurrentInterpolation,
-        class CurlE,
-        class CurlB
-    >
-    class Yee
+    namespace fields
     {
-    private:
-        typedef MappingDesc::SuperCellSize SuperCellSize;
-
-
-        std::shared_ptr< FieldE > fieldE;
-        std::shared_ptr< FieldB > fieldB;
-        MappingDesc m_cellDescription;
-
-        template<uint32_t AREA>
-        void updateE()
-        {
-            /* Courant-Friedrichs-Levy-Condition for Yee Field Solver: */
-            PMACC_CASSERT_MSG(Courant_Friedrichs_Levy_condition_failure____check_your_grid_param_file,
-                (SPEED_OF_LIGHT*SPEED_OF_LIGHT*DELTA_T*DELTA_T*INV_CELL2_SUM)<=1.0);
-
-            typedef SuperCellDescription<
-                    SuperCellSize,
-                    typename CurlB::LowerMargin,
-                    typename CurlB::UpperMargin
-                    > BlockArea;
-
-            AreaMapping<AREA, MappingDesc> mapper(m_cellDescription);
-
-            constexpr uint32_t numWorkers = pmacc::traits::GetNumWorkers<
-                pmacc::math::CT::volume< SuperCellSize >::type::value
-            >::value;
-
-            PMACC_KERNEL(yee::KernelUpdateE< numWorkers, BlockArea >{ })
-                ( mapper.getGridDim(), numWorkers )(
-                    CurlB( ),
-                    this->fieldE->getDeviceDataBox(),
-                    this->fieldB->getDeviceDataBox(),
-                    mapper
-                );
-        }
-
-        template<uint32_t AREA>
-        void updateBHalf()
+        namespace maxwellSolver
         {
-            typedef SuperCellDescription<
-                    SuperCellSize,
-                    typename CurlE::LowerMargin,
-                    typename CurlE::UpperMargin
-                    > BlockArea;
-
-            AreaMapping<AREA, MappingDesc> mapper(m_cellDescription);
-
-            constexpr uint32_t numWorkers = pmacc::traits::GetNumWorkers<
-                pmacc::math::CT::volume< SuperCellSize >::type::value
-            >::value;
-
-            PMACC_KERNEL(yee::KernelUpdateBHalf< numWorkers, BlockArea >{ })
-                ( mapper.getGridDim(), numWorkers )(
-                    CurlE( ),
-                    this->fieldB->getDeviceDataBox(),
-                    this->fieldE->getDeviceDataBox(),
-                    mapper
-                );
-        }
-
-    public:
-
-        using CellType = cellType::Yee;
-        using CurrentInterpolation = T_CurrentInterpolation;
-
-        Yee(MappingDesc cellDescription) : m_cellDescription(cellDescription)
+            template<typename T_CurrentInterpolation, class CurlE, class CurlB>
+            class Yee // CurlE is applied to E (B update), CurlB is applied to B (E update)
+            {
+            private:
+                typedef MappingDesc::SuperCellSize SuperCellSize;
+
+
+                std::shared_ptr<FieldE> fieldE;
+                std::shared_ptr<FieldB> fieldB;
+                MappingDesc m_cellDescription;
+
+                template<uint32_t AREA>
+                void updateE() // E += CurlB(B) * c^2 * dt on AREA, see yee::KernelUpdateE
+                {
+                    /* Courant-Friedrichs-Levy-Condition for Yee Field Solver:
+                     *
+                     * The condition is tied to a template parameter on purpose, as a workaround:
+                     * `sizeof(ANY_TYPE*) != 0` is always true and defers the evaluation.
+                     */
+                    PMACC_CASSERT_MSG(
+                        Courant_Friedrichs_Levy_condition_failure____check_your_grid_param_file,
+                        (SPEED_OF_LIGHT * SPEED_OF_LIGHT * DELTA_T * DELTA_T * INV_CELL2_SUM) <= 1.0
+                            && sizeof(T_CurrentInterpolation*) != 0);
+
+                    typedef SuperCellDescription<
+                        SuperCellSize,
+                        typename traits::GetLowerMargin<CurlB>::type,
+                        typename traits::GetUpperMargin<CurlB>::type>
+                        BlockArea;
+
+                    AreaMapping<AREA, MappingDesc> mapper(m_cellDescription);
+
+                    constexpr uint32_t numWorkers
+                        = pmacc::traits::GetNumWorkers<pmacc::math::CT::volume<SuperCellSize>::type::value>::value;
+
+                    PMACC_KERNEL(yee::KernelUpdateE<numWorkers, BlockArea>{})
+                    (mapper.getGridDim(),
+                     numWorkers)(CurlB(), this->fieldE->getDeviceDataBox(), this->fieldB->getDeviceDataBox(), mapper);
+                }
+
+                template<uint32_t AREA>
+                void updateBHalf() // B -= CurlE(E) * 0.5 * dt on AREA, see yee::KernelUpdateBHalf
+                {
+                    typedef SuperCellDescription<
+                        SuperCellSize,
+                        typename traits::GetLowerMargin<CurlE>::type,
+                        typename traits::GetUpperMargin<CurlE>::type>
+                        BlockArea; // margins are looked up through the same traits as in updateE()
+
+                    AreaMapping<AREA, MappingDesc> mapper(m_cellDescription);
+
+                    constexpr uint32_t numWorkers
+                        = pmacc::traits::GetNumWorkers<pmacc::math::CT::volume<SuperCellSize>::type::value>::value;
+
+                    PMACC_KERNEL(yee::KernelUpdateBHalf<numWorkers, BlockArea>{})
+                    (mapper.getGridDim(),
+                     numWorkers)(CurlE(), this->fieldB->getDeviceDataBox(), this->fieldE->getDeviceDataBox(), mapper);
+                }
+
+            public:
+                using CellType = cellType::Yee;
+                using CurrentInterpolation = T_CurrentInterpolation;
+
+                Yee(MappingDesc cellDescription) : m_cellDescription(cellDescription)
+                {
+                    DataConnector& dc = Environment<>::get().DataConnector();
+
+                    this->fieldE = dc.get<FieldE>(FieldE::getName(), true);
+                    this->fieldB = dc.get<FieldB>(FieldB::getName(), true);
+                }
+
+                void update_beforeCurrent(uint32_t) // half B step on CORE + BORDER, then the E step
+                {
+                    updateBHalf<CORE + BORDER>();
+                    EventTask eRfieldB = fieldB->asyncCommunication(__getTransactionEvent());
+
+                    updateE<CORE>(); // issued before eRfieldB is set as the transaction event
+                    __setTransactionEvent(eRfieldB); // presumably makes the BORDER update wait for the B exchange
+                    updateE<BORDER>();
+                }
+
+                void update_afterCurrent(uint32_t currentStep) // absorber on E, laser, half B step, absorber on B
+                {
+                    using Absorber = absorber::ExponentialDamping;
+                    Absorber::run(currentStep, this->m_cellDescription, this->fieldE->getDeviceDataBox());
+                    if(laserProfiles::Selected::INIT_TIME > float_X(0.0)) // laser only runs for a positive INIT_TIME
+                        LaserPhysics{}(currentStep);
+
+                    EventTask eRfieldE = fieldE->asyncCommunication(__getTransactionEvent());
+
+                    updateBHalf<CORE>(); // issued before eRfieldE is set as the transaction event
+                    __setTransactionEvent(eRfieldE); // presumably makes the BORDER update wait for the E exchange
+                    updateBHalf<BORDER>();
+
+                    Absorber::run(currentStep, this->m_cellDescription, fieldB->getDeviceDataBox());
+
+                    EventTask eRfieldB = fieldB->asyncCommunication(__getTransactionEvent());
+                    __setTransactionEvent(eRfieldB);
+                }
+
+                static pmacc::traits::StringProperty getStringProperties() // reports the solver name "Yee"
+                {
+                    pmacc::traits::StringProperty propList("name", "Yee");
+                    return propList;
+                }
+            };
+
+        } // namespace maxwellSolver
+    } // namespace fields
+
+    namespace traits
+    {
+        template<typename T_CurrentInterpolation, class CurlE, class CurlB>
+        struct GetMargin<picongpu::fields::maxwellSolver::Yee<T_CurrentInterpolation, CurlE, CurlB>, FIELD_B>
         {
-            DataConnector &dc = Environment<>::get().DataConnector();
+            using LowerMargin = typename CurlB::LowerMargin;
+            using UpperMargin = typename CurlB::UpperMargin;
+        };
 
-            this->fieldE = dc.get< FieldE >( FieldE::getName(), true );
-            this->fieldB = dc.get< FieldB >( FieldB::getName(), true );
-        }
-
-        void update_beforeCurrent(uint32_t)
+        template<typename T_CurrentInterpolation, class CurlE, class CurlB>
+        struct GetMargin<picongpu::fields::maxwellSolver::Yee<T_CurrentInterpolation, CurlE, CurlB>, FIELD_E>
         {
-            updateBHalf < CORE+BORDER >();
-            EventTask eRfieldB = fieldB->asyncCommunication(__getTransactionEvent());
-
-            updateE<CORE>();
-            __setTransactionEvent(eRfieldB);
-            updateE<BORDER>();
-        }
+            using LowerMargin = typename CurlE::LowerMargin;
+            using UpperMargin = typename CurlE::UpperMargin;
+        };
 
-        void update_afterCurrent(uint32_t currentStep)
-        {
-            using Absorber = absorber::ExponentialDamping;
-            Absorber::run(
-                currentStep,
-                this->m_cellDescription,
-                this->fieldE->getDeviceDataBox()
-            );
-            if (laserProfiles::Selected::INIT_TIME > float_X(0.0))
-                LaserPhysics{}(currentStep);
-
-            EventTask eRfieldE = fieldE->asyncCommunication(__getTransactionEvent());
-
-            updateBHalf < CORE> ();
-            __setTransactionEvent(eRfieldE);
-            updateBHalf < BORDER > ();
-
-            Absorber::run(
-                currentStep,
-                this->m_cellDescription,
-                fieldB->getDeviceDataBox()
-            );
-
-            EventTask eRfieldB = fieldB->asyncCommunication(__getTransactionEvent());
-            __setTransactionEvent(eRfieldB);
-        }
-
-        static pmacc::traits::StringProperty getStringProperties()
-        {
-            pmacc::traits::StringProperty propList( "name", "Yee" );
-            return propList;
-        }
-    };
-
-} // namespace maxwellSolver
-} // namespace fields
-} // picongpu
+    } // namespace traits
+} // namespace picongpu
diff --git a/include/picongpu/fields/MaxwellSolver/Yee/Yee.kernel b/include/picongpu/fields/MaxwellSolver/Yee/Yee.kernel
index ca90edb75d..82365a5515 100644
--- a/include/picongpu/fields/MaxwellSolver/Yee/Yee.kernel
+++ b/include/picongpu/fields/MaxwellSolver/Yee/Yee.kernel
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Heiko Burau, Rene Widera, Marco Garten
+/* Copyright 2013-2021 Axel Huebl, Heiko Burau, Rene Widera, Marco Garten
  *
  * This file is part of PIConGPU.
  *
@@ -28,212 +28,148 @@
 
 namespace picongpu
 {
-namespace fields
-{
-namespace maxwellSolver
-{
-namespace yee
-{
-    using namespace pmacc;
-
-    /** compute electric field
-     *
-     * @tparam T_numWorkers number of workers
-     * @tparam T_BlockDescription field (electric and magnetic) domain description
-     */
-    template<
-        uint32_t T_workers,
-        typename T_BlockDescription
-    >
-    struct KernelUpdateE
+    namespace fields
     {
-        /** update electric field
-         *
-         * @tparam T_Curl curl functor type
-         * @tparam T_EBox pmacc::DataBox, electric field box type
-         * @tparam T_BBox pmacc::DataBox, magnetic field box type
-         * @tparam T_Mapping mapper functor type
-         * @tparam T_Acc alpaka accelerator type
-         *
-         * @param acc alpaka accelerator
-         * @param curl functor to calculate the electric field, interface must be
-         *             `operator()(T_BBox)`
-         * @param fieldE electric field iterator
-         * @param fieldB magnetic field iterator
-         * @param mapper functor to map a block to a supercell
-         */
-        template<
-            typename T_Curl,
-            typename T_EBox,
-            typename T_BBox,
-            typename T_Mapping,
-            typename T_Acc
-        >
-        DINLINE void operator()(
-            T_Acc const & acc,
-            T_Curl const curl,
-            T_EBox fieldE,
-            T_BBox const fieldB,
-            T_Mapping mapper
-        ) const
+        namespace maxwellSolver
         {
-            using namespace mappings::threads;
-
-            constexpr uint32_t cellsPerSuperCell = pmacc::math::CT::volume< SuperCellSize >::type::value;
-            constexpr uint32_t numWorkers = T_workers;
-
-            uint32_t const workerIdx = threadIdx.x;
-
-            auto cachedB = CachedBox::create<
-                0u,
-                typename T_BBox::ValueType
-            >(
-                acc,
-                T_BlockDescription( )
-            );
-
-            nvidia::functors::Assign assign;
-            DataSpace< simDim > const block( mapper.getSuperCellIndex( DataSpace< simDim >( blockIdx ) ) );
-            DataSpace< simDim > const blockCell = block * MappingDesc::SuperCellSize::toRT( );
-
-            auto fieldBBlock = fieldB.shift( blockCell );
-
-            ThreadCollective<
-                T_BlockDescription,
-                numWorkers
-            > collective( workerIdx );
-
-            collective(
-                acc,
-                assign,
-                cachedB,
-                fieldBBlock
-            );
-
-            __syncthreads();
-
-            constexpr float_X c2 = SPEED_OF_LIGHT * SPEED_OF_LIGHT;
-            constexpr float_X dt = DELTA_T;
-
-            ForEachIdx<
-                IdxConfig<
-                    cellsPerSuperCell,
-                    numWorkers
-                >
-            >{ workerIdx }(
-                [&](
-                    uint32_t const linearIdx,
-                    uint32_t const
-                )
+            namespace yee
+            {
+                using namespace pmacc;
+
+                /** compute electric field
+                 *
+                 * @tparam T_workers number of workers
+                 * @tparam T_BlockDescription field (electric and magnetic) domain description
+                 */
+                template<uint32_t T_workers, typename T_BlockDescription>
+                struct KernelUpdateE
                 {
-                    /* cell index within the superCell */
-                    DataSpace< simDim > const cellIdx = DataSpaceOperations< simDim >::template map< SuperCellSize >( linearIdx );
-
-                    fieldE( blockCell + cellIdx ) += curl( cachedB.shift( cellIdx ) ) * c2 * dt;
-                }
-            );
-        }
-    };
-
-    /** compute magnetic field
-     *
-     * @tparam T_numWorkers number of workers
-     * @tparam T_BlockDescription field (electric and magnetic) domain description
-     */
-    template<
-        uint32_t T_workers,
-        typename T_BlockDescription
-    >
-    struct KernelUpdateBHalf
-    {
-        /** update magnetic field
-         *
-         * @tparam T_Curl curl functor type
-         * @tparam T_EBox pmacc::DataBox, electric field box type
-         * @tparam T_BBox pmacc::DataBox, magnetic field box type
-         * @tparam T_Mapping mapper functor type
-         * @tparam T_Acc alpaka accelerator type
-         *
-         * @param acc alpaka accelerator
-         * @param curl functor to calculate the electric field, interface must be
-         *             `operator()(T_EBox)`
-         * @param fieldB magnetic field iterator
-         * @param fieldE electric field iterator
-         * @param mapper functor to map a block to a supercell
-         */
-        template<
-            typename T_Curl,
-            typename T_EBox,
-            typename T_BBox,
-            typename T_Mapping,
-            typename T_Acc
-        >
-        DINLINE void operator()(
-            T_Acc const & acc,
-            T_Curl const curl,
-            T_BBox fieldB,
-            T_EBox const fieldE,
-            T_Mapping mapper
-        ) const
-        {
-            using namespace mappings::threads;
-
-            constexpr uint32_t cellsPerSuperCell = pmacc::math::CT::volume< SuperCellSize >::type::value;
-            constexpr uint32_t numWorkers = T_workers;
-
-            uint32_t const workerIdx = threadIdx.x;
-
-            auto cachedE = CachedBox::create<
-                0u,
-                typename T_EBox::ValueType
-            >(
-                acc,
-                T_BlockDescription( )
-            );
-
-            nvidia::functors::Assign assign;
-            DataSpace< simDim > const block( mapper.getSuperCellIndex( DataSpace< simDim >( blockIdx ) ) );
-            DataSpace< simDim > const blockCell = block * MappingDesc::SuperCellSize::toRT( );
-
-            auto fieldEBlock = fieldE.shift( blockCell );
-
-            ThreadCollective<
-                T_BlockDescription,
-                numWorkers
-            > collective( workerIdx );
-
-            collective(
-                acc,
-                assign,
-                cachedE,
-                fieldEBlock
-            );
-
-            __syncthreads();
-
-            constexpr float_X dt = DELTA_T;
-
-            ForEachIdx<
-                IdxConfig<
-                    cellsPerSuperCell,
-                    numWorkers
-                >
-            >{ workerIdx }(
-                [&](
-                    uint32_t const linearIdx,
-                    uint32_t const
-                )
+                    /** update electric field: E += curl(B) * c^2 * dt
+                     *
+                     * @tparam T_Curl curl functor type
+                     * @tparam T_EBox pmacc::DataBox, electric field box type
+                     * @tparam T_BBox pmacc::DataBox, magnetic field box type
+                     * @tparam T_Mapping mapper functor type
+                     * @tparam T_Acc alpaka accelerator type
+                     *
+                     * @param acc alpaka accelerator
+                     * @param curl functor computing the curl of the magnetic field, it is called as
+                     *             `curl(cachedB.shift(cellIdx))`
+                     * @param fieldE electric field box, updated in place
+                     * @param fieldB magnetic field box, only read
+                     * @param mapper functor to map a block to a supercell
+                     */
+                    template<typename T_Curl, typename T_EBox, typename T_BBox, typename T_Mapping, typename T_Acc>
+                    DINLINE void operator()(
+                        T_Acc const& acc,
+                        T_Curl const curl,
+                        T_EBox fieldE,
+                        T_BBox const fieldB,
+                        T_Mapping mapper) const
+                    {
+                        using namespace mappings::threads;
+
+                        constexpr uint32_t cellsPerSuperCell = pmacc::math::CT::volume<SuperCellSize>::type::value;
+                        constexpr uint32_t numWorkers = T_workers;
+
+                        uint32_t const workerIdx = cupla::threadIdx(acc).x;
+
+                        auto cachedB = CachedBox::create<0u, typename T_BBox::ValueType>(acc, T_BlockDescription());
+
+                        nvidia::functors::Assign assign;
+                        DataSpace<simDim> const block(
+                            mapper.getSuperCellIndex(DataSpace<simDim>(cupla::blockIdx(acc))));
+                        DataSpace<simDim> const blockCell = block * MappingDesc::SuperCellSize::toRT();
+
+                        auto fieldBBlock = fieldB.shift(blockCell);
+
+                        ThreadCollective<T_BlockDescription, numWorkers> collective(workerIdx);
+
+                        collective(acc, assign, cachedB, fieldBBlock); // presumably copies this block's B into cachedB
+
+                        cupla::__syncthreads(acc); // barrier between filling cachedB and reading it below
+
+                        constexpr float_X c2 = SPEED_OF_LIGHT * SPEED_OF_LIGHT;
+                        constexpr float_X dt = DELTA_T;
+
+                        ForEachIdx<IdxConfig<cellsPerSuperCell, numWorkers>>{workerIdx}(
+                            [&](uint32_t const linearIdx, uint32_t const) {
+                                /* cell index within the superCell */
+                                DataSpace<simDim> const cellIdx
+                                    = DataSpaceOperations<simDim>::template map<SuperCellSize>(linearIdx);
+
+                                fieldE(blockCell + cellIdx) += curl(cachedB.shift(cellIdx)) * c2 * dt;
+                            });
+                    }
+                };
+
+                /** compute magnetic field
+                 *
+                 * @tparam T_workers number of workers
+                 * @tparam T_BlockDescription field (electric and magnetic) domain description
+                 */
+                template<uint32_t T_workers, typename T_BlockDescription>
+                struct KernelUpdateBHalf
                 {
-                    /* cell index within the superCell */
-                    DataSpace< simDim > const cellIdx = DataSpaceOperations< simDim >::template map< SuperCellSize >( linearIdx );
-
-                    fieldB( blockCell + cellIdx ) -= curl( cachedE.shift( cellIdx ) ) * float_X( 0.5 ) * dt;
-                }
-            );
-        }
-    };
-
-} // namespace yee
-} // namespace maxwellSolver
-} // namespace fields
+                    /** update magnetic field by half a time step: B -= curl(E) * 0.5 * dt
+                     *
+                     * @tparam T_Curl curl functor type
+                     * @tparam T_EBox pmacc::DataBox, electric field box type
+                     * @tparam T_BBox pmacc::DataBox, magnetic field box type
+                     * @tparam T_Mapping mapper functor type
+                     * @tparam T_Acc alpaka accelerator type
+                     *
+                     * @param acc alpaka accelerator
+                     * @param curl functor computing the curl of the electric field, it is called as
+                     *             `curl(cachedE.shift(cellIdx))`
+                     * @param fieldB magnetic field box, updated in place
+                     * @param fieldE electric field box, only read
+                     * @param mapper functor to map a block to a supercell
+                     */
+                    template<typename T_Curl, typename T_EBox, typename T_BBox, typename T_Mapping, typename T_Acc>
+                    DINLINE void operator()(
+                        T_Acc const& acc,
+                        T_Curl const curl,
+                        T_BBox fieldB,
+                        T_EBox const fieldE,
+                        T_Mapping mapper) const
+                    {
+                        using namespace mappings::threads;
+
+                        constexpr uint32_t cellsPerSuperCell = pmacc::math::CT::volume<SuperCellSize>::type::value;
+                        constexpr uint32_t numWorkers = T_workers;
+
+                        uint32_t const workerIdx = cupla::threadIdx(acc).x;
+
+                        auto cachedE = CachedBox::create<0u, typename T_EBox::ValueType>(acc, T_BlockDescription());
+
+                        nvidia::functors::Assign assign;
+                        DataSpace<simDim> const block(
+                            mapper.getSuperCellIndex(DataSpace<simDim>(cupla::blockIdx(acc))));
+                        DataSpace<simDim> const blockCell = block * MappingDesc::SuperCellSize::toRT();
+
+                        auto fieldEBlock = fieldE.shift(blockCell);
+
+                        ThreadCollective<T_BlockDescription, numWorkers> collective(workerIdx);
+
+                        collective(acc, assign, cachedE, fieldEBlock); // presumably copies this block's E into cachedE
+
+                        cupla::__syncthreads(acc); // barrier between filling cachedE and reading it below
+
+                        constexpr float_X dt = DELTA_T;
+
+                        ForEachIdx<IdxConfig<cellsPerSuperCell, numWorkers>>{workerIdx}(
+                            [&](uint32_t const linearIdx, uint32_t const) {
+                                /* cell index within the superCell */
+                                DataSpace<simDim> const cellIdx
+                                    = DataSpaceOperations<simDim>::template map<SuperCellSize>(linearIdx);
+
+                                fieldB(blockCell + cellIdx) -= curl(cachedE.shift(cellIdx)) * float_X(0.5) * dt;
+                            });
+                    }
+                };
+
+            } // namespace yee
+        } // namespace maxwellSolver
+    } // namespace fields
 } // namespace picongpu
diff --git a/include/picongpu/fields/MaxwellSolver/YeePML/Field.hpp b/include/picongpu/fields/MaxwellSolver/YeePML/Field.hpp
index b27e0d6144..e9b6759a8c 100644
--- a/include/picongpu/fields/MaxwellSolver/YeePML/Field.hpp
+++ b/include/picongpu/fields/MaxwellSolver/YeePML/Field.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Heiko Burau, Rene Widera, Richard Pausch,
+/* Copyright 2013-2021 Axel Huebl, Heiko Burau, Rene Widera, Richard Pausch,
  *                     Benjamin Worpitz, Sergei Bastrakov
  *
  * This file is part of PIConGPU.
@@ -44,507 +44,434 @@
 
 namespace picongpu
 {
-namespace fields
-{
-namespace maxwellSolver
-{
-namespace yeePML
-{
-
-    //! Additional node values for E or B in PML
-    struct NodeValues
+    namespace fields
     {
-
-        /* The first letter corresponds to x, y, z field components,
-         * the second to transverse directions for the component
-         */
-        float_X xy, xz, yx, yz, zx, zy;
-
-        //! Number of components per node value
-        static constexpr int numComponents = 6;
-
-        /** Construct node values
-         *
-         * @param initialValue initial value for all components
-         */
-        HDINLINE NodeValues( float_X const initialValue = 0._X );
-
-        /** Construction for compatibility with pmacc vectors
-         *
-         * @param initialValue initial value for all components
-         */
-        HDINLINE static const NodeValues create( float_X const initialValue );
-
-        /** Element access for compatibility with pmacc vectors
-         *
-         * This is a utility for checkpointing and does not need a device
-         * version. For performance considerations does not check that the index
-         * is valid and relies on the components being stored in order, without
-         * padding.
-         *
-         * @param idx index less than 6
-         */
-        float_X & operator[ ]( uint32_t const idx );
-
-        /** Const element access for compatibility with pmacc vectors
-         *
-         * This is a utility for checkpointing and does not need a device
-         * version. For performance considerations does not check that the index
-         * is valid and relies on the components being stored in order, without
-         * padding.
-         *
-         * @param idx index less than 6
-         */
-        float_X const & operator[ ]( uint32_t const idx ) const;
-
-    };
-
-    /** Data box type used for PML fields in kernels
-     *
-     * Only stores data in the PML area using the given 1d data box.
-     * Access is provided via a simDim-dimensional index, same as for other
-     * grid values.
-     *
-     * @tparam T_DataBox1d underlying 1d data box type
-     */
-    template< typename T_DataBox1d >
-    class OuterLayerBox
-    {
-    public:
-
-        //! Underlying data box type
-        using DataBox = T_DataBox1d;
-
-        //! Element type
-        using ValueType = typename DataBox::ValueType;
-
-        //! Grid index type to be used for access
-        using Idx = pmacc::DataSpace< simDim >;
-
-        /** Create an outer layer box
-         *
-         * Only stores data in the PML area using the given 1d data box.
-         * Access is provided via a simDim-dimensional index, same as for other
-         * grid values.
-         *
-         * @param gridLayout grid layout, as for normal fields
-         * @param globalThickness global PML thickness
-         * @param box underlying data box, preallocated to fit all data
-         *            the constructed OuterLayerBox does not own the box memory,
-         *            so can only be used before the box is reallocated
-         */
-        OuterLayerBox(
-            GridLayout< simDim > const & gridLayout,
-            Thickness const & globalThickness,
-            DataBox box
-        );
-
-        /** Constant element access by a simDim-dimensional index
-         *
-         * @param idx grid index
-         */
-        HDINLINE ValueType const & operator( )( Idx const & idx ) const;
-
-        /** Element access by a simDim-dimensional index
-         *
-         * @param idx grid index
-         */
-        HDINLINE ValueType & operator( )( Idx const & idx );
-
-    private:
-
-        /** Convert a simDim-dimensional index to a linear one
-         *
-         * @param idxWithGuard grid index with guard
-         */
-        HDINLINE int getLinearIdx( Idx const & idxWithGuard ) const;
-
-        //! A single Cartesial layer that is part of the outer layer box
-        class Layer
+        namespace maxwellSolver
         {
-        public:
-
-            /** Create a layer
-             *
-             * @param beginIdx first index
-             * @param endIdx index right after the last
-             */
-            HDINLINE Layer(
-                Idx const & beginIdx = Idx::create( 0 ),
-                Idx const & endIdx = Idx::create( 0 )
-            );
-
-            /** Check if the layer contains given index
-             *
-             * @param idx grid index without guard
-             */
-            HDINLINE bool contains( Idx const & idx ) const;
-
-            //! Get the simDim-dimensional volume of the layer
-            HDINLINE int getVolume( ) const;
-
-            /** Get a linear index inside a layer
-             *
-             * Same as in pmacc::DataBox, x is minor and z is major.
-             *
-             * @param idx grid index without guard
-             */
-            HDINLINE int getLinearIdx( Idx const & idx ) const;
-
-        private:
-
-            //! First index of the layer
-            Idx beginIdx;
-
-            //! Size of the layer
-            Idx size;
-
-            //! simDim-dimensional volume of the layer
-            int volume;
-
-        };
-
-        //! Number of layers: a positive and a negative one for each axis
-        static constexpr auto numLayers = 2 * simDim;
-
-        /** Cartesian layers constituting the outer layer
-         *
-         * The ordering inside the array is z-y-x for 3d and y-x for 2d.
-         * However, it should not be relevant since the layers do not intersect,
-         * and logically it represents a set of layers
-         */
-        Layer layers[ numLayers ];
-
-        //! Data box, does not own memory
-        DataBox box;
-
-        //! Guard size
-        Idx const guardSize;
-
-    };
-
-    /** Base class for implementation inheritance in classes for the
-     *  electromagnetic fields in PML
-     *
-     * Stores field values on host and device and provides data synchronization
-     * between them.
-     *
-     * Implements interfaces defined by SimulationFieldHelper< MappingDesc > and
-     * ISimulationData.
-     */
-    class Field :
-        public SimulationFieldHelper< MappingDesc >,
-        public ISimulationData
+            namespace yeePML
+            {
+                //! Additional node values for E or B in PML
+                struct NodeValues
+                {
+                    /* The first letter corresponds to x, y, z field components,
+                     * the second to transverse directions for the component
+                     */
+                    float_X xy, xz, yx, yz, zx, zy;
+
+                    //! Number of components per node value
+                    static constexpr int numComponents = 6;
+
+                    /** Construct node values
+                     *
+                     * @param initialValue initial value for all components
+                     */
+                    HDINLINE NodeValues(float_X const initialValue = 0._X);
+
+                    /** Construction for compatibility with pmacc vectors
+                     *
+                     * @param initialValue initial value for all components
+                     */
+                    HDINLINE static const NodeValues create(float_X const initialValue);
+
+                    /** Element access for compatibility with pmacc vectors
+                     *
+                     * This is a utility for checkpointing and does not need a device
+                     * version. For performance considerations does not check that the index
+                     * is valid and relies on the components being stored in order, without
+                     * padding.
+                     *
+                     * @param idx index less than 6
+                     */
+                    float_X& operator[](uint32_t const idx);
+
+                    /** Const element access for compatibility with pmacc vectors
+                     *
+                     * This is a utility for checkpointing and does not need a device
+                     * version. For performance considerations does not check that the index
+                     * is valid and relies on the components being stored in order, without
+                     * padding.
+                     *
+                     * @param idx index less than 6
+                     */
+                    float_X const& operator[](uint32_t const idx) const;
+                };
+
+                /** Data box type used for PML fields in kernels
+                 *
+                 * Only stores data in the PML area using the given 1d data box.
+                 * Access is provided via a simDim-dimensional index, same as for other
+                 * grid values.
+                 *
+                 * @tparam T_DataBox1d underlying 1d data box type
+                 */
+                template<typename T_DataBox1d>
+                class OuterLayerBox
+                {
+                public:
+                    //! Underlying data box type
+                    using DataBox = T_DataBox1d;
+
+                    //! Element type
+                    using ValueType = typename DataBox::ValueType;
+
+                    //! Grid index type to be used for access
+                    using Idx = pmacc::DataSpace<simDim>;
+
+                    /** Create an outer layer box
+                     *
+                     * Only stores data in the PML area using the given 1d data box.
+                     * Access is provided via a simDim-dimensional index, same as for other
+                     * grid values.
+                     *
+                     * @param gridLayout grid layout, as for normal fields
+                     * @param globalThickness global PML thickness
+                     * @param box underlying data box, preallocated to fit all data
+                     *            the constructed OuterLayerBox does not own the box memory,
+                     *            so can only be used before the box is reallocated
+                     */
+                    OuterLayerBox(GridLayout<simDim> const& gridLayout, Thickness const& globalThickness, DataBox box);
+
+                    /** Constant element access by a simDim-dimensional index
+                     *
+                     * @param idx grid index
+                     */
+                    HDINLINE ValueType const& operator()(Idx const& idx) const;
+
+                    /** Element access by a simDim-dimensional index
+                     *
+                     * @param idx grid index
+                     */
+                    HDINLINE ValueType& operator()(Idx const& idx);
+
+                private:
+                    /** Convert a simDim-dimensional index to a linear one
+                     *
+                     * @param idxWithGuard grid index with guard
+                     */
+                    HDINLINE int getLinearIdx(Idx const& idxWithGuard) const;
+
+                    //! A single Cartesian layer that is part of the outer layer box
+                    class Layer
+                    {
+                    public:
+                        /** Create a layer
+                         *
+                         * @param beginIdx first index
+                         * @param endIdx index right after the last
+                         */
+                        HDINLINE Layer(Idx const& beginIdx = Idx::create(0), Idx const& endIdx = Idx::create(0));
+
+                        /** Check if the layer contains given index
+                         *
+                         * @param idx grid index without guard
+                         */
+                        HDINLINE bool contains(Idx const& idx) const;
+
+                        //! Get the simDim-dimensional volume of the layer
+                        HDINLINE int getVolume() const;
+
+                        /** Get a linear index inside a layer
+                         *
+                         * Same as in pmacc::DataBox, x is minor and z is major.
+                         *
+                         * @param idx grid index without guard
+                         */
+                        HDINLINE int getLinearIdx(Idx const& idx) const;
+
+                    private:
+                        //! First index of the layer
+                        Idx beginIdx;
+
+                        //! Size of the layer
+                        Idx size;
+
+                        //! simDim-dimensional volume of the layer
+                        int volume;
+                    };
+
+                    //! Number of layers: a positive and a negative one for each axis
+                    static constexpr auto numLayers = 2 * simDim;
+
+                    /** Cartesian layers constituting the outer layer
+                     *
+                     * The ordering inside the array is z-y-x for 3d and y-x for 2d.
+                     * However, it should not be relevant since the layers do not intersect,
+                     * and logically it represents a set of layers
+                     */
+                    Layer layers[numLayers];
+
+                    //! Data box, does not own memory
+                    DataBox box;
+
+                    //! Guard size
+                    Idx const guardSize;
+                };
+
+                /** Base class for implementation inheritance in classes for the
+                 *  electromagnetic fields in PML
+                 *
+                 * Stores field values on host and device and provides data synchronization
+                 * between them.
+                 *
+                 * Implements interfaces defined by SimulationFieldHelper< MappingDesc > and
+                 * ISimulationData.
+                 */
+                class Field
+                    : public SimulationFieldHelper<MappingDesc>
+                    , public ISimulationData
+                {
+                public:
+                    //! Type of each field value
+                    using ValueType = NodeValues;
+
+                    //! Number of components of ValueType, for serialization
+                    static constexpr int numComponents = NodeValues::numComponents;
+
+                    //! Unit type of field components
+                    using UnitValueType = pmacc::math::Vector<float_64, numComponents>;
+
+                    /** Type of host-device buffer for field values
+                     *
+                     * The buffer is logically 1d, but technically multidimensional
+                     * for easier coupling to output utilities.
+                     */
+                    using Buffer = pmacc::GridBuffer<ValueType, simDim>;
+
+                    /** Type of data box for field values on host and device
+                     *
+                     * The data box is logically 1d, but technically multidimensional
+                     * for easier coupling to output utilities.
+                     */
+                    using DataBoxType = pmacc::DataBox<pmacc::PitchedBox<ValueType, simDim>>;
+
+                    //! Data box type used for PML fields in kernels
+                    using OuterLayerBoxType = OuterLayerBox<pmacc::DataBoxDim1Access<DataBoxType>>;
+
+                    //! Size of supercell
+                    using SuperCellSize = MappingDesc::SuperCellSize;
+
+                    /** Create a field
+                     *
+                     * @param cellDescription mapping for kernels
+                     * @param globalThickness global PML thickness
+                     */
+                    HINLINE Field(MappingDesc const& cellDescription, Thickness const& globalThickness);
+
+                    //! Get a reference to the host-device buffer for the field values
+                    HINLINE Buffer& getGridBuffer();
+
+                    //! Get the grid layout
+                    HINLINE pmacc::GridLayout<simDim> getGridLayout();
+
+                    //! Get the host data box for the field values
+                    HINLINE DataBoxType getHostDataBox();
+
+                    //! Get the device data box for the field values
+                    HINLINE DataBoxType getDeviceDataBox();
+
+                    //! Get the device outer layer data box for the field values
+                    HINLINE OuterLayerBoxType getDeviceOuterLayerBox();
+
+                    /** Start asynchronous communication of field values
+                     *
+                     * @param serialEvent event to depend on
+                     */
+                    HINLINE virtual EventTask asyncCommunication(EventTask serialEvent);
+
+                    /** Reset the host-device buffer for field values
+                     *
+                     * @param currentStep index of time iteration
+                     */
+                    HINLINE void reset(uint32_t currentStep) override;
+
+                    //! Synchronize device data with host data
+                    HINLINE void syncToDevice() override;
+
+                    //! Synchronize host data with device data
+                    HINLINE void synchronize() override;
+
+                private:
+                    //! Host-device buffer for field values
+                    std::unique_ptr<Buffer> data;
+
+                    //! Grid layout for normal (non-PML) fields
+                    pmacc::GridLayout<simDim> gridLayout;
+
+                    //! PML global thickness
+                    Thickness globalThickness;
+                };
+
+                //! Data box type used for PML fields in kernels
+                using FieldBox = Field::OuterLayerBoxType;
+
+                /** Representation of the additional electric field components in PML
+                 *
+                 * Stores field values on host and device and provides data synchronization
+                 * between them.
+                 *
+                 * Implements interfaces defined by SimulationFieldHelper< MappingDesc > and
+                 * ISimulationData.
+                 */
+                class FieldE : public Field
+                {
+                public:
+                    /** Create a field
+                     *
+                     * @param cellDescription mapping for kernels
+                     * @param globalThickness global PML thickness
+                     */
+                    HINLINE FieldE(MappingDesc const& cellDescription, Thickness const& globalThickness)
+                        : Field(cellDescription, globalThickness)
+                    {
+                    }
+
+                    //! Get id
+                    HINLINE SimulationDataId getUniqueId()
+                    {
+                        return getName();
+                    }
+
+                    //! Get units of field components
+                    HDINLINE static UnitValueType getUnit()
+                    {
+                        return UnitValueType::create(UNIT_EFIELD);
+                    }
+
+                    /** Get unit representation as powers of the 7 base measures
+                     *
+                     * Characterizing the record's unit in SI
+                     * (length L, mass M, time T, electric current I,
+                     *  thermodynamic temperature theta, amount of substance N,
+                     *  luminous intensity J)
+                     */
+                    HINLINE static std::vector<float_64> getUnitDimension()
+                    {
+                        return picongpu::FieldE::getUnitDimension();
+                    }
+
+                    //! Get text name
+                    HINLINE static std::string getName()
+                    {
+                        return "Convolutional PML E";
+                    }
+                };
+
+                /** Representation of the additional magnetic field components in PML
+                 *
+                 * Stores field values on host and device and provides data synchronization
+                 * between them.
+                 *
+                 * Implements interfaces defined by SimulationFieldHelper< MappingDesc > and
+                 * ISimulationData.
+                 */
+                class FieldB : public Field
+                {
+                public:
+                    /** Create a field
+                     *
+                     * @param cellDescription mapping for kernels
+                     * @param globalThickness global PML thickness
+                     */
+                    HINLINE FieldB(MappingDesc const& cellDescription, Thickness const& globalThickness)
+                        : Field(cellDescription, globalThickness)
+                    {
+                    }
+
+                    //! Get id
+                    HINLINE SimulationDataId getUniqueId()
+                    {
+                        return getName();
+                    }
+
+                    //! Get units of field components
+                    HDINLINE static UnitValueType getUnit()
+                    {
+                        return UnitValueType::create(UNIT_BFIELD);
+                    }
+
+                    /** Get unit representation as powers of the 7 base measures
+                     *
+                     * Characterizing the record's unit in SI
+                     * (length L, mass M, time T, electric current I,
+                     *  thermodynamic temperature theta, amount of substance N,
+                     *  luminous intensity J)
+                     */
+                    HINLINE static std::vector<float_64> getUnitDimension()
+                    {
+                        return picongpu::FieldB::getUnitDimension();
+                    }
+
+                    //! Get text name
+                    HINLINE static std::string getName()
+                    {
+                        return "Convolutional PML B";
+                    }
+                };
+
+            } // namespace yeePML
+        } // namespace maxwellSolver
+    } // namespace fields
+
+    namespace traits
     {
-    public:
-
-        //! Type of each field value
-        using ValueType = NodeValues;
-
-        //! Number of components of ValueType, for serialization
-        static constexpr int numComponents = NodeValues::numComponents;
-
-        //! Unit type of field components
-        using UnitValueType = pmacc::math::Vector< float_64, numComponents >;
-
-        /** Type of host-device buffer for field values
-         *
-         * The buffer is logically 1d, but technically multidimentional
-         * for easier coupling to output utilities.
-         */
-        using Buffer = pmacc::GridBuffer<
-            ValueType,
-            simDim
-        >;
-
-        /** Type of data box for field values on host and device
-         *
-         * The data box is logically 1d, but technically multidimentional
-         * for easier coupling to output utilities.
-         */
-        using DataBoxType = pmacc::DataBox<
-            pmacc::PitchedBox<
-                ValueType,
-                simDim
-            >
-        >;
-
-        //! Data box type used for PML fields in kernels
-        using OuterLayerBoxType = OuterLayerBox<
-            pmacc::DataBoxDim1Access< DataBoxType >
-        >;
-
-        //! Size of supercell
-        using SuperCellSize = MappingDesc::SuperCellSize ;
-
-        /** Create a field
+        /** Field position traits for checkpointing
          *
-         * @param cellDescription mapping for kernels
-         * @param globalThickness global PML thickness
+         * PML fields do not fit well, for now just copy the normal fields.
+         * Specialize only for Yee cell type, as this is the only one supported.
          */
-        HINLINE Field(
-            MappingDesc const & cellDescription,
-            Thickness const & globalThickness
-        );
-
-        //! Get a reference to the host-device buffer for the field values
-        HINLINE Buffer & getGridBuffer( );
-
-        //! Get the grid layout
-        HINLINE pmacc::GridLayout< simDim > getGridLayout( );
-
-        //! Get the host data box for the field values
-        HINLINE DataBoxType getHostDataBox( );
-
-        //! Get the device data box for the field values
-        HINLINE DataBoxType getDeviceDataBox( );
-
-        //! Get the device outer layer data box for the field values
-        HINLINE OuterLayerBoxType getDeviceOuterLayerBox( );
-
-        /** Start asynchronous communication of field values
-         *
-         * @param serialEvent event to depend on
-         */
-        HINLINE virtual EventTask asyncCommunication( EventTask serialEvent );
-
-        /** Reset the host-device buffer for field values
-         *
-         * @param currentStep index of time iteration
-         */
-        HINLINE void reset( uint32_t currentStep ) override;
-
-        //! Synchronize device data with host data
-        HINLINE void syncToDevice( ) override;
-
-        //! Synchronize host data with device data
-        HINLINE void synchronize( ) override;
-
-    private:
-
-        //! Host-device buffer for field values
-        std::unique_ptr< Buffer > data;
-
-        //! Grid layout for normal (non-PML) fields
-        pmacc::GridLayout< simDim > gridLayout;
-
-        // PML global thickness
-        Thickness globalThickness;
-
-    };
-
-    //! Data box type used for PML fields in kernels
-    using FieldBox = Field::OuterLayerBoxType;
-
-    /** Representation of the additinal electric field components in PML
-     *
-     * Stores field values on host and device and provides data synchronization
-     * between them.
-     *
-     * Implements interfaces defined by SimulationFieldHelper< MappingDesc > and
-     * ISimulationData.
-     */
-    class FieldE : public Field
-    {
-    public:
-
-        /** Create a field
-         *
-         * @param cellDescription mapping for kernels
-         * @param globalThickness global PML thickness
-         */
-        HINLINE FieldE(
-            MappingDesc const & cellDescription,
-            Thickness const & globalThickness
-        ):
-            Field(
-                cellDescription,
-                globalThickness
-            )
-        {
-        }
-
-        //! Get id
-        HINLINE SimulationDataId getUniqueId( )
-        {
-            return getName( );
-        }
-
-        //! Get units of field components
-        HDINLINE static UnitValueType getUnit( )
+        template<uint32_t T_dim>
+        struct FieldPosition<fields::cellType::Yee, fields::maxwellSolver::yeePML::FieldE, T_dim>
+            : FieldPosition<fields::cellType::Yee, FieldE, T_dim>
         {
-            return UnitValueType::create( UNIT_EFIELD );
-        }
+        };
 
-        /** Get unit representation as powers of the 7 base measures
+        /** Field position traits for checkpointing
          *
-         * Characterizing the record's unit in SI
-         * (length L, mass M, time T, electric current I,
-         *  thermodynamic temperature theta, amount of substance N,
-         *  luminous intensity J)
+         * PML fields do not fit well, for now just copy the normal fields.
+         * Specialize only for Yee cell type, as this is the only one supported.
          */
-        HINLINE static std::vector< float_64 > getUnitDimension( )
+        template<uint32_t T_dim>
+        struct FieldPosition<fields::cellType::Yee, fields::maxwellSolver::yeePML::FieldB, T_dim>
+            : FieldPosition<fields::cellType::Yee, FieldB, T_dim>
         {
-            return picongpu::FieldE::getUnitDimension( );
-        }
-
-        //! Get text name
-        HINLINE static std::string getName( )
-        {
-            return "Convolutional PML E";
-        }
-
-    };
-
-    /** Representation of the additinal magnetic field components in PML
-     *
-     * Stores field values on host and device and provides data synchronization
-     * between them.
-     *
-     * Implements interfaces defined by SimulationFieldHelper< MappingDesc > and
-     * ISimulationData.
-     */
-    class FieldB : public Field
-    {
-    public:
+        };
 
-        /** Create a field
-         *
-         * @param cellDescription mapping for kernels
-         * @param globalThickness global PML thickness
+        /** Field domain boundness trait for output and checkpointing:
+         *  PML fields are not domain-bound
          */
-        HINLINE FieldB(
-            MappingDesc const & cellDescription,
-            Thickness const & globalThickness
-        ):
-            Field(
-                cellDescription,
-                globalThickness
-            )
-        {
-        }
-
-        //! Get id
-        HINLINE SimulationDataId getUniqueId( )
+        template<>
+        struct IsFieldDomainBound<fields::maxwellSolver::yeePML::FieldE> : std::false_type
         {
-            return getName( );
-        }
-
-        //! Get units of field components
-        HDINLINE static UnitValueType getUnit( )
-        {
-            return UnitValueType::create( UNIT_BFIELD );
-        }
+        };
 
-        /** Get unit representation as powers of the 7 base measures
-         *
-         * Characterizing the record's unit in SI
-         * (length L, mass M, time T, electric current I,
-         *  thermodynamic temperature theta, amount of substance N,
-         *  luminous intensity J)
+        /** Field domain boundness trait for output and checkpointing:
+         *  PML fields are not domain-bound
          */
-        HINLINE static std::vector< float_64 > getUnitDimension( )
-        {
-            return picongpu::FieldB::getUnitDimension( );
-        }
-
-        //! Get text name
-        HINLINE static std::string getName( )
+        template<>
+        struct IsFieldDomainBound<fields::maxwellSolver::yeePML::FieldB> : std::false_type
         {
-            return "Convolutional PML B";
-        }
-
-    };
-
-} // namespace yeePML
-} // namespace maxwellSolver
-} // namespace fields
-
-namespace traits
-{
-
-    /** Field position traits for checkpointing
-     *
-     * PML fields do not fit well, for now just copy the normal fields.
-     * Specialize only for Yee cell type, as this is the only one supported.
-     */
-    template< uint32_t T_dim >
-    struct FieldPosition<
-        fields::cellType::Yee,
-        fields::maxwellSolver::yeePML::FieldE,
-        T_dim
-    > : FieldPosition<
-        fields::cellType::Yee,
-        FieldE,
-        T_dim
-    >
-    {
-    };
-
-    /** Field position traits for checkpointing
-     *
-     * PML fields do not fit well, for now just copy the normal fields.
-     * Specialize only for Yee cell type, as this is the only one supported.
-     */
-    template< uint32_t T_dim >
-    struct FieldPosition<
-        fields::cellType::Yee,
-        fields::maxwellSolver::yeePML::FieldB,
-        T_dim
-    > : FieldPosition<
-        fields::cellType::Yee,
-        FieldB,
-        T_dim
-    >
-    {
-    };
-
-    /** Field domain boundness trait for output and checkpointing:
-     *  PML fields are not domain-bound
-     */
-    template< >
-    struct IsFieldDomainBound< fields::maxwellSolver::yeePML::FieldE > :
-        std::false_type
-    {
-    };
-
-    /** Field domain boundness trait for output and checkpointing:
-     *  PML fields are not domain-bound
-     */
-    template< >
-    struct IsFieldDomainBound< fields::maxwellSolver::yeePML::FieldB > :
-        std::false_type
-    {
-    };
+        };
 
-} // namespace traits
+    } // namespace traits
 } // namespace picongpu
 
 namespace pmacc
 {
-namespace traits
-{
-
-    //! Node value traits for checkpointing
-    template< >
-    struct GetComponentsType<
-        picongpu::fields::maxwellSolver::yeePML::NodeValues,
-        false
-    >
-    {
-        typedef picongpu::float_X type;
-    };
-
-    //! Node value traits for checkpointing
-    template< >
-    struct GetNComponents<
-        picongpu::fields::maxwellSolver::yeePML::NodeValues,
-        false
-    >
+    namespace traits
     {
-        static constexpr uint32_t value =
-            picongpu::fields::maxwellSolver::yeePML::NodeValues::numComponents;
-    };
+        //! Node value traits for checkpointing
+        template<>
+        struct GetComponentsType<picongpu::fields::maxwellSolver::yeePML::NodeValues, false>
+        {
+            typedef picongpu::float_X type;
+        };
+
+        //! Node value traits for checkpointing
+        template<>
+        struct GetNComponents<picongpu::fields::maxwellSolver::yeePML::NodeValues, false>
+        {
+            static constexpr uint32_t value = picongpu::fields::maxwellSolver::yeePML::NodeValues::numComponents;
+        };
 
-} // namespace traits
+    } // namespace traits
 } // namespace pmacc
diff --git a/include/picongpu/fields/MaxwellSolver/YeePML/Field.tpp b/include/picongpu/fields/MaxwellSolver/YeePML/Field.tpp
index 63d98e0ff8..a80ecb4689 100644
--- a/include/picongpu/fields/MaxwellSolver/YeePML/Field.tpp
+++ b/include/picongpu/fields/MaxwellSolver/YeePML/Field.tpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Heiko Burau, Rene Widera, Felix Schmitt,
+/* Copyright 2013-2021 Axel Huebl, Heiko Burau, Rene Widera, Felix Schmitt,
  *                     Richard Pausch, Benjamin Worpitz, Sergei Bastrakov
  *
  * This file is part of PIConGPU.
@@ -31,315 +31,266 @@
 
 namespace picongpu
 {
-namespace fields
-{
-namespace maxwellSolver
-{
-namespace yeePML
-{
-
-    namespace detail
+    namespace fields
     {
-
-        /** Construct an simDim-dimensional index out of 3 components.
-        *
-        * For 2d z is ignored
-        *
-        * @param x x component
-        * @param y y component
-        * @param z z component
-        */
-        HDINLINE pmacc::DataSpace< simDim > makeIdx(
-            int const x,
-            int const y,
-            int const z
-        )
-        {
-            auto const idx = pmacc::DataSpace< 3 >{ x, y, z };
-            pmacc::DataSpace< simDim > result;
-            for( uint32_t dim = 0u; dim < simDim; dim++ )
-                result[ dim ] = idx[ dim ];
-            return result;
-        }
-
-        /** Get linear size of the outer layer box
-        *
-        * @param gridLayout grid layout, as for normal fields
-        * @param globalThickness global PML thickness
-        */
-        HDINLINE int getOuterLayerBoxLinearSize(
-            GridLayout< simDim > const & gridLayout,
-            Thickness const & globalThickness
-        )
+        namespace maxwellSolver
         {
-            // All sizes are without guard, since Pml is only on the internal area
-            auto const gridDataSpace = gridLayout.getDataSpaceWithoutGuarding( );
-            auto const nonPmlDataSpace = gridDataSpace -
-                ( globalThickness.positiveBorder + globalThickness.negativeBorder );
-            auto const numGridCells = gridDataSpace.productOfComponents( );
-            auto const numNonPmlCells = nonPmlDataSpace.productOfComponents( );
-            return numGridCells - numNonPmlCells;
-        }
-
-    } // namespace detail
-
-    HDINLINE NodeValues::NodeValues( float_X const initialValue /* = 0._X */ ):
-        xy( initialValue ),
-        xz( initialValue ),
-        yx( initialValue ),
-        yz( initialValue ),
-        zx( initialValue ),
-        zy( initialValue )
-    {
-    }
-
-    HDINLINE const NodeValues NodeValues::create(
-        float_X const initialValue
-    )
-    {
-        return NodeValues{ initialValue };
-    }
-
-    float_X & NodeValues::operator[ ]( uint32_t const idx )
-    {
-        // Here it is safe to call the const version
-        auto constThis = const_cast< NodeValues const * >( this );
-        return const_cast< float_X & >( ( *constThis )[ idx ] );
-    }
-
-    float_X const & NodeValues::operator[ ]( uint32_t const idx ) const
-    {
-        return *( &xy + idx );
-    }
-
-    template< typename T_Value >
-    OuterLayerBox< T_Value >::OuterLayerBox(
-        GridLayout< simDim > const & gridLayout,
-        Thickness const & globalThickness,
-        DataBox box
-    ):
-        guardSize( gridLayout.getGuard( ) ),
-        box( box )
-    {
-        auto const negativeSize = globalThickness.negativeBorder;
-        auto const positiveSize = globalThickness.positiveBorder;
-        /* The region of interest is grid without guard,
-         * which consists of PML and internal area
-         */
-        auto const gridSize = gridLayout.getDataSpaceWithoutGuarding( );
-        auto const positiveBegin = gridSize - positiveSize;
-
-        // Note: since this should compile for 2d, .z( ) can't be used
-        using detail::makeIdx;
-        int layerIdx = 0;
-        if( simDim == 3 )
-        {
-            auto const negativeZLayer = Layer{
-                makeIdx( 0, 0, 0 ),
-                makeIdx( gridSize[ 0 ], gridSize[ 1 ], negativeSize[ 2 ] )
-            };
-            layers[ layerIdx++ ] = negativeZLayer;
-            auto const positiveZLayer = Layer{
-                makeIdx( 0, 0, positiveBegin[ 2 ] ),
-                makeIdx( gridSize[ 0 ], gridSize[ 1 ], gridSize[ 2 ] )
-            };
-            layers[ layerIdx++ ] = positiveZLayer;
-        }
-
-        auto const negativeYLayer = Layer{
-            makeIdx( 0, 0, negativeSize[ 2 ] ),
-            makeIdx( gridSize[ 0 ], negativeSize[ 1 ], positiveBegin[ 2 ] )
-        };
-        layers[ layerIdx++ ] = negativeYLayer;
-        auto const positiveYLayer = Layer{
-            makeIdx( 0, positiveBegin[ 1 ], negativeSize[ 2 ] ),
-            makeIdx( gridSize[ 0 ], gridSize[ 1 ], positiveBegin[ 2 ] )
-        };
-        layers[ layerIdx++ ] = positiveYLayer;
-
-        auto const negativeXLayer = Layer{
-            makeIdx( 0, negativeSize[ 1 ], negativeSize[ 2 ] ),
-            makeIdx( negativeSize[ 0 ], positiveBegin[ 1 ], positiveBegin[ 2 ] )
-        };
-        layers[ layerIdx++ ] = negativeXLayer;
-        auto const positiveXLayer = Layer{
-            makeIdx( positiveBegin[ 0 ], negativeSize[ 1 ], negativeSize[ 2 ] ),
-            makeIdx( gridSize[ 0 ], positiveBegin[ 1 ], positiveBegin[ 2 ] )
-        };
-        layers[ layerIdx++ ] = positiveXLayer;
-    }
-
-    template< typename T_Value >
-    HDINLINE typename OuterLayerBox< T_Value >::ValueType const &
-        OuterLayerBox< T_Value >::operator( )( Idx const & idx ) const
-    {
-        return box(
-            getLinearIdx( idx )
-        );
-    }
-
-    template< typename T_Value >
-    HDINLINE typename OuterLayerBox< T_Value >::ValueType &
-        OuterLayerBox< T_Value >::operator( )( Idx const & idx )
-    {
-        return box(
-            getLinearIdx( idx )
-        );
-    }
-
-    template< typename T_Value >
-    HDINLINE int OuterLayerBox< T_Value >::getLinearIdx(
-        Idx const & idxWithGuard
-    ) const
-    {
-        /* Each PML layer provide a contiguous 1d index range.
-         * The resulting index is a sum of the baseIdx representing the total
-         * size of all previous layers and an index inside the current layer.
-         */
-        auto const idx = idxWithGuard - guardSize;
-        int currentLayerBeginIdx = 0;
-        int result = -1;
-        for( Layer const & layer : layers )
-            if( layer.contains( idx ) )
+            namespace yeePML
             {
-                /* Note: here we could have returned the result directly,
-                 * but chose to have a single return for potential
-                 * performance gains on GPU. The break is not required,
-                 * since each valid index belonds to exactly one layer.
-                 */
-                result = currentLayerBeginIdx + layer.getLinearIdx( idx );
-                break;
-            }
-            else
-                currentLayerBeginIdx += layer.getVolume( );
-        return result;
-    }
-
-    template< typename T_Value >
-    HDINLINE OuterLayerBox< T_Value >::Layer::Layer(
-        Idx const & beginIdx,
-        Idx const & endIdx
-    ):
-        beginIdx{ beginIdx },
-        size{ endIdx - beginIdx },
-        volume{ size.productOfComponents( ) }
-    {
-    }
-
-    template< typename T_Value >
-    HDINLINE bool OuterLayerBox< T_Value >::Layer::contains(
-        Idx const & idx
-    ) const
-    {
-        for( uint32_t dim = 0u; dim < simDim; dim++ )
-            if( ( idx[ dim ] < beginIdx[ dim ] ) ||
-                ( idx[ dim ] >= beginIdx[ dim ] + size[ dim ] ) )
-                return false;
-        return true;
-    }
-
-    template< typename T_Value >
-    HDINLINE int OuterLayerBox< T_Value >::Layer::getVolume( ) const
-    {
-        return volume;
-    }
-
-    template< typename T_Value >
-    HDINLINE int OuterLayerBox< T_Value >::Layer::getLinearIdx(
-        Idx const & idx
-    ) const
-    {
-        // Convert to 3d zero-based index, for 2d keep .z( ) == 0
-        pmacc::DataSpace< 3 > zeroBasedIdx{ 0, 0, 0 };
-        for( uint32_t dim = 0u; dim < simDim; dim++ )
-            zeroBasedIdx[ dim ] = idx[ dim ] - beginIdx[ dim ];
-        return zeroBasedIdx.x( ) + zeroBasedIdx.y( ) * size.x( ) +
-            zeroBasedIdx.z( ) * size.y( ) * size.x( );
-    }
-
-    Field::Field(
-        MappingDesc const & cellDescription,
-        Thickness const & globalThickness ) :
-        SimulationFieldHelper< MappingDesc >( cellDescription ),
-        gridLayout( cellDescription.getGridLayout( ) ),
-        globalThickness( globalThickness )
-    {
-        /* Create a simDim-dimentional buffer
-         * with size = linearSize x 1 [x 1 for 3d]
-         */
-        auto size = pmacc::DataSpace< simDim >::create( 1 );
-        size[ 0 ] = detail::getOuterLayerBoxLinearSize(
-            gridLayout,
-            globalThickness
-        );
-        auto const guardSize = pmacc::DataSpace< simDim >::create( 0 );
-        auto const layout = pmacc::GridLayout< simDim >(
-            size,
-            guardSize
-        );
-        data.reset(
-            new Buffer( layout )
-        );
-    }
-
-    Field::Buffer & Field::getGridBuffer( )
-    {
-        return *data;
-    }
-
-    pmacc::GridLayout< simDim > Field::getGridLayout( )
-    {
-        return data->getGridLayout( );
-    }
-
-    Field::DataBoxType Field::getHostDataBox( )
-    {
-        return data->getHostBuffer( ).getDataBox( );
-    }
-
-    Field::DataBoxType Field::getDeviceDataBox( )
-    {
-        return data->getDeviceBuffer( ).getDataBox( );
-    }
-
-    Field::OuterLayerBoxType Field::getDeviceOuterLayerBox( )
-    {
-        auto const boxWrapper1d = pmacc::DataBoxDim1Access< DataBoxType >{
-            getDeviceDataBox( ),
-            data->getGridLayout( ).getDataSpace( )
-        };
-        /* Note: the outer layer box type just provides access to data,
-         * it does not own or make copy of the data (nor is that required)
-         */
-        return OuterLayerBoxType{
-            gridLayout,
-            globalThickness,
-            boxWrapper1d
-        };
-    }
-
-    EventTask Field::asyncCommunication( EventTask serialEvent )
-    {
-        return data->asyncCommunication( serialEvent );
-    }
-
-    void Field::reset( uint32_t )
-    {
-        data->getHostBuffer( ).reset( true );
-        data->getDeviceBuffer( ).reset( false );
-    }
-
-    void Field::syncToDevice( )
-    {
-        data->hostToDevice( );
-    }
-
-    void Field::synchronize( )
-    {
-        data->deviceToHost( );
-    }
-
-} // namespace yeePML
-} // namespace maxwellSolver
-} // namespace fields
+                namespace detail
+                {
+                    /** Construct a simDim-dimensional index out of 3 components.
+                     *
+                     * For 2d, the z component is ignored.
+                     *
+                     * @param x x component
+                     * @param y y component
+                     * @param z z component (not copied when simDim == 2)
+                     */
+                    HDINLINE pmacc::DataSpace<simDim> makeIdx(int const x, int const y, int const z)
+                    {
+                        auto const idx = pmacc::DataSpace<3>{x, y, z};
+                        pmacc::DataSpace<simDim> result;
+                        for(uint32_t dim = 0u; dim < simDim; dim++) // copy only the first simDim components
+                            result[dim] = idx[dim];
+                        return result;
+                    }
+
+                    /** Get linear size of the outer layer box
+                     *
+                     * @param gridLayout grid layout, as for normal fields
+                     * @param globalThickness global PML thickness
+                     */
+                    HDINLINE int getOuterLayerBoxLinearSize(
+                        GridLayout<simDim> const& gridLayout,
+                        Thickness const& globalThickness)
+                    {
+                        // All sizes are without guard, since PML is only on the internal area
+                        auto const gridDataSpace = gridLayout.getDataSpaceWithoutGuarding();
+                        auto const nonPmlDataSpace
+                            = gridDataSpace - (globalThickness.positiveBorder + globalThickness.negativeBorder);
+                        auto const numGridCells = gridDataSpace.productOfComponents();
+                        auto const numNonPmlCells = nonPmlDataSpace.productOfComponents();
+                        return numGridCells - numNonPmlCells; // what remains are the PML cells
+                    }
+
+                } // namespace detail
+
+                HDINLINE NodeValues::NodeValues(float_X const initialValue /* = 0._X */)
+                    : xy(initialValue)
+                    , xz(initialValue)
+                    , yx(initialValue)
+                    , yz(initialValue)
+                    , zx(initialValue)
+                    , zy(initialValue)
+                {
+                }
+
+                HDINLINE const NodeValues NodeValues::create(float_X const initialValue)
+                {
+                    return NodeValues{initialValue};
+                }
+
+                float_X& NodeValues::operator[](uint32_t const idx)
+                {
+                    // Here it is safe to call the const version
+                    auto constThis = const_cast<NodeValues const*>(this);
+                    return const_cast<float_X&>((*constThis)[idx]);
+                }
+
+                float_X const& NodeValues::operator[](uint32_t const idx) const
+                {
+                    return *(&xy + idx); // NOTE(review): assumes the components follow xy contiguously; idx is unchecked
+                }
+
+                template<typename T_Value>
+                OuterLayerBox<T_Value>::OuterLayerBox(
+                    GridLayout<simDim> const& gridLayout,
+                    Thickness const& globalThickness,
+                    DataBox box)
+                    : guardSize(gridLayout.getGuard())
+                    , box(box)
+                {
+                    auto const negativeSize = globalThickness.negativeBorder;
+                    auto const positiveSize = globalThickness.positiveBorder;
+                    /* The region of interest is the grid without guard,
+                     * which consists of PML and internal area
+                     */
+                    auto const gridSize = gridLayout.getDataSpaceWithoutGuarding();
+                    auto const positiveBegin = gridSize - positiveSize;
+
+                    // Note: since this should compile for 2d, .z() can't be used; makeIdx() drops the z argument in 2d
+                    using detail::makeIdx;
+                    int layerIdx = 0; // next free slot in layers
+                    if(simDim == 3) // the two z layers are only added in 3d
+                    {
+                        auto const negativeZLayer
+                            = Layer{makeIdx(0, 0, 0), makeIdx(gridSize[0], gridSize[1], negativeSize[2])};
+                        layers[layerIdx++] = negativeZLayer;
+                        auto const positiveZLayer
+                            = Layer{makeIdx(0, 0, positiveBegin[2]), makeIdx(gridSize[0], gridSize[1], gridSize[2])};
+                        layers[layerIdx++] = positiveZLayer;
+                    }
+
+                    auto const negativeYLayer = Layer{ // NOTE(review): [2] is read even in 2d - confirm validity
+                        makeIdx(0, 0, negativeSize[2]),
+                        makeIdx(gridSize[0], negativeSize[1], positiveBegin[2])};
+                    layers[layerIdx++] = negativeYLayer;
+                    auto const positiveYLayer = Layer{
+                        makeIdx(0, positiveBegin[1], negativeSize[2]),
+                        makeIdx(gridSize[0], gridSize[1], positiveBegin[2])};
+                    layers[layerIdx++] = positiveYLayer;
+
+                    auto const negativeXLayer = Layer{
+                        makeIdx(0, negativeSize[1], negativeSize[2]),
+                        makeIdx(negativeSize[0], positiveBegin[1], positiveBegin[2])};
+                    layers[layerIdx++] = negativeXLayer;
+                    auto const positiveXLayer = Layer{
+                        makeIdx(positiveBegin[0], negativeSize[1], negativeSize[2]),
+                        makeIdx(gridSize[0], positiveBegin[1], positiveBegin[2])};
+                    layers[layerIdx++] = positiveXLayer;
+                }
+
+                template<typename T_Value>
+                HDINLINE typename OuterLayerBox<T_Value>::ValueType const& OuterLayerBox<T_Value>::operator()(
+                    Idx const& idx) const
+                {
+                    return box(getLinearIdx(idx));
+                }
+
+                template<typename T_Value>
+                HDINLINE typename OuterLayerBox<T_Value>::ValueType& OuterLayerBox<T_Value>::operator()(Idx const& idx)
+                {
+                    return box(getLinearIdx(idx));
+                }
+
+                template<typename T_Value>
+                HDINLINE int OuterLayerBox<T_Value>::getLinearIdx(Idx const& idxWithGuard) const
+                {
+                    /* Each PML layer provides a contiguous 1d index range.
+                     * The resulting index is the sum of currentLayerBeginIdx, i.e. the total
+                     * size of all previous layers, and an index inside the current layer.
+                     */
+                    auto const idx = idxWithGuard - guardSize;
+                    int currentLayerBeginIdx = 0;
+                    int result = -1; // stays -1 if idx is not inside any layer
+                    for(Layer const& layer : layers)
+                        if(layer.contains(idx))
+                        {
+                            /* Note: here we could have returned the result directly,
+                             * but chose to have a single return for potential
+                             * performance gains on GPU. The break is not required,
+                             * since each valid index belongs to exactly one layer.
+                             */
+                            result = currentLayerBeginIdx + layer.getLinearIdx(idx);
+                            break;
+                        }
+                        else
+                            currentLayerBeginIdx += layer.getVolume();
+                    return result;
+                }
+
+                template<typename T_Value>
+                HDINLINE OuterLayerBox<T_Value>::Layer::Layer(Idx const& beginIdx, Idx const& endIdx)
+                    : beginIdx{beginIdx}
+                    , size{endIdx - beginIdx}
+                    , volume{size.productOfComponents()}
+                {
+                }
+
+                template<typename T_Value>
+                HDINLINE bool OuterLayerBox<T_Value>::Layer::contains(Idx const& idx) const
+                {
+                    for(uint32_t dim = 0u; dim < simDim; dim++)
+                        if((idx[dim] < beginIdx[dim]) || (idx[dim] >= beginIdx[dim] + size[dim]))
+                            return false;
+                    return true;
+                }
+
+                template<typename T_Value>
+                HDINLINE int OuterLayerBox<T_Value>::Layer::getVolume() const
+                {
+                    return volume;
+                }
+
+                template<typename T_Value>
+                HDINLINE int OuterLayerBox<T_Value>::Layer::getLinearIdx(Idx const& idx) const
+                {
+                    // Convert to a 3d zero-based index relative to beginIdx; for 2d, z stays 0
+                    pmacc::DataSpace<3> zeroBasedIdx{0, 0, 0};
+                    for(uint32_t dim = 0u; dim < simDim; dim++)
+                        zeroBasedIdx[dim] = idx[dim] - beginIdx[dim];
+                    return zeroBasedIdx.x() + zeroBasedIdx.y() * size.x() + zeroBasedIdx.z() * size.y() * size.x();
+                }
+
+                Field::Field(MappingDesc const& cellDescription, Thickness const& globalThickness)
+                    : SimulationFieldHelper<MappingDesc>(cellDescription)
+                    , gridLayout(cellDescription.getGridLayout())
+                    , globalThickness(globalThickness)
+                {
+                    /* Create a simDim-dimensional buffer
+                     * with size = linearSize x 1 [x 1 for 3d]
+                     */
+                    auto size = pmacc::DataSpace<simDim>::create(1);
+                    size[0] = detail::getOuterLayerBoxLinearSize(gridLayout, globalThickness);
+                    auto const guardSize = pmacc::DataSpace<simDim>::create(0); // the linear buffer has no guard cells
+                    auto const layout = pmacc::GridLayout<simDim>(size, guardSize);
+                    data.reset(new Buffer(layout));
+                }
+
+                Field::Buffer& Field::getGridBuffer()
+                {
+                    return *data;
+                }
+
+                pmacc::GridLayout<simDim> Field::getGridLayout()
+                {
+                    return data->getGridLayout();
+                }
+
+                Field::DataBoxType Field::getHostDataBox()
+                {
+                    return data->getHostBuffer().getDataBox();
+                }
+
+                Field::DataBoxType Field::getDeviceDataBox()
+                {
+                    return data->getDeviceBuffer().getDataBox();
+                }
+
+                Field::OuterLayerBoxType Field::getDeviceOuterLayerBox()
+                {
+                    auto const boxWrapper1d = pmacc::DataBoxDim1Access<DataBoxType>{
+                        getDeviceDataBox(),
+                        data->getGridLayout().getDataSpace()};
+                    /* Note: the outer layer box type just provides access to data,
+                     * it does not own or make copy of the data (nor is that required)
+                     */
+                    return OuterLayerBoxType{gridLayout, globalThickness, boxWrapper1d};
+                }
+
+                EventTask Field::asyncCommunication(EventTask serialEvent)
+                {
+                    return data->asyncCommunication(serialEvent);
+                }
+
+                void Field::reset(uint32_t)
+                {
+                    data->getHostBuffer().reset(true);
+                    data->getDeviceBuffer().reset(false);
+                }
+
+                void Field::syncToDevice()
+                {
+                    data->hostToDevice();
+                }
+
+                void Field::synchronize()
+                {
+                    data->deviceToHost();
+                }
+
+            } // namespace yeePML
+        } // namespace maxwellSolver
+    } // namespace fields
 } // namespace picongpu
diff --git a/include/picongpu/fields/MaxwellSolver/YeePML/Parameters.hpp b/include/picongpu/fields/MaxwellSolver/YeePML/Parameters.hpp
index 4daa44d34d..6deba68a9e 100644
--- a/include/picongpu/fields/MaxwellSolver/YeePML/Parameters.hpp
+++ b/include/picongpu/fields/MaxwellSolver/YeePML/Parameters.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2019-2020 Sergei Bastrakov
+/* Copyright 2019-2021 Sergei Bastrakov
  *
  * This file is part of PIConGPU.
  *
@@ -29,94 +29,89 @@
 
 namespace picongpu
 {
-namespace fields
-{
-namespace maxwellSolver
-{
-namespace yeePML
-{
-
-    /** Parameters of PML, except thickness
-     *
-     * A detailed description and recommended ranges are given in pml.param,
-     * normalizations and unit conversions in pml.unitless.
-     */
-    struct Parameters
+    namespace fields
     {
-        /** Max value of artificial electric conductivity
-         *
-         * Components correspond to directions. Normalized, so that
-         * normalizedSigma = sigma / eps0 = sigma* / mue0.
-         * Unit: 1/unit_time in PIC units
-         */
-        floatD_X normalizedSigmaMax;
+        namespace maxwellSolver
+        {
+            namespace yeePML
+            {
+                /** Parameters of PML, except thickness
+                 *
+                 * A detailed description and recommended ranges are given in pml.param,
+                 * normalizations and unit conversions in pml.unitless.
+                 */
+                struct Parameters
+                {
+                    /** Max value of artificial electric conductivity
+                     *
+                     * Components correspond to directions. Normalized, so that
+                     * normalizedSigma = sigma / eps0 = sigma* / mue0.
+                     * Unit: 1/unit_time in PIC units
+                     */
+                    floatD_X normalizedSigmaMax;
 
-        /** Order of polynomial growth of sigma and kappa
-         *
-         * The growth is from PML internal boundary to the external boundary.
-         * Sigma grows from 0, kappa from 1, both to their max values.
-         */
-        float_X sigmaKappaGradingOrder;
+                    /** Order of polynomial growth of sigma and kappa
+                     *
+                     * The growth is from PML internal boundary to the external boundary.
+                     * Sigma grows from 0, kappa from 1, both to their max values.
+                     */
+                    float_X sigmaKappaGradingOrder;
 
-        /** Max value of coordinate stretching coefficient
-         *
-         * Unitless.
-         */
-        floatD_X kappaMax;
+                    /** Max value of coordinate stretching coefficient
+                     *
+                     * Unitless.
+                     */
+                    floatD_X kappaMax;
 
-        /** Max value of complex frequency shift
-         *
-         * Components correspond to directions. Normalized by eps0.
-         * Unit: 1/unit_time in PIC units
-         */
-        floatD_X normalizedAlphaMax;
+                    /** Max value of complex frequency shift
+                     *
+                     * Components correspond to directions. Normalized by eps0.
+                     * Unit: 1/unit_time in PIC units
+                     */
+                    floatD_X normalizedAlphaMax;
 
-        /** Order of polynomial growth of alpha
-         *
-         * The growth is from PML external boundary to the internal boundary.
-         * Grows from 0 to the max value.
-         */
-        float_X alphaGradingOrder;
-    };
+                    /** Order of polynomial growth of alpha
+                     *
+                     * The growth is from PML external boundary to the internal boundary.
+                     * Grows from 0 to the max value.
+                     */
+                    float_X alphaGradingOrder;
+                };
 
-    //! Thickness of PML at each border, in number of cells
-    struct Thickness
-    {
-        //! Negative border is at the local domain sides minimum in coordinates
-        DataSpace< simDim > negativeBorder;
-        //! Positive border is at the local domain sides maximum in coordinates
-        DataSpace< simDim > positiveBorder;
+                //! Thickness of PML at each border, in number of cells
+                struct Thickness
+                {
+                    //! Negative border is at the local domain sides minimum in coordinates
+                    DataSpace<simDim> negativeBorder;
+                    //! Positive border is at the local domain sides maximum in coordinates
+                    DataSpace<simDim> positiveBorder;
 
-        /** Element access with indexing used in the .param file
-         *
-         * This is only for initialization convenience and so does not have
-         * a device version. Since this is not performance-critical at all,
-         * do range checks on parameters.
-         *
-         * @param axis 0 = x, 1 = y, 2 = z
-         * @param direction 0 = negative, 1 = positive
-         */
-        int & operator()( uint32_t const axis, uint32_t const direction )
-        {
-            if( axis >= simDim )
-                throw std::out_of_range(
-                    "In Thickness::operator() the axis = " +
-                    std::to_string( axis ) + " is invalid"
-                );
-            if( direction == 0 )
-                return negativeBorder[ axis ];
-            else
-                if( direction == 1 )
-                    return positiveBorder[ axis ];
-                else
-                    throw std::out_of_range(
-                        "In Thickness::operator() the direction = " +
-                        std::to_string( direction ) +  " is invalid"
-                    );
-        }
-    };
+                    /** Element access with indexing used in the .param file
+                     *
+                     * This is only for initialization convenience and so does not have
+                     * a device version. Since this is not performance-critical at all,
+                     * do range checks on parameters.
+                     *
+                     * @param axis 0 = x, 1 = y, 2 = z
+                     * @param direction 0 = negative, 1 = positive
+                     */
+                    int& operator()(uint32_t const axis, uint32_t const direction)
+                    {
+                        if(axis >= simDim)
+                            throw std::out_of_range(
+                                "In Thickness::operator() the axis = " + std::to_string(axis) + " is invalid");
+                        if(direction == 0)
+                            return negativeBorder[axis];
+                        else if(direction == 1)
+                            return positiveBorder[axis];
+                        else
+                            throw std::out_of_range(
+                                "In Thickness::operator() the direction = " + std::to_string(direction)
+                                + " is invalid");
+                    }
+                };
 
-} // namespace yeePML
-} // namespace maxwellSolver
-} // namespace fields
+            } // namespace yeePML
+        } // namespace maxwellSolver
+    } // namespace fields
 } // namespace picongpu
diff --git a/include/picongpu/fields/MaxwellSolver/YeePML/YeePML.def b/include/picongpu/fields/MaxwellSolver/YeePML/YeePML.def
index 2baf4c786b..8447344789 100644
--- a/include/picongpu/fields/MaxwellSolver/YeePML/YeePML.def
+++ b/include/picongpu/fields/MaxwellSolver/YeePML/YeePML.def
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Heiko Burau, Rene Widera,
+/* Copyright 2013-2021 Axel Huebl, Heiko Burau, Rene Widera,
  *                     Sergei Bastrakov
  *
  * This file is part of PIConGPU.
@@ -20,74 +20,38 @@
 
 #pragma once
 
-#include "picongpu/fields/MaxwellSolver/Yee/Curl.def"
 #include "picongpu/fields/MaxwellSolver/Yee/Yee.def"
 #include "picongpu/fields/currentInterpolation/CurrentInterpolation.def"
 
 
 namespace picongpu
 {
-namespace fields
-{
-namespace maxwellSolver
-{
-
-    template<
-        typename T_CurrentInterpolation = currentInterpolation::None,
-        typename T_CurlE = yee::CurlRight,
-        typename T_CurlB = yee::CurlLeft
-    >
-    class YeePML;
-
-} // namespace maxwellSolver
-} // namespace fields
-
-namespace traits
-{
-
-    template<
-        typename T_CurrentInterpolation,
-        typename T_CurlE,
-        typename T_CurlB
-    >
-    struct GetMargin<
-        picongpu::fields::maxwellSolver::YeePML<
-            T_CurrentInterpolation,
-            T_CurlE,
-            T_CurlB
-        >, FIELD_B
-    > : public GetMargin<
-        picongpu::fields::maxwellSolver::Yee<
-            T_CurrentInterpolation,
-            T_CurlE,
-            T_CurlB
-        >,
-        FIELD_B
-    >
+    namespace fields
     {
-    };
-
-    template<
-        typename T_CurrentInterpolation,
-        typename T_CurlE,
-        typename T_CurlB
-    >
-    struct GetMargin<
-        picongpu::fields::maxwellSolver::YeePML<
-            T_CurrentInterpolation,
-            T_CurlE,
-            T_CurlB
-        >, FIELD_E
-    > : public GetMargin<
-        picongpu::fields::maxwellSolver::Yee<
-            T_CurrentInterpolation,
-            T_CurlE,
-            T_CurlB
-        >,
-        FIELD_E
-    >
+        namespace maxwellSolver
+        {
+            template<
+                typename T_CurrentInterpolation = currentInterpolation::None,
+                typename T_CurlE = yee::CurlRight,
+                typename T_CurlB = yee::CurlLeft>
+            class YeePML;
+
+        } // namespace maxwellSolver
+    } // namespace fields
+
+    namespace traits
     {
-    };
-
-} //namespace traits
+        template<typename T_CurrentInterpolation, typename T_CurlE, typename T_CurlB>
+        struct GetMargin<picongpu::fields::maxwellSolver::YeePML<T_CurrentInterpolation, T_CurlE, T_CurlB>, FIELD_B>
+            : public GetMargin<picongpu::fields::maxwellSolver::Yee<T_CurrentInterpolation, T_CurlE, T_CurlB>, FIELD_B>
+        {
+        };
+
+        template<typename T_CurrentInterpolation, typename T_CurlE, typename T_CurlB>
+        struct GetMargin<picongpu::fields::maxwellSolver::YeePML<T_CurrentInterpolation, T_CurlE, T_CurlB>, FIELD_E>
+            : public GetMargin<picongpu::fields::maxwellSolver::Yee<T_CurrentInterpolation, T_CurlE, T_CurlB>, FIELD_E>
+        {
+        };
+
+    } // namespace traits
 } // namespace picongpu
diff --git a/include/picongpu/fields/MaxwellSolver/YeePML/YeePML.hpp b/include/picongpu/fields/MaxwellSolver/YeePML/YeePML.hpp
index 0f1fede572..7131ec46e6 100644
--- a/include/picongpu/fields/MaxwellSolver/YeePML/YeePML.hpp
+++ b/include/picongpu/fields/MaxwellSolver/YeePML/YeePML.hpp
@@ -1,5 +1,5 @@
-/* Copyright 2019-2020 Axel Huebl, Heiko Burau, Rene Widera, Benjamin Worpitz,
- *                Sergei Bastrakov
+/* Copyright 2019-2021 Axel Huebl, Heiko Burau, Rene Widera, Benjamin Worpitz,
+ *                Sergei Bastrakov, Klaus Steiniger
  *
  * This file is part of PIConGPU.
  *
@@ -26,6 +26,7 @@
 #include "picongpu/fields/MaxwellSolver/YeePML/Parameters.hpp"
 #include "picongpu/fields/MaxwellSolver/YeePML/YeePML.kernel"
 #include "picongpu/fields/cellType/Yee.hpp"
+#include "picongpu/traits/GetMargin.hpp"
 
 #include <pmacc/traits/GetStringProperties.hpp>
 
@@ -35,419 +36,433 @@
 
 namespace picongpu
 {
-namespace fields
-{
-namespace maxwellSolver
-{
-
-    /* Note: the yeePML namespace is only used for details and not the YeePML
-     * itself in order to be consistent with other field solvers.
-     */
-    namespace yeePML
-    {
-    namespace detail
+    namespace fields
     {
-
-        /** Implementation of Yee + PML solver updates of E and B
-         *
-         * The original paper on this approach is J.A. Roden, S.D. Gedney.
-         * Convolution PML (CPML): An efficient FDTD implementation of the
-         * CFS - PML for arbitrary media. Microwave and optical technology
-         * letters. 27 (5), 334-339 (2000).
-         * https://doi.org/10.1002/1098-2760(20001205)27:5%3C334::AID-MOP14%3E3.0.CO;2-A
-         * Our implementation is based on a more detailed description in section
-         * 7.9 of the book A. Taflove, S.C. Hagness. Computational
-         * Electrodynamics. The Finite-Difference Time-Domain Method. Third
-         * Edition. Artech house, Boston (2005), referred to as
-         * [Taflove, Hagness].
-         *
-         * @tparam T_CurlE functor to compute curl of E
-         * @tparam T_CurlB functor to compute curl of B
-         */
-        template<
-            typename T_CurlE,
-            typename T_CurlB
-        >
-        class Solver
+        namespace maxwellSolver
         {
-        public:
-
-            using CurlE = T_CurlE;
-            using CurlB = T_CurlB;
-
-            Solver( MappingDesc const cellDescription ) :
-                cellDescription{ cellDescription }
-            {
-                initParameters( );
-                initFields( );
-            }
-
-            //! Get a reference to field E
-            picongpu::FieldE & getFieldE( )
-            {
-                return *( fieldE.get( ) );
-            }
-
-            //! Get a reference to field B
-            picongpu::FieldB & getFieldB( )
+            /* Note: the yeePML namespace is only used for details and not the YeePML
+             * itself in order to be consistent with other field solvers.
+             */
+            namespace yeePML
             {
-                return *( fieldB.get( ) );
-            }
-
-            /** Propagate B values in the given area by half a time step
+                namespace detail
+                {
+                    /** Implementation of Yee + PML solver updates of E and B
+                     *
+                     * The original paper on this approach is J.A. Roden, S.D. Gedney.
+                     * Convolution PML (CPML): An efficient FDTD implementation of the
+                     * CFS - PML for arbitrary media. Microwave and optical technology
+                     * letters. 27 (5), 334-339 (2000).
+                     * https://doi.org/10.1002/1098-2760(20001205)27:5%3C334::AID-MOP14%3E3.0.CO;2-A
+                     * Our implementation is based on a more detailed description in section
+                     * 7.9 of the book A. Taflove, S.C. Hagness. Computational
+                     * Electrodynamics. The Finite-Difference Time-Domain Method. Third
+                     * Edition. Artech house, Boston (2005), referred to as
+                     * [Taflove, Hagness].
+                     *
+                     * @tparam T_CurlE functor to compute curl of E
+                     * @tparam T_CurlB functor to compute curl of B
+                     */
+                    template<typename T_CurlE, typename T_CurlB>
+                    class Solver
+                    {
+                    public:
+                        using CurlE = T_CurlE;
+                        using CurlB = T_CurlB;
+
+                        Solver(MappingDesc const cellDescription) : cellDescription{cellDescription}
+                        {
+                            initParameters();
+                            initFields();
+                        }
+
+                        //! Get a reference to field E
+                        picongpu::FieldE& getFieldE()
+                        {
+                            return *(fieldE.get());
+                        }
+
+                        //! Get a reference to field B
+                        picongpu::FieldB& getFieldB()
+                        {
+                            return *(fieldB.get());
+                        }
+
+                        /** Propagate B values in the given area by the first half of a time step
+                         *
+                         * This operation propagates grid values of field B by dt/2 and prepares the internal state of
+                         * convolutional components so that a later call to updateBSecondHalf() completes the update.
+                         *
+                         * @tparam T_Area area to apply updates to, the curl must be applicable to all points;
+                         * normally CORE, BORDER, or CORE + BORDER
+                         *
+                         * @param currentStep index of the current time iteration
+                         */
+                        template<uint32_t T_Area>
+                        void updateBFirstHalf(uint32_t const currentStep)
+                        {
+                            updateBHalf<T_Area>(currentStep, true);
+                        }
+
+                        /** Propagate B values in the given area by the second half of a time step
+                         *
+                         * This operation propagates grid values of field B by dt/2 and relies on the internal state of
+                         * convolutional components set up by a prior call to updateBFirstHalf(). After this call is
+                         * completed, the convolutional components are in the state to call updateBFirstHalf() for the
+                         * next time step.
+                         *
+                         * @tparam T_Area area to apply updates to, the curl must be applicable to all points;
+                         * normally CORE, BORDER, or CORE + BORDER
+                         *
+                         * @param currentStep index of the current time iteration
+                         */
+                        template<uint32_t T_Area>
+                        void updateBSecondHalf(uint32_t const currentStep)
+                        {
+                            updateBHalf<T_Area>(currentStep, false);
+                        }
+
+                        /** Propagate E values in the given area by a time step.
+                         *
+                         * @tparam T_Area area to apply updates to, the curl must be
+                         * applicable to all points; normally CORE, BORDER, or CORE + BORDER
+                         *
+                         * @param currentStep index of the current time iteration
+                         */
+                        template<uint32_t T_Area>
+                        void updateE(uint32_t currentStep)
+                        {
+                            /* Courant-Friedrichs-Lewy (CFL) condition for the Yee field solver.
+                             *
+                             * The `sizeof(T_CurlE*) != 0` term adds a template dependency to the expression;
+                             * it is always true and only defers the evaluation of the check.
+                             */
+                            PMACC_CASSERT_MSG(
+                                Courant_Friedrichs_Levy_condition_failure____check_your_grid_param_file,
+                                (SPEED_OF_LIGHT * SPEED_OF_LIGHT * DELTA_T * DELTA_T * INV_CELL2_SUM) <= 1.0
+                                    && sizeof(T_CurlE*) != 0);
+
+                            constexpr auto numWorkers = getNumWorkers();
+                            using Kernel = yeePML::KernelUpdateE<numWorkers, BlockDescription<CurlB>>;
+                            AreaMapper<T_Area> mapper{cellDescription};
+                            // Note: optimization considerations same as in updateBHalf( ).
+                            PMACC_KERNEL(Kernel{})
+                            (mapper.getGridDim(), numWorkers)(
+                                mapper,
+                                getLocalParameters(mapper, currentStep),
+                                CurlB(),
+                                fieldB->getDeviceDataBox(),
+                                fieldE->getDeviceDataBox(),
+                                psiE->getDeviceOuterLayerBox());
+                        }
+
+                    private:
+                        // Helper types for configuring kernels
+                        template<typename T_Curl>
+                        using BlockDescription = pmacc::SuperCellDescription<
+                            SuperCellSize,
+                            typename traits::GetLowerMargin<T_Curl>::type,
+                            typename traits::GetUpperMargin<T_Curl>::type>;
+                        template<uint32_t T_Area>
+                        using AreaMapper = pmacc::AreaMapping<T_Area, MappingDesc>;
+
+                        // Yee solver data
+                        std::shared_ptr<picongpu::FieldE> fieldE;
+                        std::shared_ptr<picongpu::FieldB> fieldB;
+                        MappingDesc cellDescription;
+
+                        /* PML convolutional field data, defined as in [Taflove, Hagness],
+                         * eq. (7.105a,b), and similar for other components
+                         */
+                        std::shared_ptr<yeePML::FieldE> psiE;
+                        std::shared_ptr<yeePML::FieldB> psiB;
+
+                        /** Thickness in terms of the global domain.
+                         *
+                         * We store only global thickness, as the local one can change
+                         * during the simulation and so has to be recomputed for each time
+                         * step. PML must be fully contained in a single layer of local
+                         * domains near the global simulation area boundary. (Note that
+                         * the domains of this layer might be changing, e.g. due to moving
+                         * window.) There are no other limitations on PML thickness. In
+                         * particular, it is independent of the BORDER area size.
+                         */
+                        Thickness globalSize;
+                        Parameters parameters;
+
+                        /** Propagate B values in the given area by half a time step
+                         *
+                         * @tparam T_Area area to apply updates to, the curl must be
+                         * applicable to all points; normally CORE, BORDER, or CORE + BORDER
+                         *
+                         * @param currentStep index of the current time iteration
+                         * @param updatePsiB whether convolutional magnetic fields need to be updated, or are
+                         * up-to-date
+                         */
+                        template<uint32_t T_Area>
+                        void updateBHalf(uint32_t const currentStep, bool const updatePsiB)
+                        {
+                            constexpr auto numWorkers = getNumWorkers();
+                            using Kernel = yeePML::KernelUpdateBHalf<numWorkers, BlockDescription<CurlE>>;
+                            AreaMapper<T_Area> mapper{cellDescription};
+                            /* Note: here it is possible to first check if PML is enabled
+                             * in the local domain at all, and otherwise optimize by calling
+                             * the normal Yee update kernel. We do not do that, as this
+                             * would be fragile with respect to future separation of PML
+                             * into a plugin.
+                             */
+                            PMACC_KERNEL(Kernel{})
+                            (mapper.getGridDim(), numWorkers)(
+                                mapper,
+                                getLocalParameters(mapper, currentStep),
+                                CurlE(),
+                                fieldE->getDeviceDataBox(),
+                                updatePsiB,
+                                fieldB->getDeviceDataBox(),
+                                psiB->getDeviceOuterLayerBox());
+                        }
+
+                        void initParameters()
+                        {
+                            namespace pml = maxwellSolver::Pml;
+
+                            globalSize = getGlobalThickness();
+                            parameters.sigmaKappaGradingOrder = pml::SIGMA_KAPPA_GRADING_ORDER;
+                            parameters.alphaGradingOrder = pml::ALPHA_GRADING_ORDER;
+                            for(uint32_t dim = 0u; dim < simDim; dim++)
+                            {
+                                parameters.normalizedSigmaMax[dim] = pml::NORMALIZED_SIGMA_MAX[dim];
+                                parameters.kappaMax[dim] = pml::KAPPA_MAX[dim];
+                                parameters.normalizedAlphaMax[dim] = pml::NORMALIZED_ALPHA_MAX[dim];
+                            }
+                        }
+
+                        Thickness getGlobalThickness() const
+                        {
+                            Thickness globalThickness;
+                            for(uint32_t axis = 0u; axis < simDim; axis++)
+                                for(auto direction = 0; direction < 2; direction++)
+                                    globalThickness(axis, direction) = absorber::getGlobalThickness()(axis, direction);
+                            return globalThickness;
+                        }
+
+                        void initFields()
+                        {
+                            /* Split fields are created here (and not with normal E and B)
+                             * in order to not waste memory in case PML is not used.
+                             */
+                            DataConnector& dc = Environment<>::get().DataConnector();
+                            fieldE = dc.get<picongpu::FieldE>(picongpu::FieldE::getName(), true);
+                            fieldB = dc.get<picongpu::FieldB>(picongpu::FieldB::getName(), true);
+                            psiE = std::make_shared<yeePML::FieldE>(cellDescription, globalSize);
+                            psiB = std::make_shared<yeePML::FieldB>(cellDescription, globalSize);
+                            dc.share(psiE);
+                            dc.share(psiB);
+                        }
+
+                        template<uint32_t T_Area>
+                        yeePML::LocalParameters getLocalParameters(
+                            AreaMapper<T_Area>& mapper,
+                            uint32_t const currentStep) const
+                        {
+                            Thickness localThickness = getLocalThickness(currentStep);
+                            checkLocalThickness(localThickness);
+                            return yeePML::LocalParameters(
+                                parameters,
+                                localThickness,
+                                mapper.getGridSuperCells() * SuperCellSize::toRT(),
+                                mapper.getGuardingSuperCells() * SuperCellSize::toRT());
+                        }
+
+                        /**
+                         * Get PML thickness for the local domain at the current time step.
+                         * It depends on the current step because of the moving window.
+                         */
+                        Thickness getLocalThickness(uint32_t const currentStep) const
+                        {
+                            /* The logic of the following checks is the same as in
+                             * absorber::ExponentialDamping::run( ); to disable the absorber
+                             * at a border we set the corresponding thickness to 0.
+                             */
+                            auto& movingWindow = MovingWindow::getInstance();
+                            auto const numSlides = movingWindow.getSlideCounter(currentStep);
+                            auto const numExchanges = NumberOfExchanges<simDim>::value;
+                            auto const communicationMask
+                                = Environment<simDim>::get().GridController().getCommunicationMask();
+                            Thickness localThickness = globalSize;
+                            for(uint32_t exchange = 1u; exchange < numExchanges; ++exchange)
+                            {
+                                /* Here we are only interested in the positive and negative
+                                 * directions for x, y, z axes and not the "diagonal" ones.
+                                 * So skip other directions except left, right, top, bottom,
+                                 * back, front
+                                 */
+                                if(FRONT % exchange != 0)
+                                    continue;
+
+                                // Transform exchange into a pair of axis and direction
+                                uint32_t axis = 0;
+                                if(exchange >= BOTTOM && exchange <= TOP)
+                                    axis = 1;
+                                if(exchange >= BACK)
+                                    axis = 2;
+                                uint32_t direction = exchange % 2;
+
+                                // No PML at the borders between two local domains
+                                bool hasNeighbour = communicationMask.isSet(exchange);
+                                if(hasNeighbour)
+                                    localThickness(axis, direction) = 0;
+
+                                // Disable PML during laser initialization
+                                if(fields::laserProfiles::Selected::initPlaneY == 0)
+                                {
+                                    bool isLaserInitializationOver
+                                        = (currentStep * DELTA_T) >= fields::laserProfiles::Selected::INIT_TIME;
+                                    if(numSlides == 0 && !isLaserInitializationOver && exchange == TOP)
+                                        localThickness(axis, direction) = 0;
+                                }
+
+                                // Disable PML at the far side of the moving window
+                                if(movingWindow.isSlidingWindowActive(currentStep) && exchange == BOTTOM)
+                                    localThickness(axis, direction) = 0;
+                            }
+                            return localThickness;
+                        }
+
+                        //! Verify that PML fits the local domain
+                        void checkLocalThickness(Thickness const localThickness) const
+                        {
+                            auto const localDomain = Environment<simDim>::get().SubGrid().getLocalDomain();
+                            auto const localPMLSize = localThickness.negativeBorder + localThickness.positiveBorder;
+                            auto pmlFitsDomain = true;
+                            for(uint32_t dim = 0u; dim < simDim; dim++)
+                                if(localPMLSize[dim] > localDomain.size[dim])
+                                    pmlFitsDomain = false;
+                            if(!pmlFitsDomain)
+                                throw std::out_of_range("Requested PML size exceeds the local domain");
+                        }
+
+                        //! Get number of workers for kernels
+                        static constexpr uint32_t getNumWorkers()
+                        {
+                            return pmacc::traits::GetNumWorkers<
+                                pmacc::math::CT::volume<SuperCellSize>::type::value>::value;
+                        }
+                    };
+
+                } // namespace detail
+            } // namespace yeePML
+
+            /** Yee field solver with perfectly matched layer (PML) absorber
              *
-             * @tparam T_Area area to apply updates to, the curl must be
-             * applicable to all points; normally CORE, BORDER, or CORE + BORDER
+             * Absorption is done using convolutional perfectly matched layer (CPML),
+             * implemented according to [Taflove, Hagness].
              *
-             * @param currentStep index of the current time iteration
-             */
-            template< uint32_t T_Area >
-            void updateBHalf( uint32_t const currentStep )
-            {
-                constexpr auto numWorkers = getNumWorkers( );
-                using Kernel = yeePML::KernelUpdateBHalf<
-                    numWorkers,
-                    BlockDescription< CurlE >
-                >;
-                AreaMapper< T_Area > mapper{ cellDescription };
-                /* Note: here it is possible to first check if PML is enabled
-                 * in the local domain at all, and otherwise optimize by calling
-                 * the normal Yee update kernel. We do not do that, as this
-                 * would be fragile with respect to future separation of PML
-                 * into a plugin.
-                 */
-                PMACC_KERNEL( Kernel{ } )
-                    ( mapper.getGridDim( ), numWorkers )(
-                        mapper,
-                        getLocalParameters( mapper, currentStep ),
-                        CurlE( ),
-                        fieldE->getDeviceDataBox( ),
-                        fieldB->getDeviceDataBox( ),
-                        psiB->getDeviceOuterLayerBox( )
-                    );
-            }
-
-            /** Propagate E values in the given area by a time step.
+             * This class template is a public interface to be used, e.g. in .param
+             * files and is compatible with other field solvers. Parameters of PML
+             * are taken from pml.param, pml.unitless.
              *
-             * @tparam T_Area area to apply updates to, the curl must be
-             * applicable to all points; normally CORE, BORDER, or CORE + BORDER
+             * Enabling this solver results in more memory being used on a device:
+             * 12 additional scalar field values per each grid cell of a local domain.
+             * Another limitation is incomplete persistence across checkpoints: the
+             * additional values are not saved and so are set to 0 after loading a
+             * checkpoint (which in some cases still provides proper absorption, but
+             * this is not guaranteed and results will differ due to checkpointing).
              *
-             * @param currentStep index of the current time iteration
-             */
-            template< uint32_t T_Area >
-            void updateE( uint32_t currentStep )
-            {
-                /* Courant-Friedrichs-Levy-Condition for Yee Field Solver: */
-                PMACC_CASSERT_MSG(Courant_Friedrichs_Levy_condition_failure____check_your_grid_param_file,
-                    (SPEED_OF_LIGHT*SPEED_OF_LIGHT*DELTA_T*DELTA_T*INV_CELL2_SUM)<=1.0);
-
-                constexpr auto numWorkers = getNumWorkers( );
-                using Kernel = yeePML::KernelUpdateE<
-                    numWorkers,
-                    BlockDescription< CurlB >
-                >;
-                AreaMapper< T_Area > mapper{ cellDescription };
-                // Note: optimization considerations same as in updateBHalf( ).
-                PMACC_KERNEL( Kernel{ } )
-                    ( mapper.getGridDim( ), numWorkers )(
-                        mapper,
-                        getLocalParameters( mapper, currentStep ),
-                        CurlB( ),
-                        fieldB->getDeviceDataBox( ),
-                        fieldE->getDeviceDataBox( ),
-                        psiE->getDeviceOuterLayerBox( )
-                    );
-            }
-
-        private:
-
-            // Helper types for configuring kernels
-            template< typename T_Curl >
-            using BlockDescription = pmacc::SuperCellDescription<
-                SuperCellSize,
-                typename T_Curl::LowerMargin,
-                typename T_Curl::UpperMargin
-            >;
-            template< uint32_t T_Area >
-            using AreaMapper = pmacc::AreaMapping<
-                T_Area,
-                MappingDesc
-            >;
-
-            // Yee solver data
-            std::shared_ptr< picongpu::FieldE > fieldE;
-            std::shared_ptr< picongpu::FieldB > fieldB;
-            MappingDesc cellDescription;
-
-            /* PML convolutional field data, defined as in [Taflove, Hagness],
-             * eq. (7.105a,b), and similar for other components
-             */
-            std::shared_ptr< yeePML::FieldE > psiE;
-            std::shared_ptr< yeePML::FieldB > psiB;
-
-            /** Thickness in terms of the global domain.
+             * This class template implements the general flow of CORE and BORDER field
+             * updates and communication. The numerical schemes to perform the updates
+             * are implemented by yeePML::detail::Solver.
              *
-             * We store only global thickness, as the local one can change
-             * during the simulation and so has to be recomputed for each time
-             * step. PML must be fully contained in a single layer of local
-             * domains near the global simulation area boundary. (Note that
-             * the domains of this layer might be changing, e.g. due to moving
-             * window.) There are no other limitations on PML thickness. In
-             * particular, it is independent of the BORDER area size.
+             * @tparam T_CurrentInterpolation current interpolation functor
+             * @tparam T_CurlE functor to compute curl of E
+             * @tparam T_CurlB functor to compute curl of B
              */
-            Thickness globalSize;
-            Parameters parameters;
-
-            void initParameters( )
+            template<typename T_CurrentInterpolation, typename T_CurlE, typename T_CurlB>
+            class YeePML
             {
-                globalSize = getGlobalThickness( );
-                parameters.sigmaKappaGradingOrder = SIGMA_KAPPA_GRADING_ORDER;
-                parameters.alphaGradingOrder = ALPHA_GRADING_ORDER;
-                for( uint32_t dim = 0u; dim < simDim; dim++ )
+            public:
+                // Types required by field solver interface
+                using CellType = cellType::Yee;
+                using CurrentInterpolation = T_CurrentInterpolation;
+                using CurlE = T_CurlE;
+                using CurlB = T_CurlB;
+
+                YeePML(MappingDesc const cellDescription) : solver(cellDescription)
                 {
-                    parameters.normalizedSigmaMax[ dim ] = NORMALIZED_SIGMA_MAX[ dim ];
-                    parameters.kappaMax[ dim ] = KAPPA_MAX[ dim ];
-                    parameters.normalizedAlphaMax[ dim ] = NORMALIZED_ALPHA_MAX[ dim ];
                 }
-            }
 
-            Thickness getGlobalThickness( ) const
-            {
-                Thickness globalThickness;
-                for( uint32_t axis = 0u; axis < simDim; axis++ )
-                    for( auto direction = 0; direction < 2; direction++ )
-                        globalThickness( axis, direction ) = absorber::numCells[ axis ][ direction ];
-                return globalThickness;
-            }
-
-            void initFields( )
-            {
-                /* Split fields are created here (and not with normal E and B)
-                 * in order to not waste memory in case PML is not used.
+                /** Perform the first part of E and B propagation by a time step.
+                 *
+                 * Together with update_afterCurrent( ) forms the full propagation.
+                 *
+                 * @param currentStep index of the current time iteration
                  */
-                DataConnector & dc = Environment<>::get( ).DataConnector( );
-                fieldE = dc.get< picongpu::FieldE >(
-                    picongpu::FieldE::getName( ),
-                    true
-                );
-                fieldB = dc.get< picongpu::FieldB >(
-                    picongpu::FieldB::getName( ),
-                    true
-                );
-                psiE = std::make_shared< yeePML::FieldE >(
-                    cellDescription,
-                    globalSize
-                );
-                psiB = std::make_shared< yeePML::FieldB >(
-                    cellDescription,
-                    globalSize
-                );
-                dc.share( psiE );
-                dc.share( psiB );
-            }
-
-            template< uint32_t T_Area >
-            yeePML::LocalParameters getLocalParameters(
-                AreaMapper< T_Area > & mapper,
-                uint32_t const currentStep
-            ) const
-            {
-                Thickness localThickness = getLocalThickness( currentStep );
-                checkLocalThickness( localThickness );
-                return yeePML::LocalParameters(
-                    parameters,
-                    localThickness,
-                    mapper.getGridSuperCells( ) * SuperCellSize::toRT( ),
-                    mapper.getGuardingSuperCells( ) * SuperCellSize::toRT( )
-                );
-            }
-
-            /**
-             * Get PML thickness for the local domain at the current time step.
-             * It depends on the current step because of the moving window.
-             */
-            Thickness getLocalThickness( uint32_t const currentStep ) const
-            {
-                /* The logic of the following checks is the same as in
-                 * absorber::ExponentialDamping::run( ), to disable the absorber
-                 * at a border we set the corresponding thickness to 0.
-                 */
-                auto & movingWindow = MovingWindow::getInstance( );
-                auto const numSlides = movingWindow.getSlideCounter( currentStep );
-                auto const numExchanges = NumberOfExchanges< simDim >::value;
-                auto const communicationMask = Environment< simDim >::get( ).GridController( ).getCommunicationMask( );
-                Thickness localThickness = globalSize;
-                for( uint32_t exchange = 1u; exchange < numExchanges; ++exchange )
+                void update_beforeCurrent(uint32_t const currentStep)
                 {
-                    /* Here we are only interested in the positive and negative
-                     * directions for x, y, z axes and not the "diagonal" ones.
-                     * So skip other directions except left, right, top, bottom,
-                     * back, front
+                    /* These steps are the same as in the Yee solver, PML updates are done as part of methods of
+                     * solver. Note that here we do the second half of updating B, thus completing the first half
+                     * started in a call to update_afterCurrent() at the previous time step. This splitting of B update
+                     * is standard for Yee-type field solvers in PIC codes due to particle pushers normally requiring E
+                     * and B values defined at the same time while the field solver operates with time-staggered
+                     * fields. However, while the standard Yee solver in vacuum is linear in that two consecutive
+                     * updates by dt/2 are equal to one update by dt, this is not true for the convolutional field
+                     * updates in PML. Thus, for PML we have to distinguish between the updates by dt/2 by introducing
+                     * first and second halves of the update. This distinction only concerns the convolutional field B
+                     * data used inside the PML, and not the full fields used by the rest of the code. In the very
+                     * first time step of a simulation we start with the second half right away, but this is no
+                     * problem, since the only meaningful initial conditions in the PML area are zero for the
+                     * to-be-absorbed components.
                      */
-                    if( FRONT % exchange != 0 )
-                        continue;
-
-                    // Transform exchange into a pair of axis and direction
-                    uint32_t axis = 0;
-                    if( exchange >= BOTTOM && exchange <= TOP )
-                        axis = 1;
-                    if( exchange >= BACK )
-                        axis = 2;
-                    uint32_t direction = exchange % 2;
-
-                    // No PML at the borders between two local domains
-                    bool hasNeighbour = communicationMask.isSet( exchange );
-                    if( hasNeighbour )
-                        localThickness( axis, direction ) = 0;
-
-                    // Disable PML during laser initialization
-                    if( fields::laserProfiles::Selected::initPlaneY == 0 )
-                    {
-                        bool isLaserInitializationOver =
-                            (currentStep * DELTA_T) >= fields::laserProfiles::Selected::INIT_TIME;
-                        if( numSlides == 0 && !isLaserInitializationOver && exchange == TOP )
-                            localThickness( axis, direction ) = 0;
-                    }
-
-                    // Disable PML at the far side of the moving window
-                    if( movingWindow.isSlidingWindowActive( currentStep ) && exchange == BOTTOM )
-                        localThickness( axis, direction ) = 0;
-                }
-                return localThickness;
-            }
+                    solver.template updateBSecondHalf<CORE + BORDER>(currentStep);
+                    auto& fieldB = solver.getFieldB();
+                    EventTask eRfieldB = fieldB.asyncCommunication(__getTransactionEvent());
 
-            //! Verify that PML fits the local domain
-            void checkLocalThickness( Thickness const localThickness ) const
-            {
-                auto const localDomain = Environment< simDim >::get( ).SubGrid( ).getLocalDomain( );
-                auto const localPMLSize = localThickness.negativeBorder + localThickness.positiveBorder;
-                auto pmlFitsDomain = true;
-                for( uint32_t dim = 0u; dim < simDim; dim++ )
-                    if( localPMLSize[ dim ] > localDomain.size[ dim ] )
-                        pmlFitsDomain = false;
-                if( !pmlFitsDomain )
-                    throw std::out_of_range( "Requested PML size exceeds the local domain" );
-            }
-
-            //! Get number of workers for kernels
-            static constexpr uint32_t getNumWorkers( )
-            {
-                return pmacc::traits::GetNumWorkers<
-                    pmacc::math::CT::volume< SuperCellSize >::type::value
-                >::value;
-            }
-
-        };
-
-    } // namespace detail
-    } // namespace yeePML
-
-    /** Yee field solver with perfectly matched layer (PML) absorber
-     *
-     * Absorption is done using convolutional perfectly matched layer (CPML),
-     * implemented according to [Taflove, Hagness].
-     *
-     * This class template is a public interface to be used, e.g. in .param
-     * files and is compatible with other field solvers. Parameters of PML
-     * are taken from pml.param, pml.unitless.
-     *
-     * Enabling this solver results in more memory being used on a device:
-     * 12 additional scalar field values per each grid cell of a local domain.
-     * Another limitation is not full persistency with checkpointing: the
-     * additional values are not saved and so set to 0 after loading a
-     * checkpoint (which in some cases still provides proper absorption, but
-     * it is not guaranteed and results will differ due to checkpointing).
-     *
-     * This class template implements the general flow of CORE and BORDER field
-     * updates and communication. The numerical schemes to perform the updates
-     * are implemented by yeePML::detail::Solver.
-     *
-     * @tparam T_CurrentInterpolation current interpolation functor
-     * @tparam T_CurlE functor to compute curl of E
-     * @tparam T_CurlB functor to compute curl of B
-     */
-    template<
-        typename T_CurrentInterpolation,
-        typename T_CurlE,
-        typename T_CurlB
-    >
-    class YeePML
-    {
-    public:
-
-        // Types required by field solver interface
-        using CellType = cellType::Yee;
-        using CurrentInterpolation = T_CurrentInterpolation;
-        using CurlE = T_CurlE;
-        using CurlB = T_CurlB;
-
-        YeePML( MappingDesc const cellDescription ) :
-            solver( cellDescription )
-        {
-        }
-
-        /** Perform the first part of E and B propagation by a time step.
-         *
-         * Together with update_afterCurrent( ) forms the full propagation.
-         *
-         * @param currentStep index of the current time iteration
-         */
-        void update_beforeCurrent( uint32_t const currentStep )
-        {
-            /* These steps are the same as in the Yee solver,
-             * PML updates are done as part of solver.updateE( ),
-             * solver.updateBHalf( )
-             */
-            solver.template updateBHalf < CORE + BORDER >( currentStep );
-            auto & fieldB = solver.getFieldB( );
-            EventTask eRfieldB = fieldB.asyncCommunication( __getTransactionEvent( ) );
-
-            solver.template updateE< CORE >( currentStep );
-            __setTransactionEvent( eRfieldB );
-            solver.template updateE< BORDER >( currentStep );
-        }
-
-        /** Perform the last part of E and B propagation by a time step
-         *
-         * Together with update_beforeCurrent( ) forms the full propagation.
-         *
-         * @param currentStep index of the current time iteration
-         */
-        void update_afterCurrent( uint32_t const currentStep )
-        {
-            /* These steps are the same as in the Yee solver,
-             * except the Fabsorber::ExponentialDamping::run( ) is not called,
-             * PML updates are done as part of solver.updateBHalf( ).
-             */
-            if( laserProfiles::Selected::INIT_TIME > 0.0_X )
-                LaserPhysics{ }( currentStep );
-
-            auto & fieldE = solver.getFieldE( );
-            EventTask eRfieldE = fieldE.asyncCommunication( __getTransactionEvent( ) );
+                    solver.template updateE<CORE>(currentStep);
+                    __setTransactionEvent(eRfieldB);
+                    solver.template updateE<BORDER>(currentStep);
+                }
 
-            solver.template updateBHalf< CORE >( currentStep );
-            __setTransactionEvent( eRfieldE );
-            solver.template updateBHalf< BORDER >( currentStep );
+                /** Perform the last part of E and B propagation by a time step
+                 *
+                 * Together with update_beforeCurrent( ) forms the full propagation.
+                 *
+                 * @param currentStep index of the current time iteration
+                 */
+                void update_afterCurrent(uint32_t const currentStep)
+                {
+                    /* These steps are the same as in the Yee solver, except the absorber::ExponentialDamping::run( )
+                     * is not called, PML updates are done as part of calls to methods of solver. As explained in more
+                     * detail in comments inside update_beforeCurrent(), here we start a new step of updating B in
+                     * terms of the time-staggered Yee grid. And so this is the first half of B update, to be completed
+                     * in a call to update_beforeCurrent() on the next time step.
+                     */
+                    if(laserProfiles::Selected::INIT_TIME > 0.0_X)
+                        LaserPhysics{}(currentStep);
 
-            auto & fieldB = solver.getFieldB( );
-            EventTask eRfieldB = fieldB.asyncCommunication( __getTransactionEvent( ) );
-            __setTransactionEvent( eRfieldB );
-        }
+                    auto& fieldE = solver.getFieldE();
+                    EventTask eRfieldE = fieldE.asyncCommunication(__getTransactionEvent());
 
-        static pmacc::traits::StringProperty getStringProperties( )
-        {
-            pmacc::traits::StringProperty propList( "name", "YeePML" );
-            return propList;
-        }
+                    solver.template updateBFirstHalf<CORE>(currentStep);
+                    __setTransactionEvent(eRfieldE);
+                    solver.template updateBFirstHalf<BORDER>(currentStep);
 
-    private:
+                    auto& fieldB = solver.getFieldB();
+                    EventTask eRfieldB = fieldB.asyncCommunication(__getTransactionEvent());
+                    __setTransactionEvent(eRfieldB);
+                }
 
-        yeePML::detail::Solver< CurlE, CurlB > solver;
+                static pmacc::traits::StringProperty getStringProperties()
+                {
+                    pmacc::traits::StringProperty propList("name", "Yee");
+                    return propList;
+                }
 
-    };
+            private:
+                yeePML::detail::Solver<CurlE, CurlB> solver;
+            };
 
-} // namespace maxwellSolver
-} // namespace fields
+        } // namespace maxwellSolver
+    } // namespace fields
 } // namespace picongpu
 
 #include "picongpu/fields/MaxwellSolver/YeePML/Field.tpp"
diff --git a/include/picongpu/fields/MaxwellSolver/YeePML/YeePML.kernel b/include/picongpu/fields/MaxwellSolver/YeePML/YeePML.kernel
index dcc5fff27b..d04eae9614 100644
--- a/include/picongpu/fields/MaxwellSolver/YeePML/YeePML.kernel
+++ b/include/picongpu/fields/MaxwellSolver/YeePML/YeePML.kernel
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Heiko Burau, Rene Widera, Marco Garten,
+/* Copyright 2013-2021 Axel Huebl, Heiko Burau, Rene Widera, Marco Garten,
  *                     Sergei Bastrakov
  *
  * This file is part of PIConGPU.
@@ -33,551 +33,434 @@
 
 namespace picongpu
 {
-namespace fields
-{
-namespace maxwellSolver
-{
-namespace yeePML
-{
-
-    //! Parameters of PML for the local domain
-    struct LocalParameters : public Parameters
-    {
-        /** PML size in cells, stored as floats to avoid type casts later,
-         *  negative and positive borders defined the same way as for Thickness
-         */
-        floatD_X const negativeBorderSize;
-        floatD_X const positiveBorderSize;
-
-        //! Local domain characteristics, including guard cells
-        DataSpace< simDim > const numLocalDomainCells;
-        DataSpace< simDim > const numGuardCells;
-
-        LocalParameters(
-            Parameters const parameters,
-            Thickness const localThickness,
-            DataSpace< simDim > const numLocalDomainCells,
-            DataSpace< simDim > const numGuardCells
-        ):
-            Parameters( parameters ),
-            negativeBorderSize( precisionCast< float_X >( localThickness.negativeBorder ) ),
-            positiveBorderSize( precisionCast< float_X >( localThickness.positiveBorder ) ),
-            numLocalDomainCells( numLocalDomainCells ),
-            numGuardCells( numGuardCells )
-        {
-        }
-    };
-
-    namespace detail
+    namespace fields
     {
-
-        /** Get relative depth of a given cell for 1D.
-         *
-         * This function operates with a 1D slice of domain and PML.
-         * index == numGuardCells corresponds to the external negative PML
-         * border, and index == numLocalDomainCells - numGuardCells - 1
-         * corresponds to the external positive PML border.
-         * For the internal area result is 0, for points in PML the depth
-         * scales from 0 at the internal border to 1 at the external border.
-         * Index and local domain size include the guard.
-         *
-         * @param cellIdx cell index including the guard, can be fractional,
-         * e.g. for halves of cells
-         * @param numPMLCellsNegative number of PML cells at the negative border
-         * @param numPMLCellsPositive number of PML cells at the positive border
-         * @param numLocalDomainCells number of cells of the local domain
-         * including the guard
-         * @param numGuardCells number of guard cells at each side
-         * @return relative depth, value between 0 and 1
-         */
-        DINLINE float_X getRelativeDepth(
-            float_X const cellIdx,
-            float_X const numPMLCellsNegative,
-            float_X const numPMLCellsPositive,
-            uint32_t const numLocalDomainCells,
-            uint32_t const numGuardCells
-        )
+        namespace maxwellSolver
         {
-            auto zeroBasedIdx = cellIdx - numGuardCells;
-            auto const isInLeftPML = ( zeroBasedIdx < numPMLCellsNegative );
-            if( isInLeftPML )
-                return ( numPMLCellsNegative - zeroBasedIdx ) / numPMLCellsNegative;
-            else
+            namespace yeePML
             {
-                auto zeroBasedRightPMLStart = numLocalDomainCells -
-                    2 * numGuardCells - numPMLCellsPositive;
-                auto const isInRightPML = ( zeroBasedIdx > zeroBasedRightPMLStart );
-                if( isInRightPML )
-                    return ( zeroBasedIdx - zeroBasedRightPMLStart ) / numPMLCellsPositive;
-            }
-            return 0._X;
-        }
-
-        /** Get absorption parameters: sigma, kappa and alpha at a given cell
-         *
-         * Apply polynomial grading, as described in pml.param.
-         *
-         * @param cellIdx cell index including the guard, can be fractional,
-         * e.g. for halves of cells
-         * @param parameters parameters of PML in the local domain
-         * @param[out] normalizedSigma value of normalized sigma at the cell
-         * @param[out] kappa value of normalized kappa at the cell
-         * @param[out] normalizedAlpha value of normalized alpha at the cell
-         */
-        DINLINE void getAbsorptionParameters(
-            floatD_X const cellIdx,
-            LocalParameters const parameters,
-            float3_X & normalizedSigma,
-            float3_X & kappa,
-            float3_X & normalizedAlpha
-        )
-        {
-            // initialize with values for non-PML area
-            normalizedSigma = float3_X::create( 0._X );
-            kappa = float3_X::create( 1._X );
-            normalizedAlpha = float3_X::create( 0._X );
-            for( uint32_t dim = 0u; dim < simDim; dim++ )
-            {
-                auto const relativeDepth = getRelativeDepth(
-                    cellIdx[ dim ],
-                    parameters.negativeBorderSize[ dim ],
-                    parameters.positiveBorderSize[ dim ],
-                    parameters.numLocalDomainCells[ dim ],
-                    parameters.numGuardCells[ dim ]
-                );
-                // Since normally most points are not in PML, avoid costly
-                // computing in this case
-                if( relativeDepth != 0._X )
+                //! Parameters of PML for the local domain
+                struct LocalParameters : public Parameters
                 {
-                    /* Grading done according to [Taflove, Hagness], eq. (7.60a, b).
-                     * Note: here we use a general expression, it is possible
-                     * to specialize for sigmaKappaGradingOrder = 2, 3, or 4,
-                     * but currently seems not worth it.
-                     */
-                    auto const sigmaKappaGradingCoeff = math::pow(
-                        relativeDepth,
-                        parameters.sigmaKappaGradingOrder
-                    );
-                    normalizedSigma[ dim ] = parameters.normalizedSigmaMax[ dim ] *
-                        sigmaKappaGradingCoeff;
-                    kappa[ dim ] = 1._X + ( parameters.kappaMax[ dim ] - 1._X ) *
-                        sigmaKappaGradingCoeff;
-                    /* Grading done according to [Taflove, Hagness], eq. (7.79),
-                     * note that this code is only correct when relativeDepth != 0
+                    /** PML size in cells, stored as floats to avoid type casts later,
+                     *  negative and positive borders defined the same way as for Thickness
                      */
-                    auto const alphaGradingCoeff = math::pow(
-                        1._X - relativeDepth,
-                        parameters.alphaGradingOrder
-                    );
-                    normalizedAlpha[ dim ] = parameters.normalizedAlphaMax[ dim ] *
-                        alphaGradingCoeff;
-                }
-            }
-        }
-
-        //! Coefficients for E or B updates at a particular point
-        struct Coefficients
-        {
-            //! Coordinate stretching coefficient
-            float3_X kappa;
-
-            //! Damping coefficient, [Taflove, Hagness], eq. (7.102)
-            float3_X b;
-
-            //! Spatial difference coefficient, [Taflove, Hagness], eq. (7.99)
-            float3_X c;
-        };
-
-        /** Get coefficients for E or B updates at a given cell
-         *
-         * Apply polynomial grading, as described in pml.param.
-         * Due to normalizations, the same way of computing coefficients applies
-         * to E and B updates.
-         *
-         * @param cellIdx cell index including the guard, can be fractional,
-         * e.g. for halves of cells
-         * @param parameters parameters of PML in the local domain
-         * @param dt value of time step to propagate by
-         * @result an instance of Coefficients with computed values
-         */
-        DINLINE Coefficients getCoefficients(
-            floatD_X const cellIdx,
-            LocalParameters const parameters,
-            float_X const dt
-        )
-        {
-            Coefficients coeffs;
-            float3_X normalizedSigma, normalizedAlpha;
-            getAbsorptionParameters(
-                cellIdx,
-                parameters,
-                normalizedSigma,
-                coeffs.kappa,
-                normalizedAlpha
-            );
-
-            /* [Taflove, Hagness], eq. (7.102), normalizedSigma and
-             * normalizedAlpha are already divided by eps0
-             */
-            coeffs.b = math::exp(
-                -( normalizedSigma / coeffs.kappa + normalizedAlpha ) * dt
-            );
-            /* [Taflove, Hagness], eq. (7.99), in our case both the numerator
-             * and the denominator are equally normalized
-             */
-            coeffs.c = float3_X::create( 0._X );
-            for ( uint32_t dim = 0u; dim < 3; dim++ )
-            {
-                auto const denominator = coeffs.kappa[ dim ] *
-                    ( normalizedSigma[ dim ] + normalizedAlpha[ dim ] *
-                    coeffs.kappa[ dim ] );
-                // Avoid the 0 / 0 uncertainty, in that case keep the value 0
-                if( denominator )
-                    coeffs.c[ dim ] = normalizedSigma[ dim ] *
-                        ( coeffs.b[ dim ] - 1.0_X ) / denominator;
-            }
-            return coeffs;
-        }
-
-        /** Return if a point with given coefficients belongs to PML
-         *
-         * @param coeffs values of coefficients
-         * @result boolean value if a point with given coefficients belongs
-         * to PML
-         */
-        DINLINE bool isInPML( Coefficients const coeffs )
-        {
-            /* Each damping component is < 1 when absorption is enabled
-             * along this direction and == 1 otherwise.
-             * So a product is 1 in the internal area and < 1 in PML
-             */
-            return coeffs.b.productOfComponents( ) != 1.0_X;
-        }
-
-    } // namespace detail
-
-    /** Functor to update the electric field by a time step
-     *
-     * @tparam T_numWorkers number of workers
-     * @tparam T_BlockDescription field (electric and magnetic) domain description
-     */
-    template<
-        uint32_t T_numWorkers,
-        typename T_BlockDescription
-    >
-    struct KernelUpdateE
-    {
-        /** Update the electric field by a time step
-         *
-         * @tparam T_Acc alpaka accelerator type
-         * @tparam T_Mapping mapper functor type
-         * @tparam T_Curl curl functor type
-         * @tparam T_BBox pmacc::DataBox, magnetic field box type
-         * @tparam T_EBox pmacc::DataBox, electric field box type
-         *
-         * @param acc alpaka accelerator
-         * @param mapper functor to map a block to a supercell
-         * @param parameters PML parameters for a local domain
-         * @param curl functor to calculate the electric field, interface must be
-         *             `operator( )( T_EBox )`
-         * @param fieldB magnetic field iterator
-         * @param fieldE electric field iterator
-         * @param fieldPsiE PML convolutional electric field iterator
-         */
-        template<
-            typename T_Acc,
-            typename T_Mapping,
-            typename T_Curl,
-            typename T_BBox,
-            typename T_EBox
-        >
-        DINLINE void operator( )(
-            T_Acc const & acc,
-            T_Mapping const mapper,
-            LocalParameters const parameters,
-            T_Curl const curl,
-            T_BBox const fieldB,
-            T_EBox fieldE,
-            FieldBox fieldPsiE
-        ) const
-        {
-            /* Each block processes grid values in a supercell,
-             * the index includes guards, same as all indices in this kernel
-             */
-            auto const blockBeginIdx = mapper.getSuperCellIndex(
-                DataSpace< simDim >( blockIdx )
-            ) * MappingDesc::SuperCellSize::toRT( );
-
-            // Cache B values for the block
-            using namespace mappings::threads;
-            constexpr auto numWorkers = T_numWorkers;
-            auto const workerIdx = threadIdx.x;
-            nvidia::functors::Assign assign;
-            auto fieldBBlock = fieldB.shift( blockBeginIdx );
-            ThreadCollective<
-                T_BlockDescription,
-                numWorkers
-            > collectiveCacheB( workerIdx );
-            auto cachedB = CachedBox::create<
-                0u,
-                typename T_BBox::ValueType
-            >(
-                acc,
-                T_BlockDescription( )
-            );
-            collectiveCacheB(
-                acc,
-                assign,
-                cachedB,
-                fieldBBlock
-            );
-            __syncthreads( );
+                    floatD_X const negativeBorderSize;
+                    floatD_X const positiveBorderSize;
+
+                    //! Local domain characteristics, including guard cells
+                    DataSpace<simDim> const numLocalDomainCells;
+                    DataSpace<simDim> const numGuardCells;
+
+                    LocalParameters(
+                        Parameters const parameters,
+                        Thickness const localThickness,
+                        DataSpace<simDim> const numLocalDomainCells,
+                        DataSpace<simDim> const numGuardCells)
+                        : Parameters(parameters)
+                        , negativeBorderSize(precisionCast<float_X>(localThickness.negativeBorder))
+                        , positiveBorderSize(precisionCast<float_X>(localThickness.positiveBorder))
+                        , numLocalDomainCells(numLocalDomainCells)
+                        , numGuardCells(numGuardCells)
+                    {
+                    }
+                };
 
-            // Threads process values of the supercell in parallel
-            constexpr auto numCellsPerSuperCell =
-                pmacc::math::CT::volume< SuperCellSize >::type::value;
-            ForEachIdx<
-                IdxConfig<
-                    numCellsPerSuperCell,
-                    numWorkers
-                >
-            >{ workerIdx }(
-                [&](
-                    uint32_t const linearIdx,
-                    uint32_t const
-                )
+                namespace detail
                 {
-                    constexpr auto c2 = SPEED_OF_LIGHT * SPEED_OF_LIGHT;
-                    constexpr auto dt = DELTA_T;
+                    /** Get relative depth of a given cell for 1D.
+                     *
+                     * This function operates with a 1D slice of domain and PML.
+                     * index == numGuardCells corresponds to the external negative PML
+                     * border, and index == numLocalDomainCells - numGuardCells - 1
+                     * corresponds to the external positive PML border.
+                     * For the internal area result is 0, for points in PML the depth
+                     * scales from 0 at the internal border to 1 at the external border.
+                     * Index and local domain size include the guard.
+                     *
+                     * @param cellIdx cell index including the guard, can be fractional,
+                     * e.g. for halves of cells
+                     * @param numPMLCellsNegative number of PML cells at the negative border
+                     * @param numPMLCellsPositive number of PML cells at the positive border
+                     * @param numLocalDomainCells number of cells of the local domain
+                     * including the guard
+                     * @param numGuardCells number of guard cells at each side
+                     * @return relative depth, value between 0 and 1
+                     */
+                    DINLINE float_X getRelativeDepth(
+                        float_X const cellIdx,
+                        float_X const numPMLCellsNegative,
+                        float_X const numPMLCellsPositive,
+                        uint32_t const numLocalDomainCells,
+                        uint32_t const numGuardCells)
+                    {
+                        auto zeroBasedIdx = cellIdx - numGuardCells;
+                        auto const isInLeftPML = (zeroBasedIdx < numPMLCellsNegative);
+                        if(isInLeftPML)
+                            return (numPMLCellsNegative - zeroBasedIdx) / numPMLCellsNegative;
+                        else
+                        {
+                            auto zeroBasedRightPMLStart
+                                = numLocalDomainCells - 2 * numGuardCells - numPMLCellsPositive;
+                            auto const isInRightPML = (zeroBasedIdx > zeroBasedRightPMLStart);
+                            if(isInRightPML)
+                                return (zeroBasedIdx - zeroBasedRightPMLStart) / numPMLCellsPositive;
+                        }
+                        return 0._X;
+                    }
 
-                    auto const idxInSuperCell =
-                        DataSpaceOperations< simDim >::template map< SuperCellSize >( linearIdx );
-                    // grid index to process with the current thread
-                    auto const idx = blockBeginIdx + idxInSuperCell;
-                    // with the current Yee grid, no shift needed here
-                    auto const pmlIdx = precisionCast< float_X >( idx );
-                    auto const coeffs = detail::getCoefficients(
-                        pmlIdx,
-                        parameters,
-                        dt
-                    );
+                    /** Get absorption parameters: sigma, kappa and alpha at a given cell
+                     *
+                     * Apply polynomial grading, as described in pml.param.
+                     *
+                     * @param cellIdx cell index including the guard, can be fractional,
+                     * e.g. for halves of cells
+                     * @param parameters parameters of PML in the local domain
+                     * @param[out] normalizedSigma value of normalized sigma at the cell
+                     * @param[out] kappa value of kappa at the cell
+                     * @param[out] normalizedAlpha value of normalized alpha at the cell
+                     */
+                    DINLINE void getAbsorptionParameters(
+                        floatD_X const cellIdx,
+                        LocalParameters const parameters,
+                        float3_X& normalizedSigma,
+                        float3_X& kappa,
+                        float3_X& normalizedAlpha)
+                    {
+                        // initialize with values for non-PML area
+                        normalizedSigma = float3_X::create(0._X);
+                        kappa = float3_X::create(1._X);
+                        normalizedAlpha = float3_X::create(0._X);
+                        for(uint32_t dim = 0u; dim < simDim; dim++)
+                        {
+                            auto const relativeDepth = getRelativeDepth(
+                                cellIdx[dim],
+                                parameters.negativeBorderSize[dim],
+                                parameters.positiveBorderSize[dim],
+                                parameters.numLocalDomainCells[dim],
+                                parameters.numGuardCells[dim]);
+                            // Since normally most points are not in PML, avoid costly
+                            // computing in this case
+                            if(relativeDepth != 0._X)
+                            {
+                                /* Grading done according to [Taflove, Hagness], eq. (7.60a, b).
+                                 * Note: here we use a general expression, it is possible
+                                 * to specialize for sigmaKappaGradingOrder = 2, 3, or 4,
+                                 * but currently seems not worth it.
+                                 */
+                                auto const sigmaKappaGradingCoeff
+                                    = math::pow(relativeDepth, parameters.sigmaKappaGradingOrder);
+                                normalizedSigma[dim] = parameters.normalizedSigmaMax[dim] * sigmaKappaGradingCoeff;
+                                kappa[dim] = 1._X + (parameters.kappaMax[dim] - 1._X) * sigmaKappaGradingCoeff;
+                                /* Grading done according to [Taflove, Hagness], eq. (7.79),
+                                 * note that this code is only correct when relativeDepth != 0
+                                 */
+                                auto const alphaGradingCoeff
+                                    = math::pow(1._X - relativeDepth, parameters.alphaGradingOrder);
+                                normalizedAlpha[dim] = parameters.normalizedAlphaMax[dim] * alphaGradingCoeff;
+                            }
+                        }
+                    }
 
-                    if( detail::isInPML( coeffs ) )
+                    //! Per-axis PML coefficients for E or B updates at a particular point, see getCoefficients()
+                    struct Coefficients
                     {
-                        /* This precomputation of partial derivatives is done
-                         * more for readability, rather than avoiding computing
-                         * it twice
-                         */
-                        using Difference = typename T_Curl::Difference;
-                        const typename Difference::template GetDifference< 0 > Dx;
-                        const typename Difference::template GetDifference< 1 > Dy;
-                        const typename Difference::template GetDifference< 2 > Dz;
-                        auto const localB = cachedB.shift( idxInSuperCell );
-                        auto const dBxDy = Dy( localB ).x( );
-                        auto const dBxDz = Dz( localB ).x( );
-                        auto const dByDx = Dx( localB ).y( );
-                        auto const dByDz = Dz( localB ).y( );
-                        auto const dBzDx = Dx( localB ).z( );
-                        auto const dBzDy = Dy( localB ).z( );
+                        //! Coordinate stretching coefficient, divides spatial derivatives in the field updates
+                        float3_X kappa;
+
+                        //! Damping coefficient of the convolutional fields, [Taflove, Hagness], eq. (7.102)
+                        float3_X b;
+
+                        //! Spatial difference coefficient of the convolutional fields, [Taflove, Hagness], eq. (7.99)
+                        float3_X c;
+                    };
+
+                    /** Get coefficients for E or B updates at a given cell
+                     *
+                     * Apply polynomial grading, as described in pml.param.
+                     * Due to normalizations, the same way of computing coefficients applies
+                     * to E and B updates.
+                     *
+                     * @param cellIdx cell index including the guard, can be fractional,
+                     * e.g. for halves of cells
+                     * @param parameters parameters of PML in the local domain
+                     * @param dt value of time step to propagate by
+                     * @result an instance of Coefficients with computed values, c stays 0 where its denominator is 0
+                     */
+                    DINLINE Coefficients
+                    getCoefficients(floatD_X const cellIdx, LocalParameters const parameters, float_X const dt)
+                    {
+                        Coefficients coeffs;
+                        float3_X normalizedSigma, normalizedAlpha;
+                        getAbsorptionParameters(cellIdx, parameters, normalizedSigma, coeffs.kappa, normalizedAlpha);
 
-                        /* Update convolutional fields using [Taflove, Hagness],
-                         * eq. (7.105a,b) and similar for other components.
-                         * For PIC the right-hand side uses B, not H.
+                        /* Damping b per [Taflove, Hagness], eq. (7.102); normalizedSigma and
+                         * normalizedAlpha are already divided by eps0
                          */
-                        auto & psiE = fieldPsiE( idx );
-                        psiE.yx = coeffs.b.x( ) * psiE.yx + coeffs.c.x( ) * dBzDx;
-                        psiE.zx = coeffs.b.x( ) * psiE.zx + coeffs.c.x( ) * dByDx;
-                        psiE.xy = coeffs.b.y( ) * psiE.xy + coeffs.c.y( ) * dBzDy;
-                        psiE.zy = coeffs.b.y( ) * psiE.zy + coeffs.c.y( ) * dBxDy;
-                        psiE.xz = coeffs.b.z( ) * psiE.xz + coeffs.c.z( ) * dByDz;
-                        psiE.yz = coeffs.b.z( ) * psiE.yz + coeffs.c.z( ) * dBxDz;
-
-                        /* [Taflove, Hagness], eq. (7.106) and similar for other
-                         * components. Coefficients Ca, Cb as given in (7.107a,b)
-                         * are general to account for materials, in addition to
-                         * artificial PML absorbing medium. We do not have any
-                         * real material, so in (7.107a,b) we have to use
-                         * sigma(i + 1/2, j, k) = 0 (it is another sigma,
-                         * unrelated to PML), eps(i + 1/2, j, k) = EPS0. Also,
-                         * same as the Yee scheme in PIC, adjusted to use B,
-                         * not H, on the right-hand side.
+                        coeffs.b = math::exp(-(normalizedSigma / coeffs.kappa + normalizedAlpha) * dt);
+                        /* Coefficient c per [Taflove, Hagness], eq. (7.99); in our case both the numerator
+                         * and the denominator are equally normalized
                          */
-                        fieldE( idx ).x( ) += c2 * dt * (dBzDy / coeffs.kappa.y( ) -
-                            dByDz / coeffs.kappa.z( ) + psiE.xy - psiE.xz );
-                        fieldE( idx ).y( ) += c2 * dt * (dBxDz / coeffs.kappa.z( ) -
-                            dBzDx / coeffs.kappa.x( ) + psiE.yz - psiE.yx );
-                        fieldE( idx ).z( ) += c2 * dt * (dByDx / coeffs.kappa.x( ) -
-                            dBxDy / coeffs.kappa.y( ) + psiE.zx - psiE.zy );
+                        coeffs.c = float3_X::create(0._X);
+                        for(uint32_t dim = 0u; dim < 3; dim++) // all 3 components of float3_X
+                        {
+                            auto const denominator = coeffs.kappa[dim]
+                                * (normalizedSigma[dim] + normalizedAlpha[dim] * coeffs.kappa[dim]);
+                            // Avoid the 0 / 0 uncertainty: for a zero denominator keep the initial value 0
+                            if(denominator)
+                                coeffs.c[dim] = normalizedSigma[dim] * (coeffs.b[dim] - 1.0_X) / denominator;
+                        }
+                        return coeffs;
                     }
-                    else
-                        // Normal Yee scheme update
-                        fieldE( idx ) += curl( cachedB.shift( idxInSuperCell ) ) * c2 * dt;
-                }
-            );
-        }
-    };
 
-    /** Functor to update the magnetic field by half a time step
-     *
-     * @tparam T_numWorkers number of workers
-     * @tparam T_BlockDescription field (electric and magnetic) domain description
-     */
-    template<
-        uint32_t T_numWorkers,
-        typename T_BlockDescription
-    >
-    struct KernelUpdateBHalf
-    {
-        /** Update the magnetic field by half a time step
-         *
-         * @tparam T_Acc alpaka accelerator type
-         * @tparam T_Mapping mapper functor type
-         * @tparam T_Curl curl functor type
-         * @tparam T_EBox pmacc::DataBox electric field box type
-         * @tparam T_BBox pmacc::DataBox magnetic field box type
-         *
-         * @param acc alpaka accelerator
-         * @param mapper functor to map a block to a supercell
-         * @param parameters PML parameters for a local domain
-         * @param curl functor to calculate the electric field, interface must be
-         *             `operator( )( T_EBox )`
-         * @param fieldE electric field iterator
-         * @param fieldB magnetic field iterator
-         * @param fieldPsiB PML convolutional magnetic field iterator
-         */
-        template<
-            typename T_Acc,
-            typename T_Mapping,
-            typename T_Curl,
-            typename T_EBox,
-            typename T_BBox
-        >
-        DINLINE void operator( )(
-            T_Acc const & acc,
-            T_Mapping const mapper,
-            LocalParameters const parameters,
-            T_Curl const curl,
-            T_EBox const fieldE,
-            T_BBox fieldB,
-            FieldBox fieldPsiB
-        ) const
-        {
-            /* Each block processes grid values in a supercell,
-             * the index includes guards, same as all indices in this kernel
-             */
-            auto const blockBeginIdx = mapper.getSuperCellIndex(
-                DataSpace< simDim >( blockIdx )
-            ) * MappingDesc::SuperCellSize::toRT( );
+                    /** Check whether a point with the given coefficients lies inside the PML
+                     *
+                     * @param coeffs coefficients computed for the point
+                     * @result true if absorption is active along at least one direction,
+                     * false for a point of the internal area
+                     */
+                    DINLINE bool isInPML(Coefficients const coeffs)
+                    {
+                        /* A damping component equals 1 along a direction without absorption
+                         * and is below 1 otherwise, so only the internal area has a product of 1
+                         */
+                        auto const dampingProduct = coeffs.b.productOfComponents();
+                        return !(dampingProduct == 1.0_X);
+                    }
 
-            // Cache E values for the block
-            using namespace mappings::threads;
-            constexpr auto numWorkers = T_numWorkers;
-            auto const workerIdx = threadIdx.x;
-            nvidia::functors::Assign assign;
-            auto fieldEBlock = fieldE.shift( blockBeginIdx );
-            ThreadCollective<
-                T_BlockDescription,
-                numWorkers
-            > collectiveCacheE( workerIdx );
-            auto cachedE = CachedBox::create<
-                0u,
-                typename T_EBox::ValueType
-            >(
-                acc,
-                T_BlockDescription( )
-            );
-            collectiveCacheE(
-                acc,
-                assign,
-                cachedE,
-                fieldEBlock
-            );
-            __syncthreads( );
+                } // namespace detail
 
-            // Threads process values of the supercell in parallel
-            constexpr auto numCellsPerSuperCell =
-                pmacc::math::CT::volume< SuperCellSize >::type::value;
-            ForEachIdx<
-                IdxConfig<
-                    numCellsPerSuperCell,
-                    numWorkers
-                >
-            >{ workerIdx }(
-                [&](
-                    uint32_t const linearIdx,
-                    uint32_t const
-                )
+                /** Functor to update the electric field by a time step
+                 *
+                 * @tparam T_numWorkers number of workers
+                 * @tparam T_BlockDescription field (electric and magnetic) domain description
+                 */
+                template<uint32_t T_numWorkers, typename T_BlockDescription>
+                struct KernelUpdateE
                 {
-                    constexpr auto halfDt = 0.5_X * DELTA_T;
-                    auto const idxInSuperCell =
-                        DataSpaceOperations< simDim >::template map< SuperCellSize >( linearIdx );
-                    // grid index to process with the current thread
-                    auto const idx = blockBeginIdx + idxInSuperCell;
-                    // with the current Yee grid, a half cell shift is needed here
-                    auto const pmlIdx = floatD_X::create( 0.5_X ) +
-                        precisionCast< float_X >( idx );
-                    auto const coeffs = detail::getCoefficients(
-                        pmlIdx,
-                        parameters,
-                        halfDt
-                    );
-
-                    if( detail::isInPML( coeffs ) )
+                    /** Update the electric field by a time step
+                     *
+                     * @tparam T_Acc alpaka accelerator type
+                     * @tparam T_Mapping mapper functor type
+                     * @tparam T_Curl curl functor type
+                     * @tparam T_BBox pmacc::DataBox, magnetic field box type
+                     * @tparam T_EBox pmacc::DataBox, electric field box type
+                     *
+                     * @param acc alpaka accelerator
+                     * @param mapper functor to map a block to a supercell
+                     * @param parameters PML parameters for a local domain
+                     * @param curl functor to calculate the curl of the magnetic field, must provide operator()
+                     *             and xDerivative(), yDerivative(), zDerivative(), all applied to the cached B box
+                     * @param fieldB magnetic field iterator
+                     * @param fieldE electric field iterator
+                     * @param fieldPsiE PML convolutional electric field iterator
+                     */
+                    template<typename T_Acc, typename T_Mapping, typename T_Curl, typename T_BBox, typename T_EBox>
+                    DINLINE void operator()(
+                        T_Acc const& acc,
+                        T_Mapping const mapper,
+                        LocalParameters const parameters,
+                        T_Curl const curl,
+                        T_BBox const fieldB,
+                        T_EBox fieldE,
+                        FieldBox fieldPsiE) const
                     {
-                        /* This precomputation of partial derivatives is done
-                        * more for readability, rather than avoiding computing
-                        * it twice
-                        */
-                        using Difference = typename T_Curl::Difference;
-                        const typename Difference::template GetDifference< 0 > Dx;
-                        const typename Difference::template GetDifference< 1 > Dy;
-                        const typename Difference::template GetDifference< 2 > Dz;
-                        auto const localE = cachedE.shift( idxInSuperCell );
-                        auto const dExDy = Dy( localE ).x( );
-                        auto const dExDz = Dz( localE ).x( );
-                        auto const dEyDx = Dx( localE ).y( );
-                        auto const dEyDz = Dz( localE ).y( );
-                        auto const dEzDx = Dx( localE ).z( );
-                        auto const dEzDy = Dy( localE ).z( );
-
-                        /* Update convolutional fields using [Taflove, Hagness],
-                         * eq. (7.110a,b) and similar for other components.
-                         * For PIC the left-hand side uses B, not H.
+                        /* Each block processes grid values in a supercell,
+                         * the index includes guards, same as all indices in this kernel
                          */
-                        auto & psiB = fieldPsiB( idx );
-                        psiB.yx = coeffs.b.x( ) * psiB.yx + coeffs.c.x( ) * dEzDx;
-                        psiB.zx = coeffs.b.x( ) * psiB.zx + coeffs.c.x( ) * dEyDx;
-                        psiB.xy = coeffs.b.y( ) * psiB.xy + coeffs.c.y( ) * dEzDy;
-                        psiB.zy = coeffs.b.y( ) * psiB.zy + coeffs.c.y( ) * dExDy;
-                        psiB.xz = coeffs.b.z( ) * psiB.xz + coeffs.c.z( ) * dEyDz;
-                        psiB.yz = coeffs.b.z( ) * psiB.yz + coeffs.c.z( ) * dExDz;
-
-                        /* [Taflove, Hagness], eq. (7.108) and similar for other
-                         * components. Coefficients Da, Db as given in (7.109a,b)
-                         * are general to account for materials, in addition to
-                         * artificial PML absorbing medium. We do not have any
-                         * real material, so in (7.109a,b) we have to use
-                         * sigma*(i + 1/2, j, k) = 0 (it is another sigma*,
-                         * unrelated to PML), mue(i + 1/2, j, k) = MUE0. Also,
-                         * same as the Yee scheme in PIC, adjusted to use B,
-                         * not H, on the left-hand side.
-                        */
-                        fieldB( idx ).x( ) += halfDt * ( dEyDz / coeffs.kappa.z( ) -
-                            dEzDy / coeffs.kappa.y( ) + psiB.xz - psiB.xy );
-                        fieldB( idx ).y( ) += halfDt * ( dEzDx / coeffs.kappa.x( ) -
-                            dExDz / coeffs.kappa.z( ) + psiB.yx - psiB.yz );
-                        fieldB( idx ).z( ) += halfDt * ( dExDy / coeffs.kappa.y( ) -
-                            dEyDx / coeffs.kappa.x( ) + psiB.zy - psiB.zx );
+                        auto const blockBeginIdx = mapper.getSuperCellIndex(DataSpace<simDim>(cupla::blockIdx(acc)))
+                            * MappingDesc::SuperCellSize::toRT();
+
+                        // Cache B values for the block
+                        using namespace mappings::threads;
+                        constexpr auto numWorkers = T_numWorkers;
+                        auto const workerIdx = cupla::threadIdx(acc).x;
+                        nvidia::functors::Assign assign;
+                        auto fieldBBlock = fieldB.shift(blockBeginIdx);
+                        ThreadCollective<T_BlockDescription, numWorkers> collectiveCacheB(workerIdx);
+                        auto cachedB = CachedBox::create<0u, typename T_BBox::ValueType>(acc, T_BlockDescription());
+                        collectiveCacheB(acc, assign, cachedB, fieldBBlock);
+                        cupla::__syncthreads(acc);
+
+                        // Threads process values of the supercell in parallel
+                        constexpr auto numCellsPerSuperCell = pmacc::math::CT::volume<SuperCellSize>::type::value;
+                        ForEachIdx<IdxConfig<numCellsPerSuperCell, numWorkers>>{
+                            workerIdx}([&](uint32_t const linearIdx, uint32_t const) {
+                            constexpr auto c2 = SPEED_OF_LIGHT * SPEED_OF_LIGHT;
+                            constexpr auto dt = DELTA_T;
+
+                            auto const idxInSuperCell
+                                = DataSpaceOperations<simDim>::template map<SuperCellSize>(linearIdx);
+                            // grid index to process with the current thread
+                            auto const idx = blockBeginIdx + idxInSuperCell;
+                            // with the current Yee grid, no shift needed here
+                            auto const pmlIdx = precisionCast<float_X>(idx);
+                            auto const coeffs = detail::getCoefficients(pmlIdx, parameters, dt);
+
+                            if(detail::isInPML(coeffs))
+                            {
+                                /* Update convolutional fields using [Taflove, Hagness],
+                                 * eq. (7.105a,b) and similar for other components.
+                                 * For PIC the right-hand side uses B, not H.
+                                 *
+                                 * Notation: dBdx = dB / dx, dBdx.y() = dBy / dx, etc.
+                                 */
+                                auto const localB = cachedB.shift(idxInSuperCell);
+                                auto const dBdx = curl.xDerivative(localB);
+                                auto const dBdy = curl.yDerivative(localB);
+                                auto const dBdz = curl.zDerivative(localB);
+                                auto& psiE = fieldPsiE(idx);
+                                psiE.yx = coeffs.b.x() * psiE.yx + coeffs.c.x() * dBdx.z();
+                                psiE.zx = coeffs.b.x() * psiE.zx + coeffs.c.x() * dBdx.y();
+                                psiE.xy = coeffs.b.y() * psiE.xy + coeffs.c.y() * dBdy.z();
+                                psiE.zy = coeffs.b.y() * psiE.zy + coeffs.c.y() * dBdy.x();
+                                psiE.xz = coeffs.b.z() * psiE.xz + coeffs.c.z() * dBdz.y();
+                                psiE.yz = coeffs.b.z() * psiE.yz + coeffs.c.z() * dBdz.x();
+
+                                /* [Taflove, Hagness], eq. (7.106) and similar for other
+                                 * components. Coefficients Ca, Cb as given in (7.107a,b)
+                                 * are general to account for materials, in addition to
+                                 * artificial PML absorbing medium. We do not have any
+                                 * real material, so in (7.107a,b) we have to use
+                                 * sigma(i + 1/2, j, k) = 0 (it is another sigma,
+                                 * unrelated to PML), eps(i + 1/2, j, k) = EPS0. Also,
+                                 * same as the Yee scheme in PIC, adjusted to use B,
+                                 * not H, on the right-hand side.
+                                 */
+                                fieldE(idx).x() += c2 * dt
+                                    * (dBdy.z() / coeffs.kappa.y() - dBdz.y() / coeffs.kappa.z() + psiE.xy - psiE.xz);
+                                fieldE(idx).y() += c2 * dt
+                                    * (dBdz.x() / coeffs.kappa.z() - dBdx.z() / coeffs.kappa.x() + psiE.yz - psiE.yx);
+                                fieldE(idx).z() += c2 * dt
+                                    * (dBdx.y() / coeffs.kappa.x() - dBdy.x() / coeffs.kappa.y() + psiE.zx - psiE.zy);
+                            }
+                            else
+                                // Outside of PML: normal Yee scheme update
+                                fieldE(idx) += curl(cachedB.shift(idxInSuperCell)) * c2 * dt;
+                        });
+                    }
+                };
+
+                /** Functor to update the magnetic field by half a time step
+                 *
+                 * @tparam T_numWorkers number of workers
+                 * @tparam T_BlockDescription field (electric and magnetic) domain description
+                 */
+                template<uint32_t T_numWorkers, typename T_BlockDescription>
+                struct KernelUpdateBHalf
+                {
+                    /** Update the magnetic field by half a time step
+                     *
+                     * @tparam T_Acc alpaka accelerator type
+                     * @tparam T_Mapping mapper functor type
+                     * @tparam T_Curl curl functor type
+                     * @tparam T_EBox pmacc::DataBox electric field box type
+                     * @tparam T_BBox pmacc::DataBox magnetic field box type
+                     *
+                     * @param acc alpaka accelerator
+                     * @param mapper functor to map a block to a supercell
+                     * @param parameters PML parameters for a local domain
+                     * @param curl functor to calculate the curl of the electric field, must provide operator()
+                     *             and xDerivative(), yDerivative(), zDerivative(), all applied to the cached E box
+                     * @param fieldE electric field iterator
+                     * @param updatePsiB whether convolutional magnetic fields need to be updated, or are up-to-date
+                     * @param fieldB magnetic field iterator
+                     * @param fieldPsiB PML convolutional magnetic field iterator
+                     */
+                    template<typename T_Acc, typename T_Mapping, typename T_Curl, typename T_EBox, typename T_BBox>
+                    DINLINE void operator()(
+                        T_Acc const& acc,
+                        T_Mapping const mapper,
+                        LocalParameters const parameters,
+                        T_Curl const curl,
+                        T_EBox const fieldE,
+                        bool const updatePsiB,
+                        T_BBox fieldB,
+                        FieldBox fieldPsiB) const
+                    {
+                        /* Each block processes grid values in a supercell,
+                         * the index includes guards, same as all indices in this kernel
+                         */
+                        auto const blockBeginIdx = mapper.getSuperCellIndex(DataSpace<simDim>(cupla::blockIdx(acc)))
+                            * MappingDesc::SuperCellSize::toRT();
+
+                        // Cache E values for the block
+                        using namespace mappings::threads;
+                        constexpr auto numWorkers = T_numWorkers;
+                        auto const workerIdx = cupla::threadIdx(acc).x;
+                        nvidia::functors::Assign assign;
+                        auto fieldEBlock = fieldE.shift(blockBeginIdx);
+                        ThreadCollective<T_BlockDescription, numWorkers> collectiveCacheE(workerIdx);
+                        auto cachedE = CachedBox::create<0u, typename T_EBox::ValueType>(acc, T_BlockDescription());
+                        collectiveCacheE(acc, assign, cachedE, fieldEBlock);
+                        cupla::__syncthreads(acc);
+
+                        // Threads process values of the supercell in parallel
+                        constexpr auto numCellsPerSuperCell = pmacc::math::CT::volume<SuperCellSize>::type::value;
+                        ForEachIdx<IdxConfig<numCellsPerSuperCell, numWorkers>>{
+                            workerIdx}([&](uint32_t const linearIdx, uint32_t const) {
+                            constexpr auto dt = DELTA_T;
+                            constexpr auto halfDt = 0.5_X * dt;
+                            auto const idxInSuperCell
+                                = DataSpaceOperations<simDim>::template map<SuperCellSize>(linearIdx);
+                            // grid index to process with the current thread
+                            auto const idx = blockBeginIdx + idxInSuperCell;
+                            // with the current Yee grid, a half cell shift is needed here
+                            auto const pmlIdx = floatD_X::create(0.5_X) + precisionCast<float_X>(idx);
+                            /* Note that convolutional fields are updated once per dt. So the coefficients are computed
+                             * for the full dt rather than halfDt; whether the update has to be performed is controlled
+                             * by a kernel caller with updatePsiB parameter.
+                             */
+                            auto const coeffs = detail::getCoefficients(pmlIdx, parameters, dt);
+
+                            if(detail::isInPML(coeffs))
+                            {
+                                /* Update convolutional fields using [Taflove, Hagness],
+                                 * eq. (7.110a,b) and similar for other components.
+                                 * For PIC the left-hand side uses B, not H.
+                                 *
+                                 * Notation: dEdx = dE / dx, dEdx.y() = dEy / dx, etc.
+                                 */
+                                auto const localE = cachedE.shift(idxInSuperCell);
+                                auto const dEdx = curl.xDerivative(localE);
+                                auto const dEdy = curl.yDerivative(localE);
+                                auto const dEdz = curl.zDerivative(localE);
+                                auto& psiB = fieldPsiB(idx);
+                                if(updatePsiB)
+                                {
+                                    psiB.yx = coeffs.b.x() * psiB.yx + coeffs.c.x() * dEdx.z();
+                                    psiB.zx = coeffs.b.x() * psiB.zx + coeffs.c.x() * dEdx.y();
+                                    psiB.xy = coeffs.b.y() * psiB.xy + coeffs.c.y() * dEdy.z();
+                                    psiB.zy = coeffs.b.y() * psiB.zy + coeffs.c.y() * dEdy.x();
+                                    psiB.xz = coeffs.b.z() * psiB.xz + coeffs.c.z() * dEdz.y();
+                                    psiB.yz = coeffs.b.z() * psiB.yz + coeffs.c.z() * dEdz.x();
+                                }
+
+                                /* [Taflove, Hagness], eq. (7.108) and similar for other
+                                 * components. Coefficients Da, Db as given in (7.109a,b)
+                                 * are general to account for materials, in addition to
+                                 * artificial PML absorbing medium. We do not have any
+                                 * real material, so in (7.109a,b) we have to use
+                                 * sigma*(i + 1/2, j, k) = 0 (it is another sigma*,
+                                 * unrelated to PML), mue(i + 1/2, j, k) = MUE0. Also,
+                                 * same as the Yee scheme in PIC, adjusted to use B,
+                                 * not H, on the left-hand side.
+                                 */
+                                fieldB(idx).x() += halfDt
+                                    * (dEdz.y() / coeffs.kappa.z() - dEdy.z() / coeffs.kappa.y() + psiB.xz - psiB.xy);
+                                fieldB(idx).y() += halfDt
+                                    * (dEdx.z() / coeffs.kappa.x() - dEdz.x() / coeffs.kappa.z() + psiB.yx - psiB.yz);
+                                fieldB(idx).z() += halfDt
+                                    * (dEdy.x() / coeffs.kappa.y() - dEdx.y() / coeffs.kappa.x() + psiB.zy - psiB.zx);
+                            }
+                            else
+                                // Outside of PML: normal Yee scheme update
+                                fieldB(idx) -= curl(cachedE.shift(idxInSuperCell)) * halfDt;
+                        });
                     }
-                    else
-                        // Normal Yee scheme update
-                        fieldB( idx ) -= curl( cachedE.shift( idxInSuperCell ) ) * halfDt;
-                }
-            );
-        }
-    };
+                };
 
-} // namespace yeePML
-} // namespace maxwellSolver
-} // namespace fields
+            } // namespace yeePML
+        } // namespace maxwellSolver
+    } // namespace fields
 } // namespace picongpu
diff --git a/include/picongpu/fields/absorber/Absorber.hpp b/include/picongpu/fields/absorber/Absorber.hpp
index 0495001a5a..09205338b3 100644
--- a/include/picongpu/fields/absorber/Absorber.hpp
+++ b/include/picongpu/fields/absorber/Absorber.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Rene Widera, Sergei Bastrakov
+/* Copyright 2013-2021 Axel Huebl, Rene Widera, Sergei Bastrakov, Klaus Steiniger
  *
  * This file is part of PIConGPU.
  *
@@ -31,208 +31,295 @@
 
 namespace picongpu
 {
-namespace fields
-{
-namespace maxwellSolver
-{
-
-    /** Forward declaration to avoid mutual including with YeePML.hpp
-     *
-     * @tparam T_CurrentInterpolation current interpolation functor
-     * @tparam T_CurlE functor to compute curl of E
-     * @tparam T_CurlB functor to compute curl of B
-     */
-    template<
-        typename T_CurrentInterpolation,
-        typename T_CurlE,
-        typename T_CurlB
-    >
-    class YeePML;
-
-} // namespace maxwellSolver
-
-namespace absorber
-{
-
-    //! Forward declaration to avoid mutual including with ExponentialDamping.hpp
-    class ExponentialDamping;
-
-namespace detail
-{
-
-    /** Get string properties of the absorber
-     *
-     * @param name absorber name
-     */
-    HINLINE pmacc::traits::StringProperty getStringProperties(
-        std::string const & name
-    );
-
-    /** Absorber wrapper
-     *
-     * Provides unified interface for the absorber information:
-     * size along the 6 boundaries and getStringProperties() implementation.
-     * Currently does not provide the computational part, only description.
-     *
-     * The general version uses exponential absorber settings since this is the
-     * default absorber.
-     *
-     * @tparam T_FieldSolver field solver
-     */
-    template< typename T_FieldSolver >
-    struct Absorber
+    namespace fields
     {
-        //! Number of absorber cells along the min x boundary
-        static constexpr uint32_t xNegativeNumCells = ABSORBER_CELLS[ 0 ][ 0 ];
-
-        //! Number of absorber cells along the max x boundary
-        static constexpr uint32_t xPositiveNumCells = ABSORBER_CELLS[ 0 ][ 1 ];
-
-        //! Number of absorber cells along the min y boundary
-        static constexpr uint32_t yNegativeNumCells = ABSORBER_CELLS[ 1 ][ 0 ];
-
-        //! Number of absorber cells along the max y boundary
-        static constexpr uint32_t yPositiveNumCells = ABSORBER_CELLS[ 1 ][ 1 ];
-
-        //! Number of absorber cells along the min z boundary
-        static constexpr uint32_t zNegativeNumCells = ABSORBER_CELLS[ 2 ][ 0 ];
-
-        //! Number of cells along the max z boundary
-        static constexpr uint32_t zPositiveNumCells = ABSORBER_CELLS[ 2 ][ 1 ];
-
-        //! Get string properties of the absorber
-        static pmacc::traits::StringProperty getStringProperties()
+        namespace maxwellSolver
         {
-            return detail::getStringProperties( "exponential damping" );
-        }
-    };
-
-    namespace pml = maxwellSolver::yeePML;
-
-    /** Absorber wrapper
-     *
-     * Specialization for PML
-     *
-     * @tparam T_CurrentInterpolation current interpolation for YeePML
-     * @tparam T_CurlE curl E for YeePML
-     * @tparam T_CurlB curl B for YeePML
-     */
-    template<
-        typename T_CurrentInterpolation,
-        typename T_CurlE,
-        typename T_CurlB
-    >
-    struct Absorber<
-        maxwellSolver::YeePML<
-            T_CurrentInterpolation,
-            T_CurlE,
-            T_CurlB
-        >
-    >
-    {
-        //! Number of absorber cells along the min x boundary
-        static constexpr uint32_t xNegativeNumCells = pml::NUM_CELLS[ 0 ][ 0 ];
-
-        //! Number of absorber cells along the max x boundary
-        static constexpr uint32_t xPositiveNumCells = pml::NUM_CELLS[ 0 ][ 1 ];
-
-        //! Number of absorber cells along the min y boundary
-        static constexpr uint32_t yNegativeNumCells = pml::NUM_CELLS[ 1 ][ 0 ];
-
-        //! Number of absorber cells along the max y boundary
-        static constexpr uint32_t yPositiveNumCells = pml::NUM_CELLS[ 1 ][ 1 ];
-
-        //! Number of absorber cells along the min z boundary
-        static constexpr uint32_t zNegativeNumCells = pml::NUM_CELLS[ 2 ][ 0 ];
-
-        //! Number of absorber cells along the max z boundary
-        static constexpr uint32_t zPositiveNumCells = pml::NUM_CELLS[ 2 ][ 1 ];
-
-        //! Get string properties of the absorber
-        static pmacc::traits::StringProperty getStringProperties()
+            /** Forward declaration to avoid mutual inclusion with YeePML.hpp
+             *
+             * @tparam T_CurrentInterpolation current interpolation functor
+             * @tparam T_CurlE functor to compute curl of E
+             * @tparam T_CurlB functor to compute curl of B
+             */
+            template<typename T_CurrentInterpolation, typename T_CurlE, typename T_CurlB>
+            class YeePML;
+
+        } // namespace maxwellSolver
+
+        namespace absorber
         {
-            return detail::getStringProperties( "convolutional PML" );
-        }
-
-    };
-
-} // namespace detail
-
-    /** Absorber description implementing getStringProperties()
-     *
-     * To be used for writing absorber meta information, does not provide
-     * interface for running the absorber
-     */
-    using Absorber = detail::Absorber< Solver >;
-
-    /** Number of absorber cells along each boundary
-     *
-     * Is uniform for both PML and exponential damping absorbers.
-     * First index: 0 = x, 1 = y, 2 = z.
-     * Second index: 0 = negative (min coordinate), 1 = positive (max coordinate).
-     * Not for ODR-use.
-     */
-    constexpr uint32_t numCells[ 3 ][ 2 ] = {
-        { Absorber::xNegativeNumCells, Absorber::xPositiveNumCells },
-        { Absorber::yNegativeNumCells, Absorber::yPositiveNumCells },
-        { Absorber::zNegativeNumCells, Absorber::zPositiveNumCells }
-    };
-
-namespace detail
-{
-
-    // Implementation has to be after numCells is defined
-    pmacc::traits::StringProperty getStringProperties( std::string const & name )
-    {
-        pmacc::traits::StringProperty propList;
-        const DataSpace<DIM3> periodic =
-            Environment<simDim>::get().EnvironmentController().getCommunicator().getPeriodic();
+            //! Forward declaration to avoid mutual inclusion with ExponentialDamping.hpp
+            class ExponentialDamping;
 
-        for( uint32_t i = 1; i < NumberOfExchanges<simDim>::value; ++i )
-        {
-            // for each planar direction: left right top bottom back front
-            if( FRONT % i == 0 )
+            namespace detail
             {
-                const std::string directionName = ExchangeTypeNames()[i];
-                const DataSpace<DIM3> relDir = Mask::getRelativeDirections<DIM3>(i);
-
-                bool isPeriodic = false;
-                uint32_t axis = 0;    // x(0) y(1) z(2)
-                uint32_t axisDir = 0; // negative (0), positive (1)
-                for( uint32_t d = 0; d < simDim; d++ )
+                /** Get string properties of the absorber
+                 *
+                 * @param name absorber name
+                 */
+                HINLINE pmacc::traits::StringProperty getStringProperties(std::string const& name);
+
+                /** Absorber wrapper
+                 *
+                 * Provides unified interface for the absorber information:
+                 * size along the 6 boundaries and getStringProperties() implementation.
+                 * Currently does not provide the computational part, only description.
+                 *
+                 * The general version uses exponential absorber settings since this is the
+                 * default absorber.
+                 *
+                 * @tparam T_FieldSolver field solver
+                 */
+                template<typename T_FieldSolver>
+                struct Absorber
                 {
-                    if( relDir[d] * periodic[d] != 0 )
-                        isPeriodic = true;
-                    if( relDir[d] != 0 )
-                        axis = d;
+                    //! Number of absorber cells along the min x boundary
+                    static constexpr uint32_t xNegativeNumCells = ABSORBER_CELLS[0][0];
+
+                    //! Number of absorber cells along the max x boundary
+                    static constexpr uint32_t xPositiveNumCells = ABSORBER_CELLS[0][1];
+
+                    //! Number of absorber cells along the min y boundary
+                    static constexpr uint32_t yNegativeNumCells = ABSORBER_CELLS[1][0];
+
+                    //! Number of absorber cells along the max y boundary
+                    static constexpr uint32_t yPositiveNumCells = ABSORBER_CELLS[1][1];
+
+                    //! Number of absorber cells along the min z boundary
+                    static constexpr uint32_t zNegativeNumCells = ABSORBER_CELLS[2][0];
+
+                    //! Number of absorber cells along the max z boundary
+                    static constexpr uint32_t zPositiveNumCells = ABSORBER_CELLS[2][1];
+
+                    //! Get string properties of the absorber
+                    static pmacc::traits::StringProperty getStringProperties()
+                    {
+                        return detail::getStringProperties("exponential damping");
+                    }
+                };
+
+                namespace pml = maxwellSolver::Pml;
+
+                /** Absorber wrapper
+                 *
+                 * Specialization for PML, works for both YeePML and LehePML
+                 *
+                 * @tparam T_CurrentInterpolation current interpolation for YeePML
+                 * @tparam T_CurlE curl E for YeePML
+                 * @tparam T_CurlB curl B for YeePML
+                 */
+                template<typename T_CurrentInterpolation, typename T_CurlE, typename T_CurlB>
+                struct Absorber<maxwellSolver::YeePML<T_CurrentInterpolation, T_CurlE, T_CurlB>>
+                {
+                    //! Number of absorber cells along the min x boundary (PML sizes come from pml::NUM_CELLS)
+                    static constexpr uint32_t xNegativeNumCells = pml::NUM_CELLS[0][0];
+
+                    //! Number of absorber cells along the max x boundary
+                    static constexpr uint32_t xPositiveNumCells = pml::NUM_CELLS[0][1];
+
+                    //! Number of absorber cells along the min y boundary
+                    static constexpr uint32_t yNegativeNumCells = pml::NUM_CELLS[1][0];
+
+                    //! Number of absorber cells along the max y boundary
+                    static constexpr uint32_t yPositiveNumCells = pml::NUM_CELLS[1][1];
+
+                    //! Number of absorber cells along the min z boundary
+                    static constexpr uint32_t zNegativeNumCells = pml::NUM_CELLS[2][0];
+
+                    //! Number of absorber cells along the max z boundary
+                    static constexpr uint32_t zPositiveNumCells = pml::NUM_CELLS[2][1];
+
+                    //! Get string properties of the absorber, reported under the name "convolutional PML"
+                    static pmacc::traits::StringProperty getStringProperties()
+                    {
+                        return detail::getStringProperties("convolutional PML");
+                    }
+                };
+
+            } // namespace detail
+
+            /** Absorber description implementing getStringProperties()
+             *
+             * To be used for writing absorber meta information, does not provide
+             * interface for running the absorber
+             */
+            using Absorber = detail::Absorber<Solver>;
+
+            /** Number of absorber cells along each boundary
+             *
+             * Stores the global absorber thickness in case the absorbing boundary
+             * conditions are used along each boundary. Note that in case of periodic
+             * boundaries the corresponding values will be ignored.
+             *
+             * Is uniform for both PML and exponential damping absorbers.
+             * First index: 0 = x, 1 = y, 2 = z.
+             * Second index: 0 = negative (min coordinate), 1 = positive (max coordinate).
+             * Not for ODR-use.
+             */
+            constexpr uint32_t numCells[3][2]
+                = {{Absorber::xNegativeNumCells, Absorber::xPositiveNumCells},
+                   {Absorber::yNegativeNumCells, Absorber::yPositiveNumCells},
+                   {Absorber::zNegativeNumCells, Absorber::zPositiveNumCells}};
+
+            //! Thickness of the absorbing layer: a cell count for each of the six boundaries
+            class Thickness
+            {
+            public:
+                //! Create a zero thickness: all six boundary values are set to 0
+                Thickness()
+                {
+                    for(uint32_t axis = 0u; axis < 3u; axis++)
+                        for(uint32_t direction = 0u; direction < 2u; direction++)
+                            (*this)(axis, direction) = 0u;
                 }
-                if( relDir[axis] > 0 )
-                    axisDir = 1;
 
-                std::string boundaryName = "open"; // absorbing boundary
-                if( isPeriodic )
-                    boundaryName = "periodic";
-
-                if( boundaryName == "open" )
+                /** Get thickness for the given boundary
+                 *
+                 * @param axis axis, 0 = x, 1 = y, 2 = z
+                 * @param direction direction, 0 = negative (min coordinate), 1 = positive (max coordinate)
+                 * @return number of cells stored for that boundary (indices are not range-checked)
+                 */
+                uint32_t operator()(uint32_t const axis, uint32_t const direction) const
                 {
-                    std::ostringstream boundaryParam;
-                    boundaryParam << name + " over "
-                        << numCells[axis][axisDir] << " cells";
-                    propList[directionName]["param"] = boundaryParam.str();
+                    return numCells[axis][direction];
                 }
-                else
+
+                /** Get reference to thickness for the given boundary
+                 *
+                 * @param axis axis, 0 = x, 1 = y, 2 = z
+                 * @param direction direction, 0 = negative (min coordinate), 1 = positive (max coordinate)
+                 * @return writable reference to the stored value (indices are not range-checked)
+                 */
+                uint32_t& operator()(uint32_t const axis, uint32_t const direction)
                 {
-                    propList[directionName]["param"] = "none";
+                    return numCells[axis][direction];
                 }
 
-                propList[directionName]["name"] = boundaryName;
+            private:
+                /** Number of absorber cells along each boundary
+                 *
+                 * First index: 0 = x, 1 = y, 2 = z.
+                 * Second index: 0 = negative (min coordinate), 1 = positive (max coordinate).
+                 */
+                uint32_t numCells[3][2]; // hides the namespace-scope numCells inside this class
+            };
+
+            /** Get absorber thickness in number of cells for the global domain
+             *
+             * This function takes into account which boundaries are periodic and
+             * absorbing.
+             */
+            inline Thickness getGlobalThickness()
+            {
+                // Query the periodicity of the three axes once, up front
+                const DataSpace<DIM3> periodicAxes
+                    = Environment<simDim>::get().EnvironmentController().getCommunicator().getPeriodic();
+                Thickness result;
+                for(uint32_t axis = 0u; axis < 3u; axis++)
+                {
+                    // An axis with periodic boundaries has no absorbing layer on either side;
+                    // otherwise both sides take the configured size from the namespace-scope numCells
+                    bool const hasAbsorber = !periodicAxes[axis];
+                    for(uint32_t direction = 0u; direction < 2u; direction++)
+                        result(axis, direction) = hasAbsorber ? numCells[axis][direction] : 0u;
+                }
+                return result;
+            }
+
+            /** Get absorber thickness in number of cells for the current local domain
+             *
+             * This function takes into account the current domain decomposition and
+             * which boundaries are periodic and absorbing.
+             *
+             * Note that unlike getGlobalThickness() result which does not change
+             * throughout the simulation, the local thickness can change. Thus,
+             * the result of this function should not be reused on another time step,
+             * but rather the function called again.
+             */
+            inline Thickness getLocalThickness()
+            {
+                // Begin with the global layout and clear every side that borders another local domain
+                Thickness localThickness = getGlobalThickness();
+                auto const exchangeEnd = NumberOfExchanges<simDim>::value;
+                auto const neighbourMask = Environment<simDim>::get().GridController().getCommunicationMask();
+
+                for(uint32_t ex = 1u; ex < exchangeEnd; ex++)
+                {
+                    /* Only the positive and negative directions along x, y, z
+                     * (left, right, top, bottom, back, front) matter here;
+                     * the "diagonal" exchanges fail this test and are skipped.
+                     */
+                    bool const isAxisAligned = (FRONT % ex == 0);
+
+                    // A set mask bit means a neighbouring local domain exists there,
+                    // so that side carries no absorber
+                    if(isAxisAligned && neighbourMask.isSet(ex))
+                    {
+                        // Decode the exchange into an axis (0 = x, 1 = y, 2 = z) ...
+                        bool const alongY = (ex >= BOTTOM && ex <= TOP);
+                        bool const alongZ = (ex >= BACK);
+                        uint32_t const axis = alongZ ? 2u : (alongY ? 1u : 0u);
+                        // ... and a direction (0 = negative, 1 = positive)
+                        uint32_t const direction = ex % 2;
+                        localThickness(axis, direction) = 0u;
+                    }
+                }
+                return localThickness;
             }
-        }
-        return propList;
-    }
 
-} // namespace detail
+            namespace detail
+            {
+                // Implementation has to be after numCells is defined
+                pmacc::traits::StringProperty getStringProperties(std::string const& name)
+                {
+                    const DataSpace<DIM3> periodicAxes
+                        = Environment<simDim>::get().EnvironmentController().getCommunicator().getPeriodic();
+                    pmacc::traits::StringProperty result;
+
+                    for(uint32_t exchange = 1; exchange < NumberOfExchanges<simDim>::value; ++exchange)
+                    {
+                        // describe planar directions only: left right top bottom back front
+                        if(FRONT % exchange != 0)
+                            continue;
+
+                        const std::string sideName = ExchangeTypeNames()[exchange];
+                        const DataSpace<DIM3> offset = Mask::getRelativeDirections<DIM3>(exchange);
+
+                        /* Find the axis the exchange points along (x = 0, y = 1, z = 2; the last
+                         * non-zero offset component wins) and whether that axis is periodic.
+                         */
+                        uint32_t axis = 0;
+                        bool onPeriodicAxis = false;
+                        for(uint32_t d = 0; d < simDim; d++)
+                        {
+                            if(offset[d] == 0)
+                                continue;
+                            axis = d;
+                            if(periodicAxes[d] != 0)
+                                onPeriodicAxis = true;
+                        }
+                        // side index into numCells: negative (0), positive (1)
+                        uint32_t const side = (offset[axis] > 0) ? 1 : 0;
+
+                        // "open" denotes an absorbing boundary
+                        std::string boundaryName = onPeriodicAxis ? "periodic" : "open";
+                        if(onPeriodicAxis)
+                        {
+                            result[sideName]["param"] = "none";
+                        }
+                        else
+                        {
+                            std::ostringstream boundaryParam;
+                            boundaryParam << name + " over " << numCells[axis][side] << " cells";
+                            result[sideName]["param"] = boundaryParam.str();
+                        }
+                        result[sideName]["name"] = boundaryName;
+                    }
+                    return result;
+                }
+
+            } // namespace detail
 
-} // namespace absorber
-} // namespace fields
+        } // namespace absorber
+    } // namespace fields
 } // namespace picongpu
diff --git a/include/picongpu/fields/absorber/ExponentialDamping.hpp b/include/picongpu/fields/absorber/ExponentialDamping.hpp
index 392a413408..8fb34eab3f 100644
--- a/include/picongpu/fields/absorber/ExponentialDamping.hpp
+++ b/include/picongpu/fields/absorber/ExponentialDamping.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Rene Widera
+/* Copyright 2013-2021 Axel Huebl, Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -34,81 +34,74 @@
 
 namespace picongpu
 {
-namespace fields
-{
-namespace absorber
-{
-
-class ExponentialDamping
-{
-public:
-
-    template<class BoxedMemory>
-    static void run(uint32_t currentStep, MappingDesc &cellDescription, BoxedMemory deviceBox)
+    namespace fields
     {
-        const uint32_t numSlides = MovingWindow::getInstance().getSlideCounter(currentStep);
-        for (uint32_t i = 1; i < NumberOfExchanges<simDim>::value; ++i)
+        namespace absorber
         {
-            /* only call for planes: left right top bottom back front*/
-            if (FRONT % i == 0 && !(Environment<simDim>::get().GridController().getCommunicationMask().isSet(i)))
+            class ExponentialDamping
             {
-                uint32_t direction = 0; /*set direction to X (default)*/
-                if (i >= BOTTOM && i <= TOP)
-                    direction = 1; /*set direction to Y*/
-                if (i >= BACK)
-                    direction = 2; /*set direction to Z*/
-
-                /* exchange mod 2 to find positive or negative direction
-                 * positive direction = 1
-                 * negative direction = 0
-                 */
-                uint32_t pos_or_neg = i % 2;
-
-                uint32_t thickness = absorber::numCells[direction][pos_or_neg];
-                float_X absorber_strength = ABSORBER_STRENGTH[direction][pos_or_neg];
-
-                if (thickness == 0) continue; /*if the absorber has no thickness we check the next side*/
-
-                /* allow to enable the absorber on the top side if the laser
-                 * initialization plane in y direction is *not* in cell zero
-                 */
-                if (fields::laserProfiles::Selected::initPlaneY == 0)
+            public:
+                template<class BoxedMemory>
+                static void run(uint32_t currentStep, MappingDesc& cellDescription, BoxedMemory deviceBox)
                 {
-                    /* disable the absorber on top side if
-                     *      no slide was performed and
-                     *      laser init time is not over
-                     */
-                    if (numSlides == 0 && ((currentStep * DELTA_T) <= fields::laserProfiles::Selected::INIT_TIME))
+                    const uint32_t numSlides = MovingWindow::getInstance().getSlideCounter(currentStep);
+                    for(uint32_t i = 1; i < NumberOfExchanges<simDim>::value; ++i)
                     {
-                        /* disable absorber on top side */
-                        if (i == TOP) continue;
+                        /* only call for planes: left right top bottom back front*/
+                        if(FRONT % i == 0
+                           && !(Environment<simDim>::get().GridController().getCommunicationMask().isSet(i)))
+                        {
+                            uint32_t direction = 0; /*set direction to X (default)*/
+                            if(i >= BOTTOM && i <= TOP)
+                                direction = 1; /*set direction to Y*/
+                            if(i >= BACK)
+                                direction = 2; /*set direction to Z*/
+
+                            /* exchange mod 2 to find positive or negative direction
+                             * positive direction = 1
+                             * negative direction = 0
+                             */
+                            uint32_t pos_or_neg = i % 2;
+
+                            uint32_t thickness = absorber::numCells[direction][pos_or_neg];
+                            float_X absorber_strength = ABSORBER_STRENGTH[direction][pos_or_neg];
+
+                            if(thickness == 0)
+                                continue; /*if the absorber has no thickness we check the next side*/
+
+                            /* allow to enable the absorber on the top side if the laser
+                             * initialization plane in y direction is *not* in cell zero
+                             */
+                            if(fields::laserProfiles::Selected::initPlaneY == 0)
+                            {
+                                /* disable the absorber on top side if
+                                 *      no slide was performed and
+                                 *      laser init time is not over
+                                 */
+                                if(numSlides == 0
+                                   && ((currentStep * DELTA_T) <= fields::laserProfiles::Selected::INIT_TIME))
+                                {
+                                    /* disable absorber on top side */
+                                    if(i == TOP)
+                                        continue;
+                                }
+                            }
+
+                            /* if sliding window is active we disable absorber on bottom side*/
+                            if(MovingWindow::getInstance().isSlidingWindowActive(currentStep) && i == BOTTOM)
+                                continue;
+
+                            ExchangeMapping<GUARD, MappingDesc> mapper(cellDescription, i);
+                            constexpr uint32_t numWorkers = pmacc::traits::GetNumWorkers<
+                                pmacc::math::CT::volume<SuperCellSize>::type::value>::value;
+
+                            PMACC_KERNEL(KernelAbsorbBorder<numWorkers>{})
+                            (mapper.getGridDim(), numWorkers)(deviceBox, thickness, absorber_strength, mapper);
+                        }
                     }
                 }
+            };
 
-                /* if sliding window is active we disable absorber on bottom side*/
-                if (MovingWindow::getInstance().isSlidingWindowActive(currentStep) && i == BOTTOM) continue;
-
-                ExchangeMapping<GUARD, MappingDesc> mapper(cellDescription, i);
-                constexpr uint32_t numWorkers = pmacc::traits::GetNumWorkers<
-                    pmacc::math::CT::volume< SuperCellSize >::type::value
-                >::value;
-
-                PMACC_KERNEL( KernelAbsorbBorder< numWorkers> {} )(
-                    mapper.getGridDim(),
-                    numWorkers
-                )(
-                    deviceBox,
-                    thickness,
-                    absorber_strength,
-                    mapper
-                );
-            }
-        }
-    }
-
-};
-
-} // namespace absorber
-} // namespace fields
+        } // namespace absorber
+    } // namespace fields
 } // namespace picongpu
-
diff --git a/include/picongpu/fields/absorber/ExponentialDamping.kernel b/include/picongpu/fields/absorber/ExponentialDamping.kernel
index dd282dc48f..b31978aa33 100644
--- a/include/picongpu/fields/absorber/ExponentialDamping.kernel
+++ b/include/picongpu/fields/absorber/ExponentialDamping.kernel
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Heiko Burau, Rene Widera, Richard Pausch
+/* Copyright 2013-2021 Axel Huebl, Heiko Burau, Rene Widera, Richard Pausch
  *
  * This file is part of PIConGPU.
  *
@@ -33,202 +33,153 @@
 
 namespace picongpu
 {
-namespace fields
-{
-namespace absorber
-{
-namespace detail
-{
-
-    /** damp each field component at exchange the border
-     *
-     * @tparam T_NumWorkers, boost::mpl::integral_c number of workers
-     * @tparam T_Axis, boost::mpl::integral_c axis of the coordinate system
-     *                 (0 = x, 1 = y, 2 = z)
-     */
-    template<
-        typename T_NumWorkers,
-        typename T_Axis
-    >
-    struct AbsorbInOneDirection
+    namespace fields
     {
-        /** absorb one direction
-         *
-         * The functor is only performed if `relExchangeDir[ T_Axis::value ] != 0`.
-         *
-         * @tparam T_BoxedMemory pmacc::DataBox, type of the field
-         * @tparam T_Mapping mapper functor type
-         *
-         * @param field field to manipulate
-         * @param thickness the thickness of the absorber area (in cells)
-         * @param absorberStrength strength of the absorber
-         * @param mapper functor to map a block to a supercell
-         * @param relExchangeDir relative direction for each dimension
-         *        (-1 = negative; +1 = positive direction; 0 = direction not selected)
-         */
-        template<
-            typename T_BoxedMemory,
-            typename T_Mapping,
-            typename T_Acc
-        >
-        DINLINE void operator()(
-            T_Acc const & acc,
-            T_BoxedMemory & field,
-            uint32_t const thickness,
-            float_X const absorberStrength,
-            T_Mapping & mapper,
-            DataSpace< simDim > const & relExchangeDir
-        ) const
+        namespace absorber
         {
-            using namespace mappings::threads;
-
-            constexpr int axis = T_Axis::value;
-
-            // return if axis is not selected
-            if( relExchangeDir[ axis ] == 0 )
-                return;
-
-            using SuperCellSize = typename T_Mapping::SuperCellSize;
-            DataSpace< simDim > const superCellIdx(
-                mapper.getSuperCellIndex( DataSpace< simDim >( blockIdx ) )
-            );
-
-            constexpr uint32_t numWorkers = T_NumWorkers::value;
-            constexpr uint32_t cellsPerSuperCell = pmacc::math::CT::volume< SuperCellSize >::type::value;
-
-            uint32_t const workerIdx = threadIdx.x;
-
-            auto const numGuardSuperCells = mapper.getGuardingSuperCells();
-            DataSpace< simDim > guardCells( numGuardSuperCells * SuperCellSize::toRT() );
-
-            // cell index of the supercell within the local domain (incl. the guards)
-            DataSpace< simDim > const localDomainCells = mapper.getGridSuperCells() * SuperCellSize::toRT();
-
-            using SuperCellDomCfg = IdxConfig<
-                cellsPerSuperCell,
-                numWorkers
-            >;
-
-            ForEachIdx<
-                SuperCellDomCfg
-            >{ workerIdx }(
-                [&](
-                    uint32_t const linearIdx,
-                    uint32_t const
-                )
+            namespace detail
+            {
+                /** damp each field component at the exchange border
+                 *
+                 * @tparam T_NumWorkers boost::mpl::integral_c, number of workers
+                 * @tparam T_Axis boost::mpl::integral_c, axis of the coordinate system
+                 *                (0 = x, 1 = y, 2 = z)
+                 */
+                template<typename T_NumWorkers, typename T_Axis>
+                struct AbsorbInOneDirection
                 {
-                    /* cell index within the superCell */
-                    DataSpace< simDim > const cellIdx = DataSpaceOperations< simDim >::
-                        template map< SuperCellSize >( linearIdx );
-
-                    DataSpace< simDim > cell( superCellIdx * SuperCellSize::toRT( ) + cellIdx);
-
-
-                    do
+                    /** absorb one direction
+                     *
+                     * The functor is only performed if `relExchangeDir[ T_Axis::value ] != 0`.
+                     *
+                     * @tparam T_BoxedMemory pmacc::DataBox, type of the field
+                     * @tparam T_Mapping mapper functor type
+                     *
+                     * @param field field to manipulate
+                     * @param thickness the thickness of the absorber area (in cells)
+                     * @param absorberStrength strength of the absorber
+                     * @param mapper functor to map a block to a supercell
+                     * @param relExchangeDir relative direction for each dimension
+                     *        (-1 = negative; +1 = positive direction; 0 = direction not selected)
+                     */
+                    template<typename T_BoxedMemory, typename T_Mapping, typename T_Acc>
+                    DINLINE void operator()(
+                        T_Acc const& acc,
+                        T_BoxedMemory& field,
+                        uint32_t const thickness,
+                        float_X const absorberStrength,
+                        T_Mapping& mapper,
+                        DataSpace<simDim> const& relExchangeDir) const
                     {
-                        cell[ axis ] += guardCells[ axis ] * -relExchangeDir[ axis ];
-                        int factor(0);
-
-                        if( relExchangeDir[ axis ] < 0 )
-                        {
-                            factor = guardCells[ axis ] - cell[ axis ] +
-                                thickness - 1;
-                        }
-                        else
-                        {
-                            factor = guardCells[ axis ] + cell[ axis ] -
-                                localDomainCells[ axis ] + thickness;
-                        }
-
-                        if( factor <= 0 )
-                        {
-                            break;
-                        }
-                        else
-                        {
-                            float_X const a = math::exp( -absorberStrength * float_X( factor ) );
-                            field( cell ) = field( cell ) * a;
-                        }
-                    } while( true );
-                }
-            );
-
-        }
-    };
+                        using namespace mappings::threads;
+
+                        constexpr int axis = T_Axis::value;
+
+                        // return if axis is not selected
+                        if(relExchangeDir[axis] == 0)
+                            return;
+
+                        using SuperCellSize = typename T_Mapping::SuperCellSize;
+                        DataSpace<simDim> const superCellIdx(
+                            mapper.getSuperCellIndex(DataSpace<simDim>(cupla::blockIdx(acc))));
+
+                        constexpr uint32_t numWorkers = T_NumWorkers::value;
+                        constexpr uint32_t cellsPerSuperCell = pmacc::math::CT::volume<SuperCellSize>::type::value;
+
+                        uint32_t const workerIdx = cupla::threadIdx(acc).x;
+
+                        auto const numGuardSuperCells = mapper.getGuardingSuperCells();
+                        DataSpace<simDim> guardCells(numGuardSuperCells * SuperCellSize::toRT());
+
+                        // number of cells in the local domain (incl. the guards)
+                        DataSpace<simDim> const localDomainCells = mapper.getGridSuperCells() * SuperCellSize::toRT();
+
+                        using SuperCellDomCfg = IdxConfig<cellsPerSuperCell, numWorkers>;
+
+                        ForEachIdx<SuperCellDomCfg>{workerIdx}([&](uint32_t const linearIdx, uint32_t const) {
+                            /* cell index within the superCell */
+                            DataSpace<simDim> const cellIdx
+                                = DataSpaceOperations<simDim>::template map<SuperCellSize>(linearIdx);
+
+                            DataSpace<simDim> cell(superCellIdx * SuperCellSize::toRT() + cellIdx);
+
+
+                            do
+                            {
+                                cell[axis] += guardCells[axis] * -relExchangeDir[axis];
+                                int factor(0);
+
+                                if(relExchangeDir[axis] < 0)
+                                {
+                                    factor = guardCells[axis] - cell[axis] + thickness - 1;
+                                }
+                                else
+                                {
+                                    factor = guardCells[axis] + cell[axis] - localDomainCells[axis] + thickness;
+                                }
+
+                                if(factor <= 0)
+                                {
+                                    break;
+                                }
+                                else
+                                {
+                                    float_X const a = math::exp(-absorberStrength * float_X(factor));
+                                    field(cell) = field(cell) * a;
+                                }
+                            } while(true);
+                        });
+                    }
+                };
+
+            } // namespace detail
+
+            /** damp each field's components at the outer cells of the global domain
+             *
+             * Done for one direction per call.
+             *
+             * @tparam T_numWorkers number of workers
+             */
+            template<uint32_t T_numWorkers>
+            struct KernelAbsorbBorder
+            {
+                /** damp a field at the border
+                 *
+                 * @tparam T_BoxedMemory pmacc::DataBox, type of the field
+                 * @tparam T_Mapping pmacc::ExchangeMapping, mapper functor type
+                 *
+                 * @param field field to manipulate
+                 * @param thickness the thickness of the absorber area (in cells)
+                 * @param absorberStrength strength of the absorber (positive, exponential damping constant)
+                 * @param mapper functor to map a block to a supercell,
+                 *               selects the direction of damping by the exchange type
+                 */
+                template<typename T_BoxedMemory, typename T_Mapping, typename T_Acc>
+                DINLINE void operator()(
+                    T_Acc const& acc,
+                    T_BoxedMemory field,
+                    uint32_t const thickness,
+                    float_X const absorberStrength,
+                    T_Mapping mapper) const
+                {
+                    DataSpace<simDim> const relExchangeDir
+                        = Mask::getRelativeDirections<simDim>(mapper.getExchangeType());
 
-} // namespace detail
+                    /* create a sequence with int values [0;simDim)
+                     * MakeSeq_t allows using the result of mpl::range_c
+                     * within the PMacc ForEach
+                     */
+                    using SimulationDimensions = MakeSeq_t<boost::mpl::range_c<int, 0, int(simDim)>>;
 
-    /** damp each field's components at the outer cells of the global domain
-     *
-     * Done for one direction per call.
-     *
-     * @tparam T_numWorkers number of workers
-     */
-    template< uint32_t T_numWorkers >
-    struct KernelAbsorbBorder
-    {
-        /** damp a field at the border
-         *
-         * @tparam T_BoxedMemory pmacc::DataBox, type of the field
-         * @tparam T_Mapping pmacc::ExchangeMapping, mapper functor type
-         *
-         * @param field filed to manipulate
-         * @param thickness the thickness of the absorber area (in cells)
-         * @param absorberStrength strength of the absorber (positive, exponential damping constant)
-         * @param mapper functor to map a block to a supercell,
-         *               selects the direction of damping by the exchange type
-         */
-        template<
-            typename T_BoxedMemory,
-            typename T_Mapping,
-            typename T_Acc
-        >
-        DINLINE void operator()(
-            T_Acc const & acc,
-            T_BoxedMemory field,
-            uint32_t const thickness,
-            float_X const absorberStrength,
-            T_Mapping mapper
-        ) const
-        {
+                    meta::ForEach<
+                        SimulationDimensions,
+                        detail::AbsorbInOneDirection<boost::mpl::integral_c<uint32_t, T_numWorkers>, boost::mpl::_1>>
+                        absorbInAllDirections;
 
-            DataSpace< simDim > const relExchangeDir =
-                Mask::getRelativeDirections< simDim >( mapper.getExchangeType( ) );
+                    absorbInAllDirections(acc, field, thickness, absorberStrength, mapper, relExchangeDir);
+                }
+            };
 
-            /* create a sequence with int values [0;simDim)
-             * MakeSeq_t allows to use the result of mpl::range_c
-             * within the PMacc ForEach
-             */
-            using SimulationDimensions = MakeSeq_t<
-                boost::mpl::range_c<
-                    int,
-                    0,
-                    int( simDim )
-                >
-            >;
-
-            meta::ForEach<
-                SimulationDimensions,
-                detail::AbsorbInOneDirection<
-                    boost::mpl::integral_c<
-                        uint32_t,
-                        T_numWorkers
-                    >,
-                    boost::mpl::_1
-                >
-            > absorbInAllDirections;
-
-            absorbInAllDirections(
-                acc,
-                field,
-                thickness,
-                absorberStrength,
-                mapper,
-                relExchangeDir
-            );
-        }
-    };
-
-} // namespace absorber
-} // namespace fields
+        } // namespace absorber
+    } // namespace fields
 } // namespace picongpu
diff --git a/include/picongpu/fields/background/cellwiseOperation.hpp b/include/picongpu/fields/background/cellwiseOperation.hpp
index 8186fc4f69..1ea3e10c95 100644
--- a/include/picongpu/fields/background/cellwiseOperation.hpp
+++ b/include/picongpu/fields/background/cellwiseOperation.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2014-2020 Axel Huebl, Rene Widera
+/* Copyright 2014-2021 Axel Huebl, Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -32,166 +32,128 @@
 
 namespace picongpu
 {
-namespace cellwiseOperation
-{
-
-    /** call a functor for each cell
-     *
-     * @tparam T_numWorkers number of workers
-     */
-    template< uint32_t T_numWorkers >
-    struct KernelCellwiseOperation
+    namespace cellwiseOperation
     {
-        /** Kernel that calls T_OpFunctor and T_ValFunctor on each cell of a field
-         *
-         * performed code for each cell:
-         * @code{.cpp}
-         * opFunctor( acc, field, valFunctor( totalCellIdx, currentStep ) );
-         * @endcode
+        /** call a functor for each cell
          *
-         * @tparam T_OpFunctor like assign, add, subtract, ...
-         * @tparam T_ValFunctor like "f(x,t)", "0.0", "readFromOtherField", ...
-         * @tparam T_FieldBox field type
-         * @tparam T_Mapping mapper which defines the working region
-         * @tparam T_Acc alpaka accelerator type
-         *
-         * @param acc alpaka accelerator
-         * @param[in,out] field field to manipulate
-         * @param opFunctor binary operator used with the old and functor value
-         *                  (collective functors are not supported)
-         * @param valFunctor functor to execute (collective functors are not supported)
-         * @param totalDomainOffset offset to the local domain relative to the origin of the global domain
-         * @param currentStep simulation time step
-         * @param mapper functor to map a block to a supercell
+         * @tparam T_numWorkers number of workers
          */
-        template<
-            typename T_OpFunctor,
-            typename T_ValFunctor,
-            typename T_FieldBox,
-            typename T_Mapping,
-            typename T_Acc
-        >
-        DINLINE void
-        operator()(
-            T_Acc const & acc,
-            T_FieldBox field,
-            T_OpFunctor opFunctor,
-            T_ValFunctor valFunctor,
-            DataSpace< simDim > const totalDomainOffset,
-            uint32_t const currentStep,
-            T_Mapping mapper
-        ) const
-        {
-            using namespace mappings::threads;
-            constexpr uint32_t cellsPerSupercell = pmacc::math::CT::volume< SuperCellSize >::type::value;
-            constexpr uint32_t numWorker = T_numWorkers;
-
-            uint32_t const workerIdx = threadIdx.x;
-
-            DataSpace< simDim > const block( mapper.getSuperCellIndex( DataSpace<simDim>( blockIdx ) ) );
-            DataSpace< simDim > const blockCell = block * SuperCellSize::toRT( );
-            DataSpace< simDim > const guardCells = mapper.getGuardingSuperCells( ) * SuperCellSize::toRT( );
-
-            ForEachIdx<
-                IdxConfig<
-                    cellsPerSupercell,
-                    numWorker
-                >
-            >{ workerIdx }(
-                [&](
-                    uint32_t const linearIdx,
-                    uint32_t const
-                )
-                {
-                    // cell index within the superCell
-                    DataSpace< simDim > const cellIdx = DataSpaceOperations< simDim >::
-                        template map< SuperCellSize >( linearIdx );
-
-                    opFunctor(
-                        acc,
-                        field( blockCell + cellIdx ),
-                        valFunctor(
-                            blockCell + cellIdx + totalDomainOffset - guardCells,
-                            currentStep
-                        )
-                    );
-                }
-            );
-        }
-    };
-
-    /** Call a functor on each cell of a field
-     *
-     * \tparam T_Area Where to compute on (CORE, BORDER, GUARD)
-     */
-    template< uint32_t T_Area >
-    class CellwiseOperation
-    {
-    private:
-
-        MappingDesc m_cellDescription;
-
-    public:
-        CellwiseOperation( MappingDesc const cellDescription ) :
-            m_cellDescription( cellDescription )
+        template<uint32_t T_numWorkers>
+        struct KernelCellwiseOperation
         {
-        }
-
-        /** Functor call to execute the op/valFunctor on a given field
+            /** Kernel that calls T_OpFunctor and T_ValFunctor on each cell of a field
+             *
+             * performed code for each cell:
+             * @code{.cpp}
+             * opFunctor( acc, field, valFunctor( totalCellIdx, currentStep ) );
+             * @endcode
+             *
+             * @tparam T_OpFunctor like assign, add, subtract, ...
+             * @tparam T_ValFunctor like "f(x,t)", "0.0", "readFromOtherField", ...
+             * @tparam T_FieldBox field type
+             * @tparam T_Mapping mapper which defines the working region
+             * @tparam T_Acc alpaka accelerator type
+             *
+             * @param acc alpaka accelerator
+             * @param[in,out] field field to manipulate
+             * @param opFunctor binary operator used with the old and functor value
+             *                  (collective functors are not supported)
+             * @param valFunctor functor to execute (collective functors are not supported)
+             * @param totalDomainOffset offset to the local domain relative to the origin of the global domain
+             * @param currentStep simulation time step
+             * @param mapper functor to map a block to a supercell
+             */
+            template<
+                typename T_OpFunctor,
+                typename T_ValFunctor,
+                typename T_FieldBox,
+                typename T_Mapping,
+                typename T_Acc>
+            DINLINE void operator()(
+                T_Acc const& acc,
+                T_FieldBox field,
+                T_OpFunctor opFunctor,
+                T_ValFunctor valFunctor,
+                DataSpace<simDim> const totalDomainOffset,
+                uint32_t const currentStep,
+                T_Mapping mapper) const
+            {
+                using namespace mappings::threads;
+                constexpr uint32_t cellsPerSupercell = pmacc::math::CT::volume<SuperCellSize>::type::value;
+                constexpr uint32_t numWorker = T_numWorkers;
+
+                uint32_t const workerIdx = cupla::threadIdx(acc).x;
+
+                DataSpace<simDim> const block(mapper.getSuperCellIndex(DataSpace<simDim>(cupla::blockIdx(acc))));
+                DataSpace<simDim> const blockCell = block * SuperCellSize::toRT();
+                DataSpace<simDim> const guardCells = mapper.getGuardingSuperCells() * SuperCellSize::toRT();
+
+                ForEachIdx<IdxConfig<cellsPerSupercell, numWorker>>{workerIdx}(
+                    [&](uint32_t const linearIdx, uint32_t const) {
+                        // cell index within the superCell
+                        DataSpace<simDim> const cellIdx
+                            = DataSpaceOperations<simDim>::template map<SuperCellSize>(linearIdx);
+
+                        opFunctor(
+                            acc,
+                            field(blockCell + cellIdx),
+                            valFunctor(blockCell + cellIdx + totalDomainOffset - guardCells, currentStep));
+                    });
+            }
+        };
+
+        /** Call a functor on each cell of a field
          *
-         * @tparam ValFunctor A Value-Producing functor for a given cell
-         *                    in time and space
-         * @tparam OpFunctor A manipulating functor like pmacc::nvidia::functors::add
+         * @tparam T_Area Where to compute on (CORE, BORDER, GUARD)
          */
-        template<
-            typename T_Field,
-            typename T_OpFunctor,
-            typename T_ValFunctor
-        >
-        void
-        operator()(
-            T_Field field,
-            T_OpFunctor opFunctor,
-            T_ValFunctor valFunctor,
-            uint32_t const currentStep,
-            const bool enabled = true
-        ) const
+        template<uint32_t T_Area>
+        class CellwiseOperation
         {
-            if( !enabled )
-                return;
-
-            SubGrid< simDim > const & subGrid = Environment< simDim >::get( ).SubGrid();
-            // offset to the local domain relative to the origin of the global domain
-            DataSpace< simDim > totalDomainOffset( subGrid.getLocalDomain( ).offset );
-            uint32_t const numSlides = MovingWindow::getInstance( ).getSlideCounter( currentStep );
-
-            /** Assumption: all GPUs have the same number of cells in
-             *              y direction for sliding window
+        private:
+            MappingDesc m_cellDescription;
+
+        public:
+            CellwiseOperation(MappingDesc const cellDescription) : m_cellDescription(cellDescription)
+            {
+            }
+
+            /** Functor call to execute the op/valFunctor on a given field
+             *
+             * @tparam T_ValFunctor A Value-Producing functor for a given cell
+             *                      in time and space
+             * @tparam T_OpFunctor A manipulating functor like pmacc::nvidia::functors::add
              */
-            totalDomainOffset.y( ) += numSlides * subGrid.getLocalDomain().size.y( );
-
-            constexpr uint32_t numWorkers = pmacc::traits::GetNumWorkers<
-                pmacc::math::CT::volume< SuperCellSize >::type::value
-            >::value;
-
-            AreaMapping<
-                T_Area,
-                MappingDesc
-            > mapper( m_cellDescription );
-
-            PMACC_KERNEL( KernelCellwiseOperation< numWorkers >{ })(
-                mapper.getGridDim( ),
-                numWorkers
-            )(
-                field->getDeviceDataBox( ),
-                opFunctor,
-                valFunctor,
-                totalDomainOffset,
-                currentStep,
-                mapper
-            );
-        }
-    };
-
-} // namespace cellwiseOperation
+            template<typename T_Field, typename T_OpFunctor, typename T_ValFunctor>
+            void operator()(
+                T_Field field,
+                T_OpFunctor opFunctor,
+                T_ValFunctor valFunctor,
+                uint32_t const currentStep,
+                const bool enabled = true) const
+            {
+                if(!enabled)
+                    return;
+
+                SubGrid<simDim> const& subGrid = Environment<simDim>::get().SubGrid();
+                // offset to the local domain relative to the origin of the global domain
+                DataSpace<simDim> totalDomainOffset(subGrid.getLocalDomain().offset);
+                uint32_t const numSlides = MovingWindow::getInstance().getSlideCounter(currentStep);
+
+                /** Assumption: all GPUs have the same number of cells in
+                 *              y direction for sliding window
+                 */
+                totalDomainOffset.y() += numSlides * subGrid.getLocalDomain().size.y();
+
+                constexpr uint32_t numWorkers
+                    = pmacc::traits::GetNumWorkers<pmacc::math::CT::volume<SuperCellSize>::type::value>::value;
+
+                AreaMapping<T_Area, MappingDesc> mapper(m_cellDescription);
+
+                PMACC_KERNEL(KernelCellwiseOperation<numWorkers>{})
+                (mapper.getGridDim(),
+                 numWorkers)(field->getDeviceDataBox(), opFunctor, valFunctor, totalDomainOffset, currentStep, mapper);
+            }
+        };
+
+    } // namespace cellwiseOperation
 } // namespace picongpu
diff --git a/include/picongpu/fields/background/templates/TWTS/BField.hpp b/include/picongpu/fields/background/templates/TWTS/BField.hpp
index 1fe49f322d..11d8fc943f 100644
--- a/include/picongpu/fields/background/templates/TWTS/BField.hpp
+++ b/include/picongpu/fields/background/templates/TWTS/BField.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2014-2020 Alexander Debus, Axel Huebl
+/* Copyright 2014-2021 Alexander Debus, Axel Huebl
  *
  * This file is part of PIConGPU.
  *
@@ -28,168 +28,159 @@
 
 namespace picongpu
 {
-/* Load pre-defined background field */
-namespace templates
-{
-/* Traveling-wave Thomson scattering laser pulse */
-namespace twts
-{
-
-class BField
-{
-public:
-    using float_T = float_X;
-
-    enum PolarizationType
+    /* Load pre-defined background field */
+    namespace templates
     {
-        /* The linear polarization of the TWTS laser is defined
-         * relative to the plane of the pulse front tilt (reference plane).
-         *
-         * Polarisation is normal to the reference plane.
-         * Use Ex-fields (and corresponding B-fields) in TWTS laser internal coordinate system.
-         */
-        LINEAR_X = 1u,
-        /* Polarization lies within the reference plane.
-         * Use Ey-fields (and corresponding B-fields) in TWTS laser internal coordinate system.
-         */
-        LINEAR_YZ = 2u,
-    };
-
-    /* Center of simulation volume in number of cells */
-    PMACC_ALIGN(halfSimSize,DataSpace<simDim>);
-    /* y-position of TWTS coordinate origin inside the simulation coordinates [meter]
-     * The other origin coordinates (x and z) default to globally centered values
-     * with respect to the simulation volume.
-     */
-    PMACC_ALIGN(focus_y_SI, const float_64);
-    /* Laser wavelength [meter] */
-    PMACC_ALIGN(wavelength_SI, const float_64);
-    /* TWTS laser pulse duration [second] */
-    PMACC_ALIGN(pulselength_SI, const float_64);
-    /* line focus height of TWTS pulse [meter] */
-    PMACC_ALIGN(w_x_SI, const float_64);
-    /* line focus width of TWTS pulse [meter] */
-    PMACC_ALIGN(w_y_SI, const float_64);
-    /* interaction angle between TWTS laser propagation vector and the y-axis [rad] */
-    PMACC_ALIGN(phi, const float_X);
-    /* Takes value 1.0 for phi > 0 and -1.0 for phi < 0. */
-    PMACC_ALIGN(phiPositive,float_X);
-    /* propagation speed of TWTS laser overlap
-       normalized to the speed of light. [Default: beta0 = 1.0] */
-    PMACC_ALIGN(beta_0, const float_X);
-    /* If auto_tdelay=FALSE, then a user defined delay is used. [second] */
-    PMACC_ALIGN(tdelay_user_SI, const float_64);
-    /* Make time step constant accessible to device. */
-    PMACC_ALIGN(dt, const float_64);
-    /* Make length normalization constant accessible to device. */
-    PMACC_ALIGN(unit_length, const float_64);
-    /* TWTS laser time delay */
-    PMACC_ALIGN(tdelay,float_64);
-    /* Should the TWTS laser time delay be chosen automatically, such that
-     * the laser gradually enters the simulation volume? [Default: TRUE]
-     */
-    PMACC_ALIGN(auto_tdelay, const bool);
-    /* Polarization of TWTS laser */
-    PMACC_ALIGN(pol, const PolarizationType);
-
-    /** Magnetic field of the TWTS laser
-     *
-     * \param focus_y_SI the distance to the laser focus in y-direction [m]
-     * \param wavelength_SI central wavelength [m]
-     * \param pulselength_SI sigma of std. gauss for intensity (E^2),
-     *  pulselength_SI = FWHM_of_Intensity / 2.35482 [seconds (sigma)]
-     * \param w_x beam waist: distance from the axis where the pulse electric field
-     *  decreases to its 1/e^2-th part at the focus position of the laser [m]
-     * \param w_y \see w_x
-     * \param phi interaction angle between TWTS laser propagation vector and
-     *  the y-axis [rad, default = 90.*(PI/180.)]
-     * \param beta_0 propagation speed of overlap normalized to
-     *  the speed of light [c, default = 1.0]
-     * \param tdelay_user manual time delay if auto_tdelay is false
-     * \param auto_tdelay calculate the time delay such that the TWTS pulse is not
-     *  inside the simulation volume at simulation start timestep = 0 [default = true]
-     * \param pol determines the TWTS laser polarization, which is either normal or parallel
-     *  to the laser pulse front tilt plane [ default= LINEAR_X , LINEAR_YZ ]
-     */
-    HINLINE
-    BField( const float_64 focus_y_SI,
-            const float_64 wavelength_SI,
-            const float_64 pulselength_SI,
-            const float_64 w_x_SI,
-            const float_64 w_y_SI,
-            const float_X phi               = 90.*(PI / 180.),
-            const float_X beta_0            = 1.0,
-            const float_64 tdelay_user_SI   = 0.0,
-            const bool auto_tdelay          = true,
-            const PolarizationType pol      = LINEAR_X );
-
-
-    /** Specify your background field B(r,t) here
-     *
-     * \param cellIdx The total cell id counted from the start at t=0
-     * \param currentStep The current time step */
-    HDINLINE float3_X
-    operator()( const DataSpace<simDim>& cellIdx,
-                const uint32_t currentStep ) const;
-
-    /** Calculate the By(r,t) field, when electric field vector (Ex,0,0)
-     *  is normal to the pulse-front-tilt plane (y,z)
-     *
-     * \param pos Spatial position of the target field.
-     * \param time Absolute time (SI, including all offsets and transformations)
-     *  for calculating the field */
-    HDINLINE float_T
-    calcTWTSBy( const float3_64& pos, const float_64 time ) const;
-
-    /** Calculate the Bz(r,t) field, when electric field vector (Ex,0,0)
-     *  is normal to the pulse-front-tilt plane (y,z)
-     *
-     * \param pos Spatial position of the target field.
-     * \param time Absolute time (SI, including all offsets and transformations)
-     *  for calculating the field */
-    HDINLINE float_T
-    calcTWTSBz_Ex( const float3_64& pos, const float_64 time ) const;
-
-    /** Calculate the By(r,t) field, when electric field vector (0,Ey,0)
-     *  lies within the pulse-front-tilt plane (y,z)
-     *
-     * \param pos Spatial position of the target field.
-     * \param time Absolute time (SI, including all offsets and transformations)
-     *  for calculating the field */
-    HDINLINE float_T
-    calcTWTSBx( const float3_64& pos, const float_64 time ) const;
-
-    /** Calculate the Bz(r,t) field here (electric field vector (0,Ey,0)
-     *  lies within the pulse-front-tilt plane (y,z)
-     *
-     * \param pos Spatial position of the target field.
-     * \param time Absolute time (SI, including all offsets and transformations)
-     *  for calculating the field */
-    HDINLINE float_T
-    calcTWTSBz_Ey( const float3_64& pos, const float_64 time ) const;
-
-    /** Calculate the B-field vector of the TWTS laser in SI units.
-     * \tparam T_dim Specializes for the simulation dimension
-     * \param cellIdx The total cell id counted from the start at timestep 0
-     * \return B-field vector of the rotated TWTS field in SI units */
-    template<unsigned T_dim>
-    HDINLINE float3_X
-    getTWTSBfield_Normalized(
-            const pmacc::math::Vector<floatD_64,detail::numComponents>& eFieldPositions_SI,
-            const float_64 time) const;
-
-    /** Calculate the B-field vector of the "in-plane" polarized TWTS laser in SI units.
-     * \tparam T_dim Specializes for the simulation dimension
-     * \param cellIdx The total cell id counted from the start at timestep 0
-     * \return B-field vector of the rotated TWTS field in SI units */
-    template<unsigned T_dim>
-    HDINLINE float3_X
-    getTWTSBfield_Normalized_Ey(
-            const pmacc::math::Vector<floatD_64,detail::numComponents>& eFieldPositions_SI,
-            const float_64 time) const;
-
-};
-
-} /* namespace twts */
-} /* namespace templates */
+        /* Traveling-wave Thomson scattering laser pulse */
+        namespace twts
+        {
+            class BField
+            {
+            public:
+                using float_T = float_X;
+
+                enum PolarizationType
+                {
+                    /* The linear polarization of the TWTS laser is defined
+                     * relative to the plane of the pulse front tilt (reference plane).
+                     *
+                     * Polarization is normal to the reference plane.
+                     * Use Ex-fields (and corresponding B-fields) in TWTS laser internal coordinate system.
+                     */
+                    LINEAR_X = 1u,
+                    /* Polarization lies within the reference plane.
+                     * Use Ey-fields (and corresponding B-fields) in TWTS laser internal coordinate system.
+                     */
+                    LINEAR_YZ = 2u,
+                };
+
+                /* Center of simulation volume in number of cells */
+                PMACC_ALIGN(halfSimSize, DataSpace<simDim>);
+                /* y-position of TWTS coordinate origin inside the simulation coordinates [meter]
+                 * The other origin coordinates (x and z) default to globally centered values
+                 * with respect to the simulation volume.
+                 */
+                PMACC_ALIGN(focus_y_SI, const float_64);
+                /* Laser wavelength [meter] */
+                PMACC_ALIGN(wavelength_SI, const float_64);
+                /* TWTS laser pulse duration [second] */
+                PMACC_ALIGN(pulselength_SI, const float_64);
+                /* line focus height of TWTS pulse [meter] */
+                PMACC_ALIGN(w_x_SI, const float_64);
+                /* line focus width of TWTS pulse [meter] */
+                PMACC_ALIGN(w_y_SI, const float_64);
+                /* interaction angle between TWTS laser propagation vector and the y-axis [rad] */
+                PMACC_ALIGN(phi, const float_X);
+                /* Takes value 1.0 for phi > 0 and -1.0 for phi < 0. */
+                PMACC_ALIGN(phiPositive, float_X);
+                /* propagation speed of TWTS laser overlap
+                   normalized to the speed of light. [Default: beta_0 = 1.0] */
+                PMACC_ALIGN(beta_0, const float_X);
+                /* If auto_tdelay=FALSE, then a user defined delay is used. [second] */
+                PMACC_ALIGN(tdelay_user_SI, const float_64);
+                /* Make time step constant accessible to device. */
+                PMACC_ALIGN(dt, const float_64);
+                /* Make length normalization constant accessible to device. */
+                PMACC_ALIGN(unit_length, const float_64);
+                /* TWTS laser time delay */
+                PMACC_ALIGN(tdelay, float_64);
+                /* Should the TWTS laser time delay be chosen automatically, such that
+                 * the laser gradually enters the simulation volume? [Default: TRUE]
+                 */
+                PMACC_ALIGN(auto_tdelay, const bool);
+                /* Polarization of TWTS laser */
+                PMACC_ALIGN(pol, const PolarizationType);
+
+                /** Magnetic field of the TWTS laser
+                 *
+                 * \param focus_y_SI the distance to the laser focus in y-direction [m]
+                 * \param wavelength_SI central wavelength [m]
+                 * \param pulselength_SI sigma of std. gauss for intensity (E^2),
+                 *  pulselength_SI = FWHM_of_Intensity / 2.35482 [seconds (sigma)]
+                 * \param w_x_SI beam waist: distance from the axis where the pulse electric field
+                 *  decreases to its 1/e^2-th part at the focus position of the laser [m]
+                 * \param w_y_SI \see w_x_SI
+                 * \param phi interaction angle between TWTS laser propagation vector and
+                 *  the y-axis [rad, default = 90.*(PI/180.)]
+                 * \param beta_0 propagation speed of overlap normalized to
+                 *  the speed of light [c, default = 1.0]
+                 * \param tdelay_user_SI manual time delay, used if auto_tdelay is false [s]
+                 * \param auto_tdelay calculate the time delay such that the TWTS pulse is not
+                 *  inside the simulation volume at simulation start timestep = 0 [default = true]
+                 * \param pol determines the TWTS laser polarization, which is either normal or parallel
+                 *  to the laser pulse front tilt plane [default = LINEAR_X, alternative: LINEAR_YZ]
+                 */
+                HINLINE
+                BField(
+                    const float_64 focus_y_SI,
+                    const float_64 wavelength_SI,
+                    const float_64 pulselength_SI,
+                    const float_64 w_x_SI,
+                    const float_64 w_y_SI,
+                    const float_X phi = 90. * (PI / 180.),
+                    const float_X beta_0 = 1.0,
+                    const float_64 tdelay_user_SI = 0.0,
+                    const bool auto_tdelay = true,
+                    const PolarizationType pol = LINEAR_X);
+
+
+                /** Evaluate the background field B(r,t) for one cell
+                 *
+                 * \param cellIdx The total cell id counted from the start at t=0
+                 * \param currentStep The current time step */
+                HDINLINE float3_X operator()(const DataSpace<simDim>& cellIdx, const uint32_t currentStep) const;
+
+                /** Calculate the By(r,t) field, when electric field vector (Ex,0,0)
+                 *  is normal to the pulse-front-tilt plane (y,z)
+                 *
+                 * \param pos Spatial position of the target field.
+                 * \param time Absolute time (SI, including all offsets and transformations)
+                 *  for calculating the field */
+                HDINLINE float_T calcTWTSBy(const float3_64& pos, const float_64 time) const;
+
+                /** Calculate the Bz(r,t) field, when electric field vector (Ex,0,0)
+                 *  is normal to the pulse-front-tilt plane (y,z)
+                 *
+                 * \param pos Spatial position of the target field.
+                 * \param time Absolute time (SI, including all offsets and transformations)
+                 *  for calculating the field */
+                HDINLINE float_T calcTWTSBz_Ex(const float3_64& pos, const float_64 time) const;
+
+                /** Calculate the Bx(r,t) field, when electric field vector (0,Ey,0)
+                 *  lies within the pulse-front-tilt plane (y,z)
+                 *
+                 * \param pos Spatial position of the target field.
+                 * \param time Absolute time (SI, including all offsets and transformations)
+                 *  for calculating the field */
+                HDINLINE float_T calcTWTSBx(const float3_64& pos, const float_64 time) const;
+
+                /** Calculate the Bz(r,t) field, when electric field vector (0,Ey,0)
+                 *  lies within the pulse-front-tilt plane (y,z)
+                 *
+                 * \param pos Spatial position of the target field.
+                 * \param time Absolute time (SI, including all offsets and transformations)
+                 *  for calculating the field */
+                HDINLINE float_T calcTWTSBz_Ey(const float3_64& pos, const float_64 time) const;
+
+                /** Calculate the B-field vector of the TWTS laser in SI units.
+                 * \tparam T_dim Specializes for the simulation dimension
+                 * \param eFieldPositions_SI positions (SI), one per field component \param time absolute time (SI)
+                 * \return B-field vector of the rotated TWTS field in SI units */
+                template<unsigned T_dim>
+                HDINLINE float3_X getTWTSBfield_Normalized(
+                    const pmacc::math::Vector<floatD_64, detail::numComponents>& eFieldPositions_SI,
+                    const float_64 time) const;
+
+                /** Calculate the B-field vector of the "in-plane" polarized TWTS laser in SI units.
+                 * \tparam T_dim Specializes for the simulation dimension
+                 * \param eFieldPositions_SI positions (SI), one per field component \param time absolute time (SI)
+                 * \return B-field vector of the rotated TWTS field in SI units */
+                template<unsigned T_dim>
+                HDINLINE float3_X getTWTSBfield_Normalized_Ey(
+                    const pmacc::math::Vector<floatD_64, detail::numComponents>& eFieldPositions_SI,
+                    const float_64 time) const;
+            };
+
+        } /* namespace twts */
+    } /* namespace templates */
 } /* namespace picongpu */
diff --git a/include/picongpu/fields/background/templates/TWTS/BField.tpp b/include/picongpu/fields/background/templates/TWTS/BField.tpp
index 89a29ba74a..b0dfdf918d 100644
--- a/include/picongpu/fields/background/templates/TWTS/BField.tpp
+++ b/include/picongpu/fields/background/templates/TWTS/BField.tpp
@@ -1,4 +1,4 @@
-/* Copyright 2014-2020 Alexander Debus, Axel Huebl
+/* Copyright 2014-2021 Alexander Debus, Axel Huebl
  *
  * This file is part of PIConGPU.
  *
@@ -37,695 +37,674 @@
 
 namespace picongpu
 {
-/** Load pre-defined background field */
-namespace templates
-{
-/** Traveling-wave Thomson scattering laser pulse */
-namespace twts
-{
-
-    HINLINE
-    BField::BField( const float_64 focus_y_SI,
-                    const float_64 wavelength_SI,
-                    const float_64 pulselength_SI,
-                    const float_64 w_x_SI,
-                    const float_64 w_y_SI,
-                    const float_X phi,
-                    const float_X beta_0,
-                    const float_64 tdelay_user_SI,
-                    const bool auto_tdelay,
-                    const PolarizationType pol ) :
-        focus_y_SI(focus_y_SI), wavelength_SI(wavelength_SI),
-        pulselength_SI(pulselength_SI), w_x_SI(w_x_SI),
-        w_y_SI(w_y_SI), phi(phi), beta_0(beta_0),
-        tdelay_user_SI(tdelay_user_SI), dt(SI::DELTA_T_SI),
-        unit_length(UNIT_LENGTH), auto_tdelay(auto_tdelay), pol(pol), phiPositive( float_X(1.0) )
-    {
-        /* Note: Enviroment-objects cannot be instantiated on CUDA GPU device. Since this is done
-         * on host (see fieldBackground.param), this is no problem.
-         */
-        const SubGrid<simDim>& subGrid = Environment<simDim>::get().SubGrid();
-        halfSimSize = subGrid.getGlobalDomain().size / 2;
-        tdelay = detail::getInitialTimeDelay_SI(auto_tdelay, tdelay_user_SI,
-                                                halfSimSize, pulselength_SI,
-                                                focus_y_SI, phi, beta_0);
-        if ( phi < float_X(0.0) ) phiPositive = float_X(-1.0);
-    }
-
-    template<>
-    HDINLINE float3_X
-    BField::getTWTSBfield_Normalized<DIM3>(
-            const pmacc::math::Vector<floatD_64,detail::numComponents>& bFieldPositions_SI,
-            const float_64 time) const
-    {
-        typedef pmacc::math::Vector<float3_64,detail::numComponents> PosVecVec;
-        PosVecVec pos(PosVecVec::create(
-                                           float3_64::create(0.0)
-                                       ));
-
-        for (uint32_t k = 0; k<detail::numComponents;++k) {
-            for (uint32_t i = 0; i<simDim;++i)
-                pos[k][i] = bFieldPositions_SI[k][i];
-        }
-
-        /* An example of intra-cell position offsets is the staggered Yee-grid.
-         *
-         * Calculate By-component with the intra-cell offset of a By-field
-         */
-        const float_64 By_By = calcTWTSBy(pos[1], time);
-        /* Calculate Bz-component the the intra-cell offset of a By-field */
-        const float_64 Bz_By = calcTWTSBz_Ex(pos[1], time);
-        /* Calculate By-component the the intra-cell offset of a Bz-field */
-        const float_64 By_Bz = calcTWTSBy(pos[2], time);
-        /* Calculate Bz-component the the intra-cell offset of a Bz-field */
-        const float_64 Bz_Bz = calcTWTSBz_Ex(pos[2], time);
-        /* Since we rotated all position vectors before calling calcTWTSBy and calcTWTSBz_Ex,
-         * we need to back-rotate the resulting B-field vector.
-         *
-         * RotationMatrix[-(PI/2+phi)].(By,Bz) for rotating back the field vectors.
-         */
-        const float_64 By_rot = -math::sin(+phi)*By_By+math::cos(+phi)*Bz_By;
-        const float_64 Bz_rot = -math::cos(+phi)*By_Bz-math::sin(+phi)*Bz_Bz;
-
-        /* Finally, the B-field normalized to the peak amplitude. */
-        return float3_X( float_X(0.0),
-                         float_X(By_rot),
-                         float_X(Bz_rot) );
-    }
-
-    template<>
-    HDINLINE float3_X
-    BField::getTWTSBfield_Normalized_Ey<DIM3>(
-            const pmacc::math::Vector<floatD_64,detail::numComponents>& bFieldPositions_SI,
-            const float_64 time) const
-    {
-        typedef pmacc::math::Vector<float3_64,detail::numComponents> PosVecVec;
-        PosVecVec pos(PosVecVec::create(
-                                           float3_64::create(0.0)
-                                       ));
-
-        for (uint32_t k = 0; k<detail::numComponents;++k) {
-            for (uint32_t i = 0; i<simDim;++i) pos[k][i] = bFieldPositions_SI[k][i];
-        }
-
-        /* Calculate Bz-component with the intra-cell offset of a By-field */
-        const float_64 Bz_By = calcTWTSBz_Ey(pos[1], time);
-        /* Calculate Bz-component with the intra-cell offset of a Bz-field */
-        const float_64 Bz_Bz = calcTWTSBz_Ey(pos[2], time);
-        /* Since we rotated all position vectors before calling calcTWTSBz_Ey,
-         * we need to back-rotate the resulting B-field vector.
-         *
-         * RotationMatrix[-(PI/2+phi)].(By,Bz) for rotating back the field-vectors.
-         */
-        const float_64 By_rot = +math::cos(+phi)*Bz_By;
-        const float_64 Bz_rot = -math::sin(+phi)*Bz_Bz;
-
-        /* Finally, the B-field normalized to the peak amplitude. */
-        return float3_X( float_X( calcTWTSBx(pos[0], time) ),
-                         float_X( By_rot ),
-                         float_X( Bz_rot ) );
-    }
-
-    template<>
-    HDINLINE float3_X
-    BField::getTWTSBfield_Normalized<DIM2>(
-            const pmacc::math::Vector<floatD_64,detail::numComponents>& bFieldPositions_SI,
-            const float_64 time) const
+    /** Load pre-defined background field */
+    namespace templates
     {
-        typedef pmacc::math::Vector<float3_64,detail::numComponents> PosVecVec;
-        PosVecVec pos(PosVecVec::create(
-                                           float3_64::create(0.0)
-                                       ));
-
-        for (uint32_t k = 0; k<detail::numComponents;++k) {
-            /* 2D (y,z) vectors are mapped on 3D (x,y,z) vectors. */
-            for (uint32_t i = 0; i<DIM2;++i)
-                pos[k][i+1] = bFieldPositions_SI[k][i];
-        }
-
-        /* General background comment for the rest of this function:
-         *
-         * Corresponding position vector for the field components in 2D simulations.
-         *  3D     3D vectors in 2D space (x, y)
-         *  x -->  z (Meaning: In 2D-sim, insert cell-coordinate x
-         *            into TWTS field function coordinate z.)
-         *  y -->  y
-         *  z --> -x (Since z=0 for 2D, we use the existing
-         *            3D TWTS-field-function and set x = -0)
-         *  The transformed 3D coordinates are used to calculate the field components.
-         *  Ex --> Ez (Meaning: Calculate Ex-component of existing 3D TWTS-field (calcTWTSEx) using
-         *             transformed position vectors to obtain the corresponding Ez-component in 2D.
-         *             Note: Swapping field component coordinates also alters the
-         *                   intra-cell position offset.)
-         *  By --> By
-         *  Bz --> -Bx (Yes, the sign is necessary.)
-         *
-         * An example of intra-cell position offsets is the staggered Yee-grid.
-         *
-         * This procedure is analogous to 3D case, but replace By --> By and Bz --> -Bx. Hence the
-         * grid cell offset for Bx has to be used instead of Bz. Mind the "-"-sign.
-         */
-
-        /* Calculate By-component with the intra-cell offset of a By-field */
-        const float_64 By_By =  calcTWTSBy(pos[1], time);
-        /* Calculate Bx-component with the intra-cell offset of a By-field */
-        const float_64 Bx_By = -calcTWTSBz_Ex(pos[1], time);
-        /* Calculate By-component with the intra-cell offset of a Bx-field */
-        const float_64 By_Bx =  calcTWTSBy(pos[0], time);
-        /* Calculate Bx-component with the intra-cell offset of a Bx-field */
-        const float_64 Bx_Bx = -calcTWTSBz_Ex(pos[0], time);
-        /* Since we rotated all position vectors before calling calcTWTSBy and calcTWTSBz_Ex, we
-         * need to back-rotate the resulting B-field vector. Now the rotation is done
-         * analogously in the (y,x)-plane. (Reverse of the position vector transformation.)
-         *
-         * RotationMatrix[-(PI / 2+phi)].(By,Bx) for rotating back the field vectors.
-         */
-        const float_64 By_rot = -math::sin(phi)*By_By+math::cos(phi)*Bx_By;
-        const float_64 Bx_rot = -math::cos(phi)*By_Bx-math::sin(phi)*Bx_Bx;
-
-        /* Finally, the B-field normalized to the peak amplitude. */
-        return float3_X( float_X(Bx_rot),
-                         float_X(By_rot),
-                         float_X(0.0) );
-    }
-
-    template<>
-    HDINLINE float3_X
-    BField::getTWTSBfield_Normalized_Ey<DIM2>(
-            const pmacc::math::Vector<floatD_64,detail::numComponents>& bFieldPositions_SI,
-            const float_64 time) const
-    {
-        typedef pmacc::math::Vector<float3_64,detail::numComponents> PosVecVec;
-        PosVecVec pos(PosVecVec::create(
-                                           float3_64::create(0.0)
-                                       ));
-
-        for (uint32_t k = 0; k<detail::numComponents;++k) {
-            /* The 2D output of getFieldPositions_SI only returns
-             * the y- and z-component of a 3D vector.
-             */
-            for (uint32_t i = 0; i<DIM2;++i) pos[k][i+1] = bFieldPositions_SI[k][i];
-        }
-
-        /* General background comment for the rest of this function:
-         *
-         * Corresponding position vector for the field components in 2D simulations.
-         *  3D     3D vectors in 2D space (x, y)
-         *  x -->  z (Meaning: In 2D-sim, insert cell-coordinate x
-         *            into TWTS field function coordinate z.)
-         *  y -->  y
-         *  z --> -x (Since z=0 for 2D, we use the existing
-         *            3D TWTS-field-function and set x = -0)
-         *  Ex --> Ez (Meaning: Calculate Ex-component of existing 3D TWTS-field to obtain
-         *             corresponding Ez-component in 2D.
-         *             Note: the intra-cell position offset due to the staggered grid for Ez.)
-         *  By --> By
-         *  Bz --> -Bx (Yes, the sign is necessary.)
-         *
-         * This procedure is analogous to 3D case, but replace By --> By and Bz --> -Bx. Hence the
-         * grid cell offset for Bx has to be used instead of Bz. Mind the -sign.
-         */
-
-        /* Calculate Bx-component with the intra-cell offset of a By-field */
-        const float_64 Bx_By = -calcTWTSBz_Ex(pos[1], time);
-        /* Calculate Bx-component with the intra-cell offset of a Bx-field */
-        const float_64 Bx_Bx = -calcTWTSBz_Ex(pos[0], time);
-
-        /* Since we rotated all position vectors before calling calcTWTSBz_Ex, we
-         * need to back-rotate the resulting B-field vector. Now the rotation is done
-         * analogously in the (y,x)-plane. (Reverse of the position vector transformation.)
-         *
-         * RotationMatrix[-(PI / 2+phi)].(By,Bx)
-         * for rotating back the field-vectors.
-         */
-        const float_64 By_rot = +math::cos(phi)*Bx_By;
-        const float_64 Bx_rot = -math::sin(phi)*Bx_Bx;
-
-        /* Finally, the B-field normalized to the peak amplitude. */
-        return float3_X( float_X( Bx_rot ),
-                         float_X( By_rot ),
-                         float_X( calcTWTSBx(pos[2], time) ) );
-    }
-
-    HDINLINE float3_X
-    BField::operator()( const DataSpace<simDim>& cellIdx,
-                            const uint32_t currentStep ) const
-    {
-        const float_64 time_SI = float_64(currentStep) * dt - tdelay;
-        const traits::FieldPosition<fields::CellType, FieldB> fieldPosB;
-
-        const pmacc::math::Vector<floatD_64,detail::numComponents> bFieldPositions_SI =
-              detail::getFieldPositions_SI(cellIdx, halfSimSize,
-                fieldPosB(), unit_length, focus_y_SI, phi);
-        /* Single TWTS-Pulse */
-        switch (pol)
+        /** Traveling-wave Thomson scattering laser pulse */
+        namespace twts
         {
-            case LINEAR_X :
-            return getTWTSBfield_Normalized<simDim>(bFieldPositions_SI, time_SI);
-
-            case LINEAR_YZ :
-            return getTWTSBfield_Normalized_Ey<simDim>(bFieldPositions_SI, time_SI);
-        }
-        return getTWTSBfield_Normalized<simDim>(bFieldPositions_SI, time_SI); // defensive default
-    }
-
-    /** Calculate the By(r,t) field here
-     *
-     * \param pos Spatial position of the target field.
-     * \param time Absolute time (SI, including all offsets and transformations)
-     *             for calculating the field */
-    HDINLINE BField::float_T
-    BField::calcTWTSBy( const float3_64& pos, const float_64 time ) const
-    {
-        using complex_T = pmacc::math::Complex< float_T >;
-        using complex_64 = pmacc::math::Complex< float_64 >;
-        /* Unit of speed */
-        const float_64 UNIT_SPEED = SI::SPEED_OF_LIGHT_SI;
-        /* Unit of time */
-        const float_64 UNIT_TIME = SI::DELTA_T_SI;
-        /* Unit of length */
-        const float_64 UNIT_LENGTH = UNIT_TIME*UNIT_SPEED;
-
-        /* Propagation speed of overlap normalized to the speed of light [Default: beta0=1.0] */
-        const float_T beta0 = float_T(beta_0);
-        /* If phi < 0 the formulas below are not directly applicable.
-         * Instead phi is taken positive, but the entire pulse rotated by 180 deg around the
-         * z-axis of the coordinate system in this function.
-         */
-        const float_T phiReal = float_T( math::abs(phi) );
-        const float_T alphaTilt = math::atan2(float_T(1.0)-beta0*math::cos(phiReal),
-                                                beta0*math::sin(phiReal));
-        /* Definition of the laser pulse front tilt angle for the laser field below.
-         *
-         * For beta0=1.0, this is equivalent to our standard definition. Question: Why is the
-         * local "phi_T" not equal in value to the object member "phiReal" or "phi"?
-         * Because the standard TWTS pulse is defined for beta0 = 1.0 and in the coordinate-system
-         * of the TWTS model phi is responsible for pulse front tilt and dispersion only. Hence
-         * the dispersion will (although physically correct) be slightly off the ideal TWTS
-         * pulse for beta0 != 1.0. This only shows that this TWTS pulse is primarily designed for
-         * scenarios close to beta0 = 1.
-         */
-        const float_T phiT = float_T(2.0)*alphaTilt;
-
-        /* Angle between the laser pulse front and the y-axis. Not used, but remains in code for
-         * documentation purposes.
-         * const float_T eta = float_T(PI/2) - (phiReal - alphaTilt);
-         */
-
-        const float_T cspeed = float_T( SI::SPEED_OF_LIGHT_SI / UNIT_SPEED );
-        const float_T lambda0 = float_T(wavelength_SI / UNIT_LENGTH);
-        const float_T om0 = float_T(2.0*PI*cspeed / lambda0);
-        /* factor 2  in tauG arises from definition convention in laser formula */
-        const float_T tauG = float_T(pulselength_SI*2.0 / UNIT_TIME);
-        /* w0 is wx here --> w0 could be replaced by wx */
-        const float_T w0 = float_T(w_x_SI / UNIT_LENGTH);
-        const float_T rho0 = float_T(PI*w0*w0 / lambda0);
-        /* wy is width of TWTS pulse */
-        const float_T wy = float_T(w_y_SI / UNIT_LENGTH);
-        const float_T k = float_T(2.0*PI / lambda0);
-        /* If phi < 0 the entire pulse is rotated by 180 deg around the
-         * z-axis of the coordinate system without also changing
-         * the orientation of the resulting field vectors.
-         */
-        const float_T x = float_T(phiPositive * pos.x() / UNIT_LENGTH);
-        const float_T y = float_T(phiPositive * pos.y() / UNIT_LENGTH);
-        const float_T z = float_T(pos.z() / UNIT_LENGTH);
-        const float_T t = float_T(time / UNIT_TIME);
-
-        /* Shortcuts for speeding up the field calculation. */
-        const float_T sinPhi = math::sin(phiT);
-        const float_T cosPhi = math::cos(phiT);
-        const float_T cosPhi2 = math::cos(phiT / 2.0);
-        const float_T tanPhi2 = math::tan(phiT / 2.0);
-
-        /* The "helpVar" variables decrease the nesting level of the evaluated expressions and
-         * thus help with formal code verification through manual code inspection.
-         */
-        const complex_T helpVar1 = rho0 + complex_T(0,1)*y*cosPhi + complex_T(0,1)*z*sinPhi;
-        const complex_T helpVar2 = cspeed*om0*tauG*tauG + complex_T(0,2)
-                                    *(-z - y*math::tan(float_T(PI / 2)-phiT))*tanPhi2*tanPhi2;
-        const complex_T helpVar3 = complex_T(0,1)*rho0 - y*cosPhi - z*sinPhi;
-
-        const complex_T helpVar4 = float_T(-1.0)*(
-            cspeed*cspeed*k*om0*tauG*tauG*wy*wy*x*x
-            + float_T(2.0)*cspeed*cspeed*om0*t*t*wy*wy*rho0
-            - complex_T(0,2)*cspeed*cspeed*om0*om0*t*tauG*tauG*wy*wy*rho0
-            + float_T(2.0)*cspeed*cspeed*om0*tauG*tauG*y*y*rho0
-            - float_T(4.0)*cspeed*om0*t*wy*wy*z*rho0
-            + complex_T(0,2)*cspeed*om0*om0*tauG*tauG*wy*wy*z*rho0
-            + float_T(2.0)*om0*wy*wy*z*z*rho0
-            + float_T(4.0)*cspeed*om0*t*wy*wy*y*rho0*tanPhi2
-            - float_T(4.0)*om0*wy*wy*y*z*rho0*tanPhi2
-            - complex_T(0,2)*cspeed*k*wy*wy*x*x*z*tanPhi2*tanPhi2
-            + float_T(2.0)*om0*wy*wy*y*y*rho0*tanPhi2*tanPhi2
-            - float_T(4.0)*cspeed*om0*t*wy*wy*z*rho0*tanPhi2*tanPhi2
-            - complex_T(0,4)*cspeed*y*y*z*rho0*tanPhi2*tanPhi2
-            + float_T(4.0)*om0*wy*wy*z*z*rho0*tanPhi2*tanPhi2
-            - complex_T(0,2)*cspeed*k*wy*wy*x*x*y*math::tan(float_T(PI / 2)-phiT)*tanPhi2*tanPhi2
-            - float_T(4.0)*cspeed*om0*t*wy*wy*y*rho0*math::tan(float_T(PI / 2)-phiT)
-                *tanPhi2*tanPhi2
-            - complex_T(0,4)*cspeed*y*y*y*rho0*math::tan(float_T(PI / 2)-phiT)*tanPhi2*tanPhi2
-            + float_T(4.0)*om0*wy*wy*y*z*rho0*math::tan(float_T(PI / 2)-phiT)*tanPhi2*tanPhi2
-            + float_T(2.0)*z*sinPhi*(
-                + om0*(
-                    + cspeed*cspeed*(
-                          complex_T(0,1)*t*t*wy*wy
-                        + om0*t*tauG*tauG*wy*wy
-                        + complex_T(0,1)*tauG*tauG*y*y
-                    )
-                    - cspeed*(complex_T(0,2)*t + om0*tauG*tauG)*wy*wy*z
-                    + complex_T(0,1)*wy*wy*z*z
-                    )
-                + complex_T(0,2)*om0*wy*wy*y*(cspeed*t - z)*tanPhi2
-                + complex_T(0,1)*tanPhi2*tanPhi2*(
-                      complex_T(0,-2)*cspeed*y*y*z
-                    + om0*wy*wy*( y*y - float_T(2.0)*(cspeed*t - z)*z )
-                )
-            )
-            + float_T(2.0)*y*cosPhi*(
-                + om0*(
-                    + cspeed*cspeed*(
-                          complex_T(0,1)*t*t*wy*wy
-                        + om0*t*tauG*tauG*wy*wy
-                        + complex_T(0,1)*tauG*tauG*y*y
-                    )
-                - cspeed*(complex_T(0,2)*t + om0*tauG*tauG)*wy*wy*z
-                + complex_T(0,1)*wy*wy*z*z
-                )
-            + complex_T(0,2)*om0*wy*wy*y*(cspeed*t - z)*tanPhi2
-            + complex_T(0,1)*(
-                  complex_T(0,-4)*cspeed*y*y*z
-                + om0*wy*wy*(y*y - float_T(4.0)*(cspeed*t - z)*z)
-                - float_T(2.0)*y*(
-                    + cspeed*om0*t*wy*wy
-                    + complex_T(0,1)*cspeed*y*y
-                    - om0*wy*wy*z
-                    )*math::tan(float_T(PI / 2)-phiT)
-                )*tanPhi2*tanPhi2
-            )
-        /* The "round-trip" conversion in the line below fixes a gross accuracy bug
-         * in floating-point arithmetics, when float_T is set to float_X.
-         */
-        ) * complex_T( float_64(1.0) / complex_64(float_T(2.0)*cspeed*wy*wy*helpVar1*helpVar2) );
-
-        const complex_T helpVar5 = complex_T(0,-1)*cspeed*om0*tauG*tauG
-                                + (-z - y*math::tan(float_T(PI / 2)-phiT))
-                                    *tanPhi2*tanPhi2*float_T(2.0);
-        const complex_T helpVar6 = (cspeed*(cspeed*om0*tauG*tauG + complex_T(0,2)
-                                *(-z - y*math::tan(float_T(PI / 2)-phiT))*tanPhi2*tanPhi2))
-                                    / (om0*rho0);
-        const complex_T result = (math::exp(helpVar4)*tauG / cosPhi2 / cosPhi2
-            *(rho0 + complex_T(0,1)*y*cosPhi + complex_T(0,1)*z*sinPhi)
-            *(
-                  complex_T(0,2)*cspeed*t + cspeed*om0*tauG*tauG - complex_T(0,4)*z
-                + cspeed*(complex_T(0,2)*t + om0*tauG*tauG)*cosPhi
-                + complex_T(0,2)*y*tanPhi2
-            )*math::pow(helpVar3,float_T(-1.5))
-        ) / (float_T(2.0)*helpVar5*math::sqrt(helpVar6));
-
-        return result.get_real() / UNIT_SPEED;
-    }
-
-    /** Calculate the Bz(r,t) field
-     *
-     * \param pos Spatial position of the target field.
-     * \param time Absolute time (SI, including all offsets and transformations)
-     *             for calculating the field */
-    HDINLINE BField::float_T
-    BField::calcTWTSBz_Ex( const float3_64& pos, const float_64 time ) const
-    {
-        using complex_T = pmacc::math::Complex< float_T >;
-        /** Unit of Speed */
-        const float_64 UNIT_SPEED = SI::SPEED_OF_LIGHT_SI;
-        /** Unit of time */
-        const float_64 UNIT_TIME = SI::DELTA_T_SI;
-        /** Unit of length */
-        const float_64 UNIT_LENGTH = UNIT_TIME*UNIT_SPEED;
-
-        /* propagation speed of overlap normalized to the speed of light [Default: beta0=1.0] */
-        const float_T beta0 = float_T(beta_0);
-        /* If phi < 0 the formulas below are not directly applicable.
-         * Instead phi is taken positive, but the entire pulse rotated by 180 deg around the
-         * z-axis of the coordinate system in this function.
-         */
-        const float_T phiReal = float_T( math::abs(phi) );
-        const float_T alphaTilt = math::atan2(float_T(1.0)-beta0*math::cos(phiReal),
-                                                beta0*math::sin(phiReal));
-
-        /* Definition of the laser pulse front tilt angle for the laser field below.
-         *
-         * For beta0=1.0, this is equivalent to our standard definition. Question: Why is the
-         * local "phi_T" not equal in value to the object member "phiReal" or "phi"?
-         * Because the standard TWTS pulse is defined for beta0 = 1.0 and in the coordinate-system
-         * of the TWTS model phi is responsible for pulse front tilt and dispersion only. Hence
-         * the dispersion will (although physically correct) be slightly off the ideal TWTS
-         * pulse for beta0 != 1.0. This only shows that this TWTS pulse is primarily designed for
-         * scenarios close to beta0 = 1.
-         */
-        const float_T phiT = float_T(2.0)*alphaTilt;
-
-        /* Angle between the laser pulse front and the y-axis.
-         * Not used, but remains in code for documentation purposes.
-         * const float_T eta = float_T(float_T(PI / 2)) - (phiReal - alphaTilt);
-         */
-
-        const float_T cspeed = float_T( SI::SPEED_OF_LIGHT_SI / UNIT_SPEED );
-        const float_T lambda0 = float_T(wavelength_SI / UNIT_LENGTH);
-        const float_T om0 = float_T(2.0*PI*cspeed / lambda0);
-        /* factor 2  in tauG arises from definition convention in laser formula */
-        const float_T tauG = float_T(pulselength_SI*2.0 / UNIT_TIME);
-        /* w0 is wx here --> w0 could be replaced by wx */
-        const float_T w0 = float_T(w_x_SI / UNIT_LENGTH);
-        const float_T rho0 = float_T(PI*w0*w0 / lambda0);
-        /* wy is width of TWTS pulse */
-        const float_T wy = float_T(w_y_SI / UNIT_LENGTH);
-        const float_T k = float_T(2.0*PI / lambda0);
-        /* If phi < 0 the entire pulse is rotated by 180 deg around the
-         * z-axis of the coordinate system without also changing
-         * the orientation of the resulting field vectors.
-         */
-        const float_T x = float_T(phiPositive * pos.x() / UNIT_LENGTH);
-        const float_T y = float_T(phiPositive * pos.y() / UNIT_LENGTH);
-        const float_T z = float_T(pos.z() / UNIT_LENGTH);
-        const float_T t = float_T(time / UNIT_TIME);
-
-        /* Shortcuts for speeding up the field calculation. */
-        const float_T sinPhi = math::sin(phiT);
-        const float_T cosPhi = math::cos(phiT);
-        const float_T sinPhi2 = math::sin(phiT / float_T(2.0));
-        const float_T cosPhi2 = math::cos(phiT / float_T(2.0));
-        const float_T tanPhi2 = math::tan(phiT / float_T(2.0));
-
-        /* The "helpVar" variables decrease the nesting level of the evaluated expressions and
-         * thus help with formal code verification through manual code inspection.
-         */
-        const complex_T helpVar1 = -(cspeed*z) - cspeed*y*math::tan(float_T(PI / 2)-phiT)
-                                    + complex_T(0,1)*cspeed*rho0 / sinPhi;
-        const complex_T helpVar2 = complex_T(0,1)*rho0 - y*cosPhi - z*sinPhi;
-        const complex_T helpVar3 = helpVar2*cspeed;
-        const complex_T helpVar4 = cspeed*om0*tauG*tauG
-                                    - complex_T(0,1)*y*cosPhi / cosPhi2 / cosPhi2*tanPhi2
-                                    - complex_T(0,2)*z*tanPhi2*tanPhi2;
-        const complex_T helpVar5 = float_T(2.0)*cspeed*t - complex_T(0,1)*cspeed*om0*tauG*tauG
-                            - float_T(2.0)*z + float_T(8.0)*y / sinPhi / sinPhi / sinPhi
-                                *sinPhi2*sinPhi2*sinPhi2*sinPhi2
-                            - float_T(2.0)*z*tanPhi2*tanPhi2;
-
-        const complex_T helpVar6 = (
-        (om0*y*rho0 / cosPhi2 / cosPhi2 / cosPhi2 / cosPhi2) / helpVar1
-        - (complex_T(0,2)*k*x*x) / helpVar2
-        - (complex_T(0,1)*om0*om0*tauG*tauG*rho0) / helpVar2
-        - (complex_T(0,4)*y*y*rho0) / (wy*wy*helpVar2)
-        + (om0*om0*tauG*tauG*y*cosPhi) / helpVar2
-        + (float_T(4.0)*y*y*y*cosPhi) / (wy*wy*helpVar2)
-        + (om0*om0*tauG*tauG*z*sinPhi) / helpVar2
-        + (float_T(4.0)*y*y*z*sinPhi) / (wy*wy*helpVar2)
-        + (complex_T(0,2)*om0*y*y*cosPhi / cosPhi2 / cosPhi2*tanPhi2) / helpVar3
-        + (om0*y*rho0*cosPhi / cosPhi2 / cosPhi2*tanPhi2) / helpVar3
-        + (complex_T(0,1)*om0*y*y*cosPhi*cosPhi/cosPhi2/cosPhi2*tanPhi2)/helpVar3
-        + (complex_T(0,4)*om0*y*z*tanPhi2*tanPhi2) / helpVar3
-        - (float_T(2.0)*om0*z*rho0*tanPhi2*tanPhi2) / helpVar3
-        - (complex_T(0,2)*om0*z*z*sinPhi*tanPhi2*tanPhi2) / helpVar3
-        - (om0*helpVar5*helpVar5) / (cspeed*helpVar4)
-        ) / float_T(4.0);
-
-        const complex_T helpVar7 = cspeed*om0*tauG*tauG
-                                    - complex_T(0,1)*y*cosPhi / cosPhi2 / cosPhi2*tanPhi2
-                                    - complex_T(0,2)*z*tanPhi2*tanPhi2;
-        const complex_T result = ( complex_T(0,2)*math::exp(helpVar6)*tauG*tanPhi2
-                                    *(cspeed*t - z + y*tanPhi2)
-                                    *math::sqrt( (om0*rho0) / helpVar3 )
-                                  ) / math::pow(helpVar7,float_T(1.5));
-
-        return result.get_real() / UNIT_SPEED;
-    }
-
-    /** Calculate the Bx(r,t) field
-     *
-     * \param pos Spatial position of the target field.
-     * \param time Absolute time (SI, including all offsets and transformations)
-     *             for calculating the field */
-    HDINLINE BField::float_T
-    BField::calcTWTSBx( const float3_64& pos, const float_64 time ) const
-    {
-        /* The Bx-field for the Ey-field is the same as
-         * for the By-field for the Ex-field except for the sign.
-         */
-        return -calcTWTSBy( pos, time );
-    }
-
-    /** Calculate the Bz(r,t) field
-     *
-     * \param pos Spatial position of the target field.
-     * \param time Absolute time (SI, including all offsets and transformations)
-     *             for calculating the field */
-    HDINLINE BField::float_T
-    BField::calcTWTSBz_Ey( const float3_64& pos, const float_64 time ) const
-    {
-        using complex_T = pmacc::math::Complex< float_T >;
-        using complex_64 = pmacc::math::Complex< float_64 >;
-        /** Unit of speed */
-        const float_64 UNIT_SPEED = SI::SPEED_OF_LIGHT_SI;
-        /** Unit of time */
-        const float_64 UNIT_TIME = SI::DELTA_T_SI;
-        /** Unit of length */
-        const float_64 UNIT_LENGTH = UNIT_TIME*UNIT_SPEED;
-
-        /* Propagation speed of overlap normalized to the speed of light [Default: beta0=1.0] */
-        const float_T beta0 = float_T(beta_0);
-        /* If phi < 0 the formulas below are not directly applicable.
-         * Instead phi is taken positive, but the entire pulse rotated by 180 deg around the
-         * z-axis of the coordinate system in this function.
-         */
-        const float_T phiReal = float_T( math::abs(phi) );
-        const float_T alphaTilt = math::atan2(float_T(1.0)-beta0*math::cos(phiReal),
-                                                beta0*math::sin(phiReal));
-        /* Definition of the laser pulse front tilt angle for the laser field below.
-         *
-         * For beta0=1.0, this is equivalent to our standard definition. Question: Why is the
-         * local "phi_T" not equal in value to the object member "phiReal" or "phi"?
-         * Because the standard TWTS pulse is defined for beta0 = 1.0 and in the coordinate-system
-         * of the TWTS model phi is responsible for pulse front tilt and dispersion only. Hence
-         * the dispersion will (although physically correct) be slightly off the ideal TWTS
-         * pulse for beta0 != 1.0. This only shows that this TWTS pulse is primarily designed for
-         * scenarios close to beta0 = 1.
-         */
-        const float_T phiT = float_T(2.0)*alphaTilt;
-
-        /* Angle between the laser pulse front and the y-axis.
-         * Not used, but remains in code for documentation purposes.
-         * const float_T eta = float_T(float_T(PI / 2)) - (phiReal - alphaTilt);
-         */
-
-        const float_T cspeed = float_T( SI::SPEED_OF_LIGHT_SI / UNIT_SPEED );
-        const float_T lambda0 = float_T(wavelength_SI / UNIT_LENGTH);
-        const float_T om0 = float_T(2.0*PI*cspeed / lambda0);
-        /* factor 2  in tauG arises from definition convention in laser formula */
-        const float_T tauG = float_T(pulselength_SI*2.0 / UNIT_TIME);
-        /* w0 is wx here --> w0 could be replaced by wx */
-        const float_T w0 = float_T(w_x_SI / UNIT_LENGTH);
-        const float_T rho0 = float_T(PI*w0*w0 / lambda0);
-        /* wy is width of TWTS pulse */
-        const float_T wy = float_T(w_y_SI / UNIT_LENGTH);
-        const float_T k = float_T(2.0*PI / lambda0);
-        /* If phi < 0 the entire pulse is rotated by 180 deg around the
-         * z-axis of the coordinate system without also changing
-         * the orientation of the resulting field vectors.
-         */
-        const float_T x = float_T(phiPositive * pos.x() / UNIT_LENGTH);
-        const float_T y = float_T(phiPositive * pos.y() / UNIT_LENGTH);
-        const float_T z = float_T(pos.z() / UNIT_LENGTH);
-        const float_T t = float_T(time / UNIT_TIME);
-
-        /* Shortcuts for speeding up the field calculation. */
-        const float_T sinPhi = math::sin(phiT);
-        const float_T cosPhi = math::cos(phiT);
-        const float_T sinPhi2 = math::sin(phiT / float_T(2.0));
-        const float_T cosPhi2 = math::cos(phiT / float_T(2.0));
-        const float_T tanPhi2 = math::tan(phiT / float_T(2.0));
-
-        /* The "helpVar" variables decrease the nesting level of the evaluated expressions and
-         * thus help with formal code verification through manual code inspection.
-         */
-        const complex_T helpVar1 =
-            complex_T(0,-1)*cspeed*om0*tauG*tauG
-            - y*cosPhi / cosPhi2 / cosPhi2 * tanPhi2
-            - float_T(2.0)*z*tanPhi2*tanPhi2;
-        const complex_T helpVar2 = complex_T(0,1)*rho0 - y*cosPhi - z*sinPhi;
-
-        const complex_T helpVar3 = (
-            - cspeed*cspeed*k*om0*tauG*tauG*wy*wy*x*x
-            - float_T(2.0)*cspeed*cspeed*om0*t*t*wy*wy*rho0
-            + complex_T(0,2)*cspeed*cspeed*om0*om0*t*tauG*tauG*wy*wy*rho0
-            - float_T(2.0)*cspeed*cspeed*om0*tauG*tauG*y*y*rho0
-            + float_T(4.0)*cspeed*om0*t*wy*wy*z*rho0
-            - complex_T(0,2)*cspeed*om0*om0*tauG*tauG*wy*wy*z*rho0
-            - float_T(2.0)*om0*wy*wy*z*z*rho0
-            - complex_T(0,8)*om0*wy*wy*y*(cspeed*t - z)*z*sinPhi2*sinPhi2
-            + complex_T(0,8) / sinPhi *(
-                float_T(2.0)*z*z*(cspeed*om0*t*wy*wy + complex_T(0,1)*cspeed*y*y - om0*wy*wy*z)
-                + y*(
-                    cspeed*k*wy*wy*x*x
-                    - complex_T(0,2)*cspeed*om0*t*wy*wy*rho0
-                    + float_T(2.0)*cspeed*y*y*rho0
-                    + complex_T(0,2)*om0*wy*wy*z*rho0
-                )*math::tan(float_T(PI) / float_T(2.0)-phiT) / sinPhi
-            )*sinPhi2*sinPhi2*sinPhi2*sinPhi2
-            - complex_T(0,2)*cspeed*cspeed*om0*t*t*wy*wy*z*sinPhi
-            - float_T(2.0)*cspeed*cspeed*om0*om0*t*tauG*tauG*wy*wy*z*sinPhi
-            - complex_T(0,2)*cspeed*cspeed*om0*tauG*tauG*y*y*z*sinPhi
-            + complex_T(0,4)*cspeed*om0*t*wy*wy*z*z*sinPhi
-            + float_T(2.0)*cspeed*om0*om0*tauG*tauG*wy*wy*z*z*sinPhi
-            - complex_T(0,2)*om0*wy*wy*z*z*z*sinPhi
-            - float_T(4.0)*cspeed*om0*t*wy*wy*y*rho0*tanPhi2
-            + float_T(4.0)*om0*wy*wy*y*z*rho0*tanPhi2
-            + complex_T(0,2)*y*y*(
-                cspeed*om0*t*wy*wy
-                + complex_T(0,1)*cspeed*y*y
-                - om0*wy*wy*z
-            )*cosPhi*cosPhi / cosPhi2 / cosPhi2 * tanPhi2
-            + complex_T(0,2)*cspeed*k*wy*wy*x*x*z*tanPhi2*tanPhi2
-            - float_T(2.0)*om0*wy*wy*y*y*rho0*tanPhi2*tanPhi2
-            + float_T(4.0)*cspeed*om0*t*wy*wy*z*rho0*tanPhi2*tanPhi2
-            + complex_T(0,4)*cspeed*y*y*z*rho0*tanPhi2*tanPhi2
-            - float_T(4.0)*om0*wy*wy*z*z*rho0*tanPhi2*tanPhi2
-            - complex_T(0,2)*om0*wy*wy*y*y*z*sinPhi*tanPhi2*tanPhi2
-            - float_T(2.0)*y*cosPhi*(
-                om0*(
-                    cspeed*cspeed*(complex_T(0,1)*t*t*wy*wy
-                    + om0*t*tauG*tauG*wy*wy
-                    + complex_T(0,1)*tauG*tauG*y*y)
-                    - cspeed*(complex_T(0,2)*t + om0*tauG*tauG)*wy*wy*z
-                    + complex_T(0,1)*wy*wy*z*z
-                )
-                + complex_T(0,2)*om0*wy*wy*y*(cspeed*t - z)*tanPhi2
-                + complex_T(0,1)*(
-                    complex_T(0,-4)*cspeed*y*y*z
-                    + om0*wy*wy*(y*y - float_T(4.0)*(cspeed*t - z)*z)
-                )*tanPhi2*tanPhi2
-            )
-        /* The "round-trip" conversion in the line below fixes a gross accuracy bug
-         * in floating-point arithmetics, when float_T is set to float_X.
-         */
-        ) * complex_T( float_64(1.0) / complex_64(float_T(2.0)*cspeed*wy*wy*helpVar2*helpVar1) );
-
-        const complex_T helpVar4 = (
-            cspeed*om0*(
-                cspeed*om0*tauG*tauG
-                - complex_T(0,8)*y*math::tan( float_T(PI) / float_T(2.0) - phiT )
-                    / sinPhi / sinPhi * sinPhi2*sinPhi2*sinPhi2*sinPhi2
-                - complex_T(0,2)*z*tanPhi2*tanPhi2
-            )
-        ) / rho0;
-
-        const complex_T result = float_T(-1.0)*(
-            cspeed*math::exp(helpVar3)*k*tauG*x*math::pow( helpVar2, float_T(-1.5) )
-            / math::sqrt(helpVar4)
-        );
-
-        return result.get_real() / UNIT_SPEED;
-    }
-
-} /* namespace twts */
-} /* namespace templates */
+            HINLINE
+            BField::BField(
+                const float_64 focus_y_SI,
+                const float_64 wavelength_SI,
+                const float_64 pulselength_SI,
+                const float_64 w_x_SI,
+                const float_64 w_y_SI,
+                const float_X phi,
+                const float_X beta_0,
+                const float_64 tdelay_user_SI,
+                const bool auto_tdelay,
+                const PolarizationType pol)
+                : focus_y_SI(focus_y_SI)
+                , wavelength_SI(wavelength_SI)
+                , pulselength_SI(pulselength_SI)
+                , w_x_SI(w_x_SI)
+                , w_y_SI(w_y_SI)
+                , phi(phi)
+                , beta_0(beta_0)
+                , tdelay_user_SI(tdelay_user_SI)
+                , dt(SI::DELTA_T_SI)
+                , unit_length(UNIT_LENGTH)
+                , auto_tdelay(auto_tdelay)
+                , pol(pol)
+                , phiPositive(float_X(1.0))
+            {
+                /* Note: Environment objects cannot be instantiated on a CUDA GPU device. Since this constructor
+                 * runs on the host (see fieldBackground.param), querying the SubGrid here is no problem.
+                 */
+                const SubGrid<simDim>& subGrid = Environment<simDim>::get().SubGrid();
+                halfSimSize = subGrid.getGlobalDomain().size / 2;
+                tdelay = detail::getInitialTimeDelay_SI(
+                    auto_tdelay,
+                    tdelay_user_SI,
+                    halfSimSize,
+                    pulselength_SI,
+                    focus_y_SI,
+                    phi,
+                    beta_0);
+                if(phi < float_X(0.0))
+                    phiPositive = float_X(-1.0);
+            }
+
+            template<>
+            HDINLINE float3_X BField::getTWTSBfield_Normalized<DIM3>(
+                const pmacc::math::Vector<floatD_64, detail::numComponents>& bFieldPositions_SI,
+                const float_64 time) const
+            {
+                typedef pmacc::math::Vector<float3_64, detail::numComponents> PosVecVec;
+                PosVecVec pos(PosVecVec::create(float3_64::create(0.0)));
+
+                for(uint32_t k = 0; k < detail::numComponents; ++k)
+                {
+                    for(uint32_t i = 0; i < simDim; ++i)
+                        pos[k][i] = bFieldPositions_SI[k][i];
+                }
+
+                /* An example of intra-cell position offsets is the staggered Yee-grid.
+                 *
+                 * Calculate By-component with the intra-cell offset of a By-field
+                 */
+                const float_64 By_By = calcTWTSBy(pos[1], time);
+                /* Calculate Bz-component with the intra-cell offset of a By-field */
+                const float_64 Bz_By = calcTWTSBz_Ex(pos[1], time);
+                /* Calculate By-component with the intra-cell offset of a Bz-field */
+                const float_64 By_Bz = calcTWTSBy(pos[2], time);
+                /* Calculate Bz-component with the intra-cell offset of a Bz-field */
+                const float_64 Bz_Bz = calcTWTSBz_Ex(pos[2], time);
+                /* Since we rotated all position vectors before calling calcTWTSBy and calcTWTSBz_Ex,
+                 * we need to back-rotate the resulting B-field vector.
+                 *
+                 * RotationMatrix[-(PI/2+phi)].(By,Bz) for rotating back the field vectors.
+                 */
+                const float_64 By_rot = -math::sin(+phi) * By_By + math::cos(+phi) * Bz_By;
+                const float_64 Bz_rot = -math::cos(+phi) * By_Bz - math::sin(+phi) * Bz_Bz;
+
+                /* Finally, the B-field normalized to the peak amplitude. */
+                return float3_X(float_X(0.0), float_X(By_rot), float_X(Bz_rot));
+            }
+
+            template<>
+            HDINLINE float3_X BField::getTWTSBfield_Normalized_Ey<DIM3>(
+                const pmacc::math::Vector<floatD_64, detail::numComponents>& bFieldPositions_SI,
+                const float_64 time) const
+            {
+                typedef pmacc::math::Vector<float3_64, detail::numComponents> PosVecVec;
+                PosVecVec pos(PosVecVec::create(float3_64::create(0.0)));
+
+                for(uint32_t k = 0; k < detail::numComponents; ++k)
+                {
+                    for(uint32_t i = 0; i < simDim; ++i)
+                        pos[k][i] = bFieldPositions_SI[k][i];
+                }
+
+                /* Calculate Bz-component with the intra-cell offset of a By-field */
+                const float_64 Bz_By = calcTWTSBz_Ey(pos[1], time);
+                /* Calculate Bz-component with the intra-cell offset of a Bz-field */
+                const float_64 Bz_Bz = calcTWTSBz_Ey(pos[2], time);
+                /* Since we rotated all position vectors before calling calcTWTSBz_Ey,
+                 * we need to back-rotate the resulting B-field vector.
+                 *
+                 * RotationMatrix[-(PI/2+phi)].(By,Bz) for rotating back the field-vectors; only Bz terms enter here.
+                 */
+                const float_64 By_rot = +math::cos(+phi) * Bz_By;
+                const float_64 Bz_rot = -math::sin(+phi) * Bz_Bz;
+
+                /* Finally, the B-field normalized to the peak amplitude; Bx is evaluated at pos[0]. */
+                return float3_X(float_X(calcTWTSBx(pos[0], time)), float_X(By_rot), float_X(Bz_rot));
+            }
+
+            template<>
+            HDINLINE float3_X BField::getTWTSBfield_Normalized<DIM2>(
+                const pmacc::math::Vector<floatD_64, detail::numComponents>& bFieldPositions_SI,
+                const float_64 time) const
+            {
+                typedef pmacc::math::Vector<float3_64, detail::numComponents> PosVecVec;
+                PosVecVec pos(PosVecVec::create(float3_64::create(0.0)));
+
+                for(uint32_t k = 0; k < detail::numComponents; ++k)
+                {
+                    /* 2D (y,z) vectors are mapped on 3D (x,y,z) vectors; the 3D x-entry stays 0.0. */
+                    for(uint32_t i = 0; i < DIM2; ++i)
+                        pos[k][i + 1] = bFieldPositions_SI[k][i];
+                }
+
+                /* General background comment for the rest of this function:
+                 *
+                 * Corresponding position vector for the field components in 2D simulations.
+                 *  3D     3D vectors in 2D space (x, y)
+                 *  x -->  z (Meaning: In 2D-sim, insert cell-coordinate x
+                 *            into TWTS field function coordinate z.)
+                 *  y -->  y
+                 *  z --> -x (Since z=0 for 2D, we use the existing
+                 *            3D TWTS-field-function and set x = -0)
+                 *  The transformed 3D coordinates are used to calculate the field components.
+                 *  Ex --> Ez (Meaning: Calculate Ex-component of existing 3D TWTS-field (calcTWTSEx) using
+                 *             transformed position vectors to obtain the corresponding Ez-component in 2D.
+                 *             Note: Swapping field component coordinates also alters the
+                 *                   intra-cell position offset.)
+                 *  By --> By
+                 *  Bz --> -Bx (Yes, the sign is necessary.)
+                 *
+                 * An example of intra-cell position offsets is the staggered Yee-grid.
+                 *
+                 * This procedure is analogous to 3D case, but replace By --> By and Bz --> -Bx. Hence the
+                 * grid cell offset for Bx has to be used instead of Bz. Mind the "-"-sign.
+                 */
+
+                /* Calculate By-component with the intra-cell offset of a By-field */
+                const float_64 By_By = calcTWTSBy(pos[1], time);
+                /* Calculate Bx-component with the intra-cell offset of a By-field */
+                const float_64 Bx_By = -calcTWTSBz_Ex(pos[1], time);
+                /* Calculate By-component with the intra-cell offset of a Bx-field */
+                const float_64 By_Bx = calcTWTSBy(pos[0], time);
+                /* Calculate Bx-component with the intra-cell offset of a Bx-field */
+                const float_64 Bx_Bx = -calcTWTSBz_Ex(pos[0], time);
+                /* Since we rotated all position vectors before calling calcTWTSBy and calcTWTSBz_Ex, we
+                 * need to back-rotate the resulting B-field vector. Now the rotation is done
+                 * analogously in the (y,x)-plane. (Reverse of the position vector transformation.)
+                 *
+                 * RotationMatrix[-(PI / 2+phi)].(By,Bx) for rotating back the field vectors.
+                 */
+                const float_64 By_rot = -math::sin(phi) * By_By + math::cos(phi) * Bx_By;
+                const float_64 Bx_rot = -math::cos(phi) * By_Bx - math::sin(phi) * Bx_Bx;
+
+                /* Finally, the B-field normalized to the peak amplitude. */
+                return float3_X(float_X(Bx_rot), float_X(By_rot), float_X(0.0));
+            }
+
+            template<>
+            HDINLINE float3_X BField::getTWTSBfield_Normalized_Ey<DIM2>(
+                const pmacc::math::Vector<floatD_64, detail::numComponents>& bFieldPositions_SI,
+                const float_64 time) const
+            {
+                typedef pmacc::math::Vector<float3_64, detail::numComponents> PosVecVec;
+                PosVecVec pos(PosVecVec::create(float3_64::create(0.0)));
+
+                for(uint32_t k = 0; k < detail::numComponents; ++k)
+                {
+                    /* The 2D output of getFieldPositions_SI only returns
+                     * the y- and z-component of a 3D vector.
+                     */
+                    for(uint32_t i = 0; i < DIM2; ++i)
+                        pos[k][i + 1] = bFieldPositions_SI[k][i];
+                }
+
+                /* General background comment for the rest of this function:
+                 *
+                 * Corresponding position vector for the field components in 2D simulations.
+                 *  3D     3D vectors in 2D space (x, y)
+                 *  x -->  z (Meaning: In 2D-sim, insert cell-coordinate x
+                 *            into TWTS field function coordinate z.)
+                 *  y -->  y
+                 *  z --> -x (Since z=0 for 2D, we use the existing
+                 *            3D TWTS-field-function and set x = -0)
+                 *  Ex --> Ez (Meaning: Calculate Ex-component of existing 3D TWTS-field to obtain
+                 *             corresponding Ez-component in 2D.
+                 *             Note: the intra-cell position offset due to the staggered grid for Ez.)
+                 *  By --> By
+                 *  Bz --> -Bx (Yes, the sign is necessary.)
+                 *
+                 * This procedure is analogous to 3D case, but replace By --> By and Bz --> -Bx. Hence the
+                 * grid cell offset for Bx has to be used instead of Bz. Mind the "-"-sign.
+                 */
+
+                /* Calculate Bx-component with the intra-cell offset of a By-field */
+                const float_64 Bx_By = -calcTWTSBz_Ex(pos[1], time);
+                /* Calculate Bx-component with the intra-cell offset of a Bx-field */
+                const float_64 Bx_Bx = -calcTWTSBz_Ex(pos[0], time);
+
+                /* Since we rotated all position vectors before calling calcTWTSBz_Ex, we
+                 * need to back-rotate the resulting B-field vector. Now the rotation is done
+                 * analogously in the (y,x)-plane. (Reverse of the position vector transformation.)
+                 *
+                 * RotationMatrix[-(PI / 2+phi)].(By,Bx)
+                 * for rotating back the field-vectors.
+                 */
+                const float_64 By_rot = +math::cos(phi) * Bx_By;
+                const float_64 Bx_rot = -math::sin(phi) * Bx_Bx;
+
+                /* Finally, the B-field normalized to the peak amplitude; the third entry is evaluated at pos[2]. */
+                return float3_X(float_X(Bx_rot), float_X(By_rot), float_X(calcTWTSBx(pos[2], time)));
+            }
+
+            /** Evaluate the TWTS background B-field for one cell at one time step
+             *
+             * The time is computed as currentStep * dt - tdelay and the per-component field
+             * positions are obtained from detail::getFieldPositions_SI. The polarization
+             * member `pol` then selects which normalized-field routine is evaluated.
+             *
+             * \param cellIdx Cell index that is forwarded to detail::getFieldPositions_SI.
+             * \param currentStep Time step index, converted to a time via dt and tdelay.
+             * \return Result of getTWTSBfield_Normalized_Ey for LINEAR_YZ, otherwise
+             *         (LINEAR_X and any other value) the result of getTWTSBfield_Normalized. */
+            HDINLINE float3_X BField::operator()(const DataSpace<simDim>& cellIdx, const uint32_t currentStep) const
+            {
+                using PositionVectors = pmacc::math::Vector<floatD_64, detail::numComponents>;
+
+                const traits::FieldPosition<fields::CellType, FieldB> fieldPosB;
+                const float_64 time_SI = float_64(currentStep) * dt - tdelay;
+
+                /* One position per field component (staggered grid offsets come from fieldPosB). */
+                const PositionVectors positions_SI
+                    = detail::getFieldPositions_SI(cellIdx, halfSimSize, fieldPosB(), unit_length, focus_y_SI, phi);
+
+                /* Single TWTS-Pulse: only LINEAR_YZ needs the Ey-based field set. */
+                if(pol == LINEAR_YZ)
+                {
+                    return getTWTSBfield_Normalized_Ey<simDim>(positions_SI, time_SI);
+                }
+
+                /* LINEAR_X, and also the fallback for any unexpected polarization value. */
+                return getTWTSBfield_Normalized<simDim>(positions_SI, time_SI);
+            }
+
+            /** Calculate the By(r,t) field
+             *
+             * The position is divided by UNIT_LENGTH and the time by UNIT_TIME before the
+             * closed-form complex field expression is evaluated.
+             *
+             * \param pos Spatial position of the target field.
+             * \param time Absolute time (SI, including all offsets and transformations)
+             *             for calculating the field
+             * \return Real part of the complex field expression, divided by UNIT_SPEED. */
+            HDINLINE BField::float_T BField::calcTWTSBy(const float3_64& pos, const float_64 time) const
+            {
+                using complex_T = pmacc::math::Complex<float_T>;
+                using complex_64 = pmacc::math::Complex<float_64>;
+                /* Unit of speed */
+                const float_64 UNIT_SPEED = SI::SPEED_OF_LIGHT_SI;
+                /* Unit of time */
+                const float_64 UNIT_TIME = SI::DELTA_T_SI;
+                /* Unit of length */
+                const float_64 UNIT_LENGTH = UNIT_TIME * UNIT_SPEED;
+
+                /* Propagation speed of overlap normalized to the speed of light [Default: beta0=1.0] */
+                const float_T beta0 = float_T(beta_0);
+                /* If phi < 0 the formulas below are not directly applicable.
+                 * Instead phi is taken positive, but the entire pulse rotated by 180 deg around the
+                 * z-axis of the coordinate system in this function.
+                 */
+                const float_T phiReal = float_T(math::abs(phi));
+                const float_T alphaTilt
+                    = math::atan2(float_T(1.0) - beta0 * math::cos(phiReal), beta0 * math::sin(phiReal));
+                /* Definition of the laser pulse front tilt angle for the laser field below.
+                 *
+                 * For beta0=1.0, this is equivalent to our standard definition. Question: Why is the
+                 * local "phiT" not equal in value to the object member "phiReal" or "phi"?
+                 * Because the standard TWTS pulse is defined for beta0 = 1.0 and in the coordinate-system
+                 * of the TWTS model phi is responsible for pulse front tilt and dispersion only. Hence
+                 * the dispersion will (although physically correct) be slightly off the ideal TWTS
+                 * pulse for beta0 != 1.0. This only shows that this TWTS pulse is primarily designed for
+                 * scenarios close to beta0 = 1.
+                 */
+                const float_T phiT = float_T(2.0) * alphaTilt;
+
+                /* Angle between the laser pulse front and the y-axis. Not used, but remains in code for
+                 * documentation purposes.
+                 * const float_T eta = float_T(PI/2) - (phiReal - alphaTilt);
+                 */
+
+                /* All laser parameters below are converted to the internal unit system. */
+                const float_T cspeed = float_T(SI::SPEED_OF_LIGHT_SI / UNIT_SPEED);
+                const float_T lambda0 = float_T(wavelength_SI / UNIT_LENGTH);
+                const float_T om0 = float_T(2.0 * PI * cspeed / lambda0);
+                /* factor 2  in tauG arises from definition convention in laser formula */
+                const float_T tauG = float_T(pulselength_SI * 2.0 / UNIT_TIME);
+                /* w0 is wx here --> w0 could be replaced by wx */
+                const float_T w0 = float_T(w_x_SI / UNIT_LENGTH);
+                const float_T rho0 = float_T(PI * w0 * w0 / lambda0);
+                /* wy is width of TWTS pulse */
+                const float_T wy = float_T(w_y_SI / UNIT_LENGTH);
+                const float_T k = float_T(2.0 * PI / lambda0);
+                /* If phi < 0 the entire pulse is rotated by 180 deg around the
+                 * z-axis of the coordinate system without also changing
+                 * the orientation of the resulting field vectors.
+                 * Only x and y are multiplied by phiPositive; z is left unchanged.
+                 */
+                const float_T x = float_T(phiPositive * pos.x() / UNIT_LENGTH);
+                const float_T y = float_T(phiPositive * pos.y() / UNIT_LENGTH);
+                const float_T z = float_T(pos.z() / UNIT_LENGTH);
+                const float_T t = float_T(time / UNIT_TIME);
+
+                /* Shortcuts for speeding up the field calculation.
+                 *
+                 * NOTE(review): the half-angle arguments below divide by the double literal 2.0, so
+                 * for a single-precision float_T they are promoted to double before the call.
+                 * calcTWTSBz_Ex and calcTWTSBz_Ey divide by float_T(2.0) instead - confirm whether
+                 * the difference is intended.
+                 */
+                const float_T sinPhi = math::sin(phiT);
+                const float_T cosPhi = math::cos(phiT);
+                const float_T cosPhi2 = math::cos(phiT / 2.0);
+                const float_T tanPhi2 = math::tan(phiT / 2.0);
+
+                /* The "helpVar" variables decrease the nesting level of the evaluated expressions and
+                 * thus help with formal code verification through manual code inspection.
+                 */
+                const complex_T helpVar1 = rho0 + complex_T(0, 1) * y * cosPhi + complex_T(0, 1) * z * sinPhi;
+                const complex_T helpVar2 = cspeed * om0 * tauG * tauG
+                    + complex_T(0, 2) * (-z - y * math::tan(float_T(PI / 2) - phiT)) * tanPhi2 * tanPhi2;
+                const complex_T helpVar3 = complex_T(0, 1) * rho0 - y * cosPhi - z * sinPhi;
+
+                /* helpVar4 is the argument of the exponential in the final result. */
+                const complex_T helpVar4 = float_T(-1.0)
+                    * (cspeed * cspeed * k * om0 * tauG * tauG * wy * wy * x * x
+                       + float_T(2.0) * cspeed * cspeed * om0 * t * t * wy * wy * rho0
+                       - complex_T(0, 2) * cspeed * cspeed * om0 * om0 * t * tauG * tauG * wy * wy * rho0
+                       + float_T(2.0) * cspeed * cspeed * om0 * tauG * tauG * y * y * rho0
+                       - float_T(4.0) * cspeed * om0 * t * wy * wy * z * rho0
+                       + complex_T(0, 2) * cspeed * om0 * om0 * tauG * tauG * wy * wy * z * rho0
+                       + float_T(2.0) * om0 * wy * wy * z * z * rho0
+                       + float_T(4.0) * cspeed * om0 * t * wy * wy * y * rho0 * tanPhi2
+                       - float_T(4.0) * om0 * wy * wy * y * z * rho0 * tanPhi2
+                       - complex_T(0, 2) * cspeed * k * wy * wy * x * x * z * tanPhi2 * tanPhi2
+                       + float_T(2.0) * om0 * wy * wy * y * y * rho0 * tanPhi2 * tanPhi2
+                       - float_T(4.0) * cspeed * om0 * t * wy * wy * z * rho0 * tanPhi2 * tanPhi2
+                       - complex_T(0, 4) * cspeed * y * y * z * rho0 * tanPhi2 * tanPhi2
+                       + float_T(4.0) * om0 * wy * wy * z * z * rho0 * tanPhi2 * tanPhi2
+                       - complex_T(0, 2) * cspeed * k * wy * wy * x * x * y * math::tan(float_T(PI / 2) - phiT)
+                           * tanPhi2 * tanPhi2
+                       - float_T(4.0) * cspeed * om0 * t * wy * wy * y * rho0 * math::tan(float_T(PI / 2) - phiT)
+                           * tanPhi2 * tanPhi2
+                       - complex_T(0, 4) * cspeed * y * y * y * rho0 * math::tan(float_T(PI / 2) - phiT) * tanPhi2
+                           * tanPhi2
+                       + float_T(4.0) * om0 * wy * wy * y * z * rho0 * math::tan(float_T(PI / 2) - phiT) * tanPhi2
+                           * tanPhi2
+                       + float_T(2.0) * z * sinPhi
+                           * (+om0
+                                  * (+cspeed * cspeed
+                                         * (complex_T(0, 1) * t * t * wy * wy + om0 * t * tauG * tauG * wy * wy
+                                            + complex_T(0, 1) * tauG * tauG * y * y)
+                                     - cspeed * (complex_T(0, 2) * t + om0 * tauG * tauG) * wy * wy * z
+                                     + complex_T(0, 1) * wy * wy * z * z)
+                              + complex_T(0, 2) * om0 * wy * wy * y * (cspeed * t - z) * tanPhi2
+                              + complex_T(0, 1) * tanPhi2 * tanPhi2
+                                  * (complex_T(0, -2) * cspeed * y * y * z
+                                     + om0 * wy * wy * (y * y - float_T(2.0) * (cspeed * t - z) * z)))
+                       + float_T(2.0) * y * cosPhi
+                           * (+om0
+                                  * (+cspeed * cspeed
+                                         * (complex_T(0, 1) * t * t * wy * wy + om0 * t * tauG * tauG * wy * wy
+                                            + complex_T(0, 1) * tauG * tauG * y * y)
+                                     - cspeed * (complex_T(0, 2) * t + om0 * tauG * tauG) * wy * wy * z
+                                     + complex_T(0, 1) * wy * wy * z * z)
+                              + complex_T(0, 2) * om0 * wy * wy * y * (cspeed * t - z) * tanPhi2
+                              + complex_T(0, 1)
+                                  * (complex_T(0, -4) * cspeed * y * y * z
+                                     + om0 * wy * wy * (y * y - float_T(4.0) * (cspeed * t - z) * z)
+                                     - float_T(2.0) * y
+                                         * (+cspeed * om0 * t * wy * wy + complex_T(0, 1) * cspeed * y * y
+                                            - om0 * wy * wy * z)
+                                         * math::tan(float_T(PI / 2) - phiT))
+                                  * tanPhi2 * tanPhi2)
+                       /* The "round-trip" conversion in the line below fixes a gross accuracy bug
+                        * in floating-point arithmetics, when float_T is set to float_X.
+                        * The reciprocal of the denominator is computed as complex_64 and only then
+                        * converted back to complex_T.
+                        */
+                       )
+                    * complex_T(float_64(1.0) / complex_64(float_T(2.0) * cspeed * wy * wy * helpVar1 * helpVar2));
+
+                const complex_T helpVar5 = complex_T(0, -1) * cspeed * om0 * tauG * tauG
+                    + (-z - y * math::tan(float_T(PI / 2) - phiT)) * tanPhi2 * tanPhi2 * float_T(2.0);
+                const complex_T helpVar6
+                    = (cspeed
+                       * (cspeed * om0 * tauG * tauG
+                          + complex_T(0, 2) * (-z - y * math::tan(float_T(PI / 2) - phiT)) * tanPhi2 * tanPhi2))
+                    / (om0 * rho0);
+                /* NOTE(review): the factor (rho0 + i*y*cosPhi + i*z*sinPhi) below is the same
+                 * expression as helpVar1, written out a second time.
+                 */
+                const complex_T result
+                    = (math::exp(helpVar4) * tauG / cosPhi2 / cosPhi2
+                       * (rho0 + complex_T(0, 1) * y * cosPhi + complex_T(0, 1) * z * sinPhi)
+                       * (complex_T(0, 2) * cspeed * t + cspeed * om0 * tauG * tauG - complex_T(0, 4) * z
+                          + cspeed * (complex_T(0, 2) * t + om0 * tauG * tauG) * cosPhi
+                          + complex_T(0, 2) * y * tanPhi2)
+                       * math::pow(helpVar3, float_T(-1.5)))
+                    / (float_T(2.0) * helpVar5 * math::sqrt(helpVar6));
+
+                /* Only the real part of the complex expression is returned, divided by UNIT_SPEED. */
+                return result.get_real() / UNIT_SPEED;
+            }
+
+            /** Calculate the Bz(r,t) field
+             *
+             * NOTE(review): the "_Ex" suffix presumably marks this as the Bz component that
+             * accompanies the Ex-field of the TWTS pulse - confirm against the class declaration.
+             *
+             * The position is divided by UNIT_LENGTH and the time by UNIT_TIME before the
+             * closed-form complex field expression is evaluated.
+             *
+             * \param pos Spatial position of the target field.
+             * \param time Absolute time (SI, including all offsets and transformations)
+             *             for calculating the field
+             * \return Real part of the complex field expression, divided by UNIT_SPEED. */
+            HDINLINE BField::float_T BField::calcTWTSBz_Ex(const float3_64& pos, const float_64 time) const
+            {
+                using complex_T = pmacc::math::Complex<float_T>;
+                /** Unit of Speed */
+                const float_64 UNIT_SPEED = SI::SPEED_OF_LIGHT_SI;
+                /** Unit of time */
+                const float_64 UNIT_TIME = SI::DELTA_T_SI;
+                /** Unit of length */
+                const float_64 UNIT_LENGTH = UNIT_TIME * UNIT_SPEED;
+
+                /* propagation speed of overlap normalized to the speed of light [Default: beta0=1.0] */
+                const float_T beta0 = float_T(beta_0);
+                /* If phi < 0 the formulas below are not directly applicable.
+                 * Instead phi is taken positive, but the entire pulse rotated by 180 deg around the
+                 * z-axis of the coordinate system in this function.
+                 */
+                const float_T phiReal = float_T(math::abs(phi));
+                const float_T alphaTilt
+                    = math::atan2(float_T(1.0) - beta0 * math::cos(phiReal), beta0 * math::sin(phiReal));
+
+                /* Definition of the laser pulse front tilt angle for the laser field below.
+                 *
+                 * For beta0=1.0, this is equivalent to our standard definition. Question: Why is the
+                 * local "phiT" not equal in value to the object member "phiReal" or "phi"?
+                 * Because the standard TWTS pulse is defined for beta0 = 1.0 and in the coordinate-system
+                 * of the TWTS model phi is responsible for pulse front tilt and dispersion only. Hence
+                 * the dispersion will (although physically correct) be slightly off the ideal TWTS
+                 * pulse for beta0 != 1.0. This only shows that this TWTS pulse is primarily designed for
+                 * scenarios close to beta0 = 1.
+                 */
+                const float_T phiT = float_T(2.0) * alphaTilt;
+
+                /* Angle between the laser pulse front and the y-axis.
+                 * Not used, but remains in code for documentation purposes.
+                 * const float_T eta = float_T(float_T(PI / 2)) - (phiReal - alphaTilt);
+                 */
+
+                /* All laser parameters below are converted to the internal unit system. */
+                const float_T cspeed = float_T(SI::SPEED_OF_LIGHT_SI / UNIT_SPEED);
+                const float_T lambda0 = float_T(wavelength_SI / UNIT_LENGTH);
+                const float_T om0 = float_T(2.0 * PI * cspeed / lambda0);
+                /* factor 2  in tauG arises from definition convention in laser formula */
+                const float_T tauG = float_T(pulselength_SI * 2.0 / UNIT_TIME);
+                /* w0 is wx here --> w0 could be replaced by wx */
+                const float_T w0 = float_T(w_x_SI / UNIT_LENGTH);
+                const float_T rho0 = float_T(PI * w0 * w0 / lambda0);
+                /* wy is width of TWTS pulse */
+                const float_T wy = float_T(w_y_SI / UNIT_LENGTH);
+                const float_T k = float_T(2.0 * PI / lambda0);
+                /* If phi < 0 the entire pulse is rotated by 180 deg around the
+                 * z-axis of the coordinate system without also changing
+                 * the orientation of the resulting field vectors.
+                 * Only x and y are multiplied by phiPositive; z is left unchanged.
+                 */
+                const float_T x = float_T(phiPositive * pos.x() / UNIT_LENGTH);
+                const float_T y = float_T(phiPositive * pos.y() / UNIT_LENGTH);
+                const float_T z = float_T(pos.z() / UNIT_LENGTH);
+                const float_T t = float_T(time / UNIT_TIME);
+
+                /* Shortcuts for speeding up the field calculation. */
+                const float_T sinPhi = math::sin(phiT);
+                const float_T cosPhi = math::cos(phiT);
+                const float_T sinPhi2 = math::sin(phiT / float_T(2.0));
+                const float_T cosPhi2 = math::cos(phiT / float_T(2.0));
+                const float_T tanPhi2 = math::tan(phiT / float_T(2.0));
+
+                /* The "helpVar" variables decrease the nesting level of the evaluated expressions and
+                 * thus help with formal code verification through manual code inspection.
+                 */
+                const complex_T helpVar1 = -(cspeed * z) - cspeed * y * math::tan(float_T(PI / 2) - phiT)
+                    + complex_T(0, 1) * cspeed * rho0 / sinPhi;
+                const complex_T helpVar2 = complex_T(0, 1) * rho0 - y * cosPhi - z * sinPhi;
+                const complex_T helpVar3 = helpVar2 * cspeed;
+                const complex_T helpVar4 = cspeed * om0 * tauG * tauG
+                    - complex_T(0, 1) * y * cosPhi / cosPhi2 / cosPhi2 * tanPhi2
+                    - complex_T(0, 2) * z * tanPhi2 * tanPhi2;
+                const complex_T helpVar5 = float_T(2.0) * cspeed * t - complex_T(0, 1) * cspeed * om0 * tauG * tauG
+                    - float_T(2.0) * z
+                    + float_T(8.0) * y / sinPhi / sinPhi / sinPhi * sinPhi2 * sinPhi2 * sinPhi2 * sinPhi2
+                    - float_T(2.0) * z * tanPhi2 * tanPhi2;
+
+                /* helpVar6 is the argument of the exponential in the final result. */
+                const complex_T helpVar6
+                    = ((om0 * y * rho0 / cosPhi2 / cosPhi2 / cosPhi2 / cosPhi2) / helpVar1
+                       - (complex_T(0, 2) * k * x * x) / helpVar2
+                       - (complex_T(0, 1) * om0 * om0 * tauG * tauG * rho0) / helpVar2
+                       - (complex_T(0, 4) * y * y * rho0) / (wy * wy * helpVar2)
+                       + (om0 * om0 * tauG * tauG * y * cosPhi) / helpVar2
+                       + (float_T(4.0) * y * y * y * cosPhi) / (wy * wy * helpVar2)
+                       + (om0 * om0 * tauG * tauG * z * sinPhi) / helpVar2
+                       + (float_T(4.0) * y * y * z * sinPhi) / (wy * wy * helpVar2)
+                       + (complex_T(0, 2) * om0 * y * y * cosPhi / cosPhi2 / cosPhi2 * tanPhi2) / helpVar3
+                       + (om0 * y * rho0 * cosPhi / cosPhi2 / cosPhi2 * tanPhi2) / helpVar3
+                       + (complex_T(0, 1) * om0 * y * y * cosPhi * cosPhi / cosPhi2 / cosPhi2 * tanPhi2) / helpVar3
+                       + (complex_T(0, 4) * om0 * y * z * tanPhi2 * tanPhi2) / helpVar3
+                       - (float_T(2.0) * om0 * z * rho0 * tanPhi2 * tanPhi2) / helpVar3
+                       - (complex_T(0, 2) * om0 * z * z * sinPhi * tanPhi2 * tanPhi2) / helpVar3
+                       - (om0 * helpVar5 * helpVar5) / (cspeed * helpVar4))
+                    / float_T(4.0);
+
+                /* NOTE(review): helpVar7 is written with exactly the same expression as helpVar4. */
+                const complex_T helpVar7 = cspeed * om0 * tauG * tauG
+                    - complex_T(0, 1) * y * cosPhi / cosPhi2 / cosPhi2 * tanPhi2
+                    - complex_T(0, 2) * z * tanPhi2 * tanPhi2;
+                const complex_T result = (complex_T(0, 2) * math::exp(helpVar6) * tauG * tanPhi2
+                                          * (cspeed * t - z + y * tanPhi2) * math::sqrt((om0 * rho0) / helpVar3))
+                    / math::pow(helpVar7, float_T(1.5));
+
+                /* Only the real part of the complex expression is returned, divided by UNIT_SPEED. */
+                return result.get_real() / UNIT_SPEED;
+            }
+
+            /** Calculate the Bx(r,t) field
+             *
+             * Delegates to calcTWTSBy and flips the sign of its result.
+             *
+             * \param pos Spatial position of the target field.
+             * \param time Absolute time (SI, including all offsets and transformations)
+             *             for calculating the field
+             * \return The negated value of calcTWTSBy(pos, time). */
+            HDINLINE BField::float_T BField::calcTWTSBx(const float3_64& pos, const float_64 time) const
+            {
+                /* Apart from the sign, the Bx-field belonging to the Ey-field
+                 * equals the By-field belonging to the Ex-field.
+                 */
+                const float_T byOfExField = calcTWTSBy(pos, time);
+                return -byOfExField;
+            }
+
+            /** Calculate the Bz(r,t) field
+             *
+             * NOTE(review): the "_Ey" suffix presumably marks this as the Bz component that
+             * accompanies the Ey-field of the TWTS pulse - confirm against the class declaration.
+             *
+             * The position is divided by UNIT_LENGTH and the time by UNIT_TIME before the
+             * closed-form complex field expression is evaluated.
+             *
+             * \param pos Spatial position of the target field.
+             * \param time Absolute time (SI, including all offsets and transformations)
+             *             for calculating the field
+             * \return Real part of the complex field expression, divided by UNIT_SPEED. */
+            HDINLINE BField::float_T BField::calcTWTSBz_Ey(const float3_64& pos, const float_64 time) const
+            {
+                using complex_T = pmacc::math::Complex<float_T>;
+                using complex_64 = pmacc::math::Complex<float_64>;
+                /** Unit of speed */
+                const float_64 UNIT_SPEED = SI::SPEED_OF_LIGHT_SI;
+                /** Unit of time */
+                const float_64 UNIT_TIME = SI::DELTA_T_SI;
+                /** Unit of length */
+                const float_64 UNIT_LENGTH = UNIT_TIME * UNIT_SPEED;
+
+                /* Propagation speed of overlap normalized to the speed of light [Default: beta0=1.0] */
+                const float_T beta0 = float_T(beta_0);
+                /* If phi < 0 the formulas below are not directly applicable.
+                 * Instead phi is taken positive, but the entire pulse rotated by 180 deg around the
+                 * z-axis of the coordinate system in this function.
+                 */
+                const float_T phiReal = float_T(math::abs(phi));
+                const float_T alphaTilt
+                    = math::atan2(float_T(1.0) - beta0 * math::cos(phiReal), beta0 * math::sin(phiReal));
+                /* Definition of the laser pulse front tilt angle for the laser field below.
+                 *
+                 * For beta0=1.0, this is equivalent to our standard definition. Question: Why is the
+                 * local "phiT" not equal in value to the object member "phiReal" or "phi"?
+                 * Because the standard TWTS pulse is defined for beta0 = 1.0 and in the coordinate-system
+                 * of the TWTS model phi is responsible for pulse front tilt and dispersion only. Hence
+                 * the dispersion will (although physically correct) be slightly off the ideal TWTS
+                 * pulse for beta0 != 1.0. This only shows that this TWTS pulse is primarily designed for
+                 * scenarios close to beta0 = 1.
+                 */
+                const float_T phiT = float_T(2.0) * alphaTilt;
+
+                /* Angle between the laser pulse front and the y-axis.
+                 * Not used, but remains in code for documentation purposes.
+                 * const float_T eta = float_T(float_T(PI / 2)) - (phiReal - alphaTilt);
+                 */
+
+                /* All laser parameters below are converted to the internal unit system. */
+                const float_T cspeed = float_T(SI::SPEED_OF_LIGHT_SI / UNIT_SPEED);
+                const float_T lambda0 = float_T(wavelength_SI / UNIT_LENGTH);
+                const float_T om0 = float_T(2.0 * PI * cspeed / lambda0);
+                /* factor 2  in tauG arises from definition convention in laser formula */
+                const float_T tauG = float_T(pulselength_SI * 2.0 / UNIT_TIME);
+                /* w0 is wx here --> w0 could be replaced by wx */
+                const float_T w0 = float_T(w_x_SI / UNIT_LENGTH);
+                const float_T rho0 = float_T(PI * w0 * w0 / lambda0);
+                /* wy is width of TWTS pulse */
+                const float_T wy = float_T(w_y_SI / UNIT_LENGTH);
+                const float_T k = float_T(2.0 * PI / lambda0);
+                /* If phi < 0 the entire pulse is rotated by 180 deg around the
+                 * z-axis of the coordinate system without also changing
+                 * the orientation of the resulting field vectors.
+                 * Only x and y are multiplied by phiPositive; z is left unchanged.
+                 */
+                const float_T x = float_T(phiPositive * pos.x() / UNIT_LENGTH);
+                const float_T y = float_T(phiPositive * pos.y() / UNIT_LENGTH);
+                const float_T z = float_T(pos.z() / UNIT_LENGTH);
+                const float_T t = float_T(time / UNIT_TIME);
+
+                /* Shortcuts for speeding up the field calculation. */
+                const float_T sinPhi = math::sin(phiT);
+                const float_T cosPhi = math::cos(phiT);
+                const float_T sinPhi2 = math::sin(phiT / float_T(2.0));
+                const float_T cosPhi2 = math::cos(phiT / float_T(2.0));
+                const float_T tanPhi2 = math::tan(phiT / float_T(2.0));
+
+                /* The "helpVar" variables decrease the nesting level of the evaluated expressions and
+                 * thus help with formal code verification through manual code inspection.
+                 */
+                const complex_T helpVar1 = complex_T(0, -1) * cspeed * om0 * tauG * tauG
+                    - y * cosPhi / cosPhi2 / cosPhi2 * tanPhi2 - float_T(2.0) * z * tanPhi2 * tanPhi2;
+                const complex_T helpVar2 = complex_T(0, 1) * rho0 - y * cosPhi - z * sinPhi;
+
+                /* helpVar3 is the argument of the exponential in the final result. */
+                const complex_T helpVar3
+                    = (-cspeed * cspeed * k * om0 * tauG * tauG * wy * wy * x * x
+                       - float_T(2.0) * cspeed * cspeed * om0 * t * t * wy * wy * rho0
+                       + complex_T(0, 2) * cspeed * cspeed * om0 * om0 * t * tauG * tauG * wy * wy * rho0
+                       - float_T(2.0) * cspeed * cspeed * om0 * tauG * tauG * y * y * rho0
+                       + float_T(4.0) * cspeed * om0 * t * wy * wy * z * rho0
+                       - complex_T(0, 2) * cspeed * om0 * om0 * tauG * tauG * wy * wy * z * rho0
+                       - float_T(2.0) * om0 * wy * wy * z * z * rho0
+                       - complex_T(0, 8) * om0 * wy * wy * y * (cspeed * t - z) * z * sinPhi2 * sinPhi2
+                       + complex_T(0, 8) / sinPhi
+                           * (float_T(2.0) * z * z
+                                  * (cspeed * om0 * t * wy * wy + complex_T(0, 1) * cspeed * y * y - om0 * wy * wy * z)
+                              + y
+                                  * (cspeed * k * wy * wy * x * x - complex_T(0, 2) * cspeed * om0 * t * wy * wy * rho0
+                                     + float_T(2.0) * cspeed * y * y * rho0
+                                     + complex_T(0, 2) * om0 * wy * wy * z * rho0)
+                                  * math::tan(float_T(PI) / float_T(2.0) - phiT) / sinPhi)
+                           * sinPhi2 * sinPhi2 * sinPhi2 * sinPhi2
+                       - complex_T(0, 2) * cspeed * cspeed * om0 * t * t * wy * wy * z * sinPhi
+                       - float_T(2.0) * cspeed * cspeed * om0 * om0 * t * tauG * tauG * wy * wy * z * sinPhi
+                       - complex_T(0, 2) * cspeed * cspeed * om0 * tauG * tauG * y * y * z * sinPhi
+                       + complex_T(0, 4) * cspeed * om0 * t * wy * wy * z * z * sinPhi
+                       + float_T(2.0) * cspeed * om0 * om0 * tauG * tauG * wy * wy * z * z * sinPhi
+                       - complex_T(0, 2) * om0 * wy * wy * z * z * z * sinPhi
+                       - float_T(4.0) * cspeed * om0 * t * wy * wy * y * rho0 * tanPhi2
+                       + float_T(4.0) * om0 * wy * wy * y * z * rho0 * tanPhi2
+                       + complex_T(0, 2) * y * y
+                           * (cspeed * om0 * t * wy * wy + complex_T(0, 1) * cspeed * y * y - om0 * wy * wy * z)
+                           * cosPhi * cosPhi / cosPhi2 / cosPhi2 * tanPhi2
+                       + complex_T(0, 2) * cspeed * k * wy * wy * x * x * z * tanPhi2 * tanPhi2
+                       - float_T(2.0) * om0 * wy * wy * y * y * rho0 * tanPhi2 * tanPhi2
+                       + float_T(4.0) * cspeed * om0 * t * wy * wy * z * rho0 * tanPhi2 * tanPhi2
+                       + complex_T(0, 4) * cspeed * y * y * z * rho0 * tanPhi2 * tanPhi2
+                       - float_T(4.0) * om0 * wy * wy * z * z * rho0 * tanPhi2 * tanPhi2
+                       - complex_T(0, 2) * om0 * wy * wy * y * y * z * sinPhi * tanPhi2 * tanPhi2
+                       - float_T(2.0) * y * cosPhi
+                           * (om0
+                                  * (cspeed * cspeed
+                                         * (complex_T(0, 1) * t * t * wy * wy + om0 * t * tauG * tauG * wy * wy
+                                            + complex_T(0, 1) * tauG * tauG * y * y)
+                                     - cspeed * (complex_T(0, 2) * t + om0 * tauG * tauG) * wy * wy * z
+                                     + complex_T(0, 1) * wy * wy * z * z)
+                              + complex_T(0, 2) * om0 * wy * wy * y * (cspeed * t - z) * tanPhi2
+                              + complex_T(0, 1)
+                                  * (complex_T(0, -4) * cspeed * y * y * z
+                                     + om0 * wy * wy * (y * y - float_T(4.0) * (cspeed * t - z) * z))
+                                  * tanPhi2 * tanPhi2)
+                       /* The "round-trip" conversion in the line below fixes a gross accuracy bug
+                        * in floating-point arithmetics, when float_T is set to float_X.
+                        * The reciprocal of the denominator is computed as complex_64 and only then
+                        * converted back to complex_T.
+                        */
+                       )
+                    * complex_T(float_64(1.0) / complex_64(float_T(2.0) * cspeed * wy * wy * helpVar2 * helpVar1));
+
+                const complex_T helpVar4 = (cspeed * om0
+                                            * (cspeed * om0 * tauG * tauG
+                                               - complex_T(0, 8) * y * math::tan(float_T(PI) / float_T(2.0) - phiT)
+                                                   / sinPhi / sinPhi * sinPhi2 * sinPhi2 * sinPhi2 * sinPhi2
+                                               - complex_T(0, 2) * z * tanPhi2 * tanPhi2))
+                    / rho0;
+
+                /* The result is linear in x outside the exponential. */
+                const complex_T result = float_T(-1.0)
+                    * (cspeed * math::exp(helpVar3) * k * tauG * x * math::pow(helpVar2, float_T(-1.5))
+                       / math::sqrt(helpVar4));
+
+                /* Only the real part of the complex expression is returned, divided by UNIT_SPEED. */
+                return result.get_real() / UNIT_SPEED;
+            }
+
+        } /* namespace twts */
+    } /* namespace templates */
 } /* namespace picongpu */
diff --git a/include/picongpu/fields/background/templates/TWTS/EField.hpp b/include/picongpu/fields/background/templates/TWTS/EField.hpp
index 19b66bcaa2..af73b3338f 100644
--- a/include/picongpu/fields/background/templates/TWTS/EField.hpp
+++ b/include/picongpu/fields/background/templates/TWTS/EField.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2014-2020 Alexander Debus, Axel Huebl
+/* Copyright 2014-2021 Alexander Debus, Axel Huebl
  *
  * This file is part of PIConGPU.
  *
@@ -28,150 +28,143 @@
 
 namespace picongpu
 {
-/* Load pre-defined background field */
-namespace templates
-{
-/* Traveling-wave Thomson scattering laser pulse */
-namespace twts
-{
-
-class EField
-{
-public:
-    using float_T = float_X;
-
-    enum PolarizationType
+    /* Load pre-defined background field */
+    namespace templates
     {
-        /* The linear polarization of the TWTS laser is defined
-         * relative to the plane of the pulse front tilt.
-         *
-         * Polarisation is normal to the reference plane.
-         * Use Ex-fields (and corresponding B-fields) in TWTS laser internal coordinate system.
-         */
-        LINEAR_X = 1u,
-        /* Polarization lies within the reference plane.
-         * Use Ey-fields (and corresponding B-fields) in TWTS laser internal coordinate system.
-         */
-        LINEAR_YZ = 2u,
-    };
-
-    /* Center of simulation volume in number of cells */
-    PMACC_ALIGN(halfSimSize,DataSpace<simDim>);
-    /* y-position of TWTS coordinate origin inside the simulation coordinates [meter]
-       The other origin coordinates (x and z) default to globally centered values
-       with respect to the simulation volume. */
-    PMACC_ALIGN(focus_y_SI, const float_64);
-    /* Laser wavelength [meter] */
-    PMACC_ALIGN(wavelength_SI, const float_64);
-    /* TWTS laser pulse duration [second] */
-    PMACC_ALIGN(pulselength_SI, const float_64);
-    /* line focus height of TWTS pulse [meter] */
-    PMACC_ALIGN(w_x_SI, const float_64);
-    /* line focus width of TWTS pulse [meter] */
-    PMACC_ALIGN(w_y_SI, const float_64);
-    /* interaction angle between TWTS laser propagation vector and the y-axis [rad] */
-    PMACC_ALIGN(phi, const float_X);
-    /* Takes value 1.0 for phi > 0 and -1.0 for phi < 0. */
-    PMACC_ALIGN(phiPositive,float_X);
-    /* propagation speed of TWTS laser overlap
-    normalized to the speed of light. [Default: beta0=1.0] */
-    PMACC_ALIGN(beta_0, const float_X);
-    /* If auto_tdelay=FALSE, then a user defined delay is used. [second] */
-    PMACC_ALIGN(tdelay_user_SI, const float_64);
-    /* Make time step constant accessible to device. */
-    PMACC_ALIGN(dt, const float_64);
-    /* Make length normalization constant accessible to device. */
-    PMACC_ALIGN(unit_length, const float_64);
-    /* TWTS laser time delay */
-    PMACC_ALIGN(tdelay,float_64);
-    /* Should the TWTS laser delay be chosen automatically, such that
-     * the laser gradually enters the simulation volume? [Default: TRUE]
-     */
-    PMACC_ALIGN(auto_tdelay, const bool);
-    /* Polarization of TWTS laser */
-    PMACC_ALIGN(pol, const PolarizationType);
-
-    /** Electric field of the TWTS laser
-     *
-     * \param focus_y_SI the distance to the laser focus in y-direction [m]
-     * \param wavelength_SI central wavelength [m]
-     * \param pulselength_SI sigma of std. gauss for intensity (E^2),
-     *  pulselength_SI = FWHM_of_Intensity / 2.35482 [seconds (sigma)]
-     * \param w_x beam waist: distance from the axis where the pulse electric field
-     *  decreases to its 1/e^2-th part at the focus position of the laser [m]
-     * \param w_y \see w_x
-     * \param phi interaction angle between TWTS laser propagation vector and
-     *  the y-axis [rad, default = 90.*(PI/180.)]
-     * \param beta_0 propagation speed of overlap normalized to
-     *  the speed of light [c, default = 1.0]
-     * \param tdelay_user manual time delay if auto_tdelay is false
-     * \param auto_tdelay calculate the time delay such that the TWTS pulse is not
-     *  inside the simulation volume at simulation start timestep = 0 [default = true]
-     * \param pol dtermines the TWTS laser polarization, which is either normal or parallel
-     *  to the laser pulse front tilt plane [ default= LINEAR_X , LINEAR_YZ ]
-     */
-    HINLINE
-    EField( const float_64 focus_y_SI,
-            const float_64 wavelength_SI,
-            const float_64 pulselength_SI,
-            const float_64 w_x_SI,
-            const float_64 w_y_SI,
-            const float_X phi               = 90.*(PI / 180.),
-            const float_X beta_0            = 1.0,
-            const float_64 tdelay_user_SI   = 0.0,
-            const bool auto_tdelay          = true,
-            const PolarizationType pol      = LINEAR_X );
-
-    /** Specify your background field E(r,t) here
-     *
-     * \param cellIdx The total cell id counted from the start at timestep 0.
-     * \param currentStep The current time step
-     * \return float3_X with field normalized to amplitude in range [-1.:1.]
-     */
-    HDINLINE float3_X
-    operator()( const DataSpace<simDim>& cellIdx,
-                const uint32_t currentStep ) const;
-
-    /** Calculate the Ex(r,t) field here (electric field vector normal to pulse-front-tilt plane)
-     *
-     * \param pos Spatial position of the target field
-     * \param time Absolute time (SI, including all offsets and transformations)
-     *  for calculating the field
-     * \return Ex-field component of the non-rotated TWTS field in SI units */
-    HDINLINE float_T
-    calcTWTSEx( const float3_64& pos, const float_64 time ) const;
-
-    /** Calculate the Ey(r,t) field here (electric field vector in pulse-front-tilt plane)
-     *
-     * \param pos Spatial position of the target field
-     * \param time Absolute time (SI, including all offsets and transformations)
-     *  for calculating the field
-     * \return Ex-field component of the non-rotated TWTS field in SI units */
-    HDINLINE float_T
-    calcTWTSEy( const float3_64& pos, const float_64 time ) const;
-
-    /** Calculate the E-field vector of the TWTS laser in SI units.
-     * \tparam T_dim Specializes for the simulation dimension
-     * \param cellIdx The total cell id counted from the start at timestep 0
-     * \return Efield vector of the rotated TWTS field in SI units */
-    template <unsigned T_dim>
-    HDINLINE float3_X
-    getTWTSEfield_Normalized(
-            const pmacc::math::Vector<floatD_64,detail::numComponents>& eFieldPositions_SI,
-            const float_64 time) const;
-
-    /** Calculate the E-field vector of the "in-plane polarized" TWTS laser in SI units.
-     * \tparam T_dim Specializes for the simulation dimension
-     * \param cellIdx The total cell id counted from the start at timestep 0
-     * \return Efield vector of the rotated TWTS field in SI units */
-    template <unsigned T_dim>
-    HDINLINE float3_X
-    getTWTSEfield_Normalized_Ey(
-            const pmacc::math::Vector<floatD_64,detail::numComponents>& eFieldPositions_SI,
-            const float_64 time) const;
-
-};
-
-} /* namespace twts */
-} /* namespace templates */
+        /* Traveling-wave Thomson scattering laser pulse */
+        namespace twts
+        {
+            class EField
+            {
+            public:
+                using float_T = float_X;
+
+                enum PolarizationType
+                {
+                    /* The linear polarization of the TWTS laser is defined
+                     * relative to the plane of the pulse front tilt.
+                     *
+                     * Polarization is normal to the reference plane.
+                     * Use Ex-fields (and corresponding B-fields) in TWTS laser internal coordinate system.
+                     */
+                    LINEAR_X = 1u,
+                    /* Polarization lies within the reference plane.
+                     * Use Ey-fields (and corresponding B-fields) in TWTS laser internal coordinate system.
+                     */
+                    LINEAR_YZ = 2u,
+                };
+
+                /* Center of simulation volume in number of cells */
+                PMACC_ALIGN(halfSimSize, DataSpace<simDim>);
+                /* y-position of TWTS coordinate origin inside the simulation coordinates [meter]
+                   The other origin coordinates (x and z) default to globally centered values
+                   with respect to the simulation volume. */
+                PMACC_ALIGN(focus_y_SI, const float_64);
+                /* Laser wavelength [meter] */
+                PMACC_ALIGN(wavelength_SI, const float_64);
+                /* TWTS laser pulse duration [second] */
+                PMACC_ALIGN(pulselength_SI, const float_64);
+                /* line focus height of TWTS pulse [meter] */
+                PMACC_ALIGN(w_x_SI, const float_64);
+                /* line focus width of TWTS pulse [meter] */
+                PMACC_ALIGN(w_y_SI, const float_64);
+                /* interaction angle between TWTS laser propagation vector and the y-axis [rad] */
+                PMACC_ALIGN(phi, const float_X);
+                /* Takes value 1.0 for phi >= 0 and -1.0 for phi < 0. */
+                PMACC_ALIGN(phiPositive, float_X);
+                /* propagation speed of TWTS laser overlap
+                normalized to the speed of light. [Default: beta_0=1.0] */
+                PMACC_ALIGN(beta_0, const float_X);
+                /* If auto_tdelay=FALSE, then a user defined delay is used. [second] */
+                PMACC_ALIGN(tdelay_user_SI, const float_64);
+                /* Make time step constant accessible to device. */
+                PMACC_ALIGN(dt, const float_64);
+                /* Make length normalization constant accessible to device. */
+                PMACC_ALIGN(unit_length, const float_64);
+                /* TWTS laser time delay, set in the constructor via detail::getInitialTimeDelay_SI */
+                PMACC_ALIGN(tdelay, float_64);
+                /* Should the TWTS laser delay be chosen automatically, such that
+                 * the laser gradually enters the simulation volume? [Default: TRUE]
+                 */
+                PMACC_ALIGN(auto_tdelay, const bool);
+                /* Polarization of TWTS laser */
+                PMACC_ALIGN(pol, const PolarizationType);
+
+                /** Electric field of the TWTS laser
+                 *
+                 * \param focus_y_SI the distance to the laser focus in y-direction [m]
+                 * \param wavelength_SI central wavelength [m]
+                 * \param pulselength_SI sigma of std. gauss for intensity (E^2),
+                 *  pulselength_SI = FWHM_of_Intensity / 2.35482 [seconds (sigma)]
+                 * \param w_x_SI beam waist: distance from the axis where the pulse electric field
+                 *  decreases to its 1/e^2-th part at the focus position of the laser [m]
+                 * \param w_y_SI \see w_x_SI
+                 * \param phi interaction angle between TWTS laser propagation vector and
+                 *  the y-axis [rad, default = 90.*(PI/180.)]
+                 * \param beta_0 propagation speed of overlap normalized to
+                 *  the speed of light [c, default = 1.0]
+                 * \param tdelay_user_SI manual time delay if auto_tdelay is false [s]
+                 * \param auto_tdelay calculate the time delay such that the TWTS pulse is not
+                 *  inside the simulation volume at simulation start timestep = 0 [default = true]
+                 * \param pol determines the TWTS laser polarization, which is either normal or parallel
+                 *  to the laser pulse front tilt plane [default = LINEAR_X, alternative: LINEAR_YZ]
+                 */
+                HINLINE
+                EField(
+                    const float_64 focus_y_SI,
+                    const float_64 wavelength_SI,
+                    const float_64 pulselength_SI,
+                    const float_64 w_x_SI,
+                    const float_64 w_y_SI,
+                    const float_X phi = 90. * (PI / 180.),
+                    const float_X beta_0 = 1.0,
+                    const float_64 tdelay_user_SI = 0.0,
+                    const bool auto_tdelay = true,
+                    const PolarizationType pol = LINEAR_X);
+
+                /** Specify your background field E(r,t) here
+                 *
+                 * \param cellIdx The total cell id counted from the start at timestep 0.
+                 * \param currentStep The current time step
+                 * \return float3_X with field normalized to amplitude in range [-1.:1.]
+                 */
+                HDINLINE float3_X operator()(const DataSpace<simDim>& cellIdx, const uint32_t currentStep) const;
+
+                /** Calculate the Ex(r,t) field here (electric field vector normal to pulse-front-tilt plane)
+                 *
+                 * \param pos Spatial position of the target field
+                 * \param time Absolute time (SI, including all offsets and transformations)
+                 *  for calculating the field
+                 * \return Ex-field component of the non-rotated TWTS field in SI units */
+                HDINLINE float_T calcTWTSEx(const float3_64& pos, const float_64 time) const;
+
+                /** Calculate the Ey(r,t) field here (electric field vector in pulse-front-tilt plane)
+                 *
+                 * \param pos Spatial position of the target field
+                 * \param time Absolute time (SI, including all offsets and transformations)
+                 *  for calculating the field
+                 * \return Ey-field component of the non-rotated TWTS field in SI units */
+                HDINLINE float_T calcTWTSEy(const float3_64& pos, const float_64 time) const;
+
+                /** Calculate the E-field vector of the TWTS laser in SI units.
+                 * \tparam T_dim Specializes for the simulation dimension
+                 * \param eFieldPositions_SI Spatial positions of the target field, one entry per field component
+                 * \return Efield vector of the rotated TWTS field in SI units */
+                template<unsigned T_dim>
+                HDINLINE float3_X getTWTSEfield_Normalized(
+                    const pmacc::math::Vector<floatD_64, detail::numComponents>& eFieldPositions_SI,
+                    const float_64 time) const;
+
+                /** Calculate the E-field vector of the "in-plane polarized" TWTS laser in SI units.
+                 * \tparam T_dim Specializes for the simulation dimension
+                 * \param eFieldPositions_SI Spatial positions of the target field, one entry per field component
+                 * \return Efield vector of the rotated TWTS field in SI units */
+                template<unsigned T_dim>
+                HDINLINE float3_X getTWTSEfield_Normalized_Ey(
+                    const pmacc::math::Vector<floatD_64, detail::numComponents>& eFieldPositions_SI,
+                    const float_64 time) const;
+            };
+
+        } /* namespace twts */
+    } /* namespace templates */
 } /* namespace picongpu */
diff --git a/include/picongpu/fields/background/templates/TWTS/EField.tpp b/include/picongpu/fields/background/templates/TWTS/EField.tpp
index ab108252ed..ef60bf0ecc 100644
--- a/include/picongpu/fields/background/templates/TWTS/EField.tpp
+++ b/include/picongpu/fields/background/templates/TWTS/EField.tpp
@@ -1,4 +1,4 @@
-/* Copyright 2014-2020 Alexander Debus, Axel Huebl
+/* Copyright 2014-2021 Alexander Debus, Axel Huebl
  *
  * This file is part of PIConGPU.
  *
@@ -36,319 +36,315 @@
 
 namespace picongpu
 {
-/* Load pre-defined background field */
-namespace templates
-{
-/* Traveling-wave Thomson scattering laser pulse */
-namespace twts
-{
-
-    HINLINE
-    EField::EField( const float_64 focus_y_SI,
-                    const float_64 wavelength_SI,
-                    const float_64 pulselength_SI,
-                    const float_64 w_x_SI,
-                    const float_64 w_y_SI,
-                    const float_X phi,
-                    const float_X beta_0,
-                    const float_64 tdelay_user_SI,
-                    const bool auto_tdelay,
-                    const PolarizationType pol ) :
-        focus_y_SI(focus_y_SI), wavelength_SI(wavelength_SI),
-        pulselength_SI(pulselength_SI), w_x_SI(w_x_SI),
-        w_y_SI(w_y_SI), phi(phi), beta_0(beta_0),
-        tdelay_user_SI(tdelay_user_SI), dt(SI::DELTA_T_SI),
-        unit_length(UNIT_LENGTH), auto_tdelay(auto_tdelay), pol(pol), phiPositive( float_X(1.0) )
+    /* Load pre-defined background field */
+    namespace templates
     {
-        /* Note: Enviroment-objects cannot be instantiated on CUDA GPU device. Since this is done
-                 on host (see fieldBackground.param), this is no problem.
-         */
-        const SubGrid<simDim>& subGrid = Environment<simDim>::get().SubGrid();
-        halfSimSize = subGrid.getGlobalDomain().size / 2;
-        tdelay = detail::getInitialTimeDelay_SI(auto_tdelay, tdelay_user_SI,
-                                                halfSimSize, pulselength_SI,
-                                                focus_y_SI, phi, beta_0);
-        if ( phi < float_X(0.0) ) phiPositive = float_X(-1.0);
-    }
-
-    template<>
-    HDINLINE float3_X
-    EField::getTWTSEfield_Normalized<DIM3>(
-                const pmacc::math::Vector<floatD_64,detail::numComponents>& eFieldPositions_SI,
+        /* Traveling-wave Thomson scattering laser pulse */
+        namespace twts
+        {
+            HINLINE
+            EField::EField(
+                const float_64 focus_y_SI,
+                const float_64 wavelength_SI,
+                const float_64 pulselength_SI,
+                const float_64 w_x_SI,
+                const float_64 w_y_SI,
+                const float_X phi,
+                const float_X beta_0,
+                const float_64 tdelay_user_SI,
+                const bool auto_tdelay,
+                const PolarizationType pol)
+                : focus_y_SI(focus_y_SI)
+                , wavelength_SI(wavelength_SI)
+                , pulselength_SI(pulselength_SI)
+                , w_x_SI(w_x_SI)
+                , w_y_SI(w_y_SI)
+                , phi(phi)
+                , beta_0(beta_0)
+                , tdelay_user_SI(tdelay_user_SI)
+                , dt(SI::DELTA_T_SI)
+                , unit_length(UNIT_LENGTH)
+                , auto_tdelay(auto_tdelay)
+                , pol(pol)
+                , phiPositive(float_X(1.0)) /* default sign; set to -1.0 below when phi < 0 */
+            {
+                /* Note: Environment objects cannot be instantiated on a CUDA GPU device. Since this is done
+                         on host (see fieldBackground.param), this is no problem.
+                 */
+                const SubGrid<simDim>& subGrid = Environment<simDim>::get().SubGrid();
+                halfSimSize = subGrid.getGlobalDomain().size / 2;
+                tdelay = detail::getInitialTimeDelay_SI(
+                    auto_tdelay,
+                    tdelay_user_SI,
+                    halfSimSize,
+                    pulselength_SI,
+                    focus_y_SI,
+                    phi,
+                    beta_0);
+                if(phi < float_X(0.0))
+                    phiPositive = float_X(-1.0);
+            }
+
+            template<>
+            HDINLINE float3_X EField::getTWTSEfield_Normalized<DIM3>(
+                const pmacc::math::Vector<floatD_64, detail::numComponents>& eFieldPositions_SI,
                 const float_64 time) const
-    {
-        float3_64 pos(float3_64::create(0.0));
-        for (uint32_t i = 0; i<simDim;++i) pos[i] = eFieldPositions_SI[0][i];
-        return float3_X( float_X( calcTWTSEx(pos,time) ),
-                         float_X(0.), float_X(0.) );
-    }
-
-    template<>
-    HDINLINE float3_X
-    EField::getTWTSEfield_Normalized_Ey<DIM3>(
-                const pmacc::math::Vector<floatD_64,detail::numComponents>& eFieldPositions_SI,
+            {
+                float3_64 pos(float3_64::create(0.0));
+                for(uint32_t i = 0; i < simDim; ++i)
+                    pos[i] = eFieldPositions_SI[0][i];
+                return float3_X(float_X(calcTWTSEx(pos, time)), float_X(0.), float_X(0.));
+            }
+
+            template<>
+            HDINLINE float3_X EField::getTWTSEfield_Normalized_Ey<DIM3>(
+                const pmacc::math::Vector<floatD_64, detail::numComponents>& eFieldPositions_SI,
                 const float_64 time) const
-    {
-        typedef pmacc::math::Vector<float3_64,detail::numComponents> PosVecVec;
-        PosVecVec pos(PosVecVec::create(
-                                           float3_64::create(0.0)
-                                       ));
-
-        for (uint32_t k = 0; k<detail::numComponents;++k) {
-            for (uint32_t i = 0; i<simDim;++i) pos[k][i] = eFieldPositions_SI[k][i];
-        }
-
-        /* Calculate Ey-component with the intra-cell offset of a Ey-field */
-        const float_64 Ey_Ey = calcTWTSEy(pos[1], time);
-        /* Calculate Ey-component with the intra-cell offset of a Ez-field */
-        const float_64 Ey_Ez = calcTWTSEy(pos[2], time);
-
-        /* Since we rotated all position vectors before calling calcTWTSEy,
-         * we need to back-rotate the resulting E-field vector.
-         *
-         * RotationMatrix[-(PI/2+phi)].(Ey,Ez) for rotating back the field-vectors.
-         */
-        const float_64 Ey_rot = -math::sin(+phi)*Ey_Ey;
-        const float_64 Ez_rot = -math::cos(+phi)*Ey_Ez;
-
-        /* Finally, the E-field normalized to the peak amplitude. */
-        return float3_X( float_X(0.0),
-                         float_X(Ey_rot),
-                         float_X(Ez_rot) );
-    }
-
-    template<>
-    HDINLINE float3_X
-    EField::getTWTSEfield_Normalized<DIM2>(
-        const pmacc::math::Vector<floatD_64,detail::numComponents>& eFieldPositions_SI,
-        const float_64 time) const
-    {
-        /* Ex->Ez, so also the grid cell offset for Ez has to be used. */
-        float3_64 pos(float3_64::create(0.0));
-        /* 2D (y,z) vectors are mapped on 3D (x,y,z) vectors. */
-        for (uint32_t i = 0; i<DIM2;++i) pos[i+1] = eFieldPositions_SI[2][i];
-        return float3_X( float_X(0.), float_X(0.),
-                         float_X( calcTWTSEx(pos,time) ) );
-    }
-
-    template<>
-    HDINLINE float3_X
-    EField::getTWTSEfield_Normalized_Ey<DIM2>(
-        const pmacc::math::Vector<floatD_64,detail::numComponents>& eFieldPositions_SI,
-        const float_64 time) const
-    {
-        typedef pmacc::math::Vector<float3_64,detail::numComponents> PosVecVec;
-        PosVecVec pos(PosVecVec::create(
-                                           float3_64::create(0.0)
-                                       ));
-
-        /* The 2D output of getFieldPositions_SI only returns
-         * the y- and z-component of a 3D vector.
-         */
-        for (uint32_t k = 0; k<detail::numComponents;++k) {
-            for (uint32_t i = 0; i<DIM2;++i) pos[k][i+1] = eFieldPositions_SI[k][i];
-        }
-
-        /* Ey->Ey, but grid cell offsets for Ex and Ey have to be used.
-         *
-         * Calculate Ey-component with the intra-cell offset of a Ey-field
-         */
-        const float_64 Ey_Ey = calcTWTSEy(pos[1], time);
-        /* Calculate Ey-component with the intra-cell offset of a Ex-field */
-        const float_64 Ey_Ex = calcTWTSEy(pos[0], time);
-
-        /* Since we rotated all position vectors before calling calcTWTSEy,
-         * we need to back-rotate the resulting E-field vector.
-         *
-         * RotationMatrix[-(PI / 2+phi)].(Ey,Ex) for rotating back the field-vectors.
-         */
-        const float_64 Ey_rot = -math::sin(+phi)*Ey_Ey;
-        const float_64 Ex_rot = -math::cos(+phi)*Ey_Ex;
-
-        /* Finally, the E-field normalized to the peak amplitude. */
-        return float3_X( float_X(Ex_rot),
-                         float_X(Ey_rot),
-                         float_X(0.0) );
-    }
-
-    HDINLINE float3_X
-    EField::operator()( const DataSpace<simDim>& cellIdx,
-                            const uint32_t currentStep ) const
-    {
-        const float_64 time_SI = float_64(currentStep) * dt - tdelay;
-        const traits::FieldPosition<fields::CellType, FieldE> fieldPosE;
-
-        const pmacc::math::Vector<floatD_64,detail::numComponents> eFieldPositions_SI =
-              detail::getFieldPositions_SI(cellIdx, halfSimSize,
-                fieldPosE(), unit_length, focus_y_SI, phi);
-
-        /* Single TWTS-Pulse */
-        switch (pol)
-        {
-            case LINEAR_X :
-            return getTWTSEfield_Normalized<simDim>(eFieldPositions_SI, time_SI);
-
-            case LINEAR_YZ :
-            return getTWTSEfield_Normalized_Ey<simDim>(eFieldPositions_SI, time_SI);
-        }
-        return getTWTSEfield_Normalized<simDim>(eFieldPositions_SI, time_SI); // defensive default
-    }
-
-    /** Calculate the Ex(r,t) field here
-     *
-     * \param pos Spatial position of the target field.
-     * \param time Absolute time (SI, including all offsets and transformations) for calculating
-     *             the field */
-    HDINLINE EField::float_T
-    EField::calcTWTSEx( const float3_64& pos, const float_64 time) const
-    {
-        using complex_T = pmacc::math::Complex< float_T >;
-        using complex_64 = pmacc::math::Complex< float_64 >;
-        /* Unit of speed */
-        const float_64 UNIT_SPEED = SI::SPEED_OF_LIGHT_SI;
-        /* Unit of time */
-        const float_64 UNIT_TIME = SI::DELTA_T_SI;
-        /* Unit of length */
-        const float_64 UNIT_LENGTH = UNIT_TIME*UNIT_SPEED;
-
-        /* Propagation speed of overlap normalized to the speed of light [Default: beta0=1.0] */
-        const float_T beta0 = float_T(beta_0);
-        /* If phi < 0 the formulas below are not directly applicable.
-         * Instead phi is taken positive, but the entire pulse rotated by 180 deg around the
-         * z-axis of the coordinate system in this function.
-         */
-        const float_T phiReal = float_T( math::abs(phi) );
-        const float_T alphaTilt = math::atan2(float_T(1.0)-beta0*math::cos(phiReal),
-                                                beta0*math::sin(phiReal));
-        /* Definition of the laser pulse front tilt angle for the laser field below.
-         *
-         * For beta0 = 1.0, this is equivalent to our standard definition. Question: Why is the
-         * local "phi_T" not equal in value to the object member "phiReal" or "phi"?
-         * Because the standard TWTS pulse is defined for beta0 = 1.0 and in the coordinate-system
-         * of the TWTS model phi is responsible for pulse front tilt and dispersion only. Hence
-         * the dispersion will (although physically correct) be slightly off the ideal TWTS
-         * pulse for beta0 != 1.0. This only shows that this TWTS pulse is primarily designed for
-         * scenarios close to beta0 = 1.
-         */
-        const float_T phiT = float_T(2.0)*alphaTilt;
-
-        /* Angle between the laser pulse front and the y-axis. Not used, but remains in code for
-         * documentation purposes.
-         * const float_T eta = (PI / 2) - (phiReal - alphaTilt);
-         */
-
-        const float_T cspeed = float_T( SI::SPEED_OF_LIGHT_SI / UNIT_SPEED );
-        const float_T lambda0 = float_T(wavelength_SI / UNIT_LENGTH);
-        const float_T om0 = float_T(2.0*PI*cspeed / lambda0);
-        /* factor 2  in tauG arises from definition convention in laser formula */
-        const float_T tauG = float_T(pulselength_SI*2.0 / UNIT_TIME);
-        /* w0 is wx here --> w0 could be replaced by wx */
-        const float_T w0 = float_T(w_x_SI / UNIT_LENGTH);
-        const float_T rho0 = float_T(PI*w0*w0/lambda0);
-        /* wy is width of TWTS pulse */
-        const float_T wy = float_T(w_y_SI / UNIT_LENGTH);
-        const float_T k = float_T(2.0*PI / lambda0);
-        const float_T x = float_T(phiPositive * pos.x() / UNIT_LENGTH);
-        const float_T y = float_T(phiPositive * pos.y() / UNIT_LENGTH);
-        const float_T z = float_T(pos.z() / UNIT_LENGTH);
-        const float_T t = float_T(time / UNIT_TIME);
-
-        /* Calculating shortcuts for speeding up field calculation */
-        const float_T sinPhi = math::sin(phiT);
-        const float_T cosPhi = math::cos(phiT);
-        const float_T sinPhi2 = math::sin(phiT / float_T(2.0));
-        const float_T cosPhi2 = math::cos(phiT / float_T(2.0));
-        const float_T tanPhi2 = math::tan(phiT / float_T(2.0));
-
-        /* The "helpVar" variables decrease the nesting level of the evaluated expressions and
-         * thus help with formal code verification through manual code inspection.
-         */
-        const complex_T helpVar1 = complex_T(0,1)*rho0 - y*cosPhi - z*sinPhi;
-        const complex_T helpVar2 = complex_T(0,-1)*cspeed*om0*tauG*tauG
-                                    - y*cosPhi / cosPhi2 / cosPhi2*tanPhi2
-                                    - float_T(2.0)*z*tanPhi2*tanPhi2;
-        const complex_T helpVar3 = complex_T(0,1)*rho0 - y*cosPhi - z*sinPhi;
-
-        const complex_T helpVar4 = (
-            -(cspeed*cspeed*k*om0*tauG*tauG*wy*wy*x*x)
-            - float_T(2.0)*cspeed*cspeed*om0*t*t*wy*wy*rho0
-            + complex_T(0,2)*cspeed*cspeed*om0*om0*t*tauG*tauG*wy*wy*rho0
-            - float_T(2.0)*cspeed*cspeed*om0*tauG*tauG*y*y*rho0
-            + float_T(4.0)*cspeed*om0*t*wy*wy*z*rho0
-            - complex_T(0,2)*cspeed*om0*om0*tauG*tauG*wy*wy*z*rho0
-            - float_T(2.0)*om0*wy*wy*z*z*rho0
-            - complex_T(0,8)*om0*wy*wy*y*(cspeed*t - z)*z*sinPhi2*sinPhi2
-            + complex_T(0,8) / sinPhi*(
-                    +float_T(2.0)*z*z*(cspeed*om0*t*wy*wy+complex_T(0,1)*cspeed*y*y-om0*wy*wy*z)
-                    + y*(
-                        + cspeed*k*wy*wy*x*x
-                        - complex_T(0,2)*cspeed*om0*t*wy*wy*rho0
-                        + float_T(2.0)*cspeed*y*y*rho0
-                        + complex_T(0,2)*om0*wy*wy*z*rho0
-                    )*math::tan(float_T(PI / 2.0)-phiT)/sinPhi
-                )*sinPhi2*sinPhi2*sinPhi2*sinPhi2
-            - complex_T(0,2)*cspeed*cspeed*om0*t*t*wy*wy*z*sinPhi
-            - float_T(2.0)*cspeed*cspeed*om0*om0*t*tauG*tauG*wy*wy*z*sinPhi
-            - complex_T(0,2)*cspeed*cspeed*om0*tauG*tauG*y*y*z*sinPhi
-            + complex_T(0,4)*cspeed*om0*t*wy*wy*z*z*sinPhi
-            + float_T(2.0)*cspeed*om0*om0*tauG*tauG*wy*wy*z*z*sinPhi
-            - complex_T(0,2)*om0*wy*wy*z*z*z*sinPhi
-            - float_T(4.0)*cspeed*om0*t*wy*wy*y*rho0*tanPhi2
-            + float_T(4.0)*om0*wy*wy*y*z*rho0*tanPhi2
-            + complex_T(0,2)*y*y*(
-                 + cspeed*om0*t*wy*wy + complex_T(0,1)*cspeed*y*y - om0*wy*wy*z
-                 )*cosPhi*cosPhi / cosPhi2 / cosPhi2*tanPhi2
-            + complex_T(0,2)*cspeed*k*wy*wy*x*x*z*tanPhi2*tanPhi2
-            - float_T(2.0)*om0*wy*wy*y*y*rho0*tanPhi2*tanPhi2
-            + float_T(4.0)*cspeed*om0*t*wy*wy*z*rho0*tanPhi2*tanPhi2
-            + complex_T(0,4)*cspeed*y*y*z*rho0*tanPhi2*tanPhi2
-            - float_T(4.0)*om0*wy*wy*z*z*rho0*tanPhi2*tanPhi2
-            - complex_T(0,2)*om0*wy*wy*y*y*z*sinPhi*tanPhi2*tanPhi2
-            - float_T(2.0)*y*cosPhi*(
-                + om0*(
-                    + cspeed*cspeed*(
-                          complex_T(0,1)*t*t*wy*wy
-                        + om0*t*tauG*tauG*wy*wy
-                        + complex_T(0,1)*tauG*tauG*y*y
-                        )
-                    - cspeed*(complex_T(0,2)*t
-                    + om0*tauG*tauG)*wy*wy*z
-                    + complex_T(0,1)*wy*wy*z*z
-                    )
-                + complex_T(0,2)*om0*wy*wy*y*(cspeed*t - z)*tanPhi2
-                + complex_T(0,1)*tanPhi2*tanPhi2*(
-                      complex_T(0,-4)*cspeed*y*y*z
-                    + om0*wy*wy*(y*y - float_T(4.0)*(cspeed*t - z)*z)
-                )
-            )
-        /* The "round-trip" conversion in the line below fixes a gross accuracy bug
-         * in floating-point arithmetics, when float_T is set to float_X.
-         */
-        ) * complex_T( float_64(1.0) / complex_64(float_T(2.0)*cspeed*wy*wy*helpVar1*helpVar2) );
-
-        const complex_T helpVar5 = cspeed*om0*tauG*tauG
-            - complex_T(0,8)*y*math::tan( float_T(PI / 2)-phiT )
-                                / sinPhi / sinPhi*sinPhi2*sinPhi2*sinPhi2*sinPhi2
-            - complex_T(0,2)*z*tanPhi2*tanPhi2;
-        const complex_T result = (math::exp(helpVar4)*tauG
-            *math::sqrt((cspeed*om0*rho0) / helpVar3)) / math::sqrt(helpVar5);
-        return result.get_real();
-    }
-
-    /** Calculate the Ey(r,t) field here
-     *
-     * \param pos Spatial position of the target field.
-     * \param time Absolute time (SI, including all offsets and transformations) for calculating
-     *             the field */
-    HDINLINE EField::float_T
-    EField::calcTWTSEy( const float3_64& pos, const float_64 time) const
-    {
-        /* The field function of Ey (polarization in pulse-front-tilt plane)
-         * is by definition identical to Ex (polarization normal to pulse-front-tilt plane)
-         */
-        return calcTWTSEx( pos, time );
-    }
-
-} /* namespace twts */
-} /* namespace templates */
+            {
+                typedef pmacc::math::Vector<float3_64, detail::numComponents> PosVecVec;
+                PosVecVec pos(PosVecVec::create(float3_64::create(0.0)));
+
+                for(uint32_t k = 0; k < detail::numComponents; ++k)
+                {
+                    for(uint32_t i = 0; i < simDim; ++i)
+                        pos[k][i] = eFieldPositions_SI[k][i];
+                }
+
+                /* Calculate Ey-component with the intra-cell offset of a Ey-field */
+                const float_64 Ey_Ey = calcTWTSEy(pos[1], time);
+                /* Calculate Ey-component with the intra-cell offset of a Ez-field */
+                const float_64 Ey_Ez = calcTWTSEy(pos[2], time);
+
+                /* Since we rotated all position vectors before calling calcTWTSEy,
+                 * we need to back-rotate the resulting E-field vector.
+                 *
+                 * RotationMatrix[-(PI/2+phi)].(Ey,Ez) for rotating back the field-vectors.
+                 */
+                const float_64 Ey_rot = -math::sin(+phi) * Ey_Ey;
+                const float_64 Ez_rot = -math::cos(+phi) * Ey_Ez;
+
+                /* Finally, the E-field normalized to the peak amplitude. */
+                return float3_X(float_X(0.0), float_X(Ey_rot), float_X(Ez_rot));
+            }
+
+            template<> /* DIM2 specialization: returns (0, 0, calcTWTSEx(pos, time)) */
+            HDINLINE float3_X EField::getTWTSEfield_Normalized<DIM2>(
+                const pmacc::math::Vector<floatD_64, detail::numComponents>& eFieldPositions_SI,
+                const float_64 time) const
+            {
+                /* Ex->Ez: the value is returned as Ez, so the grid cell offset for Ez (entry 2) is used. */
+                float3_64 pos(float3_64::create(0.0));
+                /* 2D (y,z) vectors are mapped on 3D (x,y,z) vectors: 2D index i goes to 3D index i + 1. */
+                for(uint32_t i = 0; i < DIM2; ++i)
+                    pos[i + 1] = eFieldPositions_SI[2][i];
+                return float3_X(float_X(0.), float_X(0.), float_X(calcTWTSEx(pos, time)));
+            }
+
+            template<> /* DIM2 specialization: returns (Ex_rot, Ey_rot, 0) */
+            HDINLINE float3_X EField::getTWTSEfield_Normalized_Ey<DIM2>(
+                const pmacc::math::Vector<floatD_64, detail::numComponents>& eFieldPositions_SI,
+                const float_64 time) const
+            {
+                typedef pmacc::math::Vector<float3_64, detail::numComponents> PosVecVec;
+                PosVecVec pos(PosVecVec::create(float3_64::create(0.0)));
+
+                /* The 2D output of getFieldPositions_SI only returns the y- and z-component
+                 * of a 3D vector, so 2D index i is stored at 3D index i + 1 (index 0 stays 0.0).
+                 */
+                for(uint32_t k = 0; k < detail::numComponents; ++k)
+                {
+                    for(uint32_t i = 0; i < DIM2; ++i)
+                        pos[k][i + 1] = eFieldPositions_SI[k][i];
+                }
+
+                /* Ey->Ey, but grid cell offsets for Ex and Ey have to be used.
+                 *
+                 * Calculate Ey-component with the intra-cell offset of an Ey-field
+                 */
+                const float_64 Ey_Ey = calcTWTSEy(pos[1], time);
+                /* Calculate Ey-component with the intra-cell offset of an Ex-field */
+                const float_64 Ey_Ex = calcTWTSEy(pos[0], time);
+
+                /* Since we rotated all position vectors before calling calcTWTSEy,
+                 * we need to back-rotate the resulting E-field vector.
+                 *
+                 * RotationMatrix[-(PI / 2+phi)].(Ey,Ex) for rotating back the field-vectors.
+                 */
+                const float_64 Ey_rot = -math::sin(+phi) * Ey_Ey;
+                const float_64 Ex_rot = -math::cos(+phi) * Ey_Ex;
+
+                /* Finally, the E-field normalized to the peak amplitude. */
+                return float3_X(float_X(Ex_rot), float_X(Ey_rot), float_X(0.0));
+            }
+
+            HDINLINE float3_X EField::operator()(const DataSpace<simDim>& cellIdx, const uint32_t currentStep) const
+            { /* Evaluate the normalized TWTS E-field for cell cellIdx at time step currentStep. */
+                const float_64 time_SI = float_64(currentStep) * dt - tdelay; /* step time shifted by tdelay */
+                const traits::FieldPosition<fields::CellType, FieldE> fieldPosE;
+
+                const pmacc::math::Vector<floatD_64, detail::numComponents> eFieldPositions_SI
+                    = detail::getFieldPositions_SI(cellIdx, halfSimSize, fieldPosE(), unit_length, focus_y_SI, phi);
+
+                /* Single TWTS-Pulse: select the field function according to the polarization pol */
+                switch(pol)
+                {
+                case LINEAR_X:
+                    return getTWTSEfield_Normalized<simDim>(eFieldPositions_SI, time_SI);
+
+                case LINEAR_YZ:
+                    return getTWTSEfield_Normalized_Ey<simDim>(eFieldPositions_SI, time_SI);
+                }
+                return getTWTSEfield_Normalized<simDim>(eFieldPositions_SI, time_SI); // fallback: same as LINEAR_X
+            }
+
+            /** Calculate the Ex(r,t) field here (real part of the complex expression evaluated below)
+             *
+             * \param pos Spatial position of the target field.
+             * \param time Absolute time (SI, including all offsets and transformations) for calculating
+             *             the field. \return Real part of the complex field value. */
+            HDINLINE EField::float_T EField::calcTWTSEx(const float3_64& pos, const float_64 time) const
+            {
+                using complex_T = pmacc::math::Complex<float_T>;
+                using complex_64 = pmacc::math::Complex<float_64>;
+                /* Unit of speed */
+                const float_64 UNIT_SPEED = SI::SPEED_OF_LIGHT_SI;
+                /* Unit of time */
+                const float_64 UNIT_TIME = SI::DELTA_T_SI;
+                /* Unit of length */
+                const float_64 UNIT_LENGTH = UNIT_TIME * UNIT_SPEED;
+
+                /* Propagation speed of overlap normalized to the speed of light [Default: beta0=1.0] */
+                const float_T beta0 = float_T(beta_0);
+                /* If phi < 0 the formulas below are not directly applicable.
+                 * Instead phi is taken positive, but the entire pulse rotated by 180 deg around the
+                 * z-axis of the coordinate system in this function.
+                 */
+                const float_T phiReal = float_T(math::abs(phi));
+                const float_T alphaTilt
+                    = math::atan2(float_T(1.0) - beta0 * math::cos(phiReal), beta0 * math::sin(phiReal));
+                /* Definition of the laser pulse front tilt angle for the laser field below.
+                 *
+                 * For beta0 = 1.0, this is equivalent to our standard definition. Question: Why is the
+                 * local "phi_T" not equal in value to the object member "phiReal" or "phi"?
+                 * Because the standard TWTS pulse is defined for beta0 = 1.0 and in the coordinate-system
+                 * of the TWTS model phi is responsible for pulse front tilt and dispersion only. Hence
+                 * the dispersion will (although physically correct) be slightly off the ideal TWTS
+                 * pulse for beta0 != 1.0. This only shows that this TWTS pulse is primarily designed for
+                 * scenarios close to beta0 = 1.
+                 */
+                const float_T phiT = float_T(2.0) * alphaTilt;
+
+                /* Angle between the laser pulse front and the y-axis. Not used, but remains in code for
+                 * documentation purposes.
+                 * const float_T eta = (PI / 2) - (phiReal - alphaTilt);
+                 */
+
+                const float_T cspeed = float_T(SI::SPEED_OF_LIGHT_SI / UNIT_SPEED);
+                const float_T lambda0 = float_T(wavelength_SI / UNIT_LENGTH);
+                const float_T om0 = float_T(2.0 * PI * cspeed / lambda0);
+                /* factor 2 in tauG arises from definition convention in laser formula */
+                const float_T tauG = float_T(pulselength_SI * 2.0 / UNIT_TIME);
+                /* w0 is wx here --> w0 could be replaced by wx */
+                const float_T w0 = float_T(w_x_SI / UNIT_LENGTH);
+                const float_T rho0 = float_T(PI * w0 * w0 / lambda0);
+                /* wy is width of TWTS pulse */
+                const float_T wy = float_T(w_y_SI / UNIT_LENGTH);
+                const float_T k = float_T(2.0 * PI / lambda0);
+                const float_T x = float_T(phiPositive * pos.x() / UNIT_LENGTH);
+                const float_T y = float_T(phiPositive * pos.y() / UNIT_LENGTH);
+                const float_T z = float_T(pos.z() / UNIT_LENGTH);
+                const float_T t = float_T(time / UNIT_TIME);
+
+                /* Calculating shortcuts for speeding up field calculation */
+                const float_T sinPhi = math::sin(phiT);
+                const float_T cosPhi = math::cos(phiT);
+                const float_T sinPhi2 = math::sin(phiT / float_T(2.0));
+                const float_T cosPhi2 = math::cos(phiT / float_T(2.0));
+                const float_T tanPhi2 = math::tan(phiT / float_T(2.0));
+
+                /* The "helpVar" variables decrease the nesting level of the evaluated expressions and
+                 * thus help with formal code verification through manual code inspection.
+                 * NOTE(review): helpVar1 and helpVar3 are assigned the identical expression. */
+                const complex_T helpVar1 = complex_T(0, 1) * rho0 - y * cosPhi - z * sinPhi;
+                const complex_T helpVar2 = complex_T(0, -1) * cspeed * om0 * tauG * tauG
+                    - y * cosPhi / cosPhi2 / cosPhi2 * tanPhi2 - float_T(2.0) * z * tanPhi2 * tanPhi2;
+                const complex_T helpVar3 = complex_T(0, 1) * rho0 - y * cosPhi - z * sinPhi;
+
+                const complex_T helpVar4
+                    = (-(cspeed * cspeed * k * om0 * tauG * tauG * wy * wy * x * x)
+                       - float_T(2.0) * cspeed * cspeed * om0 * t * t * wy * wy * rho0
+                       + complex_T(0, 2) * cspeed * cspeed * om0 * om0 * t * tauG * tauG * wy * wy * rho0
+                       - float_T(2.0) * cspeed * cspeed * om0 * tauG * tauG * y * y * rho0
+                       + float_T(4.0) * cspeed * om0 * t * wy * wy * z * rho0
+                       - complex_T(0, 2) * cspeed * om0 * om0 * tauG * tauG * wy * wy * z * rho0
+                       - float_T(2.0) * om0 * wy * wy * z * z * rho0
+                       - complex_T(0, 8) * om0 * wy * wy * y * (cspeed * t - z) * z * sinPhi2 * sinPhi2
+                       + complex_T(0, 8) / sinPhi
+                           * (+float_T(2.0) * z * z
+                                  * (cspeed * om0 * t * wy * wy + complex_T(0, 1) * cspeed * y * y - om0 * wy * wy * z)
+                              + y
+                                  * (+cspeed * k * wy * wy * x * x
+                                     - complex_T(0, 2) * cspeed * om0 * t * wy * wy * rho0
+                                     + float_T(2.0) * cspeed * y * y * rho0
+                                     + complex_T(0, 2) * om0 * wy * wy * z * rho0)
+                                  * math::tan(float_T(PI / 2.0) - phiT) / sinPhi)
+                           * sinPhi2 * sinPhi2 * sinPhi2 * sinPhi2
+                       - complex_T(0, 2) * cspeed * cspeed * om0 * t * t * wy * wy * z * sinPhi
+                       - float_T(2.0) * cspeed * cspeed * om0 * om0 * t * tauG * tauG * wy * wy * z * sinPhi
+                       - complex_T(0, 2) * cspeed * cspeed * om0 * tauG * tauG * y * y * z * sinPhi
+                       + complex_T(0, 4) * cspeed * om0 * t * wy * wy * z * z * sinPhi
+                       + float_T(2.0) * cspeed * om0 * om0 * tauG * tauG * wy * wy * z * z * sinPhi
+                       - complex_T(0, 2) * om0 * wy * wy * z * z * z * sinPhi
+                       - float_T(4.0) * cspeed * om0 * t * wy * wy * y * rho0 * tanPhi2
+                       + float_T(4.0) * om0 * wy * wy * y * z * rho0 * tanPhi2
+                       + complex_T(0, 2) * y * y
+                           * (+cspeed * om0 * t * wy * wy + complex_T(0, 1) * cspeed * y * y - om0 * wy * wy * z)
+                           * cosPhi * cosPhi / cosPhi2 / cosPhi2 * tanPhi2
+                       + complex_T(0, 2) * cspeed * k * wy * wy * x * x * z * tanPhi2 * tanPhi2
+                       - float_T(2.0) * om0 * wy * wy * y * y * rho0 * tanPhi2 * tanPhi2
+                       + float_T(4.0) * cspeed * om0 * t * wy * wy * z * rho0 * tanPhi2 * tanPhi2
+                       + complex_T(0, 4) * cspeed * y * y * z * rho0 * tanPhi2 * tanPhi2
+                       - float_T(4.0) * om0 * wy * wy * z * z * rho0 * tanPhi2 * tanPhi2
+                       - complex_T(0, 2) * om0 * wy * wy * y * y * z * sinPhi * tanPhi2 * tanPhi2
+                       - float_T(2.0) * y * cosPhi
+                           * (+om0
+                                  * (+cspeed * cspeed
+                                         * (complex_T(0, 1) * t * t * wy * wy + om0 * t * tauG * tauG * wy * wy
+                                            + complex_T(0, 1) * tauG * tauG * y * y)
+                                     - cspeed * (complex_T(0, 2) * t + om0 * tauG * tauG) * wy * wy * z
+                                     + complex_T(0, 1) * wy * wy * z * z)
+                              + complex_T(0, 2) * om0 * wy * wy * y * (cspeed * t - z) * tanPhi2
+                              + complex_T(0, 1) * tanPhi2 * tanPhi2
+                                  * (complex_T(0, -4) * cspeed * y * y * z
+                                     + om0 * wy * wy * (y * y - float_T(4.0) * (cspeed * t - z) * z)))
+                       /* The "round-trip" conversion in the factor below (complex_T -> complex_64 -> complex_T)
+                        * fixes a gross accuracy bug in floating-point arithmetic, when float_T is set to float_X.
+                        */
+                       )
+                    * complex_T(float_64(1.0) / complex_64(float_T(2.0) * cspeed * wy * wy * helpVar1 * helpVar2));
+
+                const complex_T helpVar5 = cspeed * om0 * tauG * tauG
+                    - complex_T(0, 8) * y * math::tan(float_T(PI / 2) - phiT) / sinPhi / sinPhi * sinPhi2 * sinPhi2
+                        * sinPhi2 * sinPhi2
+                    - complex_T(0, 2) * z * tanPhi2 * tanPhi2;
+                const complex_T result = (math::exp(helpVar4) * tauG * math::sqrt((cspeed * om0 * rho0) / helpVar3))
+                    / math::sqrt(helpVar5);
+                return result.get_real();
+            }
+
+            /** Calculate the Ey(r,t) field here by forwarding to calcTWTSEx
+             *
+             * \param pos Spatial position of the target field.
+             * \param time Absolute time (SI, including all offsets and transformations) for calculating
+             *             the field. \return The value of calcTWTSEx(pos, time). */
+            HDINLINE EField::float_T EField::calcTWTSEy(const float3_64& pos, const float_64 time) const
+            {
+                /* The field function of Ey (polarization in pulse-front-tilt plane)
+                 * is by definition identical to Ex (polarization normal to pulse-front-tilt plane)
+                 */
+                return calcTWTSEx(pos, time);
+            }
+
+        } /* namespace twts */
+    } /* namespace templates */
 } /* namespace picongpu */
diff --git a/include/picongpu/fields/background/templates/TWTS/GetInitialTimeDelay_SI.tpp b/include/picongpu/fields/background/templates/TWTS/GetInitialTimeDelay_SI.tpp
index acf73c8f45..0975f83a34 100644
--- a/include/picongpu/fields/background/templates/TWTS/GetInitialTimeDelay_SI.tpp
+++ b/include/picongpu/fields/background/templates/TWTS/GetInitialTimeDelay_SI.tpp
@@ -1,4 +1,4 @@
-/* Copyright 2014-2020 Alexander Debus
+/* Copyright 2014-2021 Alexander Debus
  *
  * This file is part of PIConGPU.
  *
@@ -26,134 +26,137 @@
 
 namespace picongpu
 {
-namespace templates
-{
-namespace twts
-{
-/* Auxiliary functions for calculating the TWTS field */
-namespace detail
-{
-
-    template <unsigned T_dim>
-    class GetInitialTimeDelay
+    namespace templates
     {
-        public:
-        /** Obtain the SI time delay that later enters the Ex(r, t), By(r, t) and Bz(r, t)
-         *  calculations as t.
-         * \tparam T_dim Specializes for the simulation dimension
-         *  \param auto_tdelay calculate the time delay such that the TWTS pulse is not
-         *                     inside the simulation volume at simulation start
-         *                     timestep = 0 [default = true]
-         *  \param tdelay_user_SI manual time delay if auto_tdelay is false
-         *  \param halfSimSize center of simulation volume in number of cells
-         *  \param pulselength_SI sigma of std. gauss for intensity (E^2)
-         *  \param focus_y_SI the distance to the laser focus in y-direction [m]
-         *  \param phi interaction angle between TWTS laser propagation vector and
-         *             the y-axis [rad, default = 90.*(PI / 180.)]
-         *  \param beta_0 propagation speed of overlap normalized
-         *                to the speed of light [c, default = 1.0]
-         *  \return time delay in SI units */
-        HDINLINE float_64 operator()( const bool auto_tdelay,
-                                      const float_64 tdelay_user_SI,
-                                      const DataSpace<simDim>& halfSimSize,
-                                      const float_64 pulselength_SI,
-                                      const float_64 focus_y_SI,
-                                      const float_X phi,
-                                      const float_X beta_0 ) const;
-    };
+        namespace twts
+        {
+            /* Auxiliary functions for calculating the TWTS field */
+            namespace detail
+            {
+                template<unsigned T_dim> /* functor; operator() is only declared here */
+                class GetInitialTimeDelay
+                {
+                public:
+                    /** Obtain the SI time delay that later enters the Ex(r, t), By(r, t) and Bz(r, t)
+                     *  calculations as t.
+                     *  \tparam T_dim simulation dimension this functor is specialized for
+                     *  \param auto_tdelay if true, calculate the time delay such that the TWTS pulse is
+                     *                     not inside the simulation volume at simulation start
+                     *                     timestep = 0 [default = true]
+                     *  \param tdelay_user_SI manual time delay if auto_tdelay is false
+                     *  \param halfSimSize center of simulation volume in number of cells
+                     *  \param pulselength_SI sigma of std. gauss for intensity (E^2)
+                     *  \param focus_y_SI the distance to the laser focus in y-direction [m]
+                     *  \param phi interaction angle between TWTS laser propagation vector and
+                     *             the y-axis [rad, default = 90.*(PI / 180.)]
+                     *  \param beta_0 propagation speed of overlap normalized
+                     *                to the speed of light [c, default = 1.0]
+                     *  \return time delay in SI units */
+                    HDINLINE float_64 operator()(
+                        const bool auto_tdelay,
+                        const float_64 tdelay_user_SI,
+                        const DataSpace<simDim>& halfSimSize,
+                        const float_64 pulselength_SI,
+                        const float_64 focus_y_SI,
+                        const float_X phi,
+                        const float_X beta_0) const;
+                };
 
-    template<>
-    HDINLINE float_64
-    GetInitialTimeDelay<DIM3>::operator()( const bool auto_tdelay,
-                                           const float_64 tdelay_user_SI,
-                                           const DataSpace<simDim>& halfSimSize,
-                                           const float_64 pulselength_SI,
-                                           const float_64 focus_y_SI,
-                                           const float_X phi,
-                                           const float_X beta_0 ) const
-    {
-        if ( auto_tdelay ) {
+                template<> /* DIM3: walk-off is projected from the z half-extent (halfSimSize[2]) */
+                HDINLINE float_64 GetInitialTimeDelay<DIM3>::operator()(
+                    const bool auto_tdelay,
+                    const float_64 tdelay_user_SI,
+                    const DataSpace<simDim>& halfSimSize,
+                    const float_64 pulselength_SI,
+                    const float_64 focus_y_SI,
+                    const float_X phi,
+                    const float_X beta_0) const
+                {
+                    if(auto_tdelay)
+                    {
+                        /* Angle between the laser pulse front and the y-axis. Good approximation for
+                         * beta0 close to 1. For exact relation look in TWTS core routines for Ex, By or Bz. */
+                        const float_64 eta = (PI / 2) - (phi / 2);
+                        /* halfSimSize[2] --> Half-depth of simulation volume (in z); By geometric
+                         * projection we calculate the y-distance walkoff of the TWTS-pulse.
+                         * The abs()-function is for correct offset for -phi<-90Deg and +phi>+90Deg. */
+                        const float_64 y1
+                            = float_64(halfSimSize[2] * picongpu::SI::CELL_DEPTH_SI) * math::abs(math::cos(eta));
+                        /* Fudge parameter to make sure, that TWTS pulse starts to impact simulation volume
+                         * at low intensity values. */
+                        const float_64 m = 3.;
+                        /* Approximate cross section of laser pulse through y-axis, scaled with
+                         * "fudge factor" m. NOTE(review): unlike y1, cos(eta) enters without abs() here. */
+                        const float_64 y2 = m * (pulselength_SI * picongpu::SI::SPEED_OF_LIGHT_SI) / math::cos(eta);
+                        /* y-position of laser coordinate system origin within simulation. */
+                        const float_64 y3 = focus_y_SI;
+                        /* Programmatically obtained time-delay: summed y-distances divided by c * beta_0 */
+                        const float_64 tdelay = (y1 + y2 + y3) / (picongpu::SI::SPEED_OF_LIGHT_SI * beta_0);
 
-            /* angle between the laser pulse front and the y-axis. Good approximation for
-             * beta0\simeq 1. For exact relation look in TWTS core routines for Ex, By or Bz. */
-            const float_64 eta = (PI / 2) - (phi / 2);
-            /* halfSimSize[2] --> Half-depth of simulation volume (in z); By geometric
-             * projection we calculate the y-distance walkoff of the TWTS-pulse.
-             * The abs()-function is for correct offset for -phi<-90Deg and +phi>+90Deg. */
-            const float_64 y1 = float_64(halfSimSize[2]
-                                *picongpu::SI::CELL_DEPTH_SI)*math::abs(math::cos(eta));
-            /* Fudge parameter to make sure, that TWTS pulse starts to impact simulation volume
-             * at low intensity values. */
-            const float_64 m = 3.;
-            /* Approximate cross section of laser pulse through y-axis,
-             * scaled with "fudge factor" m. */
-            const float_64 y2 = m*(pulselength_SI*picongpu::SI::SPEED_OF_LIGHT_SI)
-                                / math::cos(eta);
-            /* y-position of laser coordinate system origin within simulation. */
-            const float_64 y3 = focus_y_SI;
-            /* Programmatically obtained time-delay */
-            const float_64 tdelay = (y1+y2+y3) / (picongpu::SI::SPEED_OF_LIGHT_SI*beta_0);
+                        return tdelay;
+                    }
+                    else
+                        return tdelay_user_SI;
+                }
 
-            return tdelay;
-        }
-        else
-            return tdelay_user_SI;
-    }
+                template<> /* DIM2: walk-off is projected from the x half-extent (halfSimSize[0]) */
+                HDINLINE float_64 GetInitialTimeDelay<DIM2>::operator()(
+                    const bool auto_tdelay,
+                    const float_64 tdelay_user_SI,
+                    const DataSpace<simDim>& halfSimSize,
+                    const float_64 pulselength_SI,
+                    const float_64 focus_y_SI,
+                    const float_X phi,
+                    const float_X beta_0) const
+                {
+                    if(auto_tdelay)
+                    {
+                        /* Angle between the laser pulse front and the y-axis. Good approximation for
+                         * beta0 close to 1. For exact relation look in TWTS core routines for Ex, By or Bz. */
+                        const float_64 eta = (PI / 2) - (phi / 2);
+                        /* halfSimSize[0] --> Half-width of simulation volume (in x); By geometric
+                         * projection we calculate the y-distance walkoff of the TWTS-pulse.
+                         * The abs()-function is for correct offset for -phi<-90Deg and +phi>+90Deg. */
+                        const float_64 y1
+                            = float_64(halfSimSize[0] * picongpu::SI::CELL_WIDTH_SI) * math::abs(math::cos(eta));
+                        /* Fudge parameter to make sure, that TWTS pulse starts to impact simulation volume
+                         * at low intensity values. */
+                        const float_64 m = 3.;
+                        /* Approximate cross section of laser pulse through y-axis, scaled with
+                         * "fudge factor" m. NOTE(review): unlike y1, cos(eta) enters without abs() here. */
+                        const float_64 y2 = m * (pulselength_SI * picongpu::SI::SPEED_OF_LIGHT_SI) / math::cos(eta);
+                        /* y-position of laser coordinate system origin within simulation. */
+                        const float_64 y3 = focus_y_SI;
+                        /* Programmatically obtained time-delay: summed y-distances divided by c * beta_0 */
+                        const float_64 tdelay = (y1 + y2 + y3) / (picongpu::SI::SPEED_OF_LIGHT_SI * beta_0);
 
-    template <>
-    HDINLINE float_64
-    GetInitialTimeDelay<DIM2>::operator()( const bool auto_tdelay,
-                                           const float_64 tdelay_user_SI,
-                                           const DataSpace<simDim>& halfSimSize,
-                                           const float_64 pulselength_SI,
-                                           const float_64 focus_y_SI,
-                                           const float_X phi,
-                                           const float_X beta_0 ) const
-    {
-        if ( auto_tdelay ) {
-
-            /* angle between the laser pulse front and the y-axis. Good approximation for
-             * beta0\simeq 1. For exact relation look in TWTS core routines for Ex, By or Bz. */
-            const float_64 eta = (PI / 2) - (phi / 2);
-            /* halfSimSize[0] --> Half-depth of simulation volume (in x); By geometric
-             * projection we calculate the y-distance walkoff of the TWTS-pulse.
-             * The abs()-function is for correct offset for -phi<-90Deg and +phi>+90Deg. */
-            const float_64 y1 = float_64(halfSimSize[0]
-                                *picongpu::SI::CELL_WIDTH_SI)*math::abs(math::cos(eta));
-            /* Fudge parameter to make sure, that TWTS pulse starts to impact simulation volume
-             * at low intensity values. */
-            const float_64 m = 3.;
-            /* Approximate cross section of laser pulse through y-axis,
-             * scaled with "fudge factor" m. */
-            const float_64 y2 = m*(pulselength_SI*picongpu::SI::SPEED_OF_LIGHT_SI)
-                                / math::cos(eta);
-            /* y-position of laser coordinate system origin within simulation. */
-            const float_64 y3 = focus_y_SI;
-            /* Programmatically obtained time-delay */
-            const float_64 tdelay = (y1+y2+y3) / (picongpu::SI::SPEED_OF_LIGHT_SI*beta_0);
+                        return tdelay;
+                    }
+                    else
+                        return tdelay_user_SI;
+                }
 
-            return tdelay;
-        }
-        else
-            return tdelay_user_SI;
-    }
-
-    template <unsigned T_Dim>
-    HDINLINE float_64
-    getInitialTimeDelay_SI( const bool auto_tdelay,
-                            const float_64 tdelay_user_SI,
-                            const DataSpace<T_Dim>& halfSimSize,
-                            const float_64 pulselength_SI,
-                            const float_64 focus_y_SI,
-                            const float_X phi,
-                            const float_X beta_0 )
-    {
-        return GetInitialTimeDelay<T_Dim>()(auto_tdelay, tdelay_user_SI,
-                                            halfSimSize, pulselength_SI,
-                                            focus_y_SI, phi, beta_0);
-    }
+                template<unsigned T_Dim> /* free-function wrapper around the GetInitialTimeDelay functor */
+                HDINLINE float_64 getInitialTimeDelay_SI(
+                    const bool auto_tdelay,
+                    const float_64 tdelay_user_SI,
+                    const DataSpace<T_Dim>& halfSimSize,
+                    const float_64 pulselength_SI,
+                    const float_64 focus_y_SI,
+                    const float_X phi,
+                    const float_X beta_0)
+                {
+                    /* Forward all arguments unchanged to GetInitialTimeDelay<T_Dim>::operator(). */
+                    return GetInitialTimeDelay<T_Dim>()(
+                        auto_tdelay,
+                        tdelay_user_SI,
+                        halfSimSize,
+                        pulselength_SI,
+                        focus_y_SI,
+                        phi,
+                        beta_0);
+                }
 
-} /* namespace detail */
-} /* namespace twts */
-} /* namespace templates */
+            } /* namespace detail */
+        } /* namespace twts */
+    } /* namespace templates */
 } /* namespace picongpu */
diff --git a/include/picongpu/fields/background/templates/TWTS/RotateField.tpp b/include/picongpu/fields/background/templates/TWTS/RotateField.tpp
index 401bc135a9..a422161346 100644
--- a/include/picongpu/fields/background/templates/TWTS/RotateField.tpp
+++ b/include/picongpu/fields/background/templates/TWTS/RotateField.tpp
@@ -1,4 +1,4 @@
-/* Copyright 2014-2020 Alexander Debus, Rene Widera
+/* Copyright 2014-2021 Alexander Debus, Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -26,96 +26,90 @@
 
 namespace picongpu
 {
-namespace templates
-{
-namespace twts
-{
-/** Auxiliary functions for calculating the TWTS field */
-namespace detail
-{
-
-    template <typename T_Type, typename T_AngleType>
-    struct RotateField;
-
-    template <typename T_Type, typename T_AngleType>
-    struct RotateField<pmacc::math::Vector<T_Type,3>, T_AngleType >
+    namespace templates
     {
-        typedef pmacc::math::Vector<T_Type,3> result;
-        typedef T_AngleType AngleType;
-        HDINLINE result
-        operator()( const result& fieldPosVector,
-                    const AngleType phi ) const
+        namespace twts
         {
-            /*  Since, the laser propagation direction encloses an angle of phi with the
-             *  simulation y-axis (i.e. direction of sliding window), the positions vectors are
-             *  rotated around the simulation x-axis before calling the TWTS field functions.
-             *  Note: The TWTS field functions are in non-rotated frame and only use the angle
-             *  phi to determine the required amount of pulse front tilt.
-             *  RotationMatrix[PI/2+phi].(y,z) (180Deg-flip at phi=90Deg since coordinate
-             *  system in paper is oriented the other way round.) */
-            return result(
-                fieldPosVector.x(),
-               -math::sin(AngleType(phi))*fieldPosVector.y()
-                    -math::cos(AngleType(phi))*fieldPosVector.z() ,
-               +math::cos(AngleType(phi))*fieldPosVector.y()
-                    -math::sin(AngleType(phi))*fieldPosVector.z() );
-        }
+            /** Auxiliary functions for calculating the TWTS field */
+            namespace detail
+            {
+                template<typename T_Type, typename T_AngleType>
+                struct RotateField;
 
-    };
+                template<typename T_Type, typename T_AngleType>
+                struct RotateField<pmacc::math::Vector<T_Type, 3>, T_AngleType>
+                {
+                    typedef pmacc::math::Vector<T_Type, 3> result;
+                    typedef T_AngleType AngleType;
+                    HDINLINE result operator()(const result& fieldPosVector, const AngleType phi) const
+                    {
+                        /*  Since the laser propagation direction encloses an angle of phi with the
+                         *  simulation y-axis (i.e. direction of sliding window), the position vectors are
+                         *  rotated around the simulation x-axis before calling the TWTS field functions.
+                         *  Note: The TWTS field functions are in the non-rotated frame and only use the angle
+                         *  phi to determine the required amount of pulse front tilt.
+                         *  RotationMatrix[PI/2+phi].(y,z) (180Deg-flip at phi=90Deg since the coordinate
+                         *  system in the paper is oriented the other way round.) */
+                        return result(
+                            fieldPosVector.x(),
+                            -math::sin(AngleType(phi)) * fieldPosVector.y()
+                                - math::cos(AngleType(phi)) * fieldPosVector.z(),
+                            +math::cos(AngleType(phi)) * fieldPosVector.y()
+                                - math::sin(AngleType(phi)) * fieldPosVector.z());
+                    }
+                };
 
-    template <typename T_Type, typename T_AngleType>
-    struct RotateField<pmacc::math::Vector<T_Type,2>, T_AngleType >
-    {
-        typedef pmacc::math::Vector<T_Type,2> result;
-        typedef T_AngleType AngleType;
-        HDINLINE result
-        operator()( const result& fieldPosVector,
-                    const AngleType phi ) const
-        {
-            /*  Since, the laser propagation direction encloses an angle of phi with the
-             *  simulation y-axis (i.e. direction of sliding window), the positions vectors are
-             *  rotated around the simulation x-axis before calling the TWTS field functions.
-             *  Note: The TWTS field functions are in non-rotated frame and only use the angle
-             *  phi to determine the required amount of pulse front tilt.
-             *  RotationMatrix[PI/2+phi].(y,z) (180Deg-flip at phi=90Deg since coordinate
-             *  system in paper is oriented the other way round.) */
+                template<typename T_Type, typename T_AngleType>
+                struct RotateField<pmacc::math::Vector<T_Type, 2>, T_AngleType>
+                {
+                    typedef pmacc::math::Vector<T_Type, 2> result;
+                    typedef T_AngleType AngleType;
+                    HDINLINE result operator()(const result& fieldPosVector, const AngleType phi) const
+                    {
+                        /*  Since the laser propagation direction encloses an angle of phi with the
+                         *  simulation y-axis (i.e. direction of sliding window), the position vectors are
+                         *  rotated around the simulation x-axis before calling the TWTS field functions.
+                         *  Note: The TWTS field functions are in the non-rotated frame and only use the angle
+                         *  phi to determine the required amount of pulse front tilt.
+                         *  RotationMatrix[PI/2+phi].(y,z) (180Deg-flip at phi=90Deg since the coordinate
+                         *  system in the paper is oriented the other way round.) */
 
-            /*  Rotate 90 degree around y-axis, so that TWTS laser propagates within
-             *  the 2D (x,y)-plane. Corresponding position vector for the Ez-components
-             *  in 2D simulations.
-             *  3D     3D vectors in 2D space (x,y)
-             *  x -->  z
-             *  y -->  y
-             *  z --> -x (Since z=0 for 2D, we use the existing
-             *            TWTS-field-function and set -x=0)
-             *
-             * Explicit implementation in 3D coordinates:
-             * fieldPosVector = float3_64( -fieldPosVector.z(),       //(Here: ==0)
-             *                              fieldPosVector.y(),
-             *                              fieldPosVector.x() );
-             * fieldPosVector = float3_64( fieldPosVector.x(),
-             *       -sin(phi)*fieldPosVector.y()-cos(phi)*fieldPosVector.z(),
-             *       +cos(phi)*fieldPosVector.y()-sin(phi)*fieldPosVector.z()  );
-             * The 2D implementation here only calculates the last two components.
-             * Note: The x-axis of rotation is fine in 2D, because that component now contains
-             *       the (non-existing) simulation z-coordinate. */
-             return result(
-                -math::sin(AngleType(phi))*fieldPosVector.y()
-                    -math::cos(AngleType(phi))*fieldPosVector.x() ,
-                +math::cos(AngleType(phi))*fieldPosVector.y()
-                    -math::sin(AngleType(phi))*fieldPosVector.x() );
-        }
-    };
+                        /*  Rotate 90 degrees around the y-axis, so that the TWTS laser propagates within
+                         *  the 2D (x,y)-plane. Corresponding position vector for the Ez-components
+                         *  in 2D simulations.
+                         *  3D     3D vectors in 2D space (x,y)
+                         *  x -->  z
+                         *  y -->  y
+                         *  z --> -x (Since z=0 for 2D, we use the existing
+                         *            TWTS-field-function and set -x=0)
+                         *
+                         * Explicit implementation in 3D coordinates:
+                         * fieldPosVector = float3_64( -fieldPosVector.z(),       //(Here: ==0)
+                         *                              fieldPosVector.y(),
+                         *                              fieldPosVector.x() );
+                         * fieldPosVector = float3_64( fieldPosVector.x(),
+                         *       -sin(phi)*fieldPosVector.y()-cos(phi)*fieldPosVector.z(),
+                         *       +cos(phi)*fieldPosVector.y()-sin(phi)*fieldPosVector.z()  );
+                         * The 2D implementation here only calculates the last two components.
+                         * Note: The x-axis of rotation is fine in 2D, because that component now contains
+                         *       the (non-existing) simulation z-coordinate. */
+                        return result(
+                            -math::sin(AngleType(phi)) * fieldPosVector.y()
+                                - math::cos(AngleType(phi)) * fieldPosVector.x(),
+                            +math::cos(AngleType(phi)) * fieldPosVector.y()
+                                - math::sin(AngleType(phi)) * fieldPosVector.x());
+                    }
+                };
 
-    template <typename T_Type, typename T_AngleType>
-    HDINLINE typename RotateField<T_Type,T_AngleType>::result
-    rotateField( const T_Type& fieldPosVector,
-                 const T_AngleType phi )
-    {
-        return RotateField<T_Type,T_AngleType>()(fieldPosVector,phi);
-    }
+                template<typename T_Type, typename T_AngleType>
+                HDINLINE typename RotateField<T_Type, T_AngleType>::result rotateField(
+                    const T_Type& fieldPosVector,
+                    const T_AngleType phi)
+                {
+                    return RotateField<T_Type, T_AngleType>()(fieldPosVector, phi); // 2D/3D specialization
+                }
 
-} /* namespace detail */
-} /* namespace twts */
-} /* namespace templates */
+            } /* namespace detail */
+        } /* namespace twts */
+    } /* namespace templates */
 } /* namespace picongpu */
diff --git a/include/picongpu/fields/background/templates/TWTS/TWTS.hpp b/include/picongpu/fields/background/templates/TWTS/TWTS.hpp
index 0757567c52..a93796b405 100644
--- a/include/picongpu/fields/background/templates/TWTS/TWTS.hpp
+++ b/include/picongpu/fields/background/templates/TWTS/TWTS.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2014-2020 Alexander Debus
+/* Copyright 2014-2021 Alexander Debus
  *
  * This file is part of PIConGPU.
  *
diff --git a/include/picongpu/fields/background/templates/TWTS/TWTS.tpp b/include/picongpu/fields/background/templates/TWTS/TWTS.tpp
index 9dd20c62b4..248c501da9 100644
--- a/include/picongpu/fields/background/templates/TWTS/TWTS.tpp
+++ b/include/picongpu/fields/background/templates/TWTS/TWTS.tpp
@@ -1,4 +1,4 @@
-/* Copyright 2014-2020 Alexander Debus
+/* Copyright 2014-2021 Alexander Debus
  *
  * This file is part of PIConGPU.
  *
diff --git a/include/picongpu/fields/background/templates/TWTS/getFieldPositions_SI.tpp b/include/picongpu/fields/background/templates/TWTS/getFieldPositions_SI.tpp
index e1f8f086d1..a88de8b99c 100644
--- a/include/picongpu/fields/background/templates/TWTS/getFieldPositions_SI.tpp
+++ b/include/picongpu/fields/background/templates/TWTS/getFieldPositions_SI.tpp
@@ -1,4 +1,4 @@
-/* Copyright 2014-2020 Alexander Debus
+/* Copyright 2014-2021 Alexander Debus
  *
  * This file is part of PIConGPU.
  *
@@ -27,55 +27,55 @@
 
 namespace picongpu
 {
-namespace templates
-{
-namespace twts
-{
-/** Auxiliary functions for calculating the TWTS field */
-namespace detail
-{
-    /** Calculate the SI position vectors that later enter the Ex(r, t), By(r, t)
-     *  and Bz(r ,t) calculations as r.
-     *  \param cellIdx The total cell id counted from the start at timestep 0. */
-    HDINLINE pmacc::math::Vector<floatD_64,numComponents>
-    getFieldPositions_SI(const DataSpace<simDim>& cellIdx,
-                         const DataSpace<simDim>& halfSimSize,
-                         const pmacc::math::Vector<floatD_X, numComponents>& fieldOnGridPositions,
-                         const float_64 unit_length,
-                         const float_64 focus_y_SI,
-                         const float_X phi )
+    namespace templates
     {
-        /* Note: Neither direct precisionCast on picongpu::cellSize
-           or casting on floatD_ does work. */
-        const floatD_64 cellDim(picongpu::cellSize.shrink<simDim>());
-        const floatD_64 cellDimensions = cellDim * unit_length;
+        namespace twts
+        {
+            /** Auxiliary functions for calculating the TWTS field */
+            namespace detail
+            {
+                /** Calculate the SI position vectors that later enter the Ex(r, t), By(r, t)
+                 *  and Bz(r, t) calculations as r.
+                 *  \param cellIdx The total cell id counted from the start at timestep 0. */
+                HDINLINE pmacc::math::Vector<floatD_64, numComponents> getFieldPositions_SI(
+                    const DataSpace<simDim>& cellIdx,
+                    const DataSpace<simDim>& halfSimSize,
+                    const pmacc::math::Vector<floatD_X, numComponents>& fieldOnGridPositions,
+                    const float_64 unit_length,
+                    const float_64 focus_y_SI,
+                    const float_X phi)
+                {
+                    /* Note: Neither a direct precisionCast on picongpu::cellSize
+                       nor casting on floatD_ works. */
+                    const floatD_64 cellDim(picongpu::cellSize.shrink<simDim>());
+                    const floatD_64 cellDimensions = cellDim * unit_length;
 
-        /* TWTS laser coordinate origin is centered transversally and defined longitudinally by
-           the laser center in y (usually maximum of intensity). */
-        floatD_X laserOrigin = precisionCast<float_X>(halfSimSize);
-        laserOrigin.y() = float_X( focus_y_SI/cellDimensions.y() );
+                    /* TWTS laser coordinate origin is centered transversally and defined longitudinally by
+                       the laser center in y (usually maximum of intensity). */
+                    floatD_X laserOrigin = precisionCast<float_X>(halfSimSize);
+                    laserOrigin.y() = float_X(focus_y_SI / cellDimensions.y());
 
-        /* For staggered fields (e.g. Yee-grid), obtain the fractional cell index components and add
-         * that to the total cell indices. The physical field coordinate origin is transversally
-         * centered with respect to the global simulation volume.
-         * pmacc::math::Vector<floatD_X, numComponents> fieldPositions =
-         *                traits::FieldPosition<fields::CellType, FieldE>(); */
-        pmacc::math::Vector<floatD_X, numComponents> fieldPositions = fieldOnGridPositions;
+                    /* For staggered fields (e.g. Yee-grid), obtain the fractional cell index components and add
+                     * that to the total cell indices. The physical field coordinate origin is transversally
+                     * centered with respect to the global simulation volume.
+                     * pmacc::math::Vector<floatD_X, numComponents> fieldPositions =
+                     *                traits::FieldPosition<fields::CellType, FieldE>(); */
+                    pmacc::math::Vector<floatD_X, numComponents> fieldPositions = fieldOnGridPositions;
 
-        pmacc::math::Vector<floatD_64,numComponents> fieldPositions_SI;
+                    pmacc::math::Vector<floatD_64, numComponents> fieldPositions_SI;
 
-        for( uint32_t i = 0; i < numComponents; ++i ) /* cellIdx Ex, Ey and Ez */
-        {
-            fieldPositions[i]   += ( precisionCast<float_X>(cellIdx) - laserOrigin );
-            fieldPositions_SI[i] = precisionCast<float_64>(fieldPositions[i]) * cellDimensions;
+                    for(uint32_t i = 0; i < numComponents; ++i) /* cellIdx Ex, Ey and Ez */
+                    {
+                        fieldPositions[i] += (precisionCast<float_X>(cellIdx) - laserOrigin);
+                        fieldPositions_SI[i] = precisionCast<float_64>(fieldPositions[i]) * cellDimensions;
 
-            fieldPositions_SI[i] = rotateField(fieldPositions_SI[i],phi);
-        }
+                        fieldPositions_SI[i] = rotateField(fieldPositions_SI[i], phi);
+                    }
 
-        return fieldPositions_SI;
-    }
+                    return fieldPositions_SI;
+                }
 
-} /* namespace detail */
-} /* namespace twts */
-} /* namespace templates */
+            } /* namespace detail */
+        } /* namespace twts */
+    } /* namespace templates */
 } /* namespace picongpu */
diff --git a/include/picongpu/fields/background/templates/TWTS/numComponents.hpp b/include/picongpu/fields/background/templates/TWTS/numComponents.hpp
index 7ea538b4a1..7d40e6d593 100644
--- a/include/picongpu/fields/background/templates/TWTS/numComponents.hpp
+++ b/include/picongpu/fields/background/templates/TWTS/numComponents.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2014-2020 Alexander Debus, Axel Huebl
+/* Copyright 2014-2021 Alexander Debus, Axel Huebl
  *
  * This file is part of PIConGPU.
  *
@@ -22,15 +22,15 @@
 
 namespace picongpu
 {
-namespace templates
-{
-namespace twts
-{
-namespace detail
-{
-    /* Number of field components used in the simulation. [Default: 3 for both 2D and 3D] */
-    const uint32_t numComponents = 3;
-} /* namespace detail */
-} /* namespace twts */
-} /* namespace templates */
+    namespace templates
+    {
+        namespace twts
+        {
+            namespace detail
+            {
+                /* Number of field components used in the simulation. [Default: 3 for both 2D and 3D] */
+                const uint32_t numComponents = 3;
+            } /* namespace detail */
+        } /* namespace twts */
+    } /* namespace templates */
 } /* namespace picongpu */
diff --git a/include/picongpu/fields/background/templates/twtsfast/BField.hpp b/include/picongpu/fields/background/templates/twtsfast/BField.hpp
new file mode 100644
index 0000000000..b06600a962
--- /dev/null
+++ b/include/picongpu/fields/background/templates/twtsfast/BField.hpp
@@ -0,0 +1,182 @@
+/* Copyright 2014-2021 Alexander Debus, Axel Huebl
+ *
+ * This file is part of PIConGPU.
+ *
+ * PIConGPU is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * PIConGPU is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with PIConGPU.
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+
+#pragma once
+
+#include <pmacc/types.hpp>
+
+#include <pmacc/math/Vector.hpp>
+#include <pmacc/dimensions/DataSpace.hpp>
+#include "picongpu/fields/background/templates/twtsfast/numComponents.hpp"
+
+namespace picongpu
+{
+    /* Load pre-defined background field */
+    namespace templates
+    {
+        /* Traveling-wave Thomson scattering laser pulse */
+        namespace twtsfast
+        {
+            class BField
+            {
+            public:
+                using float_T = float_X;
+
+                enum PolarizationType
+                {
+                    /** The linear polarization of the TWTS laser is defined
+                     *  relative to the plane of the pulse front tilt (reference plane).
+                     *
+                     *  Polarization is normal to the reference plane.
+                     *  Use Ex-fields (and corresponding B-fields) in TWTS laser internal coordinate system.
+                     */
+                    LINEAR_X = 1u,
+                    /** Polarization lies within the reference plane.
+                     *  Use Ey-fields (and corresponding B-fields) in TWTS laser internal coordinate system.
+                     */
+                    LINEAR_YZ = 2u,
+                };
+
+                /** Center of simulation volume in number of cells */
+                PMACC_ALIGN(halfSimSize, DataSpace<simDim>);
+                /** y-position of TWTS coordinate origin inside the simulation coordinates [meter]
+                 *  The other origin coordinates (x and z) default to globally centered values
+                 *  with respect to the simulation volume.
+                 */
+                PMACC_ALIGN(focus_y_SI, float_64 const);
+                /** Laser wavelength [meter] */
+                PMACC_ALIGN(wavelength_SI, float_64 const);
+                /** TWTS laser pulse duration [second] */
+                PMACC_ALIGN(pulselength_SI, float_64 const);
+                /** line focus height of TWTS pulse [meter] */
+                PMACC_ALIGN(w_x_SI, float_64 const);
+                /** interaction angle between TWTS laser propagation vector and the y-axis [rad] */
+                PMACC_ALIGN(phi, float_X const);
+                /** Takes value 1.0 for phi > 0 and -1.0 for phi < 0. */
+                PMACC_ALIGN(phiPositive, float_X);
+                /** propagation speed of TWTS laser overlap
+                    normalized to the speed of light. [Default: beta_0 = 1.0] */
+                PMACC_ALIGN(beta_0, float_X const);
+                /** User-defined time delay, used if auto_tdelay is false. [second] */
+                PMACC_ALIGN(tdelay_user_SI, float_64 const);
+                /** Make time step constant accessible to device. */
+                PMACC_ALIGN(dt, float_64 const);
+                /** Make length normalization constant accessible to device. */
+                PMACC_ALIGN(unit_length, float_64 const);
+                /** TWTS laser time delay */
+                PMACC_ALIGN(tdelay, float_64);
+                /** Should the TWTS laser time delay be chosen automatically, such that
+                 * the laser gradually enters the simulation volume? [Default: true]
+                 */
+                PMACC_ALIGN(auto_tdelay, bool const);
+                /** Polarization of TWTS laser */
+                PMACC_ALIGN(pol, PolarizationType const);
+
+                /** Magnetic field of the TWTS laser
+                 *
+                 * @param focus_y_SI the distance to the laser focus in y-direction [m]
+                 * @param wavelength_SI central wavelength [m]
+                 * @param pulselength_SI sigma of std. gauss for intensity (E^2),
+                 *  pulselength_SI = FWHM_of_Intensity / 2.35482 [seconds (sigma)]
+                 * @param w_x_SI beam waist: distance from the axis where the pulse electric field
+                 *  decreases to its 1/e^2-th part at the focus position of the laser [m]
+                 * @param phi interaction angle between TWTS laser propagation vector and
+                 *  the y-axis [rad, default = 90.*(PI/180.)]
+                 * @param beta_0 propagation speed of overlap normalized to
+                 *  the speed of light [c, default = 1.0]
+                 * @param tdelay_user_SI manual time delay if auto_tdelay is false [s, default = 0.0]
+                 * @param auto_tdelay calculate the time delay such that the TWTS pulse is not
+                 *  inside the simulation volume at simulation start timestep = 0 [default = true]
+                 * @param pol determines the TWTS laser polarization, which is either normal or parallel
+                 *  to the laser pulse front tilt plane [default = LINEAR_X, alternative: LINEAR_YZ]
+                 */
+                HINLINE
+                BField(
+                    float_64 const focus_y_SI,
+                    float_64 const wavelength_SI,
+                    float_64 const pulselength_SI,
+                    float_64 const w_x_SI,
+                    float_X const phi = 90. * (PI / 180.),
+                    float_X const beta_0 = 1.0,
+                    float_64 const tdelay_user_SI = 0.0,
+                    bool const auto_tdelay = true,
+                    PolarizationType const pol = LINEAR_X);
+
+
+                /** Background field B(r,t) for a given cell and time step
+                 *
+                 * @param cellIdx The total cell id counted from the start at t=0
+                 * @param currentStep The current time step */
+                HDINLINE float3_X operator()(DataSpace<simDim> const& cellIdx, uint32_t const currentStep) const;
+
+                /** Calculate the By(r,t) field, when electric field vector (Ex,0,0)
+                 *  is normal to the pulse-front-tilt plane (y,z)
+                 *
+                 * @param pos Spatial position of the target field.
+                 * @param time Absolute time (SI, including all offsets and transformations)
+                 *  for calculating the field */
+                HDINLINE float_T calcTWTSBy(float3_64 const& pos, float_64 const time) const;
+
+                /** Calculate the Bz(r,t) field, when electric field vector (Ex,0,0)
+                 *  is normal to the pulse-front-tilt plane (y,z)
+                 *
+                 * @param pos Spatial position of the target field.
+                 * @param time Absolute time (SI, including all offsets and transformations)
+                 *  for calculating the field */
+                HDINLINE float_T calcTWTSBz_Ex(float3_64 const& pos, float_64 const time) const;
+
+                /** Calculate the Bx(r,t) field, when electric field vector (0,Ey,0)
+                 *  lies within the pulse-front-tilt plane (y,z)
+                 *
+                 * @param pos Spatial position of the target field.
+                 * @param time Absolute time (SI, including all offsets and transformations)
+                 *  for calculating the field */
+                HDINLINE float_T calcTWTSBx(float3_64 const& pos, float_64 const time) const;
+
+                /** Calculate the Bz(r,t) field, when electric field vector (0,Ey,0)
+                 *  lies within the pulse-front-tilt plane (y,z)
+                 *
+                 * @param pos Spatial position of the target field.
+                 * @param time Absolute time (SI, including all offsets and transformations)
+                 *  for calculating the field */
+                HDINLINE float_T calcTWTSBz_Ey(float3_64 const& pos, float_64 const time) const;
+
+                /** Calculate the B-field vector of the TWTS laser in SI units.
+                 * @tparam T_dim Specializes for the simulation dimension
+                 * @param eFieldPositions_SI,time per-component positions and time (presumably SI; TODO confirm)
+                 * @return B-field vector of the rotated TWTS field in SI units */
+                template<unsigned T_dim>
+                HDINLINE float3_X getTWTSBfield_Normalized(
+                    pmacc::math::Vector<floatD_64, detail::numComponents> const& eFieldPositions_SI,
+                    float_64 const time) const;
+
+                /** Calculate the B-field vector of the "in-plane" polarized TWTS laser in SI units.
+                 * @tparam T_dim Specializes for the simulation dimension
+                 * @param eFieldPositions_SI,time per-component positions and time (presumably SI; TODO confirm)
+                 * @return B-field vector of the rotated TWTS field in SI units */
+                template<unsigned T_dim>
+                HDINLINE float3_X getTWTSBfield_Normalized_Ey(
+                    pmacc::math::Vector<floatD_64, detail::numComponents> const& eFieldPositions_SI,
+                    float_64 const time) const;
+            };
+
+        } /* namespace twtsfast */
+    } /* namespace templates */
+} /* namespace picongpu */
diff --git a/include/picongpu/fields/background/templates/twtsfast/BField.tpp b/include/picongpu/fields/background/templates/twtsfast/BField.tpp
new file mode 100644
index 0000000000..19b2575cb9
--- /dev/null
+++ b/include/picongpu/fields/background/templates/twtsfast/BField.tpp
@@ -0,0 +1,761 @@
+/* Copyright 2014-2021 Alexander Debus, Axel Huebl
+ *
+ * This file is part of PIConGPU.
+ *
+ * PIConGPU is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * PIConGPU is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with PIConGPU.
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+
+#pragma once
+
+#include <pmacc/types.hpp>
+#include "picongpu/simulation_defines.hpp"
+
+#include <pmacc/math/Vector.hpp>
+#include <pmacc/dimensions/DataSpace.hpp>
+#include <pmacc/mappings/simulation/SubGrid.hpp>
+#include <pmacc/math/Complex.hpp>
+
+#include "picongpu/fields/background/templates/twtsfast/RotateField.tpp"
+#include "picongpu/fields/background/templates/twtsfast/GetInitialTimeDelay_SI.tpp"
+#include "picongpu/fields/background/templates/twtsfast/getFieldPositions_SI.tpp"
+#include "picongpu/fields/background/templates/twtsfast/BField.hpp"
+#include "picongpu/fields/CellType.hpp"
+
+
+namespace picongpu
+{
+    /** Load pre-defined background field */
+    namespace templates
+    {
+        /** Traveling-wave Thomson scattering laser pulse */
+        namespace twtsfast
+        {
+            /** Construct the TWTS B-field functor (host side only, see HINLINE).
+             *
+             * Every argument is stored in the member of the same name. In addition the
+             * constructor caches the time step, the unit length, half of the global
+             * simulation size and the initial time delay obtained from
+             * detail::getInitialTimeDelay_SI.
+             *
+             * @param focus_y_SI forwarded to the time-delay and field-position helpers
+             * @param wavelength_SI laser wavelength used by the field formulas
+             * @param pulselength_SI pulse length used by the field formulas
+             * @param w_x_SI beam waist (w0 in the field formulas)
+             * @param phi interaction angle; its sign determines phiPositive
+             * @param beta_0 propagation speed of the overlap normalized to the speed of light
+             * @param tdelay_user_SI user-provided delay, forwarded to getInitialTimeDelay_SI
+             * @param auto_tdelay forwarded to getInitialTimeDelay_SI
+             * @param pol polarization, selects the branch taken in operator() */
+            HINLINE
+            BField::BField(
+                float_64 const focus_y_SI,
+                float_64 const wavelength_SI,
+                float_64 const pulselength_SI,
+                float_64 const w_x_SI,
+                float_X const phi,
+                float_X const beta_0,
+                float_64 const tdelay_user_SI,
+                bool const auto_tdelay,
+                PolarizationType const pol)
+                : focus_y_SI(focus_y_SI)
+                , wavelength_SI(wavelength_SI)
+                , pulselength_SI(pulselength_SI)
+                , w_x_SI(w_x_SI)
+                , phi(phi)
+                , beta_0(beta_0)
+                , tdelay_user_SI(tdelay_user_SI)
+                , dt(SI::DELTA_T_SI)
+                , unit_length(UNIT_LENGTH)
+                , auto_tdelay(auto_tdelay)
+                , pol(pol)
+                /* Sign factor: -1 for a negative phi, +1 otherwise. */
+                , phiPositive(phi < float_X(0.0) ? float_X(-1.0) : float_X(1.0))
+            {
+                /* Note: Environment objects cannot be instantiated on a CUDA GPU device. This
+                 * constructor runs on the host (see fieldBackground.param), so that is no problem.
+                 */
+                SubGrid<simDim> const& globalGrid = Environment<simDim>::get().SubGrid();
+                halfSimSize = globalGrid.getGlobalDomain().size / 2;
+
+                tdelay = detail::getInitialTimeDelay_SI(
+                    auto_tdelay,
+                    tdelay_user_SI,
+                    halfSimSize,
+                    pulselength_SI,
+                    focus_y_SI,
+                    phi,
+                    beta_0);
+            }
+
+            /** B-field of the TWTS pulse for 3D simulations (taken for LINEAR_X in operator()).
+             *
+             * @param bFieldPositions_SI one position per field component; entries 1 and 2 are
+             *                           used as the By and Bz evaluation points
+             * @param time time handed on to calcTWTSBy / calcTWTSBz_Ex
+             * @return (0, By, Bz) after back-rotation by phi */
+            template<>
+            HDINLINE float3_X BField::getTWTSBfield_Normalized<DIM3>(
+                pmacc::math::Vector<floatD_64, detail::numComponents> const& bFieldPositions_SI,
+                float_64 const time) const
+            {
+                using PosVecVec = pmacc::math::Vector<float3_64, detail::numComponents>;
+                PosVecVec pos(PosVecVec::create(float3_64::create(0.0)));
+
+                /* Copy the input positions component by component into 3D vectors. */
+                for(uint32_t comp = 0; comp < detail::numComponents; ++comp)
+                {
+                    for(uint32_t axis = 0; axis < simDim; ++axis)
+                    {
+                        pos[comp][axis] = bFieldPositions_SI[comp][axis];
+                    }
+                }
+
+                /* Every target component has its own intra-cell position offset (an example is
+                 * the staggered Yee-grid), so both source fields are evaluated at both offsets.
+                 */
+                /* By and Bz at the intra-cell offset of the By-field */
+                float_64 const byAtByPos = calcTWTSBy(pos[1], time);
+                float_64 const bzAtByPos = calcTWTSBz_Ex(pos[1], time);
+                /* By and Bz at the intra-cell offset of the Bz-field */
+                float_64 const byAtBzPos = calcTWTSBy(pos[2], time);
+                float_64 const bzAtBzPos = calcTWTSBz_Ex(pos[2], time);
+
+                /* The position vectors were rotated before calcTWTSBy and calcTWTSBz_Ex were
+                 * called, hence the resulting B-field vector has to be rotated back:
+                 *
+                 * RotationMatrix[-(PI/2+phi)].(By,Bz)
+                 */
+                float_X sinPhi;
+                float_X cosPhi;
+                pmacc::math::sincos(phi, sinPhi, cosPhi);
+                float_X const rotatedBy = -sinPhi * float_X(byAtByPos) + cosPhi * float_X(bzAtByPos);
+                float_X const rotatedBz = -cosPhi * float_X(byAtBzPos) - sinPhi * float_X(bzAtBzPos);
+
+                /* Finally, the B-field normalized to the peak amplitude. */
+                return float3_X(0.0_X, rotatedBy, rotatedBz);
+            }
+
+            /** B-field of the "in-plane" polarized TWTS pulse for 3D simulations
+             *  (taken for LINEAR_YZ in operator()).
+             *
+             * @param bFieldPositions_SI one position per field component; entries 0, 1 and 2
+             *                           are used as the Bx, By and Bz evaluation points
+             * @param time time handed on to calcTWTSBx / calcTWTSBz_Ey
+             * @return (Bx, By, Bz) with By and Bz back-rotated by phi */
+            template<>
+            HDINLINE float3_X BField::getTWTSBfield_Normalized_Ey<DIM3>(
+                pmacc::math::Vector<floatD_64, detail::numComponents> const& bFieldPositions_SI,
+                float_64 const time) const
+            {
+                using PosVecVec = pmacc::math::Vector<float3_64, detail::numComponents>;
+                PosVecVec pos(PosVecVec::create(float3_64::create(0.0)));
+
+                /* Copy the input positions component by component into 3D vectors. */
+                for(uint32_t comp = 0; comp < detail::numComponents; ++comp)
+                {
+                    for(uint32_t axis = 0; axis < simDim; ++axis)
+                    {
+                        pos[comp][axis] = bFieldPositions_SI[comp][axis];
+                    }
+                }
+
+                /* Bz at the intra-cell offset of the By-field */
+                float_64 const bzAtByPos = calcTWTSBz_Ey(pos[1], time);
+                /* Bz at the intra-cell offset of the Bz-field */
+                float_64 const bzAtBzPos = calcTWTSBz_Ey(pos[2], time);
+
+                /* The position vectors were rotated before calcTWTSBz_Ey was called, hence the
+                 * resulting B-field vector has to be rotated back:
+                 *
+                 * RotationMatrix[-(PI/2+phi)].(By,Bz)
+                 */
+                float_X sinPhi;
+                float_X cosPhi;
+                pmacc::math::sincos(phi, sinPhi, cosPhi);
+                float_X const rotatedBy = +cosPhi * float_X(bzAtByPos);
+                float_X const rotatedBz = -sinPhi * float_X(bzAtBzPos);
+
+                /* Finally, the B-field normalized to the peak amplitude. Bx is evaluated at the
+                 * position of component 0 and is not rotated.
+                 */
+                return float3_X(float_X(calcTWTSBx(pos[0], time)), rotatedBy, rotatedBz);
+            }
+
+            /** B-field of the TWTS pulse for 2D simulations (taken for LINEAR_X in operator()).
+             *
+             * @param bFieldPositions_SI one 2D position per field component; entries 0 and 1
+             *                           are used as the Bx and By evaluation points
+             * @param time time handed on to calcTWTSBy / calcTWTSBz_Ex
+             * @return (Bx, By, 0) after back-rotation by phi */
+            template<>
+            HDINLINE float3_X BField::getTWTSBfield_Normalized<DIM2>(
+                pmacc::math::Vector<floatD_64, detail::numComponents> const& bFieldPositions_SI,
+                float_64 const time) const
+            {
+                using PosVecVec = pmacc::math::Vector<float3_64, detail::numComponents>;
+                PosVecVec pos(PosVecVec::create(float3_64::create(0.0)));
+
+                /* The two entries of each 2D input vector fill the y- and z-slot of a 3D
+                 * (x,y,z) vector; the x-slot keeps its initial value 0.
+                 */
+                for(uint32_t comp = 0; comp < detail::numComponents; ++comp)
+                {
+                    for(uint32_t axis2D = 0; axis2D < DIM2; ++axis2D)
+                    {
+                        pos[comp][axis2D + 1] = bFieldPositions_SI[comp][axis2D];
+                    }
+                }
+
+                /* Background for the remainder of this function:
+                 *
+                 * Corresponding position vector for the field components in 2D simulations.
+                 *  3D     3D vectors in 2D space (x, y)
+                 *  x -->  z (Meaning: In a 2D simulation, insert the cell-coordinate x
+                 *            into the TWTS field function coordinate z.)
+                 *  y -->  y
+                 *  z --> -x (Since z=0 for 2D, the existing 3D TWTS field function is used
+                 *            with x = -0.)
+                 *  The transformed 3D coordinates are used to calculate the field components.
+                 *  Ex --> Ez (Meaning: Calculate the Ex-component of the existing 3D TWTS field
+                 *             (calcTWTSEx) using transformed position vectors to obtain the
+                 *             corresponding Ez-component in 2D.
+                 *             Note: Swapping field component coordinates also alters the
+                 *                   intra-cell position offset.)
+                 *  By --> By
+                 *  Bz --> -Bx (Yes, the sign is necessary.)
+                 *
+                 * An example of intra-cell position offsets is the staggered Yee-grid.
+                 *
+                 * The procedure is analogous to the 3D case with By --> By and Bz --> -Bx. Hence
+                 * the grid cell offset of Bx has to be used instead of Bz. Mind the "-"-sign.
+                 */
+
+                /* By and Bx at the intra-cell offset of the By-field */
+                float_64 const byAtByPos = calcTWTSBy(pos[1], time);
+                float_64 const bxAtByPos = -calcTWTSBz_Ex(pos[1], time);
+                /* By and Bx at the intra-cell offset of the Bx-field */
+                float_64 const byAtBxPos = calcTWTSBy(pos[0], time);
+                float_64 const bxAtBxPos = -calcTWTSBz_Ex(pos[0], time);
+
+                /* The position vectors were rotated before calcTWTSBy and calcTWTSBz_Ex were
+                 * called, hence the resulting B-field vector has to be rotated back. Here the
+                 * rotation is done analogously in the (y,x)-plane (reverse of the position
+                 * vector transformation):
+                 *
+                 * RotationMatrix[-(PI / 2+phi)].(By,Bx)
+                 */
+                float_X sinPhi;
+                float_X cosPhi;
+                pmacc::math::sincos(phi, sinPhi, cosPhi);
+                float_X const rotatedBy = -sinPhi * float_X(byAtByPos) + cosPhi * float_X(bxAtByPos);
+                float_X const rotatedBx = -cosPhi * float_X(byAtBxPos) - sinPhi * float_X(bxAtBxPos);
+
+                /* Finally, the B-field normalized to the peak amplitude. */
+                return float3_X(rotatedBx, rotatedBy, 0.0_X);
+            }
+
+            /** B-field of the "in-plane" polarized TWTS pulse for 2D simulations
+             *  (taken for LINEAR_YZ in operator()).
+             *
+             * NOTE(review): the 3D counterpart of this function evaluates calcTWTSBz_Ey, while
+             * this 2D variant evaluates calcTWTSBz_Ex. Please confirm that this is intended.
+             *
+             * @param bFieldPositions_SI one 2D position per field component; entries 0, 1 and 2
+             *                           are used as the Bx, By and Bz evaluation points
+             * @param time time handed on to calcTWTSBz_Ex / calcTWTSBx
+             * @return (Bx, By, Bz) with Bx and By back-rotated by phi */
+            template<>
+            HDINLINE float3_X BField::getTWTSBfield_Normalized_Ey<DIM2>(
+                pmacc::math::Vector<floatD_64, detail::numComponents> const& bFieldPositions_SI,
+                float_64 const time) const
+            {
+                using PosVecVec = pmacc::math::Vector<float3_64, detail::numComponents>;
+                PosVecVec pos(PosVecVec::create(float3_64::create(0.0)));
+
+                /* The 2D output of getFieldPositions_SI only carries the y- and z-component of
+                 * a 3D vector; the x-slot keeps its initial value 0.
+                 */
+                for(uint32_t comp = 0; comp < detail::numComponents; ++comp)
+                {
+                    for(uint32_t axis2D = 0; axis2D < DIM2; ++axis2D)
+                    {
+                        pos[comp][axis2D + 1] = bFieldPositions_SI[comp][axis2D];
+                    }
+                }
+
+                /* Background for the remainder of this function:
+                 *
+                 * Corresponding position vector for the field components in 2D simulations.
+                 *  3D     3D vectors in 2D space (x, y)
+                 *  x -->  z (Meaning: In a 2D simulation, insert the cell-coordinate x
+                 *            into the TWTS field function coordinate z.)
+                 *  y -->  y
+                 *  z --> -x (Since z=0 for 2D, the existing 3D TWTS field function is used
+                 *            with x = -0.)
+                 *  Ex --> Ez (Meaning: Calculate the Ex-component of the existing 3D TWTS field
+                 *             to obtain the corresponding Ez-component in 2D.
+                 *             Note: the intra-cell position offset due to the staggered grid
+                 *                   for Ez.)
+                 *  By --> By
+                 *  Bz --> -Bx (Yes, the sign is necessary.)
+                 *
+                 * The procedure is analogous to the 3D case with By --> By and Bz --> -Bx. Hence
+                 * the grid cell offset of Bx has to be used instead of Bz. Mind the "-"-sign.
+                 */
+
+                /* Bx at the intra-cell offset of the By-field */
+                float_64 const bxAtByPos = -calcTWTSBz_Ex(pos[1], time);
+                /* Bx at the intra-cell offset of the Bx-field */
+                float_64 const bxAtBxPos = -calcTWTSBz_Ex(pos[0], time);
+
+                /* The position vectors were rotated before calcTWTSBz_Ex was called, hence the
+                 * resulting B-field vector has to be rotated back. Here the rotation is done
+                 * analogously in the (y,x)-plane (reverse of the position vector transformation):
+                 *
+                 * RotationMatrix[-(PI / 2+phi)].(By,Bx)
+                 */
+                float_X sinPhi;
+                float_X cosPhi;
+                pmacc::math::sincos(phi, sinPhi, cosPhi);
+                float_X const rotatedBy = +cosPhi * float_X(bxAtByPos);
+                float_X const rotatedBx = -sinPhi * float_X(bxAtBxPos);
+
+                /* Finally, the B-field normalized to the peak amplitude. The third component is
+                 * calcTWTSBx evaluated at the position of component 2 and is not rotated.
+                 */
+                return float3_X(rotatedBx, rotatedBy, float_X(calcTWTSBx(pos[2], time)));
+            }
+
+            /** Evaluate the TWTS background B-field for one cell at one time step.
+             *
+             * @param cellIdx cell index handed on to detail::getFieldPositions_SI
+             * @param currentStep time step; converted to a time via dt and shifted by tdelay
+             * @return result of the polarization-specific getTWTSBfield_Normalized* function */
+            HDINLINE
+            float3_X BField::operator()(DataSpace<simDim> const& cellIdx, uint32_t const currentStep) const
+            {
+                /* Positions of the individual B-field components belonging to this cell. */
+                traits::FieldPosition<fields::CellType, FieldB> const bComponentOffsets;
+                pmacc::math::Vector<floatD_64, detail::numComponents> const positions_SI
+                    = detail::getFieldPositions_SI(
+                        cellIdx,
+                        halfSimSize,
+                        bComponentOffsets(),
+                        unit_length,
+                        focus_y_SI,
+                        phi);
+
+                float_64 const shiftedTime_SI = float_64(currentStep) * dt - tdelay;
+
+                /* Single TWTS-Pulse: only LINEAR_YZ uses the "in-plane" variant. LINEAR_X and,
+                 * defensively, every other value use the default variant.
+                 */
+                if(pol == LINEAR_YZ)
+                    return getTWTSBfield_Normalized_Ey<simDim>(positions_SI, shiftedTime_SI);
+
+                return getTWTSBfield_Normalized<simDim>(positions_SI, shiftedTime_SI);
+            }
+
+            /** Calculate the By(r,t) field here
+             *
+             * Position and time are first shifted by a whole number of periods (in double
+             * precision) so that the closed-form field expression can be evaluated with
+             * float_T on a finite coordinate range.
+             *
+             * @param pos Spatial position of the target field.
+             * @param time Absolute time (SI, including all offsets and transformations)
+             *             for calculating the field
+             * @return Real part of the complex field expression divided by UNIT_SPEED */
+            HDINLINE
+            BField::float_T BField::calcTWTSBy(float3_64 const& pos, float_64 const time) const
+            {
+                using complex_T = pmacc::math::Complex<float_T>;
+
+                /* Propagation speed of overlap normalized to the speed of light [Default: beta0=1.0] */
+                float_T const beta0 = float_T(beta_0);
+                /* If phi < 0 the formulas below are not directly applicable.
+                 * Instead phi is taken positive, but the entire pulse rotated by 180 deg around the
+                 * z-axis of the coordinate system in this function.
+                 */
+                float_T const phiReal = float_T(math::abs(phi));
+                float_T sinPhiReal;
+                float_T cosPhiReal;
+                pmacc::math::sincos(phiReal, sinPhiReal, cosPhiReal);
+                float_T const alphaTilt = math::atan2(float_T(1.0) - beta0 * cosPhiReal, beta0 * sinPhiReal);
+                /* Definition of the laser pulse front tilt angle for the laser field below.
+                 *
+                 * For beta0=1.0, this is equivalent to our standard definition. Question: Why is the
+                 * local "phiT" not equal in value to the object member "phi" or the local "phiReal"?
+                 * Because the standard TWTS pulse is defined for beta0 = 1.0 and in the coordinate-system
+                 * of the TWTS model phi is responsible for pulse front tilt and dispersion only. Hence
+                 * the dispersion will (although physically correct) be slightly off the ideal TWTS
+                 * pulse for beta0 != 1.0. This only shows that this TWTS pulse is primarily designed for
+                 * scenarios close to beta0 = 1.
+                 */
+                float_T const phiT = float_T(2.0) * alphaTilt;
+
+                /* Angle between the laser pulse front and the y-axis. Not used, but remains in code for
+                 * documentation purposes.
+                 * float_T const eta = float_T(PI/2) - (phiReal - alphaTilt);
+                 */
+
+                float_T const cspeed = float_T(SI::SPEED_OF_LIGHT_SI / UNIT_SPEED);
+                float_T const lambda0 = float_T(wavelength_SI / UNIT_LENGTH);
+                float_T const om0 = float_T(2.0 * PI) * cspeed / lambda0;
+                /* factor 2  in tauG arises from definition convention in laser formula */
+                float_T const tauG = float_T(pulselength_SI * 2.0 / UNIT_TIME);
+                /* w0 is wx here --> w0 could be replaced by wx */
+                float_T const w0 = float_T(w_x_SI / UNIT_LENGTH);
+                float_T const rho0 = float_T(PI * w0 * w0 / lambda0);
+                float_T const k = float_T(2.0 * PI / lambda0);
+
+                /* In order to calculate in single-precision and in order to account for errors in
+                 * the approximations far from the coordinate origin, we use the wavelength-periodicity and
+                 * the known propagation direction for realizing the laser pulse using relative coordinates
+                 * (i.e. from a finite coordinate range) only. All these quantities have to be calculated
+                 * in double precision.
+                 */
+                float_64 sinPhiVal;
+                float_64 cosPhiVal;
+                pmacc::math::sincos(precisionCast<float_64>(phi), sinPhiVal, cosPhiVal);
+                float_64 const tanAlpha = (1.0 - beta_0 * cosPhiVal) / (beta_0 * sinPhiVal);
+                float_64 const tanFocalLine = math::tan(PI / 2.0 - phi);
+                float_64 const deltaT = wavelength_SI / SI::SPEED_OF_LIGHT_SI * (1.0 + tanAlpha / tanFocalLine);
+                float_64 const deltaY = wavelength_SI / tanFocalLine;
+                float_64 const deltaZ = -wavelength_SI;
+                /* Number of whole periods contained in "time"; time, y and z are shifted by it. */
+                float_64 const numberOfPeriods = math::floor(time / deltaT);
+                float_T const timeMod = float_T(time - numberOfPeriods * deltaT);
+                float_T const yMod = float_T(pos.y() + numberOfPeriods * deltaY);
+                float_T const zMod = float_T(pos.z() + numberOfPeriods * deltaZ);
+
+                /* Internal units; x and y additionally carry the sign factor for phi < 0. */
+                float_T const x = float_T(phiPositive * pos.x() / UNIT_LENGTH);
+                float_T const y = float_T(phiPositive * yMod / UNIT_LENGTH);
+                float_T const z = float_T(zMod / UNIT_LENGTH);
+                float_T const t = float_T(timeMod / UNIT_TIME);
+
+                /* Calculating shortcuts for speeding up field calculation */
+                float_T sinPhi;
+                float_T cosPhi;
+                pmacc::math::sincos(phiT, sinPhi, cosPhi);
+                float_T const cscPhi = float_T(1.0) / sinPhi;
+                float_T const secPhi2 = float_T(1.0) / math::cos(phiT / float_T(2.0));
+                float_T const sinPhi2 = math::sin(phiT / float_T(2.0));
+                float_T const sin2Phi = math::sin(phiT * float_T(2.0));
+                float_T const tanPhi2 = math::tan(phiT / float_T(2.0));
+
+                float_T const sinPhi_2 = sinPhi * sinPhi;
+                float_T const sinPhi_3 = sinPhi * sinPhi_2;
+                float_T const sinPhi_4 = sinPhi_2 * sinPhi_2;
+
+                float_T const sinPhi2_2 = sinPhi2 * sinPhi2;
+                float_T const sinPhi2_4 = sinPhi2_2 * sinPhi2_2;
+                float_T const tanPhi2_2 = tanPhi2 * tanPhi2;
+
+                float_T const tauG2 = tauG * tauG;
+                float_T const x2 = x * x;
+                float_T const y2 = y * y;
+                float_T const z2 = z * z;
+
+                /* The "helpVar" variables decrease the nesting level of the evaluated expressions and
+                 * thus help with formal code verification through manual code inspection.
+                 */
+                const complex_T helpVar1 = cspeed * om0 * tauG2 * sinPhi_4
+                    - complex_T(0, 8) * sinPhi2_4 * sinPhi * (y * cosPhi + z * sinPhi);
+
+                const complex_T helpVar2 = complex_T(0, 1) * rho0 - y * cosPhi - z * sinPhi;
+
+                /* All real literals are written as float_T so that the expression is evaluated in
+                 * the precision of the class (and not in that of the <cmath> typedef float_t).
+                 */
+                const complex_T helpVar3
+                    = (complex_T(0, float_T(-0.5)) * cscPhi
+                       * (complex_T(0, -8) * om0 * y * (cspeed * t - z) * sinPhi2_2 * sinPhi_4
+                              * (complex_T(0, 1) * rho0 - z * sinPhi)
+                          - om0 * sinPhi_4 * sinPhi
+                              * (-float_T(2.0) * z2 * rho0
+                                 - cspeed * cspeed
+                                     * (k * tauG2 * x2 + float_T(2.0) * t * (t - complex_T(0, 1) * om0 * tauG2) * rho0)
+                                 + cspeed * (4 * t * z * rho0 - complex_T(0, 2) * om0 * tauG2 * z * rho0)
+                                 - complex_T(0, 2) * (cspeed * t - z)
+                                     * (cspeed * (t - complex_T(0, 1) * om0 * tauG2) - z) * z * sinPhi)
+                          + y * sinPhi
+                              * (complex_T(0, 4) * om0 * y * (cspeed * t - z) * sinPhi2_2 * sinPhi_2
+                                 + om0 * (cspeed * t - z)
+                                     * (complex_T(0, 1) * cspeed * t + cspeed * om0 * tauG2 - complex_T(0, 1) * z)
+                                     * sinPhi_3
+                                 - complex_T(0, 4) * sinPhi2_4
+                                     * (cspeed * k * x2 - om0 * (y2 - float_T(4.0) * (cspeed * t - z) * z) * sinPhi))
+                              * sin2Phi
+                          - complex_T(0, 4) * sinPhi2_4
+                              * (complex_T(0, -4) * om0 * y * (cspeed * t - z) * rho0 * cosPhi * sinPhi_2
+                                 + complex_T(0, 2)
+                                     * (om0 * (y2 + float_T(2.0) * z2) * rho0
+                                        - cspeed * z * (complex_T(0, 1) * k * x2 + float_T(2.0) * om0 * t * rho0))
+                                     * sinPhi_3
+                                 - float_T(2.0) * om0 * z * (y2 - float_T(2.0) * (cspeed * t - z) * z) * sinPhi_4
+                                 + om0 * y2 * (cspeed * t - z) * sin2Phi * sin2Phi)))
+                    / (cspeed * helpVar2 * helpVar1);
+
+                complex_T const helpVar4 = cspeed * om0 * tauG * tauG
+                    - complex_T(0, 8) * y * math::tan(float_T(PI / 2.0) - phiT) * cscPhi * cscPhi * sinPhi2_4
+                    - complex_T(0, 2) * z * tanPhi2_2;
+
+                complex_T const result
+                    = (math::exp(helpVar3) * tauG * secPhi2 * secPhi2
+                       * (complex_T(0, 2) * cspeed * t + cspeed * om0 * tauG2 - complex_T(0, 4) * z
+                          + cspeed * (complex_T(0, 2) * t + om0 * tauG2) * cosPhi + complex_T(0, 2) * y * tanPhi2)
+                       * math::sqrt(cspeed * om0 * rho0 / helpVar2))
+                    / (float_T(2.0) * cspeed * math::pow(helpVar4, float_T(1.5)));
+
+                /* Only the real part of the complex field is returned. */
+                return result.get_real() / UNIT_SPEED;
+            }
+
+            /** Calculate the Bz(r,t) field
+             *
+             * Position and time are first shifted by a whole number of periods (in double
+             * precision) so that the closed-form field expression can be evaluated with
+             * float_T on a finite coordinate range. The result carries the sign factor
+             * phiPositive.
+             *
+             * @param pos Spatial position of the target field.
+             * @param time Absolute time (SI, including all offsets and transformations)
+             *             for calculating the field
+             * @return Real part of the complex field expression divided by UNIT_SPEED */
+            HDINLINE
+            BField::float_T BField::calcTWTSBz_Ex(float3_64 const& pos, float_64 const time) const
+            {
+                using complex_T = pmacc::math::Complex<float_T>;
+
+                /* propagation speed of overlap normalized to the speed of light [Default: beta0=1.0] */
+                float_T const beta0 = float_T(beta_0);
+                /* If phi < 0 the formulas below are not directly applicable.
+                 * Instead phi is taken positive, but the entire pulse rotated by 180 deg around the
+                 * z-axis of the coordinate system in this function.
+                 */
+                float_T const phiReal = float_T(math::abs(phi));
+                float_T sinPhiReal;
+                float_T cosPhiReal;
+                pmacc::math::sincos(phiReal, sinPhiReal, cosPhiReal);
+                float_T const alphaTilt = math::atan2(float_T(1.0) - beta0 * cosPhiReal, beta0 * sinPhiReal);
+
+                /* Definition of the laser pulse front tilt angle for the laser field below.
+                 *
+                 * For beta0=1.0, this is equivalent to our standard definition. Question: Why is the
+                 * local "phiT" not equal in value to the object member "phi" or the local "phiReal"?
+                 * Because the standard TWTS pulse is defined for beta0 = 1.0 and in the coordinate-system
+                 * of the TWTS model phi is responsible for pulse front tilt and dispersion only. Hence
+                 * the dispersion will (although physically correct) be slightly off the ideal TWTS
+                 * pulse for beta0 != 1.0. This only shows that this TWTS pulse is primarily designed for
+                 * scenarios close to beta0 = 1.
+                 */
+                float_T const phiT = float_T(2.0) * alphaTilt;
+
+                /* Angle between the laser pulse front and the y-axis.
+                 * Not used, but remains in code for documentation purposes.
+                 * float_T const eta = float_T(float_T(PI / 2)) - (phiReal - alphaTilt);
+                 */
+
+                float_T const cspeed = float_T(SI::SPEED_OF_LIGHT_SI / UNIT_SPEED);
+                float_T const lambda0 = float_T(wavelength_SI / UNIT_LENGTH);
+                float_T const om0 = float_T(2.0 * PI) * cspeed / lambda0;
+                /* factor 2  in tauG arises from definition convention in laser formula */
+                float_T const tauG = float_T(pulselength_SI * 2.0 / UNIT_TIME);
+                /* w0 is wx here --> w0 could be replaced by wx */
+                float_T const w0 = float_T(w_x_SI / UNIT_LENGTH);
+                float_T const rho0 = float_T(PI * w0 * w0 / lambda0);
+                float_T const k = float_T(2.0 * PI / lambda0);
+
+                /* In order to calculate in single-precision and in order to account for errors in
+                 * the approximations far from the coordinate origin, we use the wavelength-periodicity and
+                 * the known propagation direction for realizing the laser pulse using relative coordinates
+                 * (i.e. from a finite coordinate range) only. All these quantities have to be calculated
+                 * in double precision.
+                 */
+                float_64 sinPhiVal;
+                float_64 cosPhiVal;
+                pmacc::math::sincos(precisionCast<float_64>(phi), sinPhiVal, cosPhiVal);
+                float_64 const tanAlpha = (1.0 - beta_0 * cosPhiVal) / (beta_0 * sinPhiVal);
+                float_64 const tanFocalLine = math::tan(PI / 2.0 - phi);
+                float_64 const deltaT = wavelength_SI / SI::SPEED_OF_LIGHT_SI * (1.0 + tanAlpha / tanFocalLine);
+                float_64 const deltaY = wavelength_SI / tanFocalLine;
+                float_64 const deltaZ = -wavelength_SI;
+                /* Number of whole periods contained in "time"; time, y and z are shifted by it. */
+                float_64 const numberOfPeriods = math::floor(time / deltaT);
+                float_T const timeMod = float_T(time - numberOfPeriods * deltaT);
+                float_T const yMod = float_T(pos.y() + numberOfPeriods * deltaY);
+                float_T const zMod = float_T(pos.z() + numberOfPeriods * deltaZ);
+
+                /* Internal units; x and y additionally carry the sign factor for phi < 0. */
+                float_T const x = float_T(phiPositive * pos.x() / UNIT_LENGTH);
+                float_T const y = float_T(phiPositive * yMod / UNIT_LENGTH);
+                float_T const z = float_T(zMod / UNIT_LENGTH);
+                float_T const t = float_T(timeMod / UNIT_TIME);
+
+                /* Calculating shortcuts for speeding up field calculation */
+                float_T sinPhi;
+                float_T cosPhi;
+                pmacc::math::sincos(phiT, sinPhi, cosPhi);
+                float_T const cscPhi = float_T(1.0) / sinPhi;
+                float_T const secPhi2 = float_T(1.0) / math::cos(phiT / float_T(2.0));
+                float_T const sinPhi2 = math::sin(phiT / float_T(2.0));
+                float_T const tanPhi2 = math::tan(phiT / float_T(2.0));
+
+                float_T const cscPhi_3 = cscPhi * cscPhi * cscPhi;
+
+                float_T const sinPhi2_2 = sinPhi2 * sinPhi2;
+                float_T const sinPhi2_4 = sinPhi2_2 * sinPhi2_2;
+                float_T const tanPhi2_2 = tanPhi2 * tanPhi2;
+                float_T const secPhi2_2 = secPhi2 * secPhi2;
+
+                float_T const tanPI2_phi = math::tan(float_T(PI / 2.0) - phiT);
+
+                float_T const tauG2 = tauG * tauG;
+                float_T const om02 = om0 * om0;
+                float_T const x2 = x * x;
+                float_T const y2 = y * y;
+                float_T const z2 = z * z;
+
+                /* The "helpVar" variables decrease the nesting level of the evaluated expressions and
+                 * thus help with formal code verification through manual code inspection.
+                 *
+                 * Note: helpVar2 and helpVar7 are the same expression, as are helpVar1 and helpVar9.
+                 */
+                const complex_T helpVar1 = cspeed * om0 * tauG2 - complex_T(0, 1) * y * cosPhi * secPhi2_2 * tanPhi2
+                    - complex_T(0, 2) * z * tanPhi2_2;
+                const complex_T helpVar2 = complex_T(0, 1) * cspeed * rho0 - cspeed * y * cosPhi - cspeed * z * sinPhi;
+                const complex_T helpVar3 = rho0 + complex_T(0, 1) * y * cosPhi + complex_T(0, 1) * z * sinPhi;
+                const complex_T helpVar4 = complex_T(0, 1) * rho0 - y * cosPhi - z * sinPhi;
+                const complex_T helpVar5 = -z - y * tanPI2_phi + complex_T(0, 1) * rho0 * cscPhi;
+                const complex_T helpVar6
+                    = -cspeed * z - cspeed * y * tanPI2_phi + complex_T(0, 1) * cspeed * rho0 * cscPhi;
+                const complex_T helpVar7 = complex_T(0, 1) * cspeed * rho0 - cspeed * y * cosPhi - cspeed * z * sinPhi;
+
+                /* Argument of the exponential in the final field expression. */
+                const complex_T helpVar8
+                    = (om0 * y * rho0 * secPhi2_2 * secPhi2_2 / helpVar6
+                       + (om0 * y * tanPI2_phi
+                          * (cspeed * om0 * tauG2
+                             + float_T(8.0) * (complex_T(0, 2) * y + rho0) * cscPhi_3 * sinPhi2_4))
+                           / (cspeed * helpVar5)
+                       + om02 * tauG2 * z * sinPhi / helpVar4 - float_T(2.0) * k * x2 / helpVar3
+                       - om02 * tauG2 * rho0 / helpVar3
+                       + complex_T(0, 1) * om0 * y2 * cosPhi * cosPhi * secPhi2_2 * tanPhi2 / helpVar2
+                       + complex_T(0, 4) * om0 * y * z * tanPhi2_2 / helpVar2
+                       - float_T(2.0) * om0 * z * rho0 * tanPhi2_2 / helpVar2
+                       - complex_T(0, 2) * om0 * z2 * sinPhi * tanPhi2_2 / helpVar2
+                       - (om0
+                          * math::pow(
+                              float_T(2.0) * cspeed * t - complex_T(0, 1) * cspeed * om0 * tauG2 - float_T(2.0) * z
+                                  + float_T(8.0) * y * cscPhi_3 * sinPhi2_4 - float_T(2.0) * z * tanPhi2_2,
+                              float_T(2.0)))
+                           / (cspeed * helpVar1))
+                    / float_T(4.0);
+
+                const complex_T helpVar9 = cspeed * om0 * tauG2 - complex_T(0, 1) * y * cosPhi * secPhi2_2 * tanPhi2
+                    - complex_T(0, 2) * z * tanPhi2_2;
+
+                /* The sign factor phiPositive flips the field for phi < 0. */
+                const complex_T result = float_T(phiPositive)
+                    * (complex_T(0, 2) * math::exp(helpVar8) * tauG * tanPhi2 * (cspeed * t - z + y * tanPhi2)
+                       * math::sqrt(om0 * rho0 / helpVar7))
+                    / math::pow(helpVar9, float_T(1.5));
+
+                /* Only the real part of the complex field is returned. */
+                return result.get_real() / UNIT_SPEED;
+            }
+
+            /** Evaluate the Bx(r,t) field by delegating to the By(r,t) calculation.
+             *
+             * @param pos Spatial position of the target field.
+             * @param time Absolute time (SI, including all offsets and transformations)
+             *             for calculating the field
+             * @return calcTWTSBy(pos, time) with inverted sign */
+            HDINLINE
+            BField::float_T BField::calcTWTSBx(float3_64 const& pos, float_64 const time) const
+            {
+                /* Bx belonging to an Ey-field equals By belonging to an Ex-field, apart from the sign. */
+                auto const byOfExField = calcTWTSBy(pos, time);
+                return -byOfExField;
+            }
+
+            /** Calculate the Bz(r,t) field
+             *
+             * @param pos Spatial position of the target field.
+             * @param time Absolute time (SI, including all offsets and transformations)
+             * @return Real part of the complex field expression, divided by UNIT_SPEED */
+            HDINLINE
+            BField::float_T BField::calcTWTSBz_Ey(float3_64 const& pos, float_64 const time) const
+            {
+                using complex_T = pmacc::math::Complex<float_T>;
+                using complex_64 = pmacc::math::Complex<float_64>;
+
+                /* Propagation speed of overlap normalized to the speed of light [Default: beta0=1.0] */
+                float_T const beta0 = float_T(beta_0);
+                /* If phi < 0 the formulas below are not directly applicable.
+                 * Instead phi is taken positive, but the entire pulse rotated by 180 deg around the
+                 * z-axis of the coordinate system in this function.
+                 */
+                float_T const phiReal = float_T(math::abs(phi));
+                float_T cosPhiReal;
+                float_T sinPhiReal;
+                pmacc::math::sincos(phiReal, sinPhiReal, cosPhiReal);
+                float_T const alphaTilt = math::atan2(float_T(1.0) - beta0 * cosPhiReal, beta0 * sinPhiReal);
+                /* Definition of the laser pulse front tilt angle for the laser field below.
+                 *
+                 * For beta0=1.0, this is equivalent to our standard definition. Question: Why is the
+                 * local "phiT" not equal in value to the local "phiReal" or the object member "phi"?
+                 * Because the standard TWTS pulse is defined for beta0 = 1.0 and in the coordinate-system
+                 * of the TWTS model phi is responsible for pulse front tilt and dispersion only. Hence
+                 * the dispersion will (although physically correct) be slightly off the ideal TWTS
+                 * pulse for beta0 != 1.0. This only shows that this TWTS pulse is primarily designed for
+                 * scenarios close to beta0 = 1.
+                 */
+                float_T const phiT = float_T(2.0) * alphaTilt;
+
+                /* Angle between the laser pulse front and the y-axis.
+                 * Not used, but remains in code for documentation purposes.
+                 * float_T const eta = float_T(PI / 2) - (phiReal - alphaTilt);
+                 */
+
+                float_T const cspeed = float_T(SI::SPEED_OF_LIGHT_SI / UNIT_SPEED);
+                float_T const lambda0 = float_T(wavelength_SI / UNIT_LENGTH);
+                float_T const om0 = float_T(2.0 * PI) * cspeed / lambda0;
+                /* Factor 2 in tauG arises from the definition convention in the laser formula */
+                float_T const tauG = float_T(pulselength_SI * 2.0 / UNIT_TIME);
+                /* w0 is wx here --> w0 could be replaced by wx */
+                float_T const w0 = float_T(w_x_SI / UNIT_LENGTH);
+                float_T const rho0 = float_T(PI * w0 * w0 / lambda0);
+                float_T const k = float_T(2.0 * PI / lambda0);
+
+                /* In order to calculate in single-precision and in order to account for errors in
+                 * the approximations far from the coordinate origin, we use the wavelength-periodicity and
+                 * the known propagation direction for realizing the laser pulse using relative coordinates
+                 * (i.e. from a finite coordinate range) only. All these quantities have to be calculated
+                 * in double precision.
+                 */
+                float_64 sinPhiVal;
+                float_64 cosPhiVal;
+                pmacc::math::sincos(precisionCast<float_64>(phi), sinPhiVal, cosPhiVal);
+                float_64 const tanAlpha = (1.0 - beta_0 * cosPhiVal) / (beta_0 * sinPhiVal);
+                float_64 const tanFocalLine = math::tan(PI / 2.0 - phi);
+                float_64 const deltaT = wavelength_SI / SI::SPEED_OF_LIGHT_SI * (1.0 + tanAlpha / tanFocalLine);
+                float_64 const deltaY = wavelength_SI / tanFocalLine;
+                float_64 const deltaZ = -wavelength_SI;
+                float_64 const numberOfPeriods = math::floor(time / deltaT);
+                float_T const timeMod = float_T(time - numberOfPeriods * deltaT);
+                float_T const yMod = float_T(pos.y() + numberOfPeriods * deltaY);
+                float_T const zMod = float_T(pos.z() + numberOfPeriods * deltaZ);
+
+                float_T const x = float_T(phiPositive * pos.x() / UNIT_LENGTH);
+                float_T const y = float_T(phiPositive * yMod / UNIT_LENGTH);
+                float_T const z = float_T(zMod / UNIT_LENGTH);
+                float_T const t = float_T(timeMod / UNIT_TIME);
+
+                /* Shortcuts (trigonometric terms of phiT and their powers) for speeding up the field calculation. */
+                float_T sinPhi;
+                float_T cosPhi;
+                pmacc::math::sincos(phiT, sinPhi, cosPhi);
+                float_T const sin2Phi = math::sin(phiT * float_T(2.0));
+                float_T const sinPhi2 = math::sin(phiT / float_T(2.0));
+                float_T const tanPhi2 = math::tan(phiT / float_T(2.0));
+
+                float_T const cscPhi = float_T(1.0) / sinPhi;
+                float_T const tanPI2_phi = math::tan(float_T(PI / 2.0) - phiT);
+
+                float_T const sinPhi_2 = sinPhi * sinPhi;
+                float_T const sinPhi_4 = sinPhi_2 * sinPhi_2;
+                float_T const sinPhi2_2 = sinPhi2 * sinPhi2;
+                float_T const sinPhi2_4 = sinPhi2_2 * sinPhi2_2;
+                float_T const tanPhi2_2 = tanPhi2 * tanPhi2;
+
+                float_T const tauG2 = tauG * tauG;
+
+                float_T const x2 = x * x;
+                float_T const y2 = y * y;
+                float_T const z2 = z * z;
+
+                /* The "helpVar" variables decrease the nesting level of the evaluated expressions and
+                 * thus help with formal code verification through manual code inspection.
+                 */
+                const complex_T helpVar1 = cspeed * om0 * tauG2 * sinPhi_4
+                    - complex_T(0, 8) * sinPhi2_4 * sinPhi * (y * cosPhi + z * sinPhi);
+
+                const complex_T helpVar2 = complex_T(0, 1) * rho0 - y * cosPhi - z * sinPhi;
+
+                const complex_T helpVar3
+                    = (complex_T(0, float_T(-0.5)) * cscPhi
+                       * (complex_T(0, -8) * om0 * y * (cspeed * t - z) * sinPhi2_2 * sinPhi_4
+                              * (complex_T(0, 1) * rho0 - z * sinPhi)
+                          - om0 * sinPhi * sinPhi_4
+                              * (float_T(-2.0) * z2 * rho0
+                                 - cspeed * cspeed
+                                     * (k * tauG2 * x2 + float_T(2.0) * t * (t - complex_T(0, 1) * om0 * tauG2) * rho0)
+                                 + cspeed * (float_T(4.0) * t * z * rho0 - complex_T(0, 2) * om0 * tauG2 * z * rho0)
+                                 - complex_T(0, 2) * (cspeed * t - z)
+                                     * (cspeed * (t - complex_T(0, 1) * om0 * tauG2) - z) * z * sinPhi)
+                          + float_T(2.0) * y * cosPhi * sinPhi_2
+                              * (complex_T(0, 4) * om0 * y * (cspeed * t - z) * sinPhi2_2 * sinPhi_2
+                                 + om0 * (cspeed * t - z)
+                                     * (complex_T(0, 1) * cspeed * t + cspeed * om0 * tauG2 - complex_T(0, 1) * z)
+                                     * sinPhi_2 * sinPhi
+                                 - complex_T(0, 4) * sinPhi2_4
+                                     * (cspeed * k * x2 - om0 * (y2 - float_T(4.0) * (cspeed * t - z) * z) * sinPhi))
+                          - complex_T(0, 4) * sinPhi2_4
+                              * (complex_T(0, -4) * om0 * y * (cspeed * t - z) * rho0 * cosPhi * sinPhi_2
+                                 + complex_T(0, 2)
+                                     * (om0 * (y2 + float_T(2.0) * z2) * rho0
+                                        - cspeed * z * (complex_T(0, 1) * k * x2 + float_T(2.0) * om0 * t * rho0))
+                                     * sinPhi_2 * sinPhi
+                                 - float_T(2.0) * om0 * z * (y2 - float_T(2.0) * (cspeed * t - z) * z) * sinPhi_4
+                                 + om0 * y2 * (cspeed * t - z) * sin2Phi * sin2Phi))
+                       /* The "round-trip" conversion (complex_T -> complex_64 -> complex_T) of the final factor
+                        * below fixes a gross accuracy bug in floating-point arithmetic when float_T is float_X.
+                        */
+                       )
+                    * complex_T(1.0 / complex_64(cspeed * helpVar2 * helpVar1));
+
+                const complex_T helpVar4 = cspeed * om0 * rho0
+                    * (cspeed * om0 * tauG2 - complex_T(0, 8) * y * tanPI2_phi * cscPhi * cscPhi * sinPhi2_4
+                       - complex_T(0, 2) * z * tanPhi2_2);
+
+                const complex_T result = float_T(-1.0)
+                    * (cspeed * math::exp(helpVar3) * k * tauG * x * rho0
+                       * math::pow(float_T(1.0) / helpVar2, float_T(1.5)))
+                    / math::sqrt(helpVar4);
+
+                return result.get_real() / UNIT_SPEED;
+            }
+
+        } /* namespace twtsfast */
+    } /* namespace templates */
+} /* namespace picongpu */
diff --git a/include/picongpu/fields/background/templates/twtsfast/EField.hpp b/include/picongpu/fields/background/templates/twtsfast/EField.hpp
new file mode 100644
index 0000000000..43c39802ae
--- /dev/null
+++ b/include/picongpu/fields/background/templates/twtsfast/EField.hpp
@@ -0,0 +1,166 @@
+/* Copyright 2014-2021 Alexander Debus, Axel Huebl
+ *
+ * This file is part of PIConGPU.
+ *
+ * PIConGPU is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * PIConGPU is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with PIConGPU.
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+
+#pragma once
+
+#include <pmacc/types.hpp>
+
+#include <pmacc/math/Vector.hpp>
+#include <pmacc/dimensions/DataSpace.hpp>
+#include "picongpu/fields/background/templates/twtsfast/numComponents.hpp"
+
+namespace picongpu
+{
+    /* Load pre-defined background field */
+    namespace templates
+    {
+        /* Traveling-wave Thomson scattering laser pulse */
+        namespace twtsfast
+        {
+            class EField
+            {
+            public:
+                using float_T = float_X;
+
+                enum PolarizationType
+                {
+                    /** The linear polarization of the TWTS laser is defined
+                     *  relative to the plane of the pulse front tilt.
+                     *
+                     *  Polarization is normal to the reference plane.
+                     *  Use Ex-fields (and corresponding B-fields) in TWTS laser internal coordinate system.
+                     */
+                    LINEAR_X = 1u,
+                    /** Polarization lies within the reference plane.
+                     *  Use Ey-fields (and corresponding B-fields) in TWTS laser internal coordinate system.
+                     */
+                    LINEAR_YZ = 2u,
+                };
+
+                /** Center of simulation volume in number of cells */
+                PMACC_ALIGN(halfSimSize, DataSpace<simDim>);
+                /** y-position of TWTS coordinate origin inside the simulation coordinates [meter]
+                    The other origin coordinates (x and z) default to globally centered values
+                    with respect to the simulation volume. */
+                PMACC_ALIGN(focus_y_SI, float_64 const);
+                /** Laser wavelength [meter] */
+                PMACC_ALIGN(wavelength_SI, float_64 const);
+                /** TWTS laser pulse duration [second] */
+                PMACC_ALIGN(pulselength_SI, float_64 const);
+                /** line focus height of TWTS pulse [meter] */
+                PMACC_ALIGN(w_x_SI, float_64 const);
+                /** interaction angle between TWTS laser propagation vector and the y-axis [rad] */
+                PMACC_ALIGN(phi, float_X const);
+                /** Takes value 1.0 for phi > 0 and -1.0 for phi < 0. */
+                PMACC_ALIGN(phiPositive, float_X);
+                /** propagation speed of TWTS laser overlap
+                normalized to the speed of light. [Default: beta0=1.0] */
+                PMACC_ALIGN(beta_0, float_X const);
+                /** If auto_tdelay=FALSE, then a user defined delay is used. [second] */
+                PMACC_ALIGN(tdelay_user_SI, float_64 const);
+                /** Make time step constant accessible to device. */
+                PMACC_ALIGN(dt, float_64 const);
+                /** Make length normalization constant accessible to device. */
+                PMACC_ALIGN(unit_length, float_64 const);
+                /** TWTS laser time delay */
+                PMACC_ALIGN(tdelay, float_64);
+                /** Should the TWTS laser delay be chosen automatically, such that
+                 *  the laser gradually enters the simulation volume? [Default: TRUE]
+                 */
+                PMACC_ALIGN(auto_tdelay, bool const);
+                /** Polarization of TWTS laser */
+                PMACC_ALIGN(pol, PolarizationType const);
+
+                /** Electric field of the TWTS laser
+                 *
+                 * @param focus_y_SI the distance to the laser focus in y-direction [m]
+                 * @param wavelength_SI central wavelength [m]
+                 * @param pulselength_SI sigma of std. gauss for intensity (E^2),
+                 *  pulselength_SI = FWHM_of_Intensity / 2.35482 [seconds (sigma)]
+                 * @param w_x_SI beam waist: distance from the axis where the pulse electric field
+                 *  decreases to its 1/e^2-th part at the focus position of the laser [m]
+                 * @param phi interaction angle between TWTS laser propagation vector and
+                 *  the y-axis [rad, default = 90.*(PI/180.)]
+                 * @param beta_0 propagation speed of overlap normalized to
+                 *  the speed of light [c, default = 1.0]
+                 * @param tdelay_user_SI manual time delay if auto_tdelay is false
+                 * @param auto_tdelay calculate the time delay such that the TWTS pulse is not
+                 *  inside the simulation volume at simulation start timestep = 0 [default = true]
+                 * @param pol determines the TWTS laser polarization, which is either normal or parallel
+                 *  to the laser pulse front tilt plane [ default= LINEAR_X , LINEAR_YZ ]
+                 */
+                HINLINE
+                EField(
+                    float_64 const focus_y_SI,
+                    float_64 const wavelength_SI,
+                    float_64 const pulselength_SI,
+                    float_64 const w_x_SI,
+                    float_X const phi = 90. * (PI / 180.),
+                    float_X const beta_0 = 1.0,
+                    float_64 const tdelay_user_SI = 0.0,
+                    bool const auto_tdelay = true,
+                    PolarizationType const pol = LINEAR_X);
+
+                /** Specify your background field E(r,t) here
+                 *
+                 * @param cellIdx The total cell id counted from the start at timestep 0.
+                 * @param currentStep The current time step
+                 * @return float3_X with field normalized to amplitude in range [-1.:1.]
+                 */
+                HDINLINE float3_X operator()(DataSpace<simDim> const& cellIdx, uint32_t const currentStep) const;
+
+                /** Calculate the Ex(r,t) field here (electric field vector normal to pulse-front-tilt plane)
+                 *
+                 * @param pos Spatial position of the target field
+                 * @param time Absolute time (SI, including all offsets and transformations)
+                 *  for calculating the field
+                 * @return Ex-field component of the non-rotated TWTS field in SI units */
+                HDINLINE float_T calcTWTSEx(float3_64 const& pos, float_64 const time) const;
+
+                /** Calculate the Ey(r,t) field here (electric field vector in pulse-front-tilt plane)
+                 *
+                 * @param pos Spatial position of the target field
+                 * @param time Absolute time (SI, including all offsets and transformations)
+                 *  for calculating the field
+                 * @return Ey-field component of the non-rotated TWTS field in SI units */
+                HDINLINE float_T calcTWTSEy(float3_64 const& pos, float_64 const time) const;
+
+                /** Calculate the E-field vector of the TWTS laser in SI units.
+                 * @tparam T_dim Specializes for the simulation dimension
+                 * @param eFieldPositions_SI Positions at which the field is evaluated at the given time
+                 * @return Efield vector of the rotated TWTS field in SI units */
+                template<unsigned T_dim>
+                HDINLINE float3_X getTWTSEfield_Normalized(
+                    pmacc::math::Vector<floatD_64, detail::numComponents> const& eFieldPositions_SI,
+                    float_64 const time) const;
+
+                /** Calculate the E-field vector of the "in-plane polarized" TWTS laser in SI units.
+                 * @tparam T_dim Specializes for the simulation dimension
+                 * @param eFieldPositions_SI Positions at which the field is evaluated at the given time
+                 * @return Efield vector of the rotated TWTS field in SI units */
+                template<unsigned T_dim>
+                HDINLINE float3_X getTWTSEfield_Normalized_Ey(
+                    pmacc::math::Vector<floatD_64, detail::numComponents> const& eFieldPositions_SI,
+                    float_64 const time) const;
+            };
+
+        } /* namespace twtsfast */
+    } /* namespace templates */
+} /* namespace picongpu */
diff --git a/include/picongpu/fields/background/templates/twtsfast/EField.tpp b/include/picongpu/fields/background/templates/twtsfast/EField.tpp
new file mode 100644
index 0000000000..16e4d283b6
--- /dev/null
+++ b/include/picongpu/fields/background/templates/twtsfast/EField.tpp
@@ -0,0 +1,358 @@
+/* Copyright 2014-2021 Alexander Debus, Axel Huebl
+ *
+ * This file is part of PIConGPU.
+ *
+ * PIConGPU is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * PIConGPU is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with PIConGPU.
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+
+#pragma once
+
+#include <pmacc/types.hpp>
+#include "picongpu/simulation_defines.hpp"
+
+#include <pmacc/math/Vector.hpp>
+#include <pmacc/dimensions/DataSpace.hpp>
+#include <pmacc/mappings/simulation/SubGrid.hpp>
+#include <pmacc/math/Complex.hpp>
+
+#include "picongpu/fields/background/templates/twtsfast/RotateField.tpp"
+#include "picongpu/fields/background/templates/twtsfast/GetInitialTimeDelay_SI.tpp"
+#include "picongpu/fields/background/templates/twtsfast/getFieldPositions_SI.tpp"
+#include "picongpu/fields/background/templates/twtsfast/EField.hpp"
+#include "picongpu/fields/CellType.hpp"
+
+namespace picongpu
+{
+    /* Load pre-defined background field */
+    namespace templates
+    {
+        /* Traveling-wave Thomson scattering laser pulse */
+        namespace twtsfast
+        {
+            /* Host-side constructor; the parameters are documented at the declaration in EField.hpp.
+             * phiPositive is initialized right after phi, i.e. in the declaration order of the members. */
+            HINLINE
+            EField::EField(
+                float_64 const focus_y_SI,
+                float_64 const wavelength_SI,
+                float_64 const pulselength_SI,
+                float_64 const w_x_SI,
+                float_X const phi,
+                float_X const beta_0,
+                float_64 const tdelay_user_SI,
+                bool const auto_tdelay,
+                PolarizationType const pol)
+                : focus_y_SI(focus_y_SI)
+                , wavelength_SI(wavelength_SI)
+                , pulselength_SI(pulselength_SI)
+                , w_x_SI(w_x_SI)
+                , phi(phi)
+                , phiPositive(phi < 0.0_X ? float_X(-1.0) : float_X(1.0))
+                , beta_0(beta_0)
+                , tdelay_user_SI(tdelay_user_SI)
+                , dt(SI::DELTA_T_SI)
+                , unit_length(UNIT_LENGTH)
+                , auto_tdelay(auto_tdelay)
+                , pol(pol)
+            {
+                /* Note: Environment-objects cannot be instantiated on CUDA GPU device. Since this is done
+                         on host (see fieldBackground.param), this is no problem.
+                 */
+                SubGrid<simDim> const& subGrid = Environment<simDim>::get().SubGrid();
+                halfSimSize = subGrid.getGlobalDomain().size / 2;
+                tdelay = detail::getInitialTimeDelay_SI(
+                    auto_tdelay,
+                    tdelay_user_SI,
+                    halfSimSize,
+                    pulselength_SI,
+                    focus_y_SI,
+                    phi,
+                    beta_0);
+            }
+
+            template<>
+            HDINLINE float3_X EField::getTWTSEfield_Normalized<DIM3>(
+                pmacc::math::Vector<floatD_64, detail::numComponents> const& eFieldPositions_SI,
+                float_64 const time) const
+            {
+                float3_64 exPosition(float3_64::create(0.0));
+                for(uint32_t axis = 0; axis < simDim; ++axis)
+                    exPosition[axis] = eFieldPositions_SI[0][axis];
+                return float3_X(float_X(calcTWTSEx(exPosition, time)), 0.0_X, 0.0_X);
+            }
+
+            template<>
+            HDINLINE float3_X EField::getTWTSEfield_Normalized_Ey<DIM3>(
+                pmacc::math::Vector<floatD_64, detail::numComponents> const& eFieldPositions_SI,
+                float_64 const time) const
+            {
+                /* Copy the simDim-component input positions into zero-initialized 3D vectors. */
+                using Positions3D = pmacc::math::Vector<float3_64, detail::numComponents>;
+                Positions3D positions(Positions3D::create(float3_64::create(0.0)));
+
+                for(uint32_t component = 0; component < detail::numComponents; ++component)
+                {
+                    for(uint32_t axis = 0; axis < simDim; ++axis)
+                        positions[component][axis] = eFieldPositions_SI[component][axis];
+                }
+
+                /* The same Ey-expression is sampled twice: once with the intra-cell offset of the
+                 * Ey-field (index 1) and once with the intra-cell offset of the Ez-field (index 2).
+                 */
+                float_64 const eyAtEyOffset = calcTWTSEy(positions[1], time);
+                float_64 const eyAtEzOffset = calcTWTSEy(positions[2], time);
+
+                /* The position vectors were rotated before calcTWTSEy was called, hence the resulting
+                 * E-field vector is rotated back via RotationMatrix[-(PI/2+phi)].(Ey,Ez).
+                 */
+                float_X cosPhi;
+                float_X sinPhi;
+                pmacc::math::sincos(phi, sinPhi, cosPhi);
+                float_X const rotatedEy = -sinPhi * float_X(eyAtEyOffset);
+                float_X const rotatedEz = -cosPhi * float_X(eyAtEzOffset);
+
+                /* E-field normalized to the peak amplitude; the x-component vanishes. */
+                return float3_X(0.0_X, rotatedEy, rotatedEz);
+            }
+
+            template<>
+            HDINLINE float3_X EField::getTWTSEfield_Normalized<DIM2>(
+                pmacc::math::Vector<floatD_64, detail::numComponents> const& eFieldPositions_SI,
+                float_64 const time) const
+            {
+                /* In 2D the Ex-field is mapped to Ez: take the Ez grid offset and embed (y,z) into (x,y,z). */
+                float3_64 ezPosition(float3_64::create(0.0));
+                for(uint32_t axis = 1; axis <= DIM2; ++axis)
+                    ezPosition[axis] = eFieldPositions_SI[2][axis - 1];
+                float_X const fieldValue = float_X(calcTWTSEx(ezPosition, time));
+                return float3_X(0.0_X, 0.0_X, fieldValue);
+            }
+
+            template<>
+            HDINLINE float3_X EField::getTWTSEfield_Normalized_Ey<DIM2>(
+                pmacc::math::Vector<floatD_64, detail::numComponents> const& eFieldPositions_SI,
+                float_64 const time) const
+            {
+                using Positions3D = pmacc::math::Vector<float3_64, detail::numComponents>;
+                Positions3D positions(Positions3D::create(float3_64::create(0.0)));
+
+                /* The 2D output of getFieldPositions_SI only returns the y- and z-component
+                 * of a 3D vector, so the two input components are stored at the indices 1 and 2
+                 * of the zero-initialized 3D position.
+                 */
+                for(uint32_t component = 0; component < detail::numComponents; ++component)
+                {
+                    for(uint32_t axis = 1; axis <= DIM2; ++axis)
+                        positions[component][axis] = eFieldPositions_SI[component][axis - 1];
+                }
+
+                /* Ey->Ey, but grid cell offsets for Ex and Ey have to be used:
+                 * the first sample takes the intra-cell offset of the Ey-field (index 1),
+                 * the second sample takes the intra-cell offset of the Ex-field (index 0).
+                 */
+                float_64 const eyAtEyOffset = calcTWTSEy(positions[1], time);
+                float_64 const eyAtExOffset = calcTWTSEy(positions[0], time);
+
+                /* The position vectors were rotated before calcTWTSEy was called, hence the
+                 * resulting E-field vector has to be rotated back.
+                 *
+                 * RotationMatrix[-(PI / 2+phi)].(Ey,Ex) performs this back-rotation.
+                 */
+                float_X cosPhi;
+                float_X sinPhi;
+                pmacc::math::sincos(phi, sinPhi, cosPhi);
+                float_X const rotatedEy = -sinPhi * float_X(eyAtEyOffset);
+                float_X const rotatedEx = -cosPhi * float_X(eyAtExOffset);
+
+                /* E-field normalized to the peak amplitude; the z-component vanishes. */
+                return float3_X(rotatedEx, rotatedEy, 0.0_X);
+            }
+
+            HDINLINE float3_X EField::operator()(DataSpace<simDim> const& cellIdx, uint32_t const currentStep) const
+            {
+                /* Positions used for evaluating the E-field of the requested cell. */
+                traits::FieldPosition<fields::CellType, FieldE> const fieldPosE;
+                pmacc::math::Vector<floatD_64, detail::numComponents> const eFieldPositions_SI
+                    = detail::getFieldPositions_SI(cellIdx, halfSimSize, fieldPosE(), unit_length, focus_y_SI, phi);
+
+                /* Time of the requested step, shifted by the TWTS time delay. */
+                float_64 const time_SI = float_64(currentStep) * dt - tdelay;
+
+                /* Single TWTS-Pulse:
+                 * LINEAR_YZ selects the Ey-variant of the field calculation.
+                 */
+                if(pol == LINEAR_YZ)
+                    return getTWTSEfield_Normalized_Ey<simDim>(eFieldPositions_SI, time_SI);
+
+                /* LINEAR_X takes this path, and so does any other value of pol (defensive default). */
+                return getTWTSEfield_Normalized<simDim>(eFieldPositions_SI, time_SI);
+            }
+
+            /** Calculate the Ex(r,t) field here
+             *
+             * @param pos Spatial position of the target field.
+             * @param time Absolute time (SI, including all offsets and transformations) for calculating
+             *             the field */
+            HDINLINE EField::float_T EField::calcTWTSEx(float3_64 const& pos, float_64 const time) const
+            {
+                using complex_T = pmacc::math::Complex<float_T>;
+                using complex_64 = pmacc::math::Complex<float_64>; /* not referenced in this function */
+
+                /* Propagation speed of overlap normalized to the speed of light [Default: beta0=1.0] */
+                float_T const beta0 = float_T(beta_0);
+                /* If phi < 0 the formulas below are not directly applicable.
+                 * Instead phi is taken positive, but the entire pulse rotated by 180 deg around the
+                 * z-axis of the coordinate system in this function (presumably via phiPositive below).
+                 */
+                float_T const phiReal = float_T(math::abs(phi));
+                float_T sinPhiReal;
+                float_T cosPhiReal;
+                pmacc::math::sincos(phiReal, sinPhiReal, cosPhiReal);
+                float_T const alphaTilt = math::atan2(float_T(1.0) - beta0 * cosPhiReal, beta0 * sinPhiReal);
+                /* Definition of the laser pulse front tilt angle for the laser field below.
+                 *
+                 * For beta0 = 1.0, this is equivalent to our standard definition. Question: Why is the
+                 * local "phiT" not equal in value to the local "phiReal" (i.e. abs(phi))?
+                 * Because the standard TWTS pulse is defined for beta0 = 1.0 and in the coordinate-system
+                 * of the TWTS model phi is responsible for pulse front tilt and dispersion only. Hence
+                 * the dispersion will (although physically correct) be slightly off the ideal TWTS
+                 * pulse for beta0 != 1.0. This only shows that this TWTS pulse is primarily designed for
+                 * scenarios close to beta0 = 1.
+                 */
+                float_T const phiT = float_T(2.0) * alphaTilt;
+
+                /* Angle between the laser pulse front and the y-axis. Not used, but remains in code for
+                 * documentation purposes.
+                 * float_T const eta = (PI / 2) - (phiReal - alphaTilt);
+                 */
+
+                float_T const cspeed = float_T(SI::SPEED_OF_LIGHT_SI / UNIT_SPEED);
+                float_T const lambda0 = float_T(wavelength_SI / UNIT_LENGTH);
+                float_T const om0 = float_T(2.0 * PI) * cspeed / lambda0; /* 2 * PI * cspeed / lambda0 */
+                /* factor 2 in tauG arises from definition convention in laser formula */
+                float_T const tauG = float_T(pulselength_SI * 2.0 / UNIT_TIME);
+                /* w0 is wx here --> w0 could be replaced by wx */
+                float_T const w0 = float_T(w_x_SI / UNIT_LENGTH);
+                float_T const rho0 = float_T(PI * w0 * w0 / lambda0);
+                float_T const k = float_T(2.0 * PI / lambda0); /* wave number 2 * PI / lambda0 */
+
+                /* In order to calculate in single-precision and in order to account for errors in
+                 * the approximations far from the coordinate origin, we use the wavelength-periodicity and
+                 * the known propagation direction for realizing the laser pulse using relative coordinates
+                 * (i.e. from a finite coordinate range) only. All these quantities have to be calculated
+                 * in double precision.
+                 */
+                float_64 sinPhiVal;
+                float_64 cosPhiVal;
+                pmacc::math::sincos(precisionCast<float_64>(phi), sinPhiVal, cosPhiVal);
+                float_64 const tanAlpha = (1.0 - beta_0 * cosPhiVal) / (beta_0 * sinPhiVal);
+                float_64 const tanFocalLine = math::tan(PI / 2.0 - phi);
+                float_64 const deltaT = wavelength_SI / SI::SPEED_OF_LIGHT_SI * (1.0 + tanAlpha / tanFocalLine);
+                float_64 const deltaY = wavelength_SI / tanFocalLine;
+                float_64 const deltaZ = -wavelength_SI;
+                float_64 const numberOfPeriods = math::floor(time / deltaT); /* whole deltaT periods in time */
+                float_T const timeMod = float_T(time - numberOfPeriods * deltaT); /* remainder of time */
+                float_T const yMod = float_T(pos.y() + numberOfPeriods * deltaY); /* y shifted per period */
+                float_T const zMod = float_T(pos.z() + numberOfPeriods * deltaZ); /* z shifted per period */
+                /* Divide by UNIT_LENGTH / UNIT_TIME; phiPositive is defined outside this function. */
+                float_T const x = float_T(phiPositive * pos.x() / UNIT_LENGTH);
+                float_T const y = float_T(phiPositive * yMod / UNIT_LENGTH);
+                float_T const z = float_T(zMod / UNIT_LENGTH);
+                float_T const t = float_T(timeMod / UNIT_TIME);
+
+                /* Shortcuts for speed. Naming: sinPhi2 = sin(phiT/2), sin2Phi = sin(2*phiT), *_N = N-th power */
+                float_T sinPhi;
+                float_T cosPhi;
+                pmacc::math::sincos(phiT, sinPhi, cosPhi);
+                float_T const cscPhi = float_T(1.0) / sinPhi;
+                float_T const sinPhi2 = math::sin(phiT / float_T(2.0));
+                float_T const sin2Phi = math::sin(phiT * float_T(2.0));
+                float_T const tanPhi2 = math::tan(phiT / float_T(2.0));
+
+                float_T const sinPhi_2 = sinPhi * sinPhi;
+                float_T const sinPhi_3 = sinPhi * sinPhi_2;
+                float_T const sinPhi_4 = sinPhi_2 * sinPhi_2;
+
+                float_T const sinPhi2_2 = sinPhi2 * sinPhi2;
+                float_T const sinPhi2_4 = sinPhi2_2 * sinPhi2_2;
+                float_T const tanPhi2_2 = tanPhi2 * tanPhi2;
+
+                float_T const tauG2 = tauG * tauG;
+                float_T const x2 = x * x;
+                float_T const y2 = y * y;
+                float_T const z2 = z * z;
+
+                /* The "helpVar" variables decrease the nesting level of the evaluated expressions and
+                 * thus help with formal code verification through manual code inspection.
+                 */
+                complex_T const helpVar1 = cspeed * om0 * tauG2 * sinPhi_4
+                    - complex_T(0, 8) * sinPhi2_4 * sinPhi * (y * cosPhi + z * sinPhi);
+
+                complex_T const helpVar2 = complex_T(0, 1) * rho0 - y * cosPhi - z * sinPhi;
+
+                complex_T const helpVar3 = complex_T(0, float_T(-0.5)) * cscPhi
+                    * (complex_T(0, -8) * om0 * y * (cspeed * t - z) * sinPhi2_2 * sinPhi_4
+                           * (complex_T(0, 1) * rho0 - z * sinPhi)
+                       - om0 * sinPhi_4 * sinPhi
+                           * (-float_T(2.0) * z2 * rho0
+                              - cspeed * cspeed
+                                  * (k * tauG2 * x2 + float_T(2.0) * t * (t - complex_T(0, 1) * om0 * tauG2) * rho0)
+                              + cspeed * (float_T(4.0) * t * z * rho0 - complex_T(0, 2) * om0 * tauG2 * z * rho0)
+                              - complex_T(0, 2) * (cspeed * t - z) * (cspeed * (t - complex_T(0, 1) * om0 * tauG2) - z)
+                                  * z * sinPhi)
+                       + float_T(2.0) * y * cosPhi * sinPhi_2
+                           * (complex_T(0, 4) * om0 * y * (cspeed * t - z) * sinPhi2_2 * sinPhi_2
+                              + om0 * (cspeed * t - z)
+                                  * (complex_T(0, 1) * cspeed * t + cspeed * om0 * tauG2 - complex_T(0, 1) * z)
+                                  * sinPhi_3
+                              - complex_T(0, 4) * sinPhi2_4
+                                  * (cspeed * k * x2 - om0 * (y2 - float_T(4.0) * (cspeed * t - z) * z) * sinPhi))
+                       - complex_T(0, 4) * sinPhi2_4
+                           * (complex_T(0, -4) * om0 * y * (cspeed * t - z) * rho0 * cosPhi * sinPhi_2
+                              + complex_T(0, 2)
+                                  * (om0 * (y2 + float_T(2.0) * z2) * rho0
+                                     - cspeed * z * (complex_T(0, 1) * k * x2 + float_T(2.0) * om0 * t * rho0))
+                                  * sinPhi_3
+                              - float_T(2.0) * om0 * z * (y2 - float_T(2.0) * (cspeed * t - z) * z) * sinPhi_4
+                              + om0 * y2 * (cspeed * t - z) * sin2Phi * sin2Phi))
+                    / (cspeed * helpVar2 * helpVar1);
+
+                complex_T const helpVar4 = cspeed * om0 * tauG2
+                    - complex_T(0, 8) * y * math::tan(float_T(PI / 2.0) - phiT) * cscPhi * cscPhi * sinPhi2_4
+                    - complex_T(0, 2) * z * tanPhi2_2;
+
+                complex_T const result
+                    = (math::exp(helpVar3) * tauG * math::sqrt(cspeed * om0 * rho0 / helpVar2)) / math::sqrt(helpVar4);
+
+                return result.get_real(); /* only the real part of the complex result is returned */
+            }
+
+            /** Evaluate the Ey(r,t) field component.
+             * @param pos Spatial position of the target field.
+             * @param time Absolute time (SI, including all offsets and transformations) for which
+             *             the field is evaluated.
+             * @return The value calcTWTSEx() yields for the same arguments. */
+            HDINLINE EField::float_T EField::calcTWTSEy(float3_64 const& pos, float_64 const time) const
+            {
+                /* By definition the field function of Ey (polarization in pulse-front-tilt plane) is
+                 * identical to the one of Ex (polarization normal to pulse-front-tilt plane). */
+                EField::float_T const fieldValue = calcTWTSEx(pos, time);
+                return fieldValue;
+            }
+
+        } /* namespace twtsfast */
+    } /* namespace templates */
+} /* namespace picongpu */
diff --git a/include/picongpu/fields/background/templates/twtsfast/GetInitialTimeDelay_SI.tpp b/include/picongpu/fields/background/templates/twtsfast/GetInitialTimeDelay_SI.tpp
new file mode 100644
index 0000000000..8f99ef7fa7
--- /dev/null
+++ b/include/picongpu/fields/background/templates/twtsfast/GetInitialTimeDelay_SI.tpp
@@ -0,0 +1,162 @@
+/* Copyright 2014-2021 Alexander Debus
+ *
+ * This file is part of PIConGPU.
+ *
+ * PIConGPU is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * PIConGPU is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with PIConGPU.
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+
+#pragma once
+
+#include <pmacc/types.hpp>
+#include <pmacc/math/Vector.hpp>
+#include <pmacc/dimensions/DataSpace.hpp>
+
+namespace picongpu
+{
+    namespace templates
+    {
+        namespace twtsfast
+        {
+            /* Auxiliary functions for calculating the TWTS field */
+            namespace detail
+            {
+                template<unsigned T_dim>
+                class GetInitialTimeDelay
+                {
+                public:
+                    /** Obtain the SI time delay that later enters the Ex(r, t), By(r, t) and Bz(r, t)
+                     *  calculations as t. Only declared here; defined by the DIM3 / DIM2 specializations.
+                     *  @tparam T_dim Specializes for the simulation dimension
+                     *  @param auto_tdelay calculate the time delay such that the TWTS pulse is not
+                     *                     inside the simulation volume at simulation start
+                     *                     timestep = 0 [default = true]
+                     *  @param tdelay_user_SI manual time delay, returned as-is if auto_tdelay is false
+                     *  @param halfSimSize center of simulation volume in number of cells
+                     *  @param pulselength_SI sigma of std. gauss for intensity (E^2)
+                     *  @param focus_y_SI the distance to the laser focus in y-direction [m]
+                     *  @param phi interaction angle between TWTS laser propagation vector and
+                     *             the y-axis [rad, default = 90.*(PI / 180.)]
+                     *  @param beta_0 propagation speed of overlap normalized
+                     *                to the speed of light [c, default = 1.0]
+                     *  @return time delay in SI units */
+                    HDINLINE float_64 operator()(
+                        bool const auto_tdelay,
+                        float_64 const tdelay_user_SI,
+                        DataSpace<simDim> const& halfSimSize,
+                        float_64 const pulselength_SI,
+                        float_64 const focus_y_SI,
+                        float_X const phi,
+                        float_X const beta_0) const;
+                };
+
+                template<>
+                HDINLINE float_64 GetInitialTimeDelay<DIM3>::operator()(
+                    bool const auto_tdelay,
+                    float_64 const tdelay_user_SI,
+                    DataSpace<simDim> const& halfSimSize,
+                    float_64 const pulselength_SI,
+                    float_64 const focus_y_SI,
+                    float_X const phi,
+                    float_X const beta_0) const
+                {
+                    /* Manual mode: hand back the user-supplied delay untouched. */
+                    if(!auto_tdelay)
+                        return tdelay_user_SI;
+
+                    /* Angle between the laser pulse front and the y-axis. Good approximation for
+                     * beta0\simeq 1; the TWTS core routines for Ex, By or Bz hold the exact relation. */
+                    float_64 const pulseFrontAngle = (PI / 2) - (phi / 2);
+                    auto const cosPulseFrontAngle = math::cos(pulseFrontAngle);
+
+                    /* Walk-off of the TWTS pulse along y: half-depth of the simulation volume in z
+                     * (halfSimSize[2]) projected geometrically. The abs( ) keeps the offset correct
+                     * for -phi<-90Deg and +phi>+90Deg. */
+                    float_64 const walkoffY = float_64(halfSimSize[2] * picongpu::SI::CELL_DEPTH_SI)
+                        * math::abs(cosPulseFrontAngle);
+
+                    /* Approximate cross section of the laser pulse through the y-axis, stretched by a
+                     * fudge factor so that the pulse enters the volume at low intensity values. */
+                    float_64 const fudgeFactor = 3.;
+                    float_64 const pulseExtentY
+                        = fudgeFactor * (pulselength_SI * picongpu::SI::SPEED_OF_LIGHT_SI) / cosPulseFrontAngle;
+
+                    /* focus_y_SI is the y-position of the laser coordinate system origin. */
+                    float_64 const delay_SI
+                        = (walkoffY + pulseExtentY + focus_y_SI) / (picongpu::SI::SPEED_OF_LIGHT_SI * beta_0);
+                    return delay_SI;
+                }
+
+                template<>
+                HDINLINE float_64 GetInitialTimeDelay<DIM2>::operator()(
+                    bool const auto_tdelay,
+                    float_64 const tdelay_user_SI,
+                    DataSpace<simDim> const& halfSimSize,
+                    float_64 const pulselength_SI,
+                    float_64 const focus_y_SI,
+                    float_X const phi,
+                    float_X const beta_0) const
+                {
+                    /* Manual mode: hand back the user-supplied delay untouched. */
+                    if(!auto_tdelay)
+                        return tdelay_user_SI;
+
+                    /* Angle between the laser pulse front and the y-axis. Good approximation for
+                     * beta0\simeq 1; the TWTS core routines for Ex, By or Bz hold the exact relation. */
+                    float_64 const pulseFrontAngle = (PI / 2) - (phi / 2);
+                    auto const cosPulseFrontAngle = math::cos(pulseFrontAngle);
+
+                    /* Walk-off of the TWTS pulse along y: half-extent of the simulation volume in x
+                     * (halfSimSize[0]) projected geometrically. The abs( ) keeps the offset correct
+                     * for -phi<-90Deg and +phi>+90Deg. */
+                    float_64 const walkoffY = float_64(halfSimSize[0] * picongpu::SI::CELL_WIDTH_SI)
+                        * math::abs(cosPulseFrontAngle);
+
+                    /* Approximate cross section of the laser pulse through the y-axis, stretched by a
+                     * fudge factor so that the pulse enters the volume at low intensity values. */
+                    float_64 const fudgeFactor = 3.;
+                    float_64 const pulseExtentY
+                        = fudgeFactor * (pulselength_SI * picongpu::SI::SPEED_OF_LIGHT_SI) / cosPulseFrontAngle;
+
+                    /* focus_y_SI is the y-position of the laser coordinate system origin. */
+                    float_64 const delay_SI
+                        = (walkoffY + pulseExtentY + focus_y_SI) / (picongpu::SI::SPEED_OF_LIGHT_SI * beta_0);
+                    return delay_SI;
+                }
+
+                /** Free-function front end for the GetInitialTimeDelay functor.
+                 *
+                 *  Instantiates GetInitialTimeDelay<T_Dim> and forwards all arguments unchanged.
+                 *  @return the value returned by the functor's call operator */
+                template<unsigned T_Dim>
+                HDINLINE float_64 getInitialTimeDelay_SI(
+                    bool const auto_tdelay,
+                    float_64 const tdelay_user_SI,
+                    DataSpace<T_Dim> const& halfSimSize,
+                    float_64 const pulselength_SI,
+                    float_64 const focus_y_SI,
+                    float_X const phi,
+                    float_X const beta_0)
+                {
+                    GetInitialTimeDelay<T_Dim> const delayFunctor{};
+                    return delayFunctor(
+                        auto_tdelay, tdelay_user_SI, halfSimSize,
+                        pulselength_SI, focus_y_SI, phi, beta_0);
+                }
+
+            } /* namespace detail */
+        } /* namespace twtsfast */
+    } /* namespace templates */
+} /* namespace picongpu */
diff --git a/include/picongpu/fields/background/templates/twtsfast/RotateField.tpp b/include/picongpu/fields/background/templates/twtsfast/RotateField.tpp
new file mode 100644
index 0000000000..dadb89969b
--- /dev/null
+++ b/include/picongpu/fields/background/templates/twtsfast/RotateField.tpp
@@ -0,0 +1,117 @@
+/* Copyright 2014-2021 Alexander Debus, Rene Widera
+ *
+ * This file is part of PIConGPU.
+ *
+ * PIConGPU is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * PIConGPU is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with PIConGPU.
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+
+#pragma once
+
+#include <pmacc/types.hpp>
+#include <pmacc/math/Vector.hpp>
+#include <pmacc/dimensions/DataSpace.hpp>
+
+namespace picongpu
+{
+    namespace templates
+    {
+        namespace twtsfast
+        {
+            /** Auxiliary functions for calculating the TWTS field */
+            namespace detail
+            {
+                /** Position-vector rotation functor; implemented by the 3- and 2-component specializations. */
+                template<typename T_Type, typename T_AngleType>
+                struct RotateField;
+
+                template<typename T_Type, typename T_AngleType>
+                struct RotateField<pmacc::math::Vector<T_Type, 3>, T_AngleType>
+                {
+                    using result = pmacc::math::Vector<T_Type, 3>;
+                    using AngleType = T_AngleType;
+                    HDINLINE result operator()(result const& fieldPosVector, AngleType const phi) const
+                    {
+                        /* The laser propagation direction encloses the angle phi with the simulation y-axis
+                         * (i.e. the direction of the sliding window), so position vectors are rotated around
+                         * the simulation x-axis before the TWTS field functions are called. Those functions
+                         * live in the non-rotated frame and use phi only for the amount of pulse front tilt.
+                         * RotationMatrix[PI/2+phi].(y,z) (180Deg-flip at phi=90Deg; the paper's frame is flipped.) */
+                        AngleType sine, cosine;
+                        pmacc::math::sincos(AngleType(phi), sine, cosine);
+                        auto const posY = fieldPosVector.y();
+                        auto const posZ = fieldPosVector.z();
+                        return result(
+                            fieldPosVector.x(),
+                            -sine * posY - cosine * posZ,
+                            cosine * posY - sine * posZ);
+                    }
+                };
+
+                /** Rotation functor for 2-component position vectors, i.e. 2D (x,y) simulations. */
+                template<typename T_Type, typename T_AngleType>
+                struct RotateField<pmacc::math::Vector<T_Type, 2>, T_AngleType>
+                {
+                    using result = pmacc::math::Vector<T_Type, 2>;
+                    using AngleType = T_AngleType;
+                    HDINLINE result operator()(result const& fieldPosVector, AngleType const phi) const
+                    {
+                        /* The laser propagation direction encloses the angle phi with the simulation
+                         * y-axis (i.e. the direction of the sliding window). Position vectors are hence
+                         * rotated around the simulation x-axis before the TWTS field functions are called.
+                         * Those field functions live in the non-rotated frame and use phi only to determine
+                         * the required amount of pulse front tilt.
+                         * RotationMatrix[PI/2+phi].(y,z) (180Deg-flip at phi=90Deg since the coordinate
+                         * system in the paper is oriented the other way round.)
+                         *
+                         * In 2D the TWTS laser has to propagate within the (x,y)-plane, so the 3D picture
+                         * is first rotated by 90 degree around the y-axis. Corresponding position vector
+                         * for the Ez-components in 2D simulations:
+                         *   3D     3D vectors in 2D space (x,y)
+                         *   x -->  z
+                         *   y -->  y
+                         *   z --> -x (Since z=0 for 2D, we use the existing
+                         *             TWTS-field-function and set -x=0)
+                         *
+                         * Explicit implementation in 3D coordinates:
+                         * fieldPosVector = float3_64( -fieldPosVector.z( ),       //(Here: ==0)
+                         *                              fieldPosVector.y( ),
+                         *                              fieldPosVector.x( ) );
+                         * fieldPosVector = float3_64( fieldPosVector.x( ),
+                         *       -sin(phi)*fieldPosVector.y( )-cos(phi)*fieldPosVector.z(),
+                         *       +cos(phi)*fieldPosVector.y( )-sin(phi)*fieldPosVector.z()  );
+                         * Only the last two components are computed here. The x-axis of rotation is
+                         * fine in 2D, because that component now contains the (non-existing) simulation
+                         * z-coordinate. */
+                        AngleType sine, cosine;
+                        pmacc::math::sincos(AngleType(phi), sine, cosine);
+                        auto const posX = fieldPosVector.x();
+                        auto const posY = fieldPosVector.y();
+                        return result(-sine * posY - cosine * posX, cosine * posY - sine * posX);
+                    }
+                };
+
+                /** Rotate fieldPosVector by phi using the RotateField specialization matching its type. */
+                template<typename T_Type, typename T_AngleType>
+                HDINLINE typename RotateField<T_Type, T_AngleType>::result
+                    rotateField(T_Type const& fieldPosVector, T_AngleType const phi)
+                {
+                    return RotateField<T_Type, T_AngleType>{}(fieldPosVector, phi);
+                }
+
+            } /* namespace detail */
+        } /* namespace twtsfast */
+    } /* namespace templates */
+} /* namespace picongpu */
diff --git a/include/picongpu/fields/background/templates/twtsfast/getFieldPositions_SI.tpp b/include/picongpu/fields/background/templates/twtsfast/getFieldPositions_SI.tpp
new file mode 100644
index 0000000000..a7e799e865
--- /dev/null
+++ b/include/picongpu/fields/background/templates/twtsfast/getFieldPositions_SI.tpp
@@ -0,0 +1,81 @@
+/* Copyright 2014-2021 Alexander Debus
+ *
+ * This file is part of PIConGPU.
+ *
+ * PIConGPU is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * PIConGPU is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with PIConGPU.
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+
+#pragma once
+
+#include <pmacc/types.hpp>
+#include <pmacc/math/Vector.hpp>
+#include <pmacc/dimensions/DataSpace.hpp>
+#include "picongpu/fields/background/templates/twtsfast/numComponents.hpp"
+
+namespace picongpu
+{
+    namespace templates
+    {
+        namespace twtsfast
+        {
+            /** Auxiliary functions for calculating the TWTS field */
+            namespace detail
+            {
+                /** Compute the SI position vectors that later enter the Ex(r, t), By(r, t)
+                 *  and Bz(r ,t) calculations as r.
+                 *  @param cellIdx The total cell id counted from the start at timestep 0.
+                 *  @param halfSimSize laser origin in cells (its y entry is replaced by the focus position)
+                 *  @param fieldOnGridPositions in-cell position of each field component (staggered grids)
+                 *  @param unit_length factor scaling picongpu::cellSize to SI cell dimensions
+                 *  @param focus_y_SI longitudinal (y) position of the laser origin in SI units
+                 *  @param phi rotation angle handed to rotateField()
+                 *  @return rotated SI position vector for each of the numComponents field components */
+                HDINLINE pmacc::math::Vector<floatD_64, numComponents> getFieldPositions_SI(
+                    DataSpace<simDim> const& cellIdx,
+                    DataSpace<simDim> const& halfSimSize,
+                    pmacc::math::Vector<floatD_X, numComponents> const& fieldOnGridPositions,
+                    float_64 const unit_length,
+                    float_64 const focus_y_SI,
+                    float_X const phi)
+                {
+                    /* Note: Neither a direct precisionCast on picongpu::cellSize nor casting on floatD_ works. */
+                    floatD_64 const cellDim(picongpu::cellSize.shrink<simDim>());
+                    floatD_64 const cellExtent_SI = cellDim * unit_length;
+
+                    /* The TWTS laser coordinate origin is centered transversally and defined longitudinally
+                     * by the laser center in y (usually maximum of intensity). */
+                    floatD_X originInCells = precisionCast<float_X>(halfSimSize);
+                    originInCells.y() = float_X(focus_y_SI / cellExtent_SI.y());
+
+                    /* Offset of this cell from the laser origin; the same for every field component. */
+                    auto const cellOffset = precisionCast<float_X>(cellIdx) - originInCells;
+
+                    pmacc::math::Vector<floatD_64, numComponents> positions_SI;
+                    for(uint32_t comp = 0; comp < numComponents; ++comp) /* Ex, Ey and Ez */
+                    {
+                        /* Fractional in-cell position (e.g. Yee-grid) plus cell offset, to SI, then rotated. */
+                        floatD_X posInCells = fieldOnGridPositions[comp];
+                        posInCells += cellOffset;
+                        positions_SI[comp] = precisionCast<float_64>(posInCells) * cellExtent_SI;
+                        positions_SI[comp] = rotateField(positions_SI[comp], phi);
+                    }
+                    return positions_SI;
+                }
+
+            } /* namespace detail */
+        } /* namespace twtsfast */
+    } /* namespace templates */
+} /* namespace picongpu */
diff --git a/include/picongpu/fields/background/templates/twtsfast/numComponents.hpp b/include/picongpu/fields/background/templates/twtsfast/numComponents.hpp
new file mode 100644
index 0000000000..fbd4eb334c
--- /dev/null
+++ b/include/picongpu/fields/background/templates/twtsfast/numComponents.hpp
@@ -0,0 +1,36 @@
+/* Copyright 2014-2021 Alexander Debus, Axel Huebl
+ *
+ * This file is part of PIConGPU.
+ *
+ * PIConGPU is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * PIConGPU is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with PIConGPU.
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+
+#pragma once
+
+namespace picongpu
+{
+    namespace templates
+    {
+        namespace twtsfast
+        {
+            namespace detail
+            {
+                /** Compile-time number of field components per position set. [3 for both 2D and 3D] */
+                constexpr uint32_t numComponents = 3;
+            } /* namespace detail */
+        } /* namespace twtsfast*/
+    } /* namespace templates */
+} /* namespace picongpu */
diff --git a/include/picongpu/fields/background/templates/twtsfast/twtsfast.hpp b/include/picongpu/fields/background/templates/twtsfast/twtsfast.hpp
new file mode 100644
index 0000000000..1c42b9481e
--- /dev/null
+++ b/include/picongpu/fields/background/templates/twtsfast/twtsfast.hpp
@@ -0,0 +1,59 @@
+/* Copyright 2014-2021 Alexander Debus
+ *
+ * This file is part of PIConGPU.
+ *
+ * PIConGPU is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * PIConGPU is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with PIConGPU.
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/** @file
+ *
+ * This background field implements an obliquely incident, cylindrically-focused, pulse-front tilted laser for some
+ * incidence angle phi as used for [1].
+ *
+ * The TWTS implementation generally follows the definition of eq. (7) in [1]. In deriving the magnetic field
+ * components, a slowly-varying wave approximation was assumed, by neglecting the spatial derivatives of the
+ * 2nd omega-order TWTS-phase-terms for the B-field-component transverse to direction of propagation, and additionally
+ * neglecting the 1st-order TWTS-phase-terms for the B-field-component longitudinal to the direction of propagation.
+ *
+ * Specifically, this TWTSfast approximation assumes a special case, where the transverse extent (but not its height wx
+ * or its pulse duration) of the TWTS-laser wy is assumed to be infinite. While this special case of the TWTS laser
+ * applies to a large range of use cases, the resulting form allows the use of different spatial and time coordinates
+ * (timeMod, yMod and zMod), which provide long-term numerical stability beyond 100000 timesteps at single precision
+ * and mitigate errors of the approximations far from the coordinate origin.
+ *
+ * We exploit the wavelength-periodicity and the known propagation direction for realizing the laser pulse
+ * using relative coordinates (i.e. from a finite coordinate range) only. All these quantities have to be calculated
+ * in double precision.
+ *
+ * float_64 const tanAlpha = (float_64(1.0) - beta_0 * math::cos(phi)) / (beta_0 * math::sin(phi));
+ * float_64 const tanFocalLine = math::tan(PI / float_64(2.0) - phi);
+ * float_64 const deltaT = wavelength_SI / SI::SPEED_OF_LIGHT_SI * (float_64(1.0) + tanAlpha / tanFocalLine);
+ * float_64 const deltaY = wavelength_SI / tanFocalLine;
+ * float_64 const deltaZ = -wavelength_SI;
+ * float_64 const numberOfPeriods = math::floor(time / deltaT);
+ * float_T const timeMod = float_T(time - numberOfPeriods * deltaT);
+ * float_T const yMod = float_T(pos.y() + numberOfPeriods * deltaY);
+ * float_T const zMod = float_T(pos.z() + numberOfPeriods * deltaZ);
+ *
+ * Literature:
+ * [1] Steiniger et al., "Optical free-electron lasers with Traveling-Wave Thomson-Scattering",
+ *     Journal of Physics B: Atomic, Molecular and Optical Physics, Volume 47, Number 23 (2014),
+ *     https://doi.org/10.1088/0953-4075/47/23/234011
+ */
+
+#pragma once
+
+#include "picongpu/fields/background/templates/twtsfast/EField.hpp"
+#include "picongpu/fields/background/templates/twtsfast/BField.hpp"
diff --git a/include/picongpu/fields/background/templates/twtsfast/twtsfast.tpp b/include/picongpu/fields/background/templates/twtsfast/twtsfast.tpp
new file mode 100644
index 0000000000..65b6a513e6
--- /dev/null
+++ b/include/picongpu/fields/background/templates/twtsfast/twtsfast.tpp
@@ -0,0 +1,24 @@
+/* Copyright 2014-2021 Alexander Debus
+ *
+ * This file is part of PIConGPU.
+ *
+ * PIConGPU is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * PIConGPU is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with PIConGPU.
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+
+#pragma once
+
+#include "picongpu/fields/background/templates/twtsfast/EField.tpp"
+#include "picongpu/fields/background/templates/twtsfast/BField.tpp"
diff --git a/include/picongpu/fields/cellType/Centered.hpp b/include/picongpu/fields/cellType/Centered.hpp
index 5d9941444a..3ba35d7bb9 100644
--- a/include/picongpu/fields/cellType/Centered.hpp
+++ b/include/picongpu/fields/cellType/Centered.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera, Axel Huebl, Sergei Bastrakov
+/* Copyright 2013-2021 Heiko Burau, Rene Widera, Axel Huebl, Sergei Bastrakov
  *
  * This file is part of PIConGPU.
  *
@@ -28,176 +28,156 @@
 
 namespace picongpu
 {
-namespace fields
-{
-namespace cellType
-{
-
-    struct Centered{};
+    namespace fields
+    {
+        namespace cellType
+        {
+            struct Centered
+            {
+            };
 
-} //namespace fields
-} //namespace cellType
+        } // namespace cellType
+    } // namespace fields
 
-namespace traits
-{
-    /** position (floatD_X in case of T_simDim == simDim) in cell for
-     *  E_x, E_y, E_z
-     */
-    template< uint32_t T_simDim >
-    struct FieldPosition<
-        fields::cellType::Centered,
-        FieldE,
-        T_simDim
-    >
+    namespace traits
     {
-        using PosType = pmacc::math::Vector<float_X, T_simDim>;
-        using ReturnType = const pmacc::math::Vector<PosType, DIM3>;
-
-        /// boost::result_of hints
-        template<class> struct result;
+        /** position (floatD_X in case of T_simDim == simDim) in cell for
+         *  E_x, E_y, E_z
+         */
+        template<uint32_t T_simDim>
+        struct FieldPosition<fields::cellType::Centered, FieldE, T_simDim>
+        {
+            using PosType = pmacc::math::Vector<float_X, T_simDim>;
+            using ReturnType = const pmacc::math::Vector<PosType, DIM3>;
 
-        template<class F>
-        struct result<F()> {
-            using type = ReturnType;
-        };
+            /// boost::result_of hints
+            template<class>
+            struct result;
 
-        HDINLINE FieldPosition()
-        {
-        }
+            template<class F>
+            struct result<F()>
+            {
+                using type = ReturnType;
+            };
 
-        HDINLINE ReturnType operator()() const
-        {
-            const auto center = PosType::create( 0.5 );
-
-            return ReturnType::create( center );
-        }
-    };
-
-    /** position (floatD_X in case of T_simDim == simDim) in cell for
-     *  B_x, B_y, B_z
-     */
-    template< uint32_t T_simDim >
-    struct FieldPosition<
-        fields::cellType::Centered,
-        FieldB,
-        T_simDim
-    > : public FieldPosition<
-        fields::cellType::Centered,
-        FieldE,
-        T_simDim
-    >
-    {
-        HDINLINE FieldPosition()
-        {
-        }
-    };
-
-    /** position (float2_X) in cell for J_x, J_y, J_z */
-    template<>
-    struct FieldPosition<
-        fields::cellType::Centered,
-        FieldJ,
-        DIM2
-    >
-    {
-        /** \tparam float2_X position of the component in the cell
-         *  \tparam DIM3     Fields (E/B/J) have 3 components, even in 1 or 2D !
-         */
-        using VectorVector2D3V = const ::pmacc::math::Vector<
-           float2_X,
-           DIM3
-        >;
-        /// boost::result_of hints
-        template<class> struct result;
-
-        template<class F>
-        struct result<F()> {
-            using type = VectorVector2D3V;
-        };
+            HDINLINE FieldPosition()
+            {
+            }
 
-        HDINLINE FieldPosition()
-        {
-        }
+            HDINLINE ReturnType operator()() const
+            {
+                const auto center = PosType::create(0.5);
 
-        HDINLINE VectorVector2D3V operator()() const
-        {
-            const float2_X posJ_x( 0.5, 0.0 );
-            const float2_X posJ_y( 0.0, 0.5 );
-            const float2_X posJ_z( 0.0, 0.0 );
-
-            return VectorVector2D3V( posJ_x, posJ_y, posJ_z );
-        }
-    };
-
-    /** position (float3_X) in cell for J_x, J_y, J_z
-     */
-    template<>
-    struct FieldPosition<
-        fields::cellType::Centered,
-        FieldJ,
-        DIM3
-    >
-    {
-        /** \tparam float2_X position of the component in the cell
-         *  \tparam DIM3     Fields (E/B/J) have 3 components, even in 1 or 2D !
-         */
-        using VectorVector3D3V = const ::pmacc::math::Vector<
-            float3_X,
-            DIM3
-        >;
-        /// boost::result_of hints
-        template<class> struct result;
-
-        template<class F>
-        struct result<F()> {
-            using type = VectorVector3D3V;
+                return ReturnType::create(center);
+            }
         };
 
-        HDINLINE FieldPosition()
+        /** position (floatD_X in case of T_simDim == simDim) in cell for
+         *  B_x, B_y, B_z
+         */
+        template<uint32_t T_simDim>
+        struct FieldPosition<fields::cellType::Centered, FieldB, T_simDim>
+            : public FieldPosition<fields::cellType::Centered, FieldE, T_simDim>
         {
-        }
+            HDINLINE FieldPosition()
+            {
+            }
+        };
 
-        HDINLINE VectorVector3D3V operator()() const
+        /** position (float2_X) in cell for J_x, J_y, J_z */
+        template<>
+        struct FieldPosition<fields::cellType::Centered, FieldJ, DIM2>
         {
-            const float3_X posJ_x( 0.5, 0.0, 0.0 );
-            const float3_X posJ_y( 0.0, 0.5, 0.0 );
-            const float3_X posJ_z( 0.0, 0.0, 0.5 );
-
-            return VectorVector3D3V( posJ_x, posJ_y, posJ_z );
-        }
-    };
-
-    /** position (floatD_X in case of T_simDim == simDim) in cell, wrapped in
-     * one-component vector since it's a scalar field with only one component, for the
-     * scalar field FieldTmp
-     */
-    template< uint32_t T_simDim >
-    struct FieldPosition<
-        fields::cellType::Centered,
-        FieldTmp,
-        T_simDim
-    >
-    {
-        using FieldPos = pmacc::math::Vector<float_X, T_simDim>;
-        using ReturnType = pmacc::math::Vector<FieldPos, DIM1>;
-
-        /// boost::result_of hints
-        template<class> struct result;
-
-        template<class F>
-        struct result<F()> {
-            using type = ReturnType;
+            /** \tparam float2_X position of the component in the cell
+             *  \tparam DIM3     Fields (E/B/J) have 3 components, even in 1 or 2D !
+             */
+            using VectorVector2D3V = const ::pmacc::math::Vector<float2_X, DIM3>;
+            /// boost::result_of hints
+            template<class>
+            struct result;
+
+            template<class F>
+            struct result<F()>
+            {
+                using type = VectorVector2D3V;
+            };
+
+            HDINLINE FieldPosition()
+            {
+            }
+
+            HDINLINE VectorVector2D3V operator()() const
+            {
+                const float2_X posJ_x(0.5, 0.0);
+                const float2_X posJ_y(0.0, 0.5);
+                const float2_X posJ_z(0.0, 0.0);
+
+                return VectorVector2D3V(posJ_x, posJ_y, posJ_z);
+            }
         };
 
-        HDINLINE FieldPosition()
+        /** position (float3_X) in cell for J_x, J_y, J_z
+         */
+        template<>
+        struct FieldPosition<fields::cellType::Centered, FieldJ, DIM3>
         {
-        }
+            /** \tparam float3_X position of the component in the cell
+             *  \tparam DIM3     Fields (E/B/J) have 3 components, even in 1 or 2D !
+             */
+            using VectorVector3D3V = const ::pmacc::math::Vector<float3_X, DIM3>;
+            /// boost::result_of hints
+            template<class>
+            struct result;
+
+            template<class F>
+            struct result<F()>
+            {
+                using type = VectorVector3D3V;
+            };
+
+            HDINLINE FieldPosition()
+            {
+            }
+
+            HDINLINE VectorVector3D3V operator()() const
+            {
+                const float3_X posJ_x(0.5, 0.0, 0.0);
+                const float3_X posJ_y(0.0, 0.5, 0.0);
+                const float3_X posJ_z(0.0, 0.0, 0.5);
+
+                return VectorVector3D3V(posJ_x, posJ_y, posJ_z);
+            }
+        };
 
-        HDINLINE ReturnType operator()() const
+        /** position (floatD_X in case of T_simDim == simDim) in cell, wrapped in
+         * one-component vector since it's a scalar field with only one component, for the
+         * scalar field FieldTmp
+         */
+        template<uint32_t T_simDim>
+        struct FieldPosition<fields::cellType::Centered, FieldTmp, T_simDim>
         {
-            return ReturnType( FieldPos::create(0.0) );
-        }
-    };
+            using FieldPos = pmacc::math::Vector<float_X, T_simDim>;
+            using ReturnType = pmacc::math::Vector<FieldPos, DIM1>;
+
+            /// boost::result_of hints
+            template<class>
+            struct result;
+
+            template<class F>
+            struct result<F()>
+            {
+                using type = ReturnType;
+            };
+
+            HDINLINE FieldPosition()
+            {
+            }
+
+            HDINLINE ReturnType operator()() const
+            {
+                return ReturnType(FieldPos::create(0.0));
+            }
+        };
 
-} // namespace traits
+    } // namespace traits
 } // namespace picongpu
diff --git a/include/picongpu/fields/cellType/Yee.hpp b/include/picongpu/fields/cellType/Yee.hpp
index 320b333952..d8d6f136d7 100644
--- a/include/picongpu/fields/cellType/Yee.hpp
+++ b/include/picongpu/fields/cellType/Yee.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Heiko Burau, Rene Widera, Sergei Bastrakov
+/* Copyright 2013-2021 Axel Huebl, Heiko Burau, Rene Widera, Sergei Bastrakov
  *
  * This file is part of PIConGPU.
  *
@@ -28,221 +28,194 @@
 
 namespace picongpu
 {
-namespace fields
-{
-namespace cellType
-{
-    struct Yee{};
-
-} // namespace cellType
-} // namespace fields
-
-namespace traits
-{
-    /** position (float2_X) in cell for E_x, E_y, E_z
-     */
-    template<>
-    struct FieldPosition<
-        fields::cellType::Yee,
-        FieldE,
-        DIM2
-    >
+    namespace fields
     {
-        /** \tparam float2_X position of the component in the cell
-         *  \tparam DIM3     Fields (E/B/J) have 3 components, even in 1 or 2D !
-         */
-        using VectorVector2D3V = const ::pmacc::math::Vector<
-            float2_X,
-            DIM3
-        >;
-        /// boost::result_of hints
-        template<class> struct result;
-
-        template<class F>
-        struct result<F()> {
-            using type = VectorVector2D3V;
-        };
-
-        HDINLINE FieldPosition()
+        namespace cellType
         {
-        }
+            struct Yee
+            {
+            };
 
-        HDINLINE VectorVector2D3V operator()() const
-        {
-            const float2_X posE_x( 0.5, 0.0 );
-            const float2_X posE_y( 0.0, 0.5 );
-            const float2_X posE_z( 0.0, 0.0 );
-
-            return VectorVector2D3V( posE_x, posE_y, posE_z );
-        }
-    };
-
-    /** position (float3_X) in cell for E_x, E_y, E_z
-     */
-    template<>
-    struct FieldPosition<
-        fields::cellType::Yee,
-        FieldE,
-        DIM3
-    >
-    {
-        /** \tparam float2_X position of the component in the cell
-         *  \tparam DIM3     Fields (E/B/J) have 3 components, even in 1 or 2D !
-         */
-        using VectorVector3D3V = const ::pmacc::math::Vector<
-            float3_X,
-            DIM3
-        >;
-
-        /// boost::result_of hints
-        template<class> struct result;
+        } // namespace cellType
+    } // namespace fields
 
-        template<class F>
-        struct result<F()> {
-            using type = VectorVector3D3V;
-        };
-
-        HDINLINE FieldPosition()
-        {
-        }
-
-        HDINLINE VectorVector3D3V operator()() const
-        {
-            const float3_X posE_x( 0.5, 0.0, 0.0 );
-            const float3_X posE_y( 0.0, 0.5, 0.0 );
-            const float3_X posE_z( 0.0, 0.0, 0.5 );
-
-            return VectorVector3D3V( posE_x, posE_y, posE_z );
-        }
-    };
-
-    /** position (float2_X) in cell for B_x, B_y, B_z
-     */
-    template<>
-    struct FieldPosition<
-        fields::cellType::Yee,
-        FieldB,
-        DIM2
-    >
+    namespace traits
     {
-        /** \tparam float2_X position of the component in the cell
-         *  \tparam DIM3     Fields (E/B/J) have 3 components, even in 1 or 2D !
+        /** position (float2_X) in cell for E_x, E_y, E_z
          */
-        using VectorVector2D3V = const ::pmacc::math::Vector<
-            float2_X,
-            DIM3
-        >;
-        /// boost::result_of hints
-        template<class> struct result;
-
-        template<class F>
-        struct result<F()> {
-            using type = VectorVector2D3V;
-        };
-
-        HDINLINE FieldPosition()
+        template<>
+        struct FieldPosition<fields::cellType::Yee, FieldE, DIM2>
         {
-        }
+            /** \tparam float2_X position of the component in the cell
+             *  \tparam DIM3     Fields (E/B/J) have 3 components, even in 1 or 2D !
+             */
+            using VectorVector2D3V = const ::pmacc::math::Vector<float2_X, DIM3>;
+            /// boost::result_of hints
+            template<class>
+            struct result;
+
+            template<class F>
+            struct result<F()>
+            {
+                using type = VectorVector2D3V;
+            };
+
+            HDINLINE FieldPosition()
+            {
+            }
+
+            HDINLINE VectorVector2D3V operator()() const
+            {
+                const float2_X posE_x(0.5, 0.0);
+                const float2_X posE_y(0.0, 0.5);
+                const float2_X posE_z(0.0, 0.0);
+
+                return VectorVector2D3V(posE_x, posE_y, posE_z);
+            }
+        };
 
-        HDINLINE VectorVector2D3V operator()() const
-        {
-            const float2_X posB_x( 0.0, 0.5 );
-            const float2_X posB_y( 0.5, 0.0 );
-            const float2_X posB_z( 0.5, 0.5 );
-
-            return VectorVector2D3V( posB_x, posB_y, posB_z );
-        }
-    };
-
-    /** position (float3_X) in cell for B_x, B_y, B_z
-     */
-    template<>
-    struct FieldPosition<
-        fields::cellType::Yee,
-        FieldB,
-        DIM3
-    >
-    {
-        /** \tparam float2_X position of the component in the cell
-         *  \tparam DIM3     Fields (E/B/J) have 3 components, even in 1 or 2D !
+        /** position (float3_X) in cell for E_x, E_y, E_z
          */
-        using VectorVector3D3V = const ::pmacc::math::Vector<
-            float3_X,
-            DIM3
-        >;
-
-        /// boost::result_of hints
-        template<class> struct result;
-
-        template<class F>
-        struct result<F()> {
-            using type = VectorVector3D3V;
+        template<>
+        struct FieldPosition<fields::cellType::Yee, FieldE, DIM3>
+        {
+            /** \tparam float3_X position of the component in the cell
+             *  \tparam DIM3     Fields (E/B/J) have 3 components, even in 1 or 2D !
+             */
+            using VectorVector3D3V = const ::pmacc::math::Vector<float3_X, DIM3>;
+
+            /// boost::result_of hints
+            template<class>
+            struct result;
+
+            template<class F>
+            struct result<F()>
+            {
+                using type = VectorVector3D3V;
+            };
+
+            HDINLINE FieldPosition()
+            {
+            }
+
+            HDINLINE VectorVector3D3V operator()() const
+            {
+                const float3_X posE_x(0.5, 0.0, 0.0);
+                const float3_X posE_y(0.0, 0.5, 0.0);
+                const float3_X posE_z(0.0, 0.0, 0.5);
+
+                return VectorVector3D3V(posE_x, posE_y, posE_z);
+            }
         };
 
-        HDINLINE FieldPosition()
+        /** position (float2_X) in cell for B_x, B_y, B_z
+         */
+        template<>
+        struct FieldPosition<fields::cellType::Yee, FieldB, DIM2>
         {
-        }
+            /** \tparam float2_X position of the component in the cell
+             *  \tparam DIM3     Fields (E/B/J) have 3 components, even in 1 or 2D !
+             */
+            using VectorVector2D3V = const ::pmacc::math::Vector<float2_X, DIM3>;
+            /// boost::result_of hints
+            template<class>
+            struct result;
+
+            template<class F>
+            struct result<F()>
+            {
+                using type = VectorVector2D3V;
+            };
+
+            HDINLINE FieldPosition()
+            {
+            }
+
+            HDINLINE VectorVector2D3V operator()() const
+            {
+                const float2_X posB_x(0.0, 0.5);
+                const float2_X posB_y(0.5, 0.0);
+                const float2_X posB_z(0.5, 0.5);
+
+                return VectorVector2D3V(posB_x, posB_y, posB_z);
+            }
+        };
 
-        HDINLINE VectorVector3D3V operator()() const
-        {
-            const float3_X posB_x( 0.0, 0.5, 0.5 );
-            const float3_X posB_y( 0.5, 0.0, 0.5 );
-            const float3_X posB_z( 0.5, 0.5, 0.0 );
-
-            return VectorVector3D3V( posB_x, posB_y, posB_z );
-        }
-    };
-
-    /** position (floatD_X in case of T_simDim == simDim) in cell for
-     *  J_x, J_y, J_z
-     */
-    template< uint32_t T_simDim >
-    struct FieldPosition<
-        fields::cellType::Yee,
-        FieldJ,
-        T_simDim
-    > : public FieldPosition<
-        fields::cellType::Yee,
-        FieldE,
-        T_simDim
-    >
-    {
-        HDINLINE FieldPosition()
+        /** position (float3_X) in cell for B_x, B_y, B_z
+         */
+        template<>
+        struct FieldPosition<fields::cellType::Yee, FieldB, DIM3>
         {
-        }
-    };
-
-    /** position (floatD_X in case of T_simDim == simDim) in cell, wrapped in
-     * one-component vector since it's a scalar field with only one component, for the
-     * scalar field FieldTmp
-     */
-    template< uint32_t T_simDim >
-    struct FieldPosition<
-        fields::cellType::Yee,
-        FieldTmp,
-        T_simDim
-    >
-    {
-        using FieldPos = pmacc::math::Vector<float_X, T_simDim>;
-        using ReturnType = pmacc::math::Vector<FieldPos, DIM1>;
-
-        /// boost::result_of hints
-        template<class> struct result;
-
-        template<class F>
-        struct result<F()> {
-            using type = ReturnType;
+            /** \tparam float3_X position of the component in the cell
+             *  \tparam DIM3     Fields (E/B/J) have 3 components, even in 1 or 2D !
+             */
+            using VectorVector3D3V = const ::pmacc::math::Vector<float3_X, DIM3>;
+
+            /// boost::result_of hints
+            template<class>
+            struct result;
+
+            template<class F>
+            struct result<F()>
+            {
+                using type = VectorVector3D3V;
+            };
+
+            HDINLINE FieldPosition()
+            {
+            }
+
+            HDINLINE VectorVector3D3V operator()() const
+            {
+                const float3_X posB_x(0.0, 0.5, 0.5);
+                const float3_X posB_y(0.5, 0.0, 0.5);
+                const float3_X posB_z(0.5, 0.5, 0.0);
+
+                return VectorVector3D3V(posB_x, posB_y, posB_z);
+            }
         };
 
-        HDINLINE FieldPosition()
+        /** position (floatD_X in case of T_simDim == simDim) in cell for
+         *  J_x, J_y, J_z
+         */
+        template<uint32_t T_simDim>
+        struct FieldPosition<fields::cellType::Yee, FieldJ, T_simDim>
+            : public FieldPosition<fields::cellType::Yee, FieldE, T_simDim>
         {
-        }
+            HDINLINE FieldPosition()
+            {
+            }
+        };
 
-        HDINLINE ReturnType operator()() const
+        /** position (floatD_X in case of T_simDim == simDim) in cell, wrapped in
+         * one-component vector since it's a scalar field with only one component, for the
+         * scalar field FieldTmp
+         */
+        template<uint32_t T_simDim>
+        struct FieldPosition<fields::cellType::Yee, FieldTmp, T_simDim>
         {
-            return ReturnType( FieldPos::create(0.0) );
-        }
-    };
+            using FieldPos = pmacc::math::Vector<float_X, T_simDim>;
+            using ReturnType = pmacc::math::Vector<FieldPos, DIM1>;
+
+            /// boost::result_of hints
+            template<class>
+            struct result;
+
+            template<class F>
+            struct result<F()>
+            {
+                using type = ReturnType;
+            };
+
+            HDINLINE FieldPosition()
+            {
+            }
+
+            HDINLINE ReturnType operator()() const
+            {
+                return ReturnType(FieldPos::create(0.0));
+            }
+        };
 
-} // namespace traits
+    } // namespace traits
 } // namespace picongpu
diff --git a/include/picongpu/fields/currentDeposition/Cache.hpp b/include/picongpu/fields/currentDeposition/Cache.hpp
new file mode 100644
index 0000000000..4b9ea4ec3d
--- /dev/null
+++ b/include/picongpu/fields/currentDeposition/Cache.hpp
@@ -0,0 +1,132 @@
+/* Copyright 2020-2021 Rene Widera
+ *
+ * This file is part of PIConGPU.
+ *
+ * PIConGPU is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * PIConGPU is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with PIConGPU.
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "picongpu/algorithms/Set.hpp"
+
+#include <alpaka/core/Unused.hpp>
+
+#include <pmacc/mappings/threads/ThreadCollective.hpp>
+#include <pmacc/types.hpp>
+
+
+namespace picongpu
+{
+    namespace currentSolver
+    {
+        namespace detail
+        {
+            /** Transparent cache implementation for the current solver
+             *
+             * @tparam T_Strategy Used strategy to reduce the scattered data [currentSolver::strategy]
+             * @tparam T_Sfinae Optional specialization
+             */
+            template<typename T_Strategy, typename T_Sfinae = void>
+            struct Cache;
+
+            template<typename T_Strategy>
+            struct Cache<T_Strategy, typename std::enable_if<T_Strategy::useBlockCache>::type>
+            {
+                /** Create a cache
+                 *
+                 * @attention thread-collective operation, requires external thread synchronization
+                 */
+                template<uint32_t T_numWorkers, typename T_BlockDescription, typename T_Acc, typename T_FieldBox>
+                DINLINE static auto create(T_Acc const& acc, T_FieldBox const& fieldBox, uint32_t const workerIdx)
+#if(!BOOST_COMP_CLANG)
+                    -> decltype(
+                        CachedBox::create<0u, typename T_FieldBox::ValueType>(acc, std::declval<T_BlockDescription>()))
+#endif
+                {
+                    using ValueType = typename T_FieldBox::ValueType;
+                    /* this memory is used by all virtual blocks */
+                    auto cache = CachedBox::create<0u, ValueType>(acc, T_BlockDescription{});
+
+                    Set<ValueType> set(ValueType::create(0.0_X));
+                    ThreadCollective<T_BlockDescription, T_numWorkers> collectiveFill(workerIdx);
+
+                    /* initialize shared memory with zeros */
+                    collectiveFill(acc, set, cache);
+                    return cache;
+                }
+
+                /** Flush the cache
+                 *
+                 * @attention thread-collective operation, requires external thread synchronization
+                 */
+                template<
+                    uint32_t T_numWorkers,
+                    typename T_BlockDescription,
+                    typename T_Acc,
+                    typename T_FieldBox,
+                    typename T_FieldCache>
+                DINLINE static void flush(
+                    T_Acc const& acc,
+                    T_FieldBox fieldBox,
+                    T_FieldCache const& cachedBox,
+                    uint32_t const workerIdx)
+                {
+                    typename T_Strategy::GridReductionOp const op;
+                    ThreadCollective<T_BlockDescription, T_numWorkers> collectiveAdd(workerIdx);
+
+                    /* write scatter results back to the global memory */
+                    collectiveAdd(acc, op, fieldBox, cachedBox);
+                }
+            };
+
+            template<typename T_Strategy>
+            struct Cache<T_Strategy, typename std::enable_if<!T_Strategy::useBlockCache>::type>
+            {
+                /** Create a cache
+                 *
+                 * @attention thread-collective operation, requires external thread synchronization
+                 */
+                template<uint32_t T_numWorkers, typename T_BlockDescription, typename T_Acc, typename T_FieldBox>
+                DINLINE static auto create(T_Acc const& acc, T_FieldBox const& fieldBox, uint32_t const workerIdx)
+#if(!BOOST_COMP_CLANG)
+                    -> T_FieldBox
+#endif
+                {
+                    alpaka::ignore_unused(acc, workerIdx);
+                    return fieldBox;
+                }
+
+                /** Flush the cache
+                 *
+                 * @attention thread-collective operation, requires external thread synchronization
+                 */
+                template<
+                    uint32_t T_numWorkers,
+                    typename T_BlockDescription,
+                    typename T_Acc,
+                    typename T_FieldBox,
+                    typename T_FieldCache>
+                DINLINE static void flush(
+                    T_Acc const& acc,
+                    T_FieldBox fieldBox,
+                    T_FieldCache const& cachedBox,
+                    uint32_t const workerIdx)
+                {
+                    alpaka::ignore_unused(acc, fieldBox, cachedBox, workerIdx);
+                }
+            };
+        } // namespace detail
+    } // namespace currentSolver
+} // namespace picongpu
diff --git a/include/picongpu/fields/currentDeposition/Deposit.hpp b/include/picongpu/fields/currentDeposition/Deposit.hpp
new file mode 100644
index 0000000000..144083afcf
--- /dev/null
+++ b/include/picongpu/fields/currentDeposition/Deposit.hpp
@@ -0,0 +1,125 @@
+/* Copyright 2020-2021 Rene Widera
+ *
+ * This file is part of PIConGPU.
+ *
+ * PIConGPU is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * PIConGPU is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with PIConGPU.
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "picongpu/particles/traits/GetCurrentSolver.hpp"
+#include "picongpu/traits/GetMargin.hpp"
+
+#include <pmacc/mappings/kernel/AreaMapping.hpp>
+#include <pmacc/mappings/kernel/StrideMapping.hpp>
+#include <pmacc/math/Vector.hpp>
+#include <pmacc/types.hpp>
+
+
+namespace picongpu
+{
+    namespace currentSolver
+    {
+        /** Executes the current deposition kernel
+         *
+         * @tparam T_Strategy Used strategy to reduce the scattered data [currentSolver::strategy]
+         * @tparam T_Sfinae Optional specialization
+         */
+        template<typename T_Strategy, typename T_Sfinae = void>
+        struct Deposit;
+
+        template<typename T_Strategy>
+        struct Deposit<T_Strategy, typename std::enable_if<T_Strategy::stridedMapping>::type>
+        {
+            /** Execute the current deposition with a checker board
+             *
+             * The stride between the supercells for the checker board will be automatically
+             * adjusted, based on the species shape.
+             */
+            template<
+                uint32_t T_area,
+                uint32_t T_numWorkers,
+                typename T_CellDescription,
+                typename T_DepositionKernel,
+                typename T_FrameSolver,
+                typename T_JBox,
+                typename T_ParticleBox>
+            void execute(
+                T_CellDescription const& cellDescription,
+                T_DepositionKernel const& depositionKernel,
+                T_FrameSolver const& frameSolver,
+                T_JBox const& jBox,
+                T_ParticleBox const& parBox) const
+            {
+                /* The needed stride for the stride mapper depends on the stencil width.
+                 * If the upper and lower margins of the stencil fit into one supercell,
+                 * a double checker board (stride 2) is needed.
+                 * The largest summed margin, rounded up to whole supercells, is the number of supercells to skip.
+                 */
+                using MarginPerDim = typename pmacc::math::CT::add<
+                    typename GetMargin<typename T_FrameSolver::ParticleAlgo>::LowerMargin,
+                    typename GetMargin<typename T_FrameSolver::ParticleAlgo>::UpperMargin>::type;
+                using MaxMargin = typename pmacc::math::CT::max<MarginPerDim>::type;
+                using SuperCellMinSize = typename pmacc::math::CT::min<SuperCellSize>::type;
+
+                /* number of supercells which must be skipped to avoid overlapping areas
+                 * between different blocks in the kernel
+                 */
+                constexpr uint32_t skipSuperCells
+                    = (MaxMargin::value + SuperCellMinSize::value - 1u) / SuperCellMinSize::value;
+                StrideMapping<
+                    T_area,
+                    skipSuperCells + 1u, // stride 1u means each supercell is used
+                    MappingDesc>
+                    mapper(cellDescription);
+
+                do
+                {
+                    PMACC_KERNEL(depositionKernel)
+                    (mapper.getGridDim(), T_numWorkers)(jBox, parBox, frameSolver, mapper);
+                } while(mapper.next());
+            }
+        };
+
+        template<typename T_Strategy>
+        struct Deposit<T_Strategy, typename std::enable_if<!T_Strategy::stridedMapping>::type>
+        {
+            /** Execute the current deposition for each supercell
+             *
+             * All supercells will be processed in parallel.
+             */
+            template<
+                uint32_t T_area,
+                uint32_t T_numWorkers,
+                typename T_CellDescription,
+                typename T_DepositionKernel,
+                typename T_FrameSolver,
+                typename T_JBox,
+                typename T_ParticleBox>
+            void execute(
+                T_CellDescription const& cellDescription,
+                T_DepositionKernel const& depositionKernel,
+                T_FrameSolver const& frameSolver,
+                T_JBox const& jBox,
+                T_ParticleBox const& parBox) const
+            {
+                AreaMapping<T_area, MappingDesc> mapper(cellDescription);
+
+                PMACC_KERNEL(depositionKernel)(mapper.getGridDim(), T_numWorkers)(jBox, parBox, frameSolver, mapper);
+            }
+        };
+
+    } // namespace currentSolver
+} // namespace picongpu
diff --git a/include/picongpu/fields/currentDeposition/EmZ/DepositCurrent.hpp b/include/picongpu/fields/currentDeposition/EmZ/DepositCurrent.hpp
index c7fd53a68c..3d2a40751d 100644
--- a/include/picongpu/fields/currentDeposition/EmZ/DepositCurrent.hpp
+++ b/include/picongpu/fields/currentDeposition/EmZ/DepositCurrent.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Heiko Burau, Rene Widera
+/* Copyright 2013-2021 Axel Huebl, Heiko Burau, Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -21,7 +21,6 @@
 
 #include <pmacc/cuSTL/cursor/Cursor.hpp>
 #include <pmacc/cuSTL/cursor/tools/twistVectorFieldAxes.hpp>
-#include <pmacc/nvidia/atomic.hpp>
 
 #include "picongpu/fields/currentDeposition/EmZ/EmZ.def"
 #include "picongpu/fields/currentDeposition/Esirkepov/Line.hpp"
@@ -29,320 +28,224 @@
 
 namespace picongpu
 {
-namespace currentSolver
-{
-namespace emz
-{
-    using namespace pmacc;
-
-    template<
-        typename ParticleAssign
-    >
-    struct BaseMethods
+    namespace currentSolver
     {
-        /** evaluate particle shape
-         * @param line element with previous and current position of the particle
-         * @param gridPoint used grid point to evaluate assignment shape
-         * @param d dimension range {0,1,2} means {x,y,z}
-         *          different to Esirkepov paper, here we use C style
-         * @{
-         */
-
-        /** evaluate shape for the first particle S0 (see paper) */
-        DINLINE float_X
-        S0(
-            const Line< floatD_X >& line,
-            const float_X gridPoint,
-            const uint32_t d
-        ) const
-        {
-            return ParticleAssign( )( gridPoint - line.m_pos0[d] );
-        }
-
-        /** evaluate shape for the second particle */
-        DINLINE float_X
-        S1(
-            const Line< floatD_X >& line,
-            const float_X gridPoint,
-            const uint32_t d
-        ) const
+        namespace emz
         {
-            return ParticleAssign( )( gridPoint - line.m_pos1[d] );
-        }
-        /*! @} */
+            using namespace pmacc;
 
-        /** calculate DS (see paper)
-         * @param line element with previous and current position of the particle
-         * @param gridPoint used grid point to evaluate assignment shape
-         * @param d dimension range {0,1,2} means {x,y,z}]
-         *          different to Esirkepov paper, here we use C style
-         */
-        DINLINE float_X
-        DS(
-            const Line<floatD_X>& line,
-            const float_X gridPoint,
-            const uint32_t d
-        ) const
-        {
-            return ParticleAssign( )( gridPoint - line.m_pos1[d] ) - ParticleAssign( )( gridPoint - line.m_pos0[d] );
-        }
-    };
+            template<typename ParticleAssign>
+            struct BaseMethods
+            {
+                /** evaluate particle shape
+                 * @param line element with previous and current position of the particle
+                 * @param gridPoint used grid point to evaluate assignment shape
+                 * @param d dimension range {0,1,2} means {x,y,z}
+                 *          different to Esirkepov paper, here we use C style
+                 * @{
+                 */
+
+                /** evaluate shape for the first particle S0 (see paper) */
+                DINLINE float_X S0(const Line<floatD_X>& line, const float_X gridPoint, const uint32_t d) const
+                {
+                    return ParticleAssign()(gridPoint - line.m_pos0[d]);
+                }
 
-    template<
-        typename ParticleAssign,
-        int T_begin,
-        int T_end
-    >
-    struct DepositCurrent<
-        ParticleAssign,
-        T_begin,
-        T_end,
-        DIM3
-    > : public BaseMethods< ParticleAssign >
-    {
-        template<
-            typename T_Cursor,
-            typename T_Acc
-        >
-        DINLINE void
-        operator()(
-            T_Acc const & acc,
-            const T_Cursor& cursorJ,
-            const Line< float3_X >& line,
-            const float_X chargeDensity,
-            const float_X
-        ) const
-        {
-            /**
-             * \brief the following three calls separate the 3D current deposition
-             * into three independent 1D calls, each for one direction and current component.
-             * Therefore the coordinate system has to be rotated so that the z-direction
-             * is always specific.
-             */
-            using namespace cursor::tools;
-            cptCurrent1D(
-                acc,
-                twistVectorFieldAxes< pmacc::math::CT::Int < 1, 2, 0 > >( cursorJ ),
-                rotateOrigin< 1, 2, 0 >( line ),
-                cellSize.x( ) * chargeDensity / DELTA_T
-            );
-            cptCurrent1D(
-                acc,
-                twistVectorFieldAxes< pmacc::math::CT::Int < 2, 0, 1 > >( cursorJ ),
-                rotateOrigin< 2, 0, 1 >( line ),
-                cellSize.y( ) * chargeDensity / DELTA_T
-            );
-            cptCurrent1D(
-                acc,
-                cursorJ,
-                line,
-                cellSize.z( ) * chargeDensity / DELTA_T
-            );
-        }
+                /** evaluate shape for the second particle */
+                DINLINE float_X S1(const Line<floatD_X>& line, const float_X gridPoint, const uint32_t d) const
+                {
+                    return ParticleAssign()(gridPoint - line.m_pos1[d]);
+                }
+                /*! @} */
+
+                /** calculate DS (see paper)
+                 * @param line element with previous and current position of the particle
+                 * @param gridPoint used grid point to evaluate assignment shape
+                 * @param d dimension range {0,1,2} means {x,y,z}
+                 *          different to Esirkepov paper, here we use C style
+                 */
+                DINLINE float_X DS(const Line<floatD_X>& line, const float_X gridPoint, const uint32_t d) const
+                {
+                    return ParticleAssign()(gridPoint - line.m_pos1[d]) - ParticleAssign()(gridPoint - line.m_pos0[d]);
+                }
+            };
 
-        /** deposites current in z-direction
-         *
-         * \param cursorJ cursor pointing at the current density field of the particle's cell
-         * \param line trajectory of the virtual particle
-         * \param currentSurfaceDensity surface density
-         */
-        template<
-            typename CursorJ,
-            typename T_Line,
-            typename T_Acc
-        >
-        DINLINE void
-        cptCurrent1D(
-            T_Acc const & acc,
-            CursorJ cursorJ,
-            const T_Line& line,
-            const float_X currentSurfaceDensity
-        ) const
-        {
-            if( line.m_pos0[2] == line.m_pos1[2] )
-                return;
-            /* pick every cell in the xy-plane that is overlapped by particle's
-             * form factor and deposit the current for the cells above and beneath
-             * that cell and for the cell itself.
-             */
-            for( int i = T_begin ; i < T_end ; ++i )
+            template<typename T_AtomicAddOp, typename ParticleAssign, int T_begin, int T_end>
+            struct DepositCurrent<T_AtomicAddOp, ParticleAssign, T_begin, T_end, DIM3>
+                : public BaseMethods<ParticleAssign>
             {
-                const float_X s0i = this->S0( line, i, 0 );
-                const float_X dsi = this->S1( line, i, 0 ) - s0i;
-                for( int j = T_begin ; j < T_end ; ++j )
+                template<typename T_Cursor, typename T_Acc>
+                DINLINE void operator()(
+                    T_Acc const& acc,
+                    const T_Cursor& cursorJ,
+                    const Line<float3_X>& line,
+                    const float_X chargeDensity,
+                    const float_X) const
                 {
-                    const float_X s0j = this->S0( line, j, 1 );
-                    const float_X dsj = this->S1( line, j, 1 ) - s0j;
-
-                    float_X tmp =
-                        -currentSurfaceDensity * (
-                            s0i * s0j +
-                            float_X( 0.5 ) * ( dsi * s0j + s0i * dsj ) +
-                            ( float_X( 1.0 ) / float_X( 3.0 ) ) * dsj * dsi
-                        );
+                    /**
+                     * \brief the following three calls separate the 3D current deposition
+                     * into three independent 1D calls, each for one direction and current component.
+                     * For the first two calls the cursor and the line are rotated so that the
+                     * direction to deposit is always addressed as the z-direction by cptCurrent1D.
+                     */
+                    using namespace cursor::tools;
+                    cptCurrent1D(
+                        acc,
+                        twistVectorFieldAxes<pmacc::math::CT::Int<1, 2, 0>>(cursorJ),
+                        rotateOrigin<1, 2, 0>(line),
+                        cellSize.x() * chargeDensity / DELTA_T);
+                    cptCurrent1D(
+                        acc,
+                        twistVectorFieldAxes<pmacc::math::CT::Int<2, 0, 1>>(cursorJ),
+                        rotateOrigin<2, 0, 1>(line),
+                        cellSize.y() * chargeDensity / DELTA_T);
+                    cptCurrent1D(acc, cursorJ, line, cellSize.z() * chargeDensity / DELTA_T);
+                }
 
-                    float_X accumulated_J = float_X( 0.0 );
-                    for( int k = T_begin ; k < T_end - 1 ; ++k )
+                /** deposits current in z-direction
+                 *
+                 * \param cursorJ cursor pointing at the current density field of the particle's cell
+                 * \param line trajectory of the virtual particle
+                 * \param currentSurfaceDensity surface density
+                 */
+                template<typename CursorJ, typename T_Line, typename T_Acc>
+                DINLINE void cptCurrent1D(
+                    T_Acc const& acc,
+                    CursorJ cursorJ,
+                    const T_Line& line,
+                    const float_X currentSurfaceDensity) const
+                {
+                    if(line.m_pos0[2] == line.m_pos1[2])
+                        return;
+                    /* pick every cell in the xy-plane that is overlapped by particle's
+                     * form factor and deposit the current for the cells above and beneath
+                     * that cell and for the cell itself.
+                     */
+                    for(int i = T_begin; i < T_end; ++i)
                     {
-                        /* This is the implementation of the FORTRAN W(i,j,k,3)/ C style W(i,j,k,2) version from
-                         * Esirkepov paper. All coordinates are rotated before thus we can
-                         * always use C style W(i,j,k,2).
-                         */
-                        const float_X W = this->DS( line, k, 2 ) * tmp;
-                        accumulated_J += W;
-                        atomicAdd(
-                            &( (*cursorJ( i, j, k ) ).z( ) ),
-                            accumulated_J,
-                            ::alpaka::hierarchy::Threads{}
-                        );
+                        const float_X s0i = this->S0(line, i, 0);
+                        const float_X dsi = this->S1(line, i, 0) - s0i;
+                        for(int j = T_begin; j < T_end; ++j)
+                        {
+                            const float_X s0j = this->S0(line, j, 1);
+                            const float_X dsj = this->S1(line, j, 1) - s0j;
+
+                            float_X tmp = -currentSurfaceDensity
+                                * (s0i * s0j + float_X(0.5) * (dsi * s0j + s0i * dsj)
+                                   + (float_X(1.0) / float_X(3.0)) * dsj * dsi);
+
+                            float_X accumulated_J = float_X(0.0);
+                            for(int k = T_begin; k < T_end - 1; ++k)
+                            {
+                                /* This is the implementation of the FORTRAN W(i,j,k,3)/ C style W(i,j,k,2) version
+                                 * from Esirkepov paper. All coordinates are rotated before thus we can always use C
+                                 * style W(i,j,k,2).
+                                 */
+                                const float_X W = this->DS(line, k, 2) * tmp;
+                                accumulated_J += W;
+                                auto const atomicOp = T_AtomicAddOp{};
+                                atomicOp(acc, (*cursorJ(i, j, k)).z(), accumulated_J);
+                            }
+                        }
                     }
                 }
-            }
-        }
-    };
-
-    template<
-        typename ParticleAssign,
-        int T_begin,
-        int T_end
-    >
-    struct DepositCurrent<
-        ParticleAssign,
-        T_begin,
-        T_end,
-        DIM2
-    > : public BaseMethods< ParticleAssign >
-    {
-        template<
-            typename T_Cursor,
-            typename T_Acc
-        >
-        DINLINE void
-        operator()(
-            T_Acc const & acc,
-            const T_Cursor& cursorJ,
-            const Line< float2_X >& line,
-            const float_X chargeDensity,
-            const float_X velocityZ
-        ) const
-        {
-            using namespace cursor::tools;
-            cptCurrent1D(
-                acc,
-                cursorJ,
-                line,
-                cellSize.x( ) * chargeDensity / DELTA_T
-            );
-            cptCurrent1D(
-                acc,
-                twistVectorFieldAxes< pmacc::math::CT::Int < 1, 0 > >( cursorJ ),
-                rotateOrigin < 1, 0 > ( line ),
-                cellSize.y( ) * chargeDensity / DELTA_T
-            );
-            cptCurrentZ(
-                acc,
-                cursorJ,
-                line,
-                velocityZ * chargeDensity
-            );
-        }
+            };
 
-        /** deposites current in x-direction
-         *
-         * \param cursorJ cursor pointing at the current density field of the particle's cell
-         * \param line trajectory of the virtual particle
-         * \param currentSurfaceDensity surface density
-         */
-        template<
-            typename CursorJ,
-            typename T_Line,
-            typename T_Acc
-        >
-        DINLINE void
-        cptCurrent1D(
-            T_Acc const & acc,
-            CursorJ cursorJ,
-            const T_Line& line,
-            const float_X currentSurfaceDensity
-        ) const
-        {
-            if( line.m_pos0[0] == line.m_pos1[0] )
-                return;
-
-            for( int j = T_begin; j < T_end; ++j )
+            template<typename T_AtomicAddOp, typename ParticleAssign, int T_begin, int T_end>
+            struct DepositCurrent<T_AtomicAddOp, ParticleAssign, T_begin, T_end, DIM2>
+                : public BaseMethods<ParticleAssign>
             {
-                const float_X s0j = this->S0( line, j, 1 );
-                const float_X dsj = this->S1( line, j, 1 ) - s0j;
-
-                float_X tmp = -currentSurfaceDensity *
-                    (
-                        s0j +
-                        float_X( 0.5 ) * dsj
-                    );
-
-                float_X accumulated_J = float_X( 0.0 );
-                for( int i = T_begin; i < T_end - 1; ++i )
+                template<typename T_Cursor, typename T_Acc>
+                DINLINE void operator()(
+                    T_Acc const& acc,
+                    const T_Cursor& cursorJ,
+                    const Line<float2_X>& line,
+                    const float_X chargeDensity,
+                    const float_X velocityZ) const
                 {
-                    /* This is the implementation of the FORTRAN W(i,j,k,1)/ C style W(i,j,k,0) version from
-                     * Esirkepov paper. All coordinates are rotated before thus we can
-                     * always use C style W(i,j,k,0).
-                     */
-                    const float_X W = this->DS( line, i, 0 ) * tmp;
-                    accumulated_J += W;
-                    atomicAdd(
-                        &( ( *cursorJ( i, j ) ).x( ) ),
-                        accumulated_J,
-                        ::alpaka::hierarchy::Threads{}
-                    );
+                    using namespace cursor::tools;
+                    cptCurrent1D(acc, cursorJ, line, cellSize.x() * chargeDensity / DELTA_T);
+                    cptCurrent1D(
+                        acc,
+                        twistVectorFieldAxes<pmacc::math::CT::Int<1, 0>>(cursorJ),
+                        rotateOrigin<1, 0>(line),
+                        cellSize.y() * chargeDensity / DELTA_T);
+                    cptCurrentZ(acc, cursorJ, line, velocityZ * chargeDensity);
                 }
-            }
-        }
 
-        /** deposites current in z-direction
-         *
-         * \param cursorJ cursor pointing at the current density field of the particle's cell
-         * \param line trajectory of the virtual particle
-         * \param currentSurfaceDensityZ surface density in z direction
-         */
-        template<
-            typename CursorJ,
-            typename T_Line,
-            typename T_Acc
-        >
-        DINLINE void
-        cptCurrentZ(
-            T_Acc const & acc,
-            CursorJ cursorJ,
-            const T_Line& line,
-            const float_X currentSurfaceDensityZ
-        ) const
-        {
-            if( currentSurfaceDensityZ == float_X( 0.0 ) )
-                return;
+                /** deposits current in x-direction
+                 *
+                 * \param cursorJ cursor pointing at the current density field of the particle's cell
+                 * \param line trajectory of the virtual particle
+                 * \param currentSurfaceDensity surface density
+                 */
+                template<typename CursorJ, typename T_Line, typename T_Acc>
+                DINLINE void cptCurrent1D(
+                    T_Acc const& acc,
+                    CursorJ cursorJ,
+                    const T_Line& line,
+                    const float_X currentSurfaceDensity) const
+                {
+                    if(line.m_pos0[0] == line.m_pos1[0])
+                        return;
 
-            for( int j = T_begin; j < T_end; ++j )
-            {
-                const float_X s0j = this->S0( line, j, 1 );
-                const float_X dsj = this->S1( line, j, 1 ) - s0j;
-                for( int i = T_begin; i < T_end; ++i )
+                    for(int j = T_begin; j < T_end; ++j)
+                    {
+                        const float_X s0j = this->S0(line, j, 1);
+                        const float_X dsj = this->S1(line, j, 1) - s0j;
+
+                        float_X tmp = -currentSurfaceDensity * (s0j + float_X(0.5) * dsj);
+
+                        float_X accumulated_J = float_X(0.0);
+                        for(int i = T_begin; i < T_end - 1; ++i)
+                        {
+                            /* This is the implementation of the FORTRAN W(i,j,k,1)/ C style W(i,j,k,0) version from
+                             * Esirkepov paper. All coordinates are rotated before thus we can
+                             * always use C style W(i,j,k,0).
+                             */
+                            const float_X W = this->DS(line, i, 0) * tmp;
+                            accumulated_J += W;
+                            auto const atomicOp = T_AtomicAddOp{};
+                            atomicOp(acc, (*cursorJ(i, j)).x(), accumulated_J);
+                        }
+                    }
+                }
+
+                /** deposits current in z-direction
+                 *
+                 * \param cursorJ cursor pointing at the current density field of the particle's cell
+                 * \param line trajectory of the virtual particle
+                 * \param currentSurfaceDensityZ surface density in z direction
+                 */
+                template<typename CursorJ, typename T_Line, typename T_Acc>
+                DINLINE void cptCurrentZ(
+                    T_Acc const& acc,
+                    CursorJ cursorJ,
+                    const T_Line& line,
+                    const float_X currentSurfaceDensityZ) const
                 {
-                    const float_X s0i = this->S0( line, i, 0 );
-                    const float_X dsi = this->S1( line, i, 0 ) - s0i;
-                    float_X W = s0i * this->S0( line, j, 1 ) +
-                        float_X( 0.5 ) * ( dsi * s0j + s0i * dsj ) +
-                        ( float_X( 1.0 ) / float_X( 3.0 ) ) * dsi * dsj;
+                    if(currentSurfaceDensityZ == float_X(0.0))
+                        return;
 
-                    const float_X j_z = W * currentSurfaceDensityZ;
-                    atomicAdd(
-                        &( ( *cursorJ( i, j ) ).z( ) ),
-                        j_z,
-                        ::alpaka::hierarchy::Threads{}
-                    );
+                    for(int j = T_begin; j < T_end; ++j)
+                    {
+                        const float_X s0j = this->S0(line, j, 1);
+                        const float_X dsj = this->S1(line, j, 1) - s0j;
+                        for(int i = T_begin; i < T_end; ++i)
+                        {
+                            const float_X s0i = this->S0(line, i, 0);
+                            const float_X dsi = this->S1(line, i, 0) - s0i;
+                            float_X W = s0i * this->S0(line, j, 1) + float_X(0.5) * (dsi * s0j + s0i * dsj)
+                                + (float_X(1.0) / float_X(3.0)) * dsi * dsj;
+
+                            const float_X j_z = W * currentSurfaceDensityZ;
+                            auto const atomicOp = T_AtomicAddOp{};
+                            atomicOp(acc, (*cursorJ(i, j)).z(), j_z);
+                        }
+                    }
                 }
-            }
-        }
-    };
+            };
 
-} // namespace emz
-} // namespace currentSolver
+        } // namespace emz
+    } // namespace currentSolver
 } // namespace picongpu
diff --git a/include/picongpu/fields/currentDeposition/EmZ/EmZ.def b/include/picongpu/fields/currentDeposition/EmZ/EmZ.def
index 252fe57bcf..d5f224fd91 100644
--- a/include/picongpu/fields/currentDeposition/EmZ/EmZ.def
+++ b/include/picongpu/fields/currentDeposition/EmZ/EmZ.def
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Heiko Burau, Rene Widera
+/* Copyright 2013-2021 Axel Huebl, Heiko Burau, Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -17,76 +17,75 @@
  * If not, see <http://www.gnu.org/licenses/>.
  */
 
-
 #pragma once
 
 #include "picongpu/simulation_defines.hpp"
+#include "picongpu/fields/currentDeposition/Strategy.def"
 
 
 namespace picongpu
 {
-namespace currentSolver
-{
-
-namespace emz
-{
-    template<
-        typename ParticleAssign,
-        int T_begin,
-        int T_end,
-        uint32_t T_dim = simDim
-    >
-    struct DepositCurrent;
-} //namespace emz
+    namespace currentSolver
+    {
+        namespace emz
+        {
+            template<typename T_AtomicAddOp, typename ParticleAssign, int T_begin, int T_end, uint32_t T_dim = simDim>
+            struct DepositCurrent;
+        } // namespace emz
 
-/** EmZ (Esirkepov meets ZigZag) current deposition
- *
- * Deposit the particle current with a mixed algorithm based on Esirkepov and
- * the ZigZag way splitting.
- * EmZ supports arbitrary symmetric shapes and 2D/3D cartesian grids.
- *
- * ZigZag publications:
- * 1. order paper: "A new charge conservation method in electromagnetic
- *                  particle-in-cell simulations", Comput. Phys. Commun. (2003)
- *                  T. Umeda, Y. Omura, T. Tominaga, H. Matsumoto
- *                  DOI: 10.1016/S0010-4655(03)00437-5
- * 2. order paper: "Charge conservation methods for computing current densities
- *                  in electromagnetic particle-in-cell simulations",
- *                  Proceedings of ISSS. Vol. 7. 2005
- *                  T. Umeda, Y. Omura, H. Matsumoto
- * 3. order paper: "High-Order Interpolation Algorithms for Charge Conservation
- *                  in Particle-in-Cell Simulation", Commun. Comput. Phys 13 (2013)
- *                  Jinqing Yu, Xiaolin Jin, Weimin Zhou, Bin Li, Yuqiu Gu
- *                  DOI:10.1109/ICCIS.2012.159
- *
- * \tparam T_ParticleShape the particle shape for the species, \see picongpu::particles::shapes
- *
- */
-template< typename ParticleShape >
-struct EmZ;
+        /** EmZ (Esirkepov meets ZigZag) current deposition
+         *
+         * Deposit the particle current with a mixed algorithm based on Esirkepov and
+         * the ZigZag way splitting.
+         * EmZ supports arbitrary symmetric shapes and 2D/3D cartesian grids.
+         *
+         * ZigZag publications:
+         * 1. order paper: "A new charge conservation method in electromagnetic
+         *                  particle-in-cell simulations", Comput. Phys. Commun. (2003)
+         *                  T. Umeda, Y. Omura, T. Tominaga, H. Matsumoto
+         *                  DOI: 10.1016/S0010-4655(03)00437-5
+         * 2. order paper: "Charge conservation methods for computing current densities
+         *                  in electromagnetic particle-in-cell simulations",
+         *                  Proceedings of ISSS. Vol. 7. 2005
+         *                  T. Umeda, Y. Omura, H. Matsumoto
+         * 3. order paper: "High-Order Interpolation Algorithms for Charge Conservation
+         *                  in Particle-in-Cell Simulation", Commun. Comput. Phys 13 (2013)
+         *                  Jinqing Yu, Xiaolin Jin, Weimin Zhou, Bin Li, Yuqiu Gu
+         *                  DOI:10.1109/ICCIS.2012.159
+         *
+         * @tparam T_ParticleShape the particle shape for the species [picongpu::particles::shapes]
+         * @tparam T_Strategy Used strategy to reduce the scattered data [currentSolver::strategy]
+         *
+         */
+        template<typename T_ParticleShape, typename T_Strategy = traits::GetDefaultStrategy_t<>>
+        struct EmZ;
 
-} //namespace currentSolver
+        namespace traits
+        {
+            template<typename T_ParticleShape, typename T_Strategy>
+            struct GetStrategy<EmZ<T_ParticleShape, T_Strategy>>
+            {
+                using type = T_Strategy;
+            };
+        } // namespace traits
 
-namespace traits
-{
+    } // namespace currentSolver
 
-/*Get margin of a solver
- * class must define a LowerMargin and UpperMargin
- */
-template< typename ParticleShape >
-struct GetMargin<
-    picongpu::currentSolver::EmZ<
-        ParticleShape
-    >
->
-{
-private:
-    typedef picongpu::currentSolver::EmZ< ParticleShape > Solver;
-public:
-    typedef typename Solver::LowerMargin LowerMargin;
-    typedef typename Solver::UpperMargin UpperMargin;
-};
+    namespace traits
+    {
+        /** Get the margins of a solver
+         * The solver class must define a LowerMargin and an UpperMargin type
+         */
+        template<typename T_ParticleShape, typename T_Strategy>
+        struct GetMargin<picongpu::currentSolver::EmZ<T_ParticleShape, T_Strategy>>
+        {
+        private:
+            using Solver = picongpu::currentSolver::EmZ<T_ParticleShape, T_Strategy>;
 
-} //namespace traits
+        public:
+            using LowerMargin = typename Solver::LowerMargin;
+            using UpperMargin = typename Solver::UpperMargin;
+        };
 
-} //namespace picongpu
+    } // namespace traits
+} // namespace picongpu
diff --git a/include/picongpu/fields/currentDeposition/EmZ/EmZ.hpp b/include/picongpu/fields/currentDeposition/EmZ/EmZ.hpp
index 4168f42a77..679999c96b 100644
--- a/include/picongpu/fields/currentDeposition/EmZ/EmZ.hpp
+++ b/include/picongpu/fields/currentDeposition/EmZ/EmZ.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2016-2020 Rene Widera
+/* Copyright 2016-2021 Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -26,174 +26,126 @@
 #include "picongpu/fields/currentDeposition/EmZ/DepositCurrent.hpp"
 #include "picongpu/fields/currentDeposition/Esirkepov/Line.hpp"
 
-namespace picongpu
-{
-namespace currentSolver
-{
 
-template<
-    typename T_ParticleShape
->
-struct EmZ
+namespace picongpu
 {
-    using ParticleAssign = typename T_ParticleShape::ChargeAssignmentOnSupport;
-    static constexpr int supp = ParticleAssign::support;
-
-    static constexpr int currentLowerMargin = supp / 2 + 1 - (supp + 1) % 2;
-    static constexpr int currentUpperMargin = (supp + 1) / 2 + 1;
-    typedef typename pmacc::math::CT::make_Int<simDim, currentLowerMargin>::type LowerMargin;
-    typedef typename pmacc::math::CT::make_Int<simDim, currentUpperMargin>::type UpperMargin;
-
-    PMACC_CASSERT_MSG(
-        __EmZ_supercell_or_number_of_guard_supercells_is_too_small_for_stencil,
-        pmacc::math::CT::min<
-            typename pmacc::math::CT::mul<
-                SuperCellSize,
-                GuardSize
-            >::type
-        >::type::value >= currentLowerMargin &&
-        pmacc::math::CT::min<
-            typename pmacc::math::CT::mul<
-                SuperCellSize,
-                GuardSize
-            >::type
-        >::type::value >= currentUpperMargin
-    );
-
-
-    static constexpr int begin = -currentLowerMargin + 1;
-    static constexpr int end = begin + supp;
-
-
-    /** deposit the current of a particle
-     *
-     * @tparam DataBoxJ any pmacc DataBox
-     *
-     * @param dataBoxJ box shifted to the cell of particle
-     * @param posEnd position of the particle after it is pushed
-     * @param velocity velocity of the particle
-     * @param charge charge of the particle
-     * @param deltaTime time of one time step
-     */
-    template<
-        typename DataBoxJ,
-        typename T_Acc
-    >
-    DINLINE void
-    operator()(
-        T_Acc const & acc,
-        DataBoxJ dataBoxJ,
-        floatD_X const posEnd,
-        float3_X const velocity,
-        float_X const charge,
-        float_X const/* deltaTime */
-    )
+    namespace currentSolver
     {
-        floatD_X deltaPos;
-        for ( uint32_t d = 0; d < simDim; ++d )
-            deltaPos[d] = ( velocity[d] * DELTA_T ) / cellSize[d];
+        template<typename T_ParticleShape, typename T_Strategy>
+        struct EmZ
+        {
+            using ParticleAssign = typename T_ParticleShape::ChargeAssignmentOnSupport;
+            static constexpr int supp = ParticleAssign::support;
+
+            static constexpr int currentLowerMargin = supp / 2 + 1 - (supp + 1) % 2;
+            static constexpr int currentUpperMargin = (supp + 1) / 2 + 1;
+            typedef typename pmacc::math::CT::make_Int<simDim, currentLowerMargin>::type LowerMargin;
+            typedef typename pmacc::math::CT::make_Int<simDim, currentUpperMargin>::type UpperMargin;
+
+            PMACC_CASSERT_MSG(
+                __EmZ_supercell_or_number_of_guard_supercells_is_too_small_for_stencil,
+                pmacc::math::CT::min<typename pmacc::math::CT::mul<SuperCellSize, GuardSize>::type>::type::value
+                        >= currentLowerMargin
+                    && pmacc::math::CT::min<typename pmacc::math::CT::mul<SuperCellSize, GuardSize>::type>::type::value
+                        >= currentUpperMargin);
+
+
+            static constexpr int begin = -currentLowerMargin + 1;
+            static constexpr int end = begin + supp;
+
+
+            /** deposit the current of a particle
+             *
+             * @tparam DataBoxJ any pmacc DataBox
+             * @param acc forwarded unchanged to the deposit functor
+             * @param dataBoxJ box shifted to the cell of the particle
+             * @param posEnd position of the particle after it is pushed
+             * @param velocity velocity of the particle
+             * @param charge charge of the particle
+             * @param deltaTime unused (the parameter is unnamed), DELTA_T is used instead
+             */
+            template<typename DataBoxJ, typename T_Acc>
+            DINLINE void operator()(
+                T_Acc const& acc,
+                DataBoxJ dataBoxJ,
+                floatD_X const posEnd,
+                float3_X const velocity,
+                float_X const charge,
+                float_X const /* deltaTime */
+            )
+            {
+                floatD_X deltaPos;
+                for(uint32_t d = 0; d < simDim; ++d)
+                    deltaPos[d] = (velocity[d] * DELTA_T) / cellSize[d];
+
+                /* note: all positions are normalized to the grid */
+                const floatD_X posStart(posEnd - deltaPos);
+
+                DataSpace<simDim> I[2];
+                floatD_X relayPoint;
+
+                /* calculate the relay point for the trajectory splitting, this also fills I[0] and I[1] */
+                for(uint32_t d = 0; d < simDim; ++d)
+                {
+                    constexpr bool isSupportEven = (supp % 2 == 0);
+                    relayPoint[d] = RelayPoint<isSupportEven>()(I[0][d], I[1][d], posStart[d], posEnd[d]);
+                }
+
+                Line<floatD_X> line;
+                const float_X chargeDensity = charge / CELL_VOLUME;
+
+                /* Esirkepov implementation for the current deposition */
+                emz::DepositCurrent<typename T_Strategy::BlockReductionOp, ParticleAssign, begin, end> deposit;
+
+                /* calculate positions for the first virtual particle (start point -> relay point) */
+                for(uint32_t d = 0; d < simDim; ++d)
+                {
+                    line.m_pos0[d] = calc_InCellPos(posStart[d], I[0][d]);
+                    line.m_pos1[d] = calc_InCellPos(relayPoint[d], I[0][d]);
+                }
+
+                const bool twoParticlesNeeded = I[0] != I[1];
+
+                deposit(
+                    acc,
+                    dataBoxJ.shift(I[0]).toCursor(),
+                    line,
+                    chargeDensity,
+                    velocity.z() * (twoParticlesNeeded ? float_X(0.5) : float_X(1.0)));
+
+                /* detect if there is a second virtual particle */
+                if(twoParticlesNeeded)
+                {
+                    /* calculate positions for the second virtual particle (relay point -> end point) */
+                    for(uint32_t d = 0; d < simDim; ++d)
+                    {
+                        /* switched start and end point */
+                        line.m_pos1[d] = calc_InCellPos(posEnd[d], I[1][d]);
+                        line.m_pos0[d] = calc_InCellPos(relayPoint[d], I[1][d]);
+                    }
+                    deposit(acc, dataBoxJ.shift(I[1]).toCursor(), line, chargeDensity, velocity.z() * float_X(0.5));
+                }
+            }
 
-        /*note: all positions are normalized to the grid*/
-        const floatD_X posStart( posEnd - deltaPos );
+            static pmacc::traits::StringProperty getStringProperties()
+            {
+                pmacc::traits::StringProperty propList("name", "EmZ");
+                return propList;
+            }
 
-        DataSpace<simDim> I[2];
-        floatD_X relayPoint;
 
-        /* calculate the relay point for the trajectory splitting */
-        for ( uint32_t d = 0; d < simDim; ++d )
-        {
-            constexpr bool isSupportEven = ( supp % 2 == 0 );
-            relayPoint[d] = RelayPoint< isSupportEven >()(
-                I[0][d],
-                I[1][d],
-                posStart[d],
-                posEnd[d]
-            );
-        }
-
-        Line< floatD_X > line;
-        const float_X chargeDensity = charge / CELL_VOLUME;
-
-        /* Esirkepov implementation for the current deposition */
-        emz::DepositCurrent<
-            ParticleAssign,
-            begin,
-            end
-        > deposit;
-
-        /* calculate positions for the second virtual particle */
-        for (uint32_t d = 0; d < simDim; ++d)
-        {
-            line.m_pos0[d] = calc_InCellPos(
-                posStart[d],
-                I[0][d]
-            );
-            line.m_pos1[d] = calc_InCellPos(
-                relayPoint[d],
-                I[0][d]
-            );
-        }
-
-        const bool twoParticlesNeeded = I[0] != I[1];
-
-        deposit(
-            acc,
-            dataBoxJ.shift( I[0] ).toCursor(),
-            line,
-            chargeDensity,
-            velocity.z() * ( twoParticlesNeeded ? float_X(0.5) : float_X(1.0) )
-        );
-
-        /* detect if there is a second virtual particle */
-        if( twoParticlesNeeded )
-        {
-            /* calculate positions for the second virtual particle */
-            for (uint32_t d = 0; d < simDim; ++d)
+            /** get normalized in cell particle position
+             *
+             * @param x position of the particle
+             * @param i shift of grid (only integral positions are allowed)
+             * @return in cell position
+             */
+            DINLINE float_X calc_InCellPos(const float_X x, const float_X i) const
             {
-                /* switched start and end point */
-                line.m_pos1[d] = calc_InCellPos(
-                    posEnd[d],
-                    I[1][d]
-                );
-                line.m_pos0[d] = calc_InCellPos(
-                    relayPoint[d],
-                    I[1][d]
-                );
+                return x - i;
             }
-            deposit(
-                acc,
-                dataBoxJ.shift( I[1] ).toCursor(),
-                line,
-                chargeDensity,
-                velocity.z() * float_X(0.5)
-            );
-        }
-    }
-
-    static pmacc::traits::StringProperty
-    getStringProperties()
-    {
-        pmacc::traits::StringProperty propList( "name", "EmZ" );
-        return propList;
-    }
-
-
-    /** get normalized in cell particle position
-     *
-     * @param x position of the particle
-     * @param i shift of grid (only integral positions are allowed)
-     * @return in cell position
-     */
-    DINLINE float_X
-    calc_InCellPos(
-        const float_X x,
-        const float_X i
-    ) const
-    {
-        return x - i;
-    }
-};
+        };
 
-} //namespace currentSolver
+    } // namespace currentSolver
 
-} //namespace picongpu
+} // namespace picongpu
diff --git a/include/picongpu/fields/currentDeposition/Esirkepov/Esirkepov.def b/include/picongpu/fields/currentDeposition/Esirkepov/Esirkepov.def
index 585d65842c..26d312976f 100644
--- a/include/picongpu/fields/currentDeposition/Esirkepov/Esirkepov.def
+++ b/include/picongpu/fields/currentDeposition/Esirkepov/Esirkepov.def
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Heiko Burau, Rene Widera
+/* Copyright 2013-2021 Axel Huebl, Heiko Burau, Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -17,54 +17,79 @@
  * If not, see <http://www.gnu.org/licenses/>.
  */
 
-
 #pragma once
 
 #include "picongpu/simulation_defines.hpp"
+#include "picongpu/fields/currentDeposition/Strategy.def"
 
 
 namespace picongpu
 {
-namespace currentSolver
-{
-using namespace pmacc;
+    namespace currentSolver
+    {
+        /**
+         * Implements the current deposition algorithm from T.Zh. Esirkepov
+         * for an arbitrary particle assignment function given as a template parameter.
+         *
+         * See available shapes at "intermediateLib/particleShape".
+         * Paper: "Exact charge conservation scheme for Particle-in-Cell simulation
+         *  with an arbitrary form-factor"
+         *
+         * @tparam T_ParticleShape the particle shape for the species, [picongpu::particles::shapes]
+         * @tparam T_Strategy strategy used to reduce the scattered data [currentSolver::strategy]
+         * @tparam T_dim number of dimensions the implementation is for (2D or 3D), defaults to simDim
+         */
+        template<
+            typename T_ParticleShape,
+            typename T_Strategy = traits::GetDefaultStrategy_t<>,
+            uint32_t T_dim = simDim>
+        struct Esirkepov;
 
+        /** Paper-like implementation of the Esirkepov current deposition
+         *
+         * The implementation uses a non-optimized stencil width and is therefore over
+         * 4x slower than the other Esirkepov implementation.
+         * @attention this solver is only for testing
+         *
+         * @tparam T_ParticleShape the particle shape for the species, [picongpu::particles::shapes]
+         * @tparam T_Strategy strategy used to reduce the scattered data [currentSolver::strategy]
+         */
+        template<typename T_ParticleShape, typename T_Strategy = traits::GetDefaultStrategy_t<>>
+        struct EsirkepovNative;
 
-/**
- * Implements the current deposition algorithm from T.Zh. Esirkepov
- *
- * for an arbitrary particle assign function given as a template parameter.
- * See available shapes at "intermediateLib/particleShape".
- * paper: "Exact charge conservation scheme for Particle-in-Cell simulation
- *  with an arbitrary form-factor"
- *
- * \tparam T_ParticleShape the particle shape for the species, \see picongpu::particles::shapes
- * \tparam T_Dim Implementation for 2D or 3D
- */
-template<typename ParticleShape,uint32_t T_dim=simDim>
-struct Esirkepov;
+        namespace traits
+        {
+            template<typename T_ParticleShape, typename T_Strategy, uint32_t T_dim>
+            struct GetStrategy<Esirkepov<T_ParticleShape, T_Strategy, T_dim>>
+            {
+                using type = T_Strategy;
+            };
 
-template<typename ParticleAssign>
-struct EsirkepovNative;
+            template<typename T_ParticleShape, typename T_Strategy>
+            struct GetStrategy<EsirkepovNative<T_ParticleShape, T_Strategy>>
+            {
+                using type = T_Strategy;
+            };
 
-} //namespace currentSolver
+        } // namespace traits
+    } // namespace currentSolver
 
-namespace traits
-{
+    namespace traits
+    {
+        /* Get the margins of a solver
+         * the solver class must define LowerMargin and UpperMargin
+         */
+        template<typename T_ParticleShape, typename T_Strategy, uint32_t T_dim>
+        struct GetMargin<picongpu::currentSolver::Esirkepov<T_ParticleShape, T_Strategy, T_dim>>
+        {
+        private:
+            using Solver = picongpu::currentSolver::Esirkepov<T_ParticleShape, T_Strategy, T_dim>;
 
-/*Get margin of a solver
- * class must define a LowerMargin and UpperMargin
- */
-template<typename ParticleShape,uint32_t T_dim>
-struct GetMargin<picongpu::currentSolver::Esirkepov<ParticleShape,T_dim> >
-{
-private:
-    typedef picongpu::currentSolver::Esirkepov<ParticleShape,T_dim> Solver;
-public:
-    typedef typename Solver::LowerMargin LowerMargin;
-    typedef typename Solver::UpperMargin UpperMargin;
-};
+        public:
+            typedef typename Solver::LowerMargin LowerMargin;
+            typedef typename Solver::UpperMargin UpperMargin;
+        };
 
-} //namespace traits
+    } // namespace traits
 
-} //namespace picongpu
+} // namespace picongpu
diff --git a/include/picongpu/fields/currentDeposition/Esirkepov/Esirkepov.hpp b/include/picongpu/fields/currentDeposition/Esirkepov/Esirkepov.hpp
index 63d220d226..8e28e0427f 100644
--- a/include/picongpu/fields/currentDeposition/Esirkepov/Esirkepov.hpp
+++ b/include/picongpu/fields/currentDeposition/Esirkepov/Esirkepov.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Heiko Burau, Rene Widera
+/* Copyright 2013-2021 Axel Huebl, Heiko Burau, Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -24,7 +24,6 @@
 #include <pmacc/cuSTL/cursor/Cursor.hpp>
 #include <pmacc/cuSTL/cursor/tools/twistVectorFieldAxes.hpp>
 #include <pmacc/cuSTL/cursor/compile-time/SafeCursor.hpp>
-#include <pmacc/nvidia/atomic.hpp>
 
 #include "picongpu/fields/currentDeposition/Esirkepov/Esirkepov.def"
 #include "picongpu/fields/currentDeposition/Esirkepov/Line.hpp"
@@ -33,252 +32,216 @@
 
 namespace picongpu
 {
-namespace currentSolver
-{
-using namespace pmacc;
-
-template<typename T_ParticleShape>
-struct Esirkepov<T_ParticleShape, DIM3>
-{
-    using ParticleAssign = typename T_ParticleShape::ChargeAssignment;
-    static constexpr int supp = ParticleAssign::support;
-
-    static constexpr int currentLowerMargin = supp / 2 + 1 - (supp + 1) % 2;
-    static constexpr int currentUpperMargin = (supp + 1) / 2 + 1;
-    typedef pmacc::math::CT::Int<currentLowerMargin, currentLowerMargin, currentLowerMargin> LowerMargin;
-    typedef pmacc::math::CT::Int<currentUpperMargin, currentUpperMargin, currentUpperMargin> UpperMargin;
-
-    PMACC_CASSERT_MSG(
-        __Esirkepov_supercell_or_number_of_guard_supercells_is_too_small_for_stencil,
-        pmacc::math::CT::min<
-            typename pmacc::math::CT::mul<
-                SuperCellSize,
-                GuardSize
-            >::type
-        >::type::value >= currentLowerMargin &&
-        pmacc::math::CT::min<
-            typename pmacc::math::CT::mul<
-                SuperCellSize,
-                GuardSize
-            >::type
-        >::type::value >= currentUpperMargin
-    );
-
-    float_X charge;
-
-    /* At the moment Esirkepov only supports Yee cells where W is defined at origin (0,0,0)
-     *
-     * \todo: please fix me that we can use CenteredCell
-     */
-    template<
-        typename DataBoxJ,
-        typename PosType,
-        typename VelType,
-        typename ChargeType,
-        typename T_Acc
-    >
-    DINLINE void operator()(
-        T_Acc const & acc,
-        DataBoxJ dataBoxJ,
-        const PosType pos,
-        const VelType velocity,
-        const ChargeType charge,
-        const float_X deltaTime
-    )
+    namespace currentSolver
     {
-        this->charge = charge;
-        const float3_X deltaPos = float3_X(velocity.x() * deltaTime / cellSize.x(),
-                                           velocity.y() * deltaTime / cellSize.y(),
-                                           velocity.z() * deltaTime / cellSize.z());
-        const PosType oldPos = pos - deltaPos;
-        Line<float3_X> line(oldPos, pos);
-
-        DataSpace<DIM3> gridShift;
-
-        /* Define in which direction the particle leaves the cell.
-         * It is not relevant whether the particle leaves the cell via
-         * the positive or negative cell border.
-         *
-         * 0 == stay in cell
-         * 1 == leave cell
-         */
-        DataSpace<simDim> leaveCell;
-
-        /* calculate the offset for the virtual coordinate system */
-        for(int d=0; d<simDim; ++d)
+        template<typename T_ParticleShape, typename T_Strategy>
+        struct Esirkepov<T_ParticleShape, T_Strategy, DIM3>
         {
-            int iStart;
-            int iEnd;
-            constexpr bool isSupportEven = ( supp % 2 == 0 );
-            RelayPoint< isSupportEven >()(
-                iStart,
-                iEnd,
-                line.m_pos0[d],
-                line.m_pos1[d]
-            );
-            gridShift[d] = iStart < iEnd ? iStart : iEnd; // integer min function
-            /* particle is leaving the cell */
-            leaveCell[d] = iStart != iEnd ? 1 : 0;
-            /* shift the particle position to the virtual coordinate system */
-            line.m_pos0[d] -= gridShift[d];
-            line.m_pos1[d] -= gridShift[d];
-        }
-        /* shift current field to the virtual coordinate system */
-        auto cursorJ = dataBoxJ.shift(gridShift).toCursor();
-        /**
-         * \brief the following three calls separate the 3D current deposition
-         * into three independent 1D calls, each for one direction and current component.
-         * Therefore the coordinate system has to be rotated so that the z-direction
-         * is always specific.
-         */
-        using namespace cursor::tools;
-        cptCurrent1D(
-            acc,
-            DataSpace<simDim>(leaveCell.y(),leaveCell.z(),leaveCell.x()),
-            twistVectorFieldAxes<pmacc::math::CT::Int < 1, 2, 0 > >(cursorJ),
-            rotateOrigin < 1, 2, 0 > (line),
-            cellSize.x()
-        );
-        cptCurrent1D(
-            acc,
-            DataSpace<simDim>(leaveCell.z(),leaveCell.x(),leaveCell.y()),
-            twistVectorFieldAxes<pmacc::math::CT::Int < 2, 0, 1 > >(cursorJ),
-            rotateOrigin < 2, 0, 1 > (line),
-            cellSize.y()
-        );
-        cptCurrent1D(
-            acc,
-            leaveCell,
-            cursorJ,
-            line,
-            cellSize.z()
-        );
-    }
-
-    /**
-     * deposites current in z-direction
-     *
-     * \param leaveCell vector with information if the particle is leaving the cell
-     *         (for each direction, 0 means stays in cell and 1 means leaves cell)
-     * \param cursorJ cursor pointing at the current density field of the particle's cell
-     * \param line trajectory of the particle from to last to the current time step
-     * \param cellEdgeLength length of edge of the cell in z-direction
-     */
-    template<
-        typename CursorJ,
-        typename T_Acc
-    >
-    DINLINE void cptCurrent1D(
-        T_Acc const & acc,
-        const DataSpace<simDim>& leaveCell,
-        CursorJ cursorJ,
-        const Line<float3_X>& line,
-        const float_X cellEdgeLength
-    )
-    {
-        /* skip calculation if the particle is not moving in z direction */
-        if(line.m_pos0[2] == line.m_pos1[2])
-            return;
-
-        constexpr int begin = -currentLowerMargin + 1;
-        constexpr int end = begin + supp;
-
-        /* We multiply with `cellEdgeLength` due to the fact that the attribute for the
-         * in-cell particle `position` (and it's change in DELTA_T) is normalize to [0,1)
-         */
-        const float_X currentSurfaceDensity = this->charge * (float_X(1.0) / float_X(CELL_VOLUME * DELTA_T)) * cellEdgeLength;
+            using ParticleAssign = typename T_ParticleShape::ChargeAssignment;
+            static constexpr int supp = ParticleAssign::support;
+
+            static constexpr int currentLowerMargin = supp / 2 + 1 - (supp + 1) % 2;
+            static constexpr int currentUpperMargin = (supp + 1) / 2 + 1;
+            typedef pmacc::math::CT::Int<currentLowerMargin, currentLowerMargin, currentLowerMargin> LowerMargin;
+            typedef pmacc::math::CT::Int<currentUpperMargin, currentUpperMargin, currentUpperMargin> UpperMargin;
+
+            PMACC_CASSERT_MSG(
+                __Esirkepov_supercell_or_number_of_guard_supercells_is_too_small_for_stencil,
+                pmacc::math::CT::min<typename pmacc::math::CT::mul<SuperCellSize, GuardSize>::type>::type::value
+                        >= currentLowerMargin
+                    && pmacc::math::CT::min<typename pmacc::math::CT::mul<SuperCellSize, GuardSize>::type>::type::value
+                        >= currentUpperMargin);
+
+            float_X charge;
+
+            /* Deposit the current of a particle via three 1D depositions, one per current component.
+             * At the moment Esirkepov only supports Yee cells where W is defined at origin (0,0,0)
+             * \todo: fix this so that CenteredCell can be used
+             */
+            template<typename DataBoxJ, typename PosType, typename VelType, typename ChargeType, typename T_Acc>
+            DINLINE void operator()(
+                T_Acc const& acc,
+                DataBoxJ dataBoxJ,
+                const PosType pos,
+                const VelType velocity,
+                const ChargeType charge,
+                const float_X deltaTime)
+            {
+                this->charge = charge;
+                const float3_X deltaPos = float3_X(
+                    velocity.x() * deltaTime / cellSize.x(),
+                    velocity.y() * deltaTime / cellSize.y(),
+                    velocity.z() * deltaTime / cellSize.z());
+                const PosType oldPos = pos - deltaPos;
+                Line<float3_X> line(oldPos, pos);
+
+                DataSpace<DIM3> gridShift;
+
+                /* Define in which direction the particle leaves the cell.
+                 * It is not relevant whether the particle leaves the cell via
+                 * the positive or negative cell border.
+                 *
+                 * 0 == stay in cell
+                 * 1 == leave cell
+                 */
+                DataSpace<simDim> leaveCell;
+
+                /* calculate the offset for the virtual coordinate system */
+                for(int d = 0; d < simDim; ++d)
+                {
+                    int iStart;
+                    int iEnd;
+                    constexpr bool isSupportEven = (supp % 2 == 0);
+                    RelayPoint<isSupportEven>()(iStart, iEnd, line.m_pos0[d], line.m_pos1[d]);
+                    gridShift[d] = iStart < iEnd ? iStart : iEnd; // integer min function
+                    /* the particle leaves the cell if start and end cell index differ */
+                    leaveCell[d] = iStart != iEnd ? 1 : 0;
+                    /* shift the particle position to the virtual coordinate system */
+                    line.m_pos0[d] -= gridShift[d];
+                    line.m_pos1[d] -= gridShift[d];
+                }
+                /* shift current field to the virtual coordinate system */
+                auto cursorJ = dataBoxJ.shift(gridShift).toCursor();
+                /**
+                 * \brief the following three calls separate the 3D current deposition
+                 * into three independent 1D calls, each for one direction and current component.
+                 * Therefore the coordinate system is rotated for the x and y components so that
+                 * the deposited component is always the z-direction seen by cptCurrent1D.
+                 */
+                using namespace cursor::tools;
+                cptCurrent1D(
+                    acc,
+                    DataSpace<simDim>(leaveCell.y(), leaveCell.z(), leaveCell.x()),
+                    twistVectorFieldAxes<pmacc::math::CT::Int<1, 2, 0>>(cursorJ),
+                    rotateOrigin<1, 2, 0>(line),
+                    cellSize.x());
+                cptCurrent1D(
+                    acc,
+                    DataSpace<simDim>(leaveCell.z(), leaveCell.x(), leaveCell.y()),
+                    twistVectorFieldAxes<pmacc::math::CT::Int<2, 0, 1>>(cursorJ),
+                    rotateOrigin<2, 0, 1>(line),
+                    cellSize.y());
+                cptCurrent1D(acc, leaveCell, cursorJ, line, cellSize.z());
+            }
 
-        /* pick every cell in the xy-plane that is overlapped by particle's
-         * form factor and deposit the current for the cells above and beneath
-         * that cell and for the cell itself.
-         *
-         * for loop optimization (help the compiler to generate better code):
-         *   - use a loop with a static range
-         *   - skip invalid indexes with a if condition around the full loop body
-         *     ( this helps the compiler to mask threads without work )
-         */
-        for( int i = begin ; i < end  + 1; ++i )
-            if( i < end + leaveCell[0] )
+            /**
+             * deposits current in z-direction
+             *
+             * \param leaveCell vector with information if the particle is leaving the cell
+             *         (for each direction, 0 means stays in cell and 1 means leaves cell)
+             * \param cursorJ cursor pointing at the current density field of the particle's cell
+             * \param line trajectory of the particle from the last to the current time step
+             * \param cellEdgeLength length of edge of the cell in z-direction
+             */
+            template<typename CursorJ, typename T_Acc>
+            DINLINE void cptCurrent1D(
+                T_Acc const& acc,
+                const DataSpace<simDim>& leaveCell,
+                CursorJ cursorJ,
+                const Line<float3_X>& line,
+                const float_X cellEdgeLength)
             {
-                const float_X s0i = S0( line, i, 0 );
-                const float_X dsi = S1( line, i, 0 ) - s0i;
-                for( int j = begin ; j < end  + 1; ++j )
-                    if( j < end + leaveCell[1] )
+                /* skip calculation if the particle is not moving in z direction */
+                if(line.m_pos0[2] == line.m_pos1[2])
+                    return;
+
+                constexpr int begin = -currentLowerMargin + 1;
+                constexpr int end = begin + supp;
+
+                /* We multiply with `cellEdgeLength` due to the fact that the attribute for the
+                 * in-cell particle `position` (and its change in DELTA_T) is normalized to [0,1)
+                 */
+                const float_X currentSurfaceDensity
+                    = this->charge * (float_X(1.0) / float_X(CELL_VOLUME * DELTA_T)) * cellEdgeLength;
+
+                /* pick every cell in the xy-plane that is overlapped by particle's
+                 * form factor and deposit the current for the cells above and beneath
+                 * that cell and for the cell itself.
+                 *
+                 * for loop optimization (help the compiler to generate better code):
+                 *   - use a loop with a static range
+                 *   - skip invalid indexes with an if condition around the full loop body
+                 *     ( this helps the compiler to mask threads without work )
+                 */
+                for(int i = begin; i < end + 1; ++i)
+                    if(i < end + leaveCell[0])
                     {
-                        const float_X s0j = S0( line, j, 1 );
-                        const float_X dsj = S1( line, j, 1 ) - s0j;
+                        const float_X s0i = S0(line, i, 0);
+                        const float_X dsi = S1(line, i, 0) - s0i;
+                        for(int j = begin; j < end + 1; ++j)
+                            if(j < end + leaveCell[1])
+                            {
+                                const float_X s0j = S0(line, j, 1);
+                                const float_X dsj = S1(line, j, 1) - s0j;
 
-                        float_X tmp =
-                            -currentSurfaceDensity * (
-                                s0i * s0j +
-                                float_X( 0.5 ) * ( dsi * s0j + s0i * dsj ) +
-                                ( float_X( 1.0 ) / float_X( 3.0 ) ) * dsj * dsi
-                            );
+                                float_X tmp = -currentSurfaceDensity
+                                    * (s0i * s0j + float_X(0.5) * (dsi * s0j + s0i * dsj)
+                                       + (float_X(1.0) / float_X(3.0)) * dsj * dsi);
 
-                        float_X accumulated_J = float_X( 0.0 );
+                                float_X accumulated_J = float_X(0.0);
 
-                        /* attention: inner loop has no upper bound `end + 1` because
-                         * the current for the point `end` is always zero,
-                         * therefore we skip the calculation
-                         */
-                        for( int k = begin ; k < end; ++k )
-                            if( k < end + leaveCell[2] - 1 )
-                            {
-                                /* This is the implementation of the FORTRAN W(i,j,k,3)/ C style W(i,j,k,2) version from
-                                 * Esirkepov paper. All coordinates are rotated before thus we can
-                                 * always use C style W(i,j,k,2).
+                                /* attention: inner loop has no upper bound `end + 1` because
+                                 * the current for the point `end` is always zero,
+                                 * therefore we skip the calculation
                                  */
-                                const float_X W = DS( line, k, 2 ) * tmp;
-                                accumulated_J += W;
-                                atomicAdd( &( ( *cursorJ( i, j, k ) ).z() ), accumulated_J, ::alpaka::hierarchy::Threads{} );
+                                for(int k = begin; k < end; ++k)
+                                    if(k < end + leaveCell[2] - 1)
+                                    {
+                                        /* This is the implementation of the FORTRAN W(i,j,k,3)/ C style W(i,j,k,2)
+                                         * version from Esirkepov paper. All coordinates are rotated before thus we can
+                                         * always use C style W(i,j,k,2).
+                                         */
+                                        const float_X W = DS(line, k, 2) * tmp;
+                                        accumulated_J += W;
+                                        auto const atomicOp = typename T_Strategy::BlockReductionOp{};
+                                        atomicOp(acc, (*cursorJ(i, j, k)).z(), accumulated_J);
+                                    }
                             }
                     }
             }
 
-    }
-
-    /** calculate S0 (see paper)
-     * @param line element with previous and current position of the particle
-     * @param gridPoint used grid point to evaluate assignment shape
-     * @param d dimension range {0,1,2} means {x,y,z}
-     *          different to Esirkepov paper, here we use C style
-     */
-    DINLINE float_X S0(const Line<float3_X>& line, const float_X gridPoint, const uint32_t d)
-    {
-        return ParticleAssign()(gridPoint - line.m_pos0[d]);
-    }
+            /** calculate S0 (see paper)
+             * @param line element with previous and current position of the particle
+             * @param gridPoint used grid point to evaluate assignment shape
+             * @param d dimension range {0,1,2} means {x,y,z}
+             *          different to Esirkepov paper, here we use C style
+             */
+            DINLINE float_X S0(const Line<float3_X>& line, const float_X gridPoint, const uint32_t d)
+            {
+                return ParticleAssign()(gridPoint - line.m_pos0[d]);
+            }
 
-   /** calculate S1 (see paper)
-     * @param line element with previous and current position of the particle
-     * @param gridPoint used grid point to evaluate assignment shape
-     * @param d dimension range {0,1,2} means {x,y,z}
-     *          different to Esirkepov paper, here we use C style
-     */
-    DINLINE float_X S1(const Line<float3_X>& line, const float_X gridPoint, const uint32_t d)
-    {
-        return ParticleAssign()(gridPoint - line.m_pos1[d]);
-    }
+            /** calculate S1 (see paper)
+             * @param line element with previous and current position of the particle
+             * @param gridPoint used grid point to evaluate assignment shape
+             * @param d dimension range {0,1,2} means {x,y,z}
+             *          different to Esirkepov paper, here we use C style
+             */
+            DINLINE float_X S1(const Line<float3_X>& line, const float_X gridPoint, const uint32_t d)
+            {
+                return ParticleAssign()(gridPoint - line.m_pos1[d]);
+            }
 
-    /** calculate DS (see paper)
-     * @param line element with previous and current position of the particle
-     * @param gridPoint used grid point to evaluate assignment shape
-     * @param d dimension range {0,1,2} means {x,y,z}]
-     *          different to Esirkepov paper, here we use C style
-     */
-    DINLINE float_X DS(const Line<float3_X>& line, const float_X gridPoint, const uint32_t d)
-    {
-        return ParticleAssign()(gridPoint - line.m_pos1[d]) - ParticleAssign()(gridPoint - line.m_pos0[d]);
-    }
+            /** calculate DS (see paper)
+             * @param line element with previous and current position of the particle
+             * @param gridPoint used grid point to evaluate assignment shape
+             * @param d dimension range {0,1,2} means {x,y,z}
+             *          different to Esirkepov paper, here we use C style
+             */
+            DINLINE float_X DS(const Line<float3_X>& line, const float_X gridPoint, const uint32_t d)
+            {
+                return ParticleAssign()(gridPoint - line.m_pos1[d]) - ParticleAssign()(gridPoint - line.m_pos0[d]);
+            }
 
-    static pmacc::traits::StringProperty getStringProperties()
-    {
-        pmacc::traits::StringProperty propList( "name", "Esirkepov" );
-        return propList;
-    }
-};
+            static pmacc::traits::StringProperty getStringProperties()
+            {
+                pmacc::traits::StringProperty propList("name", "Esirkepov");
+                return propList;
+            }
+        };
 
-} //namespace currentSolver
+    } // namespace currentSolver
 
-} //namespace picongpu
+} // namespace picongpu
 
 #include "picongpu/fields/currentDeposition/Esirkepov/Esirkepov2D.hpp"
diff --git a/include/picongpu/fields/currentDeposition/Esirkepov/Esirkepov2D.hpp b/include/picongpu/fields/currentDeposition/Esirkepov/Esirkepov2D.hpp
index 9391726dab..195f201c4a 100644
--- a/include/picongpu/fields/currentDeposition/Esirkepov/Esirkepov2D.hpp
+++ b/include/picongpu/fields/currentDeposition/Esirkepov/Esirkepov2D.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2014-2020 Axel Huebl, Heiko Burau, Rene Widera
+/* Copyright 2014-2021 Axel Huebl, Heiko Burau, Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -23,7 +23,6 @@
 #include <pmacc/cuSTL/cursor/Cursor.hpp>
 #include <pmacc/cuSTL/cursor/tools/twistVectorFieldAxes.hpp>
 #include <pmacc/cuSTL/cursor/compile-time/SafeCursor.hpp>
-#include <pmacc/nvidia/atomic.hpp>
 
 #include "picongpu/fields/currentDeposition/Esirkepov/Esirkepov.hpp"
 #include "picongpu/fields/currentDeposition/Esirkepov/Line.hpp"
@@ -32,283 +31,233 @@
 
 namespace picongpu
 {
-namespace currentSolver
-{
-using namespace pmacc;
-
-/**
- * Implements the current deposition algorithm from T.Zh. Esirkepov
- *
- * for an arbitrary particle assign function given as a template parameter.
- * See available shapes at "intermediateLib/particleShape".
- * paper: "Exact charge conservation scheme for Particle-in-Cell simulation
- *  with an arbitrary form-factor"
- */
-template<typename T_ParticleShape>
-struct Esirkepov<T_ParticleShape, DIM2>
-{
-    using ParticleAssign = typename T_ParticleShape::ChargeAssignment;
-    static constexpr int supp = ParticleAssign::support;
-
-    static constexpr int currentLowerMargin = supp / 2 + 1 - (supp + 1) % 2;
-    static constexpr int currentUpperMargin = (supp + 1) / 2 + 1;
-    typedef typename pmacc::math::CT::make_Int<DIM2, currentLowerMargin>::type LowerMargin;
-    typedef typename pmacc::math::CT::make_Int<DIM2, currentUpperMargin>::type UpperMargin;
-
-    PMACC_CASSERT_MSG(
-        __Esirkepov2D_supercell_or_number_of_guard_supercells_is_too_small_for_stencil,
-        pmacc::math::CT::min<
-            typename pmacc::math::CT::mul<
-                SuperCellSize,
-                GuardSize
-            >::type
-        >::type::value >= currentLowerMargin &&
-        pmacc::math::CT::min<
-            typename pmacc::math::CT::mul<
-                SuperCellSize,
-                GuardSize
-            >::type
-        >::type::value >= currentUpperMargin
-    );
-
-    static constexpr int begin = -currentLowerMargin + 1;
-    static constexpr int end = begin + supp;
-
-    float_X charge;
-
-    template<
-        typename DataBoxJ,
-        typename PosType,
-        typename VelType,
-        typename ChargeType,
-        typename T_Acc
-    >
-    DINLINE void operator()(
-        T_Acc const & acc,
-        DataBoxJ dataBoxJ,
-        const PosType pos,
-        const VelType velocity,
-        const ChargeType charge,
-        const float_X deltaTime
-    )
+    namespace currentSolver
     {
-        this->charge = charge;
-        const float2_X deltaPos = float2_X(velocity.x() * deltaTime / cellSize.x(),
-                                           velocity.y() * deltaTime / cellSize.y());
-        const PosType oldPos = pos - deltaPos;
-        Line<float2_X> line(oldPos, pos);
-
-        DataSpace<simDim> gridShift;
-        /* Define in which direction the particle leaves the cell.
-         * It is not important whether the particle move over the positive or negative
-         * cell border.
+        /**
+         * Implements the current deposition algorithm from T.Zh. Esirkepov
          *
-         * 0 == stay in cell
-         * 1 == leave cell
+         * for an arbitrary particle assign function given as a template parameter.
+         * See available shapes at "intermediateLib/particleShape".
+         * paper: "Exact charge conservation scheme for Particle-in-Cell simulation
+         *  with an arbitrary form-factor"
          */
-        DataSpace<DIM2> leaveCell;
-
-        /* calculate the offset for the virtual coordinate system */
-        for(int d=0; d<simDim; ++d)
+        template<typename T_ParticleShape, typename T_Strategy>
+        struct Esirkepov<T_ParticleShape, T_Strategy, DIM2>
         {
-            int iStart;
-            int iEnd;
-            constexpr bool isSupportEven = ( supp % 2 == 0 );
-            RelayPoint< isSupportEven >()(
-                iStart,
-                iEnd,
-                line.m_pos0[d],
-                line.m_pos1[d]
-            );
-            gridShift[d] = iStart < iEnd ? iStart : iEnd; // integer min function
-            /* particle is leaving the cell */
-            leaveCell[d] = iStart != iEnd ? 1 : 0;
-            /* shift the particle position to the virtual coordinate system */
-            line.m_pos0[d] -= gridShift[d];
-            line.m_pos1[d] -= gridShift[d];
-        }
-        /* shift current field to the virtual coordinate system */
-        auto cursorJ = dataBoxJ.shift(gridShift).toCursor();
-
-        /**
-         * \brief the following three calls separate the 3D current deposition
-         * into three independent 1D calls, each for one direction and current component.
-         * Therefore the coordinate system has to be rotated so that the z-direction
-         * is always specific.
-         */
-
-        using namespace cursor::tools;
-        cptCurrent1D(
-            acc,
-            leaveCell,
-            cursorJ,
-            line,
-            cellSize.x()
-        );
-        cptCurrent1D(
-            acc,
-            DataSpace<DIM2>(
-                leaveCell[1],
-                leaveCell[0]
-            ),
-            twistVectorFieldAxes<pmacc::math::CT::Int < 1, 0 > >(cursorJ),
-            rotateOrigin < 1, 0 > (line),
-            cellSize.y()
-        );
-        cptCurrentZ(
-            acc,
-            leaveCell,
-            cursorJ,
-            line,
-            velocity.z()
-        );
-    }
-
-    /**
-     * deposites current in z-direction
-     * \param leaveCell vector with information if the particle is leaving the cell
-     *         (for each direction, 0 means stays in cell and 1 means leaves cell)
-     * \param cursorJ cursor pointing at the current density field of the particle's cell
-     * \param line trajectory of the particle from to last to the current time step
-     * \param cellEdgeLength length of edge of the cell in z-direction
-     *
-     * @{
-     */
-    template<
-        typename CursorJ,
-        typename T_Acc
-    >
-    DINLINE void cptCurrent1D(
-        T_Acc const & acc,
-        const DataSpace<simDim>& leaveCell,
-        CursorJ cursorJ,
-        const Line<float2_X>& line,
-        const float_X cellEdgeLength
-    )
-    {
-        /* skip calculation if the particle is not moving in x direction */
-        if(line.m_pos0[0] == line.m_pos1[0])
-            return;
+            using ParticleAssign = typename T_ParticleShape::ChargeAssignment;
+            static constexpr int supp = ParticleAssign::support;
+
+            static constexpr int currentLowerMargin = supp / 2 + 1 - (supp + 1) % 2;
+            static constexpr int currentUpperMargin = (supp + 1) / 2 + 1;
+            typedef typename pmacc::math::CT::make_Int<DIM2, currentLowerMargin>::type LowerMargin;
+            typedef typename pmacc::math::CT::make_Int<DIM2, currentUpperMargin>::type UpperMargin;
+
+            PMACC_CASSERT_MSG(
+                __Esirkepov2D_supercell_or_number_of_guard_supercells_is_too_small_for_stencil,
+                pmacc::math::CT::min<typename pmacc::math::CT::mul<SuperCellSize, GuardSize>::type>::type::value
+                        >= currentLowerMargin
+                    && pmacc::math::CT::min<typename pmacc::math::CT::mul<SuperCellSize, GuardSize>::type>::type::value
+                        >= currentUpperMargin);
+
+            static constexpr int begin = -currentLowerMargin + 1;
+            static constexpr int end = begin + supp;
+
+            float_X charge;
+
+            template<typename DataBoxJ, typename PosType, typename VelType, typename ChargeType, typename T_Acc>
+            DINLINE void operator()(
+                T_Acc const& acc,
+                DataBoxJ dataBoxJ,
+                const PosType pos,
+                const VelType velocity,
+                const ChargeType charge,
+                const float_X deltaTime)
+            {
+                this->charge = charge;
+                const float2_X deltaPos
+                    = float2_X(velocity.x() * deltaTime / cellSize.x(), velocity.y() * deltaTime / cellSize.y());
+                const PosType oldPos = pos - deltaPos;
+                Line<float2_X> line(oldPos, pos);
+
+                DataSpace<simDim> gridShift;
+                /* Define in which direction the particle leaves the cell.
+                 * It is not important whether the particle moves over the positive or negative
+                 * cell border.
+                 *
+                 * 0 == stay in cell
+                 * 1 == leave cell
+                 */
+                DataSpace<DIM2> leaveCell;
+
+                /* calculate the offset for the virtual coordinate system */
+                for(int d = 0; d < simDim; ++d)
+                {
+                    int iStart;
+                    int iEnd;
+                    constexpr bool isSupportEven = (supp % 2 == 0);
+                    RelayPoint<isSupportEven>()(iStart, iEnd, line.m_pos0[d], line.m_pos1[d]);
+                    gridShift[d] = iStart < iEnd ? iStart : iEnd; // integer min function
+                    /* particle is leaving the cell */
+                    leaveCell[d] = iStart != iEnd ? 1 : 0;
+                    /* shift the particle position to the virtual coordinate system */
+                    line.m_pos0[d] -= gridShift[d];
+                    line.m_pos1[d] -= gridShift[d];
+                }
+                /* shift current field to the virtual coordinate system */
+                auto cursorJ = dataBoxJ.shift(gridShift).toCursor();
+
+                /**
+                 * \brief the following three calls separate the current deposition into
+                 * three independent calls, each for one current component (x, y and z).
+                 * For the in-plane components the coordinate system is rotated so that the
+                 * deposited direction is always the first axis of the rotated system.
+                 */
 
-        /* We multiply with `cellEdgeLength` due to the fact that the attribute for the
-         * in-cell particle `position` (and it's change in DELTA_T) is normalize to [0,1)
-         */
-        const float_X currentSurfaceDensity = this->charge * ( float_X( 1.0 ) / float_X( CELL_VOLUME * DELTA_T ) ) * cellEdgeLength;
+                using namespace cursor::tools;
+                cptCurrent1D(acc, leaveCell, cursorJ, line, cellSize.x());
+                cptCurrent1D(
+                    acc,
+                    DataSpace<DIM2>(leaveCell[1], leaveCell[0]),
+                    twistVectorFieldAxes<pmacc::math::CT::Int<1, 0>>(cursorJ),
+                    rotateOrigin<1, 0>(line),
+                    cellSize.y());
+                cptCurrentZ(acc, leaveCell, cursorJ, line, velocity.z());
+            }
 
-        for( int j = begin; j < end + 1; ++j )
-            if( j < end + leaveCell[1] )
+            /**
+             * deposits current in z-direction
+             * \param leaveCell vector with information if the particle is leaving the cell
+             *         (for each direction, 0 means stays in cell and 1 means leaves cell)
+             * \param cursorJ cursor pointing at the current density field of the particle's cell
+             * \param line trajectory of the particle from the last to the current time step
+             * \param cellEdgeLength length of edge of the cell in z-direction
+             *
+             * @{
+             */
+            template<typename CursorJ, typename T_Acc>
+            DINLINE void cptCurrent1D(
+                T_Acc const& acc,
+                const DataSpace<simDim>& leaveCell,
+                CursorJ cursorJ,
+                const Line<float2_X>& line,
+                const float_X cellEdgeLength)
             {
-                const float_X s0j = S0( line, j, 1 );
-                const float_X dsj = S1( line, j, 1 ) - s0j;
+                /* skip calculation if the particle is not moving in x direction */
+                if(line.m_pos0[0] == line.m_pos1[0])
+                    return;
 
-                float_X tmp = -currentSurfaceDensity *
-                    (
-                        s0j +
-                        float_X( 0.5 ) * dsj
-                    );
-
-                float_X accumulated_J = float_X(0.0);
-                /* attention: inner loop has no upper bound `end + 1` because
-                 * the current for the point `end` is always zero,
-                 * therefore we skip the calculation
+                /* We multiply with `cellEdgeLength` due to the fact that the attribute for the
+                 * in-cell particle `position` (and its change in DELTA_T) is normalized to [0,1)
                  */
-                for( int i = begin; i < end; ++i )
-                    if( i < end + leaveCell[0] - 1 )
+                const float_X currentSurfaceDensity
+                    = this->charge * (float_X(1.0) / float_X(CELL_VOLUME * DELTA_T)) * cellEdgeLength;
+
+                for(int j = begin; j < end + 1; ++j)
+                    if(j < end + leaveCell[1])
                     {
-                        /* This is the implementation of the FORTRAN W(i,j,k,1)/ C style W(i,j,k,0) version from
-                         * Esirkepov paper. All coordinates are rotated before thus we can
-                         * always use C style W(i,j,k,0).
+                        const float_X s0j = S0(line, j, 1);
+                        const float_X dsj = S1(line, j, 1) - s0j;
+
+                        float_X tmp = -currentSurfaceDensity * (s0j + float_X(0.5) * dsj);
+
+                        float_X accumulated_J = float_X(0.0);
+                        /* attention: inner loop has no upper bound `end + 1` because
+                         * the current for the point `end` is always zero,
+                         * therefore we skip the calculation
                          */
-                        const float_X W = DS( line, i, 0 ) * tmp;
-                        accumulated_J += W;
-                        atomicAdd( &( ( *cursorJ( i, j ) ).x() ), accumulated_J, ::alpaka::hierarchy::Threads{} );
+                        for(int i = begin; i < end; ++i)
+                            if(i < end + leaveCell[0] - 1)
+                            {
+                                /* This is the implementation of the FORTRAN W(i,j,k,1)/ C style W(i,j,k,0) version
+                                 * from Esirkepov paper. All coordinates are rotated before thus we can always use C
+                                 * style W(i,j,k,0).
+                                 */
+                                const float_X W = DS(line, i, 0) * tmp;
+                                accumulated_J += W;
+                                auto const atomicOp = typename T_Strategy::BlockReductionOp{};
+                                atomicOp(acc, (*cursorJ(i, j)).x(), accumulated_J);
+                            }
                     }
             }
 
-    }
-
-    template<
-        typename CursorJ,
-        typename T_Acc
-    >
-    DINLINE void cptCurrentZ(
-        T_Acc const & acc,
-        const DataSpace<simDim>& leaveCell,
-        CursorJ cursorJ,
-        const Line<float2_X>& line,
-        const float_X v_z
-    )
-    {
-        if( v_z == float_X( 0.0 ) )
-            return;
-
-        const float_X currentSurfaceDensityZ = this->charge * ( float_X( 1.0 ) / float_X( CELL_VOLUME ) ) * v_z;
-
-        for( int j = begin; j < end + 1; ++j )
-            if( j < end + leaveCell[1] )
+            template<typename CursorJ, typename T_Acc>
+            DINLINE void cptCurrentZ(
+                T_Acc const& acc,
+                const DataSpace<simDim>& leaveCell,
+                CursorJ cursorJ,
+                const Line<float2_X>& line,
+                const float_X v_z)
             {
-                const float_X s0j = S0( line, j, 1 );
-                const float_X dsj = S1( line, j, 1 ) - s0j;
+                if(v_z == float_X(0.0))
+                    return;
 
-                for( int i = begin; i < end + 1; ++i )
-                    if( i < end + leaveCell[0] )
-                    {
-                        const float_X s0i = S0( line, i, 0 );
-                        const float_X dsi = S1( line, i, 0 ) - s0i;
-                        float_X W = s0i * S0( line, j, 1 ) +
-                            float_X( 0.5 ) * ( dsi * s0j + s0i * dsj ) +
-                            ( float_X( 1.0 ) / float_X( 3.0 ) ) * dsi * dsj;
+                const float_X currentSurfaceDensityZ = this->charge * (float_X(1.0) / float_X(CELL_VOLUME)) * v_z;
 
-                        const float_X j_z = W * currentSurfaceDensityZ;
-                        atomicAdd( &( ( *cursorJ( i, j ) ).z() ), j_z, ::alpaka::hierarchy::Threads{} );
+                for(int j = begin; j < end + 1; ++j)
+                    if(j < end + leaveCell[1])
+                    {
+                        const float_X s0j = S0(line, j, 1);
+                        const float_X dsj = S1(line, j, 1) - s0j;
+
+                        for(int i = begin; i < end + 1; ++i)
+                            if(i < end + leaveCell[0])
+                            {
+                                const float_X s0i = S0(line, i, 0);
+                                const float_X dsi = S1(line, i, 0) - s0i;
+                                float_X W = s0i * S0(line, j, 1) + float_X(0.5) * (dsi * s0j + s0i * dsj)
+                                    + (float_X(1.0) / float_X(3.0)) * dsi * dsj;
+
+                                const float_X j_z = W * currentSurfaceDensityZ;
+                                auto const atomicOp = typename T_Strategy::BlockReductionOp{};
+                                atomicOp(acc, (*cursorJ(i, j)).z(), j_z);
+                            }
                     }
             }
-    }
-
-    /**
-     * @}
-     */
 
-    /** calculate S0 (see paper)
-     * @param line element with previous and current position of the particle
-     * @param gridPoint used grid point to evaluate assignment shape
-     * @param d dimension range {0,1} means {x,y}
-     *          different to Esirkepov paper, here we use C style
-     */
-    DINLINE float_X S0(const Line<float2_X>& line, const float_X gridPoint, const uint32_t d)
-    {
-        return ParticleAssign()(gridPoint - line.m_pos0[d]);
-    }
+            /**
+             * @}
+             */
+
+            /** calculate S0 (see paper)
+             * @param line element with previous and current position of the particle
+             * @param gridPoint used grid point to evaluate assignment shape
+             * @param d dimension range {0,1} means {x,y}
+             *          different to Esirkepov paper, here we use C style
+             */
+            DINLINE float_X S0(const Line<float2_X>& line, const float_X gridPoint, const uint32_t d)
+            {
+                return ParticleAssign()(gridPoint - line.m_pos0[d]);
+            }
 
-    /** calculate S1 (see paper)
-     * @param line element with previous and current position of the particle
-     * @param gridPoint used grid point to evaluate assignment shape
-     * @param d dimension range {0,1,2} means {x,y,z}
-     *          different to Esirkepov paper, here we use C style
-     */
-    DINLINE float_X S1(const Line<float2_X>& line, const float_X gridPoint, const uint32_t d)
-    {
-        return ParticleAssign()(gridPoint - line.m_pos1[d]);
-    }
+            /** calculate S1 (see paper)
+             * @param line element with previous and current position of the particle
+             * @param gridPoint used grid point to evaluate assignment shape
+             * @param d dimension range {0,1} means {x,y}
+             *          different to Esirkepov paper, here we use C style
+             */
+            DINLINE float_X S1(const Line<float2_X>& line, const float_X gridPoint, const uint32_t d)
+            {
+                return ParticleAssign()(gridPoint - line.m_pos1[d]);
+            }
 
-    /** calculate DS (see paper)
-     * @param line element with previous and current position of the particle
-     * @param gridPoint used grid point to evaluate assignment shape
-     * @param d dimension range {0,1} means {x,y}
-     *          different to Esirkepov paper, here we use C style
-     */
-    DINLINE float_X DS(const Line<float2_X>& line, const float_X gridPoint, const uint32_t d)
-    {
-        return ParticleAssign()(gridPoint - line.m_pos1[d]) - ParticleAssign()(gridPoint - line.m_pos0[d]);
-    }
+            /** calculate DS (see paper)
+             * @param line element with previous and current position of the particle
+             * @param gridPoint used grid point to evaluate assignment shape
+             * @param d dimension range {0,1} means {x,y}
+             *          different to Esirkepov paper, here we use C style
+             */
+            DINLINE float_X DS(const Line<float2_X>& line, const float_X gridPoint, const uint32_t d)
+            {
+                return ParticleAssign()(gridPoint - line.m_pos1[d]) - ParticleAssign()(gridPoint - line.m_pos0[d]);
+            }
 
-    static pmacc::traits::StringProperty getStringProperties()
-    {
-        pmacc::traits::StringProperty propList( "name", "Esirkepov" );
-        return propList;
-    }
-};
+            static pmacc::traits::StringProperty getStringProperties()
+            {
+                pmacc::traits::StringProperty propList("name", "Esirkepov");
+                return propList;
+            }
+        };
 
-} //namespace currentSolver
+    } // namespace currentSolver
 
-} //namespace picongpu
+} // namespace picongpu
diff --git a/include/picongpu/fields/currentDeposition/Esirkepov/EsirkepovNative.hpp b/include/picongpu/fields/currentDeposition/Esirkepov/EsirkepovNative.hpp
index 86465f6435..fc9639679e 100644
--- a/include/picongpu/fields/currentDeposition/Esirkepov/EsirkepovNative.hpp
+++ b/include/picongpu/fields/currentDeposition/Esirkepov/EsirkepovNative.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Heiko Burau, Rene Widera
+/* Copyright 2013-2021 Axel Huebl, Heiko Burau, Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -25,191 +25,174 @@
 #include <pmacc/cuSTL/cursor/Cursor.hpp>
 #include <pmacc/cuSTL/cursor/tools/twistVectorFieldAxes.hpp>
 #include <pmacc/cuSTL/cursor/compile-time/SafeCursor.hpp>
-#include <pmacc/nvidia/atomic.hpp>
 
 #include "picongpu/fields/currentDeposition/Esirkepov/Line.hpp"
 
 
 namespace picongpu
 {
-namespace currentSolver
-{
-using namespace pmacc;
-
-/**
- * Implements the current deposition algorithm from T.Zh. Esirkepov
- *
- * for an arbitrary particle assign function given as a template parameter.
- * See available shapes at "intermediateLib/particleShape".
- * paper: "Exact charge conservation scheme for Particle-in-Cell simulation
- *  with an arbitrary form-factor"
- */
-template<typename T_ParticleShape>
-struct EsirkepovNative
-{
-    using ParticleAssign = typename T_ParticleShape::ChargeAssignment;
-    static constexpr int supp = ParticleAssign::support;
-
-    static constexpr int currentLowerMargin = supp / 2 + 1;
-    static constexpr int currentUpperMargin = (supp + 1) / 2 + 1;
-    typedef pmacc::math::CT::Int<currentLowerMargin, currentLowerMargin, currentLowerMargin> LowerMargin;
-    typedef pmacc::math::CT::Int<currentUpperMargin, currentUpperMargin, currentUpperMargin> UpperMargin;
-
-    PMACC_CASSERT_MSG(
-        __EsirkepovNative_supercell_or_number_of_guard_supercells_is_too_small_for_stencil,
-        pmacc::math::CT::min<
-            typename pmacc::math::CT::mul<
-                SuperCellSize,
-                GuardSize
-            >::type
-        >::type::value >= currentLowerMargin &&
-        pmacc::math::CT::min<
-            typename pmacc::math::CT::mul<
-                SuperCellSize,
-                GuardSize
-            >::type
-        >::type::value >= currentUpperMargin
-    );
-
-    /* iterate over all grid points */
-    static constexpr int begin = -currentLowerMargin;
-    static constexpr int end = currentUpperMargin + 1;
-
-    float_X charge;
-
-    /* At the moment Esirkepov only supports Yee cells where W is defined at origin (0,0,0)
-     *
-     * \todo: please fix me that we can use CenteredCell
-     */
-    template<
-        typename DataBoxJ,
-        typename PosType,
-        typename VelType,
-        typename ChargeType,
-        typename T_Acc
-    >
-    DINLINE void operator()(
-        T_Acc const & acc,
-        DataBoxJ dataBoxJ,
-        const PosType pos,
-        const VelType velocity,
-        const ChargeType charge, const float_X deltaTime
-    )
+    namespace currentSolver
     {
-        this->charge = charge;
-        const float3_X deltaPos = float3_X(velocity.x() * deltaTime / cellSize.x(),
-                                           velocity.y() * deltaTime / cellSize.y(),
-                                           velocity.z() * deltaTime / cellSize.z());
-        const PosType oldPos = pos - deltaPos;
-        Line<float3_X> line(oldPos, pos);
-        auto cursorJ = dataBoxJ.toCursor();
-
         /**
-         * \brief the following three calls separate the 3D current deposition
-         * into three independent 1D calls, each for one direction and current component.
-         * Therefore the coordinate system has to be rotated so that the z-direction
-         * is always specific.
-         */
-
-        using namespace cursor::tools;
-        cptCurrent1D(acc, twistVectorFieldAxes<pmacc::math::CT::Int < 1, 2, 0 > >(cursorJ), rotateOrigin < 1, 2, 0 > (line), cellSize.x());
-        cptCurrent1D(acc, twistVectorFieldAxes<pmacc::math::CT::Int < 2, 0, 1 > >(cursorJ), rotateOrigin < 2, 0, 1 > (line), cellSize.y());
-        cptCurrent1D(acc, cursorJ, line, cellSize.z());
-    }
-
-    /**
-     * deposites current in z-direction
-     * \param cursorJ cursor pointing at the current density field of the particle's cell
-     * \param line trajectory of the particle from to last to the current time step
-     * \param cellEdgeLength length of edge of the cell in z-direction
-     */
-    template<
-        typename CursorJ,
-        typename T_Acc
-    >
-    DINLINE void cptCurrent1D(
-        T_Acc const & acc,
-        CursorJ cursorJ,
-        const Line<float3_X>& line,
-        const float_X cellEdgeLength
-    )
-    {
-        /* pick every cell in the xy-plane that is overlapped by particle's
-         * form factor and deposit the current for the cells above and beneath
-         * that cell and for the cell itself.
+         * Implements the current deposition algorithm from T.Zh. Esirkepov
+         *
+         * for an arbitrary particle assign function given as a template parameter.
+         * See available shapes at "intermediateLib/particleShape".
+         * paper: "Exact charge conservation scheme for Particle-in-Cell simulation with an arbitrary form-factor"
+         * @tparam T_Strategy provides `BlockReductionOp`, the functor used to add to the current density field
          */
-        for (int i = begin; i < end; ++i)
+        template<typename T_ParticleShape, typename T_Strategy>
+        struct EsirkepovNative
         {
-            for (int j = begin; j < end; ++j)
+            using ParticleAssign = typename T_ParticleShape::ChargeAssignment;
+            static constexpr int supp = ParticleAssign::support;
+
+            static constexpr int currentLowerMargin = supp / 2 + 1;
+            static constexpr int currentUpperMargin = (supp + 1) / 2 + 1;
+            typedef pmacc::math::CT::Int<currentLowerMargin, currentLowerMargin, currentLowerMargin> LowerMargin;
+            typedef pmacc::math::CT::Int<currentUpperMargin, currentUpperMargin, currentUpperMargin> UpperMargin;
+
+            PMACC_CASSERT_MSG(
+                __EsirkepovNative_supercell_or_number_of_guard_supercells_is_too_small_for_stencil,
+                pmacc::math::CT::min<typename pmacc::math::CT::mul<SuperCellSize, GuardSize>::type>::type::value
+                        >= currentLowerMargin
+                    && pmacc::math::CT::min<typename pmacc::math::CT::mul<SuperCellSize, GuardSize>::type>::type::value
+                        >= currentUpperMargin);
+
+            /* iterate over all grid points */
+            static constexpr int begin = -currentLowerMargin;
+            static constexpr int end = currentUpperMargin + 1;
+
+            float_X charge;
+
+            /* At the moment Esirkepov only supports Yee cells where W is defined at origin (0,0,0)
+             *
+             * \todo: please fix me that we can use CenteredCell
+             */
+            template<typename DataBoxJ, typename PosType, typename VelType, typename ChargeType, typename T_Acc>
+            DINLINE void operator()(
+                T_Acc const& acc,
+                DataBoxJ dataBoxJ,
+                const PosType pos,
+                const VelType velocity,
+                const ChargeType charge,
+                const float_X deltaTime)
             {
-                float_X tmp =
-                    S0(line, i, 1) * S0(line, j, 2) +
-                    float_X(0.5) * DS(line, i, 1) * S0(line, j, 2) +
-                    float_X(0.5) * S0(line, i, 1) * DS(line, j, 2) +
-                    (float_X(1.0) / float_X(3.0)) * DS(line, i, 1) * DS(line, j, 2);
-
-                float_X accumulated_J = float_X(0.0);
-                for (int k = begin; k < end; ++k)
+                this->charge = charge;
+                const float3_X deltaPos = velocity * deltaTime / cellSize;
+                const PosType oldPos = pos - deltaPos;
+                const Line<float3_X> line(oldPos, pos);
+                auto cursorJ = dataBoxJ.toCursor();
+
+                /**
+                 * \brief the following three calls separate the 3D current deposition
+                 * into three independent 1D calls, each for one direction and current component.
+                 * Therefore the coordinate system has to be rotated so that the z-direction
+                 * is always specific.
+                 */
+
+                using namespace cursor::tools;
+                cptCurrent1D(
+                    acc,
+                    twistVectorFieldAxes<pmacc::math::CT::Int<1, 2, 0>>(cursorJ),
+                    rotateOrigin<1, 2, 0>(line),
+                    cellSize.x());
+                cptCurrent1D(
+                    acc,
+                    twistVectorFieldAxes<pmacc::math::CT::Int<2, 0, 1>>(cursorJ),
+                    rotateOrigin<2, 0, 1>(line),
+                    cellSize.y());
+                cptCurrent1D(acc, cursorJ, line, cellSize.z());
+            }
+
+            /** deposits current in z-direction
+             * \param acc accelerator object, forwarded to the reduction functor of T_Strategy
+             * \param cursorJ cursor pointing at the current density field of the particle's cell
+             * \param line trajectory of the particle from the last to the current time step
+             * \param cellEdgeLength length of edge of the cell in z-direction
+             */
+            template<typename CursorJ, typename T_Acc>
+            DINLINE void cptCurrent1D(
+                T_Acc const& acc,
+                CursorJ cursorJ,
+                const Line<float3_X>& line,
+                const float_X cellEdgeLength)
+            {
+                /* pick every cell in the xy-plane that is overlapped by particle's
+                 * form factor and deposit the current for the cells above and beneath
+                 * that cell and for the cell itself.
+                 */
+                for(int i = begin; i < end; ++i)
                 {
-                    float_X W = DS(line, k, 3) * tmp;
-                    /* We multiply with `cellEdgeLength` due to the fact that the attribute for the
-                     * in-cell particle `position` (and it's change in DELTA_T) is normalize to [0,1) */
-                    accumulated_J += -this->charge * (float_X(1.0) / float_X(CELL_VOLUME * DELTA_T)) * W * cellEdgeLength;
-                    atomicAdd(&((*cursorJ(i, j, k)).z()), accumulated_J, ::alpaka::hierarchy::Threads{});
+                    for(int j = begin; j < end; ++j)
+                    {
+                        float_X tmp = S0(line, i, 1) * S0(line, j, 2) + float_X(0.5) * DS(line, i, 1) * S0(line, j, 2)
+                            + float_X(0.5) * S0(line, i, 1) * DS(line, j, 2)
+                            + (float_X(1.0) / float_X(3.0)) * DS(line, i, 1) * DS(line, j, 2);
+
+                        float_X accumulated_J = float_X(0.0);
+                        for(int k = begin; k < end; ++k)
+                        {
+                            const float_X W = DS(line, k, 3) * tmp;
+                            /* We multiply with `cellEdgeLength` due to the fact that the attribute for the
+                             * in-cell particle `position` (and its change in DELTA_T) is normalized to [0,1) */
+                            accumulated_J += -this->charge * (float_X(1.0) / float_X(CELL_VOLUME * DELTA_T)) * W
+                                * cellEdgeLength;
+                            auto const atomicOp = typename T_Strategy::BlockReductionOp{};
+                            atomicOp(acc, (*cursorJ(i, j, k)).z(), accumulated_J);
+                        }
+                    }
                 }
             }
-        }
-
-    }
 
-    /** calculate S0 (see paper)
-     * @param line element with previous and current position of the particle
-     * @param gridPoint used grid point to evaluate assignment shape
-     * @param d dimension range {1,2,3} means {x,y,z}
-     *        same like in Esirkepov paper (FORTAN style)
-     */
-    DINLINE float_X S0(const Line<float3_X>& line, const float_X gridPoint, const float_X d)
-    {
-        return ParticleAssign()(gridPoint - line.m_pos0[d - 1]);
-    }
-
-    /** calculate DS (see paper)
-     * @param line element with previous and current position of the particle
-     * @param gridPoint used grid point to evaluate assignment shape
-     * @param d dimension range {1,2,3} means {x,y,z}
-     *        same like in Esirkepov paper (FORTAN style)
-     */
-    DINLINE float_X DS(const Line<float3_X>& line, const float_X gridPoint, const float_X d)
-    {
-        return ParticleAssign()(gridPoint - line.m_pos1[d - 1]) - ParticleAssign()(gridPoint - line.m_pos0[d - 1]);
-    }
+            /** calculate S0 (see paper): assignment shape evaluated relative to line.m_pos0
+             * @param line element with previous and current position of the particle
+             * @param gridPoint used grid point to evaluate assignment shape
+             * @param d dimension range {1,2,3} means {x,y,z}; one-based like in Esirkepov paper (Fortran style),
+             *        integral because `d - 1` is used as component index
+             */
+            DINLINE float_X S0(const Line<float3_X>& line, const float_X gridPoint, const uint32_t d)
+            {
+                return ParticleAssign()(gridPoint - line.m_pos0[d - 1]);
+            }
 
-    static pmacc::traits::StringProperty getStringProperties()
-    {
-        pmacc::traits::StringProperty propList( "name", "Esirkepov" );
-        propList["param"] = "native implementation";
-        return propList;
-    }
-};
+            /** calculate DS (see paper): assignment shape at line.m_pos1 minus the one at line.m_pos0
+             * @param line element with previous and current position of the particle
+             * @param gridPoint used grid point to evaluate assignment shape
+             * @param d dimension range {1,2,3} means {x,y,z}; one-based like in Esirkepov paper (Fortran style),
+             *        integral because `d - 1` is used as component index
+             */
+            DINLINE float_X DS(const Line<float3_X>& line, const float_X gridPoint, const uint32_t d)
+            {
+                return ParticleAssign()(gridPoint - line.m_pos1[d - 1])
+                    - ParticleAssign()(gridPoint - line.m_pos0[d - 1]);
+            }
 
-} //namespace currentSolver
+            static pmacc::traits::StringProperty getStringProperties()
+            {
+                pmacc::traits::StringProperty solverProperties("name", "Esirkepov");
+                solverProperties["param"] = "native implementation";
+                return solverProperties;
+            }
+        };
 
-namespace traits
-{
+    } // namespace currentSolver
 
-/*Get margin of a solver
- * class must define a LowerMargin and UpperMargin
- */
-template<typename T_ParticleShape>
-struct GetMargin<picongpu::currentSolver::EsirkepovNative<T_ParticleShape> >
-{
-private:
-    typedef picongpu::currentSolver::EsirkepovNative<T_ParticleShape> Solver;
-public:
-    typedef typename Solver::LowerMargin LowerMargin;
-    typedef typename Solver::UpperMargin UpperMargin;
-};
+    namespace traits
+    {
+        /*Get margin of a solver
+         * class must define a LowerMargin and UpperMargin
+         */
+        template<typename T_ParticleShape, typename T_Strategy>
+        struct GetMargin<picongpu::currentSolver::EsirkepovNative<T_ParticleShape, T_Strategy>>
+        {
+        private:
+            using Solver = picongpu::currentSolver::EsirkepovNative<T_ParticleShape, T_Strategy>;
 
-} //namespace traits
+        public:
+            using LowerMargin = typename Solver::LowerMargin;
+            using UpperMargin = typename Solver::UpperMargin;
+        };
 
-} //namespace picongpu
+    } // namespace traits
+} // namespace picongpu
diff --git a/include/picongpu/fields/currentDeposition/Esirkepov/Line.hpp b/include/picongpu/fields/currentDeposition/Esirkepov/Line.hpp
index 60b2ad8aa0..4fd8b59f4f 100644
--- a/include/picongpu/fields/currentDeposition/Esirkepov/Line.hpp
+++ b/include/picongpu/fields/currentDeposition/Esirkepov/Line.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera
+/* Copyright 2013-2021 Heiko Burau, Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -18,7 +18,6 @@
  */
 
 
-
 #pragma once
 
 #include "picongpu/simulation_defines.hpp"
@@ -26,78 +25,79 @@
 
 namespace picongpu
 {
-namespace currentSolver
-{
-using namespace pmacc;
-
-template<typename T_Type>
-struct Line
-{
-    using type = T_Type;
-
-    type m_pos0;
-    type m_pos1;
-
-    DINLINE Line()
+    namespace currentSolver
     {
-    }
-
-    DINLINE Line(const type& pos0, const type & pos1) : m_pos0(pos0), m_pos1(pos1)
-    {
-    }
-
-    DINLINE Line<type>& operator-=(const type & rhs)
-    {
-        m_pos0 -= rhs;
-        m_pos1 -= rhs;
-        return *this;
-    }
-};
-
-template<typename T_Type>
-DINLINE Line<T_Type> operator-(const Line<T_Type>& lhs, const T_Type& rhs)
-{
-    return Line<T_Type>(lhs.m_pos0 - rhs, lhs.m_pos1 - rhs);
-}
-
-template<typename T_Type>
-DINLINE Line<T_Type> operator-(const T_Type& lhs, const Line<T_Type>& rhs)
-{
-    return Line<T_Type>(lhs - rhs.m_pos0, lhs - rhs.m_pos1);
-}
-
-///auxillary function to rotate a vector
-
-template<int newXAxis, int newYAxis, int newZAxis>
-DINLINE float3_X rotateOrigin(const float3_X& vec)
-{
-    return float3_X(vec[newXAxis], vec[newYAxis], vec[newZAxis]);
-}
-
-template<int newXAxis, int newYAxis>
-DINLINE float2_X rotateOrigin(const float2_X& vec)
-{
-    return float2_X(vec[newXAxis], vec[newYAxis]);
-}
-///auxillary function to rotate a line
-
-template<int newXAxis, int newYAxis, int newZAxis,typename T_Type>
-DINLINE Line<T_Type> rotateOrigin(const Line<T_Type>& line)
-{
-    Line<T_Type> result(rotateOrigin<newXAxis, newYAxis, newZAxis > (line.m_pos0),
-                rotateOrigin<newXAxis, newYAxis, newZAxis > (line.m_pos1));
-    return result;
-}
-
-template<int newXAxis, int newYAxis,typename T_Type>
-DINLINE Line<T_Type> rotateOrigin(const Line<T_Type>& line)
-{
-    Line<T_Type> result(rotateOrigin<newXAxis, newYAxis > (line.m_pos0),
-                rotateOrigin<newXAxis, newYAxis > (line.m_pos1));
-    return result;
-}
-
-} //namespace currentSolver
-
-} //namespace picongpu
-
+        using namespace pmacc;
+
+        template<typename T_Type>
+        struct Line // pair of positions m_pos0/m_pos1 of type T_Type
+        {
+            using type = T_Type;
+
+            type m_pos0; // first end point
+            type m_pos1; // second end point
+
+            DINLINE Line() // does not assign the end points explicitly
+            {
+            }
+
+            DINLINE Line(const type& pos0, const type& pos1) : m_pos0(pos0), m_pos1(pos1)
+            {
+            }
+
+            DINLINE Line<type>& operator-=(const type& rhs) // subtracts rhs from both end points
+            {
+                m_pos0 -= rhs;
+                m_pos1 -= rhs;
+                return *this;
+            }
+        };
+
+        template<typename T_Type>
+        DINLINE Line<T_Type> operator-(const Line<T_Type>& lhs, const T_Type& rhs)
+        {
+            return Line<T_Type>(lhs.m_pos0 - rhs, lhs.m_pos1 - rhs); // new line: both end points of lhs minus rhs
+        }
+
+        template<typename T_Type>
+        DINLINE Line<T_Type> operator-(const T_Type& lhs, const Line<T_Type>& rhs)
+        {
+            return Line<T_Type>(lhs - rhs.m_pos0, lhs - rhs.m_pos1); // new line: lhs minus each end point of rhs
+        }
+
+        /// auxiliary function to rotate a vector
+        /// (the result holds the components of vec in the order newXAxis, newYAxis, newZAxis)
+        template<int newXAxis, int newYAxis, int newZAxis>
+        DINLINE float3_X rotateOrigin(const float3_X& vec)
+        {
+            return float3_X(vec[newXAxis], vec[newYAxis], vec[newZAxis]);
+        }
+
+        template<int newXAxis, int newYAxis>
+        DINLINE float2_X rotateOrigin(const float2_X& vec)
+        {
+            return float2_X(vec[newXAxis], vec[newYAxis]); // components of vec in the order newXAxis, newYAxis
+        }
+        /// auxiliary function to rotate a line:
+        /// both end points are rotated with the same axis permutation
+        template<int newXAxis, int newYAxis, int newZAxis, typename T_Type>
+        DINLINE Line<T_Type> rotateOrigin(const Line<T_Type>& line)
+        {
+            // build the rotated line directly from the two rotated end points
+            return Line<T_Type>(
+                rotateOrigin<newXAxis, newYAxis, newZAxis>(line.m_pos0),
+                rotateOrigin<newXAxis, newYAxis, newZAxis>(line.m_pos1));
+        }
+
+        template<int newXAxis, int newYAxis, typename T_Type>
+        DINLINE Line<T_Type> rotateOrigin(const Line<T_Type>& line)
+        {
+            // two-axis variant: both end points are rotated with the same axis permutation
+            return Line<T_Type>(
+                rotateOrigin<newXAxis, newYAxis>(line.m_pos0),
+                rotateOrigin<newXAxis, newYAxis>(line.m_pos1));
+        }
+
+    } // namespace currentSolver
+
+} // namespace picongpu
diff --git a/include/picongpu/fields/currentDeposition/RelayPoint.hpp b/include/picongpu/fields/currentDeposition/RelayPoint.hpp
index 285f7d7671..731bc68c0f 100644
--- a/include/picongpu/fields/currentDeposition/RelayPoint.hpp
+++ b/include/picongpu/fields/currentDeposition/RelayPoint.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2016-2020 Rene Widera
+/* Copyright 2016-2021 Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -23,68 +23,56 @@
 
 namespace picongpu
 {
-namespace currentSolver
-{
-    template< bool isEven >
-    struct RelayPoint
+    namespace currentSolver
     {
-       /** calculate virtual point were we split our particle trajectory
-        *
-        * The relay point calculation differs from the ZigZag paper version in the point
-        * that the trajectory of a particle which does not leave the cell is not split.
-        * The relay point for a particle which does not leave the cell is set to the
-        * current position `x_2`
-        *
-        * If `i_1 == i_2` than the trajectory is not split.
-        *
-        * This function assumes that the shape in later steps is always evaluated
-        * at grid integral points.
-        *
-        * @param i_1[out] offset to shift the coordinate system for the first
-        *                 particle at position x_1
-        * @param i_2[out] offset to shift the coordinate system for the second
-        *                 particle at position x_2
-        * @param x_1 begin position of the particle trajectory
-        * @param x_2 end position of the particle trajectory
-        * @return relay point for particle trajectory
-        */
-        DINLINE float_X
-        operator( )(
-            int& i_1,
-            int& i_2,
-            const float_X x_1,
-            const float_X x_2
-        ) const
+        template<bool isEven>
+        struct RelayPoint
         {
-            using namespace pmacc;
-            i_1 = math::floor( x_1 );
-            i_2 = math::floor( x_2 );
+            /** calculate virtual point where we split our particle trajectory
+             *
+             * The relay point calculation differs from the ZigZag paper version in the point
+             * that the trajectory of a particle which does not leave the cell is not split.
+             * The relay point for a particle which does not leave the cell is set to the
+             * current position `x_2`
+             *
+             * If `i_1 == i_2` then the trajectory is not split.
+             *
+             * This function assumes that the shape in later steps is always evaluated
+             * at grid integral points.
+             *
+             * @param i_1[out] offset to shift the coordinate system for the first
+             *                 particle at position x_1
+             * @param i_2[out] offset to shift the coordinate system for the second
+             *                 particle at position x_2
+             * @param x_1 begin position of the particle trajectory
+             * @param x_2 end position of the particle trajectory
+             * @return relay point for particle trajectory
+             */
+            DINLINE float_X operator()(int& i_1, int& i_2, const float_X x_1, const float_X x_2) const
+            {
+                using namespace pmacc;
+                i_1 = math::floor(x_1);
+                i_2 = math::floor(x_2);
 
-            return i_1 == i_2 ? x_2 : math::max( i_1, i_2 );
-        }
-    };
+                return i_1 == i_2 ? x_2 : math::max(i_1, i_2);
+            }
+        };
 
-    template<>
-    struct RelayPoint< false >
-    {
-       /** calculate virtual point were we split our particle trajectory
-        *
-        * @see RelayPoint< >::operator( ) description
-        */
-        DINLINE float_X
-        operator( )(
-            int& i_1,
-            int& i_2,
-            const float_X x_1,
-            const float_X x_2
-        ) const
+        template<>
+        struct RelayPoint<false>
         {
-            i_1 = math::float2int_rd( x_1 + float_X( 0.5 ) );
-            i_2 = math::float2int_rd( x_2 + float_X( 0.5 ) );
+            /** calculate virtual point where we split our particle trajectory
+             *
+             * @see RelayPoint< >::operator( ) description
+             */
+            DINLINE float_X operator()(int& i_1, int& i_2, const float_X x_1, const float_X x_2) const
+            {
+                i_1 = pmacc::math::float2int_rd(x_1 + float_X(0.5));
+                i_2 = pmacc::math::float2int_rd(x_2 + float_X(0.5));
 
-            return i_1 == i_2 ? x_2 : float_X( i_1 + i_2 )/float_X( 2.0 );
-        }
-    };
+                return i_1 == i_2 ? x_2 : float_X(i_1 + i_2) / float_X(2.0);
+            }
+        };
 
-} // namespace currentSolver
+    } // namespace currentSolver
 } // namespace picongpu
diff --git a/include/picongpu/fields/currentDeposition/Solver.def b/include/picongpu/fields/currentDeposition/Solver.def
index 43c90bdf7f..354c6cbad9 100644
--- a/include/picongpu/fields/currentDeposition/Solver.def
+++ b/include/picongpu/fields/currentDeposition/Solver.def
@@ -1,4 +1,4 @@
-/* Copyright 2014-2020 Rene Widera
+/* Copyright 2014-2021 Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -18,9 +18,10 @@
  */
 
 
+#include "picongpu/fields/currentDeposition/Strategy.def"
 #include "picongpu/fields/currentDeposition/Esirkepov/Esirkepov.def"
 #include "picongpu/fields/currentDeposition/EmZ/EmZ.def"
 
-#if(SIMDIM==DIM3)
-#include "picongpu/fields/currentDeposition/VillaBune/CurrentVillaBune.def"
+#if(SIMDIM == DIM3)
+#    include "picongpu/fields/currentDeposition/VillaBune/CurrentVillaBune.def"
 #endif
diff --git a/include/picongpu/fields/currentDeposition/Solver.hpp b/include/picongpu/fields/currentDeposition/Solver.hpp
index 3f4bbc6908..064a5883bd 100644
--- a/include/picongpu/fields/currentDeposition/Solver.hpp
+++ b/include/picongpu/fields/currentDeposition/Solver.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2014-2020 Rene Widera
+/* Copyright 2014-2021 Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -22,6 +22,6 @@
 #include "picongpu/fields/currentDeposition/Esirkepov/EsirkepovNative.hpp"
 #include "picongpu/fields/currentDeposition/EmZ/EmZ.hpp"
 
-#if(SIMDIM==DIM3)
-#include "picongpu/fields/currentDeposition/VillaBune/CurrentVillaBune.hpp"
+#if(SIMDIM == DIM3)
+#    include "picongpu/fields/currentDeposition/VillaBune/CurrentVillaBune.hpp"
 #endif
diff --git a/include/picongpu/fields/currentDeposition/Strategy.def b/include/picongpu/fields/currentDeposition/Strategy.def
new file mode 100644
index 0000000000..629276bc51
--- /dev/null
+++ b/include/picongpu/fields/currentDeposition/Strategy.def
@@ -0,0 +1,216 @@
+/* Copyright 2020-2021 Rene Widera
+ *
+ * This file is part of PIConGPU.
+ *
+ * PIConGPU is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * PIConGPU is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with PIConGPU.
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include <pmacc/nvidia/functors/Add.hpp>
+#include <pmacc/nvidia/functors/Atomic.hpp>
+#include <pmacc/types.hpp>
+
+
+namespace picongpu
+{
+    namespace currentSolver
+    {
+        namespace strategy
+        {
+            namespace detail
+            {
+                /** Sanitize the worker multiplier of a strategy
+                 *
+                 * @param multiplicator Requested factor by which the number of threads of the compute current
+                 * task/kernel is oversubscribed.
+                 * @return multiplicator if it is >= 1, else 1; always 1 if the HIP branch below is compiled
+                 */
+                constexpr int validateAndAdjustWorkerMultiplier(int const multiplicator)
+                {
+#if BOOST_COMP_HIP && PIC_COMPUTE_CURRENT_THREAD_LIMITER
+                    // HIP-clang creates wrong results if more threads than particles in a frame will be used
+                    return 1;
+#else
+                    return multiplicator < 1 ? 1 : multiplicator;
+#endif
+                }
+            } // namespace detail
+
+            /** Work on strided supercell domains with local caching strategy
+             *
+             * The current for each particle will be reduced with atomic operations into a supercell
+             * local cache. The cache will be flushed to the global memory without atomics.
+             * The device local domain of fieldJ will be decomposed with a checkerboard pattern.
+             *
+             * Suggestion: Use this strategy if atomic operations to global memory are slow.
+             * To utilize the device fully you should have enough supercells
+             *   - 2D: minimum multiprocessor count * 9 * 4
+             *   - 3D: minimum multiprocessor count * 27 * 4
+             *
+             * @{
+             */
+            struct StridedCachedSupercells
+            {
+                static constexpr bool useBlockCache = true; // reduce into a supercell local cache first
+                static constexpr bool stridedMapping = true; // presumably selects the checkerboard decomposition
+                using BlockReductionOp = nvidia::functors::Atomic<::alpaka::AtomicAdd, ::alpaka::hierarchy::Threads>;
+                using GridReductionOp = nvidia::functors::Add; // non-atomic; presumably used to flush the cache
+                static constexpr int workerMultiplier = 1; // no oversubscription of workers
+            };
+
+            /** @tparam T_workerMultiplier Oversubscribe the number of workers used to compute the current by the given
+             * multiplier. Can be used to optimize the device occupancy. Values smaller than 1 are replaced by 1.
+             */
+            template<int T_workerMultiplier>
+            struct StridedCachedSupercellsScaled
+            {
+                static constexpr bool useBlockCache = true; // reduce into a supercell local cache first
+                static constexpr bool stridedMapping = true; // presumably selects the checkerboard decomposition
+                using BlockReductionOp = nvidia::functors::Atomic<::alpaka::AtomicAdd, ::alpaka::hierarchy::Threads>;
+                using GridReductionOp = nvidia::functors::Add; // non-atomic; presumably used to flush the cache
+                static constexpr int workerMultiplier = detail::validateAndAdjustWorkerMultiplier(T_workerMultiplier);
+            };
+
+            /** @} */
+
+            /** Local caching strategy
+             *
+             * The current for each particle will be reduced with atomic operations into a supercell
+             * local cache. The cache will be flushed with atomic operations to the global memory.
+             *
+             * Suggestion: Use this strategy if block local and global atomics are fast.
+             *
+             * @{
+             */
+            struct CachedSupercells
+            {
+                static constexpr bool useBlockCache = true; // reduce into a supercell local cache first
+                static constexpr bool stridedMapping = false; // presumably: no checkerboard decomposition
+                using BlockReductionOp = nvidia::functors::Atomic<::alpaka::AtomicAdd, ::alpaka::hierarchy::Threads>;
+                using GridReductionOp = nvidia::functors::Atomic<::alpaka::AtomicAdd, ::alpaka::hierarchy::Blocks>;
+                static constexpr int workerMultiplier = 1; // no oversubscription of workers
+            };
+
+            /** @tparam T_workerMultiplier Oversubscribe the number of workers used to compute the current by the given
+             * multiplier. Can be used to optimize the device occupancy. Values smaller than 1 are replaced by 1.
+             */
+            template<int T_workerMultiplier>
+            struct CachedSupercellsScaled
+            {
+                static constexpr bool useBlockCache = true; // reduce into a supercell local cache first
+                static constexpr bool stridedMapping = false; // presumably: no checkerboard decomposition
+                using BlockReductionOp = nvidia::functors::Atomic<::alpaka::AtomicAdd, ::alpaka::hierarchy::Threads>;
+                using GridReductionOp = nvidia::functors::Atomic<::alpaka::AtomicAdd, ::alpaka::hierarchy::Blocks>;
+                static constexpr int workerMultiplier = detail::validateAndAdjustWorkerMultiplier(T_workerMultiplier);
+            };
+
+            /** @} */
+
+            /** Non-cached strategy
+             *
+             * The current of each particle is reduced with atomic operations directly
+             * to the global memory.
+             *
+             * Suggestion: Use this strategy if global atomics are fast and random memory access
+             * to a large range in memory is not a bottleneck.
+             *
+             * @{
+             */
+            struct NonCachedSupercells
+            {
+                static constexpr bool useBlockCache = false;
+                static constexpr bool stridedMapping = false;
+                using BlockReductionOp = nvidia::functors::Atomic<::alpaka::AtomicAdd, ::alpaka::hierarchy::Blocks>;
+                // dummy type which produces a compile-time error if used
+                using GridReductionOp = void;
+                static constexpr int workerMultiplier = 1;
+            };
+
+            /** @tparam T_workerMultiplier Oversubscribe the number of workers used to compute the current by the given
+             * multiplier (after detail::validateAndAdjustWorkerMultiplier). Can be used to optimize occupancy.
+             */
+            template<int T_workerMultiplier>
+            struct NonCachedSupercellsScaled
+            {
+                static constexpr bool useBlockCache = false;
+                static constexpr bool stridedMapping = false;
+                using BlockReductionOp = nvidia::functors::Atomic<::alpaka::AtomicAdd, ::alpaka::hierarchy::Blocks>;
+                // dummy type which produces a compile-time error if used
+                using GridReductionOp = void;
+                static constexpr int workerMultiplier = detail::validateAndAdjustWorkerMultiplier(T_workerMultiplier);
+            };
+
+            /** @} */
+
+        } // namespace strategy
+
+        namespace traits
+        {
+            /** Get current deposition strategy from a solver
+             *
+             * @tparam T_Solver type to derive the strategy
+             * @treturn ::type strategy description
+             */
+            template<typename T_Solver>
+            struct GetStrategy;
+
+            /** Get current deposition strategy from a solver
+             *
+             * @see GetStrategy
+             */
+            template<typename T_Solver>
+            using GetStrategy_t = typename GetStrategy<T_Solver>::type;
+
+            /** Default strategy for the current deposition
+             *
+             * Selected based on the cupla accelerator; this primary template yields StridedCachedSupercells.
+             *
+             * @tparam T_Acc the accelerator type
+             */
+            template<typename T_Acc = cupla::AccThreadSeq>
+            struct GetDefaultStrategy
+            {
+                using type = strategy::StridedCachedSupercells;
+            };
+
+            /** Default strategy for the current deposition
+             *
+             * @see GetDefaultStrategy
+             */
+            template<typename T_Acc = cupla::AccThreadSeq>
+            using GetDefaultStrategy_t = typename GetDefaultStrategy<T_Acc>::type;
+
+#if(ALPAKA_ACC_GPU_CUDA_ENABLED == 1)
+            template<typename... T_Args>
+            struct GetDefaultStrategy<alpaka::AccGpuCudaRt<T_Args...>>
+            {
+                // CUDA: GPU utilization is higher with this strategy compared to `StridedCachedSupercells`
+                using type = strategy::CachedSupercells;
+            };
+#endif
+
+#if(ALPAKA_ACC_GPU_HIP_ENABLED == 1)
+            template<typename... T_Args>
+            struct GetDefaultStrategy<alpaka::AccGpuHipRt<T_Args...>>
+            {
+                // HIP: GPU utilization is higher with this strategy compared to `StridedCachedSupercells`
+                using type = strategy::CachedSupercells;
+            };
+#endif
+
+        } // namespace traits
+    } // namespace currentSolver
+} // namespace picongpu
diff --git a/include/picongpu/fields/currentDeposition/VillaBune/CurrentVillaBune.def b/include/picongpu/fields/currentDeposition/VillaBune/CurrentVillaBune.def
index cda17a43af..cacdafa945 100644
--- a/include/picongpu/fields/currentDeposition/VillaBune/CurrentVillaBune.def
+++ b/include/picongpu/fields/currentDeposition/VillaBune/CurrentVillaBune.def
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Heiko Burau, Rene Widera
+/* Copyright 2013-2021 Axel Huebl, Heiko Burau, Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -17,23 +17,39 @@
  * If not, see <http://www.gnu.org/licenses/>.
  */
 
-
 #pragma once
 
 #include <pmacc/types.hpp>
 #include "picongpu/simulation_defines.hpp"
 #include "picongpu/particles/shapes/CIC.hpp"
+#include "picongpu/fields/currentDeposition/Strategy.def"
 
 
 namespace picongpu
 {
-namespace currentSolver
-{
-using namespace pmacc;
-
-template<typename ParticleShape = picongpu::particles::shapes::CIC>
-struct VillaBune;
-
-} //namespace currentSolver
-
-} //namespace picongpu
+    namespace currentSolver
+    {
+        /** Current deposition algorithm from J. Villasenor and O. Buneman
+         *
+         * paper: J. Villasenor and O. Buneman. Rigorous charge conservation for local
+         * electromagnetic field solvers. Computer Physics Communications, 69:306, 1992.
+         * https://doi.org/10.1016/0010-4655(92)90169-Y
+         *
+         * @tparam T_ParticleShape the particle shape for the species, supports only [picongpu::particles::shapes::CIC]
+         * @tparam T_Strategy strategy used to reduce the scattered data [currentSolver::strategy]
+         */
+        template<
+            typename T_ParticleShape = picongpu::particles::shapes::CIC,
+            typename T_Strategy = traits::GetDefaultStrategy_t<>>
+        struct VillaBune;
+
+        namespace traits
+        {
+            template<typename T_ParticleShape, typename T_Strategy>
+            struct GetStrategy<VillaBune<T_ParticleShape, T_Strategy>>
+            {
+                using type = T_Strategy; // the strategy is the second template argument of the solver
+            };
+        } // namespace traits
+    } // namespace currentSolver
+} // namespace picongpu
diff --git a/include/picongpu/fields/currentDeposition/VillaBune/CurrentVillaBune.hpp b/include/picongpu/fields/currentDeposition/VillaBune/CurrentVillaBune.hpp
index f0bb7cbe97..a9d33928bc 100644
--- a/include/picongpu/fields/currentDeposition/VillaBune/CurrentVillaBune.hpp
+++ b/include/picongpu/fields/currentDeposition/VillaBune/CurrentVillaBune.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Heiko Burau, Rene Widera
+/* Copyright 2013-2021 Axel Huebl, Heiko Burau, Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -20,274 +20,282 @@
 #pragma once
 
 #include <pmacc/types.hpp>
+#include "picongpu/fields/currentDeposition/VillaBune/CurrentVillaBune.def"
 #include "picongpu/simulation_defines.hpp"
+#include "picongpu/particles/shapes/CIC.hpp"
 #include <pmacc/dimensions/DataSpace.hpp>
 #include <pmacc/math/Vector.hpp>
-#include <pmacc/nvidia/atomic.hpp>
 
-#include "picongpu/particles/shapes/CIC.hpp"
+#include <type_traits>
 
 
 namespace picongpu
 {
-namespace currentSolver
-{
-using namespace pmacc;
-
-template<typename T_ParticleShape>
-struct VillaBune
-{
-    template<class BoxJ, typename PosType, typename VelType, typename ChargeType, typename T_Acc >
-    DINLINE void operator()(const T_Acc& acc,
-                            BoxJ& boxJ_par, /*box which is shifted to particles cell*/
-                            const PosType pos,
-                            const VelType velocity,
-                            const ChargeType charge, const float_X deltaTime)
-    {
-        /* VillaBune: field to particle interpolation _requires_ the CIC shape */
-        PMACC_CASSERT_MSG_TYPE(currentSolverVillaBune_requires_shapeCIC_in_particleConfig,
-                    T_ParticleShape,
-                    T_ParticleShape::support == 2);
-
-        // normalize deltaPos to innerCell units [0.; 1.)
-        //   that means: dx_real   = v.x() * dt
-        //               dx_inCell = v.x() * dt / cellSize.x()
-        const float3_X deltaPos(
-                                velocity.x() * deltaTime / cellSize.x(),
-                                velocity.y() * deltaTime / cellSize.y(),
-                                velocity.z() * deltaTime / cellSize.z());
-
-        const PosType oldPos = (PosType) (precisionCast<float_X > (pos) - deltaPos);
-
-        addCurrentSplitX(acc, oldPos, pos, charge, boxJ_par, deltaTime);
-    }
-
-    static pmacc::traits::StringProperty getStringProperties()
-    {
-        pmacc::traits::StringProperty propList( "name", "VillaBune" );
-        return propList;
-    }
-
-private:
-    //Splits the [oldPos,newPos] beam into two beams at the x-boundary of the cell
-    //if necessary
-
-    template<
-        typename Buffer,
-        typename T_Acc
-    >
-    DINLINE void addCurrentSplitX(
-        T_Acc const & acc,
-        const float3_X& oldPos,
-        const float3_X& newPos,
-        const float_X charge,
-        Buffer & mem,
-        const float_X deltaTime
-    )
+    namespace currentSolver
     {
-
-        if (math::float2int_rd(oldPos.x()) != math::float2int_rd(newPos.x()))
+        template<typename T_ParticleShape, typename T_Strategy>
+        struct VillaBune
         {
-            const float3_X interPos = intersectXPlane(oldPos, newPos,
-                                                      math::max(math::float2int_rd(oldPos.x()), math::float2int_rd(newPos.x())));
-            addCurrentSplitY(acc, oldPos, interPos, charge, mem, deltaTime);
-            addCurrentSplitY(acc, interPos, newPos, charge, mem, deltaTime);
-            return;
-        }
-        addCurrentSplitY(acc, oldPos, newPos, charge, mem, deltaTime);
-    }
-
-    template<
-        typename Buffer,
-        typename T_Acc
-    >
-    DINLINE void addCurrentToSingleCell(
-        T_Acc const & acc,
-        float3_X meanPos,
-        const float3_X& deltaPos,
-        const float_X charge,
-        Buffer & memIn,
-        const float_X deltaTime
-    )
-    {
-        //shift to the cell meanPos belongs to
-        //because meanPos may exceed the range [0,1)
-        DataSpace<DIM3> off(math::float2int_rd(meanPos.x()),
-                            math::float2int_rd(meanPos.y()),
-                            math::float2int_rd(meanPos.z()));
-
-        auto mem = memIn.shift(off);
-
-        //fit meanPos into the range [0,1)
-        meanPos.x() -= math::floor(meanPos.x());
-        meanPos.y() -= math::floor(meanPos.y());
-        meanPos.z() -= math::floor(meanPos.z());
-
-        //for the formulas used in here see Villasenor/Buneman paper page 314
-        const float_X tmp = deltaPos.x() * deltaPos.y() * deltaPos.z() * (float_X(1.0) / float_X(12.0));
-
-        // j = rho * v
-        //   = rho * dr / dt
-        //const float_X rho = charge * (1.0 / (CELL_WIDTH * CELL_HEIGHT * CELL_DEPTH));
-        //const float_X rho_dt = rho * (1.0 / deltaTime);
-
-        // now carefully:
-        // deltaPos is in "inCell" coordinates, that means:
-        //   deltaPos.x() = deltaPos_real.x() / cellSize.x()
-        // to calculate the current density in realUnits it is
-        //   j.x() = rho * deltaPos_real.x() / dt
-        //       = rho * deltaPos.x() * cellSize.x() / dt
-        // So put adding the constant directly to rho results in:
-        //   const float_X rho_dtX = rho * CELL_WIDTH;
-        //   const float_X rho_dtY = rho * CELL_HEIGHT;
-        //   const float_X rho_dtZ = rho * CELL_DEPTH;
-
-        // This is exactly the same like:
-        // j = Q / A / t
-        //   j.x() = Q.x() * (1.0 / (CELL_HEIGHT * CELL_DEPTH * deltaTime));
-        //   j.y() = Q.y() * (1.0 / (CELL_WIDTH * CELL_DEPTH * deltaTime));
-        //   j.z() = Q.z() * (1.0 / (CELL_WIDTH * CELL_HEIGHT * deltaTime));
-        // with the difference, that (imagine a moving quader)
-        //   Q.x() = charge * deltaPos_real.x() / cellsize.x()
-        //       = charge * deltaPos.x() / 1.0
-        //
-        const float_X rho_dtX = charge * (float_X(1.0) / (CELL_HEIGHT * CELL_DEPTH * deltaTime));
-        const float_X rho_dtY = charge * (float_X(1.0) / (CELL_WIDTH * CELL_DEPTH * deltaTime));
-        const float_X rho_dtZ = charge * (float_X(1.0) / (CELL_WIDTH * CELL_HEIGHT * deltaTime));
-
-        atomicAdd(&(mem[1][1][0].x()), rho_dtX * (deltaPos.x() * meanPos.y() * meanPos.z() + tmp), ::alpaka::hierarchy::Threads{});
-        atomicAdd(&(mem[1][0][0].x()), rho_dtX * (deltaPos.x() * (float_X(1.0) - meanPos.y()) * meanPos.z() - tmp), ::alpaka::hierarchy::Threads{});
-        atomicAdd(&(mem[0][1][0].x()), rho_dtX * (deltaPos.x() * meanPos.y() * (float_X(1.0) - meanPos.z()) - tmp), ::alpaka::hierarchy::Threads{});
-        atomicAdd(&(mem[0][0][0].x()), rho_dtX * (deltaPos.x() * (float_X(1.0) - meanPos.y()) * (float_X(1.0) - meanPos.z()) + tmp), ::alpaka::hierarchy::Threads{});
-
-        atomicAdd(&(mem[1][0][1].y()), rho_dtY * (deltaPos.y() * meanPos.z() * meanPos.x() + tmp), ::alpaka::hierarchy::Threads{});
-        atomicAdd(&(mem[0][0][1].y()), rho_dtY * (deltaPos.y() * (float_X(1.0) - meanPos.z()) * meanPos.x() - tmp), ::alpaka::hierarchy::Threads{});
-        atomicAdd(&(mem[1][0][0].y()), rho_dtY * (deltaPos.y() * meanPos.z() * (float_X(1.0) - meanPos.x()) - tmp), ::alpaka::hierarchy::Threads{});
-        atomicAdd(&(mem[0][0][0].y()), rho_dtY * (deltaPos.y() * (float_X(1.0) - meanPos.z()) * (float_X(1.0) - meanPos.x()) + tmp), ::alpaka::hierarchy::Threads{});
-
-        atomicAdd(&(mem[0][1][1].z()), rho_dtZ * (deltaPos.z() * meanPos.x() * meanPos.y() + tmp), ::alpaka::hierarchy::Threads{});
-        atomicAdd(&(mem[0][1][0].z()), rho_dtZ * (deltaPos.z() * (float_X(1.0) - meanPos.x()) * meanPos.y() - tmp), ::alpaka::hierarchy::Threads{});
-        atomicAdd(&(mem[0][0][1].z()), rho_dtZ * (deltaPos.z() * meanPos.x() * (float_X(1.0) - meanPos.y()) - tmp), ::alpaka::hierarchy::Threads{});
-        atomicAdd(&(mem[0][0][0].z()), rho_dtZ * (deltaPos.z() * (float_X(1.0) - meanPos.x()) * (float_X(1.0) - meanPos.y()) + tmp), ::alpaka::hierarchy::Threads{});
-
-    }
-
-    //calculates the intersection point of the [pos1,pos2] beam with an y,z-plane at position x0
-
-    DINLINE float3_X intersectXPlane(const float3_X& pos1, const float3_X& pos2, const float_X x0)
-    {
-        const float_X t = (x0 - pos1.x()) / (pos2.x() - pos1.x());
-
-        return float3_X(x0, pos1.y() + t * (pos2.y() - pos1.y()), pos1.z() + t * (pos2.z() - pos1.z()));
-    }
-
-    DINLINE float3_X intersectYPlane(const float3_X& pos1, const float3_X& pos2, const float_X y0)
-    {
-        const float_X t = (y0 - pos1.y()) / (pos2.y() - pos1.y());
-
-        return float3_X(pos1.x() + t * (pos2.x() - pos1.x()), y0, pos1.z() + t * (pos2.z() - pos1.z()));
-    }
-
-    DINLINE float3_X intersectZPlane(const float3_X& pos1, const float3_X& pos2, const float_X z0)
-    {
-        const float_X t = (z0 - pos1.z()) / (pos2.z() - pos1.z());
-
-        return float3_X(pos1.x() + t * (pos2.x() - pos1.x()), pos1.y() + t * (pos2.y() - pos1.y()), z0);
-    }
-
-    //Splits the [oldPos,newPos] beam into two beams at the z-boundary of the cell
-    //if necessary
-
-    template<
-        typename Buffer,
-        typename T_Acc
-    >
-    DINLINE void addCurrentSplitZ(
-        T_Acc const & acc,
-        const float3_X &oldPos,
-        const float3_X &newPos,
-        const float_X charge,
-        Buffer & mem,
-        const float_X deltaTime
-    )
-    {
-
-        if (math::float2int_rd(oldPos.z()) != math::float2int_rd(newPos.z()))
-        {
-            const float3_X interPos = intersectZPlane(oldPos, newPos,
-                                                      math::max(math::float2int_rd(oldPos.z()), math::float2int_rd(newPos.z())));
-            float3_X deltaPos = interPos - oldPos;
-            float3_X meanPos = oldPos + float_X(0.5) * deltaPos;
-            addCurrentToSingleCell(acc, meanPos, deltaPos, charge, mem, deltaTime);
-
-            deltaPos = newPos - interPos;
-            meanPos = interPos + float_X(0.5) * deltaPos;
-            addCurrentToSingleCell(acc, meanPos, deltaPos, charge, mem, deltaTime);
-            return;
-        }
-        const float3_X deltaPos = newPos - oldPos;
-        const float3_X meanPos = oldPos + float_X(0.5) * deltaPos;
-        addCurrentToSingleCell(acc, meanPos, deltaPos, charge, mem, deltaTime);
-    }
-
-    //Splits the [oldPos,newPos] beam into two beams at the y-boundary of the cell
-    //if necessary
-
-    template<
-        typename Buffer,
-        typename T_Acc
-    >
-    DINLINE void addCurrentSplitY(
-        T_Acc const & acc,
-        const float3_X& oldPos,
-        const float3_X& newPos,
-        const float_X charge,
-        Buffer & mem,
-        const float_X deltaTime
-    )
+            template<class BoxJ, typename PosType, typename VelType, typename ChargeType, typename T_Acc>
+            DINLINE void operator()(
+                const T_Acc& acc,
+                BoxJ& boxJ_par, /* box which is shifted to the cell of the particle */
+                const PosType pos,
+                const VelType velocity,
+                const ChargeType charge,
+                const float_X deltaTime)
+            {
+                /* VillaBune: compile-time check, the solver _requires_ the CIC particle shape */
+                PMACC_CASSERT_MSG_TYPE(
+                    currentSolverVillaBune_requires_shapeCIC_in_particleConfig,
+                    T_ParticleShape,
+                    std::is_same<T_ParticleShape, particles::shapes::CIC>::value);
+
+                // normalize deltaPos to innerCell units [0.; 1.)
+                //   that means: dx_real   = v.x() * dt
+                //               dx_inCell = v.x() * dt / cellSize.x()
+                const float3_X deltaPos(
+                    velocity.x() * deltaTime / cellSize.x(),
+                    velocity.y() * deltaTime / cellSize.y(),
+                    velocity.z() * deltaTime / cellSize.z());
+                // position before the move: pos is treated as the new position, deltaPos as the displacement
+                const PosType oldPos = (PosType)(precisionCast<float_X>(pos) - deltaPos);
+                // deposit the current of the path oldPos -> pos; splitting starts with the x-direction
+                addCurrentSplitX(acc, oldPos, pos, charge, boxJ_par, deltaTime);
+            }
+
+            static pmacc::traits::StringProperty getStringProperties()
+            {
+                pmacc::traits::StringProperty propList("name", "VillaBune"); // identifies this solver by name
+                return propList;
+            }
+
+        private:
+            // Splits the [oldPos,newPos] beam into two beams at the x-boundary of the cell
+            // if necessary, i.e. if float2int_rd of both x-coordinates differs.
+            // Each resulting beam is forwarded to addCurrentSplitY.
+            template<typename Buffer, typename T_Acc>
+            DINLINE void addCurrentSplitX(
+                T_Acc const& acc,
+                const float3_X& oldPos,
+                const float3_X& newPos,
+                const float_X charge,
+                Buffer& mem,
+                const float_X deltaTime)
+            {
+                if(pmacc::math::float2int_rd(oldPos.x()) != pmacc::math::float2int_rd(newPos.x()))
+                {
+                    const float3_X interPos = intersectXPlane(
+                        oldPos,
+                        newPos,
+                        math::max(pmacc::math::float2int_rd(oldPos.x()), pmacc::math::float2int_rd(newPos.x())));
+                    addCurrentSplitY(acc, oldPos, interPos, charge, mem, deltaTime);
+                    addCurrentSplitY(acc, interPos, newPos, charge, mem, deltaTime);
+                    return;
+                }
+                addCurrentSplitY(acc, oldPos, newPos, charge, mem, deltaTime);
+            }
+
+            template<typename Buffer, typename T_Acc>
+            DINLINE void addCurrentToSingleCell(
+                T_Acc const& acc,
+                float3_X meanPos,
+                const float3_X& deltaPos,
+                const float_X charge,
+                Buffer& memIn,
+                const float_X deltaTime)
+            {
+                // shift to the cell meanPos belongs to
+                // because meanPos may exceed the range [0,1)
+                DataSpace<DIM3> off(
+                    pmacc::math::float2int_rd(meanPos.x()),
+                    pmacc::math::float2int_rd(meanPos.y()),
+                    pmacc::math::float2int_rd(meanPos.z()));
+
+                auto mem = memIn.shift(off);
+
+                // fit meanPos into the range [0,1)
+                meanPos.x() -= math::floor(meanPos.x());
+                meanPos.y() -= math::floor(meanPos.y());
+                meanPos.z() -= math::floor(meanPos.z());
+
+                // for the formulas used in here see Villasenor/Buneman paper page 314
+                const float_X tmp = deltaPos.x() * deltaPos.y() * deltaPos.z() * (float_X(1.0) / float_X(12.0));
+
+                // j = rho * v
+                //   = rho * dr / dt
+                // const float_X rho = charge * (1.0 / (CELL_WIDTH * CELL_HEIGHT * CELL_DEPTH));
+                // const float_X rho_dt = rho * (1.0 / deltaTime);
+
+                // now carefully:
+                // deltaPos is in "inCell" coordinates, that means:
+                //   deltaPos.x() = deltaPos_real.x() / cellSize.x()
+                // to calculate the current density in realUnits it is
+                //   j.x() = rho * deltaPos_real.x() / dt
+                //       = rho * deltaPos.x() * cellSize.x() / dt
+                // So folding the constant directly into rho results in:
+                //   const float_X rho_dtX = rho * CELL_WIDTH;
+                //   const float_X rho_dtY = rho * CELL_HEIGHT;
+                //   const float_X rho_dtZ = rho * CELL_DEPTH;
+
+                // This is exactly the same as:
+                // j = Q / A / t
+                //   j.x() = Q.x() * (1.0 / (CELL_HEIGHT * CELL_DEPTH * deltaTime));
+                //   j.y() = Q.y() * (1.0 / (CELL_WIDTH * CELL_DEPTH * deltaTime));
+                //   j.z() = Q.z() * (1.0 / (CELL_WIDTH * CELL_HEIGHT * deltaTime));
+                // with the difference that (imagine a moving cuboid)
+                //   Q.x() = charge * deltaPos_real.x() / cellsize.x()
+                //       = charge * deltaPos.x() / 1.0
+                //
+                const float_X rho_dtX = charge * (float_X(1.0) / (CELL_HEIGHT * CELL_DEPTH * deltaTime));
+                const float_X rho_dtY = charge * (float_X(1.0) / (CELL_WIDTH * CELL_DEPTH * deltaTime));
+                const float_X rho_dtZ = charge * (float_X(1.0) / (CELL_WIDTH * CELL_HEIGHT * deltaTime));
+                // the functor used for all twelve additions below is provided by the strategy
+                auto const atomicOp = typename T_Strategy::BlockReductionOp{};
+                // x-component of the current: four contributions
+                atomicOp(acc, mem[1][1][0].x(), rho_dtX * (deltaPos.x() * meanPos.y() * meanPos.z() + tmp));
+                atomicOp(
+                    acc,
+                    mem[1][0][0].x(),
+                    rho_dtX * (deltaPos.x() * (float_X(1.0) - meanPos.y()) * meanPos.z() - tmp));
+                atomicOp(
+                    acc,
+                    mem[0][1][0].x(),
+                    rho_dtX * (deltaPos.x() * meanPos.y() * (float_X(1.0) - meanPos.z()) - tmp));
+                atomicOp(
+                    acc,
+                    mem[0][0][0].x(),
+                    rho_dtX * (deltaPos.x() * (float_X(1.0) - meanPos.y()) * (float_X(1.0) - meanPos.z()) + tmp));
+                // y-component of the current: four contributions
+                atomicOp(acc, mem[1][0][1].y(), rho_dtY * (deltaPos.y() * meanPos.z() * meanPos.x() + tmp));
+                atomicOp(
+                    acc,
+                    mem[0][0][1].y(),
+                    rho_dtY * (deltaPos.y() * (float_X(1.0) - meanPos.z()) * meanPos.x() - tmp));
+                atomicOp(
+                    acc,
+                    mem[1][0][0].y(),
+                    rho_dtY * (deltaPos.y() * meanPos.z() * (float_X(1.0) - meanPos.x()) - tmp));
+                atomicOp(
+                    acc,
+                    mem[0][0][0].y(),
+                    rho_dtY * (deltaPos.y() * (float_X(1.0) - meanPos.z()) * (float_X(1.0) - meanPos.x()) + tmp));
+                // z-component of the current: four contributions
+                atomicOp(acc, mem[0][1][1].z(), rho_dtZ * (deltaPos.z() * meanPos.x() * meanPos.y() + tmp));
+                atomicOp(
+                    acc,
+                    mem[0][1][0].z(),
+                    rho_dtZ * (deltaPos.z() * (float_X(1.0) - meanPos.x()) * meanPos.y() - tmp));
+                atomicOp(
+                    acc,
+                    mem[0][0][1].z(),
+                    rho_dtZ * (deltaPos.z() * meanPos.x() * (float_X(1.0) - meanPos.y()) - tmp));
+                atomicOp(
+                    acc,
+                    mem[0][0][0].z(),
+                    rho_dtZ * (deltaPos.z() * (float_X(1.0) - meanPos.x()) * (float_X(1.0) - meanPos.y()) + tmp));
+            }
+
+            // calculates the intersection point of the [pos1,pos2] beam with a y,z-plane at position x0
+            // NOTE: divides by (pos2.x() - pos1.x()), so the x-coordinates of pos1 and pos2 must differ
+            DINLINE float3_X intersectXPlane(const float3_X& pos1, const float3_X& pos2, const float_X x0)
+            {
+                const float_X t = (x0 - pos1.x()) / (pos2.x() - pos1.x());
+                // t is the line parameter of the intersection; y and z are interpolated linearly
+                return float3_X(x0, pos1.y() + t * (pos2.y() - pos1.y()), pos1.z() + t * (pos2.z() - pos1.z()));
+            }
+
+            DINLINE float3_X intersectYPlane(const float3_X& pos1, const float3_X& pos2, const float_X y0)
+            {
+                const float_X t = (y0 - pos1.y()) / (pos2.y() - pos1.y());
+                // t is the line parameter where y == y0 (needs pos1.y() != pos2.y()); x and z are interpolated
+                return float3_X(pos1.x() + t * (pos2.x() - pos1.x()), y0, pos1.z() + t * (pos2.z() - pos1.z()));
+            }
+
+            DINLINE float3_X intersectZPlane(const float3_X& pos1, const float3_X& pos2, const float_X z0)
+            {
+                const float_X t = (z0 - pos1.z()) / (pos2.z() - pos1.z());
+                // t is the line parameter where z == z0 (needs pos1.z() != pos2.z()); x and y are interpolated
+                return float3_X(pos1.x() + t * (pos2.x() - pos1.x()), pos1.y() + t * (pos2.y() - pos1.y()), z0);
+            }
+
+            // Splits the [oldPos,newPos] beam into two beams at the z-boundary of the cell
+            // if necessary, i.e. if float2int_rd of both z-coordinates differs.
+            // Each resulting beam is deposited via addCurrentToSingleCell using its midpoint and displacement.
+            template<typename Buffer, typename T_Acc>
+            DINLINE void addCurrentSplitZ(
+                T_Acc const& acc,
+                const float3_X& oldPos,
+                const float3_X& newPos,
+                const float_X charge,
+                Buffer& mem,
+                const float_X deltaTime)
+            {
+                if(pmacc::math::float2int_rd(oldPos.z()) != pmacc::math::float2int_rd(newPos.z()))
+                {
+                    const float3_X interPos = intersectZPlane(
+                        oldPos,
+                        newPos,
+                        math::max(pmacc::math::float2int_rd(oldPos.z()), pmacc::math::float2int_rd(newPos.z())));
+                    float3_X deltaPos = interPos - oldPos;
+                    float3_X meanPos = oldPos + float_X(0.5) * deltaPos;
+                    addCurrentToSingleCell(acc, meanPos, deltaPos, charge, mem, deltaTime);
+                    // second part of the beam: from the intersection point to newPos
+                    deltaPos = newPos - interPos;
+                    meanPos = interPos + float_X(0.5) * deltaPos;
+                    addCurrentToSingleCell(acc, meanPos, deltaPos, charge, mem, deltaTime);
+                    return;
+                }
+                const float3_X deltaPos = newPos - oldPos;
+                const float3_X meanPos = oldPos + float_X(0.5) * deltaPos;
+                addCurrentToSingleCell(acc, meanPos, deltaPos, charge, mem, deltaTime);
+            }
+
+            // Splits the [oldPos,newPos] beam into two beams at the y-boundary of the cell
+            // if necessary, i.e. if float2int_rd of both y-coordinates differs.
+            // Each resulting beam is forwarded to addCurrentSplitZ.
+            template<typename Buffer, typename T_Acc>
+            DINLINE void addCurrentSplitY(
+                T_Acc const& acc,
+                const float3_X& oldPos,
+                const float3_X& newPos,
+                const float_X charge,
+                Buffer& mem,
+                const float_X deltaTime)
+            {
+                if(pmacc::math::float2int_rd(oldPos.y()) != pmacc::math::float2int_rd(newPos.y()))
+                {
+                    const float3_X interPos = intersectYPlane(
+                        oldPos,
+                        newPos,
+                        math::max(pmacc::math::float2int_rd(oldPos.y()), pmacc::math::float2int_rd(newPos.y())));
+                    addCurrentSplitZ(acc, oldPos, interPos, charge, mem, deltaTime);
+                    addCurrentSplitZ(acc, interPos, newPos, charge, mem, deltaTime);
+                    return;
+                }
+                addCurrentSplitZ(acc, oldPos, newPos, charge, mem, deltaTime);
+            }
+        };
+
+    } // namespace currentSolver
+
+    namespace traits
     {
-
-        if (math::float2int_rd(oldPos.y()) != math::float2int_rd(newPos.y()))
+        template<typename T_ParticleShape, typename T_Strategy>
+        struct GetMargin<picongpu::currentSolver::VillaBune<T_ParticleShape, T_Strategy>>
         {
-            const float3_X interPos = intersectYPlane(oldPos, newPos,
-                                                      math::max(math::float2int_rd(oldPos.y()), math::float2int_rd(newPos.y())));
-            addCurrentSplitZ(acc, oldPos, interPos, charge, mem, deltaTime);
-            addCurrentSplitZ(acc, interPos, newPos, charge, mem, deltaTime);
-            return;
-        }
-        addCurrentSplitZ(acc, oldPos, newPos, charge, mem, deltaTime);
-    }
-
-};
-
-} //namespace currentSolver
-
-namespace traits
-{
-
-template<typename T_ParticleShape>
-struct GetMargin<picongpu::currentSolver::VillaBune<T_ParticleShape> >
-{
-    typedef ::pmacc::math::CT::Int < 1, 1, 1 > LowerMargin;
-    typedef ::pmacc::math::CT::Int < 2, 2, 2 > UpperMargin;
-
-    /** maximum margin size of LowerMargin and UpperMargin */
-    static constexpr int maxMargin = 2;
-
-    PMACC_CASSERT_MSG(
-        __VillaBune_supercell_or_number_of_guard_supercells_is_too_small_for_stencil,
-        pmacc::math::CT::min<
-            typename pmacc::math::CT::mul<
-                SuperCellSize,
-                GuardSize
-            >::type
-        >::type::value >= maxMargin
-    );
-};
-
-} //namespace traits
+            using LowerMargin = ::pmacc::math::CT::Int<1, 1, 1>;
+            using UpperMargin = ::pmacc::math::CT::Int<2, 2, 2>;
 
-} //namespace picongpu
+            /** maximum margin size of LowerMargin and UpperMargin */
+            static constexpr int maxMargin = 2;
 
+            PMACC_CASSERT_MSG(
+                __VillaBune_supercell_or_number_of_guard_supercells_is_too_small_for_stencil,
+                pmacc::math::CT::min<typename pmacc::math::CT::mul<SuperCellSize, GuardSize>::type>::type::value
+                    >= maxMargin);
+        };
 
+    } // namespace traits
 
+} // namespace picongpu
diff --git a/include/picongpu/fields/currentInterpolation/Binomial/Binomial.def b/include/picongpu/fields/currentInterpolation/Binomial/Binomial.def
index aacb2bce75..c4d927165d 100644
--- a/include/picongpu/fields/currentInterpolation/Binomial/Binomial.def
+++ b/include/picongpu/fields/currentInterpolation/Binomial/Binomial.def
@@ -1,4 +1,4 @@
-/* Copyright 2015-2020 Axel Huebl
+/* Copyright 2015-2021 Axel Huebl
  *
  * This file is part of PIConGPU.
  *
@@ -22,15 +22,14 @@
 
 namespace picongpu
 {
-namespace currentInterpolation
-{
-
-    /** 2nd order Binomial filter
-     *
-     * Smooths the current before assignment in staggered grid.
-     * Updates E & breaks local charge conservation slightly.
-     */
-    struct Binomial;
+    namespace currentInterpolation
+    {
+        /** 2nd order Binomial filter functor
+         *
+         * Smooths the current before assignment in staggered grid.
+         * Updates E & breaks local charge conservation slightly.
+         */
+        struct Binomial;
 
-} // namespace currentInterpolation
+    } // namespace currentInterpolation
 } // namespace picongpu
diff --git a/include/picongpu/fields/currentInterpolation/Binomial/Binomial.hpp b/include/picongpu/fields/currentInterpolation/Binomial/Binomial.hpp
index 922822de9e..5202bfbfe0 100644
--- a/include/picongpu/fields/currentInterpolation/Binomial/Binomial.hpp
+++ b/include/picongpu/fields/currentInterpolation/Binomial/Binomial.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2015-2020 Axel Huebl, Benjamin Worpitz, Klaus Steiniger
+/* Copyright 2015-2021 Axel Huebl, Benjamin Worpitz, Klaus Steiniger
  *
  * This file is part of PIConGPU.
  *
@@ -27,201 +27,164 @@
 
 namespace picongpu
 {
-namespace currentInterpolation
-{
-namespace detail
-{
-
-    template< uint32_t T_dim >
-    struct Binomial;
-
-
-    //! Specialization for 3D
-    template< >
-    struct Binomial< DIM3 >
+    namespace currentInterpolation
     {
-        static constexpr uint32_t dim = DIM3;
-
-        using LowerMargin = typename pmacc::math::CT::make_Int<
-            dim,
-            1
-        >::type ;
-        using UpperMargin = LowerMargin;
-
-        template<
-            typename T_DataBoxE,
-            typename T_DataBoxB,
-            typename T_DataBoxJ
-        >
-        HDINLINE void operator()(
-            T_DataBoxE fieldE,
-            T_DataBoxB const,
-            T_DataBoxJ const fieldJ
-        )
+        namespace detail
         {
-            using TypeJ = typename T_DataBoxJ::ValueType;
-            using DS = DataSpace< dim >;
-
-            // weighting for original value, i.e. center element of a cell
-            constexpr float_X M = 8.0;
-            // weighting for nearest neighbours, i.e. cells sharing a face with the center cell
-            constexpr float_X S = 4.0;
-            // weighting for next to nearest neighbours, i.e. cells sharing an edge with the center cell
-            constexpr float_X D = 2.0;
-            // weighting for farthest neighbours, i.e. cells sharing a corner with the center cell
-            constexpr float_X T = 1.0;
-
-            TypeJ averagedJ =
-                // sum far neighbours, i.e. corner elements, weighting T
-                T * (
-                    fieldJ( DS( -1, -1, -1 ) ) + fieldJ( DS( +1, -1, -1 ) ) + fieldJ( DS( -1, +1, -1 ) ) + fieldJ( DS( +1, +1, -1 ) ) +
-                    fieldJ( DS( -1, -1, +1 ) ) + fieldJ( DS( +1, -1, +1 ) ) + fieldJ( DS( -1, +1, +1 ) ) + fieldJ( DS( +1, +1, +1 ) )
-                ) +
-                // sum next to nearest neighbours, i.e. edge elements, weighting D
-                D * (
-                    fieldJ( DS( -1, -1, 0 ) ) + fieldJ( DS( +1, -1, 0 ) ) + fieldJ( DS( -1, +1, 0 ) ) + fieldJ( DS( +1, +1, 0 ) ) +
-                    fieldJ( DS( -1, 0, -1 ) ) + fieldJ( DS( +1, 0, -1 ) ) + fieldJ( DS( -1, 0, +1 ) ) + fieldJ( DS( +1, 0, +1 ) ) +
-                    fieldJ( DS( 0, -1, -1 ) ) + fieldJ( DS( 0, +1, -1 ) ) + fieldJ( DS( 0, -1, +1 ) ) + fieldJ( DS( 0, +1, +1 ) )
-                ) +
-                // sum next neighbours, i.e. face elements, weighting S
-                S * (
-                    fieldJ( DS( -1, 0, 0 ) ) + fieldJ( DS( +1, 0, 0 ) ) +
-                    fieldJ( DS( 0, -1, 0 ) ) + fieldJ( DS( 0, +1, 0 ) ) +
-                    fieldJ( DS( 0, 0, -1 ) ) + fieldJ( DS( 0, 0, +1 ) )
-                ) +
-                // add original value, i.e. center element, weighting M
-                M * (
-                    fieldJ( DS( 0, 0, 0 ) )
-                );
-
-            /* calc average by normalizing weighted sum In 3D there are:
-             *   - original value with weighting M
-             *   - 6 nearest neighbours with weighting S
-             *   - 12 next to nearest neighbours with weighting D
-             *   - 8 farthest neighbours with weighting T
-             */
-            constexpr float_X inverseDivisor = 1._X / ( M + 6._X * S + 12._X * D + 8._X * T );
-            averagedJ *= inverseDivisor;
-
-            constexpr float_X deltaT = DELTA_T;
-            *fieldE -= averagedJ * ( 1._X / EPS0 ) * deltaT;
-        }
-    };
-
-
-    //! Specialization for 2D
-    template< >
-    struct Binomial< DIM2 >
-    {
-        static constexpr uint32_t dim = DIM2;
-
-        using LowerMargin = typename pmacc::math::CT::make_Int<
-            dim,
-            1
-        >::type ;
-        using UpperMargin = LowerMargin;
-
-        template<
-            typename T_DataBoxE,
-            typename T_DataBoxB,
-            typename T_DataBoxJ
-        >
-        HDINLINE void operator()(
-            T_DataBoxE fieldE,
-            T_DataBoxB const,
-            T_DataBoxJ const fieldJ
-        )
+            template<uint32_t T_dim>
+            struct Binomial;
+
+            //! Specialization for 3D: 3x3x3 stencil with weights 8 (center), 4 (face), 2 (edge), 1 (corner)
+            template<>
+            struct Binomial<DIM3>
+            {
+                static constexpr uint32_t dim = DIM3;
+
+                using LowerMargin = typename pmacc::math::CT::make_Int<dim, 1>::type;
+                using UpperMargin = LowerMargin;
+                //! Update E from the binomially smoothed J; the B data box parameter is unnamed and unused
+                template<typename T_DataBoxE, typename T_DataBoxB, typename T_DataBoxJ>
+                HDINLINE void operator()(T_DataBoxE fieldE, T_DataBoxB const, T_DataBoxJ const fieldJ)
+                {
+                    using TypeJ = typename T_DataBoxJ::ValueType;
+                    using DS = DataSpace<dim>;
+
+                    // weighting for original value, i.e. center element of a cell
+                    constexpr float_X M = 8.0;
+                    // weighting for nearest neighbours, i.e. cells sharing a face with the center cell
+                    constexpr float_X S = 4.0;
+                    // weighting for next to nearest neighbours, i.e. cells sharing an edge with the center cell
+                    constexpr float_X D = 2.0;
+                    // weighting for farthest neighbours, i.e. cells sharing a corner with the center cell
+                    constexpr float_X T = 1.0;
+
+                    TypeJ averagedJ =
+                        // sum far neighbours, i.e. corner elements, weighting T
+                        T
+                            * (fieldJ(DS(-1, -1, -1)) + fieldJ(DS(+1, -1, -1)) + fieldJ(DS(-1, +1, -1))
+                               + fieldJ(DS(+1, +1, -1)) + fieldJ(DS(-1, -1, +1)) + fieldJ(DS(+1, -1, +1))
+                               + fieldJ(DS(-1, +1, +1)) + fieldJ(DS(+1, +1, +1)))
+                        +
+                        // sum next to nearest neighbours, i.e. edge elements, weighting D
+                        D
+                            * (fieldJ(DS(-1, -1, 0)) + fieldJ(DS(+1, -1, 0)) + fieldJ(DS(-1, +1, 0))
+                               + fieldJ(DS(+1, +1, 0)) + fieldJ(DS(-1, 0, -1)) + fieldJ(DS(+1, 0, -1))
+                               + fieldJ(DS(-1, 0, +1)) + fieldJ(DS(+1, 0, +1)) + fieldJ(DS(0, -1, -1))
+                               + fieldJ(DS(0, +1, -1)) + fieldJ(DS(0, -1, +1)) + fieldJ(DS(0, +1, +1)))
+                        +
+                        // sum next neighbours, i.e. face elements, weighting S
+                        S
+                            * (fieldJ(DS(-1, 0, 0)) + fieldJ(DS(+1, 0, 0)) + fieldJ(DS(0, -1, 0))
+                               + fieldJ(DS(0, +1, 0)) + fieldJ(DS(0, 0, -1)) + fieldJ(DS(0, 0, +1)))
+                        +
+                        // add original value, i.e. center element, weighting M
+                        M * (fieldJ(DS(0, 0, 0)));
+
+                    /* calc average by normalizing the weighted sum (total weight 64). In 3D there are:
+                     *   - original value with weighting M
+                     *   - 6 nearest neighbours with weighting S
+                     *   - 12 next to nearest neighbours with weighting D
+                     *   - 8 farthest neighbours with weighting T
+                     */
+                    constexpr float_X inverseDivisor = 1._X / (M + 6._X * S + 12._X * D + 8._X * T);
+                    averagedJ *= inverseDivisor;
+                    // subtract averagedJ * deltaT / EPS0 from the E value obtained by dereferencing fieldE
+                    constexpr float_X deltaT = DELTA_T;
+                    *fieldE -= averagedJ * (1._X / EPS0) * deltaT;
+                }
+            };
+
+
+            //! Specialization for 2D: 3x3 stencil with weights 4 (center), 2 (edge neighbours), 1 (corner neighbours)
+            template<>
+            struct Binomial<DIM2>
+            {
+                static constexpr uint32_t dim = DIM2;
+
+                using LowerMargin = typename pmacc::math::CT::make_Int<dim, 1>::type;
+                using UpperMargin = LowerMargin;
+                //! Update E from the binomially smoothed J; the B data box parameter is unnamed and unused
+                template<typename T_DataBoxE, typename T_DataBoxB, typename T_DataBoxJ>
+                HDINLINE void operator()(T_DataBoxE fieldE, T_DataBoxB const, T_DataBoxJ const fieldJ)
+                {
+                    using TypeJ = typename T_DataBoxJ::ValueType;
+                    using DS = DataSpace<dim>;
+
+                    // weighting for original value, i.e. center element of a cell
+                    constexpr float_X M = 4.0;
+                    // weighting for nearest neighbours, i.e. cells sharing an edge with the center cell
+                    constexpr float_X S = 2.0;
+                    // weighting for next to nearest neighbours, i.e. cells sharing a corner with the center cell
+                    constexpr float_X D = 1.0;
+
+                    TypeJ averagedJ =
+                        // sum next to nearest neighbours, i.e. corner neighbors, weighting D
+                        D * (fieldJ(DS(-1, -1)) + fieldJ(DS(+1, -1)) + fieldJ(DS(-1, +1)) + fieldJ(DS(+1, +1))) +
+                        // sum next neighbours, i.e. edge neighbors, weighting S
+                        S * (fieldJ(DS(-1, 0)) + fieldJ(DS(+1, 0)) + fieldJ(DS(0, -1)) + fieldJ(DS(0, +1))) +
+                        // add original value, i.e. center cell, weighting M
+                        M * (fieldJ(DS(0, 0)));
+
+                    /* calc average by normalizing the weighted sum (total weight 16).
+                     * In 2D there are:
+                     *    - original value with weighting M
+                     *    - 4 nearest neighbours with weighting S
+                     *    - 4 next to nearest neighbours with weighting D
+                     */
+                    constexpr float_X inverseDivisor = 1._X / (M + 4._X * S + 4._X * D);
+                    averagedJ *= inverseDivisor;
+                    // subtract averagedJ * deltaT / EPS0 from the E value obtained by dereferencing fieldE
+                    constexpr float_X deltaT = DELTA_T;
+                    *fieldE -= averagedJ * (1._X / EPS0) * deltaT;
+                }
+            };
+
+        } // namespace detail
+
+
+        /** Smoothing the current density before passing it to the field solver
+         *
+         * This technique mitigates numerical Cherenkov effects and short wavelength
+         * instabilities as it effectively implements a low pass filter which
+         * damps high frequency noise (near the Nyquist frequency) in the
+         * current distribution.
+         *
+         * A description and a two-dimensional implementation of this filter
+         * is given in
+         * CK Birdsall, AB Langdon. Plasma Physics via Computer Simulation. Appendix C. Taylor & Francis, 2004.
+         * It is a 2D version of the commonly used one-dimensional three-point filter with binomial coefficients.
+         *
+         * The three-dimensional extension of the above two-dimensional smoothing scheme
+         * uses all 26 neighbors of a cell.
+         */
+        struct Binomial : public detail::Binomial<simDim>
         {
-            using TypeJ = typename T_DataBoxJ::ValueType;
-            using DS = DataSpace< dim >;
-
-            // weighting for original value, i.e. center element of a cell
-            constexpr float_X M = 4.0;
-            // weighting for nearest neighbours, i.e. cells sharing an edge with the center cell
-            constexpr float_X S = 2.0;
-            // weighting for next to nearest neighbours, i.e. cells sharing a corner with the center cell
-            constexpr float_X D = 1.0;
-
-            TypeJ averagedJ =
-                // sum next to nearest neighbours, i.e. corner neighbors, weighting D
-                D * (
-                    fieldJ( DS( -1, -1 ) ) + fieldJ( DS( +1, -1 ) ) +
-                    fieldJ( DS( -1, +1 ) ) + fieldJ( DS( +1, +1 ) )
-                ) +
-                // sum next neighbours, i.e. edge neighbors, weighting S
-                S * (
-                    fieldJ( DS( -1, 0 ) ) + fieldJ( DS( +1, 0 ) ) +
-                    fieldJ( DS( 0, -1 ) ) + fieldJ( DS( 0, +1 ) )
-                ) +
-                // add original value, i.e. center cell, weighting M
-                M * (
-                    fieldJ( DS( 0, 0 ) )
-                );
-
-            /* calc average by normalizing weighted sum
-             * In 2D there are:
-             *    - original value with weighting M
-             *    - 4 nearest neighbours with weighting S
-             *    - 4 next to nearest neighbours with weighting D
-             */
-            constexpr float_X inverseDivisor = 1._X / ( M + 4._X * S + 4._X * D );
-            averagedJ *= inverseDivisor;
-
-            constexpr float_X deltaT = DELTA_T;
-            *fieldE -= averagedJ * ( 1._X / EPS0 ) * deltaT;
-        }
-    };
-
-} // namespace detail
-
-
-    /** Smoothing the current density before passing it to the field solver
-     *
-     * This technique mitigates numerical Cherenkov effects and short wavelength
-     * instabilities as it effectively implements a low pass filter which
-     * damps high frequency noise (near the Nyquist frequency) in the
-     * current distribution.
-     *
-     * A description and a two-dimensional implementation of this filter
-     * is given in
-     * CK Birdsall, AB Langdon. Plasma Physics via Computer Simulation. Appendix C. Taylor & Francis, 2004.
-     * It is a 2D version of the commonly used one-dimensional three points filter with binomial coefficients
-     *
-     * The three-dimensional extension of the above two-dimensional smoothing scheme
-     * uses all 26 neighbors of a cell.
-     */
-    struct Binomial : public detail::Binomial< simDim >
-    {
-        static pmacc::traits::StringProperty getStringProperties()
-        {
-            pmacc::traits::StringProperty propList(
-                "name",
-                "Binomial"
-            );
-            propList[ "param" ] = "period=1;numPasses=1;compensator=false";
-            return propList;
-        }
-    };
-
-} // namespace currentInterpolation
-
-namespace traits
-{
+            static pmacc::traits::StringProperty getStringProperties()
+            {
+                pmacc::traits::StringProperty propList("name", "Binomial");
+                propList["param"] = "period=1;numPasses=1;compensator=false";
+                return propList;
+            }
+        };
 
-    /* Get margin of the current interpolation
-     *
-     * This class defines a LowerMargin and an UpperMargin.
-     */
-    template< >
-    struct GetMargin< picongpu::currentInterpolation::Binomial >
+    } // namespace currentInterpolation
+
+    namespace traits
     {
-    private:
-        using MyInterpolation = picongpu::currentInterpolation::Binomial;
+        /** Get margin of the current interpolation
+         *
+         * This class defines a LowerMargin and an UpperMargin.
+         */
+        template<>
+        struct GetMargin<picongpu::currentInterpolation::Binomial>
+        {
+        private:
+            using MyInterpolation = picongpu::currentInterpolation::Binomial;
 
-    public:
-        using LowerMargin = typename MyInterpolation::LowerMargin;
-        using UpperMargin = typename MyInterpolation::UpperMargin;
-    };
+        public:
+            using LowerMargin = typename MyInterpolation::LowerMargin;
+            using UpperMargin = typename MyInterpolation::UpperMargin;
+        };
 
-} // namespace traits
+    } // namespace traits
 } // namespace picongpu
diff --git a/include/picongpu/fields/currentInterpolation/CurrentInterpolation.def b/include/picongpu/fields/currentInterpolation/CurrentInterpolation.def
index c91dcf199e..3bfcb0debf 100644
--- a/include/picongpu/fields/currentInterpolation/CurrentInterpolation.def
+++ b/include/picongpu/fields/currentInterpolation/CurrentInterpolation.def
@@ -1,4 +1,4 @@
-/* Copyright 2015-2020 Axel Huebl
+/* Copyright 2015-2021 Axel Huebl
  *
  * This file is part of PIConGPU.
  *
@@ -20,4 +20,3 @@
 
 #include "picongpu/fields/currentInterpolation/None/None.def"
 #include "picongpu/fields/currentInterpolation/Binomial/Binomial.def"
-#include "picongpu/fields/currentInterpolation/NoneDS/NoneDS.def"
diff --git a/include/picongpu/fields/currentInterpolation/CurrentInterpolation.hpp b/include/picongpu/fields/currentInterpolation/CurrentInterpolation.hpp
index ef4f76ab80..7812888cab 100644
--- a/include/picongpu/fields/currentInterpolation/CurrentInterpolation.hpp
+++ b/include/picongpu/fields/currentInterpolation/CurrentInterpolation.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2015-2020 Axel Huebl
+/* Copyright 2015-2021 Axel Huebl, Sergei Bastrakov
  *
  * This file is part of PIConGPU.
  *
@@ -17,7 +17,77 @@
  * If not, see <http://www.gnu.org/licenses/>.
  */
 
+#pragma once
 
 #include "picongpu/fields/currentInterpolation/None/None.hpp"
 #include "picongpu/fields/currentInterpolation/Binomial/Binomial.hpp"
-#include "picongpu/fields/currentInterpolation/NoneDS/NoneDS.hpp"
+
+#include <pmacc/math/Vector.hpp>
+#include <pmacc/traits/GetStringProperties.hpp>
+
+
+namespace picongpu
+{
+    namespace currentInterpolation
+    {
+        /** Singleton to represent current interpolation kind
+         *
+         * It does not perform interpolation itself, that is done by functors None and Binomial.
+         * Provides run-time utilities to get margin values and string properties.
+         *
+         * Note: for now it is called CurrentInterpolationInfo to not conflict with CurrentInterpolation type alias
+         * used in standard .param files. Will be renamed to just CurrentInterpolation after a transition to a
+         * run-time parameter.
+         */
+        struct CurrentInterpolationInfo
+        {
+        public:
+            //! Supported interpolation kinds
+            enum class Kind
+            {
+                None,
+                Binomial
+            };
+
+            //! Interpolation kind used in the simulation, defaults to Kind::None
+            Kind kind = Kind::None;
+
+            //! Get the single instance of the current interpolation object (function-local static)
+            static CurrentInterpolationInfo& get()
+            {
+                static CurrentInterpolationInfo instance;
+                return instance;
+            }
+
+            //! Get string properties of the functor selected by kind: None for Kind::None, otherwise Binomial
+            static pmacc::traits::StringProperty getStringProperties()
+            {
+                return get().kind == Kind::None ? None::getStringProperties() : Binomial::getStringProperties();
+            }
+
+            //! Get the lower margin of the used interpolation functor, converted from its LowerMargin via toRT()
+            static pmacc::math::Vector<int, simDim> getLowerMargin()
+            {
+                return get().kind == Kind::None ? None::LowerMargin::toRT() : Binomial::LowerMargin::toRT();
+            }
+
+            //! Get the upper margin of the used interpolation functor, converted from its UpperMargin via toRT()
+            static pmacc::math::Vector<int, simDim> getUpperMargin()
+            {
+                return get().kind == Kind::None ? None::UpperMargin::toRT() : Binomial::UpperMargin::toRT();
+            }
+
+            //! Copy construction is forbidden
+            CurrentInterpolationInfo(CurrentInterpolationInfo const&) = delete;
+
+            //! Assignment is forbidden
+            CurrentInterpolationInfo& operator=(CurrentInterpolationInfo const&) = delete;
+
+        private:
+            CurrentInterpolationInfo() = default;
+            ~CurrentInterpolationInfo() = default;
+        };
+
+    } // namespace currentInterpolation
+
+} // namespace picongpu
diff --git a/include/picongpu/fields/currentInterpolation/None/None.def b/include/picongpu/fields/currentInterpolation/None/None.def
index 13ca3bc81d..dca886acde 100644
--- a/include/picongpu/fields/currentInterpolation/None/None.def
+++ b/include/picongpu/fields/currentInterpolation/None/None.def
@@ -1,4 +1,4 @@
-/* Copyright 2015-2020 Axel Huebl
+/* Copyright 2015-2021 Axel Huebl
  *
  * This file is part of PIConGPU.
  *
@@ -22,15 +22,14 @@
 
 namespace picongpu
 {
-namespace currentInterpolation
-{
-
-    /* None interpolated current assignment
-     *
-     * Default for staggered grids/Yee-scheme.
-     * Updates field E only.
-     */
-    struct None;
+    namespace currentInterpolation
+    {
+        /** Current assignment functor without interpolation
+         *
+         * Default for staggered grids/Yee-scheme.
+         * Updates field E only.
+         */
+        struct None;
 
-} // namespace currentInterpolation
+    } // namespace currentInterpolation
 } // namespace picongpu
diff --git a/include/picongpu/fields/currentInterpolation/None/None.hpp b/include/picongpu/fields/currentInterpolation/None/None.hpp
index b9ab590204..6254f32941 100644
--- a/include/picongpu/fields/currentInterpolation/None/None.hpp
+++ b/include/picongpu/fields/currentInterpolation/None/None.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2015-2020 Axel Huebl, Benjamin Worpitz
+/* Copyright 2015-2021 Axel Huebl, Benjamin Worpitz
  *
  * This file is part of PIConGPU.
  *
@@ -26,65 +26,49 @@
 
 namespace picongpu
 {
-namespace currentInterpolation
-{
-
-    struct None
+    namespace currentInterpolation
     {
-        static constexpr uint32_t dim = simDim;
-
-        using LowerMargin = typename pmacc::math::CT::make_Int<
-            dim,
-            0
-        >::type;
-        using UpperMargin = LowerMargin;
-
-        template<
-            typename T_DataBoxE,
-            typename T_DataBoxB,
-            typename T_DataBoxJ
-        >
-        HDINLINE void operator()(
-            T_DataBoxE fieldE,
-            T_DataBoxB const,
-            T_DataBoxJ const fieldJ
-        )
+        struct None
         {
-            DataSpace< dim > const self;
+            static constexpr uint32_t dim = simDim;
 
-            constexpr float_X deltaT = DELTA_T;
-            fieldE( self ) -= fieldJ( self ) * ( float_X( 1.0 ) / EPS0 ) * deltaT;
-        }
+            using LowerMargin = typename pmacc::math::CT::make_Int<dim, 0>::type;
+            using UpperMargin = LowerMargin;
 
-        static pmacc::traits::StringProperty getStringProperties( )
-        {
-            pmacc::traits::StringProperty propList(
-                "name",
-                "none"
-            );
-            return propList;
-        }
-    };
+            template<typename T_DataBoxE, typename T_DataBoxB, typename T_DataBoxJ>
+            HDINLINE void operator()(T_DataBoxE fieldE, T_DataBoxB const, T_DataBoxJ const fieldJ)
+            {
+                DataSpace<dim> const self;
 
-} // namespace currentInterpolation
+                constexpr float_X deltaT = DELTA_T;
+                fieldE(self) -= fieldJ(self) * (float_X(1.0) / EPS0) * deltaT;
+            }
 
-namespace traits
-{
+            static pmacc::traits::StringProperty getStringProperties()
+            {
+                pmacc::traits::StringProperty propList("name", "none");
+                return propList;
+            }
+        };
 
-    /* Get margin of the current interpolation
-     *
-     * This class defines a LowerMargin and an UpperMargin.
-     */
-    template< >
-    struct GetMargin< picongpu::currentInterpolation::None >
+    } // namespace currentInterpolation
+
+    namespace traits
     {
-    private:
-        using MyInterpolation = picongpu::currentInterpolation::None;
+        /** Get margin of the current interpolation
+         *
+         * This class defines a LowerMargin and an UpperMargin.
+         */
+        template<>
+        struct GetMargin<picongpu::currentInterpolation::None>
+        {
+        private:
+            using MyInterpolation = picongpu::currentInterpolation::None;
 
-    public:
-        using LowerMargin = typename MyInterpolation::LowerMargin;
-        using UpperMargin = typename MyInterpolation::UpperMargin;
-    };
+        public:
+            using LowerMargin = typename MyInterpolation::LowerMargin;
+            using UpperMargin = typename MyInterpolation::UpperMargin;
+        };
 
-} // namespace traits
+    } // namespace traits
 } // namespace picongpu
diff --git a/include/picongpu/fields/currentInterpolation/NoneDS/NoneDS.def b/include/picongpu/fields/currentInterpolation/NoneDS/NoneDS.def
deleted file mode 100644
index 3a7386ccfa..0000000000
--- a/include/picongpu/fields/currentInterpolation/NoneDS/NoneDS.def
+++ /dev/null
@@ -1,35 +0,0 @@
-/* Copyright 2015-2020 Axel Huebl
- *
- * This file is part of PIConGPU.
- *
- * PIConGPU is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * PIConGPU is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with PIConGPU.
- * If not, see <http://www.gnu.org/licenses/>.
- */
-
-#pragma once
-
-namespace picongpu
-{
-namespace currentInterpolation
-{
-
-    /* The standard interpolation for Directional Splitting
-     *
-     * Experimental assignment for all-centered cells used in directional splitting.
-     * Updates E & B at the same time.
-     */
-    struct NoneDS;
-
-} // namespace currentInterpolation
-} // namespace picongpu
diff --git a/include/picongpu/fields/currentInterpolation/NoneDS/NoneDS.hpp b/include/picongpu/fields/currentInterpolation/NoneDS/NoneDS.hpp
deleted file mode 100644
index 5ba1c18245..0000000000
--- a/include/picongpu/fields/currentInterpolation/NoneDS/NoneDS.hpp
+++ /dev/null
@@ -1,253 +0,0 @@
-/* Copyright 2015-2020 Axel Huebl, Benjamin Worpitz
- *
- * This file is part of PIConGPU.
- *
- * PIConGPU is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * PIConGPU is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with PIConGPU.
- * If not, see <http://www.gnu.org/licenses/>.
- */
-
-#pragma once
-
-#include "picongpu/simulation_defines.hpp"
-#include "picongpu/fields/currentInterpolation/None/None.def"
-#include "picongpu/algorithms/DifferenceToUpper.hpp"
-#include "picongpu/algorithms/LinearInterpolateWithUpper.hpp"
-#include "picongpu/fields/MaxwellSolver/Yee/Curl.hpp"
-
-#include <pmacc/traits/GetComponentsType.hpp>
-#include <pmacc/dimensions/DataSpace.hpp>
-
-
-namespace picongpu
-{
-namespace currentInterpolation
-{
-namespace detail
-{
-    template<uint32_t T_simDim, uint32_t T_plane>
-    struct LinearInterpolateComponentPlaneUpper
-    {
-        static constexpr uint32_t dim = T_simDim;
-
-        /* UpperMargin is actually 0 in direction of T_plane */
-        using LowerMargin = typename pmacc::math::CT::make_Int<
-            dim,
-            0
-        >::type;
-        using UpperMargin = typename pmacc::math::CT::make_Int<
-            dim,
-            1
-        >::type;
-
-        template<typename DataBox>
-        HDINLINE float_X operator()( DataBox const & field ) const
-        {
-            DataSpace< dim > const self;
-            DataSpace< dim > up;
-            up[(T_plane + 1) % dim] = 1;
-
-            using Avg = LinearInterpolateWithUpper< dim >;
-
-            typename Avg::template GetInterpolatedValue< ( T_plane + 2 ) % dim > const avg;
-
-            return float_X( 0.5 ) * ( avg( field )[ T_plane ] + avg( field.shift( up ) )[ T_plane ] );
-        }
-    };
-
-    /* shift a databox along a specific direction
-     *
-     * returns the identity (assume periodic symmetry) if direction is not
-     * available, such as in a 2D simulation
-     *
-     * \todo accept a full CT::Vector and shift if possible
-     * \todo call with CT::Vector of correct dimensionality that was created
-     *       with AssignIfInRange...
-     *
-     * \tparam T_simDim maximum dimensionality of the mesh
-     * \tparam T_direction (0)X (1)Y or (2)Z for the direction one wants to
-     *                     shift to
-     * \tparam isShiftAble auto-filled value that decides if this direction
-     *                     is actually non-existent == periodic
-     */
-    template<
-        uint32_t T_simDim,
-        uint32_t T_direction,
-        bool isShiftAble = ( T_direction < T_simDim )
-    >
-    struct ShiftMeIfYouCan
-    {
-        static constexpr uint32_t dim = T_simDim;
-        static constexpr uint32_t dir = T_direction;
-
-        HDINLINE ShiftMeIfYouCan()
-        {
-        }
-
-        template< typename T_DataBox >
-        HDINLINE T_DataBox operator()( T_DataBox const & dataBox ) const
-        {
-            DataSpace< dim > shift;
-            shift[ dir ] = 1;
-            return dataBox.shift( shift );
-        }
-    };
-
-    template<
-        uint32_t T_simDim,
-        uint32_t T_direction
-    >
-    struct ShiftMeIfYouCan<
-        T_simDim,
-        T_direction,
-        false
-    >
-    {
-        HDINLINE ShiftMeIfYouCan()
-        {
-        }
-
-        template< typename T_DataBox >
-        HDINLINE T_DataBox operator()( T_DataBox const & dataBox ) const
-        {
-            return dataBox;
-        }
-    };
-
-    /* that is not a "real" yee curl, but it looks a bit like it */
-    template< typename Difference >
-    struct ShiftCurl
-    {
-        using LowerMargin = typename Difference::OffsetOrigin;
-        using UpperMargin = typename Difference::OffsetEnd;
-
-        template<class DataBox >
-        HDINLINE typename DataBox::ValueType operator()( DataBox const & mem ) const
-        {
-            typename Difference::template GetDifference< 0 > const Dx;
-            typename Difference::template GetDifference< 1 > const Dy;
-            typename Difference::template GetDifference< 2 > const Dz;
-
-            ShiftMeIfYouCan<
-                simDim,
-                0
-            > const sx;
-            ShiftMeIfYouCan<
-                simDim,
-                1
-            > const sy;
-            ShiftMeIfYouCan<
-                simDim,
-                2
-            > const sz;
-
-            return float3_X(
-                Dy( sx( mem ) ).z( ) - Dz( sx( mem ) ).y( ),
-                Dz( sy( mem ) ).x( ) - Dx( sy( mem ) ).z( ),
-                Dx( sz( mem ) ).y( ) - Dy( sz( mem ) ).x( )
-            );
-        }
-    };
-} // namespace detail
-
-    struct NoneDS
-    {
-        static constexpr uint32_t dim = simDim;
-
-        typedef typename pmacc::math::CT::make_Int<dim, 0>::type LowerMargin;
-        typedef typename pmacc::math::CT::make_Int<dim, 1>::type UpperMargin;
-
-        template<
-            typename T_DataBoxE,
-            typename T_DataBoxB,
-            typename T_DataBoxJ
-        >
-        HDINLINE void operator()(
-            T_DataBoxE fieldE,
-            T_DataBoxB fieldB,
-            T_DataBoxJ const fieldJ
-        )
-        {
-            using TypeJ = typename T_DataBoxJ::ValueType;
-            using ComponentJ = typename GetComponentsType< TypeJ >::type;
-
-            DataSpace< dim > const self;
-
-            constexpr ComponentJ deltaT = DELTA_T;
-            ComponentJ const constE = ( float_X( 1.0 )  / EPS0 ) * deltaT;
-            ComponentJ const constB = ( float_X( 0.25 ) / EPS0 ) * deltaT * deltaT;
-
-            detail::LinearInterpolateComponentPlaneUpper<
-                dim,
-                0
-            > const avgX;
-            ComponentJ const jXavg = avgX( fieldJ );
-            detail::LinearInterpolateComponentPlaneUpper<
-                dim,
-                1
-            > const avgY;
-            ComponentJ const jYavg = avgY( fieldJ );
-            detail::LinearInterpolateComponentPlaneUpper<
-                dim,
-                2
-            > const avgZ;
-            ComponentJ const jZavg = avgZ( fieldJ );
-
-            TypeJ const jAvgE = TypeJ(
-                jXavg,
-                jYavg,
-                jZavg
-            );
-            fieldE( self ) -= jAvgE * constE;
-
-            using CurlRight = fields::maxwellSolver::yee::Curl< DifferenceToUpper< dim > >;
-            using ShiftCurlRight = detail::ShiftCurl< DifferenceToUpper< dim > >;
-            CurlRight curl;
-            ShiftCurlRight shiftCurl;
-
-            TypeJ const jAvgB = curl( fieldJ ) + shiftCurl( fieldJ );
-            fieldB(self) += jAvgB * constB;
-        }
-
-        static pmacc::traits::StringProperty getStringProperties()
-        {
-            pmacc::traits::StringProperty propList(
-                "name",
-                "none"
-            );
-            return propList;
-        }
-    };
-
-} // namespace currentInterpolation
-
-namespace traits
-{
-
-    /* Get margin of the current interpolation
-     *
-     * This class defines a LowerMargin and an UpperMargin.
-     */
-    template< >
-    struct GetMargin< picongpu::currentInterpolation::NoneDS >
-    {
-    private:
-        using MyInterpolation = picongpu::currentInterpolation::NoneDS;
-
-    public:
-        using LowerMargin = typename MyInterpolation::LowerMargin;
-        using UpperMargin = typename MyInterpolation::UpperMargin;
-    };
-
-} // namespace traits
-} // namespace picongpu
diff --git a/include/picongpu/fields/differentiation/BackwardDerivative.hpp b/include/picongpu/fields/differentiation/BackwardDerivative.hpp
new file mode 100644
index 0000000000..7b8b1cfe70
--- /dev/null
+++ b/include/picongpu/fields/differentiation/BackwardDerivative.hpp
@@ -0,0 +1,82 @@
+/* Copyright 2013-2021 Heiko Burau, Rene Widera, Axel Huebl, Sergei Bastrakov
+ *
+ * This file is part of PIConGPU.
+ *
+ * PIConGPU is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * PIConGPU is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with PIConGPU.
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "picongpu/simulation_defines.hpp"
+#include "picongpu/fields/differentiation/Derivative.def"
+#include "picongpu/fields/differentiation/Traits.hpp"
+
+#include <pmacc/math/Vector.hpp>
+#include <pmacc/meta/accessors/Identity.hpp>
+
+#include <cstdint>
+
+
+namespace picongpu
+{
+    namespace fields
+    {
+        namespace differentiation
+        {
+            /** Functor for backward difference derivative along the given direction
+             *
+             * Computes (current - lower) / cellSize[T_direction], previously called DifferenceToLower.
+             *
+             * @tparam T_direction direction to take derivative in, 0 = x, 1 = y, 2 = z
+             */
+            template<uint32_t T_direction>
+            struct BackwardDerivativeFunctor
+            {
+                //! Lower margin: basis vector along T_direction, since operator() reads the lower neighbor
+                using LowerMargin = typename pmacc::math::CT::make_BasisVector<simDim, T_direction, int>::type;
+
+                //! Upper margin: make_Int<simDim, 0>, since operator() reads nothing above the current point
+                using UpperMargin = typename pmacc::math::CT::make_Int<simDim, 0>::type;
+
+                /** Return the backward difference at the given point divided by the cell size
+                 *
+                 * @tparam T_DataBox data box type with field data, has to provide ValueType
+                 * @param data data box, accessed at Index{} and at -basisVector<Index, T_direction>()
+                 */
+                template<typename T_DataBox>
+                HDINLINE typename T_DataBox::ValueType operator()(T_DataBox const& data) const
+                {
+                    using Index = pmacc::DataSpace<simDim>;
+                    auto const lowerIndex = -pmacc::math::basisVector<Index, T_direction>();
+                    return (data(Index{}) - data(lowerIndex)) / cellSize[T_direction];
+                }
+            };
+
+            namespace traits
+            {
+                /** Trait specialization mapping the Backward tag to BackwardDerivativeFunctor<T_direction>
+                 *
+                 * @tparam T_direction direction to take derivative in, 0 = x, 1 = y, 2 = z
+                 */
+                template<uint32_t T_direction>
+                struct DerivativeFunctor<Backward, T_direction>
+                    : pmacc::meta::accessors::Identity<BackwardDerivativeFunctor<T_direction>>
+                {
+                };
+
+            } // namespace traits
+        } // namespace differentiation
+    } // namespace fields
+} // namespace picongpu
diff --git a/include/picongpu/fields/differentiation/Curl.def b/include/picongpu/fields/differentiation/Curl.def
new file mode 100644
index 0000000000..f49e894263
--- /dev/null
+++ b/include/picongpu/fields/differentiation/Curl.def
@@ -0,0 +1,41 @@
+/* Copyright 2013-2021 Axel Huebl, Heiko Burau, Rene Widera, Sergei Bastrakov
+ *
+ * This file is part of PIConGPU.
+ *
+ * PIConGPU is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * PIConGPU is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with PIConGPU.
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "picongpu/fields/differentiation/Derivative.def"
+
+
+namespace picongpu
+{
+    namespace fields
+    {
+        namespace differentiation
+        {
+            /** Forward declaration of the functor to compute field curl at the given point
+             *
+             * @tparam T_Derivative derivative tag (not functor), defines the
+             *                      finite-difference scheme for partial derivatives
+             */
+            template<typename T_Derivative>
+            struct Curl;
+
+        } // namespace differentiation
+    } // namespace fields
+} // namespace picongpu
diff --git a/include/picongpu/fields/differentiation/Curl.hpp b/include/picongpu/fields/differentiation/Curl.hpp
new file mode 100644
index 0000000000..8cd238558e
--- /dev/null
+++ b/include/picongpu/fields/differentiation/Curl.hpp
@@ -0,0 +1,128 @@
+/* Copyright 2013-2021 Axel Huebl, Heiko Burau, Rene Widera, Sergei Bastrakov
+ *
+ * This file is part of PIConGPU.
+ *
+ * PIConGPU is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * PIConGPU is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with PIConGPU.
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "picongpu/fields/differentiation/Curl.def"
+#include "picongpu/fields/differentiation/Derivative.hpp"
+#include "picongpu/traits/GetMargin.hpp"
+
+#include <pmacc/math/Vector.hpp>
+
+
+namespace picongpu
+{
+    namespace fields
+    {
+        namespace differentiation
+        {
+            /** Functor to compute field curl at the given point from per-direction derivative functors
+             *
+             * @tparam T_Derivative derivative tag (not functor), defines the
+             *                      finite-difference scheme for partial derivatives
+             */
+            template<typename T_Derivative>
+            struct Curl
+            {
+                //! Derivative tag
+                using Derivative = T_Derivative;
+
+                //! Type of the derivative functor along x, as returned by makeDerivativeFunctor()
+                using XDerivativeFunctor = decltype(makeDerivativeFunctor<Derivative, 0>());
+
+                //! Type of the derivative functor along y, as returned by makeDerivativeFunctor()
+                using YDerivativeFunctor = decltype(makeDerivativeFunctor<Derivative, 1>());
+
+                //! Type of the derivative functor along z, as returned by makeDerivativeFunctor()
+                using ZDerivativeFunctor = decltype(makeDerivativeFunctor<Derivative, 2>());
+
+                //! Lower margin: pmacc::math::CT::max over the lower margins of the x, y, z derivative functors
+                using LowerMargin = typename pmacc::math::CT::max<
+                    typename pmacc::math::CT::max<
+                        typename GetLowerMargin<XDerivativeFunctor>::type,
+                        typename GetLowerMargin<YDerivativeFunctor>::type>::type,
+                    typename GetLowerMargin<ZDerivativeFunctor>::type>::type;
+
+                //! Upper margin: pmacc::math::CT::max over the upper margins of the x, y, z derivative functors
+                using UpperMargin = typename pmacc::math::CT::max<
+                    typename pmacc::math::CT::max<
+                        typename GetUpperMargin<XDerivativeFunctor>::type,
+                        typename GetUpperMargin<YDerivativeFunctor>::type>::type,
+                    typename GetUpperMargin<ZDerivativeFunctor>::type>::type;
+
+                //! Create curl functor, the three derivative functors come from makeDerivativeFunctor()
+                HDINLINE Curl()
+                    : xDerivativeFunctor(makeDerivativeFunctor<Derivative, 0>())
+                    , yDerivativeFunctor(makeDerivativeFunctor<Derivative, 1>())
+                    , zDerivativeFunctor(makeDerivativeFunctor<Derivative, 2>())
+                {
+                }
+
+                /** Return curl value at the given point, combined from the x, y and z derivatives
+                 *
+                 * @tparam T_DataBox data box type with field data, the result is built as float3_X
+                 */
+                template<typename T_DataBox>
+                HDINLINE typename T_DataBox::ValueType operator()(T_DataBox const& data) const
+                {
+                    auto const dFdx = xDerivative(data);
+                    auto const dFdy = yDerivative(data);
+                    auto const dFdz = zDerivative(data);
+                    return float3_X{dFdy.z() - dFdz.y(), dFdz.x() - dFdx.z(), dFdx.y() - dFdy.x()};
+                }
+
+                /** Return x derivative value at the given point, computed by xDerivativeFunctor
+                 *
+                 * @tparam T_DataBox data box type with field data
+                 */
+                template<typename T_DataBox>
+                HDINLINE typename T_DataBox::ValueType xDerivative(T_DataBox const& data) const
+                {
+                    return xDerivativeFunctor(data);
+                }
+
+                /** Return y derivative value at the given point, computed by yDerivativeFunctor
+                 *
+                 * @tparam T_DataBox data box type with field data
+                 */
+                template<typename T_DataBox>
+                HDINLINE typename T_DataBox::ValueType yDerivative(T_DataBox const& data) const
+                {
+                    return yDerivativeFunctor(data);
+                }
+
+                /** Return z derivative value at the given point, computed by zDerivativeFunctor
+                 *
+                 * @tparam T_DataBox data box type with field data
+                 */
+                template<typename T_DataBox>
+                HDINLINE typename T_DataBox::ValueType zDerivative(T_DataBox const& data) const
+                {
+                    return zDerivativeFunctor(data);
+                }
+
+            private:
+                XDerivativeFunctor const xDerivativeFunctor; //!< derivative functor along x
+                YDerivativeFunctor const yDerivativeFunctor; //!< derivative functor along y
+                ZDerivativeFunctor const zDerivativeFunctor; //!< derivative functor along z
+            };
+
+        } // namespace differentiation
+    } // namespace fields
+} // namespace picongpu
diff --git a/include/picongpu/fields/differentiation/Derivative.def b/include/picongpu/fields/differentiation/Derivative.def
new file mode 100644
index 0000000000..269c8478b3
--- /dev/null
+++ b/include/picongpu/fields/differentiation/Derivative.def
@@ -0,0 +1,40 @@
+/* Copyright 2013-2021 Heiko Burau, Rene Widera, Axel Huebl, Sergei Bastrakov
+ *
+ * This file is part of PIConGPU.
+ *
+ * PIConGPU is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * PIConGPU is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with PIConGPU.
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+
+namespace picongpu
+{
+    namespace fields
+    {
+        namespace differentiation
+        {
+            //! Forward (upper - current) difference derivative tag, declaration only
+            struct Forward;
+
+            //! Backward (current - lower) difference derivative tag, declaration only
+            struct Backward;
+
+            //! Zero derivative tag, declaration only; the matching functor always returns zero
+            struct Zero;
+
+        } // namespace differentiation
+    } // namespace fields
+} // namespace picongpu
diff --git a/include/picongpu/fields/differentiation/Derivative.hpp b/include/picongpu/fields/differentiation/Derivative.hpp
new file mode 100644
index 0000000000..b99127b48f
--- /dev/null
+++ b/include/picongpu/fields/differentiation/Derivative.hpp
@@ -0,0 +1,79 @@
+/* Copyright 2013-2021 Heiko Burau, Rene Widera, Axel Huebl, Sergei Bastrakov
+ *
+ * This file is part of PIConGPU.
+ *
+ * PIConGPU is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * PIConGPU is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with PIConGPU.
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "picongpu/simulation_defines.hpp"
+#include "picongpu/fields/differentiation/BackwardDerivative.hpp"
+#include "picongpu/fields/differentiation/Derivative.def"
+#include "picongpu/fields/differentiation/ForwardDerivative.hpp"
+#include "picongpu/fields/differentiation/Traits.hpp"
+#include "picongpu/fields/differentiation/ZeroDerivative.hpp"
+
+#include <cstdint>
+
+
+namespace picongpu
+{
+    namespace fields
+    {
+        namespace differentiation
+        {
+            /** Interface of field derivative functors created by makeDerivativeFunctor()
+             *
+             * Only declares operator() (no definition here); functors must also be copyable and assignable.
+             */
+            struct DerivativeFunctorConcept
+            {
+                /** Return derivative value at the given point
+                 *
+                 * @tparam T_DataBox data box type with field data, has to provide ValueType
+                 * @param data position in the data box to compute derivative at
+                 */
+                template<typename T_DataBox>
+                HDINLINE typename T_DataBox::ValueType operator()(T_DataBox const& data) const;
+            };
+
+            /** Type of derivative functor for the given derivative tag and direction
+             *
+             * The tag only selects the scheme; the functor computes derivatives along the given direction.
+             * Unlike makeDerivativeFunctor(), no zero-functor substitution is done for T_direction >= simDim.
+             *
+             * @tparam T_Derivative derivative tag, defines the finite-difference scheme
+             * @tparam T_direction direction to take derivative in, 0 = x, 1 = y, 2 = z
+             */
+            template<typename T_Derivative, uint32_t T_direction>
+            using DerivativeFunctor = typename traits::DerivativeFunctor<T_Derivative, T_direction>::type;
+
+            /** Create a functor to compute field derivative along the given direction
+             *
+             * Delegates to traits::MakeDerivativeFunctor, which yields the zero functor for T_direction >= simDim.
+             *
+             * @tparam T_Derivative derivative tag, defines the finite-difference scheme
+             * @tparam T_direction direction to take derivative in, 0 = x, 1 = y, 2 = z
+             */
+            template<typename T_Derivative, uint32_t T_direction>
+            HDINLINE auto makeDerivativeFunctor()
+            {
+                return traits::MakeDerivativeFunctor<T_Derivative, T_direction>{}();
+            }
+
+        } // namespace differentiation
+    } // namespace fields
+} // namespace picongpu
diff --git a/include/picongpu/fields/differentiation/ForwardDerivative.hpp b/include/picongpu/fields/differentiation/ForwardDerivative.hpp
new file mode 100644
index 0000000000..c47734c723
--- /dev/null
+++ b/include/picongpu/fields/differentiation/ForwardDerivative.hpp
@@ -0,0 +1,82 @@
+/* Copyright 2013-2021 Heiko Burau, Rene Widera, Axel Huebl, Sergei Bastrakov
+ *
+ * This file is part of PIConGPU.
+ *
+ * PIConGPU is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * PIConGPU is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with PIConGPU.
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "picongpu/simulation_defines.hpp"
+#include "picongpu/fields/differentiation/Derivative.def"
+#include "picongpu/fields/differentiation/Traits.hpp"
+
+#include <pmacc/math/Vector.hpp>
+#include <pmacc/meta/accessors/Identity.hpp>
+
+#include <cstdint>
+
+
+namespace picongpu
+{
+    namespace fields
+    {
+        namespace differentiation
+        {
+            /** Functor for forward difference derivative along the given direction
+             *
+             * Computes (upper - current) / cellSize[T_direction], previously called DifferenceToUpper.
+             *
+             * @tparam T_direction direction to take derivative in, 0 = x, 1 = y, 2 = z
+             */
+            template<uint32_t T_direction>
+            struct ForwardDerivativeFunctor
+            {
+                //! Lower margin: make_Int<simDim, 0>, since operator() reads nothing below the current point
+                using LowerMargin = typename pmacc::math::CT::make_Int<simDim, 0>::type;
+
+                //! Upper margin: basis vector along T_direction, since operator() reads the upper neighbor
+                using UpperMargin = typename pmacc::math::CT::make_BasisVector<simDim, T_direction, int>::type;
+
+                /** Return the forward difference at the given point divided by the cell size
+                 *
+                 * @tparam T_DataBox data box type with field data, has to provide ValueType
+                 * @param data data box, accessed at basisVector<Index, T_direction>() and at Index{}
+                 */
+                template<typename T_DataBox>
+                HDINLINE typename T_DataBox::ValueType operator()(T_DataBox const& data) const
+                {
+                    using Index = pmacc::DataSpace<simDim>;
+                    auto const upperIndex = pmacc::math::basisVector<Index, T_direction>();
+                    return (data(upperIndex) - data(Index{})) / cellSize[T_direction];
+                }
+            };
+
+            namespace traits
+            {
+                /** Trait specialization mapping the Forward tag to ForwardDerivativeFunctor<T_direction>
+                 *
+                 * @tparam T_direction direction to take derivative in, 0 = x, 1 = y, 2 = z
+                 */
+                template<uint32_t T_direction>
+                struct DerivativeFunctor<Forward, T_direction>
+                    : pmacc::meta::accessors::Identity<ForwardDerivativeFunctor<T_direction>>
+                {
+                };
+
+            } // namespace traits
+        } // namespace differentiation
+    } // namespace fields
+} // namespace picongpu
diff --git a/include/picongpu/fields/differentiation/Traits.hpp b/include/picongpu/fields/differentiation/Traits.hpp
new file mode 100644
index 0000000000..0db9250370
--- /dev/null
+++ b/include/picongpu/fields/differentiation/Traits.hpp
@@ -0,0 +1,90 @@
+/* Copyright 2013-2021 Heiko Burau, Rene Widera, Axel Huebl, Sergei Bastrakov
+ *
+ * This file is part of PIConGPU.
+ *
+ * PIConGPU is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * PIConGPU is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with PIConGPU.
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "picongpu/simulation_defines.hpp"
+#include "picongpu/fields/differentiation/Derivative.def"
+
+#include <cstdint>
+
+
+namespace picongpu
+{
+    namespace fields
+    {
+        namespace differentiation
+        {
+            namespace traits
+            {
+                /** Type trait for derivative functor for the given derivative tag and
+                 *  direction, accessible as ::type
+                 *
+                 * Only declared here; has to be specialized for each derivative tag.
+                 *
+                 * @tparam T_Derivative derivative tag, defines the finite-difference scheme
+                 * @tparam T_direction direction to take derivative in, 0 = x, 1 = y, 2 = z
+                 */
+                template<typename T_Derivative, uint32_t T_direction>
+                struct DerivativeFunctor;
+
+                /** Factory for functors to compute field derivative along the given direction
+                 *
+                 * In case T_direction is >= simDim, the partial specialization returns the zero functor.
+                 * Does not need to be specialized when DerivativeFunctor is specialized.
+                 *
+                 * @tparam T_Derivative derivative tag, defines the finite-difference scheme
+                 * @tparam T_direction direction to take derivative in, 0 = x, 1 = y, 2 = z
+                 * @tparam T_isLesserThanDim selects normal or zero functor, default is (T_direction < simDim)
+                 */
+                template<typename T_Derivative, uint32_t T_direction, bool T_isLesserThanDim = (T_direction < simDim)>
+                struct MakeDerivativeFunctor
+                {
+                    using Functor = typename DerivativeFunctor<T_Derivative, T_direction>::type;
+
+                    //! Return a value-initialized functor of type Functor
+                    HDINLINE Functor operator()() const
+                    {
+                        return Functor{};
+                    }
+                };
+
+                /** Factory for functors to compute field derivative along the given direction
+                 *
+                 * Implementation for T_direction >= simDim, always returns the zero derivative functor.
+                 *
+                 * @tparam T_Derivative derivative tag, not used by this specialization
+                 * @tparam T_direction direction to take derivative in, 0 = x, 1 = y, 2 = z
+                 */
+                template<typename T_Derivative, uint32_t T_direction>
+                struct MakeDerivativeFunctor<T_Derivative, T_direction, false>
+                {
+                    using ZeroFunctor = typename DerivativeFunctor<Zero, T_direction>::type;
+
+                    //! Return a value-initialized functor for the Zero tag
+                    HDINLINE ZeroFunctor operator()() const
+                    {
+                        return ZeroFunctor{};
+                    }
+                };
+
+            } // namespace traits
+        } // namespace differentiation
+    } // namespace fields
+} // namespace picongpu
diff --git a/include/picongpu/fields/differentiation/ZeroDerivative.hpp b/include/picongpu/fields/differentiation/ZeroDerivative.hpp
new file mode 100644
index 0000000000..6a22b4ccc2
--- /dev/null
+++ b/include/picongpu/fields/differentiation/ZeroDerivative.hpp
@@ -0,0 +1,80 @@
+/* Copyright 2013-2021 Heiko Burau, Rene Widera, Axel Huebl, Sergei Bastrakov
+ *
+ * This file is part of PIConGPU.
+ *
+ * PIConGPU is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * PIConGPU is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with PIConGPU.
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "picongpu/simulation_defines.hpp"
+#include "picongpu/fields/differentiation/Derivative.def"
+#include "picongpu/fields/differentiation/Traits.hpp"
+
+#include <pmacc/math/Vector.hpp>
+#include <pmacc/meta/accessors/Identity.hpp>
+
+#include <cstdint>
+
+
+namespace picongpu
+{
+    namespace fields
+    {
+        namespace differentiation
+        {
+            /** Functor for zero derivative along the given direction
+             *
+             * Always returns zero, the field data is not read.
+             *
+             * @tparam T_direction direction to take derivative in, 0 = x, 1 = y, 2 = z
+             */
+            template<uint32_t T_direction>
+            struct ZeroDerivativeFunctor
+            {
+                //! Lower margin: make_Int<simDim, 0>, as operator() reads no neighbor values
+                using LowerMargin = typename pmacc::math::CT::make_Int<simDim, 0>::type;
+
+                //! Upper margin: make_Int<simDim, 0>, as operator() reads no neighbor values
+                using UpperMargin = typename pmacc::math::CT::make_Int<simDim, 0>::type;
+
+                /** Return T_DataBox::ValueType::create(0.0_X), independent of the data
+                 *
+                 * @tparam T_DataBox data box type with field data, has to provide ValueType
+                 * @param data unused, present to match the call signature of the other derivative functors
+                 */
+                template<typename T_DataBox>
+                HDINLINE typename T_DataBox::ValueType operator()(T_DataBox const& data) const
+                {
+                    return T_DataBox::ValueType::create(0.0_X);
+                }
+            };
+
+            namespace traits
+            {
+                /** Trait specialization mapping the Zero tag to ZeroDerivativeFunctor<T_direction>
+                 *
+                 * @tparam T_direction direction to take derivative in, 0 = x, 1 = y, 2 = z
+                 */
+                template<uint32_t T_direction>
+                struct DerivativeFunctor<Zero, T_direction>
+                    : pmacc::meta::accessors::Identity<ZeroDerivativeFunctor<T_direction>>
+                {
+                };
+
+            } // namespace traits
+        } // namespace differentiation
+    } // namespace fields
+} // namespace picongpu
diff --git a/include/picongpu/fields/laserProfiles/ExpRampWithPrepulse.def b/include/picongpu/fields/laserProfiles/ExpRampWithPrepulse.def
index d4b213f82d..e6e9c59373 100644
--- a/include/picongpu/fields/laserProfiles/ExpRampWithPrepulse.def
+++ b/include/picongpu/fields/laserProfiles/ExpRampWithPrepulse.def
@@ -1,4 +1,4 @@
-/* Copyright 2018-2020 Ilja Goethel, Axel Huebl
+/* Copyright 2018-2021 Ilja Goethel, Axel Huebl
  *
  *
  * This file is part of PIConGPU.
@@ -25,153 +25,157 @@
 
 namespace picongpu
 {
-namespace fields
-{
-namespace laserProfiles
-{
-namespace expRampWithPrepulse
-{
-namespace defaults
-{
-    struct ExpRampWithPrepulseParam
+    namespace fields
     {
-        // Intensities of prepulse and exponential preramp
-        static constexpr float_X INT_RATIO_PREPULSE = 0.;
-        static constexpr float_X INT_RATIO_POINT_1 = 1.e-8;
-        static constexpr float_X INT_RATIO_POINT_2 = 1.e-4;
-        static constexpr float_X INT_RATIO_POINT_3 = 1.e-4;
-
-        // time-positions of prepulse and preramps points
-        static constexpr float_64 TIME_PREPULSE_SI = -950.0e-15;
-        static constexpr float_64 TIME_PEAKPULSE_SI = 0.0e-15;
-        static constexpr float_64 TIME_POINT_1_SI = -1000.0e-15;
-        static constexpr float_64 TIME_POINT_2_SI = -300.0e-15;
-        static constexpr float_64 TIME_POINT_3_SI = -100.0e-15;
-
-        /** unit: meter */
-        static constexpr float_64 WAVE_LENGTH_SI = 0.8e-6;
-
-        /** UNITCONV */
-        static constexpr float_64 UNITCONV_A0_to_Amplitude_SI = -2.0 * PI / WAVE_LENGTH_SI * ::picongpu::SI::ELECTRON_MASS_SI * ::picongpu::SI::SPEED_OF_LIGHT_SI * ::picongpu::SI::SPEED_OF_LIGHT_SI / ::picongpu::SI::ELECTRON_CHARGE_SI;
-
-        /** unit: W / m^2 */
-        // calculate: _A0 = 8.549297e-6 * sqrt( Intensity[W/m^2] ) * wavelength[m] (linearly polarized)
-
-        /** unit: none */
-        static constexpr float_64 _A0  = 20.;
-
-        /** unit: Volt /meter */
-        static constexpr float_64 AMPLITUDE_SI = _A0 * UNITCONV_A0_to_Amplitude_SI;
-
-        /** unit: Volt /meter */
-        //constexpr float_64 AMPLITUDE_SI = 1.738e13;
-
-        /** Stretch temporal profile by a constant plateau between the up and downramp
-         *  unit: seconds */
-        static constexpr float_64 LASER_NOFOCUS_CONSTANT_SI = 0.0 * WAVE_LENGTH_SI / ::picongpu::SI::SPEED_OF_LIGHT_SI;
-
-        /** Pulse length: sigma of std. gauss for intensity (E^2)
-         *  PULSE_LENGTH_SI = FWHM_of_Intensity   / [ 2*sqrt{ 2* ln(2) } ]
-         *                                          [    2.354820045     ]
-         *  Info:             FWHM_of_Intensity = FWHM_Illumination
-         *                      = what a experimentalist calls "pulse duration"
-         *  unit: seconds (1 sigma) */
-        static constexpr float_64 PULSE_LENGTH_SI = 3.0e-14 / 2.35482; // half of the time in which E falls to half its initial value (then I falls to half its value in 15fs, approx 6 wavelengths). Those are 4.8 wavelenghts.
-
-        /** beam waist: distance from the axis where the pulse intensity (E^2)
-         *              decreases to its 1/e^2-th part,
-         *              WO_X_SI is this distance in x-direction
-         *              W0_Z_SI is this distance in z-direction
-         *              if both values are equal, the laser has a circular shape in x-z
-         * W0_SI = FWHM_of_Intensity / sqrt{ 2* ln(2) }
-         *                             [   1.17741    ]
-         *  unit: meter */
-        static constexpr float_64 W0_X_SI = 2.5 * WAVE_LENGTH_SI;
-        static constexpr float_64 W0_Z_SI = W0_X_SI;
-
-        /** The laser pulse will be initialized half of PULSE_INIT times of the PULSE_LENGTH before plateau
-         *  and half at the end of the plateau
-         *  unit: none */
-        static constexpr float_64 RAMP_INIT = 16.0;
-
-        /** cell from top where the laser is initialized
-         *
-         * if `initPlaneY == 0` than the absorber are disabled.
-         * if `initPlaneY > absorbercells negative Y` the negative absorber in y
-         * direction is enabled
-         *
-         * valid ranges:
-         *   - initPlaneY == 0
-         *   - absorber cells negative Y < initPlaneY < cells in y direction of the top gpu
-         */
-        static constexpr uint32_t initPlaneY = 0;
-
-        /** laser phase shift (no shift: 0.0)
-         *
-         * sin(omega*time + laser_phase): starts with phase=0 at center --> E-field=0 at center
-         *
-         * unit: rad, periodic in 2*pi
-         */
-        static constexpr float_X LASER_PHASE = 0.0;
-
-        /** Available polarisation types
-         */
-        enum PolarisationType
+        namespace laserProfiles
         {
-            LINEAR_X = 1u,
-            LINEAR_Z = 2u,
-            CIRCULAR = 4u,
-        };
-
-        /** Polarization selection
-         */
-        static constexpr PolarisationType Polarisation = LINEAR_X;
-    };
-} // namespace defaults
-} // namespace expRampWithPrepulse
-
-    /** Wavepacket with spatial Gaussian envelope and adjustable temporal shape.
-     *
-     * Allows defining a prepulse and two regions of exponential preramp with
-     * independent slopes. The definition works by specifying three (t, intensity)-
-     * points, where time is counted from the very beginning in SI and the
-     * intensity (yes, intensity, not amplitude) is given in multiples of the main
-     * peak.
-     *
-     * Be careful - problematic for few cycle pulses. Thought the rest is cloned
-     * from laserWavepacket, the correctionFactor is not included (this made a
-     * correction to the laser phase, which is necessary for very short pulses,
-     * since otherwise a test particle is, after the laser pulse has passed, not
-     * returned to immobility, as it should). Since the analytical solution is
-     * only implemented for the Gaussian regime, and we have mostly exponential
-     * regimes here, it was not retained here.
-     *
-     * A Gaussian peak (optionally lengthened by a plateau) is preceded by
-     * two pieces of exponential preramps, defined by 3 (time, intensity)-
-     * -points.
-     *
-     * The first two points get connected by an exponential, the 2nd and
-     * 3rd point are connected by another exponential, which is then
-     * extrapolated to the peak. The Gaussian is added everywhere, but
-     * typically contributes significantly only near the peak.
-     * It is advisable to set the third point far enough from the plateau
-     * (approx 3*FWHM), then the contribution from the Gaussian is
-     * negligible there, and the intensity can be set as measured from the
-     * laser profile.
-     *
-     * Optionally a Gaussian prepulse can be added, given by the parameters
-     * of the relative intensity and time point.
-     * The time of the prepulse and the three preramp points are given in
-     * SI, the intensities are given as multiples of the peak intensity.
-     *
-     * @tparam T_Params class parameter to configure the Gaussian Beam profile,
-     *                  see members of
-     *                  expRampWithPrepulse::defaults::ExpRampWithPrepulseParam
-     *                  for required members
-     */
-    template< typename T_Params = expRampWithPrepulse::defaults::ExpRampWithPrepulseParam >
-    struct ExpRampWithPrepulse;
-
-} // namespace laserProfiles
-} // namespace fields
+            namespace expRampWithPrepulse
+            {
+                namespace defaults
+                {
+                    struct ExpRampWithPrepulseParam
+                    {
+                        // Intensities of prepulse and preramp points, in multiples of the peak intensity
+                        static constexpr float_X INT_RATIO_PREPULSE = 0.;
+                        static constexpr float_X INT_RATIO_POINT_1 = 1.e-8;
+                        static constexpr float_X INT_RATIO_POINT_2 = 1.e-4;
+                        static constexpr float_X INT_RATIO_POINT_3 = 1.e-4;
+
+                        // time positions of prepulse, peak pulse and preramp points, given in SI
+                        static constexpr float_64 TIME_PREPULSE_SI = -950.0e-15;
+                        static constexpr float_64 TIME_PEAKPULSE_SI = 0.0e-15;
+                        static constexpr float_64 TIME_POINT_1_SI = -1000.0e-15;
+                        static constexpr float_64 TIME_POINT_2_SI = -300.0e-15;
+                        static constexpr float_64 TIME_POINT_3_SI = -100.0e-15;
+
+                        /** unit: meter */
+                        static constexpr float_64 WAVE_LENGTH_SI = 0.8e-6;
+
+                        /** UNITCONV: factor that is multiplied with _A0 to obtain AMPLITUDE_SI (see below) */
+                        static constexpr float_64 UNITCONV_A0_to_Amplitude_SI = -2.0 * PI / WAVE_LENGTH_SI
+                            * ::picongpu::SI::ELECTRON_MASS_SI * ::picongpu::SI::SPEED_OF_LIGHT_SI
+                            * ::picongpu::SI::SPEED_OF_LIGHT_SI / ::picongpu::SI::ELECTRON_CHARGE_SI;
+
+                        /** unit: W / m^2 */
+                        // calculate: _A0 = 8.549297e-6 * sqrt( Intensity[W/m^2] ) * wavelength[m] (linearly polarized)
+
+                        /** unit: none */
+                        static constexpr float_64 _A0 = 20.;
+
+                        /** unit: Volt / meter */
+                        static constexpr float_64 AMPLITUDE_SI = _A0 * UNITCONV_A0_to_Amplitude_SI;
+
+                        /** unit: Volt / meter */
+                        // constexpr float_64 AMPLITUDE_SI = 1.738e13;
+
+                        /** Stretch temporal profile by a constant plateau between the up and downramp
+                         *  unit: seconds */
+                        static constexpr float_64 LASER_NOFOCUS_CONSTANT_SI
+                            = 0.0 * WAVE_LENGTH_SI / ::picongpu::SI::SPEED_OF_LIGHT_SI;
+
+                        /** Pulse length: sigma of std. gauss for intensity (E^2)
+                         *  PULSE_LENGTH_SI = FWHM_of_Intensity   / [ 2*sqrt{ 2* ln(2) } ]
+                         *                                          [    2.354820045     ]
+                         *  Info:             FWHM_of_Intensity = FWHM_Illumination
+                         *                      = what an experimentalist calls "pulse duration"
+                         *  unit: seconds (1 sigma) */
+                        static constexpr float_64 PULSE_LENGTH_SI = 3.0e-14
+                            / 2.35482; // half of the time in which E falls to half its initial value (then I falls to
+                                       // half its value in 15fs, approx 6 wavelengths). Those are 4.8 wavelengths.
+
+                        /** beam waist: distance from the axis where the pulse intensity (E^2)
+                         *              decreases to its 1/e^2-th part,
+                         *              W0_X_SI is this distance in x-direction
+                         *              W0_Z_SI is this distance in z-direction
+                         *              if both values are equal, the laser has a circular shape in x-z
+                         * W0_SI = FWHM_of_Intensity / sqrt{ 2* ln(2) }
+                         *                             [   1.17741    ]
+                         *  unit: meter */
+                        static constexpr float_64 W0_X_SI = 2.5 * WAVE_LENGTH_SI;
+                        static constexpr float_64 W0_Z_SI = W0_X_SI;
+
+                        /** The laser pulse will be initialized half of RAMP_INIT times of the PULSE_LENGTH before
+                         * plateau and half at the end of the plateau; unit: none */
+                        static constexpr float_64 RAMP_INIT = 16.0;
+
+                        /** cell from top where the laser is initialized
+                         *
+                         * if `initPlaneY == 0` then the absorbers are disabled.
+                         * if `initPlaneY > absorber cells negative Y` the negative absorber in y
+                         * direction is enabled
+                         *
+                         * valid ranges:
+                         *   - initPlaneY == 0
+                         *   - absorber cells negative Y < initPlaneY < cells in y direction of the top gpu
+                         */
+                        static constexpr uint32_t initPlaneY = 0;
+
+                        /** laser phase shift (no shift: 0.0)
+                         *
+                         * sin(omega*time + laser_phase): starts with phase=0 at center --> E-field=0 at center
+                         *
+                         * unit: rad, periodic in 2*pi
+                         */
+                        static constexpr float_X LASER_PHASE = 0.0;
+
+                        /** Available polarisation types
+                         */
+                        enum PolarisationType
+                        {
+                            LINEAR_X = 1u,
+                            LINEAR_Z = 2u,
+                            CIRCULAR = 4u,
+                        };
+
+                        /** Polarization selection
+                         */
+                        static constexpr PolarisationType Polarisation = LINEAR_X;
+                    };
+                } // namespace defaults
+            } // namespace expRampWithPrepulse
+
+            /** Wavepacket with spatial Gaussian envelope and adjustable temporal shape.
+             *
+             * Allows defining a prepulse and two regions of exponential preramp with
+             * independent slopes. The definition works by specifying three (t, intensity)
+             * points, where time is counted from the very beginning in SI and the
+             * intensity (yes, intensity, not amplitude) is given in multiples of the main
+             * peak.
+             *
+             * Be careful - problematic for few cycle pulses. Though the rest is cloned
+             * from laserWavepacket, the correctionFactor is not included (this made a
+             * correction to the laser phase, which is necessary for very short pulses,
+             * since otherwise a test particle is, after the laser pulse has passed, not
+             * returned to immobility, as it should). Since the analytical solution is
+             * only implemented for the Gaussian regime, and we have mostly exponential
+             * regimes here, it was not retained here.
+             *
+             * A Gaussian peak (optionally lengthened by a plateau) is preceded by
+             * two pieces of exponential preramps, defined by 3 (time, intensity)
+             * points.
+             *
+             * The first two points get connected by an exponential, the 2nd and
+             * 3rd point are connected by another exponential, which is then
+             * extrapolated to the peak. The Gaussian is added everywhere, but
+             * typically contributes significantly only near the peak.
+             * It is advisable to set the third point far enough from the plateau
+             * (approx 3*FWHM), then the contribution from the Gaussian is
+             * negligible there, and the intensity can be set as measured from the
+             * laser profile.
+             *
+             * Optionally a Gaussian prepulse can be added, given by the parameters
+             * of the relative intensity and time point.
+             * The time of the prepulse and the three preramp points are given in
+             * SI, the intensities are given as multiples of the peak intensity.
+             *
+             * @tparam T_Params class parameter to configure the ExpRampWithPrepulse profile,
+             *                  see members of
+             *                  expRampWithPrepulse::defaults::ExpRampWithPrepulseParam
+             *                  for required members
+             */
+            template<typename T_Params = expRampWithPrepulse::defaults::ExpRampWithPrepulseParam>
+            struct ExpRampWithPrepulse;
+
+        } // namespace laserProfiles
+    } // namespace fields
 } // namespace picongpu
diff --git a/include/picongpu/fields/laserProfiles/ExpRampWithPrepulse.hpp b/include/picongpu/fields/laserProfiles/ExpRampWithPrepulse.hpp
index 52cdf7af2a..a8878d249b 100644
--- a/include/picongpu/fields/laserProfiles/ExpRampWithPrepulse.hpp
+++ b/include/picongpu/fields/laserProfiles/ExpRampWithPrepulse.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2018-2020 Ilja Goethel, Axel Huebl
+/* Copyright 2018-2021 Ilja Goethel, Axel Huebl
  *
  * This file is part of PIConGPU.
  *
@@ -27,370 +27,353 @@
 
 namespace picongpu
 {
-namespace fields
-{
-namespace laserProfiles
-{
-namespace expRampWithPrepulse
-{
-    template< typename T_Params >
-    struct Unitless : public T_Params
-    {
-        using Params = T_Params;
-
-        static constexpr float_X WAVE_LENGTH = float_X( Params::WAVE_LENGTH_SI / UNIT_LENGTH ); // unit: meter
-        static constexpr float_X PULSE_LENGTH = float_X( Params::PULSE_LENGTH_SI / UNIT_TIME ); // unit: seconds (1 sigma)
-        static constexpr float_X LASER_NOFOCUS_CONSTANT = float_X( Params::LASER_NOFOCUS_CONSTANT_SI / UNIT_TIME ); // unit: seconds
-        static constexpr float_X AMPLITUDE = float_X( Params::AMPLITUDE_SI / UNIT_EFIELD ); // unit: Volt /meter
-        static constexpr float_X W0_X = float_X( Params::W0_X_SI / UNIT_LENGTH ); // unit: meter
-        static constexpr float_X W0_Z = float_X( Params::W0_Z_SI / UNIT_LENGTH ); // unit: meter
-
-        static constexpr float_64 TIME_PREPULSE = float_64( Params::TIME_PREPULSE_SI / UNIT_TIME );
-        static constexpr float_64 TIME_PEAKPULSE = float_64( Params::TIME_PEAKPULSE_SI / UNIT_TIME );
-        static constexpr float_64 TIME_1 = float_64( Params::TIME_POINT_1_SI / UNIT_TIME );
-        static constexpr float_64 TIME_2 = float_64( Params::TIME_POINT_2_SI / UNIT_TIME );
-        static constexpr float_64 TIME_3 = float_64( Params::TIME_POINT_3_SI / UNIT_TIME );
-        static constexpr float_X endUpramp = TIME_PEAKPULSE - 0.5_X * LASER_NOFOCUS_CONSTANT;
-        static constexpr float_X startDownramp = TIME_PEAKPULSE + 0.5_X * LASER_NOFOCUS_CONSTANT;
-
-        static constexpr float_X INIT_TIME = float_X( ( TIME_PEAKPULSE + Params::RAMP_INIT * PULSE_LENGTH ) / UNIT_TIME );
-
-        // compile-time checks for physical sanity:
-        static_assert(
-            ( TIME_1 < TIME_2 ) && ( TIME_2 < TIME_3 ) && ( TIME_3 < endUpramp ),
-            "The times in the parameters TIME_POINT_1/2/3 and the beginning of the plateau (which is at TIME_PEAKPULSE - 0.5*RAMP_INIT*PULSE_LENGTH) should be in ascending order"
-        );
-
-        // some prerequisites for check of intensities (approximate check, because I can't use exp and log)
-        static constexpr float_X ratio_dt = ( endUpramp - TIME_3 ) / ( TIME_3 - TIME_2 ); // ratio of time intervals
-        static constexpr float_X ri1 = Params::INT_RATIO_POINT_3 / Params::INT_RATIO_POINT_2; // first intensity ratio
-        static constexpr float_X ri2 = 0.2_X / Params::INT_RATIO_POINT_3; // second intensity ratio (0.2 is an arbitrary upper border for the intensity of the exp ramp)
-
-        /* Approximate check, if ri1 ^ ratio_dt > ri2. That would mean, that the exponential curve through (time2, int2) and (time3, int3) lies above (endUpramp, 0.2)
-         * the power function is emulated by "rounding" the exponent to a rational number and expanding both sides by the common denominator, to get integer powers, see below
-         * for this, the range for ratio_dt is split into parts; the checked condition is "rounded down", i.e. it's weaker in every point of those ranges except one.
-         */
-        static constexpr bool intensity_too_big =
-            ( ratio_dt >= 3._X   && ri1 * ri1 * ri1 > ri2) ||
-            ( ratio_dt >= 2._X   && ri1 * ri1 > ri2) ||
-            ( ratio_dt >= 1.5_X  && ri1 * ri1 * ri1 > ri2 * ri2) ||
-            ( ratio_dt >= 1._X   && ri1 > ri2) ||
-            ( ratio_dt >= 0.8_X  && ri1 * ri1 * ri1 * ri1 > ri2 * ri2 * ri2 * ri2 * ri2 ) ||
-            ( ratio_dt >= 0.75_X && ri1 * ri1 * ri1 > ri2 * ri2 * ri2 * ri2 ) ||
-            ( ratio_dt >= 0.67_X && ri1 * ri1 > ri2 * ri2 * ri2 ) ||
-            ( ratio_dt >= 0.6_X  && ri1 * ri1 * ri1 > ri2 * ri2 * ri2 * ri2 * ri2 ) ||
-            ( ratio_dt >= 0.5_X  && ri1 > ri2 * ri2 ) ||
-            ( ratio_dt >= 0.4_X  && ri1 * ri1 > ri2 * ri2 * ri2 * ri2 * ri2 ) ||
-            ( ratio_dt >= 0.33_X && ri1 > ri2 * ri2 * ri2 ) ||
-            ( ratio_dt >= 0.25_X && ri1 > ri2 * ri2 * ri2 * ri2 ) ||
-            ( ratio_dt >= 0.2_X  && ri1 > ri2 * ri2 * ri2 * ri2 * ri2 );
-        static_assert(
-            !intensity_too_big,
-            "The intensities of the ramp are very large - the extrapolation to the time of the main pulse would give more than half of the pulse amplitude. This is not a Gaussian pulse at all anymore - probably some of the parameters are different from what you think!?"
-        );
-
-        /* initialize the laser not in the first cell is equal to a negative shift
-         * in time
-         */
-        static constexpr float_X laserTimeShift = Params::initPlaneY * CELL_HEIGHT / SPEED_OF_LIGHT;
-
-        /* a symmetric pulse will be initialized at position z=0 for
-         * a time of RAMP_INIT * PULSE_LENGTH + LASER_NOFOCUS_CONSTANT = INIT_TIME.
-         * we shift the complete pulse for the half of this time to start with
-         * the front of the laser pulse.
-         */
-        static constexpr float_X time_start_init = TIME_1 - ( 0.5 * Params::RAMP_INIT * PULSE_LENGTH );
-        static constexpr float_64 f = SPEED_OF_LIGHT / WAVE_LENGTH;
-        static constexpr float_64 w = 2.0 * PI * f;
-    };
-} // namespace expRampWithPrepulse
-
-namespace acc
-{
-    template< typename T_Unitless >
-    struct ExpRampWithPrepulse : public T_Unitless
-    {
-        using Unitless = T_Unitless;
-
-        float3_X m_elong;
-        float_X m_phase;
-        typename FieldE::DataBoxType m_dataBoxE;
-        DataSpace< simDim > m_offsetToTotalDomain;
-        DataSpace< simDim > m_superCellToLocalOriginCellOffset;
-
-        /** Device-Side Constructor
-         *
-         * @param superCellToLocalOriginCellOffset local offset in cells to current supercell
-         * @param offsetToTotalDomain offset to origin of global (@todo: total) coordinate system (possibly after transform to centered origin)
-         */
-        HDINLINE ExpRampWithPrepulse(
-            typename FieldE::DataBoxType const & dataBoxE,
-            DataSpace< simDim > const & superCellToLocalOriginCellOffset,
-            DataSpace< simDim > const & offsetToTotalDomain,
-            float3_X const & elong
-        ) :
-            m_elong( elong ),
-            m_dataBoxE( dataBoxE ),
-            m_offsetToTotalDomain( offsetToTotalDomain ),
-            m_superCellToLocalOriginCellOffset( superCellToLocalOriginCellOffset )
-        {
-        }
-
-        /** device side manipulation for init plane (transversal)
-         *
-         * @tparam T_Args type of the arguments passed to the user manipulator functor
-         *
-         * @param cellIndexInSuperCell ND cell index in current supercell
-         */
-        template< typename T_Acc >
-        HDINLINE
-        void operator( )(
-            T_Acc const &,
-            DataSpace< simDim > const & cellIndexInSuperCell
-        )
-        {
-            // coordinate system to global simulation as origin
-            DataSpace< simDim > const localCell(
-                cellIndexInSuperCell +
-                m_superCellToLocalOriginCellOffset
-            );
-
-            // transform coordinate system to center of x-z plane of initialization
-            constexpr uint8_t planeNormalDir = 1u;
-            DataSpace< simDim > offsetToCenterOfPlane( m_offsetToTotalDomain );
-            offsetToCenterOfPlane[ planeNormalDir ] = 0; // do not shift origin of plane normal
-            floatD_X const pos = precisionCast< float_X >( localCell + offsetToCenterOfPlane ) * cellSize.shrink< simDim >();
-            // @todo add half-cells via traits::FieldPosition< Solver::NumicalCellType, FieldE >()
-
-            // transversal position only
-            float3_X const w0_3D( Unitless::W0_X, 0., Unitless::W0_Z );
-            auto const w0( w0_3D.shrink< simDim >().remove< planeNormalDir >() );
-            auto const pos_trans( pos.remove< planeNormalDir >() );
-            auto const exp_compos( pos_trans * pos_trans / ( w0 * w0 ) );
-            float_X const exp_arg( exp_compos.sumOfComponents() );
-
-            m_elong *= math::exp( -1.0_X * exp_arg );
-
-            if( Unitless::initPlaneY != 0 ) // compile time if
-            {
-                /* If the laser is not initialized in the first cell we emit a
-                 * negatively and positively propagating wave. Therefore we need to multiply the
-                 * amplitude with a correction factor depending of the cell size in
-                 * propagation direction.
-                 * The negatively propagating wave is damped by the absorber.
-                 *
-                 * The `correctionFactor` assume that the wave is moving in y direction.
-                 */
-                auto const correctionFactor = ( SPEED_OF_LIGHT * DELTA_T ) / CELL_HEIGHT * 2._X;
-
-                // jump over the guard of the electric field
-                m_dataBoxE( localCell + SuperCellSize::toRT() * GuardSize::toRT() ) +=  correctionFactor * m_elong;
-            }
-            else
-            {
-                // jump over the guard of the electric field
-                m_dataBoxE( localCell + SuperCellSize::toRT() * GuardSize::toRT() ) = m_elong;
-            }
-        }
-    };
-} // namespace acc
-
-    template< typename T_Params >
-    struct ExpRampWithPrepulse : public expRampWithPrepulse::Unitless< T_Params >
+    namespace fields
     {
-        using Unitless = expRampWithPrepulse::Unitless< T_Params >;
-
-        float3_X elong;
-        float_X phase;
-        typename FieldE::DataBoxType dataBoxE;
-        DataSpace< simDim > offsetToTotalDomain;
-
-        /** takes time t relative to the center of the Gaussian and returns value
-         * between 0 and 1, i.e. as multiple of the max value.
-         * use as: amp_t = amp_0 * gauss( t - t_0 )
-         */
-        HDINLINE float_X
-        gauss( float_X const t )
-        {
-            float_X const exponent = t / float_X( Unitless::PULSE_LENGTH );
-            return math::exp( -0.25_X * exponent * exponent );
-        }
-
-        /** get value of exponential curve through two points at given t
-         * t/t1/t2 given as float_X, since the envelope doesn't need the accuracy
-         */
-        HDINLINE float_X
-        extrapolate_expo(
-            float_X const t1,
-            float_X const a1,
-            float_X const t2,
-            float_X const a2,
-            float_X const t
-        )
-        {
-            const float_X log1 = ( t2 - t ) * math::log( a1 );
-            const float_X log2 = ( t - t1 ) * math::log( a2 );
-            return math::exp( ( log1 + log2 )/( t2 - t1 ) );
-        }
-
-        HINLINE float_X
-        get_envelope( float_X runTime )
+        namespace laserProfiles
         {
-            float_X const AMP_PREPULSE = float_X( math::sqrt( Unitless::INT_RATIO_PREPULSE ) * Unitless::AMPLITUDE);
-            float_X const AMP_1 = float_X( math::sqrt( Unitless::INT_RATIO_POINT_1 ) * Unitless::AMPLITUDE );
-            float_X const AMP_2 = float_X( math::sqrt( Unitless::INT_RATIO_POINT_2 ) * Unitless::AMPLITUDE );
-            float_X const AMP_3 = float_X( math::sqrt( Unitless::INT_RATIO_POINT_3 ) * Unitless::AMPLITUDE );
-
-            float_X env = 0.0;
-            bool const before_preupramp = runTime < Unitless::time_start_init;
-            bool const before_start = runTime < Unitless::TIME_1;
-            bool const before_peakpulse = runTime < Unitless::endUpramp;
-            bool const during_first_exp = ( Unitless::TIME_1 < runTime ) &&
-                ( runTime < Unitless::TIME_2 );
-            bool const after_peakpulse = Unitless::startDownramp <= runTime;
-
-            if( before_preupramp )
-                env = 0.;
-            else if( before_start )
+            namespace expRampWithPrepulse
             {
-                env = AMP_1 * gauss( runTime - Unitless::TIME_1 );
-            }
-            else if( before_peakpulse )
+                template<typename T_Params>
+                struct Unitless : public T_Params
+                {
+                    using Params = T_Params;
+
+                    static constexpr float_X WAVE_LENGTH
+                        = float_X(Params::WAVE_LENGTH_SI / UNIT_LENGTH); // unit: meter
+                    static constexpr float_X PULSE_LENGTH
+                        = float_X(Params::PULSE_LENGTH_SI / UNIT_TIME); // unit: seconds (1 sigma)
+                    static constexpr float_X LASER_NOFOCUS_CONSTANT
+                        = float_X(Params::LASER_NOFOCUS_CONSTANT_SI / UNIT_TIME); // unit: seconds
+                    static constexpr float_X AMPLITUDE
+                        = float_X(Params::AMPLITUDE_SI / UNIT_EFIELD); // unit: Volt /meter
+                    static constexpr float_X W0_X = float_X(Params::W0_X_SI / UNIT_LENGTH); // unit: meter
+                    static constexpr float_X W0_Z = float_X(Params::W0_Z_SI / UNIT_LENGTH); // unit: meter
+
+                    static constexpr float_64 TIME_PREPULSE = float_64(Params::TIME_PREPULSE_SI / UNIT_TIME);
+                    static constexpr float_64 TIME_PEAKPULSE = float_64(Params::TIME_PEAKPULSE_SI / UNIT_TIME);
+                    static constexpr float_64 TIME_1 = float_64(Params::TIME_POINT_1_SI / UNIT_TIME);
+                    static constexpr float_64 TIME_2 = float_64(Params::TIME_POINT_2_SI / UNIT_TIME);
+                    static constexpr float_64 TIME_3 = float_64(Params::TIME_POINT_3_SI / UNIT_TIME);
+                    static constexpr float_X endUpramp = TIME_PEAKPULSE - 0.5_X * LASER_NOFOCUS_CONSTANT;
+                    static constexpr float_X startDownramp = TIME_PEAKPULSE + 0.5_X * LASER_NOFOCUS_CONSTANT;
+
+                    static constexpr float_X INIT_TIME
+                        = float_X((TIME_PEAKPULSE + Params::RAMP_INIT * PULSE_LENGTH) / UNIT_TIME);
+
+                    // compile-time checks for physical sanity:
+                    static_assert(
+                        (TIME_1 < TIME_2) && (TIME_2 < TIME_3) && (TIME_3 < endUpramp),
+                        "The times in the parameters TIME_POINT_1/2/3 and the beginning of the plateau (which is at "
+                        "TIME_PEAKPULSE - 0.5*RAMP_INIT*PULSE_LENGTH) should be in ascending order");
+
+                    // some prerequisites for check of intensities (approximate check, because I can't use exp and log)
+                    static constexpr float_X ratio_dt
+                        = (endUpramp - TIME_3) / (TIME_3 - TIME_2); // ratio of time intervals
+                    static constexpr float_X ri1
+                        = Params::INT_RATIO_POINT_3 / Params::INT_RATIO_POINT_2; // first intensity ratio
+                    static constexpr float_X ri2
+                        = 0.2_X / Params::INT_RATIO_POINT_3; // second intensity ratio (0.2 is an arbitrary upper
+                                                             // border for the intensity of the exp ramp)
+
+                    /* Approximate check, if ri1 ^ ratio_dt > ri2. That would mean, that the exponential curve through
+                     * (time2, int2) and (time3, int3) lies above (endUpramp, 0.2) the power function is emulated by
+                     * "rounding" the exponent to a rational number and expanding both sides by the common denominator,
+                     * to get integer powers, see below for this, the range for ratio_dt is split into parts; the
+                     * checked condition is "rounded down", i.e. it's weaker in every point of those ranges except one.
+                     */
+                    static constexpr bool intensity_too_big = (ratio_dt >= 3._X && ri1 * ri1 * ri1 > ri2)
+                        || (ratio_dt >= 2._X && ri1 * ri1 > ri2) || (ratio_dt >= 1.5_X && ri1 * ri1 * ri1 > ri2 * ri2)
+                        || (ratio_dt >= 1._X && ri1 > ri2)
+                        || (ratio_dt >= 0.8_X && ri1 * ri1 * ri1 * ri1 > ri2 * ri2 * ri2 * ri2 * ri2)
+                        || (ratio_dt >= 0.75_X && ri1 * ri1 * ri1 > ri2 * ri2 * ri2 * ri2)
+                        || (ratio_dt >= 0.67_X && ri1 * ri1 > ri2 * ri2 * ri2)
+                        || (ratio_dt >= 0.6_X && ri1 * ri1 * ri1 > ri2 * ri2 * ri2 * ri2 * ri2)
+                        || (ratio_dt >= 0.5_X && ri1 > ri2 * ri2)
+                        || (ratio_dt >= 0.4_X && ri1 * ri1 > ri2 * ri2 * ri2 * ri2 * ri2)
+                        || (ratio_dt >= 0.33_X && ri1 > ri2 * ri2 * ri2)
+                        || (ratio_dt >= 0.25_X && ri1 > ri2 * ri2 * ri2 * ri2)
+                        || (ratio_dt >= 0.2_X && ri1 > ri2 * ri2 * ri2 * ri2 * ri2);
+                    static_assert(
+                        !intensity_too_big,
+                        "The intensities of the ramp are very large - the extrapolation to the time of the main pulse "
+                        "would give more than half of the pulse amplitude. This is not a Gaussian pulse at all "
+                        "anymore - probably some of the parameters are different from what you think!?");
+
+                    /* initialize the laser not in the first cell is equal to a negative shift
+                     * in time
+                     */
+                    static constexpr float_X laserTimeShift = Params::initPlaneY * CELL_HEIGHT / SPEED_OF_LIGHT;
+
+                    /* a symmetric pulse will be initialized at position z=0 for
+                     * a time of RAMP_INIT * PULSE_LENGTH + LASER_NOFOCUS_CONSTANT = INIT_TIME.
+                     * we shift the complete pulse for the half of this time to start with
+                     * the front of the laser pulse.
+                     */
+                    static constexpr float_X time_start_init = TIME_1 - (0.5 * Params::RAMP_INIT * PULSE_LENGTH);
+                    static constexpr float_64 f = SPEED_OF_LIGHT / WAVE_LENGTH;
+                    static constexpr float_64 w = 2.0 * PI * f;
+                };
+            } // namespace expRampWithPrepulse
+
+            namespace acc
             {
-                float_X const ramp_when_peakpulse = extrapolate_expo(
-                    Unitless::TIME_2,
-                    AMP_2,
-                    Unitless::TIME_3,
-                    AMP_3,
-                    Unitless::endUpramp
-                ) / Unitless::AMPLITUDE;
-
-                if( ramp_when_peakpulse > 0.5 )
+                template<typename T_Unitless>
+                struct ExpRampWithPrepulse : public T_Unitless
                 {
-                    log< picLog::PHYSICS >(
-                        "Attention, the intensities of the laser upramp are very large! "
-                        "The extrapolation of the last exponential to the time of "
-                        "the peakpulse gives more than half of the amplitude of "
-                        "the peak Gaussian. This is not a Gaussian at all anymore, "
-                        "and physically very unplausible, check the params for misunderstandings!"
-                    );
-                }
-
-                env += Unitless::AMPLITUDE * ( 1._X - ramp_when_peakpulse ) *
-                    gauss( runTime - Unitless::endUpramp );
-                env += AMP_PREPULSE * gauss( runTime - Unitless::TIME_PREPULSE );
-                if( during_first_exp )
-                    env += extrapolate_expo(
-                        Unitless::TIME_1,
-                        AMP_1,
-                        Unitless::TIME_2,
-                        AMP_2,
-                        runTime
-                    );
-                else
-                    env += extrapolate_expo(
-                        Unitless::TIME_2,
-                        AMP_2,
-                        Unitless::TIME_3,
-                        AMP_3,
-                        runTime
-                    );
-            }
-            else if( !after_peakpulse )
-                env = Unitless::AMPLITUDE;
-            else // after startDownramp
-                env = Unitless::AMPLITUDE * gauss( runTime - Unitless::startDownramp );
-            return env;
-        }
-
-        /** constructor
-         *
-         * @param currentStep current simulation time step
-         */
-        HINLINE ExpRampWithPrepulse( uint32_t currentStep )
-        {
-            // get data
-            DataConnector & dc = Environment< >::get( ).DataConnector( );
-            dataBoxE = dc.get< FieldE >(
-                FieldE::getName(),
-                true
-            )->getDeviceDataBox();
-
-            // get meta data for offsets
-            SubGrid< simDim > const & subGrid = Environment< simDim >::get().SubGrid();
-            // const DataSpace< simDim > totalCellOffset( subGrid.getGlobalDomain().offset );
-            DataSpace< simDim > const globalCellOffset( subGrid.getLocalDomain().offset );
-            DataSpace< simDim > const halfSimSize( subGrid.getGlobalDomain().size / 2 );
-
-            // transform coordinate system to center of global simulation as origin [cells]
-            offsetToTotalDomain = /* totalCellOffset + */ globalCellOffset - halfSimSize;
+                    using Unitless = T_Unitless;
+
+                    float3_X m_elong;
+                    float_X m_phase;
+                    typename FieldE::DataBoxType m_dataBoxE;
+                    DataSpace<simDim> m_offsetToTotalDomain;
+                    DataSpace<simDim> m_superCellToLocalOriginCellOffset;
+
+                    /** Device-side constructor (m_phase is not initialized here)
+                     * @param dataBoxE data box of the electric field that operator() writes to
+                     * @param superCellToLocalOriginCellOffset local offset in cells to current supercell
+                     * @param offsetToTotalDomain offset to global (@todo: total) origin, possibly after centering
+                     * @param elong field vector stored in m_elong; operator() damps it transversally before writing
+                     */
+                    HDINLINE ExpRampWithPrepulse(
+                        typename FieldE::DataBoxType const& dataBoxE,
+                        DataSpace<simDim> const& superCellToLocalOriginCellOffset,
+                        DataSpace<simDim> const& offsetToTotalDomain,
+                        float3_X const& elong)
+                        : m_elong(elong)
+                        , m_dataBoxE(dataBoxE)
+                        , m_offsetToTotalDomain(offsetToTotalDomain)
+                        , m_superCellToLocalOriginCellOffset(superCellToLocalOriginCellOffset)
+                    {
+                    }
+
+                    /** device side manipulation for init plane (transversal)
+                     * Damps the field vector by the transversal Gaussian of this cell and writes it to the E field.
+                     * @tparam T_Acc alpaka accelerator type
+                     *
+                     * @param cellIndexInSuperCell ND cell index in current supercell
+                     */
+                    template<typename T_Acc>
+                    HDINLINE void operator()(T_Acc const&, DataSpace<simDim> const& cellIndexInSuperCell)
+                    {
+                        // cell index relative to the origin of the local domain
+                        DataSpace<simDim> const localCell(cellIndexInSuperCell + m_superCellToLocalOriginCellOffset);
+
+                        // transform coordinate system to center of x-z plane of initialization
+                        constexpr uint8_t planeNormalDir = 1u;
+                        DataSpace<simDim> offsetToCenterOfPlane(m_offsetToTotalDomain);
+                        offsetToCenterOfPlane[planeNormalDir] = 0; // do not shift origin of plane normal
+                        floatD_X const pos
+                            = precisionCast<float_X>(localCell + offsetToCenterOfPlane) * cellSize.shrink<simDim>();
+                        // @todo add half-cells via traits::FieldPosition< Solver::NumicalCellType, FieldE >()
+
+                        // transversal position only
+                        float3_X const w0_3D(Unitless::W0_X, 0., Unitless::W0_Z);
+                        auto const w0(w0_3D.shrink<simDim>().remove<planeNormalDir>());
+                        auto const pos_trans(pos.remove<planeNormalDir>());
+                        auto const exp_compos(pos_trans * pos_trans / (w0 * w0));
+                        float_X const exp_arg(exp_compos.sumOfComponents());
+                        // damp a local copy: changing m_elong itself would compound the factor over successive calls
+                        float3_X dampedElong(m_elong);
+                        dampedElong *= math::exp(-1.0_X * exp_arg);
+                        if(Unitless::initPlaneY != 0) // compile time if
+                        {
+                            /* If the laser is not initialized in the first cell we emit a
+                             * negatively and positively propagating wave. Therefore we need to multiply the
+                             * amplitude with a correction factor depending on the cell size in
+                             * propagation direction.
+                             * The negatively propagating wave is damped by the absorber.
+                             *
+                             * The `correctionFactor` assumes that the wave is moving in y direction.
+                             */
+                            auto const correctionFactor = (SPEED_OF_LIGHT * DELTA_T) / CELL_HEIGHT * 2._X;
+
+                            // jump over the guard of the electric field
+                            m_dataBoxE(localCell + SuperCellSize::toRT() * GuardSize::toRT())
+                                += correctionFactor * dampedElong;
+                        }
+                        else
+                        {
+                            // jump over the guard of the electric field
+                            m_dataBoxE(localCell + SuperCellSize::toRT() * GuardSize::toRT()) = dampedElong;
+                        }
+                    }
+                };
+            } // namespace acc
+
+            template<typename T_Params>
+            struct ExpRampWithPrepulse : public expRampWithPrepulse::Unitless<T_Params>
+            {
+                using Unitless = expRampWithPrepulse::Unitless<T_Params>;
 
-            // @todo reset origin of direction of moving window
-            // offsetToTotalDomain.y() = 0
+                float3_X elong;
+                float_X phase;
+                typename FieldE::DataBoxType dataBoxE;
+                DataSpace<simDim> offsetToTotalDomain;
 
-            elong = float3_X::create( 0.0 );
+                /** Gaussian time profile normalized to its maximum.
+                 * @param t time relative to the center of the Gaussian
+                 * @return value between 0 and 1; use as: amp_t = amp_0 * gauss( t - t_0 )
+                 */
+                HDINLINE float_X gauss(float_X const t)
+                {
+                    float_X const tInPulseLengths = t / float_X(Unitless::PULSE_LENGTH);
+                    return math::exp(-0.25_X * tInPulseLengths * tInPulseLengths);
+                }
 
-            /* initialize the laser not in the first cell is equal to a negative shift
-             * in time
-             */
-            const float_64 runTime = Unitless::time_start_init - Unitless::laserTimeShift +
-                DELTA_T * currentStep;
+                /** value at time t of the exponential curve through (t1, a1) and (t2, a2), evaluated as
+                 * exp(((t2 - t) * log(a1) + (t - t1) * log(a2)) / (t2 - t1)); float_X accuracy suffices here
+                 */
+                HDINLINE float_X extrapolate_expo(
+                    float_X const t1,
+                    float_X const a1,
+                    float_X const t2,
+                    float_X const a2,
+                    float_X const t)
+                {
+                    float_X const weightedLogA1 = (t2 - t) * math::log(a1);
+                    float_X const weightedLogA2 = (t - t1) * math::log(a2);
+                    return math::exp((weightedLogA1 + weightedLogA2) / (t2 - t1));
+                }
 
-            phase = float_X( Unitless::w * runTime ) + Unitless::LASER_PHASE;
+                /** laser envelope at time runTime: zero before time_start_init, Gaussian rise towards AMP_1 until
+                 * TIME_1, upramp (peak Gaussian + prepulse + one of two exponential ramps) until endUpramp,
+                 * plateau at AMPLITUDE until startDownramp and a Gaussian fall-off afterwards
+                 */
+                HINLINE float_X get_envelope(float_X runTime)
+                {
+                    /* workaround for clang 5 linker issues: `undefined reference to
+                     * `picongpu::fields::laserProfiles::ExpRampWithPrepulseParam::INT_RATIO_POINT_1'`
+                     */
+                    constexpr auto int_ratio_prepulse = Unitless::INT_RATIO_PREPULSE;
+                    constexpr auto int_ratio_point_1 = Unitless::INT_RATIO_POINT_1;
+                    constexpr auto int_ratio_point_2 = Unitless::INT_RATIO_POINT_2;
+                    constexpr auto int_ratio_point_3 = Unitless::INT_RATIO_POINT_3;
+                    float_X const AMP_PREPULSE = float_X(math::sqrt(int_ratio_prepulse) * Unitless::AMPLITUDE);
+                    float_X const AMP_1 = float_X(math::sqrt(int_ratio_point_1) * Unitless::AMPLITUDE);
+                    float_X const AMP_2 = float_X(math::sqrt(int_ratio_point_2) * Unitless::AMPLITUDE);
+                    float_X const AMP_3 = float_X(math::sqrt(int_ratio_point_3) * Unitless::AMPLITUDE);
+
+                    float_X env = 0.0;
+                    bool const before_preupramp = runTime < Unitless::time_start_init;
+                    bool const before_start = runTime < Unitless::TIME_1;
+                    bool const before_peakpulse = runTime < Unitless::endUpramp;
+                    // inclusive lower bound: at runTime == TIME_1 the first ramp applies and evaluates to AMP_1
+                    bool const during_first_exp = (Unitless::TIME_1 <= runTime) && (runTime < Unitless::TIME_2);
+                    bool const after_peakpulse = Unitless::startDownramp <= runTime;
+
+                    if(before_preupramp)
+                        env = 0.;
+                    else if(before_start)
+                        env = AMP_1 * gauss(runTime - Unitless::TIME_1);
+                    else if(before_peakpulse)
+                    {
+                        float_X const ramp_when_peakpulse
+                            = extrapolate_expo(Unitless::TIME_2, AMP_2, Unitless::TIME_3, AMP_3, Unitless::endUpramp)
+                            / Unitless::AMPLITUDE;
+                        if(ramp_when_peakpulse > 0.5)
+                        {
+                            log<picLog::PHYSICS>(
+                                "Attention, the intensities of the laser upramp are very large! "
+                                "The extrapolation of the last exponential to the time of "
+                                "the peakpulse gives more than half of the amplitude of "
+                                "the peak Gaussian. This is not a Gaussian at all anymore, "
+                                "and physically very unplausible, check the params for misunderstandings!");
+                        }
+                        env += Unitless::AMPLITUDE * (1._X - ramp_when_peakpulse)
+                            * gauss(runTime - Unitless::endUpramp);
+                        env += AMP_PREPULSE * gauss(runTime - Unitless::TIME_PREPULSE);
+                        if(during_first_exp)
+                            env += extrapolate_expo(Unitless::TIME_1, AMP_1, Unitless::TIME_2, AMP_2, runTime);
+                        else
+                            env += extrapolate_expo(Unitless::TIME_2, AMP_2, Unitless::TIME_3, AMP_3, runTime);
+                    }
+                    else if(!after_peakpulse)
+                        env = Unitless::AMPLITUDE;
+                    else // after startDownramp
+                        env = Unitless::AMPLITUDE * gauss(runTime - Unitless::startDownramp);
+                    return env;
+                }
 
-            float_X const envelope = get_envelope( runTime );
+                /** Host-side constructor: fetches the E field data box, computes the offset to the centered
+                 * origin and evaluates phase, envelope and the field vector `elong` for the given step.
+                 * @param currentStep current simulation time step
+                 */
+                HINLINE ExpRampWithPrepulse(uint32_t currentStep)
+                {
+                    // fetch the device data box of the electric field
+                    DataConnector& dc = Environment<>::get().DataConnector();
+                    dataBoxE = dc.get<FieldE>(FieldE::getName(), true)->getDeviceDataBox();
+
+                    // get meta data for offsets (note: globalCellOffset holds the offset of the *local* domain)
+                    SubGrid<simDim> const& subGrid = Environment<simDim>::get().SubGrid();
+                    // const DataSpace< simDim > totalCellOffset( subGrid.getGlobalDomain().offset );
+                    DataSpace<simDim> const globalCellOffset(subGrid.getLocalDomain().offset);
+                    DataSpace<simDim> const halfSimSize(subGrid.getGlobalDomain().size / 2);
+
+                    // transform coordinate system to center of global simulation as origin [cells]
+                    offsetToTotalDomain = /* totalCellOffset + */ globalCellOffset - halfSimSize;
+
+                    // @todo reset origin of direction of moving window
+                    // offsetToTotalDomain.y() = 0
+
+                    elong = float3_X::create(0.0);
+
+                    /* initializing the laser in a plane other than the first cell is equivalent to a
+                     * negative shift in time (laserTimeShift)
+                     */
+                    const float_64 runTime
+                        = Unitless::time_start_init - Unitless::laserTimeShift + DELTA_T * currentStep;
+
+                    phase = float_X(Unitless::w * runTime) + Unitless::LASER_PHASE;
+                    // runTime (float_64) is implicitly converted to get_envelope's float_X parameter
+                    float_X const envelope = get_envelope(runTime);
+                    // only the components of the selected polarisation are set, all others stay zero
+                    if(Unitless::Polarisation == Unitless::LINEAR_X)
+                    {
+                        elong.x() = envelope * math::sin(phase);
+                    }
+                    else if(Unitless::Polarisation == Unitless::LINEAR_Z)
+                    {
+                        elong.z() = envelope * math::sin(phase);
+                    }
+                    else if(Unitless::Polarisation == Unitless::CIRCULAR)
+                    {
+                        elong.x() = envelope / math::sqrt(2.0_X) * math::sin(phase);
+                        elong.z() = envelope / math::sqrt(2.0_X) * math::cos(phase);
+                    }
+                }
 
-            if( Unitless::Polarisation == Unitless::LINEAR_X )
-            {
-                elong.x() = envelope * math::sin( phase );
-            }
-            else if( Unitless::Polarisation == Unitless::LINEAR_Z )
-            {
-                elong.z() = envelope * math::sin( phase );
-            }
-            else if( Unitless::Polarisation == Unitless::CIRCULAR )
-            {
-                elong.x() = envelope / math::sqrt( 2.0_X ) * math::sin( phase );
-                elong.z() = envelope / math::sqrt( 2.0_X ) * math::cos( phase );
-            }
-        }
-
-        /** create device manipulator functor
-         *
-         * @tparam T_WorkerCfg pmacc::mappings::threads::WorkerCfg, configuration of the worker
-         * @tparam T_Acc alpaka accelerator type
-         *
-         * @param alpaka accelerator
-         * @param localSupercellOffset (in supercells, without guards) to the
-         *        origin of the local domain
-         * @param configuration of the worker
-         */
-        template<
-            typename T_WorkerCfg,
-            typename T_Acc
-        >
-        HDINLINE acc::ExpRampWithPrepulse< Unitless >
-        operator()(
-            T_Acc const &,
-            DataSpace< simDim > const & localSupercellOffset,
-            T_WorkerCfg const &
-        ) const
-        {
-            auto const superCellToLocalOriginCellOffset = localSupercellOffset * SuperCellSize::toRT();
-            return acc::ExpRampWithPrepulse< Unitless >( dataBoxE, superCellToLocalOriginCellOffset, offsetToTotalDomain, elong );
-        }
-
-        //! get the name of the laser profile
-        static
-        HINLINE std::string
-        getName( )
-        {
-            return "ExpRampWithPrepulse";
-        }
+                /** create device manipulator functor
+                 *
+                 * @tparam T_WorkerCfg pmacc::mappings::threads::WorkerCfg, configuration of the worker
+                 * @tparam T_Acc alpaka accelerator type
+                 *
+                 * The first (alpaka accelerator) and third (worker configuration) arguments are unnamed and unused.
+                 * @param localSupercellOffset offset (in supercells, without guards) to the origin of the local domain
+                 * @return device functor holding dataBoxE, the supercell offset scaled by SuperCellSize::toRT(),
+                 *         offsetToTotalDomain and elong
+                 */
+                template<typename T_WorkerCfg, typename T_Acc>
+                HDINLINE acc::ExpRampWithPrepulse<Unitless> operator()(
+                    T_Acc const&,
+                    DataSpace<simDim> const& localSupercellOffset,
+                    T_WorkerCfg const&) const
+                {
+                    auto const superCellToLocalOriginCellOffset = localSupercellOffset * SuperCellSize::toRT();
+                    return acc::ExpRampWithPrepulse<Unitless>(
+                        dataBoxE,
+                        superCellToLocalOriginCellOffset,
+                        offsetToTotalDomain,
+                        elong);
+                }
 
-    };
+                //! get the name of the laser profile
+                static HINLINE std::string getName()
+                {
+                    return "ExpRampWithPrepulse";
+                }
+            };
 
-} // namespace laserProfiles
-} // namespace fields
+        } // namespace laserProfiles
+    } // namespace fields
 } // namespace picongpu
-
diff --git a/include/picongpu/fields/laserProfiles/GaussianBeam.def b/include/picongpu/fields/laserProfiles/GaussianBeam.def
index 0c274ba2c6..6aba5d3bbf 100644
--- a/include/picongpu/fields/laserProfiles/GaussianBeam.def
+++ b/include/picongpu/fields/laserProfiles/GaussianBeam.def
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Heiko Burau, Anton Helm, Rene Widera,
+/* Copyright 2013-2021 Axel Huebl, Heiko Burau, Anton Helm, Rene Widera,
  *                     Richard Pausch, Alexander Debus
  *
  * This file is part of PIConGPU.
@@ -25,114 +25,118 @@
 
 namespace picongpu
 {
-namespace fields
-{
-namespace laserProfiles
-{
-namespace gaussianBeam
-{
-namespace defaults
-{
-    //! Use only the 0th Laguerremode for a standard Gaussian
-    static constexpr uint32_t MODENUMBER = 0;
-    PMACC_CONST_VECTOR(float_X, MODENUMBER + 1, LAGUERREMODES, 1.0);
-    // This is just an example for a more complicated set of Laguerre modes
-    //constexpr uint32_t MODENUMBER = 12;
-    //PMACC_CONST_VECTOR(float_X, MODENUMBER + 1, LAGUERREMODES, -1.0, 0.0300519, 0.319461, -0.23783, 0.0954839, 0.0318653, -0.144547, 0.0249208, -0.111989, 0.0434385, -0.030038, -0.00896321, -0.0160788);
-
-    struct GaussianBeamParam
+    namespace fields
     {
-        /** unit: meter */
-        static constexpr float_64 WAVE_LENGTH_SI = 0.8e-6;
-
-        /** Convert the normalized laser strength parameter a0 to Volt per meter */
-        static constexpr float_64 UNITCONV_A0_to_Amplitude_SI = -2.0 * PI / WAVE_LENGTH_SI * ::picongpu::SI::ELECTRON_MASS_SI * ::picongpu::SI::SPEED_OF_LIGHT_SI * ::picongpu::SI::SPEED_OF_LIGHT_SI / ::picongpu::SI::ELECTRON_CHARGE_SI;
-
-        /** unit: W / m^2 */
-        // calculate: _A0 = 8.549297e-6 * sqrt( Intensity[W/m^2] ) * wavelength[m] (linearly polarized)
-
-        /** unit: none */
-        //static constexpr float_64 _A0  = 1.5;
-
-        /** unit: Volt / meter */
-        //static constexpr float_64 AMPLITUDE_SI = _A0 * UNITCONV_A0_to_Amplitude_SI;
-
-        /** unit: Volt / meter */
-        static constexpr float_64 AMPLITUDE_SI = 1.738e13;
-
-        /** Pulse length: sigma of std. gauss for intensity (E^2)
-         *  PULSE_LENGTH_SI = FWHM_of_Intensity   / [ 2*sqrt{ 2* ln(2) } ]
-         *                                          [    2.354820045     ]
-         *  Info:             FWHM_of_Intensity = FWHM_Illumination
-         *                      = what a experimentalist calls "pulse duration"
-         *
-         *  unit: seconds (1 sigma) */
-        static constexpr float_64 PULSE_LENGTH_SI = 10.615e-15 / 4.0;
-
-        /** beam waist: distance from the axis where the pulse intensity (E^2)
-         *              decreases to its 1/e^2-th part,
-         *              at the focus position of the laser
-         * W0_SI = FWHM_of_Intensity / sqrt{ 2* ln(2) }
-         *                             [   1.17741    ]
-         *
-         *  unit: meter */
-        static constexpr float_64 W0_SI = 5.0e-6 / 1.17741;
-        /** the distance to the laser focus in y-direction
-         *  unit: meter */
-        static constexpr float_64 FOCUS_POS_SI = 4.62e-5;
-
-        /** The laser pulse will be initialized PULSE_INIT times of the PULSE_LENGTH
-         *
-         *  unit: none */
-        static constexpr float_64 PULSE_INIT = 20.0;
-
-        /** cell from top where the laser is initialized
-         *
-         * if `initPlaneY == 0` than the absorber are disabled.
-         * if `initPlaneY > absorbercells negative Y` the negative absorber in y
-         * direction is enabled
-         *
-         * valid ranges:
-         *   - initPlaneY == 0
-         *   - absorber cells negative Y < initPlaneY < cells in y direction of the top gpu
-         */
-        static constexpr uint32_t initPlaneY = 0;
-
-        /** laser phase shift (no shift: 0.0)
-         *
-         * sin(omega*time + laser_phase): starts with phase=0 at center --> E-field=0 at center
-         *
-         * unit: rad, periodic in 2*pi
-         */
-        static constexpr float_X LASER_PHASE = 0.0;
-
-        using LAGUERREMODES_t = defaults::LAGUERREMODES_t;
-        static constexpr uint32_t MODENUMBER = defaults::MODENUMBER;
-
-        /** Available polarisation types
-         */
-        enum PolarisationType
+        namespace laserProfiles
         {
-            LINEAR_X = 1u,
-            LINEAR_Z = 2u,
-            CIRCULAR = 4u,
-        };
-        /** Polarization selection
-         */
-        static constexpr PolarisationType Polarisation = CIRCULAR;
-    };
-} // namespace defaults
-} // namespace gaussianBeam
-
-    /** Gaussian Beam laser profile with finite pulse length
-     *
-     * @tparam T_Params class parameter to configure the Gaussian Beam profile,
-     *                  see members of gaussianBeam::default::GaussianBeamParam
-     *                  for required members
-     */
-    template< typename T_Params = gaussianBeam::defaults::GaussianBeamParam >
-    struct GaussianBeam;
-
-} // namespace laserProfiles
-} // namespace fields
+            namespace gaussianBeam
+            {
+                namespace defaults
+                {
+                    //! Use only the 0th Laguerremode for a standard Gaussian
+                    static constexpr uint32_t MODENUMBER = 0;
+                    PMACC_CONST_VECTOR(float_X, MODENUMBER + 1, LAGUERREMODES, 1.0);
+                    // This is just an example for a more complicated set of Laguerre modes
+                    // constexpr uint32_t MODENUMBER = 12;
+                    // PMACC_CONST_VECTOR(float_X, MODENUMBER + 1, LAGUERREMODES, -1.0, 0.0300519, 0.319461, -0.23783,
+                    // 0.0954839, 0.0318653, -0.144547, 0.0249208, -0.111989, 0.0434385, -0.030038, -0.00896321,
+                    // -0.0160788);
+
+                    struct GaussianBeamParam
+                    {
+                        /** unit: meter */
+                        static constexpr float_64 WAVE_LENGTH_SI = 0.8e-6;
+
+                        /** Convert the normalized laser strength parameter a0 to Volt per meter */
+                        static constexpr float_64 UNITCONV_A0_to_Amplitude_SI = -2.0 * PI / WAVE_LENGTH_SI
+                            * ::picongpu::SI::ELECTRON_MASS_SI * ::picongpu::SI::SPEED_OF_LIGHT_SI
+                            * ::picongpu::SI::SPEED_OF_LIGHT_SI / ::picongpu::SI::ELECTRON_CHARGE_SI;
+
+                        /** unit: W / m^2 */
+                        // calculate: _A0 = 8.549297e-6 * sqrt( Intensity[W/m^2] ) * wavelength[m] (linearly polarized)
+
+                        /** unit: none */
+                        // static constexpr float_64 _A0  = 1.5;
+
+                        /** unit: Volt / meter */
+                        // static constexpr float_64 AMPLITUDE_SI = _A0 * UNITCONV_A0_to_Amplitude_SI;
+
+                        /** unit: Volt / meter */
+                        static constexpr float_64 AMPLITUDE_SI = 1.738e13;
+
+                        /** Pulse length: sigma of std. gauss for intensity (E^2)
+                         *  PULSE_LENGTH_SI = FWHM_of_Intensity   / [ 2*sqrt{ 2* ln(2) } ]
+                         *                                          [    2.354820045     ]
+                         *  Info:             FWHM_of_Intensity = FWHM_Illumination
+                         *                      = what an experimentalist calls "pulse duration"
+                         *
+                         *  unit: seconds (1 sigma) */
+                        static constexpr float_64 PULSE_LENGTH_SI = 10.615e-15 / 4.0;
+
+                        /** beam waist: distance from the axis where the pulse intensity (E^2)
+                         *              decreases to its 1/e^2-th part,
+                         *              at the focus position of the laser
+                         * W0_SI = FWHM_of_Intensity / sqrt{ 2* ln(2) }
+                         *                             [   1.17741    ]
+                         *
+                         *  unit: meter */
+                        static constexpr float_64 W0_SI = 5.0e-6 / 1.17741;
+                        /** the distance to the laser focus in y-direction
+                         *  unit: meter */
+                        static constexpr float_64 FOCUS_POS_SI = 4.62e-5;
+
+                        /** The laser pulse will be initialized PULSE_INIT times of the PULSE_LENGTH
+                         *
+                         *  unit: none */
+                        static constexpr float_64 PULSE_INIT = 20.0;
+
+                        /** cell from top where the laser is initialized
+                         *
+                         * if `initPlaneY == 0` then the absorbers are disabled.
+                         * if `initPlaneY > absorbercells negative Y` the negative absorber in y
+                         * direction is enabled
+                         *
+                         * valid ranges:
+                         *   - initPlaneY == 0
+                         *   - absorber cells negative Y < initPlaneY < cells in y direction of the top gpu
+                         */
+                        static constexpr uint32_t initPlaneY = 0;
+
+                        /** laser phase shift (no shift: 0.0)
+                         *
+                         * sin(omega*time + laser_phase): starts with phase=0 at center --> E-field=0 at center
+                         *
+                         * unit: rad, periodic in 2*pi
+                         */
+                        static constexpr float_X LASER_PHASE = 0.0;
+
+                        using LAGUERREMODES_t = defaults::LAGUERREMODES_t;
+                        static constexpr uint32_t MODENUMBER = defaults::MODENUMBER;
+
+                        /** Available polarisation types
+                         */
+                        enum PolarisationType
+                        {
+                            LINEAR_X = 1u,
+                            LINEAR_Z = 2u,
+                            CIRCULAR = 4u,
+                        };
+                        /** Polarization selection
+                         */
+                        static constexpr PolarisationType Polarisation = CIRCULAR;
+                    };
+                } // namespace defaults
+            } // namespace gaussianBeam
+
+            /** Gaussian Beam laser profile with finite pulse length
+             *
+             * @tparam T_Params class parameter to configure the Gaussian Beam profile,
+             *                  see members of gaussianBeam::defaults::GaussianBeamParam
+             *                  for required members
+             */
+            template<typename T_Params = gaussianBeam::defaults::GaussianBeamParam>
+            struct GaussianBeam;
+
+        } // namespace laserProfiles
+    } // namespace fields
 } // namespace picongpu
diff --git a/include/picongpu/fields/laserProfiles/GaussianBeam.hpp b/include/picongpu/fields/laserProfiles/GaussianBeam.hpp
index 100684bb63..ab595c8d2a 100644
--- a/include/picongpu/fields/laserProfiles/GaussianBeam.hpp
+++ b/include/picongpu/fields/laserProfiles/GaussianBeam.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Heiko Burau, Anton Helm, Rene Widera,
+/* Copyright 2013-2021 Axel Huebl, Heiko Burau, Anton Helm, Rene Widera,
  *                     Richard Pausch, Alexander Debus
  *
  * This file is part of PIConGPU.
@@ -28,338 +28,358 @@
 
 namespace picongpu
 {
-namespace fields
-{
-namespace laserProfiles
-{
-namespace gaussianBeam
-{
-    template< typename T_Params >
-    struct Unitless : public T_Params
-    {
-        using Params = T_Params;
-
-        static constexpr float_X WAVE_LENGTH = float_X( Params::WAVE_LENGTH_SI / UNIT_LENGTH ); // unit: meter
-        static constexpr float_X PULSE_LENGTH = float_X( Params::PULSE_LENGTH_SI / UNIT_TIME ); // unit: seconds (1 sigma)
-        static constexpr float_X AMPLITUDE = float_X( Params::AMPLITUDE_SI / UNIT_EFIELD ); // unit: Volt /meter
-        static constexpr float_X W0 = float_X( Params::W0_SI / UNIT_LENGTH ); // unit: meter
-        static constexpr float_X FOCUS_POS = float_X( Params::FOCUS_POS_SI / UNIT_LENGTH ); // unit: meter
-        static constexpr float_X INIT_TIME = float_X( ( Params::PULSE_INIT * Params::PULSE_LENGTH_SI ) / UNIT_TIME ); // unit: seconds (full initialization length)
-
-        /* initialize the laser not in the first cell is equal to a negative shift
-         * in time
-         */
-        static constexpr float_X laserTimeShift = Params::initPlaneY * CELL_HEIGHT / SPEED_OF_LIGHT;
-
-        static constexpr float_64 f = SPEED_OF_LIGHT / WAVE_LENGTH;
-
-    };
-} // namespace gaussianBeam
-
-namespace acc
-{
-    template< typename T_Unitless >
-    struct GaussianBeam : public T_Unitless
+    namespace fields
     {
-        using Unitless = T_Unitless;
-
-        float3_X m_elong;
-        float_X m_phase;
-        typename FieldE::DataBoxType m_dataBoxE;
-        DataSpace< simDim > m_offsetToTotalDomain;
-        DataSpace< simDim > m_superCellToLocalOriginCellOffset;
-
-        /** Simple iteration algorithm to implement Laguerre polynomials for GPUs.
-         *
-         *  @param n order of the Laguerre polynomial
-         *  @param x coordinate at which the polynomial is evaluated
-         *  @return ...
-         */
-        HDINLINE float_X simpleLaguerre( const uint32_t n, const float_X x )
+        namespace laserProfiles
         {
-            //Result for special case n == 0
-            if (n == 0) return 1.0_X;
-            uint32_t currentN = 1;
-            float_X laguerreNMinus1 = 1.0_X;
-            float_X laguerreN = 1.0_X - x;
-            float_X laguerreNPlus1( 0.0_X );
-            while (currentN < n)
+            namespace gaussianBeam
             {
-                //Core statement of the algorithm
-                laguerreNPlus1 = ( ( 2.0_X * float_X(currentN) + 1.0_X - x) * laguerreN - float_X(currentN) * laguerreNMinus1 ) / float_X(currentN + 1u);
-                //Advance by one order
-                laguerreNMinus1 = laguerreN;
-                laguerreN = laguerreNPlus1;
-                currentN++;
-            }
-            return laguerreN;
-        }
-
-        /** Device-Side Constructor
-         *
-         * @param superCellToLocalOriginCellOffset local offset in cells to current supercell
-         * @param offsetToTotalDomain offset to origin of global (@todo: total) coordinate system (possibly after transform to centered origin)
-         */
-        HDINLINE GaussianBeam(
-            typename FieldE::DataBoxType const & dataBoxE,
-            DataSpace< simDim > const & superCellToLocalOriginCellOffset,
-            DataSpace< simDim > const & offsetToTotalDomain,
-            float3_X const & elong,
-            float_X const phase
-        ) :
-            m_elong( elong ),
-            m_phase( phase ),
-            m_dataBoxE( dataBoxE ),
-            m_offsetToTotalDomain( offsetToTotalDomain ),
-            m_superCellToLocalOriginCellOffset( superCellToLocalOriginCellOffset )
-        {
-        }
-
-        /** device side manipulation for init plane (transversal)
-         *
-         * @tparam T_Args type of the arguments passed to the user manipulator functor
-         *
-         * @param cellIndexInSuperCell ND cell index in current supercell
-         */
-        template< typename T_Acc >
-        HDINLINE
-        void operator( )(
-            T_Acc const &,
-            DataSpace< simDim > const & cellIndexInSuperCell
-        )
-        {
-            // coordinate system to global simulation as origin
-            DataSpace< simDim > const localCell(
-                cellIndexInSuperCell +
-                m_superCellToLocalOriginCellOffset
-            );
-
-            // transform coordinate system to center of x-z plane of initialization
-            constexpr uint8_t planeNormalDir = 1u;
-            DataSpace< simDim > offsetToCenterOfPlane( m_offsetToTotalDomain );
-            offsetToCenterOfPlane[ planeNormalDir ] = 0; // do not shift origin of plane normal
-            floatD_X const pos = precisionCast< float_X >( localCell + offsetToCenterOfPlane ) * cellSize.shrink< simDim >();
-            // @todo add half-cells via traits::FieldPosition< Solver::NumicalCellType, FieldE >()
-
-            // transversal position only
-            floatD_X planeNoNormal = floatD_X::create( 1.0_X );
-            planeNoNormal[ planeNormalDir ] = 0.0_X;
-            float_X const r2 = math::abs2( pos * planeNoNormal );
-
-            // calculate focus position relative to the laser initialization plane
-            float_X const focusPos = Unitless::FOCUS_POS - pos.y();
-
-            // rayleigh length (in y-direction)
-            float_X const y_R = float_X( PI ) * Unitless::W0 * Unitless::W0 / Unitless::WAVE_LENGTH;
-
-            // inverse radius of curvature of the beam's  wavefronts
-            float_X const R_y_inv = -focusPos / ( y_R * y_R + focusPos * focusPos );
-
-            // initialize temporary variables
-            float_X etrans( 0.0_X );
-            float_X etrans_norm( 0.0_X );
-            PMACC_CASSERT_MSG(
-                MODENUMBER_must_be_smaller_than_number_of_entries_in_LAGUERREMODES_vector,
-                Unitless::MODENUMBER < Unitless::LAGUERREMODES_t::dim
-            );
-            for( uint32_t m = 0 ; m <= Unitless::MODENUMBER ; ++m )
-                etrans_norm += typename Unitless::LAGUERREMODES_t{}[m];
-
-            // beam waist in the near field: w_y(y=0) == W0
-            float_X const w_y = Unitless::W0 * algorithms::math::sqrt( 1.0_X + ( focusPos / y_R )*( focusPos / y_R ) );
-            //! the Gouy phase shift
-            float_X const xi_y = algorithms::math::atan( -focusPos / y_R );
-
-            if( Unitless::Polarisation == Unitless::LINEAR_X || Unitless::Polarisation == Unitless::LINEAR_Z )
-            {
-                for( uint32_t m = 0 ; m <= Unitless::MODENUMBER ; ++m )
+                template<typename T_Params>
+                struct Unitless : public T_Params
                 {
-                    etrans += typename Unitless::LAGUERREMODES_t{}[m] * simpleLaguerre( m, 2.0_X * r2 / w_y / w_y )
-                        * math::exp( -r2 / w_y / w_y ) * math::cos( 2.0_X * float_X( PI ) / Unitless::WAVE_LENGTH * focusPos - 2.0_X * float_X( PI ) / Unitless::WAVE_LENGTH * r2 / 2.0_X * R_y_inv + ( 2._X * float_X( m ) + 1._X ) * xi_y + m_phase )
-                        * math::exp( -( r2 / 2.0_X * R_y_inv - focusPos - m_phase / 2.0_X / float_X( PI ) * Unitless::WAVE_LENGTH )
-                              *( r2 / 2.0_X * R_y_inv - focusPos - m_phase / 2.0_X / float_X( PI ) * Unitless::WAVE_LENGTH )
-                              / SPEED_OF_LIGHT / SPEED_OF_LIGHT / ( 2.0_X * Unitless::PULSE_LENGTH ) / ( 2.0_X * Unitless::PULSE_LENGTH ) );
-                }
-                m_elong *= etrans / etrans_norm;
-            }
-            else if( Unitless::Polarisation == Unitless::CIRCULAR )
+                    using Params = T_Params;
+
+                    static constexpr float_X WAVE_LENGTH
+                        = float_X(Params::WAVE_LENGTH_SI / UNIT_LENGTH); // unit: meter
+                    static constexpr float_X PULSE_LENGTH
+                        = float_X(Params::PULSE_LENGTH_SI / UNIT_TIME); // unit: seconds (1 sigma)
+                    static constexpr float_X AMPLITUDE
+                        = float_X(Params::AMPLITUDE_SI / UNIT_EFIELD); // unit: Volt /meter
+                    static constexpr float_X W0 = float_X(Params::W0_SI / UNIT_LENGTH); // unit: meter
+                    static constexpr float_X FOCUS_POS = float_X(Params::FOCUS_POS_SI / UNIT_LENGTH); // unit: meter
+                    static constexpr float_X INIT_TIME = float_X(
+                        (Params::PULSE_INIT * Params::PULSE_LENGTH_SI)
+                        / UNIT_TIME); // unit: seconds (full initialization length)
+
+                    /* initialize the laser not in the first cell is equal to a negative shift
+                     * in time
+                     */
+                    static constexpr float_X laserTimeShift = Params::initPlaneY * CELL_HEIGHT / SPEED_OF_LIGHT;
+
+                    static constexpr float_64 f = SPEED_OF_LIGHT / WAVE_LENGTH;
+                };
+            } // namespace gaussianBeam
+
+            namespace acc
             {
-                for( uint32_t m = 0 ; m <= Unitless::MODENUMBER ; ++m )
+                template<typename T_Unitless>
+                struct GaussianBeam : public T_Unitless
                 {
-                    etrans += typename Unitless::LAGUERREMODES_t{}[m] * simpleLaguerre( m, 2.0_X * r2 / w_y / w_y )
-                        * math::exp( -r2 / w_y / w_y ) * math::cos( 2.0_X * float_X( PI ) / Unitless::WAVE_LENGTH * focusPos - 2.0_X * float_X( PI ) / Unitless::WAVE_LENGTH * r2 / 2.0_X * R_y_inv + ( 2._X * float_X( m ) + 1._X ) * xi_y + m_phase )
-                        * math::exp( -( r2 / 2.0_X * R_y_inv - focusPos - m_phase / 2.0_X / float_X( PI ) * Unitless::WAVE_LENGTH )
-                              *( r2 / 2.0_X * R_y_inv - focusPos - m_phase / 2.0_X / float_X( PI ) * Unitless::WAVE_LENGTH )
-                              / SPEED_OF_LIGHT / SPEED_OF_LIGHT / ( 2.0_X * Unitless::PULSE_LENGTH ) / ( 2.0_X * Unitless::PULSE_LENGTH ) );
-                }
-                m_elong.x() *= etrans / etrans_norm;
-                m_phase += float_X( PI / 2.0 );
-                etrans = 0.0_X;
-                for( uint32_t m = 0 ; m <= Unitless::MODENUMBER ; ++m )
+                    using Unitless = T_Unitless;
+
+                    float3_X m_elong;
+                    float_X m_phase;
+                    typename FieldE::DataBoxType m_dataBoxE;
+                    DataSpace<simDim> m_offsetToTotalDomain;
+                    DataSpace<simDim> m_superCellToLocalOriginCellOffset;
+
+                    /** Simple iteration algorithm to implement Laguerre polynomials for GPUs.
+                     *
+                     *  @param n order of the Laguerre polynomial
+                     *  @param x coordinate at which the polynomial is evaluated
+                     *  @return value of the Laguerre polynomial of order n at x
+                     */
+                    HDINLINE float_X simpleLaguerre(const uint32_t n, const float_X x)
+                    {
+                        // Result for special case n == 0
+                        if(n == 0)
+                            return 1.0_X;
+                        uint32_t currentN = 1;
+                        float_X laguerreNMinus1 = 1.0_X;
+                        float_X laguerreN = 1.0_X - x;
+                        float_X laguerreNPlus1(0.0_X);
+                        while(currentN < n)
+                        {
+                            // Core statement of the algorithm
+                            laguerreNPlus1 = ((2.0_X * float_X(currentN) + 1.0_X - x) * laguerreN
+                                              - float_X(currentN) * laguerreNMinus1)
+                                / float_X(currentN + 1u);
+                            // Advance by one order
+                            laguerreNMinus1 = laguerreN;
+                            laguerreN = laguerreNPlus1;
+                            currentN++;
+                        }
+                        return laguerreN;
+                    }
+
+                    /** Device-Side Constructor
+                     *
+                     * @param superCellToLocalOriginCellOffset local offset in cells to current supercell
+                     * @param offsetToTotalDomain offset to origin of global (@todo: total) coordinate system (possibly
+                     * after transform to centered origin)
+                     */
+                    HDINLINE GaussianBeam(
+                        typename FieldE::DataBoxType const& dataBoxE,
+                        DataSpace<simDim> const& superCellToLocalOriginCellOffset,
+                        DataSpace<simDim> const& offsetToTotalDomain,
+                        float3_X const& elong,
+                        float_X const phase)
+                        : m_elong(elong)
+                        , m_phase(phase)
+                        , m_dataBoxE(dataBoxE)
+                        , m_offsetToTotalDomain(offsetToTotalDomain)
+                        , m_superCellToLocalOriginCellOffset(superCellToLocalOriginCellOffset)
+                    {
+                    }
+
+                    /** device side manipulation for init plane (transversal)
+                     *
+                     * @tparam T_Acc alpaka accelerator type
+                     *
+                     * @param cellIndexInSuperCell ND cell index in current supercell
+                     */
+                    template<typename T_Acc>
+                    HDINLINE void operator()(T_Acc const&, DataSpace<simDim> const& cellIndexInSuperCell)
+                    {
+                        // coordinate system to global simulation as origin
+                        DataSpace<simDim> const localCell(cellIndexInSuperCell + m_superCellToLocalOriginCellOffset);
+
+                        // transform coordinate system to center of x-z plane of initialization
+                        constexpr uint8_t planeNormalDir = 1u;
+                        DataSpace<simDim> offsetToCenterOfPlane(m_offsetToTotalDomain);
+                        offsetToCenterOfPlane[planeNormalDir] = 0; // do not shift origin of plane normal
+                        floatD_X const pos
+                            = precisionCast<float_X>(localCell + offsetToCenterOfPlane) * cellSize.shrink<simDim>();
+                        // @todo add half-cells via traits::FieldPosition< Solver::NumicalCellType, FieldE >()
+
+                        // transversal position only
+                        floatD_X planeNoNormal = floatD_X::create(1.0_X);
+                        planeNoNormal[planeNormalDir] = 0.0_X;
+                        float_X const r2 = pmacc::math::abs2(pos * planeNoNormal);
+
+                        // calculate focus position relative to the laser initialization plane
+                        float_X const focusPos = Unitless::FOCUS_POS - pos.y();
+
+                        // rayleigh length (in y-direction)
+                        float_X const y_R = float_X(PI) * Unitless::W0 * Unitless::W0 / Unitless::WAVE_LENGTH;
+
+                        // inverse radius of curvature of the beam's wavefronts
+                        float_X const R_y_inv = -focusPos / (y_R * y_R + focusPos * focusPos);
+
+                        // initialize temporary variables
+                        float_X etrans(0.0_X);
+                        float_X etrans_norm(0.0_X);
+                        PMACC_CASSERT_MSG(
+                            MODENUMBER_must_be_smaller_than_number_of_entries_in_LAGUERREMODES_vector,
+                            Unitless::MODENUMBER < Unitless::LAGUERREMODES_t::dim);
+                        for(uint32_t m = 0; m <= Unitless::MODENUMBER; ++m)
+                            etrans_norm += typename Unitless::LAGUERREMODES_t{}[m];
+
+                        // beam waist in the near field: w_y(y=0) == W0
+                        float_X const w_y = Unitless::W0 * math::sqrt(1.0_X + (focusPos / y_R) * (focusPos / y_R));
+                        //! the Gouy phase shift
+                        float_X const xi_y = math::atan(-focusPos / y_R);
+
+                        if(Unitless::Polarisation == Unitless::LINEAR_X
+                           || Unitless::Polarisation == Unitless::LINEAR_Z)
+                        {
+                            for(uint32_t m = 0; m <= Unitless::MODENUMBER; ++m)
+                            {
+                                etrans += typename Unitless::LAGUERREMODES_t{}[m]
+                                    * simpleLaguerre(m, 2.0_X * r2 / w_y / w_y) * math::exp(-r2 / w_y / w_y)
+                                    * math::cos(
+                                              2.0_X * float_X(PI) / Unitless::WAVE_LENGTH * focusPos
+                                              - 2.0_X * float_X(PI) / Unitless::WAVE_LENGTH * r2 / 2.0_X * R_y_inv
+                                              + (2._X * float_X(m) + 1._X) * xi_y + m_phase)
+                                    * math::exp(
+                                              -(r2 / 2.0_X * R_y_inv - focusPos
+                                                - m_phase / 2.0_X / float_X(PI) * Unitless::WAVE_LENGTH)
+                                              * (r2 / 2.0_X * R_y_inv - focusPos
+                                                 - m_phase / 2.0_X / float_X(PI) * Unitless::WAVE_LENGTH)
+                                              / SPEED_OF_LIGHT / SPEED_OF_LIGHT / (2.0_X * Unitless::PULSE_LENGTH)
+                                              / (2.0_X * Unitless::PULSE_LENGTH));
+                            }
+                            m_elong *= etrans / etrans_norm;
+                        }
+                        else if(Unitless::Polarisation == Unitless::CIRCULAR)
+                        {
+                            for(uint32_t m = 0; m <= Unitless::MODENUMBER; ++m)
+                            {
+                                etrans += typename Unitless::LAGUERREMODES_t{}[m]
+                                    * simpleLaguerre(m, 2.0_X * r2 / w_y / w_y) * math::exp(-r2 / w_y / w_y)
+                                    * math::cos(
+                                              2.0_X * float_X(PI) / Unitless::WAVE_LENGTH * focusPos
+                                              - 2.0_X * float_X(PI) / Unitless::WAVE_LENGTH * r2 / 2.0_X * R_y_inv
+                                              + (2._X * float_X(m) + 1._X) * xi_y + m_phase)
+                                    * math::exp(
+                                              -(r2 / 2.0_X * R_y_inv - focusPos
+                                                - m_phase / 2.0_X / float_X(PI) * Unitless::WAVE_LENGTH)
+                                              * (r2 / 2.0_X * R_y_inv - focusPos
+                                                 - m_phase / 2.0_X / float_X(PI) * Unitless::WAVE_LENGTH)
+                                              / SPEED_OF_LIGHT / SPEED_OF_LIGHT / (2.0_X * Unitless::PULSE_LENGTH)
+                                              / (2.0_X * Unitless::PULSE_LENGTH));
+                            }
+                            m_elong.x() *= etrans / etrans_norm;
+                            m_phase += float_X(PI / 2.0);
+                            etrans = 0.0_X;
+                            for(uint32_t m = 0; m <= Unitless::MODENUMBER; ++m)
+                            {
+                                etrans += typename Unitless::LAGUERREMODES_t{}[m]
+                                    * simpleLaguerre(m, 2.0_X * r2 / w_y / w_y) * math::exp(-r2 / w_y / w_y)
+                                    * math::cos(
+                                              2.0_X * float_X(PI) / Unitless::WAVE_LENGTH * focusPos
+                                              - 2.0_X * float_X(PI) / Unitless::WAVE_LENGTH * r2 / 2.0_X * R_y_inv
+                                              + (2._X * float_X(m) + 1._X) * xi_y + m_phase)
+                                    * math::exp(
+                                              -(r2 / 2.0_X * R_y_inv - focusPos
+                                                - m_phase / 2.0_X / float_X(PI) * Unitless::WAVE_LENGTH)
+                                              * (r2 / 2.0_X * R_y_inv - focusPos
+                                                 - m_phase / 2.0_X / float_X(PI) * Unitless::WAVE_LENGTH)
+                                              / SPEED_OF_LIGHT / SPEED_OF_LIGHT / (2.0_X * Unitless::PULSE_LENGTH)
+                                              / (2.0_X * Unitless::PULSE_LENGTH));
+                            }
+                            m_elong.z() *= etrans / etrans_norm;
+                            // reminder: if you want to use phase below, subtract pi/2
+                            // m_phase -= float_X( PI / 2.0 );
+                        }
+
+                        if(Unitless::initPlaneY != 0) // compile time if
+                        {
+                            /* If the laser is not initialized in the first cell we emit a
+                             * negatively and positively propagating wave. Therefore we need to multiply the
+                             * amplitude with a correction factor depending on the cell size in
+                             * propagation direction.
+                             * The negatively propagating wave is damped by the absorber.
+                             *
+                             * The `correctionFactor` assumes that the wave is moving in y direction.
+                             */
+                            auto const correctionFactor = (SPEED_OF_LIGHT * DELTA_T) / CELL_HEIGHT * 2._X;
+
+                            // jump over the guard of the electric field
+                            m_dataBoxE(localCell + SuperCellSize::toRT() * GuardSize::toRT())
+                                += correctionFactor * m_elong;
+                        }
+                        else
+                        {
+                            // jump over the guard of the electric field
+                            m_dataBoxE(localCell + SuperCellSize::toRT() * GuardSize::toRT()) = m_elong;
+                        }
+                    }
+                };
+            } // namespace acc
+
+            template<typename T_Params>
+            struct GaussianBeam : public gaussianBeam::Unitless<T_Params>
+            {
+                using Unitless = gaussianBeam::Unitless<T_Params>;
+
+                float3_X elong;
+                float_X phase;
+                typename FieldE::DataBoxType dataBoxE;
+                DataSpace<simDim> offsetToTotalDomain;
+
+                /** constructor
+                 *
+                 * @param currentStep current simulation time step
+                 */
+                HINLINE GaussianBeam(uint32_t currentStep)
                 {
-                    etrans += typename Unitless::LAGUERREMODES_t{}[m] * simpleLaguerre( m, 2.0_X * r2 / w_y / w_y )
-                        * math::exp( -r2 / w_y / w_y ) * math::cos( 2.0_X * float_X( PI ) / Unitless::WAVE_LENGTH * focusPos - 2.0_X * float_X( PI ) / Unitless::WAVE_LENGTH * r2 / 2.0_X * R_y_inv + ( 2._X * float_X( m ) + 1._X ) * xi_y + m_phase )
-                        * math::exp( -( r2 / 2.0_X * R_y_inv - focusPos - m_phase / 2.0_X / float_X( PI ) * Unitless::WAVE_LENGTH )
-                              *( r2 / 2.0_X * R_y_inv - focusPos - m_phase / 2.0_X / float_X( PI ) * Unitless::WAVE_LENGTH )
-                              / SPEED_OF_LIGHT / SPEED_OF_LIGHT / ( 2.0_X * Unitless::PULSE_LENGTH ) / ( 2.0_X * Unitless::PULSE_LENGTH ) );
+                    // get data
+                    DataConnector& dc = Environment<>::get().DataConnector();
+                    dataBoxE = dc.get<FieldE>(FieldE::getName(), true)->getDeviceDataBox();
+
+                    // get meta data for offsets
+                    SubGrid<simDim> const& subGrid = Environment<simDim>::get().SubGrid();
+                    // const DataSpace< simDim > totalCellOffset( subGrid.getGlobalDomain().offset );
+                    DataSpace<simDim> const globalCellOffset(subGrid.getLocalDomain().offset);
+                    DataSpace<simDim> const halfSimSize(subGrid.getGlobalDomain().size / 2);
+
+                    // transform coordinate system to center of global simulation as origin [cells]
+                    offsetToTotalDomain = /* totalCellOffset + */ globalCellOffset - halfSimSize;
+
+                    // @todo reset origin of direction of moving window
+                    // offsetToTotalDomain.y() = 0
+
+                    float_64 const runTime = DELTA_T * currentStep - Unitless::laserTimeShift;
+
+                    // calculate focus position relative to the laser initialization plane
+                    float_X const focusPos = Unitless::FOCUS_POS - Unitless::initPlaneY * CELL_HEIGHT;
+
+                    elong = float3_X::create(0.0_X);
+
+                    // This check is done here on HOST, since std::numeric_limits<float_X>::epsilon() does not compile
+                    // on laserTransversal(), which is on DEVICE.
+                    float_X etrans_norm(0.0_X);
+
+                    PMACC_CASSERT_MSG(
+                        MODENUMBER_must_be_smaller_than_number_of_entries_in_LAGUERREMODES_vector,
+                        Unitless::MODENUMBER < Unitless::LAGUERREMODES_t::dim);
+                    for(uint32_t m = 0; m <= Unitless::MODENUMBER; ++m)
+                        etrans_norm += typename Unitless::LAGUERREMODES_t{}[m];
+                    PMACC_VERIFY_MSG(
+                        math::abs(etrans_norm) > std::numeric_limits<float_X>::epsilon(),
+                        "Sum of LAGUERREMODES can not be 0.");
+
+
+                    // a symmetric pulse will be initialized at position z=0 for
+                    // a time of PULSE_INIT * PULSE_LENGTH = INIT_TIME.
+                    // we shift the complete pulse for the half of this time to start with
+                    // the front of the laser pulse.
+                    constexpr float_64 mue = 0.5 * Unitless::INIT_TIME;
+
+                    // rayleigh length (in y-direction)
+                    constexpr float_64 y_R = PI * Unitless::W0 * Unitless::W0 / Unitless::WAVE_LENGTH;
+                    // gaussian beam waist in the nearfield: w_y(y=0) == W0
+                    float_64 const w_y = Unitless::W0 * math::sqrt(1.0 + (focusPos / y_R) * (focusPos / y_R));
+
+                    float_64 envelope = float_64(Unitless::AMPLITUDE);
+                    if(simDim == DIM2)
+                        envelope *= math::sqrt(float_64(Unitless::W0) / w_y);
+                    else if(simDim == DIM3)
+                        envelope *= float_64(Unitless::W0) / w_y;
+                    /* no 1D representation/implementation */
+
+                    if(Unitless::Polarisation == Unitless::LINEAR_X)
+                    {
+                        elong.x() = float_X(envelope);
+                    }
+                    else if(Unitless::Polarisation == Unitless::LINEAR_Z)
+                    {
+                        elong.z() = float_X(envelope);
+                    }
+                    else if(Unitless::Polarisation == Unitless::CIRCULAR)
+                    {
+                        elong.x() = float_X(envelope) / math::sqrt(2.0_X);
+                        elong.z() = float_X(envelope) / math::sqrt(2.0_X);
+                    }
+
+                    phase = 2.0_X * float_X(PI) * float_X(Unitless::f)
+                            * (runTime - float_X(mue) - focusPos / SPEED_OF_LIGHT)
+                        + Unitless::LASER_PHASE;
                 }
-                m_elong.z() *= etrans / etrans_norm;
-                // reminder: if you want to use phase below, substract pi/2
-                // m_phase -= float_X( PI / 2.0 );
-            }
 
-            if( Unitless::initPlaneY != 0 ) // compile time if
-            {
-                /* If the laser is not initialized in the first cell we emit a
-                 * negatively and positively propagating wave. Therefore we need to multiply the
-                 * amplitude with a correction factor depending of the cell size in
-                 * propagation direction.
-                 * The negatively propagating wave is damped by the absorber.
+                /** create device manipulator functor
                  *
-                 * The `correctionFactor` assume that the wave is moving in y direction.
+                 * @tparam T_WorkerCfg pmacc::mappings::threads::WorkerCfg, configuration of the worker
+                 * @tparam T_Acc alpaka accelerator type
+                 *
+                 * @param alpaka accelerator
+                 * @param localSupercellOffset (in supercells, without guards) to the
+                 *        origin of the local domain
+                 * @param configuration of the worker
                  */
-                auto const correctionFactor = ( SPEED_OF_LIGHT * DELTA_T ) / CELL_HEIGHT * 2._X;
-
-                // jump over the guard of the electric field
-                m_dataBoxE( localCell + SuperCellSize::toRT() * GuardSize::toRT() ) +=  correctionFactor * m_elong;
-            }
-            else
-            {
-                // jump over the guard of the electric field
-                m_dataBoxE( localCell + SuperCellSize::toRT() * GuardSize::toRT() ) = m_elong;
-            }
-        }
-    };
-} // namespace acc
-
-    template< typename T_Params >
-    struct GaussianBeam : public gaussianBeam::Unitless< T_Params >
-    {
-        using Unitless = gaussianBeam::Unitless< T_Params >;
-
-        float3_X elong;
-        float_X phase;
-        typename FieldE::DataBoxType dataBoxE;
-        DataSpace< simDim > offsetToTotalDomain;
-
-        /** constructor
-         *
-         * @param currentStep current simulation time step
-         */
-        HINLINE GaussianBeam( uint32_t currentStep )
-        {
-            // get data
-            DataConnector & dc = Environment< >::get( ).DataConnector( );
-            dataBoxE = dc.get< FieldE >(
-                FieldE::getName(),
-                true
-            )->getDeviceDataBox();
-
-            // get meta data for offsets
-            SubGrid< simDim > const & subGrid = Environment< simDim >::get().SubGrid();
-            // const DataSpace< simDim > totalCellOffset( subGrid.getGlobalDomain().offset );
-            DataSpace< simDim > const globalCellOffset( subGrid.getLocalDomain().offset );
-            DataSpace< simDim > const halfSimSize( subGrid.getGlobalDomain().size / 2 );
-
-            // transform coordinate system to center of global simulation as origin [cells]
-            offsetToTotalDomain = /* totalCellOffset + */ globalCellOffset - halfSimSize;
-
-            // @todo reset origin of direction of moving window
-            // offsetToTotalDomain.y() = 0
-
-            float_64 const runTime = DELTA_T * currentStep - Unitless::laserTimeShift;
-
-            // calculate focus position relative to the laser initialization plane
-            float_X const focusPos = Unitless::FOCUS_POS - Unitless::initPlaneY * CELL_HEIGHT;
-
-            elong = float3_X::create( 0.0_X );
-
-            // This check is done here on HOST, since std::numeric_limits<float_X>::epsilon() does not compile on laserTransversal(), which is on DEVICE.
-            float_X etrans_norm( 0.0_X );
-
-            PMACC_CASSERT_MSG(
-                MODENUMBER_must_be_smaller_than_number_of_entries_in_LAGUERREMODES_vector,
-                Unitless::MODENUMBER < Unitless::LAGUERREMODES_t::dim
-            );
-            for( uint32_t m = 0 ; m <= Unitless::MODENUMBER ; ++m )
-                etrans_norm += typename Unitless::LAGUERREMODES_t{}[m];
-            PMACC_VERIFY_MSG(
-                algorithms::math::abs( etrans_norm ) > std::numeric_limits< float_X >::epsilon(),
-                "Sum of LAGUERREMODES can not be 0."
-            );
-
-
-            // a symmetric pulse will be initialized at position z=0 for
-            // a time of PULSE_INIT * PULSE_LENGTH = INIT_TIME.
-            // we shift the complete pulse for the half of this time to start with
-            // the front of the laser pulse.
-            constexpr float_64 mue = 0.5 * Unitless::INIT_TIME;
-
-            // rayleigh length (in y-direction)
-            constexpr float_64 y_R = PI * Unitless::W0 * Unitless::W0 / Unitless::WAVE_LENGTH;
-            // gaussian beam waist in the nearfield: w_y(y=0) == W0
-            float_64 const w_y = Unitless::W0 * math::sqrt( 1.0 + ( focusPos / y_R )*( focusPos / y_R ) );
-
-            float_64 envelope = float_64( Unitless::AMPLITUDE );
-            if( simDim == DIM2 )
-                envelope *= math::sqrt( float_64( Unitless::W0 ) / w_y );
-            else if( simDim == DIM3 )
-                envelope *= float_64( Unitless::W0 ) / w_y;
-            /* no 1D representation/implementation */
-
-            if( Unitless::Polarisation == Unitless::LINEAR_X )
-            {
-                elong.x() = float_X( envelope );
-            }
-            else if( Unitless::Polarisation == Unitless::LINEAR_Z )
-            {
-                elong.z() = float_X( envelope );
-            }
-            else if( Unitless::Polarisation == Unitless::CIRCULAR )
-            {
-                elong.x() = float_X( envelope ) / math::sqrt( 2.0_X );
-                elong.z() = float_X( envelope ) / math::sqrt( 2.0_X );
-            }
-
-            phase = 2.0_X * float_X( PI ) * float_X( Unitless::f ) * ( runTime - float_X( mue ) - focusPos / SPEED_OF_LIGHT ) + Unitless::LASER_PHASE;
-        }
-
-        /** create device manipulator functor
-         *
-         * @tparam T_WorkerCfg pmacc::mappings::threads::WorkerCfg, configuration of the worker
-         * @tparam T_Acc alpaka accelerator type
-         *
-         * @param alpaka accelerator
-         * @param localSupercellOffset (in supercells, without guards) to the
-         *        origin of the local domain
-         * @param configuration of the worker
-         */
-        template<
-            typename T_WorkerCfg,
-            typename T_Acc
-        >
-        HDINLINE acc::GaussianBeam< Unitless >
-        operator()(
-            T_Acc const &,
-            DataSpace< simDim > const & localSupercellOffset,
-            T_WorkerCfg const &
-        ) const
-        {
-            auto const superCellToLocalOriginCellOffset = localSupercellOffset * SuperCellSize::toRT();
-            return acc::GaussianBeam< Unitless >( dataBoxE, superCellToLocalOriginCellOffset, offsetToTotalDomain, elong, phase );
-        }
-
-        //! get the name of the laser profile
-        static
-        HINLINE std::string
-        getName( )
-        {
-            return "GaussianBeam";
-        }
+                template<typename T_WorkerCfg, typename T_Acc>
+                HDINLINE acc::GaussianBeam<Unitless> operator()(
+                    T_Acc const&,
+                    DataSpace<simDim> const& localSupercellOffset,
+                    T_WorkerCfg const&) const
+                {
+                    auto const superCellToLocalOriginCellOffset = localSupercellOffset * SuperCellSize::toRT();
+                    return acc::GaussianBeam<Unitless>(
+                        dataBoxE,
+                        superCellToLocalOriginCellOffset,
+                        offsetToTotalDomain,
+                        elong,
+                        phase);
+                }
 
-    };
+                //! get the name of the laser profile
+                static HINLINE std::string getName()
+                {
+                    return "GaussianBeam";
+                }
+            };
 
-} // namespace laserProfiles
-} // namespace fields
+        } // namespace laserProfiles
+    } // namespace fields
 } // namespace picongpu
-
diff --git a/include/picongpu/fields/laserProfiles/None.def b/include/picongpu/fields/laserProfiles/None.def
index 6674f57e03..42fcc3fd46 100644
--- a/include/picongpu/fields/laserProfiles/None.def
+++ b/include/picongpu/fields/laserProfiles/None.def
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl
+/* Copyright 2013-2021 Axel Huebl
  *
  * This file is part of PIConGPU.
  *
@@ -24,43 +24,43 @@
 
 namespace picongpu
 {
-namespace fields
-{
-namespace laserProfiles
-{
-namespace none
-{
-namespace defaults
-{
-    struct NoneParam
+    namespace fields
     {
-        /** unit: meter */
-        static constexpr float_64 WAVE_LENGTH_SI = 0.0;
+        namespace laserProfiles
+        {
+            namespace none
+            {
+                namespace defaults
+                {
+                    struct NoneParam
+                    {
+                        /** unit: meter */
+                        static constexpr float_64 WAVE_LENGTH_SI = 0.0;
 
-        /** unit: Volt / meter */
-        static constexpr float_64 AMPLITUDE_SI = 0.0;
+                        /** unit: Volt / meter */
+                        static constexpr float_64 AMPLITUDE_SI = 0.0;
 
-        /** unit: s */
-        static constexpr float_64 PULSE_LENGTH_SI = 0.0;
+                        /** unit: s */
+                        static constexpr float_64 PULSE_LENGTH_SI = 0.0;
 
-        /** unit: cells */
-        static constexpr uint32_t initPlaneY = 0u;
-    };
-} // namespace defaults
-} // namespace none
+                        /** unit: cells */
+                        static constexpr uint32_t initPlaneY = 0u;
+                    };
+                } // namespace defaults
+            } // namespace none
 
-    /** Empty laser profile
-     *
-     * Does not define a laser profile but provides some hard-coded constants
-     * that are accessed directly in some places.
-     *
-     * @tparam T_Params class parameter to configure the "no laser" profile,
-     *                  see members of none::defaults::NoneParam for required
-     *                  members
-     */
-    template< typename T_Params = none::defaults::NoneParam >
-    struct None;
+            /** Empty laser profile
+             *
+             * Does not define a laser profile but provides some hard-coded constants
+             * that are accessed directly in some places.
+             *
+             * @tparam T_Params class parameter to configure the "no laser" profile,
+             *                  see members of none::defaults::NoneParam for required
+             *                  members
+             */
+            template<typename T_Params = none::defaults::NoneParam>
+            struct None;
 
-} // namespace laserProfiles
-} // namespace fields
+        } // namespace laserProfiles
+    } // namespace fields
 } // namespace picongpu
diff --git a/include/picongpu/fields/laserProfiles/None.hpp b/include/picongpu/fields/laserProfiles/None.hpp
index 46e3ffa1e7..e3f66bb953 100644
--- a/include/picongpu/fields/laserProfiles/None.hpp
+++ b/include/picongpu/fields/laserProfiles/None.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Heiko Burau, Rene Widera
+/* Copyright 2013-2021 Axel Huebl, Heiko Burau, Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -27,92 +27,80 @@
 
 namespace picongpu
 {
-namespace fields
-{
-namespace laserProfiles
-{
-namespace none
-{
-    template< typename T_Params >
-    struct Unitless : public T_Params
+    namespace fields
     {
-        using Params = T_Params;
-
-        static constexpr float_X WAVE_LENGTH = float_X( Params::WAVE_LENGTH_SI / UNIT_LENGTH ); // unit: meter
-        static constexpr float_X PULSE_LENGTH = float_X( Params::PULSE_LENGTH_SI / UNIT_TIME ); // unit: seconds (1 sigma)
-        static constexpr float_X AMPLITUDE = float_X( Params::AMPLITUDE_SI / UNIT_EFIELD ); // unit: Volt /meter
-        static constexpr float_X INIT_TIME = 0.0_X; // unit: seconds (no initialization time)
-    };
-} // namespace none
-namespace acc
-{
-    template< typename T_Unitless >
-    struct None : public T_Unitless
-    {
-        using Unitless = T_Unitless;
-
-        /** Device-Side Constructor
-         */
-        HDINLINE None()
+        namespace laserProfiles
         {
-        }
+            namespace none
+            {
+                template<typename T_Params>
+                struct Unitless : public T_Params
+                {
+                    using Params = T_Params;
 
-        /** device side manipulation for init plane (transversal)
-         *
-         * @tparam T_Args type of the arguments passed to the user manipulator functor
-         */
-        template< typename T_Acc >
-        HDINLINE
-        void operator( )(
-            T_Acc const &,
-            DataSpace< simDim > const &
-        )
-        {
-        }
-    };
-} // namespace acc
+                    static constexpr float_X WAVE_LENGTH
+                        = float_X(Params::WAVE_LENGTH_SI / UNIT_LENGTH); // unit: meter
+                    static constexpr float_X PULSE_LENGTH
+                        = float_X(Params::PULSE_LENGTH_SI / UNIT_TIME); // unit: seconds (1 sigma)
+                    static constexpr float_X AMPLITUDE
+                        = float_X(Params::AMPLITUDE_SI / UNIT_EFIELD); // unit: Volt /meter
+                    static constexpr float_X INIT_TIME = 0.0_X; // unit: seconds (no initialization time)
+                };
+            } // namespace none
+            namespace acc
+            {
+                template<typename T_Unitless>
+                struct None : public T_Unitless
+                {
+                    using Unitless = T_Unitless;
 
-    template< typename T_Params >
-    struct None : public none::Unitless< T_Params >
-    {
-        using Unitless = none::Unitless< T_Params >;
+                    /** Device-Side Constructor
+                     */
+                    HDINLINE None()
+                    {
+                    }
 
-        /** constructor
-         */
-        HINLINE None( uint32_t )
-        {
-        }
+                    /** device side manipulation for init plane (transversal)
+                     *
+                     * @tparam T_Acc alpaka accelerator type
+                     */
+                    template<typename T_Acc>
+                    HDINLINE void operator()(T_Acc const&, DataSpace<simDim> const&)
+                    {
+                    }
+                };
+            } // namespace acc
 
-        /** create device manipulator functor
-         *
-         * @tparam T_WorkerCfg pmacc::mappings::threads::WorkerCfg, configuration of the worker
-         * @tparam T_Acc alpaka accelerator type
-         */
-        template<
-            typename T_WorkerCfg,
-            typename T_Acc
-        >
-        HDINLINE acc::None< Unitless >
-        operator()(
-            T_Acc const &,
-            DataSpace< simDim > const &,
-            T_WorkerCfg const &
-        ) const
-        {
-            return acc::None< Unitless >( );
-        }
+            template<typename T_Params>
+            struct None : public none::Unitless<T_Params>
+            {
+                using Unitless = none::Unitless<T_Params>;
 
-        //! get the name of the laser profile
-        static
-        HINLINE std::string
-        getName( )
-        {
-            return "None";
-        }
+                /** constructor
+                 */
+                HINLINE None(uint32_t)
+                {
+                }
 
-    };
+                /** create device manipulator functor
+                 *
+                 * @tparam T_WorkerCfg pmacc::mappings::threads::WorkerCfg, configuration of the worker
+                 * @tparam T_Acc alpaka accelerator type
+                 */
+                template<typename T_WorkerCfg, typename T_Acc>
+                HDINLINE acc::None<Unitless> operator()(T_Acc const&, DataSpace<simDim> const&, T_WorkerCfg const&)
+                    const
+                {
+                    return acc::None<Unitless>();
+                }
 
-} // namespace laserProfiles
-} // namespace fields
-} // namespace picongpu
+                //! get the name of the laser profile
+                static HINLINE std::string getName()
+                {
+                    return "None";
+                }
+            };
 
+        } // namespace laserProfiles
+    } // namespace fields
+} // namespace picongpu
diff --git a/include/picongpu/fields/laserProfiles/PlaneWave.def b/include/picongpu/fields/laserProfiles/PlaneWave.def
index 38c6511beb..a73b137842 100644
--- a/include/picongpu/fields/laserProfiles/PlaneWave.def
+++ b/include/picongpu/fields/laserProfiles/PlaneWave.def
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl
+/* Copyright 2013-2021 Axel Huebl
  *
  * This file is part of PIConGPU.
  *
@@ -24,96 +24,98 @@
 
 namespace picongpu
 {
-namespace fields
-{
-namespace laserProfiles
-{
-namespace planeWave
-{
-namespace defaults
-{
-    struct PlaneWaveParam
+    namespace fields
     {
-        /** unit: meter */
-        static constexpr float_64 WAVE_LENGTH_SI = 0.8e-6;
-
-        /** Convert the normalized laser strength parameter a0 to Volt per meter */
-        static constexpr float_64 UNITCONV_A0_to_Amplitude_SI = -2.0 * PI / WAVE_LENGTH_SI * ::picongpu::SI::ELECTRON_MASS_SI * ::picongpu::SI::SPEED_OF_LIGHT_SI * ::picongpu::SI::SPEED_OF_LIGHT_SI / ::picongpu::SI::ELECTRON_CHARGE_SI;
-
-        /** unit: W / m^2 */
-        // calculate: _A0 = 8.549297e-6 * sqrt( Intensity[W/m^2] ) * wavelength[m] (linearly polarized)
-
-        /** unit: none */
-        static constexpr float_64 _A0  = 1.5;
-
-        /** unit: Volt / meter */
-        static constexpr float_64 AMPLITUDE_SI = _A0 * UNITCONV_A0_to_Amplitude_SI;
-
-        /** unit: Volt / meter */
-        //static constexpr float_64 AMPLITUDE_SI = 1.738e13;
-
-        /** Stretch temporal profile by a constant plateau between the up and downramp
-         *  unit: seconds */
-        static constexpr float_64 LASER_NOFOCUS_CONSTANT_SI = 13.34e-15;
-
-        /** Pulse length: sigma of std. gauss for intensity (E^2)
-         *  PULSE_LENGTH_SI = FWHM_of_Intensity   / [ 2*sqrt{ 2* ln(2) } ]
-         *                                          [    2.354820045     ]
-         *  Info:             FWHM_of_Intensity = FWHM_Illumination
-         *                      = what a experimentalist calls "pulse duration"
-         *  unit: seconds (1 sigma) */
-        static constexpr float_64 PULSE_LENGTH_SI = 10.615e-15 / 4.0;
-
-        /** cell from top where the laser is initialized
-         *
-         * if `initPlaneY == 0` than the absorber are disabled.
-         * if `initPlaneY > absorbercells negative Y` the negative absorber in y
-         * direction is enabled
-         *
-         * valid ranges:
-         *   - initPlaneY == 0
-         *   - absorber cells negative Y < initPlaneY < cells in y direction of the top gpu
-         */
-        static constexpr uint32_t initPlaneY = 0;
-
-        /** The laser pulse will be initialized half of PULSE_INIT times of the PULSE_LENGTH before and after the plateau
-         *  unit: none */
-        static constexpr float_64 RAMP_INIT = 20.6146;
-
-        /** laser phase shift (no shift: 0.0)
-         *
-         * sin(omega*time + laser_phase): starts with phase=0 at center --> E-field=0 at center
-         *
-         * unit: rad, periodic in 2*pi
-         */
-        static constexpr float_X LASER_PHASE = 0.0;
-
-        /** Available polarization types
-         */
-        enum PolarisationType
+        namespace laserProfiles
         {
-            LINEAR_X = 1u,
-            LINEAR_Z = 2u,
-            CIRCULAR = 4u,
-        };
-        /** Polarization selection
-         */
-        static constexpr PolarisationType Polarisation = LINEAR_X;
-    };
-} // namespace defaults
-} // namespace planeWave
-
-    /** Plane wave laser profile
-     *
-     * Defines a plane wave with temporally Gaussian envelope.
-     *
-     * @tparam T_Params class parameter to configure the plane wave profile,
-     *                  see members of planeWave::defaults::PlaneWaveParam for
-     *                  required members
-     */
-    template< typename T_Params = planeWave::defaults::PlaneWaveParam >
-    struct PlaneWave;
-
-} // namespace laserProfiles
-} // namespace fields
+            namespace planeWave
+            {
+                namespace defaults
+                {
+                    struct PlaneWaveParam
+                    {
+                        /** unit: meter */
+                        static constexpr float_64 WAVE_LENGTH_SI = 0.8e-6;
+
+                        /** Convert the normalized laser strength parameter a0 to Volt per meter */
+                        static constexpr float_64 UNITCONV_A0_to_Amplitude_SI = -2.0 * PI / WAVE_LENGTH_SI
+                            * ::picongpu::SI::ELECTRON_MASS_SI * ::picongpu::SI::SPEED_OF_LIGHT_SI
+                            * ::picongpu::SI::SPEED_OF_LIGHT_SI / ::picongpu::SI::ELECTRON_CHARGE_SI;
+
+                        /** unit: W / m^2 */
+                        // calculate: _A0 = 8.549297e-6 * sqrt( Intensity[W/m^2] ) * wavelength[m] (linearly polarized)
+
+                        /** unit: none */
+                        static constexpr float_64 _A0 = 1.5;
+
+                        /** unit: Volt / meter */
+                        static constexpr float_64 AMPLITUDE_SI = _A0 * UNITCONV_A0_to_Amplitude_SI;
+
+                        /** unit: Volt / meter */
+                        // static constexpr float_64 AMPLITUDE_SI = 1.738e13;
+
+                        /** Stretch temporal profile by a constant plateau between the up and downramp
+                         *  unit: seconds */
+                        static constexpr float_64 LASER_NOFOCUS_CONSTANT_SI = 13.34e-15;
+
+                        /** Pulse length: sigma of std. gauss for intensity (E^2)
+                         *  PULSE_LENGTH_SI = FWHM_of_Intensity   / [ 2*sqrt{ 2* ln(2) } ]
+                         *                                          [    2.354820045     ]
+                         *  Info:             FWHM_of_Intensity = FWHM_Illumination
+                         *                      = what an experimentalist calls "pulse duration"
+                         *  unit: seconds (1 sigma) */
+                        static constexpr float_64 PULSE_LENGTH_SI = 10.615e-15 / 4.0;
+
+                        /** cell from top where the laser is initialized
+                         *
+                         * if `initPlaneY == 0` then the absorbers are disabled.
+                         * if `initPlaneY > absorbercells negative Y` the negative absorber in y
+                         * direction is enabled
+                         *
+                         * valid ranges:
+                         *   - initPlaneY == 0
+                         *   - absorber cells negative Y < initPlaneY < cells in y direction of the top gpu
+                         */
+                        static constexpr uint32_t initPlaneY = 0;
+
+                        /** The laser pulse will be initialized half of RAMP_INIT times the PULSE_LENGTH before and
+                         * after the plateau; unit: none */
+                        static constexpr float_64 RAMP_INIT = 20.6146;
+
+                        /** laser phase shift (no shift: 0.0)
+                         *
+                         * sin(omega*time + laser_phase): starts with phase=0 at center --> E-field=0 at center
+                         *
+                         * unit: rad, periodic in 2*pi
+                         */
+                        static constexpr float_X LASER_PHASE = 0.0;
+
+                        /** Available polarization types
+                         */
+                        enum PolarisationType
+                        {
+                            LINEAR_X = 1u,
+                            LINEAR_Z = 2u,
+                            CIRCULAR = 4u,
+                        };
+                        /** Polarization selection
+                         */
+                        static constexpr PolarisationType Polarisation = LINEAR_X;
+                    };
+                } // namespace defaults
+            } // namespace planeWave
+
+            /** Plane wave laser profile
+             *
+             * Defines a plane wave with temporally Gaussian envelope.
+             *
+             * @tparam T_Params class parameter to configure the plane wave profile,
+             *                  see members of planeWave::defaults::PlaneWaveParam for
+             *                  required members
+             */
+            template<typename T_Params = planeWave::defaults::PlaneWaveParam>
+            struct PlaneWave;
+
+        } // namespace laserProfiles
+    } // namespace fields
 } // namespace picongpu
diff --git a/include/picongpu/fields/laserProfiles/PlaneWave.hpp b/include/picongpu/fields/laserProfiles/PlaneWave.hpp
index 426ac8f4bc..8cabf92f5a 100644
--- a/include/picongpu/fields/laserProfiles/PlaneWave.hpp
+++ b/include/picongpu/fields/laserProfiles/PlaneWave.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Heiko Burau, Rene Widera, Richard Pausch
+/* Copyright 2013-2021 Axel Huebl, Heiko Burau, Rene Widera, Richard Pausch
  *
  * This file is part of PIConGPU.
  *
@@ -27,231 +27,225 @@
 
 namespace picongpu
 {
-namespace fields
-{
-namespace laserProfiles
-{
-namespace planeWave
-{
-    template< typename T_Params >
-    struct Unitless : public T_Params
-    {
-        using Params = T_Params;
-
-        static constexpr float_X WAVE_LENGTH = float_X( Params::WAVE_LENGTH_SI / UNIT_LENGTH ); // unit: meter
-        static constexpr float_X PULSE_LENGTH = float_X( Params::PULSE_LENGTH_SI / UNIT_TIME ); // unit: seconds (1 sigma)
-        static constexpr float_X LASER_NOFOCUS_CONSTANT = float_X( Params::LASER_NOFOCUS_CONSTANT_SI / UNIT_TIME ); // unit: seconds
-        static constexpr float_X AMPLITUDE = float_X( Params::AMPLITUDE_SI / UNIT_EFIELD ); // unit: Volt /meter
-        static constexpr float_X INIT_TIME = float_X( ( Params::RAMP_INIT * Params::PULSE_LENGTH_SI + Params::LASER_NOFOCUS_CONSTANT_SI ) / UNIT_TIME ); // unit: seconds (full inizialisation length)
-
-        /* initialize the laser not in the first cell is equal to a negative shift
-         * in time
-         */
-        static constexpr float_X laserTimeShift = Params::initPlaneY * CELL_HEIGHT / SPEED_OF_LIGHT;
-
-        static constexpr float_64 f = SPEED_OF_LIGHT / WAVE_LENGTH;
-
-    };
-} // namespace planeWave
-
-namespace acc
-{
-    template< typename T_Unitless >
-    struct PlaneWave : public T_Unitless
+    namespace fields
     {
-        using Unitless = T_Unitless;
-
-        float3_X m_elong;
-        typename FieldE::DataBoxType m_dataBoxE;
-        DataSpace< simDim > m_offsetToTotalDomain;
-        DataSpace< simDim > m_superCellToLocalOriginCellOffset;
-
-        /** Device-Side Constructor
-         *
-         * @param superCellToLocalOriginCellOffset local offset in cells to current supercell
-         * @param offsetToTotalDomain offset to origin of global (@todo: total) coordinate system (possibly after transform to centered origin)
-         */
-        HDINLINE PlaneWave(
-            typename FieldE::DataBoxType const & dataBoxE,
-            DataSpace< simDim > const & superCellToLocalOriginCellOffset,
-            DataSpace< simDim > const & offsetToTotalDomain,
-            float3_X const & elong
-        ) :
-            m_elong( elong ),
-            m_dataBoxE( dataBoxE ),
-            m_offsetToTotalDomain( offsetToTotalDomain ),
-            m_superCellToLocalOriginCellOffset( superCellToLocalOriginCellOffset )
+        namespace laserProfiles
         {
-        }
-
-        /** device side manipulation for init plane (transversal)
-         *
-         * @tparam T_Args type of the arguments passed to the user manipulator functor
-         *
-         * @param cellIndexInSuperCell ND cell index in current supercell
-         */
-        template< typename T_Acc >
-        HDINLINE
-        void operator( )(
-            T_Acc const &,
-            DataSpace< simDim > const & cellIndexInSuperCell
-        )
-        {
-            // coordinate system to global simulation as origin
-            DataSpace< simDim > const localCell(
-                cellIndexInSuperCell +
-                m_superCellToLocalOriginCellOffset
-            );
-
-            if( Unitless::initPlaneY != 0 ) // compile time if
-            {
-                /* If the laser is not initialized in the first cell we emit a
-                 * negatively and positively propagating wave. Therefore we need to multiply the
-                 * amplitude with a correction factor depending of the cell size in
-                 * propagation direction.
-                 * The negatively propagating wave is damped by the absorber.
-                 *
-                 * The `correctionFactor` assume that the wave is moving in y direction.
-                 */
-                auto const correctionFactor = ( SPEED_OF_LIGHT * DELTA_T ) / CELL_HEIGHT * 2._X;
-
-                // jump over the guard of the electric field
-                m_dataBoxE( localCell + SuperCellSize::toRT() * GuardSize::toRT() ) +=  correctionFactor * m_elong;
-            }
-            else
-            {
-                // jump over the guard of the electric field
-                m_dataBoxE( localCell + SuperCellSize::toRT() * GuardSize::toRT() ) = m_elong;
-            }
-        }
-    };
-} // namespace acc
-
-    template< typename T_Params >
-    struct PlaneWave : public planeWave::Unitless< T_Params >
-    {
-        using Unitless = planeWave::Unitless< T_Params >;
-
-        float3_X elong;
-        float_X phase;
-        typename FieldE::DataBoxType dataBoxE;
-        DataSpace< simDim > offsetToTotalDomain;
-
-        /** constructor
-         *
-         * @param currentStep current simulation time step
-         */
-        HINLINE PlaneWave( uint32_t currentStep ) :
-            phase( 0.0_X )
-        {
-            // get data
-            DataConnector & dc = Environment< >::get( ).DataConnector( );
-            dataBoxE = dc.get< FieldE >(
-                FieldE::getName(),
-                true
-            )->getDeviceDataBox();
-
-            // get meta data for offsets
-            SubGrid< simDim > const & subGrid = Environment< simDim >::get().SubGrid();
-            // const DataSpace< simDim > totalCellOffset( subGrid.getGlobalDomain().offset );
-            DataSpace< simDim > const globalCellOffset( subGrid.getLocalDomain().offset );
-            DataSpace< simDim > const halfSimSize( subGrid.getGlobalDomain().size / 2 );
-
-            // transform coordinate system to center of global simulation as origin [cells]
-            offsetToTotalDomain = /* totalCellOffset + */ globalCellOffset - halfSimSize;
-
-            // @todo reset origin of direction of moving window
-            // offsetToTotalDomain.y() = 0
-
-            float_64 const runTime = DELTA_T * currentStep - Unitless::laserTimeShift;
-
-            elong = float3_X::create( 0.0 );
-
-            float_64 envelope = float_64( Unitless::AMPLITUDE );
-
-            float_64 const mue = 0.5 * Unitless::RAMP_INIT * Unitless::PULSE_LENGTH;
-
-            float_64 const w = 2.0 * PI * Unitless::f;
-            float_64 const tau = Unitless::PULSE_LENGTH * math::sqrt( 2.0 );
-
-            float_64 const endUpramp = mue;
-            float_64 const startDownramp = mue + Unitless::LASER_NOFOCUS_CONSTANT;
-
-            float_64 integrationCorrectionFactor = 0.0;
-
-            if( runTime > startDownramp )
+            namespace planeWave
             {
-                // downramp = end
-                float_64 const exponent = ( runTime - startDownramp ) / tau;
-                envelope *= exp( -0.5 * exponent * exponent );
-                integrationCorrectionFactor = ( runTime - startDownramp )/ ( w * tau * tau );
-            }
-            else if( runTime < endUpramp )
+                template<typename T_Params>
+                struct Unitless : public T_Params
+                {
+                    using Params = T_Params;
+
+                    static constexpr float_X WAVE_LENGTH
+                        = float_X(Params::WAVE_LENGTH_SI / UNIT_LENGTH); // unit: meter
+                    static constexpr float_X PULSE_LENGTH
+                        = float_X(Params::PULSE_LENGTH_SI / UNIT_TIME); // unit: seconds (1 sigma)
+                    static constexpr float_X LASER_NOFOCUS_CONSTANT
+                        = float_X(Params::LASER_NOFOCUS_CONSTANT_SI / UNIT_TIME); // unit: seconds
+                    static constexpr float_X AMPLITUDE
+                        = float_X(Params::AMPLITUDE_SI / UNIT_EFIELD); // unit: Volt /meter
+                    static constexpr float_X INIT_TIME = float_X(
+                        (Params::RAMP_INIT * Params::PULSE_LENGTH_SI + Params::LASER_NOFOCUS_CONSTANT_SI)
+                        / UNIT_TIME); // unit: seconds (full initialization length)
+
+                    /* initializing the laser outside the first cell is equivalent to a negative shift
+                     * in time
+                     */
+                    static constexpr float_X laserTimeShift = Params::initPlaneY * CELL_HEIGHT / SPEED_OF_LIGHT;
+
+                    static constexpr float_64 f = SPEED_OF_LIGHT / WAVE_LENGTH;
+                };
+            } // namespace planeWave
+
+            namespace acc
             {
-                // upramp = start
-                float_64 const exponent = ( runTime - endUpramp ) / tau;
-                envelope *= exp( -0.5 * exponent * exponent );
-                integrationCorrectionFactor = ( runTime - endUpramp )/ ( w * tau * tau );
-            }
-
-            float_64 const timeOszi = runTime - endUpramp;
-            float_64 const t_and_phase = w * timeOszi + Unitless::LASER_PHASE;
-            // to understand both components [sin(...) + t/tau^2 * cos(...)] see description above
-            if( Unitless::Polarisation == Unitless::LINEAR_X )
+                template<typename T_Unitless>
+                struct PlaneWave : public T_Unitless
+                {
+                    using Unitless = T_Unitless;
+
+                    float3_X m_elong; // field value that operator() writes to each cell
+                    typename FieldE::DataBoxType m_dataBoxE; // electric field data box that operator() writes to
+                    DataSpace<simDim> m_offsetToTotalDomain; // only stored, not read by operator()
+                    DataSpace<simDim> m_superCellToLocalOriginCellOffset; // offset in cells to current supercell
+
+                    /** Device-Side Constructor, copies every argument into the member of the same name
+                     *
+                     * @param superCellToLocalOriginCellOffset local offset in cells to current supercell
+                     * @param offsetToTotalDomain offset to origin of global (@todo: total) coordinate system (possibly
+                     * after transform to centered origin)
+                     */
+                    HDINLINE PlaneWave(
+                        typename FieldE::DataBoxType const& dataBoxE,
+                        DataSpace<simDim> const& superCellToLocalOriginCellOffset,
+                        DataSpace<simDim> const& offsetToTotalDomain,
+                        float3_X const& elong)
+                        : m_elong(elong)
+                        , m_dataBoxE(dataBoxE)
+                        , m_offsetToTotalDomain(offsetToTotalDomain)
+                        , m_superCellToLocalOriginCellOffset(superCellToLocalOriginCellOffset)
+                    {
+                    }
+
+                    /** device side manipulation for init plane (transversal)
+                     *
+                     * @tparam T_Acc alpaka accelerator type (the accelerator argument itself is unused)
+                     *
+                     * @param cellIndexInSuperCell ND cell index in current supercell
+                     */
+                    template<typename T_Acc>
+                    HDINLINE void operator()(T_Acc const&, DataSpace<simDim> const& cellIndexInSuperCell)
+                    {
+                        // cell index relative to the supercell offset origin (guard not included yet)
+                        DataSpace<simDim> const localCell(cellIndexInSuperCell + m_superCellToLocalOriginCellOffset);
+
+                        if(Unitless::initPlaneY != 0) // compile time if
+                        {
+                            /* If the laser is not initialized in the first cell we emit a
+                             * negatively and positively propagating wave. Therefore we need to multiply the
+                             * amplitude with a correction factor depending on the cell size in
+                             * propagation direction.
+                             * The negatively propagating wave is damped by the absorber.
+                             *
+                             * The `correctionFactor` assumes that the wave is moving in y direction.
+                             */
+                            auto const correctionFactor = (SPEED_OF_LIGHT * DELTA_T) / CELL_HEIGHT * 2._X;
+
+                            // jump over the guard of the electric field, add on top of the existing value
+                            m_dataBoxE(localCell + SuperCellSize::toRT() * GuardSize::toRT())
+                                += correctionFactor * m_elong;
+                        }
+                        else
+                        {
+                            // jump over the guard of the electric field, overwrite the existing value
+                            m_dataBoxE(localCell + SuperCellSize::toRT() * GuardSize::toRT()) = m_elong;
+                        }
+                    }
+                };
+            } // namespace acc
+
+            template<typename T_Params>
+            struct PlaneWave : public planeWave::Unitless<T_Params>
             {
-              elong.x() = float_X( envelope * ( math::sin( t_and_phase )
-                          + math::cos( t_and_phase ) * integrationCorrectionFactor ) );
-            }
-            else if( Unitless::Polarisation == Unitless::LINEAR_Z)
-            {
-              elong.z() = float_X( envelope * ( math::sin( t_and_phase )
-                          + math::cos( t_and_phase ) * integrationCorrectionFactor ) );
-            }
-            else if( Unitless::Polarisation == Unitless::CIRCULAR )
-            {
-                elong.x() = float_X( envelope / math::sqrt(2.0) * ( math::sin( t_and_phase )
-                            + math::cos( t_and_phase ) * integrationCorrectionFactor));
-                elong.z() = float_X( envelope / math::sqrt(2.0) * ( math::cos( t_and_phase )
-                            - math::sin( t_and_phase ) * integrationCorrectionFactor ) );
-            }
-        }
-
-        /** create device manipulator functor
-         *
-         * @tparam T_WorkerCfg pmacc::mappings::threads::WorkerCfg, configuration of the worker
-         * @tparam T_Acc alpaka accelerator type
-         *
-         * @param alpaka accelerator
-         * @param localSupercellOffset (in supercells, without guards) to the
-         *        origin of the local domain
-         * @param configuration of the worker
-         */
-        template<
-            typename T_WorkerCfg,
-            typename T_Acc
-        >
-        HDINLINE acc::PlaneWave< Unitless >
-        operator()(
-            T_Acc const &,
-            DataSpace< simDim > const & localSupercellOffset,
-            T_WorkerCfg const &
-        ) const
-        {
-            auto const superCellToLocalOriginCellOffset = localSupercellOffset * SuperCellSize::toRT();
-            return acc::PlaneWave< Unitless >( dataBoxE, superCellToLocalOriginCellOffset, offsetToTotalDomain, elong );
-        }
-
-        //! get the name of the laser profile
-        static
-        HINLINE std::string
-        getName( )
-        {
-            return "PlaneWave";
-        }
+                using Unitless = planeWave::Unitless<T_Params>;
 
-    };
+                float3_X elong;
+                float_X phase;
+                typename FieldE::DataBoxType dataBoxE;
+                DataSpace<simDim> offsetToTotalDomain;
 
-} // namespace laserProfiles
-} // namespace fields
+                /** constructor, evaluates the laser field value `elong` for the given time step
+                 *
+                 * @param currentStep current simulation time step
+                 */
+                HINLINE PlaneWave(uint32_t currentStep) : phase(0.0_X)
+                {
+                    // fetch the device data box of the electric field
+                    DataConnector& dc = Environment<>::get().DataConnector();
+                    dataBoxE = dc.get<FieldE>(FieldE::getName(), true)->getDeviceDataBox();
+
+                    // get meta data for offsets
+                    SubGrid<simDim> const& subGrid = Environment<simDim>::get().SubGrid();
+                    // const DataSpace< simDim > totalCellOffset( subGrid.getGlobalDomain().offset );
+                    DataSpace<simDim> const globalCellOffset(subGrid.getLocalDomain().offset);
+                    DataSpace<simDim> const halfSimSize(subGrid.getGlobalDomain().size / 2);
+
+                    // transform coordinate system to center of global simulation as origin [cells]
+                    offsetToTotalDomain = /* totalCellOffset + */ globalCellOffset - halfSimSize;
+
+                    // @todo reset origin of direction of moving window
+                    // offsetToTotalDomain.y() = 0
+
+                    float_64 const runTime = DELTA_T * currentStep - Unitless::laserTimeShift;
+
+                    elong = float3_X::create(0.0); // components not assigned below stay zero
+
+                    float_64 envelope = float_64(Unitless::AMPLITUDE); // damped inside the ramps below
+
+                    float_64 const mue = 0.5 * Unitless::RAMP_INIT * Unitless::PULSE_LENGTH; // end of the up-ramp
+
+                    float_64 const w = 2.0 * PI * Unitless::f; // angular frequency
+                    float_64 const tau = Unitless::PULSE_LENGTH * math::sqrt(2.0); // width used by both ramps
+
+                    float_64 const endUpramp = mue;
+                    float_64 const startDownramp = mue + Unitless::LASER_NOFOCUS_CONSTANT; // plateau ends here
+
+                    float_64 integrationCorrectionFactor = 0.0; // stays zero between the two ramps
+
+                    if(runTime > startDownramp)
+                    {
+                        // downramp = end
+                        float_64 const exponent = (runTime - startDownramp) / tau;
+                        envelope *= exp(-0.5 * exponent * exponent);
+                        integrationCorrectionFactor = (runTime - startDownramp) / (w * tau * tau);
+                    }
+                    else if(runTime < endUpramp)
+                    {
+                        // upramp = start
+                        float_64 const exponent = (runTime - endUpramp) / tau;
+                        envelope *= exp(-0.5 * exponent * exponent);
+                        integrationCorrectionFactor = (runTime - endUpramp) / (w * tau * tau);
+                    }
+
+                    float_64 const timeOszi = runTime - endUpramp;
+                    float_64 const t_and_phase = w * timeOszi + Unitless::LASER_PHASE;
+                    // to understand both components [sin(...) + t/tau^2 * cos(...)] see description above
+                    if(Unitless::Polarisation == Unitless::LINEAR_X)
+                    {
+                        elong.x() = float_X(
+                            envelope
+                            * (math::sin(t_and_phase) + math::cos(t_and_phase) * integrationCorrectionFactor));
+                    }
+                    else if(Unitless::Polarisation == Unitless::LINEAR_Z)
+                    {
+                        elong.z() = float_X(
+                            envelope
+                            * (math::sin(t_and_phase) + math::cos(t_and_phase) * integrationCorrectionFactor));
+                    }
+                    else if(Unitless::Polarisation == Unitless::CIRCULAR)
+                    {
+                        elong.x() = float_X(
+                            envelope / math::sqrt(2.0)
+                            * (math::sin(t_and_phase) + math::cos(t_and_phase) * integrationCorrectionFactor));
+                        elong.z() = float_X(
+                            envelope / math::sqrt(2.0)
+                            * (math::cos(t_and_phase) - math::sin(t_and_phase) * integrationCorrectionFactor));
+                    }
+                }
+
+                /** create device manipulator functor (an acc::PlaneWave built from the host-side members)
+                 *
+                 * @tparam T_WorkerCfg pmacc::mappings::threads::WorkerCfg, configuration of the worker
+                 * @tparam T_Acc alpaka accelerator type
+                 *
+                 * @param (unnamed, unused) alpaka accelerator
+                 * @param localSupercellOffset offset (in supercells, without guards) to the
+                 *        origin of the local domain, converted to cells before it is passed on
+                 * @param (unnamed, unused) configuration of the worker
+                 */
+                template<typename T_WorkerCfg, typename T_Acc>
+                HDINLINE acc::PlaneWave<Unitless> operator()(
+                    T_Acc const&,
+                    DataSpace<simDim> const& localSupercellOffset,
+                    T_WorkerCfg const&) const
+                {
+                    auto const superCellToLocalOriginCellOffset = localSupercellOffset * SuperCellSize::toRT();
+                    return acc::PlaneWave<Unitless>(
+                        dataBoxE,
+                        superCellToLocalOriginCellOffset,
+                        offsetToTotalDomain,
+                        elong);
+                }
+
+                //! get the name of the laser profile, always the string "PlaneWave"
+                static HINLINE std::string getName()
+                {
+                    return "PlaneWave";
+                }
+            };
+
+        } // namespace laserProfiles
+    } // namespace fields
 } // namespace picongpu
-
diff --git a/include/picongpu/fields/laserProfiles/Polynom.def b/include/picongpu/fields/laserProfiles/Polynom.def
index 205a57f697..f45c72e14c 100644
--- a/include/picongpu/fields/laserProfiles/Polynom.def
+++ b/include/picongpu/fields/laserProfiles/Polynom.def
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera, Richard Pausch, Axel Huebl
+/* Copyright 2013-2021 Heiko Burau, Rene Widera, Richard Pausch, Axel Huebl
  *
  * This file is part of PIConGPU.
  *
@@ -24,96 +24,98 @@
 
 namespace picongpu
 {
-namespace fields
-{
-namespace laserProfiles
-{
-namespace polynom
-{
-namespace defaults
-{
-    struct PolynomParam
+    namespace fields
     {
-        /** unit: meter */
-        static constexpr float_64 WAVE_LENGTH_SI = 0.8e-6;
-
-        /** Convert the normalized laser strength parameter a0 to Volt per meter */
-        static constexpr float_64 UNITCONV_A0_to_Amplitude_SI = -2.0 * PI / WAVE_LENGTH_SI * ::picongpu::SI::ELECTRON_MASS_SI * ::picongpu::SI::SPEED_OF_LIGHT_SI * ::picongpu::SI::SPEED_OF_LIGHT_SI / ::picongpu::SI::ELECTRON_CHARGE_SI;
-
-        /** unit: W / m^2 */
-        // calculate: _A0 = 8.549297e-6 * sqrt( Intensity[W/m^2] ) * wavelength[m] (linearly polarized)
-
-        /** unit: none */
-        //static constexpr float_64 _A0  = 1.5;
-
-        /** unit: Volt / meter */
-        //static constexpr float_64 AMPLITUDE_SI = _A0 * UNITCONV_A0_to_Amplitude_SI;
-
-        /** unit: Volt / meter */
-        static constexpr float_64 AMPLITUDE_SI = 1.738e13;
-
-        /** Pulse length: sigma of std. gauss for intensity (E^2)
-         *  PULSE_LENGTH_SI = FWHM_of_Intensity   / [ 2*sqrt{ 2* ln(2) } ]
-         *                                          [    2.354820045     ]
-         *  Info:             FWHM_of_Intensity = FWHM_Illumination
-         *                      = what a experimentalist calls "pulse duration"
-         *  unit: seconds (1 sigma) */
-        static constexpr float_64 PULSE_LENGTH_SI = 4.0e-15;
-
-        /** beam waist: distance from the axis where the pulse intensity (E^2)
-         *              decreases to its 1/e^2-th part,
-         *              at the focus position of the laser
-         *  unit: meter
-         */
-        static constexpr float_64 W0_X_SI = 4.246e-6; // waist in x-direction
-        static constexpr float_64 W0_Z_SI = W0_X_SI; // waist in z-direction
-
-        /** cell from top where the laser is initialized
-         *
-         * if `initPlaneY == 0` than the absorber are disabled.
-         * if `initPlaneY > absorbercells negative Y` the negative absorber in y
-         * direction is enabled
-         *
-         * valid ranges:
-         *   - initPlaneY == 0
-         *   - absorber cells negative Y < initPlaneY < cells in y direction of the top gpu
-         */
-        static constexpr uint32_t initPlaneY = 0;
-
-        /** laser phase shift (no shift: 0.0)
-         *
-         * sin(omega*time + laser_phase): starts with phase=0 at center --> E-field=0 at center
-         *
-         * unit: rad, periodic in 2*pi
-         */
-        static constexpr float_X LASER_PHASE = 0.0;
-
-        /** Available polarization types
-         */
-        enum PolarisationType
+        namespace laserProfiles
         {
-            LINEAR_X = 1u,
-            LINEAR_Z = 2u,
-            CIRCULAR = 4u,
-        };
-        /** Polarization selection
-         */
-        static constexpr PolarisationType Polarisation = LINEAR_X;
-    };
-} // namespace defaults
-} // namespace gaussianBeam
-
-    /** Wavepacket with a polynomial temporal intensity shape.
-     *
-     * Based on a wavepacket with Gaussian spatial envelope.
-     *
-     * @tparam T_Params class parameter to configure the polynomial laser profile,
-     *                  see members of polynom::defaults::PolynomParam for
-     *                  required members
-     */
-    template< typename T_Params = polynom::defaults::PolynomParam >
-    struct Polynom;
-
-} // namespace laserProfiles
-} // namespace fields
+            namespace polynom
+            {
+                namespace defaults
+                {
+                    struct PolynomParam
+                    {
+                        /** unit: meter */
+                        static constexpr float_64 WAVE_LENGTH_SI = 0.8e-6;
+
+                        /** Convert the normalized laser strength parameter a0 to Volt per meter */
+                        static constexpr float_64 UNITCONV_A0_to_Amplitude_SI = -2.0 * PI / WAVE_LENGTH_SI
+                            * ::picongpu::SI::ELECTRON_MASS_SI * ::picongpu::SI::SPEED_OF_LIGHT_SI
+                            * ::picongpu::SI::SPEED_OF_LIGHT_SI / ::picongpu::SI::ELECTRON_CHARGE_SI;
+
+                        /** unit: W / m^2 */
+                        // calculate: _A0 = 8.549297e-6 * sqrt( Intensity[W/m^2] ) * wavelength[m] (linearly polarized)
+
+                        /** unit: none */
+                        // static constexpr float_64 _A0  = 1.5;
+
+                        /** unit: Volt / meter */
+                        // static constexpr float_64 AMPLITUDE_SI = _A0 * UNITCONV_A0_to_Amplitude_SI;
+
+                        /** unit: Volt / meter */
+                        static constexpr float_64 AMPLITUDE_SI = 1.738e13;
+
+                        /** Pulse length: sigma of std. gauss for intensity (E^2)
+                         *  PULSE_LENGTH_SI = FWHM_of_Intensity   / [ 2*sqrt{ 2* ln(2) } ]
+                         *                                          [    2.354820045     ]
+                         *  Info:             FWHM_of_Intensity = FWHM_Illumination
+                         *                      = what an experimentalist calls "pulse duration"
+                         *  unit: seconds (1 sigma) - NOTE(review): Polynom.hpp treats it as 2 * t_rise, confirm */
+                        static constexpr float_64 PULSE_LENGTH_SI = 4.0e-15;
+
+                        /** beam waist: distance from the axis where the pulse intensity (E^2)
+                         *              decreases to its 1/e^2-th part,
+                         *              at the focus position of the laser
+                         *  unit: meter
+                         */
+                        static constexpr float_64 W0_X_SI = 4.246e-6; // waist in x-direction
+                        static constexpr float_64 W0_Z_SI = W0_X_SI; // waist in z-direction
+
+                        /** cell from top where the laser is initialized
+                         *
+                         * if `initPlaneY == 0` then the absorbers are disabled.
+                         * if `initPlaneY > absorbercells negative Y` the negative absorber in y
+                         * direction is enabled
+                         *
+                         * valid ranges:
+                         *   - initPlaneY == 0
+                         *   - absorber cells negative Y < initPlaneY < cells in y direction of the top gpu
+                         */
+                        static constexpr uint32_t initPlaneY = 0;
+
+                        /** laser phase shift (no shift: 0.0)
+                         *
+                         * sin(omega*time + laser_phase): starts with phase=0 at center --> E-field=0 at center
+                         *
+                         * unit: rad, periodic in 2*pi
+                         */
+                        static constexpr float_X LASER_PHASE = 0.0;
+
+                        /** Available polarization types
+                         */
+                        enum PolarisationType
+                        {
+                            LINEAR_X = 1u,
+                            LINEAR_Z = 2u,
+                            CIRCULAR = 4u,
+                        };
+                        /** Polarization selection (one of the PolarisationType values)
+                         */
+                        static constexpr PolarisationType Polarisation = LINEAR_X;
+                    };
+                } // namespace defaults
+            } // namespace polynom
+
+            /** Wavepacket with a polynomial temporal intensity shape.
+             *
+             * Based on a wavepacket with Gaussian spatial envelope.
+             *
+             * @tparam T_Params class parameter to configure the polynomial laser profile,
+             *                  see members of polynom::defaults::PolynomParam for
+             *                  required members
+             */
+            template<typename T_Params = polynom::defaults::PolynomParam>
+            struct Polynom;
+
+        } // namespace laserProfiles
+    } // namespace fields
 } // namespace picongpu
diff --git a/include/picongpu/fields/laserProfiles/Polynom.hpp b/include/picongpu/fields/laserProfiles/Polynom.hpp
index f3a8b2af1b..6c3c370fb9 100644
--- a/include/picongpu/fields/laserProfiles/Polynom.hpp
+++ b/include/picongpu/fields/laserProfiles/Polynom.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera, Richard Pausch, Axel Huebl
+/* Copyright 2013-2021 Heiko Burau, Rene Widera, Richard Pausch, Axel Huebl
  *
  * This file is part of PIConGPU.
  *
@@ -27,244 +27,234 @@
 
 namespace picongpu
 {
-namespace fields
-{
-namespace laserProfiles
-{
-namespace polynom
-{
-    template< typename T_Params >
-    struct Unitless : public T_Params
-    {
-        using Params = T_Params;
-
-        static constexpr float_X WAVE_LENGTH = float_X( Params::WAVE_LENGTH_SI / UNIT_LENGTH ); // unit: meter
-        static constexpr float_X PULSE_LENGTH = float_X( Params::PULSE_LENGTH_SI / UNIT_TIME ); // unit: seconds (1 sigma)
-        static constexpr float_X AMPLITUDE = float_X( Params::AMPLITUDE_SI / UNIT_EFIELD ); // unit: Volt /meter
-        static constexpr float_X W0_X = float_X( Params::W0_X_SI / UNIT_LENGTH ); // unit: meter
-        static constexpr float_X W0_Z = float_X( Params::W0_Z_SI / UNIT_LENGTH ); // unit: meter
-        static constexpr float_X INIT_TIME = float_X( Params::PULSE_LENGTH_SI / UNIT_TIME ); // unit: seconds (full initialization length)
-
-        /* initialize the laser not in the first cell is equal to a negative shift
-         * in time
-         */
-        static constexpr float_X laserTimeShift = Params::initPlaneY * CELL_HEIGHT / SPEED_OF_LIGHT;
-
-        static constexpr float_64 f = SPEED_OF_LIGHT / WAVE_LENGTH;
-
-    };
-} // namespace polynom
-
-namespace acc
-{
-    template< typename T_Unitless >
-    struct Polynom : public T_Unitless
+    namespace fields
     {
-        using Unitless = T_Unitless;
-
-        float3_X m_elong;
-        float_X m_phase;
-        typename FieldE::DataBoxType m_dataBoxE;
-        DataSpace< simDim > m_offsetToTotalDomain;
-        DataSpace< simDim > m_superCellToLocalOriginCellOffset;
-
-        /** Device-Side Constructor
-         *
-         * @param superCellToLocalOriginCellOffset local offset in cells to current supercell
-         * @param offsetToTotalDomain offset to origin of global (@todo: total) coordinate system (possibly after transform to centered origin)
-         */
-        HDINLINE Polynom(
-            typename FieldE::DataBoxType const & dataBoxE,
-            DataSpace< simDim > const & superCellToLocalOriginCellOffset,
-            DataSpace< simDim > const & offsetToTotalDomain,
-            float3_X const & elong,
-            float_X const phase
-        ) :
-            m_elong( elong ),
-            m_phase( phase ),
-            m_dataBoxE( dataBoxE ),
-            m_offsetToTotalDomain( offsetToTotalDomain ),
-            m_superCellToLocalOriginCellOffset( superCellToLocalOriginCellOffset )
-        {
-        }
-
-        /** device side manipulation for init plane (transversal)
-         *
-         * @tparam T_Args type of the arguments passed to the user manipulator functor
-         *
-         * @param cellIndexInSuperCell ND cell index in current supercell
-         */
-        template< typename T_Acc >
-        HDINLINE
-        void operator( )(
-            T_Acc const &,
-            DataSpace< simDim > const & cellIndexInSuperCell
-        )
-        {
-            // coordinate system to global simulation as origin
-            DataSpace< simDim > const localCell(
-                cellIndexInSuperCell +
-                m_superCellToLocalOriginCellOffset
-            );
-
-            // transform coordinate system to center of x-z plane of initialization
-            constexpr uint8_t planeNormalDir = 1u;
-            DataSpace< simDim > offsetToCenterOfPlane( m_offsetToTotalDomain );
-            offsetToCenterOfPlane[ planeNormalDir ] = 0; // do not shift origin of plane normal
-            floatD_X const pos = precisionCast< float_X >( localCell + offsetToCenterOfPlane ) * cellSize.shrink< simDim >();
-            // @todo add half-cells via traits::FieldPosition< Solver::NumicalCellType, FieldE >()
-
-            // transversal position only
-            float3_X const w0_3D( Unitless::W0_X, 0., Unitless::W0_Z );
-            auto const w0( w0_3D.shrink< simDim >().remove< planeNormalDir >() );
-            auto const pos_trans( pos.remove< planeNormalDir >() );
-            auto const exp_compos( pos_trans * pos_trans / ( w0 * w0 ) );
-            float_X const exp_arg( exp_compos.sumOfComponents() );
-
-            m_elong *= math::exp( -1.0_X * exp_arg );
-
-            if( Unitless::initPlaneY != 0 ) // compile time if
-            {
-                /* If the laser is not initialized in the first cell we emit a
-                 * negatively and positively propagating wave. Therefore we need to multiply the
-                 * amplitude with a correction factor depending of the cell size in
-                 * propagation direction.
-                 * The negatively propagating wave is damped by the absorber.
-                 *
-                 * The `correctionFactor` assume that the wave is moving in y direction.
-                 */
-                auto const correctionFactor = ( SPEED_OF_LIGHT * DELTA_T ) / CELL_HEIGHT * 2._X;
-
-                // jump over the guard of the electric field
-                m_dataBoxE( localCell + SuperCellSize::toRT() * GuardSize::toRT() ) +=  correctionFactor * m_elong;
-            }
-            else
-            {
-                // jump over the guard of the electric field
-                m_dataBoxE( localCell + SuperCellSize::toRT() * GuardSize::toRT() ) = m_elong;
-            }
-        }
-    };
-} // namespace acc
-
-    template< typename T_Params >
-    struct Polynom : public polynom::Unitless< T_Params >
-    {
-        using Unitless = polynom::Unitless< T_Params >;
-
-        float3_X elong;
-        float_X phase;
-        typename FieldE::DataBoxType dataBoxE;
-        DataSpace< simDim > offsetToTotalDomain;
-
-        HDINLINE float_X
-        Tpolynomial( float_X const tau )
+        namespace laserProfiles
         {
-            float_X result( 0.0_X );
-            if( tau >= 0.0_X && tau <= 1.0_X )
-                result = tau * tau * tau * ( 10.0_X - 15.0_X * tau + 6.0_X * tau * tau );
-            else if( tau > 1.0_X && tau <= 2.0_X )
-                result = ( 2.0_X - tau ) * ( 2.0_X - tau ) * ( 2.0_X - tau ) * ( 4.0_X - 9.0_X * tau + 6.0_X * tau * tau );
-
-            return result;
-        }
-
-        /** constructor
-         *
-         * @param currentStep current simulation time step
-         */
-        HINLINE Polynom( uint32_t currentStep ) :
-            phase( 0.0_X )
-        {
-            // get data
-            DataConnector & dc = Environment< >::get( ).DataConnector( );
-            dataBoxE = dc.get< FieldE >(
-                FieldE::getName(),
-                true
-            )->getDeviceDataBox();
-
-            // get meta data for offsets
-            SubGrid< simDim > const & subGrid = Environment< simDim >::get().SubGrid();
-            // const DataSpace< simDim > totalCellOffset( subGrid.getGlobalDomain().offset );
-            DataSpace< simDim > const globalCellOffset( subGrid.getLocalDomain().offset );
-            DataSpace< simDim > const halfSimSize( subGrid.getGlobalDomain().size / 2 );
-
-            // transform coordinate system to center of global simulation as origin [cells]
-            offsetToTotalDomain = /* totalCellOffset + */ globalCellOffset - halfSimSize;
-
-            // @todo reset origin of direction of moving window
-            // offsetToTotalDomain.y() = 0
-
-            float_64 const runTime = DELTA_T * currentStep - Unitless::laserTimeShift;
-
-            elong = float3_X::create( 0.0_X );
-
-            /* a symmetric pulse will be initialized at position z=0
-             * the laser amplitude rises  for t_rise
-             * and falls for t_rise
-             * making the laser pulse 2*t_rise long
-             */
-
-            const float_X t_rise = 0.5_X * Unitless::PULSE_LENGTH;
-            const float_X tau = runTime / t_rise;
-
-            const float_X omegaLaser = 2.0_X * PI * Unitless::f;
-
-            if( Unitless::Polarisation == Unitless::LINEAR_X )
+            namespace polynom
             {
-                elong.x() = Unitless::AMPLITUDE * Tpolynomial( tau ) *
-                    math::sin( omegaLaser * ( runTime - t_rise ) + Unitless::LASER_PHASE );
-            }
-            else if( Unitless::Polarisation == Unitless::LINEAR_Z )
+                template<typename T_Params>
+                struct Unitless : public T_Params // SI parameters of T_Params divided by the UNIT_* constants
+                {
+                    using Params = T_Params;
+
+                    static constexpr float_X WAVE_LENGTH
+                        = float_X(Params::WAVE_LENGTH_SI / UNIT_LENGTH); // unit: UNIT_LENGTH
+                    static constexpr float_X PULSE_LENGTH
+                        = float_X(Params::PULSE_LENGTH_SI / UNIT_TIME); // unit: UNIT_TIME (1 sigma)
+                    static constexpr float_X AMPLITUDE
+                        = float_X(Params::AMPLITUDE_SI / UNIT_EFIELD); // unit: UNIT_EFIELD
+                    static constexpr float_X W0_X = float_X(Params::W0_X_SI / UNIT_LENGTH); // unit: UNIT_LENGTH
+                    static constexpr float_X W0_Z = float_X(Params::W0_Z_SI / UNIT_LENGTH); // unit: UNIT_LENGTH
+                    static constexpr float_X INIT_TIME
+                        = float_X(Params::PULSE_LENGTH_SI / UNIT_TIME); // unit: UNIT_TIME (full initialization length)
+
+                    /* initializing the laser in a cell other than the first one is equivalent
+                     * to a negative shift in time
+                     */
+                    static constexpr float_X laserTimeShift = Params::initPlaneY * CELL_HEIGHT / SPEED_OF_LIGHT;
+
+                    static constexpr float_64 f = SPEED_OF_LIGHT / WAVE_LENGTH; // frequency = c / wavelength
+                };
+            } // namespace polynom
+
+            namespace acc
             {
-                elong.z() = Unitless::AMPLITUDE * Tpolynomial( tau ) *
-                    math::sin( omegaLaser * ( runTime - t_rise ) + Unitless::LASER_PHASE );
-            }
-            else if( Unitless::Polarisation == Unitless::CIRCULAR )
+                template<typename T_Unitless>
+                struct Polynom : public T_Unitless
+                {
+                    using Unitless = T_Unitless;
+
+                    float3_X m_elong;
+                    float_X m_phase;
+                    typename FieldE::DataBoxType m_dataBoxE;
+                    DataSpace<simDim> m_offsetToTotalDomain;
+                    DataSpace<simDim> m_superCellToLocalOriginCellOffset;
+
+                    /** Device-Side Constructor
+                     *
+                     * @param superCellToLocalOriginCellOffset local offset in cells to current supercell
+                     * @param offsetToTotalDomain offset to origin of global (@todo: total) coordinate system (possibly
+                     * after transform to centered origin)
+                     */
+                    HDINLINE Polynom(
+                        typename FieldE::DataBoxType const& dataBoxE,
+                        DataSpace<simDim> const& superCellToLocalOriginCellOffset,
+                        DataSpace<simDim> const& offsetToTotalDomain,
+                        float3_X const& elong,
+                        float_X const phase)
+                        : m_elong(elong)
+                        , m_phase(phase)
+                        , m_dataBoxE(dataBoxE)
+                        , m_offsetToTotalDomain(offsetToTotalDomain)
+                        , m_superCellToLocalOriginCellOffset(superCellToLocalOriginCellOffset)
+                    {
+                    }
+
+                    /** device side manipulation for init plane (transversal)
+                     *
+                     * Damps the stored field vector `m_elong` with the transversal factor
+                     * exp(-sum_i(pos_i^2 / w0_i^2)), where i runs over the directions inside the
+                     * initialization plane, and stores the result in the electric field.
+                     *
+                     * @tparam T_Acc alpaka accelerator type
+                     *
+                     * @param cellIndexInSuperCell ND cell index in current supercell
+                     */
+                    template<typename T_Acc>
+                    HDINLINE void operator()(T_Acc const&, DataSpace<simDim> const& cellIndexInSuperCell)
+                    {
+                        // the initialization plane is the x-z plane, its normal is y
+                        constexpr uint8_t planeNormalDir = 1u;
+
+                        // cell index after adding the cell offset of the current supercell
+                        DataSpace<simDim> const localCell(cellIndexInSuperCell + m_superCellToLocalOriginCellOffset);
+
+                        // move the origin to the center of the plane, but do not shift along the plane normal
+                        DataSpace<simDim> planeOriginShift(m_offsetToTotalDomain);
+                        planeOriginShift[planeNormalDir] = 0;
+                        floatD_X const cellPosition
+                            = precisionCast<float_X>(localCell + planeOriginShift) * cellSize.shrink<simDim>();
+                        // @todo add half-cells via traits::FieldPosition< Solver::NumicalCellType, FieldE >()
+
+                        // sum of the squared in-plane coordinates, each divided by its squared W0
+                        float3_X const waist3D(Unitless::W0_X, 0., Unitless::W0_Z);
+                        auto const waist(waist3D.shrink<simDim>().remove<planeNormalDir>());
+                        auto const inPlanePosition(cellPosition.remove<planeNormalDir>());
+                        auto const scaledSquares(inPlanePosition * inPlanePosition / (waist * waist));
+                        float_X const exponent(scaledSquares.sumOfComponents());
+
+                        m_elong *= math::exp(-1.0_X * exponent);
+
+                        if(Unitless::initPlaneY == 0) // compile time if
+                        {
+                            // laser starts in the first cell: overwrite the field value
+                            // jump over the guard of the electric field
+                            m_dataBoxE(localCell + SuperCellSize::toRT() * GuardSize::toRT()) = m_elong;
+                        }
+                        else
+                        {
+                            /* If the laser is not initialized in the first cell we emit a
+                             * negatively and positively propagating wave. Therefore we need to multiply the
+                             * amplitude with a correction factor depending on the cell size in
+                             * propagation direction.
+                             * The negatively propagating wave is damped by the absorber.
+                             *
+                             * The `correctionFactor` assumes that the wave is moving in y direction.
+                             */
+                            auto const correctionFactor = (SPEED_OF_LIGHT * DELTA_T) / CELL_HEIGHT * 2._X;
+
+                            // the contribution is added on top of the existing field value
+                            // jump over the guard of the electric field
+                            m_dataBoxE(localCell + SuperCellSize::toRT() * GuardSize::toRT())
+                                += correctionFactor * m_elong;
+                        }
+                    }
+                };
+            } // namespace acc
+
+            template<typename T_Params>
+            struct Polynom : public polynom::Unitless<T_Params>
             {
-                elong.x() = Unitless::AMPLITUDE * Tpolynomial( tau ) / math::sqrt( 2.0_X ) *
-                    math::sin( omegaLaser * ( runTime - t_rise ) + Unitless::LASER_PHASE );
-                elong.z() = Unitless::AMPLITUDE * Tpolynomial( tau ) / math::sqrt( 2.0_X ) *
-                    math::cos( omegaLaser * ( runTime - t_rise ) + Unitless::LASER_PHASE );
-            }
-        }
-
-        /** create device manipulator functor
-         *
-         * @tparam T_WorkerCfg pmacc::mappings::threads::WorkerCfg, configuration of the worker
-         * @tparam T_Acc alpaka accelerator type
-         *
-         * @param alpaka accelerator
-         * @param localSupercellOffset (in supercells, without guards) to the
-         *        origin of the local domain
-         * @param configuration of the worker
-         */
-        template<
-            typename T_WorkerCfg,
-            typename T_Acc
-        >
-        HDINLINE acc::Polynom< Unitless >
-        operator()(
-            T_Acc const &,
-            DataSpace< simDim > const & localSupercellOffset,
-            T_WorkerCfg const &
-        ) const
-        {
-            auto const superCellToLocalOriginCellOffset = localSupercellOffset * SuperCellSize::toRT();
-            return acc::Polynom< Unitless >( dataBoxE, superCellToLocalOriginCellOffset, offsetToTotalDomain, elong, phase );
-        }
-
-        //! get the name of the laser profile
-        static
-        HINLINE std::string
-        getName( )
-        {
-            return "Polynom";
-        }
-
-    };
-
-} // namespace laserProfiles
-} // namespace fields
+                using Unitless = polynom::Unitless<T_Params>;
+
+                float3_X elong;
+                float_X phase;
+                typename FieldE::DataBoxType dataBoxE;
+                DataSpace<simDim> offsetToTotalDomain;
+
+                /** temporal envelope of the pulse
+                 *
+                 * Rises from 0 to 1 for tau in [0, 1], falls back to 0 for tau in (1, 2]
+                 * and is 0 everywhere else.
+                 *
+                 * @param tau time in units of the rise time
+                 * @return envelope factor
+                 */
+                HDINLINE float_X Tpolynomial(float_X const tau)
+                {
+                    // rising edge
+                    if(tau >= 0.0_X && tau <= 1.0_X)
+                        return tau * tau * tau * (10.0_X - 15.0_X * tau + 6.0_X * tau * tau);
+
+                    // falling edge
+                    if(tau > 1.0_X && tau <= 2.0_X)
+                        return (2.0_X - tau) * (2.0_X - tau) * (2.0_X - tau)
+                            * (4.0_X - 9.0_X * tau + 6.0_X * tau * tau);
+
+                    // outside of the pulse
+                    return float_X(0.0_X);
+                }
+
+                /** constructor
+                 *
+                 * Fetches the device data box of the electric field, computes the cell offset that
+                 * moves the origin to the center of the global domain and evaluates the field vector
+                 * `elong` of the laser for the given time step.
+                 *
+                 * @param currentStep current simulation time step
+                 */
+                HINLINE Polynom(uint32_t currentStep) : phase(0.0_X)
+                {
+                    // device data box of the electric field
+                    DataConnector& dc = Environment<>::get().DataConnector();
+                    dataBoxE = dc.get<FieldE>(FieldE::getName(), true)->getDeviceDataBox();
+
+                    // transform coordinate system to center of global simulation as origin [cells]
+                    // @todo the offset of the total domain (subGrid.getGlobalDomain().offset) is not added
+                    SubGrid<simDim> const& subGrid = Environment<simDim>::get().SubGrid();
+                    DataSpace<simDim> const localDomainOffset(subGrid.getLocalDomain().offset);
+                    DataSpace<simDim> const halfGlobalSize(subGrid.getGlobalDomain().size / 2);
+                    offsetToTotalDomain = localDomainOffset - halfGlobalSize;
+
+                    // @todo reset origin of direction of moving window
+                    // offsetToTotalDomain.y() = 0
+
+                    // simulation time shifted by Unitless::laserTimeShift
+                    float_64 const runTime = DELTA_T * currentStep - Unitless::laserTimeShift;
+
+                    // stays zero if no polarisation branch below matches
+                    elong = float3_X::create(0.0_X);
+
+                    /* a symmetric pulse will be initialized at position z=0
+                     * the laser amplitude rises for t_rise
+                     * and falls for t_rise
+                     * making the laser pulse 2*t_rise long
+                     */
+                    float_X const t_rise = 0.5_X * Unitless::PULSE_LENGTH;
+                    float_X const tau = runTime / t_rise;
+                    float_X const omegaLaser = 2.0_X * PI * Unitless::f;
+
+                    // factors shared by all polarisations
+                    auto const envelope = Unitless::AMPLITUDE * Tpolynomial(tau);
+                    auto const carrierPhase = omegaLaser * (runTime - t_rise) + Unitless::LASER_PHASE;
+
+                    if(Unitless::Polarisation == Unitless::LINEAR_X)
+                    {
+                        elong.x() = envelope * math::sin(carrierPhase);
+                    }
+                    else if(Unitless::Polarisation == Unitless::LINEAR_Z)
+                    {
+                        elong.z() = envelope * math::sin(carrierPhase);
+                    }
+                    else if(Unitless::Polarisation == Unitless::CIRCULAR)
+                    {
+                        // both components carry the amplitude divided by sqrt(2); x uses sin, z uses cos
+                        auto const componentAmplitude = envelope / math::sqrt(2.0_X);
+                        elong.x() = componentAmplitude * math::sin(carrierPhase);
+                        elong.z() = componentAmplitude * math::cos(carrierPhase);
+                    }
+                }
+
+                /** create device manipulator functor
+                 *
+                 * The first argument (alpaka accelerator) and the last argument (configuration of
+                 * the worker) are only used for their types.
+                 *
+                 * @tparam T_WorkerCfg pmacc::mappings::threads::WorkerCfg, configuration of the worker
+                 * @tparam T_Acc alpaka accelerator type
+                 *
+                 * @param localSupercellOffset (in supercells, without guards) to the
+                 *        origin of the local domain
+                 * @return device functor holding the field data box, both offsets, elong and phase
+                 */
+                template<typename T_WorkerCfg, typename T_Acc>
+                HDINLINE acc::Polynom<Unitless> operator()(
+                    T_Acc const&,
+                    DataSpace<simDim> const& localSupercellOffset,
+                    T_WorkerCfg const&) const
+                {
+                    using DeviceFunctor = acc::Polynom<Unitless>;
+
+                    // convert the offset from supercells to cells
+                    auto const cellOffset = localSupercellOffset * SuperCellSize::toRT();
+
+                    return DeviceFunctor(dataBoxE, cellOffset, offsetToTotalDomain, elong, phase);
+                }
+
+                /** get the name of the laser profile
+                 *
+                 * @return the string "Polynom"
+                 */
+                static HINLINE std::string getName()
+                {
+                    return "Polynom";
+                }
+            };
+
+        } // namespace laserProfiles
+    } // namespace fields
 } // namespace picongpu
-
diff --git a/include/picongpu/fields/laserProfiles/PulseFrontTilt.def b/include/picongpu/fields/laserProfiles/PulseFrontTilt.def
index 45c1dec928..3c6229b21a 100644
--- a/include/picongpu/fields/laserProfiles/PulseFrontTilt.def
+++ b/include/picongpu/fields/laserProfiles/PulseFrontTilt.def
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Anton Helm, Heiko Burau, Rene Widera, Richard Pausch,
+/* Copyright 2013-2021 Anton Helm, Heiko Burau, Rene Widera, Richard Pausch,
  *                     Axel Huebl, Alexander Debus
  *
  * This file is part of PIConGPU.
@@ -25,111 +25,113 @@
 
 namespace picongpu
 {
-namespace fields
-{
-namespace laserProfiles
-{
-namespace pulseFrontTilt
-{
-namespace defaults
-{
-    struct PulseFrontTiltParam
+    namespace fields
     {
-        /** unit: meter */
-        static constexpr float_64 WAVE_LENGTH_SI = 0.8e-6;
-
-        /** Convert the normalized laser strength parameter a0 to Volt per meter */
-        static constexpr float_64 UNITCONV_A0_to_Amplitude_SI = -2.0 * PI / WAVE_LENGTH_SI * ::picongpu::SI::ELECTRON_MASS_SI * ::picongpu::SI::SPEED_OF_LIGHT_SI * ::picongpu::SI::SPEED_OF_LIGHT_SI / ::picongpu::SI::ELECTRON_CHARGE_SI;
-
-        /** unit: W / m^2 */
-        // calculate: _A0 = 8.549297e-6 * sqrt( Intensity[W/m^2] ) * wavelength[m] (linearly polarized)
-
-        /** unit: none */
-        //static constexpr float_64 _A0  = 1.5;
-
-        /** unit: Volt / meter */
-        //static constexpr float_64 AMPLITUDE_SI = _A0 * UNITCONV_A0_to_Amplitude_SI;
-
-        /** unit: Volt / meter */
-        static constexpr float_64 AMPLITUDE_SI = 1.738e13;
-
-        /** Pulse length: sigma of std. gauss for intensity (E^2)
-         *  PULSE_LENGTH_SI = FWHM_of_Intensity   / [ 2*sqrt{ 2* ln(2) } ]
-         *                                          [    2.354820045     ]
-         *  Info:             FWHM_of_Intensity = FWHM_Illumination
-         *                      = what a experimentalist calls "pulse duration"
-         *
-         *  unit: seconds (1 sigma) */
-        static constexpr float_64 PULSE_LENGTH_SI = 10.615e-15 / 4.0;
-
-        /** beam waist: distance from the axis where the pulse intensity (E^2)
-         *              decreases to its 1/e^2-th part,
-         *              at the focus position of the laser
-         * W0_SI = FWHM_of_Intensity / sqrt{ 2* ln(2) }
-         *                             [   1.17741    ]
-         *
-         *  unit: meter */
-        static constexpr float_64 W0_SI = 5.0e-6 / 1.17741;
-
-        /** the distance to the laser focus in y-direction
-         *  unit: meter */
-        static constexpr float_64 FOCUS_POS_SI = 4.62e-5;
-
-        /** the tilt angle between laser propagation in y-direction and laser axis in
-        *  x-direction (0 degree == no tilt)
-        *  unit: degree */
-        static constexpr float_64 TILT_X_SI = 0.0;
-
-        /** The laser pulse will be initialized PULSE_INIT times of the PULSE_LENGTH
-         *
-         *  unit: none */
-        static constexpr float_64 PULSE_INIT = 20.0;
-
-        /** cell from top where the laser is initialized
-         *
-         * if `initPlaneY == 0` than the absorber are disabled.
-         * if `initPlaneY > absorbercells negative Y` the negative absorber in y
-         * direction is enabled
-         *
-         * valid ranges:
-         *   - initPlaneY == 0
-         *   - absorber cells negative Y < initPlaneY < cells in y direction of the top gpu
-         */
-        static constexpr uint32_t initPlaneY = 0;
-
-        /** laser phase shift (no shift: 0.0)
-         *
-         * sin(omega*time + laser_phase): starts with phase=0 at center --> E-field=0 at center
-         *
-         * unit: rad, periodic in 2*pi
-         */
-        static constexpr float_X LASER_PHASE = 0.0;
-
-        //! Available polarisation types
-        enum PolarisationType
+        namespace laserProfiles
         {
-            LINEAR_X = 1u,
-            LINEAR_Z = 2u,
-            CIRCULAR = 4u,
-        };
-
-        /** Polarization selection
-         */
-        static constexpr PolarisationType Polarisation = LINEAR_X;
-    };
-} // namespace defaults
-} // namespace pulseFrontTilt
-
-    /** Gaussian Beam laser profile with titled pulse front
-     *
-     * @tparam T_Params class parameter to configure the Gaussian Beam with
-     *                  pulse front titlt, see members of
-     *                  pulseFrontTilt::defaults::PulseFrontTiltParam for
-     *                  required members
-     */
-    template< typename T_Params = pulseFrontTilt::defaults::PulseFrontTiltParam >
-    struct PulseFrontTilt;
-
-} // namespace laserProfiles
-} // namespace fields
+            namespace pulseFrontTilt
+            {
+                namespace defaults
+                {
+                    /** default parameter set of the PulseFrontTilt laser profile
+                     *
+                     * All members with the suffix `_SI` are given in SI units.
+                     */
+                    struct PulseFrontTiltParam
+                    {
+                        /** unit: meter */
+                        static constexpr float_64 WAVE_LENGTH_SI = 0.8e-6;
+
+                        /** Convert the normalized laser strength parameter a0 to Volt per meter */
+                        static constexpr float_64 UNITCONV_A0_to_Amplitude_SI = -2.0 * PI / WAVE_LENGTH_SI
+                            * ::picongpu::SI::ELECTRON_MASS_SI * ::picongpu::SI::SPEED_OF_LIGHT_SI
+                            * ::picongpu::SI::SPEED_OF_LIGHT_SI / ::picongpu::SI::ELECTRON_CHARGE_SI;
+
+                        /** unit: W / m^2 */
+                        // calculate: _A0 = 8.549297e-6 * sqrt( Intensity[W/m^2] ) * wavelength[m] (linearly polarized)
+
+                        /** unit: none */
+                        // static constexpr float_64 _A0  = 1.5;
+
+                        /** unit: Volt / meter */
+                        // static constexpr float_64 AMPLITUDE_SI = _A0 * UNITCONV_A0_to_Amplitude_SI;
+
+                        /** unit: Volt / meter */
+                        static constexpr float_64 AMPLITUDE_SI = 1.738e13;
+
+                        /** Pulse length: sigma of std. gauss for intensity (E^2)
+                         *  PULSE_LENGTH_SI = FWHM_of_Intensity   / [ 2*sqrt{ 2* ln(2) } ]
+                         *                                          [    2.354820045     ]
+                         *  Info:             FWHM_of_Intensity = FWHM_Illumination
+                         *                      = what an experimentalist calls "pulse duration"
+                         *
+                         *  unit: seconds (1 sigma) */
+                        static constexpr float_64 PULSE_LENGTH_SI = 10.615e-15 / 4.0;
+
+                        /** beam waist: distance from the axis where the pulse intensity (E^2)
+                         *              decreases to its 1/e^2-th part,
+                         *              at the focus position of the laser
+                         * W0_SI = FWHM_of_Intensity / sqrt{ 2* ln(2) }
+                         *                             [   1.17741    ]
+                         *
+                         *  unit: meter */
+                        static constexpr float_64 W0_SI = 5.0e-6 / 1.17741;
+
+                        /** the distance to the laser focus in y-direction
+                         *  unit: meter */
+                        static constexpr float_64 FOCUS_POS_SI = 4.62e-5;
+
+                        /** the tilt angle between laser propagation in y-direction and laser axis in
+                         *  x-direction (0 degree == no tilt)
+                         *  unit: degree */
+                        static constexpr float_64 TILT_X_SI = 0.0;
+
+                        /** The laser pulse will be initialized for PULSE_INIT times the PULSE_LENGTH
+                         *
+                         *  unit: none */
+                        static constexpr float_64 PULSE_INIT = 20.0;
+
+                        /** cell from top where the laser is initialized
+                         *
+                         * if `initPlaneY == 0` then the absorbers are disabled.
+                         * if `initPlaneY > absorber cells negative Y` the negative absorber in y
+                         * direction is enabled
+                         *
+                         * valid ranges:
+                         *   - initPlaneY == 0
+                         *   - absorber cells negative Y < initPlaneY < cells in y direction of the top gpu
+                         */
+                        static constexpr uint32_t initPlaneY = 0;
+
+                        /** laser phase shift (no shift: 0.0)
+                         *
+                         * sin(omega*time + laser_phase): starts with phase=0 at center --> E-field=0 at center
+                         *
+                         * unit: rad, periodic in 2*pi
+                         */
+                        static constexpr float_X LASER_PHASE = 0.0;
+
+                        //! Available polarisation types
+                        enum PolarisationType
+                        {
+                            LINEAR_X = 1u,
+                            LINEAR_Z = 2u,
+                            CIRCULAR = 4u,
+                        };
+
+                        /** Polarisation selection, one of the PolarisationType values
+                         */
+                        static constexpr PolarisationType Polarisation = LINEAR_X;
+                    };
+                } // namespace defaults
+            } // namespace pulseFrontTilt
+
+            /** Gaussian Beam laser profile with tilted pulse front
+             *
+             * @tparam T_Params class parameter to configure the Gaussian Beam with
+             *                  pulse front tilt, see members of
+             *                  pulseFrontTilt::defaults::PulseFrontTiltParam for
+             *                  required members
+             */
+            template<typename T_Params = pulseFrontTilt::defaults::PulseFrontTiltParam>
+            struct PulseFrontTilt;
+
+        } // namespace laserProfiles
+    } // namespace fields
 } // namespace picongpu
diff --git a/include/picongpu/fields/laserProfiles/PulseFrontTilt.hpp b/include/picongpu/fields/laserProfiles/PulseFrontTilt.hpp
index 5e33222657..f2214f11ca 100644
--- a/include/picongpu/fields/laserProfiles/PulseFrontTilt.hpp
+++ b/include/picongpu/fields/laserProfiles/PulseFrontTilt.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Anton Helm, Heiko Burau, Rene Widera, Richard Pausch,
+/* Copyright 2013-2021 Anton Helm, Heiko Burau, Rene Widera, Richard Pausch,
  *                     Axel Huebl, Alexander Debus
  *
  * This file is part of PIConGPU.
@@ -28,281 +28,300 @@
 
 namespace picongpu
 {
-namespace fields
-{
-namespace laserProfiles
-{
-namespace pulseFrontTilt
-{
-    template< typename T_Params >
-    struct Unitless : public T_Params
-    {
-        using Params = T_Params;
-
-        static constexpr float_X WAVE_LENGTH = float_X( Params::WAVE_LENGTH_SI / UNIT_LENGTH ); // unit: meter
-        static constexpr float_X PULSE_LENGTH = float_X( Params::PULSE_LENGTH_SI / UNIT_TIME ); // unit: seconds (1 sigma)
-        static constexpr float_X AMPLITUDE = float_X( Params::AMPLITUDE_SI / UNIT_EFIELD ); // unit: Volt /meter
-        static constexpr float_X W0 = float_X( Params::W0_SI / UNIT_LENGTH ); // unit: meter
-        static constexpr float_X FOCUS_POS = float_X( Params::FOCUS_POS_SI / UNIT_LENGTH ); // unit: meter
-        static constexpr float_X INIT_TIME = float_X( Params::PULSE_INIT * Params::PULSE_LENGTH_SI / UNIT_TIME ); // unit: seconds (full initialization length)
-        static constexpr float_X TILT_X = float_X( Params::TILT_X_SI * PI / 180. ); // unit: radiant (in dimensions of pi)
-
-        /* initialize the laser not in the first cell is equal to a negative shift
-         * in time
-         */
-        static constexpr float_X laserTimeShift = Params::initPlaneY * CELL_HEIGHT / SPEED_OF_LIGHT;
-
-        static constexpr float_64 f = SPEED_OF_LIGHT / WAVE_LENGTH;
-
-    };
-} // namespace pulseFrontTilt
-
-namespace acc
-{
-    template< typename T_Unitless >
-    struct PulseFrontTilt : public T_Unitless
-    {
-        using Unitless = T_Unitless;
-
-        float3_X m_elong;
-        float_X m_phase;
-        typename FieldE::DataBoxType m_dataBoxE;
-        DataSpace< simDim > m_offsetToTotalDomain;
-        DataSpace< simDim > m_superCellToLocalOriginCellOffset;
-
-        /** Device-Side Constructor
-         *
-         * @param superCellToLocalOriginCellOffset local offset in cells to current supercell
-         * @param offsetToTotalDomain offset to origin of global (@todo: total) coordinate system (possibly after transform to centered origin)
-         */
-        HDINLINE PulseFrontTilt(
-            typename FieldE::DataBoxType const & dataBoxE,
-            DataSpace< simDim > const & superCellToLocalOriginCellOffset,
-            DataSpace< simDim > const & offsetToTotalDomain,
-            float3_X const & elong,
-            float_X const phase
-        ) :
-            m_elong( elong ),
-            m_phase( phase ),
-            m_dataBoxE( dataBoxE ),
-            m_offsetToTotalDomain( offsetToTotalDomain ),
-            m_superCellToLocalOriginCellOffset( superCellToLocalOriginCellOffset )
-        {
-        }
-
-        /** device side manipulation for init plane (transversal)
-         *
-         * @tparam T_Args type of the arguments passed to the user manipulator functor
-         *
-         * @param cellIndexInSuperCell ND cell index in current supercell
-         */
-        template< typename T_Acc >
-        HDINLINE
-        void operator( )(
-            T_Acc const &,
-            DataSpace< simDim > const & cellIndexInSuperCell
-        )
-        {
-            // coordinate system to global simulation as origin
-            DataSpace< simDim > const localCell(
-                cellIndexInSuperCell +
-                m_superCellToLocalOriginCellOffset
-            );
-
-            // transform coordinate system to center of x-z plane of initialization
-            constexpr uint8_t planeNormalDir = 1u;
-            DataSpace< simDim > offsetToCenterOfPlane( m_offsetToTotalDomain );
-            offsetToCenterOfPlane[ planeNormalDir ] = 0; // do not shift origin of plane normal
-            floatD_X const pos = precisionCast< float_X >( localCell + offsetToCenterOfPlane ) * cellSize.shrink< simDim >();
-            // @todo add half-cells via traits::FieldPosition< Solver::NumicalCellType, FieldE >()
-
-            // calculate focus position relative to the laser initialization plane
-            float_X const focusPos = Unitless::FOCUS_POS - pos.y();
-
-            float_X const timeShift = m_phase / ( 2.0_X * float_X( PI ) * float_X( Unitless::f ) ) + focusPos / SPEED_OF_LIGHT;
-            float_X const local_tilt_x = Unitless::TILT_X;
-            float_X const spaceShift_x = SPEED_OF_LIGHT * algorithms::math::tan( local_tilt_x ) * timeShift / cellSize.y();
-
-            // transversal position only
-            // floatD_X planeNoNormal = floatD_X::create( 1.0 );
-            // planeNoNormal[ planeNormalDir ] = 0.0;
-            // Gaussian Beam with zero tilt:
-            //            r2 = math::abs2( pos * planeNoNormal );
-            auto const spaceShift = float3_X( spaceShift_x, 0., 0. ).shrink< simDim >().remove< planeNormalDir >();
-            auto const pos_trans( pos.remove< planeNormalDir >() );
-
-            float_X const r2 = math::abs2( pos_trans + spaceShift );
-
-            // rayleigh length (in y-direction)
-            float_X const y_R = float_X( PI ) * Unitless::W0 * Unitless::W0 / Unitless::WAVE_LENGTH;
-
-            // inverse radius of curvature of the beam's  wavefronts
-            float_X const R_y_inv = -focusPos / ( y_R * y_R  + focusPos * focusPos);
-
-            // beam waist in the near field: w_y(y=0) == W0
-            float_X const w_y = Unitless::W0 * algorithms::math::sqrt( 1.0_X + ( focusPos / y_R )*( focusPos / y_R ) );
-            //! the Gouy phase shift
-            float_X const xi_y = algorithms::math::atan( -focusPos / y_R );
-
-            if( Unitless::Polarisation == Unitless::LINEAR_X || Unitless::Polarisation == Unitless::LINEAR_Z )
-            {
-                m_elong *= math::exp( -r2 / w_y / w_y ) * math::cos( 2.0_X * float_X( PI ) / Unitless::WAVE_LENGTH * focusPos - 2.0_X * float_X( PI ) / Unitless::WAVE_LENGTH * r2 / 2.0_X * R_y_inv + xi_y + m_phase )
-                    * math::exp( -( r2 / 2.0_X * R_y_inv - focusPos - m_phase / 2.0_X / float_X( PI ) * Unitless::WAVE_LENGTH )
-                          *( r2 / 2.0_X * R_y_inv - focusPos - m_phase / 2.0_X / float_X( PI ) * Unitless::WAVE_LENGTH )
-                          / SPEED_OF_LIGHT / SPEED_OF_LIGHT / ( 2.0_X * Unitless::PULSE_LENGTH ) / ( 2.0_X * Unitless::PULSE_LENGTH ) );
-            }
-            else if( Unitless::Polarisation == Unitless::CIRCULAR )
-            {
-                m_elong.x() *= math::exp( -r2 / w_y / w_y ) * math::cos( 2.0_X * float_X( PI ) / Unitless::WAVE_LENGTH * focusPos - 2.0_X * float_X( PI ) / Unitless::WAVE_LENGTH * r2 / 2.0_X * R_y_inv + xi_y + m_phase )
-                    * math::exp( -( r2 / 2.0_X * R_y_inv - focusPos - m_phase / 2.0_X / float_X( PI ) * Unitless::WAVE_LENGTH )
-                          *( r2 / 2.0_X * R_y_inv - focusPos - m_phase / 2.0_X / float_X( PI ) * Unitless::WAVE_LENGTH )
-                          / SPEED_OF_LIGHT / SPEED_OF_LIGHT / ( 2.0_X * Unitless::PULSE_LENGTH ) / ( 2.0_X * Unitless::PULSE_LENGTH ) );
-                m_phase += float_X( PI ) / 2.0_X;
-                m_elong.z() *= math::exp( -r2 / w_y / w_y ) * math::cos( 2.0_X * float_X( PI ) / Unitless::WAVE_LENGTH * focusPos - 2.0_X * float_X( PI ) / Unitless::WAVE_LENGTH * r2 / 2.0_X * R_y_inv + xi_y + m_phase )
-                    * math::exp( -( r2 / 2.0_X * R_y_inv - focusPos - m_phase / 2.0_X / float_X( PI ) * Unitless::WAVE_LENGTH )
-                          *( r2 / 2.0_X * R_y_inv - focusPos - m_phase / 2.0_X / float_X( PI ) * Unitless::Unitless::WAVE_LENGTH )
-                          / SPEED_OF_LIGHT / SPEED_OF_LIGHT / ( 2.0_X * Unitless::PULSE_LENGTH ) / ( 2.0_X * Unitless::PULSE_LENGTH ) );
-                // reminder: if you want to use phase below, substract pi/2
-                // m_phase -= float_X( PI ) / 2.0_X;
-            }
-
-            if( Unitless::initPlaneY != 0 ) // compile time if
-            {
-                /* If the laser is not initialized in the first cell we emit a
-                 * negatively and positively propagating wave. Therefore we need to multiply the
-                 * amplitude with a correction factor depending of the cell size in
-                 * propagation direction.
-                 * The negatively propagating wave is damped by the absorber.
-                 *
-                 * The `correctionFactor` assume that the wave is moving in y direction.
-                 */
-                auto const correctionFactor = ( SPEED_OF_LIGHT * DELTA_T ) / CELL_HEIGHT * 2._X;
-
-                // jump over the guard of the electric field
-                m_dataBoxE( localCell + SuperCellSize::toRT() * GuardSize::toRT() ) +=  correctionFactor * m_elong;
-            }
-            else
-            {
-                // jump over the guard of the electric field
-                m_dataBoxE( localCell + SuperCellSize::toRT() * GuardSize::toRT() ) = m_elong;
-            }
-        }
-    };
-} // namespace acc
-
-    template< typename T_Params >
-    struct PulseFrontTilt : public pulseFrontTilt::Unitless< T_Params >
+    namespace fields
     {
-        using Unitless = pulseFrontTilt::Unitless< T_Params >;
-
-        float3_X elong;
-        float_X phase;
-        typename FieldE::DataBoxType dataBoxE;
-        DataSpace< simDim > offsetToTotalDomain;
-
-        /** constructor
-         *
-         * @param currentStep current simulation time step
-         */
-        HINLINE PulseFrontTilt( uint32_t currentStep )
+        namespace laserProfiles
         {
-            // get data
-            DataConnector & dc = Environment< >::get( ).DataConnector( );
-            dataBoxE = dc.get< FieldE >(
-                FieldE::getName(),
-                true
-            )->getDeviceDataBox();
-
-            // get meta data for offsets
-            SubGrid< simDim > const & subGrid = Environment< simDim >::get().SubGrid();
-            // const DataSpace< simDim > totalCellOffset( subGrid.getGlobalDomain().offset );
-            DataSpace< simDim > const globalCellOffset( subGrid.getLocalDomain().offset );
-            DataSpace< simDim > const halfSimSize( subGrid.getGlobalDomain().size / 2 );
-
-            // transform coordinate system to center of global simulation as origin [cells]
-            offsetToTotalDomain = /* totalCellOffset + */ globalCellOffset - halfSimSize;
-
-            // @todo reset origin of direction of moving window
-            // offsetToTotalDomain.y() = 0
-
-            float_64 const runTime = DELTA_T * currentStep - Unitless::laserTimeShift;
-
-            // calculate focus position relative to the laser initialization plane
-            float_X const focusPos = Unitless::FOCUS_POS - Unitless::initPlaneY * CELL_HEIGHT;
-
-            elong = float3_X::create( 0.0 );
-
-            // a symmetric pulse will be initialized at position z=0 for
-            // a time of PULSE_INIT * PULSE_LENGTH = INIT_TIME.
-            // we shift the complete pulse for the half of this time to start with
-            // the front of the laser pulse.
-            constexpr float_64 mue = 0.5 * Unitless::INIT_TIME;
-
-            // rayleigh length (in y-direction)
-            constexpr float_64 y_R = PI * Unitless::W0 * Unitless::W0 / Unitless::WAVE_LENGTH;
-            // gaussian beam waist in the nearfield: w_y(y=0) == W0
-            float_64 const w_y = Unitless::W0 * math::sqrt( 1.0 + ( focusPos / y_R )*( focusPos / y_R ) );
-
-            float_64 envelope = float_64( Unitless::AMPLITUDE );
-            if( simDim == DIM2 )
-                envelope *= math::sqrt( float_64( Unitless::W0 ) / w_y );
-            else if( simDim == DIM3 )
-                envelope *= float_64( Unitless::W0 ) / w_y;
-            /* no 1D representation/implementation */
-
-            if( Unitless::Polarisation == Unitless::LINEAR_X )
+            namespace pulseFrontTilt
             {
-                elong.x() = float_X( envelope );
-            }
-            else if( Unitless::Polarisation == Unitless::LINEAR_Z )
+                /** laser parameters converted for use inside the simulation
+                 *
+                 * Each `*_SI` member of the parameter class is divided by the matching `UNIT_*`
+                 * constant and cast to float_X.
+                 *
+                 * @tparam T_Params parameter class, see pulseFrontTilt::defaults::PulseFrontTiltParam
+                 */
+                template<typename T_Params>
+                struct Unitless : public T_Params
+                {
+                    using Params = T_Params;
+
+                    static constexpr float_X WAVE_LENGTH
+                        = float_X(Params::WAVE_LENGTH_SI / UNIT_LENGTH); // SI input unit: meter
+                    static constexpr float_X PULSE_LENGTH
+                        = float_X(Params::PULSE_LENGTH_SI / UNIT_TIME); // SI input unit: seconds (1 sigma)
+                    static constexpr float_X AMPLITUDE
+                        = float_X(Params::AMPLITUDE_SI / UNIT_EFIELD); // SI input unit: Volt / meter
+                    static constexpr float_X W0 = float_X(Params::W0_SI / UNIT_LENGTH); // SI input unit: meter
+                    static constexpr float_X FOCUS_POS = float_X(Params::FOCUS_POS_SI / UNIT_LENGTH); // SI input: meter
+                    static constexpr float_X INIT_TIME = float_X(
+                        Params::PULSE_INIT * Params::PULSE_LENGTH_SI
+                        / UNIT_TIME); // SI input unit: seconds (full initialization length)
+                    static constexpr float_X TILT_X
+                        = float_X(Params::TILT_X_SI * PI / 180.); // unit: radian (converted from degree)
+
+                    /* initializing the laser not in the first cell is equal to a negative shift
+                     * in time
+                     */
+                    static constexpr float_X laserTimeShift = Params::initPlaneY * CELL_HEIGHT / SPEED_OF_LIGHT;
+
+                    //! laser frequency: SPEED_OF_LIGHT / WAVE_LENGTH
+                    static constexpr float_64 f = SPEED_OF_LIGHT / WAVE_LENGTH;
+                };
+            } // namespace pulseFrontTilt
+
+            namespace acc
             {
-                elong.z() = float_X( envelope );
-            }
-            else if( Unitless::Polarisation == Unitless::CIRCULAR )
+                template<typename T_Unitless>
+                struct PulseFrontTilt : public T_Unitless
+                {
+                    using Unitless = T_Unitless;
+
+                    float3_X m_elong;
+                    float_X m_phase;
+                    typename FieldE::DataBoxType m_dataBoxE;
+                    DataSpace<simDim> m_offsetToTotalDomain;
+                    DataSpace<simDim> m_superCellToLocalOriginCellOffset;
+
+                    /** Device-side constructor: copies every argument into the matching m_ member
+                     *
+                     * @param superCellToLocalOriginCellOffset local offset in cells to current supercell
+                     * @param offsetToTotalDomain offset to origin of global (@todo: total) coordinate system (possibly
+                     * after transform to centered origin)
+                     */
+                    HDINLINE PulseFrontTilt(
+                        typename FieldE::DataBoxType const& dataBoxE,
+                        DataSpace<simDim> const& superCellToLocalOriginCellOffset,
+                        DataSpace<simDim> const& offsetToTotalDomain,
+                        float3_X const& elong,
+                        float_X const phase)
+                        : m_elong(elong)
+                        , m_phase(phase)
+                        , m_dataBoxE(dataBoxE)
+                        , m_offsetToTotalDomain(offsetToTotalDomain)
+                        , m_superCellToLocalOriginCellOffset(superCellToLocalOriginCellOffset)
+                    {
+                    }
+
+                    /** device side manipulation for init plane (transversal)
+                     *
+                     * @tparam T_Acc alpaka accelerator type
+                     *
+                     * @param cellIndexInSuperCell ND cell index in current supercell
+                     */
+                    template<typename T_Acc>
+                    HDINLINE void operator()(T_Acc const&, DataSpace<simDim> const& cellIndexInSuperCell)
+                    {
+                        // coordinate system to global simulation as origin
+                        DataSpace<simDim> const localCell(cellIndexInSuperCell + m_superCellToLocalOriginCellOffset);
+
+                        // transform coordinate system to center of x-z plane of initialization
+                        constexpr uint8_t planeNormalDir = 1u;
+                        DataSpace<simDim> offsetToCenterOfPlane(m_offsetToTotalDomain);
+                        offsetToCenterOfPlane[planeNormalDir] = 0; // do not shift origin of plane normal
+                        floatD_X const pos
+                            = precisionCast<float_X>(localCell + offsetToCenterOfPlane) * cellSize.shrink<simDim>();
+                        // @todo add half-cells via traits::FieldPosition< Solver::NumicalCellType, FieldE >()
+
+                        // calculate focus position relative to the laser initialization plane
+                        float_X const focusPos = Unitless::FOCUS_POS - pos.y();
+
+                        float_X const timeShift
+                            = m_phase / (2.0_X * float_X(PI) * float_X(Unitless::f)) + focusPos / SPEED_OF_LIGHT;
+                        float_X const local_tilt_x = Unitless::TILT_X;
+                        float_X const spaceShift_x
+                            = SPEED_OF_LIGHT * math::tan(local_tilt_x) * timeShift / cellSize.y();
+
+                        // transversal position only
+                        // floatD_X planeNoNormal = floatD_X::create( 1.0 );
+                        // planeNoNormal[ planeNormalDir ] = 0.0;
+                        // Gaussian Beam with zero tilt:
+                        //            r2 = pmacc::math::abs2( pos * planeNoNormal );
+                        auto const spaceShift
+                            = float3_X(spaceShift_x, 0., 0.).shrink<simDim>().remove<planeNormalDir>();
+                        auto const pos_trans(pos.remove<planeNormalDir>());
+
+                        float_X const r2 = pmacc::math::abs2(pos_trans + spaceShift);
+
+                        // rayleigh length (in y-direction)
+                        float_X const y_R = float_X(PI) * Unitless::W0 * Unitless::W0 / Unitless::WAVE_LENGTH;
+
+                        // inverse radius of curvature of the beam's wavefronts
+                        float_X const R_y_inv = -focusPos / (y_R * y_R + focusPos * focusPos);
+
+                        // beam waist in the near field: w_y(y=0) == W0
+                        float_X const w_y = Unitless::W0 * math::sqrt(1.0_X + (focusPos / y_R) * (focusPos / y_R));
+                        //! the Gouy phase shift
+                        float_X const xi_y = math::atan(-focusPos / y_R);
+
+                        if(Unitless::Polarisation == Unitless::LINEAR_X
+                           || Unitless::Polarisation == Unitless::LINEAR_Z)
+                        {
+                            m_elong *= math::exp(-r2 / w_y / w_y)
+                                * math::cos(
+                                           2.0_X * float_X(PI) / Unitless::WAVE_LENGTH * focusPos
+                                           - 2.0_X * float_X(PI) / Unitless::WAVE_LENGTH * r2 / 2.0_X * R_y_inv + xi_y
+                                           + m_phase)
+                                * math::exp(
+                                           -(r2 / 2.0_X * R_y_inv - focusPos
+                                             - m_phase / 2.0_X / float_X(PI) * Unitless::WAVE_LENGTH)
+                                           * (r2 / 2.0_X * R_y_inv - focusPos
+                                              - m_phase / 2.0_X / float_X(PI) * Unitless::WAVE_LENGTH)
+                                           / SPEED_OF_LIGHT / SPEED_OF_LIGHT / (2.0_X * Unitless::PULSE_LENGTH)
+                                           / (2.0_X * Unitless::PULSE_LENGTH));
+                        }
+                        else if(Unitless::Polarisation == Unitless::CIRCULAR)
+                        {
+                            m_elong.x() *= math::exp(-r2 / w_y / w_y)
+                                * math::cos(2.0_X * float_X(PI) / Unitless::WAVE_LENGTH * focusPos
+                                            - 2.0_X * float_X(PI) / Unitless::WAVE_LENGTH * r2 / 2.0_X * R_y_inv + xi_y
+                                            + m_phase)
+                                * math::exp(-(r2 / 2.0_X * R_y_inv - focusPos
+                                              - m_phase / 2.0_X / float_X(PI) * Unitless::WAVE_LENGTH)
+                                            * (r2 / 2.0_X * R_y_inv - focusPos
+                                               - m_phase / 2.0_X / float_X(PI) * Unitless::WAVE_LENGTH)
+                                            / SPEED_OF_LIGHT / SPEED_OF_LIGHT / (2.0_X * Unitless::PULSE_LENGTH)
+                                            / (2.0_X * Unitless::PULSE_LENGTH));
+                            m_phase += float_X(PI) / 2.0_X;
+                            m_elong.z() *= math::exp(-r2 / w_y / w_y)
+                                * math::cos(2.0_X * float_X(PI) / Unitless::WAVE_LENGTH * focusPos
+                                            - 2.0_X * float_X(PI) / Unitless::WAVE_LENGTH * r2 / 2.0_X * R_y_inv + xi_y
+                                            + m_phase)
+                                * math::exp(-(r2 / 2.0_X * R_y_inv - focusPos
+                                              - m_phase / 2.0_X / float_X(PI) * Unitless::WAVE_LENGTH)
+                                            * (r2 / 2.0_X * R_y_inv - focusPos
+                                               - m_phase / 2.0_X / float_X(PI) * Unitless::WAVE_LENGTH)
+                                            / SPEED_OF_LIGHT / SPEED_OF_LIGHT / (2.0_X * Unitless::PULSE_LENGTH)
+                                            / (2.0_X * Unitless::PULSE_LENGTH));
+                            // reminder: if you want to use phase below, subtract pi/2
+                            // m_phase -= float_X( PI ) / 2.0_X;
+                        }
+
+                        if(Unitless::initPlaneY != 0) // compile time if
+                        {
+                            /* If the laser is not initialized in the first cell we emit a
+                             * negatively and positively propagating wave. Therefore we need to multiply the
+                             * amplitude with a correction factor depending on the cell size in
+                             * propagation direction.
+                             * The negatively propagating wave is damped by the absorber.
+                             *
+                             * The `correctionFactor` assumes that the wave is moving in y direction.
+                             */
+                            auto const correctionFactor = (SPEED_OF_LIGHT * DELTA_T) / CELL_HEIGHT * 2._X;
+
+                            // jump over the guard of the electric field
+                            m_dataBoxE(localCell + SuperCellSize::toRT() * GuardSize::toRT())
+                                += correctionFactor * m_elong;
+                        }
+                        else
+                        {
+                            // jump over the guard of the electric field
+                            m_dataBoxE(localCell + SuperCellSize::toRT() * GuardSize::toRT()) = m_elong;
+                        }
+                    }
+                };
+            } // namespace acc
+
+            template<typename T_Params>
+            struct PulseFrontTilt : public pulseFrontTilt::Unitless<T_Params>
             {
-                elong.x() = float_X( envelope ) / math::sqrt( 2.0_X );
-                elong.z() = float_X( envelope ) / math::sqrt( 2.0_X );
-            }
-
-            phase = 2.0_X * float_X( PI ) * float_X( Unitless::f ) * ( runTime - float_X( mue ) - focusPos / SPEED_OF_LIGHT ) + Unitless::LASER_PHASE;
-        }
-
-        /** create device manipulator functor
-         *
-         * @tparam T_WorkerCfg pmacc::mappings::threads::WorkerCfg, configuration of the worker
-         * @tparam T_Acc alpaka accelerator type
-         *
-         * @param alpaka accelerator
-         * @param localSupercellOffset (in supercells, without guards) to the
-         *        origin of the local domain
-         * @param configuration of the worker
-         */
-        template<
-            typename T_WorkerCfg,
-            typename T_Acc
-        >
-        HDINLINE acc::PulseFrontTilt< Unitless >
-        operator()(
-            T_Acc const &,
-            DataSpace< simDim > const & localSupercellOffset,
-            T_WorkerCfg const &
-        ) const
-        {
-            auto const superCellToLocalOriginCellOffset = localSupercellOffset * SuperCellSize::toRT();
-            return acc::PulseFrontTilt< Unitless >( dataBoxE, superCellToLocalOriginCellOffset, offsetToTotalDomain, elong, phase );
-        }
-
-        //! get the name of the laser profile
-        static
-        HINLINE std::string
-        getName( )
-        {
-            return "PulseFrontTilt";
-        }
+                using Unitless = pulseFrontTilt::Unitless<T_Params>;
 
-    };
+                float3_X elong;
+                float_X phase;
+                typename FieldE::DataBoxType dataBoxE;
+                DataSpace<simDim> offsetToTotalDomain;
 
-} // namespace laserProfiles
-} // namespace fields
+                /** constructor: computes elong, phase and offsetToTotalDomain for the given time step
+                 *
+                 * @param currentStep current simulation time step
+                 */
+                HINLINE PulseFrontTilt(uint32_t currentStep)
+                {
+                    // fetch the device data box of FieldE
+                    DataConnector& dc = Environment<>::get().DataConnector();
+                    dataBoxE = dc.get<FieldE>(FieldE::getName(), true)->getDeviceDataBox();
+
+                    // sub-grid information used to compute the cell offsets
+                    SubGrid<simDim> const& subGrid = Environment<simDim>::get().SubGrid();
+                    // const DataSpace< simDim > totalCellOffset( subGrid.getGlobalDomain().offset );
+                    DataSpace<simDim> const globalCellOffset(subGrid.getLocalDomain().offset);
+                    DataSpace<simDim> const halfSimSize(subGrid.getGlobalDomain().size / 2);
+
+                    // transform coordinate system to center of global simulation as origin [cells]
+                    offsetToTotalDomain = /* totalCellOffset + */ globalCellOffset - halfSimSize;
+
+                    // @todo reset origin of direction of moving window
+                    // offsetToTotalDomain.y() = 0
+
+                    float_64 const runTime = DELTA_T * currentStep - Unitless::laserTimeShift;
+
+                    // calculate focus position relative to the laser initialization plane
+                    float_X const focusPos = Unitless::FOCUS_POS - Unitless::initPlaneY * CELL_HEIGHT;
+
+                    elong = float3_X::create(0.0);
+
+                    // a symmetric pulse will be initialized at position z=0 for
+                    // a time of PULSE_INIT * PULSE_LENGTH = INIT_TIME.
+                    // we shift the complete pulse by half of this time to start with
+                    // the front of the laser pulse.
+                    constexpr float_64 mue = 0.5 * Unitless::INIT_TIME;
+
+                    // rayleigh length (in y-direction)
+                    constexpr float_64 y_R = PI * Unitless::W0 * Unitless::W0 / Unitless::WAVE_LENGTH;
+                    // gaussian beam waist in the nearfield: w_y(y=0) == W0
+                    float_64 const w_y = Unitless::W0 * math::sqrt(1.0 + (focusPos / y_R) * (focusPos / y_R));
+
+                    float_64 envelope = float_64(Unitless::AMPLITUDE);
+                    if(simDim == DIM2)
+                        envelope *= math::sqrt(float_64(Unitless::W0) / w_y);
+                    else if(simDim == DIM3)
+                        envelope *= float_64(Unitless::W0) / w_y;
+                    /* no 1D representation/implementation */
+
+                    if(Unitless::Polarisation == Unitless::LINEAR_X)
+                    {
+                        elong.x() = float_X(envelope);
+                    }
+                    else if(Unitless::Polarisation == Unitless::LINEAR_Z)
+                    {
+                        elong.z() = float_X(envelope);
+                    }
+                    else if(Unitless::Polarisation == Unitless::CIRCULAR)
+                    {
+                        elong.x() = float_X(envelope) / math::sqrt(2.0_X);
+                        elong.z() = float_X(envelope) / math::sqrt(2.0_X);
+                    }
+
+                    phase = 2.0_X * float_X(PI) * float_X(Unitless::f)
+                            * (runTime - float_X(mue) - focusPos / SPEED_OF_LIGHT)
+                        + Unitless::LASER_PHASE;
+                }
+
+                /** create device manipulator functor
+                 *
+                 * @tparam T_WorkerCfg pmacc::mappings::threads::WorkerCfg, configuration of the worker
+                 * @tparam T_Acc alpaka accelerator type
+                 *
+                 * @param (unnamed) alpaka accelerator, not used in the body
+                 * @param localSupercellOffset offset (in supercells, without guards) to the
+                 *        origin of the local domain
+                 * @param (unnamed) configuration of the worker, not used in the body
+                 */
+                template<typename T_WorkerCfg, typename T_Acc>
+                HDINLINE acc::PulseFrontTilt<Unitless> operator()(
+                    T_Acc const&,
+                    DataSpace<simDim> const& localSupercellOffset,
+                    T_WorkerCfg const&) const
+                {
+                    auto const superCellToLocalOriginCellOffset = localSupercellOffset * SuperCellSize::toRT();
+                    return acc::PulseFrontTilt<Unitless>(
+                        dataBoxE,
+                        superCellToLocalOriginCellOffset,
+                        offsetToTotalDomain,
+                        elong,
+                        phase);
+                }
+
+                //! @return the name of this laser profile ("PulseFrontTilt")
+                static HINLINE std::string getName()
+                {
+                    return "PulseFrontTilt";
+                }
+            };
+
+        } // namespace laserProfiles
+    } // namespace fields
 } // namespace picongpu
-
diff --git a/include/picongpu/fields/laserProfiles/Wavepacket.def b/include/picongpu/fields/laserProfiles/Wavepacket.def
index 192b69fce6..ac13a4d878 100644
--- a/include/picongpu/fields/laserProfiles/Wavepacket.def
+++ b/include/picongpu/fields/laserProfiles/Wavepacket.def
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Heiko Burau, Rene Widera, Richard Pausch,
+/* Copyright 2013-2021 Axel Huebl, Heiko Burau, Rene Widera, Richard Pausch,
  *                     Stefan Tietze
  *
  * This file is part of PIConGPU.
@@ -25,106 +25,109 @@
 
 namespace picongpu
 {
-namespace fields
-{
-namespace laserProfiles
-{
-namespace wavepacket
-{
-namespace defaults
-{
-    struct WavepacketParam
+    namespace fields
     {
-        /** unit: meter */
-        static constexpr float_64 WAVE_LENGTH_SI = 0.8e-6;
-
-        /** Convert the normalized laser strength parameter a0 to Volt per meter */
-        static constexpr float_64 UNITCONV_A0_to_Amplitude_SI = -2.0 * PI / WAVE_LENGTH_SI * ::picongpu::SI::ELECTRON_MASS_SI * ::picongpu::SI::SPEED_OF_LIGHT_SI * ::picongpu::SI::SPEED_OF_LIGHT_SI / ::picongpu::SI::ELECTRON_CHARGE_SI;
-
-        /** unit: W / m^2 */
-        // calculate: _A0 = 8.549297e-6 * sqrt( Intensity[W/m^2] ) * wavelength[m] (linearly polarized)
-
-        /** unit: none */
-        //static constexpr float_64 _A0  = 1.5;
-
-        /** unit: Volt / meter */
-        //static constexpr float_64 AMPLITUDE_SI = _A0 * UNITCONV_A0_to_Amplitude_SI;
-
-        /** unit: Volt / meter */
-        static constexpr float_64 AMPLITUDE_SI = 1.738e13;
-
-        /** Stretch temporal profile by a constant plateau between the up and downramp
-         *  unit: seconds */
-        static constexpr float_64 LASER_NOFOCUS_CONSTANT_SI = 7.0 * WAVE_LENGTH_SI / ::picongpu::SI::SPEED_OF_LIGHT_SI;
-
-        /** Pulse length: sigma of std. gauss for intensity (E^2)
-         *  PULSE_LENGTH_SI = FWHM_of_Intensity   / [ 2*sqrt{ 2* ln(2) } ]
-         *                                          [    2.354820045     ]
-         *  Info:             FWHM_of_Intensity = FWHM_Illumination
-         *                      = what a experimentalist calls "pulse duration"
-         *
-         *  unit: seconds (1 sigma) */
-        static constexpr float_64 PULSE_LENGTH_SI = 10.615e-15 / 4.0;
-
-        /** beam waist: distance from the axis where the pulse intensity (E^2)
-         *              decreases to its 1/e^2-th part,
-         *              at the focus position of the laser
-         * W0_SI = FWHM_of_Intensity / sqrt{ 2* ln(2) }
-         *                             [   1.17741    ]
-         *
-         *  unit: meter */
-        static constexpr float_64 W0_X_SI = 4.246e-6;
-        static constexpr float_64 W0_Z_SI = W0_X_SI;
-
-        /** The laser pulse will be initialized PULSE_INIT times of the PULSE_LENGTH
-         *
-         *  unit: none */
-        static constexpr float_64 PULSE_INIT = 20.0;
-
-        /** cell from top where the laser is initialized
-         *
-         * if `initPlaneY == 0` than the absorber are disabled.
-         * if `initPlaneY > absorbercells negative Y` the negative absorber in y
-         * direction is enabled
-         *
-         * valid ranges:
-         *   - initPlaneY == 0
-         *   - absorber cells negative Y < initPlaneY < cells in y direction of the top gpu
-         */
-        static constexpr uint32_t initPlaneY = 0;
-
-        /** laser phase shift (no shift: 0.0)
-         *
-         * sin(omega*time + laser_phase): starts with phase=0 at center --> E-field=0 at center
-         *
-         * unit: rad, periodic in 2*pi
-         */
-        static constexpr float_X LASER_PHASE = 0.0;
-
-        /** Available polarisation types
-         */
-        enum PolarisationType
+        namespace laserProfiles
         {
-            LINEAR_X = 1u,
-            LINEAR_Z = 2u,
-            CIRCULAR = 4u,
-        };
-        /** Polarization selection
-         */
-        static constexpr PolarisationType Polarisation = LINEAR_X;
-    };
-} // namespace defaults
-} // namespace wavepacket
-
-    /** Wavepacket with Gaussian spatial and temporal envelope
-     *
-     * @tparam T_Params class parameter to configure the Wavepacket profile,
-     *                  see members of wavepacket::defaults::WavepacketParam for
-     *                  required members
-     */
-    template< typename T_Params = wavepacket::defaults::WavepacketParam >
-    struct Wavepacket;
-
-} // namespace laserProfiles
-} // namespace fields
+            namespace wavepacket
+            {
+                namespace defaults
+                {
+                    struct WavepacketParam
+                    {
+                        /** unit: meter */
+                        static constexpr float_64 WAVE_LENGTH_SI = 0.8e-6;
+
+                        /** Convert the normalized laser strength parameter a0 to Volt per meter */
+                        static constexpr float_64 UNITCONV_A0_to_Amplitude_SI = -2.0 * PI / WAVE_LENGTH_SI
+                            * ::picongpu::SI::ELECTRON_MASS_SI * ::picongpu::SI::SPEED_OF_LIGHT_SI
+                            * ::picongpu::SI::SPEED_OF_LIGHT_SI / ::picongpu::SI::ELECTRON_CHARGE_SI;
+
+                        /** unit: W / m^2 */
+                        // calculate: _A0 = 8.549297e-6 * sqrt( Intensity[W/m^2] ) * wavelength[m] (linearly polarized)
+
+                        /** unit: none */
+                        // static constexpr float_64 _A0  = 1.5;
+
+                        /** unit: Volt / meter */
+                        // static constexpr float_64 AMPLITUDE_SI = _A0 * UNITCONV_A0_to_Amplitude_SI;
+
+                        /** unit: Volt / meter */
+                        static constexpr float_64 AMPLITUDE_SI = 1.738e13;
+
+                        /** Stretch temporal profile by a constant plateau between the up and downramp
+                         *  unit: seconds */
+                        static constexpr float_64 LASER_NOFOCUS_CONSTANT_SI
+                            = 7.0 * WAVE_LENGTH_SI / ::picongpu::SI::SPEED_OF_LIGHT_SI;
+
+                        /** Pulse length: sigma of std. gauss for intensity (E^2)
+                         *  PULSE_LENGTH_SI = FWHM_of_Intensity   / [ 2*sqrt{ 2* ln(2) } ]
+                         *                                          [    2.354820045     ]
+                         *  Info:             FWHM_of_Intensity = FWHM_Illumination
+                         *                      = what an experimentalist calls "pulse duration"
+                         *
+                         *  unit: seconds (1 sigma) */
+                        static constexpr float_64 PULSE_LENGTH_SI = 10.615e-15 / 4.0;
+
+                        /** beam waist: distance from the axis where the pulse intensity (E^2)
+                         *              decreases to its 1/e^2-th part,
+                         *              at the focus position of the laser
+                         * W0_SI = FWHM_of_Intensity / sqrt{ 2* ln(2) }
+                         *                             [   1.17741    ]
+                         *
+                         *  unit: meter */
+                        static constexpr float_64 W0_X_SI = 4.246e-6;
+                        static constexpr float_64 W0_Z_SI = W0_X_SI;
+
+                        /** The laser pulse will be initialized for PULSE_INIT times the PULSE_LENGTH
+                         *
+                         *  unit: none */
+                        static constexpr float_64 PULSE_INIT = 20.0;
+
+                        /** cell from top where the laser is initialized
+                         *
+                         * if `initPlaneY == 0` then the absorbers are disabled.
+                         * if `initPlaneY > absorbercells negative Y` the negative absorber in y
+                         * direction is enabled
+                         *
+                         * valid ranges:
+                         *   - initPlaneY == 0
+                         *   - absorber cells negative Y < initPlaneY < cells in y direction of the top gpu
+                         */
+                        static constexpr uint32_t initPlaneY = 0;
+
+                        /** laser phase shift (no shift: 0.0)
+                         *
+                         * sin(omega*time + laser_phase): starts with phase=0 at center --> E-field=0 at center
+                         *
+                         * unit: rad, periodic in 2*pi
+                         */
+                        static constexpr float_X LASER_PHASE = 0.0;
+
+                        /** Available polarisation types
+                         */
+                        enum PolarisationType
+                        {
+                            LINEAR_X = 1u,
+                            LINEAR_Z = 2u,
+                            CIRCULAR = 4u,
+                        };
+                        /** Polarisation selection
+                         */
+                        static constexpr PolarisationType Polarisation = LINEAR_X;
+                    };
+                } // namespace defaults
+            } // namespace wavepacket
+
+            /** Wavepacket with Gaussian spatial and temporal envelope
+             *
+             * @tparam T_Params class parameter to configure the Wavepacket profile,
+             *                  see members of wavepacket::defaults::WavepacketParam for
+             *                  required members
+             */
+            template<typename T_Params = wavepacket::defaults::WavepacketParam>
+            struct Wavepacket;
+
+        } // namespace laserProfiles
+    } // namespace fields
 } // namespace picongpu
diff --git a/include/picongpu/fields/laserProfiles/Wavepacket.hpp b/include/picongpu/fields/laserProfiles/Wavepacket.hpp
index 0cfd747652..9f922216a1 100644
--- a/include/picongpu/fields/laserProfiles/Wavepacket.hpp
+++ b/include/picongpu/fields/laserProfiles/Wavepacket.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Heiko Burau, Rene Widera, Richard Pausch,
+/* Copyright 2013-2021 Axel Huebl, Heiko Burau, Rene Widera, Richard Pausch,
  *                     Stefan Tietze
  *
  * This file is part of PIConGPU.
@@ -28,249 +28,245 @@
 
 namespace picongpu
 {
-namespace fields
-{
-namespace laserProfiles
-{
-namespace wavepacket
-{
-    template< typename T_Params >
-    struct Unitless : public T_Params
-    {
-        using Params = T_Params;
-
-        static constexpr float_X WAVE_LENGTH = float_X( Params::WAVE_LENGTH_SI / UNIT_LENGTH ); // unit: meter
-        static constexpr float_X PULSE_LENGTH = float_X( Params::PULSE_LENGTH_SI / UNIT_TIME ); // unit: seconds (1 sigma)
-        static constexpr float_X LASER_NOFOCUS_CONSTANT = float_X( Params::LASER_NOFOCUS_CONSTANT_SI / UNIT_TIME ); //unit: seconds
-        static constexpr float_X AMPLITUDE = float_X( Params::AMPLITUDE_SI / UNIT_EFIELD ); // unit: Volt /meter
-        static constexpr float_X W0_X = float_X( Params::W0_X_SI / UNIT_LENGTH ); // unit: meter
-        static constexpr float_X W0_Z = float_X( Params::W0_Z_SI / UNIT_LENGTH ); // unit: meter
-        static constexpr float_X INIT_TIME = float_X( Params::PULSE_INIT * PULSE_LENGTH + LASER_NOFOCUS_CONSTANT ); // unit: seconds (full initialization length)
-        static constexpr float_X endUpramp = -0.5_X * LASER_NOFOCUS_CONSTANT; // unit: seconds
-        static constexpr float_X startDownramp = 0.5_X * LASER_NOFOCUS_CONSTANT; // unit: seconds
-
-        /* initialize the laser not in the first cell is equal to a negative shift
-         * in time
-         */
-        static constexpr float_X laserTimeShift = Params::initPlaneY * CELL_HEIGHT / SPEED_OF_LIGHT;
-
-        static constexpr float_64 f = SPEED_OF_LIGHT / WAVE_LENGTH;
-        static constexpr float_64 w = 2.0 * PI * f;
-    };
-} // namespace wavepacket
-
-namespace acc
-{
-    template< typename T_Unitless >
-    struct Wavepacket : public T_Unitless
+    namespace fields
     {
-        using Unitless = T_Unitless;
-
-        float3_X m_elong;
-        float_X m_phase;
-        typename FieldE::DataBoxType m_dataBoxE;
-        DataSpace< simDim > m_offsetToTotalDomain;
-        DataSpace< simDim > m_superCellToLocalOriginCellOffset;
-
-        /** Device-Side Constructor
-         *
-         * @param superCellToLocalOriginCellOffset local offset in cells to current supercell
-         * @param offsetToTotalDomain offset to origin of global (@todo: total) coordinate system (possibly after transform to centered origin)
-         */
-        HDINLINE Wavepacket(
-            typename FieldE::DataBoxType const & dataBoxE,
-            DataSpace< simDim > const & superCellToLocalOriginCellOffset,
-            DataSpace< simDim > const & offsetToTotalDomain,
-            float3_X const & elong,
-            float_X const phase
-        ) :
-            m_elong( elong ),
-            m_phase( phase ),
-            m_dataBoxE( dataBoxE ),
-            m_offsetToTotalDomain( offsetToTotalDomain ),
-            m_superCellToLocalOriginCellOffset( superCellToLocalOriginCellOffset )
+        namespace laserProfiles
         {
-        }
-
-        /** device side manipulation for init plane (transversal)
-         *
-         * @tparam T_Args type of the arguments passed to the user manipulator functor
-         *
-         * @param cellIndexInSuperCell ND cell index in current supercell
-         */
-        template< typename T_Acc >
-        HDINLINE
-        void operator( )(
-            T_Acc const &,
-            DataSpace< simDim > const & cellIndexInSuperCell
-        )
-        {
-            // coordinate system to global simulation as origin
-            DataSpace< simDim > const localCell(
-                cellIndexInSuperCell +
-                m_superCellToLocalOriginCellOffset
-            );
-
-            // transform coordinate system to center of x-z plane of initialization
-            constexpr uint8_t planeNormalDir = 1u;
-            DataSpace< simDim > offsetToCenterOfPlane( m_offsetToTotalDomain );
-            offsetToCenterOfPlane[ planeNormalDir ] = 0; // do not shift origin of plane normal
-            floatD_X const pos = precisionCast< float_X >( localCell + offsetToCenterOfPlane ) * cellSize.shrink< simDim >();
-            // @todo add half-cells via traits::FieldPosition< Solver::NumicalCellType, FieldE >()
-
-            // transversal position only
-            float3_X const w0_3D( Unitless::W0_X, 0._X, Unitless::W0_Z );
-            auto const w0( w0_3D.shrink< simDim >().remove< planeNormalDir >() );
-            auto const pos_trans( pos.remove< planeNormalDir >() );
-            auto const exp_compos( pos_trans * pos_trans / ( w0 * w0 ) );
-            float_X const exp_arg( exp_compos.sumOfComponents() );
-
-            m_elong *= math::exp( -1.0_X * exp_arg );
-
-            if( Unitless::initPlaneY != 0 ) // compile time if
-            {
-                /* If the laser is not initialized in the first cell we emit a
-                 * negatively and positively propagating wave. Therefore we need to multiply the
-                 * amplitude with a correction factor depending of the cell size in
-                 * propagation direction.
-                 * The negatively propagating wave is damped by the absorber.
-                 *
-                 * The `correctionFactor` assume that the wave is moving in y direction.
-                 */
-                auto const correctionFactor = ( SPEED_OF_LIGHT * DELTA_T ) / CELL_HEIGHT * 2._X;
-
-                // jump over the guard of the electric field
-                m_dataBoxE( localCell + SuperCellSize::toRT() * GuardSize::toRT() ) +=  correctionFactor * m_elong;
-            }
-            else
-            {
-                // jump over the guard of the electric field
-                m_dataBoxE( localCell + SuperCellSize::toRT() * GuardSize::toRT() ) = m_elong;
-            }
-        }
-    };
-} // namespace acc
-
-    template< typename T_Params >
-    struct Wavepacket : public wavepacket::Unitless< T_Params >
-    {
-        using Unitless = wavepacket::Unitless< T_Params >;
-
-        float3_X elong;
-        float_X phase;
-        typename FieldE::DataBoxType dataBoxE;
-        DataSpace< simDim > offsetToTotalDomain;
-
-        /** constructor
-         *
-         * @param currentStep current simulation time step
-         */
-        HINLINE Wavepacket( uint32_t currentStep )
-        {
-            // get data
-            DataConnector & dc = Environment< >::get( ).DataConnector( );
-            dataBoxE = dc.get< FieldE >(
-                FieldE::getName(),
-                true
-            )->getDeviceDataBox();
-
-            // get meta data for offsets
-            SubGrid< simDim > const & subGrid = Environment< simDim >::get().SubGrid();
-            // const DataSpace< simDim > totalCellOffset( subGrid.getGlobalDomain().offset );
-            DataSpace< simDim > const globalCellOffset( subGrid.getLocalDomain().offset );
-            DataSpace< simDim > const halfSimSize( subGrid.getGlobalDomain().size / 2 );
-
-            // transform coordinate system to center of global simulation as origin [cells]
-            offsetToTotalDomain = /* totalCellOffset + */ globalCellOffset - halfSimSize;
-
-            // @todo reset origin of direction of moving window
-            // offsetToTotalDomain.y() = 0
-
-            // a symmetric pulse will be initialized at position z=0 for
-            // a time of RAMP_INIT * PULSE_LENGTH + LASER_NOFOCUS_CONSTANT = INIT_TIME.
-            // we shift the complete pulse for the half of this time to start with
-            // the front of the laser pulse.
-            const float_64 mue = 0.5 * Unitless::INIT_TIME;
-
-            float_64 const runTime = DELTA_T * currentStep - Unitless::laserTimeShift - mue;
-
-            elong = float3_X::create( 0.0_X );
-            float_X envelope = float_X( Unitless::AMPLITUDE );
-
-            const float_64 tau = Unitless::PULSE_LENGTH * math::sqrt( 2.0_X );
-
-            float_64 correctionFactor = 0.0;
-
-            if( runTime > Unitless::startDownramp )
-            {
-                // downramp = end
-                const float_64 exponent =
-                    ( ( runTime - Unitless::startDownramp )
-                     / Unitless::PULSE_LENGTH / math::sqrt( 2.0 ) );
-                envelope *= math::exp( -0.5 * exponent * exponent );
-                correctionFactor = ( runTime - Unitless::startDownramp ) / ( tau * tau * Unitless::w );
-            }
-            else if( runTime < Unitless::endUpramp )
-            {
-                // upramp = start
-                const float_X exponent = ( ( runTime - Unitless::endUpramp ) / Unitless::PULSE_LENGTH / math::sqrt( 2.0_X ) );
-                envelope *= math::exp( -0.5_X * exponent * exponent );
-                correctionFactor = ( runTime - Unitless::endUpramp ) / ( tau * tau * Unitless::w );
-            }
-
-            phase += float_X( Unitless::w * runTime ) + Unitless::LASER_PHASE;
-
-            if( Unitless::Polarisation == Unitless::LINEAR_X )
+            namespace wavepacket
             {
-                elong.x() = envelope * ( math::sin( phase ) + correctionFactor * math::cos( phase ) );
-            }
-            else if( Unitless::Polarisation == Unitless::LINEAR_Z )
+                template<typename T_Params>
+                struct Unitless : public T_Params
+                {
+                    using Params = T_Params;
+
+                    static constexpr float_X WAVE_LENGTH
+                        = float_X(Params::WAVE_LENGTH_SI / UNIT_LENGTH); // meter / UNIT_LENGTH
+                    static constexpr float_X PULSE_LENGTH
+                        = float_X(Params::PULSE_LENGTH_SI / UNIT_TIME); // seconds / UNIT_TIME (1 sigma)
+                    static constexpr float_X LASER_NOFOCUS_CONSTANT
+                        = float_X(Params::LASER_NOFOCUS_CONSTANT_SI / UNIT_TIME); // seconds / UNIT_TIME
+                    static constexpr float_X AMPLITUDE
+                        = float_X(Params::AMPLITUDE_SI / UNIT_EFIELD); // (Volt / meter) / UNIT_EFIELD
+                    static constexpr float_X W0_X = float_X(Params::W0_X_SI / UNIT_LENGTH); // meter / UNIT_LENGTH
+                    static constexpr float_X W0_Z = float_X(Params::W0_Z_SI / UNIT_LENGTH); // meter / UNIT_LENGTH
+                    static constexpr float_X INIT_TIME = float_X(
+                        Params::PULSE_INIT * PULSE_LENGTH
+                        + LASER_NOFOCUS_CONSTANT); // seconds / UNIT_TIME (full initialization length)
+                    static constexpr float_X endUpramp = -0.5_X * LASER_NOFOCUS_CONSTANT; // seconds / UNIT_TIME
+                    static constexpr float_X startDownramp = 0.5_X * LASER_NOFOCUS_CONSTANT; // seconds / UNIT_TIME
+
+                    /* Initializing the laser in a cell other than the first one is equivalent to a
+                     * negative shift in time
+                     */
+                    static constexpr float_X laserTimeShift = Params::initPlaneY * CELL_HEIGHT / SPEED_OF_LIGHT;
+
+                    static constexpr float_64 f = SPEED_OF_LIGHT / WAVE_LENGTH; // frequency: c / WAVE_LENGTH
+                    static constexpr float_64 w = 2.0 * PI * f; // angular frequency: 2 * pi * f
+                };
+            } // namespace wavepacket
+
+            namespace acc
             {
-                elong.z() = envelope * ( math::sin( phase ) + correctionFactor * math::cos( phase ) );
-            }
-            else if( Unitless::Polarisation == Unitless::CIRCULAR )
+                template<typename T_Unitless>
+                struct Wavepacket : public T_Unitless
+                {
+                    using Unitless = T_Unitless;
+
+                    float3_X m_elong;
+                    float_X m_phase;
+                    typename FieldE::DataBoxType m_dataBoxE;
+                    DataSpace<simDim> m_offsetToTotalDomain;
+                    DataSpace<simDim> m_superCellToLocalOriginCellOffset;
+
+                    /** Device-Side Constructor, stores each argument in the member of the same name
+                     *
+                     * @param superCellToLocalOriginCellOffset local offset in cells to current supercell
+                     * @param offsetToTotalDomain offset to origin of global (@todo: total) coordinate system (possibly
+                     * after transform to centered origin)
+                     */
+                    HDINLINE Wavepacket(
+                        typename FieldE::DataBoxType const& dataBoxE,
+                        DataSpace<simDim> const& superCellToLocalOriginCellOffset,
+                        DataSpace<simDim> const& offsetToTotalDomain,
+                        float3_X const& elong,
+                        float_X const phase)
+                        : m_elong(elong)
+                        , m_phase(phase)
+                        , m_dataBoxE(dataBoxE)
+                        , m_offsetToTotalDomain(offsetToTotalDomain)
+                        , m_superCellToLocalOriginCellOffset(superCellToLocalOriginCellOffset)
+                    {
+                    }
+
+                    /** device side manipulation for init plane (transversal)
+                     *
+                     * @tparam T_Acc alpaka accelerator type (the argument itself is unused)
+                     *
+                     * @param cellIndexInSuperCell ND cell index in current supercell
+                     */
+                    template<typename T_Acc>
+                    HDINLINE void operator()(T_Acc const&, DataSpace<simDim> const& cellIndexInSuperCell)
+                    {
+                        // cell index relative to the local origin: supercell offset plus index inside the supercell
+                        DataSpace<simDim> const localCell(cellIndexInSuperCell + m_superCellToLocalOriginCellOffset);
+
+                        // transform coordinate system to center of x-z plane of initialization
+                        constexpr uint8_t planeNormalDir = 1u;
+                        DataSpace<simDim> offsetToCenterOfPlane(m_offsetToTotalDomain);
+                        offsetToCenterOfPlane[planeNormalDir] = 0; // do not shift origin of plane normal
+                        floatD_X const pos
+                            = precisionCast<float_X>(localCell + offsetToCenterOfPlane) * cellSize.shrink<simDim>();
+                        // @todo add half-cells via traits::FieldPosition< Solver::NumicalCellType, FieldE >()
+
+                        // transversal position only
+                        float3_X const w0_3D(Unitless::W0_X, 0._X, Unitless::W0_Z);
+                        auto const w0(w0_3D.shrink<simDim>().remove<planeNormalDir>());
+                        auto const pos_trans(pos.remove<planeNormalDir>());
+                        auto const exp_compos(pos_trans * pos_trans / (w0 * w0));
+                        float_X const exp_arg(exp_compos.sumOfComponents());
+                        // scale a copy: modifying m_elong would compound the envelope over repeated calls
+                        float3_X const elong = math::exp(-1.0_X * exp_arg) * m_elong;
+
+                        if(Unitless::initPlaneY != 0) // compile time if
+                        {
+                            /* If the laser is not initialized in the first cell we emit a
+                             * negatively and positively propagating wave. Therefore we need to multiply the
+                             * amplitude with a correction factor depending on the cell size in
+                             * propagation direction.
+                             * The negatively propagating wave is damped by the absorber.
+                             *
+                             * The `correctionFactor` assumes that the wave is moving in y direction.
+                             */
+                            auto const correctionFactor = (SPEED_OF_LIGHT * DELTA_T) / CELL_HEIGHT * 2._X;
+
+                            // jump over the guard of the electric field
+                            m_dataBoxE(localCell + SuperCellSize::toRT() * GuardSize::toRT())
+                                += correctionFactor * elong;
+                        }
+                        else
+                        {
+                            // jump over the guard of the electric field
+                            m_dataBoxE(localCell + SuperCellSize::toRT() * GuardSize::toRT()) = elong;
+                        }
+                    }
+                };
+            } // namespace acc
+
+            template<typename T_Params>
+            struct Wavepacket : public wavepacket::Unitless<T_Params>
             {
-                elong.x() = envelope / math::sqrt( 2.0_X ) * ( math::sin( phase ) + correctionFactor * math::cos( phase ) );
-                elong.z() = envelope / math::sqrt( 2.0_X ) * ( math::cos( phase ) + correctionFactor * math::sin( phase ) );
-            }
-        }
-
-        /** create device manipulator functor
-         *
-         * @tparam T_WorkerCfg pmacc::mappings::threads::WorkerCfg, configuration of the worker
-         * @tparam T_Acc alpaka accelerator type
-         *
-         * @param alpaka accelerator
-         * @param localSupercellOffset (in supercells, without guards) to the
-         *        origin of the local domain
-         * @param configuration of the worker
-         */
-        template<
-            typename T_WorkerCfg,
-            typename T_Acc
-        >
-        HDINLINE acc::Wavepacket< Unitless >
-        operator()(
-            T_Acc const &,
-            DataSpace< simDim > const & localSupercellOffset,
-            T_WorkerCfg const &
-        ) const
-        {
-            auto const superCellToLocalOriginCellOffset = localSupercellOffset * SuperCellSize::toRT();
-            return acc::Wavepacket< Unitless >( dataBoxE, superCellToLocalOriginCellOffset, offsetToTotalDomain, elong, phase );
-        }
-
-        //! get the name of the laser profile
-        static
-        HINLINE std::string
-        getName( )
-        {
-            return "Wavepacket";
-        }
+                using Unitless = wavepacket::Unitless<T_Params>;
 
-    };
+                float3_X elong;
+                float_X phase;
+                typename FieldE::DataBoxType dataBoxE;
+                DataSpace<simDim> offsetToTotalDomain;
 
-} // namespace laserProfiles
-} // namespace fields
+                /** constructor: fetches the FieldE data box and evaluates elong and phase for one time step
+                 *
+                 * @param currentStep current simulation time step
+                 */
+                HINLINE Wavepacket(uint32_t currentStep)
+                {
+                    // get data
+                    DataConnector& dc = Environment<>::get().DataConnector();
+                    dataBoxE = dc.get<FieldE>(FieldE::getName(), true)->getDeviceDataBox();
+
+                    // get meta data for offsets
+                    SubGrid<simDim> const& subGrid = Environment<simDim>::get().SubGrid();
+                    // const DataSpace< simDim > totalCellOffset( subGrid.getGlobalDomain().offset );
+                    DataSpace<simDim> const globalCellOffset(subGrid.getLocalDomain().offset);
+                    DataSpace<simDim> const halfSimSize(subGrid.getGlobalDomain().size / 2);
+
+                    // transform coordinate system to center of global simulation as origin [cells]
+                    offsetToTotalDomain = /* totalCellOffset + */ globalCellOffset - halfSimSize;
+
+                    // @todo reset origin of direction of moving window
+                    // offsetToTotalDomain.y() = 0
+
+                    // a symmetric pulse will be initialized at position z=0 for
+                    // a time of PULSE_INIT * PULSE_LENGTH + LASER_NOFOCUS_CONSTANT = INIT_TIME.
+                    // we shift the complete pulse for the half of this time to start with
+                    // the front of the laser pulse.
+                    const float_64 mue = 0.5 * Unitless::INIT_TIME;
+
+                    float_64 const runTime = DELTA_T * currentStep - Unitless::laserTimeShift - mue;
+
+                    elong = float3_X::create(0.0_X);
+                    float_X envelope = float_X(Unitless::AMPLITUDE);
+
+                    const float_64 tau = Unitless::PULSE_LENGTH * math::sqrt(2.0_X);
+
+                    float_64 correctionFactor = 0.0;
+
+                    if(runTime > Unitless::startDownramp)
+                    {
+                        // downramp = end
+                        const float_64 exponent
+                            = ((runTime - Unitless::startDownramp) / Unitless::PULSE_LENGTH / math::sqrt(2.0));
+                        envelope *= math::exp(-0.5 * exponent * exponent);
+                        correctionFactor = (runTime - Unitless::startDownramp) / (tau * tau * Unitless::w);
+                    }
+                    else if(runTime < Unitless::endUpramp)
+                    {
+                        // upramp = start
+                        const float_X exponent
+                            = ((runTime - Unitless::endUpramp) / Unitless::PULSE_LENGTH / math::sqrt(2.0_X));
+                        envelope *= math::exp(-0.5_X * exponent * exponent);
+                        correctionFactor = (runTime - Unitless::endUpramp) / (tau * tau * Unitless::w);
+                    }
+                    // plain assignment: phase has no initializer, "+=" would read an indeterminate value
+                    phase = float_X(Unitless::w * runTime) + Unitless::LASER_PHASE;
+
+                    if(Unitless::Polarisation == Unitless::LINEAR_X)
+                    {
+                        elong.x() = envelope * (math::sin(phase) + correctionFactor * math::cos(phase));
+                    }
+                    else if(Unitless::Polarisation == Unitless::LINEAR_Z)
+                    {
+                        elong.z() = envelope * (math::sin(phase) + correctionFactor * math::cos(phase));
+                    }
+                    else if(Unitless::Polarisation == Unitless::CIRCULAR)
+                    {
+                        elong.x()
+                            = envelope / math::sqrt(2.0_X) * (math::sin(phase) + correctionFactor * math::cos(phase));
+                        elong.z()
+                            = envelope / math::sqrt(2.0_X) * (math::cos(phase) + correctionFactor * math::sin(phase));
+                    }
+                }
+
+                /** create device manipulator functor
+                 *
+                 * @tparam T_WorkerCfg pmacc::mappings::threads::WorkerCfg, configuration of the worker
+                 * @tparam T_Acc alpaka accelerator type
+                 *
+                 * @param (unnamed, unused) alpaka accelerator
+                 * @param localSupercellOffset offset (in supercells, without guards) to the
+                 *        origin of the local domain
+                 * @param (unnamed, unused) configuration of the worker
+                 */
+                template<typename T_WorkerCfg, typename T_Acc>
+                HDINLINE acc::Wavepacket<Unitless> operator()(
+                    T_Acc const&,
+                    DataSpace<simDim> const& localSupercellOffset,
+                    T_WorkerCfg const&) const
+                {
+                    auto const superCellToLocalOriginCellOffset = localSupercellOffset * SuperCellSize::toRT();
+                    return acc::Wavepacket<Unitless>(
+                        dataBoxE,
+                        superCellToLocalOriginCellOffset,
+                        offsetToTotalDomain,
+                        elong,
+                        phase);
+                }
+
+                //! get the name of the laser profile, always the string "Wavepacket"
+                static HINLINE std::string getName()
+                {
+                    return "Wavepacket";
+                }
+            };
+
+        } // namespace laserProfiles
+    } // namespace fields
 } // namespace picongpu
-
diff --git a/include/picongpu/fields/laserProfiles/profiles.def b/include/picongpu/fields/laserProfiles/profiles.def
index f189d5aea1..5bff13daab 100644
--- a/include/picongpu/fields/laserProfiles/profiles.def
+++ b/include/picongpu/fields/laserProfiles/profiles.def
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Heiko Burau, Anton Helm, Rene Widera,
+/* Copyright 2013-2021 Axel Huebl, Heiko Burau, Anton Helm, Rene Widera,
  *                     Richard Pausch, Alexander Debus, Ilja Goethel
  *
  * This file is part of PIConGPU.
diff --git a/include/picongpu/fields/laserProfiles/profiles.hpp b/include/picongpu/fields/laserProfiles/profiles.hpp
index 1c07befdc4..17a532811a 100644
--- a/include/picongpu/fields/laserProfiles/profiles.hpp
+++ b/include/picongpu/fields/laserProfiles/profiles.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Heiko Burau, Anton Helm, Rene Widera,
+/* Copyright 2013-2021 Axel Huebl, Heiko Burau, Anton Helm, Rene Widera,
  *                     Richard Pausch, Alexander Debus, Ilja Goethel
  *
  * This file is part of PIConGPU.
diff --git a/include/picongpu/initialization/IInitPlugin.hpp b/include/picongpu/initialization/IInitPlugin.hpp
index 371cd408f0..885834a0f6 100644
--- a/include/picongpu/initialization/IInitPlugin.hpp
+++ b/include/picongpu/initialization/IInitPlugin.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Rene Widera, Felix Schmitt
+/* Copyright 2013-2021 Rene Widera, Felix Schmitt
  *
  * This file is part of PIConGPU.
  *
@@ -18,7 +18,6 @@
  */
 
 
-
 #pragma once
 
 #include <pmacc/types.hpp>
@@ -30,7 +29,7 @@ namespace picongpu
 {
     using namespace pmacc;
 
-    class IInitPlugin :  public ILightweightPlugin
+    class IInitPlugin : public ILightweightPlugin
     {
     public:
         virtual void slide(uint32_t currentStep) = 0;
@@ -40,7 +39,5 @@ namespace picongpu
         virtual ~IInitPlugin()
         {
         }
-
     };
-}
-
+} // namespace picongpu
diff --git a/include/picongpu/initialization/InitPluginNone.hpp b/include/picongpu/initialization/InitPluginNone.hpp
index 636d33a4cc..bd31db0913 100644
--- a/include/picongpu/initialization/InitPluginNone.hpp
+++ b/include/picongpu/initialization/InitPluginNone.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Rene Widera
+/* Copyright 2013-2021 Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -18,13 +18,11 @@
  */
 
 
-
 #pragma once
 
 #include "picongpu/initialization/IInitPlugin.hpp"
 
 
-
 namespace picongpu
 {
     using namespace pmacc;
@@ -32,7 +30,6 @@ namespace picongpu
     class InitPluginNone : public IInitPlugin
     {
     public:
-
         virtual void slide(uint32_t currentStep)
         {
         }
@@ -62,12 +59,11 @@ namespace picongpu
             return "InitPluginNone";
         }
 
-        virtual void setMappingDescription(MappingDesc *cellDescription)
+        virtual void setMappingDescription(MappingDesc* cellDescription)
         {
         }
 
     protected:
-
         virtual void pluginLoad()
         {
         }
@@ -77,5 +73,4 @@ namespace picongpu
         }
     };
 
-}
-
+} // namespace picongpu
diff --git a/include/picongpu/initialization/InitialiserController.hpp b/include/picongpu/initialization/InitialiserController.hpp
index b2f2a6303f..872ea786b1 100644
--- a/include/picongpu/initialization/InitialiserController.hpp
+++ b/include/picongpu/initialization/InitialiserController.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Heiko Burau, Rene Widera, Felix Schmitt
+/* Copyright 2013-2021 Axel Huebl, Heiko Burau, Rene Widera, Felix Schmitt
  *
  * This file is part of PIConGPU.
  *
@@ -39,173 +39,167 @@
 
 namespace picongpu
 {
-using namespace pmacc;
+    using namespace pmacc;
 
 
-namespace po = boost::program_options;
+    namespace po = boost::program_options;
 
-class InitialiserController : public IInitPlugin
-{
-public:
-
-    InitialiserController() :
-    cellDescription(nullptr)
+    class InitialiserController : public IInitPlugin
     {
-    }
+    public:
+        InitialiserController() : cellDescription(nullptr)
+        {
+        }
 
-    virtual ~InitialiserController()
-    {
-    }
+        virtual ~InitialiserController()
+        {
+        }
 
-    /**
-     * Initialize simulation state at timestep 0
-     */
-    virtual void init()
-    {
-        // start simulation using default values
-        log<picLog::SIMULATION_STATE > ("Starting simulation from timestep 0");
+        /**
+         * Initialize simulation state at timestep 0
+         */
+        virtual void init()
+        {
+            // start simulation using default values
+            log<picLog::SIMULATION_STATE>("Starting simulation from timestep 0");
 
-        SimStartInitialiser simStartInitialiser;
-        Environment<>::get().DataConnector().initialise(simStartInitialiser, 0);
-        __getTransactionEvent().waitForFinished();
+            SimStartInitialiser simStartInitialiser;
+            Environment<>::get().DataConnector().initialise(simStartInitialiser, 0);
+            __getTransactionEvent().waitForFinished();
 
-        log<picLog::SIMULATION_STATE > ("Loading from default values finished");
-    }
+            log<picLog::SIMULATION_STATE>("Loading from default values finished");
+        }
 
-    /**
-     * Load persistent simulation state from \p restartStep
-     */
-    virtual void restart(uint32_t restartStep, const std::string restartDirectory)
-    {
-        // restart simulation by loading from persistent data
-        // the simulation will start after restartStep
-        log<picLog::SIMULATION_STATE > ("Restarting simulation from timestep %1% in directory '%2%'") %
-            restartStep % restartDirectory;
-
-        Environment<>::get().PluginConnector().restartPlugins(restartStep, restartDirectory);
-        __getTransactionEvent().waitForFinished();
-
-        CUDA_CHECK(cudaDeviceSynchronize());
-        CUDA_CHECK(cudaGetLastError());
-
-        GridController<simDim> &gc = Environment<simDim>::get().GridController();
-
-        // avoid deadlock between not finished pmacc tasks and MPI_Barrier
-        __getTransactionEvent().waitForFinished();
-        /* can be spared for better scalings, but guarantees the user
-         * that the restart was successful */
-        MPI_CHECK(MPI_Barrier(gc.getCommunicator().getMPIComm()));
-
-        log<picLog::SIMULATION_STATE > ("Loading from persistent data finished");
-    }
-
-    /** Log omega_p for each species
-     *
-     * Calculate omega_p for each given species and create a `picLog::PHYSICS`
-     * log message
-     */
-    template<typename T_Species = bmpl::_1>
-    struct LogOmegaP
-    {
-        void operator()()
+        /**
+         * Load persistent simulation state from \p restartStep
+         */
+        virtual void restart(uint32_t restartStep, const std::string restartDirectory)
         {
-            /* The omega_p calculation is based on species' densityRatio
-             * relative to the BASE_DENSITY. Thus, it is only accurate
-             * for species with macroparticles sampled by density,
-             * but not necessarily for derived ones.
-             */
-            using FrameType = typename T_Species::FrameType;
-            const float_32 charge = frame::getCharge<FrameType>();
-            const float_32 mass = frame::getMass<FrameType>();
-            const auto densityRatio = traits::GetDensityRatio< T_Species >::type::getValue( );
-            const auto density = BASE_DENSITY * densityRatio;
-            log<picLog::PHYSICS >("species %2%: omega_p * dt <= 0.1 ? %1%") %
-                                 (sqrt(density * charge / mass * charge / EPS0) * DELTA_T) %
-                                  FrameType::getName();
+            // restart simulation by loading from persistent data
+            // the simulation will start after restartStep
+            log<picLog::SIMULATION_STATE>("Restarting simulation from timestep %1% in directory '%2%'") % restartStep
+                % restartDirectory;
+
+            Environment<>::get().PluginConnector().restartPlugins(restartStep, restartDirectory);
+            __getTransactionEvent().waitForFinished();
+
+            CUDA_CHECK(cuplaDeviceSynchronize());
+            CUDA_CHECK(cuplaGetLastError());
+
+            GridController<simDim>& gc = Environment<simDim>::get().GridController();
+
+            // avoid deadlock between not finished pmacc tasks and MPI_Barrier
+            __getTransactionEvent().waitForFinished();
+            /* can be spared for better scalings, but guarantees the user
+             * that the restart was successful */
+            MPI_CHECK(MPI_Barrier(gc.getCommunicator().getMPIComm()));
+
+            log<picLog::SIMULATION_STATE>("Loading from persistent data finished");
         }
-    };
 
-    /**
-     * Print interesting initialization information
-     */
-    virtual void printInformation()
-    {
-        if (Environment<simDim>::get().GridController().getGlobalRank() == 0)
+        /** Log omega_p for each species
+         *
+         * Calculate omega_p for each given species and create a `picLog::PHYSICS`
+         * log message
+         */
+        template<typename T_Species = bmpl::_1>
+        struct LogOmegaP
         {
-            log<picLog::PHYSICS >("Courant c*dt <= %1% ? %2%") %
-                                 (1./math::sqrt(INV_CELL2_SUM)) %
-                                 (SPEED_OF_LIGHT * DELTA_T);
-
-            using SpeciesWithMass = typename pmacc::particles::traits::FilterByFlag<
-                VectorAllSpecies,
-                massRatio<>
-            >::type;
-            using SpeciesWithMassCharge = typename pmacc::particles::traits::FilterByFlag<
-                SpeciesWithMass,
-                chargeRatio<>
-            >::type;
-            meta::ForEach< SpeciesWithMassCharge, LogOmegaP<> > logOmegaP;
-            log<picLog::PHYSICS >("Resolving plasma oscillations?\n"
-                "   Estimates are based on DensityRatio to BASE_DENSITY of each species\n"
-                "   (see: density.param, speciesDefinition.param).\n"
-                "   It and does not cover other forms of initialization");
-            logOmegaP();
-
-            if (fields::laserProfiles::Selected::INIT_TIME > float_X(0.0))
-                log<picLog::PHYSICS >("y-cells per wavelength: %1%") %
-                                     (fields::laserProfiles::Selected::WAVE_LENGTH / CELL_HEIGHT);
-            const int localNrOfCells = cellDescription->getGridLayout().getDataSpaceWithoutGuarding().productOfComponents();
-            log<picLog::PHYSICS >("macro particles per device: %1%") %
-                                 (localNrOfCells * particles::TYPICAL_PARTICLES_PER_CELL * (bmpl::size<VectorAllSpecies>::type::value));
-            log<picLog::PHYSICS >("typical macro particle weighting: %1%") % (particles::TYPICAL_NUM_PARTICLES_PER_MACROPARTICLE);
-
-
-            log<picLog::PHYSICS >("UNIT_SPEED %1%") % UNIT_SPEED;
-            log<picLog::PHYSICS >("UNIT_TIME %1%") % UNIT_TIME;
-            log<picLog::PHYSICS >("UNIT_LENGTH %1%") % UNIT_LENGTH;
-            log<picLog::PHYSICS >("UNIT_MASS %1%") % UNIT_MASS;
-            log<picLog::PHYSICS >("UNIT_CHARGE %1%") % UNIT_CHARGE;
-            log<picLog::PHYSICS >("UNIT_EFIELD %1%") % UNIT_EFIELD;
-            log<picLog::PHYSICS >("UNIT_BFIELD %1%") % UNIT_BFIELD;
-            log<picLog::PHYSICS >("UNIT_ENERGY %1%") % UNIT_ENERGY;
+            void operator()()
+            {
+                /* The omega_p calculation is based on species' densityRatio
+                 * relative to the BASE_DENSITY. Thus, it is only accurate
+                 * for species with macroparticles sampled by density,
+                 * but not necessarily for derived ones.
+                 */
+                using FrameType = typename T_Species::FrameType;
+                const float_32 charge = frame::getCharge<FrameType>();
+                const float_32 mass = frame::getMass<FrameType>();
+                const auto densityRatio = traits::GetDensityRatio<T_Species>::type::getValue();
+                const auto density = BASE_DENSITY * densityRatio;
+                log<picLog::PHYSICS>("species %2%: omega_p * dt <= 0.1 ? %1%")
+                    % (sqrt(density * charge / mass * charge / EPS0) * DELTA_T) % FrameType::getName();
+            }
+        };
+
+        /**
+         * Print interesting initialization information
+         */
+        virtual void printInformation()
+        {
+            if(Environment<simDim>::get().GridController().getGlobalRank() == 0)
+            {
+                log<picLog::PHYSICS>("Courant c*dt <= %1% ? %2%") % (1. / math::sqrt(INV_CELL2_SUM))
+                    % (SPEED_OF_LIGHT * DELTA_T);
+
+                using SpeciesWithMass =
+                    typename pmacc::particles::traits::FilterByFlag<VectorAllSpecies, massRatio<>>::type;
+                using SpeciesWithMassCharge =
+                    typename pmacc::particles::traits::FilterByFlag<SpeciesWithMass, chargeRatio<>>::type;
+                meta::ForEach<SpeciesWithMassCharge, LogOmegaP<>> logOmegaP;
+                log<picLog::PHYSICS>("Resolving plasma oscillations?\n"
+                                     "   Estimates are based on DensityRatio to BASE_DENSITY of each species\n"
+                                     "   (see: density.param, speciesDefinition.param).\n"
+                                     "   It and does not cover other forms of initialization");
+                logOmegaP();
+
+                if(fields::laserProfiles::Selected::INIT_TIME > float_X(0.0))
+                    log<picLog::PHYSICS>("y-cells per wavelength: %1%")
+                        % (fields::laserProfiles::Selected::WAVE_LENGTH / CELL_HEIGHT);
+                const int localNrOfCells
+                    = cellDescription->getGridLayout().getDataSpaceWithoutGuarding().productOfComponents();
+                log<picLog::PHYSICS>("macro particles per device: %1%")
+                    % (localNrOfCells * particles::TYPICAL_PARTICLES_PER_CELL
+                       * (bmpl::size<VectorAllSpecies>::type::value));
+                log<picLog::PHYSICS>("typical macro particle weighting: %1%")
+                    % (particles::TYPICAL_NUM_PARTICLES_PER_MACROPARTICLE);
+
+
+                log<picLog::PHYSICS>("UNIT_SPEED %1%") % UNIT_SPEED;
+                log<picLog::PHYSICS>("UNIT_TIME %1%") % UNIT_TIME;
+                log<picLog::PHYSICS>("UNIT_LENGTH %1%") % UNIT_LENGTH;
+                log<picLog::PHYSICS>("UNIT_MASS %1%") % UNIT_MASS;
+                log<picLog::PHYSICS>("UNIT_CHARGE %1%") % UNIT_CHARGE;
+                log<picLog::PHYSICS>("UNIT_EFIELD %1%") % UNIT_EFIELD;
+                log<picLog::PHYSICS>("UNIT_BFIELD %1%") % UNIT_BFIELD;
+                log<picLog::PHYSICS>("UNIT_ENERGY %1%") % UNIT_ENERGY;
+            }
         }
-    }
-
-    void notify(uint32_t)
-    {
-        // nothing to do here
-    }
 
-    void pluginRegisterHelp(po::options_description& desc)
-    {
-        // nothing to do here
-    }
+        void notify(uint32_t)
+        {
+            // nothing to do here
+        }
 
-    std::string pluginGetName() const
-    {
-        return "Initializers";
-    }
+        void pluginRegisterHelp(po::options_description& desc)
+        {
+            // nothing to do here
+        }
 
-    virtual void setMappingDescription(MappingDesc *cellDescription)
-    {
-        PMACC_ASSERT(cellDescription != nullptr);
-        this->cellDescription = cellDescription;
-    }
+        std::string pluginGetName() const
+        {
+            return "Initializers";
+        }
 
-    virtual void slide(uint32_t currentStep)
-    {
-        SimStartInitialiser simStartInitialiser;
-        Environment<>::get().DataConnector().initialise(simStartInitialiser, currentStep);
-        __getTransactionEvent().waitForFinished();
-    }
+        virtual void setMappingDescription(MappingDesc* cellDescription)
+        {
+            PMACC_ASSERT(cellDescription != nullptr);
+            this->cellDescription = cellDescription;
+        }
 
-private:
-    /*Descripe simulation area*/
-    MappingDesc *cellDescription;
+        virtual void slide(uint32_t currentStep)
+        {
+            SimStartInitialiser simStartInitialiser;
+            Environment<>::get().DataConnector().initialise(simStartInitialiser, currentStep);
+            __getTransactionEvent().waitForFinished();
+        }
 
-    bool restartSim;
-    std::string restartFile;
+    private:
+        /*Describe simulation area*/
+        MappingDesc* cellDescription;
 
-};
+        bool restartSim;
+        std::string restartFile;
+    };
 
-} //namespace picongpu
+} // namespace picongpu
diff --git a/include/picongpu/initialization/ParserGridDistribution.cpp b/include/picongpu/initialization/ParserGridDistribution.cpp
index 20250a187f..01ea99e1c0 100644
--- a/include/picongpu/initialization/ParserGridDistribution.cpp
+++ b/include/picongpu/initialization/ParserGridDistribution.cpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Rene Widera, Benjamin Worpitz
+/* Copyright 2013-2021 Axel Huebl, Rene Widera, Benjamin Worpitz
  *
  * This file is part of PIConGPU.
  *
@@ -21,8 +21,8 @@
 
 #include <pmacc/verify.hpp>
 #include <cstdint>
-#include <vector>   // std::vector
-#include <string>   // std::string
+#include <vector> // std::vector
+#include <string> // std::string
 #include <iterator> // std::distance
 
 #include <regex>
@@ -30,21 +30,19 @@
 
 namespace picongpu
 {
-
-    ParserGridDistribution::ParserGridDistribution( std::string const s )
+    ParserGridDistribution::ParserGridDistribution(std::string const s)
     {
-        parsedInput = parse( s );
+        parsedInput = parse(s);
     }
 
-    uint32_t
-    ParserGridDistribution::getOffset( uint32_t const devicePos, uint32_t const maxCells ) const
+    uint32_t ParserGridDistribution::getOffset(uint32_t const devicePos, uint32_t const maxCells) const
     {
         value_type::const_iterator iter = parsedInput.begin();
         // go to last device of these n subdomains extent{n}
         uint32_t i = iter->count - 1u;
         uint32_t sum = 0u;
 
-        while( i < devicePos )
+        while(i < devicePos)
         {
             // add last subdomain
             sum += iter->extent * iter->count;
@@ -55,26 +53,25 @@ namespace picongpu
         }
 
         // add part of this subdomain that is before me
-        sum += iter->extent * ( devicePos + iter->count - i - 1u );
+        sum += iter->extent * (devicePos + iter->count - i - 1u);
 
         // check total number of cells
         uint32_t sumTotal = 0u;
-        for( iter = parsedInput.begin(); iter != parsedInput.end(); ++iter )
+        for(iter = parsedInput.begin(); iter != parsedInput.end(); ++iter)
             sumTotal += iter->extent * iter->count;
 
-        PMACC_VERIFY( sumTotal == maxCells );
+        PMACC_VERIFY(sumTotal == maxCells);
 
         return sum;
     }
 
-    uint32_t
-    ParserGridDistribution::getLocalSize( uint32_t const devicePos ) const
+    uint32_t ParserGridDistribution::getLocalSize(uint32_t const devicePos) const
     {
         value_type::const_iterator iter = parsedInput.begin();
         // go to last device of these n subdomains extent{n}
         uint32_t i = iter->count - 1u;
 
-        while( i < devicePos )
+        while(i < devicePos)
         {
             ++iter;
             // go to last device of these n subdomains extent{n}
@@ -84,57 +81,43 @@ namespace picongpu
         return iter->extent;
     }
 
-    void
-    ParserGridDistribution::verifyDevices( uint32_t const numDevices ) const
+    void ParserGridDistribution::verifyDevices(uint32_t const numDevices) const
     {
         uint32_t numSubdomains = 0u;
-        for( SubdomainPair const & p : parsedInput )
+        for(SubdomainPair const& p : parsedInput)
             numSubdomains += p.count;
 
-        PMACC_VERIFY( numSubdomains == numDevices );
+        PMACC_VERIFY(numSubdomains == numDevices);
     }
 
-    ParserGridDistribution::value_type
-    ParserGridDistribution::parse( std::string const s ) const
+    ParserGridDistribution::value_type ParserGridDistribution::parse(std::string const s) const
     {
-        std::regex regFind(
-            R"([0-9]+(\{[0-9]+})*)",
-            std::regex::egrep
-        );
+        std::regex regFind(R"([0-9]+(\{[0-9]+})*)", std::regex::egrep);
 
-        std::sregex_token_iterator iter( s.begin( ), s.end( ),
-                                           regFind, 0 );
+        std::sregex_token_iterator iter(s.begin(), s.end(), regFind, 0);
         std::sregex_token_iterator end;
 
         value_type newInput;
-        newInput.reserve( std::distance( iter, end ) );
+        newInput.reserve(std::distance(iter, end));
 
-        for(; iter != end; ++iter )
+        for(; iter != end; ++iter)
         {
             std::string pM = *iter;
 
             // find count n and extent b of b{n}
-            std::regex regCount(
-                R"((.*\{)|(}))",
-                std::regex::egrep
-            );
-            std::string count = std::regex_replace( pM, regCount, "" );
-
-            std::regex regExtent(
-                R"(\{.*})",
-                std::regex::egrep
-            );
-            std::string extent = std::regex_replace( pM, regExtent, "" );
+            std::regex regCount(R"((.*\{)|(}))", std::regex::egrep);
+            std::string count = std::regex_replace(pM, regCount, "");
+
+            std::regex regExtent(R"(\{.*})", std::regex::egrep);
+            std::string extent = std::regex_replace(pM, regExtent, "");
 
             // no count {n} given (implies one)
-            if( count == *iter )
+            if(count == *iter)
                 count = "1";
 
-            const SubdomainPair g = {
-                static_cast< uint32_t > ( std::stoul(extent) ),
-                static_cast< uint32_t > ( std::stoul(count) )
-            };
-            newInput.emplace_back( g );
+            const SubdomainPair g
+                = {static_cast<uint32_t>(std::stoul(extent)), static_cast<uint32_t>(std::stoul(count))};
+            newInput.emplace_back(g);
         }
 
         return newInput;
diff --git a/include/picongpu/initialization/ParserGridDistribution.hpp b/include/picongpu/initialization/ParserGridDistribution.hpp
index 7366ff5e7a..d4058ea5f9 100644
--- a/include/picongpu/initialization/ParserGridDistribution.hpp
+++ b/include/picongpu/initialization/ParserGridDistribution.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Rene Widera, Benjamin Worpitz
+/* Copyright 2013-2021 Axel Huebl, Rene Widera, Benjamin Worpitz
  *
  * This file is part of PIConGPU.
  *
@@ -19,73 +19,68 @@
 
 #pragma once
 
-#include <vector>   // std::vector
-#include <string>   // std::string
+#include <vector> // std::vector
+#include <string> // std::string
 #include <cstdint>
 
 
 namespace picongpu
 {
+    class ParserGridDistribution
+    {
+    private:
+        /** 1D subdomain extents
+         *
+         * Pair of extent and count entry in our grid distribution.
+         *
+         * For example, a single entry of the grid distribution a,b,c{n},d{m},e,f
+         * is stored as entry (a,1) in SubdomainPair. Another as (b,1), another
+         * n equally spaced subdomains as (c,n), another m subdomains of extent d
+         * as (d,m), and so on.
+         */
+        struct SubdomainPair
+        {
+            // extent of the current subdomain
+            uint32_t extent;
+            // count of how often the subdomain shall be repeated
+            uint32_t count;
+        };
+        using value_type = std::vector<SubdomainPair>;
 
-class ParserGridDistribution
-{
-private:
-    /** 1D sudomain extents
-     *
-     * Pair of extent and count entry in our grid distribution.
-     *
-     * For example, a single entry of the grid distribution a,b,c{n},d{m},e,f
-     * is stored as entry (a,1) in SubdomainPair. Another as (b,1), another
-     * n equally spaced subdomains as (c,n), another m subdomains of extent d
-     * as (d,m), and so on.
-     */
-    struct SubdomainPair {
-        // extent of the current subdomain
-        uint32_t extent;
-        // count of how often the subdomain shall be repeated
-        uint32_t count;
-    };
-    using value_type = std::vector< SubdomainPair >;
-
-public:
-    ParserGridDistribution( std::string const s );
+    public:
+        ParserGridDistribution(std::string const s);
 
-    uint32_t
-    getOffset( uint32_t const devicePos, uint32_t const maxCells ) const;
+        uint32_t getOffset(uint32_t const devicePos, uint32_t const maxCells) const;
 
-    /** Get local Size of this dimension
-     *
-     *  \param[in] devicePos as unsigned integer in the range [0, n-1] for this dimension
-     *  \return uint32_t with local number of cells
-     */
-    uint32_t
-    getLocalSize( uint32_t const devicePos ) const;
+        /** Get local Size of this dimension
+         *
+         *  \param[in] devicePos as unsigned integer in the range [0, n-1] for this dimension
+         *  \return uint32_t with local number of cells
+         */
+        uint32_t getLocalSize(uint32_t const devicePos) const;
 
-    /** Verify the number of subdomains matches the devices
-     *
-     * Check that the number of subdomains in a dimension, after we
-     * expanded all regexes, matches the number of devices for it.
-     *
-     * \param[in] numDevices number of devices for this dimension
-     */
-    void
-    verifyDevices( uint32_t const numDevices ) const;
+        /** Verify the number of subdomains matches the devices
+         *
+         * Check that the number of subdomains in a dimension, after we
+         * expanded all regexes, matches the number of devices for it.
+         *
+         * \param[in] numDevices number of devices for this dimension
+         */
+        void verifyDevices(uint32_t const numDevices) const;
 
-private:
-    value_type parsedInput;
+    private:
+        value_type parsedInput;
 
-    /** Parses the input string to a vector of SubdomainPair(s)
-     *
-     * Parses the input string in the form a,b,c{n},d{m},e,f
-     * to a vector of SubdomainPair with extent number (a,b,c,d,e,f) and
-     * counts (1,1,n,m,e,f)
-     *
-     * \param[in] s as string in the form a,b{n}
-     * \return std::vector<SubdomainPair> with 2x uint32_t (extent, count)
-     */
-    value_type
-    parse( std::string const s ) const;
-
-};
+        /** Parses the input string to a vector of SubdomainPair(s)
+         *
+         * Parses the input string in the form a,b,c{n},d{m},e,f
+         * to a vector of SubdomainPair with extent number (a,b,c,d,e,f) and
+         * counts (1,1,n,m,1,1)
+         *
+         * \param[in] s as string in the form a,b{n}
+         * \return std::vector<SubdomainPair> with 2x uint32_t (extent, count)
+         */
+        value_type parse(std::string const s) const;
+    };
 
 } // namespace picongpu
diff --git a/include/picongpu/initialization/SimStartInitialiser.hpp b/include/picongpu/initialization/SimStartInitialiser.hpp
index c21cc5da79..1a831feb28 100644
--- a/include/picongpu/initialization/SimStartInitialiser.hpp
+++ b/include/picongpu/initialization/SimStartInitialiser.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Felix Schmitt, Heiko Burau, Rene Widera
+/* Copyright 2013-2021 Axel Huebl, Felix Schmitt, Heiko Burau, Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -27,26 +27,21 @@
 
 namespace picongpu
 {
-
-/**
- * Simulation startup initialiser.
- *
- * Initialises a new simulation from default values.
- *
- */
-class SimStartInitialiser : public AbstractInitialiser
-{
-public:
-
-    void init(ISimulationData& data, uint32_t currentStep)
+    /**
+     * Simulation startup initialiser.
+     *
+     * Initialises a new simulation from default values.
+     *
+     */
+    class SimStartInitialiser : public AbstractInitialiser
     {
-
-    }
-
-    virtual ~SimStartInitialiser()
-    {
-
-    }
-};
-}
-
+    public:
+        void init(ISimulationData& data, uint32_t currentStep)
+        {
+        }
+
+        virtual ~SimStartInitialiser()
+        {
+        }
+    };
+} // namespace picongpu
diff --git a/include/picongpu/main.cpp b/include/picongpu/main.cpp
index 5b0bfb5a37..a4dbd92cab 100644
--- a/include/picongpu/main.cpp
+++ b/include/picongpu/main.cpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Felix Schmitt, Heiko Burau, Rene Widera,
+/* Copyright 2013-2021 Axel Huebl, Felix Schmitt, Heiko Burau, Rene Widera,
  *                     Sergei Bastrakov
  *
  * This file is part of PIConGPU.
@@ -18,6 +18,8 @@
  * If not, see <http://www.gnu.org/licenses/>.
  */
 
+#include <pmacc/boost_workaround.hpp>
+
 #include "picongpu/ArgsParser.hpp"
 #include <pmacc/Environment.hpp>
 #include <pmacc/types.hpp>
@@ -32,37 +34,36 @@
 
 namespace
 {
-
     /** Run a PIConGPU simulation
      *
      * @param argc count of arguments in argv (same as for main() )
      * @param argv arguments of program start (same as for main() )
      */
-    int runSimulation( int argc, char **argv )
+    int runSimulation(int argc, char** argv)
     {
         using namespace picongpu;
 
         simulation_starter::SimStarter sim;
-        auto const parserStatus = sim.parseConfigs( argc, argv );
+        auto const parserStatus = sim.parseConfigs(argc, argv);
         int errorCode = EXIT_FAILURE;
 
-        switch( parserStatus )
+        switch(parserStatus)
         {
-            case ArgsParser::Status::error:
-                errorCode = EXIT_FAILURE;
-                break;
-            case ArgsParser::Status::success:
-                sim.load( );
-                sim.start( );
-                sim.unload( );
-                PMACC_FALLTHROUGH;
-            case ArgsParser::Status::successExit:
-                errorCode = 0;
-                break;
+        case ArgsParser::Status::error:
+            errorCode = EXIT_FAILURE;
+            break;
+        case ArgsParser::Status::success:
+            sim.load();
+            sim.start();
+            sim.unload();
+            PMACC_FALLTHROUGH;
+        case ArgsParser::Status::successExit:
+            errorCode = 0;
+            break;
         };
 
         // finalize the pmacc context */
-        pmacc::Environment<>::get( ).finalize( );
+        pmacc::Environment<>::get().finalize();
 
         return errorCode;
     }
@@ -74,21 +75,20 @@ namespace
  * @param argc count of arguments in argv
  * @param argv arguments of program start
  */
-int main( int argc, char **argv )
+int main(int argc, char** argv)
 {
     try
     {
-        return runSimulation( argc, argv );
+        return runSimulation(argc, argv);
     }
     // A last-ditch effort to report exceptions to a user
-    catch ( const std::exception & ex )
+    catch(const std::exception& ex)
     {
-        auto const typeName = std::string( typeid( ex ).name( ) );
-        std::cerr << "Unhandled exception of type '" + typeName +
-            "' with message '" + ex.what() + "', terminating\n";
+        auto const typeName = std::string(typeid(ex).name());
+        std::cerr << "Unhandled exception of type '" + typeName + "' with message '" + ex.what() + "', terminating\n";
         return EXIT_FAILURE;
     }
-    catch ( ... )
+    catch(...)
     {
         std::cerr << "Unhandled exception of unknown type, terminating\n";
         return EXIT_FAILURE;
diff --git a/include/picongpu/param/bremsstrahlung.param b/include/picongpu/param/bremsstrahlung.param
index 6cadb06d21..14f7c0c279 100644
--- a/include/picongpu/param/bremsstrahlung.param
+++ b/include/picongpu/param/bremsstrahlung.param
@@ -1,4 +1,4 @@
-/* Copyright 2016-2020 Heiko Burau
+/* Copyright 2016-2021 Heiko Burau
  *
  * This file is part of PIConGPU.
  *
@@ -21,98 +21,98 @@
 
 namespace picongpu
 {
-namespace particles
-{
-namespace bremsstrahlung
-{
-
-/** params related to the energy loss and deflection of the incident electron
- */
-namespace electron
-{
-    /** Minimal kinetic electron energy in MeV for the lookup table.
-     * For electrons below this value Bremsstrahlung is not taken into account.
-     */
-    constexpr float_64 MIN_ENERGY_MeV = 0.5;
-
-    /** Maximal kinetic electron energy in MeV for the lookup table.
-     * Electrons above this value cause a out-of-bounds access at the
-     * lookup table. Bounds checking is enabled for "CRITICAL" log level.
-     */
-    constexpr float_64 MAX_ENERGY_MeV = 200.0;
-
-    /** Minimal polar deflection angle due to screening. See Jackson 13.5 for a rule of thumb to this value. */
-    constexpr float_64 MIN_THETA = 0.01;
-
-    /** number of lookup table divisions for the kappa axis.
-     * Kappa is the energy loss normalized to the initial kinetic energy.
-     * The axis is scaled linearly.
-     */
-    constexpr uint32_t NUM_SAMPLES_KAPPA = 32;
-
-    /** number of lookup table divisions for the initial kinetic energy axis.
-     * The axis is scaled logarithmically.
-     */
-    constexpr uint32_t NUM_SAMPLES_EKIN = 32;
-
-    /** Kappa is the energy loss normalized to the initial kinetic energy.
-     * This minimal value is needed by the numerics to avoid a division by zero.
-     */
-    constexpr float_64 MIN_KAPPA = 1.0e-10;
-
-} // namespace electron
-
-/** params related to the creation and the emission angle of the photon
- */
-namespace photon
-{
-    /** Low-energy threshold in keV of the incident electron for the creation of photons.
-     * Below this value photon emission is neglected.
-     */
-    constexpr float_64 SOFT_PHOTONS_CUTOFF_keV = 5000.0;
-
-    /** number of lookup table divisions for the delta axis.
-     * Delta is the angular emission probability (normalized to one) integrated from zero to theta,
-     * where theta is the angle between the photon momentum and the final electron momentum.
-     *
-     * The axis is scaled linearly.
-     */
-    constexpr uint32_t NUM_SAMPLES_DELTA = 256;
-
-    /** number of lookup table divisions for the gamma axis.
-     * Gamma is the relativistic factor of the incident electron.
-     *
-     * The axis is scaled logarithmically.
-     */
-    constexpr uint32_t NUM_SAMPLES_GAMMA = 64;
-
-    /** Maximal value of delta for the lookup table.
-     * Delta is the angular emission probability (normalized to one) integrated from zero to theta,
-     * where theta is the angle between the photon momentum and the final electron momentum.
-     *
-     * A value close to one is reasonable. Though exactly one was actually correct,
-     * because it would map to theta = pi (maximum polar angle), the sampling then would be bad
-     * in the ultrarelativistic case. In this regime the emission primarily takes place at small thetas.
-     * So a maximum delta close to one maps to a reasonable maximum theta.
-     */
-    constexpr float_64 MAX_DELTA = 0.95;
-
-    /** minimal gamma for the lookup table. */
-    constexpr float_64 MIN_GAMMA = 1.0;
-
-    /** maximal gamma for the lookup table.
-     * Bounds checking is enabled for "CRITICAL" log level.
-     */
-    constexpr float_64 MAX_GAMMA = 250;
-
-    /** if the emission probability per timestep is higher than this value and the log level is set to
-     *  "CRITICAL" a warning will be raised.
-     */
-    constexpr float_64 SINGLE_EMISSION_PROB_LIMIT = 0.4;
-
-    constexpr float_64 WEIGHTING_RATIO = 10;
-} // namespace photon
-
-} // namespace bremsstrahlung
-} // namespace particles
+    namespace particles
+    {
+        namespace bremsstrahlung
+        {
+            /** params related to the energy loss and deflection of the incident electron
+             */
+            namespace electron
+            {
+                /** Minimal kinetic electron energy in MeV for the lookup table.
+                 * For electrons below this value Bremsstrahlung is not taken into account.
+                 */
+                constexpr float_64 MIN_ENERGY_MeV = 0.5;
+
+                /** Maximal kinetic electron energy in MeV for the lookup table.
+                 * Electrons above this value cause an out-of-bounds access at the
+                 * lookup table. Bounds checking is enabled for "CRITICAL" log level.
+                 */
+                constexpr float_64 MAX_ENERGY_MeV = 200.0;
+
+                /** Minimal polar deflection angle due to screening. See Jackson 13.5 for a rule of thumb for this
+                 * value. */
+                constexpr float_64 MIN_THETA = 0.01;
+
+                /** number of lookup table divisions for the kappa axis.
+                 * Kappa is the energy loss normalized to the initial kinetic energy.
+                 * The axis is scaled linearly.
+                 */
+                constexpr uint32_t NUM_SAMPLES_KAPPA = 32;
+
+                /** number of lookup table divisions for the initial kinetic energy axis.
+                 * The axis is scaled logarithmically.
+                 */
+                constexpr uint32_t NUM_SAMPLES_EKIN = 32;
+
+                /** Kappa is the energy loss normalized to the initial kinetic energy.
+                 * This minimal value is needed by the numerics to avoid a division by zero.
+                 */
+                constexpr float_64 MIN_KAPPA = 1.0e-10;
+
+            } // namespace electron
+
+            /** params related to the creation and the emission angle of the photon
+             */
+            namespace photon
+            {
+                /** Low-energy threshold in keV of the incident electron for the creation of photons.
+                 * Below this value photon emission is neglected.
+                 */
+                constexpr float_64 SOFT_PHOTONS_CUTOFF_keV = 5000.0;
+
+                /** number of lookup table divisions for the delta axis.
+                 * Delta is the angular emission probability (normalized to one) integrated from zero to theta,
+                 * where theta is the angle between the photon momentum and the final electron momentum.
+                 *
+                 * The axis is scaled linearly.
+                 */
+                constexpr uint32_t NUM_SAMPLES_DELTA = 256;
+
+                /** number of lookup table divisions for the gamma axis.
+                 * Gamma is the relativistic factor of the incident electron.
+                 *
+                 * The axis is scaled logarithmically.
+                 */
+                constexpr uint32_t NUM_SAMPLES_GAMMA = 64;
+
+                /** Maximal value of delta for the lookup table.
+                 * Delta is the angular emission probability (normalized to one) integrated from zero to theta,
+                 * where theta is the angle between the photon momentum and the final electron momentum.
+                 *
+                 * A value close to one is reasonable. Though exactly one would actually be correct,
+                 * because it would map to theta = pi (maximum polar angle), the sampling then would be bad
+                 * in the ultrarelativistic case. In this regime the emission primarily takes place at small thetas.
+                 * So a maximum delta close to one maps to a reasonable maximum theta.
+                 */
+                constexpr float_64 MAX_DELTA = 0.95;
+
+                /** minimal gamma for the lookup table. */
+                constexpr float_64 MIN_GAMMA = 1.0;
+
+                /** maximal gamma for the lookup table.
+                 * Bounds checking is enabled for "CRITICAL" log level.
+                 */
+                constexpr float_64 MAX_GAMMA = 250;
+
+                /** if the emission probability per timestep is higher than this value and the log level is set to
+                 *  "CRITICAL" a warning will be raised.
+                 */
+                constexpr float_64 SINGLE_EMISSION_PROB_LIMIT = 0.4;
+
+                constexpr float_64 WEIGHTING_RATIO = 10;
+            } // namespace photon
+
+        } // namespace bremsstrahlung
+    } // namespace particles
 } // namespace picongpu
diff --git a/include/picongpu/param/components.param b/include/picongpu/param/components.param
index bd261d1820..a960304deb 100644
--- a/include/picongpu/param/components.param
+++ b/include/picongpu/param/components.param
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Heiko Burau, Anton Helm,
+/* Copyright 2013-2021 Axel Huebl, Heiko Burau, Anton Helm,
  *                     Rene Widera, Richard Pausch
  *
  * This file is part of PIConGPU.
@@ -30,12 +30,12 @@
 
 namespace picongpu
 {
-/** @namespace simulation_starter
- *
- * Simulation Starter Selection:
- * This value does usually not need to be changed. Change only if you want to
- * implement your own `SimulationHelper` (e.g. `MySimulation`) class.
- *  - defaultPIConGPU         : default PIConGPU configuration
- */
-namespace simulation_starter = defaultPIConGPU;
+    /** @namespace simulation_starter
+     *
+     * Simulation Starter Selection:
+     * This value does usually not need to be changed. Change only if you want to
+     * implement your own `SimulationHelper` (e.g. `Simulation`) class.
+     *  - defaultPIConGPU         : default PIConGPU configuration
+     */
+    namespace simulation_starter = defaultPIConGPU;
 } // namespace picongpu
diff --git a/include/picongpu/param/density.param b/include/picongpu/param/density.param
index 087085a75e..6712a5b70a 100644
--- a/include/picongpu/param/density.param
+++ b/include/picongpu/param/density.param
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Heiko Burau, Rene Widera, Felix Schmitt,
+/* Copyright 2013-2021 Axel Huebl, Heiko Burau, Rene Widera, Felix Schmitt,
  *                     Richard Pausch
  *
  * This file is part of PIConGPU.
@@ -34,250 +34,244 @@
 
 namespace picongpu
 {
-namespace SI
-{
-    /** Base density in particles per m^3 in the density profiles.
-     *
-     * This is often taken as reference maximum density in normalized profiles.
-     * Individual particle species can define a `densityRatio` flag relative
-     * to this value.
-     *
-     * unit: ELEMENTS/m^3
-     */
-    constexpr float_64 BASE_DENSITY_SI = 1.e25;
-} // namespace SI
-
-namespace densityProfiles
-{
-    /** Profile Formula:
-     *   `const float_X exponent = abs((y - gasCenter_SI) / gasSigma_SI);`
-     *   `const float_X density = exp(gasFactor * pow(exponent, gasPower));`
-     *
-     *   takes `gasCenterLeft_SI      for y < gasCenterLeft_SI`,
-     *         `gasCenterRight_SI     for y > gasCenterRight_SI`,
-     *   and `exponent = 0.0 for gasCenterLeft_SI < y < gasCenterRight_SI`
-     */
-    PMACC_STRUCT(GaussianParam,
-        /** ...
-         */
-        (PMACC_C_VALUE(float_X, gasFactor, -1.0))
-        (PMACC_C_VALUE(float_X, gasPower, 4.0))
-
-        /** height of vacuum area on top border
+    namespace SI
+    {
+        /** Base density in particles per m^3 in the density profiles.
          *
-         *  this vacuum is important because of the laser initialization,
-         *  which is done in the first cells of the simulation and
-         *  assumes a charge-free volume
-         *  unit: cells
-         */
-        (PMACC_C_VALUE(uint32_t, vacuumCellsY, 50))
-
-        /** The central position of the distribution
-          *  unit: meter
-          */
-        (PMACC_C_VALUE(float_64, gasCenterLeft_SI, 4.62e-5))
-        (PMACC_C_VALUE(float_64, gasCenterRight_SI, 4.62e-5))
-
-        /** the distance from gasCenter_SI until the gas density decreases to its 1/e-th part
-          *  unit: meter
-          */
-        (PMACC_C_VALUE(float_64, gasSigmaLeft_SI, 4.62e-5))
-        (PMACC_C_VALUE(float_64, gasSigmaRight_SI, 4.62e-5))
-    ); /* struct GaussianParam */
-
-    /* definition of density profile with gaussian profile */
-    using Gaussian = GaussianImpl< GaussianParam >;
-
-
-    /* definition of homogenous profile */
-    using Homogenous = HomogenousImpl;
-
-
-    /** parameter for `LinearExponential` profile
-     *
-     * @verbatim
-     * Density Profile: /\
-     *                 /  -,_
-     *   linear       /      -,_    exponential
-     *   slope       /  |       -,_ slope
-     *                  MAX
-     * @endverbatim
-     */
-    PMACC_STRUCT(LinearExponentialParam,
-        /** height of vacuum area on top border
+         * This is often taken as reference maximum density in normalized profiles.
+         * Individual particle species can define a `densityRatio` flag relative
+         * to this value.
          *
-         * this vacuum is important because of the laser initialization,
-         * which is done in the first cells of the simulation and
-         * assumes a charge-free volume
-         * unit: cells
-         */
-        (PMACC_C_VALUE(uint32_t, vacuumCellsY, 50))
-
-        /** Y-Position where the linear slope ends and the exponential slope
-         *  begins
-         *  unit: meter
-         */
-        (PMACC_C_VALUE(float_64, gasYMax_SI, 1.0e-3))
-
-        /** Parameters for the linear slope:
-         *  For Y <= gasYMax_SI:
-         *    \rho / BASE_DENSITY = A * Y + B
-         *                        = element [0.0; 1.0]
-         *  unit for A: 1/m
-         *  unit for B: none
+         * unit: ELEMENTS/m^3
          */
-        (PMACC_C_VALUE(float_64, gasA_SI, 1.0e-3))
-
-        /** Parameters for the exponential slope
-         *  For Y > gasYMax_SI:
-         *    let Y' = Y - gasYMax_SI
-         *    \rho = exp[ - Y' * D ]
-         *         = element [0.0; 1.0]
-         *  unit: 1/m
-         */
-        (PMACC_C_VALUE(float_64, gasD_SI, 1.0e-3))
-
-        (PMACC_C_VALUE(float_64, gasB, 0.0))
-    ); /* struct LinearExponentialParam */
-
-    /* definition of gas with linear start slop and exponential end slope */
-    using LinearExponential = LinearExponentialImpl< LinearExponentialParam >;
+        constexpr float_64 BASE_DENSITY_SI = 1.e25;
+    } // namespace SI
 
-
-    PMACC_STRUCT(GaussianCloudParam,
+    namespace densityProfiles
+    {
         /** Profile Formula:
-         *     exponent = |globalCellPos - center| / sigma
-         *     density = e^[ gasFactor * exponent^gasPower ]
-         */
-        (PMACC_C_VALUE(float_X, gasFactor, -0.5))
-        (PMACC_C_VALUE(float_X, gasPower, 2.0))
-
-        /** height of vacuum area on top border
-         *
-         * this vacuum is important because of the laser initialization,
-         * which is done in the first cells of the simulation and
-         * assumes a charge-free volume
-         * unit: cells
-         */
-        (PMACC_C_VALUE(uint32_t, vacuumCellsY, 50))
-
-        /** The central position of the gas distribution
-         *  unit: meter
-         */
-        (PMACC_C_VECTOR_DIM(float_64, simDim, center_SI, 1.134e-5, 1.134e-5, 1.134e-5))
-
-        /** the distance from gasCenter_SI until the gas density decreases to its 1/e-th part
-          *  unit: meter */
-        (PMACC_C_VECTOR_DIM(float_64, simDim, sigma_SI, 7.0e-6, 7.0e-6, 7.0e-6))
-    ); /* struct GaussianCloudParam */
-
-    /* definition of cloud profile */
-    using GaussianCloud = GaussianCloudImpl< GaussianCloudParam >;
-
-
-    /** The profile consists out of the composition of 3 1D profiles
-     *  with the scheme: exponential increasing flank, constant sphere,
-     *                   exponential decreasing flank
-     * @verbatim
-     *           ___
-     *  1D:  _,./   \.,_   rho(r)
-     *
-     *  2D:  ..,x,..   density: . low
-     *       .,xxx,.            , middle
-     *       ..,x,..            x high (constant)
-     * @endverbatim
-     */
-    PMACC_STRUCT(SphereFlanksParam,
-        /** height of vacuum area on top border
+         *   `const float_X exponent = abs((y - gasCenter_SI) / gasSigma_SI);`
+         *   `const float_X density = exp(gasFactor * pow(exponent, gasPower));`
          *
-         * this vacuum is important because of the laser initialization,
-         * which is done in the first cells of the simulation and
-         * assumes a charge-free volume
-         * unit: cells
-         */
-        (PMACC_C_VALUE(uint32_t, vacuumCellsY, 50))
-
-        /** Radius of the constant sphere
-         *  unit: meter
-         */
-        (PMACC_C_VALUE(float_64, r_SI, 1.0e-3))
-
-        /** Inner radius if you want to build a shell/ring
-         *  unit: meter
-         */
-        (PMACC_C_VALUE(float_64, ri_SI, 0.0))
-
-        /** Middle of the constant sphere
-         *  unit: meter
-         */
-        (PMACC_C_VECTOR_DIM(float_64, simDim, center_SI, 8.0e-3, 8.0e-3, 8.0e-3))
-
-        /** Parameters for the exponential slope
-         *  For distance > r_SI:
-         *    let distance' = distance - r
-         *    \rho = exp[ - distance' * exponent ]
-         *  unit: 1/m
+         *   takes `gasCenterLeft_SI      for y < gasCenterLeft_SI`,
+         *         `gasCenterRight_SI     for y > gasCenterRight_SI`,
+         *   and `exponent = 0.0 for gasCenterLeft_SI < y < gasCenterRight_SI`
          */
-        (PMACC_C_VALUE(float_64, exponent_SI, 1.0e3))
-
-    ); /* struct SphereFlanksParam */
+        PMACC_STRUCT(
+            GaussianParam,
+            /** ...
+             */
+            (PMACC_C_VALUE(float_X, gasFactor, -1.0))(PMACC_C_VALUE(float_X, gasPower, 4.0))
 
-    /* definition of sphere profile with flanks */
-    using SphereFlanks = SphereFlanksImpl<SphereFlanksParam>;
+            /** height of vacuum area on top border
+             *
+             *  this vacuum is important because of the laser initialization,
+             *  which is done in the first cells of the simulation and
+             *  assumes a charge-free volume
+             *  unit: cells
+             */
+            (PMACC_C_VALUE(uint32_t, vacuumCellsY, 50))
 
+            /** The central position of the distribution
+             *  unit: meter
+             */
+            (PMACC_C_VALUE(float_64, gasCenterLeft_SI, 4.62e-5))(PMACC_C_VALUE(float_64, gasCenterRight_SI, 4.62e-5))
 
-    PMACC_STRUCT(FromHDF5Param,
-        /* prefix of filename
-         * full file name: gas_0.h5
-         * filename = "gas"
-         * iteration = 0
-         */
-        (PMACC_C_STRING(filename,"gas"))
+            /** the distance from gasCenter_SI until the gas density decreases to its 1/e-th part
+             *  unit: meter
+             */
+            (PMACC_C_VALUE(float_64, gasSigmaLeft_SI, 4.62e-5))(
+                PMACC_C_VALUE(float_64, gasSigmaRight_SI, 4.62e-5))); /* struct GaussianParam */
 
-        (PMACC_C_STRING(datasetName,"fields/e_chargeDensity"))
+        /* definition of density profile with gaussian profile */
+        using Gaussian = GaussianImpl<GaussianParam>;
 
-        /* simulation step*/
-        (PMACC_C_VALUE(uint32_t, iteration, 0))
-        (PMACC_C_VALUE(float_X, defaultDensity, 0.0))
-    ); /* struct FromHDF5Param */
 
-    /* definition of cloud profile */
-    using FromHDF5 = FromHDF5Impl< FromHDF5Param >;
+        /* definition of homogenous profile */
+        using Homogenous = HomogenousImpl;
 
 
-    struct FreeFormulaFunctor
-    {
-        /** This formula uses SI quantities only.
-         *  The profile will be multiplied by BASE_DENSITY_SI.
+        /** parameter for `LinearExponential` profile
          *
-         * @param position_SI total offset including all slides [meter]
-         * @param cellSize_SI cell sizes [meter]
+         * @verbatim
+         * Density Profile: /\
+         *                 /  -,_
+         *   linear       /      -,_    exponential
+         *   slope       /  |       -,_ slope
+         *                  MAX
+         * @endverbatim
+         */
+        PMACC_STRUCT(
+            LinearExponentialParam,
+            /** height of vacuum area on top border
+             *
+             * this vacuum is important because of the laser initialization,
+             * which is done in the first cells of the simulation and
+             * assumes a charge-free volume
+             * unit: cells
+             */
+            (PMACC_C_VALUE(uint32_t, vacuumCellsY, 50))
+
+            /** Y-Position where the linear slope ends and the exponential slope
+             *  begins
+             *  unit: meter
+             */
+            (PMACC_C_VALUE(float_64, gasYMax_SI, 1.0e-3))
+
+            /** Parameters for the linear slope:
+             *  For Y <= gasYMax_SI:
+             *    \rho / BASE_DENSITY = A * Y + B
+             *                        = element [0.0; 1.0]
+             *  unit for A: 1/m
+             *  unit for B: none
+             */
+            (PMACC_C_VALUE(float_64, gasA_SI, 1.0e-3))
+
+            /** Parameters for the exponential slope
+             *  For Y > gasYMax_SI:
+             *    let Y' = Y - gasYMax_SI
+             *    \rho = exp[ - Y' * D ]
+             *         = element [0.0; 1.0]
+             *  unit: 1/m
+             */
+            (PMACC_C_VALUE(float_64, gasD_SI, 1.0e-3))
+
+                (PMACC_C_VALUE(float_64, gasB, 0.0))); /* struct LinearExponentialParam */
+
+        /* definition of gas with linear start slope and exponential end slope */
+        using LinearExponential = LinearExponentialImpl<LinearExponentialParam>;
+
+
+        PMACC_STRUCT(
+            GaussianCloudParam,
+            /** Profile Formula:
+             *     exponent = |globalCellPos - center| / sigma
+             *     density = e^[ gasFactor * exponent^gasPower ]
+             */
+            (PMACC_C_VALUE(float_X, gasFactor, -0.5))(PMACC_C_VALUE(float_X, gasPower, 2.0))
+
+            /** height of vacuum area on top border
+             *
+             * this vacuum is important because of the laser initialization,
+             * which is done in the first cells of the simulation and
+             * assumes a charge-free volume
+             * unit: cells
+             */
+            (PMACC_C_VALUE(uint32_t, vacuumCellsY, 50))
+
+            /** The central position of the gas distribution
+             *  unit: meter
+             */
+            (PMACC_C_VECTOR_DIM(float_64, simDim, center_SI, 1.134e-5, 1.134e-5, 1.134e-5))
+
+            /** the distance from center_SI until the gas density decreases to its 1/e-th part
+             *  unit: meter */
+            (PMACC_C_VECTOR_DIM(float_64, simDim, sigma_SI, 7.0e-6, 7.0e-6, 7.0e-6))); /* struct GaussianCloudParam */
+
+        /* definition of cloud profile */
+        using GaussianCloud = GaussianCloudImpl<GaussianCloudParam>;
+
+
+        /** The profile consists of the composition of 3 1D profiles
+         *  with the scheme: exponential increasing flank, constant sphere,
+         *                   exponential decreasing flank
+         * @verbatim
+         *           ___
+         *  1D:  _,./   \.,_   rho(r)
          *
-         * @return float_X density [normalized to 1.0]
+         *  2D:  ..,x,..   density: . low
+         *       .,xxx,.            , middle
+         *       ..,x,..            x high (constant)
+         * @endverbatim
          */
-        HDINLINE float_X
-        operator()(
-            const floatD_64& position_SI,
-            const float3_64& cellSize_SI
-        )
+        PMACC_STRUCT(
+            SphereFlanksParam,
+            /** height of vacuum area on top border
+             *
+             * this vacuum is important because of the laser initialization,
+             * which is done in the first cells of the simulation and
+             * assumes a charge-free volume
+             * unit: cells
+             */
+            (PMACC_C_VALUE(uint32_t, vacuumCellsY, 50))
+
+            /** Radius of the constant sphere
+             *  unit: meter
+             */
+            (PMACC_C_VALUE(float_64, r_SI, 1.0e-3))
+
+            /** Inner radius if you want to build a shell/ring
+             *  unit: meter
+             */
+            (PMACC_C_VALUE(float_64, ri_SI, 0.0))
+
+            /** Middle of the constant sphere
+             *  unit: meter
+             */
+            (PMACC_C_VECTOR_DIM(float_64, simDim, center_SI, 8.0e-3, 8.0e-3, 8.0e-3))
+
+            /** Parameters for the exponential slope
+             *  For distance > r_SI:
+             *    let distance' = distance - r
+             *    \rho = exp[ - distance' * exponent ]
+             *  unit: 1/m
+             */
+            (PMACC_C_VALUE(float_64, exponent_SI, 1.0e3))
+
+        ); /* struct SphereFlanksParam */
+
+        /* definition of sphere profile with flanks */
+        using SphereFlanks = SphereFlanksImpl<SphereFlanksParam>;
+
+
+        PMACC_STRUCT(
+            FromHDF5Param,
+            /* prefix of filename
+             * full file name: gas_0.h5
+             * filename = "gas"
+             * iteration = 0
+             */
+            (PMACC_C_STRING(filename, "gas"))
+
+                (PMACC_C_STRING(datasetName, "fields/e_chargeDensity"))
+
+            /* simulation step*/
+            (PMACC_C_VALUE(uint32_t, iteration, 0))(
+                PMACC_C_VALUE(float_X, defaultDensity, 0.0))); /* struct FromHDF5Param */
+
+        /* definition of density profile read from an HDF5 file */
+        using FromHDF5 = FromHDF5Impl<FromHDF5Param>;
+
+
+        struct FreeFormulaFunctor
         {
-            const float_64 y( position_SI.y() * 1000.0 ); // m -> mm
-            //const uint64_t y_cell_id( uint64_t(position_SI.y() / cellSize_SI[1]) );
-
-            /* triangle function example
-             * for a density profile from 0 to 400 microns */
-            float_X s = 1.0_X - 5.0_X * math::abs( y - 0.2_X );
-
-            /* give it an empty/filled striping for every second cell */
-            //s *= float_X( (y_cell_id % 2) == 0 );
-
-            /* all parts of the function MUST be > 0 */
-            s *= float_X( s >= 0.0 );
-            return s;
-        }
-    };
-
-    /* definition of free formula profile */
-    using FreeFormula = FreeFormulaImpl< FreeFormulaFunctor >;
-} // namespace densityProfiles
+            /** This formula uses SI quantities only.
+             *  The profile will be multiplied by BASE_DENSITY_SI.
+             *
+             * @param position_SI total offset including all slides [meter]
+             * @param cellSize_SI cell sizes [meter]
+             *
+             * @return float_X density [normalized to 1.0]
+             */
+            HDINLINE float_X operator()(const floatD_64& position_SI, const float3_64& cellSize_SI)
+            {
+                const float_64 y(position_SI.y() * 1000.0); // m -> mm
+                // const uint64_t y_cell_id( uint64_t(position_SI.y() / cellSize_SI[1]) );
+
+                /* triangle function example
+                 * for a density profile from 0 to 400 microns */
+                float_X s = 1.0_X - 5.0_X * math::abs(y - 0.2_X);
+
+                /* give it an empty/filled striping for every second cell */
+                // s *= float_X( (y_cell_id % 2) == 0 );
+
+                /* all parts of the function MUST be >= 0 (negative values are clamped to zero) */
+                s *= float_X(s >= 0.0);
+                return s;
+            }
+        };
+
+        /* definition of free formula profile */
+        using FreeFormula = FreeFormulaImpl<FreeFormulaFunctor>;
+    } // namespace densityProfiles
 } // namespace picongpu
diff --git a/include/picongpu/param/dimension.param b/include/picongpu/param/dimension.param
index 081417d0bf..a05b7ade06 100644
--- a/include/picongpu/param/dimension.param
+++ b/include/picongpu/param/dimension.param
@@ -1,4 +1,4 @@
-/* Copyright 2014-2020 Axel Huebl
+/* Copyright 2014-2021 Axel Huebl
  *
  * This file is part of PIConGPU.
  *
diff --git a/include/picongpu/param/fieldBackground.param b/include/picongpu/param/fieldBackground.param
index 357f97801a..7cfdaff60f 100644
--- a/include/picongpu/param/fieldBackground.param
+++ b/include/picongpu/param/fieldBackground.param
@@ -1,4 +1,4 @@
-/* Copyright 2014-2020 Axel Huebl, Alexander Debus, Richard Pausch
+/* Copyright 2014-2021 Axel Huebl, Alexander Debus, Richard Pausch
  *
  * This file is part of PIConGPU.
  *
@@ -36,16 +36,15 @@ namespace picongpu
         /* We use this to calculate your SI input back to our unit system */
         PMACC_ALIGN(m_unitField, const float3_64);
 
-        HDINLINE FieldBackgroundE( const float3_64 unitField ) : m_unitField(unitField)
-        {}
+        HDINLINE FieldBackgroundE(const float3_64 unitField) : m_unitField(unitField)
+        {
+        }
 
         /** Specify your background field E(r,t) here
          *
          * \param cellIdx The total cell id counted from the start at t = 0
          * \param currentStep The current time step */
-        HDINLINE float3_X
-        operator()( const DataSpace<simDim>& cellIdx,
-                    const uint32_t currentStep ) const
+        HDINLINE float3_X operator()(const DataSpace<simDim>& cellIdx, const uint32_t currentStep) const
         {
             /* example: periodicity of 20 microns ( = 2.0e-5 m) */
             constexpr float_64 period_SI(20.0e-6);
@@ -55,8 +54,8 @@ namespace picongpu
              *       multiplying with DELTA_T_SI */
 
             /* specify your E-Field in V/m and convert to PIConGPU units */
-            const float_X sinArg = precisionCast<float_X>( y_SI / period_SI * 2.0 * PI );
-            return float3_X(0.0, math::sin( sinArg ) / m_unitField[1], 0.0);
+            const float_X sinArg = precisionCast<float_X>(y_SI / period_SI * 2.0 * PI);
+            return float3_X(0.0, math::sin(sinArg) / m_unitField[1], 0.0);
         }
     };
 
@@ -69,16 +68,15 @@ namespace picongpu
         /* We use this to calculate your SI input back to our unit system */
         PMACC_ALIGN(m_unitField, const float3_64);
 
-        HDINLINE FieldBackgroundB( const float3_64 unitField ) : m_unitField(unitField)
-        {}
+        HDINLINE FieldBackgroundB(const float3_64 unitField) : m_unitField(unitField)
+        {
+        }
 
         /** Specify your background field B(r,t) here
          *
          * \param cellIdx The total cell id counted from the start at t=0
          * \param currentStep The current time step */
-        HDINLINE float3_X
-        operator()( const DataSpace<simDim>& cellIdx,
-                    const uint32_t currentStep ) const
+        HDINLINE float3_X operator()(const DataSpace<simDim>& cellIdx, const uint32_t currentStep) const
         {
             /* example: periodicity of 20 microns ( = 2.0e-5 m) */
             constexpr float_64 period_SI(20.0e-6);
@@ -88,8 +86,8 @@ namespace picongpu
              *       multiplying with DELTA_T_SI */
 
             /* specify your B-Field in T and convert to PIConGPU units */
-            const float_X sinArg = precisionCast<float_X>( y_SI / period_SI * 2.0 * PI );
-            return float3_X(0.0, math::cos( sinArg ) / m_unitField[1], 0.0);
+            const float_X sinArg = precisionCast<float_X>(y_SI / period_SI * 2.0 * PI);
+            return float3_X(0.0, math::cos(sinArg) / m_unitField[1], 0.0);
         }
     };
 
@@ -102,16 +100,15 @@ namespace picongpu
         /* We use this to calculate your SI input back to our unit system */
         PMACC_ALIGN(m_unitField, const float3_64);
 
-        HDINLINE FieldBackgroundJ( const float3_64 unitField ) : m_unitField(unitField)
-        {}
+        HDINLINE FieldBackgroundJ(const float3_64 unitField) : m_unitField(unitField)
+        {
+        }
 
         /** Specify your background field J(r,t) here
          *
          * \param cellIdx The total cell id counted from the start at t=0
          * \param currentStep The current time step */
-        HDINLINE float3_X
-        operator()( const DataSpace<simDim>& cellIdx,
-                    const uint32_t currentStep ) const
+        HDINLINE float3_X operator()(const DataSpace<simDim>& cellIdx, const uint32_t currentStep) const
         {
             /* example: periodicity of 20 microns ( = 2.0e-5 m) */
             constexpr float_64 period_SI(20.0e-6);
@@ -121,8 +118,8 @@ namespace picongpu
              *       multiplying with DELTA_T_SI */
 
             /* specify your J-Field in A/m^2 and convert to PIConGPU units */
-            const float_X sinArg = precisionCast<float_X>( y_SI / period_SI * 2.0 * PI );
-            return float3_X(0.0, math::cos( sinArg ) / m_unitField[1], 0.0);
+            const float_X sinArg = precisionCast<float_X>(y_SI / period_SI * 2.0 * PI);
+            return float3_X(0.0, math::cos(sinArg) / m_unitField[1], 0.0);
         }
     };
 
diff --git a/include/picongpu/param/fieldSolver.param b/include/picongpu/param/fieldSolver.param
index a516aacaad..9955e8919c 100644
--- a/include/picongpu/param/fieldSolver.param
+++ b/include/picongpu/param/fieldSolver.param
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Heiko Burau, Rene Widera
+/* Copyright 2013-2021 Axel Huebl, Heiko Burau, Rene Widera, Sergei Bastrakov, Klaus Steiniger
  *
  * This file is part of PIConGPU.
  *
@@ -25,6 +25,11 @@
  *
  * Also allows to configure ad hoc mitigations for high frequency
  * noise in some setups via current smoothing.
+ *
+ * \attention
+ * Currently, the laser initialization in PIConGPU is implemented to work with the standard Yee solver.
+ * Using a solver of higher order will result in a slightly higher laser amplitude and energy than expected.
+ *
  */
 
 #pragma once
@@ -35,38 +40,43 @@
 
 namespace picongpu
 {
-namespace fields
-{
-
-    /** Current Interpolation
-     *
-     * CurrentInterpolation is used to set a method performing the
-     * interpolate/assign operation from the generated currents of particle
-     * species to the electro-magnetic fields.
-     *
-     * Allowed values are:
-     *   - None:
-     *     - default for staggered grids/Yee-scheme
-     *     - updates E
-     *   - Binomial: 2nd order Binomial filter
-     *     - smooths the current before assignment in staggered grid
-     *     - updates E & breaks local charge conservation slightly
-     *   - NoneDS:
-     *     - experimental assignment for all-centered/directional splitting
-     *     - updates E & B at the same time
-     */
-    using CurrentInterpolation = currentInterpolation::None;
+    namespace fields
+    {
+        /** Current Interpolation
+         *
+         * CurrentInterpolation is used to set a method performing the
+         * interpolate/assign operation from the generated currents of particle
+         * species to the electro-magnetic fields.
+         *
+         * Allowed values are:
+         *   - None:
+         *     - default for staggered grids/Yee-scheme
+         *     - updates E
+         *   - Binomial: 2nd order Binomial filter
+         *     - smooths the current before assignment in staggered grid
+         *     - updates E & breaks local charge conservation slightly
+         */
+        using CurrentInterpolation = currentInterpolation::None;
 
-    /** FieldSolver
-     *
-     * Field Solver Selection:
-     *  - Yee< CurrentInterpolation > : standard Yee solver
-     *  - YeePML< CurrentInterpolation >: standard Yee solver with PML absorber
-     *  - Lehe< CurrentInterpolation >: Num. Cherenkov free field solver in a chosen direction
-     *  - DirSplitting< CurrentInterpolation >: Sentoku's Directional Splitting Method
-     *  - None< CurrentInterpolation >: disable the vacuum update of E and B
-     */
-    using Solver = maxwellSolver::Yee< CurrentInterpolation >;
+        /** FieldSolver
+         *
+         * Field Solver Selection:
+         *  - Yee< CurrentInterpolation > : Standard Yee solver approximating derivatives with respect to time and
+         * space by second order finite differences.
+         *  - YeePML< CurrentInterpolation >: Standard Yee solver using Perfectly Matched Layer Absorbing Boundary
+         * Conditions (PML)
+         *  - Lehe< CurrentInterpolation >: Numerical-Cherenkov-free field solver in a chosen direction
+         *  - LehePML< CurrentInterpolation >: Numerical-Cherenkov-free field solver in a chosen direction
+         *                                     using Perfectly Matched Layer Absorbing Boundary Conditions (PML)
+         *  - ArbitraryOrderFDTD< 4, CurrentInterpolation >: Solver using 4 neighbors in each direction to approximate
+         * *spatial* derivatives by finite differences. The number of neighbors can be changed from 4 to any positive
+         * integer. The order of the solver will be twice the number of neighbors in each direction. Yee's
+         * method is a special case of this solver, using one neighbor in each direction.
+         *  - ArbitraryOrderFDTDPML< 4, CurrentInterpolation >: ArbitraryOrderFDTD solver using Perfectly Matched Layer
+         *                                                      Absorbing Boundary Conditions (PML)
+         *  - None< CurrentInterpolation >: disable the vacuum update of E and B
+         */
+        using Solver = maxwellSolver::Yee<CurrentInterpolation>;
 
-} // namespace fields
+    } // namespace fields
 } // namespace picongpu
diff --git a/include/picongpu/param/fileOutput.param b/include/picongpu/param/fileOutput.param
index cdc412710d..cb9a269464 100644
--- a/include/picongpu/param/fileOutput.param
+++ b/include/picongpu/param/fileOutput.param
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Rene Widera, Felix Schmitt,
+/* Copyright 2013-2021 Axel Huebl, Rene Widera, Felix Schmitt,
  *                     Benjamin Worpitz, Richard Pausch
  *
  * This file is part of PIConGPU.
@@ -63,49 +63,33 @@ namespace picongpu
     namespace deriveField = particles::particleToGrid;
 
     /* ChargeDensity section */
-    using ChargeDensity_Seq = deriveField::CreateEligible_t<
-        VectorAllSpecies,
-        deriveField::derivedAttributes::ChargeDensity
-    >;
+    using ChargeDensity_Seq
+        = deriveField::CreateEligible_t<VectorAllSpecies, deriveField::derivedAttributes::ChargeDensity>;
 
     /* EnergyDensity section */
-    using EnergyDensity_Seq = deriveField::CreateEligible_t<
-        VectorAllSpecies,
-        deriveField::derivedAttributes::EnergyDensity
-    >;
+    using EnergyDensity_Seq
+        = deriveField::CreateEligible_t<VectorAllSpecies, deriveField::derivedAttributes::EnergyDensity>;
 
     /* MomentumComponentsection: define "component" as
        0=X (default), 1=Y or 2=Z (results: [-1.:1.])
      */
-    using MomentumComponent_Seq = deriveField::CreateEligible_t<
-        VectorAllSpecies,
-        deriveField::derivedAttributes::MomentumComponent< 0 >
-    >;
+    using MomentumComponent_Seq
+        = deriveField::CreateEligible_t<VectorAllSpecies, deriveField::derivedAttributes::MomentumComponent<0>>;
 
     /** FieldTmpSolvers groups all solvers that create data for FieldTmp ******
      *
      * FieldTmpSolvers is used in @see FieldTmp to calculate the exchange size
      */
-    using FieldTmpSolvers = MakeSeq_t<
-        ChargeDensity_Seq,
-        EnergyDensity_Seq,
-        MomentumComponent_Seq
-    >;
+    using FieldTmpSolvers = MakeSeq_t<ChargeDensity_Seq, EnergyDensity_Seq, MomentumComponent_Seq>;
 
 
     /** FileOutputFields: Groups all Fields that shall be dumped *************/
 
     /** Possible native fields: FieldE, FieldB, FieldJ
      */
-    using NativeFileOutputFields = MakeSeq_t<
-        FieldE,
-        FieldB
-    >;
+    using NativeFileOutputFields = MakeSeq_t<FieldE, FieldB>;
 
-    using FileOutputFields = MakeSeq_t<
-        NativeFileOutputFields,
-        FieldTmpSolvers
-    >;
+    using FileOutputFields = MakeSeq_t<NativeFileOutputFields, FieldTmpSolvers>;
 
 
     /** FileOutputParticles: Groups all Species that shall be dumped **********
@@ -115,4 +99,4 @@ namespace picongpu
      */
     using FileOutputParticles = VectorAllSpecies;
 
-}
+} // namespace picongpu
diff --git a/include/picongpu/param/flylite.param b/include/picongpu/param/flylite.param
index f4adb10a64..85c8182858 100644
--- a/include/picongpu/param/flylite.param
+++ b/include/picongpu/param/flylite.param
@@ -1,4 +1,4 @@
-/* Copyright 2017-2020 Axel Huebl
+/* Copyright 2017-2021 Axel Huebl
  *
  * This file is part of PIConGPU.
  *
@@ -41,54 +41,51 @@
 
 namespace picongpu
 {
-namespace flylite
-{
-    /** number of populations (numpop)
-     *
-     * this number defines how many configurations make up a superconfiguration
-     *
-     * range: [0, 255]
-     */
-    constexpr uint8_t populations = 3u; // example Cu data set: 32u
+    namespace flylite
+    {
+        /** number of populations (numpop)
+         *
+         * this number defines how many configurations make up a superconfiguration
+         *
+         * range: [0, 255]
+         */
+        constexpr uint8_t populations = 3u; // example Cu data set: 32u
 
-    using Superconfig = types::Superconfig<
-        float_64,
-        populations
-    >;
+        using Superconfig = types::Superconfig<float_64, populations>;
 
-    /** ionization states of the atom (iz)
-     *
-     * range: [0, 255]
-     */
-    constexpr uint8_t ionizationStates = 29u;
+        /** ionization states of the atom (iz)
+         *
+         * range: [0, 255]
+         */
+        constexpr uint8_t ionizationStates = 29u;
 
-    /** number of energy bins
-     *
-     * energy steps used for local energy histograms
-     * @note: no overflow- or underflow-bins are used, particles with energies
-     *        outside the range (see below) are ignored
-     */
-    constexpr uint16_t energies = 512u;
+        /** number of energy bins
+         *
+         * energy steps used for local energy histograms
+         * @note: no overflow- or underflow-bins are used, particles with energies
+         *        outside the range (see below) are ignored
+         */
+        constexpr uint16_t energies = 512u;
 
-    /** energy range for electron and photon histograms
-     *
-     * electron and photon histograms f(e) f(ph) are currently
-     * calculated in a linearly binned histogram while particles with
-     * energies outside the ranges below are ignored
-     *
-     * unit: eV
-     */
-    constexpr float_X electronMinEnergy = 0.0;
-    constexpr float_X electronMaxEnergy = 100.e3;
-    constexpr float_X photonMinEnergy = 0.0;
-    constexpr float_X photonMaxEnergy = 100.e3;
+        /** energy range for electron and photon histograms
+         *
+         * electron and photon histograms f(e) f(ph) are currently
+         * calculated in a linearly binned histogram while particles with
+         * energies outside the ranges below are ignored
+         *
+         * unit: eV
+         */
+        constexpr float_X electronMinEnergy = 0.0;
+        constexpr float_X electronMaxEnergy = 100.e3;
+        constexpr float_X photonMinEnergy = 0.0;
+        constexpr float_X photonMaxEnergy = 100.e3;
 
-    /** you better not change this line, the wooooorld depends on it!
-     *
-     * no seriously, per-supercell is the quickest way to average particle
-     * quantities such as density, energy histogram, etc. and I won't implement
-     * another size until needed
-     */
-    using spatialAverageBox = SuperCellSize;
-} // namespace flylite
+        /** you better not change this line, the wooooorld depends on it!
+         *
+         * no seriously, per-supercell is the quickest way to average particle
+         * quantities such as density, energy histogram, etc. and I won't implement
+         * another size until needed
+         */
+        using spatialAverageBox = SuperCellSize;
+    } // namespace flylite
 } // namespace picongpu
diff --git a/include/picongpu/param/grid.param b/include/picongpu/param/grid.param
index 8b06404d69..8813496202 100644
--- a/include/picongpu/param/grid.param
+++ b/include/picongpu/param/grid.param
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Rene Widera, Benjamin Worpitz
+/* Copyright 2013-2021 Axel Huebl, Rene Widera, Benjamin Worpitz
  *
  * This file is part of PIConGPU.
  *
@@ -68,9 +68,9 @@ namespace picongpu
      *  unit: none
      */
     constexpr uint32_t ABSORBER_CELLS[3][2] = {
-        {32, 32},  /*x direction [negative,positive]*/
-        {32, 32},  /*y direction [negative,positive]*/
-        {32, 32}   /*z direction [negative,positive]*/
+        {32, 32}, /*x direction [negative,positive]*/
+        {32, 32}, /*y direction [negative,positive]*/
+        {32, 32} /*z direction [negative,positive]*/
     };
 
     /** Define the strength of the absorber for any direction
@@ -80,7 +80,7 @@ namespace picongpu
     constexpr float_X ABSORBER_STRENGTH[3][2] = {
         {1.0e-3, 1.0e-3}, /*x direction [negative,positive]*/
         {1.0e-3, 1.0e-3}, /*y direction [negative,positive]*/
-        {1.0e-3, 1.0e-3}  /*z direction [negative,positive]*/
+        {1.0e-3, 1.0e-3} /*z direction [negative,positive]*/
     };
 
     /** When to move the co-moving window.
@@ -101,4 +101,3 @@ namespace picongpu
     constexpr float_64 movePoint = 0.9;
 
 } // namespace picongpu
-
diff --git a/include/picongpu/param/ionizationEnergies.param b/include/picongpu/param/ionizationEnergies.param
index 303ba7b68b..5b12c5b534 100644
--- a/include/picongpu/param/ionizationEnergies.param
+++ b/include/picongpu/param/ionizationEnergies.param
@@ -1,4 +1,4 @@
-/* Copyright 2014-2020 Marco Garten, Axel Huebl
+/* Copyright 2014-2021 Marco Garten, Axel Huebl
  *
  * This file is part of PIConGPU.
  *
@@ -32,246 +32,253 @@
 
 namespace picongpu
 {
-namespace ionization
-{
-/** Ionization potentials
- *
- * Please follow these rules for defining ionization energies of atomic species,
- * unless your chosen ionization model requires a different unit system than `AU::`
- *     - input of values in either atomic units or converting eV or Joule to them
- *     -> use either UNITCONV_eV_to_AU or SI::ATOMIC_UNIT_ENERGY for that purpose
- *     - use `float_X` as the preferred data type
- *
- * example:
- *     ionization energy for ground state hydrogen: 13.6 eV
- *     1 Joule       = 1 kg * m^2 / s^2
- *     1 eV          = 1.602e-19 J
- *
- *     1 AU (energy) = 27.2 eV
- *                   = 1 Hartree
- *                   = 4.36e-18 J
- *                   = 2 Rydberg
- *                   = 2 x Hydrogen ground state binding energy
- *
- * Atomic units are useful for ionization models because they simplify the
- * formulae greatly and provide intuitively understandable relations to a
- * well-known system, i.e. the Hydrogen atom.
- *
- * for PMACC_CONST_VECTOR usage,
- * @see include/pmacc/math/ConstVector.hpp
- * for finding ionization energies,
- * @url http://physics.nist.gov/PhysRefData/ASD/ionEnergy.html
- *
- * Reference: Kramida, A., Ralchenko, Yu., Reader, J., and NIST ASD Team (2014)
- *            NIST Atomic Spectra Database (ver. 5.2), [Online]
- *            Available: http://physics.nist.gov/asd [2017, February 8]
- *            National Institute of Standards and Technology, Gaithersburg, MD
- */
-namespace energies
-{
-namespace AU
-{
-    /* ionization energy for ground state hydrogen in atomic units */
-    PMACC_CONST_VECTOR(float_X, 1, Hydrogen,
-        13.59843 * UNITCONV_eV_to_AU
-    );
+    namespace ionization
+    {
+        /** Ionization potentials
+         *
+         * Please follow these rules for defining ionization energies of atomic species,
+         * unless your chosen ionization model requires a different unit system than `AU::`
+         *     - input of values in either atomic units or converting eV or Joule to them
+         *     -> use either UNITCONV_eV_to_AU or SI::ATOMIC_UNIT_ENERGY for that purpose
+         *     - use `float_X` as the preferred data type
+         *
+         * example:
+         *     ionization energy for ground state hydrogen: 13.6 eV
+         *     1 Joule       = 1 kg * m^2 / s^2
+         *     1 eV          = 1.602e-19 J
+         *
+         *     1 AU (energy) = 27.2 eV
+         *                   = 1 Hartree
+         *                   = 4.36e-18 J
+         *                   = 2 Rydberg
+         *                   = 2 x Hydrogen ground state binding energy
+         *
+         * Atomic units are useful for ionization models because they simplify the
+         * formulae greatly and provide intuitively understandable relations to a
+         * well-known system, i.e. the Hydrogen atom.
+         *
+         * for PMACC_CONST_VECTOR usage,
+         * @see include/pmacc/math/ConstVector.hpp
+         * for finding ionization energies,
+         * @url http://physics.nist.gov/PhysRefData/ASD/ionEnergy.html
+         *
+         * Reference: Kramida, A., Ralchenko, Yu., Reader, J., and NIST ASD Team (2014)
+         *            NIST Atomic Spectra Database (ver. 5.2), [Online]
+         *            Available: http://physics.nist.gov/asd [2017, February 8]
+         *            National Institute of Standards and Technology, Gaithersburg, MD
+         */
+        namespace energies
+        {
+            namespace AU
+            {
+                /* ionization energy for ground state hydrogen in atomic units */
+                PMACC_CONST_VECTOR(float_X, 1, Hydrogen, 13.59843 * UNITCONV_eV_to_AU);
 
-    /* ionization energy for ground state deuterium in atomic units */
-    PMACC_CONST_VECTOR(float_X, 1, Deuterium,
-        13.60213 * UNITCONV_eV_to_AU
-    );
+                /* ionization energy for ground state deuterium in atomic units */
+                PMACC_CONST_VECTOR(float_X, 1, Deuterium, 13.60213 * UNITCONV_eV_to_AU);
 
-    /* ionization energy for ground state helium in atomic units */
-    PMACC_CONST_VECTOR(float_X, 2, Helium,
-        24.58739 * UNITCONV_eV_to_AU,
-        54.41776 * UNITCONV_eV_to_AU
-    );
+                /* ionization energy for ground state helium in atomic units */
+                PMACC_CONST_VECTOR(float_X, 2, Helium, 24.58739 * UNITCONV_eV_to_AU, 54.41776 * UNITCONV_eV_to_AU);
 
-    /* ionization energy for carbon in atomic units */
-    PMACC_CONST_VECTOR(float_X, 6, Carbon,
-        11.2603 * UNITCONV_eV_to_AU,
-        24.3845 * UNITCONV_eV_to_AU,
-        47.88778 * UNITCONV_eV_to_AU,
-        64.49351 * UNITCONV_eV_to_AU,
-        392.0905 * UNITCONV_eV_to_AU,
-        489.993177 * UNITCONV_eV_to_AU
-    );
+                /* ionization energy for carbon in atomic units */
+                PMACC_CONST_VECTOR(
+                    float_X,
+                    6,
+                    Carbon,
+                    11.2603 * UNITCONV_eV_to_AU,
+                    24.3845 * UNITCONV_eV_to_AU,
+                    47.88778 * UNITCONV_eV_to_AU,
+                    64.49351 * UNITCONV_eV_to_AU,
+                    392.0905 * UNITCONV_eV_to_AU,
+                    489.993177 * UNITCONV_eV_to_AU);
 
-    /* ionization energy for nitrogen in atomic units */
-    PMACC_CONST_VECTOR(float_X, 7, Nitrogen,
-        14.53413 * UNITCONV_eV_to_AU,
-        29.60125 * UNITCONV_eV_to_AU,
-        47.4453 * UNITCONV_eV_to_AU,
-        77.4735 * UNITCONV_eV_to_AU,
-        97.89013 * UNITCONV_eV_to_AU,
-        552.06731 * UNITCONV_eV_to_AU,
-        667.04609 * UNITCONV_eV_to_AU
-    );
+                /* ionization energy for nitrogen in atomic units */
+                PMACC_CONST_VECTOR(
+                    float_X,
+                    7,
+                    Nitrogen,
+                    14.53413 * UNITCONV_eV_to_AU,
+                    29.60125 * UNITCONV_eV_to_AU,
+                    47.4453 * UNITCONV_eV_to_AU,
+                    77.4735 * UNITCONV_eV_to_AU,
+                    97.89013 * UNITCONV_eV_to_AU,
+                    552.06731 * UNITCONV_eV_to_AU,
+                    667.04609 * UNITCONV_eV_to_AU);
 
-    /* ionization energy for oxygen in atomic units */
-    PMACC_CONST_VECTOR(float_X, 8, Oxygen,
-        13.61805 * UNITCONV_eV_to_AU,
-        35.12112 * UNITCONV_eV_to_AU,
-        54.93554 * UNITCONV_eV_to_AU,
-        77.41350 * UNITCONV_eV_to_AU,
-        113.8989 * UNITCONV_eV_to_AU,
-        138.1189 * UNITCONV_eV_to_AU,
-        739.3268 * UNITCONV_eV_to_AU,
-        871.4098 * UNITCONV_eV_to_AU
-    );
+                /* ionization energy for oxygen in atomic units */
+                PMACC_CONST_VECTOR(
+                    float_X,
+                    8,
+                    Oxygen,
+                    13.61805 * UNITCONV_eV_to_AU,
+                    35.12112 * UNITCONV_eV_to_AU,
+                    54.93554 * UNITCONV_eV_to_AU,
+                    77.41350 * UNITCONV_eV_to_AU,
+                    113.8989 * UNITCONV_eV_to_AU,
+                    138.1189 * UNITCONV_eV_to_AU,
+                    739.3268 * UNITCONV_eV_to_AU,
+                    871.4098 * UNITCONV_eV_to_AU);
 
-    /* ionization energy for aluminium in atomic units */
-    PMACC_CONST_VECTOR(float_X, 13, Aluminium,
-        5.98577 * UNITCONV_eV_to_AU,
-        18.8285 * UNITCONV_eV_to_AU,
-        28.4476 * UNITCONV_eV_to_AU,
-        119.992 * UNITCONV_eV_to_AU,
-        153.825 * UNITCONV_eV_to_AU,
-        190.495 * UNITCONV_eV_to_AU,
-        241.769 * UNITCONV_eV_to_AU,
-        284.647 * UNITCONV_eV_to_AU,
-        330.214 * UNITCONV_eV_to_AU,
-        398.656 * UNITCONV_eV_to_AU,
-        442.006 * UNITCONV_eV_to_AU,
-        2085.97 * UNITCONV_eV_to_AU,
-        2304.14 * UNITCONV_eV_to_AU
-    );
+                /* ionization energy for aluminium in atomic units */
+                PMACC_CONST_VECTOR(
+                    float_X,
+                    13,
+                    Aluminium,
+                    5.98577 * UNITCONV_eV_to_AU,
+                    18.8285 * UNITCONV_eV_to_AU,
+                    28.4476 * UNITCONV_eV_to_AU,
+                    119.992 * UNITCONV_eV_to_AU,
+                    153.825 * UNITCONV_eV_to_AU,
+                    190.495 * UNITCONV_eV_to_AU,
+                    241.769 * UNITCONV_eV_to_AU,
+                    284.647 * UNITCONV_eV_to_AU,
+                    330.214 * UNITCONV_eV_to_AU,
+                    398.656 * UNITCONV_eV_to_AU,
+                    442.006 * UNITCONV_eV_to_AU,
+                    2085.97 * UNITCONV_eV_to_AU,
+                    2304.14 * UNITCONV_eV_to_AU);
 
-    /* ionization energy for silicon in atomic units */
-    PMACC_CONST_VECTOR(float_X, 14, Silicon,
-        8.151683 * UNITCONV_eV_to_AU,
-        16.345845 * UNITCONV_eV_to_AU,
-        33.493 * UNITCONV_eV_to_AU,
-        45.14179 * UNITCONV_eV_to_AU,
-        166.767 * UNITCONV_eV_to_AU,
-        205.267 * UNITCONV_eV_to_AU,
-        246.32 * UNITCONV_eV_to_AU,
-        303.66 * UNITCONV_eV_to_AU,
-        351.1 * UNITCONV_eV_to_AU,
-        401.38 * UNITCONV_eV_to_AU,
-        476.18 * UNITCONV_eV_to_AU,
-        523.415 * UNITCONV_eV_to_AU,
-        2437.65804 * UNITCONV_eV_to_AU,
-        2673.1774 * UNITCONV_eV_to_AU
-    );
+                /* ionization energy for silicon in atomic units */
+                PMACC_CONST_VECTOR(
+                    float_X,
+                    14,
+                    Silicon,
+                    8.151683 * UNITCONV_eV_to_AU,
+                    16.345845 * UNITCONV_eV_to_AU,
+                    33.493 * UNITCONV_eV_to_AU,
+                    45.14179 * UNITCONV_eV_to_AU,
+                    166.767 * UNITCONV_eV_to_AU,
+                    205.267 * UNITCONV_eV_to_AU,
+                    246.32 * UNITCONV_eV_to_AU,
+                    303.66 * UNITCONV_eV_to_AU,
+                    351.1 * UNITCONV_eV_to_AU,
+                    401.38 * UNITCONV_eV_to_AU,
+                    476.18 * UNITCONV_eV_to_AU,
+                    523.415 * UNITCONV_eV_to_AU,
+                    2437.65804 * UNITCONV_eV_to_AU,
+                    2673.1774 * UNITCONV_eV_to_AU);
 
-    /* ionization energy for copper in atomic units */
-    PMACC_CONST_VECTOR(float_X, 29, Copper,
-        7.72638 * UNITCONV_eV_to_AU,
-        20.2924 * UNITCONV_eV_to_AU,
-        36.8411 * UNITCONV_eV_to_AU,
-        57.385  * UNITCONV_eV_to_AU,
-        79.87   * UNITCONV_eV_to_AU,
-        103.010 * UNITCONV_eV_to_AU,
-        139.012 * UNITCONV_eV_to_AU,
-        166.021 * UNITCONV_eV_to_AU,
-        198.022 * UNITCONV_eV_to_AU,
-        232.25  * UNITCONV_eV_to_AU,
-        265.332 * UNITCONV_eV_to_AU,
-        367.09  * UNITCONV_eV_to_AU,
-        401.03  * UNITCONV_eV_to_AU,
-        436.06  * UNITCONV_eV_to_AU,
-        483.19  * UNITCONV_eV_to_AU,
-        518.712 * UNITCONV_eV_to_AU,
-        552.821 * UNITCONV_eV_to_AU,
-        632.56  * UNITCONV_eV_to_AU,
-        670.608 * UNITCONV_eV_to_AU,
-        1690.59 * UNITCONV_eV_to_AU,
-        1800.3  * UNITCONV_eV_to_AU,
-        1918.4  * UNITCONV_eV_to_AU,
-        2044.6  * UNITCONV_eV_to_AU,
-        2179.4  * UNITCONV_eV_to_AU,
-        2307.32 * UNITCONV_eV_to_AU,
-        2479.12 * UNITCONV_eV_to_AU,
-        2586.95 * UNITCONV_eV_to_AU,
-        11062.4 * UNITCONV_eV_to_AU,
-        11567.6 * UNITCONV_eV_to_AU
-    );
+                /* ionization energy for copper in atomic units */
+                PMACC_CONST_VECTOR(
+                    float_X,
+                    29,
+                    Copper,
+                    7.72638 * UNITCONV_eV_to_AU,
+                    20.2924 * UNITCONV_eV_to_AU,
+                    36.8411 * UNITCONV_eV_to_AU,
+                    57.385 * UNITCONV_eV_to_AU,
+                    79.87 * UNITCONV_eV_to_AU,
+                    103.010 * UNITCONV_eV_to_AU,
+                    139.012 * UNITCONV_eV_to_AU,
+                    166.021 * UNITCONV_eV_to_AU,
+                    198.022 * UNITCONV_eV_to_AU,
+                    232.25 * UNITCONV_eV_to_AU,
+                    265.332 * UNITCONV_eV_to_AU,
+                    367.09 * UNITCONV_eV_to_AU,
+                    401.03 * UNITCONV_eV_to_AU,
+                    436.06 * UNITCONV_eV_to_AU,
+                    483.19 * UNITCONV_eV_to_AU,
+                    518.712 * UNITCONV_eV_to_AU,
+                    552.821 * UNITCONV_eV_to_AU,
+                    632.56 * UNITCONV_eV_to_AU,
+                    670.608 * UNITCONV_eV_to_AU,
+                    1690.59 * UNITCONV_eV_to_AU,
+                    1800.3 * UNITCONV_eV_to_AU,
+                    1918.4 * UNITCONV_eV_to_AU,
+                    2044.6 * UNITCONV_eV_to_AU,
+                    2179.4 * UNITCONV_eV_to_AU,
+                    2307.32 * UNITCONV_eV_to_AU,
+                    2479.12 * UNITCONV_eV_to_AU,
+                    2586.95 * UNITCONV_eV_to_AU,
+                    11062.4 * UNITCONV_eV_to_AU,
+                    11567.6 * UNITCONV_eV_to_AU);
 
-    /* ionization energy for gold in atomic units */
-    PMACC_CONST_VECTOR(float_X, 79, Gold,
-        9.2256 * UNITCONV_eV_to_AU,
-        20.203 * UNITCONV_eV_to_AU,
-        30.016 * UNITCONV_eV_to_AU,
-        45.017 * UNITCONV_eV_to_AU,
-        60.019 * UNITCONV_eV_to_AU,
-        74.020 * UNITCONV_eV_to_AU,
-        94.020 * UNITCONV_eV_to_AU,
-        112.02 * UNITCONV_eV_to_AU,
-        130.12 * UNITCONV_eV_to_AU,
-        149.02 * UNITCONV_eV_to_AU,
-        168.21 * UNITCONV_eV_to_AU,
-        248.01 * UNITCONV_eV_to_AU,
-        275.14 * UNITCONV_eV_to_AU,
-        299.15 * UNITCONV_eV_to_AU,
-        324.16 * UNITCONV_eV_to_AU,
-        365.19 * UNITCONV_eV_to_AU,
-        392.20 * UNITCONV_eV_to_AU,
-        433.21 * UNITCONV_eV_to_AU,
-        487.25 * UNITCONV_eV_to_AU,
-        517.30 * UNITCONV_eV_to_AU,
-        546.30 * UNITCONV_eV_to_AU,
-        600.30 * UNITCONV_eV_to_AU,
-        650.40 * UNITCONV_eV_to_AU,
-        710.40 * UNITCONV_eV_to_AU,
-        760.40 * UNITCONV_eV_to_AU,
-        820.40 * UNITCONV_eV_to_AU,
-        870.40 * UNITCONV_eV_to_AU,
-        930.50 * UNITCONV_eV_to_AU,
-        990.50 * UNITCONV_eV_to_AU,
-        1040.5 * UNITCONV_eV_to_AU,
-        1100.5 * UNITCONV_eV_to_AU,
-        1150.6 * UNITCONV_eV_to_AU,
-        1210.6 * UNITCONV_eV_to_AU,
-        1475.5 * UNITCONV_eV_to_AU,
-        1527.5 * UNITCONV_eV_to_AU,
-        1584.5 * UNITCONV_eV_to_AU,
-        1644.5 * UNITCONV_eV_to_AU,
-        1702.4 * UNITCONV_eV_to_AU,
-        1758.4 * UNITCONV_eV_to_AU,
-        1845.4 * UNITCONV_eV_to_AU,
-        1904.4 * UNITCONV_eV_to_AU,
-        1967.4 * UNITCONV_eV_to_AU,
-        2026.4 * UNITCONV_eV_to_AU,
-        2261.4 * UNITCONV_eV_to_AU,
-        2320.4 * UNITCONV_eV_to_AU,
-        2383.4 * UNITCONV_eV_to_AU,
-        2443.4 * UNITCONV_eV_to_AU,
-        2640.4 * UNITCONV_eV_to_AU,
-        2708.4 * UNITCONV_eV_to_AU,
-        2870.4 * UNITCONV_eV_to_AU,
-        2941.0 * UNITCONV_eV_to_AU,
-        4888.4 * UNITCONV_eV_to_AU,
-        5013.4 * UNITCONV_eV_to_AU,
-        5156.5 * UNITCONV_eV_to_AU,
-        5307.5 * UNITCONV_eV_to_AU,
-        5452.5 * UNITCONV_eV_to_AU,
-        5594.5 * UNITCONV_eV_to_AU,
-        5846.6 * UNITCONV_eV_to_AU,
-        5994.6 * UNITCONV_eV_to_AU,
-        6156.7 * UNITCONV_eV_to_AU,
-        6305.1 * UNITCONV_eV_to_AU,
-        6724.1 * UNITCONV_eV_to_AU,
-        6854.1 * UNITCONV_eV_to_AU,
-        6997.2 * UNITCONV_eV_to_AU,
-        7130.2 * UNITCONV_eV_to_AU,
-        7756.3 * UNITCONV_eV_to_AU,
-        7910.4 * UNITCONV_eV_to_AU,
-        8210.4 * UNITCONV_eV_to_AU,
-        8360.5 * UNITCONV_eV_to_AU,
-        18040. * UNITCONV_eV_to_AU,
-        18401. * UNITCONV_eV_to_AU,
-        18791. * UNITCONV_eV_to_AU,
-        19151. * UNITCONV_eV_to_AU,
-        21471. * UNITCONV_eV_to_AU,
-        21921. * UNITCONV_eV_to_AU,
-        22500. * UNITCONV_eV_to_AU,
-        22868. * UNITCONV_eV_to_AU,
-        91516. * UNITCONV_eV_to_AU,
-        93254. * UNITCONV_eV_to_AU
-    );
+                /* ionization energy for gold in atomic units */
+                PMACC_CONST_VECTOR(
+                    float_X,
+                    79,
+                    Gold,
+                    9.2256 * UNITCONV_eV_to_AU,
+                    20.203 * UNITCONV_eV_to_AU,
+                    30.016 * UNITCONV_eV_to_AU,
+                    45.017 * UNITCONV_eV_to_AU,
+                    60.019 * UNITCONV_eV_to_AU,
+                    74.020 * UNITCONV_eV_to_AU,
+                    94.020 * UNITCONV_eV_to_AU,
+                    112.02 * UNITCONV_eV_to_AU,
+                    130.12 * UNITCONV_eV_to_AU,
+                    149.02 * UNITCONV_eV_to_AU,
+                    168.21 * UNITCONV_eV_to_AU,
+                    248.01 * UNITCONV_eV_to_AU,
+                    275.14 * UNITCONV_eV_to_AU,
+                    299.15 * UNITCONV_eV_to_AU,
+                    324.16 * UNITCONV_eV_to_AU,
+                    365.19 * UNITCONV_eV_to_AU,
+                    392.20 * UNITCONV_eV_to_AU,
+                    433.21 * UNITCONV_eV_to_AU,
+                    487.25 * UNITCONV_eV_to_AU,
+                    517.30 * UNITCONV_eV_to_AU,
+                    546.30 * UNITCONV_eV_to_AU,
+                    600.30 * UNITCONV_eV_to_AU,
+                    650.40 * UNITCONV_eV_to_AU,
+                    710.40 * UNITCONV_eV_to_AU,
+                    760.40 * UNITCONV_eV_to_AU,
+                    820.40 * UNITCONV_eV_to_AU,
+                    870.40 * UNITCONV_eV_to_AU,
+                    930.50 * UNITCONV_eV_to_AU,
+                    990.50 * UNITCONV_eV_to_AU,
+                    1040.5 * UNITCONV_eV_to_AU,
+                    1100.5 * UNITCONV_eV_to_AU,
+                    1150.6 * UNITCONV_eV_to_AU,
+                    1210.6 * UNITCONV_eV_to_AU,
+                    1475.5 * UNITCONV_eV_to_AU,
+                    1527.5 * UNITCONV_eV_to_AU,
+                    1584.5 * UNITCONV_eV_to_AU,
+                    1644.5 * UNITCONV_eV_to_AU,
+                    1702.4 * UNITCONV_eV_to_AU,
+                    1758.4 * UNITCONV_eV_to_AU,
+                    1845.4 * UNITCONV_eV_to_AU,
+                    1904.4 * UNITCONV_eV_to_AU,
+                    1967.4 * UNITCONV_eV_to_AU,
+                    2026.4 * UNITCONV_eV_to_AU,
+                    2261.4 * UNITCONV_eV_to_AU,
+                    2320.4 * UNITCONV_eV_to_AU,
+                    2383.4 * UNITCONV_eV_to_AU,
+                    2443.4 * UNITCONV_eV_to_AU,
+                    2640.4 * UNITCONV_eV_to_AU,
+                    2708.4 * UNITCONV_eV_to_AU,
+                    2870.4 * UNITCONV_eV_to_AU,
+                    2941.0 * UNITCONV_eV_to_AU,
+                    4888.4 * UNITCONV_eV_to_AU,
+                    5013.4 * UNITCONV_eV_to_AU,
+                    5156.5 * UNITCONV_eV_to_AU,
+                    5307.5 * UNITCONV_eV_to_AU,
+                    5452.5 * UNITCONV_eV_to_AU,
+                    5594.5 * UNITCONV_eV_to_AU,
+                    5846.6 * UNITCONV_eV_to_AU,
+                    5994.6 * UNITCONV_eV_to_AU,
+                    6156.7 * UNITCONV_eV_to_AU,
+                    6305.1 * UNITCONV_eV_to_AU,
+                    6724.1 * UNITCONV_eV_to_AU,
+                    6854.1 * UNITCONV_eV_to_AU,
+                    6997.2 * UNITCONV_eV_to_AU,
+                    7130.2 * UNITCONV_eV_to_AU,
+                    7756.3 * UNITCONV_eV_to_AU,
+                    7910.4 * UNITCONV_eV_to_AU,
+                    8210.4 * UNITCONV_eV_to_AU,
+                    8360.5 * UNITCONV_eV_to_AU,
+                    18040. * UNITCONV_eV_to_AU,
+                    18401. * UNITCONV_eV_to_AU,
+                    18791. * UNITCONV_eV_to_AU,
+                    19151. * UNITCONV_eV_to_AU,
+                    21471. * UNITCONV_eV_to_AU,
+                    21921. * UNITCONV_eV_to_AU,
+                    22500. * UNITCONV_eV_to_AU,
+                    22868. * UNITCONV_eV_to_AU,
+                    91516. * UNITCONV_eV_to_AU,
+                    93254. * UNITCONV_eV_to_AU);
 
-} // namespace AU
-} // namespace energies
-} // namespace ionization
+            } // namespace AU
+        } // namespace energies
+    } // namespace ionization
 } // namespace picongpu
diff --git a/include/picongpu/param/ionizer.param b/include/picongpu/param/ionizer.param
index 7ecc3189a6..900854127a 100644
--- a/include/picongpu/param/ionizer.param
+++ b/include/picongpu/param/ionizer.param
@@ -1,4 +1,4 @@
-/* Copyright 2014-2020 Marco Garten, Axel Huebl
+/* Copyright 2014-2021 Marco Garten, Axel Huebl
  *
  * This file is part of PIConGPU.
  *
@@ -23,7 +23,6 @@
  * of the periodic table. The elements here should have a matching list of
  * ionization energies in @see ionizationEnergies.param. Moreover this file
  * contains a description of how to configure an ionization model for a species.
- * Currently each species can only be assigned exactly one ionization model.
  *
  * Furthermore there are parameters for specific ionization models to be found
  * here. That includes lists of screened nuclear charges as seen by bound
@@ -40,474 +39,495 @@
 
 namespace picongpu
 {
-/** Ionization Model Configuration
- *
- * - None : no particle is ionized
- * - BSI : simple barrier suppression ionization
- * - BSIEffectiveZ : BSI taking electron shielding into account via an effective
- *                   atomic number Z_eff
- * - ADKLinPol : Ammosov-Delone-Krainov tunneling ionization (H-like)
- *               -> linearly polarized lasers
- * - ADKCircPol : Ammosov-Delone-Krainov tunneling ionization (H-like)
- *                -> circularly polarized lasers
- * - Keldysh : Keldysh ionization model
- * - ThomasFermi : statistical impact ionization based on Thomas-Fermi
- *                 atomic model
- *                 Attention: requires 2 FieldTmp slots @see memory.param
- *
- * Research and development:
- * - BSIStarkShifted : BSI for hydrogen-like atoms and ions considering the
- *                     Stark upshift of ionization potentials
- *
- * Usage: Add flags to the list of particle flags that has the following structure
- *
- *        ionizers< MakeSeq_t< particles::ionization::IonizationModel< Species2BCreated > > >,
- *        atomicNumbers< ionization::atomicNumbers::Element_t >,
- *        effectiveNuclearCharge< ionization::effectiveNuclearCharge::Element_t >,
- *        ionizationEnergies< ionization::energies::AU::Element_t >
- */
-namespace ionization
-{
-/*! Specify (chemical) element
- *
- * Proton and neutron numbers define the chemical element that the ion species
- * is based on. This value can be non-integer for physical models taking
- * charge shielding effects into account.
- * @see http://en.wikipedia.org/wiki/Effective_nuclear_charge
- *
- * It is wrapped into a struct because of C++ restricting floats from being
- * template arguments.
- *
- * Do not forget to set the correct mass and charge via
- * `massRatio<>` and `chargeRatio<>`!
- */
-namespace atomicNumbers
-{
-    /** H-1 99.98% NA */
-    struct Hydrogen_t
-    {
-        static constexpr float_X numberOfProtons  = 1.0;
-        static constexpr float_X numberOfNeutrons = 0.0;
-    };
-
-    /** H-2 0.02% NA */
-    struct Deuterium_t
-    {
-        static constexpr float_X numberOfProtons  = 1.0;
-        static constexpr float_X numberOfNeutrons = 1.0;
-    };
-
-    /** He-4 ~100% NA */
-    struct Helium_t
-    {
-        static constexpr float_X numberOfProtons  = 2.0;
-        static constexpr float_X numberOfNeutrons = 2.0;
-    };
-
-    /** C-12 98.9% NA */
-    struct Carbon_t
-    {
-        static constexpr float_X numberOfProtons  = 6.0;
-        static constexpr float_X numberOfNeutrons = 6.0;
-    };
-
-    /** N-14 99.6% NA */
-    struct Nitrogen_t
-    {
-        static constexpr float_X numberOfProtons  = 7.0;
-        static constexpr float_X numberOfNeutrons = 7.0;
-    };
-
-    /** O-16 99.76% NA */
-    struct Oxygen_t
-    {
-        static constexpr float_X numberOfProtons  = 8.0;
-        static constexpr float_X numberOfNeutrons = 8.0;
-    };
-
-    /** Al-27 ~100% NA */
-    struct Aluminium_t
-    {
-        static constexpr float_X numberOfProtons  = 13.0;
-        static constexpr float_X numberOfNeutrons = 14.0;
-    };
-
-    /** Si-28 ~92.23% NA */
-    struct Silicon_t
-    {
-        static constexpr float_X numberOfProtons  = 14.0;
-        static constexpr float_X numberOfNeutrons = 14.0;
-    };
-
-    /** Cu-63 69.15% NA */
-    struct Copper_t
-    {
-        static constexpr float_X numberOfProtons  = 29.0;
-        static constexpr float_X numberOfNeutrons = 34.0;
-    };
-
-    /** Au-197 ~100% NA */
-    struct Gold_t
-    {
-        static constexpr float_X numberOfProtons  = 79.0;
-        static constexpr float_X numberOfNeutrons = 118.0;
-    };
-} // namespace atomicNumbers
-
-/** Effective Nuclear Charge
- *
- * Due to the shielding effect of inner electron shells in an atom / ion
- * which makes the core charge seem smaller to valence electrons
- * new, effective, atomic core charge numbers can be defined to make the
- * crude barrier suppression ionization (BSI) model less inaccurate.
- *
- * @see https://en.wikipedia.org/wiki/Effective_nuclear_charge
- * or refer directly to the calculations by Slater or Clementi and Raimondi
- *
- * References:
- *   Clementi, E.; Raimondi, D. L. (1963)
- *     "Atomic Screening Constants from SCF Functions"
- *     J. Chem. Phys. 38 (11): 2686–2689. doi:10.1063/1.1733573
- *   Clementi, E.; Raimondi, D. L.; Reinhardt, W. P. (1967)
- *     "Atomic Screening Constants from SCF Functions. II. Atoms with 37 to 86 Electrons"
- *     Journal of Chemical Physics. 47: 1300–1307. doi:10.1063/1.1712084
- *
- * IMPORTANT NOTE:
- * You have to insert the values in REVERSE order since the lowest shell
- * corresponds to the last ionization process!
- */
-namespace effectiveNuclearCharge
-{
-    /* For hydrogen Z_eff is obviously equal to Z */
-    PMACC_CONST_VECTOR(float_X, 1, Hydrogen,
-        /* 1s^1 */
-        1.
-    );
-
-    /* Example: deuterium */
-    PMACC_CONST_VECTOR(float_X, 1, Deuterium,
-        /* 1s^1 */
-        1.
-    );
-
-    /* Example: helium */
-    PMACC_CONST_VECTOR(float_X, 2, Helium,
-        /* 1s^2 */
-        1.688,
-        1.688
-    );
-
-    /* Example: carbon */
-    PMACC_CONST_VECTOR(float_X, 6, Carbon,
-        /* 2p^2 */
-        3.136,
-        3.136,
-        /* 2s^2 */
-        3.217,
-        3.217,
-        /* 1s^2 */
-        5.673,
-        5.673
-    );
-
-    /* Example: nitrogen */
-    PMACC_CONST_VECTOR(float_X, 7, Nitrogen,
-        /* 2p^3 */
-        3.834,
-        3.834,
-        3.834,
-        /* 2s^2 */
-        3.874,
-        3.874,
-        /* 1s^2 */
-        6.665,
-        6.665
-    );
-
-    /* Example: oxygen */
-    PMACC_CONST_VECTOR(float_X, 8, Oxygen,
-        /* 2p^4 */
-        4.453,
-        4.453,
-        4.453,
-        4.453,
-        /* 2s^2 */
-        4.492,
-        4.492,
-        /* 1s^2 */
-        7.658,
-        7.658
-    );
-
-    /* Example: aluminium */
-    PMACC_CONST_VECTOR(float_X, 13, Aluminium,
-        /* 3p^1 */
-        4.066,
-        /* 3s^2 */
-        4.117,
-        4.117,
-        /* 2p^6 */
-        8.963,
-        8.963,
-        8.963,
-        8.963,
-        8.963,
-        8.963,
-        /* 2s^2 */
-        8.214,
-        8.214,
-        /* 1s^2 */
-        12.591,
-        12.591
-    );
-
-    /* Example: silicon */
-    PMACC_CONST_VECTOR(float_X, 14, Silicon,
-        /* 3p^2 */
-        4.285,
-        4.285,
-        /* 3s^2 */
-        4.903,
-        4.903,
-        /* 2p^6 */
-        9.945,
-        9.945,
-        9.945,
-        9.945,
-        9.945,
-        9.945,
-        /* 2s^2 */
-        9.020,
-        9.020,
-        /* 1s^2 */
-        13.575,
-        13.575
-    );
-
-
-    /* Example: copper
-     * Note: Copper is one of the few exceptions to the Madelung energy ordering
-     *       rule! Other exceptions: Au, Ag, Pd, Cr, Mo
-     *       predicted configuration: [Ar] 4s^2 3d^9
-     *       actual configuration:    [Ar] 4s^1 3d^10
-     */
-    PMACC_CONST_VECTOR(float_X, 29, Copper,
-        /* 3d^10 */
-        13.201,
-        13.201,
-        13.201,
-        13.201,
-        13.201,
-        13.201,
-        13.201,
-        13.201,
-        13.201,
-        13.201,
-        /* 4s^1 */
-        5.842,
-        /* 3p^6 */
-        14.731,
-        14.731,
-        14.731,
-        14.731,
-        14.731,
-        14.731,
-        /* 3s^2 */
-        15.594,
-        15.594,
-        /* 2p^6 */
-        25.097,
-        25.097,
-        25.097,
-        25.097,
-        25.097,
-        25.097,
-        /* 2s^2 */
-        21.020,
-        21.020,
-        /* 1s^2 */
-        28.339,
-        28.339
-    );
-
-    /* Example: gold
-     * Note: Gold is one of the few exceptions to the Madelung energy ordering
-     *       rule! Other exceptions: Cu, Ag, Pd, Cr, Mo
-     *       predicted configuration: [Xe] 6s^2 4f^14 5d^9
-     *       actual configuration:    [Xe] 6s^1 4f^14 5d^10
-     */
-    PMACC_CONST_VECTOR(float_X, 79, Gold,
-        /* 5d^10 */
-        20.126,
-        20.126,
-        20.126,
-        20.126,
-        20.126,
-        20.126,
-        20.126,
-        20.126,
-        20.126,
-        20.126,
-        /* 4f^14 */
-        40.650,
-        40.650,
-        40.650,
-        40.650,
-        40.650,
-        40.650,
-        40.650,
-        40.650,
-        40.650,
-        40.650,
-        40.650,
-        40.650,
-        40.650,
-        40.650,
-        /* 6s^1 */
-        10.938,
-        /* 5p^6 */
-        25.170,
-        25.170,
-        25.170,
-        25.170,
-        25.170,
-        25.170,
-        /* 4d^10 */
-        41.528,
-        41.528,
-        41.528,
-        41.528,
-        41.528,
-        41.528,
-        41.528,
-        41.528,
-        41.528,
-        41.528,
-        /* 5s^2 */
-        27.327,
-        27.327,
-        /* 4p^6 */
-        43.547,
-        43.547,
-        43.547,
-        43.547,
-        43.547,
-        43.547,
-        /* 3d^10 */
-        65.508,
-        65.508,
-        65.508,
-        65.508,
-        65.508,
-        65.508,
-        65.508,
-        65.508,
-        65.508,
-        65.508,
-        /* 4s^2 */
-        44.413,
-        44.413,
-        /* 3p^6 */
-        56.703,
-        56.703,
-        56.703,
-        56.703,
-        56.703,
-        56.703,
-        /* 3s^2 */
-        55.763,
-        55.763,
-        /* 2p^6 */
-        74.513,
-        74.513,
-        74.513,
-        74.513,
-        74.513,
-        74.513,
-        /* 2s^2 */
-        58.370,
-        58.370,
-        /* 1s^2 */
-        77.476,
-        77.476
-    );
-} // namespace effectiveNuclearCharge
-} // namespace ionization
-
-namespace particles
-{
-namespace ionization
-{
-namespace thomasFermi
-{
-
-    /** Fitting parameters to average ionization degree Z* = 4/3*pi*R_0^3 * n(R_0)
-     * as an extension towards arbitrary atoms and temperatures
-     *
-     * See table IV of
-     * \url http://www.sciencedirect.com/science/article/pii/S0065219908601451
-     * doi:10.1016/S0065-2199(08)60145-1
-     */
-    constexpr float_X TFAlpha = 14.3139;
-    constexpr float_X TFBeta  =  0.6624;
-
-    constexpr float_X TFA1    =  3.323e-3;
-    constexpr float_X TFA2    =  9.718e-1;
-    constexpr float_X TFA3    =  9.26148e-5;
-    constexpr float_X TFA4    =  3.10165;
-
-    constexpr float_X TFB0    = -1.7630;
-    constexpr float_X TFB1    =  1.43175;
-    constexpr float_X TFB2    =  0.31546;
-
-    constexpr float_X TFC1    = -0.366667;
-    constexpr float_X TFC2    =  0.983333;
-
-    /** cutoff energy for electron "temperature" calculation
-     *
-     * In laser produced plasmas we can have different, well-separable groups
-     * of electrons. For the Thomas-Fermi ionization model we only want the
-     * thermalized "bulk" electrons. Including the high-energy "prompt"
-     * electrons is physically questionable since they do not have a large
-     * cross section for collisional ionization.
-     *
-     * unit: keV
-     */
-    constexpr float_X CUTOFF_MAX_ENERGY_KEV = 50.0;
-    /** cutoff energy for electron "temperature" calculation in SI units*/
-    constexpr float_X CUTOFF_MAX_ENERGY = CUTOFF_MAX_ENERGY_KEV * UNITCONV_keV_to_Joule;
-
-    /** lower ion density cutoff
+    /** Ionization Model Configuration
      *
-     * The Thomas-Fermi model yields unphysical artifacts for low ion densities.
-     * Low ion densities imply lower collision frequency and thus less collisional ionization.
-     * The Thomas-Fermi model yields an increasing charge state for decreasing densities and electron temperatures of 10eV and above.
-     * This cutoff will be used to set the lower application threshold for charge state calculation.
+     * - None : no particle is ionized
+     * - BSI : simple barrier suppression ionization
+     * - BSIEffectiveZ : BSI taking electron shielding into account via an effective
+     *                   atomic number Z_eff
+     * - ADKLinPol : Ammosov-Delone-Krainov tunneling ionization (H-like)
+     *               -> linearly polarized lasers
+     * - ADKCircPol : Ammosov-Delone-Krainov tunneling ionization (H-like)
+     *                -> circularly polarized lasers
+     * - Keldysh : Keldysh ionization model
+     * - ThomasFermi : statistical impact ionization based on Thomas-Fermi
+     *                 atomic model
+     *                 Attention: requires 2 FieldTmp slots @see memory.param
      *
-     * @note This cutoff value should be set in accordance to FLYCHK calculations,
-     *       for instance! It is not a universal value and requires some preliminary
-     *       approximations!
+     * Research and development:
+     * - BSIStarkShifted : BSI for hydrogen-like atoms and ions considering the
+     *                     Stark upshift of ionization potentials
      *
-     * unit: 1 / m^3
+     * Usage: Add flags to the list of particle flags that has the following structure
      *
-     * example: 1.7422e27 as a hydrogen ion number density equal to the corresponding critical electron number density for an 800nm laser
-     *
-     * The choice of the default is motivated by by the following:
-     * In laser-driven plasmas all dynamics in density regions below the
-     * critical electron density will be laser-dominated. Once ions of that density
-     * are ionized once the laser will not penetrate fully anymore and the as electrons are heated
-     * the dynamics will be collision-dominated.
-     */
-    constexpr float_X CUTOFF_LOW_DENSITY = 1.7422e27;
-
-    /** lower electron temperature cutoff
-     *
-     * The Thomas-Fermi model predicts initial ionization for many materials of
-     * solid density even when the electron temperature is 0.
+     *        ionizers< MakeSeq_t< particles::ionization::IonizationModel< Species2BCreated > > >,
+     *        atomicNumbers< ionization::atomicNumbers::Element_t >,
+     *        effectiveNuclearCharge< ionization::effectiveNuclearCharge::Element_t >,
+     *        ionizationEnergies< ionization::energies::AU::Element_t >
      */
-    constexpr float_X CUTOFF_LOW_TEMPERATURE_EV = 1.0;
-
-} // namespace thomasFermi
-} // namespace ionization
-} // namespace particles
+    namespace ionization
+    {
+        /*! Specify (chemical) element
+         *
+         * Proton and neutron numbers define the chemical element that the ion species
+         * is based on. This value can be non-integer for physical models taking
+         * charge shielding effects into account.
+         * @see http://en.wikipedia.org/wiki/Effective_nuclear_charge
+         *
+         * It is wrapped into a struct because of C++ restricting floats from being
+         * template arguments.
+         *
+         * Do not forget to set the correct mass and charge via
+         * `massRatio<>` and `chargeRatio<>`!
+         */
+        namespace atomicNumbers
+        {
+            /** H-1 99.98% NA */
+            struct Hydrogen_t
+            {
+                static constexpr float_X numberOfProtons = 1.0;
+                static constexpr float_X numberOfNeutrons = 0.0;
+            };
+
+            /** H-2 0.02% NA */
+            struct Deuterium_t
+            {
+                static constexpr float_X numberOfProtons = 1.0;
+                static constexpr float_X numberOfNeutrons = 1.0;
+            };
+
+            /** He-4 ~100% NA */
+            struct Helium_t
+            {
+                static constexpr float_X numberOfProtons = 2.0;
+                static constexpr float_X numberOfNeutrons = 2.0;
+            };
+
+            /** C-12 98.9% NA */
+            struct Carbon_t
+            {
+                static constexpr float_X numberOfProtons = 6.0;
+                static constexpr float_X numberOfNeutrons = 6.0;
+            };
+
+            /** N-14 99.6% NA */
+            struct Nitrogen_t
+            {
+                static constexpr float_X numberOfProtons = 7.0;
+                static constexpr float_X numberOfNeutrons = 7.0;
+            };
+
+            /** O-16 99.76% NA */
+            struct Oxygen_t
+            {
+                static constexpr float_X numberOfProtons = 8.0;
+                static constexpr float_X numberOfNeutrons = 8.0;
+            };
+
+            /** Al-27 ~100% NA */
+            struct Aluminium_t
+            {
+                static constexpr float_X numberOfProtons = 13.0;
+                static constexpr float_X numberOfNeutrons = 14.0;
+            };
+
+            /** Si-28 ~92.23% NA */
+            struct Silicon_t
+            {
+                static constexpr float_X numberOfProtons = 14.0;
+                static constexpr float_X numberOfNeutrons = 14.0;
+            };
+
+            /** Cu-63 69.15% NA */
+            struct Copper_t
+            {
+                static constexpr float_X numberOfProtons = 29.0;
+                static constexpr float_X numberOfNeutrons = 34.0;
+            };
+
+            /** Au-197 ~100% NA */
+            struct Gold_t
+            {
+                static constexpr float_X numberOfProtons = 79.0;
+                static constexpr float_X numberOfNeutrons = 118.0;
+            };
+        } // namespace atomicNumbers
+
+        /** Effective Nuclear Charge
+         *
+         * Due to the shielding effect of inner electron shells in an atom / ion,
+         * which makes the core charge seem smaller to valence electrons,
+         * new, effective, atomic core charge numbers can be defined to make the
+         * crude barrier suppression ionization (BSI) model less inaccurate.
+         *
+         * @see https://en.wikipedia.org/wiki/Effective_nuclear_charge
+         * or refer directly to the calculations by Slater or Clementi and Raimondi
+         *
+         * References:
+         *   Clementi, E.; Raimondi, D. L. (1963)
+         *     "Atomic Screening Constants from SCF Functions"
+         *     J. Chem. Phys. 38 (11): 2686–2689. doi:10.1063/1.1733573
+         *   Clementi, E.; Raimondi, D. L.; Reinhardt, W. P. (1967)
+         *     "Atomic Screening Constants from SCF Functions. II. Atoms with 37 to 86 Electrons"
+         *     Journal of Chemical Physics. 47: 1300–1307. doi:10.1063/1.1712084
+         *
+         * IMPORTANT NOTE:
+         * You have to insert the values in REVERSE order since the lowest shell
+         * corresponds to the last ionization process!
+         */
+        namespace effectiveNuclearCharge
+        {
+            /* For hydrogen Z_eff is obviously equal to Z */
+            PMACC_CONST_VECTOR(
+                float_X,
+                1,
+                Hydrogen,
+                /* 1s^1 */
+                1.);
+
+            /* Example: deuterium */
+            PMACC_CONST_VECTOR(
+                float_X,
+                1,
+                Deuterium,
+                /* 1s^1 */
+                1.);
+
+            /* Example: helium */
+            PMACC_CONST_VECTOR(
+                float_X,
+                2,
+                Helium,
+                /* 1s^2 */
+                1.688,
+                1.688);
+
+            /* Example: carbon */
+            PMACC_CONST_VECTOR(
+                float_X,
+                6,
+                Carbon,
+                /* 2p^2 */
+                3.136,
+                3.136,
+                /* 2s^2 */
+                3.217,
+                3.217,
+                /* 1s^2 */
+                5.673,
+                5.673);
+
+            /* Example: nitrogen */
+            PMACC_CONST_VECTOR(
+                float_X,
+                7,
+                Nitrogen,
+                /* 2p^3 */
+                3.834,
+                3.834,
+                3.834,
+                /* 2s^2 */
+                3.874,
+                3.874,
+                /* 1s^2 */
+                6.665,
+                6.665);
+
+            /* Example: oxygen */
+            PMACC_CONST_VECTOR(
+                float_X,
+                8,
+                Oxygen,
+                /* 2p^4 */
+                4.453,
+                4.453,
+                4.453,
+                4.453,
+                /* 2s^2 */
+                4.492,
+                4.492,
+                /* 1s^2 */
+                7.658,
+                7.658);
+
+            /* Example: aluminium */
+            PMACC_CONST_VECTOR(
+                float_X,
+                13,
+                Aluminium,
+                /* 3p^1 */
+                4.066,
+                /* 3s^2 */
+                4.117,
+                4.117,
+                /* 2p^6 */
+                8.963,
+                8.963,
+                8.963,
+                8.963,
+                8.963,
+                8.963,
+                /* 2s^2 */
+                8.214,
+                8.214,
+                /* 1s^2 */
+                12.591,
+                12.591);
+
+            /* Example: silicon */
+            PMACC_CONST_VECTOR(
+                float_X,
+                14,
+                Silicon,
+                /* 3p^2 */
+                4.285,
+                4.285,
+                /* 3s^2 */
+                4.903,
+                4.903,
+                /* 2p^6 */
+                9.945,
+                9.945,
+                9.945,
+                9.945,
+                9.945,
+                9.945,
+                /* 2s^2 */
+                9.020,
+                9.020,
+                /* 1s^2 */
+                13.575,
+                13.575);
+
+
+            /* Example: copper
+             * Note: Copper is one of the few exceptions to the Madelung energy ordering
+             *       rule! Other exceptions: Au, Ag, Pd, Cr, Mo
+             *       predicted configuration: [Ar] 4s^2 3d^9
+             *       actual configuration:    [Ar] 4s^1 3d^10
+             */
+            PMACC_CONST_VECTOR(
+                float_X,
+                29,
+                Copper,
+                /* 3d^10 */
+                13.201,
+                13.201,
+                13.201,
+                13.201,
+                13.201,
+                13.201,
+                13.201,
+                13.201,
+                13.201,
+                13.201,
+                /* 4s^1 */
+                5.842,
+                /* 3p^6 */
+                14.731,
+                14.731,
+                14.731,
+                14.731,
+                14.731,
+                14.731,
+                /* 3s^2 */
+                15.594,
+                15.594,
+                /* 2p^6 */
+                25.097,
+                25.097,
+                25.097,
+                25.097,
+                25.097,
+                25.097,
+                /* 2s^2 */
+                21.020,
+                21.020,
+                /* 1s^2 */
+                28.339,
+                28.339);
+
+            /* Example: gold
+             * Note: Gold is one of the few exceptions to the Madelung energy ordering
+             *       rule! Other exceptions: Cu, Ag, Pd, Cr, Mo
+             *       predicted configuration: [Xe] 6s^2 4f^14 5d^9
+             *       actual configuration:    [Xe] 6s^1 4f^14 5d^10
+             */
+            PMACC_CONST_VECTOR(
+                float_X,
+                79,
+                Gold,
+                /* 5d^10 */
+                20.126,
+                20.126,
+                20.126,
+                20.126,
+                20.126,
+                20.126,
+                20.126,
+                20.126,
+                20.126,
+                20.126,
+                /* 4f^14 */
+                40.650,
+                40.650,
+                40.650,
+                40.650,
+                40.650,
+                40.650,
+                40.650,
+                40.650,
+                40.650,
+                40.650,
+                40.650,
+                40.650,
+                40.650,
+                40.650,
+                /* 6s^1 */
+                10.938,
+                /* 5p^6 */
+                25.170,
+                25.170,
+                25.170,
+                25.170,
+                25.170,
+                25.170,
+                /* 4d^10 */
+                41.528,
+                41.528,
+                41.528,
+                41.528,
+                41.528,
+                41.528,
+                41.528,
+                41.528,
+                41.528,
+                41.528,
+                /* 5s^2 */
+                27.327,
+                27.327,
+                /* 4p^6 */
+                43.547,
+                43.547,
+                43.547,
+                43.547,
+                43.547,
+                43.547,
+                /* 3d^10 */
+                65.508,
+                65.508,
+                65.508,
+                65.508,
+                65.508,
+                65.508,
+                65.508,
+                65.508,
+                65.508,
+                65.508,
+                /* 4s^2 */
+                44.413,
+                44.413,
+                /* 3p^6 */
+                56.703,
+                56.703,
+                56.703,
+                56.703,
+                56.703,
+                56.703,
+                /* 3s^2 */
+                55.763,
+                55.763,
+                /* 2p^6 */
+                74.513,
+                74.513,
+                74.513,
+                74.513,
+                74.513,
+                74.513,
+                /* 2s^2 */
+                58.370,
+                58.370,
+                /* 1s^2 */
+                77.476,
+                77.476);
+        } // namespace effectiveNuclearCharge
+    } // namespace ionization
+
+    namespace particles
+    {
+        namespace ionization
+        {
+            namespace thomasFermi
+            {
+                /** Fitting parameters to average ionization degree Z* = 4/3*pi*R_0^3 * n(R_0)
+                 * as an extension towards arbitrary atoms and temperatures
+                 *
+                 * See table IV of
+                 * \url http://www.sciencedirect.com/science/article/pii/S0065219908601451
+                 * doi:10.1016/S0065-2199(08)60145-1
+                 */
+                constexpr float_X TFAlpha = 14.3139;
+                constexpr float_X TFBeta = 0.6624;
+
+                constexpr float_X TFA1 = 3.323e-3;
+                constexpr float_X TFA2 = 9.718e-1;
+                constexpr float_X TFA3 = 9.26148e-5;
+                constexpr float_X TFA4 = 3.10165;
+
+                constexpr float_X TFB0 = -1.7630;
+                constexpr float_X TFB1 = 1.43175;
+                constexpr float_X TFB2 = 0.31546;
+
+                constexpr float_X TFC1 = -0.366667;
+                constexpr float_X TFC2 = 0.983333;
+
+                /** cutoff energy for electron "temperature" calculation
+                 *
+                 * In laser produced plasmas we can have different, well-separable groups
+                 * of electrons. For the Thomas-Fermi ionization model we only want the
+                 * thermalized "bulk" electrons. Including the high-energy "prompt"
+                 * electrons is physically questionable since they do not have a large
+                 * cross section for collisional ionization.
+                 *
+                 * unit: keV
+                 */
+                constexpr float_X CUTOFF_MAX_ENERGY_KEV = 50.0;
+                /** cutoff energy for electron "temperature" calculation in SI units (Joule) */
+                constexpr float_X CUTOFF_MAX_ENERGY = CUTOFF_MAX_ENERGY_KEV * UNITCONV_keV_to_Joule;
+
+                /** lower ion density cutoff
+                 *
+                 * The Thomas-Fermi model yields unphysical artifacts for low ion densities.
+                 * Low ion densities imply lower collision frequency and thus less collisional ionization.
+                 * The Thomas-Fermi model yields an increasing charge state for decreasing densities and electron
+                 * temperatures of 10eV and above. This cutoff will be used to set the lower application threshold for
+                 * charge state calculation.
+                 *
+                 * @note This cutoff value should be set in accordance with FLYCHK calculations,
+                 *       for instance! It is not a universal value and requires some preliminary
+                 *       approximations!
+                 *
+                 * unit: 1 / m^3
+                 *
+                 * example: 1.7422e27 as a hydrogen ion number density equal to the corresponding critical electron
+                 * number density for an 800nm laser
+                 *
+                 * The choice of the default is motivated by the following:
+                 * In laser-driven plasmas all dynamics in density regions below the
+                 * critical electron density will be laser-dominated. Once ions of that density
+                 * are ionized once, the laser will not penetrate fully anymore and, as the electrons are heated,
+                 * the dynamics will be collision-dominated.
+                 */
+                constexpr float_X CUTOFF_LOW_DENSITY = 1.7422e27;
+
+                /** lower electron temperature cutoff
+                 *
+                 * The Thomas-Fermi model predicts initial ionization for many materials of
+                 * solid density even when the electron temperature is 0.
+                 */
+                constexpr float_X CUTOFF_LOW_TEMPERATURE_EV = 1.0;
+
+            } // namespace thomasFermi
+        } // namespace ionization
+    } // namespace particles
 } // namespace picongpu
diff --git a/include/picongpu/param/isaac.param b/include/picongpu/param/isaac.param
index 4d02df141d..8e86287915 100644
--- a/include/picongpu/param/isaac.param
+++ b/include/picongpu/param/isaac.param
@@ -1,4 +1,4 @@
-/* Copyright 2016-2020 Alexander Matthes
+/* Copyright 2016-2021 Alexander Matthes
  *
  * This file is part of PIConGPU.
  *
@@ -41,35 +41,24 @@
 
 namespace picongpu
 {
-namespace isaacP
-{
-
-    /** Intermediate list of native particle species of PIConGPU which shall be
-     *  visualized. */
-    using Particle_Seq = VectorAllSpecies;
+    namespace isaacP
+    {
+        /** Intermediate list of native particle species of PIConGPU which shall be
+         *  visualized. */
+        using Particle_Seq = VectorAllSpecies;
 
-    /** Intermediate list of native fields of PIConGPU which shall be
-     *  visualized. */
-    using Native_Seq = MakeSeq_t<
-        FieldE,
-        FieldB,
-        FieldJ
-    >;
+        /** Intermediate list of native fields of PIConGPU which shall be
+         *  visualized. */
+        using Native_Seq = MakeSeq_t<FieldE, FieldB, FieldJ>;
 
-    /** Intermediate list of particle species, from which density fields
-     *  shall be created at runtime to visualize them. */
-    using Density_Seq = deriveField::CreateEligible_t<
-        Particle_Seq,
-        deriveField::derivedAttributes::Density
-    >;
+        /** Intermediate list of particle species, from which density fields
+         *  shall be created at runtime to visualize them. */
+        using Density_Seq = deriveField::CreateEligible_t<Particle_Seq, deriveField::derivedAttributes::Density>;
 
-    /** Compile time sequence of all fields which shall be visualized. Basically
-     *  the join of Native_Seq and Density_Seq. */
-    using Fields_Seq = MakeSeq_t<
-        Native_Seq,
-        Density_Seq
-    >;
+        /** Compile time sequence of all fields which shall be visualized. Basically
+         *  the join of Native_Seq and Density_Seq. */
+        using Fields_Seq = MakeSeq_t<Native_Seq, Density_Seq>;
 
 
-} // namespace isaacP
+    } // namespace isaacP
 } // namespace picongpu
diff --git a/include/picongpu/param/laser.param b/include/picongpu/param/laser.param
index a624b4032f..3df0a5b4f5 100644
--- a/include/picongpu/param/laser.param
+++ b/include/picongpu/param/laser.param
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Anton Helm, Rene Widera, Richard Pausch,
+/* Copyright 2013-2021 Axel Huebl, Anton Helm, Rene Widera, Richard Pausch,
  *                     Alexander Debus
  *
  * This file is part of PIConGPU.
@@ -49,541 +49,559 @@
 
 namespace picongpu
 {
-namespace fields
-{
-namespace laserProfiles
-{
-namespace gaussianBeam
-{
-    //! Use only the 0th Laguerremode for a standard Gaussian
-    static constexpr uint32_t MODENUMBER = 0;
-    PMACC_CONST_VECTOR(float_X, MODENUMBER + 1, LAGUERREMODES, 1.0);
-    // This is just an example for a more complicated set of Laguerre modes
-    //constexpr uint32_t MODENUMBER = 12;
-    //PMACC_CONST_VECTOR(float_X, MODENUMBER + 1, LAGUERREMODES, -1.0, 0.0300519, 0.319461, -0.23783, 0.0954839, 0.0318653, -0.144547, 0.0249208, -0.111989, 0.0434385, -0.030038, -0.00896321, -0.0160788);
-
-} // namespace gaussianBeam
-
-    struct GaussianBeamParam
-    {
-        /** unit: meter */
-        static constexpr float_64 WAVE_LENGTH_SI = 0.8e-6;
-
-        /** Convert the normalized laser strength parameter a0 to Volt per meter */
-        static constexpr float_64 UNITCONV_A0_to_Amplitude_SI = -2.0 * PI / WAVE_LENGTH_SI * ::picongpu::SI::ELECTRON_MASS_SI * ::picongpu::SI::SPEED_OF_LIGHT_SI * ::picongpu::SI::SPEED_OF_LIGHT_SI / ::picongpu::SI::ELECTRON_CHARGE_SI;
-
-        /** unit: W / m^2 */
-        // calculate: _A0 = 8.549297e-6 * sqrt( Intensity[W/m^2] ) * wavelength[m] (linearly polarized)
-
-        /** unit: none */
-        //static constexpr float_64 _A0  = 1.5;
-
-        /** unit: Volt / meter */
-        //static constexpr float_64 AMPLITUDE_SI = _A0 * UNITCONV_A0_to_Amplitude_SI;
-
-        /** unit: Volt / meter */
-        static constexpr float_64 AMPLITUDE_SI = 1.738e13;
-
-        /** Pulse length: sigma of std. gauss for intensity (E^2)
-         *  PULSE_LENGTH_SI = FWHM_of_Intensity   / [ 2*sqrt{ 2* ln(2) } ]
-         *                                          [    2.354820045     ]
-         *  Info:             FWHM_of_Intensity = FWHM_Illumination
-         *                      = what a experimentalist calls "pulse duration"
-         *
-         *  unit: seconds (1 sigma) */
-        static constexpr float_64 PULSE_LENGTH_SI = 10.615e-15 / 4.0;
-
-        /** beam waist: distance from the axis where the pulse intensity (E^2)
-         *              decreases to its 1/e^2-th part,
-         *              at the focus position of the laser
-         * W0_SI = FWHM_of_Intensity / sqrt{ 2* ln(2) }
-         *                             [   1.17741    ]
-         *
-         *  unit: meter */
-        static constexpr float_64 W0_SI = 5.0e-6 / 1.17741;
-        /** the distance to the laser focus in y-direction
-         *  unit: meter */
-        static constexpr float_64 FOCUS_POS_SI = 4.62e-5;
-
-        /** The laser pulse will be initialized PULSE_INIT times of the PULSE_LENGTH
-         *
-         *  unit: none */
-        static constexpr float_64 PULSE_INIT = 20.0;
-
-        /** cell from top where the laser is initialized
-         *
-         * if `initPlaneY == 0` than the absorber are disabled.
-         * if `initPlaneY > absorbercells negative Y` the negative absorber in y
-         * direction is enabled
-         *
-         * valid ranges:
-         *   - initPlaneY == 0
-         *   - absorber cells negative Y < initPlaneY < cells in y direction of the top gpu
-         */
-        static constexpr uint32_t initPlaneY = 0;
-
-        /** laser phase shift (no shift: 0.0)
-         *
-         * sin(omega*time + laser_phase): starts with phase=0 at center --> E-field=0 at center
-         *
-         * unit: rad, periodic in 2*pi
-         */
-        static constexpr float_X LASER_PHASE = 0.0;
-
-        using LAGUERREMODES_t = gaussianBeam::LAGUERREMODES_t;
-        static constexpr uint32_t MODENUMBER = gaussianBeam::MODENUMBER;
-
-        /** Available polarisation types
-         */
-        enum PolarisationType
-        {
-            LINEAR_X = 1u,
-            LINEAR_Z = 2u,
-            CIRCULAR = 4u,
-        };
-        /** Polarization selection
-         */
-        static constexpr PolarisationType Polarisation = CIRCULAR;
-    };
-
-    struct PulseFrontTiltParam
-    {
-        /** unit: meter */
-        static constexpr float_64 WAVE_LENGTH_SI = 0.8e-6;
-
-        /** Convert the normalized laser strength parameter a0 to Volt per meter */
-        static constexpr float_64 UNITCONV_A0_to_Amplitude_SI = -2.0 * PI / WAVE_LENGTH_SI * ::picongpu::SI::ELECTRON_MASS_SI * ::picongpu::SI::SPEED_OF_LIGHT_SI * ::picongpu::SI::SPEED_OF_LIGHT_SI / ::picongpu::SI::ELECTRON_CHARGE_SI;
-
-        /** unit: W / m^2 */
-        // calculate: _A0 = 8.549297e-6 * sqrt( Intensity[W/m^2] ) * wavelength[m] (linearly polarized)
-
-        /** unit: none */
-        //static constexpr float_64 _A0  = 1.5;
-
-        /** unit: Volt / meter */
-        //static constexpr float_64 AMPLITUDE_SI = _A0 * UNITCONV_A0_to_Amplitude_SI;
-
-        /** unit: Volt / meter */
-        static constexpr float_64 AMPLITUDE_SI = 1.738e13;
-
-        /** Pulse length: sigma of std. gauss for intensity (E^2)
-         *  PULSE_LENGTH_SI = FWHM_of_Intensity   / [ 2*sqrt{ 2* ln(2) } ]
-         *                                          [    2.354820045     ]
-         *  Info:             FWHM_of_Intensity = FWHM_Illumination
-         *                      = what a experimentalist calls "pulse duration"
-         *
-         *  unit: seconds (1 sigma) */
-        static constexpr float_64 PULSE_LENGTH_SI = 10.615e-15 / 4.0;
-
-        /** beam waist: distance from the axis where the pulse intensity (E^2)
-         *              decreases to its 1/e^2-th part,
-         *              at the focus position of the laser
-         * W0_SI = FWHM_of_Intensity / sqrt{ 2* ln(2) }
-         *                             [   1.17741    ]
-         *
-         *  unit: meter */
-        static constexpr float_64 W0_SI = 5.0e-6 / 1.17741;
-
-        /** the distance to the laser focus in y-direction
-         *  unit: meter */
-        static constexpr float_64 FOCUS_POS_SI = 4.62e-5;
-
-        /** the tilt angle between laser propagation in y-direction and laser axis in
-        *  x-direction (0 degree == no tilt)
-        *  unit: degree */
-        static constexpr float_64 TILT_X_SI = 0.0;
-
-        /** The laser pulse will be initialized PULSE_INIT times of the PULSE_LENGTH
-         *
-         *  unit: none */
-        static constexpr float_64 PULSE_INIT = 20.0;
-
-        /** cell from top where the laser is initialized
-         *
-         * if `initPlaneY == 0` than the absorber are disabled.
-         * if `initPlaneY > absorbercells negative Y` the negative absorber in y
-         * direction is enabled
-         *
-         * valid ranges:
-         *   - initPlaneY == 0
-         *   - absorber cells negative Y < initPlaneY < cells in y direction of the top gpu
-         */
-        static constexpr uint32_t initPlaneY = 0;
-
-        /** laser phase shift (no shift: 0.0)
-         *
-         * sin(omega*time + laser_phase): starts with phase=0 at center --> E-field=0 at center
-         *
-         * unit: rad, periodic in 2*pi
-         */
-        static constexpr float_X LASER_PHASE = 0.0;
-
-        //! Available polarisation types
-        enum PolarisationType
-        {
-            LINEAR_X = 1u,
-            LINEAR_Z = 2u,
-            CIRCULAR = 4u,
-        };
-
-        /** Polarization selection
-         */
-        static constexpr PolarisationType Polarisation = CIRCULAR;
-    };
-
-    struct WavepacketParam
-    {
-        /** unit: meter */
-        static constexpr float_64 WAVE_LENGTH_SI = 0.8e-6;
-
-        /** Convert the normalized laser strength parameter a0 to Volt per meter */
-        static constexpr float_64 UNITCONV_A0_to_Amplitude_SI = -2.0 * PI / WAVE_LENGTH_SI * ::picongpu::SI::ELECTRON_MASS_SI * ::picongpu::SI::SPEED_OF_LIGHT_SI * ::picongpu::SI::SPEED_OF_LIGHT_SI / ::picongpu::SI::ELECTRON_CHARGE_SI;
-
-        /** unit: W / m^2 */
-        // calculate: _A0 = 8.549297e-6 * sqrt( Intensity[W/m^2] ) * wavelength[m] (linearly polarized)
-
-        /** unit: none */
-        //static constexpr float_64 _A0  = 1.5;
-
-        /** unit: Volt / meter */
-        //static constexpr float_64 AMPLITUDE_SI = _A0 * UNITCONV_A0_to_Amplitude_SI;
-
-        /** unit: Volt / meter */
-        static constexpr float_64 AMPLITUDE_SI = 1.738e13;
-
-        /** The profile of the test Lasers 0 and 2 can be stretched by a
-         *      constant area between the up and downramp
-         *  unit: seconds */
-        static constexpr float_64 LASER_NOFOCUS_CONSTANT_SI = 7.0 * WAVE_LENGTH_SI / ::picongpu::SI::SPEED_OF_LIGHT_SI;
-
-        /** Pulse length: sigma of std. gauss for intensity (E^2)
-         *  PULSE_LENGTH_SI = FWHM_of_Intensity   / [ 2*sqrt{ 2* ln(2) } ]
-         *                                          [    2.354820045     ]
-         *  Info:             FWHM_of_Intensity = FWHM_Illumination
-         *                      = what a experimentalist calls "pulse duration"
-         *
-         *  unit: seconds (1 sigma) */
-        static constexpr float_64 PULSE_LENGTH_SI = 10.615e-15 / 4.0;
-
-        /** beam waist: distance from the axis where the pulse intensity (E^2)
-         *              decreases to its 1/e^2-th part,
-         *              at the focus position of the laser
-         * W0_SI = FWHM_of_Intensity / sqrt{ 2* ln(2) }
-         *                             [   1.17741    ]
-         *
-         *  unit: meter */
-        static constexpr float_64 W0_X_SI = 4.246e-6;
-        static constexpr float_64 W0_Z_SI = W0_X_SI;
-
-        /** The laser pulse will be initialized PULSE_INIT times of the PULSE_LENGTH
-         *
-         *  unit: none */
-        static constexpr float_64 PULSE_INIT = 20.0;
-
-        /** cell from top where the laser is initialized
-         *
-         * if `initPlaneY == 0` than the absorber are disabled.
-         * if `initPlaneY > absorbercells negative Y` the negative absorber in y
-         * direction is enabled
-         *
-         * valid ranges:
-         *   - initPlaneY == 0
-         *   - absorber cells negative Y < initPlaneY < cells in y direction of the top gpu
-         */
-        static constexpr uint32_t initPlaneY = 0;
-
-        /** laser phase shift (no shift: 0.0)
-         *
-         * sin(omega*time + laser_phase): starts with phase=0 at center --> E-field=0 at center
-         *
-         * unit: rad, periodic in 2*pi
-         */
-        static constexpr float_X LASER_PHASE = 0.0;
-
-        /** Available polarisation types
-         */
-        enum PolarisationType
-        {
-            LINEAR_X = 1u,
-            LINEAR_Z = 2u,
-            CIRCULAR = 4u,
-        };
-        /** Polarization selection
-         */
-        static constexpr PolarisationType Polarisation = LINEAR_X;
-    };
-
-    /** Based on a wavepacket with Gaussian spatial envelope
-     *
-     * and the following temporal shape:
-     * A Gaussian peak (optionally lengthened by a plateau) is preceded by
-     * two pieces of exponential preramps, defined by 3 (time, intensity)-
-     * -points.
-     * The first two points get connected by an exponential, the 2nd and
-     * 3rd point are connected by another exponential, which is then
-     * extrapolated to the peak. The Gaussian is added everywhere, but
-     * typically contributes significantly only near the peak.
-     * It is advisable to set the third point far enough from the plateau
-     * (approx 3*FWHM), then the contribution from the Gaussian is
-     * negligible there, and the intensity can be set as measured from the
-     * laser profile.
-     * Optionally a Gaussian prepulse can be added, given by the parameters
-     * of the relative intensity and time point.
-     * The time of the prepulse and the three preramp points are given in
-     * SI, the intensities are given as multiples of the peak intensity.
-     */
-    struct ExpRampWithPrepulseParam
-    {
-        // Intensities of prepulse and exponential preramp
-        static constexpr float_X INT_RATIO_PREPULSE = 0.;
-        static constexpr float_X INT_RATIO_POINT_1 = 1.e-8;
-        static constexpr float_X INT_RATIO_POINT_2 = 1.e-4;
-        static constexpr float_X INT_RATIO_POINT_3 = 1.e-4;
-
-        // time-positions of prepulse and preramps points
-        static constexpr float_64 TIME_PREPULSE_SI = -950.0e-15;
-        static constexpr float_64 TIME_PEAKPULSE_SI = 0.0e-15;
-        static constexpr float_64 TIME_POINT_1_SI = -1000.0e-15;
-        static constexpr float_64 TIME_POINT_2_SI = -300.0e-15;
-        static constexpr float_64 TIME_POINT_3_SI = -100.0e-15;
-
-        /** unit: meter */
-        static constexpr float_64 WAVE_LENGTH_SI = 0.8e-6;
-
-        /** UNITCONV */
-        static constexpr float_64 UNITCONV_A0_to_Amplitude_SI = -2.0 * PI / WAVE_LENGTH_SI * ::picongpu::SI::ELECTRON_MASS_SI * ::picongpu::SI::SPEED_OF_LIGHT_SI * ::picongpu::SI::SPEED_OF_LIGHT_SI / ::picongpu::SI::ELECTRON_CHARGE_SI;
-
-        /** unit: W / m^2 */
-        // calculate: _A0 = 8.549297e-6 * sqrt( Intensity[W/m^2] ) * wavelength[m] (linearly polarized)
-
-        /** unit: none */
-        static constexpr float_64 _A0  = 20.;
-
-        /** unit: Volt /meter */
-        static constexpr float_64 AMPLITUDE_SI = _A0 * UNITCONV_A0_to_Amplitude_SI;
-
-        /** unit: Volt /meter */
-        //constexpr float_64 AMPLITUDE_SI = 1.738e13;
-
-        /** The profile of the test Lasers 0 and 2 can be stretched by a
-         *      constant area between the up and downramp
-         *  unit: seconds */
-        static constexpr float_64 LASER_NOFOCUS_CONSTANT_SI = 0.0 * WAVE_LENGTH_SI / ::picongpu::SI::SPEED_OF_LIGHT_SI;
-
-        /** Pulse length: sigma of std. gauss for intensity (E^2)
-         *  PULSE_LENGTH_SI = FWHM_of_Intensity   / [ 2*sqrt{ 2* ln(2) } ]
-         *                                          [    2.354820045     ]
-         *  Info:             FWHM_of_Intensity = FWHM_Illumination
-         *                      = what a experimentalist calls "pulse duration"
-         *  unit: seconds (1 sigma) */
-        static constexpr float_64 PULSE_LENGTH_SI = 3.0e-14 / 2.35482; // half of the time in which E falls to half its initial value (then I falls to half its value in 15fs, approx 6 wavelengths). Those are 4.8 wavelenghts.
-
-        /** beam waist: distance from the axis where the pulse intensity (E^2)
-         *              decreases to its 1/e^2-th part,
-         *              WO_X_SI is this distance in x-direction
-         *              W0_Z_SI is this distance in z-direction
-         *              if both values are equal, the laser has a circular shape in x-z
-         * W0_SI = FWHM_of_Intensity / sqrt{ 2* ln(2) }
-         *                             [   1.17741    ]
-         *  unit: meter */
-        static constexpr float_64 W0_X_SI = 2.5 * WAVE_LENGTH_SI;
-        static constexpr float_64 W0_Z_SI = W0_X_SI;
-
-        /** The laser pulse will be initialized half of PULSE_INIT times of the PULSE_LENGTH before plateau
-         *  and half at the end of the plateau
-         *  unit: none */
-        static constexpr float_64 RAMP_INIT = 16.0;
-
-        /** cell from top where the laser is initialized
-         *
-         * if `initPlaneY == 0` than the absorber are disabled.
-         * if `initPlaneY > absorbercells negative Y` the negative absorber in y
-         * direction is enabled
-         *
-         * valid ranges:
-         *   - initPlaneY == 0
-         *   - absorber cells negative Y < initPlaneY < cells in y direction of the top gpu
-         */
-        static constexpr uint32_t initPlaneY = 0;
-
-        /** laser phase shift (no shift: 0.0)
-         *
-         * sin(omega*time + laser_phase): starts with phase=0 at center --> E-field=0 at center
-         *
-         * unit: rad, periodic in 2*pi
-         */
-        static constexpr float_X LASER_PHASE = 0.0;
-
-        /** Available polarisation types
-         */
-        enum PolarisationType
-        {
-            LINEAR_X = 1u,
-            LINEAR_Z = 2u,
-            CIRCULAR = 4u,
-        };
-
-        /** Polarization selection
-         */
-        static constexpr PolarisationType Polarisation = LINEAR_X;
-    };
-
-    /** Based on a wavepacket with Gaussian spatial envelope
-     *
-     * Wavepacket with a polynomial temporal intensity shape.
-     */
-    struct PolynomParam
-    {
-        /** unit: meter */
-        static constexpr float_64 WAVE_LENGTH_SI = 0.8e-6;
-
-        /** Convert the normalized laser strength parameter a0 to Volt per meter */
-        static constexpr float_64 UNITCONV_A0_to_Amplitude_SI = -2.0 * PI / WAVE_LENGTH_SI * ::picongpu::SI::ELECTRON_MASS_SI * ::picongpu::SI::SPEED_OF_LIGHT_SI * ::picongpu::SI::SPEED_OF_LIGHT_SI / ::picongpu::SI::ELECTRON_CHARGE_SI;
-
-        /** unit: W / m^2 */
-        // calculate: _A0 = 8.549297e-6 * sqrt( Intensity[W/m^2] ) * wavelength[m] (linearly polarized)
-
-        /** unit: none */
-        //static constexpr float_64 _A0  = 1.5;
-
-        /** unit: Volt / meter */
-        //static constexpr float_64 AMPLITUDE_SI = _A0 * UNITCONV_A0_to_Amplitude_SI;
-
-        /** unit: Volt / meter */
-        static constexpr float_64 AMPLITUDE_SI = 1.738e13;
-
-        /** The profile of the test Lasers 0 and 2 can be stretched by a
-         *      constant area between the up and downramp
-         *  unit: seconds */
-        static constexpr float_64 LASER_NOFOCUS_CONSTANT_SI = 13.34e-15;
-
-        /** Pulse length: sigma of std. gauss for intensity (E^2)
-         *  PULSE_LENGTH_SI = FWHM_of_Intensity   / [ 2*sqrt{ 2* ln(2) } ]
-         *                                          [    2.354820045     ]
-         *  Info:             FWHM_of_Intensity = FWHM_Illumination
-         *                      = what a experimentalist calls "pulse duration"
-         *  unit: seconds (1 sigma) */
-        static constexpr float_64 PULSE_LENGTH_SI = 10.615e-15 / 4.0;
-
-        /** beam waist: distance from the axis where the pulse intensity (E^2)
-         *              decreases to its 1/e^2-th part,
-         *              at the focus position of the laser
-         *  unit: meter
-         */
-        static constexpr float_64 W0_X_SI = 4.246e-6; // waist in x-direction
-        static constexpr float_64 W0_Z_SI = W0_X_SI; // waist in z-direction
-
-        /** cell from top where the laser is initialized
-         *
-         * if `initPlaneY == 0` than the absorber are disabled.
-         * if `initPlaneY > absorbercells negative Y` the negative absorber in y
-         * direction is enabled
-         *
-         * valid ranges:
-         *   - initPlaneY == 0
-         *   - absorber cells negative Y < initPlaneY < cells in y direction of the top gpu
-         */
-        static constexpr uint32_t initPlaneY = 0;
-
-        /** The laser pulse will be initialized PULSE_INIT times of the PULSE_LENGTH
-         *
-         *  unit: none */
-        static constexpr float_64 PULSE_INIT = 20.0;
-
-        /** laser phase shift (no shift: 0.0)
-         *
-         * sin(omega*time + laser_phase): starts with phase=0 at center --> E-field=0 at center
-         *
-         * unit: rad, periodic in 2*pi
-         */
-        static constexpr float_X LASER_PHASE = 0.0;
-
-        /** Available polarization types
-         */
-        enum PolarisationType
-        {
-            LINEAR_X = 1u,
-            LINEAR_Z = 2u,
-            CIRCULAR = 4u,
-        };
-        /** Polarization selection
-         */
-        static constexpr PolarisationType Polarisation = LINEAR_X;
-    };
-
-    struct PlaneWaveParam
+    namespace fields
     {
-        /** unit: meter */
-        static constexpr float_64 WAVE_LENGTH_SI = 0.8e-6;
-
-        /** Convert the normalized laser strength parameter a0 to Volt per meter */
-        static constexpr float_64 UNITCONV_A0_to_Amplitude_SI = -2.0 * PI / WAVE_LENGTH_SI * ::picongpu::SI::ELECTRON_MASS_SI * ::picongpu::SI::SPEED_OF_LIGHT_SI * ::picongpu::SI::SPEED_OF_LIGHT_SI / ::picongpu::SI::ELECTRON_CHARGE_SI;
-
-        /** unit: W / m^2 */
-        // calculate: _A0 = 8.549297e-6 * sqrt( Intensity[W/m^2] ) * wavelength[m] (linearly polarized)
-
-        /** unit: none */
-        static constexpr float_64 _A0  = 1.5;
-
-        /** unit: Volt / meter */
-        static constexpr float_64 AMPLITUDE_SI = _A0 * UNITCONV_A0_to_Amplitude_SI;
-
-        /** unit: Volt / meter */
-        //static constexpr float_64 AMPLITUDE_SI = 1.738e13;
-
-        /** The profile of the test Lasers 0 and 2 can be stretched by a
-         *      constant area between the up and downramp
-         *  unit: seconds */
-        static constexpr float_64 LASER_NOFOCUS_CONSTANT_SI = 13.34e-15;
-
-        /** Pulse length: sigma of std. gauss for intensity (E^2)
-         *  PULSE_LENGTH_SI = FWHM_of_Intensity   / [ 2*sqrt{ 2* ln(2) } ]
-         *                                          [    2.354820045     ]
-         *  Info:             FWHM_of_Intensity = FWHM_Illumination
-         *                      = what a experimentalist calls "pulse duration"
-         *  unit: seconds (1 sigma) */
-        static constexpr float_64 PULSE_LENGTH_SI = 10.615e-15 / 4.0;
-
-        /** cell from top where the laser is initialized
-         *
-         * if `initPlaneY == 0` than the absorber are disabled.
-         * if `initPlaneY > absorbercells negative Y` the negative absorber in y
-         * direction is enabled
-         *
-         * valid ranges:
-         *   - initPlaneY == 0
-         *   - absorber cells negative Y < initPlaneY < cells in y direction of the top gpu
-         */
-        static constexpr uint32_t initPlaneY = 0;
-
-        /** The laser pulse will be initialized half of PULSE_INIT times of the PULSE_LENGTH before and after the plateau
-         *  unit: none */
-        static constexpr float_64 RAMP_INIT = 20.6146;
-
-        /** laser phase shift (no shift: 0.0)
-         *
-         * sin(omega*time + laser_phase): starts with phase=0 at center --> E-field=0 at center
-         *
-         * unit: rad, periodic in 2*pi
-         */
-        static constexpr float_X LASER_PHASE = 0.0;
-
-        /** Available polarization types
-         */
-        enum PolarisationType
+        namespace laserProfiles
         {
-            LINEAR_X = 1u,
-            LINEAR_Z = 2u,
-            CIRCULAR = 4u,
-        };
-        /** Polarization selection
-         */
-        static constexpr PolarisationType Polarisation = LINEAR_X;
-    };
-
-    //! currently selected laser profile
-    using Selected = None<>;
-
-} // namespace laserProfiles
-} // namespace fields
+            namespace gaussianBeam
+            {
+                //! Use only the 0th Laguerre mode for a standard Gaussian
+                static constexpr uint32_t MODENUMBER = 0;
+                PMACC_CONST_VECTOR(float_X, MODENUMBER + 1, LAGUERREMODES, 1.0);
+                // Commented-out example of a more complicated set of Laguerre modes (13 coefficients):
+                // constexpr uint32_t MODENUMBER = 12;
+                // PMACC_CONST_VECTOR(float_X, MODENUMBER + 1, LAGUERREMODES, -1.0, 0.0300519, 0.319461, -0.23783,
+                // 0.0954839, 0.0318653, -0.144547, 0.0249208, -0.111989, 0.0434385, -0.030038, -0.00896321,
+                // -0.0160788);
+
+            } // namespace gaussianBeam
+
+            struct GaussianBeamParam
+            {
+                /** laser wavelength, unit: meter */
+                static constexpr float_64 WAVE_LENGTH_SI = 0.8e-6;
+
+                /** Convert the normalized laser strength parameter a0 to Volt per meter */
+                static constexpr float_64 UNITCONV_A0_to_Amplitude_SI = -2.0 * PI / WAVE_LENGTH_SI
+                    * ::picongpu::SI::ELECTRON_MASS_SI * ::picongpu::SI::SPEED_OF_LIGHT_SI
+                    * ::picongpu::SI::SPEED_OF_LIGHT_SI / ::picongpu::SI::ELECTRON_CHARGE_SI;
+
+                /** unit: W / m^2 */
+                // calculate: _A0 = 8.549297e-6 * sqrt( Intensity[W/m^2] ) * wavelength[m] (linearly polarized)
+
+                /** unit: none */
+                // static constexpr float_64 _A0  = 1.5;
+
+                /** unit: Volt / meter */
+                // static constexpr float_64 AMPLITUDE_SI = _A0 * UNITCONV_A0_to_Amplitude_SI;
+
+                /** unit: Volt / meter */
+                static constexpr float_64 AMPLITUDE_SI = 1.738e13;
+
+                /** Pulse length: sigma of std. gauss for intensity (E^2)
+                 *  PULSE_LENGTH_SI = FWHM_of_Intensity   / [ 2*sqrt{ 2* ln(2) } ]
+                 *                                          [    2.354820045     ]
+                 *  Info:             FWHM_of_Intensity = FWHM_Illumination
+                 *                      = what an experimentalist calls "pulse duration"
+                 *
+                 *  unit: seconds (1 sigma) */
+                static constexpr float_64 PULSE_LENGTH_SI = 10.615e-15 / 4.0;
+
+                /** beam waist: distance from the axis where the pulse intensity (E^2)
+                 *              decreases to its 1/e^2-th part,
+                 *              at the focus position of the laser
+                 * W0_SI = FWHM_of_Intensity / sqrt{ 2* ln(2) }
+                 *                             [   1.17741    ]
+                 *
+                 *  unit: meter */
+                static constexpr float_64 W0_SI = 5.0e-6 / 1.17741;
+                /** the distance to the laser focus in y-direction
+                 *  unit: meter */
+                static constexpr float_64 FOCUS_POS_SI = 4.62e-5;
+
+                /** The laser pulse will be initialized PULSE_INIT times of the PULSE_LENGTH
+                 *
+                 *  unit: none */
+                static constexpr float_64 PULSE_INIT = 20.0;
+
+                /** cell from top where the laser is initialized
+                 *
+                 * if `initPlaneY == 0` then the absorbers are disabled.
+                 * if `initPlaneY > absorber cells negative Y` the negative absorber in y
+                 * direction is enabled
+                 *
+                 * valid ranges:
+                 *   - initPlaneY == 0
+                 *   - absorber cells negative Y < initPlaneY < cells in y direction of the top gpu
+                 */
+                static constexpr uint32_t initPlaneY = 0;
+
+                /** laser phase shift (no shift: 0.0)
+                 *
+                 * sin(omega*time + laser_phase): starts with phase=0 at center --> E-field=0 at center
+                 *
+                 * unit: rad, periodic in 2*pi
+                 */
+                static constexpr float_X LASER_PHASE = 0.0;
+
+                using LAGUERREMODES_t = gaussianBeam::LAGUERREMODES_t;
+                static constexpr uint32_t MODENUMBER = gaussianBeam::MODENUMBER;
+
+                /** Available polarisation types
+                 */
+                enum PolarisationType
+                {
+                    LINEAR_X = 1u,
+                    LINEAR_Z = 2u,
+                    CIRCULAR = 4u,
+                };
+                /** Polarisation selection
+                 */
+                static constexpr PolarisationType Polarisation = CIRCULAR;
+            };
+
+            struct PulseFrontTiltParam
+            {
+                /** laser wavelength, unit: meter */
+                static constexpr float_64 WAVE_LENGTH_SI = 0.8e-6;
+
+                /** Convert the normalized laser strength parameter a0 to Volt per meter */
+                static constexpr float_64 UNITCONV_A0_to_Amplitude_SI = -2.0 * PI / WAVE_LENGTH_SI
+                    * ::picongpu::SI::ELECTRON_MASS_SI * ::picongpu::SI::SPEED_OF_LIGHT_SI
+                    * ::picongpu::SI::SPEED_OF_LIGHT_SI / ::picongpu::SI::ELECTRON_CHARGE_SI;
+
+                /** unit: W / m^2 */
+                // calculate: _A0 = 8.549297e-6 * sqrt( Intensity[W/m^2] ) * wavelength[m] (linearly polarized)
+
+                /** unit: none */
+                // static constexpr float_64 _A0  = 1.5;
+
+                /** unit: Volt / meter */
+                // static constexpr float_64 AMPLITUDE_SI = _A0 * UNITCONV_A0_to_Amplitude_SI;
+
+                /** unit: Volt / meter */
+                static constexpr float_64 AMPLITUDE_SI = 1.738e13;
+
+                /** Pulse length: sigma of std. gauss for intensity (E^2)
+                 *  PULSE_LENGTH_SI = FWHM_of_Intensity   / [ 2*sqrt{ 2* ln(2) } ]
+                 *                                          [    2.354820045     ]
+                 *  Info:             FWHM_of_Intensity = FWHM_Illumination
+                 *                      = what an experimentalist calls "pulse duration"
+                 *
+                 *  unit: seconds (1 sigma) */
+                static constexpr float_64 PULSE_LENGTH_SI = 10.615e-15 / 4.0;
+
+                /** beam waist: distance from the axis where the pulse intensity (E^2)
+                 *              decreases to its 1/e^2-th part,
+                 *              at the focus position of the laser
+                 * W0_SI = FWHM_of_Intensity / sqrt{ 2* ln(2) }
+                 *                             [   1.17741    ]
+                 *
+                 *  unit: meter */
+                static constexpr float_64 W0_SI = 5.0e-6 / 1.17741;
+
+                /** the distance to the laser focus in y-direction
+                 *  unit: meter */
+                static constexpr float_64 FOCUS_POS_SI = 4.62e-5;
+
+                /** the tilt angle between laser propagation in y-direction and laser axis in
+                 *  x-direction (0 degree == no tilt)
+                 *  unit: degree */
+                static constexpr float_64 TILT_X_SI = 0.0;
+
+                /** The laser pulse will be initialized PULSE_INIT times of the PULSE_LENGTH
+                 *
+                 *  unit: none */
+                static constexpr float_64 PULSE_INIT = 20.0;
+
+                /** cell from top where the laser is initialized
+                 *
+                 * if `initPlaneY == 0` then the absorbers are disabled.
+                 * if `initPlaneY > absorber cells negative Y` the negative absorber in y
+                 * direction is enabled
+                 *
+                 * valid ranges:
+                 *   - initPlaneY == 0
+                 *   - absorber cells negative Y < initPlaneY < cells in y direction of the top gpu
+                 */
+                static constexpr uint32_t initPlaneY = 0;
+
+                /** laser phase shift (no shift: 0.0)
+                 *
+                 * sin(omega*time + laser_phase): starts with phase=0 at center --> E-field=0 at center
+                 *
+                 * unit: rad, periodic in 2*pi
+                 */
+                static constexpr float_X LASER_PHASE = 0.0;
+
+                //! Available polarisation types
+                enum PolarisationType
+                {
+                    LINEAR_X = 1u,
+                    LINEAR_Z = 2u,
+                    CIRCULAR = 4u,
+                };
+
+                /** Polarisation selection
+                 */
+                static constexpr PolarisationType Polarisation = CIRCULAR;
+            };
+
+            struct WavepacketParam
+            {
+                /** laser wavelength; unit: meter */
+                static constexpr float_64 WAVE_LENGTH_SI = 0.8e-6;
+
+                /** Convert the normalized laser strength parameter a0 to Volt per meter */
+                static constexpr float_64 UNITCONV_A0_to_Amplitude_SI = -2.0 * PI / WAVE_LENGTH_SI
+                    * ::picongpu::SI::ELECTRON_MASS_SI * ::picongpu::SI::SPEED_OF_LIGHT_SI
+                    * ::picongpu::SI::SPEED_OF_LIGHT_SI / ::picongpu::SI::ELECTRON_CHARGE_SI;
+
+                /** unit: W / m^2 */
+                // calculate: _A0 = 8.549297e-6 * sqrt( Intensity[W/m^2] ) * wavelength[m] (linearly polarized)
+
+                /** unit: none */
+                // static constexpr float_64 _A0  = 1.5;
+
+                /** unit: Volt / meter */
+                // static constexpr float_64 AMPLITUDE_SI = _A0 * UNITCONV_A0_to_Amplitude_SI;
+
+                /** unit: Volt / meter */
+                static constexpr float_64 AMPLITUDE_SI = 1.738e13;
+
+                /** The profile of the test Lasers 0 and 2 can be stretched by a
+                 *      constant area between the up- and down-ramp
+                 *  unit: seconds */
+                static constexpr float_64 LASER_NOFOCUS_CONSTANT_SI
+                    = 7.0 * WAVE_LENGTH_SI / ::picongpu::SI::SPEED_OF_LIGHT_SI;
+
+                /** Pulse length: sigma of std. gauss for intensity (E^2)
+                 *  PULSE_LENGTH_SI = FWHM_of_Intensity   / [ 2*sqrt{ 2* ln(2) } ]
+                 *                                          [    2.354820045     ]
+                 *  Info:             FWHM_of_Intensity = FWHM_Illumination
+                 *                      = what an experimentalist calls "pulse duration"
+                 *
+                 *  unit: seconds (1 sigma) */
+                static constexpr float_64 PULSE_LENGTH_SI = 10.615e-15 / 4.0;
+
+                /** beam waist: distance from the axis where the pulse intensity (E^2)
+                 *              decreases to its 1/e^2-th part,
+                 *              at the focus position of the laser
+                 * W0_SI = FWHM_of_Intensity / sqrt{ 2* ln(2) }
+                 *                             [   1.17741    ]
+                 *
+                 *  unit: meter */
+                static constexpr float_64 W0_X_SI = 4.246e-6;
+                static constexpr float_64 W0_Z_SI = W0_X_SI;
+
+                /** The laser pulse will be initialized PULSE_INIT times of the PULSE_LENGTH
+                 *
+                 *  unit: none */
+                static constexpr float_64 PULSE_INIT = 20.0;
+
+                /** cell from top where the laser is initialized
+                 *
+                 * if `initPlaneY == 0` then the absorbers are disabled.
+                 * if `initPlaneY > absorber cells negative Y` the negative absorber in y
+                 * direction is enabled
+                 *
+                 * valid ranges:
+                 *   - initPlaneY == 0
+                 *   - absorber cells negative Y < initPlaneY < cells in y direction of the top gpu
+                 */
+                static constexpr uint32_t initPlaneY = 0;
+
+                /** laser phase shift (no shift: 0.0)
+                 *
+                 * sin(omega*time + laser_phase): starts with phase=0 at center --> E-field=0 at center
+                 *
+                 * unit: rad, periodic in 2*pi
+                 */
+                static constexpr float_X LASER_PHASE = 0.0;
+
+                /** Available polarisation types
+                 */
+                enum PolarisationType
+                {
+                    LINEAR_X = 1u,
+                    LINEAR_Z = 2u,
+                    CIRCULAR = 4u,
+                };
+                /** Polarisation selection (one of the PolarisationType values)
+                 */
+                static constexpr PolarisationType Polarisation = LINEAR_X;
+            };
+
+            /** Based on a wavepacket with Gaussian spatial envelope
+             *
+             * and the following temporal shape:
+             * A Gaussian peak (optionally lengthened by a plateau) is preceded by
+             * two pieces of exponential preramps, defined by three
+             * (time, intensity) points.
+             * The first two points get connected by an exponential, the 2nd and
+             * 3rd point are connected by another exponential, which is then
+             * extrapolated to the peak. The Gaussian is added everywhere, but
+             * typically contributes significantly only near the peak.
+             * It is advisable to set the third point far enough from the plateau
+             * (approx 3*FWHM), then the contribution from the Gaussian is
+             * negligible there, and the intensity can be set as measured from the
+             * laser profile.
+             * Optionally a Gaussian prepulse can be added, given by the parameters
+             * of the relative intensity and time point.
+             * The time of the prepulse and the three preramp points are given in
+             * SI, the intensities are given as multiples of the peak intensity.
+             */
+            struct ExpRampWithPrepulseParam
+            {
+                // Intensities of prepulse and exponential preramp (as multiples of the peak intensity)
+                static constexpr float_X INT_RATIO_PREPULSE = 0.;
+                static constexpr float_X INT_RATIO_POINT_1 = 1.e-8;
+                static constexpr float_X INT_RATIO_POINT_2 = 1.e-4;
+                static constexpr float_X INT_RATIO_POINT_3 = 1.e-4;
+
+                // time positions of prepulse and preramp points (given in SI)
+                static constexpr float_64 TIME_PREPULSE_SI = -950.0e-15;
+                static constexpr float_64 TIME_PEAKPULSE_SI = 0.0e-15;
+                static constexpr float_64 TIME_POINT_1_SI = -1000.0e-15;
+                static constexpr float_64 TIME_POINT_2_SI = -300.0e-15;
+                static constexpr float_64 TIME_POINT_3_SI = -100.0e-15;
+
+                /** laser wavelength; unit: meter */
+                static constexpr float_64 WAVE_LENGTH_SI = 0.8e-6;
+
+                /** Convert the normalized laser strength parameter a0 to Volt per meter */
+                static constexpr float_64 UNITCONV_A0_to_Amplitude_SI = -2.0 * PI / WAVE_LENGTH_SI
+                    * ::picongpu::SI::ELECTRON_MASS_SI * ::picongpu::SI::SPEED_OF_LIGHT_SI
+                    * ::picongpu::SI::SPEED_OF_LIGHT_SI / ::picongpu::SI::ELECTRON_CHARGE_SI;
+
+                /** unit: W / m^2 */
+                // calculate: _A0 = 8.549297e-6 * sqrt( Intensity[W/m^2] ) * wavelength[m] (linearly polarized)
+
+                /** normalized laser strength parameter a0; unit: none */
+                static constexpr float_64 _A0 = 20.;
+
+                /** unit: Volt / meter */
+                static constexpr float_64 AMPLITUDE_SI = _A0 * UNITCONV_A0_to_Amplitude_SI;
+
+                /** unit: Volt / meter */
+                // static constexpr float_64 AMPLITUDE_SI = 1.738e13;
+
+                /** The profile of the test Lasers 0 and 2 can be stretched by a
+                 *      constant area between the up- and down-ramp
+                 *  unit: seconds */
+                static constexpr float_64 LASER_NOFOCUS_CONSTANT_SI
+                    = 0.0 * WAVE_LENGTH_SI / ::picongpu::SI::SPEED_OF_LIGHT_SI;
+
+                /** Pulse length: sigma of std. gauss for intensity (E^2)
+                 *  PULSE_LENGTH_SI = FWHM_of_Intensity   / [ 2*sqrt{ 2* ln(2) } ]
+                 *                                          [    2.354820045     ]
+                 *  Info:             FWHM_of_Intensity = FWHM_Illumination
+                 *                      = what an experimentalist calls "pulse duration"
+                 *  unit: seconds (1 sigma) */
+                static constexpr float_64 PULSE_LENGTH_SI = 3.0e-14
+                    / 2.35482; // half of the time in which E falls to half its initial value (then I falls to half its
+                               // value in 15fs, approx 6 wavelengths). Those are 4.8 wavelengths.
+
+                /** beam waist: distance from the axis where the pulse intensity (E^2)
+                 *              decreases to its 1/e^2-th part,
+                 *              W0_X_SI is this distance in x-direction
+                 *              W0_Z_SI is this distance in z-direction
+                 *              if both values are equal, the laser has a circular shape in x-z
+                 * W0_SI = FWHM_of_Intensity / sqrt{ 2* ln(2) }
+                 *                             [   1.17741    ]
+                 *  unit: meter */
+                static constexpr float_64 W0_X_SI = 2.5 * WAVE_LENGTH_SI;
+                static constexpr float_64 W0_Z_SI = W0_X_SI;
+
+                /** The laser pulse will be initialized half of RAMP_INIT times of the PULSE_LENGTH before plateau
+                 *  and half at the end of the plateau
+                 *  unit: none */
+                static constexpr float_64 RAMP_INIT = 16.0;
+
+                /** cell from top where the laser is initialized
+                 *
+                 * if `initPlaneY == 0` then the absorbers are disabled.
+                 * if `initPlaneY > absorber cells negative Y` the negative absorber in y
+                 * direction is enabled
+                 *
+                 * valid ranges:
+                 *   - initPlaneY == 0
+                 *   - absorber cells negative Y < initPlaneY < cells in y direction of the top gpu
+                 */
+                static constexpr uint32_t initPlaneY = 0;
+
+                /** laser phase shift (no shift: 0.0)
+                 *
+                 * sin(omega*time + laser_phase): starts with phase=0 at center --> E-field=0 at center
+                 *
+                 * unit: rad, periodic in 2*pi
+                 */
+                static constexpr float_X LASER_PHASE = 0.0;
+
+                /** Available polarisation types
+                 */
+                enum PolarisationType
+                {
+                    LINEAR_X = 1u,
+                    LINEAR_Z = 2u,
+                    CIRCULAR = 4u,
+                };
+
+                /** Polarisation selection (one of the PolarisationType values)
+                 */
+                static constexpr PolarisationType Polarisation = LINEAR_X;
+            };
+
+            /** Based on a wavepacket with Gaussian spatial envelope
+             *
+             * Wavepacket with a polynomial temporal intensity shape.
+             */
+            struct PolynomParam
+            {
+                /** laser wavelength; unit: meter */
+                static constexpr float_64 WAVE_LENGTH_SI = 0.8e-6;
+
+                /** Convert the normalized laser strength parameter a0 to Volt per meter */
+                static constexpr float_64 UNITCONV_A0_to_Amplitude_SI = -2.0 * PI / WAVE_LENGTH_SI
+                    * ::picongpu::SI::ELECTRON_MASS_SI * ::picongpu::SI::SPEED_OF_LIGHT_SI
+                    * ::picongpu::SI::SPEED_OF_LIGHT_SI / ::picongpu::SI::ELECTRON_CHARGE_SI;
+
+                /** unit: W / m^2 */
+                // calculate: _A0 = 8.549297e-6 * sqrt( Intensity[W/m^2] ) * wavelength[m] (linearly polarized)
+
+                /** unit: none */
+                // static constexpr float_64 _A0  = 1.5;
+
+                /** unit: Volt / meter */
+                // static constexpr float_64 AMPLITUDE_SI = _A0 * UNITCONV_A0_to_Amplitude_SI;
+
+                /** unit: Volt / meter */
+                static constexpr float_64 AMPLITUDE_SI = 1.738e13;
+
+                /** The profile of the test Lasers 0 and 2 can be stretched by a
+                 *      constant area between the up- and down-ramp
+                 *  unit: seconds */
+                static constexpr float_64 LASER_NOFOCUS_CONSTANT_SI = 13.34e-15;
+
+                /** Pulse length: sigma of std. gauss for intensity (E^2)
+                 *  PULSE_LENGTH_SI = FWHM_of_Intensity   / [ 2*sqrt{ 2* ln(2) } ]
+                 *                                          [    2.354820045     ]
+                 *  Info:             FWHM_of_Intensity = FWHM_Illumination
+                 *                      = what an experimentalist calls "pulse duration"
+                 *  unit: seconds (1 sigma) */
+                static constexpr float_64 PULSE_LENGTH_SI = 10.615e-15 / 4.0;
+
+                /** beam waist: distance from the axis where the pulse intensity (E^2)
+                 *              decreases to its 1/e^2-th part,
+                 *              at the focus position of the laser
+                 *  unit: meter
+                 */
+                static constexpr float_64 W0_X_SI = 4.246e-6; // waist in x-direction
+                static constexpr float_64 W0_Z_SI = W0_X_SI; // waist in z-direction
+
+                /** cell from top where the laser is initialized
+                 *
+                 * if `initPlaneY == 0` then the absorbers are disabled.
+                 * if `initPlaneY > absorber cells negative Y` the negative absorber in y
+                 * direction is enabled
+                 *
+                 * valid ranges:
+                 *   - initPlaneY == 0
+                 *   - absorber cells negative Y < initPlaneY < cells in y direction of the top gpu
+                 */
+                static constexpr uint32_t initPlaneY = 0;
+
+                /** The laser pulse will be initialized PULSE_INIT times of the PULSE_LENGTH
+                 *
+                 *  unit: none */
+                static constexpr float_64 PULSE_INIT = 20.0;
+
+                /** laser phase shift (no shift: 0.0)
+                 *
+                 * sin(omega*time + laser_phase): starts with phase=0 at center --> E-field=0 at center
+                 *
+                 * unit: rad, periodic in 2*pi
+                 */
+                static constexpr float_X LASER_PHASE = 0.0;
+
+                /** Available polarisation types
+                 */
+                enum PolarisationType
+                {
+                    LINEAR_X = 1u,
+                    LINEAR_Z = 2u,
+                    CIRCULAR = 4u,
+                };
+                /** Polarisation selection (one of the PolarisationType values)
+                 */
+                static constexpr PolarisationType Polarisation = LINEAR_X;
+            };
+
+            struct PlaneWaveParam
+            {
+                /** laser wavelength; unit: meter */
+                static constexpr float_64 WAVE_LENGTH_SI = 0.8e-6;
+
+                /** Convert the normalized laser strength parameter a0 to Volt per meter */
+                static constexpr float_64 UNITCONV_A0_to_Amplitude_SI = -2.0 * PI / WAVE_LENGTH_SI
+                    * ::picongpu::SI::ELECTRON_MASS_SI * ::picongpu::SI::SPEED_OF_LIGHT_SI
+                    * ::picongpu::SI::SPEED_OF_LIGHT_SI / ::picongpu::SI::ELECTRON_CHARGE_SI;
+
+                /** unit: W / m^2 */
+                // calculate: _A0 = 8.549297e-6 * sqrt( Intensity[W/m^2] ) * wavelength[m] (linearly polarized)
+
+                /** normalized laser strength parameter a0; unit: none */
+                static constexpr float_64 _A0 = 1.5;
+
+                /** unit: Volt / meter */
+                static constexpr float_64 AMPLITUDE_SI = _A0 * UNITCONV_A0_to_Amplitude_SI;
+
+                /** unit: Volt / meter */
+                // static constexpr float_64 AMPLITUDE_SI = 1.738e13;
+
+                /** The profile of the test Lasers 0 and 2 can be stretched by a
+                 *      constant area between the up- and down-ramp
+                 *  unit: seconds */
+                static constexpr float_64 LASER_NOFOCUS_CONSTANT_SI = 13.34e-15;
+
+                /** Pulse length: sigma of std. gauss for intensity (E^2)
+                 *  PULSE_LENGTH_SI = FWHM_of_Intensity   / [ 2*sqrt{ 2* ln(2) } ]
+                 *                                          [    2.354820045     ]
+                 *  Info:             FWHM_of_Intensity = FWHM_Illumination
+                 *                      = what an experimentalist calls "pulse duration"
+                 *  unit: seconds (1 sigma) */
+                static constexpr float_64 PULSE_LENGTH_SI = 10.615e-15 / 4.0;
+
+                /** cell from top where the laser is initialized
+                 *
+                 * if `initPlaneY == 0` then the absorbers are disabled.
+                 * if `initPlaneY > absorber cells negative Y` the negative absorber in y
+                 * direction is enabled
+                 *
+                 * valid ranges:
+                 *   - initPlaneY == 0
+                 *   - absorber cells negative Y < initPlaneY < cells in y direction of the top gpu
+                 */
+                static constexpr uint32_t initPlaneY = 0;
+
+                /** The laser pulse will be initialized half of RAMP_INIT times of the PULSE_LENGTH before and after
+                 * the plateau; unit: none */
+                static constexpr float_64 RAMP_INIT = 20.6146;
+
+                /** laser phase shift (no shift: 0.0)
+                 *
+                 * sin(omega*time + laser_phase): starts with phase=0 at center --> E-field=0 at center
+                 *
+                 * unit: rad, periodic in 2*pi
+                 */
+                static constexpr float_X LASER_PHASE = 0.0;
+
+                /** Available polarisation types
+                 */
+                enum PolarisationType
+                {
+                    LINEAR_X = 1u,
+                    LINEAR_Z = 2u,
+                    CIRCULAR = 4u,
+                };
+                /** Polarisation selection (one of the PolarisationType values)
+                 */
+                static constexpr PolarisationType Polarisation = LINEAR_X;
+            };
+
+            //! currently selected laser profile (set to None<> in this file)
+            using Selected = None<>;
+
+        } // namespace laserProfiles
+    } // namespace fields
 } // namespace picongpu
diff --git a/include/picongpu/param/mallocMC.param b/include/picongpu/param/mallocMC.param
index 1268f2fa7e..ca0c466be8 100644
--- a/include/picongpu/param/mallocMC.param
+++ b/include/picongpu/param/mallocMC.param
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Felix Schmitt, Heiko Burau, Rene Widera,
+/* Copyright 2013-2021 Axel Huebl, Felix Schmitt, Heiko Burau, Rene Widera,
  *                     Carlchristian Eckert
  *
  * This file is part of PIConGPU.
@@ -34,25 +34,24 @@
 
 namespace picongpu
 {
-
     //! configure the CreationPolicy "Scatter"
     struct DeviceHeapConfig
     {
         //! 2MiB page can hold around 256 particle frames
-        using pagesize = boost::mpl::int_< 2 * 1024 * 1024 >;
+        static constexpr uint32_t pagesize = 2u * 1024u * 1024u;
 
         /** accessblocks, regionsize and wastefactor are not conclusively
          * investigated and might be performance sensitive for multiple
          * particle species with heavily varying attributes (frame sizes)
          */
-        using accessblocks = boost::mpl::int_< 4 >;
-        using regionsize = boost::mpl::int_< 8 >;
-        using wastefactor = boost::mpl::int_< 2 >;
+        static constexpr uint32_t accessblocks = 4u;
+        static constexpr uint32_t regionsize = 8u;
+        static constexpr uint32_t wastefactor = 2u;
 
         /** resetfreedpages is used to minimize memory fragmentation with
          * varying frame sizes
          */
-        using resetfreedpages = boost::mpl::bool_< true >;
+        static constexpr bool resetfreedpages = true;
     };
 
     /** Define a new allocator
@@ -61,11 +60,11 @@ namespace picongpu
      * algorithm.
      */
     using DeviceHeap = mallocMC::Allocator<
-        mallocMC::CreationPolicies::Scatter< DeviceHeapConfig >,
+        cupla::Acc,
+        mallocMC::CreationPolicies::Scatter<DeviceHeapConfig>,
         mallocMC::DistributionPolicies::Noop,
         mallocMC::OOMPolicies::ReturnNull,
-        mallocMC::ReservePoolPolicies::SimpleCudaMalloc,
-        mallocMC::AlignmentPolicies::Shrink<>
-    >;
+        mallocMC::ReservePoolPolicies::AlpakaBuf<cupla::Acc>,
+        mallocMC::AlignmentPolicies::Shrink<>>;
 
 } // namespace picongpu
diff --git a/include/picongpu/param/memory.param b/include/picongpu/param/memory.param
index b469775458..2e6025da2c 100644
--- a/include/picongpu/param/memory.param
+++ b/include/picongpu/param/memory.param
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Rene Widera, Benjamin Worpitz
+/* Copyright 2013-2021 Axel Huebl, Rene Widera, Benjamin Worpitz
  *
  * This file is part of PIConGPU.
  *
@@ -30,16 +30,17 @@
 #include <pmacc/math/Vector.hpp>
 #include <pmacc/mappings/kernel/MappingDescription.hpp>
 
+#include <array>
+
 
 namespace picongpu
 {
-
     /* We have to hold back 350MiB for gpu-internal operations:
      *   - random number generator
      *   - reduces
      *   - ...
      */
-    constexpr size_t reservedGpuMemorySize = 350 *1024*1024;
+    constexpr size_t reservedGpuMemorySize = 350 * 1024 * 1024;
 
     /* short namespace*/
     namespace mCT = pmacc::math::CT;
@@ -47,13 +48,10 @@ namespace picongpu
      *
      * volume of a superCell must be <= 1024
      */
-    using SuperCellSize = typename mCT::shrinkTo<
-        mCT::Int< 8, 8, 4 >,
-        simDim
-    >::type;
+    using SuperCellSize = typename mCT::shrinkTo<mCT::Int<8, 8, 4>, simDim>::type;
 
     /** define mapper which is used for kernel call mappings */
-    using MappingDesc = MappingDescription< simDim, SuperCellSize >;
+    using MappingDesc = MappingDescription<simDim, SuperCellSize>;
 
     /** define the size of the core, border and guard area
      *
@@ -69,10 +67,7 @@ namespace picongpu
      *
      * GuardSize is defined in units of SuperCellSize per dimension.
      */
-    using GuardSize = typename mCT::shrinkTo<
-        mCT::Int< 1, 1, 1 >,
-        simDim
-    >::type;
+    using GuardSize = typename mCT::shrinkTo<mCT::Int<1, 1, 1>, simDim>::type;
 
     /** bytes reserved for species exchange buffer
      *
@@ -89,6 +84,21 @@ namespace picongpu
         static constexpr uint32_t BYTES_EXCHANGE_Z = 1 * 1024 * 1024; // 1 MiB
         static constexpr uint32_t BYTES_EDGES = 32 * 1024; // 32 kiB
         static constexpr uint32_t BYTES_CORNER = 8 * 1024; // 8 kiB
+
+        /** Reference local domain size
+         *
+         * The size of the local domain for which the exchange sizes `BYTES_*` are configured.
+         * The required size of each exchange will be calculated at runtime based on the local domain size and the
+         * reference size. The exchange size will only be scaled up, never down. Zero means that there is no
+         * reference domain size, exchanges will not be scaled.
+         */
+        using REF_LOCAL_DOM_SIZE = mCT::Int<0, 0, 0>;
+        /** Scaling rate per direction.
+         *
+         * 1.0 means it scales linearly with the ratio between the local domain size at runtime and the reference
+         * local domain size.
+         */
+        const std::array<float_X, 3> DIR_SCALING_FACTOR = {{0.0, 0.0, 0.0}};
     };
 
     /** number of scalar fields that are reserved as temporary fields */
diff --git a/include/picongpu/param/particle.param b/include/picongpu/param/particle.param
index 1cec2d40e4..02153dfecb 100644
--- a/include/picongpu/param/particle.param
+++ b/include/picongpu/param/particle.param
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Rene Widera, Benjamin Worpitz,
+/* Copyright 2013-2021 Axel Huebl, Rene Widera, Benjamin Worpitz,
  *                     Richard Pausch
  *
  * This file is part of PIConGPU.
@@ -38,143 +38,128 @@
 
 namespace picongpu
 {
-namespace particles
-{
-
-    /** a particle with a weighting below MIN_WEIGHTING will not
-     *      be created / will be deleted
-     *
-     *  unit: none */
-    constexpr float_X MIN_WEIGHTING = 10.0;
-
-    /** Number of maximum particles per cell during density profile evaluation.
-     *
-     * Determines the weighting of a macro particle and with it, the number of
-     * particles "sampling" dynamics in phase space.
-     */
-    constexpr uint32_t TYPICAL_PARTICLES_PER_CELL = 2u;
-
-namespace manipulators
-{
-
-    /** Parameter for DriftParam
-     */
-    CONST_VECTOR(float_X,3,DriftParam_direction,1.0,0.0,0.0);
-    /** Parameter for a particle drift assignment
-     */
-    struct DriftParam
-    {
-        static constexpr float_64 gamma = 1.0;
-        const DriftParam_direction_t direction;
-    };
-    /** definition of manipulator that assigns a drift in X */
-    using AssignXDrift = unary::Drift<
-        DriftParam,
-        nvidia::functors::Assign
-    >;
-
-
-    /** Parameter for a temperature assignment
-     */
-    struct TemperatureParam
-    {
-        /*Initial temperature
-         *  unit: keV
-         */
-        static constexpr float_64 temperature = 0.0;
-    };
-    /* definition a temperature assignment manipulator */
-    using AddTemperature = unary::Temperature< TemperatureParam >;
-
-    /** Unary particle manipulator: double each weighting
-     */
-    struct DoubleWeightingFunctor
-    {
-        template< typename T_Particle >
-        DINLINE void operator()( T_Particle& particle )
-        {
-            particle[weighting_] *= 2.0_X;
-        }
-    };
-
-    /** definition of a free particle manipulator: double weighting */
-    using DoubleWeighting = generic::Free< DoubleWeightingFunctor >;
-
-    struct RandomEnabledRadiationFunctor
+    namespace particles
     {
-        template< typename T_Rng, typename T_Particle >
-        DINLINE void operator()( T_Rng& rng, T_Particle& particle )
-        {
-            // enable radiation for 10% of the particles
-            particle[ radiationMask_ ] = rng() < 0.1_X;
-        }
-    };
-
-    /* definition of RandomEnableRadiation start */
-    using RandomEnabledRadiation = generic::FreeRng<
-        RandomEnabledRadiationFunctor,
-        pmacc::random::distributions::Uniform< float_X >
-    >;
-
-    /** changes the in-cell position of each particle of a species */
-    using RandomPosition = unary::RandomPosition;
-
-} // namespace manipulators
-
-namespace startPosition
-{
-
-    struct RandomParameter
-    {
-        /** Count of particles per cell at initial state
+        /** a particle with a weighting below MIN_WEIGHTING will not
+         *      be created / will be deleted
          *
          *  unit: none */
-        static constexpr uint32_t numParticlesPerCell = TYPICAL_PARTICLES_PER_CELL;
-    };
-    /** definition of random particle start */
-    using Random = RandomImpl< RandomParameter >;
+        constexpr float_X MIN_WEIGHTING = 10.0;
 
-    struct QuietParam
-    {
-        /** Count of particles per cell per direction at initial state
+        /** Number of maximum particles per cell during density profile evaluation.
          *
-         *  unit: none */
-        using numParticlesPerDimension = mCT::shrinkTo<
-            mCT::Int<
-                1,
-                TYPICAL_PARTICLES_PER_CELL,
-                1
-                >,
-            simDim
-        >::type;
-    };
-
-    /** definition of quiet particle start */
-    using Quiet = QuietImpl< QuietParam >;
-
-    /** sit directly in lower corner of the cell */
-    CONST_VECTOR(
-        float_X,
-        3,
-        InCellOffset,
-        /* each x, y, z in-cell position component in range [0.0, 1.0) */
-        0.0,
-        0.0,
-        0.0
-    );
-    struct OnePositionParameter
-    {
-        /** Count of particles per cell at initial state
-         *
-         *  unit: none */
-        static constexpr uint32_t numParticlesPerCell = TYPICAL_PARTICLES_PER_CELL;
-
-        const InCellOffset_t inCellOffset;
-    };
-
-    /** definition of one specific position for particle start */
-    using OnePosition = OnePositionImpl< OnePositionParameter >;
+         * Determines the weighting of a macro particle and with it, the number of
+         * particles "sampling" dynamics in phase space.
+         */
+        constexpr uint32_t TYPICAL_PARTICLES_PER_CELL = 2u;
 
-} // namespace startPosition
-} // namespace particles
+        namespace manipulators
+        {
+            /** Direction vector used by DriftParam (here: along x)
+             */
+            CONST_VECTOR(float_X, 3, DriftParam_direction, 1.0, 0.0, 0.0);
+            /** Parameter for a particle drift assignment
+             */
+            struct DriftParam
+            {
+                static constexpr float_64 gamma = 1.0;
+                const DriftParam_direction_t direction;
+            };
+            /** definition of a manipulator that assigns a drift in X */
+            using AssignXDrift = unary::Drift<DriftParam, nvidia::functors::Assign>;
+
+
+            /** Parameter for a temperature assignment
+             */
+            struct TemperatureParam
+            {
+                /** Initial temperature
+                 *  unit: keV
+                 */
+                static constexpr float_64 temperature = 0.0;
+            };
+            /** definition of a temperature assignment manipulator */
+            using AddTemperature = unary::Temperature<TemperatureParam>;
+
+            /** Unary particle manipulator: double each weighting
+             */
+            struct DoubleWeightingFunctor
+            {
+                template<typename T_Particle>
+                DINLINE void operator()(T_Particle& particle)
+                {
+                    particle[weighting_] *= 2.0_X;
+                }
+            };
+
+            /** definition of a free particle manipulator: double weighting */
+            using DoubleWeighting = generic::Free<DoubleWeightingFunctor>;
+
+            struct RandomEnabledRadiationFunctor
+            {
+                template<typename T_Rng, typename T_Particle>
+                DINLINE void operator()(T_Rng& rng, T_Particle& particle)
+                {
+                    // enable radiation for 10% of the particles
+                    particle[radiationMask_] = rng() < 0.1_X;
+                }
+            };
+
+            /** definition of a manipulator that enables radiation for a random subset of the particles */
+            using RandomEnabledRadiation
+                = generic::FreeRng<RandomEnabledRadiationFunctor, pmacc::random::distributions::Uniform<float_X>>;
+
+            /** changes the in-cell position of each particle of a species */
+            using RandomPosition = unary::RandomPosition;
+
+        } // namespace manipulators
+
+        namespace startPosition
+        {
+            struct RandomParameter
+            {
+                /** Number of particles per cell at initial state
+                 *
+                 *  unit: none */
+                static constexpr uint32_t numParticlesPerCell = TYPICAL_PARTICLES_PER_CELL;
+            };
+            /** definition of random particle start */
+            using Random = RandomImpl<RandomParameter>;
+
+            struct QuietParam
+            {
+                /** Number of particles per cell per direction at initial state
+                 *
+                 *  unit: none */
+                using numParticlesPerDimension
+                    = mCT::shrinkTo<mCT::Int<1, TYPICAL_PARTICLES_PER_CELL, 1>, simDim>::type;
+            };
+
+            /** definition of quiet particle start */
+            using Quiet = QuietImpl<QuietParam>;
+
+            /** in-cell offset used by OnePositionParameter: sit directly in the lower corner of the cell */
+            CONST_VECTOR(
+                float_X,
+                3,
+                InCellOffset,
+                /* each x, y, z in-cell position component in range [0.0, 1.0) */
+                0.0,
+                0.0,
+                0.0);
+            struct OnePositionParameter
+            {
+                /** Number of particles per cell at initial state
+                 *
+                 *  unit: none */
+                static constexpr uint32_t numParticlesPerCell = TYPICAL_PARTICLES_PER_CELL;
+
+                const InCellOffset_t inCellOffset;
+            };
+
+            /** definition of one specific position for particle start */
+            using OnePosition = OnePositionImpl<OnePositionParameter>;
+
+        } // namespace startPosition
+    } // namespace particles
 } // namespace picongpu
diff --git a/include/picongpu/param/particleCalorimeter.param b/include/picongpu/param/particleCalorimeter.param
index 23782a0311..95de4d17cc 100644
--- a/include/picongpu/param/particleCalorimeter.param
+++ b/include/picongpu/param/particleCalorimeter.param
@@ -1,4 +1,4 @@
-/* Copyright 2016-2020 Heiko Burau
+/* Copyright 2016-2021 Heiko Burau
  *
  * This file is part of PIConGPU.
  *
@@ -21,31 +21,25 @@
 
 namespace picongpu
 {
-namespace particleCalorimeter
-{
-
-/** Map yaw and pitch into [0,1] respectively. These ranges correspond to
- * the normalized histogram range of the calorimeter (0: first bin, 1: last bin).
- * Out-of-range values are mapped to the first or the last bin.
- *
- * Useful for fine tuning the spatial calorimeter resolution.
- *
- * \param yaw -maxYaw...maxYaw
- * \param pitch -maxPitch...maxPitch
- * \param maxYaw maximum value of angle yaw
- * \param maxPitch maximum value of angle pitch
- * \return Two values within [-1,1]
- */
-HDINLINE float2_X mapYawPitchToNormedRange(const float_X yaw,
-                                           const float_X pitch,
-                                           const float_X maxYaw,
-                                           const float_X maxPitch)
-{
-    return float2_X(
-        0.5_X + 0.5_X * yaw / maxYaw,
-        0.5_X + 0.5_X * pitch / maxPitch
-    );
-}
+    namespace particleCalorimeter
+    {
+        /** Map yaw and pitch into [0,1] respectively. These ranges correspond to
+         * the normalized histogram range of the calorimeter (0: first bin, 1: last bin).
+         * Out-of-range values are mapped to the first or the last bin.
+         *
+         * Useful for fine tuning the spatial calorimeter resolution.
+         *
+         * \param yaw -maxYaw...maxYaw
+         * \param pitch -maxPitch...maxPitch
+         * \param maxYaw maximum value of angle yaw
+         * \param maxPitch maximum value of angle pitch
+         * \return Two values within [0,1]
+         */
+        HDINLINE float2_X
+        mapYawPitchToNormedRange(const float_X yaw, const float_X pitch, const float_X maxYaw, const float_X maxPitch)
+        {
+            return float2_X(0.5_X + 0.5_X * yaw / maxYaw, 0.5_X + 0.5_X * pitch / maxPitch);
+        }
 
-} // namespace particleCalorimeter
+    } // namespace particleCalorimeter
 } // namespace picongpu
diff --git a/include/picongpu/param/particleFilters.param b/include/picongpu/param/particleFilters.param
index 6c5e1a1c13..b66188799b 100644
--- a/include/picongpu/param/particleFilters.param
+++ b/include/picongpu/param/particleFilters.param
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Rene Widera
+/* Copyright 2013-2021 Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -41,27 +41,25 @@
 
 namespace picongpu
 {
-namespace particles
-{
-namespace filter
-{
-    /** Plugins: collection of all available particle filters
-     *
-     * Create a list of all filters here that you want to use in plugins.
-     *
-     * Note: filter All is defined in picongpu/particles/filter/filter.def
-     */
-    using AllParticleFilters = MakeSeq_t<
-        All
-    >;
+    namespace particles
+    {
+        namespace filter
+        {
+            /** Plugins: collection of all available particle filters
+             *
+             * Create a list of all filters here that you want to use in plugins.
+             *
+             * Note: filter All is defined in picongpu/particles/filter/filter.def
+             */
+            using AllParticleFilters = MakeSeq_t<All>;
 
-} // namespace filter
+        } // namespace filter
 
-namespace traits
-{
-    /* if needed for generic "free" filters,
-     * place `SpeciesEligibleForSolver` traits for filters here
-     */
-} // namespace traits
-} // namespace particles
+        namespace traits
+        {
+            /* if needed for generic "free" filters,
+             * place `SpeciesEligibleForSolver` traits for filters here
+             */
+        } // namespace traits
+    } // namespace particles
 } // namespace picongpu
diff --git a/include/picongpu/param/particleMerger.param b/include/picongpu/param/particleMerger.param
index 686cc510a0..b827dce21c 100644
--- a/include/picongpu/param/particleMerger.param
+++ b/include/picongpu/param/particleMerger.param
@@ -1,4 +1,4 @@
-/* Copyright 2017-2020 Heiko Burau
+/* Copyright 2017-2021 Heiko Burau
  *
  * This file is part of PIConGPU.
  *
@@ -27,16 +27,15 @@
 
 namespace picongpu
 {
-namespace plugins
-{
-namespace particleMerging
-{
-
-    /** maximum number of active Voronoi cells per supercell. If the number
-      * of active Voronoi cells reaches this limit merging events are dropped.
-      */
-    constexpr size_t MAX_VORONOI_CELLS = 128;
+    namespace plugins
+    {
+        namespace particleMerging
+        {
+            /** maximum number of active Voronoi cells per supercell. If the number
+             * of active Voronoi cells reaches this limit, merging events are dropped.
+             */
+            constexpr size_t MAX_VORONOI_CELLS = 128;
 
-} // namespace particleMerging
-} // namespace plugins
+        } // namespace particleMerging
+    } // namespace plugins
 } // namespace picongpu
diff --git a/include/picongpu/param/physicalConstants.param b/include/picongpu/param/physicalConstants.param
index 85ac5ed3a7..7c6968bc40 100644
--- a/include/picongpu/param/physicalConstants.param
+++ b/include/picongpu/param/physicalConstants.param
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Heiko Burau, Rene Widera, Richard Pausch,
+/* Copyright 2013-2021 Axel Huebl, Heiko Burau, Rene Widera, Richard Pausch,
  *                     Marco Garten
  *
  * This file is part of PIConGPU.
@@ -19,7 +19,6 @@
  */
 
 
-
 #pragma once
 
 namespace picongpu
@@ -34,8 +33,7 @@ namespace picongpu
         /** unit: N / A^2 */
         constexpr float_64 MUE0_SI = PI * 4.e-7;
         /** unit: C / (V m) */
-        constexpr float_64 EPS0_SI = 1.0 / MUE0_SI / SPEED_OF_LIGHT_SI
-            / SPEED_OF_LIGHT_SI;
+        constexpr float_64 EPS0_SI = 1.0 / MUE0_SI / SPEED_OF_LIGHT_SI / SPEED_OF_LIGHT_SI;
 
         /** impedance of free space
          * unit: ohm */
@@ -75,11 +73,15 @@ namespace picongpu
          * doi:10.1088/0026-1394/52/2/360
          */
         constexpr float_64 N_AVOGADRO = 6.02214076e23;
-    }
+
+        //! Classical electron radius in SI units
+        constexpr float_64 ELECTRON_RADIUS_SI = ELECTRON_CHARGE_SI * ELECTRON_CHARGE_SI
+            / (4.0 * PI * EPS0_SI * ELECTRON_MASS_SI * SPEED_OF_LIGHT_SI * SPEED_OF_LIGHT_SI);
+    } // namespace SI
 
     /** Unit of speed */
     constexpr float_64 UNIT_SPEED = SI::SPEED_OF_LIGHT_SI;
-    constexpr float_X SPEED_OF_LIGHT = float_X( SI::SPEED_OF_LIGHT_SI / UNIT_SPEED );
+    constexpr float_X SPEED_OF_LIGHT = float_X(SI::SPEED_OF_LIGHT_SI / UNIT_SPEED);
 
     // converts
     //
@@ -96,14 +98,17 @@ namespace picongpu
     //   constexpr float_64 An_Arbitrary_Energy_Input_keV = 30.0; // unit: keV
     //
     //   // first convert to SI (because SI stays our standard Unit System!)
-    //   constexpr float_64 An_Arbitrary_Energy_Input_SI = An_Arbitrary_Energy_Input_keV * UNITCONV_keV_to_Joule // unit: Joule
+    //   constexpr float_64 An_Arbitrary_Energy_Input_SI =
+    //       An_Arbitrary_Energy_Input_keV * UNITCONV_keV_to_Joule // unit: Joule
     //
     //   // now the "real" convert to our internal unitless system
-    //   constexpr float_X An_Arbitrary_Energy_Input = float_X(An_Arbitrary_Energy_Input_SI / UNIT_ENERGY) // unit: none
+    //   constexpr float_X An_Arbitrary_Energy_Input =
+    //       float_X(An_Arbitrary_Energy_Input_SI / UNIT_ENERGY) // unit: none
     //
     // As a convention, we DO NOT use the short track:
     //   constexpr float_64 An_Arbitrary_Energy_Input_keV = 30.0; // unit: keV
-    //   constexpr float_X An_Arbitrary_Energy_Input = float_X(An_Arbitrary_Energy_Input_SI * UNITCONV_keV_to_Joule / UNIT_ENERGY) // unit: none
+    //   constexpr float_X An_Arbitrary_Energy_Input =
+    //       float_X(An_Arbitrary_Energy_Input_keV * UNITCONV_keV_to_Joule / UNIT_ENERGY) // unit: none
     //
     constexpr float_64 UNITCONV_keV_to_Joule = 1.60217646e-16;
     constexpr float_64 UNITCONV_Joule_to_keV = (1.0 / UNITCONV_keV_to_Joule);
@@ -113,4 +118,4 @@ namespace picongpu
     constexpr float_64 UNITCONV_AU_to_eV = 27.21139;
     constexpr float_64 UNITCONV_eV_to_AU = (1.0 / UNITCONV_AU_to_eV);
 
-}
+} // namespace picongpu
diff --git a/include/picongpu/param/pml.param b/include/picongpu/param/pml.param
index f75e75bcd6..9d875c1b11 100644
--- a/include/picongpu/param/pml.param
+++ b/include/picongpu/param/pml.param
@@ -1,4 +1,4 @@
-/* Copyright 2019-2020 Sergei Bastrakov
+/* Copyright 2019-2021 Sergei Bastrakov, Klaus Steiniger
  *
  * This file is part of PIConGPU.
  *
@@ -19,9 +19,9 @@
 
 /** @file
  *
- * Configure the perfectly matched layer (PML).
+ * Configure the Perfectly Matched Layer absorbing boundary conditions (PML).
  *
- * To enable PML use YeePML field solver.
+ * To enable PML use YeePML, LehePML or ArbitraryOrderFDTDPML field solver.
  */
 
 #pragma once
@@ -29,133 +29,122 @@
 
 namespace picongpu
 {
-namespace fields
-{
-namespace maxwellSolver
-{
-namespace yeePML
-{
-
-    /* The parameters in this file are only used if the field solver selected is
-     * YeePML.
-     * The original paper on this approach is J.A. Roden, S.D. Gedney.
-     * Convolution PML (CPML): An efficient FDTD implementation of the CFS - PML
-     * for arbitrary media. Microwave and optical technology letters. 27 (5),
-     * 334-339 (2000).
-     * https://doi.org/10.1002/1098-2760(20001205)27:5%3C334::AID-MOP14%3E3.0.CO;2-A
-     * Our implementation is based on a more detailed description in section
-     * 7.9 of the book A. Taflove, S.C. Hagness. Computational Electrodynamics.
-     * The Finite-Difference Time-Domain Method. Third Edition. Artech house,
-     * Boston (2005), referred to as [Taflove, Hagness].
-     */
+    namespace fields
+    {
+        namespace maxwellSolver
+        {
+            namespace Pml
+            {
+                /* The parameters in this file are only used if the field solver selected
+                 * uses Perfectly Matched Layer Absorbing Boundary Conditions (PML).
+                 * The original paper on this approach is J.A. Roden, S.D. Gedney.
+                 * Convolution PML (CPML): An efficient FDTD implementation of the CFS - PML
+                 * for arbitrary media. Microwave and optical technology letters. 27 (5),
+                 * 334-339 (2000).
+                 * https://doi.org/10.1002/1098-2760(20001205)27:5%3C334::AID-MOP14%3E3.0.CO;2-A
+                 * Our implementation is based on a more detailed description in section
+                 * 7.9 of the book A. Taflove, S.C. Hagness. Computational Electrodynamics.
+                 * The Finite-Difference Time-Domain Method. Third Edition. Artech house,
+                 * Boston (2005), referred to as [Taflove, Hagness].
+                 */
 
-    constexpr uint32_t THICKNESS = 8;
+                constexpr uint32_t THICKNESS = 8;
 
-    /** Thickness of the absorbing layer, in number of cells
-     *
-     * PML is located inside the global simulation area, near the outer borders.
-     * Setting size to 0 results in disabling absorption at the corresponding
-     * boundary. Normally thickness is between 6 and 16 cells, with larger
-     * values providing less reflections.
-     * 8 cells should be good enough for most simulations. There are no
-     * requirements on thickness being a multiple of the supercell size.
-     * It is only required that PML is small enough to be fully contained in
-     * a single layer of local domains near the global simulation area boundary
-     * (Note that the domains of this layer might be changing, e.g. due to
-     * moving window.)
-     * Unit: number of cells.
-     */
-    constexpr uint32_t NUM_CELLS[ 3 ][ 2 ] = {
-        { THICKNESS, THICKNESS },  // x direction [negative, positive]
-        { THICKNESS, THICKNESS },  // y direction [negative, positive]
-        { THICKNESS, THICKNESS }   // z direction [negative, positive]
-    };
+                /** Thickness of the absorbing layer, in number of cells
+                 *
+                 * PML is located inside the global simulation area, near the outer borders.
+                 * Setting size to 0 results in disabling absorption at the corresponding
+                 * boundary. Normally thickness is between 6 and 16 cells, with larger
+                 * values providing fewer reflections.
+                 * 8 cells should be good enough for most simulations. There are no
+                 * requirements on thickness being a multiple of the supercell size.
+                 * It is only required that PML is small enough to be fully contained in
+                 * a single layer of local domains near the global simulation area boundary
+                 * (Note that the domains of this layer might be changing, e.g. due to
+                 * moving window.)
+                 * Unit: number of cells.
+                 */
+                constexpr uint32_t NUM_CELLS[3][2] = {
+                    {THICKNESS, THICKNESS}, // x direction [negative, positive]
+                    {THICKNESS, THICKNESS}, // y direction [negative, positive]
+                    {THICKNESS, THICKNESS} // z direction [negative, positive]
+                };
 
-    /** Order of polynomial grading for artificial electric conductivity and
-     *  stretching coefficient
-     *
-     * The conductivity (sigma) is polynomially scaling from 0 at the internal
-     * border of PML to the maximum value (defined below) at the external
-     * border. The stretching coefficient (kappa) scales from 1 to the
-     * corresponding maximum value (defined below) with the same polynomial.
-     * The grading is given in [Taflove, Hagness], eq. (7.60a, b), with
-     * the order denoted 'm'.
-     * Must be >= 0. Normally between 3 and 4, not required to be integer.
-     * Unitless.
-     */
-    constexpr float_64 SIGMA_KAPPA_GRADING_ORDER = 4.0;
+                /** Order of polynomial grading for artificial electric conductivity and
+                 *  stretching coefficient
+                 *
+                 * The conductivity (sigma) is polynomially scaling from 0 at the internal
+                 * border of PML to the maximum value (defined below) at the external
+                 * border. The stretching coefficient (kappa) scales from 1 to the
+                 * corresponding maximum value (defined below) with the same polynomial.
+                 * The grading is given in [Taflove, Hagness], eq. (7.60a, b), with
+                 * the order denoted 'm'.
+                 * Must be >= 0. Normally between 3 and 4, not required to be integer.
+                 * Unitless.
+                 */
+                constexpr float_64 SIGMA_KAPPA_GRADING_ORDER = 4.0;
 
-    // [Taflove, Hagness], eq. (7.66)
-    constexpr float_64 SIGMA_OPT_SI[ 3 ] = {
-        0.8 * ( SIGMA_KAPPA_GRADING_ORDER + 1.0 ) / ( SI::Z0_SI * SI::CELL_WIDTH_SI ),
-        0.8 * ( SIGMA_KAPPA_GRADING_ORDER + 1.0 ) / ( SI::Z0_SI * SI::CELL_HEIGHT_SI ),
-        0.8 * ( SIGMA_KAPPA_GRADING_ORDER + 1.0 ) / ( SI::Z0_SI * SI::CELL_DEPTH_SI )
-    };
+                // [Taflove, Hagness], eq. (7.66)
+                constexpr float_64 SIGMA_OPT_SI[3]
+                    = {0.8 * (SIGMA_KAPPA_GRADING_ORDER + 1.0) / (SI::Z0_SI * SI::CELL_WIDTH_SI),
+                       0.8 * (SIGMA_KAPPA_GRADING_ORDER + 1.0) / (SI::Z0_SI * SI::CELL_HEIGHT_SI),
+                       0.8 * (SIGMA_KAPPA_GRADING_ORDER + 1.0) / (SI::Z0_SI * SI::CELL_DEPTH_SI)};
 
-    // Muptiplier to express SIGMA_MAX_SI with SIGMA_OPT_SI
-    constexpr float_64 SIGMA_OPT_MULTIPLIER = 1.0;
+                // Multiplier to express SIGMA_MAX_SI with SIGMA_OPT_SI
+                constexpr float_64 SIGMA_OPT_MULTIPLIER = 1.0;
 
-    /** Max value of artificial electric conductivity in PML
-     *
-     * Components correspond to directions: element 0 corresponds to absorption
-     * along x direction, 1 = y, 2 = z. Grading is described in comments for
-     * SIGMA_KAPPA_GRADING_ORDER.
-     * Too small values lead to significant reflections from the external
-     * border, too large - to reflections due to discretization errors.
-     * Artificial magnetic permeability will be chosen to perfectly match this.
-     * Must be >= 0. Normally between 0.7 * SIGMA_OPT_SI and 1.1 * SIGMA_OPT_SI.
-     * Unit: siemens / m.
-     */
-    constexpr float_64 SIGMA_MAX_SI[ 3 ] = {
-        SIGMA_OPT_SI[ 0 ] * SIGMA_OPT_MULTIPLIER,
-        SIGMA_OPT_SI[ 1 ] * SIGMA_OPT_MULTIPLIER,
-        SIGMA_OPT_SI[ 2 ] * SIGMA_OPT_MULTIPLIER
-    };
+                /** Max value of artificial electric conductivity in PML
+                 *
+                 * Components correspond to directions: element 0 corresponds to absorption
+                 * along x direction, 1 = y, 2 = z. Grading is described in comments for
+                 * SIGMA_KAPPA_GRADING_ORDER.
+                 * Too small values lead to significant reflections from the external
+                 * border, too large - to reflections due to discretization errors.
+                 * Artificial magnetic permeability will be chosen to perfectly match this.
+                 * Must be >= 0. Normally between 0.7 * SIGMA_OPT_SI and 1.1 * SIGMA_OPT_SI.
+                 * Unit: siemens / m.
+                 */
+                constexpr float_64 SIGMA_MAX_SI[3]
+                    = {SIGMA_OPT_SI[0] * SIGMA_OPT_MULTIPLIER,
+                       SIGMA_OPT_SI[1] * SIGMA_OPT_MULTIPLIER,
+                       SIGMA_OPT_SI[2] * SIGMA_OPT_MULTIPLIER};
 
-    /** Max value of coordinate stretching coefficient in PML
-     *
-     * Components correspond to directions: element 0 corresponds to absorption
-     * along x direction, 1 = y, 2 = z. Grading is described in comments for
-     * SIGMA_KAPPA_GRADING_ORDER.
-     * Must be >= 1. For relatively homogeneous domains 1.0 is a reasonable value.
-     * Highly elongated domains can have better absorption with values between
-     * 7.0 and 20.0, for example, see section 7.11.2 in [Taflove, Hagness].
-     * Unitless.
-     */
-    constexpr float_64 KAPPA_MAX[ 3 ] = {
-        1.0,
-        1.0,
-        1.0
-    };
+                /** Max value of coordinate stretching coefficient in PML
+                 *
+                 * Components correspond to directions: element 0 corresponds to absorption
+                 * along x direction, 1 = y, 2 = z. Grading is described in comments for
+                 * SIGMA_KAPPA_GRADING_ORDER.
+                 * Must be >= 1. For relatively homogeneous domains 1.0 is a reasonable value.
+                 * Highly elongated domains can have better absorption with values between
+                 * 7.0 and 20.0, for example, see section 7.11.2 in [Taflove, Hagness].
+                 * Unitless.
+                 */
+                constexpr float_64 KAPPA_MAX[3] = {1.0, 1.0, 1.0};
 
-    /** Order of polynomial grading for complex frequency shift
-     *
-     * The complex frequency shift (alpha) is polynomially downscaling from the
-     * maximum value (defined below) at the internal border of PML to 0 at the
-     * external border. The grading is given in [Taflove, Hagness], eq. (7.79),
-     * with the order denoted 'm_a'.
-     * Must be >= 0. Normally values are around 1.0.
-     * Unitless.
-     */
-    constexpr float_64 ALPHA_GRADING_ORDER = 1.0;
+                /** Order of polynomial grading for complex frequency shift
+                 *
+                 * The complex frequency shift (alpha) is polynomially downscaling from the
+                 * maximum value (defined below) at the internal border of PML to 0 at the
+                 * external border. The grading is given in [Taflove, Hagness], eq. (7.79),
+                 * with the order denoted 'm_a'.
+                 * Must be >= 0. Normally values are around 1.0.
+                 * Unitless.
+                 */
+                constexpr float_64 ALPHA_GRADING_ORDER = 1.0;
 
-    /** Complex frequency shift in PML
-     *
-     * Components correspond to directions: element 0 corresponds to absorption
-     * along x direction, 1 = y, 2 = z. Setting it to 0 will make PML behave
-     * as uniaxial PML. Setting it to a positive value helps to attenuate
-     * evanescent modes, but can degrade absorption of propagating modes, as
-     * described in section 7.7 and 7.11.3 in [Taflove, Hagness].
-     * Must be >= 0. Normally values are 0 or between 0.15 and 0.3.
-     * Unit: siemens / m.
-     */
-    constexpr float_64 ALPHA_MAX_SI[ 3 ] = {
-        0.2,
-        0.2,
-        0.2
-    };
+                /** Complex frequency shift in PML
+                 *
+                 * Components correspond to directions: element 0 corresponds to absorption
+                 * along x direction, 1 = y, 2 = z. Setting it to 0 will make PML behave
+                 * as uniaxial PML. Setting it to a positive value helps to attenuate
+                 * evanescent modes, but can degrade absorption of propagating modes, as
+                 * described in section 7.7 and 7.11.3 in [Taflove, Hagness].
+                 * Must be >= 0. Normally values are 0 or between 0.15 and 0.3.
+                 * Unit: siemens / m.
+                 */
+                constexpr float_64 ALPHA_MAX_SI[3] = {0.2, 0.2, 0.2};
 
-} // namespace yeePML
-} // namespace maxwellSolver
-} // namespace fields
+            } // namespace Pml
+        } // namespace maxwellSolver
+    } // namespace fields
 } // namespace picongpu
diff --git a/include/picongpu/param/png.param b/include/picongpu/param/png.param
index 8a57c09de6..06e2832943 100644
--- a/include/picongpu/param/png.param
+++ b/include/picongpu/param/png.param
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Heiko Burau, Rene Widera, Richard Pausch,
+/* Copyright 2013-2021 Axel Huebl, Heiko Burau, Rene Widera, Richard Pausch,
  *                     Benjamin Worpitz
  *
  * This file is part of PIConGPU.
@@ -65,7 +65,7 @@ namespace picongpu
         /* png preview settings for each channel */
         DINLINE float_X preChannel1(const float3_X& field_B, const float3_X& field_E, const float3_X& field_J)
         {
-            return math::abs2(field_J);
+            return pmacc::math::abs2(field_J);
         }
 
         DINLINE float_X preChannel2(const float3_X& field_B, const float3_X& field_E, const float3_X& field_J)
@@ -77,6 +77,5 @@ namespace picongpu
         {
             return -1.0_X * field_E.y();
         }
-    }
-}
-
+    } // namespace visPreview
+} // namespace picongpu
diff --git a/include/picongpu/param/pngColorScales.param b/include/picongpu/param/pngColorScales.param
index f626d83595..01e453ecf4 100644
--- a/include/picongpu/param/pngColorScales.param
+++ b/include/picongpu/param/pngColorScales.param
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Heiko Burau, Rene Widera, Benjamin Worpitz
+/* Copyright 2013-2021 Axel Huebl, Heiko Burau, Rene Widera, Benjamin Worpitz
  *
  * This file is part of PIConGPU.
  *
@@ -18,7 +18,6 @@
  */
 
 
-
 #pragma once
 
 namespace picongpu
@@ -27,88 +26,66 @@ namespace picongpu
     {
         namespace none
         {
-            HDINLINE void addRGB( const float3_X&,
-                                  const float_X,
-                                  const float_X )
+            HDINLINE void addRGB(const float3_X&, const float_X, const float_X)
             {
                 return;
             }
-        }
+        } // namespace none
 
         namespace gray
         {
-            HDINLINE void addRGB( float3_X& img,
-                                  const float_X value,
-                                  const float_X opacity )
+            HDINLINE void addRGB(float3_X& img, const float_X value, const float_X opacity)
             {
-                const float3_X myChannel( 1.0, 1.0, 1.0 );
+                const float3_X myChannel(1.0, 1.0, 1.0);
                 img = img
-                      - opacity * float3_X( myChannel.x() * img.x(),
-                                               myChannel.y() * img.y(),
-                                               myChannel.z() * img.z() )
-                      + myChannel * value * opacity;
+                    - opacity * float3_X(myChannel.x() * img.x(), myChannel.y() * img.y(), myChannel.z() * img.z())
+                    + myChannel * value * opacity;
             }
-        }
+        } // namespace gray
 
         namespace grayInv
         {
-            HDINLINE void addRGB( float3_X& img,
-                                  const float_X value,
-                                  const float_X opacity )
+            HDINLINE void addRGB(float3_X& img, const float_X value, const float_X opacity)
             {
-                const float3_X myChannel( 1.0, 1.0, 1.0 );
+                const float3_X myChannel(1.0, 1.0, 1.0);
                 img = img
-                      - opacity * float3_X( myChannel.x() * img.x(),
-                                               myChannel.y() * img.y(),
-                                               myChannel.z() * img.z() )
-                      + myChannel * ( 1.0_X - value ) * opacity;
+                    - opacity * float3_X(myChannel.x() * img.x(), myChannel.y() * img.y(), myChannel.z() * img.z())
+                    + myChannel * (1.0_X - value) * opacity;
             }
-        }
+        } // namespace grayInv
 
         namespace red
         {
-            HDINLINE void addRGB( float3_X& img,
-                                  const float_X value,
-                                  const float_X opacity )
+            HDINLINE void addRGB(float3_X& img, const float_X value, const float_X opacity)
             {
-                const float3_X myChannel( 1.0, 0.0, 0.0 );
+                const float3_X myChannel(1.0, 0.0, 0.0);
                 img = img
-                      - opacity * float3_X( myChannel.x() * img.x(),
-                                               myChannel.y() * img.y(),
-                                               myChannel.z() * img.z() )
-                      + myChannel * value * opacity;
+                    - opacity * float3_X(myChannel.x() * img.x(), myChannel.y() * img.y(), myChannel.z() * img.z())
+                    + myChannel * value * opacity;
             }
-        }
+        } // namespace red
 
         namespace green
         {
-            HDINLINE void addRGB( float3_X& img,
-                                  const float_X value,
-                                  const float_X opacity )
+            HDINLINE void addRGB(float3_X& img, const float_X value, const float_X opacity)
             {
-                const float3_X myChannel( 0.0, 1.0, 0.0 );
+                const float3_X myChannel(0.0, 1.0, 0.0);
                 img = img
-                      - opacity * float3_X( myChannel.x() * img.x(),
-                                               myChannel.y() * img.y(),
-                                               myChannel.z() * img.z() )
-                      + myChannel * value * opacity;
+                    - opacity * float3_X(myChannel.x() * img.x(), myChannel.y() * img.y(), myChannel.z() * img.z())
+                    + myChannel * value * opacity;
             }
-        }
+        } // namespace green
 
         namespace blue
         {
-            HDINLINE void addRGB( float3_X& img,
-                                  const float_X value,
-                                  const float_X opacity )
+            HDINLINE void addRGB(float3_X& img, const float_X value, const float_X opacity)
             {
-                const float3_X myChannel( 0.0, 0.0, 1.0 );
+                const float3_X myChannel(0.0, 0.0, 1.0);
                 img = img
-                      - opacity * float3_X( myChannel.x() * img.x(),
-                                               myChannel.y() * img.y(),
-                                               myChannel.z() * img.z() )
-                      + myChannel * value * opacity;
+                    - opacity * float3_X(myChannel.x() * img.x(), myChannel.y() * img.y(), myChannel.z() * img.z())
+                    + myChannel * value * opacity;
             }
-        }
+        } // namespace blue
 
-    }
-}
+    } // namespace colorScales
+} // namespace picongpu
diff --git a/include/picongpu/param/precision.param b/include/picongpu/param/precision.param
index cb8e27269f..969d84d5c6 100644
--- a/include/picongpu/param/precision.param
+++ b/include/picongpu/param/precision.param
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Rene Widera
+/* Copyright 2013-2021 Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -33,23 +33,22 @@
 
 namespace picongpu
 {
-
-/*! Select a precision for the simulation data
- *  - precision32Bit : use 32Bit floating point numbers
- *                     [significant digits 7 to 8]
- *  - precision64Bit : use 64Bit floating point numbers
- *                     [significant digits 15 to 16]
- */
-namespace precisionPIConGPU      = precision32Bit;
-
-/*! Select a precision special operations (can be different from simulation precision)
- *  - precisionPIConGPU : use precision which is selected on top (precisionPIConGPU)
- *  - precision32Bit    : use 32Bit floating point numbers
- *  - precision64Bit    : use 64Bit floating point numbers
- */
-namespace precisionSqrt          = precisionPIConGPU;
-namespace precisionExp           = precisionPIConGPU;
-namespace precisionTrigonometric = precisionPIConGPU;
+    /*! Select a precision for the simulation data
+     *  - precision32Bit : use 32Bit floating point numbers
+     *                     [significant digits 7 to 8]
+     *  - precision64Bit : use 64Bit floating point numbers
+     *                     [significant digits 15 to 16]
+     */
+    namespace precisionPIConGPU = precision32Bit;
+
+    /*! Select a precision for special operations (can be different from simulation precision)
+     *  - precisionPIConGPU : use precision which is selected on top (precisionPIConGPU)
+     *  - precision32Bit    : use 32Bit floating point numbers
+     *  - precision64Bit    : use 64Bit floating point numbers
+     */
+    namespace precisionSqrt = precisionPIConGPU;
+    namespace precisionExp = precisionPIConGPU;
+    namespace precisionTrigonometric = precisionPIConGPU;
 
 
 } // namespace picongpu
diff --git a/include/picongpu/param/pusher.param b/include/picongpu/param/pusher.param
index dcf5597be0..2a55ffe881 100644
--- a/include/picongpu/param/pusher.param
+++ b/include/picongpu/param/pusher.param
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Rene Widera
+/* Copyright 2013-2021 Axel Huebl, Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -60,11 +60,19 @@ namespace picongpu
          *  - precision64Bit
          */
         namespace sqrt_Vay = precision64Bit;
-    }
+    } // namespace particlePusherVay
 
-    namespace particlePusherAxel
+    namespace particlePusherHigueraCary
     {
+        /** Precision of the square roots during the push step
+         *  - precision32Bit
+         *  - precision64Bit
+         */
+        namespace sqrt_HigueraCary = precision64Bit;
+    } // namespace particlePusherHigueraCary
 
+    namespace particlePusherAxel
+    {
         enum TrajectoryInterpolationType
         {
             LINEAR = 1u,
@@ -72,23 +80,28 @@ namespace picongpu
         };
         constexpr TrajectoryInterpolationType TrajectoryInterpolation = LINEAR;
 
-    }
+    } // namespace particlePusherAxel
 
     namespace particles
     {
-    namespace pusher
-    {
-        struct Vay;
-        struct Boris;
-        struct Photon;
-        struct Acceleration;
-        struct Free;
-        struct Probe;
-        struct ReducedLandauLifshitz;
-#if(SIMDIM==DIM3)
-        struct Axel;
+        namespace pusher
+        {
+            struct HigueraCary;
+            struct Vay;
+            struct Boris;
+            struct Photon;
+            struct Acceleration;
+            struct Free;
+            struct Probe;
+            struct ReducedLandauLifshitz;
+#if(SIMDIM == DIM3)
+            struct Axel;
 #endif
-    } // namespace pusher
+            template<typename T_FirstPusher, typename T_SecondPusher, typename T_ActivationFunctor>
+            struct Composite;
+            template<uint32_t T_switchTimeStep>
+            struct CompositeBinarySwitchActivationFunctor;
+        } // namespace pusher
     } // namespace particles
 
     namespace particlePusherProbe
@@ -105,6 +118,6 @@ namespace picongpu
          * - void (no push)
          */
         using ActualPusher = void;
-    }
+    } // namespace particlePusherProbe
 
 } // namespace picongpu
diff --git a/include/picongpu/param/radiation.param b/include/picongpu/param/radiation.param
index 009ee20c9e..bff4b3e133 100644
--- a/include/picongpu/param/radiation.param
+++ b/include/picongpu/param/radiation.param
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Rene Widera, Richard Pausch
+/* Copyright 2013-2021 Rene Widera, Richard Pausch
  *
  * This file is part of PIConGPU.
  *
@@ -44,163 +44,180 @@
 
 namespace picongpu
 {
-namespace plugins
-{
-namespace radiation
-{
-namespace linear_frequencies
-{
-namespace SI
-{
-    /** mimimum frequency of the linear frequency scale in units of [1/s] */
-    constexpr float_64 omega_min = 0.0;
-    /** maximum frequency of the linear frequency scale in units of [1/s] */
-    constexpr float_64 omega_max = 1.06e16;
-} // namespace SI
-
-    /** number of frequency values to compute in the linear frequency [unitless] */
-    constexpr unsigned int N_omega = 2048;
-} // namespace linear_frequencies
-
-namespace log_frequencies
-{
-namespace SI
-{
-    /** mimimum frequency of the logarithmic frequency scale in units of [1/s] */
-    constexpr float_64 omega_min = 1.0e14;
-    /** maximum frequency of the logarithmic frequency scale in units of [1/s] */
-    constexpr float_64 omega_max = 1.0e17;
-} // namespace SI
-
-    /** number of frequency values to compute in the logarithmic frequency [unitless] */
-    constexpr unsigned int N_omega = 2048;
-} // namespace log_frequencies
-
-
-namespace frequencies_from_list
-{
-    /** path to text file with frequencies */
-    constexpr const char * listLocation = "/path/to/frequency_list";
-    /** number of frequency values to compute if frequencies are given in a file [unitless] */
-    constexpr unsigned int N_omega = 2048;
-} // namespace frequencies_from_list
-
-    /** selected mode of frequency scaling:
-     *
-     * options:
-     * - linear_frequencies
-     * - log_frequencies
-     * - frequencies_from_list
-     */
-    namespace radiation_frequencies = linear_frequencies;
-
-namespace radiationNyquist
-{
-    /** Nyquist factor: fraction of the local Nyquist frequency above which the spectra is set to zero
-     * should be in (0, 1).
-     */
-    constexpr float_32 NyquistFactor = 0.5;
-} // namespace radiationNyquist
-
-
-    ///////////////////////////////////////////////////
-
-
-    /** correct treatment of coherent and incoherent radiation from macro particles
-     *
-     * Choose different form factors in order to consider different particle shapes for radiation
-     *  - radFormFactor_CIC_3D ... CIC charge distribution
-     *  - radFormFactor_TSC_3D ... TSC charge distribution
-     *  - radFormFactor_PCS_3D ... PCS charge distribution
-     *  - radFormFactor_CIC_1Dy ... only CIC charge distribution in y
-     *  - radFormFactor_Gauss_spherical ... symmetric Gauss charge distribution
-     *  - radFormFactor_Gauss_cell ... Gauss charge distribution according to cell size
-     *  - radFormFactor_incoherent ... only incoherent radiation
-     *  - radFormFactor_coherent ... only coherent radiation
-     */
-    namespace radFormFactor_CIC_3D { }
-    namespace radFormFactor_TSC_3D { }
-    namespace radFormFactor_PCS_3D { }
-    namespace radFormFactor_CIC_1Dy { }
-    namespace radFormFactor_Gauss_spherical { }
-    namespace radFormFactor_Gauss_cell { }
-    namespace radFormFactor_incoherent { }
-    namespace radFormFactor_coherent { }
-
-    namespace radFormFactor = radFormFactor_Gauss_spherical;
-
-
-    ///////////////////////////////////////////////////////////
-
-
-namespace parameters
-{
-
-    /** number of observation directions */
-    constexpr unsigned int N_observer = 256;
-
-} // namespace parameters
-
-    /** select particles for radiation
-     * example of a filter for the relativistic Lorentz factor gamma
-     */
-    struct GammaFilterFunctor
+    namespace plugins
     {
-        /** Gamma value above which the radiation is calculated */
-        static constexpr float_X radiationGamma = 5.0;
-
-        template< typename T_Particle >
-        HDINLINE void operator()( T_Particle& particle )
+        namespace radiation
         {
-            if(
-               picongpu::gamma<float_X>(
-                                        particle[ picongpu::momentum_ ],
-                                        picongpu::traits::attribute::getMass(
-                                                                             particle[ picongpu::weighting_ ],
-                                                                             particle
-                                                                             )
-                                        ) >= radiationGamma
-               )
-              particle[ picongpu::radiationMask_ ] = true;
-        }
-    };
-
-
-    /** filter to (de)select particles for the radiation calculation
-     *
-     * to activate the filter:
-     *   - goto file `speciesDefinition.param`
-     *   - add the attribute `radiationMask` to the particle species
-     */
-    using RadiationParticleFilter = picongpu::particles::manipulators::generic::Free<
-        GammaFilterFunctor
-      >;
-
-
-
-    //////////////////////////////////////////////////
-
-
-    /** add a window function weighting to the radiation in order
-     * to avoid ringing effects from sharpe boundaries
-     * default: no window function via `radWindowFunctionNone`
-     *
-     * Choose different window function in order to get better ringing reduction
-     * radWindowFunctionTriangle
-     * radWindowFunctionHamming
-     * radWindowFunctionTriplett
-     * radWindowFunctionGauss
-     * radWindowFunctionNone
-     */
-    namespace radWindowFunctionTriangle { }
-    namespace radWindowFunctionHamming { }
-    namespace radWindowFunctionTriplett { }
-    namespace radWindowFunctionGauss { }
-    namespace radWindowFunctionNone { }
-
-    namespace radWindowFunction = radWindowFunctionNone;
-
-
-} // namespace radiation
-} // namespace plugins
+            namespace linear_frequencies
+            {
+                namespace SI
+                {
+                    /** minimum frequency of the linear frequency scale in units of [1/s] */
+                    constexpr float_64 omega_min = 0.0;
+                    /** maximum frequency of the linear frequency scale in units of [1/s] */
+                    constexpr float_64 omega_max = 1.06e16;
+                } // namespace SI
+
+                /** number of frequency values to compute in the linear frequency [unitless] */
+                constexpr unsigned int N_omega = 2048;
+            } // namespace linear_frequencies
+
+            namespace log_frequencies
+            {
+                namespace SI
+                {
+                    /** minimum frequency of the logarithmic frequency scale in units of [1/s] */
+                    constexpr float_64 omega_min = 1.0e14;
+                    /** maximum frequency of the logarithmic frequency scale in units of [1/s] */
+                    constexpr float_64 omega_max = 1.0e17;
+                } // namespace SI
+
+                /** number of frequency values to compute in the logarithmic frequency [unitless] */
+                constexpr unsigned int N_omega = 2048;
+            } // namespace log_frequencies
+
+
+            namespace frequencies_from_list
+            {
+                /** path to text file with frequencies */
+                constexpr const char* listLocation = "/path/to/frequency_list";
+                /** number of frequency values to compute if frequencies are given in a file [unitless] */
+                constexpr unsigned int N_omega = 2048;
+            } // namespace frequencies_from_list
+
+            /** selected mode of frequency scaling:
+             *
+             * options:
+             * - linear_frequencies
+             * - log_frequencies
+             * - frequencies_from_list
+             */
+            namespace radiation_frequencies = linear_frequencies;
+
+            namespace radiationNyquist
+            {
+                /** Nyquist factor: fraction of the local Nyquist frequency above which the spectrum is set to zero
+                 * should be in (0, 1).
+                 */
+                constexpr float_32 NyquistFactor = 0.5;
+            } // namespace radiationNyquist
+
+
+            ///////////////////////////////////////////////////
+
+
+            /** correct treatment of coherent and incoherent radiation from macro particles
+             *
+             * Choose different form factors in order to consider different particle shapes for radiation
+             *  - radFormFactor_CIC_3D ... CIC charge distribution
+             *  - radFormFactor_TSC_3D ... TSC charge distribution
+             *  - radFormFactor_PCS_3D ... PCS charge distribution
+             *  - radFormFactor_CIC_1Dy ... only CIC charge distribution in y
+             *  - radFormFactor_Gauss_spherical ... symmetric Gauss charge distribution
+             *  - radFormFactor_Gauss_cell ... Gauss charge distribution according to cell size
+             *  - radFormFactor_incoherent ... only incoherent radiation
+             *  - radFormFactor_coherent ... only coherent radiation
+             */
+            namespace radFormFactor_CIC_3D
+            {
+            }
+            namespace radFormFactor_TSC_3D
+            {
+            }
+            namespace radFormFactor_PCS_3D
+            {
+            }
+            namespace radFormFactor_CIC_1Dy
+            {
+            }
+            namespace radFormFactor_Gauss_spherical
+            {
+            }
+            namespace radFormFactor_Gauss_cell
+            {
+            }
+            namespace radFormFactor_incoherent
+            {
+            }
+            namespace radFormFactor_coherent
+            {
+            }
+
+            namespace radFormFactor = radFormFactor_Gauss_spherical;
+
+
+            ///////////////////////////////////////////////////////////
+
+
+            namespace parameters
+            {
+                /** number of observation directions */
+                constexpr unsigned int N_observer = 256;
+
+            } // namespace parameters
+
+            /** select particles for radiation
+             * example of a filter for the relativistic Lorentz factor gamma
+             */
+            struct GammaFilterFunctor
+            {
+                /** Gamma value above which the radiation is calculated */
+                static constexpr float_X radiationGamma = 5.0;
+
+                template<typename T_Particle>
+                HDINLINE void operator()(T_Particle& particle)
+                {
+                    if(picongpu::gamma<float_X>(
+                           particle[picongpu::momentum_],
+                           picongpu::traits::attribute::getMass(particle[picongpu::weighting_], particle))
+                       >= radiationGamma)
+                        particle[picongpu::radiationMask_] = true;
+                }
+            };
+
+
+            /** filter to (de)select particles for the radiation calculation
+             *
+             * to activate the filter:
+             *   - goto file `speciesDefinition.param`
+             *   - add the attribute `radiationMask` to the particle species
+             */
+            using RadiationParticleFilter = picongpu::particles::manipulators::generic::Free<GammaFilterFunctor>;
+
+
+            //////////////////////////////////////////////////
+
+
+            /** add a window function weighting to the radiation in order
+             * to avoid ringing effects from sharp boundaries
+             * default: no window function via `radWindowFunctionNone`
+             *
+             * Choose different window function in order to get better ringing reduction
+             * radWindowFunctionTriangle
+             * radWindowFunctionHamming
+             * radWindowFunctionTriplett
+             * radWindowFunctionGauss
+             * radWindowFunctionNone
+             */
+            namespace radWindowFunctionTriangle
+            {
+            }
+            namespace radWindowFunctionHamming
+            {
+            }
+            namespace radWindowFunctionTriplett
+            {
+            }
+            namespace radWindowFunctionGauss
+            {
+            }
+            namespace radWindowFunctionNone
+            {
+            }
+
+            namespace radWindowFunction = radWindowFunctionNone;
+
+
+        } // namespace radiation
+    } // namespace plugins
 } // namespace picongpu
diff --git a/include/picongpu/param/radiationObserver.param b/include/picongpu/param/radiationObserver.param
index 308a8fc2fc..f2663f76ec 100644
--- a/include/picongpu/param/radiationObserver.param
+++ b/include/picongpu/param/radiationObserver.param
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera, Richard Pausch
+/* Copyright 2013-2021 Heiko Burau, Rene Widera, Richard Pausch
  *
  * This file is part of PIConGPU.
  *
@@ -29,106 +29,105 @@
 
 namespace picongpu
 {
-namespace plugins
-{
-namespace radiation
-{
-namespace radiation_observer
-{
-    /** Compute observation angles
-     *
-     * This function is used in the Radiation plug-in kernel to compute
-     * the observation directions given as a unit vector pointing
-     * towards a 'virtual' detector
-     *
-     * This default setup is an example of a 2D detector array. It computes
-     * observation directions for 2D virtual detector field
-     * with its center pointing toward the +y direction (for theta=0, phi=0)
-     * with observation angles ranging from
-     * theta = [angle_theta_start : angle_theta_end]
-     * phi   = [angle_phi_start   : angle_phi_end  ]
-     * Every observation_id_extern index moves the phi angle from its
-     * start value toward its end value until the observation_id_extern
-     * reaches N_split. After that the theta angle moves further from its
-     * start value towards its end value while phi is reset to its start
-     * value.
-     *
-     * The unit vector pointing towards the observing virtual detector
-     * can be described using theta and phi by:
-     * x_value = sin(theta) * cos(phi)
-     * y_value = cos(theta)
-     * z_value = sin(theta) * sin(phi)
-     * These are the standard spherical coordinates.
-     *
-     * The example setup describes an detector array of
-     * 16x16 detectors ranging from -pi/8= -22.5 degrees
-     * to +pi/8= +22.5 degrees for both angles with the center
-     * pointing toward the y-axis (laser propagation direction).
-     *
-     * @param    observation_id_extern
-     *           int index that identifies each block on the GPU
-     *           to compute the observation direction
-     *
-     * @return   unit vector pointing in observation direction
-     *           type: vector_64
-     *
-     */
-    HDINLINE vector_64 observation_direction(const int observation_id_extern)
+    namespace plugins
     {
-        /* generate two indices from single block index */
-        /** split distance of given index
-         * pseudo-code:
-         * index_a = index / split_distance
-         * index_b = index % split_distance
-         */
-        constexpr int N_angle_split = 16;
-        /** get index for computing angle theta: */
-        const int my_index_theta = observation_id_extern / N_angle_split;
-        /** get index for computing angle phi: */
-        const int my_index_phi = observation_id_extern % N_angle_split;
-
+        namespace radiation
+        {
+            namespace radiation_observer
+            {
+                /** Compute observation angles
+                 *
+                 * This function is used in the Radiation plug-in kernel to compute
+                 * the observation directions given as a unit vector pointing
+                 * towards a 'virtual' detector
+                 *
+                 * This default setup is an example of a 2D detector array. It computes
+                 * observation directions for 2D virtual detector field
+                 * with its center pointing toward the +y direction (for theta=0, phi=0)
+                 * with observation angles ranging from
+                 * theta = [angle_theta_start : angle_theta_end]
+                 * phi   = [angle_phi_start   : angle_phi_end  ]
+                 * Every observation_id_extern index moves the phi angle from its
+                 * start value toward its end value until the observation_id_extern
+                 * reaches N_angle_split. After that the theta angle moves further from its
+                 * start value towards its end value while phi is reset to its start
+                 * value.
+                 *
+                 * The unit vector pointing towards the observing virtual detector
+                 * can be described using theta and phi by:
+                 * x_value = sin(theta) * cos(phi)
+                 * y_value = cos(theta)
+                 * z_value = sin(theta) * sin(phi)
+                 * These are the standard spherical coordinates.
+                 *
+                 * The example setup describes a detector array of
+                 * 16x16 detectors ranging from -pi/8= -22.5 degrees
+                 * to +pi/8= +22.5 degrees for both angles with the center
+                 * pointing toward the y-axis (laser propagation direction).
+                 *
+                 * @param    observation_id_extern
+                 *           int index that identifies each block on the GPU
+                 *           to compute the observation direction
+                 *
+                 * @return   unit vector pointing in observation direction
+                 *           type: vector_64
+                 *
+                 */
+                HDINLINE vector_64 observation_direction(const int observation_id_extern)
+                {
+                    /* generate two indices from single block index */
+                    /** split distance of given index
+                     * pseudo-code:
+                     * index_a = index / split_distance
+                     * index_b = index % split_distance
+                     */
+                    constexpr int N_angle_split = 16;
+                    /** get index for computing angle theta: */
+                    const int my_index_theta = observation_id_extern / N_angle_split;
+                    /** get index for computing angle phi: */
+                    const int my_index_phi = observation_id_extern % N_angle_split;
 
-        /* set up observation angle range */
-        /* angles range for theta */
-        /** minimum theta angle [rad] */
-        const picongpu::float_64 angle_theta_start = - picongpu::PI/8.0;
-        /** maximum theta angle [rad] */
-        const picongpu::float_64 angle_theta_end   = + picongpu::PI/8.0;
-        /* angles range for phi */
-        /** minimum phi angle [rad] */
-        constexpr picongpu::float_64 angle_phi_start = - picongpu::PI/8.0;
-        /** maximum phi angle [rad] */
-        constexpr picongpu::float_64 angle_phi_end   = + picongpu::PI/8.0;
 
+                    /* set up observation angle range */
+                    /* angles range for theta */
+                    /** minimum theta angle [rad] */
+                    const picongpu::float_64 angle_theta_start = -picongpu::PI / 8.0;
+                    /** maximum theta angle [rad] */
+                    const picongpu::float_64 angle_theta_end = +picongpu::PI / 8.0;
+                    /* angles range for phi */
+                    /** minimum phi angle [rad] */
+                    constexpr picongpu::float_64 angle_phi_start = -picongpu::PI / 8.0;
+                    /** maximum phi angle [rad] */
+                    constexpr picongpu::float_64 angle_phi_end = +picongpu::PI / 8.0;
 
-        /* compute step with between two angles for range [angle_??_start : angle_??_end] */
-        /** number of theta angles */
-        constexpr int N_theta = parameters::N_observer / N_angle_split;
-        /** step width angle theta */
-        const picongpu::float_64 delta_angle_theta =  (angle_theta_end -
-                                                       angle_theta_start) / (N_theta-1.0);
-        /** step width angle phi */
-        const picongpu::float_64 delta_angle_phi   =  (angle_phi_end -
-                                                       angle_phi_start)   / (N_angle_split-1.0);
 
-        /** compute observation angles theta */
-        const picongpu::float_64 theta( my_index_theta * delta_angle_theta + angle_theta_start );
-        /** compute observation angles theta */
-        const picongpu::float_64 phi( my_index_phi * delta_angle_phi - angle_phi_start );
+                    /* compute step width between two angles for range [angle_??_start : angle_??_end] */
+                    /** number of theta angles */
+                    constexpr int N_theta = parameters::N_observer / N_angle_split;
+                    /** step width angle theta */
+                    const picongpu::float_64 delta_angle_theta
+                        = (angle_theta_end - angle_theta_start) / (N_theta - 1.0);
+                    /** step width angle phi */
+                    const picongpu::float_64 delta_angle_phi
+                        = (angle_phi_end - angle_phi_start) / (N_angle_split - 1.0);
 
-        /* helper functions for efficient trigonometric calculations */
-        picongpu::float_32 sinPhi;
-        picongpu::float_32 cosPhi;
-        picongpu::float_32 sinTheta;
-        picongpu::float_32 cosTheta;
-        math::sincos(precisionCast<picongpu::float_32>(phi), sinPhi, cosPhi);
-        math::sincos(precisionCast<picongpu::float_32>(theta), sinTheta, cosTheta);
-        /** compute observation unit vector */
-        return vector_64( sinTheta*cosPhi , cosTheta, sinTheta*sinPhi ) ;
+                    /** compute observation angles theta */
+                    const picongpu::float_64 theta(my_index_theta * delta_angle_theta + angle_theta_start);
+                    /** compute observation angle phi */
+                    const picongpu::float_64 phi(my_index_phi * delta_angle_phi - angle_phi_start);
 
-    }
+                    /* helper functions for efficient trigonometric calculations */
+                    picongpu::float_32 sinPhi;
+                    picongpu::float_32 cosPhi;
+                    picongpu::float_32 sinTheta;
+                    picongpu::float_32 cosTheta;
+                    pmacc::math::sincos(precisionCast<picongpu::float_32>(phi), sinPhi, cosPhi);
+                    pmacc::math::sincos(precisionCast<picongpu::float_32>(theta), sinTheta, cosTheta);
+                    /** compute observation unit vector */
+                    return vector_64(sinTheta * cosPhi, cosTheta, sinTheta * sinPhi);
+                }
 
-} // namespace radiation_observer
-} // namespace radiation
-} // namespace plugins
+            } // namespace radiation_observer
+        } // namespace radiation
+    } // namespace plugins
 } // namespace picongpu
diff --git a/include/picongpu/param/random.param b/include/picongpu/param/random.param
index c27b82d799..051944819d 100644
--- a/include/picongpu/param/random.param
+++ b/include/picongpu/param/random.param
@@ -1,4 +1,4 @@
-/* Copyright 2014-2020 Axel Huebl, Rene Widera
+/* Copyright 2014-2021 Axel Huebl, Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -33,28 +33,28 @@
 
 namespace picongpu
 {
-namespace random
-{
-    /** Random number generation methods
-     *
-     * It is not allowed to change the method and restart an already existing checkpoint.
-     *
-     *  - pmacc::random::methods::XorMin
-     *  - pmacc::random::methods::MRG32k3aMin
-     *  - pmacc::random::methods::AlpakaRand
-     */
-    using Generator =  pmacc::random::methods::XorMin< >;
+    namespace random
+    {
+        /** Random number generation methods
+         *
+         * It is not allowed to change the method and restart an already existing checkpoint.
+         *
+         *  - pmacc::random::methods::XorMin
+         *  - pmacc::random::methods::MRG32k3aMin
+         *  - pmacc::random::methods::AlpakaRand
+         */
+        using Generator = pmacc::random::methods::XorMin<>;
 
-    /** random number start seed
-     *
-     * Generator to create a seed for the random number generator.
-     * Depending of the generator the seed is reproducible or
-     * or changed with each program execution.
-     *
-     *   - seed::Value< 42 >
-     *   - seed::FromTime
-     *   - seed::FromEnvironment
-     */
-    using SeedGenerator = seed::Value< 42 > ;
-} // namespace random
+        /** random number start seed
+         *
+         * Generator to create a seed for the random number generator.
+         * Depending on the generator the seed is reproducible
+         * or changed with each program execution.
+         *
+         *   - seed::Value< 42 >
+         *   - seed::FromTime
+         *   - seed::FromEnvironment
+         */
+        using SeedGenerator = seed::Value<42>;
+    } // namespace random
 } // namespace picongpu
diff --git a/include/picongpu/param/species.param b/include/picongpu/param/species.param
index e3e7a2b528..524337f2a9 100644
--- a/include/picongpu/param/species.param
+++ b/include/picongpu/param/species.param
@@ -1,4 +1,4 @@
-/* Copyright 2014-2020 Rene Widera, Richard Pausch
+/* Copyright 2014-2021 Rene Widera, Richard Pausch, Annegret Roeszler, Klaus Steiniger
  *
  * This file is part of PIConGPU.
  *
@@ -19,9 +19,19 @@
 
 /** @file
  *
- * Forward declarations for speciesDefinition.param in case one wants to use
- * the same particle shape, interpolation, current solver and particle pusher
- * for all particle species.
+ * Particle shape, field to particle interpolation, current solver, and particle pusher
+ * can be declared here for usage in `speciesDefinition.param`.
+ *
+ * @see
+ *   **MODELS / Hierarchy of Charge Assignment Schemes**
+ *   in the online documentation for information on particle shapes.
+ *
+ *
+ * \attention
+ * The higher order shape names are redefined with release 0.6.0 in order to provide a consistent naming:
+ *     * PQS is the name of the 3rd order assignment function (instead of PCS)
+ *     * PCS is the name of the 4th order assignment function (instead of P4S)
+ *     * P4S does not exist anymore
  */
 
 #pragma once
@@ -36,53 +46,61 @@
 
 namespace picongpu
 {
+    /** select macroparticle shape
+     *
+     * **WARNING** the shape names are redefined and diverge from PIConGPU versions before 0.6.0.
+     *
+     *  - particles::shapes::CIC : Assignment function is a piecewise linear spline
+     *  - particles::shapes::TSC : Assignment function is a piecewise quadratic spline
+     *  - particles::shapes::PQS : Assignment function is a piecewise cubic spline
+     *  - particles::shapes::PCS : Assignment function is a piecewise quartic spline
+     */
+    using UsedParticleShape = particles::shapes::TSC;
 
-/** Particle Shape definitions
- *  - particles::shapes::CIC : 1st order
- *  - particles::shapes::TSC : 2nd order
- *  - particles::shapes::PCS : 3rd order
- *  - particles::shapes::P4S : 4th order
- *
- *  example: using UsedParticleShape = particles::shapes::CIC;
- */
-using UsedParticleShape = particles::shapes::TSC;
+    /** select interpolation method to be used for interpolation of grid-based field values to particle positions
+     */
+    using UsedField2Particle = FieldToParticleInterpolation<UsedParticleShape, AssignedTrilinearInterpolation>;
 
-/** define which interpolation method is used to interpolate fields to particles
- */
-using UsedField2Particle = FieldToParticleInterpolation<
-    UsedParticleShape,
-    AssignedTrilinearInterpolation
->;
+    /*! select current solver method
+     * - currentSolver::Esirkepov< SHAPE, STRATEGY > : particle shapes - CIC, TSC, PQS, PCS (1st to 4th order)
+     * - currentSolver::VillaBune< SHAPE, STRATEGY > : particle shapes - CIC (1st order) only
+     * - currentSolver::EmZ< SHAPE, STRATEGY >       : particle shapes - CIC, TSC, PQS, PCS (1st to 4th order)
+     *
+     * For development purposes:
+     * - currentSolver::EsirkepovNative< SHAPE, STRATEGY > : generic version of currentSolverEsirkepov
+     *   without optimization (~4x slower and needs more shared memory)
+     *
+     * STRATEGY (optional):
+     * - currentSolver::strategy::StridedCachedSupercells
+     * - currentSolver::strategy::StridedCachedSupercellsScaled<N> with N >= 1
+     * - currentSolver::strategy::CachedSupercells
+     * - currentSolver::strategy::CachedSupercellsScaled<N> with N >= 1
+     * - currentSolver::strategy::NonCachedSupercells
+     * - currentSolver::strategy::NonCachedSupercellsScaled<N> with N >= 1
+     */
+    using UsedParticleCurrentSolver = currentSolver::Esirkepov<UsedParticleShape>;
 
-/** select current solver method
- * - currentSolver::Esirkepov< SHAPE > : particle shapes - CIC, TSC, PCS, P4S (1st to 4th order)
- * - currentSolver::VillaBune<>        : particle shapes - CIC (1st order) only
- * - currentSolver::EmZ< SHAPE >       : particle shapes - CIC, TSC, PCS, P4S (1st to 4th order)
- *
- * For development purposes:
- * - currentSolver::currentSolver::EsirkepovNative< SHAPE > : generic version of currentSolverEsirkepov
- *   without optimization (~4x slower and needs more shared memory)
- */
-using UsedParticleCurrentSolver = currentSolver::Esirkepov< UsedParticleShape >;
-
-/** particle pusher configuration
- *
- * Defining a pusher is optional for particles
- *
- * - particles::pusher::Vay : better suited relativistic boris pusher
- * - particles::pusher::Boris : standard boris pusher
- * - particles::pusher::ReducedLandauLifshitz : 4th order RungeKutta pusher
- *                                              with classical radiation reaction
- *
- * For diagnostics & modeling: ------------------------------------------------
- * - particles::pusher::Acceleration : Accelerate particles by applying a constant electric field
- * - particles::pusher::Free : free propagation, ignore fields
- *                             (= free stream model)
- * - particles::pusher::Photon : propagate with c in direction of normalized mom.
- * - particles::pusher::Probe : Probe particles that interpolate E & B
- * For development purposes: --------------------------------------------------
- * - particles::pusher::Axel : a pusher developed at HZDR during 2011 (testing)
- */
-using UsedParticlePusher = particles::pusher::Boris;
+    /** particle pusher configuration
+     *
+     * Defining a pusher is optional for particles
+     *
+     * - particles::pusher::HigueraCary : Higuera & Cary's relativistic pusher preserving both volume and ExB velocity
+     * - particles::pusher::Vay : Vay's relativistic pusher preserving ExB velocity
+     * - particles::pusher::Boris : Boris' relativistic pusher preserving volume
+     * - particles::pusher::ReducedLandauLifshitz : 4th order RungeKutta pusher
+     *                                              with classical radiation reaction
+     * - particles::pusher::Composite : composite of two given pushers,
+     *                                  switches between using one (or none) of those
+     *
+     * For diagnostics & modeling: ------------------------------------------------
+     * - particles::pusher::Acceleration : Accelerate particles by applying a constant electric field
+     * - particles::pusher::Free : free propagation, ignore fields
+     *                             (= free stream model)
+     * - particles::pusher::Photon : propagate with c in direction of normalized mom.
+     * - particles::pusher::Probe : Probe particles that interpolate E & B
+     * For development purposes: --------------------------------------------------
+     * - particles::pusher::Axel : a pusher developed at HZDR during 2011 (testing)
+     */
+    using UsedParticlePusher = particles::pusher::Boris;
 
 } // namespace picongpu
diff --git a/include/picongpu/param/speciesAttributes.param b/include/picongpu/param/speciesAttributes.param
index 51aba8374c..12cb764000 100644
--- a/include/picongpu/param/speciesAttributes.param
+++ b/include/picongpu/param/speciesAttributes.param
@@ -1,4 +1,4 @@
-/* Copyright 2014-2020 Rene Widera, Marco Garten, Alexander Grund, Axel Huebl,
+/* Copyright 2014-2021 Rene Widera, Marco Garten, Alexander Grund, Axel Huebl,
  *                     Heiko Burau
  *
  * This file is part of PIConGPU.
@@ -53,63 +53,31 @@ namespace picongpu
      * float3_64, ...
      * This is only a name without a specialization.
      */
-    alias( position );
+    alias(position);
 
     //! unique identifier for a particle
-    value_identifier(
-        uint64_t,
-        particleId,
-        IdProvider< simDim >::getNewId()
-    );
+    value_identifier(uint64_t, particleId, IdProvider<simDim>::getNewId());
 
     //! specialization for the relative in-cell position
-    value_identifier(
-        floatD_X,
-        position_pic,
-        floatD_X::create( 0. )
-    );
+    value_identifier(floatD_X, position_pic, floatD_X::create(0.));
 
     //! momentum at timestep t
-    value_identifier(
-        float3_X,
-        momentum,
-        float3_X::create( 0. )
-    );
+    value_identifier(float3_X, momentum, float3_X::create(0.));
 
     //! momentum at (previous) timestep t-1
-    value_identifier(
-        float3_X,
-        momentumPrev1,
-        float3_X::create( 0._X )
-    );
+    value_identifier(float3_X, momentumPrev1, float3_X::create(0._X));
 
     //! weighting of the macro particle
-    value_identifier(
-        float_X,
-        weighting,
-        0._X
-    );
+    value_identifier(float_X, weighting, 0._X);
 
     //! Voronoi cell of the macro particle
-    value_identifier(
-        int16_t,
-        voronoiCellId,
-        -1
-    );
+    value_identifier(int16_t, voronoiCellId, -1);
 
     //! interpolated electric field with respect to particle shape
-    value_identifier(
-        float3_X,
-        probeE,
-        float3_X::create( 0. )
-    );
+    value_identifier(float3_X, probeE, float3_X::create(0.));
 
     //! interpolated electric field with respect to particle shape
-    value_identifier(
-        float3_X,
-        probeB,
-        float3_X::create( 0. )
-    );
+    value_identifier(float3_X, probeB, float3_X::create(0.));
 
     /** masking a particle for radiation
      *
@@ -117,11 +85,7 @@ namespace picongpu
      * `RadiationParticleFilter` in radiation.param to (de)select
      * particles for the radiation calculation.
      */
-    value_identifier(
-        bool,
-        radiationMask,
-        false
-    );
+    value_identifier(bool, radiationMask, false);
 
     /** masking a particle for transition radiation
      *
@@ -129,11 +93,7 @@ namespace picongpu
      * `TransitionRadiationParticleFilter` in transitionRadiation.param to (de)select
      * particles for the transition radiation calculation.
      */
-    value_identifier(
-        bool,
-        transitionRadiationMask,
-        false
-    );
+    value_identifier(bool, transitionRadiationMask, false);
 
     /** number of electrons bound to the atom / ion
      *
@@ -145,22 +105,14 @@ namespace picongpu
      *
      * @todo connect default to proton number
      */
-    value_identifier(
-        float_X,
-        boundElectrons,
-        0._X
-    );
+    value_identifier(float_X, boundElectrons, 0._X);
 
     /** atomic superconfiguration
      *
      * atomic configuration of an ion for collisional-radiative modeling,
      * see also flylite.param
      */
-    value_identifier(
-        flylite::Superconfig,
-        superconfig,
-        flylite::Superconfig::create( 0. )
-    );
+    value_identifier(flylite::Superconfig, superconfig, flylite::Superconfig::create(0.));
 
     /** Total cell index of a particle.
      *
@@ -169,57 +121,53 @@ namespace picongpu
      *    `globalDomain.offset` + `localDomain.offset`
      *  added to the N-dimensional cell index the particle belongs to on that GPU.
      */
-    value_identifier(
-        DataSpace< simDim >,
-        totalCellIdx,
-        DataSpace< simDim >( )
-    );
+    value_identifier(DataSpace<simDim>, totalCellIdx, DataSpace<simDim>());
 
     //! alias for particle shape, see also species.param
-    alias( shape );
+    alias(shape);
 
     //! alias for particle pusher, see alsospecies.param
-    alias( particlePusher );
+    alias(particlePusher);
 
     //! alias for particle ionizers, see also ionizer.param
-    alias( ionizers );
+    alias(ionizers);
 
     //! alias for ionization energy container, see also ionizationEnergies.param
-    alias( ionizationEnergies );
+    alias(ionizationEnergies);
 
     //! alias for synchrotronPhotons, see also speciesDefinition.param
-    alias( synchrotronPhotons )
+    alias(synchrotronPhotons);
 
     //! alias for ion species used for bremsstrahlung
-    alias( bremsstrahlungIons );
+    alias(bremsstrahlungIons);
 
     //! alias for photon species used for bremsstrahlung
-    alias( bremsstrahlungPhotons );
+    alias(bremsstrahlungPhotons);
 
     //! alias for particle to field interpolation, see also species.param
-    alias( interpolation );
+    alias(interpolation);
 
     //! alias for particle current solver, see also species.param
-    alias( current );
+    alias(current);
 
     /** alias for particle flag: atomic numbers, see also ionizer.param
      * - only reasonable for atoms / ions / nuclei
      * - is required when boundElectrons is set
      */
-    alias( atomicNumbers );
+    alias(atomicNumbers);
 
     /** alias for particle flag: effective nuclear charge,
      *
      * - see also ionizer.param
      * - only reasonable for atoms / ions / nuclei
      */
-    alias( effectiveNuclearCharge );
+    alias(effectiveNuclearCharge);
 
     /** alias for particle population kinetics model (e.g. FLYlite)
      *
      * see also flylite.param
      */
-    alias( populationKinetics );
+    alias(populationKinetics);
 
     /** alias for particle mass ratio
      *
@@ -228,7 +176,7 @@ namespace picongpu
      *
      * default value: 1.0 if unset
      */
-    alias( massRatio );
+    alias(massRatio);
 
     /** alias for particle charge ratio
      *
@@ -237,7 +185,7 @@ namespace picongpu
      *
      * default value: 1.0 if unset
      */
-    alias( chargeRatio );
+    alias(chargeRatio);
 
     /** alias for particle density ratio
      *
@@ -246,7 +194,7 @@ namespace picongpu
      *
      * default value: 1.0 if unset
      */
-    alias( densityRatio );
+    alias(densityRatio);
 
     /** alias to reserved bytes for each communication direction
      *
@@ -263,10 +211,12 @@ namespace picongpu
      *     static constexpr uint32_t BYTES_EXCHANGE_Z = 5 * 1024 * 1024;
      *     static constexpr uint32_t BYTES_CORNER = 16 * 1024;
      *     static constexpr uint32_t BYTES_EDGES = 16 * 1024;
+     *     using REF_LOCAL_DOM_SIZE = mCT::Int<0, 0, 0>;
+     *     const std::array<float_X, 3> DIR_SCALING_FACTOR = {{0.0, 0.0, 0.0}};
      * };
      * @endcode
      */
-    alias( exchangeMemCfg );
+    alias(exchangeMemCfg);
 
     /** alias to specify the boundary condition for particles
      *
@@ -277,6 +227,6 @@ namespace picongpu
      * Note: alias `boundaryCondition` will be ignored if the runtime parameter
      * `--periodic` is set.
      */
-    alias( boundaryCondition );
+    alias(boundaryCondition);
 
 } // namespace picongpu
diff --git a/include/picongpu/param/speciesConstants.param b/include/picongpu/param/speciesConstants.param
index 5915567a33..1771428203 100644
--- a/include/picongpu/param/speciesConstants.param
+++ b/include/picongpu/param/speciesConstants.param
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Heiko Burau, Rene Widera, Richard Pausch
+/* Copyright 2013-2021 Axel Huebl, Heiko Burau, Rene Widera, Richard Pausch
  *
  * This file is part of PIConGPU.
  *
@@ -65,5 +65,5 @@ namespace picongpu
          * unit: C
          */
         constexpr float_64 BASE_CHARGE_SI = ELECTRON_CHARGE_SI;
-    }
-}
+    } // namespace SI
+} // namespace picongpu
diff --git a/include/picongpu/param/speciesDefinition.param b/include/picongpu/param/speciesDefinition.param
index 3307508c34..11f27a4d47 100644
--- a/include/picongpu/param/speciesDefinition.param
+++ b/include/picongpu/param/speciesDefinition.param
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Rene Widera, Benjamin Worpitz, Heiko Burau
+/* Copyright 2013-2021 Rene Widera, Benjamin Worpitz, Heiko Burau
  *
  * This file is part of PIConGPU.
  *
@@ -43,105 +43,88 @@
 
 namespace picongpu
 {
+    /*########################### define particle attributes #####################*/
 
-/*########################### define particle attributes #####################*/
-
-/** describe attributes of a particle*/
-using DefaultParticleAttributes = MakeSeq_t<
-    position< position_pic >,
-    momentum,
-    weighting
->;
-
-/*########################### end particle attributes ########################*/
-
-/*########################### define species #################################*/
-
-/*--------------------------- photons -------------------------------------------*/
-
-value_identifier( float_X, MassRatioPhotons, 0.0 );
-value_identifier( float_X, ChargeRatioPhotons, 0.0 );
-
-using ParticleFlagsPhotons = MakeSeq_t<
-    particlePusher< particles::pusher::Photon >,
-    shape< UsedParticleShape >,
-    interpolation< UsedField2Particle >,
-    massRatio< MassRatioPhotons >,
-    chargeRatio< ChargeRatioPhotons >
->;
-
-/* define species photons */
-using PIC_Photons = Particles<
-    PMACC_CSTRING( "ph" ),
-    ParticleFlagsPhotons,
-    DefaultParticleAttributes
->;
-
-/*--------------------------- electrons --------------------------------------*/
-
-/* ratio relative to BASE_CHARGE and BASE_MASS */
-value_identifier( float_X, MassRatioElectrons, 1.0 );
-value_identifier( float_X, ChargeRatioElectrons, 1.0 );
-
-using ParticleFlagsElectrons = MakeSeq_t<
-    particlePusher< UsedParticlePusher >,
-    shape< UsedParticleShape >,
-    interpolation< UsedField2Particle >,
-    current< UsedParticleCurrentSolver >,
-    massRatio< MassRatioElectrons >,
-    chargeRatio< ChargeRatioElectrons >
-#if( ENABLE_SYNCHROTRON_PHOTONS == 1 )
-    , synchrotronPhotons< PIC_Photons >
+    /** describe attributes of a particle*/
+    using DefaultParticleAttributes = MakeSeq_t<position<position_pic>, momentum, weighting>;
+
+    /*########################### end particle attributes ########################*/
+
+    /*########################### define species #################################*/
+
+    /*--------------------------- photons -------------------------------------------*/
+
+    value_identifier(float_X, MassRatioPhotons, 0.0);
+    value_identifier(float_X, ChargeRatioPhotons, 0.0);
+
+    using ParticleFlagsPhotons = MakeSeq_t<
+        particlePusher<particles::pusher::Photon>,
+        shape<UsedParticleShape>,
+        interpolation<UsedField2Particle>,
+        massRatio<MassRatioPhotons>,
+        chargeRatio<ChargeRatioPhotons>>;
+
+    /* define species photons */
+    using PIC_Photons = Particles<PMACC_CSTRING("ph"), ParticleFlagsPhotons, DefaultParticleAttributes>;
+
+    /*--------------------------- electrons --------------------------------------*/
+
+    /* ratio relative to BASE_CHARGE and BASE_MASS */
+    value_identifier(float_X, MassRatioElectrons, 1.0);
+    value_identifier(float_X, ChargeRatioElectrons, 1.0);
+
+    using ParticleFlagsElectrons = MakeSeq_t<
+        particlePusher<UsedParticlePusher>,
+        shape<UsedParticleShape>,
+        interpolation<UsedField2Particle>,
+        current<UsedParticleCurrentSolver>,
+        massRatio<MassRatioElectrons>,
+        chargeRatio<ChargeRatioElectrons>
+#if(ENABLE_SYNCHROTRON_PHOTONS == 1)
+        ,
+        synchrotronPhotons<PIC_Photons>
 #endif
->;
-
-/* define species electrons */
-using PIC_Electrons = Particles<
-    PMACC_CSTRING( "e" ),
-    ParticleFlagsElectrons,
-    DefaultParticleAttributes
->;
-
-/*--------------------------- ions -------------------------------------------*/
-
-/* ratio relative to BASE_CHARGE and BASE_MASS */
-value_identifier( float_X, MassRatioIons, 1836.152672 );
-value_identifier( float_X, ChargeRatioIons, -1.0 );
-
-/* ratio relative to BASE_DENSITY */
-value_identifier( float_X, DensityRatioIons, 1.0 );
-
-using ParticleFlagsIons = MakeSeq_t<
-    particlePusher< UsedParticlePusher >,
-    shape< UsedParticleShape >,
-    interpolation< UsedField2Particle >,
-    current< UsedParticleCurrentSolver >,
-    massRatio< MassRatioIons >,
-    chargeRatio< ChargeRatioIons >,
-    densityRatio< DensityRatioIons >,
-    atomicNumbers< ionization::atomicNumbers::Hydrogen_t >
->;
-
-/* define species ions */
-using PIC_Ions = Particles<
-    PMACC_CSTRING( "i" ),
-    ParticleFlagsIons,
-    DefaultParticleAttributes
->;
-
-/*########################### end species ####################################*/
-
-/** All known particle species of the simulation
- *
- * List all defined particle species from above in this list
- * to make them available to the PIC algorithm.
- */
-using VectorAllSpecies = MakeSeq_t<
-    PIC_Electrons,
-    PIC_Ions
-#if( ENABLE_SYNCHROTRON_PHOTONS == 1 )
-    , PIC_Photons
+        >;
+
+    /* define species electrons */
+    using PIC_Electrons = Particles<PMACC_CSTRING("e"), ParticleFlagsElectrons, DefaultParticleAttributes>;
+
+    /*--------------------------- ions -------------------------------------------*/
+
+    /* ratio relative to BASE_CHARGE and BASE_MASS */
+    value_identifier(float_X, MassRatioIons, 1836.152672);
+    value_identifier(float_X, ChargeRatioIons, -1.0);
+
+    /* ratio relative to BASE_DENSITY */
+    value_identifier(float_X, DensityRatioIons, 1.0);
+
+    using ParticleFlagsIons = MakeSeq_t<
+        particlePusher<UsedParticlePusher>,
+        shape<UsedParticleShape>,
+        interpolation<UsedField2Particle>,
+        current<UsedParticleCurrentSolver>,
+        massRatio<MassRatioIons>,
+        chargeRatio<ChargeRatioIons>,
+        densityRatio<DensityRatioIons>,
+        atomicNumbers<ionization::atomicNumbers::Hydrogen_t>>;
+
+    /* define species ions */
+    using PIC_Ions = Particles<PMACC_CSTRING("i"), ParticleFlagsIons, DefaultParticleAttributes>;
+
+    /*########################### end species ####################################*/
+
+    /** All known particle species of the simulation
+     *
+     * List all defined particle species from above in this list
+     * to make them available to the PIC algorithm.
+     */
+    using VectorAllSpecies = MakeSeq_t<
+        PIC_Electrons,
+        PIC_Ions
+#if(ENABLE_SYNCHROTRON_PHOTONS == 1)
+        ,
+        PIC_Photons
 #endif
->;
+        >;
 
 } // namespace picongpu
diff --git a/include/picongpu/param/speciesInitialization.param b/include/picongpu/param/speciesInitialization.param
index 7850be3a25..e9b1216837 100644
--- a/include/picongpu/param/speciesInitialization.param
+++ b/include/picongpu/param/speciesInitialization.param
@@ -1,4 +1,4 @@
-/* Copyright 2015-2020 Rene Widera, Axel Huebl
+/* Copyright 2015-2021 Rene Widera, Axel Huebl
  *
  * This file is part of PIConGPU.
  *
@@ -33,13 +33,13 @@
 
 namespace picongpu
 {
-namespace particles
-{
-    /** InitPipeline defines in which order species are initialized
-     *
-     * the functors are called in order (from first to last functor)
-     */
-    using InitPipeline = bmpl::vector<>;
+    namespace particles
+    {
+        /** InitPipeline defines in which order species are initialized
+         *
+         * the functors are called in order (from first to last functor)
+         */
+        using InitPipeline = bmpl::vector<>;
 
-} // namespace particles
+    } // namespace particles
 } // namespace picongpu
diff --git a/include/picongpu/param/starter.param b/include/picongpu/param/starter.param
index 5e6c700755..a7ca54ee55 100644
--- a/include/picongpu/param/starter.param
+++ b/include/picongpu/param/starter.param
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Rene Widera
+/* Copyright 2013-2021 Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -18,7 +18,6 @@
  */
 
 
-
 #pragma once
 
 
@@ -26,9 +25,5 @@ namespace picongpu
 {
     namespace defaultPIConGPU
     {
-
     }
-}
-
-
-
+} // namespace picongpu
diff --git a/include/picongpu/param/synchrotronPhotons.param b/include/picongpu/param/synchrotronPhotons.param
index 1ffaaf5761..53ec1e2b45 100644
--- a/include/picongpu/param/synchrotronPhotons.param
+++ b/include/picongpu/param/synchrotronPhotons.param
@@ -1,4 +1,4 @@
-/* Copyright 2015-2020 Heiko Burau
+/* Copyright 2015-2021 Heiko Burau
  *
  * This file is part of PIConGPU.
  *
@@ -21,36 +21,36 @@
 
 namespace picongpu
 {
-namespace particles
-{
-namespace synchrotronPhotons
-{
-
+    namespace particles
+    {
+        namespace synchrotronPhotons
+        {
 /** enable synchrotron photon emission */
 #ifndef ENABLE_SYNCHROTRON_PHOTONS
-#define ENABLE_SYNCHROTRON_PHOTONS 0
+#    define ENABLE_SYNCHROTRON_PHOTONS 0
 #endif
 
-/** enable (disable) QED (classical) photon emission spectrum */
-constexpr bool enableQEDTerm = false;
+            /** enable (disable) QED (classical) photon emission spectrum */
+            constexpr bool enableQEDTerm = false;
 
-/** Above this value (to the power of three, see comments on mapping) the synchrotron functions are nearly zero. */
-constexpr float_64 SYNC_FUNCS_CUTOFF = 5.0;
+            /** Above this value (to the power of three, see comments on mapping) the synchrotron functions are nearly
+             * zero. */
+            constexpr float_64 SYNC_FUNCS_CUTOFF = 5.0;
 
-/** stepwidth for the numerical integration of the bessel function for the first synchrotron function */
-constexpr float_64 SYNC_FUNCS_BESSEL_INTEGRAL_STEPWIDTH = 1.0e-3;
+            /** stepwidth for the numerical integration of the bessel function for the first synchrotron function */
+            constexpr float_64 SYNC_FUNCS_BESSEL_INTEGRAL_STEPWIDTH = 1.0e-3;
 
-/** Number of sampling points of the lookup table */
-constexpr uint32_t SYNC_FUNCS_NUM_SAMPLES = 8192;
+            /** Number of sampling points of the lookup table */
+            constexpr uint32_t SYNC_FUNCS_NUM_SAMPLES = 8192;
 
-/** Photons of oscillation periods greater than a timestep are not created since the grid already accounts for them.
- * This cutoff ratio is defined as: photon-oscillation-period / timestep */
-constexpr float_64 SOFT_PHOTONS_CUTOFF_RATIO = 1.0;
+            /** Photons of oscillation periods greater than a timestep are not created since the grid already accounts
+             * for them. This cutoff ratio is defined as: photon-oscillation-period / timestep */
+            constexpr float_64 SOFT_PHOTONS_CUTOFF_RATIO = 1.0;
 
-/** if the emission probability per timestep is higher than this value and the log level is set to
- *  "CRITICAL" a warning will be raised. */
-constexpr float_64 SINGLE_EMISSION_PROB_LIMIT = 0.4;
+            /** if the emission probability per timestep is higher than this value and the log level is set to
+             *  "CRITICAL" a warning will be raised. */
+            constexpr float_64 SINGLE_EMISSION_PROB_LIMIT = 0.4;
 
-} // namespace synchrotronPhotons
-} // namespace particles
+        } // namespace synchrotronPhotons
+    } // namespace particles
 } // namespace picongpu
diff --git a/include/picongpu/param/transitionRadiation.param b/include/picongpu/param/transitionRadiation.param
index 95024ac20a..21ca7f8464 100644
--- a/include/picongpu/param/transitionRadiation.param
+++ b/include/picongpu/param/transitionRadiation.param
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Rene Widera, Richard Pausch, Finn-Ole Carstens
+/* Copyright 2013-2021 Rene Widera, Richard Pausch, Finn-Ole Carstens
  *
  * This file is part of PIConGPU.
  *
@@ -43,232 +43,244 @@
 
 namespace picongpu
 {
-namespace plugins
-{
-// initiate the formfactor namespaces from the radiation plugin
-namespace radiation
-{
-    namespace radFormFactor_CIC_3D { }
-    namespace radFormFactor_TSC_3D { }
-    namespace radFormFactor_PCS_3D { }
-    namespace radFormFactor_CIC_1Dy { }
-    namespace radFormFactor_Gauss_spherical { }
-    namespace radFormFactor_Gauss_cell { }
-    namespace radFormFactor_incoherent { }
-    namespace radFormFactor_coherent { }
-} // namespace radiation
-
-namespace transitionRadiation
-{
-namespace linearFrequencies
-{
-    namespace SI
+    namespace plugins
     {
-        //! mimimum frequency of the linear frequency scale in units of [1/s]
-        constexpr float_64 omegaMin = 0.0;
-        //! maximum frequency of the linear frequency scale in units of [1/s]
-        constexpr float_64 omegaMax = 1.06e16;
-    }
-
-    //! number of frequency values to compute in the linear frequency [unitless]
-    constexpr unsigned int nOmega = 512;
-
-} // namespace linearFrequencies
-
-namespace logFrequencies
-{
-    namespace SI
-    {
-        //! mimimum frequency of the logarithmic frequency scale in units of [1/s]
-        constexpr float_64 omegaMin = 1.0e13;
-        //! maximum frequency of the logarithmic frequency scale in units of [1/s]
-        constexpr float_64 omegaMax = 1.0e17;
-    }
-
-    //! number of frequency values to compute in the logarithmic frequency [unitless]
-    constexpr unsigned int nOmega = 256;
-
-} // namespace logFrequencies
-
-
-namespace listFrequencies
-{
-    //! path to text file with frequencies
-    constexpr char listLocation[] = "/path/to/frequency_list";
-    //! number of frequency values to compute if frequencies are given in a file [unitless]
-    constexpr unsigned int nOmega = 512;
-
-} // namespace listFrequencies
-
-
-    /** selected mode of frequency scaling:
-     *
-     * options:
-     * - linearFrequencies
-     * - logFrequencies
-     * - listFrequencies
-     */
-    namespace frequencies = logFrequencies;
-
-    ///////////////////////////////////////////////////
-
-
-    /** correct treatment of coherent radiation from macro particles
-     *
-     * These formfactors are the same as in the radiation plugin!
-     * Choose different form factors in order to consider different particle shapes for radiation
-     *  - ::picongpu::plugins::radiation::radFormFactor_CIC_3D ... CIC charge distribution
-     *  - ::picongpu::plugins::radiation::radFormFactor_TSC_3D ... TSC charge distribution
-     *  - ::picongpu::plugins::radiation::radFormFactor_PCS_3D ... PCS charge distribution
-     *  - ::picongpu::plugins::radiation::radFormFactor_CIC_1Dy ... only CIC charge distribution in y
-     *  - ::picongpu::plugins::radiation::radFormFactor_Gauss_spherical ... symmetric Gauss charge distribution
-     *  - ::picongpu::plugins::radiation::radFormFactor_Gauss_cell ... Gauss charge distribution according to cell size
-     *  - ::picongpu::plugins::radiation::radFormFactor_incoherent ... only incoherent radiation
-     *  - ::picongpu::plugins::radiation::radFormFactor_coherent ... only coherent radiation
-     */
-    namespace macroParticleFormFactor = ::picongpu::plugins::radiation::radFormFactor_Gauss_spherical;
-
-    ///////////////////////////////////////////////////////////
-
-    namespace parameters
-    {
-        /** Number of observation directions
-         *
-         * If nPhi or nTheta is equal to 1, the transition radiation will be calculated
-         * for phiMin or thetaMin respectively.
-         */
-        constexpr unsigned int nPhi = 128;
-        constexpr unsigned int nTheta = 128;
-        constexpr unsigned int nObserver = nPhi * nTheta;
-
-        // theta goes from 0 to pi
-        constexpr float_64 thetaMin = 0.0;
-        constexpr float_64 thetaMax = picongpu::PI;
-
-        // phi goes from 0 to 2*pi
-        constexpr float_64 phiMin = 0.0;
-        constexpr float_64 phiMax = 2 * picongpu::PI;
-
-        namespace SI
+        // initiate the formfactor namespaces from the radiation plugin
+        namespace radiation
         {
-            // y position of the foil to calculate transition radiation at
-            // leave at 0 for no virtual particle propagation
-            constexpr float_64 foilPosition = 0.0;
-        }
-
-    } /* end namespace parameters */
-
-
-    //! example of a filter for the relativistic Lorentz factor gamma
-    struct GammaFilterFunctor
-    {
-        //! Gamma value above which the radiation is calculated
-        static constexpr float_X filterGamma = 5.0;
-
-        template< typename T_Particle >
-        HDINLINE void operator()( T_Particle& particle )
+            namespace radFormFactor_CIC_3D
+            {
+            }
+            namespace radFormFactor_TSC_3D
+            {
+            }
+            namespace radFormFactor_PCS_3D
+            {
+            }
+            namespace radFormFactor_CIC_1Dy
+            {
+            }
+            namespace radFormFactor_Gauss_spherical
+            {
+            }
+            namespace radFormFactor_Gauss_cell
+            {
+            }
+            namespace radFormFactor_incoherent
+            {
+            }
+            namespace radFormFactor_coherent
+            {
+            }
+        } // namespace radiation
+
+        namespace transitionRadiation
         {
-            if(
-                picongpu::gamma<float_X>(
-                    particle[ picongpu::momentum_ ],
-                    picongpu::traits::attribute::getMass(
-                        particle[ picongpu::weighting_ ],
-                        particle
-                    )
-                ) >= filterGamma
-            )
-                particle[ picongpu::transitionRadiationMask_ ] = true;
-        }
-    };
-
-    /** filter to (de)select particles for the radiation calculation
-     *
-     * to activate the filter:
-     *   - goto file `speciesDefinition.param`
-     *   - add the attribute `transitionRadiationMask` to the particle species
-     */
-    using GammaFilter = picongpu::particles::manipulators::generic::Free<
-        GammaFilterFunctor
-    >;
-
-    /** Compute observation angles
-     *
-     * This function is used in the transition radiation plugin kernel to compute
-     * the observation directions given as a unit vector pointing
-     * towards a 'virtual' detector
-     *
-     * This default setup is an example of a 2D detector array. It computes
-     * observation directions for 2D virtual detector field
-     * with its center pointing toward the +y direction (for theta=0, phi=0)
-     * with observation angles ranging from
-     * theta = [angle_theta_start : angle_theta_end]
-     * phi   = [angle_phi_start   : angle_phi_end  ]
-     * Every observation_id_extern index moves the phi angle from its
-     * start value toward its end value until the observation_id_extern
-     * reaches N_split. After that the theta angle moves further from its
-     * start value towards its end value while phi is reset to its start
-     * value.
-     *
-     * The unit vector pointing towards the observing virtual detector
-     * can be described using theta and phi by:
-     * x_value = sin(theta) * cos(phi)
-     * y_value = cos(theta)
-     * z_value = sin(theta) * sin(phi)
-     * These are the standard spherical coordinates.
-     *
-     * The example setup describes an detector array of
-     * 128X128 detectors ranging from 0 to pi for the azimuth angle
-     * theta and from 0 to 2 pi for the polar angle phi.
-     *
-     * If the calculation is only supposed to be done for a single azimuth
-     * or polar angle, it will use the respective minimal angle.
-     *
-     * @param    observation_id_extern
-     *           int index that identifies each block on the GPU
-     *           to compute the observation direction
-     *
-     * @return   unit vector pointing in observation direction
-     *           type: float3_X
-     */
-    HDINLINE float3_X observationDirection(const int observation_id_extern)
-    {
-        /* generate two indices from single block index */
-        /** split distance of given index
-         * pseudo-code:
-         * index_a = index / split_distance
-         * index_b = index % split_distance
-         */
-        /** get index for computing angle theta: */
-        const int indexTheta = observation_id_extern / parameters::nPhi;
-
-        /** step width angle theta */
-        const picongpu::float_64 deltaTheta = ( parameters::nTheta > 1 ) ?
-                ( parameters::thetaMax - parameters::thetaMin ) / ( parameters::nTheta - 1.0 ) : 0.0;
-
-        /** compute observation angles theta */
-        const picongpu::float_64 theta = indexTheta * deltaTheta + parameters::thetaMin;
-
-        /** get index for computing angle phi: */
-        const int indexPhi = observation_id_extern % parameters::nPhi;
-
-        /** step width angle phi */
-        const picongpu::float_64 deltaPhi = ( parameters::nPhi > 1 ) ?
-                ( parameters::phiMax - parameters::phiMin ) / ( parameters::nPhi - 1.0 ) : 0.0;
-
-        /** compute observation angles phi */
-        const picongpu::float_64 phi = indexPhi * deltaPhi - parameters::phiMin;
-
-        /* helper functions for efficient trigonometric calculations */
-        picongpu::float_32 sinPhi;
-        picongpu::float_32 cosPhi;
-        picongpu::float_32 sinTheta;
-        picongpu::float_32 cosTheta;
-        math::sincos( precisionCast< picongpu::float_32 >( phi ), sinPhi, cosPhi );
-        math::sincos( precisionCast< picongpu::float_32 >( theta ), sinTheta, cosTheta );
-        /** compute observation unit vector */
-        return float3_X( sinTheta * cosPhi , cosTheta, sinTheta * sinPhi );
-    }
-
-} // namespace transitionRadiation
-} // namespace plugins
+            namespace linearFrequencies
+            {
+                namespace SI
+                {
+                    //! minimum frequency of the linear frequency scale in units of [1/s]
+                    constexpr float_64 omegaMin = 0.0;
+                    //! maximum frequency of the linear frequency scale in units of [1/s]
+                    constexpr float_64 omegaMax = 1.06e16;
+                } // namespace SI
+
+                //! number of frequency values to compute in the linear frequency [unitless]
+                constexpr unsigned int nOmega = 512;
+
+            } // namespace linearFrequencies
+
+            namespace logFrequencies
+            {
+                namespace SI
+                {
+                    //! minimum frequency of the logarithmic frequency scale in units of [1/s]
+                    constexpr float_64 omegaMin = 1.0e13;
+                    //! maximum frequency of the logarithmic frequency scale in units of [1/s]
+                    constexpr float_64 omegaMax = 1.0e17;
+                } // namespace SI
+
+                //! number of frequency values to compute in the logarithmic frequency [unitless]
+                constexpr unsigned int nOmega = 256;
+
+            } // namespace logFrequencies
+
+
+            namespace listFrequencies
+            {
+                //! path to text file with frequencies
+                constexpr char listLocation[] = "/path/to/frequency_list";
+                //! number of frequency values to compute if frequencies are given in a file [unitless]
+                constexpr unsigned int nOmega = 512;
+
+            } // namespace listFrequencies
+
+
+            /** selected mode of frequency scaling:
+             *
+             * options:
+             * - linearFrequencies
+             * - logFrequencies
+             * - listFrequencies
+             */
+            namespace frequencies = logFrequencies;
+
+            ///////////////////////////////////////////////////
+
+
+            /** correct treatment of coherent radiation from macro particles
+             *
+             * These formfactors are the same as in the radiation plugin!
+             * Choose different form factors in order to consider different particle shapes for radiation
+             *  - ::picongpu::plugins::radiation::radFormFactor_CIC_3D ... CIC charge distribution
+             *  - ::picongpu::plugins::radiation::radFormFactor_TSC_3D ... TSC charge distribution
+             *  - ::picongpu::plugins::radiation::radFormFactor_PCS_3D ... PCS charge distribution
+             *  - ::picongpu::plugins::radiation::radFormFactor_CIC_1Dy ... only CIC charge distribution in y
+             *  - ::picongpu::plugins::radiation::radFormFactor_Gauss_spherical ... symmetric Gauss charge distribution
+             *  - ::picongpu::plugins::radiation::radFormFactor_Gauss_cell ... Gauss charge distribution according to
+             * cell size
+             *  - ::picongpu::plugins::radiation::radFormFactor_incoherent ... only incoherent radiation
+             *  - ::picongpu::plugins::radiation::radFormFactor_coherent ... only coherent radiation
+             */
+            namespace macroParticleFormFactor = ::picongpu::plugins::radiation::radFormFactor_Gauss_spherical;
+
+            ///////////////////////////////////////////////////////////
+
+            namespace parameters
+            {
+                /** Number of observation directions
+                 *
+                 * If nPhi or nTheta is equal to 1, the transition radiation will be calculated
+                 * for phiMin or thetaMin respectively.
+                 */
+                constexpr unsigned int nPhi = 128; // number of sampled phi angles
+                constexpr unsigned int nTheta = 128; // number of sampled theta angles
+                constexpr unsigned int nObserver = nPhi * nTheta; // total number of observation directions
+
+                // theta goes from 0 to pi
+                constexpr float_64 thetaMin = 0.0;
+                constexpr float_64 thetaMax = picongpu::PI;
+
+                // phi goes from 0 to 2*pi
+                constexpr float_64 phiMin = 0.0;
+                constexpr float_64 phiMax = 2 * picongpu::PI;
+
+                namespace SI
+                {
+                    // y position of the foil to calculate transition radiation at
+                    // leave at 0 for no virtual particle propagation
+                    constexpr float_64 foilPosition = 0.0; // NOTE(review): presumably in meters (SI) - confirm
+                } // namespace SI
+
+            } // namespace parameters
+
+
+            //! example of a filter functor for the relativistic Lorentz factor gamma of a particle
+            struct GammaFilterFunctor
+            {
+                //! gamma threshold: particles with gamma >= filterGamma get transitionRadiationMask set
+                static constexpr float_X filterGamma = 5.0;
+
+                template<typename T_Particle>
+                HDINLINE void operator()(T_Particle& particle)
+                {
+                    if(picongpu::gamma<float_X>(
+                           particle[picongpu::momentum_],
+                           picongpu::traits::attribute::getMass(particle[picongpu::weighting_], particle))
+                       >= filterGamma)
+                        particle[picongpu::transitionRadiationMask_] = true; // set only, never cleared here
+                }
+            };
+
+            /** filter to (de)select particles for the radiation calculation
+             *
+             * to activate the filter:
+             *   - goto file `speciesDefinition.param`
+             *   - add the attribute `transitionRadiationMask` to the particle species
+             */
+            using GammaFilter = picongpu::particles::manipulators::generic::Free<GammaFilterFunctor>;
+
+            /** Compute observation angles
+             *
+             * This function is used in the transition radiation plugin kernel to compute
+             * the observation directions given as a unit vector pointing
+             * towards a 'virtual' detector
+             *
+             * This default setup is an example of a 2D detector array. It computes
+             * observation directions for 2D virtual detector field
+             * with its center pointing toward the +y direction (for theta=0, phi=0)
+             * with observation angles ranging from
+             * theta = [parameters::thetaMin : parameters::thetaMax]
+             * phi   = [parameters::phiMin   : parameters::phiMax  ]
+             * Every increment of observation_id_extern moves the phi angle from its
+             * start value toward its end value until observation_id_extern reaches
+             * a multiple of parameters::nPhi. Then the theta angle advances one step
+             * from its start value towards its end value while phi is reset to its
+             * start value.
+             *
+             * The unit vector pointing towards the observing virtual detector
+             * can be described using theta and phi by:
+             * x_value = sin(theta) * cos(phi)
+             * y_value = cos(theta)
+             * z_value = sin(theta) * sin(phi)
+             * These are the standard spherical coordinates.
+             *
+             * The example setup describes a detector array of
+             * 128x128 detectors ranging from 0 to pi for the polar angle
+             * theta and from 0 to 2 pi for the azimuth angle phi.
+             *
+             * If the calculation is only supposed to be done for a single azimuth
+             * or polar angle, it will use the respective minimal angle.
+             *
+             * @param    observation_id_extern
+             *           int index that identifies each block on the GPU
+             *           to compute the observation direction
+             *
+             * @return   unit vector pointing in observation direction
+             *           type: float3_X
+             */
+            HDINLINE float3_X observationDirection(const int observation_id_extern)
+            {
+                /* generate two indices from single block index */
+                /** split the linear index with parameters::nPhi as split distance
+                 * pseudo-code:
+                 * indexTheta = index / nPhi
+                 * indexPhi   = index % nPhi
+                 */
+                /** get index for computing angle theta: */
+                const int indexTheta = observation_id_extern / parameters::nPhi;
+
+                /** step width angle theta (0 if only a single theta value is requested) */
+                const picongpu::float_64 deltaTheta = (parameters::nTheta > 1)
+                    ? (parameters::thetaMax - parameters::thetaMin) / (parameters::nTheta - 1.0)
+                    : 0.0;
+
+                /** compute observation angle theta, starting at thetaMin */
+                const picongpu::float_64 theta = indexTheta * deltaTheta + parameters::thetaMin;
+
+                /** get index for computing angle phi: */
+                const int indexPhi = observation_id_extern % parameters::nPhi;
+
+                /** step width angle phi (0 if only a single phi value is requested) */
+                const picongpu::float_64 deltaPhi = (parameters::nPhi > 1)
+                    ? (parameters::phiMax - parameters::phiMin) / (parameters::nPhi - 1.0)
+                    : 0.0;
+
+                /** compute observation angle phi, starting at phiMin (offset is added, same as for theta) */
+                const picongpu::float_64 phi = indexPhi * deltaPhi + parameters::phiMin;
+
+                /* evaluate sine and cosine of both angles in single precision */
+                picongpu::float_32 sinPhi;
+                picongpu::float_32 cosPhi;
+                picongpu::float_32 sinTheta;
+                picongpu::float_32 cosTheta;
+                pmacc::math::sincos(precisionCast<picongpu::float_32>(phi), sinPhi, cosPhi);
+                pmacc::math::sincos(precisionCast<picongpu::float_32>(theta), sinTheta, cosTheta);
+                /** observation unit vector: (sin(theta)*cos(phi), cos(theta), sin(theta)*sin(phi)) */
+                return float3_X(sinTheta * cosPhi, cosTheta, sinTheta * sinPhi);
+            }
+
+        } // namespace transitionRadiation
+    } // namespace plugins
 } // namespace picongpu
diff --git a/include/picongpu/param/unit.param b/include/picongpu/param/unit.param
index 3f8fed5e7b..2f5c181c8c 100644
--- a/include/picongpu/param/unit.param
+++ b/include/picongpu/param/unit.param
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Rene Widera, Marco Garten, Heiko Burau
+/* Copyright 2013-2021 Axel Huebl, Rene Widera, Marco Garten, Heiko Burau
  *
  * This file is part of PIConGPU.
  *
@@ -32,22 +32,23 @@ namespace picongpu
     /** Unit of time */
     constexpr float_64 UNIT_TIME = SI::DELTA_T_SI;
     /** Unit of length */
-    constexpr float_64 UNIT_LENGTH = UNIT_TIME*UNIT_SPEED;
+    constexpr float_64 UNIT_LENGTH = UNIT_TIME * UNIT_SPEED;
 
     namespace particles
     {
         /** Number of particles per makro particle (= macro particle weighting)
          *  unit: none */
-        constexpr float_X TYPICAL_NUM_PARTICLES_PER_MACROPARTICLE =
-            float_64( SI::BASE_DENSITY_SI * SI::CELL_WIDTH_SI * SI::CELL_HEIGHT_SI * SI::CELL_DEPTH_SI ) /
-            float_64( particles::TYPICAL_PARTICLES_PER_CELL );
-    }
+        constexpr float_X TYPICAL_NUM_PARTICLES_PER_MACROPARTICLE
+            = float_64(SI::BASE_DENSITY_SI * SI::CELL_WIDTH_SI * SI::CELL_HEIGHT_SI * SI::CELL_DEPTH_SI)
+            / float_64(particles::TYPICAL_PARTICLES_PER_CELL);
+    } // namespace particles
 
 
     /** Unit of mass */
     constexpr float_64 UNIT_MASS = SI::BASE_MASS_SI * double(particles::TYPICAL_NUM_PARTICLES_PER_MACROPARTICLE);
     /** Unit of charge */
-    constexpr float_64 UNIT_CHARGE = -1.0 * SI::BASE_CHARGE_SI * double(particles::TYPICAL_NUM_PARTICLES_PER_MACROPARTICLE);
+    constexpr float_64 UNIT_CHARGE
+        = -1.0 * SI::BASE_CHARGE_SI * double(particles::TYPICAL_NUM_PARTICLES_PER_MACROPARTICLE);
     /** Unit of energy */
     constexpr float_64 UNIT_ENERGY = (UNIT_MASS * UNIT_LENGTH * UNIT_LENGTH / (UNIT_TIME * UNIT_TIME));
     /** Unit of EField: V/m */
@@ -55,4 +56,4 @@ namespace picongpu
     //** Unit of BField: Tesla [T] = Vs/m^2 */
     constexpr float_64 UNIT_BFIELD = (UNIT_MASS / (UNIT_TIME * UNIT_CHARGE));
 
-}
+} // namespace picongpu
diff --git a/include/picongpu/param/xrayScattering.param b/include/picongpu/param/xrayScattering.param
new file mode 100644
index 0000000000..58985ac712
--- /dev/null
+++ b/include/picongpu/param/xrayScattering.param
@@ -0,0 +1,53 @@
+/* Copyright 2020-2021 Pawel Ordyna
+ *
+ * This file is part of PIConGPU.
+ *
+ * PIConGPU is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * PIConGPU is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with PIConGPU.
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "picongpu/simulation_defines.hpp"
+#include "picongpu/plugins/xrayScattering/beam/Side.hpp"
+
+/* preprocessor struct generator */
+#include <pmacc/preprocessor/struct.hpp>
+
+namespace picongpu
+{
+    namespace plugins
+    {
+        namespace xrayScattering
+        {
+            namespace beam
+            {
+                using namespace picongpu::plugins::xrayScattering::beam;
+                /* Choose the type aliased by ProbingSide from:
+                 *  - ZSide
+                 *  - YSide
+                 *  - XSide
+                 *  - ZRSide
+                 *  - YRSide
+                 *  - XRSide
+                 */
+                using ProbingSide = ZSide;
+
+                PMACC_STRUCT(
+                    RotationParam, // name handed to the PMACC_STRUCT preprocessor struct generator
+                    (PMACC_C_VALUE(float_X, yawAngle, 0))(PMACC_C_VALUE(float_X, pitchAngle, 0)));
+            } // namespace beam
+        } // namespace xrayScattering
+    } // namespace plugins
+} // namespace picongpu
diff --git a/include/picongpu/particles/InitFunctors.hpp b/include/picongpu/particles/InitFunctors.hpp
index aca4bb1c6f..d5c52ed217 100644
--- a/include/picongpu/particles/InitFunctors.hpp
+++ b/include/picongpu/particles/InitFunctors.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2014-2020 Rene Widera
+/* Copyright 2014-2021 Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -44,220 +44,184 @@
 
 namespace picongpu
 {
-
-namespace particles
-{
-
-/** call a functor
- *
- * @tparam T_Functor unary lambda functor
- *                   operator() must take two params
- *                      - first: storage tuple
- *                      - second: current time step
- */
-template<typename T_Functor = bmpl::_1>
-struct CallFunctor
-{
-    using Functor = T_Functor;
-
-    HINLINE void operator()(
-        const uint32_t currentStep
-    )
-    {
-        Functor()( currentStep );
-    }
-};
-
-/** Create particle distribution from a normalized density profile
- *
- * Create particles inside a species. The created particles are macroscopically
- * distributed according to a given normalized density profile
- * (`T_DensityFunctor`). Their microscopic position inside individual cells is
- * determined by the `T_PositionFunctor`.
- *
- * @note FillAllGaps is automatically called after creation.
- *
- * @tparam T_DensityFunctor unary lambda functor with profile description,
- *                          see density.param,
- *                          example: picongpu::particles::densityProfiles::Homogenous
- * @tparam T_PositionFunctor unary lambda functor with position description,
- *                           see particle.param,
- *                           examples: picongpu::particles::startPosition::Quiet,
- *                                     picongpu::particles::startPosition::Random
- * @tparam T_SpeciesType type or name as boost::mpl::string of the used species,
- *                       see speciesDefinition.param
- */
-template<
-    typename T_DensityFunctor,
-    typename T_PositionFunctor,
-    typename T_SpeciesType = bmpl::_1
->
-struct CreateDensity
-{
-    using SpeciesType = pmacc::particles::meta::FindByNameOrType_t<
-        VectorAllSpecies,
-        T_SpeciesType
-    >;
-    using FrameType = typename SpeciesType::FrameType;
-
-
-    using UserDensityFunctor = typename bmpl::apply1<T_DensityFunctor, SpeciesType>::type;
-    /* add interface for compile time interface validation*/
-    using DensityFunctor = densityProfiles::IProfile<UserDensityFunctor>;
-
-    using UserPositionFunctor = typename bmpl::apply1<T_PositionFunctor, SpeciesType>::type;
-    /* add interface for compile time interface validation*/
-    using PositionFunctor = manipulators::IUnary<UserPositionFunctor>;
-
-    HINLINE void operator()( const uint32_t currentStep )
-    {
-        DataConnector &dc = Environment<>::get().DataConnector();
-        auto speciesPtr = dc.get< SpeciesType >( FrameType::getName(), true );
-
-        DensityFunctor densityFunctor(currentStep);
-        PositionFunctor positionFunctor(currentStep);
-        speciesPtr->initDensityProfile(densityFunctor, positionFunctor, currentStep);
-
-        dc.releaseData( FrameType::getName() );
-    }
-};
-
-
-/** Generate particles in a species by deriving and manipulating from another species' particles
- *
- * Create particles in `T_DestSpeciesType` by deriving (copying) all particles
- * and their matching attributes (except `particleId`) from `T_SrcSpeciesType`.
- * During the derivation, the particle attributes in can be manipulated with
- * `T_ManipulateFunctor`.
- *
- * @note FillAllGaps is called on on T_DestSpeciesType after the derivation is
- *       finished.
- *       If the derivation also manipulates the T_SrcSpeciesType, e.g. in order
- *       to deactivate some particles for a move, FillAllGaps needs to be
- *       called for the T_SrcSpeciesType manually in the next step!
- *
- * @tparam T_Manipulator a pseudo-binary functor accepting two particle species:
- *                       destination and source,
- *                       @see picongpu::particles::manipulators
- * @tparam T_SrcSpeciesType type or name as boost::mpl::string of the source species
- * @tparam T_DestSpeciesType type or name as boost::mpl::string of the destination species
- * @tparam T_SrcFilter picongpu::particles::filter, particle filter type to
- *                     select particles in T_SrcSpeciesType to derive into
- *                     T_DestSpeciesType
- */
-template<
-    typename T_Manipulator,
-    typename T_SrcSpeciesType,
-    typename T_DestSpeciesType = bmpl::_1,
-    typename T_SrcFilter = filter::All
->
-struct ManipulateDerive
-{
-    using DestSpeciesType = pmacc::particles::meta::FindByNameOrType_t<
-        VectorAllSpecies,
-        T_DestSpeciesType
-    >;
-    using DestFrameType = typename DestSpeciesType::FrameType;
-    using SrcSpeciesType = pmacc::particles::meta::FindByNameOrType_t<
-        VectorAllSpecies,
-        T_SrcSpeciesType
-    >;
-    using SrcFrameType = typename SrcSpeciesType::FrameType;
-
-    using DestFunctor = typename bmpl::apply1<
-        T_Manipulator,
-        DestSpeciesType
-    >::type;
-
-    using SrcFilter = typename bmpl::apply1<
-        T_SrcFilter,
-        SrcSpeciesType
-    >::type;
-
-    /* note: this is a FilteredManipulator with filter::All for
-     * destination species, users can filter the destination directly via if's
-     * in the T_Manipulator.
-     */
-    using FilteredManipulator = manipulators::IBinary< DestFunctor >;
-    using SrcFilterInterfaced = filter::IUnary< SrcFilter >;
-
-    HINLINE void operator()( const uint32_t currentStep )
-    {
-        DataConnector &dc = Environment<>::get().DataConnector();
-        auto speciesPtr = dc.get< DestSpeciesType >( DestFrameType::getName(), true );
-        auto srcSpeciesPtr = dc.get< SrcSpeciesType >( SrcFrameType::getName(), true );
-
-        FilteredManipulator filteredManipulator( currentStep );
-        SrcFilterInterfaced srcFilter( currentStep );
-
-        speciesPtr->deviceDeriveFrom( *srcSpeciesPtr, filteredManipulator, srcFilter );
-
-        dc.releaseData( DestFrameType::getName() );
-        dc.releaseData( SrcFrameType::getName() );
-    }
-};
-
-
-/** Generate particles in a species by deriving from another species' particles
- *
- * Create particles in `T_DestSpeciesType` by deriving (copying) all particles
- * and their matching attributes (except `particleId`) from `T_SrcSpeciesType`.
- *
- * @note FillAllGaps is called on on `T_DestSpeciesType` after the derivation is
- *       finished.
- *
- * @tparam T_SrcSpeciesType type or name as boost::mpl::string of the source species
- * @tparam T_DestSpeciesType type or name as boost::mpl::string of the destination species
- * @tparam T_Filter picongpu::particles::filter,
- *                  particle filter type to select source particles to derive
- */
-template<
-    typename T_SrcSpeciesType,
-    typename T_DestSpeciesType = bmpl::_1,
-    typename T_Filter = filter::All
->
-struct Derive : ManipulateDerive<
-    manipulators::generic::None,
-    T_SrcSpeciesType,
-    T_DestSpeciesType,
-    T_Filter
->
-{
-};
-
-
-/** Generate a valid, contiguous list of particle frames
- *
- * Some operations, such as deactivating or adding particles to a particle
- * species can generate "gaps" in our internal particle storage, a list
- * of frames.
- *
- * This operation copies all particles from the end of the frame list to
- * "gaps" in the beginning of the frame list.
- * After execution, the requirement that all particle frames must be filled
- * contiguously with valid particles and that all frames but the last are full
- * is fulfilled.
- *
- * @tparam T_SpeciesType type or name as boost::mpl::string of the particle species
- *                       to fill gaps in memory
- */
-template< typename T_SpeciesType = bmpl::_1 >
-struct FillAllGaps
-{
-    using SpeciesType = pmacc::particles::meta::FindByNameOrType_t<
-        VectorAllSpecies,
-        T_SpeciesType
-    >;
-    using FrameType = typename SpeciesType::FrameType;
-
-    HINLINE void operator()( const uint32_t currentStep )
+    namespace particles
     {
-        DataConnector &dc = Environment<>::get().DataConnector();
-        auto speciesPtr = dc.get< SpeciesType >( FrameType::getName(), true );
-        speciesPtr->fillAllGaps();
-        dc.releaseData( FrameType::getName() );
-    }
-};
-
-} // namespace particles
+        /** call a functor
+         *
+         * @tparam T_Functor default-constructible functor type;
+         *                   its operator() is called with one argument:
+         *                      - the current time step
+         *                   (a temporary T_Functor is default-constructed for every call)
+         */
+        template<typename T_Functor = bmpl::_1>
+        struct CallFunctor
+        {
+            using Functor = T_Functor;
+
+            HINLINE void operator()(const uint32_t currentStep)
+            {
+                Functor()(currentStep);
+            }
+        };
+
+        /** Create particle distribution from a normalized density profile
+         *
+         * Create particles inside a species. The created particles are macroscopically
+         * distributed according to a given normalized density profile
+         * (`T_DensityFunctor`). Their microscopic position inside individual cells is
+         * determined by the `T_PositionFunctor`.
+         *
+         * @note FillAllGaps is automatically called after creation.
+         *
+         * @tparam T_DensityFunctor unary lambda functor with profile description,
+         *                          see density.param,
+         *                          example: picongpu::particles::densityProfiles::Homogenous
+         * @tparam T_PositionFunctor unary lambda functor with position description,
+         *                           see particle.param,
+         *                           examples: picongpu::particles::startPosition::Quiet,
+         *                                     picongpu::particles::startPosition::Random
+         * @tparam T_SpeciesType type or name as boost::mpl::string of the used species,
+         *                       see speciesDefinition.param
+         */
+        template<typename T_DensityFunctor, typename T_PositionFunctor, typename T_SpeciesType = bmpl::_1>
+        struct CreateDensity
+        {
+            using SpeciesType = pmacc::particles::meta::FindByNameOrType_t<VectorAllSpecies, T_SpeciesType>;
+            using FrameType = typename SpeciesType::FrameType;
+
+
+            using UserDensityFunctor = typename bmpl::apply1<T_DensityFunctor, SpeciesType>::type;
+            /* add interface for compile time interface validation*/
+            using DensityFunctor = densityProfiles::IProfile<UserDensityFunctor>;
+
+            using UserPositionFunctor = typename bmpl::apply1<T_PositionFunctor, SpeciesType>::type;
+            /* add interface for compile time interface validation*/
+            using PositionFunctor = manipulators::IUnary<UserPositionFunctor>;
+
+            HINLINE void operator()(const uint32_t currentStep)
+            {
+                DataConnector& dc = Environment<>::get().DataConnector();
+                auto speciesPtr = dc.get<SpeciesType>(FrameType::getName(), true);
+
+                DensityFunctor densityFunctor(currentStep);
+                PositionFunctor positionFunctor(currentStep);
+                speciesPtr->initDensityProfile(densityFunctor, positionFunctor, currentStep);
+
+                dc.releaseData(FrameType::getName());
+            }
+        };
+
+
+        /** Generate particles in a species by deriving and manipulating from another species' particles
+         *
+         * Create particles in `T_DestSpeciesType` by deriving (copying) all particles
+         * and their matching attributes (except `particleId`) from `T_SrcSpeciesType`.
+         * During the derivation, the particle attributes can be manipulated with
+         * `T_Manipulator`.
+         *
+         * @note FillAllGaps is called on T_DestSpeciesType after the derivation is
+         *       finished.
+         *       If the derivation also manipulates the T_SrcSpeciesType, e.g. in order
+         *       to deactivate some particles for a move, FillAllGaps needs to be
+         *       called for the T_SrcSpeciesType manually in the next step!
+         *
+         * @tparam T_Manipulator a pseudo-binary functor accepting two particle species:
+         *                       destination and source,
+         *                       @see picongpu::particles::manipulators
+         * @tparam T_SrcSpeciesType type or name as boost::mpl::string of the source species
+         * @tparam T_DestSpeciesType type or name as boost::mpl::string of the destination species
+         * @tparam T_SrcFilter picongpu::particles::filter, particle filter type to
+         *                     select particles in T_SrcSpeciesType to derive into
+         *                     T_DestSpeciesType
+         */
+        template<
+            typename T_Manipulator,
+            typename T_SrcSpeciesType,
+            typename T_DestSpeciesType = bmpl::_1,
+            typename T_SrcFilter = filter::All>
+        struct ManipulateDerive
+        {
+            using DestSpeciesType = pmacc::particles::meta::FindByNameOrType_t<VectorAllSpecies, T_DestSpeciesType>;
+            using DestFrameType = typename DestSpeciesType::FrameType;
+            using SrcSpeciesType = pmacc::particles::meta::FindByNameOrType_t<VectorAllSpecies, T_SrcSpeciesType>;
+            using SrcFrameType = typename SrcSpeciesType::FrameType;
+
+            using DestFunctor = typename bmpl::apply1<T_Manipulator, DestSpeciesType>::type;
+
+            using SrcFilter = typename bmpl::apply1<T_SrcFilter, SrcSpeciesType>::type;
+
+            /* note: this is a FilteredManipulator with filter::All for
+             * destination species, users can filter the destination directly via if's
+             * in the T_Manipulator.
+             */
+            using FilteredManipulator = manipulators::IBinary<DestFunctor>;
+            using SrcFilterInterfaced = filter::IUnary<SrcFilter>;
+
+            HINLINE void operator()(const uint32_t currentStep)
+            {
+                DataConnector& dc = Environment<>::get().DataConnector();
+                auto speciesPtr = dc.get<DestSpeciesType>(DestFrameType::getName(), true);
+                auto srcSpeciesPtr = dc.get<SrcSpeciesType>(SrcFrameType::getName(), true);
+
+                FilteredManipulator filteredManipulator(currentStep);
+                SrcFilterInterfaced srcFilter(currentStep);
+
+                speciesPtr->deviceDeriveFrom(*srcSpeciesPtr, filteredManipulator, srcFilter);
+
+                dc.releaseData(DestFrameType::getName());
+                dc.releaseData(SrcFrameType::getName());
+            }
+        };
+
+
+        /** Generate particles in a species by deriving from another species' particles
+         *
+         * Create particles in `T_DestSpeciesType` by deriving (copying) all particles
+         * and their matching attributes (except `particleId`) from `T_SrcSpeciesType`.
+         *
+         * @note FillAllGaps is called on `T_DestSpeciesType` after the derivation is
+         *       finished.
+         *
+         * @tparam T_SrcSpeciesType type or name as boost::mpl::string of the source species
+         * @tparam T_DestSpeciesType type or name as boost::mpl::string of the destination species
+         * @tparam T_Filter picongpu::particles::filter,
+         *                  particle filter type to select source particles to derive
+         */
+        template<typename T_SrcSpeciesType, typename T_DestSpeciesType = bmpl::_1, typename T_Filter = filter::All>
+        struct Derive : ManipulateDerive<manipulators::generic::None, T_SrcSpeciesType, T_DestSpeciesType, T_Filter>
+        {
+        };
+
+
+        /** Generate a valid, contiguous list of particle frames
+         *
+         * Some operations, such as deactivating or adding particles to a particle
+         * species can generate "gaps" in our internal particle storage, a list
+         * of frames.
+         *
+         * This operation copies all particles from the end of the frame list to
+         * "gaps" in the beginning of the frame list.
+         * After execution, the requirement that all particle frames must be filled
+         * contiguously with valid particles and that all frames but the last are full
+         * is fulfilled.
+         *
+         * @tparam T_SpeciesType type or name as boost::mpl::string of the particle species
+         *                       to fill gaps in memory
+         */
+        template<typename T_SpeciesType = bmpl::_1>
+        struct FillAllGaps
+        {
+            using SpeciesType = pmacc::particles::meta::FindByNameOrType_t<VectorAllSpecies, T_SpeciesType>;
+            using FrameType = typename SpeciesType::FrameType;
+
+            HINLINE void operator()(const uint32_t currentStep)
+            {
+                DataConnector& dc = Environment<>::get().DataConnector();
+                auto speciesPtr = dc.get<SpeciesType>(FrameType::getName(), true);
+                speciesPtr->fillAllGaps();
+                dc.releaseData(FrameType::getName());
+            }
+        };
+
+    } // namespace particles
 } // namespace picongpu
diff --git a/include/picongpu/particles/InterpolationForPusher.hpp b/include/picongpu/particles/InterpolationForPusher.hpp
index 53645f8c00..0215962f9e 100644
--- a/include/picongpu/particles/InterpolationForPusher.hpp
+++ b/include/picongpu/particles/InterpolationForPusher.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2015-2020 Richard Pausch
+/* Copyright 2015-2021 Richard Pausch
  *
  * This file is part of PIConGPU.
  *
@@ -18,72 +18,63 @@
  */
 
 
-
-
 #pragma once
 
 namespace picongpu
 {
-
-/** functor for particle field interpolator
- *
- * This functor is a simplification of the full
- * field to particle interpolator that can be used in the
- * particle pusher
- */
-template< typename T_Field2PartInt, typename T_MemoryType, typename T_FieldPosition >
-struct InterpolationForPusher
-{
-    using Field2PartInt = T_Field2PartInt;
-
-    HDINLINE
-    InterpolationForPusher( const T_MemoryType& mem, const T_FieldPosition& fieldPos )
-        : m_mem( mem ), m_fieldPos( fieldPos )
+    /** functor for particle field interpolator
+     *
+     * This functor is a simplification of the full
+     * field to particle interpolator that can be used in the
+     * particle pusher
+     */
+    template<typename T_Field2PartInt, typename T_MemoryType, typename T_FieldPosition>
+    struct InterpolationForPusher
     {
-    }
+        using Field2PartInt = T_Field2PartInt;
 
-    /* apply shift policy before interpolation */
-    template< typename T_PosType, typename T_ShiftPolicy >
-    HDINLINE
-    float3_X operator()( const T_PosType& pos, const T_ShiftPolicy& shiftPolicy ) const
-    {
-        return Field2PartInt()( shiftPolicy.memory(m_mem, pos),
-                                shiftPolicy.position(pos),
-                                m_fieldPos );
-    }
+        HDINLINE
+        InterpolationForPusher(const T_MemoryType& mem, const T_FieldPosition& fieldPos)
+            : m_mem(mem)
+            , m_fieldPos(fieldPos)
+        {
+        }
 
-    /* interpolation using given memory and position */
-    template< typename T_PosType >
-    HDINLINE
-    float3_X operator()( const T_PosType& pos ) const
-    {
-        return Field2PartInt()( m_mem,
-                                pos,
-                                m_fieldPos );
-    }
+        /* apply shift policy before interpolation */
+        template<typename T_PosType, typename T_ShiftPolicy>
+        HDINLINE float3_X operator()(const T_PosType& pos, const T_ShiftPolicy& shiftPolicy) const
+        {
+            return Field2PartInt()(shiftPolicy.memory(m_mem, pos), shiftPolicy.position(pos), m_fieldPos);
+        }
 
+        /* interpolation using given memory and position */
+        template<typename T_PosType>
+        HDINLINE float3_X operator()(const T_PosType& pos) const
+        {
+            return Field2PartInt()(m_mem, pos, m_fieldPos);
+        }
 
 
-private:
-    PMACC_ALIGN( m_mem, T_MemoryType );
-    PMACC_ALIGN( m_fieldPos, const T_FieldPosition );
-};
+    private:
+        PMACC_ALIGN(m_mem, T_MemoryType);
+        PMACC_ALIGN(m_fieldPos, const T_FieldPosition);
+    };
 
 
-/** functor to create particle field interpolator
- *
- * required to get interpolator for pusher
- */
-template<typename T_Field2PartInt>
-struct CreateInterpolationForPusher
-{
-    template< typename T_MemoryType, typename T_FieldPosition >
-    HDINLINE
-    InterpolationForPusher< T_Field2PartInt, T_MemoryType, T_FieldPosition >
-    operator()( const T_MemoryType& mem, const T_FieldPosition& fieldPos )
+    /** functor to create particle field interpolator
+     *
+     * required to get interpolator for pusher
+     */
+    template<typename T_Field2PartInt>
+    struct CreateInterpolationForPusher
     {
-        return InterpolationForPusher< T_Field2PartInt, T_MemoryType, T_FieldPosition >( mem, fieldPos );
-    }
-};
+        template<typename T_MemoryType, typename T_FieldPosition>
+        HDINLINE InterpolationForPusher<T_Field2PartInt, T_MemoryType, T_FieldPosition> operator()(
+            const T_MemoryType& mem,
+            const T_FieldPosition& fieldPos)
+        {
+            return InterpolationForPusher<T_Field2PartInt, T_MemoryType, T_FieldPosition>(mem, fieldPos);
+        }
+    };
 
 } // namespace picongpu
diff --git a/include/picongpu/particles/Manipulate.hpp b/include/picongpu/particles/Manipulate.hpp
index d587140563..24c49137c8 100644
--- a/include/picongpu/particles/Manipulate.hpp
+++ b/include/picongpu/particles/Manipulate.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2014-2020 Rene Widera, Sergei Bastrakov
+/* Copyright 2014-2021 Rene Widera, Sergei Bastrakov
  *
  * This file is part of PIConGPU.
  *
@@ -32,121 +32,84 @@
 
 namespace picongpu
 {
-namespace particles
-{
-namespace detail
-{
-    /** Operator to create a filtered functor
-     */
-    template<
-        typename T_Manipulator,
-        typename T_Species,
-        typename T_Filter
-    >
-    struct MakeUnaryFilteredFunctor
+    namespace particles
     {
-    private:
-        using Species = pmacc::particles::meta::FindByNameOrType_t<
-            VectorAllSpecies,
-            T_Species
-        >;
-        using SpeciesFunctor = typename bmpl::apply1<
-            T_Manipulator,
-            Species
-        >::type;
-        using ParticleFilter = typename bmpl::apply1<
-            T_Filter,
-            Species
-        >::type;
-    public:
-        using type = manipulators::IUnary<
-            SpeciesFunctor,
-            ParticleFilter
-        >;
-    };
-} // namespace detail
+        namespace detail
+        {
+            /** Operator to create a filtered functor
+             */
+            template<typename T_Manipulator, typename T_Species, typename T_Filter>
+            struct MakeUnaryFilteredFunctor
+            {
+            private:
+                using Species = pmacc::particles::meta::FindByNameOrType_t<VectorAllSpecies, T_Species>;
+                using SpeciesFunctor = typename bmpl::apply1<T_Manipulator, Species>::type;
+                using ParticleFilter = typename bmpl::apply1<T_Filter, Species>::type;
 
-    /** Run a user defined manipulation for each particle of a species
-     *
-     * Allows to manipulate attributes of existing particles in a species with
-     * arbitrary unary functors ("manipulators").
-     *
-     * @warning Does NOT call FillAllGaps after manipulation! If the
-     *          manipulation deactivates particles or creates "gaps" in any
-     *          other way, FillAllGaps needs to be called for the
-     *          `T_Species` manually in the next step!
-     *
-     * @tparam T_Manipulator unary lambda functor accepting one particle
-     *                       species,
-     *                       @see picongpu::particles::manipulators
-     * @tparam T_Species type or name as boost::mpl::string of the used species
-     * @tparam T_Filter picongpu::particles::filter, particle filter type to
-     *                  select particles in `T_Species` to manipulate
-     */
-    template<
-        typename T_Manipulator,
-        typename T_Species = bmpl::_1,
-        typename T_Filter = filter::All
-    >
-    struct Manipulate : public pmacc::particles::algorithm::CallForEach<
-        pmacc::particles::meta::FindByNameOrType<
-            VectorAllSpecies,
-            T_Species
-        >,
-        detail::MakeUnaryFilteredFunctor<
-            T_Manipulator,
-            T_Species,
-            T_Filter
-        >
-    >
-    {
-    };
+            public:
+                using type = manipulators::IUnary<SpeciesFunctor, ParticleFilter>;
+            };
+        } // namespace detail
 
+        /** Run a user-defined manipulation for each particle of a species
+         *
+         * Allows manipulating attributes of existing particles in a species with
+         * arbitrary unary functors ("manipulators").
+         *
+         * @warning Does NOT call FillAllGaps after manipulation! If the
+         *          manipulation deactivates particles or creates "gaps" in any
+         *          other way, FillAllGaps needs to be called for the
+         *          `T_Species` manually in the next step!
+         *
+         * @tparam T_Manipulator unary lambda functor accepting one particle
+         *                       species,
+         *                       @see picongpu::particles::manipulators
+         * @tparam T_Species species to use, given as type or boost::mpl::string name
+         * @tparam T_Filter picongpu::particles::filter, particle filter type to
+         *                  select particles in `T_Species` to manipulate
+         */
+        template<typename T_Manipulator, typename T_Species = bmpl::_1, typename T_Filter = filter::All>
+        struct Manipulate
+            : public pmacc::particles::algorithm::CallForEach<
+                  pmacc::particles::meta::FindByNameOrType<VectorAllSpecies, T_Species>,
+                  detail::MakeUnaryFilteredFunctor<T_Manipulator, T_Species, T_Filter>>
+        {
+        };
 
-    /** Apply a manipulation for each particle of a species or a sequence of
-     *  species
-     *
-     * This function provides a high-level interface to particle manipulation
-     * from simulation stages and plugins, but not .param files. The common
-     * workflow is as follows:
-     * - select the species to manipulate, often by filtering VectorAllSpecies
-     * - define a manipulator type; in case the manipulator has a species type
-     * as a template parameter, use the bmpl::_1 placeholder instead
-     * - define a filter type when necessary
-     * - call manipulate()
-     *
-     * This is a function-style wrapper around creating a Manipulate object and
-     * calling its operator(). Unlike Manipulate, it supports both single
-     * species and sequences of species.
-     *
-     * @tparam T_Manipulator unary lambda functor accepting one particle
-     *                       species, @see picongpu::particles::manipulators
-     * @tparam T_Species a single species or a sequence of species; in both
-     *                   cases each species is defined by a type or a name
-     * @tparam T_Filter picongpu::particles::filter, particle filter type to
-     *                  select particles in `T_Species` to manipulate via
-     *                  `T_DestSpeciesType`
-     *
-     * @param currentStep index of the current time iteration
-     */
-    template<
-        typename T_Manipulator,
-        typename T_Species,
-        typename T_Filter = filter::All
-    >
-    inline void manipulate( uint32_t const currentStep )
-    {
-        using SpeciesSeq = typename pmacc::ToSeq< T_Species >::type;
-        using Functor = Manipulate<
-            T_Manipulator,
-            bmpl::_1,
-            T_Filter
-        >;
-        pmacc::meta::ForEach<
-            SpeciesSeq,
-            Functor
-        > forEach;
-        forEach( currentStep );
-    }
-} //namespace particles
-} //namespace picongpu
+
+        /** Apply a manipulation for each particle of a species or a sequence of
+         *  species
+         *
+         * This function provides a high-level interface to particle manipulation
+         * from simulation stages and plugins, but not .param files. The common
+         * workflow is as follows:
+         * - select the species to manipulate, often by filtering VectorAllSpecies
+         * - define a manipulator type; in case the manipulator has a species type
+         *   as a template parameter, use the bmpl::_1 placeholder instead
+         * - define a filter type when necessary
+         * - call manipulate()
+         *
+         * This is a function-style wrapper around creating a Manipulate object and
+         * calling its operator(). Unlike Manipulate, it supports both single
+         * species and sequences of species.
+         *
+         * @tparam T_Manipulator unary lambda functor accepting one particle
+         *                       species, @see picongpu::particles::manipulators
+         * @tparam T_Species a single species or a sequence of species; in both
+         *                   cases each species is defined by a type or a name
+         * @tparam T_Filter picongpu::particles::filter, particle filter type to
+         *                  select the particles to manipulate in each species of
+         *                  `T_Species`
+         *
+         * @param currentStep index of the current time iteration
+         */
+        template<typename T_Manipulator, typename T_Species, typename T_Filter = filter::All>
+        inline void manipulate(uint32_t const currentStep)
+        {
+            using SpeciesSeq = typename pmacc::ToSeq<T_Species>::type;
+            using Functor = Manipulate<T_Manipulator, bmpl::_1, T_Filter>;
+            pmacc::meta::ForEach<SpeciesSeq, Functor> forEach;
+            forEach(currentStep);
+        }
+    } // namespace particles
+} // namespace picongpu
diff --git a/include/picongpu/particles/Particles.hpp b/include/picongpu/particles/Particles.hpp
index 0fad4e54fe..e55de31dc5 100644
--- a/include/picongpu/particles/Particles.hpp
+++ b/include/picongpu/particles/Particles.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Heiko Burau, Rene Widera, Felix Schmitt,
+/* Copyright 2013-2021 Axel Huebl, Heiko Burau, Rene Widera, Felix Schmitt,
  *                     Marco Garten, Alexander Grund
  *
  * This file is part of PIConGPU.
@@ -47,227 +47,181 @@
 
 namespace picongpu
 {
-using namespace pmacc;
+    using namespace pmacc;
 
-#if( PMACC_CUDA_ENABLED != 1 )
-/* dummy because we are not using mallocMC with cupla
- * DeviceHeap is defined in `mallocMC.param`
- */
-struct DeviceHeap
-{
-    using AllocatorHandle = int;
-
-    int getAllocatorHandle()
+#if(!BOOST_LANG_CUDA && !BOOST_COMP_HIP)
+    /* dummy because we are not using mallocMC with cupla
+     * DeviceHeap is defined in `mallocMC.param`
+     */
+    struct DeviceHeap
     {
-        return 0;
-    }
-};
+        using AllocatorHandle = int;
+
+        int getAllocatorHandle()
+        {
+            return 0;
+        }
+    };
 #endif
 
-/** particle species
- *
- * @tparam T_Name name of the species [type boost::mpl::string]
- * @tparam T_Attributes sequence with attributes [type boost::mpl forward sequence]
- * @tparam T_Flags sequence with flags e.g. solver [type boost::mpl forward sequence]
- */
-template<
-    typename T_Name,
-    typename T_Flags,
-    typename T_Attributes
->
-class Particles : public ParticlesBase<
-    ParticleDescription<
-        T_Name,
-        SuperCellSize,
-        T_Attributes,
-        T_Flags,
-        typename bmpl::if_<
-            // check if alias boundaryCondition is defined for the species
-            bmpl::contains<
-                T_Flags,
-                typename GetKeyFromAlias<
-                    T_Flags,
-                    boundaryCondition< >
-                >::type
-            >,
-            // resolve the alias
-            typename pmacc::traits::Resolve<
-                typename GetKeyFromAlias<
-                    T_Flags,
-                    boundaryCondition< >
-                >::type
-            >::type,
-            // fallback if the species has not defined the alias boundaryCondition
-            pmacc::HandleGuardRegion<
-                pmacc::particles::policies::ExchangeParticles,
-                particles::boundary::CallPluginsAndDeleteParticles
-            >
-        >::type
-    >,
-    MappingDesc,
-    DeviceHeap
->, public ISimulationData
-{
-public:
-
-    using SpeciesParticleDescription = pmacc::ParticleDescription<
-        T_Name,
-        SuperCellSize,
-        T_Attributes,
-        T_Flags,
-        typename bmpl::if_<
-            // check if alias boundaryCondition is defined for the species
-            bmpl::contains<
-                T_Flags,
-                typename GetKeyFromAlias<
-                    T_Flags,
-                    boundaryCondition< >
-                >::type
-            >,
-            // resolve the alias
-            typename pmacc::traits::Resolve<
-                typename GetKeyFromAlias<
-                    T_Flags,
-                    boundaryCondition< >
-                >::type
-            >::type,
-            // fallback if the species has not defined the alias boundaryCondition
-            pmacc::HandleGuardRegion<
-                pmacc::particles::policies::ExchangeParticles,
-                particles::boundary::CallPluginsAndDeleteParticles
-            >
-        >::type
-    >;
-    using ParticlesBaseType = ParticlesBase<SpeciesParticleDescription, picongpu::MappingDesc, DeviceHeap>;
-    using FrameType = typename ParticlesBaseType::FrameType;
-    using FrameTypeBorder = typename ParticlesBaseType::FrameTypeBorder;
-    using ParticlesBoxType = typename ParticlesBaseType::ParticlesBoxType;
-
-
-    Particles(const std::shared_ptr<DeviceHeap>& heap, picongpu::MappingDesc cellDescription, SimulationDataId datasetID);
-
-    void createParticleBuffer();
-
-    void update( uint32_t const currentStep );
-
-    template<typename T_DensityFunctor, typename T_PositionFunctor>
-    void initDensityProfile(T_DensityFunctor& densityFunctor, T_PositionFunctor& positionFunctor, const uint32_t currentStep);
-
-    template<
-        typename T_SrcName,
-        typename T_SrcAttributes,
-        typename T_SrcFlags,
-        typename T_ManipulateFunctor,
-        typename T_SrcFilterFunctor
-    >
-    void deviceDeriveFrom(
-        Particles<
-            T_SrcName,
-            T_SrcAttributes,
-            T_SrcFlags
-        >& src,
-        T_ManipulateFunctor& manipulateFunctor,
-        T_SrcFilterFunctor& srcFilterFunctor
-    );
-
-    SimulationDataId getUniqueId() override;
-
-    /* sync device data to host
+    /** particle species
      *
-     * ATTENTION: - in the current implementation only supercell meta data are copied!
-     *            - the shared (between all species) mallocMC buffer must be copied once
-     *              by the user
+     * @tparam T_Name name of the species [type boost::mpl::string]
+     * @tparam T_Flags sequence with flags e.g. solver [type boost::mpl forward sequence]
+     * @tparam T_Attributes sequence with attributes [type boost::mpl forward sequence]
      */
-    void synchronize() override;
-
-    void syncToDevice() override;
-
-    static pmacc::traits::StringProperty getStringProperties()
+    template<typename T_Name, typename T_Flags, typename T_Attributes>
+    class Particles
+        : public ParticlesBase<
+              ParticleDescription<
+                  T_Name,
+                  SuperCellSize,
+                  T_Attributes,
+                  T_Flags,
+                  typename bmpl::if_<
+                      // check if alias boundaryCondition is defined for the species
+                      bmpl::contains<T_Flags, typename GetKeyFromAlias<T_Flags, boundaryCondition<>>::type>,
+                      // resolve the alias
+                      typename pmacc::traits::Resolve<
+                          typename GetKeyFromAlias<T_Flags, boundaryCondition<>>::type>::type,
+                      // fallback if the species has not defined the alias boundaryCondition
+                      pmacc::HandleGuardRegion<
+                          pmacc::particles::policies::ExchangeParticles,
+                          particles::boundary::CallPluginsAndDeleteParticles>>::type>,
+              MappingDesc,
+              DeviceHeap>
+        , public ISimulationData
     {
-        pmacc::traits::StringProperty propList;
-        const DataSpace<DIM3> periodic =
-            Environment<simDim>::get().EnvironmentController().getCommunicator().getPeriodic();
-
-        for( uint32_t i = 1; i < NumberOfExchanges<simDim>::value; ++i )
+    public:
+        using SpeciesParticleDescription = pmacc::ParticleDescription<
+            T_Name,
+            SuperCellSize,
+            T_Attributes,
+            T_Flags,
+            typename bmpl::if_<
+                // check if alias boundaryCondition is defined for the species
+                bmpl::contains<T_Flags, typename GetKeyFromAlias<T_Flags, boundaryCondition<>>::type>,
+                // resolve the alias
+                typename pmacc::traits::Resolve<typename GetKeyFromAlias<T_Flags, boundaryCondition<>>::type>::type,
+                // fallback if the species has not defined the alias boundaryCondition
+                pmacc::HandleGuardRegion<
+                    pmacc::particles::policies::ExchangeParticles,
+                    particles::boundary::CallPluginsAndDeleteParticles>>::type>;
+        using ParticlesBaseType = ParticlesBase<SpeciesParticleDescription, picongpu::MappingDesc, DeviceHeap>;
+        using FrameType = typename ParticlesBaseType::FrameType;
+        using FrameTypeBorder = typename ParticlesBaseType::FrameTypeBorder;
+        using ParticlesBoxType = typename ParticlesBaseType::ParticlesBoxType;
+
+
+        Particles(
+            const std::shared_ptr<DeviceHeap>& heap,
+            picongpu::MappingDesc cellDescription,
+            SimulationDataId datasetID);
+
+        void createParticleBuffer();
+
+        void update(uint32_t const currentStep);
+
+        template<typename T_DensityFunctor, typename T_PositionFunctor>
+        void initDensityProfile(
+            T_DensityFunctor& densityFunctor,
+            T_PositionFunctor& positionFunctor,
+            const uint32_t currentStep);
+
+        template<
+            typename T_SrcName,
+            typename T_SrcAttributes,
+            typename T_SrcFlags,
+            typename T_ManipulateFunctor,
+            typename T_SrcFilterFunctor>
+        void deviceDeriveFrom(
+            Particles<T_SrcName, T_SrcAttributes, T_SrcFlags>& src,
+            T_ManipulateFunctor& manipulateFunctor,
+            T_SrcFilterFunctor& srcFilterFunctor);
+
+        SimulationDataId getUniqueId() override;
+
+        /* sync device data to host
+         *
+         * ATTENTION: - in the current implementation only supercell meta data are copied!
+         *            - the shared (between all species) mallocMC buffer must be copied once
+         *              by the user
+         */
+        void synchronize() override;
+
+        void syncToDevice() override;
+
+        static pmacc::traits::StringProperty getStringProperties()
         {
-            // for each planar direction: left right top bottom back front
-            if( FRONT % i == 0 )
+            pmacc::traits::StringProperty propList;
+            const DataSpace<DIM3> periodic
+                = Environment<simDim>::get().EnvironmentController().getCommunicator().getPeriodic();
+
+            for(uint32_t i = 1; i < NumberOfExchanges<simDim>::value; ++i)
             {
-                const std::string directionName = ExchangeTypeNames()[i];
-                const DataSpace<DIM3> relDir = Mask::getRelativeDirections<DIM3>(i);
+                // for each planar direction: left right top bottom back front
+                if(FRONT % i == 0)
+                {
+                    const std::string directionName = ExchangeTypeNames()[i];
+                    const DataSpace<DIM3> relDir = Mask::getRelativeDirections<DIM3>(i);
 
-                const bool isPeriodic =
-                    (relDir * periodic) != DataSpace<DIM3>::create(0);
+                    const bool isPeriodic = (relDir * periodic) != DataSpace<DIM3>::create(0);
 
-                std::string boundaryName = "absorbing";
-                if( isPeriodic )
-                    boundaryName = "periodic";
+                    std::string boundaryName = "absorbing";
+                    if(isPeriodic)
+                        boundaryName = "periodic";
 
-                if( boundaryName == "absorbing" )
-                {
-                    propList[directionName]["param"] = std::string("without field correction");
-                }
-                else
-                {
-                    propList[directionName]["param"] = std::string("none");
-                }
+                    if(boundaryName == "absorbing")
+                    {
+                        propList[directionName]["param"] = std::string("without field correction");
+                    }
+                    else
+                    {
+                        propList[directionName]["param"] = std::string("none");
+                    }
 
-                propList[directionName]["name"] = boundaryName;
+                    propList[directionName]["name"] = boundaryName;
+                }
             }
+            return propList;
         }
-        return propList;
-    }
 
-private:
-    SimulationDataId m_datasetID;
+        template<typename T_Pusher>
+        void push(uint32_t const currentStep);
 
-    FieldE *fieldE;
-    FieldB *fieldB;
-};
+    private:
+        SimulationDataId m_datasetID;
 
-namespace traits
-{
-    template<
-        typename T_Name,
-        typename T_Attributes,
-        typename T_Flags
-    >
-    struct GetDataBoxType<
-        picongpu::Particles<
-            T_Name,
-            T_Attributes,
-            T_Flags
-       >
-    >
-    {
-        using type = typename picongpu::Particles<
-            T_Name,
-            T_Attributes,
-            T_Flags
-        >::ParticlesBoxType;
+        /** Get exchange memory size.
+         *
+         * @param ex exchange index calculated from pmacc::typ::ExchangeType, valid range: [0;27)
+         * @return exchange size in bytes
+         */
+        size_t exchangeMemorySize(uint32_t ex) const;
+
+        FieldE* fieldE;
+        FieldB* fieldB;
     };
-} //namespace traits
-} //namespace picongpu
+
+    namespace traits
+    {
+        template<typename T_Name, typename T_Flags, typename T_Attributes>
+        struct GetDataBoxType<picongpu::Particles<T_Name, T_Flags, T_Attributes>>
+        { // box type of a species; parameters named in picongpu::Particles declaration order
+            using type = typename picongpu::Particles<T_Name, T_Flags, T_Attributes>::ParticlesBoxType;
+        };
+    } // namespace traits
+} // namespace picongpu
 
 namespace pmacc
 {
-namespace traits
-{
-    template<
-        typename T_Name,
-        typename T_Flags,
-        typename T_Attributes
-    >
-    struct GetCTName<
-        ::picongpu::Particles<
-            T_Name,
-            T_Flags,
-            T_Attributes
-        >
-    >
+    namespace traits
     {
-        using type = T_Name;
-    };
+        template<typename T_Name, typename T_Flags, typename T_Attributes>
+        struct GetCTName<::picongpu::Particles<T_Name, T_Flags, T_Attributes>>
+        {
+            using type = T_Name;
+        };
 
-} // namepsace traits
+    } // namespace traits
 } // namespace pmacc
diff --git a/include/picongpu/particles/Particles.kernel b/include/picongpu/particles/Particles.kernel
index a06c24a03f..9906c1f88b 100644
--- a/include/picongpu/particles/Particles.kernel
+++ b/include/picongpu/particles/Particles.kernel
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Heiko Burau, Rene Widera, Wen Fu,
+/* Copyright 2013-2021 Axel Huebl, Heiko Burau, Rene Widera, Wen Fu,
  *                     Marco Garten, Alexander Grund, Richard Pausch
  *
  * This file is part of PIConGPU.
@@ -35,10 +35,6 @@
 #include <pmacc/nvidia/functors/Assign.hpp>
 #include <pmacc/mappings/threads/ThreadCollective.hpp>
 
-#include <pmacc/nvidia/rng/RNG.hpp>
-#include <pmacc/nvidia/rng/methods/Xor.hpp>
-#include <pmacc/nvidia/rng/distributions/Normal_float.hpp>
-
 #include <pmacc/particles/operations/Assign.hpp>
 #include <pmacc/particles/operations/Deselect.hpp>
 #include <pmacc/nvidia/atomic.hpp>
@@ -52,521 +48,383 @@
 
 namespace picongpu
 {
-
-/** derive new particles from a source species
- *
- * This functor prepares a source and destination particle box to call
- * a user defined functor which allows to derive new particles out of
- * another species.
- *
- * @tparam T_numWorkers number of workers
- */
-template< uint32_t T_numWorkers >
-struct KernelDeriveParticles
-{
-    /** frame-wise derive new particles
+    /** derive new particles from a source species
      *
-     * @tparam T_DestParBox pmacc::ParticlesBox, type of the destination species box
-     * @tparam T_SrcParBox pmacc::ParticlesBox, type of the source species box
-     * @tparam T_ManipulateFunctor type of the user functor to derive a particle
-     * @tparam T_Mapping mapping functor type
+     * This functor prepares a source and destination particle box to call
+     * a user defined functor which allows to derive new particles out of
+     * another species.
      *
-     * @param destBox particles box for the destination species
-     * @param srcBox particles box of the source species
-     * @param manipulateFunctor functor to derive a particle out of another one
-     *                          must fulfill the interface particles::manipulators::IManipulator
-     * @param srcFilterFunctor unary filter to select in the source species
-     *                         which particles to derive
-     * @param mapper functor to map a block to a supercell
+     * @tparam T_numWorkers number of workers
      */
-   template<
-        typename T_DestParBox,
-        typename T_SrcParBox,
-        typename T_ManipulateFunctor,
-        typename T_SrcFilterFunctor,
-        typename T_Mapping,
-        typename T_Acc
-    >
-    DINLINE void operator()(
-        T_Acc const & acc,
-        T_DestParBox destBox,
-        T_SrcParBox srcBox,
-        T_ManipulateFunctor manipulateFunctor,
-        T_SrcFilterFunctor srcFilterFunctor,
-        T_Mapping const mapper
-    ) const
+    template<uint32_t T_numWorkers>
+    struct KernelDeriveParticles
     {
-        using namespace pmacc::particles::operations;
-        using namespace mappings::threads;
-
-        using DestFramePtr = typename T_DestParBox::FramePtr;
-        using SrcFramePtr = typename T_SrcParBox::FramePtr;
-
-        constexpr uint32_t frameSize = pmacc::math::CT::volume< SuperCellSize >::type::value;
-        constexpr uint32_t numWorker = T_numWorkers;
-
-        uint32_t const workerIdx = threadIdx.x;
-
-        PMACC_SMEM(
-            acc,
-            srcFrame,
-            SrcFramePtr
-        );
-        PMACC_SMEM(
-            acc,
-            destFrame,
-            DestFramePtr
-        );
-
-        DataSpace< simDim > const superCellIdx = mapper.getSuperCellIndex( DataSpace< simDim >( blockIdx ) );
-
-        // offset of the superCell (in cells, without any guards) to the origin of the local domain
-        DataSpace< simDim > const localSuperCellOffset =
-            superCellIdx - mapper.getGuardingSuperCells( );
-
-        ForEachIdx<
-            IdxConfig<
-                1,
-                numWorker
-            >
-        > onlyMaster{ workerIdx };
-
-        onlyMaster(
-            [&](
-                uint32_t const,
-                uint32_t const
-            )
-            {
-                srcFrame = srcBox.getFirstFrame( superCellIdx );
-                if( srcFrame.isValid( ) )
+        /** frame-wise derive new particles
+         *
+         * @tparam T_DestParBox pmacc::ParticlesBox, type of the destination species box
+         * @tparam T_SrcParBox pmacc::ParticlesBox, type of the source species box
+         * @tparam T_ManipulateFunctor type of the user functor to derive a particle
+         * @tparam T_Mapping mapping functor type
+         *
+         * @param destBox particles box for the destination species
+         * @param srcBox particles box of the source species
+         * @param manipulateFunctor functor to derive a particle out of another one
+         *                          must fulfill the interface particles::manipulators::IManipulator
+         * @param srcFilterFunctor unary filter to select in the source species
+         *                         which particles to derive
+         * @param mapper functor to map a block to a supercell
+         */
+        template<
+            typename T_DestParBox,
+            typename T_SrcParBox,
+            typename T_ManipulateFunctor,
+            typename T_SrcFilterFunctor,
+            typename T_Mapping,
+            typename T_Acc>
+        DINLINE void operator()(
+            T_Acc const& acc,
+            T_DestParBox destBox,
+            T_SrcParBox srcBox,
+            T_ManipulateFunctor manipulateFunctor,
+            T_SrcFilterFunctor srcFilterFunctor,
+            T_Mapping const mapper) const
+        {
+            using namespace pmacc::particles::operations;
+            using namespace mappings::threads;
+
+            using DestFramePtr = typename T_DestParBox::FramePtr;
+            using SrcFramePtr = typename T_SrcParBox::FramePtr;
+
+            constexpr uint32_t frameSize = pmacc::math::CT::volume<SuperCellSize>::type::value;
+            constexpr uint32_t numWorker = T_numWorkers;
+
+            uint32_t const workerIdx = cupla::threadIdx(acc).x;
+
+            PMACC_SMEM(acc, srcFrame, SrcFramePtr);
+            PMACC_SMEM(acc, destFrame, DestFramePtr);
+
+            DataSpace<simDim> const superCellIdx = mapper.getSuperCellIndex(DataSpace<simDim>(cupla::blockIdx(acc)));
+
+            // offset of the superCell (in cells, without any guards) to the origin of the local domain
+            DataSpace<simDim> const localSuperCellOffset = superCellIdx - mapper.getGuardingSuperCells();
+
+            ForEachIdx<IdxConfig<1, numWorker>> onlyMaster{workerIdx};
+
+            onlyMaster([&](uint32_t const, uint32_t const) {
+                srcFrame = srcBox.getFirstFrame(superCellIdx);
+                if(srcFrame.isValid())
                 {
                     // we have something to clone
-                    destFrame = destBox.getEmptyFrame( );
+                    destFrame = destBox.getEmptyFrame(acc);
                 }
-            }
-        );
-
-        auto accManipulator = manipulateFunctor(
-            acc,
-            localSuperCellOffset,
-            WorkerCfg< numWorker >{ workerIdx }
-        );
-        auto accSrcFilter = srcFilterFunctor(
-            acc,
-            localSuperCellOffset,
-            WorkerCfg< numWorker >{ workerIdx }
-        );
-
-        __syncthreads( );
-
-        // move over all Frames
-        while( srcFrame.isValid( ) )
-        {
-            using ParticleDomCfg = IdxConfig<
-                frameSize,
-                numWorker
-            >;
-
-            // loop over all particles in the frame
-            ForEachIdx< ParticleDomCfg >{ workerIdx }
-            (
-                [&](
-                    uint32_t const linearIdx,
-                    uint32_t const
-                )
-                {
-                    auto parDest = destFrame[ linearIdx ];
-                    auto parSrc = srcFrame[ linearIdx ];
-                    if( parSrc[ multiMask_ ] != 1 )
-                        parSrc.setHandleInvalid( );
+            });
 
-                    if( accSrcFilter( acc, parSrc ) )
+            auto accManipulator = manipulateFunctor(acc, localSuperCellOffset, WorkerCfg<numWorker>{workerIdx});
+            auto accSrcFilter = srcFilterFunctor(acc, localSuperCellOffset, WorkerCfg<numWorker>{workerIdx});
+
+            cupla::__syncthreads(acc);
+
+            // move over all Frames
+            while(srcFrame.isValid())
+            {
+                using ParticleDomCfg = IdxConfig<frameSize, numWorker>;
+
+                // loop over all particles in the frame
+                ForEachIdx<ParticleDomCfg>{workerIdx}([&](uint32_t const linearIdx, uint32_t const) {
+                    auto parDest = destFrame[linearIdx];
+                    auto parSrc = srcFrame[linearIdx];
+                    if(parSrc[multiMask_] != 1)
+                        parSrc.setHandleInvalid();
+
+                    if(accSrcFilter(acc, parSrc))
                     {
-                        assign(
-                            parDest,
-                            deselect< particleId >( parSrc )
-                        );
-
-                        accManipulator(
-                            acc,
-                            parDest,
-                            parSrc
-                        );
+                        assign(parDest, deselect<particleId>(parSrc));
+
+                        accManipulator(acc, parDest, parSrc);
                     }
-                }
-            );
+                });
 
-            __syncthreads( );
+                cupla::__syncthreads(acc);
 
-            onlyMaster(
-                [&](
-                    uint32_t const,
-                    uint32_t const
-                )
-                {
-                    destBox.setAsLastFrame(
-                        acc,
-                        destFrame,
-                        superCellIdx
-                    );
-
-                    srcFrame = srcBox.getNextFrame( srcFrame );
-                    if( srcFrame.isValid( ) )
+                onlyMaster([&](uint32_t const, uint32_t const) {
+                    destBox.setAsLastFrame(acc, destFrame, superCellIdx);
+
+                    srcFrame = srcBox.getNextFrame(srcFrame);
+                    if(srcFrame.isValid())
                     {
-                        destFrame = destBox.getEmptyFrame( );
+                        destFrame = destBox.getEmptyFrame(acc);
                     }
-                }
-            );
-            __syncthreads( );
+                });
+                cupla::__syncthreads(acc);
+            }
         }
-    }
-};
+    };
 
-/** move over all particles
- *
- * Move frame-wise over a species and call a functor for each particle.
- * This kernel is optimized for the particle push step and handles the
- * special flag `mustShift` of the supercell to optimize the kernel shift particles
- * in pmacc.
- *
- * @tparam T_numWorkers number of workers
- * @tparam T_DataDomain pmacc::SuperCellDescription, compile time data domain
- *                      description with a CORE and GUARD
- */
-template<
-    uint32_t T_numWorkers,
-    typename T_DataDomain
->
-struct KernelMoveAndMarkParticles
-{
-    /** update all particles
+    /** move over all particles
      *
-     * @tparam T_ParBox pmacc::ParticlesBox, particle box type
-     * @tparam T_EBox pmacc::DataBox, electric field box type
-     * @tparam T_BBox pmacc::DataBox, magnetic field box type
-     * @tparam T_ParticleFunctor particle functor type
-     * @tparam T_Mapping mapper functor type
-     * @tparam T_Acc alpaka accelerator type
+     * Move frame-wise over a species and call a functor for each particle.
+     * This kernel is optimized for the particle push step and handles the
+     * special flag `mustShift` of the supercell to optimize the kernel shift particles
+     * in pmacc.
      *
-     * @param alpaka accelerator
-     * @param pb particle memory
-     * @param fieldE electric field data
-     * @param fieldB magnetic field data
-     * @param particleFunctor functor to manipulate (update) a particle
-     * @param mapper functor to map a block to a supercell
+     * @tparam T_numWorkers number of workers
+     * @tparam T_DataDomain pmacc::SuperCellDescription, compile time data domain
+     *                      description with a CORE and GUARD
      */
-    template<
-        typename T_ParBox,
-        typename T_EBox,
-        typename T_BBox,
-        typename T_ParticleFunctor,
-        typename T_Mapping,
-        typename T_Acc
-    >
-    DINLINE void operator()(
-        T_Acc const & acc,
-        T_ParBox pb,
-        T_EBox fieldE,
-        T_BBox fieldB,
-        uint32_t const currentStep,
-        T_ParticleFunctor particleFunctor,
-        T_Mapping mapper
-    ) const
+    template<uint32_t T_numWorkers, typename T_DataDomain>
+    struct KernelMoveAndMarkParticles
     {
-        using namespace mappings::threads;
+        /** update all particles
+         *
+         * @tparam T_ParBox pmacc::ParticlesBox, particle box type
+         * @tparam T_EBox pmacc::DataBox, electric field box type
+         * @tparam T_BBox pmacc::DataBox, magnetic field box type
+         * @tparam T_ParticleFunctor particle functor type
+         * @tparam T_Mapping mapper functor type
+         * @tparam T_Acc alpaka accelerator type
+         *
+         * @param acc alpaka accelerator
+         * @param pb particle memory
+         * @param fieldE electric field data
+         * @param fieldB magnetic field data
+         * @param particleFunctor functor to manipulate (update) a particle
+         * @param mapper functor to map a block to a supercell
+         */
+        template<
+            typename T_ParBox,
+            typename T_EBox,
+            typename T_BBox,
+            typename T_ParticleFunctor,
+            typename T_Mapping,
+            typename T_Acc>
+        DINLINE void operator()(
+            T_Acc const& acc,
+            T_ParBox pb,
+            T_EBox fieldE,
+            T_BBox fieldB,
+            uint32_t const currentStep,
+            T_ParticleFunctor particleFunctor,
+            T_Mapping mapper) const
+        {
+            using namespace mappings::threads;
+
+            constexpr uint32_t frameSize = pmacc::math::CT::volume<SuperCellSize>::type::value;
+            constexpr uint32_t numWorkers = T_numWorkers;
+
+            uint32_t const workerIdx = cupla::threadIdx(acc).x;
+
+            using FramePtr = typename T_ParBox::FramePtr;
+
+            DataSpace<simDim> const block(mapper.getSuperCellIndex(DataSpace<simDim>(cupla::blockIdx(acc))));
+
+            // relative offset (in cells) to the supercell (including the guard)
+            DataSpace<simDim> const superCellOffset = block * SuperCellSize::toRT();
+
+            using ParticleDomCfg = IdxConfig<frameSize, numWorkers>;
+
+            PMACC_SMEM(acc, mustShift, int);
+
+            // current processed frame
+            FramePtr frame;
+            lcellId_t particlesInSuperCell;
 
-        constexpr uint32_t frameSize = pmacc::math::CT::volume< SuperCellSize >::type::value;
-        constexpr uint32_t numWorkers = T_numWorkers;
+            ForEachIdx<IdxConfig<1, numWorkers>> onlyMaster{workerIdx};
 
-        uint32_t const workerIdx = threadIdx.x;
+            onlyMaster([&](uint32_t const, uint32_t const) { mustShift = 0; });
 
-        using FramePtr = typename T_ParBox::FramePtr;
+            frame = pb.getLastFrame(block);
+            particlesInSuperCell = pb.getSuperCell(block).getSizeLastFrame();
 
-        DataSpace< simDim > const block(
-            mapper.getSuperCellIndex( DataSpace< simDim >( blockIdx ) )
-        );
+            auto cachedB = CachedBox::create<0, typename T_BBox::ValueType>(acc, T_DataDomain());
+            auto cachedE = CachedBox::create<1, typename T_EBox::ValueType>(acc, T_DataDomain());
 
-        // relative offset (in cells) to the supercell (including the guard)
-        DataSpace< simDim > const superCellOffset = block * SuperCellSize::toRT();
+            cupla::__syncthreads(acc);
 
-        using ParticleDomCfg = IdxConfig<
-            frameSize,
-            numWorkers
-        >;
+            // end kernel if we have no frames
+            if(!frame.isValid())
+                return;
 
-        PMACC_SMEM(
-            acc,
-            mustShift,
-            int
-        );
+            nvidia::functors::Assign assign;
+            ThreadCollective<T_DataDomain, numWorkers> collective{workerIdx};
 
-        // current processed frame
-        FramePtr frame;
-        lcellId_t particlesInSuperCell;
+            auto fieldBBlock = fieldB.shift(superCellOffset);
+            collective(acc, assign, cachedB, fieldBBlock);
 
-        ForEachIdx<
-            IdxConfig<
-                1,
-                numWorkers
-            >
-        > onlyMaster{ workerIdx };
+            auto fieldEBlock = fieldE.shift(superCellOffset);
+            collective(acc, assign, cachedE, fieldEBlock);
 
-        onlyMaster(
-            [&](
-                uint32_t const,
-                uint32_t const
-            )
+            cupla::__syncthreads(acc);
+
+            // move over frames and call frame solver
+            while(frame.isValid())
             {
-                mustShift = 0;
-            }
-        );
-
-        frame = pb.getLastFrame( block );
-        particlesInSuperCell = pb.getSuperCell( block ).getSizeLastFrame( );
-
-        auto cachedB = CachedBox::create<
-            0,
-            typename T_BBox::ValueType
-        >(
-            acc,
-            T_DataDomain( )
-        );
-        auto cachedE = CachedBox::create<
-            1,
-            typename T_EBox::ValueType
-        >(
-            acc,
-            T_DataDomain( )
-        );
-
-        __syncthreads();
-
-        // end kernel if we have no frames
-        if( !frame.isValid( ) )
-           return;
-
-        nvidia::functors::Assign assign;
-        ThreadCollective<
-            T_DataDomain,
-            numWorkers
-        > collective{ workerIdx };
-
-        auto fieldBBlock = fieldB.shift( superCellOffset );
-        collective(
-            acc,
-            assign,
-            cachedB,
-            fieldBBlock
-        );
-
-        auto fieldEBlock = fieldE.shift( superCellOffset );
-        collective(
-            acc,
-            assign,
-            cachedE,
-            fieldEBlock
-        );
-
-        __syncthreads();
-
-        // move over frames and call frame solver
-        while( frame.isValid( ) )
-        {
-            // loop over all particles in the frame
-            ForEachIdx< ParticleDomCfg >{ workerIdx }
-            (
-                [&](
-                    uint32_t const linearIdx,
-                    uint32_t const
-                )
-                {
-                    if( linearIdx < particlesInSuperCell )
+                // loop over all particles in the frame
+                ForEachIdx<ParticleDomCfg>{workerIdx}([&](uint32_t const linearIdx, uint32_t const) {
+                    if(linearIdx < particlesInSuperCell)
                     {
-                        particleFunctor(
-                            acc,
-                            *frame,
-                            linearIdx,
-                            cachedB,
-                            cachedE,
-                            currentStep,
-                            mustShift
-                        );
+                        particleFunctor(acc, *frame, linearIdx, cachedB, cachedE, currentStep, mustShift);
                     }
-                }
-            );
-            // independent for each worker
-            frame = pb.getPreviousFrame( frame );
-            particlesInSuperCell = frameSize;
-        }
+                });
+                // independent for each worker
+                frame = pb.getPreviousFrame(frame);
+                particlesInSuperCell = frameSize;
+            }
 
-        __syncthreads();
+            cupla::__syncthreads(acc);
 
-        onlyMaster(
-            [&](
-                uint32_t const,
-                uint32_t const
-            )
-            {
+            onlyMaster([&](uint32_t const, uint32_t const) {
                 /* set in SuperCell the mustShift flag which is an optimization
                  * for shift particles (pmacc::KernelShiftParticles)
                  */
-                if( mustShift == 1 )
+                if(mustShift == 1)
                 {
-                    pb.getSuperCell(
-                        mapper.getSuperCellIndex( DataSpace< simDim >( blockIdx ) )
-                    ).setMustShift( true );
+                    pb.getSuperCell(mapper.getSuperCellIndex(DataSpace<simDim>(cupla::blockIdx(acc))))
+                        .setMustShift(true);
                 }
-            }
-        );
-   }
-};
-
-template<class PushAlgo, class TVec, class T_Field2ParticleInterpolation>
-struct PushParticlePerFrame
-{
+            });
+        }
+    };
 
-    template<class FrameType, class BoxB, class BoxE, typename T_Acc >
-    DINLINE void operator()(
-        T_Acc const & acc,
-        FrameType& frame,
-        int localIdx,
-        BoxB& bBox,
-        BoxE& eBox,
-        uint32_t const currentStep,
-        int& mustShift
-    )
+    template<class PushAlgo, class TVec, class T_Field2ParticleInterpolation>
+    struct PushParticlePerFrame
     {
+        template<class FrameType, class BoxB, class BoxE, typename T_Acc>
+        DINLINE void operator()(
+            T_Acc const& acc,
+            FrameType& frame,
+            int localIdx,
+            BoxB& bBox,
+            BoxE& eBox,
+            uint32_t const currentStep,
+            int& mustShift)
+        {
+            using Block = TVec;
+            using Field2ParticleInterpolation = T_Field2ParticleInterpolation;
 
-        using Block = TVec;
-        using Field2ParticleInterpolation = T_Field2ParticleInterpolation;
+            using BType = typename BoxB::ValueType;
+            using EType = typename BoxE::ValueType;
 
-        using BType = typename BoxB::ValueType;
-        using EType = typename BoxE::ValueType;
+            auto particle = frame[localIdx];
 
-        auto particle = frame[localIdx];
+            floatD_X pos = particle[position_];
+            const int particleCellIdx = particle[localCellIdx_];
 
-        floatD_X pos = particle[position_];
-        const int particleCellIdx = particle[localCellIdx_];
+            DataSpace<TVec::dim> localCell(DataSpaceOperations<TVec::dim>::template map<TVec>(particleCellIdx));
 
-        DataSpace<TVec::dim> localCell(DataSpaceOperations<TVec::dim>::template map<TVec > (particleCellIdx));
+            const traits::FieldPosition<fields::CellType, FieldE> fieldPosE;
+            const traits::FieldPosition<fields::CellType, FieldB> fieldPosB;
 
-        const traits::FieldPosition<fields::CellType, FieldE> fieldPosE;
-        const traits::FieldPosition<fields::CellType, FieldB> fieldPosB;
+            auto functorEfield = CreateInterpolationForPusher<Field2ParticleInterpolation>()(
+                eBox.shift(localCell).toCursor(),
+                fieldPosE());
+            auto functorBfield = CreateInterpolationForPusher<Field2ParticleInterpolation>()(
+                bBox.shift(localCell).toCursor(),
+                fieldPosB());
 
-        auto functorEfield = CreateInterpolationForPusher<Field2ParticleInterpolation>()( eBox.shift(localCell).toCursor(), fieldPosE() );
-        auto functorBfield = CreateInterpolationForPusher<Field2ParticleInterpolation>()( bBox.shift(localCell).toCursor(), fieldPosB() );
+            /** @todo this functor should only manipulate the momentum and all changes
+             *        in position and cell below need to go into a separate kernel
+             */
+            PushAlgo push;
+            push(functorBfield, functorEfield, particle, pos, currentStep);
 
-        /** @todo this functor should only manipulate the momentum and all changes
-         *        in position and cell below need to go into a separate kernel
-         */
-        PushAlgo push;
-        push(
-             functorBfield,
-             functorEfield,
-             particle,
-             pos,
-             currentStep
-        );
-
-        DataSpace<simDim> dir;
-        for (uint32_t i = 0; i < simDim; ++i)
-        {
-            /* ATTENTION we must handle float rounding errors
-             * pos in range [-1;2)
-             *
-             * If pos is negative and very near to 0 (e.g. pos < -1e-8)
-             * and we move pos with pos+=1.0 back to normal in cell postion
-             * we get a rounding error and pos is assigned to 1. This breaks
-             * our in cell definition range [0,1)
-             *
-             * if pos negativ moveDir is set to -1
-             * if pos positive and >1 moveDir is set to +1
-             * 0 (zero) if particle stays in cell
+            DataSpace<simDim> dir;
+            for(uint32_t i = 0; i < simDim; ++i)
+            {
+                /* ATTENTION we must handle float rounding errors
+                 * pos in range [-1;2)
+                 *
+                 * If pos is negative and very near to 0 (e.g. pos = -1e-8)
+                 * and we move pos with pos+=1.0 back to the normal in-cell position
+                 * we get a rounding error and pos is assigned to 1. This breaks
+                 * our in-cell definition range [0,1)
+                 *
+                 * if pos is negative moveDir is set to -1
+                 * if pos is positive and >=1 moveDir is set to +1
+                 * 0 (zero) if particle stays in cell
+                 */
+                float_X moveDir = math::floor(pos[i]);
+                /* shift pos back to cell range [0;1)*/
+                pos[i] -= moveDir;
+                /* check for rounding errors and correct them
+                 * if position now is 1 we have a rounding error
+                 *
+                 * We correct moveDir so that the particle does not leave the cell
+                 */
+                const float_X valueCorrector = math::floor(pos[i]);
+                /* One has also to correct moveDir for the following reason:
+                 * Imagine a new particle moves to -1e-20, leaving the cell to the left,
+                 * setting moveDir to -1.
+                 * The new in-cell position will be -1e-20 + 1.0,
+                 * which can flip to 1.0 (wrong value).
+                 * We move the particle back to the old cell at position 0.0 and
+                 * moveDir has to be corrected back, too (add +1 again).*/
+                moveDir += valueCorrector;
+                /* If we have corrected moveDir we must set pos to 0 */
+                pos[i] -= valueCorrector;
+                dir[i] = precisionCast<int>(moveDir);
+            }
+            particle[position_] = pos;
+
+            /* new local cell position after particle move
+             * can be out of supercell
              */
-            float_X moveDir = math::floor(pos[i]);
-            /* shift pos back to cell range [0;1)*/
-            pos[i] -= moveDir;
-            /* check for rounding errors and correct them
-             * if position now is 1 we have a rounding error
+            localCell += dir;
+
+            /* ATTENTION ATTENTION we cast to unsigned, this means that a negative
+             * cell index is now a very very big number, then we compare with supercell!
              *
-             * We correct moveDir that we not have left the cell
+             * if particle is outside of the supercell the **unsigned** representation
+             * of localCell is always >= size of the supercell
              */
-            const float_X valueCorrector = math::floor(pos[i]);
-            /* One has also to correct moveDir for the following reason:
-             * Imagine a new particle moves to -1e-20, leaving the cell to the left,
-             * setting moveDir to -1.
-             * The new in-cell position will be -1e-20 + 1.0,
-             * which can flip to 1.0 (wrong value).
-             * We move the particle back to the old cell at position 0.0 and
-             * moveDir has to be corrected back, too (add +1 again).*/
-            moveDir += valueCorrector;
-            /* If we have corrected moveDir we must set pos to 0 */
-            pos[i] -= valueCorrector;
-            dir[i] = precisionCast<int>(moveDir);
-        }
-        particle[position_] = pos;
-
-        /* new local cell position after particle move
-         * can be out of supercell
-         */
-        localCell += dir;
-
-        /* ATTENTION ATTENTION we cast to unsigned, this means that a negative
-         * direction is know a very very big number, than we compare with supercell!
-         *
-         * if particle is inside of the supercell the **unsigned** representation
-         * of dir is always >= size of the supercell
-         */
-        for (uint32_t i = 0; i < simDim; ++i)
-            dir[i] *= precisionCast<uint32_t>(localCell[i]) >= precisionCast<uint32_t>(TVec::toRT()[i]) ? 1 : 0;
+            for(uint32_t i = 0; i < simDim; ++i)
+                dir[i] *= precisionCast<uint32_t>(localCell[i]) >= precisionCast<uint32_t>(TVec::toRT()[i]) ? 1 : 0;
 
-        /* if partice is outside of the supercell we use mod to
-         * set particle at cell supercellSize to 1
-         * and partticle at cell -1 to supercellSize-1
-         * % (mod) can't use with negativ numbers, we add one supercellSize to hide this
-         *
-        localCell.x() = (localCell.x() + TVec::x) % TVec::x;
-        localCell.y() = (localCell.y() + TVec::y) % TVec::y;
-        localCell.z() = (localCell.z() + TVec::z) % TVec::z;
-         */
+            /* if particle is outside of the supercell we use mod to
+             * set particle at cell supercellSize to 0
+             * and particle at cell -1 to supercellSize-1
+             * % (mod) can't be used with negative numbers, we add one supercellSize to hide this
+             *
+            localCell.x() = (localCell.x() + TVec::x) % TVec::x;
+            localCell.y() = (localCell.y() + TVec::y) % TVec::y;
+            localCell.z() = (localCell.z() + TVec::z) % TVec::z;
+             */
 
-        /*dir is only +1 or -1 if particle is outside of supercell
-         * y=cell-(dir*superCell_size)
-         * y=0 if dir==-1
-         * y=superCell_size if dir==+1
-         * for dir 0 localCel is not changed
-         */
-        localCell -= (dir * TVec::toRT());
-        /*calculate one dimensional cell index*/
-        particle[localCellIdx_] = DataSpaceOperations<TVec::dim>::template map<TVec > (localCell);
-
-        /* [ dir + int(dir < 0)*3 ] == [ (dir + 3) %3 = y ]
-         * but without modulo
-         * y=0 for dir = 0
-         * y=1 for dir = 1
-         * y=2 for dir = -1
-         */
-        int direction = 1;
-        uint32_t exchangeType = 1; // see inlcude/pmacc/types.h for RIGHT, BOTTOM and BACK
-        for (uint32_t i = 0; i < simDim; ++i)
-        {
-            direction += (dir[i] == -1 ? 2 : dir[i]) * exchangeType;
-            exchangeType *= 3; // =3^i (1=RIGHT, 3=BOTTOM; 9=BACK)
-        }
+            /*dir is only +1 or -1 if particle is outside of supercell
+             * y=cell-(dir*superCell_size)
+             * y=0 if dir==-1
+             * y=superCell_size if dir==+1
+             * for dir 0 localCell is not changed
+             */
+            localCell -= (dir * TVec::toRT());
+            /*calculate one dimensional cell index*/
+            particle[localCellIdx_] = DataSpaceOperations<TVec::dim>::template map<TVec>(localCell);
+
+            /* [ dir + int(dir < 0)*3 ] == [ (dir + 3) %3 = y ]
+             * but without modulo
+             * y=0 for dir = 0
+             * y=1 for dir = 1
+             * y=2 for dir = -1
+             */
+            int direction = 1;
+            uint32_t exchangeType = 1; // see inlcude/pmacc/types.h for RIGHT, BOTTOM and BACK
+            for(uint32_t i = 0; i < simDim; ++i)
+            {
+                direction += (dir[i] == -1 ? 2 : dir[i]) * exchangeType;
+                exchangeType *= 3; // =3^i (1=RIGHT, 3=BOTTOM; 9=BACK)
+            }
 
-        particle[multiMask_] = direction;
+            particle[multiMask_] = direction;
 
-        /* set our tuning flag if minimal one particle leave the supercell
-         * This flag is needed for later fast shift of particles only if needed
-         */
-        if (direction >= 2)
-        {
-            /* if we did not use atomic we would get a WAW error */
-            nvidia::atomicAllExch(acc, &mustShift, 1, ::alpaka::hierarchy::Threads{});
+            /* set our tuning flag if minimal one particle leave the supercell
+             * This flag is needed for later fast shift of particles only if needed
+             */
+            if(direction >= 2)
+            {
+                /* if we did not use atomic we would get a WAW error */
+                nvidia::atomicAllExch(acc, &mustShift, 1, ::alpaka::hierarchy::Threads{});
+            }
         }
-    }
-};
-
+    };
 
 
-} //namespace
+} // namespace picongpu
diff --git a/include/picongpu/particles/Particles.tpp b/include/picongpu/particles/Particles.tpp
index f900f0d25a..9bceedcb06 100644
--- a/include/picongpu/particles/Particles.tpp
+++ b/include/picongpu/particles/Particles.tpp
@@ -1,5 +1,5 @@
-/* Copyright 2013-2020 Axel Huebl, Heiko Burau, Rene Widera, Richard Pausch, Felix Schmitt,
- *                     Alexander Grund
+/* Copyright 2013-2021 Axel Huebl, Heiko Burau, Rene Widera, Richard Pausch, Felix Schmitt,
+ *                     Alexander Grund, Sergei Bastrakov
  *
  * This file is part of PIConGPU.
  *
@@ -24,6 +24,7 @@
 #include "picongpu/particles/Particles.hpp"
 
 #include "picongpu/particles/Particles.kernel"
+#include "picongpu/particles/pusher/Traits.hpp"
 #include "picongpu/particles/traits/GetExchangeMemCfg.hpp"
 
 #include <pmacc/dataManagement/DataConnector.hpp>
@@ -47,343 +48,370 @@
 #include <iostream>
 #include <limits>
 #include <memory>
-
+#include <utility>
 
 namespace picongpu
 {
-
-
-using namespace pmacc;
-
-template<
-    typename T_Name,
-    typename T_Flags,
-    typename T_Attributes
->
-Particles<
-    T_Name,
-    T_Flags,
-    T_Attributes
->::Particles(
-    const std::shared_ptr<DeviceHeap>& heap,
-    picongpu::MappingDesc cellDescription,
-    SimulationDataId datasetID
-) :
-    ParticlesBase<
-        SpeciesParticleDescription,
-        picongpu::MappingDesc,
-        DeviceHeap
-    >(
-        heap,
-        cellDescription
-    ),
-    m_datasetID( datasetID )
-{
-    using ExchangeMemCfg = GetExchangeMemCfg_t< Particles >;
-
-    size_t sizeOfExchanges = 0u;
-
-    const uint32_t commTag = pmacc::traits::GetUniqueTypeId<FrameType, uint32_t>::uid() + SPECIES_FIRSTTAG;
-    log<picLog::MEMORY > ( "communication tag for species %1%: %2%" ) % FrameType::getName( ) % commTag;
-
-    this->particlesBuffer->addExchange( Mask( LEFT ) + Mask( RIGHT ),
-                                        ExchangeMemCfg::BYTES_EXCHANGE_X,
-                                        commTag);
-    sizeOfExchanges += ExchangeMemCfg::BYTES_EXCHANGE_X * 2u;
-
-    this->particlesBuffer->addExchange( Mask( TOP ) + Mask( BOTTOM ),
-                                        ExchangeMemCfg::BYTES_EXCHANGE_Y,
-                                        commTag);
-    sizeOfExchanges += ExchangeMemCfg::BYTES_EXCHANGE_Y * 2u;
-
-    //edges of the simulation area
-    this->particlesBuffer->addExchange( Mask( RIGHT + TOP ) + Mask( LEFT + TOP ) +
-                                        Mask( LEFT + BOTTOM ) + Mask( RIGHT + BOTTOM ), ExchangeMemCfg::BYTES_EDGES,
-                                        commTag);
-    sizeOfExchanges += ExchangeMemCfg::BYTES_EDGES * 4u;
-
-#if(SIMDIM==DIM3)
-    this->particlesBuffer->addExchange( Mask( FRONT ) + Mask( BACK ), ExchangeMemCfg::BYTES_EXCHANGE_Z,
-                                        commTag);
-    sizeOfExchanges += ExchangeMemCfg::BYTES_EXCHANGE_Z * 2u;
-
-    //edges of the simulation area
-    this->particlesBuffer->addExchange( Mask( FRONT + TOP ) + Mask( BACK + TOP ) +
-                                        Mask( FRONT + BOTTOM ) + Mask( BACK + BOTTOM ),
-                                        ExchangeMemCfg::BYTES_EDGES,
-                                        commTag);
-    sizeOfExchanges += ExchangeMemCfg::BYTES_EDGES * 4u;
-
-    this->particlesBuffer->addExchange( Mask( FRONT + RIGHT ) + Mask( BACK + RIGHT ) +
-                                        Mask( FRONT + LEFT ) + Mask( BACK + LEFT ),
-                                        ExchangeMemCfg::BYTES_EDGES,
-                                        commTag);
-    sizeOfExchanges += ExchangeMemCfg::BYTES_EDGES * 4u;
-
-    //corner of the simulation area
-    this->particlesBuffer->addExchange( Mask( TOP + FRONT + RIGHT ) + Mask( TOP + BACK + RIGHT ) +
-                                        Mask( BOTTOM + FRONT + RIGHT ) + Mask( BOTTOM + BACK + RIGHT ),
-                                        ExchangeMemCfg::BYTES_CORNER,
-                                        commTag);
-    sizeOfExchanges += ExchangeMemCfg::BYTES_CORNER * 4u;
-
-    this->particlesBuffer->addExchange( Mask( TOP + FRONT + LEFT ) + Mask( TOP + BACK + LEFT ) +
-                                        Mask( BOTTOM + FRONT + LEFT ) + Mask( BOTTOM + BACK + LEFT ),
-                                        ExchangeMemCfg::BYTES_CORNER,
-                                        commTag);
-    sizeOfExchanges += ExchangeMemCfg::BYTES_CORNER * 4u;
-#endif
-
-    /* The buffer size must be multiplied by two because PMacc generates a send
-     * and receive buffer for each direction.
+    using namespace pmacc;
+
+    namespace detail
+    {
+        /** Helper to check if a member exists
+         *
+         * Derived from C++17 std::void_t.
+         * This implementation will be removed once alpaka 0.6.0 provides Void (it is not included in the 0.6.0rc3
+         * we are currently using).
+         */
+        template<class...>
+        using Void = void;
+
+        /** Calculate the scaling factor for each direction.
+         *
+         * The scaling factor is derived from the reference size of the local domain and a scaling factor provided by
+         * the user.
+         *
+         * @tparam T_ExchangeMemCfg exchange configuration for a species
+         * @tparam T_Sfinae type for conditional specialization (no input parameter)
+         * @{
+         */
+        template<typename T_ExchangeMemCfg, typename T_Sfinae = void>
+        struct DirScalingFactor
+        {
+            //! @return factor to scale the amount of memory for each direction, 1.0 (no scaling) in this generic case
+            static floatD_64 get()
+            {
+                return floatD_64::create(1.0);
+            }
+        };
+
+        /** Specialization for species with exchange memory information which provides
+         * DIR_SCALING_FACTOR and REF_LOCAL_DOM_SIZE
+         */
+        template<typename T_ExchangeMemCfg>
+        struct DirScalingFactor<
+            T_ExchangeMemCfg,
+            Void<
+                decltype(std::declval<T_ExchangeMemCfg>().DIR_SCALING_FACTOR),
+                typename T_ExchangeMemCfg::REF_LOCAL_DOM_SIZE>>
+        {
+            static floatD_64 get()
+            {
+                auto baseLocalCells = T_ExchangeMemCfg::REF_LOCAL_DOM_SIZE::toRT();
+                auto userScalingFactor = T_ExchangeMemCfg{}.DIR_SCALING_FACTOR;
+
+                auto localDomSize = Environment<simDim>::get().SubGrid().getLocalDomain().size;
+                // fall back to the local domain size for each direction without a positive reference size
+                for(uint32_t d = 0; d < simDim; ++d)
+                {
+                    if(baseLocalCells[d] <= 0)
+                        baseLocalCells[d] = localDomSize[d];
+                }
+
+                auto scale = floatD_64::create(1.0);
+                for(uint32_t d = 0; d < simDim; ++d)
+                {
+                    auto dir1 = (d + 1) % simDim;
+                    auto dir2 = (d + 2) % simDim;
+                    // precision: numbers are small, therefore the usage of double is fine
+                    auto scaleDirection = std::ceil(
+                        float_64(localDomSize[dir1]) / float_64(baseLocalCells[dir1]) * float_64(localDomSize[dir2])
+                        / float_64(baseLocalCells[dir2]));
+                    float_64 scalingFactor = scaleDirection * userScalingFactor[d];
+                    // do not scale down, the resulting factor is at least 1.0
+                    scale[d] = std::max(scalingFactor, 1.0);
+                }
+
+                return scale;
+            }
+        };
+
+        //! @}
+    } // namespace detail
+    /** Size in bytes of the exchange memory for one exchange direction.
+     *
+     * @param ex exchange direction, `0` means no communication direction
+     * @return configured bytes of the plane/edge/corner, scaled by the factors of all involved directions
+     */
+    template<typename T_Name, typename T_Flags, typename T_Attributes>
+    size_t Particles<T_Name, T_Flags, T_Attributes>::exchangeMemorySize(uint32_t ex) const
+    {
+        // no communication direction
+        if(ex == 0u)
+            return 0u;
+
+        using ExchangeMemCfg = GetExchangeMemCfg_t<Particles>;
+        // scaling factor for each direction
+        auto dirScalingFactors = picongpu::detail::DirScalingFactor<ExchangeMemCfg>::get();
+
+        // type of the exchange direction: 1 = plane, 2 = edge, 3 = corner
+        uint32_t relDirType = 0u;
+
+        // scaling factor for the current exchange
+        float_64 exchangeScalingFactor = 1.0;
+
+        auto relDir = Mask::getRelativeDirections<simDim>(ex);
+        for(uint32_t d = 0; d < simDim; ++d)
+        {
+            // calculate the exchange type
+            relDirType += std::abs(relDir[d]);
+            exchangeScalingFactor *= relDir[d] != 0 ? dirScalingFactors[d] : 1.0;
+        }
+        size_t exchangeBytes = 0;
+
+        // it is a plane
+        if(relDirType == 1u)
+        {
+            // bytes for an exchange in x, y, z direction
+            pmacc::math::Vector<uint32_t, 3> requiredMem(
+                ExchangeMemCfg::BYTES_EXCHANGE_X,
+                ExchangeMemCfg::BYTES_EXCHANGE_Y,
+                ExchangeMemCfg::BYTES_EXCHANGE_Z);
+
+            for(uint32_t d = 0; d < simDim; ++d)
+                if(std::abs(relDir[d]) == 1)
+                {
+                    exchangeBytes = requiredMem[d];
+                    break;
+                }
+        }
+        // it is an edge
+        else if(relDirType == 2u)
+            exchangeBytes = ExchangeMemCfg::BYTES_EDGES;
+        // it is a corner
+        else
+            exchangeBytes = ExchangeMemCfg::BYTES_CORNER;
+
+        // using double to calculate the memory size is fine, double can precisely store integer values up to 2^53;
+        // the result is converted explicitly to avoid an implicit floating-point to integer conversion
+        return static_cast<size_t>(exchangeBytes * exchangeScalingFactor);
+    }
+
+    template<typename T_Name, typename T_Flags, typename T_Attributes>
+    Particles<T_Name, T_Flags, T_Attributes>::Particles(
+        const std::shared_ptr<DeviceHeap>& heap,
+        picongpu::MappingDesc cellDescription,
+        SimulationDataId datasetID)
+        : ParticlesBase<SpeciesParticleDescription, picongpu::MappingDesc, DeviceHeap>(heap, cellDescription)
+        , m_datasetID(datasetID)
+    {
+        size_t sizeOfExchanges = 0u; // accumulated bytes of all exchange buffers, used for logging only
+
+        const uint32_t commTag = pmacc::traits::GetUniqueTypeId<FrameType, uint32_t>::uid() + SPECIES_FIRSTTAG;
+        log<picLog::MEMORY>("communication tag for species %1%: %2%") % FrameType::getName() % commTag;
+
+        auto const numExchanges = NumberOfExchanges<simDim>::value; // exchange 0 is skipped by the loop below
+        for(uint32_t exchange = 1u; exchange < numExchanges; ++exchange)
+        {
+            auto mask = Mask(exchange);
+            auto mem = exchangeMemorySize(exchange);
+
+            this->particlesBuffer->addExchange(mask, mem, commTag);
+            /* The buffer size must be multiplied by two because PMacc generates a send
+             * and receive buffer for each direction.
+             */
+            sizeOfExchanges += mem * 2u;
+        }
+
+        constexpr size_t byteToMiB = 1024u * 1024u;
+
+        log<picLog::MEMORY>("size for all exchange of species %1% = %2% MiB") % FrameType::getName()
+            % (static_cast<float_64>(sizeOfExchanges) / static_cast<float_64>(byteToMiB));
+    }
+
+    template<typename T_Name, typename T_Flags, typename T_Attributes>
+    void Particles<T_Name, T_Flags, T_Attributes>::createParticleBuffer()
+    {
+        this->particlesBuffer->createParticleBuffer(); // pure forwarding to the particle buffer
+    }
+
+    template<typename T_Name, typename T_Flags, typename T_Attributes>
+    SimulationDataId Particles<T_Name, T_Flags, T_Attributes>::getUniqueId()
+    {
+        return m_datasetID; // the id handed to the constructor as datasetID
+    }
+
+    template<typename T_Name, typename T_Flags, typename T_Attributes>
+    void Particles<T_Name, T_Flags, T_Attributes>::synchronize()
+    {
+        this->particlesBuffer->deviceToHost(); // pure forwarding to the buffer's deviceToHost()
+    }
+
+    template<typename T_Name, typename T_Flags, typename T_Attributes>
+    void Particles<T_Name, T_Flags, T_Attributes>::syncToDevice()
+    { // empty body: this function performs no work
+    }
+
+    /** Launcher of the particle push
+     *
+     * @tparam T_Pusher pusher type
+     * @tparam T_isComposite if the pusher is composite
      */
-    sizeOfExchanges *= 2u;
-
-    constexpr size_t byteToMiB = 1024u * 1024u;
-
-    log< picLog::MEMORY >( "size for all exchange of species %1% = %2% MiB" ) %
-        FrameType::getName( ) %
-        ( static_cast< float_64 >( sizeOfExchanges ) / static_cast< float_64 >( byteToMiB ) );
-}
-
-template<
-    typename T_Name,
-    typename T_Flags,
-    typename T_Attributes
->
-void
-Particles<
-    T_Name,
-    T_Flags,
-    T_Attributes
->::createParticleBuffer( )
-{
-    this->particlesBuffer->createParticleBuffer( );
-}
-
-template<
-    typename T_Name,
-    typename T_Flags,
-    typename T_Attributes
->
-SimulationDataId
-Particles<
-    T_Name,
-    T_Flags,
-    T_Attributes
->::getUniqueId( )
-{
-    return m_datasetID;
-}
-
-template<
-    typename T_Name,
-    typename T_Flags,
-    typename T_Attributes
->
-void
-Particles<
-    T_Name,
-    T_Flags,
-    T_Attributes
->::synchronize( )
-{
-    this->particlesBuffer->deviceToHost();
-}
-
-template<
-    typename T_Name,
-    typename T_Flags,
-    typename T_Attributes
->
-void
-Particles<
-    T_Name,
-    T_Flags,
-    T_Attributes
->::syncToDevice( )
-{
+    template<typename T_Pusher, bool T_isComposite = particles::pusher::IsComposite<T_Pusher>::value>
+    struct PushLauncher;
 
-}
-
-template<
-    typename T_Name,
-    typename T_Flags,
-    typename T_Attributes
->
-void
-Particles<
-    T_Name,
-    T_Flags,
-    T_Attributes
->::update( uint32_t const currentStep )
-{
-    using PusherAlias = typename GetFlagType<FrameType,particlePusher<> >::type;
-    using ParticlePush = typename pmacc::traits::Resolve<PusherAlias>::type;
-
-    using InterpolationScheme = typename pmacc::traits::Resolve<
-        typename GetFlagType<
-            FrameType,
-            interpolation< >
-        >::type
-    >::type;
-
-    using FrameSolver = PushParticlePerFrame<
-        ParticlePush,
-        MappingDesc::SuperCellSize,
-        InterpolationScheme
-    >;
-
-    DataConnector & dc = Environment< >::get( ).DataConnector( );
-    auto fieldE = dc.get< FieldE >(
-        FieldE::getName(),
-        true
-    );
-    auto fieldB = dc.get< FieldB >(
-        FieldB::getName(),
-        true
-    );
-
-    // adjust interpolation area in particle pusher to allow sub-sampling pushes
-    using LowerMargin = typename GetLowerMarginPusher< Particles >::type;
-    using UpperMargin = typename GetUpperMarginPusher< Particles >::type;
-
-    using BlockArea = SuperCellDescription<
-        typename MappingDesc::SuperCellSize,
-        LowerMargin,
-        UpperMargin
-    >;
-
-    AreaMapping<
-        CORE + BORDER,
-        picongpu::MappingDesc
-    > mapper( this->cellDescription );
-
-    constexpr uint32_t numWorkers = pmacc::traits::GetNumWorkers<
-        pmacc::math::CT::volume< SuperCellSize >::type::value
-    >::value;
-
-    PMACC_KERNEL( KernelMoveAndMarkParticles< numWorkers, BlockArea >{ } )(
-        mapper.getGridDim(),
-        numWorkers
-    )(
-        this->getDeviceParticlesBox( ),
-        fieldE->getDeviceDataBox( ),
-        fieldB->getDeviceDataBox( ),
-        currentStep,
-        FrameSolver( ),
-        mapper
-    );
-
-    dc.releaseData( FieldE::getName() );
-    dc.releaseData( FieldB::getName() );
-
-    ParticlesBaseType::template shiftParticles < CORE + BORDER > ( );
-}
-
-template<
-    typename T_Name,
-    typename T_Flags,
-    typename T_Attributes
->
-template<
-    typename T_DensityFunctor,
-    typename T_PositionFunctor
->
-void
-Particles<
-    T_Name,
-    T_Flags,
-    T_Attributes
->::initDensityProfile(
-    T_DensityFunctor& densityFunctor,
-    T_PositionFunctor& positionFunctor,
-    const uint32_t currentStep
-)
-{
-    log<picLog::SIMULATION_STATE >( "initialize density profile for species %1%" ) % FrameType::getName( );
-
-    uint32_t const numSlides = MovingWindow::getInstance( ).getSlideCounter( currentStep );
-    SubGrid< simDim > const & subGrid = Environment< simDim >::get( ).SubGrid( );
-    DataSpace< simDim > localCells = subGrid.getLocalDomain( ).size;
-    DataSpace< simDim > totalGpuCellOffset = subGrid.getLocalDomain( ).offset;
-    totalGpuCellOffset.y( ) += numSlides * localCells.y( );
-
-    constexpr uint32_t numWorkers = pmacc::traits::GetNumWorkers<
-        pmacc::math::CT::volume< SuperCellSize >::type::value
-    >::value;
-
-    AreaMapping<
-        CORE + BORDER,
-        picongpu::MappingDesc
-    > mapper( this->cellDescription );
-    PMACC_KERNEL(
-        KernelFillGridWithParticles<
-            numWorkers,
-            Particles
-        >{}
-    )
-    (
-        mapper.getGridDim( ),
-        numWorkers
-    )
-    (
-        densityFunctor,
-        positionFunctor,
-        totalGpuCellOffset,
-        this->particlesBuffer->getDeviceParticleBox( ),
-        mapper
-    );
-
-    this->fillAllGaps( );
-}
-
-template<
-    typename T_Name,
-    typename T_Flags,
-    typename T_Attributes
->
-template<
-    typename T_SrcName,
-    typename T_SrcAttributes,
-    typename T_SrcFlags,
-    typename T_ManipulateFunctor,
-    typename T_SrcFilterFunctor
->
-void
-Particles<
-    T_Name,
-    T_Flags,
-    T_Attributes
->::deviceDeriveFrom(
-    Particles<
-        T_SrcName,
-        T_SrcAttributes,
-        T_SrcFlags
-    >& src,
-    T_ManipulateFunctor& manipulatorFunctor,
-    T_SrcFilterFunctor& srcFilterFunctor
-)
-{
-    log< picLog::SIMULATION_STATE > ( "clone species %1%" ) % FrameType::getName( );
-
-    AreaMapping<CORE + BORDER, picongpu::MappingDesc> mapper(this->cellDescription);
-
-    constexpr uint32_t numWorkers = pmacc::traits::GetNumWorkers<
-           pmacc::math::CT::volume< SuperCellSize >::type::value
-    >::value;
-
-    PMACC_KERNEL( KernelDeriveParticles< numWorkers >{ } )(
-        mapper.getGridDim(),
-        numWorkers
-    )(
-        this->getDeviceParticlesBox( ),
-        src.getDeviceParticlesBox( ),
-        manipulatorFunctor,
-        srcFilterFunctor,
-        mapper
-    );
-    this->fillAllGaps( );
-}
+    /** Launcher of the particle push for non-composite pushers
+     *
+     * @tparam T_Pusher pusher type
+     */
+    template<typename T_Pusher>
+    struct PushLauncher<T_Pusher, false>
+    {
+        /** Launch the pusher for all particles of a species
+         * @tparam T_Particles particles type
+         * @param particles species object whose push<T_Pusher>() is invoked
+         * @param currentStep current time iteration
+         */
+        template<typename T_Particles>
+        void operator()(T_Particles&& particles, uint32_t const currentStep) const
+        {
+            particles.template push<T_Pusher>(currentStep);
+        }
+    };
+
+    /** Launcher of the particle push for composite pushers
+     *
+     * @tparam T_CompositePusher composite pusher type
+     */
+    template<typename T_CompositePusher>
+    struct PushLauncher<T_CompositePusher, true>
+    {
+        /** Launch the currently active pusher for all particles of a species
+         * @tparam T_Particles particles type
+         * @param particles species object forwarded to the launcher of the active pusher
+         * @param currentStep current time iteration
+         */
+        template<typename T_Particles>
+        void operator()(T_Particles&& particles, uint32_t const currentStep) const
+        {
+            /* Only the pusher that is active in this step is launched; any index other
+             * than 1 or 2 launches nothing. Both launchers are still instantiated, which
+             * should be fine as both pushers are eventually getting used (otherwise
+             * using the composite does not make sense).
+             */
+            auto activePusherIdx = T_CompositePusher::activePusherIdx(currentStep);
+            if(activePusherIdx == 1)
+                PushLauncher<typename T_CompositePusher::FirstPusher>{}(particles, currentStep);
+            else if(activePusherIdx == 2)
+                PushLauncher<typename T_CompositePusher::SecondPusher>{}(particles, currentStep);
+        }
+    };
+
+    template<typename T_Name, typename T_Flags, typename T_Attributes>
+    void Particles<T_Name, T_Flags, T_Attributes>::update(uint32_t const currentStep)
+    {
+        using PusherAlias = typename GetFlagType<FrameType, particlePusher<>>::type; // pusher flag of the frame
+        using ParticlePush = typename pmacc::traits::Resolve<PusherAlias>::type; // resolved pusher type
+        // The pusher may be composite, so the launcher selects the pusher and calls push<>() on this species
+        PushLauncher<ParticlePush>{}(*this, currentStep);
+    }
+
+    /** Do the particle push stage using the given pusher
+     *
+     * @tparam T_Pusher non-composite pusher type
+     * @param currentStep current time iteration
+     */
+    template<typename T_Name, typename T_Flags, typename T_Attributes>
+    template<typename T_Pusher>
+    void Particles<T_Name, T_Flags, T_Attributes>::push(uint32_t const currentStep)
+    {
+        PMACC_CASSERT_MSG(
+            _internal_error_particle_push_instantiated_for_composite_pusher,
+            particles::pusher::IsComposite<T_Pusher>::type::value == false);
+
+        using InterpolationScheme =
+            typename pmacc::traits::Resolve<typename GetFlagType<FrameType, interpolation<>>::type>::type;
+
+        using FrameSolver = PushParticlePerFrame<T_Pusher, MappingDesc::SuperCellSize, InterpolationScheme>;
+
+        DataConnector& dc = Environment<>::get().DataConnector();
+        auto fieldE = dc.get<FieldE>(FieldE::getName(), true);
+        auto fieldB = dc.get<FieldB>(FieldB::getName(), true);
+
+        /* Adjust interpolation area in particle pusher to allow sub-stepping pushes.
+         * Here we provide an actual pusher and use its actual margins
+         */
+        using LowerMargin = typename GetLowerMarginForPusher<Particles, T_Pusher>::type;
+        using UpperMargin = typename GetUpperMarginForPusher<Particles, T_Pusher>::type;
+
+        using BlockArea = SuperCellDescription<typename MappingDesc::SuperCellSize, LowerMargin, UpperMargin>;
+
+        AreaMapping<CORE + BORDER, picongpu::MappingDesc> mapper(this->cellDescription);
+
+        constexpr uint32_t numWorkers
+            = pmacc::traits::GetNumWorkers<pmacc::math::CT::volume<SuperCellSize>::type::value>::value;
+
+        PMACC_KERNEL(KernelMoveAndMarkParticles<numWorkers, BlockArea>{})
+        (mapper.getGridDim(), numWorkers)(
+            this->getDeviceParticlesBox(),
+            fieldE->getDeviceDataBox(),
+            fieldB->getDeviceDataBox(),
+            currentStep,
+            FrameSolver(),
+            mapper);
+
+        dc.releaseData(FieldE::getName()); // pairs with dc.get<FieldE>() above
+        dc.releaseData(FieldB::getName()); // pairs with dc.get<FieldB>() above
+
+        ParticlesBaseType::template shiftParticles<CORE + BORDER>();
+    }
+
+    template<typename T_Name, typename T_Flags, typename T_Attributes>
+    template<typename T_DensityFunctor, typename T_PositionFunctor>
+    void Particles<T_Name, T_Flags, T_Attributes>::initDensityProfile(
+        T_DensityFunctor& densityFunctor,
+        T_PositionFunctor& positionFunctor,
+        const uint32_t currentStep)
+    {
+        log<picLog::SIMULATION_STATE>("initialize density profile for species %1%") % FrameType::getName();
+
+        uint32_t const numSlides = MovingWindow::getInstance().getSlideCounter(currentStep);
+        SubGrid<simDim> const& subGrid = Environment<simDim>::get().SubGrid();
+        DataSpace<simDim> localCells = subGrid.getLocalDomain().size;
+        DataSpace<simDim> totalGpuCellOffset = subGrid.getLocalDomain().offset;
+        totalGpuCellOffset.y() += numSlides * localCells.y(); // add one local y extent per window slide
+
+        constexpr uint32_t numWorkers
+            = pmacc::traits::GetNumWorkers<pmacc::math::CT::volume<SuperCellSize>::type::value>::value;
+
+        AreaMapping<CORE + BORDER, picongpu::MappingDesc> mapper(this->cellDescription);
+        PMACC_KERNEL(KernelFillGridWithParticles<numWorkers, Particles>{})
+        (mapper.getGridDim(), numWorkers)(
+            densityFunctor,
+            positionFunctor,
+            totalGpuCellOffset,
+            this->particlesBuffer->getDeviceParticleBox(),
+            mapper);
+
+        this->fillAllGaps(); // NOTE(review): presumably compacts the frames filled by the kernel — confirm
+    }
+
+    template<typename T_Name, typename T_Flags, typename T_Attributes>
+    template<
+        typename T_SrcName,
+        typename T_SrcAttributes,
+        typename T_SrcFlags,
+        typename T_ManipulateFunctor,
+        typename T_SrcFilterFunctor>
+    void Particles<T_Name, T_Flags, T_Attributes>::deviceDeriveFrom(
+        Particles<T_SrcName, T_SrcAttributes, T_SrcFlags>& src,
+        T_ManipulateFunctor& manipulatorFunctor,
+        T_SrcFilterFunctor& srcFilterFunctor)
+    {
+        log<picLog::SIMULATION_STATE>("clone species %1%") % FrameType::getName();
+
+        AreaMapping<CORE + BORDER, picongpu::MappingDesc> mapper(this->cellDescription);
+
+        constexpr uint32_t numWorkers
+            = pmacc::traits::GetNumWorkers<pmacc::math::CT::volume<SuperCellSize>::type::value>::value;
+
+        PMACC_KERNEL(KernelDeriveParticles<numWorkers>{})
+        (mapper.getGridDim(), numWorkers)(
+            this->getDeviceParticlesBox(), // first box argument: this species
+            src.getDeviceParticlesBox(), // second box argument: the species passed as src
+            manipulatorFunctor,
+            srcFilterFunctor,
+            mapper);
+        this->fillAllGaps(); // NOTE(review): presumably compacts the frames written by the kernel — confirm
+    }
 
 } // namespace picongpu
diff --git a/include/picongpu/particles/ParticlesFunctors.hpp b/include/picongpu/particles/ParticlesFunctors.hpp
index d7565f2e92..a30e5b7fb2 100644
--- a/include/picongpu/particles/ParticlesFunctors.hpp
+++ b/include/picongpu/particles/ParticlesFunctors.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2014-2020 Rene Widera, Marco Garten, Alexander Grund,
+/* Copyright 2014-2021 Rene Widera, Marco Garten, Alexander Grund,
  *                     Heiko Burau, Axel Huebl
  *
  * This file is part of PIConGPU.
@@ -28,11 +28,10 @@
 #include <pmacc/Environment.hpp>
 #include <pmacc/communication/AsyncCommunication.hpp>
 #include <pmacc/particles/meta/FindByNameOrType.hpp>
-#include <pmacc/memory/MakeUnique.hpp>
 
 #include "picongpu/particles/traits/GetIonizerList.hpp"
-#if( PMACC_CUDA_ENABLED == 1 )
-#   include "picongpu/particles/bremsstrahlung/Bremsstrahlung.hpp"
+#if(PMACC_CUDA_ENABLED == 1)
+#    include "picongpu/particles/bremsstrahlung/Bremsstrahlung.hpp"
 #endif
 #include "picongpu/particles/traits/GetPhotonCreator.hpp"
 #include "picongpu/particles/synchrotronPhotons/SynchrotronFunctions.hpp"
@@ -49,522 +48,426 @@
 
 namespace picongpu
 {
-
-namespace particles
-{
-
-/** assign nullptr to all attributes of a species
- *
- * @tparam T_SpeciesType type or name as boost::mpl::string of the species
- */
-template<typename T_SpeciesType>
-struct AssignNull
-{
-    using SpeciesType = pmacc::particles::meta::FindByNameOrType_t<
-        VectorAllSpecies,
-        T_SpeciesType
-    >;
-    using FrameType = typename SpeciesType::FrameType;
-
-    void operator()()
+    namespace particles
     {
-        DataConnector &dc = Environment<>::get().DataConnector();
-        auto species = dc.get< SpeciesType >( FrameType::getName(), true );
-        species = nullptr;
-        dc.releaseData( FrameType::getName() );
-    }
-};
-
-/** create memory for the given species type
- *
- * @tparam T_SpeciesType type or name as boost::mpl::string of the species
- */
-template< typename T_SpeciesType >
-struct CreateSpecies
-{
-    using SpeciesType = pmacc::particles::meta::FindByNameOrType_t<
-        VectorAllSpecies,
-        T_SpeciesType
-    >;
-    using FrameType = typename SpeciesType::FrameType;
-
-    template<
-        typename T_DeviceHeap,
-        typename T_CellDescription
-    >
-    HINLINE void operator()(
-        const std::shared_ptr<T_DeviceHeap>& deviceHeap,
-        T_CellDescription* cellDesc
-    ) const
-    {
-        DataConnector &dc = Environment<>::get().DataConnector();
-        dc.consume(
-            pmacc::memory::makeUnique<SpeciesType>(
-                deviceHeap,
-                *cellDesc,
-                FrameType::getName()
-            )
-        );
-    }
-};
-
-/** write memory statistics to the terminal
- *
- * @tparam T_SpeciesType type or name as boost::mpl::string of the species
- */
-template< typename T_SpeciesType >
-struct LogMemoryStatisticsForSpecies
-{
-    using SpeciesType = pmacc::particles::meta::FindByNameOrType_t<
-        VectorAllSpecies,
-        T_SpeciesType
-    >;
-    using FrameType = typename SpeciesType::FrameType;
-
-    template<typename T_DeviceHeap>
-    HINLINE void operator()(
-        const std::shared_ptr<T_DeviceHeap>& deviceHeap
-    ) const
-    {
-#if( PMACC_CUDA_ENABLED == 1 )
-        log<picLog::MEMORY >("mallocMC: free slots for species %3%: %1% a %2%") %
-            deviceHeap->getAvailableSlots(sizeof (FrameType)) %
-            sizeof (FrameType) %
-            FrameType::getName();
+        /** assign nullptr to all attributes of a species
+         *
+         * @tparam T_SpeciesType type or name as boost::mpl::string of the species
+         */
+        template<typename T_SpeciesType>
+        struct AssignNull
+        {
+            using SpeciesType = pmacc::particles::meta::FindByNameOrType_t<VectorAllSpecies, T_SpeciesType>;
+            using FrameType = typename SpeciesType::FrameType;
+
+            void operator()()
+            {
+                DataConnector& dc = Environment<>::get().DataConnector();
+                auto species = dc.get<SpeciesType>(FrameType::getName(), true);
+                species = nullptr; // NOTE(review): this resets the local handle declared above — confirm intent
+                dc.releaseData(FrameType::getName());
+            }
+        };
+
+        /** create memory for the given species type
+         *
+         * @tparam T_SpeciesType type or name as boost::mpl::string of the species
+         */
+        template<typename T_SpeciesType>
+        struct CreateSpecies
+        {
+            using SpeciesType = pmacc::particles::meta::FindByNameOrType_t<VectorAllSpecies, T_SpeciesType>;
+            using FrameType = typename SpeciesType::FrameType;
+
+            template<typename T_DeviceHeap, typename T_CellDescription>
+            HINLINE void operator()(const std::shared_ptr<T_DeviceHeap>& deviceHeap, T_CellDescription* cellDesc) const
+            {
+                DataConnector& dc = Environment<>::get().DataConnector();
+                // the new species is handed to the DataConnector; cellDesc is dereferenced without a null check
+                dc.consume(std::make_unique<SpeciesType>(deviceHeap, *cellDesc, FrameType::getName()));
+            }
+        };
+
+        /** write memory statistics to the terminal
+         *
+         * @tparam T_SpeciesType type or name as boost::mpl::string of the species
+         */
+        template<typename T_SpeciesType>
+        struct LogMemoryStatisticsForSpecies
+        {
+            using SpeciesType = pmacc::particles::meta::FindByNameOrType_t<VectorAllSpecies, T_SpeciesType>;
+            using FrameType = typename SpeciesType::FrameType;
+
+            template<typename T_DeviceHeap>
+            HINLINE void operator()(const std::shared_ptr<T_DeviceHeap>& deviceHeap) const
+            {
+#if(BOOST_LANG_CUDA || BOOST_COMP_HIP)
+                log<picLog::MEMORY>("mallocMC: free slots for species %3%: %1% a %2%")
+                    % deviceHeap->getAvailableSlots(
+                        cupla::manager::Device<cupla::AccDev>::get().current(),
+                        cupla::manager::Stream<cupla::AccDev, cupla::AccStream>::get().stream(0),
+                        sizeof(FrameType))
+                    % sizeof(FrameType) % FrameType::getName();
 #endif
-    }
-};
-
-/** call method reset for the given species
- *
- * @tparam T_SpeciesType type or name as boost::mpl::string of the species to reset
- */
-template< typename T_SpeciesType >
-struct CallReset
-{
-    using SpeciesType = pmacc::particles::meta::FindByNameOrType_t<
-        VectorAllSpecies,
-        T_SpeciesType
-    >;
-    using FrameType = typename SpeciesType::FrameType;
-
-    HINLINE void operator()( const uint32_t currentStep )
-    {
-        DataConnector &dc = Environment<>::get().DataConnector();
-        auto species = dc.get< SpeciesType >( FrameType::getName(), true );
-        species->reset( currentStep );
-        dc.releaseData( FrameType::getName() );
-    }
-};
-
-/** Allocate helper fields for FLYlite population kinetics for atomic physics
- *
- * energy histograms, rate matrix, etc.
- *
- * @tparam T_SpeciesType type or name as boost::mpl::string of ion species
- */
-template< typename T_SpeciesType >
-struct CallPopulationKineticsInit
-{
-    using SpeciesType = pmacc::particles::meta::FindByNameOrType_t<
-        VectorAllSpecies,
-        T_SpeciesType
-    >;
-    using FrameType = typename SpeciesType::FrameType;
-
-    using PopulationKineticsSolver = typename pmacc::traits::Resolve<
-        typename GetFlagType<
-            FrameType,
-            populationKinetics<>
-        >::type
-    >::type;
-
-    HINLINE void operator()(
-        pmacc::DataSpace< simDim > gridSizeLocal
-    ) const
-    {
-        PopulationKineticsSolver flylite;
-        flylite.init( gridSizeLocal, FrameType::getName() );
-    }
-};
-
-/** Calculate FLYlite population kinetics evolving one time step
- *
- * @tparam T_SpeciesType type or name as boost::mpl::string of ion species
- */
-template< typename T_SpeciesType >
-struct CallPopulationKinetics
-{
-    using SpeciesType = pmacc::particles::meta::FindByNameOrType_t<
-        VectorAllSpecies,
-        T_SpeciesType
-    >;
-
-    using FrameType = typename SpeciesType::FrameType;
-
-    using PopulationKineticsSolver = typename pmacc::traits::Resolve<
-        typename GetFlagType<
-            FrameType,
-            populationKinetics<>
-        >::type
-    >::type;
-
-    HINLINE void operator()( uint32_t currentStep ) const
-    {
-        PopulationKineticsSolver flylite{};
-        flylite.template update< SpeciesType >(
-            FrameType::getName(),
-            currentStep
-        );
-    }
-};
-
-/** push a species
- *
- * push is only triggered for species with a pusher
- *
- * @tparam T_SpeciesType type or name as boost::mpl::string of particle species that is checked
- */
-template<typename T_SpeciesType>
-struct PushSpecies
-{
-    using SpeciesType = pmacc::particles::meta::FindByNameOrType_t<
-        VectorAllSpecies,
-        T_SpeciesType
-    >;
-    using FrameType = typename SpeciesType::FrameType;
-
-    template<typename T_EventList>
-    HINLINE void operator()(
-        const uint32_t currentStep,
-        const EventTask& eventInt,
-        T_EventList& updateEvent
-    ) const
-    {
-        DataConnector &dc = Environment<>::get().DataConnector();
-        auto species = dc.get< SpeciesType >( FrameType::getName(), true );
-
-        __startTransaction(eventInt);
-        species->update(currentStep);
-        dc.releaseData( FrameType::getName() );
-        EventTask ev = __endTransaction();
-        updateEvent.push_back(ev);
-    }
-};
-
-/** Communicate a species
- *
- * communication is only triggered for species with a pusher
- *
- * @tparam T_SpeciesType type or name as boost::mpl::string of particle species that is checked
- */
-template<typename T_SpeciesType>
-struct CommunicateSpecies
-{
-    using SpeciesType = pmacc::particles::meta::FindByNameOrType_t<
-        VectorAllSpecies,
-        T_SpeciesType
-    >;
-    using FrameType = typename SpeciesType::FrameType;
-
-    template<typename T_EventList>
-    HINLINE void operator()(
-        T_EventList& updateEventList,
-        T_EventList& commEventList
-    ) const
-    {
-        DataConnector &dc = Environment<>::get().DataConnector();
-        auto species = dc.get< SpeciesType >( FrameType::getName(), true );
-
-        EventTask updateEvent(*(updateEventList.begin()));
-
-        updateEventList.pop_front();
-        commEventList.push_back( communication::asyncCommunication(*species, updateEvent) );
-
-        dc.releaseData( FrameType::getName() );
-    }
-};
+            }
+        };
 
-/** update momentum, move and communicate all species */
-struct PushAllSpecies
-{
-    /** push and communicate all species
-     *
-     * @param currentStep current simulation step
-     * @param pushEvent[out] grouped event that marks the end of the species push
-     * @param commEvent[out] grouped event that marks the end of the species communication
-     */
-    HINLINE void operator()(
-        const uint32_t currentStep,
-        const EventTask& eventInt,
-        EventTask& pushEvent,
-        EventTask& commEvent
-    ) const
-    {
-        using EventList = std::list<EventTask>;
-        EventList updateEventList;
-        EventList commEventList;
-
-        /* push all species */
-        using VectorSpeciesWithPusher = typename pmacc::particles::traits::FilterByFlag
-        <
-            VectorAllSpecies,
-            particlePusher<>
-        >::type;
-        meta::ForEach< VectorSpeciesWithPusher, particles::PushSpecies< bmpl::_1 > > pushSpecies;
-        pushSpecies( currentStep, eventInt, updateEventList );
-
-        /* join all push events */
-        for (typename EventList::iterator iter = updateEventList.begin();
-             iter != updateEventList.end();
-             ++iter)
+        /** call method reset for the given species
+         *
+         * @tparam T_SpeciesType type or name as boost::mpl::string of the species to reset
+         */
+        template<typename T_SpeciesType>
+        struct CallReset
         {
-            pushEvent += *iter;
-        }
-
-        /* call communication for all species */
-        meta::ForEach< VectorSpeciesWithPusher, particles::CommunicateSpecies< bmpl::_1> > communicateSpecies;
-        communicateSpecies( updateEventList, commEventList );
-
-        /* join all communication events */
-        for (typename EventList::iterator iter = commEventList.begin();
-             iter != commEventList.end();
-             ++iter)
+            using SpeciesType = pmacc::particles::meta::FindByNameOrType_t<VectorAllSpecies, T_SpeciesType>;
+            using FrameType = typename SpeciesType::FrameType;
+
+            HINLINE void operator()(const uint32_t currentStep)
+            {
+                DataConnector& dc = Environment<>::get().DataConnector();
+                auto species = dc.get<SpeciesType>(FrameType::getName(), true);
+                species->reset(currentStep);
+                dc.releaseData(FrameType::getName());
+            }
+        };
+
+        /** Allocate helper fields for FLYlite population kinetics for atomic physics
+         *
+         * energy histograms, rate matrix, etc.
+         *
+         * @tparam T_SpeciesType type or name as boost::mpl::string of ion species
+         */
+        template<typename T_SpeciesType>
+        struct CallPopulationKineticsInit
         {
-            commEvent += *iter;
-        }
-    }
-};
-
-/** Call an ionization method upon an ion species
- *
- * \tparam T_SpeciesType type or name as boost::mpl::string of particle species that is going to be ionized with
- *                       ionization scheme T_SelectIonizer
- */
-template< typename T_SpeciesType, typename T_SelectIonizer >
-struct CallIonizationScheme
-{
-    using SpeciesType = pmacc::particles::meta::FindByNameOrType_t<
-        VectorAllSpecies,
-        T_SpeciesType
-    >;
-    using SelectIonizer = T_SelectIonizer;
-    using FrameType = typename SpeciesType::FrameType;
-
-    /* define the type of the species to be created
-    * from inside the ionization model specialization
-    */
-    using DestSpecies = typename SelectIonizer::DestSpecies;
-    using DestFrameType = typename DestSpecies::FrameType;
-
-    /** Functor implementation
-     *
-     * \tparam T_CellDescription contains the number of blocks and blocksize
-     *                           that is later passed to the kernel
-     * \param cellDesc logical block information like dimension and cell sizes
-     * \param currentStep The current time step
-     */
-    template<typename T_CellDescription>
-    HINLINE void operator()(
-        T_CellDescription cellDesc,
-        const uint32_t currentStep
-    ) const
-    {
-        DataConnector &dc = Environment<>::get().DataConnector();
-
-        // alias for pointer on source species
-        auto srcSpeciesPtr = dc.get< SpeciesType >( FrameType::getName(), true );
-        // alias for pointer on destination species
-        auto electronsPtr = dc.get< DestSpecies >( DestFrameType::getName(), true );
-
-        SelectIonizer selectIonizer(currentStep);
-
-        creation::createParticlesFromSpecies(*srcSpeciesPtr, *electronsPtr, selectIonizer, cellDesc);
-
-        /* fill the gaps in the created species' particle frames to ensure that only
-         * the last frame is not completely filled but every other before is full
+            using SpeciesType = pmacc::particles::meta::FindByNameOrType_t<VectorAllSpecies, T_SpeciesType>;
+            using FrameType = typename SpeciesType::FrameType;
+
+            using PopulationKineticsSolver =
+                typename pmacc::traits::Resolve<typename GetFlagType<FrameType, populationKinetics<>>::type>::type;
+
+            HINLINE void operator()(pmacc::DataSpace<simDim> gridSizeLocal) const
+            {
+                PopulationKineticsSolver flylite;
+                flylite.init(gridSizeLocal, FrameType::getName());
+            }
+        };
+
+        /** Calculate FLYlite population kinetics evolving one time step
+         *
+         * @tparam T_SpeciesType type or name as boost::mpl::string of ion species
          */
-        electronsPtr->fillAllGaps();
-
-        dc.releaseData( FrameType::getName() );
-        dc.releaseData( DestFrameType::getName() );
-
-    }
-
-};
-
-/** Call all ionization schemes of an ion species
- *
- * Tests if species can be ionized and calls the kernels to do that
- *
- * \tparam T_SpeciesType type or name as boost::mpl::string of particle species that is checked for ionization
- */
-template< typename T_SpeciesType >
-struct CallIonization
-{
-    using SpeciesType = pmacc::particles::meta::FindByNameOrType_t<
-        VectorAllSpecies,
-        T_SpeciesType
-    >;
-    using FrameType = typename SpeciesType::FrameType;
-
-    // SelectIonizer will be either the specified one or fallback: None
-    using SelectIonizerList = typename traits::GetIonizerList< SpeciesType >::type;
-
-    /** Functor implementation
-     *
-     * \tparam T_CellDescription contains the number of blocks and blocksize
-     *                           that is later passed to the kernel
-     * \param cellDesc logical block information like dimension and cell sizes
-     * \param currentStep The current time step
-     */
-    template<typename T_CellDescription>
-    HINLINE void operator()(
-        T_CellDescription cellDesc,
-        const uint32_t currentStep
-    ) const
-    {
-        DataConnector &dc = Environment<>::get().DataConnector();
-
-        // only if an ionizer has been specified, this is executed
-        using hasIonizers = typename HasFlag< FrameType, ionizers<> >::type;
-        if (hasIonizers::value)
+        template<typename T_SpeciesType>
+        struct CallPopulationKinetics
         {
-            meta::ForEach< SelectIonizerList, CallIonizationScheme< SpeciesType, bmpl::_1 > > particleIonization;
-            particleIonization( cellDesc, currentStep );
-        }
-    }
+            using SpeciesType = pmacc::particles::meta::FindByNameOrType_t<VectorAllSpecies, T_SpeciesType>;
 
-};
+            using FrameType = typename SpeciesType::FrameType;
 
-#if( PMACC_CUDA_ENABLED == 1 )
+            using PopulationKineticsSolver =
+                typename pmacc::traits::Resolve<typename GetFlagType<FrameType, populationKinetics<>>::type>::type;
 
-/** Handles the bremsstrahlung effect for electrons on ions.
- *
- * @tparam T_ElectronSpecies type or name as boost::mpl::string of electron particle species
- */
-template<typename T_ElectronSpecies>
-struct CallBremsstrahlung
-{
-    using ElectronSpecies = pmacc::particles::meta::FindByNameOrType_t<
-        VectorAllSpecies,
-        T_ElectronSpecies
-    >;
-    using ElectronFrameType = typename ElectronSpecies::FrameType;
-
-    using IonSpecies = pmacc::particles::meta::FindByNameOrType_t<
-        VectorAllSpecies,
-        typename pmacc::particles::traits::ResolveAliasFromSpecies<
-            ElectronSpecies,
-            bremsstrahlungIons<>
-        >::type
-    >;
-    using PhotonSpecies = pmacc::particles::meta::FindByNameOrType_t<
-        VectorAllSpecies,
-        typename pmacc::particles::traits::ResolveAliasFromSpecies<
-            ElectronSpecies,
-            bremsstrahlungPhotons<>
-        >::type
-    >;
-    using PhotonFrameType = typename PhotonSpecies::FrameType;
-    using BremsstrahlungFunctor = bremsstrahlung::Bremsstrahlung<
-        IonSpecies,
-        ElectronSpecies,
-        PhotonSpecies
-    >;
-
-    /** Functor implementation
-     *
-     * \tparam T_CellDescription contains the number of blocks and blocksize
-     *                           that is later passed to the kernel
-     * \param cellDesc logical block information like dimension and cell sizes
-     * \param currentStep the current time step
-     */
-    template<typename T_CellDescription, typename ScaledSpectrumMap>
-    HINLINE void operator()(
-        T_CellDescription cellDesc,
-        const uint32_t currentStep,
-        const ScaledSpectrumMap& scaledSpectrumMap,
-        const bremsstrahlung::GetPhotonAngle& photonAngle
-    ) const
-    {
-        DataConnector &dc = Environment<>::get().DataConnector();
+            HINLINE void operator()(uint32_t currentStep) const
+            {
+                PopulationKineticsSolver flylite{};
+                flylite.template update<SpeciesType>(FrameType::getName(), currentStep);
+            }
+        };
 
-        /* alias for pointer on source species */
-        auto electronSpeciesPtr = dc.get< ElectronSpecies >( ElectronFrameType::getName(), true );
-        /* alias for pointer on destination species */
-        auto photonSpeciesPtr = dc.get< PhotonSpecies >( PhotonFrameType::getName(), true );
+        /** Push a single species
+         *
+         * Only triggered for species with a pusher (see PushAllSpecies).
+         *
+         * @tparam T_SpeciesType type or name as boost::mpl::string of the particle species to push
+         */
+        template<typename T_SpeciesType>
+        struct PushSpecies
+        {
+            using SpeciesType = pmacc::particles::meta::FindByNameOrType_t<VectorAllSpecies, T_SpeciesType>;
+            using FrameType = typename SpeciesType::FrameType;
+
+            template<typename T_EventList>
+            HINLINE void operator()(const uint32_t currentStep, const EventTask& eventInt, T_EventList& updateEvent)
+                const
+            {
+                DataConnector& dc = Environment<>::get().DataConnector();
+                auto species = dc.get<SpeciesType>(FrameType::getName(), true);
+                // the update runs inside a transaction started on eventInt; its end event is handed to the caller
+                __startTransaction(eventInt);
+                species->update(currentStep);
+                dc.releaseData(FrameType::getName());
+                EventTask ev = __endTransaction();
+                updateEvent.push_back(ev);
+            }
+        };
+
+        /** Communicate a species
+         *
+         * Only triggered for species with a pusher; consumes the front entry of the update event list.
+         *
+         * @tparam T_SpeciesType type or name as boost::mpl::string of the particle species to communicate
+         */
+        template<typename T_SpeciesType>
+        struct CommunicateSpecies
+        {
+            using SpeciesType = pmacc::particles::meta::FindByNameOrType_t<VectorAllSpecies, T_SpeciesType>;
+            using FrameType = typename SpeciesType::FrameType;
 
-        const float_X targetZ = GetAtomicNumbers<IonSpecies>::type::numberOfProtons;
+            template<typename T_EventList>
+            HINLINE void operator()(T_EventList& updateEventList, T_EventList& commEventList) const
+            {
+                DataConnector& dc = Environment<>::get().DataConnector();
+                auto species = dc.get<SpeciesType>(FrameType::getName(), true);
 
-        using namespace bremsstrahlung;
-        BremsstrahlungFunctor bremsstrahlungFunctor(
-            scaledSpectrumMap.at(targetZ).getScaledSpectrumFunctor(),
-            scaledSpectrumMap.at(targetZ).getStoppingPowerFunctor(),
-            photonAngle.getPhotonAngleFunctor(),
-            currentStep);
+                EventTask updateEvent(*(updateEventList.begin()));
 
-        creation::createParticlesFromSpecies(*electronSpeciesPtr, *photonSpeciesPtr, bremsstrahlungFunctor, cellDesc);
+                updateEventList.pop_front();
+                commEventList.push_back(communication::asyncCommunication(*species, updateEvent));
 
-        dc.releaseData( ElectronFrameType::getName() );
-        dc.releaseData( PhotonFrameType::getName() );
-    }
+                dc.releaseData(FrameType::getName());
+            }
+        };
 
-};
+        /** update momentum, move and communicate all species */
+        struct PushAllSpecies
+        {
+            /** push and communicate all species
+             *
+             * @param currentStep current simulation step
+             * @param eventInt event handed to the push of every species, which starts its transaction on it
+             * @param[in,out] pushEvent grouped event, the end-of-push events of all species are added to it
+             * @param[in,out] commEvent grouped event, the communication events of all species are added to it
+             */
+            HINLINE void operator()(
+                const uint32_t currentStep,
+                const EventTask& eventInt,
+                EventTask& pushEvent,
+                EventTask& commEvent) const
+            {
+                using EventList = std::list<EventTask>;
+                using VectorSpeciesWithPusher =
+                    typename pmacc::particles::traits::FilterByFlag<VectorAllSpecies, particlePusher<>>::type;
+
+                /* push all species and join their end-of-push events into pushEvent */
+                EventList updateEventList;
+                meta::ForEach<VectorSpeciesWithPusher, particles::PushSpecies<bmpl::_1>> pushSpecies;
+                pushSpecies(currentStep, eventInt, updateEventList);
+                for(EventTask const& event : updateEventList)
+                {
+                    pushEvent += event;
+                }
+
+                /* communicate all species and join their communication events into commEvent;
+                 * CommunicateSpecies pops one entry of updateEventList per species, therefore the
+                 * push events had to be joined into pushEvent before this point
+                 */
+                EventList commEventList;
+                meta::ForEach<VectorSpeciesWithPusher, particles::CommunicateSpecies<bmpl::_1>> communicateSpecies;
+                communicateSpecies(updateEventList, commEventList);
+                for(EventTask const& event : commEventList)
+                {
+                    commEvent += event;
+                }
+            }
+        };
+
+        /** Call an ionization method upon an ion species
+         *
+         * @tparam T_SpeciesType type or name as boost::mpl::string of particle species that is going to be ionized
+         * @tparam T_SelectIonizer ionization scheme applied to the species, it defines the created DestSpecies
+         */
+        template<typename T_SpeciesType, typename T_SelectIonizer>
+        struct CallIonizationScheme
+        {
+            using SpeciesType = pmacc::particles::meta::FindByNameOrType_t<VectorAllSpecies, T_SpeciesType>;
+            using SelectIonizer = T_SelectIonizer;
+            using FrameType = typename SpeciesType::FrameType;
+
+            /* define the type of the species to be created
+             * from inside the ionization model specialization
+             */
+            using DestSpecies = typename SelectIonizer::DestSpecies;
+            using DestFrameType = typename DestSpecies::FrameType;
+
+            /** Functor implementation
+             *
+             * @tparam T_CellDescription contains the number of blocks and blocksize
+             *                           that is later passed to the kernel
+             * @param cellDesc logical block information like dimension and cell sizes
+             * @param currentStep The current time step
+             */
+            template<typename T_CellDescription>
+            HINLINE void operator()(T_CellDescription cellDesc, const uint32_t currentStep) const
+            {
+                DataConnector& dc = Environment<>::get().DataConnector();
+
+                // alias for pointer on source species
+                auto srcSpeciesPtr = dc.get<SpeciesType>(FrameType::getName(), true);
+                // alias for pointer on destination species
+                auto electronsPtr = dc.get<DestSpecies>(DestFrameType::getName(), true);
+
+                SelectIonizer selectIonizer(currentStep);
+
+                creation::createParticlesFromSpecies(*srcSpeciesPtr, *electronsPtr, selectIonizer, cellDesc);
+
+                /* fill the gaps in the created species' particle frames to ensure that only
+                 * the last frame is not completely filled but every other before is full
+                 */
+                electronsPtr->fillAllGaps();
+
+                dc.releaseData(FrameType::getName());
+                dc.releaseData(DestFrameType::getName());
+            }
+        };
+
+        /** Call all ionization schemes of an ion species
+         *
+         * Tests if species can be ionized and calls the kernels to do that
+         *
+         * @tparam T_SpeciesType type or name as boost::mpl::string of particle species that is checked for ionization
+         */
+        template<typename T_SpeciesType>
+        struct CallIonization
+        {
+            using SpeciesType = pmacc::particles::meta::FindByNameOrType_t<VectorAllSpecies, T_SpeciesType>;
+            using FrameType = typename SpeciesType::FrameType;
+
+            // SelectIonizer will be either the specified one or fallback: None
+            using SelectIonizerList = typename traits::GetIonizerList<SpeciesType>::type;
+
+            /** Functor implementation
+             *
+             * @tparam T_CellDescription contains the number of blocks and blocksize
+             *                           that is later passed to the kernel
+             * @param cellDesc logical block information like dimension and cell sizes
+             * @param currentStep The current time step
+             */
+            template<typename T_CellDescription>
+            HINLINE void operator()(T_CellDescription cellDesc, const uint32_t currentStep) const
+            {
+                /* only if an ionizer has been specified, this is executed:
+                 * every scheme in SelectIonizerList is called via CallIonizationScheme
+                 */
+                using hasIonizers = typename HasFlag<FrameType, ionizers<>>::type;
+                if(hasIonizers::value)
+                {
+                    meta::ForEach<SelectIonizerList, CallIonizationScheme<SpeciesType, bmpl::_1>> particleIonization;
+                    particleIonization(cellDesc, currentStep);
+                }
+            }
+        };
+
+#if(PMACC_CUDA_ENABLED == 1)
+
+        /** Handles the bremsstrahlung effect for electrons on ions.
+         *
+         * @tparam T_ElectronSpecies type or name as boost::mpl::string of electron particle species
+         */
+        template<typename T_ElectronSpecies>
+        struct CallBremsstrahlung
+        {
+            using ElectronSpecies = pmacc::particles::meta::FindByNameOrType_t<VectorAllSpecies, T_ElectronSpecies>;
+            using ElectronFrameType = typename ElectronSpecies::FrameType;
+
+            using IonSpecies = pmacc::particles::meta::FindByNameOrType_t<
+                VectorAllSpecies,
+                typename pmacc::particles::traits::ResolveAliasFromSpecies<ElectronSpecies, bremsstrahlungIons<>>::
+                    type>;
+            using PhotonSpecies = pmacc::particles::meta::FindByNameOrType_t<
+                VectorAllSpecies,
+                typename pmacc::particles::traits::ResolveAliasFromSpecies<ElectronSpecies, bremsstrahlungPhotons<>>::
+                    type>;
+            using PhotonFrameType = typename PhotonSpecies::FrameType;
+            using BremsstrahlungFunctor = bremsstrahlung::Bremsstrahlung<IonSpecies, ElectronSpecies, PhotonSpecies>;
+
+            /** Functor implementation
+             *
+             * @tparam T_CellDescription contains the number of blocks and blocksize passed on to the kernel
+             * @tparam ScaledSpectrumMap map type, looked up with at() by the proton number of the ion species
+             * @param cellDesc logical block information like dimension and cell sizes
+             * @param currentStep the current time step
+             * @param scaledSpectrumMap provides the scaled spectrum and stopping power functors of the ions
+             * @param photonAngle provides the photon angle functor
+             */
+            template<typename T_CellDescription, typename ScaledSpectrumMap>
+            HINLINE void operator()(
+                T_CellDescription cellDesc,
+                const uint32_t currentStep,
+                const ScaledSpectrumMap& scaledSpectrumMap,
+                const bremsstrahlung::GetPhotonAngle& photonAngle) const
+            {
+                DataConnector& dc = Environment<>::get().DataConnector();
+
+                auto electronSpeciesPtr = dc.get<ElectronSpecies>(ElectronFrameType::getName(), true); // source
+                auto photonSpeciesPtr = dc.get<PhotonSpecies>(PhotonFrameType::getName(), true); // destination
+
+                const float_X targetZ = GetAtomicNumbers<IonSpecies>::type::numberOfProtons;
+
+                using namespace bremsstrahlung;
+                BremsstrahlungFunctor bremsstrahlungFunctor(
+                    scaledSpectrumMap.at(targetZ).getScaledSpectrumFunctor(),
+                    scaledSpectrumMap.at(targetZ).getStoppingPowerFunctor(),
+                    photonAngle.getPhotonAngleFunctor(),
+                    currentStep);
+
+                creation::createParticlesFromSpecies(
+                    *electronSpeciesPtr,
+                    *photonSpeciesPtr,
+                    bremsstrahlungFunctor,
+                    cellDesc);
+
+                dc.releaseData(ElectronFrameType::getName());
+                dc.releaseData(PhotonFrameType::getName());
+            }
+        };
 #endif
 
-/** Handles the synchrotron radiation emission of photons from electrons
- *
- * @tparam T_ElectronSpecies type or name as boost::mpl::string of electron particle species
- */
-template<typename T_ElectronSpecies>
-struct CallSynchrotronPhotons
-{
-    using ElectronSpecies = pmacc::particles::meta::FindByNameOrType_t<
-        VectorAllSpecies,
-        T_ElectronSpecies
-    >;
-    using ElectronFrameType = typename ElectronSpecies::FrameType;
-
-    /* SelectedPhotonCreator will be either PhotonCreator or fallback: CreatorBase */
-    using SelectedPhotonCreator = typename traits::GetPhotonCreator< ElectronSpecies >::type;
-    using PhotonSpecies = typename SelectedPhotonCreator::PhotonSpecies;
-    using PhotonFrameType = typename PhotonSpecies::FrameType;
-
-    /** Functor implementation
-     *
-     * \tparam T_CellDescription contains the number of blocks and blocksize
-     *                           that is later passed to the kernel
-     * \param cellDesc logical block information like dimension and cell sizes
-     * \param currentStep The current time step
-     * \param synchrotronFunctions synchrotron functions wrapper object
-     */
-    template<typename T_CellDescription>
-    HINLINE void operator()(
-        T_CellDescription cellDesc,
-        const uint32_t currentStep,
-        const synchrotronPhotons::SynchrotronFunctions& synchrotronFunctions
-    ) const
-    {
-        DataConnector &dc = Environment<>::get().DataConnector();
-
-        /* alias for pointer on source species */
-        auto electronSpeciesPtr = dc.get< ElectronSpecies >( ElectronFrameType::getName(), true );
-        /* alias for pointer on destination species */
-        auto photonSpeciesPtr = dc.get< PhotonSpecies >( PhotonFrameType::getName(), true );
-
-        using namespace synchrotronPhotons;
-        SelectedPhotonCreator photonCreator(
-            synchrotronFunctions.getCursor(SynchrotronFunctions::first),
-            synchrotronFunctions.getCursor(SynchrotronFunctions::second));
-
-        creation::createParticlesFromSpecies(*electronSpeciesPtr, *photonSpeciesPtr, photonCreator, cellDesc);
-
-        dc.releaseData( ElectronFrameType::getName() );
-        dc.releaseData( PhotonFrameType::getName() );
-    }
-
-};
-
-} // namespace particles
+        /** Handles the synchrotron radiation emission of photons from electrons
+         *
+         * @tparam T_ElectronSpecies type or name as boost::mpl::string of electron particle species
+         */
+        template<typename T_ElectronSpecies>
+        struct CallSynchrotronPhotons
+        {
+            using ElectronSpecies = pmacc::particles::meta::FindByNameOrType_t<VectorAllSpecies, T_ElectronSpecies>;
+            using ElectronFrameType = typename ElectronSpecies::FrameType;
+
+            /* SelectedPhotonCreator will be either PhotonCreator or fallback: CreatorBase */
+            using SelectedPhotonCreator = typename traits::GetPhotonCreator<ElectronSpecies>::type;
+            using PhotonSpecies = typename SelectedPhotonCreator::PhotonSpecies;
+            using PhotonFrameType = typename PhotonSpecies::FrameType;
+
+            /** Functor implementation
+             *
+             * @tparam T_CellDescription contains the number of blocks and blocksize
+             *                           that is later passed to the kernel
+             * @param cellDesc logical block information like dimension and cell sizes
+             * @param currentStep The current time step
+             * @param synchrotronFunctions synchrotron functions wrapper object
+             */
+            template<typename T_CellDescription>
+            HINLINE void operator()(
+                T_CellDescription cellDesc,
+                const uint32_t currentStep,
+                const synchrotronPhotons::SynchrotronFunctions& synchrotronFunctions) const
+            {
+                DataConnector& dc = Environment<>::get().DataConnector();
+
+                /* alias for pointer on source species (electrons) */
+                auto electronSpeciesPtr = dc.get<ElectronSpecies>(ElectronFrameType::getName(), true);
+                /* alias for pointer on destination species (photons) */
+                auto photonSpeciesPtr = dc.get<PhotonSpecies>(PhotonFrameType::getName(), true);
+
+                using namespace synchrotronPhotons;
+                SelectedPhotonCreator photonCreator(
+                    synchrotronFunctions.getCursor(SynchrotronFunctions::first),
+                    synchrotronFunctions.getCursor(SynchrotronFunctions::second));
+
+                creation::createParticlesFromSpecies(*electronSpeciesPtr, *photonSpeciesPtr, photonCreator, cellDesc);
+
+                dc.releaseData(ElectronFrameType::getName());
+                dc.releaseData(PhotonFrameType::getName());
+            }
+        };
+
+    } // namespace particles
 } // namespace picongpu
diff --git a/include/picongpu/particles/ParticlesInit.kernel b/include/picongpu/particles/ParticlesInit.kernel
index a6673db7cc..0893ef933e 100644
--- a/include/picongpu/particles/ParticlesInit.kernel
+++ b/include/picongpu/particles/ParticlesInit.kernel
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Heiko Burau, Rene Widera, Felix Schmitt
+/* Copyright 2013-2021 Axel Huebl, Heiko Burau, Rene Widera, Felix Schmitt
  *
  * This file is part of PIConGPU.
  *
@@ -37,35 +37,29 @@
 
 namespace picongpu
 {
-
-    /** returns the particle density.
+    /** Return physical particle density value for the given cell
+     *
+     * That is, the number density of real (physical, not macro-) particles,
+     * in PIC units of length**-3 (i.e. per unit volume).
+     * Takes into account base density and density ratio of the species.
+     * The resulting density is assumed constant inside a cell.
      *
-     *  That means:
-     *     The REAL number of particles density in units of volume**-3,
-     *       normed to UNIT_LENGHT**3
-     *     That is NOT the species' macro particle density.
+     * @tparam T_Species particle species type
+     * @tparam T_DensityProfile density functor type,
+     *                          follows densityProfiles::IProfile concept
      *
-     * @param offset The gpu offset (left top front cell in 3D)
-     * @param cellIdx the current cell on this gpu
-     * @return a float_X which stands for the real number of particles per volume
+     * @param totalCellOffset total offset from the start of the global
+     *                        simulation area, including all slides [in cells]
      */
-    template<
-        typename T_Species,
-        typename T_DensityProfile
-    >
-    DINLINE float_X calcRealDensity(
-        T_DensityProfile & densityFunctor,
-        DataSpace< simDim > const & totalGpuCellIdx
-    )
+    template<typename T_Species, typename T_DensityProfile>
+    DINLINE float_X calcRealDensity(T_DensityProfile& densityFunctor, DataSpace<simDim> const& totalCellOffset)
     {
         PMACC_CASSERT_MSG(
             Please_deselect_densityProfileInitMethod_for_your_species_or_set_BASE_DENSITY_to_a_value_greater_than_0,
-            BASE_DENSITY > float_X( 0.0 )
-        );
-
-        float_X const densityRatioOfSpecies = traits::GetDensityRatio< T_Species >::type::getValue( );
+            BASE_DENSITY > float_X(0.0));
 
-        float_X const value = densityFunctor( totalGpuCellIdx ) * BASE_DENSITY * densityRatioOfSpecies;
+        float_X const densityRatioOfSpecies = traits::GetDensityRatio<T_Species>::type::getValue();
+        float_X const value = densityFunctor(totalCellOffset) * BASE_DENSITY * densityRatioOfSpecies;
         return value;
     }
 
@@ -74,10 +68,7 @@ namespace picongpu
      * @tparam T_numWorkers number of workers
      * @tparam T_Species picongpu::Particles, species type which is initialized
      */
-    template<
-        uint32_t T_numWorkers,
-        typename T_Species
-    >
+    template<uint32_t T_numWorkers, typename T_Species>
     struct KernelFillGridWithParticles
     {
         /** fill supercell grid with particles
@@ -103,270 +94,163 @@ namespace picongpu
             typename T_PositionFunctor,
             typename T_ParBox,
             typename T_Mapping,
-            typename T_Acc
-        >
-        DINLINE void
-        operator()(
-            T_Acc const & acc,
+            typename T_Acc>
+        DINLINE void operator()(
+            T_Acc const& acc,
             T_DensityProfile densityFunctor,
             T_PositionFunctor positionFunctor,
-            DataSpace< simDim > const totalGpuCellOffset,
+            DataSpace<simDim> const totalGpuCellOffset,
             T_ParBox pb,
-            T_Mapping mapper
-        ) const
+            T_Mapping mapper) const
         {
             using namespace mappings::threads;
 
-            constexpr uint32_t frameSize = pmacc::math::CT::volume< SuperCellSize >::type::value;
-            PMACC_CONSTEXPR_CAPTURE uint32_t cellsPerSupercell = pmacc::math::CT::volume< SuperCellSize >::type::value;
+            constexpr uint32_t frameSize = pmacc::math::CT::volume<SuperCellSize>::type::value;
+            PMACC_CONSTEXPR_CAPTURE uint32_t cellsPerSupercell = pmacc::math::CT::volume<SuperCellSize>::type::value;
             constexpr uint32_t numWorkers = T_numWorkers;
 
-            uint32_t const workerIdx = threadIdx.x;
+            uint32_t const workerIdx = cupla::threadIdx(acc).x;
 
             using FramePtr = typename T_ParBox::FramePtr;
             using FrameType = typename T_ParBox::FrameType;
             using ParticleType = typename FrameType::ParticleType;
-            DataSpace< simDim > const superCells( mapper.getGridSuperCells( ) );
-
-            PMACC_SMEM(
-                acc,
-                frame,
-                FramePtr
-            );
-            PMACC_SMEM(
-                acc,
-                finished,
-                int
-            );
-
-            DataSpace< simDim > const superCellIdx(
-                mapper.getSuperCellIndex( DataSpace<simDim >( blockIdx ) )
-            );
+            DataSpace<simDim> const superCells(mapper.getGridSuperCells());
+
+            PMACC_SMEM(acc, frame, FramePtr);
+            PMACC_SMEM(acc, finished, int);
+
+            DataSpace<simDim> const superCellIdx(mapper.getSuperCellIndex(DataSpace<simDim>(cupla::blockIdx(acc))));
 
             /* offset of the superCell relative to the local domain [in supercells] (without guarding supercells) */
-            DataSpace< simDim > const localSuperCellOffset(
-                superCellIdx - mapper.getGuardingSuperCells()
-            );
+            DataSpace<simDim> const localSuperCellOffset(superCellIdx - mapper.getGuardingSuperCells());
 
-            using ParticleDomCfg = IdxConfig<
-                frameSize,
-                numWorkers
-            >;
+            using ParticleDomCfg = IdxConfig<frameSize, numWorkers>;
 
-            using SuperCellDomCfg = IdxConfig<
-                cellsPerSupercell,
-                numWorkers
-            >;
+            using SuperCellDomCfg = IdxConfig<cellsPerSupercell, numWorkers>;
 
-            ForEachIdx< ParticleDomCfg > forEachParticle( workerIdx );
+            ForEachIdx<ParticleDomCfg> forEachParticle(workerIdx);
 
             /* number of particles to create for each cell (virtual worker) */
-            memory::CtxArray<
-                uint32_t,
-                SuperCellDomCfg
-            >
-            numParsPerCellCtx( 0 );
+            memory::CtxArray<uint32_t, SuperCellDomCfg> numParsPerCellCtx(0);
 
             /* create for each virtual thread a position functor instance */
             memory::CtxArray<
-                decltype(
-                    positionFunctor(
-                        acc,
-                        alpaka::core::declval< DataSpace< simDim > const >( ),
-                        /* cellsPerSupercell is used because each virtual worker
-                         * is creating **exactly one** functor
-                         */
-                        alpaka::core::declval< WorkerCfg< cellsPerSupercell > const >( )
-                    )
-                ),
-                SuperCellDomCfg
-            >
-            positionFunctorCtx{ };
+                decltype(positionFunctor(
+                    acc,
+                    alpaka::core::declval<DataSpace<simDim> const>(),
+                    /* cellsPerSupercell is used because each virtual worker
+                     * is creating **exactly one** functor
+                     */
+                    alpaka::core::declval<WorkerCfg<cellsPerSupercell> const>())),
+                SuperCellDomCfg>
+                positionFunctorCtx{};
 
 
-            ForEachIdx<
-                IdxConfig<
-                    1,
-                    numWorkers
-                >
-            > onlyMaster{ workerIdx };
+            ForEachIdx<IdxConfig<1, numWorkers>> onlyMaster{workerIdx};
 
             /* reset shared memory flag if a virtual worker needs to create a particle */
-            onlyMaster(
-                [&](
-                    uint32_t const,
-                    uint32_t const
-                )
-                {
-                    finished = 1;
-                }
-            );
-
-            __syncthreads();
+            onlyMaster([&](uint32_t const, uint32_t const) { finished = 1; });
 
-            // initialize the position functor for each cell in the supercell
-            ForEachIdx<
-                IdxConfig<
-                    cellsPerSupercell,
-                    numWorkers
-                >
-            >{ workerIdx }(
-                [&](
-                    uint32_t const linearIdx,
-                    uint32_t const idx
-                )
-                {
-                    /* cell index within the superCell */
-                    DataSpace< simDim > const cellIdx = DataSpaceOperations< simDim >::template map< SuperCellSize >( linearIdx );
-
-                    /* cell offset to the begin of the simulation */
-                    DataSpace< simDim > const totalCellOffset =
-                        totalGpuCellOffset +
-                        localSuperCellOffset * SuperCellSize::toRT() +
-                        cellIdx;
-                    float_X const realDensity = calcRealDensity< T_Species >(
-                        densityFunctor,
-                        totalCellOffset
-                    );
-
-                    /** @bug volatile is required for CUDA 9.2 and sm_60 else the compiler will
-                     * optimize out `if(realParticlesPerCell > 0.0_X)` later on.
-                     */
-                    volatile float_X const realParticlesPerCell = realDensity * CELL_VOLUME;
+            cupla::__syncthreads(acc);
 
-                    // create an independent position functor for each cell in the supercell
-                    positionFunctorCtx[ idx ] = positionFunctor(
+            // initialize the position functor for each cell in the supercell
+            ForEachIdx<IdxConfig<cellsPerSupercell, numWorkers>>{
+                workerIdx}([&](uint32_t const linearIdx, uint32_t const idx) {
+                /* cell index within the superCell */
+                DataSpace<simDim> const cellIdx = DataSpaceOperations<simDim>::template map<SuperCellSize>(linearIdx);
+
+                /* cell offset to the begin of the simulation */
+                DataSpace<simDim> const totalCellOffset
+                    = totalGpuCellOffset + localSuperCellOffset * SuperCellSize::toRT() + cellIdx;
+                float_X const realDensity = calcRealDensity<T_Species>(densityFunctor, totalCellOffset);
+
+                /** @bug volatile is required for CUDA 9.2 and sm_60 else the compiler will
+                 * optimize out `if(realParticlesPerCell > 0.0_X)` later on.
+                 */
+                volatile float_X const realParticlesPerCell = realDensity * CELL_VOLUME;
+
+                // create an independent position functor for each cell in the supercell
+                positionFunctorCtx[idx]
+                    = positionFunctor(acc, localSuperCellOffset, WorkerCfg<cellsPerSupercell>{linearIdx});
+
+                if(realParticlesPerCell > 0.0_X)
+                    numParsPerCellCtx[idx]
+                        = positionFunctorCtx[idx].template numberOfMacroParticles<ParticleType>(realParticlesPerCell);
+
+                if(numParsPerCellCtx[idx] > 0)
+                    nvidia::atomicAllExch(
                         acc,
-                        localSuperCellOffset,
-                        WorkerCfg< cellsPerSupercell >{ linearIdx }
-                    );
+                        &finished,
+                        0,
+                        ::alpaka::hierarchy::Threads{}); // one or more cells have particles to create
 
-                    if(realParticlesPerCell > 0.0_X)
-                        numParsPerCellCtx[ idx ] =
-                            positionFunctorCtx[ idx ].template numberOfMacroParticles< ParticleType >( realParticlesPerCell );
+                return numParsPerCellCtx[idx];
+            });
 
-                    if( numParsPerCellCtx[ idx ] > 0 )
-                        nvidia::atomicAllExch(
-                            acc,
-                            &finished,
-                            0,
-                            ::alpaka::hierarchy::Threads{}
-                        ); //one or more cells have particles to create
+            cupla::__syncthreads(acc);
 
-                    return numParsPerCellCtx[ idx ];
-                }
-            );
-
-            __syncthreads();
-
-            if( finished == 1 )
+            if(finished == 1)
                 return; // if there is no particle which has to be created
 
-            onlyMaster(
-                [&](
-                    uint32_t const,
-                    uint32_t const
-                )
-                {
-                    frame = pb.getEmptyFrame();
-                    pb.setAsLastFrame(
-                        acc,
-                        frame,
-                        superCellIdx
-                    );
-                }
-            );
+            onlyMaster([&](uint32_t const, uint32_t const) {
+                frame = pb.getEmptyFrame(acc);
+                pb.setAsLastFrame(acc, frame, superCellIdx);
+            });
 
             // distribute the particles within the cell
             do
             {
                 // wait that master updates the current used frame
-                __syncthreads();
+                cupla::__syncthreads(acc);
 
-                onlyMaster(
-                    [&](
-                        uint32_t const,
-                        uint32_t const
-                    )
-                    {
-                        finished = 1;
-                    }
-                );
+                onlyMaster([&](uint32_t const, uint32_t const) { finished = 1; });
 
-                __syncthreads();
+                cupla::__syncthreads(acc);
 
-                forEachParticle(
-                    [&](
-                        uint32_t const linearIdx,
-                        uint32_t const idx
-                    )
+                forEachParticle([&](uint32_t const linearIdx, uint32_t const idx) {
+                    if(numParsPerCellCtx[idx] > 0u)
                     {
-                        if( numParsPerCellCtx[ idx ] > 0u )
+                        auto particle = frame[linearIdx];
+
+                        /** we now initialize all attributes of the new particle to their default values
+                         *   some attributes, such as the position, localCellIdx, weighting or the
+                         *   multiMask (\see AttrToIgnore) of the particle will be set individually
+                         *   in the following lines since they are already known at this point.
+                         */
                         {
-                            auto particle = frame[ linearIdx ];
-
-                            /** we now initialize all attributes of the new particle to their default values
-                             *   some attributes, such as the position, localCellIdx, weighting or the
-                             *   multiMask (\see AttrToIgnore) of the particle will be set individually
-                             *   in the following lines since they are already known at this point.
-                             */
-                            {
-                                using ParticleAttrList = typename FrameType::ValueTypeSeq;
-                                using AttrToIgnore = bmpl::vector4<
-                                    position<>,
-                                    multiMask,
-                                    localCellIdx,
-                                    weighting
-                                >;
-                                using ParticleCleanedAttrList = typename ResolveAndRemoveFromSeq<
-                                    ParticleAttrList,
-                                    AttrToIgnore
-                                >::type;
-
-                                meta::ForEach<
-                                    ParticleCleanedAttrList,
-                                    SetAttributeToDefault< bmpl::_1 >
-                                > setToDefault;
-                                setToDefault( particle );
-                            }
-                            particle[ multiMask_ ] = 1;
-                            particle[ localCellIdx_ ] = linearIdx;
-                            // initialize position and weighting
-                            positionFunctorCtx[ idx ]( acc, particle );
-
-                            numParsPerCellCtx[ idx ]--;
-                            if( numParsPerCellCtx[ idx ] > 0 )
-                                nvidia::atomicAllExch(
-                                    acc,
-                                    &finished,
-                                    0,
-                                    ::alpaka::hierarchy::Threads{}
-                                ); // one or more cells have particles to create
+                            using ParticleAttrList = typename FrameType::ValueTypeSeq;
+                            using AttrToIgnore = bmpl::vector4<position<>, multiMask, localCellIdx, weighting>;
+                            using ParticleCleanedAttrList =
+                                typename ResolveAndRemoveFromSeq<ParticleAttrList, AttrToIgnore>::type;
+
+                            meta::ForEach<ParticleCleanedAttrList, SetAttributeToDefault<bmpl::_1>> setToDefault;
+                            setToDefault(particle);
                         }
+                        particle[multiMask_] = 1;
+                        particle[localCellIdx_] = linearIdx;
+                        // initialize position and weighting
+                        positionFunctorCtx[idx](acc, particle);
+
+                        numParsPerCellCtx[idx]--;
+                        if(numParsPerCellCtx[idx] > 0)
+                            nvidia::atomicAllExch(
+                                acc,
+                                &finished,
+                                0,
+                                ::alpaka::hierarchy::Threads{}); // one or more cells have particles to create
                     }
-                );
+                });
 
-                __syncthreads();
+                cupla::__syncthreads(acc);
 
-                onlyMaster(
-                    [&](
-                        uint32_t const,
-                        uint32_t const
-                    )
+                onlyMaster([&](uint32_t const, uint32_t const) {
+                    if(finished == 0)
                     {
-                        if( finished == 0 )
-                        {
-                            frame = pb.getEmptyFrame();
-                            pb.setAsLastFrame(
-                                acc,
-                                frame,
-                                superCellIdx
-                            );
-                        }
+                        frame = pb.getEmptyFrame(acc);
+                        pb.setAsLastFrame(acc, frame, superCellIdx);
                     }
-                );
-            }
-            while( finished == 0 );
+                });
+            } while(finished == 0);
         }
     };
 
diff --git a/include/picongpu/particles/access/Cell2Particle.hpp b/include/picongpu/particles/access/Cell2Particle.hpp
index c655dba172..48a8c62b61 100644
--- a/include/picongpu/particles/access/Cell2Particle.hpp
+++ b/include/picongpu/particles/access/Cell2Particle.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera
+/* Copyright 2013-2021 Heiko Burau, Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -28,34 +28,41 @@
 
 namespace picongpu
 {
-namespace particleAccess
-{
-
-#define TEMPLATE_ARGS(Z, N, _) typename Arg ## N
-#define NORMAL_ARGS(Z, N, _) Arg ## N arg ## N
+    namespace particleAccess
+    {
+#define TEMPLATE_ARGS(Z, N, _) typename Arg##N
+#define NORMAL_ARGS(Z, N, _) Arg##N arg##N
 
-#define CELL2PARTICLE_OPERATOR(Z, N, _) \
-    template<typename T_Acc, typename TParticlesBox, typename CellIndex, typename Functor, typename T_Filter BOOST_PP_ENUM_TRAILING(N, TEMPLATE_ARGS, _)> \
-    DINLINE void operator()(T_Acc const & acc, TParticlesBox pb, const uint32_t workerIdx, const CellIndex& cellIndex, Functor functor, T_Filter filter BOOST_PP_ENUM_TRAILING(N, NORMAL_ARGS, _)); \
+#define CELL2PARTICLE_OPERATOR(Z, N, _)                                                                               \
+    template<                                                                                                         \
+        typename T_Acc,                                                                                               \
+        typename TParticlesBox,                                                                                       \
+        typename CellIndex,                                                                                           \
+        typename Functor,                                                                                             \
+        typename T_Filter BOOST_PP_ENUM_TRAILING(N, TEMPLATE_ARGS, _)>                                                \
+    DINLINE void operator()(                                                                                          \
+        T_Acc const& acc,                                                                                             \
+        TParticlesBox pb,                                                                                             \
+        const uint32_t workerIdx,                                                                                     \
+        const CellIndex& cellIndex,                                                                                   \
+        Functor functor,                                                                                              \
+        T_Filter filter BOOST_PP_ENUM_TRAILING(N, NORMAL_ARGS, _));
 
 
-template<
-    typename SuperCellSize,
-    uint32_t T_numWorkers
->
-struct Cell2Particle
-{
-    using result_type = void;
-    static constexpr uint32_t numWorkers = T_numWorkers;
+        template<typename SuperCellSize, uint32_t T_numWorkers>
+        struct Cell2Particle
+        {
+            using result_type = void;
+            static constexpr uint32_t numWorkers = T_numWorkers;
 
-    BOOST_PP_REPEAT(5, CELL2PARTICLE_OPERATOR, _)
-};
+            BOOST_PP_REPEAT(5, CELL2PARTICLE_OPERATOR, _)
+        };
 
 #undef CELL2PARTICLE_OPERATOR
 #undef TEMPLATE_ARGS
 #undef NORMAL_ARGS
 
-} // namespace particleAccess
+    } // namespace particleAccess
 } // namespace picongpu
 
 #include "Cell2Particle.tpp"
diff --git a/include/picongpu/particles/access/Cell2Particle.tpp b/include/picongpu/particles/access/Cell2Particle.tpp
index d9195d9d66..cfb5657910 100644
--- a/include/picongpu/particles/access/Cell2Particle.tpp
+++ b/include/picongpu/particles/access/Cell2Particle.tpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera
+/* Copyright 2013-2021 Heiko Burau, Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -31,107 +31,78 @@
 
 namespace picongpu
 {
-namespace particleAccess
-{
-
-#define TEMPLATE_ARGS(Z, N, _) typename Arg ## N
-#define NORMAL_ARGS(Z, N, _) Arg ## N arg ## N
-#define ARGS(Z, N, _) arg ## N
+    namespace particleAccess
+    {
+#define TEMPLATE_ARGS(Z, N, _) typename Arg##N
+#define NORMAL_ARGS(Z, N, _) Arg##N arg##N
+#define ARGS(Z, N, _) arg##N
 
-#define CELL2PARTICLE_OPERATOR(Z, N, _) \
-template<typename SuperCellSize, uint32_t T_numWorkers> \
-template<typename T_Acc, typename TParticlesBox, typename CellIndex, typename Functor, typename T_Filter \
-         BOOST_PP_ENUM_TRAILING(N, TEMPLATE_ARGS, _)> \
-DINLINE void Cell2Particle<SuperCellSize, T_numWorkers>::operator() \
-(T_Acc const & acc, TParticlesBox pb, const uint32_t workerIdx, const CellIndex& cellIndex, Functor functor, T_Filter filter \
-BOOST_PP_ENUM_TRAILING(N, NORMAL_ARGS, _)) \
-{ \
-    using namespace mappings::threads; \
-    constexpr uint32_t numWorkers = T_numWorkers; \
-    constexpr lcellId_t maxParticlesInFrame = pmacc::math::CT::volume< typename TParticlesBox::FrameType::SuperCellSize >::type::value; \
-    CellIndex superCellIdx = cellIndex / (CellIndex)SuperCellSize::toRT(); \
-    \
-    using FramePtr = typename TParticlesBox::FramePtr; \
-    using Frame = typename TParticlesBox::FrameType; \
-    PMACC_SMEM( acc, frame, FramePtr ); \
-    PMACC_SMEM( acc, particlesInSuperCell, uint16_t ); \
-    ForEachIdx< \
-        IdxConfig< \
-            1, \
-            numWorkers \
-        > \
-    > onlyMaster{ workerIdx }; \
-    \
-    onlyMaster( \
-        [&]( \
-            uint32_t const, \
-            uint32_t const \
-        ) \
-        { \
-            frame = pb.getLastFrame(superCellIdx); \
-            particlesInSuperCell = pb.getSuperCell(superCellIdx).getSizeLastFrame(); \
-        } \
-    ); \
-    __syncthreads(); \
-    \
-    if (!frame.isValid()) return; /* leave kernel if we have no frames*/ \
-    \
-    auto accFilter = filter( \
-        acc, \
-        superCellIdx - GuardSize::toRT(), \
-        mappings::threads::WorkerCfg< numWorkers >{ workerIdx } \
-    ); \
-    \
-    while (frame.isValid()) \
-    { \
-        using ParticleDomCfg = IdxConfig< \
-            maxParticlesInFrame, \
-            numWorkers \
-        >; \
-        ForEachIdx< ParticleDomCfg > forEachParticle( workerIdx ); \
-        forEachParticle( \
-            [&]( \
-                uint32_t const linearThreadIdx, \
-                uint32_t const \
-            ) \
-            { \
-                if (linearThreadIdx < particlesInSuperCell) \
-                { \
-                    if( \
-                        accFilter( \
-                            acc, \
-                            frame[ linearThreadIdx ] \
-                        ) \
-                    ) \
-                        functor( \
-                            acc, \
-                            frame, linearThreadIdx \
-                            BOOST_PP_ENUM_TRAILING(N, ARGS, _) \
-                            ); \
-                } \
-            } \
-        ); \
-        __syncthreads(); \
-        onlyMaster( \
-            [&]( \
-                uint32_t const, \
-                uint32_t const \
-            ) \
-            { \
-                frame = pb.getPreviousFrame(frame); \
-                particlesInSuperCell = pmacc::math::CT::volume<SuperCellSize>::type::value; \
-            } \
-        ); \
-        __syncthreads(); \
-    } \
-}
+#define CELL2PARTICLE_OPERATOR(Z, N, _)                                                                               \
+    template<typename SuperCellSize, uint32_t T_numWorkers>                                                           \
+    template<                                                                                                         \
+        typename T_Acc,                                                                                               \
+        typename TParticlesBox,                                                                                       \
+        typename CellIndex,                                                                                           \
+        typename Functor,                                                                                             \
+        typename T_Filter BOOST_PP_ENUM_TRAILING(N, TEMPLATE_ARGS, _)>                                                \
+    DINLINE void Cell2Particle<SuperCellSize, T_numWorkers>::operator()(                                              \
+        T_Acc const& acc,                                                                                             \
+        TParticlesBox pb,                                                                                             \
+        const uint32_t workerIdx,                                                                                     \
+        const CellIndex& cellIndex,                                                                                   \
+        Functor functor,                                                                                              \
+        T_Filter filter BOOST_PP_ENUM_TRAILING(N, NORMAL_ARGS, _))                                                    \
+    {                                                                                                                 \
+        using namespace mappings::threads;                                                                            \
+        constexpr uint32_t numWorkers = T_numWorkers;                                                                 \
+        constexpr lcellId_t maxParticlesInFrame                                                                       \
+            = pmacc::math::CT::volume<typename TParticlesBox::FrameType::SuperCellSize>::type::value;                 \
+        CellIndex superCellIdx = cellIndex / (CellIndex) SuperCellSize::toRT();                                       \
+                                                                                                                      \
+        using FramePtr = typename TParticlesBox::FramePtr;                                                            \
+        using Frame = typename TParticlesBox::FrameType;                                                              \
+        PMACC_SMEM(acc, frame, FramePtr);                                                                             \
+        PMACC_SMEM(acc, particlesInSuperCell, uint16_t);                                                              \
+        ForEachIdx<IdxConfig<1, numWorkers>> onlyMaster{workerIdx};                                                   \
+                                                                                                                      \
+        onlyMaster([&](uint32_t const, uint32_t const) {                                                              \
+            frame = pb.getLastFrame(superCellIdx);                                                                    \
+            particlesInSuperCell = pb.getSuperCell(superCellIdx).getSizeLastFrame();                                  \
+        });                                                                                                           \
+        cupla::__syncthreads(acc);                                                                                    \
+                                                                                                                      \
+        if(!frame.isValid())                                                                                          \
+            return; /* leave kernel if we have no frames*/                                                            \
+                                                                                                                      \
+        auto accFilter                                                                                                \
+            = filter(acc, superCellIdx - GuardSize::toRT(), mappings::threads::WorkerCfg<numWorkers>{workerIdx});     \
+                                                                                                                      \
+        while(frame.isValid())                                                                                        \
+        {                                                                                                             \
+            using ParticleDomCfg = IdxConfig<maxParticlesInFrame, numWorkers>;                                        \
+            ForEachIdx<ParticleDomCfg> forEachParticle(workerIdx);                                                    \
+            forEachParticle([&](uint32_t const linearThreadIdx, uint32_t const) {                                     \
+                if(linearThreadIdx < particlesInSuperCell)                                                            \
+                {                                                                                                     \
+                    if(accFilter(acc, frame[linearThreadIdx]))                                                        \
+                        functor(acc, frame, linearThreadIdx BOOST_PP_ENUM_TRAILING(N, ARGS, _));                      \
+                }                                                                                                     \
+            });                                                                                                       \
+            cupla::__syncthreads(acc);                                                                                \
+            onlyMaster([&](uint32_t const, uint32_t const) {                                                          \
+                frame = pb.getPreviousFrame(frame);                                                                   \
+                particlesInSuperCell = pmacc::math::CT::volume<SuperCellSize>::type::value;                           \
+            });                                                                                                       \
+            cupla::__syncthreads(acc);                                                                                \
+        }                                                                                                             \
+    }
 
-BOOST_PP_REPEAT(5, CELL2PARTICLE_OPERATOR, _)
+        BOOST_PP_REPEAT(5, CELL2PARTICLE_OPERATOR, _)
 
 #undef CELL2PARTICLE_OPERATOR
 #undef TEMPLATE_ARGS
 #undef NORMAL_ARGS
 #undef ARGS
 
-} // namespace particleAccess
+    } // namespace particleAccess
 } // namespace picongpu
diff --git a/include/picongpu/particles/boundary/CallPluginsAndDeleteParticles.hpp b/include/picongpu/particles/boundary/CallPluginsAndDeleteParticles.hpp
index 141756e61d..882cb1e1dd 100644
--- a/include/picongpu/particles/boundary/CallPluginsAndDeleteParticles.hpp
+++ b/include/picongpu/particles/boundary/CallPluginsAndDeleteParticles.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2016-2020 Heiko Burau
+/* Copyright 2016-2021 Heiko Burau
  *
  * This file is part of PIConGPU.
  *
@@ -27,50 +27,40 @@
 
 namespace picongpu
 {
-namespace particles
-{
-namespace boundary
-{
-    /**
-     * Guard handler policy calling all registered plugins when particles
-     * leave the global simulation volume. This class serves as policy for
-     * the `ParticleDescription` template class.
-     *
-     * For each plugin the method `IPlugin::onParticleLeave()` is called.
-     * After that the guard particles are deleted.
-     */
-    struct CallPluginsAndDeleteParticles
+    namespace particles
     {
-        template< class T_Particles >
-        void
-        handleOutgoing(
-            T_Particles & particles,
-            int32_t const direction
-        ) const
+        namespace boundary
         {
-            using Plugins = std::list<pmacc::IPlugin*>;
-            Plugins plugins = Environment<>::get().PluginConnector().getAllPlugins();
-
-            for( Plugins::iterator iter = plugins.begin(); iter != plugins.end(); iter++ )
+            /**
+             * Guard handler policy calling all registered plugins when particles
+             * leave the global simulation volume. This class serves as policy for
+             * the `ParticleDescription` template class.
+             *
+             * For each plugin the method `IPlugin::onParticleLeave()` is called.
+             * After that the guard particles are deleted.
+             */
+            struct CallPluginsAndDeleteParticles
             {
-                ( *iter )->onParticleLeave(
-                    T_Particles::FrameType::getName(),
-                    direction
-                );
-            }
+                template<class T_Particles>
+                void handleOutgoing(T_Particles& particles, int32_t const direction) const
+                {
+                    using Plugins = std::list<pmacc::IPlugin*>;
+                    Plugins plugins = Environment<>::get().PluginConnector().getAllPlugins();
+
+                    for(Plugins::iterator iter = plugins.begin(); iter != plugins.end(); iter++)
+                    {
+                        (*iter)->onParticleLeave(T_Particles::FrameType::getName(), direction);
+                    }
 
-            particles.deleteGuardParticles( direction );
-        }
+                    particles.deleteGuardParticles(direction);
+                }
 
-        template< class T_Particles >
-        void
-        handleIncoming(
-            T_Particles &,
-            int32_t const
-        ) const
-        {}
-    };
+                template<class T_Particles>
+                void handleIncoming(T_Particles&, int32_t const) const
+                {
+                }
+            };
 
-} // namespace particles
-} // namespace boundary
+        } // namespace boundary
+    } // namespace particles
 } // namespace picongpu
diff --git a/include/picongpu/particles/bremsstrahlung/Bremsstrahlung.hpp b/include/picongpu/particles/bremsstrahlung/Bremsstrahlung.hpp
index 053beb4ff4..47e76bdc1e 100644
--- a/include/picongpu/particles/bremsstrahlung/Bremsstrahlung.hpp
+++ b/include/picongpu/particles/bremsstrahlung/Bremsstrahlung.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2016-2020 Heiko Burau
+/* Copyright 2016-2021 Heiko Burau
  *
  * This file is part of PIConGPU.
  *
@@ -33,155 +33,146 @@
 
 namespace picongpu
 {
-namespace particles
-{
-namespace bremsstrahlung
-{
-
-/** Handling of the Bremsstrahlung effect.
- *
- * Here the screened Bethe-Heitler cross section is used. See e.g.:
- * Salvat, F., et al. "Monte Carlo simulation of bremsstrahlung emission by electrons."
- * Radiation Physics and Chemistry 75.10 (2006): 1201-1219.
- *
- * The numerics separates the energy spectrum into two parts. In the low-energy part
- * photon emission is neglected and a drag force is applied to the electrons. In the high-energy part
- * photons are created in addition to the drag force.
- *
- * Electron deflection is treated as screened Rutherford scattering, see e.g. Jackson, chap. 13.5
- *
- * The photon emission angle is taken from the Lorentz-boosted dipole radiation formula,
- * see e.g. Jackson, chap. 15.2
- *
- * \tparam T_ElectronSpecies
- * \tparam T_PhotonSpecies
- */
-template<typename T_IonSpecies, typename T_ElectronSpecies, typename T_PhotonSpecies>
-struct Bremsstrahlung
-{
-    using IonSpecies = T_IonSpecies;
-    using ElectronSpecies = T_ElectronSpecies;
-    using PhotonSpecies = T_PhotonSpecies;
-
-    using FrameType = typename ElectronSpecies::FrameType;
-
-    /* specify field to particle interpolation scheme */
-    using Field2ParticleInterpolation = typename pmacc::traits::Resolve<
-        typename GetFlagType<FrameType,interpolation<> >::type
-    >::type;
-
-    /* margins around the supercell for the interpolation of the field on the cells */
-    using LowerMargin = typename GetMargin<Field2ParticleInterpolation>::LowerMargin;
-    using UpperMargin = typename GetMargin<Field2ParticleInterpolation>::UpperMargin;
-
-    /* relevant area of a block */
-    using BlockArea = SuperCellDescription<
-        typename MappingDesc::SuperCellSize,
-        LowerMargin,
-        UpperMargin
-    >;
-
-    BlockArea BlockDescription;
-
-    using TVec = MappingDesc::SuperCellSize;
-
-    using ValueTypeIonDensity = FieldTmp::ValueType;
-
-private:
-    /* global memory ion density field device databoxes */
-    PMACC_ALIGN(ionDensityBox, FieldTmp::DataBoxType);
-    /* shared memory ion density device databoxes */
-    PMACC_ALIGN(cachedIonDensity, DataBox<SharedBox<ValueTypeIonDensity, typename BlockArea::FullSuperCellSize, 0> >);
-
-    PMACC_ALIGN(scaledSpectrumFunctor, ScaledSpectrum::LookupTableFunctor);
-    PMACC_ALIGN(stoppingPowerFunctor, ScaledSpectrum::LookupTableFunctor);
-    PMACC_ALIGN(getPhotonAngleFunctor, GetPhotonAngle::GetPhotonAngleFunctor);
-
-    PMACC_ALIGN(photonMom, float3_X);
-
-    /* random number generator */
-    using RNGFactory = pmacc::random::RNGProvider<simDim, random::Generator>;
-    using Distribution = pmacc::random::distributions::Uniform<float_X>;
-    using RandomGen = typename RNGFactory::GetRandomType<Distribution>::type;
-    RandomGen randomGen;
-
-public:
-    /* host constructor initializing member */
-    HINLINE Bremsstrahlung(
-        const ScaledSpectrum::LookupTableFunctor& scaledSpectrumFunctor,
-        const ScaledSpectrum::LookupTableFunctor& stoppingPowerFunctor,
-        const GetPhotonAngle::GetPhotonAngleFunctor& getPhotonAngleFunctor,
-        const uint32_t currentStep);
-
-    /** Initialization function on device
-     *
-     * \brief Cache ion density field on device
-     *         and initialize possible prerequisites, like e.g. random number generator.
-     *
-     * This function will be called inline on the device which must happen BEFORE threads diverge
-     * during loop execution. The reason for this is the `__syncthreads()` call which is necessary after
-     * initializing the ion density field in shared memory.
-     */
-    template< typename T_Acc >
-    DINLINE void init(
-        T_Acc const & acc,
-        const DataSpace<simDim>& blockCell,
-        const int& linearThreadIdx,
-        const DataSpace<simDim>& localCellOffset
-    );
-
-    /** cache fields used by this functor
-     *
-     * @warning this is a collective method and calls synchronize
-     *
-     * @tparam T_Acc alpaka accelerator type
-     * @tparam T_WorkerCfg pmacc::mappings::threads::WorkerCfg, configuration of the worker
-     *
-     * @param acc alpaka accelerator
-     * @param blockCell relative offset (in cells) to the local domain plus the guarding cells
-     * @param workerCfg configuration of the worker
-     */
-    template<
-        typename T_Acc ,
-        typename T_WorkerCfg
-    >
-    DINLINE void collectiveInit(
-        const T_Acc & acc,
-        const DataSpace<simDim>& blockCell,
-        const T_WorkerCfg & workerCfg
-    );
-
-    /** Rotates a vector to a given polar angle and a random azimuthal angle.
-     *
-     * @param vec vector to be rotated
-     * @param theta polar angle
-     * @return rotated vector
-     */
-    template< typename T_Acc >
-    DINLINE float3_X scatterByTheta(const T_Acc& acc, const float3_X vec, const float_X theta);
-
-    /** Return the number of target particles to be created from each source particle.
-     *
-     * Called for each frame of the source species.
-     *
-     * @param sourceFrame Frame of the source species
-     * @param localIdx Index of the source particle within frame
-     * @return number of particle to be created from each source particle
-     */
-    template< typename T_Acc >
-    DINLINE unsigned int numNewParticles(const T_Acc& acc, FrameType& sourceFrame, int localIdx);
-
-    /** Functor implementation.
-     *
-     * Called once for each single particle creation.
-     *
-     * \tparam Electron type of electron which creates the photon
-     * \tparam Photon type of photon that is created
-     */
-    template<typename Electron, typename Photon, typename T_Acc>
-    DINLINE void operator()(const T_Acc& acc, Electron& electron, Photon& photon);
-};
-
-} // namespace bremsstrahlung
-} // namespace particles
+    namespace particles
+    {
+        namespace bremsstrahlung
+        {
+            /** Handling of the Bremsstrahlung effect.
+             *
+             * Here the screened Bethe-Heitler cross section is used. See e.g.:
+             * Salvat, F., et al. "Monte Carlo simulation of bremsstrahlung emission by electrons."
+             * Radiation Physics and Chemistry 75.10 (2006): 1201-1219.
+             *
+             * The numerics separates the energy spectrum into two parts. In the low-energy part
+             * photon emission is neglected and a drag force is applied to the electrons. In the high-energy part
+             * photons are created in addition to the drag force.
+             *
+             * Electron deflection is treated as screened Rutherford scattering, see e.g. Jackson, chap. 13.5
+             *
+             * The photon emission angle is taken from the Lorentz-boosted dipole radiation formula,
+             * see e.g. Jackson, chap. 15.2
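+             * \tparam T_IonSpecies ion species whose density is evaluated at the electron positions
+             * \tparam T_ElectronSpecies electron species that is slowed down, deflected and emits the photons
+             * \tparam T_PhotonSpecies species of the created photons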
+             */
+            template<typename T_IonSpecies, typename T_ElectronSpecies, typename T_PhotonSpecies>
+            struct Bremsstrahlung
+            {
+                using IonSpecies = T_IonSpecies;
+                using ElectronSpecies = T_ElectronSpecies;
+                using PhotonSpecies = T_PhotonSpecies;
+
+                using FrameType = typename ElectronSpecies::FrameType;
+
+                /* specify field to particle interpolation scheme */
+                using Field2ParticleInterpolation =
+                    typename pmacc::traits::Resolve<typename GetFlagType<FrameType, interpolation<>>::type>::type;
+
+                /* margins around the supercell for the interpolation of the field on the cells */
+                using LowerMargin = typename GetMargin<Field2ParticleInterpolation>::LowerMargin;
+                using UpperMargin = typename GetMargin<Field2ParticleInterpolation>::UpperMargin;
+
+                /* relevant area of a block */
+                using BlockArea = SuperCellDescription<typename MappingDesc::SuperCellSize, LowerMargin, UpperMargin>;
+
+                BlockArea BlockDescription;
+
+                using TVec = MappingDesc::SuperCellSize;
+
+                using ValueTypeIonDensity = FieldTmp::ValueType;
+
+            private:
+                /* global memory ion density field device databoxes */
+                PMACC_ALIGN(ionDensityBox, FieldTmp::DataBoxType);
+                /* shared memory ion density device databoxes */
+                PMACC_ALIGN(
+                    cachedIonDensity,
+                    DataBox<SharedBox<ValueTypeIonDensity, typename BlockArea::FullSuperCellSize, 0>>);
+
+                PMACC_ALIGN(scaledSpectrumFunctor, ScaledSpectrum::LookupTableFunctor);
+                PMACC_ALIGN(stoppingPowerFunctor, ScaledSpectrum::LookupTableFunctor);
+                PMACC_ALIGN(getPhotonAngleFunctor, GetPhotonAngle::GetPhotonAngleFunctor);
+
+                PMACC_ALIGN(photonMom, float3_X);
+
+                /* random number generator */
+                using RNGFactory = pmacc::random::RNGProvider<simDim, random::Generator>;
+                using Distribution = pmacc::random::distributions::Uniform<float_X>;
+                using RandomGen = typename RNGFactory::GetRandomType<Distribution>::type;
+                RandomGen randomGen;
+
+            public:
+                /* host constructor initializing member */
+                HINLINE Bremsstrahlung(
+                    const ScaledSpectrum::LookupTableFunctor& scaledSpectrumFunctor,
+                    const ScaledSpectrum::LookupTableFunctor& stoppingPowerFunctor,
+                    const GetPhotonAngle::GetPhotonAngleFunctor& getPhotonAngleFunctor,
+                    const uint32_t currentStep);
+
+                /** Initialization function on device
+                 *
+                 * \brief Cache ion density field on device
+                 *         and initialize possible prerequisites, e.g. the random number generator.
+                 *
+                 * This function will be called inline on the device which must happen BEFORE threads diverge
+                 * during loop execution. The reason for this is the `cupla::__syncthreads( acc )` call which is
+                 * necessary after initializing the ion density field in shared memory.
+                 */
+                template<typename T_Acc>
+                DINLINE void init(
+                    T_Acc const& acc,
+                    const DataSpace<simDim>& blockCell,
+                    const int& linearThreadIdx,
+                    const DataSpace<simDim>& localCellOffset);
+
+                /** cache fields used by this functor
+                 *
+                 * @warning this is a collective method and calls synchronize
+                 *
+                 * @tparam T_Acc alpaka accelerator type
+                 * @tparam T_WorkerCfg pmacc::mappings::threads::WorkerCfg, configuration of the worker
+                 *
+                 * @param acc alpaka accelerator
+                 * @param blockCell relative offset (in cells) to the local domain plus the guarding cells
+                 * @param workerCfg configuration of the worker
+                 */
+                template<typename T_Acc, typename T_WorkerCfg>
+                DINLINE void collectiveInit(
+                    const T_Acc& acc,
+                    const DataSpace<simDim>& blockCell,
+                    const T_WorkerCfg& workerCfg);
+
+                /** Rotates a vector to a given polar angle and a random azimuthal angle.
+                 *
+                 * @param vec vector to be rotated
+                 * @param theta polar angle
+                 * @return rotated vector
+                 */
+                template<typename T_Acc>
+                DINLINE float3_X scatterByTheta(const T_Acc& acc, const float3_X vec, const float_X theta);
+
+                /** Return the number of target particles to be created from each source particle.
+                 *
+                 * Called for each frame of the source species.
+                 *
+                 * @param sourceFrame Frame of the source species
+                 * @param localIdx Index of the source particle within frame
+                 * @return number of particles to be created from each source particle
+                 */
+                template<typename T_Acc>
+                DINLINE unsigned int numNewParticles(const T_Acc& acc, FrameType& sourceFrame, int localIdx);
+
+                /** Functor implementation.
+                 *
+                 * Called once for each single particle creation.
+                 *
+                 * \tparam Electron type of electron which creates the photon
+                 * \tparam Photon type of photon that is created
+                 */
+                template<typename Electron, typename Photon, typename T_Acc>
+                DINLINE void operator()(const T_Acc& acc, Electron& electron, Photon& photon);
+            };
+
+        } // namespace bremsstrahlung
+    } // namespace particles
 } // namespace picongpu
diff --git a/include/picongpu/particles/bremsstrahlung/Bremsstrahlung.tpp b/include/picongpu/particles/bremsstrahlung/Bremsstrahlung.tpp
index 6c472071fa..64584f9654 100644
--- a/include/picongpu/particles/bremsstrahlung/Bremsstrahlung.tpp
+++ b/include/picongpu/particles/bremsstrahlung/Bremsstrahlung.tpp
@@ -1,4 +1,4 @@
-/* Copyright 2016-2020 Heiko Burau
+/* Copyright 2016-2021 Heiko Burau
  *
  * This file is part of PIConGPU.
  *
@@ -30,7 +30,6 @@
 #include "picongpu/particles/traits/GetAtomicNumbers.hpp"
 
 #include <pmacc/dataManagement/DataConnector.hpp>
-#include <pmacc/algorithms/math/defines/sqrt.hpp>
 #include <pmacc/algorithms/math/defines/dot.hpp>
 #include <pmacc/algorithms/math/defines/cross.hpp>
 #include <pmacc/algorithms/math/defines/pi.hpp>
@@ -39,278 +38,244 @@
 
 namespace picongpu
 {
-namespace particles
-{
-namespace bremsstrahlung
-{
-
-template<typename T_IonSpecies, typename T_ElectronSpecies, typename T_PhotonSpecies>
-Bremsstrahlung<T_IonSpecies, T_ElectronSpecies, T_PhotonSpecies>::Bremsstrahlung(
-    const ScaledSpectrum::LookupTableFunctor& scaledSpectrumFunctor,
-    const ScaledSpectrum::LookupTableFunctor& stoppingPowerFunctor,
-    const GetPhotonAngle::GetPhotonAngleFunctor& getPhotonAngleFunctor,
-    const uint32_t currentStep)
-        : scaledSpectrumFunctor(scaledSpectrumFunctor),
-          stoppingPowerFunctor(stoppingPowerFunctor),
-          getPhotonAngleFunctor(getPhotonAngleFunctor),
-          photonMom(float3_X::create(0)),
-          randomGen(RNGFactory::createRandom<Distribution>())
-{
-    DataConnector &dc = Environment<>::get().DataConnector();
-
-    /* initialize pointers on host-side tmp-field databoxes */
-    auto fieldIonDensity = dc.get< FieldTmp >( FieldTmp::getUniqueId( 0 ), true );
-    /* reset values to zero */
-    fieldIonDensity->getGridBuffer().getDeviceBuffer().setValue(FieldTmp::ValueType(0.0));
-
-    /* load species without copying the particle data to the host */
-    auto ionSpecies = dc.get< T_IonSpecies >( T_IonSpecies::FrameType::getName(), true );
-
-    /* compute ion density */
-    using DensitySolver = typename particleToGrid::CreateFieldTmpOperation<
-        T_IonSpecies,
-        particleToGrid::derivedAttributes::Density
-    >::type::Solver;
-    fieldIonDensity->template computeValue< CORE + BORDER, DensitySolver >(*ionSpecies, currentStep);
-    dc.releaseData(T_IonSpecies::FrameType::getName());
-
-    /* initialize device-side tmp-field databoxes */
-    this->ionDensityBox = fieldIonDensity->getDeviceDataBox();
-}
-
-template<
-    typename T_IonSpecies,
-    typename T_ElectronSpecies,
-    typename T_PhotonSpecies
->
-template<
-    typename T_Acc,
-    typename T_WorkerCfg
->
-DINLINE void Bremsstrahlung<T_IonSpecies, T_ElectronSpecies, T_PhotonSpecies>::collectiveInit(
-    const T_Acc & acc,
-    const DataSpace<simDim>& blockCell,
-    const T_WorkerCfg & workerCfg
-)
-{
-    /* caching of ion density field */
-    cachedIonDensity = CachedBox::create<
-        0,
-        ValueTypeIonDensity
-    >(
-        acc,
-        BlockArea()
-    );
-
-    /* instance of nvidia assignment operator */
-    nvidia::functors::Assign assign;
-    /* copy fields from global to shared */
-    const auto fieldIonDensityBlock = ionDensityBox.shift(blockCell);
-
-    ThreadCollective<
-        BlockArea,
-        T_WorkerCfg::numWorkers
-    > collective( workerCfg.getWorkerIdx( ) );
-    collective(
-              acc,
-              assign,
-              cachedIonDensity,
-              fieldIonDensityBlock
-              );
-
-    /* wait for shared memory to be initialized */
-    __syncthreads();
-}
-
-template<
-    typename T_IonSpecies,
-    typename T_ElectronSpecies,
-    typename T_PhotonSpecies
->
-template< typename T_Acc >
-DINLINE
-void Bremsstrahlung<T_IonSpecies, T_ElectronSpecies, T_PhotonSpecies>::init(
-    T_Acc const & acc,
-    const DataSpace<simDim>& blockCell,
-    const int& linearThreadIdx,
-    const DataSpace<simDim>& localCellOffset
-)
-{
-    /* initialize random number generator with the local cell index in the simulation */
-    this->randomGen.init(localCellOffset);
-}
-
-
-template<typename T_IonSpecies, typename T_ElectronSpecies, typename T_PhotonSpecies>
-template<typename T_Acc>
-DINLINE
-float3_X Bremsstrahlung<T_IonSpecies, T_ElectronSpecies, T_PhotonSpecies>::scatterByTheta
-    (const T_Acc & acc, const float3_X vec, const float_X theta)
-{
-    using namespace pmacc::algorithms;
-
-    float_X sinTheta, cosTheta;
-    math::sincos(theta, sinTheta, cosTheta);
-
-    const float_X phi = -math::Pi<float_X>::value + math::Pi<float_X>::doubleValue * this->randomGen(acc);
-    float_X sinPhi, cosPhi;
-    math::sincos(phi, sinPhi, cosPhi);
-
-    const float3_X vecUp(0.0, 0.0, 1.0);
-    float3_X vecOrtho1 = math::cross(vecUp, vec);
-    const float_X vecOrtho1Abs = math::abs(vecOrtho1);
-
-    float3_X vecOrtho1_norm;
-    if(vecOrtho1Abs == float_X(0.0))
-        vecOrtho1_norm = float3_X(1.0, 0.0, 0.0);
-    else
-        vecOrtho1_norm = vecOrtho1 / vecOrtho1Abs;
-    const float3_X vecOrtho2 = math::cross(vecOrtho1_norm, vec);
-    vecOrtho1 = vecOrtho1_norm * math::abs(vec);
-
-    return vec * cosTheta +
-           vecOrtho1 * (sinTheta * cosPhi) +
-           vecOrtho2 * (sinTheta * sinPhi);
-}
-
-template<typename T_IonSpecies, typename T_ElectronSpecies, typename T_PhotonSpecies>
-template<typename T_Acc>
-DINLINE
-unsigned int Bremsstrahlung<T_IonSpecies, T_ElectronSpecies, T_PhotonSpecies>::numNewParticles(
-    const T_Acc& acc,
-    FrameType& sourceFrame,
-    int localIdx
-)
-{
-    using namespace pmacc::algorithms;
-
-    auto particle = sourceFrame[localIdx];
-
-    /* particle position, used for field-to-particle interpolation */
-    const floatD_X pos = particle[position_];
-    const int particleCellIdx = particle[localCellIdx_];
-    /* multi-dim coordinate of the local cell inside the super cell */
-    const DataSpace<TVec::dim> localCell(DataSpaceOperations<TVec::dim>::template map<TVec > (particleCellIdx));
-    /* interpolation of fieldTmp */
-    const picongpu::traits::FieldPosition<fields::CellType, FieldTmp, simDim> fieldTmpPos;
-    const ValueTypeIonDensity ionDensity_norm = Field2ParticleInterpolation()
-        (cachedIonDensity.shift(localCell).toCursor(), pos, fieldTmpPos());
-
-    /* TODO: obtain the ion density from the molare ion density in order to avoid the rescaling.
-     * So this should be: ionDensity = ionMolDensity / UNIT_AMOUNT_SUBSTANCE */
-    const float_X ionDensity = ionDensity_norm.x() * particles::TYPICAL_NUM_PARTICLES_PER_MACROPARTICLE;
-
-    const float_X weighting = particle[weighting_];
-
-    const float_X c = SPEED_OF_LIGHT;
-    float3_X mom = particle[momentum_] / weighting;
-    const float_X momAbs = math::abs(mom);
-    float3_X mom_norm = mom / momAbs;
-
-    const float_X mass = frame::getMass<FrameType>();
-    const float_X Ekin = (Gamma<>()(mom, mass) - float_X(1.0)) * mass * c*c;
-    if(Ekin < electron::MIN_ENERGY)
-        return 0;
-
-    /* electron deflection due to Rutherford scattering without modifying the electron
-       energy based on radiation emission */
-    const float_X zMin = float_X(1.0) / (math::Pi<float_X>::value * math::Pi<float_X>::value);
-    const float_X zMax = float_X(1.0) / (electron::MIN_THETA*electron::MIN_THETA);
-    const float_X z = zMin + this->randomGen(acc) * (zMax - zMin);
-    const float_X theta = math::rsqrt(z);
-    const float_X targetZ = GetAtomicNumbers<T_IonSpecies>::type::numberOfProtons;
-    const float_X rutherfordCoeff = float_X(2.0) * ELECTRON_CHARGE*ELECTRON_CHARGE /
-        (float_X(4.0) * math::Pi<float_X>::value * EPS0) * targetZ / Ekin;
-    const float_X scaledDeflectionDCS = math::Pi<float_X>::value * (zMax - zMin) * rutherfordCoeff*rutherfordCoeff;
-    const float_X deflectionProb = ionDensity * c * DELTA_T * scaledDeflectionDCS;
-
-    if(this->randomGen(acc) < deflectionProb)
-    {
-        mom = this->scatterByTheta(acc, mom, theta);
-        mom_norm = mom / momAbs;
-    }
-
-    /* non-radiative Bremsstrahlung */
-    const float_X kappaCutoff = math::min(photon::SOFT_PHOTONS_CUTOFF / Ekin, float_X(1.0));
-    const float_X stoppingPower = ionDensity * c * this->stoppingPowerFunctor(Ekin, kappaCutoff);
-    const float_X newEkin = math::max(Ekin - stoppingPower * DELTA_T, float_X(0.0));
-    const float_X newEkin_norm = newEkin / (mass * c*c);
-    /* This is based on: (p / mc)^2 = (E_kin / mc^2)^2 + 2 * (E_kin / mc^2) */
-    const float_X newMomAbs = mass * c * math::sqrt(newEkin_norm*newEkin_norm + float_X(2.0) * newEkin_norm);
-    const float_X deltaMom = newMomAbs - momAbs;
-    particle[momentum_] = (mom + deltaMom * mom_norm) * weighting;
-
-    /* photon emission */
-    const float_X delta = this->randomGen(acc);
-    const float_X kappa = math::pow(kappaCutoff, delta);
-    const float_X scalingFactor = -math::log(kappaCutoff);
-    const float_X emissionProb = photon::WEIGHTING_RATIO * scalingFactor * ionDensity * c * DELTA_T * this->scaledSpectrumFunctor(Ekin, kappa);
-
-    // raise a warning if the emission probability is too high.
-    if(picLog::log_level & picLog::CRITICAL::lvl)
+    namespace particles
     {
-        if(emissionProb > float_X(photon::SINGLE_EMISSION_PROB_LIMIT))
+        namespace bremsstrahlung
         {
-            const float_X Ekin_SI = Ekin * UNIT_ENERGY;
-            printf("[Bremsstrahlung] warning: emission probability is too high: \
+            template<typename T_IonSpecies, typename T_ElectronSpecies, typename T_PhotonSpecies>
+            Bremsstrahlung<T_IonSpecies, T_ElectronSpecies, T_PhotonSpecies>::Bremsstrahlung(
+                const ScaledSpectrum::LookupTableFunctor& scaledSpectrumFunctor,
+                const ScaledSpectrum::LookupTableFunctor& stoppingPowerFunctor,
+                const GetPhotonAngle::GetPhotonAngleFunctor& getPhotonAngleFunctor,
+                const uint32_t currentStep)
+                : scaledSpectrumFunctor(scaledSpectrumFunctor)
+                , stoppingPowerFunctor(stoppingPowerFunctor)
+                , getPhotonAngleFunctor(getPhotonAngleFunctor)
+                , photonMom(float3_X::create(0))
+                , randomGen(RNGFactory::createRandom<Distribution>())
+            {
+                DataConnector& dc = Environment<>::get().DataConnector();
+
+                /* initialize pointers on host-side tmp-field databoxes */
+                auto fieldIonDensity = dc.get<FieldTmp>(FieldTmp::getUniqueId(0), true);
+                /* reset values to zero */
+                fieldIonDensity->getGridBuffer().getDeviceBuffer().setValue(FieldTmp::ValueType(0.0));
+
+                /* load species without copying the particle data to the host */
+                auto ionSpecies = dc.get<T_IonSpecies>(T_IonSpecies::FrameType::getName(), true);
+
+                /* compute ion density */
+                using DensitySolver = typename particleToGrid::
+                    CreateFieldTmpOperation<T_IonSpecies, particleToGrid::derivedAttributes::Density>::type::Solver;
+                fieldIonDensity->template computeValue<CORE + BORDER, DensitySolver>(*ionSpecies, currentStep);
+                dc.releaseData(T_IonSpecies::FrameType::getName());
+
+                /* initialize device-side tmp-field databoxes */
+                this->ionDensityBox = fieldIonDensity->getDeviceDataBox();
+            }
+
+            template<typename T_IonSpecies, typename T_ElectronSpecies, typename T_PhotonSpecies>
+            template<typename T_Acc, typename T_WorkerCfg>
+            DINLINE void Bremsstrahlung<T_IonSpecies, T_ElectronSpecies, T_PhotonSpecies>::collectiveInit(
+                const T_Acc& acc,
+                const DataSpace<simDim>& blockCell,
+                const T_WorkerCfg& workerCfg)
+            {
+                /* caching of ion density field */
+                cachedIonDensity = CachedBox::create<0, ValueTypeIonDensity>(acc, BlockArea());
+
+                /* instance of nvidia assignment operator */
+                nvidia::functors::Assign assign;
+                /* copy fields from global to shared */
+                const auto fieldIonDensityBlock = ionDensityBox.shift(blockCell);
+
+                ThreadCollective<BlockArea, T_WorkerCfg::numWorkers> collective(workerCfg.getWorkerIdx());
+                collective(acc, assign, cachedIonDensity, fieldIonDensityBlock);
+
+                /* wait for shared memory to be initialized */
+                cupla::__syncthreads(acc);
+            }
+
+            template<typename T_IonSpecies, typename T_ElectronSpecies, typename T_PhotonSpecies>
+            template<typename T_Acc>
+            DINLINE void Bremsstrahlung<T_IonSpecies, T_ElectronSpecies, T_PhotonSpecies>::init(
+                T_Acc const& acc,
+                const DataSpace<simDim>& blockCell,
+                const int& linearThreadIdx,
+                const DataSpace<simDim>& localCellOffset)
+            {
+                /* initialize random number generator with the local cell index in the simulation */
+                this->randomGen.init(localCellOffset);
+            }
+
+
+            template<typename T_IonSpecies, typename T_ElectronSpecies, typename T_PhotonSpecies>
+            template<typename T_Acc>
+            DINLINE float3_X Bremsstrahlung<T_IonSpecies, T_ElectronSpecies, T_PhotonSpecies>::scatterByTheta(
+                const T_Acc& acc,
+                const float3_X vec,
+                const float_X theta)
+            {
+                using namespace pmacc::algorithms;
+
+                float_X sinTheta, cosTheta;
+                pmacc::math::sincos(theta, sinTheta, cosTheta);
+
+                const float_X phi
+                    = -pmacc::math::Pi<float_X>::value + pmacc::math::Pi<float_X>::doubleValue * this->randomGen(acc);
+                float_X sinPhi, cosPhi;
+                pmacc::math::sincos(phi, sinPhi, cosPhi);
+
+                const float3_X vecUp(0.0, 0.0, 1.0);
+                float3_X vecOrtho1 = pmacc::math::cross(vecUp, vec);
+                const float_X vecOrtho1Abs = math::abs(vecOrtho1);
+
+                float3_X vecOrtho1_norm;
+                if(vecOrtho1Abs == float_X(0.0))
+                    vecOrtho1_norm = float3_X(1.0, 0.0, 0.0);
+                else
+                    vecOrtho1_norm = vecOrtho1 / vecOrtho1Abs;
+                const float3_X vecOrtho2 = pmacc::math::cross(vecOrtho1_norm, vec);
+                vecOrtho1 = vecOrtho1_norm * math::abs(vec);
+
+                return vec * cosTheta + vecOrtho1 * (sinTheta * cosPhi) + vecOrtho2 * (sinTheta * sinPhi);
+            }
+
+            template<typename T_IonSpecies, typename T_ElectronSpecies, typename T_PhotonSpecies>
+            template<typename T_Acc>
+            DINLINE unsigned int Bremsstrahlung<T_IonSpecies, T_ElectronSpecies, T_PhotonSpecies>::numNewParticles(
+                const T_Acc& acc,
+                FrameType& sourceFrame,
+                int localIdx)
+            {
+                using namespace pmacc::algorithms;
+                /* May update the momentum of sourceFrame[localIdx]; returns the number of new particles (0 or 1). */
+                auto particle = sourceFrame[localIdx];
+
+                /* particle position, used for field-to-particle interpolation */
+                const floatD_X pos = particle[position_];
+                const int particleCellIdx = particle[localCellIdx_];
+                /* multi-dim coordinate of the local cell inside the super cell */
+                const DataSpace<TVec::dim> localCell(
+                    DataSpaceOperations<TVec::dim>::template map<TVec>(particleCellIdx));
+                /* interpolation of fieldTmp */
+                const picongpu::traits::FieldPosition<fields::CellType, FieldTmp, simDim> fieldTmpPos;
+                const ValueTypeIonDensity ionDensity_norm
+                    = Field2ParticleInterpolation()(cachedIonDensity.shift(localCell).toCursor(), pos, fieldTmpPos());
+
+                /* TODO: obtain the ion density from the molar ion density in order to avoid the rescaling.
+                 * So this should be: ionDensity = ionMolDensity / UNIT_AMOUNT_SUBSTANCE */
+                const float_X ionDensity = ionDensity_norm.x() * particles::TYPICAL_NUM_PARTICLES_PER_MACROPARTICLE;
+
+                const float_X weighting = particle[weighting_];
+
+                const float_X c = SPEED_OF_LIGHT;
+                float3_X mom = particle[momentum_] / weighting;
+                const float_X momAbs = math::abs(mom);
+                float3_X mom_norm = mom / momAbs;
+
+                const float_X mass = frame::getMass<FrameType>();
+                const float_X Ekin = (Gamma<>()(mom, mass) - float_X(1.0)) * mass * c * c;
+                if(Ekin < electron::MIN_ENERGY)
+                    return 0; // below the energy threshold the particle is left untouched
+
+                /* electron deflection due to Rutherford scattering without modifying the electron energy based on
+                   radiation emission; z is drawn from [1/Pi^2, 1/MIN_THETA^2] and theta = 1/sqrt(z) */
+                const float_X zMin
+                    = float_X(1.0) / (pmacc::math::Pi<float_X>::value * pmacc::math::Pi<float_X>::value);
+                const float_X zMax = float_X(1.0) / (electron::MIN_THETA * electron::MIN_THETA);
+                const float_X z = zMin + this->randomGen(acc) * (zMax - zMin);
+                const float_X theta = math::rsqrt(z);
+                const float_X targetZ = GetAtomicNumbers<T_IonSpecies>::type::numberOfProtons;
+                const float_X rutherfordCoeff = float_X(2.0) * ELECTRON_CHARGE * ELECTRON_CHARGE
+                    / (float_X(4.0) * pmacc::math::Pi<float_X>::value * EPS0) * targetZ / Ekin;
+                const float_X scaledDeflectionDCS
+                    = pmacc::math::Pi<float_X>::value * (zMax - zMin) * rutherfordCoeff * rutherfordCoeff;
+                const float_X deflectionProb = ionDensity * c * DELTA_T * scaledDeflectionDCS;
+
+                if(this->randomGen(acc) < deflectionProb)
+                {
+                    mom = this->scatterByTheta(acc, mom, theta);
+                    mom_norm = mom / momAbs; // momAbs was computed before the scatter
+                }
+
+                /* non-radiative Bremsstrahlung */
+                const float_X kappaCutoff = math::min(photon::SOFT_PHOTONS_CUTOFF / Ekin, float_X(1.0));
+                const float_X stoppingPower = ionDensity * c * this->stoppingPowerFunctor(Ekin, kappaCutoff);
+                const float_X newEkin = math::max(Ekin - stoppingPower * DELTA_T, float_X(0.0));
+                const float_X newEkin_norm = newEkin / (mass * c * c);
+                /* This is based on: (p / mc)^2 = (E_kin / mc^2)^2 + 2 * (E_kin / mc^2) */
+                const float_X newMomAbs
+                    = mass * c * math::sqrt(newEkin_norm * newEkin_norm + float_X(2.0) * newEkin_norm);
+                const float_X deltaMom = newMomAbs - momAbs;
+                particle[momentum_] = (mom + deltaMom * mom_norm) * weighting;
+
+                /* photon emission: kappa = kappaCutoff^delta lies in [kappaCutoff, 1] for delta in [0, 1] */
+                const float_X delta = this->randomGen(acc);
+                const float_X kappa = math::pow(kappaCutoff, delta);
+                const float_X scalingFactor = -math::log(kappaCutoff);
+                const float_X emissionProb = photon::WEIGHTING_RATIO * scalingFactor * ionDensity * c * DELTA_T
+                    * this->scaledSpectrumFunctor(Ekin, kappa);
+
+                // warn (only if the CRITICAL log level bit is set) when the emission probability is too high.
+                if(picLog::log_level & picLog::CRITICAL::lvl)
+                {
+                    if(emissionProb > float_X(photon::SINGLE_EMISSION_PROB_LIMIT))
+                    {
+                        const float_X Ekin_SI = Ekin * UNIT_ENERGY;
+                        printf(
+                            "[Bremsstrahlung] warning: emission probability is too high: \
                     p = %g, at Ekin = %g keV, kappa = %g, ion density = %g m^-3\n",
-                    emissionProb,
-                    Ekin_SI * UNITCONV_Joule_to_keV,
-                    kappa,
-                    ionDensity / (UNIT_LENGTH*UNIT_LENGTH*UNIT_LENGTH));
-        }
-    }
-
-    if(this->randomGen(acc) < emissionProb)
-    {
-        const float_X photonEnergy = kappa * Ekin;
-        this->photonMom = mom_norm * weighting / photon::WEIGHTING_RATIO * photonEnergy / c;
-        return 1;
-    }
-
-    return 0;
-}
-
-
-template<typename T_IonSpecies, typename T_ElectronSpecies, typename T_PhotonSpecies>
-template<typename Electron, typename Photon, typename T_Acc>
-DINLINE
-void Bremsstrahlung<T_IonSpecies, T_ElectronSpecies, T_PhotonSpecies>::operator()(
-    const T_Acc& acc,
-    Electron& electron,
-    Photon& photon
-)
-{
-    auto destPhoton =
-        pmacc::particles::operations::deselect<
-            boost::mpl::vector<
-                multiMask,
-                momentum,
-                weighting
-            >
-        >(photon);
-
-    namespace parOp = pmacc::particles::operations;
-    parOp::assign( destPhoton, parOp::deselect<particleId>(electron) );
-
-    const float3_X elMom = electron[momentum_];
-    const float_X weighting = electron[weighting_] / photon::WEIGHTING_RATIO;
-    electron[momentum_] = elMom - this->photonMom; // ultra relativistic limit in terms of energy
-
-    /* photon emission angle */
-    const float_X mass = frame::getMass<FrameType>();
-    const float_X gamma = Gamma<>()(elMom / weighting, mass);
-
-    const float_X theta = this->getPhotonAngleFunctor(this->randomGen(acc), gamma);
-
-    const float3_X scatteredPhotonMom = this->scatterByTheta(acc, this->photonMom, theta);
-
-    photon[multiMask_] = 1;
-    photon[momentum_] = scatteredPhotonMom;
-    photon[weighting_] = weighting;
-}
-
-
-} // namespace bremsstrahlung
-} // namespace particles
+                            emissionProb,
+                            Ekin_SI * UNITCONV_Joule_to_keV,
+                            kappa,
+                            ionDensity / (UNIT_LENGTH * UNIT_LENGTH * UNIT_LENGTH));
+                    }
+                }
+                /* emission draw: on success store the photon momentum for operator() and request one particle */
+                if(this->randomGen(acc) < emissionProb)
+                {
+                    const float_X photonEnergy = kappa * Ekin;
+                    this->photonMom = mom_norm * weighting / photon::WEIGHTING_RATIO * photonEnergy / c;
+                    return 1;
+                }
+
+                return 0;
+            }
+
+            /** Initialize the new photon from the emitting electron and subtract the photon momentum from it. */
+            template<typename T_IonSpecies, typename T_ElectronSpecies, typename T_PhotonSpecies>
+            template<typename Electron, typename Photon, typename T_Acc>
+            DINLINE void Bremsstrahlung<T_IonSpecies, T_ElectronSpecies, T_PhotonSpecies>::operator()(
+                const T_Acc& acc,
+                Electron& electron,
+                Photon& photon)
+            {
+                auto destPhoton
+                    = pmacc::particles::operations::deselect<boost::mpl::vector<multiMask, momentum, weighting>>(
+                        photon);
+
+                namespace parOp = pmacc::particles::operations;
+                parOp::assign(destPhoton, parOp::deselect<particleId>(electron));
+
+                const float3_X elMom = electron[momentum_];
+                const float_X electronWeighting = electron[weighting_];
+                const float_X photonWeighting = electronWeighting / photon::WEIGHTING_RATIO;
+                electron[momentum_] = elMom - this->photonMom; // ultra relativistic limit in terms of energy
+                /* photon emission angle: gamma is per electron, so elMom is divided by the electron weighting */
+                const float_X mass = frame::getMass<FrameType>();
+                const float_X gamma = Gamma<>()(elMom / electronWeighting, mass);
+
+                const float_X theta = this->getPhotonAngleFunctor(this->randomGen(acc), gamma);
+
+                const float3_X scatteredPhotonMom = this->scatterByTheta(acc, this->photonMom, theta);
+
+                photon[multiMask_] = 1;
+                photon[momentum_] = scatteredPhotonMom;
+                photon[weighting_] = photonWeighting;
+            }
+
+
+        } // namespace bremsstrahlung
+    } // namespace particles
 } // namespace picongpu
diff --git a/include/picongpu/particles/bremsstrahlung/PhotonEmissionAngle.hpp b/include/picongpu/particles/bremsstrahlung/PhotonEmissionAngle.hpp
index b51daba6f2..527abe9a09 100644
--- a/include/picongpu/particles/bremsstrahlung/PhotonEmissionAngle.hpp
+++ b/include/picongpu/particles/bremsstrahlung/PhotonEmissionAngle.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2016-2020 Heiko Burau
+/* Copyright 2016-2021 Heiko Burau
  *
  * This file is part of PIConGPU.
  *
@@ -27,14 +27,14 @@
 #include <pmacc/algorithms/math.hpp>
 #include <boost/array.hpp>
 #include <boost/shared_ptr.hpp>
-#if( BOOST_VERSION == 106400 )
-    /* `array_wrapper.hpp` must be included before `integrate.hpp` to avoid
-     * the error
-     * `boost/numeric/ublas/matrix.hpp(5977): error: namespace "boost::serialization" has no member "make_array"`
-     * in boost 1.64.0
-     * see boost issue https://svn.boost.org/trac/boost/ticket/12516
-     */
-#   include <boost/serialization/array_wrapper.hpp>
+#if(BOOST_VERSION == 106400)
+/* `array_wrapper.hpp` must be included before `integrate.hpp` to avoid
+ * the error
+ * `boost/numeric/ublas/matrix.hpp(5977): error: namespace "boost::serialization" has no member "make_array"`
+ * in boost 1.64.0
+ * see boost issue https://svn.boost.org/trac/boost/ticket/12516
+ */
+#    include <boost/serialization/array_wrapper.hpp>
 #endif
 #include <boost/numeric/odeint/integrate/integrate.hpp>
 #include <boost/math/tools/minima.hpp>
@@ -43,230 +43,227 @@
 
 namespace picongpu
 {
-namespace particles
-{
-namespace bremsstrahlung
-{
-
-namespace detail
-{
-
-/** Functor mapping `delta` to the photon emission polar angle `theta`,
- * where delta is a uniformly distributed random number between zero and one.
- */
-struct GetPhotonAngleFunctor
-{
-    using LinInterpCursor = typename ::pmacc::result_of::Functor<
-        ::pmacc::cursor::tools::LinearInterp<float_X>,
-        ::pmacc::cursor::BufferCursor<float_X, DIM2>
-    >::type;
-
-    using type = float_X;
-
-    LinInterpCursor linInterpCursor;
-    float_X lnMinGamma;
-    float_X lnMaxGamma;
-
-    /** constructor
-     *
-     * @param linInterpCursor lookup table for the photon emission angle.
-     */
-    HDINLINE GetPhotonAngleFunctor(LinInterpCursor linInterpCursor)
-        : linInterpCursor(linInterpCursor)
-    {
-        this->lnMinGamma = math::log(photon::MIN_GAMMA);
-        this->lnMaxGamma = math::log(photon::MAX_GAMMA);
-    }
-
-    /** Return the polar emission angle of the photon.
-     *
-     * @param delta uniformly distributed random number between zero and one.
-     * @param gamma relativistic factor of the incident electron.
-     */
-    HDINLINE float_X operator()(const float_X delta, const float_X gamma) const
+    namespace particles
     {
-        const float_X deltaLookupPos = delta * static_cast<float_64>(photon::NUM_SAMPLES_DELTA - 1);
-
-        const float_X lnGamma = algorithms::math::log(gamma);
-        const float_X gammaLookupPos =
-            (lnGamma - this->lnMinGamma) /
-            (this->lnMaxGamma - this->lnMinGamma) *
-            static_cast<float_X>(photon::NUM_SAMPLES_GAMMA - 1);
-
-        if (picLog::log_level & picLog::CRITICAL::lvl)
+        namespace bremsstrahlung
         {
-            if(gamma > photon::MAX_GAMMA)
+            namespace detail
             {
-                printf("[Bremsstrahlung] error lookup table: gamma = %g is out of range.\n",
-                       gamma);
-            }
-        }
-
-        return this->linInterpCursor[float2_X(deltaLookupPos, gammaLookupPos)];
-    }
-};
-
-} // namespace detail
-
-/** Creates and holds the lookup table for the photon emission angle.
- */
-struct GetPhotonAngle
-{
-    using GetPhotonAngleFunctor = detail::GetPhotonAngleFunctor;
-
-private:
-
-    using MyBuf = boost::shared_ptr<pmacc::container::DeviceBuffer<float_X, DIM2> >;
-    MyBuf dBufTheta;
-
-    /** probability density at polar angle theta.
-     * It's the ultrarelativistic limit of the dipole radiation formula, see e.g. Jackson, chap. 15.2
-     */
-    struct Probability
-    {
-        const float_64 gamma2;
-        Probability(const float_64 gamma) : gamma2(gamma*gamma) {}
-
-        template<typename T_State>
-        void operator()(const T_State &p, T_State &dpdtheta, const float_64 theta) const
-        {
-            const float_64 theta2 = theta*theta;
-            const float_64 denom = float_64(1.0) + gamma2 * theta2;
-
-            dpdtheta[0] = float_64(3.0) * theta * gamma2 * (float_64(1.0) + gamma2*gamma2 * theta2*theta2) /
-                          (denom*denom*denom*denom);
-        }
-    };
-
-    /** Return the absolute deviation of a delta, computed from a given theta, and a reference delta.
-     *
-     * Delta is the angular emission probability (normalized to one) integrated from zero to theta,
-     * where theta is the angle between the photon momentum and the final electron momentum.
-     */
-    struct AimForDelta
-    {
-        const float_64 targetDelta;
-        const float_64 gamma;
-
-        /** constructor
-         *
-         * @param targetDelta reference delta
-         * @param gamma relativistic factor
-         */
-        AimForDelta(const float_64 targetDelta, const float_64 gamma) :
-            targetDelta(targetDelta), gamma(gamma) {}
-
-        float_64 delta(const float_64 theta, const float_64 gamma) const
-        {
-            namespace odeint = boost::numeric::odeint;
-
-            using state_type = boost::array<float_64, 1>;
-
-            state_type integral_result = {0.0};
-            const float_64 lowerLimit = 0.0;
-            const float_64 upperLimit = theta;
-            const float_64 stepwidth = (upperLimit - lowerLimit) / float_64(1000.0);
-            Probability integrand(gamma);
-            odeint::integrate(integrand, integral_result, lowerLimit, upperLimit, stepwidth);
-
-            return integral_result[0];
-        }
-
-        float_64 operator()(const float_64 theta) const
-        {
-            return math::abs(this->delta(theta, this->gamma) - this->targetDelta);
-        }
-    };
-
-    /** Return the maximal theta which corresponds to the maximal delta and a given gamma
-     *
-     * @param gamma relativistic factor
-     */
-    float_64 maxTheta(const float_64 gamma) const
-    {
-        AimForDelta aimForDelta(photon::MAX_DELTA, gamma);
-
-        std::pair<float_64, float_64> minimum;
-
-        minimum = boost::math::tools::brent_find_minima(
-            aimForDelta,
-            0.0,
-            pmacc::algorithms::math::Pi<float_64>::value,
-            std::numeric_limits<float_64>::digits);
-
-        return minimum.first;
-    }
-
-    /** computes the polar emission angle theta.
-     *
-     * @param delta uniformly distributed random number within [0, 1] or (0, 1)
-     * @param gamma relativistic factor
-     * @param maxTheta maximal theta
-     */
-    float_64 theta(const float_64 delta, const float_64 gamma, const float_64 maxTheta) const
-    {
-        AimForDelta aimForDelta(delta, gamma);
-        const float_64 minTheta = 0.0;
-        std::pair<float_64, float_64> minimum;
-
-        minimum = boost::math::tools::brent_find_minima(
-            aimForDelta,
-            minTheta,
-            maxTheta,
-            std::numeric_limits<float_64>::digits);
-
-        return minimum.first;
-    }
-
-public:
-
-    /** Generate lookup table
-     */
-    void init()
-    {
-        // there is a margin of one cell to make the linear interpolation valid for border cells.
-        this->dBufTheta = MyBuf(new pmacc::container::DeviceBuffer<float_X, DIM2>(
-            photon::NUM_SAMPLES_DELTA + 1,
-            photon::NUM_SAMPLES_GAMMA + 1));
-
-        pmacc::container::HostBuffer<float_X, DIM2> hBufTheta(this->dBufTheta->size());
-        hBufTheta.assign(float_X(0.0));
-        auto curTheta = hBufTheta.origin();
-
-        const float_64 lnMinGamma = math::log(photon::MIN_GAMMA);
-        const float_64 lnMaxGamma = math::log(photon::MAX_GAMMA);
-
-        for(uint32_t gammaIdx = 0; gammaIdx < photon::NUM_SAMPLES_GAMMA; gammaIdx++)
-        {
-            const float_64 lnGamma_norm = static_cast<float_64>(gammaIdx) /
-                                          static_cast<float_64>(photon::NUM_SAMPLES_GAMMA - 1);
-            const float_64 gamma = math::exp(lnMinGamma + (lnMaxGamma - lnMinGamma) * lnGamma_norm);
-            const float_64 maxTheta = this->maxTheta(gamma);
-
-            for(uint32_t deltaIdx = 0; deltaIdx < photon::NUM_SAMPLES_DELTA; deltaIdx++)
+                /** Functor mapping `delta` to the photon emission polar angle `theta`,
+                 * where delta is a uniformly distributed random number between zero and one.
+                 */
+                struct GetPhotonAngleFunctor
+                {
+                    using LinInterpCursor = typename ::pmacc::result_of::Functor<
+                        ::pmacc::cursor::tools::LinearInterp<float_X>,
+                        ::pmacc::cursor::BufferCursor<float_X, DIM2>>::type;
+
+                    using type = float_X;
+
+                    LinInterpCursor linInterpCursor; // lookup table, addressed by (delta position, gamma position)
+                    float_X lnMinGamma; // log(photon::MIN_GAMMA), set by the constructor
+                    float_X lnMaxGamma; // log(photon::MAX_GAMMA), set by the constructor
+
+                    /** constructor
+                     *
+                     * @param linInterpCursor lookup table for the photon emission angle.
+                     */
+                    HDINLINE GetPhotonAngleFunctor(LinInterpCursor linInterpCursor) : linInterpCursor(linInterpCursor)
+                    {
+                        this->lnMinGamma = math::log(photon::MIN_GAMMA);
+                        this->lnMaxGamma = math::log(photon::MAX_GAMMA);
+                    }
+
+                    /** Return the polar emission angle of the photon.
+                     * Table positions are computed in float_X only, so no double-precision arithmetic is required.
+                     * @param delta uniformly distributed random number between zero and one.
+                     * @param gamma relativistic factor of the incident electron.
+                     */
+                    HDINLINE float_X operator()(const float_X delta, const float_X gamma) const
+                    {
+                        const float_X deltaLookupPos = delta * static_cast<float_X>(photon::NUM_SAMPLES_DELTA - 1);
+
+                        const float_X lnGamma = math::log(gamma);
+                        const float_X gammaLookupPos = (lnGamma - this->lnMinGamma)
+                            / (this->lnMaxGamma - this->lnMinGamma)
+                            * static_cast<float_X>(photon::NUM_SAMPLES_GAMMA - 1);
+                        // diagnostic only: an out-of-range gamma is reported, the lookup below still takes place
+                        if(picLog::log_level & picLog::CRITICAL::lvl)
+                        {
+                            if(gamma > photon::MAX_GAMMA)
+                            {
+                                printf("[Bremsstrahlung] error lookup table: gamma = %g is out of range.\n", gamma);
+                            }
+                        }
+
+                        return this->linInterpCursor[float2_X(deltaLookupPos, gammaLookupPos)];
+                    }
+                };
+
+            } // namespace detail
+
+            /** Creates and holds the lookup table for the photon emission angle.
+             */
+            struct GetPhotonAngle
             {
-                const float_64 delta = photon::MAX_DELTA * static_cast<float_64>(deltaIdx) /
-                                       static_cast<float_64>(photon::NUM_SAMPLES_DELTA - 1);
-
-                *curTheta(deltaIdx, gammaIdx) = static_cast<float_X>(this->theta(delta, gamma, maxTheta));
-            }
-        }
-
-        *this->dBufTheta = hBufTheta;
-    }
-
-    /** Return a functor mapping `delta` to the photon emission polar angle `theta`,
-     * where delta is a uniformly distributed random number within [0, 1] or (0, 1)
-     */
-    GetPhotonAngleFunctor getPhotonAngleFunctor() const
-    {
-        GetPhotonAngleFunctor::LinInterpCursor linInterpCursor =
-            pmacc::cursor::tools::LinearInterp<float_X>()(this->dBufTheta->origin());
-
-        return GetPhotonAngleFunctor(linInterpCursor);
-    }
-};
-
-} // namespace bremsstrahlung
-} // namespace particles
+                using GetPhotonAngleFunctor = detail::GetPhotonAngleFunctor;
+
+            private:
+                using MyBuf = boost::shared_ptr<pmacc::container::DeviceBuffer<float_X, DIM2>>;
+                MyBuf dBufTheta;
+
+                /** Integrand handed to the ODE integrator: probability density at polar angle theta.
+                 * It's the ultrarelativistic limit of the dipole radiation formula, see e.g. Jackson, chap. 15.2
+                 */
+                struct Probability
+                {
+                    const float_64 gamma2; // gamma squared
+                    Probability(const float_64 gamma) : gamma2(gamma * gamma)
+                    {
+                    }
+
+                    template<typename T_State>
+                    void operator()(const T_State& /* p */, T_State& dpdtheta, const float_64 theta) const
+                    {
+                        const float_64 thetaSq = theta * theta;
+                        const float_64 base = float_64(1.0) + gamma2 * thetaSq;
+                        const float_64 numerator
+                            = float_64(3.0) * theta * gamma2 * (float_64(1.0) + gamma2 * gamma2 * thetaSq * thetaSq);
+                        dpdtheta[0] = numerator / (base * base * base * base);
+                    }
+                };
+
+                /** Objective function for the minimizer: |delta(theta) - targetDelta| at a fixed gamma.
+                 *
+                 * Delta is the angular emission probability (normalized to one) integrated from zero to theta,
+                 * where theta is the angle between the photon momentum and the final electron momentum.
+                 */
+                struct AimForDelta
+                {
+                    const float_64 targetDelta;
+                    const float_64 gamma;
+
+                    /** constructor
+                     *
+                     * @param targetDelta reference delta
+                     * @param gamma relativistic factor
+                     */
+                    AimForDelta(const float_64 targetDelta, const float_64 gamma)
+                        : targetDelta(targetDelta)
+                        , gamma(gamma)
+                    {
+                    }
+
+                    /** Integrate the probability density from zero to theta for the given gamma. */
+                    float_64 delta(const float_64 theta, const float_64 gamma) const
+                    {
+                        using State = boost::array<float_64, 1>;
+                        const float_64 from = 0.0;
+                        const float_64 to = theta;
+                        // initial step width handed to the integrator: 1/1000 of the interval
+                        const float_64 initialStep = (to - from) / float_64(1000.0);
+                        State accumulated = {0.0};
+                        Probability density(gamma);
+                        boost::numeric::odeint::integrate(density, accumulated, from, to, initialStep);
+                        return accumulated[0];
+                    }
+
+                    /** @return absolute deviation of delta(theta) from the reference delta */
+                    float_64 operator()(const float_64 theta) const
+                    {
+                        const float_64 reached = this->delta(theta, this->gamma);
+                        return math::abs(reached - this->targetDelta);
+                    }
+                };
+
+                /** Return the theta at which the integrated probability comes closest to photon::MAX_DELTA.
+                 *
+                 * @param gamma relativistic factor
+                 */
+                float_64 maxTheta(const float_64 gamma) const
+                {
+                    const float_64 lowerBound = 0.0;
+                    const float_64 upperBound = pmacc::math::Pi<float_64>::value;
+                    const int bits = std::numeric_limits<float_64>::digits;
+
+                    // search [0, Pi] for the angle that minimizes |delta(theta) - MAX_DELTA|
+                    AimForDelta deviation(photon::MAX_DELTA, gamma);
+                    const std::pair<float_64, float_64> best
+                        = boost::math::tools::brent_find_minima(deviation, lowerBound, upperBound, bits);
+
+                    // the pair holds (location of the minimum, value at the minimum); only the location is used
+                    return best.first;
+                }
+
+                /** Invert delta(theta): find the polar emission angle theta that belongs to a given delta.
+                 *
+                 * @param delta uniformly distributed random number within [0, 1] or (0, 1)
+                 * @param gamma relativistic factor
+                 * @param maxTheta maximal theta, upper end of the search interval
+                 */
+                float_64 theta(const float_64 delta, const float_64 gamma, const float_64 maxTheta) const
+                {
+                    namespace tools = boost::math::tools;
+                    const float_64 minTheta = 0.0;
+                    const int bits = std::numeric_limits<float_64>::digits;
+
+                    // minimize |delta(theta) - delta| over [minTheta, maxTheta]
+                    AimForDelta deviation(delta, gamma);
+                    const std::pair<float_64, float_64> best
+                        = tools::brent_find_minima(deviation, minTheta, maxTheta, bits);
+
+                    // first: location of the minimum; second: value at the minimum (unused)
+                    return best.first;
+                }
+
+            public:
+                /** Generate the lookup table theta(delta, gamma) in a host buffer and copy it into dBufTheta.
+                 */
+                void init()
+                {
+                    // there is a margin of one cell to make the linear interpolation valid for border cells.
+                    using DeviceTable = pmacc::container::DeviceBuffer<float_X, DIM2>;
+                    this->dBufTheta
+                        = MyBuf(new DeviceTable(photon::NUM_SAMPLES_DELTA + 1, photon::NUM_SAMPLES_GAMMA + 1));
+
+                    pmacc::container::HostBuffer<float_X, DIM2> hostTable(this->dBufTheta->size());
+                    hostTable.assign(float_X(0.0));
+                    auto cell = hostTable.origin();
+
+                    const float_64 lnGammaLow = math::log(photon::MIN_GAMMA);
+                    const float_64 lnGammaHigh = math::log(photon::MAX_GAMMA);
+                    const float_64 lastGammaIdx = static_cast<float_64>(photon::NUM_SAMPLES_GAMMA - 1);
+                    const float_64 lastDeltaIdx = static_cast<float_64>(photon::NUM_SAMPLES_DELTA - 1);
+
+                    for(uint32_t g = 0; g < photon::NUM_SAMPLES_GAMMA; ++g)
+                    {
+                        // gamma samples are equidistant in log(gamma) between MIN_GAMMA and MAX_GAMMA
+                        const float_64 fraction = static_cast<float_64>(g) / lastGammaIdx;
+                        const float_64 gamma = math::exp(lnGammaLow + (lnGammaHigh - lnGammaLow) * fraction);
+                        const float_64 thetaLimit = this->maxTheta(gamma);
+
+                        for(uint32_t d = 0; d < photon::NUM_SAMPLES_DELTA; ++d)
+                        {
+                            const float_64 delta = photon::MAX_DELTA * static_cast<float_64>(d) / lastDeltaIdx;
+                            *cell(d, g) = static_cast<float_X>(this->theta(delta, gamma, thetaLimit));
+                        }
+                    }
+
+                    *this->dBufTheta = hostTable;
+                }
+
+                /** Build the functor that maps `delta` to the photon emission polar angle `theta`,
+                 * where delta is a uniformly distributed random number within [0, 1] or (0, 1)
+                 */
+                GetPhotonAngleFunctor getPhotonAngleFunctor() const
+                {
+                    using Cursor = GetPhotonAngleFunctor::LinInterpCursor;
+                    // the lookup table is accessed through a LinearInterp cursor starting at the buffer origin
+                    Cursor tableCursor = pmacc::cursor::tools::LinearInterp<float_X>()(this->dBufTheta->origin());
+                    return GetPhotonAngleFunctor(tableCursor);
+                }
+            };
+
+        } // namespace bremsstrahlung
+    } // namespace particles
 } // namespace picongpu
diff --git a/include/picongpu/particles/bremsstrahlung/ScaledSpectrum.hpp b/include/picongpu/particles/bremsstrahlung/ScaledSpectrum.hpp
index 6ca09c1a1e..83c3c739bb 100644
--- a/include/picongpu/particles/bremsstrahlung/ScaledSpectrum.hpp
+++ b/include/picongpu/particles/bremsstrahlung/ScaledSpectrum.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2016-2020 Heiko Burau
+/* Copyright 2016-2021 Heiko Burau
  *
  * This file is part of PIConGPU.
  *
@@ -30,14 +30,14 @@
 #include <pmacc/particles/meta/FindByNameOrType.hpp>
 
 #include <boost/array.hpp>
-#if( BOOST_VERSION == 106400 )
-    /* `array_wrapper.hpp` must be included before `integrate.hpp` to avoid
-     * the error
-     * `boost/numeric/ublas/matrix.hpp(5977): error: namespace "boost::serialization" has no member "make_array"`
-     * in boost 1.64.0
-     * see boost issue https://svn.boost.org/trac/boost/ticket/12516
-     */
-#   include <boost/serialization/array_wrapper.hpp>
+#if(BOOST_VERSION == 106400)
+/* `array_wrapper.hpp` must be included before `integrate.hpp` to avoid
+ * the error
+ * `boost/numeric/ublas/matrix.hpp(5977): error: namespace "boost::serialization" has no member "make_array"`
+ * in boost 1.64.0
+ * see boost issue https://svn.boost.org/trac/boost/ticket/12516
+ */
+#    include <boost/serialization/array_wrapper.hpp>
 #endif
 #include <boost/numeric/odeint/integrate/integrate.hpp>
 #include <boost/shared_ptr.hpp>
@@ -45,156 +45,154 @@
 
 namespace picongpu
 {
-namespace particles
-{
-namespace bremsstrahlung
-{
-
-namespace detail
-{
-
-/** Functor for the scaled differential cross section (dcs) which
- * equals to the electron energy loss times the cross section per unit energy.
- */
-struct LookupTableFunctor
-{
-    using LinInterpCursor = typename ::pmacc::result_of::Functor<
-        ::pmacc::cursor::tools::LinearInterp<float_X>,
-        ::pmacc::cursor::BufferCursor<float_X, DIM2>
-    >::type;
-
-    using type = float_X;
-
-    LinInterpCursor linInterpCursor;
-    float_X lnEMin;
-    float_X lnEMax;
-
-    /** constructor
-     *
-     * @param linInterpCursor
-     */
-    HDINLINE LookupTableFunctor(LinInterpCursor linInterpCursor);
-    /** scaled differential cross section
-     *
-     * @param Ekin kinetic energy of the incident electron
-     * @param kappa energy loss normalized to Ekin
-     */
-    HDINLINE float_X operator()(const float_X Ekin, const float_X kappa) const;
-};
-
-} // namespace detail
-
-
-/** Generates and holds the lookup tables for the scaled differential cross section
- * and the stopping power.
- *
- * scaled differential cross section = electron energy loss times cross section per unit energy
- *
- * stopping power = energy loss per unit length
- *
- * The lookup tables are generated from the screened Bethe-Heitler cross section. See e.g.:
- * Salvat, F., et al. "Monte Carlo simulation of bremsstrahlung emission by electrons."
- * Radiation Physics and Chemistry 75.10 (2006): 1201-1219.
- */
-struct ScaledSpectrum
-{
-public:
-    using LookupTableFunctor = detail::LookupTableFunctor;
-private:
-
-    using MyBuf = boost::shared_ptr<pmacc::container::DeviceBuffer<float_X, DIM2> >;
-    MyBuf dBufScaledSpectrum;
-    MyBuf dBufStoppingPower;
-
-    /** differential cross section: cross section per unit energy
-     *
-     * This is the screened Bethe-Heitler cross section. See e.g.:
-     * Salvat, F., et al. "Monte Carlo simulation of bremsstrahlung emission by electrons."
-     * Radiation Physics and Chemistry 75.10 (2006): 1201-1219.
-     *
-     * @param Ekin kinetic electron energy
-     * @param kappa energy loss normalized to Ekin
-     * @param targetZ atomic number of the target material
-     */
-    HINLINE float_64 dcs(const float_64 Ekin, const float_64 kappa, const float_64 targetZ) const;
-
-    /** differential cross section times energy loss
-     */
-    struct StoppingPowerIntegrand
+    namespace particles
     {
-        const float_64 Ekin;
-        const float_64 targetZ;
-        const ScaledSpectrum& scaledSpectrum;
-
-        StoppingPowerIntegrand(const float_64 Ekin, const ScaledSpectrum& scaledSpectrum, const float_64 targetZ) :
-            Ekin(Ekin), scaledSpectrum(scaledSpectrum), targetZ(targetZ) {}
-
-        template<typename T_State, typename T_W>
-        void operator()(const T_State &x, T_State &dxdW, T_W W) const
-        {
-            dxdW[0] = this->scaledSpectrum.dcs(this->Ekin, W / this->Ekin, this->targetZ) * W;
-        }
-    };
-
-public:
-
-    /** Generate lookup tables
-     *
-     * @param targetZ atomic number of the target material
-     */
-    HINLINE void init(const float_64 targetZ);
-
-    /** Return a functor representing the scaled differential cross section
-     *
-     * scaled differential cross section = electron energy loss times cross section per unit energy
-     */
-    HINLINE LookupTableFunctor getScaledSpectrumFunctor() const;
-
-    /** Return a functor representing the stopping power
-     *
-     * stopping power = energy loss per unit length
-     */
-    HINLINE LookupTableFunctor getStoppingPowerFunctor() const;
-};
-
-
-/** Creates a `ScaledSpectrum` instance for a given electron species
- * and stores it in a map<atomic number, ScaledSpectrum> object.
- *
- * This functor is called from MySimulation::init() to generate lookup tables.
- *
- * @tparam T_ElectronSpecies type or name as boost::mpl::string of the electron species
- */
-template<typename T_ElectronSpecies>
-struct FillScaledSpectrumMap
-{
-    using ElectronSpecies = pmacc::particles::meta::FindByNameOrType_t<
-        VectorAllSpecies,
-        T_ElectronSpecies
-    >;
-
-    using IonSpecies = pmacc::particles::meta::FindByNameOrType_t<
-        VectorAllSpecies,
-        typename pmacc::particles::traits::ResolveAliasFromSpecies<
-            ElectronSpecies,
-            bremsstrahlungIons<>
-        >::type
-    >;
-
-    template<typename T_Map>
-    void operator()(T_Map& map) const
-    {
-        const float_X targetZ = GetAtomicNumbers<IonSpecies>::type::numberOfProtons;
-
-        if(map.count(targetZ) == 0)
+        namespace bremsstrahlung
         {
-            ScaledSpectrum scaledSpectrum;
-            scaledSpectrum.init(static_cast<float_64>(targetZ));
-            map[targetZ] = scaledSpectrum;
-        }
-    }
-};
-
-} // namespace bremsstrahlung
-} // namespace particles
+            namespace detail
+            {
+                /** Functor for the scaled differential cross section (dcs), which
+                 * equals the electron energy loss times the cross section per unit energy.
+                 */
+                struct LookupTableFunctor
+                {
+                    using LinInterpCursor = typename ::pmacc::result_of::Functor<
+                        ::pmacc::cursor::tools::LinearInterp<float_X>,
+                        ::pmacc::cursor::BufferCursor<float_X, DIM2>>::type;
+
+                    using type = float_X;
+
+                    LinInterpCursor linInterpCursor;
+                    float_X lnEMin;
+                    float_X lnEMax;
+
+                    /** constructor
+                     *
+                     * @param linInterpCursor linearly interpolating cursor over a 2D `float_X` buffer
+                     */
+                    HDINLINE LookupTableFunctor(LinInterpCursor linInterpCursor);
+                    /** scaled differential cross section
+                     *
+                     * @param Ekin kinetic energy of the incident electron
+                     * @param kappa energy loss normalized to Ekin
+                     */
+                    HDINLINE float_X operator()(const float_X Ekin, const float_X kappa) const;
+                };
+
+            } // namespace detail
+
+
+            /** Generates and holds the lookup tables for the scaled differential cross section
+             * and the stopping power.
+             *
+             * scaled differential cross section = electron energy loss times cross section per unit energy
+             *
+             * stopping power = energy loss per unit length
+             *
+             * The lookup tables are generated from the screened Bethe-Heitler cross section. See e.g.:
+             * Salvat, F., et al. "Monte Carlo simulation of bremsstrahlung emission by electrons."
+             * Radiation Physics and Chemistry 75.10 (2006): 1201-1219.
+             */
+            struct ScaledSpectrum
+            {
+            public:
+                using LookupTableFunctor = detail::LookupTableFunctor;
+
+            private:
+                using MyBuf = boost::shared_ptr<pmacc::container::DeviceBuffer<float_X, DIM2>>;
+                MyBuf dBufScaledSpectrum;
+                MyBuf dBufStoppingPower;
+
+                /** differential cross section: cross section per unit energy
+                 *
+                 * This is the screened Bethe-Heitler cross section. See e.g.:
+                 * Salvat, F., et al. "Monte Carlo simulation of bremsstrahlung emission by electrons."
+                 * Radiation Physics and Chemistry 75.10 (2006): 1201-1219.
+                 *
+                 * @param Ekin kinetic electron energy
+                 * @param kappa energy loss normalized to Ekin
+                 * @param targetZ atomic number of the target material
+                 */
+                HINLINE float_64 dcs(const float_64 Ekin, const float_64 kappa, const float_64 targetZ) const;
+
+                /** Integrand of the stopping power integral: differential cross section times energy loss
+                 */
+                struct StoppingPowerIntegrand
+                {
+                    const float_64 Ekin;
+                    const float_64 targetZ;
+                    const ScaledSpectrum& scaledSpectrum;
+                    // initialize in member declaration order (the order C++ actually uses; avoids -Wreorder)
+                    StoppingPowerIntegrand(
+                        const float_64 Ekin,
+                        const ScaledSpectrum& scaledSpectrum,
+                        const float_64 targetZ)
+                        : Ekin(Ekin)
+                        , targetZ(targetZ)
+                        , scaledSpectrum(scaledSpectrum)
+                    {
+                    }
+
+                    template<typename T_State, typename T_W>
+                    void operator()(const T_State& x, T_State& dxdW, T_W W) const
+                    {
+                        dxdW[0] = this->scaledSpectrum.dcs(this->Ekin, W / this->Ekin, this->targetZ) * W;
+                    }
+                };
+
+            public:
+                /** Generate lookup tables
+                 *
+                 * @param targetZ atomic number of the target material
+                 */
+                HINLINE void init(const float_64 targetZ);
+
+                /** Return a functor representing the scaled differential cross section
+                 *
+                 * scaled differential cross section = electron energy loss times cross section per unit energy
+                 */
+                HINLINE LookupTableFunctor getScaledSpectrumFunctor() const;
+
+                /** Return a functor representing the stopping power
+                 *
+                 * stopping power = energy loss per unit length
+                 */
+                HINLINE LookupTableFunctor getStoppingPowerFunctor() const;
+            };
+
+
+            /** Creates a `ScaledSpectrum` instance for a given electron species
+             * and stores it in a map<atomic number, ScaledSpectrum> object if that number is not yet a key.
+             *
+             * This functor is called from Simulation::init() to generate lookup tables.
+             *
+             * @tparam T_ElectronSpecies type or name as boost::mpl::string of the electron species
+             */
+            template<typename T_ElectronSpecies>
+            struct FillScaledSpectrumMap
+            {
+                using ElectronSpecies
+                    = pmacc::particles::meta::FindByNameOrType_t<VectorAllSpecies, T_ElectronSpecies>;
+
+                using IonSpecies = pmacc::particles::meta::FindByNameOrType_t<
+                    VectorAllSpecies,
+                    typename pmacc::particles::traits::ResolveAliasFromSpecies<ElectronSpecies, bremsstrahlungIons<>>::
+                        type>;
+
+                template<typename T_Map>
+                void operator()(T_Map& map) const
+                {
+                    const float_X targetZ = GetAtomicNumbers<IonSpecies>::type::numberOfProtons;
+                    // build the lookup tables only once per atomic number
+                    if(map.count(targetZ) == 0)
+                    {
+                        ScaledSpectrum scaledSpectrum;
+                        scaledSpectrum.init(static_cast<float_64>(targetZ));
+                        map[targetZ] = scaledSpectrum;
+                    }
+                }
+            };
+
+        } // namespace bremsstrahlung
+    } // namespace particles
 } // namespace picongpu
diff --git a/include/picongpu/particles/bremsstrahlung/ScaledSpectrum.tpp b/include/picongpu/particles/bremsstrahlung/ScaledSpectrum.tpp
index 884c719d01..01a0d3a749 100644
--- a/include/picongpu/particles/bremsstrahlung/ScaledSpectrum.tpp
+++ b/include/picongpu/particles/bremsstrahlung/ScaledSpectrum.tpp
@@ -1,4 +1,4 @@
-/* Copyright 2016-2020 Heiko Burau
+/* Copyright 2016-2021 Heiko Burau
  *
  * This file is part of PIConGPU.
  *
@@ -23,205 +23,207 @@
 
 namespace picongpu
 {
-namespace particles
-{
-namespace bremsstrahlung
-{
-
-namespace detail
-{
-
-
-/** constructor
- *
- * @param linInterpCursor
- */
-HDINLINE LookupTableFunctor::LookupTableFunctor(LinInterpCursor linInterpCursor)
-    : linInterpCursor(linInterpCursor)
-{
-    float_X const lnEMinTmp( electron::MIN_ENERGY );
-    float_X const lnEMaxTmp( electron::MAX_ENERGY );
-    this->lnEMin = math::log( lnEMinTmp );
-    this->lnEMax = math::log( lnEMaxTmp);
-}
-
-/** scaled differential cross section
- *
- * @param Ekin kinetic energy of the incident electron
- * @param kappa energy loss normalized to Ekin
- */
-HDINLINE float_X LookupTableFunctor::operator()(const float_X Ekin, const float_X kappa) const
-{
-    const float_X lnE = math::log(Ekin);
-
-    const float_X binE = (lnE - this->lnEMin) / (this->lnEMax - this->lnEMin) * static_cast<float_X>(electron::NUM_SAMPLES_EKIN - 1);
-    // in the low-energy limit Bremsstrahlung is not taken into account
-    if(binE < float_X(0.0))
-        return float_X(0.0);
-    const float_X binKappa = kappa * static_cast<float_X>(electron::NUM_SAMPLES_KAPPA - 1);
-
-    if (picLog::log_level & picLog::CRITICAL::lvl)
+    namespace particles
     {
-        if(Ekin < electron::MIN_ENERGY || Ekin > electron::MAX_ENERGY)
+        namespace bremsstrahlung
         {
-            const float_64 Ekin_SI = Ekin * UNIT_ENERGY;
-            printf("[Bremsstrahlung] error lookup table: Ekin=%g MeV is out of range.\n",
-                   float_X(Ekin_SI * UNITCONV_Joule_to_keV * float_X(1.0e-3)));
-        }
-        if(kappa < float_X(0.0) || kappa > float_X(1.0))
-            printf("[Bremsstrahlung] error lookup table: kappa=%f is out of range.\n",
-                   kappa);
-    }
-
-    return this->linInterpCursor[float2_X(binE, binKappa)];
-}
-
-
-} // namespace detail
-
-
-
-/** differential cross section: cross section per unit energy
- *
- * This is the screened Bethe-Heitler cross section. See e.g.:
- * Salvat, F., et al. "Monte Carlo simulation of bremsstrahlung emission by electrons."
- * Radiation Physics and Chemistry 75.10 (2006): 1201-1219.
- *
- * @param Ekin kinetic electron energy
- * @param kappa energy loss normalized to Ekin
- */
-float_64 ScaledSpectrum::dcs(const float_64 Ekin, const float_64 kappa, const float_64 targetZ) const
-{
-    constexpr float_64 pi = pmacc::algorithms::math::Pi<float_64>::value;
-    constexpr float_64 bohrRadius = pi * 4.0 * EPS0 * HBAR * HBAR /
-        (float_64(ELECTRON_MASS) * ELECTRON_CHARGE * ELECTRON_CHARGE);
-    constexpr float_64 classicalElRadius = float_64(ELECTRON_CHARGE*ELECTRON_CHARGE) / (pi * 4.0 * EPS0 * ELECTRON_MASS * SPEED_OF_LIGHT*SPEED_OF_LIGHT);
-    constexpr float_64 fineStructureConstant = float_64(ELECTRON_CHARGE*ELECTRON_CHARGE) / (pi * 4.0 * EPS0 * HBAR * SPEED_OF_LIGHT);
-
-    constexpr float_64 c = SPEED_OF_LIGHT;
-    constexpr float_64 c2 = c*c;
-    constexpr float_64 m_e = ELECTRON_MASS;
-    constexpr float_64 r_e = classicalElRadius;
-    constexpr float_64 alpha = fineStructureConstant;
-
-    const float_64 W = kappa * Ekin;
-    const float_64 eps = W / (Ekin + m_e * c2);
-    const float_64 R = math::pow(targetZ, float_64(-1.0/3.0)) * bohrRadius;
-    const float_64 gamma = Ekin / (m_e * c2) + float_64(1.0);
-    const float_64 b = R * m_e * c / HBAR / (float_64(2.0) * gamma) * eps / (float_64(1.0) - eps);
-
-    const float_64 phi_1 = float_64(4.0) * math::log(R * m_e * c / HBAR) + float_64(2.0) - float_64(2.0) * math::log(float_64(1.0) + b*b)
-        - float_64(4.0) * b * math::atan(float_64(1.0) / b);
-    const float_64 phi_2 = float_64(4.0) * math::log(R * m_e * c / HBAR) + float_64(7.0) / float_64(3.0) - float_64(2.0) * math::log(float_64(1.0) + b*b)
-        - float_64(6.0) * b * math::atan(float_64(1.0) / b)
-        - b*b * (float_64(4.0) - float_64(4.0) * b * math::atan(float_64(1.0) / b) - float_64(3.0) * math::log(float_64(1.0) + float_64(1.0) / (b*b)));
-
-    return r_e*r_e * alpha * targetZ*targetZ / W * (eps*eps * phi_1 + float_64(4.0) / float_64(3.0) * (float_64(1.0) - eps) * phi_2);
-}
-
-
-
-void ScaledSpectrum::init(const float_64 targetZ)
-{
-    namespace odeint = boost::numeric::odeint;
-
-    // there is a margin of one cell to make the linear interpolation valid for border cells.
-    this->dBufScaledSpectrum = MyBuf(
-        new pmacc::container::DeviceBuffer<float_X, DIM2>(
-            electron::NUM_SAMPLES_EKIN + 1,
-            electron::NUM_SAMPLES_KAPPA + 1));
-    this->dBufStoppingPower = MyBuf(
-        new pmacc::container::DeviceBuffer<float_X, DIM2>(
-            electron::NUM_SAMPLES_EKIN + 1,
-            electron::NUM_SAMPLES_KAPPA + 1));
-
-    pmacc::container::HostBuffer<float_X, DIM2> hBufScaledSpectrum(this->dBufScaledSpectrum->size());
-    pmacc::container::HostBuffer<float_X, DIM2> hBufStoppingPower(this->dBufStoppingPower->size());
-    hBufScaledSpectrum.assign(float_X(0.0));
-    hBufStoppingPower.assign(float_X(0.0));
-
-    auto curScaledSpectrum = hBufScaledSpectrum.origin();
-    auto curStoppingPower = hBufStoppingPower.origin();
-
-    const float_64 lnEMin = math::log(electron::MIN_ENERGY);
-    const float_64 lnEMax = math::log(electron::MAX_ENERGY);
-
-    using state_type = boost::array<float_64, 1>;
-
-    for(uint32_t EkinIdx = 0; EkinIdx < electron::NUM_SAMPLES_EKIN; EkinIdx++)
-    {
-        for(uint32_t kappaIdx = 0; kappaIdx < electron::NUM_SAMPLES_KAPPA; kappaIdx++)
-        {
-            float_64 kappa = static_cast<float_64>(kappaIdx) /
-                             static_cast<float_64>(electron::NUM_SAMPLES_KAPPA - 1);
-            if(kappa == 0.0)
-                kappa = electron::MIN_KAPPA;
-
-            const float_64 lnE_norm = static_cast<float_64>(EkinIdx) /
-                                      static_cast<float_64>(electron::NUM_SAMPLES_EKIN - 1);
-            const float_64 Ekin = math::exp(lnEMin + (lnEMax - lnEMin) * lnE_norm);
-
-            *curScaledSpectrum(EkinIdx, kappaIdx) = Ekin * kappa * static_cast<float_X>(this->dcs(Ekin, kappa, targetZ));
-
-            state_type integral_result = {0.0};
-            const float_64 lowerLimit = electron::MIN_KAPPA * Ekin;
-            const float_64 upperLimit = kappa * Ekin;
-            const float_64 stepwidth = upperLimit / electron::NUM_STEPS_STOPPING_POWER_INTERGRAL;
-            StoppingPowerIntegrand integrand(Ekin, *this, targetZ);
-            odeint::integrate(integrand, integral_result, lowerLimit, upperLimit, stepwidth);
-            *curStoppingPower(EkinIdx, kappaIdx) = static_cast<float_X>(integral_result[0]);
-
-            // check for nans
-            if(*curScaledSpectrum(EkinIdx, kappaIdx) != *curScaledSpectrum(EkinIdx, kappaIdx))
+            namespace detail
             {
-                const float_64 Ekin_SI = Ekin * UNIT_ENERGY;
-                const float_64 Ekin_MeV = Ekin_SI * UNITCONV_Joule_to_keV / 1.0e3;
-                std::stringstream errMsg;
-                errMsg << "[Bremsstrahlung] lookup table (scaled spectrum) has NaN-entry at Ekin = "
-                       << Ekin_MeV << " MeV, kappa = " << kappa << std::endl;
-                throw std::runtime_error(errMsg.str().c_str());
-            }
-            if(*curStoppingPower(EkinIdx, kappaIdx) != *curStoppingPower(EkinIdx, kappaIdx))
+                /** Store the cursor and cache the logarithms of `electron::MIN_ENERGY` / `electron::MAX_ENERGY`
+                 *
+                 * @param linInterpCursor linearly interpolating cursor over the lookup table
+                 */
+                HDINLINE LookupTableFunctor::LookupTableFunctor(LinInterpCursor linInterpCursor)
+                    : linInterpCursor(linInterpCursor)
+                {
+                    float_X const lnEMinTmp(electron::MIN_ENERGY);
+                    float_X const lnEMaxTmp(electron::MAX_ENERGY);
+                    this->lnEMin = math::log(lnEMinTmp);
+                    this->lnEMax = math::log(lnEMaxTmp);
+                }
+
+                /** Interpolate the lookup table at (Ekin, kappa); returns 0 if the energy bin index is negative
+                 *
+                 * @param Ekin kinetic energy of the incident electron
+                 * @param kappa energy loss normalized to Ekin
+                 */
+                HDINLINE float_X LookupTableFunctor::operator()(const float_X Ekin, const float_X kappa) const
+                {
+                    const float_X lnE = math::log(Ekin);
+                    // fractional table index along the energy axis, which is equidistant in log(Ekin)
+                    const float_X binE = (lnE - this->lnEMin) / (this->lnEMax - this->lnEMin)
+                        * static_cast<float_X>(electron::NUM_SAMPLES_EKIN - 1);
+                    // in the low-energy limit Bremsstrahlung is not taken into account
+                    if(binE < float_X(0.0))
+                        return float_X(0.0);
+                    const float_X binKappa = kappa * static_cast<float_X>(electron::NUM_SAMPLES_KAPPA - 1);
+                    // out-of-range arguments are only reported here, they are not clamped
+                    if(picLog::log_level & picLog::CRITICAL::lvl)
+                    {
+                        if(Ekin < electron::MIN_ENERGY || Ekin > electron::MAX_ENERGY)
+                        {
+                            const float_64 Ekin_SI = Ekin * UNIT_ENERGY;
+                            printf(
+                                "[Bremsstrahlung] error lookup table: Ekin=%g MeV is out of range.\n",
+                                float_X(Ekin_SI * UNITCONV_Joule_to_keV * float_X(1.0e-3)));
+                        }
+                        if(kappa < float_X(0.0) || kappa > float_X(1.0))
+                            printf("[Bremsstrahlung] error lookup table: kappa=%f is out of range.\n", kappa);
+                    }
+
+                    return this->linInterpCursor[float2_X(binE, binKappa)];
+                }
+
+
+            } // namespace detail
+
+
+            /** differential cross section: cross section per unit energy
+             *
+             * This is the screened Bethe-Heitler cross section. See e.g.:
+             * Salvat, F., et al. "Monte Carlo simulation of bremsstrahlung emission by electrons."
+             * Radiation Physics and Chemistry 75.10 (2006): 1201-1219.
+             * @param Ekin kinetic electron energy
+             * @param kappa energy loss normalized to Ekin
+             * @param targetZ atomic number of the target material
+             */
+            float_64 ScaledSpectrum::dcs(const float_64 Ekin, const float_64 kappa, const float_64 targetZ) const
             {
-                const float_64 Ekin_SI = Ekin * UNIT_ENERGY;
-                const float_64 Ekin_MeV = Ekin_SI * UNITCONV_Joule_to_keV / 1.0e3;
-                std::stringstream errMsg;
-                errMsg << "[Bremsstrahlung] lookup table (stopping power) has NaN-entry at Ekin = "
-                       << Ekin_MeV << " MeV, kappa = " << kappa << std::endl;
-                throw std::runtime_error(errMsg.str().c_str());
+                constexpr float_64 pi = pmacc::math::Pi<float_64>::value;
+                constexpr float_64 bohrRadius
+                    = pi * 4.0 * EPS0 * HBAR * HBAR / (float_64(ELECTRON_MASS) * ELECTRON_CHARGE * ELECTRON_CHARGE);
+                constexpr float_64 classicalElRadius = float_64(ELECTRON_CHARGE * ELECTRON_CHARGE)
+                    / (pi * 4.0 * EPS0 * ELECTRON_MASS * SPEED_OF_LIGHT * SPEED_OF_LIGHT);
+                constexpr float_64 fineStructureConstant
+                    = float_64(ELECTRON_CHARGE * ELECTRON_CHARGE) / (pi * 4.0 * EPS0 * HBAR * SPEED_OF_LIGHT);
+
+                constexpr float_64 c = SPEED_OF_LIGHT;
+                constexpr float_64 c2 = c * c;
+                constexpr float_64 m_e = ELECTRON_MASS;
+                constexpr float_64 r_e = classicalElRadius;
+                constexpr float_64 alpha = fineStructureConstant;
+                // W: energy loss, eps: W normalized to the total (kinetic + rest) electron energy
+                const float_64 W = kappa * Ekin;
+                const float_64 eps = W / (Ekin + m_e * c2);
+                const float_64 R = math::pow(targetZ, float_64(-1.0 / 3.0)) * bohrRadius;
+                const float_64 gamma = Ekin / (m_e * c2) + float_64(1.0);
+                const float_64 b = R * m_e * c / HBAR / (float_64(2.0) * gamma) * eps / (float_64(1.0) - eps);
+
+                const float_64 phi_1 = float_64(4.0) * math::log(R * m_e * c / HBAR) + float_64(2.0)
+                    - float_64(2.0) * math::log(float_64(1.0) + b * b)
+                    - float_64(4.0) * b * math::atan(float_64(1.0) / b);
+                const float_64 phi_2 = float_64(4.0) * math::log(R * m_e * c / HBAR) + float_64(7.0) / float_64(3.0)
+                    - float_64(2.0) * math::log(float_64(1.0) + b * b)
+                    - float_64(6.0) * b * math::atan(float_64(1.0) / b)
+                    - b * b
+                        * (float_64(4.0) - float_64(4.0) * b * math::atan(float_64(1.0) / b)
+                           - float_64(3.0) * math::log(float_64(1.0) + float_64(1.0) / (b * b)));
+
+                return r_e * r_e * alpha * targetZ * targetZ / W
+                    * (eps * eps * phi_1 + float_64(4.0) / float_64(3.0) * (float_64(1.0) - eps) * phi_2);
             }
-        }
-    }
 
-    *this->dBufScaledSpectrum = hBufScaledSpectrum;
-    *this->dBufStoppingPower = hBufStoppingPower;
-}
 
-/** Return a functor representing the scaled differential cross section
- *
- * scaled differential cross section = electron energy loss times cross section per unit energy
- */
-detail::LookupTableFunctor ScaledSpectrum::getScaledSpectrumFunctor() const
-{
-    LookupTableFunctor::LinInterpCursor linInterpCursor =
-        pmacc::cursor::tools::LinearInterp<float_X>()(this->dBufScaledSpectrum->origin());
+            void ScaledSpectrum::init(const float_64 targetZ)
+            {
+                namespace odeint = boost::numeric::odeint;
+
+                // there is a margin of one cell to make the linear interpolation valid for border cells.
+                this->dBufScaledSpectrum = MyBuf(new pmacc::container::DeviceBuffer<float_X, DIM2>(
+                    electron::NUM_SAMPLES_EKIN + 1,
+                    electron::NUM_SAMPLES_KAPPA + 1));
+                this->dBufStoppingPower = MyBuf(new pmacc::container::DeviceBuffer<float_X, DIM2>(
+                    electron::NUM_SAMPLES_EKIN + 1,
+                    electron::NUM_SAMPLES_KAPPA + 1));
+
+                pmacc::container::HostBuffer<float_X, DIM2> hBufScaledSpectrum(this->dBufScaledSpectrum->size());
+                pmacc::container::HostBuffer<float_X, DIM2> hBufStoppingPower(this->dBufStoppingPower->size());
+                hBufScaledSpectrum.assign(float_X(0.0));
+                hBufStoppingPower.assign(float_X(0.0));
+
+                auto curScaledSpectrum = hBufScaledSpectrum.origin();
+                auto curStoppingPower = hBufStoppingPower.origin();
+
+                const float_64 lnEMin = math::log(electron::MIN_ENERGY);
+                const float_64 lnEMax = math::log(electron::MAX_ENERGY);
+
+                using state_type = boost::array<float_64, 1>;
+
+                for(uint32_t EkinIdx = 0; EkinIdx < electron::NUM_SAMPLES_EKIN; EkinIdx++)
+                {
+                    for(uint32_t kappaIdx = 0; kappaIdx < electron::NUM_SAMPLES_KAPPA; kappaIdx++)
+                    {
+                        float_64 kappa
+                            = static_cast<float_64>(kappaIdx) / static_cast<float_64>(electron::NUM_SAMPLES_KAPPA - 1);
+                        if(kappa == 0.0)
+                            kappa = electron::MIN_KAPPA;
+                        // Ekin is sampled equidistantly in log(Ekin) between MIN_ENERGY and MAX_ENERGY
+                        const float_64 lnE_norm
+                            = static_cast<float_64>(EkinIdx) / static_cast<float_64>(electron::NUM_SAMPLES_EKIN - 1);
+                        const float_64 Ekin = math::exp(lnEMin + (lnEMax - lnEMin) * lnE_norm);
+                        // NOTE(review): only dcs() is narrowed to float_X before the multiplication - intended?
+                        *curScaledSpectrum(EkinIdx, kappaIdx)
+                            = Ekin * kappa * static_cast<float_X>(this->dcs(Ekin, kappa, targetZ));
+                        // stopping power entry: integral of dcs * W over W from MIN_KAPPA * Ekin to kappa * Ekin
+                        state_type integral_result = {0.0};
+                        const float_64 lowerLimit = electron::MIN_KAPPA * Ekin;
+                        const float_64 upperLimit = kappa * Ekin;
+                        const float_64 stepwidth = upperLimit / electron::NUM_STEPS_STOPPING_POWER_INTERGRAL;
+                        StoppingPowerIntegrand integrand(Ekin, *this, targetZ);
+                        odeint::integrate(integrand, integral_result, lowerLimit, upperLimit, stepwidth);
+                        *curStoppingPower(EkinIdx, kappaIdx) = static_cast<float_X>(integral_result[0]);
+
+                        // check for NaNs (a NaN is the only value that compares unequal to itself)
+                        if(*curScaledSpectrum(EkinIdx, kappaIdx) != *curScaledSpectrum(EkinIdx, kappaIdx))
+                        {
+                            const float_64 Ekin_SI = Ekin * UNIT_ENERGY;
+                            const float_64 Ekin_MeV = Ekin_SI * UNITCONV_Joule_to_keV / 1.0e3;
+                            std::stringstream errMsg;
+                            errMsg << "[Bremsstrahlung] lookup table (scaled spectrum) has NaN-entry at Ekin = "
+                                   << Ekin_MeV << " MeV, kappa = " << kappa << std::endl;
+                            throw std::runtime_error(errMsg.str().c_str());
+                        }
+                        if(*curStoppingPower(EkinIdx, kappaIdx) != *curStoppingPower(EkinIdx, kappaIdx))
+                        {
+                            const float_64 Ekin_SI = Ekin * UNIT_ENERGY;
+                            const float_64 Ekin_MeV = Ekin_SI * UNITCONV_Joule_to_keV / 1.0e3;
+                            std::stringstream errMsg;
+                            errMsg << "[Bremsstrahlung] lookup table (stopping power) has NaN-entry at Ekin = "
+                                   << Ekin_MeV << " MeV, kappa = " << kappa << std::endl;
+                            throw std::runtime_error(errMsg.str().c_str());
+                        }
+                    }
+                }
+                // copy the filled host tables into the device buffers
+                *this->dBufScaledSpectrum = hBufScaledSpectrum;
+                *this->dBufStoppingPower = hBufStoppingPower;
+            }
 
-    return LookupTableFunctor(linInterpCursor);
-}
+            /** Return a lookup functor built on a linearly interpolating cursor of the scaled spectrum table
+             *
+             * scaled differential cross section = electron energy loss times cross section per unit energy
+             */
+            detail::LookupTableFunctor ScaledSpectrum::getScaledSpectrumFunctor() const
+            {
+                LookupTableFunctor::LinInterpCursor linInterpCursor
+                    = pmacc::cursor::tools::LinearInterp<float_X>()(this->dBufScaledSpectrum->origin());
 
-/** Return a functor representing the stopping power
- *
- * stopping power = energy loss per unit length
- */
-detail::LookupTableFunctor ScaledSpectrum::getStoppingPowerFunctor() const
-{
-    LookupTableFunctor::LinInterpCursor linInterpCursor =
-        pmacc::cursor::tools::LinearInterp<float_X>()(this->dBufStoppingPower->origin());
+                return LookupTableFunctor(linInterpCursor);
+            }
 
-    return LookupTableFunctor(linInterpCursor);
-}
+            /** Return a lookup functor built on a linearly interpolating cursor of the stopping power table
+             *
+             * stopping power = energy loss per unit length
+             */
+            detail::LookupTableFunctor ScaledSpectrum::getStoppingPowerFunctor() const
+            {
+                LookupTableFunctor::LinInterpCursor linInterpCursor
+                    = pmacc::cursor::tools::LinearInterp<float_X>()(this->dBufStoppingPower->origin());
+
+                return LookupTableFunctor(linInterpCursor);
+            }
 
 
-} // namespace bremsstrahlung
-} // namespace particles
+        } // namespace bremsstrahlung
+    } // namespace particles
 } // namespace picongpu
diff --git a/include/picongpu/particles/creation/creation.hpp b/include/picongpu/particles/creation/creation.hpp
index 12d7d7c2a8..65adc80457 100644
--- a/include/picongpu/particles/creation/creation.hpp
+++ b/include/picongpu/particles/creation/creation.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2015-2020 Heiko Burau
+/* Copyright 2015-2021 Heiko Burau
  *
  * This file is part of PIConGPU.
  *
@@ -28,57 +28,59 @@
 
 namespace picongpu
 {
+    namespace particles
+    {
+        namespace creation
+        {
+            /** Calls the `createParticlesKernel` kernel to create new particles.
+             *
+             * @param sourceSpecies species from which new particles are created
+             * @param targetSpecies species of the created particles
+             * @param particleCreator functor that defines the particle creation
+             * @param cellDesc mapping description
+             *
+             * `particleCreator` must define: `init()`, `numNewParticles()` and `operator()()`
+             * @see `PhotonCreator.hpp` for a further description.
+             */
+            template<
+                typename T_SourceSpecies,
+                typename T_TargetSpecies,
+                typename T_ParticleCreator,
+                typename T_CellDescription>
+            void createParticlesFromSpecies(
+                T_SourceSpecies& sourceSpecies,
+                T_TargetSpecies& targetSpecies,
+                T_ParticleCreator particleCreator,
+                T_CellDescription cellDesc)
+            {
+                using SuperCellSize = typename MappingDesc::SuperCellSize;
+                const pmacc::math::Int<simDim> coreBorderGuardSuperCells = cellDesc.getGridSuperCells();
+                const pmacc::math::Int<simDim> guardSuperCells = cellDesc.getGuardingSuperCells();
+                const pmacc::math::Int<simDim> coreBorderSuperCells = coreBorderGuardSuperCells - 2 * guardSuperCells;
 
-namespace particles
-{
-
-namespace creation
-{
-
-/** Calls the `createParticlesKernel` kernel to create new particles.
- *
- * @param sourceSpecies species from which new particles are created
- * @param targetSpecies species of the created particles
- * @param particleCreator functor that defines the particle creation
- * @param cellDesc mapping description
- *
- * `particleCreator` must define: `init()`, `numNewParticles()` and `operator()()`
- * \see `PhotonCreator.hpp` for a further description.
- */
-template<typename T_SourceSpecies, typename T_TargetSpecies, typename T_ParticleCreator, typename T_CellDescription>
-void createParticlesFromSpecies(T_SourceSpecies& sourceSpecies,
-                                T_TargetSpecies& targetSpecies,
-                                T_ParticleCreator particleCreator,
-                                T_CellDescription cellDesc)
-{
-    using SuperCellSize = typename MappingDesc::SuperCellSize;
-    const pmacc::math::Int<simDim> coreBorderGuardSuperCells = cellDesc.getGridSuperCells();
-    const pmacc::math::Int<simDim> guardSuperCells = cellDesc.getGuardingSuperCells();
-    const pmacc::math::Int<simDim> coreBorderSuperCells = coreBorderGuardSuperCells - 2 * guardSuperCells;
-
-    constexpr uint32_t numWorkers = pmacc::traits::GetNumWorkers<
-        pmacc::math::CT::volume< SuperCellSize >::type::value
-    >::value;
+                constexpr uint32_t numWorkers
+                    = pmacc::traits::GetNumWorkers<pmacc::math::CT::volume<SuperCellSize>::type::value>::value;
 
-    /* Functor holding the actual generic particle creation kernel */
-    auto createParticlesKernel = make_CreateParticlesKernel< numWorkers >(
-        sourceSpecies.getDeviceParticlesBox(),
-        targetSpecies.getDeviceParticlesBox(),
-        particleCreator,
-        guardSuperCells);
+                /* Functor holding the actual generic particle creation kernel */
+                auto createParticlesKernel = make_CreateParticlesKernel<numWorkers>(
+                    sourceSpecies.getDeviceParticlesBox(),
+                    targetSpecies.getDeviceParticlesBox(),
+                    particleCreator,
+                    guardSuperCells);
 
-    /* This zone represents the core+border area with guard offset in unit of cells */
-    const zone::SphericZone<simDim> zone(
-        static_cast<pmacc::math::Size_t<simDim> >(coreBorderSuperCells * SuperCellSize::toRT()),
-        guardSuperCells * SuperCellSize::toRT());
+                /* This zone represents the core+border area with guard offset in unit of cells */
+                const zone::SphericZone<simDim> zone(
+                    static_cast<pmacc::math::Size_t<simDim>>(coreBorderSuperCells * SuperCellSize::toRT()),
+                    guardSuperCells * SuperCellSize::toRT());
 
-    algorithm::kernel::ForeachLockstep<numWorkers, SuperCellSize> foreach;
-    foreach(zone, createParticlesKernel, cursor::make_MultiIndexCursor<simDim>());
+                // named `forEach`: clang-format handles `foreach` as a loop macro and would detach the `;`
+                algorithm::kernel::ForeachLockstep<numWorkers, SuperCellSize> forEach;
+                forEach(zone, createParticlesKernel, cursor::make_MultiIndexCursor<simDim>());
 
-    /* Make sure to leave no gaps in newly created frames */
-    targetSpecies.fillAllGaps();
-}
+                /* Make sure to leave no gaps in newly created frames */
+                targetSpecies.fillAllGaps();
+            }
 
-} // namespace creation
-} // namespace particles
+        } // namespace creation
+    } // namespace particles
 } // namespace picongpu
diff --git a/include/picongpu/particles/creation/creation.kernel b/include/picongpu/particles/creation/creation.kernel
index 81e5fa8960..21224ebd43 100644
--- a/include/picongpu/particles/creation/creation.kernel
+++ b/include/picongpu/particles/creation/creation.kernel
@@ -1,4 +1,4 @@
-/* Copyright 2015-2020 Marco Garten, Axel Huebl, Heiko Burau, Rene Widera,
+/* Copyright 2015-2021 Marco Garten, Axel Huebl, Heiko Burau, Rene Widera,
  *                     Richard Pausch, Felix Schmitt
  *
  * This file is part of PIConGPU.
@@ -40,429 +40,309 @@
 
 namespace picongpu
 {
-namespace particles
-{
-namespace creation
-{
-
-    /** Functor with main kernel for particle creation
-     *
-     * - maps the frame dimensions and gathers the particle boxes
-     * - contains / calls the Creator
-     *
-     * @tparam T_numWorkers number of workers
-     * @tparam T_ParBoxSource container of the source species
-     * @tparam T_ParBoxTarget container of the target species
-     * @tparam T_ParticleCreator type of the particle creation functor
-     */
-    template<
-        uint32_t T_numWorkers,
-        typename T_ParBoxSource,
-        typename T_ParBoxTarget,
-        typename T_ParticleCreator
-    >
-    struct CreateParticlesKernel
+    namespace particles
     {
-        using ParBoxSource = T_ParBoxSource;
-        using ParBoxTarget = T_ParBoxTarget;
-        using ParticleCreator = T_ParticleCreator;
-
-        ParBoxSource sourceBox;
-        ParBoxTarget targetBox;
-        ParticleCreator particleCreator;
-        DataSpace< simDim > const guardSuperCells;
-
-        CreateParticlesKernel(
-            ParBoxSource const & sourceBox,
-            ParBoxTarget const & targetBox,
-            ParticleCreator const & particleCreator,
-            DataSpace< simDim > const guardSuperCells
-        ) :
-            sourceBox( sourceBox ),
-            targetBox( targetBox ),
-            particleCreator( particleCreator ),
-            guardSuperCells( guardSuperCells )
-        { }
-
-        /** Goes over all frames and calls `ParticleCreator`
-         *
-         * @tparam T_Acc alpaka accelerator type
-         *
-         * @param blockCell n-dim. block offset (in cells) relative to the origin
-         *                  of the local domain plus guarding cells
-         */
-        template< typename T_Acc >
-        DINLINE void operator( )(
-            T_Acc const & acc,
-            pmacc::math::Int< simDim > const & blockCell
-        )
+        namespace creation
         {
+            /** Functor with main kernel for particle creation
+             *
+             * - maps the frame dimensions and gathers the particle boxes
+             * - contains / calls the Creator
+             *
+             * @tparam T_numWorkers number of workers
+             * @tparam T_ParBoxSource container of the source species
+             * @tparam T_ParBoxTarget container of the target species
+             * @tparam T_ParticleCreator type of the particle creation functor
+             */
+            template<
+                uint32_t T_numWorkers,
+                typename T_ParBoxSource,
+                typename T_ParBoxTarget,
+                typename T_ParticleCreator>
+            struct CreateParticlesKernel
+            {
+                using ParBoxSource = T_ParBoxSource;
+                using ParBoxTarget = T_ParBoxTarget;
+                using ParticleCreator = T_ParticleCreator;
+
+                ParBoxSource sourceBox;
+                ParBoxTarget targetBox;
+                ParticleCreator particleCreator;
+                DataSpace<simDim> const guardSuperCells;
+
+                CreateParticlesKernel(
+                    ParBoxSource const& sourceBox,
+                    ParBoxTarget const& targetBox,
+                    ParticleCreator const& particleCreator,
+                    DataSpace<simDim> const guardSuperCells)
+                    : sourceBox(sourceBox)
+                    , targetBox(targetBox)
+                    , particleCreator(particleCreator)
+                    , guardSuperCells(guardSuperCells)
+                {
+                }
 
-            using namespace mappings::threads;
+                /** Goes over all frames and calls `ParticleCreator`
+                 *
+                 * @tparam T_Acc alpaka accelerator type
+                 *
+                 * @param blockCell n-dim. block offset (in cells) relative to the origin
+                 *                  of the local domain plus guarding cells
+                 */
+                template<typename T_Acc>
+                DINLINE void operator()(T_Acc const& acc, pmacc::math::Int<simDim> const& blockCell)
+                {
+                    using namespace mappings::threads;
 
-            constexpr uint32_t numWorkers = T_numWorkers;
+                    constexpr uint32_t numWorkers = T_numWorkers;
 
-            uint32_t const workerIdx = threadIdx.x;
+                    uint32_t const workerIdx = cupla::threadIdx(acc).x;
 
-            /* multi-dimensional offset vector from local domain origin on GPU in units of super cells */
-            pmacc::math::Int< simDim > const block = blockCell / SuperCellSize::toRT( );
+                    /* multi-dimensional offset vector from local domain origin on GPU in units of super cells */
+                    pmacc::math::Int<simDim> const block = blockCell / SuperCellSize::toRT();
 
-            // relative offset to the origin of the local domain (without any guarding cells)
-            pmacc::math::Int<simDim> const supercellCellOffset = blockCell - this->guardSuperCells * SuperCellSize::toRT( );
+                    // relative offset to the origin of the local domain (without any guarding cells)
+                    pmacc::math::Int<simDim> const supercellCellOffset
+                        = blockCell - this->guardSuperCells * SuperCellSize::toRT();
 
-            /* "particle box" : container/iterator where the particles live in
-             * and where one can get the frame in a super cell from
-             */
-            using SourceFramePtr = typename ParBoxSource::FramePtr;
-            using TargetFramePtr = typename ParBoxTarget::FramePtr;
+                    /* "particle box" : container/iterator where the particles live in
+                     * and where one can get the frame in a super cell from
+                     */
+                    using SourceFramePtr = typename ParBoxSource::FramePtr;
+                    using TargetFramePtr = typename ParBoxTarget::FramePtr;
 
-            /* for not mixing operations::assign up with the nvidia functor assign */
-            namespace partOp = pmacc::particles::operations;
+                    /* for not mixing operations::assign up with the nvidia functor assign */
+                    namespace partOp = pmacc::particles::operations;
 
-            constexpr lcellId_t maxParticlesInFrame = pmacc::math::CT::volume< SuperCellSize >::type::value;
+                    constexpr lcellId_t maxParticlesInFrame = pmacc::math::CT::volume<SuperCellSize>::type::value;
 
-            /* use two frames to allow that all virtual workers can create new particles
-             * even if newFrameFillLvl is not zero.
-             */
-            using FrameArray = memory::Array<
-                TargetFramePtr,
-                2
-            >;
-
-            PMACC_SMEM(
-                acc,
-                targetFrames,
-                FrameArray
-            );
-
-            // find last frame in super cell
-            SourceFramePtr sourceFrame( sourceBox.getLastFrame( block ) );
-
-            // end method if we have no frames
-            if( !sourceFrame.isValid( ) )
-                return;
-
-            using ParticleDomCfg = IdxConfig<
-                maxParticlesInFrame,
-                numWorkers
-            >;
-
-            ForEachIdx< ParticleDomCfg > forEachParticle( workerIdx );
-
-            // initialize the collective part of the functor (e.g. field caching)
-            particleCreator.collectiveInit(
-                acc,
-                blockCell,
-                WorkerCfg< numWorkers >{ workerIdx }
-            );
-
-            memory::CtxArray<
-                ParticleCreator,
-                ParticleDomCfg
-            >
-            particleCreatorCtx{ };
-
-            forEachParticle(
-                [&](
-                    uint32_t const linearIdx,
-                    uint32_t const idx
-                )
-                {
-                    // cell index within the superCell
-                    DataSpace< simDim > const cellIdx = DataSpaceOperations< simDim >::template map< SuperCellSize >( linearIdx );
-
-                    // cell offset with respect to the local domain origin (without any guarding cells
-                    pmacc::math::Int< simDim > const localCellIndex = supercellCellOffset + cellIdx;
-
-                    // create a copy of the functor for each virtual worker
-                    particleCreatorCtx[ idx ] = particleCreator;
-
-                    // init particle creator functor for each virtual worker
-                    particleCreatorCtx[ idx ].init(
-                        acc,
-                        blockCell,
-                        linearIdx,
-                        localCellIndex
-                    );
-                }
-            );
+                    /* use two frames to allow that all virtual workers can create new particles
+                     * even if newFrameFillLvl is not zero.
+                     */
+                    using FrameArray = memory::Array<TargetFramePtr, 2>;
 
-            /* Declare counter in shared memory that will later tell the current fill level or
-             * occupation of the newly created target frames.
-             */
-            PMACC_SMEM(
-                acc,
-                newFrameFillLvl,
-                int
-            );
-
-            ForEachIdx<
-                IdxConfig<
-                    2,
-                    numWorkers
-                >
-            > onlyMasters{ workerIdx };
-
-            // Declare local variable oldFrameFillLvl for each thread
-            int oldFrameFillLvl;
-
-            /* Initialize local (register) counter for each thread
-             * - describes how many new macro target particles should be created
-             */
-            memory::CtxArray<
-                uint32_t,
-                ParticleDomCfg
-            >
-            numNewParticlesCtx( 0 );
-
-            // Master initializes the frame fill level with 0
-            onlyMasters(
-                [&](
-                    uint32_t const linearIdx,
-                    uint32_t const
-                )
-                {
-                    if( linearIdx == 0 )
-                        newFrameFillLvl = 0;
-                    targetFrames[ linearIdx ] = nullptr;
-                }
-            );
+                    PMACC_SMEM(acc, targetFrames, FrameArray);
 
-            __syncthreads( );
+                    // find last frame in super cell
+                    SourceFramePtr sourceFrame(sourceBox.getLastFrame(block));
 
-            /* move over source species frames and call particleCreator
-             * frames are worked on in backwards order to avoid asking if there is another frame
-             * --> performance
-             * Because all frames are completely filled except the last and apart from that last frame
-             * one wants to make sure that all threads are working and every frame is worked on.
-             */
-            while( sourceFrame.isValid( ) )
-            {
+                    // end method if we have no frames
+                    if(!sourceFrame.isValid())
+                        return;
 
-                memory::CtxArray<
-                    bool,
-                    ParticleDomCfg
-                >
-                isParticleCtx(
-                    workerIdx,
-                    [&](
-                        uint32_t const linearIdx,
-                        uint32_t const
-                    )
-                    {
-                        return static_cast< bool >( sourceFrame[ linearIdx ][ multiMask_ ] );
-                    }
-                );
-                forEachParticle(
-                    [&](
-                        uint32_t const linearIdx,
-                        uint32_t const idx
-                    )
-                    {
-                        bool const isParticle = static_cast< bool >( sourceFrame[ linearIdx ][ multiMask_ ] );
-                        numNewParticlesCtx[ idx ] = 0u;
-                        if( isParticle )
-                            /* ask the particle creator functor how many new particles to create. */
-                            numNewParticlesCtx[ idx ] = particleCreatorCtx[ idx ].numNewParticles(
-                                acc,
-                                *sourceFrame,
-                                linearIdx
-                            );
-                    }
-                );
+                    using ParticleDomCfg = IdxConfig<maxParticlesInFrame, numWorkers>;
 
-                __syncthreads( );
+                    ForEachIdx<ParticleDomCfg> forEachParticle(workerIdx);
 
-                /* always true while-loop over all particles inside source frame until each thread breaks out individually
-                 *
-                 * **Attention**: Speaking of 1st and 2nd frame only may seem odd.
-                 * The question might arise what happens if more target particles are created than would fit into two frames.
-                 * Well, multi-particle creation during a time step is accounted for. The number of new target particles is
-                 * determined inside the outer loop over the valid frames while in the inner loop each thread can create only ONE
-                 * new macro target particle. But the loop repeats until each thread has created all the target particles needed in the time step.
-                 */
-                while( true )
-                {
-                    /* < INIT >
-                     * - targetParId is initialized as -1 (meaning: invalid)
-                     * - (local) oldFrameFillLvl set equal to (shared) newFrameFillLvl for each thread
-                     * --> each thread remembers the old "counter"
-                     */
+                    // initialize the collective part of the functor (e.g. field caching)
+                    particleCreator.collectiveInit(acc, blockCell, WorkerCfg<numWorkers>{workerIdx});
 
-                    /* Declare local target particle ID
-                     * - describes at which position in the new frame the new target particle is to be created
-                     */
-                    memory::CtxArray<
-                        int,
-                        ParticleDomCfg
-                    >
-                    targetParIdCtx( -1 );
+                    memory::CtxArray<ParticleCreator, ParticleDomCfg> particleCreatorCtx{};
 
-                    oldFrameFillLvl = newFrameFillLvl;
+                    forEachParticle([&](uint32_t const linearIdx, uint32_t const idx) {
+                        // cell index within the superCell
+                        DataSpace<simDim> const cellIdx
+                            = DataSpaceOperations<simDim>::template map<SuperCellSize>(linearIdx);
 
-                    __syncthreads( );
+                        // cell offset with respect to the local domain origin (without any guarding cells)
+                        pmacc::math::Int<simDim> const localCellIndex = supercellCellOffset + cellIdx;
 
-                    /* < CHECK & ADD >
-                     * - if a thread wants to create target particles in each cycle it can do that only once
-                     * and before that it atomically adds to the shared counter and uses the current
-                     * value as targetParId in the new frame
-                     */
-                    forEachParticle(
-                        [&](
-                            uint32_t const linearIdx,
-                            uint32_t const idx
-                        )
-                        {
-                            if( numNewParticlesCtx[ idx ] > 0u )
-                                targetParIdCtx[ idx ] = nvidia::atomicAllInc(
-                                    acc,
-                                    &newFrameFillLvl,
-                                    ::alpaka::hierarchy::Threads{}
-                                );
-                        }
-                    );
+                        // create a copy of the functor for each virtual worker
+                        particleCreatorCtx[idx] = particleCreator;
 
-                    __syncthreads( );
+                        // init particle creator functor for each virtual worker
+                        particleCreatorCtx[idx].init(acc, blockCell, linearIdx, localCellIndex);
+                    });
 
-                    /* < EXIT? >
-                     * - if the counter hasn't changed all threads break out of the loop
+                    /* Declare counter in shared memory that will later tell the current fill level or
+                     * occupation of the newly created target frames.
                      */
-                    if( oldFrameFillLvl == newFrameFillLvl )
-                        break;
+                    PMACC_SMEM(acc, newFrameFillLvl, int);
 
-                    __syncthreads( );
+                    ForEachIdx<IdxConfig<2, numWorkers>> onlyMasters{workerIdx};
 
-                    /* < NEW FRAME >
-                     * - if there is no frame, yet, the master will create a new target particle frame
-                     * and attach it to the back of the frame list
-                     */
-                    onlyMasters(
-                        [&](
-                            uint32_t const linearIdx,
-                            uint32_t const
-                        )
-                        {
-                            uint32_t const numFramesNeeded = ( newFrameFillLvl + maxParticlesInFrame - 1u ) / maxParticlesInFrame;
-                            if( linearIdx < numFramesNeeded && !targetFrames[ linearIdx ].isValid( ) )
-                            {
-                                targetFrames[ linearIdx ] = targetBox.getEmptyFrame( );
-                                targetBox.setAsLastFrame(
-                                    acc,
-                                    targetFrames[ linearIdx ],
-                                    block
-                                );
-                            }
-                        }
-                    );
-
-                    __syncthreads( );
+                    // Declare local variable oldFrameFillLvl for each thread
+                    int oldFrameFillLvl;
 
-                    /* < CREATE >
-                     * - all target particles were created
-                     * - internal particle creation counter is decremented by 1
+                    /* Initialize local (register) counter for each thread
+                     * - describes how many new macro target particles should be created
                      */
-                    forEachParticle(
-                        [&](
-                            uint32_t const linearIdx,
-                            uint32_t const idx
-                        )
+                    memory::CtxArray<uint32_t, ParticleDomCfg> numNewParticlesCtx(0);
+
+                    // Master initializes the frame fill level with 0
+                    onlyMasters([&](uint32_t const linearIdx, uint32_t const) {
+                        if(linearIdx == 0)
+                            newFrameFillLvl = 0;
+                        targetFrames[linearIdx] = nullptr;
+                    });
+
+                    cupla::__syncthreads(acc);
+
+                    /* move over source species frames and call particleCreator
+                     * frames are worked on in backwards order to avoid asking if there is another frame
+                     * --> performance
+                     * Because all frames are completely filled except the last and apart from that last frame
+                     * one wants to make sure that all threads are working and every frame is worked on.
+                     */
+                    while(sourceFrame.isValid())
+                    {
+                        memory::CtxArray<bool, ParticleDomCfg> isParticleCtx(
+                            workerIdx,
+                            [&](uint32_t const linearIdx, uint32_t const) {
+                                return static_cast<bool>(sourceFrame[linearIdx][multiMask_]);
+                            });
+                        forEachParticle([&](uint32_t const linearIdx, uint32_t const idx) {
+                            bool const isParticle = static_cast<bool>(sourceFrame[linearIdx][multiMask_]);
+                            numNewParticlesCtx[idx] = 0u;
+                            if(isParticle)
+                                /* ask the particle creator functor how many new particles to create. */
+                                numNewParticlesCtx[idx]
+                                    = particleCreatorCtx[idx].numNewParticles(acc, *sourceFrame, linearIdx);
+                        });
+
+                        cupla::__syncthreads(acc);
+
+                        /* always true while-loop over all particles inside source frame until each thread breaks out
+                         * individually
+                         *
+                         * **Attention**: Speaking of 1st and 2nd frame only may seem odd.
+                         * The question might arise what happens if more target particles are created than would fit
+                         * into two frames. Well, multi-particle creation during a time step is accounted for. The
+                         * number of new target particles is determined inside the outer loop over the valid frames
+                         * while in the inner loop each thread can create only ONE new macro target particle. But the
+                         * loop repeats until each thread has created all the target particles needed in the time step.
+                         */
+                        while(true)
                         {
-                            uint32_t targetFrameIdx = 0;
-                            if( targetParIdCtx[ idx ] >= maxParticlesInFrame )
-                            {
-                                targetFrameIdx = 1;
-                                targetParIdCtx[ idx ] -= maxParticlesInFrame;
-                            }
-                            if( 0 <= targetParIdCtx[ idx ] )
-                            {
-                                // each virtual worker makes the attributes of its source particle accessible
-                                auto sourceParticle = sourceFrame[ linearIdx ];
-                                // each virtual worker initializes a target particle if one should be created
-                                auto targetParticle = targetFrames[ targetFrameIdx ][ targetParIdCtx[ idx ] ];
-
-                                // create a target particle in the new target particle frame:
-                                particleCreatorCtx[ idx ](
-                                    acc,
-                                    sourceParticle,
-                                    targetParticle
-                                );
-
-                                    numNewParticlesCtx[ idx ] -= 1;
+                            /* < INIT >
+                             * - targetParId is initialized as -1 (meaning: invalid)
+                             * - (local) oldFrameFillLvl set equal to (shared) newFrameFillLvl for each thread
+                             * --> each thread remembers the old "counter"
+                             */
+
+                            /* Declare local target particle ID
+                             * - describes at which position in the new frame the new target particle is to be created
+                             */
+                            memory::CtxArray<int, ParticleDomCfg> targetParIdCtx(-1);
+
+                            oldFrameFillLvl = newFrameFillLvl;
+
+                            cupla::__syncthreads(acc);
+
+                            /* < CHECK & ADD >
+                             * - if a thread wants to create target particles in each cycle it can do that only once
+                             * and before that it atomically adds to the shared counter and uses the current
+                             * value as targetParId in the new frame
+                             */
+                            forEachParticle([&](uint32_t const linearIdx, uint32_t const idx) {
+                                if(numNewParticlesCtx[idx] > 0u)
+                                    targetParIdCtx[idx]
+                                        = nvidia::atomicAllInc(acc, &newFrameFillLvl, ::alpaka::hierarchy::Threads{});
+                            });
+
+                            cupla::__syncthreads(acc);
+
+                            /* < EXIT? >
+                             * - if the counter hasn't changed all threads break out of the loop
+                             */
+                            if(oldFrameFillLvl == newFrameFillLvl)
+                                break;
+
+                            cupla::__syncthreads(acc);
+
+                            /* < NEW FRAME >
+                             * - if there is no frame, yet, the master will create a new target particle frame
+                             * and attach it to the back of the frame list
+                             */
+                            onlyMasters([&](uint32_t const linearIdx, uint32_t const) {
+                                uint32_t const numFramesNeeded
+                                    = (newFrameFillLvl + maxParticlesInFrame - 1u) / maxParticlesInFrame;
+                                if(linearIdx < numFramesNeeded && !targetFrames[linearIdx].isValid())
+                                {
+                                    targetFrames[linearIdx] = targetBox.getEmptyFrame(acc);
+                                    targetBox.setAsLastFrame(acc, targetFrames[linearIdx], block);
                                 }
-                        }
-                    );
-
-                    __syncthreads( );
+                            });
+
+                            cupla::__syncthreads(acc);
+
+                            /* < CREATE >
+                             * - all target particles were created
+                             * - internal particle creation counter is decremented by 1
+                             */
+                            forEachParticle([&](uint32_t const linearIdx, uint32_t const idx) {
+                                uint32_t targetFrameIdx = 0;
+                                if(targetParIdCtx[idx] >= maxParticlesInFrame)
+                                {
+                                    targetFrameIdx = 1;
+                                    targetParIdCtx[idx] -= maxParticlesInFrame;
+                                }
+                                if(0 <= targetParIdCtx[idx])
+                                {
+                                    // each virtual worker makes the attributes of its source particle accessible
+                                    auto sourceParticle = sourceFrame[linearIdx];
+                                    // each virtual worker initializes a target particle if one should be created
+                                    auto targetParticle = targetFrames[targetFrameIdx][targetParIdCtx[idx]];
 
-                    onlyMasters(
-                        [&](
-                            uint32_t const linearIdx,
-                            uint32_t const
-                        )
-                        {
-                            if( linearIdx == 0  && newFrameFillLvl >= maxParticlesInFrame )
-                            {
-                                 newFrameFillLvl -= maxParticlesInFrame;
-                                 // copy the not filled frame pointer to the beginning
-                                 targetFrames[ 0 ] = targetFrames[ 1 ];
-                                 // reset second frame
-                                 targetFrames[ 1 ] = nullptr;
-                            }
-                        }
-                    );
+                                    // create a target particle in the new target particle frame:
+                                    particleCreatorCtx[idx](acc, sourceParticle, targetParticle);
 
-                    __syncthreads( );
-                }
+                                    numNewParticlesCtx[idx] -= 1;
+                                }
+                            });
+
+                            cupla::__syncthreads(acc);
+
+                            onlyMasters([&](uint32_t const linearIdx, uint32_t const) {
+                                if(linearIdx == 0 && newFrameFillLvl >= maxParticlesInFrame)
+                                {
+                                    newFrameFillLvl -= maxParticlesInFrame;
+                                    // copy the not filled frame pointer to the beginning
+                                    targetFrames[0] = targetFrames[1];
+                                    // reset second frame
+                                    targetFrames[1] = nullptr;
+                                }
+                            });
 
-                __syncthreads( );
+                            cupla::__syncthreads(acc);
+                        }
 
-                sourceFrame = sourceBox.getPreviousFrame( sourceFrame );
+                        cupla::__syncthreads(acc);
 
+                        sourceFrame = sourceBox.getPreviousFrame(sourceFrame);
+                    }
+                }
+            };
+
+            /** Convenience function to create a `CreateParticlesKernel` instance
+             *
+             * @tparam T_numWorkers number of workers
+             *
+             * @param parBoxSource particle box of the source species
+             * @param parBoxTarget particle box of the target species
+             * @param particleCreator particle creation functor
+             * @param guardSuperCells number of guard cells per dimension
+             * @return new `CreateParticlesKernel` instance
+             */
+            template<
+                uint32_t T_numWorkers,
+                typename T_ParBoxSource,
+                typename T_ParBoxTarget,
+                typename T_ParticleCreator>
+            CreateParticlesKernel<T_numWorkers, T_ParBoxSource, T_ParBoxTarget, T_ParticleCreator>
+            make_CreateParticlesKernel(
+                T_ParBoxSource const& parBoxSource,
+                T_ParBoxTarget const& parBoxTarget,
+                T_ParticleCreator const& particleCreator,
+                DataSpace<simDim> const& guardSuperCells)
+            {
+                // name the fully specified kernel type once, then forward all arguments to its constructor
+                using Kernel
+                    = CreateParticlesKernel<T_numWorkers, T_ParBoxSource, T_ParBoxTarget, T_ParticleCreator>;
+
+                return Kernel(parBoxSource, parBoxTarget, particleCreator, guardSuperCells);
             }
-        }
-    };
-
-    /** Convenient function to create a `CreateParticlesKernel` instance
-     *
-     * @tparam T_numWorkers number of workers
-     *
-     * @param parBoxSource particle box of the source species
-     * @param parBoxTarget particle box of the target species
-     * @param particleCreator particle creation functor
-     * @param guardSuperCells number of guard cells per dimension
-     * @return new `CreateParticlesKernel` instance
-     */
-    template<
-        uint32_t T_numWorkers,
-        typename T_ParBoxSource,
-        typename T_ParBoxTarget,
-        typename T_ParticleCreator
-    >
-    CreateParticlesKernel<
-        T_numWorkers,
-        T_ParBoxSource,
-        T_ParBoxTarget,
-        T_ParticleCreator
-    >
-    make_CreateParticlesKernel(
-        T_ParBoxSource const & parBoxSource,
-        T_ParBoxTarget const & parBoxTarget,
-        T_ParticleCreator const & particleCreator,
-        DataSpace< simDim > const & guardSuperCells)
-    {
-        return CreateParticlesKernel<
-            T_numWorkers,
-            T_ParBoxSource,
-            T_ParBoxTarget,
-            T_ParticleCreator
-        >(
-            parBoxSource,
-            parBoxTarget,
-            particleCreator,
-            guardSuperCells
-        );
-    }
-
-} // namespace creation
-} // namespace particles
+
+        } // namespace creation
+    } // namespace particles
 } // namespace picongpu
diff --git a/include/picongpu/particles/densityProfiles/EveryNthCellImpl.def b/include/picongpu/particles/densityProfiles/EveryNthCellImpl.def
index ab3578f6fa..4d5c959b7c 100644
--- a/include/picongpu/particles/densityProfiles/EveryNthCellImpl.def
+++ b/include/picongpu/particles/densityProfiles/EveryNthCellImpl.def
@@ -1,4 +1,4 @@
-/* Copyright 2017-2020 Axel Huebl
+/* Copyright 2017-2021 Axel Huebl
  *
  * This file is part of PIConGPU.
  *
@@ -24,32 +24,29 @@
 
 namespace picongpu
 {
-namespace densityProfiles
-{
-    /** A density profile which only initializes each nth cell
-     *
-     * Useful to initialize probe particles or material dopings. The result is
-     * either 0 (no particle) or the full density. The result of this particular
-     * functor can be larger 1.0 with T_SkipCells::toRT().productOfComponents()
-     * in order to properly fulfill the density of a species via increased
-     * weighting.
-     *
-     * @tparam T_SkipCells The period for the number of cells to skip for each
-     *                     direction before initializing a particle. Signature
-     *                     of a pmacc::math::CT::UInt32
-     */
-    template<
-        typename T_SkipCells
-    >
-    struct EveryNthCellImpl
+    namespace densityProfiles
     {
-        // note: `sizeof(ANY_TYPE) != 0` defers the evaluation
-        PMACC_CASSERT_MSG_TYPE(
-            __Density_Profile_EveryNthCellImpl_expects_a_PMacc_math_CT_UInt32,
-            T_SkipCells,
-            false && sizeof( T_SkipCells ) != 0
-        );
-    };
+        /** A density profile which only initializes each nth cell
+         *
+         * Useful to initialize probe particles or material dopings. The result is
+         * either 0 (no particle) or the full density. The result of this particular
+         * functor can be larger than 1.0 with T_SkipCells::toRT().productOfComponents()
+         * in order to properly fulfill the density of a species via increased
+         * weighting.
+         *
+         * @tparam T_SkipCells The period for the number of cells to skip for each
+         *                     direction before initializing a particle. Signature
+         *                     of a pmacc::math::CT::UInt32
+         */
+        template<typename T_SkipCells>
+        struct EveryNthCellImpl
+        {
+            // condition is always false; the type-dependent `sizeof(T_SkipCells) != 0` only defers the evaluation
+            PMACC_CASSERT_MSG_TYPE(
+                __Density_Profile_EveryNthCellImpl_expects_a_PMacc_math_CT_UInt32,
+                T_SkipCells,
+                false && sizeof(T_SkipCells) != 0);
+        };
 
-} // namespace densityProfiles
+    } // namespace densityProfiles
 } // namespace picongpu
diff --git a/include/picongpu/particles/densityProfiles/EveryNthCellImpl.hpp b/include/picongpu/particles/densityProfiles/EveryNthCellImpl.hpp
index d86e83d8c1..5c558414ca 100644
--- a/include/picongpu/particles/densityProfiles/EveryNthCellImpl.hpp
+++ b/include/picongpu/particles/densityProfiles/EveryNthCellImpl.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2017-2020 Axel Huebl
+/* Copyright 2017-2021 Axel Huebl
  *
  * This file is part of PIConGPU.
  *
@@ -27,64 +27,48 @@
 
 namespace picongpu
 {
-namespace densityProfiles
-{
-    template<
-        uint32_t ... Args
-    >
-    struct EveryNthCellImpl<
-        pmacc::math::CT::UInt32<
-            Args ...
-        >
-    >
+    namespace densityProfiles
     {
-        using OrgSkipCells = pmacc::math::CT::UInt32< Args ... >;
-        using SkipCells = typename pmacc::math::CT::shrinkTo<
-            OrgSkipCells,
-            simDim
-        >::type;
-
-        template<typename T_SpeciesType>
-        struct apply
+        template<uint32_t... Args>
+        struct EveryNthCellImpl<pmacc::math::CT::UInt32<Args...>>
         {
-            using type = EveryNthCellImpl< OrgSkipCells >;
-        };
+            using OrgSkipCells = pmacc::math::CT::UInt32<Args...>;
+            using SkipCells = typename pmacc::math::CT::shrinkTo<OrgSkipCells, simDim>::type;
 
-        HINLINE
-        EveryNthCellImpl( uint32_t currentStep )
-        {
-        }
+            template<typename T_SpeciesType>
+            struct apply
+            {
+                using type = EveryNthCellImpl<OrgSkipCells>;
+            };
 
-        /** Calculate the normalized density
-         *
-         * @param totalCellOffset total offset including all slides [in cells]
-         */
-        HDINLINE float_X
-        operator()( DataSpace< simDim > const & totalCellOffset )
-        {
-            // modulo!
-            auto const isThisCellWithProbe( totalCellOffset % SkipCells::toRT() );
+            HINLINE
+            EveryNthCellImpl(uint32_t currentStep)
+            {
+            }
+
+            /** Calculate the normalized density
+             *
+             * @param totalCellOffset total offset including all slides [in cells]
+             */
+            HDINLINE float_X operator()(DataSpace<simDim> const& totalCellOffset)
+            {
+                // modulo!
+                auto const isThisCellWithProbe(totalCellOffset % SkipCells::toRT());
 
-            // is this cell populated with a probe particle?
-            bool const isPopulated(
-                isThisCellWithProbe == DataSpace< simDim >::create( 0 )
-            );
+                // is this cell populated with a probe particle?
+                bool const isPopulated(isThisCellWithProbe == DataSpace<simDim>::create(0));
 
-            /* every how many (volumentric) cells do we set a particle:
-             * scale up weighting accordingly */
-            float_X const weightingScaling(
-                precisionCast< float_X >(
-                    SkipCells::toRT().productOfComponents()
-                )
-            );
+                /* every how many (volumetric) cells do we set a particle:
+                 * scale up weighting accordingly */
+                float_X const weightingScaling(precisionCast<float_X>(SkipCells::toRT().productOfComponents()));
 
-            // fill only the selected cells
-            float_X result( 0.0 );
-            if( isPopulated )
-                result = weightingScaling;
+                // fill only the selected cells
+                float_X result(0.0);
+                if(isPopulated)
+                    result = weightingScaling;
 
-            return result;
-        }
-    };
-} // namespace densityProfiles
+                return result;
+            }
+        };
+    } // namespace densityProfiles
 } // namespace picongpu
diff --git a/include/picongpu/particles/densityProfiles/FreeFormulaImpl.def b/include/picongpu/particles/densityProfiles/FreeFormulaImpl.def
index 63a7ed5569..decdfd822c 100644
--- a/include/picongpu/particles/densityProfiles/FreeFormulaImpl.def
+++ b/include/picongpu/particles/densityProfiles/FreeFormulaImpl.def
@@ -1,4 +1,4 @@
-/* Copyright 2014-2020 Rene Widera
+/* Copyright 2014-2021 Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -22,9 +22,9 @@
 
 namespace picongpu
 {
-namespace densityProfiles
-{
-    template<typename T_ParamClass>
-    struct FreeFormulaImpl;
-}
-}
+    namespace densityProfiles
+    {
+        template<typename T_ParamClass>
+        struct FreeFormulaImpl;
+    }
+} // namespace picongpu
diff --git a/include/picongpu/particles/densityProfiles/FreeFormulaImpl.hpp b/include/picongpu/particles/densityProfiles/FreeFormulaImpl.hpp
index 40b7ebd112..6f1bf20022 100644
--- a/include/picongpu/particles/densityProfiles/FreeFormulaImpl.hpp
+++ b/include/picongpu/particles/densityProfiles/FreeFormulaImpl.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2015-2020 Rene Widera, Richard Pausch, Axel Huebl
+/* Copyright 2015-2021 Rene Widera, Richard Pausch, Axel Huebl
  *
  * This file is part of PIConGPU.
  *
@@ -26,38 +26,38 @@
 
 namespace picongpu
 {
-namespace densityProfiles
-{
-    template< typename T_UserFunctor >
-    struct FreeFormulaImpl : public particles::functor::User< T_UserFunctor >
+    namespace densityProfiles
     {
-        using UserFunctor = particles::functor::User< T_UserFunctor >;
-
-        template< typename T_SpeciesType >
-        struct apply
+        template<typename T_UserFunctor>
+        struct FreeFormulaImpl : public particles::functor::User<T_UserFunctor>
         {
-            using type = FreeFormulaImpl< UserFunctor >;
+            using UserFunctor = particles::functor::User<T_UserFunctor>;
+
+            template<typename T_SpeciesType>
+            struct apply
+            {
+                using type = FreeFormulaImpl<UserFunctor>;
+            };
+
+            HINLINE FreeFormulaImpl(uint32_t currentStep) : UserFunctor(currentStep)
+            {
+            }
+
+            /** Calculate the normalized density at the center of a cell
+             *
+             * @param totalCellOffset total offset including all slides [in cells]
+             */
+            HDINLINE float_X operator()(DataSpace<simDim> const& totalCellOffset)
+            {
+                float_64 const unitLength(UNIT_LENGTH); // workaround to use UNIT_LENGTH on device
+                float3_64 const cellSize_SI(precisionCast<float_64>(cellSize) * unitLength);
+                // shift by half a cell: evaluate at cell center for a more accurate estimate for the cell
+                floatD_64 const totalCenterCellOffset
+                    = precisionCast<float_64>(totalCellOffset) + floatD_64::create(0.5);
+                floatD_64 const position_SI(totalCenterCellOffset * cellSize_SI.shrink<simDim>());
+
+                return UserFunctor::operator()(position_SI, cellSize_SI); // delegate to the user functor
+            }
         };
-
-        HINLINE FreeFormulaImpl( uint32_t currentStep ) : UserFunctor( currentStep )
-        {
-        }
-
-        /** Calculate the normalized density
-         *
-         * @param totalCellOffset total offset including all slides [in cells]
-         */
-        HDINLINE float_X operator()( DataSpace< simDim > const & totalCellOffset )
-        {
-            float_64 const unitLength( UNIT_LENGTH ); // workaround to use UNIT_LENGTH on device
-            float3_64 const cellSize_SI( precisionCast< float_64 >( cellSize ) * unitLength );
-            floatD_64 const position_SI( precisionCast< float_64 >( totalCellOffset ) * cellSize_SI.shrink<simDim>( ) );
-
-            return UserFunctor::operator()(
-                position_SI,
-                cellSize_SI
-            );
-        }
-    };
-} // namespace densityProfiles
+    } // namespace densityProfiles
 } // namespace picongpu
diff --git a/include/picongpu/particles/densityProfiles/FromHDF5Impl.def b/include/picongpu/particles/densityProfiles/FromHDF5Impl.def
index b1b9801187..07888ee2b0 100644
--- a/include/picongpu/particles/densityProfiles/FromHDF5Impl.def
+++ b/include/picongpu/particles/densityProfiles/FromHDF5Impl.def
@@ -1,4 +1,4 @@
-/* Copyright 2014-2020 Rene Widera
+/* Copyright 2014-2021 Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -22,9 +22,9 @@
 
 namespace picongpu
 {
-namespace densityProfiles
-{
-    template<typename T_ParamClass>
-    struct FromHDF5Impl;
-}
-}
+    namespace densityProfiles
+    {
+        template<typename T_ParamClass>
+        struct FromHDF5Impl;
+    }
+} // namespace picongpu
diff --git a/include/picongpu/particles/densityProfiles/FromHDF5Impl.hpp b/include/picongpu/particles/densityProfiles/FromHDF5Impl.hpp
index 9795c33c32..eac4936874 100644
--- a/include/picongpu/particles/densityProfiles/FromHDF5Impl.hpp
+++ b/include/picongpu/particles/densityProfiles/FromHDF5Impl.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Felix Schmitt, Rene Widera
+/* Copyright 2013-2021 Felix Schmitt, Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -33,211 +33,206 @@
 
 namespace picongpu
 {
-namespace densityProfiles
-{
-
-template<typename T_ParamClass>
-struct FromHDF5Impl : public T_ParamClass
-{
-    using ParamClass = T_ParamClass;
-
-    template<typename T_SpeciesType>
-    struct apply
-    {
-        using type = FromHDF5Impl<ParamClass>;
-    };
-
-    HINLINE FromHDF5Impl(uint32_t currentStep)
-    {
-        const uint32_t numSlides = MovingWindow::getInstance( ).getSlideCounter( currentStep );
-        auto window = MovingWindow::getInstance().getWindow(currentStep);
-        loadHDF5(window);
-        const SubGrid<simDim>& subGrid = Environment<simDim>::get().SubGrid();
-        DataSpace<simDim> localCells = subGrid.getLocalDomain( ).size;
-        totalGpuOffset = subGrid.getLocalDomain( ).offset;
-        totalGpuOffset.y( ) += numSlides * localCells.y( );
-    }
-
-    /** Calculate the normalized density from HDF5 file
-     *
-     * @param totalCellOffset total offset including all slides [in cells]
-     */
-    HDINLINE float_X operator()(const DataSpace<simDim>& totalCellOffset)
-    {
-        const DataSpace<simDim> localCellIdx(totalCellOffset - totalGpuOffset);
-        return precisionCast<float_X>(deviceDataBox(localCellIdx + SuperCellSize::toRT() * GuardSize::toRT()).x());
-    }
-
-private:
-
-    void loadHDF5(Window &window)
+    namespace densityProfiles
     {
-        using namespace splash;
-        DataConnector &dc = Environment<>::get().DataConnector();
-
-        PMACC_CASSERT_MSG(
-            _please_allocate_at_least_one_FieldTmp_in_memory_param,
-            fieldTmpNumSlots > 0
-        );
-        auto fieldTmp = dc.get< FieldTmp >( FieldTmp::getUniqueId( 0 ), true );
-        auto& fieldBuffer = fieldTmp->getGridBuffer();
-
-        deviceDataBox = fieldBuffer.getDeviceBuffer().getDataBox();
-
-        GridController<simDim> &gc = Environment<simDim>::get().GridController();
-        const pmacc::Selection<simDim>& localDomain = Environment<simDim>::get().SubGrid().getLocalDomain();
-        const uint32_t numSlides = MovingWindow::getInstance().getSlideCounter(0);
-        const uint32_t maxOpenFilesPerNode = 1;
-
-        /* get a new ParallelDomainCollector for our MPI rank only*/
-        ParallelDomainCollector pdc(
-                                    MPI_COMM_SELF,
-                                    gc.getCommunicator().getMPIInfo(),
-                                    Dimensions(1, 1, 1),
-                                    maxOpenFilesPerNode);
-
-        try
+        template<typename T_ParamClass>
+        struct FromHDF5Impl : public T_ParamClass
         {
-            /* setup ParallelDomainCollector pdc to read the density information from hdf5 */
-            DataCollector::FileCreationAttr attr;
-            DataCollector::initFileCreationAttr(attr);
-            attr.fileAccType = DataCollector::FAT_READ;
+            using ParamClass = T_ParamClass;
 
-            pdc.open(ParamClass::filename, attr);
-
-            /* set which part of the hdf5 file our MPI rank reads */
-            DataSpace<simDim> globalSlideOffset;
-            globalSlideOffset.y() = numSlides * localDomain.size.y();
-
-            Dimensions domainOffset(0, 0, 0);
-            for (uint32_t d = 0; d < simDim; ++d)
-                domainOffset[d] = localDomain.offset[d] + globalSlideOffset[d];
-
-            if (gc.getPosition().y() == 0)
-                domainOffset[1] += window.globalDimensions.offset.y();
-
-            DataSpace<simDim> localDomainSize = localDomain.size;
-            Dimensions domainSize(1, 1, 1);
-            for (uint32_t d = 0; d < simDim; ++d)
-                domainSize[d] = localDomainSize[d];
-
-            /* clear host buffer with default value */
-            fieldBuffer.getHostBuffer().setValue(float1_X(ParamClass::defaultDensity));
-
-            /* get dimensions and offsets (collective call) */
-            Domain fileDomain = pdc.getGlobalDomain(ParamClass::iteration, ParamClass::datasetName);
-            Dimensions fileDomainEnd = fileDomain.getOffset() + fileDomain.getSize();
-            DataSpace<simDim> accessSpace;
-            DataSpace<simDim> accessOffset;
-
-            Dimensions fileAccessSpace(1, 1, 1);
-            Dimensions fileAccessOffset(0, 0, 0);
-
-            /* For each dimension, compute how file domain and local simulation domain overlap
-             * and which sizes and offsets are required for loading data from the file.
-             **/
-            for (uint32_t d = 0; d < simDim; ++d)
+            template<typename T_SpeciesType>
+            struct apply
             {
-                /* file domain in/in-after sim domain */
-                if (fileDomain.getOffset()[d] >= domainOffset[d] &&
-                    fileDomain.getOffset()[d] <= domainOffset[d] + domainSize[d])
-                {
-                    accessSpace[d] = std::min(domainOffset[d] + domainSize[d] - fileDomain.getOffset()[d],
-                                              fileDomain.getSize()[d]);
-                    fileAccessSpace[d] = accessSpace[d];
+                using type = FromHDF5Impl<ParamClass>;
+            };
 
-                    accessOffset[d] = fileDomain.getOffset()[d] - domainOffset[d];
-                    fileAccessOffset[d] = 0;
-                    continue;
-                }
+            HINLINE FromHDF5Impl(uint32_t currentStep)
+            {
+                const uint32_t numSlides = MovingWindow::getInstance().getSlideCounter(currentStep);
+                auto window = MovingWindow::getInstance().getWindow(currentStep);
+                loadHDF5(window);
+                const SubGrid<simDim>& subGrid = Environment<simDim>::get().SubGrid();
+                DataSpace<simDim> localCells = subGrid.getLocalDomain().size;
+                totalGpuOffset = subGrid.getLocalDomain().offset;
+                totalGpuOffset.y() += numSlides * localCells.y();
+            }
 
-                /* file domain before-in sim domain */
-                if (fileDomainEnd[d] >= domainOffset[d] &&
-                    fileDomainEnd[d] <= domainOffset[d] + domainSize[d])
-                {
-                    accessSpace[d] = fileDomainEnd[d] - domainOffset[d];
-                    fileAccessSpace[d] = accessSpace[d];
+            /** Calculate the normalized density from HDF5 file
+             *
+             * @param totalCellOffset total offset including all slides [in cells]
+             */
+            HDINLINE float_X operator()(const DataSpace<simDim>& totalCellOffset)
+            {
+                const DataSpace<simDim> localCellIdx(totalCellOffset - totalGpuOffset);
+                return precisionCast<float_X>(
+                    deviceDataBox(localCellIdx + SuperCellSize::toRT() * GuardSize::toRT()).x());
+            }
 
-                    accessOffset[d] = 0;
-                    fileAccessOffset[d] = domainOffset[d] - fileDomain.getOffset()[d];
-                    continue;
-                }
+        private:
+            void loadHDF5(Window& window)
+            {
+                using namespace splash;
+                DataConnector& dc = Environment<>::get().DataConnector();
 
-                /* sim domain in file domain */
-                if (domainOffset[d] >= fileDomain.getOffset()[d] &&
-                    domainOffset[d] + domainSize[d] <= fileDomainEnd[d])
-                {
-                    accessSpace[d] = domainSize[d];
-                    fileAccessSpace[d] = accessSpace[d];
+                PMACC_CASSERT_MSG(_please_allocate_at_least_one_FieldTmp_in_memory_param, fieldTmpNumSlots > 0);
+                auto fieldTmp = dc.get<FieldTmp>(FieldTmp::getUniqueId(0), true);
+                auto& fieldBuffer = fieldTmp->getGridBuffer();
 
-                    accessOffset[d] = 0;
-                    fileAccessOffset[d] = domainOffset[d] - fileDomain.getOffset()[d];
-                    continue;
-                }
+                deviceDataBox = fieldBuffer.getDeviceBuffer().getDataBox();
 
-                /* file domain and sim domain do not intersect, do not load anything */
-                accessSpace[d] = 0;
-                break;
-            }
+                GridController<simDim>& gc = Environment<simDim>::get().GridController();
+                const pmacc::Selection<simDim> localDomain = Environment<simDim>::get().SubGrid().getLocalDomain();
+                const uint32_t numSlides = MovingWindow::getInstance().getSlideCounter(0);
+                const uint32_t maxOpenFilesPerNode = 1;
 
-            /* allocate temporary buffer for hdf5 data */
-            using ValueType = typename FieldTmp::ValueType::type;
-            ValueType *tmpBfr = nullptr;
+                /* get a new ParallelDomainCollector for our MPI rank only*/
+                ParallelDomainCollector pdc(
+                    MPI_COMM_SELF,
+                    gc.getCommunicator().getMPIInfo(),
+                    Dimensions(1, 1, 1),
+                    maxOpenFilesPerNode);
 
-            size_t accessSize = accessSpace.productOfComponents();
-            if (accessSize > 0)
-            {
-                tmpBfr = new ValueType[accessSize];
-
-                Dimensions sizeRead(0, 0, 0);
-                pdc.read(
-                         ParamClass::iteration,
-                         fileAccessSpace,
-                         fileAccessOffset,
-                         ParamClass::datasetName,
-                         sizeRead,
-                         tmpBfr);
-
-                if (sizeRead.getScalarSize() != accessSize)
+                try
                 {
-                    __delete(tmpBfr);
-                    return;
+                    /* setup ParallelDomainCollector pdc to read the density information from hdf5 */
+                    DataCollector::FileCreationAttr attr;
+                    DataCollector::initFileCreationAttr(attr);
+                    attr.fileAccType = DataCollector::FAT_READ;
+
+                    pdc.open(ParamClass::filename, attr);
+
+                    /* set which part of the hdf5 file our MPI rank reads */
+                    DataSpace<simDim> globalSlideOffset;
+                    globalSlideOffset.y() = numSlides * localDomain.size.y();
+
+                    Dimensions domainOffset(0, 0, 0);
+                    for(uint32_t d = 0; d < simDim; ++d)
+                        domainOffset[d] = localDomain.offset[d] + globalSlideOffset[d];
+
+                    if(gc.getPosition().y() == 0)
+                        domainOffset[1] += window.globalDimensions.offset.y();
+
+                    DataSpace<simDim> localDomainSize = localDomain.size;
+                    Dimensions domainSize(1, 1, 1);
+                    for(uint32_t d = 0; d < simDim; ++d)
+                        domainSize[d] = localDomainSize[d];
+
+                    /* clear host buffer with default value */
+                    fieldBuffer.getHostBuffer().setValue(float1_X(ParamClass::defaultDensity));
+
+                    /* get dimensions and offsets (collective call) */
+                    Domain fileDomain = pdc.getGlobalDomain(ParamClass::iteration, ParamClass::datasetName);
+                    Dimensions fileDomainEnd = fileDomain.getOffset() + fileDomain.getSize();
+                    DataSpace<simDim> accessSpace;
+                    DataSpace<simDim> accessOffset;
+
+                    Dimensions fileAccessSpace(1, 1, 1);
+                    Dimensions fileAccessOffset(0, 0, 0);
+
+                    /* For each dimension, compute how file domain and local simulation domain overlap
+                     * and which sizes and offsets are required for loading data from the file.
+                     **/
+                    for(uint32_t d = 0; d < simDim; ++d)
+                    {
+                        /* file domain in/in-after sim domain */
+                        if(fileDomain.getOffset()[d] >= domainOffset[d]
+                           && fileDomain.getOffset()[d] <= domainOffset[d] + domainSize[d])
+                        {
+                            accessSpace[d] = std::min(
+                                domainOffset[d] + domainSize[d] - fileDomain.getOffset()[d],
+                                fileDomain.getSize()[d]);
+                            fileAccessSpace[d] = accessSpace[d];
+
+                            accessOffset[d] = fileDomain.getOffset()[d] - domainOffset[d];
+                            fileAccessOffset[d] = 0;
+                            continue;
+                        }
+
+                        /* file domain before-in sim domain */
+                        if(fileDomainEnd[d] >= domainOffset[d] && fileDomainEnd[d] <= domainOffset[d] + domainSize[d])
+                        {
+                            accessSpace[d] = fileDomainEnd[d] - domainOffset[d];
+                            fileAccessSpace[d] = accessSpace[d];
+
+                            accessOffset[d] = 0;
+                            fileAccessOffset[d] = domainOffset[d] - fileDomain.getOffset()[d];
+                            continue;
+                        }
+
+                        /* sim domain in file domain */
+                        if(domainOffset[d] >= fileDomain.getOffset()[d]
+                           && domainOffset[d] + domainSize[d] <= fileDomainEnd[d])
+                        {
+                            accessSpace[d] = domainSize[d];
+                            fileAccessSpace[d] = accessSpace[d];
+
+                            accessOffset[d] = 0;
+                            fileAccessOffset[d] = domainOffset[d] - fileDomain.getOffset()[d];
+                            continue;
+                        }
+
+                        /* file domain and sim domain do not intersect, do not load anything */
+                        accessSpace[d] = 0;
+                        break;
+                    }
+
+                    /* allocate temporary buffer for hdf5 data (only if there is something to read) */
+                    using ValueType = typename FieldTmp::ValueType::type;
+                    ValueType* tmpBfr = nullptr;
+
+                    size_t accessSize = accessSpace.productOfComponents();
+                    if(accessSize > 0)
+                    {
+                        tmpBfr = new ValueType[accessSize];
+
+                        Dimensions sizeRead(0, 0, 0);
+                        pdc.read(
+                            ParamClass::iteration,
+                            fileAccessSpace,
+                            fileAccessOffset,
+                            ParamClass::datasetName,
+                            sizeRead,
+                            tmpBfr);
+
+                        if(sizeRead.getScalarSize() != accessSize)
+                        {
+                            delete[] tmpBfr; // short read: release with the array form, tmpBfr comes from new[]
+                            return;
+                        }
+
+                        /* get the databox of the host buffer */
+                        auto dataBox = fieldBuffer.getHostBuffer().getDataBox();
+                        /* get a 1D access object to the databox */
+                        using D1Box = DataBoxDim1Access<typename FieldTmp::DataBoxType>;
+                        DataSpace<simDim> guards = fieldBuffer.getGridLayout().getGuard();
+                        D1Box d1RAccess(dataBox.shift(guards + accessOffset), accessSpace);
+
+                        /* copy from temporary buffer to fieldTmp host buffer */
+                        for(int i = 0; i < accessSpace.productOfComponents(); ++i)
+                        {
+                            d1RAccess[i].x() = tmpBfr[i];
+                        }
+
+                        delete[] tmpBfr; // array form: tmpBfr was allocated with new[] and is not used afterwards
+                    }
+
+                    pdc.close();
+
+                    /* copy host data to the device */
+                    fieldBuffer.hostToDevice();
+                    __getTransactionEvent().waitForFinished();
                 }
-
-                /* get the databox of the host buffer */
-                auto dataBox = fieldBuffer.getHostBuffer().getDataBox();
-                /* get a 1D access object to the databox */
-                using D1Box = DataBoxDim1Access< typename FieldTmp::DataBoxType >;
-                DataSpace<simDim> guards = fieldBuffer.getGridLayout().getGuard();
-                D1Box d1RAccess(dataBox.shift(guards + accessOffset), accessSpace);
-
-                /* copy from temporary buffer to fieldTmp host buffer */
-                for (int i = 0; i < accessSpace.productOfComponents(); ++i)
+                catch(const DCException& e)
                 {
-                    d1RAccess[i].x() = tmpBfr[i];
+                    std::cerr << e.what() << std::endl;
+                    return;
                 }
 
-                __delete(tmpBfr);
+                return;
             }
 
-            pdc.close();
-
-            /* copy host data to the device */
-            fieldBuffer.hostToDevice();
-            __getTransactionEvent().waitForFinished();
-
-        }
-        catch (const DCException& e)
-        {
-            std::cerr << e.what() << std::endl;
-            return;
-        }
-
-        return;
-    }
-
-    PMACC_ALIGN(deviceDataBox,FieldTmp::DataBoxType);
-    PMACC_ALIGN(totalGpuOffset,DataSpace<simDim>);
-};
-}
-}
+            PMACC_ALIGN(deviceDataBox, FieldTmp::DataBoxType);
+            PMACC_ALIGN(totalGpuOffset, DataSpace<simDim>);
+        };
+    } // namespace densityProfiles
+} // namespace picongpu
diff --git a/include/picongpu/particles/densityProfiles/GaussianCloudImpl.def b/include/picongpu/particles/densityProfiles/GaussianCloudImpl.def
index d03178f90e..822af5e1fe 100644
--- a/include/picongpu/particles/densityProfiles/GaussianCloudImpl.def
+++ b/include/picongpu/particles/densityProfiles/GaussianCloudImpl.def
@@ -1,4 +1,4 @@
-/* Copyright 2014-2020 Rene Widera
+/* Copyright 2014-2021 Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -22,9 +22,9 @@
 
 namespace picongpu
 {
-namespace densityProfiles
-{
-    template<typename T_ParamClass>
-    struct GaussianCloudImpl;
-}
-}
+    namespace densityProfiles
+    {
+        template<typename T_ParamClass>
+        struct GaussianCloudImpl;
+    }
+} // namespace picongpu
diff --git a/include/picongpu/particles/densityProfiles/GaussianCloudImpl.hpp b/include/picongpu/particles/densityProfiles/GaussianCloudImpl.hpp
index 591879015a..9bbfde77ab 100644
--- a/include/picongpu/particles/densityProfiles/GaussianCloudImpl.hpp
+++ b/include/picongpu/particles/densityProfiles/GaussianCloudImpl.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Heiko Burau, Rene Widera, Felix Schmitt
+/* Copyright 2013-2021 Axel Huebl, Heiko Burau, Rene Widera, Felix Schmitt
  *
  * This file is part of PIConGPU.
  *
@@ -25,54 +25,51 @@
 
 namespace picongpu
 {
-namespace densityProfiles
-{
-
-template<typename T_ParamClass>
-struct GaussianCloudImpl : public T_ParamClass
-{
-    using ParamClass = T_ParamClass;
-
-    template<typename T_SpeciesType>
-    struct apply
+    namespace densityProfiles
     {
-        using type = GaussianCloudImpl<ParamClass>;
-    };
+        template<typename T_ParamClass>
+        struct GaussianCloudImpl : public T_ParamClass
+        {
+            using ParamClass = T_ParamClass;
 
-    HINLINE GaussianCloudImpl(uint32_t currentStep)
-    {
-    }
+            template<typename T_SpeciesType>
+            struct apply
+            {
+                using type = GaussianCloudImpl<ParamClass>;
+            };
 
-    /** Calculate the normalized density
-     *
-     * @param totalCellOffset total offset including all slides [in cells]
-     */
-    HDINLINE float_X operator()(const DataSpace<simDim>& totalCellOffset)
-    {
-        const float_64 unit_length = UNIT_LENGTH;
-        const float_X vacuum_y = float_X(ParamClass::vacuumCellsY) * cellSize.y();
-        const floatD_X center = precisionCast<float_X>(ParamClass::center_SI / unit_length);
-        const floatD_X sigma = precisionCast<float_X>(ParamClass::sigma_SI / unit_length);
+            HINLINE GaussianCloudImpl(uint32_t currentStep)
+            {
+            }
+
+            /** Calculate the normalized density
+             *
+             * @param totalCellOffset total offset including all slides [in cells]
+             */
+            HDINLINE float_X operator()(const DataSpace<simDim>& totalCellOffset)
+            {
+                const float_64 unit_length = UNIT_LENGTH;
+                const float_X vacuum_y = float_X(ParamClass::vacuumCellsY) * cellSize.y();
+                const floatD_X center = precisionCast<float_X>(ParamClass::center_SI / unit_length);
+                const floatD_X sigma = precisionCast<float_X>(ParamClass::sigma_SI / unit_length);
 
-        const floatD_X globalCellPos(
-                                     precisionCast<float_X>(totalCellOffset) *
-                                     cellSize.shrink<simDim>()
-                                     );
+                const floatD_X globalCellPos(precisionCast<float_X>(totalCellOffset) * cellSize.shrink<simDim>());
 
-        if (globalCellPos.y() < vacuum_y) return float_X(0.0);
+                if(globalCellPos.y() < vacuum_y)
+                    return float_X(0.0);
 
-        /* for x, y, z calculate: x-x0 / sigma_x */
-        const floatD_X r0overSigma = (globalCellPos - center) / sigma;
-        /* get lenghts of r0 over sigma */
-        const float_X exponent = math::abs(r0overSigma);
+                /* for x, y, z calculate: x-x0 / sigma_x */
+                const floatD_X r0overSigma = (globalCellPos - center) / sigma;
+                /* get lenghts of r0 over sigma */
+                const float_X exponent = math::abs(r0overSigma);
 
-        /* calculate exp(factor * exponent**power) */
-        const float_X power  = ParamClass::gasPower;
-        const float_X factor = ParamClass::gasFactor;
-        const float_X density = math::exp(factor * math::pow(exponent, power));
+                /* calculate exp(factor * exponent**power) */
+                const float_X power = ParamClass::gasPower;
+                const float_X factor = ParamClass::gasFactor;
+                const float_X density = math::exp(factor * math::pow(exponent, power));
 
-        return density;
-    }
-};
-}
-}
+                return density;
+            }
+        };
+    } // namespace densityProfiles
+} // namespace picongpu
diff --git a/include/picongpu/particles/densityProfiles/GaussianImpl.def b/include/picongpu/particles/densityProfiles/GaussianImpl.def
index 4e9f340828..eea88488e8 100644
--- a/include/picongpu/particles/densityProfiles/GaussianImpl.def
+++ b/include/picongpu/particles/densityProfiles/GaussianImpl.def
@@ -1,4 +1,4 @@
-/* Copyright 2014-2020 Rene Widera
+/* Copyright 2014-2021 Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -22,9 +22,9 @@
 
 namespace picongpu
 {
-namespace densityProfiles
-{
-    template<typename T_ParamClass>
-    struct GaussianImpl;
-}
-}
+    namespace densityProfiles
+    {
+        template<typename T_ParamClass>
+        struct GaussianImpl;
+    }
+} // namespace picongpu
diff --git a/include/picongpu/particles/densityProfiles/GaussianImpl.hpp b/include/picongpu/particles/densityProfiles/GaussianImpl.hpp
index ddbdc141ef..c9c6799b4e 100644
--- a/include/picongpu/particles/densityProfiles/GaussianImpl.hpp
+++ b/include/picongpu/particles/densityProfiles/GaussianImpl.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Heiko Burau, Rene Widera, Felix Schmitt
+/* Copyright 2013-2021 Axel Huebl, Heiko Burau, Rene Widera, Felix Schmitt
  *
  * This file is part of PIConGPU.
  *
@@ -25,60 +25,56 @@
 
 namespace picongpu
 {
-namespace densityProfiles
-{
-
-template<typename T_ParamClass>
-struct GaussianImpl : public T_ParamClass
-{
-    using ParamClass = T_ParamClass;
-
-    template<typename T_SpeciesType>
-    struct apply
+    namespace densityProfiles
     {
-        using type = GaussianImpl<ParamClass>;
-    };
+        template<typename T_ParamClass>
+        struct GaussianImpl : public T_ParamClass
+        {
+            using ParamClass = T_ParamClass;
 
-    HINLINE GaussianImpl(uint32_t currentStep)
-    {
-    }
+            template<typename T_SpeciesType>
+            struct apply
+            {
+                using type = GaussianImpl<ParamClass>;
+            };
 
-    /** Calculate the normalized density
-     *
-     * @param totalCellOffset total offset including all slides [in cells]
-     */
-    HDINLINE float_X operator()(const DataSpace<simDim>& totalCellOffset)
-    {
-        const float_X vacuum_y = float_X(ParamClass::vacuumCellsY) * cellSize.y();
-        const float_X gas_center_left = ParamClass::gasCenterLeft_SI / UNIT_LENGTH;
-        const float_X gas_center_right = ParamClass::gasCenterRight_SI / UNIT_LENGTH;
-        const float_X gas_sigma_left = ParamClass::gasSigmaLeft_SI / UNIT_LENGTH;
-        const float_X gas_sigma_right = ParamClass::gasSigmaRight_SI / UNIT_LENGTH;
+            HINLINE GaussianImpl(uint32_t currentStep)
+            {
+            }
 
-        const floatD_X globalCellPos(
-                                     precisionCast<float_X>(totalCellOffset) *
-                                     cellSize.shrink<simDim>()
-                                     );
+            /** Calculate the normalized density
+             *
+             * @param totalCellOffset total offset including all slides [in cells]
+             */
+            HDINLINE float_X operator()(const DataSpace<simDim>& totalCellOffset)
+            {
+                const float_X vacuum_y = float_X(ParamClass::vacuumCellsY) * cellSize.y();
+                const float_X gas_center_left = ParamClass::gasCenterLeft_SI / UNIT_LENGTH;
+                const float_X gas_center_right = ParamClass::gasCenterRight_SI / UNIT_LENGTH;
+                const float_X gas_sigma_left = ParamClass::gasSigmaLeft_SI / UNIT_LENGTH;
+                const float_X gas_sigma_right = ParamClass::gasSigmaRight_SI / UNIT_LENGTH;
 
-        if (globalCellPos.y() * cellSize.y() < vacuum_y)
-        {
-            return float_X(0.0);
-        }
+                const floatD_X globalCellPos(precisionCast<float_X>(totalCellOffset) * cellSize.shrink<simDim>());
 
-        float_X exponent = float_X(0.0);
-        if (globalCellPos.y() < gas_center_left)
-        {
-            exponent = math::abs((globalCellPos.y() - gas_center_left) / gas_sigma_left);
-        }
-        else if (globalCellPos.y() >= gas_center_right)
-        {
-            exponent = math::abs((globalCellPos.y() - gas_center_right) / gas_sigma_right);
-        }
+                if(globalCellPos.y() * cellSize.y() < vacuum_y)
+                {
+                    return float_X(0.0);
+                }
+
+                float_X exponent = float_X(0.0);
+                if(globalCellPos.y() < gas_center_left)
+                {
+                    exponent = math::abs((globalCellPos.y() - gas_center_left) / gas_sigma_left);
+                }
+                else if(globalCellPos.y() >= gas_center_right)
+                {
+                    exponent = math::abs((globalCellPos.y() - gas_center_right) / gas_sigma_right);
+                }
 
-        const float_X gas_power = ParamClass::gasPower;
-        const float_X density = math::exp(float_X(ParamClass::gasFactor) * math::pow(exponent, gas_power));
-        return density;
-    }
-};
-}
-}
+                const float_X gas_power = ParamClass::gasPower;
+                const float_X density = math::exp(float_X(ParamClass::gasFactor) * math::pow(exponent, gas_power));
+                return density;
+            }
+        };
+    } // namespace densityProfiles
+} // namespace picongpu
diff --git a/include/picongpu/particles/densityProfiles/HomogenousImpl.def b/include/picongpu/particles/densityProfiles/HomogenousImpl.def
index 4373124129..266a6d4609 100644
--- a/include/picongpu/particles/densityProfiles/HomogenousImpl.def
+++ b/include/picongpu/particles/densityProfiles/HomogenousImpl.def
@@ -1,4 +1,4 @@
-/* Copyright 2014-2020 Rene Widera
+/* Copyright 2014-2021 Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -22,8 +22,8 @@
 
 namespace picongpu
 {
-namespace densityProfiles
-{
-    struct HomogenousImpl;
-}
-}
+    namespace densityProfiles
+    {
+        struct HomogenousImpl;
+    }
+} // namespace picongpu
diff --git a/include/picongpu/particles/densityProfiles/HomogenousImpl.hpp b/include/picongpu/particles/densityProfiles/HomogenousImpl.hpp
index 58d86cca33..cdfdc95db0 100644
--- a/include/picongpu/particles/densityProfiles/HomogenousImpl.hpp
+++ b/include/picongpu/particles/densityProfiles/HomogenousImpl.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Heiko Burau, Rene Widera, Felix Schmitt
+/* Copyright 2013-2021 Axel Huebl, Heiko Burau, Rene Widera, Felix Schmitt
  *
  * This file is part of PIConGPU.
  *
@@ -24,30 +24,29 @@
 
 namespace picongpu
 {
-namespace densityProfiles
-{
-
-struct HomogenousImpl
-{
-    template<typename T_SpeciesType>
-    struct apply
+    namespace densityProfiles
     {
-        using type = HomogenousImpl;
-    };
+        struct HomogenousImpl
+        {
+            template<typename T_SpeciesType>
+            struct apply
+            {
+                using type = HomogenousImpl;
+            };
 
-    HINLINE HomogenousImpl(uint32_t currentStep)
-    {
-    }
+            HINLINE HomogenousImpl(uint32_t currentStep)
+            {
+            }
 
-    /** Calculate the normalized density
-     *
-     * @param totalCellOffset total offset including all slides [in cells]
-     * @return float_X always 1.0
-     */
-    HDINLINE float_X operator()(const DataSpace<simDim>& totalCellOffset)
-    {
-        return float_X(1.0);
-    }
-};
-}
-}
+            /** Calculate the normalized density
+             *
+             * @param totalCellOffset total offset including all slides [in cells]
+             * @return float_X always 1.0
+             */
+            HDINLINE float_X operator()(const DataSpace<simDim>& totalCellOffset)
+            {
+                return float_X(1.0);
+            }
+        };
+    } // namespace densityProfiles
+} // namespace picongpu
diff --git a/include/picongpu/particles/densityProfiles/IProfile.def b/include/picongpu/particles/densityProfiles/IProfile.def
index bca4e955fd..29c540cf27 100644
--- a/include/picongpu/particles/densityProfiles/IProfile.def
+++ b/include/picongpu/particles/densityProfiles/IProfile.def
@@ -1,4 +1,4 @@
-/* Copyright 2014-2020 Rene Widera
+/* Copyright 2014-2021 Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -22,9 +22,9 @@
 
 namespace picongpu
 {
-namespace densityProfiles
-{
-    template<typename T_Base>
-    struct IProfile;
-}
-}
+    namespace densityProfiles
+    {
+        template<typename T_Base>
+        struct IProfile;
+    }
+} // namespace picongpu
diff --git a/include/picongpu/particles/densityProfiles/IProfile.hpp b/include/picongpu/particles/densityProfiles/IProfile.hpp
index 3d6665eee1..865ec74969 100644
--- a/include/picongpu/particles/densityProfiles/IProfile.hpp
+++ b/include/picongpu/particles/densityProfiles/IProfile.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Heiko Burau, Rene Widera, Felix Schmitt
+/* Copyright 2013-2021 Axel Huebl, Heiko Burau, Rene Widera, Felix Schmitt
  *
  * This file is part of PIConGPU.
  *
@@ -22,26 +22,47 @@
 #include "picongpu/simulation_defines.hpp"
 #include "picongpu/particles/densityProfiles/IProfile.def"
 
+#include <cstdlib>
 
-namespace picongpu
-{
-namespace densityProfiles
-{
 
-template<typename T_Base>
-struct IProfile : private T_Base
+namespace picongpu
 {
-
-    using Base = T_Base;
-
-    HINLINE IProfile(uint32_t currentStep) : Base(currentStep)
+    namespace densityProfiles
     {
-    }
+        /** Wrapper around a given density profile functor
+         *
+         * Defines density profile "concept" interface and compile-time checks that
+         * the given profile type is compatible to it
+         *
+         * @tparam T_Profile wrapped density profile functor type
+         */
+        template<typename T_Profile>
+        struct IProfile : private T_Profile
+        {
+            /** Create a profile functor for the given time iteration
+             *
+             * @param currentStep current time iteration
+             */
+            HINLINE IProfile(uint32_t const currentStep) : T_Profile(currentStep)
+            {
+            }
 
-    HDINLINE float_X operator()(const DataSpace<simDim>& totalCellOffset)
-    {
-        return Base::operator()(totalCellOffset);
-    }
-};
-}
-}
+            /** Calculate physical particle density value for the given cell
+             *
+             * It concerns real (physical, not macro-) particles.
+             * The result is in units of BASE_DENSITY times PIC units of volume**-3.
+             *
+             * The density is assumed constant inside a cell, so the underlying
+             * functor should preferably return a value in the cell center.
+             *
+             * @param totalCellOffset total offset from the start of the global
+             *                        simulation area, including all slides [in cells]
+             */
+            HDINLINE float_X operator()(pmacc::DataSpace<simDim> const& totalCellOffset)
+            {
+                return T_Profile::operator()(totalCellOffset);
+            }
+        };
+
+    } // namespace densityProfiles
+} // namespace picongpu
diff --git a/include/picongpu/particles/densityProfiles/LinearExponentialImpl.def b/include/picongpu/particles/densityProfiles/LinearExponentialImpl.def
index 6cd80440b6..beb3baf659 100644
--- a/include/picongpu/particles/densityProfiles/LinearExponentialImpl.def
+++ b/include/picongpu/particles/densityProfiles/LinearExponentialImpl.def
@@ -1,4 +1,4 @@
-/* Copyright 2014-2020 Rene Widera
+/* Copyright 2014-2021 Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -22,9 +22,9 @@
 
 namespace picongpu
 {
-namespace densityProfiles
-{
-    template<typename T_ParamClass>
-    struct LinearExponentialImpl;
-}
-}
+    namespace densityProfiles
+    {
+        template<typename T_ParamClass>
+        struct LinearExponentialImpl;
+    }
+} // namespace picongpu
diff --git a/include/picongpu/particles/densityProfiles/LinearExponentialImpl.hpp b/include/picongpu/particles/densityProfiles/LinearExponentialImpl.hpp
index a341a8367f..5a31c4a7d5 100644
--- a/include/picongpu/particles/densityProfiles/LinearExponentialImpl.hpp
+++ b/include/picongpu/particles/densityProfiles/LinearExponentialImpl.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Heiko Burau, Rene Widera, Felix Schmitt
+/* Copyright 2013-2021 Axel Huebl, Heiko Burau, Rene Widera, Felix Schmitt
  *
  * This file is part of PIConGPU.
  *
@@ -24,55 +24,51 @@
 
 namespace picongpu
 {
-namespace densityProfiles
-{
-
-template<typename T_ParamClass>
-struct LinearExponentialImpl : public T_ParamClass
-{
-    using ParamClass = T_ParamClass;
-
-    template<typename T_SpeciesType>
-    struct apply
+    namespace densityProfiles
     {
-        using type = LinearExponentialImpl<ParamClass>;
-    };
+        template<typename T_ParamClass>
+        struct LinearExponentialImpl : public T_ParamClass
+        {
+            using ParamClass = T_ParamClass;
 
-    HINLINE LinearExponentialImpl(uint32_t currentStep)
-    {
+            template<typename T_SpeciesType>
+            struct apply
+            {
+                using type = LinearExponentialImpl<ParamClass>;
+            };
 
-    }
+            HINLINE LinearExponentialImpl(uint32_t currentStep)
+            {
+            }
 
-    /* Calculate the normalized density
-     *
-     * @param totalCellOffset total offset including all slides [in cells]
-     */
-    HDINLINE float_X operator()(const DataSpace<simDim>& totalCellOffset)
-    {
-        const float_X vacuum_y = float_X(ParamClass::vacuumCellsY) * cellSize.y();
-        const float_X gas_a = ParamClass::gasA_SI * UNIT_LENGTH;
-        const float_X gas_d = ParamClass::gasD_SI * UNIT_LENGTH;
-        const float_X gas_y_max = ParamClass::gasYMax_SI / UNIT_LENGTH;
+            /* Calculate the normalized density
+             *
+             * @param totalCellOffset total offset including all slides [in cells]
+             */
+            HDINLINE float_X operator()(const DataSpace<simDim>& totalCellOffset)
+            {
+                const float_X vacuum_y = float_X(ParamClass::vacuumCellsY) * cellSize.y();
+                const float_X gas_a = ParamClass::gasA_SI * UNIT_LENGTH;
+                const float_X gas_d = ParamClass::gasD_SI * UNIT_LENGTH;
+                const float_X gas_y_max = ParamClass::gasYMax_SI / UNIT_LENGTH;
 
-        const floatD_X globalCellPos(
-                                     precisionCast<float_X>(totalCellOffset) *
-                                     cellSize.shrink<simDim>()
-                                     );
-        float_X density = float_X(0.0);
+                const floatD_X globalCellPos(precisionCast<float_X>(totalCellOffset) * cellSize.shrink<simDim>());
+                float_X density = float_X(0.0);
 
-        if (globalCellPos.y() < vacuum_y) return density;
+                if(globalCellPos.y() < vacuum_y)
+                    return density;
 
-        if (globalCellPos.y() <= gas_y_max) // linear slope
-            density = gas_a * globalCellPos.y() + ParamClass::gasB;
-        else // exponential slope
-            density = math::exp((globalCellPos.y() - gas_y_max) * gas_d);
+                if(globalCellPos.y() <= gas_y_max) // linear slope
+                    density = gas_a * globalCellPos.y() + ParamClass::gasB;
+                else // exponential slope
+                    density = math::exp((globalCellPos.y() - gas_y_max) * gas_d);
 
-        // avoid < 0 densities for the linear slope
-        if (density < float_X(0.0))
-            density = float_X(0.0);
+                // avoid < 0 densities for the linear slope
+                if(density < float_X(0.0))
+                    density = float_X(0.0);
 
-        return density;
-    }
-};
-}
-}
+                return density;
+            }
+        };
+    } // namespace densityProfiles
+} // namespace picongpu
diff --git a/include/picongpu/particles/densityProfiles/SphereFlanksImpl.def b/include/picongpu/particles/densityProfiles/SphereFlanksImpl.def
index c9e0c0ff8e..7325dcc398 100644
--- a/include/picongpu/particles/densityProfiles/SphereFlanksImpl.def
+++ b/include/picongpu/particles/densityProfiles/SphereFlanksImpl.def
@@ -1,4 +1,4 @@
-/* Copyright 2014-2020 Rene Widera
+/* Copyright 2014-2021 Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -22,9 +22,9 @@
 
 namespace picongpu
 {
-namespace densityProfiles
-{
-    template<typename T_ParamClass>
-    struct SphereFlanksImpl;
-}
-}
+    namespace densityProfiles
+    {
+        template<typename T_ParamClass>
+        struct SphereFlanksImpl;
+    }
+} // namespace picongpu
diff --git a/include/picongpu/particles/densityProfiles/SphereFlanksImpl.hpp b/include/picongpu/particles/densityProfiles/SphereFlanksImpl.hpp
index 1a1e7f41c3..ea53395ad3 100644
--- a/include/picongpu/particles/densityProfiles/SphereFlanksImpl.hpp
+++ b/include/picongpu/particles/densityProfiles/SphereFlanksImpl.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Heiko Burau, Rene Widera, Felix Schmitt
+/* Copyright 2013-2021 Axel Huebl, Heiko Burau, Rene Widera, Felix Schmitt
  *
  * This file is part of PIConGPU.
  *
@@ -24,61 +24,57 @@
 
 namespace picongpu
 {
-
-namespace densityProfiles
-{
-
-template<typename T_ParamClass>
-struct SphereFlanksImpl : public T_ParamClass
-{
-    using ParamClass = T_ParamClass;
-
-    template<typename T_SpeciesType>
-    struct apply
+    namespace densityProfiles
     {
-        using type = SphereFlanksImpl<ParamClass>;
-    };
+        template<typename T_ParamClass>
+        struct SphereFlanksImpl : public T_ParamClass
+        {
+            using ParamClass = T_ParamClass;
 
-    HINLINE SphereFlanksImpl(uint32_t currentStep)
-    {
-    }
+            template<typename T_SpeciesType>
+            struct apply
+            {
+                using type = SphereFlanksImpl<ParamClass>;
+            };
 
-    /** Calculate the normalized density
-     *
-     * @param totalCellOffset total offset including all slides [in cells]
-     */
-    HDINLINE float_X operator()(const DataSpace<simDim>& totalCellOffset)
-    {
-        const float_64 unit_length = UNIT_LENGTH;
-        const float_X vacuum_y = float_X(ParamClass::vacuumCellsY) * cellSize.y();
-        const floatD_X center = precisionCast<float_32>(ParamClass::center_SI / unit_length);
-        const float_X r = ParamClass::r_SI / unit_length;
-        const float_X ri = ParamClass::ri_SI / unit_length;
-        const float_X exponent = ParamClass::exponent_SI * unit_length;
+            HINLINE SphereFlanksImpl(uint32_t currentStep)
+            {
+            }
+
+            /** Calculate the normalized density
+             *
+             * @param totalCellOffset total offset including all slides [in cells]
+             */
+            HDINLINE float_X operator()(const DataSpace<simDim>& totalCellOffset)
+            {
+                const float_64 unit_length = UNIT_LENGTH;
+                const float_X vacuum_y = float_X(ParamClass::vacuumCellsY) * cellSize.y();
+                const floatD_X center = precisionCast<float_32>(ParamClass::center_SI / unit_length);
+                const float_X r = ParamClass::r_SI / unit_length;
+                const float_X ri = ParamClass::ri_SI / unit_length;
+                const float_X exponent = ParamClass::exponent_SI * unit_length;
 
 
-        const floatD_X globalCellPos(
-                                     precisionCast<float_X>(totalCellOffset) *
-                                     cellSize.shrink<simDim>()
-                                     );
+                const floatD_X globalCellPos(precisionCast<float_X>(totalCellOffset) * cellSize.shrink<simDim>());
 
-        if (globalCellPos.y() < vacuum_y) return float_X(0.0);
+                if(globalCellPos.y() < vacuum_y)
+                    return float_X(0.0);
 
-        const float_X distance = math::abs(globalCellPos - center);
+                const float_X distance = math::abs(globalCellPos - center);
 
-        /* "shell": inner radius */
-        if (distance < ri)
-            return float_X(0.0);
-            /* "hard core" */
-        else if (distance <= r)
-            return float_X(1.0);
+                /* "shell": inner radius */
+                if(distance < ri)
+                    return float_X(0.0);
+                /* "hard core" */
+                else if(distance <= r)
+                    return float_X(1.0);
 
-        /* "soft exp. flanks"
-         *   note: by definition (return, see above) the
-         *         argument [ r - distance ] will be element of (-inf, 0) */
-        else
-            return math::exp((r - distance) * exponent);
-    }
-};
-}
-}
+                /* "soft exp. flanks"
+                 *   note: by definition (return, see above) the
+                 *         argument [ r - distance ] will be element of (-inf, 0) */
+                else
+                    return math::exp((r - distance) * exponent);
+            }
+        };
+    } // namespace densityProfiles
+} // namespace picongpu
diff --git a/include/picongpu/particles/densityProfiles/profiles.def b/include/picongpu/particles/densityProfiles/profiles.def
index 1160d49b2d..f59dfe7778 100644
--- a/include/picongpu/particles/densityProfiles/profiles.def
+++ b/include/picongpu/particles/densityProfiles/profiles.def
@@ -1,4 +1,4 @@
-/* Copyright 2014-2020 Rene Widera, Alexander Grund
+/* Copyright 2014-2021 Rene Widera, Alexander Grund
  *
  * This file is part of PIConGPU.
  *
diff --git a/include/picongpu/particles/densityProfiles/profiles.hpp b/include/picongpu/particles/densityProfiles/profiles.hpp
index f71c178cab..084b5f857d 100644
--- a/include/picongpu/particles/densityProfiles/profiles.hpp
+++ b/include/picongpu/particles/densityProfiles/profiles.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2014-2020 Rene Widera
+/* Copyright 2014-2021 Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -28,6 +28,6 @@
 #include "picongpu/particles/densityProfiles/SphereFlanksImpl.hpp"
 #include "picongpu/particles/densityProfiles/EveryNthCellImpl.hpp"
 
-#if( ENABLE_HDF5 == 1 )
+#if(ENABLE_HDF5 == 1)
 #    include "picongpu/particles/densityProfiles/FromHDF5Impl.hpp"
 #endif
diff --git a/include/picongpu/particles/filter/All.def b/include/picongpu/particles/filter/All.def
index c638b2c17e..3f685026d4 100644
--- a/include/picongpu/particles/filter/All.def
+++ b/include/picongpu/particles/filter/All.def
@@ -1,4 +1,4 @@
-/* Copyright 2014-2020 Rene Widera
+/* Copyright 2014-2021 Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -22,17 +22,16 @@
 
 namespace picongpu
 {
-namespace particles
-{
-namespace filter
-{
-
-    /** check if a particle handle is valid
-     *
-     * the particle method `::isValidHandle()` is called.
-     */
-    struct All;
+    namespace particles
+    {
+        namespace filter
+        {
+            /** check if a particle handle is valid
+             *
+             * the particle method `::isValidHandle()` is called.
+             */
+            struct All;
 
-} //namespace filter
-} //namespace particles
-} //namespace picongpu
+        } // namespace filter
+    } // namespace particles
+} // namespace picongpu
diff --git a/include/picongpu/particles/filter/All.hpp b/include/picongpu/particles/filter/All.hpp
index 2893a61cb6..114d5ba302 100644
--- a/include/picongpu/particles/filter/All.hpp
+++ b/include/picongpu/particles/filter/All.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2017-2020 Rene Widera
+/* Copyright 2017-2021 Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -24,80 +24,60 @@
 
 namespace picongpu
 {
-namespace particles
-{
-namespace filter
-{
-
-namespace acc
-{
-
-    //! check the particle handle
-    struct All
+    namespace particles
     {
-
-        /** check particle handle
-         *
-         * @tparam T_Particle pmacc::Particles, type of the particle
-         * @tparam alpaka accelerator type
-         *
-         * @param alpaka accelerator
-         * @param particle  particle which is checked
-         * @return true if particle handle is valid, else false
-         */
-        template<
-            typename T_Particle,
-            typename T_Acc
-        >
-        HDINLINE bool operator()(
-            T_Acc const &,
-            T_Particle const & particle
-        )
+        namespace filter
         {
-            return  particle.isHandleValid( );
-        }
-    };
+            namespace acc
+            {
+                //! check the particle handle
+                struct All
+                {
+                    /** check particle handle
+                     *
+                     * @tparam T_Particle pmacc::Particles, type of the particle
+                     * @tparam alpaka accelerator type
+                     *
+                     * @param alpaka accelerator
+                     * @param particle  particle which is checked
+                     * @return true if particle handle is valid, else false
+                     */
+                    template<typename T_Particle, typename T_Acc>
+                    HDINLINE bool operator()(T_Acc const&, T_Particle const& particle)
+                    {
+                        return particle.isHandleValid();
+                    }
+                };
 
-} // namespace acc
+            } // namespace acc
 
-    struct All
-    {
-        template< typename T_SpeciesType >
-        struct apply
-        {
-            using type = All;
-        };
+            struct All
+            {
+                template<typename T_SpeciesType>
+                struct apply
+                {
+                    using type = All;
+                };
 
-        /** create filter for the accelerator
-         *
-         * @tparam T_WorkerCfg pmacc::mappings::threads::WorkerCfg, configuration of the worker
-         * @param offset (in superCells, without any guards) relative
-         *                        to the origin of the local domain
-         * @param configuration of the worker
-         */
-        template<
-            typename T_WorkerCfg,
-            typename T_Acc
-        >
-        HDINLINE acc::All
-        operator( )(
-            T_Acc const & acc,
-            DataSpace< simDim > const &,
-            T_WorkerCfg const &
-        ) const
-        {
-            return acc::All{ };
+                /** create filter for the accelerator
+                 *
+                 * @tparam T_WorkerCfg pmacc::mappings::threads::WorkerCfg, configuration of the worker
+                 * @param offset (in superCells, without any guards) relative
+                 *                        to the origin of the local domain
+                 * @param configuration of the worker
+                 */
+                template<typename T_WorkerCfg, typename T_Acc>
+                HDINLINE acc::All operator()(T_Acc const& acc, DataSpace<simDim> const&, T_WorkerCfg const&) const
+                {
+                    return acc::All{};
+                }
 
-        }
-
-        static
-        HINLINE std::string
-        getName( )
-        {
-            return std::string("all");
-        }
-    };
+                static HINLINE std::string getName()
+                {
+                    return std::string("all");
+                }
+            };
 
-} //namespace filter
-} //namespace particles
-} //namespace picongpu
+        } // namespace filter
+    } // namespace particles
+} // namespace picongpu
diff --git a/include/picongpu/particles/filter/IUnary.def b/include/picongpu/particles/filter/IUnary.def
index 4b7aac2595..9b5af034bb 100644
--- a/include/picongpu/particles/filter/IUnary.def
+++ b/include/picongpu/particles/filter/IUnary.def
@@ -1,4 +1,4 @@
-/* Copyright 2014-2020 Rene Widera
+/* Copyright 2014-2021 Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -27,23 +27,17 @@
 
 namespace picongpu
 {
-namespace particles
-{
-namespace filter
-{
-
-    /** interface for a unary particle filter
-     *
-     * @tparam T_UnaryFilter unary particle filter must contain `bool operator()(P && particle)`
-     */
-    template<
-        typename T_UnaryFilter
-    >
-    using IUnary = pmacc::filter::Interface<
-        T_UnaryFilter,
-        1u
-    >;
+    namespace particles
+    {
+        namespace filter
+        {
+            /** interface for a unary particle filter (pmacc::filter::Interface with second argument 1u)
+             *
+             * @tparam T_UnaryFilter unary particle filter, must provide `bool operator()(P && particle)`
+             */
+            template<typename T_UnaryFilter>
+            using IUnary = pmacc::filter::Interface<T_UnaryFilter, 1u>;
 
-} // namespace filter
-} // namespace particles
+        } // namespace filter
+    } // namespace particles
 } // namespace picongpu
diff --git a/include/picongpu/particles/filter/RelativeGlobalDomainPosition.def b/include/picongpu/particles/filter/RelativeGlobalDomainPosition.def
index c451c90c00..9a072af569 100644
--- a/include/picongpu/particles/filter/RelativeGlobalDomainPosition.def
+++ b/include/picongpu/particles/filter/RelativeGlobalDomainPosition.def
@@ -1,4 +1,4 @@
-/* Copyright 2014-2020 Rene Widera
+/* Copyright 2014-2021 Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -22,39 +22,39 @@
 
 namespace picongpu
 {
-namespace particles
-{
-namespace filter
-{
-namespace param
-{
-    struct RelativeGlobalDomainPosition
+    namespace particles
     {
-        /* lowerBound is included in the range*/
-        static constexpr float_X lowerBound = 0.0;
-        /* upperBound is excluded in the range*/
-        static constexpr float_X upperBound = 1.0;
-        /* dimension for the filter
-         * x = 0; y= 1; z = 2
-         */
-        static constexpr uint32_t dimension = 0;
+        namespace filter
+        {
+            namespace param
+            {
+                struct RelativeGlobalDomainPosition
+                {
+                    /* lower bound of the accepted relative position, included in the range */
+                    static constexpr float_X lowerBound = 0.0;
+                    /* upper bound of the accepted relative position, excluded from the range */
+                    static constexpr float_X upperBound = 1.0;
+                    /* dimension for the filter
+                     * x = 0; y = 1; z = 2
+                     */
+                    static constexpr uint32_t dimension = 0;
 
-        // name of the filter
-        static constexpr char const * name = "relativeGlobalDomainPosition";
-    };
-} // namespace param
+                    // name of the filter
+                    static constexpr char const* name = "relativeGlobalDomainPosition";
+                };
+            } // namespace param
 
-    /** filter particle dependent on the global position
-     *
-     * Check if a particle is within a relative area in one direction of the global
-     * domain.
-     *
-     * @tparam T_Params picongpu::particles::filter::param::RelativeGlobalDomainPosition,
-     *                  parameter to configure the functor
-     */
-    template< typename T_Params = param::RelativeGlobalDomainPosition >
-    struct RelativeGlobalDomainPosition;
+            /** filter particles depending on their global position
+             *
+             * Check if a particle is within a relative range along one dimension of the global
+             * domain.
+             *
+             * @tparam T_Params parameter to configure the functor,
+             *                  see picongpu::particles::filter::param::RelativeGlobalDomainPosition
+             */
+            template<typename T_Params = param::RelativeGlobalDomainPosition>
+            struct RelativeGlobalDomainPosition;
 
-} //namespace filter
-} //namespace particles
-} //namespace picongpu
+        } // namespace filter
+    } // namespace particles
+} // namespace picongpu
diff --git a/include/picongpu/particles/filter/RelativeGlobalDomainPosition.hpp b/include/picongpu/particles/filter/RelativeGlobalDomainPosition.hpp
index ecea8ba1fc..544d84783a 100644
--- a/include/picongpu/particles/filter/RelativeGlobalDomainPosition.hpp
+++ b/include/picongpu/particles/filter/RelativeGlobalDomainPosition.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2014-2020 Rene Widera
+/* Copyright 2014-2021 Rene Widera
  *
  *
  * This file is part of PIConGPU.
@@ -29,161 +29,132 @@
 
 namespace picongpu
 {
-namespace particles
-{
-namespace filter
-{
-
-namespace acc
-{
-    template< typename T_Params >
-    struct RelativeGlobalDomainPosition
+    namespace particles
     {
-        using Params = T_Params;
-
-        HDINLINE RelativeGlobalDomainPosition(
-            DataSpace< simDim > const & localDomainOffset,
-            DataSpace< simDim > const & globalDomainSize,
-            DataSpace< simDim > const & localSuperCellOffset
-        ) :
-            m_localDomainOffset( localDomainOffset ),
-            m_globalDomainSize( globalDomainSize ),
-            m_localSuperCellOffset( localSuperCellOffset )
+        namespace filter
         {
-        }
-
-        template<
-            typename T_Acc,
-            typename T_Particle
-        >
-        HDINLINE bool operator()(
-            T_Acc const &,
-            T_Particle const & particle
-        )
-        {
-            if( particle.isHandleValid( ) )
+            namespace acc
             {
-                using SuperCellSize = typename T_Particle::SuperCellSize;
-                /* offset of the superCell (in cells, without any guards) to the origin of the global domain */
-                DataSpace< simDim > globalSuperCellOffset = m_localDomainOffset + (
-                    m_localSuperCellOffset *
-                    SuperCellSize::toRT( )
-                );
-                return isParticleInsideRange( particle, globalSuperCellOffset);
-            }
-            return false;
-        }
-
-    private:
-
-        /** check if a particle is located in the user defined range
-         *
-         * @tparam T_Particle type of the particle
-         * @param particle particle than needs to be checked
-         * @param globalSuperCellOffset offset of the superCell (in cells, without any guards)
-         *                              to the origin of the global domain
-         */
-        template< typename T_Particle >
-        HDINLINE bool isParticleInsideRange( T_Particle const & particle, DataSpace< simDim > const & globalSuperCellOffset ) const
-        {
-            using SuperCellSize = typename T_Particle::SuperCellSize;
-
-            int const particleCellIdx = particle[ localCellIdx_ ];
-            DataSpace< simDim > const cellInSuperCell( DataSpaceOperations< simDim >::
-                template map< SuperCellSize >( particleCellIdx ) );
-            DataSpace< simDim > const globalParticleOffset(
-                globalSuperCellOffset +
-                cellInSuperCell
-            );
-
-            float_X const relativePosition = float_X( globalParticleOffset[ Params::dimension ] ) /
-                float_X( m_globalDomainSize[ Params::dimension ] );
-
-            return ( Params::lowerBound <= relativePosition &&
-                relativePosition < Params::upperBound );
-        }
-
-        DataSpace< simDim > const m_localDomainOffset;
-        DataSpace< simDim > const m_globalDomainSize;
-        DataSpace< simDim > const m_localSuperCellOffset;
-    };
-
-} // namespace acc
-
-    template< typename T_Params >
-    struct RelativeGlobalDomainPosition
-    {
-        using Params = T_Params;
-
-        template< typename T_SpeciesType >
-        struct apply
-        {
-            using type = RelativeGlobalDomainPosition;
-        };
-
-        HINLINE RelativeGlobalDomainPosition( )
-        {
-            SubGrid< simDim > const & subGrid = Environment< simDim >::get( ).SubGrid( );
-            globalDomainSize = subGrid.getGlobalDomain( ).size;
-            localDomainOffset = subGrid.getLocalDomain( ).offset;
-        }
-
-        /** create filter for the accelerator
-         *
-         * @tparam T_WorkerCfg pmacc::mappings::threads::WorkerCfg, configuration of the worker
-         * @param localSupercellOffset offset (in superCells, without any guards) relative
-         *                        to the origin of the local domain
-         * @param configuration of the worker
-         */
-        template<
-            typename T_WorkerCfg,
-            typename T_Acc
-        >
-        HDINLINE acc::RelativeGlobalDomainPosition< Params >
-        operator( )(
-            T_Acc const & acc,
-            DataSpace< simDim > const & localSuperCellOffset,
-            T_WorkerCfg const &
-        ) const
-        {
-            return acc::RelativeGlobalDomainPosition< Params >(
-                localDomainOffset,
-                globalDomainSize,
-                localSuperCellOffset
-            );
-
-        }
-
-        static
-        HINLINE std::string
-        getName( )
+                template<typename T_Params>
+                struct RelativeGlobalDomainPosition
+                {
+                    using Params = T_Params;
+
+                    HDINLINE RelativeGlobalDomainPosition(
+                        DataSpace<simDim> const& localDomainOffset,
+                        DataSpace<simDim> const& globalDomainSize,
+                        DataSpace<simDim> const& localSuperCellOffset)
+                        : m_localDomainOffset(localDomainOffset)
+                        , m_globalDomainSize(globalDomainSize)
+                        , m_localSuperCellOffset(localSuperCellOffset)
+                    {
+                    }
+
+                    template<typename T_Acc, typename T_Particle>
+                    HDINLINE bool operator()(T_Acc const&, T_Particle const& particle)
+                    {
+                        if(particle.isHandleValid())
+                        {
+                            using SuperCellSize = typename T_Particle::SuperCellSize;
+                            /* offset of the superCell (in cells, without any guards) to the origin of the global
+                             * domain */
+                            DataSpace<simDim> globalSuperCellOffset
+                                = m_localDomainOffset + (m_localSuperCellOffset * SuperCellSize::toRT());
+                            return isParticleInsideRange(particle, globalSuperCellOffset);
+                        }
+                        return false;
+                    }
+
+                private:
+                    /** check if a particle is located in the user defined range
+                     * @tparam T_Particle type of the particle
+                     * @param particle particle that needs to be checked
+                     * @param globalSuperCellOffset offset of the superCell (in cells, without any guards)
+                     *                              to the origin of the global domain
+                     * @return true if the relative position in Params::dimension is in [lowerBound, upperBound)
+                     */
+                    template<typename T_Particle>
+                    HDINLINE bool isParticleInsideRange(
+                        T_Particle const& particle,
+                        DataSpace<simDim> const& globalSuperCellOffset) const
+                    {
+                        using SuperCellSize = typename T_Particle::SuperCellSize;
+
+                        int const particleCellIdx = particle[localCellIdx_];
+                        DataSpace<simDim> const cellInSuperCell(
+                            DataSpaceOperations<simDim>::template map<SuperCellSize>(particleCellIdx));
+                        DataSpace<simDim> const globalParticleOffset(globalSuperCellOffset + cellInSuperCell);
+
+                        float_X const relativePosition = float_X(globalParticleOffset[Params::dimension])
+                            / float_X(m_globalDomainSize[Params::dimension]);
+
+                        return (Params::lowerBound <= relativePosition && relativePosition < Params::upperBound);
+                    }
+
+                    DataSpace<simDim> const m_localDomainOffset;
+                    DataSpace<simDim> const m_globalDomainSize;
+                    DataSpace<simDim> const m_localSuperCellOffset;
+                };
+
+            } // namespace acc
+
+            template<typename T_Params>
+            struct RelativeGlobalDomainPosition
+            {
+                using Params = T_Params;
+
+                template<typename T_SpeciesType>
+                struct apply
+                {
+                    using type = RelativeGlobalDomainPosition;
+                };
+
+                HINLINE RelativeGlobalDomainPosition()
+                {
+                    SubGrid<simDim> const& subGrid = Environment<simDim>::get().SubGrid();
+                    globalDomainSize = subGrid.getGlobalDomain().size;
+                    localDomainOffset = subGrid.getLocalDomain().offset;
+                }
+
+                /** create filter for the accelerator
+                 *
+                 * @tparam T_WorkerCfg pmacc::mappings::threads::WorkerCfg, configuration of the worker
+                 * @param acc accelerator, not used (like the unnamed worker configuration argument)
+                 * @param localSuperCellOffset offset (in superCells, without any guards) relative
+                 *                             to the origin of the local domain
+                 */
+                template<typename T_WorkerCfg, typename T_Acc>
+                HDINLINE acc::RelativeGlobalDomainPosition<Params> operator()(
+                    T_Acc const& acc,
+                    DataSpace<simDim> const& localSuperCellOffset,
+                    T_WorkerCfg const&) const
+                {
+                    return acc::RelativeGlobalDomainPosition<Params>(
+                        localDomainOffset,
+                        globalDomainSize,
+                        localSuperCellOffset);
+                }
+
+                static HINLINE std::string getName()
+                {
+                    // the name is taken from the param class
+                    return T_Params::name;
+                }
+
+                DataSpace<simDim> localDomainOffset;
+                DataSpace<simDim> globalDomainSize;
+            };
+
+        } // namespace filter
+
+        namespace traits
         {
-            // we provide the name from the param class
-            return T_Params::name;
-        }
-
-        DataSpace< simDim > localDomainOffset;
-        DataSpace< simDim > globalDomainSize;
-    };
-
-} //namespace filter
-
-namespace traits
-{
-    template<
-        typename T_Species,
-        typename T_Params
-    >
-    struct SpeciesEligibleForSolver<
-        T_Species,
-        filter::RelativeGlobalDomainPosition< T_Params >
-    >
-    {
-        using type = typename pmacc::traits::HasIdentifiers<
-            typename T_Species::FrameType,
-            MakeSeq_t< localCellIdx >
-        >::type;
-    };
-} // namespace traits
-} // namespace particles
+            template<typename T_Species, typename T_Params>
+            struct SpeciesEligibleForSolver<T_Species, filter::RelativeGlobalDomainPosition<T_Params>>
+            {
+                using type = typename pmacc::traits::
+                    HasIdentifiers<typename T_Species::FrameType, MakeSeq_t<localCellIdx>>::type;
+            };
+        } // namespace traits
+    } // namespace particles
 } // namespace picongpu
diff --git a/include/picongpu/particles/filter/filter.def b/include/picongpu/particles/filter/filter.def
index f304e066ea..dfe6820cc9 100644
--- a/include/picongpu/particles/filter/filter.def
+++ b/include/picongpu/particles/filter/filter.def
@@ -1,4 +1,4 @@
-/* Copyright 2017-2020 Rene Widera
+/* Copyright 2017-2021 Rene Widera
  *
  * This file is part of PIConGPU.
  *
diff --git a/include/picongpu/particles/filter/filter.hpp b/include/picongpu/particles/filter/filter.hpp
index 47c8022daa..f80c643db3 100644
--- a/include/picongpu/particles/filter/filter.hpp
+++ b/include/picongpu/particles/filter/filter.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2017-2020 Rene Widera
+/* Copyright 2017-2021 Rene Widera
  *
  * This file is part of PIConGPU.
  *
diff --git a/include/picongpu/particles/filter/generic/Free.def b/include/picongpu/particles/filter/generic/Free.def
index 4b33ffa07f..de2786d237 100644
--- a/include/picongpu/particles/filter/generic/Free.def
+++ b/include/picongpu/particles/filter/generic/Free.def
@@ -1,4 +1,4 @@
-/* Copyright 2017-2020 Rene Widera
+/* Copyright 2017-2021 Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -22,44 +22,43 @@
 
 namespace picongpu
 {
-namespace particles
-{
-namespace filter
-{
-namespace generic
-{
-
-    /** call simple free user defined filter
-     *
-     * @tparam T_Functor user defined filter
-     *                   **optional**: can implement **one** host side constructor
-     *                   `T_Functor()` or `T_Functor(uint32_t currentTimeStep)`
-     *
-     * example for `particleFilters.param`: each particle with in-cell position greater than 0.5
-     *   @code{.cpp}
-     *
-     *   struct FunctorEachParticleAboveMiddleOfTheCell
-     *   {
-     *       template< typename T_Particle >
-     *       HDINLINE bool operator()( T_Particle const & particle )
-     *       {
-     *           bool result = false;
-     *           if( particle[ position_ ].y() >= float_X( 0.5 ) )
-     *               result = true;
-     *           return result;
-     *       }
-     *       static constexpr char const * name = "eachParticleAboveMiddleOfTheCell";
-     *   };
-     *
-     *   using EachParticleAboveMiddleOfTheCell = generic::Free<
-     *      FunctorEachParticleAboveMiddleOfTheCell
-     *   >;
-     *   @endcode
-     */
-    template< typename T_Functor >
-    struct Free;
+    namespace particles
+    {
+        namespace filter
+        {
+            namespace generic
+            {
+                /** call a simple, free, user defined filter
+                 *
+                 * @tparam T_Functor user defined filter
+                 *                   **optional**: can implement **one** host side constructor
+                 *                   `T_Functor()` or `T_Functor(uint32_t currentTimeStep)`
+                 *
+                 * example for `particleFilters.param`: each particle with an in-cell y position of at least 0.5
+                 *   @code{.cpp}
+                 *
+                 *   struct FunctorEachParticleAboveMiddleOfTheCell
+                 *   {
+                 *       template< typename T_Particle >
+                 *       HDINLINE bool operator()( T_Particle const & particle )
+                 *       {
+                 *           bool result = false;
+                 *           if( particle[ position_ ].y() >= float_X( 0.5 ) )
+                 *               result = true;
+                 *           return result;
+                 *       }
+                 *       static constexpr char const * name = "eachParticleAboveMiddleOfTheCell";
+                 *   };
+                 *
+                 *   using EachParticleAboveMiddleOfTheCell = generic::Free<
+                 *      FunctorEachParticleAboveMiddleOfTheCell
+                 *   >;
+                 *   @endcode
+                 */
+                template<typename T_Functor>
+                struct Free;
 
-} // namespace generic
-} // namespace filter
-} // namespace particles
+            } // namespace generic
+        } // namespace filter
+    } // namespace particles
 } // namespace picongpu
diff --git a/include/picongpu/particles/filter/generic/Free.hpp b/include/picongpu/particles/filter/generic/Free.hpp
index 9db8566a50..9065b1c9d8 100644
--- a/include/picongpu/particles/filter/generic/Free.hpp
+++ b/include/picongpu/particles/filter/generic/Free.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2017-2020 Rene Widera
+/* Copyright 2017-2021 Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -28,109 +28,89 @@
 
 namespace picongpu
 {
-namespace particles
-{
-namespace filter
-{
-namespace generic
-{
-namespace acc
-{
-    /** wrapper for the user filter on the accelerator
-     *
-     * @tparam T_Functor user defined filter
-     */
-    template< typename T_Functor >
-    struct Free : private T_Functor
-    {
-        //! type of the user filter
-        using Functor = T_Functor;
-
-        //! store user filter instance
-        HDINLINE Free( Functor const & filter ) :
-            Functor( filter )
-        {
-        }
-
-        /** execute the user filter
-         *
-         * @tparam T_Args type of the arguments passed to the user filter
-         *
-         * @param particle particle to use for the filtering
-         */
-        template<
-            typename T_Acc,
-            typename T_Particle
-        >
-        HDINLINE
-        bool operator( )(
-            T_Acc const &,
-            T_Particle const & particle
-        )
-        {
-            bool const isValid = particle.isHandleValid( );
-
-            return isValid && Functor::operator( )( particle );
-        }
-
-    };
-} // namespace acc
-
-    template< typename T_Functor >
-    struct Free : protected functor::User< T_Functor >
+    namespace particles
     {
-
-        using Functor = functor::User< T_Functor >;
-
-        template< typename T_SpeciesType >
-        struct apply
+        namespace filter
         {
-            using type = Free;
-        };
-
-        /** constructor
-         *
-         * @param currentStep current simulation time step
-         */
-        HINLINE Free( uint32_t currentStep ) : Functor( currentStep )
-        {
-        }
-
-        /** create device filter
-         *
-         * @tparam T_WorkerCfg pmacc::mappings::threads::WorkerCfg, configuration of the worker
-         * @tparam T_Acc alpaka accelerator type
-         *
-         * @param alpaka accelerator
-         * @param offset (in supercells, without any guards) to the
-         *         origin of the local domain
-         * @param configuration of the worker
-         */
-        template<
-            typename T_WorkerCfg,
-            typename T_Acc
-        >
-        HDINLINE acc::Free< Functor >
-        operator()(
-            T_Acc const &,
-            DataSpace< simDim > const &,
-            T_WorkerCfg const &
-        ) const
-        {
-            return acc::Free< Functor >( *static_cast< Functor const * >( this ) );
-        }
-
-        static
-        HINLINE std::string
-        getName( )
-        {
-            // provide the name from the user functor
-            return Functor::name;
-        }
-
-    };
-
-} // namespace generic
-} // namespace filter
-} // namespace particles
+            namespace generic
+            {
+                namespace acc
+                {
+                    /** wrapper for the user filter on the accelerator
+                     *
+                     * @tparam T_Functor user defined filter
+                     */
+                    template<typename T_Functor>
+                    struct Free : private T_Functor
+                    {
+                        //! type of the user filter
+                        using Functor = T_Functor;
+
+                        //! store a copy of the user filter instance in the private base class
+                        HDINLINE Free(Functor const& filter) : Functor(filter)
+                        {
+                        }
+
+                        /** execute the user filter; particles with an invalid handle are always rejected
+                         *
+                         * @tparam T_Acc type of the unnamed first argument, which is not used
+                         * @tparam T_Particle type of the particle
+                         * @param particle particle to use for the filtering
+                         */
+                        template<typename T_Acc, typename T_Particle>
+                        HDINLINE bool operator()(T_Acc const&, T_Particle const& particle)
+                        {
+                            bool const isValid = particle.isHandleValid();
+
+                            return isValid && Functor::operator()(particle);
+                        }
+                    };
+                } // namespace acc
+
+                template<typename T_Functor>
+                struct Free : protected functor::User<T_Functor>
+                {
+                    using Functor = functor::User<T_Functor>;
+
+                    template<typename T_SpeciesType>
+                    struct apply
+                    {
+                        using type = Free;
+                    };
+
+                    /** constructor
+                     *
+                     * @param currentStep current simulation time step
+                     */
+                    HINLINE Free(uint32_t currentStep) : Functor(currentStep)
+                    {
+                    }
+
+                    /** create device filter
+                     *
+                     * @tparam T_WorkerCfg pmacc::mappings::threads::WorkerCfg, configuration of the worker
+                     * @tparam T_Acc alpaka accelerator type
+                     *
+                     * The three arguments are unnamed and not used. In order they are: the alpaka accelerator,
+                     * the offset (in supercells, without any guards) to the origin of the local domain and
+                     * the configuration of the worker.
+                     * @return acc::Free wrapping a copy of the functor base class of this object
+                     */
+                    template<typename T_WorkerCfg, typename T_Acc>
+                    HDINLINE acc::Free<Functor> operator()(T_Acc const&, DataSpace<simDim> const&, T_WorkerCfg const&)
+                        const
+                    {
+                        return acc::Free<Functor>(*static_cast<Functor const*>(this));
+                    }
+
+                    static HINLINE std::string getName()
+                    {
+                        // provide the name from the user functor
+                        return Functor::name;
+                    }
+                };
+
+            } // namespace generic
+        } // namespace filter
+    } // namespace particles
 } // namespace picongpu
diff --git a/include/picongpu/particles/filter/generic/FreeRng.def b/include/picongpu/particles/filter/generic/FreeRng.def
index 82643f78f2..aa8165cc49 100644
--- a/include/picongpu/particles/filter/generic/FreeRng.def
+++ b/include/picongpu/particles/filter/generic/FreeRng.def
@@ -1,4 +1,4 @@
-/* Copyright 2017-2020 Rene Widera
+/* Copyright 2017-2021 Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -27,52 +27,48 @@
 
 namespace picongpu
 {
-namespace particles
-{
-namespace filter
-{
-namespace generic
-{
-
-    /** call simple free user defined functor and provide a random number generator
-     *
-     *
-     * @tparam T_Functor user defined unary functor
-     * @tparam T_Distribution pmacc::random::distributions, random number distribution
-     *
-     * example for `particleFilters.param`: get every second particle
-     *                                      (random sample of 50%)
-     *   @code{.cpp}
-     *
-     *   struct FunctorEachSecondParticle
-     *   {
-     *       template< typename T_Rng, typename T_Particle >
-     *       HDINLINE bool operator()(
-     *           T_Rng & rng,
-     *           T_Particle const & particle
-     *       )
-     *       {
-     *           bool result = false;
-     *           if( rng() >= float_X( 0.5 ) )
-     *               result = true;
-     *           return result;
-     *       }
-     *       static constexpr char const * name = "eachSecondParticle";
-     *   };
-     *
-     *   using EachSecondParticle = generic::FreeRng<
-     *      FunctorEachSecondParticle,
-     *      pmacc::random::distributions::Uniform< float_X >
-     *   >;
-     *   @endcode
-     */
-    template<
-        typename T_Functor,
-        typename T_Distribution
-    >
-    struct FreeRng;
+    namespace particles
+    {
+        namespace filter
+        {
+            namespace generic
+            {
+                /** call simple free user defined functor and provide a random number generator
+                 *
+                 *
+                 * @tparam T_Functor user defined unary functor
+                 * @tparam T_Distribution pmacc::random::distributions, random number distribution
+                 *
+                 * example for `particleFilters.param`: get every second particle
+                 *                                      (random sample of 50%)
+                 *   @code{.cpp}
+                 *
+                 *   struct FunctorEachSecondParticle
+                 *   {
+                 *       template< typename T_Rng, typename T_Particle >
+                 *       HDINLINE bool operator()(
+                 *           T_Rng & rng,
+                 *           T_Particle const & particle
+                 *       )
+                 *       {
+                 *           bool result = false;
+                 *           if( rng() >= float_X( 0.5 ) )
+                 *               result = true;
+                 *           return result;
+                 *       }
+                 *       static constexpr char const * name = "eachSecondParticle";
+                 *   };
+                 *
+                 *   using EachSecondParticle = generic::FreeRng<
+                 *      FunctorEachSecondParticle,
+                 *      pmacc::random::distributions::Uniform< float_X >
+                 *   >;
+                 *   @endcode
+                 */
+                template<typename T_Functor, typename T_Distribution>
+                struct FreeRng;
 
-} // namespace generic
-} // namespace filter
-} // namespace particles
+            } // namespace generic
+        } // namespace filter
+    } // namespace particles
 } // namespace picongpu
diff --git a/include/picongpu/particles/filter/generic/FreeRng.hpp b/include/picongpu/particles/filter/generic/FreeRng.hpp
index 2667644f29..2f5f282d80 100644
--- a/include/picongpu/particles/filter/generic/FreeRng.hpp
+++ b/include/picongpu/particles/filter/generic/FreeRng.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2017-2020 Rene Widera
+/* Copyright 2017-2021 Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -29,156 +29,105 @@
 
 namespace picongpu
 {
-namespace particles
-{
-namespace filter
-{
-namespace generic
-{
-namespace acc
-{
-    template<
-        typename T_Functor,
-        typename T_RngType
-    >
-    struct FreeRng : private T_Functor
-    {
-
-        using Functor = T_Functor;
-        using RngType = T_RngType;
-
-        HDINLINE FreeRng(
-            Functor const & functor,
-            RngType const & rng
-        ) :
-            T_Functor( functor ), m_rng( rng )
-        {
-        }
-
-        /** call user functor
-         *
-         * The random number generator is initialized with the first call.
-         *
-         * @tparam T_Particle type of the particle to manipulate
-         * @tparam T_Args type of the arguments passed to the user functor
-         * @tparam T_Acc alpaka accelerator type
-         *
-         * @param alpaka accelerator
-         * @param particle particle which is given to the user functor
-         * @return void is used to enable the operator if the user functor except two arguments
-         */
-        template<
-            typename T_Particle,
-            typename ... T_Args,
-            typename T_Acc
-        >
-        HDINLINE
-        bool operator()(
-            T_Acc const &,
-            T_Particle const & particle
-        )
-        {
-            namespace nvrng = nvidia::rng;
-
-            bool const isValid = particle.isHandleValid( );
-
-            return isValid && Functor::operator()(
-                m_rng,
-                particle
-            );
-        }
-
-    private:
-
-        RngType m_rng;
-    };
-} // namespace acc
-
-    template<
-        typename T_Functor,
-        typename T_Distribution
-    >
-    struct FreeRng :
-    protected functor::User< T_Functor >,
-        private picongpu::particles::functor::misc::Rng<
-            T_Distribution
-        >
+    namespace particles
     {
-        template< typename T_SpeciesType >
-        struct apply
-        {
-            using type = FreeRng;
-        };
-
-        using RngGenerator = picongpu::particles::functor::misc::Rng<
-            T_Distribution
-        >;
-
-        using RngType = typename RngGenerator::RandomGen;
-
-        using Functor = functor::User< T_Functor >;
-        using Distribution = T_Distribution;
-
-        /** constructor
-         *
-         * @param currentStep current simulation time step
-         */
-        HINLINE FreeRng( uint32_t currentStep ) :
-            Functor( currentStep ),
-            RngGenerator( currentStep )
-        {
-        }
-
-        /** create functor for the accelerator
-         *
-         * @tparam T_WorkerCfg pmacc::mappings::threads::WorkerCfg, configuration of the worker
-         * @tparam T_Acc alpaka accelerator type
-         *
-         * @param alpaka accelerator
-         * @param localSupercellOffset offset (in superCells, without any guards) relative
-         *                        to the origin of the local domain
-         * @param workerCfg configuration of the worker
-         */
-        template<
-            typename T_WorkerCfg,
-            typename T_Acc
-        >
-        HDINLINE auto
-        operator()(
-            T_Acc const & acc,
-            DataSpace< simDim > const & localSupercellOffset,
-            T_WorkerCfg const & workerCfg
-        ) const
-        -> acc::FreeRng<
-            Functor,
-            RngType
-        >
-        {
-            RngType const rng = ( *static_cast< RngGenerator const * >( this ) )(
-                acc,
-                localSupercellOffset,
-                workerCfg
-            );
-
-            return acc::FreeRng<
-                Functor,
-                RngType
-            >(
-                *static_cast< Functor const * >( this ),
-                rng
-            );
-        }
-
-        static
-        HINLINE std::string
-        getName( )
+        namespace filter
         {
-            // we provide the name from the param class
-            return Functor::name;
-        }
-    };
-
-} // namespace generic
-} // namespace filter
-} // namespace particles
+            namespace generic
+            {
+                namespace acc
+                {
+                    template<typename T_Functor, typename T_RngType>
+                    struct FreeRng : private T_Functor
+                    {
+                        using Functor = T_Functor;
+                        using RngType = T_RngType;
+
+                        HDINLINE FreeRng(Functor const& functor, RngType const& rng) : T_Functor(functor), m_rng(rng)
+                        {
+                        }
+
+                        /** call user functor
+                         *
+                         * The random number generator is initialized with the first call.
+                         *
+                         * @tparam T_Particle type of the particle handed to the user functor
+                         * @tparam T_Args unused parameter pack (does not appear in the signature)
+                         * @tparam T_Acc alpaka accelerator type
+                         *
+                         * @param alpaka accelerator (unused)
+                         * @param particle particle which is given to the user functor
+                         * @return false if the particle handle is invalid, else the result of the user functor
+                         */
+                        template<typename T_Particle, typename... T_Args, typename T_Acc>
+                        HDINLINE bool operator()(T_Acc const&, T_Particle const& particle)
+                        {
+                            bool const isValid = particle.isHandleValid();
+
+                            return isValid && Functor::operator()(m_rng, particle);
+                        }
+
+                    private:
+                        RngType m_rng;
+                    };
+                } // namespace acc
+
+                template<typename T_Functor, typename T_Distribution>
+                struct FreeRng
+                    : protected functor::User<T_Functor>
+                    , private picongpu::particles::functor::misc::Rng<T_Distribution>
+                {
+                    template<typename T_SpeciesType>
+                    struct apply
+                    {
+                        using type = FreeRng;
+                    };
+
+                    using RngGenerator = picongpu::particles::functor::misc::Rng<T_Distribution>;
+
+                    using RngType = typename RngGenerator::RandomGen;
+
+                    using Functor = functor::User<T_Functor>;
+                    using Distribution = T_Distribution;
+
+                    /** constructor
+                     *
+                     * @param currentStep current simulation time step, forwarded to both base classes
+                     */
+                    HINLINE FreeRng(uint32_t currentStep) : Functor(currentStep), RngGenerator(currentStep)
+                    {
+                    }
+
+                    /** create functor for the accelerator
+                     *
+                     * @tparam T_WorkerCfg pmacc::mappings::threads::WorkerCfg, configuration of the worker
+                     * @tparam T_Acc alpaka accelerator type
+                     *
+                     * @param acc alpaka accelerator
+                     * @param localSupercellOffset offset (in superCells, without any guards) relative
+                     *                        to the origin of the local domain
+                     * @param workerCfg configuration of the worker
+                     */
+                    template<typename T_WorkerCfg, typename T_Acc>
+                    HDINLINE auto operator()(
+                        T_Acc const& acc,
+                        DataSpace<simDim> const& localSupercellOffset,
+                        T_WorkerCfg const& workerCfg) const -> acc::FreeRng<Functor, RngType>
+                    {
+                        RngType const rng
+                            = (*static_cast<RngGenerator const*>(this))(acc, localSupercellOffset, workerCfg);
+
+                        return acc::FreeRng<Functor, RngType>(*static_cast<Functor const*>(this), rng);
+                    }
+
+                    static HINLINE std::string getName()
+                    {
+                        // we provide the name from the param class
+                        return Functor::name;
+                    }
+                };
+
+            } // namespace generic
+        } // namespace filter
+    } // namespace particles
 } // namespace picongpu
diff --git a/include/picongpu/particles/filter/generic/FreeTotalCellOffset.def b/include/picongpu/particles/filter/generic/FreeTotalCellOffset.def
index a51a2401ca..1bb3732aea 100644
--- a/include/picongpu/particles/filter/generic/FreeTotalCellOffset.def
+++ b/include/picongpu/particles/filter/generic/FreeTotalCellOffset.def
@@ -1,4 +1,4 @@
-/* Copyright 2017-2020 Rene Widera
+/* Copyright 2017-2021 Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -27,49 +27,48 @@
 
 namespace picongpu
 {
-namespace particles
-{
-namespace filter
-{
-namespace generic
-{
-
-    /** call simple free user defined functor and provide the cell information
-     *
-     * The functor passes the cell offset of the particle relative to the total
-     * domain origin into the functor.
-     *
-     * @tparam T_Functor user defined unary functor
-     *
-     * example for `particleFilters.param`: each particle with a cell offset of 5
-     * in X direction
-     *   @code{.cpp}
-     *
-     *   struct FunctorEachParticleInXCell5
-     *   {
-     *       template< typename T_Particle >
-     *       HDINLINE bool operator()(
-     *           DataSpace< simDim > const & particleOffsetToTotalOrigin,
-     *           T_Particle const & particle
-     *       )
-     *       {
-     *           bool result = false;
-     *           if( particleOffsetToTotalOrigin.x() == 5 )
-     *               result = true;
-     *           return result;
-     *       }
-     *       static constexpr char const * name = "eachParticleInXCell5";
-     *   };
-     *
-     *   using EachParticleInXCell5 = generic::FreeTotalCellOffset<
-     *      FunctorEachParticleInXCell5
-     *   >;
-     *   @endcode
-     */
-    template< typename T_Functor >
-    struct FreeTotalCellOffset;
+    namespace particles
+    {
+        namespace filter
+        {
+            namespace generic
+            {
+                /** call simple free user defined functor and provide the cell information
+                 *
+                 * The cell offset of the particle relative to the total domain origin
+                 * is passed into the user functor together with the particle.
+                 *
+                 * @tparam T_Functor user defined functor, called with the cell offset and the particle
+                 *
+                 * example for `particleFilters.param`: each particle with a cell offset of 5
+                 * in X direction
+                 *   @code{.cpp}
+                 *
+                 *   struct FunctorEachParticleInXCell5
+                 *   {
+                 *       template< typename T_Particle >
+                 *       HDINLINE bool operator()(
+                 *           DataSpace< simDim > const & particleOffsetToTotalOrigin,
+                 *           T_Particle const & particle
+                 *       )
+                 *       {
+                 *           bool result = false;
+                 *           if( particleOffsetToTotalOrigin.x() == 5 )
+                 *               result = true;
+                 *           return result;
+                 *       }
+                 *       static constexpr char const * name = "eachParticleInXCell5";
+                 *   };
+                 *
+                 *   using EachParticleInXCell5 = generic::FreeTotalCellOffset<
+                 *      FunctorEachParticleInXCell5
+                 *   >;
+                 *   @endcode
+                 */
+                template<typename T_Functor>
+                struct FreeTotalCellOffset;
 
-} // namespace generic
-} // namespace filter
-} // namespace particles
+            } // namespace generic
+        } // namespace filter
+    } // namespace particles
 } // namespace picongpu
diff --git a/include/picongpu/particles/filter/generic/FreeTotalCellOffset.hpp b/include/picongpu/particles/filter/generic/FreeTotalCellOffset.hpp
index 744c3e6ee8..b0d76bdfd1 100644
--- a/include/picongpu/particles/filter/generic/FreeTotalCellOffset.hpp
+++ b/include/picongpu/particles/filter/generic/FreeTotalCellOffset.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2017-2020 Rene Widera
+/* Copyright 2017-2021 Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -28,138 +28,113 @@
 
 namespace picongpu
 {
-namespace particles
-{
-namespace filter
-{
-namespace generic
-{
-namespace acc
-{
-    template< typename T_Functor >
-    struct FreeTotalCellOffset : private T_Functor
+    namespace particles
     {
-
-        using Functor = T_Functor;
-
-        HDINLINE FreeTotalCellOffset(
-            Functor const & functor,
-            DataSpace< simDim > const & superCellToLocalOriginCellOffset
-        ) :
-            T_Functor( functor ),
-            m_superCellToLocalOriginCellOffset( superCellToLocalOriginCellOffset )
+        namespace filter
         {
-        }
-
-        /** call user functor
-         *
-         * The random number generator is initialized with the first call.
-         *
-         * @tparam T_Particle type of the particle to manipulate
-         * @tparam T_Args type of the arguments passed to the user functor
-         * @tparam T_Acc alpaka accelerator type
-         *
-         * @param alpaka accelerator
-         * @param particle particle which is given to the user functor
-         * @return void is used to enable the operator if the user functor except two arguments
-         */
-        template<
-            typename T_Particle,
-            typename T_Acc
-        >
-        HDINLINE
-        bool operator()(
-            T_Acc const &,
-            T_Particle const & particle
-        )
-        {
-            bool filterResult = false;
-            if( particle.isHandleValid( ) )
+            namespace generic
             {
-                DataSpace< simDim > const cellInSuperCell(
-                    DataSpaceOperations< simDim >::template map< SuperCellSize > ( particle[ localCellIdx_ ] )
-                );
-                filterResult = Functor::operator( )(
-                    m_superCellToLocalOriginCellOffset + cellInSuperCell,
-                    particle
-                );
-            }
-            return filterResult;
-        }
-
-    private:
-
-        DataSpace< simDim > const m_superCellToLocalOriginCellOffset;
-    };
-} // namespace acc
-
-    template< typename T_Functor >
-    struct FreeTotalCellOffset :
-        protected functor::User< T_Functor >,
-        private functor::misc::TotalCellOffset
-    {
-        using CellOffsetFunctor = functor::misc::TotalCellOffset;
-        using Functor = functor::User< T_Functor >;
-
-        template< typename T_SpeciesType >
-        struct apply
-        {
-            using type = FreeTotalCellOffset;
-        };
-
-        /** constructor
-         *
-         * @param currentStep current simulation time step
-         */
-        HINLINE FreeTotalCellOffset( uint32_t currentStep ) :
-            Functor( currentStep ),
-            CellOffsetFunctor( currentStep )
-        {
-        }
-
-        /** create functor for the accelerator
-         *
-         * @tparam T_WorkerCfg pmacc::mappings::threads::WorkerCfg, configuration of the worker
-         * @tparam T_Acc alpaka accelerator type
-         *
-         * @param alpaka accelerator
-         * @param localSupercellOffset offset (in superCells, without any guards) relative
-         *                        to the origin of the local domain
-         * @param workerCfg configuration of the worker
-         */
-        template<
-            typename T_WorkerCfg,
-            typename T_Acc
-        >
-        HDINLINE auto
-        operator()(
-            T_Acc const & acc,
-            DataSpace< simDim > const & localSupercellOffset,
-            T_WorkerCfg const & workerCfg
-        ) const
-        -> acc::FreeTotalCellOffset< Functor >
-        {
-            auto & cellOffsetFunctor = *static_cast< CellOffsetFunctor const * >( this );
-            return acc::FreeTotalCellOffset< Functor >(
-                *static_cast< Functor const * >( this ),
-                cellOffsetFunctor(
-                    acc,
-                    localSupercellOffset,
-                    workerCfg
-                )
-            );
-        }
-
-        static
-        HINLINE std::string
-        getName( )
-        {
-            // we provide the name from the param class
-            return Functor::name;
-        }
-    };
-
-} // namespace generic
-} // namespace filter
-} // namespace particles
+                namespace acc
+                {
+                    template<typename T_Functor>
+                    struct FreeTotalCellOffset : private T_Functor
+                    {
+                        using Functor = T_Functor;
+
+                        HDINLINE FreeTotalCellOffset(
+                            Functor const& functor,
+                            DataSpace<simDim> const& superCellToLocalOriginCellOffset)
+                            : T_Functor(functor)
+                            , m_superCellToLocalOriginCellOffset(superCellToLocalOriginCellOffset)
+                        {
+                        }
+
+                        /** call user functor
+                         *
+                         * The functor receives the offset stored at construction plus the particle's
+                         * `localCellIdx_` mapped to a cell index via `SuperCellSize`, and the particle itself.
+                         *
+                         * @tparam T_Particle type of the particle handed to the user functor
+                         * @tparam T_Acc alpaka accelerator type
+                         *
+                         * @param alpaka accelerator (unused)
+                         * @param particle particle which is given to the user functor
+                         * @return false if the particle handle is invalid, else the result of the user functor
+                         */
+                        template<typename T_Particle, typename T_Acc>
+                        HDINLINE bool operator()(T_Acc const&, T_Particle const& particle)
+                        {
+                            bool filterResult = false;
+                            if(particle.isHandleValid())
+                            {
+                                DataSpace<simDim> const cellInSuperCell(
+                                    DataSpaceOperations<simDim>::template map<SuperCellSize>(particle[localCellIdx_]));
+                                filterResult = Functor::operator()(
+                                    m_superCellToLocalOriginCellOffset + cellInSuperCell,
+                                    particle);
+                            }
+                            return filterResult;
+                        }
+
+                    private:
+                        DataSpace<simDim> const m_superCellToLocalOriginCellOffset;
+                    };
+                } // namespace acc
+
+                template<typename T_Functor>
+                struct FreeTotalCellOffset
+                    : protected functor::User<T_Functor>
+                    , private functor::misc::TotalCellOffset
+                {
+                    using CellOffsetFunctor = functor::misc::TotalCellOffset;
+                    using Functor = functor::User<T_Functor>;
+
+                    template<typename T_SpeciesType>
+                    struct apply
+                    {
+                        using type = FreeTotalCellOffset;
+                    };
+
+                    /** constructor
+                     *
+                     * @param currentStep current simulation time step, forwarded to both base classes
+                     */
+                    HINLINE FreeTotalCellOffset(uint32_t currentStep)
+                        : Functor(currentStep)
+                        , CellOffsetFunctor(currentStep)
+                    {
+                    }
+
+                    /** create functor for the accelerator
+                     *
+                     * @tparam T_WorkerCfg pmacc::mappings::threads::WorkerCfg, configuration of the worker
+                     * @tparam T_Acc alpaka accelerator type
+                     *
+                     * @param acc alpaka accelerator
+                     * @param localSupercellOffset offset (in superCells, without any guards) relative
+                     *                        to the origin of the local domain
+                     * @param workerCfg configuration of the worker
+                     */
+                    template<typename T_WorkerCfg, typename T_Acc>
+                    HDINLINE auto operator()(
+                        T_Acc const& acc,
+                        DataSpace<simDim> const& localSupercellOffset,
+                        T_WorkerCfg const& workerCfg) const -> acc::FreeTotalCellOffset<Functor>
+                    {
+                        auto& cellOffsetFunctor = *static_cast<CellOffsetFunctor const*>(this);
+                        return acc::FreeTotalCellOffset<Functor>(
+                            *static_cast<Functor const*>(this),
+                            cellOffsetFunctor(acc, localSupercellOffset, workerCfg));
+                    }
+
+                    static HINLINE std::string getName()
+                    {
+                        // we provide the name from the param class
+                        return Functor::name;
+                    }
+                };
+
+            } // namespace generic
+        } // namespace filter
+    } // namespace particles
 } // namespace picongpu
diff --git a/include/picongpu/particles/flylite/IFlyLite.hpp b/include/picongpu/particles/flylite/IFlyLite.hpp
index 7a0b9b8cae..1f54dc3164 100644
--- a/include/picongpu/particles/flylite/IFlyLite.hpp
+++ b/include/picongpu/particles/flylite/IFlyLite.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2017-2020 Axel Huebl
+/* Copyright 2017-2021 Axel Huebl
  *
  *
  * This file is part of PIConGPU.
@@ -31,57 +31,48 @@
 
 namespace picongpu
 {
-namespace particles
-{
-namespace flylite
-{
-    /** Interface for a method of solving population kinetics
-     */
-    class IFlyLite
+    namespace particles
     {
-    public:
-        /** Allocate & Initialize Memory Buffers for Algorithms
-         *
-         * @param gridSizeLocal local size of electro-magnetic fields on the cells
-         * @param ionSpeciesName unique name for the ion species
-         */
-        virtual void init(
-            pmacc::DataSpace< simDim > const & gridSizeLocal,
-            std::string const & ionSpeciesName
-        ) = 0;
-
-        /** Calculate Evolution of Populations for One Time Step
-         *
-         * Interface for the update of the atomic populations during the PIC
-         * cycle.
-         *
-         * @param ionSpeciesName unique name for the ion species
-         * @param currentStep the current time step of the simulation
-         */
-        template<
-            typename T_IonSpecies
-        >
-        void update(
-            std::string const & ionSpeciesName,
-            uint32_t currentStep
-        )
+        namespace flylite
         {
-            boost::ignore_unused( ionSpeciesName, currentStep );
-            /* The compiler is allowed to evaluate an expression those not depends on a template parameter
-             * even if the class is never instantiated. In that case static assert is always
-             * evaluated (e.g. with clang), this results in an error if the condition is false.
-             * http://www.boost.org/doc/libs/1_60_0/doc/html/boost_staticassert.html
-             *
-             * A workaround is to add a template dependency to the expression.
-             * `sizeof(ANY_TYPE) != 0` is always true and defers the evaluation.
+            /** Interface for a method of solving population kinetics
              */
-            PMACC_STATIC_ASSERT_MSG(
-                false && sizeof(T_IonSpecies) != 0,
-                FLYlite_the_update_method_for_ion_population_kinetics_is_not_implemented
-            );
-        }
+            class IFlyLite
+            {
+            public:
+                /** Allocate & Initialize Memory Buffers for Algorithms
+                 *
+                 * @param gridSizeLocal local size of electro-magnetic fields on the cells
+                 * @param ionSpeciesName unique name for the ion species
+                 */
+                virtual void init(pmacc::DataSpace<simDim> const& gridSizeLocal, std::string const& ionSpeciesName)
+                    = 0;
 
-    };
-} // namespace flylite
-} // namespace particles
+                /** Calculate Evolution of Populations for One Time Step
+                 *
+                 * Interface for the update of the atomic populations during the PIC cycle.
+                 * This base version is not implemented: instantiating it triggers a static assert.
+                 *
+                 * @param ionSpeciesName unique name for the ion species
+                 * @param currentStep the current time step of the simulation
+                 */
+                template<typename T_IonSpecies>
+                void update(std::string const& ionSpeciesName, uint32_t currentStep)
+                {
+                    boost::ignore_unused(ionSpeciesName, currentStep);
+                    /* The compiler is allowed to evaluate an expression that does not depend on a template parameter
+                     * even if the class is never instantiated. In that case the static assert is always
+                     * evaluated (e.g. with clang), which results in an error if the condition is false.
+                     * http://www.boost.org/doc/libs/1_60_0/doc/html/boost_staticassert.html
+                     *
+                     * A workaround is to add a template dependency to the expression.
+                     * `sizeof(ANY_TYPE) != 0` is always true and defers the evaluation.
+                     */
+                    PMACC_STATIC_ASSERT_MSG(
+                        false && sizeof(T_IonSpecies) != 0,
+                        FLYlite_the_update_method_for_ion_population_kinetics_is_not_implemented, );
+                }
+            };
+        } // namespace flylite
+    } // namespace particles
 } // namespace picongpu
diff --git a/include/picongpu/particles/flylite/NonLTE.def b/include/picongpu/particles/flylite/NonLTE.def
index 11cf3c1a58..8b78f6f2ee 100644
--- a/include/picongpu/particles/flylite/NonLTE.def
+++ b/include/picongpu/particles/flylite/NonLTE.def
@@ -1,4 +1,4 @@
-/* Copyright 2017-2020 Axel Huebl
+/* Copyright 2017-2021 Axel Huebl
  *
  *
  * This file is part of PIConGPU.
@@ -23,32 +23,31 @@
 
 namespace picongpu
 {
-namespace particles
-{
-namespace flylite
-{
-    /** Non-LTE Steady-State
-     *
-     * Implementation of non-LTE ionization dynamics.
-     * @todo later on, add references on the overall model here.
-     *
-     * @todo add T_OtherIonsList for multi ion species IPD
-     *
-     * @tparam T_ElectronsList A mpl sequence of picongpu::Particles with a list
-     *                         of electron species for local density and energy
-     *                         histogram binning
-     *
-     * @tparam T_PhotonsList A mpl sequence of picongpu::Particles with a list
-     *                       of photon species for local energy histogram
-     *                       binning
-     */
-    template<
-        /* typename T_OtherIonsList, */
-        typename T_ElectronsList,
-        typename T_PhotonsList
-    >
-    class NonLTE;
+    namespace particles
+    {
+        namespace flylite
+        {
+            /** Non-LTE Steady-State
+             *
+             * Implementation of non-LTE ionization dynamics.
+             * @todo later on, add references on the overall model here.
+             *
+             * @todo add T_OtherIonsList for multi ion species IPD
+             *
+             * @tparam T_ElectronsList An mpl sequence of picongpu::Particles with a list
+             *                         of electron species for local density and energy
+             *                         histogram binning
+             *
+             * @tparam T_PhotonsList An mpl sequence of picongpu::Particles with a list
+             *                       of photon species for local energy histogram
+             *                       binning
+             */
+            template<
+                /* typename T_OtherIonsList, */
+                typename T_ElectronsList,
+                typename T_PhotonsList>
+            class NonLTE;
 
-} // namespace flylite
-} // namespace particles
+        } // namespace flylite
+    } // namespace particles
 } // namespace picongpu
diff --git a/include/picongpu/particles/flylite/NonLTE.hpp b/include/picongpu/particles/flylite/NonLTE.hpp
index 7550c52a19..f1b61c1aa3 100644
--- a/include/picongpu/particles/flylite/NonLTE.hpp
+++ b/include/picongpu/particles/flylite/NonLTE.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2017-2020 Axel Huebl
+/* Copyright 2017-2021 Axel Huebl
  *
  * This file is part of PIConGPU.
  *
@@ -31,70 +31,51 @@
 
 namespace picongpu
 {
-namespace particles
-{
-namespace flylite
-{
-    template<
-        //! @todo for multi ion species IPD: typename T_OtherIonsList,
-        typename T_ElectronsList,
-        typename T_PhotonsList
-    >
-    class NonLTE : public IFlyLite
+    namespace particles
     {
-    public:
-        //! @todo for multi ion species IPD: using OtherIonsList = T_OtherIonsList;
-
-        using ElectronsList = T_ElectronsList;
-        using PhotonsList = T_PhotonsList;
+        namespace flylite
+        {
+            template<
+                //! @todo for multi ion species IPD: typename T_OtherIonsList,
+                typename T_ElectronsList,
+                typename T_PhotonsList>
+            class NonLTE : public IFlyLite
+            {
+            public:
+                //! @todo for multi ion species IPD: using OtherIonsList = T_OtherIonsList;
 
-        virtual
-        void
-        init(
-            pmacc::DataSpace< simDim > const & gridSizeLocal,
-            std::string const & ionSpeciesName
-        );
+                using ElectronsList = T_ElectronsList;
+                using PhotonsList = T_PhotonsList;
 
-        /** Update atomic configurations
-         *
-         * Prepares auxiliary fields for the non-LTE atomic physics model and
-         * updates the configurations & charge states of an ion species.
-         *
-         * @tparam T_IonSpeciesType a picongpu::Particles class with an ion
-         *                          species
-         *
-         * @param ionSpeciesName unique name of the ion species in T_IonSpeciesType
-         * @param currentStep the current time step
-         */
-        template<
-            typename T_IonSpeciesType
-        >
-        void
-        update(
-            std::string const & ionSpeciesName,
-            uint32_t currentStep
-        );
+                virtual void init(pmacc::DataSpace<simDim> const& gridSizeLocal, std::string const& ionSpeciesName);
 
-    private:
-        /** Calculate new values in helper fields
-         *
-         * Prepares helper fields by calculating local densities and energy
-         * histograms.
-         *
-         * @param ionSpeciesName unique name of the ion species in T_IonSpeciesType
-         * @param currentStep the current time step
-         */
-        template<
-            typename T_IonSpeciesType
-        >
-        void
-        fillHelpers(
-            std::string const & ionSpeciesName,
-            uint32_t currentStep
-        );
+                /** Update atomic configurations
+                 *
+                 * Prepares auxiliary fields for the non-LTE atomic physics model and
+                 * updates the configurations & charge states of an ion species.
+                 *
+                 * @tparam T_IonSpeciesType a picongpu::Particles class with an ion
+                 *                          species
+                 *
+                 * @param ionSpeciesName unique name of the ion species in T_IonSpeciesType
+                 * @param currentStep the current time step
+                 */
+                template<typename T_IonSpeciesType>
+                void update(std::string const& ionSpeciesName, uint32_t currentStep);
 
-    };
+            private:
+                /** Calculate new values in helper fields
+                 *
+                 * Prepares helper fields by calculating local densities and energy histograms.
+                 *
+                 * @tparam T_IonSpeciesType type of the ion species named by ionSpeciesName
+                 * @param ionSpeciesName unique name of the ion species in T_IonSpeciesType
+                 * @param currentStep the current time step
+                 */
+                template<typename T_IonSpeciesType>
+                void fillHelpers(std::string const& ionSpeciesName, uint32_t currentStep);
+            };
 
-} // namespace flylite
-} // namespace particles
+        } // namespace flylite
+    } // namespace particles
 } // namespace picongpu
diff --git a/include/picongpu/particles/flylite/NonLTE.tpp b/include/picongpu/particles/flylite/NonLTE.tpp
index aae44341bc..0dda1cbf11 100644
--- a/include/picongpu/particles/flylite/NonLTE.tpp
+++ b/include/picongpu/particles/flylite/NonLTE.tpp
@@ -1,4 +1,4 @@
-/* Copyright 2017-2020 Axel Huebl
+/* Copyright 2017-2021 Axel Huebl
  *
  * This file is part of PIConGPU.
  *
@@ -27,172 +27,115 @@
 #include "picongpu/particles/particleToGrid/derivedAttributes/Density.def"
 #include "picongpu/particles/traits/GetShape.hpp"
 
-/* pmacc */
 #include <pmacc/Environment.hpp>
 #include <pmacc/dataManagement/ISimulationData.hpp>
 #include <pmacc/traits/GetNumWorkers.hpp>
-#include <pmacc/memory/MakeUnique.hpp>
 
 #include <memory>
 
 
 namespace picongpu
 {
-namespace particles
-{
-namespace flylite
-{
-    template<
-        //! @todo for multi ion species IPD: typename T_OtherIonsList
-
-        typename T_ElectronsList,
-        typename T_PhotonsList
-    >
-    void
-    NonLTE<
-        T_ElectronsList,
-        T_PhotonsList
-    >::init(
-        pmacc::DataSpace< simDim > const & gridSizeLocal,
-        std::string const & ionSpeciesName
-    )
+    namespace particles
     {
-        //! GPU-local number of cells in regular resolution (like FieldE & B)
-        pmacc::DataSpace< simDim > m_gridSizeLocal = gridSizeLocal;
-        //! GPU-local number of cells in averaged (reduced) resolution
-        pmacc::DataSpace< simDim > m_avgGridSizeLocal = m_gridSizeLocal / picongpu::flylite::spatialAverageBox::toRT();
-
-        DataConnector &dc = Environment<>::get().DataConnector();
-
-        using pmacc::memory::makeUnique;
-        // once allocated for all ion species to share
-        if( ! dc.hasId( helperFields::LocalEnergyHistogram::getName( "electrons" ) ) )
-            dc.consume(
-                makeUnique< helperFields::LocalEnergyHistogram >(
+        namespace flylite
+        {
+            template<
+                //! @todo for multi ion species IPD: typename T_OtherIonsList
+
+                typename T_ElectronsList,
+                typename T_PhotonsList>
+            void NonLTE<T_ElectronsList, T_PhotonsList>::init(
+                pmacc::DataSpace<simDim> const& gridSizeLocal,
+                std::string const& ionSpeciesName)
+            {
+                // GPU-local number of cells in regular resolution (like FieldE & B); a local, despite the m_ prefix
+                pmacc::DataSpace<simDim> m_gridSizeLocal = gridSizeLocal;
+                // GPU-local number of cells in averaged (reduced) resolution; also a local, despite the m_ prefix
+                pmacc::DataSpace<simDim> m_avgGridSizeLocal
+                    = m_gridSizeLocal / picongpu::flylite::spatialAverageBox::toRT();
+
+                DataConnector& dc = Environment<>::get().DataConnector();
+
+                // shared by all ion species: each helper is allocated only when dc.hasId() does not know its name
+                if(!dc.hasId(helperFields::LocalEnergyHistogram::getName("electrons")))
+                    dc.consume(std::make_unique<helperFields::LocalEnergyHistogram>("electrons", m_avgGridSizeLocal));
+
+                if(!dc.hasId(helperFields::LocalEnergyHistogram::getName("photons")))
+                    dc.consume(std::make_unique<helperFields::LocalEnergyHistogram>("photons", m_avgGridSizeLocal));
+
+                if(!dc.hasId(helperFields::LocalDensity::getName("electrons")))
+                    dc.consume(std::make_unique<helperFields::LocalDensity>("electrons", m_avgGridSizeLocal));
+
+                // per ion species: helpers named after ionSpeciesName, guarded by the same hasId() check
+                if(!dc.hasId(helperFields::LocalRateMatrix::getName(ionSpeciesName)))
+                    dc.consume(std::make_unique<helperFields::LocalRateMatrix>(ionSpeciesName, m_avgGridSizeLocal));
+
+                if(!dc.hasId(helperFields::LocalDensity::getName(ionSpeciesName)))
+                    dc.consume(std::make_unique<helperFields::LocalDensity>(ionSpeciesName, m_avgGridSizeLocal));
+            }
+
+            template<
+                //! @todo for multi ion species IPD: typename T_OtherIonsList,
+
+                typename T_ElectronsList,
+                typename T_PhotonsList>
+            template<typename T_IonSpeciesType>
+            void NonLTE<
+                //! @todo for multi ion species IPD: T_OtherIonsList,
+
+                T_ElectronsList,
+                T_PhotonsList>::update(std::string const& ionSpeciesName, uint32_t currentStep)
+            {
+                using IonSpeciesType = T_IonSpeciesType;
+
+                // only step implemented so far: calculate density fields and energy histograms
+                fillHelpers<IonSpeciesType>(ionSpeciesName, currentStep);
+
+                //! @todo calculate rate matrix
+                //! @todo implicit ODE solve to evolve populations
+                //! @todo modify f_e of free electrons
+                //! @todo modify f_ph of photon field (absorb)
+                //! @todo change charges, create electrons & photons
+            }
+
+            template<
+                //! @todo for multi ion species IPD: typename T_OtherIonsList,
+
+                typename T_ElectronsList,
+                typename T_PhotonsList>
+            template<typename T_IonSpeciesType>
+            void NonLTE<
+                //! @todo for multi ion species IPD: T_OtherIonsList,
+
+                T_ElectronsList,
+                T_PhotonsList>::fillHelpers(std::string const& ionSpeciesName, uint32_t currentStep)
+            {
+                using IonSpeciesType = T_IonSpeciesType;
+
+                // calculate density fields
+                helperFields::FillLocalDensity<MakeSeq_t<IonSpeciesType>> fillDensityIons{};
+                fillDensityIons(currentStep, ionSpeciesName);
+
+                helperFields::FillLocalDensity<T_ElectronsList> fillDensityElectrons{};
+                fillDensityElectrons(currentStep, "electrons");
+
+                // calculate energy histograms: f(e), f(ph)
+                helperFields::FillLocalEnergyHistogram<T_ElectronsList> fillEnergyHistogramElectrons{};
+                fillEnergyHistogramElectrons(
+                    currentStep,
                     "electrons",
-                    m_avgGridSizeLocal
-                )
-            );
+                    picongpu::flylite::electronMinEnergy,
+                    picongpu::flylite::electronMaxEnergy);
 
-        if( ! dc.hasId( helperFields::LocalEnergyHistogram::getName( "photons" ) ) )
-            dc.consume(
-                makeUnique< helperFields::LocalEnergyHistogram >(
+                helperFields::FillLocalEnergyHistogram<T_PhotonsList> fillEnergyHistogramPhotons{};
+                fillEnergyHistogramPhotons(
+                    currentStep,
                     "photons",
-                    m_avgGridSizeLocal
-                )
-            );
+                    picongpu::flylite::photonMinEnergy,
+                    picongpu::flylite::photonMaxEnergy);
+            }
 
-        if( ! dc.hasId( helperFields::LocalDensity::getName( "electrons" ) ) )
-            dc.consume(
-                makeUnique< helperFields::LocalDensity >(
-                    "electrons",
-                    m_avgGridSizeLocal
-                )
-            );
-
-        // for each ion species
-        if( ! dc.hasId( helperFields::LocalRateMatrix::getName( ionSpeciesName ) ) )
-            dc.consume(
-                makeUnique< helperFields::LocalRateMatrix >(
-                    ionSpeciesName,
-                    m_avgGridSizeLocal
-                )
-            );
-
-        if( ! dc.hasId( helperFields::LocalDensity::getName( ionSpeciesName ) ) )
-            dc.consume(
-                makeUnique< helperFields::LocalDensity >(
-                    ionSpeciesName,
-                    m_avgGridSizeLocal
-                )
-            );
-    }
-
-    template<
-        //! @todo for multi ion species IPD: typename T_OtherIonsList,
-
-        typename T_ElectronsList,
-        typename T_PhotonsList
-    >
-    template<
-        typename T_IonSpeciesType
-    >
-    void
-    NonLTE<
-        //! @todo for multi ion species IPD: T_OtherIonsList,
-
-        T_ElectronsList,
-        T_PhotonsList
-    >::update(
-        std::string const & ionSpeciesName,
-        uint32_t currentStep
-    )
-    {
-        using IonSpeciesType = T_IonSpeciesType;
-
-        // calculate density fields and energy histograms
-        fillHelpers< IonSpeciesType >( ionSpeciesName, currentStep );
-
-        //! @todo calculate rate matrix
-        //! @todo implicit ODE solve to evolve populations
-        //! @todo modify f_e of free electrons
-        //! @todo modify f_ph of photon field (absorb)
-        //! @todo change charges, create electrons & photons
-    }
-
-    template<
-        //! @todo for multi ion species IPD: typename T_OtherIonsList,
-
-        typename T_ElectronsList,
-        typename T_PhotonsList
-    >
-    template<
-        typename T_IonSpeciesType
-    >
-    void
-    NonLTE<
-        //! @todo for multi ion species IPD: T_OtherIonsList,
-
-        T_ElectronsList,
-        T_PhotonsList
-    >::fillHelpers(
-        std::string const & ionSpeciesName,
-        uint32_t currentStep
-    )
-    {
-        using IonSpeciesType = T_IonSpeciesType;
-
-        // calculate density fields
-        helperFields::FillLocalDensity< MakeSeq_t< IonSpeciesType > > fillDensityIons{};
-        fillDensityIons(
-            currentStep,
-            ionSpeciesName
-        );
-
-        helperFields::FillLocalDensity< T_ElectronsList > fillDensityElectrons{};
-        fillDensityElectrons(
-            currentStep,
-            "electrons"
-        );
-
-        // calculate energy histograms: f(e), f(ph)
-        helperFields::FillLocalEnergyHistogram< T_ElectronsList > fillEnergyHistogramElectrons{};
-        fillEnergyHistogramElectrons(
-            currentStep,
-            "electrons",
-            picongpu::flylite::electronMinEnergy,
-            picongpu::flylite::electronMaxEnergy
-        );
-
-        helperFields::FillLocalEnergyHistogram< T_PhotonsList > fillEnergyHistogramPhotons{};
-        fillEnergyHistogramPhotons(
-            currentStep,
-            "photons",
-            picongpu::flylite::photonMinEnergy,
-            picongpu::flylite::photonMaxEnergy
-        );
-    }
-
-} // namespace flylite
-} // namespace particles
+        } // namespace flylite
+    } // namespace particles
 } // namespace picongpu
diff --git a/include/picongpu/particles/flylite/helperFields/LocalDensity.hpp b/include/picongpu/particles/flylite/helperFields/LocalDensity.hpp
index 86fb052be5..a2e0a9e76a 100644
--- a/include/picongpu/particles/flylite/helperFields/LocalDensity.hpp
+++ b/include/picongpu/particles/flylite/helperFields/LocalDensity.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2017-2020 Axel Huebl
+/* Copyright 2017-2021 Axel Huebl
  *
  * This file is part of PIConGPU.
  *
@@ -33,78 +33,69 @@
 
 namespace picongpu
 {
-namespace particles
-{
-namespace flylite
-{
-namespace helperFields
-{
-    class LocalDensity :
-        public ISimulationData
+    namespace particles
     {
-    public:
-        using ValueType = float_X;
+        namespace flylite
+        {
+            namespace helperFields
+            {
+                class LocalDensity : public ISimulationData
+                {
+                public:
+                    using ValueType = float_X;
 
-    private:
-        GridBuffer< ValueType, simDim >* m_density;
-        std::string m_speciesGroup;
+                private:
+                    GridBuffer<ValueType, simDim>* m_density;
+                    std::string m_speciesGroup;
 
-    public:
-        /** Allocate and initialize local (number) density
-         *
-         * @param speciesGroup unique naming for the species inside this density,
-         *                     e.g. a collection of electron species or ions
-         * @param sizeLocal spatial size of the local density value
-         */
-        LocalDensity(
-            std::string const & speciesGroup,
-            DataSpace< simDim > const & sizeLocal
-        ) :
-            m_density( nullptr ),
-            m_speciesGroup( speciesGroup )
-        {
-            // without guards
-            m_density = new GridBuffer< ValueType, simDim >( sizeLocal );
-        }
+                public:
+                    /** Allocate and initialize local (number) density
+                     *
+                     * @param speciesGroup unique naming for the species inside this density,
+                     *                     e.g. a collection of electron species or ions
+                     * @param sizeLocal spatial size of the local density value
+                     */
+                    LocalDensity(std::string const& speciesGroup, DataSpace<simDim> const& sizeLocal)
+                        : m_density(nullptr)
+                        , m_speciesGroup(speciesGroup)
+                    {
+                        // without guards
+                        m_density = new GridBuffer<ValueType, simDim>(sizeLocal);
+                    }
 
-        ~LocalDensity()
-        {
-            __delete( m_density );
-        }
+                    ~LocalDensity()
+                    {
+                        __delete(m_density);
+                    }
 
-        static std::string
-        getName( std::string const & speciesGroup )
-        {
-            return speciesGroup + "_LocalDensity";
-        }
+                    static std::string getName(std::string const& speciesGroup)
+                    {
+                        return speciesGroup + "_LocalDensity";
+                    }
 
-        std::string
-        getName( )
-        {
-            return getName( m_speciesGroup );
-        }
+                    std::string getName()
+                    {
+                        return getName(m_speciesGroup);
+                    }
 
-        GridBuffer< ValueType, simDim >&
-        getGridBuffer( )
-        {
-            return *m_density;
-        }
+                    GridBuffer<ValueType, simDim>& getGridBuffer()
+                    {
+                        return *m_density;
+                    }
 
-        /* implement ISimulationData members */
-        void
-        synchronize() override
-        {
-            m_density->deviceToHost( );
-        }
+                    /* implement ISimulationData members */
+                    void synchronize() override
+                    {
+                        m_density->deviceToHost();
+                    }
 
-        SimulationDataId
-        getUniqueId() override
-        {
-            return getName();
-        }
-    };
+                    SimulationDataId getUniqueId() override
+                    {
+                        return getName();
+                    }
+                };
 
-} // namespace helperFields
-} // namespace flylite
-} // namespace particles
+            } // namespace helperFields
+        } // namespace flylite
+    } // namespace particles
 } // namespace picongpu
diff --git a/include/picongpu/particles/flylite/helperFields/LocalDensity.kernel b/include/picongpu/particles/flylite/helperFields/LocalDensity.kernel
index e2be05a3a4..b82dbac0c0 100644
--- a/include/picongpu/particles/flylite/helperFields/LocalDensity.kernel
+++ b/include/picongpu/particles/flylite/helperFields/LocalDensity.kernel
@@ -1,4 +1,4 @@
-/* Copyright 2017-2020 Axel Huebl, Rene Widera
+/* Copyright 2017-2021 Axel Huebl, Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -34,124 +34,98 @@
 
 namespace picongpu
 {
-namespace particles
-{
-namespace flylite
-{
-namespace helperFields
-{
-    /** Average a FieldTmp density to a smaller resolution
-     *
-     * Average a FieldTmp density to a smaller (per-supercell) resolution and
-     * add it to a local density field.
-     *
-     * @tparam T_numWorkers number of workers for lockstep execution per block,
-     *                      arbitrary for reduce since it will loop over the
-     *                      source size when necessary
-     */
-    template<
-        uint32_t T_numWorkers
-    >
-    struct KernelAverageDensity
+    namespace particles
     {
-        /** Functor
-         *
-         * @tparam T_TmpBox pmacc::DataBox with full-resolution density
-         * @tparam T_LocalDensityBox pmacc::DataBox local density with less
-         *                           resolution
-         * @tparam T_Acc alpaka accelerator type
-         *
-         * @param alpaka accelerator
-         * @param fieldTmp pmacc::DataBox with FieldTmp density scalar field
-         * @param localDensity pmacc::DataBox with global memory, e.g. for each
-         *                     supercell's density
-         */
-        template<
-            typename T_TmpBox,
-            typename T_LocalDensityBox,
-            typename T_Acc
-        >
-        DINLINE void operator()(
-            T_Acc const & acc,
-            T_TmpBox fieldTmp,
-            T_LocalDensityBox localDensity
-        ) const
+        namespace flylite
         {
-            using picongpu::flylite::spatialAverageBox;
-            using ValueType = typename T_TmpBox::ValueType;
-            constexpr uint32_t numWorkers = T_numWorkers;
-
-            // cell index in the average box in reduced resolution
-            DataSpace< simDim > const avgBoxCell( blockIdx );
-            // first cell index inside FieldTmp (originating from BORDER) for block
-            DataSpace< simDim > const fieldTmpBlockOriginCell = avgBoxCell * spatialAverageBox::toRT();
-            // our workers per block are started 1D
-            uint32_t const linearThreadIdx( threadIdx.x );
-
-            // shift the fieldTmp to the start of average box
-            auto fieldTmpBlock = fieldTmp.shift( fieldTmpBlockOriginCell );
-
-            // shared memory for reduce
-            PMACC_SMEM(
-                acc,
-                shReduceBuffer,
-                memory::Array<
-                    ValueType,
-                    numWorkers
-                >
-            );
-
-            // re-map access indices to local average view
-            using D1Box = DataBoxDim1Access< T_TmpBox >;
-            D1Box d1access(
-                fieldTmpBlock,
-                spatialAverageBox::toRT()
-            );
-
-            __syncthreads();
-
-            uint32_t const numAvgCells = pmacc::math::CT::volume< spatialAverageBox >::type::value;
-
-            nvidia::reduce::kernel::Reduce<
-                ValueType,
-                numAvgCells,
-                numWorkers
-            > reduce{};
-
-
-            reduce(
-                acc,
-                mappings::threads::WorkerCfg< numWorkers >( linearThreadIdx ),
-                numAvgCells,
-                /* access inside local average view */
-                d1access,
-                numAvgCells,
-                nvidia::functors::Add(),
-                shReduceBuffer
-            );
-
-            /* continue with master
-             *
-             * - before working with this field, multiply by
-             *   particles::TYPICAL_NUM_PARTICLES_PER_MACROPARTICLE
-             * - divide by for average by numAvgCells
-             * - write back to global
-             *
-             * - change those lines if you want to re-use this kernel for a vector field
-             */
-            if( linearThreadIdx == 0 )
+            namespace helperFields
             {
-                ValueType localAverageResult = shReduceBuffer[ 0 ] *
-                    particles::TYPICAL_NUM_PARTICLES_PER_MACROPARTICLE /
-                    float_X( numAvgCells );
-
-                localDensity( avgBoxCell ) =
-                    static_cast< typename T_LocalDensityBox::ValueType >( localAverageResult );
-            }
-        }
-    };
-
-} // namespace helperFields
-} // namespace flylite
-} // namespace particles
+                /** Average a FieldTmp density to a smaller resolution
+                 *
+                 * Average a FieldTmp density to a smaller (per-supercell) resolution and
+                 * store it in a local density field (the target value is assigned, not accumulated).
+                 *
+                 * @tparam T_numWorkers number of workers for lockstep execution per block,
+                 *                      arbitrary for reduce since it will loop over the
+                 *                      source size when necessary
+                 */
+                template<uint32_t T_numWorkers>
+                struct KernelAverageDensity
+                {
+                    /** Functor
+                     *
+                     * @tparam T_TmpBox pmacc::DataBox with full-resolution density
+                     * @tparam T_LocalDensityBox pmacc::DataBox local density with less
+                     *                           resolution
+                     * @tparam T_Acc alpaka accelerator type
+                     *
+                     * @param acc alpaka accelerator
+                     * @param fieldTmp pmacc::DataBox with FieldTmp density scalar field
+                     * @param localDensity pmacc::DataBox with global memory, e.g. for each
+                     *                     supercell's density
+                     */
+                    template<typename T_TmpBox, typename T_LocalDensityBox, typename T_Acc>
+                    DINLINE void operator()(T_Acc const& acc, T_TmpBox fieldTmp, T_LocalDensityBox localDensity) const
+                    {
+                        using picongpu::flylite::spatialAverageBox;
+                        using ValueType = typename T_TmpBox::ValueType;
+                        constexpr uint32_t numWorkers = T_numWorkers;
+
+                        // index of this block's average box in reduced resolution (taken from the block index)
+                        DataSpace<simDim> const avgBoxCell(cupla::blockIdx(acc));
+                        // first cell index inside FieldTmp (originating from BORDER) for block
+                        DataSpace<simDim> const fieldTmpBlockOriginCell = avgBoxCell * spatialAverageBox::toRT();
+                        // our workers per block are started 1D
+                        uint32_t const linearThreadIdx(cupla::threadIdx(acc).x);
+
+                        // shift the fieldTmp to the start of average box
+                        auto fieldTmpBlock = fieldTmp.shift(fieldTmpBlockOriginCell);
+
+                        // shared memory for reduce
+                        PMACC_SMEM(acc, shReduceBuffer, memory::Array<ValueType, numWorkers>);
+
+                        // re-map access indices to local average view
+                        using D1Box = DataBoxDim1Access<T_TmpBox>;
+                        D1Box d1access(fieldTmpBlock, spatialAverageBox::toRT());
+
+                        cupla::__syncthreads(acc);
+
+                        uint32_t const numAvgCells = pmacc::math::CT::volume<spatialAverageBox>::type::value;
+
+                        nvidia::reduce::kernel::Reduce<ValueType, numAvgCells, numWorkers> reduce{};
+
+
+                        reduce(
+                            acc,
+                            mappings::threads::WorkerCfg<numWorkers>(linearThreadIdx),
+                            numAvgCells,
+                            /* access inside local average view */
+                            d1access,
+                            numAvgCells,
+                            nvidia::functors::Add(),
+                            shReduceBuffer);
+
+                        /* continue with master
+                         *
+                         * - before working with this field, multiply by
+                         *   particles::TYPICAL_NUM_PARTICLES_PER_MACROPARTICLE
+                         * - divide by numAvgCells for the average
+                         * - write back to global
+                         *
+                         * - change those lines if you want to re-use this kernel for a vector field
+                         */
+                        if(linearThreadIdx == 0)
+                        {
+                            ValueType localAverageResult = shReduceBuffer[0]
+                                * particles::TYPICAL_NUM_PARTICLES_PER_MACROPARTICLE / float_X(numAvgCells);
+
+                            localDensity(avgBoxCell)
+                                = static_cast<typename T_LocalDensityBox::ValueType>(localAverageResult);
+                        }
+                    }
+                };
+
+            } // namespace helperFields
+        } // namespace flylite
+    } // namespace particles
 } // namespace picongpu
diff --git a/include/picongpu/particles/flylite/helperFields/LocalDensityFunctors.hpp b/include/picongpu/particles/flylite/helperFields/LocalDensityFunctors.hpp
index a8cc35d766..d7b67d6018 100644
--- a/include/picongpu/particles/flylite/helperFields/LocalDensityFunctors.hpp
+++ b/include/picongpu/particles/flylite/helperFields/LocalDensityFunctors.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2017-2020 Axel Huebl
+/* Copyright 2017-2021 Axel Huebl
  *
  * This file is part of PIConGPU.
  *
@@ -35,136 +35,113 @@
 
 namespace picongpu
 {
-namespace particles
-{
-namespace flylite
-{
-namespace helperFields
-{
-namespace detail
-{
-    /** Average a group of species to a local density
-     *
-     * Takes a single species and fills the LocalDensity with it.
-     *
-     * @tparam T_SpeciesType a picongpu::Particles class with a particle species
-     */
-    template<
-        typename T_SpeciesType
-    >
-    struct AddSingleDensity
-    {
-        using SpeciesType = T_SpeciesType;
-        using FrameType = typename SpeciesType::FrameType;
-        using ShapeType = typename GetShape< SpeciesType >::type;
-
-        /** Functor
-         *
-         * @param currentStep the current time step
-         * @param fieldTmp a slot of FieldTmp to add a density to
-         */
-        void operator()(
-            uint32_t currentStep,
-            std::shared_ptr< FieldTmp > & fieldTmp
-        )
-        {
-            DataConnector &dc = Environment<>::get().DataConnector();
-
-            // load particle without copy particle data to host
-            auto speciesTmp = dc.get< SpeciesType >( FrameType::getName(), true );
-
-            using Density = particleToGrid::ComputeGridValuePerFrame<
-                ShapeType,
-                particleToGrid::derivedAttributes::Density
-            >;
-            fieldTmp->template computeValue< CORE + BORDER, Density >( *speciesTmp, currentStep );
-
-            dc.releaseData( FrameType::getName() );
-        }
-    };
-}
-    /** Average a group of species to a local density
-     *
-     * Takes a list of species and fills the LocalDensity with it.
-     * Ideally executed for a list of electron species or an ion species.
-     *
-     * @tparam T_SpeciesList sequence of picongpu::Particles to create a
-     *                       local density from
-     */
-    template<
-        typename T_SpeciesList
-    >
-    struct FillLocalDensity
+    namespace particles
     {
-        using SpeciesList = T_SpeciesList;
-
-        /** Functor
-         *
-         * @param currentStep the current time step
-         * @param speciesGroup naming for the group of species in T_SpeciesList
-         */
-        void operator()(
-            uint32_t currentStep,
-            std::string const & speciesGroup
-        )
+        namespace flylite
         {
-            // generating a density requires at least one slot in FieldTmp
-            PMACC_CASSERT_MSG(
-                _please_allocate_at_least_one_FieldTmp_in_memory_param,
-                fieldTmpNumSlots > 0
-            );
-
-            DataConnector &dc = Environment<>::get().DataConnector();
-
-            // load FieldTmp without copy data to host and zero it
-            auto fieldTmp = dc.get< FieldTmp >(
-                FieldTmp::getUniqueId( 0 ),
-                true
-            );
-            using DensityValueType = typename FieldTmp::ValueType;
-            fieldTmp->getGridBuffer().getDeviceBuffer().setValue( DensityValueType::create(0.0) );
-
-            // add density of each species in list to FieldTmp
-            meta::ForEach< SpeciesList, detail::AddSingleDensity< bmpl::_1 > > addSingleDensity;
-            addSingleDensity( currentStep, fieldTmp );
-
-            /* create valid density in the BORDER region
-             * note: for average != supercell multiples the GUARD of fieldTmp
-             *       also needs to be filled in the communication above
-             */
-            EventTask fieldTmpEvent = fieldTmp->asyncCommunication(__getTransactionEvent());
-            __setTransactionEvent(fieldTmpEvent);
-
-            /* average summed density in FieldTmp down to local resolution and
-             * write in new field
-             */
-            auto nlocal = dc.get< LocalDensity >(
-                helperFields::LocalDensity::getName( speciesGroup ),
-                true
-            );
-            constexpr uint32_t numWorkers = pmacc::traits::GetNumWorkers<
-                pmacc::math::CT::volume< SuperCellSize >::type::value
-            >::value;
-            PMACC_KERNEL( helperFields::KernelAverageDensity< numWorkers >{ } )
-            (
-                // one block per averaged density value
-                nlocal->getGridBuffer().getGridLayout().getDataSpaceWithoutGuarding(),
-                numWorkers
-            )
-            (
-                // start in border (jump over GUARD area)
-                fieldTmp->getDeviceDataBox().shift( SuperCellSize::toRT() * GuardSize::toRT() ),
-                // start in border (has no GUARD area)
-                nlocal->getGridBuffer().getDeviceBuffer( ).getDataBox( )
-            );
-
-            // release fields
-            dc.releaseData( FieldTmp::getUniqueId( 0 ) );
-            dc.releaseData( helperFields::LocalDensity::getName( speciesGroup ) );
-        }
-    };
-
-} // namespace helperFields
-} // namespace flylite
-} // namespace particles
+            namespace helperFields
+            {
+                namespace detail
+                {
+                    /** Add the density of a single species to FieldTmp
+                     *
+                     * Takes a single species and adds its density to a slot of FieldTmp.
+                     *
+                     * @tparam T_SpeciesType a picongpu::Particles class with a particle species
+                     */
+                    template<typename T_SpeciesType>
+                    struct AddSingleDensity
+                    {
+                        using SpeciesType = T_SpeciesType;
+                        using FrameType = typename SpeciesType::FrameType;
+                        using ShapeType = typename GetShape<SpeciesType>::type;
+
+                        /** Functor
+                         *
+                         * @param currentStep the current time step
+                         * @param fieldTmp a slot of FieldTmp to add a density to
+                         */
+                        void operator()(uint32_t currentStep, std::shared_ptr<FieldTmp>& fieldTmp)
+                        {
+                            DataConnector& dc = Environment<>::get().DataConnector();
+
+                            // load the particle species without copying particle data to host
+                            auto speciesTmp = dc.get<SpeciesType>(FrameType::getName(), true);
+
+                            using Density = particleToGrid::
+                                ComputeGridValuePerFrame<ShapeType, particleToGrid::derivedAttributes::Density>;
+                            fieldTmp->template computeValue<CORE + BORDER, Density>(*speciesTmp, currentStep);
+
+                            dc.releaseData(FrameType::getName());
+                        }
+                    };
+                } // namespace detail
+                /** Average a group of species to a local density
+                 *
+                 * Takes a list of species and fills the LocalDensity with it.
+                 * Ideally executed for a list of electron species or an ion species.
+                 *
+                 * @tparam T_SpeciesList sequence of picongpu::Particles to create a
+                 *                       local density from
+                 */
+                template<typename T_SpeciesList>
+                struct FillLocalDensity
+                {
+                    using SpeciesList = T_SpeciesList;
+
+                    /** Functor
+                     *
+                     * @param currentStep the current time step
+                     * @param speciesGroup naming for the group of species in T_SpeciesList
+                     */
+                    void operator()(uint32_t currentStep, std::string const& speciesGroup)
+                    {
+                        // generating a density requires at least one slot in FieldTmp
+                        PMACC_CASSERT_MSG(
+                            _please_allocate_at_least_one_FieldTmp_in_memory_param,
+                            fieldTmpNumSlots > 0);
+
+                        DataConnector& dc = Environment<>::get().DataConnector();
+
+                        // load FieldTmp slot 0 without copying data to host and zero it
+                        auto fieldTmp = dc.get<FieldTmp>(FieldTmp::getUniqueId(0), true);
+                        using DensityValueType = typename FieldTmp::ValueType;
+                        fieldTmp->getGridBuffer().getDeviceBuffer().setValue(DensityValueType::create(0.0));
+
+                        // add density of each species in list to FieldTmp
+                        meta::ForEach<SpeciesList, detail::AddSingleDensity<bmpl::_1>> addSingleDensity;
+                        addSingleDensity(currentStep, fieldTmp);
+
+                        /* create valid density in the BORDER region
+                         * note: for average != supercell multiples the GUARD of fieldTmp
+                         *       also needs to be filled in the communication above
+                         */
+                        EventTask fieldTmpEvent = fieldTmp->asyncCommunication(__getTransactionEvent());
+                        __setTransactionEvent(fieldTmpEvent);
+
+                        /* average summed density in FieldTmp down to local resolution and
+                         * write it to the LocalDensity field registered for speciesGroup
+                         */
+                        auto nlocal = dc.get<LocalDensity>(helperFields::LocalDensity::getName(speciesGroup), true);
+                        constexpr uint32_t numWorkers
+                            = pmacc::traits::GetNumWorkers<pmacc::math::CT::volume<SuperCellSize>::type::value>::value;
+                        PMACC_KERNEL(helperFields::KernelAverageDensity<numWorkers>{})
+                        (
+                            // one block per averaged density value
+                            nlocal->getGridBuffer().getGridLayout().getDataSpaceWithoutGuarding(),
+                            numWorkers)(
+                            // start in border (jump over GUARD area)
+                            fieldTmp->getDeviceDataBox().shift(SuperCellSize::toRT() * GuardSize::toRT()),
+                            // start in border (has no GUARD area)
+                            nlocal->getGridBuffer().getDeviceBuffer().getDataBox());
+
+                        // release fields
+                        dc.releaseData(FieldTmp::getUniqueId(0));
+                        dc.releaseData(helperFields::LocalDensity::getName(speciesGroup));
+                    }
+                };
+
+            } // namespace helperFields
+        } // namespace flylite
+    } // namespace particles
 } // namespace picongpu
diff --git a/include/picongpu/particles/flylite/helperFields/LocalEnergyHistogram.hpp b/include/picongpu/particles/flylite/helperFields/LocalEnergyHistogram.hpp
index bfe601bda5..9a7303e812 100644
--- a/include/picongpu/particles/flylite/helperFields/LocalEnergyHistogram.hpp
+++ b/include/picongpu/particles/flylite/helperFields/LocalEnergyHistogram.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2017-2020 Axel Huebl
+/* Copyright 2017-2021 Axel Huebl
  *
  * This file is part of PIConGPU.
  *
@@ -32,91 +32,68 @@
 
 namespace picongpu
 {
-namespace particles
-{
-namespace flylite
-{
-namespace helperFields
-{
-    using namespace pmacc;
-
-    class LocalEnergyHistogram :
-        public ISimulationData
+    namespace particles
     {
-    private:
-        using EnergyHistogram =
-            memory::Array<
-                float_X,
-                picongpu::flylite::energies
-            >;
-        GridBuffer<
-            EnergyHistogram,
-            simDim
-        > * m_energyHistogram;
-        std::string m_speciesGroup;
-
-    public:
-        /** Allocate and Initialize local Energy Histogram
-         *
-         * @param speciesGroup unique naming for the species inside this histogram,
-         *                     e.g. a collection of electron species or photon species
-         * @param histSizeLocal spatial size of the local energy histogram
-         */
-        LocalEnergyHistogram(
-            std::string const & speciesGroup,
-            DataSpace< simDim > const & histSizeLocal
-        ) :
-            m_energyHistogram( nullptr ),
-            m_speciesGroup( speciesGroup )
+        namespace flylite
         {
-            m_energyHistogram =
-                new GridBuffer<
-                    EnergyHistogram,
-                    simDim
-                >( histSizeLocal );
-        }
+            namespace helperFields
+            {
+                using namespace pmacc;
 
-        ~LocalEnergyHistogram()
-        {
-            __delete( m_energyHistogram );
-        }
+                class LocalEnergyHistogram : public ISimulationData
+                {
+                private:
+                    using EnergyHistogram = memory::Array<float_X, picongpu::flylite::energies>;
+                    GridBuffer<EnergyHistogram, simDim>* m_energyHistogram;
+                    std::string m_speciesGroup;
 
-        static std::string
-        getName( std::string const & speciesGroup )
-        {
-            return speciesGroup + "_LocalEnergyHistogram";
-        }
+                public:
+                    /** Allocate and Initialize local Energy Histogram
+                     *
+                     * @param speciesGroup unique naming for the species inside this histogram,
+                     *                     e.g. a collection of electron species or photon species
+                     * @param histSizeLocal spatial size of the local energy histogram
+                     */
+                    LocalEnergyHistogram(std::string const& speciesGroup, DataSpace<simDim> const& histSizeLocal)
+                        : m_energyHistogram(nullptr)
+                        , m_speciesGroup(speciesGroup)
+                    {
+                        m_energyHistogram = new GridBuffer<EnergyHistogram, simDim>(histSizeLocal);
+                    }
 
-        std::string
-        getName( )
-        {
-            return getName( m_speciesGroup );
-        }
+                    ~LocalEnergyHistogram()
+                    {
+                        __delete(m_energyHistogram);
+                    }
 
-        GridBuffer<
-            EnergyHistogram,
-            simDim
-        > &
-        getGridBuffer( )
-        {
-            return *m_energyHistogram;
-        }
+                    static std::string getName(std::string const& speciesGroup)
+                    {
+                        return speciesGroup + "_LocalEnergyHistogram";
+                    }
 
-        /* implement ISimulationData members */
-        void
-        synchronize() override
-        {
-            m_energyHistogram->deviceToHost( );
-        }
+                    std::string getName()
+                    {
+                        return getName(m_speciesGroup);
+                    }
 
-        SimulationDataId
-        getUniqueId() override
-        {
-            return getName();
-        }
-    };
+                    GridBuffer<EnergyHistogram, simDim>& getGridBuffer()
+                    {
+                        return *m_energyHistogram;
+                    }
+
+                    /* implement ISimulationData members */
+                    void synchronize() override
+                    {
+                        m_energyHistogram->deviceToHost();
+                    }
+
+                    SimulationDataId getUniqueId() override
+                    {
+                        return getName();
+                    }
+                };
 
-} // namespace helperFields
-} // namespace flylite
-} // namespace particles
+            } // namespace helperFields
+        } // namespace flylite
+    } // namespace particles
 } // namespace picongpu
diff --git a/include/picongpu/particles/flylite/helperFields/LocalEnergyHistogram.kernel b/include/picongpu/particles/flylite/helperFields/LocalEnergyHistogram.kernel
index c2b237c961..bfe8754d6d 100644
--- a/include/picongpu/particles/flylite/helperFields/LocalEnergyHistogram.kernel
+++ b/include/picongpu/particles/flylite/helperFields/LocalEnergyHistogram.kernel
@@ -1,4 +1,4 @@
-/* Copyright 2017-2020 Axel Huebl, Rene Widera
+/* Copyright 2017-2021 Axel Huebl, Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -35,253 +35,178 @@
 
 namespace picongpu
 {
-namespace particles
-{
-namespace flylite
-{
-namespace helperFields
-{
-    /** Generate and add a local energy histogram
-     *
-     * Generate a (per-supercell) energy histogram and add it to global memory.
-     *
-     * @tparam T_numWorkers number of workers for lockstep execution per block,
-     *                      usually equal to the number of particles per frame
-     *                      (which is equal to the supercell size)
-     */
-    template< uint32_t T_numWorkers >
-    struct KernelAddLocalEnergyHistogram
+    namespace particles
     {
-        /** Functor
-         *
-         * The functor is executed frame-list-wise, meaning locally per
-         * supercell. All particles of a supercell generate a shared memory
-         * histogram and write that back into global memory. Particles outside
-         * of the range of the histogram are ignored and not counted.
-         *
-         * @todo In case the local averging in flylite shall be larger then a
-         * supercell (in multiples of integers), the results need to be merged.
-         *
-         * @tparam T_ParBox pmacc::ParticlesBox, particle box type
-         * @tparam T_LocalEnergyHistogramBox pmacc::DataBox, local energy histograms,
-         *                                   e.g. for each supercell
-         * @tparam T_Acc alpaka accelerator type
-         *
-         * @param acc alpaka accelerator
-         * @param pb particles of a species
-         * @param energyHistogramBox box with global memory for each supercell's histogram
-         * @param minEnergy minimum energy to account for (eV)
-         * @param maxEnergy maximum energy to account for (eV)
-         */
-        template<
-            typename T_ParBox,
-            typename T_LocalEnergyHistogramBox,
-            typename T_Mapping,
-            typename T_Acc
-        >
-        DINLINE void operator()(
-            T_Acc const & acc,
-            T_ParBox & pb,
-            T_LocalEnergyHistogramBox & energyHistogramBox,
-            float_X const minEnergy,
-            float_X const maxEnergy,
-            T_Mapping const mapper
-        ) const
+        namespace flylite
         {
-            using picongpu::flylite::spatialAverageBox;
-            constexpr uint16_t numBins = picongpu::flylite::energies;
-            constexpr uint32_t numWorkers = T_numWorkers;
-
-            using namespace pmacc::mappings::threads;
-            using SuperCellSize = typename MappingDesc::SuperCellSize;
-            using FramePtr = typename T_ParBox::FramePtr;
-            constexpr uint32_t maxParticlesPerFrame = pmacc::math::CT::volume< SuperCellSize >::type::value;
-
-            PMACC_SMEM(
-                acc,
-                frame,
-                FramePtr
-            );
-            PMACC_SMEM(
-                acc,
-                particlesInSuperCell,
-                lcellId_t
-            );
-
-            // our workers per block are started 1D
-            uint32_t const workerIdx = threadIdx.x;
-
-            // supercell index of current (frame-wise) supercell including GUARD
-            DataSpace< simDim > const superCellIdx(
-                mapper.getSuperCellIndex( DataSpace< simDim >( blockIdx ) )
-            );
-            /* index inside local energy histogram in averaged space (has no GUARD)
-             * integer division: we average over multiples of supercells;
-             *                   this index selects the according local energy
-             *                   histogram in global RAM
-             */
-            DataSpace< simDim > const localEnergyBlock =
-                ( superCellIdx - GuardSize::toRT() ) *
-                SuperCellSize::toRT() / spatialAverageBox::toRT();
-
-            /* shift the energyHistogramBox to the local spatial average box and
-             * get a reference on the histogram
-             */
-            auto & localEnergyHistogram = *energyHistogramBox.shift( localEnergyBlock );
-
-            // shared memory for local energy histogram
-            PMACC_SMEM(
-                acc,
-                shLocalEnergyHistogram,
-                memory::Array<
-                    float_X,
-                    numBins
-                >
-            );
-
-            using MasterOnly = IdxConfig<
-                1,
-                numWorkers
-            >;
-
-            // get frame lists of this supercell
-            ForEachIdx< MasterOnly >{ workerIdx }(
-                [&](
-                    uint32_t const,
-                    uint32_t const
-                )
-                {
-                    frame = pb.getLastFrame( superCellIdx );
-                    particlesInSuperCell = pb.getSuperCell( superCellIdx ).getSizeLastFrame( );
-                }
-            );
-
-            // empty the histogram to contain only zeroes
-            ForEachIdx<
-                IdxConfig<
-                    numWorkers,
-                    numWorkers
-                >
-            >{ workerIdx }(
-                [&](
-                    uint32_t const linearIdx,
-                    uint32_t const
-                )
-                {
-                    /* set all bins to 0 */
-                    for( int i = linearIdx; i < numBins; i += numWorkers )
-                        shLocalEnergyHistogram[ i ] = float_X( 0. );
-                }
-            );
-
-            __syncthreads();
-
-            // return if the supercell has no particles
-            if( !frame.isValid( ) )
-                return;
-
-            // iterate the frame list
-            while( frame.isValid() )
+            namespace helperFields
             {
-                // move over all particles in a frame
-                ForEachIdx<
-                    IdxConfig<
-                        maxParticlesPerFrame,
-                        numWorkers
-                    >
-                >{ workerIdx }(
-                    [&](
-                        uint32_t const linearIdx,
-                        uint32_t const
-                    )
+                /** Generate and add a local energy histogram
+                 *
+                 * Generate a (per-supercell) energy histogram and add it to global memory.
+                 *
+                 * @tparam T_numWorkers number of workers for lockstep execution per block,
+                 *                      usually equal to the number of particles per frame
+                 *                      (which is equal to the supercell size)
+                 */
+                template<uint32_t T_numWorkers>
+                struct KernelAddLocalEnergyHistogram
+                {
+                    /** Functor
+                     *
+                     * The functor is executed frame-list-wise, meaning locally per
+                     * supercell. All particles of a supercell generate a shared memory
+                     * histogram and write that back into global memory. Particles outside
+                     * of the range of the histogram are ignored and not counted.
+                     *
+                     * @todo In case the local averaging in flylite shall be larger than a
+                     * supercell (in multiples of integers), the results need to be merged.
+                     *
+                     * @tparam T_ParBox pmacc::ParticlesBox, particle box type
+                     * @tparam T_LocalEnergyHistogramBox pmacc::DataBox, local energy histograms,
+                     *                                   e.g. for each supercell
+                     * @tparam T_Acc alpaka accelerator type
+                     *
+                     * @param acc alpaka accelerator
+                     * @param pb particles of a species
+                     * @param energyHistogramBox box with global memory for each supercell's histogram
+                     * @param minEnergy minimum energy to account for (eV)
+                     * @param maxEnergy maximum energy to account for (eV)
+                     */
+                    template<typename T_ParBox, typename T_LocalEnergyHistogramBox, typename T_Mapping, typename T_Acc>
+                    DINLINE void operator()(
+                        T_Acc const& acc,
+                        T_ParBox& pb,
+                        T_LocalEnergyHistogramBox& energyHistogramBox,
+                        float_X const minEnergy,
+                        float_X const maxEnergy,
+                        T_Mapping const mapper) const
                     {
-                        if( linearIdx < particlesInSuperCell )
+                        using picongpu::flylite::spatialAverageBox;
+                        constexpr uint16_t numBins = picongpu::flylite::energies;
+                        constexpr uint32_t numWorkers = T_numWorkers;
+
+                        using namespace pmacc::mappings::threads;
+                        using SuperCellSize = typename MappingDesc::SuperCellSize;
+                        using FramePtr = typename T_ParBox::FramePtr;
+                        constexpr uint32_t maxParticlesPerFrame = pmacc::math::CT::volume<SuperCellSize>::type::value;
+
+                        PMACC_SMEM(acc, frame, FramePtr);
+                        PMACC_SMEM(acc, particlesInSuperCell, lcellId_t);
+
+                        // our workers per block are started 1D
+                        uint32_t const workerIdx = cupla::threadIdx(acc).x;
+
+                        // supercell index of current (frame-wise) supercell including GUARD
+                        DataSpace<simDim> const superCellIdx(
+                            mapper.getSuperCellIndex(DataSpace<simDim>(cupla::blockIdx(acc))));
+                        /* index inside local energy histogram in averaged space (has no GUARD)
+                         * integer division: we average over multiples of supercells;
+                         *                   this index selects the corresponding local energy
+                         *                   histogram in global RAM
+                         */
+                        DataSpace<simDim> const localEnergyBlock
+                            = (superCellIdx - GuardSize::toRT()) * SuperCellSize::toRT() / spatialAverageBox::toRT();
+
+                        /* shift the energyHistogramBox to the local spatial average box and
+                         * get a reference on the histogram
+                         */
+                        auto& localEnergyHistogram = *energyHistogramBox.shift(localEnergyBlock);
+
+                        // shared memory for local energy histogram
+                        PMACC_SMEM(acc, shLocalEnergyHistogram, memory::Array<float_X, numBins>);
+
+                        using MasterOnly = IdxConfig<1, numWorkers>;
+
+                        // get frame lists of this supercell
+                        ForEachIdx<MasterOnly>{workerIdx}([&](uint32_t const, uint32_t const) {
+                            frame = pb.getLastFrame(superCellIdx);
+                            particlesInSuperCell = pb.getSuperCell(superCellIdx).getSizeLastFrame();
+                        });
+
+                        // empty the histogram to contain only zeroes
+                        ForEachIdx<IdxConfig<numWorkers, numWorkers>>{workerIdx}(
+                            [&](uint32_t const linearIdx, uint32_t const) {
+                                /* set all bins to 0 */
+                                for(int i = linearIdx; i < numBins; i += numWorkers)
+                                    shLocalEnergyHistogram[i] = float_X(0.);
+                            });
+
+                        cupla::__syncthreads(acc);
+
+                        // return if the supercell has no particles
+                        if(!frame.isValid())
+                            return;
+
+                        // iterate the frame list
+                        while(frame.isValid())
                         {
-                            auto const particle = frame[ linearIdx ];
-                            /* kinetic Energy for Particles: E^2 = p^2*c^2 + m^2*c^4
-                             *                                   = c^2 * [p^2 + m^2*c^2]
-                             */
-                            float3_X const mom = particle[ momentum_ ];
-
-                            float_X const weighting = particle[ weighting_ ];
-                            float_X const mass = attribute::getMass(
-                                weighting,
-                                particle
-                            );
-
-                            // calculate kinetic energy of the macro particle
-                            float_X particleEnergy = KinEnergy< >( )(
-                                mom,
-                                mass
-                            );
-
-                            particleEnergy /= weighting;
-
-                            // calculate bin number
-                            int binNumber = math::floor(
-                                ( particleEnergy - minEnergy ) /
-                                ( maxEnergy - minEnergy ) * static_cast< float_X >( numBins )
-                            );
-
-                            /* all entries larger than maxEnergy or smaller
-                             * than minEnergy are ignored
-                             */
-                            if( binNumber >= 0 and binNumber < numBins )
-                            {
-                                // artifical norm for reduce
-                                float_X const normedWeighting = weighting /
-                                    float_X( particles::TYPICAL_NUM_PARTICLES_PER_MACROPARTICLE );
-
-                                atomicAdd(
-                                    &( shLocalEnergyHistogram[ binNumber ] ),
-                                    normedWeighting,
-                                    ::alpaka::hierarchy::Threads{}
-                                );
-                            }
+                            // move over all particles in a frame
+                            ForEachIdx<IdxConfig<maxParticlesPerFrame, numWorkers>>{
+                                workerIdx}([&](uint32_t const linearIdx, uint32_t const) {
+                                if(linearIdx < particlesInSuperCell)
+                                {
+                                    auto const particle = frame[linearIdx];
+                                    /* kinetic Energy for Particles: E^2 = p^2*c^2 + m^2*c^4
+                                     *                                   = c^2 * [p^2 + m^2*c^2]
+                                     */
+                                    float3_X const mom = particle[momentum_];
+
+                                    float_X const weighting = particle[weighting_];
+                                    float_X const mass = attribute::getMass(weighting, particle);
+
+                                    // calculate kinetic energy of the macro particle
+                                    float_X particleEnergy = KinEnergy<>()(mom, mass);
+
+                                    particleEnergy /= weighting;
+
+                                    // calculate bin number
+                                    int binNumber = math::floor(
+                                        (particleEnergy - minEnergy) / (maxEnergy - minEnergy)
+                                        * static_cast<float_X>(numBins));
+
+                                    /* all entries larger than maxEnergy or smaller
+                                     * than minEnergy are ignored
+                                     */
+                                    if(binNumber >= 0 and binNumber < numBins)
+                                    {
+                                        // artificial norm for reduce
+                                        float_X const normedWeighting
+                                            = weighting / float_X(particles::TYPICAL_NUM_PARTICLES_PER_MACROPARTICLE);
+
+                                        cupla::atomicAdd(
+                                            acc,
+                                            &(shLocalEnergyHistogram[binNumber]),
+                                            normedWeighting,
+                                            ::alpaka::hierarchy::Threads{});
+                                    }
+                                }
+                            });
+
+                            cupla::__syncthreads(acc);
+
+                            // go to next frame
+                            ForEachIdx<MasterOnly>{workerIdx}([&](uint32_t const, uint32_t const) {
+                                frame = pb.getPreviousFrame(frame);
+                                particlesInSuperCell = maxParticlesPerFrame;
+                            });
+                            cupla::__syncthreads(acc);
                         }
-                    }
-                );
-
-                __syncthreads();
 
-                // go to next frame
-                ForEachIdx< MasterOnly >{ workerIdx }(
-                    [&](
-                        uint32_t const,
-                        uint32_t const
-                    )
-                    {
-                        frame = pb.getPreviousFrame( frame );
-                        particlesInSuperCell = maxParticlesPerFrame;
+                        // write histogram back to global memory (add)
+                        ForEachIdx<IdxConfig<numWorkers, numWorkers>>{workerIdx}(
+                            [&](uint32_t const linearIdx, uint32_t const) {
+                                for(int i = linearIdx; i < numBins; i += numWorkers)
+                                    cupla::atomicAdd(
+                                        acc,
+                                        &(localEnergyHistogram[i]),
+                                        shLocalEnergyHistogram[i],
+                                        ::alpaka::hierarchy::Blocks{});
+                            });
                     }
-                );
-                __syncthreads();
-            }
-
-            // write histogram back to global memory (add)
-            ForEachIdx<
-                IdxConfig<
-                    numWorkers,
-                    numWorkers
-                >
-            >{ workerIdx }(
-                [&](
-                    uint32_t const linearIdx,
-                    uint32_t const
-                )
-                {
-                    for( int i = linearIdx; i < numBins; i += numWorkers )
-                        atomicAdd(
-                            &( localEnergyHistogram[ i ] ),
-                            shLocalEnergyHistogram[ i ],
-                            ::alpaka::hierarchy::Blocks{}
-                        );
-                }
-            );
-        }
-    };
+                };
 
-} // namespace helperFields
-} // namespace flylite
-} // namespace particles
+            } // namespace helperFields
+        } // namespace flylite
+    } // namespace particles
 } // namespace picongpu
diff --git a/include/picongpu/particles/flylite/helperFields/LocalEnergyHistogramFunctors.hpp b/include/picongpu/particles/flylite/helperFields/LocalEnergyHistogramFunctors.hpp
index 08fe303aee..d5c55d6dc6 100644
--- a/include/picongpu/particles/flylite/helperFields/LocalEnergyHistogramFunctors.hpp
+++ b/include/picongpu/particles/flylite/helperFields/LocalEnergyHistogramFunctors.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2017-2020 Axel Huebl
+/* Copyright 2017-2021 Axel Huebl
  *
  * This file is part of PIConGPU.
  *
@@ -34,135 +34,122 @@
 
 namespace picongpu
 {
-namespace particles
-{
-namespace flylite
-{
-namespace helperFields
-{
-namespace detail
-{
-    /** Takes a single species and adds it to a LocalEnergyHistogram
-     *
-     * @tparam T_SpeciesType a picongpu::Particles class with a particle species
-     */
-    template<
-        typename T_SpeciesType
-    >
-    struct AddSingleEnergyHistogram
-    {
-        using SpeciesType = T_SpeciesType;
-        using FrameType = typename SpeciesType::FrameType;
-
-        /** Functor
-         *
-         * @param currentStep the current time step
-         * @param eneHistLocal the GridBuffer for local energy histograms
-         * @param minEnergy minimum energy to account for (eV)
-         * @param maxEnergy maximum energy to account for (eV)
-         */
-        void operator()(
-            uint32_t currentStep,
-            std::shared_ptr< LocalEnergyHistogram > & eneHistLocal,
-            float_X const minEnergy,
-            float_X const maxEnergy
-        )
-        {
-            DataConnector &dc = Environment<>::get().DataConnector();
-
-            // load particle without copy particle data to host
-            auto speciesTmp = dc.get< SpeciesType >( FrameType::getName(), true );
-
-            // mapper to access species in CORE & BORDER only
-            MappingDesc cellDescription(
-                speciesTmp->getParticlesBuffer().getSuperCellsLayout().getDataSpace() * SuperCellSize::toRT(),
-                GuardSize::toRT()
-            );
-            AreaMapping<
-                CORE + BORDER,
-                MappingDesc
-            > mapper( cellDescription );
-
-            // add energy histogram on top of existing data
-            constexpr uint32_t numWorkers = pmacc::traits::GetNumWorkers<
-                pmacc::math::CT::volume< SuperCellSize >::type::value
-            >::value;
-            PMACC_KERNEL( helperFields::KernelAddLocalEnergyHistogram< numWorkers >{ } )
-            (
-                // one block per local energy histogram
-                mapper.getGridDim(),
-                numWorkers
-            )
-            (
-                // start in border (jump over GUARD area)
-                speciesTmp->getDeviceParticlesBox(),
-                // start in border (has no GUARD area)
-                eneHistLocal->getGridBuffer().getDeviceBuffer( ).getDataBox( ),
-                minEnergy,
-                maxEnergy,
-                mapper
-            );
-
-            dc.releaseData( FrameType::getName() );
-        }
-    };
-}
-    /** Add a group of species to a local energy histogram
-     *
-     * Takes a list of species and fills the LocalEnergyHistogram with it.
-     * Ideally executed for a list of electron species or an photon species.
-     *
-     * @tparam T_SpeciesList sequence of picongpu::Particles to create a
-     *                       local energy histogram from
-     */
-    template<
-        typename T_SpeciesList
-    >
-    struct FillLocalEnergyHistogram
+    namespace particles
     {
-        using SpeciesList = T_SpeciesList;
-
-        /** Functor
-         *
-         * @param currentStep the current time step
-         * @param speciesGroup naming for the group of species in T_SpeciesList
-         * @param minEnergy minimum energy to account for (eV)
-         * @param maxEnergy maximum energy to account for (eV)
-         */
-        void operator()(
-            uint32_t currentStep,
-            std::string const & speciesGroup,
-            float_X const minEnergy,
-            float_X const maxEnergy
-        )
+        namespace flylite
         {
-            DataConnector &dc = Environment<>::get().DataConnector();
-
-            /* load local energy histogram field without copy data to host and
-             * zero it
-             */
-            auto eneHistLocal = dc.get< LocalEnergyHistogram >(
-                helperFields::LocalEnergyHistogram::getName( speciesGroup ),
-                true
-            );
-
-            // reset local energy histograms
-            eneHistLocal->getGridBuffer().getDeviceBuffer().setValue( float_X( 0.0 ) );
-
-            // add local energy histogram of each species in list
-            meta::ForEach< SpeciesList, detail::AddSingleEnergyHistogram< bmpl::_1 > > addSingleEnergyHistogram;
-            addSingleEnergyHistogram( currentStep, eneHistLocal, minEnergy, maxEnergy );
-
-            /* note: for average != supercell the BORDER region would need to be
-             *       build up via communication accordingly
-             */
-
-            // release fields
-            dc.releaseData( helperFields::LocalEnergyHistogram::getName( speciesGroup ) );
-        }
-    };
-
-} // namespace helperFields
-} // namespace flylite
-} // namespace particles
+            namespace helperFields
+            {
+                namespace detail
+                {
+                    /** Adds the particles of a single species to a LocalEnergyHistogram
+                     *
+                     * @tparam T_SpeciesType a picongpu::Particles class with a particle species
+                     */
+                    template<typename T_SpeciesType>
+                    struct AddSingleEnergyHistogram
+                    {
+                        using SpeciesType = T_SpeciesType;
+                        using FrameType = typename SpeciesType::FrameType;
+
+                        /** Launch KernelAddLocalEnergyHistogram for this species
+                         *
+                         * @param currentStep the current time step (not read by this functor)
+                         * @param eneHistLocal local energy histogram whose device buffer is passed to the kernel
+                         * @param minEnergy minimum energy to account for (eV)
+                         * @param maxEnergy maximum energy to account for (eV)
+                         */
+                        void operator()(
+                            uint32_t currentStep,
+                            std::shared_ptr<LocalEnergyHistogram>& eneHistLocal,
+                            float_X const minEnergy,
+                            float_X const maxEnergy)
+                        {
+                            DataConnector& dc = Environment<>::get().DataConnector();
+
+                            // load the species without copying particle data to the host
+                            auto speciesTmp = dc.get<SpeciesType>(FrameType::getName(), true);
+
+                            // mapper to access species in CORE & BORDER only
+                            MappingDesc cellDescription(
+                                speciesTmp->getParticlesBuffer().getSuperCellsLayout().getDataSpace()
+                                    * SuperCellSize::toRT(),
+                                GuardSize::toRT());
+                            AreaMapping<CORE + BORDER, MappingDesc> mapper(cellDescription);
+
+                            // add energy histogram on top of existing data
+                            constexpr uint32_t numWorkers = pmacc::traits::GetNumWorkers<
+                                pmacc::math::CT::volume<SuperCellSize>::type::value>::value;
+                            PMACC_KERNEL(helperFields::KernelAddLocalEnergyHistogram<numWorkers>{})
+                            (
+                                // one block per local energy histogram
+                                mapper.getGridDim(),
+                                numWorkers)(
+                                // start in border (jump over GUARD area)
+                                speciesTmp->getDeviceParticlesBox(),
+                                // start in border (has no GUARD area)
+                                eneHistLocal->getGridBuffer().getDeviceBuffer().getDataBox(),
+                                minEnergy,
+                                maxEnergy,
+                                mapper);
+
+                            dc.releaseData(FrameType::getName());
+                        }
+                    };
+                } // namespace detail
+                /** Add a group of species to a local energy histogram
+                 *
+                 * Takes a list of species and fills the LocalEnergyHistogram with it.
+                 * Ideally executed for a list of electron species or a photon species.
+                 *
+                 * @tparam T_SpeciesList sequence of picongpu::Particles to create a
+                 *                       local energy histogram from
+                 */
+                template<typename T_SpeciesList>
+                struct FillLocalEnergyHistogram
+                {
+                    using SpeciesList = T_SpeciesList;
+
+                    /** Reset the local energy histogram and add each species of T_SpeciesList to it
+                     *
+                     * @param currentStep the current time step
+                     * @param speciesGroup naming for the group of species in T_SpeciesList
+                     * @param minEnergy minimum energy to account for (eV)
+                     * @param maxEnergy maximum energy to account for (eV)
+                     */
+                    void operator()(
+                        uint32_t currentStep,
+                        std::string const& speciesGroup,
+                        float_X const minEnergy,
+                        float_X const maxEnergy)
+                    {
+                        DataConnector& dc = Environment<>::get().DataConnector();
+
+                        /* load local energy histogram field without copying data to host and
+                         * zero it
+                         */
+                        auto eneHistLocal = dc.get<LocalEnergyHistogram>(
+                            helperFields::LocalEnergyHistogram::getName(speciesGroup),
+                            true);
+
+                        // reset local energy histograms
+                        eneHistLocal->getGridBuffer().getDeviceBuffer().setValue(float_X(0.0));
+
+                        // add local energy histogram of each species in list
+                        meta::ForEach<SpeciesList, detail::AddSingleEnergyHistogram<bmpl::_1>>
+                            addSingleEnergyHistogram;
+                        addSingleEnergyHistogram(currentStep, eneHistLocal, minEnergy, maxEnergy);
+
+                        /* note: for average != supercell the BORDER region would need to be
+                         *       built up via communication accordingly
+                         */
+
+                        // release fields
+                        dc.releaseData(helperFields::LocalEnergyHistogram::getName(speciesGroup));
+                    }
+                };
+
+            } // namespace helperFields
+        } // namespace flylite
+    } // namespace particles
 } // namespace picongpu
diff --git a/include/picongpu/particles/flylite/helperFields/LocalRateMatrix.hpp b/include/picongpu/particles/flylite/helperFields/LocalRateMatrix.hpp
index d2baff549b..e1bb20098c 100644
--- a/include/picongpu/particles/flylite/helperFields/LocalRateMatrix.hpp
+++ b/include/picongpu/particles/flylite/helperFields/LocalRateMatrix.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2017-2020 Axel Huebl
+/* Copyright 2017-2021 Axel Huebl
  *
  * This file is part of PIConGPU.
  *
@@ -32,86 +32,71 @@
 
 namespace picongpu
 {
-namespace particles
-{
-namespace flylite
-{
-namespace helperFields
-{
-    using namespace pmacc;
-
-    class LocalRateMatrix :
-        public ISimulationData
+    namespace particles
     {
-    private:
-        /** A[iz, numpop, numpop] */
-        using RateMatrix = memory::Array<
-                memory::Array<
-                    memory::Array<
-                        float_X,
-                        picongpu::flylite::populations
-                    >,
-                    picongpu::flylite::populations
-                >,
-                picongpu::flylite::ionizationStates
-        >;
-         GridBuffer< RateMatrix, simDim >* m_rateMatrix;
-        std::string m_speciesName;
-
-    public:
-        /** Allocate and initialize local rate matrix for ion state transitions
-         *
-         * @param histSizeLocal spatial size of the local energy histogram
-         */
-        LocalRateMatrix(
-            std::string const & ionSpeciesName,
-            DataSpace< simDim > const & histSizeLocal
-        ) :
-            m_rateMatrix( nullptr ),
-            m_speciesName( ionSpeciesName )
+        namespace flylite
         {
-            m_rateMatrix =
-                new GridBuffer< RateMatrix, simDim >( histSizeLocal );
-        }
+            namespace helperFields
+            {
+                using namespace pmacc;
 
-        ~LocalRateMatrix()
-        {
-            __delete( m_rateMatrix );
-        }
+                class LocalRateMatrix : public ISimulationData
+                {
+                private:
+                    /** A[iz, numpop, numpop] */
+                    using RateMatrix = memory::Array<
+                        memory::Array<
+                            memory::Array<float_X, picongpu::flylite::populations>,
+                            picongpu::flylite::populations>,
+                        picongpu::flylite::ionizationStates>;
+                    GridBuffer<RateMatrix, simDim>* m_rateMatrix;
+                    std::string m_speciesName;
 
-        static std::string
-        getName( std::string const & speciesGroup )
-        {
-            return speciesGroup + "_RateMatrix";
-        }
+                public:
+                    /** Allocate and initialize local rate matrix for ion state transitions
+                     * @param ionSpeciesName name of the ion species, used by getName() to build the data id
+                     * @param histSizeLocal spatial size of the local energy histogram (size of the allocated buffer)
+                     */
+                    LocalRateMatrix(std::string const& ionSpeciesName, DataSpace<simDim> const& histSizeLocal)
+                        : m_rateMatrix(nullptr)
+                        , m_speciesName(ionSpeciesName)
+                    {
+                        m_rateMatrix = new GridBuffer<RateMatrix, simDim>(histSizeLocal);
+                    }
 
-        std::string
-        getName( )
-        {
-            return getName( m_speciesName );
-        }
+                    ~LocalRateMatrix()
+                    {
+                        __delete(m_rateMatrix);
+                    }
 
-        GridBuffer< RateMatrix, simDim >&
-        getGridBuffer( )
-        {
-            return *m_rateMatrix;
-        }
+                    static std::string getName(std::string const& speciesGroup)
+                    {
+                        return speciesGroup + "_RateMatrix";
+                    }
 
-        /* implement ISimulationData members */
-        void
-        synchronize() override
-        {
-            m_rateMatrix->deviceToHost( );
-        }
+                    std::string getName()
+                    {
+                        return getName(m_speciesName);
+                    }
 
-        SimulationDataId
-        getUniqueId() override
-        {
-            return getName();
-        }
-    };
+                    GridBuffer<RateMatrix, simDim>& getGridBuffer()
+                    {
+                        return *m_rateMatrix;
+                    }
+
+                    /* implement ISimulationData members */
+                    void synchronize() override
+                    {
+                        m_rateMatrix->deviceToHost();
+                    }
+
+                    SimulationDataId getUniqueId() override
+                    {
+                        return getName();
+                    }
+                };
 
-} // namespace helperFields
-} // namespace flylite
-} // namespace particles
+            } // namespace helperFields
+        } // namespace flylite
+    } // namespace particles
 } // namespace picongpu
diff --git a/include/picongpu/particles/flylite/types/Superconfig.hpp b/include/picongpu/particles/flylite/types/Superconfig.hpp
index 5c99c1f906..e6925f3053 100644
--- a/include/picongpu/particles/flylite/types/Superconfig.hpp
+++ b/include/picongpu/particles/flylite/types/Superconfig.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2017-2020 Axel Huebl
+/* Copyright 2017-2021 Axel Huebl
  *
  * This file is part of PIConGPU.
  *
@@ -25,34 +25,28 @@
 
 namespace picongpu
 {
-namespace flylite
-{
-namespace types
-{
-    /** Ion Superconfiguration
-     *
-     * This is the attribute type for an ion's screened hydrogenic
-     * superconfiguration.
-     *
-     * See for details on screened hydrogenic levels:
-     *   H.-K. Chung, S.H. Hansen, H.A. Scott.
-     *   *Generalized Collisional Radiative Model Using*
-     *   *Screened Hydrogenic Levels*,
-     *   in Modern Methods in Collisional-Radiative Modeling of Plasmas,
-     *   edited by Y. Ralchenko (Springer, 2016) pp.51-79
-     *
-     * @tparam T_Type the float type to use, e.g. float_64
-     * @tparam T_populations the number of populations to store for each ion,
-     *                       range: [0, 255]
-     */
-    template<
-        typename T_Type,
-        uint8_t T_populations
-    >
-    using Superconfig = pmacc::math::Vector<
-        T_Type,
-        T_populations
-    >;
-} // namespace types
-} // namespace flylite
+    namespace flylite
+    {
+        namespace types
+        {
+            /** Ion Superconfiguration
+             *
+             * This is the attribute type for an ion's screened hydrogenic
+             * superconfiguration.
+             *
+             * For details on screened hydrogenic levels, see:
+             *   H.-K. Chung, S.H. Hansen, H.A. Scott.
+             *   *Generalized Collisional Radiative Model Using*
+             *   *Screened Hydrogenic Levels*,
+             *   in Modern Methods in Collisional-Radiative Modeling of Plasmas,
+             *   edited by Y. Ralchenko (Springer, 2016) pp.51-79
+             *
+             * @tparam T_Type the float type to use, e.g. float_64
+             * @tparam T_populations the number of populations to store for each ion,
+             *                       range: [0, 255]
+             */
+            template<typename T_Type, uint8_t T_populations>
+            using Superconfig = pmacc::math::Vector<T_Type, T_populations>;
+        } // namespace types
+    } // namespace flylite
 } // namespace picongpu
diff --git a/include/picongpu/particles/functor/User.def b/include/picongpu/particles/functor/User.def
index 077f0112da..ba98aa1904 100644
--- a/include/picongpu/particles/functor/User.def
+++ b/include/picongpu/particles/functor/User.def
@@ -1,4 +1,4 @@
-/* Copyright 2015-2020 Rene Widera
+/* Copyright 2015-2021 Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -22,20 +22,19 @@
 
 namespace picongpu
 {
-namespace particles
-{
-namespace functor
-{
-
-    /** call simple free user defined functor
-     *
-     * @tparam T_Functor user defined functor
-     *                   **optional**: can implement **one** host side constructor
-     *                   `T_Functor()` or `T_Functor(uint32_t currentTimeStep)`
-     */
-    template< typename T_Functor >
-    struct User;
+    namespace particles
+    {
+        namespace functor
+        {
+            /** call simple free user defined functor
+             *
+             * @tparam T_Functor user defined functor
+             *                   **optional**: can implement **one** host side constructor
+             *                   `T_Functor()` or `T_Functor(uint32_t currentTimeStep)`
+             */
+            template<typename T_Functor>
+            struct User;
 
-} // namespace functor
-} // namespace particles
+        } // namespace functor
+    } // namespace particles
 } // namespace picongpu
diff --git a/include/picongpu/particles/functor/User.hpp b/include/picongpu/particles/functor/User.hpp
index eb9644eb86..3bc05b9902 100644
--- a/include/picongpu/particles/functor/User.hpp
+++ b/include/picongpu/particles/functor/User.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Rene Widera, Axel Huebl
+/* Copyright 2013-2021 Rene Widera, Axel Huebl
  *
  * This file is part of PIConGPU.
  *
@@ -27,58 +27,48 @@
 
 namespace picongpu
 {
-namespace particles
-{
-namespace functor
-{
-    template< typename T_Functor >
-    struct User : public T_Functor
+    namespace particles
     {
-
-        using Functor = T_Functor;
-
-        /** constructor
-         *
-         * This constructor is only compiled if the user functor has
-         * a host side constructor with one (uint32_t) argument.
-         *
-         * @tparam DeferFunctor is used to defer the functor type evaluation to enable/disable
-         *                      the constructor
-         * @param currentStep current simulation time step
-         * @param is used to enable/disable the constructor (do not pass any value to this parameter)
-         */
-        template< typename DeferFunctor = Functor >
-        HINLINE User(
-            uint32_t currentStep,
-            typename std::enable_if<
-                std::is_constructible<
-                    DeferFunctor,
-                    uint32_t
-                >::value
-            >::type* = 0
-        ) : Functor( currentStep )
+        namespace functor
         {
-        }
+            template<typename T_Functor>
+            struct User : public T_Functor
+            {
+                using Functor = T_Functor;
 
-        /** constructor
-         *
-         * This constructor is only compiled if the user functor has a default constructor.
-         *
-         * @tparam DeferFunctor is used to defer the functor type evaluation to enable/disable
-         *                      the constructor
-         * @param current simulation time step
-         * @param is used to enable/disable the constructor (do not pass any value to this parameter)
-         */
-        template< typename DeferFunctor = Functor >
-        HINLINE User(
-            uint32_t,
-            typename std::enable_if<
-                std::is_constructible< DeferFunctor >::value
-            >::type* = 0
-        ) : Functor( )
-        {
-        }
-    };
-} // namespace functor
-} // namespace particles
+                /** constructor
+                 *
+                 * This constructor is only enabled (via std::enable_if) if the user functor
+                 * is constructible from a single uint32_t argument.
+                 *
+                 * @tparam DeferFunctor is used to defer the functor type evaluation to enable/disable
+                 *                      the constructor
+                 * @param currentStep current simulation time step, forwarded to the functor constructor
+                 * @note the unnamed second parameter only enables/disables this constructor (never pass a value)
+                 */
+                template<typename DeferFunctor = Functor>
+                HINLINE User(
+                    uint32_t currentStep,
+                    typename std::enable_if<std::is_constructible<DeferFunctor, uint32_t>::value>::type* = 0)
+                    : Functor(currentStep)
+                {
+                }
+
+                /** constructor
+                 *
+                 * This constructor is only enabled (via std::enable_if) if the user functor is default constructible.
+                 *
+                 * @tparam DeferFunctor is used to defer the functor type evaluation to enable/disable
+                 *                      the constructor
+                 * @note the unnamed first parameter is the current simulation time step; it is ignored
+                 * @note the unnamed second parameter only enables/disables this constructor (never pass a value)
+                 */
+                template<typename DeferFunctor = Functor>
+                HINLINE User(uint32_t, typename std::enable_if<std::is_constructible<DeferFunctor>::value>::type* = 0)
+                    : Functor()
+                {
+                }
+            };
+        } // namespace functor
+    } // namespace particles
 } // namespace picongpu
diff --git a/include/picongpu/particles/functor/functor.def b/include/picongpu/particles/functor/functor.def
index 3677d5efab..827631ecc9 100644
--- a/include/picongpu/particles/functor/functor.def
+++ b/include/picongpu/particles/functor/functor.def
@@ -1,4 +1,4 @@
-/* Copyright 2014-2020 Rene Widera, Axel Huebl
+/* Copyright 2014-2021 Rene Widera, Axel Huebl
  *
  * This file is part of PIConGPU.
  *
diff --git a/include/picongpu/particles/functor/functor.hpp b/include/picongpu/particles/functor/functor.hpp
index bb183e1cbe..ccdd2c2d7a 100644
--- a/include/picongpu/particles/functor/functor.hpp
+++ b/include/picongpu/particles/functor/functor.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2014-2020 Rene Widera, Axel Huebl
+/* Copyright 2014-2021 Rene Widera, Axel Huebl
  *
  * This file is part of PIConGPU.
  *
diff --git a/include/picongpu/particles/functor/misc/Rng.def b/include/picongpu/particles/functor/misc/Rng.def
index 707cf2b175..6e47b82e7f 100644
--- a/include/picongpu/particles/functor/misc/Rng.def
+++ b/include/picongpu/particles/functor/misc/Rng.def
@@ -1,4 +1,4 @@
-/* Copyright 2015-2020 Rene Widera
+/* Copyright 2015-2021 Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -27,23 +27,20 @@
 
 namespace picongpu
 {
-namespace particles
-{
-namespace functor
-{
-namespace misc
-{
-
-    /** provide a random number generator
-     *
-     * @tparam T_Distribution pmacc::random::distributions, random number distribution
-     */
-    template<
-        typename T_Distribution
-    >
-    struct Rng;
+    namespace particles
+    {
+        namespace functor
+        {
+            namespace misc
+            {
+                /** provide a random number generator
+                 *
+                 * @tparam T_Distribution pmacc::random::distributions, random number distribution
+                 */
+                template<typename T_Distribution>
+                struct Rng;
 
-} // namespace misc
-} // namespace functor
-} // namespace particles
+            } // namespace misc
+        } // namespace functor
+    } // namespace particles
 } // namespace picongpu
diff --git a/include/picongpu/particles/functor/misc/Rng.hpp b/include/picongpu/particles/functor/misc/Rng.hpp
index c141fb1ddc..b820036137 100644
--- a/include/picongpu/particles/functor/misc/Rng.hpp
+++ b/include/picongpu/particles/functor/misc/Rng.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2015-2020 Rene Widera, Alexander Grund
+/* Copyright 2015-2021 Rene Widera, Alexander Grund
  *
  * This file is part of PIConGPU.
  *
@@ -22,8 +22,6 @@
 #include "picongpu/simulation_defines.hpp"
 #include "picongpu/particles/functor/misc/RngWrapper.hpp"
 
-#include <pmacc/nvidia/rng/RNG.hpp>
-#include <pmacc/nvidia/rng/methods/Xor.hpp>
 #include <pmacc/mpi/SeedPerRank.hpp>
 #include <pmacc/traits/GetUniqueTypeId.hpp>
 #include <pmacc/random/methods/methods.hpp>
@@ -36,81 +34,61 @@
 
 namespace picongpu
 {
-namespace particles
-{
-namespace functor
-{
-namespace misc
-{
-    /** call simple free user defined functor and provide a random number generator
-     *
-     * @tparam T_Distribution random number distribution
-     */
-    template<
-        typename T_Distribution
-    >
-    struct Rng
+    namespace particles
     {
-        using Distribution = T_Distribution;
-        using RNGFactory = pmacc::random::RNGProvider<
-            simDim,
-            random::Generator
-        >;
-        using RngHandle = typename RNGFactory::Handle;
-        using RandomGen = RngWrapper<
-            cupla::Acc,
-            typename RngHandle::GetRandomType< Distribution >::type
-        >;
-
-        /** constructor
-         *
-         * @param currentStep current simulation time step
-         */
-        HINLINE Rng( uint32_t currentStep ) : rngHandle( RNGFactory::createHandle() )
+        namespace functor
         {
-        }
+            namespace misc
+            {
+                /** call simple free user defined functor and provide a random number generator
+                 *
+                 * @tparam T_Distribution random number distribution
+                 */
+                template<typename T_Distribution>
+                struct Rng
+                {
+                    using Distribution = T_Distribution;
+                    using RNGFactory = pmacc::random::RNGProvider<simDim, random::Generator>;
+                    using RngHandle = typename RNGFactory::Handle;
+                    using RandomGen = RngWrapper<cupla::Acc, typename RngHandle::GetRandomType<Distribution>::type>;
 
+                    /** constructor, creates the RNG handle via RNGFactory::createHandle()
+                     *
+                     * @param currentStep current simulation time step (not used by this constructor)
+                     */
+                    HINLINE Rng(uint32_t currentStep) : rngHandle(RNGFactory::createHandle())
+                    {
+                    }
 
-        /** create functor a random number generator
-         *
-         * @tparam T_WorkerCfg pmacc::mappings::threads::WorkerCfg, configuration of the worker
-         * @tparam T_Acc alpaka accelerator type
-         *
-         * @param alpaka accelerator
-         * @param localSupercellOffset offset (in superCells, without any guards) relative
-         *                        to the origin of the local domain
-         * @param workerCfg configuration of the worker
-         */
-        template<
-            typename T_WorkerCfg,
-            typename T_Acc
-        >
-        HDINLINE
-        RandomGen
-        operator()(
-            T_Acc const & acc,
-            DataSpace< simDim > const & localSupercellOffset,
-            T_WorkerCfg const & workerCfg
-        ) const
-        {
-            namespace nvrng = nvidia::rng;
 
-            RngHandle tmp( rngHandle );
-            tmp.init(
-                localSupercellOffset * SuperCellSize::toRT() +
-                DataSpaceOperations< simDim >::template map< SuperCellSize >( workerCfg.getWorkerIdx( ) )
-            );
-            return RandomGen(
-                acc,
-                tmp.applyDistribution< Distribution >()
-            );
-        }
+                    /** create a random number generator wrapped together with the accelerator
+                     *
+                     * @tparam T_WorkerCfg pmacc::mappings::threads::WorkerCfg, configuration of the worker
+                     * @tparam T_Acc alpaka accelerator type
+                     *
+                     * @param acc alpaka accelerator
+                     * @param localSupercellOffset offset (in superCells, without any guards) relative
+                     *                        to the origin of the local domain
+                     * @param workerCfg configuration of the worker
+                     */
+                    template<typename T_WorkerCfg, typename T_Acc>
+                    HDINLINE RandomGen operator()(
+                        T_Acc const& acc,
+                        DataSpace<simDim> const& localSupercellOffset,
+                        T_WorkerCfg const& workerCfg) const
+                    {
+                        RngHandle tmp(rngHandle);
+                        tmp.init(
+                            localSupercellOffset * SuperCellSize::toRT()
+                            + DataSpaceOperations<simDim>::template map<SuperCellSize>(workerCfg.getWorkerIdx()));
+                        return RandomGen(acc, tmp.applyDistribution<Distribution>());
+                    }
 
-    private:
-        RngHandle rngHandle;
-    };
+                private:
+                    RngHandle rngHandle;
+                };
 
-} // namepsace misc
-} // namespace functor
-} // namespace particles
+            } // namespace misc
+        } // namespace functor
+    } // namespace particles
 } // namespace picongpu
diff --git a/include/picongpu/particles/functor/misc/RngWrapper.hpp b/include/picongpu/particles/functor/misc/RngWrapper.hpp
index 727dd1a01f..13bdf47964 100644
--- a/include/picongpu/particles/functor/misc/RngWrapper.hpp
+++ b/include/picongpu/particles/functor/misc/RngWrapper.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2017-2020 Rene Widera
+/* Copyright 2017-2021 Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -26,49 +26,45 @@
 
 namespace picongpu
 {
-namespace particles
-{
-namespace functor
-{
-namespace misc
-{
-
-    /** wraps an random number generator together with an alpaka accelerator
-     *
-     * This class allows to generate random numbers without passing the accelerator
-     * to each functor call.
-     *
-     * @tparam T_Acc type of the alpaka accelerator
-     * @tparam T_Rng type of the random number generator
-     */
-    template<
-        typename T_Acc,
-        typename T_Rng
-    >
-    struct RngWrapper
+    namespace particles
     {
-        DINLINE RngWrapper(
-            T_Acc const & acc,
-            T_Rng const & rng
+        namespace functor
+        {
+            namespace misc
+            {
+                /** wraps a random number generator together with an alpaka accelerator
+                 *
+                 * This class allows to generate random numbers without passing the accelerator
+                 * to each functor call.
+                 *
+                 * @tparam T_Acc type of the alpaka accelerator
+                 * @tparam T_Rng type of the random number generator
+                 */
+                template<typename T_Acc, typename T_Rng>
+                struct RngWrapper
+                {
+                    DINLINE RngWrapper(
+                        T_Acc const& acc,
+                        T_Rng const& rng
 
-        ) :
-            m_acc( &acc ),
-            m_rng( rng )
-        { }
+                        )
+                        : m_acc(&acc)
+                        , m_rng(rng)
+                    {
+                    }
 
-        //! generate a random number
-        DINLINE
-        typename T_Rng::result_type
-        operator()()
-        {
-            return m_rng( *m_acc );
-        }
+                    //! generate a random number
+                    DINLINE
+                    typename T_Rng::result_type operator()()
+                    {
+                        return m_rng(*m_acc);
+                    }
 
-        T_Acc const * m_acc;
-        mutable T_Rng m_rng;
-    };
+                    T_Acc const* m_acc;
+                    mutable T_Rng m_rng;
+                };
 
-} // namepsace misc
-} // namespace functor
-} // namespace particles
+            } // namespace misc
+        } // namespace functor
+    } // namespace particles
 } // namespace picongpu
diff --git a/include/picongpu/particles/functor/misc/TotalCellOffset.def b/include/picongpu/particles/functor/misc/TotalCellOffset.def
index f4fac65b49..d3e46da377 100644
--- a/include/picongpu/particles/functor/misc/TotalCellOffset.def
+++ b/include/picongpu/particles/functor/misc/TotalCellOffset.def
@@ -1,4 +1,4 @@
-/* Copyright 2017-2020 Rene Widera
+/* Copyright 2017-2021 Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -22,15 +22,15 @@
 
 namespace picongpu
 {
-namespace particles
-{
-namespace functor
-{
-namespace misc
-{
-    //! Provide the cell offset of a supercell to the total domain origin
-    struct TotalCellOffset;
-} // namespace misc
-} // namespace functor
-} // namespace particles
+    namespace particles
+    {
+        namespace functor
+        {
+            namespace misc
+            {
+                //! Provide the cell offset of a supercell to the total domain origin
+                struct TotalCellOffset;
+            } // namespace misc
+        } // namespace functor
+    } // namespace particles
 } // namespace picongpu
diff --git a/include/picongpu/particles/functor/misc/TotalCellOffset.hpp b/include/picongpu/particles/functor/misc/TotalCellOffset.hpp
index 8d5a8edc81..33ba9e725c 100644
--- a/include/picongpu/particles/functor/misc/TotalCellOffset.hpp
+++ b/include/picongpu/particles/functor/misc/TotalCellOffset.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2017-2020 Rene Widera
+/* Copyright 2017-2021 Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -25,63 +25,55 @@
 
 namespace picongpu
 {
-namespace particles
-{
-namespace functor
-{
-namespace misc
-{
-    struct TotalCellOffset
+    namespace particles
     {
-
-        /** constructor
-         *
-         * @param currentStep current simulation time step
-         */
-        HINLINE TotalCellOffset( uint32_t currentStep )
-        {
-            uint32_t const numSlides = MovingWindow::getInstance( ).getSlideCounter( currentStep );
-            SubGrid< simDim > const & subGrid = Environment< simDim >::get( ).SubGrid( );
-            DataSpace< simDim > const localCells = subGrid.getLocalDomain( ).size;
-            gpuCellOffsetToTotalOrigin = subGrid.getLocalDomain( ).offset;
-            gpuCellOffsetToTotalOrigin.y( ) += numSlides * localCells.y( );
-        }
-
-        /** get cell offset of the supercell
-         *
-         * @tparam T_WorkerCfg pmacc::mappings::threads::WorkerCfg, configuration of the worker
-         * @tparam T_Acc alpaka accelerator type
-         *
-         * @param alpaka accelerator
-         * @param offset (in supercells, without any guards) to the
-         *         origin of the local domain
-         * @param configuration of the worker
-         */
-        template<
-            typename T_WorkerCfg,
-            typename T_Acc
-        >
-        HDINLINE DataSpace< simDim >
-        operator()(
-            T_Acc const & acc,
-            DataSpace< simDim > const & localSupercellOffset,
-            T_WorkerCfg const &
-        ) const
+        namespace functor
         {
-            DataSpace< simDim > const superCellToLocalOriginCellOffset(
-                localSupercellOffset * SuperCellSize::toRT( )
-            );
+            namespace misc
+            {
+                struct TotalCellOffset
+                {
+                    /** store the local domain offset, advanced in y by the local domain size per window slide
+                     *
+                     * @param currentStep current simulation time step, used to query the slide counter
+                     */
+                    HINLINE TotalCellOffset(uint32_t currentStep)
+                    {
+                        uint32_t const numSlides = MovingWindow::getInstance().getSlideCounter(currentStep);
+                        SubGrid<simDim> const& subGrid = Environment<simDim>::get().SubGrid();
+                        DataSpace<simDim> const localCells = subGrid.getLocalDomain().size;
+                        gpuCellOffsetToTotalOrigin = subGrid.getLocalDomain().offset;
+                        gpuCellOffsetToTotalOrigin.y() += numSlides * localCells.y();
+                    }
 
-            return gpuCellOffsetToTotalOrigin + superCellToLocalOriginCellOffset;
-        }
+                    /** get cell offset of the supercell
+                     *
+                     * @tparam T_WorkerCfg pmacc::mappings::threads::WorkerCfg, configuration of the worker
+                     * @tparam T_Acc alpaka accelerator type
+                     *
+                     * @param acc alpaka accelerator
+                     * @param localSupercellOffset offset (in supercells, without any guards) to the
+                     *         origin of the local domain
+                     * @note the unnamed third parameter is the configuration of the worker; it is not used
+                     */
+                    template<typename T_WorkerCfg, typename T_Acc>
+                    HDINLINE DataSpace<simDim> operator()(
+                        T_Acc const& acc,
+                        DataSpace<simDim> const& localSupercellOffset,
+                        T_WorkerCfg const&) const
+                    {
+                        DataSpace<simDim> const superCellToLocalOriginCellOffset(
+                            localSupercellOffset * SuperCellSize::toRT());
 
-    private:
+                        return gpuCellOffsetToTotalOrigin + superCellToLocalOriginCellOffset;
+                    }
 
-        //! offset in cells to the total domain origin
-        DataSpace< simDim > gpuCellOffsetToTotalOrigin;
-    };
+                private:
+                    //! offset in cells to the total domain origin
+                    DataSpace<simDim> gpuCellOffsetToTotalOrigin;
+                };
 
-} // namespace misc
-} // namespace functor
-} // namespace particles
+            } // namespace misc
+        } // namespace functor
+    } // namespace particles
 } // namespace picongpu
diff --git a/include/picongpu/particles/interpolationMemoryPolicy/ShiftToValidRange.hpp b/include/picongpu/particles/interpolationMemoryPolicy/ShiftToValidRange.hpp
index b59791d6e8..d28b6dcb47 100644
--- a/include/picongpu/particles/interpolationMemoryPolicy/ShiftToValidRange.hpp
+++ b/include/picongpu/particles/interpolationMemoryPolicy/ShiftToValidRange.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2016-2020 Richard Pausch
+/* Copyright 2016-2021 Richard Pausch
  *
  * This file is part of PIConGPU.
  *
@@ -18,39 +18,35 @@
  */
 
 
-
 namespace picongpu
 {
-namespace particles
-{
-
-namespace interpolationMemoryPolicy
-{
-/** Shift position to valid range [0,1)
- *  and repositions memory accordingly.
- *  This is necessary if a particle moves
- *  outside of its cell during a sub-stepping cycle
- *  Returns: shifted position and shifted memory. */
-struct ShiftToValidRange
-{
-    template< typename T_MemoryType, typename T_PosType >
-    HDINLINE
-    T_MemoryType memory( const T_MemoryType& mem, const T_PosType& pos ) const
+    namespace particles
     {
-        const T_PosType pos_floor = math::floor(pos);
-        return mem( precisionCast<int>(pos_floor) );
-    }
+        namespace interpolationMemoryPolicy
+        {
+            /** Shifts the position to the valid range [0,1)
+             *  and repositions memory accordingly.
+             *  This is necessary if a particle moves
+             *  outside of its cell during a sub-stepping cycle.
+             *  Returns: shifted position and shifted memory. */
+            struct ShiftToValidRange
+            {
+                template<typename T_MemoryType, typename T_PosType>
+                HDINLINE T_MemoryType memory(const T_MemoryType& mem, const T_PosType& pos) const
+                {
+                    const T_PosType pos_floor = math::floor(pos);
+                    return mem(precisionCast<int>(pos_floor));
+                }
 
-    template< typename T_PosType >
-    HDINLINE
-    T_PosType position( const T_PosType& pos ) const
-    {
-        const T_PosType pos_floor = math::floor(pos);
-        return pos - pos_floor;
-    }
-};
+                template<typename T_PosType>
+                HDINLINE T_PosType position(const T_PosType& pos) const
+                {
+                    const T_PosType pos_floor = math::floor(pos);
+                    return pos - pos_floor;
+                }
+            };
 
-} // namespace interpolationMemoryShift
+        } // namespace interpolationMemoryPolicy
 
-} // namespace particles
+    } // namespace particles
 } // namespace picongpu
diff --git a/include/picongpu/particles/ionization/None/AlgorithmNone.hpp b/include/picongpu/particles/ionization/None/AlgorithmNone.hpp
index f50d82735a..8755803ec4 100644
--- a/include/picongpu/particles/ionization/None/AlgorithmNone.hpp
+++ b/include/picongpu/particles/ionization/None/AlgorithmNone.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2014-2020 Marco Garten
+/* Copyright 2014-2021 Marco Garten
  *
  * This file is part of PIConGPU.
  *
@@ -31,36 +31,32 @@
 
 namespace picongpu
 {
-namespace particles
-{
-namespace ionization
-{
-
-    /** \struct AlgorithmNone
-     *
-     * \brief ionization algorithm that does nothing
-     */
-    struct AlgorithmNone
+    namespace particles
     {
-
-        /** Functor implementation
-         *
-         * \tparam EType type of electric field
-         * \tparam BType type of magnetic field
-         * \tparam ParticleType type of particle to be ionized
-         *
-         * \param bField magnetic field value at t=0
-         * \param eField electric field value at t=0
-         * \param parentIon particle instance to be ionized with position at t=0 and momentum at t=-1/2
-         */
-        template<typename EType, typename BType, typename ParticleType >
-        HDINLINE void
-        operator()( const BType bField, const EType eField, ParticleType& parentIon )
+        namespace ionization
         {
-
-        }
-    };
-
-} // namespace ionization
-} // namespace particles
+            /** \struct AlgorithmNone
+             *
+             * \brief ionization algorithm that does nothing
+             */
+            struct AlgorithmNone
+            {
+                /** Functor implementation
+                 *
+                 * \tparam EType type of electric field
+                 * \tparam BType type of magnetic field
+                 * \tparam ParticleType type of particle to be ionized
+                 *
+                 * \param bField magnetic field value at t=0
+                 * \param eField electric field value at t=0
+                 * \param parentIon particle instance to be ionized with position at t=0 and momentum at t=-1/2
+                 */
+                template<typename EType, typename BType, typename ParticleType>
+                HDINLINE void operator()(const BType bField, const EType eField, ParticleType& parentIon)
+                {
+                }
+            };
+
+        } // namespace ionization
+    } // namespace particles
 } // namespace picongpu
diff --git a/include/picongpu/particles/ionization/byCollision/ThomasFermi/AlgorithmThomasFermi.hpp b/include/picongpu/particles/ionization/byCollision/ThomasFermi/AlgorithmThomasFermi.hpp
index 324b2354ee..dc90fde0b3 100644
--- a/include/picongpu/particles/ionization/byCollision/ThomasFermi/AlgorithmThomasFermi.hpp
+++ b/include/picongpu/particles/ionization/byCollision/ThomasFermi/AlgorithmThomasFermi.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2016-2020 Marco Garten, Axel Huebl
+/* Copyright 2016-2021 Marco Garten, Axel Huebl
  *
  * This file is part of PIConGPU.
  *
@@ -36,218 +36,216 @@
 
 namespace picongpu
 {
-namespace particles
-{
-namespace ionization
-{
-
-    /** AlgorithmThomasFermi
-     *
-     * ionization prediction for the Thomas-Fermi ionization model
-     *
-     */
-    struct AlgorithmThomasFermi
+    namespace particles
     {
-        /** Detailed Balance implementation of the Thomas-Fermi model
-         *
-         * This model uses local ion density and "temperature" values as input
-         * parameters to calculate an average charge state.
-         * A physical temperature requires a defined equilibrium state.
-         * Typical high power laser-plasma interaction is highly
-         * non-equilibrated, though. The name "temperature" is kept to illustrate
-         * the origination from the Thomas-Fermi model. It is nevertheless
-         * more accurate to think of it as an averaged kinetic energy
-         * which is not backed by the model and should therefore only be used with
-         * a certain suspicion in such Non-LTE scenarios.
-         *
-         * @tparam ParticleType type of particle for which to calculate
-         *     an average charge state
-         *
-         * @param temperature electron "temperature" value calculated from average
-         *        kinetic electron energy per ion in units of eV
-         * @param massDensity ion mass density in units of g/cm^3
-         *
-         * @return average charge state prediction according to the Thomas-Fermi model
-         */
-        template< typename ParticleType >
-        HDINLINE float_X
-        detailedBalanceThomasFermi( float_X const temperature, float_X const massDensity, ParticleType & parentIon )
+        namespace ionization
         {
+            /** AlgorithmThomasFermi
+             *
+             * ionization prediction for the Thomas-Fermi ionization model
+             *
+             */
+            struct AlgorithmThomasFermi
+            {
+                /** Detailed Balance implementation of the Thomas-Fermi model
+                 *
+                 * This model uses local ion density and "temperature" values as input
+                 * parameters to calculate an average charge state.
+                 * A physical temperature requires a defined equilibrium state.
+                 * Typical high power laser-plasma interaction is highly
+                 * non-equilibrated, though. The name "temperature" is kept to illustrate
+                 * the origination from the Thomas-Fermi model. It is nevertheless
+                 * more accurate to think of it as an averaged kinetic energy
+                 * which is not backed by the model and should therefore only be used with
+                 * a certain suspicion in such Non-LTE scenarios.
+                 *
+                 * @tparam ParticleType type of particle for which to calculate
+                 *     an average charge state
+                 *
+                 * @param temperature electron "temperature" value calculated from average
+                 *        kinetic electron energy per ion in units of eV
+                 * @param massDensity ion mass density in units of g/cm^3
+                 *
+                 * @return average charge state prediction according to the Thomas-Fermi model
+                 */
+                template<typename ParticleType>
+                HDINLINE float_X detailedBalanceThomasFermi(
+                    float_X const temperature,
+                    float_X const massDensity,
+                    ParticleType& parentIon)
+                {
+                    /* @TODO replace the float_64 with float_X and make sure the values are scaled to PIConGPU units */
+                    constexpr float_64 protonNumber = GetAtomicNumbers<ParticleType>::type::numberOfProtons;
+                    constexpr float_64 neutronNumber = GetAtomicNumbers<ParticleType>::type::numberOfNeutrons;
 
-            /* @TODO replace the float_64 with float_X and make sure the values are scaled to PIConGPU units */
-            constexpr float_64 protonNumber = GetAtomicNumbers< ParticleType >::type::numberOfProtons;
-            constexpr float_64 neutronNumber = GetAtomicNumbers< ParticleType >::type::numberOfNeutrons;
-
-            /* atomic mass number (usually A) A = N + Z */
-            constexpr float_64 massNumber = neutronNumber + protonNumber;
-
-            float_64 const T_0 = temperature / math::pow( protonNumber, float_64( 4. / 3. ) );
-
-            float_64 const T_F = T_0 / ( float_64( 1. ) + T_0 );
-
-            /* for all the fitting parameters @see ionizer.param */
-
-            /** this is weird - I have to define temporary variables because
-             * otherwise the math::pow function won't recognize those at the
-             * exponent position */
-            constexpr float_64 TFA2_temp = thomasFermi::TFA2;
-            constexpr float_64 TFA4_temp = thomasFermi::TFA4;
-            constexpr float_64 TFBeta_temp = thomasFermi::TFBeta;
-
-            float_64 const A = thomasFermi::TFA1 * math::pow( T_0, TFA2_temp ) + thomasFermi::TFA3 * math::pow( T_0, TFA4_temp );
-
-            float_64 const B = -math::exp( thomasFermi::TFB0 + thomasFermi::TFB1 * T_F + thomasFermi::TFB2 * math::pow( T_F, float_64( 7. ) ) );
-
-            float_64 const C = thomasFermi::TFC1 * T_F + thomasFermi::TFC2;
-
-            constexpr float_64 invAtomicTimesMassNumber = float_64( 1. ) / ( protonNumber * massNumber );
-            float_64 const R = massDensity * invAtomicTimesMassNumber;
-
-            float_64 const Q_1 = A * math::pow( R, B );
-
-            float_64 const Q = math::pow( math::pow( R, C ) + math::pow( Q_1, C ), float_64( 1. ) / C );
+                    /* atomic mass number (usually A) A = N + Z */
+                    constexpr float_64 massNumber = neutronNumber + protonNumber;
 
-            float_64 const x = thomasFermi::TFAlpha * math::pow( Q, TFBeta_temp );
+                    float_64 const T_0 = temperature / math::pow(protonNumber, float_64(4. / 3.));
 
-            /* Thomas-Fermi average ionization state */
-            float_X const ZStar = static_cast< float_X >(
-                protonNumber * x / (
-                    float_64( 1. ) + x +
-                    math::sqrt( float_64( 1. ) + float_64( 2. ) * x )
-                )
-            );
+                    float_64 const T_F = T_0 / (float_64(1.) + T_0);
 
-            return ZStar;
-        }
+                    /* for all the fitting parameters @see ionizer.param */
 
-        /** Functor implementation
-         *
-         * Calling this functor gives a prediction for an integer number of new
-         * free macro electrons to create. This prediction is based on the
-         * average charge state in the Thomas-Fermi model.
-         * The functor calculates the integer number of bound electrons from
-         * this state by a Monte-Carlo step.
-         *
-         * @tparam ParticleType type of particle to be ionized
-         *
-         * @param ZStar average charge state in the Thomas-Fermi model
-         * @param parentIon particle instance to be ionized
-         * @param randNr random number
-         *
-         * @return numNewFreeMacroElectrons number of new macro electrons to
-         *         create, range: [0, boundElectrons]
-         */
-        template< typename ParticleType >
-        HDINLINE uint32_t
-        operator( )( float_X const kinEnergyDensity, float_X const density, ParticleType & parentIon, float_X randNr )
-        {
+                    /** this is weird - I have to define temporary variables because
+                     * otherwise the math::pow function won't recognize those at the
+                     * exponent position */
+                    constexpr float_64 TFA2_temp = thomasFermi::TFA2;
+                    constexpr float_64 TFA4_temp = thomasFermi::TFA4;
+                    constexpr float_64 TFBeta_temp = thomasFermi::TFBeta;
 
-            /* initialize functor return value: number of new macro electrons to create */
-            uint32_t numNewFreeMacroElectrons = 0u;
+                    float_64 const A = thomasFermi::TFA1 * math::pow(T_0, TFA2_temp)
+                        + thomasFermi::TFA3 * math::pow(T_0, TFA4_temp);
 
-            float_64 const densityUnit = static_cast< float_64 >( particleToGrid::derivedAttributes::Density( ).getUnit( )[ 0 ] );
-            float_64 const kinEnergyDensityUnit = static_cast< float_64 >( particleToGrid::derivedAttributes::EnergyDensity( ).getUnit( )[ 0 ] );
-            /* convert from kinetic energy density to average kinetic energy per particle */
-            float_64 const kinEnergyUnit = kinEnergyDensityUnit / densityUnit;
-            float_64 const avgKinEnergy = kinEnergyDensity / density * kinEnergyUnit;
-            /** convert kinetic energy in J to "temperature" in eV by assuming an ideal electron gas
-             * E_kin = 3/2 k*T
-             */
-            constexpr float_64 convKinEnergyToTemperature = UNITCONV_Joule_to_keV * float_64( 1.e3 ) * float_64( 2./3. );
-            /** electron "temperature" in electron volts */
-            float_64 const temperature = avgKinEnergy * convKinEnergyToTemperature;
+                    float_64 const B = -math::exp(
+                        thomasFermi::TFB0 + thomasFermi::TFB1 * T_F
+                        + thomasFermi::TFB2 * math::pow(T_F, float_64(7.)));
 
-            /* conversion factors from number density to mass density */
-            constexpr float_64 nAvogadro = SI::N_AVOGADRO;
-            constexpr float_64 convM3ToCM3 = 1.e6;
+                    float_64 const C = thomasFermi::TFC1 * T_F + thomasFermi::TFC2;
 
-            /* @TODO replace the float_64 with float_X and make sure the values are scaled to PIConGPU units */
-            constexpr float_64 protonNumber = GetAtomicNumbers<ParticleType>::type::numberOfProtons;
-            constexpr float_64 neutronNumber = GetAtomicNumbers<ParticleType>::type::numberOfNeutrons;
+                    constexpr float_64 invAtomicTimesMassNumber = float_64(1.) / (protonNumber * massNumber);
+                    float_64 const R = massDensity * invAtomicTimesMassNumber;
 
-            /* atomic mass number (usually A) A = N + Z */
-            constexpr float_64 massNumber = neutronNumber + protonNumber;
+                    float_64 const Q_1 = A * math::pow(R, B);
 
-            float_64 const convToMassDensity = densityUnit * massNumber / nAvogadro / convM3ToCM3;
-            /** mass density in units of g/cm^3 */
-            float_64 const massDensity = density * convToMassDensity;
+                    float_64 const Q = math::pow(math::pow(R, C) + math::pow(Q_1, C), float_64(1.) / C);
 
-            /** lower ion density cutoff
-             *
-             * The Thomas-Fermi model yields unphysical artifacts for low densities.
-             * If `density` is lower than a user-definable ion number density value the model will not be applied.
-             */
-            constexpr float_X lowerDensityCutoff = particles::ionization::thomasFermi::CUTOFF_LOW_DENSITY;
-            /** lower electron temperature cutoff
-             *
-             * The Thomas-Fermi model also yields partly unphysical artifacts for low electron temperatures.
-             * If `temperature` is lower than a user-definable ion number temperature value the model will not be applied.
-             */
-            constexpr float_X lowerTemperatureCutoff = particles::ionization::thomasFermi::CUTOFF_LOW_TEMPERATURE_EV;
+                    float_64 const x = thomasFermi::TFAlpha * math::pow(Q, TFBeta_temp);
 
-            if(
-                density * densityUnit >= lowerDensityCutoff &&
-                temperature >= lowerTemperatureCutoff
-            )
-            {
+                    /* Thomas-Fermi average ionization state */
+                    float_X const ZStar = static_cast<float_X>(
+                        protonNumber * x / (float_64(1.) + x + math::sqrt(float_64(1.) + float_64(2.) * x)));
 
-                float_64 const chargeState = attribute::getChargeState( parentIon );
-                /* @TODO replace the float_64 with float_X and make sure the values are scaled to PIConGPU units */
-                constexpr float_64 protonNumber = GetAtomicNumbers< ParticleType >::type::numberOfProtons;
+                    return ZStar;
+                }
 
-                /* only ionize not-fully ionized ions */
-                if( chargeState < protonNumber )
+                /** Functor implementation
+                 *
+                 * Calling this functor gives a prediction for an integer number of new
+                 * free macro electrons to create. This prediction is based on the
+                 * average charge state in the Thomas-Fermi model.
+                 * The functor calculates the integer number of bound electrons from
+                 * this state by a Monte-Carlo step.
+                 *
+                 * @tparam ParticleType type of particle to be ionized
+                 *
+                 * @param kinEnergyDensity,density kinetic energy density and number density
+                 * @param parentIon particle instance to be ionized
+                 * @param randNr random number
+                 *
+                 * @return numNewFreeMacroElectrons number of new macro electrons to
+                 *         create, range: [0, boundElectrons]
+                 */
+                template<typename ParticleType>
+                HDINLINE uint32_t operator()(
+                    float_X const kinEnergyDensity,
+                    float_X const density,
+                    ParticleType& parentIon,
+                    float_X randNr)
                 {
-                    /* Thomas-Fermi calculation step:
-                     * Determines the new average charge state for each ion under
-                     * LTE conditions.
+                    /* initialize functor return value: number of new macro electrons to create */
+                    uint32_t numNewFreeMacroElectrons = 0u;
+
+                    float_64 const densityUnit
+                        = static_cast<float_64>(particleToGrid::derivedAttributes::Density().getUnit()[0]);
+                    float_64 const kinEnergyDensityUnit
+                        = static_cast<float_64>(particleToGrid::derivedAttributes::EnergyDensity().getUnit()[0]);
+                    /* convert from kinetic energy density to average kinetic energy per particle */
+                    float_64 const kinEnergyUnit = kinEnergyDensityUnit / densityUnit;
+                    float_64 const avgKinEnergy = kinEnergyDensity / density * kinEnergyUnit;
+                    /** convert kinetic energy in J to "temperature" in eV by assuming an ideal electron gas
+                     * E_kin = 3/2 k*T
                      */
-                    float_X const ZStar = detailedBalanceThomasFermi(
-                        temperature,
-                        massDensity,
-                        parentIon
-                    );
-
-                    /* integral part of the average charge state */
-                    float_X intZStar;
-                    /* fractional part of the average charge state */
-                    float_X const fracZStar = math::modf( ZStar, &intZStar );
-
-                    /* Determine new charge state.
-                     * We do a Monte-Carlo step to distribute charge states between
-                     * the two "surrounding" integer numbers if ZStar has a non-zero
-                     * fractional part.
-                     */
-                    float_X const newChargeState =
-                        intZStar +
-                        float_X( 1.0 ) * ( randNr < fracZStar );
-
-                    /* define number of bound macro electrons before ionization */
-                    float_X const prevBoundElectrons = parentIon[ boundElectrons_ ];
-
-                    /** determine the new number of bound electrons from the TF ionization state
-                     * @TODO introduce partial macroparticle ionization / ionization distribution at some point
+                    constexpr float_64 convKinEnergyToTemperature
+                        = UNITCONV_Joule_to_keV * float_64(1.e3) * float_64(2. / 3.);
+                    /** electron "temperature" in electron volts */
+                    float_64 const temperature = avgKinEnergy * convKinEnergyToTemperature;
+
+                    /* conversion factors from number density to mass density */
+                    constexpr float_64 nAvogadro = SI::N_AVOGADRO;
+                    constexpr float_64 convM3ToCM3 = 1.e6;
+
+                    /* @TODO replace the float_64 with float_X and make sure the values are scaled to PIConGPU units */
+                    constexpr float_64 protonNumber = GetAtomicNumbers<ParticleType>::type::numberOfProtons;
+                    constexpr float_64 neutronNumber = GetAtomicNumbers<ParticleType>::type::numberOfNeutrons;
+
+                    /* atomic mass number (usually A) A = N + Z */
+                    constexpr float_64 massNumber = neutronNumber + protonNumber;
+
+                    float_64 const convToMassDensity = densityUnit * massNumber / nAvogadro / convM3ToCM3;
+                    /** mass density in units of g/cm^3 */
+                    float_64 const massDensity = density * convToMassDensity;
+
+                    /** lower ion density cutoff
+                     *
+                     * The Thomas-Fermi model yields unphysical artifacts for low densities.
+                     * If `density` is lower than a user-definable ion number density value the model will not be
+                     * applied.
                      */
-                    float_X const newBoundElectrons = protonNumber - newChargeState;
-
-                    /* Only account for ionization: we only increase the charge
-                     * state of an ion if necessary, but ignore recombination of
-                     * electrons as prediced by the implemented detailed balance
-                     * algorithm.
+                    constexpr float_X lowerDensityCutoff = particles::ionization::thomasFermi::CUTOFF_LOW_DENSITY;
+                    /** lower electron temperature cutoff
+                     *
+                     * The Thomas-Fermi model also yields partly unphysical artifacts for low electron temperatures.
+                     * If `temperature` is lower than a user-definable electron temperature value the model will not
+                     * be applied.
                      */
-                    if( prevBoundElectrons > newBoundElectrons )
-                       /* determine number of new free macro electrons
-                        * to be created in the ionization routine
-                        */
-                        numNewFreeMacroElectrons = static_cast< uint32_t >( prevBoundElectrons - newBoundElectrons );
+                    constexpr float_X lowerTemperatureCutoff
+                        = particles::ionization::thomasFermi::CUTOFF_LOW_TEMPERATURE_EV;
+
+                    if(density * densityUnit >= lowerDensityCutoff && temperature >= lowerTemperatureCutoff)
+                    {
+                        float_64 const chargeState = attribute::getChargeState(parentIon);
+                        /* @TODO replace the float_64 with float_X and make sure the values are scaled to PIConGPU
+                         * units */
+                        constexpr float_64 protonNumber = GetAtomicNumbers<ParticleType>::type::numberOfProtons;
+
+                        /* only ionize not-fully ionized ions */
+                        if(chargeState < protonNumber)
+                        {
+                            /* Thomas-Fermi calculation step:
+                             * Determines the new average charge state for each ion under
+                             * LTE conditions.
+                             */
+                            float_X const ZStar = detailedBalanceThomasFermi(temperature, massDensity, parentIon);
+
+                            /* integral part of the average charge state */
+                            float_X intZStar;
+                            /* fractional part of the average charge state */
+                            float_X const fracZStar = pmacc::math::modf(ZStar, &intZStar);
+
+                            /* Determine new charge state.
+                             * We do a Monte-Carlo step to distribute charge states between
+                             * the two "surrounding" integer numbers if ZStar has a non-zero
+                             * fractional part.
+                             */
+                            float_X const newChargeState = intZStar + float_X(1.0) * (randNr < fracZStar);
+
+                            /* define number of bound macro electrons before ionization */
+                            float_X const prevBoundElectrons = parentIon[boundElectrons_];
+
+                            /** determine the new number of bound electrons from the TF ionization state
+                             * @TODO introduce partial macroparticle ionization / ionization distribution at some point
+                             */
+                            float_X const newBoundElectrons = protonNumber - newChargeState;
+
+                            /* Only account for ionization: we only increase the charge
+                             * state of an ion if necessary, but ignore recombination of
+                             * electrons as predicted by the implemented detailed balance
+                             * algorithm.
+                             */
+                            if(prevBoundElectrons > newBoundElectrons)
+                                /* determine number of new free macro electrons
+                                 * to be created in the ionization routine
+                                 */
+                                numNewFreeMacroElectrons
+                                    = static_cast<uint32_t>(prevBoundElectrons - newBoundElectrons);
+                        }
+                    }
+
+                    return numNewFreeMacroElectrons;
                 }
-            }
-
-            return numNewFreeMacroElectrons;
-        }
-
-    };
+            };
 
-} // namespace ionization
-} // namespace particles
+        } // namespace ionization
+    } // namespace particles
 } // namespace picongpu
diff --git a/include/picongpu/particles/ionization/byCollision/ThomasFermi/ThomasFermi.def b/include/picongpu/particles/ionization/byCollision/ThomasFermi/ThomasFermi.def
index 52a3dd523f..1a4848d9e2 100644
--- a/include/picongpu/particles/ionization/byCollision/ThomasFermi/ThomasFermi.def
+++ b/include/picongpu/particles/ionization/byCollision/ThomasFermi/ThomasFermi.def
@@ -1,4 +1,4 @@
-/* Copyright 2016-2020 Marco Garten
+/* Copyright 2016-2021 Marco Garten
  *
  * This file is part of PIConGPU.
  *
@@ -23,54 +23,54 @@
 
 namespace picongpu
 {
-namespace particles
-{
-namespace ionization
-{
-    /** Thomas-Fermi impact ionization model
-     *
-     * \tparam T_DestSpecies electron species to be created
-     * \tparam T_SrcSpecies particle species that is ionized
-     *         default is boost::mpl placeholder because specialization
-     *         cannot be known in list of particle species' flags
-     *         \see speciesDefinition.param
-     */
-    template<typename T_IonizationAlgorithm, typename T_DestSpecies, typename T_SrcSpecies = bmpl::_1>
-    struct ThomasFermi_Impl;
-
-    /** Thomas-Fermi impact ionization model
-     *
-     * This ionization model describes the atom inside the Thomas-Fermi framework
-     * in a self-consistent way. There the electrons are modeled as a density
-     * with respect to the distance from the core while the atomic core is often
-     * assumed as a point charge. The atomic potential is considered to be finite
-     * as a result of matter density and it defines the so-called "ion sphere".
-     * Due to the overlap of adjacent ion spheres the ionization barrier can be
-     * lowered and electrons become quasi-free in the system (resonance states).
-     * The Thomas-Fermi model calculates an average ionization degree only with
-     * respect to charge density and temperature. Through further assumptions
-     * and fitting parameters the model gets extended to arbitrary temperatures
-     * and atoms.
-     *
-     * See table IV from Pressure Ionization, Resonances, and the Continuity of
-     * Bound and Free States
-     * \url http://www.sciencedirect.com/science/article/pii/S0065219908601451
-     * doi:10.1016/S0065-2199(08)60145-1
-     *
-     * \tparam T_DestSpecies electron species to be created
-     *
-     * wrapper class,
-     * needed because the SrcSpecies cannot be known during the
-     * first specialization of the ionization model in the particle definition
-     * \see speciesDefinition.param
-     */
-    template<typename T_DestSpecies>
-    struct ThomasFermi
+    namespace particles
     {
-        using IonizationAlgorithm = particles::ionization::AlgorithmThomasFermi;
-        using type = ThomasFermi_Impl<IonizationAlgorithm, T_DestSpecies>;
-    };
+        namespace ionization
+        {
+            /** Thomas-Fermi impact ionization model
+             *
+             * \tparam T_DestSpecies electron species to be created
+             * \tparam T_SrcSpecies particle species that is ionized
+             *         default is boost::mpl placeholder because specialization
+             *         cannot be known in list of particle species' flags
+             *         \see speciesDefinition.param
+             */
+            template<typename T_IonizationAlgorithm, typename T_DestSpecies, typename T_SrcSpecies = bmpl::_1>
+            struct ThomasFermi_Impl;
+
+            /** Thomas-Fermi impact ionization model
+             *
+             * This ionization model describes the atom inside the Thomas-Fermi framework
+             * in a self-consistent way. There the electrons are modeled as a density
+             * with respect to the distance from the core while the atomic core is often
+             * assumed as a point charge. The atomic potential is considered to be finite
+             * as a result of matter density and it defines the so-called "ion sphere".
+             * Due to the overlap of adjacent ion spheres the ionization barrier can be
+             * lowered and electrons become quasi-free in the system (resonance states).
+             * The Thomas-Fermi model calculates an average ionization degree only with
+             * respect to charge density and temperature. Through further assumptions
+             * and fitting parameters the model gets extended to arbitrary temperatures
+             * and atoms.
+             *
+             * See table IV from Pressure Ionization, Resonances, and the Continuity of
+             * Bound and Free States
+             * \url http://www.sciencedirect.com/science/article/pii/S0065219908601451
+             * doi:10.1016/S0065-2199(08)60145-1
+             *
+             * \tparam T_DestSpecies electron species to be created
+             *
+             * wrapper class,
+             * needed because the SrcSpecies cannot be known during the
+             * first specialization of the ionization model in the particle definition
+             * \see speciesDefinition.param
+             */
+            template<typename T_DestSpecies>
+            struct ThomasFermi
+            {
+                using IonizationAlgorithm = particles::ionization::AlgorithmThomasFermi;
+                using type = ThomasFermi_Impl<IonizationAlgorithm, T_DestSpecies>;
+            };
 
-} // namespace ionization
-} // namespace particles
+        } // namespace ionization
+    } // namespace particles
 } // namespace picongpu
diff --git a/include/picongpu/particles/ionization/byCollision/ThomasFermi/ThomasFermi_Impl.hpp b/include/picongpu/particles/ionization/byCollision/ThomasFermi/ThomasFermi_Impl.hpp
index 5687955ced..823bfd1268 100644
--- a/include/picongpu/particles/ionization/byCollision/ThomasFermi/ThomasFermi_Impl.hpp
+++ b/include/picongpu/particles/ionization/byCollision/ThomasFermi/ThomasFermi_Impl.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2016-2020 Marco Garten, Axel Huebl
+/* Copyright 2016-2021 Marco Garten, Axel Huebl
  *
  * This file is part of PIConGPU.
  *
@@ -22,7 +22,6 @@
 #include "picongpu/simulation_defines.hpp"
 #include <pmacc/traits/Resolve.hpp>
 #include <pmacc/particles/meta/FindByNameOrType.hpp>
-#include "picongpu/traits/UsesRNG.hpp"
 
 #include "picongpu/fields/CellType.hpp"
 #include "picongpu/fields/FieldTmp.hpp"
@@ -43,357 +42,289 @@
 
 namespace picongpu
 {
-namespace traits
-{
-    /** specialization of the UsesRNG trait
-     * --> ionization module uses random number generation
-     */
-    template<typename T_IonizationAlgorithm, typename T_DestSpecies, typename T_SrcSpecies>
-    struct UsesRNG<particles::ionization::ThomasFermi_Impl<T_IonizationAlgorithm, T_DestSpecies, T_SrcSpecies> > :
-    public boost::true_type
+    namespace particles
     {
-    };
-} // namespace traits
-
-namespace particles
-{
-namespace ionization
-{
-
-    /** ThomasFermi_Impl
-     *
-     * Thomas-Fermi pressure ionization - Implementation
-     *
-     * @tparam T_IonizationAlgorithm functor that returns a number of
-     *         new free macro electrons to create, range: [0, boundElectrons]
-     * @tparam T_DestSpecies type or name as boost::mpl::string of the electron species to be created
-     * @tparam T_SrcSpecies type or name as boost::mpl::string of the particle species that is ionized
-     */
-    template<typename T_IonizationAlgorithm, typename T_DestSpecies, typename T_SrcSpecies>
-    struct ThomasFermi_Impl
-    {
-
-        using DestSpecies = pmacc::particles::meta::FindByNameOrType_t<
-            VectorAllSpecies,
-            T_DestSpecies
-        >;
-        using SrcSpecies = pmacc::particles::meta::FindByNameOrType_t<
-            VectorAllSpecies,
-            T_SrcSpecies
-        >;
-
-        using FrameType =  typename SrcSpecies::FrameType;
-
-        /** specify field to particle interpolation scheme
-         *
-         * @todo this needs to be done independently/twice if ion species (rho) and electron
-         *       species (ene) are of different shape
-         */
-        using Field2ParticleInterpolation = typename pmacc::traits::Resolve<
-            typename GetFlagType<FrameType,interpolation<> >::type
-        >::type;
-
-        /* margins around the supercell for the interpolation of the field on the cells */
-        using LowerMargin = typename GetMargin<Field2ParticleInterpolation>::LowerMargin ;
-        using UpperMargin = typename GetMargin<Field2ParticleInterpolation>::UpperMargin;
-
-        /* relevant area of a block */
-        using BlockArea = SuperCellDescription<
-            typename MappingDesc::SuperCellSize,
-            LowerMargin,
-            UpperMargin
-            >;
-
-        BlockArea BlockDescription;
-
-        /* parameter class containing the energy cutoff parameter for electron temperature calculation */
-        struct CutoffMaxEnergy
+        namespace ionization
         {
-            static constexpr float_X cutoffMaxEnergy =
-                particles::ionization::thomasFermi::CUTOFF_MAX_ENERGY;
-        };
-
-        private:
-
-            /* define ionization ALGORITHM (calculation) for ionization MODEL */
-            using IonizationAlgorithm =  T_IonizationAlgorithm;
-
-            /* random number generator */
-            using RNGFactory = pmacc::random::RNGProvider<simDim, random::Generator>;
-            using Distribution = pmacc::random::distributions::Uniform<float_X>;
-            using RandomGen = typename RNGFactory::GetRandomType<Distribution>::type;
-            RandomGen randomGen;
-
-            using SuperCellSize = MappingDesc::SuperCellSize;
-
-            using ValueType_Rho = FieldTmp::ValueType;
-            using ValueType_Ene = FieldTmp::ValueType ;
-
-            /* global memory EM-field device databoxes */
-            PMACC_ALIGN(rhoBox, FieldTmp::DataBoxType);
-            PMACC_ALIGN(eneBox, FieldTmp::DataBoxType);
-
-            /* shared memory EM-field device databoxes */
-            PMACC_ALIGN(cachedRho, DataBox<SharedBox<ValueType_Rho, typename BlockArea::FullSuperCellSize,0> >);
-            PMACC_ALIGN(cachedEne, DataBox<SharedBox<ValueType_Ene, typename BlockArea::FullSuperCellSize,1> >);
-
-        public:
-            /* host constructor initializing member : random number generator */
-            ThomasFermi_Impl(const uint32_t currentStep) : randomGen(RNGFactory::createRandom<Distribution>())
+            /** ThomasFermi_Impl
+             *
+             * Thomas-Fermi pressure ionization - Implementation
+             *
+             * @tparam T_IonizationAlgorithm functor that returns a number of
+             *         new free macro electrons to create, range: [0, boundElectrons]
+             * @tparam T_DestSpecies type or name as boost::mpl::string of the electron species to be created
+             * @tparam T_SrcSpecies type or name as boost::mpl::string of the particle species that is ionized
+             */
+            template<typename T_IonizationAlgorithm, typename T_DestSpecies, typename T_SrcSpecies>
+            struct ThomasFermi_Impl
             {
-                /* create handle for access to host and device data */
-                DataConnector &dc = Environment<>::get().DataConnector();
+                using DestSpecies = pmacc::particles::meta::FindByNameOrType_t<VectorAllSpecies, T_DestSpecies>;
+                using SrcSpecies = pmacc::particles::meta::FindByNameOrType_t<VectorAllSpecies, T_SrcSpecies>;
 
-                /* The compiler is allowed to evaluate an expression that does not depend on a template parameter
-                 * even if the class is never instantiated. In that case static assert is always
-                 * evaluated (e.g. with clang), this results in an error if the condition is false.
-                 * http://www.boost.org/doc/libs/1_60_0/doc/html/boost_staticassert.html
+                using FrameType = typename SrcSpecies::FrameType;
+
+                /** specify field to particle interpolation scheme
                  *
-                 * A workaround is to add a template dependency to the expression.
-                 * `sizeof(ANY_TYPE) != 0` is always true and defers the evaluation.
+                 * @todo this needs to be done independently/twice if ion species (rho) and electron
+                 *       species (ene) are of different shape
                  */
-                PMACC_CASSERT_MSG(
-                    _please_allocate_at_least_two_FieldTmp_slots_in_memory_param,
-                    ( fieldTmpNumSlots >= 2 ) && ( sizeof( T_IonizationAlgorithm ) != 0 )
-                );
-                /* initialize pointers on host-side density-/energy density field databoxes */
-                auto density = dc.get< FieldTmp >( FieldTmp::getUniqueId( 0 ), true );
-                auto eneKinDens = dc.get< FieldTmp >( FieldTmp::getUniqueId( 1 ), true );
-
-                /* reset density and kinetic energy values to zero */
-                density->getGridBuffer().getDeviceBuffer().setValue( FieldTmp::ValueType( 0. ) );
-                eneKinDens->getGridBuffer().getDeviceBuffer().setValue( FieldTmp::ValueType( 0. ) );
-
-                /* load species without copying the particle data to the host */
-                auto srcSpecies = dc.get< SrcSpecies >( SrcSpecies::FrameType::getName(), true );
-
-                /** Calculate weighted ion density
+                using Field2ParticleInterpolation =
+                    typename pmacc::traits::Resolve<typename GetFlagType<FrameType, interpolation<>>::type>::type;
+
+                /* margins around the supercell for the interpolation of the field on the cells */
+                using LowerMargin = typename GetMargin<Field2ParticleInterpolation>::LowerMargin;
+                using UpperMargin = typename GetMargin<Field2ParticleInterpolation>::UpperMargin;
+
+                /* relevant area of a block */
+                using BlockArea = SuperCellDescription<typename MappingDesc::SuperCellSize, LowerMargin, UpperMargin>;
+
+                BlockArea BlockDescription;
+
+                /* parameter class containing the energy cutoff parameter for electron temperature calculation */
+                struct CutoffMaxEnergy
+                {
+                    static constexpr float_X cutoffMaxEnergy = particles::ionization::thomasFermi::CUTOFF_MAX_ENERGY;
+                };
+
+            private:
+                /* define ionization ALGORITHM (calculation) for ionization MODEL */
+                using IonizationAlgorithm = T_IonizationAlgorithm;
+
+                /* random number generator */
+                using RNGFactory = pmacc::random::RNGProvider<simDim, random::Generator>;
+                using Distribution = pmacc::random::distributions::Uniform<float_X>;
+                using RandomGen = typename RNGFactory::GetRandomType<Distribution>::type;
+                RandomGen randomGen;
+
+                using SuperCellSize = MappingDesc::SuperCellSize;
+
+                using ValueType_Rho = FieldTmp::ValueType;
+                using ValueType_Ene = FieldTmp::ValueType;
+
+                /* global memory density and energy density field device databoxes */
+                PMACC_ALIGN(rhoBox, FieldTmp::DataBoxType);
+                PMACC_ALIGN(eneBox, FieldTmp::DataBoxType);
+
+                /* shared memory density and energy density field device databoxes */
+                PMACC_ALIGN(cachedRho, DataBox<SharedBox<ValueType_Rho, typename BlockArea::FullSuperCellSize, 0>>);
+                PMACC_ALIGN(cachedEne, DataBox<SharedBox<ValueType_Ene, typename BlockArea::FullSuperCellSize, 1>>);
+
+            public:
+                /* host constructor: creates the RNG and computes the density and energy density fields */
+                ThomasFermi_Impl(const uint32_t currentStep) : randomGen(RNGFactory::createRandom<Distribution>())
+                {
+                    /* create handle for access to host and device data */
+                    DataConnector& dc = Environment<>::get().DataConnector();
+
+                    /* The compiler is allowed to evaluate an expression that does not depend on a template parameter
+                     * even if the class is never instantiated. In that case static assert is always
+                     * evaluated (e.g. with clang), this results in an error if the condition is false.
+                     * http://www.boost.org/doc/libs/1_60_0/doc/html/boost_staticassert.html
+                     *
+                     * A workaround is to add a template dependency to the expression.
+                     * `sizeof(ANY_TYPE) != 0` is always true and defers the evaluation.
+                     */
+                    PMACC_CASSERT_MSG(
+                        _please_allocate_at_least_two_FieldTmp_slots_in_memory_param,
+                        (fieldTmpNumSlots >= 2) && (sizeof(T_IonizationAlgorithm) != 0));
+                    /* fetch the two FieldTmp slots used for ion density (0) and electron energy density (1) */
+                    auto density = dc.get<FieldTmp>(FieldTmp::getUniqueId(0), true);
+                    auto eneKinDens = dc.get<FieldTmp>(FieldTmp::getUniqueId(1), true);
+
+                    /* reset density and kinetic energy density values in the device buffers to zero */
+                    density->getGridBuffer().getDeviceBuffer().setValue(FieldTmp::ValueType(0.));
+                    eneKinDens->getGridBuffer().getDeviceBuffer().setValue(FieldTmp::ValueType(0.));
+
+                    /* load species without copying the particle data to the host */
+                    auto srcSpecies = dc.get<SrcSpecies>(SrcSpecies::FrameType::getName(), true);
+
+                    /** Calculate weighted ion density
+                     *
+                     * @todo Include all ion species because the model requires the
+                     *       density of ionic potential wells
+                     */
+                    using DensitySolver = typename particleToGrid::
+                        CreateFieldTmpOperation_t<SrcSpecies, particleToGrid::derivedAttributes::Density>::Solver;
+                    density->template computeValue<CORE + BORDER, DensitySolver>(*srcSpecies, currentStep);
+                    dc.releaseData(SrcSpecies::FrameType::getName());
+                    EventTask densityEvent = density->asyncCommunication(__getTransactionEvent());
+                    densityEvent += density->asyncCommunicationGather(densityEvent);
+
+                    /* load species without copying the particle data to the host */
+                    auto destSpecies = dc.get<DestSpecies>(DestSpecies::FrameType::getName(), true);
+
+                    /** Calculate energy density of the electron species with maximum energy cutoff
+                     *
+                     *  @todo Include all electron species with a meta::ForEach<VectorAllSpecies,...>
+                     * instead of just the destination species
+                     */
+                    using EnergyDensitySolver = typename particleToGrid::CreateFieldTmpOperation_t<
+                        DestSpecies,
+                        particleToGrid::derivedAttributes::EnergyDensityCutoff<CutoffMaxEnergy>>::Solver;
+                    eneKinDens->template computeValue<CORE + BORDER, EnergyDensitySolver>(*destSpecies, currentStep);
+                    dc.releaseData(DestSpecies::FrameType::getName());
+                    EventTask eneKinEvent = eneKinDens->asyncCommunication(__getTransactionEvent());
+                    eneKinEvent += eneKinDens->asyncCommunicationGather(eneKinEvent);
+
+                    /* contributions from neighboring GPUs to our border area */
+                    __setTransactionEvent(densityEvent + eneKinEvent);
+
+                    /* initialize device-side density- and energy density field databox pointers */
+                    rhoBox = density->getDeviceDataBox();
+                    eneBox = eneKinDens->getDeviceDataBox();
+                }
+
+                /** cache fields used by this functor
+                 *
+                 * @warning this is a collective method and calls synchronize
+                 *
+                 * @tparam T_Acc alpaka accelerator type
+                 * @tparam T_WorkerCfg pmacc::mappings::threads::WorkerCfg, configuration of the worker
                  *
-                 * @todo Include all ion species because the model requires the
-                 *       density of ionic potential wells
+                 * @param acc alpaka accelerator
+                 * @param blockCell relative offset (in cells) to the local domain plus the guarding cells
+                 * @param workerCfg configuration of the worker
                  */
-                using DensitySolver = typename particleToGrid::CreateFieldTmpOperation_t<
-                    SrcSpecies,
-                    particleToGrid::derivedAttributes::Density
-                >::Solver;
-                density->template computeValue< CORE + BORDER, DensitySolver >(*srcSpecies, currentStep);
-                dc.releaseData( SrcSpecies::FrameType::getName() );
-                EventTask densityEvent = density->asyncCommunication( __getTransactionEvent() );
-                densityEvent += density->asyncCommunicationGather( densityEvent );
-
-                /* load species without copying the particle data to the host */
-                auto destSpecies = dc.get< DestSpecies >( DestSpecies::FrameType::getName(), true );
-
-                /** Calculate energy density of the electron species with maximum energy cutoff
+                template<typename T_Acc, typename T_WorkerCfg>
+                DINLINE void collectiveInit(
+                    const T_Acc& acc,
+                    const DataSpace<simDim>& blockCell,
+                    const T_WorkerCfg& workerCfg)
+                {
+                    /* create the shared memory caches for the density and "temperature" (energy density) fields */
+                    cachedRho = CachedBox::create<0, ValueType_Rho>(acc, BlockArea());
+                    cachedEne = CachedBox::create<1, ValueType_Ene>(acc, BlockArea());
+
+                    /* instance of nvidia assignment operator */
+                    nvidia::functors::Assign assign;
+                    /* copy density field from global to shared memory */
+                    auto fieldRhoBlock = rhoBox.shift(blockCell);
+                    ThreadCollective<BlockArea, T_WorkerCfg::numWorkers> collective(workerCfg.getWorkerIdx());
+                    collective(acc, assign, cachedRho, fieldRhoBlock);
+                    /* copy energy density field from global to shared memory */
+                    auto fieldEneBlock = eneBox.shift(blockCell);
+                    collective(acc, assign, cachedEne, fieldEneBlock);
+
+                    /* wait for shared memory to be initialized */
+                    cupla::__syncthreads(acc);
+                }
+
+                /** Initialization function on device
                  *
-                 *  @todo Include all electron species with a meta::ForEach<VectorallSpecies,...>
-                 * instead of just the destination species
+                 * Cache density and energy density fields on device and initialize
+                 * possible prerequisites for ionization, like e.g. random number
+                 * generator.
+                 *
+                 * This function will be called inline on the device which must happen BEFORE threads diverge
+                 * during loop execution. The reason for this is the `cupla::__syncthreads( acc )` call which is
+                 * necessary after initializing the field shared boxes in shared memory.
+                 *
+                 * @param blockCell Offset of the cell from the origin of the local domain
+                 *                  *including guarding supercells* in units of cells
+                 * @param linearThreadIdx Linearized thread ID inside the block
+                 * @param localCellOffset Offset of the cell from the origin of the local
+                 *                        domain, i.e. from the @see BORDER
+                 *                        *without guarding supercells*
                  */
-                using EnergyDensitySolver = typename particleToGrid::CreateFieldTmpOperation_t<
-                    DestSpecies,
-                    particleToGrid::derivedAttributes::EnergyDensityCutoff< CutoffMaxEnergy >
-                >::Solver;
-                eneKinDens->template computeValue< CORE + BORDER, EnergyDensitySolver >(*destSpecies, currentStep);
-                dc.releaseData( DestSpecies::FrameType::getName() );
-                EventTask eneKinEvent = eneKinDens->asyncCommunication( __getTransactionEvent() );
-                eneKinEvent += eneKinDens->asyncCommunicationGather( eneKinEvent );
-
-                /* contributions from neighboring GPUs to our border area */
-                __setTransactionEvent( densityEvent + eneKinEvent );
-
-                /* initialize device-side density- and energy density field databox pointers */
-                rhoBox = density->getDeviceDataBox();
-                eneBox = eneKinDens->getDeviceDataBox();
-
-            }
-
-            /** cache fields used by this functor
-             *
-             * @warning this is a collective method and calls synchronize
-             *
-             * @tparam T_Acc alpaka accelerator type
-             * @tparam T_WorkerCfg pmacc::mappings::threads::WorkerCfg, configuration of the worker
-             *
-             * @param acc alpaka accelerator
-             * @param blockCell relative offset (in cells) to the local domain plus the guarding cells
-             * @param workerCfg configuration of the worker
-             */
-            template<
-                typename T_Acc ,
-                typename T_WorkerCfg
-            >
-            DINLINE void collectiveInit(
-                const T_Acc & acc,
-                const DataSpace<simDim>& blockCell,
-                const T_WorkerCfg & workerCfg
-            )
-            {
-                /* caching of density and "temperature" fields */
-                cachedRho = CachedBox::create<
-                    0,
-                    ValueType_Rho
-                >(
-                    acc,
-                    BlockArea()
-                );
-                cachedEne = CachedBox::create<
-                    1,
-                    ValueType_Ene
-                >(
-                    acc,
-                    BlockArea()
-                );
-
-                /* instance of nvidia assignment operator */
-                nvidia::functors::Assign assign;
-                /* copy fields from global to shared */
-                auto fieldRhoBlock = rhoBox.shift(blockCell);
-                ThreadCollective<
-                    BlockArea,
-                    T_WorkerCfg::numWorkers
-                > collective( workerCfg.getWorkerIdx( ) );
-                collective(
-                          acc,
-                          assign,
-                          cachedRho,
-                          fieldRhoBlock
-                          );
-                /* copy fields from global to shared */
-                auto fieldEneBlock = eneBox.shift(blockCell);
-                collective(
-                          acc,
-                          assign,
-                          cachedEne,
-                          fieldEneBlock
-                          );
-
-                /* wait for shared memory to be initialized */
-                __syncthreads();
-            }
-
-            /** Initialization function on device
-             *
-             * Cache density and energy density fields on device and initialize
-             * possible prerequisites for ionization, like e.g. random number
-             * generator.
-             *
-             * This function will be called inline on the device which must happen BEFORE threads diverge
-             * during loop execution. The reason for this is the `__syncthreads()` call which is necessary after
-             * initializing the field shared boxes in shared memory.
-             *
-             * @param blockCell Offset of the cell from the origin of the local domain
-             *                  *including guarding supercells* in units of cells
-             * @param linearThreadIdx Linearized thread ID inside the block
-             * @param localCellOffset Offset of the cell from the origin of the local
-             *                        domain, i.e. from the @see BORDER
-             *                        *without guarding supercells*
-             */
-            template< typename T_Acc >
-            DINLINE void init(
-                T_Acc const & acc,
-                const DataSpace<simDim>& blockCell,
-                const int& linearThreadIdx,
-                const DataSpace<simDim>& localCellOffset
-            )
-            {
-                /* initialize random number generator with the local cell index in the simulation */
-                this->randomGen.init(localCellOffset);
-            }
-
-            /** Determine number of new macro electrons due to ionization
-             *
-             * @param ionFrame reference to frame of the to-be-ionized particles
-             * @param localIdx local (linear) index in super cell / frame
-             */
-            template< typename T_Acc >
-            DINLINE uint32_t numNewParticles(T_Acc const & acc,  FrameType& ionFrame, int localIdx)
-            {
-                /* alias for the single macro-particle */
-                auto particle = ionFrame[localIdx];
-                /* particle position, used for field-to-particle interpolation */
-                floatD_X const pos = particle[position_];
-                int const particleCellIdx = particle[localCellIdx_];
-                /* multi-dim coordinate of the local cell inside the super cell */
-                DataSpace<SuperCellSize::dim> localCell(DataSpaceOperations<SuperCellSize::dim>::template map<SuperCellSize > (particleCellIdx));
-                /* interpolation of density */
-                const picongpu::traits::FieldPosition<fields::CellType, FieldTmp> fieldPosRho;
-                ValueType_Rho densityV = Field2ParticleInterpolation()
-                    (cachedRho.shift(localCell).toCursor(), pos, fieldPosRho());
-                /*                          and energy density field on the particle position */
-                const picongpu::traits::FieldPosition<fields::CellType, FieldTmp> fieldPosEne;
-                ValueType_Ene kinEnergyV = Field2ParticleInterpolation()
-                    (cachedEne.shift(localCell).toCursor(), pos, fieldPosEne());
-
-                /* density in sim units */
-                float_X const density = densityV[0];
-                /* energy density in sim units */
-                float_X const kinEnergyDensity = kinEnergyV[0];
-
-                /* Returns the new number of bound electrons for an integer number of macro electrons */
-                IonizationAlgorithm ionizeAlgo;
-                uint32_t newMacroElectrons = ionizeAlgo(
-                    kinEnergyDensity,
-                    density,
-                    particle,
-                    this->randomGen(acc)
-                );
-
-
-                return newMacroElectrons;
-
-            }
-
-            /* Functor implementation
-             *
-             * Ionization model specific particle creation
-             *
-             * \tparam T_parentIon type of ion species that is being ionized
-             * \tparam T_childElectron type of electron species that is created
-             * \param parentIon ion instance that is ionized
-             * \param childElectron electron instance that is created
-             */
-            template<typename T_parentIon, typename T_childElectron, typename T_Acc>
-            DINLINE void operator()(T_Acc const & acc, T_parentIon& parentIon,T_childElectron& childElectron)
-            {
-                /* for not mixing operations::assign up with the nvidia functor assign */
-                namespace partOp = pmacc::particles::operations;
-                /* each thread sets the multiMask hard on "particle" (=1) */
-                childElectron[multiMask_] = 1u;
-                const float_X weighting = parentIon[weighting_];
-
-                /* each thread initializes a clone of the parent ion but leaving out
-                 * some attributes:
-                 * - multiMask: reading from global memory takes longer than just setting it again explicitly
-                 * - momentum: because the electron would get a higher energy because of the ion mass
-                 * - boundElectrons: because species other than ions or atoms do not have them
-                 * (gets AUTOMATICALLY deselected because electrons do not have this attribute)
+                template<typename T_Acc>
+                DINLINE void init(
+                    T_Acc const& acc,
+                    const DataSpace<simDim>& blockCell,
+                    const int& linearThreadIdx,
+                    const DataSpace<simDim>& localCellOffset)
+                {
+                    /* init the random number generator with the local cell offset; other arguments are unused */
+                    this->randomGen.init(localCellOffset);
+                }
+
+                /** Determine number of new macro electrons due to ionization
+                 *
+                 * @param ionFrame reference to frame of the to-be-ionized particles
+                 * @param localIdx local (linear) index in super cell / frame
                  */
-                auto targetElectronClone = partOp::deselect<bmpl::vector2<multiMask, momentum> >(childElectron);
-
-                partOp::assign(targetElectronClone, partOp::deselect<particleId>(parentIon));
-
-                const float_X massIon = attribute::getMass(weighting,parentIon);
-                const float_X massElectron = attribute::getMass(weighting,childElectron);
-
-                const float3_X electronMomentum (parentIon[momentum_]*(massElectron/massIon));
-
-                childElectron[momentum_] = electronMomentum;
-
-                /* conservation of momentum
-                 * \todo add conservation of mass */
-                parentIon[momentum_] -= electronMomentum;
-
-                /** ionization of the ion by reducing the number of bound electrons
+                template<typename T_Acc>
+                DINLINE uint32_t numNewParticles(T_Acc const& acc, FrameType& ionFrame, int localIdx)
+                {
+                    /* alias for the single macro-particle */
+                    auto particle = ionFrame[localIdx];
+                    /* particle position, used for field-to-particle interpolation */
+                    floatD_X const pos = particle[position_];
+                    int const particleCellIdx = particle[localCellIdx_];
+                    /* multi-dim coordinate of the local cell inside the super cell */
+                    DataSpace<SuperCellSize::dim> localCell(
+                        DataSpaceOperations<SuperCellSize::dim>::template map<SuperCellSize>(particleCellIdx));
+                    /* interpolation of the cached density field on the particle position */
+                    const picongpu::traits::FieldPosition<fields::CellType, FieldTmp> fieldPosRho;
+                    ValueType_Rho densityV
+                        = Field2ParticleInterpolation()(cachedRho.shift(localCell).toCursor(), pos, fieldPosRho());
+                    /* interpolation of the cached energy density field on the particle position */
+                    const picongpu::traits::FieldPosition<fields::CellType, FieldTmp> fieldPosEne;
+                    ValueType_Ene kinEnergyV
+                        = Field2ParticleInterpolation()(cachedEne.shift(localCell).toCursor(), pos, fieldPosEne());
+
+                    /* density in sim units */
+                    float_X const density = densityV[0];
+                    /* energy density in sim units */
+                    float_X const kinEnergyDensity = kinEnergyV[0];
+
+                    /* the algorithm returns the number of new macro electrons to create for this particle */
+                    IonizationAlgorithm ionizeAlgo;
+                    uint32_t newMacroElectrons = ionizeAlgo(kinEnergyDensity, density, particle, this->randomGen(acc));
+
+
+                    return newMacroElectrons;
+                }
+
+                /** Functor implementation
                  *
-                 * @warning substracting a float from a float can potentially
-                 *          create a negative boundElectrons number for the ion,
-                 *          see #1850 for details
+                 * Ionization model specific particle creation
+                 *
+                 * \tparam T_parentIon type of ion species that is being ionized
+                 * \tparam T_childElectron type of electron species that is created
+                 * \param parentIon ion instance that is ionized
+                 * \param childElectron electron instance that is created
                  */
-                parentIon[boundElectrons_] -= float_X(1.);
-            }
-
-    };
-
-} // namespace ionization
-} // namespace particles
+                template<typename T_parentIon, typename T_childElectron, typename T_Acc>
+                DINLINE void operator()(T_Acc const& acc, T_parentIon& parentIon, T_childElectron& childElectron)
+                {
+                    /* for not mixing operations::assign up with the nvidia functor assign */
+                    namespace partOp = pmacc::particles::operations;
+                    /* each thread sets the multiMask hard on "particle" (=1) */
+                    childElectron[multiMask_] = 1u;
+                    const float_X weighting = parentIon[weighting_];
+
+                    /* each thread initializes a clone of the parent ion but leaving out
+                     * some attributes:
+                     * - multiMask: reading from global memory takes longer than just setting it again explicitly
+                     * - momentum: because the electron would get a higher energy because of the ion mass
+                     * - boundElectrons: because species other than ions or atoms do not have them
+                     * (gets AUTOMATICALLY deselected because electrons do not have this attribute)
+                     */
+                    auto targetElectronClone = partOp::deselect<bmpl::vector2<multiMask, momentum>>(childElectron);
+
+                    partOp::assign(targetElectronClone, partOp::deselect<particleId>(parentIon));
+
+                    const float_X massIon = attribute::getMass(weighting, parentIon);
+                    const float_X massElectron = attribute::getMass(weighting, childElectron);
+
+                    const float3_X electronMomentum(parentIon[momentum_] * (massElectron / massIon));
+
+                    childElectron[momentum_] = electronMomentum;
+
+                    /* conservation of momentum
+                     * \todo add conservation of mass */
+                    parentIon[momentum_] -= electronMomentum;
+
+                    /** ionization of the ion by reducing the number of bound electrons
+                     *
+                     * @warning subtracting a float from a float can potentially
+                     *          create a negative boundElectrons number for the ion,
+                     *          see #1850 for details
+                     */
+                    parentIon[boundElectrons_] -= float_X(1.);
+                }
+            };
+
+        } // namespace ionization
+    } // namespace particles
 } // namespace picongpu
diff --git a/include/picongpu/particles/ionization/byCollision/collisionalIonizationCalc.def b/include/picongpu/particles/ionization/byCollision/collisionalIonizationCalc.def
index e8773886c0..16c4bb0c94 100644
--- a/include/picongpu/particles/ionization/byCollision/collisionalIonizationCalc.def
+++ b/include/picongpu/particles/ionization/byCollision/collisionalIonizationCalc.def
@@ -1,4 +1,4 @@
-/* Copyright 2015-2020 Marco Garten
+/* Copyright 2015-2021 Marco Garten
  *
  * This file is part of PIConGPU.
  *
@@ -28,17 +28,14 @@
 
 namespace picongpu
 {
+    namespace particles
+    {
+        namespace ionization
+        {
+            struct AlgorithmThomasFermi;
 
-namespace particles
-{
-
-namespace ionization
-{
-
-    struct AlgorithmThomasFermi;
-
-} // namespace ionization
+        } // namespace ionization
 
-} // namespace particles
+    } // namespace particles
 
 } // namespace picongpu
diff --git a/include/picongpu/particles/ionization/byCollision/collisionalIonizationCalc.hpp b/include/picongpu/particles/ionization/byCollision/collisionalIonizationCalc.hpp
index 4608ddb5d0..5b819262f6 100644
--- a/include/picongpu/particles/ionization/byCollision/collisionalIonizationCalc.hpp
+++ b/include/picongpu/particles/ionization/byCollision/collisionalIonizationCalc.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2015-2020 Marco Garten
+/* Copyright 2015-2021 Marco Garten
  *
  * This file is part of PIConGPU.
  *
diff --git a/include/picongpu/particles/ionization/byCollision/ionizers.def b/include/picongpu/particles/ionization/byCollision/ionizers.def
index 165b934ecc..ec6dcf3d92 100644
--- a/include/picongpu/particles/ionization/byCollision/ionizers.def
+++ b/include/picongpu/particles/ionization/byCollision/ionizers.def
@@ -1,4 +1,4 @@
-/* Copyright 2015-2020 Marco Garten
+/* Copyright 2015-2021 Marco Garten
  *
  * This file is part of PIConGPU.
  *
diff --git a/include/picongpu/particles/ionization/byCollision/ionizers.hpp b/include/picongpu/particles/ionization/byCollision/ionizers.hpp
index e40eec175e..8153ba4bec 100644
--- a/include/picongpu/particles/ionization/byCollision/ionizers.hpp
+++ b/include/picongpu/particles/ionization/byCollision/ionizers.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2015-2020 Marco Garten
+/* Copyright 2015-2021 Marco Garten
  *
  * This file is part of PIConGPU.
  *
diff --git a/include/picongpu/particles/ionization/byField/ADK/ADK.def b/include/picongpu/particles/ionization/byField/ADK/ADK.def
index 27d4922bec..535c507af9 100644
--- a/include/picongpu/particles/ionization/byField/ADK/ADK.def
+++ b/include/picongpu/particles/ionization/byField/ADK/ADK.def
@@ -1,4 +1,4 @@
-/* Copyright 2015-2020 Marco Garten
+/* Copyright 2015-2021 Marco Garten, Jakob Trojok
  *
  * This file is part of PIConGPU.
  *
@@ -19,78 +19,83 @@
 
 #pragma once
 
+#include "picongpu/particles/ionization/byField/IonizationCurrent/IonizationCurrent.def"
 #include <pmacc/types.hpp>
 
 namespace picongpu
 {
-namespace particles
-{
-namespace ionization
-{
-
-    /** Ammosov-Delone-Krainov tunneling model
-     *
-     * \tparam T_DestSpecies electron species to be created
-     * \tparam T_SrcSpecies ion species to be ionized
-     *         default is boost::mpl placeholder because specialization
-     *         cannot be known in list of particle species' flags
-     *         \see speciesDefinition.param
-     */
-    template<typename T_IonizationAlgorithm, typename T_DestSpecies, typename T_SrcSpecies = bmpl::_1>
-    struct ADK_Impl;
-
-    /** Ammosov-Delone-Krainov tunneling model - linear laser polarization
-     *
-     * - takes the ionization energies of the various charge states of ions
-     * - calculates the ionization rates and then the ionization probabilities from them
-     * - ATTENTION: this approach is not very applicable for rapidly changing high intensity laser fields
-     * - this is a Monte Carlo method: if a random number is smaller
-     *   or equal than the ionization probability -> increase the charge state
-     * - see for example: Delone, N. B.; Krainov, V. P. (1998).
-     *   "Tunneling and barrier-suppression ionization of atoms and ions in a laser radiation field"
-     *   doi:10.1070/PU1998v041n05ABEH000393
-     *
-     * wrapper class,
-     * needed because the SrcSpecies cannot be known during the
-     * first specialization of the ionization model in the particle definition
-     * \see speciesDefinition.param
-     */
-    template<typename T_DestSpecies>
-    struct ADKLinPol
+    namespace particles
     {
-        /* Boolean value that results in an additional polarization factor in
-         * the ionization rate for linear polarization */
-        static constexpr bool linPol = true;
-        using IonizationAlgorithm = particles::ionization::AlgorithmADK< linPol >;
-        using type = ADK_Impl< IonizationAlgorithm, T_DestSpecies >;
-    };
+        namespace ionization
+        {
+            /** Ammosov-Delone-Krainov tunneling model
+             *
+             * \tparam T_IonizationAlgorithm ionization algorithm (calculation), e.g. AlgorithmADK<linPol>
+             * \tparam T_DestSpecies electron species to be created
+             * \tparam T_IonizationCurrent select type of ionization current (None or EnergyConservation)
+             * \tparam T_SrcSpecies ion species to be ionized; default is boost::mpl placeholder because
+             *         specialization cannot be known in list of particle species' flags
+             *         \see speciesDefinition.param
+             */
+            template<
+                typename T_IonizationAlgorithm,
+                typename T_DestSpecies,
+                typename T_IonizationCurrent,
+                typename T_SrcSpecies = bmpl::_1>
+            struct ADK_Impl;
 
-    /** Ammosov-Delone-Krainov tunneling model - circular laser polarization
-     *
-     * - takes the ionization energies of the various charge states of ions
-     * - calculates the ionization rates and then the ionization probabilities from them
-     * - ATTENTION: this approach is not very applicable for rapidly changing high intensity laser fields
-     * - this is a Monte Carlo method: if a random number is smaller
-     *   or equal than the ionization probability -> increase the charge state
-     * - see for example: Delone, N. B.; Krainov, V. P. (1998).
-     *   "Tunneling and barrier-suppression ionization of atoms and ions in a laser radiation field"
-     *   doi:10.1070/PU1998v041n05ABEH000393
-     *
-     * wrapper class,
-     * needed because the SrcSpecies cannot be known during the
-     * first specialization of the ionization model in the particle definition
-     * \see speciesDefinition.param
-     */
-    template<typename T_DestSpecies>
-    struct ADKCircPol
-    {
-        /* Boolean value that results in an additional polarization factor in
-         * the ionization rate for linear polarization */
-        static constexpr bool linPol = false;
-        using IonizationAlgorithm = particles::ionization::AlgorithmADK< linPol >;
-        using type = ADK_Impl< IonizationAlgorithm, T_DestSpecies >;
-    };
+            /** Ammosov-Delone-Krainov tunneling model - linear laser polarization
+             *
+             * - takes the ionization energies of the various charge states of ions
+             * - calculates the ionization rates and then the ionization probabilities from them
+             * - ATTENTION: this approach is not very applicable for rapidly changing high intensity laser fields
+             * - this is a Monte Carlo method: if a random number is smaller than
+             *   or equal to the ionization probability -> increase the charge state
+             * - see for example: Delone, N. B.; Krainov, V. P. (1998).
+             *   "Tunneling and barrier-suppression ionization of atoms and ions in a laser radiation field"
+             *   doi:10.1070/PU1998v041n05ABEH000393
+             *
+             * wrapper class, needed because the SrcSpecies cannot be known during the first specialization
+             * of the ionization model in the particle definition, \see speciesDefinition.param
+             * \tparam T_DestSpecies electron species to be created
+             * \tparam T_IonizationCurrent select type of ionization current (None or EnergyConservation)
+             */
+            template<typename T_DestSpecies, typename T_IonizationCurrent = current::None>
+            struct ADKLinPol
+            {
+                /* Boolean value that results in an additional polarization factor in
+                 * the ionization rate for linear polarization */
+                static constexpr bool linPol = true;
+                using IonizationAlgorithm = particles::ionization::AlgorithmADK<linPol>;
+                using type = ADK_Impl<IonizationAlgorithm, T_DestSpecies, T_IonizationCurrent>;
+            };
+
+            /** Ammosov-Delone-Krainov tunneling model - circular laser polarization
+             *
+             * - takes the ionization energies of the various charge states of ions
+             * - calculates the ionization rates and then the ionization probabilities from them
+             * - ATTENTION: this approach is not very applicable for rapidly changing high intensity laser fields
+             * - this is a Monte Carlo method: if a random number is smaller than
+             *   or equal to the ionization probability -> increase the charge state
+             * - see for example: Delone, N. B.; Krainov, V. P. (1998).
+             *   "Tunneling and barrier-suppression ionization of atoms and ions in a laser radiation field"
+             *   doi:10.1070/PU1998v041n05ABEH000393
+             *
+             * wrapper class, needed because the SrcSpecies cannot be known during the first specialization
+             * of the ionization model in the particle definition, \see speciesDefinition.param
+             * \tparam T_DestSpecies electron species to be created
+             * \tparam T_IonizationCurrent select type of ionization current (None or EnergyConservation)
+             */
+            template<typename T_DestSpecies, typename T_IonizationCurrent = current::None>
+            struct ADKCircPol
+            {
+                /* Boolean value that results in an additional polarization factor in
+                 * the ionization rate for linear polarization; false for this circular polarization wrapper */
+                static constexpr bool linPol = false;
+                using IonizationAlgorithm = particles::ionization::AlgorithmADK<linPol>;
+                using type = ADK_Impl<IonizationAlgorithm, T_DestSpecies, T_IonizationCurrent>;
+            };
 
-} // namespace ionization
-} // namespace particles
+        } // namespace ionization
+    } // namespace particles
 } // namespace picongpu
diff --git a/include/picongpu/particles/ionization/byField/ADK/ADK_Impl.hpp b/include/picongpu/particles/ionization/byField/ADK/ADK_Impl.hpp
index 1fa1a13650..a6c8563f91 100644
--- a/include/picongpu/particles/ionization/byField/ADK/ADK_Impl.hpp
+++ b/include/picongpu/particles/ionization/byField/ADK/ADK_Impl.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2015-2020 Marco Garten
+/* Copyright 2015-2021 Marco Garten, Jakob Trojok
  *
  * This file is part of PIConGPU.
  *
@@ -22,7 +22,6 @@
 #include "picongpu/simulation_defines.hpp"
 #include <pmacc/traits/Resolve.hpp>
 #include <pmacc/particles/meta/FindByNameOrType.hpp>
-#include "picongpu/traits/UsesRNG.hpp"
 
 #include "picongpu/fields/CellType.hpp"
 #include "picongpu/fields/FieldB.hpp"
@@ -30,6 +29,8 @@
 #include "picongpu/traits/FieldPosition.hpp"
 #include "picongpu/particles/ionization/byField/ADK/ADK.def"
 #include "picongpu/particles/ionization/byField/ADK/AlgorithmADK.hpp"
+#include "picongpu/particles/ionization/byField/IonizationCurrent/JIonizationCalc.hpp"
+#include "picongpu/particles/ionization/byField/IonizationCurrent/JIonizationAssignment.hpp"
 
 #include <pmacc/random/methods/methods.hpp>
 #include <pmacc/random/distributions/Uniform.hpp>
@@ -45,277 +46,231 @@
 
 namespace picongpu
 {
-namespace traits
-{
-    /** specialization of the UsesRNG trait
-     * --> ionization module uses random number generation
-     */
-    template<typename T_IonizationAlgorithm, typename T_DestSpecies, typename T_SrcSpecies>
-    struct UsesRNG<particles::ionization::ADK_Impl<T_IonizationAlgorithm, T_DestSpecies, T_SrcSpecies> > :
-    public boost::true_type
-    {
-    };
-} // namespace traits
-
-namespace particles
-{
-namespace ionization
-{
-
-    /** \struct ADK_Impl
-     *
-     * \brief Ammosov-Delone-Krainov
-     *        Tunneling ionization for hydrogenlike atoms
-     *
-     * \tparam T_DestSpecies type or name as boost::mpl::string of the electron species to be created
-     * \tparam T_SrcSpecies type or name as boost::mpl::string of the particle species that is ionized
-     */
-    template<typename T_IonizationAlgorithm, typename T_DestSpecies, typename T_SrcSpecies>
-    struct ADK_Impl
+    namespace particles
     {
-
-        using DestSpecies = pmacc::particles::meta::FindByNameOrType_t<
-            VectorAllSpecies,
-            T_DestSpecies
-        >;
-        using SrcSpecies = pmacc::particles::meta::FindByNameOrType_t<
-            VectorAllSpecies,
-            T_SrcSpecies
-        >;
-
-        using FrameType = typename SrcSpecies::FrameType;
-
-        /* specify field to particle interpolation scheme */
-        using Field2ParticleInterpolation = typename pmacc::traits::Resolve<
-            typename GetFlagType<FrameType,interpolation<> >::type
-        >::type;
-
-        /* margins around the supercell for the interpolation of the field on the cells */
-        using LowerMargin = typename GetMargin<Field2ParticleInterpolation>::LowerMargin;
-        using UpperMargin = typename GetMargin<Field2ParticleInterpolation>::UpperMargin;
-
-        /* relevant area of a block */
-        using BlockArea = SuperCellDescription<
-            typename MappingDesc::SuperCellSize,
-            LowerMargin,
-            UpperMargin
-        >;
-
-        BlockArea BlockDescription;
-
-        private:
-
-            /* define ionization ALGORITHM (calculation) for ionization MODEL */
-            using IonizationAlgorithm = T_IonizationAlgorithm;
-
-            /* random number generator */
-            using RNGFactory = pmacc::random::RNGProvider<simDim, random::Generator>;
-            using Distribution = pmacc::random::distributions::Uniform<float_X>;
-            using RandomGen = typename RNGFactory::GetRandomType<Distribution>::type;
-            RandomGen randomGen;
-
-            using TVec = MappingDesc::SuperCellSize;
-
-            using ValueType_E = FieldE::ValueType;
-            using ValueType_B = FieldB::ValueType;
-            /* global memory EM-field device databoxes */
-            PMACC_ALIGN(eBox, FieldE::DataBoxType);
-            PMACC_ALIGN(bBox, FieldB::DataBoxType);
-            /* shared memory EM-field device databoxes */
-            PMACC_ALIGN(cachedE, DataBox<SharedBox<ValueType_E, typename BlockArea::FullSuperCellSize,1> >);
-            PMACC_ALIGN(cachedB, DataBox<SharedBox<ValueType_B, typename BlockArea::FullSuperCellSize,0> >);
-
-        public:
-            /* host constructor initializing member : random number generator */
-            ADK_Impl(const uint32_t currentStep) : randomGen(RNGFactory::createRandom<Distribution>())
-            {
-                DataConnector &dc = Environment<>::get().DataConnector();
-                /* initialize pointers on host-side E-(B-)field databoxes */
-                auto fieldE = dc.get< FieldE >( FieldE::getName(), true );
-                auto fieldB = dc.get< FieldB >( FieldB::getName(), true );
-                /* initialize device-side E-(B-)field databoxes */
-                eBox = fieldE->getDeviceDataBox();
-                bBox = fieldB->getDeviceDataBox();
-
-            }
-
-            /** cache fields used by this functor
+        namespace ionization
+        {
+            /** \struct ADK_Impl
              *
-             * @warning this is a collective method and calls synchronize
+             * \brief Ammosov-Delone-Krainov
+             *        Tunneling ionization for hydrogenlike atoms
              *
-             * @tparam T_Acc alpaka accelerator type
-             * @tparam T_WorkerCfg pmacc::mappings::threads::WorkerCfg, configuration of the worker
-             *
-             * @param acc alpaka accelerator
-             * @param blockCell relative offset (in cells) to the local domain plus the guarding cells
-             * @param workerCfg configuration of the worker
+             * \tparam T_DestSpecies type or name as boost::mpl::string of the electron species to be created
+             * \tparam T_IonizationCurrent select type of ionization current (None or EnergyConservation)
+             * \tparam T_SrcSpecies type or name as boost::mpl::string of the particle species that is ionized
              */
             template<
-                typename T_Acc ,
-                typename T_WorkerCfg
-            >
-            DINLINE void collectiveInit(
-                const T_Acc & acc,
-                const DataSpace<simDim>& blockCell,
-                const T_WorkerCfg & workerCfg
-            )
-            {
-                /* caching of E and B fields */
-                cachedB = CachedBox::create<
-                    0,
-                    ValueType_B
-                >(
-                    acc,
-                    BlockArea()
-                );
-                cachedE = CachedBox::create<
-                    1,
-                    ValueType_E
-                >(
-                    acc,
-                    BlockArea()
-                );
-
-                /* instance of nvidia assignment operator */
-                nvidia::functors::Assign assign;
-                /* copy fields from global to shared */
-                auto fieldBBlock = bBox.shift(blockCell);
-                ThreadCollective<
-                    BlockArea,
-                    T_WorkerCfg::numWorkers
-                > collective( workerCfg.getWorkerIdx( ) );
-                collective(
-                          acc,
-                          assign,
-                          cachedB,
-                          fieldBBlock
-                          );
-                /* copy fields from global to shared */
-                auto fieldEBlock = eBox.shift(blockCell);
-                collective(
-                          acc,
-                          assign,
-                          cachedE,
-                          fieldEBlock
-                          );
-
-                /* wait for shared memory to be initialized */
-                __syncthreads();
-            }
-
-            /** Initialization function on device
-             *
-             * \brief Cache EM-fields on device
-             *         and initialize possible prerequisites for ionization, like e.g. random number generator.
-             *
-             * This function will be called inline on the device which must happen BEFORE threads diverge
-             * during loop execution. The reason for this is the `__syncthreads()` call which is necessary after
-             * initializing the E-/B-field shared boxes in shared memory.
-             */
-            template< typename T_Acc >
-            DINLINE void init(
-                T_Acc const & acc,
-                const DataSpace<simDim>& blockCell,
-                const int& linearThreadIdx,
-                const DataSpace<simDim>& localCellOffset
-            )
-            {
-                /* initialize random number generator with the local cell index in the simulation */
-                this->randomGen.init(localCellOffset);
-            }
-
-            /** Determine number of new macro electrons due to ionization
-             *
-             * \param ionFrame reference to frame of the to-be-ionized particles
-             * \param localIdx local (linear) index in super cell / frame
-             */
-            template< typename T_Acc >
-            DINLINE uint32_t numNewParticles(const T_Acc& acc, FrameType& ionFrame, int localIdx)
-            {
-                /* alias for the single macro-particle */
-                auto particle = ionFrame[localIdx];
-                /* particle position, used for field-to-particle interpolation */
-                floatD_X pos = particle[position_];
-                const int particleCellIdx = particle[localCellIdx_];
-                /* multi-dim coordinate of the local cell inside the super cell */
-                DataSpace<TVec::dim> localCell(DataSpaceOperations<TVec::dim>::template map<TVec > (particleCellIdx));
-                /* interpolation of E- */
-                const picongpu::traits::FieldPosition<
-                    fields::CellType,
-                    FieldE
-                > fieldPosE;
-                ValueType_E eField = Field2ParticleInterpolation()
-                    (cachedE.shift(localCell).toCursor(), pos, fieldPosE());
-                /*                     and B-field on the particle position */
-                const picongpu::traits::FieldPosition<fields::CellType, FieldB> fieldPosB;
-                ValueType_B bField = Field2ParticleInterpolation()
-                    (cachedB.shift(localCell).toCursor(), pos, fieldPosB());
-
-                /* define number of bound macro electrons before ionization */
-                float_X prevBoundElectrons = particle[boundElectrons_];
-
-                IonizationAlgorithm ionizeAlgo;
-                /* determine number of new macro electrons to be created */
-                uint32_t newMacroElectrons = ionizeAlgo(
-                                                bField, eField,
-                                                particle, this->randomGen(acc)
-                                                );
-
-                return newMacroElectrons;
-
-            }
-
-            /* Functor implementation
-             *
-             * Ionization model specific particle creation
-             *
-             * \tparam T_parentIon type of ion species that is being ionized
-             * \tparam T_childElectron type of electron species that is created
-             * \param parentIon ion instance that is ionized
-             * \param childElectron electron instance that is created
-             */
-            template<typename T_parentIon, typename T_childElectron, typename T_Acc>
-            DINLINE void operator()(const T_Acc& acc, T_parentIon& parentIon,T_childElectron& childElectron)
+                typename T_IonizationAlgorithm,
+                typename T_DestSpecies,
+                typename T_IonizationCurrent,
+                typename T_SrcSpecies>
+            struct ADK_Impl
             {
-                /* for not mixing operations::assign up with the nvidia functor assign */
-                namespace partOp = pmacc::particles::operations;
-                /* each thread sets the multiMask hard on "particle" (=1) */
-                childElectron[multiMask_] = 1u;
-                const float_X weighting = parentIon[weighting_];
-
-                /* each thread initializes a clone of the parent ion but leaving out
-                 * some attributes:
-                 * - multiMask: reading from global memory takes longer than just setting it again explicitly
-                 * - momentum: because the electron would get a higher energy because of the ion mass
-                 * - boundElectrons: because species other than ions or atoms do not have them
-                 * (gets AUTOMATICALLY deselected because electrons do not have this attribute)
+                using DestSpecies = pmacc::particles::meta::FindByNameOrType_t<VectorAllSpecies, T_DestSpecies>;
+                using SrcSpecies = pmacc::particles::meta::FindByNameOrType_t<VectorAllSpecies, T_SrcSpecies>;
+
+                using FrameType = typename SrcSpecies::FrameType;
+
+                /* specify field to particle interpolation scheme */
+                using Field2ParticleInterpolation =
+                    typename pmacc::traits::Resolve<typename GetFlagType<FrameType, interpolation<>>::type>::type;
+
+                /* margins around the supercell for the interpolation of the field on the cells */
+                using LowerMargin = typename GetMargin<Field2ParticleInterpolation>::LowerMargin;
+                using UpperMargin = typename GetMargin<Field2ParticleInterpolation>::UpperMargin;
+
+                /* relevant area of a block */
+                using BlockArea = SuperCellDescription<typename MappingDesc::SuperCellSize, LowerMargin, UpperMargin>;
+
+                BlockArea BlockDescription;
+
+            private:
+                /* define ionization ALGORITHM (calculation) for ionization MODEL */
+                using IonizationAlgorithm = T_IonizationAlgorithm;
+
+                /* random number generator */
+                using RNGFactory = pmacc::random::RNGProvider<simDim, random::Generator>;
+                using Distribution = pmacc::random::distributions::Uniform<float_X>;
+                using RandomGen = typename RNGFactory::GetRandomType<Distribution>::type;
+                RandomGen randomGen;
+
+                using TVec = MappingDesc::SuperCellSize;
+
+                using ValueType_E = FieldE::ValueType;
+                using ValueType_B = FieldB::ValueType;
+                /* global memory EM-field and current density device databoxes */
+                PMACC_ALIGN(eBox, FieldE::DataBoxType);
+                PMACC_ALIGN(bBox, FieldB::DataBoxType);
+                PMACC_ALIGN(jBox, FieldJ::DataBoxType);
+                /* shared memory EM-field device databoxes */
+                PMACC_ALIGN(cachedE, DataBox<SharedBox<ValueType_E, typename BlockArea::FullSuperCellSize, 1>>);
+                PMACC_ALIGN(cachedB, DataBox<SharedBox<ValueType_B, typename BlockArea::FullSuperCellSize, 0>>);
+
+            public:
+                /* host constructor: creates the random number generator and stores the field device databoxes */
+                ADK_Impl(const uint32_t currentStep) : randomGen(RNGFactory::createRandom<Distribution>())
+                {
+                    DataConnector& dc = Environment<>::get().DataConnector();
+                    /* look up the E-field, B-field and current density field (J) in the DataConnector */
+                    auto fieldE = dc.get<FieldE>(FieldE::getName(), true);
+                    auto fieldB = dc.get<FieldB>(FieldB::getName(), true);
+                    auto fieldJ = dc.get<FieldJ>(FieldJ::getName(), true);
+                    /* store the device databoxes of these fields; note that currentStep is not used here */
+                    eBox = fieldE->getDeviceDataBox();
+                    bBox = fieldB->getDeviceDataBox();
+                    jBox = fieldJ->getDeviceDataBox();
+                }
+
+                /** cache fields used by this functor
+                 *
+                 * @warning this is a collective method and calls synchronize
+                 *
+                 * @tparam T_Acc alpaka accelerator type
+                 * @tparam T_WorkerCfg pmacc::mappings::threads::WorkerCfg, configuration of the worker
+                 *
+                 * @param acc alpaka accelerator
+                 * @param blockCell relative offset (in cells) to the local domain plus the guarding cells
+                 * @param workerCfg configuration of the worker
                  */
-                auto targetElectronClone = partOp::deselect<bmpl::vector2<multiMask, momentum> >(childElectron);
-
-                partOp::assign(targetElectronClone, partOp::deselect<particleId>(parentIon));
-
-                const float_X massIon = attribute::getMass(weighting,parentIon);
-                const float_X massElectron = attribute::getMass(weighting,childElectron);
-
-                const float3_X electronMomentum (parentIon[momentum_]*(massElectron/massIon));
-
-                childElectron[momentum_] = electronMomentum;
-
-                /* conservation of momentum
-                 * \todo add conservation of mass */
-                parentIon[momentum_] -= electronMomentum;
-
-                /** ionization of the ion by reducing the number of bound electrons
+                template<typename T_Acc, typename T_WorkerCfg>
+                DINLINE void collectiveInit(
+                    const T_Acc& acc,
+                    const DataSpace<simDim>& blockCell,
+                    const T_WorkerCfg& workerCfg)
+                {
+                    /* shift origin of jBox by blockCell; numNewParticles() shifts it further by the local cell */
+                    jBox = jBox.shift(blockCell);
+
+                    /* caching of E and B fields: create the boxes declared as cachedB (0) and cachedE (1) */
+                    cachedB = CachedBox::create<0, ValueType_B>(acc, BlockArea());
+                    cachedE = CachedBox::create<1, ValueType_E>(acc, BlockArea());
+
+                    /* assignment functor handed to the collective copies below */
+                    nvidia::functors::Assign assign;
+                    /* copy the B-field of this block from the global databox into cachedB */
+                    auto fieldBBlock = bBox.shift(blockCell);
+                    ThreadCollective<BlockArea, T_WorkerCfg::numWorkers> collective(workerCfg.getWorkerIdx());
+                    collective(acc, assign, cachedB, fieldBBlock);
+                    /* copy the E-field of this block from the global databox into cachedE */
+                    auto fieldEBlock = eBox.shift(blockCell);
+                    collective(acc, assign, cachedE, fieldEBlock);
+
+                    /* wait for shared memory to be initialized before the cached fields are read */
+                    cupla::__syncthreads(acc);
+                }
+
+                /** Initialization function on device
+                 *
+                 * \brief Initialize prerequisites for ionization on device, here the random number generator.
+                 *         The EM-fields themselves are cached by collectiveInit().
                  *
-                 * @warning substracting a float from a float can potentially
-                 *          create a negative boundElectrons number for the ion,
-                 *          see #1850 for details
+                 * This function will be called inline on the device which must happen BEFORE threads diverge
+                 * during loop execution. The reason for this is the `cupla::__syncthreads( acc )` call which is
+                 * necessary after initializing the E-/B-field shared boxes in shared memory.
                  */
-                parentIon[boundElectrons_] -= float_X(1.);
-            }
-
-    };
-
-} // namespace ionization
-} // namespace particles
+                template<typename T_Acc>
+                DINLINE void init(
+                    T_Acc const& acc,
+                    const DataSpace<simDim>& blockCell,
+                    const int& linearThreadIdx,
+                    const DataSpace<simDim>& localCellOffset)
+                {
+                    /* initialize the random number generator with localCellOffset; the other arguments are unused */
+                    this->randomGen.init(localCellOffset);
+                }
+
+                /** Determine number of new macro electrons due to ionization
+                 *  and hand the result of the ionization algorithm to the IonizationCurrent functor
+                 *
+                 * \param acc alpaka accelerator
+                 * \param ionFrame reference to frame of the to-be-ionized particles
+                 * \param localIdx local (linear) index in super cell / frame
+                 * \return number of new macro electrons reported by the ionization algorithm
+                 */
+                template<typename T_Acc>
+                DINLINE uint32_t numNewParticles(const T_Acc& acc, FrameType& ionFrame, int localIdx)
+                {
+                    /* alias for the single macro-particle */
+                    auto particle = ionFrame[localIdx];
+                    /* particle position, used for field-to-particle interpolation */
+                    floatD_X pos = particle[position_];
+                    const int particleCellIdx = particle[localCellIdx_];
+                    /* multi-dim coordinate of the local cell inside the super cell */
+                    DataSpace<TVec::dim> localCell(
+                        DataSpaceOperations<TVec::dim>::template map<TVec>(particleCellIdx));
+                    /* interpolate the cached E-field to the particle position */
+                    const picongpu::traits::FieldPosition<fields::CellType, FieldE> fieldPosE;
+                    ValueType_E eField
+                        = Field2ParticleInterpolation()(cachedE.shift(localCell).toCursor(), pos, fieldPosE());
+                    /* interpolate the cached B-field to the particle position */
+                    const picongpu::traits::FieldPosition<fields::CellType, FieldB> fieldPosB;
+                    ValueType_B bField
+                        = Field2ParticleInterpolation()(cachedB.shift(localCell).toCursor(), pos, fieldPosB());
+
+                    IonizationAlgorithm ionizeAlgo;
+                    /* determine number of new macro electrons to be created and energy used for ionization */
+                    auto retValue = ionizeAlgo(bField, eField, particle, this->randomGen(acc));
+                    IonizationCurrent<T_Acc, T_DestSpecies, simDim, T_IonizationCurrent>{}(
+                        retValue,
+                        particle[weighting_],
+                        jBox.shift(localCell),
+                        eField,
+                        acc,
+                        pos);
+
+                    return retValue.newMacroElectrons;
+                }
+
+                /** Functor implementation: ionization model specific particle creation
+                 *
+                 * \tparam T_parentIon type of ion species that is being ionized
+                 * \tparam T_childElectron type of electron species that is created
+                 * \tparam T_Acc type of the acc argument
+                 * \param acc not referenced in the body of this functor
+                 * \param parentIon ion instance that is ionized (its momentum and boundElectrons are modified)
+                 * \param childElectron electron instance that is created
+                 */
+                template<typename T_parentIon, typename T_childElectron, typename T_Acc>
+                DINLINE void operator()(const T_Acc& acc, T_parentIon& parentIon, T_childElectron& childElectron)
+                {
+                    /* for not mixing operations::assign up with the nvidia functor assign */
+                    namespace partOp = pmacc::particles::operations;
+                    /* each thread sets the multiMask hard on "particle" (=1) */
+                    childElectron[multiMask_] = 1u;
+                    const float_X weighting = parentIon[weighting_];
+
+                    /* each thread initializes a clone of the parent ion but leaving out
+                     * some attributes:
+                     * - multiMask: reading from global memory takes longer than just setting it again explicitly
+                     * - momentum: because the electron would get a higher energy because of the ion mass
+                     * - boundElectrons: because species other than ions or atoms do not have them
+                     * (gets AUTOMATICALLY deselected because electrons do not have this attribute)
+                     */
+                    auto targetElectronClone = partOp::deselect<bmpl::vector2<multiMask, momentum>>(childElectron);
+
+                    partOp::assign(targetElectronClone, partOp::deselect<particleId>(parentIon));
+
+                    const float_X massIon = attribute::getMass(weighting, parentIon);
+                    const float_X massElectron = attribute::getMass(weighting, childElectron);
+
+                    const float3_X electronMomentum(parentIon[momentum_] * (massElectron / massIon));
+
+                    childElectron[momentum_] = electronMomentum;
+
+                    /* conservation of momentum
+                     * \todo add conservation of mass */
+                    parentIon[momentum_] -= electronMomentum;
+
+                    /** ionization of the ion by reducing the number of bound electrons
+                     *
+                     * @warning subtracting a float from a float can potentially
+                     *          create a negative boundElectrons number for the ion,
+                     *          see #1850 for details
+                     */
+                    parentIon[boundElectrons_] -= float_X(1.);
+                }
+            };
+
+        } // namespace ionization
+    } // namespace particles
 } // namespace picongpu
diff --git a/include/picongpu/particles/ionization/byField/ADK/AlgorithmADK.hpp b/include/picongpu/particles/ionization/byField/ADK/AlgorithmADK.hpp
index a6ed85592f..663e1357bf 100644
--- a/include/picongpu/particles/ionization/byField/ADK/AlgorithmADK.hpp
+++ b/include/picongpu/particles/ionization/byField/ADK/AlgorithmADK.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2015-2020 Marco Garten
+/* Copyright 2015-2021 Marco Garten, Jakob Trojok
  *
  * This file is part of PIConGPU.
  *
@@ -27,6 +27,7 @@
 #include <pmacc/algorithms/math/defines/pi.hpp>
 #include <pmacc/algorithms/math/floatMath/floatingPoint.tpp>
 #include "picongpu/particles/ionization/utilities.hpp"
+#include "picongpu/particles/ionization/byField/IonizationCurrent/IonizerReturn.hpp"
 
 /** \file AlgorithmADK.hpp
  *
@@ -36,110 +37,103 @@
  *   states by decreasing the number of bound electrons
  * - is called with the IONIZATION MODEL, specifically by setting the flag in
  *   speciesDefinition.param
-*/
+ */
 
 
 namespace picongpu
 {
-namespace particles
-{
-namespace ionization
-{
-
-    /** Calculation for the Ammosov-Delone-Krainov tunneling model
-     *
-     * for either linear or circular laser polarization
-     *
-     * \tparam T_linPol boolean value that is true for lin. pol. and false for circ. pol.
-     */
-    template<bool T_linPol>
-    struct AlgorithmADK
+    namespace particles
     {
-        /** Functor implementation
-         * \tparam EType type of electric field
-         * \tparam BType type of magnetic field
-         * \tparam ParticleType type of particle to be ionized
-         *
-         * \param bField magnetic field value at t=0
-         * \param eField electric field value at t=0
-         * \param parentIon particle instance to be ionized with position at t=0 and momentum at t=-1/2
-         * \param randNr random number, equally distributed in range [0.:1.0]
-         *
-         * \return number of new macro electrons to be created
-         */
-        template<typename EType, typename BType, typename ParticleType >
-        HDINLINE uint32_t
-        operator()( const BType bField, const EType eField, ParticleType& parentIon, float_X randNr )
+        namespace ionization
         {
-
-            float_X const protonNumber = GetAtomicNumbers<ParticleType>::type::numberOfProtons;
-            float_X const chargeState = attribute::getChargeState(parentIon);
-
-            /* verify that ion is not completely ionized */
-            if( chargeState < protonNumber )
+            /** Calculation for the Ammosov-Delone-Krainov tunneling model
+             *
+             * for either linear or circular laser polarization
+             *
+             * \tparam T_linPol boolean value that is true for lin. pol. and false for circ. pol.
+             */
+            template<bool T_linPol>
+            struct AlgorithmADK
             {
-                uint32_t const cs = math::float2int_rd(chargeState);
-                float_X const iEnergy = typename GetIonizationEnergies<ParticleType>::type{ }[cs];
-
-                constexpr float_X pi = pmacc::algorithms::math::Pi< float_X >::value;
-                /* electric field in atomic units - only absolute value */
-                float_X const eInAU = math::abs( eField ) / ATOMIC_UNIT_EFIELD;
-
-                /* the charge that attracts the electron that is to be ionized:
-                 * equals `protonNumber - #allInnerElectrons`
-                 */
-                float_X const effectiveCharge = chargeState + float_X( 1.0 );
-                /* effective principal quantum number (unitless) */
-                float_X const nEff = effectiveCharge / math::sqrt( float_X( 2.0 ) * iEnergy );
-                /* nameless variable for convenience dFromADK*/
-                float_X const dBase = float_X( 4.0 ) * util::cube( effectiveCharge ) /
-                    ( eInAU * util::quad( nEff ) ) ;
-                float_X const dFromADK = math::pow( dBase, nEff );
-
-                /* ionization rate (for CIRCULAR polarization)*/
-                float_X rateADK = eInAU * util::square( dFromADK ) /
-                    ( float_X( 8.0 ) * pi * effectiveCharge ) *
-                    math::exp( float_X( -2.0 ) * util::cube( effectiveCharge ) /
-                               ( float_X( 3.0 ) * util::cube( nEff ) * eInAU )
-                    );
-
-                /* in case of linear polarization the rate is modified by an additional factor */
-                if( T_linPol )
-                {
-                    /* factor from averaging over one laser cycle with LINEAR polarization */
-                    float_X const polarizationFactor = math::sqrt(
-                        float_X( 3.0 ) * util::cube( nEff ) * eInAU /
-                        ( pi * util::cube( effectiveCharge ) )
-                    );
-
-                    rateADK *= polarizationFactor;
-                }
-
-                /* simulation time step in atomic units */
-                float_X const timeStepAU = float_X( DELTA_T / ATOMIC_UNIT_TIME );
-                /* ionization probability
+                /** Functor implementation
+                 * \tparam EType type of electric field
+                 * \tparam BType type of magnetic field
+                 * \tparam ParticleType type of particle to be ionized
                  *
-                 * probability = rate * time step
-                 * --> for infinitesimal time steps
+                 * \param bField magnetic field value at t=0
+                 * \param eField electric field value at t=0
+                 * \param parentIon particle instance to be ionized with position at t=0 and momentum at t=-1/2
+                 * \param randNr random number, equally distributed in range [0.:1.0]
                  *
-                 * the whole ensemble should then follow
-                 * P = 1 - exp(-rate * time step) if the laser wavelength is
-                 * sampled well enough
+                 * \return ionization energy and number of new macro electrons to be created
                  */
-                float_X const probADK = rateADK * timeStepAU;
-
-                /* ionization condition */
-                if( randNr < probADK )
+                template<typename EType, typename BType, typename ParticleType>
+                HDINLINE IonizerReturn
+                operator()(const BType bField, const EType eField, ParticleType& parentIon, float_X randNr)
                 {
-                    /* return number of macro electrons to produce */
-                    return 1u;
+                    float_X const protonNumber = GetAtomicNumbers<ParticleType>::type::numberOfProtons;
+                    float_X const chargeState = attribute::getChargeState(parentIon);
+
+                    /* verify that ion is not completely ionized */
+                    if(chargeState < protonNumber)
+                    {
+                        uint32_t const cs = pmacc::math::float2int_rd(chargeState);
+                        float_X const iEnergy = typename GetIonizationEnergies<ParticleType>::type{}[cs];
+
+                        constexpr float_X pi = pmacc::math::Pi<float_X>::value;
+                        /* electric field in atomic units - only absolute value */
+                        float_X const eInAU = math::abs(eField) / ATOMIC_UNIT_EFIELD;
+
+                        /* the charge that attracts the electron that is to be ionized:
+                         * equals `protonNumber - #allInnerElectrons`
+                         */
+                        float_X const effectiveCharge = chargeState + float_X(1.0);
+                        /* effective principal quantum number (unitless) */
+                        float_X const nEff = effectiveCharge / math::sqrt(float_X(2.0) * iEnergy);
+                        /* base of the power dFromADK = dBase^nEff (helper term without its own name) */
+                        float_X const dBase = float_X(4.0) * util::cube(effectiveCharge) / (eInAU * util::quad(nEff));
+                        float_X const dFromADK = math::pow(dBase, nEff);
+
+                        /* ionization rate (for CIRCULAR polarization) */
+                        float_X rateADK = eInAU * util::square(dFromADK) / (float_X(8.0) * pi * effectiveCharge)
+                            * math::exp(float_X(-2.0) * util::cube(effectiveCharge)
+                                        / (float_X(3.0) * util::cube(nEff) * eInAU));
+
+                        /* in case of linear polarization the rate is modified by an additional factor */
+                        if(T_linPol)
+                        {
+                            /* factor from averaging over one laser cycle with LINEAR polarization */
+                            float_X const polarizationFactor = math::sqrt(
+                                float_X(3.0) * util::cube(nEff) * eInAU / (pi * util::cube(effectiveCharge)));
+
+                            rateADK *= polarizationFactor;
+                        }
+
+                        /* simulation time step in atomic units */
+                        float_X const timeStepAU = float_X(DELTA_T / ATOMIC_UNIT_TIME);
+                        /* ionization probability
+                         *
+                         * probability = rate * time step
+                         * --> for infinitesimal time steps
+                         *
+                         * the whole ensemble should then follow
+                         * P = 1 - exp(-rate * time step) if the laser wavelength is
+                         * sampled well enough
+                         */
+                        float_X const probADK = rateADK * timeStepAU;
+
+                        /* ionization condition */
+                        if(randNr < probADK)
+                        {
+                            /* return ionization energy and number of macro electrons to produce */
+                            return IonizerReturn{iEnergy, 1u};
+                        }
+                    }
+                    /* no ionization */
+                    return IonizerReturn{0.0, 0u};
                 }
-            }
-            /* no ionization */
-            return 0u;
-        }
-    };
+            };
 
-} // namespace ionization
-} // namespace particles
+        } // namespace ionization
+    } // namespace particles
 } // namespace picongpu
diff --git a/include/picongpu/particles/ionization/byField/BSI/AlgorithmBSI.hpp b/include/picongpu/particles/ionization/byField/BSI/AlgorithmBSI.hpp
index 0192943bf3..264d8d3803 100644
--- a/include/picongpu/particles/ionization/byField/BSI/AlgorithmBSI.hpp
+++ b/include/picongpu/particles/ionization/byField/BSI/AlgorithmBSI.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2014-2020 Marco Garten
+/* Copyright 2014-2021 Marco Garten, Jakob Trojok
  *
  * This file is part of PIConGPU.
  *
@@ -23,6 +23,7 @@
 #include "picongpu/particles/traits/GetIonizationEnergies.hpp"
 #include "picongpu/particles/traits/GetAtomicNumbers.hpp"
 #include "picongpu/traits/attribute/GetChargeState.hpp"
+#include "picongpu/particles/ionization/byField/IonizationCurrent/IonizerReturn.hpp"
 
 /** @file AlgorithmBSI.hpp
  *
@@ -35,61 +36,56 @@
 
 namespace picongpu
 {
-namespace particles
-{
-namespace ionization
-{
-
-    /** Calculation for the Barrier Suppression Ionization model
-     */
-    struct AlgorithmBSI
+    namespace particles
     {
-
-        /** Functor implementation
-         *
-         * \tparam EType type of electric field
-         * \tparam ParticleType type of particle to be ionized
-         *
-         * \param eField electric field value at t=0
-         * \param parentIon particle instance to be ionized with position at t=0 and momentum at t=-1/2
-         *
-         * and "t" being with respect to the current time step (on step/half a step backward/-""-forward)
-         *
-         * \return the number of electrons to produce
-         * (current implementation supports only 0 or 1 per execution)
-         */
-        template<typename EType, typename ParticleType >
-        HDINLINE uint32_t
-        operator()( const EType eField, ParticleType& parentIon )
+        namespace ionization
         {
-
-            float_X const protonNumber = GetAtomicNumbers<ParticleType>::type::numberOfProtons;
-            float_X const chargeState = attribute::getChargeState(parentIon);
-
-            /* verify that ion is not completely ionized */
-            if (chargeState < protonNumber)
+            /** Calculation for the Barrier Suppression Ionization model
+             */
+            struct AlgorithmBSI
             {
-                uint32_t const cs = math::float2int_rd(chargeState);
-                /* ionization potential in atomic units */
-                float_X const iEnergy = typename GetIonizationEnergies<ParticleType>::type{ }[cs];
-                /* the charge that attracts the electron that is to be ionized:
-                 * equals `protonNumber - no. allInnerElectrons`
+                /** Functor implementation
+                 *
+                 * \tparam EType type of electric field
+                 * \tparam ParticleType type of particle to be ionized
+                 *
+                 * \param eField electric field value at t=0
+                 * \param parentIon particle instance to be ionized with position at t=0 and momentum at t=-1/2
+                 *
+                 * and "t" being with respect to the current time step (one step/half a step backward/-""-forward)
+                 *
+                 * \return ionization energy and number of new macro electrons to be created
+                 * (current implementation supports only 0 or 1 per execution)
                  */
-                float_X const effectiveCharge = chargeState + float_X(1.0);
-                /* critical field strength in atomic units */
-                float_X const critField = iEnergy*iEnergy / (float_X(4.0) * effectiveCharge);
-                /* ionization condition */
-                if (math::abs(eField) / ATOMIC_UNIT_EFIELD >= critField)
+                template<typename EType, typename ParticleType>
+                HDINLINE IonizerReturn operator()(const EType eField, ParticleType& parentIon)
                 {
-                    /* return number of macro electrons to produce */
-                    return 1u;
+                    float_X const protonNumber = GetAtomicNumbers<ParticleType>::type::numberOfProtons;
+                    float_X const chargeState = attribute::getChargeState(parentIon);
+
+                    /* verify that ion is not completely ionized */
+                    if(chargeState < protonNumber)
+                    {
+                        uint32_t const cs = pmacc::math::float2int_rd(chargeState);
+                        /* ionization potential in atomic units */
+                        float_X const iEnergy = typename GetIonizationEnergies<ParticleType>::type{}[cs];
+                        /* the charge that attracts the electron that is to be ionized:
+                         * equals `protonNumber - no. allInnerElectrons`
+                         */
+                        float_X const effectiveCharge = chargeState + float_X(1.0);
+                        /* critical field strength in atomic units */
+                        float_X const critField = iEnergy * iEnergy / (float_X(4.0) * effectiveCharge);
+                        /* ionization condition */
+                        if(math::abs(eField) / ATOMIC_UNIT_EFIELD >= critField)
+                        {
+                            /* return ionization energy and number of macro electrons to produce */
+                            return IonizerReturn{iEnergy, 1u};
+                        }
+                    }
+                    return IonizerReturn{0.0, 0u}; /* no ionization */
                 }
-            }
-            /* no ionization */
-            return 0u;
-        }
-    };
+            };
 
-} // namespace ionization
-} // namespace particles
+        } // namespace ionization
+    } // namespace particles
 } // namespace picongpu
diff --git a/include/picongpu/particles/ionization/byField/BSI/AlgorithmBSIEffectiveZ.hpp b/include/picongpu/particles/ionization/byField/BSI/AlgorithmBSIEffectiveZ.hpp
index 56fa3fd4d4..472e219969 100644
--- a/include/picongpu/particles/ionization/byField/BSI/AlgorithmBSIEffectiveZ.hpp
+++ b/include/picongpu/particles/ionization/byField/BSI/AlgorithmBSIEffectiveZ.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2014-2020 Marco Garten
+/* Copyright 2014-2021 Marco Garten, Jakob Trojok
  *
  * This file is part of PIConGPU.
  *
@@ -24,6 +24,7 @@
 #include "picongpu/particles/traits/GetAtomicNumbers.hpp"
 #include "picongpu/particles/traits/GetEffectiveNuclearCharge.hpp"
 #include "picongpu/traits/attribute/GetChargeState.hpp"
+#include "picongpu/particles/ionization/byField/IonizationCurrent/IonizerReturn.hpp"
 
 /** @file AlgorithmBSIEffectiveZ.hpp
  *
@@ -36,59 +37,55 @@
 
 namespace picongpu
 {
-namespace particles
-{
-namespace ionization
-{
-
-    /** Calculation for the Barrier Suppression Ionization model
-     */
-    struct AlgorithmBSIEffectiveZ
+    namespace particles
     {
-
-        /** Functor implementation
-         *
-         * \tparam EType type of electric field
-         * \tparam ParticleType type of particle to be ionized
-         *
-         * \param eField electric field value at t=0
-         * \param parentIon particle instance to be ionized with position at t=0 and momentum at t=-1/2
-         *
-         * and "t" being with respect to the current time step (on step/half a step backward/-""-forward)
-         *
-         * \return the number of electrons to produce
-         * (current implementation supports only 0 or 1 per execution)
-         */
-        template<typename EType, typename ParticleType >
-        HDINLINE uint32_t
-        operator()( const EType eField, ParticleType& parentIon )
+        namespace ionization
         {
-
-            const float_X protonNumber = GetAtomicNumbers<ParticleType>::type::numberOfProtons;
-            float_X chargeState = attribute::getChargeState(parentIon);
-
-            /* verify that ion is not completely ionized */
-            if (chargeState < protonNumber)
+            /** Calculation for the Barrier Suppression Ionization model
+             */
+            struct AlgorithmBSIEffectiveZ
             {
-                uint32_t cs = math::float2int_rd(chargeState);
-                /* ionization potential in atomic units */
-                const float_X iEnergy = typename GetIonizationEnergies<ParticleType>::type{ }[cs];
-                const float_X ZEff = typename GetEffectiveNuclearCharge<ParticleType>::type{ }[cs];
-                /* critical field strength in atomic units */
-                float_X critField = iEnergy*iEnergy / (float_X(4.0) * ZEff);
-
-                /* ionization condition */
-                if (math::abs(eField) / ATOMIC_UNIT_EFIELD >= critField)
+                /** Functor implementation
+                 *
+                 * \tparam EType type of electric field
+                 * \tparam ParticleType type of particle to be ionized
+                 *
+                 * \param eField electric field value at t=0
+                 * \param parentIon particle instance to be ionized with position at t=0 and momentum at t=-1/2
+                 *
+                 * and "t" being with respect to the current time step (one step/half a step backward/-""-forward)
+                 *
+                 * \return ionization energy and number of new macro electrons to be created
+                 * (current implementation supports only 0 or 1 per execution)
+                 */
+                template<typename EType, typename ParticleType>
+                HDINLINE IonizerReturn operator()(const EType eField, ParticleType& parentIon)
                 {
-                    /* return number of macro electrons to produce */
-                    return 1u;
+                    const float_X protonNumber = GetAtomicNumbers<ParticleType>::type::numberOfProtons;
+                    float_X chargeState = attribute::getChargeState(parentIon);
+
+                    /* verify that ion is not completely ionized */
+                    if(chargeState < protonNumber)
+                    {
+                        uint32_t cs = pmacc::math::float2int_rd(chargeState);
+                        /* ionization potential (atomic units) and effective nuclear charge, indexed by cs */
+                        const float_X iEnergy = typename GetIonizationEnergies<ParticleType>::type{}[cs];
+                        const float_X ZEff = typename GetEffectiveNuclearCharge<ParticleType>::type{}[cs];
+                        /* critical field strength in atomic units */
+                        float_X critField = iEnergy * iEnergy / (float_X(4.0) * ZEff);
+
+                        /* ionization condition */
+                        if(math::abs(eField) / ATOMIC_UNIT_EFIELD >= critField)
+                        {
+                            /* return ionization energy and number of macro electrons to produce */
+                            return IonizerReturn{iEnergy, 1u};
+                        }
+                    }
+                    /* no ionization */
+                    return IonizerReturn{0.0, 0u};
                 }
-            }
-            /* no ionization */
-            return 0u;
-        }
-    };
+            };
 
-} // namespace ionization
-} // namespace particles
+        } // namespace ionization
+    } // namespace particles
 } // namespace picongpu
diff --git a/include/picongpu/particles/ionization/byField/BSI/AlgorithmBSIStarkShifted.hpp b/include/picongpu/particles/ionization/byField/BSI/AlgorithmBSIStarkShifted.hpp
index bf8e898648..cb5e6320ba 100644
--- a/include/picongpu/particles/ionization/byField/BSI/AlgorithmBSIStarkShifted.hpp
+++ b/include/picongpu/particles/ionization/byField/BSI/AlgorithmBSIStarkShifted.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2014-2020 Marco Garten
+/* Copyright 2014-2021 Marco Garten, Jakob Trojok
  *
  * This file is part of PIConGPU.
  *
@@ -23,6 +23,7 @@
 #include "picongpu/particles/traits/GetIonizationEnergies.hpp"
 #include "picongpu/particles/traits/GetAtomicNumbers.hpp"
 #include "picongpu/traits/attribute/GetChargeState.hpp"
+#include "picongpu/particles/ionization/byField/IonizationCurrent/IonizerReturn.hpp"
 
 /** @file AlgorithmBSIStarkShifted.hpp
  *
@@ -35,58 +36,55 @@
 
 namespace picongpu
 {
-namespace particles
-{
-namespace ionization
-{
-
-    /** Calculation for the Barrier Suppression Ionization model
-     */
-    struct AlgorithmBSIStarkShifted
+    namespace particles
     {
-
-        /** Functor implementation
-         *
-         * \tparam EType type of electric field
-         * \tparam ParticleType type of particle to be ionized
-         *
-         * \param eField electric field value at t=0
-         * \param parentIon particle instance to be ionized with position at t=0 and momentum at t=-1/2
-         *
-         * and "t" being with respect to the current time step (on step/half a step backward/-""-forward)
-         *
-         * \return the number of electrons to produce
-         * (current implementation supports only 0 or 1 per execution)
-         */
-        template<typename EType, typename ParticleType >
-        HDINLINE uint32_t
-        operator()( const EType eField, ParticleType& parentIon )
+        namespace ionization
         {
-
-            const float_X protonNumber = GetAtomicNumbers<ParticleType>::type::numberOfProtons;
-            float_X chargeState = attribute::getChargeState(parentIon);
-
-            /* verify that ion is not completely ionized */
-            if (chargeState < protonNumber)
+            /** Calculation for the Barrier Suppression Ionization model
+             */
+            struct AlgorithmBSIStarkShifted
             {
-                uint32_t cs = math::float2int_rd(chargeState);
-                /* ionization potential in atomic units */
-                const float_X iEnergy = typename GetIonizationEnergies<ParticleType>::type{ }[cs];
-                /* critical field strength in atomic units */
-                float_X critField = (math::sqrt(float_X(2.))-float_X(1.)) * math::pow(iEnergy,float_X(3./2.));
-
-                /* ionization condition */
-                if (math::abs(eField) / ATOMIC_UNIT_EFIELD >= critField)
+                /** Functor implementation
+                 *
+                 * \tparam EType type of electric field
+                 * \tparam ParticleType type of particle to be ionized
+                 *
+                 * \param eField electric field value at t=0
+                 * \param parentIon particle instance to be ionized with position at t=0 and momentum at t=-1/2
+                 *
+                 * and "t" being with respect to the current time step (one step/half a step backward/-""-forward)
+                 *
+                 * \return ionization energy and number of new macro electrons to be created
+                 * (current implementation supports only 0 or 1 per execution)
+                 */
+                template<typename EType, typename ParticleType>
+                HDINLINE IonizerReturn operator()(const EType eField, ParticleType& parentIon)
                 {
-                    /* return number of electrons to produce */
-                    return 1u;
+                    const float_X protonNumber = GetAtomicNumbers<ParticleType>::type::numberOfProtons;
+                    float_X chargeState = attribute::getChargeState(parentIon);
+
+                    /* verify that ion is not completely ionized */
+                    if(chargeState < protonNumber)
+                    {
+                        uint32_t cs = pmacc::math::float2int_rd(chargeState);
+                        /* ionization potential in atomic units */
+                        const float_X iEnergy = typename GetIonizationEnergies<ParticleType>::type{}[cs];
+                        /* critical field strength in atomic units */
+                        float_X critField
+                            = (math::sqrt(float_X(2.)) - float_X(1.)) * math::pow(iEnergy, float_X(3. / 2.));
+
+                        /* ionization condition */
+                        if(math::abs(eField) / ATOMIC_UNIT_EFIELD >= critField)
+                        {
+                            /* return ionization energy and number of electrons to produce */
+                            return IonizerReturn{iEnergy, 1u};
+                        }
+                    }
+                    /* no ionization */
+                    return IonizerReturn{0.0, 0u};
                 }
-            }
-            /* no ionization */
-            return 0u;
-        }
-    };
+            };
 
-} // namespace ionization
-} // namespace particles
+        } // namespace ionization
+    } // namespace particles
 } // namespace picongpu
diff --git a/include/picongpu/particles/ionization/byField/BSI/BSI.def b/include/picongpu/particles/ionization/byField/BSI/BSI.def
index 5a2373b673..bc62925c8d 100644
--- a/include/picongpu/particles/ionization/byField/BSI/BSI.def
+++ b/include/picongpu/particles/ionization/byField/BSI/BSI.def
@@ -1,4 +1,4 @@
-/* Copyright 2015-2020 Marco Garten
+/* Copyright 2015-2021 Marco Garten, Jakob Trojok
  *
  * This file is part of PIConGPU.
  *
@@ -20,88 +20,97 @@
 #pragma once
 
 #include <pmacc/types.hpp>
+#include "picongpu/particles/ionization/byField/IonizationCurrent/IonizationCurrent.def"
 
 namespace picongpu
 {
-namespace particles
-{
-namespace ionization
-{
-    /** Barrier Suppression Ionization - Implementation
-     *
-     * \tparam T_DestSpecies electron species to be created
-     * \tparam T_SrcSpecies particle species that is ionized
-     *         default is boost::mpl placeholder because specialization
-     *         cannot be known in list of particle species' flags
-     *         \see speciesDefinition.param
-     */
-    template< typename T_IonizationAlgorithm, typename T_DestSpecies, typename T_SrcSpecies = bmpl::_1 >
-    struct BSI_Impl;
-
-    /** Barrier Suppression Ionization - Hydrogen-Like
-     *
-     * - takes the ionization energies of the various charge states of ions
-     * - calculates the corresponding field strengths necessary to overcome
-     *   the binding energy of the electron to the core
-     * - if the field strength is locally exceeded: increase the charge state
-     * - see for example: Delone, N. B.; Krainov, V. P. (1998).
-     *   "Tunneling and barrier-suppression ionization of atoms and ions in a laser radiation field"
-     *   doi:10.1070/PU1998v041n05ABEH000393
-     *
-     * - This model accounts for naive ion charge shielding by inner electrons
-     *   as it assumes that the charge the electron 'feels' is equal to
-     *   `proton number - number of inner shell electrons`.
-     * - This model neglects the Stark upshift of ionization energies.
-     *
-     * \tparam T_DestSpecies electron species to be created
-     *
-     * wrapper class,
-     * needed because the SrcSpecies cannot be known during the
-     * first specialization of the ionization model in the particle definition
-     * \see speciesDefinition.param
-     */
-    template< typename T_DestSpecies >
-    struct BSI
+    namespace particles
     {
-        using IonizationAlgorithm = particles::ionization::AlgorithmBSI;
-        using type = BSI_Impl< IonizationAlgorithm, T_DestSpecies >;
-    };
+        namespace ionization
+        {
+            /** Barrier Suppression Ionization - Implementation
+             *
+             * \tparam T_DestSpecies electron species to be created
+             * \tparam T_IonizationCurrent select type of ionization current (None or EnergyConservation)
+             * \tparam T_SrcSpecies particle species that is ionized
+             *         default is boost::mpl placeholder because specialization
+             *         cannot be known in list of particle species' flags
+             *         \see speciesDefinition.param
+             */
+            template<
+                typename T_IonizationAlgorithm,
+                typename T_DestSpecies,
+                typename T_IonizationCurrent,
+                typename T_SrcSpecies = bmpl::_1>
+            struct BSI_Impl;
 
-    /** Barrier Suppression Ionization - Effective Atomic Numbers
-     *
-     * - similar to BSI
-     *
-     * - tries to account for electron shielding by issuing a lookup table of
-     *   effective atomic numbers for each filled electron shell @see ionizer.param
-     * - unvalidated and still in development
-     *
-     * \tparam T_DestSpecies electron species to be created
-     */
-    template< typename T_DestSpecies >
-    struct BSIEffectiveZ
-    {
-        using IonizationAlgorithm = particles::ionization::AlgorithmBSIEffectiveZ;
-        using type = BSI_Impl< IonizationAlgorithm, T_DestSpecies >;
-    };
+            /** Barrier Suppression Ionization - Hydrogen-Like
+             *
+             * - takes the ionization energies of the various charge states of ions
+             * - calculates the corresponding field strengths necessary to overcome
+             *   the binding energy of the electron to the core
+             * - if the field strength is locally exceeded: increase the charge state
+             * - see for example: Delone, N. B.; Krainov, V. P. (1998).
+             *   "Tunneling and barrier-suppression ionization of atoms and ions in a laser radiation field"
+             *   doi:10.1070/PU1998v041n05ABEH000393
+             *
+             * - This model accounts for naive ion charge shielding by inner electrons
+             *   as it assumes that the charge the electron 'feels' is equal to
+             *   `proton number - number of inner shell electrons`.
+             * - This model neglects the Stark upshift of ionization energies.
+             *
+             * \tparam T_DestSpecies electron species to be created
+             * \tparam T_IonizationCurrent select type of ionization current (None or EnergyConservation)
+             *
+             * wrapper class,
+             * needed because the SrcSpecies cannot be known during the
+             * first specialization of the ionization model in the particle definition
+             * \see speciesDefinition.param
+             */
+            template<typename T_DestSpecies, typename T_IonizationCurrent = current::None>
+            struct BSI
+            {
+                using IonizationAlgorithm = particles::ionization::AlgorithmBSI;
+                using type = BSI_Impl<IonizationAlgorithm, T_DestSpecies, T_IonizationCurrent>;
+            };
 
-    /** Barrier Suppression Ionization - Ion. energies Stark-upshifted
-     *
-     * - similar to BSI
-     *
-     * - developed by Bauer and Mulser (book: High Power Laser Matter Interaction)
-     * - accounts for stark upshift of ionization energy but only covers the
-     *   hydrogenlike ions originally
-     * - \todo needs to be extrapolated to arbitrary ions
-     *
-     * \tparam T_DestSpecies electron species to be created
-     */
-    template< typename T_DestSpecies >
-    struct BSIStarkShifted
-    {
-        using IonizationAlgorithm = particles::ionization::AlgorithmBSIStarkShifted;
-        using type = BSI_Impl< IonizationAlgorithm, T_DestSpecies >;
-    };
+            /** Barrier Suppression Ionization - Effective Atomic Numbers
+             *
+             * - similar to BSI
+             *
+             * - tries to account for electron shielding by issuing a lookup table of
+             *   effective atomic numbers for each filled electron shell @see ionizer.param
+             * - unvalidated and still in development
+             *
+             * \tparam T_DestSpecies electron species to be created
+             * \tparam T_IonizationCurrent select type of ionization current (None or EnergyConservation)
+             */
+            template<typename T_DestSpecies, typename T_IonizationCurrent = current::None>
+            struct BSIEffectiveZ
+            {
+                using IonizationAlgorithm = particles::ionization::AlgorithmBSIEffectiveZ;
+                using type = BSI_Impl<IonizationAlgorithm, T_DestSpecies, T_IonizationCurrent>;
+            };
+
+            /** Barrier Suppression Ionization - Ion. energies Stark-upshifted
+             *
+             * - similar to BSI
+             *
+             * - developed by Bauer and Mulser (book: High Power Laser Matter Interaction)
+             * - accounts for stark upshift of ionization energy but only covers the
+             *   hydrogenlike ions originally
+             * - \todo needs to be extrapolated to arbitrary ions
+             *
+             * \tparam T_DestSpecies electron species to be created
+             * \tparam T_IonizationCurrent select type of ionization current (None or EnergyConservation)
+             */
+            template<typename T_DestSpecies, typename T_IonizationCurrent = current::None>
+            struct BSIStarkShifted
+            {
+                using IonizationAlgorithm = particles::ionization::AlgorithmBSIStarkShifted;
+                using type = BSI_Impl<IonizationAlgorithm, T_DestSpecies, T_IonizationCurrent>;
+            };
 
-} // namespace ionization
-} // namespace particles
+        } // namespace ionization
+    } // namespace particles
 } // namespace picongpu
diff --git a/include/picongpu/particles/ionization/byField/BSI/BSI_Impl.hpp b/include/picongpu/particles/ionization/byField/BSI/BSI_Impl.hpp
index 3afeed6d0d..5c97a514d1 100644
--- a/include/picongpu/particles/ionization/byField/BSI/BSI_Impl.hpp
+++ b/include/picongpu/particles/ionization/byField/BSI/BSI_Impl.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2015-2020 Marco Garten
+/* Copyright 2015-2021 Marco Garten, Jakob Trojok
  *
  * This file is part of PIConGPU.
  *
@@ -24,11 +24,13 @@
 #include "picongpu/fields/CellType.hpp"
 #include "picongpu/fields/FieldB.hpp"
 #include "picongpu/fields/FieldE.hpp"
+#include "picongpu/fields/FieldJ.hpp"
 #include "picongpu/traits/FieldPosition.hpp"
 #include "picongpu/particles/ionization/byField/BSI/BSI.def"
 #include "picongpu/particles/ionization/byField/BSI/AlgorithmBSI.hpp"
 #include "picongpu/particles/ionization/byField/BSI/AlgorithmBSIEffectiveZ.hpp"
 #include "picongpu/particles/ionization/byField/BSI/AlgorithmBSIStarkShifted.hpp"
+#include "picongpu/particles/ionization/byField/IonizationCurrent/IonizationCurrent.hpp"
 
 #include "picongpu/particles/ParticlesFunctors.hpp"
 
@@ -43,240 +45,218 @@
 
 namespace picongpu
 {
-namespace particles
-{
-namespace ionization
-{
-
-    /** \struct BSI_Impl
-     *
-     * \brief Barrier Suppression Ionization - Implementation
-     *
-     * \tparam T_DestSpecies type or name as boost::mpl::string of the electron species to be created
-     * \tparam T_SrcSpecies type or name as boost::mpl::string of the particle species that is ionized
-     */
-    template<typename T_IonizationAlgorithm, typename T_DestSpecies, typename T_SrcSpecies>
-    struct BSI_Impl
+    namespace particles
     {
-
-        using DestSpecies = pmacc::particles::meta::FindByNameOrType_t<
-            VectorAllSpecies,
-            T_DestSpecies
-        >;
-        using SrcSpecies = pmacc::particles::meta::FindByNameOrType_t<
-            VectorAllSpecies,
-            T_SrcSpecies
-        >;
-
-        using FrameType = typename SrcSpecies::FrameType;
-
-        /* specify field to particle interpolation scheme */
-        using Field2ParticleInterpolation = typename pmacc::traits::Resolve<
-            typename GetFlagType<FrameType,interpolation<> >::type
-        >::type;
-
-        /* margins around the supercell for the interpolation of the field on the cells */
-        using LowerMargin = typename GetMargin<Field2ParticleInterpolation>::LowerMargin;
-        using UpperMargin = typename GetMargin<Field2ParticleInterpolation>::UpperMargin;
-
-        /* relevant area of a block */
-        using BlockArea = SuperCellDescription<
-            typename MappingDesc::SuperCellSize,
-            LowerMargin,
-            UpperMargin
-        >;
-
-        BlockArea BlockDescription;
-
-        private:
-
-            /* define ionization ALGORITHM (calculation) for ionization MODEL */
-            using IonizationAlgorithm = T_IonizationAlgorithm;
-
-            using TVec = MappingDesc::SuperCellSize;
-
-            using ValueType_E = FieldE::ValueType;
-            /* global memory EM-field device databoxes */
-            FieldE::DataBoxType eBox;
-            /* shared memory EM-field device databoxes */
-            PMACC_ALIGN(cachedE, DataBox<SharedBox<ValueType_E, typename BlockArea::FullSuperCellSize,1> >);
-
-        public:
-            /* host constructor */
-            BSI_Impl(const uint32_t currentStep)
-            {
-                DataConnector &dc = Environment<>::get().DataConnector();
-                /* initialize pointers on host-side E-(B-)field databoxes */
-                auto fieldE = dc.get< FieldE >( FieldE::getName(), true );
-                /* initialize device-side E-(B-)field databoxes */
-                eBox = fieldE->getDeviceDataBox();
-
-            }
-
-            /** cache fields used by this functor
+        namespace ionization
+        {
+            /** \struct BSI_Impl
              *
-             * @warning this is a collective method and calls synchronize
+             * \brief Barrier Suppression Ionization - Implementation
              *
-             * @tparam T_Acc alpaka accelerator type
-             * @tparam T_WorkerCfg pmacc::mappings::threads::WorkerCfg, configuration of the worker
-             *
-             * @param acc alpaka accelerator
-             * @param blockCell relative offset (in cells) to the local domain plus the guarding cells
-             * @param workerCfg configuration of the worker
+             * \tparam T_DestSpecies type or name as boost::mpl::string of the electron species to be created
+             * \tparam T_IonizationCurrent select type of ionization current (None or EnergyConservation)
+             * \tparam T_SrcSpecies type or name as boost::mpl::string of the particle species that is ionized
              */
             template<
-                typename T_Acc ,
-                typename T_WorkerCfg
-            >
-            DINLINE void collectiveInit(
-                const T_Acc & acc,
-                const DataSpace<simDim>& blockCell,
-                const T_WorkerCfg & workerCfg
-            )
+                typename T_IonizationAlgorithm,
+                typename T_DestSpecies,
+                typename T_IonizationCurrent,
+                typename T_SrcSpecies>
+            struct BSI_Impl
             {
-
-                /* caching of E field */
-                cachedE = CachedBox::create<
-                    1,
-                    ValueType_E
-                >(
-                    acc,
-                    BlockArea()
-                );
-
-                /* instance of nvidia assignment operator */
-                nvidia::functors::Assign assign;
-
-                ThreadCollective<
-                    BlockArea,
-                    T_WorkerCfg::numWorkers
-                > collective( workerCfg.getWorkerIdx( ) );
-                /* copy fields from global to shared */
-                auto fieldEBlock = eBox.shift(blockCell);
-                collective(
-                          acc,
-                          assign,
-                          cachedE,
-                          fieldEBlock
-                          );
-
-                /* wait for shared memory to be initialized */
-                __syncthreads();
-            }
-
-            /** Initialization function on device
-             *
-             * \brief Cache EM-fields on device
-             *         and initialize possible prerequisites for ionization, like e.g. random number generator.
-             *
-             * This function will be called inline on the device which must happen BEFORE threads diverge
-             * during loop execution. The reason for this is the `__syncthreads()` call which is necessary after
-             * initializing the E-/B-field shared boxes in shared memory.
-             *
-             * @param blockCell Offset of the cell from the origin of the local domain
-             *                  <b>including guarding supercells</b> in units of cells
-             * @param linearThreadIdx Linearized thread ID inside the block
-             * @param localCellOffset Offset of the cell from the origin of the local
-             *                        domain, i.e. from the @see BORDER
-             *                        <b>without guarding supercells</b>
-             */
-            template< typename T_Acc >
-            DINLINE void init(
-                T_Acc const & acc,
-                const DataSpace<simDim>& blockCell,
-                const int& linearThreadIdx,
-                const DataSpace<simDim>& localCellOffset
-            )
-            {
-            }
-
-            /** Determine number of new macro electrons due to ionization
-             *
-             * \param ionFrame reference to frame of the to-be-ionized particles
-             * \param localIdx local (linear) index in super cell / frame
-             */
-            template< typename T_Acc >
-            DINLINE uint32_t numNewParticles(T_Acc const & acc, FrameType& ionFrame, int localIdx)
-            {
-                /* alias for the single macro-particle */
-                auto particle = ionFrame[localIdx];
-                /* particle position, used for field-to-particle interpolation */
-                floatD_X pos = particle[position_];
-                const int particleCellIdx = particle[localCellIdx_];
-                /* multi-dim coordinate of the local cell inside the super cell */
-                DataSpace<TVec::dim> localCell(DataSpaceOperations<TVec::dim>::template map<TVec > (particleCellIdx));
-                /* interpolation of E */
-                const picongpu::traits::FieldPosition<fields::CellType, FieldE> fieldPosE;
-                ValueType_E eField = Field2ParticleInterpolation()
-                    (cachedE.shift(localCell).toCursor(), pos, fieldPosE());
-
-                /* define number of bound macro electrons before ionization */
-                float_X prevBoundElectrons = particle[boundElectrons_];
-
-                /* this is the point where actual ionization takes place */
-                IonizationAlgorithm ionizeAlgo;
-                /* determine number of new macro electrons to be created */
-                uint32_t newMacroElectrons = ionizeAlgo(
-                                                eField,
-                                                particle
-                                              );
-
-                return newMacroElectrons;
-
-            }
-
-            /* Functor implementation
-             *
-             * Ionization model specific particle creation
-             *
-             * \tparam T_parentIon type of ion species that is being ionized
-             * \tparam T_childElectron type of electron species that is created
-             * \param parentIon ion instance that is ionized
-             * \param childElectron electron instance that is created
-             */
-            template<typename T_parentIon, typename T_childElectron, typename T_Acc>
-            DINLINE void operator()(T_Acc const & acc, T_parentIon& parentIon,T_childElectron& childElectron)
-            {
-
-                /* for not mixing operations::assign up with the nvidia functor assign */
-                namespace partOp = pmacc::particles::operations;
-                /* each thread sets the multiMask hard on "particle" (=1) */
-                childElectron[multiMask_] = 1u;
-                const float_X weighting = parentIon[weighting_];
-
-                /* each thread initializes a clone of the parent ion but leaving out
-                 * some attributes:
-                 * - multiMask: reading from global memory takes longer than just setting it again explicitly
-                 * - momentum: because the electron would get a higher energy because of the ion mass
-                 * - boundElectrons: because species other than ions or atoms do not have them
-                 * (gets AUTOMATICALLY deselected because electrons do not have this attribute)
+                using DestSpecies = pmacc::particles::meta::FindByNameOrType_t<VectorAllSpecies, T_DestSpecies>;
+                using SrcSpecies = pmacc::particles::meta::FindByNameOrType_t<VectorAllSpecies, T_SrcSpecies>;
+
+                using FrameType = typename SrcSpecies::FrameType;
+
+                /* specify field to particle interpolation scheme */
+                using Field2ParticleInterpolation =
+                    typename pmacc::traits::Resolve<typename GetFlagType<FrameType, interpolation<>>::type>::type;
+
+                /* margins around the supercell for the interpolation of the field on the cells */
+                using LowerMargin = typename GetMargin<Field2ParticleInterpolation>::LowerMargin;
+                using UpperMargin = typename GetMargin<Field2ParticleInterpolation>::UpperMargin;
+
+                /* relevant area of a block */
+                using BlockArea = SuperCellDescription<typename MappingDesc::SuperCellSize, LowerMargin, UpperMargin>;
+
+                BlockArea BlockDescription;
+
+            private:
+                /* define ionization ALGORITHM (calculation) for ionization MODEL */
+                using IonizationAlgorithm = T_IonizationAlgorithm;
+
+                using TVec = MappingDesc::SuperCellSize;
+
+                using ValueType_E = FieldE::ValueType;
+                /* global memory E-field and current density device databoxes */
+                FieldE::DataBoxType eBox;
+                FieldJ::DataBoxType jBox;
+                /* shared memory EM-field device databoxes */
+                PMACC_ALIGN(cachedE, DataBox<SharedBox<ValueType_E, typename BlockArea::FullSuperCellSize, 1>>);
+
+            public:
+                /* host constructor */
+                BSI_Impl(const uint32_t currentStep)
+                {
+                    DataConnector& dc = Environment<>::get().DataConnector();
+                    /* initialize pointers on host-side E-field and current density databoxes */
+                    auto fieldE = dc.get<FieldE>(FieldE::getName(), true);
+                    auto fieldJ = dc.get<FieldJ>(FieldJ::getName(), true);
+                    /* initialize device-side E-(J-)field databoxes */
+                    eBox = fieldE->getDeviceDataBox();
+                    jBox = fieldJ->getDeviceDataBox();
+                }
+
+                /** cache fields used by this functor
+                 *
+                 * @warning this is a collective method and calls synchronize
+                 *
+                 * @tparam T_Acc alpaka accelerator type
+                 * @tparam T_WorkerCfg pmacc::mappings::threads::WorkerCfg, configuration of the worker
+                 *
+                 * @param acc alpaka accelerator
+                 * @param blockCell relative offset (in cells) to the local domain plus the guarding cells
+                 * @param workerCfg configuration of the worker
                  */
-                auto targetElectronClone = partOp::deselect<bmpl::vector2<multiMask, momentum> >(childElectron);
-
-                partOp::assign(targetElectronClone, partOp::deselect<particleId>(parentIon));
-
-                const float_X massIon = attribute::getMass(weighting,parentIon);
-                const float_X massElectron = attribute::getMass(weighting,childElectron);
-
-                const float3_X electronMomentum (parentIon[momentum_]*(massElectron/massIon));
-
-                childElectron[momentum_] = electronMomentum;
-
-                /* conservation of momentum
-                 * \todo add conservation of mass */
-                parentIon[momentum_] -= electronMomentum;
-
-                /** ionization of the ion by reducing the number of bound electrons
+                template<typename T_Acc, typename T_WorkerCfg>
+                DINLINE void collectiveInit(
+                    const T_Acc& acc,
+                    const DataSpace<simDim>& blockCell,
+                    const T_WorkerCfg& workerCfg)
+                {
+                    /* shift origin of jbox to supercell of particle */
+                    jBox = jBox.shift(blockCell);
+
+                    /* caching of E field */
+                    cachedE = CachedBox::create<1, ValueType_E>(acc, BlockArea());
+
+                    /* instance of nvidia assignment operator */
+                    nvidia::functors::Assign assign;
+
+                    ThreadCollective<BlockArea, T_WorkerCfg::numWorkers> collective(workerCfg.getWorkerIdx());
+                    /* copy fields from global to shared */
+                    auto fieldEBlock = eBox.shift(blockCell);
+                    collective(acc, assign, cachedE, fieldEBlock);
+
+                    /* wait for shared memory to be initialized */
+                    cupla::__syncthreads(acc);
+                }
+
+                /** Initialization function on device
                  *
-                 * @warning substracting a float from a float can potentially
-                 *          create a negative boundElectrons number for the ion,
-                 *          see #1850 for details
+                 * \brief Cache EM-fields on device
+                 *         and initialize possible prerequisites for ionization, like e.g. random number generator.
+                 *
+                 * This function will be called inline on the device which must happen BEFORE threads diverge
+                 * during loop execution. The reason for this is the `cupla::__syncthreads( acc )` call which is
+                 * necessary after initializing the E-/B-field shared boxes in shared memory.
+                 *
+                 * @param blockCell Offset of the cell from the origin of the local domain
+                 *                  <b>including guarding supercells</b> in units of cells
+                 * @param linearThreadIdx Linearized thread ID inside the block
+                 * @param localCellOffset Offset of the cell from the origin of the local
+                 *                        domain, i.e. from the @see BORDER
+                 *                        <b>without guarding supercells</b>
                  */
-                parentIon[boundElectrons_] -= float_X(1.);
-            }
-
-    };
-
-} // namespace ionization
-} // namespace particles
+                template<typename T_Acc>
+                DINLINE void init(
+                    T_Acc const& acc,
+                    const DataSpace<simDim>& blockCell,
+                    const int& linearThreadIdx,
+                    const DataSpace<simDim>& localCellOffset)
+                {
+                }
+
+                /** Determine number of new macro electrons due to ionization
+                 *
+                 * \param ionFrame reference to frame of the to-be-ionized particles
+                 * \param localIdx local (linear) index in super cell / frame
+                 */
+                template<typename T_Acc>
+                DINLINE uint32_t numNewParticles(T_Acc const& acc, FrameType& ionFrame, int localIdx)
+                {
+                    /* alias for the single macro-particle */
+                    auto particle = ionFrame[localIdx];
+                    /* particle position, used for field-to-particle interpolation */
+                    floatD_X pos = particle[position_];
+                    const int particleCellIdx = particle[localCellIdx_];
+                    /* multi-dim coordinate of the local cell inside the super cell */
+                    DataSpace<TVec::dim> localCell(
+                        DataSpaceOperations<TVec::dim>::template map<TVec>(particleCellIdx));
+                    /* interpolation of E */
+                    const picongpu::traits::FieldPosition<fields::CellType, FieldE> fieldPosE;
+                    ValueType_E eField
+                        = Field2ParticleInterpolation()(cachedE.shift(localCell).toCursor(), pos, fieldPosE());
+
+                    /* define number of bound macro electrons before ionization */
+                    float_X prevBoundElectrons = particle[boundElectrons_];
+
+                    /* this is the point where actual ionization takes place */
+                    IonizationAlgorithm ionizeAlgo{};
+                    auto retValue = ionizeAlgo(eField, particle);
+                    /* determine number of new macro electrons to be created and calculate ionization current */
+                    IonizationCurrent<T_Acc, T_DestSpecies, simDim, T_IonizationCurrent>{}(
+                        retValue,
+                        particle[weighting_],
+                        jBox.shift(localCell),
+                        eField,
+                        acc,
+                        pos);
+
+                    return retValue.newMacroElectrons;
+                }
+
+                /* Functor implementation
+                 *
+                 * Ionization model specific particle creation
+                 *
+                 * \tparam T_parentIon type of ion species that is being ionized
+                 * \tparam T_childElectron type of electron species that is created
+                 * \param parentIon ion instance that is ionized
+                 * \param childElectron electron instance that is created
+                 */
+                template<typename T_parentIon, typename T_childElectron, typename T_Acc>
+                DINLINE void operator()(T_Acc const& acc, T_parentIon& parentIon, T_childElectron& childElectron)
+                {
+                    /* for not mixing operations::assign up with the nvidia functor assign */
+                    namespace partOp = pmacc::particles::operations;
+                    /* each thread sets the multiMask hard on "particle" (=1) */
+                    childElectron[multiMask_] = 1u;
+                    const float_X weighting = parentIon[weighting_];
+
+                    /* each thread initializes a clone of the parent ion but leaving out
+                     * some attributes:
+                     * - multiMask: reading from global memory takes longer than just setting it again explicitly
+                     * - momentum: because the electron would get a higher energy because of the ion mass
+                     * - boundElectrons: because species other than ions or atoms do not have them
+                     * (gets AUTOMATICALLY deselected because electrons do not have this attribute)
+                     */
+                    auto targetElectronClone = partOp::deselect<bmpl::vector2<multiMask, momentum>>(childElectron);
+
+                    partOp::assign(targetElectronClone, partOp::deselect<particleId>(parentIon));
+
+                    const float_X massIon = attribute::getMass(weighting, parentIon);
+                    const float_X massElectron = attribute::getMass(weighting, childElectron);
+
+                    const float3_X electronMomentum(parentIon[momentum_] * (massElectron / massIon));
+
+                    childElectron[momentum_] = electronMomentum;
+
+                    /* conservation of momentum
+                     * \todo add conservation of mass */
+                    parentIon[momentum_] -= electronMomentum;
+
+                    /** ionization of the ion by reducing the number of bound electrons
+                     *
+                     * @warning subtracting a float from a float can potentially
+                     *          create a negative boundElectrons number for the ion,
+                     *          see #1850 for details
+                     */
+                    parentIon[boundElectrons_] -= float_X(1.);
+                }
+            };
+
+        } // namespace ionization
+    } // namespace particles
 } // namespace picongpu
diff --git a/include/picongpu/particles/ionization/byField/IonizationCurrent/IonizationCurrent.def b/include/picongpu/particles/ionization/byField/IonizationCurrent/IonizationCurrent.def
new file mode 100644
index 0000000000..2db8199078
--- /dev/null
+++ b/include/picongpu/particles/ionization/byField/IonizationCurrent/IonizationCurrent.def
@@ -0,0 +1,55 @@
+/* Copyright 2020-2021 Jakob Trojok
+ *
+ * This file is part of PIConGPU.
+ *
+ * PIConGPU is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * PIConGPU is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with PIConGPU.
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+namespace picongpu
+{
+    namespace particles
+    {
+        namespace ionization
+        {
+            namespace current
+            {
+                /* Tag types accepted as the T_IonizationCurrent template argument.
+                 * They are only forward-declared here and select one of the
+                 * IonizationCurrent specializations.
+                 */
+                struct EnergyConservation; //!< with ionization current
+                struct None; //!< without ionization current
+            } // namespace current
+            /** Ionization current functor (declaration only, specialized per T_IonizationCurrent)
+             *
+             * In order to conserve energy, PIConGPU supports an ionization current
+             * to decrease the electric field according to the amount of energy lost to field ionization processes.
+             *
+             * Reference: P. Mulser et al.
+             *            Modeling field ionization in an energy conserving form and resulting nonstandard fluid
+             *            dynamics, Physics of Plasmas 5, 4466 (1998) https://doi.org/10.1063/1.873184
+             *
+             * \tparam T_Acc alpaka accelerator type
+             * \tparam T_DestSpecies type or name as boost::mpl::string of the electron species to be created
+             * \tparam T_Dim dimension of simulation
+             * \tparam T_IonizationCurrent select type of ionization current (None or EnergyConservation)
+             */
+            template<typename T_Acc, typename T_DestSpecies, unsigned T_Dim, typename T_IonizationCurrent>
+            struct IonizationCurrent;
+        } // namespace ionization
+    } // namespace particles
+} // namespace picongpu
diff --git a/include/picongpu/particles/ionization/byField/IonizationCurrent/IonizationCurrent.hpp b/include/picongpu/particles/ionization/byField/IonizationCurrent/IonizationCurrent.hpp
new file mode 100644
index 0000000000..78b3e3c8d4
--- /dev/null
+++ b/include/picongpu/particles/ionization/byField/IonizationCurrent/IonizationCurrent.hpp
@@ -0,0 +1,98 @@
+/* Copyright 2020-2021 Jakob Trojok
+ *
+ * This file is part of PIConGPU.
+ *
+ * PIConGPU is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * PIConGPU is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with PIConGPU.
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "picongpu/simulation_defines.hpp"
+#include "picongpu/particles/ParticlesFunctors.hpp"
+#include "picongpu/fields/FieldE.hpp"
+#include "picongpu/particles/ionization/byField/IonizationCurrent/IonizerReturn.hpp"
+#include "picongpu/particles/ionization/byField/IonizationCurrent/JIonizationCalc.hpp"
+#include "picongpu/particles/ionization/byField/IonizationCurrent/JIonizationAssignment.hpp"
+
+
+namespace picongpu
+{
+    namespace particles
+    {
+        namespace ionization
+        {
+            /**@{*/
+            /** Ionization current for the energy-conserving case (current::EnergyConservation)
+             *
+             * \tparam T_Acc alpaka accelerator type
+             * \tparam T_DestSpecies type or name as boost::mpl::string of the electron species to be created
+             * \tparam T_Dim dimension of simulation, selects the JIonizationAssignment specialization
+             */
+            template<typename T_Acc, typename T_DestSpecies, unsigned T_Dim>
+            struct IonizationCurrent<T_Acc, T_DestSpecies, T_Dim, current::EnergyConservation>
+            {
+                using ValueType_E = FieldE::ValueType;
+
+                /** Compute the ionization current of one ionization result and add it to the current density
+                 *
+                 * \tparam T_JBox type of current density data box
+                 */
+                template<typename T_JBox>
+                HDINLINE void operator()(
+                    IonizerReturn retValue,
+                    float_X const weighting,
+                    T_JBox jBoxPar,
+                    ValueType_E eField,
+                    T_Acc const& acc,
+                    floatD_X const pos)
+                {
+                    /* If there is no ionization, the ionization energy is zero. In that case, there is no need for an
+                     * ionization current. */
+                    if(retValue.ionizationEnergy != 0.0_X)
+                    {
+                        auto ionizationEnergy = weighting * retValue.ionizationEnergy * SI::ATOMIC_UNIT_ENERGY
+                            / UNIT_ENERGY; // convert to PIConGPU units
+                        /* calculate ionization current at particle position */
+                        float3_X jIonizationPar = JIonizationCalc{}(ionizationEnergy, eField);
+                        /* assign ionization current to grid points; dispatch on T_Dim, not on the global simDim */
+                        JIonizationAssignment<T_Acc, T_DestSpecies, T_Dim>{}(acc, jIonizationPar, pos, jBoxPar);
+                    }
+                }
+            };
+
+            /** Ionization current deactivated (current::None)
+             */
+            template<typename T_Acc, typename T_DestSpecies, unsigned T_Dim>
+            struct IonizationCurrent<T_Acc, T_DestSpecies, T_Dim, current::None>
+            {
+                using ValueType_E = FieldE::ValueType;
+
+                /** no ionization current: same call signature as the EnergyConservation case, does nothing
+                 */
+                template<typename T_JBox>
+                HDINLINE void operator()(
+                    IonizerReturn,
+                    float_X const,
+                    T_JBox,
+                    ValueType_E,
+                    T_Acc const&,
+                    floatD_X const)
+                {
+                }
+            };
+            /**@}*/
+        } // namespace ionization
+    } // namespace particles
+} // namespace picongpu
diff --git a/include/picongpu/particles/ionization/byField/IonizationCurrent/IonizerReturn.hpp b/include/picongpu/particles/ionization/byField/IonizationCurrent/IonizerReturn.hpp
new file mode 100644
index 0000000000..0142dfae4b
--- /dev/null
+++ b/include/picongpu/particles/ionization/byField/IonizationCurrent/IonizerReturn.hpp
@@ -0,0 +1,39 @@
+/* Copyright 2020-2021 Jakob Trojok
+ *
+ * This file is part of PIConGPU.
+ *
+ * PIConGPU is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * PIConGPU is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with PIConGPU.
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "picongpu/simulation_defines.hpp"
+
+namespace picongpu
+{
+    namespace particles
+    {
+        namespace ionization
+        {
+            /** Result of one ionization algorithm call;
+             * the default-constructed value (zero energy, zero electrons) means "no ionization" */
+            struct IonizerReturn
+            {
+                float_X ionizationEnergy = 0._X; //!< ionization energy of the event, zero if none happened
+                uint32_t newMacroElectrons = 0u; //!< number of new macro electrons to be created
+            };
+        } // namespace ionization
+    } // namespace particles
+} // namespace picongpu
diff --git a/include/picongpu/particles/ionization/byField/IonizationCurrent/JIonizationAssignment.hpp b/include/picongpu/particles/ionization/byField/IonizationCurrent/JIonizationAssignment.hpp
new file mode 100644
index 0000000000..915a80cdd2
--- /dev/null
+++ b/include/picongpu/particles/ionization/byField/IonizationCurrent/JIonizationAssignment.hpp
@@ -0,0 +1,146 @@
+/* Copyright 2020-2021 Jakob Trojok
+ *
+ * This file is part of PIConGPU.
+ *
+ * PIConGPU is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * PIConGPU is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with PIConGPU.
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "picongpu/particles/ParticlesFunctors.hpp"
+#include "picongpu/simulation_defines.hpp"
+#include "picongpu/fields/FieldJ.hpp"
+
+namespace picongpu
+{
+    namespace particles
+    {
+        namespace ionization
+        {
+            /** Compile-time traits shared by the dimension-specific JIonizationAssignment functors
+             * (particle shape of the electron species and the grid-node index range [begin, end) per axis)
+             * \tparam T_DestSpecies type or name as boost::mpl::string of the electron species to be created
+             */
+            template<typename T_DestSpecies>
+            struct JIonizationAssignmentParent
+            {
+                using Shape = typename ::picongpu::traits::GetShape<T_DestSpecies>::type;
+                using AssignmentFunction = typename Shape::ChargeAssignmentOnSupport;
+                static constexpr int supp = AssignmentFunction::support; //!< number of nodes visited per axis
+                /* (supp + 1) % 2 is 1 for even supports else 0, e.g. supp 2 -> [0, 2), supp 3 -> [-1, 2) */
+                static constexpr int begin = -supp / 2 + (supp + 1) % 2; //!< first node offset (inclusive)
+                static constexpr int end = begin + supp; //!< one past the last node offset
+            };
+
+            /**@{*/
+            /** Assignment of the ionization current to the grid
+             * (declaration only; the DIM3 and DIM2 specializations provide the implementation)
+             * \tparam T_Acc alpaka accelerator type
+             * \tparam T_DestSpecies type or name as boost::mpl::string of the electron species to be created
+             * \tparam T_Dim dimension of simulation
+             */
+            template<typename T_Acc, typename T_DestSpecies, unsigned T_Dim>
+            struct JIonizationAssignment;
+
+            /** 3d case: the weight of a node is the product of the 1d assignment function along z, y and x
+             */
+            template<typename T_Acc, typename T_DestSpecies>
+            struct JIonizationAssignment<T_Acc, T_DestSpecies, DIM3>
+                : public JIonizationAssignmentParent<T_DestSpecies>
+            {
+                /** Atomically add the shape-weighted current jIonizationPar to the nodes of jBoxPar
+                 * NOTE(review): presumably pos is given relative to the node jBoxPar(0, 0, 0) - confirm with caller
+                 * \tparam T_JBox type of current density data box
+                 */
+                template<typename T_JBox>
+                HDINLINE void operator()(
+                    T_Acc const& acc,
+                    float3_X const jIonizationPar,
+                    float3_X const pos,
+                    T_JBox jBoxPar)
+                {
+                    /* build the weight axis by axis (z, y, x); the innermost loop deposits all components */
+                    for(int z = JIonizationAssignmentParent<T_DestSpecies>::begin;
+                        z < JIonizationAssignmentParent<T_DestSpecies>::end;
+                        ++z)
+                    {
+                        float3_X jGridz = jIonizationPar;
+                        jGridz *= typename JIonizationAssignmentParent<T_DestSpecies>::AssignmentFunction{}(
+                            float_X(z) - pos.z());
+                        for(int y = JIonizationAssignmentParent<T_DestSpecies>::begin;
+                            y < JIonizationAssignmentParent<T_DestSpecies>::end;
+                            ++y)
+                        {
+                            float3_X jGridy = jGridz;
+                            jGridy *= typename JIonizationAssignmentParent<T_DestSpecies>::AssignmentFunction{}(
+                                float_X(y) - pos.y());
+                            for(int x = JIonizationAssignmentParent<T_DestSpecies>::begin;
+                                x < JIonizationAssignmentParent<T_DestSpecies>::end;
+                                ++x)
+                            {
+                                float3_X jGridx = jGridy;
+                                jGridx *= typename JIonizationAssignmentParent<T_DestSpecies>::AssignmentFunction{}(
+                                    float_X(x) - pos.x());
+                                for(int i = 0; i <= 2; i++) // the three vector components of the current
+                                {
+                                    cupla::atomicAdd(acc, &(jBoxPar(DataSpace<DIM3>(x, y, z))[i]), jGridx[i]);
+                                }
+                            }
+                        }
+                    }
+                }
+            };
+
+            /** 2d case: the weight of a node is the product of the 1d assignment function along y and x
+             */
+            template<typename T_Acc, typename T_DestSpecies>
+            struct JIonizationAssignment<T_Acc, T_DestSpecies, DIM2>
+                : public JIonizationAssignmentParent<T_DestSpecies>
+            {
+                /** Atomically add the shape-weighted current jIonizationPar to the nodes of jBoxPar;
+                 * all three components of the current are deposited although the grid is two-dimensional */
+                template<typename T_JBox>
+                HDINLINE void operator()(
+                    T_Acc const& acc,
+                    float3_X const jIonizationPar,
+                    float2_X const pos,
+                    T_JBox jBoxPar)
+                {
+                    for(int y = JIonizationAssignmentParent<T_DestSpecies>::begin;
+                        y < JIonizationAssignmentParent<T_DestSpecies>::end;
+                        ++y)
+                    {
+                        float3_X jGridy = jIonizationPar;
+                        jGridy *= typename JIonizationAssignmentParent<T_DestSpecies>::AssignmentFunction{}(
+                            float_X(y) - pos.y());
+                        for(int x = JIonizationAssignmentParent<T_DestSpecies>::begin;
+                            x < JIonizationAssignmentParent<T_DestSpecies>::end;
+                            ++x)
+                        {
+                            float3_X jGridx = jGridy;
+                            jGridx *= typename JIonizationAssignmentParent<T_DestSpecies>::AssignmentFunction{}(
+                                float_X(x) - pos.x());
+                            for(int i = 0; i <= 2; i++) // the three vector components of the current
+                            {
+                                cupla::atomicAdd(acc, &(jBoxPar(DataSpace<DIM2>(x, y))[i]), jGridx[i]);
+                            }
+                        }
+                    }
+                }
+            };
+            /**@}*/
+        } // namespace ionization
+    } // namespace particles
+} // namespace picongpu
diff --git a/include/picongpu/particles/ionization/byField/IonizationCurrent/JIonizationCalc.hpp b/include/picongpu/particles/ionization/byField/IonizationCurrent/JIonizationCalc.hpp
new file mode 100644
index 0000000000..9a540f7e76
--- /dev/null
+++ b/include/picongpu/particles/ionization/byField/IonizationCurrent/JIonizationCalc.hpp
@@ -0,0 +1,46 @@
+/* Copyright 2020-2021 Jakob Trojok
+ *
+ * This file is part of PIConGPU.
+ *
+ * PIConGPU is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * PIConGPU is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with PIConGPU.
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "picongpu/simulation_defines.hpp"
+
+namespace picongpu
+{
+    namespace particles
+    {
+        namespace ionization
+        {
+            /** Ionization current density belonging to one ionization event
+             */
+            struct JIonizationCalc
+            {
+                /** Returns ionizationEnergy * eField / abs2(eField) / DELTA_T / CELL_VOLUME.
+                 * Precondition: eField must be non-zero (division by abs2(eField)); callers only invoke this
+                 * for a non-zero ionization energy, which is taken to imply a non-zero field.
+                 */
+                HDINLINE float3_X operator()(float_X const ionizationEnergy, float3_X const eField)
+                {
+                    float3_X jion = ionizationEnergy * eField / pmacc::math::abs2(eField) / DELTA_T / CELL_VOLUME;
+                    return jion;
+                }
+            };
+        } // namespace ionization
+    } // namespace particles
+} // namespace picongpu
diff --git a/include/picongpu/particles/ionization/byField/Keldysh/AlgorithmKeldysh.hpp b/include/picongpu/particles/ionization/byField/Keldysh/AlgorithmKeldysh.hpp
index ae847f05db..bc3f255551 100644
--- a/include/picongpu/particles/ionization/byField/Keldysh/AlgorithmKeldysh.hpp
+++ b/include/picongpu/particles/ionization/byField/Keldysh/AlgorithmKeldysh.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2016-2020 Marco Garten
+/* Copyright 2016-2021 Marco Garten, Jakob Trojok
  *
  * This file is part of PIConGPU.
  *
@@ -27,6 +27,7 @@
 #include <pmacc/algorithms/math/defines/pi.hpp>
 #include <pmacc/algorithms/math/floatMath/floatingPoint.tpp>
 #include "picongpu/particles/ionization/utilities.hpp"
+#include "picongpu/particles/ionization/byField/IonizationCurrent/IonizerReturn.hpp"
 
 /** @file AlgorithmKeldysh.hpp
  *
@@ -36,83 +37,80 @@
 
 namespace picongpu
 {
-namespace particles
-{
-namespace ionization
-{
-
-    /** Calculation for the Keldysh ionization model
-     *
-     * for linear laser polarization
-     */
-    struct AlgorithmKeldysh
+    namespace particles
     {
-        /** Functor implementation
-         * \tparam EType type of electric field
-         * \tparam BType type of magnetic field
-         * \tparam ParticleType type of particle to be ionized
-         *
-         * \param bField magnetic field value at t=0
-         * \param eField electric field value at t=0
-         * \param parentIon particle instance to be ionized with position at t=0 and momentum at t=-1/2
-         * \param randNr random number, equally distributed in range [0.:1.0]
-         *
-         * \return number of new macro electrons to be created
-         */
-        template<typename EType, typename BType, typename ParticleType >
-        HDINLINE uint32_t
-        operator()( const BType bField, const EType eField, ParticleType& parentIon, float_X randNr )
+        namespace ionization
         {
-
-            const float_X protonNumber = GetAtomicNumbers<ParticleType>::type::numberOfProtons;
-            float_X chargeState = attribute::getChargeState(parentIon);
-
-            /* verify that ion is not completely ionized */
-            if ( chargeState < protonNumber )
+            /** Calculation for the Keldysh ionization model
+             *
+             * for linear laser polarization
+             */
+            struct AlgorithmKeldysh
             {
-                uint32_t const cs = math::float2int_rd(chargeState);
-                const float_X iEnergy = typename GetIonizationEnergies<ParticleType>::type{ }[cs];
+                /** Functor implementation
+                 * \tparam EType type of electric field
+                 * \tparam BType type of magnetic field
+                 * \tparam ParticleType type of particle to be ionized
+                 *
+                 * \param bField magnetic field value at t=0
+                 * \param eField electric field value at t=0
+                 * \param parentIon particle instance to be ionized with position at t=0 and momentum at t=-1/2
+                 * \param randNr random number, equally distributed in range [0.:1.0]
+                 *
+                 * \return ionization energy and number of new macro electrons to be created
+                 */
+                template<typename EType, typename BType, typename ParticleType>
+                HDINLINE IonizerReturn
+                operator()(const BType bField, const EType eField, ParticleType& parentIon, float_X randNr)
+                {
+                    const float_X protonNumber = GetAtomicNumbers<ParticleType>::type::numberOfProtons;
+                    float_X chargeState = attribute::getChargeState(parentIon);
 
-                constexpr float_X pi = pmacc::algorithms::math::Pi< float_X >::value;
-                /* electric field in atomic units - only absolute value */
-                float_X eInAU = math::abs(eField) / ATOMIC_UNIT_EFIELD;
+                    /* verify that ion is not completely ionized */
+                    if(chargeState < protonNumber)
+                    {
+                        uint32_t const cs = pmacc::math::float2int_rd(chargeState);
+                        const float_X iEnergy = typename GetIonizationEnergies<ParticleType>::type{}[cs];
 
-                /* factor two avoid calculation math::pow(2,5./4.); */
-                const float_X twoToFiveQuarters = 2.3784142300054;
+                        constexpr float_X pi = pmacc::math::Pi<float_X>::value;
+                        /* electric field in atomic units - only absolute value */
+                        float_X eInAU = math::abs(eField) / ATOMIC_UNIT_EFIELD;
 
-                /* characteristic exponential function argument */
-                const float_X charExpArg = math::sqrt(util::cube(float_X(2.)*iEnergy))/eInAU;
+                        /* precomputed value of 2^(5/4), avoids evaluating math::pow(2, 5./4.) */
+                        const float_X twoToFiveQuarters = 2.3784142300054;
 
-                /* ionization rate */
-                float_X rateKeldysh = math::sqrt(float_X(6.)*pi) / twoToFiveQuarters \
-                                * iEnergy * math::sqrt(float_X(1.)/charExpArg) \
-                                * math::exp(-float_X(2./3.) * charExpArg);
+                        /* characteristic exponential function argument */
+                        const float_X charExpArg = math::sqrt(util::cube(float_X(2.) * iEnergy)) / eInAU;
 
-                /* simulation time step in atomic units */
-                const float_X timeStepAU = float_X(DELTA_T / ATOMIC_UNIT_TIME);
-                /* ionization probability
-                 *
-                 * probability = rate * time step
-                 * --> for infinitesimal time steps
-                 *
-                 * the whole ensemble should then follow
-                 * P = 1 - exp(-rate * time step) if the laser wavelength is
-                 * sampled well enough
-                 */
-                float_X const probKeldysh = rateKeldysh * timeStepAU;
+                        /* ionization rate */
+                        float_X rateKeldysh = math::sqrt(float_X(6.) * pi) / twoToFiveQuarters * iEnergy
+                            * math::sqrt(float_X(1.) / charExpArg) * math::exp(-float_X(2. / 3.) * charExpArg);
 
-                /* ionization condition */
-                if( randNr < probKeldysh )
-                {
-                    /* return number of macro electrons to produce */
-                    return 1u;
+                        /* simulation time step in atomic units */
+                        const float_X timeStepAU = float_X(DELTA_T / ATOMIC_UNIT_TIME);
+                        /* ionization probability
+                         *
+                         * probability = rate * time step
+                         * --> for infinitesimal time steps
+                         *
+                         * the whole ensemble should then follow
+                         * P = 1 - exp(-rate * time step) if the laser wavelength is
+                         * sampled well enough
+                         */
+                        float_X const probKeldysh = rateKeldysh * timeStepAU;
+
+                        /* ionization condition */
+                        if(randNr < probKeldysh)
+                        {
+                            /* return ionization energy and number of macro electrons to produce */
+                            return IonizerReturn{iEnergy, 1u};
+                        }
+                    }
+                    /* no ionization */
+                    return IonizerReturn{0.0, 0u};
                 }
-            }
-            /* no ionization */
-            return 0u;
-        }
-    };
+            };
 
-} // namespace ionization
-} // namespace particles
+        } // namespace ionization
+    } // namespace particles
 } // namespace picongpu
diff --git a/include/picongpu/particles/ionization/byField/Keldysh/Keldysh.def b/include/picongpu/particles/ionization/byField/Keldysh/Keldysh.def
index 6a4915a56a..cfa116aad4 100644
--- a/include/picongpu/particles/ionization/byField/Keldysh/Keldysh.def
+++ b/include/picongpu/particles/ionization/byField/Keldysh/Keldysh.def
@@ -1,4 +1,4 @@
-/* Copyright 2016-2020 Marco Garten
+/* Copyright 2016-2021 Marco Garten, Jakob Trojok
  *
  * This file is part of PIConGPU.
  *
@@ -20,51 +20,56 @@
 #pragma once
 
 #include <pmacc/types.hpp>
+#include "picongpu/particles/ionization/byField/IonizationCurrent/IonizationCurrent.def"
 
 namespace picongpu
 {
-namespace particles
-{
-namespace ionization
-{
-
-    /** Keldysh model
-     *
-     * \tparam T_DestSpecies electron species to be created
-     * \tparam T_SrcSpecies ion species to be ionized
-     *         default is boost::mpl placeholder because specialization
-     *         cannot be known in list of particle species' flags
-     *         \see speciesDefinition.param
-     */
-    template<typename T_IonizationAlgorithm, typename T_DestSpecies, typename T_SrcSpecies = bmpl::_1>
-    struct Keldysh_Impl;
-
-    /** Keldysh ionization model
-     *
-     * - Keldysh viewed ionization not as multiple different effects but rather as
-     *   one that can be classified in different ionization regimes characterized
-     *   by certain values of the Keldysh parameter
-     * - takes the ionization energies of the various charge states of ions
-     * - calculates the ionization rates and then the ionization probabilities from them
-     * - ATTENTION: this approach is not very applicable for rapidly changing high intensity laser fields
-     * - this is a Monte Carlo method: if a random number is smaller
-     *   or equal than the ionization probability -> increase the charge state
-     * - see for example: D. Bauer and P. Mulser. Exact field ionization rates in the barrier-suppression
-     *   regime from numerical time-dependent Schroedinger-equation calculations.
-     *   Physical Review A, 59(1):569+, January 1999.
-     *
-     * wrapper class,
-     * needed because the SrcSpecies cannot be known during the
-     * first specialization of the ionization model in the particle definition
-     * \see speciesDefinition.param
-     */
-    template<typename T_DestSpecies>
-    struct Keldysh
+    namespace particles
     {
-        using IonizationAlgorithm = particles::ionization::AlgorithmKeldysh;
-        using type = Keldysh_Impl< IonizationAlgorithm, T_DestSpecies >;
-    };
+        namespace ionization
+        {
+            /** Keldysh model
+             * \tparam T_IonizationAlgorithm functor computing the ionization result (e.g. AlgorithmKeldysh)
+             * \tparam T_DestSpecies electron species to be created
+             * \tparam T_IonizationCurrent select type of ionization current (None or EnergyConservation)
+             * \tparam T_SrcSpecies ion species to be ionized
+             *         default is boost::mpl placeholder because specialization
+             *         cannot be known in list of particle species' flags
+             *         \see speciesDefinition.param
+             */
+            template<
+                typename T_IonizationAlgorithm,
+                typename T_DestSpecies,
+                typename T_IonizationCurrent,
+                typename T_SrcSpecies = bmpl::_1>
+            struct Keldysh_Impl;
+
+            /** Keldysh ionization model
+             *
+             * - Keldysh viewed ionization not as multiple different effects but rather as
+             *   one that can be classified in different ionization regimes characterized
+             *   by certain values of the Keldysh parameter
+             * - takes the ionization energies of the various charge states of ions
+             * - calculates the ionization rates and then the ionization probabilities from them
+             * - ATTENTION: this approach is not very applicable for rapidly changing high intensity laser fields
+             * - this is a Monte Carlo method: if a random number is smaller
+             *   than the ionization probability -> increase the charge state
+             * - see for example: D. Bauer and P. Mulser. Exact field ionization rates in the barrier-suppression
+             *   regime from numerical time-dependent Schroedinger-equation calculations.
+             *   Physical Review A, 59(1):569+, January 1999.
+             *
+             * wrapper class,
+             * needed because the SrcSpecies cannot be known during the
+             * first specialization of the ionization model in the particle definition
+             * \see speciesDefinition.param
+             */
+            template<typename T_DestSpecies, typename T_IonizationCurrent = current::None>
+            struct Keldysh
+            {
+                using IonizationAlgorithm = particles::ionization::AlgorithmKeldysh;
+                using type = Keldysh_Impl<IonizationAlgorithm, T_DestSpecies, T_IonizationCurrent>;
+            };
 
-} // namespace ionization
-} // namespace particles
+        } // namespace ionization
+    } // namespace particles
 } // namespace picongpu
diff --git a/include/picongpu/particles/ionization/byField/Keldysh/Keldysh_Impl.hpp b/include/picongpu/particles/ionization/byField/Keldysh/Keldysh_Impl.hpp
index 10c0fd9440..c54a1fa698 100644
--- a/include/picongpu/particles/ionization/byField/Keldysh/Keldysh_Impl.hpp
+++ b/include/picongpu/particles/ionization/byField/Keldysh/Keldysh_Impl.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2016-2020 Marco Garten
+/* Copyright 2016-2021 Marco Garten, Jakob Trojok
  *
  * This file is part of PIConGPU.
  *
@@ -20,7 +20,6 @@
 #pragma once
 
 #include "picongpu/simulation_defines.hpp"
-#include "picongpu/traits/UsesRNG.hpp"
 
 #include "picongpu/fields/CellType.hpp"
 #include "picongpu/fields/FieldB.hpp"
@@ -28,6 +27,8 @@
 #include "picongpu/traits/FieldPosition.hpp"
 #include "picongpu/particles/ionization/byField/Keldysh/Keldysh.def"
 #include "picongpu/particles/ionization/byField/Keldysh/AlgorithmKeldysh.hpp"
+#include "picongpu/particles/ionization/byField/IonizationCurrent/JIonizationCalc.hpp"
+#include "picongpu/particles/ionization/byField/IonizationCurrent/JIonizationAssignment.hpp"
 
 #include <pmacc/random/methods/methods.hpp>
 #include <pmacc/random/distributions/Uniform.hpp>
@@ -46,274 +47,231 @@
 
 namespace picongpu
 {
-namespace traits
-{
-    /** specialization of the UsesRNG trait
-     * --> ionization module uses random number generation
-     */
-    template<typename T_IonizationAlgorithm, typename T_DestSpecies, typename T_SrcSpecies>
-    struct UsesRNG<particles::ionization::Keldysh_Impl<T_IonizationAlgorithm, T_DestSpecies, T_SrcSpecies> > :
-    public boost::true_type
-    {
-    };
-} // namespace traits
-
-namespace particles
-{
-namespace ionization
-{
-
-    /** \struct Keldysh_Impl
-     *
-     * \brief Ammosov-Delone-Krainov
-     *        Tunneling ionization for hydrogenlike atoms
-     *
-     * \tparam T_DestSpecies type or name as boost::mpl::string of the electron species to be created
-     * \tparam T_SrcSpecies type or name as boost::mpl::string of the particle species that is ionized
-     */
-    template<typename T_IonizationAlgorithm, typename T_DestSpecies, typename T_SrcSpecies>
-    struct Keldysh_Impl
+    namespace particles
     {
-
-        using DestSpecies = pmacc::particles::meta::FindByNameOrType_t<
-            VectorAllSpecies,
-            T_DestSpecies
-        >;
-        using SrcSpecies = pmacc::particles::meta::FindByNameOrType_t<
-            VectorAllSpecies,
-            T_SrcSpecies
-        >;
-
-        using FrameType = typename SrcSpecies::FrameType;
-
-        /* specify field to particle interpolation scheme */
-        using Field2ParticleInterpolation = typename pmacc::traits::Resolve<
-            typename GetFlagType<FrameType,interpolation<> >::type
-        >::type;
-
-        /* margins around the supercell for the interpolation of the field on the cells */
-        using LowerMargin = typename GetMargin<Field2ParticleInterpolation>::LowerMargin;
-        using UpperMargin = typename GetMargin<Field2ParticleInterpolation>::UpperMargin;
-
-        /* relevant area of a block */
-        using BlockArea = SuperCellDescription<
-            typename MappingDesc::SuperCellSize,
-            LowerMargin,
-            UpperMargin
-        >;
-
-        BlockArea BlockDescription;
-
-        private:
-
-            /* define ionization ALGORITHM (calculation) for ionization MODEL */
-            using IonizationAlgorithm = T_IonizationAlgorithm;
-
-            /* random number generator */
-            using RNGFactory = pmacc::random::RNGProvider<simDim, random::Generator>;
-            using Distribution = pmacc::random::distributions::Uniform<float_X>;
-            using RandomGen = typename RNGFactory::GetRandomType<Distribution>::type;
-            RandomGen randomGen;
-
-            using TVec = MappingDesc::SuperCellSize;
-
-            using ValueType_E = FieldE::ValueType;
-            using ValueType_B = FieldB::ValueType;
-            /* global memory EM-field device databoxes */
-            PMACC_ALIGN(eBox, FieldE::DataBoxType);
-            PMACC_ALIGN(bBox, FieldB::DataBoxType);
-            /* shared memory EM-field device databoxes */
-            PMACC_ALIGN(cachedE, DataBox<SharedBox<ValueType_E, typename BlockArea::FullSuperCellSize,1> >);
-            PMACC_ALIGN(cachedB, DataBox<SharedBox<ValueType_B, typename BlockArea::FullSuperCellSize,0> >);
-
-        public:
-            /* host constructor initializing member : random number generator */
-            Keldysh_Impl(const uint32_t currentStep) : randomGen(RNGFactory::createRandom<Distribution>())
-            {
-                DataConnector &dc = Environment<>::get().DataConnector();
-                /* initialize pointers on host-side E-(B-)field databoxes */
-                auto fieldE = dc.get< FieldE >( FieldE::getName(), true );
-                auto fieldB = dc.get< FieldB >( FieldB::getName(), true );
-                /* initialize device-side E-(B-)field databoxes */
-                eBox = fieldE->getDeviceDataBox();
-                bBox = fieldB->getDeviceDataBox();
-
-            }
-
-            /** cache fields used by this functor
+        namespace ionization
+        {
+            /** \struct Keldysh_Impl
              *
-             * @warning this is a collective method and calls synchronize
+             * \brief Keldysh ionization model
+             *        (implementation selected via the Keldysh wrapper struct)
              *
-             * @tparam T_Acc alpaka accelerator type
-             * @tparam T_WorkerCfg pmacc::mappings::threads::WorkerCfg, configuration of the worker
-             *
-             * @param acc alpaka accelerator
-             * @param blockCell relative offset (in cells) to the local domain plus the guarding cells
-             * @param workerCfg configuration of the worker
+             * \tparam T_DestSpecies type or name as boost::mpl::string of the electron species to be created
+             * \tparam T_IonizationCurrent select type of ionization current (None or EnergyConservation)
+             * \tparam T_SrcSpecies type or name as boost::mpl::string of the particle species that is ionized
              */
             template<
-                typename T_Acc ,
-                typename T_WorkerCfg
-            >
-            DINLINE void collectiveInit(
-                const T_Acc & acc,
-                const DataSpace<simDim>& blockCell,
-                const T_WorkerCfg & workerCfg
-            )
-            {
-                /* caching of E and B fields */
-                cachedB = CachedBox::create<
-                    0,
-                    ValueType_B
-                >(
-                    acc,
-                    BlockArea()
-                );
-                cachedE = CachedBox::create<
-                    1,
-                    ValueType_E
-                >(
-                    acc,
-                    BlockArea()
-                );
-
-                /* instance of nvidia assignment operator */
-                nvidia::functors::Assign assign;
-                /* copy fields from global to shared */
-                auto fieldBBlock = bBox.shift(blockCell);
-                ThreadCollective<
-                    BlockArea,
-                    T_WorkerCfg::numWorkers
-                > collective( workerCfg.getWorkerIdx( ) );
-                collective(
-                          acc,
-                          assign,
-                          cachedB,
-                          fieldBBlock
-                          );
-                /* copy fields from global to shared */
-                auto fieldEBlock = eBox.shift(blockCell);
-                collective(
-                          acc,
-                          assign,
-                          cachedE,
-                          fieldEBlock
-                          );
-
-                /* wait for shared memory to be initialized */
-                __syncthreads();
-            }
-
-            /** Initialization function on device
-             *
-             * \brief Cache EM-fields on device
-             *         and initialize possible prerequisites for ionization, like e.g. random number generator.
-             *
-             * This function will be called inline on the device which must happen BEFORE threads diverge
-             * during loop execution. The reason for this is the `__syncthreads()` call which is necessary after
-             * initializing the E-/B-field shared boxes in shared memory.
-             */
-            template< typename T_Acc >
-            DINLINE void init(
-                T_Acc const & acc,
-                const DataSpace<simDim>& blockCell,
-                const int& linearThreadIdx,
-                const DataSpace<simDim>& localCellOffset
-            )
-            {
-                /* initialize random number generator with the local cell index in the simulation */
-                this->randomGen.init(localCellOffset);
-            }
-
-            /** Determine number of new macro electrons due to ionization
-             *
-             * \param ionFrame reference to frame of the to-be-ionized particles
-             * \param localIdx local (linear) index in super cell / frame
-             */
-            template< typename T_Acc >
-            DINLINE uint32_t numNewParticles(T_Acc const & acc, FrameType& ionFrame, int localIdx)
-            {
-                /* alias for the single macro-particle */
-                auto particle = ionFrame[localIdx];
-                /* particle position, used for field-to-particle interpolation */
-                floatD_X pos = particle[position_];
-                const int particleCellIdx = particle[localCellIdx_];
-                /* multi-dim coordinate of the local cell inside the super cell */
-                DataSpace<TVec::dim> localCell(DataSpaceOperations<TVec::dim>::template map<TVec > (particleCellIdx));
-                /* interpolation of E- */
-                const picongpu::traits::FieldPosition<fields::CellType, FieldE> fieldPosE;
-                ValueType_E eField = Field2ParticleInterpolation()
-                    (cachedE.shift(localCell).toCursor(), pos, fieldPosE());
-                /*                     and B-field on the particle position */
-                const picongpu::traits::FieldPosition<fields::CellType, FieldB> fieldPosB;
-                ValueType_B bField = Field2ParticleInterpolation()
-                    (cachedB.shift(localCell).toCursor(), pos, fieldPosB());
-
-                /* define number of bound macro electrons before ionization */
-                float_X prevBoundElectrons = particle[boundElectrons_];
-
-                IonizationAlgorithm ionizeAlgo;
-                /* determine number of new macro electrons to be created */
-                uint32_t newMacroElectrons = ionizeAlgo(
-                     bField, eField,
-                     particle, this->randomGen(acc)
-                     );
-
-                return newMacroElectrons;
-
-            }
-
-            /* Functor implementation
-             *
-             * Ionization model specific particle creation
-             *
-             * \tparam T_parentIon type of ion species that is being ionized
-             * \tparam T_childElectron type of electron species that is created
-             * \param parentIon ion instance that is ionized
-             * \param childElectron electron instance that is created
-             */
-            template<typename T_parentIon, typename T_childElectron, typename T_Acc>
-            DINLINE void operator()(T_Acc const & acc, T_parentIon& parentIon,T_childElectron& childElectron)
+                typename T_IonizationAlgorithm,
+                typename T_DestSpecies,
+                typename T_IonizationCurrent,
+                typename T_SrcSpecies>
+            struct Keldysh_Impl
             {
-                /* for not mixing operations::assign up with the nvidia functor assign */
-                namespace partOp = pmacc::particles::operations;
-                /* each thread sets the multiMask hard on "particle" (=1) */
-                childElectron[multiMask_] = 1u;
-                const float_X weighting = parentIon[weighting_];
-
-                /* each thread initializes a clone of the parent ion but leaving out
-                 * some attributes:
-                 * - multiMask: reading from global memory takes longer than just setting it again explicitly
-                 * - momentum: because the electron would get a higher energy because of the ion mass
-                 * - boundElectrons: because species other than ions or atoms do not have them
-                 * (gets AUTOMATICALLY deselected because electrons do not have this attribute)
+                using DestSpecies = pmacc::particles::meta::FindByNameOrType_t<VectorAllSpecies, T_DestSpecies>;
+                using SrcSpecies = pmacc::particles::meta::FindByNameOrType_t<VectorAllSpecies, T_SrcSpecies>;
+
+                using FrameType = typename SrcSpecies::FrameType;
+
+                /* specify field to particle interpolation scheme */
+                using Field2ParticleInterpolation =
+                    typename pmacc::traits::Resolve<typename GetFlagType<FrameType, interpolation<>>::type>::type;
+
+                /* margins around the supercell for the interpolation of the field on the cells */
+                using LowerMargin = typename GetMargin<Field2ParticleInterpolation>::LowerMargin;
+                using UpperMargin = typename GetMargin<Field2ParticleInterpolation>::UpperMargin;
+
+                /* relevant area of a block */
+                using BlockArea = SuperCellDescription<typename MappingDesc::SuperCellSize, LowerMargin, UpperMargin>;
+
+                BlockArea BlockDescription;
+
+            private:
+                /* define ionization ALGORITHM (calculation) for ionization MODEL */
+                using IonizationAlgorithm = T_IonizationAlgorithm;
+
+                /* random number generator */
+                using RNGFactory = pmacc::random::RNGProvider<simDim, random::Generator>;
+                using Distribution = pmacc::random::distributions::Uniform<float_X>;
+                using RandomGen = typename RNGFactory::GetRandomType<Distribution>::type;
+                RandomGen randomGen;
+
+                using TVec = MappingDesc::SuperCellSize;
+
+                using ValueType_E = FieldE::ValueType;
+                using ValueType_B = FieldB::ValueType;
+                /* global memory EM-field and current density device databoxes */
+                PMACC_ALIGN(eBox, FieldE::DataBoxType);
+                PMACC_ALIGN(bBox, FieldB::DataBoxType);
+                PMACC_ALIGN(jBox, FieldJ::DataBoxType);
+                /* shared memory EM-field device databoxes */
+                PMACC_ALIGN(cachedE, DataBox<SharedBox<ValueType_E, typename BlockArea::FullSuperCellSize, 1>>);
+                PMACC_ALIGN(cachedB, DataBox<SharedBox<ValueType_B, typename BlockArea::FullSuperCellSize, 0>>);
+
+            public:
+                /* host constructor initializing member : random number generator */
+                Keldysh_Impl(const uint32_t currentStep) : randomGen(RNGFactory::createRandom<Distribution>())
+                {
+                    DataConnector& dc = Environment<>::get().DataConnector();
+                    /* initialize pointers on host-side E-(B-)field and current density databoxes */
+                    auto fieldE = dc.get<FieldE>(FieldE::getName(), true);
+                    auto fieldB = dc.get<FieldB>(FieldB::getName(), true);
+                    auto fieldJ = dc.get<FieldJ>(FieldJ::getName(), true);
+                    /* initialize device-side E-(B-)field and current density databoxes */
+                    eBox = fieldE->getDeviceDataBox();
+                    bBox = fieldB->getDeviceDataBox();
+                    jBox = fieldJ->getDeviceDataBox();
+                }
+
+                /** cache fields used by this functor
+                 *
+                 * @warning this is a collective method and calls synchronize
+                 *
+                 * @tparam T_Acc alpaka accelerator type
+                 * @tparam T_WorkerCfg pmacc::mappings::threads::WorkerCfg, configuration of the worker
+                 *
+                 * @param acc alpaka accelerator
+                 * @param blockCell relative offset (in cells) to the local domain plus the guarding cells
+                 * @param workerCfg configuration of the worker
                  */
-                auto targetElectronClone = partOp::deselect<bmpl::vector2<multiMask, momentum> >(childElectron);
-
-                partOp::assign(targetElectronClone, partOp::deselect<particleId>(parentIon));
-
-                const float_X massIon = attribute::getMass(weighting,parentIon);
-                const float_X massElectron = attribute::getMass(weighting,childElectron);
-
-                const float3_X electronMomentum (parentIon[momentum_]*(massElectron/massIon));
-
-                childElectron[momentum_] = electronMomentum;
-
-                /* conservation of momentum
-                 * \todo add conservation of mass */
-                parentIon[momentum_] -= electronMomentum;
-
-                /** ionization of the ion by reducing the number of bound electrons
+                template<typename T_Acc, typename T_WorkerCfg>
+                DINLINE void collectiveInit(
+                    const T_Acc& acc,
+                    const DataSpace<simDim>& blockCell,
+                    const T_WorkerCfg& workerCfg)
+                {
+                    /* shift the origin of jBox to the supercell of the particle */
+                    jBox = jBox.shift(blockCell);
+
+                    /* caching of E and B fields */
+                    cachedB = CachedBox::create<0, ValueType_B>(acc, BlockArea());
+                    cachedE = CachedBox::create<1, ValueType_E>(acc, BlockArea());
+
+                    /* instance of nvidia assignment operator */
+                    nvidia::functors::Assign assign;
+                    /* copy fields from global to shared */
+                    auto fieldBBlock = bBox.shift(blockCell);
+                    ThreadCollective<BlockArea, T_WorkerCfg::numWorkers> collective(workerCfg.getWorkerIdx());
+                    collective(acc, assign, cachedB, fieldBBlock);
+                    /* copy fields from global to shared */
+                    auto fieldEBlock = eBox.shift(blockCell);
+                    collective(acc, assign, cachedE, fieldEBlock);
+
+                    /* wait for shared memory to be initialized */
+                    cupla::__syncthreads(acc);
+                }
+
+                /** Initialization function on device
+                 *
+                 * \brief Cache EM-fields on device
+                 *         and initialize possible prerequisites for ionization, like e.g. random number generator.
                  *
-                 * @warning substracting a float from a float can potentially
-                 *          create a negative boundElectrons number for the ion,
-                 *          see #1850 for details
+                 * This function will be called inline on the device which must happen BEFORE threads diverge
+                 * during loop execution. The reason for this is the `cupla::__syncthreads( acc )` call which is
+                 * necessary after initializing the E-/B-field shared boxes in shared memory.
                  */
-                parentIon[boundElectrons_] -= float_X(1.);
-            }
-
-    };
-
-} // namespace ionization
-} // namespace particles
+                template<typename T_Acc>
+                DINLINE void init(
+                    T_Acc const& acc,
+                    const DataSpace<simDim>& blockCell,
+                    const int& linearThreadIdx,
+                    const DataSpace<simDim>& localCellOffset)
+                {
+                    /* initialize random number generator with the local cell index in the simulation */
+                    this->randomGen.init(localCellOffset);
+                }
+
+                /** Determine number of new macro electrons due to ionization
+                 *
+                 * \param ionFrame reference to frame of the to-be-ionized particles
+                 * \param localIdx local (linear) index in super cell / frame
+                 */
+                template<typename T_Acc>
+                DINLINE uint32_t numNewParticles(T_Acc const& acc, FrameType& ionFrame, int localIdx)
+                {
+                    /* alias for the single macro-particle */
+                    auto particle = ionFrame[localIdx];
+                    /* particle position, used for field-to-particle interpolation */
+                    floatD_X pos = particle[position_];
+                    const int particleCellIdx = particle[localCellIdx_];
+                    /* multi-dim coordinate of the local cell inside the super cell */
+                    DataSpace<TVec::dim> localCell(
+                        DataSpaceOperations<TVec::dim>::template map<TVec>(particleCellIdx));
+                    /* interpolation of the E-field on the particle position */
+                    const picongpu::traits::FieldPosition<fields::CellType, FieldE> fieldPosE;
+                    ValueType_E eField
+                        = Field2ParticleInterpolation()(cachedE.shift(localCell).toCursor(), pos, fieldPosE());
+                    /* interpolation of the B-field on the particle position */
+                    const picongpu::traits::FieldPosition<fields::CellType, FieldB> fieldPosB;
+                    ValueType_B bField
+                        = Field2ParticleInterpolation()(cachedB.shift(localCell).toCursor(), pos, fieldPosB());
+
+                    /* define number of bound macro electrons before ionization */
+                    float_X prevBoundElectrons = particle[boundElectrons_];
+
+                    IonizationAlgorithm ionizeAlgo;
+                    /* determine number of new macro electrons to be created and energy used for ionization */
+                    auto retValue = ionizeAlgo(bField, eField, particle, this->randomGen(acc));
+                    IonizationCurrent<T_Acc, T_DestSpecies, simDim, T_IonizationCurrent>{}(
+                        retValue,
+                        particle[weighting_],
+                        jBox.shift(localCell),
+                        eField,
+                        acc,
+                        pos);
+
+                    return retValue.newMacroElectrons;
+                }
+
+                /** Functor implementation
+                 *
+                 * Ionization model specific particle creation
+                 *
+                 * \tparam T_parentIon type of ion species that is being ionized
+                 * \tparam T_childElectron type of electron species that is created
+                 * \param parentIon ion instance that is ionized
+                 * \param childElectron electron instance that is created
+                 */
+                template<typename T_parentIon, typename T_childElectron, typename T_Acc>
+                DINLINE void operator()(T_Acc const& acc, T_parentIon& parentIon, T_childElectron& childElectron)
+                {
+                    /* for not mixing operations::assign up with the nvidia functor assign */
+                    namespace partOp = pmacc::particles::operations;
+                    /* each thread sets the multiMask hard on "particle" (=1) */
+                    childElectron[multiMask_] = 1u;
+                    const float_X weighting = parentIon[weighting_];
+
+                    /* each thread initializes a clone of the parent ion but leaving out
+                     * some attributes:
+                     * - multiMask: reading from global memory takes longer than just setting it again explicitly
+                     * - momentum: because the electron would get a higher energy because of the ion mass
+                     * - boundElectrons: because species other than ions or atoms do not have them
+                     * (gets AUTOMATICALLY deselected because electrons do not have this attribute)
+                     */
+                    auto targetElectronClone = partOp::deselect<bmpl::vector2<multiMask, momentum>>(childElectron);
+
+                    partOp::assign(targetElectronClone, partOp::deselect<particleId>(parentIon));
+
+                    const float_X massIon = attribute::getMass(weighting, parentIon);
+                    const float_X massElectron = attribute::getMass(weighting, childElectron);
+
+                    const float3_X electronMomentum(parentIon[momentum_] * (massElectron / massIon));
+
+                    childElectron[momentum_] = electronMomentum;
+
+                    /* conservation of momentum
+                     * \todo add conservation of mass */
+                    parentIon[momentum_] -= electronMomentum;
+
+                    /** ionization of the ion by reducing the number of bound electrons
+                     *
+                     * @warning subtracting a float from a float can potentially
+                     *          create a negative boundElectrons number for the ion,
+                     *          see #1850 for details
+                     */
+                    parentIon[boundElectrons_] -= float_X(1.);
+                }
+            };
+
+        } // namespace ionization
+    } // namespace particles
 } // namespace picongpu
diff --git a/include/picongpu/particles/ionization/byField/fieldIonizationCalc.def b/include/picongpu/particles/ionization/byField/fieldIonizationCalc.def
index a5b94e7b94..fba51b7a4c 100644
--- a/include/picongpu/particles/ionization/byField/fieldIonizationCalc.def
+++ b/include/picongpu/particles/ionization/byField/fieldIonizationCalc.def
@@ -1,4 +1,4 @@
-/* Copyright 2015-2020 Marco Garten
+/* Copyright 2015-2021 Marco Garten
  *
  * This file is part of PIConGPU.
  *
@@ -28,28 +28,25 @@
 
 namespace picongpu
 {
+    namespace particles
+    {
+        namespace ionization
+        {
+            struct AlgorithmNone;
 
-namespace particles
-{
-
-namespace ionization
-{
-
-    struct AlgorithmNone;
-
-    template<bool T_polarizationType>
-    struct AlgorithmADK;
+            template<bool T_polarizationType>
+            struct AlgorithmADK;
 
-    struct AlgorithmBSI;
+            struct AlgorithmBSI;
 
-    struct AlgorithmBSIEffectiveZ;
+            struct AlgorithmBSIEffectiveZ;
 
-    struct AlgorithmBSIStarkShifted;
+            struct AlgorithmBSIStarkShifted;
 
-    struct AlgorithmKeldysh;
+            struct AlgorithmKeldysh;
 
-} // namespace ionization
+        } // namespace ionization
 
-} // namespace particles
+    } // namespace particles
 
 } // namespace picongpu
diff --git a/include/picongpu/particles/ionization/byField/fieldIonizationCalc.hpp b/include/picongpu/particles/ionization/byField/fieldIonizationCalc.hpp
index c8204a9f4c..e81ec118e8 100644
--- a/include/picongpu/particles/ionization/byField/fieldIonizationCalc.hpp
+++ b/include/picongpu/particles/ionization/byField/fieldIonizationCalc.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2015-2020 Marco Garten
+/* Copyright 2015-2021 Marco Garten
  *
  * This file is part of PIConGPU.
  *
diff --git a/include/picongpu/particles/ionization/byField/ionizers.def b/include/picongpu/particles/ionization/byField/ionizers.def
index 85141b4032..6933c5a151 100644
--- a/include/picongpu/particles/ionization/byField/ionizers.def
+++ b/include/picongpu/particles/ionization/byField/ionizers.def
@@ -1,4 +1,4 @@
-/* Copyright 2015-2020 Marco Garten
+/* Copyright 2015-2021 Marco Garten
  *
  * This file is part of PIConGPU.
  *
diff --git a/include/picongpu/particles/ionization/byField/ionizers.hpp b/include/picongpu/particles/ionization/byField/ionizers.hpp
index eb9e2f6814..c422cd7f3c 100644
--- a/include/picongpu/particles/ionization/byField/ionizers.hpp
+++ b/include/picongpu/particles/ionization/byField/ionizers.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2015-2020 Marco Garten
+/* Copyright 2015-2021 Marco Garten
  *
  * This file is part of PIConGPU.
  *
diff --git a/include/picongpu/particles/ionization/utilities.hpp b/include/picongpu/particles/ionization/utilities.hpp
index c0f68bde27..2f072d4ebf 100644
--- a/include/picongpu/particles/ionization/utilities.hpp
+++ b/include/picongpu/particles/ionization/utilities.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Marco Garten, Heiko Burau, Rene Widera, Richard Pausch
+/* Copyright 2013-2021 Marco Garten, Heiko Burau, Rene Widera, Richard Pausch
  *
  * This file is part of PIConGPU.
  *
@@ -28,55 +28,53 @@
 
 namespace picongpu
 {
-namespace particles
-{
-namespace ionization
-{
-
-namespace util
-{
-
-    /* power 2 function */
-    template<typename A>
-    HDINLINE A square(A a)
-    {
-        return a*a;
-    }
-    /* power 2 function with different result type */
-    template<typename A, typename R>
-    HDINLINE R square(A a)
-    {
-        return a*a;
-    }
-    /* power 3 function */
-    template<typename A>
-    HDINLINE A cube(A a)
-    {
-        return a * a*a;
-    }
-    /* power 3 function with different result type */
-    template<typename A, typename R>
-    HDINLINE R cube(A a)
-    {
-        return a * a*a;
-    }
-    /* power 4 function */
-    template<typename A>
-    HDINLINE A quad(A a)
-    {
-        const   A b = a*a;
-        return  b*b;
-    }
-    /* power 4 function with different result type */
-    template<typename A, typename R>
-    HDINLINE R quad(A a)
+    namespace particles
     {
-        const   R b = a*a;
-        return  b*b;
-    }
+        namespace ionization
+        {
+            namespace util
+            {
+                /* power 2 function */
+                template<typename A>
+                HDINLINE A square(A a)
+                {
+                    return a * a;
+                }
+                /* power 2 function with different result type */
+                template<typename A, typename R>
+                HDINLINE R square(A a)
+                {
+                    return a * a;
+                }
+                /* power 3 function */
+                template<typename A>
+                HDINLINE A cube(A a)
+                {
+                    return a * a * a;
+                }
+                /* power 3 function with different result type */
+                template<typename A, typename R>
+                HDINLINE R cube(A a)
+                {
+                    return a * a * a;
+                }
+                /* power 4 function */
+                template<typename A>
+                HDINLINE A quad(A a)
+                {
+                    const A b = a * a;
+                    return b * b;
+                }
+                /* power 4 function with different result type */
+                template<typename A, typename R>
+                HDINLINE R quad(A a)
+                {
+                    const R b = a * a;
+                    return b * b;
+                }
 
-}
+            } // namespace util
 
-} // namespace ionization
-} // namespace particles
+        } // namespace ionization
+    } // namespace particles
 } // namespace picongpu
diff --git a/include/picongpu/particles/manipulators/IBinary.def b/include/picongpu/particles/manipulators/IBinary.def
index 3d5baf80e9..5df86d8f41 100644
--- a/include/picongpu/particles/manipulators/IBinary.def
+++ b/include/picongpu/particles/manipulators/IBinary.def
@@ -1,4 +1,4 @@
-/* Copyright 2014-2020 Rene Widera
+/* Copyright 2014-2021 Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -27,42 +27,30 @@
 
 namespace picongpu
 {
-namespace particles
-{
-namespace manipulators
-{
-
-    /** interface for a binary filtered particle functor
-     *
-     * The result of the filter is linked by a logic AND operation and the functor
-     * is only called if the filter result is `true`.
-     * The user functor and filter is passed by the manipulation algorithm
-     * (e.g. picongpu::particles::ManipulateDerive, ...) to this interface, there is
-     * no need to do this explicitly in the param files.
-     *
-     * @tparam T_BinaryFunctor binary particle functor, must contain
-     *                         `void operator()(P1 & particle1, P2 & particle2, ...)`
-     *                          and support at least two particles
-     * @tparam T_UnaryFilter unary particle filter, must contain `bool operator()(P particle)`
-     *                       each particle of the `T_BinaryFunctor::operator()`is passed through the filter
-     */
-    template<
-        typename T_BinaryFunctor,
-        typename T_UnaryFilter = filter::All
-    >
-    using IBinary = pmacc::functor::Filtered<
-        pmacc::filter::operators::And,
-        pmacc::filter::Interface<
-            T_UnaryFilter,
-            1u
-        >,
-        pmacc::functor::Interface<
-            T_BinaryFunctor,
-            2u,
-            void
-        >
-    >;
+    namespace particles
+    {
+        namespace manipulators
+        {
+            /** interface for a binary filtered particle functor
+             *
+             * The result of the filter is linked by a logical AND operation and the functor
+             * is only called if the filter result is `true`.
+             * The user functor and filter are passed by the manipulation algorithm
+             * (e.g. picongpu::particles::ManipulateDerive, ...) to this interface, there is
+             * no need to do this explicitly in the param files.
+             *
+             * @tparam T_BinaryFunctor binary particle functor, must contain
+             *                         `void operator()(P1 & particle1, P2 & particle2, ...)`
+             *                         and support at least two particles
+             * @tparam T_UnaryFilter unary particle filter, must contain `bool operator()(P particle)`,
+             *                       each particle of the `T_BinaryFunctor::operator()` is passed through the filter
+             */
+            template<typename T_BinaryFunctor, typename T_UnaryFilter = filter::All>
+            using IBinary = pmacc::functor::Filtered<
+                pmacc::filter::operators::And,
+                pmacc::filter::Interface<T_UnaryFilter, 1u>,
+                pmacc::functor::Interface<T_BinaryFunctor, 2u, void>>;
 
-} // namespace manipulators
-} // namespace particles
+        } // namespace manipulators
+    } // namespace particles
 } // namespace picongpu
diff --git a/include/picongpu/particles/manipulators/IUnary.def b/include/picongpu/particles/manipulators/IUnary.def
index e06984ab94..d5a6dd7db0 100644
--- a/include/picongpu/particles/manipulators/IUnary.def
+++ b/include/picongpu/particles/manipulators/IUnary.def
@@ -1,4 +1,4 @@
-/* Copyright 2014-2020 Rene Widera
+/* Copyright 2014-2021 Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -27,41 +27,29 @@
 
 namespace picongpu
 {
-namespace particles
-{
-namespace manipulators
-{
-
-    /** interface for a unary filtered particle functor
-     *
-     * The functor is only called if the filter result is `true`.
-     * The user functor and filter is passed by the manipulation algorithm
-     * (e.g. picongpu::particles::Manipulate, ...) to this interface, there is
-     * no need to do this explicitly in the param files.
-     *
-     * @tparam T_UnaryFunctor unary particle functor, must contain
-     *                         `void operator()(P & particle, ...)`
-     *                          and support at least one particle
-     * @tparam T_UnaryFilter unary particle filter must contain `bool operator()(P particle)`,
-     *                       each particle of the `T_UnaryFunctor::operator()` is passed through the filter
-     */
-    template<
-        typename T_UnaryFunctor,
-        typename T_UnaryFilter = filter::All
-    >
-    using IUnary = pmacc::functor::Filtered<
-        pmacc::filter::operators::And,
-        pmacc::filter::Interface<
-            T_UnaryFilter,
-            1u
-        >,
-        pmacc::functor::Interface<
-            T_UnaryFunctor,
-            1u,
-            void
-        >
-    >;
+    namespace particles
+    {
+        namespace manipulators
+        {
+            /** interface for a unary filtered particle functor
+             *
+             * The functor is only called if the filter result is `true`.
+             * The user functor and filter are passed by the manipulation algorithm
+             * (e.g. picongpu::particles::Manipulate, ...) to this interface, there is
+             * no need to do this explicitly in the param files.
+             *
+             * @tparam T_UnaryFunctor unary particle functor, must contain
+             *                        `void operator()(P & particle, ...)`
+             *                        and support at least one particle
+             * @tparam T_UnaryFilter unary particle filter, must contain `bool operator()(P particle)`,
+             *                       each particle of the `T_UnaryFunctor::operator()` is passed through the filter
+             */
+            template<typename T_UnaryFunctor, typename T_UnaryFilter = filter::All>
+            using IUnary = pmacc::functor::Filtered<
+                pmacc::filter::operators::And,
+                pmacc::filter::Interface<T_UnaryFilter, 1u>,
+                pmacc::functor::Interface<T_UnaryFunctor, 1u, void>>;
 
-} // namespace manipulators
-} // namespace particles
+        } // namespace manipulators
+    } // namespace particles
 } // namespace picongpu
diff --git a/include/picongpu/particles/manipulators/binary/Assign.def b/include/picongpu/particles/manipulators/binary/Assign.def
index baafdf33bb..4c6f647675 100644
--- a/include/picongpu/particles/manipulators/binary/Assign.def
+++ b/include/picongpu/particles/manipulators/binary/Assign.def
@@ -1,4 +1,4 @@
-/* Copyright 2015-2020 Rene Widera, Axel Huebl
+/* Copyright 2015-2021 Rene Widera, Axel Huebl
  *
  * This file is part of PIConGPU.
  *
@@ -25,62 +25,47 @@
 
 namespace picongpu
 {
-namespace particles
-{
-namespace manipulators
-{
-namespace binary
-{
-namespace acc
-{
-
-    //! assign attributes of one particle to another
-    struct Assign
+    namespace particles
     {
-        /** execute assign operator
-         *
-         * @tparam T_DestParticle pmacc::Particle, type of the destination particle
-         * @tparam T_SrcParticle pmacc::Particle, type of the source particle
-         * @tparam T_Args pmacc::Particle, arbitrary number of particles types
-         *
-         * @param particleDest destination particle
-         * @param particleSrc source particle
-         * @param ... unused particles
-         */
-        template<
-            typename T_DestParticle,
-            typename T_SrcParticle,
-            typename ... T_Args
-        >
-        HDINLINE void
-        operator( )(
-            T_DestParticle & particleDest,
-            T_SrcParticle & particleSrc,
-            T_Args && ...
-        )
+        namespace manipulators
         {
-            pmacc::particles::operations::assign(
-                    particleDest,
-                    particleSrc
-            );
-        }
-    };
-} // namespace acc
+            namespace binary
+            {
+                namespace acc
+                {
+                    //! assign attributes of one particle to another
+                    struct Assign
+                    {
+                        /** forward both particles to pmacc::particles::operations::assign
+                         *
+                         * @tparam T_DestParticle pmacc::Particle, type of the destination particle
+                         * @tparam T_SrcParticle pmacc::Particle, type of the source particle
+                         * @tparam T_Args pmacc::Particle, arbitrary number of particle types
+                         *
+                         * @param particleDest destination particle
+                         * @param particleSrc source particle
+                         * @param ... additional particles, accepted but unused
+                         */
+                        template<typename T_DestParticle, typename T_SrcParticle, typename... T_Args>
+                        HDINLINE void operator()(T_DestParticle& particleDest, T_SrcParticle& particleSrc, T_Args&&...)
+                        {
+                            pmacc::particles::operations::assign(particleDest, particleSrc);
+                        }
+                    };
+                } // namespace acc
 
-    /** assign attributes of one particle to another
-     *
-     * Can be used as binary and higher order operator but only the first two
-     * particles are used for the assign operation.
-     *
-     * Assign all matching attributes of a source particle to the destination
-     * particle. Attributes that only exist in the destination species are initialized
-     * with the default value. Attributes that only exists in the source particle will be ignored.
-     */
-    using Assign = generic::Free<
-        acc::Assign
-    >;
+                /** assign attributes of one particle to another
+                 *
+                 * Can be used as binary and higher-order operator but only the first two
+                 * particles are used for the assign operation.
+                 *
+                 * Assign all matching attributes of a source particle to the destination
+                 * particle. Attributes that only exist in the destination species are initialized
+                 * with the default value. Attributes that only exist in the source particle will be ignored.
+                 */
+                using Assign = generic::Free<acc::Assign>;
 
-} // namespace binary
-} // namespace manipulators
-} // namespace particles
+            } // namespace binary
+        } // namespace manipulators
+    } // namespace particles
 } // namespace picongpu
diff --git a/include/picongpu/particles/manipulators/binary/DensityWeighting.def b/include/picongpu/particles/manipulators/binary/DensityWeighting.def
index 60c79e0d99..653ad9a257 100644
--- a/include/picongpu/particles/manipulators/binary/DensityWeighting.def
+++ b/include/picongpu/particles/manipulators/binary/DensityWeighting.def
@@ -1,4 +1,4 @@
-/* Copyright 2015-2020 Axel Huebl
+/* Copyright 2015-2021 Axel Huebl
  *
  * This file is part of PIConGPU.
  *
@@ -26,79 +26,68 @@
 
 namespace picongpu
 {
-namespace particles
-{
-namespace manipulators
-{
-namespace binary
-{
-namespace acc
-{
-
-    //! Re-scale the weighting of a cloned particle by densityRatio
-    struct DensityWeighting
+    namespace particles
     {
-        /** Adjust the weighting of particleDes by densityRatio of particleDes & Src particle
-         *
-         * While deriving a particle (particleDes) from another (T_SrcParticle), one
-         * can afterwards directly normalize the weighting back to the intended density:
-         * - divide weighting with the `T_SrcParticle`'s densityRatio
-         *   (to get macro particle weighting according to reference BASE_DENSITY * profile
-         *    at this specific point in space & time)
-         * - multiply weighting with own densityRatio (to get this species'
-         *    densityRatio * BASE_DENSITY * profile)
-         *
-         * This is useful when the profile and number of macro particles for both species
-         * shall be the same and the initialization of another profile via `CreateDensity`
-         * would be expensive (or one wants to keep the exact same position while deriving).
-         *
-         * @tparam T_DesParticle type of the particle species with weighting to manipulate
-         * @tparam T_SrcParticle type of the particle species one cloned from
-         * @tparam T_Args pmacc::Particle, arbitrary number of particles types
-         *
-         * @param particleDest destination particle
-         * @param particleSrc source particle (the density ratio of this particle is used)
-         * @param ... unused particles
-         *
-         * @see picongpu::particles::ManipulateDerive, picongpu::kernelCloneParticles
-         */
-        template<
-            typename T_DesParticle,
-            typename T_SrcParticle,
-            typename ... T_Args
-        >
-        HDINLINE void operator()(
-            T_DesParticle & particleDes,
-            T_SrcParticle const &,
-            T_Args && ...
-        )
+        namespace manipulators
         {
-            const float_X densityRatioDes =
-                traits::GetDensityRatio< T_DesParticle >::type::getValue( );
-            const float_X densityRatioSrc =
-                traits::GetDensityRatio< T_SrcParticle >::type::getValue( );
+            namespace binary
+            {
+                namespace acc
+                {
+                    //! Re-scale the weighting of a cloned particle by densityRatio
+                    struct DensityWeighting
+                    {
+                        /** Adjust the weighting of particleDes by densityRatio of particleDes & Src particle
+                         *
+                         * While deriving a particle (particleDes) from another (T_SrcParticle), one
+                         * can afterwards directly normalize the weighting back to the intended density:
+                         * - divide weighting by the `T_SrcParticle`'s densityRatio
+                         *   (to get macro particle weighting according to reference BASE_DENSITY * profile
+                         *    at this specific point in space & time)
+                         * - multiply weighting with own densityRatio (to get this species'
+                         *    densityRatio * BASE_DENSITY * profile)
+                         *
+                         * This is useful when the profile and number of macro particles for both species
+                         * shall be the same and the initialization of another profile via `CreateDensity`
+                         * would be expensive (or one wants to keep the exact same position while deriving).
+                         *
+                         * @tparam T_DesParticle type of the particle species with weighting to manipulate
+                         * @tparam T_SrcParticle type of the particle species the particle was cloned from
+                         * @tparam T_Args pmacc::Particle, arbitrary number of particle types
+                         *
+                         * @param particleDes destination particle, its weighting is re-scaled in place
+                         * @param particleSrc source particle (unnamed, only the density ratio of its type is used)
+                         * @param ... unused particles
+                         *
+                         * @see picongpu::particles::ManipulateDerive, picongpu::kernelCloneParticles
+                         */
+                        template<typename T_DesParticle, typename T_SrcParticle, typename... T_Args>
+                        HDINLINE void operator()(T_DesParticle& particleDes, T_SrcParticle const&, T_Args&&...)
+                        {
+                            const float_X densityRatioDes = traits::GetDensityRatio<T_DesParticle>::type::getValue();
+                            const float_X densityRatioSrc = traits::GetDensityRatio<T_SrcParticle>::type::getValue();
 
+                            particleDes[weighting_] *= densityRatioDes / densityRatioSrc;
+                        }
+                    };
 
-} // namespace acc
+                } // namespace acc
 
-    /** Re-scale the weighting of a cloned species by densityRatio
-     *
-     * When deriving species from each other, the new
-     * species "inherits" the macro-particle weighting
-     * of the first one.
-     * This functor can be used to manipulate the weighting
-     * of the new species' macro particles to satisfy the
-     * input densityRatio of it.
-     *
-     * note: needs the densityRatio flag on both species,
-     *       used by the GetDensityRatio trait.
-     */
-    using DensityWeighting = generic::Free< acc::DensityWeighting >;
+                /** Re-scale the weighting of a cloned species by densityRatio
+                 *
+                 * When deriving species from each other, the new
+                 * species "inherits" the macro-particle weighting
+                 * of the first one.
+                 * This functor can be used to manipulate the weighting
+                 * of the new species' macro particles to satisfy
+                 * its own input densityRatio.
+                 *
+                 * note: needs the densityRatio flag on both species,
+                 *       used by the GetDensityRatio trait.
+                 */
+                using DensityWeighting = generic::Free<acc::DensityWeighting>;
 
-} // namespace binary
-} // namespace manipulators
-} // namespace particles
+            } // namespace binary
+        } // namespace manipulators
+    } // namespace particles
 } // namespace picongpu
diff --git a/include/picongpu/particles/manipulators/binary/ProtonTimesWeighting.def b/include/picongpu/particles/manipulators/binary/ProtonTimesWeighting.def
index 25550c0faa..cd56793410 100644
--- a/include/picongpu/particles/manipulators/binary/ProtonTimesWeighting.def
+++ b/include/picongpu/particles/manipulators/binary/ProtonTimesWeighting.def
@@ -1,4 +1,4 @@
-/* Copyright 2015-2020 Axel Huebl
+/* Copyright 2015-2021 Axel Huebl
  *
  * This file is part of PIConGPU.
  *
@@ -26,75 +26,66 @@
 
 namespace picongpu
 {
-namespace particles
-{
-namespace manipulators
-{
-namespace binary
-{
-namespace acc
-{
-
-    //! Re-scale the weighting of a cloned species by numberOfProtons
-    struct ProtonTimesWeighting
+    namespace particles
     {
-
-        /** Increase weighting of particleDest by proton number of SrcParticle
-         *
-         * The frame's `numberOfProtons`of `T_SrcParticle`
-         * is used to increase the weighting of particleDest.
-         * Useful to increase the weighting of macro electrons when cloned from an
-         * ion with Z>1. Otherwise one would need Z macro electrons (each with the
-         * same weighting as the initial ion) to keep the charge of a pre-ionized
-         * atom neutral.
-         *
-         * @tparam T_DestParticle type of the particle species with weighting to manipulate
-         * @tparam T_SrcParticle type of the particle species with proton number Z
-         * @tparam T_Args pmacc::Particle, arbitrary number of particles types
-         *
-         * @param particleDest destination particle
-         * @param source particle (the number of protons of this particle is used)
-         * @param unused particles
-         *
-         * @see picongpu::particles::ManipulateDerive, picongpu::particles::Manipulate
-         */
-        template<
-            typename T_DesParticle,
-            typename T_SrcParticle,
-            typename ... T_Args
-        >
-        HDINLINE void operator()(
-            T_DesParticle & particleDest,
-            T_SrcParticle const &,
-            T_Args && ...
-        )
+        namespace manipulators
         {
-            float_X const protonNumber = traits::GetAtomicNumbers< T_SrcParticle >::type::numberOfProtons;
-            particleDest[ weighting_ ] *= protonNumber;
-        }
-    };
-} // namespace acc
+            namespace binary
+            {
+                namespace acc
+                {
+                    //! Re-scale the weighting of a cloned species by numberOfProtons
+                    struct ProtonTimesWeighting
+                    {
+                        /** Increase weighting of particleDest by proton number of SrcParticle
+                         *
+                         * The `numberOfProtons` from the atomic numbers of `T_SrcParticle`
+                         * is used to increase the weighting of particleDest.
+                         * Useful to increase the weighting of macro electrons when cloned from an
+                         * ion with Z>1. Otherwise one would need Z macro electrons (each with the
+                         * same weighting as the initial ion) to keep the charge of a pre-ionized
+                         * atom neutral.
+                         *
+                         * @tparam T_DesParticle type of the particle species with weighting to manipulate
+                         * @tparam T_SrcParticle type of the particle species with proton number Z
+                         * @tparam T_Args pmacc::Particle, arbitrary number of particle types
+                         *
+                         * @param particleDest destination particle
+                         * @param particleSrc source particle (unnamed, only the number of protons of its type is used)
+                         * @param ... unused particles
+                         *
+                         * @see picongpu::particles::ManipulateDerive, picongpu::particles::Manipulate
+                         */
+                        template<typename T_DesParticle, typename T_SrcParticle, typename... T_Args>
+                        HDINLINE void operator()(T_DesParticle& particleDest, T_SrcParticle const&, T_Args&&...)
+                        {
+                            float_X const protonNumber
+                                = traits::GetAtomicNumbers<T_SrcParticle>::type::numberOfProtons;
+                            particleDest[weighting_] *= protonNumber;
+                        }
+                    };
+                } // namespace acc
 
-    /** Re-scale the weighting of a cloned species by numberOfProtons
-     *
-     * When deriving species from each other, the new
-     * species "inherits" the macro-particle weighting
-     * of the first one.
-     * This functor can be used to manipulate the weighting
-     * of the new species' macro particles to be a multiplied by
-     * the number of protons of the initial species.
-     *
-     * As an example, this is useful when initializing a quasi-neutral,
-     * pre-ionized plasma of ions and electrons. Electrons can be created
-     * from ions via deriving and increasing their weight to avoid simulating
-     * multiple macro electrons per macro ion (with Z>1).
-     *
-     * note: needs the atomicNumbers flag on the initial species,
-     *       used by the GetAtomicNumbers trait.
-     */
-    using ProtonTimesWeighting = generic::Free< acc::ProtonTimesWeighting >;
+                /** Re-scale the weighting of a cloned species by numberOfProtons
+                 *
+                 * When deriving species from each other, the new
+                 * species "inherits" the macro-particle weighting
+                 * of the first one.
+                 * This functor can be used to manipulate the weighting
+                 * of the new species' macro particles to be multiplied by
+                 * the number of protons of the initial species.
+                 *
+                 * As an example, this is useful when initializing a quasi-neutral,
+                 * pre-ionized plasma of ions and electrons. Electrons can be created
+                 * from ions via deriving and increasing their weight to avoid simulating
+                 * multiple macro electrons per macro ion (with Z>1).
+                 *
+                 * note: needs the atomicNumbers flag on the initial species,
+                 *       used by the GetAtomicNumbers trait.
+                 */
+                using ProtonTimesWeighting = generic::Free<acc::ProtonTimesWeighting>;
 
-} // namespace binary
-} // namespace manipulators
-} // namespace particles
+            } // namespace binary
+        } // namespace manipulators
+    } // namespace particles
 } // namespace picongpu
diff --git a/include/picongpu/particles/manipulators/binary/UnboundElectronsTimesWeighting.def b/include/picongpu/particles/manipulators/binary/UnboundElectronsTimesWeighting.def
index 21ce240411..041436b2e6 100644
--- a/include/picongpu/particles/manipulators/binary/UnboundElectronsTimesWeighting.def
+++ b/include/picongpu/particles/manipulators/binary/UnboundElectronsTimesWeighting.def
@@ -1,4 +1,4 @@
-/* Copyright 2015-2020 Axel Huebl
+/* Copyright 2015-2021 Axel Huebl
  *
  * This file is part of PIConGPU.
  *
@@ -26,77 +26,71 @@
 
 namespace picongpu
 {
-namespace particles
-{
-namespace manipulators
-{
-namespace binary
-{
-namespace acc
-{
-
-    //! Re-scale the weighting of a cloned species by numberOfProtons - boundElectrons
-    struct UnboundElectronsTimesWeighting
+    namespace particles
     {
-
-        /** Increase weighting of particleDest by ... number of SrcParticle
-         *
-         * The frame's `numberOfProtons`of `T_SrcParticle`
-         * is used to increase the weighting of particleDest.
-         * Useful to increase the weighting of macro electrons when cloned from an
-         * ion with Z>1. Otherwise one would need Z macro electrons (each with the
-         * same weighting as the initial ion) to keep the charge of a pre-ionized
-         * atom neutral.
-         *
-         * @tparam T_DestParticle type of the particle species with weighting to manipulate
-         * @tparam T_SrcParticle type of the particle species with proton number Z
-         * @tparam T_Args pmacc::Particle, arbitrary number of particles types
-         *
-         * @param particleDest destination particle
-         * @param source particle (the number of protons of this particle is used)
-         * @param unused particles
-         *
-         * @see picongpu::particles::ManipulateDerive, picongpu::particles::Manipulate
-         */
-        template<
-            typename T_DesParticle,
-            typename T_SrcParticle,
-            typename ... T_Args
-        >
-        DINLINE void operator()(
-            T_DesParticle & particleDest,
-            T_SrcParticle const & particleSrc,
-            T_Args && ...
-        )
+        namespace manipulators
         {
-            float_X const protonNumber = traits::GetAtomicNumbers< T_SrcParticle >::type::numberOfProtons;
-            float_X const boundElectrons = particleSrc[ boundElectrons_ ];
-            float_X const freeElectrons = protonNumber - boundElectrons;
-            particleDest[ weighting_ ] *= freeElectrons;
-        }
-    };
-} // namespace acc
+            namespace binary
+            {
+                namespace acc
+                {
+                    //! Re-scale the weighting of a cloned species by numberOfProtons - boundElectrons
+                    struct UnboundElectronsTimesWeighting
+                    {
+                        /** Increase weighting of particleDest by the number of unbound electrons of SrcParticle
+                         *
+                         * The `numberOfProtons` of `T_SrcParticle` minus the `boundElectrons` of particleSrc
+                         * is used to increase the weighting of particleDest.
+                         * Useful to increase the weighting of macro electrons when cloned from an
+                         * ion with Z>1. Otherwise one would need Z macro electrons (each with the
+                         * same weighting as the initial ion) to keep the charge of a pre-ionized
+                         * atom neutral.
+                         *
+                         * @tparam T_DesParticle type of the particle species with weighting to manipulate
+                         * @tparam T_SrcParticle type of the particle species with proton number Z
+                         * @tparam T_Args pmacc::Particle, arbitrary number of particle types
+                         *
+                         * @param particleDest destination particle
+                         * @param particleSrc source particle (its proton number and bound electrons are used)
+                         * @param ... unused particles
+                         *
+                         * @see picongpu::particles::ManipulateDerive, picongpu::particles::Manipulate
+                         */
+                        template<typename T_DesParticle, typename T_SrcParticle, typename... T_Args>
+                        DINLINE void operator()(
+                            T_DesParticle& particleDest,
+                            T_SrcParticle const& particleSrc,
+                            T_Args&&...)
+                        {
+                            float_X const protonNumber
+                                = traits::GetAtomicNumbers<T_SrcParticle>::type::numberOfProtons;
+                            float_X const boundElectrons = particleSrc[boundElectrons_];
+                            float_X const freeElectrons = protonNumber - boundElectrons;
+                            particleDest[weighting_] *= freeElectrons;
+                        }
+                    };
+                } // namespace acc
 
-    /** Re-scale the weighting of a cloned species by numberOfProtons - ...
-     *
-     * When deriving species from each other, the new
-     * species "inherits" the macro-particle weighting
-     * of the first one.
-     * This functor can be used to manipulate the weighting
-     * of the new species' macro particles to be a multiplied by
-     * the number of protons of the initial species.
-     *
-     * As an example, this is useful when initializing a quasi-neutral,
-     * pre-ionized plasma of ions and electrons. Electrons can be created
-     * from ions via deriving and increasing their weight to avoid simulating
-     * multiple macro electrons per macro ion (with Z>1).
-     *
-     * note: needs the atomicNumbers flag on the initial species,
-     *       used by the GetAtomicNumbers trait.
-     */
-    using UnboundElectronsTimesWeighting = generic::Free< acc::UnboundElectronsTimesWeighting >;
+                /** Re-scale the weighting of a cloned species by numberOfProtons - boundElectrons
+                 *
+                 * When deriving species from each other, the new
+                 * species "inherits" the macro-particle weighting
+                 * of the first one.
+                 * This functor can be used to manipulate the weighting
+                 * of the new species' macro particles to be multiplied by the number
+                 * of unbound electrons (numberOfProtons - boundElectrons) of the initial particle.
+                 *
+                 * As an example, this is useful when initializing a quasi-neutral,
+                 * pre-ionized plasma of ions and electrons. Electrons can be created
+                 * from ions via deriving and increasing their weight to avoid simulating
+                 * multiple macro electrons per macro ion (with Z>1).
+                 *
+                 * note: needs the atomicNumbers flag (used by the GetAtomicNumbers trait)
+                 *       and the boundElectrons attribute on the initial species.
+                 */
+                using UnboundElectronsTimesWeighting = generic::Free<acc::UnboundElectronsTimesWeighting>;
 
-} // namespace binary
-} // namespace manipulators
-} // namespace particles
+            } // namespace binary
+        } // namespace manipulators
+    } // namespace particles
 } // namespace picongpu
diff --git a/include/picongpu/particles/manipulators/generic/Free.def b/include/picongpu/particles/manipulators/generic/Free.def
index 4ab31c5725..cfc40074a4 100644
--- a/include/picongpu/particles/manipulators/generic/Free.def
+++ b/include/picongpu/particles/manipulators/generic/Free.def
@@ -1,4 +1,4 @@
-/* Copyright 2015-2020 Rene Widera
+/* Copyright 2015-2021 Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -22,41 +22,40 @@
 
 namespace picongpu
 {
-namespace particles
-{
-namespace manipulators
-{
-namespace generic
-{
-
-    /** call simple free user defined manipulators
-     *
-     * @tparam T_Functor user defined manipulators
-     *                   **optional**: can implement **one** host side constructor
-     *                   `T_Functor()` or `T_Functor(uint32_t currentTimeStep)`
-     *
-     * example for `particle.param`: set in cell position to zero
-     *   @code{.cpp}
-     *
-     *   struct FunctorInCellPositionZero
-     *   {
-     *       template< typename T_Particle >
-     *       HDINLINE void operator()( T_Particle & particle )
-     *       {
-     *           particle[ position_ ] = floatD_X::create( 0.0 );
-     *       }
-     *       static constexpr char const * name = "inCellPositionZero";
-     *   };
-     *
-     *   using InCellPositionZero = generic::Free<
-     *      FunctorInCellPositionZero
-     *   >;
-     *   @endcode
-     */
-    template< typename T_Functor >
-    struct Free;
+    namespace particles
+    {
+        namespace manipulators
+        {
+            namespace generic
+            {
+                /** call simple free user defined manipulators
+                 *
+                 * @tparam T_Functor user defined manipulators
+                 *                   **optional**: can implement **one** host side constructor
+                 *                   `T_Functor()` or `T_Functor(uint32_t currentTimeStep)`
+                 *
+                 * example for `particle.param`: set in cell position to zero
+                 *   @code{.cpp}
+                 *
+                 *   struct FunctorInCellPositionZero
+                 *   {
+                 *       template< typename T_Particle >
+                 *       HDINLINE void operator()( T_Particle & particle )
+                 *       {
+                 *           particle[ position_ ] = floatD_X::create( 0.0 );
+                 *       }
+                 *       static constexpr char const * name = "inCellPositionZero";
+                 *   };
+                 *
+                 *   using InCellPositionZero = generic::Free<
+                 *      FunctorInCellPositionZero
+                 *   >;
+                 *   @endcode
+                 */
+                template<typename T_Functor>
+                struct Free;
 
-} // namespace generic
-} // namespace manipulators
-} // namespace particles
+            } // namespace generic
+        } // namespace manipulators
+    } // namespace particles
 } // namespace picongpu
diff --git a/include/picongpu/particles/manipulators/generic/Free.hpp b/include/picongpu/particles/manipulators/generic/Free.hpp
index 6f76bb78cf..6dc0e1ca98 100644
--- a/include/picongpu/particles/manipulators/generic/Free.hpp
+++ b/include/picongpu/particles/manipulators/generic/Free.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Rene Widera, Axel Huebl
+/* Copyright 2013-2021 Rene Widera, Axel Huebl
  *
  * This file is part of PIConGPU.
  *
@@ -28,106 +28,88 @@
 
 namespace picongpu
 {
-namespace particles
-{
-namespace manipulators
-{
-namespace generic
-{
-namespace acc
-{
-    /** wrapper for the user manipulator functor on the accelerator
-     *
-     * @tparam T_Functor user defined manipulators
-     */
-    template< typename T_Functor >
-    struct Free : private T_Functor
+    namespace particles
     {
-        //! type of the user manipulators
-        using Functor = T_Functor;
-
-        //! store user manipulators instance
-        HDINLINE Free( Functor const & manipulators ) :
-            Functor( manipulators )
-        {
-        }
-
-        /** execute the user manipulator functor
-         *
-         * @tparam T_Args type of the arguments passed to the user manipulator functor
-         *
-         * @param args arguments passed to the user functor
-         */
-        template<
-            typename T_Acc,
-            typename ... T_Args >
-        HDINLINE
-        void operator( )(
-            T_Acc const &,
-            T_Args && ... args
-        )
+        namespace manipulators
         {
-            Functor::operator( )( args ... );
-        }
-    };
-} // namespace acc
+            namespace generic
+            {
+                namespace acc
+                {
+                    /** wrapper for the user manipulator functor on the accelerator
+                     *
+                     * @tparam T_Functor user defined manipulators
+                     */
+                    template<typename T_Functor>
+                    struct Free : private T_Functor
+                    {
+                        //! type of the user manipulators
+                        using Functor = T_Functor;
 
-    template< typename T_Functor >
-    struct Free : protected functor::User< T_Functor >
-    {
+                        //! store user manipulators instance
+                        HDINLINE Free(Functor const& manipulators) : Functor(manipulators)
+                        {
+                        }
 
-        using Functor = functor::User< T_Functor >;
+                        /** execute the user manipulator functor
+                         *
+                         * @tparam T_Args type of the arguments passed to the user manipulator functor
+                         *
+                         * @param args arguments passed to the user functor
+                         */
+                        template<typename T_Acc, typename... T_Args>
+                        HDINLINE void operator()(T_Acc const&, T_Args&&... args)
+                        {
+                            Functor::operator()(args...);
+                        }
+                    };
+                } // namespace acc
 
-        template< typename T_SpeciesType >
-        struct apply
-        {
-            using type = Free;
-        };
+                template<typename T_Functor>
+                struct Free : protected functor::User<T_Functor>
+                {
+                    using Functor = functor::User<T_Functor>;
 
-        /** constructor
-         *
-         * @param currentStep current simulation time step
-         */
-        HINLINE Free( uint32_t currentStep ) : Functor( currentStep )
-        {
-        }
+                    template<typename T_SpeciesType>
+                    struct apply
+                    {
+                        using type = Free;
+                    };
 
-        /** create device manipulator functor
-         *
-         * @tparam T_WorkerCfg pmacc::mappings::threads::WorkerCfg, configuration of the worker
-         * @tparam T_Acc alpaka accelerator type
-         *
-         * @param alpaka accelerator
-         * @param offset (in supercells, without any guards) to the
-         *         origin of the local domain
-         * @param configuration of the worker
-         */
-        template<
-            typename T_WorkerCfg,
-            typename T_Acc
-        >
-        HDINLINE acc::Free< Functor >
-        operator()(
-            T_Acc const &,
-            DataSpace< simDim > const &,
-            T_WorkerCfg const &
-        ) const
-        {
-            return acc::Free< Functor >( *static_cast< Functor const * >( this ) );
-        }
+                    /** constructor
+                     *
+                     * @param currentStep current simulation time step
+                     */
+                    HINLINE Free(uint32_t currentStep) : Functor(currentStep)
+                    {
+                    }
 
-        //! get the name of the functor
-        static
-        HINLINE std::string
-        getName( )
-        {
-            // we provide the name from the param class
-            return Functor::name;
-        }
+                    /** create device manipulator functor
+                     *
+                     * @tparam T_WorkerCfg pmacc::mappings::threads::WorkerCfg, configuration of the worker
+                     * @tparam T_Acc alpaka accelerator type
+                     *
+                     * @param alpaka accelerator
+                     * @param offset (in supercells, without any guards) to the
+                     *         origin of the local domain
+                     * @param configuration of the worker
+                     */
+                    template<typename T_WorkerCfg, typename T_Acc>
+                    HDINLINE acc::Free<Functor> operator()(T_Acc const&, DataSpace<simDim> const&, T_WorkerCfg const&)
+                        const
+                    {
+                        return acc::Free<Functor>(*static_cast<Functor const*>(this));
+                    }
 
-    };
+                    //! get the name of the functor
+                    static HINLINE std::string getName()
+                    {
+                        // we provide the name from the param class
+                        return Functor::name;
+                    }
+                };
 
-} // namespace generic
-} // namespace manipulators
-} // namespace particles
+            } // namespace generic
+        } // namespace manipulators
+    } // namespace particles
 } // namespace picongpu
diff --git a/include/picongpu/particles/manipulators/generic/FreeRng.def b/include/picongpu/particles/manipulators/generic/FreeRng.def
index e11c8c2b4e..3914986810 100644
--- a/include/picongpu/particles/manipulators/generic/FreeRng.def
+++ b/include/picongpu/particles/manipulators/generic/FreeRng.def
@@ -1,4 +1,4 @@
-/* Copyright 2015-2020 Rene Widera
+/* Copyright 2015-2021 Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -27,51 +27,47 @@
 
 namespace picongpu
 {
-namespace particles
-{
-namespace manipulators
-{
-namespace generic
-{
-
-    /** call simple free user defined functor and provide a random number generator
-     *
-     *
-     * @tparam T_Functor user defined unary functor
-     * @tparam T_Distribution pmacc::random::distributions, random number distribution
-     *
-     * example for `particle.param`: add
-     *   @code{.cpp}
-     *   #include <pmacc/nvidia/rng/distributions/Uniform_float.hpp>
-     *
-     *   struct FunctorRandomX
-     *   {
-     *       template< typename T_Rng, typename T_Particle >
-     *       HDINLINE void operator()( T_Rng& rng, T_Particle& particle )
-     *       {
-     *           particle[ position_ ].x() = rng();
-     *       }
-     *       static constexpr char const * name = "randomXPos";
-     *   };
-     *
-     *   using RandomXPos = generic::FreeRng<
-     *      FunctorRandomX,
-     *      pmacc::random::distributions::Uniform< float_X >
-     *   >;
-     *   @endcode
-     *
-     * and to `InitPipeline` in `speciesInitialization.param`:
-     *   @code{.cpp}
-     *   Manipulate< manipulators::RandomXPos, SPECIES_NAME >
-     *   @endcode
-     */
-    template<
-        typename T_Functor,
-        typename T_Distribution
-    >
-    struct FreeRng;
+    namespace particles
+    {
+        namespace manipulators
+        {
+            namespace generic
+            {
+                /** call simple free user defined functor and provide a random number generator
+                 *
+                 *
+                 * @tparam T_Functor user defined unary functor
+                 * @tparam T_Distribution pmacc::random::distributions, random number distribution
+                 *
+                 * example for `particle.param`: add
+                 *   @code{.cpp}
+                 *   #include <pmacc/random/distributions/Uniform.hpp>
+                 *
+                 *   struct FunctorRandomX
+                 *   {
+                 *       template< typename T_Rng, typename T_Particle >
+                 *       HDINLINE void operator()( T_Rng& rng, T_Particle& particle )
+                 *       {
+                 *           particle[ position_ ].x() = rng();
+                 *       }
+                 *       static constexpr char const * name = "randomXPos";
+                 *   };
+                 *
+                 *   using RandomXPos = generic::FreeRng<
+                 *      FunctorRandomX,
+                 *      pmacc::random::distributions::Uniform< float_X >
+                 *   >;
+                 *   @endcode
+                 *
+                 * and to `InitPipeline` in `speciesInitialization.param`:
+                 *   @code{.cpp}
+                 *   Manipulate< manipulators::RandomXPos, SPECIES_NAME >
+                 *   @endcode
+                 */
+                template<typename T_Functor, typename T_Distribution>
+                struct FreeRng;
 
-} // namespace generic
-} // namespace manipulators
-} // namespace particles
+            } // namespace generic
+        } // namespace manipulators
+    } // namespace particles
 } // namespace picongpu
diff --git a/include/picongpu/particles/manipulators/generic/FreeRng.hpp b/include/picongpu/particles/manipulators/generic/FreeRng.hpp
index adf5954b43..7937e33171 100644
--- a/include/picongpu/particles/manipulators/generic/FreeRng.hpp
+++ b/include/picongpu/particles/manipulators/generic/FreeRng.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2015-2020 Rene Widera, Alexander Grund
+/* Copyright 2015-2021 Rene Widera, Alexander Grund
  *
  * This file is part of PIConGPU.
  *
@@ -31,157 +31,103 @@
 
 namespace picongpu
 {
-namespace particles
-{
-namespace manipulators
-{
-namespace generic
-{
-namespace acc
-{
-    template<
-        typename T_Functor,
-        typename T_RngType
-    >
-    struct FreeRng : private T_Functor
-    {
-
-        using Functor = T_Functor;
-        using RngType = T_RngType;
-
-        HDINLINE FreeRng(
-            Functor const & functor,
-            RngType const & rng
-        ) :
-            T_Functor( functor ), m_rng( rng )
-        {
-        }
-
-        /** call user functor
-         *
-         * The random number generator is initialized with the first call.
-         *
-         * @tparam T_Particle type of the particle to manipulate
-         * @tparam T_Args type of the arguments passed to the user functor
-         * @tparam T_Acc alpaka accelerator type
-         *
-         * @param alpaka accelerator
-         * @param particle particle which is given to the user functor
-         * @return void is used to enable the operator if the user functor except two arguments
-         */
-        template<
-            typename T_Particle,
-            typename ... T_Args,
-            typename T_Acc
-        >
-        HDINLINE
-        void operator()(
-            T_Acc const &,
-            T_Particle& particle,
-            T_Args && ... args
-        )
-        {
-            namespace nvrng = nvidia::rng;
-
-            Functor::operator()(
-                m_rng,
-                particle,
-                args ...
-            );
-        }
-
-    private:
-
-        RngType m_rng;
-    };
-} // namespace acc
-
-    template<
-        typename T_Functor,
-        typename T_Distribution
-    >
-    struct FreeRng :
-    protected functor::User< T_Functor >,
-        private picongpu::particles::functor::misc::Rng<
-            T_Distribution
-        >
+    namespace particles
     {
-
-        template< typename T_SpeciesType >
-        struct apply
-        {
-            using type = FreeRng;
-        };
-
-        using RngGenerator = picongpu::particles::functor::misc::Rng<
-            T_Distribution
-        >;
-
-        using RngType = typename RngGenerator::RandomGen;
-
-        using Functor = functor::User< T_Functor >;
-        using Distribution = T_Distribution;
-
-        /** constructor
-         *
-         * @param currentStep current simulation time step
-         */
-        HINLINE FreeRng( uint32_t currentStep ) :
-            Functor( currentStep ),
-            RngGenerator( currentStep )
-        {
-        }
-
-        /** create functor for the accelerator
-         *
-         * @tparam T_WorkerCfg pmacc::mappings::threads::WorkerCfg, configuration of the worker
-         * @tparam T_Acc alpaka accelerator type
-         *
-         * @param alpaka accelerator
-         * @param localSupercellOffset offset (in superCells, without any guards) relative
-         *                        to the origin of the local domain
-         * @param workerCfg configuration of the worker
-         */
-        template<
-            typename T_WorkerCfg,
-            typename T_Acc
-        >
-        HDINLINE auto
-        operator()(
-            T_Acc const & acc,
-            DataSpace< simDim > const & localSupercellOffset,
-            T_WorkerCfg const & workerCfg
-        ) const
-        -> acc::FreeRng<
-            Functor,
-            RngType
-        >
-        {
-            RngType const rng = ( *static_cast< RngGenerator const * >( this ) )(
-                acc,
-                localSupercellOffset,
-                workerCfg
-            );
-
-            return acc::FreeRng<
-                Functor,
-                RngType
-            >(
-                *static_cast< Functor const * >( this ),
-                rng
-            );
-        }
-
-        static
-        HINLINE std::string
-        getName( )
+        namespace manipulators
         {
-            // we provide the name from the param class
-            return Functor::name;
-        }
-    };
-
-} // namespace generic
-} // namespace manipulators
-} // namespace particles
+            namespace generic
+            {
+                namespace acc
+                {
+                    template<typename T_Functor, typename T_RngType>
+                    struct FreeRng : private T_Functor
+                    {
+                        using Functor = T_Functor;
+                        using RngType = T_RngType;
+
+                        HDINLINE FreeRng(Functor const& functor, RngType const& rng) : T_Functor(functor), m_rng(rng)
+                        {
+                        }
+
+                        /** call user functor
+                         *
+                         * The random number generator is initialized with the first call.
+                         *
+                         * @tparam T_Particle type of the particle to manipulate
+                         * @tparam T_Args type of the arguments passed to the user functor
+                         * @tparam T_Acc alpaka accelerator type
+                         *
+                         * @param alpaka accelerator
+                         * @param particle particle which is given to the user functor
+                         * @return void is used to enable the operator if the user functor accepts two arguments
+                         */
+                        template<typename T_Particle, typename... T_Args, typename T_Acc>
+                        HDINLINE void operator()(T_Acc const&, T_Particle& particle, T_Args&&... args)
+                        {
+                            Functor::operator()(m_rng, particle, args...);
+                        }
+
+                    private:
+                        RngType m_rng;
+                    };
+                } // namespace acc
+
+                template<typename T_Functor, typename T_Distribution>
+                struct FreeRng
+                    : protected functor::User<T_Functor>
+                    , private picongpu::particles::functor::misc::Rng<T_Distribution>
+                {
+                    template<typename T_SpeciesType>
+                    struct apply
+                    {
+                        using type = FreeRng;
+                    };
+
+                    using RngGenerator = picongpu::particles::functor::misc::Rng<T_Distribution>;
+
+                    using RngType = typename RngGenerator::RandomGen;
+
+                    using Functor = functor::User<T_Functor>;
+                    using Distribution = T_Distribution;
+
+                    /** constructor
+                     *
+                     * @param currentStep current simulation time step
+                     */
+                    HINLINE FreeRng(uint32_t currentStep) : Functor(currentStep), RngGenerator(currentStep)
+                    {
+                    }
+
+                    /** create functor for the accelerator
+                     *
+                     * @tparam T_WorkerCfg pmacc::mappings::threads::WorkerCfg, configuration of the worker
+                     * @tparam T_Acc alpaka accelerator type
+                     *
+                     * @param alpaka accelerator
+                     * @param localSupercellOffset offset (in superCells, without any guards) relative
+                     *                        to the origin of the local domain
+                     * @param workerCfg configuration of the worker
+                     */
+                    template<typename T_WorkerCfg, typename T_Acc>
+                    HDINLINE auto operator()(
+                        T_Acc const& acc,
+                        DataSpace<simDim> const& localSupercellOffset,
+                        T_WorkerCfg const& workerCfg) const -> acc::FreeRng<Functor, RngType>
+                    {
+                        RngType const rng
+                            = (*static_cast<RngGenerator const*>(this))(acc, localSupercellOffset, workerCfg);
+
+                        return acc::FreeRng<Functor, RngType>(*static_cast<Functor const*>(this), rng);
+                    }
+
+                    static HINLINE std::string getName()
+                    {
+                        // we provide the name from the param class
+                        return Functor::name;
+                    }
+                };
+
+            } // namespace generic
+        } // namespace manipulators
+    } // namespace particles
 } // namespace picongpu
diff --git a/include/picongpu/particles/manipulators/generic/None.def b/include/picongpu/particles/manipulators/generic/None.def
index 3ada06dd0a..272aba07e6 100644
--- a/include/picongpu/particles/manipulators/generic/None.def
+++ b/include/picongpu/particles/manipulators/generic/None.def
@@ -1,4 +1,4 @@
-/* Copyright 2015-2020 Rene Widera, Axel Huebl
+/* Copyright 2015-2021 Rene Widera, Axel Huebl
  *
  * This file is part of PIConGPU.
  *
@@ -22,35 +22,32 @@
 
 namespace picongpu
 {
-namespace particles
-{
-namespace manipulators
-{
-namespace generic
-{
-namespace acc
-{
-    struct None
+    namespace particles
     {
-        template< typename ... T_Args >
-        HDINLINE void
-        operator( )( T_Args && ... )
+        namespace manipulators
         {
-        }
+            namespace generic
+            {
+                namespace acc
+                {
+                    struct None
+                    {
+                        template<typename... T_Args>
+                        HDINLINE void operator()(T_Args&&...)
+                        {
+                        }
 
-        static constexpr char const * name = "None";
-    };
-} // namespace acc
+                        static constexpr char const* name = "None";
+                    };
+                } // namespace acc
 
-    /** do nothing with the particle
-     *
-     * The call of this functor results in an empty operation
-     */
-    using None = Free<
-        acc::None
-    >;
+                /** do nothing with the particle
+                 *
+                 * The call of this functor results in an empty operation
+                 */
+                using None = Free<acc::None>;
 
-} // namespace generic
-} // namespace manipulators
-} // namespace particles
+            } // namespace generic
+        } // namespace manipulators
+    } // namespace particles
 } // namespace picongpu
diff --git a/include/picongpu/particles/manipulators/manipulators.def b/include/picongpu/particles/manipulators/manipulators.def
index 0e0e2b012f..148b0a7459 100644
--- a/include/picongpu/particles/manipulators/manipulators.def
+++ b/include/picongpu/particles/manipulators/manipulators.def
@@ -1,4 +1,4 @@
-/* Copyright 2014-2020 Rene Widera, Axel Huebl
+/* Copyright 2014-2021 Rene Widera, Axel Huebl
  *
  * This file is part of PIConGPU.
  *
diff --git a/include/picongpu/particles/manipulators/manipulators.hpp b/include/picongpu/particles/manipulators/manipulators.hpp
index 9466f70b69..bf4ac3e714 100644
--- a/include/picongpu/particles/manipulators/manipulators.hpp
+++ b/include/picongpu/particles/manipulators/manipulators.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2014-2020 Rene Widera, Axel Huebl
+/* Copyright 2014-2021 Rene Widera, Axel Huebl
  *
  * This file is part of PIConGPU.
  *
diff --git a/include/picongpu/particles/manipulators/unary/CopyAttribute.def b/include/picongpu/particles/manipulators/unary/CopyAttribute.def
index 97a3fbf722..9b83c48e80 100644
--- a/include/picongpu/particles/manipulators/unary/CopyAttribute.def
+++ b/include/picongpu/particles/manipulators/unary/CopyAttribute.def
@@ -1,4 +1,4 @@
-/* Copyright 2017-2020 Rene Widera
+/* Copyright 2017-2021 Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -24,68 +24,50 @@
 
 namespace picongpu
 {
-namespace particles
-{
-namespace manipulators
-{
-namespace unary
-{
-namespace acc
-{
-
-    /** helper functor to copy a particle source attribute to a destination attribute
-     *
-     * @tparam T_DestAttribute type of the destination attribute e.g. `momentumPrev1`
-     * @tparam T_SrcAttribute type of the source attribute e.g. `momentum`
-     */
-    template<
-        typename T_DestAttribute,
-        typename T_SrcAttribute
-    >
-    struct CopyAttribute
+    namespace particles
     {
-        /** copy attribute
-         *
-         * @tparam T_Particle pmacc::Particle, particle type
-         * @tparam T_Args pmacc::Particle, arbitrary number of particles types
-         *
-         * @param particle particle to be manipulated
-         * @param ... unused particles
-         */
-        template<
-            typename T_Particle,
-            typename ... T_Args
-        >
-        HDINLINE void operator( )(
-            T_Particle & particle,
-            T_Args && ...
-        )
+        namespace manipulators
         {
-            particle[ T_DestAttribute{ } ] = particle[ T_SrcAttribute{ } ];
-        }
-    };
+            namespace unary
+            {
+                namespace acc
+                {
+                    /** helper functor to copy a particle source attribute to a destination attribute
+                     *
+                     * @tparam T_DestAttribute type of the destination attribute e.g. `momentumPrev1`
+                     * @tparam T_SrcAttribute type of the source attribute e.g. `momentum`
+                     */
+                    template<typename T_DestAttribute, typename T_SrcAttribute>
+                    struct CopyAttribute
+                    {
+                        /** copy attribute
+                         *
+                         * @tparam T_Particle pmacc::Particle, particle type
+                         * @tparam T_Args pmacc::Particle, arbitrary number of particles types
+                         *
+                         * @param particle particle to be manipulated
+                         * @param ... unused particles
+                         */
+                        template<typename T_Particle, typename... T_Args>
+                        HDINLINE void operator()(T_Particle& particle, T_Args&&...)
+                        {
+                            particle[T_DestAttribute{}] = particle[T_SrcAttribute{}];
+                        }
+                    };
 
-} // namespace acc
+                } // namespace acc
 
-    /** copy a particle source attribute to a destination attribute
-     *
-     * This is an unary functor and operates on one particle.
-     *
-     * @tparam T_DestAttribute type of the destination attribute e.g. `momentumPrev1`
-     * @tparam T_SrcAttribute type of the source attribute e.g. `momentum`
-     */
-    template<
-        typename T_DestAttribute,
-        typename T_SrcAttribute
-    >
-    using CopyAttribute = generic::Free<
-        acc::CopyAttribute<
-            T_DestAttribute,
-            T_SrcAttribute
-        >
-    >;
+                /** copy a particle source attribute to a destination attribute
+                 *
+                 * This is a unary functor and operates on one particle.
+                 *
+                 * @tparam T_DestAttribute type of the destination attribute e.g. `momentumPrev1`
+                 * @tparam T_SrcAttribute type of the source attribute e.g. `momentum`
+                 */
+                template<typename T_DestAttribute, typename T_SrcAttribute>
+                using CopyAttribute = generic::Free<acc::CopyAttribute<T_DestAttribute, T_SrcAttribute>>;
 
-} // namespace unary
-} // namespace manipulators
-} // namespace particles
+            } // namespace unary
+        } // namespace manipulators
+    } // namespace particles
 } // namespace picongpu
diff --git a/include/picongpu/particles/manipulators/unary/Drift.def b/include/picongpu/particles/manipulators/unary/Drift.def
index 2f1adcaf0c..106997c130 100644
--- a/include/picongpu/particles/manipulators/unary/Drift.def
+++ b/include/picongpu/particles/manipulators/unary/Drift.def
@@ -1,4 +1,4 @@
-/* Copyright 2014-2020 Rene Widera
+/* Copyright 2014-2021 Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -25,64 +25,49 @@
 
 namespace picongpu
 {
-namespace particles
-{
-namespace manipulators
-{
-namespace unary
-{
-namespace param
-{
-    CONST_VECTOR(
-        float_X,
-        3,
-        DriftNegative_direction,
-        -1.0,
-        0.0,
-        0.0
-    );
-
-    //! configuration for the unary manipulator functor Drift
-    struct DriftCfg
+    namespace particles
     {
-        /** Initial particle drift velocity for electrons and ions
-         *  Examples:
-         *    - No drift is equal to 1.0
-         *  unit: none
-         */
-        static constexpr float_64 gamma = 1.021;
-        DriftNegative_direction_t const direction;
-    };
-} // namespace param
+        namespace manipulators
+        {
+            namespace unary
+            {
+                namespace param
+                {
+                    CONST_VECTOR(float_X, 3, DriftNegative_direction, -1.0, 0.0, 0.0);
 
-namespace acc
-{
-    template<
-        typename T_ParamClass,
-        typename T_ValueFunctor
-    >
-    struct Drift;
-} // namespace acc
+                    //! configuration for the unary manipulator functor Drift
+                    struct DriftCfg
+                    {
+                        /** Initial particle drift velocity for electrons and ions
+                         *  Examples:
+                         *    - No drift is equal to 1.0
+                         *  unit: none
+                         */
+                        static constexpr float_64 gamma = 1.021;
+                        DriftNegative_direction_t const direction;
+                    };
+                } // namespace param
+
+                namespace acc
+                {
+                    template<typename T_ParamClass, typename T_ValueFunctor>
+                    struct Drift;
+                } // namespace acc
 
-    /** change particle's momentum based on speed
-     *
-     * allow to manipulate a speed to a particle
-     *
-     * @tparam T_ParamClass param::DriftCfg, configuration parameter
-     * @tparam T_ValueFunctor pmacc::nvidia::functors::*,  binary functor type to manipulate the momentum attribute
-     */
-    template<
-        typename T_ParamClass = param::DriftCfg,
-        typename T_ValueFunctor = pmacc::nvidia::functors::Add
-    >
-    using Drift = generic::Free<
-        acc::Drift<
-            T_ParamClass,
-            T_ValueFunctor
-        >
-    >;
+                /** change particle's momentum based on speed
+                 *
+                 * allow to manipulate a speed to a particle
+                 *
+                 * @tparam T_ParamClass param::DriftCfg, configuration parameter
+                 * @tparam T_ValueFunctor pmacc::nvidia::functors::*,  binary functor type to manipulate the momentum
+                 * attribute
+                 */
+                template<
+                    typename T_ParamClass = param::DriftCfg,
+                    typename T_ValueFunctor = pmacc::nvidia::functors::Add>
+                using Drift = generic::Free<acc::Drift<T_ParamClass, T_ValueFunctor>>;
 
-} // namespace unary
-} // namespace manipulators
-} // namespace particles
+            } // namespace unary
+        } // namespace manipulators
+    } // namespace particles
 } // namespace picongpu
diff --git a/include/picongpu/particles/manipulators/unary/Drift.hpp b/include/picongpu/particles/manipulators/unary/Drift.hpp
index 0c29a83e6a..6fdb01f906 100644
--- a/include/picongpu/particles/manipulators/unary/Drift.hpp
+++ b/include/picongpu/particles/manipulators/unary/Drift.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Rene Widera
+/* Copyright 2013-2021 Axel Huebl, Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -25,76 +25,58 @@
 
 namespace picongpu
 {
-namespace particles
-{
-namespace manipulators
-{
-namespace unary
-{
-namespace acc
-{
-
-    /** manipulate the speed
-     *
-     * @tparam T_ParamClass picongpu::particles::manipulators::unary::param::DriftCfg,
-     *                      type with compile configuration
-     * @tparam T_ValueFunctor pmacc::nvidia::functors, binary operator type to reduce current and new value
-     */
-    template<
-        typename T_ParamClass,
-        typename T_ValueFunctor
-    >
-    struct Drift : private T_ValueFunctor
+    namespace particles
     {
-        /** manipulate the speed of the particle
-         *
-         * @tparam T_Particle pmacc::Particle, particle type
-         * @tparam T_Args pmacc::Particle, arbitrary number of particles types
-         *
-         * @param particle particle to be manipulated
-         * @param ... unused particles
-         */
-        template<
-            typename T_Particle,
-            typename ... T_Args
-        >
-        HDINLINE void operator()(
-            T_Particle & particle,
-            T_Args && ...
-        )
+        namespace manipulators
         {
-            using ParamClass = T_ParamClass;
-            using ValueFunctor = T_ValueFunctor;
+            namespace unary
+            {
+                namespace acc
+                {
+                    /** apply a drift momentum to a particle
+                     *
+                     * @tparam T_ParamClass picongpu::particles::manipulators::unary::param::DriftCfg,
+                     *                      type with compile configuration
+                     * @tparam T_ValueFunctor pmacc::nvidia::functors::*, binary functor type combining the current
+                     *                        momentum with the drift momentum
+                     */
+                    template<typename T_ParamClass, typename T_ValueFunctor>
+                    struct Drift : private T_ValueFunctor
+                    {
+                        /** compute the drift momentum and combine it with the particle momentum
+                         *
+                         * @tparam T_Particle pmacc::Particle, particle type
+                         * @tparam T_Args pmacc::Particle, arbitrary number of particle types
+                         *
+                         * @param particle particle to be manipulated
+                         * @param ... unused particles
+                         */
+                        template<typename T_Particle, typename... T_Args>
+                        HDINLINE void operator()(T_Particle& particle, T_Args&&...)
+                        {
+                            using ParamClass = T_ParamClass;
+                            using ValueFunctor = T_ValueFunctor;
 
-            float_X const macroWeighting = particle[ weighting_ ];
-            float_X const  macroMass = attribute::getMass(
-                macroWeighting,
-                particle
-            );
+                            float_X const macroWeighting = particle[weighting_];
+                            float_X const macroMass = attribute::getMass(macroWeighting, particle);
 
-            float_64 const myGamma = ParamClass::gamma;
+                            float_64 const myGamma = ParamClass::gamma;
 
-            float_64 const initFreeBeta =
-                math::sqrt( 1.0 -
-                            1.0 / ( myGamma * myGamma) );
+                            float_64 const initFreeBeta = math::sqrt(1.0 - 1.0 / (myGamma * myGamma));
 
-            float3_X const driftDirection( ParamClass( ).direction );
-            float3_X const normDir = driftDirection / math::abs( driftDirection );
+                            float3_X const driftDirection(ParamClass().direction);
+                            float3_X const normDir = driftDirection / math::abs(driftDirection);
 
-            float3_X const mom( normDir *
-                float_X(
-                    myGamma * initFreeBeta *
-                    float_64( macroMass ) *
-                    float_64( SPEED_OF_LIGHT )
-                )
-            );
+                            float3_X const mom(
+                                normDir
+                                * float_X(myGamma * initFreeBeta * float_64(macroMass) * float_64(SPEED_OF_LIGHT)));
 
-            ValueFunctor::operator( )( particle[ momentum_ ], mom );
-        }
-    };
+                            ValueFunctor::operator()(particle[momentum_], mom);
+                        }
+                    };
 
-} // namespace acc
-} // namespace unary
-} // namespace manipulators
-} // namespace particles
+                } // namespace acc
+            } // namespace unary
+        } // namespace manipulators
+    } // namespace particles
 } // namespace picongpu
diff --git a/include/picongpu/particles/manipulators/unary/FreeTotalCellOffset.def b/include/picongpu/particles/manipulators/unary/FreeTotalCellOffset.def
index ce55688216..ea9da28478 100644
--- a/include/picongpu/particles/manipulators/unary/FreeTotalCellOffset.def
+++ b/include/picongpu/particles/manipulators/unary/FreeTotalCellOffset.def
@@ -1,4 +1,4 @@
-/* Copyright 2017-2020 Rene Widera, Axel Huebl
+/* Copyright 2017-2021 Rene Widera, Axel Huebl
  *
  * This file is part of PIConGPU.
  *
@@ -27,44 +27,44 @@
 
 namespace picongpu
 {
-namespace particles
-{
-namespace manipulators
-{
-namespace unary
-{
-    /** call simple free user defined manipulators and provide the cell information
-     *
-     * The functor passes the cell offset of the particle relative to the total
-     * domain origin into the functor.
-     *
-     * @tparam T_Functor user defined unary functor
-     *
-     * example for `particle.param`: set a user-defined species attribute y0
-     * (type: uint32_t) to the current total y-cell index
-     *   @code{.cpp}
-     *   struct FunctorSaveYcell
-     *   {
-     *       template< typename T_Particle >
-     *       HDINLINE void operator()(
-     *          DataSpace< simDim > const & particleOffsetToTotalOrigin,
-     *          T_Particle & particle
-     *       )
-     *       {
-     *           particle[ y0_ ] = particleOffsetToTotalOrigin.y();
-     *       }
-     *       static constexpr char const * name = "saveYcell";
-     *   };
-     *
-     *   using SaveYcell = unary::FreeTotalCellOffset<
-     *      FunctorSaveYcell
-     *   >;
-     *   @endcode
-     */
-    template< typename T_Functor >
-    struct FreeTotalCellOffset;
+    namespace particles
+    {
+        namespace manipulators
+        {
+            namespace unary
+            {
+                /** call simple free user defined manipulators and provide the cell information
+                 *
+                 * The functor passes the cell offset of the particle relative to the total
+                 * domain origin into the functor.
+                 *
+                 * @tparam T_Functor user defined unary functor
+                 *
+                 * example for `particle.param`: set a user-defined species attribute y0
+                 * (type: uint32_t) to the current total y-cell index
+                 *   @code{.cpp}
+                 *   struct FunctorSaveYcell
+                 *   {
+                 *       template< typename T_Particle >
+                 *       HDINLINE void operator()(
+                 *          DataSpace< simDim > const & particleOffsetToTotalOrigin,
+                 *          T_Particle & particle
+                 *       )
+                 *       {
+                 *           particle[ y0_ ] = particleOffsetToTotalOrigin.y();
+                 *       }
+                 *       static constexpr char const * name = "saveYcell";
+                 *   };
+                 *
+                 *   using SaveYcell = unary::FreeTotalCellOffset<
+                 *      FunctorSaveYcell
+                 *   >;
+                 *   @endcode
+                 */
+                template<typename T_Functor>
+                struct FreeTotalCellOffset;
 
-} // namespace unary
-} // namespace manipulators
-} // namespace particles
+            } // namespace unary
+        } // namespace manipulators
+    } // namespace particles
 } // namespace picongpu
diff --git a/include/picongpu/particles/manipulators/unary/FreeTotalCellOffset.hpp b/include/picongpu/particles/manipulators/unary/FreeTotalCellOffset.hpp
index e56bfa7485..3d1ee36c25 100644
--- a/include/picongpu/particles/manipulators/unary/FreeTotalCellOffset.hpp
+++ b/include/picongpu/particles/manipulators/unary/FreeTotalCellOffset.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2017-2020 Rene Widera, Axel Huebl
+/* Copyright 2017-2021 Rene Widera, Axel Huebl
  *
  * This file is part of PIConGPU.
  *
@@ -28,133 +28,106 @@
 
 namespace picongpu
 {
-namespace particles
-{
-namespace manipulators
-{
-namespace unary
-{
-namespace acc
-{
-    template< typename T_Functor >
-    struct FreeTotalCellOffset : private T_Functor
+    namespace particles
     {
-
-        using Functor = T_Functor;
-
-        HDINLINE FreeTotalCellOffset(
-            Functor const & functor,
-            DataSpace< simDim > const & superCellToLocalOriginCellOffset
-        ) :
-            T_Functor( functor ),
-            m_superCellToLocalOriginCellOffset( superCellToLocalOriginCellOffset )
+        namespace manipulators
         {
-        }
-
-        /** call user functor
-         *
-         * The random number generator is initialized with the first call.
-         *
-         * @tparam T_Particle type of the particle to manipulate
-         * @tparam T_Args type of the arguments passed to the user functor
-         * @tparam T_Acc alpaka accelerator type
-         *
-         * @param alpaka accelerator
-         * @param particle particle which is given to the user functor
-         * @return void is used to enable the operator if the user functor expects two arguments
-         */
-        template<
-            typename T_Particle,
-            typename T_Acc
-        >
-        HDINLINE
-        void operator()(
-            T_Acc const &,
-            T_Particle & particle
-        )
-        {
-            DataSpace< simDim > const cellInSuperCell(
-                DataSpaceOperations< simDim >::template map< SuperCellSize >( particle[ localCellIdx_ ] )
-            );
-            Functor::operator( )(
-                m_superCellToLocalOriginCellOffset + cellInSuperCell,
-                particle
-            );
-        }
-
-    private:
-
-        DataSpace< simDim > const m_superCellToLocalOriginCellOffset;
-    };
-} // namespace acc
-
-    template< typename T_Functor >
-    struct FreeTotalCellOffset :
-        protected functor::User< T_Functor >,
-        private functor::misc::TotalCellOffset
-    {
-        using CellOffsetFunctor = functor::misc::TotalCellOffset;
-        using Functor = functor::User< T_Functor >;
-
-        template< typename T_SpeciesType >
-        struct apply
-        {
-            using type = FreeTotalCellOffset;
-        };
-
-        /** constructor
-         *
-         * @param currentStep current simulation time step
-         */
-        HINLINE FreeTotalCellOffset( uint32_t currentStep ) :
-            Functor( currentStep ),
-            CellOffsetFunctor( currentStep )
-        {
-        }
-
-        /** create functor for the accelerator
-         *
-         * @tparam T_WorkerCfg pmacc::mappings::threads::WorkerCfg, configuration of the worker
-         * @tparam T_Acc alpaka accelerator type
-         *
-         * @param alpaka accelerator
-         * @param localSupercellOffset offset (in superCells, without any guards) relative
-         *                             to the origin of the local domain
-         * @param workerCfg configuration of the worker
-         */
-        template<
-            typename T_WorkerCfg,
-            typename T_Acc
-        >
-        HDINLINE auto
-        operator()(
-            T_Acc const & acc,
-            DataSpace< simDim > const & localSupercellOffset,
-            T_WorkerCfg const & workerCfg
-        ) const
-        -> acc::FreeTotalCellOffset< Functor >
-        {
-            auto & cellOffsetFunctor = *static_cast< CellOffsetFunctor const * >( this );
-            return acc::FreeTotalCellOffset< Functor >(
-                *static_cast< Functor const * >( this ),
-                cellOffsetFunctor(
-                    acc,
-                    localSupercellOffset,
-                    workerCfg
-                )
-            );
-        }
-
-        static
-        HINLINE std::string
-        getName( )
-        {
-            // we provide the name from the param class
-            return Functor::name;
-        }
-    };
-
-} // namespace unary
-} // namespace manipulators
-} // namespace particles
+            namespace unary
+            {
+                namespace acc
+                {
+                    template<typename T_Functor>
+                    struct FreeTotalCellOffset : private T_Functor
+                    {
+                        using Functor = T_Functor;
+
+                        HDINLINE FreeTotalCellOffset(
+                            Functor const& functor,
+                            DataSpace<simDim> const& superCellToLocalOriginCellOffset)
+                            : T_Functor(functor)
+                            , m_superCellToLocalOriginCellOffset(superCellToLocalOriginCellOffset)
+                        {
+                        }
+
+                        /** call user functor
+                         *
+                         * Forwards the particle's cell offset to the user functor: the offset stored
+                         * at construction plus the particle's cell index within its supercell.
+                         *
+                         * @tparam T_Particle type of the particle to manipulate
+                         * @tparam T_Acc alpaka accelerator type
+                         *
+                         * @param (unnamed) alpaka accelerator, not used by this operator
+                         * @param particle particle whose cell offset is computed; it is also passed
+                         *                 on to the user functor
+                         */
+                        template<typename T_Particle, typename T_Acc>
+                        HDINLINE void operator()(T_Acc const&, T_Particle& particle)
+                        {
+                            DataSpace<simDim> const cellInSuperCell(
+                                DataSpaceOperations<simDim>::template map<SuperCellSize>(particle[localCellIdx_]));
+                            Functor::operator()(m_superCellToLocalOriginCellOffset + cellInSuperCell, particle);
+                        }
+
+                    private:
+                        DataSpace<simDim> const m_superCellToLocalOriginCellOffset;
+                    };
+                } // namespace acc
+
+                template<typename T_Functor>
+                struct FreeTotalCellOffset
+                    : protected functor::User<T_Functor>
+                    , private functor::misc::TotalCellOffset
+                {
+                    using CellOffsetFunctor = functor::misc::TotalCellOffset;
+                    using Functor = functor::User<T_Functor>;
+
+                    template<typename T_SpeciesType>
+                    struct apply
+                    {
+                        using type = FreeTotalCellOffset;
+                    };
+
+                    /** constructor
+                     *
+                     * @param currentStep current simulation time step, forwarded to both base classes
+                     */
+                    HINLINE FreeTotalCellOffset(uint32_t currentStep)
+                        : Functor(currentStep)
+                        , CellOffsetFunctor(currentStep)
+                    {
+                    }
+
+                    /** create functor for the accelerator
+                     *
+                     * @tparam T_WorkerCfg pmacc::mappings::threads::WorkerCfg, configuration of the worker
+                     * @tparam T_Acc alpaka accelerator type
+                     *
+                     * @param acc alpaka accelerator
+                     * @param localSupercellOffset offset (in superCells, without any guards) relative
+                     *                             to the origin of the local domain
+                     * @param workerCfg configuration of the worker
+                     */
+                    template<typename T_WorkerCfg, typename T_Acc>
+                    HDINLINE auto operator()(
+                        T_Acc const& acc,
+                        DataSpace<simDim> const& localSupercellOffset,
+                        T_WorkerCfg const& workerCfg) const -> acc::FreeTotalCellOffset<Functor>
+                    {
+                        auto& cellOffsetFunctor = *static_cast<CellOffsetFunctor const*>(this);
+                        return acc::FreeTotalCellOffset<Functor>(
+                            *static_cast<Functor const*>(this),
+                            cellOffsetFunctor(acc, localSupercellOffset, workerCfg));
+                    }
+
+                    static HINLINE std::string getName()
+                    {
+                        // the name is provided by the wrapped user functor
+                        return Functor::name;
+                    }
+                };
+
+            } // namespace unary
+        } // namespace manipulators
+    } // namespace particles
 } // namespace picongpu
diff --git a/include/picongpu/particles/manipulators/unary/RandomPosition.def b/include/picongpu/particles/manipulators/unary/RandomPosition.def
index 91868f5047..f71f6b07da 100644
--- a/include/picongpu/particles/manipulators/unary/RandomPosition.def
+++ b/include/picongpu/particles/manipulators/unary/RandomPosition.def
@@ -1,4 +1,4 @@
-/* Copyright 2017-2020 Rene Widera
+/* Copyright 2017-2021 Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -28,71 +28,60 @@
 
 namespace picongpu
 {
-namespace particles
-{
-namespace manipulators
-{
-namespace unary
-{
-namespace acc
-{
-
-    /** set the particle attribute position
-     *
-     * The particle attribute position is overwritten with a random
-     * in-cell position.
-     */
-    struct RandomPosition
+    namespace particles
     {
-        /** set in-cell position
-         *
-         * @tparam T_Rng pmacc::nvidia::rng::RNG, type of the random number generator
-         * @tparam T_Particle pmacc::Particle, particle type
-         * @tparam T_Args pmacc::Particle, arbitrary number of particles types
-         *
-         * @param rng random number generator
-         * @param particle particle to be manipulated
-         * @param ... unused particles
-         */
-        template<
-            typename T_Rng,
-            typename T_Particle,
-            typename ... T_Args
-        >
-        HDINLINE void operator()(
-            T_Rng & rng,
-            T_Particle & particle,
-            T_Args && ...
-        )
+        namespace manipulators
         {
-            floatD_X tmpPos;
+            namespace unary
+            {
+                namespace acc
+                {
+                    /** set the particle attribute position
+                     *
+                     * The particle attribute position is overwritten with a random
+                     * in-cell position.
+                     */
+                    struct RandomPosition
+                    {
+                        /** set a random in-cell position
+                         *
+                         * @tparam T_Rng functor::misc::RngWrapper, type of the random number generator
+                         * @tparam T_Particle pmacc::Particle, particle type
+                         * @tparam T_Args pmacc::Particle, arbitrary number of particle types
+                         *
+                         * @param rng random number generator, called once per simulation dimension
+                         * @param particle particle to be manipulated
+                         * @param ... unused particles
+                         */
+                        template<typename T_Rng, typename T_Particle, typename... T_Args>
+                        HDINLINE void operator()(T_Rng& rng, T_Particle& particle, T_Args&&...)
+                        {
+                            floatD_X tmpPos;
 
-            for( uint32_t d = 0; d < simDim; ++d )
-                tmpPos[ d ] = rng( );
+                            for(uint32_t d = 0; d < simDim; ++d)
+                                tmpPos[d] = rng();
 
-            particle[ position_ ] = tmpPos;
-        }
-    };
+                            particle[position_] = tmpPos;
+                        }
+                    };
 
-} // namespace acc
+                } // namespace acc
 
 
-    /** Change the in cell position
-     *
-     * This functor changes the in-cell position of a particle.
-     * The new in-cell position is uniformly distributed position between [0.0;1.0).
-     *
-     * example: add
-     *   ```
-     *     particles::Manipulate<RandomPosition,SPECIES_NAME>
-     *   ```
-     *   to `InitPipeline` in `speciesInitialization.param`
-     */
-    using RandomPosition = generic::FreeRng<
-        acc::RandomPosition,
-        pmacc::random::distributions::Uniform< float_X >
-    >;
-} // namespace unary
-} // namespace manipulators
-} // namespace particles
+                /** Change the in-cell position
+                 *
+                 * This functor changes the in-cell position of a particle.
+                 * Each component of the new in-cell position is uniformly distributed in [0.0;1.0).
+                 *
+                 * example: add
+                 *   ```
+                 *     particles::Manipulate<RandomPosition,SPECIES_NAME>
+                 *   ```
+                 *   to `InitPipeline` in `speciesInitialization.param`
+                 */
+                using RandomPosition
+                    = generic::FreeRng<acc::RandomPosition, pmacc::random::distributions::Uniform<float_X>>;
+            } // namespace unary
+        } // namespace manipulators
+    } // namespace particles
 } // namespace picongpu
diff --git a/include/picongpu/particles/manipulators/unary/Temperature.def b/include/picongpu/particles/manipulators/unary/Temperature.def
index 178f861668..a327b75825 100644
--- a/include/picongpu/particles/manipulators/unary/Temperature.def
+++ b/include/picongpu/particles/manipulators/unary/Temperature.def
@@ -1,4 +1,4 @@
-/* Copyright 2014-2020 Rene Widera
+/* Copyright 2014-2021 Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -24,59 +24,47 @@
 #include <pmacc/random/distributions/Normal.hpp>
 #include <pmacc/nvidia/functors/Add.hpp>
 
-#include <boost/mpl/integral_c.hpp>
-
 
 namespace picongpu
 {
-namespace particles
-{
-namespace manipulators
-{
-namespace unary
-{
-namespace acc
-{
-    template<
-        typename T_ParamClass,
-        typename T_ValueFunctor
-    >
-    struct Temperature;
-} // namespace acc
-
-namespace param
-{
-    //! configuration for the unary manipulator functor Temperature
-    struct TemperatureCfg
+    namespace particles
     {
-        /** Initial temperature
-         *  unit: keV
-         */
-        static constexpr float_64 temperature = 0.0;
-    };
-} // namespace param
+        namespace manipulators
+        {
+            namespace unary
+            {
+                namespace acc
+                {
+                    template<typename T_ParamClass, typename T_ValueFunctor>
+                    struct Temperature;
+                } // namespace acc
+
+                namespace param
+                {
+                    //! configuration for the unary manipulator functor Temperature
+                    struct TemperatureCfg
+                    {
+                        /** Initial temperature
+                         *  unit: keV
+                         */
+                        static constexpr float_64 temperature = 0.0;
+                    };
+                } // namespace param
 
-    /** change particle's momentum based on a temperature
-     *
-     * allow to change the temperature (randomly normal distributed)
-     * of a particle.
-     *
-     * @tparam T_ParamClass param::TemperatureCfg, configuration parameter
-     * @tparam T_ValueFunctor pmacc::nvidia::functors::*,  binary functor type to manipulate the momentum attribute
-     */
-    template<
-        typename T_ParamClass = param::TemperatureCfg,
-        typename T_ValueFunctor = pmacc::nvidia::functors::Add
-    >
-    using Temperature = generic::FreeRng<
-        acc::Temperature<
-            T_ParamClass,
-            T_ValueFunctor
-        >,
-        pmacc::random::distributions::Normal< float_X >
-    >;
+                /** Modify particle momentum based on temperature
+                 *
+                 * @tparam T_ParamClass param::TemperatureCfg, configuration parameter
+                 * @tparam T_ValueFunctor pmacc::nvidia::functors::*, binary functor type to
+                 *                        add a new momentum to an old one
+                 */
+                template<
+                    typename T_ParamClass = param::TemperatureCfg,
+                    typename T_ValueFunctor = pmacc::nvidia::functors::Add>
+                using Temperature = generic::FreeRng<
+                    acc::Temperature<T_ParamClass, T_ValueFunctor>,
+                    pmacc::random::distributions::Normal<float_X>>;
 
-} // namespace unary
-} // namespace manipulators
-} // namespace particles
+            } // namespace unary
+        } // namespace manipulators
+    } // namespace particles
 } // namespace picongpu
diff --git a/include/picongpu/particles/manipulators/unary/Temperature.hpp b/include/picongpu/particles/manipulators/unary/Temperature.hpp
index 6d2c0a8fe4..8e608413ee 100644
--- a/include/picongpu/particles/manipulators/unary/Temperature.hpp
+++ b/include/picongpu/particles/manipulators/unary/Temperature.hpp
@@ -1,5 +1,5 @@
-/* Copyright 2013-2020 Axel Huebl, Heiko Burau, Rene Widera,
- *                     Alexander Grund
+/* Copyright 2013-2021 Axel Huebl, Heiko Burau, Rene Widera,
+ *                     Alexander Grund, Sergei Bastrakov
  *
  * This file is part of PIConGPU.
  *
@@ -25,93 +25,68 @@
 
 namespace picongpu
 {
-namespace particles
-{
-namespace manipulators
-{
-namespace unary
-{
-namespace acc
-{
-
-    /** manipulate the speed based on a temperature
-     *
-     * @tparam T_ParamClass picongpu::particles::manipulators::unary::param::TemperatureCfg,
-     *                      type with compile configuration
-     * @tparam T_ValueFunctor pmacc::nvidia::functors, binary operator type to reduce current and new value
-     */
-    template<
-        typename T_ParamClass,
-        typename T_ValueFunctor
-    >
-    struct Temperature : private T_ValueFunctor
+    namespace particles
     {
-        /** manipulate the speed of the particle
-         *
-         * @tparam T_Rng pmacc::nvidia::rng::RNG, type of the random number generator
-         * @tparam T_Particle pmacc::Particle, particle type
-         * @tparam T_Args pmacc::Particle, arbitrary number of particles types
-         *
-         * @param rng random number generator
-         * @param particle particle to be manipulated
-         * @param ... unused particles
-         */
-        template<
-            typename T_Rng,
-            typename T_Particle,
-            typename ... T_Args
-        >
-        HDINLINE void operator()(
-            T_Rng & rng,
-            T_Particle & particle,
-            T_Args && ...
-        )
+        namespace manipulators
         {
-            using ParamClass = T_ParamClass;
-
-            const float3_X tmpRand = float3_X(
-                rng(),
-                rng(),
-                rng()
-            );
-            float_X const macroWeighting = particle[ weighting_ ];
-
-            float_X const energy = ( ParamClass::temperature * UNITCONV_keV_to_Joule ) / UNIT_ENERGY;
-
-            // since energy is related to one particle
-            // and our units are normalized for macro particle quanities
-            // energy is quite small
-            float_X const macroEnergy = macroWeighting * energy;
-            // non-rel, MW:
-            //    p = m * v
-            //            v ~ sqrt(k*T/m), k*T = E
-            // => p = sqrt(m)
-            //
-            // Note on macro particle energies, with weighting w:
-            //    p_1 = m_1 * v
-            //                v = v_1 = v_w
-            //    p_w = p_1 * w
-            //    E_w = E_1 * w
-            // Since masses, energies and momenta add up linear, we can
-            // just take w times the p_1. Take care, E means E_1 !
-            // This goes to:
-            //    p_w = w * p_1 = w * m_1 * sqrt( E / m_1 )
-            //        = sqrt( E * w^2 * m_1 )
-            //        = sqrt( E * w * m_w )
-            // Which makes sense, since it means that we use a macroMass
-            // and a macroEnergy now.
-            float3_X const mom = tmpRand * ( float_X )math::sqrt(
-                precisionCast< sqrt_X >(
-                    macroEnergy *
-                    attribute::getMass(macroWeighting,particle)
-                )
-            );
-            T_ValueFunctor::operator( )( particle[ momentum_ ], mom );
-        }
-    };
+            namespace unary
+            {
+                namespace acc
+                {
+                    /** Modify particle momentum based on temperature
+                     *
+                     * Generate a new random momentum distributed according to the given
+                     * temperature and add it to the existing particle momentum.
+                     * This functor is for the non-relativistic case only.
+                     * In this case the new momenta follow the Maxwell-Boltzmann distribution.
+                     *
+                     * @tparam T_ParamClass picongpu::particles::manipulators::unary::param::TemperatureCfg,
+                     *                      type with compile configuration
+                     * @tparam T_ValueFunctor pmacc::nvidia::functors::*, binary functor type to
+                     *                        add a new momentum to an old one
+                     */
+                    template<typename T_ParamClass, typename T_ValueFunctor>
+                    struct Temperature : private T_ValueFunctor
+                    {
+                        /** draw a random thermal momentum and combine it with the particle momentum
+                         *
+                         * @tparam T_StandardNormalRng functor::misc::RngWrapper, standard
+                         *                             normal random number generator type
+                         * @tparam T_Particle pmacc::Particle, particle type
+                         * @tparam T_Args pmacc::Particle, arbitrary number of particle types
+                         *
+                         * @param standardNormalRng standard normal random number generator
+                         * @param particle particle to be manipulated
+                         * @param ... unused parameters
+                         */
+                        template<typename T_StandardNormalRng, typename T_Particle, typename... T_Args>
+                        HDINLINE void operator()(
+                            T_StandardNormalRng& standardNormalRng,
+                            T_Particle& particle,
+                            T_Args&&...)
+                        {
+                            /* In the non-relativistic case, particle momenta follow
+                             * the Maxwell-Boltzmann distribution: each component is
+                             * independently normally distributed with zero mean and variance of
+                             * m * k * T = m * E.
+                             * For the macroweighted momenta we store as particle[ momentum_ ],
+                             * the same relation holds, just m and E are also macroweighted.
+                             */
+                            float_X const energy = (T_ParamClass::temperature * UNITCONV_keV_to_Joule) / UNIT_ENERGY;
+                            float_X const macroWeighting = particle[weighting_];
+                            float_X const macroEnergy = macroWeighting * energy;
+                            float_X const macroMass = attribute::getMass(macroWeighting, particle);
+                            float_X const standardDeviation
+                                = static_cast<float_X>(math::sqrt(precisionCast<sqrt_X>(macroEnergy * macroMass)));
+                            float3_X const mom
+                                = float3_X(standardNormalRng(), standardNormalRng(), standardNormalRng())
+                                * standardDeviation;
+                            T_ValueFunctor::operator()(particle[momentum_], mom);
+                        }
+                    };
 
-} // namespace acc
-} // namespace unary
-} // namespace manipulators
-} // namespace particles
+                } // namespace acc
+            } // namespace unary
+        } // namespace manipulators
+    } // namespace particles
 } // namespace picongpu
diff --git a/include/picongpu/particles/particleToGrid/ComputeGridValuePerFrame.def b/include/picongpu/particles/particleToGrid/ComputeGridValuePerFrame.def
index a2ff179db9..11a56845f8 100644
--- a/include/picongpu/particles/particleToGrid/ComputeGridValuePerFrame.def
+++ b/include/picongpu/particles/particleToGrid/ComputeGridValuePerFrame.def
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Heiko Burau, Rene Widera, Richard Pausch,
+/* Copyright 2013-2021 Axel Huebl, Heiko Burau, Rene Widera, Richard Pausch,
  *                     Marco Garten
  *
  * This file is part of PIConGPU.
@@ -47,189 +47,136 @@
 
 namespace picongpu
 {
-namespace particles
-{
-namespace particleToGrid
-{
-    template<class T_ParticleShape, class T_DerivedAttribute>
-    class ComputeGridValuePerFrame
+    namespace particles
     {
-    public:
-
-        using AssignmentFunction = typename T_ParticleShape::ChargeAssignment;
-        static constexpr int supp = AssignmentFunction::support;
-
-        static constexpr int lowerMargin = supp / 2;
-        static constexpr int upperMargin = (supp + 1) / 2;
-        using LowerMargin = typename pmacc::math::CT::make_Int<simDim, lowerMargin>::type;
-        using UpperMargin = typename pmacc::math::CT::make_Int<simDim, upperMargin>::type;
-
-        HDINLINE ComputeGridValuePerFrame()
+        namespace particleToGrid
         {
-        }
-
-        /** return unit for this solver
-         *
-         * @return solver unit
-         */
-        HDINLINE float1_64 getUnit() const;
-
-        /** return powers of the 7 base measures for this solver
-         *
-         * characterizing the unit of the result of the solver in SI
-         * (length L, mass M, time T, electric current I,
-         *  thermodynamic temperature theta, amount of substance N,
-         *  luminous intensity J) */
-        HINLINE std::vector<float_64> getUnitDimension() const;
-
-        /** return name of the this solver
-         * @return name of solver
-         */
-        HINLINE static
-        std::string getName();
-
-        template<
-            typename FrameType,
-            typename TVecSuperCell,
-            typename BoxTmp,
-            typename T_Acc >
-        DINLINE void operator()(
-            T_Acc const & acc,
-            FrameType& frame,
-            const int localIdx,
-            const TVecSuperCell superCell,
-            BoxTmp& tmpBox
-        );
-    };
-
-namespace detail
-{
-    /** Most derived fields just operate on the particle shape
-     *
-     * But some "debug" diagnostics just need the nearest cell, e.g. for
-     * counting, so we specialize such options here.
-     */
-    template<
-        typename T_Species,
-        typename T_DerivedAttribute
-    >
-    struct GetAttributeShape
-    {
-        using type = typename GetShape< T_Species >::type;
-    };
-
-    template< typename T_Species >
-    struct GetAttributeShape<
-        T_Species,
-        derivedAttributes::Counter
-    >
-    {
-        using type = shapes::Counter;
-    };
-    template< typename T_Species >
-    struct GetAttributeShape<
-        T_Species,
-        derivedAttributes::MacroCounter
-    >
-    {
-        using type = shapes::Counter;
-    };
-
-    template<
-        typename T_Species,
-        typename T_DerivedAttribute
-    >
-    using GetAttributeShape_t = typename GetAttributeShape<
-        T_Species,
-        T_DerivedAttribute
-    >::type;
-
-} // namespace detail
-
-    /** Solver Operation for Particle to Grid Projections
-     *
-     * Derives a scalar field from a particle species at runtime.
-     * Values are mapped to cells according either according to the
-     * species' spatial shape or a specifically overwritten (counter) shape
-     * depending on the implementation of the derived attribute
-     *
-     * @tparam T_Species a see picongpu::Particles class with a species definition,
-     *                   see see speciesDefinition.param
-     *
-     * @tparam T_DerivedAttribute a derived particle attribute from
-     *         picongpu::particles::particleToGrid::derivedAttributes
-     *
-     * @typedef defines a FieldTmpOperation class
-     */
-    template<
-        typename T_Species,
-        typename T_DerivedAttribute
-    >
-    struct CreateFieldTmpOperation
-    {
-        using shapeType = detail::GetAttributeShape_t<
-            T_Species,
-            T_DerivedAttribute
-        >;
-
-        using OperationPerFrame = ComputeGridValuePerFrame<
-            shapeType,
-            T_DerivedAttribute
-        >;
-        using type = FieldTmpOperation<
-            OperationPerFrame,
-            T_Species
-        >;
-    };
-    template<
-        typename T_Species,
-        typename T_DerivedAttribute
-    >
-    using CreateFieldTmpOperation_t = typename CreateFieldTmpOperation<
-        T_Species,
-        T_DerivedAttribute
-    >::type;
-
-    /** Create a list solvers for derived fields for eligible species
-     *
-     * Returns a list of FieldTmpOperation classes.
-     *
-     * @tparam T_SeqSpecies a sequence of particle species to check if they are
-     *                      eligible to derive the attribute T_DerivedAttribute
-     *                      from, also allows a single type instead of a sequence
-     * @tparam T_DerivedAttribute a derived attribute to map to the field grid,
-     *                            see defines in
-     *                            picongpu::particles::particleToGrid::derivedAttributes
-     */
-    template<
-        typename T_SeqSpecies,
-        typename T_DerivedAttribute
-    >
-    struct CreateEligible
-    {
-        // wrap single arguments to sequence
-        using SeqSpecies = typename pmacc::ToSeq< T_SeqSpecies >::type;
-        using DerivedAttribute = T_DerivedAttribute;
-
-        using type = typename traits::GenerateSolversIfSpeciesEligible<
-            CreateFieldTmpOperation<
-                bmpl::_1,
-                DerivedAttribute
-            >,
-            SeqSpecies,
-            DerivedAttribute
-        >::type;
-    };
-
-    template<
-        typename T_SeqSpecies,
-        typename T_DerivedAttribute
-    >
-    using CreateEligible_t = typename CreateEligible<
-        T_SeqSpecies,
-        T_DerivedAttribute
-    >::type;
-
-} // namespace particleToGrid
-} // namespace particles
+            template<class T_ParticleShape, class T_DerivedAttribute>
+            class ComputeGridValuePerFrame
+            {
+            public:
+                using AssignmentFunction = typename T_ParticleShape::ChargeAssignment;
+                static constexpr int supp = AssignmentFunction::support;
+
+                static constexpr int lowerMargin = supp / 2;
+                static constexpr int upperMargin = (supp + 1) / 2;
+                using LowerMargin = typename pmacc::math::CT::make_Int<simDim, lowerMargin>::type;
+                using UpperMargin = typename pmacc::math::CT::make_Int<simDim, upperMargin>::type;
+
+                HDINLINE ComputeGridValuePerFrame()
+                {
+                }
+
+                /** return unit for this solver
+                 *
+                 * @return solver unit
+                 */
+                HDINLINE float1_64 getUnit() const;
+
+                /** return powers of the 7 base measures for this solver
+                 *
+                 * characterizing the unit of the result of the solver in SI
+                 * (length L, mass M, time T, electric current I,
+                 *  thermodynamic temperature theta, amount of substance N,
+                 *  luminous intensity J) */
+                HINLINE std::vector<float_64> getUnitDimension() const;
+
+                /** return name of this solver
+                 * @return name of solver
+                 */
+                HINLINE static std::string getName();
+
+                template<typename FrameType, typename TVecSuperCell, typename BoxTmp, typename T_Acc>
+                DINLINE void operator()(
+                    T_Acc const& acc,
+                    FrameType& frame,
+                    const int localIdx,
+                    const TVecSuperCell superCell,
+                    BoxTmp& tmpBox);
+            };
+
+            namespace detail
+            {
+                /** Most derived fields just operate on the particle shape
+                 *
+                 * But some "debug" diagnostics just need the nearest cell, e.g. for
+                 * counting, so we specialize such options here.
+                 */
+                template<typename T_Species, typename T_DerivedAttribute>
+                struct GetAttributeShape
+                {
+                    using type = typename GetShape<T_Species>::type;
+                };
+
+                template<typename T_Species>
+                struct GetAttributeShape<T_Species, derivedAttributes::Counter>
+                {
+                    using type = shapes::Counter;
+                };
+                template<typename T_Species>
+                struct GetAttributeShape<T_Species, derivedAttributes::MacroCounter>
+                {
+                    using type = shapes::Counter;
+                };
+
+                template<typename T_Species, typename T_DerivedAttribute>
+                using GetAttributeShape_t = typename GetAttributeShape<T_Species, T_DerivedAttribute>::type;
+
+            } // namespace detail
+
+            /** Solver Operation for Particle to Grid Projections
+             *
+             * Derives a scalar field from a particle species at runtime.
+             * Values are mapped to cells either according to the
+             * species' spatial shape or a specifically overwritten (counter) shape,
+             * depending on the implementation of the derived attribute
+             *
+             * @tparam T_Species a picongpu::Particles class with a species definition,
+             *                   see speciesDefinition.param
+             *
+             * @tparam T_DerivedAttribute a derived particle attribute from
+             *         picongpu::particles::particleToGrid::derivedAttributes
+             *
+             * @typedef defines a FieldTmpOperation class
+             */
+            template<typename T_Species, typename T_DerivedAttribute>
+            struct CreateFieldTmpOperation
+            {
+                using shapeType = detail::GetAttributeShape_t<T_Species, T_DerivedAttribute>;
+
+                using OperationPerFrame = ComputeGridValuePerFrame<shapeType, T_DerivedAttribute>;
+                using type = FieldTmpOperation<OperationPerFrame, T_Species>;
+            };
+            template<typename T_Species, typename T_DerivedAttribute>
+            using CreateFieldTmpOperation_t = typename CreateFieldTmpOperation<T_Species, T_DerivedAttribute>::type;
+
+            /** Create a list of solvers for derived fields for eligible species
+             *
+             * Returns a list of FieldTmpOperation classes.
+             *
+             * @tparam T_SeqSpecies a sequence of particle species to check if they are
+             *                      eligible to derive the attribute T_DerivedAttribute
+             *                      from, also allows a single type instead of a sequence
+             * @tparam T_DerivedAttribute a derived attribute to map to the field grid,
+             *                            see defines in
+             *                            picongpu::particles::particleToGrid::derivedAttributes
+             */
+            template<typename T_SeqSpecies, typename T_DerivedAttribute>
+            struct CreateEligible
+            {
+                // wrap single arguments to sequence
+                using SeqSpecies = typename pmacc::ToSeq<T_SeqSpecies>::type;
+                using DerivedAttribute = T_DerivedAttribute;
+
+                using type = typename traits::GenerateSolversIfSpeciesEligible<
+                    CreateFieldTmpOperation<bmpl::_1, DerivedAttribute>,
+                    SeqSpecies,
+                    DerivedAttribute>::type;
+            };
+
+            template<typename T_SeqSpecies, typename T_DerivedAttribute>
+            using CreateEligible_t = typename CreateEligible<T_SeqSpecies, T_DerivedAttribute>::type;
+
+        } // namespace particleToGrid
+    } // namespace particles
 } // namespace picongpu
diff --git a/include/picongpu/particles/particleToGrid/ComputeGridValuePerFrame.hpp b/include/picongpu/particles/particleToGrid/ComputeGridValuePerFrame.hpp
index 67f64cce58..ee73d64af2 100644
--- a/include/picongpu/particles/particleToGrid/ComputeGridValuePerFrame.hpp
+++ b/include/picongpu/particles/particleToGrid/ComputeGridValuePerFrame.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Heiko Burau, Rene Widera
+/* Copyright 2013-2021 Axel Huebl, Heiko Burau, Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -33,103 +33,96 @@
 
 namespace picongpu
 {
-namespace particles
-{
-namespace particleToGrid
-{
-
-template<class T_ParticleShape, class T_DerivedAttribute>
-HDINLINE float1_64
-ComputeGridValuePerFrame<T_ParticleShape, T_DerivedAttribute>::getUnit() const
-{
-    return T_DerivedAttribute().getUnit();
-}
-
-template<class T_ParticleShape, class T_DerivedAttribute>
-HINLINE std::vector<float_64>
-ComputeGridValuePerFrame<T_ParticleShape, T_DerivedAttribute>::getUnitDimension() const
-{
-    return T_DerivedAttribute().getUnitDimension();
-}
-
-template<class T_ParticleShape, class T_DerivedAttribute>
-HINLINE std::string
-ComputeGridValuePerFrame<T_ParticleShape, T_DerivedAttribute>::getName()
-{
-    return T_DerivedAttribute::getName();
-}
-
-template<class T_ParticleShape, class T_DerivedAttribute>
-template<class FrameType, class TVecSuperCell, class BoxTmp, typename T_Acc>
-DINLINE void
-ComputeGridValuePerFrame<T_ParticleShape, T_DerivedAttribute>::operator()
-(
-    T_Acc const & acc,
-    FrameType& frame,
-    const int localIdx,
-    const TVecSuperCell superCell,
-    BoxTmp& tmpBox
-)
-{
-    /* \todo in the future and if useful, the functor can be a parameter */
-    T_DerivedAttribute particleAttribute;
-
-    auto particle = frame[localIdx];
-
-    /* particle attributes: in-cell position and generic, derived attribute */
-    const floatD_X pos = particle[position_];
-    const auto particleAttr = particleAttribute( particle );
-
-    /** Shift to the cell the particle belongs to
-     * range of particleCell: [DataSpace<simDim>::create(0), TVecSuperCell]
-     */
-    const int particleCellIdx = particle[localCellIdx_];
-    const DataSpace<TVecSuperCell::dim> particleCell(
-        DataSpaceOperations<TVecSuperCell::dim>::map( superCell, particleCellIdx )
-    );
-    auto fieldTmpShiftToParticle = tmpBox.shift(particleCell);
-
-    /* loop around the particle's cell (according to shape) */
-    const DataSpace<simDim> lowMargin(LowerMargin().toRT());
-    const DataSpace<simDim> upMargin(UpperMargin().toRT());
-
-    const DataSpace<simDim> marginSpace(upMargin + lowMargin + 1);
-
-    const int numWriteCells = marginSpace.productOfComponents();
-
-    for (int i = 0; i < numWriteCells; ++i)
+    namespace particles
     {
-        /** for the current cell i the multi dimensional index currentCell is only positive:
-         * allowed range = [DataSpace<simDim>::create(0), LowerMargin+UpperMargin]
-         */
-        const DataSpace<simDim> currentCell = DataSpaceOperations<simDim>::map(marginSpace, i);
-
-        /** calculate the offset between the current cell i with simDim index currentCell
-         * and the cell of the particle (particleCell) in cells
-         */
-        const DataSpace<simDim> offsetParticleCellToCurrentCell = currentCell - lowMargin;
-
-        /** assign particle contribution component-wise to the lower left corner of
-         * the cell i
-         * \todo take care of non-yee cells
-         */
-        float_X assign( 1.0 );
-        for (uint32_t d = 0; d < simDim; ++d)
-            assign *= AssignmentFunction()(float_X(offsetParticleCellToCurrentCell[d]) - pos[d]);
-
-        /** add contribution of the particle times the generic attribute
-         * to cell i
-         * note: the .x() is used because FieldTmp is a scalar field with only
-         * one "x" component
-         */
-        atomicAdd(
-            &(fieldTmpShiftToParticle(offsetParticleCellToCurrentCell).x()),
-            assign * particleAttr,
-            ::alpaka::hierarchy::Threads{}
-        );
-    }
-}
-
-} // namespace particleToGrid
-} // namespace particles
+        namespace particleToGrid
+        {
+            template<class T_ParticleShape, class T_DerivedAttribute>
+            HDINLINE float1_64 ComputeGridValuePerFrame<T_ParticleShape, T_DerivedAttribute>::getUnit() const
+            {
+                return T_DerivedAttribute().getUnit();
+            }
+
+            template<class T_ParticleShape, class T_DerivedAttribute>
+            HINLINE std::vector<float_64> ComputeGridValuePerFrame<T_ParticleShape, T_DerivedAttribute>::
+                getUnitDimension() const
+            {
+                return T_DerivedAttribute().getUnitDimension();
+            }
+
+            template<class T_ParticleShape, class T_DerivedAttribute>
+            HINLINE std::string ComputeGridValuePerFrame<T_ParticleShape, T_DerivedAttribute>::getName()
+            {
+                return T_DerivedAttribute::getName();
+            }
+
+            template<class T_ParticleShape, class T_DerivedAttribute>
+            template<class FrameType, class TVecSuperCell, class BoxTmp, typename T_Acc>
+            DINLINE void ComputeGridValuePerFrame<T_ParticleShape, T_DerivedAttribute>::operator()(
+                T_Acc const& acc,
+                FrameType& frame,
+                const int localIdx,
+                const TVecSuperCell superCell,
+                BoxTmp& tmpBox)
+            {
+                /* \todo in the future and if useful, the functor can be a parameter */
+                T_DerivedAttribute particleAttribute;
+
+                auto particle = frame[localIdx];
+
+                /* particle attributes: in-cell position and generic, derived attribute */
+                const floatD_X pos = particle[position_];
+                const auto particleAttr = particleAttribute(particle);
+
+                /** Shift to the cell the particle belongs to
+                 * range of particleCell: [DataSpace<simDim>::create(0), TVecSuperCell]
+                 */
+                const int particleCellIdx = particle[localCellIdx_];
+                const DataSpace<TVecSuperCell::dim> particleCell(
+                    DataSpaceOperations<TVecSuperCell::dim>::map(superCell, particleCellIdx));
+                auto fieldTmpShiftToParticle = tmpBox.shift(particleCell);
+
+                /* loop around the particle's cell (according to shape) */
+                const DataSpace<simDim> lowMargin(LowerMargin().toRT());
+                const DataSpace<simDim> upMargin(UpperMargin().toRT());
+
+                const DataSpace<simDim> marginSpace(upMargin + lowMargin + 1);
+
+                const int numWriteCells = marginSpace.productOfComponents();
+
+                for(int i = 0; i < numWriteCells; ++i)
+                {
+                    /** for the current cell i the multi dimensional index currentCell is never negative:
+                     * allowed range = [DataSpace<simDim>::create(0), LowerMargin+UpperMargin]
+                     */
+                    const DataSpace<simDim> currentCell = DataSpaceOperations<simDim>::map(marginSpace, i);
+
+                    /** calculate the offset between the current cell i with simDim index currentCell
+                     * and the cell of the particle (particleCell) in cells
+                     */
+                    const DataSpace<simDim> offsetParticleCellToCurrentCell = currentCell - lowMargin;
+
+                    /** assign particle contribution component-wise to the lower left corner of
+                     * the cell i
+                     * \todo take care of non-yee cells
+                     */
+                    float_X assign(1.0);
+                    for(uint32_t d = 0; d < simDim; ++d)
+                        assign *= AssignmentFunction()(float_X(offsetParticleCellToCurrentCell[d]) - pos[d]);
+
+                    /** add contribution of the particle times the generic attribute
+                     * to cell i
+                     * note: the .x() is used because FieldTmp is a scalar field with only
+                     * one "x" component
+                     */
+                    cupla::atomicAdd(
+                        acc,
+                        &(fieldTmpShiftToParticle(offsetParticleCellToCurrentCell).x()),
+                        assign * particleAttr,
+                        ::alpaka::hierarchy::Threads{});
+                }
+            }
+
+        } // namespace particleToGrid
+    } // namespace particles
 } // namespace picongpu
diff --git a/include/picongpu/particles/particleToGrid/derivedAttributes/BoundElectronDensity.def b/include/picongpu/particles/particleToGrid/derivedAttributes/BoundElectronDensity.def
index c0b728b2ef..2741f64192 100644
--- a/include/picongpu/particles/particleToGrid/derivedAttributes/BoundElectronDensity.def
+++ b/include/picongpu/particles/particleToGrid/derivedAttributes/BoundElectronDensity.def
@@ -1,4 +1,4 @@
-/* Copyright 2015-2020 Axel Huebl
+/* Copyright 2015-2021 Axel Huebl
  *
  * This file is part of PIConGPU.
  *
@@ -30,86 +30,70 @@
 
 namespace picongpu
 {
-namespace particles
-{
-namespace particleToGrid
-{
-namespace derivedAttributes
-{
-    /** Density of Bound Electrons Operation for Particle to Grid Projections
-     *
-     * Derives a scalar density field from a particle species at runtime.
-     * Each value is mapped per cell according to the species' spatial shape.
-     *
-     * @note only makes sense for partially ionized ions
-     */
-    struct BoundElectronDensity
+    namespace particles
     {
-
-        HDINLINE float1_64
-        getUnit() const;
-
-        HINLINE std::vector< float_64 >
-        getUnitDimension() const
+        namespace particleToGrid
         {
-           /* L, M, T, I, theta, N, J
-            *
-            * Density is in inverse cubic meter: m^-3
-            *   -> L^-3
-            */
-           std::vector< float_64 > unitDimension( 7, 0.0 );
-           unitDimension.at( SIBaseUnits::length ) = -3.0;
+            namespace derivedAttributes
+            {
+                /** Density of Bound Electrons Operation for Particle to Grid Projections
+                 *
+                 * Derives a scalar density field from a particle species at runtime.
+                 * Each value is mapped per cell according to the species' spatial shape.
+                 *
+                 * @note only makes sense for partially ionized ions
+                 */
+                struct BoundElectronDensity
+                {
+                    HDINLINE float1_64 getUnit() const;
 
-           return unitDimension;
-        }
+                    HINLINE std::vector<float_64> getUnitDimension() const
+                    {
+                        /* L, M, T, I, theta, N, J
+                         *
+                         * Density is in inverse cubic meter: m^-3
+                         *   -> L^-3
+                         */
+                        std::vector<float_64> unitDimension(7, 0.0);
+                        unitDimension.at(SIBaseUnits::length) = -3.0;
 
-        static HINLINE
-        std::string
-        getName()
-        {
-            return "boundElectronDensity";
-        }
+                        return unitDimension;
+                    }
 
-        /** Calculate a new attribute per particle
-         *
-         * Returns a new (on-the-fly calculated) attribute of a particle
-         * that can then be mapped to the cells the particle contributes to.
-         * This method is called on a per-thread basis (each thread of a block
-         * handles a particle of a frame).
-         *
-         * @tparam T_Particle particle in the frame
-         * @param particle particle in the frame
-         *
-         * @return new attribute for the particle (type @see T_AttributeType)
-         */
-        template< class T_Particle >
-        DINLINE float_X
-        operator()( T_Particle& particle ) const;
-    };
-} // namespace derivedAttributes
-} // namespace particleToGrid
+                    static HINLINE std::string getName()
+                    {
+                        return "boundElectronDensity";
+                    }
 
-namespace traits
-{
-    template< typename T_Species >
-    struct SpeciesEligibleForSolver<
-        T_Species,
-        particleToGrid::derivedAttributes::BoundElectronDensity
-    >
-    {
-        using FrameType = typename T_Species::FrameType;
+                    /** Calculate a new attribute per particle
+                     *
+                     * Returns a new (on-the-fly calculated) attribute of a particle
+                     * that can then be mapped to the cells the particle contributes to.
+                     * This method is called on a per-thread basis (each thread of a block
+                     * handles a particle of a frame).
+                     *
+                     * @tparam T_Particle particle in the frame
+                     * @param particle particle in the frame
+                     *
+                     * @return new attribute for the particle (type @see T_AttributeType)
+                     */
+                    template<class T_Particle>
+                    DINLINE float_X operator()(T_Particle& particle) const;
+                };
+            } // namespace derivedAttributes
+        } // namespace particleToGrid
+
+        namespace traits
+        {
+            template<typename T_Species>
+            struct SpeciesEligibleForSolver<T_Species, particleToGrid::derivedAttributes::BoundElectronDensity>
+            {
+                using FrameType = typename T_Species::FrameType;
 
-        using RequiredIdentifiers = MakeSeq_t<
-            weighting,
-            position<>,
-            boundElectrons
-        >;
+                using RequiredIdentifiers = MakeSeq_t<weighting, position<>, boundElectrons>;
 
-        using type = typename pmacc::traits::HasIdentifiers<
-            FrameType,
-            RequiredIdentifiers
-        >::type;
-    };
-} // namespace traits
-} // namespace particles
+                using type = typename pmacc::traits::HasIdentifiers<FrameType, RequiredIdentifiers>::type;
+            };
+        } // namespace traits
+    } // namespace particles
 } // namespace picongpu
diff --git a/include/picongpu/particles/particleToGrid/derivedAttributes/BoundElectronDensity.hpp b/include/picongpu/particles/particleToGrid/derivedAttributes/BoundElectronDensity.hpp
index 199d5af94b..8721f3caa2 100644
--- a/include/picongpu/particles/particleToGrid/derivedAttributes/BoundElectronDensity.hpp
+++ b/include/picongpu/particles/particleToGrid/derivedAttributes/BoundElectronDensity.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2015-2020 Axel Huebl
+/* Copyright 2015-2021 Axel Huebl
  *
  * This file is part of PIConGPU.
  *
@@ -25,35 +25,32 @@
 
 namespace picongpu
 {
-namespace particles
-{
-namespace particleToGrid
-{
-namespace derivedAttributes
-{
-
-    HDINLINE float1_64
-    BoundElectronDensity::getUnit() const
+    namespace particles
     {
-        constexpr float_64 UNIT_VOLUME = UNIT_LENGTH * UNIT_LENGTH * UNIT_LENGTH;
-        return particles::TYPICAL_NUM_PARTICLES_PER_MACROPARTICLE / UNIT_VOLUME;
-    }
-
-    template< class T_Particle >
-    DINLINE float_X
-    BoundElectronDensity::operator()( T_Particle& particle ) const
-    {
-        // read existing attributes
-        float_X const weighting = particle[ weighting_ ];
-        float_X const boundElectrons = particle[ boundElectrons_ ];
-
-        // calculate new attribute
-        float_X const boundElectronDensity = weighting * boundElectrons /
-            ( particles::TYPICAL_NUM_PARTICLES_PER_MACROPARTICLE * CELL_VOLUME );
-
-        return boundElectronDensity;
-    }
-} // namespace derivedAttributes
-} // namespace particleToGrid
-} // namespace particles
+        namespace particleToGrid
+        {
+            namespace derivedAttributes
+            {
+                HDINLINE float1_64 BoundElectronDensity::getUnit() const
+                {
+                    constexpr float_64 UNIT_VOLUME = UNIT_LENGTH * UNIT_LENGTH * UNIT_LENGTH;
+                    return particles::TYPICAL_NUM_PARTICLES_PER_MACROPARTICLE / UNIT_VOLUME;
+                }
+
+                template<class T_Particle>
+                DINLINE float_X BoundElectronDensity::operator()(T_Particle& particle) const
+                {
+                    // read existing attributes
+                    float_X const weighting = particle[weighting_];
+                    float_X const boundElectrons = particle[boundElectrons_];
+
+                    // calculate new attribute
+                    float_X const boundElectronDensity = weighting * boundElectrons
+                        / (particles::TYPICAL_NUM_PARTICLES_PER_MACROPARTICLE * CELL_VOLUME);
+
+                    return boundElectronDensity;
+                }
+            } // namespace derivedAttributes
+        } // namespace particleToGrid
+    } // namespace particles
 } // namespace picongpu
diff --git a/include/picongpu/particles/particleToGrid/derivedAttributes/ChargeDensity.def b/include/picongpu/particles/particleToGrid/derivedAttributes/ChargeDensity.def
index 8340d9299e..d6d2a0a7ac 100644
--- a/include/picongpu/particles/particleToGrid/derivedAttributes/ChargeDensity.def
+++ b/include/picongpu/particles/particleToGrid/derivedAttributes/ChargeDensity.def
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Rene Widera
+/* Copyright 2013-2021 Axel Huebl, Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -33,99 +33,79 @@
 
 namespace picongpu
 {
-namespace particles
-{
-namespace particleToGrid
-{
-namespace derivedAttributes
-{
-    /** Charge Density Operation for Particle to Grid Projections
-     *
-     * Derives a scalar charge density field from a particle species at runtime.
-     * Each value is mapped per cell according to the species' spatial shape.
-     *
-     * @note for species that do not change their charge state, this is identical
-     *       to the density times the (constant) particles' charge,
-     *       @see CreateDensityOperation
-     */
-    struct ChargeDensity
+    namespace particles
     {
-
-        HDINLINE float1_64
-        getUnit() const;
-
-        HINLINE std::vector<float_64>
-        getUnitDimension() const
+        namespace particleToGrid
         {
-           /* L, M, T, I, theta, N, J
-            *
-            * ChargeDensity is in Coulomb / cubic meter: Q / m^3 = A * s / m^3
-            *   -> L^-3 * T * I
-            */
-           std::vector<float_64> unitDimension( 7, 0.0 );
-           unitDimension.at(SIBaseUnits::length) = -3.0;
-           unitDimension.at(SIBaseUnits::time)   =  1.0;
-           unitDimension.at(SIBaseUnits::electricCurrent) =  1.0;
-
-           return unitDimension;
-        }
-
-        HINLINE static
-        std::string
-        getName()
+            namespace derivedAttributes
+            {
+                /** Charge Density Operation for Particle to Grid Projections
+                 *
+                 * Derives a scalar charge density field from a particle species at runtime.
+                 * Each value is mapped per cell according to the species' spatial shape.
+                 *
+                 * @note for species that do not change their charge state, this is identical
+                 *       to the density times the (constant) particles' charge,
+                 *       @see CreateDensityOperation
+                 */
+                struct ChargeDensity
+                {
+                    HDINLINE float1_64 getUnit() const;
+
+                    HINLINE std::vector<float_64> getUnitDimension() const
+                    {
+                        /* L, M, T, I, theta, N, J
+                         *
+                         * ChargeDensity is in Coulomb / cubic meter: Q / m^3 = A * s / m^3
+                         *   -> L^-3 * T * I
+                         */
+                        std::vector<float_64> unitDimension(7, 0.0);
+                        unitDimension.at(SIBaseUnits::length) = -3.0;
+                        unitDimension.at(SIBaseUnits::time) = 1.0;
+                        unitDimension.at(SIBaseUnits::electricCurrent) = 1.0;
+
+                        return unitDimension;
+                    }
+
+                    HINLINE static std::string getName()
+                    {
+                        return "chargeDensity";
+                    }
+
+                    /** Calculate a new attribute per particle
+                     *
+                     * Returns a new (on-the-fly calculated) attribute of a particle
+                     * that can then be mapped to the cells the particle contributes to.
+                     * This method is called on a per-thread basis (each thread of a block
+                     * handles a particle of a frame).
+                     *
+                     * \tparam T_Particle type of the particle in the frame
+                     * \param particle particle in the frame
+                     *
+                     * \return new attribute for the particle, as float_X
+                     */
+                    template<class T_Particle>
+                    DINLINE float_X operator()(T_Particle& particle) const;
+                };
+            } // namespace derivedAttributes
+        } // namespace particleToGrid
+
+        namespace traits
         {
-            return "chargeDensity";
-        }
-
-        /** Calculate a new attribute  per particle
-         *
-         * Returns a new (on-the-fly calculated) attribute of a particle
-         * that can then be mapped to the cells the particle contributes to.
-         * This method is called on a per-thread basis (each thread of a block
-         * handles a particle of a frame).
-         *
-         * \tparam T_Particle particle in the frame
-         * \param particle particle in the frame
-         *
-         * \return new attribute for the particle (type \see T_AttributeType)
-         */
-        template< class T_Particle >
-        DINLINE float_X
-        operator()( T_Particle& particle ) const;
-    };
-} // namespace derivedAttributes
-} // namespace particleToGrid
-
-namespace traits
-{
-    template< typename T_Species >
-    struct SpeciesEligibleForSolver<
-        T_Species,
-        particleToGrid::derivedAttributes::ChargeDensity
-    >
-    {
-        using FrameType = typename T_Species::FrameType;
-
-        using RequiredIdentifiers = MakeSeq_t<
-            weighting,
-            position<>
-        >;
-
-        using SpeciesHasIdentifiers = typename pmacc::traits::HasIdentifiers<
-            FrameType,
-            RequiredIdentifiers
-        >::type;
-
-        using SpeciesHasFlags = typename pmacc::traits::HasFlag<
-            FrameType,
-            chargeRatio<>
-        >::type;
-
-        using type = typename bmpl::and_<
-            SpeciesHasIdentifiers,
-            SpeciesHasFlags
-        >;
-    };
-} // namespace traits
-} // namespace particles
+            template<typename T_Species>
+            struct SpeciesEligibleForSolver<T_Species, particleToGrid::derivedAttributes::ChargeDensity>
+            {
+                using FrameType = typename T_Species::FrameType;
+
+                using RequiredIdentifiers = MakeSeq_t<weighting, position<>>;
+
+                using SpeciesHasIdentifiers =
+                    typename pmacc::traits::HasIdentifiers<FrameType, RequiredIdentifiers>::type;
+
+                using SpeciesHasFlags = typename pmacc::traits::HasFlag<FrameType, chargeRatio<>>::type;
+
+                using type = typename bmpl::and_<SpeciesHasIdentifiers, SpeciesHasFlags>;
+            };
+        } // namespace traits
+    } // namespace particles
 } // namespace picongpu
diff --git a/include/picongpu/particles/particleToGrid/derivedAttributes/ChargeDensity.hpp b/include/picongpu/particles/particleToGrid/derivedAttributes/ChargeDensity.hpp
index 63c6db51f6..e91f506c0c 100644
--- a/include/picongpu/particles/particleToGrid/derivedAttributes/ChargeDensity.hpp
+++ b/include/picongpu/particles/particleToGrid/derivedAttributes/ChargeDensity.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Rene Widera
+/* Copyright 2013-2021 Axel Huebl, Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -26,35 +26,32 @@
 
 namespace picongpu
 {
-namespace particles
-{
-namespace particleToGrid
-{
-namespace derivedAttributes
-{
-
-    HDINLINE float1_64
-    ChargeDensity::getUnit() const
-    {
-        const float_64 UNIT_VOLUME = (UNIT_LENGTH * UNIT_LENGTH * UNIT_LENGTH);
-        return UNIT_CHARGE / UNIT_VOLUME;
-    }
-
-    template< class T_Particle >
-    DINLINE float_X
-    ChargeDensity::operator()( T_Particle& particle ) const
+    namespace particles
     {
-        /* read existing attributes */
-        const float_X weighting = particle[weighting_];
-        const float_X charge = attribute::getCharge( weighting, particle );
-
-        /* calculate new attribute */
-        const float_X particleChargeDensity = charge / CELL_VOLUME;
-
-        /* return attribute */
-        return particleChargeDensity;
-    }
-} // namespace derivedAttributes
-} // namespace particleToGrid
-} // namespace particles
+        namespace particleToGrid
+        {
+            namespace derivedAttributes
+            {
+                HDINLINE float1_64 ChargeDensity::getUnit() const
+                {
+                    const float_64 UNIT_VOLUME = (UNIT_LENGTH * UNIT_LENGTH * UNIT_LENGTH);
+                    return UNIT_CHARGE / UNIT_VOLUME;
+                }
+
+                template<class T_Particle>
+                DINLINE float_X ChargeDensity::operator()(T_Particle& particle) const
+                {
+                    /* read existing attributes */
+                    const float_X weighting = particle[weighting_];
+                    const float_X charge = attribute::getCharge(weighting, particle);
+
+                    /* calculate new attribute */
+                    const float_X particleChargeDensity = charge / CELL_VOLUME;
+
+                    /* return attribute */
+                    return particleChargeDensity;
+                }
+            } // namespace derivedAttributes
+        } // namespace particleToGrid
+    } // namespace particles
 } // namespace picongpu
diff --git a/include/picongpu/particles/particleToGrid/derivedAttributes/Counter.def b/include/picongpu/particles/particleToGrid/derivedAttributes/Counter.def
index 1d3f8e78ac..7130f8213f 100644
--- a/include/picongpu/particles/particleToGrid/derivedAttributes/Counter.def
+++ b/include/picongpu/particles/particleToGrid/derivedAttributes/Counter.def
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Rene Widera
+/* Copyright 2013-2021 Axel Huebl, Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -29,88 +29,74 @@
 
 namespace picongpu
 {
-namespace particles
-{
-namespace particleToGrid
-{
-namespace derivedAttributes
-{
-    /** Real-Particle Counter Operation for Particle to Grid Projections
-     *
-     * Derives a scalar field with real particle numbers per cell from a particle
-     * species at runtime.
-     * Each macro particle's weighting is assigned straight to the cell it belongs
-     * to, which is in most cases a floor operation in space (and not necessarily
-     * the "nearest" cell-origin).
-     *
-     * @note Use this only for debug purposes, since the deposition "shape" is
-     *       non-physical (inconsistent with charge & momentum-conserving shapes).
-     *       Be aware that this is NOT the same as NGP (0. order shape) assignment
-     *       in a staggered grid.
-     */
-    struct Counter
+    namespace particles
     {
-
-        HDINLINE float1_64
-        getUnit() const;
-
-        HINLINE std::vector<float_64>
-        getUnitDimension() const
+        namespace particleToGrid
         {
-           /* L, M, T, I, theta, N, J
-            *
-            * Counter is unitless
-            */
-           std::vector<float_64> unitDimension( 7, 0.0 );
+            namespace derivedAttributes
+            {
+                /** Real-Particle Counter Operation for Particle to Grid Projections
+                 *
+                 * Derives a scalar field with real particle numbers per cell from a particle
+                 * species at runtime.
+                 * Each macro particle's weighting is assigned straight to the cell it belongs
+                 * to, which is in most cases a floor operation in space (and not necessarily
+                 * the "nearest" cell-origin).
+                 *
+                 * @note Use this only for debug purposes, since the deposition "shape" is
+                 *       non-physical (inconsistent with charge & momentum-conserving shapes).
+                 *       Be aware that this is NOT the same as NGP (0. order shape) assignment
+                 *       in a staggered grid.
+                 */
+                struct Counter
+                {
+                    HDINLINE float1_64 getUnit() const;
 
-           return unitDimension;
-        }
+                    HINLINE std::vector<float_64> getUnitDimension() const
+                    {
+                        /* L, M, T, I, theta, N, J
+                         *
+                         * Counter is unitless
+                         */
+                        std::vector<float_64> unitDimension(7, 0.0);
 
-        HINLINE static
-        std::string
-        getName()
-        {
-            return "particleCounter";
-        }
+                        return unitDimension;
+                    }
 
-        /** Calculate a new attribute  per particle
-         *
-         * Returns a new (on-the-fly calculated) attribute of a particle
-         * that can then be mapped to the cells the particle contributes to.
-         * This method is called on a per-thread basis (each thread of a block
-         * handles a particle of a frame).
-         *
-         * \tparam T_Particle particle in the frame
-         * \param particle particle in the frame
-         *
-         * \return new attribute for the particle (type \see T_AttributeType)
-         */
-        template< class T_Particle >
-        DINLINE float_X
-        operator()( T_Particle& particle ) const;
-    };
-} // namespace derivedAttributes
-} // namespace particleToGrid
+                    HINLINE static std::string getName()
+                    {
+                        return "particleCounter";
+                    }
 
-namespace traits
-{
-    template< typename T_Species >
-    struct SpeciesEligibleForSolver<
-        T_Species,
-        particleToGrid::derivedAttributes::Counter
-    >
-    {
-        using FrameType = typename T_Species::FrameType;
+                    /** Calculate a new attribute per particle
+                     *
+                     * Returns a new (on-the-fly calculated) attribute of a particle
+                     * that can then be mapped to the cells the particle contributes to.
+                     * This method is called on a per-thread basis (each thread of a block
+                     * handles a particle of a frame).
+                     *
+                     * \tparam T_Particle type of the particle in the frame
+                     * \param particle particle in the frame
+                     *
+                     * \return new attribute for the particle, as float_X
+                     */
+                    template<class T_Particle>
+                    DINLINE float_X operator()(T_Particle& particle) const;
+                };
+            } // namespace derivedAttributes
+        } // namespace particleToGrid
+
+        namespace traits
+        {
+            template<typename T_Species>
+            struct SpeciesEligibleForSolver<T_Species, particleToGrid::derivedAttributes::Counter>
+            {
+                using FrameType = typename T_Species::FrameType;
 
-        using RequiredIdentifiers = MakeSeq_t<
-            weighting
-        >;
+                using RequiredIdentifiers = MakeSeq_t<weighting>;
 
-        using type = typename pmacc::traits::HasIdentifiers<
-            FrameType,
-            RequiredIdentifiers
-        >::type;
-    };
-} // namespace traits
-} // namespace particles
+                using type = typename pmacc::traits::HasIdentifiers<FrameType, RequiredIdentifiers>::type;
+            };
+        } // namespace traits
+    } // namespace particles
 } // namespace picongpu
diff --git a/include/picongpu/particles/particleToGrid/derivedAttributes/Counter.hpp b/include/picongpu/particles/particleToGrid/derivedAttributes/Counter.hpp
index 453511d980..8a0949aad8 100644
--- a/include/picongpu/particles/particleToGrid/derivedAttributes/Counter.hpp
+++ b/include/picongpu/particles/particleToGrid/derivedAttributes/Counter.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Rene Widera
+/* Copyright 2013-2021 Axel Huebl, Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -26,34 +26,30 @@
 
 namespace picongpu
 {
-namespace particles
-{
-namespace particleToGrid
-{
-namespace derivedAttributes
-{
-
-    HDINLINE float1_64
-    Counter::getUnit() const
-    {
-        return particles::TYPICAL_NUM_PARTICLES_PER_MACROPARTICLE;
-    }
-
-    template< class T_Particle >
-    DINLINE float_X
-    Counter::operator()( T_Particle& particle ) const
+    namespace particles
     {
-        /* read existing attributes */
-        const float_X weighting = particle[weighting_];
-
-        /* calculate new attribute */
-        const float_X particleCounter = weighting /
-            particles::TYPICAL_NUM_PARTICLES_PER_MACROPARTICLE;
-
-        /* return attribute */
-        return particleCounter;
-    }
-} // namespace derivedAttributes
-} // namespace particleToGrid
-} // namespace particles
+        namespace particleToGrid
+        {
+            namespace derivedAttributes
+            {
+                HDINLINE float1_64 Counter::getUnit() const
+                {
+                    return particles::TYPICAL_NUM_PARTICLES_PER_MACROPARTICLE;
+                }
+
+                template<class T_Particle>
+                DINLINE float_X Counter::operator()(T_Particle& particle) const
+                {
+                    /* read existing attributes */
+                    const float_X weighting = particle[weighting_];
+
+                    /* calculate new attribute */
+                    const float_X particleCounter = weighting / particles::TYPICAL_NUM_PARTICLES_PER_MACROPARTICLE;
+
+                    /* return attribute */
+                    return particleCounter;
+                }
+            } // namespace derivedAttributes
+        } // namespace particleToGrid
+    } // namespace particles
 } // namespace picongpu
diff --git a/include/picongpu/particles/particleToGrid/derivedAttributes/Density.def b/include/picongpu/particles/particleToGrid/derivedAttributes/Density.def
index f2682d0b0a..e040cb9d05 100644
--- a/include/picongpu/particles/particleToGrid/derivedAttributes/Density.def
+++ b/include/picongpu/particles/particleToGrid/derivedAttributes/Density.def
@@ -1,4 +1,4 @@
-/* Copyright 2015-2020 Axel Huebl
+/* Copyright 2015-2021 Axel Huebl
  *
  * This file is part of PIConGPU.
  *
@@ -30,83 +30,68 @@
 
 namespace picongpu
 {
-namespace particles
-{
-namespace particleToGrid
-{
-namespace derivedAttributes
-{
-    /** Density Operation for Particle to Grid Projections
-     *
-     * Derives a scalar density field from a particle species at runtime.
-     * Each value is mapped per cell according to the species' spatial shape.
-     */
-    struct Density
+    namespace particles
     {
-
-        HDINLINE float1_64
-        getUnit() const;
-
-        HINLINE std::vector<float_64>
-        getUnitDimension() const
+        namespace particleToGrid
         {
-           /* L, M, T, I, theta, N, J
-            *
-            * Density is in inverse cubic meter: m^-3
-            *   -> L^-3
-            */
-           std::vector<float_64> unitDimension( 7, 0.0 );
-           unitDimension.at(SIBaseUnits::length) = -3.0;
+            namespace derivedAttributes
+            {
+                /** Density Operation for Particle to Grid Projections
+                 *
+                 * Derives a scalar density field from a particle species at runtime.
+                 * Each value is mapped per cell according to the species' spatial shape.
+                 */
+                struct Density
+                {
+                    HDINLINE float1_64 getUnit() const;
 
-           return unitDimension;
-        }
+                    HINLINE std::vector<float_64> getUnitDimension() const
+                    {
+                        /* L, M, T, I, theta, N, J
+                         *
+                         * Density is in inverse cubic meter: m^-3
+                         *   -> L^-3
+                         */
+                        std::vector<float_64> unitDimension(7, 0.0);
+                        unitDimension.at(SIBaseUnits::length) = -3.0;
 
-        HINLINE static
-        std::string
-        getName()
-        {
-            return "density";
-        }
+                        return unitDimension;
+                    }
 
-        /** Calculate a new attribute  per particle
-         *
-         * Returns a new (on-the-fly calculated) attribute of a particle
-         * that can then be mapped to the cells the particle contributes to.
-         * This method is called on a per-thread basis (each thread of a block
-         * handles a particle of a frame).
-         *
-         * \tparam T_Particle particle in the frame
-         * \param particle particle in the frame
-         *
-         * \return new attribute for the particle (type \see T_AttributeType)
-         */
-        template< class T_Particle >
-        DINLINE float_X
-        operator()( T_Particle& particle ) const;
-    };
-} // namespace derivedAttributes
-} // namespace particleToGrid
+                    HINLINE static std::string getName()
+                    {
+                        return "density";
+                    }
 
-namespace traits
-{
-    template< typename T_Species >
-    struct SpeciesEligibleForSolver<
-        T_Species,
-        particleToGrid::derivedAttributes::Density
-    >
-    {
-        using FrameType = typename T_Species::FrameType;
+                    /** Calculate a new attribute per particle
+                     *
+                     * Returns a new (on-the-fly calculated) attribute of a particle
+                     * that can then be mapped to the cells the particle contributes to.
+                     * This method is called on a per-thread basis (each thread of a block
+                     * handles a particle of a frame).
+                     *
+                     * \tparam T_Particle type of the particle in the frame
+                     * \param particle particle in the frame
+                     *
+                     * \return new attribute for the particle, as float_X
+                     */
+                    template<class T_Particle>
+                    DINLINE float_X operator()(T_Particle& particle) const;
+                };
+            } // namespace derivedAttributes
+        } // namespace particleToGrid
+
+        namespace traits
+        {
+            template<typename T_Species>
+            struct SpeciesEligibleForSolver<T_Species, particleToGrid::derivedAttributes::Density>
+            {
+                using FrameType = typename T_Species::FrameType;
 
-        using RequiredIdentifiers = MakeSeq_t<
-            weighting,
-            position<>
-        >;
+                using RequiredIdentifiers = MakeSeq_t<weighting, position<>>;
 
-        using type = typename pmacc::traits::HasIdentifiers<
-            FrameType,
-            RequiredIdentifiers
-        >::type;
-    };
-} // namespace traits
-} // namespace particles
+                using type = typename pmacc::traits::HasIdentifiers<FrameType, RequiredIdentifiers>::type;
+            };
+        } // namespace traits
+    } // namespace particles
 } // namespace picongpu
diff --git a/include/picongpu/particles/particleToGrid/derivedAttributes/Density.hpp b/include/picongpu/particles/particleToGrid/derivedAttributes/Density.hpp
index 4cc88fa7d6..b8501fc6ef 100644
--- a/include/picongpu/particles/particleToGrid/derivedAttributes/Density.hpp
+++ b/include/picongpu/particles/particleToGrid/derivedAttributes/Density.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2015-2020 Axel Huebl
+/* Copyright 2015-2021 Axel Huebl
  *
  * This file is part of PIConGPU.
  *
@@ -26,35 +26,32 @@
 
 namespace picongpu
 {
-namespace particles
-{
-namespace particleToGrid
-{
-namespace derivedAttributes
-{
-
-    HDINLINE float1_64
-    Density::getUnit() const
-    {
-        const float_64 UNIT_VOLUME = (UNIT_LENGTH * UNIT_LENGTH * UNIT_LENGTH);
-        return particles::TYPICAL_NUM_PARTICLES_PER_MACROPARTICLE / UNIT_VOLUME;
-    }
-
-    template< class T_Particle >
-    DINLINE float_X
-    Density::operator()( T_Particle& particle ) const
+    namespace particles
     {
-        /* read existing attributes */
-        const float_X weighting = particle[weighting_];
-
-        /* calculate new attribute */
-        const float_X particleDensity = weighting /
-            ( particles::TYPICAL_NUM_PARTICLES_PER_MACROPARTICLE * CELL_VOLUME );
-
-        /* return attribute */
-        return particleDensity;
-    }
-} // namespace derivedAttributes
-} // namespace particleToGrid
-} // namespace particles
+        namespace particleToGrid
+        {
+            namespace derivedAttributes
+            {
+                HDINLINE float1_64 Density::getUnit() const
+                {
+                    const float_64 UNIT_VOLUME = (UNIT_LENGTH * UNIT_LENGTH * UNIT_LENGTH);
+                    return particles::TYPICAL_NUM_PARTICLES_PER_MACROPARTICLE / UNIT_VOLUME;
+                }
+
+                template<class T_Particle>
+                DINLINE float_X Density::operator()(T_Particle& particle) const
+                {
+                    /* read existing attributes */
+                    const float_X weighting = particle[weighting_];
+
+                    /* calculate new attribute */
+                    const float_X particleDensity
+                        = weighting / (particles::TYPICAL_NUM_PARTICLES_PER_MACROPARTICLE * CELL_VOLUME);
+
+                    /* return attribute */
+                    return particleDensity;
+                }
+            } // namespace derivedAttributes
+        } // namespace particleToGrid
+    } // namespace particles
 } // namespace picongpu
diff --git a/include/picongpu/particles/particleToGrid/derivedAttributes/DerivedAttributes.def b/include/picongpu/particles/particleToGrid/derivedAttributes/DerivedAttributes.def
index ab25f4267c..d64d0fa48e 100644
--- a/include/picongpu/particles/particleToGrid/derivedAttributes/DerivedAttributes.def
+++ b/include/picongpu/particles/particleToGrid/derivedAttributes/DerivedAttributes.def
@@ -1,4 +1,4 @@
-/* Copyright 2015-2020 Axel Huebl, Richard Pausch
+/* Copyright 2015-2021 Axel Huebl, Richard Pausch
  *
  * This file is part of PIConGPU.
  *
diff --git a/include/picongpu/particles/particleToGrid/derivedAttributes/DerivedAttributes.hpp b/include/picongpu/particles/particleToGrid/derivedAttributes/DerivedAttributes.hpp
index cd396fdc1b..a1a605d8e0 100644
--- a/include/picongpu/particles/particleToGrid/derivedAttributes/DerivedAttributes.hpp
+++ b/include/picongpu/particles/particleToGrid/derivedAttributes/DerivedAttributes.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2015-2020 Axel Huebl, Richard Pausch
+/* Copyright 2015-2021 Axel Huebl, Richard Pausch
  *
  * This file is part of PIConGPU.
  *
diff --git a/include/picongpu/particles/particleToGrid/derivedAttributes/Energy.def b/include/picongpu/particles/particleToGrid/derivedAttributes/Energy.def
index 42128a0f6e..eb1565e6f1 100644
--- a/include/picongpu/particles/particleToGrid/derivedAttributes/Energy.def
+++ b/include/picongpu/particles/particleToGrid/derivedAttributes/Energy.def
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Rene Widera
+/* Copyright 2013-2021 Axel Huebl, Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -33,100 +33,79 @@
 
 namespace picongpu
 {
-namespace particles
-{
-namespace particleToGrid
-{
-namespace derivedAttributes
-{
-    /** Kinetic Energy Operation for Particle to Grid Projections
-     *
-     * Derives a scalar field for summed kinetic particle energy from a particle
-     * species at runtime.
-     * Each value is mapped per cell according to the species' spatial shape.
-     *
-     * @note this is the same as @see CreateEnergyDensityOperation times the cell
-     *       volume
-     */
-    struct Energy
+    namespace particles
     {
-
-        HDINLINE float1_64
-        getUnit() const;
-
-        HINLINE std::vector<float_64>
-        getUnitDimension() const
+        namespace particleToGrid
         {
-           /* L, M, T, I, theta, N, J
-            *
-            * Energy is in Joule: J = kg * m^2 / s^2
-            *   -> L^2 * M * T^-2
-            */
-           std::vector<float_64> unitDimension( 7, 0.0 );
-           unitDimension.at(SIBaseUnits::length) =  2.0;
-           unitDimension.at(SIBaseUnits::mass)   =  1.0;
-           unitDimension.at(SIBaseUnits::time)   = -2.0;
-
-           return unitDimension;
-        }
-
-        HINLINE static
-        std::string
-        getName()
+            namespace derivedAttributes
+            {
+                /** Kinetic Energy Operation for Particle to Grid Projections
+                 *
+                 * Derives a scalar field for summed kinetic particle energy from a particle
+                 * species at runtime.
+                 * Each value is mapped per cell according to the species' spatial shape.
+                 *
+                 * @note this is the same as @see CreateEnergyDensityOperation times the cell
+                 *       volume
+                 */
+                struct Energy
+                {
+                    HDINLINE float1_64 getUnit() const;
+
+                    HINLINE std::vector<float_64> getUnitDimension() const
+                    {
+                        /* L, M, T, I, theta, N, J
+                         *
+                         * Energy is in Joule: J = kg * m^2 / s^2
+                         *   -> L^2 * M * T^-2
+                         */
+                        std::vector<float_64> unitDimension(7, 0.0);
+                        unitDimension.at(SIBaseUnits::length) = 2.0;
+                        unitDimension.at(SIBaseUnits::mass) = 1.0;
+                        unitDimension.at(SIBaseUnits::time) = -2.0;
+
+                        return unitDimension;
+                    }
+
+                    HINLINE static std::string getName()
+                    {
+                        return "particleEnergy";
+                    }
+
+                    /** Calculate a new attribute per particle
+                     *
+                     * Returns a new (on-the-fly calculated) attribute of a particle
+                     * that can then be mapped to the cells the particle contributes to.
+                     * This method is called on a per-thread basis (each thread of a block
+                     * handles a particle of a frame).
+                     *
+                     * \tparam T_Particle type of the particle in the frame
+                     * \param particle particle in the frame
+                     *
+                     * \return new attribute for the particle, as float_X
+                     */
+                    template<class T_Particle>
+                    DINLINE float_X operator()(T_Particle& particle) const;
+                };
+            } // namespace derivedAttributes
+        } // namespace particleToGrid
+
+        namespace traits
         {
-            return "particleEnergy";
-        }
-
-        /** Calculate a new attribute  per particle
-         *
-         * Returns a new (on-the-fly calculated) attribute of a particle
-         * that can then be mapped to the cells the particle contributes to.
-         * This method is called on a per-thread basis (each thread of a block
-         * handles a particle of a frame).
-         *
-         * \tparam T_Particle particle in the frame
-         * \param particle particle in the frame
-         *
-         * \return new attribute for the particle (type \see T_AttributeType)
-         */
-        template< class T_Particle >
-        DINLINE float_X
-        operator()( T_Particle& particle ) const;
-    };
-} // namespace derivedAttributes
-} // namespace particleToGrid
-
-namespace traits
-{
-    template< typename T_Species >
-    struct SpeciesEligibleForSolver<
-        T_Species,
-        particleToGrid::derivedAttributes::Energy
-    >
-    {
-        using FrameType = typename T_Species::FrameType;
-
-        using RequiredIdentifiers = MakeSeq_t<
-            weighting,
-            position<>,
-            momentum
-        >;
-
-        using SpeciesHasIdentifiers = typename pmacc::traits::HasIdentifiers<
-            FrameType,
-            RequiredIdentifiers
-        >::type;
-
-        using SpeciesHasFlags = typename pmacc::traits::HasFlag<
-            FrameType,
-            massRatio<>
-        >::type;
-
-        using type = typename bmpl::and_<
-            SpeciesHasIdentifiers,
-            SpeciesHasFlags
-        >;
-    };
-} // namespace traits
-} // namespace particles
+            template<typename T_Species>
+            struct SpeciesEligibleForSolver<T_Species, particleToGrid::derivedAttributes::Energy>
+            {
+                using FrameType = typename T_Species::FrameType;
+
+                using RequiredIdentifiers = MakeSeq_t<weighting, position<>, momentum>;
+
+                using SpeciesHasIdentifiers =
+                    typename pmacc::traits::HasIdentifiers<FrameType, RequiredIdentifiers>::type;
+
+                using SpeciesHasFlags = typename pmacc::traits::HasFlag<FrameType, massRatio<>>::type;
+
+                using type = typename bmpl::and_<SpeciesHasIdentifiers, SpeciesHasFlags>;
+            };
+        } // namespace traits
+    } // namespace particles
 } // namespace picongpu
diff --git a/include/picongpu/particles/particleToGrid/derivedAttributes/Energy.hpp b/include/picongpu/particles/particleToGrid/derivedAttributes/Energy.hpp
index ae36c2d87c..fbad56efd8 100644
--- a/include/picongpu/particles/particleToGrid/derivedAttributes/Energy.hpp
+++ b/include/picongpu/particles/particleToGrid/derivedAttributes/Energy.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Rene Widera
+/* Copyright 2013-2021 Axel Huebl, Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -27,31 +27,28 @@
 
 namespace picongpu
 {
-namespace particles
-{
-namespace particleToGrid
-{
-namespace derivedAttributes
-{
-
-    HDINLINE float1_64
-    Energy::getUnit() const
+    namespace particles
     {
-        return UNIT_ENERGY;
-    }
-
-    template< class T_Particle >
-    DINLINE float_X
-    Energy::operator()( T_Particle& particle ) const
-    {
-        /* read existing attributes */
-        const float_X weighting = particle[weighting_];
-        const float3_X mom = particle[momentum_];
-        const float_X mass = attribute::getMass( weighting, particle );
-
-        return KinEnergy<>()( mom, mass );
-    }
-} // namespace derivedAttributes
-} // namespace particleToGrid
-} // namespace particles
+        namespace particleToGrid
+        {
+            namespace derivedAttributes
+            {
+                HDINLINE float1_64 Energy::getUnit() const
+                {
+                    return UNIT_ENERGY;
+                }
+
+                template<class T_Particle>
+                DINLINE float_X Energy::operator()(T_Particle& particle) const
+                {
+                    /* read existing attributes */
+                    const float_X weighting = particle[weighting_];
+                    const float3_X mom = particle[momentum_];
+                    const float_X mass = attribute::getMass(weighting, particle);
+
+                    return KinEnergy<>()(mom, mass);
+                }
+            } // namespace derivedAttributes
+        } // namespace particleToGrid
+    } // namespace particles
 } // namespace picongpu
diff --git a/include/picongpu/particles/particleToGrid/derivedAttributes/EnergyDensity.def b/include/picongpu/particles/particleToGrid/derivedAttributes/EnergyDensity.def
index 0a92d1a156..098048b0f6 100644
--- a/include/picongpu/particles/particleToGrid/derivedAttributes/EnergyDensity.def
+++ b/include/picongpu/particles/particleToGrid/derivedAttributes/EnergyDensity.def
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Rene Widera
+/* Copyright 2013-2021 Axel Huebl, Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -33,102 +33,81 @@
 
 namespace picongpu
 {
-namespace particles
-{
-namespace particleToGrid
-{
-namespace derivedAttributes
-{
-    /** Energy Density Operation for Particle to Grid Projections
-     *
-     * Derives a scalar field for average kinetic particle energy per cell times the
-     * particle density from a particle species at runtime.
-     * Each value is mapped per cell according to the species' spatial shape.
-     *
-     * @note this is the same as the sum of kinetic particle energy
-     *       divided by a constant for the cell volume
-     *       @see CreateEnergyOperation
-     */
-    struct EnergyDensity
+    namespace particles
     {
-
-        HDINLINE float1_64
-        getUnit() const;
-
-        HINLINE std::vector<float_64>
-        getUnitDimension() const
+        namespace particleToGrid
         {
-           /* L, M, T, I, theta, N, J
-            *
-            * EnergyDensity is in Joule / cubic meter: J / m^3 = kg * m^2 / s^2 / m^3
-            *                                                  = kg / (s^2 * m)
-            *   -> L^-1 * M * T^-2
-            */
-           std::vector<float_64> unitDimension( 7, 0.0 );
-           unitDimension.at(SIBaseUnits::length) = -1.0;
-           unitDimension.at(SIBaseUnits::mass)   =  1.0;
-           unitDimension.at(SIBaseUnits::time)   = -2.0;
-
-           return unitDimension;
-        }
-
-        HINLINE static
-        std::string
-        getName()
+            namespace derivedAttributes
+            {
+                /** Energy Density Operation for Particle to Grid Projections
+                 *
+                 * Derives a scalar field for average kinetic particle energy per cell times the
+                 * particle density from a particle species at runtime.
+                 * Each value is mapped per cell according to the species' spatial shape.
+                 *
+                 * @note this is the same as the sum of kinetic particle energy
+                 *       divided by a constant for the cell volume
+                 *       @see CreateEnergyOperation
+                 */
+                struct EnergyDensity
+                {
+                    HDINLINE float1_64 getUnit() const;
+
+                    HINLINE std::vector<float_64> getUnitDimension() const
+                    {
+                        /* L, M, T, I, theta, N, J
+                         *
+                         * EnergyDensity is in Joule / cubic meter: J / m^3 = kg * m^2 / s^2 / m^3
+                         *                                                  = kg / (s^2 * m)
+                         *   -> L^-1 * M * T^-2
+                         */
+                        std::vector<float_64> unitDimension(7, 0.0);
+                        unitDimension.at(SIBaseUnits::length) = -1.0;
+                        unitDimension.at(SIBaseUnits::mass) = 1.0;
+                        unitDimension.at(SIBaseUnits::time) = -2.0;
+
+                        return unitDimension;
+                    }
+
+                    HINLINE static std::string getName()
+                    {
+                        return "energyDensity";
+                    }
+
+                    /** Calculate a new attribute per particle
+                     *
+                     * Returns a new (on-the-fly calculated) attribute of a particle
+                     * that can then be mapped to the cells the particle contributes to.
+                     * This method is called on a per-thread basis (each thread of a block
+                     * handles a particle of a frame).
+                     *
+                     * \tparam T_Particle type of the particle in the frame
+                     * \param particle particle in the frame
+                     *
+                     * \return new attribute for the particle (of type float_X)
+                     */
+                    template<class T_Particle>
+                    DINLINE float_X operator()(T_Particle& particle) const;
+                };
+            } // namespace derivedAttributes
+        } // namespace particleToGrid
+
+        namespace traits
         {
-            return "energyDensity";
-        }
-
-        /** Calculate a new attribute  per particle
-         *
-         * Returns a new (on-the-fly calculated) attribute of a particle
-         * that can then be mapped to the cells the particle contributes to.
-         * This method is called on a per-thread basis (each thread of a block
-         * handles a particle of a frame).
-         *
-         * \tparam T_Particle particle in the frame
-         * \param particle particle in the frame
-         *
-         * \return new attribute for the particle (type \see T_AttributeType)
-         */
-        template< class T_Particle >
-        DINLINE float_X
-        operator()( T_Particle& particle ) const;
-    };
-} // namespace derivedAttributes
-} // namespace particleToGrid
-
-namespace traits
-{
-    template< typename T_Species >
-    struct SpeciesEligibleForSolver<
-        T_Species,
-        particleToGrid::derivedAttributes::EnergyDensity
-    >
-    {
-        using FrameType = typename T_Species::FrameType;
-
-        using RequiredIdentifiers = MakeSeq_t<
-            weighting,
-            position<>,
-            momentum
-        >;
-
-        using SpeciesHasIdentifiers = typename pmacc::traits::HasIdentifiers<
-            FrameType,
-            RequiredIdentifiers
-        >::type;
-
-        using SpeciesHasFlags = typename pmacc::traits::HasFlag<
-            FrameType,
-            massRatio<>
-        >::type;
-
-        using type = typename bmpl::and_<
-            SpeciesHasIdentifiers,
-            SpeciesHasFlags
-        >;
-    };
-} // namespace traits
-} // namespace particles
+            template<typename T_Species>
+            struct SpeciesEligibleForSolver<T_Species, particleToGrid::derivedAttributes::EnergyDensity>
+            {
+                using FrameType = typename T_Species::FrameType;
+
+                using RequiredIdentifiers = MakeSeq_t<weighting, position<>, momentum>;
+
+                using SpeciesHasIdentifiers =
+                    typename pmacc::traits::HasIdentifiers<FrameType, RequiredIdentifiers>::type;
+
+                using SpeciesHasFlags = typename pmacc::traits::HasFlag<FrameType, massRatio<>>::type;
+
+                using type = typename bmpl::and_<SpeciesHasIdentifiers, SpeciesHasFlags>;
+            };
+        } // namespace traits
+    } // namespace particles
 } // namespace picongpu
diff --git a/include/picongpu/particles/particleToGrid/derivedAttributes/EnergyDensity.hpp b/include/picongpu/particles/particleToGrid/derivedAttributes/EnergyDensity.hpp
index d0b336f569..456d029de8 100644
--- a/include/picongpu/particles/particleToGrid/derivedAttributes/EnergyDensity.hpp
+++ b/include/picongpu/particles/particleToGrid/derivedAttributes/EnergyDensity.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Rene Widera, Heiko Burau
+/* Copyright 2013-2021 Axel Huebl, Rene Widera, Heiko Burau
  *
  * This file is part of PIConGPU.
  *
@@ -27,34 +27,31 @@
 
 namespace picongpu
 {
-namespace particles
-{
-namespace particleToGrid
-{
-namespace derivedAttributes
-{
-
-    HDINLINE float1_64
-    EnergyDensity::getUnit() const
-    {
-        constexpr float_64 UNIT_VOLUME = (UNIT_LENGTH * UNIT_LENGTH * UNIT_LENGTH);
-        return UNIT_ENERGY / UNIT_VOLUME;
-    }
-
-    template< class T_Particle >
-    DINLINE float_X
-    EnergyDensity::operator()( T_Particle& particle ) const
+    namespace particles
     {
-        /* read existing attributes */
-        const float_X weighting = particle[weighting_];
-        const float3_X mom = particle[momentum_];
-        const float_X mass = attribute::getMass( weighting, particle );
-
-        constexpr float_X INV_CELL_VOLUME = float_X(1.0) / CELL_VOLUME;
-
-        return KinEnergy<>()( mom, mass ) * INV_CELL_VOLUME;
-    }
-} // namespace derivedAttributes
-} // namespace particleToGrid
-} // namespace particles
+        namespace particleToGrid
+        {
+            namespace derivedAttributes
+            {
+                HDINLINE float1_64 EnergyDensity::getUnit() const
+                {
+                    constexpr float_64 UNIT_VOLUME = (UNIT_LENGTH * UNIT_LENGTH * UNIT_LENGTH);
+                    return UNIT_ENERGY / UNIT_VOLUME;
+                }
+
+                template<class T_Particle>
+                DINLINE float_X EnergyDensity::operator()(T_Particle& particle) const
+                {
+                    /* read existing attributes */
+                    const float_X weighting = particle[weighting_];
+                    const float3_X mom = particle[momentum_];
+                    const float_X mass = attribute::getMass(weighting, particle);
+
+                    constexpr float_X INV_CELL_VOLUME = float_X(1.0) / CELL_VOLUME;
+
+                    return KinEnergy<>()(mom, mass) * INV_CELL_VOLUME;
+                }
+            } // namespace derivedAttributes
+        } // namespace particleToGrid
+    } // namespace particles
 } // namespace picongpu
diff --git a/include/picongpu/particles/particleToGrid/derivedAttributes/EnergyDensityCutoff.def b/include/picongpu/particles/particleToGrid/derivedAttributes/EnergyDensityCutoff.def
index 8c9df4c6a6..f97240c191 100644
--- a/include/picongpu/particles/particleToGrid/derivedAttributes/EnergyDensityCutoff.def
+++ b/include/picongpu/particles/particleToGrid/derivedAttributes/EnergyDensityCutoff.def
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Rene Widera, Marco Garten
+/* Copyright 2013-2021 Axel Huebl, Rene Widera, Marco Garten
  *
  * This file is part of PIConGPU.
  *
@@ -33,70 +33,60 @@
 
 namespace picongpu
 {
-namespace particles
-{
-namespace particleToGrid
-{
-namespace derivedAttributes
-{
-    /** Energy Density Operation with Maxmimum Energy Cut-Off for Particle to Grid Projections
-     *
-     * Derives a scalar field for average kinetic particle energy per cell times the
-     * particle density from a particle species at runtime.
-     * Each value is mapped per cell according to the species' spatial shape.
-     *
-     * @note Only energies below a user-definable cut-off energy are taken for
-     *       calculation!
-     *
-     * @tparam T_ParamClass parameter class containing the maximum energy cutoff
-     *
-     * @note T_ParamClass requires the member `constexpr float_X cutoffMaxEnergy`.
-     */
-    template< typename T_ParamClass >
-    struct EnergyDensityCutoff : public EnergyDensity
+    namespace particles
     {
-
-        HINLINE static
-        std::string
-        getName()
+        namespace particleToGrid
         {
-            return "energyDensityCutoff";
-        }
+            namespace derivedAttributes
+            {
+                /** Energy Density Operation with Maximum Energy Cut-Off for Particle to Grid Projections
+                 *
+                 * Derives a scalar field for average kinetic particle energy per cell times the
+                 * particle density from a particle species at runtime.
+                 * Each value is mapped per cell according to the species' spatial shape.
+                 *
+                 * @note Only energies below a user-definable cut-off energy are taken for
+                 *       calculation!
+                 *
+                 * @tparam T_ParamClass parameter class containing the maximum energy cutoff
+                 *
+                 * @note T_ParamClass requires the member `constexpr float_X cutoffMaxEnergy`.
+                 */
+                template<typename T_ParamClass>
+                struct EnergyDensityCutoff : public EnergyDensity
+                {
+                    HINLINE static std::string getName()
+                    {
+                        return "energyDensityCutoff";
+                    }
 
-        /** Calculate a new attribute  per particle
-         *
-         * Returns a new (on-the-fly calculated) attribute of a particle
-         * that can then be mapped to the cells the particle contributes to.
-         * This method is called on a per-thread basis (each thread of a block
-         * handles a particle of a frame).
-         *
-         * @tparam T_Particle particle in the frame
-         * @param particle particle in the frame
-         *
-         * @return new attribute for the particle (type @see T_AttributeType)
-         */
-        template< class T_Particle >
-        DINLINE float_X
-        operator()( T_Particle& particle ) const;
-    };
-} // namespace derivedAttributes
-} // namespace particleToGrid
+                    /** Calculate a new attribute per particle
+                     *
+                     * Returns a new (on-the-fly calculated) attribute of a particle
+                     * that can then be mapped to the cells the particle contributes to.
+                     * This method is called on a per-thread basis (each thread of a block
+                     * handles a particle of a frame).
+                     *
+                     * @tparam T_Particle type of the particle in the frame
+                     * @param particle particle in the frame
+                     *
+                     * @return new attribute for the particle (of type float_X)
+                     */
+                    template<class T_Particle>
+                    DINLINE float_X operator()(T_Particle& particle) const;
+                };
+            } // namespace derivedAttributes
+        } // namespace particleToGrid
 
-namespace traits
-{
-    template<
-        typename T_Species,
-        typename T_ParamClass
-    >
-    struct SpeciesEligibleForSolver<
-        T_Species,
-        particleToGrid::derivedAttributes::EnergyDensityCutoff< T_ParamClass >
-    > : public SpeciesEligibleForSolver<
-        T_Species,
-        particleToGrid::derivedAttributes::EnergyDensity
-    >
-    {
-    };
-} // namespace traits
-} // namespace particles
+        namespace traits
+        {
+            template<typename T_Species, typename T_ParamClass>
+            struct SpeciesEligibleForSolver<
+                T_Species,
+                particleToGrid::derivedAttributes::EnergyDensityCutoff<T_ParamClass>>
+                : public SpeciesEligibleForSolver<T_Species, particleToGrid::derivedAttributes::EnergyDensity>
+            {
+            };
+        } // namespace traits
+    } // namespace particles
 } // namespace picongpu
diff --git a/include/picongpu/particles/particleToGrid/derivedAttributes/EnergyDensityCutoff.hpp b/include/picongpu/particles/particleToGrid/derivedAttributes/EnergyDensityCutoff.hpp
index b32a0a73d3..080071dfa7 100644
--- a/include/picongpu/particles/particleToGrid/derivedAttributes/EnergyDensityCutoff.hpp
+++ b/include/picongpu/particles/particleToGrid/derivedAttributes/EnergyDensityCutoff.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Rene Widera, Heiko Burau, Marco Garten
+/* Copyright 2013-2021 Axel Huebl, Rene Widera, Heiko Burau, Marco Garten
  *
  * This file is part of PIConGPU.
  *
@@ -26,46 +26,38 @@
 
 namespace picongpu
 {
-namespace particles
-{
-namespace particleToGrid
-{
-namespace derivedAttributes
-{
-
-    template< class T_ParamClass >
-    template< class T_Particle >
-    DINLINE float_X
-    EnergyDensityCutoff< T_ParamClass >::operator()( T_Particle& particle ) const
+    namespace particles
     {
-        using ParamClass =  T_ParamClass;
-
-        /* read existing attributes */
-        float_X const weighting = particle[ weighting_ ];
-        float3_X const mom = particle[ momentum_ ];
-        float_X const mass = attribute::getMass(
-            weighting,
-            particle
-        );
-
-        constexpr float_X INV_CELL_VOLUME = float_X( 1.0 ) / CELL_VOLUME;
-
-        /* value for energy cut-off */
-        float_X const cutoffMaxEnergy = ParamClass::cutoffMaxEnergy;
-        float_X const cutoff = cutoffMaxEnergy / UNIT_ENERGY * weighting;
-
-        float_X const kinEnergy = KinEnergy< >( )(
-            mom,
-            mass
-        );
-
-        float_X result( 0. );
-        if( kinEnergy < cutoff )
-            result =  kinEnergy * INV_CELL_VOLUME;
-
-        return result;
-    }
-} // namespace derivedAttributes
-} // namespace particleToGrid
-} // namespace particles
+        namespace particleToGrid
+        {
+            namespace derivedAttributes
+            {
+                template<class T_ParamClass>
+                template<class T_Particle>
+                DINLINE float_X EnergyDensityCutoff<T_ParamClass>::operator()(T_Particle& particle) const
+                {
+                    using ParamClass = T_ParamClass;
+
+                    /* read existing attributes */
+                    float_X const weighting = particle[weighting_];
+                    float3_X const mom = particle[momentum_];
+                    float_X const mass = attribute::getMass(weighting, particle);
+
+                    constexpr float_X INV_CELL_VOLUME = float_X(1.0) / CELL_VOLUME;
+
+                    /* value for energy cut-off */
+                    float_X const cutoffMaxEnergy = ParamClass::cutoffMaxEnergy;
+                    float_X const cutoff = cutoffMaxEnergy / UNIT_ENERGY * weighting;
+
+                    float_X const kinEnergy = KinEnergy<>()(mom, mass);
+
+                    float_X result(0.);
+                    if(kinEnergy < cutoff)
+                        result = kinEnergy * INV_CELL_VOLUME;
+
+                    return result;
+                }
+            } // namespace derivedAttributes
+        } // namespace particleToGrid
+    } // namespace particles
 } // namespace picongpu
diff --git a/include/picongpu/particles/particleToGrid/derivedAttributes/LarmorPower.def b/include/picongpu/particles/particleToGrid/derivedAttributes/LarmorPower.def
index a1e7dee591..b7ed695a3e 100644
--- a/include/picongpu/particles/particleToGrid/derivedAttributes/LarmorPower.def
+++ b/include/picongpu/particles/particleToGrid/derivedAttributes/LarmorPower.def
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Rene Widera, Richard Pausch
+/* Copyright 2013-2021 Axel Huebl, Rene Widera, Richard Pausch
  *
  * This file is part of PIConGPU.
  *
@@ -33,103 +33,77 @@
 
 namespace picongpu
 {
-namespace particles
-{
-namespace particleToGrid
-{
-namespace derivedAttributes
-{
-    /** Radiated Larmor Power Operation for Particle to Grid Projections
-     *
-     * Derives a scalar field with the radiated power according to the Larmor
-     * formula from a particle species at runtime.
-     * Each value is mapped per cell according to the species' spatial shape.
-     */
-    struct LarmorPower
+    namespace particles
     {
-
-        HDINLINE float1_64
-        getUnit() const;
-
-        HINLINE std::vector<float_64>
-        getUnitDimension() const
+        namespace particleToGrid
         {
-           /* L, M, T, I, theta, N, J
-            *
-            * LarmorEnergy is in Joule: J = kg * m^2 / s^2
-            *   -> L^2 * M * T^-2
-            */
-           std::vector<float_64> unitDimension( 7, 0.0 );
-           unitDimension.at(SIBaseUnits::length) =  2.0;
-           unitDimension.at(SIBaseUnits::mass)   =  1.0;
-           unitDimension.at(SIBaseUnits::time)   = -2.0;
-
-           return unitDimension;
-        }
-
-        HINLINE static
-        std::string
-        getName()
+            namespace derivedAttributes
+            {
+                /** Radiated Larmor Power Operation for Particle to Grid Projections
+                 *
+                 * Derives a scalar field with the radiated power according to the Larmor
+                 * formula from a particle species at runtime.
+                 * Each value is mapped per cell according to the species' spatial shape.
+                 */
+                struct LarmorPower
+                {
+                    HDINLINE float1_64 getUnit() const;
+
+                    HINLINE std::vector<float_64> getUnitDimension() const
+                    {
+                        /* L, M, T, I, theta, N, J
+                         *
+                         * LarmorEnergy is in Joule: J = kg * m^2 / s^2
+                         *   -> L^2 * M * T^-2
+                         */
+                        std::vector<float_64> unitDimension(7, 0.0);
+                        unitDimension.at(SIBaseUnits::length) = 2.0;
+                        unitDimension.at(SIBaseUnits::mass) = 1.0;
+                        unitDimension.at(SIBaseUnits::time) = -2.0;
+
+                        return unitDimension;
+                    }
+
+                    HINLINE static std::string getName()
+                    {
+                        return "larmorPower";
+                    }
+
+                    /** Calculate a new attribute per particle
+                     *
+                     * Returns a new (on-the-fly calculated) attribute of a particle
+                     * that can then be mapped to the cells the particle contributes to.
+                     * This method is called on a per-thread basis (each thread of a block
+                     * handles a particle of a frame).
+                     *
+                     * \tparam T_Particle type of the particle in the frame
+                     * \param particle particle in the frame
+                     *
+                     * \return new attribute for the particle (of type float_X)
+                     */
+                    template<class T_Particle>
+                    DINLINE float_X operator()(T_Particle& particle) const;
+                };
+            } // namespace derivedAttributes
+        } // namespace particleToGrid
+
+        namespace traits
         {
-            return "larmorPower";
-        }
-
-        /** Calculate a new attribute  per particle
-         *
-         * Returns a new (on-the-fly calculated) attribute of a particle
-         * that can then be mapped to the cells the particle contributes to.
-         * This method is called on a per-thread basis (each thread of a block
-         * handles a particle of a frame).
-         *
-         * \tparam T_Particle particle in the frame
-         * \param particle particle in the frame
-         *
-         * \return new attribute for the particle (type \see T_AttributeType)
-         */
-        template< class T_Particle >
-        DINLINE float_X
-        operator()( T_Particle& particle ) const;
-    };
-} // namespace derivedAttributes
-} // namespace particleToGrid
-
-namespace traits
-{
-    template< typename T_Species >
-    struct SpeciesEligibleForSolver<
-        T_Species,
-        particleToGrid::derivedAttributes::LarmorPower
-    >
-    {
-        using FrameType = typename T_Species::FrameType;
-
-        using RequiredIdentifiers = MakeSeq_t<
-            weighting,
-            position<>,
-            momentum,
-            momentumPrev1
-        >;
-
-        using SpeciesHasIdentifiers = typename pmacc::traits::HasIdentifiers<
-            FrameType,
-            RequiredIdentifiers
-        >::type;
-
-        using SpeciesHasMass = typename pmacc::traits::HasFlag<
-            FrameType,
-            massRatio<>
-        >::type;
-        using SpeciesHasCharge = typename pmacc::traits::HasFlag<
-            FrameType,
-            chargeRatio<>
-        >::type;
-
-        using type = typename bmpl::and_<
-            SpeciesHasIdentifiers,
-            SpeciesHasMass,
-            SpeciesHasCharge
-        >;
-    };
-} // namespace traits
-} // namespace particles
+            template<typename T_Species>
+            struct SpeciesEligibleForSolver<T_Species, particleToGrid::derivedAttributes::LarmorPower>
+            {
+                using FrameType = typename T_Species::FrameType;
+
+                using RequiredIdentifiers = MakeSeq_t<weighting, position<>, momentum, momentumPrev1>;
+
+                using SpeciesHasIdentifiers =
+                    typename pmacc::traits::HasIdentifiers<FrameType, RequiredIdentifiers>::type;
+
+                using SpeciesHasMass = typename pmacc::traits::HasFlag<FrameType, massRatio<>>::type;
+                using SpeciesHasCharge = typename pmacc::traits::HasFlag<FrameType, chargeRatio<>>::type;
+
+                using type = typename bmpl::and_<SpeciesHasIdentifiers, SpeciesHasMass, SpeciesHasCharge>;
+            };
+        } // namespace traits
+    } // namespace particles
 } // namespace picongpu
diff --git a/include/picongpu/particles/particleToGrid/derivedAttributes/LarmorPower.hpp b/include/picongpu/particles/particleToGrid/derivedAttributes/LarmorPower.hpp
index 93a0a8da33..2c44e91314 100644
--- a/include/picongpu/particles/particleToGrid/derivedAttributes/LarmorPower.hpp
+++ b/include/picongpu/particles/particleToGrid/derivedAttributes/LarmorPower.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Rene Widera, Richard Pausch
+/* Copyright 2013-2021 Axel Huebl, Rene Widera, Richard Pausch
  *
  * This file is part of PIConGPU.
  *
@@ -27,58 +27,53 @@
 
 namespace picongpu
 {
-namespace particles
-{
-namespace particleToGrid
-{
-namespace derivedAttributes
-{
-
-    HDINLINE float1_64
-    LarmorPower::getUnit() const
-    {
-        return UNIT_ENERGY;
-    }
-
-    template< class T_Particle >
-    DINLINE float_X
-    LarmorPower::operator()( T_Particle& particle ) const
+    namespace particles
     {
+        namespace particleToGrid
+        {
+            namespace derivedAttributes
+            {
+                HDINLINE float1_64 LarmorPower::getUnit() const
+                {
+                    return UNIT_ENERGY;
+                }
 
-        constexpr bool hasMomentumPrev1 = pmacc::traits::HasIdentifier<
-            typename T_Particle::FrameType,
-            momentumPrev1
-        >::type::value;
-        PMACC_CASSERT_MSG_TYPE( species_must_have_the_attribute_momentumPrev1, T_Particle, hasMomentumPrev1 );
+                template<class T_Particle>
+                DINLINE float_X LarmorPower::operator()(T_Particle& particle) const
+                {
+                    constexpr bool hasMomentumPrev1
+                        = pmacc::traits::HasIdentifier<typename T_Particle::FrameType, momentumPrev1>::type::value;
+                    PMACC_CASSERT_MSG_TYPE(
+                        species_must_have_the_attribute_momentumPrev1,
+                        T_Particle,
+                        hasMomentumPrev1);
 
-        /* read existing attributes */
-        const float3_X mom = particle[momentum_];
-        const float3_X mom_mt1 = particle[momentumPrev1_];
-        const float_X weighting = particle[weighting_];
-        const float_X charge = attribute::getCharge( weighting, particle );
-        const float_X mass = attribute::getMass( weighting, particle );
+                    /* read existing attributes */
+                    const float3_X mom = particle[momentum_];
+                    const float3_X mom_mt1 = particle[momentumPrev1_];
+                    const float_X weighting = particle[weighting_];
+                    const float_X charge = attribute::getCharge(weighting, particle);
+                    const float_X mass = attribute::getMass(weighting, particle);
 
-        /* calculate new attribute */
-        Gamma<float_X> calcGamma;
-        const typename Gamma<float_X>::valueType gamma = calcGamma( mom, mass );
-        const float_X gamma2 = gamma * gamma;
-        const float_X c2 = SPEED_OF_LIGHT * SPEED_OF_LIGHT;
+                    /* calculate new attribute */
+                    Gamma<float_X> calcGamma;
+                    const typename Gamma<float_X>::valueType gamma = calcGamma(mom, mass);
+                    const float_X gamma2 = gamma * gamma;
+                    const float_X c2 = SPEED_OF_LIGHT * SPEED_OF_LIGHT;
 
-        const float3_X mom_dt = (mom - mom_mt1) / float_X(DELTA_T);
-        const float_X el_factor = charge * charge
-            / (float_X(6.0) * PI * EPS0 *
-               c2 * SPEED_OF_LIGHT * mass * mass) * gamma2 * gamma2;
-        const float_X momentumToBetaConvert = float_X(1.0)/ (mass * SPEED_OF_LIGHT * gamma);
-        const float_X larmorPower = el_factor
-                                    * ( math::abs2(mom_dt)
-                                        - momentumToBetaConvert * momentumToBetaConvert
-                                          * math::abs2(math::cross(mom, mom_dt))
-                                      );
+                    const float3_X mom_dt = (mom - mom_mt1) / float_X(DELTA_T);
+                    const float_X el_factor = charge * charge
+                        / (float_X(6.0) * PI * EPS0 * c2 * SPEED_OF_LIGHT * mass * mass) * gamma2 * gamma2;
+                    const float_X momentumToBetaConvert = float_X(1.0) / (mass * SPEED_OF_LIGHT * gamma);
+                    const float_X larmorPower = el_factor
+                        * (pmacc::math::abs2(mom_dt)
+                           - momentumToBetaConvert * momentumToBetaConvert
+                               * pmacc::math::abs2(pmacc::math::cross(mom, mom_dt)));
 
-        /* return attribute */
-        return larmorPower;
-    }
-} // namespace derivedAttributes
-} // namespace particleToGrid
-} // namespace particles
+                    /* return attribute */
+                    return larmorPower;
+                }
+            } // namespace derivedAttributes
+        } // namespace particleToGrid
+    } // namespace particles
 } // namespace picongpu
diff --git a/include/picongpu/particles/particleToGrid/derivedAttributes/MacroCounter.def b/include/picongpu/particles/particleToGrid/derivedAttributes/MacroCounter.def
index 496ed7cad9..30ee6cd20a 100644
--- a/include/picongpu/particles/particleToGrid/derivedAttributes/MacroCounter.def
+++ b/include/picongpu/particles/particleToGrid/derivedAttributes/MacroCounter.def
@@ -1,4 +1,4 @@
-/* Copyright 2017-2020 Axel Huebl
+/* Copyright 2017-2021 Axel Huebl
  *
  * This file is part of PIConGPU.
  *
@@ -29,64 +29,58 @@
 
 namespace picongpu
 {
-namespace particles
-{
-namespace particleToGrid
-{
-namespace derivedAttributes
-{
-    /** Macro-Particle Counter Operation for Particle to Grid Projections
-     *
-     * Derives a scalar field with number of macro particles per cell from a particle
-     * species at runtime.
-     * Each macro particle is counted straight to the cell it belongs to, which is
-     * in most cases as floor operation in space (and not necessarily the "nearest"
-     * cell-origin).
-     *
-     * @note Use this only for debug purposes, e.g. to validate particle memory.
-     */
-    struct MacroCounter
+    namespace particles
     {
-
-        HDINLINE float1_64
-        getUnit() const;
-
-        HINLINE std::vector<float_64>
-        getUnitDimension() const
+        namespace particleToGrid
         {
-           /* L, M, T, I, theta, N, J
-            *
-            * Counter is unitless
-            */
-           std::vector<float_64> unitDimension( 7, 0.0 );
+            namespace derivedAttributes
+            {
+                /** Macro-Particle Counter Operation for Particle to Grid Projections
+                 *
+                 * Derives a scalar field with number of macro particles per cell from a particle
+                 * species at runtime.
+                 * Each macro particle is counted straight to the cell it belongs to, which is
+                 * in most cases a floor operation in space (and not necessarily the "nearest"
+                 * cell-origin).
+                 *
+                 * @note Use this only for debug purposes, e.g. to validate particle memory.
+                 */
+                struct MacroCounter
+                {
+                    HDINLINE float1_64 getUnit() const;
 
-           return unitDimension;
-        }
+                    HINLINE std::vector<float_64> getUnitDimension() const
+                    {
+                        /* L, M, T, I, theta, N, J
+                         *
+                         * Counter is unitless
+                         */
+                        std::vector<float_64> unitDimension(7, 0.0);
 
-        HINLINE static
-        std::string
-        getName()
-        {
-            return "macroParticleCounter";
-        }
+                        return unitDimension;
+                    }
+
+                    HINLINE static std::string getName()
+                    {
+                        return "macroParticleCounter";
+                    }
 
-        /** Calculate a new attribute  per particle
-         *
-         * Returns a new (on-the-fly calculated) attribute of a particle
-         * that can then be mapped to the cells the particle contributes to.
-         * This method is called on a per-thread basis (each thread of a block
-         * handles a particle of a frame).
-         *
-         * \tparam T_Particle particle in the frame
-         * \param particle particle in the frame
-         *
-         * \return new attribute for the particle (type \see T_AttributeType)
-         */
-        template< class T_Particle >
-        DINLINE float_X
-        operator()( T_Particle& particle ) const;
-    };
-} // namespace derivedAttributes
-} // namespace particleToGrid
-} // namespace particles
+                    /** Calculate a new attribute per particle
+                     *
+                     * Returns a new (on-the-fly calculated) attribute of a particle
+                     * that can then be mapped to the cells the particle contributes to.
+                     * This method is called on a per-thread basis (each thread of a block
+                     * handles a particle of a frame).
+                     *
+                     * \tparam T_Particle particle in the frame
+                     * \param particle particle in the frame
+                     *
+                     * \return new attribute for the particle (type \see T_AttributeType)
+                     */
+                    template<class T_Particle>
+                    DINLINE float_X operator()(T_Particle& particle) const;
+                };
+            } // namespace derivedAttributes
+        } // namespace particleToGrid
+    } // namespace particles
 } // namespace picongpu
diff --git a/include/picongpu/particles/particleToGrid/derivedAttributes/MacroCounter.hpp b/include/picongpu/particles/particleToGrid/derivedAttributes/MacroCounter.hpp
index 5a03a97c33..53048a1681 100644
--- a/include/picongpu/particles/particleToGrid/derivedAttributes/MacroCounter.hpp
+++ b/include/picongpu/particles/particleToGrid/derivedAttributes/MacroCounter.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2017-2020 Axel Huebl
+/* Copyright 2017-2021 Axel Huebl
  *
  * This file is part of PIConGPU.
  *
@@ -26,27 +26,24 @@
 
 namespace picongpu
 {
-namespace particles
-{
-namespace particleToGrid
-{
-namespace derivedAttributes
-{
-
-    HDINLINE float1_64
-    MacroCounter::getUnit() const
+    namespace particles
     {
-        return 1.0;
-    }
+        namespace particleToGrid
+        {
+            namespace derivedAttributes
+            {
+                HDINLINE float1_64 MacroCounter::getUnit() const
+                {
+                    return 1.0;
+                }
 
-    template< class T_Particle >
-    DINLINE float_X
-    MacroCounter::operator()( T_Particle& particle ) const
-    {
-        /* return attribute */
-        return 1.0;
-    }
-} // namespace derivedAttributes
-} // namespace particleToGrid
-} // namespace particles
+                template<class T_Particle>
+                DINLINE float_X MacroCounter::operator()(T_Particle& particle) const
+                {
+                    /* return attribute */
+                    return 1.0;
+                }
+            } // namespace derivedAttributes
+        } // namespace particleToGrid
+    } // namespace particles
 } // namespace picongpu
diff --git a/include/picongpu/particles/particleToGrid/derivedAttributes/MidCurrentDensityComponent.def b/include/picongpu/particles/particleToGrid/derivedAttributes/MidCurrentDensityComponent.def
index 2b5c0d450d..0f3bc83747 100644
--- a/include/picongpu/particles/particleToGrid/derivedAttributes/MidCurrentDensityComponent.def
+++ b/include/picongpu/particles/particleToGrid/derivedAttributes/MidCurrentDensityComponent.def
@@ -1,4 +1,4 @@
-/* Copyright 2016-2020 Axel Huebl
+/* Copyright 2016-2021 Axel Huebl
  *
  * This file is part of PIConGPU.
  *
@@ -35,117 +35,96 @@
 
 namespace picongpu
 {
-namespace particles
-{
-namespace particleToGrid
-{
-namespace derivedAttributes
-{
-    /** Current Density Operation for Particle to Grid Projections
-     *
-     * Calculate the on-charge current density in a selected direction.
-     * Derives a scalar field with density * charge * velocity_component from a
-     * particle species at runtime.
-     * Each value is mapped per cell according to the species' spatial shape.
-     *
-     * @note Mainly useful for debug purposes, e.g. when implementing a new current
-     *       solver.
-     *
-     * @tparam T_direction perpendicular direction x=0, y=1, z=2
-     */
-    template< size_t T_direction >
-    struct MidCurrentDensityComponent
+    namespace particles
     {
-        PMACC_CASSERT_MSG( Valid_directions_are_0_to_2_for_X_to_Z__in_fileOutput_param, ((T_direction)>=0) );
-        PMACC_CASSERT_MSG( Valid_directions_are_0_to_2_for_X_to_Z__in_fileOutput_param, ((T_direction)<3) );
-
-        HDINLINE float1_64
-        getUnit() const;
-
-        HINLINE std::vector<float_64>
-        getUnitDimension() const
+        namespace particleToGrid
         {
-           /* L, M, T, I, theta, N, J
-            *
-            * MidCurrentDensity is in Ampere / square meters: A / m^2
-            *   charge density: Coulomb / m^3
-            *   velocity:       m / s
-            *   current density = charge density * velocity
-            *   -> L^-2 * I
-            */
-           std::vector<float_64> unitDimension( 7, 0.0 );
-           unitDimension.at(SIBaseUnits::length) = -2.0;
-           unitDimension.at(SIBaseUnits::electricCurrent) =  1.0;
-
-           return unitDimension;
-        }
-
-        HINLINE static
-        std::string
-        getName()
+            namespace derivedAttributes
+            {
+                /** Current Density Operation for Particle to Grid Projections
+                 *
+                 * Calculate the on-charge current density in a selected direction.
+                 * Derives a scalar field with density * charge * velocity_component from a
+                 * particle species at runtime.
+                 * Each value is mapped per cell according to the species' spatial shape.
+                 *
+                 * @note Mainly useful for debug purposes, e.g. when implementing a new current
+                 *       solver.
+                 *
+                 * @tparam T_direction perpendicular direction x=0, y=1, z=2
+                 */
+                template<size_t T_direction>
+                struct MidCurrentDensityComponent
+                {
+                    PMACC_CASSERT_MSG(
+                        Valid_directions_are_0_to_2_for_X_to_Z__in_fileOutput_param,
+                        ((T_direction) >= 0));
+                    PMACC_CASSERT_MSG(
+                        Valid_directions_are_0_to_2_for_X_to_Z__in_fileOutput_param,
+                        ((T_direction) < 3));
+
+                    HDINLINE float1_64 getUnit() const;
+
+                    HINLINE std::vector<float_64> getUnitDimension() const
+                    {
+                        /* L, M, T, I, theta, N, J
+                         *
+                         * MidCurrentDensity is in Ampere / square meters: A / m^2
+                         *   charge density: Coulomb / m^3
+                         *   velocity:       m / s
+                         *   current density = charge density * velocity
+                         *   -> L^-2 * I
+                         */
+                        std::vector<float_64> unitDimension(7, 0.0);
+                        unitDimension.at(SIBaseUnits::length) = -2.0;
+                        unitDimension.at(SIBaseUnits::electricCurrent) = 1.0;
+
+                        return unitDimension;
+                    }
+
+                    HINLINE static std::string getName()
+                    {
+                        auto const componentNames = plugins::misc::getComponentNames(3);
+                        return "midCurrentDensity/" + componentNames[T_direction];
+                    }
+
+                    /** Calculate a new attribute per particle
+                     *
+                     * Returns a new (on-the-fly calculated) attribute of a particle
+                     * that can then be mapped to the cells the particle contributes to.
+                     * This method is called on a per-thread basis (each thread of a block
+                     * handles a particle of a frame).
+                     *
+                     * \tparam T_Particle particle in the frame
+                     * \param particle particle in the frame
+                     *
+                     * \return new attribute for the particle (type \see T_AttributeType)
+                     */
+                    template<class T_Particle>
+                    DINLINE float_X operator()(T_Particle& particle) const;
+                };
+            } // namespace derivedAttributes
+        } // namespace particleToGrid
+
+        namespace traits
         {
-            auto const componentNames = plugins::misc::getComponentNames( 3 );
-            return "midCurrentDensity/" + componentNames[T_direction];
-        }
-
-        /** Calculate a new attribute per particle
-         *
-         * Returns a new (on-the-fly calculated) attribute of a particle
-         * that can then be mapped to the cells the particle contributes to.
-         * This method is called on a per-thread basis (each thread of a block
-         * handles a particle of a frame).
-         *
-         * \tparam T_Particle particle in the frame
-         * \param particle particle in the frame
-         *
-         * \return new attribute for the particle (type \see T_AttributeType)
-         */
-        template< class T_Particle >
-        DINLINE float_X
-        operator()( T_Particle& particle ) const;
-    };
-} // namespace derivedAttributes
-} // namespace particleToGrid
-
-namespace traits
-{
-    template<
-        typename T_Species,
-        size_t T_direction
-    >
-    struct SpeciesEligibleForSolver<
-        T_Species,
-        particleToGrid::derivedAttributes::MidCurrentDensityComponent< T_direction >
-    >
-    {
-        using FrameType = typename T_Species::FrameType;
-
-        using RequiredIdentifiers = MakeSeq_t<
-            weighting,
-            position<>,
-            momentum
-        >;
-
-        using SpeciesHasIdentifiers = typename pmacc::traits::HasIdentifiers<
-            FrameType,
-            RequiredIdentifiers
-        >::type;
-
-        using SpeciesHasMass = typename pmacc::traits::HasFlag<
-            FrameType,
-            massRatio<>
-        >::type;
-        using SpeciesHasCharge = typename pmacc::traits::HasFlag<
-            FrameType,
-            chargeRatio<>
-        >::type;
-
-        using type = typename bmpl::and_<
-            SpeciesHasIdentifiers,
-            SpeciesHasMass,
-            SpeciesHasCharge
-        >;
-    };
-} // namespace traits
-} // namespace particles
+            template<typename T_Species, size_t T_direction>
+            struct SpeciesEligibleForSolver<
+                T_Species,
+                particleToGrid::derivedAttributes::MidCurrentDensityComponent<T_direction>>
+            {
+                using FrameType = typename T_Species::FrameType;
+
+                using RequiredIdentifiers = MakeSeq_t<weighting, position<>, momentum>;
+
+                using SpeciesHasIdentifiers =
+                    typename pmacc::traits::HasIdentifiers<FrameType, RequiredIdentifiers>::type;
+
+                using SpeciesHasMass = typename pmacc::traits::HasFlag<FrameType, massRatio<>>::type;
+                using SpeciesHasCharge = typename pmacc::traits::HasFlag<FrameType, chargeRatio<>>::type;
+
+                using type = typename bmpl::and_<SpeciesHasIdentifiers, SpeciesHasMass, SpeciesHasCharge>;
+            };
+        } // namespace traits
+    } // namespace particles
 } // namespace picongpu
diff --git a/include/picongpu/particles/particleToGrid/derivedAttributes/MidCurrentDensityComponent.hpp b/include/picongpu/particles/particleToGrid/derivedAttributes/MidCurrentDensityComponent.hpp
index b964000bde..dfd8166a67 100644
--- a/include/picongpu/particles/particleToGrid/derivedAttributes/MidCurrentDensityComponent.hpp
+++ b/include/picongpu/particles/particleToGrid/derivedAttributes/MidCurrentDensityComponent.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2016-2020 Axel Huebl
+/* Copyright 2016-2021 Axel Huebl
  *
  * This file is part of PIConGPU.
  *
@@ -26,46 +26,42 @@
 
 namespace picongpu
 {
-namespace particles
-{
-namespace particleToGrid
-{
-namespace derivedAttributes
-{
-
-    template< size_t T_direction>
-    HDINLINE float1_64
-    MidCurrentDensityComponent<T_direction>::getUnit() const
+    namespace particles
     {
-        const float_64 UNIT_AREA = UNIT_LENGTH * UNIT_LENGTH;
-        return UNIT_CHARGE / ( UNIT_TIME * UNIT_AREA );
-    }
+        namespace particleToGrid
+        {
+            namespace derivedAttributes
+            {
+                template<size_t T_direction>
+                HDINLINE float1_64 MidCurrentDensityComponent<T_direction>::getUnit() const
+                {
+                    const float_64 UNIT_AREA = UNIT_LENGTH * UNIT_LENGTH;
+                    return UNIT_CHARGE / (UNIT_TIME * UNIT_AREA);
+                }
 
-    template< size_t T_direction>
-    template< class T_Particle >
-    DINLINE float_X
-    MidCurrentDensityComponent<T_direction>::operator()( T_Particle& particle ) const
-    {
-        /* read existing attributes */
-        const float_X weighting = particle[weighting_];
-        const float_X charge = attribute::getCharge( weighting, particle );
-        const float3_X mom = particle[momentum_];
-        const float_X momCom = mom[T_direction];
-        const float_X mass = attribute::getMass( weighting, particle );
+                template<size_t T_direction>
+                template<class T_Particle>
+                DINLINE float_X MidCurrentDensityComponent<T_direction>::operator()(T_Particle& particle) const
+                {
+                    /* read existing attributes */
+                    const float_X weighting = particle[weighting_];
+                    const float_X charge = attribute::getCharge(weighting, particle);
+                    const float3_X mom = particle[momentum_];
+                    const float_X momCom = mom[T_direction];
+                    const float_X mass = attribute::getMass(weighting, particle);
 
-        /* calculate new attribute */
-        Gamma<float_X> calcGamma;
-        const typename Gamma<float_X>::valueType gamma = calcGamma( mom, mass );
+                    /* calculate new attribute */
+                    Gamma<float_X> calcGamma;
+                    const typename Gamma<float_X>::valueType gamma = calcGamma(mom, mass);
 
-        /* calculate new attribute */
-        const float_X particleCurrentDensity =
-            charge / CELL_VOLUME *     /* rho */
-            momCom / ( gamma * mass ); /* v_component */
+                    /* calculate new attribute */
+                    const float_X particleCurrentDensity = charge / CELL_VOLUME * /* rho */
+                        momCom / (gamma * mass); /* v_component */
 
-        /* return attribute */
-        return particleCurrentDensity;
-    }
-} // namespace derivedAttributes
-} // namespace particleToGrid
-} // namespace particles
+                    /* return attribute */
+                    return particleCurrentDensity;
+                }
+            } // namespace derivedAttributes
+        } // namespace particleToGrid
+    } // namespace particles
 } // namespace picongpu
diff --git a/include/picongpu/particles/particleToGrid/derivedAttributes/MomentumComponent.def b/include/picongpu/particles/particleToGrid/derivedAttributes/MomentumComponent.def
index 7bfd2d06d0..f9d640eb2c 100644
--- a/include/picongpu/particles/particleToGrid/derivedAttributes/MomentumComponent.def
+++ b/include/picongpu/particles/particleToGrid/derivedAttributes/MomentumComponent.def
@@ -1,4 +1,4 @@
-/* Copyright 2016-2020 Axel Huebl
+/* Copyright 2016-2021 Axel Huebl
  *
  * This file is part of PIConGPU.
  *
@@ -31,93 +31,82 @@
 
 namespace picongpu
 {
-namespace particles
-{
-namespace particleToGrid
-{
-namespace derivedAttributes
-{
-    /** Momentum Ratio Operation for Particle to Grid Projections
-     *
-     * Calculate the ratio of momentum in a selected direction to total momentum.
-     * Derives a scalar field with ratio between a selected momentum component and
-     * total momentum from a particle species at runtime. Results are in the range
-     * [ -1. : 1. ], resting particles are set to 0.
-     * Each value is mapped per cell according to the species' spatial shape.
-     *
-     * @param T_direction perpendicular direction x=0, y=1, z=2
-     */
-    template< size_t T_direction >
-    struct MomentumComponent
+    namespace particles
     {
-        PMACC_CASSERT_MSG( Valid_directions_are_0_to_2_for_X_to_Z__in_fileOutput_param, ((T_direction)>=0) );
-        PMACC_CASSERT_MSG( Valid_directions_are_0_to_2_for_X_to_Z__in_fileOutput_param, ((T_direction)<3) );
+        namespace particleToGrid
+        {
+            namespace derivedAttributes
+            {
+                /** Momentum Ratio Operation for Particle to Grid Projections
+                 *
+                 * Calculate the ratio of momentum in a selected direction to total momentum.
+                 * Derives a scalar field with ratio between a selected momentum component and
+                 * total momentum from a particle species at runtime. Results are in the range
+                 * [ -1. : 1. ], resting particles are set to 0.
+                 * Each value is mapped per cell according to the species' spatial shape.
+                 *
+                 * @tparam T_direction perpendicular direction x=0, y=1, z=2
+                 */
+                template<size_t T_direction>
+                struct MomentumComponent
+                {
+                    PMACC_CASSERT_MSG(
+                        Valid_directions_are_0_to_2_for_X_to_Z__in_fileOutput_param,
+                        ((T_direction) >= 0));
+                    PMACC_CASSERT_MSG(
+                        Valid_directions_are_0_to_2_for_X_to_Z__in_fileOutput_param,
+                        ((T_direction) < 3));
 
-        HDINLINE float1_64
-        getUnit() const;
+                    HDINLINE float1_64 getUnit() const;
 
-        HINLINE std::vector<float_64>
-        getUnitDimension() const
-        {
-           /* L, M, T, I, theta, N, J
-            *
-            * The ratio between momentum in a certain direction direction to
-            * total momentum is unitless.
-            */
-           std::vector<float_64> unitDimension( 7, 0.0 );
+                    HINLINE std::vector<float_64> getUnitDimension() const
+                    {
+                        /* L, M, T, I, theta, N, J
+                         *
+                         * The ratio between momentum in a certain direction to
+                         * total momentum is unitless.
+                         */
+                        std::vector<float_64> unitDimension(7, 0.0);
 
-           return unitDimension;
-        }
+                        return unitDimension;
+                    }
 
-        HINLINE static
-        std::string
-        getName()
-        {
-            return "particleMomentumComponent";
-        }
+                    HINLINE static std::string getName()
+                    {
+                        return "particleMomentumComponent";
+                    }
 
-        /** Calculate a new attribute per particle
-         *
-         * Returns a new (on-the-fly calculated) attribute of a particle
-         * that can then be mapped to the cells the particle contributes to.
-         * This method is called on a per-thread basis (each thread of a block
-         * handles a particle of a frame).
-         *
-         * \tparam T_Particle particle in the frame
-         * \param particle particle in the frame
-         *
-         * \return new attribute for the particle (type \see T_AttributeType)
-         */
-        template< class T_Particle >
-        DINLINE float_X
-        operator()( T_Particle& particle ) const;
-    };
-} // namespace derivedAttributes
-} // namespace particleToGrid
+                    /** Calculate a new attribute per particle
+                     *
+                     * Returns a new (on-the-fly calculated) attribute of a particle
+                     * that can then be mapped to the cells the particle contributes to.
+                     * This method is called on a per-thread basis (each thread of a block
+                     * handles a particle of a frame).
+                     *
+                     * \tparam T_Particle particle in the frame
+                     * \param particle particle in the frame
+                     *
+                     * \return new attribute for the particle (type \see T_AttributeType)
+                     */
+                    template<class T_Particle>
+                    DINLINE float_X operator()(T_Particle& particle) const;
+                };
+            } // namespace derivedAttributes
+        } // namespace particleToGrid
 
-namespace traits
-{
-    template<
-        typename T_Species,
-        size_t T_direction
-    >
-    struct SpeciesEligibleForSolver<
-        T_Species,
-        particleToGrid::derivedAttributes::MomentumComponent< T_direction >
-    >
-    {
-        using FrameType = typename T_Species::FrameType;
+        namespace traits
+        {
+            template<typename T_Species, size_t T_direction>
+            struct SpeciesEligibleForSolver<
+                T_Species,
+                particleToGrid::derivedAttributes::MomentumComponent<T_direction>>
+            {
+                using FrameType = typename T_Species::FrameType;
 
-        using RequiredIdentifiers = MakeSeq_t<
-            position<>,
-            momentum
-        >;
+                using RequiredIdentifiers = MakeSeq_t<position<>, momentum>;
 
-        using type = typename pmacc::traits::HasIdentifiers<
-            FrameType,
-            RequiredIdentifiers
-        >::type;
-    };
-} // namespace traits
-} // namespace particles
+                using type = typename pmacc::traits::HasIdentifiers<FrameType, RequiredIdentifiers>::type;
+            };
+        } // namespace traits
+    } // namespace particles
 } // namespace picongpu
diff --git a/include/picongpu/particles/particleToGrid/derivedAttributes/MomentumComponent.hpp b/include/picongpu/particles/particleToGrid/derivedAttributes/MomentumComponent.hpp
index 652fede7a5..94ecba2a78 100644
--- a/include/picongpu/particles/particleToGrid/derivedAttributes/MomentumComponent.hpp
+++ b/include/picongpu/particles/particleToGrid/derivedAttributes/MomentumComponent.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2016-2020 Axel Huebl
+/* Copyright 2016-2021 Axel Huebl
  *
  * This file is part of PIConGPU.
  *
@@ -26,42 +26,37 @@
 
 namespace picongpu
 {
-namespace particles
-{
-namespace particleToGrid
-{
-namespace derivedAttributes
-{
-
-    template< size_t T_direction>
-    HDINLINE float1_64
-    MomentumComponent<T_direction>::getUnit() const
+    namespace particles
     {
-        return 1.0;
-    }
-
-    template< size_t T_direction>
-    template< class T_Particle >
-    DINLINE float_X
-    MomentumComponent<T_direction>::operator()( T_Particle& particle ) const
-    {
-        // read existing attributes
-        const float3_X mom = particle[momentum_];
-
-        // calculate new attribute: |p| and p.[x|y|z]
-        const float_X momAbs = math::abs(mom);
-        const float_X momCom = mom[T_direction];
-
-        // total momentum == 0 then perpendicular measure shall be zero, too
-        // values: [-1.:1.]
-        const float_X momComOverTotal = (momAbs > float_X(0.)) ?
-            momCom / momAbs :
-            float_X(0.);
-
-        // return attribute
-        return momComOverTotal;
-    }
-} // namespace derivedAttributes
-} // namespace particleToGrid
-} // namespace particles
+        namespace particleToGrid
+        {
+            namespace derivedAttributes
+            {
+                template<size_t T_direction>
+                HDINLINE float1_64 MomentumComponent<T_direction>::getUnit() const
+                {
+                    return 1.0;
+                }
+
+                template<size_t T_direction>
+                template<class T_Particle>
+                DINLINE float_X MomentumComponent<T_direction>::operator()(T_Particle& particle) const
+                {
+                    // read existing attributes
+                    const float3_X mom = particle[momentum_];
+
+                    // calculate new attribute: |p| and p.[x|y|z]
+                    const float_X momAbs = math::abs(mom);
+                    const float_X momCom = mom[T_direction];
+
+                    // total momentum == 0 then perpendicular measure shall be zero, too
+                    // values: [-1.:1.]
+                    const float_X momComOverTotal = (momAbs > float_X(0.)) ? momCom / momAbs : float_X(0.);
+
+                    // return attribute
+                    return momComOverTotal;
+                }
+            } // namespace derivedAttributes
+        } // namespace particleToGrid
+    } // namespace particles
 } // namespace picongpu
diff --git a/include/picongpu/particles/pusher/Traits.hpp b/include/picongpu/particles/pusher/Traits.hpp
new file mode 100644
index 0000000000..76696fcf66
--- /dev/null
+++ b/include/picongpu/particles/pusher/Traits.hpp
@@ -0,0 +1,50 @@
+/* Copyright 2020-2021 Sergei Bastrakov
+ *
+ * This file is part of PIConGPU.
+ *
+ * PIConGPU is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * PIConGPU is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with PIConGPU.
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "picongpu/particles/pusher/particlePusherComposite.hpp"
+
+#include <pmacc/traits/IsBaseTemplateOf.hpp>
+
+#include <type_traits>
+
+
+namespace picongpu
+{
+    namespace particles
+    {
+        namespace pusher
+        {
+            /** Check if a pusher type is composite (i.e. uses several underlying pushers)
+             *
+             * The only composite pusher types are children of instantiations of the
+             * particlePusherComposite::Push class template.
+             *
+             * @tparam T_Pusher pusher type to check
+             * @treturn ::type std::true_type if T_Pusher is composite, otherwise std::false_type
+             */
+            template<typename T_Pusher>
+            struct IsComposite : public pmacc::traits::IsBaseTemplateOf_t<particlePusherComposite::Push, T_Pusher>
+            {
+            };
+
+        } // namespace pusher
+    } // namespace particles
+} // namespace picongpu
diff --git a/include/picongpu/particles/pusher/particlePusherAcceleration.hpp b/include/picongpu/particles/pusher/particlePusherAcceleration.hpp
index 8bd5fe001a..8254a84a40 100644
--- a/include/picongpu/particles/pusher/particlePusherAcceleration.hpp
+++ b/include/picongpu/particles/pusher/particlePusherAcceleration.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera,
+/* Copyright 2013-2021 Heiko Burau, Rene Widera,
  *                     Richard Pausch, Klaus Steiniger
  *
  * This file is part of PIConGPU.
@@ -27,75 +27,71 @@
 
 namespace picongpu
 {
-namespace particlePusherAcceleration
-{
-
-struct UnitlessParam : public particlePusherAccelerationParam
-{
-    /** Normalize input values from `pusher.param` to PIC units */
-    static constexpr float_X AMPLITUDEx = float_X(AMPLITUDEx_SI / UNIT_EFIELD); // unit: Volt / meter
-    static constexpr float_X AMPLITUDEy = float_X(AMPLITUDEy_SI / UNIT_EFIELD); // unit: Volt / meter
-    static constexpr float_X AMPLITUDEz = float_X(AMPLITUDEz_SI / UNIT_EFIELD); // unit: Volt / meter
-
-    static constexpr float_X ACCELERATION_TIME =  float_X(ACCELERATION_TIME_SI / UNIT_TIME); // unit: second
-
-};
-
-template<class Velocity, class Gamma>
-struct Push
-{
-    /* this is an optional extension for sub-sampling pushes that enables grid to particle interpolation
-     * for particle positions outside the super cell in one push
-     */
-    using LowerMargin = pmacc::math::CT::make_Int<simDim,0>::type;
-    using UpperMargin = pmacc::math::CT::make_Int<simDim,0>::type;
-
-    template< typename T_FunctorFieldE, typename T_FunctorFieldB, typename T_Particle, typename T_Pos >
-    HDINLINE void operator()(
-        const T_FunctorFieldB,
-        const T_FunctorFieldE,
-        T_Particle & particle,
-        T_Pos & pos,
-        const uint32_t currentStep
-    )
+    namespace particlePusherAcceleration
     {
-        using UnitlessParam = ::picongpu::particlePusherAcceleration::UnitlessParam;
-
-        float_X const weighting = particle[ weighting_ ];
-        float_X const mass = attribute::getMass( weighting, particle );
-        float_X const charge = attribute::getCharge( weighting, particle );
-
-        using MomType = momentum::type;
-        MomType new_mom = particle[ momentum_ ];
-
-        const float_X deltaT = DELTA_T;
-
-        // normalize input SI values to
-        const float3_X eField(UnitlessParam::AMPLITUDEx, UnitlessParam::AMPLITUDEy, UnitlessParam::AMPLITUDEz);
-
-        /* ToDo: Refactor to ensure a smooth and slow increase of eField with time
-         * which may help to reduce radiation due to acceleration, if present.
-         */
-        if ( currentStep * DELTA_T <= UnitlessParam::ACCELERATION_TIME )
-            new_mom += charge * eField * deltaT;
-
-        particle[ momentum_ ] = new_mom;
-
-        Velocity velocity;
-        const float3_X vel = velocity( new_mom, mass );
-
-        for( uint32_t d = 0; d < simDim; ++d )
+        struct UnitlessParam : public particlePusherAccelerationParam
         {
-            pos[d] += ( vel[d] * deltaT ) / cellSize[d];
-        }
+            /** Normalize input values from `pusher.param` to PIC units */
+            static constexpr float_X AMPLITUDEx = float_X(AMPLITUDEx_SI / UNIT_EFIELD); // unit: Volt / meter
+            static constexpr float_X AMPLITUDEy = float_X(AMPLITUDEy_SI / UNIT_EFIELD); // unit: Volt / meter
+            static constexpr float_X AMPLITUDEz = float_X(AMPLITUDEz_SI / UNIT_EFIELD); // unit: Volt / meter
 
-    }
+            static constexpr float_X ACCELERATION_TIME = float_X(ACCELERATION_TIME_SI / UNIT_TIME); // unit: second
+        };
 
-    static pmacc::traits::StringProperty getStringProperties()
-    {
-        pmacc::traits::StringProperty propList( "name", "Acceleration" );
-        return propList;
-    }
-};
-} // namespace particlePusherAcceleration
+        template<class Velocity, class Gamma>
+        struct Push
+        {
+            /* this is an optional extension for sub-sampling pushes that enables grid to particle interpolation
+             * for particle positions outside the super cell in one push
+             */
+            using LowerMargin = pmacc::math::CT::make_Int<simDim, 0>::type;
+            using UpperMargin = pmacc::math::CT::make_Int<simDim, 0>::type;
+
+            template<typename T_FunctorFieldE, typename T_FunctorFieldB, typename T_Particle, typename T_Pos>
+            HDINLINE void operator()(
+                const T_FunctorFieldB,
+                const T_FunctorFieldE,
+                T_Particle& particle,
+                T_Pos& pos,
+                const uint32_t currentStep)
+            {
+                using UnitlessParam = ::picongpu::particlePusherAcceleration::UnitlessParam;
+
+                float_X const weighting = particle[weighting_];
+                float_X const mass = attribute::getMass(weighting, particle);
+                float_X const charge = attribute::getCharge(weighting, particle);
+
+                using MomType = momentum::type;
+                MomType new_mom = particle[momentum_];
+
+                const float_X deltaT = DELTA_T;
+
+                // constant field from pusher.param, already normalized to PIC units (field functors are ignored)
+                const float3_X eField(UnitlessParam::AMPLITUDEx, UnitlessParam::AMPLITUDEy, UnitlessParam::AMPLITUDEz);
+
+                /* ToDo: Refactor to ensure a smooth and slow increase of eField with time
+                 * which may help to reduce radiation due to acceleration, if present.
+                 */
+                if(currentStep * DELTA_T <= UnitlessParam::ACCELERATION_TIME)
+                    new_mom += charge * eField * deltaT;
+
+                particle[momentum_] = new_mom;
+
+                Velocity velocity;
+                const float3_X vel = velocity(new_mom, mass);
+
+                for(uint32_t d = 0; d < simDim; ++d)
+                {
+                    pos[d] += (vel[d] * deltaT) / cellSize[d];
+                }
+            }
+
+            static pmacc::traits::StringProperty getStringProperties()
+            {
+                pmacc::traits::StringProperty propList("name", "Acceleration");
+                return propList;
+            }
+        };
+    } // namespace particlePusherAcceleration
 } // namespace picongpu
diff --git a/include/picongpu/particles/pusher/particlePusherAxel.hpp b/include/picongpu/particles/pusher/particlePusherAxel.hpp
index c3c410b917..cec0a358ee 100644
--- a/include/picongpu/particles/pusher/particlePusherAxel.hpp
+++ b/include/picongpu/particles/pusher/particlePusherAxel.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Heiko Burau, Rene Widera, Richard Pausch,
+/* Copyright 2013-2021 Axel Huebl, Heiko Burau, Rene Widera, Richard Pausch,
  *                     Benjamin Worpitz
  *
  * This file is part of PIConGPU.
@@ -27,7 +27,7 @@
 
 // That is a sum over two out of 3 coordinates, as described in the script
 // above. (See Ref.!)
-#define FOR_JK_NOT_I(I,J,K,code) (code(I,J,K)) + (code(I, K, J))
+#define FOR_JK_NOT_I(I, J, K, code) (code(I, J, K)) + (code(I, K, J))
 
 #include <pmacc/types.hpp>
 
@@ -35,15 +35,14 @@ namespace picongpu
 {
     namespace particlePusherAxel
     {
-
         template<class Velocity, class Gamma>
         struct Push
         {
             /* this is an optional extension for sub-sampling pushes that enables grid to particle interpolation
              * for particle positions outside the super cell in one push
              */
-            using LowerMargin = typename pmacc::math::CT::make_Int<simDim,0>::type;
-            using UpperMargin = typename pmacc::math::CT::make_Int<simDim,0>::type;
+            using LowerMargin = typename pmacc::math::CT::make_Int<simDim, 0>::type;
+            using UpperMargin = typename pmacc::math::CT::make_Int<simDim, 0>::type;
 
             enum coords
             {
@@ -52,80 +51,83 @@ namespace picongpu
                 z = 2
             };
 
-            HDINLINE float_X levichivita( const unsigned int i, const unsigned int j, const unsigned int k )
+            HDINLINE float_X levichivita(const unsigned int i, const unsigned int j, const unsigned int k)
             {
-                if( i == j || j == k || i == k ) return float_X(0.0);
+                if(i == j || j == k || i == k)
+                    return float_X(0.0);
 
-                if( i == x && j == y ) return float_X(1.0);
-                if( i == z && j == x ) return float_X(1.0);
-                if( i == y && j == z ) return float_X(1.0);
+                if(i == x && j == y)
+                    return float_X(1.0);
+                if(i == z && j == x)
+                    return float_X(1.0);
+                if(i == y && j == z)
+                    return float_X(1.0);
 
                 return float_X(-1.0);
             }
 
-            template<
-                typename T_FunctorFieldE,
-                typename T_FunctorFieldB,
-                typename T_Particle,
-                typename T_Pos
-            >
+            template<typename T_FunctorFieldE, typename T_FunctorFieldB, typename T_Particle, typename T_Pos>
             HDINLINE void operator()(
                 const T_FunctorFieldB functorBField, /* at t=0 */
                 const T_FunctorFieldE functorEField, /* at t=0 */
-                T_Particle & particle,
-                T_Pos & pos, /* at t=0 */
-                const uint32_t
-            )
+                T_Particle& particle,
+                T_Pos& pos, /* at t=0 */
+                const uint32_t)
             {
-                float_X const weighting = particle[ weighting_ ];
-                float_X const mass = attribute::getMass( weighting, particle );
-                float_X const charge = attribute::getCharge( weighting, particle );
+                float_X const weighting = particle[weighting_];
+                float_X const mass = attribute::getMass(weighting, particle);
+                float_X const charge = attribute::getCharge(weighting, particle);
 
                 using MomType = momentum::type;
-                MomType mom = particle[ momentum_ ];
+                MomType mom = particle[momentum_];
 
-                auto bField  = functorBField(pos);
-                auto eField  = functorEField(pos);
+                auto bField = functorBField(pos);
+                auto eField = functorEField(pos);
 
                 Gamma gammaCalc;
                 Velocity velocityCalc;
                 const float_X epsilon = 1.0e-6;
                 const float_X deltaT = DELTA_T;
 
-                //const float3_X velocity_atMinusHalf = velocity(mom, mass);
-                const float_X gamma = gammaCalc( mom, mass );
+                // const float3_X velocity_atMinusHalf = velocity(mom, mass);
+                const float_X gamma = gammaCalc(mom, mass);
 
                 const MomType mom_old = mom;
 
-                const float_X B2 = math::abs2( bField );
-                const float_X B = math::abs( bField );
+                const float_X B2 = pmacc::math::abs2(bField);
+                const float_X B = math::abs(bField);
 
-                if( B2 > epsilon )
+                if(B2 > epsilon)
                 {
                     trigo_X sinres;
                     trigo_X cosres;
                     trigo_X arg = B * charge * deltaT / gamma;
-                    math::sincos( arg, sinres, cosres );
+                    pmacc::math::sincos(arg, sinres, cosres);
 
-                    mom.x() = bField.x() * bField.x() * ( eField.x() * charge * deltaT + mom_old.x() );
-                    mom.y() = bField.y() * bField.y() * ( eField.y() * charge * deltaT + mom_old.y() );
-                    mom.z() = bField.z() * bField.z() * ( eField.z() * charge * deltaT + mom_old.z() );
+                    mom.x() = bField.x() * bField.x() * (eField.x() * charge * deltaT + mom_old.x());
+                    mom.y() = bField.y() * bField.y() * (eField.y() * charge * deltaT + mom_old.y());
+                    mom.z() = bField.z() * bField.z() * (eField.z() * charge * deltaT + mom_old.z());
 
-#define SUM_PLINE1(I,J,K) bField.J() * ( -levichivita(I,J,K) * gamma * eField.K() + bField.I() * ( eField.J() * charge * deltaT + mom_old.J() ) )
-#define SUM_PLINE2(I,J,K) -bField.J() * ( -levichivita(I,J,K) * gamma * eField.K() + bField.I() * mom_old.J() - bField.J() * mom_old.I() )
-#define SUM_PLINE3(I,J,K) bField.J() * bField.J() * gamma * eField.I() - bField.I() * bField.J() * gamma * eField.J() + levichivita(I,J,K) * mom_old.J() * bField.K() * B2
+#define SUM_PLINE1(I, J, K)                                                                                           \
+    bField.J()                                                                                                        \
+        * (-levichivita(I, J, K) * gamma * eField.K() + bField.I() * (eField.J() * charge * deltaT + mom_old.J()))
+#define SUM_PLINE2(I, J, K)                                                                                           \
+    -bField.J() * (-levichivita(I, J, K) * gamma * eField.K() + bField.I() * mom_old.J() - bField.J() * mom_old.I())
+#define SUM_PLINE3(I, J, K)                                                                                           \
+    bField.J() * bField.J() * gamma* eField.I() - bField.I() * bField.J() * gamma* eField.J()                         \
+        + levichivita(I, J, K) * mom_old.J() * bField.K() * B2
 
-                    mom.x() += FOR_JK_NOT_I( x, y, z, SUM_PLINE1 );
-                    mom.x() += float_X(cosres ) * ( FOR_JK_NOT_I( x, y, z, SUM_PLINE2 ) );
-                    mom.x() += float_X(sinres ) / B * ( FOR_JK_NOT_I( x, y, z, SUM_PLINE3 ) );
+                    mom.x() += FOR_JK_NOT_I(x, y, z, SUM_PLINE1);
+                    mom.x() += float_X(cosres) * (FOR_JK_NOT_I(x, y, z, SUM_PLINE2));
+                    mom.x() += float_X(sinres) / B * (FOR_JK_NOT_I(x, y, z, SUM_PLINE3));
 
-                    mom.y() += FOR_JK_NOT_I( y, z, x, SUM_PLINE1 );
-                    mom.y() += float_X(cosres ) * ( FOR_JK_NOT_I( y, z, x, SUM_PLINE2 ) );
-                    mom.y() += float_X(sinres ) / B * ( FOR_JK_NOT_I( y, z, x, SUM_PLINE3 ) );
+                    mom.y() += FOR_JK_NOT_I(y, z, x, SUM_PLINE1);
+                    mom.y() += float_X(cosres) * (FOR_JK_NOT_I(y, z, x, SUM_PLINE2));
+                    mom.y() += float_X(sinres) / B * (FOR_JK_NOT_I(y, z, x, SUM_PLINE3));
 
-                    mom.z() += FOR_JK_NOT_I( z, x, y, SUM_PLINE1 );
-                    mom.z() += float_X(cosres ) * ( FOR_JK_NOT_I( z, x, y, SUM_PLINE2 ) );
-                    mom.z() += float_X(sinres ) / B * ( FOR_JK_NOT_I( z, x, y, SUM_PLINE3 ) );
+                    mom.z() += FOR_JK_NOT_I(z, x, y, SUM_PLINE1);
+                    mom.z() += float_X(cosres) * (FOR_JK_NOT_I(z, x, y, SUM_PLINE2));
+                    mom.z() += float_X(sinres) / B * (FOR_JK_NOT_I(z, x, y, SUM_PLINE3));
 
                     mom *= float_X(1.0) / B2;
                 }
@@ -134,58 +136,79 @@ namespace picongpu
                     mom += eField * charge * deltaT;
                 }
 
-                particle[ momentum_ ] = mom;
+                particle[momentum_] = mom;
 
                 float3_X dr(float3_X::create(0.0));
 
                 // old spacial change calculation: linear step
-                if( TrajectoryInterpolation == LINEAR )
+                if(TrajectoryInterpolation == LINEAR)
                 {
-                    const float3_X vel = velocityCalc( mom, mass );
-                    dr = float3_X( vel.x() * deltaT / CELL_WIDTH,
-                                                   vel.y() * deltaT / CELL_HEIGHT,
-                                                   vel.z() * deltaT / CELL_DEPTH );
+                    const float3_X vel = velocityCalc(mom, mass);
+                    dr = float3_X(
+                        vel.x() * deltaT / CELL_WIDTH,
+                        vel.y() * deltaT / CELL_HEIGHT,
+                        vel.z() * deltaT / CELL_DEPTH);
                 }
 
                 // new spacial change calculation
-                if( TrajectoryInterpolation == NONLINEAR )
+                if(TrajectoryInterpolation == NONLINEAR)
                 {
-                    const float3_X vel_old = velocityCalc( mom_old, mass );
+                    const float3_X vel_old = velocityCalc(mom_old, mass);
                     const float_X QoM = charge / mass;
                     const float_X B4 = B2 * B2;
                     float3_X r = pos;
 
-                    if( B4 > epsilon )
+                    if(B4 > epsilon)
                     {
                         trigo_X sinres;
                         trigo_X cosres;
                         trigo_X arg = B * QoM * deltaT / SPEED_OF_LIGHT;
-                        math::sincos( arg, sinres, cosres );
+                        pmacc::math::sincos(arg, sinres, cosres);
 
                         r.x() = bField.x() * bField.x() * bField.x() * bField.x() * QoM
-                            * ( eField.x() * QoM * deltaT * deltaT + 2.0f * ( deltaT * vel_old.x() + pos.x() ) );
-
-#define SUM_RLINE1(I,J,K) 2.0 * bField.J() * bField.J() * bField.J() * bField.J() * QoM * pos.x() \
-                    + 2.0 * bField.J() * bField.J() * bField.K() * bField.K() * QoM * pos.x() \
-                    + bField.J() * bField.J() * bField.J() * ( -levichivita(I,J,K) * 2.0 * SPEED_OF_LIGHT * ( eField.K() * QoM * deltaT + vel_old.K() ) + bField.I() * QoM * deltaT * ( eField.J() * QoM * deltaT + 2.0 * vel_old.J() ) ) \
-                    + bField.J() * bField.J() * ( 2.0 * SPEED_OF_LIGHT * SPEED_OF_LIGHT * eField.I() + bField.I() * bField.I() * QoM * ( eField.I() * QoM * deltaT * deltaT + 2.0 * deltaT * vel_old.I() + 4.0 * pos.I() ) + levichivita(I,J,K) * 2.0 * SPEED_OF_LIGHT * bField.K() * vel_old.J() + bField.K() * QoM * ( levichivita(I,J,K) * 2.0 * eField.J() * SPEED_OF_LIGHT * deltaT + bField.I() * bField.K() * QoM * deltaT * deltaT ) ) \
-                    + bField.I() * bField.J() * ( bField.I() * bField.I() * QoM * deltaT * ( eField.J() * QoM * deltaT + 2.0 * vel_old.J() ) - levichivita(I,J,K) * 2.0 * bField.I() * SPEED_OF_LIGHT * ( eField.K() * QoM * deltaT + vel_old.K() ) - 2.0 * SPEED_OF_LIGHT * SPEED_OF_LIGHT * eField.J() )
-
-#define SUM_RLINE2(I,J,K) - bField.J() * ( SPEED_OF_LIGHT * eField.I() * bField.J() - levichivita(I,J,K) * bField.J() * bField.J() * vel_old.K() - bField.I() * SPEED_OF_LIGHT * eField.J() - levichivita(I,J,K) * bField.J() * vel_old.K() * ( bField.I() * bField.I() + bField.K() *bField.K() ) )
-
-#define SUM_RLINE3(I,J,K) levichivita(I,J,K) * bField.J() * ( SPEED_OF_LIGHT * eField.K() + levichivita(I,J,K) * ( bField.J() * vel_old.I() - bField.I() * vel_old.J() ) )
-
-                        r.x() += FOR_JK_NOT_I( x, y, z, SUM_RLINE1 );
-                        r.x() += float_X(cosres ) * 2.0 * SPEED_OF_LIGHT * ( FOR_JK_NOT_I( x, y, z, SUM_RLINE2 ) );
-                        r.x() += float_X(sinres ) * 2.0 * SPEED_OF_LIGHT * B * ( FOR_JK_NOT_I( x, y, z, SUM_RLINE3 ) );
-
-                        r.y() += FOR_JK_NOT_I( y, z, x, SUM_RLINE1 );
-                        r.y() += float_X(cosres ) * 2.0 * SPEED_OF_LIGHT * ( FOR_JK_NOT_I( y, z, x, SUM_RLINE2 ) );
-                        r.y() += float_X(sinres ) * 2.0 * SPEED_OF_LIGHT * B * ( FOR_JK_NOT_I( y, z, x, SUM_RLINE3 ) );
-
-                        r.z() += FOR_JK_NOT_I( z, x, y, SUM_RLINE1 );
-                        r.z() += float_X(cosres ) * 2.0 * SPEED_OF_LIGHT * ( FOR_JK_NOT_I( z, x, y, SUM_RLINE2 ) );
-                        r.z() += float_X(sinres ) * 2.0 * SPEED_OF_LIGHT * B * ( FOR_JK_NOT_I( z, x, y, SUM_RLINE3 ) );
+                            * (eField.x() * QoM * deltaT * deltaT + 2.0f * (deltaT * vel_old.x() + pos.x()));
+
+#define SUM_RLINE1(I, J, K)                                                                                           \
+    2.0 * bField.J() * bField.J() * bField.J() * bField.J() * QoM* pos.x()                                            \
+        + 2.0 * bField.J() * bField.J() * bField.K() * bField.K() * QoM* pos.x()                                      \
+        + bField.J() * bField.J() * bField.J()                                                                        \
+            * (-levichivita(I, J, K) * 2.0 * SPEED_OF_LIGHT * (eField.K() * QoM * deltaT + vel_old.K())               \
+               + bField.I() * QoM * deltaT * (eField.J() * QoM * deltaT + 2.0 * vel_old.J()))                         \
+        + bField.J() * bField.J()                                                                                     \
+            * (2.0 * SPEED_OF_LIGHT * SPEED_OF_LIGHT * eField.I()                                                     \
+               + bField.I() * bField.I() * QoM                                                                        \
+                   * (eField.I() * QoM * deltaT * deltaT + 2.0 * deltaT * vel_old.I() + 4.0 * pos.I())                \
+               + levichivita(I, J, K) * 2.0 * SPEED_OF_LIGHT * bField.K() * vel_old.J()                               \
+               + bField.K() * QoM                                                                                     \
+                   * (levichivita(I, J, K) * 2.0 * eField.J() * SPEED_OF_LIGHT * deltaT                               \
+                      + bField.I() * bField.K() * QoM * deltaT * deltaT))                                             \
+        + bField.I() * bField.J()                                                                                     \
+            * (bField.I() * bField.I() * QoM * deltaT * (eField.J() * QoM * deltaT + 2.0 * vel_old.J())               \
+               - levichivita(I, J, K) * 2.0 * bField.I() * SPEED_OF_LIGHT * (eField.K() * QoM * deltaT + vel_old.K()) \
+               - 2.0 * SPEED_OF_LIGHT * SPEED_OF_LIGHT * eField.J())
+
+#define SUM_RLINE2(I, J, K)                                                                                           \
+    -bField.J()                                                                                                       \
+        * (SPEED_OF_LIGHT * eField.I() * bField.J() - levichivita(I, J, K) * bField.J() * bField.J() * vel_old.K()    \
+           - bField.I() * SPEED_OF_LIGHT * eField.J()                                                                 \
+           - levichivita(I, J, K) * bField.J() * vel_old.K() * (bField.I() * bField.I() + bField.K() * bField.K()))
+
+#define SUM_RLINE3(I, J, K)                                                                                           \
+    levichivita(I, J, K) * bField.J()                                                                                 \
+        * (SPEED_OF_LIGHT * eField.K()                                                                                \
+           + levichivita(I, J, K) * (bField.J() * vel_old.I() - bField.I() * vel_old.J()))
+
+                        r.x() += FOR_JK_NOT_I(x, y, z, SUM_RLINE1);
+                        r.x() += float_X(cosres) * 2.0 * SPEED_OF_LIGHT * (FOR_JK_NOT_I(x, y, z, SUM_RLINE2));
+                        r.x() += float_X(sinres) * 2.0 * SPEED_OF_LIGHT * B * (FOR_JK_NOT_I(x, y, z, SUM_RLINE3));
+
+                        r.y() += FOR_JK_NOT_I(y, z, x, SUM_RLINE1);
+                        r.y() += float_X(cosres) * 2.0 * SPEED_OF_LIGHT * (FOR_JK_NOT_I(y, z, x, SUM_RLINE2));
+                        r.y() += float_X(sinres) * 2.0 * SPEED_OF_LIGHT * B * (FOR_JK_NOT_I(y, z, x, SUM_RLINE3));
+
+                        r.z() += FOR_JK_NOT_I(z, x, y, SUM_RLINE1);
+                        r.z() += float_X(cosres) * 2.0 * SPEED_OF_LIGHT * (FOR_JK_NOT_I(z, x, y, SUM_RLINE2));
+                        r.z() += float_X(sinres) * 2.0 * SPEED_OF_LIGHT * B * (FOR_JK_NOT_I(z, x, y, SUM_RLINE3));
 
                         r *= float_X(0.5) / B4 / QoM;
                     }
@@ -196,7 +219,6 @@ namespace picongpu
                     dr = r - pos;
 
                     dr *= float3_X::create(1.0) / cellSize;
-
                 }
 
                 pos += dr;
@@ -204,12 +226,10 @@ namespace picongpu
 
             static pmacc::traits::StringProperty getStringProperties()
             {
-                pmacc::traits::StringProperty propList( "name", "other" );
+                pmacc::traits::StringProperty propList("name", "other");
                 propList["param"] = "semi analytical, Axel Huebl (2011)";
                 return propList;
             }
         };
-    } //namespace
-}
-
-
+    } // namespace particlePusherAxel
+} // namespace picongpu
diff --git a/include/picongpu/particles/pusher/particlePusherBoris.hpp b/include/picongpu/particles/pusher/particlePusherBoris.hpp
index 69736c9b85..46c22e63b9 100644
--- a/include/picongpu/particles/pusher/particlePusherBoris.hpp
+++ b/include/picongpu/particles/pusher/particlePusherBoris.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera, Richard Pausch
+/* Copyright 2013-2021 Heiko Burau, Rene Widera, Richard Pausch
  *
  * This file is part of PIConGPU.
  *
@@ -26,70 +26,67 @@
 
 namespace picongpu
 {
-namespace particlePusherBoris
-{
-
-template<class Velocity, class Gamma>
-struct Push
-{
-    /* this is an optional extension for sub-sampling pushes that enables grid to particle interpolation
-     * for particle positions outside the super cell in one push
-     */
-    using LowerMargin = typename pmacc::math::CT::make_Int<simDim,0>::type;
-    using UpperMargin = typename pmacc::math::CT::make_Int<simDim,0>::type;
-
-    template< typename T_FunctorFieldE, typename T_FunctorFieldB, typename T_Particle, typename T_Pos >
-    HDINLINE void operator()(
-        const T_FunctorFieldB functorBField,
-        const T_FunctorFieldE functorEField,
-        T_Particle & particle,
-        T_Pos & pos,
-        const uint32_t
-    )
+    namespace particlePusherBoris
     {
-        float_X const weighting = particle[ weighting_ ];
-        float_X const mass = attribute::getMass( weighting, particle );
-        float_X const charge = attribute::getCharge( weighting, particle );
-
-        using MomType = momentum::type;
-        MomType const mom = particle[ momentum_ ];
-
-        auto bField  = functorBField(pos);
-        auto eField  = functorEField(pos);
-
-        const float_X QoM = charge / mass;
-
-        const float_X deltaT = DELTA_T;
-
-        const MomType mom_minus = mom + float_X(0.5) * charge * eField * deltaT;
-
-        Gamma gamma;
-        const float_X gamma_reci = float_X(1.0) / gamma(mom_minus, mass);
-        const float3_X t = float_X(0.5) * QoM * bField * gamma_reci * deltaT;
-        auto s  = float_X(2.0) * t * (float_X(1.0) / (float_X(1.0) + math::abs2(t)));
-
-        const MomType mom_prime = mom_minus + math::cross(mom_minus, t);
-        const MomType mom_plus = mom_minus + math::cross(mom_prime, s);
-
-        const MomType new_mom = mom_plus + float_X(0.5) * charge * eField * deltaT;
-
-        particle[ momentum_ ] = new_mom;
-
-        Velocity velocity;
-        const float3_X vel = velocity(new_mom, mass);
-
-        for(uint32_t d=0;d<simDim;++d)
+        template<class Velocity, class Gamma>
+        struct Push
         {
-            pos[d] += (vel[d] * deltaT) / cellSize[d];
-        }
-
-    }
-
-    static pmacc::traits::StringProperty getStringProperties()
-    {
-        pmacc::traits::StringProperty propList( "name", "Boris" );
-        return propList;
-    }
-};
-} // namespace particlePusherBoris
+            /* this is an optional extension for sub-sampling pushes that enables grid to particle interpolation
+             * for particle positions outside the super cell in one push
+             */
+            using LowerMargin = typename pmacc::math::CT::make_Int<simDim, 0>::type;
+            using UpperMargin = typename pmacc::math::CT::make_Int<simDim, 0>::type;
+
+            template<typename T_FunctorFieldE, typename T_FunctorFieldB, typename T_Particle, typename T_Pos>
+            HDINLINE void operator()(
+                const T_FunctorFieldB functorBField,
+                const T_FunctorFieldE functorEField,
+                T_Particle& particle,
+                T_Pos& pos,
+                const uint32_t)
+            {
+                float_X const weighting = particle[weighting_];
+                float_X const mass = attribute::getMass(weighting, particle);
+                float_X const charge = attribute::getCharge(weighting, particle);
+
+                using MomType = momentum::type;
+                MomType const mom = particle[momentum_];
+
+                auto bField = functorBField(pos);
+                auto eField = functorEField(pos);
+
+                const float_X QoM = charge / mass;
+
+                const float_X deltaT = DELTA_T;
+
+                const MomType mom_minus = mom + float_X(0.5) * charge * eField * deltaT;
+
+                Gamma gamma;
+                const float_X gamma_reci = float_X(1.0) / gamma(mom_minus, mass);
+                const float3_X t = float_X(0.5) * QoM * bField * gamma_reci * deltaT;
+                auto s = float_X(2.0) * t * (float_X(1.0) / (float_X(1.0) + pmacc::math::abs2(t)));
+
+                const MomType mom_prime = mom_minus + pmacc::math::cross(mom_minus, t);
+                const MomType mom_plus = mom_minus + pmacc::math::cross(mom_prime, s);
+
+                const MomType new_mom = mom_plus + float_X(0.5) * charge * eField * deltaT;
+
+                particle[momentum_] = new_mom;
+
+                Velocity velocity;
+                const float3_X vel = velocity(new_mom, mass);
+
+                for(uint32_t d = 0; d < simDim; ++d)
+                {
+                    pos[d] += (vel[d] * deltaT) / cellSize[d];
+                }
+            }
+
+            static pmacc::traits::StringProperty getStringProperties()
+            {
+                pmacc::traits::StringProperty propList("name", "Boris");
+                return propList;
+            }
+        };
+    } // namespace particlePusherBoris
 } // namespace picongpu
diff --git a/include/picongpu/particles/pusher/particlePusherComposite.hpp b/include/picongpu/particles/pusher/particlePusherComposite.hpp
new file mode 100644
index 0000000000..04e6a43f98
--- /dev/null
+++ b/include/picongpu/particles/pusher/particlePusherComposite.hpp
@@ -0,0 +1,139 @@
+/* Copyright 2020-2021 Sergei Bastrakov
+ *
+ * This file is part of PIConGPU.
+ *
+ * PIConGPU is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * PIConGPU is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with PIConGPU.
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "picongpu/simulation_defines.hpp"
+
+#include <pmacc/math/vector/compile-time/Vector.hpp>
+
+#include <cstdint>
+#include <string>
+
+
+namespace picongpu
+{
+    namespace particlePusherComposite
+    {
+        /** Concept for an activation functor for a composite pusher
+         *
+         * This struct only documents the interface expected from the corresponding
+         * template argument; it is not supposed to be used directly.
+         * However, a helper activator class to be reused is provided below.
+         */
+        struct ActivationFunctor
+        {
+            /** Select which pusher of the composite to use at the given time step
+             *
+             * @param currentStep current time iteration
+             * @return 1-based index of the pusher to use; a value out of the range
+             *         [1, #pushers] means no pusher to be used
+             */
+            HDINLINE uint32_t operator()(uint32_t const currentStep) const;
+        };
+
+        /** Ready-made activation functor for a composite of two pushers
+         *
+         * Selects the second pusher once currentStep has reached T_switchTimeStep
+         * and the first pusher for all earlier time steps.
+         */
+        template<uint32_t T_switchTimeStep>
+        struct BinarySwitchActivationFunctor
+        {
+            HDINLINE constexpr uint32_t operator()(uint32_t const currentStep) const
+            {
+                return currentStep >= T_switchTimeStep ? 2 : 1;
+            }
+        };
+
+        /** Composite of two particle pushers, each implementing the pusher concept.
+         *
+         * The activation functor decides for each time step which of the two pushers is used.
+         * The composite implements the pusher concept itself; however, for performance
+         * reasons special treatment is recommended during the particle push simulation stage.
+         *
+         * @tparam T_FirstPusher first pusher type, used when the activation functor returns 1
+         * @tparam T_SecondPusher second pusher type, used when the activation functor returns 2
+         * @tparam T_ActivationFunctor activation functor to decide which pusher to use,
+         *                             implements the ActivationFunctor concept
+         */
+        template<typename T_FirstPusher, typename T_SecondPusher, typename T_ActivationFunctor>
+        struct Push
+            : public T_FirstPusher
+            , T_SecondPusher
+        {
+            using FirstPusher = T_FirstPusher;
+            using SecondPusher = T_SecondPusher;
+            using ActivationFunctor = T_ActivationFunctor;
+
+            /* Margins are the pmacc::math::CT::max of the margins of both pushers. This is
+             * logically correct, but should not be used directly for the particle push stage.
+             */
+            using LowerMargin = typename pmacc::math::CT::max<
+                typename traits::GetLowerMargin<FirstPusher>::type,
+                typename traits::GetLowerMargin<SecondPusher>::type>::type;
+            using UpperMargin = typename pmacc::math::CT::
+                max<typename GetUpperMargin<FirstPusher>::type, typename GetUpperMargin<SecondPusher>::type>::type;
+
+            /** Get active pusher 1-based index
+             *
+             * @param currentStep current time iteration
+             * @return result of the activation functor; a value other than 1 or 2 means
+             *         no pusher should be used
+             */
+            static HDINLINE uint32_t activePusherIdx(uint32_t const currentStep)
+            {
+                return ActivationFunctor{}(currentStep);
+            }
+
+            /** Push one particle with the active pusher, do nothing if no pusher is active.
+             * Compatibility-only: should not be used for the particle push stage due to shared
+             * memory and register consumption.
+             * Not const-qualified, as it forwards to the non-const operator() of the pushers.
+             */
+            template<typename T_FunctorFieldE, typename T_FunctorFieldB, typename T_Particle, typename T_Pos>
+            HDINLINE void operator()(
+                T_FunctorFieldB const functorBField,
+                T_FunctorFieldE const functorEField,
+                T_Particle& particle,
+                T_Pos& pos,
+                uint32_t const currentStep)
+            {
+                auto const pusherIdx = activePusherIdx(currentStep);
+                if(pusherIdx == 1)
+                    FirstPusher::operator()(functorBField, functorEField, particle, pos, currentStep);
+                else if(pusherIdx == 2)
+                    SecondPusher::operator()(functorBField, functorEField, particle, pos, currentStep);
+            }
+
+            /** Get string properties with "name" set to "Composite of <first name> and <second name>" */
+            static pmacc::traits::StringProperty getStringProperties()
+            {
+                auto firstProperty = FirstPusher::getStringProperties();
+                auto secondProperty = SecondPusher::getStringProperties();
+                pmacc::traits::StringProperty propList(
+                    "name",
+                    std::string("Composite of ") + firstProperty["name"].value + " and "
+                        + secondProperty["name"].value);
+                return propList;
+            }
+        };
+
+    } // namespace particlePusherComposite
+} // namespace picongpu
diff --git a/include/picongpu/particles/pusher/particlePusherFree.hpp b/include/picongpu/particles/pusher/particlePusherFree.hpp
index 38a28f1ed4..9103a2aa23 100644
--- a/include/picongpu/particles/pusher/particlePusherFree.hpp
+++ b/include/picongpu/particles/pusher/particlePusherFree.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Heiko Burau, Rene Widera,
+/* Copyright 2013-2021 Axel Huebl, Heiko Burau, Rene Widera,
  *                     Richard Pausch
  *
  * This file is part of PIConGPU.
@@ -34,29 +34,28 @@ namespace picongpu
             /* this is an optional extension for sub-sampling pushes that enables grid to particle interpolation
              * for particle positions outside the super cell in one push
              */
-            using LowerMargin = typename pmacc::math::CT::make_Int<simDim,0>::type;
-            using UpperMargin = typename pmacc::math::CT::make_Int<simDim,0>::type;
+            using LowerMargin = typename pmacc::math::CT::make_Int<simDim, 0>::type;
+            using UpperMargin = typename pmacc::math::CT::make_Int<simDim, 0>::type;
 
-            template< typename T_FunctorFieldE, typename T_FunctorFieldB, typename T_Particle, typename T_Pos >
+            template<typename T_FunctorFieldE, typename T_FunctorFieldB, typename T_Particle, typename T_Pos>
             HDINLINE void operator()(
                 const T_FunctorFieldB functorBField,
                 const T_FunctorFieldE functorEField,
-                T_Particle & particle,
-                T_Pos & pos,
-                const uint32_t
-            )
+                T_Particle& particle,
+                T_Pos& pos,
+                const uint32_t)
             {
-                float_X const weighting = particle[ weighting_ ];
-                float_X const mass = attribute::getMass( weighting, particle );
+                float_X const weighting = particle[weighting_];
+                float_X const mass = attribute::getMass(weighting, particle);
 
                 using MomType = momentum::type;
-                MomType const mom = particle[ momentum_ ];
+                MomType const mom = particle[momentum_];
 
                 Velocity velocity;
                 const MomType vel = velocity(mom, mass);
 
 
-                for(uint32_t d=0;d<simDim;++d)
+                for(uint32_t d = 0; d < simDim; ++d)
                 {
                     pos[d] += (vel[d] * DELTA_T) / cellSize[d];
                 }
@@ -64,10 +63,10 @@ namespace picongpu
 
             static pmacc::traits::StringProperty getStringProperties()
             {
-                pmacc::traits::StringProperty propList( "name", "other" );
+                pmacc::traits::StringProperty propList("name", "other");
                 propList["param"] = "free streaming";
                 return propList;
             }
         };
-    } //namespace
-}
+    } // namespace particlePusherFree
+} // namespace picongpu
diff --git a/include/picongpu/particles/pusher/particlePusherHigueraCary.hpp b/include/picongpu/particles/pusher/particlePusherHigueraCary.hpp
new file mode 100644
index 0000000000..27403528cf
--- /dev/null
+++ b/include/picongpu/particles/pusher/particlePusherHigueraCary.hpp
@@ -0,0 +1,145 @@
+/* Copyright 2013-2021 Heiko Burau, Rene Widera, Richard Pausch, Annegret Roeszler, Klaus Steiniger
+ *
+ * This file is part of PIConGPU.
+ *
+ * PIConGPU is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * PIConGPU is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with PIConGPU.
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "picongpu/simulation_defines.hpp"
+#include "picongpu/traits/attribute/GetMass.hpp"
+#include "picongpu/traits/attribute/GetCharge.hpp"
+
+
+namespace picongpu
+{
+    namespace particlePusherHigueraCary
+    {
+        /** Implementation of the Higuera-Cary pusher as presented in doi:10.1063/1.4979989.
+         *
+         * A correction is applied to the given formulas as documented by the WarpX team:
+         * (https://github.com/ECP-WarpX/WarpX/issues/320).
+         *
+         * Note, while Higuera and Ripperda present the formulas for the quantity u = gamma * v,
+         * PIConGPU uses the real momentum p = gamma * m * v = u * m for calculations.
+         * Here, all auxiliary quantities are equal to those used in Ripperda's article.
+         *
+         * Further references:
+         * [Higuera's article on arxiv](https://arxiv.org/abs/1701.05605)
+         * [Ripperda's comparison of relativistic particle integrators](https://doi.org/10.3847/1538-4365/aab114)
+         *
+         * @tparam Velocity functor to compute the velocity of a particle with momentum p and mass m
+         * @tparam Gamma functor to compute the Lorentz factor (= Energy/mc^2) of a particle with momentum p and mass m
+         */
+        template<typename Velocity, typename Gamma>
+        struct Push
+        {
+            /* this is an optional extension for sub-sampling pushes that enables grid to particle interpolation
+             * for particle positions outside the super cell in one push
+             */
+            using LowerMargin = typename pmacc::math::CT::make_Int<simDim, 0>::type;
+            using UpperMargin = typename pmacc::math::CT::make_Int<simDim, 0>::type;
+
+            /** Push one particle: update particle[momentum_] using the E and B fields evaluated at pos,
+             * then advance each of the simDim components of pos by vel * DELTA_T / cellSize; last parameter unused
+             */
+            template<typename T_FunctorFieldE, typename T_FunctorFieldB, typename T_Particle, typename T_Pos>
+            HDINLINE void operator()(
+                T_FunctorFieldB const functorBField,
+                T_FunctorFieldE const functorEField,
+                T_Particle& particle,
+                T_Pos& pos,
+                uint32_t const)
+            {
+                float_X const weighting = particle[weighting_];
+                float_X const mass = attribute::getMass(weighting, particle);
+                float_X const charge = attribute::getCharge(weighting, particle);
+
+                using MomType = momentum::type;
+                MomType const mom = particle[momentum_];
+
+                auto bField = functorBField(pos);
+                auto eField = functorEField(pos);
+
+                float_X const deltaT = DELTA_T;
+
+                Gamma gamma;
+
+                // Momentum update, notation is according to Ripperda's paper
+                // First half electric field acceleration (intermediate results use the sqrt_HC precision types)
+                namespace sqrt_HC = sqrt_HigueraCary;
+
+                sqrt_HC::float3_X const mom_minus
+                    = precisionCast<sqrt_HC::float_X>(mom + float_X(0.5) * charge * eField * deltaT);
+
+                // Auxiliary quantities
+                sqrt_HC::float_X const gamma_minus = gamma(mom_minus, mass);
+
+                sqrt_HC::float3_X const tau
+                    = precisionCast<sqrt_HC::float_X>(float_X(0.5) * bField * charge * deltaT / mass);
+
+                sqrt_HC::float_X const sigma = pmacc::math::abs2(gamma_minus) - pmacc::math::abs2(tau);
+
+                sqrt_HC::float_X const u_star
+                    = pmacc::math::dot(mom_minus, tau) / precisionCast<sqrt_HC::float_X>(mass * SPEED_OF_LIGHT);
+
+                sqrt_HC::float_X const gamma_plus = math::sqrt(
+                    sqrt_HC::float_X(0.5)
+                    * (sigma
+                       + math::sqrt(
+                           pmacc::math::abs2(sigma)
+                           + sqrt_HC::float_X(4.0) * (pmacc::math::abs2(tau) + pmacc::math::abs2(u_star)))));
+
+                sqrt_HC::float3_X const t_vector = tau / gamma_plus;
+
+                sqrt_HC::float_X const s
+                    = sqrt_HC::float_X(1.0) / (sqrt_HC::float_X(1.0) + pmacc::math::abs2(t_vector));
+
+                // Rotation step
+                sqrt_HC::float3_X const mom_plus = s
+                    * (mom_minus + pmacc::math::dot(mom_minus, t_vector) * t_vector
+                       + pmacc::math::cross(mom_minus, t_vector));
+
+                // Second half electric field acceleration (Note correction mom_minus -> mom_plus here compared to
+                // Ripperda)
+                MomType const mom_diff1 = float_X(0.5) * charge * eField * deltaT;
+                MomType const mom_diff2 = precisionCast<float_X>(pmacc::math::cross(mom_plus, t_vector));
+                MomType const mom_diff = mom_diff1 + mom_diff2;
+
+                MomType const new_mom = precisionCast<float_X>(mom_plus) + mom_diff;
+
+                particle[momentum_] = new_mom;
+
+                // Position update
+                Velocity velocity;
+
+                float3_X const vel = velocity(new_mom, mass);
+
+                for(uint32_t d = 0; d < simDim; ++d)
+                {
+                    pos[d] += (vel[d] * deltaT) / cellSize[d];
+                }
+            }
+
+            static pmacc::traits::StringProperty getStringProperties()
+            {
+                pmacc::traits::StringProperty propList("name", "other:Higuera-Cary");
+                return propList;
+            }
+        };
+
+    } // namespace particlePusherHigueraCary
+} // namespace picongpu
diff --git a/include/picongpu/particles/pusher/particlePusherPhoton.hpp b/include/picongpu/particles/pusher/particlePusherPhoton.hpp
index 3de76d8eae..89cec3dba3 100644
--- a/include/picongpu/particles/pusher/particlePusherPhoton.hpp
+++ b/include/picongpu/particles/pusher/particlePusherPhoton.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Heiko Burau, Rene Widera,
+/* Copyright 2013-2021 Axel Huebl, Heiko Burau, Rene Widera,
  *                     Alexander Grund, Richard Pausch
  *
  * This file is part of PIConGPU.
@@ -33,25 +33,24 @@ namespace picongpu
             /* this is an optional extension for sub-sampling pushes that enables grid to particle interpolation
              * for particle positions outside the super cell in one push
              */
-            using LowerMargin = typename pmacc::math::CT::make_Int<simDim,0>::type;
-            using UpperMargin = typename pmacc::math::CT::make_Int<simDim,0>::type;
+            using LowerMargin = typename pmacc::math::CT::make_Int<simDim, 0>::type;
+            using UpperMargin = typename pmacc::math::CT::make_Int<simDim, 0>::type;
 
-            template< typename T_FunctorFieldE, typename T_FunctorFieldB, typename T_Particle, typename T_Pos >
+            template<typename T_FunctorFieldE, typename T_FunctorFieldB, typename T_Particle, typename T_Pos>
             HDINLINE void operator()(
                 const T_FunctorFieldB functorBField,
                 const T_FunctorFieldE functorEField,
-                T_Particle & particle,
-                T_Pos & pos,
-                const uint32_t
-            )
+                T_Particle& particle,
+                T_Pos& pos,
+                const uint32_t)
             {
                 using MomType = momentum::type;
-                MomType const mom = particle[ momentum_ ];
+                MomType const mom = particle[momentum_];
 
-                const float_X mom_abs = math::abs( mom );
-                const MomType vel = mom * ( SPEED_OF_LIGHT / mom_abs );
+                const float_X mom_abs = math::abs(mom);
+                const MomType vel = mom * (SPEED_OF_LIGHT / mom_abs);
 
-                for(uint32_t d=0;d<simDim;++d)
+                for(uint32_t d = 0; d < simDim; ++d)
                 {
                     pos[d] += (vel[d] * DELTA_T) / cellSize[d];
                 }
@@ -59,10 +58,10 @@ namespace picongpu
 
             static pmacc::traits::StringProperty getStringProperties()
             {
-                pmacc::traits::StringProperty propList( "name", "other" );
+                pmacc::traits::StringProperty propList("name", "other");
                 propList["param"] = "free streaming photon pusher";
                 return propList;
             }
         };
-    } //namespace
-}
+    } // namespace particlePusherPhoton
+} // namespace picongpu
diff --git a/include/picongpu/particles/pusher/particlePusherProbe.hpp b/include/picongpu/particles/pusher/particlePusherProbe.hpp
index 5ee69747f0..bda9a19f25 100644
--- a/include/picongpu/particles/pusher/particlePusherProbe.hpp
+++ b/include/picongpu/particles/pusher/particlePusherProbe.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2017-2020 Axel Huebl
+/* Copyright 2017-2021 Axel Huebl
  *
  * This file is part of PIConGPU.
  *
@@ -25,133 +25,82 @@
 
 namespace picongpu
 {
-namespace particlePusherProbe
-{
-    /** Probe electro-magnetic fields and store the result with a particle
-     *
-     * @tparam T_ValueFunctor pmacc::nvidia::functors::*, binary functor
-     *         handling how to store the obtained field on the particle,
-     *         default is assigning a new value
-     * @tparam T_ActualPush allows to perform a real particle push after
-     *         probing the electro-magnetic field (e.g. to let a probe
-     *         particle stream with a moving window or to define a tracer
-     *         particle species that records its fields),
-     *         default is void and means no push (just a static probe)
-     */
-    template<
-        typename T_ValueFunctor = pmacc::nvidia::functors::Assign,
-        typename T_ActualPush = void
-    >
-    struct Push
+    namespace particlePusherProbe
     {
-        using ActualPush = T_ActualPush;
-
-        /* this is an optional extension for sub-sampling pushes that enables grid to particle interpolation
-         * for particle positions outside the super cell in one push
+        /** Probe electro-magnetic fields and store the result with a particle
+         *
+         * @tparam T_ValueFunctor pmacc::nvidia::functors::*, binary functor
+         *         handling how to store the obtained field on the particle,
+         *         default is assigning a new value
+         * @tparam T_ActualPush allows to perform a real particle push after
+         *         probing the electro-magnetic field (e.g. to let a probe
+         *         particle stream with a moving window or to define a tracer
+         *         particle species that records its fields),
+         *         default is void and means no push (just a static probe)
          */
-        using LowerMargin = typename ActualPush::LowerMargin;
-        using UpperMargin = typename ActualPush::UpperMargin;
-
-        template<
-            typename T_FunctorFieldE,
-            typename T_FunctorFieldB,
-            typename T_Particle,
-            typename T_Pos
-        >
-        HDINLINE void
-        operator()(
-            T_FunctorFieldB const functorBField,
-            T_FunctorFieldE const functorEField,
-            T_Particle & particle,
-            T_Pos & pos,
-            uint32_t const currentStep
-        )
+        template<typename T_ValueFunctor = pmacc::nvidia::functors::Assign, typename T_ActualPush = void>
+        struct Push
         {
-            T_ValueFunctor valueFunctor;
-            valueFunctor(
-                particle[ probeB_ ],
-                functorBField( pos )
-            );
-            valueFunctor(
-                particle[ probeE_ ],
-                functorEField( pos )
-            );
+            using ActualPush = T_ActualPush;
 
-            ActualPush actualPush;
-            actualPush(
-                functorBField,
-                functorEField,
-                particle,
-                pos,
-                currentStep
-            );
-        }
+            /* this is an optional extension for sub-sampling pushes that enables grid to particle interpolation
+             * for particle positions outside the super cell in one push
+             */
+            using LowerMargin = typename ActualPush::LowerMargin;
+            using UpperMargin = typename ActualPush::UpperMargin;
 
-        static
-        pmacc::traits::StringProperty
-        getStringProperties()
-        {
-            pmacc::traits::GetStringProperties< ActualPush > propList;
-            propList[ "param" ] = "moving probe";
-            return propList;
-        }
-    };
+            template<typename T_FunctorFieldE, typename T_FunctorFieldB, typename T_Particle, typename T_Pos>
+            HDINLINE void operator()(
+                T_FunctorFieldB const functorBField,
+                T_FunctorFieldE const functorEField,
+                T_Particle& particle,
+                T_Pos& pos,
+                uint32_t const currentStep)
+            {
+                T_ValueFunctor valueFunctor;
+                valueFunctor(particle[probeB_], functorBField(pos));
+                valueFunctor(particle[probeE_], functorEField(pos));
 
-    template< typename T_ValueFunctor >
-    struct Push<
-        T_ValueFunctor,
-        void
-    >
-    {
-        /* this is an optional extension for sub-sampling pushes that enables grid to particle interpolation
-         * for particle positions outside the super cell in one push
-         */
-        using LowerMargin = typename pmacc::math::CT::make_Int<
-            simDim,
-            0
-        >::type;
-        using UpperMargin = typename pmacc::math::CT::make_Int<
-            simDim,
-            0
-        >::type;
+                ActualPush actualPush;
+                actualPush(functorBField, functorEField, particle, pos, currentStep);
+            }
 
-        template<
-            typename T_FunctorFieldE,
-            typename T_FunctorFieldB,
-            typename T_Particle,
-            typename T_Pos
-        >
-        HDINLINE void
-        operator()(
-            T_FunctorFieldB const functorBField,
-            T_FunctorFieldE const functorEField,
-            T_Particle & particle,
-            T_Pos & pos,
-            uint32_t const
-        )
-        {
-            T_ValueFunctor valueFunctor;
-            valueFunctor(
-                particle[ probeB_ ],
-                functorBField( pos )
-            );
-            valueFunctor(
-                particle[ probeE_ ],
-                functorEField( pos )
-            );
-        }
+            static pmacc::traits::StringProperty getStringProperties()
+            {
+                pmacc::traits::GetStringProperties<ActualPush> propList;
+                propList["param"] = "moving probe";
+                return propList;
+            }
+        };
 
-        static
-        pmacc::traits::StringProperty
-        getStringProperties()
+        template<typename T_ValueFunctor>
+        struct Push<T_ValueFunctor, void>
         {
-            pmacc::traits::StringProperty propList(
-                "name",
-                "other"
-            );
-            propList[ "param" ] = "static probe";
-            return propList;
-        }
-    };
-} // namespace particlePusherProbe
+            /* this is an optional extension for sub-sampling pushes that enables grid to particle interpolation
+             * for particle positions outside the super cell in one push
+             */
+            using LowerMargin = typename pmacc::math::CT::make_Int<simDim, 0>::type;
+            using UpperMargin = typename pmacc::math::CT::make_Int<simDim, 0>::type;
+
+            template<typename T_FunctorFieldE, typename T_FunctorFieldB, typename T_Particle, typename T_Pos>
+            HDINLINE void operator()(
+                T_FunctorFieldB const functorBField,
+                T_FunctorFieldE const functorEField,
+                T_Particle& particle,
+                T_Pos& pos,
+                uint32_t const)
+            {
+                T_ValueFunctor valueFunctor;
+                valueFunctor(particle[probeB_], functorBField(pos));
+                valueFunctor(particle[probeE_], functorEField(pos));
+            }
+
+            static pmacc::traits::StringProperty getStringProperties()
+            {
+                pmacc::traits::StringProperty propList("name", "other");
+                propList["param"] = "static probe";
+                return propList;
+            }
+        };
+    } // namespace particlePusherProbe
 } // namespace picongpu
diff --git a/include/picongpu/particles/pusher/particlePusherReducedLandauLifshitz.hpp b/include/picongpu/particles/pusher/particlePusherReducedLandauLifshitz.hpp
index 8ee17d51d2..920b56869c 100644
--- a/include/picongpu/particles/pusher/particlePusherReducedLandauLifshitz.hpp
+++ b/include/picongpu/particles/pusher/particlePusherReducedLandauLifshitz.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Heiko Burau, Rene Widera, Richard Pausch
+/* Copyright 2013-2021 Axel Huebl, Heiko Burau, Rene Widera, Richard Pausch
  *
  * This file is part of PIConGPU.
  *
@@ -29,182 +29,207 @@
 
 namespace picongpu
 {
-namespace particlePusherReducedLandauLifshitz
-{
-/* This pusher uses the Lorentz force and a reduced
- * Landau Lifshitz term to push particles based on the
- * Runge Kutta solver 4th order. It takes into account
- * the energy loss due to radiation.
- *
- * More details on this approach can be found in
- * Marija Vranic's paper: Classical Radiation Reaction
- * in Particle-In-Cell Simulations
- * http://arxiv.org/abs/1502.02432
- */
-template<class Velocity, class Gamma>
-struct Push
-{
-  /* this is an optional extention for sub-sampling pushes that enables grid to particle interpolation
-   * for particle positions outside the super cell in one push
-   */
-  using LowerMargin = typename pmacc::math::CT::make_Int<simDim,1>::type;
-  using UpperMargin = typename pmacc::math::CT::make_Int<simDim,1>::type;
-
-  template<
-    typename T_FunctorFieldE,
-    typename T_FunctorFieldB,
-    typename T_Particle,
-    typename T_Pos
-  >
-  HDINLINE void operator()(
-    const T_FunctorFieldB functorBField, /* at t=0 */
-    const T_FunctorFieldE functorEField, /* at t=0 */
-    T_Particle & particle,
-    T_Pos & pos, /* at t=0 */
-    const uint32_t
-  )
-  {
-    float_X const weighting = particle[ weighting_ ];
-    float_X const mass = attribute::getMass( weighting, particle );
-    float_X const charge = attribute::getCharge( weighting, particle );
-
-    using TypeBFieldFunctor = T_FunctorFieldB;
-    using TypeEFieldFunctor = T_FunctorFieldE;
-    using TypePosition = position_pic::type;
-    using TypeMomentum = momentum::type;
-    using TypeMass = float_X;
-    using TypeCharge = float_X;
-    using TypeWeighting = weighting::type;
-
-    TypeMomentum mom = particle[ momentum_ ];
-
-    const float_X deltaT = DELTA_T;
-    const uint32_t dimMomentum = GetNComponents<TypeMomentum>::value;
-    // the transver data type adjust to 3D3V, 2D3V, 2D2V, ...
-    using VariableType = pmacc::math::Vector< picongpu::float_X, simDim + dimMomentum >;
-    VariableType var;
-
-    // transfer position
-    for(uint32_t i=0; i<picongpu::simDim; ++i)
-      var[i] = pos[i];
-
-    // transfer momentum
-    for(uint32_t i=0; i<dimMomentum; ++i)
-      var[simDim + i] = mom[i];
-
-    using DiffEqType = DiffEquation<VariableType, float_X, TypeEFieldFunctor, TypeBFieldFunctor, TypePosition, TypeMomentum, TypeMass, TypeCharge, TypeWeighting, Velocity, Gamma>;
-    DiffEqType diffEq(functorEField, functorBField, mass, charge, weighting);
-
-    VariableType varNew = pmacc::math::RungeKutta4()(diffEq, var, float_X(0.0), deltaT);
-
-    // transfer position
-    for(uint32_t i=0; i<picongpu::simDim; ++i)
-      pos[i] = varNew[i];
-
-    // transfer momentum
-    for(uint32_t i=0; i<dimMomentum; ++i)
-      mom[i] = varNew[simDim+i];
-
-    particle[ momentum_ ] = mom;
-  }
-
-  template<typename T_Var, typename T_Time,
-           typename T_FieldEFunc, typename T_FieldBFunc,
-           typename T_Pos, typename T_Mom,
-           typename T_Mass, typename T_Charge, typename T_Weighting,
-           typename T_Velocity, typename T_Gamma>
-  struct DiffEquation
-  {
-
-    // alias for types to  follow coding guide line
-    using VariableType = T_Var;
-    using TimeType = T_Time;
-    using EFieldFuncType = T_FieldEFunc;
-    using BFieldFuncType = T_FieldBFunc;
-    using PositionType = T_Pos;
-    using MomentumType = T_Mom;
-    using MassType = T_Mass;
-    using ChargeType = T_Charge;
-    using WeightingType = T_Weighting;
-    using VelocityType = T_Velocity;
-    using GammaType = T_Gamma;
-
-
-    HDINLINE DiffEquation(EFieldFuncType funcE, BFieldFuncType funcB, MassType m, ChargeType q, WeightingType w)
-      : fieldEFunc(funcE), fieldBFunc(funcB), mass(m), charge(q), weighting(w)
-    { }
-
-    HDINLINE VariableType operator()(TimeType time, VariableType var) const
+    namespace particlePusherReducedLandauLifshitz
     {
-      PositionType pos;
-      PositionType posInterpolation;
-      MomentumType mom;
-      // transfer position
-      for(uint32_t i=0; i<picongpu::simDim; ++i)
-      {
-          posInterpolation[i] = var[i];
-          pos[i] = var[i] * cellSize[i];
-      }
-
-      auto fieldE = fieldEFunc( posInterpolation,
-                                       picongpu::particles::interpolationMemoryPolicy::ShiftToValidRange() );
-      auto fieldB = fieldBFunc( posInterpolation,
-                                       picongpu::particles::interpolationMemoryPolicy::ShiftToValidRange() );
-
-      // transfer momentum
-      const uint32_t dimMomentum = GetNComponents<MomentumType>::value;
-      for(uint32_t i=0; i<dimMomentum; ++i)
-        mom[i] = var[simDim+i];
-
-      VelocityType velocityCalc;
-      GammaType gammaCalc;
-      const float_X c = SPEED_OF_LIGHT;
-      const float3_X velocity = velocityCalc(mom, mass);
-      const float_X gamma = gammaCalc(mom, mass);
-      const float_X conversionMomentum2Beta = 1.0 / (gamma * mass * c);
-
-      const float_X c2 = c*c;
-      const float_X charge2 = charge*charge;
-      const float3_X beta = velocity / c;
-
-      const float_X prefactorRR = 2./3. * charge2 * charge2 / (4.*PI*EPS0 * mass*mass * c2*c2);
-      const float3_X lorentz = fieldE + conversionMomentum2Beta * c * math::cross(mom, fieldB);
-      const float_X fieldETimesBeta = math::dot(fieldE, mom) * conversionMomentum2Beta;
-      const float3_X radReactionVec = c * (math::cross(fieldE, fieldB) +
-                                           c * conversionMomentum2Beta * math::cross(fieldB, math::cross(fieldB, mom)))
-                                      + conversionMomentum2Beta * fieldE * math::dot(mom, fieldE)
-                                      - gamma * gamma * conversionMomentum2Beta * (mom * (math::dot(lorentz, lorentz) - fieldETimesBeta*fieldETimesBeta));
-
-      const float3_X diffMom = charge * lorentz + (prefactorRR / weighting) * radReactionVec;
-      const float3_X diffPos = velocity;
-
-      VariableType returnVar;
-      for(uint32_t i=0; i<picongpu::simDim; ++i)
-        returnVar[i] = diffPos[i] / cellSize[i];
-
-      for(uint32_t i=0; i<dimMomentum; ++i)
-        returnVar[simDim+i] = diffMom[i];
-
-      return returnVar;
-    }
-
-
-  private:
-    EFieldFuncType fieldEFunc; /* functor E field interpolation */
-    BFieldFuncType fieldBFunc; /* functor B field interpolation */
-    MassType mass;             /* mass of the macro particle */
-    ChargeType charge;         /* charge of the macro particle */
-    WeightingType weighting;   /* weighting of the macro particle */
-  };
-
-  static pmacc::traits::StringProperty getStringProperties()
-  {
-      pmacc::traits::StringProperty propList( "name", "other" );
-      propList["param"] = "reduced Landau-Lifshitz pusher via RK4 and "
-                          "classical radiation reaction, Marija Vranic (2015)";
-      return propList;
-  }
-
-};
-} //namespace particlePusherReducedLandauLifshitz
-} //namespace picongpu
+        /* This pusher uses the Lorentz force and a reduced
+         * Landau-Lifshitz term to push particles with a
+         * 4th-order Runge-Kutta solver. It takes into account
+         * the energy loss due to radiation.
+         *
+         * More details on this approach can be found in
+         * Marija Vranic's paper: Classical Radiation Reaction
+         * in Particle-In-Cell Simulations
+         * http://arxiv.org/abs/1502.02432
+         */
+        template<class Velocity, class Gamma>
+        struct Push
+        {
+            /* this is an optional extension for sub-sampling pushes that enables grid to particle interpolation
+             * for particle positions outside the super cell in one push
+             */
+            using LowerMargin = typename pmacc::math::CT::make_Int<simDim, 1>::type;
+            using UpperMargin = typename pmacc::math::CT::make_Int<simDim, 1>::type;
+
+            template<typename T_FunctorFieldE, typename T_FunctorFieldB, typename T_Particle, typename T_Pos>
+            HDINLINE void operator()( // one RungeKutta4 step over DELTA_T for position and momentum
+                const T_FunctorFieldB functorBField, /* at t=0 */
+                const T_FunctorFieldE functorEField, /* at t=0 */
+                T_Particle& particle, /* weighting_ is read, momentum_ is read and overwritten */
+                T_Pos& pos, /* in: at t=0, out: value after the RK4 step */
+                const uint32_t /* unnamed, not used by this pusher */)
+            {
+                float_X const weighting = particle[weighting_];
+                float_X const mass = attribute::getMass(weighting, particle);
+                float_X const charge = attribute::getCharge(weighting, particle);
+
+                using TypeBFieldFunctor = T_FunctorFieldB;
+                using TypeEFieldFunctor = T_FunctorFieldE;
+                using TypePosition = position_pic::type;
+                using TypeMomentum = momentum::type;
+                using TypeMass = float_X;
+                using TypeCharge = float_X;
+                using TypeWeighting = weighting::type;
+
+                TypeMomentum mom = particle[momentum_];
+
+                const float_X deltaT = DELTA_T;
+                const uint32_t dimMomentum = GetNComponents<TypeMomentum>::value;
+                // the transfer data type adjusts to 3D3V, 2D3V, 2D2V, ...
+                using VariableType = pmacc::math::Vector<picongpu::float_X, simDim + dimMomentum>;
+                VariableType var;
+
+                // pack the position into the first simDim components of var
+                for(uint32_t i = 0; i < picongpu::simDim; ++i)
+                    var[i] = pos[i];
+
+                // pack the momentum into the components following the position
+                for(uint32_t i = 0; i < dimMomentum; ++i)
+                    var[simDim + i] = mom[i];
+
+                using DiffEqType = DiffEquation<
+                    VariableType,
+                    float_X,
+                    TypeEFieldFunctor,
+                    TypeBFieldFunctor,
+                    TypePosition,
+                    TypeMomentum,
+                    TypeMass,
+                    TypeCharge,
+                    TypeWeighting,
+                    Velocity,
+                    Gamma>;
+                DiffEqType diffEq(functorEField, functorBField, mass, charge, weighting); // passed to RK4
+
+                VariableType varNew = pmacc::math::RungeKutta4()(diffEq, var, float_X(0.0), deltaT);
+
+                // unpack the position from the solver result
+                for(uint32_t i = 0; i < picongpu::simDim; ++i)
+                    pos[i] = varNew[i];
+
+                // unpack the momentum from the solver result
+                for(uint32_t i = 0; i < dimMomentum; ++i)
+                    mom[i] = varNew[simDim + i];
+
+                particle[momentum_] = mom; // write the new momentum back to the particle
+            }
+
+            template<
+                typename T_Var, // state vector: simDim position components followed by the momentum components
+                typename T_Time, // type of the time argument of operator()
+                typename T_FieldEFunc, // callable used to obtain the E field at a position
+                typename T_FieldBFunc, // callable used to obtain the B field at a position
+                typename T_Pos,
+                typename T_Mom,
+                typename T_Mass,
+                typename T_Charge,
+                typename T_Weighting,
+                typename T_Velocity, // functor called as (momentum, mass), result used as velocity
+                typename T_Gamma> // functor called as (momentum, mass), result used as gamma
+            struct DiffEquation // callable f(time, var) returning the derivative that RungeKutta4 integrates
+            {
+                // aliases for the template parameters, named to follow the coding guideline
+                using VariableType = T_Var;
+                using TimeType = T_Time;
+                using EFieldFuncType = T_FieldEFunc;
+                using BFieldFuncType = T_FieldBFunc;
+                using PositionType = T_Pos;
+                using MomentumType = T_Mom;
+                using MassType = T_Mass;
+                using ChargeType = T_Charge;
+                using WeightingType = T_Weighting;
+                using VelocityType = T_Velocity;
+                using GammaType = T_Gamma;
+
+
+                HDINLINE DiffEquation( // stores copies of the field functors and the particle constants
+                    EFieldFuncType funcE,
+                    BFieldFuncType funcB,
+                    MassType m,
+                    ChargeType q,
+                    WeightingType w)
+                    : fieldEFunc(funcE)
+                    , fieldBFunc(funcB)
+                    , mass(m)
+                    , charge(q)
+                    , weighting(w)
+                {
+                }
+
+                HDINLINE VariableType operator()(TimeType time, VariableType var) const // time is not read
+                {
+                    PositionType pos; // filled below but not read afterwards
+                    PositionType posInterpolation;
+                    MomentumType mom;
+                    // unpack the position: raw value for the interpolation, cellSize-scaled copy in pos
+                    for(uint32_t i = 0; i < picongpu::simDim; ++i)
+                    {
+                        posInterpolation[i] = var[i];
+                        pos[i] = var[i] * cellSize[i];
+                    }
+
+                    auto fieldE = fieldEFunc(
+                        posInterpolation,
+                        picongpu::particles::interpolationMemoryPolicy::ShiftToValidRange());
+                    auto fieldB = fieldBFunc(
+                        posInterpolation,
+                        picongpu::particles::interpolationMemoryPolicy::ShiftToValidRange());
+
+                    // unpack the momentum from the components following the position
+                    const uint32_t dimMomentum = GetNComponents<MomentumType>::value;
+                    for(uint32_t i = 0; i < dimMomentum; ++i)
+                        mom[i] = var[simDim + i];
+
+                    VelocityType velocityCalc;
+                    GammaType gammaCalc;
+                    const float_X c = SPEED_OF_LIGHT;
+                    const float3_X velocity = velocityCalc(mom, mass);
+                    const float_X gamma = gammaCalc(mom, mass);
+                    const float_X conversionMomentum2Beta = 1.0 / (gamma * mass * c); // factor turning mom into beta
+
+                    const float_X c2 = c * c;
+                    const float_X charge2 = charge * charge;
+                    const float3_X beta = velocity / c; // not referenced below
+
+                    const float_X prefactorRR = 2. / 3. * charge2 * charge2 / (4. * PI * EPS0 * mass * mass * c2 * c2);
+                    const float3_X lorentz = fieldE + conversionMomentum2Beta * c * pmacc::math::cross(mom, fieldB);
+                    const float_X fieldETimesBeta = pmacc::math::dot(fieldE, mom) * conversionMomentum2Beta;
+                    const float3_X radReactionVec = c
+                            * (pmacc::math::cross(fieldE, fieldB)
+                               + c * conversionMomentum2Beta
+                                   * pmacc::math::cross(fieldB, pmacc::math::cross(fieldB, mom)))
+                        + conversionMomentum2Beta * fieldE * pmacc::math::dot(mom, fieldE)
+                        - gamma * gamma * conversionMomentum2Beta
+                            * (mom * (pmacc::math::dot(lorentz, lorentz) - fieldETimesBeta * fieldETimesBeta));
+
+                    const float3_X diffMom = charge * lorentz + (prefactorRR / weighting) * radReactionVec;
+                    const float3_X diffPos = velocity;
+
+                    VariableType returnVar; // same layout as var: position part first, then momentum part
+                    for(uint32_t i = 0; i < picongpu::simDim; ++i)
+                        returnVar[i] = diffPos[i] / cellSize[i]; // inverse of the cellSize scaling applied to pos
+
+                    for(uint32_t i = 0; i < dimMomentum; ++i)
+                        returnVar[simDim + i] = diffMom[i];
+
+                    return returnVar;
+                }
+
+
+            private:
+                EFieldFuncType fieldEFunc; /* functor E field interpolation */
+                BFieldFuncType fieldBFunc; /* functor B field interpolation */
+                MassType mass; /* mass of the macro particle */
+                ChargeType charge; /* charge of the macro particle */
+                WeightingType weighting; /* weighting of the macro particle */
+            };
+
+            static pmacc::traits::StringProperty getStringProperties() // describes this pusher as key/value strings
+            {
+                pmacc::traits::StringProperty propList("name", "other"); // "name" entry is set to "other"
+                propList["param"] = "reduced Landau-Lifshitz pusher via RK4 and "
+                                    "classical radiation reaction, Marija Vranic (2015)"; // one concatenated literal
+                return propList;
+            }
+        };
+    } // namespace particlePusherReducedLandauLifshitz
+} // namespace picongpu
diff --git a/include/picongpu/particles/pusher/particlePusherVay.hpp b/include/picongpu/particles/pusher/particlePusherVay.hpp
index e5c22f90bd..b262a2f844 100644
--- a/include/picongpu/particles/pusher/particlePusherVay.hpp
+++ b/include/picongpu/particles/pusher/particlePusherVay.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Heiko Burau, Rene Widera, Richard Pausch
+/* Copyright 2013-2021 Axel Huebl, Heiko Burau, Rene Widera, Richard Pausch
  *
  * This file is part of PIConGPU.
  *
@@ -26,90 +26,87 @@
 
 namespace picongpu
 {
-namespace particlePusherVay
-{
+    namespace particlePusherVay
+    {
+        template<class Velocity, class Gamma>
+        struct Push
+        {
+            /* this is an optional extension for sub-sampling pushes that enables grid to particle interpolation
+             * for particle positions outside the super cell in one push
+             */
+            using LowerMargin = typename pmacc::math::CT::make_Int<simDim, 0>::type;
+            using UpperMargin = typename pmacc::math::CT::make_Int<simDim, 0>::type;
 
-template<class Velocity, class Gamma>
-struct Push
-{
-    /* this is an optional extension for sub-sampling pushes that enables grid to particle interpolation
-     * for particle positions outside the super cell in one push
-     */
-    using LowerMargin = typename pmacc::math::CT::make_Int<simDim,0>::type;
-    using UpperMargin = typename pmacc::math::CT::make_Int<simDim,0>::type;
+            template<typename T_FunctorFieldE, typename T_FunctorFieldB, typename T_Particle, typename T_Pos>
+            HDINLINE void operator()(
+                const T_FunctorFieldB functorBField, /* at t=0 */
+                const T_FunctorFieldE functorEField, /* at t=0 */
+                T_Particle& particle,
+                T_Pos& pos, /* at t=0 */
+                const uint32_t)
+            {
+                float_X const weighting = particle[weighting_];
+                float_X const mass = attribute::getMass(weighting, particle);
+                float_X const charge = attribute::getCharge(weighting, particle);
 
-    template<
-        typename T_FunctorFieldE,
-        typename T_FunctorFieldB,
-        typename T_Particle,
-        typename T_Pos
-    >
-    HDINLINE void operator()(
-        const T_FunctorFieldB functorBField, /* at t=0 */
-        const T_FunctorFieldE functorEField, /* at t=0 */
-        T_Particle & particle,
-        T_Pos & pos, /* at t=0 */
-        const uint32_t
-    )
-    {
-        float_X const weighting = particle[ weighting_ ];
-        float_X const mass = attribute::getMass( weighting, particle );
-        float_X const charge = attribute::getCharge( weighting, particle );
+                using MomType = momentum::type;
+                MomType const mom = particle[momentum_];
 
-        using MomType = momentum::type;
-        MomType const mom = particle[ momentum_ ];
+                auto bField = functorBField(pos);
+                auto eField = functorEField(pos);
+                /*
+                     time index in paper is reduced by a half: i=0 --> i=-1/2 so that momenta are
+                     at half time steps and fields and locations are at full time steps
 
-        auto bField  = functorBField(pos);
-        auto eField  = functorEField(pos);
-        /*
-             time index in paper is reduced by a half: i=0 --> i=-1/2 so that momenta are
-             at half time steps and fields and locations are at full time steps
+             Here the real (PIConGPU) momentum (p) is used, not the momentum from the Vay paper (u)
+             p = m_0 * u
+                 */
+                const float_X deltaT = DELTA_T;
+                const float_X factor = 0.5 * charge * deltaT;
+                Gamma gamma;
+                Velocity velocity;
 
-     Here the real (PIConGPU) momentum (p) is used, not the momentum from the Vay paper (u)
-     p = m_0 * u
-         */
-        const float_X deltaT = DELTA_T;
-        const float_X factor = 0.5 * charge * deltaT;
-        Gamma gamma;
-        Velocity velocity;
+                // first step in Vay paper:
+                const float3_X velocity_atMinusHalf = velocity(mom, mass);
+                // mom /(mass*mass + abs2(mom)/(SPEED_OF_LIGHT*SPEED_OF_LIGHT));
+                const MomType momentum_atZero
+                    = mom + factor * (eField + pmacc::math::cross(velocity_atMinusHalf, bField));
 
-        // first step in Vay paper:
-        const float3_X velocity_atMinusHalf = velocity(mom, mass);
-        //mom /(mass*mass + abs2(mom)/(SPEED_OF_LIGHT*SPEED_OF_LIGHT));
-        const MomType momentum_atZero = mom + factor * (eField + math::cross(velocity_atMinusHalf, bField));
+                // second step in Vay paper:
+                const MomType momentum_prime = momentum_atZero + factor * eField;
+                const float_X gamma_prime = gamma(momentum_prime, mass);
 
-        // second step in Vay paper:
-        const MomType momentum_prime = momentum_atZero + factor * eField;
-        const float_X gamma_prime = gamma(momentum_prime, mass);
-        //algorithms::math::sqrt(1.0 + abs2(momentum_prime*(1.0/(mass * SPEED_OF_LIGHT))));
-        const sqrt_Vay::float3_X tau(factor / mass * bField);
-        const sqrt_Vay::float_X u_star = math::dot( precisionCast<sqrt_Vay::float_X>(momentum_prime), tau ) / precisionCast<sqrt_Vay::float_X>( SPEED_OF_LIGHT * mass );
-        const sqrt_Vay::float_X sigma = gamma_prime * gamma_prime - math::abs2( tau );
-        const sqrt_Vay::float_X gamma_atPlusHalf = math::sqrt( sqrt_Vay::float_X(0.5) *
-            ( sigma +
-              math::sqrt( sigma * sigma +
-                          sqrt_Vay::float_X(4.0) * ( math::abs2( tau ) + u_star * u_star ) )
-            )
-                                                    );
-        const float3_X t(tau * (float_X(1.0) / gamma_atPlusHalf));
-        const float_X s = float_X(1.0) / (float_X(1.0) + math::abs2(t));
-        const MomType momentum_atPlusHalf = s * (momentum_prime + math::dot(momentum_prime, t) * t + math::cross(momentum_prime, t));
+                const sqrt_Vay::float3_X tau(factor / mass * bField);
+                const sqrt_Vay::float_X u_star
+                    = pmacc::math::dot(precisionCast<sqrt_Vay::float_X>(momentum_prime), tau)
+                    / precisionCast<sqrt_Vay::float_X>(SPEED_OF_LIGHT * mass);
+                const sqrt_Vay::float_X sigma = gamma_prime * gamma_prime - pmacc::math::abs2(tau);
+                const sqrt_Vay::float_X gamma_atPlusHalf = math::sqrt(
+                    sqrt_Vay::float_X(0.5)
+                    * (sigma
+                       + math::sqrt(
+                           sigma * sigma + sqrt_Vay::float_X(4.0) * (pmacc::math::abs2(tau) + u_star * u_star))));
+                const float3_X t(tau * (float_X(1.0) / gamma_atPlusHalf));
+                const float_X s = float_X(1.0) / (float_X(1.0) + pmacc::math::abs2(t));
+                const MomType momentum_atPlusHalf = s
+                    * (momentum_prime + pmacc::math::dot(momentum_prime, t) * t
+                       + pmacc::math::cross(momentum_prime, t));
 
-        particle[ momentum_ ] = momentum_atPlusHalf;
+                particle[momentum_] = momentum_atPlusHalf;
 
-        const float3_X vel = velocity(momentum_atPlusHalf, mass);
+                const float3_X vel = velocity(momentum_atPlusHalf, mass);
 
-        for(uint32_t d=0;d<simDim;++d)
-        {
-            pos[d] += (vel[d] * DELTA_T) / cellSize[d];
-        }
-    }
+                for(uint32_t d = 0; d < simDim; ++d)
+                {
+                    pos[d] += (vel[d] * DELTA_T) / cellSize[d];
+                }
+            }
 
-    static pmacc::traits::StringProperty getStringProperties()
-    {
-        pmacc::traits::StringProperty propList( "name", "Vay" );
-        return propList;
-    }
-};
-} //namespace particlePusherVay
-} //namespace picongpu
+            static pmacc::traits::StringProperty getStringProperties() // describes this pusher as key/value strings
+            {
+                pmacc::traits::StringProperty propList("name", "Vay"); // only a "name" entry is set here
+                return propList;
+            }
+        };
+    } // namespace particlePusherVay
+} // namespace picongpu
diff --git a/include/picongpu/particles/shapes.hpp b/include/picongpu/particles/shapes.hpp
index 168e95ab5b..7a3aea949a 100644
--- a/include/picongpu/particles/shapes.hpp
+++ b/include/picongpu/particles/shapes.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Rene Widera
+/* Copyright 2013-2021 Axel Huebl, Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -18,7 +18,6 @@
  */
 
 
-
 #pragma once
 #include "picongpu/simulation_defines.hpp"
 
@@ -27,5 +26,5 @@
 #include "picongpu/particles/shapes/NGP.hpp"
 #include "picongpu/particles/shapes/CIC.hpp"
 #include "picongpu/particles/shapes/TSC.hpp"
+#include "picongpu/particles/shapes/PQS.hpp"
 #include "picongpu/particles/shapes/PCS.hpp"
-#include "picongpu/particles/shapes/P4S.hpp"
diff --git a/include/picongpu/particles/shapes/CIC.hpp b/include/picongpu/particles/shapes/CIC.hpp
index f9fe85bf14..a975d82582 100644
--- a/include/picongpu/particles/shapes/CIC.hpp
+++ b/include/picongpu/particles/shapes/CIC.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera, Axel Huebl
+/* Copyright 2013-2021 Heiko Burau, Rene Widera, Axel Huebl, Sergei Bastrakov
  *
  * This file is part of PIConGPU.
  *
@@ -21,72 +21,77 @@
 
 #include "picongpu/simulation_defines.hpp"
 
-namespace picongpu
-{
-namespace particles
-{
-namespace shapes
-{
-namespace shared_CIC
-{
+#include <cstdint>
 
-struct CIC
-{
-    /**
-     * width of the support of this form_factor. This is the area where the function
-     * is non-zero.
-     */
-    static constexpr int support = 2;
-};
 
-}//namespace shared_CIC
-
-struct CIC : public shared_CIC::CIC
+namespace picongpu
 {
-    using CloudShape = picongpu::particles::shapes::NGP;
-
-    struct ChargeAssignment : public shared_CIC::CIC
+    namespace particles
     {
-
-        HDINLINE float_X operator()( float_X const x )
+        namespace shapes
         {
-            /*       -
-             *       |  1-|x|           if |x|<1
-             * W(x)=<|
-             *       |  0               otherwise
-             *       -
+            namespace detail
+            {
+                struct CIC // base of the CIC assignment functors; carries only the support constant
+                {
+                    /** Support of the assignment function in cells
+                     *
+                     * Specifies width of the area where the function can be non-zero.
+                     * Is the same for all directions.
+                     */
+                    static constexpr uint32_t support = 2;
+                };
+
+            } // namespace detail
+
+            /** Cloud-in-cell particle shape
+             *
+             * Cloud density form: piecewise constant
+             * Assignment function: first order B-spline
              */
-            float_X const abs_x = algorithms::math::abs( x );
-
-            bool const below_1 = abs_x < 1.0_X;
-            float_X const onSupport = 1.0_X - abs_x;
-
-            float_X result( 0.0 );
-            if( below_1 )
-                result = onSupport;
-
-            return result;
-        }
-    };
-
-    struct ChargeAssignmentOnSupport : public shared_CIC::CIC
-    {
-
-        /** form factor of this particle shape.
-         * \param x has to be within [-support/2, support/2]
-         */
-        HDINLINE float_X operator()( float_X const x )
-        {
-            /*
-             * W(x)=1-|x|
-             */
-            return 1.0_X - algorithms::math::abs( x );
-        }
-
-    };
-
-};
-
-} // namespace shapes
-} // namespace particles
+            struct CIC
+            {
+                //! Order of the assignment function spline, derived as support - 1
+                static constexpr uint32_t assignmentFunctionOrder = detail::CIC::support - 1u;
+
+                struct ChargeAssignment : public detail::CIC // assignment function valid for any x
+                {
+                    HDINLINE float_X operator()(float_X const x)
+                    {
+                        /*       -
+                         *       |  1-|x|           if |x|<1
+                         * W(x)=<|
+                         *       |  0               otherwise
+                         *       -
+                         */
+                        float_X const abs_x = math::abs(x);
+
+                        bool const below_1 = abs_x < 1.0_X;
+                        float_X const onSupport = 1.0_X - abs_x; // value used when |x| < 1
+
+                        float_X result(0.0); // value returned when |x| >= 1
+                        if(below_1)
+                            result = onSupport;
+
+                        return result;
+                    }
+                };
+
+                struct ChargeAssignmentOnSupport : public detail::CIC // same function without the range check
+                {
+                    /** Assignment function of this particle shape, evaluated without checking x.
+                     * \param x has to be within [-support/2, support/2]
+                     */
+                    HDINLINE float_X operator()(float_X const x)
+                    {
+                        /*
+                         * W(x)=1-|x|
+                         */
+                        return 1.0_X - math::abs(x);
+                    }
+                };
+            };
+
+        } // namespace shapes
+    } // namespace particles
 } // namespace picongpu
diff --git a/include/picongpu/particles/shapes/Counter.hpp b/include/picongpu/particles/shapes/Counter.hpp
index 8b0f7a5208..cf20fc19e3 100644
--- a/include/picongpu/particles/shapes/Counter.hpp
+++ b/include/picongpu/particles/shapes/Counter.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Heiko Burau, Rene Widera
+/* Copyright 2013-2021 Axel Huebl, Heiko Burau, Rene Widera, Sergei Bastrakov
  *
  * This file is part of PIConGPU.
  *
@@ -17,70 +17,83 @@
  * If not, see <http://www.gnu.org/licenses/>.
  */
 
-
 #pragma once
 
 #include "picongpu/simulation_defines.hpp"
 
-namespace picongpu
-{
-namespace particles
-{
-namespace shapes
-{
+#include <cstdint>
 
-    namespace shared_Counter
-    {
 
-        struct Counter
-        {
-            /**
-             * width of the support of this form_factor. This is the area where the function
-             * is non-zero.
-             */
-            static constexpr int support = 0;
-        };
-
-    } // namespace shared_Counter
-
-    struct Counter : public shared_Counter::Counter
+namespace picongpu
+{
+    namespace particles
     {
-
-        struct ChargeAssignment : public shared_Counter::Counter
+        namespace shapes
         {
-
-            HDINLINE float_X operator()( float_X const x )
+            namespace detail
             {
-                /*       -
-                 *       | -1               if -1<x<=0
-                 * W(x)=<|
-                 *       |  0               otherwise
-                 *       -
-                 */
-
-                bool const in_cell = -1.0_X < x && x <= 0.0_X;
-
-                return float_X( in_cell );
-            }
-        };
-
-        struct ChargeAssignmentOnSupport : public shared_Counter::Counter
-        {
-
-            /** form factor of this particle shape.
-             * \param x has to be within [-support/2, support/2)
+                struct Counter
+                {
+                    /** Support of the assignment function in cells
+                     *
+                     * Specifies width of the area where the function can be non-zero.
+                     * Is the same for all directions.
+                     * Note that the support is actually 1, but this shape is used only for
+                     * certain operations and not as the main simulation shape, and so for
+                     * enabling more generic implementations is set to zero.
+                     */
+                    static constexpr uint32_t support = 0;
+                };
+
+            } // namespace detail
+
+            /** Version of nearest grid point particle shape used for counting particles
+             *
+             * Not to be used as a general particle shape in a simulation
+             *
+             * Cloud density form: delta function, shifted by half cell
+             * Assignment function: zero order B-spline, shifted by half cell
              */
-            HDINLINE float_X operator()( float_X const x )
+            struct Counter
             {
-                bool const in_cell = 0.0_X <= x && x < 1.0_X;
-
-                return float_X( in_cell );
-            }
-
-        };
-
-    };
-
-} // namespace shapes
-} // namespace particles
+                /** Order of the assignment function spline
+                 *
+                 * Note that here the detail::Counter::support - 1u expression would
+                 * not work, as the support of that shape is artificially set to 0
+                 */
+                static constexpr uint32_t assignmentFunctionOrder = 0u;
+
+                struct ChargeAssignment : public detail::Counter
+                {
+                    HDINLINE float_X operator()(float_X const x)
+                    {
+                        /*       -
+                         *       |  1               if -1<x<=0
+                         * W(x)=<|
+                         *       |  0               otherwise
+                         *       -
+                         */
+
+                        bool const in_cell = -1.0_X < x && x <= 0.0_X;
+
+                        return float_X(in_cell);
+                    }
+                };
+
+                struct ChargeAssignmentOnSupport : public detail::Counter
+                {
+                    /** form factor of this particle shape.
+                     * \param x has to be within [0, 1)
+                     */
+                    HDINLINE float_X operator()(float_X const x)
+                    {
+                        bool const in_cell = 0.0_X <= x && x < 1.0_X;
+
+                        return float_X(in_cell);
+                    }
+                };
+            };
+
+        } // namespace shapes
+    } // namespace particles
 } // namespace picongpu
diff --git a/include/picongpu/particles/shapes/NGP.hpp b/include/picongpu/particles/shapes/NGP.hpp
index 4a047eda21..5df407e9fd 100644
--- a/include/picongpu/particles/shapes/NGP.hpp
+++ b/include/picongpu/particles/shapes/NGP.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Heiko Burau, Rene Widera
+/* Copyright 2013-2021 Axel Huebl, Heiko Burau, Rene Widera, Sergei Bastrakov
  *
  * This file is part of PIConGPU.
  *
@@ -17,71 +17,75 @@
  * If not, see <http://www.gnu.org/licenses/>.
  */
 
-
 #pragma once
 
 #include "picongpu/simulation_defines.hpp"
 
-namespace picongpu
-{
-namespace particles
-{
-namespace shapes
-{
+#include <cstdint>
 
-    namespace shared_NGP
-    {
 
-        struct NGP
-        {
-            /**
-             * width of the support of this form_factor. This is the area where the function
-             * is non-zero.
-             */
-            static constexpr int support = 1;
-        };
-
-    } // namespace shared_NGP
-
-    struct NGP : public shared_NGP::NGP
+namespace picongpu
+{
+    namespace particles
     {
-
-        struct ChargeAssignment : public shared_NGP::NGP
+        namespace shapes
         {
-
-            HDINLINE float_X operator()( float_X const x )
+            namespace detail
             {
-                /*       -
-                 *       |  1               if -1/2<=x<1/2
-                 * W(x)=<|
-                 *       |  0               otherwise
-                 *       -
-                 */
-
-                bool const below_half = -0.5_X <= x && x < 0.5_X;
-
-                return float_X( below_half );
-            }
-        };
-
-        struct ChargeAssignmentOnSupport : public shared_NGP::NGP
-        {
-
-            /** form factor of this particle shape.
-             * \param x has to be within [-support/2, support/2)
+                struct NGP
+                {
+                    /** Support of the assignment function in cells
+                     *
+                     * Specifies width of the area where the function can be non-zero.
+                     * Is the same for all directions
+                     */
+                    static constexpr uint32_t support = 1;
+                };
+
+            } // namespace detail
+
+            /** Nearest grid point particle shape
+             *
+             * Cloud density form: delta function
+             * Assignment function: zero order B-spline
              */
-            HDINLINE float_X operator()( float_X const )
+            struct NGP
             {
-                /*
-                 * W(x)=1
-                 */
-                return 1.0_X;
-            }
-
-        };
-
-    };
-
-} // namespace shapes
-} // namespace particles
+                //! Order of the assignment function spline
+                static constexpr uint32_t assignmentFunctionOrder = detail::NGP::support - 1u;
+
+                struct ChargeAssignment : public detail::NGP
+                {
+                    HDINLINE float_X operator()(float_X const x)
+                    {
+                        /*       -
+                         *       |  1               if -1/2<=x<1/2
+                         * W(x)=<|
+                         *       |  0               otherwise
+                         *       -
+                         */
+
+                        bool const below_half = -0.5_X <= x && x < 0.5_X;
+
+                        return float_X(below_half);
+                    }
+                };
+
+                struct ChargeAssignmentOnSupport : public detail::NGP
+                {
+                    /** form factor of this particle shape.
+                     * \param x has to be within [-support/2, support/2)
+                     */
+                    HDINLINE float_X operator()(float_X const)
+                    {
+                        /*
+                         * W(x)=1
+                         */
+                        return 1.0_X;
+                    }
+                };
+            };
+
+        } // namespace shapes
+    } // namespace particles
 } // namespace picongpu
diff --git a/include/picongpu/particles/shapes/P4S.hpp b/include/picongpu/particles/shapes/P4S.hpp
deleted file mode 100644
index 772d301a1b..0000000000
--- a/include/picongpu/particles/shapes/P4S.hpp
+++ /dev/null
@@ -1,155 +0,0 @@
-/* Copyright 2015-2020 Rene Widera, Axel Huebl
- *
- * This file is part of PIConGPU.
- *
- * PIConGPU is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * PIConGPU is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with PIConGPU.
- * If not, see <http://www.gnu.org/licenses/>.
- */
-
-#pragma once
-
-#include "picongpu/simulation_defines.hpp"
-
-namespace picongpu
-{
-namespace particles
-{
-namespace shapes
-{
-
-namespace shared_P4S
-{
-
-struct P4S
-{
-    static constexpr int support = 5;
-
-    HDINLINE static float_X ff_1st_radius( float_X const x )
-    {
-        /*
-         * W(x)= 115/192 - 5/8 * x^2 + 1/4 * x^4
-         *     = 115/192 + x^2 * (-5/8 + 1/4 * x^2)
-         */
-        float_X const square_x = x * x;
-        return 115._X / 192._X + square_x * (
-            -5._X / 8._X +
-            1.0_X / 4.0_X * square_x
-        );
-    }
-
-    HDINLINE static float_X ff_2nd_radius( float_X const x )
-    {
-        /*
-         * W(x)= 1/96 * (55 + 20 * x - 120 * x^2 + 80 * x^3 - 16 * x^4)
-         *     = 1/96 * (55 + 4 * x * (5 - 2 * x * (15 + 2 * x * (-5 + x))))
-         */
-        return 1._X / 96._X * (
-            55._X + 4._X * x * (
-                5._X - 2._X * x * (
-                    15._X + 2._X * x * (
-                        -5._X + x
-                    )
-                )
-            )
-        );
-    }
-
-    HDINLINE static float_X ff_3rd_radius( float_X const x )
-    {
-        /*
-         * W(x)=1/384 * (5 - 2*x)^4
-         */
-        float_X const tmp = 5._X - 2._X * x;
-        float_X const square_tmp = tmp * tmp;
-        float_X const biquadratic_tmp = square_tmp * square_tmp;
-
-        return 1._X / 384._X * biquadratic_tmp;
-    }
-};
-
-} //namespace shared_P4S
-
-/** particle assignment shape `piecewise biquadratic spline`
- */
-struct P4S : public shared_P4S::P4S
-{
-    using CloudShape = picongpu::particles::shapes::PCS;
-
-    struct ChargeAssignmentOnSupport : public shared_P4S::P4S
-    {
-
-        HDINLINE float_X operator()( float_X const x )
-        {
-            /*       -
-             *       |  115/192 + x^2 * (-5/8 + 1/4 * x^2)                          if -1/2 < x < 1/2
-             * W(x)=<|
-             *       |  1/96 * (55 + 4 * x * (5 - 2 * x * (15 + 2 * x * (-5 + x)))) if 1/2 <= |x| < 3/2
-             *       |
-             *       |  1/384 * (5 - 2 * x)^4                                       if 3/2 <= |x| < 5/2
-             *       -
-             */
-            float_X const abs_x = algorithms::math::abs( x );
-
-            bool const below_2nd_radius = abs_x < 1.5_X;
-            bool const below_1st_radius = abs_x < 0.5_X;
-
-            float_X const rad1 = ff_1st_radius( abs_x );
-            float_X const rad2 = ff_2nd_radius( abs_x );
-            float_X const rad3 = ff_3rd_radius( abs_x );
-
-            float_X result = rad3;
-            if( below_1st_radius )
-                result = rad1;
-            else if( below_2nd_radius )
-                result = rad2;
-
-            return result;
-        }
-
-    };
-
-    struct ChargeAssignment : public shared_P4S::P4S
-    {
-
-        HDINLINE float_X operator()( float_X const x )
-        {
-
-            /*       -
-             *       |  115/192 + x^2 * (-5/8 + 1/4 * x^2)                          if -1/2 < x < 1/2
-             * W(x)=<|
-             *       |  1/96 * (55 + 4 * x * (5 - 2 * x * (15 + 2 * x * (-5 + x)))) if 1/2 <= |x| < 3/2
-             *       |
-             *       |  1/384 * (5 - 2*x)^4                                         if 3/2 <= |x| < 5/2
-             *       |
-             *       |  0                                                           otherwise
-             *       -
-             */
-            float_X const abs_x = algorithms::math::abs( x );
-
-            bool const below_max = abs_x < 2.5_X;
-
-            float_X const onSupport = ChargeAssignmentOnSupport()( abs_x );
-
-            float_X result( 0.0 );
-            if( below_max )
-                result = onSupport;
-
-            return result;
-        }
-    };
-};
-
-} // namespace shapes
-} //namespace particles
-} //namespace picongpu
diff --git a/include/picongpu/particles/shapes/PCS.hpp b/include/picongpu/particles/shapes/PCS.hpp
index d0dec31b94..0450c4eac0 100644
--- a/include/picongpu/particles/shapes/PCS.hpp
+++ b/include/picongpu/particles/shapes/PCS.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera, Axel Huebl
+/* Copyright 2015-2021 Rene Widera, Axel Huebl, Sergei Bastrakov, Klaus Steiniger
  *
  * This file is part of PIConGPU.
  *
@@ -21,110 +21,131 @@
 
 #include "picongpu/simulation_defines.hpp"
 
-namespace picongpu
-{
-namespace particles
-{
-namespace shapes
-{
-
-namespace shared_PCS
-{
-struct PCS
-{
-    static constexpr int support = 4;
-
+#include <cstdint>
 
 
-    HDINLINE static float_X ff_1st_radius( float_X const x )
-    {
-        /*
-         * W(x)=1/6*(4 - 6*x^2 + 3*|x|^3)
-         */
-        float_X const square_x = x * x;
-        float_X const triple_x = square_x * x;
-        return 1.0_X / 6.0_X * ( 4.0_X - 6.0_X * square_x + 3.0_X * triple_x );
-    }
-
-    HDINLINE static float_X ff_2nd_radius( float_X const x )
-    {
-        /*
-         * W(x)=1/6*(2 - |x|)^3
-         */
-        float_X const tmp = 2.0_X - x;
-        float_X const triple_tmp = tmp * tmp * tmp;
-        return 1.0_X / 6.0_X * triple_tmp;
-    }
-};
-
-} //namespace shared_PCS
-struct PCS : public shared_PCS::PCS
+namespace picongpu
 {
-    using CloudShape = picongpu::particles::shapes::TSC;
-
-    struct ChargeAssignment : public shared_PCS::PCS
-    {
-
-        HDINLINE float_X operator()( float_X const x )
-        {
-            /*       -
-             *       |  1/6*(4 - 6*x^2 + 3*|x|^3)   if 0<=|x|<1
-             * W(x)=<|  1/6*(2 - |x|)^3             if 1<=|x|<2
-             *       |  0                           otherwise
-             *       -
-             */
-            float_X const abs_x = algorithms::math::abs( x );
-
-            bool const below_1 = abs_x < 1.0_X;
-            bool const below_2 = abs_x < 2.0_X;
-
-            float_X const rad1 = ff_1st_radius( abs_x );
-            float_X const rad2 = ff_2nd_radius( abs_x );
-
-            float_X result( 0.0 );
-            if( below_1 )
-                result = rad1;
-            else if( below_2 )
-                result = rad2;
-
-            return result;
-        }
-    };
-
-    struct ChargeAssignmentOnSupport : public shared_PCS::PCS
+    namespace particles
     {
-
-        HDINLINE float_X operator()( float_X const x )
+        namespace shapes
         {
-            /*       -
-             *       |  1/6*(4 - 6*x^2 + 3*|x|^3)   if 0<=|x|<1
-             * W(x)=<|
-             *       |  1/6*(2 - |x|)^3             if 1<=|x|<2
-             *       -
+            namespace detail
+            {
+                struct PCS
+                {
+                    /** Support of the assignment function in cells
+                     *
+                     * Specifies width of the area where the function can be non-zero.
+                     * Is the same for all directions
+                     */
+                    static constexpr uint32_t support = 5;
+
+                    HDINLINE static float_X ff_1st_radius(float_X const x)
+                    {
+                        /*
+                         * W(x)= 115/192 - 5/8 * x^2 + 1/4 * x^4
+                         *     = 115/192 + x^2 * (-5/8 + 1/4 * x^2)
+                         */
+                        float_X const square_x = x * x;
+                        return 115._X / 192._X + square_x * (-5._X / 8._X + 1.0_X / 4.0_X * square_x);
+                    }
+
+                    HDINLINE static float_X ff_2nd_radius(float_X const x)
+                    {
+                        /*
+                         * W(x)= 1/96 * (55 + 20 * x - 120 * x^2 + 80 * x^3 - 16 * x^4)
+                         *     = 1/96 * (55 + 4 * x * (5 - 2 * x * (15 + 2 * x * (-5 + x))))
+                         */
+                        return 1._X / 96._X
+                            * (55._X + 4._X * x * (5._X - 2._X * x * (15._X + 2._X * x * (-5._X + x))));
+                    }
+
+                    HDINLINE static float_X ff_3rd_radius(float_X const x)
+                    {
+                        /*
+                         * W(x)=1/384 * (5 - 2*x)^4
+                         */
+                        float_X const tmp = 5._X - 2._X * x;
+                        float_X const square_tmp = tmp * tmp;
+                        float_X const biquadratic_tmp = square_tmp * square_tmp;
+
+                        return 1._X / 384._X * biquadratic_tmp;
+                    }
+                };
+
+            } // namespace detail
+
+            /** Piecewise cubic cloud particle shape
+             *
+             * Cloud density form: piecewise cubic B-spline
+             * Assignment function: piecewise quartic B-spline
              */
-            float_X const abs_x = algorithms::math::abs( x );
-
-            bool const below_1 = abs_x < 1.0_X;
-            float_X const rad1 = ff_1st_radius( abs_x );
-            float_X const rad2 = ff_2nd_radius( abs_x );
-
-            float_X result = rad2;
-            if( below_1 )
-                result = rad1;
-
-            return result;
-
-            /* Semantics:
-            if( abs_x < 1.0_X )
-                return ff_1st_radius( abs_x );
-            return ff_2nd_radius( abs_x );
-             */
-        }
-
-    };
-
-};
-
-} // namespace shapes
-} // namespace particles
+            struct PCS
+            {
+                //! Order of the assignment function spline
+                static constexpr uint32_t assignmentFunctionOrder = detail::PCS::support - 1u;
+
+                struct ChargeAssignmentOnSupport : public detail::PCS
+                {
+                    HDINLINE float_X operator()(float_X const x)
+                    {
+                        /*       -
+                         *       |  115/192 + x^2 * (-5/8 + 1/4 * x^2)                          if -1/2 < x < 1/2
+                         * W(x)=<|
+                         *       |  1/96 * (55 + 4 * x * (5 - 2 * x * (15 + 2 * x * (-5 + x)))) if 1/2 <= |x| < 3/2
+                         *       |
+                         *       |  1/384 * (5 - 2 * x)^4                                       if 3/2 <= |x| < 5/2
+                         *       -
+                         */
+                        float_X const abs_x = math::abs(x);
+
+                        bool const below_2nd_radius = abs_x < 1.5_X;
+                        bool const below_1st_radius = abs_x < 0.5_X;
+
+                        float_X const rad1 = ff_1st_radius(abs_x);
+                        float_X const rad2 = ff_2nd_radius(abs_x);
+                        float_X const rad3 = ff_3rd_radius(abs_x);
+
+                        float_X result = rad3;
+                        if(below_1st_radius)
+                            result = rad1;
+                        else if(below_2nd_radius)
+                            result = rad2;
+
+                        return result;
+                    }
+                };
+
+                struct ChargeAssignment : public detail::PCS
+                {
+                    HDINLINE float_X operator()(float_X const x)
+                    {
+                        /*       -
+                         *       |  115/192 + x^2 * (-5/8 + 1/4 * x^2)                          if -1/2 < x < 1/2
+                         * W(x)=<|
+                         *       |  1/96 * (55 + 4 * x * (5 - 2 * x * (15 + 2 * x * (-5 + x)))) if 1/2 <= |x| < 3/2
+                         *       |
+                         *       |  1/384 * (5 - 2*x)^4                                         if 3/2 <= |x| < 5/2
+                         *       |
+                         *       |  0                                                           otherwise
+                         *       -
+                         */
+                        float_X const abs_x = math::abs(x);
+
+                        bool const below_max = abs_x < 2.5_X;
+
+                        float_X const onSupport = ChargeAssignmentOnSupport()(abs_x);
+
+                        float_X result(0.0);
+                        if(below_max)
+                            result = onSupport;
+
+                        return result;
+                    }
+                };
+            };
+
+        } // namespace shapes
+    } // namespace particles
 } // namespace picongpu
diff --git a/include/picongpu/particles/shapes/PQS.hpp b/include/picongpu/particles/shapes/PQS.hpp
new file mode 100644
index 0000000000..8780e95900
--- /dev/null
+++ b/include/picongpu/particles/shapes/PQS.hpp
@@ -0,0 +1,138 @@
+/* Copyright 2013-2021 Heiko Burau, Rene Widera, Axel Huebl, Sergei Bastrakov, Klaus Steiniger
+ *
+ * This file is part of PIConGPU.
+ *
+ * PIConGPU is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * PIConGPU is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with PIConGPU.
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "picongpu/simulation_defines.hpp"
+
+#include <cstdint>
+
+
+namespace picongpu
+{
+    namespace particles
+    {
+        namespace shapes
+        {
+            namespace detail
+            {
+                struct PQS
+                {
+                    /** Support of the assignment function in cells
+                     *
+                     * Specifies width of the area where the function can be non-zero.
+                     * Is the same for all directions
+                     */
+                    static constexpr uint32_t support = 4;
+
+                    HDINLINE static float_X ff_1st_radius(float_X const x)
+                    {
+                        /*
+                         * W(x)=1/6*(4 - 6*x^2 + 3*|x|^3)
+                         */
+                        float_X const square_x = x * x;
+                        float_X const triple_x = square_x * x;
+                        return 1.0_X / 6.0_X * (4.0_X - 6.0_X * square_x + 3.0_X * triple_x);
+                    }
+
+                    HDINLINE static float_X ff_2nd_radius(float_X const x)
+                    {
+                        /*
+                         * W(x)=1/6*(2 - |x|)^3
+                         */
+                        float_X const tmp = 2.0_X - x;
+                        float_X const triple_tmp = tmp * tmp * tmp;
+                        return 1.0_X / 6.0_X * triple_tmp;
+                    }
+                };
+
+            } // namespace detail
+
+            /** Piecewise quadratic cloud particle shape
+             *
+             * Cloud density form: piecewise quadratic B-spline
+             * Assignment function: piecewise cubic B-spline
+             */
+            struct PQS
+            {
+                //! Order of the assignment function spline
+                static constexpr uint32_t assignmentFunctionOrder = detail::PQS::support - 1u;
+
+                struct ChargeAssignment : public detail::PQS
+                {
+                    HDINLINE float_X operator()(float_X const x)
+                    {
+                        /*       -
+                         *       |  1/6*(4 - 6*x^2 + 3*|x|^3)   if 0<=|x|<1
+                         * W(x)=<|  1/6*(2 - |x|)^3             if 1<=|x|<2
+                         *       |  0                           otherwise
+                         *       -
+                         */
+                        float_X const abs_x = math::abs(x);
+
+                        bool const below_1 = abs_x < 1.0_X;
+                        bool const below_2 = abs_x < 2.0_X;
+
+                        float_X const rad1 = ff_1st_radius(abs_x);
+                        float_X const rad2 = ff_2nd_radius(abs_x);
+
+                        float_X result(0.0);
+                        if(below_1)
+                            result = rad1;
+                        else if(below_2)
+                            result = rad2;
+
+                        return result;
+                    }
+                };
+
+                struct ChargeAssignmentOnSupport : public detail::PQS
+                {
+                    HDINLINE float_X operator()(float_X const x)
+                    {
+                        /*       -
+                         *       |  1/6*(4 - 6*x^2 + 3*|x|^3)   if 0<=|x|<1
+                         * W(x)=<|
+                         *       |  1/6*(2 - |x|)^3             if 1<=|x|<2
+                         *       -
+                         */
+                        float_X const abs_x = math::abs(x);
+
+                        bool const below_1 = abs_x < 1.0_X;
+                        float_X const rad1 = ff_1st_radius(abs_x);
+                        float_X const rad2 = ff_2nd_radius(abs_x);
+
+                        float_X result = rad2;
+                        if(below_1)
+                            result = rad1;
+
+                        return result;
+
+                        /* Semantics:
+                        if( abs_x < 1.0_X )
+                            return ff_1st_radius( abs_x );
+                        return ff_2nd_radius( abs_x );
+                         */
+                    }
+                };
+            };
+
+        } // namespace shapes
+    } // namespace particles
+} // namespace picongpu
diff --git a/include/picongpu/particles/shapes/TSC.hpp b/include/picongpu/particles/shapes/TSC.hpp
index cfd6e1d83b..3067713744 100644
--- a/include/picongpu/particles/shapes/TSC.hpp
+++ b/include/picongpu/particles/shapes/TSC.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera, Axel Huebl
+/* Copyright 2013-2021 Heiko Burau, Rene Widera, Axel Huebl, Sergei Bastrakov
  *
  * This file is part of PIConGPU.
  *
@@ -22,113 +22,115 @@
 
 #include "picongpu/simulation_defines.hpp"
 
-namespace picongpu
-{
-namespace particles
-{
-namespace shapes
-{
-
-namespace shared_TSC
-{
-
-struct TSC
-{
-    /**
-     * width of the support of this form_factor. This is the area where the function
-     * is non-zero.
-     */
-    static constexpr int support = 3;
+#include <cstdint>
 
 
-    HDINLINE static float_X ff_1st_radius( float_X const x )
-    {
-        /*
-         * W(x)=3/4 - x^2
-         */
-        float_X const square_x = x * x;
-        return 0.75_X - square_x;
-    }
-
-    HDINLINE static float_X ff_2nd_radius( float_X const x )
-    {
-        /*
-         * W(x)=1/2*(3/2 - |x|)^2
-         */
-        float_X const tmp = 3.0_X / 2.0_X - x;
-        float_X const square_tmp = tmp * tmp;
-        return 0.5_X * square_tmp;
-    }
-};
-
-} //namespace shared_TSC
-
-struct TSC : public shared_TSC::TSC
+namespace picongpu
 {
-    using CloudShape = picongpu::particles::shapes::CIC;
-
-    struct ChargeAssignment : public shared_TSC::TSC
-    {
-
-        HDINLINE float_X operator()( float_X const x )
-        {
-            /*       -
-             *       |  3/4 - x^2                  if |x|<1/2
-             * W(x)=<|  1/2*(3/2 - |x|)^2          if 1/2<=|x|<3/2
-             *       |  0                          otherwise
-             *       -
-             */
-            float_X const abs_x = algorithms::math::abs( x );
-
-            bool const below_05 = abs_x < 0.5_X;
-            bool const below_1_5 = abs_x < 1.5_X;
-
-            float_X const rad1 = ff_1st_radius( abs_x );
-            float_X const rad2 = ff_2nd_radius( abs_x );
-
-            float_X result( 0.0 );
-            if( below_05 )
-                result = rad1;
-            else if( below_1_5 )
-                result = rad2;
-
-            return result;
-
-        }
-    };
-
-    struct ChargeAssignmentOnSupport : public shared_TSC::TSC
+    namespace particles
     {
-
-        /** form factor of this particle shape.
-         * \param x has to be within [-support/2, support/2]
-         */
-        HDINLINE float_X operator()( float_X const x )
+        namespace shapes
         {
-            /*       -
-             *       |  3/4 - x^2                  if |x|<1/2
-             * W(x)=<|
-             *       |  1/2*(3/2 - |x|)^2          if 1/2<=|x|<3/2
-             *       -
+            namespace detail
+            {
+                struct TSC
+                {
+                    /** Support of the assignment function in cells
+                     *
+                     * Specifies width of the area where the function can be non-zero.
+                     * Is the same for all directions
+                     */
+                    static constexpr uint32_t support = 3;
+
+                    HDINLINE static float_X ff_1st_radius(float_X const x)
+                    {
+                        /*
+                         * W(x)=3/4 - x^2
+                         */
+                        float_X const square_x = x * x;
+                        return 0.75_X - square_x;
+                    }
+
+                    HDINLINE static float_X ff_2nd_radius(float_X const x)
+                    {
+                        /*
+                         * W(x)=1/2*(3/2 - |x|)^2
+                         */
+                        float_X const tmp = 3.0_X / 2.0_X - x;
+                        float_X const square_tmp = tmp * tmp;
+                        return 0.5_X * square_tmp;
+                    }
+                };
+
+            } // namespace detail
+
+            /** Triangle-shaped cloud particle shape
+             *
+             * Cloud density form: piecewise linear
+             * Assignment function: second order B-spline
              */
-            float_X const abs_x = algorithms::math::abs( x );
-
-            bool const below_05 = abs_x < 0.5_X;
-
-            float_X const rad1 = ff_1st_radius( abs_x );
-            float_X const rad2 = ff_2nd_radius( abs_x );
-
-            float_X result = rad2;
-            if( below_05 )
-                result = rad1;
-
-            return result;
-        }
-
-    };
-
-};
-
-} // namespace shapes
-} // namespace partciles
+            struct TSC
+            {
+                //! Order of the assignment function spline
+                static constexpr uint32_t assignmentFunctionOrder = detail::TSC::support - 1u;
+
+                struct ChargeAssignment : public detail::TSC
+                {
+                    HDINLINE float_X operator()(float_X const x)
+                    {
+                        /*       -
+                         *       |  3/4 - x^2                  if |x|<1/2
+                         * W(x)=<|  1/2*(3/2 - |x|)^2          if 1/2<=|x|<3/2
+                         *       |  0                          otherwise
+                         *       -
+                         */
+                        float_X const abs_x = math::abs(x);
+
+                        bool const below_05 = abs_x < 0.5_X;
+                        bool const below_1_5 = abs_x < 1.5_X;
+
+                        float_X const rad1 = ff_1st_radius(abs_x);
+                        float_X const rad2 = ff_2nd_radius(abs_x);
+
+                        float_X result(0.0);
+                        if(below_05)
+                            result = rad1;
+                        else if(below_1_5)
+                            result = rad2;
+
+                        return result;
+                    }
+                };
+
+                struct ChargeAssignmentOnSupport : public detail::TSC
+                {
+                    /** form factor of this particle shape.
+                     * \param x has to be within [-support/2, support/2]
+                     */
+                    HDINLINE float_X operator()(float_X const x)
+                    {
+                        /*       -
+                         *       |  3/4 - x^2                  if |x|<1/2
+                         * W(x)=<|
+                         *       |  1/2*(3/2 - |x|)^2          if 1/2<=|x|<3/2
+                         *       -
+                         */
+                        float_X const abs_x = math::abs(x);
+
+                        bool const below_05 = abs_x < 0.5_X;
+
+                        float_X const rad1 = ff_1st_radius(abs_x);
+                        float_X const rad2 = ff_2nd_radius(abs_x);
+
+                        float_X result = rad2;
+                        if(below_05)
+                            result = rad1;
+
+                        return result;
+                    }
+                };
+            };
+
+        } // namespace shapes
+    } // namespace particles
 } // namespace picongpu
diff --git a/include/picongpu/particles/startPosition/OnePositionImpl.def b/include/picongpu/particles/startPosition/OnePositionImpl.def
index 5f692dc62c..18bba3f4e3 100644
--- a/include/picongpu/particles/startPosition/OnePositionImpl.def
+++ b/include/picongpu/particles/startPosition/OnePositionImpl.def
@@ -1,4 +1,4 @@
-/* Copyright 2016-2020 Axel Huebl, Rene Widera
+/* Copyright 2016-2021 Axel Huebl, Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -27,34 +27,33 @@
 
 namespace picongpu
 {
-namespace particles
-{
-namespace startPosition
-{
-namespace acc
-{
-
-    /** set the particle attribute position
-     *
-     * This functor also sets the macro particle weighting.
-     */
-    template< typename T_ParamClass >
-    struct OnePositionImpl;
-
-} // namespace acc
-
-
-    /** Set the in cell position
-     *
-     * All macro particles are set to the same in cell position defined in
-     * T_ParamClass.
-     *
-     * @tparam T_ParamClass Parameter class with off `InCellOffset` defined as
-     *                      CONST_VECTOR of 3 float_X [0.0, 1.0).
-     */
-    template< typename T_ParamClass >
-    using OnePositionImpl = generic::Free< acc::OnePositionImpl< T_ParamClass > >;
-
-} // namespace startPosition
-} // namespace particles
+    namespace particles
+    {
+        namespace startPosition
+        {
+            namespace acc
+            {
+                /** set the particle attribute position
+                 *
+                 * This functor also sets the macro particle weighting.
+                 */
+                template<typename T_ParamClass>
+                struct OnePositionImpl;
+
+            } // namespace acc
+
+
+            /** Set the in-cell position
+             *
+             * All macro particles are set to the same in-cell position defined in
+             * T_ParamClass.
+             *
+             * @tparam T_ParamClass Parameter class with the offset `InCellOffset` defined as
+             *                      CONST_VECTOR of 3 float_X [0.0, 1.0).
+             */
+            template<typename T_ParamClass>
+            using OnePositionImpl = generic::Free<acc::OnePositionImpl<T_ParamClass>>;
+
+        } // namespace startPosition
+    } // namespace particles
 } // namespace picongpu
diff --git a/include/picongpu/particles/startPosition/OnePositionImpl.hpp b/include/picongpu/particles/startPosition/OnePositionImpl.hpp
index c8e2f715d9..30173cbb2a 100644
--- a/include/picongpu/particles/startPosition/OnePositionImpl.hpp
+++ b/include/picongpu/particles/startPosition/OnePositionImpl.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Heiko Burau, Rene Widera
+/* Copyright 2013-2021 Axel Huebl, Heiko Burau, Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -30,106 +30,81 @@
 
 namespace picongpu
 {
-namespace particles
-{
-namespace startPosition
-{
-namespace acc
-{
-namespace detail
-{
-    template< bool T_hasWeighting >
-    struct SetWeighting
-    {
-        template< typename T_Particle >
-        HDINLINE void
-        operator()
-        (
-            T_Particle & particle,
-            float_X const weighting
-        )
-        {
-            particle[ weighting_ ] = weighting;
-        }
-    };
-
-    template<>
-    struct SetWeighting< false >
-    {
-        template< typename T_Particle >
-        HDINLINE void
-        operator()
-        (
-            T_Particle &,
-            float_X const
-        )
-        {
-        }
-    };
-
-} // namespace detail
-
-    template< typename T_ParamClass >
-    struct OnePositionImpl
+    namespace particles
     {
-        /** set in-cell position and weighting
-         *
-         * @tparam T_Particle pmacc::Particle, particle type
-         * @tparam T_Args pmacc::Particle, arbitrary number of particles types
-         *
-         * @param particle particle to be manipulated
-         * @param ... unused particles
-         */
-        template<
-            typename T_Particle,
-            typename ... T_Args
-        >
-        HDINLINE void operator()(
-            T_Particle & particle,
-            T_Args && ...
-        )
-        {
-            particle[ position_ ] = T_ParamClass{}.inCellOffset.template shrink< simDim >( );
-
-            // set the weighting attribute if the particle species has it
-            bool const hasWeighting = pmacc::traits::HasIdentifier<
-                typename T_Particle::FrameType,
-                weighting
-            >::type::value;
-            detail::SetWeighting< hasWeighting > setWeighting;
-            setWeighting(
-                particle,
-                m_weighting
-            );
-        }
-
-        template< typename T_Particle >
-        HDINLINE uint32_t
-        numberOfMacroParticles( float_X const realParticlesPerCell )
+        namespace startPosition
         {
-            bool const hasWeighting = pmacc::traits::HasIdentifier<
-                typename T_Particle::FrameType,
-                weighting
-            >::type::value;
-
-            // note: m_weighting member might stay uninitialized!
-            uint32_t result( T_ParamClass::numParticlesPerCell );
-
-            if( hasWeighting )
-                result = startPosition::detail::WeightMacroParticles{}(
-                    realParticlesPerCell,
-                    T_ParamClass::numParticlesPerCell,
-                    m_weighting
-                );
-
-            return result;
-        }
-
-    private:
-        float_X m_weighting;
-    };
-
-} // namespace acc
-} // namespace startPosition
-} // namespace particles
+            namespace acc
+            {
+                namespace detail
+                {
+                    template<bool T_hasWeighting>
+                    struct SetWeighting // primary template: writes the weighting into the particle
+                    {
+                        template<typename T_Particle>
+                        HDINLINE void operator()(T_Particle& particle, float_X const weighting)
+                        {
+                            particle[weighting_] = weighting;
+                        }
+                    };
+
+                    template<>
+                    struct SetWeighting<false> // T_hasWeighting == false: the call does nothing
+                    {
+                        template<typename T_Particle>
+                        HDINLINE void operator()(T_Particle&, float_X const)
+                        { // intentionally empty, the particle is not touched
+                        }
+                    };
+
+                } // namespace detail
+
+                template<typename T_ParamClass>
+                struct OnePositionImpl
+                {
+                    /** set the in-cell position and, if the particle has the attribute, the weighting
+                     *
+                     * @tparam T_Particle pmacc::Particle, particle type
+                     * @tparam T_Args pmacc::Particle, arbitrary number of particle types
+                     *
+                     * @param particle particle to be manipulated
+                     * @param ... unused particles
+                     */
+                    template<typename T_Particle, typename... T_Args>
+                    HDINLINE void operator()(T_Particle& particle, T_Args&&...)
+                    {
+                        particle[position_] = T_ParamClass{}.inCellOffset.template shrink<simDim>();
+
+                        // set the weighting attribute only if the particle's frame type has it
+                        bool const hasWeighting
+                            = pmacc::traits::HasIdentifier<typename T_Particle::FrameType, weighting>::type::value;
+                        detail::SetWeighting<hasWeighting> setWeighting;
+                        setWeighting(particle, m_weighting);
+                    }
+
+                    template<typename T_Particle>
+                    HDINLINE uint32_t numberOfMacroParticles(float_X const realParticlesPerCell)
+                    {
+                        bool const hasWeighting
+                            = pmacc::traits::HasIdentifier<typename T_Particle::FrameType, weighting>::type::value;
+
+                        // note: m_weighting is only set if hasWeighting is true, else it stays uninitialized!
+                        uint32_t result(T_ParamClass::numParticlesPerCell);
+
+                        if(hasWeighting)
+                            result = startPosition::detail::WeightMacroParticles{}(
+                                realParticlesPerCell,
+                                T_ParamClass::numParticlesPerCell,
+                                m_weighting);
+
+                        return result;
+                    }
+
+                private:
+                    float_X m_weighting; // written in numberOfMacroParticles(), read in operator()
+                };
+
+            } // namespace acc
+        } // namespace startPosition
+    } // namespace particles
 } // namespace picongpu
diff --git a/include/picongpu/particles/startPosition/QuietImpl.def b/include/picongpu/particles/startPosition/QuietImpl.def
index 050f957d10..fcd835d738 100644
--- a/include/picongpu/particles/startPosition/QuietImpl.def
+++ b/include/picongpu/particles/startPosition/QuietImpl.def
@@ -1,4 +1,4 @@
-/* Copyright 2014-2020 Rene Widera
+/* Copyright 2014-2021 Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -27,32 +27,31 @@
 
 namespace picongpu
 {
-namespace particles
-{
-namespace startPosition
-{
-namespace acc
-{
-
-    /** Set the in cell position (accelerator)
-     *
-     * Set the in cell position and the weighting of the macro particle.
-     */
-    template< typename T_ParamClass >
-    struct QuietImpl;
-
-} // namespace acc
-
-
-    /** Set the in cell position
-     *
-     * The position of the particle is chosen depending of the number of
-     * macro particles within the cell so that the distance to the next particle
-     * is equal.
-     */
-    template< typename T_ParamClass >
-    using QuietImpl = generic::Free< acc::QuietImpl< T_ParamClass > >;
-
-} // namespace startPosition
-} // namespace particles
+    namespace particles
+    {
+        namespace startPosition
+        {
+            namespace acc
+            {
+                /** Set the in cell position (accelerator)
+                 *
+                 * Set the in cell position and the weighting of the macro particle.
+                 */
+                template<typename T_ParamClass>
+                struct QuietImpl;
+
+            } // namespace acc
+
+
+            /** Set the in-cell position
+             *
+             * The position of the particle is chosen depending on the number of
+             * macro particles within the cell so that the distance to the next particle
+             * is equal.
+             */
+            template<typename T_ParamClass>
+            using QuietImpl = generic::Free<acc::QuietImpl<T_ParamClass>>;
+
+        } // namespace startPosition
+    } // namespace particles
 } // namespace picongpu
diff --git a/include/picongpu/particles/startPosition/QuietImpl.hpp b/include/picongpu/particles/startPosition/QuietImpl.hpp
index 9873e34a52..715845f08f 100644
--- a/include/picongpu/particles/startPosition/QuietImpl.hpp
+++ b/include/picongpu/particles/startPosition/QuietImpl.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Heiko Burau, Rene Widera
+/* Copyright 2013-2021 Axel Huebl, Heiko Burau, Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -27,115 +27,99 @@
 
 namespace picongpu
 {
-namespace particles
-{
-namespace startPosition
-{
-namespace acc
-{
-
-    template< typename T_ParamClass >
-    struct QuietImpl
+    namespace particles
     {
-        /** set in-cell position and weighting
-         *
-         * @warning It is not allowed to call this functor as many times as
-         *          the resulting value of numberOfMacroParticles.
-         *
-         * @tparam T_Particle pmacc::Particle, particle type
-         * @tparam T_Args pmacc::Particle, arbitrary number of particles types
-         *
-         * @param particle particle to be manipulated
-         * @param ... unused particles
-         */
-        template<
-            typename T_Particle,
-            typename ... T_Args
-        >
-        HDINLINE void operator()(
-            T_Particle & particle,
-            T_Args && ...
-        )
+        namespace startPosition
         {
-            uint32_t maxNumMacroParticles = pmacc::math::CT::volume<
-                typename T_ParamClass::numParticlesPerDimension
-            >::type::value;
-
-            /* reset the particle position if the operator is called more times
-             * than allowed (m_currentMacroParticles underflow protection for)
-             */
-            if( maxNumMacroParticles <=  m_currentMacroParticles )
-                m_currentMacroParticles = maxNumMacroParticles - 1u;
-
-            // spacing between particles in each direction in the cell
-            DataSpace< simDim > const numParDirection( T_ParamClass::numParticlesPerDimension::toRT() );
-            floatD_X spacing;
-            for( uint32_t i = 0; i < simDim; ++i )
-                spacing[i] = float_X( 1.0 ) / float_X( numParDirection[ i ] );
-
-            /* coordinate in the local in-cell lattice
-             *   x = [0, numParsPerCell_X-1]
-             *   y = [0, numParsPerCell_Y-1]
-             *   z = [0, numParsPerCell_Z-1]
-             */
-            DataSpace< simDim > inCellCoordinate = DataSpaceOperations< simDim >::map(
-                numParDirection,
-                m_currentMacroParticles
-            );
-
-            particle[ position_ ] = precisionCast< float_X >( inCellCoordinate ) * spacing +
-                spacing * float_X( 0.5 );
-            particle[ weighting_ ] = m_weighting;
-
-            --m_currentMacroParticles;
-
-        }
-
-        template< typename T_Particle >
-        HDINLINE uint32_t
-        numberOfMacroParticles( float_X const realParticlesPerCell )
-        {
-            auto numParInCell = T_ParamClass::numParticlesPerDimension::toRT();
-
-            m_weighting = float_X( 0.0 );
-            uint32_t numMacroParticles = pmacc::math::CT::volume<
-                typename T_ParamClass::numParticlesPerDimension
-            >::type::value;
-
-            if( numMacroParticles > 0u )
-                m_weighting = realParticlesPerCell / float_X( numMacroParticles );
-
-            while(
-                m_weighting < MIN_WEIGHTING &&
-                numMacroParticles > 0u
-            )
+            namespace acc
             {
-                /* decrement component with greatest value*/
-                uint32_t max_component = 0u;
-                for( uint32_t i = 1; i < simDim; ++i )
+                template<typename T_ParamClass>
+                struct QuietImpl
                 {
-                    if( numParInCell[ i ] > numParInCell[ max_component ] )
-                        max_component = i;
-                }
-                numParInCell[ max_component ] -= 1u;
-
-                numMacroParticles = numParInCell.productOfComponents( );
-
-                if( numMacroParticles > 0u )
-                    m_weighting = realParticlesPerCell / float_X( numMacroParticles );
-                else
-                    m_weighting = float_X( 0.0 );
-            }
-            m_currentMacroParticles = numMacroParticles - 1u;
-            return numMacroParticles;
-        }
-    private:
-
-        float_X m_weighting;
-        uint32_t m_currentMacroParticles;
-    };
-
-} // namespace acc
-} // namespace startPosition
-} // namespace particles
+                    /** set in-cell position and weighting
+                     *
+                     * @warning It is not allowed to call this functor more often than
+                     *          the resulting value of numberOfMacroParticles.
+                     *
+                     * @tparam T_Particle pmacc::Particle, particle type
+                     * @tparam T_Args pmacc::Particle, arbitrary number of particle types
+                     *
+                     * @param particle particle to be manipulated
+                     * @param ... unused particles
+                     */
+                    template<typename T_Particle, typename... T_Args>
+                    HDINLINE void operator()(T_Particle& particle, T_Args&&...)
+                    {
+                        uint32_t maxNumMacroParticles
+                            = pmacc::math::CT::volume<typename T_ParamClass::numParticlesPerDimension>::type::value;
+
+                        /* reset the particle position if the operator is called more times
+                         * than allowed (protection against m_currentMacroParticles underflow)
+                         */
+                        if(maxNumMacroParticles <= m_currentMacroParticles)
+                            m_currentMacroParticles = maxNumMacroParticles - 1u;
+
+                        // spacing between particles per direction, from the compile-time lattice of T_ParamClass
+                        DataSpace<simDim> const numParDirection(T_ParamClass::numParticlesPerDimension::toRT());
+                        floatD_X spacing;
+                        for(uint32_t i = 0; i < simDim; ++i)
+                            spacing[i] = float_X(1.0) / float_X(numParDirection[i]);
+
+                        /* coordinate in the local in-cell lattice
+                         *   x = [0, numParsPerCell_X-1]
+                         *   y = [0, numParsPerCell_Y-1]
+                         *   z = [0, numParsPerCell_Z-1]
+                         */
+                        DataSpace<simDim> inCellCoordinate
+                            = DataSpaceOperations<simDim>::map(numParDirection, m_currentMacroParticles);
+
+                        particle[position_]
+                            = precisionCast<float_X>(inCellCoordinate) * spacing + spacing * float_X(0.5);
+                        particle[weighting_] = m_weighting;
+
+                        --m_currentMacroParticles;
+                    }
+
+                    template<typename T_Particle>
+                    HDINLINE uint32_t numberOfMacroParticles(float_X const realParticlesPerCell)
+                    {
+                        auto numParInCell = T_ParamClass::numParticlesPerDimension::toRT();
+
+                        m_weighting = float_X(0.0);
+                        uint32_t numMacroParticles
+                            = pmacc::math::CT::volume<typename T_ParamClass::numParticlesPerDimension>::type::value;
+
+                        if(numMacroParticles > 0u)
+                            m_weighting = realParticlesPerCell / float_X(numMacroParticles);
+
+                        while(m_weighting < MIN_WEIGHTING && numMacroParticles > 0u)
+                        {
+                            /* decrement the component with the greatest value (first one wins on ties) */
+                            uint32_t max_component = 0u;
+                            for(uint32_t i = 1; i < simDim; ++i)
+                            {
+                                if(numParInCell[i] > numParInCell[max_component])
+                                    max_component = i;
+                            }
+                            numParInCell[max_component] -= 1u;
+
+                            numMacroParticles = numParInCell.productOfComponents();
+
+                            if(numMacroParticles > 0u)
+                                m_weighting = realParticlesPerCell / float_X(numMacroParticles);
+                            else
+                                m_weighting = float_X(0.0);
+                        }
+                        m_currentMacroParticles = numMacroParticles - 1u; // wraps around if numMacroParticles == 0
+                        return numMacroParticles;
+                    }
+
+                private:
+                    float_X m_weighting;
+                    uint32_t m_currentMacroParticles;
+                };
+
+            } // namespace acc
+        } // namespace startPosition
+    } // namespace particles
 } // namespace picongpu
diff --git a/include/picongpu/particles/startPosition/RandomImpl.def b/include/picongpu/particles/startPosition/RandomImpl.def
index aea44713d3..48a71ba2ca 100644
--- a/include/picongpu/particles/startPosition/RandomImpl.def
+++ b/include/picongpu/particles/startPosition/RandomImpl.def
@@ -1,4 +1,4 @@
-/* Copyright 2014-2020 Rene Widera
+/* Copyright 2014-2021 Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -29,34 +29,31 @@
 
 namespace picongpu
 {
-namespace particles
-{
-namespace startPosition
-{
-namespace acc
-{
-
-    /** set the particle attribute position to in-cell random
-     *
-     * The particle attribute position is assigned with a random
-     * in-cell position.
-     * This functor also sets the macro particle weighting.
-     */
-    template< typename T_ParamClass >
-    struct RandomImpl;
-
-} // namespace acc
-
-
-    /** Set the in cell position to in-cell random
-     *
-     * The new in-cell position is uniformly distributed position between [0.0;1.0).
-     */
-    template< typename T_ParamClass >
-    using RandomImpl = generic::FreeRng<
-        acc::RandomImpl< T_ParamClass >,
-        pmacc::random::distributions::Uniform< float_X >
-    >;
-} // namespace startPosition
-} // namespace particles
+    namespace particles
+    {
+        namespace startPosition
+        {
+            namespace acc
+            {
+                /** set the particle attribute position to in-cell random
+                 *
+                 * The particle attribute position is assigned with a random
+                 * in-cell position.
+                 * This functor also sets the macro particle weighting.
+                 */
+                template<typename T_ParamClass>
+                struct RandomImpl;
+
+            } // namespace acc
+
+
+            /** Set the in-cell position to a random in-cell position
+             *
+             * The new in-cell position is uniformly distributed within [0.0;1.0).
+             */
+            template<typename T_ParamClass>
+            using RandomImpl
+                = generic::FreeRng<acc::RandomImpl<T_ParamClass>, pmacc::random::distributions::Uniform<float_X>>;
+        } // namespace startPosition
+    } // namespace particles
 } // namespace picongpu
diff --git a/include/picongpu/particles/startPosition/RandomImpl.hpp b/include/picongpu/particles/startPosition/RandomImpl.hpp
index 4727d54aee..88a0df3a66 100644
--- a/include/picongpu/particles/startPosition/RandomImpl.hpp
+++ b/include/picongpu/particles/startPosition/RandomImpl.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Heiko Burau, Rene Widera,
+/* Copyright 2013-2021 Axel Huebl, Heiko Burau, Rene Widera,
  *                     Alexander Grund
  *
  * This file is part of PIConGPU.
@@ -24,68 +24,55 @@
 #include "picongpu/particles/startPosition/generic/FreeRng.def"
 #include "picongpu/particles/startPosition/detail/WeightMacroParticles.hpp"
 
-#include <pmacc/nvidia/rng/distributions/Uniform_float.hpp>
-
 #include <boost/mpl/integral_c.hpp>
 
 
 namespace picongpu
 {
-namespace particles
-{
-namespace startPosition
-{
-namespace acc
-{
-
-    template< typename T_ParamClass >
-    struct RandomImpl
+    namespace particles
     {
-        /** set in-cell position and weighting
-         *
-         * @tparam T_Rng pmacc::nvidia::rng::RNG, type of the random number generator
-         * @tparam T_Particle pmacc::Particle, particle type
-         * @tparam T_Args pmacc::Particle, arbitrary number of particles types
-         *
-         * @param rng random number generator
-         * @param particle particle to be manipulated
-         * @param ... unused particles
-         */
-        template<
-            typename T_Rng,
-            typename T_Particle,
-            typename ... T_Args
-        >
-        HDINLINE void operator()(
-            T_Rng & rng,
-            T_Particle & particle,
-            T_Args && ...
-        )
+        namespace startPosition
         {
-            floatD_X tmpPos;
+            namespace acc
+            {
+                template<typename T_ParamClass>
+                struct RandomImpl
+                {
+                    /** set in-cell position and weighting
+                     *
+                     * @tparam T_Rng functor::misc::RngWrapper, type of the random number generator
+                     * @tparam T_Particle pmacc::Particle, particle type
+                     * @tparam T_Args pmacc::Particle, arbitrary number of particles types
+                     *
+                     * @param rng random number generator
+                     * @param particle particle to be manipulated
+                     * @param ... unused particles
+                     */
+                    template<typename T_Rng, typename T_Particle, typename... T_Args>
+                    HDINLINE void operator()(T_Rng& rng, T_Particle& particle, T_Args&&...)
+                    {
+                        floatD_X tmpPos;
 
-            for( uint32_t d = 0; d < simDim; ++d )
-                tmpPos[ d ] = rng( );
+                        for(uint32_t d = 0; d < simDim; ++d)
+                            tmpPos[d] = rng();
 
-            particle[ position_ ] = tmpPos;
-            particle[ weighting_ ] = m_weighting;
-        }
+                        particle[position_] = tmpPos;
+                        particle[weighting_] = m_weighting;
+                    }
 
-        template< typename T_Particle >
-        HDINLINE uint32_t
-        numberOfMacroParticles( float_X const realParticlesPerCell )
-        {
-            return startPosition::detail::WeightMacroParticles{}(
-                realParticlesPerCell,
-                T_ParamClass::numParticlesPerCell,
-                m_weighting
-            );
-        }
+                    template<typename T_Particle>
+                    HDINLINE uint32_t numberOfMacroParticles(float_X const realParticlesPerCell)
+                    {
+                        return startPosition::detail::WeightMacroParticles{}(
+                            realParticlesPerCell,
+                            T_ParamClass::numParticlesPerCell,
+                            m_weighting);
+                    }
 
-        float_X m_weighting;
-    };
+                    float_X m_weighting;
+                };
 
-} // namespace acc
-} // namespace startPosition
-} // namespace particles
+            } // namespace acc
+        } // namespace startPosition
+    } // namespace particles
 } // namespace picongpu
diff --git a/include/picongpu/particles/startPosition/detail/WeightMacroParticles.hpp b/include/picongpu/particles/startPosition/detail/WeightMacroParticles.hpp
index a56909e7a4..d4d42de80e 100644
--- a/include/picongpu/particles/startPosition/detail/WeightMacroParticles.hpp
+++ b/include/picongpu/particles/startPosition/detail/WeightMacroParticles.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Heiko Burau, Rene Widera,
+/* Copyright 2013-2021 Axel Huebl, Heiko Burau, Rene Widera,
  *                     Alexander Grund
  *
  * This file is part of PIConGPU.
@@ -25,60 +25,50 @@
 
 namespace picongpu
 {
-namespace particles
-{
-namespace startPosition
-{
-namespace detail
-{
-
-    /** Calculate the weighting per macro-particle in a cell
-     *
-     * Note: In the density regions where the weighting of macro particles would
-     * violate the user-specified MIN_WEIGHTING, we reduce the number of
-     * macro particles per cell to still initialize particles
-     * (see particle.param).
-     *
-     * This calculates the number of macro particles and the weighting per macro
-     * particle with respect to MIN_WEIGHTING.
-     */
-    struct WeightMacroParticles
+    namespace particles
     {
-        /** get number of and the weighting per macro particle(s)
-         *
-         * @param realParticlesPerCell number of real particles per cell
-         * @param macroParticlesPerCell maximum number of macro particles per cell
-         * @param[out] weighting weighting per macro particle
-         * @return number of macro particles per cell with respect to
-         *         MIN_WEIGHTING, range: [0;macroParticlesPerCell]
-         */
-        HDINLINE uint32_t
-        operator()(
-            float_X const realParticlesPerCell,
-            uint32_t numMacroParticles,
-            float_X & weighting
-        ) const
+        namespace startPosition
         {
-            PMACC_CASSERT_MSG(
-                __MIN_WEIGHTING_must_be_greater_than_zero,
-                MIN_WEIGHTING > float_X( 0.0 )
-            );
-            weighting = float_X( 0.0 );
-            float_X const maxParPerCell = realParticlesPerCell / MIN_WEIGHTING;
-            numMacroParticles = math::float2int_rd(
-                math::min(
-                    float_X( numMacroParticles ),
-                    maxParPerCell
-                )
-            );
-            if( numMacroParticles > 0u )
-                weighting = realParticlesPerCell / float_X( numMacroParticles );
+            namespace detail
+            {
+                /** Calculate the weighting per macro-particle in a cell
+                 *
+                 * Note: In the density regions where the weighting of macro particles would
+                 * violate the user-specified MIN_WEIGHTING, we reduce the number of
+                 * macro particles per cell to still initialize particles
+                 * (see particle.param).
+                 *
+                 * This calculates the number of macro particles and the weighting per macro
+                 * particle with respect to MIN_WEIGHTING.
+                 */
+                struct WeightMacroParticles
+                {
+                    /** get number of and the weighting per macro particle(s)
+                     *
+                     * @param realParticlesPerCell number of real particles per cell
+                     * @param numMacroParticles maximum number of macro particles per cell
+                     * @param[out] weighting weighting per macro particle
+                     * @return number of macro particles per cell with respect to
+                     *         MIN_WEIGHTING, range: [0;numMacroParticles]
+                     */
+                    HDINLINE uint32_t operator()(
+                        float_X const realParticlesPerCell,
+                        uint32_t numMacroParticles,
+                        float_X& weighting) const
+                    {
+                        PMACC_CASSERT_MSG(__MIN_WEIGHTING_must_be_greater_than_zero, MIN_WEIGHTING > float_X(0.0));
+                        weighting = float_X(0.0);
+                        float_X const maxParPerCell = realParticlesPerCell / MIN_WEIGHTING;
+                        numMacroParticles
+                            = pmacc::math::float2int_rd(math::min(float_X(numMacroParticles), maxParPerCell));
+                        if(numMacroParticles > 0u)
+                            weighting = realParticlesPerCell / float_X(numMacroParticles);
 
-            return numMacroParticles;
-        }
-    };
+                        return numMacroParticles;
+                    }
+                };
 
-} // namespace detail
-} // namespace startPosition
-} // namespace particles
+            } // namespace detail
+        } // namespace startPosition
+    } // namespace particles
 } // namespace picongpu
diff --git a/include/picongpu/particles/startPosition/functors.def b/include/picongpu/particles/startPosition/functors.def
index 9a039786e0..2a35631ccd 100644
--- a/include/picongpu/particles/startPosition/functors.def
+++ b/include/picongpu/particles/startPosition/functors.def
@@ -1,4 +1,4 @@
-/* Copyright 2014-2020 Rene Widera
+/* Copyright 2014-2021 Rene Widera
  *
  * This file is part of PIConGPU.
  *
diff --git a/include/picongpu/particles/startPosition/functors.hpp b/include/picongpu/particles/startPosition/functors.hpp
index 4f8bc265db..3a40125462 100644
--- a/include/picongpu/particles/startPosition/functors.hpp
+++ b/include/picongpu/particles/startPosition/functors.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Heiko Burau, Rene Widera
+/* Copyright 2013-2021 Axel Huebl, Heiko Burau, Rene Widera
  *
  * This file is part of PIConGPU.
  *
diff --git a/include/picongpu/particles/startPosition/generic/Free.def b/include/picongpu/particles/startPosition/generic/Free.def
index a9cd485a8d..a6cad122a2 100644
--- a/include/picongpu/particles/startPosition/generic/Free.def
+++ b/include/picongpu/particles/startPosition/generic/Free.def
@@ -1,4 +1,4 @@
-/* Copyright 2015-2020 Rene Widera
+/* Copyright 2015-2021 Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -22,23 +22,22 @@
 
 namespace picongpu
 {
-namespace particles
-{
-namespace startPosition
-{
-namespace generic
-{
-
-    /** call simple free user defined functor
-     *
-     * @tparam T_Functor user defined functor
-     *                   **optional**: can implement **one** host side constructor
-     *                   `T_Functor()` or `T_Functor(uint32_t currentTimeStep)`
-     */
-    template< typename T_Functor >
-    struct Free;
+    namespace particles
+    {
+        namespace startPosition
+        {
+            namespace generic
+            {
+                /** call simple free user defined functor
+                 *
+                 * @tparam T_Functor user defined functor
+                 *                   **optional**: can implement **one** host side constructor
+                 *                   `T_Functor()` or `T_Functor(uint32_t currentTimeStep)`
+                 */
+                template<typename T_Functor>
+                struct Free;
 
-} // namespace generic
-} // namespace startPosition
-} // namespace particles
+            } // namespace generic
+        } // namespace startPosition
+    } // namespace particles
 } // namespace picongpu
diff --git a/include/picongpu/particles/startPosition/generic/Free.hpp b/include/picongpu/particles/startPosition/generic/Free.hpp
index 57139e1db6..93f4761991 100644
--- a/include/picongpu/particles/startPosition/generic/Free.hpp
+++ b/include/picongpu/particles/startPosition/generic/Free.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Rene Widera, Axel Huebl
+/* Copyright 2013-2021 Rene Widera, Axel Huebl
  *
  * This file is part of PIConGPU.
  *
@@ -27,141 +27,115 @@
 
 namespace picongpu
 {
-namespace particles
-{
-namespace startPosition
-{
-namespace generic
-{
-namespace acc
-{
-    /** wrapper for the user functor on the accelerator
-     *
-     * @tparam T_Functor user defined functor
-     */
-    template< typename T_Functor >
-    struct Free : private T_Functor
+    namespace particles
     {
-
-        //! type of the user functor
-        using Functor = T_Functor;
-
-        //! store user functor instance
-        HDINLINE Free( Functor const & functor ) :
-            Functor( functor )
-        { }
-
-        /** execute the user functor
-         *
-         * @tparam T_Args type of the arguments passed to the user functor
-         * @tparam T_Acc alpaka accelerator type
-         *
-         * @param alpaka accelerator
-         * @param args arguments passed to the user functor
-         */
-        template<
-            typename ... T_Args,
-            typename T_Acc
-        >
-        HDINLINE
-        void operator( )(
-            T_Acc const &,
-            T_Args && ... args
-        )
+        namespace startPosition
         {
-            Functor::operator( )( args ... );
-        }
+            namespace generic
+            {
+                namespace acc
+                {
+                    /** wrapper for the user functor on the accelerator
+                     *
+                     * @tparam T_Functor user defined functor
+                     */
+                    template<typename T_Functor>
+                    struct Free : private T_Functor
+                    {
+                        //! type of the user functor
+                        using Functor = T_Functor;
 
-        template< typename T_Particle >
-        HDINLINE uint32_t
-        numberOfMacroParticles( float_X const realParticlesPerCell )
-        {
-            return Functor::template numberOfMacroParticles< T_Particle >( realParticlesPerCell );
-        }
-    };
-} // namespace acc
+                        //! store user functor instance
+                        HDINLINE Free(Functor const& functor) : Functor(functor)
+                        {
+                        }
 
-    template< typename T_Functor >
-    struct Free : protected T_Functor
-    {
+                        /** execute the user functor
+                         *
+                         * @tparam T_Args type of the arguments passed to the user functor
+                         * @tparam T_Acc alpaka accelerator type
+                         *
+                         * @param (unnamed) alpaka accelerator, not used and not passed to the user functor
+                         * @param args arguments passed to the user functor (as lvalues, without std::forward)
+                         */
+                        template<typename... T_Args, typename T_Acc>
+                        HDINLINE void operator()(T_Acc const&, T_Args&&... args)
+                        {
+                            Functor::operator()(args...);
+                        }
 
-        using Functor = T_Functor;
+                        template<typename T_Particle>
+                        HDINLINE uint32_t numberOfMacroParticles(float_X const realParticlesPerCell)
+                        {
+                            return Functor::template numberOfMacroParticles<T_Particle>(realParticlesPerCell);
+                        }
+                    };
+                } // namespace acc
 
-        template< typename T_SpeciesType >
-        struct apply
-        {
-            using type = Free;
-        };
+                template<typename T_Functor>
+                struct Free : protected T_Functor
+                {
+                    using Functor = T_Functor;
 
-        /** constructor
-         *
-         * This constructor is only compiled if the user functor has
-         * a host side constructor with one (uint32_t) argument.
-         *
-         * @tparam DeferFunctor is used to defer the functor type evaluation to enable/disable
-         *                      the constructor
-         * @param currentStep current simulation time step
-         * @param is used to enable/disable the constructor (do not pass any value to this parameter)
-         */
-        template< typename DeferFunctor = Functor >
-        HINLINE Free(
-            uint32_t currentStep,
-            typename std::enable_if<
-                std::is_constructible<
-                    DeferFunctor,
-                    uint32_t
-                >::value
-            >::type* = 0
-        ) : Functor( currentStep )
-        {
-        }
+                    template<typename T_SpeciesType>
+                    struct apply
+                    {
+                        using type = Free;
+                    };
 
-        /** constructor
-         *
-         * This constructor is only compiled if the user functor has a default constructor.
-         *
-         * @tparam DeferFunctor is used to defer the functor type evaluation to enable/disable
-         *                      the constructor
-         * @param current simulation time step
-         * @param is used to enable/disable the constructor (do not pass any value to this parameter)
-         */
-        template< typename DeferFunctor = Functor >
-        HINLINE Free(
-            uint32_t,
-            typename std::enable_if<
-                std::is_constructible< DeferFunctor >::value
-            >::type* = 0
-        ) : Functor( )
-        {
-        }
+                    /** constructor
+                     *
+                     * This constructor is only enabled (via SFINAE) if the user functor has
+                     * a host side constructor with one (uint32_t) argument.
+                     *
+                     * @tparam DeferFunctor is used to defer the functor type evaluation to enable/disable
+                     *                      the constructor
+                     * @param currentStep current simulation time step, passed to the user functor constructor
+                     * @param (unnamed) only used to enable/disable the constructor (never pass a value for it)
+                     */
+                    template<typename DeferFunctor = Functor>
+                    HINLINE Free(
+                        uint32_t currentStep,
+                        typename std::enable_if<std::is_constructible<DeferFunctor, uint32_t>::value>::type* = 0)
+                        : Functor(currentStep)
+                    {
+                    }
 
-        /** create device functor
-         *
-         * @tparam T_WorkerCfg pmacc::mappings::threads::WorkerCfg, configuration of the worker
-         * @tparam T_Acc alpaka accelerator type
-         *
-         * @param alpaka accelerator
-         * @param offset (in supercells, without any guards) to the
-         *         origin of the local domain
-         * @param configuration of the worker
-         */
-        template<
-            typename T,
-            typename T_WorkerCfg,
-            typename T_Acc
-        >
-        HDINLINE acc::Free< Functor >
-        operator()(
-            T_Acc const & acc,
-            T const &,
-            T_WorkerCfg const &
-        ) const
-        {
-            return acc::Free< Functor >( *static_cast< Functor const * >( this ) );
-        }
-    };
+                    /** constructor
+                     *
+                     * This constructor is only enabled (via SFINAE) if the user functor is default constructible.
+                     *
+                     * @tparam DeferFunctor is used to defer the functor type evaluation to enable/disable
+                     *                      the constructor
+                     * @param (unnamed) current simulation time step, ignored by this overload
+                     * @param (unnamed) only used to enable/disable the constructor (never pass a value for it)
+                     */
+                    template<typename DeferFunctor = Functor>
+                    HINLINE Free(
+                        uint32_t,
+                        typename std::enable_if<std::is_constructible<DeferFunctor>::value>::type* = 0)
+                        : Functor()
+                    {
+                    }
+
+                    /** create device functor wrapping a copy of the user functor
+                     *
+                     * @tparam T_WorkerCfg pmacc::mappings::threads::WorkerCfg, configuration of the worker
+                     * @tparam T_Acc alpaka accelerator type
+                     *
+                     * @param acc alpaka accelerator (unused)
+                     * @param (unnamed) offset (in supercells, without any guards) to the
+                     *         origin of the local domain (unused)
+                     * @param (unnamed) configuration of the worker (unused)
+                     */
+                    template<typename T, typename T_WorkerCfg, typename T_Acc>
+                    HDINLINE acc::Free<Functor> operator()(T_Acc const& acc, T const&, T_WorkerCfg const&) const
+                    {
+                        return acc::Free<Functor>(*static_cast<Functor const*>(this));
+                    }
+                };
 
-} // namespace generic
-} // namespace startPosition
-} // namespace particles
+            } // namespace generic
+        } // namespace startPosition
+    } // namespace particles
 } // namespace picongpu
diff --git a/include/picongpu/particles/startPosition/generic/FreeRng.def b/include/picongpu/particles/startPosition/generic/FreeRng.def
index c0a0572618..0f2f12793f 100644
--- a/include/picongpu/particles/startPosition/generic/FreeRng.def
+++ b/include/picongpu/particles/startPosition/generic/FreeRng.def
@@ -1,4 +1,4 @@
-/* Copyright 2015-2020 Rene Widera
+/* Copyright 2015-2021 Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -27,26 +27,22 @@
 
 namespace picongpu
 {
-namespace particles
-{
-namespace startPosition
-{
-namespace generic
-{
-
-    /** call simple free user defined functor and provide a random number generator
-     *
-     *
-     * @tparam T_Functor user defined unary functor
-     * @tparam T_Distribution pmacc::random::distributions, random number distribution
-     */
-    template<
-        typename T_Functor,
-        typename T_Distribution
-    >
-    struct FreeRng;
+    namespace particles
+    {
+        namespace startPosition
+        {
+            namespace generic
+            {
+                /** call simple free user defined functor and provide a random number generator
+                 *
+                 *
+                 * @tparam T_Functor user defined unary functor
+                 * @tparam T_Distribution pmacc::random::distributions, random number distribution
+                 */
+                template<typename T_Functor, typename T_Distribution>
+                struct FreeRng;
 
-} // namespace generic
-} // namespace startPosition
-} // namespace particles
+            } // namespace generic
+        } // namespace startPosition
+    } // namespace particles
 } // namespace picongpu
diff --git a/include/picongpu/particles/startPosition/generic/FreeRng.hpp b/include/picongpu/particles/startPosition/generic/FreeRng.hpp
index c7a37b952d..11a608c3b0 100644
--- a/include/picongpu/particles/startPosition/generic/FreeRng.hpp
+++ b/include/picongpu/particles/startPosition/generic/FreeRng.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2015-2020 Rene Widera, Alexander Grund
+/* Copyright 2015-2021 Rene Widera, Alexander Grund
  *
  * This file is part of PIConGPU.
  *
@@ -30,198 +30,137 @@
 
 namespace picongpu
 {
-namespace particles
-{
-namespace startPosition
-{
-namespace generic
-{
-namespace acc
-{
-    template<
-        typename T_Functor,
-        typename T_RngType
-    >
-    struct FreeRng : private T_Functor
+    namespace particles
     {
-
-        using Functor = T_Functor;
-        using RngType = T_RngType;
-
-        HDINLINE FreeRng(
-            Functor const & functor,
-            RngType const & rng
-        ) :
-            T_Functor( functor ), m_rng( rng )
-        {
-        }
-
-        /** call user functor
-         *
-         * The random number generator is initialized with the first call.
-         *
-         * @tparam T_Particle type of the particle to manipulate
-         * @tparam T_Args type of the arguments passed to the user functor
-         * @tparam T_Acc alpaka accelerator type
-         *
-         * @param alpaka accelerator
-         * @param particle particle which is given to the user functor
-         * @return void is used to enable the operator if the user functor except two arguments
-         */
-        template<
-            typename T_Particle,
-            typename ... T_Args,
-            typename T_Acc
-        >
-        HDINLINE
-        void operator()(
-            T_Acc const &,
-            T_Particle& particle,
-            T_Args && ... args
-        )
-        {
-            namespace nvrng = nvidia::rng;
-
-            Functor::operator()(
-                m_rng,
-                particle,
-                args ...
-            );
-        }
-
-        template< typename T_Particle >
-        HDINLINE uint32_t
-        numberOfMacroParticles( float_X const realParticlesPerCell )
+        namespace startPosition
         {
-            return Functor::template numberOfMacroParticles< T_Particle >( realParticlesPerCell );
-        }
-
-    private:
-
-        RngType m_rng;
-    };
-} // namespace acc
-
-    template<
-        typename T_Functor,
-        typename T_Distribution
-    >
-    struct FreeRng :
-        protected T_Functor,
-        private picongpu::particles::functor::misc::Rng<
-            T_Distribution
-        >
-    {
-        template< typename T_SpeciesType >
-        struct apply
-        {
-            using type = FreeRng;
-        };
-
-        using RngGenerator = picongpu::particles::functor::misc::Rng<
-            T_Distribution
-        >;
-
-        using RngType = typename RngGenerator::RandomGen;
-
-        using Functor = T_Functor;
-        using Distribution = T_Distribution;
-
-        /** constructor
-         *
-         * This constructor is only compiled if the user functor has
-         * a host side constructor with one (uint32_t) argument.
-         *
-         * @tparam DeferFunctor is used to defer the functor type evaluation to enable/disable
-         *                      the constructor
-         * @param currentStep current simulation time step
-         * @param is used to enable/disable the constructor (do not pass any value to this parameter)
-         */
-        template< typename DeferFunctor = Functor >
-        HINLINE FreeRng(
-            uint32_t currentStep,
-            typename std::enable_if<
-                std::is_constructible<
-                    DeferFunctor,
-                    uint32_t
-                >::value
-            >::type* = 0
-        ) :
-            Functor( currentStep ),
-            RngGenerator( currentStep )
-        {
-        }
-
-        /** constructor
-         *
-         * This constructor is only compiled if the user functor has a default constructor.
-         *
-         * @tparam DeferFunctor is used to defer the functor type evaluation to enable/disable
-         *                      the constructor
-         * @param currentStep simulation time step
-         * @param is used to enable/disable the constructor (do not pass any value to this parameter)
-         */
-        template< typename DeferFunctor = Functor >
-        HINLINE FreeRng(
-            uint32_t currentStep,
-            typename std::enable_if<
-                std::is_constructible< DeferFunctor >::value
-            >::type* = 0
-        ) :
-            Functor( ),
-            RngGenerator( currentStep )
-        {
-        }
-
-        /** create functor for the accelerator
-         *
-         * @tparam T_WorkerCfg pmacc::mappings::threads::WorkerCfg, configuration of the worker
-         * @tparam T_Acc alpaka accelerator type
-         *
-         * @param alpaka accelerator
-         * @param localSupercellOffset offset (in superCells, without any guards) relative
-         *                        to the origin of the local domain
-         * @param workerCfg configuration of the worker
-         */
-        template<
-            typename T_WorkerCfg,
-            typename T_Acc
-        >
-        HDINLINE auto
-        operator()(
-            T_Acc const & acc,
-            DataSpace< simDim > const & localSupercellOffset,
-            T_WorkerCfg const & workerCfg
-        ) const
-        -> acc::FreeRng<
-            Functor,
-            RngType
-        >
-        {
-            RngType const rng = ( *static_cast< RngGenerator const * >( this ) )(
-                acc,
-                localSupercellOffset,
-                workerCfg
-            );
-
-            return acc::FreeRng<
-                Functor,
-                RngType
-            >(
-                *static_cast< Functor const * >( this ),
-                rng
-            );
-        }
-
-        static
-        HINLINE std::string
-        getName( )
-        {
-            return std::string("FreeRNG");
-        }
-    };
-
-} // namespace generic
-} // namespace startPosition
-} // namespace particles
+            namespace generic
+            {
+                namespace acc
+                {
+                    template<typename T_Functor, typename T_RngType>
+                    struct FreeRng : private T_Functor
+                    {
+                        using Functor = T_Functor;
+                        using RngType = T_RngType;
+
+                        HDINLINE FreeRng(Functor const& functor, RngType const& rng) : T_Functor(functor), m_rng(rng)
+                        {
+                        }
+
+                        /** call user functor
+                         *
+                         * The stored random number generator is passed as first argument to the user functor.
+                         *
+                         * @tparam T_Particle type of the particle to manipulate
+                         * @tparam T_Args type of the arguments passed to the user functor
+                         * @tparam T_Acc alpaka accelerator type
+                         *
+                         * @param (unnamed) alpaka accelerator, not used and not passed to the user functor
+                         * @param particle particle which is given to the user functor
+                         * @param args additional arguments passed to the user functor after the particle
+                         */
+                        template<typename T_Particle, typename... T_Args, typename T_Acc>
+                        HDINLINE void operator()(T_Acc const&, T_Particle& particle, T_Args&&... args)
+                        {
+                            Functor::operator()(m_rng, particle, args...);
+                        }
+
+                        template<typename T_Particle>
+                        HDINLINE uint32_t numberOfMacroParticles(float_X const realParticlesPerCell)
+                        {
+                            return Functor::template numberOfMacroParticles<T_Particle>(realParticlesPerCell);
+                        }
+
+                    private:
+                        RngType m_rng;
+                    };
+                } // namespace acc
+
+                template<typename T_Functor, typename T_Distribution>
+                struct FreeRng
+                    : protected T_Functor
+                    , private picongpu::particles::functor::misc::Rng<T_Distribution>
+                {
+                    template<typename T_SpeciesType>
+                    struct apply
+                    {
+                        using type = FreeRng;
+                    };
+
+                    using RngGenerator = picongpu::particles::functor::misc::Rng<T_Distribution>;
+
+                    using RngType = typename RngGenerator::RandomGen;
+
+                    using Functor = T_Functor;
+                    using Distribution = T_Distribution;
+
+                    /** constructor
+                     *
+                     * This constructor is only enabled (via SFINAE) if the user functor has
+                     * a host side constructor with one (uint32_t) argument.
+                     *
+                     * @tparam DeferFunctor is used to defer the functor type evaluation to enable/disable
+                     *                      the constructor
+                     * @param currentStep current simulation time step, passed to user functor and RNG generator
+                     * @param (unnamed) only used to enable/disable the constructor (never pass a value for it)
+                     */
+                    template<typename DeferFunctor = Functor>
+                    HINLINE FreeRng(
+                        uint32_t currentStep,
+                        typename std::enable_if<std::is_constructible<DeferFunctor, uint32_t>::value>::type* = 0)
+                        : Functor(currentStep)
+                        , RngGenerator(currentStep)
+                    {
+                    }
+
+                    /** constructor
+                     *
+                     * This constructor is only enabled (via SFINAE) if the user functor is default constructible.
+                     *
+                     * @tparam DeferFunctor is used to defer the functor type evaluation to enable/disable
+                     *                      the constructor
+                     * @param currentStep simulation time step, only passed to the RNG generator
+                     * @param (unnamed) only used to enable/disable the constructor (never pass a value for it)
+                     */
+                    template<typename DeferFunctor = Functor>
+                    HINLINE FreeRng(
+                        uint32_t currentStep,
+                        typename std::enable_if<std::is_constructible<DeferFunctor>::value>::type* = 0)
+                        : Functor()
+                        , RngGenerator(currentStep)
+                    {
+                    }
+
+                    /** create functor for the accelerator
+                     *
+                     * @tparam T_WorkerCfg pmacc::mappings::threads::WorkerCfg, configuration of the worker
+                     * @tparam T_Acc alpaka accelerator type
+                     *
+                     * @param acc alpaka accelerator, passed to the RNG generator
+                     * @param localSupercellOffset offset (in superCells, without any guards) relative
+                     *                        to the origin of the local domain
+                     * @param workerCfg configuration of the worker
+                     */
+                    template<typename T_WorkerCfg, typename T_Acc>
+                    HDINLINE auto operator()(
+                        T_Acc const& acc,
+                        DataSpace<simDim> const& localSupercellOffset,
+                        T_WorkerCfg const& workerCfg) const -> acc::FreeRng<Functor, RngType>
+                    {
+                        RngType const rng
+                            = (*static_cast<RngGenerator const*>(this))(acc, localSupercellOffset, workerCfg);
+
+                        return acc::FreeRng<Functor, RngType>(*static_cast<Functor const*>(this), rng);
+                    }
+
+                    static HINLINE std::string getName()
+                    {
+                        return std::string("FreeRNG");
+                    }
+                };
+
+            } // namespace generic
+        } // namespace startPosition
+    } // namespace particles
 } // namespace picongpu
diff --git a/include/picongpu/particles/synchrotronPhotons/PhotonCreator.def b/include/picongpu/particles/synchrotronPhotons/PhotonCreator.def
index 23dc70f441..bc678c73b6 100644
--- a/include/picongpu/particles/synchrotronPhotons/PhotonCreator.def
+++ b/include/picongpu/particles/synchrotronPhotons/PhotonCreator.def
@@ -1,4 +1,4 @@
-/* Copyright 2015-2020 Heiko Burau
+/* Copyright 2015-2021 Heiko Burau
  *
  * This file is part of PIConGPU.
  *
@@ -21,27 +21,26 @@
 
 namespace picongpu
 {
-namespace particles
-{
-namespace synchrotronPhotons
-{
-
-/** Functor creating photons from electrons according to synchrotron radiation.
- *
- * The numerical model is taken from:
- *
- * Gonoskov, A., et al. "Extended particle-in-cell schemes for physics
- * in ultrastrong laser fields: Review and developments."
- * Physical Review E 92.2 (2015): 023305.
- *
- * This functor is called by the general particle creation module.
- *
- * \tparam T_ElectronSpecies
- * \tparam T_PhotonSpecies
- */
-template<typename T_ElectronSpecies, typename T_PhotonSpecies>
-struct PhotonCreator;
+    namespace particles
+    {
+        namespace synchrotronPhotons
+        {
+            /** Functor creating photons from electrons according to synchrotron radiation.
+             *
+             * The numerical model is taken from:
+             *
+             * Gonoskov, A., et al. "Extended particle-in-cell schemes for physics
+             * in ultrastrong laser fields: Review and developments."
+             * Physical Review E 92.2 (2015): 023305.
+             *
+             * This functor is called by the general particle creation module.
+             *
+             * \tparam T_ElectronSpecies
+             * \tparam T_PhotonSpecies
+             */
+            template<typename T_ElectronSpecies, typename T_PhotonSpecies>
+            struct PhotonCreator;
 
-} // namespace synchrotronPhotons
-} // namespace particles
+        } // namespace synchrotronPhotons
+    } // namespace particles
 } // namespace picongpu
diff --git a/include/picongpu/particles/synchrotronPhotons/PhotonCreator.hpp b/include/picongpu/particles/synchrotronPhotons/PhotonCreator.hpp
index aa3ada95bd..5418e4c42b 100644
--- a/include/picongpu/particles/synchrotronPhotons/PhotonCreator.hpp
+++ b/include/picongpu/particles/synchrotronPhotons/PhotonCreator.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2015-2020 Heiko Burau
+/* Copyright 2015-2021 Heiko Burau
  *
  * This file is part of PIConGPU.
  *
@@ -23,7 +23,6 @@
 
 #include "SynchrotronFunctions.hpp"
 #include "picongpu/algorithms/Gamma.hpp"
-#include <pmacc/algorithms/math/defines/sqrt.hpp>
 #include <pmacc/algorithms/math/defines/dot.hpp>
 #include <pmacc/algorithms/math/defines/cross.hpp>
 #include "picongpu/traits/frame/GetMass.hpp"
@@ -51,344 +50,303 @@
 
 namespace picongpu
 {
-namespace particles
-{
-namespace synchrotronPhotons
-{
-
-/** Functor creating photons from electrons according to synchrotron radiation.
- *
- * The numerical model is taken from:
- *
- * Gonoskov, A., et al. "Extended particle-in-cell schemes for physics
- * in ultrastrong laser fields: Review and developments."
- * Physical Review E 92.2 (2015): 023305.
- *
- * This functor is called by the general particle creation module.
- *
- * \tparam T_ElectronSpecies
- * \tparam T_PhotonSpecies
- */
-template<typename T_ElectronSpecies, typename T_PhotonSpecies>
-struct PhotonCreator
-{
-    using ElectronSpecies = T_ElectronSpecies;
-    using PhotonSpecies = T_PhotonSpecies;
-
-    using FrameType = typename ElectronSpecies::FrameType;
-
-    /* specify field to particle interpolation scheme */
-    using Field2ParticleInterpolation = typename pmacc::particles::traits::ResolveAliasFromSpecies<
-        ElectronSpecies,
-        interpolation<>
-    >::type;
-
-    /* margins around the supercell for the interpolation of the field on the cells */
-    using LowerMargin = typename GetMargin<Field2ParticleInterpolation>::LowerMargin;
-    using UpperMargin = typename GetMargin<Field2ParticleInterpolation>::UpperMargin;
-
-    /* relevant area of a block */
-    using BlockArea = SuperCellDescription<
-        typename MappingDesc::SuperCellSize,
-        LowerMargin,
-        UpperMargin
-    >;
-
-    BlockArea BlockDescription;
-
-    using TVec = MappingDesc::SuperCellSize;
-
-    using ValueType_E = FieldE::ValueType;
-    using ValueType_B = FieldB::ValueType;
-
-private:
-    /* global memory EM-field device databoxes */
-    PMACC_ALIGN(eBox, FieldE::DataBoxType);
-    PMACC_ALIGN(bBox, FieldB::DataBoxType);
-    /* shared memory EM-field device databoxes */
-    PMACC_ALIGN(cachedE, DataBox<SharedBox<ValueType_E, typename BlockArea::FullSuperCellSize,1> >);
-    PMACC_ALIGN(cachedB, DataBox<SharedBox<ValueType_B, typename BlockArea::FullSuperCellSize,0> >);
-
-    PMACC_ALIGN(curF_1, SynchrotronFunctions::SyncFuncCursor);
-    PMACC_ALIGN(curF_2, SynchrotronFunctions::SyncFuncCursor);
-
-    PMACC_ALIGN(photon_mom, float3_X);
-
-    /* random number generator */
-    using RNGFactory = pmacc::random::RNGProvider<simDim, random::Generator>;
-    using Distribution = pmacc::random::distributions::Uniform<float_X>;
-    using RandomGen = typename RNGFactory::GetRandomType<Distribution>::type;
-    RandomGen randomGen;
-
-public:
-    /* host constructor initializing member : random number generator */
-    PhotonCreator(
-        const SynchrotronFunctions::SyncFuncCursor& curF_1,
-        const SynchrotronFunctions::SyncFuncCursor& curF_2)
-            : curF_1(curF_1),
-              curF_2(curF_2),
-              photon_mom(float3_X::create(0)),
-              randomGen(RNGFactory::createRandom<Distribution>())
-    {
-        DataConnector &dc = Environment<>::get().DataConnector();
-        /* initialize pointers on host-side E-(B-)field databoxes */
-        auto fieldE = dc.get< FieldE >( FieldE::getName(), true );
-        auto fieldB = dc.get< FieldB >( FieldB::getName(), true );
-        /* initialize device-side E-(B-)field databoxes */
-        eBox = fieldE->getDeviceDataBox();
-        bBox = fieldB->getDeviceDataBox();
-    }
-
-    /** cache fields used by this functor
-     *
-     * @warning this is a collective method and calls synchronize
-     *
-     * @tparam T_Acc alpaka accelerator type
-     * @tparam T_WorkerCfg pmacc::mappings::threads::WorkerCfg, configuration of the worker
-     *
-     * @param acc alpaka accelerator
-     * @param blockCell relative offset (in cells) to the local domain plus the guarding cells
-     * @param workerCfg configuration of the worker
-     */
-    template<
-        typename T_Acc ,
-        typename T_WorkerCfg
-    >
-    DINLINE void collectiveInit(
-        const T_Acc & acc,
-        const DataSpace<simDim>& blockCell,
-        const T_WorkerCfg & workerCfg
-    )
-    {
-        /* caching of E and B fields */
-        cachedB = CachedBox::create<
-            0,
-            ValueType_B
-        >(
-            acc,
-            BlockArea()
-        );
-        cachedE = CachedBox::create<
-            1,
-            ValueType_E
-        >(
-            acc,
-            BlockArea()
-        );
-
-        /* instance of nvidia assignment operator */
-        nvidia::functors::Assign assign;
-        /* copy fields from global to shared */
-        auto fieldBBlock = bBox.shift(blockCell);
-        ThreadCollective<
-            BlockArea,
-            T_WorkerCfg::numWorkers
-        > collective( workerCfg.getWorkerIdx( ) );
-        collective(
-                  acc,
-                  assign,
-                  cachedB,
-                  fieldBBlock
-                  );
-        /* copy fields from global to shared */
-        auto fieldEBlock = eBox.shift(blockCell);
-        collective(
-                  acc,
-                  assign,
-                  cachedE,
-                  fieldEBlock
-                  );
-
-        /* wait for shared memory to be initialized */
-        __syncthreads();
-    }
-
-    /** Initialization function on device
-     *
-     * \brief Cache EM-fields on device
-     *         and initialize possible prerequisites for ionization, like e.g. random number generator.
-     *
-     * This function will be called inline on the device which must happen BEFORE threads diverge
-     * during loop execution. The reason for this is the `__syncthreads()` call which is necessary after
-     * initializing the E-/B-field shared boxes in shared memory.
-     */
-    template< typename T_Acc >
-    DINLINE void init(
-        T_Acc const & acc,
-        const DataSpace<simDim>& blockCell,
-        const int& linearThreadIdx,
-        const DataSpace<simDim>& localCellOffset
-    )
-    {
-        /* initialize random number generator with the local cell index in the simulation */
-        this->randomGen.init(localCellOffset);
-    }
-
-    /** Get the photon emission probability
-     *
-     * @param delta normalized (to the electron energy) photon energy
-     * @param chi quantum-nonlinearity parameter
-     * @param gamma electron gamma
-     */
-    DINLINE float_X emission_prob(
-        const float_X delta,
-        const float_X chi,
-        const float_X gamma) const
+    namespace particles
     {
-        // catch these special values because otherwise a NaN is returned whereas it should be a zero.
-        if(chi == float_X(0.0) || delta == float_X(0.0) || (float_X(1.0) - delta) == float_X(0.0))
-            return float_X(0.0);
-
-        const float_X mass = frame::getMass<FrameType>();
-        const float_X charge = frame::getCharge<FrameType>();
-
-        const float_X sqrtOf3 = 1.7320508075688772;
-        const float_X factor = DELTA_T * charge*charge * mass * SPEED_OF_LIGHT / (float_X(4.0) * PI * EPS0 * HBAR*HBAR) *
-                               sqrtOf3 / (float_X(2.0) * PI) * chi / gamma;
-
-        if(enableQEDTerm)
-        {
-            // quantum
-            const float_X z = float_X(2.0/3.0) * delta / ((float_X(1.0) - delta) * chi);
-
-            return factor * (float_X(1.0) - delta) / delta *
-                (this->curF_1[z] + float_X(1.5) * delta * chi * z * this->curF_2[z]);
-        }
-        else
-        {
-            // classical
-            const float_X z = float_X(2.0/3.0) * delta / chi;
-
-            return factor / delta * this->curF_1[z];
-        }
-    }
-
-    /** Get the *scaled* photon emission probability
-     *
-     * The scaling avoids an infrared divergence.
-     *
-     * @param deltaScaled scaled and normalized (to the electron energy) photon energy
-     * @param chi quantum-nonlinearity parameter
-     * @param gamma electron gamma
-     */
-    DINLINE float_X emission_prob_scaled(
-        const float_X deltaScaled,
-        const float_X chi,
-        const float_X gamma) const
-    {
-        const float_X delta = deltaScaled*deltaScaled*deltaScaled;
-        return float_X(3.0) * deltaScaled*deltaScaled * emission_prob(delta, chi, gamma);
-    }
-
-    /** Return the number of target particles to create from each source particle.
-     *
-     * Called for each frame of the source species.
-     *
-     * @param sourceFrame Frame of the source species
-     * @param localIdx Index of the source particle within frame
-     * @return number of particle to be created from each source particle
-     */
-    template< typename T_Acc >
-    DINLINE unsigned int numNewParticles(const T_Acc& acc, FrameType& sourceFrame, int localIdx)
-    {
-        using namespace pmacc::algorithms;
-
-        auto particle = sourceFrame[localIdx];
-
-        /* particle position, used for field-to-particle interpolation */
-        const floatD_X pos = particle[position_];
-        const int particleCellIdx = particle[localCellIdx_];
-        /* multi-dim coordinate of the local cell inside the super cell */
-        DataSpace<TVec::dim> localCell(DataSpaceOperations<TVec::dim>::template map<TVec > (particleCellIdx));
-        /* interpolation of E-field on the particle position */
-        const picongpu::traits::FieldPosition<fields::CellType, FieldE> fieldPosE;
-        ValueType_E fieldE = Field2ParticleInterpolation()
-            (cachedE.shift(localCell).toCursor(), pos, fieldPosE());
-        /* interpolation of B-field on the particle position */
-        const picongpu::traits::FieldPosition<fields::CellType, FieldB> fieldPosB;
-        ValueType_B fieldB = Field2ParticleInterpolation()
-            (cachedB.shift(localCell).toCursor(), pos, fieldPosB());
-
-        /* All computation below is in the single "real" particle picture.
-         * The macroparticle weighting factor is reintroduced at the end of this code block. */
-        const float3_X mom = particle[momentum_] / particle[weighting_];
-        const float_X mom2 = math::dot(mom, mom);
-        const float3_X mom_norm = mom * math::rsqrt(mom2);
-        const float_X mass = frame::getMass<FrameType>();
-
-        const float_X gamma = Gamma<>()(mom, mass);
-        const float3_X vel = mom / (gamma * mass); // low accuracy?
-
-        const float3_X lorentzForceOverCharge = fieldE + math::cross(vel, fieldB);
-        const float_X lorentzForceOverCharge2 = math::dot(lorentzForceOverCharge, lorentzForceOverCharge);
-        const float_X fieldE_long = math::dot(mom_norm, fieldE);
-
-        // effective magnetic strength (in cgs)
-        const float_X H_eff = math::sqrt(lorentzForceOverCharge2 - fieldE_long*fieldE_long);
-
-        const float_X charge = math::abs(frame::getCharge<FrameType>());
-
-        const float_X c = SPEED_OF_LIGHT;
-        // Schwinger limit, unit: V/m (in cgs)
-        const float_X E_S = mass*mass * c*c*c / (charge * HBAR);
-        // quantum-nonlinearity parameter
-        const float_X chi = gamma * H_eff / E_S;
-
-        const float_X deltaScaled = this->randomGen(acc);
-
-        const float_X x = emission_prob_scaled(deltaScaled, chi, gamma);
-
-        // raise a warning if the emission probability is too high.
-        if(picLog::log_level & picLog::CRITICAL::lvl)
-        {
-            if(x > float_X(SINGLE_EMISSION_PROB_LIMIT))
-            {
-                const float_X delta = deltaScaled*deltaScaled*deltaScaled;
-                printf("[SynchrotronPhotons] warning: emission probability is too high: p = %g, at delta = %g, chi = %g, gamma = %g\n",
-                    x, delta, chi, gamma);
-            }
-        }
-
-        if(this->randomGen(acc) < x)
+        namespace synchrotronPhotons
         {
-            const float_X delta = deltaScaled*deltaScaled*deltaScaled;
-            const float_X photonMom_abs = delta * mass*c * gamma;
-            if(photonMom_abs > SOFT_PHOTONS_CUTOFF_MOM)
+            /** Functor creating photons from electrons according to synchrotron radiation.
+             *
+             * The numerical model is taken from:
+             *
+             * Gonoskov, A., et al. "Extended particle-in-cell schemes for physics
+             * in ultrastrong laser fields: Review and developments."
+             * Physical Review E 92.2 (2015): 023305.
+             *
+             * This functor is called by the general particle creation module.
+             *
+             * \tparam T_ElectronSpecies
+             * \tparam T_PhotonSpecies
+             */
+            template<typename T_ElectronSpecies, typename T_PhotonSpecies>
+            struct PhotonCreator
             {
-                this->photon_mom = mom_norm * photonMom_abs * particle[weighting_];
-                return 1;
-            }
-        }
-
-        return 0;
-    }
-
-    /** Functor implementation: setting photon and electron properties
-     *
-     * Called once for each single particle creation.
-     *
-     * \tparam Electron type of electron which creates the photon
-     * \tparam Photon type of photon that is created
-     */
-    template<typename Electron, typename Photon, typename T_Acc>
-    DINLINE void operator()(const T_Acc& acc, Electron& electron, Photon& photon) const
-    {
-        namespace parOp = pmacc::particles::operations;
-        auto destPhoton =
-            parOp::deselect<
-                boost::mpl::vector<
-                    multiMask,
-                    momentum
-                >
-            >(photon);
-        parOp::assign( destPhoton, parOp::deselect<particleId>(electron) );
-
-        photon[multiMask_] = 1;
-        photon[momentum_] = this->photon_mom;
-        electron[momentum_] -= this->photon_mom;
-    }
-};
-
-} // namespace synchrotronPhotons
-} // namespace particles
+                using ElectronSpecies = T_ElectronSpecies;
+                using PhotonSpecies = T_PhotonSpecies;
+
+                using FrameType = typename ElectronSpecies::FrameType;
+
+                /* specify field to particle interpolation scheme */
+                using Field2ParticleInterpolation =
+                    typename pmacc::particles::traits::ResolveAliasFromSpecies<ElectronSpecies, interpolation<>>::type;
+
+                /* margins around the supercell for the interpolation of the field on the cells */
+                using LowerMargin = typename GetMargin<Field2ParticleInterpolation>::LowerMargin;
+                using UpperMargin = typename GetMargin<Field2ParticleInterpolation>::UpperMargin;
+
+                /* relevant area of a block */
+                using BlockArea = SuperCellDescription<typename MappingDesc::SuperCellSize, LowerMargin, UpperMargin>;
+
+                BlockArea BlockDescription;
+
+                using TVec = MappingDesc::SuperCellSize;
+
+                using ValueType_E = FieldE::ValueType;
+                using ValueType_B = FieldB::ValueType;
+
+            private:
+                /* global memory EM-field device databoxes */
+                PMACC_ALIGN(eBox, FieldE::DataBoxType);
+                PMACC_ALIGN(bBox, FieldB::DataBoxType);
+                /* shared memory EM-field device databoxes */
+                PMACC_ALIGN(cachedE, DataBox<SharedBox<ValueType_E, typename BlockArea::FullSuperCellSize, 1>>);
+                PMACC_ALIGN(cachedB, DataBox<SharedBox<ValueType_B, typename BlockArea::FullSuperCellSize, 0>>);
+
+                PMACC_ALIGN(curF_1, SynchrotronFunctions::SyncFuncCursor);
+                PMACC_ALIGN(curF_2, SynchrotronFunctions::SyncFuncCursor);
+
+                PMACC_ALIGN(photon_mom, float3_X);
+
+                /* random number generator */
+                using RNGFactory = pmacc::random::RNGProvider<simDim, random::Generator>;
+                using Distribution = pmacc::random::distributions::Uniform<float_X>;
+                using RandomGen = typename RNGFactory::GetRandomType<Distribution>::type;
+                RandomGen randomGen;
+
+            public:
+                /* host constructor: store the cursors, create the RNG and fetch the device E-/B-field databoxes */
+                PhotonCreator(
+                    const SynchrotronFunctions::SyncFuncCursor& curF_1,
+                    const SynchrotronFunctions::SyncFuncCursor& curF_2)
+                    : curF_1(curF_1)
+                    , curF_2(curF_2)
+                    , photon_mom(float3_X::create(0))
+                    , randomGen(RNGFactory::createRandom<Distribution>())
+                {
+                    DataConnector& dc = Environment<>::get().DataConnector();
+                    /* initialize pointers on host-side E-(B-)field databoxes */
+                    auto fieldE = dc.get<FieldE>(FieldE::getName(), true);
+                    auto fieldB = dc.get<FieldB>(FieldB::getName(), true);
+                    /* initialize device-side E-(B-)field databoxes */
+                    eBox = fieldE->getDeviceDataBox();
+                    bBox = fieldB->getDeviceDataBox();
+                }
+
+                /** cache fields used by this functor
+                 *
+                 * @warning this is a collective method and calls synchronize
+                 *
+                 * @tparam T_Acc alpaka accelerator type
+                 * @tparam T_WorkerCfg pmacc::mappings::threads::WorkerCfg, configuration of the worker
+                 *
+                 * @param acc alpaka accelerator
+                 * @param blockCell relative offset (in cells) to the local domain plus the guarding cells
+                 * @param workerCfg configuration of the worker
+                 */
+                template<typename T_Acc, typename T_WorkerCfg>
+                DINLINE void collectiveInit(
+                    const T_Acc& acc,
+                    const DataSpace<simDim>& blockCell,
+                    const T_WorkerCfg& workerCfg)
+                {
+                    /* caching of E and B fields */
+                    cachedB = CachedBox::create<0, ValueType_B>(acc, BlockArea());
+                    cachedE = CachedBox::create<1, ValueType_E>(acc, BlockArea());
+
+                    /* instance of nvidia assignment operator */
+                    nvidia::functors::Assign assign;
+                    /* copy fields from global to shared */
+                    auto fieldBBlock = bBox.shift(blockCell);
+                    ThreadCollective<BlockArea, T_WorkerCfg::numWorkers> collective(workerCfg.getWorkerIdx());
+                    collective(acc, assign, cachedB, fieldBBlock);
+                    /* copy fields from global to shared */
+                    auto fieldEBlock = eBox.shift(blockCell);
+                    collective(acc, assign, cachedE, fieldEBlock);
+
+                    /* wait for shared memory to be initialized */
+                    cupla::__syncthreads(acc);
+                }
+
+                /** Initialization function on device
+                 *
+                 * \brief Initialize the prerequisites for photon creation,
+                 *         i.e. the random number generator (with the local cell offset).
+                 *
+                 * The EM-fields are not cached here: that is done by `collectiveInit()`, which contains the
+                 * `cupla::__syncthreads( acc )` call and therefore must be executed BEFORE threads diverge
+                 * during loop execution.
+                 */
+                template<typename T_Acc>
+                DINLINE void init(
+                    T_Acc const& acc,
+                    const DataSpace<simDim>& blockCell,
+                    const int& linearThreadIdx,
+                    const DataSpace<simDim>& localCellOffset)
+                {
+                    /* initialize random number generator with the local cell index in the simulation */
+                    this->randomGen.init(localCellOffset);
+                }
+
+                /** Get the photon emission probability
+                 *
+                 * @param delta normalized (to the electron energy) photon energy
+                 * @param chi quantum-nonlinearity parameter
+                 * @param gamma electron gamma
+                 */
+                DINLINE float_X emission_prob(const float_X delta, const float_X chi, const float_X gamma) const
+                {
+                    // catch these special values because otherwise a NaN is returned whereas it should be a zero.
+                    if(chi == float_X(0.0) || delta == float_X(0.0) || (float_X(1.0) - delta) == float_X(0.0))
+                        return float_X(0.0);
+
+                    const float_X mass = frame::getMass<FrameType>();
+                    const float_X charge = frame::getCharge<FrameType>();
+
+                    const float_X sqrtOf3 = 1.7320508075688772;
+                    const float_X factor = DELTA_T * charge * charge * mass * SPEED_OF_LIGHT
+                        / (float_X(4.0) * PI * EPS0 * HBAR * HBAR) * sqrtOf3 / (float_X(2.0) * PI) * chi / gamma;
+
+                    if(enableQEDTerm)
+                    {
+                        // quantum
+                        const float_X z = float_X(2.0 / 3.0) * delta / ((float_X(1.0) - delta) * chi);
+
+                        return factor * (float_X(1.0) - delta) / delta
+                            * (this->curF_1[z] + float_X(1.5) * delta * chi * z * this->curF_2[z]);
+                    }
+                    else
+                    {
+                        // classical
+                        const float_X z = float_X(2.0 / 3.0) * delta / chi;
+
+                        return factor / delta * this->curF_1[z];
+                    }
+                }
+
+                /** Get the *scaled* photon emission probability
+                 *
+                 * The scaling avoids an infrared divergence.
+                 *
+                 * @param deltaScaled scaled and normalized (to the electron energy) photon energy
+                 * @param chi quantum-nonlinearity parameter
+                 * @param gamma electron gamma
+                 */
+                DINLINE float_X
+                emission_prob_scaled(const float_X deltaScaled, const float_X chi, const float_X gamma) const
+                {
+                    const float_X delta = deltaScaled * deltaScaled * deltaScaled;
+                    return float_X(3.0) * deltaScaled * deltaScaled * emission_prob(delta, chi, gamma);
+                }
+
+                /** Return the number of target particles to create from each source particle.
+                 *
+                 * Called for each frame of the source species.
+                 *
+                 * @param sourceFrame Frame of the source species
+                 * @param localIdx Index of the source particle within frame
+                 * @return number of particles to be created from each source particle
+                 */
+                template<typename T_Acc>
+                DINLINE unsigned int numNewParticles(const T_Acc& acc, FrameType& sourceFrame, int localIdx)
+                {
+                    using namespace pmacc::algorithms;
+
+                    auto particle = sourceFrame[localIdx];
+
+                    /* particle position, used for field-to-particle interpolation */
+                    const floatD_X pos = particle[position_];
+                    const int particleCellIdx = particle[localCellIdx_];
+                    /* multi-dim coordinate of the local cell inside the super cell */
+                    DataSpace<TVec::dim> localCell(
+                        DataSpaceOperations<TVec::dim>::template map<TVec>(particleCellIdx));
+                    /* interpolation of E-field on the particle position */
+                    const picongpu::traits::FieldPosition<fields::CellType, FieldE> fieldPosE;
+                    ValueType_E fieldE
+                        = Field2ParticleInterpolation()(cachedE.shift(localCell).toCursor(), pos, fieldPosE());
+                    /* interpolation of B-field on the particle position */
+                    const picongpu::traits::FieldPosition<fields::CellType, FieldB> fieldPosB;
+                    ValueType_B fieldB
+                        = Field2ParticleInterpolation()(cachedB.shift(localCell).toCursor(), pos, fieldPosB());
+
+                    /* All computation below is in the single "real" particle picture.
+                     * The macroparticle weighting factor is reintroduced at the end of this code block. */
+                    const float3_X mom = particle[momentum_] / particle[weighting_];
+                    const float_X mom2 = pmacc::math::dot(mom, mom);
+                    const float3_X mom_norm = mom * math::rsqrt(mom2);
+                    const float_X mass = frame::getMass<FrameType>();
+
+                    const float_X gamma = Gamma<>()(mom, mass);
+                    const float3_X vel = mom / (gamma * mass); // low accuracy?
+
+                    const float3_X lorentzForceOverCharge = fieldE + pmacc::math::cross(vel, fieldB);
+                    const float_X lorentzForceOverCharge2
+                        = pmacc::math::dot(lorentzForceOverCharge, lorentzForceOverCharge);
+                    const float_X fieldE_long = pmacc::math::dot(mom_norm, fieldE);
+
+                    // effective magnetic strength (in cgs)
+                    const float_X H_eff = math::sqrt(lorentzForceOverCharge2 - fieldE_long * fieldE_long);
+
+                    const float_X charge = math::abs(frame::getCharge<FrameType>());
+
+                    const float_X c = SPEED_OF_LIGHT;
+                    // Schwinger limit, unit: V/m (in cgs)
+                    const float_X E_S = mass * mass * c * c * c / (charge * HBAR);
+                    // quantum-nonlinearity parameter
+                    const float_X chi = gamma * H_eff / E_S;
+
+                    const float_X deltaScaled = this->randomGen(acc);
+
+                    const float_X x = emission_prob_scaled(deltaScaled, chi, gamma);
+
+                    // raise a warning if the emission probability is too high.
+                    if(picLog::log_level & picLog::CRITICAL::lvl)
+                    {
+                        if(x > float_X(SINGLE_EMISSION_PROB_LIMIT))
+                        {
+                            const float_X delta = deltaScaled * deltaScaled * deltaScaled;
+                            printf(
+                                "[SynchrotronPhotons] warning: emission probability is too high: p = %g, at delta = "
+                                "%g, chi = %g, gamma = %g\n",
+                                x,
+                                delta,
+                                chi,
+                                gamma);
+                        }
+                    }
+
+                    if(this->randomGen(acc) < x)
+                    {
+                        const float_X delta = deltaScaled * deltaScaled * deltaScaled;
+                        const float_X photonMom_abs = delta * mass * c * gamma;
+                        if(photonMom_abs > SOFT_PHOTONS_CUTOFF_MOM)
+                        {
+                            this->photon_mom = mom_norm * photonMom_abs * particle[weighting_];
+                            return 1;
+                        }
+                    }
+
+                    return 0;
+                }
+
+                /** Functor implementation: setting photon and electron properties
+                 *
+                 * Called once for each single particle creation.
+                 *
+                 * \tparam Electron type of electron which creates the photon
+                 * \tparam Photon type of photon that is created
+                 */
+                template<typename Electron, typename Photon, typename T_Acc>
+                DINLINE void operator()(const T_Acc& acc, Electron& electron, Photon& photon) const
+                {
+                    namespace parOp = pmacc::particles::operations;
+                    auto destPhoton = parOp::deselect<boost::mpl::vector<multiMask, momentum>>(photon);
+                    parOp::assign(destPhoton, parOp::deselect<particleId>(electron));
+
+                    photon[multiMask_] = 1;
+                    photon[momentum_] = this->photon_mom;
+                    electron[momentum_] -= this->photon_mom;
+                }
+            };
+
+        } // namespace synchrotronPhotons
+    } // namespace particles
 } // namespace picongpu
diff --git a/include/picongpu/particles/synchrotronPhotons/SynchrotronFunctions.hpp b/include/picongpu/particles/synchrotronPhotons/SynchrotronFunctions.hpp
index fafab8ca0b..f1aefaefce 100644
--- a/include/picongpu/particles/synchrotronPhotons/SynchrotronFunctions.hpp
+++ b/include/picongpu/particles/synchrotronPhotons/SynchrotronFunctions.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2015-2020 Heiko Burau
+/* Copyright 2015-2021 Heiko Burau
  *
  * This file is part of PIConGPU.
  *
@@ -32,97 +32,93 @@
 
 namespace picongpu
 {
-namespace particles
-{
-namespace synchrotronPhotons
-{
-
-namespace detail
-{
-
-/** Map `x` to the internal lookup table and return the result of the
- * first or the second synchrotron function for `x`.
- */
-struct MapToLookupTable
-{
-    using LinInterpCursor = typename ::pmacc::result_of::Functor<
-        ::pmacc::cursor::tools::LinearInterp<float_X>,
-        ::pmacc::cursor::BufferCursor<float_X, DIM1>
-    >::type;
-
-    using type = float_X;
-
-    LinInterpCursor linInterpCursor;
-
-    /** constructor
-     *
-     * @param linInterpCursor lookup table of the first or the second
-     * synchrotron function.
-     */
-    HDINLINE MapToLookupTable(LinInterpCursor linInterpCursor)
-        : linInterpCursor(linInterpCursor) {}
-
-    /** Returns F_1(x) or F_2(x)
-
-     * @param x position of the synchrotron function to be evaluated
-     */
-    HDINLINE float_X operator()(const float_X x) const;
-};
-
-using SyncFuncCursor = ::pmacc::cursor::Cursor<
-    MapToLookupTable,
-    ::pmacc::cursor::PlusNavigator,
-    float_X
->;
-
-} // namespace detail
-
-
-/** Lookup table for synchrotron functions.
- *
- * Provides cursors for the first and the second synchrotron function
- */
-class SynchrotronFunctions
-{
-public:
-    using SyncFuncCursor = detail::SyncFuncCursor;
-private:
-
-    using MyBuf = boost::shared_ptr<pmacc::container::DeviceBuffer<float_X, DIM1> >;
-    MyBuf dBuf_SyncFuncs[2]; // two synchrotron functions
-
-    struct BesselK
+    namespace particles
     {
-        template<typename T_State, typename T_Time>
-        void operator()(const T_State &x, T_State &dxdt, T_Time t) const
+        namespace synchrotronPhotons
         {
-            dxdt[0] = boost::math::tr1::cyl_bessel_k(5.0/3.0, t);
-        }
-    };
-
-    /** First synchrotron function
-     */
-    HINLINE float_64 F_1(const float_64 x) const;
-    /** Second synchrotron function
-     */
-    HINLINE float_64 F_2(const float_64 x) const;
-
-public:
-    enum Select
-    {
-        first=0, second=1
-    };
-
-    HINLINE void init();
-    /** Return a cursor representing a synchrotron function
-     *
-     * @param syncFunction first or second synchrotron function
-     * @see: SynchrotronFunctions::Select
-     */
-    HINLINE SyncFuncCursor getCursor(Select syncFunction) const;
-
-}; // class SynchrotronFunctions
-
-} // namespace synchrotronPhotons
-} // namespace particles
+            namespace detail
+            {
+                /** Map `x` to the internal lookup table and return the result of the
+                 * first or the second synchrotron function for `x`.
+                 */
+                struct MapToLookupTable
+                {
+                    using LinInterpCursor = typename ::pmacc::result_of::Functor<
+                        ::pmacc::cursor::tools::LinearInterp<float_X>,
+                        ::pmacc::cursor::BufferCursor<float_X, DIM1>>::type;
+
+                    using type = float_X;
+
+                    LinInterpCursor linInterpCursor;
+
+                    /** constructor
+                     *
+                     * @param linInterpCursor lookup table of the first or the second
+                     * synchrotron function.
+                     */
+                    HDINLINE MapToLookupTable(LinInterpCursor linInterpCursor) : linInterpCursor(linInterpCursor)
+                    {
+                    }
+
+                    /** Returns F_1(x) or F_2(x)
+                     *
+                     * @param x position of the synchrotron function to be evaluated
+                     */
+                    HDINLINE float_X operator()(const float_X x) const;
+                };
+
+                using SyncFuncCursor
+                    = ::pmacc::cursor::Cursor<MapToLookupTable, ::pmacc::cursor::PlusNavigator, float_X>;
+
+            } // namespace detail
+
+
+            /** Lookup table for synchrotron functions.
+             *
+             * Provides cursors for the first and the second synchrotron function
+             */
+            class SynchrotronFunctions
+            {
+            public:
+                using SyncFuncCursor = detail::SyncFuncCursor;
+
+            private:
+                using MyBuf = boost::shared_ptr<pmacc::container::DeviceBuffer<float_X, DIM1>>;
+                MyBuf dBuf_SyncFuncs[2]; // two synchrotron functions
+
+                struct BesselK
+                {
+                    template<typename T_State, typename T_Time>
+                    void operator()(const T_State& x, T_State& dxdt, T_Time t) const
+                    {
+                        dxdt[0] = boost::math::tr1::cyl_bessel_k(5.0 / 3.0, t);
+                    }
+                };
+
+                /** First synchrotron function
+                 */
+                HINLINE float_64 F_1(const float_64 x) const;
+                /** Second synchrotron function
+                 */
+                HINLINE float_64 F_2(const float_64 x) const;
+
+            public:
+                enum Select
+                {
+                    first = 0,
+                    second = 1
+                };
+
+                HINLINE void init();
+                /** Return a cursor representing a synchrotron function
+                 *
+                 * @param syncFunction first or second synchrotron function
+                 * @see: SynchrotronFunctions::Select
+                 */
+                HINLINE SyncFuncCursor getCursor(Select syncFunction) const;
+
+            }; // class SynchrotronFunctions
+
+        } // namespace synchrotronPhotons
+    } // namespace particles
 } // namespace picongpu
diff --git a/include/picongpu/particles/synchrotronPhotons/SynchrotronFunctions.tpp b/include/picongpu/particles/synchrotronPhotons/SynchrotronFunctions.tpp
index 62b93c2bcd..15a8a0ff90 100644
--- a/include/picongpu/particles/synchrotronPhotons/SynchrotronFunctions.tpp
+++ b/include/picongpu/particles/synchrotronPhotons/SynchrotronFunctions.tpp
@@ -1,4 +1,4 @@
-/* Copyright 2015-2020 Heiko Burau
+/* Copyright 2015-2021 Heiko Burau
  *
  * This file is part of PIConGPU.
  *
@@ -22,127 +22,125 @@
 #include "picongpu/particles/synchrotronPhotons/SynchrotronFunctions.hpp"
 #include "picongpu/simulation_defines.hpp"
 #include <boost/array.hpp>
-#if( BOOST_VERSION == 106400 )
-    /* `array_wrapper.hpp` must be included before `integrate.hpp` to avoid
-     * the error
-     * `boost/numeric/ublas/matrix.hpp(5977): error: namespace "boost::serialization" has no member "make_array"`
-     * in boost 1.64.0
-     * see boost issue https://svn.boost.org/trac/boost/ticket/12516
-     */
-#   include <boost/serialization/array_wrapper.hpp>
+#if(BOOST_VERSION == 106400)
+/* `array_wrapper.hpp` must be included before `integrate.hpp` to avoid
+ * the error
+ * `boost/numeric/ublas/matrix.hpp(5977): error: namespace "boost::serialization" has no member "make_array"`
+ * in boost 1.64.0
+ * see boost issue https://svn.boost.org/trac/boost/ticket/12516
+ */
+#    include <boost/serialization/array_wrapper.hpp>
 #endif
 #include <boost/numeric/odeint/integrate/integrate.hpp>
 
 
 namespace picongpu
 {
-namespace particles
-{
-namespace synchrotronPhotons
-{
-
-namespace detail
-{
-
-/** Returns F_1(x) or F_2(x)
-
- * @param x position of the synchrotron function to be evaluated
- */
-HDINLINE float_X MapToLookupTable::operator()(const float_X x) const
-{
-    /* This mapping increases the sample point density for small values of x
-     * where the synchrotron functions have a divergent slope. Without this mapping
-     * the emission probabilty of low-energy photons is underestimated.
-     *
-     * This is the inverse mapping of the mapping in @see:`SynchrotronFunctions::init()`
-     */
-    const float_X x_m = math::pow(x, float_X(1.0/3.0));
-
-    const float_X cutOff = static_cast<float_X>(SYNC_FUNCS_CUTOFF);
-
-    if(x_m >= cutOff)
-        return float_X(0.0);
-    else
-        return this->linInterpCursor[x_m / static_cast<float_X>(SYNC_FUNCS_STEP_WIDTH)];
-}
-
-} // namespace detail
-
-
-/** First synchrotron function
- */
-float_64 SynchrotronFunctions::F_1(const float_64 x) const
-{
-    if(x == float_64(0.0))
-        return float_64(0.0);
-
-    using namespace boost::numeric::odeint;
-    using state_type = boost::array<float_64, 1>;
-
-    state_type integral_result = {0.0};
-    const float_64 upper_bound(SYNC_FUNCS_F1_INTEGRAL_BOUND);
-    const float_64 stepwidth(SYNC_FUNCS_BESSEL_INTEGRAL_STEPWIDTH);
-    integrate(BesselK(), integral_result, x, upper_bound, stepwidth);
-
-    return x * integral_result[0];
-}
-/** Second synchrotron function
- */
-float_64 SynchrotronFunctions::F_2(const float_64 x) const
-{
-    if(x == float_64(0.0))
-        return float_64(0.0);
-
-    return x * boost::math::tr1::cyl_bessel_k(2.0/3.0, x);
-}
-
-
-void SynchrotronFunctions::init()
-{
-    const uint32_t numSamples = SYNC_FUNCS_NUM_SAMPLES;
-
-    this->dBuf_SyncFuncs[first] = MyBuf(new pmacc::container::DeviceBuffer<float_X, DIM1>(numSamples));
-    this->dBuf_SyncFuncs[second] = MyBuf(new pmacc::container::DeviceBuffer<float_X, DIM1>(numSamples));
-
-    pmacc::container::HostBuffer<float_X, DIM1> hBuf_F_1(numSamples);
-    pmacc::container::HostBuffer<float_X, DIM1> hBuf_F_2(numSamples);
-
-    for(uint32_t sampleIdx = 0u; sampleIdx < numSamples; sampleIdx++)
+    namespace particles
     {
-        const float_64 x_m = float_64(sampleIdx) * SYNC_FUNCS_STEP_WIDTH;
-        /* This mapping increases the sample point density for small values of x
-         * where the synchrotron functions have a divergent slope. Without this mapping
-         * the emission probabilty of low-energy photons is underestimated.
-         */
-        const float_64 x = x_m * x_m * x_m;
-
-        hBuf_F_1.origin()[sampleIdx] = static_cast<float_X>(this->F_1(x));
-        hBuf_F_2.origin()[sampleIdx] = static_cast<float_X>(this->F_2(x));
-    }
-
-    *this->dBuf_SyncFuncs[first] = hBuf_F_1;
-    *this->dBuf_SyncFuncs[second] = hBuf_F_2;
-}
-
-/** Return a cursor representing a synchrotron function
- *
- * @param syncFunction first or second synchrotron function
- * @see: SynchrotronFunctions::Select
- */
-SynchrotronFunctions::SyncFuncCursor
-SynchrotronFunctions::getCursor(SynchrotronFunctions::Select syncFunction) const
-{
-    using namespace pmacc;
-
-    detail::MapToLookupTable::LinInterpCursor linInterpCursor =
-        cursor::tools::LinearInterp<float_X>()(this->dBuf_SyncFuncs[syncFunction]->origin());
-
-    return cursor::make_Cursor(
-        detail::MapToLookupTable(linInterpCursor),
-        cursor::PlusNavigator(),
-        float_X(0.0));
-}
-
-} // namespace synchrotronPhotons
-} // namespace particles
+        namespace synchrotronPhotons
+        {
+            namespace detail
+            {
+                /** Returns F_1(x) or F_2(x) read from the lookup table cursor (zero at and beyond the cutoff)
+                 *
+                 * @param x position of the synchrotron function to be evaluated
+                 */
+                HDINLINE float_X MapToLookupTable::operator()(const float_X x) const
+                {
+                    /* This mapping increases the sample point density for small values of x
+                     * where the synchrotron functions have a divergent slope. Without this mapping
+                     * the emission probability of low-energy photons is underestimated.
+                     *
+                     * This is the inverse mapping of the mapping in @see:`SynchrotronFunctions::init()`
+                     */
+                    const float_X x_m = math::pow(x, float_X(1.0 / 3.0));
+
+                    const float_X cutOff = static_cast<float_X>(SYNC_FUNCS_CUTOFF);
+
+                    if(x_m >= cutOff) // the comparison is done on the mapped value x^(1/3), not on x
+                        return float_X(0.0);
+                    else
+                        return this->linInterpCursor[x_m / static_cast<float_X>(SYNC_FUNCS_STEP_WIDTH)];
+                }
+
+            } // namespace detail
+
+
+            /** First synchrotron function: x times the integral of BesselK from x to the upper bound
+             */
+            float_64 SynchrotronFunctions::F_1(const float_64 x) const
+            {
+                if(x == float_64(0.0)) // x = 0 is answered directly, without integrating
+                    return float_64(0.0);
+
+                using namespace boost::numeric::odeint;
+                using state_type = boost::array<float_64, 1>;
+
+                state_type integral_result = {0.0};
+                const float_64 upper_bound(SYNC_FUNCS_F1_INTEGRAL_BOUND);
+                const float_64 stepwidth(SYNC_FUNCS_BESSEL_INTEGRAL_STEPWIDTH);
+                integrate(BesselK(), integral_result, x, upper_bound, stepwidth);
+
+                return x * integral_result[0];
+            }
+            /** Second synchrotron function: x * cyl_bessel_k(2/3, x), with the value at x = 0 set to 0
+             */
+            float_64 SynchrotronFunctions::F_2(const float_64 x) const
+            {
+                if(x == float_64(0.0)) // x = 0 is answered directly, without calling the Bessel function
+                    return float_64(0.0);
+
+                return x * boost::math::tr1::cyl_bessel_k(2.0 / 3.0, x);
+            }
+
+
+            void SynchrotronFunctions::init() // fills the F_1 and F_2 lookup tables
+            {
+                const uint32_t numSamples = SYNC_FUNCS_NUM_SAMPLES;
+
+                this->dBuf_SyncFuncs[first] = MyBuf(new pmacc::container::DeviceBuffer<float_X, DIM1>(numSamples));
+                this->dBuf_SyncFuncs[second] = MyBuf(new pmacc::container::DeviceBuffer<float_X, DIM1>(numSamples));
+
+                pmacc::container::HostBuffer<float_X, DIM1> hBuf_F_1(numSamples);
+                pmacc::container::HostBuffer<float_X, DIM1> hBuf_F_2(numSamples);
+
+                for(uint32_t sampleIdx = 0u; sampleIdx < numSamples; sampleIdx++)
+                {
+                    const float_64 x_m = float_64(sampleIdx) * SYNC_FUNCS_STEP_WIDTH;
+                    /* This mapping increases the sample point density for small values of x
+                     * where the synchrotron functions have a divergent slope. Without this mapping
+                     * the emission probability of low-energy photons is underestimated.
+                     */
+                    const float_64 x = x_m * x_m * x_m; // inverse of the cube root in MapToLookupTable
+
+                    hBuf_F_1.origin()[sampleIdx] = static_cast<float_X>(this->F_1(x));
+                    hBuf_F_2.origin()[sampleIdx] = static_cast<float_X>(this->F_2(x));
+                }
+
+                *this->dBuf_SyncFuncs[first] = hBuf_F_1; // assign the host-side tables to the device buffers
+                *this->dBuf_SyncFuncs[second] = hBuf_F_2;
+            }
+
+            /** Return a cursor representing a synchrotron function
+             *
+             * @param syncFunction first or second synchrotron function
+             * @see: SynchrotronFunctions::Select
+             */
+            SynchrotronFunctions::SyncFuncCursor SynchrotronFunctions::getCursor(
+                SynchrotronFunctions::Select syncFunction) const
+            {
+                using namespace pmacc;
+
+                detail::MapToLookupTable::LinInterpCursor linInterpCursor
+                    = cursor::tools::LinearInterp<float_X>()(this->dBuf_SyncFuncs[syncFunction]->origin());
+
+                return cursor::make_Cursor(
+                    detail::MapToLookupTable(linInterpCursor),
+                    cursor::PlusNavigator(),
+                    float_X(0.0));
+            }
+
+        } // namespace synchrotronPhotons
+    } // namespace particles
 } // namespace picongpu
diff --git a/include/picongpu/particles/traits/GenerateSolversIfSpeciesEligible.hpp b/include/picongpu/particles/traits/GenerateSolversIfSpeciesEligible.hpp
index 62da48791a..581655db2e 100644
--- a/include/picongpu/particles/traits/GenerateSolversIfSpeciesEligible.hpp
+++ b/include/picongpu/particles/traits/GenerateSolversIfSpeciesEligible.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2017-2020 Axel Huebl
+/* Copyright 2017-2021 Axel Huebl
  *
  * This file is part of PIConGPU.
  *
@@ -30,62 +30,47 @@
 
 namespace picongpu
 {
-namespace particles
-{
-namespace traits
-{
-    /** Return a list of Solvers specialized to all matching species
-     *
-     * Solvers can define the trait SpeciesEligibleForSolver to check a
-     * particle species if it fulfills requirements of the solver.
-     *
-     * The compile-time factory here returns a list of particle solvers (of the
-     * same solver given by T_Solver), but fully specialized with matching
-     * particle species from a sequence of species (T_SeqSpecies).
-     *
-     * @tparam T_Solver a particle solver which shall be specialized for all
-     *                  eligible particle species
-     * @tparam T_SeqSpecies a sequence of particle species to check if they are
-     *                      eligible to specialize T_Solver, also allows a
-     *                      single type instead of a sequence
-     * @tparam T_Eligible allows to specialize a solver but only if the check
-     *                    of the T_Eligible class fulfills the
-     *                    SpeciesEligibleForSolver trait, per default the
-     *                    T_Solver argument is checked
-     */
-    template<
-        typename T_Solver,
-        typename T_SeqSpecies,
-        typename T_Eligible = T_Solver
-    >
-    struct GenerateSolversIfSpeciesEligible
+    namespace particles
     {
-        // wrap single arguments to sequence
-        using SeqSpecies = typename pmacc::ToSeq< T_SeqSpecies >::type;
-        // unspecialized solver
-        using Solver = T_Solver;
-
-        template< typename T_Species >
-        struct Op : bmpl::apply1<
-            Solver,
-            T_Species
-        >
+        namespace traits
         {
-        };
+            /** Return a list of Solvers specialized to all matching species
+             *
+             * Solvers can define the trait SpeciesEligibleForSolver to check a
+             * particle species if it fulfills requirements of the solver.
+             *
+             * The compile-time factory here returns a list of particle solvers (of the
+             * same solver given by T_Solver), but fully specialized with matching
+             * particle species from a sequence of species (T_SeqSpecies).
+             *
+             * @tparam T_Solver a particle solver which shall be specialized for all
+             *                  eligible particle species
+             * @tparam T_SeqSpecies a sequence of particle species to check if they are
+             *                      eligible to specialize T_Solver, also allows a
+             *                      single type instead of a sequence
+             * @tparam T_Eligible allows to specialize a solver but only if the check
+             *                    of the T_Eligible class fulfills the
+             *                    SpeciesEligibleForSolver trait, per default the
+             *                    T_Solver argument is checked
+             */
+            template<typename T_Solver, typename T_SeqSpecies, typename T_Eligible = T_Solver>
+            struct GenerateSolversIfSpeciesEligible
+            {
+                // wrap single arguments to sequence
+                using SeqSpecies = typename pmacc::ToSeq<T_SeqSpecies>::type;
+                // unspecialized solver
+                using Solver = T_Solver;
+
+                template<typename T_Species>
+                struct Op : bmpl::apply1<Solver, T_Species>
+                {
+                };
 
-        using SeqEligibleSpecies = typename bmpl::copy_if<
-            SeqSpecies,
-            particles::traits::SpeciesEligibleForSolver<
-                bmpl::_1,
-                T_Eligible
-            >
-        >::type;
+                using SeqEligibleSpecies = typename bmpl::
+                    copy_if<SeqSpecies, particles::traits::SpeciesEligibleForSolver<bmpl::_1, T_Eligible>>::type;
 
-        using type = typename bmpl::transform<
-            SeqEligibleSpecies,
-            Op< bmpl::_1 >
-        >::type;
-    };
-} // namespace traits
-} // namespace particles
+                using type = typename bmpl::transform<SeqEligibleSpecies, Op<bmpl::_1>>::type;
+            };
+        } // namespace traits
+    } // namespace particles
 } // namespace picongpu
diff --git a/include/picongpu/particles/traits/GetAtomicNumbers.hpp b/include/picongpu/particles/traits/GetAtomicNumbers.hpp
index 152c1c40d7..e1966efb0c 100644
--- a/include/picongpu/particles/traits/GetAtomicNumbers.hpp
+++ b/include/picongpu/particles/traits/GetAtomicNumbers.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2015-2020 Marco Garten, Rene Widera
+/* Copyright 2015-2021 Marco Garten, Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -27,20 +27,20 @@
 
 namespace picongpu
 {
-namespace traits
-{
-template<typename T_Species>
-struct GetAtomicNumbers
-{
-    using FrameType = typename T_Species::FrameType;
+    namespace traits
+    {
+        template<typename T_Species>
+        struct GetAtomicNumbers
+        {
+            using FrameType = typename T_Species::FrameType;
 
-    using hasAtomicNumbers = typename HasFlag<FrameType, atomicNumbers<> >::type;
-    /* throw static assert if species has no protons or neutrons */
-    PMACC_CASSERT_MSG(This_species_has_no_atomic_numbers,hasAtomicNumbers::value==true);
+            using hasAtomicNumbers = typename HasFlag<FrameType, atomicNumbers<>>::type;
+            /* throw static assert if species has no protons or neutrons */
+            PMACC_CASSERT_MSG(This_species_has_no_atomic_numbers, hasAtomicNumbers::value == true);
 
-    using FoundAtomicNumbersAlias = typename GetFlagType<FrameType,atomicNumbers<> >::type;
-    using type = typename pmacc::traits::Resolve<FoundAtomicNumbersAlias >::type;
-};
-} //namespace traits
+            using FoundAtomicNumbersAlias = typename GetFlagType<FrameType, atomicNumbers<>>::type;
+            using type = typename pmacc::traits::Resolve<FoundAtomicNumbersAlias>::type;
+        };
+    } // namespace traits
 
-}// namespace picongpu
+} // namespace picongpu
diff --git a/include/picongpu/particles/traits/GetCurrentSolver.hpp b/include/picongpu/particles/traits/GetCurrentSolver.hpp
index 9bd2caf73d..dde3027e65 100644
--- a/include/picongpu/particles/traits/GetCurrentSolver.hpp
+++ b/include/picongpu/particles/traits/GetCurrentSolver.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2014-2020 Rene Widera
+/* Copyright 2014-2021 Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -25,15 +25,14 @@
 
 namespace picongpu
 {
-namespace traits
-{
-template<typename T_Species>
-struct GetCurrentSolver
-{
-    using type = typename pmacc::traits::Resolve<
-        typename GetFlagType<typename T_Species::FrameType, current<> >::type
-    >::type;
-};
-} //namespace traits
+    namespace traits
+    {
+        template<typename T_Species>
+        struct GetCurrentSolver
+        {
+            using type = typename pmacc::traits::Resolve<
+                typename GetFlagType<typename T_Species::FrameType, current<>>::type>::type;
+        };
+    } // namespace traits
 
-}// namespace picongpu
+} // namespace picongpu
diff --git a/include/picongpu/particles/traits/GetDensityRatio.hpp b/include/picongpu/particles/traits/GetDensityRatio.hpp
index 76f9ab6624..ede76820da 100644
--- a/include/picongpu/particles/traits/GetDensityRatio.hpp
+++ b/include/picongpu/particles/traits/GetDensityRatio.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2015-2020 Rene Widera, Richard Pausch
+/* Copyright 2015-2021 Rene Widera, Richard Pausch
  *
  * This file is part of PIConGPU.
  *
@@ -29,38 +29,30 @@
 
 namespace picongpu
 {
-namespace traits
-{
-
-namespace detail
-{
-    value_identifier(float_X, DefaultDensityRatio, 1.0);
-} // namespace detail
-
-
-/** get density ratio of a species
- *
- * ratio is set to 1.0 if no alias `densityRatio<>` is defined
- *
- * @treturn ::type `value_identifier` with the default density
- */
-template<typename T_Species>
-struct GetDensityRatio
-{
-    using FrameType = typename T_Species::FrameType;
-    typedef typename HasFlag<FrameType, densityRatio<> >::type hasDensityRatio;
-    typedef typename pmacc::traits::Resolve<
-        typename GetFlagType<
-            FrameType, densityRatio<>
-        >::type
-    >::type DensityRatioOfSpecies;
-
-    typedef typename bmpl::if_<
-        hasDensityRatio,
-        DensityRatioOfSpecies,
-        detail::DefaultDensityRatio
-    >::type type;
-};
-
-} // namespace traits
+    namespace traits
+    {
+        namespace detail
+        {
+            value_identifier(float_X, DefaultDensityRatio, 1.0);
+        } // namespace detail
+
+
+        /** get density ratio of a species
+         *
+         * ratio is set to 1.0 if no alias `densityRatio<>` is defined
+         *
+         * @treturn ::type `value_identifier` with the default density
+         */
+        template<typename T_Species>
+        struct GetDensityRatio
+        {
+            using FrameType = typename T_Species::FrameType;
+            typedef typename HasFlag<FrameType, densityRatio<>>::type hasDensityRatio;
+            typedef typename pmacc::traits::Resolve<typename GetFlagType<FrameType, densityRatio<>>::type>::type
+                DensityRatioOfSpecies;
+
+            typedef typename bmpl::if_<hasDensityRatio, DensityRatioOfSpecies, detail::DefaultDensityRatio>::type type;
+        };
+
+    } // namespace traits
 } // namespace picongpu
diff --git a/include/picongpu/particles/traits/GetEffectiveNuclearCharge.hpp b/include/picongpu/particles/traits/GetEffectiveNuclearCharge.hpp
index fe673cf671..ab88a6f38c 100644
--- a/include/picongpu/particles/traits/GetEffectiveNuclearCharge.hpp
+++ b/include/picongpu/particles/traits/GetEffectiveNuclearCharge.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2015-2020 Marco Garten, Rene Widera, Axel Huebl
+/* Copyright 2015-2021 Marco Garten, Rene Widera, Axel Huebl
  *
  * This file is part of PIConGPU.
  *
@@ -28,30 +28,31 @@
 
 namespace picongpu
 {
-namespace traits
-{
-    template<typename T_Species>
-    struct GetEffectiveNuclearCharge
+    namespace traits
     {
-        using SpeciesType = T_Species;
-        using FrameType = typename SpeciesType::FrameType;
+        template<typename T_Species>
+        struct GetEffectiveNuclearCharge
+        {
+            using SpeciesType = T_Species;
+            using FrameType = typename SpeciesType::FrameType;
 
-        using hasEffectiveNuclearCharge = typename HasFlag<FrameType, effectiveNuclearCharge<> >::type;
-        /* throw static assert if species has no predefined effective atomic numbers */
-        PMACC_CASSERT_MSG(No_effective_atomic_numbers_are_defined_for_this_species,hasEffectiveNuclearCharge::value==true);
+            using hasEffectiveNuclearCharge = typename HasFlag<FrameType, effectiveNuclearCharge<>>::type;
+            /* throw static assert if species has no predefined effective atomic numbers */
+            PMACC_CASSERT_MSG(
+                No_effective_atomic_numbers_are_defined_for_this_species,
+                hasEffectiveNuclearCharge::value == true);
 
-        using FoundEffectiveNuclearChargeAlias = typename GetFlagType<FrameType,effectiveNuclearCharge<> >::type;
-        /* Extract vector of effective atomic numbers */
-        using type = typename pmacc::traits::Resolve<FoundEffectiveNuclearChargeAlias >::type;
+            using FoundEffectiveNuclearChargeAlias = typename GetFlagType<FrameType, effectiveNuclearCharge<>>::type;
+            /* Extract vector of effective atomic numbers */
+            using type = typename pmacc::traits::Resolve<FoundEffectiveNuclearChargeAlias>::type;
 
-        static constexpr int protonNumber = static_cast<int>(GetAtomicNumbers<SpeciesType>::type::numberOfProtons);
-        /* length of the ionization energy vector */
-        static constexpr int vecLength = type::dim;
-        /* assert that the number of arguments in the vector equal the proton number */
-        PMACC_CASSERT_MSG(
-            __The_given_number_of_effective_atomic_numbers_Z_eff_should_be_exactly_the_proton_number_of_the_species__,
-            vecLength == protonNumber
-        );
-    };
-} // namespace traits
+            static constexpr int protonNumber = static_cast<int>(GetAtomicNumbers<SpeciesType>::type::numberOfProtons);
+            /* length of the vector of effective atomic numbers */
+            static constexpr int vecLength = type::dim;
+            /* assert that the number of arguments in the vector equal the proton number */
+            PMACC_CASSERT_MSG(
+                __The_given_number_of_effective_atomic_numbers_Z_eff_should_be_exactly_the_proton_number_of_the_species__,
+                vecLength == protonNumber);
+        };
+    } // namespace traits
 } // namespace picongpu
diff --git a/include/picongpu/particles/traits/GetExchangeMemCfg.hpp b/include/picongpu/particles/traits/GetExchangeMemCfg.hpp
index cac77438e4..47056999ea 100644
--- a/include/picongpu/particles/traits/GetExchangeMemCfg.hpp
+++ b/include/picongpu/particles/traits/GetExchangeMemCfg.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2017-2020 Rene Widera
+/* Copyright 2017-2021 Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -30,41 +30,31 @@
 
 namespace picongpu
 {
-namespace traits
-{
-
-    /** get a memory configuration for species exchange buffer
-     *
-     * If exchangeMemCfg is not defined for a species than the default memory
-     * exchange size from the file memory.param are used.
-     *
-     * @tparam T_Species picongpu::Particles, type of the species
-     * @return class with buffer sizes for each direction
-     */
-    template< typename T_Species >
-    struct GetExchangeMemCfg
+    namespace traits
     {
-        using FrameType = typename T_Species::FrameType;
-        using hasMemCfg = typename HasFlag<
-            FrameType,
-            exchangeMemCfg< >
-        >::type;
-
-        using type = typename bmpl::if_<
-            hasMemCfg,
-            typename pmacc::traits::Resolve<
-                typename GetFlagType<
-                    FrameType,
-                    exchangeMemCfg< >
-                >::type
-            >::type,
-            ::picongpu::DefaultExchangeMemCfg
-        >::type;
-    };
-
-    //! short hand traits for GetExchangeMemCfg
-    template< typename T_Species >
-    using GetExchangeMemCfg_t = typename traits::GetExchangeMemCfg< T_Species >::type;
-
-} // namespace traits
+        /** get a memory configuration for species exchange buffer
+         *
+         * If exchangeMemCfg is not defined for a species then the default memory
+         * exchange size from the file memory.param is used.
+         *
+         * @tparam T_Species picongpu::Particles, type of the species
+         * @return class with buffer sizes for each direction
+         */
+        template<typename T_Species>
+        struct GetExchangeMemCfg
+        {
+            using FrameType = typename T_Species::FrameType;
+            using hasMemCfg = typename HasFlag<FrameType, exchangeMemCfg<>>::type;
+
+            using type = typename bmpl::if_<
+                hasMemCfg,
+                typename pmacc::traits::Resolve<typename GetFlagType<FrameType, exchangeMemCfg<>>::type>::type,
+                ::picongpu::DefaultExchangeMemCfg>::type;
+        };
+
+        //! short hand traits for GetExchangeMemCfg
+        template<typename T_Species>
+        using GetExchangeMemCfg_t = typename traits::GetExchangeMemCfg<T_Species>::type;
+
+    } // namespace traits
 } // namespace picongpu
diff --git a/include/picongpu/particles/traits/GetInterpolation.hpp b/include/picongpu/particles/traits/GetInterpolation.hpp
index f9c0145688..f77e652ab4 100644
--- a/include/picongpu/particles/traits/GetInterpolation.hpp
+++ b/include/picongpu/particles/traits/GetInterpolation.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2014-2020 Rene Widera
+/* Copyright 2014-2021 Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -25,16 +25,14 @@
 
 namespace picongpu
 {
-namespace traits
-{
-
-template<typename T_Species>
-struct GetInterpolation
-{
-    using type = typename pmacc::traits::Resolve<
-        typename GetFlagType<typename T_Species::FrameType, interpolation<> >::type
-    >::type;
-};
-} //namespace traits
+    namespace traits
+    {
+        template<typename T_Species>
+        struct GetInterpolation
+        {
+            using type = typename pmacc::traits::Resolve<
+                typename GetFlagType<typename T_Species::FrameType, interpolation<>>::type>::type;
+        };
+    } // namespace traits
 
-}// namespace picongpu
+} // namespace picongpu
diff --git a/include/picongpu/particles/traits/GetIonizationEnergies.hpp b/include/picongpu/particles/traits/GetIonizationEnergies.hpp
index dc8bcc247a..504cfb0f16 100644
--- a/include/picongpu/particles/traits/GetIonizationEnergies.hpp
+++ b/include/picongpu/particles/traits/GetIonizationEnergies.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2015-2020 Marco Garten, Rene Widera
+/* Copyright 2015-2021 Marco Garten, Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -27,31 +27,32 @@
 
 namespace picongpu
 {
-namespace traits
-{
-template<typename T_Species>
-struct GetIonizationEnergies
-{
-    using SpeciesType = T_Species;
-    using FrameType = typename SpeciesType::FrameType;
+    namespace traits
+    {
+        template<typename T_Species>
+        struct GetIonizationEnergies
+        {
+            using SpeciesType = T_Species;
+            using FrameType = typename SpeciesType::FrameType;
 
-    using hasIonizationEnergies = typename HasFlag<FrameType, ionizationEnergies<> >::type;
-    /* throw static assert if species has no protons or neutrons */
-    PMACC_CASSERT_MSG(No_ionization_energies_are_defined_for_this_species,hasIonizationEnergies::value==true);
+            using hasIonizationEnergies = typename HasFlag<FrameType, ionizationEnergies<>>::type;
+            /* throw static assert if species has no ionization energies defined */
+            PMACC_CASSERT_MSG(
+                No_ionization_energies_are_defined_for_this_species,
+                hasIonizationEnergies::value == true);
 
-    using FoundIonizationEnergiesAlias = typename GetFlagType<FrameType,ionizationEnergies<> >::type;
-    /* Extract ionization energy vector from AU namespace */
-    using type = typename pmacc::traits::Resolve<FoundIonizationEnergiesAlias >::type;
+            using FoundIonizationEnergiesAlias = typename GetFlagType<FrameType, ionizationEnergies<>>::type;
+            /* Extract ionization energy vector from AU namespace */
+            using type = typename pmacc::traits::Resolve<FoundIonizationEnergiesAlias>::type;
 
-    static constexpr int protonNumber = static_cast<int>(GetAtomicNumbers<SpeciesType>::type::numberOfProtons);
-    /* length of the ionization energy vector */
-    static constexpr int vecLength = type::dim;
-    /* assert that the number of arguments in the vector equal the proton number */
-    PMACC_CASSERT_MSG(
-        __The_given_number_of_ionization_energies_should_be_exactly_the_proton_number_of_the_species__,
-        vecLength == protonNumber
-    );
-};
-} //namespace traits
+            static constexpr int protonNumber = static_cast<int>(GetAtomicNumbers<SpeciesType>::type::numberOfProtons);
+            /* length of the ionization energy vector */
+            static constexpr int vecLength = type::dim;
+            /* assert that the number of arguments in the vector equal the proton number */
+            PMACC_CASSERT_MSG(
+                __The_given_number_of_ionization_energies_should_be_exactly_the_proton_number_of_the_species__,
+                vecLength == protonNumber);
+        };
+    } // namespace traits
 
-}// namespace picongpu
+} // namespace picongpu
diff --git a/include/picongpu/particles/traits/GetIonizerList.hpp b/include/picongpu/particles/traits/GetIonizerList.hpp
index 49f59c3b92..7b38c2aea1 100644
--- a/include/picongpu/particles/traits/GetIonizerList.hpp
+++ b/include/picongpu/particles/traits/GetIonizerList.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2014-2020 Marco Garten, Axel Huebl
+/* Copyright 2014-2021 Marco Garten, Axel Huebl
  *
  * This file is part of PIConGPU.
  *
@@ -30,43 +30,36 @@
 
 namespace picongpu
 {
-namespace particles
-{
-namespace traits
-{
-    /** Returns a sequence with ionizers for a species
-     *
-     * Several ionization methods can be assigned to a species which are called
-     * consecutively (in the same order as the user inputs them) within a single
-     * time step.
-     *
-     * @tparam T_SpeciesType ion species
-     */
-    template< typename T_SpeciesType >
-    struct GetIonizerList
+    namespace particles
     {
-        using SpeciesType = T_SpeciesType;
-        using FrameType = typename SpeciesType::FrameType;
+        namespace traits
+        {
+            /** Returns a sequence with ionizers for a species
+             *
+             * Several ionization methods can be assigned to a species which are called
+             * consecutively (in the same order as the user inputs them) within a single
+             * time step.
+             *
+             * @tparam T_SpeciesType ion species
+             */
+            template<typename T_SpeciesType>
+            struct GetIonizerList
+            {
+                using SpeciesType = T_SpeciesType;
+                using FrameType = typename SpeciesType::FrameType;
 
-        // the following line only fetches the alias
-        using FoundIonizersAlias = typename GetFlagType<
-            FrameType,
-            ionizers<>
-        >::type;
+                // the following line only fetches the alias
+                using FoundIonizersAlias = typename GetFlagType<FrameType, ionizers<>>::type;
 
-        // this now resolves the alias into the actual object type, a list of ionizers
-        using FoundIonizerList = typename pmacc::traits::Resolve< FoundIonizersAlias >::type;
+                // this now resolves the alias into the actual object type, a list of ionizers
+                using FoundIonizerList = typename pmacc::traits::Resolve<FoundIonizersAlias>::type;
 
-        using type = typename pmacc::OperateOnSeq<
-            FoundIonizerList,
-            bmpl::apply1<
-                bmpl::_1,
-                SpeciesType
-            >,
-            pmacc::meta::accessors::Type<>
-        >::type;
-    };
+                using type = typename pmacc::OperateOnSeq<
+                    FoundIonizerList,
+                    bmpl::apply1<bmpl::_1, SpeciesType>,
+                    pmacc::meta::accessors::Type<>>::type;
+            };
 
-} // namespace traits
-} // namespace particles
+        } // namespace traits
+    } // namespace particles
 } // namespace picongpu
diff --git a/include/picongpu/particles/traits/GetMarginPusher.hpp b/include/picongpu/particles/traits/GetMarginPusher.hpp
index e789a33c64..e1ebb80645 100644
--- a/include/picongpu/particles/traits/GetMarginPusher.hpp
+++ b/include/picongpu/particles/traits/GetMarginPusher.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2015-2020 Richard Pausch
+/* Copyright 2015-2021 Richard Pausch, Sergei Bastrakov
  *
  * This file is part of PIConGPU.
  *
@@ -27,36 +27,80 @@
 
 namespace picongpu
 {
+    namespace traits
+    {
+        /** Get margins of a pusher for species
+         *
+         * @tparam T_Species particle species type
+         * @tparam T_GetLowerMargin lower margin for pusher getter type
+         * @tparam T_GetUpperMargin upper margin for pusher getter type
+         */
+        template<
+            typename T_Species,
+            typename T_GetLowerMargin = GetLowerMargin<GetPusher<bmpl::_1>>,
+            typename T_GetUpperMargin = GetUpperMargin<GetPusher<bmpl::_1>>>
+        struct GetMarginPusher
+        {
+            using AddLowerMargins = pmacc::math::CT::add<GetLowerMargin<GetInterpolation<bmpl::_1>>, T_GetLowerMargin>;
+            using LowerMargin = typename bmpl::apply<AddLowerMargins, T_Species>::type;
 
-namespace traits
-{
-template<typename T_Species>
-struct GetMarginPusher
-{
-    using AddLowerMargins = pmacc::math::CT::add<
-        GetLowerMargin< GetInterpolation< bmpl::_1 > >,
-        GetLowerMargin< GetPusher< bmpl::_1 > >
-    >;
-    using LowerMargin = typename bmpl::apply<AddLowerMargins, T_Species>::type;
-
-    using AddUpperMargins = pmacc::math::CT::add<
-        GetUpperMargin< GetInterpolation< bmpl::_1 > >,
-        GetUpperMargin< GetPusher< bmpl::_1 > >
-    >;
-    using UpperMargin = typename bmpl::apply<AddUpperMargins, T_Species>::type;
-};
-
-template<typename T_Species>
-struct GetLowerMarginPusher
-{
-    using type = typename traits::GetMarginPusher<T_Species>::LowerMargin;
-};
+            using AddUpperMargins = pmacc::math::CT::add<GetUpperMargin<GetInterpolation<bmpl::_1>>, T_GetUpperMargin>;
+            using UpperMargin = typename bmpl::apply<AddUpperMargins, T_Species>::type;
+        };
 
-template<typename T_Species>
-struct GetUpperMarginPusher
-{
-    using type = typename traits::GetMarginPusher<T_Species>::UpperMargin;
-};
+        /** Get lower margin of a pusher for species
+         *
+         * @tparam T_Species particle species type
+         */
+        template<typename T_Species>
+        struct GetLowerMarginPusher
+        {
+            using type = typename traits::GetMarginPusher<T_Species>::LowerMargin;
+        };
+
+        /** Get lower margin of the given pusher for species
+         *
+         * Normally, the pusher does not have to be given explicitly.
+         * However, it is needed for composite pushers
+         *
+         * @tparam T_Species particle species type
+         * @tparam T_Pusher pusher type
+         */
+        template<typename T_Species, typename T_Pusher>
+        struct GetLowerMarginForPusher
+        {
+            using type = typename traits::GetMarginPusher<
+                T_Species,
+                typename GetLowerMargin<T_Pusher>::type,
+                typename GetUpperMargin<T_Pusher>::type>::LowerMargin;
+        };
+
+        /** Get upper margin of a pusher for species
+         *
+         * @tparam T_Species particle species type
+         */
+        template<typename T_Species>
+        struct GetUpperMarginPusher
+        {
+            using type = typename traits::GetMarginPusher<T_Species>::UpperMargin;
+        };
+
+        /** Get upper margin of the given pusher for species
+         *
+         * Normally, the pusher does not have to be given explicitly.
+         * However, it is needed for composite pushers
+         *
+         * @tparam T_Species particle species type
+         * @tparam T_Pusher pusher type
+         */
+        template<typename T_Species, typename T_Pusher>
+        struct GetUpperMarginForPusher
+        {
+            using type = typename traits::GetMarginPusher<
+                T_Species,
+                typename GetLowerMargin<T_Pusher>::type,
+                typename GetUpperMargin<T_Pusher>::type>::UpperMargin;
+        };
 
-}// namespace traits
-}// namespace picongpu
+    } // namespace traits
+} // namespace picongpu
diff --git a/include/picongpu/particles/traits/GetPhotonCreator.hpp b/include/picongpu/particles/traits/GetPhotonCreator.hpp
index 77bd08f287..fc8777fc32 100644
--- a/include/picongpu/particles/traits/GetPhotonCreator.hpp
+++ b/include/picongpu/particles/traits/GetPhotonCreator.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2015-2020 Heiko Burau
+/* Copyright 2015-2021 Heiko Burau
  *
  * This file is part of PIConGPU.
  *
@@ -31,44 +31,33 @@
 
 namespace picongpu
 {
-namespace particles
-{
-namespace traits
-{
-
-    /** Get the functor to create photons from a species
-     *
-     * @tparam T_SpeciesType type or name as boost::mpl::string
-     */
-    template< typename T_SpeciesType >
-    struct GetPhotonCreator
+    namespace particles
     {
-        using SpeciesType = pmacc::particles::meta::FindByNameOrType_t<
-            VectorAllSpecies,
-            T_SpeciesType
-        >;
-        using FrameType = typename SpeciesType::FrameType;
-
-        // The following line only fetches the alias
-        using FoundSynchrotronPhotonsAlias = typename GetFlagType<
-            FrameType,
-            picongpu::synchrotronPhotons<>
-        >::type;
-
-        // This now resolves the alias into the actual object type and select the species from the species list
-        using FoundPhotonSpecies = pmacc::particles::meta::FindByNameOrType_t<
-            VectorAllSpecies,
-            typename pmacc::traits::Resolve< FoundSynchrotronPhotonsAlias >::type
-        >;
-
-        // This specifies the target species as the second template parameter of the photon creator
-        using type = synchrotronPhotons::PhotonCreator<
-            SpeciesType,
-            FoundPhotonSpecies
-        >;
-
-    };
-
-} // namespace traits
-} // namespace particles
+        namespace traits
+        {
+            /** Get the functor to create photons from a species
+             *
+             * @tparam T_SpeciesType type or name as boost::mpl::string
+             */
+            template<typename T_SpeciesType>
+            struct GetPhotonCreator
+            {
+                using SpeciesType = pmacc::particles::meta::FindByNameOrType_t<VectorAllSpecies, T_SpeciesType>;
+                using FrameType = typename SpeciesType::FrameType;
+
+                // The following line only fetches the alias
+                using FoundSynchrotronPhotonsAlias =
+                    typename GetFlagType<FrameType, picongpu::synchrotronPhotons<>>::type;
+
+                // This resolves the alias into the actual object type and selects the species from the species list
+                using FoundPhotonSpecies = pmacc::particles::meta::FindByNameOrType_t<
+                    VectorAllSpecies,
+                    typename pmacc::traits::Resolve<FoundSynchrotronPhotonsAlias>::type>;
+
+                // This specifies the target species as the second template parameter of the photon creator
+                using type = synchrotronPhotons::PhotonCreator<SpeciesType, FoundPhotonSpecies>;
+            };
+
+        } // namespace traits
+    } // namespace particles
 } // namespace picongpu
diff --git a/include/picongpu/particles/traits/GetPusher.hpp b/include/picongpu/particles/traits/GetPusher.hpp
index bcfe0d0ab4..e16bb2f50f 100644
--- a/include/picongpu/particles/traits/GetPusher.hpp
+++ b/include/picongpu/particles/traits/GetPusher.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2014-2020 Rene Widera, Richard Pausch
+/* Copyright 2014-2021 Rene Widera, Richard Pausch
  *
  * This file is part of PIConGPU.
  *
@@ -25,16 +25,14 @@
 
 namespace picongpu
 {
+    namespace traits
+    {
+        template<typename T_Species>
+        struct GetPusher
+        {
+            using type = typename pmacc::traits::Resolve<
+                typename GetFlagType<typename T_Species::FrameType, particlePusher<>>::type>::type;
+        };
 
-namespace traits
-{
-template<typename T_Species>
-struct GetPusher
-{
-    using type = typename pmacc::traits::Resolve<
-        typename GetFlagType<typename T_Species::FrameType, particlePusher<> >::type
-      >::type;
-};
-
-}// namespace traits
-}// namespace picongpu
+    } // namespace traits
+} // namespace picongpu
diff --git a/include/picongpu/particles/traits/GetShape.hpp b/include/picongpu/particles/traits/GetShape.hpp
index 4a4194a4ba..f448ee0ec4 100644
--- a/include/picongpu/particles/traits/GetShape.hpp
+++ b/include/picongpu/particles/traits/GetShape.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2014-2020 Rene Widera
+/* Copyright 2014-2021 Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -25,17 +25,15 @@
 
 namespace picongpu
 {
-namespace traits
-{
-
-template<typename T_Species>
-struct GetShape
-{
-    using type = typename pmacc::traits::Resolve<
-        typename GetFlagType<typename T_Species::FrameType, shape<> >::type
-    >::type;
-};
+    namespace traits
+    {
+        template<typename T_Species>
+        struct GetShape
+        {
+            using type = typename pmacc::traits::Resolve<
+                typename GetFlagType<typename T_Species::FrameType, shape<>>::type>::type;
+        };
 
-} //namespace traits
+    } // namespace traits
 
-}// namespace picongpu
+} // namespace picongpu
diff --git a/include/picongpu/particles/traits/GetSpeciesFlagName.hpp b/include/picongpu/particles/traits/GetSpeciesFlagName.hpp
index bf808c9b58..9bce118077 100644
--- a/include/picongpu/particles/traits/GetSpeciesFlagName.hpp
+++ b/include/picongpu/particles/traits/GetSpeciesFlagName.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2016-2020 Axel Huebl
+/* Copyright 2016-2021 Axel Huebl
  *
  * This file is part of PIConGPU.
  *
@@ -29,49 +29,36 @@
 
 namespace picongpu
 {
-namespace traits
-{
-    /** Get the GetStringProperties "name" attribute of a Species' Flag
-     *
-     * Returns the "name" attribute of a species string attribute list as
-     * std::string and if not present, returns "none".
-     */
-    template<
-        typename T_Species,
-        typename T_Flag,
-        bool T_hasFlag = HasFlag<
-            typename T_Species::FrameType,
-            T_Flag
-        >::type::value
-    >
-    struct
-    GetSpeciesFlagName
+    namespace traits
     {
-        using SpeciesFlag = typename pmacc::traits::Resolve<
-            typename GetFlagType<
-                typename T_Species::FrameType,
-                T_Flag
-            >::type
-        >::type;
-
-        std::string operator()() const
+        /** Get the GetStringProperties "name" attribute of a Species' Flag
+         *
+         * Returns the "name" attribute of a species string attribute list as
+         * std::string and if not present, returns "none".
+         */
+        template<
+            typename T_Species,
+            typename T_Flag,
+            bool T_hasFlag = HasFlag<typename T_Species::FrameType, T_Flag>::type::value>
+        struct GetSpeciesFlagName
         {
-            GetStringProperties< SpeciesFlag > stringProps;
-            return stringProps["name"].value;
-        }
-    };
+            using SpeciesFlag = typename pmacc::traits::Resolve<
+                typename GetFlagType<typename T_Species::FrameType, T_Flag>::type>::type;
 
-    template<
-        typename T_Species,
-        typename T_Flag
-    >
-    struct
-    GetSpeciesFlagName<T_Species, T_Flag, false>
-    {
-        std::string operator()() const
+            std::string operator()() const
+            {
+                GetStringProperties<SpeciesFlag> stringProps;
+                return stringProps["name"].value;
+            }
+        };
+
+        template<typename T_Species, typename T_Flag>
+        struct GetSpeciesFlagName<T_Species, T_Flag, false>
         {
-            return "none";
-        }
-    };
-} // namespace traits
+            std::string operator()() const
+            {
+                return "none";
+            }
+        };
+    } // namespace traits
 } // namespace picongpu
diff --git a/include/picongpu/particles/traits/HasIonizersWithRNG.hpp b/include/picongpu/particles/traits/HasIonizersWithRNG.hpp
deleted file mode 100644
index c23fff39f5..0000000000
--- a/include/picongpu/particles/traits/HasIonizersWithRNG.hpp
+++ /dev/null
@@ -1,85 +0,0 @@
-/* Copyright 2017-2020 Axel Huebl, Marco Garten
- *
- * This file is part of PIConGPU.
- *
- * PIConGPU is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * PIConGPU is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with PIConGPU.
- * If not, see <http://www.gnu.org/licenses/>.
- */
-
-#pragma once
-
-#include "picongpu/simulation_defines.hpp"
-
-#include <pmacc/meta/conversion/OperateOnSeq.hpp>
-#include <pmacc/meta/conversion/MakeSeqFromNestedSeq.hpp>
-#include <pmacc/particles/traits/FilterByFlag.hpp>
-
-#include "picongpu/traits/UsesRNG.hpp"
-#include "picongpu/particles/traits/GetIonizerList.hpp"
-
-#include <boost/type_traits/integral_constant.hpp>
-#include <boost/mpl/contains.hpp>
-#include <boost/mpl/placeholders.hpp>
-
-
-namespace picongpu
-{
-namespace particles
-{
-namespace traits
-{
-    /** Check Ionizers for RNG Need
-     *
-     * Checks all species for ionizers and within those if a random number generator is needed.
-     * Returns a true-valued boost integral constant in ::type if a RNG is needed.
-     *
-     * @tparam T_VectorSpecies sequence of (ion) species to check ionizers for
-     */
-    template< typename T_VectorSpecies >
-    struct HasIonizersWithRNG
-    {
-        using VectorSpecies = T_VectorSpecies;
-
-        // make a list of all species that can be ionized
-        using VectorSpeciesWithIonizer = typename pmacc::particles::traits::FilterByFlag<
-            VectorSpecies,
-            ionizers<>
-        >::type;
-
-        // make a list of all ionizers that will be used by all species
-        using AllUsedIonizers = typename pmacc::MakeSeqFromNestedSeq<
-            typename pmacc::OperateOnSeq<
-                VectorSpeciesWithIonizer,
-                GetIonizerList< bmpl::_1 >
-            >::type
-        >::type;
-
-        /* make a list of `boost::true_type`s and `boost::false_type`s for species that use or do
-         * not use the RNG during ionization
-         */
-        using AllIonizersUsingRNG = typename pmacc::OperateOnSeq<
-            AllUsedIonizers,
-            picongpu::traits::UsesRNG< bmpl::_1 >
-        >::type;
-
-        // check if at least one RNG is needed
-        using type = typename boost::mpl::contains<
-            AllIonizersUsingRNG,
-            boost::true_type
-        >::type;
-    };
-
-} // namespace traits
-} // namespace particles
-} // namespace picongpu
diff --git a/include/picongpu/particles/traits/MacroWeighted.hpp b/include/picongpu/particles/traits/MacroWeighted.hpp
index efee5f473c..2fe8ba7d63 100644
--- a/include/picongpu/particles/traits/MacroWeighted.hpp
+++ b/include/picongpu/particles/traits/MacroWeighted.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2016-2020 Axel Huebl
+/* Copyright 2016-2021 Axel Huebl
  *
  * This file is part of PIConGPU.
  *
@@ -19,36 +19,36 @@
 
 #pragma once
 
-//include "simulation_defines.hpp"
+// include "simulation_defines.hpp"
 #include <pmacc/types.hpp>
 
 namespace picongpu
 {
-namespace traits
-{
-    /** Describe if a particle attribute describes the quantity of a macro
-     *  particle
-     *
-     * Depending on the implementation of an attribute, it might be sometimes
-     * useful to return a quantity regarding one of the underlying real
-     * particles (false: "this attribute is not weighted accordingly for the
-     * whole ensemble of particles in its macro particle) or just handle the
-     * whole macro particle at once
-     * (true: "this attribute is already weighted").
-     *
-     * This trait defines for each attribute if it needs to be scaled with the
-     * weighting. *How* the scaling with weighting is applied can be seen in
-     * \see WeightingPower
-     *   \see http://www.openPMD.org
-     *   \see http://dx.doi.org/10.5281/zenodo.33624
-     *   \see https://git.io/vwlWa
-     *
-     * \tparam T_Identifier any picongpu identifier
-     * \return \p bool ::get() as static public method
-     *
-     */
-    template<typename T_Identifier>
-    struct MacroWeighted;
+    namespace traits
+    {
+        /** Describe if a particle attribute describes the quantity of a macro
+         *  particle
+         *
+         * Depending on the implementation of an attribute, it might be sometimes
+         * useful to return a quantity regarding one of the underlying real
+         * particles (false: "this attribute is not weighted accordingly for the
+         * whole ensemble of particles in its macro particle") or just handle the
+         * whole macro particle at once
+         * (true: "this attribute is already weighted").
+         *
+         * This trait defines for each attribute if it needs to be scaled with the
+         * weighting. *How* the scaling with weighting is applied can be seen in
+         * \see WeightingPower
+         *   \see http://www.openPMD.org
+         *   \see http://dx.doi.org/10.5281/zenodo.33624
+         *   \see https://git.io/vwlWa
+         *
+         * \tparam T_Identifier any picongpu identifier
+         * \return \p bool ::get() as static public method
+         *
+         */
+        template<typename T_Identifier>
+        struct MacroWeighted;
 
-} // namespace traits
+    } // namespace traits
 } // namespace picongpu
diff --git a/include/picongpu/particles/traits/SpeciesEligibleForSolver.hpp b/include/picongpu/particles/traits/SpeciesEligibleForSolver.hpp
index 0f3603bd03..c27d52c2f7 100644
--- a/include/picongpu/particles/traits/SpeciesEligibleForSolver.hpp
+++ b/include/picongpu/particles/traits/SpeciesEligibleForSolver.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2017-2020 Axel Huebl
+/* Copyright 2017-2021 Axel Huebl
  *
  * This file is part of PIConGPU.
  *
@@ -24,27 +24,24 @@
 
 namespace picongpu
 {
-namespace particles
-{
-namespace traits
-{
-    /** Check if species fulfills requirements of a solver
-     *
-     * Defines a boost::mpl::bool_ true type is the particle species as all
-     * requirements fulfilled for a solver.
-     *
-     * @tparam T_Species Species to check
-     * @tparam T_Solver Solver with requirements
-     */
-    template<
-        typename T_Species,
-        typename T_Solver
-    >
-    struct SpeciesEligibleForSolver
+    namespace particles
     {
-        using type = boost::mpl::bool_< true >;
-    };
+        namespace traits
+        {
+            /** Check if species fulfills requirements of a solver
+             *
+             * Defines a boost::mpl::bool_ true type if the particle species has all
+             * requirements fulfilled for a solver.
+             *
+             * @tparam T_Species Species to check
+             * @tparam T_Solver Solver with requirements
+             */
+            template<typename T_Species, typename T_Solver>
+            struct SpeciesEligibleForSolver
+            {
+                using type = boost::mpl::bool_<true>;
+            };
 
-} // namespace traits
-} // namespace particles
+        } // namespace traits
+    } // namespace particles
 } // namespace picongpu
diff --git a/include/picongpu/particles/traits/WeightingPower.hpp b/include/picongpu/particles/traits/WeightingPower.hpp
index 507a68ac35..add45ad094 100644
--- a/include/picongpu/particles/traits/WeightingPower.hpp
+++ b/include/picongpu/particles/traits/WeightingPower.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2016-2020 Axel Huebl
+/* Copyright 2016-2021 Axel Huebl
  *
  * This file is part of PIConGPU.
  *
@@ -19,33 +19,33 @@
 
 #pragma once
 
-//include "simulation_defines.hpp"
+// include "simulation_defines.hpp"
 #include <pmacc/types.hpp>
 
 namespace picongpu
 {
-namespace traits
-{
-    /** Describe if a particle attribute describes the quantity of a macro
-     *  particle
-     *
-     * Depending on the implementation of an attribute, it might be sometimes
-     * useful to return a quantity regarding one of the underlying real
-     * particles (\see MacroWeighted).
-     *
-     * This trait defines how each attribute needs to be scaled with the
-     * weighting (linear, quadratic, ...) to convert between "real" and "macro"
-     * particle attributes.
-     *   \see http://www.openPMD.org
-     *   \see http://dx.doi.org/10.5281/zenodo.33624
-     *   \see https://git.io/vwlWa
-     *
-     * \tparam T_Identifier any picongpu identifier
-     * \return \p float_64 ::get() as static public method
-     *
-     */
-    template<typename T_Identifier>
-    struct WeightingPower;
+    namespace traits
+    {
+        /** Describe if a particle attribute describes the quantity of a macro
+         *  particle
+         *
+         * Depending on the implementation of an attribute, it might be sometimes
+         * useful to return a quantity regarding one of the underlying real
+         * particles (\see MacroWeighted).
+         *
+         * This trait defines how each attribute needs to be scaled with the
+         * weighting (linear, quadratic, ...) to convert between "real" and "macro"
+         * particle attributes.
+         *   \see http://www.openPMD.org
+         *   \see http://dx.doi.org/10.5281/zenodo.33624
+         *   \see https://git.io/vwlWa
+         *
+         * \tparam T_Identifier any picongpu identifier
+         * \return \p float_64 ::get() as static public method
+         *
+         */
+        template<typename T_Identifier>
+        struct WeightingPower;
 
-} // namespace traits
+    } // namespace traits
 } // namespace picongpu
diff --git a/include/picongpu/plugins/BinEnergyParticles.hpp b/include/picongpu/plugins/BinEnergyParticles.hpp
index 6c6073fd64..0da0bbf8a6 100644
--- a/include/picongpu/plugins/BinEnergyParticles.hpp
+++ b/include/picongpu/plugins/BinEnergyParticles.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Felix Schmitt, Heiko Burau,
+/* Copyright 2013-2021 Axel Huebl, Felix Schmitt, Heiko Burau,
  *                     Rene Widera, Richard Pausch
  *
  * This file is part of PIConGPU.
@@ -58,688 +58,489 @@
 
 namespace picongpu
 {
-using namespace pmacc;
+    using namespace pmacc;
 
-namespace po = boost::program_options;
+    namespace po = boost::program_options;
 
-/** calculate a energy histogram of a species
- *
- * if a particle filter is given, only the filtered particles are counted
- *
- * @tparam T_numWorkers number of workers
- */
-template< uint32_t T_numWorkers >
-struct KernelBinEnergyParticles
-{
-    /* sum up the energy of all particles
-     *
-     * the kinetic energy of all active particles will be calculated
+    /** calculate an energy histogram of a species
      *
-     * @tparam T_ParBox pmacc::ParticlesBox, particle box type
-     * @tparam T_BinBox pmacc::DataBox, box type for the histogram in global memory
-     * @tparam T_Mapping type of the mapper to map a cuda block to a supercell index
-     * @tparam T_Acc alpaka accelerator type
+     * if a particle filter is given, only the filtered particles are counted
      *
-     * @param acc alpaka accelerator
-     * @param pb box with access to the particles of the current used species
-     * @param gBins box with memory for resulting histogram
-     * @param numBins number of bins in the histogram (must be fit into the shared memory)
-     * @param minEnergy particle energy for the first bin
-     * @param maxEnergy particle energy for the last bin
-     * @param mapper functor to map a cuda block to a supercells index
+     * @tparam T_numWorkers number of workers
      */
-    template<
-        typename T_ParBox,
-        typename T_BinBox,
-        typename T_Mapping,
-        typename T_Filter,
-        typename T_Acc
-    >
-    DINLINE void operator()(
-        T_Acc const & acc,
-        T_ParBox pb,
-        T_BinBox gBins,
-        int const numBins,
-        float_X const minEnergy,
-        float_X const maxEnergy,
-        T_Mapping const mapper,
-        T_Filter filter
-    ) const
+    template<uint32_t T_numWorkers>
+    struct KernelBinEnergyParticles
     {
-        using namespace pmacc::mappings::threads;
-        using SuperCellSize = typename MappingDesc::SuperCellSize;
-        using FramePtr = typename T_ParBox::FramePtr;
-        constexpr uint32_t maxParticlesPerFrame = pmacc::math::CT::volume< SuperCellSize >::type::value;
-        constexpr uint32_t numWorkers = T_numWorkers;
-
-        PMACC_SMEM(
-            acc,
-            frame,
-            FramePtr
-        );
-
-        PMACC_SMEM(
-            acc,
-            particlesInSuperCell,
-            lcellId_t
-        );
-
-        /* shBins index can go from 0 to (numBins+2)-1
-         * 0 is for <minEnergy
-         * (numBins+2)-1 is for >maxEnergy
+        /* sum up the energy of all particles
+         *
+         * the kinetic energy of all active particles will be calculated
+         *
+         * @tparam T_ParBox pmacc::ParticlesBox, particle box type
+         * @tparam T_BinBox pmacc::DataBox, box type for the histogram in global memory
+         * @tparam T_Mapping type of the mapper to map a cupla block to a supercell index
+         * @tparam T_Acc alpaka accelerator type
+         *
+         * @param acc alpaka accelerator
+         * @param pb box with access to the particles of the current used species
+         * @param gBins box with memory for resulting histogram
+         * @param numBins number of bins in the histogram (must fit into the shared memory)
+         * @param minEnergy particle energy for the first bin
+         * @param maxEnergy particle energy for the last bin
+         * @param mapper functor to map a cupla block to a supercells index
          */
-        sharedMemExtern(shBin,float_X); /* size must be numBins+2 because we have <min and >max */
+        template<typename T_ParBox, typename T_BinBox, typename T_Mapping, typename T_Filter, typename T_Acc>
+        DINLINE void operator()(
+            T_Acc const& acc,
+            T_ParBox pb,
+            T_BinBox gBins,
+            int const numBins,
+            float_X const minEnergy,
+            float_X const maxEnergy,
+            T_Mapping const mapper,
+            T_Filter filter) const
+        {
+            using namespace pmacc::mappings::threads;
+            using SuperCellSize = typename MappingDesc::SuperCellSize;
+            using FramePtr = typename T_ParBox::FramePtr;
+            constexpr uint32_t maxParticlesPerFrame = pmacc::math::CT::volume<SuperCellSize>::type::value;
+            constexpr uint32_t numWorkers = T_numWorkers;
 
+            PMACC_SMEM(acc, frame, FramePtr);
 
-        int const realNumBins = numBins + 2;
+            PMACC_SMEM(acc, particlesInSuperCell, lcellId_t);
 
-        uint32_t const workerIdx = threadIdx.x;
+            /* shBin index can go from 0 to (numBins+2)-1
+             * 0 is for <minEnergy
+             * (numBins+2)-1 is for >maxEnergy
+             */
+            sharedMemExtern(shBin, float_X); /* size must be numBins+2 because we have <min and >max */
 
-        using MasterOnly = IdxConfig<
-            1,
-            numWorkers
-        >;
 
-        DataSpace< simDim > const superCellIdx(
-            mapper.getSuperCellIndex( DataSpace< simDim >( blockIdx ) )
-        );
+            int const realNumBins = numBins + 2;
 
-        ForEachIdx< MasterOnly >{ workerIdx }(
-            [&](
-                uint32_t const,
-                uint32_t const
-            )
-            {
-                frame = pb.getLastFrame( superCellIdx );
-                particlesInSuperCell = pb.getSuperCell( superCellIdx ).getSizeLastFrame( );
-            }
-        );
+            uint32_t const workerIdx = cupla::threadIdx(acc).x;
 
-        ForEachIdx<
-            IdxConfig<
-                numWorkers,
-                numWorkers
-            >
-        >{ workerIdx }(
-            [&](
-                uint32_t const linearIdx,
-                uint32_t const
-            )
-            {
+            using MasterOnly = IdxConfig<1, numWorkers>;
+
+            DataSpace<simDim> const superCellIdx(mapper.getSuperCellIndex(DataSpace<simDim>(cupla::blockIdx(acc))));
+
+            ForEachIdx<MasterOnly>{workerIdx}([&](uint32_t const, uint32_t const) {
+                frame = pb.getLastFrame(superCellIdx);
+                particlesInSuperCell = pb.getSuperCell(superCellIdx).getSizeLastFrame();
+            });
+
+            ForEachIdx<IdxConfig<numWorkers, numWorkers>>{workerIdx}([&](uint32_t const linearIdx, uint32_t const) {
                 /* set all bins to 0 */
-                for( int i = linearIdx; i < realNumBins; i += numWorkers )
-                    shBin[ i ] = float_X( 0. );
-            }
-        );
+                for(int i = linearIdx; i < realNumBins; i += numWorkers)
+                    shBin[i] = float_X(0.);
+            });
 
-        __syncthreads();
+            cupla::__syncthreads(acc);
 
-        if( !frame.isValid( ) )
-          return; /* end kernel if we have no frames */
+            if(!frame.isValid())
+                return; /* end kernel if we have no frames */
 
-        auto accFilter = filter(
-            acc,
-            superCellIdx - mapper.getGuardingSuperCells( ),
-            WorkerCfg< numWorkers >{ workerIdx }
-        );
+            auto accFilter
+                = filter(acc, superCellIdx - mapper.getGuardingSuperCells(), WorkerCfg<numWorkers>{workerIdx});
 
-        while( frame.isValid() )
-        {
-            // move over all particles in a frame
-            ForEachIdx<
-                IdxConfig<
-                    maxParticlesPerFrame,
-                    numWorkers
-                >
-            >{ workerIdx }(
-                [&](
-                    uint32_t const linearIdx,
-                    uint32_t const
-                )
-                {
-                    if( linearIdx < particlesInSuperCell )
-                    {
-                        auto const particle = frame[ linearIdx ];
-                        if(
-                            accFilter(
-                                acc,
-                                particle
-                            )
-                        )
+            while(frame.isValid())
+            {
+                // move over all particles in a frame
+                ForEachIdx<IdxConfig<maxParticlesPerFrame, numWorkers>>{workerIdx}(
+                    [&](uint32_t const linearIdx, uint32_t const) {
+                        if(linearIdx < particlesInSuperCell)
                         {
-                            /* kinetic Energy for Particles: E^2 = p^2*c^2 + m^2*c^4
-                             *                                   = c^2 * [p^2 + m^2*c^2]
-                             */
-                            float3_X const mom = particle[ momentum_ ];
-                            float_X const weighting = particle[ weighting_ ];
-                            float_X const mass = attribute::getMass(
-                                weighting,
-                                particle
-                            );
-
-                            // calculate kinetic energy of the macro particle
-                            float_X localEnergy = KinEnergy< >( )(
-                                mom,
-                                mass
-                            );
-
-                            localEnergy /= weighting;
-
-                            /* +1 move value from 1 to numBins+1 */
-                            int binNumber = math::floor(
-                                ( localEnergy - minEnergy ) /
-                                ( maxEnergy - minEnergy ) * static_cast< float_X >( numBins )
-                            )  + 1;
-
-                            int const maxBin = numBins + 1;
-
-                            /* all entries larger than maxEnergy go into bin maxBin */
-                            binNumber = binNumber < maxBin ? binNumber : maxBin;
-
-                            /* all entries smaller than minEnergy go into bin zero */
-                            binNumber = binNumber > 0 ? binNumber : 0;
-
-                            /*!\todo: we can't use 64bit type on this place (NVIDIA BUG?)
-                             * COMPILER ERROR: ptxas /tmp/tmpxft_00005da6_00000000-2_main.ptx, line 4246; error   : Global state space expected for instruction 'atom'
-                             * I think this is a problem with extern shared mem and atmic (only on TESLA)
-                             * NEXT BUG: don't do uint32_t w=__float2uint_rn(weighting); and use w for atomic, this create wrong results
-                             *
-                             * uses a normed float weighting to avoid an overflow of the floating point result
-                             * for the reduced weighting if the particle weighting is very large
-                             */
-                            float_X const normedWeighting = weighting /
-                                float_X( particles::TYPICAL_NUM_PARTICLES_PER_MACROPARTICLE );
-                            atomicAdd(
-                                &( shBin[ binNumber ] ),
-                                normedWeighting,
-                                ::alpaka::hierarchy::Threads{}
-                            );
+                            auto const particle = frame[linearIdx];
+                            if(accFilter(acc, particle))
+                            {
+                                /* kinetic Energy for Particles: E^2 = p^2*c^2 + m^2*c^4
+                                 *                                   = c^2 * [p^2 + m^2*c^2]
+                                 */
+                                float3_X const mom = particle[momentum_];
+                                float_X const weighting = particle[weighting_];
+                                float_X const mass = attribute::getMass(weighting, particle);
+
+                                // calculate kinetic energy of the macro particle
+                                float_X localEnergy = KinEnergy<>()(mom, mass);
+
+                                localEnergy /= weighting;
+
+                                /* +1 move value from 1 to numBins+1 */
+                                int binNumber = math::floor(
+                                                    (localEnergy - minEnergy) / (maxEnergy - minEnergy)
+                                                    * static_cast<float_X>(numBins))
+                                    + 1;
+
+                                int const maxBin = numBins + 1;
+
+                                /* all entries larger than maxEnergy go into bin maxBin */
+                                binNumber = binNumber < maxBin ? binNumber : maxBin;
+
+                                /* all entries smaller than minEnergy go into bin zero */
+                                binNumber = binNumber > 0 ? binNumber : 0;
+
+                                /*!\todo: we can't use 64bit type on this place (NVIDIA BUG?)
+                                 * COMPILER ERROR: ptxas /tmp/tmpxft_00005da6_00000000-2_main.ptx, line 4246; error   :
+                                 * Global state space expected for instruction 'atom'. I think this is a problem with
+                                 * extern shared mem and atomic (only on TESLA). NEXT BUG: don't do uint32_t
+                                 * w=__float2uint_rn(weighting); and use w for atomic, this creates wrong results
+                                 *
+                                 * uses a normed float weighting to avoid an overflow of the floating point result
+                                 * for the reduced weighting if the particle weighting is very large
+                                 */
+                                float_X const normedWeighting
+                                    = weighting / float_X(particles::TYPICAL_NUM_PARTICLES_PER_MACROPARTICLE);
+                                cupla::atomicAdd(
+                                    acc,
+                                    &(shBin[binNumber]),
+                                    normedWeighting,
+                                    ::alpaka::hierarchy::Threads{});
+                            }
                         }
-                    }
-                }
-            );
+                    });
 
-            __syncthreads();
+                cupla::__syncthreads(acc);
 
-            ForEachIdx< MasterOnly >{ workerIdx }(
-                [&](
-                    uint32_t const,
-                    uint32_t const
-                )
-                {
-                    frame = pb.getPreviousFrame( frame );
+                ForEachIdx<MasterOnly>{workerIdx}([&](uint32_t const, uint32_t const) {
+                    frame = pb.getPreviousFrame(frame);
                     particlesInSuperCell = maxParticlesPerFrame;
-                }
-            );
-            __syncthreads();
-        }
-
-        ForEachIdx<
-            IdxConfig<
-                numWorkers,
-                numWorkers
-            >
-        >{ workerIdx }(
-            [&](
-                uint32_t const linearIdx,
-                uint32_t const
-            )
-            {
-                for( int i = linearIdx; i < realNumBins; i += numWorkers )
-                    atomicAdd(
-                        &( gBins[ i ] ),
-                        float_64( shBin[ i ] ),
-                        ::alpaka::hierarchy::Blocks{}
-                    );
+                });
+                cupla::__syncthreads(acc);
             }
-        );
-    }
-};
 
-template<class ParticlesType>
-class BinEnergyParticles : public plugins::multi::ISlave
-{
-private:
+            ForEachIdx<IdxConfig<numWorkers, numWorkers>>{workerIdx}([&](uint32_t const linearIdx, uint32_t const) {
+                for(int i = linearIdx; i < realNumBins; i += numWorkers)
+                    cupla::atomicAdd(acc, &(gBins[i]), float_64(shBin[i]), ::alpaka::hierarchy::Blocks{});
+            });
+        }
+    };
 
-    struct Help : public plugins::multi::IHelp
+    template<class ParticlesType>
+    class BinEnergyParticles : public plugins::multi::ISlave
     {
-
-        /** creates a instance of ISlave
-         *
-         * @tparam T_Slave type of the interface implementation (must inherit from ISlave)
-         * @param help plugin defined help
-         * @param id index of the plugin, range: [0;help->getNumPlugins())
-         */
-        std::shared_ptr< ISlave > create(
-            std::shared_ptr< IHelp > & help,
-            size_t const id,
-            MappingDesc* cellDescription
-        )
+    private:
+        struct Help : public plugins::multi::IHelp
         {
-            return std::shared_ptr< ISlave >(
-                new BinEnergyParticles< ParticlesType >(
-                    help,
-                    id,
-                    cellDescription
-                )
-            );
-        }
+            /** creates an instance of ISlave
+             *
+             * @tparam T_Slave type of the interface implementation (must inherit from ISlave)
+             * @param help plugin defined help
+             * @param id index of the plugin, range: [0;help->getNumPlugins())
+             */
+            std::shared_ptr<ISlave> create(std::shared_ptr<IHelp>& help, size_t const id, MappingDesc* cellDescription)
+            {
+                return std::shared_ptr<ISlave>(new BinEnergyParticles<ParticlesType>(help, id, cellDescription));
+            }
 
-        // find all valid filter for the current used species
-        using EligibleFilters = typename MakeSeqFromNestedSeq<
-            typename bmpl::transform<
+            // find all valid filters for the currently used species
+            using EligibleFilters = typename MakeSeqFromNestedSeq<typename bmpl::transform<
                 particles::filter::AllParticleFilters,
-                particles::traits::GenerateSolversIfSpeciesEligible<
-                    bmpl::_1,
-                    ParticlesType
-                >
-            >::type
-        >::type;
-
-        //! periodicity of computing the particle energy
-        plugins::multi::Option< std::string > notifyPeriod = {
-            "period",
-            "enable plugin [for each n-th step]"
-        };
-        plugins::multi::Option< std::string > filter = {
-            "filter",
-            "particle filter: "
-        };
-        plugins::multi::Option< int > numBins = {
-            "binCount",
-            "number of bins for the energy range",
-            1024
-        };
-        plugins::multi::Option< float_X > minEnergy_keV = {
-            "minEnergy",
-            "minEnergy[in keV]",
-            0.0
-        };
-        plugins::multi::Option< float_X > maxEnergy_keV = {
-            "maxEnergy",
-            "maxEnergy[in keV]"
-        };
-
-        //! string list with all possible particle filters
-        std::string concatenatedFilterNames;
-        std::vector< std::string > allowedFilters;
+                particles::traits::GenerateSolversIfSpeciesEligible<bmpl::_1, ParticlesType>>::type>::type;
+
+            //! periodicity of computing the particle energy
+            plugins::multi::Option<std::string> notifyPeriod = {"period", "enable plugin [for each n-th step]"};
+            plugins::multi::Option<std::string> filter = {"filter", "particle filter: "};
+            plugins::multi::Option<int> numBins = {"binCount", "number of bins for the energy range", 1024};
+            plugins::multi::Option<float_X> minEnergy_keV = {"minEnergy", "minEnergy[in keV]", 0.0};
+            plugins::multi::Option<float_X> maxEnergy_keV = {"maxEnergy", "maxEnergy[in keV]"};
+
+            //! string list with all possible particle filters
+            std::string concatenatedFilterNames;
+            std::vector<std::string> allowedFilters;
+
+            //! method used by plugin controller to get --help description
+            void registerHelp(
+                boost::program_options::options_description& desc,
+                std::string const& masterPrefix = std::string{})
+            {
+                meta::ForEach<EligibleFilters, plugins::misc::AppendName<bmpl::_1>> getEligibleFilterNames;
+                getEligibleFilterNames(allowedFilters);
 
-        ///! method used by plugin controller to get --help description
-        void registerHelp(
-            boost::program_options::options_description & desc,
-            std::string const & masterPrefix = std::string{ }
-        )
-        {
-            meta::ForEach<
-                EligibleFilters,
-                plugins::misc::AppendName< bmpl::_1 >
-            > getEligibleFilterNames;
-            getEligibleFilterNames( allowedFilters );
-
-            concatenatedFilterNames = plugins::misc::concatenateToString(
-                allowedFilters,
-                ", "
-            );
-
-            notifyPeriod.registerHelp(
-                desc,
-                masterPrefix + prefix
-            );
-            filter.registerHelp(
-                desc,
-                masterPrefix + prefix,
-                std::string( "[" ) + concatenatedFilterNames + "]"
-            );
-            numBins.registerHelp(
-                desc,
-                masterPrefix + prefix
-            );
-            minEnergy_keV.registerHelp(
-                desc,
-                masterPrefix + prefix
-            );
-            maxEnergy_keV.registerHelp(
-                desc,
-                masterPrefix + prefix
-            );
-        }
+                concatenatedFilterNames = plugins::misc::concatenateToString(allowedFilters, ", ");
 
-        void expandHelp(
-            boost::program_options::options_description & desc,
-            std::string const & masterPrefix = std::string{ }
-        )
-        {
-        }
+                notifyPeriod.registerHelp(desc, masterPrefix + prefix);
+                filter.registerHelp(desc, masterPrefix + prefix, std::string("[") + concatenatedFilterNames + "]");
+                numBins.registerHelp(desc, masterPrefix + prefix);
+                minEnergy_keV.registerHelp(desc, masterPrefix + prefix);
+                maxEnergy_keV.registerHelp(desc, masterPrefix + prefix);
+            }
 
+            void expandHelp(
+                boost::program_options::options_description& desc,
+                std::string const& masterPrefix = std::string{})
+            {
+            }
 
-        void validateOptions()
-        {
-            if( notifyPeriod.size() != filter.size() )
-                throw std::runtime_error( name + ": parameter filter and period are not used the same number of times" );
-            if( notifyPeriod.size() != maxEnergy_keV.size() )
-                throw std::runtime_error( name + ": parameter maxEnergy and period are not used the same number of times" );
 
-            // check if user passed filter name are valid
-            for( auto const & filterName : filter)
+            void validateOptions()
             {
-                if(
-                    std::find(
-                        allowedFilters.begin(),
-                        allowedFilters.end(),
-                        filterName
-                    ) == allowedFilters.end()
-                )
+                if(notifyPeriod.size() != filter.size())
+                    throw std::runtime_error(
+                        name + ": parameter filter and period are not used the same number of times");
+                if(notifyPeriod.size() != maxEnergy_keV.size())
+                    throw std::runtime_error(
+                        name + ": parameter maxEnergy and period are not used the same number of times");
+
+                // check that the filter names passed by the user are valid
+                for(auto const& filterName : filter)
                 {
-                    throw std::runtime_error( name + ": unknown filter '" + filterName + "'" );
+                    if(std::find(allowedFilters.begin(), allowedFilters.end(), filterName) == allowedFilters.end())
+                    {
+                        throw std::runtime_error(name + ": unknown filter '" + filterName + "'");
+                    }
                 }
             }
-        }
-
-        size_t getNumPlugins() const
-        {
-            return notifyPeriod.size();
-        }
-
-        std::string getDescription() const
-        {
-            return description;
-        }
-
-        std::string getOptionPrefix() const
-        {
-            return prefix;
-        }
 
-        std::string getName() const
-        {
-            return name;
-        }
+            size_t getNumPlugins() const
+            {
+                return notifyPeriod.size();
+            }
 
-        std::string const name = "BinEnergyParticles";
-        //! short description of the plugin
-        std::string const description = "calculate a energy histogram of a species";
-        //! prefix used for command line arguments
-        std::string const prefix = ParticlesType::FrameType::getName( ) + std::string( "_energyHistogram" );
-    };
+            std::string getDescription() const
+            {
+                return description;
+            }
 
-    GridBuffer<float_64, DIM1> *gBins = nullptr;
-    MappingDesc *m_cellDescription = nullptr;
+            std::string getOptionPrefix() const
+            {
+                return prefix;
+            }
 
-    std::string filename;
+            std::string getName() const
+            {
+                return name;
+            }
 
-    float_64 * binReduced = nullptr;
+            std::string const name = "BinEnergyParticles";
+            //! short description of the plugin
+            std::string const description = "calculate a energy histogram of a species";
+            //! prefix used for command line arguments
+            std::string const prefix = ParticlesType::FrameType::getName() + std::string("_energyHistogram");
+        };
 
-    int numBins;
-    int realNumBins;
-    /* variables for energy limits of the histogram in keV */
-    float_X minEnergy_keV;
-    float_X maxEnergy_keV;
+        GridBuffer<float_64, DIM1>* gBins = nullptr;
+        MappingDesc* m_cellDescription = nullptr;
 
-    std::ofstream outFile;
+        std::string filename;
 
-    /* only rank 0 create a file */
-    bool writeToFile = false;
+        float_64* binReduced = nullptr;
 
-    mpi::MPIReduce reduce;
+        int numBins;
+        int realNumBins;
+        /* variables for energy limits of the histogram in keV */
+        float_X minEnergy_keV;
+        float_X maxEnergy_keV;
 
-    std::shared_ptr< Help > m_help;
-    size_t m_id;
+        std::ofstream outFile;
 
-public:
+        /* only rank 0 creates a file */
+        bool writeToFile = false;
 
-    //! must be implemented by the user
-    static std::shared_ptr< plugins::multi::IHelp > getHelp()
-    {
-        return std::shared_ptr< plugins::multi::IHelp >( new Help{ } );
-    }
-
-    BinEnergyParticles(
-        std::shared_ptr< plugins::multi::IHelp > & help,
-        size_t const id,
-        MappingDesc* cellDescription
-    ) :
-        m_help( std::static_pointer_cast< Help >(help) ),
-        m_id( id ),
-        m_cellDescription( cellDescription )
-    {
-        filename = m_help->getOptionPrefix() + "_" + m_help->filter.get( m_id ) + ".dat";
+        mpi::MPIReduce reduce;
 
-        numBins = m_help->numBins.get( m_id );
+        std::shared_ptr<Help> m_help;
+        size_t m_id;
 
-        if( numBins <= 0 )
+    public:
+        //! must be implemented by the user
+        static std::shared_ptr<plugins::multi::IHelp> getHelp()
         {
-            throw std::runtime_error(
-                std::string("[Plugin] [") + m_help->getOptionPrefix( ) +
-                "] error since " + m_help->getOptionPrefix( ) +
-                ".binCount) must be > 0 (input " +
-                std::to_string( numBins ) + " bins)"
-            );
+            return std::shared_ptr<plugins::multi::IHelp>(new Help{});
         }
 
-        minEnergy_keV = m_help->minEnergy_keV.get( m_id );
-        maxEnergy_keV = m_help->maxEnergy_keV.get( m_id );
+        BinEnergyParticles(std::shared_ptr<plugins::multi::IHelp>& help, size_t const id, MappingDesc* cellDescription)
+            : m_help(std::static_pointer_cast<Help>(help))
+            , m_id(id)
+            , m_cellDescription(cellDescription)
+        {
+            filename = m_help->getOptionPrefix() + "_" + m_help->filter.get(m_id) + ".dat";
 
-        realNumBins = numBins + 2;
+            numBins = m_help->numBins.get(m_id);
 
-        /* create an array of float_64 on gpu und host */
-        gBins = new GridBuffer<float_64, DIM1 > (DataSpace<DIM1 > (realNumBins));
-        binReduced = new float_64[realNumBins];
-        for (int i = 0; i < realNumBins; ++i)
-        {
-            binReduced[i] = 0.0;
-        }
+            if(numBins <= 0)
+            {
+                throw std::runtime_error(
+                    std::string("[Plugin] [") + m_help->getOptionPrefix() + "] error since "
+                    + m_help->getOptionPrefix() + ".binCount) must be > 0 (input " + std::to_string(numBins)
+                    + " bins)");
+            }
 
-        writeToFile = reduce.hasResult(mpi::reduceMethods::Reduce());
-        if( writeToFile )
-            openNewFile();
+            minEnergy_keV = m_help->minEnergy_keV.get(m_id);
+            maxEnergy_keV = m_help->maxEnergy_keV.get(m_id);
 
-        // set how often the plugin should be executed while PIConGPU is running
-        Environment<>::get( ).PluginConnector( ).setNotificationPeriod(
-            this,
-            m_help->notifyPeriod.get( id )
-        );
+            realNumBins = numBins + 2;
 
-    }
+            /* create an array of float_64 on GPU and host */
+            gBins = new GridBuffer<float_64, DIM1>(DataSpace<DIM1>(realNumBins));
+            binReduced = new float_64[realNumBins];
+            for(int i = 0; i < realNumBins; ++i)
+            {
+                binReduced[i] = 0.0;
+            }
 
-    virtual ~BinEnergyParticles()
-    {
-        if (writeToFile)
-        {
-            outFile.flush();
-            outFile << std::endl; /* now all data are written to file */
-            if (outFile.fail())
-                std::cerr << "Error on flushing file [" << filename << "]. " << std::endl;
-            outFile.close();
+            writeToFile = reduce.hasResult(mpi::reduceMethods::Reduce());
+            if(writeToFile)
+                openNewFile();
+
+            // set how often the plugin should be executed while PIConGPU is running
+            Environment<>::get().PluginConnector().setNotificationPeriod(this, m_help->notifyPeriod.get(id));
         }
 
-        __delete(gBins);
-        __deleteArray(binReduced);
-    }
+        virtual ~BinEnergyParticles()
+        {
+            if(writeToFile)
+            {
+                outFile.flush();
+                outFile << std::endl; /* now all data are written to file */
+                if(outFile.fail())
+                    std::cerr << "Error on flushing file [" << filename << "]. " << std::endl;
+                outFile.close();
+            }
 
-    void notify(uint32_t currentStep)
-    {
-        calBinEnergyParticles < CORE + BORDER > (currentStep);
-    }
+            __delete(gBins);
+            __deleteArray(binReduced);
+        }
 
-    void restart(
-        uint32_t restartStep,
-        std::string const & restartDirectory
-    )
-    {
-        if( !writeToFile )
-            return;
-
-        writeToFile = restoreTxtFile(
-            outFile,
-            filename,
-            restartStep,
-            restartDirectory
-        );
-    }
-
-    void checkpoint(
-        uint32_t currentStep,
-        std::string const & checkpointDirectory
-    )
-    {
-        if( !writeToFile )
-            return;
+        void notify(uint32_t currentStep)
+        {
+            calBinEnergyParticles<CORE + BORDER>(currentStep);
+        }
 
-        checkpointTxtFile(
-            outFile,
-            filename,
-            currentStep,
-            checkpointDirectory
-        );
-    }
+        void restart(uint32_t restartStep, std::string const& restartDirectory)
+        {
+            if(!writeToFile)
+                return;
 
-private:
+            writeToFile = restoreTxtFile(outFile, filename, restartStep, restartDirectory);
+        }
 
-    /* Open a New Output File
-     *
-     * Must only be called by the rank with writeToFile == true
-     */
-    void openNewFile()
-    {
-        outFile.open(filename.c_str(), std::ofstream::out | std::ostream::trunc);
-        if (!outFile)
+        void checkpoint(uint32_t currentStep, std::string const& checkpointDirectory)
         {
-            std::cerr << "[Plugin] [" << m_help->getOptionPrefix( )
-                      << "] Can't open file '" << filename
-                      << "', output disabled" << std::endl;
-            writeToFile = false;
+            if(!writeToFile)
+                return;
+
+            checkpointTxtFile(outFile, filename, currentStep, checkpointDirectory);
         }
-        else
+
+    private:
+        /* Open a New Output File
+         *
+         * Must only be called by the rank with writeToFile == true
+         */
+        void openNewFile()
         {
-            /* create header of the file */
-            outFile << "#step <" << minEnergy_keV << " ";
-            float_X binEnergy = (maxEnergy_keV - minEnergy_keV) / (float_32) numBins;
-            for (int i = 1; i < realNumBins - 1; ++i)
-                outFile << minEnergy_keV + ((float_32) i * binEnergy) << " ";
+            outFile.open(filename.c_str(), std::ofstream::out | std::ostream::trunc);
+            if(!outFile)
+            {
+                std::cerr << "[Plugin] [" << m_help->getOptionPrefix() << "] Can't open file '" << filename
+                          << "', output disabled" << std::endl;
+                writeToFile = false;
+            }
+            else
+            {
+                /* create header of the file */
+                outFile << "#step <" << minEnergy_keV << " ";
+                float_X binEnergy = (maxEnergy_keV - minEnergy_keV) / (float_32) numBins;
+                for(int i = 1; i < realNumBins - 1; ++i)
+                    outFile << minEnergy_keV + ((float_32) i * binEnergy) << " ";
 
-            outFile << ">" << maxEnergy_keV << " count" << std::endl;
+                outFile << ">" << maxEnergy_keV << " count" << std::endl;
+            }
         }
-    }
 
-    template< uint32_t AREA >
-    void calBinEnergyParticles(uint32_t currentStep)
-    {
-        gBins->getDeviceBuffer().setValue(0);
-
-        DataConnector &dc = Environment<>::get().DataConnector();
-        auto particles = dc.get< ParticlesType >( ParticlesType::FrameType::getName(), true );
-
-        /* convert energy values from keV to PIConGPU units */
-        float_X const minEnergy = minEnergy_keV * UNITCONV_keV_to_Joule / UNIT_ENERGY;
-        float_X const maxEnergy = maxEnergy_keV * UNITCONV_keV_to_Joule / UNIT_ENERGY;
-
-        constexpr uint32_t numWorkers = pmacc::traits::GetNumWorkers<
-             pmacc::math::CT::volume< SuperCellSize >::type::value
-         >::value;
-
-        AreaMapping<
-            AREA,
-            MappingDesc
-        > mapper( *m_cellDescription );
-
-        auto kernel = PMACC_KERNEL( KernelBinEnergyParticles< numWorkers >{ } )(
-            mapper.getGridDim( ),
-            numWorkers,
-            realNumBins * sizeof( float_X )
-        );
-
-        auto bindKernel = std::bind(
-            kernel,
-            particles->getDeviceParticlesBox( ),
-            gBins->getDeviceBuffer( ).getDataBox( ),
-            numBins,
-            minEnergy,
-            maxEnergy,
-            mapper,
-            std::placeholders::_1
-        );
-
-        meta::ForEach<
-            typename Help::EligibleFilters,
-            plugins::misc::ExecuteIfNameIsEqual< bmpl::_1 >
-        >{ }(
-            m_help->filter.get( m_id ),
-            currentStep,
-            bindKernel
-        );
-
-        dc.releaseData( ParticlesType::FrameType::getName() );
-        gBins->deviceToHost();
-
-        reduce(nvidia::functors::Add(),
-               binReduced,
-               gBins->getHostBuffer().getBasePointer(),
-               realNumBins, mpi::reduceMethods::Reduce());
-
-
-        if (writeToFile)
+        template<uint32_t AREA>
+        void calBinEnergyParticles(uint32_t currentStep)
         {
-            using dbl = std::numeric_limits<float_64>;
+            gBins->getDeviceBuffer().setValue(0);
 
-            outFile.precision(dbl::digits10);
+            DataConnector& dc = Environment<>::get().DataConnector();
+            auto particles = dc.get<ParticlesType>(ParticlesType::FrameType::getName(), true);
 
-            /* write data to file */
-            float_64 count_particles = 0.0;
-            outFile << currentStep << " "
-                    << std::scientific; /*  for floating points, ignored for ints */
+            /* convert energy values from keV to PIConGPU units */
+            float_X const minEnergy = minEnergy_keV * UNITCONV_keV_to_Joule / UNIT_ENERGY;
+            float_X const maxEnergy = maxEnergy_keV * UNITCONV_keV_to_Joule / UNIT_ENERGY;
 
-            for (int i = 0; i < realNumBins; ++i)
+            constexpr uint32_t numWorkers
+                = pmacc::traits::GetNumWorkers<pmacc::math::CT::volume<SuperCellSize>::type::value>::value;
+
+            AreaMapping<AREA, MappingDesc> mapper(*m_cellDescription);
+
+            auto kernel = PMACC_KERNEL(KernelBinEnergyParticles<numWorkers>{})(
+                mapper.getGridDim(),
+                numWorkers,
+                realNumBins * sizeof(float_X));
+
+            auto bindKernel = std::bind(
+                kernel,
+                particles->getDeviceParticlesBox(),
+                gBins->getDeviceBuffer().getDataBox(),
+                numBins,
+                minEnergy,
+                maxEnergy,
+                mapper,
+                std::placeholders::_1);
+
+            meta::ForEach<typename Help::EligibleFilters, plugins::misc::ExecuteIfNameIsEqual<bmpl::_1>>{}(
+                m_help->filter.get(m_id),
+                currentStep,
+                bindKernel);
+
+            dc.releaseData(ParticlesType::FrameType::getName());
+            gBins->deviceToHost();
+
+            reduce(
+                nvidia::functors::Add(),
+                binReduced,
+                gBins->getHostBuffer().getBasePointer(),
+                realNumBins,
+                mpi::reduceMethods::Reduce());
+
+
+            if(writeToFile)
             {
-                count_particles += float_64( binReduced[i]);
-                outFile << std::scientific << (binReduced[i]) * float_64(particles::TYPICAL_NUM_PARTICLES_PER_MACROPARTICLE) << " ";
+                using dbl = std::numeric_limits<float_64>;
+
+                outFile.precision(dbl::digits10);
+
+                /* write data to file */
+                float_64 count_particles = 0.0;
+                outFile << currentStep << " " << std::scientific; /*  for floating points, ignored for ints */
+
+                for(int i = 0; i < realNumBins; ++i)
+                {
+                    count_particles += float_64(binReduced[i]);
+                    outFile << std::scientific
+                            << (binReduced[i]) * float_64(particles::TYPICAL_NUM_PARTICLES_PER_MACROPARTICLE) << " ";
+                }
+                /* endl: Flush every step to the file.
+                 * Thus, we will still have data if the program crashes.
+                 */
+                outFile << std::scientific
+                        << count_particles * float_64(particles::TYPICAL_NUM_PARTICLES_PER_MACROPARTICLE) << std::endl;
             }
-            /* endl: Flush any step to the file.
-             * Thus, we will have data if the program should crash.
-             */
-            outFile << std::scientific << count_particles * float_64(particles::TYPICAL_NUM_PARTICLES_PER_MACROPARTICLE)
-                << std::endl;
         }
-    }
-
-};
+    };
 
-namespace particles
-{
-namespace traits
-{
-    template<
-        typename T_Species,
-        typename T_UnspecifiedSpecies
-    >
-    struct SpeciesEligibleForSolver<
-        T_Species,
-        BinEnergyParticles< T_UnspecifiedSpecies >
-    >
+    namespace particles
     {
-        using FrameType = typename T_Species::FrameType;
-
-        // this plugin needs at least the weighting and momentum attributes
-        using RequiredIdentifiers = MakeSeq_t<
-            weighting,
-            momentum
-        >;
-
-        using SpeciesHasIdentifiers = typename pmacc::traits::HasIdentifiers<
-            FrameType,
-            RequiredIdentifiers
-        >::type;
-
-        // and also a mass ratio for energy calculation from momentum
-        using SpeciesHasFlags = typename pmacc::traits::HasFlag<
-            FrameType,
-            massRatio<>
-        >::type;
-
-        using type = typename bmpl::and_<
-            SpeciesHasIdentifiers,
-            SpeciesHasFlags
-        >;
-    };
-} // namespace traits
-} // namespace particles
+        namespace traits
+        {
+            template<typename T_Species, typename T_UnspecifiedSpecies>
+            struct SpeciesEligibleForSolver<T_Species, BinEnergyParticles<T_UnspecifiedSpecies>>
+            {
+                using FrameType = typename T_Species::FrameType;
+
+                // this plugin needs at least the weighting and momentum attributes
+                using RequiredIdentifiers = MakeSeq_t<weighting, momentum>;
+
+                using SpeciesHasIdentifiers =
+                    typename pmacc::traits::HasIdentifiers<FrameType, RequiredIdentifiers>::type;
+
+                // and also a mass ratio for energy calculation from momentum
+                using SpeciesHasFlags = typename pmacc::traits::HasFlag<FrameType, massRatio<>>::type;
+
+                using type = typename bmpl::and_<SpeciesHasIdentifiers, SpeciesHasFlags>;
+            };
+        } // namespace traits
+    } // namespace particles
 } // namespace picongpu
diff --git a/include/picongpu/plugins/ChargeConservation.hpp b/include/picongpu/plugins/ChargeConservation.hpp
index 888fcb0f0a..d4cfafc334 100644
--- a/include/picongpu/plugins/ChargeConservation.hpp
+++ b/include/picongpu/plugins/ChargeConservation.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera, Axel Huebl
+/* Copyright 2013-2021 Heiko Burau, Rene Widera, Axel Huebl
  *
  * This file is part of PIConGPU.
  *
@@ -33,82 +33,70 @@
 
 namespace picongpu
 {
-using namespace pmacc;
-
-namespace po = boost::program_options;
-
-/**
- * @class ChargeConservation
- * @brief maximum difference between electron charge density and div E
- *
- * WARNING: This plugin assumes a Yee-cell!
- * Do not use it together with other field solvers like `directional splitting` or `Lehe`
- */
-class ChargeConservation : public ISimulationPlugin
-{
-private:
-    std::string name;
-    std::string prefix;
-    std::string notifyPeriod;
-    const std::string filename;
-    MappingDesc* cellDescription;
-    std::ofstream output_file;
-
-    using AllGPU_reduce = boost::shared_ptr<pmacc::algorithm::mpi::Reduce<simDim> >;
-    AllGPU_reduce allGPU_reduce;
-
-    HINLINE void restart(uint32_t restartStep, const std::string restartDirectory);
-    HINLINE void checkpoint(uint32_t currentStep, const std::string checkpointDirectory);
-
-    HINLINE void pluginLoad();
-public:
-    HINLINE ChargeConservation();
-    virtual ~ChargeConservation() {}
-
-    HINLINE void notify(uint32_t currentStep);
-    HINLINE void setMappingDescription(MappingDesc*);
-    HINLINE void pluginRegisterHelp(po::options_description& desc);
-    HINLINE std::string pluginGetName() const;
-};
-
-namespace particles
-{
-namespace traits
-{
-    template<
-        typename T_Species
-    >
-    struct SpeciesEligibleForSolver<
-        T_Species,
-        ChargeConservation
-    >
+    using namespace pmacc;
+
+    namespace po = boost::program_options;
+
+    /**
+     * @class ChargeConservation
+     * @brief maximum difference between the particle charge density and div E * eps_0
+     *
+     * WARNING: This plugin assumes a Yee-cell!
+     * Do not use it together with other field solvers like `directional splitting` or `Lehe`
+     */
+    class ChargeConservation : public ISimulationPlugin
     {
-        using FrameType = typename T_Species::FrameType;
-
-        // this plugin needs at least the weighting particle attribute
-        using RequiredIdentifiers = MakeSeq_t<
-            weighting
-        >;
-
-        using SpeciesHasIdentifiers = typename pmacc::traits::HasIdentifiers<
-            FrameType,
-            RequiredIdentifiers
-        >::type;
-
-        // and also a charge ratio for a charge density
-        using SpeciesHasFlags = typename pmacc::traits::HasFlag<
-            FrameType,
-            chargeRatio<>
-        >::type;
-
-        using type = typename bmpl::and_<
-            SpeciesHasIdentifiers,
-            SpeciesHasFlags
-        >;
+    private:
+        std::string name;
+        std::string prefix;
+        std::string notifyPeriod;
+        const std::string filename;
+        MappingDesc* cellDescription;
+        std::ofstream output_file;
+
+        using AllGPU_reduce = boost::shared_ptr<pmacc::algorithm::mpi::Reduce<simDim>>;
+        AllGPU_reduce allGPU_reduce;
+
+        HINLINE void restart(uint32_t restartStep, const std::string restartDirectory);
+        HINLINE void checkpoint(uint32_t currentStep, const std::string checkpointDirectory);
+
+        HINLINE void pluginLoad();
+
+    public:
+        HINLINE ChargeConservation();
+        virtual ~ChargeConservation()
+        {
+        }
+
+        HINLINE void notify(uint32_t currentStep);
+        HINLINE void setMappingDescription(MappingDesc*);
+        HINLINE void pluginRegisterHelp(po::options_description& desc);
+        HINLINE std::string pluginGetName() const;
     };
 
-} // namespace traits
-} // namespace particles
+    namespace particles
+    {
+        namespace traits
+        {
+            template<typename T_Species>
+            struct SpeciesEligibleForSolver<T_Species, ChargeConservation>
+            {
+                using FrameType = typename T_Species::FrameType;
+
+                // this plugin needs at least the weighting particle attribute
+                using RequiredIdentifiers = MakeSeq_t<weighting>;
+
+                using SpeciesHasIdentifiers =
+                    typename pmacc::traits::HasIdentifiers<FrameType, RequiredIdentifiers>::type;
+
+                // and also a charge ratio for a charge density
+                using SpeciesHasFlags = typename pmacc::traits::HasFlag<FrameType, chargeRatio<>>::type;
+
+                using type = typename bmpl::and_<SpeciesHasIdentifiers, SpeciesHasFlags>;
+            };
+
+        } // namespace traits
+    } // namespace particles
 } // namespace picongpu
 
 #include "ChargeConservation.tpp"
diff --git a/include/picongpu/plugins/ChargeConservation.tpp b/include/picongpu/plugins/ChargeConservation.tpp
index f4c623c7f4..f76a3bed5e 100644
--- a/include/picongpu/plugins/ChargeConservation.tpp
+++ b/include/picongpu/plugins/ChargeConservation.tpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Heiko Burau, Rene Widera, Felix Schmitt
+/* Copyright 2013-2021 Axel Huebl, Heiko Burau, Rene Widera, Felix Schmitt
  *
  * This file is part of PIConGPU.
  *
@@ -26,8 +26,6 @@
 #include <pmacc/math/vector/Int.hpp>
 #include <pmacc/math/vector/Float.hpp>
 #include <pmacc/math/vector/Size_t.hpp>
-#include <pmacc/math/vector/math_functor/abs.hpp>
-#include <pmacc/math/vector/math_functor/max.hpp>
 #include <pmacc/dataManagement/DataConnector.hpp>
 #include <pmacc/math/Vector.hpp>
 #include <pmacc/cuSTL/container/DeviceBuffer.hpp>
@@ -50,254 +48,228 @@
 
 namespace picongpu
 {
-
-ChargeConservation::ChargeConservation()
-    : name("ChargeConservation: Print the maximum charge deviation between particles and div E to textfile 'chargeConservation.dat'"),
-      prefix("chargeConservation"), filename("chargeConservation.dat"),
-      cellDescription(nullptr)
-{
-    Environment<>::get().PluginConnector().registerPlugin(this);
-}
-
-void ChargeConservation::pluginRegisterHelp(po::options_description& desc)
-{
-    desc.add_options()
-        ((this->prefix + ".period").c_str(),
-        po::value<std::string> (&this->notifyPeriod), "enable plugin [for each n-th step]");
-}
-
-std::string ChargeConservation::pluginGetName() const {return this->name;}
-
-void ChargeConservation::pluginLoad()
-{
-    if(this->notifyPeriod.empty())
-        return;
-
-    Environment<>::get().PluginConnector().setNotificationPeriod(this, this->notifyPeriod);
-
-    pmacc::GridController<simDim>& con = pmacc::Environment<simDim>::get().GridController();
-    using namespace pmacc::math;
-    Size_t<simDim> gpuDim = (Size_t<simDim>)con.getGpuNodes();
-    zone::SphericZone<simDim> zone_allGPUs(gpuDim);
-    this->allGPU_reduce = AllGPU_reduce(new pmacc::algorithm::mpi::Reduce<simDim>(zone_allGPUs));
-
-    if(this->allGPU_reduce->root())
+    ChargeConservation::ChargeConservation()
+        : name("ChargeConservation: Print the maximum charge deviation between particles and div E to textfile "
+               "'chargeConservation.dat'")
+        , prefix("chargeConservation")
+        , filename("chargeConservation.dat")
+        , cellDescription(nullptr)
     {
-        this->output_file.open(this->filename.c_str(), std::ios_base::app);
-        this->output_file << "#timestep max-charge-deviation unit[As]" << std::endl;
+        Environment<>::get().PluginConnector().registerPlugin(this);
     }
-}
-
-void ChargeConservation::restart(uint32_t restartStep, const std::string restartDirectory)
-{
-    if(this->notifyPeriod.empty())
-        return;
-
-    if(!this->allGPU_reduce->root())
-        return;
-
-    restoreTxtFile( this->output_file,
-                    this->filename,
-                    restartStep,
-                    restartDirectory );
-}
-
-void ChargeConservation::checkpoint(uint32_t currentStep, const std::string checkpointDirectory)
-{
-    if(this->notifyPeriod.empty())
-        return;
-
-    if(!this->allGPU_reduce->root())
-        return;
-
-    checkpointTxtFile( this->output_file,
-                       this->filename,
-                       currentStep,
-                       checkpointDirectory );
-}
-
-void ChargeConservation::setMappingDescription(MappingDesc* cellDescription)
-{
-    this->cellDescription = cellDescription;
-}
 
-namespace detail
-{
-
-/**
- * @class Div
- * @brief divergence functor for 2D and 3D
- *
- * NOTE: This functor uses a Yee-cell stencil.
- */
-template<int dim, typename ValueType>
-struct Div;
-
-template<typename ValueType>
-struct Div<DIM3, ValueType>
-{
-    using result_type = ValueType;
-
-    template<typename Field>
-    HDINLINE ValueType operator()(Field field) const
+    void ChargeConservation::pluginRegisterHelp(po::options_description& desc)
     {
-        const ValueType reciWidth = float_X(1.0) / cellSize.x();
-        const ValueType reciHeight = float_X(1.0) / cellSize.y();
-        const ValueType reciDepth = float_X(1.0) / cellSize.z();
-        return ((*field).x() - (*field(-1,0,0)).x()) * reciWidth +
-               ((*field).y() - (*field(0,-1,0)).y()) * reciHeight +
-               ((*field).z() - (*field(0,0,-1)).z()) * reciDepth;
+        desc.add_options()(
+            (this->prefix + ".period").c_str(),
+            po::value<std::string>(&this->notifyPeriod),
+            "enable plugin [for each n-th step]");
     }
-};
 
-template<typename ValueType>
-struct Div<DIM2, ValueType>
-{
-    using result_type = ValueType;
-
-    template<typename Field>
-    HDINLINE ValueType operator()(Field field) const
+    std::string ChargeConservation::pluginGetName() const
     {
-        const ValueType reciWidth = float_X(1.0) / cellSize.x();
-        const ValueType reciHeight = float_X(1.0) / cellSize.y();
-        return ((*field).x() - (*field(-1,0)).x()) * reciWidth +
-               ((*field).y() - (*field(0,-1)).y()) * reciHeight;
+        return this->name;
     }
-};
 
-// functor for all species to calculate density
-template<typename T_SpeciesType, typename T_Area>
-struct ComputeChargeDensity
-{
-    using SpeciesType = pmacc::particles::meta::FindByNameOrType_t<
-        VectorAllSpecies,
-        T_SpeciesType
-    >;
-    static const uint32_t area = T_Area::value;
-
-    HINLINE void operator()( FieldTmp* fieldTmp,
-                             const uint32_t currentStep) const
+    void ChargeConservation::pluginLoad()
     {
-        DataConnector &dc = Environment<>::get().DataConnector();
+        if(this->notifyPeriod.empty())
+            return;
+
+        Environment<>::get().PluginConnector().setNotificationPeriod(this, this->notifyPeriod);
+
+        pmacc::GridController<simDim>& con = pmacc::Environment<simDim>::get().GridController();
+        using namespace pmacc::math;
+        Size_t<simDim> gpuDim = (Size_t<simDim>) con.getGpuNodes();
+        zone::SphericZone<simDim> zone_allGPUs(gpuDim);
+        this->allGPU_reduce = AllGPU_reduce(new pmacc::algorithm::mpi::Reduce<simDim>(zone_allGPUs));
+
+        if(this->allGPU_reduce->root())
+        {
+            this->output_file.open(this->filename.c_str(), std::ios_base::app);
+            this->output_file << "#timestep max-charge-deviation unit[As]" << std::endl;
+        }
+    }
 
-        /* load species without copying the particle data to the host */
-        auto speciesTmp = dc.get< SpeciesType >( SpeciesType::FrameType::getName(), true );
+    void ChargeConservation::restart(uint32_t restartStep, const std::string restartDirectory)
+    {
+        if(this->notifyPeriod.empty())
+            return;
 
-        /* run algorithm */
-        using ChargeDensitySolver = typename particles::particleToGrid::CreateFieldTmpOperation_t<
-            SpeciesType,
-            particles::particleToGrid::derivedAttributes::ChargeDensity
-        >::Solver;
+        if(!this->allGPU_reduce->root())
+            return;
 
-        fieldTmp->computeValue < area, ChargeDensitySolver > (*speciesTmp, currentStep);
-        dc.releaseData( SpeciesType::FrameType::getName() );
+        restoreTxtFile(this->output_file, this->filename, restartStep, restartDirectory);
     }
-};
 
-struct CalculateAndAssignChargeDeviation
-{
-    template<typename T_Rho, typename T_FieldE, typename T_Acc>
-    HDINLINE void operator()(
-        const T_Acc& acc,
-        T_Rho& rho,
-        const T_FieldE& fieldECursor
-    ) const
+    void ChargeConservation::checkpoint(uint32_t currentStep, const std::string checkpointDirectory)
     {
-        typedef Div<simDim, typename FieldTmp::ValueType> MyDiv;
+        if(this->notifyPeriod.empty())
+            return;
 
-        /* rho := | div E * eps_0 - rho | */
-        rho.x() = math::abs((MyDiv{}(fieldECursor) * EPS0 - rho).x());
+        if(!this->allGPU_reduce->root())
+            return;
+
+        checkpointTxtFile(this->output_file, this->filename, currentStep, checkpointDirectory);
     }
-};
 
-} // namespace detail
+    void ChargeConservation::setMappingDescription(MappingDesc* cellDescription)
+    {
+        this->cellDescription = cellDescription;
+    }
 
-void ChargeConservation::notify(uint32_t currentStep)
-{
-    typedef SuperCellSize BlockDim;
-
-    DataConnector &dc = Environment<>::get().DataConnector();
-
-    /* load FieldTmp without copy data to host */
-    PMACC_CASSERT_MSG(
-        _please_allocate_at_least_one_FieldTmp_in_memory_param,
-        fieldTmpNumSlots > 0
-    );
-    auto fieldTmp = dc.get< FieldTmp >( FieldTmp::getUniqueId( 0 ), true );
-    /* reset density values to zero */
-    fieldTmp->getGridBuffer().getDeviceBuffer().setValue(FieldTmp::ValueType(0.0));
-
-    using EligibleSpecies = typename bmpl::copy_if<
-        VectorAllSpecies,
-        particles::traits::SpeciesEligibleForSolver<
-            bmpl::_1,
-            ChargeConservation
-        >
-    >::type;
-
-    // todo: log species that are used / ignored in this plugin with INFO
-
-    /* calculate and add the charge density values from all species in FieldTmp */
-    meta::ForEach<
-        EligibleSpecies,
-        picongpu::detail::ComputeChargeDensity<
-            bmpl::_1,
-            bmpl::int_< CORE + BORDER >
-        >,
-        bmpl::_1
-    > computeChargeDensity;
-    computeChargeDensity(fieldTmp.get(), currentStep);
-
-    /* add results of all species that are still in GUARD to next GPUs BORDER */
-    EventTask fieldTmpEvent = fieldTmp->asyncCommunication(__getTransactionEvent());
-    __setTransactionEvent(fieldTmpEvent);
-
-    /* cast PMacc Buffer to cuSTL Buffer */
-    auto fieldTmp_coreBorder =
-                 fieldTmp->getGridBuffer().
-                 getDeviceBuffer().cartBuffer().
-                 view(this->cellDescription->getGuardingSuperCells()*BlockDim::toRT(),
-                      this->cellDescription->getGuardingSuperCells()*-BlockDim::toRT());
-
-    /* cast PMacc Buffer to cuSTL Buffer */
-    auto fieldE_coreBorder =
-                 dc.get< FieldE >( FieldE::getName(), true )->getGridBuffer().
-                 getDeviceBuffer().cartBuffer().
-                 view(this->cellDescription->getGuardingSuperCells()*BlockDim::toRT(),
-                      this->cellDescription->getGuardingSuperCells()*-BlockDim::toRT());
-
-    /* run calculation: fieldTmp = | div E * eps_0 - rho | */
-    using namespace pmacc::math::math_functor;
-    typedef picongpu::detail::Div<simDim, typename FieldTmp::ValueType> myDiv;
-    algorithm::kernel::Foreach<BlockDim>()(
-        fieldTmp_coreBorder.zone(),
-        fieldTmp_coreBorder.origin(),
-        cursor::make_NestedCursor(fieldE_coreBorder.origin()),
-        ::picongpu::detail::CalculateAndAssignChargeDeviation()
-    );
-
-    /* reduce charge derivation (fieldTmp) to get the maximum value */
-    typename FieldTmp::ValueType maxChargeDiff =
-        algorithm::kernel::Reduce()
-            (fieldTmp_coreBorder.origin(), fieldTmp_coreBorder.zone(), pmacc::nvidia::functors::Max());
-
-    /* reduce again across mpi cluster */
-    container::HostBuffer<typename FieldTmp::ValueType, 1> maxChargeDiff_host(1);
-    *maxChargeDiff_host.origin() = maxChargeDiff;
-    container::HostBuffer<typename FieldTmp::ValueType, 1> maxChargeDiff_cluster(1);
-    (*this->allGPU_reduce)(
-        maxChargeDiff_cluster,
-        maxChargeDiff_host,
-        ::pmacc::algorithms::math::Max<
-            typename FieldTmp::ValueType,
-            typename FieldTmp::ValueType
-        >()
-    );
-
-    if(!this->allGPU_reduce->root()) return;
-
-    this->output_file << currentStep << " " << (*maxChargeDiff_cluster.origin() * CELL_VOLUME).x()
-        << " " << UNIT_CHARGE << std::endl;
-}
+    namespace detail
+    {
+        /**
+         * @class Div
+         * @brief divergence functor for 2D and 3D
+         *
+         * NOTE: This functor uses a Yee-cell stencil.
+         */
+        template<int dim, typename ValueType>
+        struct Div;
+
+        template<typename ValueType>
+        struct Div<DIM3, ValueType>
+        {
+            using result_type = ValueType;
+
+            template<typename Field>
+            HDINLINE ValueType operator()(Field field) const
+            {
+                const ValueType reciWidth = float_X(1.0) / cellSize.x();
+                const ValueType reciHeight = float_X(1.0) / cellSize.y();
+                const ValueType reciDepth = float_X(1.0) / cellSize.z();
+                return ((*field).x() - (*field(-1, 0, 0)).x()) * reciWidth
+                    + ((*field).y() - (*field(0, -1, 0)).y()) * reciHeight
+                    + ((*field).z() - (*field(0, 0, -1)).z()) * reciDepth;
+            }
+        };
+
+        template<typename ValueType>
+        struct Div<DIM2, ValueType>
+        {
+            using result_type = ValueType;
+
+            template<typename Field>
+            HDINLINE ValueType operator()(Field field) const
+            {
+                const ValueType reciWidth = float_X(1.0) / cellSize.x();
+                const ValueType reciHeight = float_X(1.0) / cellSize.y();
+                return ((*field).x() - (*field(-1, 0)).x()) * reciWidth
+                    + ((*field).y() - (*field(0, -1)).y()) * reciHeight;
+            }
+        };
+
+        // functor run for each species to compute its charge density into the given FieldTmp
+        template<typename T_SpeciesType, typename T_Area>
+        struct ComputeChargeDensity
+        {
+            using SpeciesType = pmacc::particles::meta::FindByNameOrType_t<VectorAllSpecies, T_SpeciesType>;
+            static const uint32_t area = T_Area::value;
+
+            HINLINE void operator()(FieldTmp* fieldTmp, const uint32_t currentStep) const
+            {
+                DataConnector& dc = Environment<>::get().DataConnector();
+
+                /* load species without copying the particle data to the host */
+                auto speciesTmp = dc.get<SpeciesType>(SpeciesType::FrameType::getName(), true);
+
+                /* run algorithm */
+                using ChargeDensitySolver = typename particles::particleToGrid::CreateFieldTmpOperation_t<
+                    SpeciesType,
+                    particles::particleToGrid::derivedAttributes::ChargeDensity>::Solver;
+
+                fieldTmp->computeValue<area, ChargeDensitySolver>(*speciesTmp, currentStep);
+                dc.releaseData(SpeciesType::FrameType::getName());
+            }
+        };
+
+        struct CalculateAndAssignChargeDeviation
+        {
+            template<typename T_Rho, typename T_FieldE, typename T_Acc>
+            HDINLINE void operator()(const T_Acc& acc, T_Rho& rho, const T_FieldE& fieldECursor) const
+            {
+                typedef Div<simDim, typename FieldTmp::ValueType> MyDiv;
+
+                /* rho := | div E * eps_0 - rho | */
+                rho.x() = math::abs((MyDiv{}(fieldECursor) *EPS0 - rho).x());
+            }
+        };
+
+    } // namespace detail
+
+    void ChargeConservation::notify(uint32_t currentStep)
+    {
+        typedef SuperCellSize BlockDim;
+
+        DataConnector& dc = Environment<>::get().DataConnector();
+
+        /* load FieldTmp without copying data to the host */
+        PMACC_CASSERT_MSG(_please_allocate_at_least_one_FieldTmp_in_memory_param, fieldTmpNumSlots > 0);
+        auto fieldTmp = dc.get<FieldTmp>(FieldTmp::getUniqueId(0), true);
+        /* reset density values to zero */
+        fieldTmp->getGridBuffer().getDeviceBuffer().setValue(FieldTmp::ValueType(0.0));
+
+        using EligibleSpecies = typename bmpl::
+            copy_if<VectorAllSpecies, particles::traits::SpeciesEligibleForSolver<bmpl::_1, ChargeConservation>>::type;
+
+        // todo: log species that are used / ignored in this plugin with INFO
+
+        /* calculate and add the charge density values from all species in FieldTmp */
+        meta::ForEach<
+            EligibleSpecies,
+            picongpu::detail::ComputeChargeDensity<bmpl::_1, bmpl::int_<CORE + BORDER>>,
+            bmpl::_1>
+            computeChargeDensity;
+        computeChargeDensity(fieldTmp.get(), currentStep);
+
+        /* add results of all species that are still in GUARD to the next GPU's BORDER */
+        EventTask fieldTmpEvent = fieldTmp->asyncCommunication(__getTransactionEvent());
+        __setTransactionEvent(fieldTmpEvent);
+
+        /* cast PMacc Buffer to cuSTL Buffer */
+        auto fieldTmp_coreBorder = fieldTmp->getGridBuffer().getDeviceBuffer().cartBuffer().view(
+            this->cellDescription->getGuardingSuperCells() * BlockDim::toRT(),
+            this->cellDescription->getGuardingSuperCells() * -BlockDim::toRT());
+
+        /* cast PMacc Buffer to cuSTL Buffer */
+        auto fieldE_coreBorder = dc.get<FieldE>(FieldE::getName(), true)
+                                     ->getGridBuffer()
+                                     .getDeviceBuffer()
+                                     .cartBuffer()
+                                     .view(
+                                         this->cellDescription->getGuardingSuperCells() * BlockDim::toRT(),
+                                         this->cellDescription->getGuardingSuperCells() * -BlockDim::toRT());
+
+        /* run calculation: fieldTmp = | div E * eps_0 - rho | */
+        typedef picongpu::detail::Div<simDim, typename FieldTmp::ValueType> myDiv;
+        algorithm::kernel::Foreach<BlockDim>()(
+            fieldTmp_coreBorder.zone(),
+            fieldTmp_coreBorder.origin(),
+            cursor::make_NestedCursor(fieldE_coreBorder.origin()),
+            ::picongpu::detail::CalculateAndAssignChargeDeviation());
+
+        /* reduce charge deviation (fieldTmp) to get the maximum value */
+        typename FieldTmp::ValueType maxChargeDiff = algorithm::kernel::Reduce()(
+            fieldTmp_coreBorder.origin(),
+            fieldTmp_coreBorder.zone(),
+            pmacc::nvidia::functors::Max());
+
+        /* reduce again across mpi cluster */
+        container::HostBuffer<typename FieldTmp::ValueType, 1> maxChargeDiff_host(1);
+        *maxChargeDiff_host.origin() = maxChargeDiff;
+        container::HostBuffer<typename FieldTmp::ValueType, 1> maxChargeDiff_cluster(1);
+        (*this->allGPU_reduce)(
+            maxChargeDiff_cluster,
+            maxChargeDiff_host,
+            ::pmacc::math::Max<typename FieldTmp::ValueType, typename FieldTmp::ValueType>());
+
+        if(!this->allGPU_reduce->root())
+            return;
+
+        this->output_file << currentStep << " " << (*maxChargeDiff_cluster.origin() * CELL_VOLUME).x() << " "
+                          << UNIT_CHARGE << std::endl;
+    }
 
 } // namespace picongpu
diff --git a/include/picongpu/plugins/Checkpoint.hpp b/include/picongpu/plugins/Checkpoint.hpp
index 2a4f369fc2..0e02d87d0d 100644
--- a/include/picongpu/plugins/Checkpoint.hpp
+++ b/include/picongpu/plugins/Checkpoint.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2017-2020 Rene Widera
+/* Copyright 2017-2021 Rene Widera, Franz Poeschel
  *
  * This file is part of PIConGPU.
  *
@@ -25,10 +25,10 @@
 #include "picongpu/plugins/ISimulationPlugin.hpp"
 
 #if(ENABLE_ADIOS == 1)
-#   include "picongpu/plugins/adios/ADIOSWriter.hpp"
+#    include "picongpu/plugins/adios/ADIOSWriter.hpp"
 #endif
-#if(ENABLE_HDF5 == 1)
-#   include "picongpu/plugins/hdf5/HDF5Writer.hpp"
+#if(ENABLE_OPENPMD == 1)
+#    include "picongpu/plugins/openPMD/openPMDWriter.hpp"
 #endif
 #include <pmacc/pluginSystem/PluginConnector.hpp>
 
@@ -47,191 +47,151 @@ namespace picongpu
     class Checkpoint : public ISimulationPlugin
     {
     public:
-
-        Checkpoint( ) :
-            checkpointFilename( "checkpoint" ),
-            restartChunkSize( 0u )
+        Checkpoint() : checkpointFilename("checkpoint"), restartChunkSize(0u)
         {
-#if(ENABLE_ADIOS == 1)
-            ioBackendsHelp[ "adios" ] = std::shared_ptr< plugins::multi::IHelp >( adios::ADIOSWriter::getHelp() );
+#if(ENABLE_OPENPMD == 1)
+            ioBackendsHelp["openPMD"] = std::shared_ptr<plugins::multi::IHelp>(openPMD::openPMDWriter::getHelp());
 #endif
-#if(ENABLE_HDF5 == 1)
-            ioBackendsHelp[ "hdf5" ] = std::shared_ptr< plugins::multi::IHelp >( hdf5::HDF5Writer::getHelp() );
+#if(ENABLE_ADIOS == 1)
+            ioBackendsHelp["adios"] = std::shared_ptr<plugins::multi::IHelp>(adios::ADIOSWriter::getHelp());
 #endif
             // if adios is enabled the default is adios
-            if( !ioBackendsHelp.empty( ) )
+            if(!ioBackendsHelp.empty())
             {
-                checkpointBackendName = ioBackendsHelp.begin( )->first;
-                restartBackendName = ioBackendsHelp.begin( )->first;
+                checkpointBackendName = ioBackendsHelp.begin()->first;
+                restartBackendName = ioBackendsHelp.begin()->first;
             }
 
             uint32_t backendCount = 0u;
-            for( auto & backend : ioBackendsHelp )
+            for(auto& backend : ioBackendsHelp)
             {
-                if( backendCount >= 1u )
+                if(backendCount >= 1u)
                     activeBackends += ", ";
                 activeBackends += backend.first;
                 ++backendCount;
             }
 
-            Environment<>::get( ).PluginConnector( ).registerPlugin( this );
+            Environment<>::get().PluginConnector().registerPlugin(this);
         }
 
-        virtual ~Checkpoint( )
+        virtual ~Checkpoint()
         {
-
         }
 
-        void pluginRegisterHelp(boost::program_options::options_description & desc)
+        void pluginRegisterHelp(boost::program_options::options_description& desc)
         {
-
             namespace po = boost::program_options;
-            if( ioBackendsHelp.empty( ) )
-                desc.add_options( )(
-                    "checkpoint",
-                    "plugin disabled [compiled without dependency HDF5 or Adios]"
-                );
+            if(ioBackendsHelp.empty())
+                desc.add_options()("checkpoint", "plugin disabled [compiled without dependency HDF5 or Adios]");
             else
-                desc.add_options( )
-                    (
-                        "checkpoint.backend",
-                        po::value <std::string >( &checkpointBackendName ),
-                        ( std::string( "Optional backend for checkpointing [" ) + activeBackends + "] default: " + checkpointBackendName ).c_str( )
-                    )(
-                        "checkpoint.file",
-                        po::value< std::string >( &checkpointFilename ),
-                        "Optional checkpoint filename (prefix)"
-                    )(
-                        "checkpoint.restart.backend",
-                        po::value< std::string >( &restartBackendName ),
-                        ( std::string( "Optional backend for restarting [" ) + activeBackends + "] default: " + restartBackendName ).c_str( )
-                    )(
-                        "checkpoint.restart.file",
-                        po::value< std::string >( &restartFilename ),
-                        "checkpoint restart filename (prefix)"
-                    )(
-                        /* 1,000,000 particles are around 3900 frames at 256 particles per frame
-                         * and match ~30MiB with typical picongpu particles.
-                         * The only reason why we use 1M particles per chunk is that we can get a
-                         * frame overflow in our memory manager if we process all particles in one kernel.
-                         **/
-                        "checkpoint.restart.chunkSize",
-                        po::value< uint32_t >(&restartChunkSize)->default_value( 1000000u ),
-                        "Number of particles processed in one kernel call during restart to prevent frame count blowup"
-                    );
-
-            for( auto & backend : ioBackendsHelp )
-                backend.second->expandHelp(
-                    desc,
-                    "checkpoint."
-                );
-
+                desc.add_options()(
+                    "checkpoint.backend",
+                    po::value<std::string>(&checkpointBackendName),
+                    (std::string("Optional backend for checkpointing [") + activeBackends
+                     + "] default: " + checkpointBackendName)
+                        .c_str())(
+                    "checkpoint.file",
+                    po::value<std::string>(&checkpointFilename),
+                    "Optional checkpoint filename (prefix)")(
+                    "checkpoint.restart.backend",
+                    po::value<std::string>(&restartBackendName),
+                    (std::string("Optional backend for restarting [") + activeBackends
+                     + "] default: " + restartBackendName)
+                        .c_str())(
+                    "checkpoint.restart.file",
+                    po::value<std::string>(&restartFilename),
+                    "checkpoint restart filename (prefix)")(
+                    /* 1,000,000 particles are around 3900 frames at 256 particles per frame
+                     * and match ~30MiB with typical picongpu particles.
+                     * The only reason why we use 1M particles per chunk is that we can get a
+                     * frame overflow in our memory manager if we process all particles in one kernel.
+                     **/
+                    "checkpoint.restart.chunkSize",
+                    po::value<uint32_t>(&restartChunkSize)->default_value(1000000u),
+                    "Number of particles processed in one kernel call during restart to prevent frame count blowup");
+
+            for(auto& backend : ioBackendsHelp)
+                backend.second->expandHelp(desc, "checkpoint.");
         }
 
-        std::string pluginGetName( ) const
+        std::string pluginGetName() const
         {
             return "Checkpoint";
         }
 
-        void notify( uint32_t )
+        void notify(uint32_t)
         {
         }
 
-        void setMappingDescription(MappingDesc *cellDescription)
+        void setMappingDescription(MappingDesc* cellDescription)
         {
             m_cellDescription = cellDescription;
         }
 
-        void checkpoint(
-            uint32_t currentStep,
-            const std::string checkpointDirectory
-        )
+        void checkpoint(uint32_t currentStep, const std::string checkpointDirectory)
         {
-            auto cBackend = ioBackends.find( checkpointBackendName );
-            if( cBackend != ioBackends.end( ) )
+            auto cBackend = ioBackends.find(checkpointBackendName);
+            if(cBackend != ioBackends.end())
             {
-                cBackend->second->dumpCheckpoint(
-                    currentStep,
-                    checkpointDirectory,
-                    checkpointFilename
-                );
+                cBackend->second->dumpCheckpoint(currentStep, checkpointDirectory, checkpointFilename);
             }
         }
 
         void restart(uint32_t restartStep, const std::string restartDirectory)
         {
-            auto rBackend = ioBackends.find( restartBackendName );
-            if( rBackend != ioBackends.end( ) )
+            auto rBackend = ioBackends.find(restartBackendName);
+            if(rBackend != ioBackends.end())
             {
-                rBackend->second->doRestart(
-                    restartStep,
-                    restartDirectory,
-                    restartFilename,
-                    restartChunkSize
-                );
+                rBackend->second->doRestart(restartStep, restartDirectory, restartFilename, restartChunkSize);
             }
         }
 
     private:
-
-        void pluginLoad( )
+        void pluginLoad()
         {
-            for( auto & backendHelp : ioBackendsHelp )
+            for(auto& backendHelp : ioBackendsHelp)
             {
-                if( backendHelp.second->getNumPlugins() > 0u )
+                if(backendHelp.second->getNumPlugins() > 0u)
                     backendHelp.second->validateOptions();
 
-                size_t const numSlaves = backendHelp.second->getNumPlugins( );
-                if( numSlaves > 1u )
-                    throw std::runtime_error( pluginGetName() + ": is no a multi plugin, each option can only be selected once." );
+                size_t const numSlaves = backendHelp.second->getNumPlugins();
+                if(numSlaves > 1u)
+                    throw std::runtime_error(
+                        pluginGetName() + ": is no a multi plugin, each option can only be selected once.");
             }
 
             // create checkpoint creation backend
-            if( !ioBackendsHelp.empty( ) )
+            if(!ioBackendsHelp.empty())
             {
-                auto cBackendHelp = ioBackendsHelp.find( checkpointBackendName );
-                if( cBackendHelp == ioBackendsHelp.end( ) )
-                    throw std::runtime_error( std::string( "IO-backend " ) +
-                        checkpointBackendName +
-                        " for checkpoints not found, possible backends: " +
-                        activeBackends
-                    );
+                auto cBackendHelp = ioBackendsHelp.find(checkpointBackendName);
+                if(cBackendHelp == ioBackendsHelp.end())
+                    throw std::runtime_error(
+                        std::string("IO-backend ") + checkpointBackendName
+                        + " for checkpoints not found, possible backends: " + activeBackends);
                 else
-                    ioBackends[ checkpointBackendName ] = std::static_pointer_cast< IIOBackend >(
-                        cBackendHelp->second->create(
-                            cBackendHelp->second,
-                            0,
-                            m_cellDescription
-                        )
-                    );
+                    ioBackends[checkpointBackendName] = std::static_pointer_cast<IIOBackend>(
+                        cBackendHelp->second->create(cBackendHelp->second, 0, m_cellDescription));
             }
             // create restart backend
-            if( !ioBackendsHelp.empty( ) && checkpointBackendName != restartBackendName )
+            if(!ioBackendsHelp.empty() && checkpointBackendName != restartBackendName)
             {
-                auto rBackend = ioBackendsHelp.find( restartBackendName );
-                if( rBackend == ioBackendsHelp.end( ) )
-                    throw std::runtime_error( std::string( "IO-backend " ) +
-                        restartBackendName +
-                        " for restarts not found, possible backends: " +
-                        activeBackends
-                    );
+                auto rBackend = ioBackendsHelp.find(restartBackendName);
+                if(rBackend == ioBackendsHelp.end())
+                    throw std::runtime_error(
+                        std::string("IO-backend ") + restartBackendName
+                        + " for restarts not found, possible backends: " + activeBackends);
                 else
-                    ioBackends[ restartBackendName ] = std::static_pointer_cast< IIOBackend >(
-                        rBackend->second->create(
-                            rBackend->second,
-                            0,
-                            m_cellDescription
-                        )
-                    );
+                    ioBackends[restartBackendName] = std::static_pointer_cast<IIOBackend>(
+                        rBackend->second->create(rBackend->second, 0, m_cellDescription));
             }
 
-            if( restartFilename.empty( ) )
+            if(restartFilename.empty())
             {
                 restartFilename = checkpointFilename;
             }
         }
 
-        virtual void pluginUnload( )
+        virtual void pluginUnload()
         {
             ioBackends.clear();
         }
@@ -255,16 +215,10 @@ namespace picongpu
          */
         uint32_t restartChunkSize;
 
-        // can be "adios" and "hdf5"
-        std::map<
-            std::string,
-            std::shared_ptr< IIOBackend >
-        > ioBackends;
+        // can be "adios", "hdf5" and "openPMD"
+        std::map<std::string, std::shared_ptr<IIOBackend>> ioBackends;
 
-        std::map<
-            std::string,
-            std::shared_ptr< plugins::multi::IHelp >
-        > ioBackendsHelp;
+        std::map<std::string, std::shared_ptr<plugins::multi::IHelp>> ioBackendsHelp;
 
         MappingDesc* m_cellDescription = nullptr;
     };
diff --git a/include/picongpu/plugins/CountParticles.hpp b/include/picongpu/plugins/CountParticles.hpp
index 13686c48a6..3ac2ece70d 100644
--- a/include/picongpu/plugins/CountParticles.hpp
+++ b/include/picongpu/plugins/CountParticles.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Felix Schmitt, Rene Widera, Richard Pausch
+/* Copyright 2013-2021 Axel Huebl, Felix Schmitt, Rene Widera, Richard Pausch
  *
  * This file is part of PIConGPU.
  *
@@ -44,177 +44,165 @@
 
 namespace picongpu
 {
-using namespace pmacc;
+    using namespace pmacc;
 
-template<class ParticlesType>
-class CountParticles : public ISimulationPlugin
-{
-private:
-    typedef MappingDesc::SuperCellSize SuperCellSize;
-
-    MappingDesc *cellDescription;
-    std::string notifyPeriod;
-
-    std::string pluginName;
-    std::string pluginPrefix;
-    std::string filename;
-
-    std::ofstream outFile;
-    /*only rank 0 create a file*/
-    bool writeToFile;
+    template<class ParticlesType>
+    class CountParticles : public ISimulationPlugin
+    {
+    private:
+        typedef MappingDesc::SuperCellSize SuperCellSize;
 
-    mpi::MPIReduce reduce;
-public:
+        MappingDesc* cellDescription;
+        std::string notifyPeriod;
 
-    CountParticles() :
-    pluginName("CountParticles: count macro particles of a species"),
-    pluginPrefix(ParticlesType::FrameType::getName() + std::string("_macroParticlesCount")),
-    filename(pluginPrefix + ".dat"),
-    cellDescription(nullptr),
-    writeToFile(false)
-    {
-        Environment<>::get().PluginConnector().registerPlugin(this);
-    }
+        std::string pluginName;
+        std::string pluginPrefix;
+        std::string filename;
 
-    virtual ~CountParticles()
-    {
+        std::ofstream outFile;
+        /* only rank 0 creates a file */
+        bool writeToFile;
 
-    }
+        mpi::MPIReduce reduce;
 
-    void notify(uint32_t currentStep)
-    {
-        countParticles < CORE + BORDER > (currentStep);
-    }
+    public:
+        CountParticles()
+            : pluginName("CountParticles: count macro particles of a species")
+            , pluginPrefix(ParticlesType::FrameType::getName() + std::string("_macroParticlesCount"))
+            , filename(pluginPrefix + ".dat")
+            , cellDescription(nullptr)
+            , writeToFile(false)
+        {
+            Environment<>::get().PluginConnector().registerPlugin(this);
+        }
 
-    void pluginRegisterHelp(po::options_description& desc)
-    {
-        desc.add_options()
-            ((pluginPrefix + ".period").c_str(),
-             po::value<std::string> (&notifyPeriod), "enable plugin [for each n-th step]");
-    }
+        virtual ~CountParticles()
+        {
+        }
 
-    std::string pluginGetName() const
-    {
-        return pluginName;
-    }
+        void notify(uint32_t currentStep)
+        {
+            countParticles<CORE + BORDER>(currentStep);
+        }
 
-    void setMappingDescription(MappingDesc *cellDescription)
-    {
-        this->cellDescription = cellDescription;
-    }
+        void pluginRegisterHelp(po::options_description& desc)
+        {
+            desc.add_options()(
+                (pluginPrefix + ".period").c_str(),
+                po::value<std::string>(&notifyPeriod),
+                "enable plugin [for each n-th step]");
+        }
 
-private:
+        std::string pluginGetName() const
+        {
+            return pluginName;
+        }
 
-    void pluginLoad()
-    {
-        if(!notifyPeriod.empty())
+        void setMappingDescription(MappingDesc* cellDescription)
         {
-            writeToFile = reduce.hasResult(mpi::reduceMethods::Reduce());
+            this->cellDescription = cellDescription;
+        }
 
-            if (writeToFile)
+    private:
+        void pluginLoad()
+        {
+            if(!notifyPeriod.empty())
             {
-                outFile.open(filename.c_str(), std::ofstream::out | std::ostream::trunc);
-                if (!outFile)
+                writeToFile = reduce.hasResult(mpi::reduceMethods::Reduce());
+
+                if(writeToFile)
                 {
-                    std::cerr << "Can't open file [" << filename << "] for output, disable plugin output. " << std::endl;
-                    writeToFile = false;
+                    outFile.open(filename.c_str(), std::ofstream::out | std::ostream::trunc);
+                    if(!outFile)
+                    {
+                        std::cerr << "Can't open file [" << filename << "] for output, disable plugin output. "
+                                  << std::endl;
+                        writeToFile = false;
+                    }
+                    // create header of the file
+                    outFile << "#step count"
+                            << " \n";
                 }
-                //create header of the file
-                outFile << "#step count" << " \n";
-            }
 
-            Environment<>::get().PluginConnector().setNotificationPeriod(this, notifyPeriod);
+                Environment<>::get().PluginConnector().setNotificationPeriod(this, notifyPeriod);
+            }
         }
-    }
 
-    void pluginUnload()
-    {
-        if(!notifyPeriod.empty())
+        void pluginUnload()
         {
-            if (writeToFile)
+            if(!notifyPeriod.empty())
             {
-                outFile.flush();
-                outFile << std::endl; //now all data are written to file
-                if (outFile.fail())
-                    std::cerr << "Error on flushing file [" << filename << "]. " << std::endl;
-                outFile.close();
+                if(writeToFile)
+                {
+                    outFile.flush();
+                    outFile << std::endl; // now all data are written to file
+                    if(outFile.fail())
+                        std::cerr << "Error on flushing file [" << filename << "]. " << std::endl;
+                    outFile.close();
+                }
             }
         }
-    }
 
-    void restart(uint32_t restartStep, const std::string restartDirectory)
-    {
-        if( !writeToFile )
-            return;
+        void restart(uint32_t restartStep, const std::string restartDirectory)
+        {
+            if(!writeToFile)
+                return;
 
-        writeToFile = restoreTxtFile( outFile,
-                                      filename,
-                                      restartStep,
-                                      restartDirectory );
-    }
+            writeToFile = restoreTxtFile(outFile, filename, restartStep, restartDirectory);
+        }
 
-    void checkpoint(uint32_t currentStep, const std::string checkpointDirectory)
-    {
-        if( !writeToFile )
-            return;
+        void checkpoint(uint32_t currentStep, const std::string checkpointDirectory)
+        {
+            if(!writeToFile)
+                return;
 
-        checkpointTxtFile( outFile,
-                           filename,
-                           currentStep,
-                           checkpointDirectory );
-    }
+            checkpointTxtFile(outFile, filename, currentStep, checkpointDirectory);
+        }
 
-    template< uint32_t AREA>
-    void countParticles(uint32_t currentStep)
-    {
-        uint64_cu size;
+        template<uint32_t AREA>
+        void countParticles(uint32_t currentStep)
+        {
+            uint64_cu size;
 
-        const SubGrid<simDim>& subGrid = Environment<simDim>::get().SubGrid();
-        const DataSpace<simDim> localSize(subGrid.getLocalDomain().size);
+            const SubGrid<simDim>& subGrid = Environment<simDim>::get().SubGrid();
+            const DataSpace<simDim> localSize(subGrid.getLocalDomain().size);
 
-        DataConnector &dc = Environment<>::get().DataConnector();
-        auto particles = dc.get< ParticlesType >( ParticlesType::FrameType::getName(), true );
+            DataConnector& dc = Environment<>::get().DataConnector();
+            auto particles = dc.get<ParticlesType>(ParticlesType::FrameType::getName(), true);
 
-        // enforce that the filter interface is fulfilled
-        particles::filter::IUnary< particles::filter::All > parFilter{ currentStep };
+            // enforce that the filter interface is fulfilled
+            particles::filter::IUnary<particles::filter::All> parFilter{currentStep};
 
-        /*count local particles*/
-        size = pmacc::CountParticles::countOnDevice<AREA>(*particles,
-                                                          *cellDescription,
-                                                          DataSpace<simDim>(),
-                                                          localSize,
-                                                          parFilter);
-        dc.releaseData( ParticlesType::FrameType::getName() );
+            /* count local particles */
+            size = pmacc::CountParticles::countOnDevice<AREA>(
+                *particles,
+                *cellDescription,
+                DataSpace<simDim>(),
+                localSize,
+                parFilter);
+            dc.releaseData(ParticlesType::FrameType::getName());
 
-        uint64_cu reducedValueMax;
-        if (picLog::log_level & picLog::CRITICAL::lvl)
-        {
-            reduce(nvidia::functors::Max(),
-                   &reducedValueMax,
-                   &size,
-                   1,
-                   mpi::reduceMethods::Reduce());
-        }
+            uint64_cu reducedValueMax;
+            if(picLog::log_level & picLog::CRITICAL::lvl)
+            {
+                reduce(nvidia::functors::Max(), &reducedValueMax, &size, 1, mpi::reduceMethods::Reduce());
+            }
 
 
-        uint64_cu reducedValue;
-        reduce(nvidia::functors::Add(),
-               &reducedValue,
-               &size,
-               1,
-               mpi::reduceMethods::Reduce());
+            uint64_cu reducedValue;
+            reduce(nvidia::functors::Add(), &reducedValue, &size, 1, mpi::reduceMethods::Reduce());
 
-        if (writeToFile)
-        {
-            if (picLog::log_level & picLog::CRITICAL::lvl)
+            if(writeToFile)
             {
-                log<picLog::CRITICAL > ("maximum number of  particles on a GPU : %d\n") % reducedValueMax;
-            }
+                if(picLog::log_level & picLog::CRITICAL::lvl)
+                {
+                    log<picLog::CRITICAL>("maximum number of  particles on a GPU : %d\n") % reducedValueMax;
+                }
 
-            outFile << currentStep << " " << reducedValue << " " << std::scientific << (float_64) reducedValue << std::endl;
+                outFile << currentStep << " " << reducedValue << " " << std::scientific << (float_64) reducedValue
+                        << std::endl;
+            }
         }
-    }
-
-};
+    };
 
 } /* namespace picongpu */
diff --git a/include/picongpu/plugins/Emittance.hpp b/include/picongpu/plugins/Emittance.hpp
index d4b3bc4465..318aa0bd30 100644
--- a/include/picongpu/plugins/Emittance.hpp
+++ b/include/picongpu/plugins/Emittance.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Felix Schmitt, Heiko Burau,
+/* Copyright 2013-2021 Axel Huebl, Felix Schmitt, Heiko Burau,
  *                     Rene Widera, Richard Pausch, Benjamin Worpitz,
  *                     Sophie Rudat
  *
@@ -62,13 +62,11 @@
 
 namespace picongpu
 {
-
     /** calculates the emittance in x direction along the y axis
      */
-    template< uint32_t T_numWorkers >
+    template<uint32_t T_numWorkers>
     struct KernelCalcEmittance
     {
-
         /** calculates the sum of x^2, ux^2 and x*ux and counts electrons
          *
          * @tparam T_ParBox pmacc::ParticlesBox, particle box type
@@ -84,381 +82,245 @@ namespace picongpu
          * @param gCount_e global real particle counter
          * @param mapper functor to map a block to a supercell
          */
-        template<
-            typename T_ParBox,
-            typename T_DBox,
-            typename T_Mapping,
-            typename T_Acc,
-            typename T_Filter
-        >
-        DINLINE void operator( )(
-            T_Acc const & acc,
+        template<typename T_ParBox, typename T_DBox, typename T_Mapping, typename T_Acc, typename T_Filter>
+        DINLINE void operator()(
+            T_Acc const& acc,
             T_ParBox pb,
             T_DBox gSumMom2,
             T_DBox gSumPos2,
             T_DBox gSumMomPos,
             T_DBox gCount_e,
-            DataSpace< simDim > globalOffset,
+            DataSpace<simDim> globalOffset,
             const int subGridY,
             T_Mapping mapper,
-            T_Filter filter
-        ) const
+            T_Filter filter) const
         {
             using namespace mappings::threads;
             constexpr uint32_t numWorkers = T_numWorkers;
-            constexpr uint32_t numParticlesPerFrame = pmacc::math::CT::volume<
-                typename T_ParBox::FrameType::SuperCellSize
-            >::type::value;
+            constexpr uint32_t numParticlesPerFrame
+                = pmacc::math::CT::volume<typename T_ParBox::FrameType::SuperCellSize>::type::value;
 
-            uint32_t const workerIdx = threadIdx.x;
+            uint32_t const workerIdx = cupla::threadIdx(acc).x;
 
             using FramePtr = typename T_ParBox::FramePtr;
 
             // shared sums of x^2, ux^2, x*ux, particle counter
-            PMACC_SMEM(
-                acc,
-                shSumMom2,
-                memory::Array<
-                    float_X,
-                    SuperCellSize::y::value
-                >
-            );
-            PMACC_SMEM(
-                acc,
-                shSumPos2,
-                memory::Array<
-                    float_X,
-                    SuperCellSize::y::value
-                >
-            );
-            PMACC_SMEM(
-                acc,
-                shSumMomPos,
-                memory::Array<
-                    float_X,
-                    SuperCellSize::y::value
-                >
-            );
-            PMACC_SMEM(
-                acc,
-                shCount_e,
-                memory::Array<
-                    float_X,
-                    SuperCellSize::y::value
-                >
-            );
-
-            using ParticleDomCfg = IdxConfig<
-                numParticlesPerFrame,
-                numWorkers
-            >;
-
-            using SuperCellYDom = IdxConfig<
-                SuperCellSize::y::value,
-                numWorkers
-            >;
-
-
-            ForEachIdx< SuperCellYDom >{ workerIdx }(
-                [ & ](
-                    uint32_t const linearIdx,
-                    uint32_t const
-                )
-                {
-                    // set shared sums of x^2, ux^2, x*ux, particle counter to zero
-                    shSumMom2[ linearIdx ] = 0.0_X;
-                    shSumPos2[ linearIdx ] = 0.0_X;
-                    shSumMomPos[ linearIdx ] = 0.0_X;
-                    shCount_e[ linearIdx ] = 0.0_X;
-                }
-            );
-            __syncthreads( );
+            PMACC_SMEM(acc, shSumMom2, memory::Array<float_X, SuperCellSize::y::value>);
+            PMACC_SMEM(acc, shSumPos2, memory::Array<float_X, SuperCellSize::y::value>);
+            PMACC_SMEM(acc, shSumMomPos, memory::Array<float_X, SuperCellSize::y::value>);
+            PMACC_SMEM(acc, shCount_e, memory::Array<float_X, SuperCellSize::y::value>);
+
+            using ParticleDomCfg = IdxConfig<numParticlesPerFrame, numWorkers>;
+
+            using SuperCellYDom = IdxConfig<SuperCellSize::y::value, numWorkers>;
+
 
-            DataSpace< simDim > const superCellIdx( mapper.getSuperCellIndex(
-                DataSpace< simDim >( blockIdx )
-            ) );
+            ForEachIdx<SuperCellYDom>{workerIdx}([&](uint32_t const linearIdx, uint32_t const) {
+                // set shared sums of x^2, ux^2, x*ux, particle counter to zero
+                shSumMom2[linearIdx] = 0.0_X;
+                shSumPos2[linearIdx] = 0.0_X;
+                shSumMomPos[linearIdx] = 0.0_X;
+                shCount_e[linearIdx] = 0.0_X;
+            });
+            cupla::__syncthreads(acc);
+
+            DataSpace<simDim> const superCellIdx(mapper.getSuperCellIndex(DataSpace<simDim>(cupla::blockIdx(acc))));
 
             // each virtual thread is working on an own frame
-            FramePtr frame = pb.getLastFrame( superCellIdx );
+            FramePtr frame = pb.getLastFrame(superCellIdx);
 
             // end kernel if we have no frames within the supercell
-            if( !frame.isValid( ) )
+            if(!frame.isValid())
                 return;
 
-            auto accFilter = filter(
-                acc,
-                superCellIdx - mapper.getGuardingSuperCells( ),
-                WorkerCfg< numWorkers >{ workerIdx }
-            );
-
-            memory::CtxArray<
-                typename FramePtr::type::ParticleType,
-                ParticleDomCfg
-            >
-            currentParticleCtx(
+            auto accFilter
+                = filter(acc, superCellIdx - mapper.getGuardingSuperCells(), WorkerCfg<numWorkers>{workerIdx});
+
+            memory::CtxArray<typename FramePtr::type::ParticleType, ParticleDomCfg> currentParticleCtx(
                 workerIdx,
-                [ & ](
-                    uint32_t const linearIdx,
-                    uint32_t const
-                )
-                {
-                    auto particle = frame[ linearIdx ];
+                [&](uint32_t const linearIdx, uint32_t const) {
+                    auto particle = frame[linearIdx];
                     /* - only particles from the last frame must be checked
                      * - all other particles are always valid
                      */
-                    if( particle[ multiMask_ ] != 1 )
-                        particle.setHandleInvalid( );
+                    if(particle[multiMask_] != 1)
+                        particle.setHandleInvalid();
                     return particle;
-                }
-            );
+                });
 
-            while( frame.isValid( ) )
+            while(frame.isValid())
             {
                 // loop over all particles in the frame
-                ForEachIdx< ParticleDomCfg > forEachParticle( workerIdx );
+                ForEachIdx<ParticleDomCfg> forEachParticle(workerIdx);
 
-                forEachParticle(
-                    [ & ](
-                        uint32_t const,
-                        uint32_t const idx
-                    )
+                forEachParticle([&](uint32_t const, uint32_t const idx) {
+                    /* get one particle */
+                    auto& particle = currentParticleCtx[idx];
+                    if(accFilter(acc, particle))
                     {
-                        /* get one particle */
-                        auto & particle = currentParticleCtx[ idx ];
-                        if( accFilter( acc, particle ) )
-                        {
-                            float_X const weighting = particle[ weighting_ ];
-                            float_X const normedWeighting = weighting / float_X( particles::TYPICAL_NUM_PARTICLES_PER_MACROPARTICLE );
-                            float3_X const mom = particle[ momentum_ ] / weighting;
-                            floatD_X const pos = particle[ position_ ];
-                            lcellId_t const cellIdx = particle[ localCellIdx_ ];
-                            DataSpace< simDim > const frameCellOffset(
-                                DataSpaceOperations< simDim >::template
-                                map< MappingDesc::SuperCellSize > ( cellIdx )
-                            );
-                            auto const localSupercellStart = (
-                                superCellIdx -
-                                mapper.getGuardingSuperCells( )
-                            ) * MappingDesc::SuperCellSize::toRT( );
-                            int const index_y = frameCellOffset.y( );
-                            auto const globalCellOffset = globalOffset
-                                                        + localSupercellStart
-                                                        + frameCellOffset;
-                            float_X const posX = ( float_X( globalCellOffset.x( ) ) + pos.x( ) ) * cellSize.x( );
-
-                            atomicAdd(
-                                &( shCount_e[ index_y ] ),
-                                normedWeighting,
-                                ::alpaka::hierarchy::Threads{ }
-                            );
-                            //weighted sum of single Electron values (Momentum = particle_momentum/weighting)
-                            atomicAdd(
-                                &( shSumMom2[ index_y ] ),
-                                mom.x( ) * mom.x( ) * normedWeighting,
-                                ::alpaka::hierarchy::Threads{ }
-                            );
-                            atomicAdd(
-                                &( shSumPos2[ index_y ] ),
-                                posX * posX * normedWeighting,
-                                ::alpaka::hierarchy::Threads{ }
-                            );
-                            atomicAdd(
-                                &( shSumMomPos[ index_y ] ),
-                                mom.x( ) * posX * normedWeighting,
-                                ::alpaka::hierarchy::Threads{ }
-                            );
-                        }
+                        float_X const weighting = particle[weighting_];
+                        float_X const normedWeighting
+                            = weighting / float_X(particles::TYPICAL_NUM_PARTICLES_PER_MACROPARTICLE);
+                        float3_X const mom = particle[momentum_] / weighting;
+                        floatD_X const pos = particle[position_];
+                        lcellId_t const cellIdx = particle[localCellIdx_];
+                        DataSpace<simDim> const frameCellOffset(
+                            DataSpaceOperations<simDim>::template map<MappingDesc::SuperCellSize>(cellIdx));
+                        auto const localSupercellStart
+                            = (superCellIdx - mapper.getGuardingSuperCells()) * MappingDesc::SuperCellSize::toRT();
+                        int const index_y = frameCellOffset.y();
+                        auto const globalCellOffset = globalOffset + localSupercellStart + frameCellOffset;
+                        float_X const posX = (float_X(globalCellOffset.x()) + pos.x()) * cellSize.x();
+
+                        cupla::atomicAdd(acc, &(shCount_e[index_y]), normedWeighting, ::alpaka::hierarchy::Threads{});
+                        // weighted sum of single Electron values (Momentum = particle_momentum/weighting)
+                        cupla::atomicAdd(
+                            acc,
+                            &(shSumMom2[index_y]),
+                            mom.x() * mom.x() * normedWeighting,
+                            ::alpaka::hierarchy::Threads{});
+                        cupla::atomicAdd(
+                            acc,
+                            &(shSumPos2[index_y]),
+                            posX * posX * normedWeighting,
+                            ::alpaka::hierarchy::Threads{});
+                        cupla::atomicAdd(
+                            acc,
+                            &(shSumMomPos[index_y]),
+                            mom.x() * posX * normedWeighting,
+                            ::alpaka::hierarchy::Threads{});
                     }
-                );
+                });
 
                 // set frame to next particle frame
-                frame = pb.getPreviousFrame( frame );
-                forEachParticle(
-                    [ & ](
-                        uint32_t const linearIdx,
-                        uint32_t const idx
-                    )
-                    {
-                        /* Update particle for the next round.
-                         * The frame list is traversed from the last to the first frame.
-                         * Only the last frame can contain gaps therefore all following
-                         * frames are fully filled with particles.
-                         */
-                        currentParticleCtx[ idx ] = frame[ linearIdx ];
-                    }
-                );
+                frame = pb.getPreviousFrame(frame);
+                forEachParticle([&](uint32_t const linearIdx, uint32_t const idx) {
+                    /* Update particle for the next round.
+                     * The frame list is traversed from the last to the first frame.
+                     * Only the last frame can contain gaps; therefore all following
+                     * frames are fully filled with particles.
+                     */
+                    currentParticleCtx[idx] = frame[linearIdx];
+                });
             }
 
 
             // wait that all virtual threads updated the shared memory
-            __syncthreads( );
-
-            const int gOffset = (
-                (
-                    superCellIdx -
-                    mapper.getGuardingSuperCells( )
-                ) *
-                MappingDesc::SuperCellSize::toRT( ) ).y( );
-
-            ForEachIdx< SuperCellYDom >{ workerIdx }(
-                [ & ](
-                    uint32_t const linearIdx,
-                    uint32_t const
-                )
-                {
-                    atomicAdd(
-                        &( gSumMom2[ gOffset + linearIdx ] ),
-                        static_cast< float_64 >( shSumMom2[ linearIdx ] ),
-                        ::alpaka::hierarchy::Blocks{ }
-                    );
-                    atomicAdd(
-                        &( gSumPos2[ gOffset + linearIdx ] ),
-                        static_cast< float_64 >( shSumPos2[ linearIdx ] ),
-                        ::alpaka::hierarchy::Blocks{ }
-                    );
-                    atomicAdd(
-                        &( gSumMomPos[ gOffset + linearIdx ] ),
-                        static_cast< float_64 >( shSumMomPos[ linearIdx ] ),
-                        ::alpaka::hierarchy::Blocks{ }
-                    );
-                    atomicAdd(
-                        &( gCount_e[ gOffset + linearIdx ] ),
-                        static_cast< float_64 >( shCount_e[ linearIdx ] ),
-                        ::alpaka::hierarchy::Blocks{ }
-                    );
-                }
-            );
+            cupla::__syncthreads(acc);
+
+            const int gOffset
+                = ((superCellIdx - mapper.getGuardingSuperCells()) * MappingDesc::SuperCellSize::toRT()).y();
+
+            ForEachIdx<SuperCellYDom>{workerIdx}([&](uint32_t const linearIdx, uint32_t const) {
+                cupla::atomicAdd(
+                    acc,
+                    &(gSumMom2[gOffset + linearIdx]),
+                    static_cast<float_64>(shSumMom2[linearIdx]),
+                    ::alpaka::hierarchy::Blocks{});
+                cupla::atomicAdd(
+                    acc,
+                    &(gSumPos2[gOffset + linearIdx]),
+                    static_cast<float_64>(shSumPos2[linearIdx]),
+                    ::alpaka::hierarchy::Blocks{});
+                cupla::atomicAdd(
+                    acc,
+                    &(gSumMomPos[gOffset + linearIdx]),
+                    static_cast<float_64>(shSumMomPos[linearIdx]),
+                    ::alpaka::hierarchy::Blocks{});
+                cupla::atomicAdd(
+                    acc,
+                    &(gCount_e[gOffset + linearIdx]),
+                    static_cast<float_64>(shCount_e[linearIdx]),
+                    ::alpaka::hierarchy::Blocks{});
+            });
         }
     };
 
 
-    template< typename ParticlesType >
+    template<typename ParticlesType>
     class CalcEmittance : public plugins::multi::ISlave
     {
     public:
-
         struct Help : public plugins::multi::IHelp
         {
-
             /** creates an instance of ISlave
              *
              * @tparam T_Slave type of the interface implementation (must inherit from ISlave)
              * @param help plugin defined help
              * @param id index of the plugin, range: [ 0;help->getNumPlugins( ) )
              */
-            std::shared_ptr< ISlave > create(
-                std::shared_ptr< IHelp > & help,
-                size_t const id,
-                MappingDesc* cellDescription
-            )
+            std::shared_ptr<ISlave> create(std::shared_ptr<IHelp>& help, size_t const id, MappingDesc* cellDescription)
             {
-                return std::shared_ptr< ISlave >(
-                     new CalcEmittance< ParticlesType >(
-                        help,
-                        id,
-                        cellDescription
-                    )
-                );
+                return std::shared_ptr<ISlave>(new CalcEmittance<ParticlesType>(help, id, cellDescription));
             }
 
             // find all valid filter for the current used species
-            using EligibleFilters = typename MakeSeqFromNestedSeq<
-                typename bmpl::transform<
-                    particles::filter::AllParticleFilters,
-                    particles::traits::GenerateSolversIfSpeciesEligible<
-                        bmpl::_1,
-                        ParticlesType
-                    >
-                >::type
-            >::type;
+            using EligibleFilters = typename MakeSeqFromNestedSeq<typename bmpl::transform<
+                particles::filter::AllParticleFilters,
+                particles::traits::GenerateSolversIfSpeciesEligible<bmpl::_1, ParticlesType>>::type>::type;
 
             //! periodicity of computing the particle energy
-            plugins::multi::Option< std::string > notifyPeriod = {
-                "period",
-                "compute slice emittance[for each n-th step] enable plugin by setting a non-zero value"
-            };
-            plugins::multi::Option< std::string > filter = {
-                "filter",
-                "particle filter: "
-            };
+            plugins::multi::Option<std::string> notifyPeriod
+                = {"period", "compute slice emittance[for each n-th step] enable plugin by setting a non-zero value"};
+            plugins::multi::Option<std::string> filter = {"filter", "particle filter: "};
 
             //! string list with all possible particle filters
             std::string concatenatedFilterNames;
-            std::vector< std::string > allowedFilters;
+            std::vector<std::string> allowedFilters;
 
             ///! method used by plugin controller to get --help description
             void registerHelp(
-                boost::program_options::options_description & desc,
-                std::string const & masterPrefix = std::string{ }
-            )
+                boost::program_options::options_description& desc,
+                std::string const& masterPrefix = std::string{})
             {
-                meta::ForEach<
-                    EligibleFilters,
-                    plugins::misc::AppendName< bmpl::_1 >
-                > getEligibleFilterNames;
-                getEligibleFilterNames( allowedFilters );
-
-                concatenatedFilterNames = plugins::misc::concatenateToString(
-                    allowedFilters,
-                    ", "
-                );
-
-                notifyPeriod.registerHelp(
-                    desc,
-                    masterPrefix + prefix
-                );
-                filter.registerHelp(
-                    desc,
-                    masterPrefix + prefix,
-                    std::string( "[" ) + concatenatedFilterNames + "]"
-                );
+                meta::ForEach<EligibleFilters, plugins::misc::AppendName<bmpl::_1>> getEligibleFilterNames;
+                getEligibleFilterNames(allowedFilters);
+
+                concatenatedFilterNames = plugins::misc::concatenateToString(allowedFilters, ", ");
+
+                notifyPeriod.registerHelp(desc, masterPrefix + prefix);
+                filter.registerHelp(desc, masterPrefix + prefix, std::string("[") + concatenatedFilterNames + "]");
             }
 
             void expandHelp(
-                boost::program_options::options_description & desc,
-                std::string const & masterPrefix = std::string{ }
-            )
+                boost::program_options::options_description& desc,
+                std::string const& masterPrefix = std::string{})
             {
             }
 
 
-            void validateOptions( )
+            void validateOptions()
             {
-                if( notifyPeriod.size( ) != filter.size( ) )
-                    throw std::runtime_error( name + ": parameter filter and period are not used the same number of times" );
+                if(notifyPeriod.size() != filter.size())
+                    throw std::runtime_error(
+                        name + ": parameter filter and period are not used the same number of times");
 
                 // check if user passed filter name is valid
-                for( auto const & filterName : filter )
+                for(auto const& filterName : filter)
                 {
-                    if(
-                        std::find(
-                            allowedFilters.begin( ),
-                            allowedFilters.end( ),
-                            filterName
-                        ) == allowedFilters.end( )
-                    )
+                    if(std::find(allowedFilters.begin(), allowedFilters.end(), filterName) == allowedFilters.end())
                     {
-                        throw std::runtime_error( name + ": unknown filter '" + filterName + "'" );
+                        throw std::runtime_error(name + ": unknown filter '" + filterName + "'");
                     }
                 }
             }
 
-            size_t getNumPlugins( ) const
+            size_t getNumPlugins() const
             {
-                return notifyPeriod.size( );
+                return notifyPeriod.size();
             }
 
-            std::string getDescription( ) const
+            std::string getDescription() const
             {
                 return description;
             }
 
-            std::string getOptionPrefix( ) const
+            std::string getOptionPrefix() const
             {
                 return prefix;
             }
 
-            std::string getName( ) const
+            std::string getName() const
             {
                 return name;
             }
@@ -467,25 +329,21 @@ namespace picongpu
             //! short description of the plugin
             std::string const description = "calculate the slice emittance of a species";
             //! prefix used for command line arguments
-            std::string const prefix = ParticlesType::FrameType::getName( ) + std::string( "_emittance" );
+            std::string const prefix = ParticlesType::FrameType::getName() + std::string("_emittance");
         };
 
         //! must be implemented by the user
-        static std::shared_ptr< plugins::multi::IHelp > getHelp( )
+        static std::shared_ptr<plugins::multi::IHelp> getHelp()
         {
-            return std::shared_ptr< plugins::multi::IHelp >( new Help{ } );
+            return std::shared_ptr<plugins::multi::IHelp>(new Help{});
         }
 
-        CalcEmittance(
-            std::shared_ptr< plugins::multi::IHelp > & help,
-            size_t const id,
-            MappingDesc* cellDescription
-        ) :
-            m_help( std::static_pointer_cast< Help >( help ) ),
-            m_id( id ),
-            m_cellDescription( cellDescription )
+        CalcEmittance(std::shared_ptr<plugins::multi::IHelp>& help, size_t const id, MappingDesc* cellDescription)
+            : m_help(std::static_pointer_cast<Help>(help))
+            , m_id(id)
+            , m_cellDescription(cellDescription)
         {
-            filename = m_help->getOptionPrefix( ) + "_" + m_help->filter.get( m_id ) + ".dat";
+            filename = m_help->getOptionPrefix() + "_" + m_help->filter.get(m_id) + ".dat";
 
             // reduce in same x-z plane
             constexpr uint32_t r_element = 1u; // y-direction
@@ -496,462 +354,388 @@ namespace picongpu
              *                         spatial x and z direction to node with
              *                         lowest x and z position ("corner") and same x range
              */
-            pmacc::GridController< simDim >& gc = pmacc::Environment< simDim >::get( ).GridController( );
-            pmacc::math::Size_t< simDim > gpuDim = gc.getGpuNodes( );
-            pmacc::math::Int< simDim > gpuPos = gc.getPosition( );
+            pmacc::GridController<simDim>& gc = pmacc::Environment<simDim>::get().GridController();
+            pmacc::math::Size_t<simDim> gpuDim = gc.getGpuNodes();
+            pmacc::math::Int<simDim> gpuPos = gc.getPosition();
 
             /* my plane means: the r_element I am calculating should be 1GPU in width */
-            pmacc::math::Size_t< simDim > sizeTransversalPlane( gpuDim );
-            sizeTransversalPlane[ r_element ] = 1;
+            pmacc::math::Size_t<simDim> sizeTransversalPlane(gpuDim);
+            sizeTransversalPlane[r_element] = 1;
 
             // avoid deadlock for following, blocking MPI operations
-            __getTransactionEvent( ).waitForFinished( );
+            __getTransactionEvent().waitForFinished();
 
-            for( int planePos = 0; planePos <= ( int )gpuDim[ r_element ]; ++planePos )
+            for(int planePos = 0; planePos <= (int) gpuDim[r_element]; ++planePos)
             {
                 /* my plane means: the offset for the transversal plane to my r_element
                  * should be zero
                  */
-                pmacc::math::Int< simDim > longOffset( pmacc::math::Int< simDim >::create( 0 ) );
-                longOffset[ r_element ] = planePos;
+                pmacc::math::Int<simDim> longOffset(pmacc::math::Int<simDim>::create(0));
+                longOffset[r_element] = planePos;
 
-                zone::SphericZone< simDim > zoneTransversalPlane( sizeTransversalPlane, longOffset );
+                zone::SphericZone<simDim> zoneTransversalPlane(sizeTransversalPlane, longOffset);
 
                 /* Am I the lowest GPU in my plane? */
                 bool isGroupRoot = false;
-                bool isInGroup = ( gpuPos[ r_element ] == planePos );
-                if( isInGroup )
+                bool isInGroup = (gpuPos[r_element] == planePos);
+                if(isInGroup)
                 {
-                    pmacc::math::Int< simDim > inPlaneGPU( gpuPos );
-                    inPlaneGPU[ r_element ] = 0;
-                    if( inPlaneGPU == pmacc::math::Int< simDim >::create( 0 ) )
+                    pmacc::math::Int<simDim> inPlaneGPU(gpuPos);
+                    inPlaneGPU[r_element] = 0;
+                    if(inPlaneGPU == pmacc::math::Int<simDim>::create(0))
                         isGroupRoot = true;
                 }
-                algorithm::mpi::Reduce< simDim >* createReduce =
-                    new algorithm::mpi::Reduce< simDim >( zoneTransversalPlane,
-                                                        isGroupRoot );
-                if( isInGroup )
+                algorithm::mpi::Reduce<simDim>* createReduce
+                    = new algorithm::mpi::Reduce<simDim>(zoneTransversalPlane, isGroupRoot);
+                if(isInGroup)
                 {
                     planeReduce = createReduce;
                     isPlaneReduceRoot = isGroupRoot;
                 }
                 else
-                    __delete( createReduce );
+                    __delete(createReduce);
             }
 
             /* Create communicator with ranks of each plane reduce root */
             {
                 /* Array with root ranks of the planeReduce operations */
-                std::vector< int > planeReduceRootRanks( gc.getGlobalSize( ), -1 );
+                std::vector<int> planeReduceRootRanks(gc.getGlobalSize(), -1);
                 /* Am I one of the planeReduce root ranks? my global rank : -1 */
-                int myRootRank = gc.getGlobalRank( ) * isPlaneReduceRoot
-                    - ( !isPlaneReduceRoot );
+                int myRootRank = gc.getGlobalRank() * isPlaneReduceRoot - (!isPlaneReduceRoot);
 
                 MPI_Group world_group, new_group;
-                MPI_CHECK(
-                    MPI_Allgather(
-                        &myRootRank,
-                        1,
-                        MPI_INT,
-                        planeReduceRootRanks.data( ),
-                        1,
-                        MPI_INT,
-                        gc.getCommunicator().getMPIComm()
-                    )
-                );
+                MPI_CHECK(MPI_Allgather(
+                    &myRootRank,
+                    1,
+                    MPI_INT,
+                    planeReduceRootRanks.data(),
+                    1,
+                    MPI_INT,
+                    gc.getCommunicator().getMPIComm()));
 
                 /* remove all non-roots (-1 values) */
-                std::sort( planeReduceRootRanks.begin( ), planeReduceRootRanks.end( ) );
-                std::vector< int > ranks(
-                    std::lower_bound(
-                        planeReduceRootRanks.begin( ),
-                        planeReduceRootRanks.end( ),
-                        0
-                    ),
-                    planeReduceRootRanks.end( )
-                );
-
-                MPI_CHECK( MPI_Comm_group( gc.getCommunicator().getMPIComm(), &world_group ) );
-                MPI_CHECK( MPI_Group_incl( world_group, ranks.size( ), ranks.data( ), &new_group ) );
-                MPI_CHECK( MPI_Comm_create( gc.getCommunicator().getMPIComm(), new_group, &commGather ) );
-                MPI_CHECK( MPI_Group_free( &new_group ) );
-                MPI_CHECK( MPI_Group_free( &world_group ) );
+                std::sort(planeReduceRootRanks.begin(), planeReduceRootRanks.end());
+                std::vector<int> ranks(
+                    std::lower_bound(planeReduceRootRanks.begin(), planeReduceRootRanks.end(), 0),
+                    planeReduceRootRanks.end());
+
+                MPI_CHECK(MPI_Comm_group(gc.getCommunicator().getMPIComm(), &world_group));
+                MPI_CHECK(MPI_Group_incl(world_group, ranks.size(), ranks.data(), &new_group));
+                MPI_CHECK(MPI_Comm_create(gc.getCommunicator().getMPIComm(), new_group, &commGather));
+                MPI_CHECK(MPI_Group_free(&new_group));
+                MPI_CHECK(MPI_Group_free(&world_group));
             }
 
             // decide which MPI-rank writes output
             int gatherRank = -1;
-            if( commGather != MPI_COMM_NULL )
-                MPI_CHECK( MPI_Comm_rank( commGather, &gatherRank ) );
-            writeToFile = ( gatherRank == 0 );
-
-            const SubGrid< simDim >& subGrid = Environment< simDim >::get( ).SubGrid( );
-            gSumMom2 = new GridBuffer<
-                float_64,
-                DIM1
-            >( DataSpace< DIM1 >( subGrid.getLocalDomain( ).size.y( ) ) );
-            gSumPos2 = new GridBuffer<
-                float_64,
-                DIM1
-            >( DataSpace< DIM1 >( subGrid.getLocalDomain( ).size.y( ) ) );
-            gSumMomPos = new GridBuffer<
-                float_64,
-                DIM1
-            >( DataSpace< DIM1 >( subGrid.getLocalDomain( ).size.y( ) ) );
-            gCount_e = new GridBuffer<
-                float_64,
-                DIM1
-            >( DataSpace< DIM1 >( subGrid.getLocalDomain( ).size.y( ) ) );
+            if(commGather != MPI_COMM_NULL)
+                MPI_CHECK(MPI_Comm_rank(commGather, &gatherRank));
+            writeToFile = (gatherRank == 0);
+
+            const SubGrid<simDim>& subGrid = Environment<simDim>::get().SubGrid();
+            gSumMom2 = new GridBuffer<float_64, DIM1>(DataSpace<DIM1>(subGrid.getLocalDomain().size.y()));
+            gSumPos2 = new GridBuffer<float_64, DIM1>(DataSpace<DIM1>(subGrid.getLocalDomain().size.y()));
+            gSumMomPos = new GridBuffer<float_64, DIM1>(DataSpace<DIM1>(subGrid.getLocalDomain().size.y()));
+            gCount_e = new GridBuffer<float_64, DIM1>(DataSpace<DIM1>(subGrid.getLocalDomain().size.y()));
 
             // only MPI rank that writes to file
-            if( writeToFile )
+            if(writeToFile)
             {
                 // open output file
-                outFile.open(
-                    filename.c_str( ),
-                    std::ofstream::out | std::ostream::trunc
-                );
+                outFile.open(filename.c_str(), std::ofstream::out | std::ostream::trunc);
 
                 // error handling
-                if( !outFile )
+                if(!outFile)
                 {
-                    std::cerr <<
-                        "Can't open file [" <<
-                        filename <<
-                        "] for output, diasble plugin output. " <<
-                        std::endl;
+                    std::cerr << "Can't open file [" << filename << "] for output, diasble plugin output. "
+                              << std::endl;
                     writeToFile = false;
                 }
             }
 
             // set how often the plugin should be executed while PIConGPU is running
-            Environment< >::get( ).PluginConnector( ).setNotificationPeriod(
-                this,
-                m_help->notifyPeriod.get( id )
-            );
+            Environment<>::get().PluginConnector().setNotificationPeriod(this, m_help->notifyPeriod.get(id));
         }
 
-        virtual ~CalcEmittance( )
+        virtual ~CalcEmittance()
         {
-            if( writeToFile )
+            if(writeToFile)
             {
                 // flush cached data to file
-                outFile.flush( ) << std::endl;
+                outFile.flush() << std::endl;
 
-                if( outFile.fail( ) )
+                if(outFile.fail())
                     std::cerr << "Error on flushing file [" << filename << "]. " << std::endl;
-                outFile.close( );
+                outFile.close();
             }
             // free global memory on GPU
-            __delete( gSumMom2 );
-            __delete( gSumPos2 );
-            __delete( gSumMomPos );
-            __delete( gCount_e );
+            __delete(gSumMom2);
+            __delete(gSumPos2);
+            __delete(gSumMomPos);
+            __delete(gCount_e);
         }
 
         /** this code is executed if the current time step is supposed to compute
          * gSumMom2, gSumPos2, gSumMomPos, gCount_e
          */
-        void notify( uint32_t currentStep )
+        void notify(uint32_t currentStep)
         {
             // call the method that calls the plugin kernel
-            calculateCalcEmittance < CORE + BORDER > ( currentStep );
+            calculateCalcEmittance<CORE + BORDER>(currentStep);
         }
 
 
-        void restart(
-            uint32_t restartStep,
-            std::string const & restartDirectory
-        )
+        void restart(uint32_t restartStep, std::string const& restartDirectory)
         {
-            if( !writeToFile )
+            if(!writeToFile)
                 return;
 
-            writeToFile = restoreTxtFile(
-                outFile,
-                filename,
-                restartStep,
-                restartDirectory
-            );
+            writeToFile = restoreTxtFile(outFile, filename, restartStep, restartDirectory);
         }
 
-        void checkpoint(
-            uint32_t currentStep,
-            std::string const & checkpointDirectory
-        )
+        void checkpoint(uint32_t currentStep, std::string const& checkpointDirectory)
         {
-            if( !writeToFile )
+            if(!writeToFile)
                 return;
 
-            checkpointTxtFile(
-                outFile,
-                filename,
-                currentStep,
-                checkpointDirectory
-            );
+            checkpointTxtFile(outFile, filename, currentStep, checkpointDirectory);
         }
 
     private:
         //! method to call analysis and plugin-kernel calls
-        template< uint32_t AREA >
-        void calculateCalcEmittance( uint32_t currentStep )
+        template<uint32_t AREA>
+        void calculateCalcEmittance(uint32_t currentStep)
         {
-            DataConnector &dc = Environment< >::get( ).DataConnector( );
+            DataConnector& dc = Environment<>::get().DataConnector();
 
             // use data connector to get particle data
-            auto particles = dc.get< ParticlesType >(
-                ParticlesType::FrameType::getName( ),
-                true
-            );
-
-            gSumMom2->getDeviceBuffer( ).setValue( 0.0 );
-            gSumPos2->getDeviceBuffer( ).setValue( 0.0 );
-            gSumMomPos->getDeviceBuffer( ).setValue( 0.0 );
-            gCount_e->getDeviceBuffer( ).setValue( 0.0 );
-
-            constexpr uint32_t numWorkers = pmacc::traits::GetNumWorkers<
-                pmacc::math::CT::volume< SuperCellSize >::type::value
-            >::value;
-
-            AreaMapping<
-                AREA,
-                MappingDesc
-            > mapper( *m_cellDescription );
-
-            auto kernel = PMACC_KERNEL( KernelCalcEmittance< numWorkers >{ } )(
-                mapper.getGridDim( ),
-                numWorkers
-            );
+            auto particles = dc.get<ParticlesType>(ParticlesType::FrameType::getName(), true);
+
+            gSumMom2->getDeviceBuffer().setValue(0.0);
+            gSumPos2->getDeviceBuffer().setValue(0.0);
+            gSumMomPos->getDeviceBuffer().setValue(0.0);
+            gCount_e->getDeviceBuffer().setValue(0.0);
+
+            constexpr uint32_t numWorkers
+                = pmacc::traits::GetNumWorkers<pmacc::math::CT::volume<SuperCellSize>::type::value>::value;
+
+            AreaMapping<AREA, MappingDesc> mapper(*m_cellDescription);
+
+            auto kernel = PMACC_KERNEL(KernelCalcEmittance<numWorkers>{})(mapper.getGridDim(), numWorkers);
 
             // Some variables required so that it is possible for the kernel
             // to calculate the absolute position of the particles
-            DataSpace< simDim > localSize( m_cellDescription->getGridLayout( ).getDataSpaceWithoutGuarding( ) );
-            const SubGrid< simDim >& subGrid = Environment< simDim >::get( ).SubGrid( );
-            const int subGridY = subGrid.getGlobalDomain( ).size.y( );
-            auto movingWindow = MovingWindow::getInstance( ).getWindow( currentStep );
-            DataSpace< simDim > globalOffset( subGrid.getLocalDomain( ).offset );
+            DataSpace<simDim> localSize(m_cellDescription->getGridLayout().getDataSpaceWithoutGuarding());
+            const SubGrid<simDim>& subGrid = Environment<simDim>::get().SubGrid();
+            const int subGridY = subGrid.getGlobalDomain().size.y();
+            auto movingWindow = MovingWindow::getInstance().getWindow(currentStep);
+            DataSpace<simDim> globalOffset(subGrid.getLocalDomain().offset);
 
             auto binaryKernel = std::bind(
                 kernel,
-                particles->getDeviceParticlesBox( ),
-                gSumMom2->getDeviceBuffer( ).getDataBox( ),
-                gSumPos2->getDeviceBuffer( ).getDataBox( ),
-                gSumMomPos->getDeviceBuffer( ).getDataBox( ),
-                gCount_e->getDeviceBuffer( ).getDataBox( ),
+                particles->getDeviceParticlesBox(),
+                gSumMom2->getDeviceBuffer().getDataBox(),
+                gSumPos2->getDeviceBuffer().getDataBox(),
+                gSumMomPos->getDeviceBuffer().getDataBox(),
+                gCount_e->getDeviceBuffer().getDataBox(),
                 globalOffset,
                 subGridY,
                 mapper,
-                std::placeholders::_1
-            );
-
-            meta::ForEach<
-                typename Help::EligibleFilters,
-                plugins::misc::ExecuteIfNameIsEqual< bmpl::_1 >
-            >{ }(
-                m_help->filter.get( m_id ),
+                std::placeholders::_1);
+
+            meta::ForEach<typename Help::EligibleFilters, plugins::misc::ExecuteIfNameIsEqual<bmpl::_1>>{}(
+                m_help->filter.get(m_id),
                 currentStep,
-                binaryKernel
-            );
+                binaryKernel);
 
-            dc.releaseData( ParticlesType::FrameType::getName( ) );
+            dc.releaseData(ParticlesType::FrameType::getName());
 
             // get gSum, ... from GPU
-            gSumMom2->deviceToHost( );
-            gSumPos2->deviceToHost( );
-            gSumMomPos->deviceToHost( );
-            gCount_e->deviceToHost( );
-
-            container::HostBuffer< float_64, DIM1 > reducedSumMom2( subGrid.getLocalDomain( ).size.y( ) );
-            container::HostBuffer< float_64, DIM1 > reducedSumPos2( subGrid.getLocalDomain( ).size.y( ) );
-            container::HostBuffer< float_64, DIM1 > reducedSumMomPos( subGrid.getLocalDomain( ).size.y( ) );
-            container::HostBuffer< float_64, DIM1 > reducedCount_e( subGrid.getLocalDomain( ).size.y( ) );
-            reducedSumMom2.assign( 0.0 );
-            reducedSumPos2.assign( 0.0 );
-            reducedSumMomPos.assign( 0.0 );
-            reducedCount_e.assign( 0.0 );
+            gSumMom2->deviceToHost();
+            gSumPos2->deviceToHost();
+            gSumMomPos->deviceToHost();
+            gCount_e->deviceToHost();
+
+            container::HostBuffer<float_64, DIM1> reducedSumMom2(subGrid.getLocalDomain().size.y());
+            container::HostBuffer<float_64, DIM1> reducedSumPos2(subGrid.getLocalDomain().size.y());
+            container::HostBuffer<float_64, DIM1> reducedSumMomPos(subGrid.getLocalDomain().size.y());
+            container::HostBuffer<float_64, DIM1> reducedCount_e(subGrid.getLocalDomain().size.y());
+            reducedSumMom2.assign(0.0);
+            reducedSumPos2.assign(0.0);
+            reducedSumMomPos.assign(0.0);
+            reducedCount_e.assign(0.0);
 
             // add gSum values from all GPUs using MPI
-            planeReduce->template operator( )( /* parameters: dest, source */
-                reducedSumMom2,
-                gSumMom2->getHostBuffer( ).cartBuffer( ),
-                /* the functors return value will be written to dst */
-                pmacc::algorithm::functor::Add( )
-            );
-            planeReduce->template operator( )( /* parameters: dest, source */
-                reducedSumPos2,
-                gSumPos2->getHostBuffer( ).cartBuffer( ),
-                /* the functors return value will be written to dst */
-                pmacc::algorithm::functor::Add( )
-            );
-            planeReduce->template operator( )( /* parameters: dest, source */
-                reducedSumMomPos,
-                gSumMomPos->getHostBuffer( ).cartBuffer( ),
-                /* the functors return value will be written to dst */
-                pmacc::algorithm::functor::Add( )
-            );
-            planeReduce->template operator( )( /* parameters: dest, source */
-                reducedCount_e,
-                gCount_e->getHostBuffer( ).cartBuffer( ),
-                /* the functors return value will be written to dst */
-                pmacc::algorithm::functor::Add( )
-            );
+            planeReduce->template operator()(/* parameters: dest, source */
+                                             reducedSumMom2,
+                                             gSumMom2->getHostBuffer().cartBuffer(),
+                                             /* the functors return value will be written to dst */
+                                             pmacc::algorithm::functor::Add());
+            planeReduce->template operator()(/* parameters: dest, source */
+                                             reducedSumPos2,
+                                             gSumPos2->getHostBuffer().cartBuffer(),
+                                             /* the functors return value will be written to dst */
+                                             pmacc::algorithm::functor::Add());
+            planeReduce->template operator()(/* parameters: dest, source */
+                                             reducedSumMomPos,
+                                             gSumMomPos->getHostBuffer().cartBuffer(),
+                                             /* the functors return value will be written to dst */
+                                             pmacc::algorithm::functor::Add());
+            planeReduce->template operator()(/* parameters: dest, source */
+                                             reducedCount_e,
+                                             gCount_e->getHostBuffer().cartBuffer(),
+                                             /* the functors return value will be written to dst */
+                                             pmacc::algorithm::functor::Add());
 
             /** all non-reduce-root processes are done now */
-            if( ! isPlaneReduceRoot )
+            if(!isPlaneReduceRoot)
                 return;
 
             // gather to file writer
-            container::HostBuffer< float_64, DIM1 > globalSumMom2( subGrid.getGlobalDomain( ).size.y( ) );
-            container::HostBuffer< float_64, DIM1 > globalSumPos2( subGrid.getGlobalDomain( ).size.y( ) );
-            container::HostBuffer< float_64, DIM1 > globalSumMomPos( subGrid.getGlobalDomain( ).size.y( ) );
-            container::HostBuffer< float_64, DIM1 > globalCount_e( subGrid.getGlobalDomain( ).size.y( ) );
+            container::HostBuffer<float_64, DIM1> globalSumMom2(subGrid.getGlobalDomain().size.y());
+            container::HostBuffer<float_64, DIM1> globalSumPos2(subGrid.getGlobalDomain().size.y());
+            container::HostBuffer<float_64, DIM1> globalSumMomPos(subGrid.getGlobalDomain().size.y());
+            container::HostBuffer<float_64, DIM1> globalCount_e(subGrid.getGlobalDomain().size.y());
 
             // gather y offsets, so we can store our gathered data in the right order
             int gatherSize = -1;
-            MPI_CHECK( MPI_Comm_size( commGather, &gatherSize ) );
-            std::vector< int > y_offsets( gatherSize );
-            std::vector< int > y_sizes( gatherSize );
-            long int const y_off = subGrid.getLocalDomain( ).offset.y( );
-            int const y_siz = subGrid.getLocalDomain( ).size.y( );
-
-            MPI_CHECK( MPI_Gather(
-                &y_off,
-                1,
-                MPI_INT,
-                y_offsets.data( ),
-                1,
-                MPI_INT,
-                0,
-                commGather
-            ) );
-            MPI_CHECK( MPI_Gather(
-                &y_siz,
-                1,
-                MPI_INT,
-                y_sizes.data( ),
-                1,
-                MPI_INT,
-                0,
-                commGather
-            ) );
+            MPI_CHECK(MPI_Comm_size(commGather, &gatherSize));
+            std::vector<int> y_offsets(gatherSize);
+            std::vector<int> y_sizes(gatherSize);
+            long int const y_off = subGrid.getLocalDomain().offset.y();
+            int const y_siz = subGrid.getLocalDomain().size.y();
 
+            MPI_CHECK(MPI_Gather(&y_off, 1, MPI_INT, y_offsets.data(), 1, MPI_INT, 0, commGather));
+            MPI_CHECK(MPI_Gather(&y_siz, 1, MPI_INT, y_sizes.data(), 1, MPI_INT, 0, commGather));
 
-            std::vector< int > recvcounts( gatherSize, 1 );
 
-            MPI_CHECK( MPI_Gatherv(
-                reducedSumMom2.getDataPointer( ),
-                subGrid.getLocalDomain( ).size.y( ),
+            std::vector<int> recvcounts(gatherSize, 1);
+
+            MPI_CHECK(MPI_Gatherv(
+                reducedSumMom2.getDataPointer(),
+                subGrid.getLocalDomain().size.y(),
                 MPI_DOUBLE,
-                globalSumMom2.getDataPointer( ),
-                y_sizes.data( ),
-                y_offsets.data( ),
+                globalSumMom2.getDataPointer(),
+                y_sizes.data(),
+                y_offsets.data(),
                 MPI_DOUBLE,
                 0,
-                commGather
-            ) );
-            MPI_CHECK( MPI_Gatherv(
-                reducedSumPos2.getDataPointer( ),
-                subGrid.getLocalDomain( ).size.y( ),
+                commGather));
+            MPI_CHECK(MPI_Gatherv(
+                reducedSumPos2.getDataPointer(),
+                subGrid.getLocalDomain().size.y(),
                 MPI_DOUBLE,
-                globalSumPos2.getDataPointer( ),
-                y_sizes.data( ),
-                y_offsets.data( ),
+                globalSumPos2.getDataPointer(),
+                y_sizes.data(),
+                y_offsets.data(),
                 MPI_DOUBLE,
                 0,
-                commGather
-            ) );
-            MPI_CHECK( MPI_Gatherv(
-                reducedSumMomPos.getDataPointer( ),
-                subGrid.getLocalDomain( ).size.y( ),
+                commGather));
+            MPI_CHECK(MPI_Gatherv(
+                reducedSumMomPos.getDataPointer(),
+                subGrid.getLocalDomain().size.y(),
                 MPI_DOUBLE,
-                globalSumMomPos.getDataPointer( ),
-                y_sizes.data( ),
-                y_offsets.data( ),
+                globalSumMomPos.getDataPointer(),
+                y_sizes.data(),
+                y_offsets.data(),
                 MPI_DOUBLE,
                 0,
-                commGather
-            ) );
-            MPI_CHECK( MPI_Gatherv(
-                reducedCount_e.getDataPointer( ),
-                subGrid.getLocalDomain( ).size.y( ),
+                commGather));
+            MPI_CHECK(MPI_Gatherv(
+                reducedCount_e.getDataPointer(),
+                subGrid.getLocalDomain().size.y(),
                 MPI_DOUBLE,
-                globalCount_e.getDataPointer( ),
-                y_sizes.data( ),
-                y_offsets.data( ),
+                globalCount_e.getDataPointer(),
+                y_sizes.data(),
+                y_offsets.data(),
                 MPI_DOUBLE,
                 0,
-                commGather
-            ) );
+                commGather));
 
             /* print timestep, emittance to file: */
-            if( writeToFile )
+            if(writeToFile)
             {
-                using dbl = std::numeric_limits< float_64 >;
-                outFile.precision( dbl::digits10 );
-                if ( currentStep > 0.0 ){
-                    int startWindow_y = movingWindow.globalDimensions.offset.y( );
-                    int endWindow_y = movingWindow.globalDimensions.size.y( ) + startWindow_y;
-                    if ( fisttimestep == true )
+                using dbl = std::numeric_limits<float_64>;
+                outFile.precision(dbl::digits10);
+                if(currentStep > 0.0)
+                {
+                    int startWindow_y = movingWindow.globalDimensions.offset.y();
+                    int endWindow_y = movingWindow.globalDimensions.size.y() + startWindow_y;
+                    if(fisttimestep == true)
                     {
                         outFile << "#step emit_all" << std::scientific;
-                        for ( int i = startWindow_y; i < ( endWindow_y + 10 ); i += 10 )
+                        for(int i = startWindow_y; i < (endWindow_y + 10); i += 10)
                         {
                             outFile << " " << i * SI::CELL_HEIGHT_SI;
                         }
                         outFile << std::endl;
                         fisttimestep = false;
                     }
-                    outFile << currentStep << " "
-                            << std::scientific;
+                    outFile << currentStep << " " << std::scientific;
 
                     long double numElec_all = 0.0;
                     long double ux2_all = 0.0;
                     long double pos2_SI_all = 0.0;
                     long double xux_all = 0.0;
 
-                    for ( int i = startWindow_y; i < endWindow_y; i++ )
+                    for(int i = startWindow_y; i < endWindow_y; i++)
                     {
-                            numElec_all += static_cast< long double >( globalCount_e.getDataPointer( )[ i ] );
-                            ux2_all += static_cast< long double >( globalSumMom2.getDataPointer( )[ i ] ) * UNIT_MASS * UNIT_MASS / ( SI::ELECTRON_MASS_SI * SI::ELECTRON_MASS_SI );
-                            pos2_SI_all += static_cast< long double >( globalSumPos2.getDataPointer( )[ i ] ) * UNIT_LENGTH * UNIT_LENGTH ;
-                            xux_all += static_cast< long double >( globalSumMomPos.getDataPointer( )[ i ] ) * UNIT_MASS * UNIT_LENGTH / SI::ELECTRON_MASS_SI;
+                        numElec_all += static_cast<long double>(globalCount_e.getDataPointer()[i]);
+                        ux2_all += static_cast<long double>(globalSumMom2.getDataPointer()[i]) * UNIT_MASS * UNIT_MASS
+                            / (SI::ELECTRON_MASS_SI * SI::ELECTRON_MASS_SI);
+                        pos2_SI_all
+                            += static_cast<long double>(globalSumPos2.getDataPointer()[i]) * UNIT_LENGTH * UNIT_LENGTH;
+                        xux_all += static_cast<long double>(globalSumMomPos.getDataPointer()[i]) * UNIT_MASS
+                            * UNIT_LENGTH / SI::ELECTRON_MASS_SI;
                     }
-                    /* the scaling with normalized weighting (weighting / particles::TYPICAL_NUM_PARTICLES_PER_MACROPARTICLE)
-                     * is compendated by the division by (normalized) number of particles
+                    /* the scaling with normalized weighting (weighting /
+                     * particles::TYPICAL_NUM_PARTICLES_PER_MACROPARTICLE) is compensated by the division by
+                     * the (normalized) number of particles
                      */
-                    float_64 emit_all = algorithms::math::sqrt(
-                        static_cast< float_64 >( pos2_SI_all ) * static_cast< float_64 >( ux2_all ) -
-                        static_cast< float_64 >( xux_all ) * static_cast< float_64 >( xux_all )
-                    ) / static_cast< float_64 >( numElec_all );
+                    float_64 emit_all = math::sqrt(
+                                            static_cast<float_64>(pos2_SI_all) * static_cast<float_64>(ux2_all)
+                                            - static_cast<float_64>(xux_all) * static_cast<float_64>(xux_all))
+                        / static_cast<float_64>(numElec_all);
 
-                    if ( emit_all > 0.0 ){
+                    if(emit_all > 0.0)
+                    {
                         outFile << emit_all << " ";
                     }
-                    else {
+                    else
+                    {
                         outFile << "0.0 ";
                     }
 
-                    for ( int i = startWindow_y; i < endWindow_y; i += 10 )
+                    for(int i = startWindow_y; i < endWindow_y; i += 10)
                     {
-                        float_64 numElec = globalCount_e.getDataPointer( )[ i ];
-                        float_64 mom2_SI = globalSumMom2.getDataPointer( )[ i ] * UNIT_MASS * UNIT_SPEED * UNIT_MASS * UNIT_SPEED;
-                        float_64 pos2_SI = globalSumPos2.getDataPointer( )[ i ] * UNIT_LENGTH * UNIT_LENGTH ;
-                        float_64 mompos_SI = globalSumMomPos.getDataPointer( )[ i ] * UNIT_MASS * UNIT_SPEED * UNIT_LENGTH;
-                        for ( int j = i + 1; j < i + 10 && j < endWindow_y; j++ ){
-                                numElec += globalCount_e.getDataPointer( )[ j ];
-                                mom2_SI += globalSumMom2.getDataPointer( )[ j ] * UNIT_MASS * UNIT_SPEED * UNIT_MASS * UNIT_SPEED;
-                                pos2_SI += globalSumPos2.getDataPointer( )[ j ] * UNIT_LENGTH * UNIT_LENGTH;
-                                mompos_SI += globalSumMomPos.getDataPointer( )[ j ] * UNIT_MASS * UNIT_SPEED * UNIT_LENGTH;
+                        float_64 numElec = globalCount_e.getDataPointer()[i];
+                        float_64 mom2_SI
+                            = globalSumMom2.getDataPointer()[i] * UNIT_MASS * UNIT_SPEED * UNIT_MASS * UNIT_SPEED;
+                        float_64 pos2_SI = globalSumPos2.getDataPointer()[i] * UNIT_LENGTH * UNIT_LENGTH;
+                        float_64 mompos_SI
+                            = globalSumMomPos.getDataPointer()[i] * UNIT_MASS * UNIT_SPEED * UNIT_LENGTH;
+                        for(int j = i + 1; j < i + 10 && j < endWindow_y; j++)
+                        {
+                            numElec += globalCount_e.getDataPointer()[j];
+                            mom2_SI
+                                += globalSumMom2.getDataPointer()[j] * UNIT_MASS * UNIT_SPEED * UNIT_MASS * UNIT_SPEED;
+                            pos2_SI += globalSumPos2.getDataPointer()[j] * UNIT_LENGTH * UNIT_LENGTH;
+                            mompos_SI += globalSumMomPos.getDataPointer()[j] * UNIT_MASS * UNIT_SPEED * UNIT_LENGTH;
                         }
-                        float_64 ux2 = mom2_SI / ( UNIT_SPEED * UNIT_SPEED * SI::ELECTRON_MASS_SI * SI::ELECTRON_MASS_SI );
-                        float_64 xux = mompos_SI / ( UNIT_SPEED * SI::ELECTRON_MASS_SI );
-                        float_64 emit = algorithms::math::sqrt( ( pos2_SI * ux2 - xux * xux ) ) / numElec;
-                        if( numElec < std::numeric_limits< float_64 >::epsilon( ) ){
+                        float_64 ux2
+                            = mom2_SI / (UNIT_SPEED * UNIT_SPEED * SI::ELECTRON_MASS_SI * SI::ELECTRON_MASS_SI);
+                        float_64 xux = mompos_SI / (UNIT_SPEED * SI::ELECTRON_MASS_SI);
+                        float_64 emit = math::sqrt((pos2_SI * ux2 - xux * xux)) / numElec;
+                        if(numElec < std::numeric_limits<float_64>::epsilon())
+                        {
                             outFile << "0.0 ";
                         }
-                        else if( emit > 0.0 && emit < std::numeric_limits< float_64 >::max( ) ){
+                        else if(emit > 0.0 && emit < std::numeric_limits<float_64>::max())
+                        {
                             outFile << emit << " ";
                         }
-                        else{
+                        else
+                        {
                             outFile << "-0.0 ";
                         }
                     }
@@ -960,25 +744,13 @@ namespace picongpu
             }
         }
 
-        GridBuffer<
-            float_64,
-            DIM1
-        >* gSumMom2 = nullptr;
+        GridBuffer<float_64, DIM1>* gSumMom2 = nullptr;
 
-        GridBuffer<
-            float_64,
-            DIM1
-        >* gSumPos2 = nullptr;
+        GridBuffer<float_64, DIM1>* gSumPos2 = nullptr;
 
-        GridBuffer<
-            float_64,
-            DIM1
-        >* gSumMomPos = nullptr;
+        GridBuffer<float_64, DIM1>* gSumMomPos = nullptr;
 
-        GridBuffer<
-            float_64,
-            DIM1
-        >* gCount_e = nullptr;
+        GridBuffer<float_64, DIM1>* gCount_e = nullptr;
 
         MappingDesc* m_cellDescription = nullptr;
 
@@ -996,54 +768,37 @@ namespace picongpu
         bool fisttimestep = true;
 
         /** reduce functor to a single host per plane */
-        pmacc::algorithm::mpi::Reduce< simDim >* planeReduce = nullptr;
+        pmacc::algorithm::mpi::Reduce<simDim>* planeReduce = nullptr;
         bool isPlaneReduceRoot = false;
 
         /** MPI communicator that contains the root ranks of the \p planeReduce
          */
         MPI_Comm commGather = MPI_COMM_NULL;
 
-        std::shared_ptr< Help > m_help;
+        std::shared_ptr<Help> m_help;
         size_t m_id;
     };
 
-namespace particles
-{
-namespace traits
-{
-    template<
-        typename T_Species,
-        typename T_UnspecifiedSpecies
-    >
-    struct SpeciesEligibleForSolver<
-        T_Species,
-        CalcEmittance< T_UnspecifiedSpecies >
-    >
+    namespace particles
     {
-        using FrameType = typename T_Species::FrameType;
-
-        // this plugin needs at least the weighting and momentum attributes
-        using RequiredIdentifiers = MakeSeq_t<
-            weighting,
-            momentum
-        >;
-
-        using SpeciesHasIdentifiers = typename pmacc::traits::HasIdentifiers<
-            FrameType,
-            RequiredIdentifiers
-        >::type;
-
-        // and also a mass ratio for energy calculation from momentum
-        using SpeciesHasFlags = typename pmacc::traits::HasFlag<
-            FrameType,
-            massRatio< >
-        >::type;
-
-        using type = typename bmpl::and_<
-            SpeciesHasIdentifiers,
-            SpeciesHasFlags
-        >;
-    };
-} // namespace traits
-} // namespace particles
+        namespace traits
+        {
+            template<typename T_Species, typename T_UnspecifiedSpecies>
+            struct SpeciesEligibleForSolver<T_Species, CalcEmittance<T_UnspecifiedSpecies>>
+            {
+                using FrameType = typename T_Species::FrameType;
+
+                // this plugin needs at least the weighting and momentum attributes
+                using RequiredIdentifiers = MakeSeq_t<weighting, momentum>;
+
+                using SpeciesHasIdentifiers =
+                    typename pmacc::traits::HasIdentifiers<FrameType, RequiredIdentifiers>::type;
+
+                // and also a mass ratio for energy calculation from momentum
+                using SpeciesHasFlags = typename pmacc::traits::HasFlag<FrameType, massRatio<>>::type;
+
+                using type = typename bmpl::and_<SpeciesHasIdentifiers, SpeciesHasFlags>;
+            };
+        } // namespace traits
+    } // namespace particles
 } // namespace picongpu
diff --git a/include/picongpu/plugins/EnergyFields.hpp b/include/picongpu/plugins/EnergyFields.hpp
index 84ea8bde9d..764ff45498 100644
--- a/include/picongpu/plugins/EnergyFields.hpp
+++ b/include/picongpu/plugins/EnergyFields.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Felix Schmitt, Heiko Burau, Rene Widera,
+/* Copyright 2013-2021 Axel Huebl, Felix Schmitt, Heiko Burau, Rene Widera,
  *                     Benjamin Worpitz, Richard Pausch
  *
  * This file is part of PIConGPU.
@@ -45,236 +45,228 @@
 
 namespace picongpu
 {
-using namespace pmacc;
+    using namespace pmacc;
 
-namespace po = boost::program_options;
+    namespace po = boost::program_options;
 
-namespace energyFields
-{
-
-template<typename T_Type>
-struct cast64Bit
-{
-    typedef typename TypeCast<float_64, T_Type>::result result;
-
-    HDINLINE result operator()(const T_Type& value) const
-    {
-        return precisionCast<float_64>(value);
-    }
-};
-
-template<typename T_Type>
-struct squareComponentWise
-{
-    using result = T_Type;
-
-    HDINLINE result operator()(const T_Type& value) const
+    namespace energyFields
     {
-        return value*value;
-    }
-};
-
-}
-
-class EnergyFields : public ISimulationPlugin
-{
-private:
-    MappingDesc *cellDescription;
-    std::string notifyPeriod;
-
-    std::string pluginName;
-    std::string pluginPrefix;
-    std::string filename;
-    std::ofstream outFile;
-    /*only rank 0 create a file*/
-    bool writeToFile;
-
-    mpi::MPIReduce mpiReduce;
-
-    nvidia::reduce::Reduce* localReduce;
-
-    typedef promoteType<float_64, FieldB::ValueType>::type EneVectorType;
+        template<typename T_Type>
+        struct cast64Bit
+        {
+            typedef typename TypeCast<float_64, T_Type>::result result;
 
-public:
+            HDINLINE result operator()(const T_Type& value) const
+            {
+                return precisionCast<float_64>(value);
+            }
+        };
 
-    EnergyFields() :
-    cellDescription(nullptr),
-    pluginName("EnergyFields: calculate the energy of the fields"),
-    pluginPrefix(std::string("fields_energy")),
-    filename(pluginPrefix + ".dat"),
-    writeToFile(false),
-    localReduce(nullptr)
-    {
-        Environment<>::get().PluginConnector().registerPlugin(this);
-    }
+        template<typename T_Type>
+        struct squareComponentWise
+        {
+            using result = T_Type;
 
-    virtual ~EnergyFields()
-    {
+            HDINLINE result operator()(const T_Type& value) const
+            {
+                return value * value;
+            }
+        };
 
-    }
+    } // namespace energyFields
 
-    void notify(uint32_t currentStep)
+    class EnergyFields : public ISimulationPlugin
     {
-        getEnergyFields(currentStep);
-    }
+    private:
+        MappingDesc* cellDescription;
+        std::string notifyPeriod;
+
+        std::string pluginName;
+        std::string pluginPrefix;
+        std::string filename;
+        std::ofstream outFile;
+        /* only rank 0 creates a file */
+        bool writeToFile;
+
+        mpi::MPIReduce mpiReduce;
+
+        nvidia::reduce::Reduce* localReduce;
+
+        typedef promoteType<float_64, FieldB::ValueType>::type EneVectorType;
+
+    public:
+        EnergyFields()
+            : cellDescription(nullptr)
+            , pluginName("EnergyFields: calculate the energy of the fields")
+            , pluginPrefix(std::string("fields_energy"))
+            , filename(pluginPrefix + ".dat")
+            , writeToFile(false)
+            , localReduce(nullptr)
+        {
+            Environment<>::get().PluginConnector().registerPlugin(this);
+        }
 
-    void pluginRegisterHelp(po::options_description& desc)
-    {
-        desc.add_options()
-            ((pluginPrefix + ".period").c_str(),
-             po::value<std::string> (&notifyPeriod), "enable plugin [for each n-th step]");
-    }
+        virtual ~EnergyFields()
+        {
+        }
 
-    std::string pluginGetName() const
-    {
-        return pluginName;
-    }
+        void notify(uint32_t currentStep)
+        {
+            getEnergyFields(currentStep);
+        }
 
-    void setMappingDescription(MappingDesc *cellDescription)
-    {
-        this->cellDescription = cellDescription;
-    }
+        void pluginRegisterHelp(po::options_description& desc)
+        {
+            desc.add_options()(
+                (pluginPrefix + ".period").c_str(),
+                po::value<std::string>(&notifyPeriod),
+                "enable plugin [for each n-th step]");
+        }
 
-private:
+        std::string pluginGetName() const
+        {
+            return pluginName;
+        }
 
-    void pluginLoad()
-    {
-        if(!notifyPeriod.empty())
+        void setMappingDescription(MappingDesc* cellDescription)
         {
-            localReduce = new nvidia::reduce::Reduce(1024);
-            writeToFile = mpiReduce.hasResult(mpi::reduceMethods::Reduce());
+            this->cellDescription = cellDescription;
+        }
 
-            if (writeToFile)
+    private:
+        void pluginLoad()
+        {
+            if(!notifyPeriod.empty())
             {
-                outFile.open(filename.c_str(), std::ofstream::out | std::ostream::trunc);
-                if (!outFile)
+                localReduce = new nvidia::reduce::Reduce(1024);
+                writeToFile = mpiReduce.hasResult(mpi::reduceMethods::Reduce());
+
+                if(writeToFile)
                 {
-                    std::cerr << "Can't open file [" << filename << "] for output, disable plugin output. " << std::endl;
-                    writeToFile = false;
+                    outFile.open(filename.c_str(), std::ofstream::out | std::ostream::trunc);
+                    if(!outFile)
+                    {
+                        std::cerr << "Can't open file [" << filename << "] for output, disable plugin output. "
+                                  << std::endl;
+                        writeToFile = false;
+                    }
+                    // create header of the file
+                    outFile << "#step total[Joule] Bx[Joule] By[Joule] Bz[Joule] Ex[Joule] Ey[Joule] Ez[Joule]"
+                            << " \n";
                 }
-                //create header of the file
-                outFile << "#step total[Joule] Bx[Joule] By[Joule] Bz[Joule] Ex[Joule] Ey[Joule] Ez[Joule]" << " \n";
+                Environment<>::get().PluginConnector().setNotificationPeriod(this, notifyPeriod);
             }
-            Environment<>::get().PluginConnector().setNotificationPeriod(this, notifyPeriod);
         }
-    }
 
-    void pluginUnload()
-    {
-        if(!notifyPeriod.empty())
+        void pluginUnload()
         {
-            if (writeToFile)
+            if(!notifyPeriod.empty())
             {
-                outFile.flush();
-                outFile << std::endl; //now all data are written to file
-                if (outFile.fail())
-                    std::cerr << "Error on flushing file [" << filename << "]. " << std::endl;
-                outFile.close();
+                if(writeToFile)
+                {
+                    outFile.flush();
+                    outFile << std::endl; // now all data are written to file
+                    if(outFile.fail())
+                        std::cerr << "Error on flushing file [" << filename << "]. " << std::endl;
+                    outFile.close();
+                }
+                __delete(localReduce);
             }
-            __delete(localReduce);
         }
-    }
 
-    void restart(uint32_t restartStep, const std::string restartDirectory)
-    {
-        if( !writeToFile )
-            return;
+        void restart(uint32_t restartStep, const std::string restartDirectory)
+        {
+            if(!writeToFile)
+                return;
 
-        writeToFile = restoreTxtFile( outFile,
-                                      filename,
-                                      restartStep,
-                                      restartDirectory );
-    }
+            writeToFile = restoreTxtFile(outFile, filename, restartStep, restartDirectory);
+        }
 
-    void checkpoint(uint32_t currentStep, const std::string checkpointDirectory)
-    {
-        if( !writeToFile )
-            return;
+        void checkpoint(uint32_t currentStep, const std::string checkpointDirectory)
+        {
+            if(!writeToFile)
+                return;
 
-        checkpointTxtFile( outFile,
-                           filename,
-                           currentStep,
-                           checkpointDirectory );
-    }
+            checkpointTxtFile(outFile, filename, currentStep, checkpointDirectory);
+        }
 
-    void getEnergyFields(uint32_t currentStep)
-    {
-        DataConnector &dc = Environment<>::get().DataConnector();
+        void getEnergyFields(uint32_t currentStep)
+        {
+            DataConnector& dc = Environment<>::get().DataConnector();
 
-        auto fieldE = dc.get< FieldE >( FieldE::getName(), true );
-        auto fieldB = dc.get< FieldB >( FieldB::getName(), true );
+            auto fieldE = dc.get<FieldE>(FieldE::getName(), true);
+            auto fieldB = dc.get<FieldB>(FieldB::getName(), true);
 
-        /* idx == 0 -> fieldB
-         * idx == 1 -> fieldE
-         */
-        EneVectorType globalFieldEnergy[2];
-        globalFieldEnergy[0]=EneVectorType::create(0.0);
-        globalFieldEnergy[1]=EneVectorType::create(0.0);
+            /* idx == 0 -> fieldB
+             * idx == 1 -> fieldE
+             */
+            EneVectorType globalFieldEnergy[2];
+            globalFieldEnergy[0] = EneVectorType::create(0.0);
+            globalFieldEnergy[1] = EneVectorType::create(0.0);
 
-        EneVectorType localReducedFieldEnergy[2];
-        localReducedFieldEnergy[0] = reduceField(fieldB);
-        localReducedFieldEnergy[1] = reduceField(fieldE);
+            EneVectorType localReducedFieldEnergy[2];
+            localReducedFieldEnergy[0] = reduceField(fieldB);
+            localReducedFieldEnergy[1] = reduceField(fieldE);
 
-        mpiReduce(nvidia::functors::Add(),
-                  globalFieldEnergy,
-                  localReducedFieldEnergy,
-                  2,
-                  mpi::reduceMethods::Reduce());
+            mpiReduce(
+                nvidia::functors::Add(),
+                globalFieldEnergy,
+                localReducedFieldEnergy,
+                2,
+                mpi::reduceMethods::Reduce());
 
-        float_64 energyFieldBReduced=0.0;
-        float_64 energyFieldEReduced=0.0;
+            float_64 energyFieldBReduced = 0.0;
+            float_64 energyFieldEReduced = 0.0;
 
-        for(int d=0; d<FieldB::numComponents; ++d)
-        {
-            /* B field convert */
-            globalFieldEnergy[0][d] *= float_64(0.5 / MUE0 * CELL_VOLUME);
-            /* E field convert */
-            globalFieldEnergy[1][d] *= float_64(EPS0 * CELL_VOLUME * 0.5);
-
-            /* add all to one */
-            energyFieldBReduced+= globalFieldEnergy[0][d];
-            energyFieldEReduced+= globalFieldEnergy[1][d];
-        }
+            for(int d = 0; d < FieldB::numComponents; ++d)
+            {
+                /* B field convert */
+                globalFieldEnergy[0][d] *= float_64(0.5 / MUE0 * CELL_VOLUME);
+                /* E field convert */
+                globalFieldEnergy[1][d] *= float_64(EPS0 * CELL_VOLUME * 0.5);
+
+                /* add all to one */
+                energyFieldBReduced += globalFieldEnergy[0][d];
+                energyFieldEReduced += globalFieldEnergy[1][d];
+            }
 
-        float_64 globalEnergy = energyFieldEReduced + energyFieldBReduced;
+            float_64 globalEnergy = energyFieldEReduced + energyFieldBReduced;
 
 
-        if (writeToFile)
-        {
-            using dbl = std::numeric_limits<float_64>;
+            if(writeToFile)
+            {
+                using dbl = std::numeric_limits<float_64>;
 
-            outFile.precision(dbl::digits10);
-            outFile << currentStep << " " << std::scientific << globalEnergy * UNIT_ENERGY << " "
-                    << (globalFieldEnergy[0] * UNIT_ENERGY).toString(" ","") << " "
-                    << (globalFieldEnergy[1] * UNIT_ENERGY).toString(" ","") << std::endl;
+                outFile.precision(dbl::digits10);
+                outFile << currentStep << " " << std::scientific << globalEnergy * UNIT_ENERGY << " "
+                        << (globalFieldEnergy[0] * UNIT_ENERGY).toString(" ", "") << " "
+                        << (globalFieldEnergy[1] * UNIT_ENERGY).toString(" ", "") << std::endl;
+            }
         }
-    }
 
-private:
-
-    template<typename T_Field>
-    EneVectorType reduceField( std::shared_ptr< T_Field > field )
-    {
-        /*define stacked DataBox's for reduce algorithm*/
-        typedef DataBoxUnaryTransform<typename T_Field::DataBoxType, energyFields::squareComponentWise > TransformedBox;
-        typedef DataBoxUnaryTransform<TransformedBox, energyFields::cast64Bit > Box64bit;
-        using D1Box = DataBoxDim1Access<Box64bit>;
-
-        /* reduce field E*/
-        DataSpace<simDim> fieldSize = field->getGridLayout().getDataSpaceWithoutGuarding();
-        DataSpace<simDim> fieldGuard = field->getGridLayout().getGuard();
+    private:
+        template<typename T_Field>
+        EneVectorType reduceField(std::shared_ptr<T_Field> field)
+        {
+            /* define stacked DataBoxes for the reduce algorithm */
+            typedef DataBoxUnaryTransform<typename T_Field::DataBoxType, energyFields::squareComponentWise>
+                TransformedBox;
+            typedef DataBoxUnaryTransform<TransformedBox, energyFields::cast64Bit> Box64bit;
+            using D1Box = DataBoxDim1Access<Box64bit>;
 
-        TransformedBox fieldTransform(field->getDeviceDataBox().shift(fieldGuard));
-        Box64bit field64bit(fieldTransform);
-        D1Box d1Access(field64bit, fieldSize);
+            /* reduce the passed-in field (B or E) over its grid without guard cells */
+            DataSpace<simDim> fieldSize = field->getGridLayout().getDataSpaceWithoutGuarding();
+            DataSpace<simDim> fieldGuard = field->getGridLayout().getGuard();
 
-        EneVectorType fieldEnergyReduced = (*localReduce)(nvidia::functors::Add(),
-                                               d1Access,
-                                               fieldSize.productOfComponents());
+            TransformedBox fieldTransform(field->getDeviceDataBox().shift(fieldGuard));
+            Box64bit field64bit(fieldTransform);
+            D1Box d1Access(field64bit, fieldSize);
 
-        return fieldEnergyReduced;
-    }
+            EneVectorType fieldEnergyReduced
+                = (*localReduce)(nvidia::functors::Add(), d1Access, fieldSize.productOfComponents());
 
-};
+            return fieldEnergyReduced;
+        }
+    };
 
-} //namespace picongpu
+} // namespace picongpu
diff --git a/include/picongpu/plugins/EnergyParticles.hpp b/include/picongpu/plugins/EnergyParticles.hpp
index 0c2881dbcf..974261195f 100644
--- a/include/picongpu/plugins/EnergyParticles.hpp
+++ b/include/picongpu/plugins/EnergyParticles.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Felix Schmitt, Heiko Burau,
+/* Copyright 2013-2021 Axel Huebl, Felix Schmitt, Heiko Burau,
  *                     Rene Widera, Richard Pausch, Benjamin Worpitz
  *
  * This file is part of PIConGPU.
@@ -54,17 +54,15 @@
 
 namespace picongpu
 {
-
     /** accumulate the kinetic and total energy
      *
      * All energies are summed over all particles of a species.
      *
      * @tparam T_numWorkers number of workers
      */
-    template< uint32_t T_numWorkers >
+    template<uint32_t T_numWorkers>
     struct KernelEnergyParticles
     {
-
         /** accumulate particle energies
          *
          * @tparam T_ParBox pmacc::ParticlesBox, particle box type
@@ -76,327 +74,192 @@ namespace picongpu
          *                (two elements 0 == kinetic; 1 == total energy)
          * @param mapper functor to map a block to a supercell
          */
-        template<
-            typename T_ParBox,
-            typename T_DBox,
-            typename T_Mapping,
-            typename T_Acc,
-            typename T_Filter
-        >
-        DINLINE void operator( )(
-            T_Acc const & acc,
-            T_ParBox pb,
-            T_DBox gEnergy,
-            T_Mapping mapper,
-            T_Filter filter
-        ) const
+        template<typename T_ParBox, typename T_DBox, typename T_Mapping, typename T_Acc, typename T_Filter>
+        DINLINE void operator()(T_Acc const& acc, T_ParBox pb, T_DBox gEnergy, T_Mapping mapper, T_Filter filter) const
         {
             using namespace mappings::threads;
 
             constexpr uint32_t numWorkers = T_numWorkers;
-            constexpr uint32_t numParticlesPerFrame = pmacc::math::CT::volume<
-                typename T_ParBox::FrameType::SuperCellSize
-            >::type::value;
+            constexpr uint32_t numParticlesPerFrame
+                = pmacc::math::CT::volume<typename T_ParBox::FrameType::SuperCellSize>::type::value;
 
-            uint32_t const workerIdx = threadIdx.x;
+            uint32_t const workerIdx = cupla::threadIdx(acc).x;
 
             using FramePtr = typename T_ParBox::FramePtr;
 
             // shared kinetic energy
-            PMACC_SMEM(
-                acc,
-                shEnergyKin,
-                float_X
-            );
+            PMACC_SMEM(acc, shEnergyKin, float_X);
             // shared total energy
-            PMACC_SMEM(
-                acc,
-                shEnergy,
-                float_X
-            );
+            PMACC_SMEM(acc, shEnergy, float_X);
 
-            using ParticleDomCfg = IdxConfig<
-                numParticlesPerFrame,
-                numWorkers
-            >;
+            using ParticleDomCfg = IdxConfig<numParticlesPerFrame, numWorkers>;
 
             // sum kinetic energy for all particles touched by the virtual thread
-            float_X localEnergyKin( 0.0 );
-            float_X localEnergy( 0.0 );
+            float_X localEnergyKin(0.0);
+            float_X localEnergy(0.0);
 
-            using MasterOnly = IdxConfig<
-                1,
-                numWorkers
-            >;
+            using MasterOnly = IdxConfig<1, numWorkers>;
 
 
-            ForEachIdx< MasterOnly >{ workerIdx }(
-                [&](
-                    uint32_t const,
-                    uint32_t const
-                )
-                {
-                    // set shared kinetic energy to zero
-                    shEnergyKin = float_X( 0.0 );
-                    // set shared total energy to zero
-                    shEnergy = float_X( 0.0 );
-                }
-            );
+            ForEachIdx<MasterOnly>{workerIdx}([&](uint32_t const, uint32_t const) {
+                // set shared kinetic energy to zero
+                shEnergyKin = float_X(0.0);
+                // set shared total energy to zero
+                shEnergy = float_X(0.0);
+            });
 
-            __syncthreads( );
+            cupla::__syncthreads(acc);
 
-            DataSpace< simDim > const superCellIdx( mapper.getSuperCellIndex(
-                DataSpace< simDim >( blockIdx )
-            ));
+            DataSpace<simDim> const superCellIdx(mapper.getSuperCellIndex(DataSpace<simDim>(cupla::blockIdx(acc))));
 
             // each virtual thread is working on an own frame
-            FramePtr frame = pb.getLastFrame( superCellIdx );
+            FramePtr frame = pb.getLastFrame(superCellIdx);
 
             // end kernel if we have no frames within the supercell
-            if( !frame.isValid( ) )
+            if(!frame.isValid())
                 return;
 
-            auto accFilter = filter(
-                acc,
-                superCellIdx - mapper.getGuardingSuperCells( ),
-                WorkerCfg< numWorkers >{ workerIdx }
-            );
-
-            memory::CtxArray<
-                typename FramePtr::type::ParticleType,
-                ParticleDomCfg
-            >
-            currentParticleCtx(
+            auto accFilter
+                = filter(acc, superCellIdx - mapper.getGuardingSuperCells(), WorkerCfg<numWorkers>{workerIdx});
+
+            memory::CtxArray<typename FramePtr::type::ParticleType, ParticleDomCfg> currentParticleCtx(
                 workerIdx,
-                [&](
-                    uint32_t const linearIdx,
-                    uint32_t const
-                )
-                {
-                    auto particle = frame[ linearIdx ];
+                [&](uint32_t const linearIdx, uint32_t const) {
+                    auto particle = frame[linearIdx];
                     /* - only particles from the last frame must be checked
                      * - all other particles are always valid
                      */
-                    if( particle[ multiMask_ ] != 1 )
-                        particle.setHandleInvalid( );
+                    if(particle[multiMask_] != 1)
+                        particle.setHandleInvalid();
                     return particle;
-                }
-            );
+                });
 
-            while( frame.isValid( ) )
+            while(frame.isValid())
             {
                 // loop over all particles in the frame
-                ForEachIdx< ParticleDomCfg > forEachParticle( workerIdx );
+                ForEachIdx<ParticleDomCfg> forEachParticle(workerIdx);
 
-                forEachParticle(
-                    [&](
-                        uint32_t const linearIdx,
-                        uint32_t const idx
-                    )
+                forEachParticle([&](uint32_t const linearIdx, uint32_t const idx) {
+                    /* get one particle */
+                    auto& particle = currentParticleCtx[idx];
+                    if(accFilter(acc, particle))
                     {
-                        /* get one particle */
-                        auto & particle = currentParticleCtx[ idx ];
-                        if(
-                            accFilter(
-                                acc,
-                                particle
-                            )
-                        )
-                        {
-                            float3_X const mom = particle[ momentum_ ];
-                            // compute square of absolute momentum of the particle
-                            float_X const mom2 = math::abs2( mom );
-                            float_X const weighting = particle[ weighting_ ];
-                            float_X const mass = attribute::getMass(
-                                weighting,
-                                particle
-                            );
-                            float_X const c2 = SPEED_OF_LIGHT * SPEED_OF_LIGHT;
-
-                            // calculate kinetic energy of the macro particle
-                            localEnergyKin += KinEnergy<>( )(
-                                mom,
-                                mass
-                            );
-
-                            /* total energy for particles:
-                             *    E^2 = p^2*c^2 + m^2*c^4
-                             *        = c^2 * [p^2 + m^2*c^2]
-                             */
-                            localEnergy += algorithms::math::sqrt(
-                                mom2 +
-                                mass * mass * c2
-                            ) * SPEED_OF_LIGHT;
-
-                        }
+                        float3_X const mom = particle[momentum_];
+                        // compute square of absolute momentum of the particle
+                        float_X const mom2 = pmacc::math::abs2(mom);
+                        float_X const weighting = particle[weighting_];
+                        float_X const mass = attribute::getMass(weighting, particle);
+                        float_X const c2 = SPEED_OF_LIGHT * SPEED_OF_LIGHT;
+
+                        // calculate kinetic energy of the macro particle
+                        localEnergyKin += KinEnergy<>()(mom, mass);
+
+                        /* total energy for particles:
+                         *    E^2 = p^2*c^2 + m^2*c^4
+                         *        = c^2 * [p^2 + m^2*c^2]
+                         */
+                        localEnergy += math::sqrt(mom2 + mass * mass * c2) * SPEED_OF_LIGHT;
                     }
-                );
+                });
 
                 // set frame to next particle frame
                 frame = pb.getPreviousFrame(frame);
-                forEachParticle(
-                    [&](
-                        uint32_t const linearIdx,
-                        uint32_t const idx
-                    )
-                    {
-                        /* Update particle for the next round.
-                         * The frame list is traverse from the last to the first frame.
-                         * Only the last frame can contain gaps therefore all following
-                         * frames are filled with fully particles.
-                         */
-                        currentParticleCtx[ idx ] = frame[ linearIdx ];
-                    }
-                );
+                forEachParticle([&](uint32_t const linearIdx, uint32_t const idx) {
+                    /* Update particle for the next round.
+                     * The frame list is traverse from the last to the first frame.
+                     * Only the last frame can contain gaps therefore all following
+                     * frames are filled with fully particles.
+                     */
+                    currentParticleCtx[idx] = frame[linearIdx];
+                });
             }
 
             // each virtual thread adds the energies to the shared memory
-            atomicAdd(
-                &shEnergyKin,
-                localEnergyKin,
-                ::alpaka::hierarchy::Threads{}
-            );
-            atomicAdd(
-                &shEnergy,
-                localEnergy,
-                ::alpaka::hierarchy::Threads{}
-            );
+            cupla::atomicAdd(acc, &shEnergyKin, localEnergyKin, ::alpaka::hierarchy::Threads{});
+            cupla::atomicAdd(acc, &shEnergy, localEnergy, ::alpaka::hierarchy::Threads{});
 
             // wait that all virtual threads updated the shared memory energies
-            __syncthreads( );
+            cupla::__syncthreads(acc);
 
             // add energies on global level using global memory
-            ForEachIdx< MasterOnly >{ workerIdx }(
-                [&](
-                    uint32_t const,
-                    uint32_t const
-                )
-                {
-                    // add kinetic energy
-                    atomicAdd(
-                        &( gEnergy[ 0 ] ),
-                        static_cast< float_64 >( shEnergyKin ),
-                        ::alpaka::hierarchy::Blocks{}
-                    );
-                    // add total energy
-                    atomicAdd(
-                        &( gEnergy[ 1 ] ),
-                        static_cast< float_64 >( shEnergy ),
-                        ::alpaka::hierarchy::Blocks{}
-                    );
-                }
-            );
+            ForEachIdx<MasterOnly>{workerIdx}([&](uint32_t const, uint32_t const) {
+                // add kinetic energy
+                cupla::atomicAdd(
+                    acc,
+                    &(gEnergy[0]),
+                    static_cast<float_64>(shEnergyKin),
+                    ::alpaka::hierarchy::Blocks{});
+                // add total energy
+                cupla::atomicAdd(acc, &(gEnergy[1]), static_cast<float_64>(shEnergy), ::alpaka::hierarchy::Blocks{});
+            });
         }
     };
 
-    template< typename ParticlesType >
+    template<typename ParticlesType>
     class EnergyParticles : public plugins::multi::ISlave
     {
     public:
-
         struct Help : public plugins::multi::IHelp
         {
-
             /** creates an instance of ISlave
              *
              * @tparam T_Slave type of the interface implementation (must inherit from ISlave)
              * @param help plugin defined help
              * @param id index of the plugin, range: [0;help->getNumPlugins())
              */
-            std::shared_ptr< ISlave > create(
-                std::shared_ptr< IHelp > & help,
-                size_t const id,
-                MappingDesc* cellDescription
-            )
+            std::shared_ptr<ISlave> create(std::shared_ptr<IHelp>& help, size_t const id, MappingDesc* cellDescription)
             {
-                return std::shared_ptr< ISlave >(
-                    new EnergyParticles< ParticlesType >(
-                        help,
-                        id,
-                        cellDescription
-                    )
-                );
+                return std::shared_ptr<ISlave>(new EnergyParticles<ParticlesType>(help, id, cellDescription));
             }
 
             // find all valid filter for the current used species
-            using EligibleFilters = typename MakeSeqFromNestedSeq<
-                typename bmpl::transform<
-                    particles::filter::AllParticleFilters,
-                    particles::traits::GenerateSolversIfSpeciesEligible<
-                        bmpl::_1,
-                        ParticlesType
-                    >
-                >::type
-            >::type;
+            using EligibleFilters = typename MakeSeqFromNestedSeq<typename bmpl::transform<
+                particles::filter::AllParticleFilters,
+                particles::traits::GenerateSolversIfSpeciesEligible<bmpl::_1, ParticlesType>>::type>::type;
 
             //! periodicity of computing the particle energy
-            plugins::multi::Option< std::string > notifyPeriod = {
-                "period",
-                "compute kinetic and total energy [for each n-th step] enable plugin by setting a non-zero value"
-            };
-            plugins::multi::Option< std::string > filter = {
-                "filter",
-                "particle filter: "
-            };
+            plugins::multi::Option<std::string> notifyPeriod
+                = {"period",
+                   "compute kinetic and total energy [for each n-th step] enable plugin by setting a non-zero value"};
+            plugins::multi::Option<std::string> filter = {"filter", "particle filter: "};
 
             //! string list with all possible particle filters
             std::string concatenatedFilterNames;
-            std::vector< std::string > allowedFilters;
+            std::vector<std::string> allowedFilters;
 
             ///! method used by plugin controller to get --help description
             void registerHelp(
-                boost::program_options::options_description & desc,
-                std::string const & masterPrefix = std::string{ }
-            )
+                boost::program_options::options_description& desc,
+                std::string const& masterPrefix = std::string{})
             {
+                meta::ForEach<EligibleFilters, plugins::misc::AppendName<bmpl::_1>> getEligibleFilterNames;
+                getEligibleFilterNames(allowedFilters);
 
-                meta::ForEach<
-                    EligibleFilters,
-                    plugins::misc::AppendName< bmpl::_1 >
-                > getEligibleFilterNames;
-                getEligibleFilterNames( allowedFilters );
-
-                concatenatedFilterNames = plugins::misc::concatenateToString(
-                    allowedFilters,
-                    ", "
-                );
-
-                notifyPeriod.registerHelp(
-                    desc,
-                    masterPrefix + prefix
-                );
-                filter.registerHelp(
-                    desc,
-                    masterPrefix + prefix,
-                    std::string( "[" ) + concatenatedFilterNames + "]"
-                );
+                concatenatedFilterNames = plugins::misc::concatenateToString(allowedFilters, ", ");
+
+                notifyPeriod.registerHelp(desc, masterPrefix + prefix);
+                filter.registerHelp(desc, masterPrefix + prefix, std::string("[") + concatenatedFilterNames + "]");
             }
 
             void expandHelp(
-                boost::program_options::options_description & desc,
-                std::string const & masterPrefix = std::string{ }
-            )
+                boost::program_options::options_description& desc,
+                std::string const& masterPrefix = std::string{})
             {
             }
 
 
             void validateOptions()
             {
-                if( notifyPeriod.size() != filter.size() )
-                    throw std::runtime_error( name + ": parameter filter and period are not used the same number of times" );
+                if(notifyPeriod.size() != filter.size())
+                    throw std::runtime_error(
+                        name + ": parameter filter and period are not used the same number of times");
 
                 // check if user passed filter name are valid
-                for( auto const & filterName : filter)
+                for(auto const& filterName : filter)
                 {
-                    if(
-                        std::find(
-                            allowedFilters.begin(),
-                            allowedFilters.end(),
-                            filterName
-                        ) == allowedFilters.end()
-                    )
+                    if(std::find(allowedFilters.begin(), allowedFilters.end(), filterName) == allowedFilters.end())
                     {
-                        throw std::runtime_error( name + ": unknown filter '" + filterName + "'" );
+                        throw std::runtime_error(name + ": unknown filter '" + filterName + "'");
                     }
                 }
             }
@@ -425,204 +288,153 @@ namespace picongpu
             //! short description of the plugin
             std::string const description = "calculate the energy of a species";
             //! prefix used for command line arguments
-            std::string const prefix = ParticlesType::FrameType::getName( ) + std::string( "_energy" );
+            std::string const prefix = ParticlesType::FrameType::getName() + std::string("_energy");
         };
 
         //! must be implemented by the user
-        static std::shared_ptr< plugins::multi::IHelp > getHelp()
+        static std::shared_ptr<plugins::multi::IHelp> getHelp()
         {
-            return std::shared_ptr< plugins::multi::IHelp >( new Help{ } );
+            return std::shared_ptr<plugins::multi::IHelp>(new Help{});
         }
 
-        EnergyParticles(
-            std::shared_ptr< plugins::multi::IHelp > & help,
-            size_t const id,
-            MappingDesc* cellDescription
-        ) :
-            m_help( std::static_pointer_cast< Help >(help) ),
-            m_id( id ),
-            m_cellDescription( cellDescription )
+        EnergyParticles(std::shared_ptr<plugins::multi::IHelp>& help, size_t const id, MappingDesc* cellDescription)
+            : m_help(std::static_pointer_cast<Help>(help))
+            , m_id(id)
+            , m_cellDescription(cellDescription)
         {
-            filename = m_help->getOptionPrefix() + "_" + m_help->filter.get( m_id ) + ".dat";
+            filename = m_help->getOptionPrefix() + "_" + m_help->filter.get(m_id) + ".dat";
 
             // decide which MPI-rank writes output
-            writeToFile = reduce.hasResult( mpi::reduceMethods::Reduce( ) );
+            writeToFile = reduce.hasResult(mpi::reduceMethods::Reduce());
 
             // create two ints on gpu and host
-            gEnergy = new GridBuffer<
-                float_64,
-                DIM1
-            >( DataSpace< DIM1 >( 2 ) );
+            gEnergy = new GridBuffer<float_64, DIM1>(DataSpace<DIM1>(2));
 
             // only MPI rank that writes to file
-            if( writeToFile )
+            if(writeToFile)
             {
                 // open output file
-                outFile.open(
-                    filename.c_str( ),
-                    std::ofstream::out | std::ostream::trunc
-                );
+                outFile.open(filename.c_str(), std::ofstream::out | std::ostream::trunc);
 
                 // error handling
-                if( !outFile )
+                if(!outFile)
                 {
-                    std::cerr <<
-                        "Can't open file [" <<
-                        filename <<
-                        "] for output, diasble plugin output. " <<
-                        std::endl;
+                    std::cerr << "Can't open file [" << filename << "] for output, diasble plugin output. "
+                              << std::endl;
                     writeToFile = false;
                 }
 
                 // create header of the file
-                outFile << "#step Ekin_Joule E_Joule" << " \n";
+                outFile << "#step Ekin_Joule E_Joule"
+                        << " \n";
             }
 
             // set how often the plugin should be executed while PIConGPU is running
-            Environment<>::get( ).PluginConnector( ).setNotificationPeriod(
-                this,
-                m_help->notifyPeriod.get( id )
-            );
+            Environment<>::get().PluginConnector().setNotificationPeriod(this, m_help->notifyPeriod.get(id));
         }
 
-        virtual ~EnergyParticles( )
+        virtual ~EnergyParticles()
         {
-            if( writeToFile )
+            if(writeToFile)
             {
-                outFile.flush( );
+                outFile.flush();
                 // flush cached data to file
                 outFile << std::endl;
 
-                if( outFile.fail( ) )
+                if(outFile.fail())
                     std::cerr << "Error on flushing file [" << filename << "]. " << std::endl;
-                outFile.close( );
+                outFile.close();
             }
             // free global memory on GPU
-            __delete( gEnergy );
+            __delete(gEnergy);
         }
 
         /** this code is executed if the current time step is supposed to compute
          * the energy
          */
-        void notify( uint32_t currentStep )
+        void notify(uint32_t currentStep)
         {
             // call the method that calls the plugin kernel
-            calculateEnergyParticles < CORE + BORDER > ( currentStep );
+            calculateEnergyParticles<CORE + BORDER>(currentStep);
         }
 
 
-        void restart(
-            uint32_t restartStep,
-            std::string const & restartDirectory
-        )
+        void restart(uint32_t restartStep, std::string const& restartDirectory)
         {
-            if( !writeToFile )
+            if(!writeToFile)
                 return;
 
-            writeToFile = restoreTxtFile(
-                outFile,
-                filename,
-                restartStep,
-                restartDirectory
-            );
+            writeToFile = restoreTxtFile(outFile, filename, restartStep, restartDirectory);
         }
 
-        void checkpoint(
-            uint32_t currentStep,
-            std::string const & checkpointDirectory
-        )
+        void checkpoint(uint32_t currentStep, std::string const& checkpointDirectory)
         {
-            if( !writeToFile )
+            if(!writeToFile)
                 return;
 
-            checkpointTxtFile(
-                outFile,
-                filename,
-                currentStep,
-                checkpointDirectory
-            );
+            checkpointTxtFile(outFile, filename, currentStep, checkpointDirectory);
         }
+
     private:
         //! method to call analysis and plugin-kernel calls
-        template< uint32_t AREA >
-        void calculateEnergyParticles( uint32_t currentStep )
+        template<uint32_t AREA>
+        void calculateEnergyParticles(uint32_t currentStep)
         {
-            DataConnector &dc = Environment<>::get( ).DataConnector( );
+            DataConnector& dc = Environment<>::get().DataConnector();
 
             // use data connector to get particle data
-            auto particles = dc.get< ParticlesType >(
-                ParticlesType::FrameType::getName( ),
-                true
-            );
+            auto particles = dc.get<ParticlesType>(ParticlesType::FrameType::getName(), true);
 
             // initialize global energies with zero
-            gEnergy->getDeviceBuffer( ).setValue( 0.0 );
+            gEnergy->getDeviceBuffer().setValue(0.0);
 
-            constexpr uint32_t numWorkers = pmacc::traits::GetNumWorkers<
-                pmacc::math::CT::volume< SuperCellSize >::type::value
-            >::value;
+            constexpr uint32_t numWorkers
+                = pmacc::traits::GetNumWorkers<pmacc::math::CT::volume<SuperCellSize>::type::value>::value;
 
-            AreaMapping<
-                AREA,
-                MappingDesc
-            > mapper( *m_cellDescription );
+            AreaMapping<AREA, MappingDesc> mapper(*m_cellDescription);
 
-            auto kernel = PMACC_KERNEL( KernelEnergyParticles< numWorkers >{ } )(
-                mapper.getGridDim( ),
-                numWorkers
-            );
+            auto kernel = PMACC_KERNEL(KernelEnergyParticles<numWorkers>{})(mapper.getGridDim(), numWorkers);
             auto binaryKernel = std::bind(
                 kernel,
-                particles->getDeviceParticlesBox( ),
-                gEnergy->getDeviceBuffer( ).getDataBox( ),
+                particles->getDeviceParticlesBox(),
+                gEnergy->getDeviceBuffer().getDataBox(),
                 mapper,
-                std::placeholders::_1
-            );
-
-            meta::ForEach<
-                typename Help::EligibleFilters,
-                plugins::misc::ExecuteIfNameIsEqual< bmpl::_1 >
-            >{ }(
-                m_help->filter.get( m_id ),
+                std::placeholders::_1);
+
+            meta::ForEach<typename Help::EligibleFilters, plugins::misc::ExecuteIfNameIsEqual<bmpl::_1>>{}(
+                m_help->filter.get(m_id),
                 currentStep,
-                binaryKernel
-            );
+                binaryKernel);
 
-            dc.releaseData( ParticlesType::FrameType::getName( ) );
+            dc.releaseData(ParticlesType::FrameType::getName());
 
             // get energy from GPU
-            gEnergy->deviceToHost( );
+            gEnergy->deviceToHost();
 
             // create storage for the global reduced result
             float_64 reducedEnergy[2];
 
             // add energies from all GPUs using MPI
             reduce(
-                nvidia::functors::Add( ),
+                nvidia::functors::Add(),
                 reducedEnergy,
-                gEnergy->getHostBuffer( ).getBasePointer( ),
+                gEnergy->getHostBuffer().getBasePointer(),
                 2,
-                mpi::reduceMethods::Reduce( )
-            );
+                mpi::reduceMethods::Reduce());
 
             /* print timestep, kinetic energy and total energy to file: */
-            if( writeToFile )
+            if(writeToFile)
             {
-                using dbl = std::numeric_limits< float_64 >;
+                using dbl = std::numeric_limits<float_64>;
 
-                outFile.precision( dbl::digits10 );
-                outFile << currentStep << " "
-                        << std::scientific
-                        << reducedEnergy[ 0 ] * UNIT_ENERGY << " "
-                        << reducedEnergy[ 1 ] * UNIT_ENERGY << std::endl;
+                outFile.precision(dbl::digits10);
+                outFile << currentStep << " " << std::scientific << reducedEnergy[0] * UNIT_ENERGY << " "
+                        << reducedEnergy[1] * UNIT_ENERGY << std::endl;
             }
         }
 
         //! energy values (global on GPU)
-        GridBuffer<
-            float_64,
-            DIM1
-        > * gEnergy = nullptr;
+        GridBuffer<float_64, DIM1>* gEnergy = nullptr;
 
         MappingDesc* m_cellDescription;
 
@@ -641,47 +453,30 @@ namespace picongpu
         //! MPI reduce to add all energies over several GPUs
         mpi::MPIReduce reduce;
 
-        std::shared_ptr< Help > m_help;
+        std::shared_ptr<Help> m_help;
         size_t m_id;
     };
 
-namespace particles
-{
-namespace traits
-{
-    template<
-        typename T_Species,
-        typename T_UnspecifiedSpecies
-    >
-    struct SpeciesEligibleForSolver<
-        T_Species,
-        EnergyParticles< T_UnspecifiedSpecies >
-    >
+    namespace particles
     {
-        using FrameType = typename T_Species::FrameType;
-
-        // this plugin needs at least the weighting and momentum attributes
-        using RequiredIdentifiers = MakeSeq_t<
-            weighting,
-            momentum
-        >;
-
-        using SpeciesHasIdentifiers = typename pmacc::traits::HasIdentifiers<
-            FrameType,
-            RequiredIdentifiers
-        >::type;
-
-        // and also a mass ratio for energy calculation from momentum
-        using SpeciesHasFlags = typename pmacc::traits::HasFlag<
-            FrameType,
-            massRatio<>
-        >::type;
-
-        using type = typename bmpl::and_<
-            SpeciesHasIdentifiers,
-            SpeciesHasFlags
-        >;
-    };
-} // namespace traits
-} // namespace particles
+        namespace traits
+        {
+            template<typename T_Species, typename T_UnspecifiedSpecies>
+            struct SpeciesEligibleForSolver<T_Species, EnergyParticles<T_UnspecifiedSpecies>>
+            {
+                using FrameType = typename T_Species::FrameType;
+
+                // this plugin needs at least the weighting and momentum attributes
+                using RequiredIdentifiers = MakeSeq_t<weighting, momentum>;
+
+                using SpeciesHasIdentifiers =
+                    typename pmacc::traits::HasIdentifiers<FrameType, RequiredIdentifiers>::type;
+
+                // and also a mass ratio for energy calculation from momentum
+                using SpeciesHasFlags = typename pmacc::traits::HasFlag<FrameType, massRatio<>>::type;
+
+                using type = typename bmpl::and_<SpeciesHasIdentifiers, SpeciesHasFlags>;
+            };
+        } // namespace traits
+    } // namespace particles
 } // namespace picongpu
diff --git a/include/picongpu/plugins/ILightweightPlugin.hpp b/include/picongpu/plugins/ILightweightPlugin.hpp
index 1922621d71..4185209c74 100644
--- a/include/picongpu/plugins/ILightweightPlugin.hpp
+++ b/include/picongpu/plugins/ILightweightPlugin.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2014-2020 Felix Schmitt
+/* Copyright 2014-2021 Felix Schmitt
  *
  * This file is part of PIConGPU.
  *
@@ -42,7 +42,6 @@ namespace picongpu
 
         virtual ~ILightweightPlugin()
         {
-
         }
     };
-} //namespace picongpu
+} // namespace picongpu
diff --git a/include/picongpu/plugins/ISimulationPlugin.hpp b/include/picongpu/plugins/ISimulationPlugin.hpp
index 118b90de14..fc9d1a47e5 100644
--- a/include/picongpu/plugins/ISimulationPlugin.hpp
+++ b/include/picongpu/plugins/ISimulationPlugin.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Rene Widera, Felix Schmitt
+/* Copyright 2013-2021 Axel Huebl, Rene Widera, Felix Schmitt
  *
  * This file is part of PIConGPU.
  *
@@ -18,7 +18,6 @@
  */
 
 
-
 #pragma once
 
 #include "picongpu/simulation_defines.hpp"
@@ -35,11 +34,10 @@ namespace picongpu
     class ISimulationPlugin : public IPlugin
     {
     public:
-        virtual void setMappingDescription(MappingDesc *cellDescription) = 0;
+        virtual void setMappingDescription(MappingDesc* cellDescription) = 0;
 
         virtual ~ISimulationPlugin()
         {
         }
     };
-}
-
+} // namespace picongpu
diff --git a/include/picongpu/plugins/IntensityPlugin.hpp b/include/picongpu/plugins/IntensityPlugin.hpp
index 71aa9ad84c..e28b14c7fa 100644
--- a/include/picongpu/plugins/IntensityPlugin.hpp
+++ b/include/picongpu/plugins/IntensityPlugin.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Felix Schmitt, Heiko Burau, Rene Widera,
+/* Copyright 2013-2021 Axel Huebl, Felix Schmitt, Heiko Burau, Rene Widera,
  *                     Benjamin Worpitz, Richard Pausch
  *
  * This file is part of PIConGPU.
@@ -43,364 +43,365 @@
 
 namespace picongpu
 {
-using namespace pmacc;
+    using namespace pmacc;
 
-/* count particles in an area
- * is not optimized, it checks any particle position if it is really a particle
- */
-struct KernelIntensity
-{
-    template<
-        typename FieldBox,
-        typename BoxMax,
-        typename BoxIntegral,
-        typename T_Acc
-    >
-    DINLINE void operator()(
-        T_Acc const & acc,
-        FieldBox field,
-        DataSpace<simDim> cellsCount,
-        BoxMax boxMax,
-        BoxIntegral integralBox
-    ) const
+    /* For each y cell index: accumulate |field|^2 over x and z (guard cells excluded) and track its maximum.
+     * Not optimized: a single master thread per y row performs the summation over the cached x values.
+     */
+    struct KernelIntensity
     {
-
-        typedef MappingDesc::SuperCellSize SuperCellSize;
-        PMACC_SMEM( acc, s_integrated, memory::Array< float_X,SuperCellSize::y::value > );
-        PMACC_SMEM( acc, s_max, memory::Array< float_X, SuperCellSize::y::value > );
+        template<typename FieldBox, typename BoxMax, typename BoxIntegral, typename T_Acc>
+        DINLINE void operator()(
+            T_Acc const& acc,
+            FieldBox field,
+            DataSpace<simDim> cellsCount,
+            BoxMax boxMax,
+            BoxIntegral integralBox) const
+        {
+            typedef MappingDesc::SuperCellSize SuperCellSize;
+            PMACC_SMEM(acc, s_integrated, memory::Array<float_X, SuperCellSize::y::value>);
+            PMACC_SMEM(acc, s_max, memory::Array<float_X, SuperCellSize::y::value>);
 
 
-        /*descripe size of a worker block for cached memory*/
-        typedef SuperCellDescription<
-            pmacc::math::CT::Int<SuperCellSize::x::value,SuperCellSize::y::value>
-            > SuperCell2D;
+            /* describe the size of a worker block for cached memory */
+            typedef SuperCellDescription<pmacc::math::CT::Int<SuperCellSize::x::value, SuperCellSize::y::value>>
+                SuperCell2D;
 
-        auto s_field = CachedBox::create<
-            0,
-            float_32
-        >(
-            acc,
-            SuperCell2D()
-        );
+            auto s_field = CachedBox::create<0, float_32>(acc, SuperCell2D());
 
-        int y = blockIdx.y * SuperCellSize::y::value + threadIdx.y;
-        int yGlobal = y + GuardSize::y::value * SuperCellSize::y::value;
-        const DataSpace<DIM2> threadId(threadIdx);
+            int y = cupla::blockIdx(acc).y * SuperCellSize::y::value + cupla::threadIdx(acc).y;
+            int yGlobal = y + GuardSize::y::value * SuperCellSize::y::value;
+            const DataSpace<DIM2> threadId(cupla::threadIdx(acc));
 
-        if (threadId.x() == 0)
-        {
-            // clear destination arrays
-            s_integrated[threadId.y()] = float_X(0.0);
-            s_max[threadId.y()] = float_X(0.0);
-        }
-        __syncthreads();
+            if(threadId.x() == 0)
+            {
+                // clear destination arrays
+                s_integrated[threadId.y()] = float_X(0.0);
+                s_max[threadId.y()] = float_X(0.0);
+            }
+            cupla::__syncthreads(acc);
 
-        // move cell-wise over z direction (without guarding cells)
-        for (int z = GuardSize::z::value * SuperCellSize::z::value; z < cellsCount.z() - GuardSize::z::value * SuperCellSize::z::value; ++z)
-        {
-            // move supercell-wise over x direction without guarding
-            for (int x = GuardSize::x::value * SuperCellSize::x::value + threadId.x(); x < cellsCount.x() - GuardSize::x::value * SuperCellSize::x::value; x += SuperCellSize::x::value)
+            // move cell-wise over z direction (without guarding cells)
+            for(int z = GuardSize::z::value * SuperCellSize::z::value;
+                z < cellsCount.z() - GuardSize::z::value * SuperCellSize::z::value;
+                ++z)
             {
-                const float3_X field_at_point(field(DataSpace<DIM3 > (x, yGlobal, z)));
-                s_field(threadId) = math::abs2(field_at_point);
-                __syncthreads();
-                if (threadId.x() == 0)
+                // move supercell-wise over x direction without guarding
+                for(int x = GuardSize::x::value * SuperCellSize::x::value + threadId.x();
+                    x < cellsCount.x() - GuardSize::x::value * SuperCellSize::x::value;
+                    x += SuperCellSize::x::value)
                 {
-                    // master thread moves cell-wise over 2D supercell
-                    for (int x_local = 0; x_local < SuperCellSize::x::value; ++x_local)
+                    const float3_X field_at_point(field(DataSpace<DIM3>(x, yGlobal, z)));
+                    s_field(threadId) = pmacc::math::abs2(field_at_point);
+                    cupla::__syncthreads(acc);
+                    if(threadId.x() == 0)
                     {
-                        DataSpace<DIM2> localId(x_local, threadId.y());
-                        s_integrated[threadId.y()] += s_field(localId);
-                        s_max[threadId.y()] = fmaxf(s_max[threadId.y()], s_field(localId));
-
+                        // master thread moves cell-wise over 2D supercell
+                        for(int x_local = 0; x_local < SuperCellSize::x::value; ++x_local)
+                        {
+                            DataSpace<DIM2> localId(x_local, threadId.y());
+                            s_integrated[threadId.y()] += s_field(localId);
+                            s_max[threadId.y()] = fmaxf(s_max[threadId.y()], s_field(localId));
+                        }
                     }
                 }
             }
-        }
-        __syncthreads();
+            cupla::__syncthreads(acc);
 
-        if (threadId.x() == 0)
-        {
-            /*copy result to global array*/
-            integralBox[y] = s_integrated[threadId.y()];
-            boxMax[y] = s_max[threadId.y()];
+            if(threadId.x() == 0)
+            {
+                /*copy result to global array*/
+                integralBox[y] = s_integrated[threadId.y()];
+                boxMax[y] = s_max[threadId.y()];
+            }
         }
+    };
 
-
-    }
-};
-
-class IntensityPlugin : public ILightweightPlugin
-{
-private:
-    typedef MappingDesc::SuperCellSize SuperCellSize;
-
-
-    GridBuffer<float_32, DIM1> *localMaxIntensity;
-    GridBuffer<float_32, DIM1> *localIntegratedIntensity;
-    MappingDesc *cellDescription;
-    std::string notifyPeriod;
-
-    std::string pluginName;
-    std::string pluginPrefix;
-
-    std::ofstream outFileMax;
-    std::ofstream outFileIntegrated;
-    /*only rank 0 create a file*/
-    bool writeToFile;
-public:
-
-    /*! Calculate the max und integrated E-Field energy over laser propagation direction (in our case Y)
-     * max is only the SI  value of the amplitude (V/m)
-     * integrated is the integral of amplidude of X and Z on Y position (is V/m in cell volume)
-     */
-    IntensityPlugin() :
-    pluginName("IntensityPlugin: calculate the maximum and integrated E-Field energy\nover laser propagation direction"),
-    pluginPrefix(FieldE::getName() + std::string("_intensity")),
-    localMaxIntensity(nullptr),
-    localIntegratedIntensity(nullptr),
-    cellDescription(nullptr),
-    writeToFile(false)
+    class IntensityPlugin : public ILightweightPlugin
     {
-        Environment<>::get().PluginConnector().registerPlugin(this);
-    }
+    private:
+        typedef MappingDesc::SuperCellSize SuperCellSize;
 
-    virtual ~IntensityPlugin()
-    {
 
-    }
+        GridBuffer<float_32, DIM1>* localMaxIntensity;
+        GridBuffer<float_32, DIM1>* localIntegratedIntensity;
+        MappingDesc* cellDescription;
+        std::string notifyPeriod;
+
+        std::string pluginName;
+        std::string pluginPrefix;
+
+        std::ofstream outFileMax;
+        std::ofstream outFileIntegrated;
+        /* only rank 0 creates a file */
+        bool writeToFile;
+
+    public:
+        /*! Calculate the max and integrated E-Field energy over laser propagation direction (in our case Y)
+         * max is only the SI value of the amplitude (V/m)
+         * integrated is the integral of the amplitude over X and Z at each Y position (is V/m in cell volume)
+         */
+        IntensityPlugin()
+            : pluginName("IntensityPlugin: calculate the maximum and integrated E-Field energy\nover laser "
+                         "propagation direction")
+            , pluginPrefix(FieldE::getName() + std::string("_intensity"))
+            , localMaxIntensity(nullptr)
+            , localIntegratedIntensity(nullptr)
+            , cellDescription(nullptr)
+            , writeToFile(false)
+        {
+            Environment<>::get().PluginConnector().registerPlugin(this);
+        }
 
-    void notify(uint32_t currentStep)
-    {
-        calcIntensity(currentStep);
-        combineData(currentStep);
-    }
+        virtual ~IntensityPlugin()
+        {
+        }
 
-    void pluginRegisterHelp(po::options_description& desc)
-    {
-        desc.add_options()
-            ((pluginPrefix + ".period").c_str(),
-             po::value<std::string> (&notifyPeriod), "enable plugin [for each n-th step]");
-    }
+        void notify(uint32_t currentStep)
+        {
+            calcIntensity(currentStep);
+            combineData(currentStep);
+        }
 
-    std::string pluginGetName() const
-    {
-        return pluginName;
-    }
+        void pluginRegisterHelp(po::options_description& desc)
+        {
+            desc.add_options()(
+                (pluginPrefix + ".period").c_str(),
+                po::value<std::string>(&notifyPeriod),
+                "enable plugin [for each n-th step]");
+        }
 
-    void setMappingDescription(MappingDesc *cellDescription)
-    {
-        this->cellDescription = cellDescription;
-    }
+        std::string pluginGetName() const
+        {
+            return pluginName;
+        }
 
-private:
+        void setMappingDescription(MappingDesc* cellDescription)
+        {
+            this->cellDescription = cellDescription;
+        }
 
-    void pluginLoad()
-    {
-        if(!notifyPeriod.empty())
+    private:
+        void pluginLoad()
         {
-            writeToFile = Environment<simDim>::get().GridController().getGlobalRank() == 0;
-            int yCells = cellDescription->getGridLayout().getDataSpaceWithoutGuarding().y();
+            if(!notifyPeriod.empty())
+            {
+                writeToFile = Environment<simDim>::get().GridController().getGlobalRank() == 0;
+                int yCells = cellDescription->getGridLayout().getDataSpaceWithoutGuarding().y();
 
-            localMaxIntensity = new GridBuffer<float_32, DIM1 > (DataSpace<DIM1 > (yCells)); //create one int on gpu und host
-            localIntegratedIntensity = new GridBuffer<float_32, DIM1 > (DataSpace<DIM1 > (yCells)); //create one int on gpu und host
+                localMaxIntensity
+                    = new GridBuffer<float_32, DIM1>(DataSpace<DIM1>(yCells)); // yCells floats on gpu and host
+                localIntegratedIntensity
+                    = new GridBuffer<float_32, DIM1>(DataSpace<DIM1>(yCells)); // yCells floats on gpu and host
 
-            if (writeToFile)
-            {
-                createFile(pluginPrefix + "_max.dat", outFileMax);
-                createFile(pluginPrefix + "_integrated.dat", outFileIntegrated);
-            }
+                if(writeToFile)
+                {
+                    createFile(pluginPrefix + "_max.dat", outFileMax);
+                    createFile(pluginPrefix + "_integrated.dat", outFileIntegrated);
+                }
 
-            Environment<>::get().PluginConnector().setNotificationPeriod(this, notifyPeriod);
+                Environment<>::get().PluginConnector().setNotificationPeriod(this, notifyPeriod);
+            }
         }
-    }
 
-    void pluginUnload()
-    {
-        if(!notifyPeriod.empty())
+        void pluginUnload()
         {
-            if (writeToFile)
+            if(!notifyPeriod.empty())
             {
-                flushAndCloseFile(outFileIntegrated);
-                flushAndCloseFile(outFileMax);
+                if(writeToFile)
+                {
+                    flushAndCloseFile(outFileIntegrated);
+                    flushAndCloseFile(outFileMax);
+                }
+                __delete(localMaxIntensity);
+                __delete(localIntegratedIntensity);
             }
-            __delete(localMaxIntensity);
-            __delete(localIntegratedIntensity);
         }
-    }
-
-private:
-
-    /* reduce data from all gpus to one array
-     * @param currentStep simulation step
-     */
-    void combineData(uint32_t currentStep)
-    {
-
-        const DataSpace<simDim> localSize(cellDescription->getGridLayout().getDataSpaceWithoutGuarding());
-        Window window(MovingWindow::getInstance().getWindow( currentStep));
 
-        const SubGrid<simDim>& subGrid = Environment<simDim>::get().SubGrid();
-
-        const int yGlobalSize = subGrid.getGlobalDomain().size.y();
-        const int yLocalSize = localSize.y();
-
-        const int gpus = Environment<simDim>::get().GridController().getGpuNodes().productOfComponents();
-
-
-        /**\todo: fixme I cant work with not regular domains (use mpi_gatherv)*/
-        DataSpace<simDim> globalRootCell(subGrid.getLocalDomain().offset);
-        int yOffset = globalRootCell.y();
-        int* yOffsetsAll = new int[gpus];
-        float_32* maxAll = new float_32[yGlobalSize];
-        float_32* maxAllTmp = new float_32[yLocalSize * gpus];
-        memset(maxAll, 0, sizeof (float_32) *yGlobalSize);
-        float_32* integretedAll = new float_32[yGlobalSize];
-        float_32* integretedAllTmp = new float_32[yLocalSize * gpus];
-        memset(integretedAll, 0, sizeof (float_32) *yGlobalSize);
-
-        // avoid deadlock between not finished pmacc tasks and mpi blocking collectives
-        __getTransactionEvent().waitForFinished();
-        MPI_CHECK(MPI_Gather(&yOffset, 1, MPI_INT, yOffsetsAll, 1,
-                             MPI_INT, 0, MPI_COMM_WORLD));
-
-        MPI_CHECK(MPI_Gather(localMaxIntensity->getHostBuffer().getBasePointer(), yLocalSize, MPI_FLOAT,
-                             maxAllTmp, yLocalSize, MPI_FLOAT,
-                             0, MPI_COMM_WORLD));
-        MPI_CHECK(MPI_Gather(localIntegratedIntensity->getHostBuffer().getBasePointer(), yLocalSize, MPI_FLOAT,
-                             integretedAllTmp, yLocalSize, MPI_FLOAT,
-                             0, MPI_COMM_WORLD));
-
-        if (writeToFile)
+    private:
+        /* reduce data from all gpus to one array
+         * @param currentStep simulation step
+         */
+        void combineData(uint32_t currentStep)
         {
-            for (int i = 0; i < gpus; ++i)
+            const DataSpace<simDim> localSize(cellDescription->getGridLayout().getDataSpaceWithoutGuarding());
+            Window window(MovingWindow::getInstance().getWindow(currentStep));
+
+            const SubGrid<simDim>& subGrid = Environment<simDim>::get().SubGrid();
+
+            const int yGlobalSize = subGrid.getGlobalDomain().size.y();
+            const int yLocalSize = localSize.y();
+
+            const int gpus = Environment<simDim>::get().GridController().getGpuNodes().productOfComponents();
+
+
+            /** \todo fixme: this cannot handle non-regular domains (use MPI_Gatherv instead) */
+            DataSpace<simDim> globalRootCell(subGrid.getLocalDomain().offset);
+            int yOffset = globalRootCell.y();
+            int* yOffsetsAll = new int[gpus];
+            float_32* maxAll = new float_32[yGlobalSize];
+            float_32* maxAllTmp = new float_32[yLocalSize * gpus];
+            memset(maxAll, 0, sizeof(float_32) * yGlobalSize);
+            float_32* integretedAll = new float_32[yGlobalSize];
+            float_32* integretedAllTmp = new float_32[yLocalSize * gpus];
+            memset(integretedAll, 0, sizeof(float_32) * yGlobalSize);
+
+            // avoid deadlock between not finished pmacc tasks and mpi blocking collectives
+            __getTransactionEvent().waitForFinished();
+            MPI_CHECK(MPI_Gather(&yOffset, 1, MPI_INT, yOffsetsAll, 1, MPI_INT, 0, MPI_COMM_WORLD));
+
+            MPI_CHECK(MPI_Gather(
+                localMaxIntensity->getHostBuffer().getBasePointer(),
+                yLocalSize,
+                MPI_FLOAT,
+                maxAllTmp,
+                yLocalSize,
+                MPI_FLOAT,
+                0,
+                MPI_COMM_WORLD));
+            MPI_CHECK(MPI_Gather(
+                localIntegratedIntensity->getHostBuffer().getBasePointer(),
+                yLocalSize,
+                MPI_FLOAT,
+                integretedAllTmp,
+                yLocalSize,
+                MPI_FLOAT,
+                0,
+                MPI_COMM_WORLD));
+
+            if(writeToFile)
             {
-                int gOffset = yOffsetsAll[i];
-                int tmpOff = yLocalSize*i;
-                for (int y = 0; y < yLocalSize; ++y)
+                for(int i = 0; i < gpus; ++i)
                 {
-                    maxAll[gOffset + y] = std::max(maxAllTmp[tmpOff + y], maxAll[gOffset + y]);
-                    integretedAll[gOffset + y] += integretedAllTmp[tmpOff + y];
+                    int gOffset = yOffsetsAll[i];
+                    int tmpOff = yLocalSize * i;
+                    for(int y = 0; y < yLocalSize; ++y)
+                    {
+                        maxAll[gOffset + y] = std::max(maxAllTmp[tmpOff + y], maxAll[gOffset + y]);
+                        integretedAll[gOffset + y] += integretedAllTmp[tmpOff + y];
+                    }
                 }
+
+                const uint32_t numSlides = MovingWindow::getInstance().getSlideCounter(currentStep);
+                size_t physicelYCellOffset = numSlides * yLocalSize + window.globalDimensions.offset.y();
+                writeFile(
+                    currentStep,
+                    maxAll + window.globalDimensions.offset.y(),
+                    window.globalDimensions.size.y(),
+                    physicelYCellOffset,
+                    outFileMax,
+                    UNIT_EFIELD);
+
+                float_64 unit = UNIT_EFIELD * CELL_VOLUME * SI::EPS0_SI;
+                for(uint32_t i = 0; i < simDim; ++i)
+                    unit *= UNIT_LENGTH;
+
+                writeFile(
+                    currentStep,
+                    integretedAll + window.globalDimensions.offset.y(),
+                    window.globalDimensions.size.y(),
+                    physicelYCellOffset,
+                    outFileIntegrated,
+                    unit);
             }
 
-            const uint32_t numSlides = MovingWindow::getInstance().getSlideCounter(currentStep);
-            size_t physicelYCellOffset = numSlides * yLocalSize + window.globalDimensions.offset.y();
-            writeFile(currentStep,
-                      maxAll + window.globalDimensions.offset.y(),
-                      window.globalDimensions.size.y(),
-                      physicelYCellOffset,
-                      outFileMax,
-                      UNIT_EFIELD
-                      );
-
-            float_64 unit=UNIT_EFIELD*CELL_VOLUME*SI::EPS0_SI;
-            for(uint32_t i=0;i<simDim;++i)
-                unit*=UNIT_LENGTH;
-
-            writeFile(currentStep,
-                      integretedAll + window.globalDimensions.offset.y(),
-                      window.globalDimensions.size.y(),
-                      physicelYCellOffset,
-                      outFileIntegrated,
-                      unit
-                      );
+            __deleteArray(yOffsetsAll);
+            __deleteArray(maxAll);
+            __deleteArray(integretedAll);
+            __deleteArray(maxAllTmp);
+            __deleteArray(integretedAllTmp);
         }
 
-        __deleteArray(yOffsetsAll);
-        __deleteArray(maxAll);
-        __deleteArray(integretedAll);
-        __deleteArray(maxAllTmp);
-        __deleteArray(integretedAllTmp);
-    }
-
-    /* write data from array to a file
-     * write current step to first column
-     *
-     * @param currentStep simulation step
-     * @param array shifted source array (begin printing from first element)
-     * @param count number of elements to print
-     * @param physicalYOffset offset in cells to the absolute simulation begin
-     * @param stream destination stream
-     * @param unit unit to scale values from pic units to si units
-     */
-    void writeFile(size_t currentStep, float* array, size_t count, size_t physicalYOffset, std::ofstream& stream, float_64 unit)
-    {
-        stream << currentStep << " ";
-        for (size_t i = 0; i < count; ++i)
-        {
-            stream << (physicalYOffset + i) * SI::CELL_HEIGHT_SI << " ";
-        }
-        stream << std::endl << currentStep << " ";
-        for (size_t i = 0; i < count; ++i)
+        /* write data from array to a file
+         * write current step to first column
+         *
+         * @param currentStep simulation step
+         * @param array shifted source array (begin printing from first element)
+         * @param count number of elements to print
+         * @param physicalYOffset offset in cells to the absolute simulation begin
+         * @param stream destination stream
+         * @param unit unit to scale values from pic units to si units
+         */
+        void writeFile(
+            size_t currentStep,
+            float* array,
+            size_t count,
+            size_t physicalYOffset,
+            std::ofstream& stream,
+            float_64 unit)
         {
-            stream << sqrt((float_64) (array[i])) * unit << " ";
+            stream << currentStep << " ";
+            for(size_t i = 0; i < count; ++i)
+            {
+                stream << (physicalYOffset + i) * SI::CELL_HEIGHT_SI << " ";
+            }
+            stream << std::endl << currentStep << " ";
+            for(size_t i = 0; i < count; ++i)
+            {
+                stream << sqrt((float_64)(array[i])) * unit << " ";
+            }
+            stream << std::endl;
         }
-        stream << std::endl;
-    }
 
-    /* run calculation of intensity
-     * sync all result data to host side
-     *
-     * @param currenstep simulation step
-     */
-    void calcIntensity(uint32_t)
-    {
-        DataConnector &dc = Environment<>::get().DataConnector();
-
-        auto fieldE = dc.get< FieldE >( FieldE::getName(), true );
+        /* run calculation of intensity
+         * sync all result data to host side
+         *
+         * @param currentStep simulation step (unused)
+         */
+        void calcIntensity(uint32_t)
+        {
+            DataConnector& dc = Environment<>::get().DataConnector();
 
-        /*start only worker for any supercell in laser propagation direction*/
-        DataSpace<DIM2> grid(1,cellDescription->getGridSuperCells().y() - cellDescription->getGuardingSuperCells().y());
-        /*use only 2D slice XY for supercell handling*/
-        typedef typename MappingDesc::SuperCellSize SuperCellSize;
-        auto block = pmacc::math::CT::Vector<SuperCellSize::x,SuperCellSize::y>::toRT();
+            auto fieldE = dc.get<FieldE>(FieldE::getName(), true);
 
-        PMACC_KERNEL(KernelIntensity{})
-            (grid, block)
-            (
-             fieldE->getDeviceDataBox(),
-             fieldE->getGridLayout().getDataSpace(),
-             localMaxIntensity->getDeviceBuffer().getDataBox(),
-             localIntegratedIntensity->getDeviceBuffer().getDataBox()
-             );
+            /*start only worker for any supercell in laser propagation direction*/
+            DataSpace<DIM2> grid(
+                1,
+                cellDescription->getGridSuperCells().y() - cellDescription->getGuardingSuperCells().y());
+            /*use only 2D slice XY for supercell handling*/
+            typedef typename MappingDesc::SuperCellSize SuperCellSize;
+            auto block = pmacc::math::CT::Vector<SuperCellSize::x, SuperCellSize::y>::toRT();
 
-        dc.releaseData( FieldE::getName() );
+            PMACC_KERNEL(KernelIntensity{})
+            (grid, block)(
+                fieldE->getDeviceDataBox(),
+                fieldE->getGridLayout().getDataSpace(),
+                localMaxIntensity->getDeviceBuffer().getDataBox(),
+                localIntegratedIntensity->getDeviceBuffer().getDataBox());
 
-        localMaxIntensity->deviceToHost();
-        localIntegratedIntensity->deviceToHost();
+            dc.releaseData(FieldE::getName());
 
-    }
+            localMaxIntensity->deviceToHost();
+            localIntegratedIntensity->deviceToHost();
+        }
 
-    /*create a file with given filename
-     * @param filename name of the output file
-     * @param stream ref on a stream object
-     */
-    void createFile(std::string filename, std::ofstream& stream)
-    {
-        stream.open(filename.c_str(), std::ofstream::out | std::ostream::trunc);
-        if (!stream)
+        /* create a file with the given filename and write the column header lines
+         * @param filename name of the output file
+         * @param stream reference to the stream object that is opened
+         */
+        void createFile(std::string filename, std::ofstream& stream)
         {
-            std::cerr << "Can't open file [" << filename << "] for output, diasble plugin output. " << std::endl;
-            writeToFile = false;
+            stream.open(filename.c_str(), std::ofstream::out | std::ostream::trunc);
+            if(!stream)
+            {
+                std::cerr << "Can't open file [" << filename << "] for output, diasble plugin output. " << std::endl;
+                writeToFile = false;
+            }
+            stream << "#step position_in_laser_propagation_direction" << std::endl;
+            stream << "#step amplitude_data[*]" << std::endl;
         }
-        stream << "#step position_in_laser_propagation_direction" << std::endl;
-        stream << "#step amplitude_data[*]" << std::endl;
-    }
 
-    /* close and flash a file stream object
-     * @param stream stream which must closed
-     */
-    void flushAndCloseFile(std::ofstream& stream)
-    {
-        stream.flush();
-        stream << std::endl; //now all data are written to file
-        if (stream.fail())
-            std::cerr << "Error on flushing file in IntensityPlugin. " << std::endl;
-        stream.close();
-    }
-
-};
-
-}
+        /* flush and close a file stream object
+         * @param stream stream which must be closed
+         */
+        void flushAndCloseFile(std::ofstream& stream)
+        {
+            stream.flush();
+            stream << std::endl; // now all data are written to file
+            if(stream.fail())
+                std::cerr << "Error on flushing file in IntensityPlugin. " << std::endl;
+            stream.close();
+        }
+    };
 
+} // namespace picongpu
diff --git a/include/picongpu/plugins/IsaacPlugin.hpp b/include/picongpu/plugins/IsaacPlugin.hpp
index ee4dce3617..88bf4f9ded 100644
--- a/include/picongpu/plugins/IsaacPlugin.hpp
+++ b/include/picongpu/plugins/IsaacPlugin.hpp
@@ -1,26 +1,26 @@
 /*
-* Copyright 2013-2020 Alexander Matthes,
-*
-* This file is part of PIConGPU.
-*
-* PIConGPU is free software: you can redistribute it and/or modify
-* it under the terms of the GNU General Public License as published by
-* the Free Software Foundation, either version 3 of the License, or
-* (at your option) any later version.
-*
-* PIConGPU is distributed in the hope that it will be useful,
-* but WITHOUT ANY WARRANTY; without even the implied warranty of
-* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-* GNU General Public License for more details.
-*
-* You should have received a copy of the GNU General Public License
-* along with PIConGPU.
-* If not, see <http://www.gnu.org/licenses/>.
-*/
+ * Copyright 2013-2021 Alexander Matthes,
+ *
+ * This file is part of PIConGPU.
+ *
+ * PIConGPU is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * PIConGPU is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with PIConGPU.
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
 
 #pragma once
 
-//Needs to be the very first
+// Needs to be the very first
 #include <boost/fusion/include/mpl.hpp>
 
 #include "picongpu/plugins/ILightweightPlugin.hpp"
@@ -41,587 +41,636 @@
 
 namespace picongpu
 {
-namespace isaacP
-{
-
-
-using namespace pmacc;
-using namespace ::isaac;
+    namespace isaacP
+    {
+        using namespace pmacc;
+        using namespace ::isaac;
 
-ISAAC_NO_HOST_DEVICE_WARNING
-template < typename FieldType >
-class TFieldSource
-{
-    public:
-        static const size_t feature_dim = 3;
-        static const bool has_guard = bmpl::not_<boost::is_same<FieldType, FieldJ > >::value;
-        static const bool persistent = bmpl::not_<boost::is_same<FieldType, FieldJ > >::value;
-        typename FieldType::DataBoxType shifted;
-        MappingDesc *cellDescription;
-        bool movingWindow;
-        TFieldSource() : cellDescription(nullptr), movingWindow(false) {}
-
-        void init(MappingDesc *cellDescription, bool movingWindow)
+        ISAAC_NO_HOST_DEVICE_WARNING
+        template<typename FieldType>
+        class TFieldSource
         {
-            this->cellDescription = cellDescription;
-            this->movingWindow = movingWindow;
-        }
+        public:
+            static const size_t feature_dim = 3;
+            static const bool has_guard = bmpl::not_<boost::is_same<FieldType, FieldJ>>::value;
+            static const bool persistent = bmpl::not_<boost::is_same<FieldType, FieldJ>>::value;
+            typename FieldType::DataBoxType shifted;
+            MappingDesc* cellDescription;
+            bool movingWindow;
+            TFieldSource() : cellDescription(nullptr), movingWindow(false)
+            {
+            }
 
-        static std::string getName()
-        {
-            return FieldType::getName() + std::string(" field");
-        }
+            void init(MappingDesc* cellDescription, bool movingWindow)
+            {
+                this->cellDescription = cellDescription;
+                this->movingWindow = movingWindow;
+            }
 
-        void update(bool enabled, void* pointer)
-        {
-            if(enabled)
+            static std::string getName()
             {
-                const SubGrid<simDim>& subGrid = Environment< simDim >::get().SubGrid();
-                DataConnector &dc = Environment< simDim >::get().DataConnector();
-                auto pField = dc.get< FieldType >( FieldType::getName(), true );
-                DataSpace< simDim > guarding = SuperCellSize::toRT() * cellDescription->getGuardingSuperCells();
-                if (movingWindow)
+                return FieldType::getName() + std::string(" field");
+            }
+
+            void update(bool enabled, void* pointer)
+            {
+                if(enabled)
                 {
-                    GridController<simDim> &gc = Environment<simDim>::get().GridController();
-                    if (gc.getPosition()[1] == 0) //first gpu
+                    const SubGrid<simDim>& subGrid = Environment<simDim>::get().SubGrid();
+                    DataConnector& dc = Environment<simDim>::get().DataConnector();
+                    auto pField = dc.get<FieldType>(FieldType::getName(), true);
+                    DataSpace<simDim> guarding = SuperCellSize::toRT() * cellDescription->getGuardingSuperCells();
+                    if(movingWindow)
                     {
-                        uint32_t* currentStep = (uint32_t*)pointer;
-                        Window window( MovingWindow::getInstance().getWindow( *currentStep ) );
-                        guarding += subGrid.getLocalDomain().size - window.localDimensions.size;
+                        GridController<simDim>& gc = Environment<simDim>::get().GridController();
+                        if(gc.getPosition()[1] == 0) // first gpu
+                        {
+                            uint32_t* currentStep = (uint32_t*) pointer;
+                            Window window(MovingWindow::getInstance().getWindow(*currentStep));
+                            guarding += subGrid.getLocalDomain().size - window.localDimensions.size;
+                        }
                     }
+                    typename FieldType::DataBoxType dataBox = pField->getDeviceDataBox();
+                    shifted = dataBox.shift(guarding);
+                    dc.releaseData(FieldType::getName());
+                    /* avoid deadlock between not finished pmacc tasks and potential blocking operations
+                     * within ISAAC
+                     */
+                    __getTransactionEvent().waitForFinished();
                 }
-                typename FieldType::DataBoxType dataBox = pField->getDeviceDataBox();
-                shifted = dataBox.shift( guarding );
-                dc.releaseData( FieldType::getName() );
-                /* avoid deadlock between not finished pmacc tasks and potential blocking operations
-                * within ISAAC
-                */
-                __getTransactionEvent().waitForFinished();
             }
 
-        }
+            ISAAC_NO_HOST_DEVICE_WARNING
+            ISAAC_HOST_DEVICE_INLINE isaac_float_dim<feature_dim> operator[](const isaac_int3& nIndex) const
+            {
+                auto value = shifted[nIndex.z][nIndex.y][nIndex.x]; // shifted data box is indexed [z][y][x]
+                return isaac_float_dim<feature_dim>(value.x(), value.y(), value.z());
+            }
+        };
 
         ISAAC_NO_HOST_DEVICE_WARNING
-        ISAAC_HOST_DEVICE_INLINE isaac_float_dim< feature_dim > operator[] (const isaac_int3& nIndex) const
+        template<typename FrameSolver, typename ParticleType>
+        class TFieldSource<FieldTmpOperation<FrameSolver, ParticleType>>
         {
-            auto value = shifted[nIndex.z][nIndex.y][nIndex.x];
-            isaac_float_dim< feature_dim > result =
+        public:
+            static const size_t feature_dim = 1;
+            static const bool has_guard = false;
+            static const bool persistent = false;
+            typename FieldTmp::DataBoxType shifted;
+            MappingDesc* cellDescription;
+            bool movingWindow;
+
+            TFieldSource() : cellDescription(nullptr), movingWindow(false)
             {
-                isaac_float( value.x() ),
-                isaac_float( value.y() ),
-                isaac_float( value.z() )
-            };
-            return result;
-        }
-};
-
-ISAAC_NO_HOST_DEVICE_WARNING
-template< typename FrameSolver, typename ParticleType >
-class TFieldSource< FieldTmpOperation< FrameSolver, ParticleType > >
-{
-    public:
-        static const size_t feature_dim = 1;
-        static const bool has_guard = false;
-        static const bool persistent = false;
-        typename FieldTmp::DataBoxType shifted;
-        MappingDesc *cellDescription;
-        bool movingWindow;
-
-        TFieldSource() : cellDescription(nullptr), movingWindow(false) {}
+            }
 
-        void init(MappingDesc *cellDescription, bool movingWindow)
-        {
-            this->cellDescription = cellDescription;
-            this->movingWindow = movingWindow;
-        }
+            void init(MappingDesc* cellDescription, bool movingWindow)
+            {
+                this->cellDescription = cellDescription;
+                this->movingWindow = movingWindow;
+            }
 
-        static std::string getName()
-        {
-            return ParticleType::FrameType::getName() + std::string(" ") + FrameSolver().getName();
-        }
+            static std::string getName()
+            {
+                return ParticleType::FrameType::getName() + std::string(" ") + FrameSolver().getName();
+            }
 
-        void update(bool enabled, void* pointer)
-        {
-            if (enabled)
+            void update(bool enabled, void* pointer)
             {
-                uint32_t* currentStep = (uint32_t*)pointer;
-                const SubGrid<simDim>& subGrid = Environment< simDim >::get().SubGrid();
-                DataConnector &dc = Environment< simDim >::get().DataConnector();
+                if(enabled)
+                {
+                    uint32_t* currentStep = (uint32_t*) pointer;
+                    const SubGrid<simDim>& subGrid = Environment<simDim>::get().SubGrid();
+                    DataConnector& dc = Environment<simDim>::get().DataConnector();
 
-                PMACC_CASSERT_MSG(
-                    _please_allocate_at_least_one_FieldTmp_in_memory_param,
-                    fieldTmpNumSlots > 0
-                );
-                auto fieldTmp = dc.get< FieldTmp >( FieldTmp::getUniqueId( 0 ), true );
-                auto particles = dc.get< ParticleType >( ParticleType::FrameType::getName(), true );
+                    PMACC_CASSERT_MSG(_please_allocate_at_least_one_FieldTmp_in_memory_param, fieldTmpNumSlots > 0);
+                    auto fieldTmp = dc.get<FieldTmp>(FieldTmp::getUniqueId(0), true);
+                    auto particles = dc.get<ParticleType>(ParticleType::FrameType::getName(), true);
 
-                fieldTmp->getGridBuffer().getDeviceBuffer().setValue( FieldTmp::ValueType(0.0) );
-                fieldTmp->template computeValue < CORE + BORDER, FrameSolver > (*particles, *currentStep);
-                EventTask fieldTmpEvent = fieldTmp->asyncCommunication(__getTransactionEvent());
+                    fieldTmp->getGridBuffer().getDeviceBuffer().setValue(FieldTmp::ValueType(0.0));
+                    fieldTmp->template computeValue<CORE + BORDER, FrameSolver>(*particles, *currentStep);
+                    EventTask fieldTmpEvent = fieldTmp->asyncCommunication(__getTransactionEvent());
 
-                __setTransactionEvent(fieldTmpEvent);
-                __getTransactionEvent().waitForFinished();
+                    __setTransactionEvent(fieldTmpEvent);
+                    __getTransactionEvent().waitForFinished();
 
-                dc.releaseData( ParticleType::FrameType::getName() );
+                    dc.releaseData(ParticleType::FrameType::getName());
 
-                DataSpace< simDim > guarding = SuperCellSize::toRT() * cellDescription->getGuardingSuperCells();
-                if (movingWindow)
-                {
-                    GridController<simDim> &gc = Environment<simDim>::get().GridController();
-                    if (gc.getPosition()[1] == 0) //first gpu
+                    DataSpace<simDim> guarding = SuperCellSize::toRT() * cellDescription->getGuardingSuperCells();
+                    if(movingWindow)
                     {
-                        Window window(MovingWindow::getInstance().getWindow( *currentStep ));
-                        guarding += subGrid.getLocalDomain().size - window.localDimensions.size;
+                        GridController<simDim>& gc = Environment<simDim>::get().GridController();
+                        if(gc.getPosition()[1] == 0) // first gpu
+                        {
+                            Window window(MovingWindow::getInstance().getWindow(*currentStep));
+                            guarding += subGrid.getLocalDomain().size - window.localDimensions.size;
+                        }
                     }
+                    typename FieldTmp::DataBoxType dataBox = fieldTmp->getDeviceDataBox();
+                    shifted = dataBox.shift(guarding);
+                    dc.releaseData(FieldTmp::getUniqueId(0));
                 }
-                typename FieldTmp::DataBoxType dataBox = fieldTmp->getDeviceDataBox();
-                shifted = dataBox.shift( guarding );
-                dc.releaseData( FieldTmp::getUniqueId( 0 ) );
             }
-        }
-
-        ISAAC_NO_HOST_DEVICE_WARNING
-        ISAAC_HOST_DEVICE_INLINE isaac_float_dim< feature_dim > operator[] (const isaac_int3& nIndex) const
-        {
-            auto value = shifted[nIndex.z][nIndex.y][nIndex.x];
-            isaac_float_dim< feature_dim > result = { isaac_float( value.x() ) };
-            return result;
-        }
-};
 
+            ISAAC_NO_HOST_DEVICE_WARNING
+            ISAAC_HOST_DEVICE_INLINE isaac_float_dim<feature_dim> operator[](const isaac_int3& nIndex) const
+            {
+                auto value = shifted[nIndex.z][nIndex.y][nIndex.x]; // shifted data box is indexed [z][y][x]
+                return isaac_float_dim<feature_dim>(value.x()); // feature_dim is 1: only x() is forwarded
+            }
+        };
 
-template<size_t feature_dim, typename ParticlesBoxType>
-class ParticleIterator
-{
-    public:
-        using FramePtr = typename ParticlesBoxType::FramePtr;
-        // size of the particle list
-        size_t size;
 
-        ISAAC_NO_HOST_DEVICE_WARNING
-        ISAAC_HOST_DEVICE_INLINE ParticleIterator(size_t size, ParticlesBoxType pb, FramePtr firstFrame, int frameSize) :
-            size(size),
-            pb(pb),
-            frame(firstFrame),
-            frameSize(frameSize),
-            i(0)
-            {}
-
-        ISAAC_HOST_DEVICE_INLINE void next()
+        template<size_t feature_dim, typename ParticlesBoxType>
+        class ParticleIterator
         {
-            // iterate particles look for next frame
-            i++;
-            if(i >= frameSize)
+        public:
+            using FramePtr = typename ParticlesBoxType::FramePtr;
+            // size of the particle list
+            size_t size;
+
+            ISAAC_NO_HOST_DEVICE_WARNING
+            ISAAC_HOST_DEVICE_INLINE ParticleIterator(
+                size_t size,
+                ParticlesBoxType pb,
+                FramePtr firstFrame,
+                int frameSize)
+                : size(size)
+                , pb(pb)
+                , frame(firstFrame)
+                , frameSize(frameSize)
+                , i(0)
             {
-                frame = pb.getNextFrame(frame);
-                i = 0;
             }
-        }
-
-        // returns current particle position
-        ISAAC_HOST_DEVICE_INLINE isaac_float3 getPosition() const
-        {
-            auto const particle = frame[ i ];
-
-            // storage number in the actual frame
-            const auto frameCellNr = particle[ localCellIdx_];
 
-            // offset in the actual superCell = cell offset in the supercell
-            const DataSpace<simDim> frameCellOffset(DataSpaceOperations<simDim>::template map<MappingDesc::SuperCellSize > (frameCellNr));
-
-            // added offsets
-            float3_X const absoluteOffset(particle[ position_ ] + float3_X(frameCellOffset));
+            ISAAC_HOST_DEVICE_INLINE void next()
+            {
+                // step to the next particle slot; when the frame is exhausted, continue at slot 0 of the next frame
+                ++i;
+                if(i >= frameSize)
+                {
+                    frame = pb.getNextFrame(frame);
+                    i = 0;
+                }
+            }
 
-            // calculate scaled position
-            float3_X const pos(
-            absoluteOffset.x() * (1._X / float_X(MappingDesc::SuperCellSize::x::value)),
-            absoluteOffset.y() * (1._X / float_X(MappingDesc::SuperCellSize::y::value)),
-            absoluteOffset.z() * (1._X / float_X(MappingDesc::SuperCellSize::z::value))
+            // returns current particle position
+            ISAAC_HOST_DEVICE_INLINE isaac_float3 getPosition() const
+            {
+                auto const particle = frame[i];
 
-            );
+                // storage number in the actual frame
+                const auto frameCellNr = particle[localCellIdx_];
 
-            return {pos[0], pos[1], pos[2]};
-        }
+                // offset in the actual superCell = cell offset in the supercell
+                const DataSpace<simDim> frameCellOffset(
+                    DataSpaceOperations<simDim>::template map<MappingDesc::SuperCellSize>(frameCellNr));
 
-        // returns particle momentum as color attribute
-        ISAAC_HOST_DEVICE_INLINE isaac_float_dim<feature_dim> getAttribute() const
-        {
-            auto const particle = frame[ i ];
-            float3_X const mom = particle[ momentum_ ];
-            return {mom[0], mom[1], mom[2]};
-        }
+                // added offsets
+                float3_X const absoluteOffset(particle[position_] + float3_X(frameCellOffset));
 
+                // calculate scaled position
+                isaac_float3 const pos(
+                    absoluteOffset.x() * (1._X / float_X(MappingDesc::SuperCellSize::x::value)),
+                    absoluteOffset.y() * (1._X / float_X(MappingDesc::SuperCellSize::y::value)),
+                    absoluteOffset.z() * (1._X / float_X(MappingDesc::SuperCellSize::z::value)));
 
-        // returns constant radius
-        ISAAC_HOST_DEVICE_INLINE isaac_float getRadius() const
-        {
-            return 0.2f;
-        }
+                return pos;
+            }
 
+            // returns the three momentum components of the current particle (frame[i]) as color attribute
+            ISAAC_HOST_DEVICE_INLINE isaac_float_dim<feature_dim> getAttribute() const
+            {
+                auto const particle = frame[i];
+                float3_X const mom = particle[momentum_];
+                return isaac_float_dim<feature_dim>(mom[0], mom[1], mom[2]);
+            }
 
-    private:
-        ParticlesBoxType pb;
-        FramePtr frame;
-        int i;
-        int frameSize;
-};
 
+            // returns constant radius
+            ISAAC_HOST_DEVICE_INLINE isaac_float getRadius() const
+            {
+                return 0.2f;
+            }
 
 
-ISAAC_NO_HOST_DEVICE_WARNING
-template< typename ParticlesType >
-class ParticleSource
-{
+        private:
+            ParticlesBoxType pb; // particle box queried for the next frame in next()
+            FramePtr frame; // frame currently being traversed
+            int frameSize; // slots per frame; declared before i to match the constructor's initializer order
+            int i; // slot index of the current particle inside frame
+        };
 
-    using ParticlesBoxType = typename ParticlesType::ParticlesBoxType;
-    using FramePtr = typename ParticlesBoxType::FramePtr;
-    using FrameType = typename ParticlesBoxType::FrameType;
 
-    public:
-        static const size_t feature_dim = 3;
-        bool movingWindow;
-        DataSpace< simDim > guarding;
         ISAAC_NO_HOST_DEVICE_WARNING
-        ParticleSource ()
-        {}
-
-        ISAAC_HOST_INLINE static std::string getName()
+        template<typename ParticlesType>
+        class ParticleSource
         {
-            return ParticlesType::FrameType::getName() + std::string(" particle");
-        }
+            using ParticlesBoxType = typename ParticlesType::ParticlesBoxType;
+            using FramePtr = typename ParticlesBoxType::FramePtr;
+            using FrameType = typename ParticlesBoxType::FrameType;
+
+        public:
+            static const size_t feature_dim = 3;
+            bool movingWindow;
+            DataSpace<simDim> guarding;
+            ISAAC_NO_HOST_DEVICE_WARNING
+            ParticleSource() : movingWindow(false) // defined value until init() is called, as in TFieldSource
+            {
+            }
 
-        pmacc::memory::Array<ParticlesBoxType,1> pb;
+            ISAAC_HOST_INLINE static std::string getName()
+            {
+                return ParticlesType::FrameType::getName() + std::string(" particle");
+            }
 
-        void init(bool movingWindow)
-        {
-            this->movingWindow = movingWindow;
-        }
+            pmacc::memory::Array<ParticlesBoxType, 1> pb;
 
-        void update(bool enabled, void* pointer)
-        {
-            // update movingWindow cells
-            if (enabled)
+            void init(bool movingWindow)
             {
-                uint32_t* currentStep = (uint32_t*)pointer;
-                DataConnector &dc = Environment<>::get().DataConnector();
-                auto particles = dc.get< ParticlesType >( ParticlesType::FrameType::getName(), true );
-                pb[0] = particles->getDeviceParticlesBox();
-
-                const SubGrid<simDim>& subGrid = Environment< simDim >::get().SubGrid();
-                guarding = GuardSize::toRT();
-                if (movingWindow)
+                this->movingWindow = movingWindow;
+            }
+
+            void update(bool enabled, void* pointer)
+            {
+                // update movingWindow cells
+                if(enabled)
                 {
-                    GridController<simDim> &gc = Environment<simDim>::get().GridController();
-                    if (gc.getPosition()[1] == 0) //first gpu
+                    uint32_t* currentStep = (uint32_t*) pointer;
+                    DataConnector& dc = Environment<>::get().DataConnector();
+                    auto particles = dc.get<ParticlesType>(ParticlesType::FrameType::getName(), true);
+                    pb[0] = particles->getDeviceParticlesBox();
+
+                    const SubGrid<simDim>& subGrid = Environment<simDim>::get().SubGrid();
+                    guarding = GuardSize::toRT();
+                    if(movingWindow)
                     {
-                        Window window(MovingWindow::getInstance().getWindow( *currentStep ));
-                        for(uint i = 0; i < simDim; i++)
-                            guarding[i] += int(math::ceil((subGrid.getLocalDomain().size[i] - window.localDimensions.size[i]) / (float)MappingDesc::SuperCellSize::toRT()[i]));
+                        GridController<simDim>& gc = Environment<simDim>::get().GridController();
+                        if(gc.getPosition()[1] == 0) // first gpu
+                        {
+                            Window window(MovingWindow::getInstance().getWindow(*currentStep));
+                            for(uint32_t i = 0; i < simDim; i++)
+                                guarding[i] += int(math::ceil(
+                                    (subGrid.getLocalDomain().size[i] - window.localDimensions.size[i])
+                                    / (float) MappingDesc::SuperCellSize::toRT()[i]));
+                        }
                     }
+                    dc.releaseData(ParticlesType::FrameType::getName());
                 }
-                dc.releaseData( ParticlesType::FrameType::getName() );
             }
-        }
 
-        // returns particleIterator with correct feature_dim and cell specific particlebox
-        ISAAC_NO_HOST_DEVICE_WARNING
-        ISAAC_HOST_DEVICE_INLINE ParticleIterator<feature_dim, ParticlesBoxType> getIterator(const isaac_uint3& local_grid_coord) const
+            // returns a ParticleIterator over the supercell at local_grid_coord (shifted by guarding) of pb[0]
+            ISAAC_NO_HOST_DEVICE_WARNING
+            ISAAC_HOST_DEVICE_INLINE ParticleIterator<feature_dim, ParticlesBoxType> getIterator(
+                const isaac_uint3& local_grid_coord) const
+            {
+                constexpr uint32_t frameSize = pmacc::math::CT::volume<typename FrameType::SuperCellSize>::type::value;
+                DataSpace<simDim> const superCellIdx(
+                    local_grid_coord.x + guarding[0],
+                    local_grid_coord.y + guarding[1],
+                    local_grid_coord.z + guarding[2]); // NOTE(review): presumably assumes simDim == 3 - confirm
+                const auto& superCell = pb[0].getSuperCell(superCellIdx);
+                size_t size = superCell.getNumParticles(); // particle count reported by this supercell
+                FramePtr currentFrame = pb[0].getFirstFrame(superCellIdx);
+                return ParticleIterator<feature_dim, ParticlesBoxType>(size, pb[0], currentFrame, frameSize);
+            }
+        };
+
+        template<typename T>
+        struct Transformoperator
         {
-            constexpr uint32_t frameSize = pmacc::math::CT::volume< typename FrameType::SuperCellSize >::type::value;
-            DataSpace< simDim > const superCellIdx( local_grid_coord.x + guarding[0], local_grid_coord.y + guarding[1], local_grid_coord.z + guarding[2] );
-            const auto & superCell = pb[0].getSuperCell( superCellIdx );
-            size_t size = superCell.getNumParticles();
-            FramePtr currentFrame = pb[0].getFirstFrame( superCellIdx );
-            return ParticleIterator<feature_dim, ParticlesBoxType>( size, pb[0], currentFrame, frameSize );
-        }
-};
-
-template< typename T >
-struct Transformoperator
-{
-    typedef TFieldSource< T > type;
-};
-template< typename T >
-struct ParticleTransformoperator
-{
-    typedef ParticleSource< T > type;
-};
+            typedef TFieldSource<T> type;
+        };
+        template<typename T>
+        struct ParticleTransformoperator
+        {
+            typedef ParticleSource<T> type;
+        };
 
-struct SourceInitIterator
-{
-    template
-    <
-        typename TSource,
-        typename TCellDescription,
-        typename TMovingWindow
-    >
-    void operator()( const int I, TSource& s, TCellDescription& c, TMovingWindow& w) const
-    {
-        s.init(c,w);
-    }
-};
+        struct SourceInitIterator // functor that forwards (c, w) to s.init()
+        {
+            template<typename TSource, typename TCellDescription, typename TMovingWindow>
+            void operator()(const int I, TSource& s, TCellDescription& c, TMovingWindow& w) const // I is unused
+            {
+                s.init(c, w);
+            }
+        };
 
-struct ParticleSourceInitIterator
-{
-    template
-    <
-        typename TParticleSource,
-        typename TMovingWindow
-    >
-    void operator()( const int I, TParticleSource& s, TMovingWindow& w) const
-    {
-        s.init(w);
-    }
-};
+        struct ParticleSourceInitIterator // functor that forwards w to s.init()
+        {
+            template<typename TParticleSource, typename TMovingWindow>
+            void operator()(const int I, TParticleSource& s, TMovingWindow& w) const // I is unused
+            {
+                s.init(w);
+            }
+        };
 
 
-class IsaacPlugin : public ILightweightPlugin
-{
-public:
-    typedef boost::mpl::int_< simDim > SimDim;
-    static const size_t textureDim = 1024;
-    using SourceList = bmpl::transform<boost::fusion::result_of::as_list< Fields_Seq >::type,Transformoperator<bmpl::_1>>::type;
-    // create compile time particle list
-    using ParticleList = bmpl::transform<boost::fusion::result_of::as_list< Particle_Seq >::type,ParticleTransformoperator<bmpl::_1>>::type;
-    using VisualizationType = IsaacVisualization
-    <
-        cupla::AccHost,
-        cupla::Acc,
-        cupla::AccStream,
-        cupla::KernelDim,
-        SimDim,
-        ParticleList,
-        SourceList,
-        DataSpace< simDim >,
-        textureDim,
-        float3_X,
-#if( ISAAC_STEREO == 0 )
-            isaac::DefaultController,
-            isaac::DefaultCompositor
+        class IsaacPlugin : public ILightweightPlugin
+        {
+        public:
+            static const ISAAC_IDX_TYPE textureDim = 1024;
+            using SourceList = bmpl::
+                transform<boost::fusion::result_of::as_list<Fields_Seq>::type, Transformoperator<bmpl::_1>>::type;
+            // create compile time particle list
+            using ParticleList = bmpl::transform<
+                boost::fusion::result_of::as_list<Particle_Seq>::type,
+                ParticleTransformoperator<bmpl::_1>>::type;
+            using VisualizationType = IsaacVisualization<
+                cupla::AccHost,
+                cupla::Acc,
+                cupla::AccStream,
+                cupla::KernelDim,
+                ParticleList,
+                SourceList,
+                textureDim,
+#if(ISAAC_STEREO == 0)
+                isaac::DefaultController,
+                isaac::DefaultCompositor
 #else
-            isaac::StereoController,
-#   if( ISAAC_STEREO == 1 )
+                isaac::StereoController,
+#    if(ISAAC_STEREO == 1)
                 isaac::StereoCompositorSideBySide<isaac::StereoController>
-#   else
-                isaac::StereoCompositorAnaglyph<isaac::StereoController,0x000000FF,0x00FFFF00>
-#   endif
+#    else
+                isaac::StereoCompositorAnaglyph<isaac::StereoController, 0x000000FF, 0x00FFFF00>
+#    endif
 #endif
-    >;
-    VisualizationType * visualization;
-
-    IsaacPlugin() :
-        visualization(nullptr),
-        cellDescription(nullptr),
-        movingWindow(false),
-        render_interval(1),
-        step(0),
-        drawing_time(0),
-        cell_count(0),
-        particle_count(0),
-        last_notify(0)
-    {
-        Environment<>::get().PluginConnector().registerPlugin(this);
-    }
+                >;
+            VisualizationType* visualization;
+
+            IsaacPlugin() // zero-initializes the plugin state and registers the plugin
+                : visualization(nullptr)
+                , cellDescription(nullptr)
+                , movingWindow(false)
+                , render_interval(1)
+                , step(0)
+                , drawing_time(0)
+                , cell_count(0)
+                , particle_count(0)
+                , last_notify(0) // NOTE(review): direct_pause is not set here; confirm it is assigned before use
+            {
+                Environment<>::get().PluginConnector().registerPlugin(this); // self-registration with PluginConnector
+            }
 
-    std::string pluginGetName() const
-    {
-        return "IsaacPlugin";
-    }
+            std::string pluginGetName() const
+            {
+                return "IsaacPlugin";
+            }
 
-    void notify(uint32_t currentStep)
-    {
-        uint64_t simulation_time = visualization->getTicksUs() - last_notify;
-        step++;
-        if (step >= render_interval)
-        {
-            step = 0;
-            bool pause = false;
-            do
+            void notify(uint32_t currentStep)
             {
-                //update of the position for moving window simulations
-                if ( movingWindow )
+                uint64_t simulation_time = visualization->getTicksUs() - last_notify;
+                step++;
+                if(step >= render_interval)
                 {
-                    Window window(MovingWindow::getInstance().getWindow( currentStep ));
-                    visualization->updatePosition( window.localDimensions.offset );
-                    visualization->updateLocalSize( window.localDimensions.size );
-                    visualization->updateLocalParticleSize( window.localDimensions.size / MappingDesc::SuperCellSize::toRT());
-                    visualization->updateBounding();
-                }
-                if (rank == 0 && visualization->kernel_time)
-                {
-                    json_object_set_new( visualization->getJsonMetaRoot(), "time step", json_integer( currentStep ) );
-                    json_object_set_new( visualization->getJsonMetaRoot(), "drawing_time" , json_integer( drawing_time ) );
-                    json_object_set_new( visualization->getJsonMetaRoot(), "simulation_time", json_integer( simulation_time ) );
-                    simulation_time = 0;
-                    json_object_set_new( visualization->getJsonMetaRoot(), "cell count", json_integer( cell_count ) );
-                    json_object_set_new( visualization->getJsonMetaRoot(), "particle count", json_integer( particle_count ) );
-                }
-                uint64_t start = visualization->getTicksUs();
-                json_t* meta = visualization->doVisualization(META_MASTER, &currentStep, !pause);
-                drawing_time = visualization->getTicksUs() - start;
-                json_t* json_pause = nullptr;
-                if ( meta && (json_pause = json_object_get(meta, "pause")) && json_boolean_value( json_pause ) )
-                    pause = !pause;
-                if ( meta && json_integer_value( json_object_get(meta, "exit") ) )
-                    exit(1);
-                json_t* js;
-                if ( meta && ( js = json_object_get(meta, "interval") ) )
-                {
-                    render_interval = math::max( int(1), int( json_integer_value ( js ) ) );
-                    //Feedback for other clients than the changing one
-                    if (rank == 0)
-                        json_object_set_new( visualization->getJsonMetaRoot(), "interval", json_integer( render_interval ) );
-                }
-                json_decref( meta );
-                if (direct_pause)
-                {
-                    pause = true;
-                    direct_pause = false;
+                    step = 0;
+                    bool pause = false;
+                    do
+                    {
+                        // update of the position for moving window simulations
+                        if(movingWindow)
+                        {
+                            Window window(MovingWindow::getInstance().getWindow(currentStep));
+                            isaac_size3 position;
+                            isaac_size3 local_size;
+                            isaac_size3 particle_size;
+
+                            for(ISAAC_IDX_TYPE i = 0; i < 3; ++i)
+                            {
+                                position[i] = window.localDimensions.offset[i];
+                                local_size[i] = window.localDimensions.size[i];
+                                particle_size[i]
+                                    = window.localDimensions.size[i] / MappingDesc::SuperCellSize::toRT()[i];
+                            }
+                            visualization->updatePosition(position);
+                            visualization->updateLocalSize(local_size);
+                            visualization->updateLocalParticleSize(particle_size);
+                            visualization->updateBounding();
+                        }
+                        if(rank == 0 && visualization->kernel_time)
+                        {
+                            json_object_set_new(
+                                visualization->getJsonMetaRoot(),
+                                "time step",
+                                json_integer(currentStep));
+                            json_object_set_new(
+                                visualization->getJsonMetaRoot(),
+                                "drawing_time",
+                                json_integer(drawing_time));
+                            json_object_set_new(
+                                visualization->getJsonMetaRoot(),
+                                "simulation_time",
+                                json_integer(simulation_time));
+                            simulation_time = 0;
+                            json_object_set_new(
+                                visualization->getJsonMetaRoot(),
+                                "cell count",
+                                json_integer(cell_count));
+                            json_object_set_new(
+                                visualization->getJsonMetaRoot(),
+                                "particle count",
+                                json_integer(particle_count));
+                        }
+                        uint64_t start = visualization->getTicksUs();
+                        json_t* meta = visualization->doVisualization(META_MASTER, &currentStep, !pause);
+                        drawing_time = visualization->getTicksUs() - start;
+                        json_t* json_pause = nullptr;
+                        if(meta && (json_pause = json_object_get(meta, "pause")) && json_boolean_value(json_pause))
+                            pause = !pause;
+                        if(meta && json_integer_value(json_object_get(meta, "exit")))
+                            exit(1);
+                        json_t* js;
+                        if(meta && (js = json_object_get(meta, "interval")))
+                        {
+                            render_interval = math::max(int(1), int(json_integer_value(js)));
+                            // Feedback for other clients than the changing one
+                            if(rank == 0)
+                                json_object_set_new(
+                                    visualization->getJsonMetaRoot(),
+                                    "interval",
+                                    json_integer(render_interval));
+                        }
+                        json_decref(meta);
+                        if(direct_pause)
+                        {
+                            pause = true;
+                            direct_pause = false;
+                        }
+                    } while(pause);
                 }
+                last_notify = visualization->getTicksUs();
             }
-            while (pause);
-        }
-        last_notify = visualization->getTicksUs();
-    }
 
-    void pluginRegisterHelp(po::options_description& desc)
-    {
-        /* register command line parameters for your plugin */
-        desc.add_options()
-            ("isaac.period", po::value< std::string > (&notifyPeriod),
-             "Enable IsaacPlugin [for each n-th step].")
-            ("isaac.name", po::value< std::string > (&name)->default_value("default"),
-             "The name of the simulation. Default is \"default\".")
-            ("isaac.url", po::value< std::string > (&url)->default_value("localhost"),
-             "The url of the isaac server to connect to. Default is \"localhost\".")
-            ("isaac.port", po::value< uint16_t > (&port)->default_value(2460),
-             "The port of the isaac server to connect to. Default is 2460.")
-            ("isaac.width", po::value< uint32_t > (&width)->default_value(1024),
-             "The width per isaac framebuffer. Default is 1024.")
-            ("isaac.height", po::value< uint32_t > (&height)->default_value(768),
-             "The height per isaac framebuffer. Default is 768.")
-            ("isaac.directPause", po::value< bool > (&direct_pause)->default_value(false),
-             "Direct pausing after starting simulation. Default is false.")
-            ("isaac.quality", po::value< uint32_t > (&jpeg_quality)->default_value(90),
-             "JPEG quality. Default is 90.")
-            ("isaac.reconnect", po::value< bool > (&reconnect)->default_value(true),
-             "Trying to reconnect every time an image is rendered if the connection is lost or could never established at all.")
-            ;
-    }
-
-    void setMappingDescription(MappingDesc *cellDescription)
-    {
-        this->cellDescription = cellDescription;
-    }
-
-private:
-    MappingDesc *cellDescription;
-    std::string notifyPeriod;
-    std::string url;
-    std::string name;
-    uint16_t port;
-    uint32_t count;
-    uint32_t width;
-    uint32_t height;
-    uint32_t jpeg_quality;
-    int rank;
-    int numProc;
-    bool movingWindow;
-    ParticleList particleSources;
-    SourceList sources;
-    /** render interval within the notify period
-     *
-     * render each n-th time step within an interval defined by notifyPeriod
-     */
-    uint32_t render_interval;
-    uint32_t step;
-    int drawing_time;
-    bool direct_pause;
-    int cell_count;
-    int particle_count;
-    uint64_t last_notify;
-    bool reconnect;
-
-    void pluginLoad()
-    {
-        if(!notifyPeriod.empty())
-        {
-            MPI_Comm_rank(MPI_COMM_WORLD, &rank);
-            MPI_Comm_size(MPI_COMM_WORLD, &numProc);
-            if ( MovingWindow::getInstance().isEnabled() )
-                movingWindow = true;
-            float_X minCellSize = math::min( cellSize[0], math::min( cellSize[1], cellSize[2] ) );
-            float3_X cellSizeFactor = cellSize / minCellSize;
-
-            const SubGrid<simDim>& subGrid = Environment< simDim >::get().SubGrid();
-
-            isaac_size2 framebuffer_size =
+            void pluginRegisterHelp(po::options_description& desc)
             {
-                cupla::IdxType(width),
-                cupla::IdxType(height)
-            };
-
-            isaac_for_each_params( sources, SourceInitIterator(), cellDescription, movingWindow );
-            isaac_for_each_params( particleSources, ParticleSourceInitIterator(), movingWindow);
-
-            visualization = new VisualizationType (
-                cupla::manager::Device< cupla::AccHost >::get().current( ),
-                cupla::manager::Device< cupla::AccDev >::get().current( ),
-                cupla::manager::Stream< cupla::AccDev, cupla::AccStream >::get().stream( ),
-                name,
-                0,
-                url,
-                port,
-                framebuffer_size,
-                MovingWindow::getInstance().getWindow( 0 ).globalDimensions.size,
-                subGrid.getLocalDomain().size,
-                subGrid.getLocalDomain().size / SuperCellSize::toRT(),
-                subGrid.getLocalDomain().offset,
-                particleSources,
-                sources,
-                cellSizeFactor
-            );
-            visualization->setJpegQuality(jpeg_quality);
-            //Defining the later periodicly sent meta data
-            if (rank == 0)
+                /* register command line parameters for your plugin */
+                desc.add_options()(
+                    "isaac.period",
+                    po::value<std::string>(&notifyPeriod),
+                    "Enable IsaacPlugin [for each n-th step].")(
+                    "isaac.name",
+                    po::value<std::string>(&name)->default_value("default"),
+                    "The name of the simulation. Default is \"default\".")(
+                    "isaac.url",
+                    po::value<std::string>(&url)->default_value("localhost"),
+                    "The url of the isaac server to connect to. Default is \"localhost\".")(
+                    "isaac.port",
+                    po::value<uint16_t>(&port)->default_value(2460),
+                    "The port of the isaac server to connect to. Default is 2460.")(
+                    "isaac.width",
+                    po::value<uint32_t>(&width)->default_value(1024),
+                    "The width per isaac framebuffer. Default is 1024.")(
+                    "isaac.height",
+                    po::value<uint32_t>(&height)->default_value(768),
+                    "The height per isaac framebuffer. Default is 768.")(
+                    "isaac.directPause",
+                    po::value<bool>(&direct_pause)->default_value(false),
+                    "Direct pausing after starting simulation. Default is false.")(
+                    "isaac.quality",
+                    po::value<uint32_t>(&jpeg_quality)->default_value(90),
+                    "JPEG quality. Default is 90.")(
+                    "isaac.reconnect",
+                    po::value<bool>(&reconnect)->default_value(true),
+                    "Trying to reconnect every time an image is rendered if the connection is lost or could never "
+                    "established at all.");
+            }
+
+            void setMappingDescription(MappingDesc* cellDescription)
             {
-                json_object_set_new( visualization->getJsonMetaRoot(), "time step", json_string( "Time step" ) );
-                json_object_set_new( visualization->getJsonMetaRoot(), "drawing time", json_string( "Drawing time in us" ) );
-                json_object_set_new( visualization->getJsonMetaRoot(), "simulation time", json_string( "Simulation time in us" ) );
-                json_object_set_new( visualization->getJsonMetaRoot(), "cell count", json_string( "Total numbers of cells" ) );
-                json_object_set_new( visualization->getJsonMetaRoot(), "particle count", json_string( "Total numbers of particles" ) );
+                this->cellDescription = cellDescription;
             }
-            CommunicatorSetting communicatorBehaviour = reconnect ? RetryEverySend : ReturnAtError;
-            if (visualization->init( communicatorBehaviour ) != 0)
+
+        private:
+            MappingDesc* cellDescription;
+            std::string notifyPeriod;
+            std::string url;
+            std::string name;
+            uint16_t port;
+            uint32_t count;
+            uint32_t width;
+            uint32_t height;
+            uint32_t jpeg_quality;
+            int rank;
+            int numProc;
+            bool movingWindow;
+            ParticleList particleSources;
+            SourceList sources;
+            /** render interval within the notify period
+             *
+             * render each n-th time step within an interval defined by notifyPeriod
+             */
+            uint32_t render_interval;
+            uint32_t step;
+            int drawing_time;
+            bool direct_pause;
+            int cell_count;
+            int particle_count;
+            uint64_t last_notify;
+            bool reconnect;
+
+            void pluginLoad()
             {
-                if (rank == 0)
-                    log<picLog::INPUT_OUTPUT > ("ISAAC Init failed, disable plugin");
-                notifyPeriod = "";
+                if(!notifyPeriod.empty())
+                {
+                    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+                    MPI_Comm_size(MPI_COMM_WORLD, &numProc);
+                    if(MovingWindow::getInstance().isEnabled())
+                        movingWindow = true;
+                    isaac_float minCellSize = math::min(cellSize[0], math::min(cellSize[1], cellSize[2]));
+                    isaac_float3 cellSizeFactor(
+                        cellSize[0] / minCellSize,
+                        cellSize[1] / minCellSize,
+                        cellSize[2] / minCellSize);
+
+                    const SubGrid<simDim>& subGrid = Environment<simDim>::get().SubGrid();
+
+                    isaac_size2 framebuffer_size = {cupla::IdxType(width), cupla::IdxType(height)};
+
+                    isaac_for_each_params(sources, SourceInitIterator(), cellDescription, movingWindow);
+                    isaac_for_each_params(particleSources, ParticleSourceInitIterator(), movingWindow);
+
+                    isaac_size3 global_size;
+                    isaac_size3 local_size;
+                    isaac_size3 particle_size;
+                    isaac_size3 position;
+                    for(ISAAC_IDX_TYPE i = 0; i < 3; ++i)
+                    {
+                        global_size[i] = MovingWindow::getInstance().getWindow(0).globalDimensions.size[i];
+                        local_size[i] = subGrid.getLocalDomain().size[i];
+                        particle_size[i] = subGrid.getLocalDomain().size[i] / SuperCellSize::toRT()[i];
+                        position[i] = subGrid.getLocalDomain().offset[i];
+                    }
+                    visualization = new VisualizationType(
+                        cupla::manager::Device<cupla::AccHost>::get().current(),
+                        cupla::manager::Device<cupla::AccDev>::get().current(),
+                        cupla::manager::Stream<cupla::AccDev, cupla::AccStream>::get().stream(),
+                        name,
+                        0,
+                        url,
+                        port,
+                        framebuffer_size,
+                        global_size,
+                        local_size,
+                        particle_size,
+                        position,
+                        particleSources,
+                        sources,
+                        cellSizeFactor);
+                    visualization->setJpegQuality(jpeg_quality);
+                    // Defining the meta data that is later sent periodically
+                    if(rank == 0)
+                    {
+                        json_object_set_new(visualization->getJsonMetaRoot(), "time step", json_string("Time step"));
+                        json_object_set_new(
+                            visualization->getJsonMetaRoot(),
+                            "drawing time",
+                            json_string("Drawing time in us"));
+                        json_object_set_new(
+                            visualization->getJsonMetaRoot(),
+                            "simulation time",
+                            json_string("Simulation time in us"));
+                        json_object_set_new(
+                            visualization->getJsonMetaRoot(),
+                            "cell count",
+                            json_string("Total numbers of cells"));
+                        json_object_set_new(
+                            visualization->getJsonMetaRoot(),
+                            "particle count",
+                            json_string("Total numbers of particles"));
+                    }
+                    CommunicatorSetting communicatorBehaviour = reconnect ? RetryEverySend : ReturnAtError;
+                    if(visualization->init(communicatorBehaviour) != 0)
+                    {
+                        if(rank == 0)
+                            log<picLog::INPUT_OUTPUT>("ISAAC Init failed, disable plugin");
+                        notifyPeriod = "";
+                    }
+                    else
+                    {
+                        const int localNrOfCells
+                            = cellDescription->getGridLayout().getDataSpaceWithoutGuarding().productOfComponents();
+                        cell_count = localNrOfCells * numProc;
+                        particle_count = localNrOfCells * particles::TYPICAL_PARTICLES_PER_CELL
+                            * (bmpl::size<VectorAllSpecies>::type::value) * numProc;
+                        last_notify = visualization->getTicksUs();
+                        if(rank == 0)
+                            log<picLog::INPUT_OUTPUT>("ISAAC Init succeded");
+                    }
+                }
+                Environment<>::get().PluginConnector().setNotificationPeriod(this, notifyPeriod);
             }
-            else
+
+            void pluginUnload()
             {
-                const int localNrOfCells = cellDescription->getGridLayout().getDataSpaceWithoutGuarding().productOfComponents();
-                cell_count = localNrOfCells * numProc;
-                particle_count = localNrOfCells * particles::TYPICAL_PARTICLES_PER_CELL * (bmpl::size<VectorAllSpecies>::type::value) * numProc;
-                last_notify = visualization->getTicksUs();
-                if (rank == 0)
-                    log<picLog::INPUT_OUTPUT > ("ISAAC Init succeded");
+                if(!notifyPeriod.empty())
+                {
+                    delete visualization;
+                    visualization = nullptr;
+                    if(rank == 0)
+                        log<picLog::INPUT_OUTPUT>("ISAAC finished");
+                }
             }
-        }
-        Environment<>::get().PluginConnector().setNotificationPeriod(this, notifyPeriod);
-    }
+        };
 
-    void pluginUnload()
-    {
-        if(!notifyPeriod.empty())
-        {
-            delete visualization;
-            visualization = nullptr;
-            if (rank == 0)
-                log<picLog::INPUT_OUTPUT > ("ISAAC finished");
-        }
-    }
-};
-
-} //namespace isaac;
-} //namespace picongpu;
+    } // namespace isaacP
+} // namespace picongpu
diff --git a/include/picongpu/plugins/PhaseSpace/AxisDescription.hpp b/include/picongpu/plugins/PhaseSpace/AxisDescription.hpp
index 8c27b0ade5..1fee962109 100644
--- a/include/picongpu/plugins/PhaseSpace/AxisDescription.hpp
+++ b/include/picongpu/plugins/PhaseSpace/AxisDescription.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2014-2020 Axel Huebl
+/* Copyright 2014-2021 Axel Huebl
  *
  * This file is part of PIConGPU.
  *
@@ -19,6 +19,9 @@
 
 #pragma once
 
+#include <stdexcept>
+#include <string>
+
 namespace picongpu
 {
     /** 2D Phase Space Selection
@@ -35,10 +38,58 @@ namespace picongpu
 
         /** short hand enums */
         enum element_momentum
-        { px = 0u, py = 1u, pz = 2u };
+        {
+            px = 0u,
+            py = 1u,
+            pz = 2u
+        };
 
         enum element_coordinate
-        { x = 0u, y = 1u, z = 2u };
+        {
+            x = 0u,
+            y = 1u,
+            z = 2u
+        };
+
+        /** Textual name of the selected momentum component
+         *
+         * \return "px", "py" or "pz", matching the value of momentum
+         * \throws std::runtime_error if momentum equals none of px, py, pz
+         */
+        std::string momentumAsString() const
+        {
+            switch(momentum)
+            {
+            case px:
+                return "px";
+            case py:
+                return "py";
+            case pz:
+                return "pz";
+            default:
+                throw std::runtime_error("Unreachable!");
+            }
+        }
+
+        /** Textual name of the selected spatial coordinate
+         *
+         * \return "x", "y" or "z", matching the value of space
+         * \throws std::runtime_error if space equals none of x, y, z
+         */
+        std::string spaceAsString() const
+        {
+            switch(space)
+            {
+            case x:
+                return "x";
+            case y:
+                return "y";
+            case z:
+                return "z";
+            default:
+                throw std::runtime_error("Unreachable!");
+            }
+        }
     };
 
 } /* namespace picongpu */
diff --git a/include/picongpu/plugins/PhaseSpace/DumpHBufferOpenPMD.hpp b/include/picongpu/plugins/PhaseSpace/DumpHBufferOpenPMD.hpp
new file mode 100644
index 0000000000..a425562a39
--- /dev/null
+++ b/include/picongpu/plugins/PhaseSpace/DumpHBufferOpenPMD.hpp
@@ -0,0 +1,230 @@
+/* Copyright 2013-2021 Axel Huebl, Rene Widera
+ *
+ * This file is part of PIConGPU.
+ *
+ * PIConGPU is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * PIConGPU is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with PIConGPU.
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "picongpu/simulation_defines.hpp"
+
+#include "picongpu/plugins/PhaseSpace/AxisDescription.hpp"
+#include <pmacc/communication/manager_common.hpp>
+#include <pmacc/mappings/simulation/GridController.hpp>
+#include <pmacc/mappings/simulation/SubGrid.hpp>
+#include <pmacc/dimensions/DataSpace.hpp>
+#include <pmacc/cuSTL/container/HostBuffer.hpp>
+#include <pmacc/math/vector/Int.hpp>
+#include <pmacc/verify.hpp>
+
+#include <string>
+#include <fstream>
+#include <memory>
+#include <sstream>
+#include <utility>
+#include <mpi.h>
+#include <openPMD/openPMD.hpp>
+#include <vector>
+
+namespace picongpu
+{
+    /** Writes a phase space host buffer into an openPMD series */
+    class DumpHBuffer
+    {
+    private:
+        using SuperCellSize = typename MappingDesc::SuperCellSize;
+
+    public:
+        /** Dump the PhaseSpace host Buffer
+         *
+         * \tparam T_Type the HBuffers element type
+         * \tparam T_bufDim the HBuffers dimension
+         * \param hBuffer const reference to the hBuffer, including guard cells in spatial dimension
+         * \param axis_element plot to create: e.g. py, x from momentum/spatial-coordinate
+         * \param axis_p_range pair whose first/second entry is written as attribute p_min/p_max
+         * \param pRange_unit value written as attribute p_unit
+         * \param unit sim unit of the buffer
+         * \param strSpecies unique short hand name of the species
+         * \param filenameExtension extension of the openPMD file name (text after the last dot)
+         * \param jsonConfig configuration string handed to the openPMD::Series constructor
+         * \param currentStep current time step
+         * \param mpiComm communicator of the participating ranks
+         */
+        template<typename T_Type, int T_bufDim>
+        void operator()(
+            const pmacc::container::HostBuffer<T_Type, T_bufDim>& hBuffer,
+            const AxisDescription axis_element,
+            const std::pair<float_X, float_X> axis_p_range,
+            const float_64 pRange_unit,
+            const float_64 unit,
+            const std::string strSpecies,
+            const std::string filenameExtension,
+            const std::string jsonConfig,
+            const uint32_t currentStep,
+            MPI_Comm mpiComm) const
+        {
+            using Type = T_Type;
+
+            /** file name *****************************************************
+             *    phaseSpace/PhaseSpace_<species>_<x|y|z>p<x|y|z>_%T.<ext>    */
+            std::string fCoords("xyz");
+            std::ostringstream openPMDFilename;
+            openPMDFilename << "phaseSpace/PhaseSpace_" << strSpecies << "_" << fCoords.at(axis_element.space) << "p"
+                            << fCoords.at(axis_element.momentum) << "_%T." << filenameExtension;
+
+            /** get size of the fileWriter communicator ***********************/
+            int size;
+            MPI_CHECK(MPI_Comm_size(mpiComm, &size));
+
+            /** create parallel domain collector ******************************/
+            ::openPMD::Series series(openPMDFilename.str(), ::openPMD::Access::CREATE, mpiComm, jsonConfig);
+            ::openPMD::Iteration iteration = series.iterations[currentStep];
+
+            const std::string software("PIConGPU");
+
+            std::stringstream softwareVersion;
+            softwareVersion << PICONGPU_VERSION_MAJOR << "." << PICONGPU_VERSION_MINOR << "."
+                            << PICONGPU_VERSION_PATCH;
+            if(!std::string(PICONGPU_VERSION_LABEL).empty())
+                softwareVersion << "-" << PICONGPU_VERSION_LABEL;
+            series.setSoftware(software, softwareVersion.str());
+
+            pmacc::GridController<simDim>& gc = pmacc::Environment<simDim>::get().GridController();
+
+            /** calculate GUARD offset in the source hBuffer *****************/
+            const uint32_t rGuardCells
+                = SuperCellSize().toRT()[axis_element.space] * GuardSize::toRT()[axis_element.space];
+
+            /** calculate local and global size of the phase space ***********/
+            const uint32_t numSlides = MovingWindow::getInstance().getSlideCounter(currentStep);
+            const SubGrid<simDim>& subGrid = Environment<simDim>::get().SubGrid();
+            const std::uint64_t rLocalOffset = subGrid.getLocalDomain().offset[axis_element.space];
+            const std::uint64_t rLocalSize = int(hBuffer.size().y() - 2 * rGuardCells);
+            const std::uint64_t rGlobalSize = subGrid.getGlobalDomain().size[axis_element.space];
+            PMACC_VERIFY(int(rLocalSize) == subGrid.getLocalDomain().size[axis_element.space]);
+
+            /* globalDomain of the phase space */
+            ::openPMD::Extent globalPhaseSpace_extent{rGlobalSize, hBuffer.size().x()};
+
+            /* global moving window meta information */
+            ::openPMD::Offset globalPhaseSpace_offset{0, 0};
+            std::uint64_t globalMovingWindowOffset = 0;
+            std::uint64_t globalMovingWindowSize = rGlobalSize;
+            if(axis_element.space == AxisDescription::y) /* spatial axis == y */
+            {
+                globalPhaseSpace_offset[0] = numSlides * rLocalSize;
+                Window window = MovingWindow::getInstance().getWindow(currentStep);
+                globalMovingWindowOffset = window.globalDimensions.offset[axis_element.space];
+                globalMovingWindowSize = window.globalDimensions.size[axis_element.space];
+            }
+
+            /* localDomain: offset of it in the globalDomain and size */
+            ::openPMD::Offset localPhaseSpace_offset{rLocalOffset, 0};
+            ::openPMD::Extent localPhaseSpace_extent{rLocalSize, hBuffer.size().x()};
+
+            /** Dataset Name **************************************************/
+            std::ostringstream dataSetName;
+            /* xpx or ypz or ... */
+            dataSetName << strSpecies << "_" << fCoords.at(axis_element.space) << "p"
+                        << fCoords.at(axis_element.momentum);
+
+            /** debug log *****************************************************/
+            int rank;
+            MPI_CHECK(MPI_Comm_rank(mpiComm, &rank));
+            {
+                std::stringstream offsetAsString, localExtentAsString, globalExtentAsString;
+                offsetAsString << "[" << localPhaseSpace_offset[0] << ", " << localPhaseSpace_offset[1] << "]";
+                localExtentAsString << "[" << localPhaseSpace_extent[0] << ", " << localPhaseSpace_extent[1] << "]";
+                globalExtentAsString << "[" << globalPhaseSpace_extent[0] << ", " << globalPhaseSpace_extent[1] << "]";
+                log<picLog::INPUT_OUTPUT>(
+                    "Dump buffer %1% to %2% at offset %3% with size %4% for total size %5% for rank %6% / %7%")
+                    % (*(hBuffer.origin()(0, rGuardCells))) % dataSetName.str() % offsetAsString.str()
+                    % localExtentAsString.str() % globalExtentAsString.str() % rank % size;
+            }
+
+            /** write local domain ********************************************/
+
+            ::openPMD::Mesh mesh = iteration.meshes[dataSetName.str()];
+            ::openPMD::MeshRecordComponent dataset = mesh[::openPMD::RecordComponent::SCALAR];
+
+            dataset.resetDataset({::openPMD::determineDatatype<Type>(), globalPhaseSpace_extent});
+            // non-owning pointer: the no-op deleter leaves the memory with hBuffer
+            std::shared_ptr<Type> data(&(*hBuffer.origin()(0, rGuardCells)), [](auto const&) {});
+            dataset.storeChunk<Type>(data, localPhaseSpace_offset, localPhaseSpace_extent);
+
+            /** meta attributes for the data set: unit, range, moving window **/
+
+            pmacc::Selection<simDim> globalDomain = subGrid.getGlobalDomain();
+            pmacc::Selection<simDim> totalDomain = subGrid.getTotalDomain();
+            // convert things to std::vector<> for the openPMD API to enjoy
+            std::vector<int> globalDomainSize{&globalDomain.size[0], &globalDomain.size[0] + simDim};
+            std::vector<int> globalDomainOffset{&globalDomain.offset[0], &globalDomain.offset[0] + simDim};
+            std::vector<int> totalDomainSize{&totalDomain.size[0], &totalDomain.size[0] + simDim};
+            std::vector<int> totalDomainOffset{&totalDomain.offset[0], &totalDomain.offset[0] + simDim};
+            std::vector<std::string> globalDomainAxisLabels;
+            if(simDim == DIM2)
+            {
+                globalDomainAxisLabels = {"y", "x"}; // 2D: F[y][x]
+            }
+            if(simDim == DIM3)
+            {
+                globalDomainAxisLabels = {"z", "y", "x"}; // 3D: F[z][y][x]
+            }
+
+            float_X const dr = cellSize[axis_element.space];
+
+            mesh.setAttribute("globalDomainSize", globalDomainSize);
+            mesh.setAttribute("globalDomainOffset", globalDomainOffset);
+            mesh.setAttribute("totalDomainSize", totalDomainSize);
+            mesh.setAttribute("totalDomainOffset", totalDomainOffset);
+            mesh.setAttribute("globalDomainAxisLabels", globalDomainAxisLabels);
+            mesh.setAttribute("totalDomainAxisLabels", globalDomainAxisLabels);
+            mesh.setAttribute("_global_start", globalPhaseSpace_offset);
+            mesh.setAttribute("_global_size", globalPhaseSpace_extent);
+            mesh.setAxisLabels({axis_element.spaceAsString(), axis_element.momentumAsString()});
+            mesh.setAttribute("sim_unit", unit);
+            dataset.setUnitSI(unit);
+            {
+                using UD = ::openPMD::UnitDimension;
+                mesh.setUnitDimension({{UD::I, 1.0}, {UD::T, 1.0}, {UD::L, -1.0}}); // charge density
+            }
+            mesh.setAttribute("p_unit", pRange_unit);
+            mesh.setAttribute("p_min", axis_p_range.first);
+            mesh.setAttribute("p_max", axis_p_range.second);
+            mesh.setGridGlobalOffset({globalMovingWindowOffset * dr, axis_p_range.first});
+            mesh.setAttribute("movingWindowOffset", globalMovingWindowOffset);
+            mesh.setAttribute("movingWindowSize", globalMovingWindowSize);
+            mesh.setAttribute("dr", dr);
+            mesh.setAttribute("dV", CELL_VOLUME);
+            mesh.setGridSpacing(std::vector<float_X>{dr, CELL_VOLUME / dr});
+            mesh.setAttribute("dr_unit", UNIT_LENGTH);
+            iteration.setDt(DELTA_T);
+            iteration.setTimeUnitSI(UNIT_TIME);
+            /*
+             * The value represents an aggregation over one cell, so any value is correct for the mesh position.
+             * Just use the center.
+             */
+            dataset.setPosition(std::vector<float>{0.5, 0.5});
+
+            // avoid deadlock between not finished pmacc tasks and mpi calls in openPMD
+            __getTransactionEvent().waitForFinished();
+
+            /** close file ****************************************************/
+            iteration.close();
+        }
+    };
+
+} /* namespace picongpu */
diff --git a/include/picongpu/plugins/PhaseSpace/DumpHBufferSplashP.hpp b/include/picongpu/plugins/PhaseSpace/DumpHBufferSplashP.hpp
deleted file mode 100644
index fde3952c74..0000000000
--- a/include/picongpu/plugins/PhaseSpace/DumpHBufferSplashP.hpp
+++ /dev/null
@@ -1,215 +0,0 @@
-/* Copyright 2013-2020 Axel Huebl, Rene Widera
- *
- * This file is part of PIConGPU.
- *
- * PIConGPU is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * PIConGPU is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with PIConGPU.
- * If not, see <http://www.gnu.org/licenses/>.
- */
-
-#pragma once
-
-#include "picongpu/simulation_defines.hpp"
-
-#include "picongpu/traits/SplashToPIC.hpp"
-#include "picongpu/traits/PICToSplash.hpp"
-
-#include "picongpu/plugins/PhaseSpace/AxisDescription.hpp"
-#include <pmacc/communication/manager_common.hpp>
-#include <pmacc/mappings/simulation/GridController.hpp>
-#include <pmacc/mappings/simulation/SubGrid.hpp>
-#include <pmacc/dimensions/DataSpace.hpp>
-#include <pmacc/cuSTL/container/HostBuffer.hpp>
-#include <pmacc/math/vector/Int.hpp>
-#include <pmacc/verify.hpp>
-
-#include <string>
-#include <fstream>
-#include <sstream>
-#include <utility>
-#include <mpi.h>
-#include <splash/splash.h>
-
-namespace picongpu
-{
-    class DumpHBuffer
-    {
-    private:
-       typedef typename MappingDesc::SuperCellSize SuperCellSize;
-
-    public:
-        /** Dump the PhaseSpace host Buffer
-         *
-         * \tparam Type the HBuffers element type
-         * \tparam int the HBuffers dimension
-         * \param hBuffer const reference to the hBuffer, including guard cells in spatial dimension
-         * \param axis_element plot to create: e.g. py, x from momentum/spatial-coordinate
-         * \param unit sim unit of the buffer
-         * \param strSpecies unique short hand name of the species
-         * \param currentStep current time step
-         * \param mpiComm communicator of the participating ranks
-         */
-        template<typename T_Type, int T_bufDim>
-        void operator()( const pmacc::container::HostBuffer<T_Type, T_bufDim>& hBuffer,
-                         const AxisDescription axis_element,
-                         const std::pair<float_X, float_X> axis_p_range,
-                         const float_64 pRange_unit,
-                         const float_64 unit,
-                         const std::string strSpecies,
-                         const uint32_t currentStep,
-                         MPI_Comm mpiComm ) const
-        {
-            using namespace splash;
-            typedef T_Type Type;
-            const int bufDim = T_bufDim;
-
-            /** file name *****************************************************
-             *    phaseSpace/PhaseSpace_xpy_timestep.h5                       */
-            std::string fCoords("xyz");
-            std::ostringstream filename;
-            filename << "phaseSpace/PhaseSpace_"
-                     << strSpecies << "_"
-                     << fCoords.at(axis_element.space)
-                     << "p" << fCoords.at(axis_element.momentum);
-
-            /** get size of the fileWriter communicator ***********************/
-            int size;
-            MPI_CHECK(MPI_Comm_size( mpiComm, &size ));
-
-            /** create parallel domain collector ******************************/
-            ParallelDomainCollector pdc(
-                mpiComm, MPI_INFO_NULL, Dimensions(size, 1, 1), 10 );
-
-            pmacc::GridController<simDim>& gc =
-                pmacc::Environment<simDim>::get().GridController();
-            DataCollector::FileCreationAttr fAttr;
-            Dimensions mpiPosition( gc.getPosition()[axis_element.space], 0, 0 );
-            fAttr.mpiPosition.set( mpiPosition );
-
-            DataCollector::initFileCreationAttr(fAttr);
-
-            pdc.open( filename.str().c_str(), fAttr );
-
-            /** calculate GUARD offset in the source hBuffer *****************/
-            const uint32_t rGuardCells =
-                SuperCellSize().toRT()[axis_element.space] * GuardSize::toRT()[axis_element.space];
-
-            /** calculate local and global size of the phase space ***********/
-            const uint32_t numSlides = MovingWindow::getInstance().getSlideCounter(currentStep);
-            const SubGrid<simDim>& subGrid = Environment<simDim>::get().SubGrid();
-            const int rLocalOffset = subGrid.getLocalDomain().offset[axis_element.space];
-            const int rLocalSize = int(hBuffer.size().y() - 2*rGuardCells);
-            const int rGlobalSize = subGrid.getGlobalDomain().size[axis_element.space];
-            PMACC_VERIFY( rLocalSize == subGrid.getLocalDomain().size[axis_element.space] );
-
-            /* globalDomain of the phase space */
-            splash::Dimensions globalPhaseSpace_size( hBuffer.size().x(),
-                                                      rGlobalSize,
-                                                      1 );
-
-            /* global moving window meta information */
-            splash::Dimensions globalPhaseSpace_offset( 0, 0, 0 );
-            int globalMovingWindowOffset = 0;
-            int globalMovingWindowSize   = rGlobalSize;
-            if( axis_element.space == AxisDescription::y ) /* spatial axis == y */
-            {
-                globalPhaseSpace_offset.set( 0, numSlides * rLocalSize, 0 );
-                Window window = MovingWindow::getInstance( ).getWindow( currentStep );
-                globalMovingWindowOffset = window.globalDimensions.offset[axis_element.space];
-                globalMovingWindowSize = window.globalDimensions.size[axis_element.space];
-            }
-
-            /* localDomain: offset of it in the globalDomain and size */
-            splash::Dimensions localPhaseSpace_offset( 0, rLocalOffset, 0 );
-            splash::Dimensions localPhaseSpace_size( hBuffer.size().x(),
-                                                     rLocalSize,
-                                                     1 );
-
-            /** Dataset Name **************************************************/
-            std::ostringstream dataSetName;
-            /* xpx or ypz or ... */
-            dataSetName << fCoords.at(axis_element.space)
-                        << "p" << fCoords.at(axis_element.momentum);
-
-            /** debug log *****************************************************/
-            int rank;
-            MPI_CHECK(MPI_Comm_rank( mpiComm, &rank ));
-            log<picLog::INPUT_OUTPUT > ("Dump buffer %1% to %2% at offset %3% with size %4% for total size %5% for rank %6% / %7%")
-                % ( *(hBuffer.origin()(0,rGuardCells)) ) % dataSetName.str() % localPhaseSpace_offset.toString()
-                % localPhaseSpace_size.toString() % globalPhaseSpace_size.toString()
-                % rank % size;
-
-            /** write local domain ********************************************/
-            typename PICToSplash<Type>::type ctPhaseSpace;
-
-            // avoid deadlock between not finished pmacc tasks and mpi calls in HDF5
-            __getTransactionEvent().waitForFinished();
-
-            pdc.writeDomain( currentStep,
-                             /* global domain and my local offset within it */
-                             globalPhaseSpace_size,
-                             localPhaseSpace_offset,
-                             /* */
-                             ctPhaseSpace,
-                             bufDim,
-                             /* local data set dimensions */
-                             splash::Selection(localPhaseSpace_size),
-                             /* data set name */
-                             dataSetName.str().c_str(),
-                             /* global domain */
-                             splash::Domain(
-                                    globalPhaseSpace_offset,
-                                    globalPhaseSpace_size
-                             ),
-                             /* dataClass, buffer */
-                             DomainCollector::GridType,
-                             &(*hBuffer.origin()(0,rGuardCells)) );
-
-            /** meta attributes for the data set: unit, range, moving window **/
-            typedef PICToSplash<float_X>::type  SplashFloatXType;
-            typedef PICToSplash<float_64>::type SplashFloat64Type;
-            ColTypeInt ctInt;
-            SplashFloat64Type ctFloat64;
-            SplashFloatXType  ctFloatX;
-
-            pdc.writeAttribute( currentStep, ctFloat64, dataSetName.str().c_str(),
-                                "sim_unit", &unit );
-            pdc.writeAttribute( currentStep, ctFloat64, dataSetName.str().c_str(),
-                                "p_unit", &pRange_unit );
-            pdc.writeAttribute( currentStep, ctFloatX, dataSetName.str().c_str(),
-                                "p_min", &(axis_p_range.first) );
-            pdc.writeAttribute( currentStep, ctFloatX, dataSetName.str().c_str(),
-                                "p_max", &(axis_p_range.second) );
-            pdc.writeAttribute( currentStep, ctInt, dataSetName.str().c_str(),
-                                "movingWindowOffset", &globalMovingWindowOffset );
-            pdc.writeAttribute( currentStep, ctInt, dataSetName.str().c_str(),
-                                "movingWindowSize", &globalMovingWindowSize );
-
-            pdc.writeAttribute( currentStep, ctFloatX, dataSetName.str().c_str(),
-                                "dr", &(cellSize[axis_element.space]) );
-            pdc.writeAttribute( currentStep, ctFloatX, dataSetName.str().c_str(),
-                                "dV", &CELL_VOLUME );
-            pdc.writeAttribute( currentStep, ctFloat64, dataSetName.str().c_str(),
-                                "dr_unit", &UNIT_LENGTH );
-            pdc.writeAttribute( currentStep, ctFloatX, dataSetName.str().c_str(),
-                                "dt", &DELTA_T );
-            pdc.writeAttribute( currentStep, ctFloat64, dataSetName.str().c_str(),
-                                "dt_unit", &UNIT_TIME );
-
-            /** close file ****************************************************/
-            pdc.finalize();
-            pdc.close();
-        }
-    };
-
-} /* namespace picongpu */
diff --git a/include/picongpu/plugins/PhaseSpace/PhaseSpace.hpp b/include/picongpu/plugins/PhaseSpace/PhaseSpace.hpp
index 8f4cb6354c..438e71a5a3 100644
--- a/include/picongpu/plugins/PhaseSpace/PhaseSpace.hpp
+++ b/include/picongpu/plugins/PhaseSpace/PhaseSpace.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Heiko Burau, Rene Widera
+/* Copyright 2013-2021 Axel Huebl, Heiko Burau, Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -61,152 +61,101 @@ namespace picongpu
 
         struct Help : public plugins::multi::IHelp
         {
-
             /** creates an instance of ISlave
              *
              * @tparam T_Slave type of the interface implementation (must inherit from ISlave)
              * @param help plugin defined help
              * @param id index of the plugin, range: [0;help->getNumPlugins())
              */
-            std::shared_ptr< ISlave > create(
-                std::shared_ptr< IHelp > & help,
-                size_t const id,
-                MappingDesc* cellDescription
-            )
+            std::shared_ptr<ISlave> create(std::shared_ptr<IHelp>& help, size_t const id, MappingDesc* cellDescription)
             {
-                return std::shared_ptr< ISlave >(
-                    new PhaseSpace<
-                        T_AssignmentFunction,
-                        Species
-                    >(
-                        help,
-                        id,
-                        cellDescription
-                    )
-                );
+                return std::shared_ptr<ISlave>(
+                    new PhaseSpace<T_AssignmentFunction, Species>(help, id, cellDescription));
             }
 
             // find all valid filter for the current used species
-            using EligibleFilters = typename MakeSeqFromNestedSeq<
-                typename bmpl::transform<
-                    particles::filter::AllParticleFilters,
-                    particles::traits::GenerateSolversIfSpeciesEligible<
-                        bmpl::_1,
-                        Species
-                    >
-                >::type
-            >::type;
+            using EligibleFilters = typename MakeSeqFromNestedSeq<typename bmpl::transform<
+                particles::filter::AllParticleFilters,
+                particles::traits::GenerateSolversIfSpeciesEligible<bmpl::_1, Species>>::type>::type;
 
             //! periodicity of computing the particle energy
-            plugins::multi::Option< std::string > notifyPeriod = {
-                "period",
-                "notify period"
-            };
-            plugins::multi::Option< std::string > filter = {
-                "filter",
-                "particle filter: "
-            };
+            plugins::multi::Option<std::string> notifyPeriod = {"period", "notify period"};
+            plugins::multi::Option<std::string> filter = {"filter", "particle filter: "};
 
-            plugins::multi::Option< std::string > element_space = {
-                "space",
-                "spatial component (x, y, z)"
-            };
-            plugins::multi::Option< std::string > element_momentum = {
-                "momentum",
-                "momentum component (px, py, pz)"
-            };
-            plugins::multi::Option< float_X > momentum_range_min = {
-                "min",
-                "min range momentum [m_species c]"
-            };
-            plugins::multi::Option< float_X > momentum_range_max = {
-                "max",
-                "max range momentum [m_species c]"
-            };
+            plugins::multi::Option<std::string> element_space = {"space", "spatial component (x, y, z)"};
+            plugins::multi::Option<std::string> element_momentum = {"momentum", "momentum component (px, py, pz)"};
+            plugins::multi::Option<float_X> momentum_range_min = {"min", "min range momentum [m_species c]"};
+            plugins::multi::Option<float_X> momentum_range_max = {"max", "max range momentum [m_species c]"};
+
+            /*
+             * openPMD file name extension option. Set to h5 for now at least, to make for
+             * easier comparison of output with old output. Adjacent literals are concatenated.
+             */
+            plugins::multi::Option<std::string> file_name_extension
+                = {"ext",
+                   "openPMD filename extension (this controls the "
+                   "backend picked by the openPMD API)",
+                   "h5"};
+
+            plugins::multi::Option<std::string> json_config
+                = {"json", "advanced (backend) configuration for openPMD in JSON format", "{}"};
 
             //! string list with all possible particle filters
             std::string concatenatedFilterNames;
-            std::vector< std::string > allowedFilters;
+            std::vector<std::string> allowedFilters;
 
             ///! method used by plugin controller to get --help description
             void registerHelp(
-                boost::program_options::options_description & desc,
-                std::string const & masterPrefix = std::string{ }
-            )
+                boost::program_options::options_description& desc,
+                std::string const& masterPrefix = std::string{})
             {
+                meta::ForEach<EligibleFilters, plugins::misc::AppendName<bmpl::_1>> getEligibleFilterNames;
+                getEligibleFilterNames(allowedFilters);
+
+                concatenatedFilterNames = plugins::misc::concatenateToString(allowedFilters, ", ");
+
+                notifyPeriod.registerHelp(desc, masterPrefix + prefix);
+                filter.registerHelp(desc, masterPrefix + prefix, std::string("[") + concatenatedFilterNames + "]");
 
-                meta::ForEach<
-                    EligibleFilters,
-                    plugins::misc::AppendName< bmpl::_1 >
-                > getEligibleFilterNames;
-                getEligibleFilterNames( allowedFilters );
-
-                concatenatedFilterNames = plugins::misc::concatenateToString(
-                    allowedFilters,
-                    ", "
-                );
-
-                notifyPeriod.registerHelp(
-                    desc,
-                    masterPrefix + prefix
-                );
-                filter.registerHelp(
-                    desc,
-                    masterPrefix + prefix,
-                    std::string( "[" ) + concatenatedFilterNames + "]"
-                );
-
-                element_space.registerHelp(
-                    desc,
-                    masterPrefix + prefix
-                );
-                element_momentum.registerHelp(
-                    desc,
-                    masterPrefix + prefix
-                );
-                momentum_range_min.registerHelp(
-                    desc,
-                    masterPrefix + prefix
-                );
-                momentum_range_max.registerHelp(
-                    desc,
-                    masterPrefix + prefix
-                );
+                element_space.registerHelp(desc, masterPrefix + prefix);
+                element_momentum.registerHelp(desc, masterPrefix + prefix);
+                momentum_range_min.registerHelp(desc, masterPrefix + prefix);
+                momentum_range_max.registerHelp(desc, masterPrefix + prefix);
+                file_name_extension.registerHelp(desc, masterPrefix + prefix);
+                json_config.registerHelp(desc, masterPrefix + prefix);
             }
 
             void expandHelp(
-                boost::program_options::options_description & desc,
-                std::string const & masterPrefix = std::string{ }
-            )
+                boost::program_options::options_description& desc,
+                std::string const& masterPrefix = std::string{})
             {
             }
 
 
             void validateOptions()
             {
-                if( notifyPeriod.size() != filter.size() )
-                    throw std::runtime_error( name + ": parameter filter and period are not used the same number of times" );
-                if( notifyPeriod.size() != element_space.size() )
-                    throw std::runtime_error( name + ": parameter space and period are not used the same number of times" );
-                if( notifyPeriod.size() != element_momentum.size() )
-                    throw std::runtime_error( name + ": parameter momentum and period are not used the same number of times" );
-                if( notifyPeriod.size() != momentum_range_min.size() )
-                    throw std::runtime_error( name + ": parameter min and period are not used the same number of times" );
-                if( notifyPeriod.size() != momentum_range_max.size() )
-                    throw std::runtime_error( name + ": parameter max and period are not used the same number of times" );
+                if(notifyPeriod.size() != filter.size())
+                    throw std::runtime_error(
+                        name + ": parameter filter and period are not used the same number of times");
+                if(notifyPeriod.size() != element_space.size())
+                    throw std::runtime_error(
+                        name + ": parameter space and period are not used the same number of times");
+                if(notifyPeriod.size() != element_momentum.size())
+                    throw std::runtime_error(
+                        name + ": parameter momentum and period are not used the same number of times");
+                if(notifyPeriod.size() != momentum_range_min.size())
+                    throw std::runtime_error(
+                        name + ": parameter min and period are not used the same number of times");
+                if(notifyPeriod.size() != momentum_range_max.size())
+                    throw std::runtime_error(
+                        name + ": parameter max and period are not used the same number of times");
 
                 // check if user passed filter name are valid
-                for( auto const & filterName : filter)
+                for(auto const& filterName : filter)
                 {
-                    if(
-                        std::find(
-                            allowedFilters.begin(),
-                            allowedFilters.end(),
-                            filterName
-                        ) == allowedFilters.end()
-                    )
+                    if(std::find(allowedFilters.begin(), allowedFilters.end(), filterName) == allowedFilters.end())
                     {
-                        throw std::runtime_error( name + ": unknown filter '" + filterName + "'" );
+                        throw std::runtime_error(name + ": unknown filter '" + filterName + "'");
                     }
                 }
             }
@@ -235,13 +184,12 @@ namespace picongpu
             //! short description of the plugin
             std::string const description = "create phase space of a species";
             //! prefix used for command line arguments
-            std::string const prefix = Species::FrameType::getName( ) + std::string( "_phaseSpace" );
+            std::string const prefix = Species::FrameType::getName() + std::string("_phaseSpace");
         };
 
 
     private:
-
-        MappingDesc *m_cellDescription = nullptr;
+        MappingDesc* m_cellDescription = nullptr;
 
         /** plot to create: e.g. py, x from element_coordinate/momentum */
         AxisDescription axis_element;
@@ -249,7 +197,7 @@ namespace picongpu
         std::pair<float_X, float_X> axis_p_range;
         uint32_t r_bins;
 
-        std::shared_ptr< Help > m_help;
+        std::shared_ptr<Help> m_help;
         size_t m_id;
 
         typedef float_32 float_PS;
@@ -257,13 +205,16 @@ namespace picongpu
          *  we use not more than 32KB shared memory
          *  Note: checking the longest edge for all phase space configurations
          *        is a conservative work around until #469 is implemented */
-        typedef typename bmpl::accumulate<
-            typename SuperCellSize::mplVector,
-            bmpl::int_<0>,
-            bmpl::max<bmpl::_1, bmpl::_2>
-            >::type SuperCellsLongestEdge;
-        static constexpr uint32_t maxShared = 32*1024; /* 32 KB */
-        static constexpr uint32_t num_pbins = maxShared/(sizeof(float_PS)*SuperCellsLongestEdge::value);
+        typedef typename bmpl::
+            accumulate<typename SuperCellSize::mplVector, bmpl::int_<0>, bmpl::max<bmpl::_1, bmpl::_2>>::type
+                SuperCellsLongestEdge;
+        /* Note: the previously used 32 KB shared memory size is not correct
+         * for CPUs, as discovered in #3329. As a quick patch, slightly reduce
+         * it so that the buffer plus a few small shared memory variables
+         * together fit 30 KB as set by default on CPUs. So set to 30 000 bytes.
+         */
+        static constexpr uint32_t maxShared = 30000;
+        static constexpr uint32_t num_pbins = maxShared / (sizeof(float_PS) * SuperCellsLongestEdge::value);
 
         container::DeviceBuffer<float_PS, 2>* dBuffer = nullptr;
 
@@ -274,7 +225,7 @@ namespace picongpu
          */
         MPI_Comm commFileWriter = MPI_COMM_NULL;
 
-        template< uint32_t r_dir >
+        template<uint32_t r_dir>
 
         struct StartBlockFunctor
         {
@@ -289,139 +240,73 @@ namespace picongpu
                 const TParticlesBox& pb,
                 cursor::BufferCursor<float_PS, 2> cur,
                 const uint32_t p_dir,
-                const std::pair<float_X, float_X>& p_range
-            ) :
-                particlesBox(pb), curOriginPhaseSpace(cur), p_element(p_dir),
-                axis_p_range(p_range)
-            {}
-
-            template<
-                typename T_Filter,
-                typename T_Zone,
-                typename ... T_Args
-            >
-            void operator()(
-                T_Filter const & filter,
-                T_Zone const & zone,
-                T_Args && ... args
-            ) const
+                const std::pair<float_X, float_X>& p_range)
+                : particlesBox(pb)
+                , curOriginPhaseSpace(cur)
+                , p_element(p_dir)
+                , axis_p_range(p_range)
+            {
+            }
+
+            template<typename T_Filter, typename T_Zone, typename... T_Args>
+            void operator()(T_Filter const& filter, T_Zone const& zone, T_Args&&... args) const
             {
-                constexpr uint32_t numWorkers = pmacc::traits::GetNumWorkers<
-                    pmacc::math::CT::volume< SuperCellSize >::type::value
-                >::value;
-                algorithm::kernel::ForeachLockstep<
-                    numWorkers,
-                    SuperCellSize
-                > forEachSuperCell;
-
-                FunctorBlock<
-                    Species,
-                    SuperCellSize,
-                    float_PS,
-                    num_pbins,
-                    r_dir,
-                    T_Filter,
-                    numWorkers
-                > functorBlock(
-                    particlesBox,
-                    curOriginPhaseSpace,
-                    p_element,
-                    axis_p_range,
-                    filter
-                );
-
-                forEachSuperCell(
-                    zone,
-                    functorBlock,
-                    args ...
-                );
+                constexpr uint32_t numWorkers
+                    = pmacc::traits::GetNumWorkers<pmacc::math::CT::volume<SuperCellSize>::type::value>::value;
+                algorithm::kernel::ForeachLockstep<numWorkers, SuperCellSize> forEachSuperCell;
+
+                FunctorBlock<Species, SuperCellSize, float_PS, num_pbins, r_dir, T_Filter, numWorkers>
+                    functorBlock(particlesBox, curOriginPhaseSpace, p_element, axis_p_range, filter);
+
+                forEachSuperCell(zone, functorBlock, args...);
             }
         };
 
     public:
-
         //! must be implemented by the user
-        static std::shared_ptr< plugins::multi::IHelp > getHelp()
+        static std::shared_ptr<plugins::multi::IHelp> getHelp()
         {
-            return std::shared_ptr< plugins::multi::IHelp >( new Help{ } );
+            return std::shared_ptr<plugins::multi::IHelp>(new Help{});
         }
 
-        PhaseSpace(
-            std::shared_ptr< plugins::multi::IHelp > & help,
-            size_t const id,
-            MappingDesc* cellDescription
-        );
+        PhaseSpace(std::shared_ptr<plugins::multi::IHelp>& help, size_t const id, MappingDesc* cellDescription);
         virtual ~PhaseSpace();
 
-        void notify( uint32_t currentStep );
+        void notify(uint32_t currentStep);
 
-        void restart(
-            uint32_t restartStep,
-            std::string const & restartDirectory
-        )
+        void restart(uint32_t restartStep, std::string const& restartDirectory)
         {
-
         }
 
-        void checkpoint(
-            uint32_t currentStep,
-            std::string const & checkpointDirectory
-        )
+        void checkpoint(uint32_t currentStep, std::string const& checkpointDirectory)
         {
-
         }
 
         template<uint32_t Direction>
-        void calcPhaseSpace( const uint32_t currentStep );
+        void calcPhaseSpace(const uint32_t currentStep);
     };
 
-namespace particles
-{
-namespace traits
-{
-    template<
-        typename T_Species,
-        typename T_AssignmentFunction,
-        typename T_UnspecifiedSpecies
-    >
-    struct SpeciesEligibleForSolver<
-        T_Species,
-        PhaseSpace<
-            T_AssignmentFunction,
-            T_UnspecifiedSpecies
-        >
-    >
+    namespace particles
     {
-        using FrameType = typename T_Species::FrameType;
-
-        using RequiredIdentifiers = MakeSeq_t<
-            weighting,
-            position<>,
-            momentum
-        >;
-
-        using SpeciesHasIdentifiers = typename pmacc::traits::HasIdentifiers<
-            FrameType,
-            RequiredIdentifiers
-        >::type;
-
-        using SpeciesHasMass = typename pmacc::traits::HasFlag<
-            FrameType,
-            massRatio<>
-        >::type;
-        using SpeciesHasCharge = typename pmacc::traits::HasFlag<
-            FrameType,
-            chargeRatio<>
-        >::type;
-
-        using type = typename bmpl::and_<
-            SpeciesHasIdentifiers,
-            SpeciesHasMass,
-            SpeciesHasCharge
-        >;
-    };
-} // namespace traits
-} // namespace particles
+        namespace traits
+        {
+            template<typename T_Species, typename T_AssignmentFunction, typename T_UnspecifiedSpecies>
+            struct SpeciesEligibleForSolver<T_Species, PhaseSpace<T_AssignmentFunction, T_UnspecifiedSpecies>>
+            {
+                using FrameType = typename T_Species::FrameType;
+
+                using RequiredIdentifiers = MakeSeq_t<weighting, position<>, momentum>;
+
+                using SpeciesHasIdentifiers =
+                    typename pmacc::traits::HasIdentifiers<FrameType, RequiredIdentifiers>::type;
+
+                using SpeciesHasMass = typename pmacc::traits::HasFlag<FrameType, massRatio<>>::type;
+                using SpeciesHasCharge = typename pmacc::traits::HasFlag<FrameType, chargeRatio<>>::type;
+
+                using type = typename bmpl::and_<SpeciesHasIdentifiers, SpeciesHasMass, SpeciesHasCharge>;
+            };
+        } // namespace traits
+    } // namespace particles
 } // namespace picongpu
 
 #include "PhaseSpace.tpp"
diff --git a/include/picongpu/plugins/PhaseSpace/PhaseSpace.tpp b/include/picongpu/plugins/PhaseSpace/PhaseSpace.tpp
index faa5d69306..a92f5779d9 100644
--- a/include/picongpu/plugins/PhaseSpace/PhaseSpace.tpp
+++ b/include/picongpu/plugins/PhaseSpace/PhaseSpace.tpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Heiko Burau, Rene Widera, Marco Garten
+/* Copyright 2013-2021 Axel Huebl, Heiko Burau, Rene Widera, Marco Garten
  *
  * This file is part of PIConGPU.
  *
@@ -20,7 +20,7 @@
 #pragma once
 
 #include "PhaseSpace.hpp"
-#include "DumpHBufferSplashP.hpp"
+#include "DumpHBufferOpenPMD.hpp"
 
 #include <pmacc/cuSTL/container/DeviceBuffer.hpp>
 #include <pmacc/cuSTL/cursor/MultiIndexCursor.hpp>
@@ -43,58 +43,52 @@ namespace picongpu
 {
     template<class AssignmentFunction, class Species>
     PhaseSpace<AssignmentFunction, Species>::PhaseSpace(
-        std::shared_ptr< plugins::multi::IHelp > & help,
+        std::shared_ptr<plugins::multi::IHelp>& help,
         size_t const id,
-        MappingDesc* cellDescription
-    ) :
-        m_help( std::static_pointer_cast< Help >(help) ),
-        m_id( id ),
-        m_cellDescription( cellDescription )
+        MappingDesc* cellDescription)
+        : m_help(std::static_pointer_cast<Help>(help))
+        , m_id(id)
+        , m_cellDescription(cellDescription)
     {
         // unit is m_species c (for a single "real" particle)
-        float_X pRangeSingle_unit(
-            frame::getMass< typename Species::FrameType >() *
-            SPEED_OF_LIGHT
-        );
+        float_X pRangeSingle_unit(frame::getMass<typename Species::FrameType>() * SPEED_OF_LIGHT);
 
-        axis_p_range.first = m_help->momentum_range_min.get( id ) * pRangeSingle_unit;
-        axis_p_range.second = m_help->momentum_range_max.get( id ) * pRangeSingle_unit ;
+        axis_p_range.first = m_help->momentum_range_min.get(id) * pRangeSingle_unit;
+        axis_p_range.second = m_help->momentum_range_max.get(id) * pRangeSingle_unit;
         /* String to Enum conversion */
         uint32_t el_space;
-        if( m_help->element_space.get( id ) == "x" )
-           el_space = AxisDescription::x;
-        else if( m_help->element_space.get( id ) == "y" )
-           el_space = AxisDescription::y;
-        else if( m_help->element_space.get( id ) == "z" )
-           el_space = AxisDescription::z;
+        if(m_help->element_space.get(id) == "x")
+            el_space = AxisDescription::x;
+        else if(m_help->element_space.get(id) == "y")
+            el_space = AxisDescription::y;
+        else if(m_help->element_space.get(id) == "z")
+            el_space = AxisDescription::z;
         else
-           throw PluginException("[Plugin] [" + m_help->getOptionPrefix() + "] space must be x, y or z" );
+            throw PluginException("[Plugin] [" + m_help->getOptionPrefix() + "] space must be x, y or z");
 
         uint32_t el_momentum = AxisDescription::px;
-        if( m_help->element_momentum.get( id ) == "px" )
-           el_momentum = AxisDescription::px;
-        else if( m_help->element_momentum.get( id ) == "py" )
-           el_momentum = AxisDescription::py;
-        else if( m_help->element_momentum.get( id ) == "pz" )
-           el_momentum = AxisDescription::pz;
+        if(m_help->element_momentum.get(id) == "px")
+            el_momentum = AxisDescription::px;
+        else if(m_help->element_momentum.get(id) == "py")
+            el_momentum = AxisDescription::py;
+        else if(m_help->element_momentum.get(id) == "pz")
+            el_momentum = AxisDescription::pz;
         else
-           throw PluginException("[Plugin] [" + m_help->getOptionPrefix() + "] momentum must be px, py or pz" );
+            throw PluginException("[Plugin] [" + m_help->getOptionPrefix() + "] momentum must be px, py or pz");
 
         axis_element.momentum = el_momentum;
         axis_element.space = el_space;
 
         bool activatePlugin = true;
 
-        if( simDim == DIM2 && el_space == AxisDescription::z )
+        if(simDim == DIM2 && el_space == AxisDescription::z)
         {
             std::cerr << "[Plugin] [" + m_help->getOptionPrefix() + "] Skip requested output for "
-                << m_help->element_space.get( id )
-                << m_help->element_momentum.get( id )
-                << std::endl;
+                      << m_help->element_space.get(id) << m_help->element_momentum.get(id) << std::endl;
             activatePlugin = false;
         }
 
-        if( activatePlugin )
+        if(activatePlugin)
         {
             /** create dir */
             Environment<simDim>::get().Filesystem().createDirectoryWithPermissions("phaseSpace");
@@ -102,10 +96,9 @@ namespace picongpu
             const uint32_t r_element = axis_element.space;
 
             /* CORE + BORDER + GUARD elements for spatial bins */
-            this->r_bins = SuperCellSize().toRT()[r_element]
-                         * this->m_cellDescription->getGridSuperCells()[r_element];
+            this->r_bins = SuperCellSize().toRT()[r_element] * this->m_cellDescription->getGridSuperCells()[r_element];
 
-            this->dBuffer = new container::DeviceBuffer<float_PS, 2>( this->num_pbins, r_bins );
+            this->dBuffer = new container::DeviceBuffer<float_PS, 2>(this->num_pbins, r_bins);
 
             /* reduce-add phase space from other GPUs in range [p0;p1]x[r;r+dr]
              * to "lowest" node in range
@@ -121,7 +114,7 @@ namespace picongpu
             pmacc::math::Size_t<simDim> sizeTransversalPlane(gpuDim);
             sizeTransversalPlane[this->axis_element.space] = 1;
 
-            for( int planePos = 0; planePos <= (int)gpuDim[this->axis_element.space]; ++planePos )
+            for(int planePos = 0; planePos <= (int) gpuDim[this->axis_element.space]; ++planePos)
             {
                 /* my plane means: the offset for the transversal plane to my r_element
                  * should be zero
@@ -129,67 +122,64 @@ namespace picongpu
                 pmacc::math::Int<simDim> longOffset(pmacc::math::Int<simDim>::create(0));
                 longOffset[this->axis_element.space] = planePos;
 
-                zone::SphericZone<simDim> zoneTransversalPlane( sizeTransversalPlane, longOffset );
+                zone::SphericZone<simDim> zoneTransversalPlane(sizeTransversalPlane, longOffset);
 
                 /* Am I the lowest GPU in my plane? */
                 bool isGroupRoot = false;
-                bool isInGroup   = ( gpuPos[this->axis_element.space] == planePos );
-                if( isInGroup )
+                bool isInGroup = (gpuPos[this->axis_element.space] == planePos);
+                if(isInGroup)
                 {
                     pmacc::math::Int<simDim> inPlaneGPU(gpuPos);
                     inPlaneGPU[this->axis_element.space] = 0;
-                    if( inPlaneGPU == pmacc::math::Int<simDim>::create(0) )
+                    if(inPlaneGPU == pmacc::math::Int<simDim>::create(0))
                         isGroupRoot = true;
                 }
 
-                algorithm::mpi::Reduce<simDim>* createReduce =
-                    new algorithm::mpi::Reduce<simDim>( zoneTransversalPlane,
-                                                        isGroupRoot );
-                if( isInGroup )
+                algorithm::mpi::Reduce<simDim>* createReduce
+                    = new algorithm::mpi::Reduce<simDim>(zoneTransversalPlane, isGroupRoot);
+                if(isInGroup)
                 {
                     this->planeReduce = createReduce;
                     this->isPlaneReduceRoot = isGroupRoot;
                 }
                 else
-                    __delete( createReduce );
+                    __delete(createReduce);
             }
 
             /* Create communicator with ranks of each plane reduce root */
             {
                 /* Array with root ranks of the planeReduce operations */
-                std::vector<int> planeReduceRootRanks( gc.getGlobalSize(), -1 );
+                std::vector<int> planeReduceRootRanks(gc.getGlobalSize(), -1);
                 /* Am I one of the planeReduce root ranks? my global rank : -1 */
-                int myRootRank = gc.getGlobalRank() * this->isPlaneReduceRoot
-                               - ( ! this->isPlaneReduceRoot );
+                int myRootRank = gc.getGlobalRank() * this->isPlaneReduceRoot - (!this->isPlaneReduceRoot);
 
                 // avoid deadlock between not finished pmacc tasks and mpi blocking collectives
                 __getTransactionEvent().waitForFinished();
                 MPI_Group world_group, new_group;
-                MPI_CHECK(MPI_Allgather( &myRootRank, 1, MPI_INT,
-                                         &(planeReduceRootRanks.front()),
-                                         1,
-                                         MPI_INT,
-                                         MPI_COMM_WORLD ));
+                MPI_CHECK(MPI_Allgather(
+                    &myRootRank,
+                    1,
+                    MPI_INT,
+                    &(planeReduceRootRanks.front()),
+                    1,
+                    MPI_INT,
+                    MPI_COMM_WORLD));
 
                 /* remove all non-roots (-1 values) */
-                std::sort( planeReduceRootRanks.begin(), planeReduceRootRanks.end() );
-                std::vector<int> ranks( std::lower_bound( planeReduceRootRanks.begin(),
-                                                          planeReduceRootRanks.end(),
-                                                          0 ),
-                                        planeReduceRootRanks.end() );
-
-                MPI_CHECK(MPI_Comm_group( MPI_COMM_WORLD, &world_group ));
-                MPI_CHECK(MPI_Group_incl( world_group, ranks.size(), ranks.data(), &new_group ));
-                MPI_CHECK(MPI_Comm_create( MPI_COMM_WORLD, new_group, &commFileWriter ));
-                MPI_CHECK(MPI_Group_free( &new_group ));
-                MPI_CHECK(MPI_Group_free( &world_group ));
+                std::sort(planeReduceRootRanks.begin(), planeReduceRootRanks.end());
+                std::vector<int> ranks(
+                    std::lower_bound(planeReduceRootRanks.begin(), planeReduceRootRanks.end(), 0),
+                    planeReduceRootRanks.end());
+
+                MPI_CHECK(MPI_Comm_group(MPI_COMM_WORLD, &world_group));
+                MPI_CHECK(MPI_Group_incl(world_group, ranks.size(), ranks.data(), &new_group));
+                MPI_CHECK(MPI_Comm_create(MPI_COMM_WORLD, new_group, &commFileWriter));
+                MPI_CHECK(MPI_Group_free(&new_group));
+                MPI_CHECK(MPI_Group_free(&world_group));
             }
 
             // set how often the plugin should be executed while PIConGPU is running
-            Environment<>::get( ).PluginConnector( ).setNotificationPeriod(
-                this,
-                m_help->notifyPeriod.get(id)
-            );
+            Environment<>::get().PluginConnector().setNotificationPeriod(this, m_help->notifyPeriod.get(id));
         }
     }
 
@@ -197,41 +187,41 @@ namespace picongpu
     template<class AssignmentFunction, class Species>
     PhaseSpace<AssignmentFunction, Species>::~PhaseSpace()
     {
-        __delete( this->dBuffer );
-        __delete( planeReduce );
+        __delete(this->dBuffer);
+        __delete(planeReduce);
 
-        if( commFileWriter != MPI_COMM_NULL )
+        if(commFileWriter != MPI_COMM_NULL)
         {
             // avoid deadlock between not finished pmacc tasks and mpi blocking collectives
             __getTransactionEvent().waitForFinished();
-            MPI_CHECK_NO_EXCEPT(MPI_Comm_free( &commFileWriter ));
+            MPI_CHECK_NO_EXCEPT(MPI_Comm_free(&commFileWriter));
         }
     }
 
-    template<class AssignmentFunction, class Species >
+    template<class AssignmentFunction, class Species>
     template<uint32_t r_dir>
-    void PhaseSpace<AssignmentFunction, Species>::calcPhaseSpace( const uint32_t currentStep )
+    void PhaseSpace<AssignmentFunction, Species>::calcPhaseSpace(const uint32_t currentStep)
     {
         const pmacc::math::Int<simDim> guardCells = SuperCellSize().toRT() * GuardSize::toRT();
-        const pmacc::math::Size_t<simDim> coreBorderSuperCells( this->m_cellDescription->getGridSuperCells() - 2 * GuardSize::toRT() );
-        const pmacc::math::Size_t<simDim> coreBorderCells = coreBorderSuperCells *
-            precisionCast<size_t>( SuperCellSize().toRT() );
+        const pmacc::math::Size_t<simDim> coreBorderSuperCells(
+            this->m_cellDescription->getGridSuperCells() - 2 * GuardSize::toRT());
+        const pmacc::math::Size_t<simDim> coreBorderCells
+            = coreBorderSuperCells * precisionCast<size_t>(SuperCellSize().toRT());
 
         /* register particle species observer */
-        DataConnector &dc = Environment<>::get().DataConnector();
-        auto particles = dc.get< Species >( Species::FrameType::getName(), true );
+        DataConnector& dc = Environment<>::get().DataConnector();
+        auto particles = dc.get<Species>(Species::FrameType::getName(), true);
 
         /* select CORE + BORDER for all cells
          * CORE + BORDER is contiguous, in cuSTL we call this a "topological spheric zone"
          */
-        zone::SphericZone<simDim> zoneCoreBorder( coreBorderCells, guardCells );
+        zone::SphericZone<simDim> zoneCoreBorder(coreBorderCells, guardCells);
 
-        StartBlockFunctor< r_dir > startBlockFunctor(
+        StartBlockFunctor<r_dir> startBlockFunctor(
             particles->getDeviceParticlesBox(),
             dBuffer->origin(),
             this->axis_element.momentum,
-            this->axis_p_range
-        );
+            this->axis_p_range);
 
         auto bindFunctor = std::bind(
             startBlockFunctor,
@@ -240,39 +230,34 @@ namespace picongpu
             // area to work on
             zoneCoreBorder,
             // data below - passed to functor operator()
-            cursor::make_MultiIndexCursor<simDim>()
-        );
-
-        meta::ForEach<
-            typename Help::EligibleFilters,
-            plugins::misc::ExecuteIfNameIsEqual< bmpl::_1 >
-        >{ }(
-            m_help->filter.get( m_id ),
+            cursor::make_MultiIndexCursor<simDim>());
+
+        meta::ForEach<typename Help::EligibleFilters, plugins::misc::ExecuteIfNameIsEqual<bmpl::_1>>{}(
+            m_help->filter.get(m_id),
             currentStep,
-            bindFunctor
-        );
+            bindFunctor);
 
-        dc.releaseData( Species::FrameType::getName() );
+        dc.releaseData(Species::FrameType::getName());
     }
 
     template<class AssignmentFunction, class Species>
-    void PhaseSpace<AssignmentFunction, Species>::notify( uint32_t currentStep )
+    void PhaseSpace<AssignmentFunction, Species>::notify(uint32_t currentStep)
     {
         /* reset device buffer */
-        this->dBuffer->assign( float_PS(0.0) );
+        this->dBuffer->assign(float_PS(0.0));
 
         /* calculate local phase space */
-        if( this->axis_element.space == AxisDescription::x )
-            calcPhaseSpace<AxisDescription::x>( currentStep );
-        else if( this->axis_element.space == AxisDescription::y )
-            calcPhaseSpace<AxisDescription::y>( currentStep );
-#if(SIMDIM==DIM3)
+        if(this->axis_element.space == AxisDescription::x)
+            calcPhaseSpace<AxisDescription::x>(currentStep);
+        else if(this->axis_element.space == AxisDescription::y)
+            calcPhaseSpace<AxisDescription::y>(currentStep);
+#if(SIMDIM == DIM3)
         else
-            calcPhaseSpace<AxisDescription::z>( currentStep );
+            calcPhaseSpace<AxisDescription::z>(currentStep);
 #endif
 
         /* transfer to host */
-        container::HostBuffer<float_PS, 2> hBuffer( this->dBuffer->size() );
+        container::HostBuffer<float_PS, 2> hBuffer(this->dBuffer->size());
         hBuffer = *this->dBuffer;
 
         /* reduce-add phase space from other GPUs in range [p0;p1]x[r;r+dr]
@@ -281,17 +266,17 @@ namespace picongpu
          *                         spatial y and z direction to node with
          *                         lowest y and z position and same x range
          */
-        container::HostBuffer<float_PS, 2> hReducedBuffer( hBuffer.size() );
-        hReducedBuffer.assign( float_PS(0.0) );
+        container::HostBuffer<float_PS, 2> hReducedBuffer(hBuffer.size());
+        hReducedBuffer.assign(float_PS(0.0));
 
-        planeReduce->template operator()( /* parameters: dest, source */
-                             hReducedBuffer,
-                             hBuffer,
-                             /* the functors return value will be written to dst */
-                             pmacc::algorithm::functor::Add() );
+        planeReduce->template operator()(/* parameters: dest, source */
+                                         hReducedBuffer,
+                                         hBuffer,
+                                         /* the functor's return value will be written to dest */
+                                         pmacc::algorithm::functor::Add());
 
         /** all non-reduce-root processes are done now */
-        if( !this->isPlaneReduceRoot )
+        if(!this->isPlaneReduceRoot)
             return;
 
         /** \todo communicate GUARD and add it to the two neighbors BORDER */
@@ -306,17 +291,22 @@ namespace picongpu
          *   on the p-axis should be scaled to represent single/real particles.
          *   \see PhaseSpaceMulti::pluginLoad( )
          */
-        float_64 const pRange_unit =
-            UNIT_MASS *
-            UNIT_SPEED;
+        float_64 const pRange_unit = UNIT_MASS * UNIT_SPEED;
 
         DumpHBuffer dumpHBuffer;
 
-        if( this->commFileWriter != MPI_COMM_NULL )
-            dumpHBuffer( hReducedBuffer, this->axis_element,
-                         this->axis_p_range, pRange_unit,
-                         unit, Species::FrameType::getName() + "_" + m_help->filter.get( m_id ),
-                         currentStep, this->commFileWriter );
+        if(this->commFileWriter != MPI_COMM_NULL)
+            dumpHBuffer(
+                hReducedBuffer,
+                this->axis_element,
+                this->axis_p_range,
+                pRange_unit,
+                unit,
+                Species::FrameType::getName() + "_" + m_help->filter.get(m_id),
+                m_help->file_name_extension.get(m_id),
+                m_help->json_config.get(m_id),
+                currentStep,
+                this->commFileWriter);
     }
 
 } /* namespace picongpu */
diff --git a/include/picongpu/plugins/PhaseSpace/PhaseSpaceFunctors.hpp b/include/picongpu/plugins/PhaseSpace/PhaseSpaceFunctors.hpp
index 15f0508014..be7e5c9368 100644
--- a/include/picongpu/plugins/PhaseSpace/PhaseSpaceFunctors.hpp
+++ b/include/picongpu/plugins/PhaseSpace/PhaseSpaceFunctors.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Heiko Burau, Richard Pausch, Rene Widera
+/* Copyright 2013-2021 Axel Huebl, Heiko Burau, Richard Pausch, Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -22,7 +22,7 @@
 #include <utility>
 
 #include <pmacc/cuSTL/cursor/MultiIndexCursor.hpp>
-#include <pmacc/cuSTL/algorithm/cudaBlock/Foreach.hpp>
+#include <pmacc/cuSTL/algorithm/cuplaBlock/Foreach.hpp>
 #include <pmacc/cuSTL/container/compile-time/SharedBuffer.hpp>
 #include <pmacc/math/Vector.hpp>
 #include <pmacc/math/VectorOperations.hpp>
@@ -45,11 +45,10 @@ namespace picongpu
     {
         typedef void result_type;
 
-        template< typename T_Acc >
-        DINLINE void
-        operator()( const T_Acc& acc, Type& dest, const Type src ) const
+        template<typename T_Acc>
+        DINLINE void operator()(const T_Acc& acc, Type& dest, const Type src) const
         {
-            atomicAdd( &dest, src, ::alpaka::hierarchy::Blocks{} );
+            cupla::atomicAdd(acc, &dest, src, ::alpaka::hierarchy::Blocks{});
         }
     };
 
@@ -77,14 +76,14 @@ namespace picongpu
          * \param el_p coordinate of the momentum \see PhaseSpace::axis_element \see AxisDescription
          * \param axis_p_range range of the momentum coordinate \see PhaseSpace::axis_p_range
          */
-        template<typename FramePtr, typename float_PS, typename Pitch, typename T_Acc >
-        DINLINE void
-        operator()( const T_Acc & acc,
+        template<typename FramePtr, typename float_PS, typename Pitch, typename T_Acc>
+        DINLINE void operator()(
+            const T_Acc& acc,
             FramePtr frame,
             uint16_t particleID,
             cursor::CT::BufferCursor<float_PS, Pitch> curDBufferOriginInBlock,
             const uint32_t el_p,
-            const std::pair<float_X, float_X>& axis_p_range )
+            const std::pair<float_X, float_X>& axis_p_range)
         {
             auto particle = frame[particleID];
             /** \todo this can become a functor to be even more flexible */
@@ -92,31 +91,29 @@ namespace picongpu
 
             /* cell id in this block */
             const int linearCellIdx = particle[localCellIdx_];
-            const pmacc::math::UInt32<simDim> cellIdx(
-                pmacc::math::MapToPos<simDim>()( SuperCellSize(), linearCellIdx ) );
+            const pmacc::math::UInt32<simDim> cellIdx(pmacc::math::MapToPos<simDim>()(SuperCellSize(), linearCellIdx));
 
-            const uint32_t r_bin    = cellIdx[r_dir];
+            const uint32_t r_bin = cellIdx[r_dir];
             const float_X weighting = particle[weighting_];
-            const float_X charge    = attribute::getCharge( weighting,particle );
-            const float_PS particleChargeDensity =
-              precisionCast<float_PS>( charge / CELL_VOLUME );
+            const float_X charge = attribute::getCharge(weighting, particle);
+            const float_PS particleChargeDensity = precisionCast<float_PS>(charge / CELL_VOLUME);
 
-            const float_X rel_bin = (mom_i / weighting - axis_p_range.first)
-                                  / (axis_p_range.second - axis_p_range.first);
-            int p_bin = int( rel_bin * float_X(num_pbins) );
+            const float_X rel_bin
+                = (mom_i / weighting - axis_p_range.first) / (axis_p_range.second - axis_p_range.first);
+            int p_bin = int(rel_bin * float_X(num_pbins));
 
             /* out-of-range bins back to min/max */
-            if( p_bin < 0 )
+            if(p_bin < 0)
                 p_bin = 0;
-            if( p_bin >= num_pbins )
+            if(p_bin >= num_pbins)
                 p_bin = num_pbins - 1;
 
             /** \todo take particle shape into account */
-            atomicAdd(
-                &(*curDBufferOriginInBlock( p_bin, r_bin )),
+            cupla::atomicAdd(
+                acc,
+                &(*curDBufferOriginInBlock(p_bin, r_bin)),
                 particleChargeDensity,
-                ::alpaka::hierarchy::Threads{}
-            );
+                ::alpaka::hierarchy::Threads{});
         }
     };
 
@@ -141,8 +138,7 @@ namespace picongpu
         uint32_t num_pbins,
         uint32_t r_dir,
         typename T_Filter,
-        uint32_t T_numWorkers
-    >
+        uint32_t T_numWorkers>
     struct FunctorBlock
     {
         typedef void result_type;
@@ -169,11 +165,14 @@ namespace picongpu
             cursor::BufferCursor<float_PS, 2> cur,
             const uint32_t p_dir,
             const std::pair<float_X, float_X>& p_range,
-            const T_Filter & parFilter
-        ) :
-            particlesBox(pb), curOriginPhaseSpace(cur), p_element(p_dir),
-            axis_p_range(p_range), particleFilter(parFilter)
-        {}
+            const T_Filter& parFilter)
+            : particlesBox(pb)
+            , curOriginPhaseSpace(cur)
+            , p_element(p_dir)
+            , axis_p_range(p_range)
+            , particleFilter(parFilter)
+        {
+        }
 
         /** Called for the first cell of each block #-of-cells-in-block times
          *
@@ -181,12 +180,11 @@ namespace picongpu
          *                         the current block starts
          *                         \see cuSTL/algorithm/kernel/Foreach.hpp
          */
-        template< typename T_Acc >
-        DINLINE void
-        operator()( const T_Acc& acc,  const pmacc::math::Int<simDim>& indexBlockOffset )
+        template<typename T_Acc>
+        DINLINE void operator()(const T_Acc& acc, const pmacc::math::Int<simDim>& indexBlockOffset)
         {
             constexpr uint32_t numWorkers = T_numWorkers;
-            const uint32_t workerIdx = threadIdx.x;
+            const uint32_t workerIdx = cupla::threadIdx(acc).x;
 
             /** \todo write math::Vector constructor that supports dim3 */
             const pmacc::math::Int<simDim> indexGlobal = indexBlockOffset;
@@ -194,24 +192,20 @@ namespace picongpu
             /* create shared mem */
             const int blockCellsInDir = SuperCellSize::template at<r_dir>::type::value;
             typedef typename pmacc::math::CT::Int<num_pbins, blockCellsInDir> dBufferSizeInBlock;
-            container::CT::SharedBuffer<float_PS, dBufferSizeInBlock > dBufferInBlock( acc );
+            container::CT::SharedBuffer<float_PS, dBufferSizeInBlock> dBufferInBlock(acc);
 
             /* init shared mem */
-            pmacc::algorithm::cudaBlock::Foreach<
-                pmacc::math::CT::Int< numWorkers >
-            > forEachThreadInBlock(workerIdx);
-            forEachThreadInBlock( acc,
-                                  dBufferInBlock.zone(),
-                                  dBufferInBlock.origin(),
-                                  pmacc::algorithm::functor::AssignValue<float_PS>(0.0) );
-            __syncthreads();
+            pmacc::algorithm::cuplaBlock::Foreach<pmacc::math::CT::Int<numWorkers>> forEachThreadInBlock(workerIdx);
+            forEachThreadInBlock(
+                acc,
+                dBufferInBlock.zone(),
+                dBufferInBlock.origin(),
+                pmacc::algorithm::functor::AssignValue<float_PS>(0.0));
+            cupla::__syncthreads(acc);
 
             FunctorParticle<r_dir, num_pbins, SuperCellSize> functorParticle;
 
-            particleAccess::Cell2Particle<
-                SuperCellSize,
-                numWorkers
-            > forEachParticleInCell;
+            particleAccess::Cell2Particle<SuperCellSize, numWorkers> forEachParticleInCell;
             forEachParticleInCell(
                 acc,
                 /* mandatory params */
@@ -223,20 +217,20 @@ namespace picongpu
                 /* optional params */
                 dBufferInBlock.origin(),
                 p_element,
-                axis_p_range
-            );
+                axis_p_range);
 
-            __syncthreads();
+            cupla::__syncthreads(acc);
             /* add to global dBuffer */
-            forEachThreadInBlock( acc,
-                                  /* area to work on */
-                                  dBufferInBlock.zone(),
-                                  /* data below - cursors will be shifted and
-                                   * dereferenced */
-                                  curOriginPhaseSpace(0, indexBlockOffset[r_dir]),
-                                  dBufferInBlock.origin(),
-                                  /* functor */
-                                  FunctorAtomicAdd<float_PS>() );
+            forEachThreadInBlock(
+                acc,
+                /* area to work on */
+                dBufferInBlock.zone(),
+                /* data below - cursors will be shifted and
+                 * dereferenced */
+                curOriginPhaseSpace(0, indexBlockOffset[r_dir]),
+                dBufferInBlock.origin(),
+                /* functor */
+                FunctorAtomicAdd<float_PS>());
         }
     };
 
diff --git a/include/picongpu/plugins/PluginController.hpp b/include/picongpu/plugins/PluginController.hpp
index 9162b5b77e..865b069c9e 100644
--- a/include/picongpu/plugins/PluginController.hpp
+++ b/include/picongpu/plugins/PluginController.hpp
@@ -1,6 +1,7 @@
-/* Copyright 2013-2020 Axel Huebl, Benjamin Schneider, Felix Schmitt,
+/* Copyright 2013-2021 Axel Huebl, Benjamin Schneider, Felix Schmitt,
  *                     Heiko Burau, Rene Widera, Richard Pausch,
- *                     Benjamin Worpitz, Erik Zenker, Finn-Ole Carstens
+ *                     Benjamin Worpitz, Erik Zenker, Finn-Ole Carstens,
+ *                     Franz Poeschel
  *
  * This file is part of PIConGPU.
  *
@@ -41,34 +42,39 @@
  */
 #include "picongpu/plugins/PngPlugin.hpp"
 
-#if (ENABLE_ADIOS == 1)
-#   include "picongpu/plugins/adios/ADIOSWriter.hpp"
+#if(ENABLE_ADIOS == 1)
+#    include "picongpu/plugins/adios/ADIOSWriter.hpp"
 #endif
 
-#if( PMACC_CUDA_ENABLED == 1 )
-#   include "picongpu/plugins/PositionsParticles.hpp"
-#   include "picongpu/plugins/ChargeConservation.hpp"
-#   include "picongpu/plugins/particleMerging/ParticleMerger.hpp"
-#   if(ENABLE_HDF5 == 1)
-#       include "picongpu/plugins/makroParticleCounter/PerSuperCell.hpp"
-#   endif
-
-#   include "picongpu/plugins/SliceFieldPrinterMulti.hpp"
-#   if(SIMDIM==DIM3)
-#       include "picongpu/plugins/IntensityPlugin.hpp"
-#   endif
+#if(ENABLE_OPENPMD == 1)
+#    include "picongpu/plugins/openPMD/openPMDWriter.hpp"
+#    include "picongpu/plugins/PhaseSpace/PhaseSpace.hpp"
+#    include "picongpu/plugins/xrayScattering/XrayScattering.hpp"
 #endif
 
-#if (ENABLE_ISAAC == 1) && (SIMDIM==DIM3)
+#if(PMACC_CUDA_ENABLED == 1)
+#    include "picongpu/plugins/PositionsParticles.hpp"
+#    include "picongpu/plugins/ChargeConservation.hpp"
+#    include "picongpu/plugins/particleMerging/ParticleMerger.hpp"
+#    include "picongpu/plugins/randomizedParticleMerger/RandomizedParticleMerger.hpp"
+#    if(ENABLE_HDF5 == 1)
+#        include "picongpu/plugins/makroParticleCounter/PerSuperCell.hpp"
+#    endif
+
+#    include "picongpu/plugins/SliceFieldPrinterMulti.hpp"
+#    if(SIMDIM == DIM3)
+#        include "picongpu/plugins/IntensityPlugin.hpp"
+#    endif
+#endif
+
+#if(ENABLE_ISAAC == 1) && (SIMDIM == DIM3)
 #    include "picongpu/plugins/IsaacPlugin.hpp"
 #endif
 
-#if (ENABLE_HDF5 == 1)
-#   include "picongpu/plugins/PhaseSpace/PhaseSpace.hpp"
-#   include "picongpu/plugins/particleCalorimeter/ParticleCalorimeter.hpp"
-#   include "picongpu/plugins/radiation/VectorTypes.hpp"
-#   include "picongpu/plugins/radiation/Radiation.hpp"
-#   include "picongpu/plugins/hdf5/HDF5Writer.hpp"
+#if(ENABLE_HDF5 == 1)
+#    include "picongpu/plugins/particleCalorimeter/ParticleCalorimeter.hpp"
+#    include "picongpu/plugins/radiation/VectorTypes.hpp"
+#    include "picongpu/plugins/radiation/Radiation.hpp"
 #endif
 
 #include "picongpu/plugins/Checkpoint.hpp"
@@ -85,234 +91,206 @@
 
 namespace picongpu
 {
+    using namespace pmacc;
 
-using namespace pmacc;
-
-/**
- * Plugin management controller for user-level plugins.
- */
-class PluginController : public ILightweightPlugin
-{
-private:
-
-    std::list<ISimulationPlugin*> plugins;
-
-    template<typename T_Type>
-    struct PushBack
+    /**
+     * Plugin management controller for user-level plugins.
+     */
+    class PluginController : public ILightweightPlugin
     {
+    private:
+        std::list<ISimulationPlugin*> plugins;
 
-        template<typename T>
-        void operator()(T& list)
-        {
-            list.push_back(new T_Type());
-        }
-    };
-
-    struct TupleSpeciesPlugin
-    {
-        enum Names
+        template<typename T_Type>
+        struct PushBack
         {
-            species = 0,
-            plugin = 1
+            template<typename T>
+            void operator()(T& list)
+            {
+                list.push_back(new T_Type());
+            }
         };
 
-        /** apply the 1st vector component to the 2nd
-         *
-         * @tparam T_TupleVector vector of type
-         *                       pmacc::math::CT::vector< Species, Plugin >
-         *                       with two components
-         */
-        template< typename T_TupleVector >
-        struct Apply :
-            bmpl::apply1<
-                typename pmacc::math::CT::At<
-                    T_TupleVector,
-                    bmpl::int_< plugin >
-                >::type,
-                typename pmacc::math::CT::At<
-                    T_TupleVector,
-                    bmpl::int_< species >
-                >::type
-            >
+        struct TupleSpeciesPlugin
         {
+            enum Names
+            {
+                species = 0,
+                plugin = 1
+            };
+
+            /** apply the 1st vector component to the 2nd
+             *
+             * @tparam T_TupleVector vector of type
+             *                       pmacc::math::CT::vector< Species, Plugin >
+             *                       with two components
+             */
+            template<typename T_TupleVector>
+            struct Apply
+                : bmpl::apply1<
+                      typename pmacc::math::CT::At<T_TupleVector, bmpl::int_<plugin>>::type,
+                      typename pmacc::math::CT::At<T_TupleVector, bmpl::int_<species>>::type>
+            {
+            };
+
+            /** Check the combination Species+Plugin in the Tuple
+             *
+             * @tparam T_TupleVector with Species, Plugin
+             */
+            template<typename T_TupleVector>
+            struct IsEligible
+            {
+                using Species = typename pmacc::math::CT::At<T_TupleVector, bmpl::int_<species>>::type;
+                using Solver = typename pmacc::math::CT::At<T_TupleVector, bmpl::int_<plugin>>::type;
+
+                using type = typename particles::traits::SpeciesEligibleForSolver<Species, Solver>::type;
+            };
         };
 
-        /** Check the combination Species+Plugin in the Tuple
-         *
-         * @tparam T_TupleVector with Species, Plugin
-         */
-        template< typename T_TupleVector >
-        struct IsEligible
-        {
-            using Species = typename pmacc::math::CT::At<
-                T_TupleVector,
-                bmpl::int_< species >
-            >::type;
-            using Solver = typename pmacc::math::CT::At<
-                T_TupleVector,
-                bmpl::int_< plugin >
-            >::type;
-
-            using type = typename particles::traits::SpeciesEligibleForSolver<
-                Species,
-                Solver
-            >::type;
-        };
-    };
+        /* define stand alone plugins */
+        using StandAlonePlugins = bmpl::vector<
+            Checkpoint,
+            EnergyFields
+#if(ENABLE_ADIOS == 1)
+            ,
+            plugins::multi::Master<adios::ADIOSWriter>
+#endif
 
-    /* define stand alone plugins */
-    using StandAlonePlugins = bmpl::vector<
-        Checkpoint,
-        EnergyFields
-#if (ENABLE_ADIOS == 1)
-        , plugins::multi::Master< adios::ADIOSWriter >
+#if(ENABLE_OPENPMD == 1)
+            ,
+            plugins::multi::Master<openPMD::openPMDWriter>
 #endif
 
-#if( PMACC_CUDA_ENABLED == 1 )
-        , SumCurrents
-        , ChargeConservation
-#   if(SIMDIM==DIM3)
-        , IntensityPlugin
-#   endif
+#if(PMACC_CUDA_ENABLED == 1)
+            ,
+            SumCurrents,
+            ChargeConservation
+#    if(SIMDIM == DIM3)
+            ,
+            IntensityPlugin
+#    endif
 #endif
 
-#if (ENABLE_ISAAC == 1) && (SIMDIM==DIM3)
-        , isaacP::IsaacPlugin
+#if(ENABLE_ISAAC == 1) && (SIMDIM == DIM3)
+            ,
+            isaacP::IsaacPlugin
 #endif
+            ,
+            ResourceLog>;
+
 
-#if (ENABLE_HDF5 == 1)
-        , plugins::multi::Master< hdf5::HDF5Writer >
+        /* define field plugins */
+        using UnspecializedFieldPlugins = bmpl::vector<
+#if(PMACC_CUDA_ENABLED == 1)
+            SliceFieldPrinterMulti<bmpl::_1>
 #endif
-        , ResourceLog
-    >;
+            >;
+
+        using AllFields = bmpl::vector<FieldB, FieldE, FieldJ>;
 
+        using CombinedUnspecializedFieldPlugins =
+            typename AllCombinations<bmpl::vector<AllFields, UnspecializedFieldPlugins>>::type;
 
-    /* define field plugins */
-    using UnspecializedFieldPlugins = bmpl::vector<
-#if( PMACC_CUDA_ENABLED == 1 )
-        SliceFieldPrinterMulti< bmpl::_1 >
+        using FieldPlugins = typename bmpl::
+            transform<CombinedUnspecializedFieldPlugins, typename TupleSpeciesPlugin::Apply<bmpl::_1>>::type;
+
+
+        /* define species plugins */
+        using UnspecializedSpeciesPlugins = bmpl::vector<
+            plugins::multi::Master<EnergyParticles<bmpl::_1>>,
+            plugins::multi::Master<CalcEmittance<bmpl::_1>>,
+            plugins::multi::Master<BinEnergyParticles<bmpl::_1>>,
+            CountParticles<bmpl::_1>,
+            PngPlugin<Visualisation<bmpl::_1, PngCreator>>,
+            plugins::transitionRadiation::TransitionRadiation<bmpl::_1>
+#if(ENABLE_OPENPMD == 1)
+            ,
+            plugins::xrayScattering::XrayScattering<bmpl::_1>
 #endif
-    >;
-
-    using AllFields = bmpl::vector< FieldB, FieldE, FieldJ >;
-
-    using CombinedUnspecializedFieldPlugins = typename AllCombinations<
-        bmpl::vector<
-            AllFields,
-            UnspecializedFieldPlugins
-        >
-    >::type;
-
-    using FieldPlugins = typename bmpl::transform<
-        CombinedUnspecializedFieldPlugins,
-        typename TupleSpeciesPlugin::Apply< bmpl::_1 >
-    >::type;
-
-
-    /* define species plugins */
-    using UnspecializedSpeciesPlugins = bmpl::vector <
-        plugins::multi::Master< EnergyParticles<bmpl::_1> >,
-        plugins::multi::Master< CalcEmittance<bmpl::_1> >,
-        plugins::multi::Master< BinEnergyParticles<bmpl::_1> >,
-        CountParticles<bmpl::_1>,
-        PngPlugin< Visualisation<bmpl::_1, PngCreator> >,
-        plugins::transitionRadiation::TransitionRadiation<bmpl::_1>
-#if(ENABLE_HDF5 == 1)
-        , plugins::radiation::Radiation<bmpl::_1>
-        , plugins::multi::Master< ParticleCalorimeter<bmpl::_1> >
-        , plugins::multi::Master< PhaseSpace<particles::shapes::Counter::ChargeAssignment, bmpl::_1> >
+#if(ENABLE_HDF5 * ENABLE_OPENPMD == 1)
+            ,
+            plugins::radiation::Radiation<bmpl::_1>,
+            plugins::multi::Master<ParticleCalorimeter<bmpl::_1>>
 #endif
-#if( PMACC_CUDA_ENABLED == 1 )
-        , PositionsParticles<bmpl::_1>
-        , plugins::particleMerging::ParticleMerger<bmpl::_1>
-#   if(ENABLE_HDF5 == 1)
-        , PerSuperCell<bmpl::_1>
-#   endif
+#if(ENABLE_OPENPMD == 1)
+            ,
+            plugins::multi::Master<PhaseSpace<particles::shapes::Counter::ChargeAssignment, bmpl::_1>>
 #endif
-    >;
-
-    using CombinedUnspecializedSpeciesPlugins = typename AllCombinations<
-        bmpl::vector<
-            VectorAllSpecies,
-            UnspecializedSpeciesPlugins
-        >
-    >::type;
-
-    using CombinedUnspecializedSpeciesPluginsEligible = typename bmpl::copy_if<
-        CombinedUnspecializedSpeciesPlugins,
-        typename TupleSpeciesPlugin::IsEligible< bmpl::_1 >
-    >::type;
-
-    using SpeciesPlugins = typename bmpl::transform<
-        CombinedUnspecializedSpeciesPluginsEligible,
-        typename TupleSpeciesPlugin::Apply< bmpl::_1 >
-    >::type;
-
-    /* create sequence with all fully specialized plugins */
-    using AllPlugins = MakeSeq_t<
-        StandAlonePlugins,
-        FieldPlugins,
-        SpeciesPlugins
-    >;
+#if(PMACC_CUDA_ENABLED == 1)
+            ,
+            PositionsParticles<bmpl::_1>,
+            plugins::particleMerging::ParticleMerger<bmpl::_1>,
+            plugins::randomizedParticleMerger::RandomizedParticleMerger<bmpl::_1>
+#    if(ENABLE_HDF5 == 1)
+            ,
+            PerSuperCell<bmpl::_1>
+#    endif
+#endif
+            >;
 
-    /**
-     * Initializes the controller by adding all user plugins to its internal list.
-     */
-    virtual void init()
-    {
-        meta::ForEach<AllPlugins, PushBack<bmpl::_1> > pushBack;
-        pushBack(plugins);
-    }
+        using CombinedUnspecializedSpeciesPlugins =
+            typename AllCombinations<bmpl::vector<VectorAllSpecies, UnspecializedSpeciesPlugins>>::type;
 
-public:
+        using CombinedUnspecializedSpeciesPluginsEligible = typename bmpl::
+            copy_if<CombinedUnspecializedSpeciesPlugins, typename TupleSpeciesPlugin::IsEligible<bmpl::_1>>::type;
 
-    PluginController()
-    {
-        init();
-    }
+        using SpeciesPlugins = typename bmpl::
+            transform<CombinedUnspecializedSpeciesPluginsEligible, typename TupleSpeciesPlugin::Apply<bmpl::_1>>::type;
 
-    virtual ~PluginController()
-    {
+        /* create sequence with all fully specialized plugins */
+        using AllPlugins = MakeSeq_t<StandAlonePlugins, FieldPlugins, SpeciesPlugins>;
 
-    }
+        /**
+         * Initializes the controller by adding all user plugins to its internal list.
+         */
+        virtual void init()
+        {
+            meta::ForEach<AllPlugins, PushBack<bmpl::_1>> pushBack;
+            pushBack(plugins);
+        }
 
-    void setMappingDescription(MappingDesc *cellDescription)
-    {
-        PMACC_ASSERT(cellDescription != nullptr);
+    public:
+        PluginController()
+        {
+            init();
+        }
 
-        for (std::list<ISimulationPlugin*>::iterator iter = plugins.begin();
-             iter != plugins.end();
-             ++iter)
+        virtual ~PluginController()
         {
-            (*iter)->setMappingDescription(cellDescription);
         }
-    }
 
-    virtual void pluginRegisterHelp(po::options_description&)
-    {
-        // no help required at the moment
-    }
+        void setMappingDescription(MappingDesc* cellDescription)
+        {
+            PMACC_ASSERT(cellDescription != nullptr);
 
-    std::string pluginGetName() const
-    {
-        return "PluginController";
-    }
+            for(std::list<ISimulationPlugin*>::iterator iter = plugins.begin(); iter != plugins.end(); ++iter)
+            {
+                (*iter)->setMappingDescription(cellDescription);
+            }
+        }
 
-    void notify(uint32_t)
-    {
+        virtual void pluginRegisterHelp(po::options_description&)
+        {
+            // no help required at the moment
+        }
 
-    }
+        std::string pluginGetName() const
+        {
+            return "PluginController";
+        }
 
-    virtual void pluginUnload()
-    {
-        for (std::list<ISimulationPlugin*>::iterator iter = plugins.begin();
-             iter != plugins.end();
-             ++iter)
+        void notify(uint32_t)
+        {
+        }
+
+        virtual void pluginUnload()
         {
-            __delete(*iter);
+            for(std::list<ISimulationPlugin*>::iterator iter = plugins.begin(); iter != plugins.end(); ++iter)
+            {
+                __delete(*iter);
+            }
+            plugins.clear();
         }
-        plugins.clear();
-    }
-};
+    };
 
-}
+} // namespace picongpu
diff --git a/include/picongpu/plugins/PngPlugin.hpp b/include/picongpu/plugins/PngPlugin.hpp
index 51a2b3c1d5..aaafec8ae9 100644
--- a/include/picongpu/plugins/PngPlugin.hpp
+++ b/include/picongpu/plugins/PngPlugin.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Rene Widera, Benjamin Worpitz,
+/* Copyright 2013-2021 Axel Huebl, Rene Widera, Benjamin Worpitz,
  *                     Richard Pausch
  *
  * This file is part of PIConGPU.
@@ -48,21 +48,19 @@ namespace picongpu
     class PngPlugin : public ILightweightPlugin
     {
     public:
-
         typedef VisClass VisType;
         typedef std::list<VisType*> VisPointerList;
 
-        PngPlugin() :
-        pluginName("PngPlugin: create png's of a species and fields"),
-        pluginPrefix(VisType::FrameType::getName() + "_" + VisClass::CreatorType::getName()),
-        cellDescription(nullptr)
+        PngPlugin()
+            : pluginName("PngPlugin: create png's of a species and fields")
+            , pluginPrefix(VisType::FrameType::getName() + "_" + VisClass::CreatorType::getName())
+            , cellDescription(nullptr)
         {
             Environment<>::get().PluginConnector().registerPlugin(this);
         }
 
         virtual ~PngPlugin()
         {
-
         }
 
         std::string pluginGetName() const
@@ -72,50 +70,54 @@ namespace picongpu
 
         void pluginRegisterHelp(po::options_description& desc)
         {
-#if( PIC_ENABLE_PNG == 1 )
-            desc.add_options()
-                    ((pluginPrefix + ".period").c_str(), po::value<std::vector<std::string> > (&notifyPeriod)->multitoken(), "enable data output [for each n-th step]")
-                    ((pluginPrefix + ".axis").c_str(), po::value<std::vector<std::string > > (&axis)->multitoken(), "axis which are shown [valid values x,y,z] example: yz")
-                    ((pluginPrefix + ".slicePoint").c_str(), po::value<std::vector<float_32> > (&slicePoints)->multitoken(), "value range: 0 <= x <= 1 , point of the slice")
-                    ((pluginPrefix + ".folder").c_str(), po::value<std::vector<std::string> > (&folders)->multitoken(), "folder for output files");
+#if(PIC_ENABLE_PNG == 1)
+            desc.add_options()(
+                (pluginPrefix + ".period").c_str(),
+                po::value<std::vector<std::string>>(&notifyPeriod)->multitoken(),
+                "enable data output [for each n-th step]")(
+                (pluginPrefix + ".axis").c_str(),
+                po::value<std::vector<std::string>>(&axis)->multitoken(),
+                "axis which are shown [valid values x,y,z] example: yz")(
+                (pluginPrefix + ".slicePoint").c_str(),
+                po::value<std::vector<float_32>>(&slicePoints)->multitoken(),
+                "value range: 0 <= x <= 1 , point of the slice")(
+                (pluginPrefix + ".folder").c_str(),
+                po::value<std::vector<std::string>>(&folders)->multitoken(),
+                "folder for output files");
 #else
-            desc.add_options()
-                    ((pluginPrefix).c_str(), "plugin disabled [compiled without dependency PNGwriter]");
+            desc.add_options()((pluginPrefix).c_str(), "plugin disabled [compiled without dependency PNGwriter]");
 #endif
         }
 
-        void setMappingDescription(MappingDesc *cellDescription)
+        void setMappingDescription(MappingDesc* cellDescription)
         {
             this->cellDescription = cellDescription;
         }
 
 
     private:
-
         void pluginLoad()
         {
-
-            if (0 != notifyPeriod.size())
+            if(0 != notifyPeriod.size())
             {
-                if (0 != slicePoints.size() &&
-                    0 != axis.size())
+                if(0 != slicePoints.size() && 0 != axis.size())
                 {
-                    for (int i = 0; i < (int) slicePoints.size(); ++i) /*!\todo: use vactor with max elements*/
+                    for(int i = 0; i < (int) slicePoints.size(); ++i) /*!\todo: use vactor with max elements*/
                     {
                         std::string period = getValue(notifyPeriod, i);
                         if(!period.empty())
                         {
-
-                            if (getValue(axis, i).length() == 2u)
+                            if(getValue(axis, i).length() == 2u)
                             {
                                 std::stringstream o_slicePoint;
                                 o_slicePoint << getValue(slicePoints, i);
                                 /*add default value for folder*/
-                                if (folders.empty())
+                                if(folders.empty())
                                 {
                                     folders.push_back(std::string("."));
                                 }
-                                std::string filename(pluginPrefix + "_" + getValue(axis, i) + "_" + o_slicePoint.str());
+                                std::string filename(
+                                    pluginPrefix + "_" + getValue(axis, i) + "_" + o_slicePoint.str());
                                 typename VisType::CreatorType pngCreator(filename, getValue(folders, i));
                                 /** \todo rename me: transpose is the wrong name `swivel` is better
                                  *
@@ -123,22 +125,26 @@ namespace picongpu
                                  *
                                  * example: transpose[2,1] means: use x and z from an other vector
                                  */
-                                DataSpace<DIM2 > transpose(
-                                                           charToAxisNumber(getValue(axis, i)[0]),
-                                                           charToAxisNumber(getValue(axis, i)[1])
-                                                           );
+                                DataSpace<DIM2> transpose(
+                                    charToAxisNumber(getValue(axis, i)[0]),
+                                    charToAxisNumber(getValue(axis, i)[1]));
                                 /* if simulation run in 2D ignore all xz, yz slices (we had no z direction)*/
-                                const bool isAllowed2DSlice = (simDim == DIM3) || (transpose.x() != 2 && transpose.y() != 2);
+                                const bool isAllowed2DSlice
+                                    = (simDim == DIM3) || (transpose.x() != 2 && transpose.y() != 2);
                                 const bool isSlidingWindowEnabled = MovingWindow::getInstance().isEnabled();
                                 /* if sliding window is active we are not allowed to create pngs from xz slice
                                  * This means one dimension in transpose must contain 1 (y direction)
                                  */
-                                const bool isAllowedMovingWindowSlice =
-                                    !isSlidingWindowEnabled ||
-                                    (transpose.x() == 1 || transpose.y() == 1);
-                                if( isAllowed2DSlice && isAllowedMovingWindowSlice )
+                                const bool isAllowedMovingWindowSlice
+                                    = !isSlidingWindowEnabled || (transpose.x() == 1 || transpose.y() == 1);
+                                if(isAllowed2DSlice && isAllowedMovingWindowSlice)
                                 {
-                                    VisType* tmp = new VisType(pluginName, pngCreator, period, transpose, getValue(slicePoints, i));
+                                    VisType* tmp = new VisType(
+                                        pluginName,
+                                        pngCreator,
+                                        period,
+                                        transpose,
+                                        getValue(slicePoints, i));
                                     visIO.push_back(tmp);
                                     tmp->setMappingDescription(cellDescription);
                                     tmp->init();
@@ -146,15 +152,19 @@ namespace picongpu
                                 else
                                 {
                                     if(!isAllowedMovingWindowSlice)
-                                        std::cerr << "[WARNING] You are running a simulation with moving window: png output along the axis "<<
-                                                 getValue(axis, i) << " will be ignored" << std::endl;
+                                        std::cerr << "[WARNING] You are running a simulation with moving window: png "
+                                                     "output along the axis "
+                                                  << getValue(axis, i) << " will be ignored" << std::endl;
                                     if(!isAllowed2DSlice)
-                                        std::cerr << "[WARNING] You are running a 2D simulation: png output along the axis "<<
-                                                 getValue(axis, i) << " will be ignored" << std::endl;
+                                        std::cerr
+                                            << "[WARNING] You are running a 2D simulation: png output along the axis "
+                                            << getValue(axis, i) << " will be ignored" << std::endl;
                                 }
                             }
                             else
-                                throw std::runtime_error((std::string("[Png Plugin] wrong charecter count in axis: ") + getValue(axis, i)).c_str());
+                                throw std::runtime_error(
+                                    (std::string("[Png Plugin] wrong charecter count in axis: ") + getValue(axis, i))
+                                        .c_str());
                         }
                     }
                 }
@@ -167,9 +177,7 @@ namespace picongpu
 
         void pluginUnload()
         {
-            for (typename VisPointerList::iterator iter = visIO.begin();
-                 iter != visIO.end();
-                 ++iter)
+            for(typename VisPointerList::iterator iter = visIO.begin(); iter != visIO.end(); ++iter)
             {
                 __delete(*iter);
             }
@@ -187,9 +195,10 @@ namespace picongpu
         template<class Vec>
         typename Vec::value_type getValue(Vec vec, size_t id)
         {
-            if (vec.size() == 0)
-                throw std::runtime_error("[Png Plugin] getValue is used with a parameter set with no parameters (count is 0)");
-            if (id >= vec.size())
+            if(vec.size() == 0)
+                throw std::runtime_error(
+                    "[Png Plugin] getValue is used with a parameter set with no parameters (count is 0)");
+            if(id >= vec.size())
             {
                 return vec[vec.size() - 1];
             }
@@ -198,9 +207,9 @@ namespace picongpu
 
         int charToAxisNumber(char c)
         {
-            if (c == 'x')
+            if(c == 'x')
                 return 0;
-            if (c == 'y')
+            if(c == 'y')
                 return 1;
             return 2;
         }
@@ -216,34 +225,21 @@ namespace picongpu
         VisPointerList visIO;
 
         MappingDesc* cellDescription;
-
     };
 
-namespace particles
-{
-namespace traits
-{
-    template<
-        typename T_Species,
-        typename T_VisClass
-    >
-    struct SpeciesEligibleForSolver<
-        T_Species,
-        PngPlugin< T_VisClass >
-    >
+    namespace particles
     {
-        using FrameType = typename T_Species::FrameType;
+        namespace traits
+        {
+            template<typename T_Species, typename T_VisClass>
+            struct SpeciesEligibleForSolver<T_Species, PngPlugin<T_VisClass>>
+            {
+                using FrameType = typename T_Species::FrameType;
 
-        using RequiredIdentifiers = MakeSeq_t<
-            weighting
-        >;
+                using RequiredIdentifiers = MakeSeq_t<weighting>;
 
-        using type = typename pmacc::traits::HasIdentifiers<
-            FrameType,
-            RequiredIdentifiers
-        >::type;
-    };
-} // namespace traits
-} // namespace particles
+                using type = typename pmacc::traits::HasIdentifiers<FrameType, RequiredIdentifiers>::type;
+            };
+        } // namespace traits
+    } // namespace particles
 } // namespace picongpu
-
diff --git a/include/picongpu/plugins/PositionsParticles.hpp b/include/picongpu/plugins/PositionsParticles.hpp
index f0201e19f6..b5778849d2 100644
--- a/include/picongpu/plugins/PositionsParticles.hpp
+++ b/include/picongpu/plugins/PositionsParticles.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Felix Schmitt, Heiko Burau, Rene Widera,
+/* Copyright 2013-2021 Axel Huebl, Felix Schmitt, Heiko Burau, Rene Widera,
  *                     Benjamin Worpitz, Richard Pausch
  *
  * This file is part of PIConGPU.
@@ -39,295 +39,262 @@
 
 namespace picongpu
 {
-using namespace pmacc;
+    using namespace pmacc;
 
-namespace po = boost::program_options;
+    namespace po = boost::program_options;
 
-template<class FloatPos>
-struct SglParticle
-{
-    FloatPos position;
-    float3_X momentum;
-    float_X mass;
-    float_X weighting;
-    float_X charge;
-    float_X gamma;
-
-    SglParticle() : position(FloatPos::create(0.0)), momentum(float3_X::create(0.0)), mass(0.0),
-        weighting(0.0), charge(0.0), gamma(0.0)
+    template<class FloatPos>
+    struct SglParticle
     {
-    }
+        FloatPos position;
+        float3_X momentum;
+        float_X mass;
+        float_X weighting;
+        float_X charge;
+        float_X gamma;
+
+        SglParticle()
+            : position(FloatPos::create(0.0))
+            , momentum(float3_X::create(0.0))
+            , mass(0.0)
+            , weighting(0.0)
+            , charge(0.0)
+            , gamma(0.0)
+        {
+        }
 
-    DataSpace<simDim> globalCellOffset;
+        DataSpace<simDim> globalCellOffset;
 
-    //! todo
+        //! todo
 
-    floatD_64 getGlobalCell() const
-    {
-        floatD_64 doubleGlobalCellOffset;
-        for(uint32_t i=0;i<simDim;++i)
-            doubleGlobalCellOffset[i]=float_64(globalCellOffset[i]);
+        floatD_64 getGlobalCell() const
+        {
+            floatD_64 doubleGlobalCellOffset;
+            for(uint32_t i = 0; i < simDim; ++i)
+                doubleGlobalCellOffset[i] = float_64(globalCellOffset[i]);
 
-        return floatD_64( doubleGlobalCellOffset + precisionCast<float_64>(position));
-    }
+            return floatD_64(doubleGlobalCellOffset + precisionCast<float_64>(position));
+        }
 
-    template<typename T>
+        template<typename T>
         friend std::ostream& operator<<(std::ostream& out, const SglParticle<T>& v)
-    {
-        floatD_64 pos;
-        for(uint32_t i=0;i<simDim;++i)
-            pos[i]=( v.getGlobalCell()[i] * cellSize[i]*UNIT_LENGTH);
-
-        const float3_64 mom( precisionCast<float_64>(v.momentum.x()) * UNIT_MASS * UNIT_SPEED,
-                             precisionCast<float_64>(v.momentum.y()) * UNIT_MASS * UNIT_SPEED,
-                             precisionCast<float_64>(v.momentum.z()) * UNIT_MASS * UNIT_SPEED );
-
-        const float_64 mass = precisionCast<float_64>(v.mass) * UNIT_MASS;
-        const float_64 charge = precisionCast<float_64>(v.charge) * UNIT_CHARGE;
-
-        using dbl = std::numeric_limits<float_64>;
-        out.precision(dbl::digits10);
-
-        out << std::scientific << pos << " " << mom << " " << mass << " "
-            << precisionCast<float_64>(v.weighting)
-            << " " << charge << " " << precisionCast<float_64>(v.gamma);
-        return out;
-    }
-};
-
-/** write the position of a single particle to a file
- * \warning this plugin MUST NOT be used with more than one (global!)
- * particle and is created for one-particle-test-purposes only
- */
-struct KernelPositionsParticles
-{
-    template<
-        typename ParBox,
-        typename FloatPos,
-        typename Mapping,
-        typename T_Acc
-    >
-    DINLINE void operator()(
-        T_Acc const & acc,
-        ParBox pb,
-        SglParticle<FloatPos>* gParticle,
-        Mapping mapper
-    ) const
-    {
+        {
+            floatD_64 pos;
+            for(uint32_t i = 0; i < simDim; ++i)
+                pos[i] = (v.getGlobalCell()[i] * cellSize[i] * UNIT_LENGTH);
 
-        using FramePtr = typename ParBox::FramePtr;
-        PMACC_SMEM( acc, frame, FramePtr );
+            const float3_64 mom(
+                precisionCast<float_64>(v.momentum.x()) * UNIT_MASS * UNIT_SPEED,
+                precisionCast<float_64>(v.momentum.y()) * UNIT_MASS * UNIT_SPEED,
+                precisionCast<float_64>(v.momentum.z()) * UNIT_MASS * UNIT_SPEED);
 
+            const float_64 mass = precisionCast<float_64>(v.mass) * UNIT_MASS;
+            const float_64 charge = precisionCast<float_64>(v.charge) * UNIT_CHARGE;
 
-        using SuperCellSize = typename Mapping::SuperCellSize;
+            using dbl = std::numeric_limits<float_64>;
+            out.precision(dbl::digits10);
 
-        const DataSpace<simDim > threadIndex(threadIdx);
-        const int linearThreadIdx = DataSpaceOperations<simDim>::template map<SuperCellSize > (threadIndex);
-        const DataSpace<simDim> superCellIdx(mapper.getSuperCellIndex(DataSpace<simDim > (blockIdx)));
+            out << std::scientific << pos << " " << mom << " " << mass << " " << precisionCast<float_64>(v.weighting)
+                << " " << charge << " " << precisionCast<float_64>(v.gamma);
+            return out;
+        }
+    };
 
-        if (linearThreadIdx == 0)
+    /** write the position of a single particle to a file
+     * \warning this plugin MUST NOT be used with more than one (global!)
+     * particle and is created for one-particle-test-purposes only
+     */
+    struct KernelPositionsParticles
+    {
+        template<typename ParBox, typename FloatPos, typename Mapping, typename T_Acc>
+        DINLINE void operator()(T_Acc const& acc, ParBox pb, SglParticle<FloatPos>* gParticle, Mapping mapper) const
         {
-            frame = pb.getLastFrame(superCellIdx);
-        }
+            using FramePtr = typename ParBox::FramePtr;
+            PMACC_SMEM(acc, frame, FramePtr);
 
-        __syncthreads();
-        if (!frame.isValid())
-            return; //end kernel if we have no frames
 
-        /* BUGFIX to issue #538
-         * volatile prohibits that the compiler creates wrong code*/
-        volatile bool isParticle = frame[linearThreadIdx][multiMask_];
+            using SuperCellSize = typename Mapping::SuperCellSize;
 
-        while (frame.isValid())
-        {
-            if (isParticle)
-            {
-                auto particle = frame[linearThreadIdx];
-                gParticle->position = particle[position_];
-                gParticle->momentum = particle[momentum_];
-                gParticle->weighting = particle[weighting_];
-                gParticle->mass = attribute::getMass(gParticle->weighting,particle);
-                gParticle->charge = attribute::getCharge(gParticle->weighting,particle);
-                gParticle->gamma = Gamma<>()(gParticle->momentum, gParticle->mass);
+            const DataSpace<simDim> threadIndex(cupla::threadIdx(acc));
+            const int linearThreadIdx = DataSpaceOperations<simDim>::template map<SuperCellSize>(threadIndex);
+            const DataSpace<simDim> superCellIdx(mapper.getSuperCellIndex(DataSpace<simDim>(cupla::blockIdx(acc))));
 
-                // storage number in the actual frame
-                const lcellId_t frameCellNr = particle[localCellIdx_];
+            if(linearThreadIdx == 0)
+            {
+                frame = pb.getLastFrame(superCellIdx);
+            }
 
-                // offset in the actual superCell = cell offset in the supercell
-                const DataSpace<simDim> frameCellOffset(DataSpaceOperations<simDim>::template map<MappingDesc::SuperCellSize > (frameCellNr));
+            cupla::__syncthreads(acc);
+            if(!frame.isValid())
+                return; // end kernel if we have no frames
 
+            /* BUGFIX for issue #538:
+             * volatile prevents the compiler from generating wrong code */
+            volatile bool isParticle = frame[linearThreadIdx][multiMask_];
 
-                gParticle->globalCellOffset = (superCellIdx - mapper.getGuardingSuperCells())
-                    * MappingDesc::SuperCellSize::toRT()
-                    + frameCellOffset;
-            }
-            __syncthreads();
-            if (linearThreadIdx == 0)
+            while(frame.isValid())
             {
-                frame = pb.getPreviousFrame(frame);
+                if(isParticle)
+                {
+                    auto particle = frame[linearThreadIdx];
+                    gParticle->position = particle[position_];
+                    gParticle->momentum = particle[momentum_];
+                    gParticle->weighting = particle[weighting_];
+                    gParticle->mass = attribute::getMass(gParticle->weighting, particle);
+                    gParticle->charge = attribute::getCharge(gParticle->weighting, particle);
+                    gParticle->gamma = Gamma<>()(gParticle->momentum, gParticle->mass);
+
+                    // storage number in the current frame
+                    const lcellId_t frameCellNr = particle[localCellIdx_];
+
+                    // offset in the current superCell = cell offset in the supercell
+                    const DataSpace<simDim> frameCellOffset(
+                        DataSpaceOperations<simDim>::template map<MappingDesc::SuperCellSize>(frameCellNr));
+
+
+                    gParticle->globalCellOffset
+                        = (superCellIdx - mapper.getGuardingSuperCells()) * MappingDesc::SuperCellSize::toRT()
+                        + frameCellOffset;
+                }
+                cupla::__syncthreads(acc);
+                if(linearThreadIdx == 0)
+                {
+                    frame = pb.getPreviousFrame(frame);
+                }
+                isParticle = true;
+                cupla::__syncthreads(acc);
             }
-            isParticle = true;
-            __syncthreads();
         }
+    };
 
-    }
-};
-
-template<class ParticlesType>
-class PositionsParticles : public ILightweightPlugin
-{
-private:
-    typedef MappingDesc::SuperCellSize SuperCellSize;
-    typedef floatD_X FloatPos;
-
-    GridBuffer<SglParticle<FloatPos>, DIM1> *gParticle;
+    template<class ParticlesType>
+    class PositionsParticles : public ILightweightPlugin
+    {
+    private:
+        typedef MappingDesc::SuperCellSize SuperCellSize;
+        typedef floatD_X FloatPos;
 
-    MappingDesc *cellDescription;
-    std::string notifyPeriod;
+        GridBuffer<SglParticle<FloatPos>, DIM1>* gParticle;
 
-    std::string pluginName;
-    std::string pluginPrefix;
+        MappingDesc* cellDescription;
+        std::string notifyPeriod;
 
-public:
+        std::string pluginName;
+        std::string pluginPrefix;
 
-    PositionsParticles() :
-    pluginName("PositionsParticles: write position of one particle of a species to std::cout"),
-    pluginPrefix(ParticlesType::FrameType::getName() + std::string("_position")),
-    gParticle(nullptr),
-    cellDescription(nullptr)
-    {
+    public:
+        PositionsParticles()
+            : pluginName("PositionsParticles: write position of one particle of a species to std::cout")
+            , pluginPrefix(ParticlesType::FrameType::getName() + std::string("_position"))
+            , gParticle(nullptr)
+            , cellDescription(nullptr)
+        {
+            Environment<>::get().PluginConnector().registerPlugin(this);
+        }
 
-        Environment<>::get().PluginConnector().registerPlugin(this);
-    }
+        virtual ~PositionsParticles()
+        {
+        }
 
-    virtual ~PositionsParticles()
-    {
-    }
+        void notify(uint32_t currentStep)
+        {
+            const int rank = Environment<simDim>::get().GridController().getGlobalRank();
+            const SglParticle<FloatPos> positionParticle = getPositionsParticles<CORE + BORDER>(currentStep);
+
+            /*FORMAT OUTPUT*/
+            if(positionParticle.mass != float_X(0.0))
+                std::cout << "[ANALYSIS] [" << rank << "] [COUNTER] [" << pluginPrefix << "] [" << currentStep << "] "
+                          << std::setprecision(16) << float_64(currentStep) * SI::DELTA_T_SI << " " << positionParticle
+                          << "\n"; // no flush
+        }
 
-    void notify(uint32_t currentStep)
-    {
-        const int rank = Environment<simDim>::get().GridController().getGlobalRank();
-        const SglParticle<FloatPos> positionParticle = getPositionsParticles < CORE + BORDER > (currentStep);
+        void pluginRegisterHelp(po::options_description& desc)
+        {
+            desc.add_options()(
+                (pluginPrefix + ".period").c_str(),
+                po::value<std::string>(&notifyPeriod),
+                "enable plugin [for each n-th step]");
+        }
 
-        /*FORMAT OUTPUT*/
-        if (positionParticle.mass != float_X(0.0))
-            std::cout << "[ANALYSIS] [" << rank << "] [COUNTER] [" << pluginPrefix << "] [" << currentStep << "] "
-            << std::setprecision(16) << float_64(currentStep) * SI::DELTA_T_SI << " "
-            << positionParticle << "\n"; // no flush
-    }
+        std::string pluginGetName() const
+        {
+            return pluginName;
+        }
 
-    void pluginRegisterHelp(po::options_description& desc)
-    {
-        desc.add_options()
-            ((pluginPrefix + ".period").c_str(),
-             po::value<std::string> (&notifyPeriod), "enable plugin [for each n-th step]");
-    }
+        void setMappingDescription(MappingDesc* cellDescription)
+        {
+            this->cellDescription = cellDescription;
+        }
 
-    std::string pluginGetName() const
-    {
-        return pluginName;
-    }
+    private:
+        void pluginLoad()
+        {
+            if(!notifyPeriod.empty())
+            {
+                // create a one-element SglParticle buffer on device (gpu) and host
+                gParticle = new GridBuffer<SglParticle<FloatPos>, DIM1>(DataSpace<DIM1>(1));
 
-    void setMappingDescription(MappingDesc *cellDescription)
-    {
-        this->cellDescription = cellDescription;
-    }
+                Environment<>::get().PluginConnector().setNotificationPeriod(this, notifyPeriod);
+            }
+        }
 
-private:
+        void pluginUnload()
+        {
+            __delete(gParticle);
+        }
 
-    void pluginLoad()
-    {
-        if(!notifyPeriod.empty())
+        template<uint32_t AREA>
+        SglParticle<FloatPos> getPositionsParticles(uint32_t currentStep)
         {
-            //create one float3_X on gpu und host
-            gParticle = new GridBuffer<SglParticle<FloatPos>, DIM1 > (DataSpace<DIM1 > (1));
+            typedef typename MappingDesc::SuperCellSize SuperCellSize;
+            SglParticle<FloatPos> positionParticleTmp;
 
-            Environment<>::get().PluginConnector().setNotificationPeriod(this, notifyPeriod);
-        }
-    }
+            DataConnector& dc = Environment<>::get().DataConnector();
+            auto particles = dc.get<ParticlesType>(ParticlesType::FrameType::getName(), true);
 
-    void pluginUnload()
-    {
-        __delete(gParticle);
-    }
+            gParticle->getDeviceBuffer().setValue(positionParticleTmp);
+            auto block = SuperCellSize::toRT();
 
-    template< uint32_t AREA>
-    SglParticle<FloatPos> getPositionsParticles(uint32_t currentStep)
-    {
-        typedef typename MappingDesc::SuperCellSize SuperCellSize;
-        SglParticle<FloatPos> positionParticleTmp;
+            AreaMapping<AREA, MappingDesc> mapper(*cellDescription);
+            PMACC_KERNEL(KernelPositionsParticles{})
+            (mapper.getGridDim(),
+             block)(particles->getDeviceParticlesBox(), gParticle->getDeviceBuffer().getBasePointer(), mapper);
 
-        DataConnector &dc = Environment<>::get().DataConnector();
-        auto particles = dc.get< ParticlesType >( ParticlesType::FrameType::getName(), true );
+            dc.releaseData(ParticlesType::FrameType::getName());
+            gParticle->deviceToHost();
 
-        gParticle->getDeviceBuffer().setValue(positionParticleTmp);
-        auto block = SuperCellSize::toRT();
+            DataSpace<simDim> localSize(cellDescription->getGridLayout().getDataSpaceWithoutGuarding());
+            const uint32_t numSlides = MovingWindow::getInstance().getSlideCounter(currentStep);
 
-        AreaMapping<AREA, MappingDesc> mapper(*cellDescription);
-        PMACC_KERNEL(KernelPositionsParticles{})
-            (mapper.getGridDim(), block)
-            (particles->getDeviceParticlesBox(),
-             gParticle->getDeviceBuffer().getBasePointer(),
-             mapper);
+            DataSpace<simDim> gpuPhyCellOffset(Environment<simDim>::get().SubGrid().getLocalDomain().offset);
+            gpuPhyCellOffset.y() += (localSize.y() * numSlides);
 
-        dc.releaseData( ParticlesType::FrameType::getName() );
-        gParticle->deviceToHost();
+            gParticle->getHostBuffer().getDataBox()[0].globalCellOffset += gpuPhyCellOffset;
 
-        DataSpace<simDim> localSize(cellDescription->getGridLayout().getDataSpaceWithoutGuarding());
-        const uint32_t numSlides = MovingWindow::getInstance().getSlideCounter(currentStep);
 
-        DataSpace<simDim> gpuPhyCellOffset(Environment<simDim>::get().SubGrid().getLocalDomain().offset);
-        gpuPhyCellOffset.y() += (localSize.y() * numSlides);
+            return gParticle->getHostBuffer().getDataBox()[0];
+        }
+    };
 
-        gParticle->getHostBuffer().getDataBox()[0].globalCellOffset += gpuPhyCellOffset;
+    namespace particles
+    {
+        namespace traits
+        {
+            template<typename T_Species, typename T_UnspecifiedSpecies>
+            struct SpeciesEligibleForSolver<T_Species, PositionsParticles<T_UnspecifiedSpecies>>
+            {
+                using FrameType = typename T_Species::FrameType;
 
+                using RequiredIdentifiers = MakeSeq_t<weighting, momentum, position<>>;
 
-        return gParticle->getHostBuffer().getDataBox()[0];
-    }
+                using SpeciesHasIdentifiers =
+                    typename pmacc::traits::HasIdentifiers<FrameType, RequiredIdentifiers>::type;
 
-};
+                using SpeciesHasMass = typename pmacc::traits::HasFlag<FrameType, massRatio<>>::type;
+                using SpeciesHasCharge = typename pmacc::traits::HasFlag<FrameType, chargeRatio<>>::type;
 
-namespace particles
-{
-namespace traits
-{
-    template<
-        typename T_Species,
-        typename T_UnspecifiedSpecies
-    >
-    struct SpeciesEligibleForSolver<
-        T_Species,
-        PositionsParticles< T_UnspecifiedSpecies >
-    >
-    {
-        using FrameType = typename T_Species::FrameType;
-
-        using RequiredIdentifiers = MakeSeq_t<
-            weighting,
-            momentum,
-            position<>
-        >;
-
-        using SpeciesHasIdentifiers = typename pmacc::traits::HasIdentifiers<
-            FrameType,
-            RequiredIdentifiers
-        >::type;
-
-        using SpeciesHasMass = typename pmacc::traits::HasFlag<
-            FrameType,
-            massRatio<>
-        >::type;
-        using SpeciesHasCharge = typename pmacc::traits::HasFlag<
-            FrameType,
-            chargeRatio<>
-        >::type;
-
-        using type = typename bmpl::and_<
-            SpeciesHasIdentifiers,
-            SpeciesHasMass,
-            SpeciesHasCharge
-        >;
-    };
-} // namespace traits
-} // namespace particles
+                using type = typename bmpl::and_<SpeciesHasIdentifiers, SpeciesHasMass, SpeciesHasCharge>;
+            };
+        } // namespace traits
+    } // namespace particles
 } // namespace picongpu
diff --git a/include/picongpu/plugins/ResourceLog.cpp b/include/picongpu/plugins/ResourceLog.cpp
index 6fafbd1f8b..5fc24c8e8a 100644
--- a/include/picongpu/plugins/ResourceLog.cpp
+++ b/include/picongpu/plugins/ResourceLog.cpp
@@ -1,4 +1,4 @@
-/* Copyright 2016-2020 Erik Zenker, Axel Huebl
+/* Copyright 2016-2021 Erik Zenker, Axel Huebl
  *
  * This file is part of PMacc.
  *
@@ -25,9 +25,9 @@
 #include <boost/property_tree/xml_parser.hpp>
 
 // STL
-#include <string>    /* std::string */
-#include <sstream>   /* std::stringstream */
-#include <map>       /* std::map */
+#include <string> /* std::string */
+#include <sstream> /* std::stringstream */
+#include <map> /* std::map */
 #include <stdexcept> /* std::runtime_error */
 
 // C LIB
@@ -36,55 +36,45 @@
 
 namespace picongpu
 {
-namespace detail
-{
-    std::string
-    writeMapToPropertyTree(
-        std::map< std::string, size_t > valueMap,
-        std::string outputFormat
-    )
+    namespace detail
     {
-        // Create property tree which contains the resource information
-        using boost::property_tree::ptree;
-        ptree pt;
+        std::string writeMapToPropertyTree(std::map<std::string, size_t> valueMap, std::string outputFormat)
+        {
+            // Create property tree which contains the resource information
+            using boost::property_tree::ptree;
+            ptree pt;
 
-        for( auto it = valueMap.begin(); it != valueMap.end(); ++it ) {
-            pt.put( it->first, it->second );
-        }
+            for(auto it = valueMap.begin(); it != valueMap.end(); ++it)
+            {
+                pt.put(it->first, it->second);
+            }
 
-        // Write property tree to string stream
-        std::stringstream ss;
-        if( outputFormat == "json" )
-        {
-            write_json( ss, pt, false );
-        }
-        else if( outputFormat == "jsonpp" )
-        {
-            write_json( ss, pt, true );
-        }
-        else if( outputFormat == "xml" )
-        {
-            write_xml( ss, pt );
-        }
-        else if( outputFormat == "xmlpp" )
-        {
-            write_xml(
-                ss,
-                pt,
-                boost::property_tree::xml_writer_make_settings< std::string >( '\t', 1 )
-            );
-        }
-        else
-        {
-            throw std::runtime_error(
-                std::string( "resourcelog.format " ) +
-                outputFormat +
-                std::string( " is not known, use json or xml." )
-            );
-        }
+            // Write property tree to string stream
+            std::stringstream ss;
+            if(outputFormat == "json")
+            {
+                write_json(ss, pt, false);
+            }
+            else if(outputFormat == "jsonpp")
+            {
+                write_json(ss, pt, true);
+            }
+            else if(outputFormat == "xml")
+            {
+                write_xml(ss, pt);
+            }
+            else if(outputFormat == "xmlpp")
+            {
+                write_xml(ss, pt, boost::property_tree::xml_writer_make_settings<std::string>('\t', 1));
+            }
+            else
+            {
+                throw std::runtime_error(
+                    std::string("resourcelog.format ") + outputFormat
+                    + std::string(" is not known, use json or xml."));
+            }
 
-        return ss.str();
-    }
-} // namespace detail
+            return ss.str();
+        }
+    } // namespace detail
 } // namespace picongpu
-
diff --git a/include/picongpu/plugins/ResourceLog.hpp b/include/picongpu/plugins/ResourceLog.hpp
index 878e88b8f8..76228f36c4 100644
--- a/include/picongpu/plugins/ResourceLog.hpp
+++ b/include/picongpu/plugins/ResourceLog.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2016-2020 Erik Zenker
+/* Copyright 2016-2021 Erik Zenker
  *
  * This file is part of PMacc.
  *
@@ -35,12 +35,12 @@
 #include <boost/filesystem.hpp>
 
 // STL
-#include <iostream>  /* std::cout, std::ostream */
-#include <numeric>   /* std::accumulate */
-#include <string>    /* std::string */
-#include <sstream>   /* std::stringstream */
-#include <fstream>   /* std::filebuf */
-#include <map>       /* std::map */
+#include <iostream> /* std::cout, std::ostream */
+#include <numeric> /* std::accumulate */
+#include <string> /* std::string */
+#include <sstream> /* std::stringstream */
+#include <fstream> /* std::filebuf */
+#include <map> /* std::map */
 #include <algorithm> /* std::accumulate */
 
 // C LIB
@@ -52,19 +52,15 @@ namespace picongpu
 {
     using namespace pmacc; /** @todo do not pull into global (header) scope */
 
-namespace detail
-{
-    std::string
-    writeMapToPropertyTree(
-        std::map<std::string, size_t> valueMap,
-        std::string outputFormat
-    );
-}
+    namespace detail
+    {
+        std::string writeMapToPropertyTree(std::map<std::string, size_t> valueMap, std::string outputFormat);
+    }
 
     class ResourceLog : public ILightweightPlugin
     {
     private:
-        MappingDesc *cellDescription;
+        MappingDesc* cellDescription;
         ResourceMonitor<simDim> resourceMonitor;
 
         // programm options
@@ -77,9 +73,7 @@ namespace detail
         std::map<std::string, bool> propertyMap;
 
     public:
-
-        ResourceLog() :
-                cellDescription(NULL)
+        ResourceLog() : cellDescription(NULL)
         {
             Environment<>::get().PluginConnector().registerPlugin(this);
         }
@@ -94,13 +88,14 @@ namespace detail
             std::map<std::string, size_t> valueMap;
 
             if(contains(propertyMap, "rank"))
-                valueMap["resourceLog.rank"] = static_cast<size_t>(Environment<simDim>::get().GridController().getGlobalRank());
+                valueMap["resourceLog.rank"]
+                    = static_cast<size_t>(Environment<simDim>::get().GridController().getGlobalRank());
 
-            if(contains(propertyMap,"position"))
+            if(contains(propertyMap, "position"))
             {
                 auto const currentPosition = Environment<simDim>::get().GridController().getPosition();
                 char const axisName[] = {'x', 'y', 'z'};
-                for( size_t d = 0; d < simDim; ++d )
+                for(size_t d = 0; d < simDim; ++d)
                     valueMap[std::string("resourceLog.position.") + axisName[d]] = currentPosition[d];
             }
 
@@ -110,17 +105,19 @@ namespace detail
             if(contains(propertyMap, "cellCount"))
                 valueMap["resourceLog.cellCount"] = resourceMonitor.getCellCount();
 
-            if(contains(propertyMap,"particleCount"))
+            if(contains(propertyMap, "particleCount"))
             {
                 // enforce that the filter interface is fulfilled
-                particles::filter::IUnary< particles::filter::All > parFilter{ currentStep };
-                std::vector<size_t> particleCounts = resourceMonitor.getParticleCounts<VectorAllSpecies>(*cellDescription, parFilter );
-                valueMap["resourceLog.particleCount"] = std::accumulate(particleCounts.begin(), particleCounts.end(), 0);
+                particles::filter::IUnary<particles::filter::All> parFilter{currentStep};
+                std::vector<size_t> particleCounts
+                    = resourceMonitor.getParticleCounts<VectorAllSpecies>(*cellDescription, parFilter);
+                valueMap["resourceLog.particleCount"]
+                    = std::accumulate(particleCounts.begin(), particleCounts.end(), 0);
             }
 
             //
             // Write property tree to a string
-            std::string properties = ::picongpu::detail::writeMapToPropertyTree( valueMap, outputFormat );
+            std::string properties = ::picongpu::detail::writeMapToPropertyTree(valueMap, outputFormat);
 
             //
             // Write property tree to the output stream
@@ -128,38 +125,45 @@ namespace detail
             {
                 std::cout << properties;
             }
-            else if (streamType == "stderr")
+            else if(streamType == "stderr")
             {
                 std::cerr << properties;
             }
-            else if (streamType == "file")
+            else if(streamType == "file")
             {
                 std::ostream os(&fileBuf);
                 os << properties;
             }
             else
             {
-                throw std::runtime_error(std::string("resourcelog.stream ") + streamType + std::string(" is not known, use stdout, stderr or file instead."));
+                throw std::runtime_error(
+                    std::string("resourcelog.stream ") + streamType
+                    + std::string(" is not known, use stdout, stderr or file instead."));
             }
         }
 
         void pluginRegisterHelp(po::options_description& desc)
         {
             /* register command line parameters for your plugin */
-            desc.add_options()
-                    ("resourceLog.period", po::value<std::string>(&notifyPeriod),
-                     "Enable ResourceLog plugin [for each n-th step]")
-                    ("resourceLog.prefix", po::value<std::string>(&outputFilePrefix)->default_value("resourceLog_"),
-                     "Set the filename prefix for output file if a filestream was selected")
-                    ("resourceLog.stream", po::value<std::string>(&streamType)->default_value("file"),
-                     "Output stream [stdout, stderr, file]")
-                    ("resourceLog.properties", po::value<std::vector<std::string> >(&properties)->multitoken(),
-                     "List of properties to log [rank, position, currentStep, cellCount, particleCount]")
-                    ("resourceLog.format", po::value<std::string>(&outputFormat)->default_value("json"),
-                     "Output format of log (pp for pretty print) [json, jsonpp, xml, xmlpp]");
+            desc.add_options()(
+                "resourceLog.period",
+                po::value<std::string>(&notifyPeriod),
+                "Enable ResourceLog plugin [for each n-th step]")(
+                "resourceLog.prefix",
+                po::value<std::string>(&outputFilePrefix)->default_value("resourceLog_"),
+                "Set the filename prefix for output file if a filestream was selected")(
+                "resourceLog.stream",
+                po::value<std::string>(&streamType)->default_value("file"),
+                "Output stream [stdout, stderr, file]")(
+                "resourceLog.properties",
+                po::value<std::vector<std::string>>(&properties)->multitoken(),
+                "List of properties to log [rank, position, currentStep, cellCount, particleCount]")(
+                "resourceLog.format",
+                po::value<std::string>(&outputFormat)->default_value("json"),
+                "Output format of log (pp for pretty print) [json, jsonpp, xml, xmlpp]");
         }
 
-        void setMappingDescription(MappingDesc *cellDescription)
+        void setMappingDescription(MappingDesc* cellDescription)
         {
             this->cellDescription = cellDescription;
         }
@@ -167,12 +171,15 @@ namespace detail
     private:
         std::string notifyPeriod;
 
-        void pluginLoad() {
-            if(!notifyPeriod.empty()) {
+        void pluginLoad()
+        {
+            if(!notifyPeriod.empty())
+            {
                 Environment<>::get().PluginConnector().setNotificationPeriod(this, notifyPeriod);
 
                 // Set default resources to log
-                if (properties.empty()) {
+                if(properties.empty())
+                {
                     properties.push_back("rank");
                     properties.push_back("position");
                     properties.push_back("currentStep");
@@ -184,14 +191,17 @@ namespace detail
                     propertyMap["particleCount"] = true;
                     propertyMap["cellCount"] = true;
                 }
-                else {
-                    for (size_t i = 0; i < properties.size(); ++i) {
+                else
+                {
+                    for(size_t i = 0; i < properties.size(); ++i)
+                    {
                         propertyMap[properties[i]] = true;
                     }
                 }
 
                 // Prepare file for output stream
-                if (streamType == "file") {
+                if(streamType == "file")
+                {
                     size_t rank = static_cast<size_t>(Environment<simDim>::get().GridController().getGlobalRank());
                     std::stringstream ss;
                     ss << outputFilePrefix << rank;
@@ -204,19 +214,19 @@ namespace detail
 
         void pluginUnload()
         {
-            if(fileBuf.is_open()){
+            if(fileBuf.is_open())
+            {
                 fileBuf.close();
             }
             /* called when plugin is unloaded, cleanup here */
         }
 
-        template <typename T_MAP>
+        template<typename T_MAP>
         bool contains(T_MAP const map, std::string const value)
         {
             return (map.find(value) != map.end());
         }
-
     };
-}
+} // namespace picongpu
 
 #include <pmacc/mappings/simulation/ResourceMonitor.tpp>
diff --git a/include/picongpu/plugins/SliceFieldPrinter.hpp b/include/picongpu/plugins/SliceFieldPrinter.hpp
index 161248425c..fe91e1165d 100644
--- a/include/picongpu/plugins/SliceFieldPrinter.hpp
+++ b/include/picongpu/plugins/SliceFieldPrinter.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera, Felix Schmitt,
+/* Copyright 2013-2021 Heiko Burau, Rene Widera, Felix Schmitt,
  *                     Richard Pausch
  *
  * This file is part of PIConGPU.
@@ -28,40 +28,43 @@
 
 namespace picongpu
 {
+    using namespace pmacc;
 
-using namespace pmacc;
+    namespace po = boost::program_options;
 
-namespace po = boost::program_options;
+    template<typename Field>
+    class SliceFieldPrinterMulti;
 
-template<typename Field>
-class SliceFieldPrinterMulti;
+    template<typename Field>
+    class SliceFieldPrinter : public ILightweightPlugin
+    {
+    private:
+        std::string notifyPeriod;
+        bool sliceIsOK;
+        std::string fileName;
+        int plane;
+        float_X slicePoint;
+        MappingDesc* cellDescription;
+        container::DeviceBuffer<float3_64, simDim - 1>* dBuffer_SI;
 
-template<typename Field>
-class SliceFieldPrinter : public ILightweightPlugin
-{
-private:
-    std::string notifyPeriod;
-    bool sliceIsOK;
-    std::string fileName;
-    int plane;
-    float_X slicePoint;
-    MappingDesc *cellDescription;
-    container::DeviceBuffer<float3_64, simDim-1>* dBuffer_SI;
+        void pluginLoad();
+        void pluginUnload();
 
-    void pluginLoad();
-    void pluginUnload();
+        template<typename TField>
+        void printSlice(const TField& field, int nAxis, float slicePoint, std::string filename);
 
-    template<typename TField>
-    void printSlice(const TField& field, int nAxis, float slicePoint, std::string filename);
+        friend class SliceFieldPrinterMulti<Field>;
 
-    friend class SliceFieldPrinterMulti<Field>;
-public:
-    void notify(uint32_t currentStep);
-    std::string pluginGetName() const;
-    void pluginRegisterHelp(po::options_description& desc);
-    void setMappingDescription(MappingDesc* desc) {this->cellDescription = desc;}
-};
+    public:
+        void notify(uint32_t currentStep);
+        std::string pluginGetName() const;
+        void pluginRegisterHelp(po::options_description& desc);
+        void setMappingDescription(MappingDesc* desc)
+        {
+            this->cellDescription = desc;
+        }
+    };
 
-}
+} // namespace picongpu
 
 #include "SliceFieldPrinter.tpp"
diff --git a/include/picongpu/plugins/SliceFieldPrinter.tpp b/include/picongpu/plugins/SliceFieldPrinter.tpp
index bbe336ed23..29cce64e36 100644
--- a/include/picongpu/plugins/SliceFieldPrinter.tpp
+++ b/include/picongpu/plugins/SliceFieldPrinter.tpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera, Felix Schmitt,
+/* Copyright 2013-2021 Heiko Burau, Rene Widera, Felix Schmitt,
  *                     Richard Pausch
  *
  * This file is part of PIConGPU.
@@ -41,148 +41,149 @@
 
 namespace picongpu
 {
-
-namespace SliceFieldPrinterHelper
-{
-template<class Field>
-class ConversionFunctor
-{
-public:
-    /* convert field data to higher precision and convert to SI units on GPUs */
-    template< typename T_Acc >
-    DINLINE void operator()(
-        T_Acc const & acc,
-        float3_64& target,
-        const typename Field::ValueType fieldData
-    ) const
+    namespace SliceFieldPrinterHelper
     {
-      target = precisionCast<float_64>(fieldData) *  float_64((Field::getUnit())[0]) ;
+        template<class Field>
+        class ConversionFunctor // functor handed to RT::Foreach in SliceFieldPrinter::printSlice()
+        {
+        public:
+            /* cast fieldData to float_64 precision, scale it by Field::getUnit()[0] (SI units), store it in target */
+            template<typename T_Acc>
+            DINLINE void operator()(T_Acc const& acc, float3_64& target, const typename Field::ValueType fieldData)
+                const // acc is not used in the body
+            {
+                target = precisionCast<float_64>(fieldData) * float_64((Field::getUnit())[0]); // NOTE(review): all components use unit[0]
+            }
+        };
+    } // end namespace SliceFieldPrinterHelper
+
+
+    template<typename Field>
+    void SliceFieldPrinter<Field>::pluginLoad()
+    {
+        if(float_X(0.0) <= slicePoint && slicePoint <= float_X(1.0))
+        {
+            /* slice point lies inside [0.0, 1.0]: enable the plugin and allocate the device buffer for one slice */
+            sliceIsOK = true;
+            Environment<>::get().PluginConnector().setNotificationPeriod(this, this->notifyPeriod);
+            namespace vec = ::pmacc::math;
+            typedef SuperCellSize BlockDim;
+            // size = supercell count * supercell extent - 2 supercell extents (presumably the guards - TODO confirm)
+            vec::Size_t<simDim> size = vec::Size_t<simDim>(this->cellDescription->getGridSuperCells())
+                    * precisionCast<size_t>(BlockDim::toRT())
+                - precisionCast<size_t>(2 * BlockDim::toRT());
+            this->dBuffer_SI = new container::DeviceBuffer<float3_64, simDim - 1>(
+                size.shrink<simDim - 1>((this->plane + 1) % simDim)); // (simDim - 1)-dimensional extent of the slice
+        }
+        else
+        {
+            /* slice point lies outside [0.0, 1.0]: disable the plugin (notify() becomes a no-op) and warn on stderr */
+            sliceIsOK = false;
+            std::cerr << "In the SliceFieldPrinter plugin a slice point"
+                      << " (slice_point=" << slicePoint << ") is outside of [0.0, 1.0]. " << std::endl
+                      << "The request will be ignored. " << std::endl;
+        }
     }
-};
-} // end namespace SliceFieldPrinterHelper
-
 
-template<typename Field>
-void SliceFieldPrinter<Field>::pluginLoad()
-{
-    if( float_X(0.0) <= slicePoint && slicePoint <= float_X(1.0))
-      {
-        /* in case the slice point is inside of [0.0,1.0] */
-        sliceIsOK = true;
-        Environment<>::get().PluginConnector().setNotificationPeriod(this, this->notifyPeriod);
-        namespace vec = ::pmacc::math;
-        typedef SuperCellSize BlockDim;
-
-        vec::Size_t<simDim> size = vec::Size_t<simDim>(this->cellDescription->getGridSuperCells()) * precisionCast<size_t>(BlockDim::toRT())
-          - precisionCast<size_t>(2 * BlockDim::toRT());
-        this->dBuffer_SI = new container::DeviceBuffer<float3_64, simDim-1>(
-                        size.shrink<simDim-1>((this->plane+1)%simDim));
-      }
-    else
-      {
-        /* in case the slice point is outside of [0.0,1.0] */
-        sliceIsOK = false;
-        std::cerr << "In the SliceFieldPrinter plugin a slice point"
-                  << " (slice_point=" << slicePoint
-                  << ") is outside of [0.0, 1.0]. " << std::endl
-                  << "The request will be ignored. " << std::endl;
-      }
-}
-
-template<typename Field>
-void SliceFieldPrinter<Field>::pluginUnload()
-{
-    __delete(this->dBuffer_SI);
-}
+    template<typename Field>
+    void SliceFieldPrinter<Field>::pluginUnload()
+    {
+        __delete(this->dBuffer_SI); // NOTE(review): only allocated in pluginLoad() when sliceIsOK - confirm __delete copes
+    }
 
-template<typename Field>
-void SliceFieldPrinter<Field>::pluginRegisterHelp(po::options_description&)
-{
-    // nothing to do here
-}
+    template<typename Field>
+    void SliceFieldPrinter<Field>::pluginRegisterHelp(po::options_description&)
+    {
+        // intentionally empty: the options are registered by SliceFieldPrinterMulti::pluginRegisterHelp()
+    }
 
-template<typename Field>
-std::string SliceFieldPrinter<Field>::pluginGetName() const
-{
-    return "SliceFieldPrinter";
-}
+    template<typename Field>
+    std::string SliceFieldPrinter<Field>::pluginGetName() const
+    {
+        return "SliceFieldPrinter"; // fixed name, does not depend on Field
+    }
 
-template<typename Field>
-void SliceFieldPrinter<Field>::notify(uint32_t currentStep)
-{
-    if(sliceIsOK)
+    template<typename Field>
+    void SliceFieldPrinter<Field>::notify(uint32_t currentStep)
     {
-      namespace vec = ::pmacc::math;
-      typedef SuperCellSize BlockDim;
-      DataConnector &dc = Environment<>::get().DataConnector();
-      auto field_coreBorder =
-                 dc.get< Field >( Field::getName(), true )->getGridBuffer().
-                 getDeviceBuffer().cartBuffer().
-                 view(BlockDim::toRT(), -BlockDim::toRT());
-
-      std::ostringstream filename;
-      filename << this->fileName << "_" << currentStep << ".dat";
-      printSlice(field_coreBorder, this->plane, this->slicePoint, filename.str());
+        if(sliceIsOK) // set in pluginLoad(); false when the slice point was rejected
+        {
+            namespace vec = ::pmacc::math; // NOTE(review): alias is not used in this function
+            typedef SuperCellSize BlockDim;
+            DataConnector& dc = Environment<>::get().DataConnector();
+            auto field_coreBorder = dc.get<Field>(Field::getName(), true)
+                                        ->getGridBuffer()
+                                        .getDeviceBuffer()
+                                        .cartBuffer()
+                                        .view(BlockDim::toRT(), -BlockDim::toRT()); // presumably strips the guards - TODO confirm
+
+            std::ostringstream filename;
+            filename << this->fileName << "_" << currentStep << ".dat"; // output file: <fileName>_<currentStep>.dat
+            printSlice(field_coreBorder, this->plane, this->slicePoint, filename.str());
+        }
     }
-}
 
-template<typename Field>
-template<typename TField>
-void SliceFieldPrinter<Field>::printSlice(const TField& field, int nAxis, float slicePoint, std::string filename)
-{
-    namespace vec = pmacc::math;
+    template<typename Field>
+    template<typename TField>
+    void SliceFieldPrinter<Field>::printSlice(const TField& field, int nAxis, float slicePoint, std::string filename)
+    {
+        namespace vec = pmacc::math;
 
-    pmacc::GridController<simDim>& con = pmacc::Environment<simDim>::get().GridController();
-    vec::Size_t<simDim> gpuDim = (vec::Size_t<simDim>)con.getGpuNodes();
-    vec::Size_t<simDim> globalGridSize = gpuDim * field.size();
-    int globalPlane = globalGridSize[nAxis] * slicePoint;
-    int localPlane = globalPlane % field.size()[nAxis];
-    int gpuPlane = globalPlane / field.size()[nAxis];
+        pmacc::GridController<simDim>& con = pmacc::Environment<simDim>::get().GridController();
+        vec::Size_t<simDim> gpuDim = (vec::Size_t<simDim>) con.getGpuNodes();
+        vec::Size_t<simDim> globalGridSize = gpuDim * field.size(); // assumes every GPU holds field.size() cells
+        int globalPlane = globalGridSize[nAxis] * slicePoint; // fractional position -> global index (truncated to int)
+        int localPlane = globalPlane % field.size()[nAxis]; // plane index inside one GPU's field
+        int gpuPlane = globalPlane / field.size()[nAxis]; // NOTE(review): slicePoint == 1.0 gives gpuDim[nAxis] - confirm
 
-    vec::Int<simDim> nVector(vec::Int<simDim>::create(0));
-    nVector[nAxis] = 1;
+        vec::Int<simDim> nVector(vec::Int<simDim>::create(0));
+        nVector[nAxis] = 1; // unit vector along nAxis
 
-    zone::SphericZone<simDim> gpuGatheringZone(gpuDim, nVector * gpuPlane);
-    gpuGatheringZone.size[nAxis] = 1;
+        zone::SphericZone<simDim> gpuGatheringZone(gpuDim, nVector * gpuPlane);
+        gpuGatheringZone.size[nAxis] = 1; // zone is one GPU thick along nAxis
 
-    algorithm::mpi::Gather<simDim> gather(gpuGatheringZone);
+        algorithm::mpi::Gather<simDim> gather(gpuGatheringZone);
 
-    if(!gather.participate()) return;
+        if(!gather.participate()) // non-participating ranks do no further work
+            return;
 
-#if(SIMDIM==DIM3)
-    vec::UInt32<3> twistedAxesVec((nAxis+1)%3, (nAxis+2)%3, nAxis);
+#if(SIMDIM == DIM3)
+        vec::UInt32<3> twistedAxesVec((nAxis + 1) % 3, (nAxis + 2) % 3, nAxis); // axis order with nAxis last
 
-    /* convert data to higher precision and to SI units */
-    SliceFieldPrinterHelper::ConversionFunctor<Field> cf;
-    algorithm::kernel::RT::Foreach()(
-      dBuffer_SI->zone(), dBuffer_SI->origin(),
-      cursor::tools::slice(field.originCustomAxes(twistedAxesVec)(0,0,localPlane)),
-      cf );
+        /* convert the selected plane to higher precision and SI units, writing into dBuffer_SI */
+        SliceFieldPrinterHelper::ConversionFunctor<Field> cf;
+        algorithm::kernel::RT::Foreach()(
+            dBuffer_SI->zone(),
+            dBuffer_SI->origin(),
+            cursor::tools::slice(field.originCustomAxes(twistedAxesVec)(0, 0, localPlane)),
+            cf);
 #endif
-#if(SIMDIM==DIM2)
-    vec::UInt32<2> twistedAxesVec((nAxis+1)%2, nAxis);
-
-    /* convert data to higher precision and to SI units */
-    SliceFieldPrinterHelper::ConversionFunctor<Field> cf;
-    algorithm::kernel::RT::Foreach()(
-      dBuffer_SI->zone(), dBuffer_SI->origin(),
-      cursor::tools::slice(field.originCustomAxes(twistedAxesVec)(0,localPlane)),
-      cf );
+#if(SIMDIM == DIM2)
+        vec::UInt32<2> twistedAxesVec((nAxis + 1) % 2, nAxis); // axis order with nAxis last
+
+        /* convert the selected line to higher precision and SI units, writing into dBuffer_SI */
+        SliceFieldPrinterHelper::ConversionFunctor<Field> cf;
+        algorithm::kernel::RT::Foreach()(
+            dBuffer_SI->zone(),
+            dBuffer_SI->origin(),
+            cursor::tools::slice(field.originCustomAxes(twistedAxesVec)(0, localPlane)),
+            cf);
 #endif
 
-    /* copy selected plane from device to host */
-    container::HostBuffer<float3_64, simDim-1> hBuffer(dBuffer_SI->size());
-    hBuffer = *dBuffer_SI;
+        /* copy selected plane from device to host */
+        container::HostBuffer<float3_64, simDim - 1> hBuffer(dBuffer_SI->size());
+        hBuffer = *dBuffer_SI;
 
-    /* collect data from all nodes/GPUs */
-    vec::Size_t<simDim> globalDomainSize = Environment<simDim>::get().SubGrid().getGlobalDomain().size;
-    vec::Size_t<simDim-1> globalSliceSize = globalDomainSize.shrink<simDim-1>((nAxis+1)%simDim);
-    container::HostBuffer<float3_64, simDim-1> globalBuffer(globalSliceSize);
-    gather(globalBuffer, hBuffer, nAxis);
-    if(!gather.root()) return;
+        /* collect the local slices of all participating ranks into one global slice buffer */
+        vec::Size_t<simDim> globalDomainSize = Environment<simDim>::get().SubGrid().getGlobalDomain().size;
+        vec::Size_t<simDim - 1> globalSliceSize = globalDomainSize.shrink<simDim - 1>((nAxis + 1) % simDim);
+        container::HostBuffer<float3_64, simDim - 1> globalBuffer(globalSliceSize);
+        gather(globalBuffer, hBuffer, nAxis);
+        if(!gather.root()) // only the gather root writes the file
+            return;
 
-    std::ofstream file(filename.c_str());
-    file << globalBuffer;
-}
+        std::ofstream file(filename.c_str()); // NOTE(review): stream state is not checked after opening or writing
+        file << globalBuffer;
+    }
 
 } /* end namespace picongpu */
diff --git a/include/picongpu/plugins/SliceFieldPrinterMulti.hpp b/include/picongpu/plugins/SliceFieldPrinterMulti.hpp
index 8dc75da1b5..7623e41d28 100644
--- a/include/picongpu/plugins/SliceFieldPrinterMulti.hpp
+++ b/include/picongpu/plugins/SliceFieldPrinterMulti.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera, Felix Schmitt,
+/* Copyright 2013-2021 Heiko Burau, Rene Widera, Felix Schmitt,
  *                     Richard Pausch
  *
  * This file is part of PIConGPU.
@@ -29,37 +29,39 @@
 
 namespace picongpu
 {
+    using namespace pmacc;
+    namespace po = boost::program_options;
 
-using namespace pmacc;
-namespace po = boost::program_options;
+    template<typename Field>
+    class SliceFieldPrinterMulti : public ILightweightPlugin // owns one SliceFieldPrinter per configured slice
+    {
+    private:
+        std::string name; // returned by pluginGetName()
+        std::string prefix; // command line option prefix, set in the constructor
+        std::vector<std::string> notifyPeriod; // "<prefix>.period" values; its size determines the number of children
+        std::vector<std::string> fileName; // "<prefix>.fileName" values
+        std::vector<int> plane; // "<prefix>.plane" values
+        std::vector<float_X> slicePoint; // "<prefix>.slicePoint" values
+        MappingDesc* cellDescription; // forwarded to every child in pluginLoad()
+        std::vector<SliceFieldPrinter<Field>> childs; // filled in pluginLoad()
 
-template<typename Field>
-class SliceFieldPrinterMulti : public ILightweightPlugin
-{
-private:
-    std::string name;
-    std::string prefix;
-    std::vector<std::string> notifyPeriod;
-    std::vector<std::string> fileName;
-    std::vector<int> plane;
-    std::vector<float_X> slicePoint;
-    MappingDesc *cellDescription;
-    std::vector<SliceFieldPrinter<Field> > childs;
-
-    void pluginLoad();
-    void pluginUnload();
+        void pluginLoad();
+        void pluginUnload();
 
-public:
-    SliceFieldPrinterMulti();
-    virtual ~SliceFieldPrinterMulti() {}
+    public:
+        SliceFieldPrinterMulti();
+        virtual ~SliceFieldPrinterMulti()
+        {
+        }
 
-    void notify(uint32_t) {}
-    void setMappingDescription(MappingDesc* desc);
-    void pluginRegisterHelp(po::options_description& desc);
-    std::string pluginGetName() const;
-};
+        void notify(uint32_t) // no-op: each child sets its own notification period in its pluginLoad()
+        {
+        }
+        void setMappingDescription(MappingDesc* desc);
+        void pluginRegisterHelp(po::options_description& desc);
+        std::string pluginGetName() const;
+    };
 
-}
+} // namespace picongpu
 
 #include "SliceFieldPrinterMulti.tpp"
-
diff --git a/include/picongpu/plugins/SliceFieldPrinterMulti.tpp b/include/picongpu/plugins/SliceFieldPrinterMulti.tpp
index c6ca959073..2cb785825b 100644
--- a/include/picongpu/plugins/SliceFieldPrinterMulti.tpp
+++ b/include/picongpu/plugins/SliceFieldPrinterMulti.tpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera, Felix Schmitt,
+/* Copyright 2013-2021 Heiko Burau, Rene Widera, Felix Schmitt,
  *                     Richard Pausch
  *
  * This file is part of PIConGPU.
@@ -38,61 +38,67 @@
 
 namespace picongpu
 {
+    template<typename Field>
+    SliceFieldPrinterMulti<Field>::SliceFieldPrinterMulti()
+        : name("SliceFieldPrinter: prints a slice of a field")
+        , prefix(Field::getName() + std::string("_slice")) // option prefix is "<field name>_slice"
+    {
+        Environment<>::get().PluginConnector().registerPlugin(this); // the plugin registers itself on construction
+    }
 
-template<typename Field>
-SliceFieldPrinterMulti<Field>::SliceFieldPrinterMulti()
-    : name("SliceFieldPrinter: prints a slice of a field"),
-      prefix(Field::getName() + std::string("_slice"))
-{
-    Environment<>::get().PluginConnector().registerPlugin(this);
-}
-
-template<typename Field>
-void SliceFieldPrinterMulti<Field>::pluginRegisterHelp(po::options_description& desc)
-{
-    desc.add_options()
-        ((this->prefix + ".period").c_str(),
-        po::value<std::vector<std::string> > (&this->notifyPeriod)->multitoken(), "notify period");
-    desc.add_options()
-        ((this->prefix + ".fileName").c_str(),
-        po::value<std::vector<std::string> > (&this->fileName)->multitoken(), "file name to store slices in");
-    desc.add_options()
-        ((this->prefix + ".plane").c_str(),
-        po::value<std::vector<int> > (&this->plane)->multitoken(), "specifies the axis which stands on the cutting plane (0,1,2)");
-    desc.add_options()
-        ((this->prefix + ".slicePoint").c_str(),
-        po::value<std::vector<float_X> > (&this->slicePoint)->multitoken(), "slice point 0.0 <= x <= 1.0");
-}
+    template<typename Field>
+    void SliceFieldPrinterMulti<Field>::pluginRegisterHelp(po::options_description& desc)
+    {
+        desc.add_options()(
+            (this->prefix + ".period").c_str(),
+            po::value<std::vector<std::string>>(&this->notifyPeriod)->multitoken(),
+            "notify period"); // parsed values land in notifyPeriod
+        desc.add_options()(
+            (this->prefix + ".fileName").c_str(),
+            po::value<std::vector<std::string>>(&this->fileName)->multitoken(),
+            "file name to store slices in"); // parsed values land in fileName
+        desc.add_options()(
+            (this->prefix + ".plane").c_str(),
+            po::value<std::vector<int>>(&this->plane)->multitoken(),
+            "specifies the axis which stands on the cutting plane (0,1,2)"); // parsed values land in plane
+        desc.add_options()(
+            (this->prefix + ".slicePoint").c_str(),
+            po::value<std::vector<float_X>>(&this->slicePoint)->multitoken(),
+            "slice point 0.0 <= x <= 1.0"); // parsed values land in slicePoint; range is checked per child in pluginLoad()
+    }
 
-template<typename Field>
-std::string SliceFieldPrinterMulti<Field>::pluginGetName() const {return this->name;}
+    template<typename Field>
+    std::string SliceFieldPrinterMulti<Field>::pluginGetName() const
+    {
+        return this->name; // description string set in the constructor
+    }
 
-template<typename Field>
-void SliceFieldPrinterMulti<Field>::pluginLoad()
-{
-    this->childs.resize(this->notifyPeriod.size());
-    for(uint32_t i = 0; i < this->childs.size(); i++)
+    template<typename Field>
+    void SliceFieldPrinterMulti<Field>::pluginLoad()
     {
-        this->childs[i].setMappingDescription(this->cellDescription);
-        this->childs[i].notifyPeriod = this->notifyPeriod[i];
-        this->childs[i].fileName = this->fileName[i];
-        this->childs[i].plane = this->plane[i];
-        this->childs[i].slicePoint = this->slicePoint[i];
-        this->childs[i].pluginLoad();
+        this->childs.resize(this->notifyPeriod.size()); // one child printer per "<prefix>.period" entry
+        for(uint32_t i = 0; i < this->childs.size(); i++)
+        {
+            this->childs[i].setMappingDescription(this->cellDescription);
+            this->childs[i].notifyPeriod = this->notifyPeriod[i];
+            this->childs[i].fileName = this->fileName[i]; // NOTE(review): no check that fileName has an i-th entry
+            this->childs[i].plane = this->plane[i]; // NOTE(review): no check that plane has an i-th entry
+            this->childs[i].slicePoint = this->slicePoint[i]; // NOTE(review): no check that slicePoint has an i-th entry
+            this->childs[i].pluginLoad(); // child validates slicePoint and allocates its buffer
+        }
     }
-}
 
-template<typename Field>
-void SliceFieldPrinterMulti<Field>::pluginUnload()
-{
-    for(uint32_t i = 0; i < this->childs.size(); i++)
-        this->childs[i].pluginUnload();
-}
+    template<typename Field>
+    void SliceFieldPrinterMulti<Field>::pluginUnload()
+    {
+        for(uint32_t i = 0; i < this->childs.size(); i++) // forward the unload to every child created in pluginLoad()
+            this->childs[i].pluginUnload();
+    }
 
-template<typename Field>
-void SliceFieldPrinterMulti<Field>::setMappingDescription(MappingDesc* desc)
-{
-    this->cellDescription = desc;
-}
+    template<typename Field>
+    void SliceFieldPrinterMulti<Field>::setMappingDescription(MappingDesc* desc)
+    {
+        this->cellDescription = desc; // only stored here; handed to the children in pluginLoad()
+    }
 
-}
+} // namespace picongpu
diff --git a/include/picongpu/plugins/SumCurrents.hpp b/include/picongpu/plugins/SumCurrents.hpp
index 87565861ee..885b7a3c54 100644
--- a/include/picongpu/plugins/SumCurrents.hpp
+++ b/include/picongpu/plugins/SumCurrents.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Felix Schmitt, Heiko Burau, Rene Widera,
+/* Copyright 2013-2021 Axel Huebl, Felix Schmitt, Heiko Burau, Rene Widera,
  *                     Felix Schmitt, Benjamin Worpitz, Richard Pausch
  *
  * This file is part of PIConGPU.
@@ -35,174 +35,157 @@
 
 namespace picongpu
 {
-using namespace pmacc;
+    using namespace pmacc;
 
-namespace po = boost::program_options;
+    namespace po = boost::program_options;
 
-using J_DataBox = FieldJ::DataBoxType;
+    using J_DataBox = FieldJ::DataBoxType;
 
-struct KernelSumCurrents
-{
-    template<
-        typename Mapping,
-        typename T_Acc
-    >
-    DINLINE void operator()(
-        T_Acc const & acc,
-        J_DataBox fieldJ,
-        float3_X* gCurrent,
-        Mapping mapper
-    ) const
+    struct KernelSumCurrents // kernel functor launched by SumCurrents::getSumCurrents()
     {
-        using SuperCellSize = typename Mapping::SuperCellSize;
-
-        PMACC_SMEM( acc, sh_sumJ, float3_X );
-
-        const DataSpace<simDim > threadIndex(threadIdx);
-        const int linearThreadIdx = DataSpaceOperations<simDim>::template map<SuperCellSize > (threadIndex);
-
-        if (linearThreadIdx == 0)
+        template<typename Mapping, typename T_Acc>
+        DINLINE void operator()(T_Acc const& acc, J_DataBox fieldJ, float3_X* gCurrent, Mapping mapper) const
         {
-            sh_sumJ = float3_X::create(0.0);
-        }
+            using SuperCellSize = typename Mapping::SuperCellSize;
 
-        __syncthreads();
+            PMACC_SMEM(acc, sh_sumJ, float3_X); // per-block accumulator (presumably shared memory - TODO confirm)
 
+            const DataSpace<simDim> threadIndex(cupla::threadIdx(acc));
+            const int linearThreadIdx = DataSpaceOperations<simDim>::template map<SuperCellSize>(threadIndex);
 
-        const DataSpace<simDim> superCellIdx(mapper.getSuperCellIndex(DataSpace<simDim > (blockIdx)));
-        const DataSpace<simDim> cell(superCellIdx * SuperCellSize::toRT() + threadIndex);
+            if(linearThreadIdx == 0) // the thread with linear index 0 zeroes the accumulator
+            {
+                sh_sumJ = float3_X::create(0.0);
+            }
 
-        const float3_X myJ = fieldJ(cell);
+            cupla::__syncthreads(acc); // accumulator must be zeroed before any thread adds to it
 
-        atomicAdd( &(sh_sumJ.x()), myJ.x(), ::alpaka::hierarchy::Threads{});
-        atomicAdd( &(sh_sumJ.y()), myJ.y(), ::alpaka::hierarchy::Threads{});
-        atomicAdd( &(sh_sumJ.z()), myJ.z(), ::alpaka::hierarchy::Threads{});
 
-        __syncthreads();
+            const DataSpace<simDim> superCellIdx(mapper.getSuperCellIndex(DataSpace<simDim>(cupla::blockIdx(acc))));
+            const DataSpace<simDim> cell(superCellIdx * SuperCellSize::toRT() + threadIndex); // cell of this thread
 
-        if (linearThreadIdx == 0)
-        {
-            atomicAdd( &(gCurrent->x()), sh_sumJ.x(), ::alpaka::hierarchy::Blocks{});
-            atomicAdd( &(gCurrent->y()), sh_sumJ.y(), ::alpaka::hierarchy::Blocks{});
-            atomicAdd( &(gCurrent->z()), sh_sumJ.z(), ::alpaka::hierarchy::Blocks{});
-        }
-    }
-};
+            const float3_X myJ = fieldJ(cell); // fieldJ value at this thread's cell
 
-class SumCurrents : public ILightweightPlugin
-{
-private:
-    MappingDesc *cellDescription;
-    std::string notifyPeriod;
+            cupla::atomicAdd(acc, &(sh_sumJ.x()), myJ.x(), ::alpaka::hierarchy::Threads{}); // every thread adds its value
+            cupla::atomicAdd(acc, &(sh_sumJ.y()), myJ.y(), ::alpaka::hierarchy::Threads{});
+            cupla::atomicAdd(acc, &(sh_sumJ.z()), myJ.z(), ::alpaka::hierarchy::Threads{});
 
-    GridBuffer<float3_X, DIM1> *sumcurrents;
+            cupla::__syncthreads(acc); // all per-thread additions must be done before the block sum is read
 
-public:
+            if(linearThreadIdx == 0) // one thread per block adds the block sum to the global result
+            {
+                cupla::atomicAdd(acc, &(gCurrent->x()), sh_sumJ.x(), ::alpaka::hierarchy::Blocks{});
+                cupla::atomicAdd(acc, &(gCurrent->y()), sh_sumJ.y(), ::alpaka::hierarchy::Blocks{});
+                cupla::atomicAdd(acc, &(gCurrent->z()), sh_sumJ.z(), ::alpaka::hierarchy::Blocks{});
+            }
+        }
+    };
 
-    SumCurrents() :
-    cellDescription(nullptr)
+    class SumCurrents : public ILightweightPlugin // sums FieldJ over CORE + BORDER and prints the result as a current
     {
+    private:
+        MappingDesc* cellDescription; // nullptr until setMappingDescription() is called
+        std::string notifyPeriod; // "sumcurr.period" option; while empty, pluginLoad() allocates nothing
 
-        Environment<>::get().PluginConnector().registerPlugin(this);
-    }
+        GridBuffer<float3_X, DIM1>* sumcurrents; // one-element buffer for the kernel result, allocated in pluginLoad()
 
-    virtual ~SumCurrents()
-    {
+    public:
+        SumCurrents() : cellDescription(nullptr)
+        {
+            Environment<>::get().PluginConnector().registerPlugin(this); // the plugin registers itself on construction
+        }
 
-    }
+        virtual ~SumCurrents()
+        {
+        }
 
-    void notify(uint32_t currentStep)
-    {
-        const int rank = Environment<simDim>::get().GridController().getGlobalRank();
-        const float3_X gCurrent = getSumCurrents();
-
-        // gCurrent is just j
-        // j = I/A
-#if(SIMDIM==DIM3)
-        const float3_X realCurrent(
-                                   gCurrent.x() * CELL_HEIGHT * CELL_DEPTH,
-                                   gCurrent.y() * CELL_WIDTH * CELL_DEPTH,
-                                   gCurrent.z() * CELL_WIDTH * CELL_HEIGHT);
-#elif(SIMDIM==DIM2)
-        const float3_X realCurrent(
-                                   gCurrent.x() * CELL_HEIGHT,
-                                   gCurrent.y() * CELL_WIDTH,
-                                   gCurrent.z() * CELL_WIDTH * CELL_HEIGHT);
+        void notify(uint32_t currentStep)
+        {
+            const int rank = Environment<simDim>::get().GridController().getGlobalRank();
+            const float3_X gCurrent = getSumCurrents();
+
+            // gCurrent is the summed current density j
+            // j = I/A, so each component is multiplied by the cell extents perpendicular to it to obtain I
+#if(SIMDIM == DIM3)
+            const float3_X realCurrent(
+                gCurrent.x() * CELL_HEIGHT * CELL_DEPTH,
+                gCurrent.y() * CELL_WIDTH * CELL_DEPTH,
+                gCurrent.z() * CELL_WIDTH * CELL_HEIGHT);
+#elif(SIMDIM == DIM2)
+            const float3_X realCurrent(
+                gCurrent.x() * CELL_HEIGHT,
+                gCurrent.y() * CELL_WIDTH,
+                gCurrent.z() * CELL_WIDTH * CELL_HEIGHT);
 #endif
-        float3_64 realCurrent_SI(
-                                 float_64(realCurrent.x()) * (UNIT_CHARGE / UNIT_TIME),
-                                 float_64(realCurrent.y()) * (UNIT_CHARGE / UNIT_TIME),
-                                 float_64(realCurrent.z()) * (UNIT_CHARGE / UNIT_TIME));
-
-        /*FORMAT OUTPUT*/
-        using dbl = std::numeric_limits<float_64>;
-
-        std::cout.precision(dbl::digits10);
-        if (math::abs(gCurrent.x()) + math::abs(gCurrent.y()) + math::abs(gCurrent.z()) != float_X(0.0))
-            std::cout << "[ANALYSIS] [" << rank << "] [COUNTER] [SumCurrents] [" << currentStep
-            << std::scientific << "] " <<
-            realCurrent_SI << " Abs:" << math::abs(realCurrent_SI) << std::endl;
-    }
-
-    void pluginRegisterHelp(po::options_description& desc)
-    {
-        desc.add_options()
-            ("sumcurr.period", po::value<std::string> (&notifyPeriod), "enable plugin [for each n-th step]");
-    }
-
-    std::string pluginGetName() const
-    {
-        return "SumCurrents";
-    }
-
-    void setMappingDescription(MappingDesc *cellDescription)
-    {
-        this->cellDescription = cellDescription;
-    }
-
-private:
+            float3_64 realCurrent_SI(
+                float_64(realCurrent.x()) * (UNIT_CHARGE / UNIT_TIME),
+                float_64(realCurrent.y()) * (UNIT_CHARGE / UNIT_TIME),
+                float_64(realCurrent.z()) * (UNIT_CHARGE / UNIT_TIME));
+
+            /* format output: float_64 digits10 precision, scientific notation; all-zero sums are not printed */
+            using dbl = std::numeric_limits<float_64>;
+
+            std::cout.precision(dbl::digits10); // NOTE(review): changes std::cout's precision and is not restored here
+            if(math::abs(gCurrent.x()) + math::abs(gCurrent.y()) + math::abs(gCurrent.z()) != float_X(0.0))
+                std::cout << "[ANALYSIS] [" << rank << "] [COUNTER] [SumCurrents] [" << currentStep << std::scientific
+                          << "] " << realCurrent_SI << " Abs:" << math::abs(realCurrent_SI) << std::endl;
+        }
 
-    void pluginLoad()
-    {
-        if (!notifyPeriod.empty())
+        void pluginRegisterHelp(po::options_description& desc)
         {
-            sumcurrents = new GridBuffer<float3_X, DIM1 > (DataSpace<DIM1 > (1)); //create one int on gpu und host
+            desc.add_options()(
+                "sumcurr.period",
+                po::value<std::string>(&notifyPeriod),
+                "enable plugin [for each n-th step]"); // parsed value lands in notifyPeriod
+        }
 
-            Environment<>::get().PluginConnector().setNotificationPeriod(this, notifyPeriod);
+        std::string pluginGetName() const
+        {
+            return "SumCurrents";
         }
-    }
 
-    void pluginUnload()
-    {
-        if(!notifyPeriod.empty())
+        void setMappingDescription(MappingDesc* cellDescription)
         {
-            __delete(sumcurrents);
+            this->cellDescription = cellDescription; // used by getSumCurrents() to build the area mapper
         }
-    }
 
-    float3_X getSumCurrents()
-    {
-        DataConnector &dc = Environment<>::get().DataConnector();
-        auto fieldJ = dc.get< FieldJ >( FieldJ::getName(), true );
+    private:
+        void pluginLoad()
+        {
+            if(!notifyPeriod.empty()) // nothing is allocated or scheduled while the period option is unset
+            {
+                sumcurrents = new GridBuffer<float3_X, DIM1>(DataSpace<DIM1>(1)); // one float3_X element (GridBuffer of size 1)
 
-        sumcurrents->getDeviceBuffer().setValue(float3_X::create(0.0));
-        auto block = MappingDesc::SuperCellSize::toRT();
+                Environment<>::get().PluginConnector().setNotificationPeriod(this, notifyPeriod);
+            }
+        }
 
-        AreaMapping<CORE + BORDER, MappingDesc> mapper(*cellDescription);
-        PMACC_KERNEL(KernelSumCurrents{})
-            (mapper.getGridDim(), block)
-            (fieldJ->getDeviceDataBox(),
-             sumcurrents->getDeviceBuffer().getBasePointer(),
-             mapper);
+        void pluginUnload()
+        {
+            if(!notifyPeriod.empty()) // same condition under which pluginLoad() allocated the buffer
+            {
+                __delete(sumcurrents);
+            }
+        }
 
-        dc.releaseData( FieldJ::getName() );
+        float3_X getSumCurrents()
+        {
+            DataConnector& dc = Environment<>::get().DataConnector();
+            auto fieldJ = dc.get<FieldJ>(FieldJ::getName(), true);
 
-        sumcurrents->deviceToHost();
-        return sumcurrents->getHostBuffer().getDataBox()[0];
-    }
+            sumcurrents->getDeviceBuffer().setValue(float3_X::create(0.0)); // reset the device-side sum before the launch
+            auto block = MappingDesc::SuperCellSize::toRT(); // block extent equals the supercell extent
 
-};
+            AreaMapping<CORE + BORDER, MappingDesc> mapper(*cellDescription); // kernel covers the CORE and BORDER areas
+            PMACC_KERNEL(KernelSumCurrents{})
+            (mapper.getGridDim(),
+             block)(fieldJ->getDeviceDataBox(), sumcurrents->getDeviceBuffer().getBasePointer(), mapper);
 
-}
+            dc.releaseData(FieldJ::getName());
 
+            sumcurrents->deviceToHost(); // copy the summed value to the host buffer
+            return sumcurrents->getHostBuffer().getDataBox()[0]; // the single element holds the total
+        }
+    };
 
+} // namespace picongpu
diff --git a/include/picongpu/plugins/adios/ADIOSCountParticles.hpp b/include/picongpu/plugins/adios/ADIOSCountParticles.hpp
index 6efeac564d..18f065c6be 100644
--- a/include/picongpu/plugins/adios/ADIOSCountParticles.hpp
+++ b/include/picongpu/plugins/adios/ADIOSCountParticles.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2014-2020 Felix Schmitt, Axel Huebl
+/* Copyright 2014-2021 Felix Schmitt, Axel Huebl
  *
  * This file is part of PIConGPU.
  *
@@ -51,149 +51,167 @@
 
 namespace picongpu
 {
-
-namespace adios
-{
-using namespace pmacc;
-
-
-
-/** Count number of particles for a species
- *
- * @tparam T_Species type of species
- *
- */
-template< typename T_SpeciesFilter >
-struct ADIOSCountParticles
-{
-public:
-
-    typedef typename T_SpeciesFilter::Species ThisSpecies;
-    typedef typename ThisSpecies::FrameType FrameType;
-    typedef typename FrameType::ParticleDescription ParticleDescription;
-    typedef typename FrameType::ValueTypeSeq ParticleAttributeList;
-
-    /* delete multiMask and localCellIdx in adios particle*/
-    typedef bmpl::vector<multiMask,localCellIdx> TypesToDelete;
-    typedef typename RemoveFromSeq<ParticleAttributeList, TypesToDelete>::type ParticleCleanedAttributeList;
-
-    /* add totalCellIdx for adios particle*/
-    typedef typename MakeSeq<
-            ParticleCleanedAttributeList,
-            totalCellIdx
-    >::type ParticleNewAttributeList;
-
-    typedef
-    typename ReplaceValueTypeSeq<ParticleDescription, ParticleNewAttributeList>::type
-    NewParticleDescription;
-
-    typedef Frame<OperatorCreateVectorBox, NewParticleDescription> AdiosFrameType;
-
-    HINLINE void operator()(ThreadParams* params)
+    namespace adios
     {
-        DataConnector &dc = Environment<>::get().DataConnector();
-        GridController<simDim>& gc = Environment<simDim>::get().GridController();
-        uint64_t mpiSize = gc.getGlobalSize();
-        uint64_t mpiRank = gc.getGlobalRank();
-
-        const std::string speciesGroup( T_SpeciesFilter::getName() + "/" );
-        const std::string speciesPath( params->adiosBasePath +
-            std::string(ADIOS_PATH_PARTICLES) + speciesGroup );
-
-        /* load particle without copy particle data to host */
-        auto speciesTmp = dc.get< ThisSpecies >( ThisSpecies::FrameType::getName(), true );
-        // enforce that the filter interface is fulfilled
-        particles::filter::IUnary< typename T_SpeciesFilter::Filter > particleFilter{ params->currentStep };
-        /* count total number of particles on the device */
-        uint64_cu totalNumParticles = 0;
-        totalNumParticles = pmacc::CountParticles::countOnDevice < CORE + BORDER > (
-                                                                                    *speciesTmp,
-                                                                                    *(params->cellDescription),
-                                                                                    params->localWindowToDomainOffset,
-                                                                                    params->window.localDimensions.size,
-                                                                                    particleFilter);
-
-        /* MPI_Allgather to compute global size and my offset */
-        uint64_t myNumParticles = totalNumParticles;
-        uint64_t allNumParticles[mpiSize];
-        uint64_t globalNumParticles = 0;
-        uint64_t myParticleOffset = 0;
-
-        // avoid deadlock between not finished pmacc tasks and mpi blocking collectives
-        __getTransactionEvent().waitForFinished();
-        MPI_CHECK(MPI_Allgather(
-                &myNumParticles, 1, MPI_UNSIGNED_LONG_LONG,
-                allNumParticles, 1, MPI_UNSIGNED_LONG_LONG,
-                gc.getCommunicator().getMPIComm()));
-
-        for (uint64_t i = 0; i < mpiSize; ++i)
-        {
-            globalNumParticles += allNumParticles[i];
-            if (i < mpiRank)
-                myParticleOffset += allNumParticles[i];
-        }
-
-        /* iterate over all attributes of this species */
-        meta::ForEach<typename AdiosFrameType::ValueTypeSeq, adios::ParticleAttributeSize<bmpl::_1> > attributeSize;
-        attributeSize(params, speciesGroup, myNumParticles, globalNumParticles, myParticleOffset);
-
-        /* TODO: constant particle records */
-
-        /* openPMD ED-PIC: additional attributes */
-        traits::PICToAdios<float_64> adiosDoubleType;
-        const float_64 particleShape( GetShape<ThisSpecies>::type::support - 1 );
-        ADIOS_CMD(adios_define_attribute_byvalue(params->adiosGroupHandle,
-            "particleShape", speciesPath.c_str(),
-            adiosDoubleType.type, 1, (void*)&particleShape ));
-
-        traits::GetSpeciesFlagName<ThisSpecies, current<> > currentDepositionName;
-        const std::string currentDeposition( currentDepositionName() );
-        ADIOS_CMD(adios_define_attribute_byvalue(params->adiosGroupHandle,
-            "currentDeposition", speciesPath.c_str(),
-            adios_string, 1, (void*)currentDeposition.c_str() ));
-
-        traits::GetSpeciesFlagName<ThisSpecies, particlePusher<> > particlePushName;
-        const std::string particlePush( particlePushName() );
-        ADIOS_CMD(adios_define_attribute_byvalue(params->adiosGroupHandle,
-            "particlePush", speciesPath.c_str(),
-            adios_string, 1, (void*)particlePush.c_str() ));
-
-        traits::GetSpeciesFlagName<ThisSpecies, interpolation<> > particleInterpolationName;
-        const std::string particleInterpolation( particleInterpolationName() );
-        ADIOS_CMD(adios_define_attribute_byvalue(params->adiosGroupHandle,
-            "particleInterpolation", speciesPath.c_str(),
-            adios_string, 1, (void*)particleInterpolation.c_str() ));
-
-        const std::string particleSmoothing( "none" );
-        ADIOS_CMD(adios_define_attribute_byvalue(params->adiosGroupHandle,
-            "particleSmoothing", speciesPath.c_str(),
-            adios_string, 1, (void*)particleSmoothing.c_str() ));
-
-        /* define adios var for species index/info table */
-        {
-            const uint64_t localTableSize = 5;
-            traits::PICToAdios<uint64_t> adiosIndexType;
+        using namespace pmacc;
 
-            const char* path = nullptr;
-            int64_t adiosSpeciesIndexVar = defineAdiosVar<DIM1>(
-                params->adiosGroupHandle,
-                (speciesPath + "particles_info").c_str(),
-                path,
-                adiosIndexType.type,
-                pmacc::math::UInt64<DIM1>(localTableSize),
-                pmacc::math::UInt64<DIM1>(localTableSize * uint64_t(gc.getGlobalSize()) ),
-                pmacc::math::UInt64<DIM1>(localTableSize * uint64_t(gc.getGlobalRank()) ),
-                true,
-                params->adiosCompression);
 
-            params->adiosSpeciesIndexVarIds.push_back(adiosSpeciesIndexVar);
-
-            params->adiosGroupSize += sizeof(uint64_t) * localTableSize * gc.getGlobalSize();
-        }
-    }
-};
-
-
-} //namspace adios
-
-} //namespace picongpu
+        /** Count number of particles for a species
+         *
+         * @tparam T_SpeciesFilter species/filter pair; must provide the nested types Species and Filter
+         *
+         */
+        template<typename T_SpeciesFilter>
+        struct ADIOSCountParticles
+        {
+        public:
+            typedef typename T_SpeciesFilter::Species ThisSpecies;
+            typedef typename ThisSpecies::FrameType FrameType;
+            typedef typename FrameType::ParticleDescription ParticleDescription;
+            typedef typename FrameType::ValueTypeSeq ParticleAttributeList;
+
+            /* remove multiMask and localCellIdx from the adios particle */
+            typedef bmpl::vector<multiMask, localCellIdx> TypesToDelete;
+            typedef typename RemoveFromSeq<ParticleAttributeList, TypesToDelete>::type ParticleCleanedAttributeList;
+
+            /* add totalCellIdx to the adios particle */
+            typedef typename MakeSeq<ParticleCleanedAttributeList, totalCellIdx>::type ParticleNewAttributeList;
+
+            typedef typename ReplaceValueTypeSeq<ParticleDescription, ParticleNewAttributeList>::type
+                NewParticleDescription;
+
+            typedef Frame<OperatorCreateVectorBox, NewParticleDescription> AdiosFrameType;
+
+            HINLINE void operator()(ThreadParams* params)
+            {
+                DataConnector& dc = Environment<>::get().DataConnector();
+                GridController<simDim>& gc = Environment<simDim>::get().GridController();
+                uint64_t mpiSize = gc.getGlobalSize();
+                uint64_t mpiRank = gc.getGlobalRank();
+
+                const std::string speciesGroup(T_SpeciesFilter::getName() + "/");
+                const std::string speciesPath(
+                    params->adiosBasePath + std::string(ADIOS_PATH_PARTICLES) + speciesGroup);
+
+                /* load particles without copying particle data to the host */
+                auto speciesTmp = dc.get<ThisSpecies>(ThisSpecies::FrameType::getName(), true);
+                // enforce that the filter interface is fulfilled
+                particles::filter::IUnary<typename T_SpeciesFilter::Filter> particleFilter{params->currentStep};
+                /* count total number of particles on the device */
+                uint64_cu totalNumParticles = 0;
+                totalNumParticles = pmacc::CountParticles::countOnDevice<CORE + BORDER>(
+                    *speciesTmp,
+                    *(params->cellDescription),
+                    params->localWindowToDomainOffset,
+                    params->window.localDimensions.size,
+                    particleFilter);
+
+                /* MPI_Allgather to compute global size and my offset */
+                uint64_t myNumParticles = totalNumParticles;
+                uint64_t allNumParticles[mpiSize]; // NOTE(review): VLA, a non-standard extension in C++
+                uint64_t globalNumParticles = 0;
+                uint64_t myParticleOffset = 0;
+
+                // avoid deadlock between not finished pmacc tasks and mpi blocking collectives
+                __getTransactionEvent().waitForFinished();
+                MPI_CHECK(MPI_Allgather(
+                    &myNumParticles,
+                    1,
+                    MPI_UNSIGNED_LONG_LONG,
+                    allNumParticles,
+                    1,
+                    MPI_UNSIGNED_LONG_LONG,
+                    gc.getCommunicator().getMPIComm()));
+
+                for(uint64_t i = 0; i < mpiSize; ++i)
+                {
+                    globalNumParticles += allNumParticles[i];
+                    if(i < mpiRank) // counts of all lower ranks add up to this rank's offset
+                        myParticleOffset += allNumParticles[i];
+                }
+
+                /* iterate over all attributes of this species */
+                meta::ForEach<typename AdiosFrameType::ValueTypeSeq, adios::ParticleAttributeSize<bmpl::_1>>
+                    attributeSize;
+                attributeSize(params, speciesGroup, myNumParticles, globalNumParticles, myParticleOffset);
+
+                /* TODO: constant particle records */
+
+                /* openPMD ED-PIC: additional attributes */
+                traits::PICToAdios<float_64> adiosDoubleType;
+                const float_64 particleShape(GetShape<ThisSpecies>::type::assignmentFunctionOrder);
+                ADIOS_CMD(adios_define_attribute_byvalue(
+                    params->adiosGroupHandle,
+                    "particleShape",
+                    speciesPath.c_str(),
+                    adiosDoubleType.type,
+                    1,
+                    (void*) &particleShape));
+
+                traits::GetSpeciesFlagName<ThisSpecies, current<>> currentDepositionName;
+                const std::string currentDeposition(currentDepositionName());
+                ADIOS_CMD(adios_define_attribute_byvalue(
+                    params->adiosGroupHandle,
+                    "currentDeposition",
+                    speciesPath.c_str(),
+                    adios_string,
+                    1,
+                    (void*) currentDeposition.c_str()));
+
+                traits::GetSpeciesFlagName<ThisSpecies, particlePusher<>> particlePushName;
+                const std::string particlePush(particlePushName());
+                ADIOS_CMD(adios_define_attribute_byvalue(
+                    params->adiosGroupHandle,
+                    "particlePush",
+                    speciesPath.c_str(),
+                    adios_string,
+                    1,
+                    (void*) particlePush.c_str()));
+
+                traits::GetSpeciesFlagName<ThisSpecies, interpolation<>> particleInterpolationName;
+                const std::string particleInterpolation(particleInterpolationName());
+                ADIOS_CMD(adios_define_attribute_byvalue(
+                    params->adiosGroupHandle,
+                    "particleInterpolation",
+                    speciesPath.c_str(),
+                    adios_string,
+                    1,
+                    (void*) particleInterpolation.c_str()));
+
+                const std::string particleSmoothing("none");
+                ADIOS_CMD(adios_define_attribute_byvalue(
+                    params->adiosGroupHandle,
+                    "particleSmoothing",
+                    speciesPath.c_str(),
+                    adios_string,
+                    1,
+                    (void*) particleSmoothing.c_str()));
+
+                /* define adios var for species index/info table */
+                {
+                    const uint64_t localTableSize = 5; // table entries contributed by each rank
+                    traits::PICToAdios<uint64_t> adiosIndexType;
+
+                    const char* path = nullptr;
+                    int64_t adiosSpeciesIndexVar = defineAdiosVar<DIM1>(
+                        params->adiosGroupHandle,
+                        (speciesPath + "particles_info").c_str(),
+                        path,
+                        adiosIndexType.type,
+                        pmacc::math::UInt64<DIM1>(localTableSize),
+                        pmacc::math::UInt64<DIM1>(localTableSize * uint64_t(gc.getGlobalSize())),
+                        pmacc::math::UInt64<DIM1>(localTableSize * uint64_t(gc.getGlobalRank())),
+                        true,
+                        params->adiosCompression);
+
+                    params->adiosSpeciesIndexVarIds.push_back(adiosSpeciesIndexVar);
+
+                    params->adiosGroupSize += sizeof(uint64_t) * localTableSize * gc.getGlobalSize();
+                }
+            }
+        };
+
+
+    } // namespace adios
+
+} // namespace picongpu
diff --git a/include/picongpu/plugins/adios/ADIOSWriter.def b/include/picongpu/plugins/adios/ADIOSWriter.def
index de506f1196..ce5765f189 100644
--- a/include/picongpu/plugins/adios/ADIOSWriter.def
+++ b/include/picongpu/plugins/adios/ADIOSWriter.def
@@ -1,4 +1,4 @@
-/* Copyright 2014-2020 Felix Schmitt, Axel Huebl
+/* Copyright 2014-2021 Felix Schmitt, Axel Huebl
  *
  * This file is part of PIConGPU.
  *
@@ -26,7 +26,7 @@
 #include <limits>
 #include <sstream>
 #include <string>
-#include <iostream>  // std::cerr
+#include <iostream> // std::cerr
 #include <stdexcept> // throw std::runtime_error
 
 #include <pmacc/types.hpp>
@@ -38,133 +38,133 @@
 
 namespace picongpu
 {
-
-namespace adios
-{
-using namespace pmacc;
+    namespace adios
+    {
+        using namespace pmacc;
 
 
-namespace po = boost::program_options;
+        namespace po = boost::program_options;
 
 #define ADIOS_INVALID_HANDLE -1
-#define ADIOS_SUCCESS       err_no_error
-#define ADIOS_GROUP_NAME     "data"
+#define ADIOS_SUCCESS err_no_error
+#define ADIOS_GROUP_NAME "data"
 
-#define ADIOS_PATH_ROOT      "/data/"
-#define ADIOS_PATH_FIELDS    "fields/"
+#define ADIOS_PATH_ROOT "/data/"
+#define ADIOS_PATH_FIELDS "fields/"
 #define ADIOS_PATH_PARTICLES "particles/"
 
-#define ADIOS_SIZE_LOCAL     "size_"
-#define ADIOS_SIZE_GLOBAL    "totalSize_"
-#define ADIOS_OFFSET_GLOBAL  "offset_"
-
-#define ADIOS_CMD(_cmd)                                                       \
-{                                                                             \
-    int _err_code = _cmd;                                                     \
-    if (_err_code != ADIOS_SUCCESS)                                           \
-    {                                                                         \
-        std::string errMsg( adios_errmsg() );                                 \
-        if( errMsg.empty() ) errMsg = '\n';                                   \
-        std::stringstream s;                                                  \
-        s << "ADIOS: error at cmd '" << #_cmd                                 \
-          << "' (" << _err_code << ", " << adios_errno << ") in "             \
-          << __FILE__ << ":" << __LINE__ << " " << errMsg;                    \
-        throw std::runtime_error(s.str());                                    \
-    }                                                                         \
-}
-
-#define ADIOS_CMD_EXPECT_NONNULL(_cmd)                                        \
-{                                                                             \
-    if (!(_cmd))                                                              \
-    {                                                                         \
-        std::string errMsg( adios_errmsg() );                                 \
-        if( errMsg.empty() ) errMsg = '\n';                                   \
-        std::stringstream s;                                                  \
-        s << "ADIOS: error at cmd '" << #_cmd                                 \
-          << "' (" << adios_errno << ") in "                                  \
-          << __FILE__ << ":" << __LINE__ << " " << errMsg;                    \
-        throw std::runtime_error(s.str());                                    \
-    }                                                                         \
-}
-
-struct ThreadParams
-{
-    uint32_t currentStep;                   /** current simulation step */
-    std::string adiosFilename;              /* e.g., simData */
-    std::string fullFilename;               /* e.g., simData_1000.bp */
-
-    /** current dump is a checkpoint */
-    bool isCheckpoint;
-    ADIOS_FILE* fp;                          /* file pointer for checkpoint file */
-
-    MPI_Comm adiosComm;                     /* MPI communicator for adios lib */
-    bool adiosBufferInitialized;            /* set if ADIOS buffer has been allocated */
-    int64_t adiosFileHandle;                /* ADIOS file handle */
-    int64_t adiosGroupHandle;               /* ADIOS group handle */
-    uint64_t adiosGroupSize;                /* size of ADIOS group in bytes */
-    uint32_t adiosAggregators;              /* number of ADIOS aggregators for MPI_AGGREGATE */
-    uint32_t adiosOST;                      /* number of ADIOS OST for MPI_AGGREGATE */
-    bool adiosDisableMeta;                  /* disable online gather and write of a meta file */
-    std::string adiosTransportParams;       /* additional transport params */
-    std::string adiosBasePath;              /* base path for the current step */
-    std::string adiosCompression;           /* ADIOS data transform compression method */
-
-    pmacc::math::UInt64<simDim> fieldsSizeDims;
-    pmacc::math::UInt64<simDim> fieldsGlobalSizeDims;
-    pmacc::math::UInt64<simDim> fieldsOffsetDims;
-
-    std::list<int64_t> adiosFieldVarIds;        /* var IDs for fields in order of appearance */
-    std::list<int64_t> adiosParticleAttrVarIds; /* var IDs for particle attributes in order of appearance */
-    std::list<int64_t> adiosSpeciesIndexVarIds; /* var IDs for species index tables in order of appearance */
-
-    GridLayout<simDim> gridLayout;
-    MappingDesc *cellDescription;
-
-    float_X *fieldBfr;                              /* temp. buffer for fields */
-
-    Window window;                                  /* window describing the volume to be dumped */
-
-    DataSpace<simDim> localWindowToDomainOffset;    /** offset from local moving window to local domain */
-};
-
-/**
- * Writes simulation data to adios files.
- * Implements the ILightweightPlugin interface.
- */
-
-class ADIOSWriter;
-
-/** Default ADIOS types we will use */
-typedef PICToAdios<uint32_t> AdiosUInt32Type;
-typedef PICToAdios<float_X> AdiosFloatXType;
-typedef PICToAdios<double> AdiosDoubleType;
-
-/**
- * Wrapper for adios_define_var that sets data transform method
- *
- * @tparam DIM number of variable dimensions
- *
- * @param group_id pointer to the internal group structure
- * @param name string containing the name part of a variable
- * @param path string containing the path of an variable
- * @param type variable type
- * @param dimensions variable local dimension
- * @param globalDimensions variable global dimension
- * @param offset variable local offset
- * @param compression enable compression data transform
- * @param compressionMethod string denoting the data transform to use
- * @return ADIOS variable ID
- */
-template <unsigned DIM>
-int64_t defineAdiosVar(int64_t group_id,
-                       const char * name,
-                       const char * path,
-                       enum ADIOS_DATATYPES type,
-                       pmacc::math::UInt64<DIM> dimensions,
-                       pmacc::math::UInt64<DIM> globalDimensions,
-                       pmacc::math::UInt64<DIM> offset,
-                       bool compression,
-                       std::string compressionMethod);
-
-} //namespace adios
-} //namespace picongpu
+#define ADIOS_SIZE_LOCAL "size_"
+#define ADIOS_SIZE_GLOBAL "totalSize_"
+#define ADIOS_OFFSET_GLOBAL "offset_"
+
+#define ADIOS_CMD(_cmd) /* runs _cmd once; throws std::runtime_error unless it returns ADIOS_SUCCESS */               \
+    { /* NOTE(review): bare block instead of do{}while(0); breaks inside unbraced if/else */                          \
+        int _err_code = _cmd;                                                                                         \
+        if(_err_code != ADIOS_SUCCESS)                                                                                \
+        {                                                                                                             \
+            std::string errMsg(adios_errmsg());                                                                       \
+            if(errMsg.empty())                                                                                        \
+                errMsg = '\n'; /* fall back to a lone newline when adios_errmsg() is empty */                         \
+            std::stringstream s;                                                                                      \
+            s << "ADIOS: error at cmd '" << #_cmd << "' (" << _err_code << ", " << adios_errno << ") in " << __FILE__ \
+              << ":" << __LINE__ << " " << errMsg;                                                                    \
+            throw std::runtime_error(s.str());                                                                        \
+        }                                                                                                             \
+    }
+
+#define ADIOS_CMD_EXPECT_NONNULL(_cmd) /* throws std::runtime_error when _cmd evaluates to false/null */              \
+    { /* NOTE(review): bare block instead of do{}while(0); breaks inside unbraced if/else */                          \
+        if(!(_cmd))                                                                                                   \
+        {                                                                                                             \
+            std::string errMsg(adios_errmsg());                                                                       \
+            if(errMsg.empty())                                                                                        \
+                errMsg = '\n'; /* fall back to a lone newline when adios_errmsg() is empty */                         \
+            std::stringstream s;                                                                                      \
+            s << "ADIOS: error at cmd '" << #_cmd << "' (" << adios_errno << ") in " << __FILE__ << ":" << __LINE__   \
+              << " " << errMsg;                                                                                       \
+            throw std::runtime_error(s.str());                                                                        \
+        }                                                                                                             \
+    }
+
+        struct ThreadParams // parameter bundle handed by pointer to ADIOS functors, e.g. ADIOSCountParticles
+        {
+            uint32_t currentStep; /**< current simulation step */
+            std::string adiosFilename; /* e.g., simData */
+            std::string fullFilename; /* e.g., simData_1000.bp */
+
+            /** current dump is a checkpoint */
+            bool isCheckpoint;
+            ADIOS_FILE* fp; /* file pointer for checkpoint file */
+
+            MPI_Comm adiosComm; /* MPI communicator for adios lib */
+            bool adiosBufferInitialized; /* set if ADIOS buffer has been allocated */
+            int64_t adiosFileHandle; /* ADIOS file handle */
+            int64_t adiosGroupHandle; /* ADIOS group handle */
+            uint64_t adiosGroupSize; /* size of ADIOS group in bytes */
+            uint32_t adiosAggregators; /* number of ADIOS aggregators for MPI_AGGREGATE */
+            uint32_t adiosOST; /* number of ADIOS OST for MPI_AGGREGATE */
+            bool adiosDisableMeta; /* disable online gather and write of a meta file */
+            std::string adiosTransportParams; /* additional transport params */
+            std::string adiosBasePath; /* base path for the current step */
+            std::string adiosCompression; /* ADIOS data transform compression method */
+
+            pmacc::math::UInt64<simDim> fieldsSizeDims;
+            pmacc::math::UInt64<simDim> fieldsGlobalSizeDims;
+            pmacc::math::UInt64<simDim> fieldsOffsetDims;
+
+            std::list<int64_t> adiosFieldVarIds; /* var IDs for fields in order of appearance */
+            std::list<int64_t> adiosParticleAttrVarIds; /* var IDs for particle attributes in order of appearance */
+            std::list<int64_t> adiosSpeciesIndexVarIds; /* var IDs for species index tables in order of appearance */
+
+            GridLayout<simDim> gridLayout;
+            MappingDesc* cellDescription;
+
+            float_X* fieldBfr; /* temp. buffer for fields */
+
+            Window window; /* window describing the volume to be dumped */
+
+            DataSpace<simDim> localWindowToDomainOffset; /**< offset from local moving window to local domain */
+        };
+
+        /**
+         * Writes simulation data to adios files.
+         * Implements the ILightweightPlugin interface.
+         */
+
+        class ADIOSWriter;
+
+        /** Default ADIOS types we will use */
+        typedef PICToAdios<uint32_t> AdiosUInt32Type;
+        typedef PICToAdios<float_X> AdiosFloatXType;
+        typedef PICToAdios<double> AdiosDoubleType;
+
+        /**
+         * Wrapper for adios_define_var that sets data transform method
+         *
+         * @tparam DIM number of variable dimensions
+         *
+         * @param group_id handle of the ADIOS group the variable is defined in
+         * @param name string containing the name part of a variable
+         * @param path string containing the path of a variable
+         * @param type variable type
+         * @param dimensions variable local dimension
+         * @param globalDimensions variable global dimension
+         * @param offset variable local offset
+         * @param compression enable compression data transform
+         * @param compressionMethod string denoting the data transform to use
+         * @return ADIOS variable ID
+         */
+        template<unsigned DIM>
+        int64_t defineAdiosVar(
+            int64_t group_id,
+            const char* name,
+            const char* path,
+            enum ADIOS_DATATYPES type,
+            pmacc::math::UInt64<DIM> dimensions,
+            pmacc::math::UInt64<DIM> globalDimensions,
+            pmacc::math::UInt64<DIM> offset,
+            bool compression,
+            std::string compressionMethod);
+
+    } // namespace adios
+} // namespace picongpu
diff --git a/include/picongpu/plugins/adios/ADIOSWriter.hpp b/include/picongpu/plugins/adios/ADIOSWriter.hpp
index cc4ec9602f..7e4ec93722 100644
--- a/include/picongpu/plugins/adios/ADIOSWriter.hpp
+++ b/include/picongpu/plugins/adios/ADIOSWriter.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2014-2020 Axel Huebl, Felix Schmitt, Heiko Burau, Rene Widera,
+/* Copyright 2014-2021 Axel Huebl, Felix Schmitt, Heiko Burau, Rene Widera,
  *                     Benjamin Worpitz, Alexander Grund, Sergei Bastrakov
  *
  * This file is part of PIConGPU.
@@ -42,16 +42,16 @@
 #include "picongpu/fields/MaxwellSolver/YeePML/Field.hpp"
 #include <pmacc/particles/operations/CountParticles.hpp>
 
+#include <pmacc/communication/manager_common.hpp>
 #include <pmacc/dataManagement/DataConnector.hpp>
+#include <pmacc/Environment.hpp>
 #include <pmacc/mappings/simulation/GridController.hpp>
 #include <pmacc/mappings/simulation/SubGrid.hpp>
 #include <pmacc/dimensions/GridLayout.hpp>
 #include <pmacc/pluginSystem/PluginConnector.hpp>
 #include "picongpu/simulation/control/MovingWindow.hpp"
 #include <pmacc/math/Vector.hpp>
-#if( PMACC_CUDA_ENABLED == 1 )
-#   include <pmacc/particles/memory/buffers/MallocMCBuffer.hpp>
-#endif
+#include <pmacc/particles/memory/buffers/MallocMCBuffer.hpp>
 #include <pmacc/traits/Limits.hpp>
 
 #include "picongpu/plugins/output/IIOBackend.hpp"
@@ -80,7 +80,7 @@
 #include <boost/type_traits.hpp>
 
 #if !defined(_WIN32)
-#include <unistd.h>
+#    include <unistd.h>
 #endif
 
 #include <sstream>
@@ -92,1590 +92,1480 @@
 
 namespace picongpu
 {
+    namespace adios
+    {
+        using namespace pmacc;
+
+
+        namespace po = boost::program_options;
+
+        template<unsigned DIM>
+        int64_t defineAdiosVar(
+            int64_t group_id,
+            const char* name,
+            const char* path,
+            enum ADIOS_DATATYPES type,
+            pmacc::math::UInt64<DIM> dimensions,
+            pmacc::math::UInt64<DIM> globalDimensions,
+            pmacc::math::UInt64<DIM> offset,
+            bool compression,
+            std::string compressionMethod)
+        {
+            int64_t var_id = 0;
+
+            std::string const revertedDimensions = dimensions.revert().toString(",", "");
+            std::string const revertedGlobalDimensions = globalDimensions.revert().toString(",", "");
+            std::string const revertedOffset = offset.revert().toString(",", "");
+            var_id = adios_define_var(
+                group_id,
+                name,
+                path,
+                type,
+                revertedDimensions.c_str(),
+                revertedGlobalDimensions.c_str(),
+                revertedOffset.c_str());
+
+            if(compression)
+            {
+                /* enable adios transform layer for variable */
+                adios_set_transform(var_id, compressionMethod.c_str());
+            }
 
-namespace adios
-{
+            log<picLog::INPUT_OUTPUT>("ADIOS: Defined varID=%1% for '%2%' at %3% for %4%/%5% elements") % var_id
+                % std::string(name) % offset.toString() % dimensions.toString() % globalDimensions.toString();
+            return var_id;
+        }
 
-using namespace pmacc;
+        /** Writes simulation data to adios files.
+         *
+         * Implements the IIOBackend interface.
+         */
+        class ADIOSWriter : public IIOBackend
+        {
+        public:
+            struct Help : public plugins::multi::IHelp
+            {
+                /** creates an ADIOSWriter instance and returns it as ISlave
+                 *
+                 * @param help plugin defined help
+                 * @param id index of the plugin, range: [0;help->getNumPlugins())
+                 * @param cellDescription passed through unchanged to the ADIOSWriter constructor
+                 */
+                std::shared_ptr<ISlave> create(
+                    std::shared_ptr<IHelp>& help,
+                    size_t const id,
+                    MappingDesc* cellDescription)
+                {
+                    return std::shared_ptr<ISlave>(new ADIOSWriter(help, id, cellDescription));
+                }
 
+                plugins::multi::Option<std::string> notifyPeriod = {"period", "enable ADIOS IO [for each n-th step]"};
 
+                plugins::multi::Option<std::string> source = {"source", "data sources: ", "species_all, fields_all"};
 
-namespace po = boost::program_options;
+                plugins::multi::Option<std::string> fileName = {"file", "ADIOS output filename (prefix)"};
 
-template <unsigned DIM>
-int64_t defineAdiosVar(int64_t group_id,
-                       const char * name,
-                       const char * path,
-                       enum ADIOS_DATATYPES type,
-                       pmacc::math::UInt64<DIM> dimensions,
-                       pmacc::math::UInt64<DIM> globalDimensions,
-                       pmacc::math::UInt64<DIM> offset,
-                       bool compression,
-                       std::string compressionMethod)
-{
-    int64_t var_id = 0;
-
-    std::string const revertedDimensions =
-        dimensions.revert().toString(",", "");
-    std::string const revertedGlobalDimensions =
-        globalDimensions.revert().toString(",", "");
-    std::string const revertedOffset =
-        offset.revert().toString(",", "");
-    var_id = adios_define_var(
-        group_id, name, path, type,
-        revertedDimensions.c_str(),
-        revertedGlobalDimensions.c_str(),
-        revertedOffset.c_str()
-    );
-
-    if(compression)
-    {
-        /* enable adios transform layer for variable */
-        adios_set_transform(var_id, compressionMethod.c_str());
-    }
+                std::vector<std::string> allowedDataSources = {"species_all", "fields_all"};
 
-    log<picLog::INPUT_OUTPUT > ("ADIOS: Defined varID=%1% for '%2%' at %3% for %4%/%5% elements") %
-                var_id % std::string(name) % offset.toString() % dimensions.toString() % globalDimensions.toString();
-    return var_id;
-}
+                plugins::multi::Option<uint32_t> numAggregators
+                    = {"aggregators", "Number of aggregators [0 == number of MPI processes]", 0u};
 
-/** Writes simulation data to adios files.
- *
- * Implements the IIOBackend interface.
- */
-class ADIOSWriter : public IIOBackend
-{
-public:
-    struct Help : public plugins::multi::IHelp
-    {
-        /** creates a instance of ISlave
-         *
-         * @tparam T_Slave type of the interface implementation (must inherit from ISlave)
-         * @param help plugin defined help
-         * @param id index of the plugin, range: [0;help->getNumPlugins())
-         */
-        std::shared_ptr< ISlave > create(
-            std::shared_ptr< IHelp > & help,
-            size_t const id,
-            MappingDesc* cellDescription
-        )
-        {
-            return std::shared_ptr< ISlave >(
-                new ADIOSWriter(
-                    help,
-                    id,
-                    cellDescription
-                )
-            );
-        }
+                plugins::multi::Option<uint32_t> numOSTs = {"ost", "Number of OST", 1u};
 
-        plugins::multi::Option< std::string > notifyPeriod = {
-            "period",
-            "enable ADIOS IO [for each n-th step]"
-        };
+                plugins::multi::Option<uint32_t> disableMeta
+                    = {"disable-meta",
+                       "Disable online gather and write of a global meta file, can be time consuming (use `bpmeta` "
+                       "post-mortem)",
+                       0u};
 
-        plugins::multi::Option< std::string > source = {
-            "source",
-            "data sources: ",
-            "species_all, fields_all"
-        };
+                /* select MPI method, #OSTs and #aggregators */
+                plugins::multi::Option<std::string> transportParams
+                    = {"transport-params",
+                       "additional transport parameters, see ADIOS manual chapter 6.1.5, e.g., "
+                       "'random_offset=1;stripe_count=4'",
+                       ""};
 
-        plugins::multi::Option< std::string > fileName = {
-            "file",
-            "ADIOS output filename (prefix)"
-        };
+                plugins::multi::Option<std::string> compression
+                    = {"compression", "ADIOS compression method, e.g., zlib (see `adios_config -m` for help)", "none"};
 
-        std::vector< std::string > allowedDataSources = {
-            "species_all",
-            "fields_all"
-        };
+                /** defines if the plugin must register itself to the PMacc plugin system
+                 *
+                 * true = the plugin is registering itself
+                 * false = the plugin is not registering itself (plugin is controlled by another class)
+                 */
+                bool selfRegister = false;
 
-        plugins::multi::Option< uint32_t > numAggregators = {
-            "aggregators",
-            "Number of aggregators [0 == number of MPI processes]",
-            0u
-        };
+                template<typename T_TupleVector>
+                struct CreateSpeciesFilter
+                {
+                    using type = plugins::misc::SpeciesFilter<
+                        typename pmacc::math::CT::At<T_TupleVector, bmpl::int_<0>>::type,
+                        typename pmacc::math::CT::At<T_TupleVector, bmpl::int_<1>>::type>;
+                };
 
-        plugins::multi::Option< uint32_t > numOSTs = {
-            "ost",
-            "Number of OST",
-            1u
-        };
+                using AllParticlesTimesAllFilters = typename AllCombinations<
+                    bmpl::vector<FileOutputParticles, particles::filter::AllParticleFilters>>::type;
 
-        plugins::multi::Option< uint32_t > disableMeta = {
-            "disable-meta",
-            "Disable online gather and write of a global meta file, can be time consuming (use `bpmeta` post-mortem)",
-            0u
-        };
+                using AllSpeciesFilter =
+                    typename bmpl::transform<AllParticlesTimesAllFilters, CreateSpeciesFilter<bmpl::_1>>::type;
 
-        /* select MPI method, #OSTs and #aggregators */
-        plugins::multi::Option< std::string > transportParams = {
-            "transport-params",
-            "additional transport parameters, see ADIOS manual chapter 6.1.5, e.g., 'random_offset=1;stripe_count=4'",
-            ""
-        };
+                using AllEligibleSpeciesSources =
+                    typename bmpl::copy_if<AllSpeciesFilter, plugins::misc::speciesFilter::IsEligible<bmpl::_1>>::type;
 
-        plugins::multi::Option< std::string > compression = {
-            "compression",
-            "ADIOS compression method, e.g., zlib (see `adios_config -m` for help)",
-            "none"
-        };
+                using AllFieldSources = FileOutputFields;
 
-        /** defines if the plugin must register itself to the PMacc plugin system
-         *
-         * true = the plugin is registering it self
-         * false = the plugin is not registering itself (plugin is controlled by another class)
-         */
-        bool selfRegister = false;
+                //! method used by plugin controller to get --help description
+                void registerHelp(
+                    boost::program_options::options_description& desc,
+                    std::string const& masterPrefix = std::string{})
+                {
+                    meta::ForEach<AllEligibleSpeciesSources, plugins::misc::AppendName<bmpl::_1>>
+                        getEligibleDataSourceNames;
+                    getEligibleDataSourceNames(allowedDataSources);
 
-        template<typename T_TupleVector>
-        struct CreateSpeciesFilter
-        {
-            using type = plugins::misc::SpeciesFilter<
-                typename pmacc::math::CT::At<
-                    T_TupleVector,
-                    bmpl::int_<0>
-                >::type,
-                typename pmacc::math::CT::At<
-                    T_TupleVector,
-                    bmpl::int_<1>
-                >::type
-            >;
-        };
+                    meta::ForEach<AllFieldSources, plugins::misc::AppendName<bmpl::_1>> appendFieldSourceNames;
+                    appendFieldSourceNames(allowedDataSources);
 
-        using AllParticlesTimesAllFilters = typename AllCombinations<
-            bmpl::vector<
-                FileOutputParticles,
-                particles::filter::AllParticleFilters
-            >
-         >::type;
-
-        using AllSpeciesFilter = typename bmpl::transform<
-            AllParticlesTimesAllFilters,
-            CreateSpeciesFilter< bmpl::_1 >
-        >::type;
-
-        using AllEligibleSpeciesSources = typename bmpl::copy_if<
-            AllSpeciesFilter,
-            plugins::misc::speciesFilter::IsEligible< bmpl::_1 >
-        >::type;
-
-        using AllFieldSources = FileOutputFields;
-
-        ///! method used by plugin controller to get --help description
-        void registerHelp(
-            boost::program_options::options_description & desc,
-            std::string const & masterPrefix = std::string{ }
-        )
-        {
-            meta::ForEach<
-                AllEligibleSpeciesSources,
-                plugins::misc::AppendName< bmpl::_1 >
-            > getEligibleDataSourceNames;
-            getEligibleDataSourceNames( allowedDataSources );
-
-            meta::ForEach<
-                AllFieldSources,
-                plugins::misc::AppendName< bmpl::_1 >
-            > appendFieldSourceNames;
-            appendFieldSourceNames( allowedDataSources );
-
-            // string list with all possible particle sources
-            std::string concatenatedSourceNames = plugins::misc::concatenateToString(
-                allowedDataSources,
-                ", "
-            );
-
-            notifyPeriod.registerHelp(
-                desc,
-                masterPrefix + prefix
-            );
-            source.registerHelp(
-                desc,
-                masterPrefix + prefix,
-                std::string( "[" ) + concatenatedSourceNames + "]"
-            );
-            fileName.registerHelp(
-                desc,
-                masterPrefix + prefix
-            );
-
-            expandHelp(desc, "");
-            selfRegister = true;
-        }
+                    // string list with all possible data sources (species and fields)
+                    std::string concatenatedSourceNames = plugins::misc::concatenateToString(allowedDataSources, ", ");
 
-        void expandHelp(
-            boost::program_options::options_description & desc,
-            std::string const & masterPrefix = std::string{ }
-        )
-        {
-            numAggregators.registerHelp(
-                desc,
-                masterPrefix + prefix
-            );
-            numOSTs.registerHelp(
-                desc,
-                masterPrefix + prefix
-            );
-            disableMeta.registerHelp(
-                desc,
-                masterPrefix + prefix
-            );
-            transportParams.registerHelp(
-                desc,
-                masterPrefix + prefix
-            );
-            compression.registerHelp(
-                desc,
-                masterPrefix + prefix
-            );
-        }
+                    notifyPeriod.registerHelp(desc, masterPrefix + prefix);
+                    source.registerHelp(desc, masterPrefix + prefix, std::string("[") + concatenatedSourceNames + "]");
+                    fileName.registerHelp(desc, masterPrefix + prefix);
 
-        void validateOptions()
-        {
-            if( selfRegister )
-            {
-                if( notifyPeriod.empty() || fileName.empty() )
-                    throw std::runtime_error(
-                        name +
-                        ": parameter period and file must be defined"
-                    );
-
-                // check if user passed data source names are valid
-                for( auto const & dataSourceNames : source)
+                    expandHelp(desc, "");
+                    selfRegister = true;
+                }
+
+                void expandHelp(
+                    boost::program_options::options_description& desc,
+                    std::string const& masterPrefix = std::string{})
                 {
-                    auto vectorOfDataSourceNames = plugins::misc::splitString(
-                        plugins::misc::removeSpaces( dataSourceNames )
-                    );
+                    numAggregators.registerHelp(desc, masterPrefix + prefix);
+                    numOSTs.registerHelp(desc, masterPrefix + prefix);
+                    disableMeta.registerHelp(desc, masterPrefix + prefix);
+                    transportParams.registerHelp(desc, masterPrefix + prefix);
+                    compression.registerHelp(desc, masterPrefix + prefix);
+                }
 
-                    for( auto const & f : vectorOfDataSourceNames )
+                void validateOptions()
+                {
+                    if(selfRegister)
                     {
-                        if(
-                            !plugins::misc::containsObject(
-                                allowedDataSources,
-                                f
-                            )
-                        )
+                        if(notifyPeriod.empty() || fileName.empty())
+                            throw std::runtime_error(name + ": parameter period and file must be defined");
+
+                        // check if user passed data source names are valid
+                        for(auto const& dataSourceNames : source)
                         {
-                            throw std::runtime_error( name + ": unknown data source '" + f + "'" );
+                            auto vectorOfDataSourceNames
+                                = plugins::misc::splitString(plugins::misc::removeSpaces(dataSourceNames));
+
+                            for(auto const& f : vectorOfDataSourceNames)
+                            {
+                                if(!plugins::misc::containsObject(allowedDataSources, f))
+                                {
+                                    throw std::runtime_error(name + ": unknown data source '" + f + "'");
+                                }
+                            }
                         }
                     }
                 }
-            }
-        }
 
-        size_t getNumPlugins() const
-        {
-            if( selfRegister )
-                return notifyPeriod.size();
-            else
-                return 1;
-        }
+                size_t getNumPlugins() const
+                {
+                    if(selfRegister)
+                        return notifyPeriod.size();
+                    else
+                        return 1;
+                }
 
-        std::string getDescription() const
-        {
-            return description;
-        }
+                std::string getDescription() const
+                {
+                    return description;
+                }
 
-        std::string getOptionPrefix() const
-        {
-            return prefix;
-        }
+                std::string getOptionPrefix() const
+                {
+                    return prefix;
+                }
 
-        std::string getName() const
-        {
-            return name;
-        }
+                std::string getName() const
+                {
+                    return name;
+                }
 
-        std::string const name = "ADIOSWriter";
-        //! short description of the plugin
-        std::string const description = "dump simulation data with ADIOS";
-        //! prefix used for command line arguments
-        std::string const prefix = "adios";
-    };
+                std::string const name = "ADIOSWriter";
+                //! short description of the plugin
+                std::string const description = "dump simulation data with ADIOS";
+                //! prefix used for command line arguments
+                std::string const prefix = "adios";
+            };
 
-    //! must be implemented by the user
-    static std::shared_ptr< plugins::multi::IHelp > getHelp()
-    {
-        return std::shared_ptr< plugins::multi::IHelp >( new Help{ } );
-    }
-private:
+            //! must be implemented by the user
+            static std::shared_ptr<plugins::multi::IHelp> getHelp()
+            {
+                return std::shared_ptr<plugins::multi::IHelp>(new Help{});
+            }
 
-    template<typename UnitType>
-    static std::vector<float_64> createUnit(UnitType unit, uint32_t numComponents)
-    {
-        std::vector<float_64> tmp(numComponents);
-        for (uint32_t i = 0; i < numComponents; ++i)
-            tmp[i] = unit[i];
-        return tmp;
-    }
-
-    /**
-     * Write calculated fields to adios file.
-     */
-    template< typename T_Field >
-    struct GetFields
-    {
-    private:
-        using ValueType = typename T_Field::ValueType;
-        using ComponentType = typename GetComponentsType<ValueType>::type;
+        private:
+            template<typename UnitType>
+            static std::vector<float_64> createUnit(UnitType unit, uint32_t numComponents)
+            {
+                std::vector<float_64> tmp(numComponents);
+                for(uint32_t i = 0; i < numComponents; ++i)
+                    tmp[i] = unit[i];
+                return tmp;
+            }
 
-    public:
+            /**
+             * Write calculated fields to adios file.
+             */
+            template<typename T_Field>
+            struct GetFields
+            {
+            private:
+                using ValueType = typename T_Field::ValueType;
+                using ComponentType = typename GetComponentsType<ValueType>::type;
 
-        HDINLINE void operator()(ThreadParams* params)
-        {
+            public:
+                HDINLINE void operator()(ThreadParams* params)
+                {
 #ifndef __CUDA_ARCH__
-            DataConnector &dc = Environment<simDim>::get().DataConnector();
-
-            auto field = dc.get< T_Field >( T_Field::getName() );
-            params->gridLayout = field->getGridLayout();
-            const bool isDomainBound = traits::IsFieldDomainBound< T_Field >::value;
-
-            PICToAdios<ComponentType> adiosType;
-            ADIOSWriter::template writeField<ComponentType>(
-                params,
-                sizeof(ComponentType),
-                adiosType.type,
-                GetNComponents<ValueType>::value,
-                T_Field::getName(),
-                field->getHostDataBox().getPointer(),
-                isDomainBound
-            );
-
-            dc.releaseData( T_Field::getName() );
+                    DataConnector& dc = Environment<simDim>::get().DataConnector();
+
+                    auto field = dc.get<T_Field>(T_Field::getName());
+                    params->gridLayout = field->getGridLayout();
+                    const bool isDomainBound = traits::IsFieldDomainBound<T_Field>::value;
+
+                    PICToAdios<ComponentType> adiosType;
+                    ADIOSWriter::template writeField<ComponentType>(
+                        params,
+                        sizeof(ComponentType),
+                        adiosType.type,
+                        GetNComponents<ValueType>::value,
+                        T_Field::getName(),
+                        field->getHostDataBox().getPointer(),
+                        isDomainBound);
+
+                    dc.releaseData(T_Field::getName());
 #endif
-        }
+                }
+            };
 
-    };
+            /** Calculate FieldTmp with given solver and particle species
+             * and write them to adios.
+             *
+             * FieldTmp is calculated on device and then dumped to adios.
+             */
+            template<typename Solver, typename Species>
+            struct GetFields<FieldTmpOperation<Solver, Species>>
+            {
+                /*
+                 * This is only a wrapper function that allows disabling the nvcc warning
+                 * "calling a __host__ function from __host__ __device__ function".
+                 *
+                 * Use of PMACC_NO_NVCC_HDWARNING is not possible if we call a virtual
+                 * method inside of the method where we disable the warnings.
+                 * Therefore this method only forwards to operator_impl(), where we can
+                 * call virtual functions.
+                 */
+                PMACC_NO_NVCC_HDWARNING
+                HDINLINE void operator()(ThreadParams* tparam)
+                {
+                    this->operator_impl(tparam);
+                }
 
-    /** Calculate FieldTmp with given solver and particle species
-     * and write them to adios.
-     *
-     * FieldTmp is calculated on device and than dumped to adios.
-     */
-    template< typename Solver, typename Species >
-    struct GetFields<FieldTmpOperation<Solver, Species> >
-    {
+            private:
+                typedef typename FieldTmp::ValueType ValueType;
+                typedef typename GetComponentsType<ValueType>::type ComponentType;
 
-        /*
-         * This is only a wrapper function to allow disable nvcc warnings.
-         * Warning: calling a __host__ function from __host__ __device__
-         * function.
-         * Use of PMACC_NO_NVCC_HDWARNING is not possible if we call a virtual
-         * method inside of the method were we disable the warnings.
-         * Therefore we create this method and call a new method were we can
-         * call virtual functions.
-         */
-        PMACC_NO_NVCC_HDWARNING
-        HDINLINE void operator()(ThreadParams* tparam)
-        {
-            this->operator_impl(tparam);
-        }
-    private:
-        typedef typename FieldTmp::ValueType ValueType;
-        typedef typename GetComponentsType<ValueType>::type ComponentType;
+                /** Create a name for the adios identifier.
+                 */
+                static std::string getName()
+                {
+                    return FieldTmpOperation<Solver, Species>::getName();
+                }
 
-        /** Create a name for the adios identifier.
-         */
-        static std::string getName()
-        {
-            return FieldTmpOperation<Solver, Species>::getName();
-        }
+                HINLINE void operator_impl(ThreadParams* params)
+                {
+                    DataConnector& dc = Environment<>::get().DataConnector();
+
+                    /*## update field ##*/
+
+                    /* load FieldTmp without copying data to host */
+                    PMACC_CASSERT_MSG(_please_allocate_at_least_one_FieldTmp_in_memory_param, fieldTmpNumSlots > 0);
+                    auto fieldTmp = dc.get<FieldTmp>(FieldTmp::getUniqueId(0), true);
+                    /* load particles without copying particle data to host */
+                    auto speciesTmp = dc.get<Species>(Species::FrameType::getName(), true);
+
+                    fieldTmp->getGridBuffer().getDeviceBuffer().setValue(ValueType::create(0.0));
+                    /* run algorithm: compute FieldTmp from the species with the given Solver */
+                    fieldTmp->template computeValue<CORE + BORDER, Solver>(*speciesTmp, params->currentStep);
+
+                    EventTask fieldTmpEvent = fieldTmp->asyncCommunication(__getTransactionEvent());
+                    __setTransactionEvent(fieldTmpEvent);
+                    /* copy data to host so that we can write it to disk */
+                    fieldTmp->getGridBuffer().deviceToHost();
+                    dc.releaseData(Species::FrameType::getName());
+                    /*## finish update field ##*/
+
+                    const uint32_t components = GetNComponents<ValueType>::value;
+                    PICToAdios<ComponentType> adiosType;
+
+                    params->gridLayout = fieldTmp->getGridLayout();
+                    const bool isDomainBound = traits::IsFieldDomainBound<FieldTmp>::value;
+                    /* write data to ADIOS file */
+                    ADIOSWriter::template writeField<ComponentType>(
+                        params,
+                        sizeof(ComponentType),
+                        adiosType.type,
+                        components,
+                        getName(),
+                        fieldTmp->getHostDataBox().getPointer(),
+                        isDomainBound);
+
+                    dc.releaseData(FieldTmp::getUniqueId(0));
+                }
+            };
+
+            template<typename T_Field>
+            static void defineFieldVar(
+                ThreadParams* params,
+                uint32_t nComponents,
+                ADIOS_DATATYPES adiosType,
+                const std::string name,
+                std::vector<float_64> unit,
+                std::vector<float_64> unitDimension,
+                std::vector<std::vector<float_X>> inCellPosition,
+                float_X timeOffset)
+            {
+                PICToAdios<float_64> adiosDoubleType;
+                PICToAdios<float_X> adiosFloatXType;
 
-        HINLINE void operator_impl(ThreadParams* params)
-        {
-            DataConnector &dc = Environment<>::get().DataConnector();
-
-            /*## update field ##*/
-
-            /*load FieldTmp without copy data to host*/
-            PMACC_CASSERT_MSG(
-                _please_allocate_at_least_one_FieldTmp_in_memory_param,
-                fieldTmpNumSlots > 0
-            );
-            auto fieldTmp = dc.get< FieldTmp >( FieldTmp::getUniqueId( 0 ), true );
-            /*load particle without copy particle data to host*/
-            auto speciesTmp = dc.get< Species >( Species::FrameType::getName(), true );
-
-            fieldTmp->getGridBuffer().getDeviceBuffer().setValue(ValueType::create(0.0));
-            /*run algorithm*/
-            fieldTmp->template computeValue< CORE + BORDER, Solver >(*speciesTmp, params->currentStep);
-
-            EventTask fieldTmpEvent = fieldTmp->asyncCommunication(__getTransactionEvent());
-            __setTransactionEvent(fieldTmpEvent);
-            /* copy data to host that we can write same to disk*/
-            fieldTmp->getGridBuffer().deviceToHost();
-            dc.releaseData(Species::FrameType::getName());
-            /*## finish update field ##*/
-
-            const uint32_t components = GetNComponents<ValueType>::value;
-            PICToAdios<ComponentType> adiosType;
-
-            params->gridLayout = fieldTmp->getGridLayout();
-            const bool isDomainBound = traits::IsFieldDomainBound< FieldTmp >::value;
-            /*write data to ADIOS file*/
-            ADIOSWriter::template writeField<ComponentType>(
-                params,
-                sizeof(ComponentType),
-                adiosType.type,
-                components,
-                getName(),
-                fieldTmp->getHostDataBox().getPointer(),
-                isDomainBound
-            );
-
-            dc.releaseData( FieldTmp::getUniqueId( 0 ) );
+                auto const componentNames = plugins::misc::getComponentNames(nComponents);
 
-        }
+                /* parameter checking */
+                PMACC_ASSERT(unit.size() == nComponents);
+                PMACC_ASSERT(inCellPosition.size() == nComponents);
+                for(uint32_t n = 0; n < nComponents; ++n)
+                    PMACC_ASSERT(inCellPosition.at(n).size() == simDim);
+                PMACC_ASSERT(unitDimension.size() == 7); // seven openPMD base units
 
-    };
+                const std::string recordName(params->adiosBasePath + std::string(ADIOS_PATH_FIELDS) + name);
 
-    template< typename T_Field >
-    static void defineFieldVar(ThreadParams* params,
-        uint32_t nComponents, ADIOS_DATATYPES adiosType, const std::string name,
-        std::vector<float_64> unit, std::vector<float_64> unitDimension,
-        std::vector<std::vector<float_X> > inCellPosition, float_X timeOffset)
-    {
-        PICToAdios<float_64> adiosDoubleType;
-        PICToAdios<float_X> adiosFloatXType;
+                auto fieldsSizeDims = params->fieldsSizeDims;
+                auto fieldsGlobalSizeDims = params->fieldsGlobalSizeDims;
+                auto fieldsOffsetDims = params->fieldsOffsetDims;
 
-        auto const componentNames = plugins::misc::getComponentNames( nComponents );
+                /* Patch for non-domain-bound fields
+                 * This is an ugly fix to allow output of reduced 1d PML buffers
+                 */
+                if(!traits::IsFieldDomainBound<T_Field>::value)
+                {
+                    DataConnector& dc = Environment<>::get().DataConnector();
+                    auto field = dc.get<T_Field>(T_Field::getName());
+                    fieldsSizeDims = precisionCast<uint64_t>(field->getGridLayout().getDataSpaceWithoutGuarding());
+                    dc.releaseData(T_Field::getName());
+
+                    /* Scan the PML buffer local size along all local domains
+                     * This code is based on the same operation in hdf5::Field::writeField(),
+                     * the same comments apply here
+                     */
+                    log<picLog::INPUT_OUTPUT>("ADIOS:  (begin) collect PML sizes for %1%") % name;
+                    auto& gridController = Environment<simDim>::get().GridController();
+                    auto const numRanks = uint64_t{gridController.getGlobalSize()};
+                    /* Use domain position-based rank, not MPI rank, to be independent
+                     * of the MPI rank assignment scheme
+                     */
+                    auto const rank = uint64_t{gridController.getScalarPosition()};
+                    std::vector<uint64_t> localSizes(2u * numRanks, 0u);
+                    uint64_t localSizeInfo[2] = {fieldsSizeDims[0], rank};
+                    __getTransactionEvent().waitForFinished();
+                    MPI_CHECK(MPI_Allgather(
+                        localSizeInfo,
+                        2,
+                        MPI_UINT64_T,
+                        &(*localSizes.begin()),
+                        2,
+                        MPI_UINT64_T,
+                        gridController.getCommunicator().getMPIComm()));
+                    uint64_t globalOffsetFile = 0;
+                    uint64_t globalSize = 0;
+                    for(uint64_t r = 0; r < numRanks; ++r)
+                    {
+                        globalSize += localSizes.at(2u * r);
+                        if(localSizes.at(2u * r + 1u) < rank)
+                            globalOffsetFile += localSizes.at(2u * r);
+                    }
+                    log<picLog::INPUT_OUTPUT>("ADIOS:  (end) collect PML sizes for %1%") % name;
 
-        /* parameter checking */
-        PMACC_ASSERT( unit.size() == nComponents );
-        PMACC_ASSERT( inCellPosition.size() == nComponents );
-        for( uint32_t n = 0; n < nComponents; ++n )
-            PMACC_ASSERT( inCellPosition.at(n).size() == simDim );
-        PMACC_ASSERT(unitDimension.size() == 7); // seven openPMD base units
+                    fieldsGlobalSizeDims = pmacc::math::UInt64<simDim>::create(1);
+                    fieldsGlobalSizeDims[0] = globalSize;
+                    fieldsOffsetDims = pmacc::math::UInt64<simDim>::create(0);
+                    fieldsOffsetDims[0] = globalOffsetFile;
+                }
 
-        const std::string recordName( params->adiosBasePath +
-            std::string(ADIOS_PATH_FIELDS) + name );
+                for(uint32_t c = 0; c < nComponents; c++)
+                {
+                    std::string datasetName = recordName;
+                    if(nComponents > 1)
+                        datasetName += "/" + componentNames[c];
+
+                    /* define adios var for field, e.g. field_FieldE_y */
+                    const char* path = nullptr;
+                    int64_t adiosFieldVarId = defineAdiosVar<simDim>(
+                        params->adiosGroupHandle,
+                        datasetName.c_str(),
+                        path,
+                        adiosType,
+                        fieldsSizeDims,
+                        fieldsGlobalSizeDims,
+                        fieldsOffsetDims,
+                        true,
+                        params->adiosCompression);
+
+                    params->adiosFieldVarIds.push_back(adiosFieldVarId);
+
+                    /* already add the unitSI and further attribute so `adios_group_size`
+                     * calculates the reservation for the buffer correctly */
+                    ADIOS_CMD(adios_define_attribute_byvalue(
+                        params->adiosGroupHandle,
+                        "position",
+                        datasetName.c_str(),
+                        adiosFloatXType.type,
+                        simDim,
+                        &(*inCellPosition.at(c).begin())));
+
+                    ADIOS_CMD(adios_define_attribute_byvalue(
+                        params->adiosGroupHandle,
+                        "unitSI",
+                        datasetName.c_str(),
+                        adiosDoubleType.type,
+                        1,
+                        &unit.at(c)));
+                }
 
-        auto fieldsSizeDims = params->fieldsSizeDims;
-        auto fieldsGlobalSizeDims = params->fieldsGlobalSizeDims;
-        auto fieldsOffsetDims = params->fieldsOffsetDims;
+                ADIOS_CMD(adios_define_attribute_byvalue(
+                    params->adiosGroupHandle,
+                    "unitDimension",
+                    recordName.c_str(),
+                    adiosDoubleType.type,
+                    7,
+                    &(*unitDimension.begin())));
 
-        /* Patch for non-domain-bound fields
-         * This is an ugly fix to allow output of reduced 1d PML buffers,
-         * that are the same size on each domain.
-         * This code is to be replaced with the openPMD output plugin soon.
-         */
-        if( !traits::IsFieldDomainBound< T_Field >::value )
-        {
-            DataConnector &dc = Environment<>::get().DataConnector();
-            auto field = dc.get< T_Field >( T_Field::getName() );
-            fieldsSizeDims = precisionCast< uint64_t >( field->getGridLayout().getDataSpaceWithoutGuarding() );
-            dc.releaseData( T_Field::getName() );
-            auto const & gridController = Environment<simDim>::get().GridController();
-            auto const numRanks = gridController.getGlobalSize();
-            auto const rank = gridController.getGlobalRank();
-            fieldsGlobalSizeDims = pmacc::math::UInt64<simDim>::create( 1 );
-            fieldsGlobalSizeDims[ 0 ] = numRanks * fieldsSizeDims[ 0 ];
-            fieldsOffsetDims = pmacc::math::UInt64<simDim>::create( 0 );
-            fieldsOffsetDims[ 0 ] = rank * fieldsSizeDims[ 0 ];
-        }
+                ADIOS_CMD(adios_define_attribute_byvalue(
+                    params->adiosGroupHandle,
+                    "timeOffset",
+                    recordName.c_str(),
+                    adiosFloatXType.type,
+                    1,
+                    &timeOffset));
+
+                const std::string geometry("cartesian");
+                ADIOS_CMD(adios_define_attribute_byvalue(
+                    params->adiosGroupHandle,
+                    "geometry",
+                    recordName.c_str(),
+                    adios_string,
+                    1,
+                    (void*) geometry.c_str()));
+
+                const std::string dataOrder("C");
+                ADIOS_CMD(adios_define_attribute_byvalue(
+                    params->adiosGroupHandle,
+                    "dataOrder",
+                    recordName.c_str(),
+                    adios_string,
+                    1,
+                    (void*) dataOrder.c_str()));
 
-        for( uint32_t c = 0; c < nComponents; c++ )
-        {
-            std::string datasetName = recordName;
-            if (nComponents > 1)
-                datasetName +=  "/" + componentNames[c];
+                if(simDim == DIM2)
+                {
+                    const char* axisLabels[] = {"y", "x"}; // 2D: F[y][x]
+                    ADIOS_CMD(adios_define_attribute_byvalue(
+                        params->adiosGroupHandle,
+                        "axisLabels",
+                        recordName.c_str(),
+                        adios_string_array,
+                        simDim,
+                        axisLabels));
+                }
+                if(simDim == DIM3)
+                {
+                    const char* axisLabels[] = {"z", "y", "x"}; // 3D: F[z][y][x]
+                    ADIOS_CMD(adios_define_attribute_byvalue(
+                        params->adiosGroupHandle,
+                        "axisLabels",
+                        recordName.c_str(),
+                        adios_string_array,
+                        simDim,
+                        axisLabels));
+                }
 
-            /* define adios var for field, e.g. field_FieldE_y */
-            const char* path = nullptr;
-            int64_t adiosFieldVarId = defineAdiosVar<simDim>(
-                    params->adiosGroupHandle,
-                    datasetName.c_str(),
-                    path,
-                    adiosType,
-                    fieldsSizeDims,
-                    fieldsGlobalSizeDims,
-                    fieldsOffsetDims,
-                    true,
-                    params->adiosCompression);
-
-            params->adiosFieldVarIds.push_back(adiosFieldVarId);
-
-            /* already add the unitSI and further attribute so `adios_group_size`
-             * calculates the reservation for the buffer correctly */
-            ADIOS_CMD(adios_define_attribute_byvalue(params->adiosGroupHandle,
-                      "position", datasetName.c_str(),
-                      adiosFloatXType.type, simDim, &(*inCellPosition.at(c).begin()) ));
-
-            ADIOS_CMD(adios_define_attribute_byvalue(params->adiosGroupHandle,
-                      "unitSI", datasetName.c_str(),
-                      adiosDoubleType.type, 1, &unit.at(c) ));
-        }
+                // cellSize is {x, y, z} but fields are F[z][y][x]
+                std::vector<float_X> gridSpacing(simDim, 0.0);
+                for(uint32_t d = 0; d < simDim; ++d)
+                    gridSpacing.at(simDim - 1 - d) = cellSize[d];
 
-        ADIOS_CMD(adios_define_attribute_byvalue(params->adiosGroupHandle,
-            "unitDimension", recordName.c_str(),
-            adiosDoubleType.type, 7, &(*unitDimension.begin()) ));
+                ADIOS_CMD(adios_define_attribute_byvalue(
+                    params->adiosGroupHandle,
+                    "gridSpacing",
+                    recordName.c_str(),
+                    adiosFloatXType.type,
+                    simDim,
+                    &(*gridSpacing.begin())));
+
+                /* globalSlideOffset due to gpu slides between origin at time step 0
+                 * and origin at current time step
+                 * ATTENTION: splash offsets are globalSlideOffset + picongpu offsets
+                 */
+                DataSpace<simDim> globalSlideOffset;
+                const pmacc::Selection<simDim> localDomain = Environment<simDim>::get().SubGrid().getLocalDomain();
+                const uint32_t numSlides = MovingWindow::getInstance().getSlideCounter(params->currentStep);
+                globalSlideOffset.y() += numSlides * localDomain.size.y();
+
+                // globalDimensions is {x, y, z} but fields are F[z][y][x]
+                std::vector<float_64> gridGlobalOffset(simDim, 0.0);
+                for(uint32_t d = 0; d < simDim; ++d)
+                    gridGlobalOffset.at(simDim - 1 - d) = float_64(cellSize[d])
+                        * float_64(params->window.globalDimensions.offset[d] + globalSlideOffset[d]);
+
+                ADIOS_CMD(adios_define_attribute_byvalue(
+                    params->adiosGroupHandle,
+                    "gridGlobalOffset",
+                    recordName.c_str(),
+                    adiosDoubleType.type,
+                    simDim,
+                    &(*gridGlobalOffset.begin())));
 
-        ADIOS_CMD(adios_define_attribute_byvalue(params->adiosGroupHandle,
-            "timeOffset", recordName.c_str(),
-            adiosFloatXType.type, 1, &timeOffset ));
+                ADIOS_CMD(adios_define_attribute_byvalue(
+                    params->adiosGroupHandle,
+                    "gridUnitSI",
+                    recordName.c_str(),
+                    adiosDoubleType.type,
+                    1,
+                    (void*) &UNIT_LENGTH));
+
+                const std::string fieldSmoothing("none");
+                ADIOS_CMD(adios_define_attribute_byvalue(
+                    params->adiosGroupHandle,
+                    "fieldSmoothing",
+                    recordName.c_str(),
+                    adios_string,
+                    1,
+                    (void*) fieldSmoothing.c_str()));
+            }
 
-        const std::string geometry( "cartesian" );
-        ADIOS_CMD(adios_define_attribute_byvalue(params->adiosGroupHandle,
-            "geometry", recordName.c_str(),
-            adios_string, 1, (void*)geometry.c_str() ));
+            /**
+             * Collect field sizes to set adios group size.
+             */
+            template<typename T>
+            struct CollectFieldsSizes
+            {
+            public:
+                typedef typename T::ValueType ValueType;
+                typedef typename T::UnitValueType UnitType;
+                typedef typename GetComponentsType<ValueType>::type ComponentType;
 
-        const std::string dataOrder( "C" );
-        ADIOS_CMD(adios_define_attribute_byvalue(params->adiosGroupHandle,
-            "dataOrder", recordName.c_str(),
-            adios_string, 1, (void*)dataOrder.c_str() ));
+                static std::vector<float_64> getUnit()
+                {
+                    UnitType unit = T::getUnit();
+                    return createUnit(unit, T::numComponents);
+                }
 
-        if( simDim == DIM2 )
-        {
-            const char* axisLabels[] = {"y", "x"};      // 2D: F[y][x]
-            ADIOS_CMD(adios_define_attribute_byvalue(params->adiosGroupHandle,
-                "axisLabels", recordName.c_str(),
-                adios_string_array, simDim, axisLabels ));
-        }
-        if( simDim == DIM3 )
-        {
-            const char* axisLabels[] = {"z", "y", "x"}; // 3D: F[z][y][x]
-            ADIOS_CMD(adios_define_attribute_byvalue(params->adiosGroupHandle,
-                "axisLabels", recordName.c_str(),
-                adios_string_array, simDim, axisLabels ));
-        }
+                HDINLINE void operator()(ThreadParams* params)
+                {
+#ifndef __CUDA_ARCH__
+                    const uint32_t components = T::numComponents;
+
+                    auto localSize = params->window.localDimensions.size;
+                    /* Patch for non-domain-bound fields
+                     * This is an ugly fix to allow output of reduced 1d PML buffers,
+                     * using the field's own local buffer size instead of the local window size.
+                     * This code is to be replaced with the openPMD output plugin soon.
+                     */
+                    if(!traits::IsFieldDomainBound<T>::value)
+                    {
+                        DataConnector& dc = Environment<>::get().DataConnector();
+                        auto field = dc.get<T>(T::getName());
+                        localSize = field->getGridLayout().getDataSpaceWithoutGuarding();
+                        dc.releaseData(T::getName());
+                    }
 
-        // cellSize is {x, y, z} but fields are F[z][y][x]
-        std::vector<float_X> gridSpacing(simDim, 0.0);
-        for( uint32_t d = 0; d < simDim; ++d )
-            gridSpacing.at(simDim-1-d) = cellSize[d];
+                    // adios buffer size for this dataset (all components)
+                    uint64_t localGroupSize = localSize.productOfComponents() * sizeof(ComponentType) * components;
 
-        ADIOS_CMD(adios_define_attribute_byvalue(params->adiosGroupHandle,
-            "gridSpacing", recordName.c_str(),
-            adiosFloatXType.type, simDim, &(*gridSpacing.begin()) ));
+                    params->adiosGroupSize += localGroupSize;
 
-        /* globalSlideOffset due to gpu slides between origin at time step 0
-         * and origin at current time step
-         * ATTENTION: splash offset are globalSlideOffset + picongpu offsets
-         */
-        DataSpace<simDim> globalSlideOffset;
-        const pmacc::Selection<simDim>& localDomain = Environment<simDim>::get().SubGrid().getLocalDomain();
-        const uint32_t numSlides = MovingWindow::getInstance().getSlideCounter(params->currentStep);
-        globalSlideOffset.y() += numSlides * localDomain.size.y();
-
-        // globalDimensions is {x, y, z} but fields are F[z][y][x]
-        std::vector<float_64> gridGlobalOffset(simDim, 0.0);
-        for( uint32_t d = 0; d < simDim; ++d )
-            gridGlobalOffset.at(simDim-1-d) =
-                float_64(cellSize[d]) *
-                float_64(params->window.globalDimensions.offset[d] +
-                         globalSlideOffset[d]);
-
-        ADIOS_CMD(adios_define_attribute_byvalue(params->adiosGroupHandle,
-            "gridGlobalOffset", recordName.c_str(),
-            adiosDoubleType.type, simDim, &(*gridGlobalOffset.begin()) ));
-
-        ADIOS_CMD(adios_define_attribute_byvalue(params->adiosGroupHandle,
-            "gridUnitSI", recordName.c_str(),
-            adiosDoubleType.type, 1, (void*)&UNIT_LENGTH ));
-
-        const std::string fieldSmoothing( "none" );
-        ADIOS_CMD(adios_define_attribute_byvalue(params->adiosGroupHandle,
-            "fieldSmoothing", recordName.c_str(),
-            adios_string, 1, (void*)fieldSmoothing.c_str() ));
-    }
-
-    /**
-     * Collect field sizes to set adios group size.
-     */
-    template< typename T >
-    struct CollectFieldsSizes
-    {
-    public:
-        typedef typename T::ValueType ValueType;
-        typedef typename T::UnitValueType UnitType;
-        typedef typename GetComponentsType<ValueType>::type ComponentType;
+                    // convert into a std::vector of std::vector format for the writeField API
+                    const traits::FieldPosition<fields::CellType, T> fieldPos;
 
-        static std::vector<float_64> getUnit()
-        {
-            UnitType unit = T::getUnit();
-            return createUnit(unit, T::numComponents);
-        }
+                    std::vector<std::vector<float_X>> inCellPosition;
+                    for(uint32_t n = 0; n < T::numComponents; ++n)
+                    {
+                        std::vector<float_X> inCellPositonComponent;
+                        for(uint32_t d = 0; d < simDim; ++d)
+                            inCellPositonComponent.push_back(fieldPos()[n][d]);
+                        inCellPosition.push_back(inCellPositonComponent);
+                    }
 
-        HDINLINE void operator()(ThreadParams* params)
-        {
-#ifndef __CUDA_ARCH__
-            const uint32_t components = T::numComponents;
+                    /** \todo check if always correct at this point, depends on solver
+                     *        implementation */
+                    const float_X timeOffset = 0.0;
+
+                    PICToAdios<ComponentType> adiosType;
+                    defineFieldVar<T>(
+                        params,
+                        components,
+                        adiosType.type,
+                        T::getName(),
+                        getUnit(),
+                        T::getUnitDimension(),
+                        inCellPosition,
+                        timeOffset);
+#endif
+                }
+            };
 
-            auto localSize = params->window.localDimensions.size;
-            /* Patch for non-domain-bound fields
-             * This is an ugly fix to allow output of reduced 1d PML buffers,
-             * that are the same size on each domain.
-             * This code is to be replaced with the openPMD output plugin soon.
+            /**
+             * Collect field sizes to set adios group size.
+             * Specialization.
              */
-            if( !traits::IsFieldDomainBound< T >::value )
+            template<typename Solver, typename Species>
+            struct CollectFieldsSizes<FieldTmpOperation<Solver, Species>>
             {
-                DataConnector &dc = Environment<>::get().DataConnector();
-                auto field = dc.get< T >( T::getName() );
-                localSize = field->getGridLayout().getDataSpaceWithoutGuarding();
-                dc.releaseData( T::getName() );
-            }
-
-            // adios buffer size for this dataset (all components)
-            uint64_t localGroupSize =
-                    localSize.productOfComponents() *
-                    sizeof(ComponentType) *
-                    components;
-
-            params->adiosGroupSize += localGroupSize;
+            public:
+                PMACC_NO_NVCC_HDWARNING
+                HDINLINE void operator()(ThreadParams* tparam)
+                {
+                    this->operator_impl(tparam);
+                }
 
-            // convert in a std::vector of std::vector format for writeField API
-            const traits::FieldPosition<fields::CellType, T> fieldPos;
+            private:
+                typedef typename FieldTmp::ValueType ValueType;
+                typedef typename FieldTmp::UnitValueType UnitType;
+                typedef typename GetComponentsType<ValueType>::type ComponentType;
 
-            std::vector<std::vector<float_X> > inCellPosition;
-            for( uint32_t n = 0; n < T::numComponents; ++n )
-            {
-                std::vector<float_X> inCellPositonComponent;
-                for( uint32_t d = 0; d < simDim; ++d )
-                    inCellPositonComponent.push_back( fieldPos()[n][d] );
-                inCellPosition.push_back( inCellPositonComponent );
-            }
+                /** Name used as adios identifier: forwards FieldTmpOperation<Solver, Species>::getName().
+                 */
+                static std::string getName()
+                {
+                    return FieldTmpOperation<Solver, Species>::getName();
+                }
 
-            /** \todo check if always correct at this point, depends on solver
-             *        implementation */
-            const float_X timeOffset = 0.0;
+                /** Get the unit for the result from the solver via createUnit() and ValueType's component count */
+                static std::vector<float_64> getUnit()
+                {
+                    UnitType unit = FieldTmp::getUnit<Solver>();
+                    const uint32_t components = GetNComponents<ValueType>::value;
+                    return createUnit(unit, components);
+                }
 
-            PICToAdios<ComponentType> adiosType;
-            defineFieldVar< T >(params, components, adiosType.type, T::getName(), getUnit(),
-                T::getUnitDimension(), inCellPosition, timeOffset);
-#endif
-        }
-    };
-
-    /**
-     * Collect field sizes to set adios group size.
-     * Specialization.
-     */
-    template< typename Solver, typename Species >
-    struct CollectFieldsSizes<FieldTmpOperation<Solver, Species> >
-    {
-    public:
+                HINLINE void operator_impl(ThreadParams* params)
+                {
+                    const uint32_t components = GetNComponents<ValueType>::value;
+
+                    auto localSize = params->window.localDimensions.size;
+                    /* Patch for non-domain-bound fields
+                     * This is an ugly fix to allow output of reduced 1d PML buffers,
+                     * using the field's own local buffer size instead of the local window size.
+                     * This code is to be replaced with the openPMD output plugin soon.
+                     */
+                    if(!traits::IsFieldDomainBound<FieldTmp>::value)
+                    {
+                        DataConnector& dc = Environment<>::get().DataConnector();
+                        auto field = dc.get<FieldTmp>(FieldTmp::getName());
+                        localSize = field->getGridLayout().getDataSpaceWithoutGuarding();
+                        dc.releaseData(FieldTmp::getName());
+                    }
 
-        PMACC_NO_NVCC_HDWARNING
-        HDINLINE void operator()(ThreadParams* tparam)
-        {
-            this->operator_impl(tparam);
-        }
+                    // adios buffer size for this dataset (all components)
+                    uint64_t localGroupSize = localSize.productOfComponents() * sizeof(ComponentType) * components;
+
+                    params->adiosGroupSize += localGroupSize;
+
+                    /*wrap in a one-component vector for writeField API*/
+                    const traits::FieldPosition<fields::CellType, FieldTmp> fieldPos;
+
+                    std::vector<std::vector<float_X>> inCellPosition;
+                    std::vector<float_X> inCellPositonComponent;
+                    for(uint32_t d = 0; d < simDim; ++d)
+                        inCellPositonComponent.push_back(fieldPos()[0][d]);
+                    inCellPosition.push_back(inCellPositonComponent);
+
+                    /** \todo check if always correct at this point, depends on solver
+                     *        implementation */
+                    const float_X timeOffset = 0.0;
+
+                    PICToAdios<ComponentType> adiosType;
+                    defineFieldVar<FieldTmp>(
+                        params,
+                        components,
+                        adiosType.type,
+                        getName(),
+                        getUnit(),
+                        FieldTmp::getUnitDimension<Solver>(),
+                        inCellPosition,
+                        timeOffset);
+                }
+            };
 
-   private:
-        typedef typename FieldTmp::ValueType ValueType;
-        typedef typename FieldTmp::UnitValueType UnitType;
-        typedef typename GetComponentsType<ValueType>::type ComponentType;
+        public:
+            /** constructor
+             *
+             * @param help instance of the class Help
+             * @param id index of this plugin instance within help
+             * @param cellDescription PIConGPU cell description information for kernel index mapping
+             */
+            ADIOSWriter(std::shared_ptr<plugins::multi::IHelp>& help, size_t const id, MappingDesc* cellDescription)
+                : m_help(std::static_pointer_cast<Help>(help))
+                , m_id(id)
+                , m_cellDescription(cellDescription)
+                , outputDirectory("bp")
+                , lastSpeciesSyncStep(pmacc::traits::limits::Max<uint32_t>::value)
+            {
+                mThreadParams.adiosAggregators = m_help->numAggregators.get(id);
+                mThreadParams.adiosOST = m_help->numOSTs.get(id);
+                mThreadParams.adiosDisableMeta = m_help->disableMeta.get(id);
+                mThreadParams.adiosTransportParams = m_help->transportParams.get(id);
+                mThreadParams.adiosCompression = m_help->compression.get(id);
+
+                GridController<simDim>& gc = Environment<simDim>::get().GridController();
+                /* It is important that we never change the mpi_pos after this point
+                 * because we get problems with the restart.
+                 * Otherwise we do not know which gpu must load the ghost parts around
+                 * the sliding window.
+                 */
+                mpi_pos = gc.getPosition();
+                mpi_size = gc.getGpuNodes();
+
+                /* if the number of aggregators is not set, we use all MPI processes as aggregators */
+                if(mThreadParams.adiosAggregators == 0)
+                    mThreadParams.adiosAggregators = mpi_size.productOfComponents();
+
+                if(m_help->selfRegister)
+                {
+                    std::string notifyPeriod = m_help->notifyPeriod.get(id);
+                    /* only register for notify callback when .period is set on command line */
+                    if(!notifyPeriod.empty())
+                    {
+                        Environment<>::get().PluginConnector().setNotificationPeriod(this, notifyPeriod);
 
-        /** Create a name for the adios identifier.
-         */
-        static std::string getName()
-        {
-            return FieldTmpOperation<Solver, Species>::getName();
-        }
+                        /** create notify directory */
+                        Environment<simDim>::get().Filesystem().createDirectoryWithPermissions(outputDirectory);
+                    }
+                }
 
-        /** Get the unit for the result from the solver*/
-        static std::vector<float_64> getUnit()
-        {
-            UnitType unit = FieldTmp::getUnit<Solver>();
-            const uint32_t components = GetNComponents<ValueType>::value;
-            return createUnit(unit, components);
-        }
+                // avoid deadlock between not finished pmacc tasks and mpi blocking collectives
+                __getTransactionEvent().waitForFinished();
+                /* Initialize adios library */
+                mThreadParams.adiosComm = MPI_COMM_NULL;
+                MPI_CHECK(MPI_Comm_dup(gc.getCommunicator().getMPIComm(), &(mThreadParams.adiosComm)));
+                mThreadParams.adiosBufferInitialized = false;
+
+                /* select MPI method, #OSTs and #aggregators */
+                std::stringstream strMPITransportParams;
+                strMPITransportParams << "num_aggregators=" << mThreadParams.adiosAggregators
+                                      << ";num_ost=" << mThreadParams.adiosOST;
+                /* create meta file offline/post-mortem with bpmeta */
+                if(mThreadParams.adiosDisableMeta)
+                    strMPITransportParams << ";have_metadata_file=0";
+                /* additional, uncovered transport parameters, e.g.,
+                 * use system-defaults for striping per aggregated file */
+                if(!mThreadParams.adiosTransportParams.empty())
+                    strMPITransportParams << ";" << mThreadParams.adiosTransportParams;
+
+                mpiTransportParams = strMPITransportParams.str();
+            }
 
-        HINLINE void operator_impl(ThreadParams* params)
-        {
-            const uint32_t components = GetNComponents<ValueType>::value;
-
-            auto localSize = params->window.localDimensions.size;
-            /* Patch for non-domain-bound fields
-            * This is an ugly fix to allow output of reduced 1d PML buffers,
-            * that are the same size on each domain.
-            * This code is to be replaced with the openPMD output plugin soon.
-            */
-            if( !traits::IsFieldDomainBound< FieldTmp >::value )
+            virtual ~ADIOSWriter()
             {
-                DataConnector &dc = Environment<>::get().DataConnector();
-                auto field = dc.get< FieldTmp >( FieldTmp::getName() );
-                localSize = field->getGridLayout().getDataSpaceWithoutGuarding();
-                dc.releaseData( FieldTmp::getName() );
+                if(mThreadParams.adiosComm != MPI_COMM_NULL)
+                {
+                    // avoid deadlock between not finished pmacc tasks and mpi blocking collectives
+                    __getTransactionEvent().waitForFinished();
+                    MPI_CHECK_NO_EXCEPT(MPI_Comm_free(&(mThreadParams.adiosComm)));
+                }
             }
 
-            // adios buffer size for this dataset (all components)
-            uint64_t localGroupSize =
-                    localSize.productOfComponents() *
-                    sizeof(ComponentType) *
-                    components;
-
-            params->adiosGroupSize += localGroupSize;
-
-            /*wrap in a one-component vector for writeField API*/
-            const traits::FieldPosition<fields::CellType, FieldTmp>
-                fieldPos;
-
-            std::vector<std::vector<float_X> > inCellPosition;
-            std::vector<float_X> inCellPositonComponent;
-            for( uint32_t d = 0; d < simDim; ++d )
-                inCellPositonComponent.push_back( fieldPos()[0][d] );
-            inCellPosition.push_back( inCellPositonComponent );
-
-            /** \todo check if always correct at this point, depends on solver
-             *        implementation */
-            const float_X timeOffset = 0.0;
+            void notify(uint32_t currentStep)
+            {
+                // notify is only allowed if the plugin is not controlled by the class Checkpoint
+                assert(m_help->selfRegister);
 
-            PICToAdios<ComponentType> adiosType;
-            defineFieldVar< FieldTmp >(params, components, adiosType.type, getName(), getUnit(),
-                FieldTmp::getUnitDimension<Solver>(), inCellPosition, timeOffset);
-        }
+                __getTransactionEvent().waitForFinished();
 
-    };
-
-public:
-
-    /** constructor
-     *
-     * @param help instance of the class Help
-     * @param id index of this plugin instance within help
-     * @param cellDescription PIConGPu cell description information for kernel index mapping
-     */
-    ADIOSWriter(
-        std::shared_ptr< plugins::multi::IHelp > & help,
-        size_t const id,
-        MappingDesc* cellDescription
-    ) :
-    m_help( std::static_pointer_cast< Help >(help) ),
-    m_id( id ),
-    m_cellDescription( cellDescription ),
-    outputDirectory("bp"),
-    lastSpeciesSyncStep(pmacc::traits::limits::Max<uint32_t>::value)
-    {
+                std::string filename = m_help->fileName.get(m_id);
 
-        mThreadParams.adiosAggregators = m_help->numAggregators.get( id );
-        mThreadParams.adiosOST = m_help->numOSTs.get( id );
-        mThreadParams.adiosDisableMeta = m_help->disableMeta.get( id );
-        mThreadParams.adiosTransportParams = m_help->transportParams.get( id );
-        mThreadParams.adiosCompression = m_help->compression.get( id );
-
-        GridController<simDim> &gc = Environment<simDim>::get().GridController();
-        /* It is important that we never change the mpi_pos after this point
-         * because we get problems with the restart.
-         * Otherwise we do not know which gpu must load the ghost parts around
-         * the sliding window.
-         */
-        mpi_pos = gc.getPosition();
-        mpi_size = gc.getGpuNodes();
+                /* if file name is relative, prepend with common directory */
+                if(boost::filesystem::path(filename).has_root_path())
+                    mThreadParams.adiosFilename = filename;
+                else
+                    mThreadParams.adiosFilename = outputDirectory + "/" + filename;
 
-        /* if number of aggregators is not set we use all mpi process as aggregator*/
-        if( mThreadParams.adiosAggregators == 0 )
-           mThreadParams.adiosAggregators=mpi_size.productOfComponents();
+                /* window selection */
+                mThreadParams.window = MovingWindow::getInstance().getWindow(currentStep);
+                mThreadParams.isCheckpoint = false;
+                dumpData(currentStep);
+            }
 
-        if( m_help->selfRegister )
-        {
-            std::string notifyPeriod = m_help->notifyPeriod.get( id );
-            /* only register for notify callback when .period is set on command line */
-            if(!notifyPeriod.empty())
+            virtual void restart(uint32_t restartStep, std::string const& restartDirectory)
             {
-                Environment<>::get().PluginConnector().setNotificationPeriod(this, notifyPeriod);
-
-                /** create notify directory */
-                Environment<simDim>::get().Filesystem().createDirectoryWithPermissions(outputDirectory);
+                /* ISlave restart interface is not needed because IIOBackend
+                 * restart interface is used
+                 */
             }
-        }
-
-        // avoid deadlock between not finished pmacc tasks and mpi blocking collectives
-        __getTransactionEvent().waitForFinished();
-        /* Initialize adios library */
-        mThreadParams.adiosComm = MPI_COMM_NULL;
-        MPI_CHECK(MPI_Comm_dup(gc.getCommunicator().getMPIComm(), &(mThreadParams.adiosComm)));
-        mThreadParams.adiosBufferInitialized = false;
-
-        /* select MPI method, #OSTs and #aggregators */
-        std::stringstream strMPITransportParams;
-        strMPITransportParams << "num_aggregators=" << mThreadParams.adiosAggregators
-                              << ";num_ost=" << mThreadParams.adiosOST;
-        /* create meta file offline/post-mortem with bpmeta */
-        if( mThreadParams.adiosDisableMeta )
-            strMPITransportParams << ";have_metadata_file=0";
-        /* additional, uncovered transport parameters, e.g.,
-         * use system-defaults for striping per aggregated file */
-        if( ! mThreadParams.adiosTransportParams.empty() )
-            strMPITransportParams << ";" << mThreadParams.adiosTransportParams;
-
-        mpiTransportParams = strMPITransportParams.str();
-    }
-
-    virtual ~ADIOSWriter()
-    {
-        if (mThreadParams.adiosComm != MPI_COMM_NULL)
-        {
-            // avoid deadlock between not finished pmacc tasks and mpi blocking collectives
-            __getTransactionEvent().waitForFinished();
-            MPI_CHECK_NO_EXCEPT(MPI_Comm_free(&(mThreadParams.adiosComm)));
-        }
-    }
-
-    void notify(uint32_t currentStep)
-    {
-        // notify is only allowed if the plugin is not controlled by the class Checkpoint
-        assert( m_help->selfRegister );
 
-        __getTransactionEvent().waitForFinished();
-
-        std::string filename = m_help->fileName.get( m_id );
-
-        /* if file name is relative, prepend with common directory */
-        if( boost::filesystem::path(filename).has_root_path() )
-            mThreadParams.adiosFilename = filename;
-        else
-            mThreadParams.adiosFilename = outputDirectory + "/" + filename;
+            virtual void checkpoint(uint32_t currentStep, std::string const& checkpointDirectory)
+            {
+                /* ISlave checkpoint interface is not needed because IIOBackend
+                 * checkpoint interface is used
+                 */
+            }
 
-        /* window selection */
-        mThreadParams.window = MovingWindow::getInstance().getWindow(currentStep);
-        mThreadParams.isCheckpoint = false;
-        dumpData(currentStep);
-    }
+            void dumpCheckpoint(
+                const uint32_t currentStep,
+                const std::string& checkpointDirectory,
+                const std::string& checkpointFilename)
+            {
+                // checkpointing is only allowed if the plugin is controlled by the class Checkpoint
+                assert(!m_help->selfRegister);
 
-    virtual void restart(
-        uint32_t restartStep,
-        std::string const & restartDirectory
-    )
-    {
-        /* ISlave restart interface is not needed becase IIOBackend
-         * restart interface is used
-         */
-    }
+                __getTransactionEvent().waitForFinished();
+                /* if file name is relative, prepend with common directory */
+                if(boost::filesystem::path(checkpointFilename).has_root_path())
+                    mThreadParams.adiosFilename = checkpointFilename;
+                else
+                    mThreadParams.adiosFilename = checkpointDirectory + "/" + checkpointFilename;
 
-    virtual void checkpoint(
-        uint32_t currentStep,
-        std::string const & checkpointDirectory
-    )
-    {
-        /* ISlave checkpoint interface is not needed becase IIOBackend
-         * checkpoint interface is used
-         */
-    }
+                mThreadParams.window = MovingWindow::getInstance().getDomainAsWindow(currentStep);
+                mThreadParams.isCheckpoint = true;
 
-    void dumpCheckpoint(
-        const uint32_t currentStep,
-        const std::string& checkpointDirectory,
-        const std::string& checkpointFilename
-    )
-    {
-        // checkpointing is only allowed if the plugin is controlled by the class Checkpoint
-        assert(!m_help->selfRegister);
-
-        __getTransactionEvent().waitForFinished();
-        /* if file name is relative, prepend with common directory */
-        if( boost::filesystem::path(checkpointFilename).has_root_path() )
-            mThreadParams.adiosFilename = checkpointFilename;
-        else
-            mThreadParams.adiosFilename = checkpointDirectory + "/" + checkpointFilename;
-
-        mThreadParams.window = MovingWindow::getInstance().getDomainAsWindow(currentStep);
-        mThreadParams.isCheckpoint = true;
-
-        dumpData(currentStep);
-    }
-
-    void doRestart(
-        const uint32_t restartStep,
-        const std::string& restartDirectory,
-        const std::string& constRestartFilename,
-        const uint32_t restartChunkSize
-    )
-    {
-        // restart is only allowed if the plugin is controlled by the class Checkpoint
-        assert(!m_help->selfRegister);
-
-        // allow to modify the restart file name
-        std::string restartFilename{ constRestartFilename };
-
-        std::stringstream adiosPathBase;
-        adiosPathBase << ADIOS_PATH_ROOT << restartStep << "/";
-        mThreadParams.adiosBasePath = adiosPathBase.str();
-        //mThreadParams.isCheckpoint = isCheckpoint;
-        mThreadParams.currentStep = restartStep;
-        mThreadParams.cellDescription = m_cellDescription;
-
-        /** one could try ADIOS_READ_METHOD_BP_AGGREGATE too which might
-         *  be beneficial for re-distribution on a different number of GPUs
-         *    would need: - `export chunk_size=SIZE # in MB`
-         *                - `mpiTransportParams.c_str()` in `adios_read_init_method`
-         */
-        ADIOS_CMD(adios_read_init_method(ADIOS_READ_METHOD_BP,
-                                         mThreadParams.adiosComm,
-                                         "verbose=3;abort_on_error;"));
+                dumpData(currentStep);
+            }
 
-        /* if restartFilename is relative, prepend with restartDirectory */
-        if (!boost::filesystem::path(restartFilename).has_root_path())
-        {
-            restartFilename = restartDirectory + std::string("/") + restartFilename;
-        }
+            void doRestart(
+                const uint32_t restartStep,
+                const std::string& restartDirectory,
+                const std::string& constRestartFilename,
+                const uint32_t restartChunkSize)
+            {
+                // restart is only allowed if the plugin is controlled by the class Checkpoint
+                assert(!m_help->selfRegister);
+
+                // allow to modify the restart file name
+                std::string restartFilename{constRestartFilename};
+
+                std::stringstream adiosPathBase;
+                adiosPathBase << ADIOS_PATH_ROOT << restartStep << "/";
+                mThreadParams.adiosBasePath = adiosPathBase.str();
+                // mThreadParams.isCheckpoint = isCheckpoint;
+                mThreadParams.currentStep = restartStep;
+                mThreadParams.cellDescription = m_cellDescription;
+
+                /** one could try ADIOS_READ_METHOD_BP_AGGREGATE too which might
+                 *  be beneficial for re-distribution on a different number of GPUs
+                 *    would need: - `export chunk_size=SIZE # in MB`
+                 *                - `mpiTransportParams.c_str()` in `adios_read_init_method`
+                 */
+                ADIOS_CMD(adios_read_init_method(
+                    ADIOS_READ_METHOD_BP,
+                    mThreadParams.adiosComm,
+                    "verbose=3;abort_on_error;"));
+
+                /* if restartFilename is relative, prepend with restartDirectory */
+                if(!boost::filesystem::path(restartFilename).has_root_path())
+                {
+                    restartFilename = restartDirectory + std::string("/") + restartFilename;
+                }
 
-        std::stringstream strFname;
-        strFname << restartFilename << "_" << mThreadParams.currentStep << ".bp";
+                std::stringstream strFname;
+                strFname << restartFilename << "_" << mThreadParams.currentStep << ".bp";
 
-        const std::string filename = strFname.str( );
+                const std::string filename = strFname.str();
 
-        // adios_read_open( fname, method, comm, lock_mode, timeout_sec )
-        log< picLog::INPUT_OUTPUT > ("ADIOS: open file: %1%") % filename;
+                // adios_read_open( fname, method, comm, lock_mode, timeout_sec )
+                log<picLog::INPUT_OUTPUT>("ADIOS: open file: %1%") % filename;
 
-        // when reading in BG_AGGREGATE mode, adios can not distinguish between
-        // "file does not exist" and "stream is not (yet) available, so we
-        // test it our selves
-        if (!boost::filesystem::exists(strFname.str()))
-            throw std::runtime_error("ADIOS: File does not exist.");
+                // when reading in BP_AGGREGATE mode, adios can not distinguish between
+                // "file does not exist" and "stream is not (yet) available", so we
+                // test it ourselves
+                if(!boost::filesystem::exists(strFname.str()))
+                    throw std::runtime_error("ADIOS: File does not exist.");
 
-        /* <0 sec: wait forever
-         * >=0 sec: return immediately if stream is not available */
-        float_32 timeout = 0.0f;
-        mThreadParams.fp = adios_read_open(filename.c_str(),
-                        ADIOS_READ_METHOD_BP, mThreadParams.adiosComm,
-                        ADIOS_LOCKMODE_CURRENT, timeout);
+                /* <0 sec: wait forever
+                 * >=0 sec: return immediately if stream is not available */
+                float_32 timeout = 0.0f;
+                mThreadParams.fp = adios_read_open(
+                    filename.c_str(),
+                    ADIOS_READ_METHOD_BP,
+                    mThreadParams.adiosComm,
+                    ADIOS_LOCKMODE_CURRENT,
+                    timeout);
 
-        /* stream reading is tricky, see ADIOS manual section 8.11.1 */
-        while (adios_errno == err_file_not_found)
-        {
-            /** \todo add c++11 platform independent sleep */
+                /* stream reading is tricky, see ADIOS manual section 8.11.1 */
+                while(adios_errno == err_file_not_found)
+                {
+                    /** \todo add c++11 platform independent sleep */
 #if !defined(_WIN32)
-            /* give the file system 1s of peace and quiet */
-            usleep(1e6);
+                    /* give the file system 1s of peace and quiet */
+                    usleep(1e6);
 #endif
-            mThreadParams.fp = adios_read_open(filename.c_str(),
-                        ADIOS_READ_METHOD_BP, mThreadParams.adiosComm,
-                        ADIOS_LOCKMODE_CURRENT, timeout);
-        }
-        if (adios_errno == err_end_of_stream )
-            /* could not read full stream */
-            throw std::runtime_error("ADIOS: Stream terminated too early: " +
-                                     std::string(adios_errmsg()) );
-        if (mThreadParams.fp == nullptr)
-            throw std::runtime_error("ADIOS: Error opening stream: " +
-                                     std::string(adios_errmsg()) );
-
-        /* ADIOS types */
-        AdiosUInt32Type adiosUInt32Type;
-
-        /* load number of slides to initialize MovingWindow */
-        log<picLog::INPUT_OUTPUT > ("ADIOS: (begin) read attr (%1% available)") %
-            mThreadParams.fp->nattrs;
-        void* slidesPtr = nullptr;
-        int slideSize;
-        enum ADIOS_DATATYPES slidesType;
-        const std::string simSlidesPath =
-            mThreadParams.adiosBasePath + std::string("sim_slides");
-        ADIOS_CMD(adios_get_attr( mThreadParams.fp,
-                                  simSlidesPath.c_str(),
-                                  &slidesType,
-                                  &slideSize,
-                                  &slidesPtr ));
-
-        uint32_t slides = *( (uint32_t*)slidesPtr );
-        log<picLog::INPUT_OUTPUT > ("ADIOS: value of sim_slides = %1%") %
-            slides;
-
-        PMACC_ASSERT(slidesType == adiosUInt32Type.type);
-        PMACC_ASSERT(slideSize == sizeof(uint32_t)); // uint32_t in bytes
-
-        void* lastStepPtr = nullptr;
-        int lastStepSize;
-        enum ADIOS_DATATYPES lastStepType;
-        const std::string iterationPath =
-            mThreadParams.adiosBasePath + std::string("iteration");
-        ADIOS_CMD(adios_get_attr( mThreadParams.fp,
-                                  iterationPath.c_str(),
-                                  &lastStepType,
-                                  &lastStepSize,
-                                  &lastStepPtr ));
-        uint32_t lastStep = *( (uint32_t*)lastStepPtr );
-        log<picLog::INPUT_OUTPUT > ("ADIOS: value of iteration = %1%") %
-            lastStep;
-
-        PMACC_ASSERT(lastStepType == adiosUInt32Type.type);
-        PMACC_ASSERT(lastStep == restartStep);
-
-        /* apply slides to set gpus to last/written configuration */
-        log<picLog::INPUT_OUTPUT > ("ADIOS: Setting slide count for moving window to %1%") % slides;
-        MovingWindow::getInstance().setSlideCounter(slides, restartStep);
-
-        /* re-distribute the local offsets in y-direction
-         * this will work for restarts with moving window still enabled
-         * and restarts that disable the moving window
-         * \warning enabling the moving window from a checkpoint that
-         *          had no moving window will not work
-         */
-        GridController<simDim> &gc = Environment<simDim>::get().GridController();
-        gc.setStateAfterSlides(slides);
-
-        /* set window for restart, complete global domain */
-        mThreadParams.window = MovingWindow::getInstance().getDomainAsWindow(restartStep);
-        mThreadParams.localWindowToDomainOffset = DataSpace<simDim>::create(0);
-
-        /* load all fields */
-        meta::ForEach<FileCheckpointFields, LoadFields<bmpl::_1> > forEachLoadFields;
-        forEachLoadFields(&mThreadParams);
-
-        /* load all particles */
-        meta::ForEach<FileCheckpointParticles, LoadSpecies<bmpl::_1> > forEachLoadSpecies;
-        forEachLoadSpecies(&mThreadParams, restartChunkSize);
-
-        IdProvider<simDim>::State idProvState;
-        ReadNDScalars<uint64_t, uint64_t>()(mThreadParams,
-                "picongpu/idProvider/startId", &idProvState.startId,
-                "maxNumProc", &idProvState.maxNumProc);
-        ReadNDScalars<uint64_t>()(mThreadParams,
-                "picongpu/idProvider/nextId", &idProvState.nextId);
-        log<picLog::INPUT_OUTPUT > ("Setting next free id on current rank: %1%") % idProvState.nextId;
-        IdProvider<simDim>::setState(idProvState);
-
-        /* free memory allocated in ADIOS calls */
-        free(slidesPtr);
-        free(lastStepPtr);
-
-        // avoid deadlock between not finished pmacc tasks and mpi calls in adios
-        __getTransactionEvent().waitForFinished();
-
-        /* clean shut down: close file and finalize */
-        adios_release_step( mThreadParams.fp );
-        ADIOS_CMD(adios_read_close( mThreadParams.fp ));
-        ADIOS_CMD(adios_read_finalize_method(ADIOS_READ_METHOD_BP));
-    }
-
-private:
-
-    void endAdios()
-    {
-        /* Finalize adios library */
-        ADIOS_CMD(adios_finalize(Environment<simDim>::get().GridController()
-                .getCommunicator().getRank()));
-
-        __deleteArray(mThreadParams.fieldBfr);
-    }
-
-    void beginAdios(const std::string adiosFilename)
-    {
-        std::stringstream full_filename;
-        full_filename << adiosFilename << "_" << mThreadParams.currentStep << ".bp";
-
-        mThreadParams.fullFilename = full_filename.str();
-        mThreadParams.adiosFileHandle = ADIOS_INVALID_HANDLE;
-
-        // Note: here we always allocate for the domain-bound fields
-        mThreadParams.fieldBfr = new float_X[mThreadParams.window.localDimensions.size.productOfComponents()];
-
-        std::stringstream adiosPathBase;
-        adiosPathBase << ADIOS_PATH_ROOT << mThreadParams.currentStep << "/";
-        mThreadParams.adiosBasePath = adiosPathBase.str();
-
-        ADIOS_CMD(adios_init_noxml(mThreadParams.adiosComm));
-    }
-
-    /**
-     * Notification for dump or checkpoint received
-     *
-     * @param currentStep current simulation step
-     */
-    void dumpData(uint32_t currentStep)
-    {
-        const pmacc::Selection<simDim>& localDomain = Environment<simDim>::get().SubGrid().getLocalDomain();
-        mThreadParams.cellDescription = m_cellDescription;
-        mThreadParams.currentStep = currentStep;
-
-        for (uint32_t i = 0; i < simDim; ++i)
-        {
-            mThreadParams.localWindowToDomainOffset[i] = 0;
-            if (mThreadParams.window.globalDimensions.offset[i] > localDomain.offset[i])
-            {
-                mThreadParams.localWindowToDomainOffset[i] =
-                    mThreadParams.window.globalDimensions.offset[i] -
-                    localDomain.offset[i];
+                    mThreadParams.fp = adios_read_open(
+                        filename.c_str(),
+                        ADIOS_READ_METHOD_BP,
+                        mThreadParams.adiosComm,
+                        ADIOS_LOCKMODE_CURRENT,
+                        timeout);
+                }
+                if(adios_errno == err_end_of_stream)
+                    /* could not read full stream */
+                    throw std::runtime_error("ADIOS: Stream terminated too early: " + std::string(adios_errmsg()));
+                if(mThreadParams.fp == nullptr)
+                    throw std::runtime_error("ADIOS: Error opening stream: " + std::string(adios_errmsg()));
+
+                /* ADIOS types */
+                AdiosUInt32Type adiosUInt32Type;
+
+                /* load number of slides to initialize MovingWindow */
+                log<picLog::INPUT_OUTPUT>("ADIOS: (begin) read attr (%1% available)") % mThreadParams.fp->nattrs;
+                void* slidesPtr = nullptr;
+                int slideSize;
+                enum ADIOS_DATATYPES slidesType;
+                const std::string simSlidesPath = mThreadParams.adiosBasePath + std::string("sim_slides");
+                ADIOS_CMD(
+                    adios_get_attr(mThreadParams.fp, simSlidesPath.c_str(), &slidesType, &slideSize, &slidesPtr));
+
+                uint32_t slides = *((uint32_t*) slidesPtr);
+                log<picLog::INPUT_OUTPUT>("ADIOS: value of sim_slides = %1%") % slides;
+
+                PMACC_ASSERT(slidesType == adiosUInt32Type.type);
+                PMACC_ASSERT(slideSize == sizeof(uint32_t)); // uint32_t in bytes
+
+                void* lastStepPtr = nullptr;
+                int lastStepSize;
+                enum ADIOS_DATATYPES lastStepType;
+                const std::string iterationPath = mThreadParams.adiosBasePath + std::string("iteration");
+                ADIOS_CMD(adios_get_attr(
+                    mThreadParams.fp,
+                    iterationPath.c_str(),
+                    &lastStepType,
+                    &lastStepSize,
+                    &lastStepPtr));
+                uint32_t lastStep = *((uint32_t*) lastStepPtr);
+                log<picLog::INPUT_OUTPUT>("ADIOS: value of iteration = %1%") % lastStep;
+
+                PMACC_ASSERT(lastStepType == adiosUInt32Type.type);
+                PMACC_ASSERT(lastStep == restartStep);
+
+                /* apply slides to set gpus to last/written configuration */
+                log<picLog::INPUT_OUTPUT>("ADIOS: Setting slide count for moving window to %1%") % slides;
+                MovingWindow::getInstance().setSlideCounter(slides, restartStep);
+
+                /* re-distribute the local offsets in y-direction
+                 * this will work for restarts with moving window still enabled
+                 * and restarts that disable the moving window
+                 * \warning enabling the moving window from a checkpoint that
+                 *          had no moving window will not work
+                 */
+                GridController<simDim>& gc = Environment<simDim>::get().GridController();
+                gc.setStateAfterSlides(slides);
+
+                /* set window for restart, complete global domain */
+                mThreadParams.window = MovingWindow::getInstance().getDomainAsWindow(restartStep);
+                mThreadParams.localWindowToDomainOffset = DataSpace<simDim>::create(0);
+
+                /* load all fields */
+                meta::ForEach<FileCheckpointFields, LoadFields<bmpl::_1>> forEachLoadFields;
+                forEachLoadFields(&mThreadParams);
+
+                /* load all particles */
+                meta::ForEach<FileCheckpointParticles, LoadSpecies<bmpl::_1>> forEachLoadSpecies;
+                forEachLoadSpecies(&mThreadParams, restartChunkSize);
+
+                IdProvider<simDim>::State idProvState;
+                ReadNDScalars<uint64_t, uint64_t>()(
+                    mThreadParams,
+                    "picongpu/idProvider/startId",
+                    &idProvState.startId,
+                    "maxNumProc",
+                    &idProvState.maxNumProc);
+                ReadNDScalars<uint64_t>()(mThreadParams, "picongpu/idProvider/nextId", &idProvState.nextId);
+                log<picLog::INPUT_OUTPUT>("Setting next free id on current rank: %1%") % idProvState.nextId;
+                IdProvider<simDim>::setState(idProvState);
+
+                /* free memory allocated in ADIOS calls */
+                free(slidesPtr);
+                free(lastStepPtr);
+
+                // avoid deadlock between not finished pmacc tasks and mpi calls in adios
+                __getTransactionEvent().waitForFinished();
+
+                /* clean shut down: close file and finalize */
+                adios_release_step(mThreadParams.fp);
+                ADIOS_CMD(adios_read_close(mThreadParams.fp));
+                ADIOS_CMD(adios_read_finalize_method(ADIOS_READ_METHOD_BP));
             }
-        }
 
-        /* copy species only one time per timestep to the host */
-        if( lastSpeciesSyncStep != currentStep )
-        {
-            DataConnector &dc = Environment<>::get().DataConnector();
+        private:
+            void endAdios()
+            {
+                /* Finalize adios library */
+                ADIOS_CMD(adios_finalize(Environment<simDim>::get().GridController().getCommunicator().getRank()));
 
-#if( PMACC_CUDA_ENABLED == 1 )
-            /* synchronizes the MallocMCBuffer to the host side */
-            dc.get< MallocMCBuffer< DeviceHeap > >( MallocMCBuffer< DeviceHeap >::getName() );
-#endif
-            /* here we are copying all species to the host side since we
-             * can not say at this point if this time step will need all of them
-             * for sure (checkpoint) or just some user-defined species (dump)
-             */
-            meta::ForEach<FileCheckpointParticles, CopySpeciesToHost<bmpl::_1> > copySpeciesToHost;
-            copySpeciesToHost();
-            lastSpeciesSyncStep = currentStep;
-#if( PMACC_CUDA_ENABLED == 1 )
-            dc.releaseData(MallocMCBuffer<DeviceHeap>::getName());
-#endif
-        }
+                __deleteArray(mThreadParams.fieldBfr);
+            }
 
-        beginAdios(mThreadParams.adiosFilename);
+            void beginAdios(const std::string adiosFilename)
+            {
+                std::stringstream full_filename;
+                full_filename << adiosFilename << "_" << mThreadParams.currentStep << ".bp";
 
-        writeAdios((void*) &mThreadParams, mpiTransportParams);
+                mThreadParams.fullFilename = full_filename.str();
+                mThreadParams.adiosFileHandle = ADIOS_INVALID_HANDLE;
 
-        endAdios();
-    }
+                // Note: here we always allocate for the domain-bound fields
+                mThreadParams.fieldBfr = new float_X[mThreadParams.window.localDimensions.size.productOfComponents()];
 
-    template<typename ComponentType>
-    static void writeField(ThreadParams *params, const uint32_t sizePtrType,
-                           ADIOS_DATATYPES adiosType,
-                           const uint32_t nComponents, const std::string name,
-                           void *ptr,
-                           const bool isDomainBound)
-    {
-        log<picLog::INPUT_OUTPUT > ("ADIOS: write field: %1% %2% %3%") %
-            name % nComponents % ptr;
-
-        const bool fieldTypeCorrect( boost::is_same<ComponentType, float_X>::value );
-        PMACC_CASSERT_MSG(Precision_mismatch_in_Field_Components__ADIOS,fieldTypeCorrect);
-
-        /* data to describe source buffer */
-        GridLayout<simDim> field_layout = params->gridLayout;
-        DataSpace<simDim> field_full = field_layout.getDataSpace();
-        DataSpace<simDim> field_no_guard = params->window.localDimensions.size;
-        DataSpace<simDim> field_guard = field_layout.getGuard() + params->localWindowToDomainOffset;
-        float_X * dstBuffer = params->fieldBfr;
-
-        /* Patch for non-domain-bound fields
-         * This is an ugly fix to allow output of reduced 1d PML buffers,
-         * that are the same size on each domain.
-         * This code is to be replaced with the openPMD output plugin soon.
-         */
-        std::vector< float_X > nonDomainBoundStorage;
-        if( !isDomainBound )
-        {
-            field_no_guard = field_layout.getDataSpaceWithoutGuarding();
-            field_guard = field_layout.getGuard();
-            /* Since params->fieldBfr allocation was of different size,
-             * for this case allocate a new chunk for memory for dstBuffer
-             */
-            nonDomainBoundStorage.resize( field_no_guard.productOfComponents() );
-            dstBuffer = nonDomainBoundStorage.data();
-        }
+                std::stringstream adiosPathBase;
+                adiosPathBase << ADIOS_PATH_ROOT << mThreadParams.currentStep << "/";
+                mThreadParams.adiosBasePath = adiosPathBase.str();
 
-        /* write the actual field data */
-        for (uint32_t d = 0; d < nComponents; d++)
-        {
-            const size_t plane_full_size = field_full[1] * field_full[0] * nComponents;
-            const size_t plane_no_guard_size = field_no_guard[1] * field_no_guard[0];
+                ADIOS_CMD(adios_init_noxml(mThreadParams.adiosComm));
+            }
 
-            /* copy strided data from source to temporary buffer
+            /**
+             * Notification for dump or checkpoint received
              *
-             * \todo use d1Access as in `include/plugins/hdf5/writer/Field.hpp`
+             * @param currentStep current simulation step
              */
-            const int maxZ = simDim == DIM3 ? field_no_guard[2] : 1;
-            const int guardZ = simDim == DIM3 ? field_guard[2] : 0;
-            for (int z = 0; z < maxZ; ++z)
+            void dumpData(uint32_t currentStep)
             {
-                for (int y = 0; y < field_no_guard[1]; ++y)
-                {
-                    const size_t base_index_src =
-                                (z + guardZ) * plane_full_size +
-                                (y + field_guard[1]) * field_full[0] * nComponents;
-
-                    const size_t base_index_dst =
-                                z * plane_no_guard_size +
-                                y * field_no_guard[0];
+                const pmacc::Selection<simDim> localDomain = Environment<simDim>::get().SubGrid().getLocalDomain();
+                mThreadParams.cellDescription = m_cellDescription;
+                mThreadParams.currentStep = currentStep;
 
-                    for (int x = 0; x < field_no_guard[0]; ++x)
+                for(uint32_t i = 0; i < simDim; ++i)
+                {
+                    mThreadParams.localWindowToDomainOffset[i] = 0;
+                    if(mThreadParams.window.globalDimensions.offset[i] > localDomain.offset[i])
                     {
-                        size_t index_src = base_index_src + (x + field_guard[0]) * nComponents + d;
-                        size_t index_dst = base_index_dst + x;
-
-                        dstBuffer[index_dst] = ((float_X*)ptr)[index_src];
+                        mThreadParams.localWindowToDomainOffset[i]
+                            = mThreadParams.window.globalDimensions.offset[i] - localDomain.offset[i];
                     }
                 }
-            }
 
-            /* Write the actual field data. The id is on the front of the list. */
-            if (params->adiosFieldVarIds.empty())
-                throw std::runtime_error("Cannot write field (var id list is empty)");
-
-            int64_t adiosFieldVarId = *(params->adiosFieldVarIds.begin());
-            params->adiosFieldVarIds.pop_front();
-            ADIOS_CMD(adios_write_byid(params->adiosFileHandle, adiosFieldVarId, dstBuffer));
-        }
-    }
+                /* copy species only one time per timestep to the host */
+                if(lastSpeciesSyncStep != currentStep)
+                {
+                    DataConnector& dc = Environment<>::get().DataConnector();
+
+                    /* synchronizes the MallocMCBuffer to the host side */
+                    dc.get<MallocMCBuffer<DeviceHeap>>(MallocMCBuffer<DeviceHeap>::getName());
+
+                    /* here we are copying all species to the host side since we
+                     * can not say at this point if this time step will need all of them
+                     * for sure (checkpoint) or just some user-defined species (dump)
+                     */
+                    meta::ForEach<FileCheckpointParticles, CopySpeciesToHost<bmpl::_1>> copySpeciesToHost;
+                    copySpeciesToHost();
+                    lastSpeciesSyncStep = currentStep;
+                    dc.releaseData(MallocMCBuffer<DeviceHeap>::getName());
+                }
 
-    template< typename T_ParticleFilter>
-    struct CallCountParticles
-    {
+                beginAdios(mThreadParams.adiosFilename);
 
-        void operator()(
-            const std::vector< std::string > & vectorOfDataSourceNames,
-            ThreadParams* params
-        )
-        {
-            bool const containsDataSource = plugins::misc::containsObject(
-                vectorOfDataSourceNames,
-                T_ParticleFilter::getName()
-            );
+                writeAdios((void*) &mThreadParams, mpiTransportParams);
 
-            if( containsDataSource )
-            {
-                ADIOSCountParticles<
-                    T_ParticleFilter
-                > count;
-                count(params);
+                endAdios();
             }
 
-        }
-    };
-
-    template< typename T_ParticleFilter>
-    struct CallWriteSpecies
-    {
-
-        template< typename Space >
-        void operator()(
-            const std::vector< std::string > & vectorOfDataSourceNames,
-            ThreadParams* params,
-            const Space domainOffset
-        )
-        {
-            bool const containsDataSource = plugins::misc::containsObject(
-                vectorOfDataSourceNames,
-                T_ParticleFilter::getName()
-            );
-
-            if( containsDataSource )
+            template<typename ComponentType>
+            static void writeField(
+                ThreadParams* params,
+                const uint32_t sizePtrType,
+                ADIOS_DATATYPES adiosType,
+                const uint32_t nComponents,
+                const std::string name,
+                void* ptr,
+                const bool isDomainBound)
             {
-                WriteSpecies<
-                    T_ParticleFilter
-                > writeSpecies;
-                writeSpecies(params, domainOffset);
-            }
-
-        }
-    };
+                log<picLog::INPUT_OUTPUT>("ADIOS: write field: %1% %2% %3%") % name % nComponents % ptr;
+
+                const bool fieldTypeCorrect(boost::is_same<ComponentType, float_X>::value);
+                PMACC_CASSERT_MSG(Precision_mismatch_in_Field_Components__ADIOS, fieldTypeCorrect);
+
+                /* data to describe source buffer */
+                GridLayout<simDim> field_layout = params->gridLayout;
+                DataSpace<simDim> field_full = field_layout.getDataSpace();
+                DataSpace<simDim> field_no_guard = params->window.localDimensions.size;
+                DataSpace<simDim> field_guard = field_layout.getGuard() + params->localWindowToDomainOffset;
+                float_X* dstBuffer = params->fieldBfr;
+
+                /* Patch for non-domain-bound fields
+                 * This is an ugly fix to allow output of reduced 1d PML buffers,
+                 * that are the same size on each domain.
+                 * This code is to be replaced with the openPMD output plugin soon.
+                 */
+                std::vector<float_X> nonDomainBoundStorage;
+                if(!isDomainBound)
+                {
+                    field_no_guard = field_layout.getDataSpaceWithoutGuarding();
+                    field_guard = field_layout.getGuard();
+                    /* Since params->fieldBfr allocation was of different size,
+                     * for this case allocate a new chunk of memory for dstBuffer
+                     */
+                    nonDomainBoundStorage.resize(field_no_guard.productOfComponents());
+                    dstBuffer = nonDomainBoundStorage.data();
+                }
 
-    template< typename T_Fields >
-    struct CallCollectFieldsSizes
-    {
+                /* write the actual field data */
+                for(uint32_t d = 0; d < nComponents; d++)
+                {
+                    const size_t plane_full_size = field_full[1] * field_full[0] * nComponents;
+                    const size_t plane_no_guard_size = field_no_guard[1] * field_no_guard[0];
+
+                    /* copy strided data from source to temporary buffer
+                     *
+                     * \todo use d1Access as in `include/plugins/hdf5/writer/Field.hpp`
+                     */
+                    const int maxZ = simDim == DIM3 ? field_no_guard[2] : 1;
+                    const int guardZ = simDim == DIM3 ? field_guard[2] : 0;
+                    for(int z = 0; z < maxZ; ++z)
+                    {
+                        for(int y = 0; y < field_no_guard[1]; ++y)
+                        {
+                            const size_t base_index_src
+                                = (z + guardZ) * plane_full_size + (y + field_guard[1]) * field_full[0] * nComponents;
 
-        void operator()(
-            const std::vector< std::string > & vectorOfDataSourceNames,
-            ThreadParams* params
-        )
-        {
-            bool const containsDataSource = plugins::misc::containsObject(
-                vectorOfDataSourceNames,
-                T_Fields::getName()
-            );
+                            const size_t base_index_dst = z * plane_no_guard_size + y * field_no_guard[0];
 
-            if( containsDataSource )
-            {
-                CollectFieldsSizes<
-                    T_Fields
-                > count;
-                count(params);
-            }
+                            for(int x = 0; x < field_no_guard[0]; ++x)
+                            {
+                                size_t index_src = base_index_src + (x + field_guard[0]) * nComponents + d;
+                                size_t index_dst = base_index_dst + x;
 
-        }
-    };
+                                dstBuffer[index_dst] = ((float_X*) ptr)[index_src];
+                            }
+                        }
+                    }
 
-    template< typename T_Fields >
-    struct CallGetFields
-    {
+                    /* Write the actual field data. The id is on the front of the list. */
+                    if(params->adiosFieldVarIds.empty())
+                        throw std::runtime_error("Cannot write field (var id list is empty)");
 
-        void operator()(
-            const std::vector< std::string > & vectorOfDataSourceNames,
-            ThreadParams* params
-        )
-        {
-            bool const containsDataSource = plugins::misc::containsObject(
-                vectorOfDataSourceNames,
-                T_Fields::getName()
-            );
-
-            if( containsDataSource )
-            {
-                GetFields<
-                    T_Fields
-                > getFields;
-                getFields( params );
+                    int64_t adiosFieldVarId = *(params->adiosFieldVarIds.begin());
+                    params->adiosFieldVarIds.pop_front();
+                    ADIOS_CMD(adios_write_byid(params->adiosFileHandle, adiosFieldVarId, dstBuffer));
+                }
             }
 
-        }
-    };
-
-    void *writeAdios(void *p_args, std::string mpiTransportParams)
-    {
-
-        // synchronize, because following operations will be blocking anyway
-        ThreadParams *threadParams = (ThreadParams*) (p_args);
-        threadParams->adiosGroupSize = 0;
-
-        /* y direction can be negative for first gpu */
-        const pmacc::Selection<simDim>& localDomain = Environment<simDim>::get().SubGrid().getLocalDomain();
-        DataSpace<simDim> particleOffset(localDomain.offset);
-        particleOffset.y() -= threadParams->window.globalDimensions.offset.y();
-
-        // do not generate statistics for variables on the fly
-        ADIOS_STATISTICS_FLAG noStatistics = adios_stat_no;
-
-        /* create adios group for fields without statistics */
-        const std::string iterationPath =
-            threadParams->adiosBasePath + std::string("iteration");
-        ADIOS_CMD(adios_declare_group(&(threadParams->adiosGroupHandle),
-                ADIOS_GROUP_NAME,
-                iterationPath.c_str(),
-                noStatistics));
-
-        /* select MPI method, #OSTs and #aggregators */
-        ADIOS_CMD(adios_select_method(threadParams->adiosGroupHandle,
-                  "MPI_AGGREGATE", mpiTransportParams.c_str(), ""));
-
-        threadParams->fieldsOffsetDims = precisionCast<uint64_t>(localDomain.offset);
-
-        /* write created variable values */
-        for (uint32_t d = 0; d < simDim; ++d)
-        {
-            /* dimension 1 is y and is the direction of the moving window (if any) */
-            if (1 == d)
+            template<typename T_ParticleFilter>
+            struct CallCountParticles
             {
-                uint64_t offset = std::max(0, localDomain.offset.y() -
-                                              threadParams->window.globalDimensions.offset.y());
-                threadParams->fieldsOffsetDims[d] = offset;
-            }
-
-            threadParams->fieldsSizeDims[d] = threadParams->window.localDimensions.size[d];
-            threadParams->fieldsGlobalSizeDims[d] = threadParams->window.globalDimensions.size[d];
-        }
+                void operator()(const std::vector<std::string>& vectorOfDataSourceNames, ThreadParams* params)
+                {
+                    bool const containsDataSource
+                        = plugins::misc::containsObject(vectorOfDataSourceNames, T_ParticleFilter::getName());
 
-        std::vector< std::string > vectorOfDataSourceNames;
-        if( m_help->selfRegister )
-        {
-            std::string dataSourceNames = m_help->source.get( m_id );
+                    if(containsDataSource)
+                    {
+                        ADIOSCountParticles<T_ParticleFilter> count;
+                        count(params);
+                    }
+                }
+            };
 
-            vectorOfDataSourceNames = plugins::misc::splitString(
-                plugins::misc::removeSpaces( dataSourceNames )
-            );
-        }
+            template<typename T_ParticleFilter>
+            struct CallWriteSpecies
+            {
+                template<typename Space>
+                void operator()(
+                    const std::vector<std::string>& vectorOfDataSourceNames,
+                    ThreadParams* params,
+                    const Space domainOffset)
+                {
+                    bool const containsDataSource
+                        = plugins::misc::containsObject(vectorOfDataSourceNames, T_ParticleFilter::getName());
 
-        bool dumpFields = plugins::misc::containsObject(
-            vectorOfDataSourceNames,
-            "fields_all"
-        );
+                    if(containsDataSource)
+                    {
+                        WriteSpecies<T_ParticleFilter> writeSpecies;
+                        writeSpecies(params, domainOffset);
+                    }
+                }
+            };
 
-        /* collect size information for each field to be written and define
-         * field variables
-         */
-        log<picLog::INPUT_OUTPUT > ("ADIOS: (begin) collecting fields.");
-        threadParams->adiosFieldVarIds.clear();
-        if (threadParams->isCheckpoint)
-        {
-            meta::ForEach<
-                FileCheckpointFields,
-                CollectFieldsSizes< bmpl::_1 >
-            > forEachCollectFieldsSizes;
-            forEachCollectFieldsSizes(threadParams);
-        }
-        else
-        {
-            if( dumpFields )
+            template<typename T_Fields>
+            struct CallCollectFieldsSizes
             {
-                meta::ForEach<
-                    FileOutputFields,
-                    CollectFieldsSizes< bmpl::_1 >
-                > forEachCollectFieldsSizes;
-                forEachCollectFieldsSizes(threadParams);
-            }
+                void operator()(const std::vector<std::string>& vectorOfDataSourceNames, ThreadParams* params)
+                {
+                    bool const containsDataSource
+                        = plugins::misc::containsObject(vectorOfDataSourceNames, T_Fields::getName());
 
-            // move over all field data sources
-            meta::ForEach<
-                typename Help::AllFieldSources,
-                CallCollectFieldsSizes<
-                    bmpl::_1
-                >
-            >{}(vectorOfDataSourceNames, threadParams);
-        }
-        log<picLog::INPUT_OUTPUT > ("ADIOS: ( end ) collecting fields.");
+                    if(containsDataSource)
+                    {
+                        CollectFieldsSizes<T_Fields> count;
+                        count(params);
+                    }
+                }
+            };
 
-        /* collect size information for all attributes of all species and define
-         * particle variables
-         */
-        threadParams->adiosParticleAttrVarIds.clear();
-        threadParams->adiosSpeciesIndexVarIds.clear();
+            template<typename T_Fields>
+            struct CallGetFields
+            {
+                void operator()(const std::vector<std::string>& vectorOfDataSourceNames, ThreadParams* params)
+                {
+                    bool const containsDataSource
+                        = plugins::misc::containsObject(vectorOfDataSourceNames, T_Fields::getName());
 
-        bool dumpAllParticles = plugins::misc::containsObject(
-            vectorOfDataSourceNames,
-            "species_all"
-        );
+                    if(containsDataSource)
+                    {
+                        GetFields<T_Fields> getFields;
+                        getFields(params);
+                    }
+                }
+            };
 
-        log<picLog::INPUT_OUTPUT > ("ADIOS: (begin) counting particles.");
-        if (threadParams->isCheckpoint)
-        {
-            meta::ForEach<
-                FileCheckpointParticles,
-                ADIOSCountParticles<
-                    plugins::misc::UnfilteredSpecies< bmpl::_1 >
-                >
-            > adiosCountParticles;
-            adiosCountParticles( threadParams );
-        }
-        else
-        {
-            // count particles if data source "species_all" is selected
-            if( dumpAllParticles )
+            void* writeAdios(void* p_args, std::string mpiTransportParams)
             {
-                // move over all species defined in FileOutputParticles
-                meta::ForEach<
-                    FileOutputParticles,
-                    ADIOSCountParticles<
-                        plugins::misc::UnfilteredSpecies< bmpl::_1 >
-                    >
-                > adiosCountParticles;
-                adiosCountParticles( threadParams );
-            }
+                // synchronize, because following operations will be blocking anyway
+                ThreadParams* threadParams = (ThreadParams*) (p_args);
+                threadParams->adiosGroupSize = 0;
+
+                /* y direction can be negative for first gpu */
+                const pmacc::Selection<simDim> localDomain = Environment<simDim>::get().SubGrid().getLocalDomain();
+                DataSpace<simDim> particleOffset(localDomain.offset);
+                particleOffset.y() -= threadParams->window.globalDimensions.offset.y();
+
+                // do not generate statistics for variables on the fly
+                ADIOS_STATISTICS_FLAG noStatistics = adios_stat_no;
+
+                /* create adios group for fields without statistics */
+                const std::string iterationPath = threadParams->adiosBasePath + std::string("iteration");
+                ADIOS_CMD(adios_declare_group(
+                    &(threadParams->adiosGroupHandle),
+                    ADIOS_GROUP_NAME,
+                    iterationPath.c_str(),
+                    noStatistics));
+
+                /* select MPI method, #OSTs and #aggregators */
+                ADIOS_CMD(adios_select_method(
+                    threadParams->adiosGroupHandle,
+                    "MPI_AGGREGATE",
+                    mpiTransportParams.c_str(),
+                    ""));
+
+                threadParams->fieldsOffsetDims = precisionCast<uint64_t>(localDomain.offset);
+
+                /* write created variable values */
+                for(uint32_t d = 0; d < simDim; ++d)
+                {
+                    /* dimension 1 is y and is the direction of the moving window (if any) */
+                    if(1 == d)
+                    {
+                        uint64_t offset
+                            = std::max(0, localDomain.offset.y() - threadParams->window.globalDimensions.offset.y());
+                        threadParams->fieldsOffsetDims[d] = offset;
+                    }
 
-            // move over all species data sources
-            meta::ForEach<
-                typename Help::AllEligibleSpeciesSources,
-                CallCountParticles<
-                    bmpl::_1
-                >
-            >{}(vectorOfDataSourceNames, threadParams);
-        }
-        log<picLog::INPUT_OUTPUT > ("ADIOS: ( end ) counting particles.");
+                    threadParams->fieldsSizeDims[d] = threadParams->window.localDimensions.size[d];
+                    threadParams->fieldsGlobalSizeDims[d] = threadParams->window.globalDimensions.size[d];
+                }
 
-        auto idProviderState = IdProvider<simDim>::getState();
-        WriteNDScalars<uint64_t, uint64_t> writeIdProviderStartId("picongpu/idProvider/startId", "maxNumProc");
-        WriteNDScalars<uint64_t, uint64_t> writeIdProviderNextId("picongpu/idProvider/nextId");
-        writeIdProviderStartId.prepare(*threadParams, idProviderState.maxNumProc);
-        writeIdProviderNextId.prepare(*threadParams);
+                std::vector<std::string> vectorOfDataSourceNames;
+                if(m_help->selfRegister)
+                {
+                    std::string dataSourceNames = m_help->source.get(m_id);
 
-        // in the past, we had to explicitly estiamte our buffers.
-        // this is now done automatically by ADIOS on `adios_write()`
-        threadParams->adiosBufferInitialized = true;
+                    vectorOfDataSourceNames = plugins::misc::splitString(plugins::misc::removeSpaces(dataSourceNames));
+                }
 
-        /* open adios file. all variables need to be defined at this point */
-        log<picLog::INPUT_OUTPUT > ("ADIOS: open file: %1%") % threadParams->fullFilename;
-        ADIOS_CMD(adios_open(&(threadParams->adiosFileHandle), ADIOS_GROUP_NAME,
-                threadParams->fullFilename.c_str(), "w", threadParams->adiosComm));
+                bool dumpFields = plugins::misc::containsObject(vectorOfDataSourceNames, "fields_all");
 
-        if (threadParams->adiosFileHandle == ADIOS_INVALID_HANDLE)
-            throw std::runtime_error("ADIOS: Failed to open file.");
+                /* collect size information for each field to be written and define
+                 * field variables
+                 */
+                log<picLog::INPUT_OUTPUT>("ADIOS: (begin) collecting fields.");
+                threadParams->adiosFieldVarIds.clear();
+                if(threadParams->isCheckpoint)
+                {
+                    meta::ForEach<FileCheckpointFields, CollectFieldsSizes<bmpl::_1>> forEachCollectFieldsSizes;
+                    forEachCollectFieldsSizes(threadParams);
+                }
+                else
+                {
+                    if(dumpFields)
+                    {
+                        meta::ForEach<FileOutputFields, CollectFieldsSizes<bmpl::_1>> forEachCollectFieldsSizes;
+                        forEachCollectFieldsSizes(threadParams);
+                    }
 
-        /* attributes written here are pure meta data */
-        WriteMeta writeMetaAttributes;
-        writeMetaAttributes(threadParams);
+                    // move over all field data sources
+                    meta::ForEach<typename Help::AllFieldSources, CallCollectFieldsSizes<bmpl::_1>>{}(
+                        vectorOfDataSourceNames,
+                        threadParams);
+                }
+                log<picLog::INPUT_OUTPUT>("ADIOS: ( end ) collecting fields.");
 
-        /* set adios group size (total size of all data to be written)
-         * besides the number of bytes for variables, this call also
-         * calculates the overhead of meta data
-         */
-        uint64_t adiosTotalSize;
-        ADIOS_CMD(adios_group_size(threadParams->adiosFileHandle,
-                threadParams->adiosGroupSize, &adiosTotalSize));
+                /* collect size information for all attributes of all species and define
+                 * particle variables
+                 */
+                threadParams->adiosParticleAttrVarIds.clear();
+                threadParams->adiosSpeciesIndexVarIds.clear();
 
-        /* write fields */
-        log<picLog::INPUT_OUTPUT > ("ADIOS: (begin) writing fields.");
-        if (threadParams->isCheckpoint)
-        {
-            meta::ForEach<
-                FileCheckpointFields,
-                GetFields< bmpl::_1 >
-            > forEachGetFields;
-            forEachGetFields(threadParams);
-        }
-        else
-        {
-            if( dumpFields )
-            {
-                meta::ForEach<
-                    FileOutputFields,
-                    GetFields< bmpl::_1 >
-                > forEachGetFields;
-                forEachGetFields(threadParams);
-            }
+                bool dumpAllParticles = plugins::misc::containsObject(vectorOfDataSourceNames, "species_all");
 
-            // move over all field data sources
-            meta::ForEach<
-                typename Help::AllFieldSources,
-                CallGetFields<
-                    bmpl::_1
-                >
-            >{}(vectorOfDataSourceNames, threadParams);
-        }
-        log<picLog::INPUT_OUTPUT > ("ADIOS: ( end ) writing fields.");
+                log<picLog::INPUT_OUTPUT>("ADIOS: (begin) counting particles.");
+                if(threadParams->isCheckpoint)
+                {
+                    meta::ForEach<
+                        FileCheckpointParticles,
+                        ADIOSCountParticles<plugins::misc::UnfilteredSpecies<bmpl::_1>>>
+                        adiosCountParticles;
+                    adiosCountParticles(threadParams);
+                }
+                else
+                {
+                    // count particles if data source "species_all" is selected
+                    if(dumpAllParticles)
+                    {
+                        // move over all species defined in FileOutputParticles
+                        meta::ForEach<
+                            FileOutputParticles,
+                            ADIOSCountParticles<plugins::misc::UnfilteredSpecies<bmpl::_1>>>
+                            adiosCountParticles;
+                        adiosCountParticles(threadParams);
+                    }
 
-        /* print all particle species */
-        log<picLog::INPUT_OUTPUT > ("ADIOS: (begin) writing particle species.");
-        if (threadParams->isCheckpoint)
-        {
-            meta::ForEach<
-                FileCheckpointParticles,
-                WriteSpecies<
-                    plugins::misc::SpeciesFilter< bmpl::_1 >
-                >
-            > writeSpecies;
-            writeSpecies(threadParams, particleOffset);
-        }
-        else
-        {
-            // dump data if data source "species_all" is selected
-            if( dumpAllParticles )
-            {
-                // move over all species defined in FileOutputParticles
-                meta::ForEach<
-                    FileOutputParticles,
-                    WriteSpecies<
-                        plugins::misc::UnfilteredSpecies< bmpl::_1 >
-                    >
-                > writeSpecies;
-                writeSpecies( threadParams, particleOffset );
-            }
+                    // move over all species data sources
+                    meta::ForEach<typename Help::AllEligibleSpeciesSources, CallCountParticles<bmpl::_1>>{}(
+                        vectorOfDataSourceNames,
+                        threadParams);
+                }
+                log<picLog::INPUT_OUTPUT>("ADIOS: ( end ) counting particles.");
+
+                auto idProviderState = IdProvider<simDim>::getState();
+                WriteNDScalars<uint64_t, uint64_t> writeIdProviderStartId("picongpu/idProvider/startId", "maxNumProc");
+                WriteNDScalars<uint64_t, uint64_t> writeIdProviderNextId("picongpu/idProvider/nextId");
+                writeIdProviderStartId.prepare(*threadParams, idProviderState.maxNumProc);
+                writeIdProviderNextId.prepare(*threadParams);
+
+                // in the past, we had to explicitly estimate our buffers.
+                // this is now done automatically by ADIOS on `adios_write()`
+                threadParams->adiosBufferInitialized = true;
+
+                /* open adios file. all variables need to be defined at this point */
+                log<picLog::INPUT_OUTPUT>("ADIOS: open file: %1%") % threadParams->fullFilename;
+                ADIOS_CMD(adios_open(
+                    &(threadParams->adiosFileHandle),
+                    ADIOS_GROUP_NAME,
+                    threadParams->fullFilename.c_str(),
+                    "w",
+                    threadParams->adiosComm));
+
+                if(threadParams->adiosFileHandle == ADIOS_INVALID_HANDLE)
+                    throw std::runtime_error("ADIOS: Failed to open file.");
+
+                /* attributes written here are pure meta data */
+                WriteMeta writeMetaAttributes;
+                writeMetaAttributes(threadParams);
+
+                /* set adios group size (total size of all data to be written)
+                 * besides the number of bytes for variables, this call also
+                 * calculates the overhead of meta data
+                 */
+                uint64_t adiosTotalSize;
+                ADIOS_CMD(
+                    adios_group_size(threadParams->adiosFileHandle, threadParams->adiosGroupSize, &adiosTotalSize));
+
+                /* write fields */
+                log<picLog::INPUT_OUTPUT>("ADIOS: (begin) writing fields.");
+                if(threadParams->isCheckpoint)
+                {
+                    meta::ForEach<FileCheckpointFields, GetFields<bmpl::_1>> forEachGetFields;
+                    forEachGetFields(threadParams);
+                }
+                else
+                {
+                    if(dumpFields)
+                    {
+                        meta::ForEach<FileOutputFields, GetFields<bmpl::_1>> forEachGetFields;
+                        forEachGetFields(threadParams);
+                    }
 
-            // move over all species data sources
-            meta::ForEach<
-                typename Help::AllEligibleSpeciesSources,
-                CallWriteSpecies<
-                    bmpl::_1
-                >
-            >{}(vectorOfDataSourceNames, threadParams, particleOffset);
-        }
-        log<picLog::INPUT_OUTPUT > ("ADIOS: ( end ) writing particle species.");
+                    // move over all field data sources
+                    meta::ForEach<typename Help::AllFieldSources, CallGetFields<bmpl::_1>>{}(
+                        vectorOfDataSourceNames,
+                        threadParams);
+                }
+                log<picLog::INPUT_OUTPUT>("ADIOS: ( end ) writing fields.");
 
-        log<picLog::INPUT_OUTPUT>("ADIOS: Writing IdProvider state (StartId: %1%, NextId: %2%, maxNumProc: %3%)")
-                % idProviderState.startId % idProviderState.nextId % idProviderState.maxNumProc;
-        writeIdProviderStartId(*threadParams, idProviderState.startId);
-        writeIdProviderNextId(*threadParams, idProviderState.nextId);
+                /* print all particle species */
+                log<picLog::INPUT_OUTPUT>("ADIOS: (begin) writing particle species.");
+                if(threadParams->isCheckpoint)
+                {
+                    meta::ForEach<FileCheckpointParticles, WriteSpecies<plugins::misc::SpeciesFilter<bmpl::_1>>>
+                        writeSpecies;
+                    writeSpecies(threadParams, particleOffset);
+                }
+                else
+                {
+                    // dump data if data source "species_all" is selected
+                    if(dumpAllParticles)
+                    {
+                        // move over all species defined in FileOutputParticles
+                        meta::ForEach<FileOutputParticles, WriteSpecies<plugins::misc::UnfilteredSpecies<bmpl::_1>>>
+                            writeSpecies;
+                        writeSpecies(threadParams, particleOffset);
+                    }
 
-        // avoid deadlock between not finished pmacc tasks and mpi calls in adios
-        __getTransactionEvent().waitForFinished();
+                    // move over all species data sources
+                    meta::ForEach<typename Help::AllEligibleSpeciesSources, CallWriteSpecies<bmpl::_1>>{}(
+                        vectorOfDataSourceNames,
+                        threadParams,
+                        particleOffset);
+                }
+                log<picLog::INPUT_OUTPUT>("ADIOS: ( end ) writing particle species.");
 
-        /* close adios file, most likely the actual write point */
-        log<picLog::INPUT_OUTPUT > ("ADIOS: closing file: %1%") % threadParams->fullFilename;
-        ADIOS_CMD(adios_close(threadParams->adiosFileHandle));
+                log<picLog::INPUT_OUTPUT>(
+                    "ADIOS: Writing IdProvider state (StartId: %1%, NextId: %2%, maxNumProc: %3%)")
+                    % idProviderState.startId % idProviderState.nextId % idProviderState.maxNumProc;
+                writeIdProviderStartId(*threadParams, idProviderState.startId);
+                writeIdProviderNextId(*threadParams, idProviderState.nextId);
 
-        /*\todo: copied from adios example, we might not need this ? */
-        MPI_CHECK(MPI_Barrier(threadParams->adiosComm));
+                // avoid deadlock between not finished pmacc tasks and mpi calls in adios
+                __getTransactionEvent().waitForFinished();
 
-        return nullptr;
-    }
+                /* close adios file, most likely the actual write point */
+                log<picLog::INPUT_OUTPUT>("ADIOS: closing file: %1%") % threadParams->fullFilename;
+                ADIOS_CMD(adios_close(threadParams->adiosFileHandle));
 
-    ThreadParams mThreadParams;
+                /*\todo: copied from adios example, we might not need this ? */
+                MPI_CHECK(MPI_Barrier(threadParams->adiosComm));
 
-    std::shared_ptr< Help > m_help;
-    size_t m_id;
+                return nullptr;
+            }
 
-    MappingDesc *m_cellDescription;
+            ThreadParams mThreadParams;
 
-    std::string outputDirectory;
+            std::shared_ptr<Help> m_help;
+            size_t m_id;
 
-    /* select MPI method, #OSTs and #aggregators */
-    std::string mpiTransportParams;
+            MappingDesc* m_cellDescription;
 
-    uint32_t lastSpeciesSyncStep;
+            std::string outputDirectory;
 
-    DataSpace<simDim> mpi_pos;
-    DataSpace<simDim> mpi_size;
-};
+            /* select MPI method, #OSTs and #aggregators */
+            std::string mpiTransportParams;
 
-} //namespace adios
-} //namespace picongpu
+            uint32_t lastSpeciesSyncStep;
+
+            DataSpace<simDim> mpi_pos;
+            DataSpace<simDim> mpi_size;
+        };
 
+    } // namespace adios
+} // namespace picongpu
diff --git a/include/picongpu/plugins/adios/NDScalars.hpp b/include/picongpu/plugins/adios/NDScalars.hpp
index e44202f5ea..e86730585c 100644
--- a/include/picongpu/plugins/adios/NDScalars.hpp
+++ b/include/picongpu/plugins/adios/NDScalars.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2016-2020 Alexander Grund
+/* Copyright 2016-2021 Alexander Grund
  *
  * This file is part of PIConGPU.
  *
@@ -26,52 +26,54 @@
 #include <pmacc/Environment.hpp>
 #include <stdexcept>
 
-namespace picongpu {
-namespace adios {
-
-/** Functor for writing ND scalar fields with N=simDim
- * In the current implementation each process (of the ND grid of processes) writes 1 scalar value
- * Optionally the processes can also write an attribute for this dataset by using a non-empty attrName
- *
- * @tparam T_Scalar    Type of the scalar value to write
- * @tparam T_Attribute Type of the attribute (can be omitted if attribute is not written, defaults to uint64_t)
- */
-template<typename T_Scalar, typename T_Attribute = uint64_t>
-struct WriteNDScalars
+namespace picongpu
 {
-    WriteNDScalars(const std::string& name, const std::string& attrName = ""):
-        name(name), attrName(attrName){}
-
-    /** Prepare the write operation:
-     *  Define ADIOS variable, increase params.adiosGroupSize and write attribute (if attrName is non-empty)
-     *
-     *  Must be called before executing the functor
-     */
-    void prepare(ThreadParams& params, T_Attribute attribute = T_Attribute())
+    namespace adios
     {
-        typedef traits::PICToAdios<T_Scalar> AdiosSkalarType;
-        typedef pmacc::math::UInt64<simDim> Dimensions;
-
-        log<picLog::INPUT_OUTPUT> ("ADIOS: prepare write %1%D scalars: %2%") % simDim % name;
-
-        params.adiosGroupSize += sizeof(T_Scalar);
-        if(!attrName.empty())
-            params.adiosGroupSize += sizeof(T_Attribute);
-
-        // Size over all processes
-        Dimensions globalDomainSize = Dimensions::create(1);
-        // Offset for this process
-        Dimensions localDomainOffset = Dimensions::create(0);
-
-        for (uint32_t d = 0; d < simDim; ++d)
+        /** Functor for writing ND scalar fields with N=simDim
+         * In the current implementation each process (of the ND grid of processes) writes 1 scalar value
+         * Optionally the processes can also write an attribute for this dataset by using a non-empty attrName
+         *
+         * @tparam T_Scalar    Type of the scalar value to write
+         * @tparam T_Attribute Type of the attribute (can be omitted if attribute is not written, defaults to uint64_t)
+         */
+        template<typename T_Scalar, typename T_Attribute = uint64_t>
+        struct WriteNDScalars
         {
-            globalDomainSize[d] = Environment<simDim>::get().GridController().getGpuNodes()[d];
-            localDomainOffset[d] = Environment<simDim>::get().GridController().getPosition()[d];
-        }
-
-        std::string datasetName = params.adiosBasePath + name;
-
-        varId = defineAdiosVar<simDim>(
+            WriteNDScalars(const std::string& name, const std::string& attrName = "") : name(name), attrName(attrName)
+            {
+            }
+
+            /** Prepare the write operation:
+             *  Define ADIOS variable, increase params.adiosGroupSize and write attribute (if attrName is non-empty)
+             *
+             *  Must be called before executing the functor
+             */
+            void prepare(ThreadParams& params, T_Attribute attribute = T_Attribute())
+            {
+                typedef traits::PICToAdios<T_Scalar> AdiosSkalarType;
+                typedef pmacc::math::UInt64<simDim> Dimensions;
+
+                log<picLog::INPUT_OUTPUT>("ADIOS: prepare write %1%D scalars: %2%") % simDim % name;
+
+                params.adiosGroupSize += sizeof(T_Scalar);
+                if(!attrName.empty())
+                    params.adiosGroupSize += sizeof(T_Attribute);
+
+                // Size over all processes
+                Dimensions globalDomainSize = Dimensions::create(1);
+                // Offset for this process
+                Dimensions localDomainOffset = Dimensions::create(0);
+
+                for(uint32_t d = 0; d < simDim; ++d)
+                {
+                    globalDomainSize[d] = Environment<simDim>::get().GridController().getGpuNodes()[d];
+                    localDomainOffset[d] = Environment<simDim>::get().GridController().getPosition()[d];
+                }
+
+                std::string datasetName = params.adiosBasePath + name;
+
+                varId = defineAdiosVar<simDim>(
                     params.adiosGroupHandle,
                     datasetName.c_str(),
                     nullptr,
@@ -82,86 +84,96 @@ struct WriteNDScalars
                     true,
                     params.adiosCompression);
 
-        if(!attrName.empty())
-        {
-            typedef traits::PICToAdios<T_Attribute> AdiosAttrType;
-
-            log<picLog::INPUT_OUTPUT> ("ADIOS: write attribute %1% of %2%D scalars: %3%") % attrName % simDim % name;
-            ADIOS_CMD( adios_define_attribute_byvalue(params.adiosGroupHandle,
-                       attrName.c_str(), datasetName.c_str(),
-                       AdiosAttrType().type, 1, (void*)&attribute) );
-        }
-    }
-
-    void operator()(ThreadParams& params, T_Scalar value)
-    {
-        log<picLog::INPUT_OUTPUT> ("ADIOS: write %1%D scalars: %2%") % simDim % name;
-
-        ADIOS_CMD( adios_write_byid(params.adiosFileHandle, varId, &value) );
-    }
-private:
-    const std::string name, attrName;
-    int64_t varId;
-};
-
-/** Functor for reading ND scalar fields with N=simDim
- * In the current implementation each process (of the ND grid of processes) reads 1 scalar value
- * Optionally the processes can also read an attribute for this dataset by using a non-empty attrName
- *
- * @tparam T_Scalar    Type of the scalar value to read
- * @tparam T_Attribute Type of the attribute (can be omitted if attribute is not read, defaults to uint64_t)
- */
-template<typename T_Scalar, typename T_Attribute = uint64_t>
-struct ReadNDScalars
-{
-    /** Read the skalar field and optionally the attribute into the values referenced by the pointers */
-    void operator()(ThreadParams& params,
-                const std::string& name, T_Scalar* value,
-                const std::string& attrName = "", T_Attribute* attribute = nullptr)
-    {
-        log<picLog::INPUT_OUTPUT> ("ADIOS: read %1%D scalars: %2%") % simDim % name;
-        std::string datasetName = params.adiosBasePath + name;
-
-        ADIOS_VARINFO* varInfo;
-        ADIOS_CMD_EXPECT_NONNULL( varInfo = adios_inq_var(params.fp, datasetName.c_str()) );
-        if(varInfo->ndim != simDim)
-            throw std::runtime_error(std::string("Invalid dimensionality for ") + name);
-        if(varInfo->type != traits::PICToAdios<T_Scalar>().type)
-            throw std::runtime_error(std::string("Invalid type for ") + name);
-
-        DataSpace<simDim> gridPos = Environment<simDim>::get().GridController().getPosition();
-        uint64_t start[varInfo->ndim];
-        uint64_t count[varInfo->ndim];
-        for(int d = 0; d < varInfo->ndim; ++d)
-        {
-            /* \see adios_define_var: z,y,x in C-order */
-            start[d] = gridPos.revert()[d];
-            count[d] = 1;
-        }
-
-        ADIOS_SELECTION* fSel = adios_selection_boundingbox(varInfo->ndim, start, count);
-
-        // avoid deadlock between not finished pmacc tasks and mpi calls in adios
-        __getTransactionEvent().waitForFinished();
-
-        /* specify what we want to read, but start reading at below at `adios_perform_reads` */
-        /* magic parameters (0, 1): `from_step` (not used in streams), `nsteps` to read (must be 1 for stream) */
-        log<picLog::INPUT_OUTPUT > ("ADIOS: Schedule read skalar %1%)") % datasetName;
-        ADIOS_CMD( adios_schedule_read(params.fp, fSel, datasetName.c_str(), 0, 1, (void*)value) );
-
-        /* start a blocking read of all scheduled variables */
-        ADIOS_CMD( adios_perform_reads(params.fp, 1) );
-
-        adios_selection_delete(fSel);
-        adios_free_varinfo(varInfo);
-
-        if(!attrName.empty())
+                if(!attrName.empty())
+                {
+                    typedef traits::PICToAdios<T_Attribute> AdiosAttrType;
+
+                    log<picLog::INPUT_OUTPUT>("ADIOS: write attribute %1% of %2%D scalars: %3%") % attrName % simDim
+                        % name;
+                    ADIOS_CMD(adios_define_attribute_byvalue(
+                        params.adiosGroupHandle,
+                        attrName.c_str(),
+                        datasetName.c_str(),
+                        AdiosAttrType().type,
+                        1,
+                        (void*) &attribute));
+                }
+            }
+
+            void operator()(ThreadParams& params, T_Scalar value) // write the scalar value of this process
+            {
+                log<picLog::INPUT_OUTPUT>("ADIOS: write %1%D scalars: %2%") % simDim % name;
+                // varId is only assigned in prepare(), which therefore must have been called before
+                ADIOS_CMD(adios_write_byid(params.adiosFileHandle, varId, &value));
+            }
+
+        private:
+            const std::string name, attrName;
+            int64_t varId;
+        };
+
+        /** Functor for reading ND scalar fields with N=simDim
+         * In the current implementation each process (of the ND grid of processes) reads 1 scalar value
+         * Optionally the processes can also read an attribute for this dataset by using a non-empty attrName
+         *
+         * @tparam T_Scalar    Type of the scalar value to read
+         * @tparam T_Attribute Type of the attribute (can be omitted if attribute is not read, defaults to uint64_t)
+         */
+        template<typename T_Scalar, typename T_Attribute = uint64_t>
+        struct ReadNDScalars
         {
-            log<picLog::INPUT_OUTPUT> ("ADIOS: read attribute %1% for scalars: %2%") % attrName % name;
-            *attribute = readAttribute<T_Attribute>(params.fp, datasetName, attrName);
-        }
-    }
-};
-
-}  // namespace adios
-}  // namespace picongpu
+            /** Read the scalar field and optionally the attribute into the values referenced by the pointers
+             *
+             * @param name name of the dataset, relative to params.adiosBasePath
+             * @param attrName the attribute is only read if non-empty, attribute must not be nullptr in that case
+             * @throws std::runtime_error if dimensionality or type of the dataset do not match
+             */
+            void operator()(
+                ThreadParams& params,
+                const std::string& name,
+                T_Scalar* value,
+                const std::string& attrName = "",
+                T_Attribute* attribute = nullptr)
+            {
+                log<picLog::INPUT_OUTPUT>("ADIOS: read %1%D scalars: %2%") % simDim % name;
+                std::string datasetName = params.adiosBasePath + name;
+                ADIOS_VARINFO* varInfo;
+                ADIOS_CMD_EXPECT_NONNULL(varInfo = adios_inq_var(params.fp, datasetName.c_str()));
+                if(varInfo->ndim != simDim)
+                    throw std::runtime_error(std::string("Invalid dimensionality for ") + name);
+                if(varInfo->type != traits::PICToAdios<T_Scalar>().type)
+                    throw std::runtime_error(std::string("Invalid type for ") + name);
+
+                DataSpace<simDim> gridPos = Environment<simDim>::get().GridController().getPosition();
+                uint64_t start[simDim]; // ndim == simDim is ensured above, avoids a non-standard VLA
+                uint64_t count[simDim];
+                for(int d = 0; d < varInfo->ndim; ++d)
+                {
+                    /* \see adios_define_var: z,y,x in C-order */
+                    start[d] = gridPos.revert()[d];
+                    count[d] = 1;
+                }
+                ADIOS_SELECTION* fSel = adios_selection_boundingbox(varInfo->ndim, start, count);
+
+                // avoid deadlock between not finished pmacc tasks and mpi calls in adios
+                __getTransactionEvent().waitForFinished();
+
+                /* specify what we want to read, the actual read starts below at `adios_perform_reads` */
+                /* magic parameters (0, 1): `from_step` (unused in streams), `nsteps` to read (must be 1 for stream) */
+                log<picLog::INPUT_OUTPUT>("ADIOS: Schedule read skalar %1%)") % datasetName;
+                ADIOS_CMD(adios_schedule_read(params.fp, fSel, datasetName.c_str(), 0, 1, (void*) value));
+                /* start a blocking read of all scheduled variables */
+                ADIOS_CMD(adios_perform_reads(params.fp, 1));
+                adios_selection_delete(fSel);
+                adios_free_varinfo(varInfo);
+
+                if(!attrName.empty())
+                {
+                    log<picLog::INPUT_OUTPUT>("ADIOS: read attribute %1% for scalars: %2%") % attrName % name;
+                    *attribute = readAttribute<T_Attribute>(params.fp, datasetName, attrName);
+                }
+            }
+        };
+
+    } // namespace adios
+} // namespace picongpu
diff --git a/include/picongpu/plugins/adios/WriteMeta.hpp b/include/picongpu/plugins/adios/WriteMeta.hpp
index 5ef96f6573..0d55bd705d 100644
--- a/include/picongpu/plugins/adios/WriteMeta.hpp
+++ b/include/picongpu/plugins/adios/WriteMeta.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl
+/* Copyright 2013-2021 Axel Huebl
  *
  * This file is part of PIConGPU.
  *
@@ -37,280 +37,440 @@
 
 namespace picongpu
 {
-namespace adios
-{
-using namespace pmacc;
-
-namespace writeMeta
-{
-    /** write openPMD species meta data
-     *
-     * @tparam numSpecies count of defined species
-     */
-    template< uint32_t numSpecies = bmpl::size<VectorAllSpecies>::type::value >
-    struct OfAllSpecies
+    namespace adios
     {
-        /** write meta data for species
-         *
-         * @param threadParams context of the adios plugin
-         * @param fullMeshesPath path to mesh entry
-         */
-        void operator()(
-            ThreadParams* threadParams,
-            const std::string& fullMeshesPath
-        ) const
+        using namespace pmacc;
+
+        namespace writeMeta
         {
-            // assume all boundaries are like the first species for openPMD 1.0.0
-            GetStringProperties<bmpl::at_c<VectorAllSpecies, 0>::type> particleBoundaryProp;
-            std::list<std::string> listParticleBoundary;
-            std::list<std::string> listParticleBoundaryParam;
-            for( uint32_t i = NumberOfExchanges<simDim>::value - 1; i > 0; --i )
+            /** write openPMD species meta data
+             *
+             * @tparam numSpecies count of defined species
+             */
+            template<uint32_t numSpecies = bmpl::size<VectorAllSpecies>::type::value>
+            struct OfAllSpecies
             {
-                if( FRONT % i == 0 )
+                /** write meta data for species: attributes `particleBoundary` and `particleBoundaryParameters`
+                 *
+                 * @param threadParams context of the adios plugin
+                 * @param fullMeshesPath path to mesh entry
+                 */
+                void operator()(ThreadParams* threadParams, const std::string& fullMeshesPath) const
                 {
-                    listParticleBoundary.push_back(
-                        particleBoundaryProp[ExchangeTypeNames()[i]]["name"].value
-                    );
-                    listParticleBoundaryParam.push_back(
-                        particleBoundaryProp[ExchangeTypeNames()[i]]["param"].value
-                    );
+                    // assume all boundaries are like the first species for openPMD 1.0.0
+                    GetStringProperties<bmpl::at_c<VectorAllSpecies, 0>::type> particleBoundaryProp;
+                    std::list<std::string> listParticleBoundary;
+                    std::list<std::string> listParticleBoundaryParam;
+                    for(uint32_t i = NumberOfExchanges<simDim>::value - 1; i > 0; --i)
+                    {
+                        if(FRONT % i == 0)
+                        {
+                            listParticleBoundary.push_back(particleBoundaryProp[ExchangeTypeNames()[i]]["name"].value);
+                            listParticleBoundaryParam.push_back(
+                                particleBoundaryProp[ExchangeTypeNames()[i]]["param"].value);
+                        }
+                    }
+                    helper::GetADIOSArrayOfString getADIOSArrayOfString;
+                    auto arrParticleBoundary = getADIOSArrayOfString(listParticleBoundary);
+                    auto arrParticleBoundaryParam = getADIOSArrayOfString(listParticleBoundaryParam);
+
+                    ADIOS_CMD(adios_define_attribute_byvalue(
+                        threadParams->adiosGroupHandle,
+                        "particleBoundary",
+                        fullMeshesPath.c_str(),
+                        adios_string_array,
+                        listParticleBoundary.size(),
+                        &(arrParticleBoundary.starts.at(0))));
+                    ADIOS_CMD(adios_define_attribute_byvalue(
+                        threadParams->adiosGroupHandle,
+                        "particleBoundaryParameters",
+                        fullMeshesPath.c_str(),
+                        adios_string_array,
+                        listParticleBoundaryParam.size(),
+                        &(arrParticleBoundaryParam.starts.at(0))));
                 }
-            }
-            helper::GetADIOSArrayOfString getADIOSArrayOfString;
-            auto arrParticleBoundary = getADIOSArrayOfString( listParticleBoundary );
-            auto arrParticleBoundaryParam = getADIOSArrayOfString( listParticleBoundaryParam );
-
-            ADIOS_CMD(adios_define_attribute_byvalue(threadParams->adiosGroupHandle,
-                "particleBoundary", fullMeshesPath.c_str(), adios_string_array,
-                listParticleBoundary.size(), &( arrParticleBoundary.starts.at( 0 ) )));
-            ADIOS_CMD(adios_define_attribute_byvalue(threadParams->adiosGroupHandle,
-                "particleBoundaryParameters", fullMeshesPath.c_str(), adios_string_array,
-                listParticleBoundaryParam.size(), &( arrParticleBoundaryParam.starts.at( 0 ) )));
-        }
-    };
-
-    /** specialization if no species are defined */
-    template< >
-    struct OfAllSpecies< 0 >
-    {
-        /** write meta data for species
-         *
-         * @param threadParams context of the adios plugin
-         * @param fullMeshesPath path to mesh entry
-         */
-        void operator()(
-            ThreadParams* /* threadParams */,
-            const std::string& /* fullMeshesPath */
-        ) const
-        {
-        }
-    };
-
-} // namespace writeMeta
+            };
 
-    struct WriteMeta
-    {
-        void operator()(ThreadParams *threadParams)
-        {
-            log<picLog::INPUT_OUTPUT > ("ADIOS: (begin) write meta attributes.");
-
-            traits::PICToAdios<uint32_t> adiosUInt32Type;
-            traits::PICToAdios<float_X> adiosFloatXType;
-            traits::PICToAdios<float_64> adiosDoubleType;
-
-            /* openPMD attributes */
-            /*   required */
-            const std::string openPMDversion( "1.0.0" );
-            ADIOS_CMD(adios_define_attribute_byvalue(threadParams->adiosGroupHandle,
-                "openPMD", "/", adios_string, 1, (void*)openPMDversion.c_str()));
-
-            const uint32_t openPMDextension = 1; // ED-PIC ID
-            ADIOS_CMD(adios_define_attribute_byvalue(threadParams->adiosGroupHandle,
-                "openPMDextension", "/", adiosUInt32Type.type, 1, (void*)&openPMDextension));
-
-            const std::string basePath( ADIOS_PATH_ROOT"%T/" );
-            ADIOS_CMD(adios_define_attribute_byvalue(threadParams->adiosGroupHandle,
-                "basePath", "/", adios_string, 1, (void*)basePath.c_str()));
-
-            const std::string meshesPath( ADIOS_PATH_FIELDS );
-            ADIOS_CMD(adios_define_attribute_byvalue(threadParams->adiosGroupHandle,
-                "meshesPath", "/", adios_string, 1, (void*)meshesPath.c_str()));
-
-            const std::string particlesPath( ADIOS_PATH_PARTICLES );
-            ADIOS_CMD(adios_define_attribute_byvalue(threadParams->adiosGroupHandle,
-                "particlesPath", "/", adios_string, 1, (void*)particlesPath.c_str()));
-
-            const std::string iterationEncoding( "fileBased" );
-            ADIOS_CMD(adios_define_attribute_byvalue(threadParams->adiosGroupHandle,
-                "iterationEncoding", "/", adios_string, 1, (void*)iterationEncoding.c_str()));
-
-            const std::string iterationFormat(
-                Environment< simDim >::get().Filesystem().basename( threadParams->adiosFilename ) +
-                std::string("_%T.bp")
-            );
-            ADIOS_CMD(adios_define_attribute_byvalue(threadParams->adiosGroupHandle,
-                "iterationFormat", "/", adios_string, 1, (void*)iterationFormat.c_str()));
-
-            /*   recommended */
-            const std::string author = Environment<>::get().SimulationDescription().getAuthor();
-            if( author.length() > 0 )
+            /** specialization if no species are defined */
+            template<>
+            struct OfAllSpecies<0>
             {
-                ADIOS_CMD(adios_define_attribute_byvalue(threadParams->adiosGroupHandle,
-                    "author", "/", adios_string, 1, (void*)author.c_str()));
-            }
+                /** write meta data for species: intentionally empty because no species are defined
+                 *
+                 * @param threadParams context of the adios plugin (unused)
+                 * @param fullMeshesPath path to mesh entry (unused)
+                 */
+                void operator()(
+                    ThreadParams* /* threadParams */,
+                    const std::string& /* fullMeshesPath */
+                ) const
+                {
+                }
+            };
 
-            const std::string software( "PIConGPU" );
-            ADIOS_CMD(adios_define_attribute_byvalue(threadParams->adiosGroupHandle,
-                "software", "/", adios_string, 1, (void*)software.c_str()));
-
-            std::stringstream softwareVersion;
-            softwareVersion << PICONGPU_VERSION_MAJOR << "."
-                            << PICONGPU_VERSION_MINOR << "."
-                            << PICONGPU_VERSION_PATCH;
-            if( ! std::string(PICONGPU_VERSION_LABEL).empty() )
-                softwareVersion << "-" << PICONGPU_VERSION_LABEL;
-            ADIOS_CMD(adios_define_attribute_byvalue(threadParams->adiosGroupHandle,
-                "softwareVersion", "/", adios_string, 1, (void*)softwareVersion.str().c_str()));
-
-            const std::string date = helper::getDateString( "%F %T %z" );
-            ADIOS_CMD(adios_define_attribute_byvalue(threadParams->adiosGroupHandle,
-                "date", "/", adios_string, 1, (void*)date.c_str()));
-
-            /*   ED-PIC */
-            const std::string fullMeshesPath( threadParams->adiosBasePath +
-                std::string(ADIOS_PATH_FIELDS) );
-
-            GetStringProperties<fields::Solver> fieldSolverProps;
-            const std::string fieldSolver( fieldSolverProps["name"].value );
-            ADIOS_CMD(adios_define_attribute_byvalue(threadParams->adiosGroupHandle,
-                "fieldSolver", fullMeshesPath.c_str(), adios_string, 1, (void*)fieldSolver.c_str()));
-
-            /* order as in axisLabels:
-             *    3D: z-lower, z-upper, y-lower, y-upper, x-lower, x-upper
-             *    2D: y-lower, y-upper, x-lower, x-upper
-             */
-            GetStringProperties<fields::absorber::Absorber> fieldBoundaryProp;
-            std::list<std::string> listFieldBoundary;
-            std::list<std::string> listFieldBoundaryParam;
-            for( uint32_t i = NumberOfExchanges<simDim>::value - 1; i > 0; --i )
+        } // namespace writeMeta
+
+        struct WriteMeta
+        {
+            void operator()(ThreadParams* threadParams)
             {
-                if( FRONT % i == 0 )
+                log<picLog::INPUT_OUTPUT>("ADIOS: (begin) write meta attributes.");
+
+                traits::PICToAdios<uint32_t> adiosUInt32Type;
+                traits::PICToAdios<float_X> adiosFloatXType;
+                traits::PICToAdios<float_64> adiosDoubleType;
+
+                /* openPMD attributes */
+                /*   required */
+                const std::string openPMDversion("1.0.0");
+                ADIOS_CMD(adios_define_attribute_byvalue(
+                    threadParams->adiosGroupHandle,
+                    "openPMD",
+                    "/",
+                    adios_string,
+                    1,
+                    (void*) openPMDversion.c_str()));
+
+                const uint32_t openPMDextension = 1; // ED-PIC ID
+                ADIOS_CMD(adios_define_attribute_byvalue(
+                    threadParams->adiosGroupHandle,
+                    "openPMDextension",
+                    "/",
+                    adiosUInt32Type.type,
+                    1,
+                    (void*) &openPMDextension));
+
+                const std::string basePath(ADIOS_PATH_ROOT "%T/");
+                ADIOS_CMD(adios_define_attribute_byvalue(
+                    threadParams->adiosGroupHandle,
+                    "basePath",
+                    "/",
+                    adios_string,
+                    1,
+                    (void*) basePath.c_str()));
+
+                const std::string meshesPath(ADIOS_PATH_FIELDS);
+                ADIOS_CMD(adios_define_attribute_byvalue(
+                    threadParams->adiosGroupHandle,
+                    "meshesPath",
+                    "/",
+                    adios_string,
+                    1,
+                    (void*) meshesPath.c_str()));
+
+                const std::string particlesPath(ADIOS_PATH_PARTICLES);
+                ADIOS_CMD(adios_define_attribute_byvalue(
+                    threadParams->adiosGroupHandle,
+                    "particlesPath",
+                    "/",
+                    adios_string,
+                    1,
+                    (void*) particlesPath.c_str()));
+
+                const std::string iterationEncoding("fileBased");
+                ADIOS_CMD(adios_define_attribute_byvalue(
+                    threadParams->adiosGroupHandle,
+                    "iterationEncoding",
+                    "/",
+                    adios_string,
+                    1,
+                    (void*) iterationEncoding.c_str()));
+
+                const std::string iterationFormat(
+                    Environment<simDim>::get().Filesystem().basename(threadParams->adiosFilename)
+                    + std::string("_%T.bp"));
+                ADIOS_CMD(adios_define_attribute_byvalue(
+                    threadParams->adiosGroupHandle,
+                    "iterationFormat",
+                    "/",
+                    adios_string,
+                    1,
+                    (void*) iterationFormat.c_str()));
+
+                /*   recommended */
+                const std::string author = Environment<>::get().SimulationDescription().getAuthor();
+                if(author.length() > 0)
                 {
-                    listFieldBoundary.push_back(
-                        fieldBoundaryProp[ExchangeTypeNames()[i]]["name"].value
-                    );
-                    listFieldBoundaryParam.push_back(
-                        fieldBoundaryProp[ExchangeTypeNames()[i]]["param"].value
-                    );
+                    ADIOS_CMD(adios_define_attribute_byvalue(
+                        threadParams->adiosGroupHandle,
+                        "author",
+                        "/",
+                        adios_string,
+                        1,
+                        (void*) author.c_str()));
                 }
-            }
-            helper::GetADIOSArrayOfString getADIOSArrayOfString;
-            auto arrFieldBoundary = getADIOSArrayOfString( listFieldBoundary );
-            auto arrFieldBoundaryParam = getADIOSArrayOfString( listFieldBoundaryParam );
 
-            ADIOS_CMD(adios_define_attribute_byvalue(threadParams->adiosGroupHandle,
-                "fieldBoundary", fullMeshesPath.c_str(), adios_string_array,
-                listFieldBoundary.size(), &( arrFieldBoundary.starts.at( 0 ) )));
-            ADIOS_CMD(adios_define_attribute_byvalue(threadParams->adiosGroupHandle,
-                "fieldBoundaryParameters", fullMeshesPath.c_str(), adios_string_array,
-                listFieldBoundaryParam.size(), &( arrFieldBoundaryParam.starts.at( 0 ) )));
-
-            writeMeta::OfAllSpecies<>()( threadParams, fullMeshesPath );
+                const std::string software("PIConGPU");
+                ADIOS_CMD(adios_define_attribute_byvalue(
+                    threadParams->adiosGroupHandle,
+                    "software",
+                    "/",
+                    adios_string,
+                    1,
+                    (void*) software.c_str()));
+
+                std::stringstream softwareVersion;
+                softwareVersion << PICONGPU_VERSION_MAJOR << "." << PICONGPU_VERSION_MINOR << "."
+                                << PICONGPU_VERSION_PATCH;
+                if(!std::string(PICONGPU_VERSION_LABEL).empty())
+                    softwareVersion << "-" << PICONGPU_VERSION_LABEL;
+                ADIOS_CMD(adios_define_attribute_byvalue(
+                    threadParams->adiosGroupHandle,
+                    "softwareVersion",
+                    "/",
+                    adios_string,
+                    1,
+                    (void*) softwareVersion.str().c_str()));
+
+                const std::string date = helper::getDateString("%F %T %z");
+                ADIOS_CMD(adios_define_attribute_byvalue(
+                    threadParams->adiosGroupHandle,
+                    "date",
+                    "/",
+                    adios_string,
+                    1,
+                    (void*) date.c_str()));
+
+                /*   ED-PIC */
+                const std::string fullMeshesPath(threadParams->adiosBasePath + std::string(ADIOS_PATH_FIELDS));
+
+                GetStringProperties<fields::Solver> fieldSolverProps;
+                const std::string fieldSolver(fieldSolverProps["name"].value);
+                ADIOS_CMD(adios_define_attribute_byvalue(
+                    threadParams->adiosGroupHandle,
+                    "fieldSolver",
+                    fullMeshesPath.c_str(),
+                    adios_string,
+                    1,
+                    (void*) fieldSolver.c_str()));
+                if(fieldSolverProps.find("param") != fieldSolverProps.end())
+                {
+                    const std::string fieldSolverParam(fieldSolverProps["param"].value);
+                    ADIOS_CMD(adios_define_attribute_byvalue(
+                        threadParams->adiosGroupHandle,
+                        "fieldSolverParameters",
+                        fullMeshesPath.c_str(),
+                        adios_string,
+                        1,
+                        (void*) fieldSolverParam.c_str()));
+                }
 
-            GetStringProperties<typename fields::Solver::CurrentInterpolation> currentSmoothingProp;
-            const std::string currentSmoothing( currentSmoothingProp["name"].value );
-            ADIOS_CMD(adios_define_attribute_byvalue(threadParams->adiosGroupHandle,
-                "currentSmoothing", fullMeshesPath.c_str(), adios_string, 1, (void*)currentSmoothing.c_str()));
+                /* order as in axisLabels:
+                 *    3D: z-lower, z-upper, y-lower, y-upper, x-lower, x-upper
+                 *    2D: y-lower, y-upper, x-lower, x-upper
+                 */
+                GetStringProperties<fields::absorber::Absorber> fieldBoundaryProp;
+                std::list<std::string> listFieldBoundary;
+                std::list<std::string> listFieldBoundaryParam;
+                for(uint32_t i = NumberOfExchanges<simDim>::value - 1; i > 0; --i)
+                {
+                    if(FRONT % i == 0)
+                    {
+                        listFieldBoundary.push_back(fieldBoundaryProp[ExchangeTypeNames()[i]]["name"].value);
+                        listFieldBoundaryParam.push_back(fieldBoundaryProp[ExchangeTypeNames()[i]]["param"].value);
+                    }
+                }
+                helper::GetADIOSArrayOfString getADIOSArrayOfString;
+                auto arrFieldBoundary = getADIOSArrayOfString(listFieldBoundary);
+                auto arrFieldBoundaryParam = getADIOSArrayOfString(listFieldBoundaryParam);
+
+                ADIOS_CMD(adios_define_attribute_byvalue(
+                    threadParams->adiosGroupHandle,
+                    "fieldBoundary",
+                    fullMeshesPath.c_str(),
+                    adios_string_array,
+                    listFieldBoundary.size(),
+                    &(arrFieldBoundary.starts.at(0))));
+                ADIOS_CMD(adios_define_attribute_byvalue(
+                    threadParams->adiosGroupHandle,
+                    "fieldBoundaryParameters",
+                    fullMeshesPath.c_str(),
+                    adios_string_array,
+                    listFieldBoundaryParam.size(),
+                    &(arrFieldBoundaryParam.starts.at(0))));
+
+                writeMeta::OfAllSpecies<>()(threadParams, fullMeshesPath);
+
+                GetStringProperties<currentInterpolation::CurrentInterpolationInfo> currentSmoothingProp;
+                const std::string currentSmoothing(currentSmoothingProp["name"].value);
+                ADIOS_CMD(adios_define_attribute_byvalue(
+                    threadParams->adiosGroupHandle,
+                    "currentSmoothing",
+                    fullMeshesPath.c_str(),
+                    adios_string,
+                    1,
+                    (void*) currentSmoothing.c_str()));
+
+                if(currentSmoothingProp.find("param") != currentSmoothingProp.end())
+                {
+                    const std::string currentSmoothingParam(currentSmoothingProp["param"].value);
+                    ADIOS_CMD(adios_define_attribute_byvalue(
+                        threadParams->adiosGroupHandle,
+                        "currentSmoothingParameters",
+                        fullMeshesPath.c_str(),
+                        adios_string,
+                        1,
+                        (void*) currentSmoothingParam.c_str()));
+                }
 
-            if( currentSmoothingProp.find( "param" ) != currentSmoothingProp.end() )
-            {
-                const std::string currentSmoothingParam( currentSmoothingProp["param"].value );
-                ADIOS_CMD(adios_define_attribute_byvalue(threadParams->adiosGroupHandle,
-                    "currentSmoothingParameters", fullMeshesPath.c_str(), adios_string,
-                    1, (void*)currentSmoothingParam.c_str()));
+                const std::string chargeCorrection("none");
+                ADIOS_CMD(adios_define_attribute_byvalue(
+                    threadParams->adiosGroupHandle,
+                    "chargeCorrection",
+                    fullMeshesPath.c_str(),
+                    adios_string,
+                    1,
+                    (void*) chargeCorrection.c_str()));
+
+                /* write current iteration */
+                log<picLog::INPUT_OUTPUT>("ADIOS: meta: iteration");
+                ADIOS_CMD(adios_define_attribute_byvalue(
+                    threadParams->adiosGroupHandle,
+                    "iteration",
+                    threadParams->adiosBasePath.c_str(),
+                    adiosUInt32Type.type,
+                    1,
+                    (void*) &threadParams->currentStep));
+
+                /* write number of slides */
+                log<picLog::INPUT_OUTPUT>("ADIOS: meta: sim_slides");
+                uint32_t slides = MovingWindow::getInstance().getSlideCounter(threadParams->currentStep);
+                ADIOS_CMD(adios_define_attribute_byvalue(
+                    threadParams->adiosGroupHandle,
+                    "sim_slides",
+                    threadParams->adiosBasePath.c_str(),
+                    adiosUInt32Type.type,
+                    1,
+                    (void*) &slides));
+
+                /* openPMD: required time attributes */
+                ADIOS_CMD(adios_define_attribute_byvalue(
+                    threadParams->adiosGroupHandle,
+                    "dt",
+                    threadParams->adiosBasePath.c_str(),
+                    adiosFloatXType.type,
+                    1,
+                    (void*) &DELTA_T));
+                const float_X time = float_X(threadParams->currentStep) * DELTA_T;
+                ADIOS_CMD(adios_define_attribute_byvalue(
+                    threadParams->adiosGroupHandle,
+                    "time",
+                    threadParams->adiosBasePath.c_str(),
+                    adiosFloatXType.type,
+                    1,
+                    (void*) &time));
+                ADIOS_CMD(adios_define_attribute_byvalue(
+                    threadParams->adiosGroupHandle,
+                    "timeUnitSI",
+                    threadParams->adiosBasePath.c_str(),
+                    adiosDoubleType.type,
+                    1,
+                    (void*) &UNIT_TIME));
+
+                /* write normed grid parameters */
+                log<picLog::INPUT_OUTPUT>("ADIOS: meta: grid");
+                ADIOS_CMD(adios_define_attribute_byvalue(
+                    threadParams->adiosGroupHandle,
+                    "cell_width",
+                    threadParams->adiosBasePath.c_str(),
+                    adiosFloatXType.type,
+                    1,
+                    (void*) &cellSize[0]));
+                ADIOS_CMD(adios_define_attribute_byvalue(
+                    threadParams->adiosGroupHandle,
+                    "cell_height",
+                    threadParams->adiosBasePath.c_str(),
+                    adiosFloatXType.type,
+                    1,
+                    (void*) &cellSize[1]));
+
+                ADIOS_CMD(adios_define_attribute_byvalue(
+                    threadParams->adiosGroupHandle,
+                    "cell_depth",
+                    threadParams->adiosBasePath.c_str(),
+                    adiosFloatXType.type,
+                    1,
+                    (void*) &cellSize[2]));
+
+
+                /* write base units */
+                log<picLog::INPUT_OUTPUT>("ADIOS: meta: units");
+                ADIOS_CMD(adios_define_attribute_byvalue(
+                    threadParams->adiosGroupHandle,
+                    "unit_energy",
+                    threadParams->adiosBasePath.c_str(),
+                    adiosDoubleType.type,
+                    1,
+                    (void*) &UNIT_ENERGY));
+                ADIOS_CMD(adios_define_attribute_byvalue(
+                    threadParams->adiosGroupHandle,
+                    "unit_length",
+                    threadParams->adiosBasePath.c_str(),
+                    adiosDoubleType.type,
+                    1,
+                    (void*) &UNIT_LENGTH));
+                ADIOS_CMD(adios_define_attribute_byvalue(
+                    threadParams->adiosGroupHandle,
+                    "unit_speed",
+                    threadParams->adiosBasePath.c_str(),
+                    adiosDoubleType.type,
+                    1,
+                    (void*) &UNIT_SPEED));
+                ADIOS_CMD(adios_define_attribute_byvalue(
+                    threadParams->adiosGroupHandle,
+                    "unit_time",
+                    threadParams->adiosBasePath.c_str(),
+                    adiosDoubleType.type,
+                    1,
+                    (void*) &UNIT_TIME));
+                ADIOS_CMD(adios_define_attribute_byvalue(
+                    threadParams->adiosGroupHandle,
+                    "unit_mass",
+                    threadParams->adiosBasePath.c_str(),
+                    adiosDoubleType.type,
+                    1,
+                    (void*) &UNIT_MASS));
+                ADIOS_CMD(adios_define_attribute_byvalue(
+                    threadParams->adiosGroupHandle,
+                    "unit_charge",
+                    threadParams->adiosBasePath.c_str(),
+                    adiosDoubleType.type,
+                    1,
+                    (void*) &UNIT_CHARGE));
+                ADIOS_CMD(adios_define_attribute_byvalue(
+                    threadParams->adiosGroupHandle,
+                    "unit_efield",
+                    threadParams->adiosBasePath.c_str(),
+                    adiosDoubleType.type,
+                    1,
+                    (void*) &UNIT_EFIELD));
+                ADIOS_CMD(adios_define_attribute_byvalue(
+                    threadParams->adiosGroupHandle,
+                    "unit_bfield",
+                    threadParams->adiosBasePath.c_str(),
+                    adiosDoubleType.type,
+                    1,
+                    (void*) &UNIT_BFIELD));
+
+                /* write physical constants */
+                log<picLog::INPUT_OUTPUT>("ADIOS: meta: mue0/eps0");
+                ADIOS_CMD(adios_define_attribute_byvalue(
+                    threadParams->adiosGroupHandle,
+                    "mue0",
+                    threadParams->adiosBasePath.c_str(),
+                    adiosFloatXType.type,
+                    1,
+                    (void*) &MUE0));
+                ADIOS_CMD(adios_define_attribute_byvalue(
+                    threadParams->adiosGroupHandle,
+                    "eps0",
+                    threadParams->adiosBasePath.c_str(),
+                    adiosFloatXType.type,
+                    1,
+                    (void*) &EPS0));
+
+                log<picLog::INPUT_OUTPUT>("ADIOS: ( end ) wite meta attributes.");
             }
-
-            const std::string chargeCorrection( "none" );
-            ADIOS_CMD(adios_define_attribute_byvalue(threadParams->adiosGroupHandle,
-                "chargeCorrection", fullMeshesPath.c_str(), adios_string, 1, (void*)chargeCorrection.c_str()));
-
-            /* write current iteration */
-            log<picLog::INPUT_OUTPUT > ("ADIOS: meta: iteration");
-            ADIOS_CMD(adios_define_attribute_byvalue(threadParams->adiosGroupHandle,
-                      "iteration", threadParams->adiosBasePath.c_str(),
-                      adiosUInt32Type.type, 1, (void*)&threadParams->currentStep ));
-
-            /* write number of slides */
-            log<picLog::INPUT_OUTPUT > ("ADIOS: meta: sim_slides");
-            uint32_t slides = MovingWindow::getInstance().getSlideCounter(threadParams->currentStep);
-            ADIOS_CMD(adios_define_attribute_byvalue(threadParams->adiosGroupHandle,
-                      "sim_slides", threadParams->adiosBasePath.c_str(),
-                      adiosUInt32Type.type, 1, (void*)&slides ));
-
-            /* openPMD: required time attributes */
-            ADIOS_CMD(adios_define_attribute_byvalue(threadParams->adiosGroupHandle,
-                      "dt", threadParams->adiosBasePath.c_str(),
-                      adiosFloatXType.type, 1, (void*)&DELTA_T ));
-            const float_X time = float_X( threadParams->currentStep ) * DELTA_T;
-            ADIOS_CMD(adios_define_attribute_byvalue(threadParams->adiosGroupHandle,
-                      "time", threadParams->adiosBasePath.c_str(),
-                      adiosFloatXType.type, 1, (void*)&time ));
-            ADIOS_CMD(adios_define_attribute_byvalue(threadParams->adiosGroupHandle,
-                      "timeUnitSI", threadParams->adiosBasePath.c_str(),
-                      adiosDoubleType.type, 1, (void*)&UNIT_TIME ));
-
-            /* write normed grid parameters */
-            log<picLog::INPUT_OUTPUT > ("ADIOS: meta: grid");
-            ADIOS_CMD(adios_define_attribute_byvalue(threadParams->adiosGroupHandle,
-                      "cell_width", threadParams->adiosBasePath.c_str(),
-                      adiosFloatXType.type, 1, (void*)&cellSize[0] ));
-            ADIOS_CMD(adios_define_attribute_byvalue(threadParams->adiosGroupHandle,
-                      "cell_height", threadParams->adiosBasePath.c_str(),
-                      adiosFloatXType.type, 1, (void*)&cellSize[1] ));
-
-            ADIOS_CMD(adios_define_attribute_byvalue(threadParams->adiosGroupHandle,
-                      "cell_depth", threadParams->adiosBasePath.c_str(),
-                      adiosFloatXType.type, 1, (void*)&cellSize[2] ));
-
-
-            /* write base units */
-            log<picLog::INPUT_OUTPUT > ("ADIOS: meta: units");
-            ADIOS_CMD(adios_define_attribute_byvalue(threadParams->adiosGroupHandle,
-                      "unit_energy", threadParams->adiosBasePath.c_str(),
-                      adiosDoubleType.type, 1, (void*)&UNIT_ENERGY ));
-            ADIOS_CMD(adios_define_attribute_byvalue(threadParams->adiosGroupHandle,
-                      "unit_length", threadParams->adiosBasePath.c_str(),
-                      adiosDoubleType.type, 1, (void*)&UNIT_LENGTH ));
-            ADIOS_CMD(adios_define_attribute_byvalue(threadParams->adiosGroupHandle,
-                      "unit_speed", threadParams->adiosBasePath.c_str(),
-                      adiosDoubleType.type, 1, (void*)&UNIT_SPEED ));
-            ADIOS_CMD(adios_define_attribute_byvalue(threadParams->adiosGroupHandle,
-                      "unit_time", threadParams->adiosBasePath.c_str(),
-                      adiosDoubleType.type, 1, (void*)&UNIT_TIME ));
-            ADIOS_CMD(adios_define_attribute_byvalue(threadParams->adiosGroupHandle,
-                      "unit_mass", threadParams->adiosBasePath.c_str(),
-                      adiosDoubleType.type, 1, (void*)&UNIT_MASS ));
-            ADIOS_CMD(adios_define_attribute_byvalue(threadParams->adiosGroupHandle,
-                      "unit_charge", threadParams->adiosBasePath.c_str(),
-                      adiosDoubleType.type, 1, (void*)&UNIT_CHARGE ));
-            ADIOS_CMD(adios_define_attribute_byvalue(threadParams->adiosGroupHandle,
-                      "unit_efield", threadParams->adiosBasePath.c_str(),
-                      adiosDoubleType.type, 1, (void*)&UNIT_EFIELD ));
-            ADIOS_CMD(adios_define_attribute_byvalue(threadParams->adiosGroupHandle,
-                      "unit_bfield", threadParams->adiosBasePath.c_str(),
-                      adiosDoubleType.type, 1, (void*)&UNIT_BFIELD ));
-
-            /* write physical constants */
-            log<picLog::INPUT_OUTPUT > ("ADIOS: meta: mue0/eps0");
-            ADIOS_CMD(adios_define_attribute_byvalue(threadParams->adiosGroupHandle,
-                      "mue0", threadParams->adiosBasePath.c_str(),
-                      adiosFloatXType.type, 1, (void*)&MUE0 ));
-            ADIOS_CMD(adios_define_attribute_byvalue(threadParams->adiosGroupHandle,
-                      "eps0", threadParams->adiosBasePath.c_str(),
-                      adiosFloatXType.type, 1, (void*)&EPS0 ));
-
-            log<picLog::INPUT_OUTPUT > ("ADIOS: ( end ) wite meta attributes.");
-        }
-    };
-} // namespace adios
+        };
+    } // namespace adios
 } // namespace picongpu
diff --git a/include/picongpu/plugins/adios/WriteSpecies.hpp b/include/picongpu/plugins/adios/WriteSpecies.hpp
index 1f22849ad4..17f1255ec5 100644
--- a/include/picongpu/plugins/adios/WriteSpecies.hpp
+++ b/include/picongpu/plugins/adios/WriteSpecies.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2014-2020 Rene Widera, Felix Schmitt, Axel Huebl,
+/* Copyright 2014-2021 Rene Widera, Felix Schmitt, Axel Huebl,
  *                     Alexander Grund
  *
  * This file is part of PIConGPU.
@@ -37,9 +37,7 @@
 #include <pmacc/particles/operations/ConcatListOfFrames.hpp>
 #include <pmacc/particles/particleFilter/FilterFactory.hpp>
 #include <pmacc/particles/particleFilter/PositionFilter.hpp>
-#if( PMACC_CUDA_ENABLED == 1 )
 #include <pmacc/particles/memory/buffers/MallocMCBuffer.hpp>
-#endif
 
 #include <boost/mpl/vector.hpp>
 #include <boost/mpl/pair.hpp>
@@ -53,162 +51,159 @@
 
 namespace picongpu
 {
-
-namespace adios
-{
-using namespace pmacc;
-
-/** Write copy particle to host memory and dump to ADIOS file
- *
- * @tparam T_Species type of species
- *
- */
-template< typename T_SpeciesFilter >
-struct WriteSpecies
-{
-public:
-
-    typedef typename T_SpeciesFilter::Species ThisSpecies;
-    typedef typename ThisSpecies::FrameType FrameType;
-    typedef typename FrameType::ParticleDescription ParticleDescription;
-    typedef typename FrameType::ValueTypeSeq ParticleAttributeList;
-
-    /* delete multiMask and localCellIdx in adios particle*/
-    typedef bmpl::vector<multiMask,localCellIdx> TypesToDelete;
-    typedef typename RemoveFromSeq<ParticleAttributeList, TypesToDelete>::type ParticleCleanedAttributeList;
-
-    /* add totalCellIdx for adios particle*/
-    typedef typename MakeSeq<
-        ParticleCleanedAttributeList,
-        totalCellIdx
-    >::type ParticleNewAttributeList;
-
-    typedef
-    typename ReplaceValueTypeSeq<ParticleDescription, ParticleNewAttributeList>::type
-    NewParticleDescription;
-
-    typedef Frame<OperatorCreateVectorBox, NewParticleDescription> AdiosFrameType;
-
-    template<typename Space>
-    HINLINE void operator()(ThreadParams* params,
-                            const Space particleOffset)
+    namespace adios
     {
-        log<picLog::INPUT_OUTPUT > ("ADIOS: (begin) write species: %1%") % T_SpeciesFilter::getName();
-        DataConnector &dc = Environment<>::get().DataConnector();
-        /* load particle without copy particle data to host */
-        auto speciesTmp = dc.get< ThisSpecies >( ThisSpecies::FrameType::getName(), true );
-
-        /* count total number of particles on the device */
-        log<picLog::INPUT_OUTPUT > ("ADIOS:   (begin) count particles: %1%") % T_SpeciesFilter::getName();
-        // enforce that the filter interface is fulfilled
-        particles::filter::IUnary< typename T_SpeciesFilter::Filter > particleFilter{ params->currentStep };
-        uint64_cu totalNumParticles = 0;
-        totalNumParticles = pmacc::CountParticles::countOnDevice < CORE + BORDER > (
-                                                                                    *speciesTmp,
-                                                                                    *(params->cellDescription),
-                                                                                    params->localWindowToDomainOffset,
-                                                                                    params->window.localDimensions.size,
-                                                                                    particleFilter);
-        log<picLog::INPUT_OUTPUT > ("ADIOS:   ( end ) count particles: %1% = %2%") % T_SpeciesFilter::getName() % totalNumParticles;
-
-        AdiosFrameType hostFrame;
-
-        /* malloc host memory */
-        log<picLog::INPUT_OUTPUT > ("ADIOS:   (begin) malloc host memory: %1%") % T_SpeciesFilter::getName();
-        meta::ForEach<typename AdiosFrameType::ValueTypeSeq, MallocHostMemory<bmpl::_1> > mallocMem;
-        mallocMem(hostFrame, totalNumParticles);
-        log<picLog::INPUT_OUTPUT > ("ADIOS:   ( end ) malloc host memory: %1%") % T_SpeciesFilter::getName();
-
-        if (totalNumParticles > 0)
+        using namespace pmacc;
+
+        /** Copy the particles of a species to host memory and dump them to the ADIOS file
+         *
+         * @tparam T_SpeciesFilter provides the species type (`Species`) and the particle filter (`Filter`)
+         *
+         */
+        template<typename T_SpeciesFilter>
+        struct WriteSpecies
         {
-            log<picLog::INPUT_OUTPUT > ("ADIOS:   (begin) copy particle host (with hierarchy) to host (without hierarchy): %1%") % T_SpeciesFilter::getName();
-            typedef bmpl::vector< typename GetPositionFilter<simDim>::type > usedFilters;
-            typedef typename FilterFactory<usedFilters>::FilterType MyParticleFilter;
-            MyParticleFilter filter;
-            /* activate filter pipeline if moving window is activated */
-            filter.setStatus(MovingWindow::getInstance().isEnabled());
-            filter.setWindowPosition(params->localWindowToDomainOffset,
-                                     params->window.localDimensions.size);
-
-            DataConnector &dc = Environment<>::get().DataConnector();
-#if( PMACC_CUDA_ENABLED == 1 )
-            auto mallocMCBuffer = dc.get< MallocMCBuffer< DeviceHeap > >( MallocMCBuffer< DeviceHeap >::getName(), true );
-#endif
-            int globalParticleOffset = 0;
-            AreaMapping < CORE + BORDER, MappingDesc > mapper(*(params->cellDescription));
-
-            pmacc::particles::operations::ConcatListOfFrames<simDim> concatListOfFrames(mapper.getGridDim());
-
-#if( PMACC_CUDA_ENABLED == 1 )
-            auto particlesBox = speciesTmp->getHostParticlesBox( mallocMCBuffer->getOffset() );
+        public:
+            typedef typename T_SpeciesFilter::Species ThisSpecies;
+            typedef typename ThisSpecies::FrameType FrameType;
+            typedef typename FrameType::ParticleDescription ParticleDescription;
+            typedef typename FrameType::ValueTypeSeq ParticleAttributeList;
+
+            /* remove multiMask and localCellIdx from the ADIOS particle attributes */
+            typedef bmpl::vector<multiMask, localCellIdx> TypesToDelete;
+            typedef typename RemoveFromSeq<ParticleAttributeList, TypesToDelete>::type ParticleCleanedAttributeList;
+
+            /* add totalCellIdx to the ADIOS particle attributes */
+            typedef typename MakeSeq<ParticleCleanedAttributeList, totalCellIdx>::type ParticleNewAttributeList;
+
+            typedef typename ReplaceValueTypeSeq<ParticleDescription, ParticleNewAttributeList>::type
+                NewParticleDescription;
+
+            typedef Frame<OperatorCreateVectorBox, NewParticleDescription> AdiosFrameType;
+
+            template<typename Space>
+            HINLINE void operator()(ThreadParams* params, const Space particleOffset)
+            {
+                log<picLog::INPUT_OUTPUT>("ADIOS: (begin) write species: %1%") % T_SpeciesFilter::getName();
+                DataConnector& dc = Environment<>::get().DataConnector();
+                /* load the particles without copying the particle data to the host */
+                auto speciesTmp = dc.get<ThisSpecies>(ThisSpecies::FrameType::getName(), true);
+
+                /* count total number of particles on the device */
+                log<picLog::INPUT_OUTPUT>("ADIOS:   (begin) count particles: %1%") % T_SpeciesFilter::getName();
+                // enforce that the filter interface is fulfilled
+                particles::filter::IUnary<typename T_SpeciesFilter::Filter> particleFilter{params->currentStep};
+                uint64_cu totalNumParticles = 0;
+                totalNumParticles = pmacc::CountParticles::countOnDevice<CORE + BORDER>(
+                    *speciesTmp,
+                    *(params->cellDescription),
+                    params->localWindowToDomainOffset,
+                    params->window.localDimensions.size,
+                    particleFilter);
+                log<picLog::INPUT_OUTPUT>("ADIOS:   ( end ) count particles: %1% = %2%") % T_SpeciesFilter::getName()
+                    % totalNumParticles;
+
+                AdiosFrameType hostFrame;
+
+                /* malloc host memory */
+                log<picLog::INPUT_OUTPUT>("ADIOS:   (begin) malloc host memory: %1%") % T_SpeciesFilter::getName();
+                meta::ForEach<typename AdiosFrameType::ValueTypeSeq, MallocHostMemory<bmpl::_1>> mallocMem;
+                mallocMem(hostFrame, totalNumParticles);
+                log<picLog::INPUT_OUTPUT>("ADIOS:   ( end ) malloc host memory: %1%") % T_SpeciesFilter::getName();
+
+                if(totalNumParticles > 0)
+                {
+                    log<picLog::INPUT_OUTPUT>(
+                        "ADIOS:   (begin) copy particle host (with hierarchy) to host (without hierarchy): %1%")
+                        % T_SpeciesFilter::getName();
+                    typedef bmpl::vector<typename GetPositionFilter<simDim>::type> usedFilters;
+                    typedef typename FilterFactory<usedFilters>::FilterType MyParticleFilter;
+                    MyParticleFilter filter;
+                    /* activate filter pipeline if moving window is activated */
+                    filter.setStatus(MovingWindow::getInstance().isEnabled());
+                    filter.setWindowPosition(params->localWindowToDomainOffset, params->window.localDimensions.size);
+
+                    DataConnector& dc = Environment<>::get().DataConnector();
+
+                    auto mallocMCBuffer
+                        = dc.get<MallocMCBuffer<DeviceHeap>>(MallocMCBuffer<DeviceHeap>::getName(), true);
+
+                    int globalParticleOffset = 0;
+                    AreaMapping<CORE + BORDER, MappingDesc> mapper(*(params->cellDescription));
+
+                    pmacc::particles::operations::ConcatListOfFrames<simDim> concatListOfFrames(mapper.getGridDim());
+
+#if(PMACC_CUDA_ENABLED == 1 || ALPAKA_ACC_GPU_HIP_ENABLED == 1)
+                    auto particlesBox = speciesTmp->getHostParticlesBox(mallocMCBuffer->getOffset());
 #else
-            /* This separate code path is only a workaround until MallocMCBuffer
-             * is alpaka compatible.
-             *
-             * @todo remove this workaround: we know that we are allowed to access the
-             * device memory directly.
-             */
-            auto particlesBox = speciesTmp->getDeviceParticlesBox( );
-            /* Notify to the event system that the particles box is used on the host.
-             *
-             * @todo remove this workaround
-             */
-             __startOperation(ITask::TASK_HOST);
+                    /* This separate code path is only a workaround until MallocMCBuffer
+                     * is alpaka compatible.
+                     *
+                     * @todo remove this workaround: we know that we are allowed to access the
+                     * device memory directly.
+                     */
+                    auto particlesBox = speciesTmp->getDeviceParticlesBox();
+                    /* Notify to the event system that the particles box is used on the host.
+                     *
+                     * @todo remove this workaround
+                     */
+                    __startOperation(ITask::TASK_HOST);
 
 #endif
-            concatListOfFrames(
-                                globalParticleOffset,
-                                hostFrame,
-                                particlesBox,
-                                filter,
-                                particleOffset, /*relative to data domain (not to physical domain)*/
-                                totalCellIdx_,
-                                mapper,
-                                particleFilter
-                                );
-#if( PMACC_CUDA_ENABLED == 1 )
-            dc.releaseData( MallocMCBuffer< DeviceHeap >::getName() );
-#endif
-            /* this costs a little bit of time but adios writing is slower */
-            PMACC_ASSERT((uint64_cu) globalParticleOffset == totalNumParticles);
-        }
-        /* dump to adios file */
-        meta::ForEach<typename AdiosFrameType::ValueTypeSeq, adios::ParticleAttribute<bmpl::_1> > writeToAdios;
-        writeToAdios(params, hostFrame, totalNumParticles);
-
-        /* free host memory */
-        meta::ForEach<typename AdiosFrameType::ValueTypeSeq, FreeHostMemory<bmpl::_1> > freeMem;
-        freeMem(hostFrame);
-        log<picLog::INPUT_OUTPUT > ("ADIOS: ( end ) writing species: %1%") % T_SpeciesFilter::getName();
-
-        /* write species counter table to adios file */
-        log<picLog::INPUT_OUTPUT > ("ADIOS: (begin) writing particle index table for %1%") % T_SpeciesFilter::getName();
-        {
-            GridController<simDim>& gc = Environment<simDim>::get().GridController();
-
-            const size_t pos_offset = 2;
-
-            /* particlesMetaInfo = (num particles, scalar position, particle offset x, y, z) */
-            uint64_t particlesMetaInfo[5] = {totalNumParticles, gc.getScalarPosition(), 0, 0, 0};
-            for (size_t d = 0; d < simDim; ++d)
-                particlesMetaInfo[pos_offset + d] = particleOffset[d];
-
-            /* prevent that top (y) gpus have negative value here */
-            if (gc.getPosition().y() == 0)
-                particlesMetaInfo[pos_offset + 1] = 0;
-
-            if (particleOffset[1] < 0) // 1 == y
-                particlesMetaInfo[pos_offset + 1] = 0;
-
-            int64_t adiosIndexVarId = *(params->adiosSpeciesIndexVarIds.begin());
-            params->adiosSpeciesIndexVarIds.pop_front();
-            ADIOS_CMD(adios_write_byid(params->adiosFileHandle, adiosIndexVarId, particlesMetaInfo));
-        }
-        log<picLog::INPUT_OUTPUT > ("ADIOS: ( end ) writing particle index table for %1%") % T_SpeciesFilter::getName();
-    }
-};
-
-
-} //namspace adios
-
-} //namespace picongpu
+                    concatListOfFrames(
+                        globalParticleOffset,
+                        hostFrame,
+                        particlesBox,
+                        filter,
+                        particleOffset, /*relative to data domain (not to physical domain)*/
+                        totalCellIdx_,
+                        mapper,
+                        particleFilter);
+
+                    dc.releaseData(MallocMCBuffer<DeviceHeap>::getName());
+
+                    /* this costs a little bit of time but adios writing is slower */
+                    PMACC_ASSERT((uint64_cu) globalParticleOffset == totalNumParticles);
+                }
+                /* dump to adios file */
+                meta::ForEach<typename AdiosFrameType::ValueTypeSeq, adios::ParticleAttribute<bmpl::_1>> writeToAdios;
+                writeToAdios(params, hostFrame, totalNumParticles);
+
+                /* free host memory */
+                meta::ForEach<typename AdiosFrameType::ValueTypeSeq, FreeHostMemory<bmpl::_1>> freeMem;
+                freeMem(hostFrame);
+                log<picLog::INPUT_OUTPUT>("ADIOS: ( end ) writing species: %1%") % T_SpeciesFilter::getName();
+
+                /* write species counter table to adios file */
+                log<picLog::INPUT_OUTPUT>("ADIOS: (begin) writing particle index table for %1%")
+                    % T_SpeciesFilter::getName();
+                {
+                    GridController<simDim>& gc = Environment<simDim>::get().GridController();
+
+                    const size_t pos_offset = 2;
+
+                    /* particlesMetaInfo = (num particles, scalar position, particle offset x, y, z) */
+                    uint64_t particlesMetaInfo[5] = {totalNumParticles, gc.getScalarPosition(), 0, 0, 0};
+                    for(size_t d = 0; d < simDim; ++d)
+                        particlesMetaInfo[pos_offset + d] = particleOffset[d];
+
+                    /* prevent the top (y) GPUs from having a negative value here */
+                    if(gc.getPosition().y() == 0)
+                        particlesMetaInfo[pos_offset + 1] = 0;
+
+                    if(particleOffset[1] < 0) // 1 == y
+                        particlesMetaInfo[pos_offset + 1] = 0;
+
+                    int64_t adiosIndexVarId = *(params->adiosSpeciesIndexVarIds.begin());
+                    params->adiosSpeciesIndexVarIds.pop_front();
+                    ADIOS_CMD(adios_write_byid(params->adiosFileHandle, adiosIndexVarId, particlesMetaInfo));
+                }
+                log<picLog::INPUT_OUTPUT>("ADIOS: ( end ) writing particle index table for %1%")
+                    % T_SpeciesFilter::getName();
+            }
+        };
+
+
+    } // namespace adios
+
+} // namespace picongpu
diff --git a/include/picongpu/plugins/adios/restart/LoadParticleAttributesFromADIOS.hpp b/include/picongpu/plugins/adios/restart/LoadParticleAttributesFromADIOS.hpp
index 91bd349f94..21f21f03b5 100644
--- a/include/picongpu/plugins/adios/restart/LoadParticleAttributesFromADIOS.hpp
+++ b/include/picongpu/plugins/adios/restart/LoadParticleAttributesFromADIOS.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Felix Schmitt, Rene Widera
+/* Copyright 2013-2021 Axel Huebl, Felix Schmitt, Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -18,7 +18,6 @@
  */
 
 
-
 #pragma once
 
 
@@ -34,110 +33,106 @@
 
 namespace picongpu
 {
-
-namespace adios
-{
-using namespace pmacc;
-
-/** Load attribute of a species from ADIOS checkpoint file
- *
- * @tparam T_Identifier identifier of species attribute
- */
-template< typename T_Identifier>
-struct LoadParticleAttributesFromADIOS
-{
-
-    /** read attributes from ADIOS file
-     *
-     * @param params thread params with ADIOS_FILE, ...
-     * @param frame frame with all particles
-     * @param particlePath path to the group in the ADIOS file
-     * @param particlesOffset read offset in the attribute array
-     * @param elements number of elements which should be read the attribute array
-     */
-    template<typename FrameType>
-    HINLINE void operator()(
-                            ThreadParams* params,
-                            FrameType& frame,
-                            const std::string particlePath,
-                            const uint64_t particlesOffset,
-                            const uint64_t elements)
+    namespace adios
     {
-
-        typedef T_Identifier Identifier;
-        typedef typename pmacc::traits::Resolve<Identifier>::type::type ValueType;
-        const uint32_t components = GetNComponents<ValueType>::value;
-        typedef typename GetComponentsType<ValueType>::type ComponentType;
-
-        log<picLog::INPUT_OUTPUT > ("ADIOS: ( begin ) load species attribute: %1%") % Identifier::getName();
-
-        const auto componentNames = plugins::misc::getComponentNames( components );
-
-        ComponentType* tmpArray = nullptr;
-        if( elements > 0 )
-            tmpArray = new ComponentType[elements];
-
-        // dev assert!
-        if( elements > 0 )
-            PMACC_ASSERT(tmpArray);
-
-        for (uint32_t n = 0; n < components; ++n)
+        using namespace pmacc;
+
+        /** Load attribute of a species from ADIOS checkpoint file
+         *
+         * @tparam T_Identifier identifier of species attribute
+         */
+        template<typename T_Identifier>
+        struct LoadParticleAttributesFromADIOS
         {
-            OpenPMDName<T_Identifier> openPMDName;
-            std::stringstream datasetName;
-            datasetName << particlePath << openPMDName();
-            if (components > 1)
-                datasetName << "/" << componentNames[n];
-
-            ValueType* dataPtr = frame.getIdentifier(Identifier()).getPointer();
-
-            ADIOS_VARINFO* varInfo = adios_inq_var( params->fp, datasetName.str().c_str() );
-            // it's possible to aquire the local block with that call again and
-            // the local elements to-be-read, but the block-ID must be known (MPI rank?)
-            //ADIOS_CMD(adios_inq_var_blockinfo( params->fp, varInfo ));
-
-            ADIOS_SELECTION* sel = adios_selection_boundingbox( 1, &particlesOffset, &elements );
-
-            /** Note: adios_schedule_read is not a collective call in any
-             *        ADIOS method and can therefore be skipped for empty reads */
-            if( elements > 0 )
+            /** read attributes from ADIOS file
+             *
+             * @param params thread params with ADIOS_FILE, ...
+             * @param frame frame with all particles
+             * @param particlePath path to the group in the ADIOS file
+             * @param particlesOffset read offset in the attribute array
+             * @param elements number of elements which should be read from the attribute array
+             */
+            template<typename FrameType>
+            HINLINE void operator()(
+                ThreadParams* params,
+                FrameType& frame,
+                const std::string particlePath,
+                const uint64_t particlesOffset,
+                const uint64_t elements)
             {
-                // avoid deadlock between not finished pmacc tasks and mpi calls in adios
-                __getTransactionEvent().waitForFinished();
-                ADIOS_CMD(adios_schedule_read( params->fp,
-                                               sel,
-                                               datasetName.str().c_str(),
-                                               0, /* from_step (not used in streams) */
-                                               1, /* nsteps to read (must be 1 for stream) */
-                                               (void*)tmpArray ));
+                typedef T_Identifier Identifier;
+                typedef typename pmacc::traits::Resolve<Identifier>::type::type ValueType;
+                const uint32_t components = GetNComponents<ValueType>::value;
+                typedef typename GetComponentsType<ValueType>::type ComponentType;
+
+                log<picLog::INPUT_OUTPUT>("ADIOS: ( begin ) load species attribute: %1%") % Identifier::getName();
+
+                const auto componentNames = plugins::misc::getComponentNames(components);
+
+                ComponentType* tmpArray = nullptr;
+                if(elements > 0)
+                    tmpArray = new ComponentType[elements];
+
+                // dev assert: a non-empty read must have a valid temporary buffer
+                if(elements > 0)
+                    PMACC_ASSERT(tmpArray);
+
+                for(uint32_t n = 0; n < components; ++n)
+                {
+                    OpenPMDName<T_Identifier> openPMDName;
+                    std::stringstream datasetName;
+                    datasetName << particlePath << openPMDName();
+                    if(components > 1)
+                        datasetName << "/" << componentNames[n];
+
+                    ValueType* dataPtr = frame.getIdentifier(Identifier()).getPointer();
+
+                    ADIOS_VARINFO* varInfo = adios_inq_var(params->fp, datasetName.str().c_str());
+                    // it's possible to acquire the local block with that call again and
+                    // the local elements to-be-read, but the block-ID must be known (MPI rank?)
+                    // ADIOS_CMD(adios_inq_var_blockinfo( params->fp, varInfo ));
+
+                    ADIOS_SELECTION* sel = adios_selection_boundingbox(1, &particlesOffset, &elements);
+
+                    /** Note: adios_schedule_read is not a collective call in any
+                     *        ADIOS method and can therefore be skipped for empty reads */
+                    if(elements > 0)
+                    {
+                        // avoid deadlock between unfinished pmacc tasks and MPI calls in ADIOS
+                        __getTransactionEvent().waitForFinished();
+                        ADIOS_CMD(adios_schedule_read(
+                            params->fp,
+                            sel,
+                            datasetName.str().c_str(),
+                            0, /* from_step (not used in streams) */
+                            1, /* nsteps to read (must be 1 for stream) */
+                            (void*) tmpArray));
+                    }
+
+                    /** start a blocking read of all scheduled variables
+                     *  (this is a collective call in many ADIOS methods) */
+                    ADIOS_CMD(adios_perform_reads(params->fp, 1));
+
+                    log<picLog::INPUT_OUTPUT>("ADIOS:  Did read %1% local of %2% global elements for %3%") % elements
+                        % varInfo->dims[0] % datasetName.str();
+
+/* copy component n from the temporary array into the interleaved array of structs */
+#pragma omp parallel for
+                    for(size_t i = 0; i < elements; ++i)
+                    {
+                        ComponentType& ref = ((ComponentType*) dataPtr)[i * components + n];
+                        ref = tmpArray[i];
+                    }
+
+                    adios_selection_delete(sel);
+                    adios_free_varinfo(varInfo);
+                }
+                __deleteArray(tmpArray);
+
+                log<picLog::INPUT_OUTPUT>("ADIOS:  ( end ) load species attribute: %1%") % Identifier::getName();
             }
+        };
 
-            /** start a blocking read of all scheduled variables
-             *  (this is collective call in many ADIOS methods) */
-            ADIOS_CMD(adios_perform_reads( params->fp, 1 ));
-
-            log<picLog::INPUT_OUTPUT > ("ADIOS:  Did read %1% local of %2% global elements for %3%") %
-                elements % varInfo->dims[0] % datasetName.str();
-
-            /* copy component from temporary array to array of structs */
-            #pragma omp parallel for
-            for (size_t i = 0; i < elements; ++i)
-            {
-                ComponentType& ref = ((ComponentType*) dataPtr)[i * components + n];
-                ref = tmpArray[i];
-            }
-
-            adios_selection_delete( sel );
-            adios_free_varinfo( varInfo );
-        }
-        __deleteArray(tmpArray);
-
-        log<picLog::INPUT_OUTPUT > ("ADIOS:  ( end ) load species attribute: %1%") %
-            Identifier::getName();
-    }
-
-};
-
-} /* namespace adios */
+    } /* namespace adios */
 
 } /* namespace picongpu */
diff --git a/include/picongpu/plugins/adios/restart/LoadSpecies.hpp b/include/picongpu/plugins/adios/restart/LoadSpecies.hpp
index 52a2fad1ca..fc21cba1a2 100644
--- a/include/picongpu/plugins/adios/restart/LoadSpecies.hpp
+++ b/include/picongpu/plugins/adios/restart/LoadSpecies.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Rene Widera, Felix Schmitt, Axel Huebl
+/* Copyright 2013-2021 Rene Widera, Felix Schmitt, Axel Huebl
  *
  * This file is part of PIConGPU.
  *
@@ -47,151 +47,151 @@
 
 namespace picongpu
 {
-
-namespace adios
-{
-using namespace pmacc;
-
-/** Load species from ADIOS checkpoint file
- *
- * @tparam T_Species type of species
- *
- */
-template< typename T_Species >
-struct LoadSpecies
-{
-public:
-
-    typedef T_Species ThisSpecies;
-    typedef typename ThisSpecies::FrameType FrameType;
-    typedef typename FrameType::ParticleDescription ParticleDescription;
-    typedef typename FrameType::ValueTypeSeq ParticleAttributeList;
-
-
-    /* delete multiMask and localCellIdx in adios particle*/
-    typedef bmpl::vector2<multiMask, localCellIdx> TypesToDelete;
-    typedef typename RemoveFromSeq<ParticleAttributeList, TypesToDelete>::type ParticleCleanedAttributeList;
-
-    /* add totalCellIdx for adios particle*/
-    typedef typename MakeSeq<
-        ParticleCleanedAttributeList,
-        totalCellIdx
-    >::type ParticleNewAttributeList;
-
-    typedef
-    typename ReplaceValueTypeSeq<ParticleDescription, ParticleNewAttributeList>::type
-    NewParticleDescription;
-
-    typedef Frame<OperatorCreateVectorBox, NewParticleDescription> AdiosFrameType;
-
-    /** Load species from ADIOS checkpoint file
-     *
-     * @param params thread params with ADIOS_FILE, ...
-     * @param restartChunkSize number of particles processed in one kernel call
-     */
-    HINLINE void operator()(ThreadParams* params, const uint32_t restartChunkSize)
+    namespace adios
     {
-        std::string const speciesName = FrameType::getName();
-        log<picLog::INPUT_OUTPUT > ("ADIOS: (begin) load species: %1%") % speciesName;
-        DataConnector &dc = Environment<>::get().DataConnector();
-        GridController<simDim> &gc = Environment<simDim>::get().GridController();
-
-        std::string particlePath = params->adiosBasePath + std::string(ADIOS_PATH_PARTICLES) +
-                                   speciesName + std::string("/");
-        const pmacc::Selection<simDim>& localDomain = Environment<simDim>::get().SubGrid().getLocalDomain();
-
-        /* load particle without copying particle data to host */
-        auto speciesTmp = dc.get< ThisSpecies >( FrameType::getName(), true );
-
-        /* count total number of particles on the device */
-        uint64_t totalNumParticles = 0;
-
-        /* load particles info table entry for ONE process
-           (note: this is NOT necessarily THIS process!)
-           particlesInfo is (part-count, scalar pos, x, y, z) */
-        uint64_t particlesInfo[5];
-
-        uint64_t start = 5 * gc.getGlobalRank();
-        uint64_t count = 5; // ADIOSCountParticles: uint64_t
-        ADIOS_SELECTION* piSel = adios_selection_boundingbox( 1, &start, &count );
-
-        // avoid deadlock between not finished pmacc tasks and mpi calls in adios
-        __getTransactionEvent().waitForFinished();
-        ADIOS_CMD(adios_schedule_read( params->fp,
-                                       piSel,
-                                       (particlePath + std::string("particles_info")).c_str(),
-                                       0,
-                                       1,
-                                       (void*)particlesInfo ));
-
-        /* start a blocking read of all scheduled variables */
-        ADIOS_CMD(adios_perform_reads( params->fp, 1 ));
-        adios_selection_delete(piSel);
-
-        /* Run a prefix sum over the numParticles[0] element in particlesInfo
-         * to retreive the offset of particles before gc.getGlobalRank() */
-        uint64_t particleOffset = 0;
-
-        uint64_t fullParticlesInfo[gc.getGlobalSize()];
-
-        // avoid deadlock between not finished pmacc tasks and mpi blocking collectives
-        __getTransactionEvent().waitForFinished();
-        MPI_CHECK(MPI_Allgather( particlesInfo, 1, MPI_UINT64_T,
-                                 fullParticlesInfo, 1, MPI_UINT64_T,
-                                 gc.getCommunicator().getMPIComm() ));
-
-        for (size_t i = 0; i < gc.getGlobalSize(); ++i)
-        {
-            /* this comparison is potentially harmful, since the order of ranks
-               is not necessarily the same in subsequent MPI jobs.
-               But due to the wrong sorting by rank in `ADIOSCountParticles.hpp`
-               while calculating the `myParticleOffset` we have to immitate that. */
-            if( i < gc.getGlobalRank() )
-                particleOffset += fullParticlesInfo[i];
-            if( i == gc.getGlobalRank() )
-                totalNumParticles = fullParticlesInfo[i];
-        }
-
-        log<picLog::INPUT_OUTPUT > ("ADIOS: Loading %1% particles from offset %2%") %
-            (long long unsigned) totalNumParticles % (long long unsigned) particleOffset;
-
-        AdiosFrameType hostFrame;
-        log<picLog::INPUT_OUTPUT > ("ADIOS: malloc mapped memory: %1%") % speciesName;
-        /*malloc mapped memory*/
-        meta::ForEach<typename AdiosFrameType::ValueTypeSeq, MallocMemory<bmpl::_1> > mallocMem;
-        mallocMem(hostFrame, totalNumParticles);
-
-        log<picLog::INPUT_OUTPUT > ("ADIOS: get mapped memory device pointer: %1%") % speciesName;
-        /*load device pointer of mapped memory*/
-        AdiosFrameType deviceFrame;
-        meta::ForEach<typename AdiosFrameType::ValueTypeSeq, GetDevicePtr<bmpl::_1> > getDevicePtr;
-        getDevicePtr(deviceFrame, hostFrame);
-
-        meta::ForEach<typename AdiosFrameType::ValueTypeSeq, LoadParticleAttributesFromADIOS<bmpl::_1> > loadAttributes;
-        loadAttributes(params, hostFrame, particlePath, particleOffset, totalNumParticles);
-
-        if (totalNumParticles != 0)
+        using namespace pmacc;
+
+        /** Load species from ADIOS checkpoint file
+         *
+         * @tparam T_Species type of species
+         *
+         */
+        template<typename T_Species>
+        struct LoadSpecies
         {
-            pmacc::particles::operations::splitIntoListOfFrames(
-                *speciesTmp,
-                deviceFrame,
-                totalNumParticles,
-                restartChunkSize,
-                localDomain.offset,
-                totalCellIdx_,
-                *(params->cellDescription),
-                picLog::INPUT_OUTPUT()
-            );
-
-            /*free host memory*/
-            meta::ForEach<typename AdiosFrameType::ValueTypeSeq, FreeMemory<bmpl::_1> > freeMem;
-            freeMem(hostFrame);
-        }
-        log<picLog::INPUT_OUTPUT > ("ADIOS: ( end ) load species: %1%") % speciesName;
-    }
-};
-
-
-} /* namespace adios */
+        public:
+            typedef T_Species ThisSpecies;
+            typedef typename ThisSpecies::FrameType FrameType;
+            typedef typename FrameType::ParticleDescription ParticleDescription;
+            typedef typename FrameType::ValueTypeSeq ParticleAttributeList;
+
+
+            /* delete multiMask and localCellIdx in adios particle*/
+            typedef bmpl::vector2<multiMask, localCellIdx> TypesToDelete;
+            typedef typename RemoveFromSeq<ParticleAttributeList, TypesToDelete>::type ParticleCleanedAttributeList;
+
+            /* add totalCellIdx for adios particle*/
+            typedef typename MakeSeq<ParticleCleanedAttributeList, totalCellIdx>::type ParticleNewAttributeList;
+
+            typedef typename ReplaceValueTypeSeq<ParticleDescription, ParticleNewAttributeList>::type
+                NewParticleDescription;
+
+            typedef Frame<OperatorCreateVectorBox, NewParticleDescription> AdiosFrameType;
+
+            /** Load species from ADIOS checkpoint file
+             *
+             * @param params thread params with ADIOS_FILE, ...
+             * @param restartChunkSize number of particles processed in one kernel call
+             */
+            HINLINE void operator()(ThreadParams* params, const uint32_t restartChunkSize)
+            {
+                std::string const speciesName = FrameType::getName();
+                log<picLog::INPUT_OUTPUT>("ADIOS: (begin) load species: %1%") % speciesName;
+                DataConnector& dc = Environment<>::get().DataConnector();
+                GridController<simDim>& gc = Environment<simDim>::get().GridController();
+
+                std::string particlePath
+                    = params->adiosBasePath + std::string(ADIOS_PATH_PARTICLES) + speciesName + std::string("/");
+                const pmacc::Selection<simDim>& localDomain = Environment<simDim>::get().SubGrid().getLocalDomain();
+
+                /* load particle without copying particle data to host */
+                auto speciesTmp = dc.get<ThisSpecies>(FrameType::getName(), true);
+
+                /* count total number of particles on the device */
+                uint64_t totalNumParticles = 0;
+
+                /* load particles info table entry for ONE process
+                   (note: this is NOT necessarily THIS process!)
+                   particlesInfo is (part-count, scalar pos, x, y, z) */
+                uint64_t particlesInfo[5];
+
+                uint64_t start = 5 * gc.getGlobalRank();
+                uint64_t count = 5; // ADIOSCountParticles: uint64_t
+                ADIOS_SELECTION* piSel = adios_selection_boundingbox(1, &start, &count);
+
+                // avoid deadlock between unfinished pmacc tasks and MPI calls in ADIOS
+                __getTransactionEvent().waitForFinished();
+                ADIOS_CMD(adios_schedule_read(
+                    params->fp,
+                    piSel,
+                    (particlePath + std::string("particles_info")).c_str(),
+                    0,
+                    1,
+                    (void*) particlesInfo));
+
+                /* start a blocking read of all scheduled variables */
+                ADIOS_CMD(adios_perform_reads(params->fp, 1));
+                adios_selection_delete(piSel);
+
+                /* Run a prefix sum over the numParticles[0] element in particlesInfo
+                 * to retrieve the offset of particles before gc.getGlobalRank() */
+                uint64_t particleOffset = 0;
+
+                uint64_t fullParticlesInfo[gc.getGlobalSize()];
+
+                // avoid deadlock between unfinished pmacc tasks and blocking MPI collectives
+                __getTransactionEvent().waitForFinished();
+                MPI_CHECK(MPI_Allgather(
+                    particlesInfo,
+                    1,
+                    MPI_UINT64_T,
+                    fullParticlesInfo,
+                    1,
+                    MPI_UINT64_T,
+                    gc.getCommunicator().getMPIComm()));
+
+                for(size_t i = 0; i < gc.getGlobalSize(); ++i)
+                {
+                    /* this comparison is potentially harmful, since the order of ranks
+                       is not necessarily the same in subsequent MPI jobs.
+                       But due to the wrong sorting by rank in `ADIOSCountParticles.hpp`
+                       while calculating the `myParticleOffset` we have to imitate that. */
+                    if(i < gc.getGlobalRank())
+                        particleOffset += fullParticlesInfo[i];
+                    if(i == gc.getGlobalRank())
+                        totalNumParticles = fullParticlesInfo[i];
+                }
+
+                log<picLog::INPUT_OUTPUT>("ADIOS: Loading %1% particles from offset %2%")
+                    % (long long unsigned) totalNumParticles % (long long unsigned) particleOffset;
+
+                AdiosFrameType hostFrame;
+                log<picLog::INPUT_OUTPUT>("ADIOS: malloc mapped memory: %1%") % speciesName;
+                /* malloc mapped memory for every attribute of the ADIOS frame */
+                meta::ForEach<typename AdiosFrameType::ValueTypeSeq, MallocMemory<bmpl::_1>> mallocMem;
+                mallocMem(hostFrame, totalNumParticles);
+
+                log<picLog::INPUT_OUTPUT>("ADIOS: get mapped memory device pointer: %1%") % speciesName;
+                /* load device pointer of mapped memory */
+                AdiosFrameType deviceFrame;
+                meta::ForEach<typename AdiosFrameType::ValueTypeSeq, GetDevicePtr<bmpl::_1>> getDevicePtr;
+                getDevicePtr(deviceFrame, hostFrame);
+
+                meta::ForEach<typename AdiosFrameType::ValueTypeSeq, LoadParticleAttributesFromADIOS<bmpl::_1>>
+                    loadAttributes;
+                loadAttributes(params, hostFrame, particlePath, particleOffset, totalNumParticles);
+
+                if(totalNumParticles != 0)
+                {
+                    pmacc::particles::operations::splitIntoListOfFrames(
+                        *speciesTmp,
+                        deviceFrame,
+                        totalNumParticles,
+                        restartChunkSize,
+                        localDomain.offset,
+                        totalCellIdx_,
+                        *(params->cellDescription),
+                        picLog::INPUT_OUTPUT());
+
+                    /* free host memory (only allocated frames with particles are released here) */
+                    meta::ForEach<typename AdiosFrameType::ValueTypeSeq, FreeMemory<bmpl::_1>> freeMem;
+                    freeMem(hostFrame);
+                }
+                log<picLog::INPUT_OUTPUT>("ADIOS: ( end ) load species: %1%") % speciesName;
+            }
+        };
+
+
+    } /* namespace adios */
 
 } /* namespace picongpu */
diff --git a/include/picongpu/plugins/adios/restart/ReadAttribute.hpp b/include/picongpu/plugins/adios/restart/ReadAttribute.hpp
index b74668467b..e7997c8b45 100644
--- a/include/picongpu/plugins/adios/restart/ReadAttribute.hpp
+++ b/include/picongpu/plugins/adios/restart/ReadAttribute.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2016-2020 Alexander Grund
+/* Copyright 2016-2021 Alexander Grund
  *
  * This file is part of PIConGPU.
  *
@@ -28,45 +28,42 @@
 #include <adios_error.h>
 #include <stdexcept>
 
-namespace picongpu {
-namespace adios {
-
-    /**
-     * Read an attribute from an open ADIOS file, check that its type is correct and return it
-     *
-     * @param fp       Open ADIOS file handle
-     * @param basePath Path where the attribute is located in the file (with or w/o trailing slash)
-     * @param attrName Name of the attribute. Used for status output and concatenated with basePath
-     * @retval Attribute value
-     */
-    template<typename T_Attribute>
-    T_Attribute readAttribute(ADIOS_FILE* fp, const std::string& basePath, const std::string& attrName)
+namespace picongpu
+{
+    namespace adios
     {
-        // Build full path
-        std::string attrPath = basePath;
-        if(!attrPath.empty() && attrPath[attrPath.size() - 1] != '/')
-            attrPath += '/';
-        attrPath += attrName;
-        // Actually read the data
-        enum ADIOS_DATATYPES attrType;
-        int attrSize;
-        T_Attribute* attrValuePtr;
-        ADIOS_CMD( adios_get_attr(fp,
-                                  attrPath.c_str(),
-                                  &attrType,
-                                  &attrSize,
-                                  (void**) &attrValuePtr) );
-        // Sanity checks
-        if(attrType != traits::PICToAdios<T_Attribute>().type)
-            throw std::runtime_error(std::string("Invalid type of ADIOS attribute ") + attrName);
-        if(attrSize != sizeof(T_Attribute))
-            throw std::runtime_error(std::string("Invalid size of ADIOS attribute ") + attrName);
+        /**
+         * Read an attribute from an open ADIOS file, check that its type and size are correct and return it
+         *
+         * @param fp       Open ADIOS file handle
+         * @param basePath Path where the attribute is located in the file (with or w/o trailing slash)
+         * @param attrName Name of the attribute. Used for status output and concatenated with basePath
+         * @return Attribute value (throws std::runtime_error if its type or size does not match T_Attribute)
+         */
+        template<typename T_Attribute>
+        T_Attribute readAttribute(ADIOS_FILE* fp, const std::string& basePath, const std::string& attrName)
+        {
+            // Build full path: basePath, a single separating '/', then attrName
+            std::string attrPath = basePath;
+            if(!attrPath.empty() && attrPath[attrPath.size() - 1] != '/')
+                attrPath += '/';
+            attrPath += attrName;
+            // Actually read the data; adios_get_attr fills attrType, attrSize and attrValuePtr
+            enum ADIOS_DATATYPES attrType;
+            int attrSize;
+            T_Attribute* attrValuePtr;
+            ADIOS_CMD(adios_get_attr(fp, attrPath.c_str(), &attrType, &attrSize, (void**) &attrValuePtr));
+            // Sanity checks: stored type and byte size must match T_Attribute
+            if(attrType != traits::PICToAdios<T_Attribute>().type)
+                throw std::runtime_error(std::string("Invalid type of ADIOS attribute ") + attrName);
+            if(attrSize != sizeof(T_Attribute))
+                throw std::runtime_error(std::string("Invalid size of ADIOS attribute ") + attrName);
 
-        T_Attribute attribute = *attrValuePtr;
-        __delete(attrValuePtr);
-        log<picLog::INPUT_OUTPUT > ("ADIOS: value of %1% = %2%") % attrName % attribute;
-        return attribute;
-    }
+            T_Attribute attribute = *attrValuePtr;
+            __delete(attrValuePtr);
+            log<picLog::INPUT_OUTPUT>("ADIOS: value of %1% = %2%") % attrName % attribute;
+            return attribute;
+        }
 
-}  // namespace adios
-}  // namespace picongpu
+    } // namespace adios
+} // namespace picongpu
diff --git a/include/picongpu/plugins/adios/restart/RestartFieldLoader.hpp b/include/picongpu/plugins/adios/restart/RestartFieldLoader.hpp
index 8cad0b32b9..3f57817b15 100644
--- a/include/picongpu/plugins/adios/restart/RestartFieldLoader.hpp
+++ b/include/picongpu/plugins/adios/restart/RestartFieldLoader.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2014-2020 Axel Huebl, Felix Schmitt, Heiko Burau, Rene Widera
+/* Copyright 2014-2021 Axel Huebl, Felix Schmitt, Heiko Burau, Rene Widera
  *                     Benjamin Worpitz, Sergei Bastrakov
  *
  * This file is part of PIConGPU.
@@ -26,10 +26,12 @@
 #include "picongpu/plugins/misc/ComponentNames.hpp"
 #include "picongpu/traits/IsFieldDomainBound.hpp"
 
+#include <pmacc/communication/manager_common.hpp>
 #include <pmacc/particles/frame_types.hpp>
 #include <pmacc/dataManagement/DataConnector.hpp>
 #include <pmacc/dimensions/DataSpace.hpp>
 #include <pmacc/dimensions/GridLayout.hpp>
+#include <pmacc/Environment.hpp>
 #include "picongpu/simulation/control/MovingWindow.hpp"
 
 #include <adios.h>
@@ -43,187 +45,212 @@
 
 namespace picongpu
 {
-
-namespace adios
-{
-
-/**
- * Helper class for ADIOS plugin to load fields from parallel ADIOS BP files.
- */
-class RestartFieldLoader
-{
-public:
-    template<class Data>
-    static void loadField(
-        Data& field,
-        const uint32_t numComponents,
-        std::string objectName,
-        ThreadParams *params,
-        const bool isDomainBound
-    )
+    namespace adios
     {
-        log<picLog::INPUT_OUTPUT > ("Begin loading field '%1%'") % objectName;
-
-        const auto componentNames = plugins::misc::getComponentNames( numComponents );
-        const DataSpace<simDim> field_guard = field.getGridLayout().getGuard();
-
-        const pmacc::Selection<simDim>& localDomain = Environment<simDim>::get().SubGrid().getLocalDomain();
-
-        using ValueType = typename Data::ValueType;
-        field.getHostBuffer().setValue(ValueType::create(0.0));
-
-        DataSpace<simDim> domain_offset = localDomain.offset;
-        DataSpace<simDim> local_domain_size = params->window.localDimensions.size;
-        bool useLinearIdxAsDestination = false;
-
-        /* Patch for non-domain-bound fields
-        * This is an ugly fix to allow output of reduced 1d PML buffers,
-        * that are the same size on each domain.
-        * This code is to be replaced with the openPMD output plugin soon.
-        */
-        if( !isDomainBound )
-        {
-            auto const field_layout = params->gridLayout;
-            auto const field_no_guard = field_layout.getDataSpaceWithoutGuarding();
-            auto const elementCount = field_no_guard.productOfComponents();
-            auto const & gridController = Environment<simDim>::get().GridController();
-            auto const rank = gridController.getGlobalRank();
-            domain_offset = DataSpace<simDim>::create( 0 );
-            domain_offset[ 0 ] = rank * elementCount;
-            local_domain_size = DataSpace<simDim>::create( 1 );
-            local_domain_size[ 0 ] = elementCount;
-            useLinearIdxAsDestination = true;
-        }
-
-        auto destBox = field.getHostBuffer().getDataBox();
-        for (uint32_t n = 0; n < numComponents; ++n)
+        /**
+         * Helper class for ADIOS plugin to load fields from parallel ADIOS BP files.
+         */
+        class RestartFieldLoader
         {
-            // Read the subdomain which belongs to our mpi position.
-            // The total grid size must match the grid size of the stored data.
-            log<picLog::INPUT_OUTPUT > ("ADIOS: Read from domain: offset=%1% size=%2%") %
-                domain_offset % local_domain_size;
-
-            std::stringstream datasetName;
-            datasetName << params->adiosBasePath << ADIOS_PATH_FIELDS << objectName;
-            if (numComponents > 1)
-                datasetName << "/" << componentNames[n];
-
-            log<picLog::INPUT_OUTPUT > ("ADIOS: Read from field '%1%'") %
-                datasetName.str();
-
-            ADIOS_VARINFO* varInfo = adios_inq_var( params->fp, datasetName.str().c_str() );
-            if( varInfo == nullptr )
+        public:
+            template<class Data>
+            static void loadField(
+                Data& field,
+                const uint32_t numComponents,
+                std::string objectName,
+                ThreadParams* params,
+                const bool isDomainBound)
             {
-                std::string errMsg( adios_errmsg() );
-                if( errMsg.empty() ) errMsg = '\n';
-                std::stringstream s;
-                s << "ADIOS: error at adios_inq_var '"
-                  << "' (" << adios_errno << ") in "
-                  << __FILE__ << ":" << __LINE__ << " " << errMsg;
-                throw std::runtime_error(s.str());
-            }
-            uint64_t start[varInfo->ndim];
-            uint64_t count[varInfo->ndim];
-            for(int d = 0; d < varInfo->ndim; ++d)
-            {
-                /* \see adios_define_var: z,y,x in C-order */
-                start[d] = domain_offset.revert()[d];
-                count[d] = local_domain_size.revert()[d];
-            }
-
-            ADIOS_SELECTION* fSel = adios_selection_boundingbox( varInfo->ndim, start, count );
+                log<picLog::INPUT_OUTPUT>("Begin loading field '%1%'") % objectName;
 
-            /* specify what we want to read, but start reading at below at
-             * `adios_perform_reads` */
-            log<picLog::INPUT_OUTPUT > ("ADIOS: Allocate %1% elements") %
-                local_domain_size.productOfComponents();
+                const auto componentNames = plugins::misc::getComponentNames(numComponents);
+                const DataSpace<simDim> field_guard = field.getGridLayout().getGuard();
 
-            /// \todo float_X should be some kind of gridBuffer's GetComponentsType<ValueType>::type
-            float_X* field_container = new float_X[local_domain_size.productOfComponents()];
-            /* magic parameters (0, 1): `from_step` (not used in streams), `nsteps` to read (must be 1 for stream) */
-            log<picLog::INPUT_OUTPUT > ("ADIOS: Schedule read from field (%1%, %2%, %3%, %4%)") %
-                                        params->fp % fSel % datasetName.str() % (void*)field_container;
+                const pmacc::Selection<simDim> localDomain = Environment<simDim>::get().SubGrid().getLocalDomain();
 
-            // avoid deadlock between not finished pmacc tasks and mpi calls in adios
-            __getTransactionEvent().waitForFinished();
-            ADIOS_CMD(adios_schedule_read( params->fp, fSel, datasetName.str().c_str(), 0, 1, (void*)field_container ));
+                using ValueType = typename Data::ValueType;
+                field.getHostBuffer().setValue(ValueType::create(0.0));
 
-            /* start a blocking read of all scheduled variables */
-            ADIOS_CMD(adios_perform_reads( params->fp, 1 ));
+                DataSpace<simDim> domain_offset = localDomain.offset;
+                DataSpace<simDim> local_domain_size = params->window.localDimensions.size;
+                bool useLinearIdxAsDestination = false;
 
-            int const elementCount = local_domain_size.productOfComponents();
-
-            #pragma omp parallel for
-            for (int linearId = 0; linearId < elementCount; ++linearId)
-            {
-                DataSpace<simDim> destIdx;
-                if( useLinearIdxAsDestination )
+                /* Patch for non-domain-bound fields
+                 * This is an ugly fix to allow output of reduced 1d PML buffers
+                 */
+                if(!isDomainBound)
                 {
-                    destIdx[ 0 ] = linearId;
+                    auto const field_layout = params->gridLayout;
+                    auto const field_no_guard = field_layout.getDataSpaceWithoutGuarding();
+                    auto const elementCount = field_no_guard.productOfComponents();
+
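+                    /* Compute the read offset of the local PML buffer as the sum of the buffer sizes
+                     * of all ranks with a smaller domain position (an exclusive scan over the local
+                     * domains). This code is symmetric to the one in Field::writeField() */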
+                    log<picLog::INPUT_OUTPUT>("ADIOS:  (begin) collect PML sizes for %1%") % objectName;
+                    auto& gridController = Environment<simDim>::get().GridController();
+                    auto const numRanks = uint64_t{gridController.getGlobalSize()};
+                    /* Use domain position-based rank, not MPI rank, to be independent
+                     * of the MPI rank assignment scheme
+                     */
+                    auto const rank = uint64_t{gridController.getScalarPosition()};
+                    std::vector<uint64_t> localSizes(2 * numRanks, 0u);
+                    uint64_t localSizeInfo[2] = {static_cast<uint64_t>(elementCount), rank};
+                    __getTransactionEvent().waitForFinished();
+                    MPI_CHECK(MPI_Allgather(
+                        localSizeInfo,
+                        2,
+                        MPI_UINT64_T,
+                        &(*localSizes.begin()),
+                        2,
+                        MPI_UINT64_T,
+                        gridController.getCommunicator().getMPIComm()));
+                    uint64_t domainOffset = 0;
+                    for(uint64_t r = 0; r < numRanks; ++r)
+                    {
+                        if(localSizes.at(2u * r + 1u) < rank)
+                            domainOffset += localSizes.at(2u * r);
+                    }
+                    log<picLog::INPUT_OUTPUT>("ADIOS:  (end) collect PML sizes for %1%") % objectName;
+
+                    domain_offset = DataSpace<simDim>::create(0);
+                    domain_offset[0] = static_cast<int>(domainOffset);
+                    local_domain_size = DataSpace<simDim>::create(1);
+                    local_domain_size[0] = elementCount;
+                    useLinearIdxAsDestination = true;
                 }
-                else
+
+                auto destBox = field.getHostBuffer().getDataBox();
+                for(uint32_t n = 0; n < numComponents; ++n)
                 {
-                    /* calculate index inside the moving window domain which is located on the local grid*/
-                    destIdx = DataSpaceOperations<simDim>::map(params->window.localDimensions.size, linearId);
-                    /* jump over guard and local sliding window offset*/
-                    destIdx += field_guard + params->localWindowToDomainOffset;
+                    // Read the subdomain which belongs to our mpi position.
+                    // The total grid size must match the grid size of the stored data.
+                    log<picLog::INPUT_OUTPUT>("ADIOS: Read from domain: offset=%1% size=%2%") % domain_offset
+                        % local_domain_size;
+
+                    std::stringstream datasetName;
+                    datasetName << params->adiosBasePath << ADIOS_PATH_FIELDS << objectName;
+                    if(numComponents > 1)
+                        datasetName << "/" << componentNames[n];
+
+                    log<picLog::INPUT_OUTPUT>("ADIOS: Read from field '%1%'") % datasetName.str();
+
+                    ADIOS_VARINFO* varInfo = adios_inq_var(params->fp, datasetName.str().c_str());
+                    if(varInfo == nullptr)
+                    {
+                        std::string errMsg(adios_errmsg());
+                        if(errMsg.empty())
+                            errMsg = '\n';
+                        std::stringstream s; // the message names the dataset whose inquiry failed
+                        s << "ADIOS: error at adios_inq_var '" << datasetName.str() << "' (" << adios_errno
+                          << ") in " << __FILE__ << ":" << __LINE__ << " " << errMsg;
+                        throw std::runtime_error(s.str());
+                    }
+                    // bounding box of the read; std::vector because C++ has no variable-length arrays
+                    std::vector<uint64_t> start(varInfo->ndim);
+                    std::vector<uint64_t> count(varInfo->ndim);
+                    for(int d = 0; d < varInfo->ndim; ++d)
+                    {
+                        /* \see adios_define_var: z,y,x in C-order */
+                        start[d] = domain_offset.revert()[d];
+                        count[d] = local_domain_size.revert()[d];
+                    }
+                    ADIOS_SELECTION* fSel = adios_selection_boundingbox(varInfo->ndim, start.data(), count.data());
+
+                    /* specify what we want to read; the actual read is started below at
+                     * `adios_perform_reads` */
+                    log<picLog::INPUT_OUTPUT>("ADIOS: Allocate %1% elements")
+                        % local_domain_size.productOfComponents();
+
+                    /// \todo float_X should be some kind of gridBuffer's GetComponentsType<ValueType>::type
+                    float_X* field_container = new float_X[local_domain_size.productOfComponents()];
+                    /* magic parameters (0, 1): `from_step` (not used in streams), `nsteps` to read (must be 1 for
+                     * stream) */
+                    log<picLog::INPUT_OUTPUT>("ADIOS: Schedule read from field (%1%, %2%, %3%, %4%)") % params->fp
+                        % fSel % datasetName.str() % (void*) field_container;
+
+                    // avoid deadlock between not finished pmacc tasks and mpi calls in adios
+                    __getTransactionEvent().waitForFinished();
+                    ADIOS_CMD(adios_schedule_read(
+                        params->fp,
+                        fSel,
+                        datasetName.str().c_str(),
+                        0,
+                        1,
+                        (void*) field_container));
+
+                    /* start a blocking read of all scheduled variables */
+                    ADIOS_CMD(adios_perform_reads(params->fp, 1));
+
+                    int const elementCount = local_domain_size.productOfComponents();
+
+#pragma omp parallel for
+                    for(int linearId = 0; linearId < elementCount; ++linearId)
+                    {
+                        DataSpace<simDim> destIdx;
+                        if(useLinearIdxAsDestination)
+                        {
+                            destIdx[0] = linearId;
+                        }
+                        else
+                        {
+                            /* calculate index inside the moving window domain which is located on the local grid*/
+                            destIdx = DataSpaceOperations<simDim>::map(params->window.localDimensions.size, linearId);
+                            /* jump over guard and local sliding window offset*/
+                            destIdx += field_guard + params->localWindowToDomainOffset;
+                        }
+                        destBox(destIdx)[n] = field_container[linearId];
+                    }
+
+                    __deleteArray(field_container);
+                    adios_selection_delete(fSel);
+                    adios_free_varinfo(varInfo);
                 }
-                destBox(destIdx)[n] = field_container[linearId];
-            }
-
-            __deleteArray(field_container);
-            adios_selection_delete(fSel);
-            adios_free_varinfo(varInfo);
-        }
 
-        field.hostToDevice();
+                field.hostToDevice();
 
-        __getTransactionEvent().waitForFinished();
+                __getTransactionEvent().waitForFinished();
 
-        log<picLog::INPUT_OUTPUT > ("ADIOS: Read from domain: offset=%1% size=%2%") %
-            domain_offset % local_domain_size;
-        log<picLog::INPUT_OUTPUT > ("ADIOS: Finished loading field '%1%'") % objectName;
-    }
-
-};
-
-/**
- * Helper class for ADIOSWriter (forEach operator) to load a field from ADIOS
- *
- * @tparam T_Field field class to load
- */
-template< typename T_Field >
-struct LoadFields
-{
-public:
-
-    HDINLINE void operator()(ThreadParams* params)
-    {
+                log<picLog::INPUT_OUTPUT>("ADIOS: Read from domain: offset=%1% size=%2%") % domain_offset
+                    % local_domain_size;
+                log<picLog::INPUT_OUTPUT>("ADIOS: Finished loading field '%1%'") % objectName;
+            }
+        };
+
+        /**
+         * Helper class for ADIOSWriter (forEach operator) to load a field from ADIOS
+         *
+         * @tparam T_Field field class to load
+         */
+        template<typename T_Field>
+        struct LoadFields
+        {
+        public:
+            HDINLINE void operator()(ThreadParams* params)
+            {
 #ifndef __CUDA_ARCH__
-        DataConnector &dc = Environment<>::get().DataConnector();
-        ThreadParams *tp = params;
-
-        /* load field without copying data to host */
-        auto field = dc.get< T_Field >( T_Field::getName(), true );
-        tp->gridLayout = field->getGridLayout();
-
-        /* load from ADIOS */
-        bool const isDomainBound = traits::IsFieldDomainBound< T_Field >::value;
-        RestartFieldLoader::loadField(
-            field->getGridBuffer(),
-            (uint32_t)T_Field::numComponents,
-            T_Field::getName(),
-            tp,
-            isDomainBound
-        );
-
-        dc.releaseData(T_Field::getName());
+                DataConnector& dataConnector = Environment<>::get().DataConnector();
+
+                /* load field without copying data to host */
+                auto fieldPtr = dataConnector.get<T_Field>(T_Field::getName(), true);
+                /* loadField() reads params->gridLayout for fields that are not domain-bound */
+                params->gridLayout = fieldPtr->getGridLayout();
+
+                /* load from ADIOS */
+                const bool isDomainBound = traits::IsFieldDomainBound<T_Field>::value;
+                RestartFieldLoader::loadField(
+                    fieldPtr->getGridBuffer(),
+                    static_cast<uint32_t>(T_Field::numComponents),
+                    T_Field::getName(),
+                    params,
+                    isDomainBound);
+
+                dataConnector.releaseData(T_Field::getName());
 #endif
-    }
-
-};
+            }
+        };
 
-using namespace pmacc;
+        using namespace pmacc;
 
-} /* namespace adios */
+    } /* namespace adios */
 } /* namespace picongpu */
diff --git a/include/picongpu/plugins/adios/writer/ParticleAttribute.hpp b/include/picongpu/plugins/adios/writer/ParticleAttribute.hpp
index 1af0b8f427..3757851bab 100644
--- a/include/picongpu/plugins/adios/writer/ParticleAttribute.hpp
+++ b/include/picongpu/plugins/adios/writer/ParticleAttribute.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2014-2020 Axel Huebl, Felix Schmitt, Heiko Burau, Rene Widera
+/* Copyright 2014-2021 Axel Huebl, Felix Schmitt, Heiko Burau, Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -27,69 +27,60 @@
 
 namespace picongpu
 {
-
-namespace adios
-{
-using namespace pmacc;
-
-/** write attribute of a particle to adios file
- *
- * @tparam T_Identifier identifier of a particle attribute
- */
-template< typename T_Identifier>
-struct ParticleAttribute
-{
-
-    /** write attribute to adios file
-     *
-     * @param params wrapped params
-     * @param elements elements of this attribute
-     */
-    template<typename FrameType>
-    HINLINE void operator()(
-                            ThreadParams* params,
-                            FrameType& frame,
-                            const size_t elements)
+    namespace adios
     {
+        using namespace pmacc;
+
+        /** write attribute of a particle to adios file
+         *
+         * @tparam T_Identifier identifier of a particle attribute
+         */
+        template<typename T_Identifier>
+        struct ParticleAttribute
+        {
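+            /** write attribute to adios file, one component after the other
+             * @param params wrapped params; one variable id is consumed per component
+             * @param frame frame whose attribute data is written
+             * @param elements number of elements of this attribute
+             */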
+            template<typename FrameType>
+            HINLINE void operator()(ThreadParams* params, FrameType& frame, const size_t elements)
+            {
+                typedef T_Identifier Identifier;
+                typedef typename pmacc::traits::Resolve<Identifier>::type::type ValueType;
+                const uint32_t components = GetNComponents<ValueType>::value;
+                typedef typename GetComponentsType<ValueType>::type ComponentType;
 
-        typedef T_Identifier Identifier;
-        typedef typename pmacc::traits::Resolve<Identifier>::type::type ValueType;
-        const uint32_t components = GetNComponents<ValueType>::value;
-        typedef typename GetComponentsType<ValueType>::type ComponentType;
-
-        log<picLog::INPUT_OUTPUT > ("ADIOS:  (begin) write species attribute: %1%") % Identifier::getName();
-
-        ComponentType* tmpBfr = nullptr;
-
-        if (elements > 0)
-            tmpBfr = new ComponentType[elements];
+                log<picLog::INPUT_OUTPUT>("ADIOS:  (begin) write species attribute: %1%") % Identifier::getName();
 
-        for (uint32_t d = 0; d < components; d++)
-        {
-            ValueType* dataPtr = frame.getIdentifier(Identifier()).getPointer();
+                ComponentType* tmpBfr = nullptr;
 
-            /* copy strided data from source to temporary buffer */
-            #pragma omp parallel for
-            for (size_t i = 0; i < elements; ++i)
-            {
-                tmpBfr[i] = ((ComponentType*) dataPtr)[d + i * components];
-            }
+                if(elements > 0)
+                    tmpBfr = new ComponentType[elements];
 
-            int64_t adiosAttributeVarId = *(params->adiosParticleAttrVarIds.begin());
-            params->adiosParticleAttrVarIds.pop_front();
+                for(uint32_t d = 0; d < components; d++)
+                {
+                    ValueType* dataPtr = frame.getIdentifier(Identifier()).getPointer();
 
-            ADIOS_CMD(adios_write_byid(params->adiosFileHandle, adiosAttributeVarId, tmpBfr));
-        }
+/* copy strided data from source to temporary buffer */
+#pragma omp parallel for
+                    for(size_t i = 0; i < elements; ++i)
+                    {
+                        tmpBfr[i] = ((ComponentType*) dataPtr)[d + i * components];
+                    }
 
-        __deleteArray(tmpBfr);
+                    int64_t adiosAttributeVarId = *(params->adiosParticleAttrVarIds.begin());
+                    params->adiosParticleAttrVarIds.pop_front();
 
-        log<picLog::INPUT_OUTPUT > ("ADIOS:  ( end ) write species attribute: %1%") %
-            Identifier::getName();
-    }
+                    ADIOS_CMD(adios_write_byid(params->adiosFileHandle, adiosAttributeVarId, tmpBfr));
+                }
 
-};
+                __deleteArray(tmpBfr);
 
-} //namspace adios
+                log<picLog::INPUT_OUTPUT>("ADIOS:  ( end ) write species attribute: %1%") % Identifier::getName();
+            }
+        };
 
-} //namespace picongpu
+    } // namespace adios
 
+} // namespace picongpu
diff --git a/include/picongpu/plugins/adios/writer/ParticleAttributeSize.hpp b/include/picongpu/plugins/adios/writer/ParticleAttributeSize.hpp
index b62bb1f816..63a448300e 100644
--- a/include/picongpu/plugins/adios/writer/ParticleAttributeSize.hpp
+++ b/include/picongpu/plugins/adios/writer/ParticleAttributeSize.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2014-2020 Felix Schmitt, Axel Huebl
+/* Copyright 2014-2021 Felix Schmitt, Axel Huebl
  *
  * This file is part of PIConGPU.
  *
@@ -31,119 +31,133 @@
 
 namespace picongpu
 {
-
-namespace adios
-{
-using namespace pmacc;
-
-
-
-/** collect size of a particle attribute
- *
- * @tparam T_Identifier identifier of a particle attribute
- */
-template< typename T_Identifier>
-struct ParticleAttributeSize
-{
-    /** collect size of attribute
-     *
-     * @param params wrapped params
-     * @param elements number of particles for this attribute
-     */
-    HINLINE void operator()(
-                            ThreadParams* params,
-                            const std::string speciesGroup,
-                            const uint64_t elements,
-                            const uint64_t globalElements,
-                            const uint64_t globalOffset)
+    namespace adios
     {
+        using namespace pmacc;
 
-        typedef T_Identifier Identifier;
-        typedef typename pmacc::traits::Resolve<Identifier>::type::type ValueType;
-        const uint32_t components = GetNComponents<ValueType>::value;
-        typedef typename GetComponentsType<ValueType>::type ComponentType;
 
-        params->adiosGroupSize += elements * components * sizeof(ComponentType);
-
-        /* define adios var for particle attribute */
-        PICToAdios<ComponentType> adiosType;
-        PICToAdios<float_X> adiosFloatXType;
-        PICToAdios<float_64> adiosDoubleType;
-        PICToAdios<uint32_t> adiosUInt32Type;
-
-        const auto componentNames = plugins::misc::getComponentNames( components );
-
-        OpenPMDName<T_Identifier> openPMDName;
-        const std::string recordPath( params->adiosBasePath +
-            std::string(ADIOS_PATH_PARTICLES) + speciesGroup + openPMDName() );
-
-        // get the SI scaling, dimensionality and weighting of the attribute
-        OpenPMDUnit<T_Identifier> openPMDUnit;
-        std::vector<float_64> unit = openPMDUnit();
-        OpenPMDUnitDimension<T_Identifier> openPMDUnitDimension;
-        std::vector<float_64> unitDimension = openPMDUnitDimension();
-        const bool macroWeightedBool = MacroWeighted<T_Identifier>::get();
-        const uint32_t macroWeighted = (macroWeightedBool ? 1 : 0);
-        const float_64 weightingPower = WeightingPower<T_Identifier>::get();
-
-        PMACC_ASSERT(unit.size() == components); // unitSI for each component
-        PMACC_ASSERT(unitDimension.size() == 7); // seven openPMD base units
-
-        for (uint32_t d = 0; d < components; d++)
+        /** collect size of a particle attribute
+         *
+         * @tparam T_Identifier identifier of a particle attribute
+         */
+        template<typename T_Identifier>
+        struct ParticleAttributeSize
         {
-            std::stringstream datasetName;
-            datasetName << recordPath;
-            if (components > 1)
-                datasetName << "/" << componentNames[d];
-
-            const char* path = nullptr;
-            int64_t adiosParticleAttrId = defineAdiosVar<DIM1>(
-                params->adiosGroupHandle,
-                datasetName.str().c_str(),
-                path,
-                adiosType.type,
-                pmacc::math::UInt64<DIM1>(elements),
-                pmacc::math::UInt64<DIM1>(globalElements),
-                pmacc::math::UInt64<DIM1>(globalOffset),
-                true,
-                params->adiosCompression);
-
-            params->adiosParticleAttrVarIds.push_back(adiosParticleAttrId);
-
-            /* already add the unitSI and further attribute so `adios_group_size`
-             * calculates the reservation for the buffer correctly */
-
-            /* check if this attribute actually has a unit (unit.size() == 0 is no unit) */
-            if (unit.size() >= (d + 1))
-                ADIOS_CMD(adios_define_attribute_byvalue(params->adiosGroupHandle,
-                          "unitSI", datasetName.str().c_str(),
-                          adiosDoubleType.type, 1, &unit.at(d) ));
-        }
-
-        ADIOS_CMD(adios_define_attribute_byvalue(params->adiosGroupHandle,
-            "unitDimension", recordPath.c_str(),
-            adiosDoubleType.type, 7, &(*unitDimension.begin()) ));
-
-        ADIOS_CMD(adios_define_attribute_byvalue(params->adiosGroupHandle,
-            "macroWeighted", recordPath.c_str(),
-            adiosUInt32Type.type, 1, (void*)&macroWeighted ));
-
-        ADIOS_CMD(adios_define_attribute_byvalue(params->adiosGroupHandle,
-            "weightingPower", recordPath.c_str(),
-            adiosDoubleType.type, 1, (void*)&weightingPower ));
-
-        /** \todo check if always correct at this point, depends on attribute
-         *        and MW-solver/pusher implementation */
-        const float_X timeOffset = 0.0;
-        ADIOS_CMD(adios_define_attribute_byvalue(params->adiosGroupHandle,
-            "timeOffset", recordPath.c_str(),
-            adiosFloatXType.type, 1, (void*)&timeOffset ));
-
-    }
-
-};
-
-} //namspace adios
-
-} //namespace picongpu
-
+            /** collect size of attribute and define its ADIOS variable(s) and openPMD attributes
+             * @param params wrapped params; group size and variable id list are updated
+             * @param speciesGroup path part of the species, placed in front of the openPMD name
+             * @param elements number of particles for this attribute
+             * @param globalElements,globalOffset global size and offset handed to defineAdiosVar */
+            HINLINE void operator()(
+                ThreadParams* params,
+                const std::string speciesGroup,
+                const uint64_t elements,
+                const uint64_t globalElements,
+                const uint64_t globalOffset)
+            {
+                typedef T_Identifier Identifier;
+                typedef typename pmacc::traits::Resolve<Identifier>::type::type ValueType;
+                const uint32_t components = GetNComponents<ValueType>::value;
+                typedef typename GetComponentsType<ValueType>::type ComponentType;
+
+                params->adiosGroupSize += elements * components * sizeof(ComponentType);
+
+                /* adios type descriptors for the variable and for its attributes */
+                PICToAdios<ComponentType> adiosType;
+                PICToAdios<float_X> adiosFloatXType;
+                PICToAdios<float_64> adiosDoubleType;
+                PICToAdios<uint32_t> adiosUInt32Type;
+
+                const auto componentNames = plugins::misc::getComponentNames(components);
+
+                OpenPMDName<T_Identifier> openPMDName;
+                const std::string recordPath(
+                    params->adiosBasePath + std::string(ADIOS_PATH_PARTICLES) + speciesGroup + openPMDName());
+
+                // get the SI scaling, dimensionality and weighting of the attribute
+                OpenPMDUnit<T_Identifier> openPMDUnit;
+                std::vector<float_64> unit = openPMDUnit();
+                OpenPMDUnitDimension<T_Identifier> openPMDUnitDimension;
+                std::vector<float_64> unitDimension = openPMDUnitDimension();
+                const bool macroWeightedBool = MacroWeighted<T_Identifier>::get();
+                const uint32_t macroWeighted = (macroWeightedBool ? 1 : 0);
+                const float_64 weightingPower = WeightingPower<T_Identifier>::get();
+
+                PMACC_ASSERT(unit.size() == components); // unitSI for each component
+                PMACC_ASSERT(unitDimension.size() == 7); // seven openPMD base units
+
+                for(uint32_t d = 0; d < components; d++)
+                {
+                    std::stringstream datasetName;
+                    datasetName << recordPath;
+                    if(components > 1)
+                        datasetName << "/" << componentNames[d];
+
+                    const char* path = nullptr;
+                    int64_t adiosParticleAttrId = defineAdiosVar<DIM1>(
+                        params->adiosGroupHandle,
+                        datasetName.str().c_str(),
+                        path,
+                        adiosType.type,
+                        pmacc::math::UInt64<DIM1>(elements),
+                        pmacc::math::UInt64<DIM1>(globalElements),
+                        pmacc::math::UInt64<DIM1>(globalOffset),
+                        true,
+                        params->adiosCompression);
+
+                    params->adiosParticleAttrVarIds.push_back(adiosParticleAttrId);
+
+                    /* already add the unitSI and further attribute so `adios_group_size`
+                     * calculates the reservation for the buffer correctly */
+
+                    /* check if this attribute actually has a unit (unit.size() == 0 is no unit) */
+                    if(unit.size() >= (d + 1))
+                        ADIOS_CMD(adios_define_attribute_byvalue(
+                            params->adiosGroupHandle,
+                            "unitSI",
+                            datasetName.str().c_str(),
+                            adiosDoubleType.type,
+                            1,
+                            &unit.at(d)));
+                }
+
+                ADIOS_CMD(adios_define_attribute_byvalue(
+                    params->adiosGroupHandle,
+                    "unitDimension",
+                    recordPath.c_str(),
+                    adiosDoubleType.type,
+                    7,
+                    &(*unitDimension.begin())));
+
+                ADIOS_CMD(adios_define_attribute_byvalue(
+                    params->adiosGroupHandle,
+                    "macroWeighted",
+                    recordPath.c_str(),
+                    adiosUInt32Type.type,
+                    1,
+                    (void*) &macroWeighted));
+
+                ADIOS_CMD(adios_define_attribute_byvalue(
+                    params->adiosGroupHandle,
+                    "weightingPower",
+                    recordPath.c_str(),
+                    adiosDoubleType.type,
+                    1,
+                    (void*) &weightingPower));
+
+                /** \todo check if always correct at this point, depends on attribute
+                 *        and MW-solver/pusher implementation */
+                const float_X timeOffset = 0.0;
+                ADIOS_CMD(adios_define_attribute_byvalue(
+                    params->adiosGroupHandle,
+                    "timeOffset",
+                    recordPath.c_str(),
+                    adiosFloatXType.type,
+                    1,
+                    (void*) &timeOffset));
+            }
+        };
+
+    } // namespace adios
+
+} // namespace picongpu
diff --git a/include/picongpu/plugins/common/particlePatches.cpp b/include/picongpu/plugins/common/particlePatches.cpp
deleted file mode 100644
index 0248204ecd..0000000000
--- a/include/picongpu/plugins/common/particlePatches.cpp
+++ /dev/null
@@ -1,94 +0,0 @@
-/* Copyright 2016-2020 Axel Huebl
- *
- * This file is part of PIConGPU.
- *
- * PIConGPU is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * PIConGPU is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with PIConGPU.
- * If not, see <http://www.gnu.org/licenses/>.
- */
-
-#include "picongpu/plugins/common/particlePatches.hpp"
-
-
-namespace picongpu
-{
-namespace openPMD
-{
-
-    ParticlePatches::ParticlePatches( const size_t n )
-    {
-        /* zero particles */
-        numParticles = std::vector<uint64_t>( n, 0u );
-        numParticlesOffset = std::vector<uint64_t>( n, 0u );
-
-        /* zero offsets */
-        offsetX = std::vector<uint64_t>( n, 0u );
-        offsetY = std::vector<uint64_t>( n, 0u );
-        offsetZ = std::vector<uint64_t>( n, 0u );
-
-        /* zero extents */
-        extentX = std::vector<uint64_t>( n, 0u );
-        extentY = std::vector<uint64_t>( n, 0u );
-        extentZ = std::vector<uint64_t>( n, 0u );
-    }
-
-    uint64_t* ParticlePatches::getOffsetComp( const uint32_t comp )
-    {
-        if( comp == 0 )
-            return &(*offsetX.begin());
-        if( comp == 1 )
-            return &(*offsetY.begin());
-        if( comp == 2 )
-            return &(*offsetZ.begin());
-
-        return nullptr;
-    }
-
-    uint64_t* ParticlePatches::getExtentComp( const uint32_t comp )
-    {
-        if( comp == 0 )
-            return &(*extentX.begin());
-        if( comp == 1 )
-            return &(*extentY.begin());
-        if( comp == 2 )
-            return &(*extentZ.begin());
-
-        return nullptr;
-    }
-
-    size_t ParticlePatches::size() const
-    {
-        return numParticles.size();
-    }
-
-    void ParticlePatches::print()
-    {
-        std::cout << "id | numParticles numParticlesOffset "
-                  << "offsetX offsetY offsetZ extentX extentY extentZ"
-                  << std::endl;
-        for( size_t i = 0; i < this->size(); ++i )
-        {
-            std::cout << i << " | "
-                      << numParticles.at(i) << " "
-                      << numParticlesOffset.at(i) << " "
-                      << offsetX.at(i) << " "
-                      << offsetY.at(i) << " "
-                      << offsetZ.at(i) << " "
-                      << extentX.at(i) << " "
-                      << extentY.at(i) << " "
-                      << extentZ.at(i) << std::endl;
-        }
-    }
-
-} // namespace openPMD
-} // namespace picongpu
diff --git a/include/picongpu/plugins/common/particlePatches.hpp b/include/picongpu/plugins/common/particlePatches.hpp
deleted file mode 100644
index be3fb115fc..0000000000
--- a/include/picongpu/plugins/common/particlePatches.hpp
+++ /dev/null
@@ -1,100 +0,0 @@
-/* Copyright 2016-2020 Axel Huebl
- *
- * This file is part of PIConGPU.
- *
- * PIConGPU is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * PIConGPU is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with PIConGPU.
- * If not, see <http://www.gnu.org/licenses/>.
- */
-
-#pragma once
-
-#include <vector>
-#include <list>
-#include <iostream>
-#include <cstdint>
-
-namespace picongpu
-{
-namespace openPMD
-{
-
-    /** Struct for a list of particle patches
-     *
-     * Object for all particle patches.
-     * @see https://github.com/openPMD/openPMD-standard/blob/1.0.0/STANDARD.md#sub-group-for-each-particle-species
-     */
-    class ParticlePatches
-    {
-    private:
-        /** Disallow (empty) default contructor
-         */
-        ParticlePatches ();
-
-    public:
-        std::vector<uint64_t> numParticles;
-        std::vector<uint64_t> numParticlesOffset;
-
-        std::vector<uint64_t> offsetX;
-        std::vector<uint64_t> offsetY;
-        std::vector<uint64_t> offsetZ;
-
-        std::vector<uint64_t> extentX;
-        std::vector<uint64_t> extentY;
-        std::vector<uint64_t> extentZ;
-
-        /** Fill-Constructor with n empty-sized patches
-         *
-         * @param n number of patches to store
-         */
-        ParticlePatches( const size_t n );
-
-        /** Return the beginning of one of the components of the
-         *  offset as pointer
-         *
-         * Be aware that the pointer is pointing to the beginning
-         * of a C-array of size `size()` and is only allocated as long
-         * as the `ParticlePatches` object is alive.
-         *
-         * @param comp component (0=x, 1=y, 2=z) of offset array
-         *             for the list of patches
-         * @return uint64_t* pointing to the beginning of a c-array
-         *                   with length as given in size()
-         */
-        uint64_t* getOffsetComp( const uint32_t comp );
-
-        /** Return the beginning of one of the components of the
-         *  extent as pointer
-         *
-         * Be aware that the pointer is pointing to the beginning
-         * of a C-array of size `size()` and is only allocated as long
-         * as the `ParticlePatches` object is alive.
-         *
-         * @param comp component (0=x, 1=y, 2=z) of extent array
-         *             for the list of patches
-         * @return uint64_t* pointing to the beginning of a c-array
-         *                   with length as given in size()
-         */
-        uint64_t* getExtentComp( const uint32_t comp );
-
-        /** Returns the number of patches
-         */
-        size_t size() const;
-
-        /** Helper function printing to std::cout
-         */
-        void print();
-    };
-
-} // namespace openPMD
-} // namespace picongpu
diff --git a/include/picongpu/plugins/common/stringHelpers.cpp b/include/picongpu/plugins/common/stringHelpers.cpp
index 0041cc46cb..d4694597e9 100644
--- a/include/picongpu/plugins/common/stringHelpers.cpp
+++ b/include/picongpu/plugins/common/stringHelpers.cpp
@@ -1,4 +1,4 @@
-/* Copyright 2015-2020 Axel Huebl
+/* Copyright 2015-2021 Axel Huebl
  *
  * This file is part of PIConGPU.
  *
@@ -22,119 +22,87 @@
 
 namespace picongpu
 {
-namespace helper
-{
-    /** Return the current date as string
-     *
-     * \param format, \see http://www.cplusplus.com/reference/ctime/strftime/
-     * \return std::string with formatted date
-     */
-    std::string getDateString( std::string format )
-    {
-        time_t rawtime;
-        struct tm* timeinfo;
-        const size_t maxLen = 30;
-        char buffer [maxLen];
-
-        time( &rawtime );
-        timeinfo = localtime( &rawtime );
-
-        strftime( buffer, maxLen, format.c_str(), timeinfo );
-
-        std::stringstream dateString;
-        dateString << buffer;
-
-        return dateString.str();
-    }
-
-    GetSplashArrayOfString::Result
-    GetSplashArrayOfString::operator()(
-        std::list<std::string> listOfStrings,
-        char padding
-    )
+    namespace helper
     {
+        /** Return the current date as string
+         *
+         * \param format strftime format string, \see http://www.cplusplus.com/reference/ctime/strftime/
+         * \return std::string with formatted date; empty if the result does not fit into 29 characters
+         */
+        std::string getDateString(std::string format)
+        {
+            const size_t maxLen = 30;
+            char buffer[maxLen];
+
+            time_t rawtime;
+            time(&rawtime);
+            // localtime() hands out a pointer to shared static storage and may fail with a null pointer
+            const struct tm* timeinfo = localtime(&rawtime);
+            if(timeinfo == nullptr)
+                return std::string();
+
+            // strftime() returns 0 and leaves the buffer indeterminate if the result does not fit
+            const size_t len = strftime(buffer, maxLen, format.c_str(), timeinfo);
+
+            return std::string(buffer, len);
+        }
+
+        GetSplashArrayOfString::Result GetSplashArrayOfString::operator()(
+            std::list<std::string> listOfStrings,
+            char padding)
+        {
             Result result;
 
             // find length of longest string in list
             CompStrBySize compStrBySize;
-            std::string longestString = *std::max_element(
-                    listOfStrings.begin(),
-                    listOfStrings.end(),
-                    compStrBySize
-            );
+            std::string longestString = *std::max_element(listOfStrings.begin(), listOfStrings.end(), compStrBySize);
             result.maxLen = longestString.size();
 
             // allocate & prepare buffer with padding
             //   size per buffer must include terminator \0 !
             const size_t bytesPerEntry = result.maxLen + 1;
             const size_t lenAllBuffers = listOfStrings.size() * bytesPerEntry;
-            result.buffers.assign( lenAllBuffers, padding );
+            result.buffers.assign(lenAllBuffers, padding);
 
             // copy buffers
             std::list<std::string>::iterator listIt = listOfStrings.begin();
-            for(
-                size_t i = 0;
-                i < listOfStrings.size();
-                ++i, ++listIt
-            )
+            for(size_t i = 0; i < listOfStrings.size(); ++i, ++listIt)
             {
                 // index points to each part of the buffer individually
                 const size_t startIdx = i * bytesPerEntry;
-                std::vector<char>::iterator startIt =
-                    result.buffers.begin() + startIdx;
+                std::vector<char>::iterator startIt = result.buffers.begin() + startIdx;
 
                 // copy byte-wise onto padding
-                std::copy(
-                    listIt->begin(),
-                    listIt->end(),
-                    startIt
-                );
-                if( padding != '\0' )
-                    result.buffers.at( startIdx + result.maxLen ) = '\0';
+                std::copy(listIt->begin(), listIt->end(), startIt);
+                if(padding != '\0')
+                    result.buffers.at(startIdx + result.maxLen) = '\0';
             }
 
             // return
             return result;
-    }
+        }
 
-    GetADIOSArrayOfString::Result
-    GetADIOSArrayOfString::operator()(
-        std::list<std::string> listOfStrings
-    )
-    {
+        GetADIOSArrayOfString::Result GetADIOSArrayOfString::operator()(std::list<std::string> listOfStrings)
+        {
             Result result;
 
             // sum of all strings + their null terminators
             StrSize strSize;
-            const size_t sumLen = std::accumulate(
-                listOfStrings.begin(),
-                listOfStrings.end(),
-                0u,
-                strSize
-            );
+            const size_t sumLen = std::accumulate(listOfStrings.begin(), listOfStrings.end(), size_t(0), strSize);
 
             // allocate & prepare buffer, starts
-            result.buffers.assign( sumLen, '\0' );
-            result.starts.assign( listOfStrings.size(), nullptr );
+            result.buffers.assign(sumLen, '\0');
+            result.starts.assign(listOfStrings.size(), nullptr);
 
             // concat all strings, \0 terminated
             size_t startIdx = 0;
             std::list<std::string>::iterator listIt = listOfStrings.begin();
-            for(
-                size_t i = 0;
-                i < listOfStrings.size();
-                ++i, ++listIt
-            )
+            for(size_t i = 0; i < listOfStrings.size(); ++i, ++listIt)
             {
-                std::vector<char>::iterator startIt =
-                    result.buffers.begin() + startIdx;
+                std::vector<char>::iterator startIt = result.buffers.begin() + startIdx;
 
                 // copy byte-wise onto padding
-                std::copy(
-                    listIt->begin(),
-                    listIt->end(),
-                    startIt
-                );
+                std::copy(listIt->begin(), listIt->end(), startIt);
 
                 // start pointer
                 result.starts.at(i) = &(*startIt);
@@ -144,6 +112,6 @@ namespace helper
 
             // return
             return result;
-    }
-} // namespace helper
+        }
+    } // namespace helper
 } // namespace picongpu
diff --git a/include/picongpu/plugins/common/stringHelpers.hpp b/include/picongpu/plugins/common/stringHelpers.hpp
index 8eb681b90b..b70a9e39c2 100644
--- a/include/picongpu/plugins/common/stringHelpers.hpp
+++ b/include/picongpu/plugins/common/stringHelpers.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2015-2020 Axel Huebl
+/* Copyright 2015-2021 Axel Huebl
  *
  * This file is part of PIConGPU.
  *
@@ -30,83 +30,80 @@
 
 namespace picongpu
 {
-namespace helper
-{
-    /** Return the current date as string
-     *
-     * \param format, \see http://www.cplusplus.com/reference/ctime/strftime/
-     * \return std::string with formatted date
-     */
-    std::string getDateString( std::string format );
-
-    /** Create array of c-strings suitable for libSplash
-     *
-     * Convert a std::list of strings to a format that is suitable to
-     * be written into libSplash (concated and padded array of constant
-     * c-strings). Strings will be padded to longest string.
-     *
-     * Independent of the padding you chose, the strings will be '\0'
-     * separated & terminated. \0 padding is default and recommended.
-     */
-    class GetSplashArrayOfString
+    namespace helper
     {
-    private:
-        // compare two std::string by their size
-        struct CompStrBySize
+        /** Return the current date as string
+         *
+         * \param format, \see http://www.cplusplus.com/reference/ctime/strftime/
+         * \return std::string with formatted date
+         */
+        std::string getDateString(std::string format);
+
+        /** Create array of c-strings suitable for libSplash
+         *
+         * Convert a std::list of strings to a format that is suitable to
+         * be written into libSplash (concatenated and padded array of constant
+         * c-strings). Strings will be padded to the length of the longest string.
+         *
+         * Independent of the padding you choose, the strings will be '\0'
+         * separated & terminated. \0 padding is the default and recommended.
+         */
+        class GetSplashArrayOfString
         {
-            bool operator()( std::string i, std::string j )
+        private:
+            // compare two std::string by their size
+            struct CompStrBySize
             {
-                return i.size() < j.size();
-            }
-        };
+                bool operator()(const std::string& i, const std::string& j) const // no copies per comparison
+                {
+                    return i.size() < j.size(); // strict "shorter than" ordering
+                }
+            };
 
-    public:
-        // resulting type containing all attributes for a libSplash write call
-        struct Result
-        {
-            size_t maxLen;                // size of the longest string
-            std::vector<char> buffers;    // all of same length lenMax
+        public:
+            // resulting type containing all attributes for a libSplash write call
+            struct Result
+            {
+                size_t maxLen; // size of the longest string (without terminator)
+                std::vector<char> buffers; // concatenated entries, each maxLen + 1 bytes
 
-            Result() : maxLen(0)
-            {}
-        };
+                Result() : maxLen(0)
+                {
+                }
+            };
 
-        Result operator()(
-            std::list<std::string> listOfStrings,
-            char padding = '\0'
-        );
-    };
+            Result operator()(std::list<std::string> listOfStrings, char padding = '\0');
+        };
 
-    /** Create array of c-strings suitable for ADIOS
-     *
-     * Convert a std::list of strings to a format that is suitable to
-     * be written into ADIOS (`char *strings[]`).
-     */
-    class GetADIOSArrayOfString
-    {
-    private:
-        // accumulate the size of a string + \0 to an initial value
-        struct StrSize
+        /** Create array of c-strings suitable for ADIOS
+         *
+         * Convert a std::list of strings to a format that is suitable to
+         * be written into ADIOS (`char *strings[]`).
+         */
+        class GetADIOSArrayOfString
         {
-            size_t operator()( size_t init, std::string s )
+        private:
+            // accumulate the size of a string + \0 to an initial value
+            struct StrSize
             {
-                return init +     // previous length
-                       s.size() + // this strings length
-                       1;         // this strings null terminator
-           }
-        };
-    public:
-        // resulting type containing all attributes for a ADIOS write call
-        struct Result
-        {
-            std::vector<char> buffers;
-            std::vector<char*> starts;
-        };
+                size_t operator()(size_t init, const std::string& s) const
+                {
+                    return init + // previous length
+                        s.size() + // this string's length
+                        1; // this string's null terminator
+                }
+            };
 
-        Result operator()(
-            std::list<std::string> listOfStrings
-        );
-    };
+        public:
+            // resulting type containing all attributes for an ADIOS write call
+            struct Result
+            {
+                std::vector<char> buffers; // all strings back to back, each '\0' terminated
+                std::vector<char*> starts; // start of each string inside `buffers`
+            };
+
+            Result operator()(std::list<std::string> listOfStrings);
+        };
 
-} // namespace helper
+    } // namespace helper
 } // namespace picongpu
diff --git a/include/picongpu/plugins/common/txtFileHandling.hpp b/include/picongpu/plugins/common/txtFileHandling.hpp
index d9f745bdfc..1c2e80a2cb 100644
--- a/include/picongpu/plugins/common/txtFileHandling.hpp
+++ b/include/picongpu/plugins/common/txtFileHandling.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2015-2020 Axel Huebl, Richard Pausch
+/* Copyright 2015-2021 Axel Huebl, Richard Pausch
  *
  * This file is part of PIConGPU.
  *
@@ -28,7 +28,7 @@
 
 namespace picongpu
 {
-using namespace boost::filesystem;
+    using namespace boost::filesystem;
 
     /** Restore a txt file from the checkpoint dir
      *
@@ -43,41 +43,41 @@ using namespace boost::filesystem;
      *
      * \return operation was successful or not
      */
-    HINLINE bool restoreTxtFile( std::ofstream& outFile, std::string filename,
-                         uint32_t restartStep, const std::string restartDirectory )
+    HINLINE bool restoreTxtFile(
+        std::ofstream& outFile,
+        std::string filename,
+        uint32_t restartStep,
+        const std::string restartDirectory)
     {
         /* get restart time step as string */
         std::stringstream sStep;
         sStep << restartStep;
 
         /* set location of restart file and output file */
-        path src( restartDirectory + std::string("/") + filename +
-                  std::string(".") + sStep.str() );
-        path dst( filename );
+        path src(restartDirectory + std::string("/") + filename + std::string(".") + sStep.str());
+        path dst(filename);
 
         /* check whether restart file exists */
-        if( !boost::filesystem::exists( src ) )
+        if(!boost::filesystem::exists(src))
         {
             /* restart file does not exists */
-            log<picLog::INPUT_OUTPUT> ("Plugin restart file: %1% was not found. \
-                                       --> Starting plugin from current time step.") % src;
+            log<picLog::INPUT_OUTPUT>("Plugin restart file: %1% was not found. "
+                                      "--> Starting plugin from current time step.")
+                % src; // adjacent literals: a backslash continuation would embed the indentation in the message
             return true;
         }
         else
         {
             /* restart file found - fix output file created at restart */
-            if( outFile.is_open() )
+            if(outFile.is_open())
                 outFile.close();
 
-            copy_file( src,
-                       dst,
-                       copy_option::overwrite_if_exists );
+            copy_file(src, dst, copy_option::overwrite_if_exists);
 
-            outFile.open( filename.c_str(), std::ofstream::out | std::ostream::app );
-            if( !outFile )
+            outFile.open(filename.c_str(), std::ofstream::out | std::ostream::app);
+            if(!outFile)
             {
-                std::cerr << "[Plugin] Can't open file '" << filename
-                          << "', output disabled" << std::endl;
+                std::cerr << "[Plugin] Can't open file '" << filename << "', output disabled" << std::endl;
                 return false;
             }
             return true;
@@ -93,21 +93,21 @@ using namespace boost::filesystem;
      * \param currentStep the current time step
      * \param checkpointDirectory path to the checkpoint directory
      */
-    HINLINE void checkpointTxtFile( std::ofstream& outFile, std::string filename,
-                            uint32_t currentStep, const std::string checkpointDirectory )
+    HINLINE void checkpointTxtFile(
+        std::ofstream& outFile,
+        std::string filename,
+        uint32_t currentStep,
+        const std::string checkpointDirectory)
     {
         outFile.flush();
 
         std::stringstream sStep;
         sStep << currentStep;
 
-        path src( filename );
-        path dst( checkpointDirectory + std::string("/") + filename +
-        std::string(".") + sStep.str() );
+        path src(filename);
+        path dst(checkpointDirectory + std::string("/") + filename + std::string(".") + sStep.str());
 
-        copy_file( src,
-                   dst,
-                   copy_option::overwrite_if_exists );
+        copy_file(src, dst, copy_option::overwrite_if_exists);
     }
 
 } /* namespace picongpu */
diff --git a/include/picongpu/plugins/hdf5/HDF5Writer.def b/include/picongpu/plugins/hdf5/HDF5Writer.def
deleted file mode 100644
index df5aba9a55..0000000000
--- a/include/picongpu/plugins/hdf5/HDF5Writer.def
+++ /dev/null
@@ -1,84 +0,0 @@
-/* Copyright 2013-2020 Axel Huebl, Felix Schmitt, Heiko Burau, Rene Widera
- *
- * This file is part of PIConGPU.
- *
- * PIConGPU is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * PIConGPU is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with PIConGPU.
- * If not, see <http://www.gnu.org/licenses/>.
- */
-
-
-
-#pragma once
-
-#include "picongpu/simulation_types.hpp"
-#include <pmacc/particles/frame_types.hpp>
-#include "picongpu/simulation/control/MovingWindow.hpp"
-#include <splash/splash.h>
-
-
-namespace picongpu
-{
-
-namespace hdf5
-{
-using namespace pmacc;
-
-using namespace splash;
-
-
-namespace po = boost::program_options;
-
-struct ThreadParams
-{
-    /* set at least the pointers to nullptr by default */
-    ThreadParams() :
-        dataCollector(nullptr),
-        cellDescription(nullptr)
-    {}
-
-    /** current simulation step */
-    uint32_t currentStep;
-
-    /** current dump is a checkpoint */
-    bool isCheckpoint;
-
-    /** libSplash class */
-    ParallelDomainCollector *dataCollector;
-
-    /** libSplash file's base name */
-    std::string h5Filename;
-
-    /** description of the grid/field layout, including guards etc. */
-    GridLayout<simDim> gridLayout;
-
-    /** cell description */
-    MappingDesc *cellDescription;
-
-    /** window describing the volume to be dumped */
-    Window window;
-
-    /** offset from local moving window to local domain */
-    DataSpace<simDim> localWindowToDomainOffset;
-};
-
-/**
- * Writes simulation data to hdf5 files.
- * Implements the ISimulationPlugin interface.
- */
-
-class HDF5Writer;
-
-} //namespace hdf5
-} //namespace picongpu
-
diff --git a/include/picongpu/plugins/hdf5/HDF5Writer.hpp b/include/picongpu/plugins/hdf5/HDF5Writer.hpp
deleted file mode 100644
index 03da760e79..0000000000
--- a/include/picongpu/plugins/hdf5/HDF5Writer.hpp
+++ /dev/null
@@ -1,776 +0,0 @@
-/* Copyright 2013-2020 Axel Huebl, Felix Schmitt, Heiko Burau, Rene Widera,
- *                     Alexander Grund
- *
- * This file is part of PIConGPU.
- *
- * PIConGPU is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * PIConGPU is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with PIConGPU.
- * If not, see <http://www.gnu.org/licenses/>.
- */
-
-
-#pragma once
-
-#include <sstream>
-#include <string>
-#include <list>
-#include <vector>
-#include <regex>
-
-#include "picongpu/simulation_defines.hpp"
-
-#include "picongpu/plugins/hdf5/HDF5Writer.def"
-#include "picongpu/traits/SplashToPIC.hpp"
-#include "picongpu/traits/PICToSplash.hpp"
-#include "picongpu/plugins/misc/misc.hpp"
-#include "picongpu/plugins/multi/Option.hpp"
-#include "picongpu/plugins/misc/SpeciesFilter.hpp"
-#include "picongpu/particles/traits/SpeciesEligibleForSolver.hpp"
-#include "picongpu/particles/filter/filter.hpp"
-
-#include <pmacc/particles/frame_types.hpp>
-
-#include "picongpu/fields/FieldB.hpp"
-#include "picongpu/fields/FieldE.hpp"
-#include "picongpu/fields/FieldJ.hpp"
-#include "picongpu/fields/FieldTmp.hpp"
-#include "picongpu/fields/MaxwellSolver/YeePML/Field.hpp"
-#include <pmacc/particles/particleFilter/FilterFactory.hpp>
-#include <pmacc/particles/particleFilter/PositionFilter.hpp>
-#include <pmacc/particles/operations/CountParticles.hpp>
-#include <pmacc/particles/IdProvider.def>
-
-#include <pmacc/dataManagement/DataConnector.hpp>
-#include <pmacc/mappings/simulation/GridController.hpp>
-#include <pmacc/mappings/simulation/SubGrid.hpp>
-#include <pmacc/dimensions/GridLayout.hpp>
-#include <pmacc/pluginSystem/PluginConnector.hpp>
-#include "picongpu/simulation/control/MovingWindow.hpp"
-#include <pmacc/math/Vector.hpp>
-
-#include "picongpu/plugins/output/IIOBackend.hpp"
-#include <boost/mpl/vector.hpp>
-#include <boost/mpl/pair.hpp>
-#include <boost/type_traits/is_same.hpp>
-#include <boost/mpl/size.hpp>
-#include <boost/mpl/at.hpp>
-#include <boost/mpl/begin_end.hpp>
-#include <boost/mpl/find.hpp>
-
-#include <boost/type_traits.hpp>
-
-#include "picongpu/plugins/hdf5/WriteMeta.hpp"
-#include "picongpu/plugins/hdf5/WriteFields.hpp"
-#include "picongpu/plugins/hdf5/WriteSpecies.hpp"
-#include "picongpu/plugins/hdf5/restart/LoadSpecies.hpp"
-#include "picongpu/plugins/hdf5/restart/RestartFieldLoader.hpp"
-#include "picongpu/plugins/hdf5/NDScalars.hpp"
-#include "picongpu/plugins/misc/SpeciesFilter.hpp"
-
-#include <pmacc/memory/boxes/DataBoxDim1Access.hpp>
-
-
-namespace picongpu
-{
-
-namespace hdf5
-{
-
-using namespace pmacc;
-
-using namespace splash;
-
-/** Writes simulation data to hdf5 files using libSplash.
- *
- * Implements the IIOBackend interface.
- */
-class HDF5Writer :
-    public IIOBackend
-{
-public:
-
-    struct Help : public plugins::multi::IHelp
-    {
-        /** creates a instance of ISlave
-         *
-         * @tparam T_Slave type of the interface implementation (must inherit from ISlave)
-         * @param help plugin defined help
-         * @param id index of the plugin, range: [0;help->getNumPlugins())
-         */
-        std::shared_ptr< ISlave > create(
-            std::shared_ptr< IHelp > & help,
-            size_t const id,
-            MappingDesc* cellDescription
-        )
-        {
-            return std::shared_ptr< ISlave >(
-                new HDF5Writer(
-                    help,
-                    id,
-                    cellDescription
-                )
-            );
-        }
-
-        plugins::multi::Option< std::string > notifyPeriod = {
-            "period",
-            "enable HDF5 IO [for each n-th step]"
-        };
-        plugins::multi::Option< std::string > source = {
-            "source",
-            "data sources: ",
-            "species_all, fields_all"
-        };
-
-        plugins::multi::Option< std::string > fileName = {
-            "file",
-            "HDF5 output filename (prefix)"
-        };
-
-        /** defines if the plugin must register itself to the PMacc plugin system
-         *
-         * true = the plugin is registering it self
-         * false = the plugin is not registering itself (plugin is controlled by another class)
-         */
-        bool selfRegister = false;
-
-        std::vector< std::string > allowedDataSources  = {
-            "species_all",
-            "fields_all"
-        };
-
-        template<typename T_TupleVector>
-        struct CreateSpeciesFilter
-        {
-            using type = plugins::misc::SpeciesFilter<
-                typename pmacc::math::CT::At<
-                    T_TupleVector,
-                    bmpl::int_<0>
-                >::type,
-                typename pmacc::math::CT::At<
-                    T_TupleVector,
-                    bmpl::int_<1>
-                >::type
-            >;
-        };
-
-        using AllParticlesTimesAllFilters = typename AllCombinations<
-            bmpl::vector<
-                FileOutputParticles,
-                particles::filter::AllParticleFilters
-            >
-         >::type;
-
-        using AllSpeciesFilter = typename bmpl::transform<
-            AllParticlesTimesAllFilters,
-            CreateSpeciesFilter< bmpl::_1 >
-        >::type;
-
-        using AllEligibleSpeciesSources = typename bmpl::copy_if<
-            AllSpeciesFilter,
-            plugins::misc::speciesFilter::IsEligible< bmpl::_1 >
-        >::type;
-
-        using AllFieldSources = FileOutputFields;
-
-        ///! method used by plugin controller to get --help description
-        void registerHelp(
-            boost::program_options::options_description & desc,
-            std::string const & masterPrefix = std::string{ }
-        )
-        {
-            meta::ForEach<
-                AllEligibleSpeciesSources,
-                plugins::misc::AppendName< bmpl::_1 >
-            > getEligibleDataSourceNames;
-            getEligibleDataSourceNames( allowedDataSources );
-
-            meta::ForEach<
-                AllFieldSources,
-                plugins::misc::AppendName< bmpl::_1 >
-            > appendFieldSourceNames;
-            appendFieldSourceNames( allowedDataSources );
-
-            // string list with all possible data sources
-            std::string concatenatedSourceNames = plugins::misc::concatenateToString(
-                allowedDataSources,
-                ", "
-            );
-
-            notifyPeriod.registerHelp(
-                desc,
-                masterPrefix + prefix
-            );
-            source.registerHelp(
-                desc,
-                masterPrefix + prefix,
-                std::string( "[" ) + concatenatedSourceNames + "]"
-            );
-            fileName.registerHelp(
-                desc,
-                masterPrefix + prefix
-            );
-            selfRegister = true;
-
-        }
-
-        void expandHelp(
-            boost::program_options::options_description & desc,
-            std::string const & masterPrefix = std::string{ }
-        )
-        {
-        }
-
-        void validateOptions()
-        {
-            if( selfRegister )
-            {
-                if( notifyPeriod.empty() || fileName.empty() )
-                    throw std::runtime_error(
-                        name +
-                        ": parameter period and file must be defined"
-                    );
-
-                // check if user passed data source names are valid
-                for( auto const & dateSourceNames : source )
-                {
-                    auto vectorOfDataSourceNames = plugins::misc::splitString(
-                        plugins::misc::removeSpaces( dateSourceNames )
-                    );
-
-                    for( auto const & f : vectorOfDataSourceNames )
-                    {
-                        if(
-                            !plugins::misc::containsObject(
-                                allowedDataSources,
-                                f
-                            )
-                        )
-                        {
-                            throw std::runtime_error( name + ": unknown data source '" + f + "'" );
-                        }
-                    }
-                }
-            }
-        }
-
-        size_t getNumPlugins() const
-        {
-            if( selfRegister )
-                return notifyPeriod.size();
-            else
-                return 1;
-        }
-
-        std::string getDescription() const
-        {
-            return description;
-        }
-
-        std::string getOptionPrefix() const
-        {
-            return prefix;
-        }
-
-        std::string getName() const
-        {
-            return name;
-        }
-
-        std::string const name = "HDF5Writer";
-        //! short description of the plugin
-        std::string const description = "dump simulation data with hdf5";
-        //! prefix used for command line arguments
-        std::string const prefix = "hdf5";
-    };
-
-    //! must be implemented by the user
-    static std::shared_ptr< plugins::multi::IHelp > getHelp()
-    {
-        return std::shared_ptr< plugins::multi::IHelp >( new Help{ } );
-    }
-
-    /** constructor
-     *
-     * @param help instance of the class Help
-     * @param id index of this plugin instance within help
-     * @param cellDescription PIConGPu cell description information for kernel index mapping
-     */
-    HDF5Writer(
-        std::shared_ptr< plugins::multi::IHelp > & help,
-        size_t const id,
-        MappingDesc* cellDescription
-    ) :
-    m_help( std::static_pointer_cast< Help >(help) ),
-    m_id( id ),
-    m_cellDescription( cellDescription ),
-    outputDirectory("h5")
-    {
-        mThreadParams.cellDescription = m_cellDescription;
-
-        GridController<simDim> &gc = Environment<simDim>::get().GridController();
-
-        /* It is important that we never change the mpi_pos after this point
-         * because we get problems with the restart.
-         * Otherwise we do not know which gpu must load the ghost parts around
-         * the sliding window.
-         */
-        mpi_pos = gc.getPosition();
-        mpi_size = gc.getGpuNodes();
-
-        splashMpiPos.set(0, 0, 0);
-        splashMpiSize.set(1, 1, 1);
-
-        for (uint32_t i = 0; i < simDim; ++i)
-        {
-            splashMpiPos[i] = mpi_pos[i];
-            splashMpiSize[i] = mpi_size[i];
-        }
-
-        if( m_help->selfRegister )
-        {
-            std::string notifyPeriod = m_help->notifyPeriod.get( id );
-            /* only register for notify callback when .period is set on command line */
-            if(!notifyPeriod.empty())
-            {
-                Environment<>::get().PluginConnector().setNotificationPeriod(this, notifyPeriod);
-
-                /** create notify directory */
-                Environment<simDim>::get().Filesystem().createDirectoryWithPermissions(outputDirectory);
-            }
-        }
-    }
-
-    virtual ~HDF5Writer()
-    {
-        if (mThreadParams.dataCollector)
-                mThreadParams.dataCollector->finalize();
-
-         __delete(mThreadParams.dataCollector);
-    }
-
-    void notify(uint32_t currentStep)
-    {
-        // notify is only allowed if the plugin is not controlled by the class Checkpoint
-        assert( m_help->selfRegister );
-
-        __getTransactionEvent().waitForFinished();
-
-        std::string filename = m_help->fileName.get( m_id );
-        /* if file name is relative, prepend with common directory */
-        if( boost::filesystem::path(filename).has_root_path() )
-            mThreadParams.h5Filename = filename;
-        else
-            mThreadParams.h5Filename = outputDirectory + "/" + filename;
-
-        /* window selection */
-        mThreadParams.window = MovingWindow::getInstance().getWindow(currentStep);
-        mThreadParams.isCheckpoint = false;
-        dumpData(currentStep);
-    }
-
-    virtual void restart(
-        uint32_t restartStep,
-        std::string const & restartDirectory
-    )
-    {
-        /* ISlave restart interface is not needed becase IIOBackend
-         * restart interface is used
-         */
-    }
-
-    virtual void checkpoint(
-        uint32_t currentStep,
-        std::string const & checkpointDirectory
-    )
-    {
-        /* ISlave checkpoint interface is not needed becase IIOBackend
-         * checkpoint interface is used
-         */
-    }
-
-    void doRestart(
-        const uint32_t restartStep,
-        const std::string& restartDirectory,
-        const std::string& constRestartFilename,
-        const uint32_t restartChunkSize
-    )
-    {
-        // restart is only allowed if the plugin is controlled by the class Checkpoint
-        assert(!m_help->selfRegister);
-
-        // allow to modify the restart file name
-        std::string restartFilename{ constRestartFilename };
-
-        const uint32_t maxOpenFilesPerNode = 4;
-        GridController<simDim> &gc = Environment<simDim>::get().GridController();
-        mThreadParams.dataCollector = new ParallelDomainCollector(
-                                                                  gc.getCommunicator().getMPIComm(),
-                                                                  gc.getCommunicator().getMPIInfo(),
-                                                                  splashMpiSize,
-                                                                  maxOpenFilesPerNode);
-
-        mThreadParams.currentStep = restartStep;
-
-        /* set attributes for datacollector files */
-        DataCollector::FileCreationAttr attr;
-        attr.fileAccType = DataCollector::FAT_READ;
-        attr.mpiPosition.set(splashMpiPos);
-        attr.mpiSize.set(splashMpiSize);
-
-        /* if restartFilename is relative, prepend with restartDirectory */
-        if (!boost::filesystem::path(restartFilename).has_root_path())
-        {
-            restartFilename = restartDirectory + std::string("/") + restartFilename;
-        }
-
-        /* open datacollector */
-        try
-        {
-            log<picLog::INPUT_OUTPUT > ("HDF5 open DataCollector with file: %1%") % restartFilename;
-            mThreadParams.dataCollector->open(restartFilename.c_str(), attr);
-        }
-        catch (const DCException& e)
-        {
-            std::cerr << e.what() << std::endl;
-            throw std::runtime_error("HDF5 failed to open DataCollector");
-        }
-
-        /* load number of slides to initialize MovingWindow */
-        uint32_t slides = 0;
-        mThreadParams.dataCollector->readAttributeInfo(restartStep, nullptr, "sim_slides").read(&slides, sizeof(slides));
-
-        /* apply slides to set gpus to last/written configuration */
-        log<picLog::INPUT_OUTPUT > ("HDF5 setting slide count for moving window to %1%") % slides;
-        MovingWindow::getInstance().setSlideCounter(slides, restartStep);
-
-        /* re-distribute the local offsets in y-direction
-         * this will work for restarts with moving window still enabled
-         * and restarts that disable the moving window
-         * \warning enabling the moving window from a checkpoint that
-         *          had no moving window will not work
-         */
-        gc.setStateAfterSlides(slides);
-
-        /* set window for restart, complete global domain */
-        mThreadParams.window = MovingWindow::getInstance().getDomainAsWindow(restartStep);
-        for (uint32_t i = 0; i < simDim; ++i)
-        {
-            mThreadParams.localWindowToDomainOffset[i] = 0;
-        }
-
-        ThreadParams *params = &mThreadParams;
-
-        /* load all fields */
-        meta::ForEach<FileCheckpointFields, LoadFields<bmpl::_1> > forEachLoadFields;
-        forEachLoadFields(params);
-
-        /* load all particles */
-        meta::ForEach<FileCheckpointParticles, LoadSpecies<bmpl::_1> > forEachLoadSpecies;
-        forEachLoadSpecies(params, restartChunkSize);
-
-        IdProvider<simDim>::State idProvState;
-        ReadNDScalars<uint64_t, uint64_t>()(mThreadParams,
-                "picongpu/idProvider/startId", &idProvState.startId,
-                "maxNumProc", &idProvState.maxNumProc);
-        ReadNDScalars<uint64_t>()(mThreadParams,
-                "picongpu/idProvider/nextId", &idProvState.nextId);
-        log<picLog::INPUT_OUTPUT > ("Setting next free id on current rank: %1%") % idProvState.nextId;
-        IdProvider<simDim>::setState(idProvState);
-
-        /* close datacollector */
-        log<picLog::INPUT_OUTPUT > ("HDF5 close DataCollector with file: %1%") % restartFilename;
-        mThreadParams.dataCollector->close();
-
-        if (mThreadParams.dataCollector)
-            mThreadParams.dataCollector->finalize();
-
-        __delete(mThreadParams.dataCollector);
-    }
-
-    void dumpCheckpoint(
-        const uint32_t currentStep,
-        const std::string& checkpointDirectory,
-        const std::string& checkpointFilename
-    )
-    {
-        // checkpointing is only allowed if the plugin is controlled by the class Checkpoint
-        assert(!m_help->selfRegister);
-
-        __getTransactionEvent().waitForFinished();
-        /* if file name is relative, prepend with common directory */
-        if( boost::filesystem::path(checkpointFilename).has_root_path() )
-            mThreadParams.h5Filename = checkpointFilename;
-        else
-            mThreadParams.h5Filename = checkpointDirectory + "/" + checkpointFilename;
-
-        mThreadParams.window = MovingWindow::getInstance().getDomainAsWindow(currentStep);
-        mThreadParams.isCheckpoint = true;
-
-        dumpData(currentStep);
-    }
-
-private:
-
-    void closeH5File()
-    {
-        if (mThreadParams.dataCollector != nullptr)
-        {
-            log<picLog::INPUT_OUTPUT > ("HDF5 close DataCollector");
-            mThreadParams.dataCollector->close();
-        }
-    }
-
-    void openH5File(const std::string h5Filename)
-    {
-        const uint32_t maxOpenFilesPerNode = 4;
-        if (mThreadParams.dataCollector == nullptr)
-        {
-            GridController<simDim> &gc = Environment<simDim>::get().GridController();
-            mThreadParams.dataCollector = new ParallelDomainCollector(
-                                                                      gc.getCommunicator().getMPIComm(),
-                                                                      gc.getCommunicator().getMPIInfo(),
-                                                                      splashMpiSize,
-                                                                      maxOpenFilesPerNode);
-        }
-        // set attributes for datacollector files
-        DataCollector::FileCreationAttr attr;
-        attr.enableCompression = false;
-        attr.fileAccType = DataCollector::FAT_CREATE;
-        attr.mpiPosition.set(splashMpiPos);
-        attr.mpiSize.set(splashMpiSize);
-
-        // open datacollector
-        try
-        {
-            log<picLog::INPUT_OUTPUT > ("HDF5 open DataCollector with file: %1%") % h5Filename;
-            mThreadParams.dataCollector->open(h5Filename.c_str(), attr);
-        }
-        catch (const DCException& e)
-        {
-            std::cerr << e.what() << std::endl;
-            throw std::runtime_error("HDF5 failed to open DataCollector");
-        }
-    }
-
-    /** dump data
-     *
-     * @param currentStep current simulation step
-     * @param isCheckpoint checkpoint notification
-     */
-    void dumpData(uint32_t currentStep)
-    {
-        const pmacc::Selection<simDim>& localDomain = Environment<simDim>::get().SubGrid().getLocalDomain();
-        mThreadParams.cellDescription = m_cellDescription;
-        mThreadParams.currentStep = currentStep;
-
-        for (uint32_t i = 0; i < simDim; ++i)
-        {
-            mThreadParams.localWindowToDomainOffset[i] = 0;
-            if (mThreadParams.window.globalDimensions.offset[i] > localDomain.offset[i])
-            {
-                mThreadParams.localWindowToDomainOffset[i] =
-                    mThreadParams.window.globalDimensions.offset[i] -
-                    localDomain.offset[i];
-            }
-        }
-
-        openH5File(mThreadParams.h5Filename);
-
-        writeHDF5((void*) &mThreadParams);
-
-        closeH5File();
-    }
-
-    template< typename T_ParticleFilter>
-    struct CallWriteSpecies
-    {
-
-        template<typename Space>
-        void operator()(
-            const std::vector< std::string > & vectorOfDataSourceNames,
-            ThreadParams* params,
-            const Space domainOffset
-        )
-        {
-            bool const containsDataSource = plugins::misc::containsObject(
-                vectorOfDataSourceNames,
-                T_ParticleFilter::getName()
-            );
-
-            if( containsDataSource )
-            {
-                WriteSpecies<
-                    T_ParticleFilter
-                > writeSpecies;
-                writeSpecies(params, domainOffset);
-            }
-
-        }
-    };
-
-    template< typename T_Field >
-    struct CallWriteFields
-    {
-
-        void operator()(
-            const std::vector< std::string > & vectorOfDataSourceNames,
-            ThreadParams* params
-        )
-        {
-            bool const containsDataSource = plugins::misc::containsObject(
-                vectorOfDataSourceNames,
-                T_Field::getName()
-            );
-
-            if( containsDataSource )
-            {
-                WriteFields<
-                    T_Field
-                > writeFields;
-                writeFields(params);
-            }
-
-        }
-    };
-
-    void writeHDF5(void *p_args)
-    {
-        ThreadParams *threadParams = (ThreadParams*) (p_args);
-
-        const SubGrid<simDim>& subGrid = Environment<simDim>::get().SubGrid();
-        DataSpace<simDim> domainOffset(
-            subGrid.getGlobalDomain().offset +
-            subGrid.getLocalDomain().offset
-        );
-
-        std::vector< std::string > vectorOfDataSourceNames;
-        if( m_help->selfRegister )
-        {
-            std::string dateSourceNames = m_help->source.get( m_id );
-
-            vectorOfDataSourceNames = plugins::misc::splitString(
-                plugins::misc::removeSpaces( dateSourceNames )
-            );
-        }
-
-        /* write all fields */
-        log<picLog::INPUT_OUTPUT > ("HDF5: (begin) writing fields.");
-        if (threadParams->isCheckpoint)
-        {
-            meta::ForEach<FileCheckpointFields, WriteFields<bmpl::_1> > forEachWriteFields;
-            forEachWriteFields(threadParams);
-        }
-        else
-        {
-            bool dumpFields = plugins::misc::containsObject(
-                vectorOfDataSourceNames,
-                "fields_all"
-            );
-            if( dumpFields )
-            {
-                meta::ForEach<
-                    FileOutputFields,
-                    WriteFields< bmpl::_1 >
-                > forEachWriteFields;
-                forEachWriteFields(threadParams);
-            }
-
-            meta::ForEach<
-                typename Help::AllFieldSources,
-                CallWriteFields<
-                    bmpl::_1
-                >
-            >{}(
-                vectorOfDataSourceNames,
-                threadParams
-            );
-        }
-        log<picLog::INPUT_OUTPUT > ("HDF5: ( end ) writing fields.");
-
-        /* write all particle species */
-        log<picLog::INPUT_OUTPUT > ("HDF5: (begin) writing particle species.");
-        if (threadParams->isCheckpoint)
-        {
-            meta::ForEach<
-                FileCheckpointParticles,
-                WriteSpecies<
-                    plugins::misc::UnfilteredSpecies< bmpl::_1 >
-                >
-            > writeSpecies;
-            writeSpecies(threadParams, domainOffset);
-        }
-        else
-        {
-            bool dumpAllParticles = plugins::misc::containsObject(
-                vectorOfDataSourceNames,
-                "species_all"
-            );
-
-            if( dumpAllParticles )
-            {
-                meta::ForEach<
-                    FileOutputParticles,
-                    WriteSpecies<
-                        plugins::misc::UnfilteredSpecies< bmpl::_1 >
-                    >
-                > writeSpecies;
-                writeSpecies(threadParams, domainOffset);
-            }
-
-            meta::ForEach<
-                typename Help::AllEligibleSpeciesSources,
-                CallWriteSpecies<
-                    bmpl::_1
-                >
-            >{}(
-                vectorOfDataSourceNames,
-                threadParams,
-                domainOffset
-            );
-
-        }
-        log<picLog::INPUT_OUTPUT > ("HDF5: ( end ) writing particle species.");
-
-        auto idProviderState = IdProvider<simDim>::getState();
-        log<picLog::INPUT_OUTPUT>("HDF5: Writing IdProvider state (StartId: %1%, NextId: %2%, maxNumProc: %3%)")
-                % idProviderState.startId % idProviderState.nextId % idProviderState.maxNumProc;
-        WriteNDScalars<uint64_t, uint64_t>()(*threadParams,
-                "picongpu/idProvider/startId", idProviderState.startId,
-                "maxNumProc", idProviderState.maxNumProc);
-        WriteNDScalars<uint64_t>()(*threadParams,
-                "picongpu/idProvider/nextId", idProviderState.nextId);
-
-        // write global meta attributes
-        WriteMeta writeMetaAttributes;
-        writeMetaAttributes(threadParams);
-    }
-
-    ThreadParams mThreadParams;
-
-    std::shared_ptr< Help > m_help;
-    size_t m_id;
-
-    MappingDesc *m_cellDescription;
-
-    std::string outputDirectory;
-
-    DataSpace<simDim> mpi_pos;
-    DataSpace<simDim> mpi_size;
-
-    Dimensions splashMpiPos;
-    Dimensions splashMpiSize;
-};
-
-} //namespace hdf5
-} //namespace picongpu
diff --git a/include/picongpu/plugins/hdf5/NDScalars.hpp b/include/picongpu/plugins/hdf5/NDScalars.hpp
deleted file mode 100644
index 163855999a..0000000000
--- a/include/picongpu/plugins/hdf5/NDScalars.hpp
+++ /dev/null
@@ -1,136 +0,0 @@
-/* Copyright 2016-2020 Alexander Grund
- *
- * This file is part of PIConGPU.
- *
- * PIConGPU is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * PIConGPU is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with PIConGPU.
- * If not, see <http://www.gnu.org/licenses/>.
- */
-
-#pragma once
-
-#include <pmacc/types.hpp>
-#include "picongpu/plugins/hdf5/HDF5Writer.def"
-#include "picongpu/traits/PICToSplash.hpp"
-#include <pmacc/Environment.hpp>
-
-namespace picongpu {
-namespace hdf5 {
-
-/** Functor for writing ND scalar fields with N=simDim
- * In the current implementation each process (of the ND grid of processes) writes 1 scalar value
- * Optionally the processes can also write an attribute for this dataset by using a non-empty attrName
- *
- * @tparam T_Scalar    Type of the scalar value to write
- * @tparam T_Attribute Type of the attribute (can be omitted if attribute is not written, defaults to uint64_t)
- */
-template<typename T_Scalar, typename T_Attribute = uint64_t>
-struct WriteNDScalars
-{
-    void operator()(ThreadParams& params,
-            const std::string& name, T_Scalar value,
-            const std::string& attrName = "", T_Attribute attribute = T_Attribute())
-    {
-        log<picLog::INPUT_OUTPUT>("HDF5: write %1%D scalars: %2%") % simDim % name;
-
-        // Size over all processes
-        Dimensions globalSize(1, 1, 1);
-        // Offset for this process
-        Dimensions localOffset(0, 0, 0);
-        // Offset for all processes
-        Dimensions globalOffset(0, 0, 0);
-
-        for (uint32_t d = 0; d < simDim; ++d)
-        {
-            globalSize[d] = Environment<simDim>::get().GridController().getGpuNodes()[d];
-            localOffset[d] = Environment<simDim>::get().GridController().getPosition()[d];
-        }
-
-        Dimensions localSize(1, 1, 1);
-
-        // avoid deadlock between not finished pmacc tasks and mpi calls in adios
-        __getTransactionEvent().waitForFinished();
-
-        typename traits::PICToSplash<T_Scalar>::type splashType;
-        params.dataCollector->writeDomain(params.currentStep,            /* id == time step */
-                                           globalSize,                   /* total size of dataset over all processes */
-                                           localOffset,                  /* write offset for this process */
-                                           splashType,                   /* data type */
-                                           simDim,                       /* NDims spatial dimensionality of the field */
-                                           splash::Selection(localSize), /* data size of this process */
-                                           name.c_str(),                 /* data set name */
-                                           splash::Domain(
-                                                  globalOffset,          /* offset of the global domain */
-                                                  globalSize             /* size of the global domain */
-                                           ),
-                                           DomainCollector::GridType,
-                                           &value);
-
-        if(!attrName.empty())
-        {
-            /*simulation attribute for data*/
-            typename traits::PICToSplash<T_Attribute>::type attType;
-
-            log<picLog::INPUT_OUTPUT>("HDF5: write attribute %1% for scalars: %2%") % attrName % name;
-            params.dataCollector->writeAttribute(params.currentStep,
-                                                  attType, name.c_str(),
-                                                  attrName.c_str(), &attribute);
-        }
-    }
-};
-
-/** Functor for reading ND scalar fields with N=simDim
- * In the current implementation each process (of the ND grid of processes) reads 1 scalar value
- * Optionally the processes can also read an attribute for this dataset by using a non-empty attrName
- *
- * @tparam T_Scalar    Type of the scalar value to read
- * @tparam T_Attribute Type of the attribute (can be omitted if attribute is not read, defaults to uint64_t)
- */
-template<typename T_Scalar, typename T_Attribute = uint64_t>
-struct ReadNDScalars
-{
-    void operator()(ThreadParams& params,
-                const std::string& name, T_Scalar* value,
-                const std::string& attrName = "", T_Attribute* attribute = nullptr)
-    {
-        log<picLog::INPUT_OUTPUT>("HDF5: read %1%D scalars: %2%") % simDim % name;
-
-        Dimensions domain_offset(0, 0, 0);
-        for (uint32_t d = 0; d < simDim; ++d)
-            domain_offset[d] = Environment<simDim>::get().GridController().getPosition()[d];
-
-        // avoid deadlock between not finished pmacc tasks and mpi calls in adios
-        __getTransactionEvent().waitForFinished();
-
-        DomainCollector::DomDataClass data_class;
-        DataContainer *dataContainer =
-            params.dataCollector->readDomain(params.currentStep,
-                                               name.c_str(),
-                                               Domain(domain_offset, Dimensions(1, 1, 1)),
-                                               &data_class);
-
-        typename traits::PICToSplash<T_Scalar>::type splashType;
-        *value = *static_cast<T_Scalar*>(dataContainer->getIndex(0)->getData());
-        __delete(dataContainer);
-
-        if(!attrName.empty())
-        {
-            log<picLog::INPUT_OUTPUT>("HDF5: read attribute %1% for scalars: %2%") % attrName % name;
-            params.dataCollector->readAttributeInfo(params.currentStep, name.c_str(), attrName.c_str()).read(attribute, sizeof(T_Attribute));
-            log<picLog::INPUT_OUTPUT>("HDF5: attribute %1% = %2%") % attrName % *attribute;
-        }
-    }
-};
-
-}  // namespace hdf5
-}  // namespace picongpu
diff --git a/include/picongpu/plugins/hdf5/WriteFields.hpp b/include/picongpu/plugins/hdf5/WriteFields.hpp
deleted file mode 100644
index 8763211369..0000000000
--- a/include/picongpu/plugins/hdf5/WriteFields.hpp
+++ /dev/null
@@ -1,234 +0,0 @@
-/* Copyright 2014-2020 Axel Huebl, Felix Schmitt, Heiko Burau, Rene Widera,
- *                     Benjamin Worpitz, Sergei Bastrakov
- *
- * This file is part of PIConGPU.
- *
- * PIConGPU is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * PIConGPU is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with PIConGPU.
- * If not, see <http://www.gnu.org/licenses/>.
- */
-
-#pragma once
-
-#include <pmacc/static_assert.hpp>
-#include "picongpu/simulation_defines.hpp"
-#include "picongpu/fields/CellType.hpp"
-#include "picongpu/plugins/hdf5/HDF5Writer.def"
-#include "picongpu/plugins/hdf5/writer/Field.hpp"
-#include "picongpu/traits/IsFieldDomainBound.hpp"
-
-#include <pmacc/dataManagement/DataConnector.hpp>
-
-#include <vector>
-
-
-namespace picongpu
-{
-
-namespace hdf5
-{
-
-using namespace pmacc;
-using namespace splash;
-
-/**
- * Helper class to create a unit vector of type float_64
- */
-class CreateUnit
-{
-public:
-    template<typename UnitType>
-    static std::vector<float_64> createUnit(UnitType unit, uint32_t numComponents)
-    {
-        std::vector<float_64> tmp(numComponents);
-        for (uint32_t i = 0; i < numComponents; ++i)
-            tmp[i] = unit[i];
-        return tmp;
-    }
-};
-
-
-/**
- * Write calculated fields to HDF5 file.
- *
- * @tparam T_Field field class
- */
-template< typename T_Field >
-class WriteFields
-{
-private:
-
-    using ValueType = typename T_Field::ValueType;
-
-    static std::vector<float_64> getUnit()
-    {
-        using UnitType = typename T_Field::UnitValueType ;
-        UnitType unit = T_Field::getUnit();
-        return CreateUnit::createUnit(unit, T_Field::numComponents);
-    }
-
-public:
-
-    HDINLINE void operator()(ThreadParams* params)
-    {
-#ifndef __CUDA_ARCH__
-        DataConnector &dc = Environment<>::get().DataConnector();
-
-        auto field = dc.get< T_Field >( T_Field::getName() );
-        params->gridLayout = field->getGridLayout();
-
-        // convert in a std::vector of std::vector format for writeField API
-        const traits::FieldPosition<fields::CellType, T_Field> fieldPos;
-
-        std::vector<std::vector<float_X> > inCellPosition;
-        for( uint32_t n = 0; n < T_Field::numComponents; ++n )
-        {
-            std::vector<float_X> inCellPositonComponent;
-            for( uint32_t d = 0; d < simDim; ++d )
-                inCellPositonComponent.push_back( fieldPos()[n][d] );
-            inCellPosition.push_back( inCellPositonComponent );
-        }
-
-        /** \todo check if always correct at this point, depends on solver
-         *        implementation */
-        const float_X timeOffset = 0.0;
-
-        const bool isDomainBound = traits::IsFieldDomainBound< T_Field >::value;
-        Field::writeField(
-            params,
-            T_Field::getName(),
-            getUnit(),
-            T_Field::getUnitDimension(),
-            inCellPosition,
-            timeOffset,
-            field->getHostDataBox(),
-            ValueType(),
-            isDomainBound
-        );
-
-        dc.releaseData( T_Field::getName() );
-#endif
-    }
-
-};
-
-/** Calculate FieldTmp with given solver and particle species
- * and write them to hdf5.
- *
- * FieldTmp is calculated on device and than dumped to HDF5.
- *
- * @tparam Solver solver class for species
- * @tparam Species species/particles class
- */
-template< typename Solver, typename Species >
-class WriteFields<FieldTmpOperation<Solver, Species> >
-{
-public:
-    /*
-     * This is only a wrapper function to allow disable nvcc warnings.
-     * Warning: calling a __host__ function from __host__ __device__
-     * function.
-     * Use of PMACC_NO_NVCC_HDWARNING is not possible if we call a virtual
-     * method inside of the method were we disable the warnings.
-     * Therefore we create this method and call a new method were we can
-     * call virtual functions.
-     */
-    PMACC_NO_NVCC_HDWARNING
-    HDINLINE void operator()(ThreadParams* tparam)
-    {
-        this->operator_impl(tparam);
-    }
-
-private:
-    typedef typename FieldTmp::ValueType ValueType;
-
-    /** Create a name for the hdf5 identifier.
-     */
-    static std::string getName()
-    {
-        return FieldTmpOperation<Solver, Species>::getName();
-    }
-
-    /** Get the unit for the result from the solver*/
-    static std::vector<float_64> getUnit()
-    {
-        typedef typename FieldTmp::UnitValueType UnitType;
-        UnitType unit = FieldTmp::getUnit<Solver>();
-        const uint32_t components = GetNComponents<ValueType>::value;
-        return CreateUnit::createUnit(unit, components);
-    }
-
-    HINLINE void operator_impl(ThreadParams* params)
-    {
-        DataConnector &dc = Environment<>::get().DataConnector();
-
-        /*## update field ##*/
-
-        /*load FieldTmp without copy data to host*/
-        PMACC_CASSERT_MSG(
-            _please_allocate_at_least_one_FieldTmp_in_memory_param,
-            fieldTmpNumSlots > 0
-        );
-        auto fieldTmp = dc.get< FieldTmp >( FieldTmp::getUniqueId( 0 ), true );
-        /*load particle without copy particle data to host*/
-        auto speciesTmp = dc.get< Species >( Species::FrameType::getName(), true );
-
-        fieldTmp->getGridBuffer().getDeviceBuffer().setValue(ValueType::create(0.0));
-        /*run algorithm*/
-        fieldTmp->template computeValue< CORE + BORDER, Solver >(*speciesTmp, params->currentStep);
-
-        EventTask fieldTmpEvent = fieldTmp->asyncCommunication(__getTransactionEvent());
-        __setTransactionEvent(fieldTmpEvent);
-        /* copy data to host that we can write same to disk*/
-        fieldTmp->getGridBuffer().deviceToHost();
-        dc.releaseData( Species::FrameType::getName() );
-        /*## finish update field ##*/
-
-        /*wrap in a one-component vector for writeField API*/
-        const traits::FieldPosition<fields::CellType, FieldTmp>
-            fieldPos;
-
-        std::vector<std::vector<float_X> > inCellPosition;
-        std::vector<float_X> inCellPositonComponent;
-        for( uint32_t d = 0; d < simDim; ++d )
-            inCellPositonComponent.push_back( fieldPos()[0][d] );
-        inCellPosition.push_back( inCellPositonComponent );
-
-        /** \todo check if always correct at this point, depends on solver
-         *        implementation */
-        const float_X timeOffset = 0.0;
-
-        params->gridLayout = fieldTmp->getGridLayout();
-        const bool isDomainBound = traits::IsFieldDomainBound< FieldTmp >::value;
-        /*write data to HDF5 file*/
-        Field::writeField(
-            params,
-            getName(),
-            getUnit(),
-            FieldTmp::getUnitDimension<Solver>(),
-            inCellPosition,
-            timeOffset,
-            fieldTmp->getHostDataBox(),
-            ValueType(),
-            isDomainBound
-        );
-
-        dc.releaseData( FieldTmp::getUniqueId( 0 ) );
-
-    }
-
-};
-
-} //namspace hdf5
-
-} //namespace picongpu
diff --git a/include/picongpu/plugins/hdf5/WriteMeta.hpp b/include/picongpu/plugins/hdf5/WriteMeta.hpp
deleted file mode 100644
index 3168e950d6..0000000000
--- a/include/picongpu/plugins/hdf5/WriteMeta.hpp
+++ /dev/null
@@ -1,319 +0,0 @@
-/* Copyright 2013-2020 Axel Huebl
- *
- * This file is part of PIConGPU.
- *
- * PIConGPU is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * PIConGPU is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with PIConGPU.
- * If not, see <http://www.gnu.org/licenses/>.
- */
-
-
-
-#pragma once
-
-#include "picongpu/simulation_defines.hpp"
-
-#include "picongpu/plugins/hdf5/HDF5Writer.def"
-#include "picongpu/plugins/common/stringHelpers.hpp"
-#include <pmacc/Environment.hpp>
-
-#include "picongpu/fields/absorber/Absorber.hpp"
-#include "picongpu/fields/currentInterpolation/CurrentInterpolation.hpp"
-
-#include "picongpu/traits/SIBaseUnits.hpp"
-#include "picongpu/traits/SplashToPIC.hpp"
-#include "picongpu/traits/PICToSplash.hpp"
-
-#include <string>
-#include <sstream>
-#include <list>
-
-namespace picongpu
-{
-
-namespace hdf5
-{
-using namespace pmacc;
-
-namespace writeMeta
-{
-    /** write openPMD species meta data
-     *
-     * @tparam numSpecies count of defined species
-     */
-    template< uint32_t numSpecies = bmpl::size<VectorAllSpecies>::type::value >
-    struct OfAllSpecies
-    {
-        /** write meta data for species
-         *
-         * @param dc hdf5 data connector
-         * @param meshesPath path to mesh entry
-         * @param currentStep current simulation time step
-         */
-        void operator()(
-            ParallelDomainCollector* dc,
-            const std::string& meshesPath,
-            const uint32_t currentStep
-        ) const
-        {
-            // assume all boundaries are like the first species for openPMD 1.0.0
-            GetStringProperties<bmpl::at_c<VectorAllSpecies, 0>::type> particleBoundaryProp;
-            std::list<std::string> listParticleBoundary;
-            std::list<std::string> listParticleBoundaryParam;
-            for( uint32_t i = NumberOfExchanges<simDim>::value - 1; i > 0; --i )
-            {
-                if( FRONT % i == 0 )
-                {
-                    listParticleBoundary.push_back(
-                        particleBoundaryProp[ExchangeTypeNames()[i]]["name"].value
-                    );
-                    listParticleBoundaryParam.push_back(
-                        particleBoundaryProp[ExchangeTypeNames()[i]]["param"].value
-                    );
-                }
-            }
-            helper::GetSplashArrayOfString getSplashArrayOfString;
-            auto arrParticleBoundary = getSplashArrayOfString( listParticleBoundary );
-            ColTypeString ctParticleBoundary( arrParticleBoundary.maxLen );
-            auto arrParticleBoundaryParam = getSplashArrayOfString( listParticleBoundaryParam );
-            ColTypeString ctParticleBoundaryParam( arrParticleBoundaryParam.maxLen );
-
-            dc->writeAttribute( currentStep, ctParticleBoundary, meshesPath.c_str(),
-                "particleBoundary",
-                1u, Dimensions( listParticleBoundary.size(), 0, 0 ),
-                &( arrParticleBoundary.buffers.at( 0 ) )
-            );
-            dc->writeAttribute( currentStep, ctParticleBoundaryParam, meshesPath.c_str(),
-                "particleBoundaryParameters",
-                1u, Dimensions( listParticleBoundaryParam.size(), 0, 0 ),
-                &( arrParticleBoundaryParam.buffers.at( 0 ) )
-            );
-        }
-    };
-
-    /** specialization if no species are defined */
-    template< >
-    struct OfAllSpecies< 0 >
-    {
-        /** write meta data for species
-         *
-         * @param dc hdf5 data connector
-         * @param meshesPath path to mesh entry
-         * @param currentStep current simulation time step
-         */
-        void operator()(
-            ParallelDomainCollector* /* dc */,
-            const std::string& /* meshesPath */,
-            const uint32_t /* currentStep */
-        ) const
-        {
-        }
-    };
-
-} // namespace writeMeta
-
-    struct WriteMeta
-    {
-        typedef PICToSplash<float_X>::type SplashFloatXType;
-
-        void operator()(ThreadParams *threadParams)
-        {
-            ColTypeUInt32 ctUInt32;
-            ColTypeUInt64 ctUInt64;
-            ColTypeDouble ctDouble;
-            SplashFloatXType splashFloatXType;
-
-            ParallelDomainCollector *dc = threadParams->dataCollector;
-            uint32_t currentStep = threadParams->currentStep;
-
-            /* openPMD attributes */
-            /*   required */
-            const std::string openPMDversion( "1.0.0" );
-            ColTypeString ctOpenPMDversion( openPMDversion.length() );
-            dc->writeGlobalAttribute( threadParams->currentStep,
-                                      ctOpenPMDversion, "openPMD",
-                                      openPMDversion.c_str() );
-
-            const uint32_t openPMDextension = 1; // ED-PIC ID
-            dc->writeGlobalAttribute( threadParams->currentStep,
-                                      ctUInt32, "openPMDextension",
-                                      &openPMDextension );
-
-            const std::string basePath( "/data/%T/" );
-            ColTypeString ctBasePath( basePath.length() );
-            dc->writeGlobalAttribute( threadParams->currentStep,
-                                      ctBasePath, "basePath",
-                                      basePath.c_str() );
-
-            const std::string meshesPath( "fields/" );
-            ColTypeString ctMeshesPath( meshesPath.length() );
-            dc->writeGlobalAttribute( threadParams->currentStep,
-                                      ctMeshesPath, "meshesPath",
-                                      meshesPath.c_str() );
-
-            const std::string particlesPath( "particles/" );
-            ColTypeString ctParticlesPath( particlesPath.length() );
-            dc->writeGlobalAttribute( threadParams->currentStep,
-                                      ctParticlesPath, "particlesPath",
-                                      particlesPath.c_str() );
-
-            const std::string iterationEncoding( "fileBased" );
-            ColTypeString ctIterationEncoding( iterationEncoding.length() );
-            dc->writeGlobalAttribute( threadParams->currentStep,
-                                      ctIterationEncoding, "iterationEncoding",
-                                      iterationEncoding.c_str() );
-
-            const std::string iterationFormat(
-                Environment< simDim >::get().Filesystem().basename( threadParams->h5Filename ) +
-                std::string("_%T.h5")
-            );
-            ColTypeString ctIterationFormat( iterationFormat.length() );
-            dc->writeGlobalAttribute( threadParams->currentStep,
-                                      ctIterationFormat, "iterationFormat",
-                                      iterationFormat.c_str() );
-
-            /*   recommended */
-            const std::string author = Environment<>::get().SimulationDescription().getAuthor();
-            if( author.length() > 0 )
-            {
-                ColTypeString ctAuthor( author.length() );
-                dc->writeGlobalAttribute( threadParams->currentStep,
-                                          ctAuthor, "author",
-                                          author.c_str() );
-            }
-            const std::string software( "PIConGPU" );
-            ColTypeString ctSoftware( software.length() );
-            dc->writeGlobalAttribute( threadParams->currentStep,
-                                      ctSoftware, "software",
-                                      software.c_str() );
-
-            std::stringstream softwareVersion;
-            softwareVersion << PICONGPU_VERSION_MAJOR << "."
-                            << PICONGPU_VERSION_MINOR << "."
-                            << PICONGPU_VERSION_PATCH;
-            if( ! std::string(PICONGPU_VERSION_LABEL).empty() )
-                softwareVersion << "-" << PICONGPU_VERSION_LABEL;
-            ColTypeString ctSoftwareVersion( softwareVersion.str().length() );
-            dc->writeGlobalAttribute( threadParams->currentStep,
-                                      ctSoftwareVersion, "softwareVersion",
-                                      softwareVersion.str().c_str() );
-
-            const std::string date = helper::getDateString( "%F %T %z" );
-            ColTypeString ctDate( date.length() );
-            dc->writeGlobalAttribute( threadParams->currentStep,
-                                      ctDate, "date",
-                                      date.c_str() );
-            /*   ED-PIC */
-            GetStringProperties<fields::Solver> fieldSolverProps;
-            const std::string fieldSolver( fieldSolverProps["name"].value );
-            ColTypeString ctFieldSolver( fieldSolver.length() );
-            dc->writeAttribute(currentStep, ctFieldSolver, meshesPath.c_str(),
-                "fieldSolver", fieldSolver.c_str());
-
-            /* order as in axisLabels:
-             *    3D: z-lower, z-upper, y-lower, y-upper, x-lower, x-upper
-             *    2D: y-lower, y-upper, x-lower, x-upper
-             */
-            GetStringProperties<fields::absorber::Absorber> fieldBoundaryProp;
-            std::list<std::string> listFieldBoundary;
-            std::list<std::string> listFieldBoundaryParam;
-            for( uint32_t i = NumberOfExchanges<simDim>::value - 1; i > 0; --i )
-            {
-                if( FRONT % i == 0 )
-                {
-                    listFieldBoundary.push_back(
-                        fieldBoundaryProp[ExchangeTypeNames()[i]]["name"].value
-                    );
-                    listFieldBoundaryParam.push_back(
-                        fieldBoundaryProp[ExchangeTypeNames()[i]]["param"].value
-                    );
-                }
-            }
-            helper::GetSplashArrayOfString getSplashArrayOfString;
-            auto arrFieldBoundary = getSplashArrayOfString( listFieldBoundary );
-            ColTypeString ctFieldBoundaries( arrFieldBoundary.maxLen );
-            auto arrFieldBoundaryParam = getSplashArrayOfString( listFieldBoundaryParam );
-            ColTypeString ctFieldBoundariesParam( arrFieldBoundaryParam.maxLen );
-
-            dc->writeAttribute( currentStep, ctFieldBoundaries, meshesPath.c_str(),
-                "fieldBoundary",
-                1u, Dimensions( listFieldBoundary.size(), 0, 0 ),
-                &( arrFieldBoundary.buffers.at( 0 ) )
-            );
-            dc->writeAttribute( currentStep, ctFieldBoundariesParam, meshesPath.c_str(),
-                "fieldBoundaryParameters",
-                1u, Dimensions( listFieldBoundaryParam.size(), 0, 0 ),
-                &( arrFieldBoundaryParam.buffers.at( 0 ) )
-            );
-
-            writeMeta::OfAllSpecies<>()( dc, meshesPath, currentStep );
-
-            GetStringProperties<typename fields::Solver::CurrentInterpolation> currentSmoothingProp;
-            const std::string currentSmoothing( currentSmoothingProp["name"].value );
-            ColTypeString ctCurrentSmoothing( currentSmoothing.length() );
-            dc->writeAttribute( currentStep, ctCurrentSmoothing, meshesPath.c_str(),
-                "currentSmoothing", currentSmoothing.c_str() );
-
-            if( currentSmoothingProp.find( "param" ) != currentSmoothingProp.end() )
-            {
-                const std::string currentSmoothingParam( currentSmoothingProp["param"].value );
-                ColTypeString ctCurrentSmoothingParam( currentSmoothingParam.length() );
-                dc->writeAttribute( currentStep, ctCurrentSmoothingParam, meshesPath.c_str(),
-                    "currentSmoothingParameters", currentSmoothingParam.c_str() );
-            }
-
-            const std::string chargeCorrection( "none" );
-            ColTypeString ctChargeCorrection( chargeCorrection.length() );
-            dc->writeAttribute( currentStep, ctChargeCorrection, meshesPath.c_str(),
-                "chargeCorrection", chargeCorrection.c_str() );
-
-            /* write number of slides */
-            const uint32_t slides = MovingWindow::getInstance().getSlideCounter(
-                threadParams->currentStep
-            );
-
-            dc->writeAttribute( threadParams->currentStep,
-                                ctUInt32, nullptr, "sim_slides", &slides );
-
-
-            /* openPMD: required time attributes */
-            dc->writeAttribute( currentStep, splashFloatXType, nullptr, "dt", &DELTA_T );
-            const float_X time = float_X( threadParams->currentStep ) * DELTA_T;
-            dc->writeAttribute( currentStep, splashFloatXType, nullptr, "time", &time );
-            dc->writeAttribute( currentStep, ctDouble, nullptr, "timeUnitSI", &UNIT_TIME );
-
-            /* write normed grid parameters */
-            dc->writeAttribute( currentStep, splashFloatXType, nullptr, "cell_width", &CELL_WIDTH );
-            dc->writeAttribute( currentStep, splashFloatXType, nullptr, "cell_height", &CELL_HEIGHT );
-            if( simDim == DIM3 )
-            {
-                dc->writeAttribute( currentStep, splashFloatXType, nullptr, "cell_depth", &CELL_DEPTH );
-            }
-
-            /* write base units */
-            dc->writeAttribute( currentStep, ctDouble, nullptr, "unit_energy", &UNIT_ENERGY );
-            dc->writeAttribute( currentStep, ctDouble, nullptr, "unit_length", &UNIT_LENGTH );
-            dc->writeAttribute( currentStep, ctDouble, nullptr, "unit_speed", &UNIT_SPEED );
-            dc->writeAttribute( currentStep, ctDouble, nullptr, "unit_time", &UNIT_TIME );
-            dc->writeAttribute( currentStep, ctDouble, nullptr, "unit_mass", &UNIT_MASS );
-            dc->writeAttribute( currentStep, ctDouble, nullptr, "unit_charge", &UNIT_CHARGE );
-            dc->writeAttribute( currentStep, ctDouble, nullptr, "unit_efield", &UNIT_EFIELD );
-            dc->writeAttribute( currentStep, ctDouble, nullptr, "unit_bfield", &UNIT_BFIELD );
-
-            /* write physical constants */
-            dc->writeAttribute( currentStep, splashFloatXType, nullptr, "mue0", &MUE0 );
-            dc->writeAttribute( currentStep, splashFloatXType, nullptr, "eps0", &EPS0 );
-        }
-    };
-} // namespace hdf5
-} // namespace picongpu
diff --git a/include/picongpu/plugins/hdf5/WriteSpecies.hpp b/include/picongpu/plugins/hdf5/WriteSpecies.hpp
deleted file mode 100644
index d0e34adcd4..0000000000
--- a/include/picongpu/plugins/hdf5/WriteSpecies.hpp
+++ /dev/null
@@ -1,603 +0,0 @@
-/* Copyright 2013-2020 Rene Widera, Felix Schmitt, Axel Huebl
- *
- * This file is part of PIConGPU.
- *
- * PIConGPU is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * PIConGPU is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with PIConGPU.
- * If not, see <http://www.gnu.org/licenses/>.
- */
-
-#pragma once
-
-#include "picongpu/simulation_defines.hpp"
-#include "picongpu/plugins/hdf5/HDF5Writer.def"
-#include "picongpu/traits/SIBaseUnits.hpp"
-#include "picongpu/traits/PICToOpenPMD.hpp"
-#include "picongpu/plugins/ISimulationPlugin.hpp"
-#include "picongpu/plugins/misc/ComponentNames.hpp"
-#include "picongpu/plugins/output/WriteSpeciesCommon.hpp"
-#include "picongpu/plugins/kernel/CopySpecies.kernel"
-#include "picongpu/particles/traits/GetSpeciesFlagName.hpp"
-#include "picongpu/plugins/hdf5/writer/ParticleAttribute.hpp"
-
-#include <pmacc/meta/conversion/MakeSeq.hpp>
-#include <pmacc/meta/conversion/RemoveFromSeq.hpp>
-#include <pmacc/dataManagement/DataConnector.hpp>
-#include <pmacc/particles/ParticleDescription.hpp>
-#include <pmacc/traits/GetNumWorkers.hpp>
-#include <pmacc/mappings/kernel/AreaMapping.hpp>
-#include <pmacc/traits/HasIdentifier.hpp>
-#include <pmacc/assert.hpp>
-
-#include <boost/mpl/vector.hpp>
-#include <boost/mpl/pair.hpp>
-#include <boost/type_traits/is_same.hpp>
-#include <boost/mpl/size.hpp>
-#include <boost/mpl/at.hpp>
-#include <boost/mpl/begin_end.hpp>
-#include <boost/mpl/find.hpp>
-#include <boost/type_traits.hpp>
-
-#include <string>
-#include <type_traits>
-
-
-namespace picongpu
-{
-
-namespace hdf5
-{
-using namespace pmacc;
-
-// = ColTypeUInt64_5Array
-TYPE_ARRAY(UInt64_5, H5T_INTEL_U64, uint64_t, 5);
-
-using namespace splash;
-
-namespace detail
-{
-    template< typename T_FrameType >
-    struct GetChargeOrZero
-    {
-        static constexpr bool hasChargeRatio = pmacc::traits::HasFlag<
-            T_FrameType,
-            chargeRatio<>
-        >::type::value;
-
-        template< typename T_Defer = float_X >
-        typename std::enable_if<
-            hasChargeRatio,
-            T_Defer
-        >::type
-        operator()() const
-        {
-            return frame::getCharge< T_FrameType >();
-        }
-
-        template< typename T_Defer = float_X >
-        typename std::enable_if<
-            !hasChargeRatio,
-            T_Defer
-        >::type
-        operator()() const
-        {
-            return float_X( 0. );
-        }
-    };
-
-    template< typename T_FrameType >
-    struct GetMassOrZero
-    {
-        static constexpr bool hasMassRatio = pmacc::traits::HasFlag<
-            T_FrameType,
-            massRatio<>
-        >::type::value;
-
-        template< typename T_Defer = float_X >
-        typename std::enable_if<
-            hasMassRatio,
-            T_Defer
-        >::type
-        operator()() const
-        {
-            return frame::getMass< T_FrameType >();
-        }
-
-        template< typename T_Defer = float_X >
-        typename std::enable_if<
-            !hasMassRatio,
-            T_Defer
-        >::type
-        operator()() const
-        {
-            return float_X( 0. );
-        }
-    };
-}
-
-/** Write copy particle to host memory and dump to HDF5 file
- *
- * @tparam T_SpeciesFilter type and filter of species
- *
- */
-template< typename T_SpeciesFilter >
-struct WriteSpecies
-{
-public:
-
-    typedef typename T_SpeciesFilter::Species ThisSpecies;
-    typedef typename ThisSpecies::FrameType FrameType;
-    typedef typename FrameType::ParticleDescription ParticleDescription;
-    typedef typename FrameType::ValueTypeSeq ParticleAttributeList;
-
-
-    /* delete multiMask and localCellIdx in hdf5 particle*/
-    typedef bmpl::vector<multiMask,localCellIdx> TypesToDelete;
-    typedef typename RemoveFromSeq<ParticleAttributeList, TypesToDelete>::type ParticleCleanedAttributeList;
-
-    /* add totalCellIdx for hdf5 particle*/
-    typedef typename MakeSeq<
-            ParticleCleanedAttributeList,
-            totalCellIdx
-    >::type ParticleNewAttributeList;
-
-    typedef
-    typename ReplaceValueTypeSeq<ParticleDescription, ParticleNewAttributeList>::type
-    NewParticleDescription;
-
-    typedef Frame<OperatorCreateVectorBox, NewParticleDescription> Hdf5FrameType;
-
-    /**
-     * @param domainOffset offset to the local domain: globalDomain.offset + localDomain.offset
-     */
-    template<typename Space>
-    HINLINE void operator()(ThreadParams* params,
-                            const Space domainOffset)
-    {
-        log<picLog::INPUT_OUTPUT > ("HDF5: (begin) write species: %1%") % T_SpeciesFilter::getName();
-        DataConnector &dc = Environment<>::get().DataConnector();
-        /* load particle without copy particle data to host */
-        auto speciesTmp = dc.get< ThisSpecies >( ThisSpecies::FrameType::getName(), true );
-
-        /* count number of particles for this species on the device */
-        uint64_t numParticles = 0;
-
-        log<picLog::INPUT_OUTPUT > ("HDF5:  (begin) count particles: %1%") % T_SpeciesFilter::getName();
-
-        // enforce that the filter interface is fulfilled
-        particles::filter::IUnary< typename T_SpeciesFilter::Filter > particleFilter{ params->currentStep };
-        /* at this point we cast to uint64_t, before we assume that per device
-         * less then 1e9 (int range) particles will be counted
-         */
-        numParticles = uint64_t( pmacc::CountParticles::countOnDevice< CORE + BORDER >(
-            *speciesTmp,
-            *(params->cellDescription),
-            params->localWindowToDomainOffset,
-            params->window.localDimensions.size,
-            particleFilter
-        ));
-
-
-        log<picLog::INPUT_OUTPUT > ("HDF5:  ( end ) count particles: %1% = %2%") % T_SpeciesFilter::getName() % numParticles;
-        Hdf5FrameType hostFrame;
-        log<picLog::INPUT_OUTPUT > ("HDF5:  (begin) malloc mapped memory: %1%") % T_SpeciesFilter::getName();
-        /*malloc mapped memory*/
-        meta::ForEach<typename Hdf5FrameType::ValueTypeSeq, MallocMemory<bmpl::_1> > mallocMem;
-        mallocMem(hostFrame, numParticles);
-        log<picLog::INPUT_OUTPUT > ("HDF5:  ( end ) malloc mapped memory: %1%") % T_SpeciesFilter::getName();
-
-        if (numParticles != 0)
-        {
-
-            log<picLog::INPUT_OUTPUT > ("HDF5:  (begin) get mapped memory device pointer: %1%") % T_SpeciesFilter::getName();
-            /*load device pointer of mapped memory*/
-            Hdf5FrameType deviceFrame;
-            meta::ForEach<typename Hdf5FrameType::ValueTypeSeq, GetDevicePtr<bmpl::_1> > getDevicePtr;
-            getDevicePtr(deviceFrame, hostFrame);
-            log<picLog::INPUT_OUTPUT > ("HDF5:  ( end ) get mapped memory device pointer: %1%") % T_SpeciesFilter::getName();
-
-            log<picLog::INPUT_OUTPUT > ("HDF5:  (begin) copy particle to host: %1%") % T_SpeciesFilter::getName();
-            typedef bmpl::vector< typename GetPositionFilter<simDim>::type > usedFilters;
-            typedef typename FilterFactory<usedFilters>::FilterType MyParticleFilter;
-            MyParticleFilter filter;
-            /* activate filter pipeline if moving window is activated */
-            filter.setStatus(MovingWindow::getInstance().isEnabled());
-            filter.setWindowPosition(params->localWindowToDomainOffset,
-                                     params->window.localDimensions.size);
-
-            /* int: assume < 2e9 particles per device */
-            GridBuffer<int, DIM1> counterBuffer(DataSpace<DIM1>(1));
-            AreaMapping < CORE + BORDER, MappingDesc > mapper(*(params->cellDescription));
-
-            constexpr uint32_t numWorkers = pmacc::traits::GetNumWorkers<
-                pmacc::math::CT::volume< SuperCellSize >::type::value
-            >::value;
-
-            /* this sanity check costs a little bit of time but hdf5 writing is slower */
-            PMACC_KERNEL( CopySpecies< numWorkers >{} )(
-                mapper.getGridDim(),
-                numWorkers
-            )(
-                counterBuffer.getDeviceBuffer().getPointer(),
-                deviceFrame, speciesTmp->getDeviceParticlesBox(),
-                filter,
-                domainOffset,
-                totalCellIdx_,
-                mapper,
-                particleFilter
-            );
-            counterBuffer.deviceToHost();
-            log<picLog::INPUT_OUTPUT > ("HDF5:  ( end ) copy particle to host: %1%") % T_SpeciesFilter::getName();
-            __getTransactionEvent().waitForFinished();
-            log<picLog::INPUT_OUTPUT > ("HDF5:  all events are finished: %1%") % T_SpeciesFilter::getName();
-
-            PMACC_ASSERT((uint64_t) counterBuffer.getHostBuffer().getDataBox()[0] == numParticles);
-        }
-
-        /* We rather do an allgather at this point then letting libSplash
-         * do an allgather during write to find out the global number of
-         * particles.
-         */
-        log<picLog::INPUT_OUTPUT > ("HDF5:  (begin) collect particle sizes for %1%") % T_SpeciesFilter::getName();
-
-        ColTypeUInt64 ctUInt64;
-        ColTypeDouble ctDouble;
-        GridController<simDim>& gc = Environment<simDim>::get().GridController();
-
-        const uint64_t numRanks( gc.getGlobalSize() );
-        const uint64_t myRank( gc.getGlobalRank() );
-
-        /* For collective write calls we need the information:
-         *   - how many particles will be written globally
-         *   - what is my particle offset within this global data set
-         *
-         * interleaved in array:
-         *   numParticles for mpi rank, mpi rank
-         *
-         * the mpi rank is an arbitrary quantity and might change after a
-         * restart, but we only use it to order our patches and offsets
-         */
-        std::vector<uint64_t> particleCounts( 2 * numRanks, 0u );
-        uint64_t myParticlePatch[ 2 ];
-        myParticlePatch[ 0 ] = numParticles;
-        myParticlePatch[ 1 ] = myRank;
-
-        /* we do the scan over MPI ranks since it does not matter how the
-         * global rank or scalar position (which are not idential) are
-         * ordered as long as the particle attributes are also written in
-         * the same order (which is by global rank) */
-        uint64_t numParticlesOffset = 0;
-        uint64_t numParticlesGlobal = 0;
-
-        // avoid deadlock between not finished pmacc tasks and mpi blocking collectives
-        __getTransactionEvent().waitForFinished();
-        MPI_CHECK(MPI_Allgather(
-            myParticlePatch, 2, MPI_UINT64_T,
-            &(*particleCounts.begin()), 2, MPI_UINT64_T,
-            gc.getCommunicator().getMPIComm()
-        ));
-
-        for( uint64_t r = 0; r < numRanks; ++r )
-        {
-            numParticlesGlobal += particleCounts.at(2 * r);
-            if( particleCounts.at(2 * r + 1) < myParticlePatch[ 1 ] )
-                numParticlesOffset += particleCounts.at(2 * r);
-        }
-        log<picLog::INPUT_OUTPUT > ("HDF5:  (end) collect particle sizes for %1%") % T_SpeciesFilter::getName();
-
-        /* dump non-constant particle records to hdf5 file */
-        log<picLog::INPUT_OUTPUT > ("HDF5:  (begin) write particle records for %1%") % T_SpeciesFilter::getName();
-
-        const std::string speciesPath( std::string("particles/") + T_SpeciesFilter::getName() );
-
-        meta::ForEach<typename Hdf5FrameType::ValueTypeSeq, hdf5::ParticleAttribute<bmpl::_1> > writeToHdf5;
-        writeToHdf5(
-            params,
-            hostFrame,
-            speciesPath,
-            numParticles,
-            numParticlesOffset,
-            numParticlesGlobal
-        );
-
-        /* write constant particle records to hdf5 file
-         *   ions with variable charge due to a boundElectrons attribute do not write charge
-         */
-        using hasBoundElectrons = typename pmacc::traits::HasIdentifier<
-            FrameType,
-            boundElectrons
-        >::type;
-        detail::GetChargeOrZero< FrameType > const getChargeOrZero;
-        if( ! hasBoundElectrons::value && getChargeOrZero.hasChargeRatio )
-        {
-            const float_64 charge( getChargeOrZero() );
-            std::vector<float_64> chargeUnitDimension( NUnitDimension, 0.0 );
-            chargeUnitDimension.at(SIBaseUnits::time) = 1.0;
-            chargeUnitDimension.at(SIBaseUnits::electricCurrent) = 1.0;
-
-            writeConstantRecord(
-                params,
-                speciesPath + std::string("/charge"),
-                numParticlesGlobal,
-                charge,
-                UNIT_CHARGE,
-                chargeUnitDimension
-            );
-        }
-
-        detail::GetMassOrZero< FrameType > const getMassOrZero;
-        if( getMassOrZero.hasMassRatio )
-        {
-            const float_64 mass( getMassOrZero() );
-            std::vector<float_64> massUnitDimension( NUnitDimension, 0.0 );
-            massUnitDimension.at(SIBaseUnits::mass) = 1.0;
-
-            writeConstantRecord(
-                params,
-                speciesPath + std::string("/mass"),
-                numParticlesGlobal,
-                mass,
-                UNIT_MASS,
-                massUnitDimension
-            );
-        }
-
-        /* openPMD ED-PIC: write additional attributes */
-        const float_64 particleShape( GetShape<ThisSpecies>::type::support - 1 );
-        params->dataCollector->writeAttribute( params->currentStep,
-                            ctDouble,
-                            speciesPath.c_str(),
-                            "particleShape",
-                            &particleShape );
-
-        traits::GetSpeciesFlagName<ThisSpecies, current<> > currentDepositionName;
-        const std::string currentDeposition( currentDepositionName() );
-        ColTypeString ctCurrentDeposition( currentDeposition.length() );
-        params->dataCollector->writeAttribute( params->currentStep,
-                            ctCurrentDeposition,
-                            speciesPath.c_str(),
-                            "currentDeposition",
-                            currentDeposition.c_str() );
-
-        traits::GetSpeciesFlagName<ThisSpecies, particlePusher<> > particlePushName;
-        const std::string particlePush( particlePushName() );
-        ColTypeString ctParticlePush( particlePush.length() );
-        params->dataCollector->writeAttribute( params->currentStep,
-                            ctParticlePush,
-                            speciesPath.c_str(),
-                            "particlePush",
-                            particlePush.c_str() );
-
-        traits::GetSpeciesFlagName<ThisSpecies, interpolation<> > particleInterpolationName;
-        const std::string particleInterpolation( particleInterpolationName() );
-        ColTypeString ctParticleInterpolation( particleInterpolation.length() );
-        params->dataCollector->writeAttribute( params->currentStep,
-                            ctParticleInterpolation,
-                            speciesPath.c_str(),
-                            "particleInterpolation",
-                            particleInterpolation.c_str() );
-
-        const std::string particleSmoothing("none");
-        ColTypeString ctParticleSmoothing(particleSmoothing.length());
-        params->dataCollector->writeAttribute( params->currentStep,
-                            ctParticleSmoothing,
-                            speciesPath.c_str(),
-                            "particleSmoothing",
-                            particleSmoothing.c_str() );
-
-        log<picLog::INPUT_OUTPUT > ("HDF5:  (end) write particle records for %1%") % T_SpeciesFilter::getName();
-
-        /* write species particle patch meta information */
-        log<picLog::INPUT_OUTPUT > ("HDF5:  (begin) writing particlePatches for %1%") % T_SpeciesFilter::getName();
-
-        std::string particlePatchesPath( speciesPath + std::string("/particlePatches") );
-
-        /* offset and size of our particle patches
-         *   - numPatches: we write as many patches as MPI ranks
-         *   - myPatchOffset: we write in the order of the MPI ranks
-         *   - myPatchEntries: every MPI rank writes exactly one patch
-         */
-        const Dimensions numPatches( numRanks, 1, 1 );
-        const Dimensions myPatchOffset( myRank, 0, 0 );
-        const Dimensions myPatchEntries( 1, 1, 1 );
-
-        /* numParticles: number of particles in this patch */
-        params->dataCollector->write(
-            params->currentStep,
-            numPatches,
-            myPatchOffset,
-            ctUInt64, 1,
-            myPatchEntries,
-            (particlePatchesPath + std::string("/numParticles")).c_str(),
-            &numParticles);
-
-        /* numParticlesOffset: number of particles before this patch */
-        params->dataCollector->write(
-            params->currentStep,
-            numPatches,
-            myPatchOffset,
-            ctUInt64, 1,
-            myPatchEntries,
-            (particlePatchesPath + std::string("/numParticlesOffset")).c_str(),
-            &numParticlesOffset);
-
-        /* offset: absolute position where this particle patch begins including
-         *         global domain offsets (slides), etc.
-         * extent: size of this particle patch, upper bound is excluded
-         */
-        const pmacc::Selection<simDim>& globalDomain = Environment<simDim>::get().SubGrid().getGlobalDomain();
-        const auto componentNames = plugins::misc::getComponentNames( simDim );
-        for (uint32_t d = 0; d < simDim; ++d)
-        {
-            const uint64_t patchOffset =
-                globalDomain.offset[d] +
-                params->window.globalDimensions.offset[d] +
-                params->window.localDimensions.offset[d];
-            const uint64_t patchExtent =
-                params->window.localDimensions.size[d];
-
-            params->dataCollector->write(
-                params->currentStep,
-                numPatches,
-                myPatchOffset,
-                ctUInt64, 1,
-                myPatchEntries,
-                (particlePatchesPath + std::string("/offset/") +
-                 componentNames[d]).c_str(),
-                &patchOffset);
-            params->dataCollector->write(
-                params->currentStep,
-                numPatches,
-                myPatchOffset,
-                ctUInt64, 1,
-                myPatchEntries,
-                (particlePatchesPath + std::string("/extent/") +
-                 componentNames[d]).c_str(),
-                &patchExtent);
-
-            /* offsets and extent of the patch are positions (lengths)
-             * and need to be scaled like the cell idx of a particle
-             */
-            OpenPMDUnit<totalCellIdx> openPMDUnitCellIdx;
-            std::vector<float_64> unitCellIdx = openPMDUnitCellIdx();
-
-            params->dataCollector->writeAttribute(
-                params->currentStep,
-                ctDouble,
-                (particlePatchesPath + std::string("/offset/") +
-                 componentNames[d]).c_str(),
-                "unitSI",
-                &(unitCellIdx.at(d)));
-            params->dataCollector->writeAttribute(
-                params->currentStep,
-                ctDouble,
-                (particlePatchesPath + std::string("/extent/") +
-                 componentNames[d]).c_str(),
-                "unitSI",
-                &(unitCellIdx.at(d)));
-        }
-
-        OpenPMDUnitDimension<totalCellIdx> openPMDUnitDimension;
-        std::vector<float_64> unitDimensionCellIdx = openPMDUnitDimension();
-
-        params->dataCollector->writeAttribute(
-            params->currentStep,
-            ctDouble,
-            (particlePatchesPath + std::string("/offset")).c_str(),
-            "unitDimension",
-            1u, Dimensions(7,0,0),
-            &(*unitDimensionCellIdx.begin()));
-        params->dataCollector->writeAttribute(
-            params->currentStep,
-            ctDouble,
-            (particlePatchesPath + std::string("/extent")).c_str(),
-            "unitDimension",
-            1u, Dimensions(7,0,0),
-            &(*unitDimensionCellIdx.begin()));
-
-
-        log<picLog::INPUT_OUTPUT > ("HDF5:  ( end ) writing particlePatches for %1%") % T_SpeciesFilter::getName();
-
-        /*free host memory*/
-        meta::ForEach<typename Hdf5FrameType::ValueTypeSeq, FreeMemory<bmpl::_1> > freeMem;
-        freeMem(hostFrame);
-        log<picLog::INPUT_OUTPUT > ("HDF5: ( end ) writing species: %1%") % T_SpeciesFilter::getName();
-    }
-
-private:
-
-    /** Writes a constant particle record (weighted for a real particle)
-     *
-     * @param params thread parameters
-     * @param recordPath path to the record
-     * @param numParticlesGlobal global number of particles in the species
-     * @param value of the record
-     * @param unitSI conversion factor to SI
-     * @param unitDimension power in terms of SI base units for this record
-     */
-    static void writeConstantRecord(
-        ThreadParams* params,
-        const std::string recordPath,
-        const uint64_t numParticlesGlobal,
-        const float_64 value,
-        const float_64 unitSI,
-        const std::vector<float_64>& unitDimension
-    )
-    {
-        typedef typename PICToSplash<float_X>::type SplashFloatXType;
-
-        ColTypeUInt32 ctUInt32;
-        ColTypeUInt64 ctUInt64;
-        ColTypeDouble ctDouble;
-        SplashFloatXType splashFloatXType;
-
-        /* openPMD base standard
-         *   write constant record
-         */
-        params->dataCollector->writeAttribute(
-            params->currentStep,
-            ctDouble, recordPath.c_str(),
-            "value", &value);
-
-        params->dataCollector->writeAttribute(
-            params->currentStep,
-            ctUInt64, recordPath.c_str(),
-            "shape",
-            1u, Dimensions(1,0,0),
-            &numParticlesGlobal);
-
-        params->dataCollector->writeAttribute(
-            params->currentStep,
-            ctDouble, recordPath.c_str(),
-            "unitSI", &unitSI);
-
-        params->dataCollector->writeAttribute(
-            params->currentStep,
-            ctDouble, recordPath.c_str(),
-            "unitDimension",
-            1u, Dimensions(7,0,0),
-            &(*unitDimension.begin()));
-
-        /** \todo check if always correct at this point, depends on attribute
-         *        and MW-solver/pusher implementation */
-        const float_X timeOffset( 0.0 );      // same type as "time" in basePath
-        params->dataCollector->writeAttribute(
-            params->currentStep,
-            splashFloatXType, recordPath.c_str(),
-            "timeOffset", &timeOffset);
-
-        /* ED-PIC extension:
-         *   - this is a record describing a *real* particle (0: false)
-         *   - it needs to be scaled linearly (w^1.0) to get the *macro*
-         *     particle record
-         */
-        const uint32_t macroWeighted( 0 );
-        params->dataCollector->writeAttribute(
-            params->currentStep,
-            ctUInt32, recordPath.c_str(),
-            "macroWeighted",
-            &macroWeighted);
-
-        const float_64 weightingPower( 1.0 );
-        params->dataCollector->writeAttribute(
-            params->currentStep,
-            ctDouble, recordPath.c_str(),
-            "weightingPower",
-            &weightingPower);
-    }
-};
-
-
-} //namspace hdf5
-
-} //namespace picongpu
diff --git a/include/picongpu/plugins/hdf5/openPMD/patchReader.cpp b/include/picongpu/plugins/hdf5/openPMD/patchReader.cpp
deleted file mode 100644
index d16ab342c0..0000000000
--- a/include/picongpu/plugins/hdf5/openPMD/patchReader.cpp
+++ /dev/null
@@ -1,132 +0,0 @@
-/* Copyright 2016-2020 Axel Huebl
- *
- * This file is part of PIConGPU.
- *
- * PIConGPU is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * PIConGPU is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with PIConGPU.
- * If not, see <http://www.gnu.org/licenses/>.
- */
-
-#if( ENABLE_HDF5 == 1 )
-
-#  include "picongpu/plugins/hdf5/openPMD/patchReader.hpp"
-#include "picongpu/plugins/misc/ComponentNames.hpp"
-
-
-namespace picongpu
-{
-namespace hdf5
-{
-namespace openPMD
-{
-    void PatchReader::checkSpatialTypeSize(
-            splash::DataCollector* const dc,
-            const uint32_t availableRanks,
-            const int32_t id,
-            const std::string particlePatchPathComponent
-    ) const
-    {
-        // will later read into 1D buffer from first position on
-        splash::Dimensions dstBuffer(availableRanks, 1, 1);
-        splash::Dimensions dstOffset(0, 0, 0);
-        // sizeRead will be set
-        splash::Dimensions sizeRead(0, 0, 0);
-
-        splash::CollectionType* colType = dc->readMeta(
-            id,
-            particlePatchPathComponent.c_str(),
-            dstBuffer,
-            dstOffset,
-            sizeRead );
-
-        // check if the 1D list of patches has the right length
-        assert( sizeRead[0] == availableRanks );
-
-        // currently only support uint64_t types to spare type conversation
-        assert( typeid(*colType) == typeid(splash::ColTypeUInt64) );
-
-        // free collections
-        delete( colType );
-        colType = nullptr;
-    }
-
-    void PatchReader::readPatchAttribute(
-        splash::DataCollector* const dc,
-        const uint32_t availableRanks,
-        const int32_t id,
-        const std::string particlePatchPathComponent,
-        uint64_t* const dest
-    ) const
-    {
-        // will later read into 1D buffer from first position on
-        splash::Dimensions dstBuffer(availableRanks, 1, 1);
-        splash::Dimensions dstOffset(0, 0, 0);
-        // sizeRead will be set
-        splash::Dimensions sizeRead(0, 0, 0);
-
-        // check if types, number of patches and names are supported
-        checkSpatialTypeSize( dc, availableRanks, id, particlePatchPathComponent.c_str() );
-
-        // read actual offset and extent data of particle patch component
-        dc->read( id,
-                  particlePatchPathComponent.c_str(),
-                  sizeRead,
-                  (void*)dest );
-    }
-
-    picongpu::openPMD::ParticlePatches PatchReader::operator()(
-        splash::DataCollector* const dc,
-        const uint32_t availableRanks,
-        const uint32_t dimensionality,
-        const int32_t id,
-        const std::string particlePatchPath
-    ) const
-    {
-        // allocate memory for patches
-        picongpu::openPMD::ParticlePatches particlePatches( availableRanks );
-        const auto componentNames = plugins::misc::getComponentNames( dimensionality );
-        for( uint32_t d = 0; d < dimensionality; ++d )
-        {
-            readPatchAttribute(
-                dc, availableRanks, id,
-                particlePatchPath + std::string("offset/") + componentNames[d],
-                particlePatches.getOffsetComp( d )
-            );
-            readPatchAttribute(
-                dc, availableRanks, id,
-                particlePatchPath + std::string("extent/") + componentNames[d],
-                particlePatches.getExtentComp( d )
-            );
-        }
-
-        // read number of particles and their starting point (offset), too
-        readPatchAttribute(
-            dc, availableRanks, id,
-            particlePatchPath + std::string("numParticles"),
-            &(*particlePatches.numParticles.begin())
-        );
-        readPatchAttribute(
-            dc, availableRanks, id,
-            particlePatchPath + std::string("numParticlesOffset"),
-            &(*particlePatches.numParticlesOffset.begin())
-        );
-
-        // return struct of array with particle patches
-        return particlePatches;
-    }
-
-} // namespace openPMD
-} // namespace hdf5
-} // namespace picongpu
-
-#endif
diff --git a/include/picongpu/plugins/hdf5/openPMD/patchReader.hpp b/include/picongpu/plugins/hdf5/openPMD/patchReader.hpp
deleted file mode 100644
index 04e8510047..0000000000
--- a/include/picongpu/plugins/hdf5/openPMD/patchReader.hpp
+++ /dev/null
@@ -1,120 +0,0 @@
-/* Copyright 2016-2020 Axel Huebl
- *
- * This file is part of PIConGPU.
- *
- * PIConGPU is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * PIConGPU is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with PIConGPU.
- * If not, see <http://www.gnu.org/licenses/>.
- */
-
-#pragma once
-
-#include "picongpu/plugins/common/particlePatches.hpp"
-
-#if( ENABLE_HDF5 == 1 )
-#  include <splash/splash.h>
-#endif
-
-#include <vector>
-#include <list>
-#include <string>
-#include <iostream>
-#include <typeinfo>
-
-namespace picongpu
-{
-namespace hdf5
-{
-namespace openPMD
-{
-    class PatchReader;
-
-#if( ENABLE_HDF5 == 1 )
-    /** Functor to populate and validate the list of particle patches
-     */
-    class PatchReader
-    {
-    private:
-        /** Determine the variable type for `offset` and `extent`
-         *
-         * In particle patches, the `offset` and `extent` can be of
-         * user-defined types. This function allows to determine which
-         * one was used and how many patches exist.
-         *
-         * @note currently we force the type to be `uint64_t`,
-         *       we can implement type conversions later on
-         * @note currently we force the number of patches
-         *       to stay constant during restarts
-         *
-         * @param dc parallel libSplash DataCollector
-         * @param availableRanks MPI ranks in the restarted simulation
-         *        that are currently waiting to find patches
-         * @param id iteration in file
-         * @param particlePatchPathComponent string such as
-         *             "particles/e/particlePatches/numParticles" or
-         *             "particles/e/particlePatches/offset/x"
-         */
-        void checkSpatialTypeSize(
-            splash::DataCollector* const dc,
-            const uint32_t availableRanks,
-            const int32_t id,
-            const std::string particlePatchPathComponent
-        ) const;
-
-        /** Read a specific record component of the particle patch
-         *
-         * Read for example: numParticles or offset/x
-         *
-         * @param[in]  dc pointer to an open splash::DataCollector
-         * @param[in]  availableRanks MPI ranks in the restarted simulation
-         *             that are currently waiting to find patches
-         * @param[in]  id time step to read
-         * @param[in]  particlePatchPathComponent string such as
-         *             "particles/e/particlePatches/numParticles" or
-         *             "particles/e/particlePatches/offset/x"
-         * @param[out] dest beginning of c-array of length size()
-         *             to write the patch record component to
-         */
-        void readPatchAttribute(
-            splash::DataCollector* const dc,
-            const uint32_t availableRanks,
-            const int32_t id,
-            const std::string particlePatchPathComponent,
-            uint64_t* const dest
-        ) const;
-
-    public:
-        /** Build up the global list of patches
-         *
-         * @param dc parallel libSplash DataCollector
-         * @param availableRanks MPI ranks in the restarted simulation
-         *        that are currently waiting to find patches
-         * @param dimensionality the PIConGPU simDim
-         * @param id iteration in file
-         * @param particlePatchPath in-file path to a specific particle patch dir
-         *
-         * @return picongpu::openPMD::ParticlePatches struct of arrays with patches
-         */
-        picongpu::openPMD::ParticlePatches operator()(
-            splash::DataCollector* const dc,
-            const uint32_t availableRanks,
-            const uint32_t dimensionality,
-            const int32_t id,
-            const std::string particlePatchPath
-        ) const;
-    };
-#endif
-
-} // namespace openPMD
-} // namespace hdf5
-} // namespace picongpu
diff --git a/include/picongpu/plugins/hdf5/restart/LoadParticleAttributesFromHDF5.hpp b/include/picongpu/plugins/hdf5/restart/LoadParticleAttributesFromHDF5.hpp
deleted file mode 100644
index 3ce6268b5f..0000000000
--- a/include/picongpu/plugins/hdf5/restart/LoadParticleAttributesFromHDF5.hpp
+++ /dev/null
@@ -1,127 +0,0 @@
-/* Copyright 2013-2020 Axel Huebl, Felix Schmitt, Rene Widera
- *
- * This file is part of PIConGPU.
- *
- * PIConGPU is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * PIConGPU is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with PIConGPU.
- * If not, see <http://www.gnu.org/licenses/>.
- */
-
-
-
-#pragma once
-
-#include "picongpu/simulation_defines.hpp"
-#include "picongpu/plugins/hdf5/HDF5Writer.def"
-#include "picongpu/plugins/misc/ComponentNames.hpp"
-#include "picongpu/traits/PICToSplash.hpp"
-#include "picongpu/traits/PICToOpenPMD.hpp"
-#include <pmacc/traits/GetComponentsType.hpp>
-#include <pmacc/traits/GetNComponents.hpp>
-#include <pmacc/traits/Resolve.hpp>
-#include <pmacc/assert.hpp>
-
-
-namespace picongpu
-{
-
-namespace hdf5
-{
-using namespace pmacc;
-
-using namespace splash;
-
-/** Load attribute of a species from HDF5 checkpoint file
- *
- * @tparam T_Identifier identifier of species attribute
- */
-template< typename T_Identifier>
-struct LoadParticleAttributesFromHDF5
-{
-
-    /** read attributes from hdf5 file
-     *
-     * @param params thread params with domainwriter, ...
-     * @param frame frame with all particles
-     * @param subGroup path to the group in the hdf5 file
-     * @param particlesOffset read offset in the attribute array
-     * @param elements number of elements which should be read the attribute array
-     */
-    template<typename FrameType>
-    HINLINE void operator()(
-                            ThreadParams* params,
-                            FrameType& frame,
-                            const std::string subGroup,
-                            const uint64_t particlesOffset,
-                            const uint64_t elements)
-    {
-
-        typedef T_Identifier Identifier;
-        typedef typename pmacc::traits::Resolve<Identifier>::type::type ValueType;
-        const uint32_t components = GetNComponents<ValueType>::value;
-        typedef typename GetComponentsType<ValueType>::type ComponentType;
-        typedef typename PICToSplash<ComponentType>::type SplashType;
-
-        log<picLog::INPUT_OUTPUT > ("HDF5:  ( begin ) load species attribute: %1%") % Identifier::getName();
-
-        const auto componentNames = plugins::misc::getComponentNames( components );
-
-        ComponentType* tmpArray = nullptr;
-        if( elements > 0 )
-            tmpArray = new ComponentType[elements];
-
-        ParallelDomainCollector* dataCollector = params->dataCollector;
-
-        // avoid deadlock between not finished pmacc tasks and mpi calls in splash/HDF5
-        __getTransactionEvent().waitForFinished();
-
-        for (uint32_t d = 0; d < components; d++)
-        {
-            OpenPMDName<T_Identifier> openPMDName;
-            std::stringstream datasetName;
-            datasetName << subGroup << "/" << openPMDName();
-            if (components > 1)
-                datasetName << "/" << componentNames[d];
-
-            ValueType* dataPtr = frame.getIdentifier(Identifier()).getPointer();
-            Dimensions sizeRead(0, 0, 0);
-            // read one component from file to temporary array
-            dataCollector->read(params->currentStep,
-                               Dimensions(elements, 1, 1),
-                               Dimensions(particlesOffset, 0, 0),
-                               datasetName.str().c_str(),
-                               sizeRead,
-                               tmpArray
-                               );
-            PMACC_ASSERT(sizeRead[0] == elements);
-
-            /* copy component from temporary array to array of structs */
-            #pragma omp parallel for
-            for (size_t i = 0; i < elements; ++i)
-            {
-                ComponentType& ref = ((ComponentType*) dataPtr)[i * components + d];
-                ref = tmpArray[i];
-            }
-        }
-        __deleteArray(tmpArray);
-
-        log<picLog::INPUT_OUTPUT > ("HDF5:  ( end ) load species attribute: %1%") %
-            Identifier::getName();
-    }
-
-};
-
-} //namspace hdf5
-
-} //namespace picongpu
-
diff --git a/include/picongpu/plugins/hdf5/restart/LoadSpecies.hpp b/include/picongpu/plugins/hdf5/restart/LoadSpecies.hpp
deleted file mode 100644
index 7dd94f2984..0000000000
--- a/include/picongpu/plugins/hdf5/restart/LoadSpecies.hpp
+++ /dev/null
@@ -1,216 +0,0 @@
-/* Copyright 2013-2020 Rene Widera, Felix Schmitt
- *
- * This file is part of PIConGPU.
- *
- * PIConGPU is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * PIConGPU is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with PIConGPU.
- * If not, see <http://www.gnu.org/licenses/>.
- */
-
-#pragma once
-
-#include "picongpu/simulation_defines.hpp"
-
-#include "picongpu/plugins/hdf5/HDF5Writer.def"
-#include "picongpu/plugins/ISimulationPlugin.hpp"
-
-#include "picongpu/plugins/output/WriteSpeciesCommon.hpp"
-#include "picongpu/plugins/hdf5/restart/LoadParticleAttributesFromHDF5.hpp"
-
-#include "picongpu/plugins/common/particlePatches.hpp"
-#include "picongpu/plugins/hdf5/openPMD/patchReader.hpp"
-
-#include <pmacc/meta/conversion/MakeSeq.hpp>
-#include <pmacc/meta/conversion/RemoveFromSeq.hpp>
-#include <pmacc/mappings/kernel/AreaMapping.hpp>
-#include <pmacc/particles/ParticleDescription.hpp>
-#include <pmacc/particles/operations/splitIntoListOfFrames.kernel>
-#include <pmacc/dataManagement/DataConnector.hpp>
-
-#include <boost/mpl/vector.hpp>
-#include <boost/mpl/pair.hpp>
-#include <boost/mpl/size.hpp>
-#include <boost/mpl/at.hpp>
-#include <boost/mpl/begin_end.hpp>
-#include <boost/mpl/find.hpp>
-#include <boost/type_traits.hpp>
-#include <boost/type_traits/is_same.hpp>
-
-
-namespace picongpu
-{
-
-namespace hdf5
-{
-using namespace pmacc;
-
-using namespace splash;
-
-/** Load species from HDF5 checkpoint file
- *
- * @tparam T_Species type of species
- *
- */
-template< typename T_Species >
-struct LoadSpecies
-{
-public:
-
-    typedef T_Species ThisSpecies;
-    typedef typename ThisSpecies::FrameType FrameType;
-    typedef typename FrameType::ParticleDescription ParticleDescription;
-    typedef typename FrameType::ValueTypeSeq ParticleAttributeList;
-
-
-    /* delete multiMask and localCellIdx in hdf5 particle*/
-    typedef bmpl::vector2<multiMask, localCellIdx> TypesToDelete;
-    typedef typename RemoveFromSeq<ParticleAttributeList, TypesToDelete>::type ParticleCleanedAttributeList;
-
-    /* add totalCellIdx for hdf5 particle*/
-    typedef typename MakeSeq<
-        ParticleCleanedAttributeList,
-        totalCellIdx
-    >::type ParticleNewAttributeList;
-
-    typedef
-    typename ReplaceValueTypeSeq<ParticleDescription, ParticleNewAttributeList>::type
-    NewParticleDescription;
-
-    typedef Frame<OperatorCreateVectorBox, NewParticleDescription> Hdf5FrameType;
-
-    /** Load species from HDF5 checkpoint file
-     *
-     * @param params thread params with domainwriter, ...
-     * @param restartChunkSize number of particles processed in one kernel call
-     */
-    HINLINE void operator()(ThreadParams* params, const uint32_t restartChunkSize)
-    {
-        std::string const speciesName = FrameType::getName();
-        log<picLog::INPUT_OUTPUT > ("HDF5: (begin) load species: %1%") % speciesName;
-        DataConnector &dc = Environment<>::get().DataConnector();
-        GridController<simDim> &gc = Environment<simDim>::get().GridController();
-
-        const std::string speciesSubGroup(
-            std::string("particles/") + speciesName + std::string("/")
-        );
-        const pmacc::Selection<simDim>& localDomain = Environment<simDim>::get().SubGrid().getLocalDomain();
-        const pmacc::Selection<simDim>& globalDomain = Environment<simDim>::get().SubGrid().getGlobalDomain();
-
-        // load particle without copying particle data to host
-        auto speciesTmp = dc.get< ThisSpecies >( FrameType::getName(), true );
-
-        // count total number of particles on the device
-        uint64_cu totalNumParticles = 0;
-        uint64_t particleOffset = 0;
-
-        // load particle patches offsets to find own patch
-        const std::string particlePatchesPath(
-            speciesSubGroup + std::string("particlePatches/")
-        );
-
-        // avoid deadlock between not finished pmacc tasks and mpi calls in splash/HDF5
-        __getTransactionEvent().waitForFinished();
-
-        // read particle patches
-        openPMD::PatchReader patchReader;
-
-        picongpu::openPMD::ParticlePatches particlePatches(
-            patchReader(
-                params->dataCollector,
-                gc.getGlobalSize(),
-                simDim,
-                params->currentStep,
-                particlePatchesPath
-            )
-        );
-
-        /** search my entry (using my cell offset and my local grid size)
-         *
-         * \note if you want to restart with a changed GPU configuration, either
-         * post-process the particle-patches in the file or implement to find
-         * all contributing patches and then filter the particles inside those
-         * by position
-         *
-         * \see plugins/hdf5/WriteSpecies.hpp `WriteSpecies::operator()`
-         *      as its counterpart
-         */
-        const DataSpace<simDim> patchOffset =
-            globalDomain.offset +
-            params->window.globalDimensions.offset +
-            params->window.localDimensions.offset;
-        const DataSpace<simDim> patchExtent =
-            params->window.localDimensions.size;
-
-        for( size_t i = 0; i < gc.getGlobalSize(); ++i )
-        {
-            bool exactlyMyPatch = true;
-
-            for( uint32_t d = 0; d < simDim; ++d )
-            {
-                if( particlePatches.getOffsetComp( d )[ i ] != (uint64_t)patchOffset[ d ] )
-                    exactlyMyPatch = false;
-                if( particlePatches.getExtentComp( d )[ i ] != (uint64_t)patchExtent[ d ] )
-                    exactlyMyPatch = false;
-            }
-
-            if( exactlyMyPatch )
-            {
-                totalNumParticles = particlePatches.numParticles[ i ];
-                particleOffset = particlePatches.numParticlesOffset[ i ];
-                break;
-            }
-        }
-
-        log<picLog::INPUT_OUTPUT > ("Loading %1% particles from offset %2%") %
-            (long long unsigned) totalNumParticles % (long long unsigned) particleOffset;
-
-        Hdf5FrameType hostFrame;
-        log<picLog::INPUT_OUTPUT > ("HDF5:  malloc mapped memory: %1%") % speciesName;
-        /*malloc mapped memory*/
-        meta::ForEach<typename Hdf5FrameType::ValueTypeSeq, MallocMemory<bmpl::_1> > mallocMem;
-        mallocMem(hostFrame, totalNumParticles);
-
-        log<picLog::INPUT_OUTPUT > ("HDF5:  get mapped memory device pointer: %1%") % speciesName;
-        /*load device pointer of mapped memory*/
-        Hdf5FrameType deviceFrame;
-        meta::ForEach<typename Hdf5FrameType::ValueTypeSeq, GetDevicePtr<bmpl::_1> > getDevicePtr;
-        getDevicePtr(deviceFrame, hostFrame);
-
-        meta::ForEach<typename Hdf5FrameType::ValueTypeSeq, LoadParticleAttributesFromHDF5<bmpl::_1> > loadAttributes;
-        loadAttributes(params, hostFrame, speciesSubGroup, particleOffset, totalNumParticles);
-
-        if (totalNumParticles != 0)
-        {
-            pmacc::particles::operations::splitIntoListOfFrames(
-                *speciesTmp,
-                deviceFrame,
-                totalNumParticles,
-                restartChunkSize,
-                globalDomain.offset + localDomain.offset,
-                totalCellIdx_,
-                *(params->cellDescription),
-                picLog::INPUT_OUTPUT()
-            );
-
-            /*free host memory*/
-            meta::ForEach<typename Hdf5FrameType::ValueTypeSeq, FreeMemory<bmpl::_1> > freeMem;
-            freeMem(hostFrame);
-            log<picLog::INPUT_OUTPUT > ("HDF5: ( end ) load species: %1%") % speciesName;
-        }
-    }
-};
-
-
-} //namspace hdf5
-
-} //namespace picongpu
diff --git a/include/picongpu/plugins/hdf5/restart/RestartFieldLoader.hpp b/include/picongpu/plugins/hdf5/restart/RestartFieldLoader.hpp
deleted file mode 100644
index 2552ea41f3..0000000000
--- a/include/picongpu/plugins/hdf5/restart/RestartFieldLoader.hpp
+++ /dev/null
@@ -1,229 +0,0 @@
-/* Copyright 2014-2020 Axel Huebl, Felix Schmitt, Heiko Burau, Rene Widera
- *
- * This file is part of PIConGPU.
- *
- * PIConGPU is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * PIConGPU is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with PIConGPU.
- * If not, see <http://www.gnu.org/licenses/>.
- */
-
-#pragma once
-
-#include "picongpu/simulation_defines.hpp"
-
-#include <pmacc/particles/frame_types.hpp>
-#include "picongpu/fields/FieldE.hpp"
-#include "picongpu/fields/FieldB.hpp"
-#include "picongpu/plugins/misc/ComponentNames.hpp"
-#include "picongpu/simulation/control/MovingWindow.hpp"
-#include "picongpu/traits/IsFieldDomainBound.hpp"
-
-#include <pmacc/dataManagement/DataConnector.hpp>
-#include <pmacc/dimensions/DataSpace.hpp>
-#include <pmacc/dimensions/GridLayout.hpp>
-
-#include <splash/splash.h>
-
-#include <string>
-#include <sstream>
-
-
-namespace picongpu
-{
-
-namespace hdf5
-{
-
-/**
- * Helper class for HDF5Writer plugin to load fields from parallel libSplash files.
- */
-class RestartFieldLoader
-{
-public:
-    template<class Data>
-    static void loadField(
-        Data& field,
-        const uint32_t numComponents,
-        std::string objectName,
-        ThreadParams *params,
-        const bool isDomainBound
-    )
-    {
-        log<picLog::INPUT_OUTPUT > ("Begin loading field '%1%'") % objectName;
-        const DataSpace<simDim> field_guard = field.getGridLayout().getGuard();
-
-        const uint32_t numSlides = MovingWindow::getInstance().getSlideCounter(params->currentStep);
-        const pmacc::Selection<simDim>& localDomain = Environment<simDim>::get().SubGrid().getLocalDomain();
-
-        using ValueType = typename Data::ValueType;
-        field.getHostBuffer().setValue(ValueType::create(0.0));
-
-        const auto componentNames = plugins::misc::getComponentNames( numComponents );
-
-        /* globalSlideOffset due to gpu slides between origin at time step 0
-         * and origin at current time step
-         * ATTENTION: splash offset are globalSlideOffset + picongpu offsets
-         */
-        DataSpace<simDim> globalSlideOffset;
-        globalSlideOffset.y() = numSlides * localDomain.size.y();
-
-        Dimensions domain_offset(0, 0, 0);
-        for (uint32_t d = 0; d < simDim; ++d)
-            domain_offset[d] = localDomain.offset[d] + globalSlideOffset[d];
-
-        if (Environment<simDim>::get().GridController().getPosition().y() == 0)
-            domain_offset[1] += params->window.globalDimensions.offset.y();
-
-        Dimensions local_domain_size;
-        for (uint32_t d = 0; d < simDim; ++d)
-            local_domain_size[d] = params->window.localDimensions.size[d];
-        int elementCount = params->window.localDimensions.size.productOfComponents();
-        bool useLinearIdxAsDestination = false;
-
-        /* Patch for non-domain-bound fields
-         * This is an ugly fix to allow output of reduced 1d PML buffers,
-         * that are the same size on each domain.
-         * This code is to be replaced with the openPMD output plugin soon.
-         */
-        if( !isDomainBound )
-        {
-            auto const field_layout = params->gridLayout;
-            auto const field_no_guard = field_layout.getDataSpaceWithoutGuarding();
-            elementCount = field_no_guard.productOfComponents();
-            // Number of elements on each local domain
-            local_domain_size = Dimensions(
-                elementCount,
-                1,
-                1
-            );
-            auto const & gridController = Environment<simDim>::get().GridController();
-            auto const rank = gridController.getGlobalRank();
-            domain_offset = Dimensions(
-                rank * elementCount,
-                0,
-                0
-            );
-            useLinearIdxAsDestination = true;
-        }
-
-        // avoid deadlock between not finished pmacc tasks and mpi calls in splash/HDF5
-        __getTransactionEvent().waitForFinished();
-
-        auto destBox = field.getHostBuffer().getDataBox();
-        for (uint32_t i = 0; i < numComponents; ++i)
-        {
-            // Read the subdomain which belongs to our mpi position.
-            // The total grid size must match the grid size of the stored data.
-            log<picLog::INPUT_OUTPUT > ("Read from domain: offset=%1% size=%2%") %
-                domain_offset.toString() % local_domain_size.toString();
-            DomainCollector::DomDataClass data_class;
-            DataContainer *field_container =
-                params->dataCollector->readDomain(params->currentStep,
-                                           (std::string("fields/") + objectName +
-                                            std::string("/") + componentNames[i]).c_str(),
-                                           Domain(domain_offset, local_domain_size),
-                                           &data_class);
-
-            for (int linearId = 0; linearId < elementCount; ++linearId)
-            {
-                DataSpace<simDim> destIdx;
-                if( useLinearIdxAsDestination )
-                {
-                    destIdx[ 0 ] = linearId;
-                }
-                else
-                {
-                    /* calculate index inside the moving window domain which is located on the local grid*/
-                    destIdx = DataSpaceOperations<simDim>::map(params->window.localDimensions.size, linearId);
-                    /* jump over guard and local sliding window offset*/
-                    destIdx += field_guard + params->localWindowToDomainOffset;
-                }
-                destBox(destIdx)[i] = ((float_X*) (field_container->getIndex(0)->getData()))[linearId];
-            }
-
-            delete field_container;
-        }
-
-        field.hostToDevice();
-
-        __getTransactionEvent().waitForFinished();
-
-        log<picLog::INPUT_OUTPUT > ("Read from domain: offset=%1% size=%2%") %
-            domain_offset.toString() % local_domain_size.toString();
-        log<picLog::INPUT_OUTPUT > ("Finished loading field '%1%'") % objectName;
-    }
-
-    template<class Data>
-    static void cloneField(Data& fieldDest, Data& fieldSrc, std::string objectName)
-    {
-        log<picLog::INPUT_OUTPUT > ("Begin cloning field '%1%'") % objectName;
-        DataSpace<simDim> field_grid = fieldDest.getGridLayout().getDataSpace();
-
-        size_t elements = field_grid.productOfComponents();
-        float3_X *ptrDest = fieldDest.getHostBuffer().getDataBox().getPointer();
-        float3_X *ptrSrc = fieldSrc.getHostBuffer().getDataBox().getPointer();
-
-        for (size_t k = 0; k < elements; ++k)
-        {
-            ptrDest[k] = ptrSrc[k];
-        }
-
-        fieldDest.hostToDevice();
-
-        __getTransactionEvent().waitForFinished();
-
-        log<picLog::INPUT_OUTPUT > ("Finished cloning field '%1%'") % objectName;
-    }
-};
-
-/**
- * Hepler class for HDF5Writer (forEach operator) to load a field from HDF5
- *
- * @tparam T_Field field class to load
- */
-template< typename T_Field >
-struct LoadFields
-{
-public:
-
-    HDINLINE void operator()(ThreadParams* params)
-    {
-#ifndef __CUDA_ARCH__
-        DataConnector &dc = Environment<>::get().DataConnector();
-        ThreadParams *tp = params;
-
-        /* load field without copying data to host */
-        std::shared_ptr< T_Field > field = dc.get< T_Field >( T_Field::getName(), true );
-        tp->gridLayout = field->getGridLayout();
-
-        /* load from HDF5 */
-        bool const isDomainBound = traits::IsFieldDomainBound< T_Field >::value;
-        RestartFieldLoader::loadField(
-            field->getGridBuffer(),
-            static_cast< uint32_t >( T_Field::numComponents ),
-            T_Field::getName(),
-            tp,
-            isDomainBound
-        );
-
-        dc.releaseData( T_Field::getName() );
-#endif
-    }
-
-};
-
-using namespace pmacc;
-using namespace splash;
-
-} //namespace hdf5
-} //namespace picongpu
diff --git a/include/picongpu/plugins/hdf5/writer/Field.hpp b/include/picongpu/plugins/hdf5/writer/Field.hpp
deleted file mode 100644
index bdf778d759..0000000000
--- a/include/picongpu/plugins/hdf5/writer/Field.hpp
+++ /dev/null
@@ -1,281 +0,0 @@
-/* Copyright 2014-2020 Axel Huebl, Felix Schmitt, Heiko Burau, Rene Widera,
- *                     Sergei Bastrakov
- *
- * This file is part of PIConGPU.
- *
- * PIConGPU is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * PIConGPU is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with PIConGPU.
- * If not, see <http://www.gnu.org/licenses/>.
- */
-
-#pragma once
-
-#include "picongpu/simulation_defines.hpp"
-#include "picongpu/plugins/hdf5/HDF5Writer.def"
-#include "picongpu/plugins/misc/ComponentNames.hpp"
-#include "picongpu/traits/PICToSplash.hpp"
-#include <pmacc/traits/GetComponentsType.hpp>
-#include <pmacc/traits/GetNComponents.hpp>
-#include <pmacc/assert.hpp>
-
-#include <string>
-
-namespace picongpu
-{
-
-namespace hdf5
-{
-
-using namespace pmacc;
-using namespace splash;
-
-struct Field
-{
-
-    /* \param inCellPosition std::vector<std::vector<float_X> > with the outer
-     *                       vector for each component and the inner vector for
-     *                       the simDim position offset within the cell [0.0; 1.0)
-     */
-    template<
-        typename T_ValueType,
-        typename T_DataBoxType
-    >
-    static void writeField(
-        ThreadParams *params,
-        const std::string name,
-        std::vector<float_64> unit,
-        std::vector<float_64> unitDimension,
-        std::vector<std::vector<float_X> > inCellPosition,
-        float_X timeOffset,
-        T_DataBoxType dataBox,
-        const T_ValueType&,
-        const bool isDomainBound
-    )
-    {
-        typedef T_DataBoxType NativeDataBoxType;
-        typedef T_ValueType ValueType;
-        typedef typename GetComponentsType<ValueType>::type ComponentType;
-        typedef typename PICToSplash<ComponentType>::type SplashType;
-        typedef typename PICToSplash<float_X>::type SplashFloatXType;
-
-        const uint32_t nComponents = GetNComponents<ValueType>::value;
-
-        SplashType splashType;
-        ColTypeDouble ctDouble;
-        SplashFloatXType splashFloatXType;
-
-        log<picLog::INPUT_OUTPUT > ("HDF5 write field: %1% %2%") %
-            name % nComponents;
-
-        /* parameter checking */
-        PMACC_ASSERT( unit.size() == nComponents );
-        PMACC_ASSERT( inCellPosition.size() == nComponents );
-        for( uint32_t n = 0; n < nComponents; ++n )
-            PMACC_ASSERT( inCellPosition.at(n).size() == simDim );
-        PMACC_ASSERT(unitDimension.size() == 7); // seven openPMD base units
-
-        /* component names */
-        const std::string recordName = std::string("fields/") + name;
-
-        const auto componentNames = plugins::misc::getComponentNames( nComponents );
-
-        /*data to describe source buffer*/
-        GridLayout<simDim> field_layout = params->gridLayout;
-        DataSpace<simDim> field_no_guard = params->window.localDimensions.size;
-        DataSpace<simDim> field_guard = field_layout.getGuard() + params->localWindowToDomainOffset;
-        /* globalSlideOffset due to gpu slides between origin at time step 0
-         * and origin at current time step
-         * ATTENTION: splash offset are globalSlideOffset + picongpu offsets
-         */
-        DataSpace<simDim> globalSlideOffset;
-        const pmacc::Selection<simDim>& localDomain = Environment<simDim>::get().SubGrid().getLocalDomain();
-        const uint32_t numSlides = MovingWindow::getInstance().getSlideCounter(params->currentStep);
-        globalSlideOffset.y() += numSlides * localDomain.size.y();
-
-        Dimensions splashGlobalDomainOffset(0, 0, 0);
-        Dimensions splashGlobalOffsetFile(0, 0, 0);
-        Dimensions splashGlobalDomainSize(1, 1, 1);
-
-        for (uint32_t d = 0; d < simDim; ++d)
-        {
-            splashGlobalOffsetFile[d] = localDomain.offset[d];
-            splashGlobalDomainOffset[d] = params->window.globalDimensions.offset[d] + globalSlideOffset[d];
-            splashGlobalDomainSize[d] = params->window.globalDimensions.size[d];
-        }
-
-        splashGlobalOffsetFile[1] = std::max(0, localDomain.offset[1] -
-                                             params->window.globalDimensions.offset[1]);
-
-        /* Patch for non-domain-bound fields
-         * This is an ugly fix to allow output of reduced 1d PML buffers,
-         * that are the same size on each domain.
-         * This code is to be replaced with the openPMD output plugin soon.
-         */
-        if( !isDomainBound )
-        {
-            field_no_guard = field_layout.getDataSpaceWithoutGuarding();
-            auto const localSize = field_no_guard.productOfComponents();
-            auto const & gridController = Environment<simDim>::get().GridController();
-            auto const numRanks = gridController.getGlobalSize();
-            auto const rank = gridController.getGlobalRank();
-            // Number of elements on all domains combined
-            splashGlobalDomainSize = Dimensions(
-                localSize * numRanks,
-                1,
-                1
-            );
-            // Offset for this rank
-            splashGlobalOffsetFile = Dimensions(
-                localSize * rank,
-                0,
-                0
-            );
-            // We are not affected by moving window, so all have offset to 0
-            splashGlobalDomainOffset = Dimensions(
-                0,
-                0,
-                0
-            );
-        }
-
-        size_t tmpArraySize = field_no_guard.productOfComponents();
-        ComponentType* tmpArray = new ComponentType[tmpArraySize];
-
-        typedef DataBoxDim1Access<NativeDataBoxType > D1Box;
-        D1Box d1Access(dataBox.shift(field_guard), field_no_guard);
-
-        for (uint32_t n = 0; n < nComponents; n++)
-        {
-            /* copy data to temp array
-             * tmpArray has the size of the data without any offsets
-             */
-            for (size_t i = 0; i < tmpArraySize; ++i)
-            {
-                tmpArray[i] = d1Access[i][n];
-            }
-
-            std::stringstream datasetName;
-            datasetName << recordName;
-            if (nComponents > 1)
-                datasetName << "/" << componentNames.at(n);
-
-            Dimensions sizeSrcData(1, 1, 1);
-
-            for (uint32_t d = 0; d < simDim; ++d)
-            {
-                sizeSrcData[d] = field_no_guard[d];
-            }
-
-            // avoid deadlock between not finished pmacc tasks and mpi calls in splash/HDF5
-            __getTransactionEvent().waitForFinished();
-            params->dataCollector->writeDomain(params->currentStep,             /* id == time step */
-                                               splashGlobalDomainSize,          /* total size of dataset over all processes */
-                                               splashGlobalOffsetFile,          /* write offset for this process */
-                                               splashType,                      /* data type */
-                                               simDim,                          /* NDims spatial dimensionality of the field */
-                                               splash::Selection(sizeSrcData),  /* data size of this process */
-                                               datasetName.str().c_str(),       /* data set name */
-                                               splash::Domain(
-                                                      splashGlobalDomainOffset, /* offset of the global domain */
-                                                      splashGlobalDomainSize    /* size of the global domain */
-                                               ),
-                                               DomainCollector::GridType,
-                                               tmpArray);
-
-            /* attributes */
-            params->dataCollector->writeAttribute(params->currentStep,
-                                                  splashFloatXType, datasetName.str().c_str(),
-                                                  "position",
-                                                  1u, Dimensions(simDim,0,0),
-                                                  &(*inCellPosition.at(n).begin()));
-
-            params->dataCollector->writeAttribute(params->currentStep,
-                                                  ctDouble, datasetName.str().c_str(),
-                                                  "unitSI", &(unit.at(n)));
-        }
-        __deleteArray(tmpArray);
-
-
-        params->dataCollector->writeAttribute(params->currentStep,
-                                              ctDouble, recordName.c_str(),
-                                              "unitDimension",
-                                              1u, Dimensions(7,0,0),
-                                              &(*unitDimension.begin()));
-
-        params->dataCollector->writeAttribute(params->currentStep,
-                                              splashFloatXType, recordName.c_str(),
-                                              "timeOffset", &timeOffset);
-
-        const std::string geometry("cartesian");
-        ColTypeString ctGeometry(geometry.length());
-        params->dataCollector->writeAttribute(params->currentStep,
-                                              ctGeometry, recordName.c_str(),
-                                              "geometry", geometry.c_str());
-
-        const std::string dataOrder("C");
-        ColTypeString ctDataOrder(dataOrder.length());
-        params->dataCollector->writeAttribute(params->currentStep,
-                                              ctDataOrder, recordName.c_str(),
-                                              "dataOrder", dataOrder.c_str());
-
-        char axisLabels[simDim][2];
-        ColTypeString ctAxisLabels(1);
-        for( uint32_t d = 0; d < simDim; ++d )
-        {
-            axisLabels[simDim-1-d][0] = char('x' + d); // 3D: F[z][y][x], 2D: F[y][x]
-            axisLabels[simDim-1-d][1] = '\0';          // terminator is important!
-        }
-        params->dataCollector->writeAttribute(params->currentStep,
-                                              ctAxisLabels, recordName.c_str(),
-                                              "axisLabels",
-                                              1u, Dimensions(simDim,0,0),
-                                              axisLabels);
-
-        // cellSize is {x, y, z} but fields are F[z][y][x]
-        std::vector<float_X> gridSpacing(simDim, 0.0);
-        for( uint32_t d = 0; d < simDim; ++d )
-            gridSpacing.at(simDim-1-d) = cellSize[d];
-        params->dataCollector->writeAttribute(params->currentStep,
-                                              splashFloatXType, recordName.c_str(),
-                                              "gridSpacing",
-                                              1u, Dimensions(simDim,0,0),
-                                              &(*gridSpacing.begin()));
-
-        // splashGlobalDomainOffset is {x, y, z} but fields are F[z][y][x]
-        std::vector<float_64> gridGlobalOffset(simDim, 0.0);
-        for( uint32_t d = 0; d < simDim; ++d )
-            gridGlobalOffset.at(simDim-1-d) =
-                float_64(cellSize[d]) *
-                float_64(splashGlobalDomainOffset[d]);
-        params->dataCollector->writeAttribute(params->currentStep,
-                                              ctDouble, recordName.c_str(),
-                                              "gridGlobalOffset",
-                                              1u, Dimensions(simDim,0,0),
-                                              &(*gridGlobalOffset.begin()));
-
-        params->dataCollector->writeAttribute(params->currentStep,
-                                              ctDouble, recordName.c_str(),
-                                              "gridUnitSI", &UNIT_LENGTH);
-
-        const std::string fieldSmoothing("none");
-        ColTypeString ctFieldSmoothing(fieldSmoothing.length());
-        params->dataCollector->writeAttribute(params->currentStep,
-                                              ctFieldSmoothing, recordName.c_str(),
-                                              "fieldSmoothing", fieldSmoothing.c_str());
-    }
-
-};
-
-} //namspace hdf5
-
-} //namespace picongpu
diff --git a/include/picongpu/plugins/hdf5/writer/ParticleAttribute.hpp b/include/picongpu/plugins/hdf5/writer/ParticleAttribute.hpp
deleted file mode 100644
index 4ed258a99b..0000000000
--- a/include/picongpu/plugins/hdf5/writer/ParticleAttribute.hpp
+++ /dev/null
@@ -1,219 +0,0 @@
-/* Copyright 2013-2020 Axel Huebl, Felix Schmitt, Heiko Burau, Rene Widera
- *
- * This file is part of PIConGPU.
- *
- * PIConGPU is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * PIConGPU is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with PIConGPU.
- * If not, see <http://www.gnu.org/licenses/>.
- */
-
-
-
-#pragma once
-
-
-#include "picongpu/simulation_defines.hpp"
-#include "picongpu/plugins/hdf5/HDF5Writer.def"
-#include "picongpu/plugins/misc/ComponentNames.hpp"
-#include "picongpu/traits/PICToSplash.hpp"
-#include "picongpu/traits/PICToOpenPMD.hpp"
-#include <pmacc/traits/GetComponentsType.hpp>
-#include <pmacc/traits/GetNComponents.hpp>
-#include <pmacc/traits/Resolve.hpp>
-#include <pmacc/assert.hpp>
-
-namespace picongpu
-{
-
-namespace hdf5
-{
-using namespace pmacc;
-
-using namespace splash;
-
-
-/** write attribute of a particle to hdf5 file
- *
- * @tparam T_Identifier identifier of a particle record
- */
-template< typename T_Identifier>
-struct ParticleAttribute
-{
-    /** write attribute to hdf5 file
-     *
-     * @param params wrapped thread params such as domainwriter, ...
-     * @param frame frame with all particles
-     * @param speciesPath path for the current species (of FrameType)
-     * @param elements number of particles in this patch
-     * @param elementsOffset number of particles in this patch
-     * @param numParticlesGlobal number of particles globally
-     */
-    template<typename FrameType>
-    HINLINE void operator()(
-                            ThreadParams* params,
-                            FrameType& frame,
-                            const std::string speciesPath,
-                            const uint64_t elements,
-                            const uint64_t elementsOffset,
-                            const uint64_t numParticlesGlobal
-    )
-    {
-
-        typedef T_Identifier Identifier;
-        typedef typename pmacc::traits::Resolve<Identifier>::type::type ValueType;
-        const uint32_t components = GetNComponents<ValueType>::value;
-        typedef typename GetComponentsType<ValueType>::type ComponentType;
-        typedef typename PICToSplash<ComponentType>::type SplashType;
-        typedef typename PICToSplash<float_X>::type SplashFloatXType;
-
-        const ThreadParams *threadParams = params;
-
-        log<picLog::INPUT_OUTPUT > ("HDF5:  (begin) write species attribute: %1%") % Identifier::getName();
-
-        SplashType splashType;
-        ColTypeDouble ctDouble;
-        ColTypeUInt32 ctUInt32;
-        SplashFloatXType splashFloatXType;
-
-        OpenPMDName<T_Identifier> openPMDName;
-        const std::string recordPath( speciesPath + std::string("/") + openPMDName() );
-
-        const auto componentNames = plugins::misc::getComponentNames( components );
-
-        // get the SI scaling, dimensionality and weighting of the attribute
-        OpenPMDUnit<T_Identifier> openPMDUnit;
-        std::vector<float_64> unit = openPMDUnit();
-        OpenPMDUnitDimension<T_Identifier> openPMDUnitDimension;
-        std::vector<float_64> unitDimension = openPMDUnitDimension();
-        const bool macroWeightedBool = MacroWeighted<T_Identifier>::get();
-        const uint32_t macroWeighted = (macroWeightedBool ? 1 : 0);
-        const float_64 weightingPower = WeightingPower<T_Identifier>::get();
-
-        PMACC_ASSERT(unit.size() == components); // unitSI for each component
-        PMACC_ASSERT(unitDimension.size() == 7); // seven openPMD base units
-
-        /* globalSlideOffset due to gpu slides between origin at time step 0
-         * and origin at current time step
-         * ATTENTION: splash offset are globalSlideOffset + picongpu offsets
-         */
-        DataSpace<simDim> globalSlideOffset;
-        const pmacc::Selection<simDim>& localDomain = Environment<simDim>::get().SubGrid().getLocalDomain();
-        const uint32_t numSlides = MovingWindow::getInstance().getSlideCounter(threadParams->currentStep);
-        globalSlideOffset.y() += numSlides * localDomain.size.y();
-
-        Dimensions splashDomainOffset(0, 0, 0);
-        Dimensions splashGlobalDomainOffset(0, 0, 0);
-
-        Dimensions splashDomainSize(1, 1, 1);
-        Dimensions splashGlobalDomainSize(1, 1, 1);
-
-        for (uint32_t d = 0; d < simDim; ++d)
-        {
-            splashDomainOffset[d] = threadParams->window.localDimensions.offset[d] + globalSlideOffset[d];
-            splashGlobalDomainOffset[d] = threadParams->window.globalDimensions.offset[d] + globalSlideOffset[d];
-            splashGlobalDomainSize[d] = threadParams->window.globalDimensions.size[d];
-            splashDomainSize[d] = threadParams->window.localDimensions.size[d];
-        }
-
-        typedef typename GetComponentsType<ValueType>::type ComponentValueType;
-
-        ComponentValueType* tmpArray = new ComponentValueType[elements];
-
-        for (uint32_t d = 0; d < components; d++)
-        {
-            std::stringstream datasetName;
-            datasetName << recordPath;
-            if (components > 1)
-                datasetName << "/" << componentNames[d];
-
-            ValueType* dataPtr = frame.getIdentifier(Identifier()).getPointer();
-            #pragma omp parallel for
-            for( uint64_t i = 0; i < elements; ++i )
-            {
-                tmpArray[i] = ((ComponentValueType*)dataPtr)[i * components + d];
-            }
-
-            // avoid deadlock between not finished pmacc tasks and mpi calls in splash/HDF5
-            __getTransactionEvent().waitForFinished();
-            threadParams->dataCollector->writeDomain(
-                threadParams->currentStep,
-                /* Dimensions for global collective buffer */
-                Dimensions(numParticlesGlobal, 1, 1),
-                /* 3D-offset in the globalSize-buffer this process writes to */
-                Dimensions(elementsOffset, 1, 1),
-                /* Type information for data */
-                splashType,
-                /* Number of dimensions (1-3) of the buffer */
-                1u,
-                /* Selection: size in src buffer */
-                splash::Selection(
-                    Dimensions(elements, 1, 1)
-                ),
-                /* Name of the dataset */
-                datasetName.str().c_str(),
-                /* Global domain information */
-                splash::Domain(
-                    splashGlobalDomainOffset,
-                    splashGlobalDomainSize
-                ),
-                /* Domain type annotation */
-                DomainCollector::PolyType,
-                /* Buffer with data */
-                tmpArray
-            );
-
-            threadParams->dataCollector->writeAttribute(
-                threadParams->currentStep,
-                ctDouble, datasetName.str().c_str(),
-                "unitSI", &(unit.at(d)));
-
-        }
-        __deleteArray(tmpArray);
-
-
-        threadParams->dataCollector->writeAttribute(
-            params->currentStep,
-            ctDouble, recordPath.c_str(),
-            "unitDimension",
-            1u, Dimensions(7,0,0),
-            &(*unitDimension.begin()));
-
-        threadParams->dataCollector->writeAttribute(
-            params->currentStep,
-            ctUInt32, recordPath.c_str(),
-            "macroWeighted",
-            &macroWeighted);
-
-        threadParams->dataCollector->writeAttribute(
-            params->currentStep,
-            ctDouble, recordPath.c_str(),
-            "weightingPower",
-            &weightingPower);
-
-        /** \todo check if always correct at this point, depends on attribute
-         *        and MW-solver/pusher implementation */
-        const float_X timeOffset = 0.0;
-        threadParams->dataCollector->writeAttribute(params->currentStep,
-                                                    splashFloatXType, recordPath.c_str(),
-                                                    "timeOffset", &timeOffset);
-
-        log<picLog::INPUT_OUTPUT > ("HDF5:  ( end ) write species attribute: %1%") %
-            Identifier::getName();
-    }
-
-};
-
-} //namspace hdf5
-
-} //namespace picongpu
-
diff --git a/include/picongpu/plugins/kernel/CopySpecies.kernel b/include/picongpu/plugins/kernel/CopySpecies.kernel
index a3c2db7814..0693909167 100644
--- a/include/picongpu/plugins/kernel/CopySpecies.kernel
+++ b/include/picongpu/plugins/kernel/CopySpecies.kernel
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Rene Widera, Felix Schmitt
+/* Copyright 2013-2021 Rene Widera, Felix Schmitt
  *
  * This file is part of PIConGPU.
  *
@@ -18,7 +18,6 @@
  */
 
 
-
 #pragma once
 
 
@@ -33,14 +32,11 @@
 
 namespace picongpu
 {
-
     /** copy particle from the device to the host frame
      *
      * @tparam T_numWorkers number of workers
      */
-    template<
-        uint32_t T_numWorkers
-    >
+    template<uint32_t T_numWorkers>
     struct CopySpecies
     {
         /** copy particle of a species to a host frame
@@ -50,7 +46,7 @@ namespace picongpu
          * @tparam T_Filter type of filer with particle selection rules
          * @tparam T_Space type of coordinate description
          * @tparam T_Identifier type of identifier for the particle cellIdx
-         * @tparam T_Mapping type of the mapper to map cuda idx to supercells
+         * @tparam T_Mapping type of the mapper to map cupla idx to supercells
          * @tparam T_Acc alpaka accelerator type
          *
          * @param acc alpaka accelerator type
@@ -64,7 +60,7 @@ namespace picongpu
          * @param domainCellIdxIdentifier the identifier for the particle cellIdx
          *                                that is calculated with respect to
          *                                domainOffset
-         * @param mapper map cuda idx to supercells
+         * @param mapper map cupla idx to supercells
          */
         template<
             typename T_DestFrame,
@@ -74,20 +70,17 @@ namespace picongpu
             typename T_Identifier,
             typename T_Mapping,
             typename T_Acc,
-            typename T_ParticleFilter
-        >
-        DINLINE void
-        operator()(
-            T_Acc const & acc,
-            int * counter,
+            typename T_ParticleFilter>
+        DINLINE void operator()(
+            T_Acc const& acc,
+            int* counter,
             T_DestFrame destFrame,
             T_SrcBox srcBox,
             T_Filter filter,
             T_Space const domainOffset,
             T_Identifier const domainCellIdxIdentifier,
             T_Mapping const mapper,
-            T_ParticleFilter parFilter
-        ) const
+            T_ParticleFilter parFilter) const
         {
             using namespace pmacc::particles::operations;
             using namespace mappings::threads;
@@ -96,160 +89,90 @@ namespace picongpu
             using SrcFrameType = typename T_SrcBox::FrameType;
             using SrcFramePtr = typename T_SrcBox::FramePtr;
 
-            constexpr uint32_t numParticlesPerFrame = pmacc::math::CT::volume< typename SrcFrameType::SuperCellSize >::type::value;
+            constexpr uint32_t numParticlesPerFrame
+                = pmacc::math::CT::volume<typename SrcFrameType::SuperCellSize>::type::value;
             constexpr uint32_t numWorkers = T_numWorkers;
 
-            uint32_t const workerIdx = threadIdx.x;
+            uint32_t const workerIdx = cupla::threadIdx(acc).x;
 
-            PMACC_SMEM( acc, srcFramePtr, SrcFramePtr );
-            PMACC_SMEM( acc, localCounter, int );
-            PMACC_SMEM( acc, globalOffset, int );
+            PMACC_SMEM(acc, srcFramePtr, SrcFramePtr);
+            PMACC_SMEM(acc, localCounter, int);
+            PMACC_SMEM(acc, globalOffset, int);
 
-            using ParticlesDomCfg = IdxConfig<
-                numParticlesPerFrame,
-                numWorkers
-            >;
+            using ParticlesDomCfg = IdxConfig<numParticlesPerFrame, numWorkers>;
 
             // loop over all particles in a frame
-            ForEachIdx< ParticlesDomCfg > forEachParticle( workerIdx );
+            ForEachIdx<ParticlesDomCfg> forEachParticle(workerIdx);
 
-            memory::CtxArray<
-                int,
-                ParticlesDomCfg
-            >
-            storageOffsetCtx{};
+            memory::CtxArray<int, ParticlesDomCfg> storageOffsetCtx{};
 
 
-            DataSpace< simDim > const supcerCellIdx = mapper.getSuperCellIndex( DataSpace< simDim > ( blockIdx ) );
+            DataSpace<simDim> const supcerCellIdx = mapper.getSuperCellIndex(DataSpace<simDim>(cupla::blockIdx(acc)));
             /* offset (in cells) of the supercell relative to the origin of the
              * local domain (without any guards)
              */
-            DataSpace< simDim > const localSuperCellCellOffset(
-                ( supcerCellIdx - mapper.getGuardingSuperCells() ) *
-                mapper.getSuperCellSize()
-            );
+            DataSpace<simDim> const localSuperCellCellOffset(
+                (supcerCellIdx - mapper.getGuardingSuperCells()) * mapper.getSuperCellSize());
 
             // each virtual worker needs only one filter
-            filter.setSuperCellPosition( localSuperCellCellOffset );
-            auto accParFilter = parFilter(
-                acc,
-                supcerCellIdx - mapper.getGuardingSuperCells( ),
-                WorkerCfg< numWorkers >{ workerIdx }
-            );
-
-            ForEachIdx<
-                IdxConfig<
-                    1,
-                    numWorkers
-                >
-            > onlyMaster{ workerIdx };
-
-            onlyMaster(
-                [&](
-                    uint32_t const,
-                    uint32_t const
-                )
-                {
-                    localCounter = 0;
-                    srcFramePtr = srcBox.getFirstFrame( supcerCellIdx );
-                }
-            );
+            filter.setSuperCellPosition(localSuperCellCellOffset);
+            auto accParFilter
+                = parFilter(acc, supcerCellIdx - mapper.getGuardingSuperCells(), WorkerCfg<numWorkers>{workerIdx});
 
-            __syncthreads();
+            ForEachIdx<IdxConfig<1, numWorkers>> onlyMaster{workerIdx};
 
-            // move over all Frames in the supercell
-            while( srcFramePtr.isValid() )
-            {
-                forEachParticle(
-                    [&](
-                        uint32_t const localIdx,
-                        uint32_t const idx
-                    )
-                    {
-                        auto parSrc = ( srcFramePtr[ localIdx ] );
-                        storageOffsetCtx[ idx ] = -1;
-                        // count particle in frame
-                        if( parSrc[ multiMask_ ] == 1 &&
-                            filter(
-                                *srcFramePtr,
-                                localIdx
-                            )
-                        )
-                            if(
-                                accParFilter(
-                                    acc,
-                                    parSrc
-                                )
-                            )
-                                storageOffsetCtx[ idx ] = nvidia::atomicAllInc(
-                                    acc,
-                                    &localCounter,
-                                    ::alpaka::hierarchy::Threads{}
-                                );
-                    }
-                );
-                __syncthreads();
-
-                onlyMaster(
-                    [&](
-                        uint32_t const,
-                        uint32_t const
-                    )
-                    {
-                        // reserve host memory for particle
-                        globalOffset = atomicAdd(
-                            counter,
-                            localCounter,
-                            ::alpaka::hierarchy::Blocks{}
-                        );
-                    }
-                );
+            onlyMaster([&](uint32_t const, uint32_t const) {
+                localCounter = 0;
+                srcFramePtr = srcBox.getFirstFrame(supcerCellIdx);
+            });
 
-                __syncthreads();
+            cupla::__syncthreads(acc);
 
-                forEachParticle(
-                    [&](
-                        uint32_t const localIdx,
-                        uint32_t const idx
-                    )
+            // move over all Frames in the supercell
+            while(srcFramePtr.isValid())
+            {
+                forEachParticle([&](uint32_t const localIdx, uint32_t const idx) {
+                    auto parSrc = (srcFramePtr[localIdx]);
+                    storageOffsetCtx[idx] = -1;
+                    // count particle in frame
+                    if(parSrc[multiMask_] == 1 && filter(*srcFramePtr, localIdx))
+                        if(accParFilter(acc, parSrc))
+                            storageOffsetCtx[idx]
+                                = nvidia::atomicAllInc(acc, &localCounter, ::alpaka::hierarchy::Threads{});
+                });
+                cupla::__syncthreads(acc);
+
+                onlyMaster([&](uint32_t const, uint32_t const) {
+                    // reserve host memory for particle
+                    globalOffset = cupla::atomicAdd(acc, counter, localCounter, ::alpaka::hierarchy::Blocks{});
+                });
+
+                cupla::__syncthreads(acc);
+
+                forEachParticle([&](uint32_t const localIdx, uint32_t const idx) {
+                    if(storageOffsetCtx[idx] != -1)
                     {
-                        if( storageOffsetCtx[ idx ] != -1 )
-                        {
-                            auto parDest = destFrame[ globalOffset + storageOffsetCtx[ idx ] ];
-                            auto parDestNoDomainIdx = deselect< T_Identifier >( parDest );
-                            auto parSrc = ( srcFramePtr[ localIdx ] );
-                            assign(
-                                parDestNoDomainIdx,
-                                parSrc
-                            );
-                            // calculate cell index for user-defined domain
-                            DataSpace< simDim > const localCell(
-                                DataSpaceOperations< simDim >::template map<
-                                    SuperCellSize
-                                >( parSrc[ localCellIdx_ ] )
-                            );
-                            parDest[domainCellIdxIdentifier] =
-                                domainOffset + localSuperCellCellOffset + localCell;
-                        }
+                        auto parDest = destFrame[globalOffset + storageOffsetCtx[idx]];
+                        auto parDestNoDomainIdx = deselect<T_Identifier>(parDest);
+                        auto parSrc = (srcFramePtr[localIdx]);
+                        assign(parDestNoDomainIdx, parSrc);
+                        // calculate cell index for user-defined domain
+                        DataSpace<simDim> const localCell(
+                            DataSpaceOperations<simDim>::template map<SuperCellSize>(parSrc[localCellIdx_]));
+                        parDest[domainCellIdxIdentifier] = domainOffset + localSuperCellCellOffset + localCell;
                     }
-                );
+                });
 
-                __syncthreads();
+                cupla::__syncthreads(acc);
 
-                onlyMaster(
-                    [&](
-                        uint32_t const,
-                        uint32_t const
-                    )
-                    {
-                        // get next frame in supercell
-                        srcFramePtr = srcBox.getNextFrame( srcFramePtr );
-                        localCounter = 0;
-                    }
-                );
-                __syncthreads();
+                onlyMaster([&](uint32_t const, uint32_t const) {
+                    // get next frame in supercell
+                    srcFramePtr = srcBox.getNextFrame(srcFramePtr);
+                    localCounter = 0;
+                });
+                cupla::__syncthreads(acc);
             }
         }
     };
 
-} //namespace picongpu
+} // namespace picongpu
diff --git a/include/picongpu/plugins/makroParticleCounter/PerSuperCell.hpp b/include/picongpu/plugins/makroParticleCounter/PerSuperCell.hpp
index cd55016db1..1945d6e2c8 100644
--- a/include/picongpu/plugins/makroParticleCounter/PerSuperCell.hpp
+++ b/include/picongpu/plugins/makroParticleCounter/PerSuperCell.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2014-2020 Rene Widera, Richard Pausch
+/* Copyright 2014-2021 Rene Widera, Richard Pausch
  *
  * This file is part of PIConGPU.
  *
@@ -39,308 +39,290 @@
 
 namespace picongpu
 {
-using namespace pmacc;
-using namespace splash;
+    using namespace pmacc;
+    using namespace splash;
 
-struct CountMakroParticle
-{
-    template<
-        typename ParBox,
-        typename CounterBox,
-        typename Mapping,
-        typename T_Acc
-    >
-    DINLINE void operator()(
-        T_Acc const & acc,
-        ParBox parBox,
-        CounterBox counterBox,
-        Mapping mapper
-    ) const
+    struct CountMakroParticle
     {
+        template<typename ParBox, typename CounterBox, typename Mapping, typename T_Acc>
+        DINLINE void operator()(T_Acc const& acc, ParBox parBox, CounterBox counterBox, Mapping mapper) const
+        {
+            typedef MappingDesc::SuperCellSize SuperCellSize;
+            typedef typename ParBox::FrameType FrameType;
+            typedef typename ParBox::FramePtr FramePtr;
 
-        typedef MappingDesc::SuperCellSize SuperCellSize;
-        typedef typename ParBox::FrameType FrameType;
-        typedef typename ParBox::FramePtr FramePtr;
-
-        const DataSpace<simDim> block(mapper.getSuperCellIndex(DataSpace<simDim > (blockIdx)));
-        /* counterBox has no guarding supercells*/
-        const DataSpace<simDim> counterCell = block - mapper.getGuardingSuperCells();
+            const DataSpace<simDim> block(mapper.getSuperCellIndex(DataSpace<simDim>(cupla::blockIdx(acc))));
+            /* counterBox has no guarding supercells*/
+            const DataSpace<simDim> counterCell = block - mapper.getGuardingSuperCells();
 
-        const DataSpace<simDim > threadIndex(threadIdx);
-        const int linearThreadIdx = DataSpaceOperations<simDim>::template map<SuperCellSize > (threadIndex);
+            const DataSpace<simDim> threadIndex(cupla::threadIdx(acc));
+            const int linearThreadIdx = DataSpaceOperations<simDim>::template map<SuperCellSize>(threadIndex);
 
-        PMACC_SMEM( acc, counterValue, uint64_cu );
-        PMACC_SMEM( acc, frame, FramePtr );
+            PMACC_SMEM(acc, counterValue, uint64_cu);
+            PMACC_SMEM(acc, frame, FramePtr);
 
-        if (linearThreadIdx == 0)
-        {
-            counterValue = 0;
-            frame = parBox.getLastFrame(block);
-            if (!frame.isValid())
+            if(linearThreadIdx == 0)
             {
-                counterBox(counterCell) = counterValue;
+                counterValue = 0;
+                frame = parBox.getLastFrame(block);
+                if(!frame.isValid())
+                {
+                    counterBox(counterCell) = counterValue;
+                }
             }
-        }
-        __syncthreads();
-        if (!frame.isValid())
-            return; //end kernel if we have no frames
+            cupla::__syncthreads(acc);
+            if(!frame.isValid())
+                return; // end kernel if we have no frames
 
-        bool isParticle = frame[linearThreadIdx][multiMask_];
+            bool isParticle = frame[linearThreadIdx][multiMask_];
 
-        while (frame.isValid())
-        {
-            if (isParticle)
+            while(frame.isValid())
             {
-                atomicAdd(&counterValue, static_cast<uint64_cu> (1LU), ::alpaka::hierarchy::Blocks{});
+                if(isParticle)
+                {
+                    cupla::atomicAdd(acc, &counterValue, static_cast<uint64_cu>(1LU), ::alpaka::hierarchy::Blocks{});
+                }
+                cupla::__syncthreads(acc);
+                if(linearThreadIdx == 0)
+                {
+                    frame = parBox.getPreviousFrame(frame);
+                }
+                isParticle = true;
+                cupla::__syncthreads(acc);
             }
-            __syncthreads();
-            if (linearThreadIdx == 0)
-            {
-                frame = parBox.getPreviousFrame(frame);
-            }
-            isParticle = true;
-            __syncthreads();
-        }
-
-        if (linearThreadIdx == 0)
-            counterBox(counterCell) = counterValue;
-    }
-};
-/** Count makro particle of a species and write down the result to a global HDF5 file.
- *
- * - count the total number of makro particle per supercell
- * - store one number (size_t) per supercell in a mesh
- * - Output: - create a folder with the name of the plugin
- *           - per time step one file with the name "result_[currentStep].h5" is created
- * - HDF5 Format: - default lib splash output for meshes
- *                - the attribute name in the HDF5 file is "makroParticleCount"
- *
- */
-template<class ParticlesType>
-class PerSuperCell : public ILightweightPlugin
-{
-private:
-
-
-    typedef MappingDesc::SuperCellSize SuperCellSize;
-    typedef GridBuffer<size_t, simDim> GridBufferType;
-
-    MappingDesc *cellDescription;
-    std::string notifyPeriod;
-
-    std::string pluginName;
-    std::string pluginPrefix;
-    std::string foldername;
-    mpi::MPIReduce reduce;
-
-    GridBufferType* localResult;
 
-    ParallelDomainCollector *dataCollector;
-    // set attributes for datacollector files
-    DataCollector::FileCreationAttr h5_attr;
-
-public:
-
-    PerSuperCell() :
-    pluginName("PerSuperCell: create hdf5 with macro particle count per superCell"),
-    pluginPrefix(ParticlesType::FrameType::getName() + std::string("_macroParticlesPerSuperCell")),
-    foldername(pluginPrefix),
-    cellDescription(nullptr),
-    localResult(nullptr),
-    dataCollector(nullptr)
-    {
-        Environment<>::get().PluginConnector().registerPlugin(this);
-    }
-
-    virtual ~PerSuperCell()
-    {
-
-    }
-
-    void notify(uint32_t currentStep)
-    {
-        countMakroParticles < CORE + BORDER > (currentStep);
-    }
-
-    void pluginRegisterHelp(po::options_description& desc)
+            if(linearThreadIdx == 0)
+                counterBox(counterCell) = counterValue;
+        }
+    };
+    /** Count the macro particles of a species and write the result to a global HDF5 file.
+     *
+     * - count the total number of macro particles per supercell
+     * - store one number (size_t) per supercell in a mesh
+     * - Output: - create a folder named after the plugin prefix
+     *           - the file is opened under the base name "makroParticlePerSupercell" in that folder
+     * - HDF5 Format: - default libSplash output for meshes
+     *                - the data set name in the HDF5 file is "makroParticlePerSupercell"
+     *
+     */
+    template<class ParticlesType>
+    class PerSuperCell : public ILightweightPlugin
     {
-        desc.add_options()
-            ((pluginPrefix + ".period").c_str(),
-             po::value<std::string > (&notifyPeriod), "enable plugin [for each n-th step]");
-    }
+    private:
+        typedef MappingDesc::SuperCellSize SuperCellSize;
+        typedef GridBuffer<size_t, simDim> GridBufferType;
+
+        MappingDesc* cellDescription;
+        std::string notifyPeriod;
+
+        std::string pluginName;
+        std::string pluginPrefix;
+        std::string foldername;
+        mpi::MPIReduce reduce;
+
+        GridBufferType* localResult;
+
+        ParallelDomainCollector* dataCollector;
+        // set attributes for datacollector files
+        DataCollector::FileCreationAttr h5_attr;
+
+    public:
+        PerSuperCell()
+            : pluginName("PerSuperCell: create hdf5 with macro particle count per superCell")
+            , pluginPrefix(ParticlesType::FrameType::getName() + std::string("_macroParticlesPerSuperCell"))
+            , foldername(pluginPrefix)
+            , cellDescription(nullptr)
+            , localResult(nullptr)
+            , dataCollector(nullptr)
+        {
+            Environment<>::get().PluginConnector().registerPlugin(this);
+        }
 
-    std::string pluginGetName() const
-    {
-        return pluginName;
-    }
+        virtual ~PerSuperCell()
+        {
+        }
 
-    void setMappingDescription(MappingDesc *cellDescription)
-    {
-        this->cellDescription = cellDescription;
-    }
+        void notify(uint32_t currentStep)
+        {
+            countMakroParticles<CORE + BORDER>(currentStep);
+        }
 
-private:
+        void pluginRegisterHelp(po::options_description& desc)
+        {
+            desc.add_options()(
+                (pluginPrefix + ".period").c_str(),
+                po::value<std::string>(&notifyPeriod),
+                "enable plugin [for each n-th step]");
+        }
 
-    void pluginLoad()
-    {
-        if(!notifyPeriod.empty())
+        std::string pluginGetName() const
         {
-            Environment<>::get().PluginConnector().setNotificationPeriod(this, notifyPeriod);
-            const SubGrid<simDim>& subGrid = Environment<simDim>::get().SubGrid();
-            /* local count of supercells without any guards*/
-            DataSpace<simDim> localSuperCells(subGrid.getLocalDomain().size / SuperCellSize::toRT());
-            localResult = new GridBufferType(localSuperCells);
+            return pluginName;
+        }
 
-            /* create folder for hdf5 files*/
-            Environment<simDim>::get().Filesystem().createDirectoryWithPermissions(foldername);
+        void setMappingDescription(MappingDesc* cellDescription)
+        {
+            this->cellDescription = cellDescription;
         }
-    }
 
-    void pluginUnload()
-    {
-        __delete(localResult);
+    private:
+        void pluginLoad()
+        {
+            if(!notifyPeriod.empty())
+            {
+                Environment<>::get().PluginConnector().setNotificationPeriod(this, notifyPeriod);
+                const SubGrid<simDim>& subGrid = Environment<simDim>::get().SubGrid();
+                /* local count of supercells without any guards*/
+                DataSpace<simDim> localSuperCells(subGrid.getLocalDomain().size / SuperCellSize::toRT());
+                localResult = new GridBufferType(localSuperCells);
+
+                /* create folder for hdf5 files*/
+                Environment<simDim>::get().Filesystem().createDirectoryWithPermissions(foldername);
+            }
+        }
 
-        if (dataCollector)
-            dataCollector->finalize();
+        void pluginUnload()
+        {
+            __delete(localResult);
 
-        __delete(dataCollector);
-    }
+            if(dataCollector)
+                dataCollector->finalize();
 
-    template< uint32_t AREA>
-    void countMakroParticles(uint32_t currentStep)
-    {
-        openH5File();
+            __delete(dataCollector);
+        }
 
-        DataConnector &dc = Environment<>::get().DataConnector();
+        template<uint32_t AREA>
+        void countMakroParticles(uint32_t currentStep)
+        {
+            openH5File();
 
-        auto particles = dc.get< ParticlesType >( ParticlesType::FrameType::getName(), true );
+            DataConnector& dc = Environment<>::get().DataConnector();
 
-        /*############ count particles #######################################*/
-        typedef MappingDesc::SuperCellSize SuperCellSize;
-        AreaMapping<AREA, MappingDesc> mapper(*cellDescription);
+            auto particles = dc.get<ParticlesType>(ParticlesType::FrameType::getName(), true);
 
-        PMACC_KERNEL(CountMakroParticle{})
-            (mapper.getGridDim(), SuperCellSize::toRT())
-            (particles->getDeviceParticlesBox(),
-             localResult->getDeviceBuffer().getDataBox(), mapper);
+            /*############ count particles #######################################*/
+            typedef MappingDesc::SuperCellSize SuperCellSize;
+            AreaMapping<AREA, MappingDesc> mapper(*cellDescription);
 
-        dc.releaseData( ParticlesType::FrameType::getName() );
+            PMACC_KERNEL(CountMakroParticle{})
+            (mapper.getGridDim(), SuperCellSize::toRT())(
+                particles->getDeviceParticlesBox(),
+                localResult->getDeviceBuffer().getDataBox(),
+                mapper);
 
-        localResult->deviceToHost();
+            dc.releaseData(ParticlesType::FrameType::getName());
 
+            localResult->deviceToHost();
 
 
-        /*############ dump data #############################################*/
-        const SubGrid<simDim>& subGrid = Environment<simDim>::get().SubGrid();
+            /*############ dump data #############################################*/
+            const SubGrid<simDim>& subGrid = Environment<simDim>::get().SubGrid();
 
-        DataSpace<simDim> localSize(subGrid.getLocalDomain().size / SuperCellSize::toRT());
-        DataSpace<simDim> globalOffset(subGrid.getLocalDomain().offset / SuperCellSize::toRT());
-        DataSpace<simDim> globalSize(subGrid.getGlobalDomain().size / SuperCellSize::toRT());
+            DataSpace<simDim> localSize(subGrid.getLocalDomain().size / SuperCellSize::toRT());
+            DataSpace<simDim> globalOffset(subGrid.getLocalDomain().offset / SuperCellSize::toRT());
+            DataSpace<simDim> globalSize(subGrid.getGlobalDomain().size / SuperCellSize::toRT());
 
 
+            Dimensions splashGlobalDomainOffset(0, 0, 0);
+            Dimensions splashGlobalOffset(0, 0, 0);
+            Dimensions splashGlobalDomainSize(1, 1, 1);
+            Dimensions splashGlobalSize(1, 1, 1);
+            Dimensions localBufferSize(1, 1, 1);
 
-        Dimensions splashGlobalDomainOffset(0, 0, 0);
-        Dimensions splashGlobalOffset(0, 0, 0);
-        Dimensions splashGlobalDomainSize(1, 1, 1);
-        Dimensions splashGlobalSize(1, 1, 1);
-        Dimensions localBufferSize(1, 1, 1);
+            for(uint32_t d = 0; d < simDim; ++d)
+            {
+                splashGlobalOffset[d] = globalOffset[d];
+                splashGlobalSize[d] = globalSize[d];
+                splashGlobalDomainSize[d] = globalSize[d];
+                localBufferSize[d] = localSize[d];
+            }
 
-        for (uint32_t d = 0; d < simDim; ++d)
-        {
-            splashGlobalOffset[d] = globalOffset[d];
-            splashGlobalSize[d] = globalSize[d];
-            splashGlobalDomainSize[d] = globalSize[d];
-            localBufferSize[d] = localSize[d];
+            size_t* ptr = localResult->getHostBuffer().getPointer();
+
+            // avoid deadlock between unfinished pmacc tasks and MPI calls in the data collector
+            __getTransactionEvent().waitForFinished();
+
+            dataCollector->writeDomain(
+                currentStep, /* id == time step */
+                splashGlobalSize, /* total size of dataset over all processes */
+                splashGlobalOffset, /* write offset for this process */
+                ColTypeUInt64(), /* data type */
+                simDim, /* NDims of the field data (scalar, vector, ...) */
+                splash::Selection(localBufferSize),
+                "makroParticlePerSupercell", /* data set name */
+                splash::Domain(
+                    splashGlobalDomainOffset, /* offset of the global domain */
+                    splashGlobalDomainSize /* size of the global domain */
+                    ),
+                DomainCollector::GridType,
+                ptr);
+
+            closeH5File();
         }
 
-        size_t* ptr = localResult->getHostBuffer().getPointer();
-
-        // avoid deadlock between not finished pmacc tasks and mpi calls in adios
-        __getTransactionEvent().waitForFinished();
-
-        dataCollector->writeDomain(currentStep,                     /* id == time step */
-                                   splashGlobalSize,                /* total size of dataset over all processes */
-                                   splashGlobalOffset,              /* write offset for this process */
-                                   ColTypeUInt64(),                 /* data type */
-                                   simDim,                          /* NDims of the field data (scalar, vector, ...) */
-                                   splash::Selection(localBufferSize),
-                                   "makroParticlePerSupercell",     /* data set name */
-                                   splash::Domain(
-                                          splashGlobalDomainOffset, /* offset of the global domain */
-                                          splashGlobalDomainSize    /* size of the global domain */
-                                   ),
-                                   DomainCollector::GridType,
-                                   ptr);
-
-        closeH5File();
-    }
-
-    void closeH5File()
-    {
-        if (dataCollector != nullptr)
+        void closeH5File()
         {
-            std::string filename = (foldername + std::string("/makroParticlePerSupercell"));
-            log<picLog::INPUT_OUTPUT > ("HDF5 close DataCollector with file: %1%") % filename;
-            dataCollector->close();
+            if(dataCollector != nullptr)
+            {
+                std::string filename = (foldername + std::string("/makroParticlePerSupercell"));
+                log<picLog::INPUT_OUTPUT>("HDF5 close DataCollector with file: %1%") % filename;
+                dataCollector->close();
+            }
         }
-    }
 
-    void openH5File()
-    {
-
-        if (dataCollector == nullptr)
+        void openH5File()
         {
-            DataSpace<simDim> mpi_pos;
-            DataSpace<simDim> mpi_size;
-
-            Dimensions splashMpiPos;
-            Dimensions splashMpiSize;
-
-            GridController<simDim> &gc = Environment<simDim>::get().GridController();
-
-            mpi_pos = gc.getPosition();
-            mpi_size = gc.getGpuNodes();
-
-            splashMpiPos.set(0, 0, 0);
-            splashMpiSize.set(1, 1, 1);
-
-            for (uint32_t i = 0; i < simDim; ++i)
+            if(dataCollector == nullptr)
             {
-                splashMpiPos[i] = mpi_pos[i];
-                splashMpiSize[i] = mpi_size[i];
+                DataSpace<simDim> mpi_pos;
+                DataSpace<simDim> mpi_size;
+
+                Dimensions splashMpiPos;
+                Dimensions splashMpiSize;
+
+                GridController<simDim>& gc = Environment<simDim>::get().GridController();
+
+                mpi_pos = gc.getPosition();
+                mpi_size = gc.getGpuNodes();
+
+                splashMpiPos.set(0, 0, 0);
+                splashMpiSize.set(1, 1, 1);
+
+                for(uint32_t i = 0; i < simDim; ++i)
+                {
+                    splashMpiPos[i] = mpi_pos[i];
+                    splashMpiSize[i] = mpi_size[i];
+                }
+
+
+                const uint32_t maxOpenFilesPerNode = 1;
+                dataCollector = new ParallelDomainCollector(
+                    gc.getCommunicator().getMPIComm(),
+                    gc.getCommunicator().getMPIInfo(),
+                    splashMpiSize,
+                    maxOpenFilesPerNode);
+                // set attributes for datacollector files
+                DataCollector::FileCreationAttr h5_attr;
+                h5_attr.enableCompression = false;
+                h5_attr.fileAccType = DataCollector::FAT_CREATE;
+                h5_attr.mpiPosition.set(splashMpiPos);
+                h5_attr.mpiSize.set(splashMpiSize);
             }
 
 
-            const uint32_t maxOpenFilesPerNode = 1;
-            dataCollector = new ParallelDomainCollector(
-                                                        gc.getCommunicator().getMPIComm(),
-                                                        gc.getCommunicator().getMPIInfo(),
-                                                        splashMpiSize,
-                                                        maxOpenFilesPerNode);
-            // set attributes for datacollector files
-            DataCollector::FileCreationAttr h5_attr;
-            h5_attr.enableCompression = false;
-            h5_attr.fileAccType = DataCollector::FAT_CREATE;
-            h5_attr.mpiPosition.set(splashMpiPos);
-            h5_attr.mpiSize.set(splashMpiSize);
-        }
-
-
-        // open datacollector
-        try
-        {
-            std::string filename = (foldername + std::string("/makroParticlePerSupercell"));
-            log<picLog::INPUT_OUTPUT > ("HDF5 open DataCollector with file: %1%") %
-                filename;
-            dataCollector->open(filename.c_str(), h5_attr);
-        }
-        catch (const DCException& e)
-        {
-            std::cerr << e.what() << std::endl;
-            throw std::runtime_error("Failed to open datacollector");
+            // open datacollector
+            try
+            {
+                std::string filename = (foldername + std::string("/makroParticlePerSupercell"));
+                log<picLog::INPUT_OUTPUT>("HDF5 open DataCollector with file: %1%") % filename;
+                dataCollector->open(filename.c_str(), h5_attr);
+            }
+            catch(const DCException& e)
+            {
+                std::cerr << e.what() << std::endl;
+                throw std::runtime_error("Failed to open datacollector");
+            }
         }
-    }
-
-};
+    };
 
-} //namespace picongpu
+} // namespace picongpu
diff --git a/include/picongpu/plugins/misc/AppendName.hpp b/include/picongpu/plugins/misc/AppendName.hpp
index 15d59c55ca..bda45a6855 100644
--- a/include/picongpu/plugins/misc/AppendName.hpp
+++ b/include/picongpu/plugins/misc/AppendName.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2017-2020 Rene Widera
+/* Copyright 2017-2021 Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -25,22 +25,22 @@
 
 namespace picongpu
 {
-namespace plugins
-{
-namespace misc
-{
-    /** append the name of an filter to a vector
-     *
-     * @tparam T_Filter filter class (required interface: `getName( )`)
-     */
-    template< typename T_Filter >
-    struct AppendName
+    namespace plugins
     {
-        void operator( )( std::vector< std::string > & vector ) const
+        namespace misc
         {
-            vector.emplace_back( T_Filter::getName() );
-        }
-    };
-} // namespace misc
-} // namespace plugins
+            /** append the name of a filter to a vector
+             *
+             * @tparam T_Filter filter class (required interface: `getName( )`)
+             */
+            template<typename T_Filter>
+            struct AppendName
+            {
+                void operator()(std::vector<std::string>& vector) const
+                {
+                    vector.emplace_back(T_Filter::getName());
+                }
+            };
+        } // namespace misc
+    } // namespace plugins
 } // namespace picongpu
diff --git a/include/picongpu/plugins/misc/ComponentNames.cpp b/include/picongpu/plugins/misc/ComponentNames.cpp
index fd5f44eac9..ba33ff41da 100644
--- a/include/picongpu/plugins/misc/ComponentNames.cpp
+++ b/include/picongpu/plugins/misc/ComponentNames.cpp
@@ -1,4 +1,4 @@
-/* Copyright 2020 Sergei Bastrakov
+/* Copyright 2020-2021 Sergei Bastrakov
  *
  * This file is part of PIConGPU.
  *
@@ -26,39 +26,33 @@
 
 namespace picongpu
 {
-namespace plugins
-{
-namespace misc
-{
-
-    std::vector< std::string > getComponentNames(
-        uint32_t const numComponents
-    )
+    namespace plugins
     {
-        /* For low number of components, fall back to the previously used
-         * "xyzw" naming scheme for backward compatibility
-         */
-        if( numComponents <= 4 )
-        {
-            std::array< std::string, 4 > names = { "x" , "y", "z", "w" };
-            return std::vector< std::string >{
-                names.begin(),
-                names.begin() + numComponents
-            };
-        }
-        // Special case for 6 PML components
-        else if( numComponents == 6 )
-            return { "xy" , "xz", "yx", "yz", "zx", "zy" };
-        else
+        namespace misc
         {
-            // Otherwise use different generic names
-            auto result = std::vector< std::string >( numComponents );
-            for( auto i = 0u; i < result.size(); i++ )
-                result[ i ] = "component" + std::to_string( i );
-            return result;
-        }
-    }
+            std::vector<std::string> getComponentNames(uint32_t const numComponents)
+            {
+                /* For low number of components, fall back to the previously used
+                 * "xyzw" naming scheme for backward compatibility
+                 */
+                if(numComponents <= 4)
+                {
+                    std::array<std::string, 4> names = {"x", "y", "z", "w"};
+                    return std::vector<std::string>{names.begin(), names.begin() + numComponents};
+                }
+                // Special case for 6 PML components
+                else if(numComponents == 6)
+                    return {"xy", "xz", "yx", "yz", "zx", "zy"};
+                else
+                {
+                    // Otherwise use different generic names
+                    auto result = std::vector<std::string>(numComponents);
+                    for(auto i = 0u; i < result.size(); i++)
+                        result[i] = "component" + std::to_string(i);
+                    return result;
+                }
+            }
 
-} // namespace misc
-} // namespace plugins
+        } // namespace misc
+    } // namespace plugins
 } // namespace picongpu
diff --git a/include/picongpu/plugins/misc/ComponentNames.hpp b/include/picongpu/plugins/misc/ComponentNames.hpp
index 1a439bf70c..ca33f7a8e8 100644
--- a/include/picongpu/plugins/misc/ComponentNames.hpp
+++ b/include/picongpu/plugins/misc/ComponentNames.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2020 Sergei Bastrakov
+/* Copyright 2020-2021 Sergei Bastrakov
  *
  * This file is part of PIConGPU.
  *
@@ -25,22 +25,19 @@
 
 namespace picongpu
 {
-namespace plugins
-{
-namespace misc
-{
-
-    /** Get text names of vector components
-     *
-     * For 1-4 and 6 components use predefined names,
-     * for other amounts use generic different names
-     *
-     * @param numComponents number of components
-     */
-    std::vector< std::string > getComponentNames(
-        uint32_t numComponents
-    );
+    namespace plugins
+    {
+        namespace misc
+        {
+            /** Get text names of vector components
+             *
+             * For 1-4 and 6 components use predefined names,
+             * for other amounts use generic different names
+             *
+             * @param numComponents number of components
+             */
+            std::vector<std::string> getComponentNames(uint32_t numComponents);
 
-} // namespace misc
-} // namespace plugins
+        } // namespace misc
+    } // namespace plugins
 } // namespace picongpu
diff --git a/include/picongpu/plugins/misc/ExecuteIfNameIsEqual.hpp b/include/picongpu/plugins/misc/ExecuteIfNameIsEqual.hpp
index 4f981da0f9..018aa5e670 100644
--- a/include/picongpu/plugins/misc/ExecuteIfNameIsEqual.hpp
+++ b/include/picongpu/plugins/misc/ExecuteIfNameIsEqual.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2017-2020 Rene Widera
+/* Copyright 2017-2021 Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -24,36 +24,29 @@
 
 namespace picongpu
 {
-namespace plugins
-{
-namespace misc
-{
-    /** execute an unary functor if the name is equal
-     *
-     * @tparam T_Filter filter class (required interface: `getName( )` and default constructor)
-     */
-    template< typename T_Filter >
-    struct ExecuteIfNameIsEqual
+    namespace plugins
     {
-        /** evaluate if functor must executed
-         *
-         * @param filterName name of the filter which should started
-         * @param unaryFunctor any unary functor
-         */
-        template<
-            typename T_Kernel,
-            typename ... T_Args
-        >
-        void operator( )(
-            std::string filterName,
-            uint32_t const currentStep,
-            T_Kernel const unaryFunctor
-        ) const
+        namespace misc
         {
-            if( filterName == T_Filter::getName( ) )
-                unaryFunctor( particles::filter::IUnary< T_Filter >{ currentStep } );
-        }
-    };
-} // namespace misc
-} // namespace plugins
+            /** execute a unary functor if the name is equal
+             *
+             * @tparam T_Filter filter class (required interface: `getName( )` and default constructor)
+             */
+            template<typename T_Filter>
+            struct ExecuteIfNameIsEqual
+            {
+                /** evaluate if the functor must be executed
+                 *
+                 * @param filterName name of the filter which should be started
+                 * @param unaryFunctor any unary functor
+                 */
+                template<typename T_Kernel, typename... T_Args>
+                void operator()(std::string filterName, uint32_t const currentStep, T_Kernel const unaryFunctor) const
+                {
+                    if(filterName == T_Filter::getName())
+                        unaryFunctor(particles::filter::IUnary<T_Filter>{currentStep});
+                }
+            };
+        } // namespace misc
+    } // namespace plugins
 } // namespace picongpu
diff --git a/include/picongpu/plugins/misc/SpeciesFilter.hpp b/include/picongpu/plugins/misc/SpeciesFilter.hpp
index 0fae441ef0..1107c7d10b 100644
--- a/include/picongpu/plugins/misc/SpeciesFilter.hpp
+++ b/include/picongpu/plugins/misc/SpeciesFilter.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2017-2020 Rene Widera
+/* Copyright 2017-2021 Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -26,73 +26,68 @@
 
 namespace picongpu
 {
-namespace plugins
-{
-namespace misc
-{
-
-    /** combines a particle species with a filter
-     *
-     * @tparam T_Species picongpu::Particle, type of the species
-     * @tparam T_Filter pmacc::filter::Interface, type of the filter
-     */
-    template<
-        typename T_Species,
-        typename T_Filter = particles::filter::All
-    >
-    struct SpeciesFilter
+    namespace plugins
     {
-        using Filter = T_Filter;
-        using Species = T_Species;
-
-        /** name of the filtered species
-         *
-         * @return <speciesName>_<filterName>`
-         */
-        static std::string getName()
+        namespace misc
         {
-            return Species::FrameType::getName() + "_" + Filter::getName();
-        }
-    };
+            /** combines a particle species with a filter
+             *
+             * @tparam T_Species picongpu::Particle, type of the species
+             * @tparam T_Filter pmacc::filter::Interface, type of the filter
+             */
+            template<typename T_Species, typename T_Filter = particles::filter::All>
+            struct SpeciesFilter
+            {
+                using Filter = T_Filter;
+                using Species = T_Species;
 
-    /** species without a filter
-     *
-     * This class fulfills the interface of SpeciesFilter for a species
-     * but keeps the species name without adding the filter suffix.
-     */
-    template< typename T_Species >
-    struct UnfilteredSpecies
-    {
-        using Filter = particles::filter::All;
-        using Species = T_Species;
+                /** name of the filtered species
+                 *
+                 * @return `<speciesName>_<filterName>`
+                 */
+                static std::string getName()
+                {
+                    return Species::FrameType::getName() + "_" + Filter::getName();
+                }
+            };
 
-        /** get name of the filtered species
-         *
-         * @return <speciesName>
-         */
-        static std::string getName()
-        {
-            return Species::FrameType::getName();
-        }
-    };
+            /** species without a filter
+             *
+             * This class fulfills the interface of SpeciesFilter for a species
+             * but keeps the species name without adding the filter suffix.
+             */
+            template<typename T_Species>
+            struct UnfilteredSpecies
+            {
+                using Filter = particles::filter::All;
+                using Species = T_Species;
 
-namespace speciesFilter
-{
-    /** evaluate if the filter and species combination is valid
-     *
-     * @tparam T_SpeciesFilter SpeciesFilter, type of the filter and species
-     * @return ::type boost::mpl::bool_<>, if the species is eligible for the filter
-     */
-    template< typename T_SpeciesFilter >
-    struct IsEligible
-    {
-        using type = typename particles::traits::SpeciesEligibleForSolver<
-            typename T_SpeciesFilter::Species,
-            typename T_SpeciesFilter::Filter
-        >::type;
-    };
-} // namespace speciesFilter
+                /** get name of the filtered species
+                 *
+                 * @return <speciesName>
+                 */
+                static std::string getName()
+                {
+                    return Species::FrameType::getName();
+                }
+            };
+
+            namespace speciesFilter
+            {
+                /** evaluate if the filter and species combination is valid
+                 *
+                 * @tparam T_SpeciesFilter SpeciesFilter, type of the filter and species
+                 * @return ::type boost::mpl::bool_<>, if the species is eligible for the filter
+                 */
+                template<typename T_SpeciesFilter>
+                struct IsEligible
+                {
+                    using type = typename particles::traits::SpeciesEligibleForSolver<
+                        typename T_SpeciesFilter::Species,
+                        typename T_SpeciesFilter::Filter>::type;
+                };
+            } // namespace speciesFilter
 
-} //namespace misc
-} //namespace plugins
-} //namespace picongpu
+        } // namespace misc
+    } // namespace plugins
+} // namespace picongpu
diff --git a/include/picongpu/plugins/misc/concatenateToString.hpp b/include/picongpu/plugins/misc/concatenateToString.hpp
index 162d53f1e9..4f18adc911 100644
--- a/include/picongpu/plugins/misc/concatenateToString.hpp
+++ b/include/picongpu/plugins/misc/concatenateToString.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2017-2020 Rene Widera
+/* Copyright 2017-2021 Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -25,36 +25,28 @@
 
 namespace picongpu
 {
-namespace plugins
-{
-namespace misc
-{
-    /** concatenate all values of an string container
-     *
-     * @tparam T_Container type of the container
-     *
-     * @param vector source container (required interface: `begin(), end()`)
-     * @param separator separator between two elements
-     */
-    template< typename T_Container >
-    std::string concatenateToString(
-        T_Container & container,
-        std::string const & separator = ","
-    )
+    namespace plugins
     {
-        return std::accumulate(
-            container.begin(),
-            container.end(),
-            std::string(),
-            [ & ](
-                std::string & result,
-                std::string & inString
-            )
+        namespace misc
+        {
+            /** concatenate all values of a string container
+             *
+             * @tparam T_Container type of the container
+             *
+             * @param container source container (required interface: `begin(), end()`)
+             * @param separator separator between two elements
+             */
+            template<typename T_Container>
+            std::string concatenateToString(T_Container& container, std::string const& separator = ",")
             {
-                return result.empty() ? inString : result + separator + inString;
+                return std::accumulate(
+                    container.begin(),
+                    container.end(),
+                    std::string(),
+                    [&](std::string& result, std::string& inString) {
+                        return result.empty() ? inString : result + separator + inString;
+                    });
             }
-        );
-    }
-} // namespace misc
-} // namespace plugins
+        } // namespace misc
+    } // namespace plugins
 } // namespace picongpu
diff --git a/include/picongpu/plugins/misc/containsObject.hpp b/include/picongpu/plugins/misc/containsObject.hpp
index aa109be2cf..96c51632a3 100644
--- a/include/picongpu/plugins/misc/containsObject.hpp
+++ b/include/picongpu/plugins/misc/containsObject.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2017-2020 Rene Widera
+/* Copyright 2017-2021 Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -24,32 +24,25 @@
 
 namespace picongpu
 {
-namespace plugins
-{
-namespace misc
-{
-    /** search for an element within a STL container
-     *
-     * @tparam T_Container standard container, type of the container
-     *
-     * @param container object to query
-     * @param value object to search
-     * @return true if container contains the element, else false
-     */
-    template< typename T_Container >
-    bool containsObject(
-        T_Container const & container,
-        typename T_Container::value_type const & value
-    )
+    namespace plugins
     {
-        auto it = std::find(
-            container.begin(),
-            container.end(),
-            value
-        );
+        namespace misc
+        {
+            /** search for an element within a STL container
+             *
+             * @tparam T_Container standard container, type of the container
+             *
+             * @param container object to query
+             * @param value object to search
+             * @return true if container contains the element, else false
+             */
+            template<typename T_Container>
+            bool containsObject(T_Container const& container, typename T_Container::value_type const& value)
+            {
+                auto it = std::find(container.begin(), container.end(), value);
 
-        return it != container.end();
-    }
-} // namespace misc
-} // namespace plugins
+                return it != container.end();
+            }
+        } // namespace misc
+    } // namespace plugins
 } // namespace picongpu
diff --git a/include/picongpu/plugins/misc/misc.hpp b/include/picongpu/plugins/misc/misc.hpp
index 063abadf00..aa74c9e77e 100644
--- a/include/picongpu/plugins/misc/misc.hpp
+++ b/include/picongpu/plugins/misc/misc.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2017-2020 Rene Widera
+/* Copyright 2017-2021 Rene Widera
  *
  * This file is part of PIConGPU.
  *
diff --git a/include/picongpu/plugins/misc/removeSpaces.cpp b/include/picongpu/plugins/misc/removeSpaces.cpp
index 46765b744e..f7342548e8 100644
--- a/include/picongpu/plugins/misc/removeSpaces.cpp
+++ b/include/picongpu/plugins/misc/removeSpaces.cpp
@@ -1,4 +1,4 @@
-/* Copyright 2017-2020 Rene Widera
+/* Copyright 2017-2021 Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -25,23 +25,16 @@
 
 namespace picongpu
 {
-namespace plugins
-{
-namespace misc
-{
-    std::string removeSpaces( std::string value )
+    namespace plugins
     {
-        value.erase(
-            std::remove(
-                value.begin(),
-                value.end(),
-                ' '
-            ),
-            value.end()
-        );
+        namespace misc
+        {
+            std::string removeSpaces(std::string value)
+            {
+                value.erase(std::remove(value.begin(), value.end(), ' '), value.end());
 
-        return value;
-    }
-} // namespace misc
-} // namespace plugins
+                return value;
+            }
+        } // namespace misc
+    } // namespace plugins
 } // namespace picongpu
diff --git a/include/picongpu/plugins/misc/removeSpaces.hpp b/include/picongpu/plugins/misc/removeSpaces.hpp
index c897c886eb..989e24e686 100644
--- a/include/picongpu/plugins/misc/removeSpaces.hpp
+++ b/include/picongpu/plugins/misc/removeSpaces.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2017-2020 Rene Widera
+/* Copyright 2017-2021 Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -24,16 +24,16 @@
 
 namespace picongpu
 {
-namespace plugins
-{
-namespace misc
-{
-    /** removes all spaces within a string
-     *
-     * @param value input string
-     * @return string without any spaces
-     */
-    std::string removeSpaces( std::string value );
-} // namespace misc
-} // namespace plugins
+    namespace plugins
+    {
+        namespace misc
+        {
+            /** removes all spaces within a string
+             *
+             * @param value input string
+             * @return string without any spaces
+             */
+            std::string removeSpaces(std::string value);
+        } // namespace misc
+    } // namespace plugins
 } // namespace picongpu
diff --git a/include/picongpu/plugins/misc/splitString.cpp b/include/picongpu/plugins/misc/splitString.cpp
index dd827f3f77..d6aaa459b5 100644
--- a/include/picongpu/plugins/misc/splitString.cpp
+++ b/include/picongpu/plugins/misc/splitString.cpp
@@ -1,4 +1,4 @@
-/* Copyright 2017-2020 Rene Widera
+/* Copyright 2017-2021 Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -26,30 +26,19 @@
 
 namespace picongpu
 {
-namespace plugins
-{
-namespace misc
-{
-    std::vector< std::string > splitString(
-        std::string const & input,
-        std::string const & regex
-    )
+    namespace plugins
     {
-        std::regex re( regex );
-        // passing -1 as the submatch index parameter performs splitting
-        std::sregex_token_iterator first{
-            input.begin(),
-            input.end(),
-            re,
-            -1
-        };
-        std::sregex_token_iterator last;
+        namespace misc
+        {
+            std::vector<std::string> splitString(std::string const& input, std::string const& regex)
+            {
+                std::regex re(regex);
+                // passing -1 as the submatch index parameter performs splitting
+                std::sregex_token_iterator first{input.begin(), input.end(), re, -1};
+                std::sregex_token_iterator last;
 
-        return {
-            first,
-            last
-        };
-    }
-} // namespace misc
-} // namespace plugins
+                return {first, last};
+            }
+        } // namespace misc
+    } // namespace plugins
 } // namespace picongpu
diff --git a/include/picongpu/plugins/misc/splitString.hpp b/include/picongpu/plugins/misc/splitString.hpp
index 09fd8d268c..d5590a5f33 100644
--- a/include/picongpu/plugins/misc/splitString.hpp
+++ b/include/picongpu/plugins/misc/splitString.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2017-2020 Rene Widera
+/* Copyright 2017-2021 Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -25,24 +25,21 @@
 
 namespace picongpu
 {
-namespace plugins
-{
-namespace misc
-{
-    /** split a string in a vector of strings
-     *
-     * Based on Stack Overflow post:
-     *   source: https://stackoverflow.com/a/28142357
-     *   author: Marcin
-     *   date: Jan 25 '15
-     *
-     * @param input string to split
-     * @param regex separator between two elements
-     */
-    std::vector< std::string > splitString(
-        std::string const & input,
-        std::string const & regex = ","
-    );
-} // namespace misc
-} // namespace plugins
+    namespace plugins
+    {
+        namespace misc
+        {
+            /** split a string in a vector of strings
+             *
+             * Based on Stack Overflow post:
+             *   source: https://stackoverflow.com/a/28142357
+             *   author: Marcin
+             *   date: Jan 25 '15
+             *
+             * @param input string to split
+             * @param regex separator between two elements
+             */
+            std::vector<std::string> splitString(std::string const& input, std::string const& regex = ",");
+        } // namespace misc
+    } // namespace plugins
 } // namespace picongpu
diff --git a/include/picongpu/plugins/multi/IHelp.hpp b/include/picongpu/plugins/multi/IHelp.hpp
index 67c66779f1..bf5c2775ae 100644
--- a/include/picongpu/plugins/multi/IHelp.hpp
+++ b/include/picongpu/plugins/multi/IHelp.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2017-2020 Rene Widera
+/* Copyright 2017-2021 Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -25,54 +25,53 @@
 
 namespace picongpu
 {
-namespace plugins
-{
-namespace multi
-{
-
-    //! Interface to expose a help of a plugin
-    struct IHelp
+    namespace plugins
     {
-        //! creates a ISlave instance
-        virtual std::shared_ptr< ISlave > create(
-            std::shared_ptr< IHelp > & help,
-            size_t const id,
-            MappingDesc* cellDescription
-        ) = 0;
+        namespace multi
+        {
+            //! Interface to expose a help of a plugin
+            struct IHelp
+            {
+                //! creates an ISlave instance
+                virtual std::shared_ptr<ISlave> create(
+                    std::shared_ptr<IHelp>& help,
+                    size_t const id,
+                    MappingDesc* cellDescription)
+                    = 0;
 
-        /** register help options
-         *
-         * The options are used if the plugin is a ISlave and is handling
-         * there own notification period.
-         */
-        virtual void registerHelp(
-            boost::program_options::options_description & desc,
-            std::string const & masterPrefix = std::string{ }
-        ) = 0;
+                /** register help options
+                 *
+                 * The options are used if the plugin is an ISlave and is handling
+                 * its own notification period.
+                 */
+                virtual void registerHelp(
+                    boost::program_options::options_description& desc,
+                    std::string const& masterPrefix = std::string{})
+                    = 0;
 
-        /** register independent help options
-         *
-         * This options can be used even if the plugin is not handling there
-         * own notification period.
-         */
-        virtual void expandHelp(
-            boost::program_options::options_description & desc,
-            std::string const & masterPrefix = std::string{ }
-        ) = 0;
+                /** register independent help options
+                 *
+                 * These options can be used even if the plugin is not handling its
+                 * own notification period.
+                 */
+                virtual void expandHelp(
+                    boost::program_options::options_description& desc,
+                    std::string const& masterPrefix = std::string{})
+                    = 0;
 
-        //! validate if the command line interface options are well formated
-        virtual void validateOptions() = 0;
+                //! validate if the command line interface options are well formatted
+                virtual void validateOptions() = 0;
 
-        //! number of plugin which must be created
-        virtual size_t getNumPlugins() const = 0;
+                //! number of plugins which must be created
+                virtual size_t getNumPlugins() const = 0;
 
-        //! short description of the plugin functionality
-        virtual std::string getDescription() const = 0;
+                //! short description of the plugin functionality
+                virtual std::string getDescription() const = 0;
 
-        //! name of the plugin
-        virtual std::string getName() const = 0;
-    };
+                //! name of the plugin
+                virtual std::string getName() const = 0;
+            };
 
-} // namespace multi
-} // namespace plugins
+        } // namespace multi
+    } // namespace plugins
 } // namespace picongpu
diff --git a/include/picongpu/plugins/multi/ISlave.hpp b/include/picongpu/plugins/multi/ISlave.hpp
index 5eedeba45a..2c040a6de8 100644
--- a/include/picongpu/plugins/multi/ISlave.hpp
+++ b/include/picongpu/plugins/multi/ISlave.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2017-2020 Rene Widera
+/* Copyright 2017-2021 Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -27,37 +27,31 @@
 
 namespace picongpu
 {
-namespace plugins
-{
-namespace multi
-{
-    struct IHelp;
-
-    /** Interface for a slave plugin
-     *
-     * A plugin which fulfil l this interface can be used as slave plugin for
-     * multi::Master.
-     *
-     * A slave must register itself to the PluginConnector to receive the notify calls.
-     */
-    struct ISlave : public pmacc::INotify
+    namespace plugins
     {
-        //! must be implemented by the user
-        static std::shared_ptr< IHelp > getHelp();
-
-        //! restart the plugin from a checkpoint
-        virtual void restart(
-            uint32_t restartStep,
-            std::string const & restartDirectory
-        ) = 0;
-
-        //! create a check point forthe plugin
-        virtual void checkpoint(
-            uint32_t currentStep,
-            std::string const & checkpointDirectory
-        ) = 0;
-    };
-
-} // namespace multi
-} // namespace plugins
+        namespace multi
+        {
+            struct IHelp;
+
+            /** Interface for a slave plugin
+             *
+             * A plugin which fulfills this interface can be used as a slave plugin for
+             * multi::Master.
+             *
+             * A slave must register itself to the PluginConnector to receive the notify calls.
+             */
+            struct ISlave : public pmacc::INotify
+            {
+                //! must be implemented by the user
+                static std::shared_ptr<IHelp> getHelp();
+
+                //! restart the plugin from a checkpoint
+                virtual void restart(uint32_t restartStep, std::string const& restartDirectory) = 0;
+
+                //! create a checkpoint for the plugin
+                virtual void checkpoint(uint32_t currentStep, std::string const& checkpointDirectory) = 0;
+            };
+
+        } // namespace multi
+    } // namespace plugins
 } // namespace picongpu
diff --git a/include/picongpu/plugins/multi/Master.hpp b/include/picongpu/plugins/multi/Master.hpp
index 42706b19e0..8bad1f7afc 100644
--- a/include/picongpu/plugins/multi/Master.hpp
+++ b/include/picongpu/plugins/multi/Master.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2017-2020 Rene Widera
+/* Copyright 2017-2021 Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -32,140 +32,109 @@
 
 namespace picongpu
 {
-namespace plugins
-{
-namespace multi
-{
-    /** Master class to create multi plugins
-     *
-     * Create and handle a plugin as multi plugin. Parameter of a multi plugin
-     * can be used multiple times on the command line.
-     *
-     * @tparam T_Slave type of the plugin (must inherit from ISlave)
-     */
-    template< typename T_Slave >
-    class Master : public ISimulationPlugin
+    namespace plugins
     {
-    public:
-
-        using Slave = T_Slave;
-        using SlaveList =  std::list< std::shared_ptr< ISlave > >;
-        SlaveList slaveList;
-
-        std::shared_ptr< IHelp > slaveHelp;
-
-        MappingDesc* m_cellDescription = nullptr;
-
-        Master( ) : slaveHelp( Slave::getHelp() )
-        {
-            Environment<>::get( ).PluginConnector( ).registerPlugin(this);
-        }
-
-        virtual ~Master( )
-        {
-
-        }
-
-        std::string pluginGetName( ) const
-        {
-            // the PMacc plugin system needs a short description instead of the plugin name
-            return slaveHelp->getName( ) + ": " + slaveHelp->getDescription( );
-        }
-
-        void pluginRegisterHelp( boost::program_options::options_description& desc )
-        {
-            slaveHelp->registerHelp( desc );
-        }
-
-        void setMappingDescription( MappingDesc* cellDescription )
+        namespace multi
         {
-            m_cellDescription = cellDescription;
-        }
-
-        /** restart a checkpoint
-         *
-         * Trigger the method restart() for all slave instances.
-         */
-        void restart(
-            uint32_t restartStep,
-            std::string const restartDirectory
-        )
-        {
-            for( auto & slave : slaveList )
-                slave->restart(
-                    restartStep,
-                    restartDirectory
-                );
-        }
-
-        /** create a checkpoint
-         *
-         * Trigger the method checkpoint() for all slave instances.
-         */
-        void checkpoint(
-            uint32_t currentStep,
-            std::string const checkpointDirectory
-        )
-        {
-            for( auto & slave : slaveList )
-                slave->checkpoint(
-                    currentStep,
-                    checkpointDirectory
-                );
-        }
-
-    private:
-
-        void pluginLoad( )
-        {
-            size_t const numSlaves = slaveHelp->getNumPlugins( );
-            if( numSlaves > 0u )
-                slaveHelp->validateOptions( );
-            for( size_t i = 0; i < numSlaves; ++i )
+            /** Master class to create multi plugins
+             *
+             * Create and handle a plugin as multi plugin. Parameter of a multi plugin
+             * can be used multiple times on the command line.
+             *
+             * @tparam T_Slave type of the plugin (must inherit from ISlave)
+             */
+            template<typename T_Slave>
+            class Master : public ISimulationPlugin
             {
-                slaveList.emplace_back(
-                    slaveHelp->create(
-                        slaveHelp,
-                        i,
-                        m_cellDescription
-                    )
-                );
-            }
-        }
-
-        void pluginUnload( )
-        {
-            slaveList.clear( );
-        }
-
-        void notify(uint32_t currentStep)
-        {
-            // nothing to do here
-        }
-
-    };
-
-} // namespace multi
-} // namespace plugins
-
-namespace particles
-{
-namespace traits
-{
-    template<
-        typename T_Species,
-        typename T_Slave
-    >
-    struct SpeciesEligibleForSolver<
-        T_Species,
-        plugins::multi::Master< T_Slave >
-    >
+            public:
+                using Slave = T_Slave;
+                using SlaveList = std::list<std::shared_ptr<ISlave>>;
+                SlaveList slaveList;
+
+                std::shared_ptr<IHelp> slaveHelp;
+
+                MappingDesc* m_cellDescription = nullptr;
+
+                Master() : slaveHelp(Slave::getHelp())
+                {
+                    Environment<>::get().PluginConnector().registerPlugin(this);
+                }
+
+                virtual ~Master()
+                {
+                }
+
+                std::string pluginGetName() const
+                {
+                    // the PMacc plugin system needs a short description instead of the plugin name
+                    return slaveHelp->getName() + ": " + slaveHelp->getDescription();
+                }
+
+                void pluginRegisterHelp(boost::program_options::options_description& desc)
+                {
+                    slaveHelp->registerHelp(desc);
+                }
+
+                void setMappingDescription(MappingDesc* cellDescription)
+                {
+                    m_cellDescription = cellDescription;
+                }
+
+                /** restart a checkpoint
+                 *
+                 * Trigger the method restart() for all slave instances.
+                 */
+                void restart(uint32_t restartStep, std::string const restartDirectory)
+                {
+                    for(auto& slave : slaveList)
+                        slave->restart(restartStep, restartDirectory);
+                }
+
+                /** create a checkpoint
+                 *
+                 * Trigger the method checkpoint() for all slave instances.
+                 */
+                void checkpoint(uint32_t currentStep, std::string const checkpointDirectory)
+                {
+                    for(auto& slave : slaveList)
+                        slave->checkpoint(currentStep, checkpointDirectory);
+                }
+
+            private:
+                void pluginLoad()
+                {
+                    size_t const numSlaves = slaveHelp->getNumPlugins();
+                    if(numSlaves > 0u)
+                        slaveHelp->validateOptions();
+                    for(size_t i = 0; i < numSlaves; ++i)
+                    {
+                        slaveList.emplace_back(slaveHelp->create(slaveHelp, i, m_cellDescription));
+                    }
+                }
+
+                void pluginUnload()
+                {
+                    slaveList.clear();
+                }
+
+                void notify(uint32_t currentStep)
+                {
+                    // nothing to do here
+                }
+            };
+
+        } // namespace multi
+    } // namespace plugins
+
+    namespace particles
     {
-        using type = typename SpeciesEligibleForSolver<
-            T_Species,
-            T_Slave
-        >::type;
-    };
-} // namespace traits
-} // namespace particles
+        namespace traits
+        {
+            template<typename T_Species, typename T_Slave>
+            struct SpeciesEligibleForSolver<T_Species, plugins::multi::Master<T_Slave>>
+            {
+                using type = typename SpeciesEligibleForSolver<T_Species, T_Slave>::type;
+            };
+        } // namespace traits
+    } // namespace particles
 } // namespace picongpu
diff --git a/include/picongpu/plugins/multi/Option.hpp b/include/picongpu/plugins/multi/Option.hpp
index b5a7677a90..063b0ab9aa 100644
--- a/include/picongpu/plugins/multi/Option.hpp
+++ b/include/picongpu/plugins/multi/Option.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2017-2020 Rene Widera
+/* Copyright 2017-2021 Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -27,169 +27,161 @@
 
 namespace picongpu
 {
-namespace plugins
-{
-namespace multi
-{
-
-    /** multi option storage
-     *
-     * This option stores the values of a multi command line option
-     * and allows to set a default value.
-     *
-     * @tparam T_ValueType type of the option
-     */
-    template< typename T_ValueType >
-    struct Option : public std::vector< T_ValueType >
+    namespace plugins
     {
-        using StorageType = std::vector< T_ValueType >;
-
-        //! type of the value
-        using ValueType = T_ValueType;
-
-
-        /** create a option with a default value
-         *
-         * @param name name of the option
-         * @param description description for the option
-         * @param defaultValue default value of the option
-         */
-        Option(
-            std::string const & name,
-            std::string const & description,
-            ValueType const & defaultValue
-        ) :
-            m_name( name ),
-            m_description( description ),
-            m_defaultValue( defaultValue ),
-            m_hasDefaultValue( true )
-        {
-        }
-
-        /** create a option without a default value
-         *
-         * @param name name of the option
-         * @param description description for the option
-         */
-        Option(
-            std::string const & name,
-            std::string const & description
-        ) :
-            m_name( name ),
-            m_description( description ),
-            m_hasDefaultValue( false )
-        {
-        }
-
-        /** get the name of the option
-         *
-         * @return name
-         */
-        std::string getName()
-        {
-            return m_name;
-        }
-
-        /** get the description of the option
-         *
-         * @return description
-         */
-        std::string getDescription()
-        {
-            return m_description;
-        }
-
-        /** register the option
-         *
-         * @param desc option object where the option is appended
-         * @param prefix prefix to add to the option name
-         * @param additionalDescription extent the default description
-         */
-        void registerHelp(
-            boost::program_options::options_description & desc,
-            std::string const & prefix = std::string{ },
-            std::string const & additionalDescription = std::string{ }
-        )
-        {
-            std::string printDefault;
-            if( m_hasDefaultValue )
-                printDefault = std::string( " | default: " ) + getDefaultAsStr();
-
-            desc.add_options( )(
-                ( prefix + "." + getName() ).c_str( ),
-                boost::program_options::value( getStorage() )->multitoken( ),
-                ( getDescription() + additionalDescription + printDefault ).c_str()
-            );
-        }
-
-        /** get the default value
-         *
-         * Throw an exception if there is no default value defined.
-         *
-         * @param get the default value defined during the construction of this class
-         */
-        T_ValueType getDefault()
-        {
-            if( !m_hasDefaultValue )
-                throw std::runtime_error( std::string("There is no default value defined for the option: " ) + getName() );
-            return m_defaultValue;
-        }
-
-        /** set a default value
-         *
-         * The old default value will be overwritten if already exists.
-         *
-         * @param defaultValue new default value
-         */
-        void setDefault( T_ValueType const & defaultValue )
+        namespace multi
         {
-            m_hasDefaultValue = true;
-            m_defaultValue = defaultValue;
-        }
-
-        //! get the default value as string
-        std::string getDefaultAsStr()
-        {
-            std::stringstream ss;
-            ss << getDefault( );
-            return ss.str();
-        }
-
-        /** get the value set by the user
-         *
-         * Throw an exception if there is no default value defined and idx is
-         * larger than the number of options provided by the user.
-         *
-         * @param idx index of the multi plugin
-         * @return if number of user provided option <= idx then the user defined
-         *         value else the default value if defined
-         */
-        T_ValueType get( uint32_t idx )
-        {
-            if( StorageType::size() <= idx )
+            /** multi option storage
+             *
+             * This option stores the values of a multi command line option
+             * and allows to set a default value.
+             *
+             * @tparam T_ValueType type of the option
+             */
+            template<typename T_ValueType>
+            struct Option : public std::vector<T_ValueType>
             {
-                if( !m_hasDefaultValue )
-                    throw std::runtime_error( std::string("There is no default value defined for the option " + getName() + " and idx is out of range") );
-                return m_defaultValue;
-            }
-
-            return StorageType::operator[]( idx );
-        }
-
-    private:
-
-        std::string const m_name;
-        std::string const m_description;
-
-        T_ValueType m_defaultValue;
-        bool m_hasDefaultValue = false;
-
-        StorageType* getStorage()
-        {
-            return static_cast<StorageType*>(this);
-        }
-    };
-
-} // namespace multi
-} // namespace plugins
+                using StorageType = std::vector<T_ValueType>;
+
+                //! type of the value
+                using ValueType = T_ValueType;
+
+
+                /** create an option with a default value
+                 *
+                 * @param name name of the option
+                 * @param description description for the option
+                 * @param defaultValue default value of the option
+                 */
+                Option(std::string const& name, std::string const& description, ValueType const& defaultValue)
+                    : m_name(name)
+                    , m_description(description)
+                    , m_defaultValue(defaultValue)
+                    , m_hasDefaultValue(true)
+                {
+                }
+
+                /** create an option without a default value
+                 *
+                 * @param name name of the option
+                 * @param description description for the option
+                 */
+                Option(std::string const& name, std::string const& description)
+                    : m_name(name)
+                    , m_description(description)
+                    , m_hasDefaultValue(false)
+                {
+                }
+
+                /** get the name of the option
+                 *
+                 * @return name
+                 */
+                std::string getName()
+                {
+                    return m_name;
+                }
+
+                /** get the description of the option
+                 *
+                 * @return description
+                 */
+                std::string getDescription()
+                {
+                    return m_description;
+                }
+
+                /** register the option
+                 *
+                 * @param desc option object where the option is appended
+                 * @param prefix prefix to add to the option name
+                 * @param additionalDescription text appended to the default description
+                 */
+                void registerHelp(
+                    boost::program_options::options_description& desc,
+                    std::string const& prefix = std::string{},
+                    std::string const& additionalDescription = std::string{})
+                {
+                    std::string printDefault;
+                    if(m_hasDefaultValue)
+                        printDefault = std::string(" | default: ") + getDefaultAsStr();
+
+                    desc.add_options()(
+                        (prefix + "." + getName()).c_str(),
+                        boost::program_options::value(getStorage())->multitoken(),
+                        (getDescription() + additionalDescription + printDefault).c_str());
+                }
+
+                /** get the default value
+                 *
+                 * Throws std::runtime_error if there is no default value defined.
+                 *
+                 * @return the default value set by the constructor or by setDefault()
+                 */
+                T_ValueType getDefault()
+                {
+                    if(!m_hasDefaultValue)
+                        throw std::runtime_error(
+                            std::string("There is no default value defined for the option: ") + getName());
+                    return m_defaultValue;
+                }
+
+                /** set a default value
+                 *
+                 * The old default value will be overwritten if already exists.
+                 *
+                 * @param defaultValue new default value
+                 */
+                void setDefault(T_ValueType const& defaultValue)
+                {
+                    m_hasDefaultValue = true;
+                    m_defaultValue = defaultValue;
+                }
+
+                //! get the default value as string
+                std::string getDefaultAsStr()
+                {
+                    std::stringstream ss;
+                    ss << getDefault();
+                    return ss.str();
+                }
+
+                /** get the value set by the user
+                 *
+                 * Throws std::runtime_error if there is no default value defined and
+                 * idx is not smaller than the number of values provided by the user.
+                 *
+                 * @param idx index of the multi plugin
+                 * @return the user defined value if idx < number of user provided values,
+                 *         else the default value if defined
+                 */
+                T_ValueType get(uint32_t idx)
+                {
+                    if(StorageType::size() <= idx)
+                    {
+                        if(!m_hasDefaultValue)
+                            throw std::runtime_error(std::string(
+                                "There is no default value defined for the option " + getName()
+                                + " and idx is out of range"));
+                        return m_defaultValue;
+                    }
+
+                    return StorageType::operator[](idx);
+                }
+
+            private:
+                std::string const m_name;
+                std::string const m_description;
+
+                T_ValueType m_defaultValue;
+                bool m_hasDefaultValue = false;
+
+                StorageType* getStorage()
+                {
+                    return static_cast<StorageType*>(this);
+                }
+            };
+
+        } // namespace multi
+    } // namespace plugins
 } // namespace picongpu
diff --git a/include/picongpu/plugins/multi/multi.hpp b/include/picongpu/plugins/multi/multi.hpp
index 0cd75d9005..56481a4cbb 100644
--- a/include/picongpu/plugins/multi/multi.hpp
+++ b/include/picongpu/plugins/multi/multi.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2017-2020 Rene Widera
+/* Copyright 2017-2021 Rene Widera
  *
  * This file is part of PIConGPU.
  *
diff --git a/include/picongpu/plugins/openPMD/Json.cpp b/include/picongpu/plugins/openPMD/Json.cpp
new file mode 100644
index 0000000000..42bcb280e1
--- /dev/null
+++ b/include/picongpu/plugins/openPMD/Json.cpp
@@ -0,0 +1,335 @@
+/* Copyright 2021 Franz Poeschel
+ *
+ * This file is part of PIConGPU.
+ *
+ * PIConGPU is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * PIConGPU is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with PIConGPU.
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#if ENABLE_OPENPMD == 1
+
+#    include "picongpu/plugins/openPMD/Json.hpp"
+#    include "picongpu/plugins/openPMD/Json_private.hpp"
+
+#    include <algorithm> // std::copy_n, std::find
+#    include <cctype> // std::isspace
+#    include <fstream>
+#    include <sstream>
+
+/*
+ * Note:
+ * This is a hostonly .cpp file because CMake will not use -isystem for system
+ * include paths on NVCC targets created with cupla_add_executable.
+ * Since <nlohmann/json.hpp> throws a number of warnings, this .cpp file
+ * ensures that NVCC never sees that library.
+ */
+
+// Anonymous namespace so these helpers don't get exported
+namespace
+{
+    /**
+     * @brief Strip leading and trailing characters accepted by a predicate.
+     *
+     * @tparam F Type of the predicate to_remove, callable with a char.
+     * @param s String to trim.
+     * @param to_remove Predicate returning true for characters to strip.
+     * @return Copy of s without the stripped characters; empty if to_remove
+     *         accepts every character of s.
+     */
+    template<typename F>
+    std::string trim(std::string const& s, F&& to_remove)
+    {
+        // advance to the first character from the front that must be kept
+        auto first = s.begin();
+        while(first != s.end() && to_remove(*first))
+        {
+            ++first;
+        }
+        // advance to the first character from the back that must be kept
+        auto last = s.rbegin();
+        while(last != s.rend() && to_remove(*last))
+        {
+            ++last;
+        }
+        // If everything was stripped, last.base() lies before first: the length
+        // wraps around to a huge value, which substr() clamps to the empty rest.
+        return s.substr(first - s.begin(), last.base() - first);
+    }
+
+    /**
+     * @brief Check whether the string points to a filename or not.
+     *
+     * A string is considered to point to a filename if its first
+     * non-whitespace character is an '@'.
+     * The filename will be trimmed of whitespace using trim().
+     *
+     * @param unparsed The string that possibly points to a file.
+     * @return The filename if the string points to a file, an empty string
+     *         otherwise (also for an empty or whitespace-only input).
+     *
+     * @todo Upon switching to C++17, use std::optional to make the return
+     *       type clearer.
+     *       Until then, this is somewhat safe anyway since filenames need
+     *       to be non-empty.
+     */
+    std::string extractFilename(std::string const& unparsed)
+    {
+        // std::isspace() has undefined behavior for negative char values,
+        // hence the conversion to unsigned char
+        auto const isWhitespace = [](char c) { return std::isspace(static_cast<unsigned char>(c)) != 0; };
+        std::string const trimmed = trim(unparsed, isWhitespace);
+        // An empty or whitespace-only string cannot name a file; checking for
+        // it first avoids indexing into an empty string.
+        if(trimmed.empty() || trimmed.front() != '@')
+        {
+            return {};
+        }
+        return trim(trimmed.substr(1), isWhitespace);
+    }
+
+    /**
+     * @brief Read a file in MPI-collective manner.
+     *
+     * The file is read on rank 0 and its contents are then broadcast to all
+     * ranks. A read failure on rank 0 is reported by an exception on every rank.
+     *
+     * @param path Path of the file to read.
+     * @param comm MPI communicator; every rank of it has to call this function.
+     * @return Full file content, the same on all ranks.
+     */
+    std::string collective_file_read(std::string const& path, MPI_Comm comm)
+    {
+        int rank;
+        MPI_Comm_rank(comm, &rank);
+        std::string res;
+        size_t stringLength = 0; // includes the terminating '\0'; stays 0 if rank 0 fails to read
+        if(rank == 0)
+        {
+            std::fstream handle(path, std::ios_base::in);
+            std::stringstream stream;
+            stream << handle.rdbuf();
+            if(handle.good())
+            {
+                res = stream.str();
+                stringLength = res.size() + 1;
+            }
+        }
+        int err = MPI_Bcast(&stringLength, 1, MPI_Types<size_t>{}.value, 0, comm);
+        if(err)
+        {
+            throw std::runtime_error("[collective_file_read] MPI_Bcast stringLength failure.");
+        }
+        // throw on all ranks: if only rank 0 bailed out, the others would wait forever in MPI_Bcast
+        if(stringLength == 0)
+        {
+            throw std::runtime_error("Failed reading JSON config from file " + path + ".");
+        }
+        std::vector<char> recvbuf(stringLength, 0);
+        if(rank == 0)
+        {
+            std::copy_n(res.c_str(), stringLength, recvbuf.data());
+        }
+        err = MPI_Bcast(recvbuf.data(), stringLength, MPI_CHAR, 0, comm);
+        if(err)
+        {
+            throw std::runtime_error("[collective_file_read] MPI_Bcast file content failure.");
+        }
+        if(rank != 0)
+        {
+            res = recvbuf.data();
+        }
+        return res;
+    }
+
+    /**
+     * @brief Read one entry of an extended JSON configuration.
+     *
+     * @param patterns Extended by one pattern per string found under 'select'.
+     * @param defaultConfig Overwritten with 'cfg' if the entry has no key 'select'.
+     * @param object The entry, expected to be a JSON object with the key 'cfg'.
+     * @return Whether the entry was a default configuration or a pattern.
+     * @throws std::runtime_error if the entry does not have the expected layout.
+     */
+    KindOfConfig readPattern(
+        std::vector<Pattern>& patterns,
+        nlohmann::json& defaultConfig,
+        nlohmann::json const& object)
+    {
+        static std::string const errorMsg = R"END(
+[openPMD plugin] Each single pattern in an extended JSON configuration
+must be a JSON object with keys 'select' and 'cfg'.
+The key 'select' is optional, indicating a default configuration if it is
+not set.
+The key 'select' must point to either a single string or an array of strings.)END";
+
+        if(!object.is_object() || !object.contains("cfg"))
+        {
+            throw std::runtime_error(errorMsg);
+        }
+        nlohmann::json const& cfg = object.at("cfg");
+        if(!object.contains("select"))
+        {
+            defaultConfig = cfg;
+            return KindOfConfig::Default;
+        }
+        nlohmann::json const& pattern = object.at("select");
+        auto cfgShared = std::make_shared<nlohmann::json>(cfg);
+        if(pattern.is_string())
+        {
+            patterns.emplace_back(pattern.get<std::string>(), std::move(cfgShared));
+            return KindOfConfig::Pattern;
+        }
+        if(!pattern.is_array())
+        {
+            throw std::runtime_error(errorMsg);
+        }
+        patterns.reserve(patterns.size() + pattern.size()); // account for already stored patterns
+        for(auto const& entry : pattern)
+        {
+            if(!entry.is_string())
+            {
+                throw std::runtime_error(errorMsg); // instead of an unspecific nlohmann type_error
+            }
+            patterns.emplace_back(entry.get<std::string>(), cfgShared);
+        }
+        return KindOfConfig::Pattern;
+    }
+
+    /**
+     * @brief Set up the patterns and the default configuration from JSON.
+     *
+     * @param config A JSON object (forwarded as is to openPMD) or a JSON
+     *        array of patterns in the extended layout defined by PIConGPU.
+     */
+    void MatcherPerBackend::init(nlohmann::json const& config)
+    {
+        if(config.is_object())
+        {
+            // simple layout: the single global JSON object goes directly to openPMD
+            m_patterns.emplace_back("", std::make_shared<nlohmann::json>(config));
+            return;
+        }
+        if(!config.is_array())
+        {
+            throw std::runtime_error("[openPMD plugin] Expecting an object or an array as JSON configuration.");
+        }
+        // enhanced PIConGPU-defined layout: at most one entry may be a default
+        bool haveDefault = false;
+        for(auto const& entry : config)
+        {
+            if(readPattern(m_patterns, m_defaultConfig, entry) != KindOfConfig::Default)
+            {
+                continue;
+            }
+            if(haveDefault)
+            {
+                throw std::runtime_error("[openPMD plugin] Specified more than one default configuration.");
+            }
+            haveDefault = true;
+        }
+    }
+
+    /**
+     * @brief Get the JSON config of the first pattern matching a dataset path.
+     *
+     * @param datasetPath The path that is matched against the regex patterns.
+     * @return The matched JSON configuration, a null JSON value if nothing matched.
+     */
+    nlohmann::json const& MatcherPerBackend::get(std::string const& datasetPath) const
+    {
+        for(auto const& pattern : m_patterns)
+        {
+            if(std::regex_match(datasetPath, pattern.pattern))
+            {
+                return *pattern.config;
+            }
+        }
+        static nlohmann::json const emptyConfig; // null
+        return emptyConfig;
+    }
+} // namespace
+
+namespace picongpu
+{
+    namespace json
+    {
+        //! parse the config (or the file it names) and create the matchers of all recognized backends
+        void JsonMatcher::init(std::string const& config, MPI_Comm comm)
+        {
+            auto const filename = extractFilename(config);
+            m_wholeConfig = nlohmann::json::parse(filename.empty() ? config : collective_file_read(filename, comm));
+            if(!m_wholeConfig.is_object())
+            {
+                throw std::runtime_error("[openPMD plugin] Expected an object for the JSON configuration.");
+            }
+            m_perBackend.reserve(m_wholeConfig.size());
+            for(auto it = m_wholeConfig.begin(); it != m_wholeConfig.end(); ++it)
+            {
+                std::string const& backendName = it.key();
+                auto& backendCfg = it.value();
+                auto const known = std::find(m_recognizedBackends.begin(), m_recognizedBackends.end(), backendName);
+                if(known == m_recognizedBackends.end())
+                {
+                    continue; // keys that name no backend recognized by PIConGPU are ignored
+                }
+                if(!backendCfg.is_object())
+                {
+                    throw std::runtime_error(
+                        "[openPMD plugin] Each backend's configuration must be a JSON object (config for backend "
+                        + backendName + ").");
+                }
+                if(backendCfg.contains("dataset"))
+                {
+                    m_perBackend.emplace_back(PerBackend{backendName, MatcherPerBackend{backendCfg.at("dataset")}});
+                }
+            }
+        }
+        //! merge the "dataset" configs that the backends' matchers return for datasetPath, as JSON text
+        std::string JsonMatcher::get(std::string const& datasetPath) const
+        {
+            nlohmann::json result = nlohmann::json::object();
+            for(auto const& backend : m_perBackend)
+            {
+                auto const& datasetConfig = backend.matcher.get(datasetPath);
+                if(!datasetConfig.empty())
+                {
+                    result[backend.backendName]["dataset"] = datasetConfig;
+                }
+            }
+            return result.dump();
+        }
+
+        //! dump the whole config, "dataset" replaced by each backend's non-empty default config
+        std::string JsonMatcher::getDefault() const
+        {
+            nlohmann::json result = m_wholeConfig;
+            for(auto const& backend : m_perBackend)
+            {
+                auto const& datasetConfig = backend.matcher.getDefault();
+                if(!datasetConfig.empty())
+                {
+                    result[backend.backendName]["dataset"] = datasetConfig;
+                }
+            }
+            return result.dump();
+        }
+
+        std::unique_ptr<AbstractJsonMatcher> AbstractJsonMatcher::construct(std::string const& config, MPI_Comm comm)
+        {
+            return std::unique_ptr<AbstractJsonMatcher>{new JsonMatcher{config, comm}}; // caller owns the matcher
+        }
+    } // namespace json
+} // namespace picongpu
+
+#endif // ENABLE_OPENPMD
diff --git a/include/picongpu/plugins/openPMD/Json.hpp b/include/picongpu/plugins/openPMD/Json.hpp
new file mode 100644
index 0000000000..436e550287
--- /dev/null
+++ b/include/picongpu/plugins/openPMD/Json.hpp
@@ -0,0 +1,77 @@
+/* Copyright 2021 Franz Poeschel
+ *
+ * This file is part of PIConGPU.
+ *
+ * PIConGPU is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * PIConGPU is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with PIConGPU.
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include <mpi.h>
+
+#include <memory> // std::unique_ptr
+#include <string>
+
+namespace picongpu
+{
+    namespace json
+    {
+        /**
+         * @brief Class to handle extended JSON configurations as used by
+         *        the openPMD plugin.
+         *
+         * This class handles parsing of the extended JSON patterns as well as
+         * selection of one JSON configuration by regex.
+         *
+         * This is the abstract interface; instances are obtained via construct().
+         */
+        class AbstractJsonMatcher
+        {
+        public:
+            /**
+             * @brief Construct a JSON matcher to hand out dataset-specific configurations
+             *
+             * This function will parse the given config, after reading it
+             * from a file if needed. In this case, the constructor is
+             * MPI-collective.
+             * It will distinguish per backend between ordinary openPMD JSON configurations
+             * and extended configurations as defined by PIConGPU.
+             * If an ordinary JSON configuration was detected, given regex
+             * patterns will be matched against "" (the empty string).
+             *
+             * @param config The JSON configuration, exactly as in --openPMD.json.
+             * @param comm MPI communicator for collective file reading, if needed.
+             * @return Owning pointer to the newly created matcher.
+             */
+            static std::unique_ptr<AbstractJsonMatcher> construct(std::string const& config, MPI_Comm comm);
+
+            // virtual destructor: instances are owned and destroyed through a base-class pointer
+            virtual ~AbstractJsonMatcher() = default;
+
+            /**
+             * @brief Get the JSON config associated with a dataset.
+             *
+             * @param datasetPath Path of the dataset for which a configuration is
+             *                    requested. This is not itself a regex; presumably it
+             *                    is the string the configured patterns are matched
+             *                    against (to be confirmed in the implementation).
+             * @return The matched JSON configuration, as a string.
+             */
+            virtual std::string get(std::string const& datasetPath) const = 0;
+
+            /**
+             * @brief Get the default JSON config.
+             *
+             * @return The default JSON configuration, as a string.
+             */
+            virtual std::string getDefault() const = 0;
+        };
+    } // namespace json
+} // namespace picongpu
diff --git a/include/picongpu/plugins/openPMD/Json_private.hpp b/include/picongpu/plugins/openPMD/Json_private.hpp
new file mode 100644
index 0000000000..f6dec2da5a
--- /dev/null
+++ b/include/picongpu/plugins/openPMD/Json_private.hpp
@@ -0,0 +1,273 @@
+/* Copyright 2021 Franz Poeschel
+ *
+ * This file is part of PIConGPU.
+ *
+ * PIConGPU is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * PIConGPU is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with PIConGPU.
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "picongpu/plugins/openPMD/Json.hpp"
+
+#include <mpi.h>
+#include <nlohmann/json.hpp>
+
+#include <regex>
+#include <string>
+#include <vector>
+
+/*
+ * Note:
+ * This header is included only into hostonly .cpp files because CMake
+ * will not use -isystem for system include paths on NVCC targets created
+ * with cupla_add_executable.
+ * Since <nlohmann/json.hpp> throws a number of warnings, this design
+ * ensures that NVCC never sees that library.
+ */
+
+// Anonymous namespace so these helpers don't get exported
+namespace
+{
+    /**
+     * @brief Remove leading and trailing characters from a string.
+     *
+     * @tparam F Functor type for to_remove
+     * @param s String to trim. It is taken by const reference and not modified.
+     * @param to_remove Functor deciding which characters to remove.
+     * @return The trimmed string.
+     */
+    template<typename F>
+    std::string trim(std::string const& s, F&& to_remove);
+
+    /**
+     * @brief Check whether the string points to a filename or not.
+     *
+     * A string is considered to point to a filename if its first
+     * non-whitespace character is an '@'.
+     * The filename will be trimmed of whitespace using trim().
+     *
+     * @param unparsed The string that possibly points to a file.
+     * @return The filename if the string points to the file, an empty
+     *         string otherwise.
+     *
+     * @todo Upon switching to C++17, use std::optional to make the return
+     *       type clearer.
+     *       Until then, this is somewhat safe anyway since filenames need
+     *       to be non-empty.
+     */
+    std::string extractFilename(std::string const& unparsed);
+
+    /**
+     * @brief Helper class to help figure out a platform-independent
+     *        MPI_Datatype for size_t.
+     *
+     * The primary template is only declared; using it with a type that has no
+     * specialization below is therefore a compile-time error.
+     * Usage sketch: MPI_Types<size_t>{}.value
+     */
+    template<typename>
+    struct MPI_Types;
+
+    /** MPI datatype for unsigned long */
+    template<>
+    struct MPI_Types<unsigned long>
+    {
+        // can't make this constexpr due to MPI
+        // so, make this non-static for simplicity
+        MPI_Datatype value = MPI_UNSIGNED_LONG;
+    };
+
+    /** MPI datatype for unsigned long long */
+    template<>
+    struct MPI_Types<unsigned long long>
+    {
+        // non-static member for the same reason as in MPI_Types<unsigned long>
+        MPI_Datatype value = MPI_UNSIGNED_LONG_LONG;
+    };
+
+    /** MPI datatype for unsigned (int) */
+    template<>
+    struct MPI_Types<unsigned>
+    {
+        // non-static member for the same reason as in MPI_Types<unsigned long>
+        MPI_Datatype value = MPI_UNSIGNED;
+    };
+
+    /**
+     * @brief Read a file in MPI-collective manner.
+     *
+     * The file is read on rank 0 and its contents subsequently distributed
+     * to all other ranks.
+     *
+     * @param path Path for the file to read.
+     * @param comm MPI communicator.
+     * @return std::string Full file content.
+     */
+    std::string collective_file_read(std::string const& path, MPI_Comm comm);
+
+    /**
+     * @brief One regex together with the JSON configuration it selects.
+     */
+    struct Pattern
+    {
+        /** Compiled regular expression (egrep grammar). */
+        std::regex pattern;
+        /** Configuration associated with the pattern; ownership is shared. */
+        std::shared_ptr<nlohmann::json const> config;
+
+        /**
+         * @brief Compile the regex and take over the configuration.
+         *
+         * @param pattern_in Textual form of the regular expression.
+         * @param config_in Configuration associated with the pattern.
+         */
+        Pattern(std::string pattern_in, std::shared_ptr<nlohmann::json const> config_in)
+            // patterns are built once and evaluated many times, hence the request for an optimized regex
+            : pattern(std::move(pattern_in), std::regex_constants::egrep | std::regex_constants::optimize)
+            , config(std::move(config_in))
+        {
+        }
+    };
+
+    /**
+     * @brief Classification of one parsed configuration entry.
+     *
+     * Used as the return type of readPattern() to report whether the entry
+     * was a regex pattern or the default configuration.
+     */
+    enum class KindOfConfig : char
+    {
+        // entry selects datasets by regex
+        Pattern,
+        // entry is the default configuration
+        Default
+    };
+
+    /**
+     * @brief Read a single JSON pattern of the form {"select": ..., "cfg": ...}
+     *
+     * The "select" key is optional, indicating the default configuration if it
+     * is missing.
+     *
+     * @param patterns Output parameter: Emplace a parsed pattern into this list.
+     * @param defaultConfig Output parameter: If the pattern was the default pattern,
+     *                      emplace it here.
+     * @param object The JSON object that is parsed as the pattern.
+     * @return Whether the pattern was the default configuration or not.
+     */
+    KindOfConfig readPattern(
+        std::vector<Pattern>& patterns,
+        nlohmann::json& defaultConfig,
+        nlohmann::json const& object);
+
+    /**
+     * @brief Matcher for dataset configurations per backend.
+     *
+     * Holds the default configuration and the list of regex patterns that
+     * belong to a single backend.
+     */
+    class MatcherPerBackend
+    {
+    private:
+        // configuration handed out by getDefault()
+        nlohmann::json m_defaultConfig;
+        // regex patterns together with their configurations
+        std::vector<Pattern> m_patterns;
+
+        // parses the backend's configuration; defined outside of this header
+        void init(nlohmann::json const& config);
+
+    public:
+        /**
+         * @brief For default construction.
+         */
+        explicit MatcherPerBackend() = default;
+
+        /**
+         * @brief Initialize one backend's JSON matcher from its configuration.
+         *
+         * This constructor will parse the given config.
+         * It will distinguish between ordinary openPMD JSON configurations
+         * and extended configurations as defined by PIConGPU.
+         * If an ordinary JSON configuration was detected, given regex
+         * patterns will be matched against "" (the empty string).
+         *
+         * @param config The JSON configuration for one backend.
+         *               E.g. for ADIOS2, this will be the sub-object/array found under
+         *               config["adios2"]["dataset"].
+         */
+        MatcherPerBackend(nlohmann::json const& config)
+        {
+            init(config);
+        }
+
+        /**
+         * @brief Get the JSON config associated with a dataset.
+         *
+         * @param datasetPath Path of the dataset for which a configuration is
+         *                    requested. This is not itself a regex; presumably it
+         *                    is the string the stored patterns are matched against
+         *                    (to be confirmed in the implementation).
+         * @return Reference to the selected JSON configuration (not a string).
+         */
+        nlohmann::json const& get(std::string const& datasetPath) const;
+
+        /**
+         * @brief Get the default JSON config.
+         *
+         * @return Reference to the default JSON configuration stored in this
+         *         object (not a string).
+         */
+        nlohmann::json const& getDefault() const
+        {
+            return m_defaultConfig;
+        }
+    };
+} // namespace
+
+namespace picongpu
+{
+    namespace json
+    {
+        /**
+         * @brief Class to handle extended JSON configurations as used by
+         *        the openPMD plugin.
+         *
+         * This class handles parsing of the extended JSON patterns as well as
+         * selection of one JSON configuration by regex.
+         *
+         * Concrete implementation of AbstractJsonMatcher.
+         */
+        class JsonMatcher : public AbstractJsonMatcher
+        {
+        private:
+            // one dataset matcher together with the name of the backend it belongs to
+            struct PerBackend
+            {
+                std::string backendName;
+                MatcherPerBackend matcher;
+            };
+            // matchers of all backends that specify a "dataset" configuration
+            std::vector<PerBackend> m_perBackend;
+            // the complete configuration; getDefault() uses it as the starting point
+            nlohmann::json m_wholeConfig;
+            // names of the backends known to this class
+            static std::vector<std::string> const m_recognizedBackends;
+
+            // parses the configuration; defined outside of this header
+            void init(std::string const& config, MPI_Comm comm);
+
+        public:
+            /**
+             * @brief For default construction.
+             */
+            explicit JsonMatcher() = default;
+
+            /**
+             * @brief Initialize JSON matcher from command line arguments.
+             *
+             * This constructor will parse the given config, after reading it
+             * from a file if needed. In this case, the constructor is
+             * MPI-collective.
+             * It will distinguish between ordinary openPMD JSON configurations
+             * and extended configurations as defined by PIConGPU.
+             * If an ordinary JSON configuration was detected, given regex
+             * patterns will be matched against "" (the empty string).
+             *
+             * @param config The JSON configuration, exactly as in
+             *               --openPMD.json.
+             * @param comm MPI communicator for collective file reading,
+             *             if needed.
+             */
+            JsonMatcher(std::string const& config, MPI_Comm comm)
+            {
+                init(config, comm);
+            }
+
+            /**
+             * @brief Get the JSON config associated with a dataset.
+             *
+             * @param datasetPath Path of the dataset for which a configuration is
+             *                    requested; it is forwarded to every per-backend
+             *                    matcher. This is not itself a regex.
+             * @return JSON object, as a string, holding one "dataset" entry for
+             *         each backend whose matcher returned a non-empty configuration.
+             */
+            std::string get(std::string const& datasetPath) const override;
+
+            /**
+             * @brief Get the default JSON config.
+             *
+             * @return The whole configuration, as a string, in which each backend's
+             *         "dataset" entry is replaced by that backend's non-empty
+             *         default dataset configuration.
+             */
+            std::string getDefault() const override;
+        };
+
+        // Backend names known to JsonMatcher.
+        // NOTE(review): this is a non-inline definition of a static data member inside a header. Including
+        // this header from more than one translation unit of the same binary would yield duplicate
+        // definitions - confirm it is included only once, or move the definition into Json.cpp.
+        std::vector<std::string> const JsonMatcher::m_recognizedBackends = {"adios1", "adios2", "hdf5", "json"};
+    } // namespace json
+} // namespace picongpu
diff --git a/include/picongpu/plugins/openPMD/NDScalars.hpp b/include/picongpu/plugins/openPMD/NDScalars.hpp
new file mode 100644
index 0000000000..b83b910fd5
--- /dev/null
+++ b/include/picongpu/plugins/openPMD/NDScalars.hpp
@@ -0,0 +1,200 @@
+/* Copyright 2016-2021 Alexander Grund, Franz Poeschel
+ *
+ * This file is part of PIConGPU.
+ *
+ * PIConGPU is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * PIConGPU is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with PIConGPU.
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "picongpu/plugins/openPMD/openPMDWriter.def"
+#include "picongpu/plugins/openPMD/openPMDVersion.def"
+
+#include <pmacc/Environment.hpp>
+#include <pmacc/types.hpp>
+
+#include <stdexcept>
+#include <tuple>
+#include <utility>
+
+namespace picongpu
+{
+    namespace openPMD
+    {
+        /** Functor for writing N-dimensional scalar fields with N=simDim
+         *
+         * In the current implementation each process (of the ND grid of processes)
+         * writes 1 scalar value. Optionally the processes can also write an
+         * attribute for this dataset by using a non-empty attrName.
+         *
+         * @tparam T_Scalar    Type of the scalar value to write
+         * @tparam T_Attribute Type of the attribute (can be omitted if attribute is
+         * not written, defaults to uint64_t)
+         */
+        template<typename T_Scalar, typename T_Attribute = uint64_t>
+        struct WriteNDScalars
+        {
+            /** Remember the names that identify the target dataset
+             *
+             * @param baseName first part of the mesh name
+             * @param group second part of the mesh name, the mesh is called baseName + "_" + group
+             * @param dataset name of the record component within the mesh
+             * @param attrName name of the attribute to write, no attribute is written if empty
+             */
+            WriteNDScalars(
+                const std::string& baseName,
+                const std::string& group,
+                const std::string& dataset,
+                const std::string& attrName = "")
+                : baseName(baseName)
+                , group(group)
+                , dataset(dataset)
+                , attrName(attrName)
+            {
+            }
+
+        private:
+            /** Prepare the write operation:
+             *  Define openPMD dataset and write
+             * attribute (if attrName is non-empty)
+             *
+             *  Called by operator() before the chunk is stored.
+             *
+             * @param params context of the openPMD plugin
+             * @param attribute value of the attribute, only used if attrName is non-empty
+             * @return the record component to write to, followed by the offset and the extent
+             *         (one element per dimension) of the chunk written by this process
+             */
+            std::tuple<::openPMD::MeshRecordComponent, ::openPMD::Offset, ::openPMD::Extent> prepare(
+                ThreadParams& params,
+                T_Attribute attribute)
+            {
+                // only used for log messages, the mesh itself is called baseName + "_" + group
+                auto name = baseName + "/" + group + "/" + dataset;
+                const auto openPMDScalarType = ::openPMD::determineDatatype<T_Scalar>();
+                using Dimensions = pmacc::math::UInt64<simDim>;
+
+                log<picLog::INPUT_OUTPUT>("openPMD: prepare write %1%D scalars: %2%") % simDim % name;
+
+                // Size over all processes
+                Dimensions globalDomainSize = Dimensions::create(1);
+                Dimensions localDomainOffset = Dimensions::create(0);
+
+                // one element per process: the dataset has the shape of the process grid
+                for(uint32_t d = 0; d < simDim; ++d)
+                {
+                    globalDomainSize[d] = Environment<simDim>::get().GridController().getGpuNodes()[d];
+                    localDomainOffset[d] = Environment<simDim>::get().GridController().getPosition()[d];
+                }
+
+                ::openPMD::Series& series = *params.openPMDSeries;
+                ::openPMD::MeshRecordComponent mrc
+                    = series.WRITE_ITERATIONS[params.currentStep].meshes[baseName + "_" + group][dataset];
+
+                if(!attrName.empty())
+                {
+                    log<picLog::INPUT_OUTPUT>("openPMD: write attribute %1% of %2%D scalars: %3%") % attrName % simDim
+                        % name;
+
+                    mrc.setAttribute(attrName, attribute);
+                }
+
+                std::string datasetName = series.meshesPath() + baseName + "_" + group + "/" + dataset;
+                params.initDataset<simDim>(
+                    mrc,
+                    openPMDScalarType,
+                    std::move(globalDomainSize),
+                    true,
+                    params.compressionMethod,
+                    datasetName);
+
+                return std::make_tuple(
+                    std::move(mrc),
+                    static_cast<::openPMD::Offset>(asStandardVector(std::move(localDomainOffset))),
+                    static_cast<::openPMD::Extent>(asStandardVector(Dimensions::create(1))));
+            }
+
+        public:
+            /** Write the scalar value of this process and flush the series
+             *
+             * @param params context of the openPMD plugin
+             * @param value scalar value written by this process
+             * @param attribute value of the attribute, only used if attrName is non-empty
+             */
+            void operator()(ThreadParams& params, T_Scalar value, T_Attribute attribute = T_Attribute())
+            {
+                auto tuple = prepare(params, std::move(attribute));
+                auto name = baseName + "/" + group + "/" + dataset;
+                log<picLog::INPUT_OUTPUT>("openPMD: write %1%D scalars: %2%") % simDim % name;
+
+                // the value is copied into shared ownership, the chunk is handed to flush() below
+                std::get<0>(tuple).storeChunk(
+                    std::make_shared<T_Scalar>(value),
+                    std::move(std::get<1>(tuple)),
+                    std::move(std::get<2>(tuple)));
+                params.openPMDSeries->flush();
+            }
+
+        private:
+            const std::string baseName, group, dataset, attrName;
+        };
+
+        /** Functor for reading ND scalar fields with N=simDim
+         *
+         * In the current implementation each process (of the ND grid of processes)
+         * reads 1 scalar value. Optionally the processes can also read an attribute
+         * for this dataset by using a non-empty attrName.
+         *
+         * @tparam T_Scalar    Type of the scalar value to read
+         * @tparam T_Attribute Type of the attribute (can be omitted if attribute is
+         * not read, defaults to uint64_t)
+         */
+        template<typename T_Scalar, typename T_Attribute = uint64_t>
+        struct ReadNDScalars
+        {
+            /** Read the scalar field and optionally the attribute into the values
+             * referenced by the pointers
+             *
+             * @param params context of the openPMD plugin
+             * @param baseName first part of the mesh name
+             * @param group second part of the mesh name, the mesh is called baseName + "_" + group
+             * @param dataset name of the record component within the mesh
+             * @param value destination of the scalar value
+             * @param attrName name of the attribute to read, no attribute is read if empty
+             * @param attribute destination of the attribute, must not be null if attrName is non-empty
+             *
+             * @throws std::runtime_error if an attribute is requested without a destination or
+             *         if the dimensionality of the dataset differs from simDim
+             */
+            void operator()(
+                ThreadParams& params,
+                const std::string& baseName,
+                const std::string& group,
+                const std::string& dataset,
+                T_Scalar* value,
+                const std::string& attrName = "",
+                T_Attribute* attribute = nullptr)
+            {
+                // reject the inconsistent request up front instead of dereferencing a null pointer later
+                if(!attrName.empty() && attribute == nullptr)
+                {
+                    throw std::runtime_error(
+                        std::string("Attribute '") + attrName + "' requested without a destination pointer");
+                }
+
+                // only used for log and error messages
+                auto name = baseName + "/" + group + "/" + dataset;
+                log<picLog::INPUT_OUTPUT>("openPMD: read %1%D scalars: %2%") % simDim % name;
+
+                ::openPMD::Series& series = *params.openPMDSeries;
+                ::openPMD::MeshRecordComponent mrc
+                    = series.iterations[params.currentStep].meshes[baseName + "_" + group][dataset];
+                auto ndim = mrc.getDimensionality();
+                if(ndim != simDim)
+                {
+                    throw std::runtime_error(std::string("Invalid dimensionality for ") + name);
+                }
+
+                DataSpace<simDim> gridPos = Environment<simDim>::get().GridController().getPosition();
+                // the reverted position does not change inside the loop, so compute it once
+                auto const revertedGridPos = gridPos.revert();
+                ::openPMD::Offset start;
+                ::openPMD::Extent count;
+                start.reserve(ndim);
+                count.reserve(ndim);
+                for(int d = 0; d < ndim; ++d)
+                {
+                    start.push_back(revertedGridPos[d]);
+                    count.push_back(1);
+                }
+
+                __getTransactionEvent().waitForFinished();
+
+                log<picLog::INPUT_OUTPUT>("openPMD: Schedule read scalar %1%)") % name;
+
+                std::shared_ptr<T_Scalar> readValue = mrc.loadChunk<T_Scalar>(start, count);
+
+                // the loaded chunk is read only after the flush
+                series.flush();
+
+                *value = *readValue;
+
+                if(!attrName.empty())
+                {
+                    log<picLog::INPUT_OUTPUT>("openPMD: read attribute %1% for scalars: %2%") % attrName % name;
+                    *attribute = mrc.getAttribute(attrName).get<T_Attribute>();
+                }
+            }
+        };
+
+    } // namespace openPMD
+} // namespace picongpu
diff --git a/include/picongpu/plugins/openPMD/WriteMeta.hpp b/include/picongpu/plugins/openPMD/WriteMeta.hpp
new file mode 100644
index 0000000000..15cf42aad0
--- /dev/null
+++ b/include/picongpu/plugins/openPMD/WriteMeta.hpp
@@ -0,0 +1,239 @@
+/* Copyright 2013-2021 Axel Huebl, Franz Poeschel
+ *
+ * This file is part of PIConGPU.
+ *
+ * PIConGPU is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * PIConGPU is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with PIConGPU.
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+#include "picongpu/fields/absorber/ExponentialDamping.hpp"
+#include "picongpu/fields/currentInterpolation/CurrentInterpolation.hpp"
+#include "picongpu/plugins/common/stringHelpers.hpp"
+#include "picongpu/plugins/openPMD/openPMDWriter.def"
+#include "picongpu/plugins/openPMD/openPMDVersion.def"
+#include "picongpu/simulation_defines.hpp"
+#include "picongpu/traits/SIBaseUnits.hpp"
+
+#include <pmacc/Environment.hpp>
+
+#include <openPMD/openPMD.hpp>
+
+#include <list>
+#include <sstream>
+#include <string>
+
+
+namespace picongpu
+{
+    namespace openPMD
+    {
+        using namespace pmacc;
+
+        namespace writeMeta
+        {
+            /** write openPMD species meta data
+             *
+             * The boundary description is taken from the first species in VectorAllSpecies only
+             * and written as attributes of the current iteration.
+             *
+             * @tparam numSpecies count of defined species
+             */
+            template<uint32_t numSpecies = bmpl::size<VectorAllSpecies>::type::value>
+            struct OfAllSpecies
+            {
+                /** write meta data for species
+                 *
+                 * @param threadParams context of the openPMD plugin
+                 */
+                void operator()(ThreadParams* threadParams) const
+                {
+                    /*
+                     * @todo set boundary per species
+                     */
+                    GetStringProperties<bmpl::at_c<VectorAllSpecies, 0>::type> particleBoundaryProp;
+                    std::vector<std::string> listParticleBoundary;
+                    std::vector<std::string> listParticleBoundaryParam;
+                    auto n = NumberOfExchanges<simDim>::value;
+                    listParticleBoundary.reserve(n - 1);
+                    listParticleBoundaryParam.reserve(n - 1);
+                    // walk the exchange types from the highest index down to 1
+                    for(uint32_t i = n - 1; i > 0; --i)
+                    {
+                        // NOTE(review): presumably this keeps only the axis-aligned exchange directions - confirm
+                        if(FRONT % i == 0)
+                        {
+                            listParticleBoundary.push_back(particleBoundaryProp[ExchangeTypeNames()[i]]["name"].value);
+                            listParticleBoundaryParam.push_back(
+                                particleBoundaryProp[ExchangeTypeNames()[i]]["param"].value);
+                        }
+                    }
+
+                    ::openPMD::Iteration iteration
+                        = threadParams->openPMDSeries->WRITE_ITERATIONS[threadParams->currentStep];
+                    iteration.setAttribute("particleBoundary", listParticleBoundary);
+                    iteration.setAttribute("particleBoundaryParameters", listParticleBoundaryParam);
+                }
+            };
+
+            /** specialization if no species are defined
+             *
+             * Without species there is no particle boundary to describe, the functor does nothing.
+             */
+            template<>
+            struct OfAllSpecies<0>
+            {
+                /** write meta data for species (no-op)
+                 *
+                 * The second parameter is optional so that this specialization accepts the same
+                 * single-argument call as the primary template.
+                 *
+                 * @param threadParams context of the openPMD plugin (unused)
+                 * @param fullMeshesPath path to mesh entry (unused, kept for backward compatibility)
+                 */
+                void operator()(
+                    ThreadParams* /* threadParams */,
+                    const std::string& /* fullMeshesPath */ = std::string()) const
+                {
+                }
+            };
+
+        } // namespace writeMeta
+
+        /** Functor writing the optional openPMD meta data of the series, the current
+         * iteration and its meshes container.
+         */
+        struct WriteMeta
+        {
+            /** write the meta attributes for the current step
+             *
+             * @param threadParams context of the openPMD plugin, provides the series and the current step
+             */
+            void operator()(ThreadParams* threadParams)
+            {
+                log<picLog::INPUT_OUTPUT>("openPMD: (begin) write meta attributes.");
+
+                ::openPMD::Series& series = *threadParams->openPMDSeries;
+
+                /*
+                 * The openPMD API will kindly write the obligatory metadata by
+                 * itself, so we don't need to do this manually. We give the
+                 * optional metadata:
+                 */
+
+                /*   recommended */
+                const std::string author = Environment<>::get().SimulationDescription().getAuthor();
+                // the author is only set if one was given
+                if(author.length() > 0)
+                {
+                    series.setAuthor(author);
+                }
+
+                const std::string software("PIConGPU");
+
+                // version string: MAJOR.MINOR.PATCH, followed by "-LABEL" if the label is non-empty
+                std::stringstream softwareVersion;
+                softwareVersion << PICONGPU_VERSION_MAJOR << "." << PICONGPU_VERSION_MINOR << "."
+                                << PICONGPU_VERSION_PATCH;
+                if(!std::string(PICONGPU_VERSION_LABEL).empty())
+                    softwareVersion << "-" << PICONGPU_VERSION_LABEL;
+                series.setSoftware(software, softwareVersion.str());
+
+                const std::string date = helper::getDateString("%F %T %z");
+                series.setDate(date);
+
+                ::openPMD::Iteration iteration = series.WRITE_ITERATIONS[threadParams->currentStep];
+                ::openPMD::Container<::openPMD::Mesh>& meshes = iteration.meshes;
+
+                // iteration-level attributes
+                iteration.setDt<float_X>(DELTA_T);
+                iteration.setTime(float_X(threadParams->currentStep) * DELTA_T);
+                iteration.setTimeUnitSI(UNIT_TIME);
+
+                GetStringProperties<fields::Solver> fieldSolverProps;
+                const std::string fieldSolver(fieldSolverProps["name"].value);
+                meshes.setAttribute("fieldSolver", fieldSolver);
+
+                // the parameter attribute is only written if the solver provides a "param" property
+                if(fieldSolverProps.find("param") != fieldSolverProps.end())
+                {
+                    const std::string fieldSolverParam(fieldSolverProps["param"].value);
+                    meshes.setAttribute("fieldSolverParameters", fieldSolverParam);
+                }
+
+                /* order as in axisLabels:
+                 *    3D: z-lower, z-upper, y-lower, y-upper, x-lower, x-upper
+                 *    2D: y-lower, y-upper, x-lower, x-upper
+                 */
+                GetStringProperties<fields::absorber::Absorber> fieldBoundaryProp;
+                std::vector<std::string> listFieldBoundary;
+                std::vector<std::string> listFieldBoundaryParam;
+                auto n = NumberOfExchanges<simDim>::value;
+                listFieldBoundary.reserve(n - 1);
+                listFieldBoundaryParam.reserve(n - 1);
+                // walk the exchange types from the highest index down to 1
+                for(uint32_t i = n - 1; i > 0; --i)
+                {
+                    // NOTE(review): presumably this keeps only the axis-aligned exchange directions - confirm
+                    if(FRONT % i == 0)
+                    {
+                        listFieldBoundary.push_back(fieldBoundaryProp[ExchangeTypeNames()[i]]["name"].value);
+                        listFieldBoundaryParam.push_back(fieldBoundaryProp[ExchangeTypeNames()[i]]["param"].value);
+                    }
+                }
+
+                meshes.setAttribute("fieldBoundary", listFieldBoundary);
+                meshes.setAttribute("fieldBoundaryParameters", listFieldBoundaryParam);
+
+                // species meta data, the functor is selected by the number of defined species
+                writeMeta::OfAllSpecies<>()(threadParams);
+
+                GetStringProperties<currentInterpolation::CurrentInterpolationInfo> currentSmoothingProp;
+                const std::string currentSmoothing(currentSmoothingProp["name"].value);
+                meshes.setAttribute("currentSmoothing", currentSmoothing);
+
+                // the parameter attribute is only written if a "param" property is provided
+                if(currentSmoothingProp.find("param") != currentSmoothingProp.end())
+                {
+                    const std::string currentSmoothingParam(currentSmoothingProp["param"].value);
+                    meshes.setAttribute("currentSmoothingParameters", currentSmoothingParam);
+                }
+
+                // the charge correction is always reported as "none"
+                const std::string chargeCorrection("none");
+                meshes.setAttribute("chargeCorrection", chargeCorrection);
+
+                /* write current iteration */
+                log<picLog::INPUT_OUTPUT>("openPMD: meta: iteration");
+                iteration.setAttribute(
+                    "iteration",
+                    threadParams->currentStep); // openPMD API will not write this
+                                                // automatically
+
+                /* write number of slides */
+                log<picLog::INPUT_OUTPUT>("openPMD: meta: sim_slides");
+                uint32_t slides = MovingWindow::getInstance().getSlideCounter(threadParams->currentStep);
+                iteration.setAttribute("sim_slides", slides);
+
+                /*
+                 * Required time attributes are written automatically by openPMD API
+                 */
+
+
+                /* write normed grid parameters */
+                log<picLog::INPUT_OUTPUT>("openPMD: meta: grid");
+                // all three names are written, independent of simDim
+                std::string names[3] = {"cell_width", "cell_height", "cell_depth"};
+                for(unsigned i = 0; i < 3; ++i)
+                {
+                    iteration.setAttribute(names[i], cellSize[i]);
+                }
+
+
+                /* write base units */
+                log<picLog::INPUT_OUTPUT>("openPMD: meta: units");
+                iteration.setAttribute<double>("unit_energy", UNIT_ENERGY);
+                iteration.setAttribute<double>("unit_length", UNIT_LENGTH);
+                iteration.setAttribute<double>("unit_speed", UNIT_SPEED);
+                iteration.setAttribute<double>("unit_time", UNIT_TIME);
+                iteration.setAttribute<double>("unit_mass", UNIT_MASS);
+                iteration.setAttribute<double>("unit_charge", UNIT_CHARGE);
+                iteration.setAttribute<double>("unit_efield", UNIT_EFIELD);
+                iteration.setAttribute<double>("unit_bfield", UNIT_BFIELD);
+
+
+                /* write physical constants */
+                iteration.setAttribute("mue0", MUE0);
+                iteration.setAttribute("eps0", EPS0);
+
+                log<picLog::INPUT_OUTPUT>("openPMD: ( end ) wite meta attributes.");
+            }
+        };
+    } // namespace openPMD
+} // namespace picongpu
diff --git a/include/picongpu/plugins/openPMD/WriteSpecies.hpp b/include/picongpu/plugins/openPMD/WriteSpecies.hpp
new file mode 100644
index 0000000000..0cde47aab4
--- /dev/null
+++ b/include/picongpu/plugins/openPMD/WriteSpecies.hpp
@@ -0,0 +1,520 @@
+/* Copyright 2014-2021 Rene Widera, Felix Schmitt, Axel Huebl,
+ *                     Alexander Grund, Franz Poeschel
+ *
+ * This file is part of PIConGPU.
+ *
+ * PIConGPU is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * PIConGPU is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with PIConGPU.
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "picongpu/simulation_defines.hpp"
+#include "picongpu/particles/traits/GetSpeciesFlagName.hpp"
+#include "picongpu/plugins/ISimulationPlugin.hpp"
+#include "picongpu/plugins/kernel/CopySpecies.kernel"
+#include "picongpu/plugins/openPMD/openPMDWriter.def"
+#include "picongpu/plugins/openPMD/openPMDVersion.def"
+#include "picongpu/plugins/openPMD/writer/ParticleAttribute.hpp"
+#include "picongpu/plugins/output/WriteSpeciesCommon.hpp"
+#include "picongpu/plugins/output/ConstSpeciesAttributes.hpp"
+#include "picongpu/plugins/openPMD/openPMDDimension.hpp"
+
+#include <pmacc/assert.hpp>
+#include <pmacc/dataManagement/DataConnector.hpp>
+#include <pmacc/eventSystem/events/kernelEvents.hpp>
+#include <pmacc/mappings/kernel/AreaMapping.hpp>
+#include <pmacc/meta/conversion/MakeSeq.hpp>
+#include <pmacc/meta/conversion/RemoveFromSeq.hpp>
+#include <pmacc/particles/ParticleDescription.hpp>
+#include <pmacc/particles/operations/ConcatListOfFrames.hpp>
+#include <pmacc/particles/particleFilter/FilterFactory.hpp>
+#include <pmacc/particles/particleFilter/PositionFilter.hpp>
+#include <pmacc/particles/memory/buffers/MallocMCBuffer.hpp>
+
+
+#include <boost/mpl/at.hpp>
+#include <boost/mpl/begin_end.hpp>
+#include <boost/mpl/find.hpp>
+#include <boost/mpl/pair.hpp>
+#include <boost/mpl/size.hpp>
+#include <boost/mpl/vector.hpp>
+#include <boost/type_traits.hpp>
+#include <boost/type_traits/is_same.hpp>
+
+
+namespace picongpu
+{
+    namespace openPMD
+    {
+        using namespace pmacc;
+
+        /** Parameter bundle handed to Strategy::prepare()
+         *
+         * Only references to the passed objects are stored, so all of them must outlive this bundle.
+         * The two particle counts are stored by value.
+         *
+         * @tparam SpeciesTmp type of the species handle
+         * @tparam Filter type of the position filter
+         * @tparam ParticleFilter type of the user-selected particle filter
+         * @tparam ParticleOffset type of the particle offset
+         */
+        template<typename SpeciesTmp, typename Filter, typename ParticleFilter, typename ParticleOffset>
+        struct StrategyRunParameters
+        {
+            pmacc::DataConnector& dc;
+            ThreadParams& params;
+            SpeciesTmp& speciesTmp;
+            Filter& filter;
+            ParticleFilter& particleFilter;
+            ParticleOffset& particleOffset;
+            /* number of particles of the local rank / of all ranks together */
+            uint64_t myNumParticles, globalNumParticles;
+
+            /** Store references to the given objects and copy both particle counts
+             *
+             * @param c_dc data connector used to look up and release buffers
+             * @param c_params state of the openPMD plugin
+             * @param c_speciesTmp handle of the species to copy
+             * @param c_filter position filter
+             * @param c_particleFilter user-selected particle filter
+             * @param c_particleOffset offset that is handed to the copy routine
+             * @param c_myNumParticles number of particles of the local rank
+             * @param c_globalNumParticles number of particles of all ranks together
+             */
+            StrategyRunParameters(
+                pmacc::DataConnector& c_dc,
+                ThreadParams& c_params,
+                SpeciesTmp& c_speciesTmp,
+                Filter& c_filter,
+                ParticleFilter& c_particleFilter,
+                ParticleOffset& c_particleOffset,
+                uint64_t c_myNumParticles,
+                uint64_t c_globalNumParticles)
+                : dc(c_dc)
+                , params(c_params)
+                , speciesTmp(c_speciesTmp)
+                , filter(c_filter)
+                , particleFilter(c_particleFilter)
+                , particleOffset(c_particleOffset)
+                // the local count must come from its own argument, not from the global count
+                , myNumParticles(c_myNumParticles)
+                , globalNumParticles(c_globalNumParticles)
+            {
+            }
+        };
+
+        /** Interface of a strategy that stages particle data before it is written
+         *
+         * An implementation allocates the staging buffers of a frame (`malloc`), fills them with
+         * particle data (`prepare`) and releases them again (`free`).
+         *
+         * @tparam openPMDFrameType frame type that owns the staging buffers
+         * @tparam RunParameters parameter bundle that is passed by value to `prepare`
+         */
+        template<typename openPMDFrameType, typename RunParameters>
+        struct Strategy
+        {
+            /** virtual destructor: strategies are destroyed through a pointer to this interface */
+            virtual ~Strategy() = default;
+
+            /** allocate buffers for `myNumParticles` particles; `name` is used for log output */
+            virtual void malloc(std::string name, openPMDFrameType& hostFrame, uint64_cu const myNumParticles) = 0;
+
+            /** release the buffers of `hostFrame` */
+            virtual void free(openPMDFrameType& hostFrame) = 0;
+
+            /** fill the buffers of `hostFrame` with particle data; `name` is used for log output */
+            virtual void prepare(std::string name, openPMDFrameType& hostFrame, RunParameters runParameters) = 0;
+        };
+
+        /*
+         * Use double buffer.
+         *
+         * Strategy that stages the particles in host memory: the buffers are allocated with
+         * MallocHostMemory, released with FreeHostMemory and filled by
+         * pmacc::particles::operations::ConcatListOfFrames.
+         */
+        template<typename openPMDFrameType, typename RunParameters>
+        struct StrategyADIOS : Strategy<openPMDFrameType, RunParameters>
+        {
+            /** Allocate host memory for every attribute in openPMDFrameType::ValueTypeSeq
+             *
+             * @param name name used in the log messages
+             * @param hostFrame frame whose buffers are allocated
+             * @param myNumParticles particle count handed to the allocation functor
+             */
+            void malloc(std::string name, openPMDFrameType& hostFrame, uint64_cu const myNumParticles) override
+            {
+                /* malloc host memory */
+                log<picLog::INPUT_OUTPUT>("openPMD:   (begin) malloc host memory: %1%") % name;
+                meta::ForEach<typename openPMDFrameType::ValueTypeSeq, MallocHostMemory<bmpl::_1>> mallocMem;
+                mallocMem(hostFrame, myNumParticles);
+                log<picLog::INPUT_OUTPUT>("openPMD:   ( end ) malloc host memory: %1%") % name;
+            }
+
+            /** Release the host memory of every attribute in openPMDFrameType::ValueTypeSeq
+             *
+             * @param hostFrame frame whose buffers are released
+             */
+            void free(openPMDFrameType& hostFrame) override
+            {
+                meta::ForEach<typename openPMDFrameType::ValueTypeSeq, FreeHostMemory<bmpl::_1>> freeMem;
+                freeMem(hostFrame);
+            }
+
+
+            /** Copy the particles from the frame hierarchy into the flat host frame
+             *
+             * @param name name used in the log messages
+             * @param hostFrame destination frame, must have been prepared with malloc()
+             * @param rp references to species, filters, offsets and the expected particle count
+             */
+            void prepare(std::string name, openPMDFrameType& hostFrame, RunParameters rp) override
+            {
+                log<picLog::INPUT_OUTPUT>("openPMD:   (begin) copy particle host (with hierarchy) to "
+                                          "host (without hierarchy): %1%")
+                    % name;
+                /* fetch the MallocMCBuffer from the data connector, it is released again below */
+                auto mallocMCBuffer
+                    = rp.dc.template get<MallocMCBuffer<DeviceHeap>>(MallocMCBuffer<DeviceHeap>::getName(), true);
+
+                /* counter handed to concatListOfFrames and checked against the expected count afterwards */
+                int globalParticleOffset = 0;
+                AreaMapping<CORE + BORDER, MappingDesc> mapper(*(rp.params.cellDescription));
+
+                pmacc::particles::operations::ConcatListOfFrames<simDim> concatListOfFrames(mapper.getGridDim());
+
+#if(PMACC_CUDA_ENABLED == 1 || ALPAKA_ACC_GPU_HIP_ENABLED == 1)
+                /* CUDA/HIP: use the host particles box, shifted by the offset of the MallocMCBuffer */
+                auto particlesBox = rp.speciesTmp->getHostParticlesBox(mallocMCBuffer->getOffset());
+#else
+                /* This separate code path is only a workaround until
+                 * MallocMCBuffer is alpaka compatible.
+                 *
+                 * @todo remove this workaround: we know that we are allowed to
+                 * access the device memory directly.
+                 */
+                auto particlesBox = rp.speciesTmp->getDeviceParticlesBox();
+                /* Notify to the event system that the particles box is used on
+                 * the host.
+                 *
+                 * @todo remove this workaround
+                 */
+                __startOperation(ITask::TASK_HOST);
+
+#endif
+                concatListOfFrames(
+                    globalParticleOffset,
+                    hostFrame,
+                    particlesBox,
+                    rp.filter,
+                    rp.particleOffset, /*relative to data domain (not to physical
+                                       domain)*/
+                    totalCellIdx_,
+                    mapper,
+                    rp.particleFilter);
+
+                rp.dc.releaseData(MallocMCBuffer<DeviceHeap>::getName());
+
+                /* this costs a little bit of time but writing to external is
+                 * slower in general
+                 *
+                 * NOTE(review): the counter presumably holds the number of particles copied on this
+                 * rank, yet it is compared against rp.globalNumParticles; confirm whether
+                 * rp.myNumParticles is intended here (StrategyHDF5 checks against rp.myNumParticles).
+                 */
+                PMACC_ASSERT((uint64_cu) globalParticleOffset == rp.globalNumParticles);
+            }
+        };
+
+        /*
+         * Use mapped memory.
+         *
+         * Strategy that stages the particles in memory allocated with MallocMemory: the
+         * CopySpecies kernel writes through the device pointers obtained with GetDevicePtr.
+         */
+        template<typename openPMDFrameType, typename RunParameters>
+        struct StrategyHDF5 : Strategy<openPMDFrameType, RunParameters>
+        {
+            /** Allocate mapped memory for every attribute in openPMDFrameType::ValueTypeSeq
+             *
+             * @param name name used in the log messages
+             * @param hostFrame frame whose buffers are allocated
+             * @param myNumParticles particle count handed to the allocation functor
+             */
+            void malloc(std::string name, openPMDFrameType& hostFrame, uint64_cu const myNumParticles) override
+            {
+                log<picLog::INPUT_OUTPUT>("openPMD:  (begin) malloc mapped memory: %1%") % name;
+                /*malloc mapped memory*/
+                meta::ForEach<typename openPMDFrameType::ValueTypeSeq, MallocMemory<bmpl::_1>> mallocMem;
+                mallocMem(hostFrame, myNumParticles);
+                log<picLog::INPUT_OUTPUT>("openPMD:  ( end ) malloc mapped memory: %1%") % name;
+            }
+
+            /** Release the memory of every attribute in openPMDFrameType::ValueTypeSeq
+             *
+             * @param hostFrame frame whose buffers are released
+             */
+            void free(openPMDFrameType& hostFrame) override
+            {
+                meta::ForEach<typename openPMDFrameType::ValueTypeSeq, FreeMemory<bmpl::_1>> freeMem;
+                freeMem(hostFrame);
+            }
+
+            /** Run the CopySpecies kernel to copy the particles into the frame
+             *
+             * Blocks until the transaction event has finished.
+             *
+             * @param name name used in the log messages
+             * @param hostFrame destination frame, must have been prepared with malloc()
+             * @param rp references to species, filters, offsets and the expected particle count
+             */
+            void prepare(std::string name, openPMDFrameType& hostFrame, RunParameters rp) override
+            {
+                log<picLog::INPUT_OUTPUT>("openPMD:  (begin) copy particle to host: %1%") % name;
+
+                log<picLog::INPUT_OUTPUT>("openPMD:  (begin) get mapped memory device pointer: %1%") % name;
+                /*load device pointer of mapped memory*/
+                openPMDFrameType deviceFrame;
+                meta::ForEach<typename openPMDFrameType::ValueTypeSeq, GetDevicePtr<bmpl::_1>> getDevicePtr;
+                getDevicePtr(deviceFrame, hostFrame);
+                log<picLog::INPUT_OUTPUT>("openPMD:  ( end ) get mapped memory device pointer: %1%") % name;
+
+                /* single-element counter, read back after the kernel for the sanity check below */
+                GridBuffer<int, DIM1> counterBuffer(DataSpace<DIM1>(1));
+                AreaMapping<CORE + BORDER, MappingDesc> mapper(*(rp.params.cellDescription));
+
+                constexpr uint32_t numWorkers
+                    = pmacc::traits::GetNumWorkers<pmacc::math::CT::volume<SuperCellSize>::type::value>::value;
+
+                /* launch the copy kernel on the grid of the CORE + BORDER mapping */
+                PMACC_KERNEL(CopySpecies<numWorkers>{})
+                (mapper.getGridDim(), numWorkers)(
+                    counterBuffer.getDeviceBuffer().getPointer(),
+                    deviceFrame,
+                    rp.speciesTmp->getDeviceParticlesBox(),
+                    rp.filter,
+                    rp.particleOffset,
+                    totalCellIdx_,
+                    mapper,
+                    rp.particleFilter);
+                counterBuffer.deviceToHost();
+                log<picLog::INPUT_OUTPUT>("openPMD:  ( end ) copy particle to host: %1%") % name;
+                /* wait until the kernel and the counter transfer are done before the data is used */
+                __getTransactionEvent().waitForFinished();
+                log<picLog::INPUT_OUTPUT>("openPMD:  all events are finished: %1%") % name;
+
+                /* this sanity check costs a little bit of time but hdf5 writing is
+                 * slower */
+                PMACC_ASSERT((uint64_t) counterBuffer.getHostBuffer().getDataBox()[0] == rp.myNumParticles);
+            }
+        };
+
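+        /** Copy the particles of a species to host memory and dump them to an openPMD file
+         *
+         * @tparam T_SpeciesFilter species/filter combination: provides the species type (`Species`),
+         *                         the particle filter (`Filter`) and the name used in log messages
+         * @tparam T_Species type whose `getName()` names the particle group in the openPMD file
+         */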
+        template<typename T_SpeciesFilter, typename T_Species = T_SpeciesFilter>
+        struct WriteSpecies
+        {
+        public:
+            using ThisSpecies = typename T_SpeciesFilter::Species;
+            using FrameType = typename ThisSpecies::FrameType;
+            using ParticleDescription = typename FrameType::ParticleDescription;
+            using ParticleAttributeList = typename FrameType::ValueTypeSeq;
+
+            /* delete multiMask and localCellIdx in openPMD particle*/
+            using TypesToDelete = bmpl::vector<multiMask, localCellIdx>;
+            using ParticleCleanedAttributeList = typename RemoveFromSeq<ParticleAttributeList, TypesToDelete>::type;
+
+            /* add totalCellIdx for openPMD particle*/
+            using ParticleNewAttributeList = typename MakeSeq<ParticleCleanedAttributeList, totalCellIdx>::type;
+
+            using NewParticleDescription =
+                typename ReplaceValueTypeSeq<ParticleDescription, ParticleNewAttributeList>::type;
+
+            using openPMDFrameType = Frame<OperatorCreateVectorBox, NewParticleDescription>;
+
+            /** Write the species-wide attributes and the constant mass/charge records
+             *
+             * Sets the attributes particleShape, currentDeposition, particlePush,
+             * particleInterpolation and particleSmoothing. A constant scalar record "mass" is created
+             * if the frame type provides a mass ratio. A constant scalar record "charge" is created if
+             * the frame type provides a charge ratio and has no boundElectrons attribute.
+             *
+             * @param record openPMD particle species that receives the attributes and records
+             * @param matcher source of the dataset-specific JSON options
+             * @param basename path prefix used to look up the options in the matcher
+             */
+            void setParticleAttributes(
+                ::openPMD::ParticleSpecies& record,
+                AbstractJsonMatcher& matcher,
+                std::string const& basename)
+            {
+                float_64 const shapeOrder(GetShape<ThisSpecies>::type::assignmentFunctionOrder);
+                record.setAttribute("particleShape", shapeOrder);
+
+                // names of the algorithms selected through the species flags
+                traits::GetSpeciesFlagName<ThisSpecies, current<>> depositionFlag;
+                std::string const depositionName(depositionFlag());
+                record.setAttribute("currentDeposition", depositionName.c_str());
+
+                traits::GetSpeciesFlagName<ThisSpecies, particlePusher<>> pusherFlag;
+                std::string const pusherName(pusherFlag());
+                record.setAttribute("particlePush", pusherName.c_str());
+
+                traits::GetSpeciesFlagName<ThisSpecies, interpolation<>> interpolationFlag;
+                std::string const interpolationName(interpolationFlag());
+                record.setAttribute("particleInterpolation", interpolationName.c_str());
+
+                std::string const smoothingName("none");
+                record.setAttribute("particleSmoothing", smoothingName.c_str());
+
+                // dataset description shared by both constant records: type double, extent {0}
+                ::openPMD::Dataset constantDataset(::openPMD::Datatype::DOUBLE, ::openPMD::Extent{0});
+
+                // constant record: mass
+                plugins::output::GetMassOrZero<FrameType> const massGetter;
+                if(massGetter.hasMassRatio)
+                {
+                    float_64 const massValue(massGetter());
+                    auto& massRecord = record["mass"];
+                    auto& massScalar = massRecord[::openPMD::RecordComponent::SCALAR];
+
+                    setDatasetOptions(constantDataset, matcher.get(basename + "/mass"));
+                    massScalar.resetDataset(constantDataset);
+                    massScalar.makeConstant(massValue);
+
+                    massRecord.setUnitDimension(convertToUnitDimension(massGetter.dimension()));
+                    massScalar.setUnitSI(::picongpu::UNIT_MASS);
+                    massRecord.setAttribute("macroWeighted", int32_t(false));
+                    massRecord.setAttribute("weightingPower", float_64(1.0));
+                    massRecord.setAttribute("timeOffset", float_64(0.0));
+                }
+
+                // constant record: charge, skipped for species that carry boundElectrons
+                constexpr bool hasBoundElectrons
+                    = pmacc::traits::HasIdentifier<FrameType, boundElectrons>::type::value;
+                plugins::output::GetChargeOrZero<FrameType> const chargeGetter;
+                if(!hasBoundElectrons && chargeGetter.hasChargeRatio)
+                {
+                    float_64 const chargeValue(chargeGetter());
+                    auto& chargeRecord = record["charge"];
+                    auto& chargeScalar = chargeRecord[::openPMD::RecordComponent::SCALAR];
+
+                    setDatasetOptions(constantDataset, matcher.get(basename + "/charge"));
+                    chargeScalar.resetDataset(constantDataset);
+                    chargeScalar.makeConstant(chargeValue);
+
+                    chargeRecord.setUnitDimension(convertToUnitDimension(chargeGetter.dimension()));
+                    chargeScalar.setUnitSI(::picongpu::UNIT_CHARGE);
+                    chargeRecord.setAttribute("macroWeighted", int32_t(false));
+                    chargeRecord.setAttribute("weightingPower", float_64(1.0));
+                    chargeRecord.setAttribute("timeOffset", float_64(0.0));
+                }
+            }
+
+            /** Dump the particles selected by T_SpeciesFilter to the openPMD series
+             *
+             * Steps:
+             *  - count the particles of the local rank and exchange the counts between all ranks
+             *  - stage the particle data with the strategy selected by `params->strategy`
+             *  - pass every attribute of the staged frame to openPMD::ParticleAttribute
+             *  - write the particle patches and the species attributes and flush the series
+             *
+             * The method calls MPI_Allgather on the communicator of the grid controller and therefore
+             * has to be called by all ranks of that communicator.
+             *
+             * @tparam Space type with operator[] that returns an integer type
+             * @param params state of the openPMD plugin (series, current step, window, strategy, ...)
+             * @param particleOffset offset that is handed to the copy strategy and stored as patch offset
+             * @throws std::runtime_error if `params->strategy` does not name a known strategy
+             */
+            template<typename Space> // has operator[] -> integer type
+            HINLINE void operator()(ThreadParams* params, const Space particleOffset)
+            {
+                log<picLog::INPUT_OUTPUT>("openPMD: (begin) write species: %1%") % T_SpeciesFilter::getName();
+                DataConnector& dc = Environment<>::get().DataConnector();
+                GridController<simDim>& gc = Environment<simDim>::get().GridController();
+                uint64_t mpiSize = gc.getGlobalSize();
+                uint64_t mpiRank = gc.getGlobalRank();
+                /* load particle without copy particle data to host */
+                auto speciesTmp = dc.get<ThisSpecies>(ThisSpecies::FrameType::getName(), true);
+                const std::string speciesGroup(T_Species::getName());
+
+                ::openPMD::Series& series = *params->openPMDSeries;
+                ::openPMD::Iteration iteration = series.WRITE_ITERATIONS[params->currentStep];
+                const std::string basename = series.particlesPath() + speciesGroup;
+
+                // enforce that the filter interface is fulfilled
+                particles::filter::IUnary<typename T_SpeciesFilter::Filter> particleFilter{params->currentStep};
+                using usedFilters = bmpl::vector<typename GetPositionFilter<simDim>::type>;
+                using MyParticleFilter = typename FilterFactory<usedFilters>::FilterType;
+                MyParticleFilter filter;
+                /* activate filter pipeline if moving window is activated */
+                filter.setStatus(MovingWindow::getInstance().isSlidingWindowActive(params->currentStep));
+                filter.setWindowPosition(params->localWindowToDomainOffset, params->window.localDimensions.size);
+
+                using RunParameters_T = StrategyRunParameters<
+                    decltype(speciesTmp),
+                    decltype(filter),
+                    decltype(particleFilter),
+                    const Space>;
+
+                using AStrategy = Strategy<openPMDFrameType, RunParameters_T>;
+                std::unique_ptr<AStrategy> strategy;
+
+                // the derived-to-base pointer conversion is implicit, no cast is required
+                switch(params->strategy)
+                {
+                case WriteSpeciesStrategy::ADIOS:
+                    strategy.reset(new StrategyADIOS<openPMDFrameType, RunParameters_T>);
+                    break;
+                case WriteSpeciesStrategy::HDF5:
+                    strategy.reset(new StrategyHDF5<openPMDFrameType, RunParameters_T>);
+                    break;
+                }
+                // an enumerator without a case label above must not end in a null pointer access
+                if(!strategy)
+                {
+                    throw std::runtime_error("[openPMD plugin] Unknown strategy for writing particle species.");
+                }
+
+
+                /* count total number of particles on the device */
+                log<picLog::INPUT_OUTPUT>("openPMD:   (begin) count particles: %1%") % T_SpeciesFilter::getName();
+                uint64_cu const myNumParticles = pmacc::CountParticles::countOnDevice<CORE + BORDER>(
+                    *speciesTmp,
+                    *(params->cellDescription),
+                    params->localWindowToDomainOffset,
+                    params->window.localDimensions.size,
+                    particleFilter);
+                // one entry per rank; a std::vector replaces the non-standard variable-length array
+                std::vector<uint64_t> allNumParticles(mpiSize);
+                uint64_t globalNumParticles = 0;
+                uint64_t myParticleOffset = 0;
+
+                // avoid deadlock between not finished pmacc tasks and mpi blocking
+                // collectives
+                __getTransactionEvent().waitForFinished();
+                MPI_CHECK(MPI_Allgather(
+                    &myNumParticles,
+                    1,
+                    MPI_UNSIGNED_LONG_LONG,
+                    allNumParticles.data(),
+                    1,
+                    MPI_UNSIGNED_LONG_LONG,
+                    gc.getCommunicator().getMPIComm()));
+
+                // total over all ranks and the exclusive prefix sum for the local rank
+                for(uint64_t i = 0; i < mpiSize; ++i)
+                {
+                    globalNumParticles += allNumParticles[i];
+                    if(i < mpiRank)
+                        myParticleOffset += allNumParticles[i];
+                }
+                log<picLog::INPUT_OUTPUT>("openPMD:   ( end ) count particles: %1% = %2%") % T_SpeciesFilter::getName()
+                    % globalNumParticles;
+
+                ::openPMD::ParticleSpecies& particleSpecies = iteration.particles[speciesGroup];
+
+                // copy over particles to host
+                openPMDFrameType hostFrame;
+
+                strategy->malloc(T_SpeciesFilter::getName(), hostFrame, myNumParticles);
+                RunParameters_T runParameters(
+                    dc,
+                    *params,
+                    speciesTmp,
+                    filter,
+                    particleFilter,
+                    particleOffset,
+                    myNumParticles,
+                    globalNumParticles);
+                if(globalNumParticles > 0)
+                {
+                    strategy->prepare(T_SpeciesFilter::getName(), hostFrame, std::move(runParameters));
+                }
+                log<picLog::INPUT_OUTPUT>("openPMD:  (begin) write particle records for %1%")
+                    % T_SpeciesFilter::getName();
+
+                meta::ForEach<typename openPMDFrameType::ValueTypeSeq, openPMD::ParticleAttribute<bmpl::_1>>
+                    writeToOpenPMD;
+                writeToOpenPMD(
+                    params,
+                    hostFrame,
+                    particleSpecies,
+                    basename,
+                    myNumParticles,
+                    globalNumParticles,
+                    myParticleOffset);
+
+                log<picLog::INPUT_OUTPUT>("openPMD:  (begin) free memory: %1%") % T_SpeciesFilter::getName();
+                /* free host memory */
+                strategy->free(hostFrame);
+                log<picLog::INPUT_OUTPUT>("openPMD:  (end) free memory: %1%") % T_SpeciesFilter::getName();
+
+                log<picLog::INPUT_OUTPUT>("openPMD: ( end ) writing species: %1%") % T_SpeciesFilter::getName();
+
+                /* write species counter table to openPMD storage */
+                log<picLog::INPUT_OUTPUT>("openPMD: (begin) writing particle patches for %1%")
+                    % T_SpeciesFilter::getName();
+                {
+                    using index_t = uint64_t;
+                    ::openPMD::Datatype const datatype = ::openPMD::determineDatatype<index_t>();
+                    // not const, we'll switch out the JSON config
+                    ::openPMD::Dataset ds(datatype, {mpiSize});
+
+                    ::openPMD::ParticlePatches particlePatches = particleSpecies.particlePatches;
+                    ::openPMD::PatchRecordComponent numParticles
+                        = particlePatches["numParticles"][::openPMD::RecordComponent::SCALAR];
+                    ::openPMD::PatchRecordComponent numParticlesOffset
+                        = particlePatches["numParticlesOffset"][::openPMD::RecordComponent::SCALAR];
+
+                    setDatasetOptions(ds, params->jsonMatcher->get(basename + "/particlePatches/numParticles"));
+                    numParticles.resetDataset(ds);
+                    setDatasetOptions(ds, params->jsonMatcher->get(basename + "/particlePatches/numParticlesOffset"));
+                    numParticlesOffset.resetDataset(ds);
+
+                    /* It is safe to use the mpi rank to write the data even if the rank can differ between simulation
+                     * runs. During the restart the plugin is using patch information to find the corresponding data.
+                     */
+                    numParticles.store<index_t>(mpiRank, myNumParticles);
+                    numParticlesOffset.store<index_t>(mpiRank, myParticleOffset);
+
+                    ::openPMD::PatchRecord offset = particlePatches["offset"];
+                    ::openPMD::PatchRecord extent = particlePatches["extent"];
+                    auto const patchExtent = params->window.localDimensions.size;
+
+                    // one offset/extent component per simulation dimension
+                    for(size_t d = 0; d < simDim; ++d)
+                    {
+                        ::openPMD::PatchRecordComponent offset_x = offset[name_lookup[d]];
+                        ::openPMD::PatchRecordComponent extent_x = extent[name_lookup[d]];
+                        setDatasetOptions(
+                            ds,
+                            params->jsonMatcher->get(basename + "/particlePatches/offset/" + name_lookup[d]));
+                        offset_x.resetDataset(ds);
+                        setDatasetOptions(
+                            ds,
+                            params->jsonMatcher->get(basename + "/particlePatches/extent/" + name_lookup[d]));
+                        extent_x.resetDataset(ds);
+
+                        offset_x.store<index_t>(mpiRank, particleOffset[d]);
+                        extent_x.store<index_t>(mpiRank, patchExtent[d]);
+                    }
+
+                    /* openPMD ED-PIC: additional attributes */
+                    setParticleAttributes(particleSpecies, *params->jsonMatcher, basename);
+                    params->openPMDSeries->flush();
+                }
+
+                log<picLog::INPUT_OUTPUT>("openPMD: ( end ) writing particle patches for %1%")
+                    % T_SpeciesFilter::getName();
+            }
+        };
+
+
+    } // namespace openPMD
+
+} // namespace picongpu
diff --git a/include/picongpu/plugins/openPMD/openPMDDimension.hpp b/include/picongpu/plugins/openPMD/openPMDDimension.hpp
new file mode 100644
index 0000000000..acc86d2032
--- /dev/null
+++ b/include/picongpu/plugins/openPMD/openPMDDimension.hpp
@@ -0,0 +1,58 @@
+/* Copyright 2014-2021 Axel Huebl, Felix Schmitt, Heiko Burau, Rene Widera,
+ *                     Franz Poeschel
+ *
+ * This file is part of PIConGPU.
+ *
+ * PIConGPU is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * PIConGPU is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with PIConGPU.
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "picongpu/simulation_defines.hpp"
+#include <openPMD/openPMD.hpp>
+
+#include <vector>
+#include <map>
+
+namespace picongpu
+{
+    namespace openPMD
+    {
+        /** Translate a PIConGPU unit dimension vector into an openPMD-api dimension map
+         *
+         * Entry i of the input is stored under the i-th openPMD base unit, where the base units
+         * are taken in the order L, M, T, I, theta, N, J.
+         *
+         * @param unitDimension PIConGPU dimension vector, asserted to hold exactly seven entries
+         * @return openPMD-api dimension map
+         */
+        inline auto convertToUnitDimension(std::vector<float_64> const& unitDimension)
+        {
+            // seven openPMD base units
+            constexpr unsigned numBaseUnits = 7u;
+            PMACC_ASSERT(unitDimension.size() == numBaseUnits);
+
+            constexpr ::openPMD::UnitDimension baseUnits[numBaseUnits]
+                = {::openPMD::UnitDimension::L,
+                   ::openPMD::UnitDimension::M,
+                   ::openPMD::UnitDimension::T,
+                   ::openPMD::UnitDimension::I,
+                   ::openPMD::UnitDimension::theta,
+                   ::openPMD::UnitDimension::N,
+                   ::openPMD::UnitDimension::J};
+
+            std::map<::openPMD::UnitDimension, double> result;
+            unsigned idx = 0u;
+            for(auto const baseUnit : baseUnits)
+            {
+                result[baseUnit] = unitDimension[idx];
+                ++idx;
+            }
+
+            return result;
+        }
+    } // namespace openPMD
+} // namespace picongpu
diff --git a/include/picongpu/plugins/openPMD/openPMDVersion.def b/include/picongpu/plugins/openPMD/openPMDVersion.def
new file mode 100644
index 0000000000..b38f8dea71
--- /dev/null
+++ b/include/picongpu/plugins/openPMD/openPMDVersion.def
@@ -0,0 +1,75 @@
+/* Copyright 2020-2021 Franz Poeschel
+ *
+ * This file is part of PIConGPU.
+ *
+ * PIConGPU is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * PIConGPU is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with PIConGPU.
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+#pragma once
+
+#include "openPMD/openPMD.hpp"
+
+/* WRITE_ITERATIONS is the member of ::openPMD::Series through which iterations are accessed for
+ * writing, used as `series.WRITE_ITERATIONS[step]`. It expands to `writeIterations()` for
+ * openPMD API 0.13.0 or later and to the plain `iterations` member for older versions.
+ */
+#if OPENPMDAPI_VERSION_GE(0, 13, 0)
+// Streaming API is available, use it
+#    define WRITE_ITERATIONS writeIterations()
+#else
+// Not available, don't use it
+#    define WRITE_ITERATIONS iterations
+#endif
+
+namespace picongpu
+{
+    namespace openPMD
+    {
+        /*
+         * Detect via SFINAE whether the openPMD API supports dataset-specific
+         * configuration, i.e. whether ::openPMD::Dataset has a member `options`.
+         */
+        namespace detail
+        {
+            /* Helper for void_t.
+             *
+             * An alias template that ignores its argument (`template<typename> using void_t = void;`)
+             * is not guaranteed to take part in SFINAE before the resolution of CWG issue 1558.
+             * Routing the arguments through a class template forces their substitution.
+             */
+            template<typename...>
+            struct make_void
+            {
+                using type = void;
+            };
+
+            // As std::void_t in C++17.
+            template<typename... Ts>
+            using void_t = typename make_void<Ts...>::type;
+
+            /** Fallback: the Dataset type has no member `options`
+             *
+             * Nothing is stored in the dataset. A hint is printed to std::cerr if options
+             * other than "{}" were requested.
+             */
+            template<typename = ::openPMD::Dataset, typename = void>
+            struct SetDatasetOptions
+            {
+                static void run(::openPMD::Dataset const&, std::string const& options)
+                {
+                    if(options != "{}")
+                    {
+                        std::cerr
+                            << "[openPMD plugin] Setting dataset-specific JSON options requires openPMD API 0.13.0 "
+                               "or later."
+                            << std::endl;
+                    }
+                }
+            };
+
+            /** Selected if `Dataset::options` is well-formed: store the options in the dataset */
+            template<typename Dataset>
+            struct SetDatasetOptions<Dataset, void_t<decltype(Dataset::options)>>
+            {
+                static void run(Dataset& ds, std::string options)
+                {
+                    ds.options = std::move(options);
+                }
+            };
+        } // namespace detail
+
+        /** Attach dataset-specific JSON options to an openPMD dataset
+         *
+         * Forwards to detail::SetDatasetOptions, which stores the options in `ds.options` if the
+         * openPMD API provides that member and otherwise prints a hint for options other than "{}".
+         *
+         * Declared inline because this header-defined function would otherwise be defined once per
+         * translation unit that includes the file.
+         *
+         * @param ds dataset that receives the options
+         * @param options JSON options as string
+         */
+        inline void setDatasetOptions(::openPMD::Dataset& ds, std::string options)
+        {
+            detail::SetDatasetOptions<>::run(ds, std::move(options));
+        }
+    } // namespace openPMD
+} // namespace picongpu
diff --git a/include/picongpu/plugins/openPMD/openPMDWriter.def b/include/picongpu/plugins/openPMD/openPMDWriter.def
new file mode 100644
index 0000000000..6ed571f1cd
--- /dev/null
+++ b/include/picongpu/plugins/openPMD/openPMDWriter.def
@@ -0,0 +1,138 @@
+/* Copyright 2014-2021 Felix Schmitt, Axel Huebl, Franz Poeschel
+ *
+ * This file is part of PIConGPU.
+ *
+ * PIConGPU is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * PIConGPU is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with PIConGPU.
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "picongpu/simulation/control/MovingWindow.hpp"
+#include "picongpu/simulation_defines.hpp"
+
+#include "picongpu/plugins/openPMD/Json.hpp"
+
+#include <pmacc/math/Vector.hpp>
+#include <pmacc/particles/frame_types.hpp>
+#include <pmacc/types.hpp>
+
+#include <openPMD/openPMD.hpp>
+
+#include <iostream> // std::cerr
+#include <limits>
+#include <list>
+#include <memory> // std::unique_ptr
+#include <sstream>
+#include <stdexcept> // throw std::runtime_error
+#include <string>
+
+#include <type_traits>
+
+namespace picongpu
+{
+    namespace openPMD
+    {
+        using namespace pmacc;
+        // shorthand used for the ThreadParams::jsonMatcher member below
+        using AbstractJsonMatcher = json::AbstractJsonMatcher;
+
+
+        namespace po = boost::program_options;
+
+
+// Base paths handed to Series::setMeshesPath() / Series::setParticlesPath()
+// by ThreadParams::openSeries() when a Series is created.
+#define MESHES_PATH "fields"
+#define PARTICLES_PATH "particles"
+
+        /** Copy a vector-like object into a standard container.
+         *
+         * The definition (openPMDWriter.hpp) writes input component i to output
+         * position dim - 1 - i, i.e. the component order is reversed.
+         *
+         * @tparam T_Vec input type; must provide the nested type `type`, the
+         *               constant `dim` and operator[]
+         * @tparam T_Ret result type, by default a std::vector of T_Vec's `type`
+         */
+        template<typename T_Vec, typename T_Ret = std::vector<typename std::remove_reference<T_Vec>::type::type>>
+        T_Ret asStandardVector(T_Vec const&);
+
+        /** Strategy for the preparation of particle data.
+         *
+         * Selected in ThreadParams::initFromConfig() from the
+         * `dataPreparationStrategy` option.
+         */
+        enum class WriteSpeciesStrategy
+        {
+            ADIOS, //!< option values "doubleBuffer" or "adios"
+            HDF5 //!< option values "mappedMemory" or "hdf5"
+        };
+
+
+        /**
+         * Forward declarations.
+         *
+         * openPMDWriter writes simulation data to openPMD series and implements
+         * the IIOBackend interface. Help holds the command line options of the
+         * plugin. The class-key of each declaration matches its definition
+         * (`class openPMDWriter`, `struct Help`) to avoid mismatched-tag
+         * diagnostics.
+         */
+
+        class openPMDWriter;
+        struct Help;
+
+        /** State shared by the routines of the openPMD plugin while writing or reading a Series. */
+        struct ThreadParams
+        {
+            uint32_t currentStep; /** current simulation step */
+
+
+            std::unique_ptr<::openPMD::Series> openPMDSeries; /* is null iff there is no series currently open */
+
+            /** current dump is a checkpoint */
+            bool isCheckpoint;
+
+            MPI_Comm communicator; /* MPI communicator for openPMD API */
+            std::string compressionMethod; /* openPMD data transform compression method */
+            std::string fileName; /* Name of the openPMDSeries, excluding infix and extension */
+            std::string fileExtension; /* Extension of the file name */
+            /* Inserted between fileName and "." + fileExtension when the Series is opened;
+             * initFromConfig() sets it to "" for the option value "NULL" or the extension "sst" */
+            std::string fileInfix;
+
+            /* Created once in initFromConfig(); provides the global JSON config (getDefault())
+             * and the dataset-specific one (get(datasetName)) */
+            std::unique_ptr<AbstractJsonMatcher> jsonMatcher;
+
+            /* Particle data preparation strategy, selected in initFromConfig() */
+            WriteSpeciesStrategy strategy = WriteSpeciesStrategy::ADIOS;
+
+            pmacc::math::UInt64<simDim> fieldsSizeDims;
+            pmacc::math::UInt64<simDim> fieldsGlobalSizeDims;
+            pmacc::math::UInt64<simDim> fieldsOffsetDims;
+
+            GridLayout<simDim> gridLayout;
+            MappingDesc* cellDescription;
+
+            std::vector<float_X> fieldBuffer; /* temp. buffer for fields */
+
+            Window window; /* window describing the volume to be dumped */
+
+            DataSpace<simDim> localWindowToDomainOffset; /** offset from local moving
+                                                            window to local domain */
+
+            std::vector<double> times;
+
+            /**
+             * Open the Series named fileName + fileInfix + "." + fileExtension.
+             * @param at access mode handed to the ::openPMD::Series constructor
+             * @return reference to the newly opened Series
+             * @throws std::runtime_error if a Series is already open or if the
+             *         backend of the opened Series is "MPI_ADIOS1"
+             */
+            ::openPMD::Series& openSeries(::openPMD::Access at);
+
+            /**
+             * Close the currently open Series.
+             * @throws std::runtime_error if no Series is open
+             */
+            void closeSeries();
+
+            /**
+             * Take over file name pattern, JSON configuration and data
+             * preparation strategy from the options of plugin instance `id`.
+             * @param file file base name; prefixed with `dir` unless it has a root path
+             * @param dir common output directory
+             */
+            void initFromConfig(Help&, size_t id, std::string const& file, std::string const& dir);
+
+            /**
+             * Wrapper for ::openPMD::RecordComponent::resetDataset, set dataset parameters
+             * @tparam DIM number of variable dimensions
+             * @param recordComponent Location of the dataset within the openPMD
+             * Series
+             * @param datatype Variable type
+             * @param globalDimensions Dataset global dimensions
+             * @param compression Enable compression data transform
+             * @param compressionMethod String denoting the data transform to use;
+             * "none" disables compression even if `compression` is true
+             * @param datasetName Name used to look up the dataset-specific JSON
+             * options via jsonMatcher
+             * @return The input recordComponent
+             */
+            template<unsigned DIM>
+            ::openPMD::RecordComponent& initDataset(
+                ::openPMD::RecordComponent& recordComponent,
+                ::openPMD::Datatype datatype,
+                pmacc::math::UInt64<DIM> const& globalDimensions,
+                bool compression,
+                std::string const& compressionMethod,
+                std::string const& datasetName);
+        };
+    } // namespace openPMD
+} // namespace picongpu
diff --git a/include/picongpu/plugins/openPMD/openPMDWriter.hpp b/include/picongpu/plugins/openPMD/openPMDWriter.hpp
new file mode 100644
index 0000000000..542bbb7779
--- /dev/null
+++ b/include/picongpu/plugins/openPMD/openPMDWriter.hpp
@@ -0,0 +1,1260 @@
+/* Copyright 2014-2021 Axel Huebl, Felix Schmitt, Heiko Burau, Rene Widera,
+ *                     Benjamin Worpitz, Alexander Grund, Franz Poeschel
+ *
+ * This file is part of PIConGPU.
+ *
+ * PIConGPU is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * PIConGPU is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with PIConGPU.
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "picongpu/fields/FieldB.hpp"
+#include "picongpu/fields/FieldE.hpp"
+#include "picongpu/fields/FieldJ.hpp"
+#include "picongpu/fields/FieldTmp.hpp"
+#include "picongpu/particles/filter/filter.hpp"
+#include "picongpu/particles/traits/SpeciesEligibleForSolver.hpp"
+#include "picongpu/plugins/misc/ComponentNames.hpp"
+#include "picongpu/plugins/misc/SpeciesFilter.hpp"
+#include "picongpu/plugins/misc/misc.hpp"
+#include "picongpu/plugins/multi/IHelp.hpp"
+#include "picongpu/plugins/multi/Option.hpp"
+#include "picongpu/plugins/openPMD/openPMDWriter.def"
+#include "picongpu/simulation/control/MovingWindow.hpp"
+#include "picongpu/simulation_defines.hpp"
+#include "picongpu/traits/IsFieldDomainBound.hpp"
+
+#include <pmacc/Environment.hpp>
+#include <pmacc/assert.hpp>
+#include <pmacc/communication/manager_common.hpp>
+#include <pmacc/dataManagement/DataConnector.hpp>
+#include <pmacc/dimensions/GridLayout.hpp>
+#include <pmacc/mappings/simulation/GridController.hpp>
+#include <pmacc/mappings/simulation/SubGrid.hpp>
+#include <pmacc/math/Vector.hpp>
+#include <pmacc/particles/IdProvider.def>
+#include <pmacc/particles/frame_types.hpp>
+#include <pmacc/particles/operations/CountParticles.hpp>
+#include <pmacc/pluginSystem/PluginConnector.hpp>
+#include <pmacc/simulationControl/TimeInterval.hpp>
+#include <pmacc/static_assert.hpp>
+#include <pmacc/particles/memory/buffers/MallocMCBuffer.hpp>
+
+#include "picongpu/plugins/misc/SpeciesFilter.hpp"
+#include "picongpu/plugins/openPMD/Json.hpp"
+#include "picongpu/plugins/openPMD/NDScalars.hpp"
+#include "picongpu/plugins/openPMD/WriteMeta.hpp"
+#include "picongpu/plugins/openPMD/openPMDVersion.def"
+#include "picongpu/plugins/openPMD/WriteSpecies.hpp"
+#include "picongpu/plugins/openPMD/restart/LoadSpecies.hpp"
+#include "picongpu/plugins/openPMD/restart/RestartFieldLoader.hpp"
+#include "picongpu/plugins/output/IIOBackend.hpp"
+
+#include <pmacc/traits/Limits.hpp>
+
+#include <boost/filesystem.hpp>
+#include <boost/mpl/at.hpp>
+#include <boost/mpl/begin_end.hpp>
+#include <boost/mpl/find.hpp>
+#include <boost/mpl/pair.hpp>
+#include <boost/mpl/size.hpp>
+#include <boost/mpl/vector.hpp>
+#include <boost/type_traits.hpp>
+#include <boost/type_traits/is_same.hpp>
+
+#include <openPMD/openPMD.hpp>
+
+#if !defined(_WIN32)
+#    include <unistd.h>
+#endif
+
+#include <algorithm>
+#include <cstdint>
+#include <cstdlib> // getenv
+#include <list>
+#include <pthread.h>
+#include <sstream>
+#include <string>
+#include <vector>
+
+
+namespace picongpu
+{
+    namespace openPMD
+    {
+        // NOTE: both declarations repeat what openPMDWriter.def (included above)
+        // already provides for this namespace.
+        using namespace pmacc;
+
+
+        namespace po = boost::program_options;
+
+        /** Describe the dataset of a record component and hand it to the openPMD API.
+         *
+         * The extent is taken from globalDimensions (converted by
+         * asStandardVector()), the dataset-specific JSON options are looked up
+         * by datasetName, and the compression method is only set if compression
+         * is requested and the method is not "none".
+         *
+         * @return the recordComponent passed in
+         */
+        template<unsigned DIM>
+        ::openPMD::RecordComponent& ThreadParams::initDataset(
+            ::openPMD::RecordComponent& recordComponent,
+            ::openPMD::Datatype datatype,
+            pmacc::math::UInt64<DIM> const& globalDimensions,
+            bool compression,
+            std::string const& compressionMethod,
+            std::string const& datasetName)
+        {
+            std::vector<uint64_t> extent = asStandardVector(globalDimensions);
+            ::openPMD::Dataset description{datatype, std::move(extent)};
+
+            // dataset-specific JSON configuration
+            setDatasetOptions(description, jsonMatcher->get(datasetName));
+
+            bool const useCompression = compression && compressionMethod != "none";
+            if(useCompression)
+            {
+                description.compression = compressionMethod;
+            }
+
+            recordComponent.resetDataset(std::move(description));
+            return recordComponent;
+        }
+
+
+        /** Copy a vector-like object into a standard container, reversing the component order.
+         *
+         * @tparam T_Vec input type providing the constant `dim` and operator[]
+         * @tparam T_Ret result type; constructed with `dim` elements
+         * @param v input vector
+         * @return container of size `dim` whose element dim - 1 - i is v[i]
+         */
+        template<typename T_Vec, typename T_Ret>
+        T_Ret asStandardVector(T_Vec const& v)
+        {
+            // no leading double underscore: such identifiers are reserved for the implementation
+            using VecType = typename std::remove_reference<T_Vec>::type;
+            constexpr auto dim = VecType::dim;
+            T_Ret res(dim);
+            for(unsigned i = 0; i < dim; ++i)
+            {
+                // component i of the input ends up at position dim - 1 - i
+                res[dim - i - 1] = v[i];
+            }
+            return res;
+        }
+
+        /** Open the Series fileName + fileInfix + "." + fileExtension.
+         *
+         * @param at access mode handed to the ::openPMD::Series constructor
+         * @return reference to the Series that is now held by openPMDSeries
+         * @throws std::runtime_error if a Series is already open or if the
+         *         opened Series reports the backend "MPI_ADIOS1"
+         */
+        ::openPMD::Series& ThreadParams::openSeries(::openPMD::Access at)
+        {
+            // only one Series may be active at a time
+            if(openPMDSeries)
+            {
+                throw std::runtime_error("openPMD: Tried opening a Series while old Series was still "
+                                         "active");
+            }
+
+            std::string seriesPath = fileName + fileInfix + "." + fileExtension;
+            log<picLog::INPUT_OUTPUT>("openPMD: open file: %1%") % seriesPath;
+
+            // avoid deadlock between not finished pmacc tasks and mpi calls in
+            // openPMD
+            __getTransactionEvent().waitForFinished();
+            openPMDSeries
+                = std::make_unique<::openPMD::Series>(seriesPath, at, communicator, jsonMatcher->getDefault());
+
+            // The text between R"END( and )END" is the message itself, including
+            // the whitespace in front of the closing delimiter.
+            if(openPMDSeries->backend() == "MPI_ADIOS1")
+            {
+                throw std::runtime_error(R"END(
+Using ADIOS1 through PIConGPU's openPMD plugin is not supported.
+Please pick either of the following:
+* Use the ADIOS plugin.
+* Use the openPMD plugin with another backend, such as ADIOS2.
+  If the openPMD API has been compiled with support for ADIOS2, the openPMD API
+  will automatically prefer using ADIOS2 over ADIOS1.
+  Make sure that environment variable OPENPMD_BP_BACKEND is not set to ADIOS1.
+                )END");
+            }
+
+            // base paths are only set for a newly created Series
+            if(at == ::openPMD::Access::CREATE)
+            {
+                openPMDSeries->setMeshesPath(MESHES_PATH);
+                openPMDSeries->setParticlesPath(PARTICLES_PATH);
+            }
+
+            log<picLog::INPUT_OUTPUT>("openPMD: successfully opened file: %1%") % seriesPath;
+            return *openPMDSeries;
+        }
+
+        /** Close the currently open Series and synchronize all ranks of the communicator.
+         *
+         * @throws std::runtime_error if no Series is open
+         */
+        void ThreadParams::closeSeries()
+        {
+            if(openPMDSeries)
+            {
+                log<picLog::INPUT_OUTPUT>("openPMD: close file: %1%") % fileName;
+                openPMDSeries.reset();
+                // Do not silently drop the MPI return code; MPI_CHECK is used the
+                // same way for MPI_Comm_dup in the openPMDWriter constructor.
+                MPI_CHECK(MPI_Barrier(this->communicator));
+                log<picLog::INPUT_OUTPUT>("openPMD: successfully closed file: %1%") % fileName;
+            }
+            else
+            {
+                throw std::runtime_error("openPMD: Tried closing a Series that was not active");
+            }
+        }
+
+
+        /** Command line options and meta information of the openPMD plugin.
+         *
+         * Each Option member stores one value per plugin instance; the value of
+         * instance `id` is read with `get(id)`.
+         */
+        struct Help : public plugins::multi::IHelp
+        {
+            /** creates an instance of ISlave
+             *
+             * @param help plugin defined help
+             * @param id index of the plugin, range: [0;help->getNumPlugins())
+             * @param cellDescription presumably handed to the openPMDWriter
+             *        constructor (definition is outside this struct) - confirm
+             */
+            std::shared_ptr<plugins::multi::ISlave> create(
+                std::shared_ptr<IHelp>& help,
+                size_t const id,
+                MappingDesc* cellDescription);
+            // defined later since we need openPMDWriter constructor
+
+            plugins::multi::Option<std::string> notifyPeriod = {"period", "enable openPMD IO [for each n-th step]"};
+
+            plugins::multi::Option<std::string> source = {"source", "data sources: ", "species_all, fields_all"};
+
+            //! names accepted for the `source` option; extended in registerHelp()
+            std::vector<std::string> allowedDataSources = {"species_all", "fields_all"};
+
+            plugins::multi::Option<std::string> fileName = {"file", "openPMD file basename"};
+
+            // the first literal ends with a blank so that the concatenated help
+            // text reads "controls the backend"
+            plugins::multi::Option<std::string> fileNameExtension
+                = {"ext",
+                   "openPMD filename extension (this controls the "
+                   "backend picked by the openPMD API)",
+                   "bp"};
+
+            plugins::multi::Option<std::string> fileNameInfix
+                = {"infix",
+                   "openPMD filename infix (use to pick file- or group-based "
+                   "layout in openPMD)\nSet to NULL to keep empty (e.g. to pick"
+                   " group-based iteration layout). Parameter will be ignored"
+                   " if a streaming backend is detected in 'ext' parameter and"
+                   " an empty string will be assumed instead.",
+                   "_%06T"};
+
+            plugins::multi::Option<std::string> jsonConfig
+                = {"json", "advanced (backend) configuration for openPMD in JSON format", "{}"};
+
+            plugins::multi::Option<std::string> dataPreparationStrategy
+                = {"dataPreparationStrategy",
+                   "Strategy for preparation of particle data ('doubleBuffer' or "
+                   "'mappedMemory'). Aliases 'adios' and 'hdf5' may be used "
+                   "respectively.",
+                   "doubleBuffer"};
+
+            plugins::multi::Option<std::string> compression
+                = {"compression",
+                   "Backend-specific openPMD compression method, e.g., zlib (see "
+                   "`adios_config -m` for help). Legacy parameter until compression"
+                   " can be fully configured via JSON in the openPMD API.",
+                   "none"};
+
+            /** defines if the plugin must register itself to the PMacc plugin
+             * system
+             *
+             * true = the plugin is registering itself
+             * false = the plugin is not registering itself (plugin is
+             * controlled by another class)
+             *
+             * Set to true by registerHelp().
+             */
+            bool selfRegister = false;
+
+            //! maps a (species, filter) tuple to the corresponding SpeciesFilter type
+            template<typename T_TupleVector>
+            struct CreateSpeciesFilter
+            {
+                using type = plugins::misc::SpeciesFilter<
+                    typename pmacc::math::CT::At<T_TupleVector, bmpl::int_<0>>::type,
+                    typename pmacc::math::CT::At<T_TupleVector, bmpl::int_<1>>::type>;
+            };
+
+            //! all combinations of output species and particle filters
+            using AllParticlesTimesAllFilters = typename AllCombinations<
+                bmpl::vector<FileOutputParticles, particles::filter::AllParticleFilters>>::type;
+
+            using AllSpeciesFilter =
+                typename bmpl::transform<AllParticlesTimesAllFilters, CreateSpeciesFilter<bmpl::_1>>::type;
+
+            //! the combinations for which speciesFilter::IsEligible holds
+            using AllEligibleSpeciesSources =
+                typename bmpl::copy_if<AllSpeciesFilter, plugins::misc::speciesFilter::IsEligible<bmpl::_1>>::type;
+
+            using AllFieldSources = FileOutputFields;
+
+            ///! method used by plugin controller to get --help description
+            void registerHelp(
+                boost::program_options::options_description& desc,
+                std::string const& masterPrefix = std::string{})
+            {
+                // append the names of all eligible species and field sources
+                meta::ForEach<AllEligibleSpeciesSources, plugins::misc::AppendName<bmpl::_1>>
+                    getEligibleDataSourceNames;
+                getEligibleDataSourceNames(allowedDataSources);
+
+                meta::ForEach<AllFieldSources, plugins::misc::AppendName<bmpl::_1>> appendFieldSourceNames;
+                appendFieldSourceNames(allowedDataSources);
+
+                // string list with all possible particle sources
+                std::string concatenatedSourceNames = plugins::misc::concatenateToString(allowedDataSources, ", ");
+
+                notifyPeriod.registerHelp(desc, masterPrefix + prefix);
+                source.registerHelp(desc, masterPrefix + prefix, std::string("[") + concatenatedSourceNames + "]");
+
+                // note: the remaining options are registered with an empty master prefix
+                expandHelp(desc, "");
+                selfRegister = true;
+            }
+
+            //! register all options except `period` and `source`
+            void expandHelp(
+                boost::program_options::options_description& desc,
+                std::string const& masterPrefix = std::string{})
+            {
+                compression.registerHelp(desc, masterPrefix + prefix);
+                fileName.registerHelp(desc, masterPrefix + prefix);
+                fileNameExtension.registerHelp(desc, masterPrefix + prefix);
+                fileNameInfix.registerHelp(desc, masterPrefix + prefix);
+                jsonConfig.registerHelp(desc, masterPrefix + prefix);
+                dataPreparationStrategy.registerHelp(desc, masterPrefix + prefix);
+            }
+
+            /** check the options of a self-registered plugin
+             *
+             * @throws std::runtime_error if `period` or `file` is missing or if
+             *         an unknown data source name was passed
+             */
+            void validateOptions()
+            {
+                if(selfRegister)
+                {
+                    if(notifyPeriod.empty() || fileName.empty())
+                        throw std::runtime_error(name + ": parameter period and file must be defined");
+
+                    // check if user passed data source names are valid
+                    for(auto const& dataSourceNames : source)
+                    {
+                        auto vectorOfDataSourceNames
+                            = plugins::misc::splitString(plugins::misc::removeSpaces(dataSourceNames));
+
+                        for(auto const& f : vectorOfDataSourceNames)
+                        {
+                            if(!plugins::misc::containsObject(allowedDataSources, f))
+                            {
+                                throw std::runtime_error(name + ": unknown data source '" + f + "'");
+                            }
+                        }
+                    }
+                }
+            }
+
+            //! number of `period` options if self-registered, otherwise 1
+            size_t getNumPlugins() const
+            {
+                if(selfRegister)
+                    return notifyPeriod.size();
+                else
+                    return 1;
+            }
+
+            std::string getDescription() const
+            {
+                return description;
+            }
+
+            std::string getOptionPrefix() const
+            {
+                return prefix;
+            }
+
+            std::string getName() const
+            {
+                return name;
+            }
+
+            //! name of the plugin, used as prefix of error messages
+            std::string const name = "openPMDWriter";
+            //! short description of the plugin
+            std::string const description = "dump simulation data with openPMD";
+            //! prefix used for command line arguments
+            std::string const prefix = "openPMD";
+        };
+
+        /** Take over the settings of plugin instance `id` from the command line options.
+         *
+         * Sets fileExtension, fileInfix, fileName and strategy, and creates the
+         * JSON matcher on first use.
+         *
+         * @param help options of the plugin
+         * @param id index of the plugin instance
+         * @param file file base name; prefixed with `dir` unless it has a root path
+         * @param dir common output directory
+         */
+        void ThreadParams::initFromConfig(Help& help, size_t id, std::string const& file, std::string const& dir)
+        {
+            fileExtension = help.fileNameExtension.get(id);
+            fileInfix = help.fileNameInfix.get(id);
+
+            // "NULL" requests an empty infix; the extension "sst" enforces the
+            // group-based iteration layout for streaming backends
+            bool const dropInfix = fileInfix == "NULL" || fileExtension == "sst";
+            if(dropInfix)
+            {
+                fileInfix.clear();
+            }
+
+            /* if file name is relative, prepend with common directory */
+            if(boost::filesystem::path(file).has_root_path())
+            {
+                fileName = file;
+            }
+            else
+            {
+                fileName = dir + "/" + file;
+            }
+
+            // avoid deadlock between not finished pmacc tasks and mpi blocking collectives
+            __getTransactionEvent().waitForFinished();
+
+            log<picLog::INPUT_OUTPUT>("openPMD: setting file pattern: %1%%2%.%3%") % fileName % fileInfix
+                % fileExtension;
+
+            // the JSON config is parsed only once and reused afterwards
+            if(!jsonMatcher)
+            {
+                jsonMatcher = AbstractJsonMatcher::construct(help.jsonConfig.get(id), communicator);
+            }
+
+            log<picLog::INPUT_OUTPUT>("openPMD: global JSON config: %1%") % jsonMatcher->getDefault();
+
+            std::string const requestedStrategy = help.dataPreparationStrategy.get(id);
+            bool const wantsDoubleBuffer = requestedStrategy == "adios" || requestedStrategy == "doubleBuffer";
+            bool const wantsMappedMemory = requestedStrategy == "hdf5" || requestedStrategy == "mappedMemory";
+
+            if(wantsDoubleBuffer)
+            {
+                strategy = WriteSpeciesStrategy::ADIOS;
+            }
+            else if(wantsMappedMemory)
+            {
+                strategy = WriteSpeciesStrategy::HDF5;
+            }
+            else
+            {
+                // the previously set strategy stays in effect
+                std::cerr << "Passed dataPreparationStrategy for openPMD"
+                             " plugin is invalid."
+                          << std::endl;
+            }
+        }
+
+        /** Writes simulation data to openPMD.
+         *
+         * Implements the IIOBackend interface.
+         */
+        class openPMDWriter : public IIOBackend
+        {
+        public:
+            /** must be implemented by the user
+             *
+             * @return a newly created Help instance, handed out through the
+             *         IHelp interface
+             */
+            static std::shared_ptr<plugins::multi::IHelp> getHelp()
+            {
+                // make_shared avoids the naked new and the separate allocation
+                // of the control block
+                return std::make_shared<Help>();
+            }
+
+        private:
+            /** Copy the first numComponents entries of `unit` into a vector of float_64.
+             *
+             * @param unit object providing operator[]
+             * @param numComponents number of entries to copy
+             */
+            template<typename UnitType>
+            static std::vector<float_64> createUnit(UnitType unit, uint32_t numComponents)
+            {
+                std::vector<float_64> result;
+                result.reserve(numComponents);
+                for(uint32_t component = 0; component < numComponents; ++component)
+                {
+                    result.push_back(unit[component]);
+                }
+                return result;
+            }
+
+            /**
+             * Write calculated fields to openPMD.
+             *
+             * The functor fetches T_Field from the DataConnector, collects the
+             * in-cell position of every component and forwards the host data to
+             * openPMDWriter::writeField().
+             */
+            template<typename T_Field>
+            struct GetFields
+            {
+            private:
+                using ValueType = typename T_Field::ValueType;
+                using ComponentType = typename GetComponentsType<ValueType>::type;
+                using UnitType = typename T_Field::UnitValueType;
+
+            public:
+                //! unit of T_Field with one entry per component
+                static std::vector<float_64> getUnit()
+                {
+                    UnitType const fieldUnit = T_Field::getUnit();
+                    return createUnit(fieldUnit, T_Field::numComponents);
+                }
+
+                //! body is only compiled for the host (no __CUDA_ARCH__)
+                HDINLINE void operator()(ThreadParams* params)
+                {
+#ifndef __CUDA_ARCH__
+                    DataConnector& dataConnector = Environment<simDim>::get().DataConnector();
+
+                    auto fieldPtr = dataConnector.get<T_Field>(T_Field::getName());
+                    params->gridLayout = fieldPtr->getGridLayout();
+                    bool const isDomainBound = traits::IsFieldDomainBound<T_Field>::value;
+
+                    const traits::FieldPosition<fields::CellType, T_Field> fieldPos;
+
+                    // one in-cell position (simDim entries) per field component
+                    std::vector<std::vector<float_X>> inCellPosition;
+                    inCellPosition.reserve(T_Field::numComponents);
+                    for(uint32_t component = 0; component < T_Field::numComponents; ++component)
+                    {
+                        std::vector<float_X> componentPosition(simDim);
+                        for(uint32_t axis = 0; axis < simDim; ++axis)
+                        {
+                            componentPosition[axis] = fieldPos()[component][axis];
+                        }
+                        inCellPosition.push_back(componentPosition);
+                    }
+
+                    /** \todo check if always correct at this point, depends on
+                     * solver implementation */
+                    const float_X timeOffset = 0.0;
+
+                    openPMDWriter::writeField<ComponentType>(
+                        params,
+                        sizeof(ComponentType),
+                        ::openPMD::determineDatatype<ComponentType>(),
+                        GetNComponents<ValueType>::value,
+                        T_Field::getName(),
+                        fieldPtr->getHostDataBox().getPointer(),
+                        getUnit(),
+                        T_Field::getUnitDimension(),
+                        std::move(inCellPosition),
+                        timeOffset,
+                        isDomainBound);
+
+                    dataConnector.releaseData(T_Field::getName());
+#endif
+                }
+            };
+
+            /** Calculate FieldTmp with given solver and particle species
+             * and write them to openPMD.
+             *
+             * FieldTmp is calculated on device and then dumped to openPMD.
+             *
+             * @tparam Solver operation used to compute FieldTmp
+             * @tparam Species particle species the operation is applied to
+             */
+            template<typename Solver, typename Species>
+            struct GetFields<FieldTmpOperation<Solver, Species>>
+            {
+                /*
+                 * This is only a wrapper function to allow disabling nvcc warnings.
+                 * Warning: calling a __host__ function from __host__ __device__
+                 * function.
+                 * Use of PMACC_NO_NVCC_HDWARNING is not possible if we call a
+                 * virtual method inside of the method where we disable the warnings.
+                 * Therefore we create this method and call a new method where we can
+                 * call virtual functions.
+                 */
+                PMACC_NO_NVCC_HDWARNING
+                HDINLINE void operator()(ThreadParams* tparam)
+                {
+                    this->operator_impl(tparam);
+                }
+
+            private:
+                using UnitType = typename FieldTmp::UnitValueType;
+                using ValueType = typename FieldTmp::ValueType;
+                using ComponentType = typename GetComponentsType<ValueType>::type;
+
+                /** Get the unit for the result from the solver*/
+                static std::vector<float_64> getUnit()
+                {
+                    UnitType unit = FieldTmp::getUnit<Solver>();
+                    const uint32_t components = GetNComponents<ValueType>::value;
+                    return createUnit(unit, components);
+                }
+
+                /** Create a name for the openPMD identifier.
+                 */
+                static std::string getName()
+                {
+                    return FieldTmpOperation<Solver, Species>::getName();
+                }
+
+                /** Compute FieldTmp for Species with Solver, copy it to the host
+                 * and write it via openPMDWriter::writeField().
+                 *
+                 * The order of the statements (compute, communication, device to
+                 * host copy, write, release) must be kept.
+                 */
+                HINLINE void operator_impl(ThreadParams* params)
+                {
+                    DataConnector& dc = Environment<>::get().DataConnector();
+
+                    /*## update field ##*/
+
+                    /* load FieldTmp without copying data to host; slot 0 is used */
+                    PMACC_CASSERT_MSG(_please_allocate_at_least_one_FieldTmp_in_memory_param, fieldTmpNumSlots > 0);
+                    auto fieldTmp = dc.get<FieldTmp>(FieldTmp::getUniqueId(0), true);
+                    /* load particles without copying particle data to host */
+                    auto speciesTmp = dc.get<Species>(Species::FrameType::getName(), true);
+
+                    /* reset the device buffer before the solver result is computed */
+                    fieldTmp->getGridBuffer().getDeviceBuffer().setValue(ValueType::create(0.0));
+                    /* run algorithm */
+                    fieldTmp->template computeValue<CORE + BORDER, Solver>(*speciesTmp, params->currentStep);
+
+                    /* the communication event becomes the new transaction event */
+                    EventTask fieldTmpEvent = fieldTmp->asyncCommunication(__getTransactionEvent());
+                    __setTransactionEvent(fieldTmpEvent);
+                    /* copy data to host so that we can write it to disk */
+                    fieldTmp->getGridBuffer().deviceToHost();
+                    dc.releaseData(Species::FrameType::getName());
+                    /*## finish update field ##*/
+
+                    const uint32_t components = GetNComponents<ValueType>::value;
+
+                    /* wrap in a one-component vector for writeField API */
+                    const traits::FieldPosition<typename fields::CellType, FieldTmp> fieldPos;
+
+                    /* only the in-cell position of component 0 is passed on */
+                    std::vector<std::vector<float_X>> inCellPosition;
+                    std::vector<float_X> inCellPositonComponent;
+                    for(uint32_t d = 0; d < simDim; ++d)
+                        inCellPositonComponent.push_back(fieldPos()[0][d]);
+                    inCellPosition.push_back(inCellPositonComponent);
+
+                    /** \todo check if always correct at this point, depends on
+                     * solver implementation */
+                    const float_X timeOffset = 0.0;
+
+                    params->gridLayout = fieldTmp->getGridLayout();
+                    bool const isDomainBound = traits::IsFieldDomainBound<FieldTmp>::value;
+                    /* write data to openPMD Series */
+                    openPMDWriter::template writeField<ComponentType>(
+                        params,
+                        sizeof(ComponentType),
+                        ::openPMD::determineDatatype<ComponentType>(),
+                        components,
+                        getName(),
+                        fieldTmp->getHostDataBox().getPointer(),
+                        getUnit(),
+                        FieldTmp::getUnitDimension<Solver>(),
+                        std::move(inCellPosition),
+                        timeOffset,
+                        isDomainBound);
+
+                    /* release the FieldTmp slot acquired above */
+                    dc.releaseData(FieldTmp::getUniqueId(0));
+                }
+            };
+
+        public:
+            /** constructor
+             *
+             * Stores the options of this plugin instance, registers the
+             * notification period if the plugin registers itself and duplicates
+             * the MPI communicator of the grid controller for the openPMD API.
+             *
+             * @param help instance of the class Help
+             * @param id index of this plugin instance within help
+             * @param cellDescription PIConGPU cell description information for
+             * kernel index mapping
+             */
+            openPMDWriter(std::shared_ptr<plugins::multi::IHelp>& help, size_t const id, MappingDesc* cellDescription)
+                : m_help(std::static_pointer_cast<Help>(help))
+                , m_id(id)
+                , m_cellDescription(cellDescription)
+                , outputDirectory("openPMD")
+                , lastSpeciesSyncStep(pmacc::traits::limits::Max<uint32_t>::value)
+            {
+                mThreadParams.compressionMethod = m_help->compression.get(id);
+
+                GridController<simDim>& gridController = Environment<simDim>::get().GridController();
+                /* It is important that we never change the mpi_pos after this point
+                 * because we get problems with the restart.
+                 * Otherwise we do not know which gpu must load the ghost parts
+                 * around the sliding window.
+                 */
+                mpi_pos = gridController.getPosition();
+                mpi_size = gridController.getGpuNodes();
+
+                if(m_help->selfRegister)
+                {
+                    std::string period = m_help->notifyPeriod.get(id);
+                    /* only register for notify callback when .period is set on
+                     * command line */
+                    bool const periodIsSet = !period.empty();
+                    if(periodIsSet)
+                    {
+                        Environment<>::get().PluginConnector().setNotificationPeriod(this, period);
+
+                        /** create notify directory */
+                        Environment<simDim>::get().Filesystem().createDirectoryWithPermissions(outputDirectory);
+                    }
+                }
+
+                // avoid deadlock between not finished pmacc tasks and mpi blocking
+                // collectives
+                __getTransactionEvent().waitForFinished();
+
+                // start from a null handle, then duplicate the communicator of
+                // the grid controller
+                mThreadParams.communicator = MPI_COMM_NULL;
+                auto const parentCommunicator = gridController.getCommunicator().getMPIComm();
+                MPI_CHECK(MPI_Comm_dup(parentCommunicator, &(mThreadParams.communicator)));
+            }
+
+            /** Free the duplicated MPI communicator, if one is held */
+            virtual ~openPMDWriter()
+            {
+                if(mThreadParams.communicator == MPI_COMM_NULL)
+                    return;
+
+                // avoid deadlock between not finished pmacc tasks and mpi
+                // blocking collectives
+                __getTransactionEvent().waitForFinished();
+                MPI_CHECK_NO_EXCEPT(MPI_Comm_free(&mThreadParams.communicator));
+            }
+
+            /** Write a regular (non-checkpoint) dump for the given step
+             *
+             * Only allowed if the plugin registered itself, i.e. it is not
+             * controlled by the class Checkpoint.
+             *
+             * @param currentStep current simulation step
+             */
+            void notify(uint32_t currentStep)
+            {
+                assert(m_help->selfRegister);
+
+                __getTransactionEvent().waitForFinished();
+
+                std::string const dumpFileName = m_help->fileName.get(m_id);
+                mThreadParams.initFromConfig(*m_help, m_id, dumpFileName, outputDirectory);
+
+                // dump the window selected by the moving window for this step
+                mThreadParams.window = MovingWindow::getInstance().getWindow(currentStep);
+                mThreadParams.isCheckpoint = false;
+                dumpData(currentStep);
+            }
+
+            /** Intentionally empty implementation of the ISlave restart interface
+             *
+             * @param restartStep unused
+             * @param restartDirectory unused
+             */
+            virtual void restart(uint32_t restartStep, std::string const& restartDirectory)
+            {
+                /* ISlave restart interface is not needed because IIOBackend
+                 * restart interface is used
+                 */
+            }
+
+            /** Intentionally empty implementation of the ISlave checkpoint interface
+             *
+             * @param currentStep unused
+             * @param checkpointDirectory unused
+             */
+            virtual void checkpoint(uint32_t currentStep, std::string const& checkpointDirectory)
+            {
+                /* ISlave checkpoint interface is not needed because IIOBackend
+                 * checkpoint interface is used
+                 */
+            }
+
+            /** Write a checkpoint of the complete domain for the given step
+             *
+             * @param currentStep current simulation step
+             * @param checkpointDirectory directory passed on to initFromConfig
+             * @param checkpointFilename file name passed on to initFromConfig
+             */
+            void dumpCheckpoint(
+                const uint32_t currentStep,
+                const std::string& checkpointDirectory,
+                const std::string& checkpointFilename)
+            {
+                // checkpointing is only allowed if the plugin is controlled by the
+                // class Checkpoint
+                assert(!m_help->selfRegister);
+
+                __getTransactionEvent().waitForFinished();
+                /* NOTE(review): nothing here prepends a common directory to a
+                 * relative file name; presumably initFromConfig handles that -
+                 * confirm
+                 */
+
+                // isCheckpoint is set before initFromConfig is called
+                mThreadParams.isCheckpoint = true;
+                mThreadParams.initFromConfig(*m_help, m_id, checkpointFilename, checkpointDirectory);
+
+                // a checkpoint uses the domain as window, not the moving window
+                mThreadParams.window = MovingWindow::getInstance().getDomainAsWindow(currentStep);
+
+                dumpData(currentStep);
+            }
+
+            /** Restore the simulation state from an openPMD checkpoint
+             *
+             * Reads the slide count of the moving window, all checkpoint fields and
+             * species and the IdProvider state from the iteration `restartStep` and
+             * closes the Series afterwards.
+             *
+             * @param restartStep simulation step to restart from
+             * @param restartDirectory directory passed on to initFromConfig
+             * @param constRestartFilename file name passed on to initFromConfig
+             * @param restartChunkSize forwarded to the species loaders
+             */
+            void doRestart(
+                const uint32_t restartStep,
+                const std::string& restartDirectory,
+                const std::string& constRestartFilename,
+                const uint32_t restartChunkSize)
+            {
+                // restart is only allowed if the plugin is controlled by the class
+                // Checkpoint
+                assert(!m_help->selfRegister);
+
+                mThreadParams.initFromConfig(*m_help, m_id, constRestartFilename, restartDirectory);
+                mThreadParams.currentStep = restartStep;
+                mThreadParams.cellDescription = m_cellDescription;
+                mThreadParams.openSeries(::openPMD::Access::READ_ONLY);
+
+                ::openPMD::Iteration checkpointIteration
+                    = mThreadParams.openPMDSeries->iterations[mThreadParams.currentStep];
+
+                log<picLog::INPUT_OUTPUT>("openPMD: (begin) read attr (%1% available)")
+                    % checkpointIteration.numAttributes();
+
+                // number of slides, needed to initialize the MovingWindow
+                uint32_t slideCount = checkpointIteration.getAttribute("sim_slides").get<uint32_t>();
+                log<picLog::INPUT_OUTPUT>("openPMD: value of sim_slides = %1%") % slideCount;
+
+                // the step stored in the file must be the step we restart from
+                uint32_t lastStep = checkpointIteration.getAttribute("iteration").get<uint32_t>();
+                log<picLog::INPUT_OUTPUT>("openPMD: value of iteration = %1%") % lastStep;
+
+                PMACC_ASSERT(lastStep == restartStep);
+
+                // apply slides to set gpus to last/written configuration
+                log<picLog::INPUT_OUTPUT>("openPMD: Setting slide count for moving window to %1%") % slideCount;
+                MovingWindow::getInstance().setSlideCounter(slideCount, restartStep);
+
+                /* Re-distribute the local offsets in y-direction.
+                 * This works for restarts with the moving window still enabled and
+                 * for restarts that disable the moving window.
+                 * \warning enabling the moving window from a checkpoint that
+                 *          had no moving window will not work
+                 */
+                GridController<simDim>& gridController = Environment<simDim>::get().GridController();
+                gridController.setStateAfterSlides(slideCount);
+
+                // the restart window is the complete global domain
+                mThreadParams.window = MovingWindow::getInstance().getDomainAsWindow(restartStep);
+                mThreadParams.localWindowToDomainOffset = DataSpace<simDim>::create(0);
+
+                // load all checkpoint fields
+                meta::ForEach<FileCheckpointFields, LoadFields<bmpl::_1>> loadAllFields;
+                loadAllFields(&mThreadParams);
+
+                // load all checkpoint species
+                meta::ForEach<FileCheckpointParticles, LoadSpecies<bmpl::_1>> loadAllSpecies;
+                loadAllSpecies(&mThreadParams, restartChunkSize);
+
+                // restore the IdProvider state
+                IdProvider<simDim>::State restoredIdState;
+                ReadNDScalars<uint64_t, uint64_t>()(
+                    mThreadParams,
+                    "picongpu",
+                    "idProvider",
+                    "startId",
+                    &restoredIdState.startId,
+                    "maxNumProc",
+                    &restoredIdState.maxNumProc);
+                ReadNDScalars<uint64_t>()(mThreadParams, "picongpu", "idProvider", "nextId", &restoredIdState.nextId);
+                log<picLog::INPUT_OUTPUT>("Setting next free id on current rank: %1%") % restoredIdState.nextId;
+                IdProvider<simDim>::setState(restoredIdState);
+
+                // avoid deadlock between not finished pmacc tasks and mpi calls in
+                // openPMD
+                __getTransactionEvent().waitForFinished();
+
+                // Finalize the openPMD Series by calling its destructor
+                mThreadParams.closeSeries();
+            }
+
+        private:
+            /** Empty the temporary field buffer after a dump */
+            void endWrite()
+            {
+                mThreadParams.fieldBuffer.clear();
+            }
+
+            /** Size the temporary field buffer to the product of the local window extents */
+            void initWrite()
+            {
+                // the product may be zero, the buffer is empty then
+                mThreadParams.fieldBuffer.resize(mThreadParams.window.localDimensions.size.productOfComponents());
+            }
+
+            /**
+             * Notification for dump or checkpoint received
+             *
+             * Computes the offset of the window inside the local domain, copies the
+             * species to the host if the ADIOS strategy is selected and this step
+             * was not synchronized yet, writes the data and logs the duration of
+             * this call together with the average over all calls so far.
+             *
+             * @param currentStep current simulation step
+             */
+            void dumpData(uint32_t currentStep)
+            {
+                // local offset + extent
+                const pmacc::Selection<simDim> localDomain = Environment<simDim>::get().SubGrid().getLocalDomain();
+                mThreadParams.cellDescription = m_cellDescription;
+                mThreadParams.currentStep = currentStep;
+
+                // distance from the begin of the local domain to the begin of the
+                // window; zero if the window starts at or before the local domain
+                for(uint32_t i = 0; i < simDim; ++i)
+                {
+                    mThreadParams.localWindowToDomainOffset[i] = 0;
+                    if(mThreadParams.window.globalDimensions.offset[i] > localDomain.offset[i])
+                    {
+                        mThreadParams.localWindowToDomainOffset[i]
+                            = mThreadParams.window.globalDimensions.offset[i] - localDomain.offset[i];
+                    }
+                }
+
+                /* copy species only one time per timestep to the host */
+                if(mThreadParams.strategy == WriteSpeciesStrategy::ADIOS && lastSpeciesSyncStep != currentStep)
+                {
+                    DataConnector& dc = Environment<>::get().DataConnector();
+
+                    /* synchronizes the MallocMCBuffer to the host side */
+                    dc.get<MallocMCBuffer<DeviceHeap>>(MallocMCBuffer<DeviceHeap>::getName());
+
+                    /* here we are copying all species to the host side since we
+                     * can not say at this point if this time step will need all of
+                     * them for sure (checkpoint) or just some user-defined species
+                     * (dump)
+                     */
+                    meta::ForEach<FileCheckpointParticles, CopySpeciesToHost<bmpl::_1>> copySpeciesToHost;
+                    copySpeciesToHost();
+                    lastSpeciesSyncStep = currentStep;
+
+                    dc.releaseData(MallocMCBuffer<DeviceHeap>::getName());
+                }
+
+                TimeIntervall timer;
+                timer.toggleStart();
+                initWrite();
+
+                write(&mThreadParams, mpiTransportParams);
+
+                endWrite();
+                timer.toggleEnd();
+                double interval = timer.getInterval();
+                mThreadParams.times.push_back(interval);
+                // the initial value must be a floating point literal: with an int
+                // literal std::accumulate sums in int and truncates each partial sum
+                double average = std::accumulate(mThreadParams.times.begin(), mThreadParams.times.end(), 0.0);
+                average /= mThreadParams.times.size();
+                log<picLog::INPUT_OUTPUT>("openPMD: IO plugin ran for %1% (average: %2%)") % timer.printeTime(interval)
+                    % timer.printeTime(average);
+            }
+
+            /** Set the mesh-level openPMD attributes of a field
+             *
+             * @param params thread params; the current step and the window are read
+             * @param unitDimension one entry per openPMD base unit, indexed in the
+             *                      order L, M, T, I, theta, N, J
+             * @param timeOffset value stored as time offset of the mesh
+             * @param mesh openPMD mesh that receives the attributes
+             */
+            static void writeFieldAttributes(
+                ThreadParams* params,
+                std::vector<float_64> const& unitDimension,
+                float_X timeOffset,
+                ::openPMD::Mesh& mesh)
+            {
+                constexpr unsigned numBaseUnits = 7;
+                static constexpr ::openPMD::UnitDimension openPMDUnitDimensions[numBaseUnits]
+                    = {::openPMD::UnitDimension::L,
+                       ::openPMD::UnitDimension::M,
+                       ::openPMD::UnitDimension::T,
+                       ::openPMD::UnitDimension::I,
+                       ::openPMD::UnitDimension::theta,
+                       ::openPMD::UnitDimension::N,
+                       ::openPMD::UnitDimension::J};
+                std::map<::openPMD::UnitDimension, double> unitMap;
+                for(unsigned baseUnit = 0; baseUnit < numBaseUnits; ++baseUnit)
+                    unitMap[openPMDUnitDimensions[baseUnit]] = unitDimension[baseUnit];
+
+                mesh.setUnitDimension(unitMap);
+                mesh.setTimeOffset<float_X>(timeOffset);
+                mesh.setGeometry(::openPMD::Mesh::Geometry::cartesian);
+                mesh.setDataOrder(::openPMD::Mesh::DataOrder::C);
+
+                // axis labels follow the storage order: F[y][x] in 2D, F[z][y][x] in 3D
+                if(simDim == DIM2)
+                    mesh.setAxisLabels(std::vector<std::string>{"y", "x"});
+                if(simDim == DIM3)
+                    mesh.setAxisLabels(std::vector<std::string>{"z", "y", "x"});
+
+                // cellSize is {x, y, z} but fields are F[z][y][x], so reverse it
+                std::vector<float_X> gridSpacing(simDim, 0.0);
+                for(uint32_t d = 0; d < simDim; ++d)
+                {
+                    gridSpacing.at(simDim - 1 - d) = cellSize[d];
+                }
+                mesh.setGridSpacing(gridSpacing);
+
+                /* offset in y caused by the gpu slides between the origin at time
+                 * step 0 and the origin at the current time step
+                 */
+                DataSpace<simDim> slideOffset;
+                const pmacc::Selection<simDim> localSelection = Environment<simDim>::get().SubGrid().getLocalDomain();
+                const uint32_t slideCount = MovingWindow::getInstance().getSlideCounter(params->currentStep);
+                slideOffset.y() += slideCount * localSelection.size.y();
+
+                // globalDimensions is {x, y, z} but fields are F[z][y][x]
+                std::vector<float_64> gridGlobalOffset(simDim, 0.0);
+                for(uint32_t d = 0; d < simDim; ++d)
+                {
+                    auto const offsetInCells = params->window.globalDimensions.offset[d] + slideOffset[d];
+                    gridGlobalOffset.at(simDim - 1 - d) = float_64(cellSize[d]) * float_64(offsetInCells);
+                }
+
+                mesh.setGridGlobalOffset(std::move(gridGlobalOffset));
+                mesh.setGridUnitSI(UNIT_LENGTH);
+                mesh.setAttribute("fieldSmoothing", "none");
+            }
+
+            /** Write all components of one field into the iteration of the current step
+             *
+             * Each component is copied, without guard cells, from the source buffer
+             * into params->fieldBuffer and stored as one chunk of its own record
+             * component; the Series is flushed after every component.
+             *
+             * @tparam ComponentType component type of the field; must be float_X
+             *                       (asserted via PMACC_CASSERT_MSG)
+             * @param params thread params (Series, window, grid layout, buffers)
+             * @param sizePtrType not used in this function
+             * @param openPMDType datatype passed to initDataset
+             * @param nComponents number of components per cell in the source buffer
+             * @param name name of the mesh inside the iteration
+             * @param ptr source buffer, read as float_X; the components of one cell
+             *            are adjacent to each other
+             * @param unit unitSI per component (nComponents entries)
+             * @param unitDimension passed to writeFieldAttributes (7 entries)
+             * @param inCellPosition position per component (nComponents entries with
+             *                       simDim values each)
+             * @param timeOffset passed to writeFieldAttributes
+             * @param isDomainBound false selects the path for non-domain-bound
+             *                      fields (reduced 1d PML buffer)
+             */
+            template<typename ComponentType>
+            static void writeField(
+                ThreadParams* params,
+                const uint32_t sizePtrType,
+                ::openPMD::Datatype openPMDType,
+                const uint32_t nComponents,
+                const std::string name,
+                void* ptr,
+                std::vector<float_64> unit,
+                std::vector<float_64> unitDimension,
+                std::vector<std::vector<float_X>> inCellPosition,
+                float_X timeOffset,
+                bool isDomainBound)
+            {
+                auto const name_lookup_tpl = plugins::misc::getComponentNames(nComponents);
+
+                /* parameter checking */
+                PMACC_ASSERT(unit.size() == nComponents);
+                PMACC_ASSERT(inCellPosition.size() == nComponents);
+                for(uint32_t n = 0; n < nComponents; ++n)
+                    PMACC_ASSERT(inCellPosition.at(n).size() == simDim);
+                PMACC_ASSERT(unitDimension.size() == 7); // seven openPMD base units
+
+                log<picLog::INPUT_OUTPUT>("openPMD: write field: %1% %2% %3%") % name % nComponents % ptr;
+
+                // the copy loop below reads ptr as float_X, so no other component
+                // type is accepted
+                const bool fieldTypeCorrect(boost::is_same<ComponentType, float_X>::value);
+                PMACC_CASSERT_MSG(Precision_mismatch_in_Field_Components__ADIOS, fieldTypeCorrect);
+
+                ::openPMD::Iteration iteration = params->openPMDSeries->WRITE_ITERATIONS[params->currentStep];
+                ::openPMD::Mesh mesh = iteration.meshes[name];
+
+                // set mesh attributes
+                writeFieldAttributes(params, unitDimension, timeOffset, mesh);
+
+                /* data to describe source buffer */
+                GridLayout<simDim> field_layout = params->gridLayout;
+                DataSpace<simDim> field_full = field_layout.getDataSpace();
+
+                // extent that is copied and the offset of its first cell inside the
+                // source buffer (guard plus distance from domain begin to window begin)
+                DataSpace<simDim> field_no_guard = params->window.localDimensions.size;
+                DataSpace<simDim> field_guard = field_layout.getGuard() + params->localWindowToDomainOffset;
+                // reference, not a copy: a resize below also resizes params->fieldBuffer
+                std::vector<float_X>& dstBuffer = params->fieldBuffer;
+
+                // local copies, overwritten below for non-domain-bound fields
+                auto fieldsSizeDims = params->fieldsSizeDims;
+                auto fieldsGlobalSizeDims = params->fieldsGlobalSizeDims;
+                auto fieldsOffsetDims = params->fieldsOffsetDims;
+
+                /* Patch for non-domain-bound fields
+                 * Allow for the output of reduced 1d PML buffer
+                 */
+                if(!isDomainBound)
+                {
+                    field_no_guard = field_layout.getDataSpaceWithoutGuarding();
+                    field_guard = field_layout.getGuard();
+                    /* NOTE(review): this resize stays in params->fieldBuffer after
+                     * the call; presumably a domain-bound field written afterwards
+                     * in the same dump would see a buffer of this size - confirm
+                     * that the call order rules this out
+                     */
+                    dstBuffer.resize(field_no_guard.productOfComponents());
+
+                    DataConnector& dc = Environment<>::get().DataConnector();
+                    fieldsSizeDims = precisionCast<uint64_t>(params->gridLayout.getDataSpaceWithoutGuarding());
+                    // NOTE(review): no matching dc.get for `name` is visible in this
+                    // function - presumably done by the caller, confirm
+                    dc.releaseData(name);
+
+                    /* Scan the PML buffer local size along all local domains
+                     * This code is based on the same operation in hdf5::Field::writeField(),
+                     * the same comments apply here
+                     */
+                    log<picLog::INPUT_OUTPUT>("openPMD:  (begin) collect PML sizes for %1%") % name;
+                    auto& gridController = Environment<simDim>::get().GridController();
+                    auto const numRanks = uint64_t{gridController.getGlobalSize()};
+                    /* Use domain position-based rank, not MPI rank, to be independent
+                     * of the MPI rank assignment scheme
+                     */
+                    auto const rank = uint64_t{gridController.getScalarPosition()};
+                    // gathered as pairs per rank: [2 * r] = local size, [2 * r + 1] = rank
+                    std::vector<uint64_t> localSizes(2u * numRanks, 0u);
+                    uint64_t localSizeInfo[2] = {fieldsSizeDims[0], rank};
+                    __getTransactionEvent().waitForFinished();
+                    MPI_CHECK(MPI_Allgather(
+                        localSizeInfo,
+                        2,
+                        MPI_UINT64_T,
+                        &(*localSizes.begin()),
+                        2,
+                        MPI_UINT64_T,
+                        gridController.getCommunicator().getMPIComm()));
+                    // global size is the sum over all ranks, the own offset is the
+                    // sum over all ranks with a smaller position-based rank
+                    uint64_t globalOffsetFile = 0;
+                    uint64_t globalSize = 0;
+                    for(uint64_t r = 0; r < numRanks; ++r)
+                    {
+                        globalSize += localSizes.at(2u * r);
+                        if(localSizes.at(2u * r + 1u) < rank)
+                            globalOffsetFile += localSizes.at(2u * r);
+                    }
+                    log<picLog::INPUT_OUTPUT>("openPMD:  (end) collect PML sizes for %1%") % name;
+
+                    // only dimension 0 carries data: extent 1 and offset 0 elsewhere
+                    fieldsGlobalSizeDims = pmacc::math::UInt64<simDim>::create(1);
+                    fieldsGlobalSizeDims[0] = globalSize;
+                    fieldsOffsetDims = pmacc::math::UInt64<simDim>::create(0);
+                    fieldsOffsetDims[0] = globalOffsetFile;
+                }
+
+                /* write the actual field data */
+                for(uint32_t d = 0; d < nComponents; d++)
+                {
+                    // number of values per x-y plane: with guards and all components
+                    // in the source, without guards and one component in the destination
+                    const size_t plane_full_size = field_full[1] * field_full[0] * nComponents;
+                    const size_t plane_no_guard_size = field_no_guard[1] * field_no_guard[0];
+
+                    /* copy strided data from source to temporary buffer
+                     *
+                     * \todo use d1Access as in
+                     * `include/plugins/hdf5/writer/Field.hpp`
+                     */
+                    const int maxZ = simDim == DIM3 ? field_no_guard[2] : 1;
+                    const int guardZ = simDim == DIM3 ? field_guard[2] : 0;
+                    for(int z = 0; z < maxZ; ++z)
+                    {
+                        for(int y = 0; y < field_no_guard[1]; ++y)
+                        {
+                            // begin of row (y, z): shifted by the guard in the source,
+                            // densely packed in the destination
+                            const size_t base_index_src
+                                = (z + guardZ) * plane_full_size + (y + field_guard[1]) * field_full[0] * nComponents;
+
+                            const size_t base_index_dst = z * plane_no_guard_size + y * field_no_guard[0];
+
+                            for(int x = 0; x < field_no_guard[0]; ++x)
+                            {
+                                // pick component d out of the nComponents values of the cell
+                                size_t index_src = base_index_src + (x + field_guard[0]) * nComponents + d;
+                                size_t index_dst = base_index_dst + x;
+
+                                dstBuffer[index_dst] = reinterpret_cast<float_X*>(ptr)[index_src];
+                            }
+                        }
+                    }
+
+                    // fields with one component use the SCALAR record component
+                    ::openPMD::MeshRecordComponent mrc
+                        = mesh[nComponents > 1 ? name_lookup_tpl[d] : ::openPMD::RecordComponent::SCALAR];
+
+                    std::string datasetName = nComponents > 1
+                        ? params->openPMDSeries->meshesPath() + name + "/" + name_lookup_tpl[d]
+                        : params->openPMDSeries->meshesPath() + name;
+
+                    params->initDataset<simDim>(
+                        mrc,
+                        openPMDType,
+                        fieldsGlobalSizeDims,
+                        true,
+                        params->compressionMethod,
+                        datasetName);
+                    // an empty buffer stores no chunk; the dataset is still initialized
+                    if(dstBuffer.size() > 0)
+                        mrc.storeChunk<std::vector<float_X>>(
+                            dstBuffer,
+                            asStandardVector(fieldsOffsetDims),
+                            asStandardVector(fieldsSizeDims));
+
+                    // define record component level attributes
+                    mrc.setPosition(inCellPosition.at(d));
+                    mrc.setUnitSI(unit.at(d));
+
+                    // flush inside the loop: dstBuffer is overwritten by the next component
+                    params->openPMDSeries->flush();
+                }
+            }
+
+
+            /** Write one species data source, but only if it was selected by name
+             *
+             * @tparam T_ParticleFilter species data source; provides getName() and is
+             *                          the template argument of WriteSpecies
+             */
+            template<typename T_ParticleFilter>
+            struct CallWriteSpecies
+            {
+                /** Run WriteSpecies if the name of the source is in the given list
+                 *
+                 * @param vectorOfDataSourceNames names of the selected data sources
+                 * @param params thread params, forwarded to WriteSpecies
+                 * @param domainOffset offset forwarded to WriteSpecies
+                 */
+                template<typename Space>
+                void operator()(
+                    const std::vector<std::string>& vectorOfDataSourceNames,
+                    ThreadParams* params,
+                    const Space domainOffset)
+                {
+                    if(!plugins::misc::containsObject(vectorOfDataSourceNames, T_ParticleFilter::getName()))
+                        return;
+
+                    WriteSpecies<T_ParticleFilter> speciesWriter;
+                    speciesWriter(params, domainOffset);
+                }
+            };
+
+            /** Write one field data source, but only if it was selected by name
+             *
+             * @tparam T_Fields field data source; provides getName() and is the
+             *                  template argument of GetFields
+             */
+            template<typename T_Fields>
+            struct CallGetFields
+            {
+                /** Run GetFields if the name of the source is in the given list
+                 *
+                 * @param vectorOfDataSourceNames names of the selected data sources
+                 * @param params thread params, forwarded to GetFields
+                 */
+                void operator()(const std::vector<std::string>& vectorOfDataSourceNames, ThreadParams* params)
+                {
+                    if(!plugins::misc::containsObject(vectorOfDataSourceNames, T_Fields::getName()))
+                        return;
+
+                    GetFields<T_Fields> fieldGetter;
+                    fieldGetter(params);
+                }
+            };
+
+            /** Write fields, species, IdProvider state and meta attributes for one step
+             *
+             * Opens the Series if it is not open yet and closes the iteration of the
+             * current step at the end.
+             *
+             * @param threadParams parameters of this dump; NOTE(review): the final
+             *                     close() goes through the member mThreadParams
+             *                     instead, presumably the same object - confirm
+             * @param mpiTransportParams not used in this function
+             */
+            void write(ThreadParams* threadParams, std::string mpiTransportParams)
+            {
+                /* y direction can be negative for first gpu */
+                const pmacc::Selection<simDim> localDomain = Environment<simDim>::get().SubGrid().getLocalDomain();
+                DataSpace<simDim> particleOffset(localDomain.offset);
+                particleOffset.y() -= threadParams->window.globalDimensions.offset.y();
+
+                threadParams->fieldsOffsetDims = precisionCast<uint64_t>(localDomain.offset);
+
+                /* write created variable values */
+                for(uint32_t d = 0; d < simDim; ++d)
+                {
+                    /* dimension 1 is y and is the direction of the moving window
+                     * (if any) */
+                    if(1 == d)
+                    {
+                        // unlike particleOffset, the field offset is clamped at zero
+                        uint64_t offset
+                            = std::max(0, localDomain.offset.y() - threadParams->window.globalDimensions.offset.y());
+                        threadParams->fieldsOffsetDims[d] = offset;
+                    }
+
+                    threadParams->fieldsSizeDims[d] = threadParams->window.localDimensions.size[d];
+                    threadParams->fieldsGlobalSizeDims[d] = threadParams->window.globalDimensions.size[d];
+                }
+
+                // the list of data sources stays empty if the plugin did not register
+                // itself; dumpFields and dumpAllParticles are false then
+                std::vector<std::string> vectorOfDataSourceNames;
+                if(m_help->selfRegister)
+                {
+                    std::string dataSourceNames = m_help->source.get(m_id);
+
+                    vectorOfDataSourceNames = plugins::misc::splitString(plugins::misc::removeSpaces(dataSourceNames));
+                }
+
+                bool dumpFields = plugins::misc::containsObject(vectorOfDataSourceNames, "fields_all");
+
+                if(threadParams->openPMDSeries)
+                {
+                    log<picLog::INPUT_OUTPUT>("openPMD: Series still open, reusing");
+                    // TODO check for same configuration
+                }
+                else
+                {
+                    log<picLog::INPUT_OUTPUT>("openPMD: opening Series %1%") % threadParams->fileName;
+                    threadParams->openSeries(::openPMD::Access::CREATE);
+                }
+
+                bool dumpAllParticles = plugins::misc::containsObject(vectorOfDataSourceNames, "species_all");
+
+                /* write fields */
+                log<picLog::INPUT_OUTPUT>("openPMD: (begin) writing fields.");
+                if(threadParams->isCheckpoint)
+                {
+                    // a checkpoint writes FileCheckpointFields, independent of the
+                    // selected data sources
+                    meta::ForEach<FileCheckpointFields, GetFields<bmpl::_1>> ForEachGetFields;
+                    ForEachGetFields(threadParams);
+                }
+                else
+                {
+                    if(dumpFields)
+                    {
+                        meta::ForEach<FileOutputFields, GetFields<bmpl::_1>> ForEachGetFields;
+                        ForEachGetFields(threadParams);
+                    }
+
+                    // move over all field data sources
+                    meta::ForEach<typename Help::AllFieldSources, CallGetFields<bmpl::_1>>{}(
+                        vectorOfDataSourceNames,
+                        threadParams);
+                }
+                log<picLog::INPUT_OUTPUT>("openPMD: ( end ) writing fields.");
+
+
+                /* print all particle species */
+                log<picLog::INPUT_OUTPUT>("openPMD: (begin) writing particle species.");
+                if(threadParams->isCheckpoint)
+                {
+                    // a checkpoint writes FileCheckpointParticles, independent of
+                    // the selected data sources
+                    meta::ForEach<
+                        FileCheckpointParticles,
+                        WriteSpecies<
+                            plugins::misc::SpeciesFilter<bmpl::_1>,
+                            plugins::misc::UnfilteredSpecies<bmpl::_1>>>
+                        writeSpecies;
+                    writeSpecies(threadParams, particleOffset);
+                }
+                else
+                {
+                    // dump data if data source "species_all" is selected
+                    if(dumpAllParticles)
+                    {
+                        // move over all species defined in FileOutputParticles
+                        meta::ForEach<FileOutputParticles, WriteSpecies<plugins::misc::UnfilteredSpecies<bmpl::_1>>>
+                            writeSpecies;
+                        writeSpecies(threadParams, particleOffset);
+                    }
+
+                    // move over all species data sources
+                    meta::ForEach<typename Help::AllEligibleSpeciesSources, CallWriteSpecies<bmpl::_1>>{}(
+                        vectorOfDataSourceNames,
+                        threadParams,
+                        particleOffset);
+                }
+                log<picLog::INPUT_OUTPUT>("openPMD: ( end ) writing particle species.");
+
+
+                // the IdProvider state is written for dumps and checkpoints alike
+                auto idProviderState = IdProvider<simDim>::getState();
+                log<picLog::INPUT_OUTPUT>("openPMD: Writing IdProvider state (StartId: %1%, NextId: %2%, "
+                                          "maxNumProc: %3%)")
+                    % idProviderState.startId % idProviderState.nextId % idProviderState.maxNumProc;
+
+                WriteNDScalars<uint64_t, uint64_t> writeIdProviderStartId(
+                    "picongpu",
+                    "idProvider",
+                    "startId",
+                    "maxNumProc");
+                WriteNDScalars<uint64_t, uint64_t> writeIdProviderNextId("picongpu", "idProvider", "nextId");
+                writeIdProviderStartId(*threadParams, idProviderState.startId, idProviderState.maxNumProc);
+                writeIdProviderNextId(*threadParams, idProviderState.nextId);
+
+                /* attributes written here are pure meta data */
+                WriteMeta writeMetaAttributes;
+                writeMetaAttributes(threadParams);
+
+                // avoid deadlock between not finished pmacc tasks and mpi calls in
+                // openPMD
+                __getTransactionEvent().waitForFinished();
+                // closes only the iteration of this step; the Series itself is not
+                // closed here
+                mThreadParams.openPMDSeries->WRITE_ITERATIONS[mThreadParams.currentStep].close();
+
+                return;
+            }
+
+            // state shared by all dump, checkpoint and restart calls of this instance
+            ThreadParams mThreadParams;
+
+            // help object of the plugin and the index of this instance within it
+            std::shared_ptr<Help> m_help;
+            size_t m_id;
+
+            // copied into mThreadParams.cellDescription before writing and restarting
+            MappingDesc* m_cellDescription;
+
+            // set to "openPMD" in the constructor; output directory of regular dumps
+            std::string outputDirectory;
+
+            /* select MPI method, #OSTs and #aggregators
+             * NOTE(review): passed to write(), which does not read it - confirm
+             * whether this is still needed
+             */
+            std::string mpiTransportParams;
+
+            // step of the last species copy to the host; initialized with the
+            // maximum value of uint32_t
+            uint32_t lastSpeciesSyncStep;
+
+            // position and number of gpu nodes as reported by the grid controller at
+            // construction time
+            DataSpace<simDim> mpi_pos;
+            DataSpace<simDim> mpi_size;
+        };
+
+        /** Create a new openPMDWriter instance
+         *
+         * @param help help object of the plugin, passed on to the writer
+         * @param id index of the plugin instance within help
+         * @param cellDescription cell description, passed on to the writer
+         * @return the new writer as shared pointer to its ISlave interface
+         */
+        std::shared_ptr<plugins::multi::ISlave> Help::create(
+            std::shared_ptr<plugins::multi::IHelp>& help,
+            size_t const id,
+            MappingDesc* cellDescription)
+        {
+            // make_shared instead of a raw new: object and control block share
+            // one allocation
+            return std::make_shared<openPMDWriter>(help, id, cellDescription);
+        }
+
+    } // namespace openPMD
+} // namespace picongpu
diff --git a/include/picongpu/plugins/openPMD/restart/LoadParticleAttributesFromOpenPMD.hpp b/include/picongpu/plugins/openPMD/restart/LoadParticleAttributesFromOpenPMD.hpp
new file mode 100644
index 0000000000..c13a21b601
--- /dev/null
+++ b/include/picongpu/plugins/openPMD/restart/LoadParticleAttributesFromOpenPMD.hpp
@@ -0,0 +1,134 @@
+/* Copyright 2013-2021 Axel Huebl, Felix Schmitt, Rene Widera, Franz Poeschel
+ *
+ * This file is part of PIConGPU.
+ *
+ * PIConGPU is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * PIConGPU is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with PIConGPU.
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+
+#pragma once
+
+
+#include "picongpu/plugins/openPMD/openPMDWriter.def"
+#include "picongpu/simulation_defines.hpp"
+#include "picongpu/traits/PICToOpenPMD.hpp"
+
+#include <pmacc/assert.hpp>
+#include <pmacc/traits/GetComponentsType.hpp>
+#include <pmacc/traits/GetNComponents.hpp>
+#include <pmacc/traits/Resolve.hpp>
+
+#include <openPMD/openPMD.hpp>
+
+#include <memory>
+
+namespace picongpu
+{
+    namespace openPMD
+    {
+        using namespace pmacc;
+
+        /** Load attribute of a species from openPMD checkpoint storage
+         *
+         * @tparam T_Identifier identifier of species attribute
+         */
+        template<typename T_Identifier>
+        struct LoadParticleAttributesFromOpenPMD
+        {
+            /** Read one attribute from the openPMD series into the frame
+             *
+             * Each component of the attribute is read into a temporary contiguous buffer and is
+             * afterwards copied into the component-interleaved attribute array of the frame.
+             *
+             * @param params thread params, provides the openPMD series that gets flushed
+             * @param frame frame with all particles
+             * @param particleSpecies the openPMD representation of the species
+             * @param particlesOffset read offset in the attribute array
+             * @param elements number of elements which should be read from the attribute array
+             */
+            template<typename FrameType>
+            HINLINE void operator()(
+                ThreadParams* params,
+                FrameType& frame,
+                ::openPMD::ParticleSpecies particleSpecies,
+                const uint64_t particlesOffset,
+                const uint64_t elements)
+            {
+                using Identifier = T_Identifier;
+                using ValueType = typename pmacc::traits::Resolve<Identifier>::type::type;
+                using ComponentType = typename GetComponentsType<ValueType>::type;
+                const uint32_t components = GetNComponents<ValueType>::value;
+                OpenPMDName<Identifier> openPMDName;
+
+                log<picLog::INPUT_OUTPUT>("openPMD: ( begin ) load species attribute: %1%") % openPMDName();
+
+                const std::string componentNames[] = {"x", "y", "z"};
+
+                /* temporary buffer, reused for every component; stays empty if nothing is read */
+                std::shared_ptr<ComponentType> componentBuffer;
+                if(elements > 0)
+                {
+                    componentBuffer = std::shared_ptr<ComponentType>(
+                        new ComponentType[elements],
+                        std::default_delete<ComponentType[]>());
+                }
+
+                for(uint32_t componentIdx = 0; componentIdx < components; ++componentIdx)
+                {
+                    ::openPMD::Record record = particleSpecies[openPMDName()];
+                    ::openPMD::RecordComponent recordComponent = components > 1
+                        ? record[componentNames[componentIdx]]
+                        : record[::openPMD::RecordComponent::SCALAR];
+
+                    ValueType* attributeData = frame.getIdentifier(Identifier()).getPointer();
+
+                    if(elements > 0)
+                    {
+                        // avoid deadlock between not finished pmacc tasks and mpi
+                        // calls in openPMD
+                        __getTransactionEvent().waitForFinished();
+                        recordComponent.loadChunk<ComponentType>(
+                            componentBuffer,
+                            ::openPMD::Offset{particlesOffset},
+                            ::openPMD::Extent{elements});
+                    }
+
+                    /** start a blocking read of all scheduled variables
+                     *  (this is a collective call in many methods of openPMD
+                     * backends, hence it is issued even without local elements)
+                     */
+                    params->openPMDSeries->flush();
+
+                    /* number of elements of the whole dataset, only used for the log message */
+                    uint64_t globalElementCount = 1;
+                    for(auto const extentInDim : recordComponent.getExtent())
+                        globalElementCount *= extentInDim;
+
+                    log<picLog::INPUT_OUTPUT>("openPMD:  Did read %1% local of %2% global elements for "
+                                              "%3%")
+                        % elements % globalElementCount % openPMDName();
+
+                    ComponentType* const interleavedData = reinterpret_cast<ComponentType*>(attributeData);
+                    ComponentType const* const loadedData = componentBuffer.get();
+
+/* copy component from temporary array to array of structs */
+#pragma omp parallel for simd
+                    for(size_t i = 0; i < elements; ++i)
+                    {
+                        interleavedData[i * components + componentIdx] = loadedData[i];
+                    }
+                }
+
+                log<picLog::INPUT_OUTPUT>("openPMD:  ( end ) load species attribute: %1%") % openPMDName();
+            }
+        };
+
+    } /* namespace openPMD */
+} /* namespace picongpu */
diff --git a/include/picongpu/plugins/openPMD/restart/LoadSpecies.hpp b/include/picongpu/plugins/openPMD/restart/LoadSpecies.hpp
new file mode 100644
index 0000000000..1c8e7847cc
--- /dev/null
+++ b/include/picongpu/plugins/openPMD/restart/LoadSpecies.hpp
@@ -0,0 +1,232 @@
+/* Copyright 2013-2021 Rene Widera, Felix Schmitt, Axel Huebl, Franz Poeschel
+ *
+ * This file is part of PIConGPU.
+ *
+ * PIConGPU is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * PIConGPU is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with PIConGPU.
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "picongpu/plugins/ISimulationPlugin.hpp"
+#include "picongpu/plugins/openPMD/openPMDWriter.def"
+#include "picongpu/plugins/openPMD/restart/LoadParticleAttributesFromOpenPMD.hpp"
+#include "picongpu/plugins/output/WriteSpeciesCommon.hpp"
+#include "picongpu/simulation_defines.hpp"
+
+#include <pmacc/dataManagement/DataConnector.hpp>
+#include <pmacc/mappings/kernel/AreaMapping.hpp>
+#include <pmacc/meta/conversion/MakeSeq.hpp>
+#include <pmacc/meta/conversion/RemoveFromSeq.hpp>
+#include <pmacc/particles/ParticleDescription.hpp>
+#include <pmacc/particles/operations/splitIntoListOfFrames.kernel>
+
+#include <boost/mpl/at.hpp>
+#include <boost/mpl/begin_end.hpp>
+#include <boost/mpl/find.hpp>
+#include <boost/mpl/pair.hpp>
+#include <boost/mpl/size.hpp>
+#include <boost/mpl/vector.hpp>
+#include <boost/type_traits.hpp>
+#include <boost/type_traits/is_same.hpp>
+
+#include <openPMD/openPMD.hpp>
+
+#include <cassert>
+#include <stdexcept>
+#include <string>
+
+namespace picongpu
+{
+    namespace openPMD
+    {
+        using namespace pmacc;
+
+        /** Load species from openPMD checkpoint storage
+         *
+         * @tparam T_Species type of species
+         */
+        template<typename T_Species>
+        struct LoadSpecies
+        {
+        public:
+            using ThisSpecies = T_Species;
+            using FrameType = typename ThisSpecies::FrameType;
+            using ParticleDescription = typename FrameType::ParticleDescription;
+            using ParticleAttributeList = typename FrameType::ValueTypeSeq;
+
+
+            /* delete multiMask and localCellIdx in openPMD particle*/
+            using TypesToDelete = bmpl::vector2<multiMask, localCellIdx>;
+            using ParticleCleanedAttributeList = typename RemoveFromSeq<ParticleAttributeList, TypesToDelete>::type;
+
+            /* add totalCellIdx for openPMD particle*/
+            using ParticleNewAttributeList = typename MakeSeq<ParticleCleanedAttributeList, totalCellIdx>::type;
+
+            using NewParticleDescription =
+                typename ReplaceValueTypeSeq<ParticleDescription, ParticleNewAttributeList>::type;
+
+            using openPMDFrameType = Frame<OperatorCreateVectorBox, NewParticleDescription>;
+
+            /** Load species from openPMD checkpoint storage
+             *
+             * Looks up the particle patch that describes the local domain, reads all attributes of
+             * the particles in that patch and distributes them into the frames of the species.
+             *
+             * @param params thread params
+             * @param restartChunkSize number of particles processed in one kernel
+             * call
+             * @throws std::runtime_error if no particle patch matches the local domain
+             */
+            HINLINE void operator()(ThreadParams* params, const uint32_t restartChunkSize)
+            {
+                std::string const speciesName = FrameType::getName();
+                log<picLog::INPUT_OUTPUT>("openPMD: (begin) load species: %1%") % speciesName;
+                DataConnector& dc = Environment<>::get().DataConnector();
+                GridController<simDim>& gc = Environment<simDim>::get().GridController();
+
+                ::openPMD::Series& series = *params->openPMDSeries;
+                ::openPMD::Container<::openPMD::ParticleSpecies>& particles
+                    = series.iterations[params->currentStep].particles;
+                ::openPMD::ParticleSpecies particleSpecies = particles[speciesName];
+
+                const pmacc::Selection<simDim> localDomain = Environment<simDim>::get().SubGrid().getLocalDomain();
+
+                /* load particle without copying particle data to host */
+                auto speciesTmp = dc.get<ThisSpecies>(FrameType::getName(), true);
+
+                // avoid deadlock between not finished pmacc tasks and mpi calls in
+                // openPMD
+                __getTransactionEvent().waitForFinished();
+
+                auto numRanks = gc.getGlobalSize();
+
+                size_t patchIdx = getPatchIdx(params, series, particleSpecies, numRanks);
+
+                std::shared_ptr<uint64_t> fullParticlesInfoShared
+                    = particleSpecies.particlePatches["numParticles"][::openPMD::RecordComponent::SCALAR]
+                          .load<uint64_t>();
+                series.flush();
+                uint64_t* fullParticlesInfo = fullParticlesInfoShared.get();
+
+                assert(patchIdx < numRanks);
+
+                /* Run a prefix sum over the numParticles entries of all patches in front of the
+                 * local patch to retrieve the offset of the local particles
+                 */
+                uint64_t particleOffset = 0u;
+                for(size_t i = 0u; i < patchIdx; ++i)
+                    particleOffset += fullParticlesInfo[i];
+
+                /* total number of particles on the device */
+                uint64_t totalNumParticles = fullParticlesInfo[patchIdx];
+
+                log<picLog::INPUT_OUTPUT>("openPMD: Loading %1% particles from offset %2%")
+                    % (long long unsigned) totalNumParticles % (long long unsigned) particleOffset;
+
+                openPMDFrameType hostFrame;
+                log<picLog::INPUT_OUTPUT>("openPMD: malloc mapped memory: %1%") % speciesName;
+                /*malloc mapped memory*/
+                meta::ForEach<typename openPMDFrameType::ValueTypeSeq, MallocMemory<bmpl::_1>> mallocMem;
+                mallocMem(hostFrame, totalNumParticles);
+
+                log<picLog::INPUT_OUTPUT>("openPMD: get mapped memory device pointer: %1%") % speciesName;
+                /*load device pointer of mapped memory*/
+                openPMDFrameType deviceFrame;
+                meta::ForEach<typename openPMDFrameType::ValueTypeSeq, GetDevicePtr<bmpl::_1>> getDevicePtr;
+                getDevicePtr(deviceFrame, hostFrame);
+
+                /* called even without local particles: the attribute loader flushes the series */
+                meta::ForEach<typename openPMDFrameType::ValueTypeSeq, LoadParticleAttributesFromOpenPMD<bmpl::_1>>
+                    loadAttributes;
+                loadAttributes(params, hostFrame, particleSpecies, particleOffset, totalNumParticles);
+
+                if(totalNumParticles != 0)
+                {
+                    pmacc::particles::operations::splitIntoListOfFrames(
+                        *speciesTmp,
+                        deviceFrame,
+                        totalNumParticles,
+                        restartChunkSize,
+                        localDomain.offset,
+                        totalCellIdx_,
+                        *(params->cellDescription),
+                        picLog::INPUT_OUTPUT());
+
+                    /*free host memory*/
+                    meta::ForEach<typename openPMDFrameType::ValueTypeSeq, FreeMemory<bmpl::_1>> freeMem;
+                    freeMem(hostFrame);
+                }
+                log<picLog::INPUT_OUTPUT>("openPMD: ( end ) load species: %1%") % speciesName;
+            }
+
+        private:
+            /** get index for particle data within the openPMD patch data
+             *
+             * It is not possible to assume that we can use the MPI rank to load the particle data.
+             * There is no guarantee that the MPI rank is corresponding to the position within
+             * the simulation volume.
+             *
+             * Use patch information offset and extent to find the index which should be used
+             * to load openPMD particle patch data.
+             *
+             * @param params thread params, provides the window used to compute the local patch
+             * @param series openPMD series, flushed to read the patch records
+             * @param particleSpecies the openPMD representation of the species
+             * @param numRanks number of patch entries that are read and searched
+             *
+             * @return index of the particle patch within the openPMD data
+             * @throws std::runtime_error if no patch has the offset and extent of the local domain
+             */
+            HINLINE size_t getPatchIdx(
+                ThreadParams* params,
+                ::openPMD::Series& series,
+                ::openPMD::ParticleSpecies particleSpecies,
+                size_t numRanks)
+            {
+                const std::string name_lookup[] = {"x", "y", "z"};
+
+                std::vector<DataSpace<simDim>> offsets(numRanks);
+                std::vector<DataSpace<simDim>> extents(numRanks);
+
+                // transform openPMD particle patch data into PIConGPU data objects
+                for(uint32_t d = 0; d < simDim; ++d)
+                {
+                    std::shared_ptr<uint64_t> patchOffsetsInfoShared
+                        = particleSpecies.particlePatches["offset"][name_lookup[d]].load<uint64_t>();
+                    std::shared_ptr<uint64_t> patchExtentsInfoShared
+                        = particleSpecies.particlePatches["extent"][name_lookup[d]].load<uint64_t>();
+                    series.flush();
+                    for(size_t i = 0; i < numRanks; ++i)
+                    {
+                        offsets[i][d] = patchOffsetsInfoShared.get()[i];
+                        extents[i][d] = patchExtentsInfoShared.get()[i];
+                    }
+                }
+
+                pmacc::Selection<simDim> const globalDomain = Environment<simDim>::get().SubGrid().getGlobalDomain();
+                DataSpace<simDim> const patchOffset = globalDomain.offset + params->window.globalDimensions.offset
+                    + params->window.localDimensions.offset;
+                DataSpace<simDim> const patchExtent = params->window.localDimensions.size;
+
+                // search the patch index based on the offset and extents of local domain size
+                for(size_t i = 0; i < numRanks; ++i)
+                {
+                    if(patchOffset == offsets[i] && patchExtent == extents[i])
+                        return i;
+                }
+
+                /* No stored patch describes the local domain. Falling back to an arbitrary patch
+                 * would silently load the particles of a different domain, therefore abort.
+                 */
+                throw std::runtime_error(
+                    std::string("openPMD: no particle patch matches the local domain for species '")
+                    + FrameType::getName() + "'");
+            }
+        };
+
+
+    } /* namespace openPMD */
+
+} /* namespace picongpu */
diff --git a/include/picongpu/plugins/openPMD/restart/RestartFieldLoader.hpp b/include/picongpu/plugins/openPMD/restart/RestartFieldLoader.hpp
new file mode 100644
index 0000000000..d311ed9922
--- /dev/null
+++ b/include/picongpu/plugins/openPMD/restart/RestartFieldLoader.hpp
@@ -0,0 +1,231 @@
+/* Copyright 2014-2021 Axel Huebl, Felix Schmitt, Heiko Burau, Rene Widera
+ *                     Benjamin Worpitz, Franz Poeschel
+ *
+ * This file is part of PIConGPU.
+ *
+ * PIConGPU is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * PIConGPU is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with PIConGPU.
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "picongpu/plugins/openPMD/openPMDWriter.def"
+#include "picongpu/plugins/misc/ComponentNames.hpp"
+#include "picongpu/simulation/control/MovingWindow.hpp"
+#include "picongpu/simulation_defines.hpp"
+#include "picongpu/traits/IsFieldDomainBound.hpp"
+
+#include <pmacc/communication/manager_common.hpp>
+#include <pmacc/dataManagement/DataConnector.hpp>
+#include <pmacc/dimensions/DataSpace.hpp>
+#include <pmacc/dimensions/GridLayout.hpp>
+#include <pmacc/Environment.hpp>
+#include <pmacc/particles/frame_types.hpp>
+#include <pmacc/types.hpp>
+
+#include <openPMD/openPMD.hpp>
+
+#include <sstream>
+#include <stdexcept>
+#include <string>
+
+
+namespace picongpu
+{
+    namespace openPMD
+    {
+        /**
+         * Helper class for openPMD plugin to load fields from parallel openPMD
+         * storages.
+         */
+        class RestartFieldLoader
+        {
+        public:
+            /** Load all components of one field from the openPMD series into a grid buffer
+             *
+             * The host buffer of the field is zeroed, filled component by component with the data
+             * read for the local sub-domain and finally copied to the device.
+             *
+             * @tparam Data type of the destination buffer
+             * @param field destination buffer
+             * @param numComponents number of components of the field
+             * @param objectName name of the mesh within the openPMD iteration
+             * @param params thread params (openPMD series, current step, window and grid layout)
+             * @param isDomainBound if false, the field is read as a 1d buffer whose elements are
+             *                      distributed over the ranks in order of their scalar grid position
+             */
+            template<class Data>
+            static void loadField(
+                Data& field,
+                const uint32_t numComponents,
+                std::string objectName,
+                ThreadParams* params,
+                bool const isDomainBound)
+            {
+                log<picLog::INPUT_OUTPUT>("Begin loading field '%1%'") % objectName;
+
+                auto const name_lookup_tpl = plugins::misc::getComponentNames(numComponents);
+                const DataSpace<simDim> field_guard = field.getGridLayout().getGuard();
+
+                const pmacc::Selection<simDim> localDomain = Environment<simDim>::get().SubGrid().getLocalDomain();
+
+                using ValueType = typename Data::ValueType;
+                field.getHostBuffer().setValue(ValueType::create(0.0));
+
+                DataSpace<simDim> domain_offset = localDomain.offset;
+                DataSpace<simDim> local_domain_size = params->window.localDimensions.size;
+                bool useLinearIdxAsDestination = false;
+
+                /* Patch for non-domain-bound fields
+                 * This is an ugly fix to allow output of reduced 1d PML buffers
+                 */
+                if(!isDomainBound)
+                {
+                    auto const field_layout = params->gridLayout;
+                    auto const field_no_guard = field_layout.getDataSpaceWithoutGuarding();
+                    auto const elementCount = field_no_guard.productOfComponents();
+
+                    /* Scan the PML buffer local size along all local domains
+                     * This code is symmetric to one in Field::writeField()
+                     */
+                    log<picLog::INPUT_OUTPUT>("openPMD:  (begin) collect PML sizes for %1%") % objectName;
+                    auto& gridController = Environment<simDim>::get().GridController();
+                    auto const numRanks = uint64_t{gridController.getGlobalSize()};
+                    /* Use domain position-based rank, not MPI rank, to be independent
+                     * of the MPI rank assignment scheme
+                     */
+                    auto const rank = uint64_t{gridController.getScalarPosition()};
+                    /* per rank two entries: [2 * r] element count, [2 * r + 1] position-based rank */
+                    std::vector<uint64_t> localSizes(2 * numRanks, 0u);
+                    uint64_t localSizeInfo[2] = {static_cast<uint64_t>(elementCount), rank};
+                    __getTransactionEvent().waitForFinished();
+                    MPI_CHECK(MPI_Allgather(
+                        localSizeInfo,
+                        2,
+                        MPI_UINT64_T,
+                        localSizes.data(),
+                        2,
+                        MPI_UINT64_T,
+                        gridController.getCommunicator().getMPIComm()));
+                    /* the local offset is the sum of the sizes of all ranks with a smaller position */
+                    uint64_t domainOffset = 0;
+                    for(uint64_t r = 0; r < numRanks; ++r)
+                    {
+                        if(localSizes.at(2u * r + 1u) < rank)
+                            domainOffset += localSizes.at(2u * r);
+                    }
+                    log<picLog::INPUT_OUTPUT>("openPMD:  (end) collect PML sizes for %1%") % objectName;
+
+                    domain_offset = DataSpace<simDim>::create(0);
+                    domain_offset[0] = static_cast<int>(domainOffset);
+                    local_domain_size = DataSpace<simDim>::create(1);
+                    local_domain_size[0] = elementCount;
+                    useLinearIdxAsDestination = true;
+                }
+
+                ::openPMD::Series& series = *params->openPMDSeries;
+                ::openPMD::Container<::openPMD::Mesh>& meshes = series.iterations[params->currentStep].meshes;
+
+                auto destBox = field.getHostBuffer().getDataBox();
+                for(uint32_t n = 0; n < numComponents; ++n)
+                {
+                    // Read the subdomain which belongs to our mpi position.
+                    // The total grid size must match the grid size of the stored
+                    // data.
+                    log<picLog::INPUT_OUTPUT>("openPMD: Read from domain: offset=%1% size=%2%") % domain_offset
+                        % local_domain_size;
+                    ::openPMD::RecordComponent rc = numComponents > 1
+                        ? meshes[objectName][name_lookup_tpl[n]]
+                        : meshes[objectName][::openPMD::RecordComponent::SCALAR];
+
+                    log<picLog::INPUT_OUTPUT>("openPMD: Read from field '%1%'") % objectName;
+
+                    ::openPMD::Offset start = asStandardVector<DataSpace<simDim>&, ::openPMD::Offset>(domain_offset);
+                    ::openPMD::Extent count
+                        = asStandardVector<DataSpace<simDim>&, ::openPMD::Extent>(local_domain_size);
+
+                    log<picLog::INPUT_OUTPUT>("openPMD: Allocate %1% elements")
+                        % local_domain_size.productOfComponents();
+
+                    // avoid deadlock between not finished pmacc tasks and mpi calls
+                    // in openPMD backends
+                    __getTransactionEvent().waitForFinished();
+
+                    /*
+                     * @todo float_X should be some kind of gridBuffer's
+                     *       GetComponentsType<ValueType>::type
+                     */
+                    std::shared_ptr<float_X> field_container = rc.loadChunk<float_X>(start, count);
+
+                    /* start a blocking read of all scheduled variables */
+                    series.flush();
+
+
+                    int const elementCount = local_domain_size.productOfComponents();
+
+#pragma omp parallel for simd
+                    for(int linearId = 0; linearId < elementCount; ++linearId)
+                    {
+                        DataSpace<simDim> destIdx;
+                        if(useLinearIdxAsDestination)
+                        {
+                            /* non-domain-bound data is stored linearly along the first dimension */
+                            destIdx[0] = linearId;
+                        }
+                        else
+                        {
+                            /* calculate index inside the moving window domain which
+                             * is located on the local grid*/
+                            destIdx = DataSpaceOperations<simDim>::map(params->window.localDimensions.size, linearId);
+                            /* jump over guard and local sliding window offset*/
+                            destIdx += field_guard + params->localWindowToDomainOffset;
+                        }
+
+                        destBox(destIdx)[n] = field_container.get()[linearId];
+                    }
+                }
+
+                field.hostToDevice();
+
+                __getTransactionEvent().waitForFinished();
+
+                log<picLog::INPUT_OUTPUT>("openPMD: Read from domain: offset=%1% size=%2%") % domain_offset
+                    % local_domain_size;
+                log<picLog::INPUT_OUTPUT>("openPMD: Finished loading field '%1%'") % objectName;
+            }
+        };
+
+        /**
+         * Functor for openPMDWriter (forEach operator) that restores one field from
+         * openPMD
+         *
+         * @tparam T_Field field class to load
+         */
+        template<typename T_Field>
+        struct LoadFields
+        {
+        public:
+            /** Load T_Field from the openPMD series referenced by the thread params
+             *
+             * The body is compiled only when __CUDA_ARCH__ is not defined.
+             *
+             * @param params thread params; its gridLayout member is overwritten with the
+             *               grid layout of the field before loading
+             */
+            HDINLINE void operator()(ThreadParams* params)
+            {
+#ifndef __CUDA_ARCH__
+                DataConnector& dataConnector = Environment<>::get().DataConnector();
+
+                /* load field without copying data to host */
+                auto fieldPtr = dataConnector.get<T_Field>(T_Field::getName(), true);
+                params->gridLayout = fieldPtr->getGridLayout();
+
+                /* load from openPMD */
+                bool const domainBound = traits::IsFieldDomainBound<T_Field>::value;
+                auto const componentCount = static_cast<uint32_t>(T_Field::numComponents);
+                RestartFieldLoader::loadField(
+                    fieldPtr->getGridBuffer(),
+                    componentCount,
+                    T_Field::getName(),
+                    params,
+                    domainBound);
+
+                dataConnector.releaseData(T_Field::getName());
+#endif
+            }
+        };
+
+        using namespace pmacc;
+
+    } /* namespace openPMD */
+} /* namespace picongpu */
diff --git a/include/picongpu/plugins/openPMD/writer/ParticleAttribute.hpp b/include/picongpu/plugins/openPMD/writer/ParticleAttribute.hpp
new file mode 100644
index 0000000000..149281f906
--- /dev/null
+++ b/include/picongpu/plugins/openPMD/writer/ParticleAttribute.hpp
@@ -0,0 +1,143 @@
+/* Copyright 2014-2021 Axel Huebl, Felix Schmitt, Heiko Burau, Rene Widera,
+ *                     Franz Poeschel
+ *
+ * This file is part of PIConGPU.
+ *
+ * PIConGPU is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * PIConGPU is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with PIConGPU.
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "picongpu/plugins/openPMD/openPMDWriter.def"
+#include "picongpu/simulation_defines.hpp"
+#include "picongpu/traits/PICToOpenPMD.tpp"
+#include "picongpu/plugins/openPMD/openPMDDimension.hpp"
+
+#include <pmacc/traits/GetComponentsType.hpp>
+#include <pmacc/traits/GetNComponents.hpp>
+#include <pmacc/traits/Resolve.hpp>
+
+namespace picongpu
+{
+    namespace openPMD
+    {
+        using namespace pmacc;
+
+        /* openPMD names of the components of a multi-component record, indexed by component number */
+        static const std::string name_lookup[] = {"x", "y", "z"};
+
+
+        /** write attribute of a particle to openPMD series
+         *
+         * @tparam T_Identifier identifier of a particle attribute
+         */
+        template<typename T_Identifier>
+        struct ParticleAttribute
+        {
+            /** write attribute to openPMD series
+             *
+             * Each component of the attribute is gathered from the component-interleaved frame
+             * data into a contiguous buffer, stored as one chunk of the global dataset and flushed.
+             * Afterwards the unit and weighting meta data of the record is set.
+             *
+             * @param params wrapped params
+             * @param frame frame holding the attribute data
+             * @param particleSpecies openPMD container of records the attribute record is taken from
+             * @param basepath path prefix used to build the dataset name
+             * @param elements number of local elements of this attribute
+             * @param globalElements number of elements of the whole dataset
+             * @param globalOffset offset of the local chunk within the dataset
+             */
+            template<typename FrameType>
+            HINLINE void operator()(
+                ThreadParams* params,
+                FrameType& frame,
+                ::openPMD::Container<::openPMD::Record>& particleSpecies,
+                std::string const& basepath,
+                const size_t elements,
+                const size_t globalElements,
+                const size_t globalOffset)
+            {
+                using Identifier = T_Identifier;
+                using ValueType = typename pmacc::traits::Resolve<Identifier>::type::type;
+                using ComponentType = typename GetComponentsType<ValueType>::type;
+                const uint32_t components = GetNComponents<ValueType>::value;
+
+                OpenPMDName<T_Identifier> openPMDName;
+                ::openPMD::Record record = particleSpecies[openPMDName()];
+                std::string baseName = basepath + "/" + openPMDName();
+                ::openPMD::Datatype openPMDType = ::openPMD::determineDatatype<ComponentType>();
+
+                // get the SI scaling, dimensionality and weighting of the attribute
+                OpenPMDUnit<T_Identifier> openPMDUnit;
+                std::vector<float_64> unit = openPMDUnit();
+                OpenPMDUnitDimension<T_Identifier> openPMDUnitDimension;
+                std::vector<float_64> unitDimension = openPMDUnitDimension();
+                const bool isMacroWeighted = MacroWeighted<T_Identifier>::get();
+                const uint32_t macroWeighted = (isMacroWeighted ? 1 : 0);
+                const float_64 weightingPower = WeightingPower<T_Identifier>::get();
+
+                PMACC_ASSERT(unit.size() == components); // unitSI for each component
+                PMACC_ASSERT(unitDimension.size() == 7); // seven openPMD base units
+
+                log<picLog::INPUT_OUTPUT>("openPMD:  (begin) write species attribute: %1%") % Identifier::getName();
+
+                /* contiguous staging buffer, reused for every component; empty without local elements */
+                std::shared_ptr<ComponentType> stagingBuffer;
+                if(elements > 0)
+                {
+                    stagingBuffer = std::shared_ptr<ComponentType>(
+                        new ComponentType[elements],
+                        std::default_delete<ComponentType[]>());
+                }
+
+                for(uint32_t componentIdx = 0; componentIdx < components; componentIdx++)
+                {
+                    ::openPMD::RecordComponent recordComponent = components > 1
+                        ? record[name_lookup[componentIdx]]
+                        : record[::openPMD::MeshRecordComponent::SCALAR];
+                    std::string datasetName = components > 1 ? baseName + "/" + name_lookup[componentIdx] : baseName;
+
+                    ValueType* attributeData = frame.getIdentifier(Identifier()).getPointer();
+                    ComponentType const* const interleavedData = reinterpret_cast<ComponentType*>(attributeData);
+                    auto stagingPtr = stagingBuffer.get();
+
+/* copy strided data from source to temporary buffer */
+#pragma omp parallel for simd
+                    for(size_t i = 0; i < elements; ++i)
+                    {
+                        stagingPtr[i] = interleavedData[componentIdx + i * components];
+                    }
+
+                    params->initDataset<DIM1>(
+                        recordComponent,
+                        openPMDType,
+                        {globalElements},
+                        true,
+                        params->compressionMethod,
+                        datasetName);
+                    /* a chunk is only stored if there are local elements */
+                    if(stagingBuffer)
+                        recordComponent.storeChunk(stagingBuffer, {globalOffset}, {elements});
+
+                    if(componentIdx < unit.size())
+                    {
+                        recordComponent.setUnitSI(unit[componentIdx]);
+                    }
+                    params->openPMDSeries->flush();
+                }
+
+                auto unitMap = convertToUnitDimension(unitDimension);
+
+                record.setUnitDimension(unitMap);
+                record.setAttribute("macroWeighted", macroWeighted);
+                record.setAttribute("weightingPower", weightingPower);
+
+                /* @todo check if always correct at this point,
+                 * depends on attribute and MW-solver/pusher implementation
+                 */
+                float_X const timeOffset = 0.0;
+                record.setAttribute("timeOffset", timeOffset);
+
+                log<picLog::INPUT_OUTPUT>("openPMD:  ( end ) write species attribute: %1%") % Identifier::getName();
+            }
+        };
+
+    } // namespace openPMD
+} // namespace picongpu
diff --git a/include/picongpu/plugins/output/ConstSpeciesAttributes.hpp b/include/picongpu/plugins/output/ConstSpeciesAttributes.hpp
new file mode 100644
index 0000000000..627cdc9f4b
--- /dev/null
+++ b/include/picongpu/plugins/output/ConstSpeciesAttributes.hpp
@@ -0,0 +1,136 @@
+/* Copyright 2014-2021 Axel Huebl, Felix Schmitt, Heiko Burau, Rene Widera,
+ *                     Franz Poeschel, Richard Pausch
+ *
+ * This file is part of PIConGPU.
+ *
+ * PIConGPU is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * PIConGPU is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with PIConGPU.
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "picongpu/simulation_defines.hpp"
+#include "picongpu/traits/frame/GetMass.hpp"
+#include "picongpu/traits/frame/GetCharge.hpp"
+
+#include <pmacc/traits/HasFlag.hpp>
+
+#include <type_traits>
+#include <vector>
+
+namespace picongpu
+{
+    namespace plugins
+    {
+        namespace output
+        {
+            /** Functor returning the charge of a species or zero
+             *
+             * The charge is queried via `frame::getCharge` if the frame has the flag
+             * `chargeRatio`, otherwise `float_X(0.)` is returned.
+             *
+             * @tparam T_FrameType frame type of the species
+             */
+            template<typename T_FrameType>
+            struct GetChargeOrZero
+            {
+                //! true if T_FrameType has the flag `chargeRatio`
+                static constexpr bool hasChargeRatio = pmacc::traits::HasFlag<T_FrameType, chargeRatio<>>::type::value;
+
+                /** get the charge, only enabled if the flag `chargeRatio` is available
+                 *
+                 * @tparam T_Defer return type, template parameter keeps `enable_if` dependent
+                 */
+                template<typename T_Defer = float_X>
+                typename std::enable_if<hasChargeRatio, T_Defer>::type operator()() const
+                {
+                    return frame::getCharge<T_FrameType>();
+                }
+
+                /** fallback returning zero, only enabled if the flag `chargeRatio` is missing
+                 *
+                 * @tparam T_Defer return type, template parameter keeps `enable_if` dependent
+                 */
+                template<typename T_Defer = float_X>
+                typename std::enable_if<!hasChargeRatio, T_Defer>::type operator()() const
+                {
+                    return float_X(0.);
+                }
+
+                /** powers of the SI base units describing the unit of the charge
+                 *
+                 * @return vector with NUnitDimension entries, all zero except
+                 *         electric current and time (both 1.0)
+                 */
+                std::vector<float_64> dimension() const
+                {
+                    // L, M, T, I, theta, N, J
+                    std::vector<float_64> unitDimension(NUnitDimension, 0.0);
+                    unitDimension.at(SIBaseUnits::electricCurrent) = 1.0;
+                    unitDimension.at(SIBaseUnits::time) = 1.0;
+
+                    return unitDimension;
+                }
+            };
+
+            /** Functor returning the mass of a species or zero
+             *
+             * The mass is queried via `frame::getMass` if the frame has the flag
+             * `massRatio`, otherwise `float_X(0.)` is returned.
+             *
+             * @tparam T_FrameType frame type of the species
+             */
+            template<typename T_FrameType>
+            struct GetMassOrZero
+            {
+                //! true if T_FrameType has the flag `massRatio`
+                static constexpr bool hasMassRatio = pmacc::traits::HasFlag<T_FrameType, massRatio<>>::type::value;
+
+                /** get the mass, only enabled if the flag `massRatio` is available
+                 *
+                 * @tparam T_Defer return type, template parameter keeps `enable_if` dependent
+                 */
+                template<typename T_Defer = float_X>
+                typename std::enable_if<hasMassRatio, T_Defer>::type operator()() const
+                {
+                    return frame::getMass<T_FrameType>();
+                }
+
+                /** fallback returning zero, only enabled if the flag `massRatio` is missing
+                 *
+                 * @tparam T_Defer return type, template parameter keeps `enable_if` dependent
+                 */
+                template<typename T_Defer = float_X>
+                typename std::enable_if<!hasMassRatio, T_Defer>::type operator()() const
+                {
+                    return float_X(0.);
+                }
+
+                /** powers of the SI base units describing the unit of the mass
+                 *
+                 * @return vector with NUnitDimension entries, all zero except
+                 *         mass (1.0)
+                 */
+                std::vector<float_64> dimension() const
+                {
+                    // L, M, T, I, theta, N, J
+                    std::vector<float_64> unitDimension(NUnitDimension, 0.0);
+                    unitDimension.at(SIBaseUnits::mass) = 1.0;
+
+                    return unitDimension;
+                }
+            };
+        } // namespace output
+    } // namespace plugins
+} // namespace picongpu
diff --git a/include/picongpu/plugins/output/GatherSlice.hpp b/include/picongpu/plugins/output/GatherSlice.hpp
index 48b44108b1..aa67e95096 100644
--- a/include/picongpu/plugins/output/GatherSlice.hpp
+++ b/include/picongpu/plugins/output/GatherSlice.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Heiko Burau, Rene Widera, Benjamin Worpitz
+/* Copyright 2013-2021 Axel Huebl, Heiko Burau, Rene Widera, Benjamin Worpitz
  *
  * This file is part of PIConGPU.
  *
@@ -34,216 +34,226 @@
 
 namespace picongpu
 {
-using namespace pmacc;
+    using namespace pmacc;
 
-struct GatherSlice
-{
-
-    GatherSlice() :
-        mpiRank(-1),
-        numRanks(0),
-        filteredData(nullptr),
-        comm(MPI_COMM_NULL),
-        fullData(nullptr),
-        masterRank(0),
-        isMPICommInitialized(false)
-    {
-    }
-
-    ~GatherSlice()
+    struct GatherSlice
     {
-        reset();
-    }
-
-    /*
-     * @return true if object has reduced data after reduce call else false
-     */
-    bool init(bool isActive)
-    {
-        static int masterRankOffset = 0;
-
-        /* free old communicator if `init()` is called again */
-        if (isMPICommInitialized)
+        GatherSlice()
+            : mpiRank(-1)
+            , numRanks(0)
+            , filteredData(nullptr)
+            , comm(MPI_COMM_NULL)
+            , fullData(nullptr)
+            , masterRank(0)
+            , isMPICommInitialized(false)
         {
-            reset();
         }
 
-        int countRanks = Environment<simDim>::get().GridController().getGpuNodes().productOfComponents();
-        std::vector<int> gatherRanks(countRanks);
-        std::vector<int> groupRanks(countRanks);
-        mpiRank = Environment<simDim>::get().GridController().getGlobalRank();
-        if (!isActive)
-            mpiRank = -1;
-
-        // avoid deadlock between not finished pmacc tasks and mpi blocking collectives
-        __getTransactionEvent().waitForFinished();
-        MPI_CHECK(MPI_Allgather(&mpiRank, 1, MPI_INT, &gatherRanks[0], 1, MPI_INT, MPI_COMM_WORLD));
-
-        for (int i = 0; i < countRanks; ++i)
+        ~GatherSlice()
         {
-            if (gatherRanks[i] != -1)
-            {
-                groupRanks[numRanks] = gatherRanks[i];
-                numRanks++;
-            }
+            reset();
         }
 
-        // avoid deadlock between not finished pmacc tasks and mpi blocking collectives
-        __getTransactionEvent().waitForFinished();
-        MPI_Group group = MPI_GROUP_NULL;
-        MPI_Group newgroup = MPI_GROUP_NULL;
-        MPI_CHECK(MPI_Comm_group(MPI_COMM_WORLD, &group));
-        MPI_CHECK(MPI_Group_incl(group, numRanks, &groupRanks[0], &newgroup));
-
-        MPI_CHECK(MPI_Comm_create(MPI_COMM_WORLD, newgroup, &comm));
-
-        if (mpiRank != -1)
+        /*
+         * @return true if object has reduced data after reduce call else false
+         */
+        bool init(bool isActive)
         {
-            MPI_Comm_rank(comm, &mpiRank);
-            isMPICommInitialized = true;
-        }
-        MPI_CHECK(MPI_Group_free(&group));
-        MPI_CHECK(MPI_Group_free(&newgroup));
+            static int masterRankOffset = 0;
 
-        masterRankOffset++;
-        /* avoid that only rank zero is the master
-         * this reduces the load of rank zero
-         */
-        masterRank = (masterRankOffset % numRanks);
+            /* free old communicator if `init()` is called again */
+            if(isMPICommInitialized)
+            {
+                reset();
+            }
 
-        return mpiRank == masterRank;
-    }
+            int countRanks = Environment<simDim>::get().GridController().getGpuNodes().productOfComponents();
+            std::vector<int> gatherRanks(countRanks);
+            std::vector<int> groupRanks(countRanks);
+            mpiRank = Environment<simDim>::get().GridController().getGlobalRank();
+            if(!isActive)
+                mpiRank = -1;
 
-    template<class Box >
-    Box operator()(Box & data, const MessageHeader & header)
-    {
-        using ValueType = typename Box::ValueType;
+            // avoid deadlock between not finished pmacc tasks and mpi blocking collectives
+            __getTransactionEvent().waitForFinished();
+            MPI_CHECK(MPI_Allgather(&mpiRank, 1, MPI_INT, &gatherRanks[0], 1, MPI_INT, MPI_COMM_WORLD));
 
-        Box dstBox = Box(PitchedBox<ValueType, DIM2 > (
-                                                       (ValueType*) filteredData,
-                                                       DataSpace<DIM2 > (),
-                                                       header.sim.size,
-                                                       header.sim.size.x() * sizeof (ValueType)
-                                                       ));
+            for(int i = 0; i < countRanks; ++i)
+            {
+                if(gatherRanks[i] != -1)
+                {
+                    groupRanks[numRanks] = gatherRanks[i];
+                    numRanks++;
+                }
+            }
 
-        MessageHeader* fakeHeader = MessageHeader::create();
-        memcpy(fakeHeader, &header, sizeof (MessageHeader));
+            // avoid deadlock between not finished pmacc tasks and mpi blocking collectives
+            __getTransactionEvent().waitForFinished();
+            MPI_Group group = MPI_GROUP_NULL;
+            MPI_Group newgroup = MPI_GROUP_NULL;
+            MPI_CHECK(MPI_Comm_group(MPI_COMM_WORLD, &group));
+            MPI_CHECK(MPI_Group_incl(group, numRanks, &groupRanks[0], &newgroup));
 
-        char* recvHeader = new char[ MessageHeader::bytes * numRanks];
+            MPI_CHECK(MPI_Comm_create(MPI_COMM_WORLD, newgroup, &comm));
 
-        if (fullData == nullptr && mpiRank == masterRank)
-            fullData = (char*) new ValueType[header.sim.size.productOfComponents()];
+            if(mpiRank != -1)
+            {
+                MPI_Comm_rank(comm, &mpiRank);
+                isMPICommInitialized = true;
+            }
+            MPI_CHECK(MPI_Group_free(&group));
+            MPI_CHECK(MPI_Group_free(&newgroup));
 
+            masterRankOffset++;
+            /* avoid that only rank zero is the master
+             * this reduces the load of rank zero
+             */
+            masterRank = (masterRankOffset % numRanks);
 
-        // avoid deadlock between not finished pmacc tasks and mpi blocking collectives
-        __getTransactionEvent().waitForFinished();
-        MPI_CHECK(MPI_Gather(fakeHeader, MessageHeader::bytes, MPI_CHAR, recvHeader, MessageHeader::bytes,
-                             MPI_CHAR, masterRank, comm));
+            return mpiRank == masterRank;
+        }
 
-        std::vector<int> counts(numRanks);
-        std::vector<int> displs(numRanks);
-        int offset = 0;
-        for (int i = 0; i < numRanks; ++i)
+        template<class Box>
+        Box operator()(Box& data, const MessageHeader& header)
         {
-            MessageHeader* head = (MessageHeader*) (recvHeader + MessageHeader::bytes * i);
-            counts[i] = head->node.maxSize.productOfComponents() * sizeof (ValueType);
-            displs[i] = offset;
-            offset += counts[i];
-        }
+            using ValueType = typename Box::ValueType;
 
-        const size_t elementsCount = header.node.maxSize.productOfComponents() * sizeof (ValueType);
+            Box dstBox = Box(PitchedBox<ValueType, DIM2>(
+                (ValueType*) filteredData,
+                DataSpace<DIM2>(),
+                header.sim.size,
+                header.sim.size.x() * sizeof(ValueType)));
 
-        // avoid deadlock between not finished pmacc tasks and mpi blocking collectives
-        __getTransactionEvent().waitForFinished();
-        MPI_CHECK(MPI_Gatherv(
-                              (char*) (data.getPointer()), elementsCount, MPI_CHAR,
-                              fullData, &counts[0], &displs[0], MPI_CHAR,
-                              masterRank, comm));
+            MessageHeader* fakeHeader = MessageHeader::create();
+            *fakeHeader = header;
 
+            char* recvHeader = new char[MessageHeader::bytes * numRanks];
 
+            if(fullData == nullptr && mpiRank == masterRank)
+                fullData = (char*) new ValueType[header.sim.size.productOfComponents()];
 
-        if (mpiRank == masterRank)
-        {
-            log<picLog::DOMAINS > ("Master create image");
-            if (filteredData == nullptr)
-                filteredData = (char*) new ValueType[header.sim.size.productOfComponents()];
-
-            /*create box with valid memory*/
-            dstBox = Box(PitchedBox<ValueType, DIM2 > (
-                                                       (ValueType*) filteredData,
-                                                       DataSpace<DIM2 > (),
-                                                       header.sim.size,
-                                                       header.sim.size.x() * sizeof (ValueType)
-                                                       ));
-
-            for (int i = 0; i < numRanks; ++i)
+
+            // avoid deadlock between not finished pmacc tasks and mpi blocking collectives
+            __getTransactionEvent().waitForFinished();
+            MPI_CHECK(MPI_Gather(
+                fakeHeader,
+                MessageHeader::bytes,
+                MPI_CHAR,
+                recvHeader,
+                MessageHeader::bytes,
+                MPI_CHAR,
+                masterRank,
+                comm));
+
+            std::vector<int> counts(numRanks);
+            std::vector<int> displs(numRanks);
+            int offset = 0;
+            for(int i = 0; i < numRanks; ++i)
             {
                 MessageHeader* head = (MessageHeader*) (recvHeader + MessageHeader::bytes * i);
-
-                log<picLog::DOMAINS > ("part image with offset %1%byte=%2%elements | size %3%  | offset %4%") %
-                    displs[i] % (displs[i] / sizeof (ValueType)) %
-                    head->node.maxSize.toString() %
-                    head->node.offset.toString();
-                Box srcBox = Box(PitchedBox<ValueType, DIM2 > (
-                                                               (ValueType*) (fullData + displs[i]),
-                                                               DataSpace<DIM2 > (),
-                                                               head->node.maxSize,
-                                                               head->node.maxSize.x() * sizeof (ValueType)
-                                                               ));
-
-                insertData(dstBox, srcBox, head->node.offset, head->node.maxSize);
+                counts[i] = head->node.maxSize.productOfComponents() * sizeof(ValueType);
+                displs[i] = offset;
+                offset += counts[i];
             }
 
-            __deleteArray(fullData);
-        }
+            const size_t elementsCount = header.node.maxSize.productOfComponents() * sizeof(ValueType);
 
-        delete[] recvHeader;
-        MessageHeader::destroy(fakeHeader);
+            // avoid deadlock between not finished pmacc tasks and mpi blocking collectives
+            __getTransactionEvent().waitForFinished();
+            MPI_CHECK(MPI_Gatherv(
+                (char*) (data.getPointer()),
+                elementsCount,
+                MPI_CHAR,
+                fullData,
+                &counts[0],
+                &displs[0],
+                MPI_CHAR,
+                masterRank,
+                comm));
+
+
+            if(mpiRank == masterRank)
+            {
+                log<picLog::DOMAINS>("Master create image");
+                if(filteredData == nullptr)
+                    filteredData = (char*) new ValueType[header.sim.size.productOfComponents()];
+
+                /*create box with valid memory*/
+                dstBox = Box(PitchedBox<ValueType, DIM2>(
+                    (ValueType*) filteredData,
+                    DataSpace<DIM2>(),
+                    header.sim.size,
+                    header.sim.size.x() * sizeof(ValueType)));
+
+                for(int i = 0; i < numRanks; ++i)
+                {
+                    MessageHeader* head = (MessageHeader*) (recvHeader + MessageHeader::bytes * i);
+
+                    log<picLog::DOMAINS>("part image with offset %1%byte=%2%elements | size %3%  | offset %4%")
+                        % displs[i] % (displs[i] / sizeof(ValueType)) % head->node.maxSize.toString()
+                        % head->node.offset.toString();
+                    Box srcBox = Box(PitchedBox<ValueType, DIM2>(
+                        (ValueType*) (fullData + displs[i]),
+                        DataSpace<DIM2>(),
+                        head->node.maxSize,
+                        head->node.maxSize.x() * sizeof(ValueType)));
+
+                    insertData(dstBox, srcBox, head->node.offset, head->node.maxSize);
+                }
+
+                __deleteArray(fullData);
+            }
 
-        return dstBox;
-    }
+            delete[] recvHeader;
+            MessageHeader::destroy(fakeHeader);
 
-    template<class DstBox, class SrcBox>
-    void insertData(DstBox& dst, const SrcBox& src, MessageHeader::Size2D offsetToSimNull, MessageHeader::Size2D srcSize)
-    {
-        for (int y = 0; y < srcSize.y(); ++y)
+            return dstBox;
+        }
+
+        template<class DstBox, class SrcBox>
+        void insertData(
+            DstBox& dst,
+            const SrcBox& src,
+            MessageHeader::Size2D offsetToSimNull,
+            MessageHeader::Size2D srcSize)
         {
-            for (int x = 0; x < srcSize.x(); ++x)
+            for(int y = 0; y < srcSize.y(); ++y)
             {
-                dst[y + offsetToSimNull.y()][x + offsetToSimNull.x()] = src[y][x];
+                for(int x = 0; x < srcSize.x(); ++x)
+                {
+                    dst[y + offsetToSimNull.y()][x + offsetToSimNull.x()] = src[y][x];
+                }
             }
         }
-    }
 
-private:
-
-    /*reset this object und set all values to initial state*/
-    void reset()
-    {
-        mpiRank = -1;
-        numRanks = 0;
-        if (filteredData != nullptr)
-            delete[] filteredData;
-        filteredData = nullptr;
-        if (fullData != nullptr)
-            delete[] fullData;
-        fullData = nullptr;
-        if (isMPICommInitialized)
+    private:
+        /*reset this object and set all values to initial state*/
+        void reset()
         {
-            // avoid deadlock between not finished pmacc tasks and mpi blocking collectives
-            __getTransactionEvent().waitForFinished();
-            MPI_CHECK(MPI_Comm_free(&comm));
+            mpiRank = -1;
+            numRanks = 0;
+            if(filteredData != nullptr)
+                delete[] filteredData;
+            filteredData = nullptr;
+            if(fullData != nullptr)
+                delete[] fullData;
+            fullData = nullptr;
+            if(isMPICommInitialized)
+            {
+                // avoid deadlock between not finished pmacc tasks and mpi blocking collectives
+                __getTransactionEvent().waitForFinished();
+                MPI_CHECK(MPI_Comm_free(&comm));
+            }
+            isMPICommInitialized = false;
         }
-        isMPICommInitialized = false;
-    }
-
-    char* filteredData;
-    char* fullData;
-    MPI_Comm comm;
-    int mpiRank;
-    int numRanks;
-    int masterRank;
-    bool isMPICommInitialized;
-};
-
-}//namespace
+
+        char* filteredData;
+        char* fullData;
+        MPI_Comm comm;
+        int mpiRank;
+        int numRanks;
+        int masterRank;
+        bool isMPICommInitialized;
+    };
+
+} // namespace picongpu
diff --git a/include/picongpu/plugins/output/IIOBackend.hpp b/include/picongpu/plugins/output/IIOBackend.hpp
index 618cbb08ef..8dbe0f4fa7 100644
--- a/include/picongpu/plugins/output/IIOBackend.hpp
+++ b/include/picongpu/plugins/output/IIOBackend.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2017-2020 Rene Widera
+/* Copyright 2017-2021 Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -27,37 +27,32 @@
 
 namespace picongpu
 {
-
     //! Interface for IO-backends with restart capability
     class IIOBackend : public plugins::multi::ISlave
     {
     public:
-
-
         IIOBackend()
         {
-
         }
 
-         virtual ~IIOBackend()
+        virtual ~IIOBackend()
         {
-
         }
 
         //! create a checkpoint
         virtual void dumpCheckpoint(
             uint32_t currentStep,
-            std::string const & checkpointDirectory,
-            std::string const & checkpointFilename
-        ) = 0;
+            std::string const& checkpointDirectory,
+            std::string const& checkpointFilename)
+            = 0;
 
         //! restart from a checkpoint
         virtual void doRestart(
             uint32_t restartStep,
-            std::string const & restartDirectory,
-            std::string const & restartFilename,
-            uint32_t restartChunkSize
-        ) = 0;
+            std::string const& restartDirectory,
+            std::string const& restartFilename,
+            uint32_t restartChunkSize)
+            = 0;
     };
 
 } // namespace picongpu
diff --git a/include/picongpu/plugins/output/WriteSpeciesCommon.hpp b/include/picongpu/plugins/output/WriteSpeciesCommon.hpp
index d9e1b4bb9c..ad272045fc 100644
--- a/include/picongpu/plugins/output/WriteSpeciesCommon.hpp
+++ b/include/picongpu/plugins/output/WriteSpeciesCommon.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2014-2020 Rene Widera, Felix Schmitt
+/* Copyright 2014-2021 Rene Widera, Felix Schmitt
  *
  * This file is part of PIConGPU.
  *
@@ -42,109 +42,109 @@
 
 namespace picongpu
 {
+    using namespace pmacc;
 
-using namespace pmacc;
 
-
-
-template<typename T_Type>
-struct MallocMemory
-{
-    template<typename ValueType >
-    HINLINE void operator()(ValueType& v1, const size_t size) const
+    template<typename T_Type>
+    struct MallocMemory
     {
-        typedef typename pmacc::traits::Resolve<T_Type>::type::type type;
-
-        type* ptr = nullptr;
-        if (size != 0)
+        template<typename ValueType>
+        HINLINE void operator()(ValueType& v1, const size_t size) const
         {
-#if( PMACC_CUDA_ENABLED == 1 )
-            CUDA_CHECK((cuplaError_t)cudaHostAlloc(&ptr, size * sizeof (type), cudaHostAllocMapped));
+            typedef typename pmacc::traits::Resolve<T_Type>::type::type type;
+
+            type* ptr = nullptr;
+            if(size != 0)
+            {
+#if(PMACC_CUDA_ENABLED == 1)
+                CUDA_CHECK((cuplaError_t) cudaHostAlloc(&ptr, size * sizeof(type), cudaHostAllocMapped));
+#elif(ALPAKA_ACC_GPU_HIP_ENABLED == 1)
+                CUDA_CHECK((cuplaError_t) hipHostMalloc((void**) &ptr, size * sizeof(type), hipHostRegisterMapped));
 #else
-            ptr = new type[size];
+                ptr = new type[size];
 #endif
+            }
+            v1.getIdentifier(T_Type()) = VectorDataBox<type>(ptr);
         }
-        v1.getIdentifier(T_Type()) = VectorDataBox<type>(ptr);
-
-    }
-};
+    };
 
-/** allocate memory on host
- *
- * This functor use `new[]` to allocate memory
- */
-template<typename T_Attribute>
-struct MallocHostMemory
-{
-    template<typename ValueType >
-    HINLINE void operator()(ValueType& v1, const size_t size) const
+    /** allocate memory on host
+     *
+     * This functor uses `new[]` to allocate memory
+     */
+    template<typename T_Attribute>
+    struct MallocHostMemory
     {
-        typedef T_Attribute Attribute;
-        typedef typename pmacc::traits::Resolve<Attribute>::type::type type;
-
-        type* ptr = nullptr;
-        if (size != 0)
+        template<typename ValueType>
+        HINLINE void operator()(ValueType& v1, const size_t size) const
         {
-            ptr = new type[size];
+            typedef T_Attribute Attribute;
+            typedef typename pmacc::traits::Resolve<Attribute>::type::type type;
+
+            type* ptr = nullptr;
+            if(size != 0)
+            {
+                ptr = new type[size];
+            }
+            v1.getIdentifier(Attribute()) = VectorDataBox<type>(ptr);
         }
-        v1.getIdentifier(Attribute()) = VectorDataBox<type>(ptr);
+    };
 
-    }
-};
 
+    /** copy species to host memory
+     *
+     * use `DataConnector::get<...>()` to copy data
+     */
+    template<typename T_SpeciesType>
+    struct CopySpeciesToHost
+    {
+        typedef T_SpeciesType SpeciesType;
 
-/** copy species to host memory
- *
- * use `DataConnector::get<...>()` to copy data
- */
-template<typename T_SpeciesType>
-struct CopySpeciesToHost
-{
-    typedef T_SpeciesType SpeciesType;
+        HINLINE void operator()() const
+        {
+            /* DataConnector copies data to host */
+            DataConnector& dc = Environment<>::get().DataConnector();
+            dc.get<SpeciesType>(SpeciesType::FrameType::getName());
+            dc.releaseData(SpeciesType::FrameType::getName());
+        }
+    };
 
-    HINLINE void operator()() const
+    template<typename T_Type>
+    struct GetDevicePtr
     {
-        /* DataConnector copies data to host */
-        DataConnector &dc = Environment<>::get().DataConnector();
-        dc.get< SpeciesType >( SpeciesType::FrameType::getName() );
-        dc.releaseData( SpeciesType::FrameType::getName() );
-    }
-};
-
-template<typename T_Type>
-struct GetDevicePtr
-{
-    template<typename ValueType >
-    HINLINE void operator()(ValueType& dest, ValueType& src)
-    {
-        typedef typename pmacc::traits::Resolve<T_Type>::type::type type;
-
-        type* ptr = nullptr;
-        type* srcPtr = src.getIdentifier(T_Type()).getPointer();
-        if (srcPtr != nullptr)
+        template<typename ValueType>
+        HINLINE void operator()(ValueType& dest, ValueType& src)
         {
-#if( PMACC_CUDA_ENABLED == 1 )
-            CUDA_CHECK((cuplaError_t)cudaHostGetDevicePointer(&ptr, srcPtr, 0));
+            typedef typename pmacc::traits::Resolve<T_Type>::type::type type;
+
+            type* ptr = nullptr;
+            type* srcPtr = src.getIdentifier(T_Type()).getPointer();
+            if(srcPtr != nullptr)
+            {
+#if(PMACC_CUDA_ENABLED == 1)
+                CUDA_CHECK((cuplaError_t) cudaHostGetDevicePointer(&ptr, srcPtr, 0));
+#elif(ALPAKA_ACC_GPU_HIP_ENABLED == 1)
+                CUDA_CHECK((cuplaError_t) hipHostGetDevicePointer((void**) &ptr, srcPtr, 0));
 #else
-            ptr = srcPtr;
+                ptr = srcPtr;
 #endif
+            }
+            dest.getIdentifier(T_Type()) = VectorDataBox<type>(ptr);
         }
-        dest.getIdentifier(T_Type()) = VectorDataBox<type>(ptr);
-    }
-};
+    };
 
-template<typename T_Type>
-struct FreeMemory
-{
-    template<typename ValueType >
-    HINLINE void operator()(ValueType& value) const
+    template<typename T_Type>
+    struct FreeMemory
     {
-        typedef typename pmacc::traits::Resolve<T_Type>::type::type type;
-
-        type* ptr = value.getIdentifier(T_Type()).getPointer();
-        if (ptr != nullptr)
+        template<typename ValueType>
+        HINLINE void operator()(ValueType& value) const
         {
-#if( PMACC_CUDA_ENABLED == 1 )
+            typedef typename pmacc::traits::Resolve<T_Type>::type::type type;
+
+            type* ptr = value.getIdentifier(T_Type()).getPointer();
+            if(ptr != nullptr)
+            {
+#if(PMACC_CUDA_ENABLED == 1)
 /* cupla 0.2.0 does not support the function cudaHostAlloc to create mapped memory.
  * Therefore we need to call the native CUDA function cudaFreeHost to free memory.
  * Due to the renaming of cuda functions with cupla via macros we need to remove
@@ -156,52 +156,49 @@ struct FreeMemory
  *   https://github.com/ComputationalRadiationPhysics/alpaka/issues/296
  *   https://github.com/ComputationalRadiationPhysics/alpaka/issues/612
  */
-#   undef cudaFreeHost
-            CUDA_CHECK((cuplaError_t)cudaFreeHost(ptr));
+#    undef cudaFreeHost
+                CUDA_CHECK((cuplaError_t) cudaFreeHost(ptr));
 // re-introduce the cupla macro
-#   define cudaFreeHost(...) cuplaFreeHost(__VA_ARGS__)
+#    define cudaFreeHost(...) cuplaFreeHost(__VA_ARGS__)
+#elif(ALPAKA_ACC_GPU_HIP_ENABLED == 1)
+                CUDA_CHECK((cuplaError_t) hipHostFree(ptr));
 #else
-            __deleteArray(ptr);
+                __deleteArray(ptr);
 #endif
+            }
         }
-    }
-};
-
-/** free memory
- *
- * use `__deleteArray()` to free memory
- */
-template<typename T_Attribute>
-struct FreeHostMemory
-{
+    };
 
-    template<typename ValueType >
-    HINLINE void operator()(ValueType& value) const
+    /** free memory
+     *
+     * use `__deleteArray()` to free memory
+     */
+    template<typename T_Attribute>
+    struct FreeHostMemory
     {
-        typedef T_Attribute Attribute;
-        typedef typename pmacc::traits::Resolve<Attribute>::type::type type;
-
-        type* ptr = value.getIdentifier(Attribute()).getPointer();
-        if (ptr != nullptr)
+        template<typename ValueType>
+        HINLINE void operator()(ValueType& value) const
         {
-            __deleteArray(ptr);
-            ptr=nullptr;
+            typedef T_Attribute Attribute;
+            typedef typename pmacc::traits::Resolve<Attribute>::type::type type;
+
+            type* ptr = value.getIdentifier(Attribute()).getPointer();
+            if(ptr != nullptr)
+            {
+                __deleteArray(ptr);
+                ptr = nullptr;
+            }
         }
-    }
-};
+    };
 
-/*functor to create a pair for a MapTuple map*/
-struct OperatorCreateVectorBox
-{
-    template<typename InType>
-    struct apply
+    /*functor to create a pair for a MapTuple map*/
+    struct OperatorCreateVectorBox
     {
-        typedef
-        bmpl::pair< InType,
-        pmacc::VectorDataBox< typename pmacc::traits::Resolve<InType>::type::type > >
-        type;
+        template<typename InType>
+        struct apply
+        {
+            typedef bmpl::pair<InType, pmacc::VectorDataBox<typename pmacc::traits::Resolve<InType>::type::type>> type;
+        };
     };
-};
-
-} //namespace picongpu
 
+} // namespace picongpu
diff --git a/include/picongpu/plugins/output/header/ColorHeader.hpp b/include/picongpu/plugins/output/header/ColorHeader.hpp
index a7478de109..bf631444b1 100644
--- a/include/picongpu/plugins/output/header/ColorHeader.hpp
+++ b/include/picongpu/plugins/output/header/ColorHeader.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Rene Widera
+/* Copyright 2013-2021 Axel Huebl, Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -28,54 +28,53 @@
 
 namespace picongpu
 {
-/** Color Header for Preview Images
- *
- *  Used to store the relation of color channels to min/max units
- *  and data names they represent.
- */
-struct ColorHeader
-{
-    struct channel {
-        /// assign a physical meaningful name to the channel
-        std::string name;
-        /// assign a unit to the range values
-        std::string unitName;
-        /// min/max real values for 0 and 255
-        picongpu::float_32 range[2];
-    };
-
-    channel particles;
-    channel channel1;
-    channel channel2;
-    channel channel3;
-
-    ColorHeader()
+    /** Color Header for Preview Images
+     *
+     *  Used to store the relation of color channels to min/max units
+     *  and data names they represent.
+     */
+    struct ColorHeader
     {
-        particles.range[0] = 0.f;
-        particles.range[1] = 0.f;
+        struct channel
+        {
+            /// assign a physically meaningful name to the channel
+            std::string name;
+            /// assign a unit to the range values
+            std::string unitName;
+            /// min/max real values for 0 and 255
+            picongpu::float_32 range[2];
+        };
 
-        channel1.range[0] = 0.f;
-        channel1.range[1] = 0.f;
+        channel particles;
+        channel channel1;
+        channel channel2;
+        channel channel3;
 
-        channel2.range[0] = 0.f;
-        channel2.range[1] = 0.f;
+        ColorHeader()
+        {
+            particles.range[0] = 0.f;
+            particles.range[1] = 0.f;
 
-        channel3.range[0] = 0.f;
-        channel3.range[1] = 0.f;
-    }
+            channel1.range[0] = 0.f;
+            channel1.range[1] = 0.f;
 
-    //void setScale(picongpu::float_32 x, picongpu::float_32 y)
-    //{
-    //    scale[0] = x;
-    //    scale[1] = y;
-    //}
+            channel2.range[0] = 0.f;
+            channel2.range[1] = 0.f;
 
-    void writeToConsole(std::ostream& ocons) const
-    {
-        //ocons << "ColorHeader.XYZ " << "..." << std::endl;
+            channel3.range[0] = 0.f;
+            channel3.range[1] = 0.f;
+        }
 
-    }
+        // void setScale(picongpu::float_32 x, picongpu::float_32 y)
+        //{
+        //    scale[0] = x;
+        //    scale[1] = y;
+        //}
 
-};
+        void writeToConsole(std::ostream& ocons) const
+        {
+            // ocons << "ColorHeader.XYZ " << "..." << std::endl;
+        }
+    };
 
 } // namespace picongpu
diff --git a/include/picongpu/plugins/output/header/DataHeader.hpp b/include/picongpu/plugins/output/header/DataHeader.hpp
index 2d8ce4fb2d..20b86c1b33 100644
--- a/include/picongpu/plugins/output/header/DataHeader.hpp
+++ b/include/picongpu/plugins/output/header/DataHeader.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Rene Widera
+/* Copyright 2013-2021 Axel Huebl, Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -22,21 +22,18 @@
 
 namespace picongpu
 {
-
-struct DataHeader
-{
-
-    uint32_t byte;
-
-    DataHeader() : byte(0)
+    struct DataHeader
     {
-    }
+        uint32_t byte;
 
-    void writeToConsole(std::ostream& ocons) const
-    {
-        ocons << "DataHeader.byte " << byte << std::endl;
-    }
+        DataHeader() : byte(0)
+        {
+        }
 
-};
+        void writeToConsole(std::ostream& ocons) const
+        {
+            ocons << "DataHeader.byte " << byte << std::endl;
+        }
+    };
 
 } // namespace picongpu
diff --git a/include/picongpu/plugins/output/header/MessageHeader.hpp b/include/picongpu/plugins/output/header/MessageHeader.hpp
index f5d8b059fd..8c1325073b 100644
--- a/include/picongpu/plugins/output/header/MessageHeader.hpp
+++ b/include/picongpu/plugins/output/header/MessageHeader.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Heiko Burau, Rene Widera
+/* Copyright 2013-2021 Axel Huebl, Heiko Burau, Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -35,140 +35,142 @@
 
 namespace picongpu
 {
-
-struct MessageHeader
-{
-    using Size2D = WindowHeader::Size2D;
-
-    enum
+    struct MessageHeader
     {
-        realBytes = sizeof (DataHeader) + sizeof (SimHeader) + sizeof (WindowHeader) + sizeof (NodeHeader),
-        bytes = realBytes < 120 ? 128 : 256
-    };
-
-    template<class CellDesc >
-    void update(CellDesc & cellDesc,
-                picongpu::Window vWindow,
-                Size2D transpose,
-                uint32_t currentStep,
-                picongpu::float_32* cellSizeArr = nullptr,
-                const pmacc::DataSpace<CellDesc::Dim> gpus = pmacc::DataSpace<CellDesc::Dim > ())
-    {
-        using namespace pmacc;
-        using namespace picongpu;
+        using Size2D = WindowHeader::Size2D;
 
         enum
         {
-            Dim = CellDesc::Dim
+            realBytes = sizeof(DataHeader) + sizeof(SimHeader) + sizeof(WindowHeader) + sizeof(NodeHeader),
+            bytes = realBytes < 120 ? 128 : 256
         };
 
-        const DataSpace<Dim> localSize(cellDesc.getGridLayout().getDataSpaceWithoutGuarding());
-        const DataSpace<DIM2> localSize2D(localSize[transpose.x()], localSize[transpose.y()]);
+        template<class CellDesc>
+        void update(
+            CellDesc& cellDesc,
+            picongpu::Window vWindow,
+            Size2D transpose,
+            uint32_t currentStep,
+            picongpu::float_32* cellSizeArr = nullptr,
+            const pmacc::DataSpace<CellDesc::Dim> gpus = pmacc::DataSpace<CellDesc::Dim>())
+        {
+            using namespace pmacc;
+            using namespace picongpu;
 
-        /*update only if nuber of gpus are set, else use old value*/
-        if (gpus.productOfComponents() != 0)
-            sim.nodes = DataSpace<DIM2 > (gpus[transpose.x()], gpus[transpose.y()]);
+            enum
+            {
+                Dim = CellDesc::Dim
+            };
 
-        const SubGrid<simDim>& subGrid = Environment<simDim>::get().SubGrid();
+            const DataSpace<Dim> localSize(cellDesc.getGridLayout().getDataSpaceWithoutGuarding());
+            const DataSpace<DIM2> localSize2D(localSize[transpose.x()], localSize[transpose.y()]);
 
-        const DataSpace<Dim> globalSize(subGrid.getGlobalDomain().size);
-        sim.size.x() = globalSize[transpose.x()];
-        sim.size.y() = globalSize[transpose.y()];
+            /* update only if the number of GPUs is set, otherwise keep the old value */
+            if(gpus.productOfComponents() != 0)
+                sim.nodes = DataSpace<DIM2>(gpus[transpose.x()], gpus[transpose.y()]);
 
-        node.maxSize = DataSpace<DIM2 > (localSize[transpose.x()], localSize[transpose.y()]);
+            const SubGrid<simDim>& subGrid = Environment<simDim>::get().SubGrid();
 
-        const DataSpace<Dim> windowSize = vWindow.globalDimensions.size;
-        window.size = DataSpace<DIM2 > (windowSize[transpose.x()], windowSize[transpose.y()]);
+            const DataSpace<Dim> globalSize(subGrid.getGlobalDomain().size);
+            sim.size.x() = globalSize[transpose.x()];
+            sim.size.y() = globalSize[transpose.y()];
 
-        if (cellSizeArr != nullptr)
-        {
-            picongpu::float_32 scale[2];
-            scale[0] = cellSizeArr[transpose.x()];
-            scale[1] = cellSizeArr[transpose.y()];
-            sim.cellSizeArr[0] = cellSizeArr[transpose.x()];
-            sim.cellSizeArr[1] = cellSizeArr[transpose.y()];
+            node.maxSize = DataSpace<DIM2>(localSize[transpose.x()], localSize[transpose.y()]);
 
-            const picongpu::float_32 scale0to1 = scale[0] / scale[1];
+            const DataSpace<Dim> windowSize = vWindow.globalDimensions.size;
+            window.size = DataSpace<DIM2>(windowSize[transpose.x()], windowSize[transpose.y()]);
 
-            if (scale0to1 > 1.0f)
-            {
-                sim.setScale(scale0to1, 1.f);
-            }
-            else if (scale0to1 < 1.0f)
-            {
-                sim.setScale(1.f, 1.0f / scale0to1);
-            }
-            else
+            if(cellSizeArr != nullptr)
             {
-                sim.setScale(1.f, 1.f);
+                picongpu::float_32 scale[2];
+                scale[0] = cellSizeArr[transpose.x()];
+                scale[1] = cellSizeArr[transpose.y()];
+                sim.cellSizeArr[0] = cellSizeArr[transpose.x()];
+                sim.cellSizeArr[1] = cellSizeArr[transpose.y()];
+
+                const picongpu::float_32 scale0to1 = scale[0] / scale[1];
+
+                if(scale0to1 > 1.0f)
+                {
+                    sim.setScale(scale0to1, 1.f);
+                }
+                else if(scale0to1 < 1.0f)
+                {
+                    sim.setScale(1.f, 1.0f / scale0to1);
+                }
+                else
+                {
+                    sim.setScale(1.f, 1.f);
+                }
             }
-        }
 
-        const DataSpace<Dim> offsetToSimNull(subGrid.getLocalDomain().offset);
-        const DataSpace<Dim> windowOffsetToSimNull(vWindow.globalDimensions.offset);
-        const DataSpace<Dim> localOffset(vWindow.localDimensions.offset);
+            const DataSpace<Dim> offsetToSimNull(subGrid.getLocalDomain().offset);
+            const DataSpace<Dim> windowOffsetToSimNull(vWindow.globalDimensions.offset);
+            const DataSpace<Dim> localOffset(vWindow.localDimensions.offset);
 
-        const DataSpace<DIM2> localOffset2D(localOffset[transpose.x()], localOffset[transpose.y()]);
-        node.localOffset = localOffset2D;
+            const DataSpace<DIM2> localOffset2D(localOffset[transpose.x()], localOffset[transpose.y()]);
+            node.localOffset = localOffset2D;
 
-        DataSpace<Dim> offsetToWindow(offsetToSimNull - windowOffsetToSimNull);
+            DataSpace<Dim> offsetToWindow(offsetToSimNull - windowOffsetToSimNull);
 
-        const DataSpace<DIM2> offsetToWindow2D(offsetToWindow[transpose.x()], offsetToWindow[transpose.y()]);
-        node.offsetToWindow = offsetToWindow2D;
+            const DataSpace<DIM2> offsetToWindow2D(offsetToWindow[transpose.x()], offsetToWindow[transpose.y()]);
+            node.offsetToWindow = offsetToWindow2D;
 
-        const DataSpace<DIM2> offsetToSimNull2D(offsetToSimNull[transpose.x()], offsetToSimNull[transpose.y()]);
-        node.offset = offsetToSimNull2D;
+            const DataSpace<DIM2> offsetToSimNull2D(offsetToSimNull[transpose.x()], offsetToSimNull[transpose.y()]);
+            node.offset = offsetToSimNull2D;
 
-        const DataSpace<DIM2> windowOffsetToSimNull2D(windowOffsetToSimNull[transpose.x()], windowOffsetToSimNull[transpose.y()]);
-        window.offset = windowOffsetToSimNull2D;
+            const DataSpace<DIM2> windowOffsetToSimNull2D(
+                windowOffsetToSimNull[transpose.x()],
+                windowOffsetToSimNull[transpose.y()]);
+            window.offset = windowOffsetToSimNull2D;
 
-        const DataSpace<Dim> currentLocalSize(vWindow.localDimensions.size);
-        const DataSpace<DIM2> currentLocalSize2D(currentLocalSize[transpose.x()], currentLocalSize[transpose.y()]);
-        node.size = currentLocalSize2D;
+            const DataSpace<Dim> currentLocalSize(vWindow.localDimensions.size);
+            const DataSpace<DIM2> currentLocalSize2D(currentLocalSize[transpose.x()], currentLocalSize[transpose.y()]);
+            node.size = currentLocalSize2D;
 
-        sim.step = currentStep;
+            sim.step = currentStep;
 
-        /*add sliding windo informations to header*/
-        const uint32_t numSlides = MovingWindow::getInstance().getSlideCounter(currentStep);
-        sim.simOffsetToNull = DataSpace<DIM2 > ();
-        if (transpose.x() == 1)
-            sim.simOffsetToNull.x() = node.maxSize.x() * numSlides;
-        else if (transpose.y() == 1)
-            sim.simOffsetToNull.y() = node.maxSize.y() * numSlides;
+            /* add sliding window information to the header */
+            const uint32_t numSlides = MovingWindow::getInstance().getSlideCounter(currentStep);
+            sim.simOffsetToNull = DataSpace<DIM2>();
+            if(transpose.x() == 1)
+                sim.simOffsetToNull.x() = node.maxSize.x() * numSlides;
+            else if(transpose.y() == 1)
+                sim.simOffsetToNull.y() = node.maxSize.y() * numSlides;
+        }
 
-    }
+        static MessageHeader* create()
+        {
+            return (MessageHeader*) new uint8_t[bytes]; // raw byte buffer cast to a header; no constructor is run
+        }
 
-    static MessageHeader * create()
-    {
-        return (MessageHeader*) new uint8_t[bytes];
-    }
+        static void destroy(MessageHeader* obj)
+        {
+            __deleteArray(obj); // NOTE(review): confirm this matches the new uint8_t[] in create()
+        }
 
-    static void destroy(MessageHeader * obj)
-    {
-        __deleteArray(obj);
-    }
+        MessageHeader& operator=(MessageHeader const&) = default;
 
-    DataHeader data;
-    SimHeader sim;
-    WindowHeader window;
-    NodeHeader node;
-    //ColorHeader color; will be used later on to save channel ranges
+        DataHeader data;
+        SimHeader sim;
+        WindowHeader window;
+        NodeHeader node;
+        // ColorHeader color; will be used later on to save channel ranges
 
-    void writeToConsole(std::ostream& ocons) const
-    {
-        data.writeToConsole(ocons);
-        sim.writeToConsole(ocons);
-        window.writeToConsole(ocons);
-        node.writeToConsole(ocons);
-    }
-
-private:
-    /** constructor
-     *
-     * it is only allowed to create Message header with @see create()
-     */
-    MessageHeader();
-
-};
+        void writeToConsole(std::ostream& ocons) const
+        {
+            data.writeToConsole(ocons);
+            sim.writeToConsole(ocons);
+            window.writeToConsole(ocons);
+            node.writeToConsole(ocons);
+        }
+
+    private:
+        /** constructor
+         *
+         * a MessageHeader may only be created via @see create()
+         */
+        MessageHeader();
+    };
 
 } // namespace picongpu
diff --git a/include/picongpu/plugins/output/header/NodeHeader.hpp b/include/picongpu/plugins/output/header/NodeHeader.hpp
index 8020525362..2b6a9e9414 100644
--- a/include/picongpu/plugins/output/header/NodeHeader.hpp
+++ b/include/picongpu/plugins/output/header/NodeHeader.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Heiko Burau, Rene Widera
+/* Copyright 2013-2021 Axel Huebl, Heiko Burau, Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -28,36 +28,34 @@
 
 namespace picongpu
 {
-
-struct NodeHeader
-{
-    typedef pmacc::DataSpace<DIM2> Size2D;
-
-    Size2D maxSize;
-    Size2D size;
-    Size2D offset;
-    Size2D localOffset; //not valid data
-    Size2D offsetToWindow;
-
-    Size2D getLocalOffsetToWindow()
-    {
-        Size2D tmp(offsetToWindow);
-        if (tmp.x() < 0)
-            tmp.x() = 0;
-        if (tmp.y() < 0)
-            tmp.y() = 0;
-        return tmp;
-    }
-
-    void writeToConsole(std::ostream& ocons) const
+    struct NodeHeader
     {
-        ocons << "NodeHeader.maxSize " << maxSize.x() << " " << maxSize.y() << std::endl;
-        ocons << "NodeHeader.size " << size.x() << " " << size.y() << std::endl;
-        ocons << "NodeHeader.localOffset " << localOffset.x() << " " << localOffset.y() << std::endl;
-        ocons << "NodeHeader.offset " << offset.x() << " " << offset.y() << std::endl;
-        ocons << "NodeHeader.offsetToWindow " << offsetToWindow.x() << " " << offsetToWindow.y() << std::endl;
-    }
-
-};
+        typedef pmacc::DataSpace<DIM2> Size2D;
+
+        Size2D maxSize;
+        Size2D size;
+        Size2D offset;
+        Size2D localOffset; // not valid data
+        Size2D offsetToWindow;
+
+        Size2D getLocalOffsetToWindow()
+        {
+            Size2D tmp(offsetToWindow); // work on a copy; the member stays unchanged
+            if(tmp.x() < 0)
+                tmp.x() = 0; // clamp a negative x component to zero
+            if(tmp.y() < 0)
+                tmp.y() = 0; // clamp a negative y component to zero
+            return tmp;
+        }
+
+        void writeToConsole(std::ostream& ocons) const
+        {
+            ocons << "NodeHeader.maxSize " << maxSize.x() << " " << maxSize.y() << std::endl;
+            ocons << "NodeHeader.size " << size.x() << " " << size.y() << std::endl;
+            ocons << "NodeHeader.localOffset " << localOffset.x() << " " << localOffset.y() << std::endl;
+            ocons << "NodeHeader.offset " << offset.x() << " " << offset.y() << std::endl;
+            ocons << "NodeHeader.offsetToWindow " << offsetToWindow.x() << " " << offsetToWindow.y() << std::endl;
+        }
+    };
 
 } // namespace picongpu
diff --git a/include/picongpu/plugins/output/header/SimHeader.hpp b/include/picongpu/plugins/output/header/SimHeader.hpp
index b829a63d70..b57035528c 100644
--- a/include/picongpu/plugins/output/header/SimHeader.hpp
+++ b/include/picongpu/plugins/output/header/SimHeader.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Heiko Burau, Rene Widera
+/* Copyright 2013-2021 Axel Huebl, Heiko Burau, Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -28,42 +28,40 @@
 
 namespace picongpu
 {
+    struct SimHeader
+    {
+        typedef pmacc::DataSpace<DIM2> Size2D;
 
-struct SimHeader
-{
-    typedef pmacc::DataSpace<DIM2> Size2D;
-
-    Size2D size;
-    Size2D nodes;
-    Size2D simOffsetToNull;
-    uint32_t step;
-    picongpu::float_32 scale[2];
-    picongpu::float_32 cellSizeArr[2];
+        Size2D size;
+        Size2D nodes;
+        Size2D simOffsetToNull;
+        uint32_t step;
+        picongpu::float_32 scale[2];
+        picongpu::float_32 cellSizeArr[2];
 
 
-    SimHeader() : step(0)
-    {
-        scale[0] = 1.f;
-        scale[1] = 1.f;
-        cellSizeArr[0] = 0.f;
-        cellSizeArr[1] = 0.f;
-    }
+        SimHeader() : step(0)
+        {
+            scale[0] = 1.f;
+            scale[1] = 1.f;
+            cellSizeArr[0] = 0.f;
+            cellSizeArr[1] = 0.f;
+        }
 
-    void setScale(picongpu::float_32 x, picongpu::float_32 y)
-    {
-        scale[0] = x;
-        scale[1] = y;
-    }
-
-    void writeToConsole(std::ostream& ocons) const
-    {
-        ocons << "SimHeader.size " << size.x() << " " << size.y() << std::endl;
-        ocons << "SimHeader.nodes " << nodes.x() << " " << nodes.y() << std::endl;
-        ocons << "SimHeader.step " << step << std::endl;
-        ocons << "SimHeader.scale " << scale[0] << " " << scale[1] << std::endl;
-        ocons << "SimHeader.cellSize " << cellSizeArr[0] << " " << cellSizeArr[1] << std::endl;
-    }
+        void setScale(picongpu::float_32 x, picongpu::float_32 y)
+        {
+            scale[0] = x;
+            scale[1] = y;
+        }
 
-};
+        void writeToConsole(std::ostream& ocons) const
+        {
+            ocons << "SimHeader.size " << size.x() << " " << size.y() << std::endl;
+            ocons << "SimHeader.nodes " << nodes.x() << " " << nodes.y() << std::endl;
+            ocons << "SimHeader.step " << step << std::endl;
+            ocons << "SimHeader.scale " << scale[0] << " " << scale[1] << std::endl;
+            ocons << "SimHeader.cellSize " << cellSizeArr[0] << " " << cellSizeArr[1] << std::endl;
+        }
+    };
 
 } // namespace picongpu
diff --git a/include/picongpu/plugins/output/header/WindowHeader.hpp b/include/picongpu/plugins/output/header/WindowHeader.hpp
index 0a7729dbec..663fc43b35 100644
--- a/include/picongpu/plugins/output/header/WindowHeader.hpp
+++ b/include/picongpu/plugins/output/header/WindowHeader.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Heiko Burau, Rene Widera
+/* Copyright 2013-2021 Axel Huebl, Heiko Burau, Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -28,20 +28,18 @@
 
 namespace picongpu
 {
-
-struct WindowHeader
-{
-    using Size2D = pmacc::DataSpace< DIM2 >;
-
-    Size2D size;
-    Size2D offset;
-
-    void writeToConsole(std::ostream& ocons) const
+    struct WindowHeader
     {
-        ocons << "WindowHeader.size " << size.x() << " " << size.y() << std::endl;
-        ocons << "WindowHeader.offset " << offset.x() << " " << offset.y() << std::endl;
-    }
+        using Size2D = pmacc::DataSpace<DIM2>;
+
+        Size2D size;
+        Size2D offset;
 
-};
+        void writeToConsole(std::ostream& ocons) const
+        {
+            ocons << "WindowHeader.size " << size.x() << " " << size.y() << std::endl;
+            ocons << "WindowHeader.offset " << offset.x() << " " << offset.y() << std::endl;
+        }
+    };
 
 } // namespace picongpu
diff --git a/include/picongpu/plugins/output/images/PngCreator.hpp b/include/picongpu/plugins/output/images/PngCreator.hpp
index 90b3cc802a..29383cb3bc 100644
--- a/include/picongpu/plugins/output/images/PngCreator.hpp
+++ b/include/picongpu/plugins/output/images/PngCreator.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Heiko Burau, Rene Widera
+/* Copyright 2013-2021 Axel Huebl, Heiko Burau, Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -39,12 +39,11 @@ namespace picongpu
 
     struct PngCreator
     {
-
-        PngCreator(std::string name, std::string folder) :
-            m_name(folder + "/" + name),
-            m_folder(folder),
-            m_createFolder(true),
-            m_isThreadActive(false)
+        PngCreator(std::string name, std::string folder)
+            : m_name(folder + "/" + name)
+            , m_folder(folder)
+            , m_createFolder(true)
+            , m_isThreadActive(false)
         {
         }
 
@@ -93,10 +92,7 @@ namespace picongpu
          * @param header meta information about the simulation
          */
         template<class Box>
-        void operator()(
-                        const Box data,
-                        const MessageHeader::Size2D size,
-                        const MessageHeader  header)
+        void operator()(const Box data, const MessageHeader::Size2D size, const MessageHeader header)
         {
             if(m_isThreadActive)
             {
@@ -107,11 +103,8 @@ namespace picongpu
         }
 
     private:
-
         template<class Box>
-        void createImage(const Box data,
-                        const MessageHeader::Size2D size,
-                        const MessageHeader header);
+        void createImage(const Box data, const MessageHeader::Size2D size, const MessageHeader header);
 
         std::string m_name;
         std::string m_folder;
@@ -119,7 +112,6 @@ namespace picongpu
         std::thread workerThread;
         /* status whether a thread is currently active */
         bool m_isThreadActive;
-
     };
 
 } /* namespace picongpu */
diff --git a/include/picongpu/plugins/output/images/PngCreator.tpp b/include/picongpu/plugins/output/images/PngCreator.tpp
index 3a9b12ae14..42cd3045db 100644
--- a/include/picongpu/plugins/output/images/PngCreator.tpp
+++ b/include/picongpu/plugins/output/images/PngCreator.tpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Heiko Burau, Rene Widera
+/* Copyright 2013-2021 Axel Huebl, Heiko Burau, Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -34,90 +34,83 @@
 #include <sstream>
 #include <iomanip>
 
-#if( PIC_ENABLE_PNG == 1 )
-#   include <pngwriter.h>
+#if(PIC_ENABLE_PNG == 1)
+#    include <pngwriter.h>
 #endif
 
 namespace picongpu
 {
-    template< class Box >
-    inline void PngCreator::createImage(
-        const Box data,
-        const MessageHeader::Size2D size,
-        const MessageHeader header
-    )
+    template<class Box>
+    inline void PngCreator::createImage(const Box data, const MessageHeader::Size2D size, const MessageHeader header)
     {
-#if( PIC_ENABLE_PNG == 1 )
-        if ( m_createFolder )
+#if(PIC_ENABLE_PNG == 1)
+        if(m_createFolder)
         {
-            Environment< simDim >::get( ).Filesystem( ).createDirectoryWithPermissions( m_folder );
+            Environment<simDim>::get().Filesystem().createDirectoryWithPermissions(m_folder);
             m_createFolder = false;
         }
 
         std::stringstream step;
-        step << std::setw( 6 ) << std::setfill( '0' ) << header.sim.step;
-        std::string filename( m_name + "_" + step.str( ) + ".png" );
+        step << std::setw(6) << std::setfill('0') << header.sim.step;
+        std::string filename(m_name + "_" + step.str() + ".png");
 
-        pngwriter png( size.x( ), size.y( ), 0, filename.c_str( ) );
+        pngwriter png(size.x(), size.y(), 0, filename.c_str());
 
         /* default compression: 6
          * zlib level 1 is ~12% bigger but ~2.3x faster in write_png( )
          */
-        png.setcompressionlevel( 1 );
+        png.setcompressionlevel(1);
 
-        //PngWriter coordinate system begin with 1,1
-        for( int y = 0; y < size.y( ); ++y)
+        // PNGwriter's coordinate system begins at (1,1)
+        for(int y = 0; y < size.y(); ++y)
         {
-            for( int x = 0; x < size.x( ); ++x )
+            for(int x = 0; x < size.x(); ++x)
             {
-                float3_X p = data[ y ][ x ];
-                png.plot( x + 1, size.y( ) - y, p.x( ), p.y( ), p.z( ) );
+                float3_X p = data[y][x];
+                png.plot(x + 1, size.y() - y, p.x(), p.y(), p.z());
             }
         }
 
         /* scale the image by a user defined relative factor
          * `scale_image` is defined in `png.param`
          */
-        float_X scale_x( scale_image );
-        float_X scale_y( scale_image );
+        float_X scale_x(scale_image);
+        float_X scale_y(scale_image);
 
 
-        if( scale_to_cellsize )
+        if(scale_to_cellsize)
         {
             // scale to real cell size
-            scale_x *= header.sim.scale[ 0 ];
-            scale_y *= header.sim.scale[ 1 ];
+            scale_x *= header.sim.scale[0];
+            scale_y *= header.sim.scale[1];
         }
 
         /* to prevent artifacts scale only, if at least one of scale_x and
          * scale_y is != 1.0
          */
-        if( ( scale_x != float_X( 1.0 ) ) ||
-            ( scale_y != float_X( 1.0 ) )
-        )
-            //process the cell size and by factor scaling within one step
-            png.scale_kxky( scale_x, scale_y );
+        if((scale_x != float_X(1.0)) || (scale_y != float_X(1.0)))
+            // apply the cell-size scaling and the user-defined factor in one step
+            png.scale_kxky(scale_x, scale_y);
 
         // add some meta information
-        //header.writeToConsole( std::cout );
+        // header.writeToConsole( std::cout );
 
-        std::ostringstream description( std::ostringstream::out );
-        header.writeToConsole( description );
+        std::ostringstream description(std::ostringstream::out);
+        header.writeToConsole(description);
 
-        char title[ ] = "PIConGPU preview image";
-        std::string author = Environment<>::get().SimulationDescription().getAuthor( );
-        char software[ ] = "PIConGPU with PNGwriter";
+        char title[] = "PIConGPU preview image";
+        std::string author = Environment<>::get().SimulationDescription().getAuthor();
+        char software[] = "PIConGPU with PNGwriter";
 
-        png.settext( title, author.c_str( ), description.str( ).c_str( ), software );
+        png.settext(title, author.c_str(), description.str().c_str(), software);
 
         // write to disk and close object
-        png.close( );
+        png.close();
 #else
-        boost::ignore_unused( data, size, header );
+        boost::ignore_unused(data, size, header);
         /* always fail with an exception at runtime */
-        PMACC_VERIFY_MSG( false, "not allowed to call createImage (missing dependency PNGwriter)" );
+        PMACC_VERIFY_MSG(false, "not allowed to call createImage (missing dependency PNGwriter)");
 #endif
-
     }
 
 } /* namespace picongpu */
diff --git a/include/picongpu/plugins/output/images/Visualisation.hpp b/include/picongpu/plugins/output/images/Visualisation.hpp
index df35e5aa4b..61ffadbe6e 100644
--- a/include/picongpu/plugins/output/images/Visualisation.hpp
+++ b/include/picongpu/plugins/output/images/Visualisation.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Heiko Burau, Rene Widera, Richard Pausch, Felix Schmitt
+/* Copyright 2013-2021 Axel Huebl, Heiko Burau, Rene Widera, Richard Pausch, Felix Schmitt
  *
  * This file is part of PIConGPU.
  *
@@ -60,278 +60,240 @@
 
 namespace picongpu
 {
-
-// normalize EM fields to typical laser or plasma quantities
-//-1: Auto:    enable adaptive scaling for each output
-// 1: Laser:   typical fields calculated out of the laser amplitude
-// 2: Drift:   outdated
-// 3: PlWave:  typical fields calculated out of the plasma freq.,
-//             assuming the wave moves approx. with c
-// 4: Thermal: outdated
-// 5: BlowOut: typical fields, assuming that a LWFA in the blowout
-//             regime causes a bubble with radius of approx. the laser's
-//             beam waist (use for bubble fields)
-///  \return float3_X( tyBField, tyEField, tyCurrent )
-
-template< int T >
-struct typicalFields
-{
-
-    HDINLINE static float3_X get()
+    // normalize EM fields to typical laser or plasma quantities
+    //-1: Auto:    enable adaptive scaling for each output
+    // 1: Laser:   typical fields calculated out of the laser amplitude
+    // 2: Drift:   outdated
+    // 3: PlWave:  typical fields calculated out of the plasma freq.,
+    //             assuming the wave moves approx. with c
+    // 4: Thermal: outdated
+    // 5: BlowOut: typical fields, assuming that a LWFA in the blowout
+    //             regime causes a bubble with radius of approx. the laser's
+    //             beam waist (use for bubble fields)
+    ///  \return float3_X( tyBField, tyEField, tyCurrent )
+
+    template<int T>
+    struct typicalFields
     {
-        return float3_X(float_X(1.0), float_X(1.0), float_X(1.0));
-    }
-};
-
-template< >
-struct typicalFields < -1 >
-{
+        HDINLINE static float3_X get()
+        {
+            return float3_X(float_X(1.0), float_X(1.0), float_X(1.0));
+        }
+    };
 
-    HDINLINE static float3_X get()
+    template<>
+    struct typicalFields<-1>
     {
-        return float3_X(float_X(1.0), float_X(1.0), float_X(1.0));
-    }
-};
-
-template< >
-struct typicalFields < 1 >
-{
+        HDINLINE static float3_X get()
+        {
+            return float3_X(float_X(1.0), float_X(1.0), float_X(1.0));
+        }
+    };
 
-    HDINLINE static float3_X get()
+    template<>
+    struct typicalFields<1>
     {
+        HDINLINE static float3_X get()
+        {
 #if !(EM_FIELD_SCALE_CHANNEL1 == 1 || EM_FIELD_SCALE_CHANNEL2 == 1 || EM_FIELD_SCALE_CHANNEL3 == 1)
-        return float3_X(float_X(1.0), float_X(1.0), float_X(1.0));
+            return float3_X(float_X(1.0), float_X(1.0), float_X(1.0));
 #else
-        const float_X tyCurrent = particles::TYPICAL_PARTICLES_PER_CELL * particles::TYPICAL_NUM_PARTICLES_PER_MACROPARTICLE
-            * abs(BASE_CHARGE) / DELTA_T;
-        const float_X tyEField = fields::laserProfiles::Selected::Unitless::AMPLITUDE + FLT_MIN;
-        const float_X tyBField = tyEField * MUE0_EPS0;
+            constexpr auto baseCharge = BASE_CHARGE;
+            const float_X tyCurrent = particles::TYPICAL_PARTICLES_PER_CELL
+                * particles::TYPICAL_NUM_PARTICLES_PER_MACROPARTICLE * math::abs(baseCharge) / DELTA_T;
+            const float_X tyEField = fields::laserProfiles::Selected::Unitless::AMPLITUDE + FLT_MIN;
+            const float_X tyBField = tyEField * MUE0_EPS0;
 
-        return float3_X(tyBField, tyEField, tyCurrent);
+            return float3_X(tyBField, tyEField, tyCurrent);
 #endif
-    }
-};
-
+        }
+    };
 
-/* outdated drift normalization */
-template< >
-struct typicalFields < 2 >;
 
-template< >
-struct typicalFields < 3 >
-{
+    /* outdated drift normalization */
+    template<>
+    struct typicalFields<2>;
 
-    HDINLINE static float3_X get()
+    template<>
+    struct typicalFields<3>
     {
+        HDINLINE static float3_X get()
+        {
 #if !(EM_FIELD_SCALE_CHANNEL1 == 3 || EM_FIELD_SCALE_CHANNEL2 == 3 || EM_FIELD_SCALE_CHANNEL3 == 3)
-        return float3_X(float_X(1.0), float_X(1.0), float_X(1.0));
+            return float3_X(float_X(1.0), float_X(1.0), float_X(1.0));
 #else
-        const float_X lambda_pl = pmacc::algorithms::math::Pi< float_X >::doubleValue *
-            SPEED_OF_LIGHT * sqrt(BASE_MASS * EPS0 / BASE_DENSITY / BASE_CHARGE / BASE_CHARGE);
-        const float_X tyEField = lambda_pl * BASE_DENSITY / 3.0f / EPS0;
-        const float_X tyBField = tyEField * MUE0_EPS0;
-        const float_X tyCurrent = tyBField / MUE0;
-
-        return float3_X(tyBField, tyEField, tyCurrent);
+            constexpr auto baseCharge = BASE_CHARGE;
+            const float_X lambda_pl = pmacc::math::Pi<float_X>::doubleValue * SPEED_OF_LIGHT
+                * sqrt(BASE_MASS * EPS0 / BASE_DENSITY / baseCharge / baseCharge);
+            const float_X tyEField = lambda_pl * BASE_DENSITY / 3.0f / EPS0;
+            const float_X tyBField = tyEField * MUE0_EPS0;
+            const float_X tyCurrent = tyBField / MUE0;
+
+            return float3_X(tyBField, tyEField, tyCurrent);
 #endif
-    }
-};
-
-/* outdated ELECTRON_TEMPERATURE normalization */
-template< >
-struct typicalFields < 4 >;
+        }
+    };
 
-template< >
-struct typicalFields < 5 >
-{
+    /* outdated ELECTRON_TEMPERATURE normalization */
+    template<>
+    struct typicalFields<4>;
 
-    HDINLINE static float3_X get()
+    template<>
+    struct typicalFields<5>
     {
+        HDINLINE static float3_X get()
+        {
 #if !(EM_FIELD_SCALE_CHANNEL1 == 5 || EM_FIELD_SCALE_CHANNEL2 == 5 || EM_FIELD_SCALE_CHANNEL3 == 5)
-        return float3_X(float_X(1.0), float_X(1.0), float_X(1.0));
+            return float3_X(float_X(1.0), float_X(1.0), float_X(1.0));
 #else
-        const float_X tyEField = fields::laserProfiles::Selected::Unitless::W0 * BASE_DENSITY / 3.0f / EPS0;
-        const float_X tyBField = tyEField * MUE0_EPS0;
-        const float_X tyCurrent = particles::TYPICAL_PARTICLES_PER_CELL * particles::TYPICAL_NUM_PARTICLES_PER_MACROPARTICLE
-            * abs(BASE_CHARGE) / DELTA_T;
+            constexpr auto baseCharge = BASE_CHARGE;
+            const float_X tyEField = fields::laserProfiles::Selected::Unitless::W0 * BASE_DENSITY / 3.0f / EPS0;
+            const float_X tyBField = tyEField * MUE0_EPS0;
+            const float_X tyCurrent = particles::TYPICAL_PARTICLES_PER_CELL
+                * particles::TYPICAL_NUM_PARTICLES_PER_MACROPARTICLE * math::abs(baseCharge) / DELTA_T;
 
-        return float3_X(tyBField, tyEField, tyCurrent);
+            return float3_X(tyBField, tyEField, tyCurrent);
 #endif
-    }
-};
-
+        }
+    };
 
-/** Check if an offset is part of the slicing domain
- *
- * Check if a N dimensional local domain offset is equal to a scalar offset of
- * a given dimension.
- * The results can be taken to decide if a cell is within a slice of a volume.
- */
-template< uint32_t T_dim = simDim >
-struct IsPartOfSlice;
 
-template< >
-struct IsPartOfSlice< DIM3 >
-{
-    /** perform check
-     *
-     * @param cellOffset cell offset relative to the origin of the local domain
-     * @param sliceDim dimension of the slice
-     * @param localDomainOffset local domain offset relative to the origin of the global domain
-     *                          (in the slice dimension)
-     * @param sliceOffset cell offset of the slice relative to the origin of the global domain
-     *                         ( in the slice dimension)
-     * @return true if cellOffset is part of the slicing domain, else false
+    /** Check if an offset is part of the slicing domain
      *
-     * @return always true
+     * Check if a N dimensional local domain offset is equal to a scalar offset of
+     * a given dimension.
+     * The results can be taken to decide if a cell is within a slice of a volume.
      */
-    template< typename T_Space >
-    HDINLINE bool operator()(
-        T_Space const & cellOffset,
-        uint32_t const sliceDim,
-        uint32_t const localDomainOffset,
-        uint32_t const sliceOffset
-    )
+    template<uint32_t T_dim = simDim>
+    struct IsPartOfSlice;
+
+    template<>
+    struct IsPartOfSlice<DIM3>
     {
-        // offset of the cell relative to the global origin
-        uint32_t const localCellOffset = cellOffset[ sliceDim ] + localDomainOffset;
-        return localCellOffset == sliceOffset;
-    }
-};
-
-template< >
-struct IsPartOfSlice< DIM2 >
-{
-    /** perform check
-     *
-     * @return always true
-     */
-    template< typename T_Space >
-    HDINLINE bool operator()(
-        T_Space const &,
-        uint32_t const,
-        uint32_t const,
-        uint32_t const
-    )
+        /** perform check
+         *
+         * @param cellOffset cell offset relative to the origin of the local domain
+         * @param sliceDim dimension of the slice
+         * @param localDomainOffset local domain offset relative to the origin of the global domain
+         *                          (in the slice dimension)
+         * @param sliceOffset cell offset of the slice relative to the origin of the global domain
+         *                     (in the slice dimension)
+         *
+         * @return true if cellOffset is part of the slicing domain (its global offset in
+         *         dimension sliceDim equals sliceOffset), else false
+         */
+        template<typename T_Space>
+        HDINLINE bool operator()(
+            T_Space const& cellOffset,
+            uint32_t const sliceDim,
+            uint32_t const localDomainOffset,
+            uint32_t const sliceOffset)
+        {
+            // offset of the cell relative to the global origin
+            uint32_t const localCellOffset = cellOffset[sliceDim] + localDomainOffset;
+            return localCellOffset == sliceOffset;
+        }
+    };
+
+    template<>
+    struct IsPartOfSlice<DIM2>
     {
-        return true;
-    }
-};
+        /** perform check
+         *
+         * @return always true
+         */
+        template<typename T_Space>
+        HDINLINE bool operator()(T_Space const&, uint32_t const, uint32_t const, uint32_t const)
+        {
+            return true;
+        }
+    };
 
-/** derives two dimensional field from a slice of field
- *
- * @tparam T_numWorkers number of workers
- */
-template< uint32_t T_numWorkers >
-struct KernelPaintFields
-{
-    /** derive field values
+    /** derives two dimensional field from a slice of field
      *
-     * @tparam T_EBox pmacc::DataBox, electric field box type
-     * @tparam T_BBox pmacc::DataBox, magnetic field box type
-     * @tparam T_JBox particle current box type
-     * @tparam T_Mapping mapper functor type
-     * @tparam T_Acc alpaka accelerator type
-     *
-     * @param acc alpaka accelerator
-     * @param fieldE electric field
-     * @param fieldB magnetic field
-     * @param fieldJ field with particle current
-     * @param image[in,out] two dimensional image (without guarding cells)
-     * @param transpose indices to transpose dimensions range per dimension [0,simDim)
-     * @param slice offset (in cells) of the slice in the dimension sliceDim relative to
-     *              the origin of the global domain
-     * @param localDomainOffset offset (in cells) of the local domain relative to the
-     *                          origin of the global domain
-     * @param sliceDim dimension to slice range [0,simDim)
-     * @param mapper functor to map a block to a supercell
+     * @tparam T_numWorkers number of workers
      */
-    template<
-        typename T_EBox,
-        typename T_BBox,
-        typename T_JBox,
-        typename T_Mapping,
-        typename T_Acc
-    >
-    DINLINE void operator()(
-        T_Acc const & acc,
-        T_EBox const fieldE,
-        T_BBox const fieldB,
-        T_JBox const fieldJ,
-        DataBox<
-            PitchedBox<
-                float3_X,
-                DIM2
-            >
-        > image,
-        DataSpace< DIM2 > const transpose,
-        int const slice,
-        uint32_t const localDomainOffset,
-        uint32_t const sliceDim,
-        T_Mapping mapper
-    ) const
+    template<uint32_t T_numWorkers>
+    struct KernelPaintFields
     {
-        using namespace mappings::threads;
+        /** derive field values
+         *
+         * @tparam T_EBox pmacc::DataBox, electric field box type
+         * @tparam T_BBox pmacc::DataBox, magnetic field box type
+         * @tparam T_JBox particle current box type
+         * @tparam T_Mapping mapper functor type
+         * @tparam T_Acc alpaka accelerator type
+         *
+         * @param acc alpaka accelerator
+         * @param fieldE electric field
+         * @param fieldB magnetic field
+         * @param fieldJ field with particle current
+         * @param[in,out] image two dimensional image (without guarding cells)
+         * @param transpose indices to transpose dimensions range per dimension [0,simDim)
+         * @param slice offset (in cells) of the slice in the dimension sliceDim relative to
+         *              the origin of the global domain
+         * @param localDomainOffset offset (in cells) of the local domain relative to the
+         *                          origin of the global domain
+         * @param sliceDim dimension to slice range [0,simDim)
+         * @param mapper functor to map a block to a supercell
+         */
+        template<typename T_EBox, typename T_BBox, typename T_JBox, typename T_Mapping, typename T_Acc>
+        DINLINE void operator()(
+            T_Acc const& acc,
+            T_EBox const fieldE,
+            T_BBox const fieldB,
+            T_JBox const fieldJ,
+            DataBox<PitchedBox<float3_X, DIM2>> image,
+            DataSpace<DIM2> const transpose,
+            int const slice,
+            uint32_t const localDomainOffset,
+            uint32_t const sliceDim,
+            T_Mapping mapper) const
+        {
+            using namespace mappings::threads;
 
-        using SuperCellSize = typename T_Mapping::SuperCellSize;
+            using SuperCellSize = typename T_Mapping::SuperCellSize;
 
-        constexpr uint32_t cellsPerSupercell = pmacc::math::CT::volume< SuperCellSize >::type::value;
-        constexpr uint32_t numWorkers = T_numWorkers;
+            constexpr uint32_t cellsPerSupercell = pmacc::math::CT::volume<SuperCellSize>::type::value;
+            constexpr uint32_t numWorkers = T_numWorkers;
 
-        uint32_t const workerIdx = threadIdx.x;
+            uint32_t const workerIdx = cupla::threadIdx(acc).x;
 
-        DataSpace< simDim > const suplercellIdx = mapper.getSuperCellIndex( DataSpace< simDim >( blockIdx ) );
-        // offset of the supercell (in cells) to the origin of the local domain
-        DataSpace< simDim > const supercellCellOffset(
-            ( suplercellIdx - mapper.getGuardingSuperCells( ) ) * SuperCellSize::toRT( )
-        );
+            DataSpace<simDim> const suplercellIdx = mapper.getSuperCellIndex(DataSpace<simDim>(cupla::blockIdx(acc)));
+            // offset of the supercell (in cells) to the origin of the local domain
+            DataSpace<simDim> const supercellCellOffset(
+                (suplercellIdx - mapper.getGuardingSuperCells()) * SuperCellSize::toRT());
 
-        using SupercellDomCfg = IdxConfig<
-            cellsPerSupercell,
-            numWorkers
-        >;
+            using SupercellDomCfg = IdxConfig<cellsPerSupercell, numWorkers>;
 
-        // each cell in a supercell is handled as a virtual worker
-        ForEachIdx< SupercellDomCfg > forEachCell( workerIdx );
+            // each cell in a supercell is handled as a virtual worker
+            ForEachIdx<SupercellDomCfg> forEachCell(workerIdx);
 
-        forEachCell(
-            [&](
-                uint32_t const linearIdx,
-                uint32_t const
-            )
-            {
+            forEachCell([&](uint32_t const linearIdx, uint32_t const) {
                 // cell index within the superCell
-                DataSpace< simDim > const cellIdx = DataSpaceOperations< simDim >::template map< SuperCellSize >( linearIdx );
+                DataSpace<simDim> const cellIdx = DataSpaceOperations<simDim>::template map<SuperCellSize>(linearIdx);
                 // offset to the origin of the local domain + guarding cells
-                DataSpace< simDim > const cellOffset( suplercellIdx * SuperCellSize::toRT() + cellIdx );
+                DataSpace<simDim> const cellOffset(suplercellIdx * SuperCellSize::toRT() + cellIdx);
                 // cell offset without guarding cells
-                DataSpace< simDim > const realCell( supercellCellOffset + cellIdx );
+                DataSpace<simDim> const realCell(supercellCellOffset + cellIdx);
                 // offset within the two dimensional result buffer
-                DataSpace< DIM2 > const imageCell(
-                    realCell[ transpose.x( ) ],
-                    realCell[ transpose.y( ) ]
-                );
-
-                bool const isCellOnSlice = IsPartOfSlice< >{}(
-                    realCell,
-                    sliceDim,
-                    localDomainOffset,
-                    slice
-                );
+                DataSpace<DIM2> const imageCell(realCell[transpose.x()], realCell[transpose.y()]);
+
+                bool const isCellOnSlice = IsPartOfSlice<>{}(realCell, sliceDim, localDomainOffset, slice);
 
                 /* if the virtual worker is not calculating a cell out of the
                  * selected slice then exit
                  */
-                if( !isCellOnSlice )
+                if(!isCellOnSlice)
                     return;
 
                 // set fields of this cell to vars
-                typename T_BBox::ValueType field_b = fieldB( cellOffset );
-                typename T_EBox::ValueType field_e = fieldE( cellOffset );
-                typename T_JBox::ValueType field_j = fieldJ( cellOffset );
+                typename T_BBox::ValueType field_b = fieldB(cellOffset);
+                typename T_EBox::ValueType field_e = fieldE(cellOffset);
+                typename T_JBox::ValueType field_j = fieldJ(cellOffset);
 
                 // multiply with the area size of each plane
-                field_j *= float3_X::create( CELL_VOLUME ) / cellSize;
+                field_j *= float3_X::create(CELL_VOLUME) / cellSize;
 
                 /* reset picture to black
                  *   color range for each RGB channel: [0.0, 1.0]
@@ -342,303 +304,202 @@ struct KernelPaintFields
                      * [1] = EField normalization, [2] = Current normalization
                      */
                     visPreview::preChannel1(
-                        field_b / typicalFields< EM_FIELD_SCALE_CHANNEL1 >::get( )[ 0 ],
-                        field_e / typicalFields< EM_FIELD_SCALE_CHANNEL1 >::get( )[ 1 ],
-                        field_j / typicalFields< EM_FIELD_SCALE_CHANNEL1 >::get( )[ 2 ]
-                    ),
+                        field_b / typicalFields<EM_FIELD_SCALE_CHANNEL1>::get()[0],
+                        field_e / typicalFields<EM_FIELD_SCALE_CHANNEL1>::get()[1],
+                        field_j / typicalFields<EM_FIELD_SCALE_CHANNEL1>::get()[2]),
                     visPreview::preChannel2(
-                        field_b / typicalFields< EM_FIELD_SCALE_CHANNEL2 >::get( )[ 0 ],
-                        field_e / typicalFields< EM_FIELD_SCALE_CHANNEL2 >::get( )[ 1 ],
-                        field_j / typicalFields< EM_FIELD_SCALE_CHANNEL2 >::get( )[ 2 ]
-                    ),
+                        field_b / typicalFields<EM_FIELD_SCALE_CHANNEL2>::get()[0],
+                        field_e / typicalFields<EM_FIELD_SCALE_CHANNEL2>::get()[1],
+                        field_j / typicalFields<EM_FIELD_SCALE_CHANNEL2>::get()[2]),
                     visPreview::preChannel3(
-                        field_b / typicalFields< EM_FIELD_SCALE_CHANNEL3 >::get( )[ 0 ],
-                        field_e / typicalFields< EM_FIELD_SCALE_CHANNEL3 >::get( )[ 1 ],
-                        field_j / typicalFields< EM_FIELD_SCALE_CHANNEL3 >::get( )[ 2 ]
-                    )
-                );
+                        field_b / typicalFields<EM_FIELD_SCALE_CHANNEL3>::get()[0],
+                        field_e / typicalFields<EM_FIELD_SCALE_CHANNEL3>::get()[1],
+                        field_j / typicalFields<EM_FIELD_SCALE_CHANNEL3>::get()[2]));
 
                 // draw to (perhaps smaller) image cell
-                image( imageCell ) = pic;
-            }
-        );
-    }
-};
+                image(imageCell) = pic;
+            });
+        }
+    };
 
-/** derives two dimensional field from a particle slice
- *
- * The shape of a particle is not taken in account.
- *
- * @tparam T_numWorkers number of workers
- */
-template< uint32_t T_numWorkers >
-struct KernelPaintParticles3D
-{
-    /** derive particle values
+    /** derives two dimensional field from a particle slice
      *
-     * @tparam T_ParBox pmacc::ParticlesBox, particle box type
-     * @tparam T_Mapping mapper functor type
-     * @tparam T_Acc alpaka accelerator type
+     * The shape of a particle is not taken into account.
      *
-     * @param acc alpaka accelerator
-     * @param pb particle memory
-     * @param image[in,out] two dimensional image (without guarding cells)
-     * @param transpose indices to transpose dimensions range per dimension [0,simDim)
-     * @param slice offset (in cells) of the slice in the dimension sliceDim relative to
-     *              the origin of the global domain
-     * @param localDomainOffset offset (in cells) of the local domain relative to the
-     *                          origin of the global domain
-     * @param sliceDim dimension to slice range [0,simDim)
-     * @param mapper functor to map a block to a supercell
+     * @tparam T_numWorkers number of workers
      */
-    template<
-        typename T_ParBox,
-        typename T_Mapping,
-        typename T_Acc
-    >
-    DINLINE void
-    operator()(
-        T_Acc const & acc,
-        T_ParBox pb,
-        DataBox<
-            PitchedBox<
-                float3_X,
-                DIM2
-            >
-        > image,
-        DataSpace< DIM2 > const transpose,
-        int const slice,
-        uint32_t const localDomainOffset,
-        uint32_t const sliceDim,
-        T_Mapping mapper
-    ) const
+    template<uint32_t T_numWorkers>
+    struct KernelPaintParticles3D
     {
-        using namespace mappings::threads;
+        /** derive particle values
+         *
+         * @tparam T_ParBox pmacc::ParticlesBox, particle box type
+         * @tparam T_Mapping mapper functor type
+         * @tparam T_Acc alpaka accelerator type
+         *
+         * @param acc alpaka accelerator
+         * @param pb particle memory
+         * @param[in,out] image two dimensional image (without guarding cells)
+         * @param transpose indices to transpose dimensions range per dimension [0,simDim)
+         * @param slice offset (in cells) of the slice in the dimension sliceDim relative to
+         *              the origin of the global domain
+         * @param localDomainOffset offset (in cells) of the local domain relative to the
+         *                          origin of the global domain
+         * @param sliceDim dimension to slice range [0,simDim)
+         * @param mapper functor to map a block to a supercell
+         */
+        template<typename T_ParBox, typename T_Mapping, typename T_Acc>
+        DINLINE void operator()(
+            T_Acc const& acc,
+            T_ParBox pb,
+            DataBox<PitchedBox<float3_X, DIM2>> image,
+            DataSpace<DIM2> const transpose,
+            int const slice,
+            uint32_t const localDomainOffset,
+            uint32_t const sliceDim,
+            T_Mapping mapper) const
+        {
+            using namespace mappings::threads;
 
-        using SuperCellSize = typename T_Mapping::SuperCellSize;
+            using SuperCellSize = typename T_Mapping::SuperCellSize;
 
-        constexpr uint32_t numParticlesPerFrame = pmacc::math::CT::volume< SuperCellSize >::type::value;
-        constexpr uint32_t numCellsPerSupercell = numParticlesPerFrame;
-        constexpr uint32_t numWorkers = T_numWorkers;
+            constexpr uint32_t numParticlesPerFrame = pmacc::math::CT::volume<SuperCellSize>::type::value;
+            constexpr uint32_t numCellsPerSupercell = numParticlesPerFrame;
+            constexpr uint32_t numWorkers = T_numWorkers;
 
-        uint32_t const workerIdx = threadIdx.x;
+            uint32_t const workerIdx = cupla::threadIdx(acc).x;
 
-        using ParticleDomCfg = IdxConfig<
-            numParticlesPerFrame,
-            numWorkers
-        >;
+            using ParticleDomCfg = IdxConfig<numParticlesPerFrame, numWorkers>;
 
-        using SupercellDomCfg = IdxConfig<
-            numCellsPerSupercell,
-            numWorkers
-        >;
+            using SupercellDomCfg = IdxConfig<numCellsPerSupercell, numWorkers>;
 
-        ForEachIdx<
-            IdxConfig<
-                1,
-                numWorkers
-            >
-        > onlyMaster{ workerIdx };
+            ForEachIdx<IdxConfig<1, numWorkers>> onlyMaster{workerIdx};
 
-        // each virtual worker works on a cell in the supercell
-        ForEachIdx< SupercellDomCfg > forEachCell( workerIdx );
+            // each virtual worker works on a cell in the supercell
+            ForEachIdx<SupercellDomCfg> forEachCell(workerIdx);
 
-        /* is 1 if a offset of a cell in the supercell is equal the slice (offset)
-         * else 0
-         */
-        PMACC_SMEM(
-            acc,
-            superCellParticipate,
-            int
-        );
-
-        /* true if the virtual worker is processing a pixel within the resulting image,
-         * else false
-         */
-        memory::CtxArray<
-            bool,
-            SupercellDomCfg
-        > isImageThreadCtx( false );
-
-        DataSpace< simDim > const suplercellIdx = mapper.getSuperCellIndex(DataSpace<simDim > (blockIdx));
-        // offset of the supercell (in cells) to the origin of the local domain
-        DataSpace< simDim > const supercellCellOffset(
-            ( suplercellIdx - mapper.getGuardingSuperCells( ) ) * SuperCellSize::toRT( )
-        );
-
-        onlyMaster(
-            [&](
-                uint32_t const,
-                uint32_t const
-            )
-            {
-                superCellParticipate = 0;
-            }
-        );
+            /* is 1 if the offset of any cell in the supercell is equal to the slice offset,
+             * else 0
+             */
+            PMACC_SMEM(acc, superCellParticipate, int);
 
-        __syncthreads();
+            /* true if the virtual worker is processing a pixel within the resulting image,
+             * else false
+             */
+            memory::CtxArray<bool, SupercellDomCfg> isImageThreadCtx(false);
 
-        forEachCell(
-            [&](
-                uint32_t const linearIdx,
-                uint32_t const idx
-            )
-            {
+            DataSpace<simDim> const suplercellIdx = mapper.getSuperCellIndex(DataSpace<simDim>(cupla::blockIdx(acc)));
+            // offset of the supercell (in cells) to the origin of the local domain
+            DataSpace<simDim> const supercellCellOffset(
+                (suplercellIdx - mapper.getGuardingSuperCells()) * SuperCellSize::toRT());
+
+            onlyMaster([&](uint32_t const, uint32_t const) { superCellParticipate = 0; });
+
+            cupla::__syncthreads(acc);
+
+            forEachCell([&](uint32_t const linearIdx, uint32_t const idx) {
                 // cell index within the superCell
-                DataSpace< simDim > const cellIdx = DataSpaceOperations< simDim >::template map< SuperCellSize >( linearIdx );
+                DataSpace<simDim> const cellIdx = DataSpaceOperations<simDim>::template map<SuperCellSize>(linearIdx);
 
                 // cell offset to origin of the local domain
-                DataSpace< simDim > const realCell( supercellCellOffset + cellIdx );
+                DataSpace<simDim> const realCell(supercellCellOffset + cellIdx);
 
-                bool const isCellOnSlice = IsPartOfSlice< >{}(
-                    realCell,
-                    sliceDim,
-                    localDomainOffset,
-                    slice
-                );
+                bool const isCellOnSlice = IsPartOfSlice<>{}(realCell, sliceDim, localDomainOffset, slice);
 
-                if( isCellOnSlice )
+                if(isCellOnSlice)
                 {
                     // atomic avoids: WAW Error in cuda-memcheck racecheck
-                    nvidia::atomicAllExch(
-                        acc,
-                        &superCellParticipate,
-                        1,
-                        ::alpaka::hierarchy::Threads{ }
-                    );
-                    isImageThreadCtx[ idx ] = true;
+                    nvidia::atomicAllExch(acc, &superCellParticipate, 1, ::alpaka::hierarchy::Threads{});
+                    isImageThreadCtx[idx] = true;
                 }
-            }
-        );
-
-        __syncthreads();
-
-        if( superCellParticipate == 0 )
-            return;
-
-        // slice is always two dimensional
-        using SharedMem = DataBox<
-            PitchedBox<
-                float_X,
-                DIM2
-            >
-        >;
-
-        sharedMemExtern(
-            shBlock,
-            float_X
-        );
-
-        // shared memory box for particle counter
-        SharedMem counter(
-            PitchedBox<
-                float_X,
-                DIM2
-            >(
-                ( float_X* ) shBlock,
-                DataSpace< DIM2 > (),
+            });
+
+            cupla::__syncthreads(acc);
+
+            if(superCellParticipate == 0)
+                return;
+
+            // slice is always two dimensional
+            using SharedMem = DataBox<PitchedBox<float_X, DIM2>>;
+
+            sharedMemExtern(shBlock, float_X);
+
+            // shared memory box for particle counter
+            SharedMem counter(PitchedBox<float_X, DIM2>(
+                (float_X*) shBlock,
+                DataSpace<DIM2>(),
                 // pitch in byte
-                SuperCellSize::toRT( )[ transpose.x() ] * sizeof( float_X )
-            )
-        );
-
-        forEachCell(
-            [&](
-                uint32_t const linearIdx,
-                uint32_t const idx
-            )
-            {
+                SuperCellSize::toRT()[transpose.x()] * sizeof(float_X)));
+
+            forEachCell([&](uint32_t const linearIdx, uint32_t const idx) {
                 /* cell index within the superCell */
-                DataSpace< simDim > const cellIdx = DataSpaceOperations< simDim >::template map< SuperCellSize >( linearIdx );
+                DataSpace<simDim> const cellIdx = DataSpaceOperations<simDim>::template map<SuperCellSize>(linearIdx);
 
-                DataSpace< DIM2 > const localCell(
-                    cellIdx[ transpose.x() ],
-                    cellIdx[ transpose.y() ]
-                );
+                DataSpace<DIM2> const localCell(cellIdx[transpose.x()], cellIdx[transpose.y()]);
 
-                if( isImageThreadCtx[ idx ] )
+                if(isImageThreadCtx[idx])
                 {
-                    counter( localCell ) = float_X(0.0);
+                    counter(localCell) = float_X(0.0);
                 }
-            }
-        );
+            });
 
-        // wait that shared memory  is set to zero
-        __syncthreads();
+            // wait until shared memory is set to zero
+            cupla::__syncthreads(acc);
 
-        using FramePtr = typename T_ParBox::FramePtr;
-        FramePtr frame = pb.getFirstFrame( suplercellIdx );
+            using FramePtr = typename T_ParBox::FramePtr;
+            FramePtr frame = pb.getFirstFrame(suplercellIdx);
 
-        // each virtual worker works on a particle in the frame
-        ForEachIdx< ParticleDomCfg > forEachParticle( workerIdx );
+            // each virtual worker works on a particle in the frame
+            ForEachIdx<ParticleDomCfg> forEachParticle(workerIdx);
 
-        while( frame.isValid( ) )
-        {
-            forEachParticle(
-                [&](
-                    uint32_t const linearIdx,
-                    uint32_t const idx
-                )
-                {
-                    auto particle = frame[ linearIdx ] ;
-                    if( particle[ multiMask_ ] == 1)
+            while(frame.isValid())
+            {
+                forEachParticle([&](uint32_t const linearIdx, uint32_t const idx) {
+                    auto particle = frame[linearIdx];
+                    if(particle[multiMask_] == 1)
                     {
-                        int const linearCellIdx = particle[ localCellIdx_ ];
+                        int const linearCellIdx = particle[localCellIdx_];
                         // we only draw the first slice of cells in the super cell (z == 0)
-                        DataSpace< simDim > const particleCellOffset(
-                            DataSpaceOperations< simDim >::template map< SuperCellSize >( linearCellIdx )
-                        );
-                        bool const isParticleOnSlice = IsPartOfSlice< >{}(
+                        DataSpace<simDim> const particleCellOffset(
+                            DataSpaceOperations<simDim>::template map<SuperCellSize>(linearCellIdx));
+                        bool const isParticleOnSlice = IsPartOfSlice<>{}(
                             particleCellOffset + supercellCellOffset,
                             sliceDim,
                             localDomainOffset,
-                            slice
-                        );
-                        if( isParticleOnSlice )
+                            slice);
+                        if(isParticleOnSlice)
                         {
-                            DataSpace< DIM2 > const reducedCell(
-                                particleCellOffset[ transpose.x( ) ],
-                                particleCellOffset[ transpose.y( ) ]
-                            );
-                            atomicAdd(
-                                &( counter( reducedCell ) ),
+                            DataSpace<DIM2> const reducedCell(
+                                particleCellOffset[transpose.x()],
+                                particleCellOffset[transpose.y()]);
+                            cupla::atomicAdd(
+                                acc,
+                                &(counter(reducedCell)),
                                 // normalize the value to avoid bad precision for large macro particle weightings
-                                particle[ weighting_ ] / particles::TYPICAL_NUM_PARTICLES_PER_MACROPARTICLE,
-                                ::alpaka::hierarchy::Threads{ }
-                            );
+                                particle[weighting_] / particles::TYPICAL_NUM_PARTICLES_PER_MACROPARTICLE,
+                                ::alpaka::hierarchy::Threads{});
                         }
                     }
-                }
-            );
+                });
 
-            frame = pb.getNextFrame(frame);
-        }
+                frame = pb.getNextFrame(frame);
+            }
 
-        // wait that all worker finsihed the reduce operation
-        __syncthreads();
+            // wait until all workers have finished the reduce operation
+            cupla::__syncthreads(acc);
 
-        forEachCell(
-            [&](
-                uint32_t const linearIdx,
-                uint32_t const idx
-            )
-            {
-                if( isImageThreadCtx[ idx ] )
+            forEachCell([&](uint32_t const linearIdx, uint32_t const idx) {
+                if(isImageThreadCtx[idx])
                 {
                     // cell index within the superCell
-                    DataSpace< simDim > const cellIdx = DataSpaceOperations< simDim >::template map< SuperCellSize >( linearIdx );
+                    DataSpace<simDim> const cellIdx
+                        = DataSpaceOperations<simDim>::template map<SuperCellSize>(linearIdx);
                     // cell offset to origin of the local domain
-                    DataSpace< simDim > const realCell( supercellCellOffset + cellIdx );
+                    DataSpace<simDim> const realCell(supercellCellOffset + cellIdx);
                     // index in image
-                    DataSpace< DIM2 > const imageCell(
-                        realCell[ transpose.x( ) ],
-                        realCell[ transpose.y( ) ]
-                    );
+                    DataSpace<DIM2> const imageCell(realCell[transpose.x()], realCell[transpose.y()]);
 
-                    DataSpace< DIM2 > const localCell(
-                        cellIdx[ transpose.x( ) ],
-                        cellIdx[ transpose.y( ) ]
-                    );
+                    DataSpace<DIM2> const localCell(cellIdx[transpose.x()], cellIdx[transpose.y()]);
 
                     /** Note: normally, we would multiply by particles::TYPICAL_NUM_PARTICLES_PER_MACROPARTICLE again.
                      *  BUT: since we are interested in a simple value between 0 and 1,
@@ -646,482 +507,396 @@ struct KernelPaintParticles3D
                      *       particles) and devide by the number of typical macro particles
                      *       per cell
                      */
-                    float_X value = counter( localCell ) /
-                        float_X( particles::TYPICAL_PARTICLES_PER_CELL );
-                    if( value > 1.0 )
+                    float_X value = counter(localCell) / float_X(particles::TYPICAL_PARTICLES_PER_CELL);
+                    if(value > 1.0)
                         value = 1.0;
 
 
                     visPreview::preParticleDensCol::addRGB(
-                        image( imageCell ),
+                        image(imageCell),
                         value,
-                        visPreview::preParticleDens_opacity
-                    );
+                        visPreview::preParticleDens_opacity);
 
                     // cut to [0, 1]
-                    for( uint32_t d = 0; d < DIM3; ++d )
+                    for(uint32_t d = 0; d < DIM3; ++d)
                     {
-                        if( image( imageCell )[ d ] < float_X( 0.0 ) )
-                            image( imageCell )[ d ] = float_X( 0.0 );
-                        if( image( imageCell )[ d ] > float_X( 1.0 ) )
-                            image( imageCell )[ d ] = float_X( 1.0 );
+                        if(image(imageCell)[d] < float_X(0.0))
+                            image(imageCell)[d] = float_X(0.0);
+                        if(image(imageCell)[d] > float_X(1.0))
+                            image(imageCell)[d] = float_X(1.0);
                     }
                 }
-            }
-        );
-    }
-};
-
-namespace vis_kernels
-{
+            });
+        }
+    };
 
-/** divide each cell by a value
- *
- * @tparam T_numWorkers number of workers
- * @tparam T_blockSize number of elements which will be handled
- *                     within a kernel block
- */
-template<
-    uint32_t T_numWorkers,
-    uint32_t T_blockSize
->
-struct DivideAnyCell
-{
-    /** derive particle values
-     *
-     * @tparam T_Mem pmacc::DataBox, type of the on dimensional memory
-     * @tparam T_Type divisor type
-     * @tparam T_Acc alpaka accelerator type
-     *
-     * @param acc alpaka accelerator
-     * @param mem memory[in,out] to manipulate, must provide the `operator[](int)`
-     * @param n number of elements in mem
-     * @param divisor divisor for the division
-     */
-    template<
-        typename T_Mem,
-        typename T_Type,
-        typename T_Acc
-    >
-    DINLINE void operator()(
-        T_Acc const & acc,
-        T_Mem mem,
-        uint32_t n,
-        T_Type divisor
-    ) const
+    namespace vis_kernels
     {
-        using namespace mappings::threads;
+        /** divide each cell by a value
+         *
+         * @tparam T_numWorkers number of workers
+         * @tparam T_blockSize number of elements which will be handled
+         *                     within a kernel block
+         */
+        template<uint32_t T_numWorkers, uint32_t T_blockSize>
+        struct DivideAnyCell
+        {
+            /** divide each element of mem by (divisor + FLT_MIN)
+             *
+             * @tparam T_Mem pmacc::DataBox, type of the one-dimensional memory
+             * @tparam T_Type divisor type
+             * @tparam T_Acc alpaka accelerator type
+             *
+             * @param acc alpaka accelerator
+             * @param mem memory[in,out] to manipulate, must provide the `operator[](int)`
+             * @param n number of elements in mem
+             * @param divisor divisor for the division
+             */
+            template<typename T_Mem, typename T_Type, typename T_Acc>
+            DINLINE void operator()(T_Acc const& acc, T_Mem mem, uint32_t n, T_Type divisor) const
+            {
+                using namespace mappings::threads;
 
-        constexpr uint32_t numWorkers = T_numWorkers;
+                constexpr uint32_t numWorkers = T_numWorkers;
 
-        uint32_t const workerIdx = threadIdx.x;
+                uint32_t const workerIdx = cupla::threadIdx(acc).x;
 
-        using SupercellDomCfg = IdxConfig<
-            T_blockSize,
-            numWorkers
-        >;
-        // each virtual worker works on a cell
-        ForEachIdx< SupercellDomCfg > forEachCell( workerIdx );
+                using SupercellDomCfg = IdxConfig<T_blockSize, numWorkers>;
+                // each virtual worker works on a cell
+                ForEachIdx<SupercellDomCfg> forEachCell(workerIdx);
 
-        forEachCell(
-            [&](
-                uint32_t const linearIdx,
-                uint32_t const
-            )
-            {
-                uint32_t tid = blockIdx.x * T_blockSize + linearIdx;
-                if( tid >= n )
-                    return;
+                forEachCell([&](uint32_t const linearIdx, uint32_t const) {
+                    uint32_t tid = cupla::blockIdx(acc).x * T_blockSize + linearIdx;
+                    if(tid >= n)
+                        return;
 
-                float3_X const FLT3_MIN = float3_X::create( FLT_MIN );
-                mem[ tid ] /= ( divisor + FLT3_MIN );
+                    float3_X const FLT3_MIN = float3_X::create(FLT_MIN);
+                    mem[tid] /= (divisor + FLT3_MIN);
+                });
             }
-        );
-    }
-};
+        };
 
 
-/** convert channel value to an RGB color
- *
- * @tparam T_numWorkers number of workers
- * @tparam T_blockSize number of elements which will be handled
- *                     within a kernel block
- */
-template<
-    uint32_t T_numWorkers,
-    uint32_t T_blockSize
->
-struct ChannelsToRGB
-{
-    /** convert each element to an RGB color
-     *
-     * @tparam T_Mem pmacc::DataBox, type of the on dimensional memory
-     * @tparam T_Acc alpaka accelerator type
-     *
-     * @param acc alpaka accelerator
-     * @param mem memory[in,out] to manipulate, must provide the `operator[](int)`
-     * @param n number of elements in mem
-     */
-    template<
-        typename T_Mem,
-        typename T_Acc
-    >
-    DINLINE void operator()(
-        T_Acc const & acc,
-        T_Mem mem,
-        uint32_t n
-    ) const
-    {
-        using namespace mappings::threads;
+        /** convert channel value to an RGB color
+         *
+         * @tparam T_numWorkers number of workers
+         * @tparam T_blockSize number of elements which will be handled
+         *                     within a kernel block
+         */
+        template<uint32_t T_numWorkers, uint32_t T_blockSize>
+        struct ChannelsToRGB
+        {
+            /** convert each element to an RGB color
+             *
+             * @tparam T_Mem pmacc::DataBox, type of the one-dimensional memory
+             * @tparam T_Acc alpaka accelerator type
+             *
+             * @param acc alpaka accelerator
+             * @param mem memory[in,out] to manipulate, must provide the `operator[](int)`
+             * @param n number of elements in mem
+             */
+            template<typename T_Mem, typename T_Acc>
+            DINLINE void operator()(T_Acc const& acc, T_Mem mem, uint32_t n) const
+            {
+                using namespace mappings::threads;
 
-        constexpr uint32_t numWorkers = T_numWorkers;
+                constexpr uint32_t numWorkers = T_numWorkers;
 
-        uint32_t const workerIdx = threadIdx.x;
+                uint32_t const workerIdx = cupla::threadIdx(acc).x;
 
-        using SupercellDomCfg = IdxConfig<
-            T_blockSize,
-            numWorkers
-        >;
-        // each virtual worker works on a cell
-        ForEachIdx< SupercellDomCfg > forEachCell( workerIdx );
+                using SupercellDomCfg = IdxConfig<T_blockSize, numWorkers>;
+                // each virtual worker works on a cell
+                ForEachIdx<SupercellDomCfg> forEachCell(workerIdx);
 
-        forEachCell(
-            [&](
-                uint32_t const linearIdx,
-                uint32_t const
-            )
-            {
-                uint32_t const tid = blockIdx.x * T_blockSize + linearIdx;
-                if( tid >= n )
-                    return;
+                forEachCell([&](uint32_t const linearIdx, uint32_t const) {
+                    uint32_t const tid = cupla::blockIdx(acc).x * T_blockSize + linearIdx;
+                    if(tid >= n)
+                        return;
+
+                    float3_X rgb(float3_X::create(0.0));
 
-                float3_X rgb(float3_X::create(0.0));
-
-                visPreview::preChannel1Col::addRGB(
-                    rgb,
-                    mem[ tid ].x( ),
-                    visPreview::preChannel1_opacity
-                );
-                visPreview::preChannel2Col::addRGB(
-                    rgb,
-                    mem[ tid ].y( ),
-                    visPreview::preChannel2_opacity
-                );
-                visPreview::preChannel3Col::addRGB(
-                    rgb,
-                    mem[ tid ].z( ),
-                    visPreview::preChannel3_opacity
-                );
-                mem[ tid ] = rgb;
+                    visPreview::preChannel1Col::addRGB(rgb, mem[tid].x(), visPreview::preChannel1_opacity);
+                    visPreview::preChannel2Col::addRGB(rgb, mem[tid].y(), visPreview::preChannel2_opacity);
+                    visPreview::preChannel3Col::addRGB(rgb, mem[tid].z(), visPreview::preChannel3_opacity);
+                    mem[tid] = rgb;
+                });
             }
-        );
-    }
-};
+        };
 
-}
+    } // namespace vis_kernels
 
-/**
- * Visualizes simulation data by writing png files.
- * Visulization is performed in an additional thread.
- */
-template<class ParticlesType, class Output>
-class Visualisation : public ILightweightPlugin
-{
-private:
-    typedef MappingDesc::SuperCellSize SuperCellSize;
-
-
-public:
-    using FrameType = typename ParticlesType::FrameType;
-    using CreatorType = Output;
-
-    Visualisation(std::string name, Output output, std::string notifyPeriod, DataSpace<DIM2> transpose, float_X slicePoint) :
-    m_output(output),
-    pluginName(name),
-    cellDescription(nullptr),
-    particleTag(ParticlesType::FrameType::getName()),
-    m_notifyPeriod(notifyPeriod),
-    m_transpose(transpose),
-    m_slicePoint(slicePoint),
-    isMaster(false),
-    header(nullptr),
-    reduce(1024),
-    img(nullptr)
+    /**
+     * Visualizes simulation data by writing png files.
+     * Visualization is performed in an additional thread.
+     */
+    template<class ParticlesType, class Output>
+    class Visualisation : public ILightweightPlugin
     {
-        sliceDim = 0;
-        if (m_transpose.x() == 0 || m_transpose.y() == 0)
-            sliceDim = 1;
-        if ((m_transpose.x() == 1 || m_transpose.y() == 1) && sliceDim == 1)
-            sliceDim = 2;
+    private:
+        typedef MappingDesc::SuperCellSize SuperCellSize;
+
+
+    public:
+        using FrameType = typename ParticlesType::FrameType;
+        using CreatorType = Output;
+
+        Visualisation(
+            std::string name,
+            Output output,
+            std::string notifyPeriod,
+            DataSpace<DIM2> transpose,
+            float_X slicePoint)
+            : m_output(output)
+            , pluginName(name)
+            , cellDescription(nullptr)
+            , particleTag(ParticlesType::FrameType::getName())
+            , m_notifyPeriod(notifyPeriod)
+            , m_transpose(transpose)
+            , m_slicePoint(slicePoint)
+            , isMaster(false)
+            , header(nullptr)
+            , reduce(1024)
+            , img(nullptr)
+        {
+            sliceDim = 0;
+            if(m_transpose.x() == 0 || m_transpose.y() == 0)
+                sliceDim = 1;
+            if((m_transpose.x() == 1 || m_transpose.y() == 1) && sliceDim == 1)
+                sliceDim = 2;
+
+            Environment<>::get().PluginConnector().registerPlugin(this);
+            Environment<>::get().PluginConnector().setNotificationPeriod(this, m_notifyPeriod);
+        }
 
-        Environment<>::get().PluginConnector().registerPlugin(this);
-        Environment<>::get().PluginConnector().setNotificationPeriod(this, m_notifyPeriod);
-    }
+        virtual ~Visualisation()
+        {
+            /* wait until shared buffers can be destroyed */
+            m_output.join();
+            if(!m_notifyPeriod.empty())
+            {
+                __delete(img);
+                MessageHeader::destroy(header);
+            }
+        }
 
-    virtual ~Visualisation()
-    {
-        /* wait that shared buffers can destroyed */
-        m_output.join();
-        if(!m_notifyPeriod.empty())
+        std::string pluginGetName() const
         {
-            __delete(img);
-            MessageHeader::destroy(header);
+            return "Visualisation";
         }
-    }
 
-    std::string pluginGetName() const
-    {
-        return "Visualisation";
-    }
+        void notify(uint32_t currentStep)
+        {
+            PMACC_ASSERT(cellDescription != nullptr);
+            const DataSpace<simDim> localSize(cellDescription->getGridLayout().getDataSpaceWithoutGuarding());
+            Window window(MovingWindow::getInstance().getWindow(currentStep));
 
-    void notify(uint32_t currentStep)
-    {
-        PMACC_ASSERT(cellDescription != nullptr);
-        const DataSpace<simDim> localSize(cellDescription->getGridLayout().getDataSpaceWithoutGuarding());
-        Window window(MovingWindow::getInstance().getWindow(currentStep));
+            /*sliceOffset is only used in 3D*/
+            sliceOffset = (int) ((float_32)(window.globalDimensions.size[sliceDim]) * m_slicePoint)
+                + window.globalDimensions.offset[sliceDim];
 
-        /*sliceOffset is only used in 3D*/
-        sliceOffset = (int) ((float_32) (window.globalDimensions.size[sliceDim]) * m_slicePoint) + window.globalDimensions.offset[sliceDim];
+            if(!doDrawing())
+            {
+                return;
+            }
+            createImage(currentStep, window);
+        }
 
-        if (!doDrawing())
+        void setMappingDescription(MappingDesc* cellDescription)
         {
-            return;
+            PMACC_ASSERT(cellDescription != nullptr);
+            this->cellDescription = cellDescription;
         }
-        createImage(currentStep, window);
-    }
 
-    void setMappingDescription(MappingDesc *cellDescription)
-    {
-        PMACC_ASSERT(cellDescription != nullptr);
-        this->cellDescription = cellDescription;
-    }
+        void createImage(uint32_t currentStep, Window window)
+        {
+            DataConnector& dc = Environment<>::get().DataConnector();
+            // Data does not need to be synchronized as visualization is
+            // done at the device.
+            auto fieldB = dc.get<FieldB>(FieldB::getName(), true);
+            auto fieldE = dc.get<FieldE>(FieldE::getName(), true);
+            auto fieldJ = dc.get<FieldJ>(FieldJ::getName(), true);
+            auto particles = dc.get<ParticlesType>(particleTag, true);
 
-    void createImage(uint32_t currentStep, Window window)
-    {
-        DataConnector &dc = Environment<>::get().DataConnector();
-        // Data does not need to be synchronized as visualization is
-        // done at the device.
-        auto fieldB = dc.get< FieldB >( FieldB::getName(), true );
-        auto fieldE = dc.get< FieldE >( FieldE::getName(), true );
-        auto fieldJ = dc.get< FieldJ >( FieldJ::getName(), true );
-        auto particles = dc.get< ParticlesType >( particleTag, true );
-
-        /* wait that shared buffers can accessed without conflicts */
-        m_output.join();
-
-        uint32_t localDomainOffset = 0;
-        if( simDim == DIM3 )
-            localDomainOffset = Environment<simDim>::get().SubGrid().getLocalDomain().offset[ sliceDim ];
-
-        constexpr uint32_t cellsPerSupercell = pmacc::math::CT::volume< SuperCellSize >::type::value;
-        constexpr uint32_t numWorkers = pmacc::traits::GetNumWorkers<
-            cellsPerSupercell
-        >::value;
-
-        PMACC_ASSERT(cellDescription != nullptr);
-
-        AreaMapping<
-            CORE + BORDER,
-            MappingDesc
-        > mapper( *cellDescription );
-
-        //create image fields
-        PMACC_KERNEL( KernelPaintFields< numWorkers >{} )(
-            mapper.getGridDim(),
-            numWorkers
-        )(
-            fieldE->getDeviceDataBox(),
-            fieldB->getDeviceDataBox(),
-            fieldJ->getDeviceDataBox(),
-            img->getDeviceBuffer().getDataBox(),
-            m_transpose,
-            sliceOffset,
-            localDomainOffset,
-            sliceDim,
-            mapper
-        );
-
-        // find maximum for img.x()/y and z and return it as float3_X
-        int elements = img->getGridLayout().getDataSpace().productOfComponents();
-
-        //Add one dimension access to 2d DataBox
-        typedef DataBoxDim1Access<typename GridBuffer<float3_X, DIM2 >::DataBoxType> D1Box;
-        D1Box d1access(img->getDeviceBuffer().getDataBox(), img->getGridLayout().getDataSpace());
-
-#if (EM_FIELD_SCALE_CHANNEL1 == -1 || EM_FIELD_SCALE_CHANNEL2 == -1 || EM_FIELD_SCALE_CHANNEL3 == -1)
-        //reduce with functor max
-        float3_X max = reduce(nvidia::functors::Max(),
-                              d1access,
-                              elements);
-        //reduce with functor min
-        //float3_X min = reduce(nvidia::functors::Min(),
-        //                    d1access,
-        //                    elements);
-#if (EM_FIELD_SCALE_CHANNEL1 != -1 )
-        max.x() = float_X(1.0);
-#endif
-#if (EM_FIELD_SCALE_CHANNEL2 != -1 )
-        max.y() = float_X(1.0);
-#endif
-#if (EM_FIELD_SCALE_CHANNEL3 != -1 )
-        max.z() = float_X(1.0);
-#endif
+            /* wait until shared buffers can be accessed without conflicts */
+            m_output.join();
 
-        /* We don't know the size of the supercell plane at compile time
-         * (because of the runtime dimension selection in any plugin),
-         * thus we must use a one dimension kernel and no mapper
-         */
-        PMACC_KERNEL(
-            vis_kernels::DivideAnyCell<
-                numWorkers,
-                cellsPerSupercell
-            >{ }
-        )(
-            ( elements + cellsPerSupercell - 1u ) / cellsPerSupercell,
-            numWorkers
-        )(
-            d1access,
-            elements,
-            max
-        );
+            uint32_t localDomainOffset = 0;
+            if(simDim == DIM3)
+                localDomainOffset = Environment<simDim>::get().SubGrid().getLocalDomain().offset[sliceDim];
+
+            constexpr uint32_t cellsPerSupercell = pmacc::math::CT::volume<SuperCellSize>::type::value;
+            constexpr uint32_t numWorkers = pmacc::traits::GetNumWorkers<cellsPerSupercell>::value;
+
+            PMACC_ASSERT(cellDescription != nullptr);
+
+            AreaMapping<CORE + BORDER, MappingDesc> mapper(*cellDescription);
+
+            // create image fields
+            PMACC_KERNEL(KernelPaintFields<numWorkers>{})
+            (mapper.getGridDim(), numWorkers)(
+                fieldE->getDeviceDataBox(),
+                fieldB->getDeviceDataBox(),
+                fieldJ->getDeviceDataBox(),
+                img->getDeviceBuffer().getDataBox(),
+                m_transpose,
+                sliceOffset,
+                localDomainOffset,
+                sliceDim,
+                mapper);
+
+            // find maximum for img.x()/y and z and return it as float3_X
+            int elements = img->getGridLayout().getDataSpace().productOfComponents();
+
+            // Add one dimension access to 2d DataBox
+            typedef DataBoxDim1Access<typename GridBuffer<float3_X, DIM2>::DataBoxType> D1Box;
+            D1Box d1access(img->getDeviceBuffer().getDataBox(), img->getGridLayout().getDataSpace());
+
+#if(EM_FIELD_SCALE_CHANNEL1 == -1 || EM_FIELD_SCALE_CHANNEL2 == -1 || EM_FIELD_SCALE_CHANNEL3 == -1)
+            // reduce with functor max
+            float3_X max = reduce(nvidia::functors::Max(), d1access, elements);
+            // reduce with functor min
+            // float3_X min = reduce(nvidia::functors::Min(),
+            //                    d1access,
+            //                    elements);
+#    if(EM_FIELD_SCALE_CHANNEL1 != -1)
+            max.x() = float_X(1.0);
+#    endif
+#    if(EM_FIELD_SCALE_CHANNEL2 != -1)
+            max.y() = float_X(1.0);
+#    endif
+#    if(EM_FIELD_SCALE_CHANNEL3 != -1)
+            max.z() = float_X(1.0);
+#    endif
+
+            /* We don't know the size of the supercell plane at compile time
+             * (because of the runtime dimension selection in any plugin),
+             * thus we must use a one-dimensional kernel and no mapper
+             */
+            PMACC_KERNEL(vis_kernels::DivideAnyCell<numWorkers, cellsPerSupercell>{})
+            ((elements + cellsPerSupercell - 1u) / cellsPerSupercell, numWorkers)(d1access, elements, max);
 #endif
 
-        // convert channels to RGB
-        PMACC_KERNEL(
-            vis_kernels::ChannelsToRGB<
-                numWorkers,
-                cellsPerSupercell
-            >{ }
-        )(
-           ( elements + cellsPerSupercell - 1u ) / cellsPerSupercell,
-            numWorkers
-        )(
-            d1access,
-            elements
-        );
-
-        // add density color channel
-        DataSpace<simDim> blockSize(MappingDesc::SuperCellSize::toRT());
-        DataSpace<DIM2> blockSize2D(blockSize[m_transpose.x()], blockSize[m_transpose.y()]);
-
-        //create image particles
-        PMACC_KERNEL( KernelPaintParticles3D< numWorkers >{} )(
-            mapper.getGridDim(),
-            numWorkers,
-            blockSize2D.productOfComponents() * sizeof( float_X )
-        )(
-            particles->getDeviceParticlesBox(),
-            img->getDeviceBuffer().getDataBox(),
-            m_transpose,
-            sliceOffset,
-            localDomainOffset,
-            sliceDim,
-            mapper
-        );
-
-        // send the RGB image back to host
-        img->deviceToHost();
-
-
-        header->update(*cellDescription, window, m_transpose, currentStep);
-
-
-        __getTransactionEvent().waitForFinished(); //wait for copy picture
-
-        DataSpace<DIM2> size = img->getGridLayout().getDataSpace();
-
-        auto hostBox = img->getHostBuffer().getDataBox();
-
-        if (picongpu::white_box_per_GPU)
-        {
-            hostBox[0 ][0 ] = float3_X(1.0, 1.0, 1.0);
-            hostBox[size.y() - 1 ][0 ] = float3_X(1.0, 1.0, 1.0);
-            hostBox[0 ][size.x() - 1] = float3_X(1.0, 1.0, 1.0);
-            hostBox[size.y() - 1 ][size.x() - 1] = float3_X(1.0, 1.0, 1.0);
-        }
-        auto resultBox = gather(hostBox, *header);
-        if (isMaster)
-        {
-            m_output(resultBox.shift(header->window.offset), header->window.size, *header);
-        }
+            // convert channels to RGB
+            PMACC_KERNEL(vis_kernels::ChannelsToRGB<numWorkers, cellsPerSupercell>{})
+            ((elements + cellsPerSupercell - 1u) / cellsPerSupercell, numWorkers)(d1access, elements);
 
-    }
+            // add density color channel
+            DataSpace<simDim> blockSize(MappingDesc::SuperCellSize::toRT());
+            DataSpace<DIM2> blockSize2D(blockSize[m_transpose.x()], blockSize[m_transpose.y()]);
 
-    void init()
-    {
-        if(!m_notifyPeriod.empty())
-        {
-            PMACC_ASSERT(cellDescription != nullptr);
-            const DataSpace<simDim> localSize(cellDescription->getGridLayout().getDataSpaceWithoutGuarding());
+            // create image particles
+            PMACC_KERNEL(KernelPaintParticles3D<numWorkers>{})
+            (mapper.getGridDim(), numWorkers, blockSize2D.productOfComponents() * sizeof(float_X))(
+                particles->getDeviceParticlesBox(),
+                img->getDeviceBuffer().getDataBox(),
+                m_transpose,
+                sliceOffset,
+                localDomainOffset,
+                sliceDim,
+                mapper);
+
+            // send the RGB image back to host
+            img->deviceToHost();
 
-            Window window(MovingWindow::getInstance().getWindow(0));
-            sliceOffset = (int) ((float_32) (window.globalDimensions.size[sliceDim]) * m_slicePoint) + window.globalDimensions.offset[sliceDim];
 
+            header->update(*cellDescription, window, m_transpose, currentStep);
 
-            const DataSpace<simDim> gpus = Environment<simDim>::get().GridController().getGpuNodes();
 
-            float_32 cellSizeArr[3] = {0, 0, 0};
-            for (uint32_t i = 0; i < simDim; ++i)
-                cellSizeArr[i] = cellSize[i];
+            __getTransactionEvent().waitForFinished(); // wait for copy picture
 
-            header = MessageHeader::create();
-            header->update(*cellDescription, window, m_transpose, 0, cellSizeArr, gpus);
+            DataSpace<DIM2> size = img->getGridLayout().getDataSpace();
 
-            bool isDrawing = doDrawing();
-            isMaster = gather.init(isDrawing);
-            reduce.participate(isDrawing);
+            auto hostBox = img->getHostBuffer().getDataBox();
 
-            /* create memory for the local picture if the gpu participate on the visualization */
-            if(isDrawing)
-                img = new GridBuffer<float3_X, DIM2 > (header->node.maxSize);
+            if(picongpu::white_box_per_GPU)
+            {
+                hostBox[0][0] = float3_X(1.0, 1.0, 1.0);
+                hostBox[size.y() - 1][0] = float3_X(1.0, 1.0, 1.0);
+                hostBox[0][size.x() - 1] = float3_X(1.0, 1.0, 1.0);
+                hostBox[size.y() - 1][size.x() - 1] = float3_X(1.0, 1.0, 1.0);
+            }
+            auto resultBox = gather(hostBox, *header);
+            if(isMaster)
+            {
+                m_output(resultBox.shift(header->window.offset), header->window.size, *header);
+            }
         }
-    }
 
-    void pluginRegisterHelp(po::options_description& desc)
-    {
-        // nothing to do here
-    }
+        void init()
+        {
+            if(!m_notifyPeriod.empty())
+            {
+                PMACC_ASSERT(cellDescription != nullptr);
+                const DataSpace<simDim> localSize(cellDescription->getGridLayout().getDataSpaceWithoutGuarding());
 
-private:
+                Window window(MovingWindow::getInstance().getWindow(0));
+                sliceOffset = (int) ((float_32)(window.globalDimensions.size[sliceDim]) * m_slicePoint)
+                    + window.globalDimensions.offset[sliceDim];
 
-    bool doDrawing()
-    {
-        PMACC_ASSERT(cellDescription != nullptr);
-        const DataSpace<simDim> globalRootCellPos(Environment<simDim>::get().SubGrid().getLocalDomain().offset);
-#if(SIMDIM==DIM3)
-        const bool tmp = globalRootCellPos[sliceDim] + Environment<simDim>::get().SubGrid().getLocalDomain().size[sliceDim] > sliceOffset &&
-            globalRootCellPos[sliceDim] <= sliceOffset;
-        return tmp;
-#else
-        return true;
-#endif
-    }
 
+                const DataSpace<simDim> gpus = Environment<simDim>::get().GridController().getGpuNodes();
+
+                float_32 cellSizeArr[3] = {0, 0, 0};
+                for(uint32_t i = 0; i < simDim; ++i)
+                    cellSizeArr[i] = cellSize[i];
+
+                header = MessageHeader::create();
+                header->update(*cellDescription, window, m_transpose, 0, cellSizeArr, gpus);
 
-    MappingDesc *cellDescription;
-    SimulationDataId particleTag;
+                bool isDrawing = doDrawing();
+                isMaster = gather.init(isDrawing);
+                reduce.participate(isDrawing);
+
+                /* create memory for the local picture if the GPU participates in the visualization */
+                if(isDrawing)
+                    img = new GridBuffer<float3_X, DIM2>(header->node.maxSize);
+            }
+        }
+
+        void pluginRegisterHelp(po::options_description& desc)
+        {
+            // nothing to do here
+        }
+
+    private:
+        bool doDrawing()
+        {
+            PMACC_ASSERT(cellDescription != nullptr);
+            const DataSpace<simDim> globalRootCellPos(Environment<simDim>::get().SubGrid().getLocalDomain().offset);
+#if(SIMDIM == DIM3)
+            const bool tmp
+                = globalRootCellPos[sliceDim] + Environment<simDim>::get().SubGrid().getLocalDomain().size[sliceDim]
+                    > sliceOffset
+                && globalRootCellPos[sliceDim] <= sliceOffset;
+            return tmp;
+#else
+            return true;
+#endif
+        }
 
-    GridBuffer<float3_X, DIM2 > *img;
 
-    int sliceOffset;
-    std::string m_notifyPeriod;
-    float_X m_slicePoint;
+        MappingDesc* cellDescription;
+        SimulationDataId particleTag;
 
-    std::string pluginName;
+        GridBuffer<float3_X, DIM2>* img;
 
+        int sliceOffset;
+        std::string m_notifyPeriod;
+        float_X m_slicePoint;
 
-    DataSpace<DIM2> m_transpose;
-    uint32_t sliceDim;
+        std::string pluginName;
 
-    MessageHeader* header;
 
-    Output m_output;
-    GatherSlice gather;
-    bool isMaster;
-    algorithms::GlobalReduce reduce;
-};
+        DataSpace<DIM2> m_transpose;
+        uint32_t sliceDim;
 
+        MessageHeader* header;
 
+        Output m_output;
+        GatherSlice gather;
+        bool isMaster;
+        algorithms::GlobalReduce reduce;
+    };
 
-}
 
+} // namespace picongpu
diff --git a/include/picongpu/plugins/particleCalorimeter/ParticleCalorimeter.hpp b/include/picongpu/plugins/particleCalorimeter/ParticleCalorimeter.hpp
index 97d0c40d9e..b66e70d3b9 100644
--- a/include/picongpu/plugins/particleCalorimeter/ParticleCalorimeter.hpp
+++ b/include/picongpu/plugins/particleCalorimeter/ParticleCalorimeter.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2016-2020 Heiko Burau, Rene Widera
+/* Copyright 2016-2021 Heiko Burau, Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -43,813 +43,701 @@
 #include <pmacc/traits/HasIdentifiers.hpp>
 #include <pmacc/traits/HasFlag.hpp>
 
+#include <openPMD/openPMD.hpp>
 #include <splash/splash.h>
 #include <boost/filesystem.hpp>
 #include <boost/mpl/and.hpp>
 #include <boost/shared_ptr.hpp>
 
+#include <memory>
 #include <string>
 #include <iostream>
 #include <fstream>
 #include <stdlib.h>
+#include <vector>
 
 
 namespace picongpu
 {
-using namespace pmacc;
+    using namespace pmacc;
 
-namespace po = boost::program_options;
+    namespace po = boost::program_options;
 
 
-/** Virtual particle calorimeter plugin.
- *
- * (virtually) propagates and collects particles to infinite distance.
- *
- */
-template<class ParticlesType>
-class ParticleCalorimeter : public plugins::multi::ISlave
-{
-    typedef pmacc::container::DeviceBuffer<float_X, DIM3> DBufCalorimeter;
-    typedef pmacc::container::HostBuffer<float_X, DIM3> HBufCalorimeter;
+    /** Virtual particle calorimeter plugin.
+     *
+     * Virtually propagates particles to infinite distance and collects them there.
+     *
+     */
+    template<class ParticlesType>
+    class ParticleCalorimeter : public plugins::multi::ISlave
+    {
+        typedef pmacc::container::DeviceBuffer<float_X, DIM3> DBufCalorimeter;
+        typedef pmacc::container::HostBuffer<float_X, DIM3> HBufCalorimeter;
 
 
-    template<typename T_Type>
-    struct DivideInPlace
-    {
-        using Type = T_Type;
-        const Type divisor;
+        template<typename T_Type>
+        struct DivideInPlace // functor that divides a value in place by a divisor fixed at construction
+        {
+            using Type = T_Type;
+            const Type divisor; // value every operand is divided by
 
-        DivideInPlace( const Type& divisor ) : divisor( divisor ) {}
+            DivideInPlace(const Type& divisor) : divisor(divisor)
+            {
+            }
 
-        template< typename T_Acc >
-        HDINLINE void operator()( T_Acc const &, T_Type& val ) const
+            template<typename T_Acc>
+            HDINLINE void operator()(T_Acc const&, T_Type& val) const // the accelerator argument is unused
+            {
+                val = val / this->divisor;
+            }
+        };
+
+    public:
+        typedef CalorimeterFunctor<typename DBufCalorimeter::Cursor> MyCalorimeterFunctor;
+
+    private:
+        typedef boost::shared_ptr<MyCalorimeterFunctor> MyCalorimeterFunctorPtr;
+        MyCalorimeterFunctorPtr calorimeterFunctor;
+
+        typedef boost::shared_ptr<pmacc::algorithm::mpi::Reduce<simDim>> AllGPU_reduce;
+        AllGPU_reduce allGPU_reduce;
+
+    public:
+        void restart(uint32_t restartStep, const std::string& restartDirectory) // restore dBufLeftParsCalorimeter
         {
-            val = val / this->divisor;
-        }
-    };
+            HBufCalorimeter hBufLeftParsCalorimeter(this->dBufLeftParsCalorimeter->size());
+
+            pmacc::GridController<simDim>& gridCon = pmacc::Environment<simDim>::get().GridController();
+            pmacc::CommunicatorMPI<simDim>& comm = gridCon.getCommunicator();
+            uint32_t rank = comm.getRank();
 
-public:
-    typedef CalorimeterFunctor<typename DBufCalorimeter::Cursor> MyCalorimeterFunctor;
-private:
-    typedef boost::shared_ptr<MyCalorimeterFunctor> MyCalorimeterFunctorPtr;
-    MyCalorimeterFunctorPtr calorimeterFunctor;
+            if(rank == 0)
+            {
+                splash::SerialDataCollector hdf5DataFile(1);
+                splash::DataCollector::FileCreationAttr fAttr;
 
-    typedef boost::shared_ptr<pmacc::algorithm::mpi::Reduce<simDim> > AllGPU_reduce;
-    AllGPU_reduce allGPU_reduce;
+                splash::DataCollector::initFileCreationAttr(fAttr);
+                fAttr.fileAccType = splash::DataCollector::FAT_READ;
 
-public:
-    void restart(uint32_t restartStep, const std::string & restartDirectory)
-    {
-        HBufCalorimeter hBufLeftParsCalorimeter(this->dBufLeftParsCalorimeter->size());
+                std::stringstream filename;
+                filename << restartDirectory << "/" << (this->foldername + "/" + filenamePrefix) << "_" << restartStep;
+
+                hdf5DataFile.open(filename.str().c_str(), fAttr);
+
+                splash::Dimensions dimensions;
+
+                hdf5DataFile.read(
+                    restartStep,
+                    this->leftParticlesDatasetName.c_str(),
+                    dimensions,
+                    &(*hBufLeftParsCalorimeter.origin()));
 
-        pmacc::GridController<simDim>& gridCon = pmacc::Environment<simDim>::get().GridController();
-        pmacc::CommunicatorMPI<simDim>& comm = gridCon.getCommunicator();
-        uint32_t rank = comm.getRank();
+                hdf5DataFile.close();
+
+                /* rank 0 divides the calorimeter by the number of ranks; each rank then gets an equal share */
+                uint32_t numRanks = gridCon.getGlobalSize();
+                // get a host accelerator
+                auto hostDev = cupla::manager::Device<cupla::AccHost>::get().device();
+                pmacc::algorithm::host::Foreach()(
+                    hostDev,
+                    hBufLeftParsCalorimeter.zone(),
+                    hBufLeftParsCalorimeter.origin(),
+                    DivideInPlace<float_X>(float_X(numRanks)));
+            }
+
+            // avoid a deadlock between unfinished PMacc tasks and blocking MPI collectives
+            __getTransactionEvent().waitForFinished();
+            MPI_Bcast(
+                &(*hBufLeftParsCalorimeter.origin()),
+                hBufLeftParsCalorimeter.size().productOfComponents() * sizeof(float_X),
+                MPI_CHAR, // sent as raw bytes, hence the sizeof(float_X) factor in the count above
+                0, /* rank 0 */
+                comm.getMPIComm());
+
+            *this->dBufLeftParsCalorimeter = hBufLeftParsCalorimeter; // host buffer -> device buffer
+        }
 
-        if(rank == 0)
+
+        void checkpoint(uint32_t currentStep, const std::string& checkpointDirectory) // dump dBufLeftParsCalorimeter
         {
+            /* create folder for HDF5 checkpoint files */
+            Environment<simDim>::get().Filesystem().createDirectoryWithPermissions(
+                checkpointDirectory + "/" + this->foldername);
+            HBufCalorimeter hBufLeftParsCalorimeter(this->dBufLeftParsCalorimeter->size());
+            HBufCalorimeter hBufTotal(hBufLeftParsCalorimeter.size());
+
+            hBufLeftParsCalorimeter = *this->dBufLeftParsCalorimeter;
+
+            /* MPI reduce (Add) of hBufLeftParsCalorimeter into hBufTotal */
+            (*this->allGPU_reduce)(hBufTotal, hBufLeftParsCalorimeter, pmacc::algorithm::functor::Add{});
+            if(!this->allGPU_reduce->root())
+                return; // only the root of the reduce writes the file
+
             splash::SerialDataCollector hdf5DataFile(1);
             splash::DataCollector::FileCreationAttr fAttr;
 
             splash::DataCollector::initFileCreationAttr(fAttr);
-            fAttr.fileAccType = splash::DataCollector::FAT_READ;
 
             std::stringstream filename;
-            filename << restartDirectory << "/" << ( this->foldername + "/" + filenamePrefix ) << "_" << restartStep;
+            filename << checkpointDirectory << "/" << (this->foldername + "/" + filenamePrefix) << "_" << currentStep;
 
             hdf5DataFile.open(filename.str().c_str(), fAttr);
 
-            splash::Dimensions dimensions;
+            typename PICToSplash<float_X>::type SplashTypeX;
 
-            hdf5DataFile.read(restartStep,
-                              this->leftParticlesDatasetName.c_str(),
-                              dimensions,
-                              &(*hBufLeftParsCalorimeter.origin()));
+            splash::Dimensions bufferSize(hBufTotal.size().x(), hBufTotal.size().y(), hBufTotal.size().z());
 
-            hdf5DataFile.close();
+            /* if there is only one energy bin, omit the energy axis */
+            uint32_t dimension = this->numBinsEnergy == 1 ? DIM2 : DIM3;
+            hdf5DataFile.write(
+                currentStep,
+                SplashTypeX,
+                dimension,
+                splash::Selection(bufferSize),
+                this->leftParticlesDatasetName.c_str(),
+                &(*hBufTotal.origin()));
 
-            /* rank 0 divides and distributes the calorimeter to all ranks in equal parts */
-            uint32_t numRanks = gridCon.getGlobalSize();
-            // get a host accelerator
-            auto hostDev = cupla::manager::Device< cupla::AccHost >::get().device( );
-            pmacc::algorithm::host::Foreach()(hostDev,
-                                              hBufLeftParsCalorimeter.zone(),
-                                              hBufLeftParsCalorimeter.origin(),
-                                              DivideInPlace<float_X>(float_X(numRanks)));
+            hdf5DataFile.close();
         }
 
-        // avoid deadlock between not finished pmacc tasks and mpi blocking collectives
-        __getTransactionEvent().waitForFinished();
-        MPI_Bcast(&(*hBufLeftParsCalorimeter.origin()),
-                  hBufLeftParsCalorimeter.size().productOfComponents() * sizeof(float_X),
-                  MPI_CHAR,
-                  0, /* rank 0 */
-                  comm.getMPIComm());
+    private:
+        void initPlugin() // validate options, allocate buffers, set up the MPI reduce and the calorimeter functor
+        {
+            namespace pm = pmacc::math;
 
-        *this->dBufLeftParsCalorimeter = hBufLeftParsCalorimeter;
-    }
+            if(!(this->openingYaw_deg > float_X(0.0) && this->openingYaw_deg <= float_X(360.0)))
+            {
+                std::stringstream msg;
+                msg << "[Plugin] [" << m_help->getOptionPrefix() << "] openingYaw has to be within (0, 360]."
+                    << std::endl;
+                throw std::runtime_error(msg.str());
+            }
+            if(!(this->openingPitch_deg > float_X(0.0) && this->openingPitch_deg <= float_X(180.0)))
+            {
+                std::stringstream msg;
+                msg << "[Plugin] [" << m_help->getOptionPrefix() << "] openingPitch has to be within (0, 180]."
+                    << std::endl;
+                throw std::runtime_error(msg.str());
+            }
+            if(this->minEnergy < float_X(0.0))
+            {
+                std::stringstream msg;
+                msg << "[Plugin] [" << m_help->getOptionPrefix() << "] minEnergy can not be negative." << std::endl;
+                throw std::runtime_error(msg.str());
+            }
+            if(this->logScale && this->minEnergy == float_X(0.0))
+            {
+                std::stringstream msg;
+                msg << "[Plugin] [" << m_help->getOptionPrefix()
+                    << "] minEnergy can not be zero in logarithmic scaling." << std::endl;
+                throw std::runtime_error(msg.str());
+            }
+            if(this->numBinsEnergy > 1 && this->maxEnergy <= this->minEnergy)
+            {
+                std::stringstream msg;
+                msg << "[Plugin] [" << m_help->getOptionPrefix() << "] minEnergy has to be less than maxEnergy."
+                    << std::endl;
+                throw std::runtime_error(msg.str());
+            }
 
+            this->maxYaw_deg = float_X(0.5) * this->openingYaw_deg;
+            this->maxPitch_deg = float_X(0.5) * this->openingPitch_deg;
+            /* convert the energy limits from keV to units of UNIT_ENERGY (via Joule) */
+            const float_64 minEnergy_SI = this->minEnergy * UNITCONV_keV_to_Joule;
+            const float_64 maxEnergy_SI = this->maxEnergy * UNITCONV_keV_to_Joule;
+            this->minEnergy = minEnergy_SI / UNIT_ENERGY;
+            this->maxEnergy = maxEnergy_SI / UNIT_ENERGY;
+
+            /* allocate memory buffers */
+            this->dBufCalorimeter = new DBufCalorimeter(this->numBinsYaw, this->numBinsPitch, this->numBinsEnergy);
+            this->dBufLeftParsCalorimeter = new DBufCalorimeter(this->dBufCalorimeter->size());
+            this->hBufCalorimeter = new HBufCalorimeter(this->dBufCalorimeter->size());
+            this->hBufTotalCalorimeter = new HBufCalorimeter(this->dBufCalorimeter->size());
+
+            /* fill calorimeter for left particles with zero */
+            this->dBufLeftParsCalorimeter->assign(float_X(0.0));
+
+            /* create the MPI reduce algorithm over the zone spanned by all GPU nodes */
+            pmacc::GridController<simDim>& con = pmacc::Environment<simDim>::get().GridController();
+            pm::Size_t<simDim> gpuDim = (pm::Size_t<simDim>) con.getGpuNodes();
+            zone::SphericZone<simDim> zone_allGPUs(gpuDim);
+            this->allGPU_reduce = AllGPU_reduce(new pmacc::algorithm::mpi::Reduce<simDim>(zone_allGPUs));
+
+            /* calculate rotated calorimeter frame from posYaw_deg and posPitch_deg */
+            constexpr float_64 radsInDegree = pmacc::math::Pi<float_64>::value / float_64(180.0);
+            const float_64 posYaw_rad = this->posYaw_deg * radsInDegree;
+            const float_64 posPitch_rad = this->posPitch_deg * radsInDegree;
+            this->calorimeterFrameVecY = float3_X(
+                math::sin(posYaw_rad) * math::cos(posPitch_rad),
+                math::cos(posYaw_rad) * math::cos(posPitch_rad),
+                math::sin(posPitch_rad));
+            /* If the y-axis is pointing exactly up- or downwards we need to define the x-axis manually */
+            if(math::abs(this->calorimeterFrameVecY.z()) == float_X(1.0))
+            {
+                this->calorimeterFrameVecX = float3_X(1.0, 0.0, 0.0);
+            }
+            else
+            {
+                /* choose `calorimeterFrameVecX` so that the roll is zero. */
+                const float3_X vecUp(0.0, 0.0, -1.0);
+                this->calorimeterFrameVecX = pmacc::math::cross(vecUp, this->calorimeterFrameVecY);
+                /* normalize vector */
+                this->calorimeterFrameVecX /= math::abs(this->calorimeterFrameVecX);
+            }
+            this->calorimeterFrameVecZ = pmacc::math::cross(this->calorimeterFrameVecX, this->calorimeterFrameVecY);
+
+            /* create calorimeter functor instance; the energy limits are passed as log10 values if logScale is set */
+            this->calorimeterFunctor = MyCalorimeterFunctorPtr(new MyCalorimeterFunctor(
+                this->maxYaw_deg * radsInDegree,
+                this->maxPitch_deg * radsInDegree,
+                this->numBinsYaw,
+                this->numBinsPitch,
+                this->numBinsEnergy,
+                this->logScale ? pmacc::math::log10(this->minEnergy) : this->minEnergy,
+                this->logScale ? pmacc::math::log10(this->maxEnergy) : this->maxEnergy,
+                this->logScale,
+                this->calorimeterFrameVecX,
+                this->calorimeterFrameVecY,
+                this->calorimeterFrameVecZ));
+
+            /* create folder for HDF5 files */
+            Environment<simDim>::get().Filesystem().createDirectoryWithPermissions(this->foldername);
+
+            // set how often the plugin should be executed while PIConGPU is running
+            Environment<>::get().PluginConnector().setNotificationPeriod(this, m_help->notifyPeriod.get(m_id));
+        }
 
-    void checkpoint(uint32_t currentStep, const std::string & checkpointDirectory)
-    {
-        /* create folder for hdf5 checkpoint files*/
-        Environment<simDim>::get().Filesystem().createDirectoryWithPermissions( checkpointDirectory + "/" + this->foldername);
-        HBufCalorimeter hBufLeftParsCalorimeter(this->dBufLeftParsCalorimeter->size());
-        HBufCalorimeter hBufTotal(hBufLeftParsCalorimeter.size());
+        void writeToHDF5File(uint32_t currentStep)
+        {
+            splash::SerialDataCollector hdf5DataFile(1);
+            splash::DataCollector::FileCreationAttr fAttr;
 
-        hBufLeftParsCalorimeter = *this->dBufLeftParsCalorimeter;
+            splash::DataCollector::initFileCreationAttr(fAttr);
 
-        /* mpi reduce */
-        (*this->allGPU_reduce)(hBufTotal, hBufLeftParsCalorimeter, pmacc::algorithm::functor::Add{});
-        if(!this->allGPU_reduce->root())
-            return;
+            std::stringstream filename;
+            filename << this->foldername << "/" << filenamePrefix << "_" << currentStep;
 
-        splash::SerialDataCollector hdf5DataFile(1);
-        splash::DataCollector::FileCreationAttr fAttr;
+            hdf5DataFile.open(filename.str().c_str(), fAttr);
 
-        splash::DataCollector::initFileCreationAttr(fAttr);
+            typename PICToSplash<float_X>::type SplashTypeX;
+            typename PICToSplash<float_64>::type SplashType64;
+            typename PICToSplash<bool>::type SplashTypeBool;
 
-        std::stringstream filename;
-        filename << checkpointDirectory << "/" << ( this->foldername + "/" + filenamePrefix ) << "_" << currentStep;
+            splash::Dimensions bufferSize(
+                this->hBufTotalCalorimeter->size().x(),
+                this->hBufTotalCalorimeter->size().y(),
+                this->hBufTotalCalorimeter->size().z());
 
-        hdf5DataFile.open(filename.str().c_str(), fAttr);
+            hdf5DataFile.write(
+                currentStep,
+                SplashTypeX,
+                this->numBinsEnergy == 1 ? DIM2 : DIM3,
+                splash::Selection(bufferSize),
+                "calorimeter",
+                &(*this->hBufTotalCalorimeter->origin()));
 
-        typename PICToSplash<float_X>::type SplashTypeX;
+            const float_64 unitSI = particles::TYPICAL_NUM_PARTICLES_PER_MACROPARTICLE * UNIT_ENERGY;
 
-        splash::Dimensions bufferSize(hBufTotal.size().x(),
-                                      hBufTotal.size().y(),
-                                      hBufTotal.size().z());
+            hdf5DataFile.writeAttribute(currentStep, SplashType64, "calorimeter", "unitSI", &unitSI);
 
-        /* if there is only one energy bin, omit the energy axis */
-        uint32_t dimension = this->numBinsEnergy == 1 ? DIM2 : DIM3;
-        hdf5DataFile.write(currentStep,
-                           SplashTypeX,
-                           dimension,
-                           splash::Selection(bufferSize),
-                           this->leftParticlesDatasetName.c_str(),
-                           &(*hBufTotal.origin()));
+            hdf5DataFile.writeAttribute(currentStep, SplashType64, "calorimeter", "posYaw[deg]", &posYaw_deg);
 
-        hdf5DataFile.close();
-    }
+            hdf5DataFile.writeAttribute(currentStep, SplashType64, "calorimeter", "posPitch[deg]", &posPitch_deg);
 
-private:
-    void initPlugin()
-    {
-        namespace pm = pmacc::math;
+            hdf5DataFile.writeAttribute(currentStep, SplashTypeX, "calorimeter", "maxYaw[deg]", &this->maxYaw_deg);
 
-        if(!(this->openingYaw_deg > float_X(0.0) && this->openingYaw_deg <= float_X(360.0)))
-        {
-            std::stringstream msg;
-            msg << "[Plugin] [" << m_help->getOptionPrefix()
-                << "] openingYaw has to be within (0, 360]."
-                << std::endl;
-            throw std::runtime_error(msg.str());
-        }
-        if(!(this->openingPitch_deg > float_X(0.0) && this->openingPitch_deg <= float_X(180.0)))
-        {
-            std::stringstream msg;
-            msg << "[Plugin] [" << m_help->getOptionPrefix()
-                << "] openingPitch has to be within (0, 180]."
-                << std::endl;
-            throw std::runtime_error(msg.str());
-        }
-        if(this->minEnergy < float_X(0.0))
-        {
-            std::stringstream msg;
-            msg << "[Plugin] [" << m_help->getOptionPrefix()
-                << "] minEnergy can not be negative."
-                << std::endl;
-            throw std::runtime_error(msg.str());
-        }
-        if(this->logScale && this->minEnergy == float_X(0.0))
-        {
-            std::stringstream msg;
-            msg << "[Plugin] [" << m_help->getOptionPrefix()
-                << "] minEnergy can not be zero in logarithmic scaling."
-                << std::endl;
-            throw std::runtime_error(msg.str());
-        }
-        if(this->numBinsEnergy > 1 && this->maxEnergy <= this->minEnergy)
-        {
-            std::stringstream msg;
-            msg << "[Plugin] [" << m_help->getOptionPrefix()
-                << "] minEnergy has to be less than maxEnergy."
-                << std::endl;
-            throw std::runtime_error(msg.str());
-        }
+            hdf5DataFile.writeAttribute(currentStep, SplashTypeX, "calorimeter", "maxPitch[deg]", &this->maxPitch_deg);
 
-        this->maxYaw_deg = float_X(0.5) * this->openingYaw_deg;
-        this->maxPitch_deg = float_X(0.5) * this->openingPitch_deg;
-        /* convert units */
-        const float_64 minEnergy_SI = this->minEnergy * UNITCONV_keV_to_Joule;
-        const float_64 maxEnergy_SI = this->maxEnergy * UNITCONV_keV_to_Joule;
-        this->minEnergy = minEnergy_SI / UNIT_ENERGY;
-        this->maxEnergy = maxEnergy_SI / UNIT_ENERGY;
-
-        /* allocate memory buffers */
-        this->dBufCalorimeter = new DBufCalorimeter(this->numBinsYaw, this->numBinsPitch, this->numBinsEnergy);
-        this->dBufLeftParsCalorimeter = new DBufCalorimeter(this->dBufCalorimeter->size());
-        this->hBufCalorimeter = new HBufCalorimeter(this->dBufCalorimeter->size());
-        this->hBufTotalCalorimeter = new HBufCalorimeter(this->dBufCalorimeter->size());
-
-        /* fill calorimeter for left particles with zero */
-        this->dBufLeftParsCalorimeter->assign(float_X(0.0));
-
-        /* create mpi reduce algorithm */
-        pmacc::GridController<simDim>& con = pmacc::Environment<simDim>::get().GridController();
-        pm::Size_t<simDim> gpuDim = (pm::Size_t<simDim>)con.getGpuNodes();
-        zone::SphericZone<simDim> zone_allGPUs(gpuDim);
-        this->allGPU_reduce = AllGPU_reduce(new pmacc::algorithm::mpi::Reduce<simDim>(zone_allGPUs));
-
-        /* calculate rotated calorimeter frame from posYaw_deg and posPitch_deg */
-        constexpr float_64 radsInDegree = pmacc::algorithms::math::Pi<float_64>::value / float_64(180.0);
-        const float_64 posYaw_rad = this->posYaw_deg * radsInDegree;
-        const float_64 posPitch_rad = this->posPitch_deg * radsInDegree;
-        this->calorimeterFrameVecY = float3_X(math::sin(posYaw_rad) * math::cos(posPitch_rad),
-                                              math::cos(posYaw_rad) * math::cos(posPitch_rad),
-                                              math::sin(posPitch_rad));
-        /* If the y-axis is pointing exactly up- or downwards we need to define the x-axis manually */
-        if(math::abs(this->calorimeterFrameVecY.z()) == float_X(1.0))
-        {
-            this->calorimeterFrameVecX = float3_X(1.0, 0.0, 0.0);
-        }
-        else
-        {
-            /* choose `calorimeterFrameVecX` so that the roll is zero. */
-            const float3_X vecUp(0.0, 0.0, -1.0);
-            this->calorimeterFrameVecX = math::cross(vecUp, this->calorimeterFrameVecY);
-            /* normalize vector */
-            this->calorimeterFrameVecX /= math::abs(this->calorimeterFrameVecX);
-        }
-        this->calorimeterFrameVecZ = math::cross(this->calorimeterFrameVecX, this->calorimeterFrameVecY);
-
-        /* create calorimeter functor instance */
-        this->calorimeterFunctor = MyCalorimeterFunctorPtr(new MyCalorimeterFunctor(
-            this->maxYaw_deg * radsInDegree,
-            this->maxPitch_deg * radsInDegree,
-            this->numBinsYaw,
-            this->numBinsPitch,
-            this->numBinsEnergy,
-            this->logScale ? math::log10(this->minEnergy) : this->minEnergy,
-            this->logScale ? math::log10(this->maxEnergy) : this->maxEnergy,
-            this->logScale,
-            this->calorimeterFrameVecX,
-            this->calorimeterFrameVecY,
-            this->calorimeterFrameVecZ));
-
-        /* create folder for hdf5 files*/
-        Environment<simDim>::get().Filesystem().createDirectoryWithPermissions(this->foldername);
-
-        // set how often the plugin should be executed while PIConGPU is running
-        Environment<>::get( ).PluginConnector( ).setNotificationPeriod(
-            this,
-            m_help->notifyPeriod.get( m_id )
-        );
-    }
-
-    void writeToHDF5File(uint32_t currentStep)
-    {
-        splash::SerialDataCollector hdf5DataFile(1);
-        splash::DataCollector::FileCreationAttr fAttr;
-
-        splash::DataCollector::initFileCreationAttr(fAttr);
-
-        std::stringstream filename;
-        filename << this->foldername << "/" << filenamePrefix << "_" << currentStep;
-
-        hdf5DataFile.open(filename.str().c_str(), fAttr);
-
-        typename PICToSplash<float_X>::type SplashTypeX;
-        typename PICToSplash<float_64>::type SplashType64;
-        typename PICToSplash<bool>::type SplashTypeBool;
-
-        splash::Dimensions bufferSize(this->hBufTotalCalorimeter->size().x(),
-                                      this->hBufTotalCalorimeter->size().y(),
-                                      this->hBufTotalCalorimeter->size().z());
-
-        hdf5DataFile.write(currentStep,
-                           SplashTypeX,
-                           this->numBinsEnergy == 1 ? DIM2 : DIM3,
-                           splash::Selection(bufferSize),
-                           "calorimeter",
-                           &(*this->hBufTotalCalorimeter->origin()));
-
-        const float_64 unitSI = particles::TYPICAL_NUM_PARTICLES_PER_MACROPARTICLE * UNIT_ENERGY;
-
-        hdf5DataFile.writeAttribute(currentStep,
-                                    SplashType64,
-                                    "calorimeter",
-                                    "unitSI",
-                                    &unitSI);
-
-        hdf5DataFile.writeAttribute(currentStep,
-                                    SplashType64,
-                                    "calorimeter",
-                                    "posYaw[deg]",
-                                    &posYaw_deg);
-
-        hdf5DataFile.writeAttribute(currentStep,
-                                    SplashType64,
-                                    "calorimeter",
-                                    "posPitch[deg]",
-                                    &posPitch_deg);
-
-        hdf5DataFile.writeAttribute(currentStep,
-                                    SplashTypeX,
-                                    "calorimeter",
-                                    "maxYaw[deg]",
-                                    &this->maxYaw_deg);
-
-        hdf5DataFile.writeAttribute(currentStep,
-                                    SplashTypeX,
-                                    "calorimeter",
-                                    "maxPitch[deg]",
-                                    &this->maxPitch_deg);
-
-        if(this->numBinsEnergy > 1)
-        {
-            const float_64 minEnergy_SI = this->minEnergy * UNIT_ENERGY;
-            const float_64 maxEnergy_SI = this->maxEnergy * UNIT_ENERGY;
-            const float_64 minEnergy_keV = minEnergy_SI * UNITCONV_Joule_to_keV;
-            const float_64 maxEnergy_keV = maxEnergy_SI * UNITCONV_Joule_to_keV;
-
-            hdf5DataFile.writeAttribute(currentStep,
-                                        SplashType64,
-                                        "calorimeter",
-                                        "minEnergy[keV]",
-                                        &minEnergy_keV);
-
-            hdf5DataFile.writeAttribute(currentStep,
-                                        SplashType64,
-                                        "calorimeter",
-                                        "maxEnergy[keV]",
-                                        &maxEnergy_keV);
-
-            hdf5DataFile.writeAttribute(currentStep,
-                                        SplashTypeBool,
-                                        "calorimeter",
-                                        "logScale",
-                                        &this->logScale);
-        }
+            if(this->numBinsEnergy > 1)
+            {
+                const float_64 minEnergy_SI = this->minEnergy * UNIT_ENERGY;
+                const float_64 maxEnergy_SI = this->maxEnergy * UNIT_ENERGY;
+                const float_64 minEnergy_keV = minEnergy_SI * UNITCONV_Joule_to_keV;
+                const float_64 maxEnergy_keV = maxEnergy_SI * UNITCONV_Joule_to_keV;
 
-        hdf5DataFile.close();
-    }
+                hdf5DataFile
+                    .writeAttribute(currentStep, SplashType64, "calorimeter", "minEnergy[keV]", &minEnergy_keV);
 
-public:
+                hdf5DataFile
+                    .writeAttribute(currentStep, SplashType64, "calorimeter", "maxEnergy[keV]", &maxEnergy_keV);
 
-    struct Help : public plugins::multi::IHelp
-    {
+                hdf5DataFile.writeAttribute(currentStep, SplashTypeBool, "calorimeter", "logScale", &this->logScale);
+            }
 
-        /** creates an instance of ISlave
-         *
-         * @tparam T_Slave type of the interface implementation (must inherit from ISlave)
-         * @param help plugin defined help
-         * @param id index of the plugin, range: [0;help->getNumPlugins())
-         */
-        std::shared_ptr< ISlave > create(
-            std::shared_ptr< IHelp > & help,
-            size_t const id,
-            MappingDesc* cellDescription
-        )
-        {
-            return std::shared_ptr< ISlave >(
-                new ParticleCalorimeter< ParticlesType >(
-                    help,
-                    id,
-                    cellDescription
-                )
-            );
+            hdf5DataFile.close();
         }
 
-        // find all valid filter for the current used species
-        using EligibleFilters = typename MakeSeqFromNestedSeq<
-            typename bmpl::transform<
-                particles::filter::AllParticleFilters,
-                particles::traits::GenerateSolversIfSpeciesEligible<
-                    bmpl::_1,
-                    ParticlesType
-                >
-            >::type
-        >::type;
-
-        //! periodicity of computing the particle energy
-        plugins::multi::Option< std::string > notifyPeriod = {
-            "period",
-            "enable plugin [for each n-th step]"
-        };
-        plugins::multi::Option< std::string > fileName = {
-            "file",
-            "output filename (prefix)"
-        };
-        plugins::multi::Option< std::string > filter = {
-            "filter",
-            "particle filter: "
-        };
-        plugins::multi::Option< uint32_t > numBinsYaw = {
-            "numBinsYaw",
-            "number of bins for angle yaw.",
-            64
-        };
-        plugins::multi::Option< uint32_t > numBinsPitch = {
-            "numBinsPitch",
-            "number of bins for angle pitch.",
-            64
-        };
-        plugins::multi::Option< uint32_t > numBinsEnergy = {
-            "numBinsEnergy",
-            "number of bins for the energy spectrum. Disabled by default.",
-            1
-        };
-        plugins::multi::Option< float_X > minEnergy = {
-            "minEnergy",
-            "minimal detectable energy in keV.",
-            0.0
-        };
-        plugins::multi::Option< float_X > maxEnergy = {
-            "maxEnergy",
-            "maximal detectable energy in keV.",
-            1.0e3
-        };
-        plugins::multi::Option< uint32_t > logScale = {
-            "logScale",
-            "enable logarithmic energy scale.",
-            0
-        };
-        plugins::multi::Option< float_X > openingYaw = {
-            "openingYaw",
-            "opening angle yaw in degrees. 0 <= x <= 360.",
-            360.0
-        };
-        plugins::multi::Option< float_X > openingPitch = {
-            "openingPitch",
-            "opening angle pitch in degrees. 0 <= x <= 180.",
-            180.0
-        };
-        plugins::multi::Option< float_64 > posYaw = {
-            "posYaw",
-            "yaw coordinate of calorimeter position in degrees. Defaults to +y direction.",
-            0.0
-        };
-        plugins::multi::Option< float_64 > posPitch = {
-            "posPitch",
-            "pitch coordinate of calorimeter position in degrees. Defaults to +y direction.",
-            0.0
-        };
+        void writeToOpenPMDFile(uint32_t currentStep)
+        {
+            std::stringstream filename;
+            filename << this->foldername << "/" << filenamePrefix << "_%T." << filenameExtension;
+            ::openPMD::Series series(filename.str(), ::openPMD::Access::CREATE);
 
-        //! string list with all possible particle filters
-        std::string concatenatedFilterNames;
-        std::vector< std::string > allowedFilters;
+            auto twoDimensional = [this](auto vector) -> decltype(vector) {
+                if(this->numBinsEnergy == 1)
+                {
+                    vector.erase(vector.begin());
+                }
+                return vector;
+            };
+
+            auto offset = twoDimensional(::openPMD::Offset{0, 0, 0});
+
+            auto extent = twoDimensional(::openPMD::Extent{
+                this->hBufTotalCalorimeter->size().z(),
+                this->hBufTotalCalorimeter->size().y(),
+                this->hBufTotalCalorimeter->size().x()});
+
+            auto mesh = series.iterations[currentStep].meshes["calorimeter"];
+            auto calorimeter = mesh[::openPMD::RecordComponent::SCALAR];
+            calorimeter.resetDataset({::openPMD::determineDatatype<float_X>(), extent});
+            calorimeter.storeChunk(
+                std::shared_ptr<float_X>{&(*this->hBufTotalCalorimeter->origin()), [](auto const*) {}},
+                std::move(offset),
+                std::move(extent));
+
+            // Write attributes
+
+            constexpr float_64 unitSI = particles::TYPICAL_NUM_PARTICLES_PER_MACROPARTICLE * UNIT_ENERGY;
+            calorimeter.setAttribute<float_X>("maxPitch[deg]", maxPitch_deg);
+            calorimeter.setAttribute<float_X>("maxYaw[deg]", maxYaw_deg);
+            calorimeter.setAttribute<float_64>("posPitch[deg]", posPitch_deg);
+            calorimeter.setAttribute<float_64>("posYaw[deg]", posYaw_deg);
+            calorimeter.setPosition(twoDimensional(std::vector<double>{0.5, 0.5, 0.5}));
+            calorimeter.setUnitSI(unitSI);
+            mesh.setAxisLabels(twoDimensional(std::vector<std::string>{"z", "y", "x"}));
+            mesh.setGridGlobalOffset(twoDimensional(std::vector<double>{0., 0., 0.})); // @todo
+            mesh.setGridSpacing(twoDimensional(std::vector<double>{1., 1., 1.})); // @todo
+            mesh.setGridUnitSI(1.); // @todo
+            mesh.setUnitDimension({/* @todo */});
+
+            if(this->numBinsEnergy > 1)
+            {
+                const float_64 minEnergy_SI = this->minEnergy * UNIT_ENERGY;
+                const float_64 maxEnergy_SI = this->maxEnergy * UNIT_ENERGY;
+                const float_64 minEnergy_keV = minEnergy_SI * UNITCONV_Joule_to_keV;
+                const float_64 maxEnergy_keV = maxEnergy_SI * UNITCONV_Joule_to_keV;
+
+                calorimeter.setAttribute<float_64>("minEnergy[keV]", minEnergy_keV);
+                calorimeter.setAttribute<float_64>("maxEnergy[keV]", maxEnergy_keV);
+                calorimeter.setAttribute<bool>("logScale", this->logScale);
+            }
 
-        ///! method used by plugin controller to get --help description
-        void registerHelp(
-            boost::program_options::options_description & desc,
-            std::string const & masterPrefix = std::string{ }
-        )
-        {
-            meta::ForEach<
-                EligibleFilters,
-                plugins::misc::AppendName< bmpl::_1 >
-            > getEligibleFilterNames;
-            getEligibleFilterNames( allowedFilters );
-
-            concatenatedFilterNames = plugins::misc::concatenateToString(
-                allowedFilters,
-                ", "
-            );
-
-            notifyPeriod.registerHelp(
-                desc,
-                masterPrefix + prefix
-            );
-            fileName.registerHelp(
-                desc,
-                masterPrefix + prefix
-            );
-            filter.registerHelp(
-                desc,
-                masterPrefix + prefix,
-                std::string( "[" ) + concatenatedFilterNames + "]"
-            );
-            numBinsYaw.registerHelp(
-                desc,
-                masterPrefix + prefix
-            );
-            numBinsPitch.registerHelp(
-                desc,
-                masterPrefix + prefix
-            );
-            numBinsEnergy.registerHelp(
-                desc,
-                masterPrefix + prefix
-            );
-            minEnergy.registerHelp(
-                desc,
-                masterPrefix + prefix
-            );
-            maxEnergy.registerHelp(
-                desc,
-                masterPrefix + prefix
-            );
-            logScale.registerHelp(
-                desc,
-                masterPrefix + prefix
-            );
-            openingYaw.registerHelp(
-                desc,
-                masterPrefix + prefix
-            );
-            openingPitch.registerHelp(
-                desc,
-                masterPrefix + prefix
-            );
-            posYaw.registerHelp(
-                desc,
-                masterPrefix + prefix
-            );
-            posPitch.registerHelp(
-                desc,
-                masterPrefix + prefix
-            );
+            series.iterations[currentStep].close();
         }
 
-        void expandHelp(
-            boost::program_options::options_description & desc,
-            std::string const & masterPrefix = std::string{ }
-        )
+    public:
+        struct Help : public plugins::multi::IHelp
         {
-        }
+            /** creates a ParticleCalorimeter instance and returns it through the ISlave interface
+             *
+             * @param help plugin defined help
+             * @param id index of the plugin, range: [0;help->getNumPlugins())
+             * @param cellDescription mapping description, forwarded to the ParticleCalorimeter constructor
+             */
+            std::shared_ptr<ISlave> create(std::shared_ptr<IHelp>& help, size_t const id, MappingDesc* cellDescription)
+            {
+                return std::shared_ptr<ISlave>(new ParticleCalorimeter<ParticlesType>(help, id, cellDescription));
+            }
 
+            // collect all particle filters that are eligible for the currently used species
+            using EligibleFilters = typename MakeSeqFromNestedSeq<typename bmpl::transform<
+                particles::filter::AllParticleFilters,
+                particles::traits::GenerateSolversIfSpeciesEligible<bmpl::_1, ParticlesType>>::type>::type;
+
+            //! notification period; its number of occurrences defines the number of plugin instances
+            plugins::multi::Option<std::string> notifyPeriod = {"period", "enable plugin [for each n-th step]"};
+            plugins::multi::Option<std::string> fileName = {"file", "output filename (prefix)"};
+            plugins::multi::Option<std::string> filter = {"filter", "particle filter: "};
+            plugins::multi::Option<std::string> extension = {"ext", "openPMD filename extension", "h5"};
+            plugins::multi::Option<uint32_t> numBinsYaw = {"numBinsYaw", "number of bins for angle yaw.", 64};
+            plugins::multi::Option<uint32_t> numBinsPitch = {"numBinsPitch", "number of bins for angle pitch.", 64};
+            plugins::multi::Option<uint32_t> numBinsEnergy
+                = {"numBinsEnergy", "number of bins for the energy spectrum. Disabled by default.", 1};
+            plugins::multi::Option<float_X> minEnergy = {"minEnergy", "minimal detectable energy in keV.", 0.0};
+            plugins::multi::Option<float_X> maxEnergy = {"maxEnergy", "maximal detectable energy in keV.", 1.0e3};
+            plugins::multi::Option<uint32_t> logScale = {"logScale", "enable logarithmic energy scale.", 0};
+            plugins::multi::Option<float_X> openingYaw
+                = {"openingYaw", "opening angle yaw in degrees. 0 <= x <= 360.", 360.0};
+            plugins::multi::Option<float_X> openingPitch
+                = {"openingPitch", "opening angle pitch in degrees. 0 <= x <= 180.", 180.0};
+            plugins::multi::Option<float_64> posYaw
+                = {"posYaw", "yaw coordinate of calorimeter position in degrees. Defaults to +y direction.", 0.0};
+            plugins::multi::Option<float_64> posPitch
+                = {"posPitch", "pitch coordinate of calorimeter position in degrees. Defaults to +y direction.", 0.0};
+
+            //! string list with all possible particle filters
+            std::string concatenatedFilterNames;
+            std::vector<std::string> allowedFilters;
+
+            //! registers all plugin options in desc; used by the plugin controller for the --help description
+            void registerHelp(
+                boost::program_options::options_description& desc,
+                std::string const& masterPrefix = std::string{})
+            {
+                meta::ForEach<EligibleFilters, plugins::misc::AppendName<bmpl::_1>> getEligibleFilterNames;
+                getEligibleFilterNames(allowedFilters);
+
+                concatenatedFilterNames = plugins::misc::concatenateToString(allowedFilters, ", ");
+
+                notifyPeriod.registerHelp(desc, masterPrefix + prefix);
+                fileName.registerHelp(desc, masterPrefix + prefix);
+                extension.registerHelp(desc, masterPrefix + prefix);
+                filter.registerHelp(desc, masterPrefix + prefix, std::string("[") + concatenatedFilterNames + "]");
+                numBinsYaw.registerHelp(desc, masterPrefix + prefix);
+                numBinsPitch.registerHelp(desc, masterPrefix + prefix);
+                numBinsEnergy.registerHelp(desc, masterPrefix + prefix);
+                minEnergy.registerHelp(desc, masterPrefix + prefix);
+                maxEnergy.registerHelp(desc, masterPrefix + prefix);
+                logScale.registerHelp(desc, masterPrefix + prefix);
+                openingYaw.registerHelp(desc, masterPrefix + prefix);
+                openingPitch.registerHelp(desc, masterPrefix + prefix);
+                posYaw.registerHelp(desc, masterPrefix + prefix);
+                posPitch.registerHelp(desc, masterPrefix + prefix);
+            }
 
-        void validateOptions()
-        {
-            if( notifyPeriod.size() != fileName.size() )
-                throw std::runtime_error( name + ": parameter fileName and period are not used the same number of times" );
+            void expandHelp(
+                boost::program_options::options_description& desc,
+                std::string const& masterPrefix = std::string{})
+            {
+            }
 
-            if( notifyPeriod.size() != filter.size() )
-                    throw std::runtime_error( name + ": parameter filter and period are not used the same number of times" );
 
-            // check if user passed filter name are valid
-            for( auto const & filterName : filter)
+            void validateOptions()
             {
-                if(
-                    std::find(
-                        allowedFilters.begin(),
-                        allowedFilters.end(),
-                        filterName
-                    ) == allowedFilters.end()
-                )
+                if(notifyPeriod.size() != fileName.size())
+                    throw std::runtime_error(
+                        name + ": parameter fileName and period are not used the same number of times");
+
+                if(notifyPeriod.size() != filter.size())
+                    throw std::runtime_error(
+                        name + ": parameter filter and period are not used the same number of times");
+
+                // check that every user-provided filter name is one of the allowed filters
+                for(auto const& filterName : filter)
                 {
-                    throw std::runtime_error( name + ": unknown filter '" + filterName + "'" );
+                    if(std::find(allowedFilters.begin(), allowedFilters.end(), filterName) == allowedFilters.end())
+                    {
+                        throw std::runtime_error(name + ": unknown filter '" + filterName + "'");
+                    }
                 }
             }
+
+            size_t getNumPlugins() const
+            {
+                return notifyPeriod.size();
+            }
+
+            std::string getDescription() const
+            {
+                return description;
+            }
+
+            std::string getOptionPrefix() const
+            {
+                return prefix;
+            }
+
+            std::string getName() const
+            {
+                return name;
+            }
+
+            std::string const name = "ParticleCalorimeter";
+            //! short description of the plugin
+            std::string const description = "(virtually) propagates and collects particles to infinite distance";
+            //! prefix used for command line arguments
+            std::string const prefix = ParticlesType::FrameType::getName() + std::string("_calorimeter");
+        };
+
+        static std::shared_ptr<plugins::multi::IHelp> getHelp()
+        {
+            return std::shared_ptr<plugins::multi::IHelp>(new Help{});
         }
 
-        size_t getNumPlugins() const
+        ParticleCalorimeter(
+            std::shared_ptr<plugins::multi::IHelp>& help,
+            size_t const id,
+            MappingDesc* cellDescription)
+            : m_help(std::static_pointer_cast<Help>(help))
+            , m_id(id)
+            , m_cellDescription(cellDescription)
+            , leftParticlesDatasetName("calorimeterLeftParticles")
+            , dBufCalorimeter(nullptr)
+            , dBufLeftParsCalorimeter(nullptr)
+            , hBufCalorimeter(nullptr)
+            , hBufTotalCalorimeter(nullptr)
         {
-            return notifyPeriod.size();
+            foldername = m_help->getOptionPrefix() + "/" + m_help->filter.get(m_id);
+            filenamePrefix
+                = m_help->getOptionPrefix() + "_" + m_help->fileName.get(m_id) + "_" + m_help->filter.get(m_id);
+            filenameExtension = m_help->extension.get(m_id);
+            numBinsYaw = m_help->numBinsYaw.get(m_id);
+            numBinsPitch = m_help->numBinsPitch.get(m_id);
+            numBinsEnergy = m_help->numBinsEnergy.get(m_id);
+            minEnergy = m_help->minEnergy.get(m_id);
+            maxEnergy = m_help->maxEnergy.get(m_id);
+            logScale = m_help->logScale.get(m_id);
+            openingYaw_deg = m_help->openingYaw.get(m_id);
+            openingPitch_deg = m_help->openingPitch.get(m_id);
+            posYaw_deg = m_help->posYaw.get(m_id);
+            posPitch_deg = m_help->posPitch.get(m_id);
+
+            initPlugin();
         }
 
-        std::string getDescription() const
+        virtual ~ParticleCalorimeter()
         {
-            return description;
+            __delete(this->dBufCalorimeter);
+            __delete(this->dBufLeftParsCalorimeter);
+            __delete(this->hBufCalorimeter);
+            __delete(this->hBufTotalCalorimeter);
         }
 
-        std::string getOptionPrefix() const
+
+        void notify(uint32_t currentStep)
         {
-            return prefix;
+            /* initialize the calorimeter with the particles which have already left the volume */
+            *this->dBufCalorimeter = *this->dBufLeftParsCalorimeter;
+
+            /* data is written to dBufCalorimeter */
+            this->calorimeterFunctor->setCalorimeterCursor(this->dBufCalorimeter->origin());
+
+            /* fetch the particle species from the data connector */
+            DataConnector& dc = Environment<>::get().DataConnector();
+            auto particles = dc.get<ParticlesType>(ParticlesType::FrameType::getName(), true);
+
+            AreaMapping<CORE + BORDER, MappingDesc> const mapper(*this->m_cellDescription);
+            auto const grid = mapper.getGridDim();
+
+            constexpr uint32_t numWorkers
+                = pmacc::traits::GetNumWorkers<pmacc::math::CT::volume<SuperCellSize>::type::value>::value;
+
+            auto kernel = PMACC_KERNEL(KernelParticleCalorimeter<numWorkers>{})(grid, numWorkers);
+            auto unaryKernel = std::bind(
+                kernel,
+                particles->getDeviceParticlesBox(),
+                *this->calorimeterFunctor,
+                mapper,
+                std::placeholders::_1);
+
+            meta::ForEach<typename Help::EligibleFilters, plugins::misc::ExecuteIfNameIsEqual<bmpl::_1>>{}(
+                m_help->filter.get(m_id),
+                currentStep,
+                unaryKernel);
+
+            dc.releaseData(ParticlesType::FrameType::getName());
+
+            /* copy the device buffer to the host */
+            *this->hBufCalorimeter = *this->dBufCalorimeter;
+
+            /* add up the host buffers of all ranks; only the root rank continues and writes the output */
+            (*this->allGPU_reduce)(
+                *this->hBufTotalCalorimeter,
+                *this->hBufCalorimeter,
+                pmacc::algorithm::functor::Add{});
+            if(!this->allGPU_reduce->root())
+                return;
+
+            this->writeToHDF5File(currentStep);
+            this->writeToOpenPMDFile(currentStep);
         }
 
-        std::string getName() const
+        void onParticleLeave(const std::string& speciesName, int32_t direction)
         {
-            return name;
+            if(this->notifyPeriod.empty()) // NOTE(review): member not in visible list (m_help->notifyPeriod?) - confirm
+                return;
+            if(speciesName != ParticlesType::FrameType::getName())
+                return;
+
+            /* data is written to dBufLeftParsCalorimeter */
+            this->calorimeterFunctor->setCalorimeterCursor(this->dBufLeftParsCalorimeter->origin());
+            // NOTE(review): notify() uses this->m_cellDescription; confirm that cellDescription exists
+            ExchangeMapping<GUARD, MappingDesc> mapper(*this->cellDescription, direction);
+            auto grid = mapper.getGridDim();
+
+            DataConnector& dc = Environment<>::get().DataConnector();
+            auto particles = dc.get<ParticlesType>(speciesName, true);
+
+            constexpr uint32_t numWorkers
+                = pmacc::traits::GetNumWorkers<pmacc::math::CT::volume<SuperCellSize>::type::value>::value;
+
+            auto kernel = PMACC_KERNEL(KernelParticleCalorimeter<numWorkers>{})(grid, numWorkers);
+            auto unaryKernel = std::bind(
+                kernel,
+                particles->getDeviceParticlesBox(),
+                (MyCalorimeterFunctor) * this->calorimeterFunctor,
+                mapper,
+                std::placeholders::_1);
+
+            meta::ForEach<typename Help::EligibleFilters, plugins::misc::ExecuteIfNameIsEqual<bmpl::_1>>{}(
+                m_help->filter.get(m_id),
+                Environment<>::get().SimulationDescription().getCurrentStep(),
+                unaryKernel);
+
+            dc.releaseData(speciesName);
         }
 
-        std::string const name = "ParticleCalorimeter";
-        //! short description of the plugin
-        std::string const description = "(virtually) propagates and collects particles to infinite distance";
-        //! prefix used for command line arguments
-        std::string const prefix = ParticlesType::FrameType::getName( ) + std::string( "_calorimeter" );
+    private:
+        std::shared_ptr<Help> m_help;
+        size_t m_id;
+        std::string foldername;
+        std::string filenamePrefix;
+        std::string filenameExtension;
+        MappingDesc* m_cellDescription;
+        std::ofstream outFile;
+        const std::string leftParticlesDatasetName;
+
+        uint32_t numBinsYaw;
+        uint32_t numBinsPitch;
+        uint32_t numBinsEnergy;
+        float_X minEnergy;
+        float_X maxEnergy;
+        bool logScale;
+        float_X openingYaw_deg;
+        float_X openingPitch_deg;
+        float_X maxYaw_deg;
+        float_X maxPitch_deg;
+
+        float_64 posYaw_deg;
+        float_64 posPitch_deg;
+
+        //! Rotated calorimeter frame
+        float3_X calorimeterFrameVecX;
+        float3_X calorimeterFrameVecY;
+        float3_X calorimeterFrameVecZ;
+
+        //! device calorimeter buffer for a single gpu
+        DBufCalorimeter* dBufCalorimeter;
+        //! device calorimeter buffer for all particles which have left the simulation volume
+        DBufCalorimeter* dBufLeftParsCalorimeter;
+        //! host calorimeter buffer for a single mpi rank
+        HBufCalorimeter* hBufCalorimeter;
+        //! host calorimeter buffer for summation of all mpi ranks
+        HBufCalorimeter* hBufTotalCalorimeter;
     };
 
-    static std::shared_ptr< plugins::multi::IHelp > getHelp()
-    {
-        return std::shared_ptr< plugins::multi::IHelp >( new Help{ } );
-    }
-
-    ParticleCalorimeter(
-        std::shared_ptr< plugins::multi::IHelp > & help,
-        size_t const id,
-        MappingDesc* cellDescription
-    ) :
-        m_help( std::static_pointer_cast< Help >(help) ),
-        m_id( id ),
-        m_cellDescription( cellDescription ),
-        leftParticlesDatasetName("calorimeterLeftParticles"),
-        dBufCalorimeter(nullptr),
-        dBufLeftParsCalorimeter(nullptr),
-        hBufCalorimeter(nullptr),
-        hBufTotalCalorimeter(nullptr)
+    namespace particles
     {
-        foldername = m_help->getOptionPrefix() + "/" + m_help->filter.get( m_id );
-        filenamePrefix = m_help->getOptionPrefix() + "_" + m_help->fileName.get( m_id ) + "_" + m_help->filter.get( m_id );
-        numBinsYaw = m_help->numBinsYaw.get( m_id );
-        numBinsPitch = m_help->numBinsPitch.get( m_id );
-        numBinsEnergy = m_help->numBinsEnergy.get( m_id );
-        minEnergy = m_help->minEnergy.get( m_id );
-        maxEnergy = m_help->maxEnergy.get( m_id );
-        logScale = m_help->logScale.get( m_id );
-        openingYaw_deg = m_help->openingYaw.get( m_id );
-        openingPitch_deg = m_help->openingPitch.get( m_id );
-        posYaw_deg = m_help->posYaw.get( m_id );
-        posPitch_deg = m_help->posPitch.get( m_id );
-
-        initPlugin();
-    }
-
-    virtual ~ParticleCalorimeter()
-    {
-        __delete(this->dBufCalorimeter);
-        __delete(this->dBufLeftParsCalorimeter);
-        __delete(this->hBufCalorimeter);
-        __delete(this->hBufTotalCalorimeter);
-    }
+        namespace traits
+        {
+            template<typename T_Species, typename T_UnspecifiedSpecies>
+            struct SpeciesEligibleForSolver<T_Species, ParticleCalorimeter<T_UnspecifiedSpecies>>
+            {
+                using FrameType = typename T_Species::FrameType;
 
+                // this plugin needs at least the weighting and momentum attributes
+                using RequiredIdentifiers = MakeSeq_t<weighting, momentum>;
 
-    void notify(uint32_t currentStep)
-    {
-        /* initialize calorimeter with already detected particles */
-        *this->dBufCalorimeter = *this->dBufLeftParsCalorimeter;
-
-        /* data is written to dBufCalorimeter */
-        this->calorimeterFunctor->setCalorimeterCursor(this->dBufCalorimeter->origin());
-
-        /* create kernel functor instance */
-        DataConnector &dc = Environment<>::get().DataConnector();
-        auto particles = dc.get< ParticlesType >( ParticlesType::FrameType::getName(), true );
-
-        AreaMapping<
-            CORE + BORDER,
-            MappingDesc
-        > const mapper( *this->m_cellDescription );
-        auto const grid = mapper.getGridDim();
-
-        constexpr uint32_t numWorkers = pmacc::traits::GetNumWorkers<
-            pmacc::math::CT::volume< SuperCellSize >::type::value
-        >::value;
-
-        auto kernel = PMACC_KERNEL( KernelParticleCalorimeter< numWorkers >{ } )(
-            grid,
-            numWorkers
-        );
-        auto unaryKernel = std::bind(
-            kernel,
-            particles->getDeviceParticlesBox( ),
-            *this->calorimeterFunctor,
-            mapper,
-            std::placeholders::_1
-        );
-
-        meta::ForEach<
-            typename Help::EligibleFilters,
-            plugins::misc::ExecuteIfNameIsEqual< bmpl::_1 >
-        >{ }(
-            m_help->filter.get( m_id ),
-            currentStep,
-            unaryKernel
-        );
-
-        dc.releaseData( ParticlesType::FrameType::getName() );
-
-        /* copy to host */
-        *this->hBufCalorimeter = *this->dBufCalorimeter;
-
-        /* mpi reduce */
-        (*this->allGPU_reduce)(*this->hBufTotalCalorimeter, *this->hBufCalorimeter, pmacc::algorithm::functor::Add{});
-        if(!this->allGPU_reduce->root())
-            return;
-
-        this->writeToHDF5File(currentStep);
-    }
-
-    void onParticleLeave(const std::string& speciesName, int32_t direction)
-    {
-        if(this->notifyPeriod.empty())
-            return;
-        if(speciesName != ParticlesType::FrameType::getName())
-            return;
-
-        /* data is written to dBufLeftParsCalorimeter */
-        this->calorimeterFunctor->setCalorimeterCursor(this->dBufLeftParsCalorimeter->origin());
-
-        ExchangeMapping<GUARD, MappingDesc> mapper(*this->cellDescription, direction);
-        auto grid = mapper.getGridDim();
-
-        DataConnector &dc = Environment<>::get().DataConnector();
-        auto particles = dc.get< ParticlesType >( speciesName, true );
-
-        constexpr uint32_t numWorkers = pmacc::traits::GetNumWorkers<
-            pmacc::math::CT::volume< SuperCellSize >::type::value
-        >::value;
-
-        auto kernel = PMACC_KERNEL( KernelParticleCalorimeter< numWorkers >{ } )(
-            grid,
-            numWorkers
-        );
-        auto unaryKernel = std::bind(
-            kernel,
-            particles->getDeviceParticlesBox( ),
-            (MyCalorimeterFunctor)*this->calorimeterFunctor,
-            mapper,
-            std::placeholders::_1
-        );
-
-        meta::ForEach<
-            typename Help::EligibleFilters,
-            plugins::misc::ExecuteIfNameIsEqual< bmpl::_1 >
-        >{ }(
-            m_help->filter.get( m_id ),
-            Environment<>::get().SimulationDescription().getCurrentStep(),
-            unaryKernel
-        );
-
-        dc.releaseData( speciesName );
-    }
-
-private:
-    std::shared_ptr< Help > m_help;
-    size_t m_id;
-    std::string foldername;
-    std::string filenamePrefix;
-    MappingDesc* m_cellDescription;
-    std::ofstream outFile;
-    const std::string leftParticlesDatasetName;
-
-    uint32_t numBinsYaw;
-    uint32_t numBinsPitch;
-    uint32_t numBinsEnergy;
-    float_X minEnergy;
-    float_X maxEnergy;
-    bool logScale;
-    float_X openingYaw_deg;
-    float_X openingPitch_deg;
-    float_X maxYaw_deg;
-    float_X maxPitch_deg;
-
-    float_64 posYaw_deg;
-    float_64 posPitch_deg;
-
-    //! Rotated calorimeter frame
-    float3_X calorimeterFrameVecX;
-    float3_X calorimeterFrameVecY;
-    float3_X calorimeterFrameVecZ;
-
-    //! device calorimeter buffer for a single gpu
-    DBufCalorimeter* dBufCalorimeter;
-    //! device calorimeter buffer for all particles which have left the simulation volume
-    DBufCalorimeter* dBufLeftParsCalorimeter;
-    //! host calorimeter buffer for a single mpi rank
-    HBufCalorimeter* hBufCalorimeter;
-    //! host calorimeter buffer for summation of all mpi ranks
-    HBufCalorimeter* hBufTotalCalorimeter;
-};
-
-namespace particles
-{
-namespace traits
-{
-    template<
-        typename T_Species,
-        typename T_UnspecifiedSpecies
-    >
-    struct SpeciesEligibleForSolver<
-        T_Species,
-        ParticleCalorimeter< T_UnspecifiedSpecies >
-    >
-    {
-        using FrameType = typename T_Species::FrameType;
-
-        // this plugin needs at least the weighting and momentum attributes
-        using RequiredIdentifiers = MakeSeq_t<
-            weighting,
-            momentum
-        >;
-
-        using SpeciesHasIdentifiers = typename pmacc::traits::HasIdentifiers<
-            FrameType,
-            RequiredIdentifiers
-        >::type;
-
-        // and also a mass ratio for energy calculation from momentum
-        using SpeciesHasFlags = typename pmacc::traits::HasFlag<
-            FrameType,
-            massRatio<>
-        >::type;
-
-        using type = typename bmpl::and_<
-            SpeciesHasIdentifiers,
-            SpeciesHasFlags
-        >;
-    };
-} // namespace traits
-} // namespace particles
+                using SpeciesHasIdentifiers =
+                    typename pmacc::traits::HasIdentifiers<FrameType, RequiredIdentifiers>::type;
+
+                // and also a mass ratio for energy calculation from momentum
+                using SpeciesHasFlags = typename pmacc::traits::HasFlag<FrameType, massRatio<>>::type;
+
+                using type = typename bmpl::and_<SpeciesHasIdentifiers, SpeciesHasFlags>;
+            };
+        } // namespace traits
+    } // namespace particles
 } // namespace picongpu
diff --git a/include/picongpu/plugins/particleCalorimeter/ParticleCalorimeter.kernel b/include/picongpu/plugins/particleCalorimeter/ParticleCalorimeter.kernel
index 5c880d6bea..fe0f4e17c4 100644
--- a/include/picongpu/plugins/particleCalorimeter/ParticleCalorimeter.kernel
+++ b/include/picongpu/plugins/particleCalorimeter/ParticleCalorimeter.kernel
@@ -1,4 +1,4 @@
-/* Copyright 2016-2020 Heiko Burau
+/* Copyright 2016-2021 Heiko Burau
  *
  * This file is part of PIConGPU.
  *
@@ -24,112 +24,87 @@
 
 namespace picongpu
 {
-using namespace pmacc;
+    using namespace pmacc;
 
-/** This kernel is only called for guard particles.
- *
- * @tparam T_numWorkers number of workers
- */
-template< uint32_t T_numWorkers >
-struct KernelParticleCalorimeter
-{
-    /** call functor calorimeterFunctor for each particle
-     *
-     * @tparam T_ParticlesBox pmacc::ParticlesBox, particle box type
-     * @tparam T_CalorimeterFunctor type of the functor
-     * @tparam T_Mapping supercell mapper functor type
-     * @tparam T_Acc alpaka accelerator type
+    /** This kernel is only called for guard particles.
      *
-     * @param alpaka accelerator
-     * @param particlesBox particle memory
-     * @param mapper functor to map a block to a supercell
+     * @tparam T_numWorkers number of workers
      */
-    template<
-        typename T_ParticlesBox,
-        typename T_CalorimeterFunctor,
-        typename T_Mapper,
-        typename T_Acc,
-        typename T_Filter
-    >
-    DINLINE void operator()(
-        T_Acc const & acc,
-        T_ParticlesBox particlesBox,
-        T_CalorimeterFunctor calorimeterFunctor,
-        T_Mapper mapper,
-        T_Filter filter
-    ) const
+    template<uint32_t T_numWorkers>
+    struct KernelParticleCalorimeter
     {
-        using namespace mappings::threads;
+        /** call functor calorimeterFunctor for each particle
+         *
+         * @tparam T_ParticlesBox pmacc::ParticlesBox, particle box type
+         * @tparam T_CalorimeterFunctor type of the functor
+         * @tparam T_Mapper supercell mapper functor type
+         * @tparam T_Acc alpaka accelerator type
+         *
+         * @param acc alpaka accelerator
+         * @param particlesBox particle memory
+         * @param mapper functor to map a block to a supercell
+         */
+        template<
+            typename T_ParticlesBox,
+            typename T_CalorimeterFunctor,
+            typename T_Mapper,
+            typename T_Acc,
+            typename T_Filter>
+        DINLINE void operator()(
+            T_Acc const& acc,
+            T_ParticlesBox particlesBox,
+            T_CalorimeterFunctor calorimeterFunctor,
+            T_Mapper mapper,
+            T_Filter filter) const
+        {
+            using namespace mappings::threads;
 
-        constexpr uint32_t numWorkers = T_numWorkers;
-        constexpr lcellId_t maxParticlesInFrame = pmacc::math::CT::volume< SuperCellSize >::type::value;
+            constexpr uint32_t numWorkers = T_numWorkers;
+            constexpr lcellId_t maxParticlesInFrame = pmacc::math::CT::volume<SuperCellSize>::type::value;
 
-        uint32_t const workerIdx = threadIdx.x;
+            uint32_t const workerIdx = cupla::threadIdx(acc).x;
 
-        /* multi-dimensional offset vector from local domain origin on GPU in units of super cells */
-        DataSpace< simDim > const block( mapper.getSuperCellIndex( DataSpace< simDim > ( blockIdx ) )) ;
+            /* multi-dimensional offset vector from local domain origin on GPU in units of super cells */
+            DataSpace<simDim> const block(mapper.getSuperCellIndex(DataSpace<simDim>(cupla::blockIdx(acc))));
 
-        using ParticlesFramePtr = typename T_ParticlesBox::FramePtr;
+            using ParticlesFramePtr = typename T_ParticlesBox::FramePtr;
 
-        ParticlesFramePtr particlesFrame;
+            ParticlesFramePtr particlesFrame;
 
-        particlesFrame = particlesBox.getLastFrame( block );
+            particlesFrame = particlesBox.getLastFrame(block);
 
-        // end kernel if we have no frames within the supercell
-        if( !particlesFrame.isValid( ) )
-            return;
+            // end kernel if we have no frames within the supercell
+            if(!particlesFrame.isValid())
+                return;
 
-        auto accFilter = filter(
-            acc,
-            block - mapper.getGuardingSuperCells( ),
-            WorkerCfg< numWorkers >{ workerIdx }
-        );
+            auto accFilter = filter(acc, block - mapper.getGuardingSuperCells(), WorkerCfg<numWorkers>{workerIdx});
 
-        // number of particles in the current frame
-        auto numParticles = particlesBox.getSuperCell( block ).getSizeLastFrame( );
+            // number of particles in the current frame
+            auto numParticles = particlesBox.getSuperCell(block).getSizeLastFrame();
 
-        while( particlesFrame.isValid( ) )
-        {
-            using ParticleDomCfg = IdxConfig<
-                maxParticlesInFrame,
-                numWorkers
-            >;
-
-            // loop over all particles in the frame
-            ForEachIdx< ParticleDomCfg >{ workerIdx }
-            (
-                [&](
-                    uint32_t const linearIdx,
-                    uint32_t const
-                )
-                {
-                    auto particle = particlesFrame[ linearIdx ];
-                    if( linearIdx >= numParticles )
+            while(particlesFrame.isValid())
+            {
+                using ParticleDomCfg = IdxConfig<maxParticlesInFrame, numWorkers>;
+
+                // loop over all particles in the frame
+                ForEachIdx<ParticleDomCfg>{workerIdx}([&](uint32_t const linearIdx, uint32_t const) {
+                    auto particle = particlesFrame[linearIdx];
+                    if(linearIdx >= numParticles)
                     {
-                        particle.setHandleInvalid( );
+                        particle.setHandleInvalid();
                     }
 
-                    if(
-                        accFilter(
-                            acc,
-                            particle
-                        )
-                    )
+                    if(accFilter(acc, particle))
                     {
-                        calorimeterFunctor(
-                            acc,
-                            particlesFrame,
-                            linearIdx
-                        );
+                        calorimeterFunctor(acc, particlesFrame, linearIdx);
                     }
-                }
-            );
+                });
 
-            // independent for each worker
-            particlesFrame = particlesBox.getPreviousFrame( particlesFrame );
-            numParticles = maxParticlesInFrame;
+                // independent for each worker
+                particlesFrame = particlesBox.getPreviousFrame(particlesFrame);
+                numParticles = maxParticlesInFrame;
+            }
         }
-    }
-};
+    };
 
 } // namespace picongpu
diff --git a/include/picongpu/plugins/particleCalorimeter/ParticleCalorimeterFunctors.hpp b/include/picongpu/plugins/particleCalorimeter/ParticleCalorimeterFunctors.hpp
index f69107f0a6..3232c86061 100644
--- a/include/picongpu/plugins/particleCalorimeter/ParticleCalorimeterFunctors.hpp
+++ b/include/picongpu/plugins/particleCalorimeter/ParticleCalorimeterFunctors.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2016-2020 Heiko Burau
+/* Copyright 2016-2021 Heiko Burau
  *
  * This file is part of PIConGPU.
  *
@@ -30,119 +30,128 @@
 
 namespace picongpu
 {
-using namespace pmacc;
+    using namespace pmacc;
 
-template<typename CalorimeterCur>
-struct CalorimeterFunctor
-{
-    CalorimeterCur calorimeterCur;
-
-    const float_X maxYaw;
-    const float_X maxPitch;
-    const uint32_t numBinsYaw;
-    const uint32_t numBinsPitch;
-    const int32_t numBinsEnergy;
-    /* depending on `logScale` the energy range is initialized
-     * with the logarithmic or the linear value. */
-    const float_X minEnergy;
-    const float_X maxEnergy;
-    const bool logScale;
-
-    const float3_X calorimeterFrameVecX;
-    const float3_X calorimeterFrameVecY;
-    const float3_X calorimeterFrameVecZ;
-
-    CalorimeterFunctor(const float_X maxYaw,
-                       const float_X maxPitch,
-                       const uint32_t numBinsYaw,
-                       const uint32_t numBinsPitch,
-                       const uint32_t numBinsEnergy,
-                       const float_X minEnergy,
-                       const float_X maxEnergy,
-                       const bool logScale,
-                       const float3_X calorimeterFrameVecX,
-                       const float3_X calorimeterFrameVecY,
-                       const float3_X calorimeterFrameVecZ) :
-        calorimeterCur(nullptr, pmacc::math::Size_t<DIM2>::create(0)),
-        maxYaw(maxYaw),
-        maxPitch(maxPitch),
-        numBinsYaw(numBinsYaw),
-        numBinsPitch(numBinsPitch),
-        numBinsEnergy(numBinsEnergy),
-        minEnergy(minEnergy),
-        maxEnergy(maxEnergy),
-        logScale(logScale),
-        calorimeterFrameVecX(calorimeterFrameVecX),
-        calorimeterFrameVecY(calorimeterFrameVecY),
-        calorimeterFrameVecZ(calorimeterFrameVecZ)
-    {}
-
-    HINLINE void setCalorimeterCursor(const CalorimeterCur& calorimeterCur)
-    {
-        this->calorimeterCur = calorimeterCur;
-    }
-
-    template<typename ParticlesFrame, typename T_Acc>
-    DINLINE void operator()(const T_Acc& acc, ParticlesFrame& particlesFrame, const uint32_t linearThreadIdx)
+    template<typename CalorimeterCur>
+    struct CalorimeterFunctor
     {
-        const float3_X mom = particlesFrame[linearThreadIdx][momentum_];
-        const float_X mom2 = math::dot(mom, mom);
-        float3_X dirVec = mom * math::rsqrt(mom2);
-
-        /* rotate dirVec into the calorimeter frame. This coordinate transformation
-         * is performed by a matrix vector multiplication. */
-        using namespace pmacc::algorithms::math;
-        dirVec = float3_X(dot(this->calorimeterFrameVecX, dirVec),
-                          dot(this->calorimeterFrameVecY, dirVec),
-                          dot(this->calorimeterFrameVecZ, dirVec));
-
-        /* convert dirVec to yaw and pitch */
-        const float_X yaw = atan2(dirVec.x(), dirVec.y());
-        const float_X pitch = asin(dirVec.z());
-
-        if(abs(yaw) < this->maxYaw && abs(pitch) < this->maxPitch)
+        CalorimeterCur calorimeterCur;
+
+        const float_X maxYaw;
+        const float_X maxPitch;
+        const uint32_t numBinsYaw;
+        const uint32_t numBinsPitch;
+        const int32_t numBinsEnergy;
+        /* depending on `logScale` the energy range is initialized
+         * with the logarithmic or the linear value. */
+        const float_X minEnergy;
+        const float_X maxEnergy;
+        const bool logScale;
+
+        const float3_X calorimeterFrameVecX;
+        const float3_X calorimeterFrameVecY;
+        const float3_X calorimeterFrameVecZ;
+
+        CalorimeterFunctor(
+            const float_X maxYaw,
+            const float_X maxPitch,
+            const uint32_t numBinsYaw,
+            const uint32_t numBinsPitch,
+            const uint32_t numBinsEnergy,
+            const float_X minEnergy,
+            const float_X maxEnergy,
+            const bool logScale,
+            const float3_X calorimeterFrameVecX,
+            const float3_X calorimeterFrameVecY,
+            const float3_X calorimeterFrameVecZ)
+            : calorimeterCur(nullptr, pmacc::math::Size_t<DIM2>::create(0))
+            , maxYaw(maxYaw)
+            , maxPitch(maxPitch)
+            , numBinsYaw(numBinsYaw)
+            , numBinsPitch(numBinsPitch)
+            , numBinsEnergy(numBinsEnergy)
+            , minEnergy(minEnergy)
+            , maxEnergy(maxEnergy)
+            , logScale(logScale)
+            , calorimeterFrameVecX(calorimeterFrameVecX)
+            , calorimeterFrameVecY(calorimeterFrameVecY)
+            , calorimeterFrameVecZ(calorimeterFrameVecZ)
         {
-            const float2_X calorimeterPos = particleCalorimeter::mapYawPitchToNormedRange(
-                yaw, pitch, this->maxYaw, this->maxPitch);
-
-            // yaw
-            int32_t yawBin = calorimeterPos.x() * static_cast<float_X>(numBinsYaw);
-            // catch out-of-range values
-            yawBin = yawBin >= numBinsYaw ? numBinsYaw - 1 : yawBin;
-            yawBin = yawBin < 0 ? 0 : yawBin;
-
-            // pitch
-            int32_t pitchBin = calorimeterPos.y() * static_cast<float_X>(numBinsPitch);
-            // catch out-of-range values
-            pitchBin = pitchBin >= numBinsPitch ? numBinsPitch - 1 : pitchBin;
-            pitchBin = pitchBin < 0 ? 0 : pitchBin;
-
-            // energy
-            const float_X weighting = particlesFrame[linearThreadIdx][weighting_];
-            const float_X normedWeighting = weighting /
-                                            static_cast<float_X>(particles::TYPICAL_NUM_PARTICLES_PER_MACROPARTICLE);
-            const auto particle = particlesFrame[linearThreadIdx];
-            const float_X mass = attribute::getMass(weighting, particle);
-            const float_X energy = KinEnergy<>()(mom, mass) / weighting;
-
-            int32_t energyBin = 0;
-            if(this->numBinsEnergy > 1)
-            {
-                const int32_t numBinsOutOfRange = 2;
-                energyBin = math::float2int_rd(((logScale ? log10(energy) : energy) - minEnergy) /
-                    (maxEnergy - minEnergy) * static_cast<float_X>(this->numBinsEnergy - numBinsOutOfRange)) + 1;
+        }
 
-                // all entries larger than maxEnergy go into last bin
-                energyBin = energyBin < this->numBinsEnergy ? energyBin : this->numBinsEnergy - 1;
+        HINLINE void setCalorimeterCursor(const CalorimeterCur& calorimeterCur)
+        {
+            this->calorimeterCur = calorimeterCur;
+        }
 
-                // all entries smaller than minEnergy go into bin zero
-                energyBin = energyBin > 0 ? energyBin : 0;
+        template<typename ParticlesFrame, typename T_Acc>
+        DINLINE void operator()(const T_Acc& acc, ParticlesFrame& particlesFrame, const uint32_t linearThreadIdx)
+        {
+            const float3_X mom = particlesFrame[linearThreadIdx][momentum_];
+            const float_X mom2 = pmacc::math::dot(mom, mom);
+            float3_X dirVec = mom * math::rsqrt(mom2);
+
+            /* rotate dirVec into the calorimeter frame. This coordinate transformation
+             * is performed by a matrix vector multiplication. */
+            using namespace pmacc::math;
+            dirVec = float3_X(
+                pmacc::math::dot(this->calorimeterFrameVecX, dirVec),
+                pmacc::math::dot(this->calorimeterFrameVecY, dirVec),
+                pmacc::math::dot(this->calorimeterFrameVecZ, dirVec));
+
+            /* convert dirVec to yaw and pitch */
+            const float_X yaw = atan2(dirVec.x(), dirVec.y());
+            const float_X pitch = asin(dirVec.z());
+
+            if(abs(yaw) < this->maxYaw && abs(pitch) < this->maxPitch)
+            {
+                const float2_X calorimeterPos
+                    = particleCalorimeter::mapYawPitchToNormedRange(yaw, pitch, this->maxYaw, this->maxPitch);
+
+                // yaw
+                int32_t yawBin = calorimeterPos.x() * static_cast<float_X>(numBinsYaw);
+                // catch out-of-range values
+                yawBin = yawBin >= numBinsYaw ? numBinsYaw - 1 : yawBin;
+                yawBin = yawBin < 0 ? 0 : yawBin;
+
+                // pitch
+                int32_t pitchBin = calorimeterPos.y() * static_cast<float_X>(numBinsPitch);
+                // catch out-of-range values
+                pitchBin = pitchBin >= numBinsPitch ? numBinsPitch - 1 : pitchBin;
+                pitchBin = pitchBin < 0 ? 0 : pitchBin;
+
+                // energy
+                const float_X weighting = particlesFrame[linearThreadIdx][weighting_];
+                const float_X normedWeighting
+                    = weighting / static_cast<float_X>(particles::TYPICAL_NUM_PARTICLES_PER_MACROPARTICLE);
+                const auto particle = particlesFrame[linearThreadIdx];
+                const float_X mass = attribute::getMass(weighting, particle);
+                const float_X energy = KinEnergy<>()(mom, mass) / weighting;
+
+                int32_t energyBin = 0;
+                if(this->numBinsEnergy > 1)
+                {
+                    const int32_t numBinsOutOfRange = 2;
+                    energyBin
+                        = pmacc::math::float2int_rd(
+                              ((logScale ? pmacc::math::log10(energy) : energy) - minEnergy) / (maxEnergy - minEnergy)
+                              * static_cast<float_X>(this->numBinsEnergy - numBinsOutOfRange))
+                        + 1;
+
+                    // all entries larger than maxEnergy go into last bin
+                    energyBin = energyBin < this->numBinsEnergy ? energyBin : this->numBinsEnergy - 1;
+
+                    // all entries smaller than minEnergy go into bin zero
+                    energyBin = energyBin > 0 ? energyBin : 0;
+                }
+
+                cupla::atomicAdd(
+                    acc,
+                    &(*this->calorimeterCur(yawBin, pitchBin, energyBin)),
+                    energy * normedWeighting,
+                    ::alpaka::hierarchy::Threads{});
             }
-
-            atomicAdd( &(*this->calorimeterCur(yawBin, pitchBin, energyBin)),
-                             energy * normedWeighting, ::alpaka::hierarchy::Threads{});
         }
-    }
-};
+    };
 
 } // namespace picongpu
diff --git a/include/picongpu/plugins/particleMerging/ParticleMerger.hpp b/include/picongpu/plugins/particleMerging/ParticleMerger.hpp
index 9d48fd426a..d0073f5cdf 100644
--- a/include/picongpu/plugins/particleMerging/ParticleMerger.hpp
+++ b/include/picongpu/plugins/particleMerging/ParticleMerger.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2017-2020 Heiko Burau
+/* Copyright 2017-2021 Heiko Burau
  *
  * This file is part of PIConGPU.
  *
@@ -34,290 +34,247 @@
 
 namespace picongpu
 {
-namespace plugins
-{
-namespace particleMerging
-{
-
-    using namespace pmacc;
-    namespace bmpl = boost::mpl;
-
-    /** Implements a particle merging algorithm based on
-    *
-    * Luu, P. T., Tueckmantel, T., & Pukhov, A. (2016).
-    * Voronoi particle merging algorithm for PIC codes.
-    * Computer Physics Communications, 202, 165-174.
-    *
-    * \tparam T_ParticlesType particle species
-    */
-    template<
-        class T_ParticlesType,
-        bool hasVoronoiCellId =
-            pmacc::traits::HasIdentifier<
-                typename T_ParticlesType::FrameType,
-                voronoiCellId
-            >::type::value
-    >
-    struct ParticleMergerWrapped;
-
-
-    template< class T_ParticlesType >
-    struct ParticleMergerWrapped< T_ParticlesType, true > : ISimulationPlugin
-    {
-    private:
-        std::string name;
-        std::string prefix;
-        std::string notifyPeriod;
-        MappingDesc* cellDescription;
-
-        uint32_t minParticlesToMerge;
-        float_X posSpreadThreshold;
-        float_X absMomSpreadThreshold_mc;
-        float_X absMomSpreadThreshold;
-        float_X relMomSpreadThreshold;
-        float_64 minMeanEnergy_keV;
-        float_X minMeanEnergy;
-
-    public:
-        using ParticlesType = T_ParticlesType;
-
-        ParticleMergerWrapped() :
-            name(
-                "ParticleMerger: merges several macroparticles with"
-                " similar position and momentum into a single one"
-            ),
-            prefix( ParticlesType::FrameType::getName() + std::string("_merger") ),
-            cellDescription( nullptr )
-        {
-            Environment<>::get().PluginConnector().registerPlugin( this );
-        }
-
-        void notify(uint32_t currentStep)
-        {
-            using SuperCellSize = MappingDesc::SuperCellSize;
-
-            const pmacc::math::Int<simDim> coreBorderGuardSuperCells =
-                this->cellDescription->getGridSuperCells();
-            const pmacc::math::Int<simDim> guardSuperCells =
-                this->cellDescription->getGuardingSuperCells();
-            const pmacc::math::Int<simDim> coreBorderSuperCells =
-                coreBorderGuardSuperCells - 2 * guardSuperCells;
-
-            /* this zone represents the core+border area with guard offset in unit of cells */
-            const zone::SphericZone< simDim > zone(
-                static_cast< pmacc::math::Size_t< simDim > >(
-                    coreBorderSuperCells * SuperCellSize::toRT()
-                ),
-                guardSuperCells * SuperCellSize::toRT()
-            );
-
-            /* get particles instance */
-            DataConnector &dc = Environment<>::get().DataConnector();
-            auto particles = dc.get< ParticlesType >(
-                ParticlesType::FrameType::getName(),
-                true
-            );
-
-            /* create `ParticleMergerKernel` instance */
-            ParticleMergerKernel< typename ParticlesType::ParticlesBoxType >
-            particleMergerKernel(
-                particles->getDeviceParticlesBox(),
-                this->minParticlesToMerge,
-                this->posSpreadThreshold,
-                this->absMomSpreadThreshold,
-                this->relMomSpreadThreshold,
-                this->minMeanEnergy
-            );
-
-            /* execute particle merging alorithm */
-            algorithm::kernel::Foreach< SuperCellSize > foreach;
-            foreach(
-                zone,
-                cursor::make_MultiIndexCursor< simDim >(),
-                particleMergerKernel
-            );
-
-            /* close all gaps caused by removal of particles */
-            particles->fillAllGaps();
-        }
-
-
-        void setMappingDescription(MappingDesc* cellDescription)
-        {
-            this->cellDescription = cellDescription;
-        }
-
-
-        void pluginRegisterHelp(po::options_description& desc)
-        {
-            desc.add_options()
-            (
-                ( this->prefix + ".period" ).c_str(),
-                po::value< std::string > (
-                    &this->notifyPeriod
-                ),
-                "enable plugin [for each n-th step]"
-            )
-            (
-                ( this->prefix + ".minParticlesToMerge" ).c_str(),
-                po::value< uint32_t > (
-                    &this->minParticlesToMerge
-                )->default_value( 8 ),
-                "minimal number of macroparticles needed to merge"
-                " the macroparticle collection into a single macroparticle."
-            )
-            (
-                ( this->prefix + ".posSpreadThreshold" ).c_str(),
-                po::value< float_X > (
-                    &this->posSpreadThreshold
-                )->default_value( 0.5 ),
-                "Below this threshold of spread in position macroparticles"
-                " can be merged [unit: cell edge length]."
-            )
-            (
-                ( this->prefix + ".absMomSpreadThreshold" ).c_str(),
-                po::value< float_X > (
-                    &this->absMomSpreadThreshold_mc
-                )->default_value( -1.0 ),
-                "Below this absolute threshold of spread in momentum"
-                " macroparticles can be merged [unit: m_el * c]."
-                " Disabled for -1 (default)."
-            )
-            (
-                ( this->prefix + ".relMomSpreadThreshold" ).c_str(),
-                po::value< float_X > (
-                    &this->relMomSpreadThreshold
-                )->default_value( -1.0 ),
-                "Below this relative (to mean momentum) threshold of spread in"
-                " momentum macroparticles can be merged [unit: none]."
-                " Disabled for -1 (default)."
-            )
-            (
-                ( this->prefix + ".minMeanEnergy" ).c_str(),
-                po::value< float_64 > (
-                    &this->minMeanEnergy_keV
-                )->default_value( 511.0 ),
-                "minimal mean kinetic energy needed to merge the macroparticle"
-                " collection into a single macroparticle [unit: keV]."
-            );
-        }
-
-        std::string pluginGetName() const
-        {
-            return this->name;
-        }
-
-    protected:
-
-        void pluginLoad()
-        {
-            if( notifyPeriod.empty() )
-                return;
-
-            Environment<>::get().PluginConnector().setNotificationPeriod(
-                this,
-                notifyPeriod
-            );
-
-            // clean user parameters
-            PMACC_VERIFY_MSG(
-                this->minParticlesToMerge > 1,
-                std::string("[Plugin: ") + this->prefix + "] minParticlesToMerge"
-                " has to be greater than one."
-            );
-            PMACC_VERIFY_MSG(
-                this->posSpreadThreshold >= float_X(0.0),
-                std::string("[Plugin: ") + this->prefix + "] posSpreadThreshold"
-                " has to be non-negative."
-            );
-            PMACC_VERIFY_MSG(
-                this->absMomSpreadThreshold_mc * this->relMomSpreadThreshold < float( 0.0 ),
-                std::string("[Plugin: ") + this->prefix + "] either"
-                " absMomSpreadThreshold or relMomSpreadThreshold has to be given"
-            );
-            PMACC_VERIFY_MSG(
-                this->minMeanEnergy >= float_X(0.0),
-                std::string("[Plugin: ") + this->prefix + "] minMeanEnergy"
-                " has to be non-negative."
-            );
-
-            // convert units of user parameters
-            this->absMomSpreadThreshold = this->absMomSpreadThreshold_mc *
-                ELECTRON_MASS * SPEED_OF_LIGHT;
-
-            const float_64 minMeanEnergy_SI = this->minMeanEnergy_keV *
-                UNITCONV_keV_to_Joule;
-            this->minMeanEnergy = static_cast< float_X >(
-                minMeanEnergy_SI / UNIT_ENERGY
-            );
-        }
-
-        void pluginUnload()
-        {}
-
-        void restart( uint32_t, const std::string )
-        {}
-
-        void checkpoint( uint32_t, const std::string )
-        {}
-    };
-
-
-    template< class T_ParticlesType >
-    struct ParticleMergerWrapped< T_ParticlesType, false > : ISimulationPlugin
+    namespace plugins
     {
-    private:
-        std::string name;
-        std::string prefix;
-        std::string notifyPeriod;
-        MappingDesc* cellDescription;
-
-    public:
-        using ParticlesType = T_ParticlesType;
-
-        ParticleMergerWrapped() :
-            name(
-                "ParticleMerger: merges several macroparticles with"
-                " similar position and momentum into a single one.\n"
-                "plugin disabled. Enable plugin by adding the `voronoiCellId`"
-                " attribute to the particle attribute list."
-            ),
-            prefix( ParticlesType::FrameType::getName() + std::string("_merger") ),
-            cellDescription( nullptr )
-        {
-            Environment<>::get().PluginConnector().registerPlugin( this );
-        }
-
-        std::string pluginGetName() const
+        namespace particleMerging
         {
-            return this->name;
-        }
-
-    protected:
-        void setMappingDescription( MappingDesc* )
-        {}
-
-        void pluginRegisterHelp( po::options_description& )
-        {}
-
-        void pluginUnload()
-        {}
-
-        void restart( uint32_t, const std::string )
-        {}
-
-        void checkpoint( uint32_t, const std::string )
-        {}
-
-        void notify( uint32_t )
-        {}
-    };
-
-
-    template< typename T_ParticlesType >
-    struct ParticleMerger : ParticleMergerWrapped< T_ParticlesType >
-    {};
-
-} // namespace particleMerging
-} // namespace plugins
+            using namespace pmacc;
+            namespace bmpl = boost::mpl;
+
+            /** Implements a particle merging algorithm based on
+             * Luu, P. T., Tueckmantel, T., & Pukhov, A. (2016).
+             * Voronoi particle merging algorithm for PIC codes.
+             * Computer Physics Communications, 202, 165-174.
+             *
+             * \tparam T_ParticlesType particle species
+             * \tparam hasVoronoiCellId defaults to whether the species' FrameType has the `voronoiCellId` identifier
+             */
+            template<
+                class T_ParticlesType,
+                bool hasVoronoiCellId
+                = pmacc::traits::HasIdentifier<typename T_ParticlesType::FrameType, voronoiCellId>::type::value>
+            struct ParticleMergerWrapped;
+
+
+            template<class T_ParticlesType>
+            struct ParticleMergerWrapped<T_ParticlesType, true> : ISimulationPlugin
+            {
+            private:
+                std::string name;
+                std::string prefix;
+                std::string notifyPeriod;
+                MappingDesc* cellDescription;
+
+                uint32_t minParticlesToMerge;
+                float_X posSpreadThreshold;
+                float_X absMomSpreadThreshold_mc;
+                float_X absMomSpreadThreshold;
+                float_X relMomSpreadThreshold;
+                float_64 minMeanEnergy_keV;
+                float_X minMeanEnergy;
+
+            public:
+                using ParticlesType = T_ParticlesType;
+
+                ParticleMergerWrapped()
+                    : name("ParticleMerger: merges several macroparticles with"
+                           " similar position and momentum into a single one")
+                    , prefix(ParticlesType::FrameType::getName() + std::string("_merger"))
+                    , cellDescription(nullptr)
+                {
+                    Environment<>::get().PluginConnector().registerPlugin(this);
+                }
+
+                void notify(uint32_t currentStep)
+                {
+                    using SuperCellSize = MappingDesc::SuperCellSize;
+
+                    const pmacc::math::Int<simDim> coreBorderGuardSuperCells
+                        = this->cellDescription->getGridSuperCells();
+                    const pmacc::math::Int<simDim> guardSuperCells = this->cellDescription->getGuardingSuperCells();
+                    const pmacc::math::Int<simDim> coreBorderSuperCells
+                        = coreBorderGuardSuperCells - 2 * guardSuperCells;
+
+                    /* this zone represents the core+border area with guard offset in unit of cells */
+                    const zone::SphericZone<simDim> zone(
+                        static_cast<pmacc::math::Size_t<simDim>>(coreBorderSuperCells * SuperCellSize::toRT()),
+                        guardSuperCells * SuperCellSize::toRT());
+
+                    /* get particles instance */
+                    DataConnector& dc = Environment<>::get().DataConnector();
+                    auto particles = dc.get<ParticlesType>(ParticlesType::FrameType::getName(), true);
+
+                    /* create `ParticleMergerKernel` instance */
+                    ParticleMergerKernel<typename ParticlesType::ParticlesBoxType> particleMergerKernel(
+                        particles->getDeviceParticlesBox(),
+                        this->minParticlesToMerge,
+                        this->posSpreadThreshold,
+                        this->absMomSpreadThreshold,
+                        this->relMomSpreadThreshold,
+                        this->minMeanEnergy);
+
+                    /* execute particle merging algorithm */
+                    algorithm::kernel::Foreach<SuperCellSize> foreach;
+                    foreach(zone, cursor::make_MultiIndexCursor<simDim>(), particleMergerKernel)
+                        ;
+
+                    /* close all gaps caused by removal of particles */
+                    particles->fillAllGaps();
+                }
+
+
+                void setMappingDescription(MappingDesc* cellDescription)
+                {
+                    this->cellDescription = cellDescription;
+                }
+
+
+                void pluginRegisterHelp(po::options_description& desc) // adds all "<prefix>.*" options to desc
+                {
+                    desc.add_options()(
+                        (this->prefix + ".period").c_str(),
+                        po::value<std::string>(&this->notifyPeriod), // no default value is set for the period
+                        "enable plugin [for each n-th step]")(
+                        (this->prefix + ".minParticlesToMerge").c_str(),
+                        po::value<uint32_t>(&this->minParticlesToMerge)->default_value(8),
+                        "minimal number of macroparticles needed to merge"
+                        " the macroparticle collection into a single macroparticle.")(
+                        (this->prefix + ".posSpreadThreshold").c_str(),
+                        po::value<float_X>(&this->posSpreadThreshold)->default_value(0.5),
+                        "Below this threshold of spread in position macroparticles"
+                        " can be merged [unit: cell edge length].")(
+                        (this->prefix + ".absMomSpreadThreshold").c_str(), // kept as *_mc until pluginLoad()
+                        po::value<float_X>(&this->absMomSpreadThreshold_mc)->default_value(-1.0),
+                        "Below this absolute threshold of spread in momentum"
+                        " macroparticles can be merged [unit: m_el * c]."
+                        " Disabled for -1 (default).")(
+                        (this->prefix + ".relMomSpreadThreshold").c_str(),
+                        po::value<float_X>(&this->relMomSpreadThreshold)->default_value(-1.0),
+                        "Below this relative (to mean momentum) threshold of spread in"
+                        " momentum macroparticles can be merged [unit: none]."
+                        " Disabled for -1 (default).")(
+                        (this->prefix + ".minMeanEnergy").c_str(), // kept as *_keV until pluginLoad()
+                        po::value<float_64>(&this->minMeanEnergy_keV)->default_value(511.0),
+                        "minimal mean kinetic energy needed to merge the macroparticle"
+                        " collection into a single macroparticle [unit: keV].");
+                }
+
+                std::string pluginGetName() const // returns a copy of the `name` member
+                {
+                    return this->name;
+                }
+
+            protected:
+                /** Validate the user options and convert them to the units used internally.
+                 *
+                 * Returns right after the emptiness check when no notification period was given.
+                 */
+                void pluginLoad()
+                {
+                    if(notifyPeriod.empty())
+                        return;
+
+                    Environment<>::get().PluginConnector().setNotificationPeriod(this, notifyPeriod);
+
+                    // validate user parameters
+                    PMACC_VERIFY_MSG(
+                        this->minParticlesToMerge > 1,
+                        std::string("[Plugin: ") + this->prefix + "] minParticlesToMerge has to be greater than one.");
+                    PMACC_VERIFY_MSG(
+                        this->posSpreadThreshold >= float_X(0.0),
+                        std::string("[Plugin: ") + this->prefix + "] posSpreadThreshold has to be non-negative.");
+                    // passes only if exactly one threshold is negative (the help text documents -1 as disabled)
+                    PMACC_VERIFY_MSG(
+                        this->absMomSpreadThreshold_mc * this->relMomSpreadThreshold < float_X(0.0),
+                        std::string("[Plugin: ") + this->prefix
+                            + "] either absMomSpreadThreshold or relMomSpreadThreshold has to be given");
+                    // validate the raw user input: `minMeanEnergy` is only assigned by the conversion below
+                    PMACC_VERIFY_MSG(
+                        this->minMeanEnergy_keV >= float_64(0.0),
+                        std::string("[Plugin: ") + this->prefix + "] minMeanEnergy has to be non-negative.");
+
+                    // convert units of user parameters
+                    this->absMomSpreadThreshold = this->absMomSpreadThreshold_mc * ELECTRON_MASS * SPEED_OF_LIGHT;
+
+                    // keV -> Joule -> multiples of UNIT_ENERGY
+                    const float_64 minMeanEnergy_SI = this->minMeanEnergy_keV * UNITCONV_keV_to_Joule;
+                    this->minMeanEnergy = static_cast<float_X>(minMeanEnergy_SI / UNIT_ENERGY);
+                }
+
+                void pluginUnload() // no-op
+                {
+                }
+
+                void restart(uint32_t, const std::string) // no-op: both arguments are ignored
+                {
+                }
+
+                void checkpoint(uint32_t, const std::string) // no-op: both arguments are ignored
+                {
+                }
+            };
+
+
+            template<class T_ParticlesType> // specialization for <..., false>: registers itself, hooks are empty
+            struct ParticleMergerWrapped<T_ParticlesType, false> : ISimulationPlugin
+            { // NOTE(review): presumably chosen when the species lacks `voronoiCellId` (see `name`) - confirm
+            private:
+                std::string name; // description text returned by pluginGetName()
+                std::string prefix; // species frame name + "_merger"
+                std::string notifyPeriod; // not accessed anywhere in this specialization
+                MappingDesc* cellDescription; // set to nullptr by the constructor, not accessed otherwise
+
+            public:
+                using ParticlesType = T_ParticlesType;
+
+                ParticleMergerWrapped() // fills name/prefix and registers the plugin at the PluginConnector
+                    : name("ParticleMerger: merges several macroparticles with"
+                           " similar position and momentum into a single one.\n"
+                           "plugin disabled. Enable plugin by adding the `voronoiCellId`"
+                           " attribute to the particle attribute list.")
+                    , prefix(ParticlesType::FrameType::getName() + std::string("_merger"))
+                    , cellDescription(nullptr)
+                {
+                    Environment<>::get().PluginConnector().registerPlugin(this);
+                }
+
+                std::string pluginGetName() const // returns a copy of the description text above
+                {
+                    return this->name;
+                }
+
+            protected:
+                void setMappingDescription(MappingDesc*) // no-op: the argument is ignored
+                {
+                }
+
+                void pluginRegisterHelp(po::options_description&) // no-op: registers no options
+                {
+                }
+
+                void pluginUnload() // no-op
+                {
+                }
+
+                void restart(uint32_t, const std::string) // no-op: both arguments are ignored
+                {
+                }
+
+                void checkpoint(uint32_t, const std::string) // no-op: both arguments are ignored
+                {
+                }
+
+                void notify(uint32_t) // no-op: the step argument is ignored, nothing is merged
+                {
+                }
+            };
+
+
+            template<typename T_ParticlesType> // thin alias-like wrapper, adds no members of its own
+            struct ParticleMerger : ParticleMergerWrapped<T_ParticlesType> // 2nd template argument is defaulted
+            {
+            };
+
+        } // namespace particleMerging
+    } // namespace plugins
 } // namespace picongpu
diff --git a/include/picongpu/plugins/particleMerging/ParticleMerger.kernel b/include/picongpu/plugins/particleMerging/ParticleMerger.kernel
index 83fc430b10..25b082d143 100644
--- a/include/picongpu/plugins/particleMerging/ParticleMerger.kernel
+++ b/include/picongpu/plugins/particleMerging/ParticleMerger.kernel
@@ -1,4 +1,4 @@
-/* Copyright 2017-2020 Heiko Burau
+/* Copyright 2017-2021 Heiko Burau
  *
  * This file is part of PIConGPU.
  *
@@ -28,444 +28,387 @@
 
 namespace picongpu
 {
-namespace plugins
-{
-namespace particleMerging
-{
-
-    /** Implements a particle merging algorithm based on
-     *
-     * Luu, P. T., Tueckmantel, T., & Pukhov, A. (2016).
-     * Voronoi particle merging algorithm for PIC codes.
-     * Computer Physics Communications, 202, 165-174.
-     *
-     * \tparam T_ParticlesBox container of the particle species
-     */
-    template< class T_ParticlesBox >
-    struct ParticleMergerKernel
+    namespace plugins
     {
-        using ParticlesBox = T_ParticlesBox;
-        using FramePtr = typename ParticlesBox::FramePtr;
-        using FrameType = typename ParticlesBox::FrameType;
-        using ArrayVoronoiCells = memory::Array<
-            VoronoiCell,
-            MAX_VORONOI_CELLS
-        >;
-        using VoronoiIndexPool = memory::IndexPool<
-            voronoiCellId::type,
-            MAX_VORONOI_CELLS
-        >;
-
-
-        ParticlesBox particlesBox;
-        /** minimal number of macroparticles needed to merge
-            the macroparticle collection into a single macroparticle */
-        uint32_t minParticlesToMerge;
-        /** Below this threshold of spread in position (squared) macroparticles
-            can be merged [unit: cell edge length] */
-        float_X posSpreadThreshold2;
-        /** Below this absolute threshold of spread in momentum
-            macroparticles can be merged [unit: m_el * c]. */
-        float_X absMomSpreadThreshold;
-        /** Below this relative (to mean momentum) threshold of spread in
-            momentum macroparticles can be merged [unit: none]. */
-        float_X relMomSpreadThreshold;
-        /** minimal mean kinetic energy needed to merge the macroparticle
-            collection into a single macroparticle [unit: keV] */
-        float_X minMeanEnergy;
-
-        ParticleMergerKernel(
-            ParticlesBox particlesBox,
-            uint32_t minParticlesToMerge,
-            float_X posSpreadThreshold,
-            float_X absMomSpreadThreshold,
-            float_X relMomSpreadThreshold,
-            float_X minMeanEnergy
-        ) :
-            particlesBox( particlesBox ),
-            minParticlesToMerge( minParticlesToMerge ),
-            posSpreadThreshold2( posSpreadThreshold * posSpreadThreshold ),
-            absMomSpreadThreshold( absMomSpreadThreshold ),
-            relMomSpreadThreshold( relMomSpreadThreshold ),
-            minMeanEnergy ( minMeanEnergy )
-        {}
-
-        /** map cell index to the initial Voronoi cell by aggregating N^simDim 'normal'
-         * cells to a single Voronoi cell.
-         *
-         * @param cellIdx cell index
-         */
-        DINLINE voronoiCellId::type mapCellIdxToInitialVoronoiCell( const uint32_t cellIdx ) const
+        namespace particleMerging
         {
-            const DataSpace< simDim > cellIdxDim = DataSpaceOperations< simDim >::template map<
-                SuperCellSize
-            >( cellIdx );
-
-            const DataSpace< simDim > voronoiCellDim = cellIdxDim / 2;
-
-            return static_cast< voronoiCellId::type >(
-                pmacc::math::linearize(
-                    pmacc::math::CT::shrinkTo< SuperCellSize, simDim - 1 >::type::toRT() / 2,
-                    voronoiCellDim
-                )
-            );
-        }
-
-
-        /** init the Voronoi cell id attribute for each particle in the super cell.
-         *
-         * The initial Voronoi cell is chosen by aggregating N^simDim 'normal' cells
-         * to a single Voronoi cell.
-         *
-         * @param cellIdx cell index
-         */
-        template< typename T_Acc >
-        DINLINE void initVoronoiCellIdAttribute(
-            T_Acc const & acc,
-            const pmacc::math::Int<simDim>& cellIdx
-        )
-        {
-            //! \todo change this as soon as the kernel support lock step programming
-            constexpr uint32_t numWorkers = pmacc::math::CT::volume< SuperCellSize >::type::value;
-            const uint32_t workerIdx = DataSpaceOperations< simDim >::template map<
-                SuperCellSize
-            >( cellIdx % SuperCellSize::toRT() );
-            particleAccess::Cell2Particle< SuperCellSize, numWorkers > forEachFrame;
-            forEachFrame(
-                acc,
-                this->particlesBox,
-                workerIdx,
-                cellIdx,
-                [this]( const T_Acc & acc, FramePtr frame, const int linearThreadIdx )
+            /** Implements a particle merging algorithm based on
+             *
+             * Luu, P. T., Tueckmantel, T., & Pukhov, A. (2016).
+             * Voronoi particle merging algorithm for PIC codes.
+             * Computer Physics Communications, 202, 165-174.
+             *
+             * \tparam T_ParticlesBox container of the particle species
+             */
+            template<class T_ParticlesBox>
+            struct ParticleMergerKernel
+            {
+                using ParticlesBox = T_ParticlesBox;
+                using FramePtr = typename ParticlesBox::FramePtr;
+                using FrameType = typename ParticlesBox::FrameType;
+                using ArrayVoronoiCells = memory::Array<VoronoiCell, MAX_VORONOI_CELLS>;
+                using VoronoiIndexPool = memory::IndexPool<voronoiCellId::type, MAX_VORONOI_CELLS>;
+
+
+                ParticlesBox particlesBox;
+                /** minimal number of macroparticles needed to merge
+                    the macroparticle collection into a single macroparticle */
+                uint32_t minParticlesToMerge;
+                /** Squared threshold of spread in position below which macroparticles
+                    can be merged [unit: (cell edge length)^2] */
+                float_X posSpreadThreshold2;
+                /** Below this absolute threshold of spread in momentum macroparticles can be
+                    merged [unit: internal momentum; the plugin passes m_el * c already converted]. */
+                float_X absMomSpreadThreshold;
+                /** Below this relative (to mean momentum) threshold of spread in
+                    momentum macroparticles can be merged [unit: none]. */
+                float_X relMomSpreadThreshold;
+                /** minimal mean kinetic energy needed to merge the macroparticle collection
+                    into a single macroparticle [unit: UNIT_ENERGY; the plugin converts from keV] */
+                float_X minMeanEnergy;
+
+                ParticleMergerKernel(
+                    ParticlesBox particlesBox,
+                    uint32_t minParticlesToMerge,
+                    float_X posSpreadThreshold,
+                    float_X absMomSpreadThreshold,
+                    float_X relMomSpreadThreshold,
+                    float_X minMeanEnergy)
+                    : particlesBox(particlesBox)
+                    , minParticlesToMerge(minParticlesToMerge)
+                    , posSpreadThreshold2(posSpreadThreshold * posSpreadThreshold)
+                    , absMomSpreadThreshold(absMomSpreadThreshold)
+                    , relMomSpreadThreshold(relMomSpreadThreshold)
+                    , minMeanEnergy(minMeanEnergy)
                 {
-                    auto particle = frame[linearThreadIdx];
-
-                    const lcellId_t particleCellIdx = particle[localCellIdx_];
-
-                    particle[voronoiCellId_] = this->mapCellIdxToInitialVoronoiCell( particleCellIdx );
-                },
-                particles::filter::All{}
-            );
-        }
-
-        /** calculate position of particle within a super cell.
-         *
-         * @param particleCellIdx local particle cell index
-         * @param positionWithinCell position within cell
-         * @return position of particle with respect to its super cell's origin
-         */
-        DINLINE floatD_X getParticlePosWithinSuperCell(
-            const lcellId_t particleCellIdx,
-            const floatD_X positionWithinCell
-        ) const
-        {
-            const DataSpace< simDim > particleCellIdxDim = DataSpaceOperations< simDim >::template map<
-                SuperCellSize
-            >( particleCellIdx );
+                }
 
-            floatD_X result;
-            for( int i = 0; i < simDim; i++ )
-            {
-                result[i] = static_cast< float_X >( particleCellIdxDim[i] ) + positionWithinCell[i];
-            }
-
-            return result;
-        }
-
-        /** This method handles the merging process on the single-particle level.
-         *
-         * It is called in the main loop of the merging algorithm.
-         * Depending on the state of the Voronoi cell where the particle belongs
-         * to the execution is forked into distinct sub-processes.
-         *
-         * @param cellIdx n-dim. cell index from the origin of the local domain
-         * @param listVoronoiCells fixed-sized array of Voronoi cells
-         */
-        template< typename T_Acc >
-        DINLINE void processParticles(
-            T_Acc const & acc,
-            const pmacc::math::Int<simDim>& cellIdx,
-            ArrayVoronoiCells& listVoronoiCells
-        )
-        {
-            //! \todo change this as soon as the kernel support lock step programming
-            constexpr uint32_t numWorkers = pmacc::math::CT::volume< SuperCellSize >::type::value;
-            const uint32_t workerIdx = DataSpaceOperations< simDim >::template map<
-                SuperCellSize
-            >( cellIdx % SuperCellSize::toRT() );
-            particleAccess::Cell2Particle< SuperCellSize, numWorkers > forEachFrame;
-            forEachFrame(
-                acc,
-                this->particlesBox,
-                workerIdx,
-                cellIdx,
-                [&]( const T_Acc & acc, FramePtr frame, const int linearThreadIdx )
+                /** map cell index to the initial Voronoi cell by aggregating N^simDim 'normal'
+                 * cells to a single Voronoi cell.
+                 *
+                 * @param cellIdx cell index
+                 */
+                DINLINE voronoiCellId::type mapCellIdxToInitialVoronoiCell(const uint32_t cellIdx) const
                 {
-                    auto particle = frame[linearThreadIdx];
+                    const DataSpace<simDim> cellIdxDim
+                        = DataSpaceOperations<simDim>::template map<SuperCellSize>(cellIdx);
+
+                    const DataSpace<simDim> voronoiCellDim = cellIdxDim / 2;
 
-                    const voronoiCellId::type voronoiCellId = particle[voronoiCellId_];
-                    if( voronoiCellId == -1 )
-                        return;
+                    return static_cast<voronoiCellId::type>(pmacc::math::linearize(
+                        pmacc::math::CT::shrinkTo<SuperCellSize, simDim - 1>::type::toRT() / 2,
+                        voronoiCellDim));
+                }
 
-                    VoronoiCell& voronoiCell = listVoronoiCells[voronoiCellId];
 
-                    const floatD_X position = this->getParticlePosWithinSuperCell(
-                        particle[localCellIdx_],
-                        particle[position_]
-                    );
-                    const float_X weighting = particle[weighting_];
-                    const float3_X momentum = particle[momentum_] / weighting;
+                /** Initialize the Voronoi cell id attribute of each particle in the super cell.
+                 *
+                 * The initial Voronoi cell is chosen by aggregating N^simDim 'normal' cells to a single
+                 * Voronoi cell; the id itself is computed by `mapCellIdxToInitialVoronoiCell()`.
+                 * @param acc passed through unchanged to the frame iteration (`forEachFrame`)
+                 * @param cellIdx n-dim. cell index; taken modulo the super cell size to get the worker index
+                 */
+                template<typename T_Acc>
+                DINLINE void initVoronoiCellIdAttribute(T_Acc const& acc, const pmacc::math::Int<simDim>& cellIdx)
+                {
+                    //! \todo change this as soon as the kernel support lock step programming
+                    constexpr uint32_t numWorkers = pmacc::math::CT::volume<SuperCellSize>::type::value;
+                    const uint32_t workerIdx
+                        = DataSpaceOperations<simDim>::template map<SuperCellSize>(cellIdx % SuperCellSize::toRT());
+                    particleAccess::Cell2Particle<SuperCellSize, numWorkers> forEachFrame;
+                    forEachFrame(
+                        acc,
+                        this->particlesBox,
+                        workerIdx,
+                        cellIdx,
+                        [this](const T_Acc& acc, FramePtr frame, const int linearThreadIdx) {
+                            auto particle = frame[linearThreadIdx];
+                            // linear cell index stored on the particle (`localCellIdx` attribute)
+                            const lcellId_t particleCellIdx = particle[localCellIdx_];
+                            // overwrite the particle's Voronoi cell id with the initial cell for that index
+                            particle[voronoiCellId_] = this->mapCellIdxToInitialVoronoiCell(particleCellIdx);
+                        },
+                        particles::filter::All{});
+                }
 
-                    switch( voronoiCell.status )
+                /** calculate position of particle within a super cell.
+                 *
+                 * @param particleCellIdx local particle cell index
+                 * @param positionWithinCell position within cell
+                 * @return position of particle with respect to its super cell's origin
+                 */
+                DINLINE floatD_X
+                getParticlePosWithinSuperCell(const lcellId_t particleCellIdx, const floatD_X positionWithinCell) const
+                {
+                    const DataSpace<simDim> particleCellIdxDim
+                        = DataSpaceOperations<simDim>::template map<SuperCellSize>(particleCellIdx);
+
+                    floatD_X result;
+                    for(int i = 0; i < simDim; i++)
                     {
-                    case VoronoiStatus::collecting:
-                        voronoiCell.addParticle(
-                            acc,
-                            position,
-                            momentum,
-                            weighting
-                        );
+                        result[i] = static_cast<float_X>(particleCellIdxDim[i]) + positionWithinCell[i];
+                    }
 
-                        break;
+                    return result;
+                }
 
-                    case VoronoiStatus::splitting:
-                        {
-                            /* determine in what sub-Voronoi cell the particle falls */
-                            const voronoiCellId::type subVoronoiCellId = voronoiCell.getSubVoronoiCell(
-                                position,
-                                momentum
-                            );
-
-                            particle[voronoiCellId_] = subVoronoiCellId;
-
-                            /* place particle into one of the two sub-Voronoi cells */
-                            listVoronoiCells[subVoronoiCellId].addParticle(
-                                acc,
-                                position,
-                                momentum,
-                                weighting
-                            );
-                        }
+                /** This method handles the merging process on the single-particle level.
+                 *
+                 * It is called in the main loop of the merging algorithm.
+                 * Depending on the state of the Voronoi cell where the particle belongs
+                 * to the execution is forked into distinct sub-processes.
+                 *
+                 * @param cellIdx n-dim. cell index from the origin of the local domain
+                 * @param listVoronoiCells fixed-sized array of Voronoi cells
+                 */
+                template<typename T_Acc>
+                DINLINE void processParticles(
+                    T_Acc const& acc,
+                    const pmacc::math::Int<simDim>& cellIdx,
+                    ArrayVoronoiCells& listVoronoiCells)
+                {
+                    //! \todo change this as soon as the kernel support lock step programming
+                    constexpr uint32_t numWorkers = pmacc::math::CT::volume<SuperCellSize>::type::value;
+                    const uint32_t workerIdx
+                        = DataSpaceOperations<simDim>::template map<SuperCellSize>(cellIdx % SuperCellSize::toRT());
+                    particleAccess::Cell2Particle<SuperCellSize, numWorkers> forEachFrame;
+                    forEachFrame(
+                        acc,
+                        this->particlesBox,
+                        workerIdx,
+                        cellIdx,
+                        [&](const T_Acc& acc, FramePtr frame, const int linearThreadIdx) {
+                            auto particle = frame[linearThreadIdx];
+
+                            const voronoiCellId::type voronoiCellId = particle[voronoiCellId_];
+                            if(voronoiCellId == -1)
+                                return;
+
+                            VoronoiCell& voronoiCell = listVoronoiCells[voronoiCellId];
+
+                            const floatD_X position
+                                = this->getParticlePosWithinSuperCell(particle[localCellIdx_], particle[position_]);
+                            const float_X weighting = particle[weighting_];
+                            const float3_X momentum = particle[momentum_] / weighting;
+
+                            switch(voronoiCell.status)
+                            {
+                            case VoronoiStatus::collecting:
+                                voronoiCell.addParticle(acc, position, momentum, weighting);
 
-                        break;
+                                break;
 
-                    case VoronoiStatus::abort:
-                        /* check out of the Voronoi cell */
-                        particle[voronoiCellId_] = -1;
+                            case VoronoiStatus::splitting:
+                            {
+                                /* determine in what sub-Voronoi cell the particle falls */
+                                const voronoiCellId::type subVoronoiCellId
+                                    = voronoiCell.getSubVoronoiCell(position, momentum);
 
-                        break;
+                                particle[voronoiCellId_] = subVoronoiCellId;
 
-                    case VoronoiStatus::readyForMerging:
-                        /* merge all particles of this Voronoi cell */
-                        if( voronoiCell.isFirstParticle( acc ) )
-                        {
-                            /* I am the first particle in the Voronoi cell
-                             * => get dressed with Voronoi cell's attributes */
-                            particle[momentum_] = voronoiCell.meanValue * voronoiCell.numRealParticles;
-                            particle[weighting_] = voronoiCell.numRealParticles;
-                        }
-                        else
-                        {
-                            /* I am not the first particle in the Voronoi cell
-                             * => remove me */
-                            particle[multiMask_] = 0;
-                        }
+                                /* place particle into one of the two sub-Voronoi cells */
+                                listVoronoiCells[subVoronoiCellId].addParticle(acc, position, momentum, weighting);
+                            }
 
-                        /* check out of the Voronoi cell */
-                        particle[voronoiCellId_] = -1;
-                    }
-                },
-                particles::filter::All{}
-            );
-        }
-
-        /** This method handles the merging process on the Voronoi cell level.
-         *
-         * It is called in the main loop of the merging algorithm.
-         * It does the transition of the distinct states of each Voronoi cell.
-         *
-         * @param listVoronoiCells fixed-sized array of Voronoi cells
-         * @param voronoiIndexPool holds indices of active Voronoi cells within `listVoronoiCells`
-         */
-        DINLINE void processVoronoiCells(
-            ArrayVoronoiCells& listVoronoiCells,
-            VoronoiIndexPool& voronoiIndexPool
-        ) const
-        {
-            for( voronoiCellId::type voronoiCellId : voronoiIndexPool )
-            {
-                VoronoiCell& voronoiCell = listVoronoiCells[voronoiCellId];
+                            break;
+
+                            case VoronoiStatus::abort:
+                                /* check out of the Voronoi cell */
+                                particle[voronoiCellId_] = -1;
+
+                                break;
 
-                switch( voronoiCell.status )
+                            case VoronoiStatus::readyForMerging:
+                                /* merge all particles of this Voronoi cell */
+                                if(voronoiCell.isFirstParticle(acc))
+                                {
+                                    /* I am the first particle in the Voronoi cell
+                                     * => get dressed with Voronoi cell's attributes */
+                                    particle[momentum_] = voronoiCell.meanValue * voronoiCell.numRealParticles;
+                                    particle[weighting_] = voronoiCell.numRealParticles;
+                                }
+                                else
+                                {
+                                    /* I am not the first particle in the Voronoi cell
+                                     * => remove me */
+                                    particle[multiMask_] = 0;
+                                }
+
+                                /* check out of the Voronoi cell */
+                                particle[voronoiCellId_] = -1;
+                            }
+                        },
+                        particles::filter::All{});
+                }
+
+                /** This method handles the merging process on the Voronoi cell level.
+                 *
+                 * It is called in the main loop of the merging algorithm.
+                 * It does the transition of the distinct states of each Voronoi cell.
+                 *
+                 * @param listVoronoiCells fixed-sized array of Voronoi cells
+                 * @param voronoiIndexPool holds indices of active Voronoi cells within `listVoronoiCells`
+                 */
+                DINLINE void processVoronoiCells(
+                    ArrayVoronoiCells& listVoronoiCells,
+                    VoronoiIndexPool& voronoiIndexPool) const
                 {
-                case VoronoiStatus::collecting:
+                    for(voronoiCellId::type voronoiCellId : voronoiIndexPool)
                     {
-                        /* check if Voronoi cell is too small of count */
-                        if( voronoiCell.numMacroParticles < this->minParticlesToMerge )
+                        VoronoiCell& voronoiCell = listVoronoiCells[voronoiCellId];
+
+                        switch(voronoiCell.status)
+                        {
+                        case VoronoiStatus::collecting:
                         {
-                            voronoiCell.setToAbort();
+                            /* check if Voronoi cell is too small of count */
+                            if(voronoiCell.numMacroParticles < this->minParticlesToMerge)
+                            {
+                                voronoiCell.setToAbort();
 
-                            break;
-                        }
+                                break;
+                            }
 
-                        /* finalize mean value calculation */
-                        voronoiCell.finalizeMeanValues();
+                            /* finalize mean value calculation */
+                            voronoiCell.finalizeMeanValues();
 
-                        /* abort if mean energy of Voronoi cell is below limit */
-                        if( voronoiCell.getMeanEnergy( frame::getMass<FrameType>() ) < this->minMeanEnergy )
-                        {
-                            voronoiCell.setToAbort();
+                            /* abort if mean energy of Voronoi cell is below limit */
+                            if(voronoiCell.getMeanEnergy(frame::getMass<FrameType>()) < this->minMeanEnergy)
+                            {
+                                voronoiCell.setToAbort();
 
-                            break;
-                        }
+                                break;
+                            }
 
-                        /* choose threshold of spread of momentum */
-                        const float_X momSpreadThreshold2 =
-                            this->relMomSpreadThreshold != float_X( -1.0 ) ?
-                            this->relMomSpreadThreshold * this->relMomSpreadThreshold * voronoiCell.getMeanMomentum2() :
-                            this->absMomSpreadThreshold * this->absMomSpreadThreshold;
-
-                        /* check if Voronoi cell is too large in spread of position or momentum */
-                        uint8_t splittingComponent;
-                        if(
-                            (
-                                voronoiCell.splittingStage == VoronoiSplittingStage::position &&
-                                voronoiCell.getMaxPositionSpread2( splittingComponent ) > this->posSpreadThreshold2
-                            ) ||
-                            (
-                                voronoiCell.splittingStage == VoronoiSplittingStage::momentum &&
-                                voronoiCell.getMaxMomentumSpread2( splittingComponent ) > momSpreadThreshold2
-                            )
-                        )
-                        {
-                            /* create two new sub Voronoi cells */
-                            voronoiCell.setToSplitting(
-                                splittingComponent,
-                                voronoiIndexPool.get(), /* lower Voronoi cell id */
-                                voronoiIndexPool.get()  /* higher Voronoi cell id */
-                            );
-
-                            /* abort if Voronoi index pool is full */
-                            if( voronoiCell.lowerCellId == -1 || voronoiCell.higherCellId == -1 )
+                            /* choose threshold of spread of momentum */
+                            const float_X momSpreadThreshold2 = this->relMomSpreadThreshold != float_X(-1.0)
+                                ? this->relMomSpreadThreshold * this->relMomSpreadThreshold
+                                    * voronoiCell.getMeanMomentum2()
+                                : this->absMomSpreadThreshold * this->absMomSpreadThreshold;
+
+                            /* check if Voronoi cell is too large in spread of position or momentum */
+                            uint8_t splittingComponent;
+                            if((voronoiCell.splittingStage == VoronoiSplittingStage::position
+                                && voronoiCell.getMaxPositionSpread2(splittingComponent) > this->posSpreadThreshold2)
+                               || (voronoiCell.splittingStage == VoronoiSplittingStage::momentum
+                                   && voronoiCell.getMaxMomentumSpread2(splittingComponent) > momSpreadThreshold2))
                             {
-                                voronoiCell.setToAbort();
+                                /* create two new sub Voronoi cells */
+                                voronoiCell.setToSplitting(
+                                    splittingComponent,
+                                    voronoiIndexPool.get(), /* lower Voronoi cell id */
+                                    voronoiIndexPool.get() /* higher Voronoi cell id */
+                                );
+
+                                /* abort if Voronoi index pool is full */
+                                if(voronoiCell.lowerCellId == -1 || voronoiCell.higherCellId == -1)
+                                {
+                                    voronoiCell.setToAbort();
+
+                                    break;
+                                }
+
+                                /* initialize the two new sub Voronoi cells in `collecting` state */
+                                listVoronoiCells[voronoiCell.lowerCellId] = VoronoiCell(voronoiCell.splittingStage);
+                                listVoronoiCells[voronoiCell.higherCellId] = VoronoiCell(voronoiCell.splittingStage);
 
                                 break;
                             }
 
-                            /* initialize the two new sub Voronoi cells in `collecting` state */
-                            listVoronoiCells[voronoiCell.lowerCellId] = VoronoiCell( voronoiCell.splittingStage );
-                            listVoronoiCells[voronoiCell.higherCellId] = VoronoiCell( voronoiCell.splittingStage );
+                            /* switch to momentum-splitting-stage after position-splitting-stage */
+                            if(voronoiCell.splittingStage == VoronoiSplittingStage::position)
+                            {
+                                voronoiCell = VoronoiCell(VoronoiSplittingStage::momentum);
 
-                            break;
-                        }
+                                break;
+                            }
 
-                        /* switch to momentum-splitting-stage after position-splitting-stage */
-                        if( voronoiCell.splittingStage == VoronoiSplittingStage::position )
-                        {
-                            voronoiCell = VoronoiCell( VoronoiSplittingStage::momentum );
+                            /* if the Voronoi cell is neither too small in count
+                             * nor too large in spread of position or momentum
+                             * nor too low in mean energy it is ready to be merged
+                             */
+                            voronoiCell.setToReadyForMerging();
 
                             break;
                         }
 
-                        /* if the Voronoi cell is neither too small in count
-                         * nor too large in spread of position or momentum
-                         * nor too low in mean energy it is ready to be merged
-                         */
-                        voronoiCell.setToReadyForMerging();
+                        default:
+                            /* delete Voronoi cell */
+                            voronoiIndexPool.release(voronoiCellId);
 
-                        break;
+                            break;
+                        }
                     }
+                }
 
-                default:
-                    /* delete Voronoi cell */
-                    voronoiIndexPool.release( voronoiCellId );
 
-                    break;
-                }
-            }
-        }
-
-
-        /** Entry point of the particle merging algorithm
-         *
-         * @param cellIndex n-dim. cell index from the origin of the local domain
-         */
-        template< typename T_Acc>
-        DINLINE void operator()(
-            T_Acc const & acc,
-            const pmacc::math::Int<simDim>& cellIndex
-        )
-        {
-            /* multi-dim vector from origin of the super cell to a cell in units of cells */
-            const pmacc::math::Int<simDim> threadIndex = cellIndex % SuperCellSize::toRT();
-
-            /* conversion from a multi-dim cell coordinate to a linear coordinate
-             * of the cell in its super cell */
-            const int linearThreadIdx = pmacc::math::linearize(
-                pmacc::math::CT::shrinkTo<SuperCellSize, simDim-1>::type::toRT(),
-                threadIndex
-            );
-
-            /* fixed-sized array of Voronoi cells */
-            PMACC_SMEM( acc, listVoronoiCells, ArrayVoronoiCells );
-            /* holds indices of active Voronoi cells within `listVoronoiCells` */
-            PMACC_SMEM( acc, voronoiIndexPool, VoronoiIndexPool );
-
-            /* number of initial Voronoi cells
-             *
-             * `1u << simDim` is equivalent to `pow(2, simDim)` but can be
-             * calculated at compile-time to save a shared variable.
-             */
-            constexpr uint16_t numInitialVoronoiCells = pmacc::math::CT::volume<
-                SuperCellSize
-            >::type::value / ( 1u << simDim );
+                /** Entry point of the particle merging algorithm
+                 *
+                 * @param cellIndex n-dim. cell index from the origin of the local domain
+                 */
+                template<typename T_Acc>
+                DINLINE void operator()(T_Acc const& acc, const pmacc::math::Int<simDim>& cellIndex)
+                {
+                    /* multi-dim vector from origin of the super cell to a cell in units of cells */
+                    const pmacc::math::Int<simDim> threadIndex = cellIndex % SuperCellSize::toRT();
+
+                    /* conversion from a multi-dim cell coordinate to a linear coordinate
+                     * of the cell in its super cell */
+                    const int linearThreadIdx = pmacc::math::linearize(
+                        pmacc::math::CT::shrinkTo<SuperCellSize, simDim - 1>::type::toRT(),
+                        threadIndex);
+
+                    /* fixed-sized array of Voronoi cells */
+                    PMACC_SMEM(acc, listVoronoiCells, ArrayVoronoiCells);
+                    /* holds indices of active Voronoi cells within `listVoronoiCells` */
+                    PMACC_SMEM(acc, voronoiIndexPool, VoronoiIndexPool);
+
+                    /* number of initial Voronoi cells
+                     *
+                     * `1u << simDim` is equivalent to `pow(2, simDim)` but can be
+                     * calculated at compile-time to save a shared variable.
+                     */
+                    constexpr uint16_t numInitialVoronoiCells
+                        = pmacc::math::CT::volume<SuperCellSize>::type::value / (1u << simDim);
+
+                    if(linearThreadIdx == 0)
+                    {
+                        /* init index pool of Voronoi Cells */
+                        voronoiIndexPool = VoronoiIndexPool(numInitialVoronoiCells);
+                    }
 
-            if( linearThreadIdx == 0 )
-            {
-                /* init index pool of Voronoi Cells */
-                voronoiIndexPool = VoronoiIndexPool( numInitialVoronoiCells );
-            }
+                    cupla::__syncthreads(acc);
 
-            __syncthreads();
+                    /* set initial Voronoi cells into `collecting` state */
+                    if(linearThreadIdx < numInitialVoronoiCells)
+                        listVoronoiCells[linearThreadIdx] = VoronoiCell();
 
-            /* set initial Voronoi cells into `collecting` state */
-            if( linearThreadIdx < numInitialVoronoiCells )
-                listVoronoiCells[linearThreadIdx] = VoronoiCell();
+                    cupla::__syncthreads(acc);
 
-            __syncthreads();
+                    /* init the voronoiCellId attribute for each particle */
+                    this->initVoronoiCellIdAttribute(acc, cellIndex);
 
-            /* init the voronoiCellId attribute for each particle */
-            this->initVoronoiCellIdAttribute( acc, cellIndex );
+                    cupla::__syncthreads(acc);
 
-            __syncthreads();
+                    /* main loop of the merging algorithm */
+                    while(voronoiIndexPool.size() > 0)
+                    {
+                        this->processParticles(acc, cellIndex, listVoronoiCells);
 
-            /* main loop of the merging algorithm */
-            while( voronoiIndexPool.size() > 0 )
-            {
-                this->processParticles(
-                    acc,
-                    cellIndex,
-                    listVoronoiCells
-                );
+                        cupla::__syncthreads(acc);
 
-                __syncthreads();
+                        /* TODO: parallelize */
+                        if(linearThreadIdx == 0)
+                        {
+                            this->processVoronoiCells(listVoronoiCells, voronoiIndexPool);
+                        }
 
-                /* TODO: parallelize */
-                if( linearThreadIdx == 0 )
-                {
-                    this->processVoronoiCells(
-                        listVoronoiCells,
-                        voronoiIndexPool
-                    );
+                        cupla::__syncthreads(acc);
+                    }
                 }
-
-                __syncthreads();
-            }
-        }
-    };
+            };
 
 
-} // namespace particleMerging
-} // namespace plugins
+        } // namespace particleMerging
+    } // namespace plugins
 } // namespace picongpu
diff --git a/include/picongpu/plugins/particleMerging/VoronoiCell.hpp b/include/picongpu/plugins/particleMerging/VoronoiCell.hpp
index 75bbb56b90..e6699dd192 100644
--- a/include/picongpu/plugins/particleMerging/VoronoiCell.hpp
+++ b/include/picongpu/plugins/particleMerging/VoronoiCell.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2017-2020 Heiko Burau
+/* Copyright 2017-2021 Heiko Burau
  *
  * This file is part of PIConGPU.
  *
@@ -24,241 +24,243 @@
 
 namespace picongpu
 {
-namespace plugins
-{
-namespace particleMerging
-{
-
-    /** Status of a Voronoi cell */
-    enum struct VoronoiStatus : uint8_t
-    {
-        /* !< a Voronoi cell is collecting particles (first state) */
-        collecting,
-        /* !< the Voronoi cell is splitting thus all its particles have
-         * to move to one of two sub-Voronoi cells */
-        splitting,
-        /* !< the cell needs to be destroyed. Before this can happen
-         * all its particles need to clear their voronoiCellId attribute. */
-        abort,
-        /* !< the Voronoi cell is ready for merging. After merging it is destroyed. */
-        readyForMerging,
-    };
-
-
-    /** Stage of a Voronoi cell
-     *
-     * The spliiting process is two-fold: at first, the splitting is done regarding
-     * only the spread in position and then by looking at the spread of momentum.
-     */
-    enum struct VoronoiSplittingStage : bool
-    {
-        /* !< the spatial distribution is splitted */
-        position,
-        /* !< the momentum distribution is splitted */
-        momentum
-    };
-
-
-    /** Represents a Voronoi cell */
-    struct VoronoiCell
+    namespace plugins
     {
-        VoronoiStatus status;
-        VoronoiSplittingStage splittingStage;
-        uint32_t numMacroParticles;
-        float_X numRealParticles;
-
-        float3_X meanValue;
-        float3_X meanSquaredValue;
-
-        uint8_t splittingComponent;
-        int32_t lowerCellId;
-        int32_t higherCellId;
-        int firstParticleFlag;
-
-        HDINLINE
-        VoronoiCell( VoronoiSplittingStage splittingStage = VoronoiSplittingStage::position ) :
-            status( VoronoiStatus::collecting ),
-            splittingStage( splittingStage ),
-            numMacroParticles( 0 ),
-            numRealParticles( float_X( 0.0 ) ),
-            meanValue( float3_X::create( 0.0 ) ),
-            meanSquaredValue( float3_X::create( 0.0 ) ),
-            firstParticleFlag( 0 )
-        {}
-
-        /** status setter */
-        HDINLINE
-        void setToAbort()
+        namespace particleMerging
         {
-            this->status = VoronoiStatus::abort;
-        }
+            /** Status of a Voronoi cell (stored in `VoronoiCell::status`) */
+            enum struct VoronoiStatus : uint8_t
+            {
+                /* !< the Voronoi cell is collecting particles (initial status set by the constructor) */
+                collecting,
+                /* !< the Voronoi cell is splitting, thus all its particles have
+                 * to move to one of two sub-Voronoi cells */
+                splitting,
+                /* !< the cell needs to be destroyed. Before this can happen
+                 * all its particles need to clear their voronoiCellId attribute. */
+                abort,
+                /* !< the Voronoi cell is ready for merging. After merging it is destroyed. */
+                readyForMerging,
+            };
+
+
+            /** Stage of a Voronoi cell
+             *
+             * The splitting process is two-fold: at first, the splitting is done regarding
+             * only the spread in position and then by looking at the spread of momentum.
+             */
+            enum struct VoronoiSplittingStage : bool
+            {
+                /* !< the spatial distribution is split */
+                position,
+                /* !< the momentum distribution is split */
+                momentum
+            };
 
 
-        /** status setter */
-        HDINLINE
-        void setToSplitting(
-            const uint8_t splittingComponent,
-            const int32_t lowerCellId,
-            const int32_t higherCellId)
-        {
-            this->status = VoronoiStatus::splitting;
-            this->splittingComponent = splittingComponent;
-            this->lowerCellId = lowerCellId;
-            this->higherCellId = higherCellId;
-        }
+            /** Represents a Voronoi cell */
+            struct VoronoiCell
+            {
+                VoronoiStatus status;
+                VoronoiSplittingStage splittingStage;
+                uint32_t numMacroParticles;
+                float_X numRealParticles;
+
+                float3_X meanValue;
+                float3_X meanSquaredValue;
+
+                uint8_t splittingComponent;
+                int32_t lowerCellId;
+                int32_t higherCellId;
+                int firstParticleFlag;
+
+                HDINLINE
+                VoronoiCell(VoronoiSplittingStage splittingStage = VoronoiSplittingStage::position)
+                    : status(VoronoiStatus::collecting)
+                    , splittingStage(splittingStage)
+                    , numMacroParticles(0)
+                    , numRealParticles(float_X(0.0))
+                    , meanValue(float3_X::create(0.0))
+                    , meanSquaredValue(float3_X::create(0.0))
+                    , firstParticleFlag(0)
+                {
+                }
 
+                /** set the status of this Voronoi cell to `abort` */
+                HDINLINE
+                void setToAbort()
+                {
+                    this->status = VoronoiStatus::abort;
+                }
 
-        /** status setter */
-        HDINLINE
-        void setToReadyForMerging()
-        {
-            this->status = VoronoiStatus::readyForMerging;
-        }
 
-        /** check if the current thread is associated to the first particle */
-        template< typename T_Acc >
-        DINLINE
-        bool isFirstParticle(T_Acc const & acc)
-        {
-            return atomicExch( &this->firstParticleFlag, 1 ) == 0;
-        }
-
-
-        /** add a particle to this Voronoi cell */
-        template< typename T_Acc >
-        DINLINE
-        void addParticle(
-            T_Acc const & acc,
-            const floatD_X position,
-            const float3_X momentum,
-            const float_X weighting
-        )
-        {
-            atomicAdd( &this->numMacroParticles, static_cast<uint32_t>(1), ::alpaka::hierarchy::Threads{} );
-            atomicAdd( &this->numRealParticles, weighting, ::alpaka::hierarchy::Threads{} );
+                /** set status to `splitting`; store the splitting component and both sub-cell ids */
+                HDINLINE
+                void setToSplitting(
+                    const uint8_t splittingComponent,
+                    const int32_t lowerCellId,
+                    const int32_t higherCellId)
+                {
+                    this->status = VoronoiStatus::splitting;
+                    this->splittingComponent = splittingComponent;
+                    this->lowerCellId = lowerCellId;
+                    this->higherCellId = higherCellId;
+                }
 
-            if( this->splittingStage == VoronoiSplittingStage::position )
-            {
-                const floatD_X position2 = position * position;
 
-                for( int i = 0; i < simDim; i++ )
+                /** set the status of this Voronoi cell to `readyForMerging` */
+                HDINLINE
+                void setToReadyForMerging()
                 {
-                    atomicAdd( &this->meanValue[i], weighting * position[i], ::alpaka::hierarchy::Threads{} );
-                    atomicAdd( &this->meanSquaredValue[i], weighting * position2[i], ::alpaka::hierarchy::Threads{} );
+                    this->status = VoronoiStatus::readyForMerging;
                 }
-            }
-            else
-            {
-                const float3_X momentum2 = momentum * momentum;
 
-                for( int i = 0; i < DIM3; i++ )
+                /** check if the current thread is associated to the first particle */
+                template<typename T_Acc>
+                DINLINE bool isFirstParticle(T_Acc const& acc)
                 {
-                    atomicAdd( &this->meanValue[i], weighting * momentum[i], ::alpaka::hierarchy::Threads{} );
-                    atomicAdd( &this->meanSquaredValue[i], weighting * momentum2[i], ::alpaka::hierarchy::Threads{} );
+                    return cupla::atomicExch(acc, &this->firstParticleFlag, 1) == 0;
                 }
-            }
-        }
 
 
-        /** finalize mean value calculation */
-        HDINLINE
-        void finalizeMeanValues()
-        {
-            this->meanValue /= this->numRealParticles;
-            this->meanSquaredValue /= this->numRealParticles;
-        }
+                /** add a particle to this Voronoi cell */
+                template<typename T_Acc>
+                DINLINE void addParticle(
+                    T_Acc const& acc,
+                    const floatD_X position,
+                    const float3_X momentum,
+                    const float_X weighting)
+                {
+                    cupla::atomicAdd(
+                        acc,
+                        &this->numMacroParticles,
+                        static_cast<uint32_t>(1),
+                        ::alpaka::hierarchy::Threads{});
+                    cupla::atomicAdd(acc, &this->numRealParticles, weighting, ::alpaka::hierarchy::Threads{});
+
+                    if(this->splittingStage == VoronoiSplittingStage::position)
+                    {
+                        const floatD_X position2 = position * position;
+
+                        for(int i = 0; i < simDim; i++)
+                        {
+                            cupla::atomicAdd(
+                                acc,
+                                &this->meanValue[i],
+                                weighting * position[i],
+                                ::alpaka::hierarchy::Threads{});
+                            cupla::atomicAdd(
+                                acc,
+                                &this->meanSquaredValue[i],
+                                weighting * position2[i],
+                                ::alpaka::hierarchy::Threads{});
+                        }
+                    }
+                    else
+                    {
+                        const float3_X momentum2 = momentum * momentum;
+
+                        for(int i = 0; i < DIM3; i++)
+                        {
+                            cupla::atomicAdd(
+                                acc,
+                                &this->meanValue[i],
+                                weighting * momentum[i],
+                                ::alpaka::hierarchy::Threads{});
+                            cupla::atomicAdd(
+                                acc,
+                                &this->meanSquaredValue[i],
+                                weighting * momentum2[i],
+                                ::alpaka::hierarchy::Threads{});
+                        }
+                    }
+                }
 
-        /** get the mean energy of this Voronoi cell if called in momentum stage */
-        HDINLINE
-        float_X getMeanEnergy( const float_X mass ) const
-        {
-            return KinEnergy<>()(
-                this->meanValue,
-                mass
-            );
-        }
-
-        /** get the mean momentum squared of this Voronoi cell if called in momentum stage */
-        HDINLINE
-        float_X getMeanMomentum2() const
-        {
-            return math::abs2( this->meanValue );
-        }
 
+                /** finalize mean value calculation */
+                HDINLINE
+                void finalizeMeanValues()
+                {
+                    this->meanValue /= this->numRealParticles;
+                    this->meanSquaredValue /= this->numRealParticles;
+                }
 
-        /** determine in which of the two sub-Voronoi cells a particle falls */
-        HDINLINE
-        int32_t getSubVoronoiCell(
-            const floatD_X position,
-            const float3_X momentum
-        ) const
-        {
-            const float_X valParticle =
-                this->splittingStage == VoronoiSplittingStage::position ?
-                position[this->splittingComponent] :
-                momentum[this->splittingComponent]
-            ;
-
-            const float_X meanVoronoi = this->meanValue[this->splittingComponent];
-
-            return
-                valParticle < meanVoronoi ?
-                this->lowerCellId :
-                this->higherCellId
-            ;
-        }
-
-
-        /** auxillary function for getting the mean squared deviation in position or momentum */
-        HDINLINE
-        float_X getMaxValueSpread2(
-            uint8_t& component,
-            const uint8_t dimension
-        ) const
-        {
-            const float3_X meanValue2 = this->meanValue * this->meanValue;
-            const float3_X valueSpread2 = this->meanSquaredValue - meanValue2;
+                /** get the mean energy of this Voronoi cell if called in momentum stage */
+                HDINLINE
+                float_X getMeanEnergy(const float_X mass) const
+                {
+                    return KinEnergy<>()(this->meanValue, mass);
+                }
 
-            /* find component of most spread in position */
-            component = 0;
-            float_X maxValueSpread2 = valueSpread2[0];
-            for( uint8_t i = 1; i < dimension; i++ )
-            {
-                if( valueSpread2[i] > maxValueSpread2 )
+                /** get the mean momentum squared of this Voronoi cell if called in momentum stage */
+                HDINLINE
+                float_X getMeanMomentum2() const
                 {
-                    maxValueSpread2 = valueSpread2[i];
-                    component = i;
+                    return pmacc::math::abs2(this->meanValue);
                 }
-            }
 
-            return maxValueSpread2;
-        }
 
+                /** determine in which of the two sub-Voronoi cells a particle falls */
+                HDINLINE
+                int32_t getSubVoronoiCell(const floatD_X position, const float3_X momentum) const
+                {
+                    const float_X valParticle = this->splittingStage == VoronoiSplittingStage::position
+                        ? position[this->splittingComponent]
+                        : momentum[this->splittingComponent];
 
-        /** calculate the maxmimum squared spread in position
-         *
-         * @param component index of position component of maxmimum spread
-         * @return maxmimum squared spread in position
-         */
-        HDINLINE
-        float_X getMaxPositionSpread2( uint8_t& component ) const
-        {
-            return this->getMaxValueSpread2( component, simDim );
-        }
+                    const float_X meanVoronoi = this->meanValue[this->splittingComponent];
+
+                    return valParticle < meanVoronoi ? this->lowerCellId : this->higherCellId;
+                }
 
 
-        /** calculate the maxmimum squared spread in momentum
-         *
-         * @param component index of momentum component of maxmimum spread
-         * @return maxmimum squared spread in momentum
-         */
-        HDINLINE
-        float_X getMaxMomentumSpread2( uint8_t& component ) const
-        {
-            return this->getMaxValueSpread2( component, DIM3 );
-        }
-    };
+                /** auxiliary function for getting the maximum mean squared deviation in position or momentum */
+                HDINLINE
+                float_X getMaxValueSpread2(uint8_t& component, const uint8_t dimension) const
+                {
+                    const float3_X meanValue2 = this->meanValue * this->meanValue;
+                    const float3_X valueSpread2 = this->meanSquaredValue - meanValue2;
+
+                    /* find the component with the largest spread among the first `dimension` components */
+                    component = 0;
+                    float_X maxValueSpread2 = valueSpread2[0];
+                    for(uint8_t i = 1; i < dimension; i++)
+                    {
+                        if(valueSpread2[i] > maxValueSpread2)
+                        {
+                            maxValueSpread2 = valueSpread2[i];
+                            component = i;
+                        }
+                    }
+
+                    return maxValueSpread2;
+                }
+
+
+                /** calculate the maximum squared spread in position
+                 *
+                 * @param[out] component index of position component of maximum spread
+                 * @return maximum squared spread in position
+                 */
+                HDINLINE
+                float_X getMaxPositionSpread2(uint8_t& component) const
+                {
+                    return this->getMaxValueSpread2(component, simDim);
+                }
+
+
+                /** calculate the maximum squared spread in momentum
+                 *
+                 * @param[out] component index of momentum component of maximum spread
+                 * @return maximum squared spread in momentum
+                 */
+                HDINLINE
+                float_X getMaxMomentumSpread2(uint8_t& component) const
+                {
+                    return this->getMaxValueSpread2(component, DIM3);
+                }
+            };
 
-} // namespace particleMerging
-} // namespace plugins
+        } // namespace particleMerging
+    } // namespace plugins
 } // namespace picongpu
diff --git a/include/picongpu/plugins/radiation/ExecuteParticleFilter.hpp b/include/picongpu/plugins/radiation/ExecuteParticleFilter.hpp
index e2d10111d3..678920f2f0 100644
--- a/include/picongpu/plugins/radiation/ExecuteParticleFilter.hpp
+++ b/include/picongpu/plugins/radiation/ExecuteParticleFilter.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2017-2020 Rene Widera
+/* Copyright 2017-2021 Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -29,70 +29,65 @@
 
 namespace picongpu
 {
-namespace plugins
-{
-namespace radiation
-{
-
-    /** read the `radiationMask` of a species */
-    template< bool hasFilter >
-    struct ExecuteParticleFilter
+    namespace plugins
     {
-        /** get the attribute value of `radiationMask`
-         *
-         * @param species buffer
-         * @param currentStep current simulation time step
-         * @return value of the attribute `radiationMask`
-         */
-        template< typename T_Species >
-        void operator()( std::shared_ptr<T_Species> const &, const uint32_t currentStep )
+        namespace radiation
         {
-            particles::Manipulate<
-            picongpu::plugins::radiation::RadiationParticleFilter,
-                T_Species
-            >{}( currentStep );
-        }
-    };
+            /** run the radiation particle filter on a species (primary template) */
+            template<bool hasFilter>
+            struct ExecuteParticleFilter
+            {
+                /** invoke `particles::Manipulate` with `RadiationParticleFilter` for `T_Species`
+                 *
+                 * @param species (unnamed) species pointer; unused, only the type `T_Species` is needed
+                 * @param currentStep current simulation time step
+                 * @note nothing is returned (`void`)
+                 */
+                template<typename T_Species>
+                void operator()(std::shared_ptr<T_Species> const&, const uint32_t currentStep)
+                {
+                    particles::Manipulate<picongpu::plugins::radiation::RadiationParticleFilter, T_Species>{}(
+                        currentStep);
+                }
+            };
 
-    /** specialization
-     *
-     * specialization for the case that the species not owns the attribute
-     * `radiationMask`
-     */
-    template< >
-    struct ExecuteParticleFilter< false >
-    {
-        /** get the attribute value of `radiationMask`
-         *
-         * @param particle to be used
-         * @return always true
-         */
-        template< typename T_Species >
-        void operator()( const std::shared_ptr<T_Species>, const uint32_t currentStep )
-        {
-        }
-    };
+            /** specialization
+             *
+             * specialization for the case that the species does not own the attribute
+             * `radiationMask`
+             */
+            template<>
+            struct ExecuteParticleFilter<false>
+            {
+                /** do nothing, no filter is executed
+                 *
+                 * @param species (unnamed) species pointer, unused
+                 * @param currentStep unused
+                 */
+                template<typename T_Species>
+                void operator()(const std::shared_ptr<T_Species>, const uint32_t currentStep)
+                {
+                }
+            };
 
-    /** execute the particle filter on a species
-     *
-     * It is **allowed** to call this function even if the species does not contain
-     * the attribute `radiationMask`.
-     * The filter is **not** executed if the species does not contain the attribute `radiationMask`.
-     *
-     * @tparam T_Species species type
-     * @param species species to be filtered
-     */
-    template< typename T_Species >
-    void executeParticleFilter( std::shared_ptr<T_Species>& species, const uint32_t currentStep )
-    {
-        constexpr bool hasRadiationFilter = pmacc::traits::HasIdentifier<
-            typename T_Species::FrameType,
-            radiationMask
-        >::type::value;
+            /** execute the particle filter on a species
+             *
+             * It is **allowed** to call this function even if the species does not contain
+             * the attribute `radiationMask`; in that case the filter is **not** executed.
+             *
+             * @tparam T_Species species type
+             * @param species species to be filtered
+             * @param currentStep current simulation time step
+             */
+            template<typename T_Species>
+            void executeParticleFilter(std::shared_ptr<T_Species>& species, const uint32_t currentStep)
+            {
+                constexpr bool hasRadiationFilter
+                    = pmacc::traits::HasIdentifier<typename T_Species::FrameType, radiationMask>::type::value;
 
-        return ExecuteParticleFilter< hasRadiationFilter >{}( species, currentStep );
-    }
+                return ExecuteParticleFilter<hasRadiationFilter>{}(species, currentStep);
+            }
 
-} // namespace radiation
-} // namespace plugins
+        } // namespace radiation
+    } // namespace plugins
 } // namespace picongpu
diff --git a/include/picongpu/plugins/radiation/GetRadiationMask.hpp b/include/picongpu/plugins/radiation/GetRadiationMask.hpp
index 5ca1c58661..ab8db86968 100644
--- a/include/picongpu/plugins/radiation/GetRadiationMask.hpp
+++ b/include/picongpu/plugins/radiation/GetRadiationMask.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2017-2020 Rene Widera
+/* Copyright 2017-2021 Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -26,64 +26,63 @@
 
 namespace picongpu
 {
-namespace plugins
-{
-namespace radiation
-{
-    /** read the `radiationMask` of a species */
-    template< bool hasRadiationMask >
-    struct GetRadiationMask
+    namespace plugins
     {
-        /** get the attribute value of `radiationMask`
-         *
-         * @param particle particle to be used
-         * @return value of the attribute `radiationMask`
-         */
-        template< typename T_Particle >
-        HDINLINE bool operator()( const T_Particle& particle ) const
+        namespace radiation
         {
-          return particle[ picongpu::radiationMask_ ];
-        }
-    };
+            /** functor reading the `radiationMask` of a particle, used when `hasRadiationMask` is `true` */
+            template<bool hasRadiationMask>
+            struct GetRadiationMask
+            {
+                /** get the attribute value of `radiationMask`
+                 *
+                 * @param particle particle whose attribute `picongpu::radiationMask_` is read
+                 * @return value of the attribute `radiationMask`, converted to `bool`
+                 */
+                template<typename T_Particle>
+                HDINLINE bool operator()(const T_Particle& particle) const
+                {
+                    return particle[picongpu::radiationMask_];
+                }
+            };
 
-    /** specialization
-     *
-     * specialization for the case that the species not owns the attribute
-     * `radiationMask`
-     */
-    template< >
-    struct GetRadiationMask< false >
-    {
-        /** get the attribute value of `radiationMask`
-         *
-         * @param particle to be used
-         * @return always true
-         */
-        template< typename T_Particle >
-        HDINLINE bool operator()( const T_Particle& ) const
-        {
-            return true;
-        }
-    };
+            /** specialization
+             *
+             * specialization for the case that the species does not own the attribute
+             * `radiationMask`
+             */
+            template<>
+            struct GetRadiationMask<false>
+            {
+                /** stand-in for reading the attribute `radiationMask` when it does not exist
+                 *
+                 * @param particle (unnamed) particle, it is not accessed
+                 * @return always true
+                 */
+                template<typename T_Particle>
+                HDINLINE bool operator()(const T_Particle&) const
+                {
+                    return true;
+                }
+            };
 
-    /** get the value of the particle attribute `radiationMask`
-     *
-     * Allow to read out the value of the attribute `radiationMask` also if
-     * it is not defined for the particle.
-     *
-     * @tparam T_Particle particle type
-     * @param particle valid particle
-     * @return particle attribute value `radiationMask`, always `true` if attribute `radiationMask` is not defined
-     */
-    template< typename T_Particle >
-    HDINLINE bool getRadiationMask( const T_Particle& particle )
-    {
-        constexpr bool hasRadiationMask = pmacc::traits::HasIdentifier<
-            typename T_Particle::FrameType,
-            radiationMask
-        >::type::value;
-        return GetRadiationMask< hasRadiationMask >{}( particle );
-    }
-} // namespace radiation
-} // namespace plugins
+            /** get the value of the particle attribute `radiationMask`
+             *
+             * Allows reading the value of the attribute `radiationMask` even if
+             * it is not defined for the particle (selected at compile time).
+             *
+             * @tparam T_Particle particle type, must provide the member type `FrameType`
+             * @param particle valid particle
+             * @return particle attribute value `radiationMask`, always `true` if attribute `radiationMask` is not
+             * defined
+             */
+            template<typename T_Particle>
+            HDINLINE bool getRadiationMask(const T_Particle& particle)
+            {
+                constexpr bool hasRadiationMask
+                    = pmacc::traits::HasIdentifier<typename T_Particle::FrameType, radiationMask>::type::value;
+                return GetRadiationMask<hasRadiationMask>{}(particle);
+            }
+        } // namespace radiation
+    } // namespace plugins
 } // namespace picongpu
diff --git a/include/picongpu/plugins/radiation/Radiation.hpp b/include/picongpu/plugins/radiation/Radiation.hpp
index fe8c23ac1f..e62abb3292 100644
--- a/include/picongpu/plugins/radiation/Radiation.hpp
+++ b/include/picongpu/plugins/radiation/Radiation.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Heiko Burau, Rene Widera, Richard Pausch,
+/* Copyright 2013-2021 Axel Huebl, Heiko Burau, Rene Widera, Richard Pausch,
  *                     Klaus Steiniger, Felix Schmitt, Benjamin Worpitz
  *
  * This file is part of PIConGPU.
@@ -21,7 +21,7 @@
 #pragma once
 
 #if(ENABLE_HDF5 != 1)
-#error The activated radiation plugin (radiation.param) requires HDF5
+#    error The activated radiation plugin (radiation.param) requires HDF5
 #endif
 
 #include "picongpu/simulation_defines.hpp"
@@ -51,1226 +51,1214 @@
 #include <iostream>
 #include <fstream>
 #include <cstdlib>
-
+#include <vector>
 
 namespace picongpu
 {
-namespace plugins
-{
-namespace radiation
-{
-
-using namespace pmacc;
+    namespace plugins
+    {
+        namespace radiation
+        {
+            using namespace pmacc;
 
-namespace po = boost::program_options;
+            namespace po = boost::program_options;
 
+            namespace idLabels
+            {
+                enum meshRecordLabelsEnum // NOTE(review): presumably indices of the mesh records - confirm at use
+                {
+                    Amplitude = 0,
+                    Detector = 1,
+                    Frequency = 2
+                };
+            } // end namespace idLabels
 
 
-namespace idLabels
-{
-  enum meshRecordLabelsEnum
-  {
-      Amplitude = 0,
-      Detector = 1,
-      Frequency = 2
-  };
-}// end namespace idLabels
+            ///////////////////////////////////////////////////////////////////////////////////////////////
+            ///////////////////////////////  Radiation Plugin Class  ////////////////////////////////////
+            ///////////////////////////////////////////////////////////////////////////////////////////////
 
+            template<class ParticlesType>
+            class Radiation : public ISimulationPlugin
+            {
+                using Amplitude = picongpu::plugins::radiation::Amplitude<>;
 
+            private:
+                typedef MappingDesc::SuperCellSize SuperCellSize;
 
-///////////////////////////////////////////////////////////////////////////////////////////////
-///////////////////////////////  Radiation Plugin Class  ////////////////////////////////////
-///////////////////////////////////////////////////////////////////////////////////////////////
+                typedef PIConGPUVerboseRadiation radLog;
 
-template<class ParticlesType>
-class Radiation : public ISimulationPlugin
-{
-private:
-
-    typedef MappingDesc::SuperCellSize SuperCellSize;
-
-    typedef PIConGPUVerboseRadiation radLog;
-
-    /**
-     * Object that stores the complex radiated amplitude on host and device.
-     * Radiated amplitude is a function of theta (looking direction) and
-     * frequency. Layout of the radiation array is:
-     * [omega_1(theta_1),omega_2(theta_1),...,omega_N-omega(theta_1),
-     *   omega_1(theta_2),omega_2(theta_2),...,omega_N-omega(theta_N-theta)]
-     */
-    GridBuffer<Amplitude, DIM1> *radiation;
-    radiation_frequencies::InitFreqFunctor freqInit;
-    radiation_frequencies::FreqFunctor freqFkt;
-
-    MappingDesc *cellDescription;
-    std::string notifyPeriod;
-    uint32_t dumpPeriod;
-    uint32_t radStart;
-    uint32_t radEnd;
-
-    std::string speciesName;
-    std::string pluginName;
-    std::string pluginPrefix;
-    std::string filename_prefix;
-    bool totalRad;
-    bool lastRad;
-    std::string folderLastRad;
-    std::string folderTotalRad;
-    bool radPerGPU;
-    std::string folderRadPerGPU;
-    DataSpace<simDim> lastGPUpos;
-
-    /**
-     * Data structure for storage and summation of the intermediate values of
-     * the calculated Amplitude from every host for every direction and
-     * frequency.
-     */
-    Amplitude* timeSumArray;
-    Amplitude *tmp_result;
-    vector_64* detectorPositions;
-    float_64* detectorFrequencies;
-
-    bool isMaster;
-
-    uint32_t currentStep;
-    uint32_t lastStep;
-
-    std::string pathRestart;
-    std::string meshesPathName;
-    std::string particlesPathName;
-
-    mpi::MPIReduce reduce;
-    bool compressionOn;
-    static const int numberMeshRecords = 3;
-
-public:
-
-    Radiation() :
-    pluginName("Radiation: calculate the radiation of a species"),
-    speciesName(ParticlesType::FrameType::getName()),
-    pluginPrefix(speciesName + std::string("_radiation")),
-    filename_prefix(pluginPrefix),
-    radiation(nullptr),
-    cellDescription(nullptr),
-    dumpPeriod(0),
-    totalRad(false),
-    lastRad(false),
-    timeSumArray(nullptr),
-    tmp_result(nullptr),
-    detectorPositions(nullptr),
-    detectorFrequencies(nullptr),
-    isMaster(false),
-    currentStep(0),
-    radPerGPU(false),
-    lastStep(0),
-    meshesPathName("DetectorMesh/"),
-    particlesPathName("DetectorParticle/"),
-    compressionOn(false)
-    {
-        Environment<>::get().PluginConnector().registerPlugin(this);
-    }
+                /**
+                 * Object that stores the complex radiated amplitude on host and device.
+                 * Radiated amplitude is a function of theta (looking direction) and
+                 * frequency. Layout of the radiation array is:
+                 * [omega_1(theta_1),omega_2(theta_1),...,omega_N-omega(theta_1),
+                 *   omega_1(theta_2),omega_2(theta_2),...,omega_N-omega(theta_N-theta)]
+                 * The second dimension is used to store intermediate results if command
+                 * line option numJobs is > 1.
+                 */
+                GridBuffer<Amplitude, 2>* radiation;
+                radiation_frequencies::InitFreqFunctor freqInit;
+                radiation_frequencies::FreqFunctor freqFkt;
+
+                MappingDesc* cellDescription;
+                std::string notifyPeriod;
+                uint32_t dumpPeriod;
+                uint32_t radStart;
+                uint32_t radEnd;
+
+                std::string speciesName;
+                std::string pluginName;
+                std::string pluginPrefix;
+                std::string filename_prefix;
+                bool totalRad;
+                bool lastRad;
+                std::string folderLastRad;
+                std::string folderTotalRad;
+                bool radPerGPU;
+                std::string folderRadPerGPU;
+                DataSpace<simDim> lastGPUpos;
+                int numJobs;
+
+                /**
+                 * Data structure for storage and summation of the intermediate values of
+                 * the calculated Amplitude from every host for every direction and
+                 * frequency.
+                 */
+                std::vector<Amplitude> timeSumArray;
+                std::vector<Amplitude> tmp_result;
+                std::vector<vector_64> detectorPositions;
+                std::vector<float_64> detectorFrequencies;
+
+                bool isMaster;
+
+                uint32_t currentStep;
+                uint32_t lastStep;
+
+                std::string pathRestart;
+                std::string meshesPathName;
+                std::string particlesPathName;
+
+                mpi::MPIReduce reduce;
+                bool compressionOn;
+                static const int numberMeshRecords = 3;
+
+            public:
+                Radiation() // registers the plugin; initializers are listed in member declaration order
+                    : radiation(nullptr)
+                    , cellDescription(nullptr)
+                    , dumpPeriod(0)
+                    , speciesName(ParticlesType::FrameType::getName())
+                    , pluginName("Radiation: calculate the radiation of a species")
+                    , pluginPrefix(speciesName + std::string("_radiation"))
+                    , filename_prefix(pluginPrefix)
+                    , totalRad(false)
+                    , lastRad(false)
+                    , radPerGPU(false)
+                    , isMaster(false)
+                    , currentStep(0)
+                    , lastStep(0)
+                    , meshesPathName("DetectorMesh/")
+                    , particlesPathName("DetectorParticle/")
+                    , compressionOn(false)
+                {
+                    Environment<>::get().PluginConnector().registerPlugin(this);
+                }
 
-    virtual ~Radiation()
-    {
-    }
-
-    /**
-     * This function represents what is actually calculated if the plugin
-     * is called. Here, one only sets the particles pointer to the data of
-     * the latest time step and calls the 'calculateRadiationParticles'
-     * function if for the actual time step radiation is to be calculated.
-     * @param currentStep
-     */
-    void notify(uint32_t currentStep)
-    {
-        if (currentStep >= radStart)
-        {
-            // radEnd = 0 is default, calculates radiation until simulation
-            // end
-            if (currentStep <= radEnd || radEnd == 0)
-            {
-                log<radLog::SIMULATION_STATE > ("Radiation (%1%): calculate time step %2% ") % speciesName % currentStep;
+                virtual ~Radiation() // intentionally empty; `radiation` is deleted in pluginUnload()
+                {
+                }
 
-                /* CORE + BORDER is PIC black magic, currently not needed
-                 *
+                /**
+                 * Called when the plugin is notified for a time step.
+                 * Runs 'calculateRadiationParticles' on CORE + BORDER if `currentStep`
+                 * is not before `radStart` and not after `radEnd`; `radEnd == 0` means
+                 * there is no upper limit. Outside of that range nothing is done.
+                 * @param currentStep time step the plugin is notified for
                  */
-                calculateRadiationParticles < CORE + BORDER > (currentStep);
+                void notify(uint32_t currentStep)
+                {
+                    if(currentStep >= radStart)
+                    {
+                        // radEnd = 0 is default, calculates radiation until simulation
+                        // end
+                        if(currentStep <= radEnd || radEnd == 0)
+                        {
+                            log<radLog::SIMULATION_STATE>("Radiation (%1%): calculate time step %2% ") % speciesName
+                                % currentStep;
+
+                            /* CORE + BORDER is PIC black magic, currently not needed
+                             *
+                             */
+                            calculateRadiationParticles<CORE + BORDER>(currentStep);
+
+                            log<radLog::SIMULATION_STATE>("Radiation (%1%): finished time step %2% ") % speciesName
+                                % currentStep;
+                        }
+                    }
+                }
 
-                log<radLog::SIMULATION_STATE > ("Radiation (%1%): finished time step %2% ") % speciesName % currentStep;
-            }
-        }
-    }
+                void pluginRegisterHelp(po::options_description& desc) // add this plugin's options to desc
+                {
+                    desc.add_options()(
+                        (pluginPrefix + ".period").c_str(),
+                        po::value<std::string>(&notifyPeriod),
+                        "enable plugin [for each n-th step]")(
+                        (pluginPrefix + ".dump").c_str(),
+                        po::value<uint32_t>(&dumpPeriod)->default_value(0),
+                        "dump integrated radiation from last dumped step [for each n-th step] (0 = only print data at "
+                        "end of simulation)")(
+                        (pluginPrefix + ".lastRadiation").c_str(),
+                        po::bool_switch(&lastRad),
+                        "enable calculation of integrated radiation from last dumped step")(
+                        (pluginPrefix + ".folderLastRad").c_str(),
+                        po::value<std::string>(&folderLastRad)->default_value("lastRad"),
+                        "folder in which the integrated radiation from last dumped step is written")(
+                        (pluginPrefix + ".totalRadiation").c_str(),
+                        po::bool_switch(&totalRad),
+                        "enable calculation of integrated radiation from start of simulation")(
+                        (pluginPrefix + ".folderTotalRad").c_str(),
+                        po::value<std::string>(&folderTotalRad)->default_value("totalRad"),
+                        "folder in which the integrated radiation from start of simulation is written")(
+                        (pluginPrefix + ".start").c_str(),
+                        po::value<uint32_t>(&radStart)->default_value(2),
+                        "time index when radiation should start with calculation")(
+                        (pluginPrefix + ".end").c_str(),
+                        po::value<uint32_t>(&radEnd)->default_value(0),
+                        "time index when radiation should end with calculation")(
+                        (pluginPrefix + ".radPerGPU").c_str(),
+                        po::bool_switch(&radPerGPU),
+                        "enable radiation output from each GPU individually")(
+                        (pluginPrefix + ".folderRadPerGPU").c_str(),
+                        po::value<std::string>(&folderRadPerGPU)->default_value("radPerGPU"),
+                        "folder in which the radiation of each GPU is written")(
+                        (pluginPrefix + ".compression").c_str(),
+                        po::bool_switch(&compressionOn),
+                        "enable compression of hdf5 output")(
+                        (pluginPrefix + ".numJobs").c_str(),
+                        po::value<int>(&numJobs)->default_value(2),
+                        "Number of independent jobs used for the radiation calculation.");
+                }
 
-    void pluginRegisterHelp(po::options_description& desc)
-    {
-        desc.add_options()
-            ((pluginPrefix + ".period").c_str(), po::value<std::string> (&notifyPeriod), "enable plugin [for each n-th step]")
-            ((pluginPrefix + ".dump").c_str(), po::value<uint32_t > (&dumpPeriod)->default_value(0), "dump integrated radiation from last dumped step [for each n-th step] (0 = only print data at end of simulation)")
-            ((pluginPrefix + ".lastRadiation").c_str(), po::bool_switch(&lastRad), "enable calculation of integrated radiation from last dumped step")
-            ((pluginPrefix + ".folderLastRad").c_str(), po::value<std::string > (&folderLastRad)->default_value("lastRad"), "folder in which the integrated radiation from last dumped step is written")
-            ((pluginPrefix + ".totalRadiation").c_str(), po::bool_switch(&totalRad), "enable calculation of integrated radiation from start of simulation")
-            ((pluginPrefix + ".folderTotalRad").c_str(), po::value<std::string > (&folderTotalRad)->default_value("totalRad"), "folder in which the integrated radiation from start of simulation is written")
-            ((pluginPrefix + ".start").c_str(), po::value<uint32_t > (&radStart)->default_value(2), "time index when radiation should start with calculation")
-            ((pluginPrefix + ".end").c_str(), po::value<uint32_t > (&radEnd)->default_value(0), "time index when radiation should end with calculation")
-            ((pluginPrefix + ".radPerGPU").c_str(), po::bool_switch(&radPerGPU), "enable radiation output from each GPU individually")
-            ((pluginPrefix + ".folderRadPerGPU").c_str(), po::value<std::string > (&folderRadPerGPU)->default_value("radPerGPU"), "folder in which the radiation of each GPU is written")
-            ((pluginPrefix + ".compression").c_str(), po::bool_switch(&compressionOn), "enable compression of hdf5 output");
-    }
-
-
-    std::string pluginGetName() const
-    {
-        return pluginName;
-    }
 
+                std::string pluginGetName() const // returns a copy of the description set in the constructor
+                {
+                    return pluginName;
+                }
 
-    void setMappingDescription(MappingDesc *cellDescription)
-    {
-        this->cellDescription = cellDescription;
-    }
 
+                void setMappingDescription(MappingDesc* cellDescription) // stores the pointer, no copy is made
+                {
+                    this->cellDescription = cellDescription;
+                }
 
-    void restart(uint32_t timeStep, const std::string restartDirectory)
-    {
-        // only load backup if radiation is calculated:
-        if(notifyPeriod.empty())
-            return;
 
-        if(isMaster)
-        {
-            // this will lead to wrong lastRad output right after the checkpoint if the restart point is
-            // not a dump point. The correct lastRad data can be reconstructed from hdf5 data
-            // since text based lastRad output will be obsolete soon, this is not a problem
-            readHDF5file(timeSumArray, restartDirectory + "/" + speciesName + std::string("_radRestart_"), timeStep);
-            log<radLog::SIMULATION_STATE > ("Radiation (%1%): restart finished") % speciesName;
-        }
-    }
+                void restart(uint32_t timeStep, const std::string restartDirectory)
+                {
+                    // only load the backup if the plugin is enabled (a notify period was set):
+                    if(notifyPeriod.empty())
+                        return;
+
+                    if(isMaster)
+                    {
+                        // Loading into timeSumArray leads to wrong lastRad output right after the checkpoint if the
+                        // restart point is not a dump point. The correct lastRad data can be reconstructed from hdf5
+                        // data; since text based lastRad output will be obsolete soon, this is not a problem.
+                        readHDF5file(
+                            timeSumArray,
+                            restartDirectory + "/" + speciesName + std::string("_radRestart_"),
+                            timeStep);
+                        log<radLog::SIMULATION_STATE>("Radiation (%1%): restart finished") % speciesName;
+                    }
+                }
 
 
-    void checkpoint(uint32_t timeStep, const std::string restartDirectory)
-    {
-        // only write backup if radiation is calculated:
-        if(notifyPeriod.empty())
-            return;
+                void checkpoint(uint32_t timeStep, const std::string restartDirectory)
+                {
+                    // only write a backup if the plugin is enabled (a notify period was set):
+                    if(notifyPeriod.empty())
+                        return;
+
+                    // collect data GPU -> CPU -> Master (executed on every rank, not only on the master)
+                    copyRadiationDeviceToHost();
+                    collectRadiationOnMaster();
+                    sumAmplitudesOverTime(tmp_result, timeSumArray);
+
+                    // write backup file (master only); the file name prefix matches the one read in restart()
+                    if(isMaster)
+                    {
+                        writeHDF5file(tmp_result, restartDirectory + "/" + speciesName + std::string("_radRestart_"));
+                    }
+                }
 
-        // collect data GPU -> CPU -> Master
-        copyRadiationDeviceToHost();
-        collectRadiationOnMaster();
-        sumAmplitudesOverTime(tmp_result, timeSumArray);
 
-        // write backup file
-        if (isMaster)
-        {
-            writeHDF5file(tmp_result, restartDirectory + "/" + speciesName + std::string("_radRestart_"));
-        }
-    }
-
-
-private:
-
-    /**
-     * The plugin is loaded on every MPI rank, and therefor this function is
-     * executed on every MPI rank.
-     * One host with MPI rank 0 is defined to be the master.
-     * It creates a folder where all the
-     * results are saved and, depending on the type of radiation calculation,
-     * creates an additional data structure for the summation of all
-     * intermediate values.
-     * On every host data structure for storage of the calculated radiation
-     * is created.       */
-    void pluginLoad()
-    {
-        if(!notifyPeriod.empty())
-        {
-            // allocate memory for all amplitudes for temporal data collection
-            tmp_result = new Amplitude[elements_amplitude()];
+            private:
+                /**
+                 * The plugin is loaded on every MPI rank, and therefore this function is
+                 * executed on every MPI rank. Nothing is done if no notify period was set.
+                 * One rank (see `reduce.hasResult`) is defined to be the master.
+                 * The master creates the output folders and additionally allocates the
+                 * time-summation array and the detector direction / frequency tables.
+                 * Every rank allocates `tmp_result` and the 2D device/host buffer
+                 * `radiation` (elements_amplitude() x numJobs).
+                 * A non-positive `numJobs` is reported on stderr and replaced by 1.
+                 */
+                void pluginLoad()
+                {
+                    if(!notifyPeriod.empty())
+                    {
+                        if(numJobs <= 0)
+                        {
+                            std::cerr << "'numJobs' must be '>=1' value is adjusted from " << numJobs << " to '1'."
+                                      << std::endl;
+                            numJobs = 1;
+                        }
+                        /* allocate memory for all amplitudes for temporal data collection
+                         * ACCUMULATOR! Should be in double precision for numerical stability.
+                         */
+                        tmp_result.resize(elements_amplitude(), Amplitude::zero());
+
+                        /*only rank 0 creates a file*/
+                        isMaster = reduce.hasResult(mpi::reduceMethods::Reduce());
+
+                        /* Buffer for GPU results.
+                         * The second dimension is used to store intermediate results if command
+                         * line option numJobs is > 1.
+                         */
+                        radiation = new GridBuffer<Amplitude, 2>(DataSpace<2>(elements_amplitude(), numJobs));
+
+                        freqInit.Init(frequencies_from_list::listLocation);
+                        freqFkt = freqInit.getFunctor();
+
+                        Environment<>::get().PluginConnector().setNotificationPeriod(this, notifyPeriod);
+                        pmacc::Filesystem<simDim>& fs = Environment<simDim>::get().Filesystem();
+
+                        if(isMaster)
+                        {
+                            timeSumArray.resize(elements_amplitude(), Amplitude::zero());
+
+                            /* save detector position / observation direction */
+                            detectorPositions.resize(parameters::N_observer);
+                            for(uint32_t detectorIndex = 0; detectorIndex < parameters::N_observer; ++detectorIndex)
+                            {
+                                detectorPositions[detectorIndex]
+                                    = radiation_observer::observation_direction(detectorIndex);
+                            }
+
+                            /* save detector frequencies */
+                            detectorFrequencies.resize(radiation_frequencies::N_omega);
+                            for(uint32_t detectorIndex = 0; detectorIndex < radiation_frequencies::N_omega;
+                                ++detectorIndex)
+                            {
+                                detectorFrequencies[detectorIndex] = freqFkt.get(detectorIndex);
+                            }
+                        }
+
+                        if(isMaster)
+                        {
+                            fs.createDirectory("radiationHDF5");
+                            fs.setDirectoryPermissions("radiationHDF5");
+                        }
+
+
+                        if(isMaster && radPerGPU)
+                        {
+                            fs.createDirectory(folderRadPerGPU);
+                            fs.setDirectoryPermissions(folderRadPerGPU);
+                        }
+
+                        if(isMaster && totalRad)
+                        {
+                            // create folder for total output
+                            fs.createDirectory(folderTotalRad);
+                            fs.setDirectoryPermissions(folderTotalRad);
+                        }
+                        if(isMaster && lastRad)
+                        {
+                            // create folder for the output since the last dumped step
+                            fs.createDirectory(folderLastRad);
+                            fs.setDirectoryPermissions(folderLastRad);
+                        }
+                    }
+                }
 
-            /*only rank 0 create a file*/
-            isMaster = reduce.hasResult(mpi::reduceMethods::Reduce());
 
-            radiation = new GridBuffer<Amplitude, DIM1 > (DataSpace<DIM1 > (elements_amplitude())); //create one int on GPU and host
+                void pluginUnload()
+                {
+                    if(!notifyPeriod.empty())
+                    {
+                        // offset of the local domain within the global domain, shifted in y by the
+                        // local domain size times the moving window slide counter for `currentStep`
+                        const SubGrid<simDim>& subGrid = Environment<simDim>::get().SubGrid();
+                        DataSpace<simDim> localSize(subGrid.getLocalDomain().size);
+                        const uint32_t numSlides = MovingWindow::getInstance().getSlideCounter(currentStep);
+                        DataSpace<simDim> globalOffset(subGrid.getLocalDomain().offset);
+                        globalOffset.y() += (localSize.y() * numSlides);
+
+                        // only print data at end of simulation if no dump period was set
+                        if(dumpPeriod == 0)
+                        {
+                            collectDataGPUToMaster();
+                            writeAllFiles(globalOffset);
+                        }
+
+
+                        __delete(radiation);
+                        CUDA_CHECK(cuplaGetLastError());
+                    }
+                }
 
-            freqInit.Init(frequencies_from_list::listLocation);
-            freqFkt = freqInit.getFunctor();
 
-            Environment<>::get().PluginConnector().setNotificationPeriod(this, notifyPeriod);
-            pmacc::Filesystem<simDim>& fs = Environment<simDim>::get().Filesystem();
+                /** copy the buffer from device to host and add the rows of all extra jobs onto row y = 0 */
+                void copyRadiationDeviceToHost()
+                {
+                    radiation->deviceToHost();
+                    __getTransactionEvent().waitForFinished();
+
+                    auto dbox = radiation->getHostBuffer().getDataBox();
+                    int numAmp = elements_amplitude();
+                    // update the main result matrix (y index zero) with the rows y = 1 ... numJobs - 1
+                    for(int resultIdx = 1; resultIdx < numJobs; ++resultIdx)
+                        for(int ampIdx = 0; ampIdx < numAmp; ++ampIdx)
+                        {
+                            dbox(DataSpace<2>(ampIdx, 0)) += dbox(DataSpace<2>(ampIdx, resultIdx));
+                        }
+                }
 
-            if (isMaster)
-            {
-                timeSumArray = new Amplitude[elements_amplitude()];
-                for (unsigned int i = 0; i < elements_amplitude(); ++i)
-                    timeSumArray[i] = Amplitude::zero();
 
-                /* save detector position / observation direction */
-                detectorPositions = new vector_64[parameters::N_observer];
-                for(uint32_t detectorIndex=0; detectorIndex < parameters::N_observer; ++detectorIndex)
+                /** write the host radiation buffer of this GPU to its own file (only if radPerGPU is set);
+                 *  requires call of copyRadiationDeviceToHost() before */
+                void saveRadPerGPU(const DataSpace<simDim> currentGPUpos)
                 {
-                    detectorPositions[detectorIndex] = radiation_observer::observation_direction(detectorIndex);
+                    if(radPerGPU)
+                    {
+                        // only write if position equals lastGPUpos, i.e. the full time period was covered
+                        if(lastGPUpos == currentGPUpos)
+                        {
+                            std::stringstream last_time_step_str;
+                            std::stringstream current_time_step_str;
+                            std::stringstream GPUpos_str;
+
+                            last_time_step_str << lastStep;
+                            current_time_step_str << currentStep;
+
+                            for(uint32_t dimIndex = 0; dimIndex < simDim; ++dimIndex)
+                                GPUpos_str << "_" << currentGPUpos[dimIndex];
+
+                            writeFile(
+                                radiation->getHostBuffer().getBasePointer(),
+                                folderRadPerGPU + "/" + speciesName + "_radPerGPU_pos" + GPUpos_str.str() + "_time_"
+                                    + last_time_step_str.str() + "-" + current_time_step_str.str() + ".dat");
+                        }
+                        lastGPUpos = currentGPUpos;
+                    }
                 }
 
-                /* save detector frequencies */
-                detectorFrequencies = new float_64[radiation_frequencies::N_omega];
-                for(uint32_t detectorIndex=0; detectorIndex < radiation_frequencies::N_omega; ++detectorIndex)
+
+                /** returns number of amplitude entries (N_omega frequencies times N_observer observers) */
+                static unsigned int elements_amplitude()
                 {
-                    detectorFrequencies[detectorIndex] = freqFkt.get(detectorIndex);
+                    return radiation_frequencies::N_omega
+                        * parameters::N_observer; // storage for amplitude results on GPU
                 }
-            }
 
-            if (isMaster)
-            {
-                fs.createDirectory("radiationHDF5");
-                fs.setDirectoryPermissions("radiationHDF5");
-            }
 
+                /** combine the host radiation data of all ranks (Add reduction) into tmp_result on master;
+                 *  copyRadiationDeviceToHost() should be called before */
+                void collectRadiationOnMaster()
+                {
+                    reduce(
+                        nvidia::functors::Add(),
+                        tmp_result.data(),
+                        radiation->getHostBuffer().getBasePointer(),
+                        elements_amplitude(),
+                        mpi::reduceMethods::Reduce());
+                }
 
-            if (isMaster && radPerGPU)
-            {
-                fs.createDirectory(folderRadPerGPU);
-                fs.setDirectoryPermissions(folderRadPerGPU);
-            }
 
-            if (isMaster && totalRad)
-            {
-                //create folder for total output
-                fs.createDirectory(folderTotalRad);
-                fs.setDirectoryPermissions(folderTotalRad);
-            }
-            if (isMaster && lastRad)
-            {
-                //create folder for total output
-                fs.createDirectory(folderLastRad);
-                fs.setDirectoryPermissions(folderLastRad);
-            }
+                /** on master only: add summandArray element-wise to targetArray (elements_amplitude() entries);
+                 *  should be called after collectRadiationOnMaster() */
+                void sumAmplitudesOverTime(std::vector<Amplitude>& targetArray, std::vector<Amplitude>& summandArray)
+                {
+                    if(isMaster)
+                    {
+                        // accumulate: targetArray[i] += summandArray[i] for every amplitude entry
+                        for(unsigned int i = 0; i < elements_amplitude(); ++i)
+                            targetArray[i] += summandArray[i];
+                    }
+                }
 
-        }
-    }
 
+                /** master only, if lastRad is set: writes tmp_result (radiation of the current time step) to
+                 *  a .dat file in folderLastRad. Radiation from previous time steps is not included. */
+                void writeLastRadToText()
+                {
+                    // only the master rank writes data
+                    if(isMaster)
+                    {
+                        // write file only if lastRad flag was selected
+                        if(lastRad)
+                        {
+                            // convert the current time step to a string for the file name
+                            std::stringstream o_step;
+                            o_step << currentStep;
+
+                            // file name: <folderLastRad>/<filename_prefix>_<currentStep>.dat
+                            writeFile(
+                                tmp_result.data(),
+                                folderLastRad + "/" + filename_prefix + "_" + o_step.str() + ".dat");
+                        }
+                    }
+                }
 
-    void pluginUnload()
-    {
-        if(!notifyPeriod.empty())
-        {
 
-            // Some funny things that make it possible for the kernel to calculate
-            // the absolute position of the particles
-            const SubGrid<simDim>& subGrid = Environment<simDim>::get().SubGrid();
-            DataSpace<simDim> localSize(subGrid.getLocalDomain().size);
-            const uint32_t numSlides = MovingWindow::getInstance().getSlideCounter(currentStep);
-            DataSpace<simDim> globalOffset(subGrid.getLocalDomain().offset);
-            globalOffset.y() += (localSize.y() * numSlides);
+                /** master only, if totalRad is set: writes timeSumArray (total radiation over time) to a .dat file */
+                void writeTotalRadToText()
+                {
+                    // only the master rank writes data
+                    if(isMaster)
+                    {
+                        // write file only if totalRad flag was selected
+                        if(totalRad)
+                        {
+                            // convert the current time step to a string for the file name
+                            std::stringstream o_step;
+                            o_step << currentStep;
+
+                            // file name: <folderTotalRad>/<filename_prefix>_<currentStep>.dat
+                            writeFile(
+                                timeSumArray.data(),
+                                folderTotalRad + "/" + filename_prefix + "_" + o_step.str() + ".dat");
+                        }
+                    }
+                }
 
-            // only print data at end of simulation if no dump period was set
-            if (dumpPeriod == 0)
-            {
-                collectDataGPUToMaster();
-                writeAllFiles(globalOffset);
-            }
 
-            if (isMaster)
-            {
-                __deleteArray(timeSumArray);
-                delete[] detectorPositions;
-                delete[] detectorFrequencies;
-            }
-
-            __delete(radiation);
-            CUDA_CHECK(cudaGetLastError());
-
-            __deleteArray(tmp_result);
-        }
-    }
-
-
-  /** Method to copy data from GPU to CPU */
-  void copyRadiationDeviceToHost()
-  {
-    radiation->deviceToHost();
-    __getTransactionEvent().waitForFinished();
-  }
-
-
-  /** write radiation from each GPU to file individually
-   *  requires call of copyRadiationDeviceToHost() before */
-  void saveRadPerGPU(const DataSpace<simDim> currentGPUpos)
-  {
-    if (radPerGPU)
-      {
-        // only print lastGPUrad if full time period was covered
-        if (lastGPUpos == currentGPUpos)
-          {
-            std::stringstream last_time_step_str;
-            std::stringstream current_time_step_str;
-            std::stringstream GPUpos_str;
-
-            last_time_step_str << lastStep;
-            current_time_step_str << currentStep;
-
-            for(uint32_t dimIndex=0; dimIndex<simDim; ++dimIndex)
-                GPUpos_str << "_" <<currentGPUpos[dimIndex];
-
-            writeFile(radiation->getHostBuffer().getBasePointer(), folderRadPerGPU + "/" + speciesName
-                      + "_radPerGPU_pos" + GPUpos_str.str()
-                      + "_time_" + last_time_step_str.str()
-                      + "-" + current_time_step_str.str() + ".dat");
-          }
-        lastGPUpos = currentGPUpos;
-      }
-
-  }
-
-
-  /** returns number of observers (radiation detectors) */
-  static unsigned int elements_amplitude()
-  {
-    return radiation_frequencies::N_omega * parameters::N_observer; // storage for amplitude results on GPU
-  }
-
-
-  /** combine radiation data from each CPU and store result on master
-   *  copyRadiationDeviceToHost() should be called before */
-  void collectRadiationOnMaster()
-  {
-      reduce(nvidia::functors::Add(),
-             tmp_result,
-             radiation->getHostBuffer().getBasePointer(),
-             elements_amplitude(),
-             mpi::reduceMethods::Reduce()
-             );
-  }
-
-
-  /** add collected radiation data to previously stored data
-   *  should be called after collectRadiationOnMaster() */
-  void sumAmplitudesOverTime(Amplitude* targetArray, Amplitude* summandArray)
-  {
-    if (isMaster)
-      {
-        // add last amplitudes to previous amplitudes
-        for (unsigned int i = 0; i < elements_amplitude(); ++i)
-          targetArray[i] += summandArray[i];
-      }
-  }
-
-
-
-  /** writes to file the emitted radiation only from the current
-   *  time step. Radiation from previous time steps is neglected. */
-  void writeLastRadToText()
-  {
-      // only the master rank writes data
-      if (isMaster)
-      {
-          // write file only if lastRad flag was selected
-          if (lastRad)
-          {
-              // get time step as string
-              std::stringstream o_step;
-              o_step << currentStep;
-
-              // write lastRad data to txt
-              writeFile(tmp_result, folderLastRad + "/" + filename_prefix + "_" + o_step.str() + ".dat");
-          }
-      }
-  }
-
-
-  /** writes the total radiation (over entire simulation time) to file */
-  void writeTotalRadToText()
-  {
-      // only the master rank writes data
-      if (isMaster)
-      {
-          // write file only if totalRad flag was selected
-          if (totalRad)
-          {
-              // get time step as string
-              std::stringstream o_step;
-              o_step << currentStep;
-
-              // write totalRad data to txt
-              writeFile(timeSumArray, folderTotalRad + "/" + filename_prefix + "_" + o_step.str() + ".dat");
-          }
-      }
-  }
-
-
-  /** write total radiation data as HDF5 file */
-  void writeAmplitudesToHDF5()
-  {
-      if (isMaster)
-      {
-        writeHDF5file(timeSumArray, std::string("radiationHDF5/") + speciesName + std::string("_radAmplitudes_"));
-      }
-  }
-
-
-  /** perform all operations to get data from GPU to master */
-  void collectDataGPUToMaster()
-  {
-      // collect data GPU -> CPU -> Master
-      copyRadiationDeviceToHost();
-      collectRadiationOnMaster();
-      sumAmplitudesOverTime(timeSumArray, tmp_result);
-  }
-
-
-  /** write all possible/selected output */
-  void writeAllFiles(const DataSpace<simDim> currentGPUpos)
-  {
-      // write data to files
-      saveRadPerGPU(currentGPUpos);
-      writeLastRadToText();
-      writeTotalRadToText();
-      writeAmplitudesToHDF5();
-  }
-
-
-  /** This method returns hdf5 data structure names for amplitudes
-   *
-   *  Arguments:
-   *  int index - index of Amplitude
-   *              "-1" return record name
-   *
-   *  Return:
-   *  std::string - name
-   *
-   * This method avoids initializing static constexpr string arrays.
-   */
-  static const std::string dataLabels(int index)
-  {
-      const std::string path("Amplitude/");
-
-      /* return record name if handed -1 */
-      if(index == -1)
-          return path;
-
-      const std::string dataLabelsList[] = {"x_Re",
-                                            "x_Im",
-                                            "y_Re",
-                                            "y_Im",
-                                            "z_Re",
-                                            "z_Im"};
-
-      return path + dataLabelsList[index];
-  }
-
-  /** This method returns hdf5 data structure names for detector directions
-   *
-   *  Arguments:
-   *  int index - index of detector
-   *              "-1" return record name
-   *
-   *  Return:
-   *  std::string - name
-   *
-   * This method avoids initializing static const string arrays.
-   */
-  static const std::string dataLabelsDetectorDirection(int index)
-  {
-      const std::string path("DetectorDirection/");
-
-      /* return record name if handed -1 */
-      if(index == -1)
-          return path;
-
-      const std::string dataLabelsList[] = {"x",
-                                            "y",
-                                            "z"};
-
-      return path + dataLabelsList[index];
-  }
-
-
-  /** This method returns hdf5 data structure names for detector frequencies
-   *
-   *  Arguments:
-   *  int index - index of detector
-   *              "-1" return record name
-   *
-   *  Return:
-   *  std::string - name
-   *
-   * This method avoids initializing static const string arrays.
-   */
-  static const std::string dataLabelsDetectorFrequency(int index)
-  {
-      const std::string path("DetectorFrequency/");
-
-      /* return record name if handed -1 */
-      if(index == -1)
-          return path;
-
-      const std::string dataLabelsList[] = {"omega"};
-
-      return path + dataLabelsList[index];
-  }
-
-  /** This method returns hdf5 data structure names for all mesh records
-   *
-   *  Arguments:
-   *  int index - index of record
-   *              "-1" return number of mesh records
-   *
-   *  Return:
-   *  std::string - name
-   *
-   * This method avoids initializing static const string arrays.
-   */
-  static const std::string meshRecordLabels(int index)
-  {
-      if(index == idLabels::Amplitude)
-          return dataLabels(-1);
-      else if (index == idLabels::Detector)
-          return dataLabelsDetectorDirection(-1);
-      else if (index == idLabels::Frequency)
-          return dataLabelsDetectorFrequency(-1);
-      else
-        return std::string("this-record-does-not-exist");
-  }
-
-
-
-
-
-  /** Write Amplitude data to HDF5 file
-   *
-   * Arguments:
-   * Amplitude* values - array of complex amplitude values
-   * std::string name - path and beginning of file name to store data to
-   */
-  void writeHDF5file(Amplitude* values, std::string name)
-  {
-      splash::SerialDataCollector hdf5DataFile(1);
-      splash::DataCollector::FileCreationAttr fAttr;
-
-      splash::DataCollector::initFileCreationAttr(fAttr);
-      fAttr.enableCompression = compressionOn;
-
-      std::ostringstream filename;
-      filename << name << currentStep;
-
-      hdf5DataFile.open(filename.str().c_str(), fAttr);
-
-      typename PICToSplash<float_64>::type radSplashType;
-
-
-      splash::Dimensions bufferSize(Amplitude::numComponents,
-                                    radiation_frequencies::N_omega,
-                                    parameters::N_observer);
-
-      splash::Dimensions componentSize(1,
-                                       radiation_frequencies::N_omega,
-                                       parameters::N_observer);
-
-      splash::Dimensions stride(Amplitude::numComponents,1,1);
-
-      /* get the radiation amplitude unit */
-      Amplitude UnityAmplitude(1., 0., 0., 0., 0., 0.);
-      const picongpu::float_64 factor = UnityAmplitude.calc_radiation() * UNIT_ENERGY * UNIT_TIME ;
-
-      typedef PICToSplash<float_X>::type SplashFloatXType;
-      SplashFloatXType splashFloatXType;
-
-      for(uint32_t ampIndex=0; ampIndex < Amplitude::numComponents; ++ampIndex)
-      {
-          splash::Dimensions offset(ampIndex,0,0);
-          splash::Selection dataSelection(bufferSize,
-                                          componentSize,
-                                          offset,
-                                          stride);
-
-          /* save data for each x/y/z * Re/Im amplitude */
-          hdf5DataFile.write(currentStep,
-                             radSplashType,
-                             3,
-                             dataSelection,
-                             (meshesPathName + dataLabels(ampIndex)).c_str(),
-                             values);
-
-          /* save SI unit as attribute together with data set */
-          hdf5DataFile.writeAttribute(currentStep,
-                                      radSplashType,
-                                      (meshesPathName + dataLabels(ampIndex)).c_str(),
-                                      "unitSI",
-                                      &factor);
-
-          /* position */
-          std::vector<float_X> positionMesh(simDim, 0.0); /* there is no offset - zero */
-          hdf5DataFile.writeAttribute(currentStep,
-                                      splashFloatXType,
-                                      (meshesPathName + dataLabels(ampIndex)).c_str(),
-                                      "position",
-                                      1u,
-                                      splash::Dimensions(simDim,0,0),
-                                      &(*positionMesh.begin()));
-      }
-
-      /* save SI unit as attribute in the Amplitude group (for convenience) */
-      hdf5DataFile.writeAttribute(currentStep,
-                                  radSplashType,
-                                  (meshesPathName + std::string("Amplitude")).c_str(),
-                                  "unitSI",
-                                  &factor);
-
-      /* save detector position / observation direction */
-      splash::Dimensions bufferSizeDetector(3,
-                                            1,
-                                            parameters::N_observer);
-
-      splash::Dimensions componentSizeDetector(1,
-                                               1,
-                                               parameters::N_observer);
-
-      splash::Dimensions strideDetector(3,1,1);
-
-      for(uint32_t detectorDim=0; detectorDim < 3; ++detectorDim)
-      {
-          splash::Dimensions offset(detectorDim,0,0);
-          splash::Selection dataSelection(bufferSizeDetector,
-                                      componentSizeDetector,
-                                      offset,
-                                      strideDetector);
-
-          hdf5DataFile.write(currentStep,
-                             radSplashType,
-                             3,
-                             dataSelection,
-                             (meshesPathName + dataLabelsDetectorDirection(detectorDim)).c_str(),
-                             detectorPositions);
-
-          /* save SI unit as attribute together with data set */
-          const picongpu::float_64 factorDirection = 1.0  ;
-          hdf5DataFile.writeAttribute(currentStep,
-                                      radSplashType,
-                                      (meshesPathName + dataLabelsDetectorDirection(detectorDim)).c_str(),
-                                      "unitSI",
-                                      &factorDirection);
-
-          /* position */
-          std::vector<float_X> positionMesh(simDim, 0.0); /* there is no offset - zero */
-          hdf5DataFile.writeAttribute(currentStep,
-                                      splashFloatXType,
-                                      (meshesPathName + dataLabelsDetectorDirection(detectorDim)).c_str(),
-                                      "position",
-                                      1u,
-                                      splash::Dimensions(simDim,0,0),
-                                      &(*positionMesh.begin()));
-
-      }
-
-
-
-      /* save detector frequencies */
-      splash::Dimensions bufferSizeOmega(1,
-                                         radiation_frequencies::N_omega,
-                                         1);
-
-      splash::Dimensions strideOmega(1,1,1);
-
-      splash::Dimensions offset(0,0,0);
-      splash::Selection dataSelection(bufferSizeOmega,
-                                      bufferSizeOmega,
-                                      offset,
-                                      strideOmega);
-
-      hdf5DataFile.write(currentStep,
-                         radSplashType,
-                         3,
-                         dataSelection,
-                         (meshesPathName + dataLabelsDetectorFrequency(0)).c_str(),
-                         detectorFrequencies);
-
-      /* save SI unit as attribute together with data set */
-      const picongpu::float_64 factorOmega = 1.0 / UNIT_TIME ;
-      hdf5DataFile.writeAttribute(currentStep,
-                                  radSplashType,
-                                  (meshesPathName + dataLabelsDetectorFrequency(0)).c_str(),
-                                  "unitSI",
-                                  &factorOmega);
-
-      /* position */
-      std::vector<float_X> positionMesh(simDim, 0.0); /* there is no offset - zero */
-      hdf5DataFile.writeAttribute(currentStep,
-                                  splashFloatXType,
-                                  (meshesPathName + dataLabelsDetectorFrequency(0)).c_str(),
-                                  "position",
-                                  1u,
-                                  splash::Dimensions(simDim,0,0),
-                                  &(*positionMesh.begin()));
-
-
-      /* begin openPMD attributes */
-      /* begin required openPMD global attributes */
-      std::string openPMDversion("1.0.0");
-      splash::ColTypeString ctOpenPMDversion(openPMDversion.length());
-      hdf5DataFile.writeGlobalAttribute( ctOpenPMDversion,
-                                         "openPMD",
-                                         openPMDversion.c_str() );
-
-      const uint32_t openPMDextension = 0; // no extension
-      splash::ColTypeUInt32 ctUInt32;
-      hdf5DataFile.writeGlobalAttribute( ctUInt32,
-                                         "openPMDextension",
-                                         &openPMDextension );
-
-      std::string basePath("/data/%T/");
-      splash::ColTypeString ctBasePath(basePath.length());
-      hdf5DataFile.writeGlobalAttribute(ctBasePath,
-                                        "basePath",
-                                        basePath.c_str() );
-
-      splash::ColTypeString ctMeshesPath(meshesPathName.length());
-      hdf5DataFile.writeGlobalAttribute(ctMeshesPath,
-                                        "meshesPath",
-                                        meshesPathName.c_str() );
-
-
-      splash::ColTypeString ctParticlesPath(particlesPathName.length());
-      hdf5DataFile.writeGlobalAttribute( ctParticlesPath,
-                                         "particlesPath",
-                                         particlesPathName.c_str() );
-
-      std::string iterationEncoding("fileBased");
-      splash::ColTypeString ctIterationEncoding(iterationEncoding.length());
-      hdf5DataFile.writeGlobalAttribute( ctIterationEncoding,
-                                         "iterationEncoding",
-                                         iterationEncoding.c_str() );
-
-      /* the ..._0_0_0... extension comes from the current filename
-         formating of the serial data colector in libSplash */
-      const int indexCutDirectory = name.rfind('/');
-      std::string iterationFormat(name.substr(indexCutDirectory + 1) +  std::string("%T_0_0_0.h5"));
-      splash::ColTypeString ctIterationFormat(iterationFormat.length());
-      hdf5DataFile.writeGlobalAttribute( ctIterationFormat,
-                                         "iterationFormat",
-                                         iterationFormat.c_str() );
-
-      hdf5DataFile.writeAttribute(currentStep, splashFloatXType, nullptr, "dt", &DELTA_T);
-      const float_X time = float_X(currentStep) * DELTA_T;
-      hdf5DataFile.writeAttribute(currentStep, splashFloatXType, nullptr, "time", &time);
-      splash::ColTypeDouble ctDouble;
-      hdf5DataFile.writeAttribute(currentStep, ctDouble, nullptr, "timeUnitSI", &UNIT_TIME);
-
-      /* end required openPMD global attributes */
-
-      /* begin recommended openPMD global attributes */
-
-      std::string author = Environment<>::get().SimulationDescription().getAuthor();
-      if( author.length() > 0 )
-        {
-          splash::ColTypeString ctAuthor(author.length());
-          hdf5DataFile.writeGlobalAttribute( ctAuthor,
-                                             "author",
-                                             author.c_str() );
-        }
-
-      std::string software("PIConGPU");
-      splash::ColTypeString ctSoftware(software.length());
-      hdf5DataFile.writeGlobalAttribute( ctSoftware,
-                                         "software",
-                                         software.c_str() );
-
-      std::stringstream softwareVersion;
-      softwareVersion << PICONGPU_VERSION_MAJOR << "."
-                      << PICONGPU_VERSION_MINOR << "."
-                      << PICONGPU_VERSION_PATCH;
-      if( ! std::string(PICONGPU_VERSION_LABEL).empty() )
-          softwareVersion << "-" << PICONGPU_VERSION_LABEL;
-      splash::ColTypeString ctSoftwareVersion(softwareVersion.str().length());
-      hdf5DataFile.writeGlobalAttribute( ctSoftwareVersion,
-                                         "softwareVersion",
-                                         softwareVersion.str().c_str() );
-
-      std::string date  = helper::getDateString("%F %T %z");
-      splash::ColTypeString ctDate(date.length());
-      hdf5DataFile.writeGlobalAttribute( ctDate,
-                                         "date",
-                                         date.c_str() );
-
-      /* end recommended openPMD global attributes */
-
-      /* begin required openPMD attributes for meshes records */
-
-      for(int i = 0; i<numberMeshRecords; ++i)
-      {
-          /* timeOffset */
-          const float_X timeOffset = 0.0;
-          hdf5DataFile.writeAttribute(currentStep, splashFloatXType,
-                                      (meshesPathName + meshRecordLabels(i)).c_str(),
-                                      "timeOffset", &timeOffset);
-
-          /* gridGlobalOffset */
-          std::vector<float_64> gridGlobalOffset(simDim, 0.0); /* there is no offset - zero */
-          hdf5DataFile.writeAttribute(currentStep,
-                                      ctDouble,
-                                      (meshesPathName + meshRecordLabels(i)).c_str(),
-                                      "gridGlobalOffset",
-                                      1u,
-                                      splash::Dimensions(simDim,0,0),
-                                      &(*gridGlobalOffset.begin()));
-
-          /* gridUnit */
-          /* ALL grids have indices as axises - thus no unit conversion */
-          const double unitNone = 1.0;
-          hdf5DataFile.writeAttribute(currentStep,
-                                      ctDouble,
-                                      (meshesPathName + meshRecordLabels(i)).c_str(),
-                                      "gridUnitSI",
-                                      &unitNone);
-
-          /* geometry */
-          const std::string geometry("cartesian");
-          splash::ColTypeString ctGeometry(geometry.length());
-          hdf5DataFile.writeAttribute(currentStep,
-                                      ctGeometry,
-                                      (meshesPathName + meshRecordLabels(i)).c_str(),
-                                      "geometry",
-                                      geometry.c_str());
-
-          /* dataOrder */
-          const std::string dataOrder("C");
-          splash::ColTypeString ctDataOrder(dataOrder.length());
-          hdf5DataFile.writeAttribute(currentStep,
-                                      ctDataOrder,
-                                      (meshesPathName + meshRecordLabels(i)).c_str(),
-                                      "dataOrder",
-                                      dataOrder.c_str());
-
-          std::vector<float_X> gridSpacing(simDim, 0.0);
-          for( uint32_t d = 0; d < simDim; ++d )
-              gridSpacing.at(d) = float_X(1.0);
-          hdf5DataFile.writeAttribute(currentStep,
-                                      splashFloatXType,
-                                      (meshesPathName + meshRecordLabels(i)).c_str(),
-                                      "gridSpacing",
-                                      1u,
-                                      splash::Dimensions(simDim,0,0),
-                                      &(*gridSpacing.begin()));
-
-          /* axisLabels */
-          std::list<std::string> myListOfStr;
-          if( i == idLabels::Amplitude ) /* amplitude record */
-          {
-              myListOfStr.push_back("detector direction index");
-              myListOfStr.push_back("detector frequency index");
-          }
-          else if( i == idLabels::Detector ) /* detector direction record */
-          {
-              myListOfStr.push_back("detector direction index");
-              myListOfStr.push_back("None");
-          }
-          else if( i == idLabels::Frequency ) /* detector frequency record */
-          {
-              myListOfStr.push_back("None");
-              myListOfStr.push_back("detector frequency index");
-          }
-          myListOfStr.push_back("None");
-
-          // convert to splash format
-          helper::GetSplashArrayOfString getSplashArrayOfString;
-          helper::GetSplashArrayOfString::Result myArrOfStr;
-          myArrOfStr = getSplashArrayOfString( myListOfStr );
-          splash::ColTypeString ctSomeListOfStr( myArrOfStr.maxLen );
-
-          hdf5DataFile.writeAttribute(currentStep,
-                                      ctSomeListOfStr,
-                                      (meshesPathName + meshRecordLabels(i)).c_str(),
-                                      "axisLabels",
-                                      1u, /* ndims: 1D array */
-                                      splash::Dimensions(myListOfStr.size(),0,0), /* size of 1D array */
-                                      &(myArrOfStr.buffers.at(0)));
-
-
-          /* unitDimension */
-          std::vector<float_64> unitDimension( traits::NUnitDimension, 0.0 );
-          if( i == idLabels::Amplitude ) /* amplitude record */
-          {
-              /* units Joule seconds -> Length^2 * Time^-1 * Mass^1 */
-              unitDimension[traits::SIBaseUnits::length] = 2.0;
-              unitDimension[traits::SIBaseUnits::time] = -1.0;
-              unitDimension[traits::SIBaseUnits::mass] = 1.0;
-          }
-          else if( i == idLabels::Detector ) /* detector direction record */
-          {
-              /* units none */
-          }
-          else if( i == idLabels::Frequency ) /* detector frequency record */
-          {
-              /* units 1./second -> Time^-1  */
-              unitDimension[traits::SIBaseUnits::time] = -1.0;
-          }
-          hdf5DataFile.writeAttribute(currentStep,
-                                      ctDouble,
-                                      (meshesPathName + meshRecordLabels(i)).c_str(),
-                                      "unitDimension",
-                                      1u,
-                                      splash::Dimensions(traits::NUnitDimension,0,0),
-                                      &(*unitDimension.begin()));
-
-
-      }
-      /* end required openPMD attributes for meshes */
-      /* end openPMD attributes */
-
-      hdf5DataFile.close();
-    }
-
-
-
-  /** Read Amplitude data from HDF5 file
-   *
-   * Arguments:
-   * Amplitude* values - array of complex amplitudes to store data in
-   * std::string name - path and beginning of file name with data stored in
-   * const int timeStep - time step to read
-   */
-  void readHDF5file(Amplitude* values, std::string name, const int timeStep)
-  {
-      splash::SerialDataCollector hdf5DataFile(1);
-      splash::DataCollector::FileCreationAttr fAttr;
-
-      splash::DataCollector::initFileCreationAttr(fAttr);
-
-      fAttr.fileAccType = splash::DataCollector::FAT_READ;
-
-      std::ostringstream filename;
-      /* add to standard ending added by libSplash for SerialDataCollector */
-      filename << name << timeStep << "_0_0_0.h5";
-
-      /* check if restart file exists */
-      if( !boost::filesystem::exists(filename.str()) )
-      {
-          log<picLog::INPUT_OUTPUT > ("Radiation (%1%): restart file not found (%2%) - start with zero values") %
-                                      speciesName % filename.str();
-      }
-      else
-      {
-          hdf5DataFile.open(filename.str().c_str(), fAttr);
-
-          typename PICToSplash<float_64>::type radSplashType;
-
-          splash::Dimensions componentSize(1,
-                                           radiation_frequencies::N_omega,
-                                           parameters::N_observer);
-
-          const int N_tmpBuffer = radiation_frequencies::N_omega * parameters::N_observer;
-          picongpu::float_64* tmpBuffer = new picongpu::float_64[N_tmpBuffer];
-
-          for(uint32_t ampIndex=0; ampIndex < Amplitude::numComponents; ++ampIndex)
-          {
-              hdf5DataFile.read(timeStep,
+                /** Write the time-summed amplitudes (timeSumArray) to HDF5; does nothing unless isMaster is set */
+                void writeAmplitudesToHDF5()
+                {
+                    if(!isMaster)
+                        return;
+
+                    const std::string filePrefix
+                        = std::string("radiationHDF5/") + speciesName + std::string("_radAmplitudes_");
+                    writeHDF5file(timeSumArray, filePrefix);
+                }
+
+
+                /** Run the collection steps in order: device-to-host copy, gather on master, sum over time */
+                void collectDataGPUToMaster()
+                {
+                    // data path: GPU -> CPU -> Master, then sumAmplitudesOverTime(timeSumArray, tmp_result)
+                    copyRadiationDeviceToHost();
+                    collectRadiationOnMaster();
+                    sumAmplitudesOverTime(timeSumArray, tmp_result);
+                }
+
+
+                /** Call every output writer: per-GPU data, last and total radiation as text, amplitudes as HDF5 */
+                void writeAllFiles(const DataSpace<simDim> currentGPUpos)
+                {
+                    // currentGPUpos is only forwarded to saveRadPerGPU; the other writers take no arguments
+                    saveRadPerGPU(currentGPUpos);
+                    writeLastRadToText();
+                    writeTotalRadToText();
+                    writeAmplitudesToHDF5();
+                }
+
+
+                /** Build the HDF5 name of the amplitude record or of one of its components
+                 *
+                 *  Arguments:
+                 *  int index - component index into {x_Re, x_Im, y_Re, y_Im, z_Re, z_Im},
+                 *              or "-1" to request the record name itself
+                 *
+                 *  Return:
+                 *  std::string - "Amplitude/" followed by the component name (if any)
+                 *
+                 * A function is used to avoid initializing static constexpr string arrays.
+                 */
+                static const std::string dataLabels(int index)
+                {
+                    const std::string recordName("Amplitude/");
+                    const std::string componentNames[] = {"x_Re", "x_Im", "y_Re", "y_Im", "z_Re", "z_Im"};
+
+                    /* any index other than -1 selects a component (no range check is done) */
+                    if(index != -1)
+                        return recordName + componentNames[index];
+
+                    /* -1: caller asked for the record name only */
+                    return recordName;
+                }
+
+                /** Build the HDF5 name of the detector direction record or of one of its components
+                 *
+                 *  Arguments:
+                 *  int index - component index into {x, y, z},
+                 *              or "-1" to request the record name itself
+                 *
+                 *  Return:
+                 *  std::string - "DetectorDirection/" followed by the component name (if any)
+                 *
+                 * A function is used to avoid initializing static const string arrays.
+                 */
+                static const std::string dataLabelsDetectorDirection(int index)
+                {
+                    const std::string recordName("DetectorDirection/");
+                    const std::string componentNames[] = {"x", "y", "z"};
+
+                    /* any index other than -1 selects a component (no range check is done) */
+                    if(index != -1)
+                        return recordName + componentNames[index];
+
+                    /* -1: caller asked for the record name only */
+                    return recordName;
+                }
+
+
+                /** Build the HDF5 name of the detector frequency record or of its single component
+                 *
+                 *  Arguments:
+                 *  int index - component index into {omega} (only 0 is a valid component),
+                 *              or "-1" to request the record name itself
+                 *
+                 *  Return:
+                 *  std::string - "DetectorFrequency/" followed by the component name (if any)
+                 *
+                 * A function is used to avoid initializing static const string arrays.
+                 */
+                static const std::string dataLabelsDetectorFrequency(int index)
+                {
+                    const std::string recordName("DetectorFrequency/");
+                    const std::string componentNames[] = {"omega"};
+
+                    /* any index other than -1 selects a component (no range check is done) */
+                    if(index != -1)
+                        return recordName + componentNames[index];
+
+                    /* -1: caller asked for the record name only */
+                    return recordName;
+                }
+
+                /** Map a mesh record id to the HDF5 name of that record
+                 *
+                 *  Arguments:
+                 *  int index - record id, compared against idLabels::Amplitude,
+                 *              idLabels::Detector and idLabels::Frequency
+                 *
+                 *  Return:
+                 *  std::string - record name, or "this-record-does-not-exist" for any other id
+                 *
+                 * A function is used to avoid initializing static const string arrays.
+                 */
+                static const std::string meshRecordLabels(int index)
+                {
+                    if(index == idLabels::Amplitude)
+                        return dataLabels(-1);
+                    if(index == idLabels::Detector)
+                        return dataLabelsDetectorDirection(-1);
+                    if(index == idLabels::Frequency)
+                        return dataLabelsDetectorFrequency(-1);
+                    /* none of the known record ids matched */
+                    return std::string("this-record-does-not-exist");
+                }
+
+
+                /** Write Amplitude data, detector meta data and openPMD attributes to an HDF5 file
+                 *
+                 * Arguments:
+                 * std::vector<Amplitude>& values - complex amplitude values to write
+                 * std::string name - path and beginning of file name (currentStep is appended)
+                 */
+                void writeHDF5file(std::vector<Amplitude>& values, std::string name)
+                {
+                    splash::SerialDataCollector hdf5DataFile(1);
+                    splash::DataCollector::FileCreationAttr fAttr;
+
+                    splash::DataCollector::initFileCreationAttr(fAttr);
+                    fAttr.enableCompression = compressionOn;
+
+                    std::ostringstream filename;
+                    filename << name << currentStep;
+
+                    hdf5DataFile.open(filename.str().c_str(), fAttr);
+
+                    typename PICToSplash<Amplitude::complex_T::type>::type radSplashType;
+
+                    /* extents and stride used to select one amplitude component at a time */
+                    splash::Dimensions bufferSize(
+                        Amplitude::numComponents,
+                        radiation_frequencies::N_omega,
+                        parameters::N_observer);
+
+                    splash::Dimensions componentSize(1, radiation_frequencies::N_omega, parameters::N_observer);
+
+                    splash::Dimensions stride(Amplitude::numComponents, 1, 1);
+
+                    /* get the radiation amplitude unit */
+                    Amplitude UnityAmplitude(1., 0., 0., 0., 0., 0.);
+                    const picongpu::float_64 factor = UnityAmplitude.calc_radiation() * UNIT_ENERGY * UNIT_TIME;
+
+                    typedef PICToSplash<float_X>::type SplashFloatXType;
+                    SplashFloatXType splashFloatXType;
+
+                    for(uint32_t ampIndex = 0; ampIndex < Amplitude::numComponents; ++ampIndex)
+                    {
+                        splash::Dimensions offset(ampIndex, 0, 0);
+                        splash::Selection dataSelection(bufferSize, componentSize, offset, stride);
+
+                        /* save data for each x/y/z * Re/Im amplitude */
+                        hdf5DataFile.write(
+                            currentStep,
+                            radSplashType,
+                            3,
+                            dataSelection,
+                            (meshesPathName + dataLabels(ampIndex)).c_str(),
+                            values.data());
+
+                        /* save SI unit as attribute together with data set */
+                        hdf5DataFile.writeAttribute(
+                            currentStep,
+                            radSplashType,
+                            (meshesPathName + dataLabels(ampIndex)).c_str(),
+                            "unitSI",
+                            &factor);
+
+                        /* position */
+                        std::vector<float_X> positionMesh(simDim, 0.0); /* there is no offset - zero */
+                        hdf5DataFile.writeAttribute(
+                            currentStep,
+                            splashFloatXType,
+                            (meshesPathName + dataLabels(ampIndex)).c_str(),
+                            "position",
+                            1u,
+                            splash::Dimensions(simDim, 0, 0),
+                            &(*positionMesh.begin()));
+                    }
+
+                    /* save SI unit as attribute in the Amplitude group (for convenience) */
+                    hdf5DataFile.writeAttribute(
+                        currentStep,
+                        radSplashType,
+                        (meshesPathName + std::string("Amplitude")).c_str(),
+                        "unitSI",
+                        &factor);
+
+                    /* save detector position / observation direction */
+                    splash::Dimensions bufferSizeDetector(3, 1, parameters::N_observer);
+
+                    splash::Dimensions componentSizeDetector(1, 1, parameters::N_observer);
+
+                    splash::Dimensions strideDetector(3, 1, 1);
+
+                    for(uint32_t detectorDim = 0; detectorDim < 3; ++detectorDim)
+                    {
+                        splash::Dimensions offset(detectorDim, 0, 0);
+                        splash::Selection dataSelection(
+                            bufferSizeDetector,
+                            componentSizeDetector,
+                            offset,
+                            strideDetector);
+
+                        hdf5DataFile.write(
+                            currentStep,
+                            radSplashType,
+                            3,
+                            dataSelection,
+                            (meshesPathName + dataLabelsDetectorDirection(detectorDim)).c_str(),
+                            detectorPositions.data());
+
+                        /* save SI unit as attribute together with data set */
+                        const picongpu::float_64 factorDirection = 1.0;
+                        hdf5DataFile.writeAttribute(
+                            currentStep,
+                            radSplashType,
+                            (meshesPathName + dataLabelsDetectorDirection(detectorDim)).c_str(),
+                            "unitSI",
+                            &factorDirection);
+
+                        /* position */
+                        std::vector<float_X> positionMesh(simDim, 0.0); /* there is no offset - zero */
+                        hdf5DataFile.writeAttribute(
+                            currentStep,
+                            splashFloatXType,
+                            (meshesPathName + dataLabelsDetectorDirection(detectorDim)).c_str(),
+                            "position",
+                            1u,
+                            splash::Dimensions(simDim, 0, 0),
+                            &(*positionMesh.begin()));
+                    }
+
+
+                    /* save detector frequencies */
+                    splash::Dimensions bufferSizeOmega(1, radiation_frequencies::N_omega, 1);
+
+                    splash::Dimensions strideOmega(1, 1, 1);
+
+                    splash::Dimensions offset(0, 0, 0);
+                    splash::Selection dataSelection(bufferSizeOmega, bufferSizeOmega, offset, strideOmega);
+
+                    hdf5DataFile.write(
+                        currentStep,
+                        radSplashType,
+                        3,
+                        dataSelection,
+                        (meshesPathName + dataLabelsDetectorFrequency(0)).c_str(),
+                        detectorFrequencies.data());
+
+                    /* save SI unit as attribute together with data set */
+                    const picongpu::float_64 factorOmega = 1.0 / UNIT_TIME;
+                    hdf5DataFile.writeAttribute(
+                        currentStep,
+                        radSplashType,
+                        (meshesPathName + dataLabelsDetectorFrequency(0)).c_str(),
+                        "unitSI",
+                        &factorOmega);
+
+                    /* position */
+                    std::vector<float_X> positionMesh(simDim, 0.0); /* there is no offset - zero */
+                    hdf5DataFile.writeAttribute(
+                        currentStep,
+                        splashFloatXType,
+                        (meshesPathName + dataLabelsDetectorFrequency(0)).c_str(),
+                        "position",
+                        1u,
+                        splash::Dimensions(simDim, 0, 0),
+                        &(*positionMesh.begin()));
+
+
+                    /* begin openPMD attributes */
+                    /* begin required openPMD global attributes */
+                    std::string openPMDversion("1.0.0");
+                    splash::ColTypeString ctOpenPMDversion(openPMDversion.length());
+                    hdf5DataFile.writeGlobalAttribute(ctOpenPMDversion, "openPMD", openPMDversion.c_str());
+
+                    const uint32_t openPMDextension = 0; // no extension
+                    splash::ColTypeUInt32 ctUInt32;
+                    hdf5DataFile.writeGlobalAttribute(ctUInt32, "openPMDextension", &openPMDextension);
+
+                    std::string basePath("/data/%T/");
+                    splash::ColTypeString ctBasePath(basePath.length());
+                    hdf5DataFile.writeGlobalAttribute(ctBasePath, "basePath", basePath.c_str());
+
+                    splash::ColTypeString ctMeshesPath(meshesPathName.length());
+                    hdf5DataFile.writeGlobalAttribute(ctMeshesPath, "meshesPath", meshesPathName.c_str());
+
+
+                    splash::ColTypeString ctParticlesPath(particlesPathName.length());
+                    hdf5DataFile.writeGlobalAttribute(ctParticlesPath, "particlesPath", particlesPathName.c_str());
+
+                    std::string iterationEncoding("fileBased");
+                    splash::ColTypeString ctIterationEncoding(iterationEncoding.length());
+                    hdf5DataFile.writeGlobalAttribute(
+                        ctIterationEncoding,
+                        "iterationEncoding",
+                        iterationEncoding.c_str());
+
+                    /* the ..._0_0_0... extension comes from the current filename
+                       formatting of the serial data collector in libSplash */
+                    const int indexCutDirectory = name.rfind('/');
+                    std::string iterationFormat(name.substr(indexCutDirectory + 1) + std::string("%T_0_0_0.h5"));
+                    splash::ColTypeString ctIterationFormat(iterationFormat.length());
+                    hdf5DataFile.writeGlobalAttribute(ctIterationFormat, "iterationFormat", iterationFormat.c_str());
+
+                    hdf5DataFile.writeAttribute(currentStep, splashFloatXType, nullptr, "dt", &DELTA_T);
+                    const float_X time = float_X(currentStep) * DELTA_T;
+                    hdf5DataFile.writeAttribute(currentStep, splashFloatXType, nullptr, "time", &time);
+                    splash::ColTypeDouble ctDouble;
+                    hdf5DataFile.writeAttribute(currentStep, ctDouble, nullptr, "timeUnitSI", &UNIT_TIME);
+
+                    /* end required openPMD global attributes */
+
+                    /* begin recommended openPMD global attributes */
+
+                    std::string author = Environment<>::get().SimulationDescription().getAuthor();
+                    if(author.length() > 0)
+                    {
+                        splash::ColTypeString ctAuthor(author.length());
+                        hdf5DataFile.writeGlobalAttribute(ctAuthor, "author", author.c_str());
+                    }
+
+                    std::string software("PIConGPU");
+                    splash::ColTypeString ctSoftware(software.length());
+                    hdf5DataFile.writeGlobalAttribute(ctSoftware, "software", software.c_str());
+
+                    std::stringstream softwareVersion;
+                    softwareVersion << PICONGPU_VERSION_MAJOR << "." << PICONGPU_VERSION_MINOR << "."
+                                    << PICONGPU_VERSION_PATCH;
+                    if(!std::string(PICONGPU_VERSION_LABEL).empty())
+                        softwareVersion << "-" << PICONGPU_VERSION_LABEL;
+                    splash::ColTypeString ctSoftwareVersion(softwareVersion.str().length());
+                    hdf5DataFile.writeGlobalAttribute(
+                        ctSoftwareVersion,
+                        "softwareVersion",
+                        softwareVersion.str().c_str());
+
+                    std::string date = helper::getDateString("%F %T %z");
+                    splash::ColTypeString ctDate(date.length());
+                    hdf5DataFile.writeGlobalAttribute(ctDate, "date", date.c_str());
+
+                    /* end recommended openPMD global attributes */
+
+                    /* begin required openPMD attributes for meshes records */
+
+                    for(int i = 0; i < numberMeshRecords; ++i)
+                    {
+                        /* timeOffset */
+                        const float_X timeOffset = 0.0;
+                        hdf5DataFile.writeAttribute(
+                            currentStep,
+                            splashFloatXType,
+                            (meshesPathName + meshRecordLabels(i)).c_str(),
+                            "timeOffset",
+                            &timeOffset);
+
+                        /* gridGlobalOffset */
+                        std::vector<float_64> gridGlobalOffset(simDim, 0.0); /* there is no offset - zero */
+                        hdf5DataFile.writeAttribute(
+                            currentStep,
+                            ctDouble,
+                            (meshesPathName + meshRecordLabels(i)).c_str(),
+                            "gridGlobalOffset",
+                            1u,
+                            splash::Dimensions(simDim, 0, 0),
+                            &(*gridGlobalOffset.begin()));
+
+                        /* gridUnit */
+                        /* ALL grids have indices as axes - thus no unit conversion */
+                        const double unitNone = 1.0;
+                        hdf5DataFile.writeAttribute(
+                            currentStep,
+                            ctDouble,
+                            (meshesPathName + meshRecordLabels(i)).c_str(),
+                            "gridUnitSI",
+                            &unitNone);
+
+                        /* geometry */
+                        const std::string geometry("cartesian");
+                        splash::ColTypeString ctGeometry(geometry.length());
+                        hdf5DataFile.writeAttribute(
+                            currentStep,
+                            ctGeometry,
+                            (meshesPathName + meshRecordLabels(i)).c_str(),
+                            "geometry",
+                            geometry.c_str());
+
+                        /* dataOrder */
+                        const std::string dataOrder("C");
+                        splash::ColTypeString ctDataOrder(dataOrder.length());
+                        hdf5DataFile.writeAttribute(
+                            currentStep,
+                            ctDataOrder,
+                            (meshesPathName + meshRecordLabels(i)).c_str(),
+                            "dataOrder",
+                            dataOrder.c_str());
+
+                        std::vector<float_X> gridSpacing(simDim, 0.0);
+                        for(uint32_t d = 0; d < simDim; ++d)
+                            gridSpacing.at(d) = float_X(1.0);
+                        hdf5DataFile.writeAttribute(
+                            currentStep,
+                            splashFloatXType,
+                            (meshesPathName + meshRecordLabels(i)).c_str(),
+                            "gridSpacing",
+                            1u,
+                            splash::Dimensions(simDim, 0, 0),
+                            &(*gridSpacing.begin()));
+
+                        /* axisLabels */
+                        std::list<std::string> myListOfStr;
+                        if(i == idLabels::Amplitude) /* amplitude record */
+                        {
+                            myListOfStr.push_back("detector direction index");
+                            myListOfStr.push_back("detector frequency index");
+                        }
+                        else if(i == idLabels::Detector) /* detector direction record */
+                        {
+                            myListOfStr.push_back("detector direction index");
+                            myListOfStr.push_back("None");
+                        }
+                        else if(i == idLabels::Frequency) /* detector frequency record */
+                        {
+                            myListOfStr.push_back("None");
+                            myListOfStr.push_back("detector frequency index");
+                        }
+                        myListOfStr.push_back("None");
+
+                        // convert to splash format
+                        helper::GetSplashArrayOfString getSplashArrayOfString;
+                        helper::GetSplashArrayOfString::Result myArrOfStr;
+                        myArrOfStr = getSplashArrayOfString(myListOfStr);
+                        splash::ColTypeString ctSomeListOfStr(myArrOfStr.maxLen);
+
+                        hdf5DataFile.writeAttribute(
+                            currentStep,
+                            ctSomeListOfStr,
+                            (meshesPathName + meshRecordLabels(i)).c_str(),
+                            "axisLabels",
+                            1u, /* ndims: 1D array */
+                            splash::Dimensions(myListOfStr.size(), 0, 0), /* size of 1D array */
+                            &(myArrOfStr.buffers.at(0)));
+
+
+                        /* unitDimension */
+                        std::vector<float_64> unitDimension(traits::NUnitDimension, 0.0);
+                        if(i == idLabels::Amplitude) /* amplitude record */
+                        {
+                            /* units Joule seconds -> Length^2 * Time^-1 * Mass^1 */
+                            unitDimension[traits::SIBaseUnits::length] = 2.0;
+                            unitDimension[traits::SIBaseUnits::time] = -1.0;
+                            unitDimension[traits::SIBaseUnits::mass] = 1.0;
+                        }
+                        else if(i == idLabels::Detector) /* detector direction record */
+                        {
+                            /* units none */
+                        }
+                        else if(i == idLabels::Frequency) /* detector frequency record */
+                        {
+                            /* units 1./second -> Time^-1  */
+                            unitDimension[traits::SIBaseUnits::time] = -1.0;
+                        }
+                        hdf5DataFile.writeAttribute(
+                            currentStep,
+                            ctDouble,
+                            (meshesPathName + meshRecordLabels(i)).c_str(),
+                            "unitDimension",
+                            1u,
+                            splash::Dimensions(traits::NUnitDimension, 0, 0),
+                            &(*unitDimension.begin()));
+                    }
+                    /* end required openPMD attributes for meshes */
+                    /* end openPMD attributes */
+
+                    hdf5DataFile.close();
+                }
+
+
+                /** Read Amplitude data from HDF5 file
+                 *
+                 * Arguments:
+                 * Amplitude* values - array of complex amplitudes to store data in
+                 * std::string name - path and beginning of file name with data stored in
+                 * const int timeStep - time step to read
+                 */
+                void readHDF5file(std::vector<Amplitude>& values, std::string name, const int timeStep)
+                {
+                    splash::SerialDataCollector hdf5DataFile(1);
+                    splash::DataCollector::FileCreationAttr fAttr;
+
+                    splash::DataCollector::initFileCreationAttr(fAttr);
+
+                    fAttr.fileAccType = splash::DataCollector::FAT_READ;
+
+                    std::ostringstream filename;
+                    /* add to standard ending added by libSplash for SerialDataCollector */
+                    filename << name << timeStep << "_0_0_0.h5";
+
+                    /* check if restart file exists */
+                    if(!boost::filesystem::exists(filename.str()))
+                    {
+                        log<picLog::INPUT_OUTPUT>(
+                            "Radiation (%1%): restart file not found (%2%) - start with zero values")
+                            % speciesName % filename.str();
+                    }
+                    else
+                    {
+                        hdf5DataFile.open(filename.str().c_str(), fAttr);
+
+                        typename PICToSplash<float_64>::type radSplashType;
+
+                        splash::Dimensions componentSize(1, radiation_frequencies::N_omega, parameters::N_observer);
+
+                        const int N_tmpBuffer = radiation_frequencies::N_omega * parameters::N_observer;
+                        picongpu::float_64* tmpBuffer = new picongpu::float_64[N_tmpBuffer];
+
+                        for(uint32_t ampIndex = 0; ampIndex < Amplitude::numComponents; ++ampIndex)
+                        {
+                            hdf5DataFile.read(
+                                timeStep,
                                 (meshesPathName + dataLabels(ampIndex)).c_str(),
                                 componentSize,
                                 tmpBuffer);
 
-              for(int copyIndex = 0; copyIndex < N_tmpBuffer; ++copyIndex)
-              {
-                  /* convert data directly because Amplitude is just 6 float_64 */
-                  ((picongpu::float_64*)values)[ampIndex + Amplitude::numComponents*copyIndex] = tmpBuffer[copyIndex];
-              }
-
-          }
-
-          delete[] tmpBuffer;
-          hdf5DataFile.close();
-
-          log<picLog::INPUT_OUTPUT > ("Radiation (%1%): read radiation data from HDF5") % speciesName;
-      }
-  }
-
-
-  /**
-   * From the collected data from all hosts the radiated intensity is
-   * calculated by calculating the absolute value squared and multiplying
-   * this with with the appropriate physics constants.
-   * @param values
-   * @param name
-   */
-  void writeFile(Amplitude* values, std::string name)
-  {
-      std::ofstream outFile;
-      outFile.open(name.c_str(), std::ofstream::out | std::ostream::trunc);
-      if (!outFile)
-      {
-          std::cerr << "Can't open file [" << name << "] for output, disable plugin output. " << std::endl;
-          isMaster = false; // no Master anymore -> no process is able to write
-      }
-      else
-      {
-          for (unsigned int index_direction = 0; index_direction < parameters::N_observer; ++index_direction) // over all directions
-          {
-              for (unsigned index_omega = 0; index_omega < radiation_frequencies::N_omega; ++index_omega) // over all frequencies
-              {
-                  // Take Amplitude for one direction and frequency,
-                  // calculate the square of the absolute value
-                  // and write to file.
-                  outFile <<
-                    values[index_omega + index_direction * radiation_frequencies::N_omega].calc_radiation() * UNIT_ENERGY * UNIT_TIME << "\t";
-
-              }
-              outFile << std::endl;
-          }
-          outFile.flush();
-          outFile << std::endl; //now all data are written to file
-
-          if (outFile.fail())
-              std::cerr << "Error on flushing file [" << name << "]. " << std::endl;
-
-          outFile.close();
-      }
-  }
-
-  /**
-   * This functions calls the radiation kernel. It specifies how the
-   * calculation is parallelized.
-   *      gridDim_rad is the number of Thread-Blocks in a grid
-   *      blockDim_rad is the number of threads per block
-   *
-   * -----------------------------------------------------------
-   * | Grid                                                    |
-   * |   --------------   --------------                       |
-   * |   |   Block 0  |   |   Block 1  |                       |
-   * |   |o      o    |   |o      o    |                       |
-   * |   |o      o    |   |o      o    |                       |
-   * |   |th1    th2  |   |th1    th2  |                       |
-   * |   --------------   --------------                       |
-   * -----------------------------------------------------------
-   *
-   * !!! The TEMPLATE parameter is not used anymore.
-   * !!! But the calculations it is supposed to do is hard coded in the
-   *     kernel.
-   * !!! THIS NEEDS TO BE CHANGED !!!
-   *
-   * @param currentStep
-   */
-  template< uint32_t AREA> /*This Template Parameter is not used anymore*/
-  void calculateRadiationParticles(uint32_t currentStep)
-  {
-      this->currentStep = currentStep;
-
-      DataConnector &dc = Environment<>::get().DataConnector();
-      auto particles = dc.get< ParticlesType >( ParticlesType::FrameType::getName(), true );
-
-      /* execute the particle filter */
-      radiation::executeParticleFilter( particles, currentStep );
-
-      /* the parallelization is ONLY over directions:
-       * (a combined parallelization over direction AND frequencies
-       * turned out to be slower on GPUs of the Fermi generation (sm_2x) (couple
-       * percent) and definitely slower on Kepler GPUs (sm_3x, tested on K20))
-       */
-      const int N_observer = parameters::N_observer;
-      const auto gridDim_rad = N_observer;
-
-      /* number of threads per block = number of cells in a super cell
-       *          = number of particles in a Frame
-       *          (THIS IS PIConGPU SPECIFIC)
-       * A Frame is the entity that stores particles.
-       * A super cell can have many Frames.
-       * Particles in a Frame can be accessed in parallel.
-       */
-
-      // Some funny things that make it possible for the kernel to calculate
-      // the absolute position of the particles
-      DataSpace<simDim> localSize(cellDescription->getGridLayout().getDataSpaceWithoutGuarding());
-      const uint32_t numSlides = MovingWindow::getInstance().getSlideCounter(currentStep);
-      const SubGrid<simDim>& subGrid = Environment<simDim>::get().SubGrid();
-      DataSpace<simDim> globalOffset(subGrid.getLocalDomain().offset);
-      globalOffset.y() += (localSize.y() * numSlides);
-
-      constexpr uint32_t numWorkers = pmacc::traits::GetNumWorkers<
-          pmacc::math::CT::volume< SuperCellSize >::type::value
-      >::value;
-
-
-      // PIC-like kernel call of the radiation kernel
-      PMACC_KERNEL( KernelRadiationParticles<
-          numWorkers
-      >{} )(
-          gridDim_rad,
-          numWorkers
-      )(
-         /*Pointer to particles memory on the device*/
-         particles->getDeviceParticlesBox(),
-
-         /*Pointer to memory of radiated amplitude on the device*/
-         radiation->getDeviceBuffer().getDataBox(),
-         globalOffset,
-         currentStep, *cellDescription,
-         freqFkt,
-         subGrid.getGlobalDomain().size
-      );
-
-      dc.releaseData( ParticlesType::FrameType::getName() );
-
-      if (dumpPeriod != 0 && currentStep % dumpPeriod == 0)
-      {
-          collectDataGPUToMaster();
-          writeAllFiles(globalOffset);
-
-          // update time steps
-          lastStep = currentStep;
-
-          // reset amplitudes on GPU back to zero
-          radiation->getDeviceBuffer().reset(false);
-      }
-
-  }
-
-};
-
-} // namespace radiation
-} // namespace plugins
-
-namespace particles
-{
-namespace traits
-{
-    template<
-        typename T_Species,
-        typename T_UnspecifiedSpecies
-    >
-    struct SpeciesEligibleForSolver<
-        T_Species,
-        plugins::radiation::Radiation< T_UnspecifiedSpecies >
-    >
+                            for(int copyIndex = 0; copyIndex < N_tmpBuffer; ++copyIndex)
+                            {
+                                /* convert data directly because Amplitude is just 6 float_32 */
+                                ((picongpu::float_64*) values.data())[ampIndex + Amplitude::numComponents * copyIndex]
+                                    = tmpBuffer[copyIndex];
+                            }
+                        }
+
+                        delete[] tmpBuffer;
+                        hdf5DataFile.close();
+
+                        log<picLog::INPUT_OUTPUT>("Radiation (%1%): read radiation data from HDF5") % speciesName;
+                    }
+                }
+
+
+                /**
+                 * Compute the radiated intensity from the amplitudes collected from all hosts
+                 * (absolute value squared, multiplied with the appropriate physics constants)
+                 * and write it to a text file, one tab-separated line per observation direction.
+                 * @param values amplitudes, indexed as [index_omega + index_direction * N_omega]
+                 * @param name path of the output file (truncated if it already exists)
+                 */
+                void writeFile(Amplitude* values, std::string name)
+                {
+                    std::ofstream outFile;
+                    outFile.open(name.c_str(), std::ofstream::out | std::ostream::trunc);
+                    if(!outFile)
+                    {
+                        std::cerr << "Can't open file [" << name << "] for output, disable plugin output. "
+                                  << std::endl;
+                        isMaster = false; // no Master anymore -> no process is able to write
+                    }
+                    else
+                    {
+                        for(unsigned int index_direction = 0; index_direction < parameters::N_observer;
+                            ++index_direction) // over all directions
+                        {
+                            for(unsigned index_omega = 0; index_omega < radiation_frequencies::N_omega;
+                                ++index_omega) // over all frequencies
+                            {
+                                // Take Amplitude for one direction and frequency,
+                                // calculate the square of the absolute value
+                                // and write to file.
+                                outFile << values[index_omega + index_direction * radiation_frequencies::N_omega]
+                                               .calc_radiation()
+                                        * UNIT_ENERGY * UNIT_TIME
+                                        << "\t";
+                            }
+                            outFile << std::endl;
+                        }
+                        outFile.flush();
+                        outFile << std::endl; // append a final empty line after the flushed data
+
+                        if(outFile.fail()) // failbit is sticky: also catches earlier write errors
+                            std::cerr << "Error on flushing file [" << name << "]. " << std::endl;
+
+                        outFile.close();
+                    }
+                }
+
+                /**
+                 * This function calls the radiation kernel. It specifies how the
+                 * calculation is parallelized.
+                 *      grid size:  gridDim_rad x numJobs thread blocks (gridDim_rad = N_observer)
+                 *      block size: numWorkers x 1 threads per block
+                 *
+                 * -----------------------------------------------------------
+                 * | Grid                                                    |
+                 * |   --------------   --------------                       |
+                 * |   |   Block 0  |   |   Block 1  |                       |
+                 * |   |o      o    |   |o      o    |                       |
+                 * |   |o      o    |   |o      o    |                       |
+                 * |   |th1    th2  |   |th1    th2  |                       |
+                 * |   --------------   --------------                       |
+                 * -----------------------------------------------------------
+                 *
+                 * !!! The TEMPLATE parameter is not used anymore.
+                 * !!! But the calculations it is supposed to do are hard coded in the
+                 *     kernel.
+                 * !!! THIS NEEDS TO BE CHANGED !!!
+                 *
+                 * @param currentStep current time step (output is written every dumpPeriod steps, if non-zero)
+                 */
+                template<uint32_t AREA> /*This Template Parameter is not used anymore*/
+                void calculateRadiationParticles(uint32_t currentStep)
+                {
+                    this->currentStep = currentStep;
+
+                    DataConnector& dc = Environment<>::get().DataConnector();
+                    auto particles = dc.get<ParticlesType>(ParticlesType::FrameType::getName(), true);
+
+                    /* execute the particle filter */
+                    radiation::executeParticleFilter(particles, currentStep);
+
+                    /* the first grid dimension is parallelized ONLY over directions:
+                     * (a combined parallelization over direction AND frequencies
+                     * turned out to be slower on GPUs of the Fermi generation (sm_2x) (couple
+                     * percent) and definitely slower on Kepler GPUs (sm_3x, tested on K20))
+                     */
+                    const int N_observer = parameters::N_observer;
+                    const auto gridDim_rad = N_observer;
+
+                    /* number of threads per block = number of cells in a super cell
+                     *          = number of particles in a Frame
+                     *          (THIS IS PIConGPU SPECIFIC)
+                     * A Frame is the entity that stores particles.
+                     * A super cell can have many Frames.
+                     * Particles in a Frame can be accessed in parallel.
+                     */
+
+                    // Global offset of the local domain (shifted in y by the moving-window slides) that
+                    // allows the kernel to calculate the absolute position of the particles
+                    DataSpace<simDim> localSize(cellDescription->getGridLayout().getDataSpaceWithoutGuarding());
+                    const uint32_t numSlides = MovingWindow::getInstance().getSlideCounter(currentStep);
+                    const SubGrid<simDim>& subGrid = Environment<simDim>::get().SubGrid();
+                    DataSpace<simDim> globalOffset(subGrid.getLocalDomain().offset);
+                    globalOffset.y() += (localSize.y() * numSlides);
+
+                    constexpr uint32_t numWorkers
+                        = pmacc::traits::GetNumWorkers<pmacc::math::CT::volume<SuperCellSize>::type::value>::value;
+
+
+                    // PIC-like kernel call of the radiation kernel
+                    PMACC_KERNEL(KernelRadiationParticles<numWorkers>{})
+                    (DataSpace<2>(gridDim_rad, numJobs), DataSpace<2>(numWorkers, 1))(
+                        /*Pointer to particles memory on the device*/
+                        particles->getDeviceParticlesBox(),
+
+                        /*Pointer to memory of radiated amplitude on the device*/
+                        radiation->getDeviceBuffer().getDataBox(),
+                        globalOffset,
+                        currentStep,
+                        *cellDescription,
+                        freqFkt,
+                        subGrid.getGlobalDomain().size);
+
+                    dc.releaseData(ParticlesType::FrameType::getName());
+
+                    if(dumpPeriod != 0 && currentStep % dumpPeriod == 0)
+                    {
+                        collectDataGPUToMaster();
+                        writeAllFiles(globalOffset);
+
+                        // update time steps
+                        lastStep = currentStep;
+
+                        // reset amplitudes on GPU back to zero
+                        radiation->getDeviceBuffer().reset(false);
+                    }
+                }
+            };
+
+        } // namespace radiation
+    } // namespace plugins
+
+    namespace particles
     {
-        using FrameType = typename T_Species::FrameType;
-
-        // this plugin needs at least the position, a weighting, momentum and momentumPrev1 to run
-        using RequiredIdentifiers = MakeSeq_t<
-            position<>,
-            weighting,
-            momentum,
-            momentumPrev1
-        >;
-
-        using SpeciesHasIdentifiers = typename pmacc::traits::HasIdentifiers<
-            FrameType,
-            RequiredIdentifiers
-        >::type;
-
-        using SpeciesHasMass = typename pmacc::traits::HasFlag<
-            FrameType,
-            massRatio<>
-        >::type;
-
-        using SpeciesHasCharge = typename pmacc::traits::HasFlag<
-            FrameType,
-            chargeRatio<>
-        >::type;
-
-        using type = typename bmpl::and_<
-            SpeciesHasIdentifiers,
-            SpeciesHasMass,
-            SpeciesHasCharge
-        >;
-    };
-
-} // namespace traits
-} // namespace particles
-} // namespace picongpu
+        namespace traits
+        {
+            template<typename T_Species, typename T_UnspecifiedSpecies>
+            struct SpeciesEligibleForSolver<T_Species, plugins::radiation::Radiation<T_UnspecifiedSpecies>>
+            {
+                using FrameType = typename T_Species::FrameType;
+
+                // this plugin needs at least the position, a weighting, momentum and momentumPrev1 to run
+                using RequiredIdentifiers = MakeSeq_t<position<>, weighting, momentum, momentumPrev1>;
+
+                using SpeciesHasIdentifiers =
+                    typename pmacc::traits::HasIdentifiers<FrameType, RequiredIdentifiers>::type;
 
+                using SpeciesHasMass = typename pmacc::traits::HasFlag<FrameType, massRatio<>>::type;
 
+                using SpeciesHasCharge = typename pmacc::traits::HasFlag<FrameType, chargeRatio<>>::type;
 
+                using type = typename bmpl::and_<SpeciesHasIdentifiers, SpeciesHasMass, SpeciesHasCharge>;
+            };
+
+        } // namespace traits
+    } // namespace particles
+} // namespace picongpu
diff --git a/include/picongpu/plugins/radiation/Radiation.kernel b/include/picongpu/plugins/radiation/Radiation.kernel
index c6fd10db60..b0f4173901 100644
--- a/include/picongpu/plugins/radiation/Radiation.kernel
+++ b/include/picongpu/plugins/radiation/Radiation.kernel
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Heiko Burau, Rene Widera, Richard Pausch,
+/* Copyright 2013-2021 Axel Huebl, Heiko Burau, Rene Widera, Richard Pausch,
  *                     Klaus Steiniger, Felix Schmitt, Benjamin Worpitz
  *
  * This file is part of PIConGPU.
@@ -21,10 +21,6 @@
 
 #pragma once
 
-#include <string>
-#include <iostream>
-#include <fstream>
-#include <cstdlib>
 
 #include "picongpu/simulation_defines.hpp"
 
@@ -55,459 +51,393 @@
 #include "picongpu/plugins/radiation/radFormFactor.hpp"
 #include "sys/stat.h"
 
+#include <string>
+#include <iostream>
+#include <fstream>
+#include <cstdlib>
 
 namespace picongpu
 {
-namespace plugins
-{
-namespace radiation
-{
-
-    /** calculate the radiation of a species
-     *
-     * If \p T_dependenciesFulfilled is false a dummy kernel without functionality is created
-     *
-     * @tparam T_numWorkers number of workers
-     */
-    template<
-        uint32_t T_numWorkers
-    >
-    struct KernelRadiationParticles
+    namespace plugins
     {
-        /**
-         * The radiation kernel calculates for all particles on the device the
-         * emitted radiation for every direction and every frequency.
-         * The parallelization is as follows:
-         *  - There are as many Blocks of threads as there are directions for which
-         *    radiation needs to be calculated. (A block of threads shares
-         *    shared memory)
-         *  - The number of threads per block is equal to the number of cells per
-         *    super cells which is also equal to the number of particles per frame
-         *
-         * The procedure starts with calculating unique ids for the threads and
-         * initializing the shared memory.
-         * Then a loop over all super cells starts.
-         * Every thread loads a particle from that super cell and calculates its
-         * retarded time and its real amplitude (both is dependent of the direction).
-         * For every Particle
-         * exists therefor a unique space within the shared memory.
-         * After that, a thread calculates for a specific frequency the emitted
-         * radiation of all particles.
-         * @param pb
-         * @param radiation
-         * @param globalOffset
-         * @param currentStep
-         * @param mapper
-         * @param freqFkt
-         * @param simBoxSize
-         */
-        template<
-            typename ParBox,
-            typename DBox,
-            typename Mapping,
-            typename T_Acc
-        >
-        DINLINE
-        /*__launch_bounds__(256, 4)*/
-        void operator()(
-            T_Acc const & acc,
-            ParBox pb,
-            DBox radiation,
-            DataSpace<simDim> globalOffset,
-            uint32_t currentStep,
-            Mapping mapper,
-            radiation_frequencies::FreqFunctor freqFkt,
-            DataSpace<simDim> simBoxSize
-        ) const
+        namespace radiation
         {
-            using namespace mappings::threads;
-            namespace po = boost::program_options;
-
-            constexpr uint32_t frameSize = pmacc::math::CT::volume< SuperCellSize >::type::value;
-            constexpr uint32_t numWorker = T_numWorkers;
+            /** calculate the radiation of a species
+             *
+             * If \p T_dependenciesFulfilled is false a dummy kernel without functionality is created
+             *
+             * @tparam T_numWorkers number of workers
+             */
+            template<uint32_t T_numWorkers>
+            struct KernelRadiationParticles
+            {
+                /**
+                 * The radiation kernel calculates for all particles on the device the
+                 * emitted radiation for every direction and every frequency.
+                 * The parallelization is as follows:
+                 *  - There are as many Blocks of threads as there are directions for which
+                 *    radiation needs to be calculated. (A block of threads shares
+                 *    shared memory)
+                 *  - The number of threads per block is equal to the number of cells per
+                 *    super cells which is also equal to the number of particles per frame
+                 *
+                 * The procedure starts with calculating unique ids for the threads and
+                 * initializing the shared memory.
+                 * Then a loop over all super cells starts.
+                 * Every thread loads a particle from that super cell and calculates its
+                 * retarded time and its real amplitude (both depend on the direction).
+                 * For every particle
+                 * there is therefore a unique space within the shared memory.
+                 * After that, a thread calculates for a specific frequency the emitted
+                 * radiation of all particles.
+                 * @param pb
+                 * @param radiation
+                 * @param globalOffset
+                 * @param currentStep
+                 * @param mapper
+                 * @param freqFkt
+                 * @param simBoxSize
+                 */
+                template<typename ParBox, typename DBox, typename Mapping, typename T_Acc>
+                DINLINE
+                    /*__launch_bounds__(256, 4)*/
+                    void
+                    operator()(
+                        T_Acc const& acc,
+                        ParBox pb,
+                        DBox radiation,
+                        DataSpace<simDim> globalOffset,
+                        uint32_t currentStep,
+                        Mapping mapper,
+                        radiation_frequencies::FreqFunctor freqFkt,
+                        DataSpace<simDim> simBoxSize) const
+                {
+                    using namespace mappings::threads;
+                    namespace po = boost::program_options;
+                    using Amplitude = picongpu::plugins::radiation::Amplitude<>;
+                    constexpr uint32_t frameSize = pmacc::math::CT::volume<SuperCellSize>::type::value;
+                    constexpr uint32_t numWorker = T_numWorkers;
+
+                    using FrameType = typename ParBox::FrameType;
+                    using FramePtr = typename ParBox::FramePtr;
+
+                    using namespace parameters; // parameters of radiation
+
+                    uint32_t const workerIdx = cupla::threadIdx(acc).x;
+
+                    /// calculate radiated Amplitude
+                    /* parallelized in 1 dimensions:
+                     * looking direction (theta)
+                     * (not anymore data handling)
+                     * create shared memory for particle data to reduce global memory calls
+                     * every thread in a block loads one particle and every thread runs
+                     * through all particles and calculates the radiation for one direction
+                     * for all frequencies
+                     */
+                    constexpr int blockSize = pmacc::math::CT::volume<SuperCellSize>::type::value;
 
-            using FrameType = typename ParBox::FrameType ;
-            using FramePtr = typename ParBox::FramePtr;
+                    // vectorial part of the integrand in the Jackson formula
+                    PMACC_SMEM(acc, real_amplitude_s, memory::Array<vector_64, blockSize>);
 
-            using namespace parameters; // parameters of radiation
+                    // retarded time
+                    PMACC_SMEM(acc, t_ret_s, memory::Array<picongpu::float_64, blockSize>);
 
-            uint32_t const workerIdx = threadIdx.x;
+                    // storage for macro particle weighting needed if
+                    // the coherent and incoherent radiation of a single
+                    // macro-particle needs to be considered
+                    PMACC_SMEM(acc, radWeighting_s, memory::Array<float_X, blockSize>);
 
-            /// calculate radiated Amplitude
-            /* parallelized in 1 dimensions:
-             * looking direction (theta)
-             * (not anymore data handling)
-             * create shared memory for particle data to reduce global memory calls
-             * every thread in a block loads one particle and every thread runs
-             * through all particles and calculates the radiation for one direction
-             * for all frequencies
-             */
-            constexpr int blockSize = pmacc::math::CT::volume<SuperCellSize>::type::value;
+                    // particle counter used if not all particles are considered for
+                    // radiation calculation
+                    PMACC_SMEM(acc, counter_s, int);
 
-            // vectorial part of the integrand in the Jackson formula
-            PMACC_SMEM( acc, real_amplitude_s, memory::Array< vector_64, blockSize > );
+                    PMACC_SMEM(acc, lowpass_s, memory::Array<NyquistLowPass, blockSize>);
 
-            // retarded time
-            PMACC_SMEM( acc, t_ret_s, memory::Array< picongpu::float_64, blockSize > );
 
-            // storage for macro particle weighting needed if
-            // the coherent and incoherent radiation of a single
-            // macro-particle needs to be considered
-            PMACC_SMEM( acc, radWeighting_s, memory::Array< float_X, blockSize > );
+                    int const theta_idx = cupla::blockIdx(acc).x; // cupla::blockIdx(acc).x is used to determine theta
 
-            // particle counter used if not all particles are considered for
-            // radiation calculation
-            PMACC_SMEM( acc, counter_s, int );
+                    // simulation time (needed for retarded time)
+                    picongpu::float_64 const t(picongpu::float_64(currentStep) * picongpu::float_64(DELTA_T));
 
-            PMACC_SMEM( acc, lowpass_s, memory::Array< NyquistLowPass, blockSize > );
+                    // looking direction (needed for observer) used in the thread
+                    vector_64 const look = radiation_observer::observation_direction(theta_idx);
 
+                    // get extent of guarding super cells (needed to ignore them)
+                    DataSpace<simDim> const guardingSuperCells = mapper.getGuardingSuperCells();
 
-            int const theta_idx = blockIdx.x; //blockIdx.x is used to determine theta
+                    /* number of super cells on GPU per dimension (still including guard cells)
+                     * remove both guards from count [later one sided guard needs to be added again]
+                     */
+                    DataSpace<simDim> const superCellsCount(mapper.getGridSuperCells() - 2 * guardingSuperCells);
 
-            // simulation time (needed for retarded time)
-            picongpu::float_64 const t(
-                picongpu::float_64( currentStep ) * picongpu::float_64( DELTA_T)
-            );
+                    // get absolute number of relevant super cells
+                    int const numSuperCells = superCellsCount.productOfComponents();
 
-            // looking direction (needed for observer) used in the thread
-            vector_64 const look = radiation_observer::observation_direction( theta_idx );
+                    int const numJobs = cupla::gridDim(acc).y;
+                    int const jobIdx = cupla::blockIdx(acc).y;
 
-            // get extent of guarding super cells (needed to ignore them)
-            DataSpace< simDim > const guardingSuperCells = mapper.getGuardingSuperCells();
+                    /* go over all super cells on GPU with a stride depending on number of temporary results
+                     * but ignore all guarding supercells
+                     */
+                    for(int super_cell_index = jobIdx; super_cell_index <= numSuperCells; super_cell_index += numJobs)
+                    {
+                        // select SuperCell and add one sided guard again
+                        DataSpace<simDim> const superCell
+                            = DataSpaceOperations<simDim>::map(superCellsCount, super_cell_index) + guardingSuperCells;
 
-            /* number of super cells on GPU per dimension (still including guard cells)
-             * remove both guards from count [later one sided guard needs to be added again]
-             */
-            DataSpace< simDim > const superCellsCount( mapper.getGridSuperCells( ) - 2 * guardingSuperCells );
+                        // -guardingSuperCells remove guarding block
+                        DataSpace<simDim> const superCellOffset(
+                            globalOffset + ((superCell - guardingSuperCells) * SuperCellSize::toRT()));
 
-            // get absolute number of relevant super cells
-            int const numSuperCells = superCellsCount.productOfComponents();
+                        // pointer to  frame storing particles
+                        FramePtr frame = pb.getLastFrame(superCell);
 
+                        // number  of particles in current frame
+                        lcellId_t particlesInFrame = pb.getSuperCell(superCell).getSizeLastFrame();
 
-            /* go over all super cells on GPU
-             * but ignore all guarding supercells
-             */
-            for( int super_cell_index = 0; super_cell_index <= numSuperCells; ++super_cell_index )
-            {
-                // select SuperCell and add one sided guard again
-                DataSpace< simDim > const superCell =
-                    DataSpaceOperations<simDim>::map(
-                        superCellsCount,
-                        super_cell_index
-                    ) +
-                    guardingSuperCells;
-
-                // -guardingSuperCells remove guarding block
-                DataSpace< simDim > const superCellOffset(
-                    globalOffset +
-                    (
-                        ( superCell - guardingSuperCells ) *
-                        SuperCellSize::toRT()
-                    )
-                );
-
-                // pointer to  frame storing particles
-                FramePtr frame = pb.getLastFrame( superCell );
-
-                // number  of particles in current frame
-                lcellId_t particlesInFrame = pb.getSuperCell( superCell ).getSizeLastFrame();
-
-                /* go to next supercell
-                 *
-                 * if "isValid" is false then there is no frame
-                 * inside the superCell (anymore)
-                 */
-                while( frame.isValid() )
-                {
-                    /* since a race condition can occur if "continue loop" is called,
-                     *  all threads must wait for the selection of a new frame
-                     *  until all threads have evaluated "isValid"
-                     */
-                    __syncthreads();
-
-                    ForEachIdx<
-                        IdxConfig<
-                            1,
-                            numWorker
-                        >
-                    > onlyMaster{ workerIdx };
-
-                    /* The Master process (thread 0) in every thread block is in
-                     * charge of loading a frame from
-                     * the current super cell and evaluate the total number of
-                     * particles in this frame.
-                     */
-                    onlyMaster(
-                        [&](
-                            uint32_t const,
-                            uint32_t const
-                        )
+                        /* go to next supercell
+                         *
+                         * if "isValid" is false then there is no frame
+                         * inside the superCell (anymore)
+                         */
+                        while(frame.isValid())
                         {
-                            counter_s = 0;
-                        }
-                    );
+                            /* since a race condition can occur if "continue loop" is called,
+                             *  all threads must wait for the selection of a new frame
+                             *  until all threads have evaluated "isValid"
+                             */
+                            cupla::__syncthreads(acc);
 
-                    __syncthreads();
+                            ForEachIdx<IdxConfig<1, numWorker>> onlyMaster{workerIdx};
 
-                    using ParticleDomCfg = IdxConfig<
-                        frameSize,
-                        numWorker
-                    >;
+                            /* The Master process (thread 0) in every thread block is in
+                             * charge of loading a frame from
+                             * the current super cell and evaluate the total number of
+                             * particles in this frame.
+                             */
+                            onlyMaster([&](uint32_t const, uint32_t const) { counter_s = 0; });
 
-                    // loop over all particles in the frame
-                    ForEachIdx< ParticleDomCfg > forEachParticle{ workerIdx };
+                            cupla::__syncthreads(acc);
 
-                    forEachParticle(
-                        [&](
-                            uint32_t const linearIdx,
-                            uint32_t const
-                        )
-                        {
-                            // only threads with particles are running
-                            if( linearIdx < particlesInFrame )
-                            {
+                            using ParticleDomCfg = IdxConfig<frameSize, numWorker>;
 
-                                auto par = frame[ linearIdx ];
-                                // get old and new particle momenta
-                                vector_X const particle_momentumNow = vector_X( par[momentum_] );
-                                vector_X const particle_momentumOld = vector_X( par[momentumPrev1_] );
-                                /* initializes "saveParticleAt" flag with -1
-                                 * because "counter_s" will never be -1
-                                 * therefore, if a particle is saved, a value of counter
-                                 * is stored in "saveParticleAt" != -1
-                                 * THIS IS ACTUALLY ONLY NEEDED IF: the radiation flag was set
-                                 * LATER: can this be optimized?
-                                 */
-                                int saveParticleAt = -1;
+                            // loop over all particles in the frame
+                            ForEachIdx<ParticleDomCfg> forEachParticle{workerIdx};
 
-                                /* if particle is not accelerated we skip all calculations
-                                 *
-                                 * this is a component-wise comparison
-                                 */
-                                if( particle_momentumNow != particle_momentumOld )
+                            forEachParticle([&](uint32_t const linearIdx, uint32_t const) {
+                                // only threads with particles are running
+                                if(linearIdx < particlesInFrame)
                                 {
-                                    if( getRadiationMask(par) )
-                                        saveParticleAt = nvidia::atomicAllInc(
-                                            acc,
-                                            &counter_s,
-                                            ::alpaka::hierarchy::Threads{}
-                                        );
-
-                                    /* for information:
-                                     *   atomicAdd returns an int with the previous
-                                     *   value of "counter_s" != -1
-                                     *   therefore, if a particle is selected
-                                     *   "saveParticleAs" != -1
+                                    auto par = frame[linearIdx];
+                                    // get old and new particle momenta
+                                    vector_X const particle_momentumNow = vector_X(par[momentum_]);
+                                    vector_X const particle_momentumOld = vector_X(par[momentumPrev1_]);
+                                    /* initializes "saveParticleAt" flag with -1
+                                     * because "counter_s" will never be -1
+                                     * therefore, if a particle is saved, a value of counter
+                                     * is stored in "saveParticleAt" != -1
+                                     * THIS IS ACTUALLY ONLY NEEDED IF: the radiation flag was set
+                                     * LATER: can this be optimized?
                                      */
+                                    int saveParticleAt = -1;
 
-                                    // if a particle needs to be considered
-                                    if( saveParticleAt != -1 )
+                                    /* if particle is not accelerated we skip all calculations
+                                     *
+                                     * this is a component-wise comparison
+                                     */
+                                    if(particle_momentumNow != particle_momentumOld)
                                     {
-
-                                        // calculate global position
-                                        lcellId_t const cellIdx = par[ localCellIdx_ ];
-
-                                        // position inside of the cell
-                                        floatD_X const pos = par[ position_ ];
-
-                                        // calculate global position of cell
-                                        DataSpace< simDim > const globalPos(
-                                            superCellOffset +
-                                            DataSpaceOperations< simDim >::
-                                                template map< SuperCellSize >( cellIdx )
-                                        );
-
-                                        // add global position of cell with local position of particle in cell
-                                        vector_X particle_locationNow;
-                                        // set z component to zero in case of simDim==DIM2
-                                        particle_locationNow[ 2 ] = 0.0;
-                                        // run over all components and compute gobal position
-                                        for( int i = 0; i < simDim; ++i )
-                                          particle_locationNow[ i ] =
-                                              ( float_X( globalPos[ i ] ) + pos[ i ] ) *
-                                              cellSize[ i ];
-
-                                        /* get macro-particle weighting
-                                         *
-                                         * Info:
-                                         * the weighting is the number of real particles described
-                                         * by a macro-particle
+                                        if(getRadiationMask(par))
+                                            saveParticleAt = nvidia::atomicAllInc(
+                                                acc,
+                                                &counter_s,
+                                                ::alpaka::hierarchy::Threads{});
+
+                                        /* for information:
+                                         *   atomicAdd returns an int with the previous
+                                         *   value of "counter_s" != -1
+                                         *   therefore, if a particle is selected
+                                         *   "saveParticleAt" != -1
                                          */
-                                        float_X const weighting = par[ weighting_ ];
 
-                                        /* only of coherent and incoherent radiation of a single macro-particle is
-                                         * considered, the weighting of each macro-particle needs to be stored
-                                         * in order to be considered when the actual frequency calculation is done
-                                         */
-                                        radWeighting_s[ saveParticleAt ] = weighting;
+                                        // if a particle needs to be considered
+                                        if(saveParticleAt != -1)
+                                        {
+                                            // calculate global position
+                                            lcellId_t const cellIdx = par[localCellIdx_];
+
+                                            // position inside of the cell
+                                            floatD_X const pos = par[position_];
+
+                                            // calculate global position of cell
+                                            DataSpace<simDim> const globalPos(
+                                                superCellOffset
+                                                + DataSpaceOperations<simDim>::template map<SuperCellSize>(cellIdx));
+
+                                            // add global position of cell with local position of particle in cell
+                                            vector_X particle_locationNow;
+                                            // set z component to zero in case of simDim==DIM2
+                                            particle_locationNow[2] = 0.0;
+                                            // run over all components and compute global position
+                                            for(int i = 0; i < simDim; ++i)
+                                                particle_locationNow[i]
+                                                    = (float_X(globalPos[i]) + pos[i]) * cellSize[i];
+
+                                            /* get macro-particle weighting
+                                             *
+                                             * Info:
+                                             * the weighting is the number of real particles described
+                                             * by a macro-particle
+                                             */
+                                            float_X const weighting = par[weighting_];
+
+                                            /* only if coherent and incoherent radiation of a single macro-particle is
+                                             * considered, the weighting of each macro-particle needs to be stored
+                                             * in order to be considered when the actual frequency calculation is done
+                                             */
+                                            radWeighting_s[saveParticleAt] = weighting;
+
+                                            // mass of macro-particle
+                                            float_X const particle_mass = attribute::getMass(weighting, par);
+
+                                            /****************************************************
+                                             **** Here happens the true physical calculation ****
+                                             ****************************************************/
+
+                                            // set up particle using the radiation's own particle class
+                                            /*!\todo please add a namespace for Particle class*/
+                                            Particle const particle(
+                                                particle_locationNow,
+                                                particle_momentumOld,
+                                                particle_momentumNow,
+                                                particle_mass);
+
+                                            // set up amplitude calculator
+                                            using Calc_Amplitude_n_sim_1 = Calc_Amplitude<Retarded_time_1, Old_DFT>;
+
+                                            // calculate amplitude
+                                            Calc_Amplitude_n_sim_1 const amplitude3(particle, DELTA_T, t);
+
+                                            // get charge of single electron ! (weighting=1.0f)
+                                            float_X const particle_charge = frame::getCharge<FrameType>();
+
+                                            /* compute real amplitude of macro-particle with a charge of
+                                             * a single electron
+                                             */
+                                            real_amplitude_s[saveParticleAt] = amplitude3.get_vector(look)
+                                                * particle_charge * picongpu::float_64(DELTA_T);
+
+                                            // retarded time stored in shared memory
+                                            t_ret_s[saveParticleAt] = amplitude3.get_t_ret(look);
+
+                                            lowpass_s[saveParticleAt] = NyquistLowPass(look, particle);
+
+                                            /* the particle amplitude is used to include the weighting
+                                             * of the window function filter without needing more memory
+                                             */
+                                            radWindowFunction::radWindowFunction const winFkt;
+
+                                            /* start with a factor of one */
+                                            float_X windowFactor = 1.0;
+
+                                            for(uint32_t d = 0; d < simDim; ++d)
+                                            {
+                                                windowFactor
+                                                    *= winFkt(particle_locationNow[d], simBoxSize[d] * cellSize[d]);
+                                            }
+
+                                            /* apply window function factor to amplitude */
+                                            real_amplitude_s[saveParticleAt] *= windowFactor;
+
+                                        } // END: if a particle needs to be considered
+                                    } // END: check if particle is accelerated
+                                } // END: only threads with particles are running
+                            });
+
+                            cupla::__syncthreads(acc); // wait till every thread has loaded its particle data
+
+
+                            // run over all  valid omegas for this thread
+                            for(int o = workerIdx; o < radiation_frequencies::N_omega; o += T_numWorkers)
+                            {
+                                /* storage for amplitude (complex 3D vector)
+                                 * it  is initialized with zeros (  0 +  i 0 )
+                                 * Attention: This is an accumulator and should
+                                 * be in double precision to ameliorate roundoff
+                                 * errors!
+                                 */
+                                Amplitude amplitude = Amplitude::zero();
 
-                                        // mass of macro-particle
-                                        float_X const particle_mass = attribute::getMass(
-                                            weighting,
-                                            par
-                                        );
+                                // compute frequency "omega" using for-loop-index "o"
+                                picongpu::float_64 const omega = freqFkt(o);
 
+                                // create a form factor object
+                                radFormFactor::radFormFactor const myRadFormFactor{};
+
+                                /* Particle loop: thread runs through loaded particle data
+                                 *
+                                 * Summation of Jackson radiation formula integrand
+                                 * over all electrons for fixed, thread-specific
+                                 * frequency
+                                 */
+                                for(int j = 0; j < counter_s; ++j)
+                                {
+                                    // check Nyquist-limit for each particle "j" and each frequency "omega"
+                                    if(lowpass_s[j].check(omega))
+                                    {
                                         /****************************************************
                                          **** Here happens the true physical calculation ****
                                          ****************************************************/
 
-                                        // set up particle using the radiation's own particle class
-                                        /*!\todo please add a namespace for Particle class*/
-                                        Particle const particle(
-                                            particle_locationNow,
-                                            particle_momentumOld,
-                                            particle_momentumNow,
-                                            particle_mass
-                                        );
-
-                                        // set up amplitude calculator
-                                        using Calc_Amplitude_n_sim_1 = Calc_Amplitude<
-                                            Retarded_time_1,
-                                            Old_DFT
-                                        >;
-
-                                        // calculate amplitude
-                                        Calc_Amplitude_n_sim_1 const amplitude3(
-                                            particle,
-                                            DELTA_T,
-                                            t
-                                        );
-
-                                        // get charge of single electron ! (weighting=1.0f)
-                                        float_X const particle_charge = frame::getCharge<FrameType>();
-
-                                        /* compute real amplitude of macro-particle with a charge of
-                                         * a single electron
-                                         */
-                                        real_amplitude_s[ saveParticleAt ] =
-                                            amplitude3.get_vector( look ) *
-                                            particle_charge *
-                                            picongpu::float_64( DELTA_T );
-
-                                        // retarded time stored in shared memory
-                                        t_ret_s[ saveParticleAt ] = amplitude3.get_t_ret( look );
+                                        // calculate the form factor's influence on the real amplitude
+                                        vector_64 const weighted_real_amp = real_amplitude_s[j]
+                                            * precisionCast<float_64>(myRadFormFactor(radWeighting_s[j], omega, look));
 
-                                        lowpass_s[ saveParticleAt ] = NyquistLowPass(
-                                            look,
-                                            particle
-                                        );
-
-                                        /* the particle amplitude is used to include the weighting
-                                         * of the window function filter without needing more memory
+                                        /* complex amplitude increment for j-th particle
+                                         * It is local to the loop and can be single precision
                                          */
-                                        radWindowFunction::radWindowFunction const winFkt;
-
-                                        /* start with a factor of one */
-                                        float_X windowFactor = 1.0;
-
-                                        for( uint32_t d = 0; d < simDim; ++d )
-                                        {
-                                            windowFactor *= winFkt(
-                                                particle_locationNow[ d ],
-                                                simBoxSize[d] * cellSize[ d ]
-                                            );
-                                        }
+                                        Amplitude amplitude_add(weighted_real_amp, t_ret_s[j] * omega);
 
-                                        /* apply window function factor to amplitude */
-                                        real_amplitude_s[ saveParticleAt ] *= windowFactor;
+                                        // add this single amplitude to those previously considered
+                                        amplitude += amplitude_add;
 
-                                    } // END: if a particle needs to be considered
-                                } // END: check if particle is accelerated
-                            } // END: only threads with particles are running
-                        }
-                    );
+                                    } // END: check Nyquist-limit for each particle "j" and each frequency "omega"
 
-                    __syncthreads(); // wait till every thread has loaded its particle data
+                                } // END: Particle loop
 
+                                /* the radiation contribution of the following is added to global memory:
+                                 *     - valid particles of last super cell
+                                 *     - from this (one) time step
+                                 *     - omega_id = theta_idx * radiation_frequencies::N_omega + o
+                                 */
+                                radiation(DataSpace<2>(theta_idx * radiation_frequencies::N_omega + o, jobIdx))
+                                    += amplitude;
 
-
-                    // run over all  valid omegas for this thread
-                    for( int o = workerIdx; o < radiation_frequencies::N_omega; o += T_numWorkers )
-                    {
-
-                        /* storage for amplitude (complex 3D vector)
-                         * it  is initialized with zeros (  0 +  i 0 )
-                         */
-                        Amplitude amplitude = Amplitude::zero();
-
-                        // compute frequency "omega" using for-loop-index "o"
-                        picongpu::float_64 const omega = freqFkt( o );
-
-                        // create a form factor object
-                        radFormFactor::radFormFactor const myRadFormFactor{ };
-
-                        /* Particle loop: thread runs through loaded particle data
-                         *
-                         * Summation of Jackson radiation formula integrand
-                         * over all electrons for fixed, thread-specific
-                         * frequency
-                         */
-                        for( int j = 0; j < counter_s; ++j )
-                        {
-
-                            // check Nyquist-limit for each particle "j" and each frequency "omega"
-                            if( lowpass_s[ j ].check( omega ) )
-                            {
-
-                                /****************************************************
-                                 **** Here happens the true physical calculation ****
-                                 ****************************************************/
-
-                                // calulate the form factor's' influences to the real amplitude
-                                vector_64 const weighted_real_amp = real_amplitude_s[ j ] *
-                                    precisionCast< float_64 >(
-                                        myRadFormFactor(
-                                            radWeighting_s[ j ],
-                                            omega,
-                                            look
-                                        )
-                                    );
-
-                                // complex amplitude for j-th particle
-                                Amplitude amplitude_add(
-                                    weighted_real_amp,
-                                    t_ret_s[ j ] * omega
-                                );
-
-                                // add this single amplitude those previously considered
-                                amplitude += amplitude_add;
-
-                            }// END: check Nyquist-limit for each particle "j" and each frequency "omega"
-
-                        }// END: Particle loop
-
-                        /* the radiation contribution of the following is added to global memory:
-                         *     - valid particles of last super cell
-                         *     - from this (one) time step
-                         *     - omega_id = theta_idx * radiation_frequencies::N_omega + o
-                         */
-                        radiation[ theta_idx * radiation_frequencies::N_omega + o] += amplitude;
-
-                    } // end frequency loop
+                            } // end frequency loop
 
 
-                    // wait till all radiation contributions for this super cell are done
-                    __syncthreads();
+                            // wait till all radiation contributions for this super cell are done
+                            cupla::__syncthreads(acc);
 
-                    /* First threads starts loading next frame of the super-cell:
-                     *
-                     * Info:
-                     *   The calculation starts with the last SuperCell (must not be full filled)
-                     *   all previous SuperCells are full with particles
-                     */
-                    particlesInFrame = frameSize;
-                    frame = pb.getPreviousFrame( frame );
+                            /* First thread starts loading the next frame of the super-cell:
+                             *
+                             * Info:
+                             *   The calculation starts with the last frame (which need not be completely filled);
+                             *   all previous frames are completely filled with particles
+                             */
+                            particlesInFrame = frameSize;
+                            frame = pb.getPreviousFrame(frame);
 
-                  } // end while(frame.isValid())
+                        } // end while(frame.isValid())
 
-              } // end loop over all super cells
+                    } // end loop over all super cells
 
 
-        } // end radiation kernel
-    };
+                } // end radiation kernel
+            };
 
-} // namespace radiation
+        } // namespace radiation
 
-} // namespace plugins
+    } // namespace plugins
 
 } // namespace picongpu
diff --git a/include/picongpu/plugins/radiation/VectorTypes.hpp b/include/picongpu/plugins/radiation/VectorTypes.hpp
index 255dbf9aa4..70067be846 100644
--- a/include/picongpu/plugins/radiation/VectorTypes.hpp
+++ b/include/picongpu/plugins/radiation/VectorTypes.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Rene Widera, Richard Pausch
+/* Copyright 2013-2021 Axel Huebl, Rene Widera, Richard Pausch
  *
  * This file is part of PIConGPU.
  *
@@ -26,13 +26,13 @@
 
 namespace picongpu
 {
-namespace plugins
-{
-namespace radiation
-{
-    using vector_X = cuda_vec< picongpu::float3_X, picongpu::float_X >;
-    using vector_32 = /*__align__(16)*/ cuda_vec< picongpu::float3_32, picongpu::float_32 >;
-    using vector_64 = /*__align__(32)*/ cuda_vec< picongpu::float3_64, picongpu::float_64 >;
-} // namespace radiation
-} // namespace plugins
+    namespace plugins
+    {
+        namespace radiation
+        {
+            using vector_X = cuda_vec<picongpu::float3_X, picongpu::float_X>;
+            using vector_32 = /*__align__(16)*/ cuda_vec<picongpu::float3_32, picongpu::float_32>;
+            using vector_64 = /*__align__(32)*/ cuda_vec<picongpu::float3_64, picongpu::float_64>;
+        } // namespace radiation
+    } // namespace plugins
 } // namespace picongpu
diff --git a/include/picongpu/plugins/radiation/amplitude.hpp b/include/picongpu/plugins/radiation/amplitude.hpp
index 4e143ba3cc..4a5170fa26 100644
--- a/include/picongpu/plugins/radiation/amplitude.hpp
+++ b/include/picongpu/plugins/radiation/amplitude.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera, Richard Pausch, Alexander Debus
+/* Copyright 2013-2021 Heiko Burau, Rene Widera, Richard Pausch, Alexander Debus
  *
  * This file is part of PIConGPU.
  *
@@ -19,144 +19,227 @@
 
 #pragma once
 
+#include "VectorTypes.hpp"
+
 #include <pmacc/algorithms/math/defines/pi.hpp>
 #include <pmacc/math/Complex.hpp>
-#include "VectorTypes.hpp"
 #include <pmacc/mpi/GetMPI_StructAsArray.hpp>
 
-
 namespace picongpu
 {
-namespace plugins
-{
-namespace radiation
-{
-
-/** class to store 3 complex numbers for the radiated amplitude
- */
-class Amplitude
-{
-public:
-  using complex_64 = pmacc::math::Complex< picongpu::float_64 >;
-
-  /* number of scalar components in Amplitude = 3 (3D) * 2 (complex) = 6 */
-  static constexpr uint32_t numComponents = uint32_t(3) * uint32_t(sizeof(complex_64) / sizeof(typename complex_64::type));
-
-  /** constructor
-   *
-   * Arguments:
-   * - vector_64: real 3D vector
-   * - float: complex phase */
-  DINLINE Amplitude(vector_64 vec, picongpu::float_X phase)
-  {
-      picongpu::float_X cosValue;
-      picongpu::float_X sinValue;
-      picongpu::math::sincos(phase, sinValue, cosValue);
-      amp_x=picongpu::math::euler(vec.x(), picongpu::precisionCast<picongpu::float_64>(sinValue), picongpu::precisionCast<picongpu::float_64>(cosValue) );
-      amp_y=picongpu::math::euler(vec.y(), picongpu::precisionCast<picongpu::float_64>(sinValue), picongpu::precisionCast<picongpu::float_64>(cosValue) );
-      amp_z=picongpu::math::euler(vec.z(), picongpu::precisionCast<picongpu::float_64>(sinValue), picongpu::precisionCast<picongpu::float_64>(cosValue) );
-  }
-
-
-  /** default constructor
-   *
-   * \warning does not initialize values! */
-  HDINLINE Amplitude(void)
-  {
-
-  }
-
-
-  /** constructor
-   *
-   * Arguments:
-   * - 6x float: Re(x), Im(x), Re(y), Im(y), Re(z), Im(z) */
-  HDINLINE Amplitude(const picongpu::float_64 x_re, const picongpu::float_64 x_im,
-                     const picongpu::float_64 y_re, const picongpu::float_64 y_im,
-                     const picongpu::float_64 z_re, const picongpu::float_64 z_im)
-      : amp_x(x_re, x_im), amp_y(y_re, y_im), amp_z(z_re, z_im)
-  {
-
-  }
-
-
-  /** returns a zero amplitude vector
-   *
-   * used to initialize amplitudes to zero */
-  HDINLINE static Amplitude zero(void)
-  {
-      Amplitude result;
-      result.amp_x = complex_64::zero();
-      result.amp_y = complex_64::zero();
-      result.amp_z = complex_64::zero();
-      return result;
-  }
-
-  /** assign addition */
-  HDINLINE Amplitude& operator+=(const Amplitude& other)
-  {
-      amp_x += other.amp_x;
-      amp_y += other.amp_y;
-      amp_z += other.amp_z;
-      return *this;
-  }
-
-
-  /** assign difference */
-  HDINLINE Amplitude& operator-=(const Amplitude& other)
-  {
-      amp_x -= other.amp_x;
-      amp_y -= other.amp_y;
-      amp_z -= other.amp_z;
-      return *this;
-  }
-
-
-  /** calculate radiation from *this amplitude
-   *
-   * Returns: \f$\frac{d^2 I}{d \Omega d \omega} = const*Amplitude^2\f$ */
-  HDINLINE picongpu::float_64 calc_radiation(void)
-  {
-      // const SI factor radiation
-      const picongpu::float_64 factor = 1.0 /
-        (16. * util::cube(pmacc::algorithms::math::Pi< picongpu::float_64 >::value) * picongpu::EPS0 * picongpu::SPEED_OF_LIGHT);
-
-      return factor * (picongpu::math::abs2(amp_x) + picongpu::math::abs2(amp_y) + picongpu::math::abs2(amp_z));
-  }
-
-
-  /** debugging method
-   *
-   * Returns: real-x-value */
-  HDINLINE picongpu::float_64 debug(void)
-  {
-      return amp_x.get_real();
-  }
-
-
-private:
-  complex_64 amp_x; // complex amplitude x-component
-  complex_64 amp_y; // complex amplitude y-component
-  complex_64 amp_z; // complex amplitude z-component
-
-};
-} // namespace radiation
-} // namespace plugins
+    namespace plugins
+    {
+        namespace radiation
+        {
+            /** three complex numbers holding the x, y and z component of a radiated amplitude
+             */
+            template<typename T_Float = picongpu::float_64>
+            class Amplitude
+            {
+            public:
+                /* T_Float selects the precision of the components: intermediate amplitude
+                 * values may use single precision, the final accumulation needs double precision.
+                 */
+                using complex_T = pmacc::math::Complex<T_Float>;
+                /* scalars stored per Amplitude: 3 components times the scalars per complex value */
+                static constexpr uint32_t numComponents
+                    = uint32_t(3) * uint32_t(sizeof(complex_T) / sizeof(typename complex_T::type));
+
+                /** constructor
+                 *
+                 * Builds all three components from the same complex phase.
+                 *
+                 * Arguments:
+                 * - vector_64: real 3D vector, one entry per component
+                 * - float: complex phase */
+                DINLINE Amplitude(vector_64 vec, picongpu::float_X phase)
+                {
+                    picongpu::float_X phaseSin;
+                    picongpu::float_X phaseCos;
+                    pmacc::math::sincos(phase, phaseSin, phaseCos);
+
+                    // sine and cosine are shared by all three components,
+                    // so they are converted to T_Float only once and reused
+                    // for every euler() call below
+                    const T_Float sinCast = precisionCast<T_Float>(phaseSin);
+                    const T_Float cosCast = precisionCast<T_Float>(phaseCos);
+
+                    amp_x = pmacc::math::euler(precisionCast<T_Float>(vec.x()), sinCast, cosCast);
+                    amp_y = pmacc::math::euler(precisionCast<T_Float>(vec.y()), sinCast, cosCast);
+                    amp_z = pmacc::math::euler(precisionCast<T_Float>(vec.z()), sinCast, cosCast);
+                }
+
+                /** default constructor
+                 *
+                 * \warning leaves all components uninitialized! */
+                HDINLINE Amplitude(void)
+                {
+                }
+
+
+                /** constructor
+                 *
+                 * Arguments:
+                 * - 6x float_64: Re(x), Im(x), Re(y), Im(y), Re(z), Im(z) */
+                HDINLINE Amplitude(
+                    const picongpu::float_64 x_re,
+                    const picongpu::float_64 x_im,
+                    const picongpu::float_64 y_re,
+                    const picongpu::float_64 y_im,
+                    const picongpu::float_64 z_re,
+                    const picongpu::float_64 z_im)
+                    : amp_x(x_re, x_im)
+                    , amp_y(y_re, y_im)
+                    , amp_z(z_re, z_im)
+                {
+                }
+
+                /** constructor taking the three complex components directly
+                 *
+                 *  @param x complex x component of the amplitude vector.
+                 *  @param y complex y component of the amplitude vector.
+                 *  @param z complex z component of the amplitude vector.
+                 */
+                HDINLINE Amplitude(const complex_T& x, const complex_T& y, const complex_T& z)
+                    : amp_x(x)
+                    , amp_y(y)
+                    , amp_z(z)
+                {
+                }
+
+                /** returns an amplitude with all components set to zero
+                 *
+                 * used to initialize amplitudes to zero */
+                HDINLINE static Amplitude zero(void)
+                {
+                    // one complex zero is enough, it is copied into all three components
+                    const complex_T complexZero = complex_T::zero();
+                    Amplitude zeroAmplitude(complexZero, complexZero, complexZero);
+
+                    return zeroAmplitude;
+                }
+
+                /** add the components of other to this amplitude */
+                HDINLINE Amplitude& operator+=(const Amplitude& other)
+                {
+                    amp_x += other.amp_x;
+                    amp_y += other.amp_y;
+                    amp_z += other.amp_z;
+                    return *this;
+                }
+
+
+                /** subtract the components of other from this amplitude */
+                HDINLINE Amplitude& operator-=(const Amplitude& other)
+                {
+                    amp_x -= other.amp_x;
+                    amp_y -= other.amp_y;
+                    amp_z -= other.amp_z;
+                    return *this;
+                }
+
+
+                /** calculate the radiation belonging to *this amplitude
+                 *
+                 * Returns: \f$\frac{d^2 I}{d \Omega d \omega} = const*Amplitude^2\f$ */
+                HDINLINE picongpu::float_64 calc_radiation(void)
+                {
+                    // const SI factor radiation
+                    const picongpu::float_64 siPrefactor = 1.0
+                        / (16. * util::cube(pmacc::math::Pi<picongpu::float_64>::value) * picongpu::EPS0
+                           * picongpu::SPEED_OF_LIGHT);
+                    const auto sumSq = pmacc::math::abs2(amp_x) + pmacc::math::abs2(amp_y) + pmacc::math::abs2(amp_z);
+                    return siPrefactor * sumSq;
+                }
+
+
+                /** debugging method
+                 *
+                 * Returns: real part of the x component */
+                HDINLINE picongpu::float_64 debug(void)
+                {
+                    return amp_x.get_real();
+                }
+
+                /** read access to the individual components (returned by value)
+                 */
+                HDINLINE complex_T getXcomponent() const
+                {
+                    return amp_x;
+                }
+                HDINLINE complex_T getYcomponent() const
+                {
+                    return amp_y;
+                }
+                HDINLINE complex_T getZcomponent() const
+                {
+                    return amp_z;
+                }
+
+            private:
+                complex_T amp_x; // x component of the complex amplitude
+                complex_T amp_y; // y component of the complex amplitude
+                complex_T amp_z; // z component of the complex amplitude
+            };
+        } // namespace radiation
+    } // namespace plugins
 } // namespace picongpu
 
 namespace pmacc
 {
-namespace mpi
-{
+    namespace mpi
+    {
+        /** MPI description of an Amplitude<>: numComponents scalars of type complex_T::type */
+        template<>
+        HINLINE MPI_StructAsArray getMPI_StructAsArray<picongpu::plugins::radiation::Amplitude<>>()
+        {
+            using AmplitudeType = picongpu::plugins::radiation::Amplitude<>;
+            MPI_StructAsArray result = getMPI_StructAsArray<AmplitudeType::complex_T::type>();
+            result.sizeMultiplier *= AmplitudeType::numComponents;
+            return result;
+        }
+
+    } // namespace mpi
+} // namespace pmacc
 
-  /** implementation of MPI transaction on Amplitude class */
-  template<>
-  HINLINE MPI_StructAsArray getMPI_StructAsArray< picongpu::plugins::radiation::Amplitude >()
-  {
-      MPI_StructAsArray result = getMPI_StructAsArray< picongpu::plugins::radiation::Amplitude::complex_64::type > ();
-      result.sizeMultiplier *= picongpu::plugins::radiation::Amplitude::numComponents;
-      return result;
-  };
 
-} // namespace mpi
+namespace pmacc
+{
+    namespace algorithms
+    {
+        namespace precisionCast
+        {
+            /* Allows precisionCast between amplitudes of different precision,
+             * e.g. from a low precision amplitude to a high-precision one.
+             * Converting creates a temporary Amplitude and can be detrimental
+             * to performance; casting to the same precision (below) does not.
+             */
+            template<typename CastToType>
+            struct TypeCast<CastToType, picongpu::plugins::radiation::Amplitude<CastToType>>
+            {
+                using result = const picongpu::plugins::radiation::Amplitude<CastToType>&;
+
+                HDINLINE result operator()(result unchanged) const
+                {
+                    return unchanged;
+                }
+            };
+
+            template<typename CastToType, typename OldType>
+            struct TypeCast<CastToType, picongpu::plugins::radiation::Amplitude<OldType>>
+            {
+                using result = picongpu::plugins::radiation::Amplitude<CastToType>;
+                using ParamType = picongpu::plugins::radiation::Amplitude<OldType>;
+                /* convert each complex component to the scalar type of the target amplitude */
+                HDINLINE result operator()(const ParamType& amplitude) const
+                {
+                    return result(
+                        precisionCast<typename result::complex_T::type>(amplitude.getXcomponent()),
+                        precisionCast<typename result::complex_T::type>(amplitude.getYcomponent()),
+                        precisionCast<typename result::complex_T::type>(amplitude.getZcomponent()));
+                }
+            };
+
+        } // namespace precisionCast
+    } // namespace algorithms
 } // namespace pmacc
diff --git a/include/picongpu/plugins/radiation/calc_amplitude.hpp b/include/picongpu/plugins/radiation/calc_amplitude.hpp
index afc39203a5..cc9d43e1f6 100644
--- a/include/picongpu/plugins/radiation/calc_amplitude.hpp
+++ b/include/picongpu/plugins/radiation/calc_amplitude.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera, Richard Pausch
+/* Copyright 2013-2021 Heiko Burau, Rene Widera, Richard Pausch
  *
  * This file is part of PIConGPU.
  *
@@ -26,155 +26,158 @@
 
 namespace picongpu
 {
-namespace plugins
-{
-namespace radiation
-{
-
-//protected:
-// error class for wrong time access
-
-class Error_Accessing_Time
-{
-public:
-
-    Error_Accessing_Time(void)
-    {
-    }
-};
-
-
-struct One_minus_beta_times_n
-{
-    /// Class to calculate \f$1-\beta \times \vec n\f$
-    /// using the best suiting method depending on energy
-    /// to achieve the best numerical results
-    /// it will be used as base class for amplitude calculations
-
-    //  Taylor just includes a method, When includes just enum
-
-    HDINLINE picongpu::float_32 operator()(const vector_64& n, const Particle & particle) const
+    namespace plugins
     {
-        // 1/gamma^2:
-
-        const picongpu::float_64 gamma_inv_square(particle.get_gamma_inv_square<When::now > ());
-
-        //picongpu::float_64 value; // storage for 1-\beta \times \vec n
-
-        // if energy is high enough to cause numerical errors ( equals if 1/gamma^2 is close enough to zero)
-        // chose a Taylor approximation to to better calculate 1-\beta \times \vec n (which is close to 1-1)
-        // is energy is low, then the approximation will cause a larger error, therefor calculate
-        // 1-\beta \times \vec n directly
-        // with 0.18 the relative error will be below 0.001% for a Taylor series of 1-sqrt(1-x) of 5th order
-        if (gamma_inv_square < picongpu::GAMMA_INV_SQUARE_RAD_THRESH)
+        namespace radiation
         {
-            const picongpu::float_64 cos_theta(particle.get_cos_theta<When::now > (n)); // cosine between looking vector and momentum of particle
-            const picongpu::float_64 taylor_approx(cos_theta * Taylor()(gamma_inv_square) + (1.0 - cos_theta));
-            return  (taylor_approx);
-        }
-        else
-        {
-            const vector_64 beta(particle.get_beta<When::now > ()); // calculate v/c=beta
-            return  (1.0 - beta * n);
-        }
-
-    }
-};
-
-struct Retarded_time_1
-{
-    // interface for combined 'Amplitude_Calc' classes
-    // contains more parameters than needed to have the
-    // same interface as 'Retarded_time_2'
-
-    HDINLINE picongpu::float_64 operator()(const picongpu::float_64 t,
-                                const vector_64& n, const Particle & particle) const
-    {
-        const vector_64 r(particle.get_location<When::now > ()); // location
-        return (picongpu::float_64) (t - (n * r) / (picongpu::SPEED_OF_LIGHT));
-    }
-
-};
-
-template<typename Exponent> // divisor to the power of 'Exponent'
-struct Old_Method
-{
-    /// classical method to calculate the real vector part of the radiation's amplitude
-    /// this base class includes both possible interpretations:
-    /// with Exponent=Cube the integration over t_ret will be assumed (old FFT)
-    /// with Exponent=Square the integration over t_sim will be assumed (old DFT)
-
-    HDINLINE vector_64 operator()(const vector_64& n, const Particle& particle, const picongpu::float_64 delta_t) const
-    {
-        const vector_64 beta(particle.get_beta<When::now > ()); // beta = v/c
-        const vector_64 beta_dot((beta - particle.get_beta < When::now + 1 > ()) / delta_t); // numeric differentiation (backward difference)
-        const Exponent exponent; // instance of the Exponent class // ???is a static class and no instance possible???
-         //const One_minus_beta_times_n one_minus_beta_times_n;
-        const picongpu::float_64 factor(exponent(1.0 / (One_minus_beta_times_n()(n, particle))));
-        // factor=1/(1-beta*n)^g   g=2 for DFT and g=3 for FFT
-        return (n % ((n - beta) % beta_dot)) * factor;
-    }
-};
-
-// typedef of all possible forms of Old_Method
-//typedef Old_Method<util::Cube<picongpu::float_64> > Old_FFT;
-typedef Old_Method<util::Square<picongpu::float_64> > Old_DFT;
-
-
-
-
-// ------- Calculate Amplitude class ------------- //
-
-template<typename TimeCalc, typename VecCalc>
-class Calc_Amplitude
-{
-    /// final class for amplitude calculations
-    /// derived from a class to calculate the retarded time (TimeCalc; possibilities:
-    /// Retarded_Time_1 and Retarded_Time_2) and from a class to  calculate
-    /// the real vector part of the amplitude (VecCalc; possibilities:
-    /// Old_FFT, Old_DFT, Partial_Integral_Method_1, Partial_Integral_Method_2)
-public:
-    /// constructor
-    // takes a lot of parameters to have a general interface
-    // not all parameters are needed for all possible combinations
-    // of base classes
-
-    HDINLINE Calc_Amplitude(const Particle& particle,
-                           const picongpu::float_64 delta_t,
-                           const picongpu::float_64 t_sim)
-    : m_particle(particle), m_delta_t(delta_t), m_t_sim(t_sim)
-    {
-    }
-
-    // get real vector part of amplitude
-
-    HDINLINE vector_64 get_vector(const vector_64& n) const
-    {
-        const vector_64 look_direction(n.unit_vec()); // make sure look_direction is a unit vector
-        VecCalc vecC;
-        return vecC(look_direction, m_particle, m_delta_t);
-    }
-
-    // get retarded time
-
-    HDINLINE picongpu::float_64 get_t_ret(const vector_64 look_direction) const
-    {
-        TimeCalc timeC;
-        return timeC(m_t_sim, look_direction, m_particle);
-
-        //  const vector_64 r = particle.get_location<When::now > (); // location
-        //  return (picongpu::float_64) (t - (n * r) / (picongpu::SPEED_OF_LIGHT));
-    }
-
-private:
-    // data:
-    const Particle& m_particle; // one particle
-    const picongpu::float_64 m_delta_t; // length of one time step in simulation
-    const picongpu::float_64 m_t_sim; // simulation time (for methods not using index*delta_t )
-
-
-};
-
-} // namespace radiation
-} // namespace plugins
+            // Empty error class for a wrong time access; it carries no data.
+            // NOTE(review): no user of this class is visible in this part of the file - confirm it is still needed.
+
+            class Error_Accessing_Time
+            {
+            public:
+                Error_Accessing_Time(void)
+                {
+                }
+            };
+
+
+            struct One_minus_beta_times_n
+            {
+                /// Functor to calculate \f$1-\vec\beta \cdot \vec n\f$
+                /// using the best suited method depending on the particle energy
+                /// to achieve the best numerical results;
+                /// it is called as a functor by the amplitude calculation (see Old_Method)
+
+                //  Taylor just includes a method, When includes just enum
+
+                HDINLINE picongpu::float_32 operator()(const vector_64& n, const Particle& particle) const
+                {
+                    // 1/gamma^2:
+
+                    const picongpu::float_64 gamma_inv_square(particle.get_gamma_inv_square<When::now>());
+
+                    // NOTE(review): the result is computed in float_64 but returned as float_32 - confirm intent
+
+                    // if the energy is high enough to cause numerical errors (i.e. 1/gamma^2 is close enough to
+                    // zero), choose a Taylor approximation to better calculate 1-\beta \cdot \vec n (which is close
+                    // to 1-1); if the energy is low, the approximation would cause a larger error, therefore
+                    // calculate 1-\beta \cdot \vec n directly
+                    // with 0.18 the relative error will be below 0.001% for a Taylor series of 1-sqrt(1-x) of 5th
+                    // order
+                    if(gamma_inv_square < picongpu::GAMMA_INV_SQUARE_RAD_THRESH)
+                    {
+                        const picongpu::float_64 cos_theta(particle.get_cos_theta<When::now>(
+                            n)); // cosine between looking vector and momentum of particle
+                        const picongpu::float_64 taylor_approx(
+                            cos_theta * Taylor()(gamma_inv_square) + (1.0 - cos_theta));
+                        return (taylor_approx);
+                    }
+                    else
+                    {
+                        const vector_64 beta(particle.get_beta<When::now>()); // calculate v/c=beta
+                        return (1.0 - beta * n);
+                    }
+                }
+            };
+
+            struct Retarded_time_1
+            {
+                // functor computing the retarded time t - (n * r) / c for one particle;
+                // it is the interface for the combined 'Amplitude_Calc' classes and takes
+                // more parameters than it needs to match the interface of 'Retarded_time_2'
+                HDINLINE picongpu::float_64 operator()(
+                    const picongpu::float_64 t,
+                    const vector_64& n,
+                    const Particle& particle) const
+                {
+                    const vector_64 position(particle.get_location<When::now>()); // current location
+                    const auto lightTravelTime = (n * position) / (picongpu::SPEED_OF_LIGHT);
+                    return static_cast<picongpu::float_64>(t - lightTravelTime);
+                }
+            };
+
+            template<typename Exponent> // divisor to the power of 'Exponent'
+            struct Old_Method
+            {
+                /// classical method to calculate the real vector part of the radiation's amplitude
+                /// this functor covers both possible interpretations:
+                /// with Exponent=Cube the integration over t_ret will be assumed (old FFT)
+                /// with Exponent=Square the integration over t_sim will be assumed (old DFT)
+
+                HDINLINE vector_64
+                operator()(const vector_64& n, const Particle& particle, const picongpu::float_64 delta_t) const
+                {
+                    const vector_64 beta(particle.get_beta<When::now>()); // beta = v/c
+                    const vector_64 beta_dot(
+                        (beta - particle.get_beta<When::now + 1>())
+                        / delta_t); // numeric differentiation (backward difference)
+                    const Exponent exponent; // instance of the Exponent functor (original author's open
+                                             // question: is it a static class, so that no instance is needed?)
+                    const picongpu::float_64 factor(exponent(1.0 / (One_minus_beta_times_n()(n, particle))));
+                    // factor=1/(1-beta*n)^g   g=2 for DFT and g=3 for FFT
+                    return (n % ((n - beta) % beta_dot)) * factor;
+                }
+            };
+
+            // typedef of all possible forms of Old_Method
+            // typedef Old_Method<util::Cube<picongpu::float_64> > Old_FFT;
+            typedef Old_Method<util::Square<picongpu::float_64>> Old_DFT;
+
+
+            // ------- Calculate Amplitude class ------------- //
+
+            template<typename TimeCalc, typename VecCalc>
+            class Calc_Amplitude
+            {
+                /// final class for amplitude calculations
+                /// combines two functor types handed in as template parameters:
+                /// TimeCalc calculates the retarded time (possibilities: Retarded_Time_1 and
+                /// Retarded_Time_2), VecCalc the real vector part of the amplitude (possibilities:
+                /// Old_FFT, Old_DFT, Partial_Integral_Method_1, Partial_Integral_Method_2)
+            public:
+                /// constructor
+                // the parameter list is kept general on purpose:
+                // not every combination of TimeCalc and VecCalc
+                // needs all of the values stored here
+
+                HDINLINE Calc_Amplitude(
+                    const Particle& particle,
+                    const picongpu::float_64 delta_t,
+                    const picongpu::float_64 t_sim)
+                    : m_particle(particle)
+                    , m_delta_t(delta_t)
+                    , m_t_sim(t_sim)
+                {
+                }
+
+                // real vector part of the amplitude for the direction n
+
+                HDINLINE vector_64 get_vector(const vector_64& n) const
+                {
+                    const vector_64 unitDirection(n.unit_vec()); // make sure a unit vector is handed on
+                    VecCalc vectorFunctor;
+                    return vectorFunctor(unitDirection, m_particle, m_delta_t);
+                }
+
+                // retarded time for the direction look_direction
+
+                HDINLINE picongpu::float_64 get_t_ret(const vector_64 look_direction) const
+                {
+                    // unlike get_vector(), look_direction is passed on as given;
+                    // the functor also receives the stored simulation time and particle
+                    TimeCalc timeFunctor;
+
+                    return timeFunctor(m_t_sim, look_direction, m_particle);
+                }
+
+            private:
+                // data:
+                const Particle& m_particle; // the particle (held by reference, not copied)
+                const picongpu::float_64 m_delta_t; // length of one time step in simulation
+                const picongpu::float_64 m_t_sim; // simulation time (for methods not using index*delta_t )
+            };
+
+        } // namespace radiation
+    } // namespace plugins
 } // namespace picongpu
diff --git a/include/picongpu/plugins/radiation/check_consistency.hpp b/include/picongpu/plugins/radiation/check_consistency.hpp
index 635db595f1..d3cc7f03ae 100644
--- a/include/picongpu/plugins/radiation/check_consistency.hpp
+++ b/include/picongpu/plugins/radiation/check_consistency.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Rene Widera, Richard Pausch
+/* Copyright 2013-2021 Rene Widera, Richard Pausch
  *
  * This file is part of PIConGPU.
  *
@@ -25,22 +25,21 @@
 
 namespace picongpu
 {
-namespace plugins
-{
-namespace radiation
-{
-
-HINLINE void check_consistency(void)
-{
-    using namespace parameters;
-    std::cout << " checking efficiency of radiation code: " ;
-    if(radiation_frequencies::N_omega%radiation_frequencies::blocksize_omega == 0)
-        std::cout << "OK" << std::endl;
-    else
-        std::cout << "better use power of two for N_omega" << std::endl;
-    // \@todo is there a way to do this with  compile time asserts???
-}
+    namespace plugins
+    {
+        namespace radiation
+        {
+            HINLINE void check_consistency(void)
+            {
+                using namespace parameters;
+                // the check passes when N_omega is a whole multiple of blocksize_omega
+                const bool isMultipleOfBlockSize
+                    = (radiation_frequencies::N_omega % radiation_frequencies::blocksize_omega) == 0;
+                const char* const verdict = isMultipleOfBlockSize ? "OK" : "better use power of two for N_omega";
+                std::cout << " checking efficiency of radiation code: " << verdict << std::endl;
+                // \@todo is there a way to do this with  compile time asserts???
+            }
 
-} // namespace radiation
-} // namespace plugins
+        } // namespace radiation
+    } // namespace plugins
 } // namespace picongpu
diff --git a/include/picongpu/plugins/radiation/debug/PIConGPUVerboseLogRadiation.hpp b/include/picongpu/plugins/radiation/debug/PIConGPUVerboseLogRadiation.hpp
index c34fb0440e..c10fc8ff99 100644
--- a/include/picongpu/plugins/radiation/debug/PIConGPUVerboseLogRadiation.hpp
+++ b/include/picongpu/plugins/radiation/debug/PIConGPUVerboseLogRadiation.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Rene Widera, Richard Pausch
+/* Copyright 2013-2021 Rene Widera, Richard Pausch
  *
  * This file is part of PIConGPU.
  *
@@ -18,7 +18,6 @@
  */
 
 
-
 #pragma once
 
 #include <stdint.h>
@@ -26,26 +25,21 @@
 
 namespace picongpu
 {
-namespace plugins
-{
-namespace radiation
-{
-
-/*create verbose class*/
-DEFINE_VERBOSE_CLASS(PIConGPUVerboseRadiation)
-(
-    /* define log levels for later use
-     * e.g. log<pmaccLogLvl::NOTHING>("TEXT");*/
-    DEFINE_LOGLVL(0,NOTHING);
-    DEFINE_LOGLVL(1,PHYSICS);
-    DEFINE_LOGLVL(2,SIMULATION_STATE);
-    DEFINE_LOGLVL(4,MEMORY);
-    DEFINE_LOGLVL(8,CRITICAL);
-)
-/*set default verbose levels (integer number)*/
-(NOTHING::lvl|PIC_VERBOSE_RADIATION);
-
-} // namespace radiation
-} // namespace plugins
+    namespace plugins
+    {
+        namespace radiation
+        {
+            /*create verbose class*/
+            DEFINE_VERBOSE_CLASS(PIConGPUVerboseRadiation)
+            (
+                /* define log levels for later use
+                 * e.g. log<pmaccLogLvl::NOTHING>("TEXT");*/
+                DEFINE_LOGLVL(0, NOTHING); DEFINE_LOGLVL(1, PHYSICS); DEFINE_LOGLVL(2, SIMULATION_STATE);
+                DEFINE_LOGLVL(4, MEMORY);
+                DEFINE_LOGLVL(8, CRITICAL);)
+                /*set default verbose levels (integer number)*/
+                (NOTHING::lvl | PIC_VERBOSE_RADIATION);
+
+        } // namespace radiation
+    } // namespace plugins
 } // namespace picongpu
-
diff --git a/include/picongpu/plugins/radiation/frequencies/radiation_lin_freq.hpp b/include/picongpu/plugins/radiation/frequencies/radiation_lin_freq.hpp
index 3b540c8c08..cd30b3270d 100644
--- a/include/picongpu/plugins/radiation/frequencies/radiation_lin_freq.hpp
+++ b/include/picongpu/plugins/radiation/frequencies/radiation_lin_freq.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera, Richard Pausch
+/* Copyright 2013-2021 Heiko Burau, Rene Widera, Richard Pausch
  *
  * This file is part of PIConGPU.
  *
@@ -24,50 +24,50 @@
 
 namespace picongpu
 {
-namespace plugins
-{
-namespace radiation
-{
-
-namespace linear_frequencies
-{
-
-
-    class FreqFunctor
+    namespace plugins
     {
-    public:
-      FreqFunctor(void)
-      { }
+        namespace radiation
+        {
+            namespace linear_frequencies
+            {
+                class FreqFunctor
+                {
+                public:
+                    FreqFunctor(void)
+                    {
+                    }
 
-      HDINLINE float_X operator()(const int ID)
-      {
-          return omega_min + float_X(ID) * delta_omega;
-      }
+                    HDINLINE float_X operator()(const int ID)
+                    {
+                        return omega_min + float_X(ID) * delta_omega;
+                    }
 
-      HINLINE float_X get(const int ID)
-      {
-          return operator()(ID);
-      }
-    };
+                    HINLINE float_X get(const int ID)
+                    {
+                        return operator()(ID);
+                    }
+                };
 
 
-    class InitFreqFunctor
-    {
-    public:
-      InitFreqFunctor(void)
-      { }
+                class InitFreqFunctor
+                {
+                public:
+                    InitFreqFunctor(void)
+                    {
+                    }
 
-      HINLINE void Init(const std::string path )
-      { }
+                    HINLINE void Init(const std::string path)
+                    {
+                    }
 
 
-      HINLINE FreqFunctor getFunctor(void)
-      {
-    return FreqFunctor();
-      }
-    };
+                    HINLINE FreqFunctor getFunctor(void)
+                    {
+                        return FreqFunctor();
+                    }
+                };
 
-} // namespace linear_frequencies
-} // namespace radiation
-} // namespace plugins
+            } // namespace linear_frequencies
+        } // namespace radiation
+    } // namespace plugins
 } // namespace picongpu
diff --git a/include/picongpu/plugins/radiation/frequencies/radiation_list_freq.hpp b/include/picongpu/plugins/radiation/frequencies/radiation_list_freq.hpp
index 65608a9aa9..aecf48003c 100644
--- a/include/picongpu/plugins/radiation/frequencies/radiation_list_freq.hpp
+++ b/include/picongpu/plugins/radiation/frequencies/radiation_list_freq.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera, Richard Pausch, Axel Huebl
+/* Copyright 2013-2021 Heiko Burau, Rene Widera, Richard Pausch, Axel Huebl
  *
  * This file is part of PIConGPU.
  *
@@ -27,108 +27,106 @@
 
 namespace picongpu
 {
-namespace plugins
-{
-namespace radiation
-{
-namespace frequencies_from_list
-{
-
-
-    class FreqFunctor
+    namespace plugins
     {
-    public:
-
-      typedef GridBuffer<float_X, DIM1>::DataBoxType DBoxType;
-
-      FreqFunctor(void)
-      { }
-
-      template< typename T >
-      FreqFunctor(T frequencies_handed)
-      {
-          this->frequencies_dev = frequencies_handed->getDeviceBuffer().getDataBox();
-          this->frequencies_host = frequencies_handed->getHostBuffer().getDataBox();
-      }
-
-      DINLINE float_X operator()(const unsigned int ID)
-      {
-          return (ID < radiation_frequencies::N_omega) ?  frequencies_dev[ID] : 0.0  ;
-      }
-
-      HINLINE float_X get(const unsigned int ID)
-      {
-          return (ID < radiation_frequencies::N_omega) ?  frequencies_host[ID] : 0.0  ;
-      }
-
-    private:
-      DBoxType frequencies_dev;
-      DBoxType frequencies_host;
-
-    };
-
-
-
-    class InitFreqFunctor
-    {
-    public:
-      InitFreqFunctor(void)
-      { }
-
-      ~InitFreqFunctor(void)
-      {
-           __delete(frequencyBuffer);
-      }
-
-      typedef GridBuffer<picongpu::float_X, DIM1>::DataBoxType DBoxType;
-
-      HINLINE void Init(const std::string path )
-      {
-
-          frequencyBuffer = new GridBuffer<float_X, DIM1>(DataSpace<DIM1> (N_omega));
-
-
-          DBoxType frequencyDB = frequencyBuffer->getHostBuffer().getDataBox();
-
-          std::ifstream freqListFile(path.c_str());
-          unsigned int i;
-
-          printf("freq: %s\n", path.c_str());
-
-          if(!freqListFile)
-          {
-              throw std::runtime_error(std::string("The radiation-frequency-file ") + path + std::string(" could not be found.\n"));
-          }
-
-
-          for(i=0; i<N_omega && !freqListFile.eof(); ++i)
-          {
-              freqListFile >> frequencyDB[i];
-              // verbose output of loaded frequencies if verbose level PHYSICS is set:
-              log<PIConGPUVerboseRadiation::PHYSICS >("freq: %1% \t %2%") % i % frequencyDB[i];
-              frequencyDB[i] *= UNIT_TIME;
-          }
-
-          if(i != N_omega)
-          {
-              throw std::runtime_error(std::string("The number of frequencies in the list and the number of frequencies in the parameters differ.\n"));
-          }
-
-          frequencyBuffer->hostToDevice();
-
-      }
-
-      FreqFunctor getFunctor(void)
-      {
-          return FreqFunctor(frequencyBuffer);
-      }
-
-    private:
-      GridBuffer<float_X, DIM1>* frequencyBuffer;
-    };
-
-
-} // namespace frequencies_from_list
-} // namespace radiation
-} // namespace plugins
+        namespace radiation
+        {
+            namespace frequencies_from_list
+            {
+                // frequency lookup on host and device; IDs outside of the list yield zero
+                class FreqFunctor
+                {
+                public:
+                    using DBoxType = GridBuffer<float_X, DIM1>::DataBoxType;
+                    FreqFunctor(void)
+                    {
+                    }
+
+                    template<typename T>
+                    FreqFunctor(T frequencies_handed)
+                        : frequencies_dev(frequencies_handed->getDeviceBuffer().getDataBox())
+                        , frequencies_host(frequencies_handed->getHostBuffer().getDataBox())
+                    {
+                    }
+
+                    DINLINE float_X operator()(const unsigned int ID)
+                    {
+                        return (ID >= radiation_frequencies::N_omega) ? 0.0 : frequencies_dev[ID];
+                    }
+
+                    HINLINE float_X get(const unsigned int ID)
+                    {
+                        return (ID >= radiation_frequencies::N_omega) ? 0.0 : frequencies_host[ID];
+                    }
+
+                private:
+                    DBoxType frequencies_dev;
+                    DBoxType frequencies_host;
+                };
+
+
+                /** reads the frequency list file and provides the FreqFunctor working on it */
+                class InitFreqFunctor
+                {
+                public:
+                    InitFreqFunctor(void)
+                    {
+                    }
+
+                    ~InitFreqFunctor(void)
+                    {
+                        __delete(frequencyBuffer);
+                    }
+
+                    typedef GridBuffer<picongpu::float_X, DIM1>::DataBoxType DBoxType;
+
+                    /** load N_omega frequencies from the text file `path` and copy them to the device;
+                     * throws std::runtime_error if the file cannot be opened or holds too few readable values */
+                    HINLINE void Init(const std::string path)
+                    {
+                        // release the buffer of a previous Init() call instead of leaking it
+                        __delete(frequencyBuffer);
+                        frequencyBuffer = new GridBuffer<float_X, DIM1>(DataSpace<DIM1>(N_omega));
+                        DBoxType frequencyDB = frequencyBuffer->getHostBuffer().getDataBox();
+                        std::ifstream freqListFile(path.c_str());
+                        printf("freq: %s\n", path.c_str());
+
+                        if(!freqListFile)
+                        {
+                            throw std::runtime_error(
+                                std::string("The radiation-frequency-file ") + path
+                                + std::string(" could not be found.\n"));
+                        }
+
+                        // a failed or missing read ends the loop, so i counts only the values actually parsed
+                        unsigned int i;
+                        for(i = 0; i < N_omega && (freqListFile >> frequencyDB[i]); ++i)
+                        {
+                            // verbose output of loaded frequencies if verbose level PHYSICS is set:
+                            log<PIConGPUVerboseRadiation::PHYSICS>("freq: %1% \t %2%") % i % frequencyDB[i];
+                            frequencyDB[i] *= UNIT_TIME;
+                        }
+
+                        if(i != N_omega)
+                        {
+                            throw std::runtime_error(std::string("The number of frequencies in the list and the "
+                                                                 "number of frequencies in the parameters differ.\n"));
+                        }
+
+                        frequencyBuffer->hostToDevice();
+                    }
+
+                    FreqFunctor getFunctor(void)
+                    {
+                        return FreqFunctor(frequencyBuffer);
+                    }
+
+                private:
+                    GridBuffer<float_X, DIM1>* frequencyBuffer = nullptr; // stays null until Init() is called
+                };
+
+
+            } // namespace frequencies_from_list
+        } // namespace radiation
+    } // namespace plugins
 } // namespace picongpu
diff --git a/include/picongpu/plugins/radiation/frequencies/radiation_log_freq.hpp b/include/picongpu/plugins/radiation/frequencies/radiation_log_freq.hpp
index f3ddb5432b..fc9787f0bd 100644
--- a/include/picongpu/plugins/radiation/frequencies/radiation_log_freq.hpp
+++ b/include/picongpu/plugins/radiation/frequencies/radiation_log_freq.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera, Richard Pausch
+/* Copyright 2013-2021 Heiko Burau, Rene Widera, Richard Pausch
  *
  * This file is part of PIConGPU.
  *
@@ -24,57 +24,57 @@
 
 namespace picongpu
 {
-namespace plugins
-{
-namespace radiation
-{
-namespace log_frequencies
-{
-
-
-    class FreqFunctor
-    {
-    public:
-      FreqFunctor(void)
-      {
-          omega_log_min = math::log(omega_min);
-          delta_omega_log = (math::log(omega_max) - omega_log_min) / float_X(N_omega - 1);
-       }
-
-      HDINLINE float_X operator()(const int ID)
-      {
-          return  math::exp(omega_log_min + (float_X(ID)) * delta_omega_log) ;
-      }
-
-      HINLINE float_X get(const int ID)
-      {
-          return operator()(ID);
-      }
-
-    private:
-      float_X omega_log_min;
-      float_X delta_omega_log;
-    };
-
-
-    class InitFreqFunctor
+    namespace plugins
     {
-    public:
-      InitFreqFunctor(void)
-      { }
-
-      HINLINE void Init(const std::string path )
-      { }
-
-
-      HINLINE FreqFunctor getFunctor(void)
-      {
-          return FreqFunctor();
-      }
-    };
-
-
-} // namespace log_frequencies
-} // namespace radiation
-} // namespace plugins
+        namespace radiation
+        {
+            namespace log_frequencies
+            {
+                class FreqFunctor // frequency for index ID: exp(log(omega_min) + ID * delta_omega_log)
+                {
+                public:
+                    FreqFunctor(void) // precomputes the logarithmic start value and step width
+                    {
+                        omega_log_min = math::log(omega_min);
+                        delta_omega_log = (math::log(omega_max) - omega_log_min) / float_X(N_omega - 1);
+                    }
+
+                    HDINLINE float_X operator()(const int ID) // ID = 0 yields exp(omega_log_min)
+                    {
+                        return math::exp(omega_log_min + (float_X(ID)) * delta_omega_log);
+                    }
+
+                    HINLINE float_X get(const int ID) // forwards to operator()
+                    {
+                        return operator()(ID);
+                    }
+
+                private:
+                    float_X omega_log_min; // log(omega_min)
+                    float_X delta_omega_log; // (log(omega_max) - log(omega_min)) / (N_omega - 1)
+                };
+
+
+                class InitFreqFunctor // creates FreqFunctor objects; holds no state
+                {
+                public:
+                    InitFreqFunctor(void)
+                    {
+                    }
+
+                    HINLINE void Init(const std::string path) // empty body: path is not used
+                    {
+                    }
+
+
+                    HINLINE FreqFunctor getFunctor(void) // returns a default-constructed FreqFunctor
+                    {
+                        return FreqFunctor();
+                    }
+                };
+
+
+            } // namespace log_frequencies
+        } // namespace radiation
+    } // namespace plugins
 } // namespace picongpu
diff --git a/include/picongpu/plugins/radiation/nyquist_low_pass.hpp b/include/picongpu/plugins/radiation/nyquist_low_pass.hpp
index 097e5c253c..2c69a9524d 100644
--- a/include/picongpu/plugins/radiation/nyquist_low_pass.hpp
+++ b/include/picongpu/plugins/radiation/nyquist_low_pass.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera, Richard Pausch
+/* Copyright 2013-2021 Heiko Burau, Rene Widera, Richard Pausch
  *
  * This file is part of PIConGPU.
  *
@@ -26,45 +26,43 @@
 
 namespace picongpu
 {
-namespace plugins
-{
-namespace radiation
-{
-
-class NyquistLowPass : public One_minus_beta_times_n
-{
-
-public:
-    /**
-     * calculates \f$omega_{Nyquist}\f$ for particle in a direction \f$n\f$
-     * \f$omega_{Nyquist} = (\pi - \epsilon )/(\delta t * (1 - \vec(\beta) * \vec(n)))\f$
-     * so that all Amplitudes for higher frequencies can be ignored
-    **/
-    HDINLINE NyquistLowPass(const vector_64& n, const Particle& particle)
-      : omegaNyquist((PI - 0.01)/
-           (DELTA_T *
-            One_minus_beta_times_n()(n, particle)))
-    { }
+    namespace plugins
+    {
+        namespace radiation
+        {
+            class NyquistLowPass : public One_minus_beta_times_n
+            {
+            public:
+                /**
+                 * calculates \f$omega_{Nyquist}\f$ for a particle in direction \f$n\f$, with epsilon = 0.01:
+                 * \f$omega_{Nyquist} = (\pi - \epsilon )/(\delta t * (1 - \vec(\beta) * \vec(n)))\f$
+                 * so that all amplitudes for higher frequencies can be ignored
+                 **/
+                HDINLINE NyquistLowPass(const vector_64& n, const Particle& particle)
+                    : omegaNyquist((PI - 0.01) / (DELTA_T * One_minus_beta_times_n()(n, particle)))
+                {
+                }
 
-    /**
-     * default constructor - needed for allocating shared memory on GPU (Radiation.hpp kernel)
-    **/
-    HDINLINE NyquistLowPass(void)
-    { }
+                /**
+                 * default constructor - needed for allocating shared memory on GPU (Radiation.hpp kernel)
+                 **/
+                HDINLINE NyquistLowPass(void) // omegaNyquist is left uninitialized
+                {
+                }
 
 
-    /**
-     * checks if frequency omega is below Nyquist frequency
-    **/
-    HDINLINE bool check(const float_32 omega)
-    {
-        return omega < omegaNyquist * radiationNyquist::NyquistFactor;
-    }
+                /**
+                 * checks if omega is below omegaNyquist * radiationNyquist::NyquistFactor
+                 **/
+                HDINLINE bool check(const float_32 omega)
+                {
+                    return omega < omegaNyquist * radiationNyquist::NyquistFactor;
+                }
 
-private:
-    float_32 omegaNyquist; // Nyquist frequency for a particle (at a certain time step) for one direction
-};
+            private:
+                float_32 omegaNyquist; // Nyquist frequency for a particle (at a certain time step) for one direction
+            };
 
-} // namespace radiation
-} // namespace plugins
+        } // namespace radiation
+    } // namespace plugins
 } // namespace picongpu
diff --git a/include/picongpu/plugins/radiation/particle.hpp b/include/picongpu/plugins/radiation/particle.hpp
index c7262f797d..5c623a87ee 100644
--- a/include/picongpu/plugins/radiation/particle.hpp
+++ b/include/picongpu/plugins/radiation/particle.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera, Richard Pausch
+/* Copyright 2013-2021 Heiko Burau, Rene Widera, Richard Pausch
  *
  * This file is part of PIConGPU.
  *
@@ -28,151 +28,160 @@
 
 namespace picongpu
 {
-namespace plugins
-{
-namespace radiation
-{
-
-class When
-{
-    // a enum to describe all needed times
-public:
-
-    enum
-    {
-        first = 0u, now = 1u, old = 2u, older = 3u
-    };
-};
-
-class Particle : protected Taylor // Taylor includes just some methodes (no real derived class)
-{
-public:
-    //////////////////////////////////////////////////////////////////
-    // data:
-    // the first time (in above order) to be stored
-
-    enum
-    {
-        location_begin = When::now, momentum_begin = When::now, beta_begin = When::first
-    };
-    const vector_X momentum_now;
-    const vector_X momentum_old;
-    const vector_X location_now;
-    const picongpu::float_X mass;
-
-public:
-    //////////////////////////////////////////////////////////////////
-    // constructors:
-
-  HDINLINE Particle(const vector_X& locationNow_set, const vector_X& momentumOld_set, const vector_X& momentumNow_set, const picongpu::float_X mass_set)
-    : location_now(locationNow_set), momentum_old(momentumOld_set), momentum_now(momentumNow_set), mass(mass_set)
-    {
-
-    }
-
-
-    //////////////////////////////////////////////////////////////////
-    // getters:
-
-    template<unsigned int when>
-    HDINLINE vector_64 get_location(void) const;
-    // get location at time when
-
-    template<unsigned int when>
-    HDINLINE vector_64 get_momentum(void) const;
-    // get momentum at time when
-
-    template<unsigned int when>
-    HDINLINE vector_64 get_beta(void) const
-    {
-        return calc_beta(get_momentum<when > ());
-    } // get beta at time when except:
-    // first --> is specialized below
-
-    template<unsigned int when>
-    HDINLINE picongpu::float_64 get_gamma(void) const
-    {
-        return calc_gamma(get_momentum<when > ());
-    } // get gamma at time when
-
-    template<unsigned int when>
-    HDINLINE picongpu::float_64 get_gamma_inv_square(void) const
+    namespace plugins
     {
-        return calc_gamma_inv_square(get_momentum<when > ());
-    } // get 1/gamma^2
-
-    template< unsigned int when>
-    HDINLINE picongpu::float_64 get_cos_theta(const vector_64& n) const
-    {
-        // get cos(theta) at time when
-        const vector_64 beta = get_beta<when > ();
-        return calc_cos_theta(n, beta);
-    }
-
-
-private:
-    //////////////////////////////////////////////////////////////////
-    // private methods:
-
-    HDINLINE vector_64 calc_beta(const vector_X& momentum) const
-    {
-        // returns beta=v/c
-        const picongpu::float_32 gamma1 = calc_gamma(momentum);
-        return momentum * (1.0 / (mass * picongpu::SPEED_OF_LIGHT * gamma1));
-    }
-
-    HDINLINE picongpu::float_64 calc_gamma(const vector_X& momentum) const
-    {
-        // return gamma = E/(mc^2)
-        const picongpu::float_32 x = util::square<vector_X, picongpu::float_32 > (momentum * (1.0 / (mass * picongpu::SPEED_OF_LIGHT)));
-        return picongpu::math::sqrt(1.0 + x);
-
-    }
-
-    HDINLINE picongpu::float_64 calc_gamma_inv_square(const vector_X& momentum) const
-    {
-        // returns 1/gamma^2 = m^2*c^2/(m^2*c^2 + p^2)
-        const picongpu::float_32 Emass = mass * picongpu::SPEED_OF_LIGHT;
-        return Emass / (Emass + (util::square<vector_X, picongpu::float_32 > (momentum)) / Emass);
-    }
-
-    HDINLINE picongpu::float_64 calc_cos_theta(const vector_64& n, const vector_64& beta) const
-    {
-        // return cos of angle between looking and flight direction
-        return (n * beta) / (std::sqrt(beta * beta));
-    }
-
-
-    // setters:
-
-    HDINLINE picongpu::float_64 summand(void) const
-    {
-        // return \vec n independend summand (next value to add to \vec n independend sum)
-        const picongpu::float_64 x = get_gamma_inv_square<When::now > ();
-        return Taylor()(x);
-    }
-
-}; // end of Particle definition
-
-
-template<>
-HDINLINE vector_64 Particle::get_location<When::now>(void) const
-{
-    return location_now;
-} // get location at time when
-
-template<>
-HDINLINE vector_64 Particle::get_momentum<When::now>(void) const
-{
-    return momentum_now;
-} // get momentum at time when
-
-template<>
-HDINLINE vector_64 Particle::get_momentum<When::old>(void) const
-{
-    return momentum_old;
-} // get momentum at time when
-
-} // namespace radiation
-} // namespace plugins
+        namespace radiation
+        {
+            class When
+            {
+                // an enum to describe all needed times
+            public:
+                enum
+                {
+                    first = 0u,
+                    now = 1u,
+                    old = 2u,
+                    older = 3u
+                };
+            };
+
+            class Particle : protected Taylor // Taylor only provides some methods (no real derived class)
+            {
+            public:
+                //////////////////////////////////////////////////////////////////
+                // data:
+                // the first time (in above order) to be stored
+
+                enum
+                {
+                    location_begin = When::now,
+                    momentum_begin = When::now,
+                    beta_begin = When::first
+                };
+                const vector_X momentum_now;
+                const vector_X momentum_old;
+                const vector_X location_now;
+                const picongpu::float_X mass;
+
+            public:
+                //////////////////////////////////////////////////////////////////
+                // constructors:
+
+                HDINLINE Particle(
+                    const vector_X& locationNow_set,
+                    const vector_X& momentumOld_set,
+                    const vector_X& momentumNow_set,
+                    const picongpu::float_X mass_set)
+                    : location_now(locationNow_set) // members are initialized in declaration order, not list order
+                    , momentum_old(momentumOld_set)
+                    , momentum_now(momentumNow_set)
+                    , mass(mass_set)
+                {
+                }
+
+
+                //////////////////////////////////////////////////////////////////
+                // getters:
+
+                template<unsigned int when>
+                HDINLINE vector_64 get_location(void) const;
+                // get location at time when
+
+                template<unsigned int when>
+                HDINLINE vector_64 get_momentum(void) const;
+                // get momentum at time when
+
+                template<unsigned int when>
+                HDINLINE vector_64 get_beta(void) const
+                {
+                    return calc_beta(get_momentum<when>());
+                } // get beta at time when except:
+                // first --> NOTE(review): no get_beta<When::first> specialization is visible below
+
+                template<unsigned int when>
+                HDINLINE picongpu::float_64 get_gamma(void) const
+                {
+                    return calc_gamma(get_momentum<when>());
+                } // get gamma at time when
+
+                template<unsigned int when>
+                HDINLINE picongpu::float_64 get_gamma_inv_square(void) const
+                {
+                    return calc_gamma_inv_square(get_momentum<when>());
+                } // get 1/gamma^2
+
+                template<unsigned int when>
+                HDINLINE picongpu::float_64 get_cos_theta(const vector_64& n) const
+                {
+                    // get cos(theta) at time when
+                    const vector_64 beta = get_beta<when>();
+                    return calc_cos_theta(n, beta);
+                }
+
+
+            private:
+                //////////////////////////////////////////////////////////////////
+                // private methods:
+
+                HDINLINE vector_64 calc_beta(const vector_X& momentum) const
+                {
+                    // returns beta=v/c
+                    const picongpu::float_32 gamma1 = calc_gamma(momentum); // float_64 result narrowed to float_32
+                    return momentum * (1.0 / (mass * picongpu::SPEED_OF_LIGHT * gamma1));
+                }
+
+                HDINLINE picongpu::float_64 calc_gamma(const vector_X& momentum) const
+                {
+                    // return gamma = E/(mc^2)
+                    const picongpu::float_32 x = util::square<vector_X, picongpu::float_32>(
+                        momentum * (1.0 / (mass * picongpu::SPEED_OF_LIGHT)));
+                    return picongpu::math::sqrt(1.0 + x);
+                }
+
+                HDINLINE picongpu::float_64 calc_gamma_inv_square(const vector_X& momentum) const
+                {
+                    // returns 1/gamma^2 = m^2*c^2/(m^2*c^2 + p^2)
+                    const picongpu::float_32 Emass = mass * picongpu::SPEED_OF_LIGHT; // m*c
+                    return Emass / (Emass + (util::square<vector_X, picongpu::float_32>(momentum)) / Emass);
+                }
+
+                HDINLINE picongpu::float_64 calc_cos_theta(const vector_64& n, const vector_64& beta) const
+                {
+                    // return cos of angle between looking and flight direction
+                    return (n * beta) / (std::sqrt(beta * beta));
+                }
+
+
+                // further private helpers:
+
+                HDINLINE picongpu::float_64 summand(void) const
+                {
+                    // return \vec n independent summand (next value to add to \vec n independent sum)
+                    const picongpu::float_64 x = get_gamma_inv_square<When::now>();
+                    return Taylor()(x);
+                }
+
+            }; // end of Particle definition
+
+
+            template<>
+            HDINLINE vector_64 Particle::get_location<When::now>(void) const
+            {
+                return location_now;
+            } // location at When::now (stored as vector_X, returned as vector_64)
+
+            template<>
+            HDINLINE vector_64 Particle::get_momentum<When::now>(void) const
+            {
+                return momentum_now;
+            } // momentum at When::now (stored as vector_X, returned as vector_64)
+
+            template<>
+            HDINLINE vector_64 Particle::get_momentum<When::old>(void) const
+            {
+                return momentum_old;
+            } // momentum at When::old (stored as vector_X, returned as vector_64)
+
+        } // namespace radiation
+    } // namespace plugins
 } // namespace picongpu
diff --git a/include/picongpu/plugins/radiation/radFormFactor.hpp b/include/picongpu/plugins/radiation/radFormFactor.hpp
index 46c195276a..4a150f943c 100644
--- a/include/picongpu/plugins/radiation/radFormFactor.hpp
+++ b/include/picongpu/plugins/radiation/radFormFactor.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera, Richard Pausch
+/* Copyright 2013-2021 Heiko Burau, Rene Widera, Richard Pausch
  *
  * This file is part of PIConGPU.
  *
@@ -25,193 +25,206 @@
 
 namespace picongpu
 {
-namespace plugins
-{
-namespace radiation
-{
-
-namespace radFormFactor_baseShape_3D
-{
-    /** general form factor class of discrete charge distribution of PIC particle shape of order T_shapeOrder
-     *
-     * @tparam T_shapeOrder order of charge distribution shape in PIC code used for radiation form factor
-     */
-
-    template< uint32_t T_shapeOrder >
-    struct radFormFactor
-    {
-      /** Form Factor for T_shapeOrder-order particle shape charge distribution of N discrete electrons:
-       * \f[ | \mathcal{F} |^2 = N + (N*N - N) * (sinc^2(n_x * L_x * \omega) * sinc^2(n_y * L_y * \omega) * sinc^2(n_z * L_z * \omega))^T_shapeOrder \f]
-       *
-       * with observation direction (unit vector) \f$ \vec{n} = (n_x, n_y, n_z) \f$
-       * and with:
-       * @param N     = weighting
-       * @param omega = frequency
-       * @param L_d   = the size of the CIC-particle / cell in dimension d
-       *
-       * @param N = macro particle weighting
-       * @param omega = frequency at which to calculate the  form factor
-       * @param observer_unit_vec = observation direction
-       * @return the Form Factor: \f$ \sqrt( | \mathcal{F} |^2 ) \f$
-       */
-      HDINLINE float_X operator()( const float_X N, const float_X omega, vector_X const & observer_unit_vec ) const
-      {
-          float_X sincValue = float_X( 1.0 );
-          for( uint32_t d = 0; d < DIM3; ++d )
-              sincValue *= math::sinc( observer_unit_vec[d] * cellSize[d] / ( SPEED_OF_LIGHT * float_X( 2.0 ) ) * omega );
-
-          // here we combine sinc^2(..) with (...)^T_shapeOrder to ...^(2 * T_shapeOrder)
-          return math::sqrt( N + ( N * N - N ) * util::pow( sincValue , 2 * T_shapeOrder ) );
-      }
-    };
-} // namespace radFormFactor_baseShape_3D
-
-
-namespace radFormFactor_CIC_3D
-{
-    struct radFormFactor : public radFormFactor_baseShape_3D::radFormFactor< 1 >
-    { };
-} // namespace radFormFactor_CIC_3D
-
-namespace radFormFactor_TSC_3D
-{
-    struct radFormFactor : public radFormFactor_baseShape_3D::radFormFactor< 2 >
-    { };
-} // namespace radFormFactor_TSC_3D
-
-namespace radFormFactor_PCS_3D
-{
-    struct radFormFactor : public radFormFactor_baseShape_3D::radFormFactor< 3 >
-    { };
-} // namespace radFormFactor_PCS_3D
-
-
-namespace radFormFactor_CIC_1Dy
-{
-    struct radFormFactor
-    {
-      /** Form Factor for 1-d CIC charge distribution iy y of N discrete electrons:
-       * \f[ | \mathcal{F} |^2 = N + (N*N - N) * sinc^2(n_y * L_y * \omega) \f]
-       *
-       * with observation direction (unit vector) \f$ \vec{n} = (n_x, n_y, n_z) \f$
-       * and with:
-       * @param N     = weighting
-       * @param omega = frequency
-       * @param L_d   = the size of the CIC-particle / cell in dimension d
-       *
-       * @param N = macro particle weighting
-       * @param omega = frequency at which to calculate the  form factor
-       * @param observer_unit_vec = observation direction
-       * @return the Form Factor: \f$ \sqrt( | \mathcal{F} |^2 ) \f$
-       */
-      HDINLINE float_X operator()(const float_X N, const float_X omega, const vector_X observer_unit_vec) const
-      {
-          return math::sqrt(
-              N + ( N * N - N ) * util::square(
-                  math::sinc( CELL_HEIGHT / ( SPEED_OF_LIGHT * float_X( 2.0 ) ) * omega )
-              )
-          );
-      }
-    };
-} // namespace radFormFactor_CIC_1Dy
-
-
-namespace radFormFactor_Gauss_spherical
-{
-    struct radFormFactor
-    {
-      /** Form Factor for point-symmetric Gauss-shaped charge distribution of N discrete electrons:
-        * \f[ <rho(r)> = N*q_e* 1/sqrt(2*pi*sigma^2) * exp(-0.5 * r^2/sigma^2) \f]
-        * with sigma = 0.5*c/delta_t (0.5 because sigma is defined around center)
-        *
-        * @param N = macro particle weighting
-        * @param omega = frequency at which to calculate the  form factor
-        * @param observer_unit_vec = observation direction
-        * @return the Form Factor: \f$ \sqrt( | \mathcal{F} |^2 ) \f$
-        */
-      HDINLINE float_X operator()(const float_X N, const float_X omega, const vector_X observer_unit_vec) const
-      {
-          /* currently a fixed sigma of DELTA_T * c is used to describe the distribution - might become a parameter */
-          return math::sqrt(
-              N + ( N * N - N ) * util::square(
-                  math::exp( float_X( -0.5 ) * util::square( omega * float_X( 0.5 ) * DELTA_T ) )
-              )
-          );
-      }
-    };
-} // namespace radFormFactor_Gauss_spherical
-
-
-namespace radFormFactor_Gauss_cell
-{
-    struct radFormFactor
-    {
-      /** Form Factor for per-dimension Gauss-shaped charge distribution of N discrete electrons:
-        * \f[ <rho(r)> = N*q_e* product[d={x,y,z}](1/sqrt(2*pi*sigma_d^2) * exp(-0.5 * d^2/sigma_d^2)) \f]
-        * with sigma_d = 0.5*cell_width_d*n_d
-        *
-        * @param N = macro particle weighting
-        * @param omega = frequency at which to calculate the  form factor
-        * @param observer_unit_vec = observation direction
-        * @return the Form Factor: \f$ \sqrt( | \mathcal{F} |^2 ) \f$
-        */
-      HDINLINE float_X operator()(const float_X N, const float_X omega, const vector_X observer_unit_vec) const
-      {
-        return math::sqrt(
-             N + ( N * N - N ) * util::square(
-                 math::exp(
-                     float_X( -0.5 ) * (
-                         util::square( observer_unit_vec.x() * CELL_WIDTH / ( SPEED_OF_LIGHT * float_X(2.0) ) * omega ) +
-                         util::square( observer_unit_vec.y() * CELL_HEIGHT / ( SPEED_OF_LIGHT * float_X(2.0) ) * omega ) +
-                         util::square( observer_unit_vec.z() * CELL_DEPTH / ( SPEED_OF_LIGHT * float_X(2.0) ) * omega )
-                     )
-                 )
-             )
-        );
-      }
-    };
-} // namespace radFormFactor_Gauss_cell
-
-
-
-namespace radFormFactor_incoherent
-{
-    struct radFormFactor
-    {
-      /** Form Factor for an incoherent charge distribution:
-        *
-        * @param N = macro particle weighting
-        * @param omega = frequency at which to calculate the  form factor
-        * @param observer_unit_vec = observation direction
-        * @return the Form Factor: \f$ \sqrt( | \mathcal{F} |^2 == \sqrt(weighting) \f$
-        */
-      HDINLINE float_X operator()(const float_X N, const float_X omega, const vector_X observer_unit_vec) const
-      {
-        return math::sqrt( N );
-
-      }
-    };
-} // namespace radFormFactor_incoherent
-
-
-namespace radFormFactor_coherent
-{
-    struct radFormFactor
+    namespace plugins
     {
-      /** Form Factor for a coherent charge distribution:
-        *
-        * @param N = macro particle weighting
-        * @param omega = frequency at which to calculate the  form factor
-        * @param observer_unit_vec = observation direction
-        * @return the Form Factor: \f$ \sqrt( | \mathcal{F} |^2 == \sqrt(weighting) \f$
-        */
-      HDINLINE float_X operator()(const float_X N, const float_X omega, const vector_X observer_unit_vec) const
-      {
-        return N;
-      }
-    };
-} // namespace radFormFactor_coherent
-
-} // namespace radiation
-} // namespace plugins
+        namespace radiation
+        {
+            namespace radFormFactor_baseShape_3D
+            {
+                /** general form factor class of discrete charge distribution of PIC particle shape of order
+                 * T_shapeOrder
+                 *
+                 * @tparam T_shapeOrder order of charge distribution shape in PIC code used for radiation form factor
+                 */
+
+                template<uint32_t T_shapeOrder>
+                struct radFormFactor
+                {
+                    /** Form Factor for T_shapeOrder-order particle shape charge distribution of N discrete electrons:
+                     * \f[ | \mathcal{F} |^2 = N + (N*N - N) * (sinc^2(n_x * L_x * \omega) * sinc^2(n_y * L_y * \omega)
+                     * * sinc^2(n_z * L_z * \omega))^T_shapeOrder \f]
+                     *
+                     * with observation direction (unit vector) \f$ \vec{n} = (n_x, n_y, n_z) \f$
+                     * and with:
+                     * - N     = weighting
+                     * - omega = frequency
+                     * - L_d   = the size of the CIC-particle / cell in dimension d
+                     *
+                     * @param N = macro particle weighting
+                     * @param omega = frequency at which to calculate the form factor
+                     * @param observer_unit_vec = observation direction
+                     * @return the Form Factor: \f$ \sqrt( | \mathcal{F} |^2 ) \f$
+                     */
+                    HDINLINE float_X
+                    operator()(const float_X N, const float_X omega, vector_X const& observer_unit_vec) const
+                    {
+                        float_X sincValue = float_X(1.0);
+                        for(uint32_t d = 0; d < DIM3; ++d)
+                            sincValue *= pmacc::math::sinc(
+                                observer_unit_vec[d] * cellSize[d] / (SPEED_OF_LIGHT * float_X(2.0)) * omega);
+
+                        // here we combine sinc^2(..) with (...)^T_shapeOrder to ...^(2 * T_shapeOrder)
+                        return math::sqrt(N + (N * N - N) * util::pow(sincValue, 2 * T_shapeOrder));
+                    }
+                };
+            } // namespace radFormFactor_baseShape_3D
+
+
+            namespace radFormFactor_CIC_3D // base-shape form factor with T_shapeOrder = 1
+            {
+                struct radFormFactor : public radFormFactor_baseShape_3D::radFormFactor<1>
+                {
+                };
+            } // namespace radFormFactor_CIC_3D
+
+            namespace radFormFactor_TSC_3D // base-shape form factor with T_shapeOrder = 2
+            {
+                struct radFormFactor : public radFormFactor_baseShape_3D::radFormFactor<2>
+                {
+                };
+            } // namespace radFormFactor_TSC_3D
+
+            namespace radFormFactor_PCS_3D // base-shape form factor with T_shapeOrder = 3
+            {
+                struct radFormFactor : public radFormFactor_baseShape_3D::radFormFactor<3>
+                {
+                };
+            } // namespace radFormFactor_PCS_3D
+
+
+            namespace radFormFactor_CIC_1Dy
+            {
+                struct radFormFactor
+                {
+                    /** Form Factor for 1-d CIC charge distribution in y of N discrete electrons:
+                     * \f[ | \mathcal{F} |^2 = N + (N*N - N) * sinc^2(n_y * L_y * \omega) \f]
+                     *
+                     * with observation direction (unit vector) \f$ \vec{n} = (n_x, n_y, n_z) \f$
+                     * and with:
+                     * - N     = weighting
+                     * - omega = frequency
+                     * - L_d   = the size of the CIC-particle / cell in dimension d
+                     *
+                     * @param N = macro particle weighting
+                     * @param omega = frequency at which to calculate the form factor
+                     * @param observer_unit_vec = observation direction (not read by this implementation)
+                     * @return the Form Factor: \f$ \sqrt( | \mathcal{F} |^2 ) \f$
+                     */
+                    HDINLINE float_X
+                    operator()(const float_X N, const float_X omega, const vector_X observer_unit_vec) const
+                    {
+                        return math::sqrt(
+                            N
+                            + (N * N - N)
+                                * util::square(
+                                    pmacc::math::sinc(CELL_HEIGHT / (SPEED_OF_LIGHT * float_X(2.0)) * omega)));
+                    }
+                };
+            } // namespace radFormFactor_CIC_1Dy
+
+
+            namespace radFormFactor_Gauss_spherical
+            {
+                struct radFormFactor
+                {
+                    /** Form Factor for point-symmetric Gauss-shaped charge distribution of N discrete electrons:
+                     * \f[ <rho(r)> = N*q_e* 1/sqrt(2*pi*sigma^2) * exp(-0.5 * r^2/sigma^2) \f]
+                     * with sigma = 0.5*c*delta_t (0.5 because sigma is defined around center)
+                     *
+                     * @param N = macro particle weighting
+                     * @param omega = frequency at which to calculate the form factor
+                     * @param observer_unit_vec = observation direction (not read by this implementation)
+                     * @return the Form Factor: \f$ \sqrt( | \mathcal{F} |^2 ) \f$
+                     */
+                    HDINLINE float_X
+                    operator()(const float_X N, const float_X omega, const vector_X observer_unit_vec) const
+                    {
+                        /* currently a fixed sigma of DELTA_T * c is used to describe the distribution - might become a
+                         * parameter */
+                        return math::sqrt(
+                            N
+                            + (N * N - N)
+                                * util::square(
+                                    math::exp(float_X(-0.5) * util::square(omega * float_X(0.5) * DELTA_T))));
+                    }
+                };
+            } // namespace radFormFactor_Gauss_spherical
+
+
+            namespace radFormFactor_Gauss_cell
+            {
+                struct radFormFactor
+                {
+                    /** Form Factor for per-dimension Gauss-shaped charge distribution of N discrete electrons:
+                     * \f[ <rho(r)> = N*q_e* product[d={x,y,z}](1/sqrt(2*pi*sigma_d^2) * exp(-0.5 * d^2/sigma_d^2)) \f]
+                     * with sigma_d = 0.5*cell_width_d*n_d
+                     *
+                     * @param N = macro particle weighting
+                     * @param omega = frequency at which to calculate the form factor
+                     * @param observer_unit_vec = observation direction
+                     * @return the Form Factor: \f$ \sqrt( | \mathcal{F} |^2 ) \f$
+                     */
+                    HDINLINE float_X
+                    operator()(const float_X N, const float_X omega, const vector_X observer_unit_vec) const
+                    {
+                        return math::sqrt(
+                            N
+                            + (N * N - N)
+                                * util::square(math::exp(
+                                    float_X(-0.5)
+                                    * (util::square(
+                                           observer_unit_vec.x() * CELL_WIDTH / (SPEED_OF_LIGHT * float_X(2.0))
+                                           * omega)
+                                       + util::square(
+                                           observer_unit_vec.y() * CELL_HEIGHT / (SPEED_OF_LIGHT * float_X(2.0))
+                                           * omega)
+                                       + util::square(
+                                           observer_unit_vec.z() * CELL_DEPTH / (SPEED_OF_LIGHT * float_X(2.0))
+                                           * omega)))));
+                    }
+                };
+            } // namespace radFormFactor_Gauss_cell
+
+
+            namespace radFormFactor_incoherent
+            {
+                struct radFormFactor
+                {
+                    /** Form Factor for an incoherent charge distribution:
+                     *
+                     * @param N = macro particle weighting
+                     * @param omega = frequency at which to calculate the form factor (not read here)
+                     * @param observer_unit_vec = observation direction (not read here)
+                     * @return the Form Factor: \f$ \sqrt( | \mathcal{F} |^2 ) = \sqrt(weighting) \f$
+                     */
+                    HDINLINE float_X
+                    operator()(const float_X N, const float_X omega, const vector_X observer_unit_vec) const
+                    {
+                        return math::sqrt(N);
+                    }
+                };
+            } // namespace radFormFactor_incoherent
+
+
+            namespace radFormFactor_coherent
+            {
+                struct radFormFactor
+                {
+                    /** Form Factor for a coherent charge distribution:
+                     *
+                     * @param N = macro particle weighting
+                     * @param omega = frequency at which to calculate the form factor (not read here)
+                     * @param observer_unit_vec = observation direction (not read here)
+                     * @return the Form Factor: \f$ \sqrt( | \mathcal{F} |^2 ) = weighting \f$
+                     */
+                    HDINLINE float_X
+                    operator()(const float_X N, const float_X omega, const vector_X observer_unit_vec) const
+                    {
+                        return N;
+                    }
+                };
+            } // namespace radFormFactor_coherent
+
+        } // namespace radiation
+    } // namespace plugins
 } // namespace picongpu
diff --git a/include/picongpu/plugins/radiation/taylor.hpp b/include/picongpu/plugins/radiation/taylor.hpp
index 39491b4195..e3217af05d 100644
--- a/include/picongpu/plugins/radiation/taylor.hpp
+++ b/include/picongpu/plugins/radiation/taylor.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera, Richard Pausch
+/* Copyright 2013-2021 Heiko Burau, Rene Widera, Richard Pausch
  *
  * This file is part of PIConGPU.
  *
@@ -23,24 +23,23 @@
 
 namespace picongpu
 {
-namespace plugins
-{
-namespace radiation
-{
-struct Taylor
-{
-    // a Taylor development for 1-sqrt(1-x)
-
-    HDINLINE picongpu::float_64 operator()(picongpu::float_64 x) const
+    namespace plugins
     {
-        // Taylor series of 1-sqrt(1-x) till 5th order
-        //same like 0.5*x + 0.125*x*x + 0.0625 * x*x*x + 0.0390625 * x*x*x*x + 0.02734375 *x*x*x*x*x;
-        const picongpu::float_64 x2 = (x * x);
-        return x * ((0.5 + 0.125 * x) + x2 * (0.0625 + (0.0390625 * x + 0.02734375 * x2)));
-    }
+        namespace radiation
+        {
+            struct Taylor
+            {
+                // Taylor expansion of 1-sqrt(1-x)
 
-};
+                HDINLINE picongpu::float_64 operator()(picongpu::float_64 x) const
+                {
+                    // Taylor series of 1-sqrt(1-x) up to 5th order
+                    // same as 0.5*x + 0.125*x*x + 0.0625 * x*x*x + 0.0390625 * x*x*x*x + 0.02734375 *x*x*x*x*x
+                    const picongpu::float_64 x2 = (x * x);
+                    return x * ((0.5 + 0.125 * x) + x2 * (0.0625 + (0.0390625 * x + 0.02734375 * x2)));
+                }
+            };
 
-} // namespace radiation
-} // namespace plugins
+        } // namespace radiation
+    } // namespace plugins
 } // namespace picongpu
diff --git a/include/picongpu/plugins/radiation/utilities.hpp b/include/picongpu/plugins/radiation/utilities.hpp
index 434832d6e5..8a6bb708c3 100644
--- a/include/picongpu/plugins/radiation/utilities.hpp
+++ b/include/picongpu/plugins/radiation/utilities.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera, Richard Pausch
+/* Copyright 2013-2021 Heiko Burau, Rene Widera, Richard Pausch
  *
  * This file is part of PIConGPU.
  *
@@ -21,100 +21,94 @@
 
 namespace picongpu
 {
-namespace plugins
-{
-namespace radiation
-{
-
-namespace util
-{
-
-    //goal: to increase readability of code
-
-    template<typename A> /// a generic square function
-    HDINLINE A square(A a)
-    {
-        return a*a;
-    }
-
-    template<typename A, typename R> /// a more generic square function
-    HDINLINE R square(A a)
-    {
-        return a*a;
-    }
-
-    template<typename A> /// a generic cube function
-    HDINLINE A cube(A a)
-    {
-        return a * a*a;
-    }
-
-    template<typename A, typename R> /// a more generic cube function
-    HDINLINE R cube(A a)
-    {
-        return a * a*a;
-    }
-
-    template<typename A, typename R = A> /// a more generic square struct
-            struct Cube
+    namespace plugins
     {
-
-        HDINLINE R operator()(A a)
+        namespace radiation
         {
-            return a * a*a;
-        }
-    };
-
-    template<typename A, typename R = A> /// a more generic square struct
-            struct Square
-    {
-
-        HDINLINE R operator()(A a) const
-        {
-            return a*a;
-        }
-    };
-
-
-
-namespace details
-{
-  /** power function - with extra const parameter for efficient code
-    *
-    * T_type requires cast from int and multiplication
-    * @tparam T_Type - base type
-    * @param x - base value
-    * @param exp - exponent
-    * @param results (=1) - do not change - workaround to produce efficient code
-    * @return std::pow(x, exp)
-    */
-  template< typename T_Type >
-  HDINLINE constexpr T_Type pow( T_Type const x , uint32_t const exp, const T_Type result = T_Type( 1 ) )
-  {
-    return exp == 0 ? result : (
-        exp == 1 ? x * result : util::details::pow( x, exp - 1, result * x )
-    );
-  }
-} // namespace details
-
-  /** power function
-    *
-    * T_type requires cast from int and multiplication
-    * @tparam T_Type - base type
-    * @param x - base value
-    * @param exp - exponent
-    * @return std::pow(x, exp)
-    */
-  template< typename T_Type >
-  HDINLINE constexpr T_Type pow( T_Type const x , uint32_t const exp )
-  {
-    return util::details::pow( x, exp );
-  }
-
-} // namespace util
-
-} // namespace radiation
-
-} // namespace plugins
+            namespace util
+            {
+                // goal: to increase readability of code
+
+                template<typename A> /// a generic square function
+                HDINLINE A square(A a)
+                {
+                    return a * a;
+                }
+
+                template<typename A, typename R> /// a more generic square function
+                HDINLINE R square(A a)
+                {
+                    return a * a;
+                }
+
+                template<typename A> /// a generic cube function
+                HDINLINE A cube(A a)
+                {
+                    return a * a * a;
+                }
+
+                template<typename A, typename R> /// a more generic cube function
+                HDINLINE R cube(A a)
+                {
+                    return a * a * a;
+                }
+
+                template<typename A, typename R = A> /// a more generic cube struct
+                struct Cube
+                {
+                    HDINLINE R operator()(A a)
+                    {
+                        return a * a * a;
+                    }
+                };
+
+                template<typename A, typename R = A> /// a more generic square struct
+                struct Square
+                {
+                    HDINLINE R operator()(A a) const
+                    {
+                        return a * a;
+                    }
+                };
+
+
+                namespace details
+                {
+                    /** power function - with extra const parameter for efficient code
+                     *
+                     * T_Type requires cast from int and multiplication
+                     * @tparam T_Type - base type
+                     * @param x - base value
+                     * @param exp - exponent
+                     * @param result (=1) - do not change - workaround to produce efficient code
+                     * @return std::pow(x, exp)
+                     */
+                    template<typename T_Type>
+                    HDINLINE constexpr T_Type pow(T_Type const x, uint32_t const exp, const T_Type result = T_Type(1))
+                    {
+                        return exp == 0 ? result
+                                        : (exp == 1 ? x * result : util::details::pow(x, exp - 1, result * x));
+                    }
+                } // namespace details
+
+                /** power function
+                 *
+                 * T_Type requires cast from int and multiplication
+                 * @tparam T_Type - base type
+                 * @param x - base value
+                 * @param exp - exponent
+                 * @return std::pow(x, exp)
+                 */
+                template<typename T_Type>
+                HDINLINE constexpr T_Type pow(T_Type const x, uint32_t const exp)
+                {
+                    return util::details::pow(x, exp);
+                }
+
+            } // namespace util
+
+        } // namespace radiation
+
+    } // namespace plugins
 
 } // namespace picongpu
diff --git a/include/picongpu/plugins/radiation/vector.hpp b/include/picongpu/plugins/radiation/vector.hpp
index bdb46f9827..a0c533fb90 100644
--- a/include/picongpu/plugins/radiation/vector.hpp
+++ b/include/picongpu/plugins/radiation/vector.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Heiko Burau, Rene Widera, Richard Pausch
+/* Copyright 2013-2021 Axel Huebl, Heiko Burau, Rene Widera, Richard Pausch
  *
  * This file is part of PIConGPU.
  *
@@ -25,160 +25,158 @@
 
 namespace picongpu
 {
-namespace plugins
-{
-namespace radiation
-{
-
-template<typename V, typename T>
-struct cuda_vec : public V
-{
-    // constructor
-
-    HDINLINE cuda_vec(T x, T y, T z)
-    {
-        this->x() = x;
-        this->y() = y;
-        this->z() = z;
-    }
-
-    // default constructor
-
-    HDINLINE cuda_vec()
-    {
-
-    }
-
-    // constructor
-
-    HDINLINE cuda_vec(const V & other)
-    {
-        this->x() = other.x();
-        this->y() = other.y();
-        this->z() = other.z();
-    }
-
-    HDINLINE static cuda_vec<V, T> zero()
-    {
-        return cuda_vec(0, 0, 0);
-    }
-
-
-    // conversion between two cuda vectors with different types
-
-    template<typename O, typename Q >
-            HDINLINE cuda_vec(const cuda_vec<O, Q>& other)
-    {
-        this->x() = (T) other.x();
-        this->y() = (T) other.y();
-        this->z() = (T) other.z();
-    }
-
-    HDINLINE cuda_vec<V, T>& operator=(const cuda_vec<V, T>& other)
-    {
-        this->x() = other.x();
-        this->y() = other.y();
-        this->z() = other.z();
-        return (*this);
-    }
-
-    HDINLINE T &operator[](uint32_t dim)
-    {
-        return (&(this->x()))[dim];
-    }
-
-    HDINLINE const T &operator[](uint32_t dim) const
-    {
-        return (&(this->x()))[dim];
-    }
-
-
-    // addition
-
-    HDINLINE cuda_vec<V, T> operator+(const cuda_vec<V, T>& other) const
-    {
-        return cuda_vec<V, T > (this->x() + other.x(), this->y() + other.y(), this->z() + other.z());
-    }
-
-    // difference
-
-    HDINLINE cuda_vec<V, T> operator-(const cuda_vec<V, T>& other) const
-    {
-        return cuda_vec<V, T > (this->x() - other.x(), this->y() - other.y(), this->z() - other.z());
-    }
-
-    // vector multiplication
-
-    HDINLINE T operator*(const cuda_vec<V, T>& other) const
-    {
-        return this->x() * other.x() + this->y() * other.y() + this->z() * other.z();
-    }
-
-    // scalar multiplication
-
-    HDINLINE cuda_vec<V, T> operator*(const T scalar) const
-    {
-        return cuda_vec(scalar * this->x(), scalar * this->y(), scalar * this->z());
-    }
-
-    // division (scalar)
-
-    HDINLINE cuda_vec<V, T> operator/(const T scalar) const
-    {
-        return cuda_vec(this->x() / scalar, this->y() / scalar, this->z() / scalar);
-    }
-
-    // cross product (vector)
-
-    HDINLINE cuda_vec<V, T> operator%(const cuda_vec<V, T>& other) const
+    namespace plugins
     {
-        return cuda_vec(this->y() * other.z() - this->z() * other.y(), this->z() * other.x() - this->x() * other.z(), this->x() * other.y() - this->y() * other.x());
-    }
+        namespace radiation
+        {
+            template<typename V, typename T>
+            struct cuda_vec : public V
+            {
+                // constructor
+
+                HDINLINE cuda_vec(T x, T y, T z)
+                {
+                    this->x() = x;
+                    this->y() = y;
+                    this->z() = z;
+                }
+
+                // default constructor
 
-    // magnitude of vector (length of vector)
+                HDINLINE cuda_vec()
+                {
+                }
 
-    HDINLINE T magnitude(void) const
-    {
+                // constructor
+
+                HDINLINE cuda_vec(const V& other)
+                {
+                    this->x() = other.x();
+                    this->y() = other.y();
+                    this->z() = other.z();
+                }
 
-        return picongpu::math::sqrt(this->x() * this->x() + this->y() * this->y() + this->z() * this->z());
+                HDINLINE static cuda_vec<V, T> zero()
+                {
+                    return cuda_vec(0, 0, 0);
+                }
 
-    }
 
-    // unit vector in the direction of the vector
+                // conversion between two cuda vectors with different types
 
-    HDINLINE cuda_vec<V, T> unit_vec(void) const
-    {
-        return *this / magnitude();
-    }
+                template<typename O, typename Q>
+                HDINLINE cuda_vec(const cuda_vec<O, Q>& other)
+                {
+                    this->x() = (T) other.x();
+                    this->y() = (T) other.y();
+                    this->z() = (T) other.z();
+                }
+
+                HDINLINE cuda_vec<V, T>& operator=(const cuda_vec<V, T>& other)
+                {
+                    this->x() = other.x();
+                    this->y() = other.y();
+                    this->z() = other.z();
+                    return (*this);
+                }
+
+                HDINLINE T& operator[](uint32_t dim)
+                {
+                    return (&(this->x()))[dim];
+                }
+
+                HDINLINE const T& operator[](uint32_t dim) const
+                {
+                    return (&(this->x()))[dim];
+                }
+
+
+                // addition
+
+                HDINLINE cuda_vec<V, T> operator+(const cuda_vec<V, T>& other) const
+                {
+                    return cuda_vec<V, T>(this->x() + other.x(), this->y() + other.y(), this->z() + other.z());
+                }
+
+                // difference
+
+                HDINLINE cuda_vec<V, T> operator-(const cuda_vec<V, T>& other) const
+                {
+                    return cuda_vec<V, T>(this->x() - other.x(), this->y() - other.y(), this->z() - other.z());
+                }
+
+                // vector multiplication
+
+                HDINLINE T operator*(const cuda_vec<V, T>& other) const
+                {
+                    return this->x() * other.x() + this->y() * other.y() + this->z() * other.z();
+                }
+
+                // scalar multiplication
+
+                HDINLINE cuda_vec<V, T> operator*(const T scalar) const
+                {
+                    return cuda_vec(scalar * this->x(), scalar * this->y(), scalar * this->z());
+                }
+
+                // division (scalar)
+
+                HDINLINE cuda_vec<V, T> operator/(const T scalar) const
+                {
+                    return cuda_vec(this->x() / scalar, this->y() / scalar, this->z() / scalar);
+                }
+
+                // cross product (vector)
+
+                HDINLINE cuda_vec<V, T> operator%(const cuda_vec<V, T>& other) const
+                {
+                    return cuda_vec(
+                        this->y() * other.z() - this->z() * other.y(),
+                        this->z() * other.x() - this->x() * other.z(),
+                        this->x() * other.y() - this->y() * other.x());
+                }
+
+                // magnitude of vector (length of vector)
 
-    // assign add
+                HDINLINE T magnitude(void) const
+                {
+                    return picongpu::math::sqrt(this->x() * this->x() + this->y() * this->y() + this->z() * this->z());
+                }
 
-    HDINLINE void operator+=(const cuda_vec<V, T>& other)
-    {
-        this->x() += other.x();
-        this->y() += other.y();
-        this->z() += other.z();
-    }
+                // unit vector in the direction of the vector
 
-    // assign multiply
+                HDINLINE cuda_vec<V, T> unit_vec(void) const
+                {
+                    return *this / magnitude();
+                }
+
+                // assign add
+
+                HDINLINE void operator+=(const cuda_vec<V, T>& other)
+                {
+                    this->x() += other.x();
+                    this->y() += other.y();
+                    this->z() += other.z();
+                }
 
-    HDINLINE void operator*=(const T scalar)
-    {
-        this->x() *= scalar;
-        this->y() *= scalar;
-        this->z() *= scalar;
-    }
+                // assign multiply
 
-};
+                HDINLINE void operator*=(const T scalar)
+                {
+                    this->x() *= scalar;
+                    this->y() *= scalar;
+                    this->z() *= scalar;
+                }
+            };
 
-} // namespace radiation
-} // namespace plugins
+        } // namespace radiation
+    } // namespace plugins
 } // namespace picongpu
 
 // print
 
 template<typename V, typename T>
-HINLINE std::ostream & operator <<(std::ostream & os, const picongpu::plugins::radiation::cuda_vec<V, T> & v)
+HINLINE std::ostream& operator<<(std::ostream& os, const picongpu::plugins::radiation::cuda_vec<V, T>& v)
 {
     os << " ( " << v.x() << " , " << v.y() << " , " << v.z() << " ) ";
     return os;
diff --git a/include/picongpu/plugins/radiation/windowFunctions.hpp b/include/picongpu/plugins/radiation/windowFunctions.hpp
index 33de38bedf..3f87328132 100644
--- a/include/picongpu/plugins/radiation/windowFunctions.hpp
+++ b/include/picongpu/plugins/radiation/windowFunctions.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2014-2020 Richard Pausch
+/* Copyright 2014-2021 Richard Pausch
  *
  * This file is part of PIConGPU.
  *
@@ -21,162 +21,157 @@
 
 #include <pmacc/algorithms/math/defines/pi.hpp>
 
-#include<cmath>
+#include <cmath>
 
 namespace picongpu
 {
-namespace plugins
-{
-namespace radiation
-{
-
-  /* several window functions behind namespaces: */
-
-
-namespace radWindowFunctionTriangle
-{
-    struct radWindowFunction
-    {
-      /** 1D Window function according to the triangle window:
-       *
-       * x = position_x - L_x/2
-       * f(x) = {1+2x/L_x : (-L_x/2 <= x <= 0      )
-       *        {1-2x/L_x : (0      <= x <= +L_x/2 )
-       *        {0.0      : in any other case
-       *
-       * @param position_x = 1D position
-       * @param L_x        = length of the simulated area
-       *                     assuming that the simulation ranges
-       *                     from 0 to L_x in the chosen dimension
-       * @returns weighting factor to reduce ringing effects due to
-       *          sharp spacial boundaries
-       **/
-      HDINLINE float_X operator()(const float_X position_x, const float_X L_x) const
-      {
-        float_X x = position_x - float_X(0.5)*L_x;
-        return float_X(math::abs(x) <= float_X(0.5)*L_x)
-          * (float_X(1.0) - float_X(2.0)/L_x * math::abs(x) );
-      }
-    };
-} // namespace radWindowFunctionTriangle
-
-
-
-namespace radWindowFunctionHamming
-{
-    struct radWindowFunction
+    namespace plugins
     {
-      /** 1D Window function according to the Hamming window:
-       *
-       * x = position_x - L_x/2
-       * a = parameter of the Hamming window (ideal: 0.08)
-       * f(x) = {a+(1-a)*cos^2(pi*x/L_x)   : (-L_x/2 <= x <= +L_x/2 )
-       *        {0.0                       : in any other case
-       *
-       * @param position_x = 1D position
-       * @param L_x        = length of the simulated area
-       *                     assuming that the simulation ranges
-       *                     from 0 to L_x in the chosen dimension
-       * @returns weighting factor to reduce ringing effects due to
-       *          sharp spacial boundaries
-       **/
-      HDINLINE float_X operator()(const float_X position_x, const float_X L_x) const
-      {
-        const float_X x = position_x - L_x*float_X(0.5);
-        const float_X a = 0.08; /* ideal parameter: -43dB reduction */
-        const float_X cosinusValue = math::cos(pmacc::algorithms::math::Pi<float_X>::value*x/L_x);
-        return float_X(math::abs(x) <= float_X(0.5)*L_x)
-          * (a + (float_X(1.0)-a)*cosinusValue*cosinusValue);
-      }
-    };
-} // namespace radWindowFunctionHamming
-
-
-
-namespace radWindowFunctionTriplett
-{
-    struct radWindowFunction
-    {
-      /** 1D Window function according to the Triplett window:
-       *
-       * x      = position_x - L_x/2
-       * lambda = decay parameter of the Triplett window
-       * f(x) = {exp(-lambda*|x|)*cos^2(pi*x/L_x) : (-L_x/2 <= x <= +L_x/2 )
-       *        {0.0                              : in any other case
-       *
-       * @param position_x = 1D position
-       * @param L_x        = length of the simulated area
-       *                     assuming that the simulation ranges
-       *                     from 0 to L_x in the chosen dimension
-       * @returns weighting factor to reduce ringing effects due to
-       *          sharp spacial boundaries
-       **/
-      HDINLINE float_X operator()(const float_X position_x, const float_X L_x) const
-      {
-        const float_X x = position_x - L_x*float_X(0.5);
-        const float_X lambda = float_X(5.0)/L_x; /* larger is better, but too large means no data */
-        const float_X cosinusValue = math::cos(pmacc::algorithms::math::Pi<float_X>::value*x/L_x);
-        return float_X(math::abs(x) <= float_X(0.5)*L_x)
-          * (math::exp(float_X(-1.0)*lambda*math::abs(x))*cosinusValue*cosinusValue);
-      }
-    };
-} // namespace radWindowFunctionTriplett
-
-
-
-namespace radWindowFunctionGauss
-{
-    struct radWindowFunction
-    {
-      /** 1D Window function according to the Gauss window:
-       *
-       * x     = position_x - L_x/2
-       * sigma = standard deviation of the Gauss window
-       * f(x) = {exp(-0.5*x^2/sigma^2)   : (-L_x/2 <= x <= +L_x/2 )
-       *        {0.0                     : in any other case
-       *
-       * @param position_x = 1D position
-       * @param L_x        = length of the simulated area
-       *                     assuming that the simulation ranges
-       *                     from 0 to L_x in the chosen dimension
-       * @returns weighting factor to reduce ringing effects due to
-       *          sharp spacial boundaries
-       **/
-      HDINLINE float_X operator()(const float_X position_x, const float_X L_x) const
-      {
-        const float_X x = position_x - L_x*float_X(0.5);
-        const float_X sigma = float_X(0.4)*L_x; /* smaller is better, but too small means no data */
-        const float_X relativePosition = x/sigma; /* optimization */
-        return float_X(math::abs(x) <= float_X(0.5)*L_x)
-          * (math::exp(float_X(-0.5)*relativePosition*relativePosition));
-      }
-    };
-} // namespace radWindowFunctionGauss
-
-
-namespace radWindowFunctionNone
-{
-    struct radWindowFunction
-    {
-      /** 1D Window function according to the no window:
-       *
-       * f(position_x) = always 1.0
-       *
-       * @param position_x = 1D position
-       * @param L_x        = length of the simulated area
-       *                     assuming that the simulation ranges
-       *                     from 0 to L_x in the chosen dimension
-       * @returns 1.0
-       **/
-      HDINLINE float_X operator()(const float_X position_x, const float_X L_x) const
-      {
-        return float_X(1.0);
-      }
-    };
-} // namespace radWindowFunctionNone
-
-
-} // namespace radiation
-} // namespace plugins
+        namespace radiation
+        {
+            /* several window functions behind namespaces: */
+
+
+            namespace radWindowFunctionTriangle
+            {
+                struct radWindowFunction
+                {
+                    /** 1D Window function according to the triangle window:
+                     *
+                     * x = position_x - L_x/2
+                     * f(x) = {1+2x/L_x : (-L_x/2 <= x <= 0      )
+                     *        {1-2x/L_x : (0      <= x <= +L_x/2 )
+                     *        {0.0      : in any other case
+                     *
+                     * @param position_x = 1D position
+                     * @param L_x        = length of the simulated area
+                     *                     assuming that the simulation ranges
+                     *                     from 0 to L_x in the chosen dimension
+                     * @returns weighting factor to reduce ringing effects due to
+                     *          sharp spatial boundaries
+                     **/
+                    HDINLINE float_X operator()(const float_X position_x, const float_X L_x) const
+                    {
+                        float_X x = position_x - float_X(0.5) * L_x;
+                        return float_X(math::abs(x) <= float_X(0.5) * L_x)
+                            * (float_X(1.0) - float_X(2.0) / L_x * math::abs(x));
+                    }
+                };
+            } // namespace radWindowFunctionTriangle
+
+
+            namespace radWindowFunctionHamming
+            {
+                struct radWindowFunction
+                {
+                    /** 1D Window function according to the Hamming window:
+                     *
+                     * x = position_x - L_x/2
+                     * a = parameter of the Hamming window (ideal: 0.08)
+                     * f(x) = {a+(1-a)*cos^2(pi*x/L_x)   : (-L_x/2 <= x <= +L_x/2 )
+                     *        {0.0                       : in any other case
+                     *
+                     * @param position_x = 1D position
+                     * @param L_x        = length of the simulated area
+                     *                     assuming that the simulation ranges
+                     *                     from 0 to L_x in the chosen dimension
+                     * @returns weighting factor to reduce ringing effects due to
+                     *          sharp spatial boundaries
+                     **/
+                    HDINLINE float_X operator()(const float_X position_x, const float_X L_x) const
+                    {
+                        const float_X x = position_x - L_x * float_X(0.5);
+                        const float_X a = 0.08; /* ideal parameter: -43dB reduction */
+                        const float_X cosinusValue = math::cos(pmacc::math::Pi<float_X>::value * x / L_x);
+                        return float_X(math::abs(x) <= float_X(0.5) * L_x)
+                            * (a + (float_X(1.0) - a) * cosinusValue * cosinusValue);
+                    }
+                };
+            } // namespace radWindowFunctionHamming
+
+
+            namespace radWindowFunctionTriplett
+            {
+                struct radWindowFunction
+                {
+                    /** 1D Window function according to the Triplett window:
+                     *
+                     * x      = position_x - L_x/2
+                     * lambda = decay parameter of the Triplett window
+                     * f(x) = {exp(-lambda*|x|)*cos^2(pi*x/L_x) : (-L_x/2 <= x <= +L_x/2 )
+                     *        {0.0                              : in any other case
+                     *
+                     * @param position_x = 1D position
+                     * @param L_x        = length of the simulated area
+                     *                     assuming that the simulation ranges
+                     *                     from 0 to L_x in the chosen dimension
+                     * @returns weighting factor to reduce ringing effects due to
+                     *          sharp spatial boundaries
+                     **/
+                    HDINLINE float_X operator()(const float_X position_x, const float_X L_x) const
+                    {
+                        const float_X x = position_x - L_x * float_X(0.5);
+                        const float_X lambda = float_X(5.0) / L_x; /* larger is better, but too large means no data */
+                        const float_X cosinusValue = math::cos(pmacc::math::Pi<float_X>::value * x / L_x);
+                        return float_X(math::abs(x) <= float_X(0.5) * L_x)
+                            * (math::exp(float_X(-1.0) * lambda * math::abs(x)) * cosinusValue * cosinusValue);
+                    }
+                };
+            } // namespace radWindowFunctionTriplett
+
+
+            namespace radWindowFunctionGauss
+            {
+                struct radWindowFunction
+                {
+                    /** 1D Window function according to the Gauss window:
+                     *
+                     * x     = position_x - L_x/2
+                     * sigma = standard deviation of the Gauss window
+                     * f(x) = {exp(-0.5*x^2/sigma^2)   : (-L_x/2 <= x <= +L_x/2 )
+                     *        {0.0                     : in any other case
+                     *
+                     * @param position_x = 1D position
+                     * @param L_x        = length of the simulated area
+                     *                     assuming that the simulation ranges
+                     *                     from 0 to L_x in the chosen dimension
+                     * @returns weighting factor to reduce ringing effects due to
+                     *          sharp spatial boundaries
+                     **/
+                    HDINLINE float_X operator()(const float_X position_x, const float_X L_x) const
+                    {
+                        const float_X x = position_x - L_x * float_X(0.5);
+                        const float_X sigma = float_X(0.4) * L_x; /* smaller is better, but too small means no data */
+                        const float_X relativePosition = x / sigma; /* optimization */
+                        return float_X(math::abs(x) <= float_X(0.5) * L_x)
+                            * (math::exp(float_X(-0.5) * relativePosition * relativePosition));
+                    }
+                };
+            } // namespace radWindowFunctionGauss
+
+
+            namespace radWindowFunctionNone
+            {
+                struct radWindowFunction
+                {
+                    /** 1D Window function that applies no windowing:
+                     *
+                     * f(position_x) = always 1.0
+                     *
+                     * @param position_x = 1D position
+                     * @param L_x        = length of the simulated area
+                     *                     assuming that the simulation ranges
+                     *                     from 0 to L_x in the chosen dimension
+                     * @returns 1.0
+                     **/
+                    HDINLINE float_X operator()(const float_X position_x, const float_X L_x) const
+                    {
+                        return float_X(1.0);
+                    }
+                };
+            } // namespace radWindowFunctionNone
+
+
+        } // namespace radiation
+    } // namespace plugins
 } // namespace picongpu
-
diff --git a/include/picongpu/plugins/randomizedParticleMerger/RandomizedParticleMerger.hpp b/include/picongpu/plugins/randomizedParticleMerger/RandomizedParticleMerger.hpp
new file mode 100644
index 0000000000..78eecf2320
--- /dev/null
+++ b/include/picongpu/plugins/randomizedParticleMerger/RandomizedParticleMerger.hpp
@@ -0,0 +1,285 @@
+/* Copyright 2017-2021 Heiko Burau, Xeinia Bastrakova, Sergei Bastrakov
+ *
+ * This file is part of PIConGPU.
+ *
+ * PIConGPU is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * PIConGPU is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with PIConGPU.
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "picongpu/simulation_defines.hpp"
+#include "picongpu/plugins/ISimulationPlugin.hpp"
+#include "picongpu/plugins/randomizedParticleMerger/RandomizedParticleMerger.kernel"
+#include "picongpu/particles/functor/misc/Rng.hpp"
+
+#include <pmacc/traits/HasIdentifier.hpp>
+#include <pmacc/cuSTL/cursor/MultiIndexCursor.hpp>
+#include <pmacc/random/distributions/Uniform.hpp>
+
+#include <cstdint>
+#include <cstdlib>
+#include <string>
+#include <iostream>
+#include <fstream>
+
+
+namespace picongpu
+{
+    namespace plugins
+    {
+        namespace randomizedParticleMerger
+        {
+            using namespace pmacc;
+            namespace bmpl = boost::mpl;
+
+            /** Implements a randomized modification of the particle merging algorithm.
+             *
+             * The original particle merging algorithm is described in
+             * Luu, P. T., Tueckmantel, T., & Pukhov, A. (2016).
+             * Voronoi particle merging algorithm for PIC codes.
+             * Computer Physics Communications, 202, 165-174.
+             *
+             * The randomized modification was developed by S. Bastrakov and X. Bastrakova
+             *
+             * @tparam T_ParticlesType species type
+             * @tparam hasVoronoiCellId if the species type has the voronoiCellId attribute,
+             *                          the plugin will only be used for such types
+             */
+            template<
+                class T_ParticlesType,
+                bool hasVoronoiCellId
+                = pmacc::traits::HasIdentifier<typename T_ParticlesType::FrameType, voronoiCellId>::type::value>
+            struct RandomizedParticleMergerWrapped;
+
+            template<class T_ParticlesType>
+            struct RandomizedParticleMergerWrapped<T_ParticlesType, true> : ISimulationPlugin
+            {
+            private:
+                std::string name;
+                std::string prefix;
+                std::string notifyPeriod;
+                MappingDesc* cellDescription;
+
+                uint32_t maxParticlesToMerge;
+                float_X ratioDeletedParticles;
+                float_X posSpreadThreshold;
+                float_X momSpreadThreshold;
+
+            public:
+                using ParticlesType = T_ParticlesType;
+
+                RandomizedParticleMergerWrapped()
+                    : name("RandomizedParticleMerger: merges several macroparticles with"
+                           " similar position and momentum into a single one")
+                    , prefix(ParticlesType::FrameType::getName() + std::string("_randomizedMerger"))
+                    , cellDescription(nullptr)
+                {
+                    Environment<>::get().PluginConnector().registerPlugin(this);
+                }
+
+                void notify(uint32_t currentStep) override
+                {
+                    using SuperCellSize = MappingDesc::SuperCellSize;
+
+                    const pmacc::math::Int<simDim> coreBorderGuardSuperCells
+                        = this->cellDescription->getGridSuperCells();
+                    const pmacc::math::Int<simDim> guardSuperCells = this->cellDescription->getGuardingSuperCells();
+                    const pmacc::math::Int<simDim> coreBorderSuperCells
+                        = coreBorderGuardSuperCells - 2 * guardSuperCells;
+
+                    // this zone represents the core+border area with guard offset in unit of cells
+                    const zone::SphericZone<simDim> zone(
+                        static_cast<pmacc::math::Size_t<simDim>>(coreBorderSuperCells * SuperCellSize::toRT()),
+                        guardSuperCells * SuperCellSize::toRT());
+
+                    DataConnector& dc = Environment<>::get().DataConnector();
+                    auto particles = dc.get<ParticlesType>(ParticlesType::FrameType::getName(), true);
+                    using Kernel = RandomizedParticleMergerKernel<typename ParticlesType::ParticlesBoxType>;
+
+                    using namespace pmacc::random::distributions;
+                    using Distribution = Uniform<float_X>;
+                    using RngFactory = particles::functor::misc::Rng<Distribution>;
+
+                    RngFactory rngFactory(currentStep);
+                    auto kernel = Kernel{
+                        particles->getDeviceParticlesBox(),
+                        maxParticlesToMerge,
+                        ratioDeletedParticles,
+                        posSpreadThreshold,
+                        momSpreadThreshold,
+                        rngFactory,
+                        guardSuperCells};
+
+                    algorithm::kernel::Foreach<SuperCellSize> foreach;
+                    foreach(zone, cursor::make_MultiIndexCursor<simDim>(), kernel)
+                        ;
+
+                    // close all gaps caused by removal of particles
+                    particles->fillAllGaps();
+                }
+
+
+                void setMappingDescription(MappingDesc* cellDescription) override
+                {
+                    this->cellDescription = cellDescription;
+                }
+
+
+                void pluginRegisterHelp(po::options_description& desc) override
+                {
+                    desc.add_options()(
+                        (prefix + ".period").c_str(),
+                        po::value<std::string>(&notifyPeriod),
+                        "enable plugin [for each n-th step]")(
+                        (prefix + ".maxParticlesToMerge").c_str(),
+                        po::value<uint32_t>(&maxParticlesToMerge)->default_value(8),
+                        "minimum number of macroparticles at which we always divide the cell")(
+                        (prefix + ".posSpreadThreshold").c_str(),
+                        po::value<float_X>(&posSpreadThreshold)->default_value(1e-5),
+                        "Below this threshold of spread in position macroparticles"
+                        " can be merged [unit: cell edge length]")(
+                        (prefix + ".momSpreadThreshold").c_str(),
+                        po::value<float_X>(&momSpreadThreshold)->default_value(1e-5),
+                        "Below this absolute threshold of spread in momentum"
+                        " macroparticles can be merged [unit: m_el * c].")(
+                        (prefix + ".ratioDeletedParticles").c_str(),
+                        po::value<float_X>(&ratioDeletedParticles)->default_value(0.1),
+                        "Ratio of macroparticles to be deleted on average");
+                }
+
+                std::string pluginGetName() const override
+                {
+                    return name;
+                }
+
+            protected:
+                void pluginLoad()
+                {
+                    if(notifyPeriod.empty())
+                        return;
+
+                    Environment<>::get().PluginConnector().setNotificationPeriod(this, notifyPeriod);
+
+                    PMACC_VERIFY_MSG(
+                        maxParticlesToMerge > 1u,
+                        std::string("[Plugin: ") + prefix
+                            + "] maxParticlesToMerge"
+                              " has to be greater than one.");
+                    PMACC_VERIFY_MSG(
+                        ratioDeletedParticles > 0.0_X,
+                        std::string("[Plugin: ") + prefix
+                            + "] ratioDeletedParticles"
+                              " has to be > 0.");
+                    PMACC_VERIFY_MSG(
+                        ratioDeletedParticles < 1.0_X,
+                        std::string("[Plugin: ") + prefix
+                            + "] ratioDeletedParticles"
+                              " has to be < 1.");
+                    PMACC_VERIFY_MSG(
+                        posSpreadThreshold >= 0.0_X,
+                        std::string("[Plugin: ") + prefix
+                            + "] posSpreadThreshold"
+                              " has to be non-negative.");
+                    PMACC_VERIFY_MSG(
+                        momSpreadThreshold >= 0.0_X,
+                        std::string("[Plugin: ") + prefix
+                            + "] momSpreadThreshold"
+                              " has to be non-negative.");
+                }
+
+                void pluginUnload()
+                {
+                }
+
+                void restart(uint32_t, const std::string)
+                {
+                }
+
+                void checkpoint(uint32_t, const std::string)
+                {
+                }
+            };
+
+
+            /** Placeholder implementation for species without the required conditions
+             *
+             * @tparam T_ParticlesType species type
+             */
+            template<class T_ParticlesType>
+            struct RandomizedParticleMergerWrapped<T_ParticlesType, false> : ISimulationPlugin
+            {
+            private:
+                std::string name;
+                std::string prefix;
+                std::string notifyPeriod;
+                MappingDesc* cellDescription;
+
+            public:
+                using ParticlesType = T_ParticlesType;
+
+                RandomizedParticleMergerWrapped()
+                    : name("RandomizedParticleMerger: merges several macroparticles with"
+                           " similar position and momentum into a single one.\n"
+                           "plugin disabled. Enable plugin by adding the `voronoiCellId`"
+                           " attribute to the particle attribute list.")
+                    , prefix(ParticlesType::FrameType::getName() + std::string("_randomizedMerger"))
+                    , cellDescription(nullptr)
+                {
+                    Environment<>::get().PluginConnector().registerPlugin(this);
+                }
+
+                std::string pluginGetName() const
+                {
+                    return this->name;
+                }
+
+            protected:
+                void setMappingDescription(MappingDesc*)
+                {
+                }
+
+                void pluginRegisterHelp(po::options_description&)
+                {
+                }
+
+                void pluginUnload()
+                {
+                }
+
+                void restart(uint32_t, const std::string)
+                {
+                }
+
+                void checkpoint(uint32_t, const std::string)
+                {
+                }
+
+                void notify(uint32_t)
+                {
+                }
+            };
+
+            /** Randomized particle merger plugin
+             *
+             * @tparam T_ParticlesType species type
+             */
+            template<typename T_ParticlesType>
+            struct RandomizedParticleMerger : RandomizedParticleMergerWrapped<T_ParticlesType>
+            {
+            };
+
+        } // namespace randomizedParticleMerger
+    } // namespace plugins
+} // namespace picongpu
diff --git a/include/picongpu/plugins/randomizedParticleMerger/RandomizedParticleMerger.kernel b/include/picongpu/plugins/randomizedParticleMerger/RandomizedParticleMerger.kernel
new file mode 100644
index 0000000000..467e242439
--- /dev/null
+++ b/include/picongpu/plugins/randomizedParticleMerger/RandomizedParticleMerger.kernel
@@ -0,0 +1,508 @@
+/* Copyright 2017-2021 Heiko Burau, Xeinia Bastrakova, Sergei Bastrakov
+ *
+ * This file is part of PIConGPU.
+ *
+ * PIConGPU is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * PIConGPU is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with PIConGPU.
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "picongpu/plugins/randomizedParticleMerger/VoronoiCell.hpp"
+#include "picongpu/particles/access/Cell2Particle.hpp"
+#include "picongpu/particles/filter/filter.hpp"
+
+#include <pmacc/memory/Array.hpp>
+#include <pmacc/memory/IndexPool.hpp>
+
+namespace picongpu
+{
+    namespace plugins
+    {
+        namespace randomizedParticleMerger
+        {
+            /** Implements a randomized modification of the particle merging algorithm.
+             *
+             * The original particle merging algorithm is described in
+             * Luu, P. T., Tueckmantel, T., & Pukhov, A. (2016).
+             * Voronoi particle merging algorithm for PIC codes.
+             * Computer Physics Communications, 202, 165-174.
+             *
+             * The randomized modification was developed by S. Bastrakov and X. Bastrakova
+             *
+             * @tparam T_ParticlesBox container of the particle species
+             */
+            template<class T_ParticlesBox>
+            struct RandomizedParticleMergerKernel
+            {
+                using ParticlesBox = T_ParticlesBox;
+
+                //! Random factory type, has to match the factory type created by the plugin (Uniform<float_X>)
+                using RngFactory = particles::functor::misc::Rng<pmacc::random::distributions::Uniform<float_X>>;
+
+            private:
+                using FramePtr = typename ParticlesBox::FramePtr;
+                using FrameType = typename ParticlesBox::FrameType;
+                using ArrayVoronoiCells
+                    = memory::Array<VoronoiCell, picongpu::plugins::particleMerging::MAX_VORONOI_CELLS>;
+                using VoronoiIndexPool
+                    = memory::IndexPool<voronoiCellId::type, picongpu::plugins::particleMerging::MAX_VORONOI_CELLS>;
+
+                ParticlesBox particlesBox;
+                /** minimal number of macroparticles needed to divide
+                    the macroparticle collection  */
+                uint32_t maxParticlesToMerge;
+
+                pmacc::math::Int<simDim> guardSuperCells;
+                /** estimated fraction of macroparticles
+                    remaining after the merging process  */
+                float_X ratioKeptParticles;
+                /** min position threshold for
+                    macroparticles to be merged */
+                float_X posSpreadThreshold;
+                /** min momentum threshold for
+                    macroparticles to be merged */
+                float_X momSpreadThreshold;
+                /** factory for getting random values */
+                RngFactory rngFactory;
+                using RandomGen = RngFactory::RandomGen;
+
+            public:
+                RandomizedParticleMergerKernel(
+                    ParticlesBox particlesBox,
+                    uint32_t maxParticlesToMerge,
+                    float_X ratioDeletedParticles,
+                    float_X posSpreadThreshold,
+                    float_X momSpreadThreshold,
+                    RngFactory rngFactory,
+                    const pmacc::math::Int<simDim> guardSuperCells)
+                    : particlesBox(particlesBox) // initializers are listed in member declaration order
+                    , maxParticlesToMerge(maxParticlesToMerge)
+                    , guardSuperCells(guardSuperCells)
+                    , ratioKeptParticles(1.0_X - ratioDeletedParticles) // stored as the complementary ratio
+                    , posSpreadThreshold(posSpreadThreshold)
+                    , momSpreadThreshold(momSpreadThreshold)
+                    , rngFactory(rngFactory)
+                {
+                }
+
+                /** map cell index to the initial Voronoi cell by aggregating N^simDim 'normal'
+                 * cells to a single Voronoi cell.
+                 *
+                 * @param cellIdx cell index
+                 */
+                DINLINE voronoiCellId::type mapCellIdxToInitialVoronoiCell(const uint32_t cellIdx) const
+                {
+                    const DataSpace<simDim> cellIdxDim
+                        = DataSpaceOperations<simDim>::template map<SuperCellSize>(cellIdx);
+
+                    const DataSpace<simDim> voronoiCellDim = cellIdxDim / 2;
+
+                    return static_cast<voronoiCellId::type>(pmacc::math::linearize(
+                        pmacc::math::CT::shrinkTo<SuperCellSize, simDim - 1>::type::toRT() / 2,
+                        voronoiCellDim));
+                }
+
+                /** Init the Voronoi cell id attribute for each particle in the super cell.
+                 *
+                 * The initial Voronoi cell is chosen by aggregating N^simDim 'normal' cells
+                 * to a single Voronoi cell.
+                 *
+                 * @param cellIdx cell index
+                 */
+                template<typename T_Acc>
+                DINLINE void initVoronoiCellIdAttribute(T_Acc const& acc, const pmacc::math::Int<simDim>& cellIdx)
+                {
+                    //! \todo change this as soon as the kernel support lock step programming
+                    constexpr uint32_t numWorkers = pmacc::math::CT::volume<SuperCellSize>::type::value;
+                    const uint32_t workerIdx
+                        = DataSpaceOperations<simDim>::template map<SuperCellSize>(cellIdx % SuperCellSize::toRT());
+                    particleAccess::Cell2Particle<SuperCellSize, numWorkers> forEachFrame;
+                    forEachFrame(
+                        acc,
+                        particlesBox,
+                        workerIdx,
+                        cellIdx,
+                        [this](const T_Acc& acc, FramePtr frame, const int linearThreadIdx) {
+                            auto particle = frame[linearThreadIdx];
+                            const lcellId_t particleCellIdx = particle[localCellIdx_];
+                            particle[voronoiCellId_] = this->mapCellIdxToInitialVoronoiCell(particleCellIdx);
+                        },
+                        particles::filter::All{});
+                }
+
+                /** Calculate position of particle within a super cell.
+                 *
+                 * @param particleCellIdx local particle cell index
+                 * @param positionWithinCell position within cell
+                 * @return position of particle with respect to its super cell's origin
+                 */
+                DINLINE floatD_X
+                getParticlePosWithinSuperCell(const lcellId_t particleCellIdx, const floatD_X positionWithinCell) const
+                {
+                    const DataSpace<simDim> particleCellIdxDim
+                        = DataSpaceOperations<simDim>::template map<SuperCellSize>(particleCellIdx);
+
+                    floatD_X result;
+                    for(int i = 0; i < simDim; i++)
+                    {
+                        result[i] = static_cast<float_X>(particleCellIdxDim[i]) + positionWithinCell[i];
+                    }
+
+                    return result;
+                }
+
+                /** Split a position into an integer cell index and the remaining offset within that cell
+                 *
+                 * @param absoluteParticlePos position to split, each component is truncated to int
+                 * @param[out] particleCellIdx receives the truncated (integer) part of each component
+                 * @param[out] positionWithinCell receives the position minus its truncated part
+                 */
+                DINLINE void getSuperCellPos(
+                    const floatD_X absoluteParticlePos,
+                    ::pmacc::math::Vector<int, simDim>& particleCellIdx,
+                    floatD_X& positionWithinCell)
+                {
+                    for(int i = 0; i < simDim; i++)
+                    {
+                        particleCellIdx[i] = static_cast<int>(absoluteParticlePos[i]);
+                        positionWithinCell[i] = absoluteParticlePos[i] - particleCellIdx[i];
+                    }
+                }
+
+                /** Decide, partly at random, whether a Voronoi cell has to be subdivided
+                 *
+                 * @param randomGen random generator functor
+                 * @param voronoiCell Voronoi cell to examine
+                 * @return true if the cell should be subdivided
+                 */
+                DINLINE bool isNeededSubdivision(RandomGen& randomGen, VoronoiCell const& voronoiCell) const
+                {
+                    // With large enough number of macroparticles we always subdivide
+                    if(voronoiCell.numMacroParticles > maxParticlesToMerge)
+                        return true;
+
+                    // Otherwise we compute subdivision probability based on the parameters
+                    // and the number of macroparticles in the cell
+                    float_X halfDivisionCoefficient
+                        = (voronoiCell.expectedNumMacroParticles + voronoiCell.numMacroParticles) / 2.0_X;
+
+                    float_X subdivisionProbability
+                        = (voronoiCell.expectedNumMacroParticles - 1.0_X) / (halfDivisionCoefficient - 1.0_X);
+
+                    // Special probability equations for small Voronoi cells, they replace the general value above
+                    if(voronoiCell.numMacroParticles == 2)
+                        subdivisionProbability = voronoiCell.expectedNumMacroParticles - 1.0_X;
+                    if(voronoiCell.numMacroParticles == 3)
+                        subdivisionProbability = (voronoiCell.expectedNumMacroParticles - 1.0_X) / 2.0_X;
+
+                    return randomGen() < subdivisionProbability;
+                }
+
+
+                DINLINE bool isSpreadEnoughForSubdivision(uint8_t& splittingComponent, VoronoiCell& voronoiCell) const
+                {
+                    bool isSpreadEnoughForSubdivision = true;
+                    float_X maxSpreadValue = voronoiCell.getMaxValueSpread2(splittingComponent, simDim);
+
+                    if(voronoiCell.splittingStage == VoronoiSplittingStage::position
+                       && maxSpreadValue < posSpreadThreshold)
+                    {
+                        voronoiCell.invertSplittingStage();
+                        maxSpreadValue = voronoiCell.getMaxValueSpread2(splittingComponent, simDim);
+                        if(maxSpreadValue < momSpreadThreshold)
+                        {
+                            voronoiCell.setToReadyForMerging();
+                            isSpreadEnoughForSubdivision = false;
+                        }
+                    }
+                    if(voronoiCell.splittingStage == VoronoiSplittingStage::momentum
+                       && maxSpreadValue < momSpreadThreshold)
+                    {
+                        voronoiCell.invertSplittingStage();
+                        maxSpreadValue = voronoiCell.getMaxValueSpread2(splittingComponent, simDim);
+                        if(maxSpreadValue < posSpreadThreshold)
+                        {
+                            voronoiCell.setToReadyForMerging();
+                            isSpreadEnoughForSubdivision = false;
+                        }
+                    }
+                    return isSpreadEnoughForSubdivision;
+                }
+
+                /** Merge all particles of a Voronoi cell into one
+                 *
+                 * The first particle of the cell is kept and receives the cell's mean position,
+                 * the cell's mean momentum times numRealParticles, and numRealParticles as weighting.
+                 * Every other particle of the cell is removed by setting its multiMask to 0.
+                 *
+                 * @tparam T_Particle particle type
+                 * @tparam T_Acc accelerator type
+                 * @param acc accelerator
+                 * @param particle current particle
+                 * @param voronoiCell current Voronoi cell
+                 */
+                template<typename T_Particle, typename T_Acc>
+                DINLINE void mergeVoronoiCell(T_Acc const& acc, T_Particle& particle, VoronoiCell& voronoiCell)
+                {
+                    if(voronoiCell.isFirstParticle(acc))
+                    {
+                        /* I am the first particle in the Voronoi cell
+                         * => get dressed with Voronoi cell's attributes
+                         */
+
+                        auto particleCellIdx = pmacc::DataSpace<simDim>::create(0);
+                        auto relativePosition = floatD_X::create(0.0_X);
+                        getSuperCellPos(voronoiCell.meanPositionValue, particleCellIdx, relativePosition);
+                        lcellId_t localCellIdx = pmacc::math::linearize(
+                            pmacc::math::CT::shrinkTo<SuperCellSize, simDim - 1>::type::toRT(),
+                            particleCellIdx);
+
+                        particle[localCellIdx_] = localCellIdx;
+                        particle[position_] = relativePosition;
+                        /* Here the voronoiCell.meanMomentumValue is for a single particle,
+                         * multiply to make it for macroparticle
+                         */
+                        particle[momentum_] = voronoiCell.meanMomentumValue * voronoiCell.numRealParticles;
+                        particle[weighting_] = voronoiCell.numRealParticles;
+                    }
+                    else
+                    {
+                        // I am not the first particle in the Voronoi cell => remove me
+                        particle[multiMask_] = 0;
+                    }
+                }
+
+                /** This method handles the merging process on the single-particle level.
+                 *
+                 * It is called in the main loop of the merging algorithm.
+                 * Depending on the state of the Voronoi cell where the particle belongs
+                 * to the execution is forked into distinct sub-processes.
+                 *
+                 * @tparam T_Acc accelerator type
+                 *
+                 * @param acc accelerator
+                 * @param cellIdx n-dimensional cell index from the origin of the local domain
+                 * @param listVoronoiCells fixed-sized array of Voronoi cells
+                 */
+                template<typename T_Acc>
+                DINLINE void processParticles(
+                    T_Acc const& acc,
+                    const pmacc::math::Int<simDim>& cellIdx,
+                    ArrayVoronoiCells& listVoronoiCells)
+                {
+                    //! \todo change this as soon as the kernel support lock step programming
+                    constexpr uint32_t numWorkers = pmacc::math::CT::volume<SuperCellSize>::type::value;
+                    const uint32_t workerIdx
+                        = DataSpaceOperations<simDim>::template map<SuperCellSize>(cellIdx % SuperCellSize::toRT());
+                    particleAccess::Cell2Particle<SuperCellSize, numWorkers> forEachFrame;
+                    forEachFrame(
+                        acc,
+                        this->particlesBox,
+                        workerIdx,
+                        cellIdx,
+                        [&](const T_Acc& acc, FramePtr frame, const int linearThreadIdx) {
+                            auto particle = frame[linearThreadIdx];
+                            const voronoiCellId::type voronoiCellId = particle[voronoiCellId_];
+
+                            if(voronoiCellId == -1)
+                                return;
+
+                            VoronoiCell& voronoiCell = listVoronoiCells[voronoiCellId];
+
+                            const floatD_X position
+                                = this->getParticlePosWithinSuperCell(particle[localCellIdx_], particle[position_]);
+
+                            const float_X weighting = particle[weighting_];
+                            /* Algorithm internally operates with momentums for single
+                             * particles, not macroparticles, so convert
+                             */
+                            const float3_X singleParticleMomentum = particle[momentum_] / weighting;
+
+                            switch(voronoiCell.status)
+                            {
+                            case VoronoiStatus::collecting:
+                                voronoiCell.addParticle(acc, position, singleParticleMomentum, weighting);
+                                break;
+
+                            case VoronoiStatus::splitting:
+                            {
+                                const voronoiCellId::type subVoronoiCellId
+                                    = voronoiCell.getSubVoronoiCell(position, singleParticleMomentum);
+                                particle[voronoiCellId_] = subVoronoiCellId;
+                                listVoronoiCells[subVoronoiCellId]
+                                    .addParticle(acc, position, singleParticleMomentum, weighting);
+
+                                break;
+                            }
+
+                            case VoronoiStatus::abort:
+                                particle[voronoiCellId_] = -1;
+                                break;
+
+                            case VoronoiStatus::readyForMerging:
+                                mergeVoronoiCell(acc, particle, voronoiCell);
+                                particle[voronoiCellId_] = -1;
+                            }
+                        },
+                        particles::filter::All{});
+                }
+
+                /** Advance the state machine of every active Voronoi cell by one step.
+                 *
+                 * It is called by a single thread in the main loop of the merging algorithm.
+                 * For each index handed out by `voronoiIndexPool`:
+                 *   - a cell in `collecting` state with fewer than two macroparticles is set to abort,
+                 *   - otherwise its precalculation values are finalized and the cell is either
+                 *     set to splitting (two new sub cells are requested from the pool and
+                 *     initialized) or set to ready-for-merging,
+                 *   - a cell in any other state gets its index released back to the pool.
+                 *
+                 * @param listVoronoiCells fixed-sized array of Voronoi cells
+                 * @param voronoiIndexPool holds indices of active Voronoi cells within `listVoronoiCells`
+                 * @param randomGen random generator functor
+                 */
+                DINLINE void processVoronoiCells(
+                    ArrayVoronoiCells& listVoronoiCells,
+                    VoronoiIndexPool& voronoiIndexPool,
+                    RandomGen& randomGen) const
+                {
+                    for(voronoiCellId::type voronoiCellId : voronoiIndexPool)
+                    {
+                        VoronoiCell& voronoiCell = listVoronoiCells[voronoiCellId];
+                        switch(voronoiCell.status)
+                        {
+                        case VoronoiStatus::collecting:
+                            // Nothing to merge with fewer than two macroparticles
+                            if(voronoiCell.numMacroParticles < 2)
+                            {
+                                voronoiCell.setToAbort();
+                                break;
+                            }
+                            voronoiCell.finalizePrecalculationValues(maxParticlesToMerge, ratioKeptParticles);
+
+                            // Check if subdivision is needed probabilistically
+                            if(isNeededSubdivision(randomGen, voronoiCell))
+                            {
+                                /* Find the component with the largest spread. Positions have
+                                 * simDim components, momenta always have three, so the number
+                                 * of inspected components has to follow the splitting stage.
+                                 * Only the component index is needed here, the spread value
+                                 * itself is not used.
+                                 */
+                                uint8_t splittingComponent;
+                                if(voronoiCell.splittingStage == VoronoiSplittingStage::position)
+                                    voronoiCell.getMaxPositionSpread2(splittingComponent);
+                                else
+                                    voronoiCell.getMaxMomentumSpread2(splittingComponent);
+
+                                // Continue only when the subdivision makes sense in terms of the spread
+                                // NOTE(review): no code visible here changes the status before this
+                                // break, so the cell stays in `collecting` state -- confirm this is
+                                // handled by isSpreadEnoughForSubdivision or by the next iteration.
+                                if(!isSpreadEnoughForSubdivision(splittingComponent, voronoiCell))
+                                    break;
+
+                                voronoiCell.setToSplitting(
+                                    splittingComponent,
+                                    voronoiIndexPool.get(),
+                                    voronoiIndexPool.get());
+
+                                // Abort when no memory for more Voronoi cells
+                                if(voronoiCell.lowerCellId == -1 || voronoiCell.higherCellId == -1)
+                                {
+                                    voronoiCell.setToAbort();
+                                    break;
+                                }
+
+                                // For better subdivision, the sub cells split by the other quantity
+                                const VoronoiSplittingStage subCellStage
+                                    = voronoiCell.splittingStage == VoronoiSplittingStage::position
+                                    ? VoronoiSplittingStage::momentum
+                                    : VoronoiSplittingStage::position;
+
+                                /* initialize the two new sub Voronoi cells in `collecting` state */
+                                listVoronoiCells[voronoiCell.lowerCellId] = VoronoiCell(
+                                    subCellStage,
+                                    voronoiCell.numMacroParticles,
+                                    voronoiCell.expectedNumMacroParticles);
+                                listVoronoiCells[voronoiCell.higherCellId] = VoronoiCell(
+                                    subCellStage,
+                                    voronoiCell.numMacroParticles,
+                                    voronoiCell.expectedNumMacroParticles);
+
+                                break;
+                            }
+                            else
+                            {
+                                voronoiCell.setToReadyForMerging();
+                                break;
+                            }
+
+                        default:
+                            // Cells in any other state are finished: give the index back
+                            voronoiIndexPool.release(voronoiCellId);
+                            break;
+                        }
+                    }
+                }
+
+                /** Entry point of the particle merging algorithm
+                 *
+                 * Sets up the Voronoi cells and the index pool in shared memory, assigns the
+                 * particles to the initial cells and then alternates between processing the
+                 * particles and processing the Voronoi cells until the index pool is empty.
+                 *
+                 * @tparam T_Acc accelerator type
+                 *
+                 * @param acc accelerator
+                 * @param cellIndex n-dimensional cell index from the origin of the local domain
+                 */
+                template<typename T_Acc>
+                DINLINE void operator()(T_Acc const& acc, const pmacc::math::Int<simDim>& cellIndex)
+                {
+                    // multi-dim vector from origin of the super cell to a cell in units of cells
+                    const pmacc::math::Int<simDim> threadIndex = cellIndex % SuperCellSize::toRT();
+                    // linear index of this cell (and thereby of this thread) within the super cell
+                    const int linearThreadIdx = pmacc::math::linearize(
+                        pmacc::math::CT::shrinkTo<SuperCellSize, simDim - 1>::type::toRT(),
+                        threadIndex);
+
+                    // Storage for Voronoi cells and the pool of their indices in shared memory
+                    PMACC_SMEM(acc, listVoronoiCells, ArrayVoronoiCells);
+                    PMACC_SMEM(acc, voronoiIndexPool, VoronoiIndexPool);
+
+                    /* number of initial Voronoi cells: the number of cells of a super cell
+                     * divided by 2^simDim.
+                     * `1u << simDim` is equivalent to `pow(2, simDim)` but can be
+                     * calculated at compile-time to save a shared variable.
+                     */
+                    constexpr uint16_t numInitialVoronoiCells
+                        = pmacc::math::CT::volume<SuperCellSize>::type::value / (1u << simDim);
+
+
+                    // super cell index of this cell, shifted by the guard super cells
+                    pmacc::math::Int<simDim> localOffset = cellIndex / SuperCellSize::toRT() - guardSuperCells;
+                    // one worker per cell of the super cell
+                    constexpr uint32_t numWorkers = pmacc::math::CT::volume<SuperCellSize>::type::value;
+                    pmacc::mappings::threads::WorkerCfg<numWorkers> workerCfg(linearThreadIdx);
+
+                    // Thread 0 of each block sets up the index pool for the initial Voronoi cells
+                    if(linearThreadIdx == 0)
+                    {
+                        voronoiIndexPool = VoronoiIndexPool(numInitialVoronoiCells);
+                    }
+                    __syncthreads();
+
+                    // Set initial Voronoi cells into `collecting` state, one cell per thread
+                    if(linearThreadIdx < numInitialVoronoiCells)
+                        listVoronoiCells[linearThreadIdx] = VoronoiCell();
+                    __syncthreads();
+
+                    // Distribute particles between the initial Voronoi cells
+                    initVoronoiCellIdAttribute(acc, cellIndex);
+                    __syncthreads();
+
+                    auto generator = rngFactory(acc, localOffset, workerCfg);
+                    // Main loop of the algorithm: while there are active cells left
+                    while(voronoiIndexPool.size() > 0)
+                    {
+                        processParticles(acc, cellIndex, listVoronoiCells);
+                        __syncthreads();
+
+                        // This part is not yet parallelized between the threads of a block:
+                        // only thread 0 advances the states of the Voronoi cells
+                        if(linearThreadIdx == 0)
+                            processVoronoiCells(listVoronoiCells, voronoiIndexPool, generator);
+                        __syncthreads();
+                    }
+                }
+            };
+
+        } // namespace randomizedParticleMerger
+    } // namespace plugins
+} // namespace picongpu
diff --git a/include/picongpu/plugins/randomizedParticleMerger/VoronoiCell.hpp b/include/picongpu/plugins/randomizedParticleMerger/VoronoiCell.hpp
new file mode 100644
index 0000000000..6e12233f7a
--- /dev/null
+++ b/include/picongpu/plugins/randomizedParticleMerger/VoronoiCell.hpp
@@ -0,0 +1,372 @@
+/* Copyright 2017-2021 Heiko Burau, Xeinia Bastrakova, Sergei Bastrakov
+ *
+ * This file is part of PIConGPU.
+ *
+ * PIConGPU is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * PIConGPU is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with PIConGPU.
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "picongpu/algorithms/KinEnergy.hpp"
+
+#include <pmacc/types.hpp>
+
+#include <cstdint>
+
+
+namespace picongpu
+{
+    namespace plugins
+    {
+        namespace randomizedParticleMerger
+        {
+            //! Status of a Voronoi cell
+            enum struct VoronoiStatus : uint8_t
+            {
+                //! the Voronoi cell is collecting particles (first state)
+                collecting,
+                /** the Voronoi cell is splitting, thus all its particles have
+                 * to move to one of two sub-Voronoi cells
+                 */
+                splitting,
+                /** the Voronoi cell needs to be destroyed. Before this can happen
+                 * all its particles need to clear their voronoiCellId attribute.
+                 */
+                abort,
+                //! the Voronoi cell is ready for merging. After merging it is destroyed.
+                readyForMerging,
+            };
+
+
+            /** Stage of a Voronoi cell
+             *
+             * The splitting process is two-fold: at first, the splitting is done regarding
+             * only the spread in position and then by looking at the spread of momentum.
+             */
+            enum struct VoronoiSplittingStage : bool
+            {
+                //! the spatial distribution is split
+                position,
+                //! the momentum distribution is split
+                momentum
+            };
+
+            /** Voronoi cell representation
+             *
+             * A cell accumulates weighted sums of position, momentum and their squares
+             * of all particles added via addParticle(). After finalizeMeanValues() these
+             * members hold the weighted mean values, which are used to compute the spread
+             * and to decide in which sub cell a particle falls when the cell is split.
+             */
+            struct VoronoiCell
+            {
+                /** current state of the cell */
+                VoronoiStatus status;
+                /** quantity (position or momentum) this cell uses for splitting */
+                VoronoiSplittingStage splittingStage;
+                /** number of macroparticles */
+                uint32_t numMacroParticles;
+                /** number of physical particles (sum of the particle weightings) */
+                float_X numRealParticles;
+
+                /** value of weighted mean momentum for all physical particles in cell
+                 *  (weighted sum until finalizeMeanValues() is called) */
+                float3_X meanMomentumValue;
+                /** value of weighted mean position for all physical particles in cell
+                 *  (weighted sum until finalizeMeanValues() is called) */
+                float3_X meanPositionValue;
+                /** value of weighted squared mean momentum for all physical particles in cell
+                 *  (weighted sum until finalizeMeanValues() is called) */
+                float3_X meanMomentumSquaredValue;
+                /** value of weighted squared mean position for all physical particles in cell
+                 *  (weighted sum until finalizeMeanValues() is called) */
+                float3_X meanPositionSquaredValue;
+
+                /** axis on which the Voronoi cell is divided */
+                uint8_t splittingComponent;
+                /** cell index of the child "lower" subcell, -1 if there is none */
+                int32_t lowerCellId;
+                /** cell index of the child "upper" subcell, -1 if there is none */
+                int32_t higherCellId;
+                /** 0 until the first particle has claimed the cell via isFirstParticle(), 1 afterwards */
+                int firstParticleFlag;
+                /** necessary for the probabilistic algorithm: expected number of particles,
+                    which should be obtained after merging the particles in the current cell */
+                float_X expectedNumMacroParticles;
+                /** number of macroparticles in parent cell */
+                uint32_t parentNumMacroParticles;
+                /** necessary for the probabilistic algorithm: expected number of particles,
+                    which should be obtained after merging the particles in the parent cell */
+                float_X parentExpectedNumMacroParticles;
+
+                /** Create an empty Voronoi cell in `collecting` state
+                 *
+                 * All members are initialized, the sub cell indices are set to -1 (no sub cell).
+                 *
+                 * @param splittingStage quantity the cell uses for splitting
+                 * @param parentNumMacroParticles number of macroparticles in the parent cell
+                 * @param parentExpectedNumMacroParticles expected number of macroparticles of the
+                 *                                        parent cell, a negative value marks an
+                 *                                        original cell without a parent
+                 */
+                HDINLINE
+                VoronoiCell(
+                    VoronoiSplittingStage splittingStage = VoronoiSplittingStage::position,
+                    uint32_t parentNumMacroParticles = 0u,
+                    float_X parentExpectedNumMacroParticles = float_X(-1.0))
+                    : status(VoronoiStatus::collecting)
+                    , splittingStage(splittingStage)
+                    , numMacroParticles(0u)
+                    , numRealParticles(0.0_X)
+                    , meanMomentumValue(float3_X::create(0.0_X))
+                    , meanPositionValue(float3_X::create(0.0_X))
+                    , meanMomentumSquaredValue(float3_X::create(0.0_X))
+                    , meanPositionSquaredValue(float3_X::create(0.0_X))
+                    , splittingComponent(0u)
+                    , lowerCellId(-1)
+                    , higherCellId(-1)
+                    , firstParticleFlag(0)
+                    , expectedNumMacroParticles(0.0_X)
+                    , parentNumMacroParticles(parentNumMacroParticles)
+                    , parentExpectedNumMacroParticles(parentExpectedNumMacroParticles)
+                {
+                }
+
+                /** Mark the cell for being aborted */
+                HDINLINE
+                void setToAbort()
+                {
+                    status = VoronoiStatus::abort;
+                }
+
+                /** Mark the cell for splitting
+                 *
+                 * @param splittingComponent index of position or momentum component
+                 *                           to use for splitting
+                 * @param lowerCellId cell index of a new "lower" subcell
+                 * @param higherCellId cell index of a new "upper" subcell
+                 */
+                HDINLINE
+                void setToSplitting(
+                    const uint8_t splittingComponent,
+                    const int32_t lowerCellId,
+                    const int32_t higherCellId)
+                {
+                    status = VoronoiStatus::splitting;
+                    this->splittingComponent = splittingComponent;
+                    this->lowerCellId = lowerCellId;
+                    this->higherCellId = higherCellId;
+                }
+
+                /** Mark the cell as ready for merging */
+                HDINLINE
+                void setToReadyForMerging()
+                {
+                    this->status = VoronoiStatus::readyForMerging;
+                }
+
+                /** Check if the current thread is associated to the first particle
+                 *
+                 * Exchanges firstParticleFlag with 1; only the caller that still
+                 * observed the value 0 gets `true`.
+                 *
+                 * @param acc accelerator
+                 * @return true for the first caller, false for all later ones
+                 */
+                template<typename T_Acc>
+                DINLINE bool isFirstParticle(const T_Acc& acc)
+                {
+                    // NOTE(review): `acc` is not referenced explicitly; presumably the
+                    // unqualified atomicExch relies on it being in scope -- confirm.
+                    return atomicExch(&this->firstParticleFlag, 1) == 0;
+                }
+
+                /** Add a particle to this Voronoi cell
+                 *
+                 * Atomically increments the particle counters and adds the weighted
+                 * position, momentum and their squares to the accumulators.
+                 *
+                 * @param acc accelerator
+                 * @param position position of the particle
+                 * @param momentum momentum of the particle
+                 * @param weighting weighting of the particle
+                 */
+                template<typename T_Acc>
+                DINLINE void addParticle(
+                    const T_Acc& acc,
+                    const floatD_X position,
+                    const float3_X momentum,
+                    const float_X weighting)
+                {
+                    cupla::atomicAdd(
+                        acc,
+                        &this->numMacroParticles,
+                        static_cast<uint32_t>(1),
+                        ::alpaka::hierarchy::Threads{});
+                    cupla::atomicAdd(acc, &this->numRealParticles, weighting, ::alpaka::hierarchy::Threads{});
+
+                    const floatD_X position2 = position * position;
+
+                    // positions only have simDim components
+                    for(int i = 0; i < simDim; i++)
+                    {
+                        cupla::atomicAdd(
+                            acc,
+                            &this->meanPositionValue[i],
+                            weighting * position[i],
+                            ::alpaka::hierarchy::Threads{});
+                        cupla::atomicAdd(
+                            acc,
+                            &this->meanPositionSquaredValue[i],
+                            weighting * position2[i],
+                            ::alpaka::hierarchy::Threads{});
+                    }
+
+                    const float3_X momentum2 = momentum * momentum;
+
+                    // momenta always have three components
+                    for(int i = 0; i < DIM3; i++)
+                    {
+                        cupla::atomicAdd(
+                            acc,
+                            &this->meanMomentumValue[i],
+                            weighting * momentum[i],
+                            ::alpaka::hierarchy::Threads{});
+                        cupla::atomicAdd(
+                            acc,
+                            &this->meanMomentumSquaredValue[i],
+                            weighting * momentum2[i],
+                            ::alpaka::hierarchy::Threads{});
+                    }
+                }
+
+                /** Compute the values that are necessary before processing the Voronoi cell:
+                 *  mean values and expected number of macroparticles
+                 *
+                 * @param minMacroParticlesToDivide min number of macroparticles in a cell
+                 *                                  such that the cell is always subdivided
+                 * @param ratioKeptParticles ratio of particles that are kept on average
+                 */
+                HDINLINE
+                void finalizePrecalculationValues(
+                    const uint32_t minMacroParticlesToDivide,
+                    const float_X ratioKeptParticles)
+                {
+                    finalizeMeanValues();
+                    finalizeExpectedNumberParticles(minMacroParticlesToDivide, ratioKeptParticles);
+                }
+
+                /** Finalize calculation of mean values
+                 *
+                 * Divides the accumulated weighted sums by the number of physical particles.
+                 * Calling it a second time divides the values again.
+                 */
+                HDINLINE
+                void finalizeMeanValues()
+                {
+                    meanMomentumValue /= numRealParticles;
+                    meanPositionValue /= numRealParticles;
+                    meanMomentumSquaredValue /= numRealParticles;
+                    meanPositionSquaredValue /= numRealParticles;
+                }
+
+                /** Compute the expected number of macroparticles in the cell
+                 *
+                 * @param minMacroParticlesToDivide min number of macroparticles in a cell
+                 *                                  such that the cell is always subdivided
+                 * @param ratioKeptParticles ratio of particles that are kept on average
+                 */
+                HDINLINE
+                void finalizeExpectedNumberParticles(
+                    const uint32_t minMacroParticlesToDivide,
+                    const float_X ratioKeptParticles)
+                {
+                    // Special case for the original voronoi cells
+                    if(parentExpectedNumMacroParticles < 0)
+                    {
+                        expectedNumMacroParticles = numMacroParticles * ratioKeptParticles;
+                        return;
+                    }
+
+                    /* Algorithm stop conditions for 1 and 2 macroparticles.
+                     * Return right away, otherwise the subdivision step below
+                     * would overwrite the value again.
+                     */
+                    if(numMacroParticles == 1u)
+                    {
+                        expectedNumMacroParticles = 1.0_X;
+                        return;
+                    }
+                    if(numMacroParticles == 2u && parentNumMacroParticles == 3u)
+                    {
+                        expectedNumMacroParticles = 2.0_X;
+                        return;
+                    }
+
+                    // Normal subdivision step
+                    if(parentNumMacroParticles > minMacroParticlesToDivide)
+                    {
+                        expectedNumMacroParticles = numMacroParticles * ratioKeptParticles;
+                    }
+                    else
+                    {
+                        // share of the parent's averaged (expected + actual) particle number
+                        // proportional to the number of macroparticles of this cell
+                        float_X undividedCellCoeff
+                            = (parentExpectedNumMacroParticles + parentNumMacroParticles) / 2.0_X;
+                        float_X currentExpectedNumMacroParticles
+                            = numMacroParticles * undividedCellCoeff / parentNumMacroParticles;
+                        expectedNumMacroParticles = currentExpectedNumMacroParticles;
+                    }
+                }
+
+                /** Determine in which of the two sub-Voronoi cells a particle falls
+                 *
+                 * Compares the splitting component of the position or momentum (depending
+                 * on the splitting stage) with the corresponding mean value of the cell.
+                 *
+                 * @param position position of the particle
+                 * @param momentum momentum of the particle
+                 * @return lowerCellId if the value is below the mean, higherCellId otherwise
+                 */
+                HDINLINE
+                int32_t getSubVoronoiCell(const floatD_X position, const float3_X momentum) const
+                {
+                    const float_X valParticle = splittingStage == VoronoiSplittingStage::position
+                        ? position[splittingComponent]
+                        : momentum[splittingComponent];
+                    const float_X meanVoronoi = splittingStage == VoronoiSplittingStage::position
+                        ? meanPositionValue[splittingComponent]
+                        : meanMomentumValue[splittingComponent];
+                    return valParticle < meanVoronoi ? lowerCellId : higherCellId;
+                }
+
+                /** Auxiliary function for getting the maximum mean squared deviation
+                 *  in position or momentum, depending on the splitting stage
+                 *
+                 * The deviation of a component is the mean of its squares minus the
+                 * square of its mean, using the values set by finalizeMeanValues().
+                 *
+                 * @param[out] component index of the component of most spread
+                 * @param dimension number of leading components taken into account
+                 * @return maximum squared spread value
+                 */
+                HDINLINE
+                float_X getMaxValueSpread2(uint8_t& component, const uint8_t dimension) const
+                {
+                    const float3_X meanValue2 = splittingStage == VoronoiSplittingStage::position
+                        ? meanPositionValue * meanPositionValue
+                        : meanMomentumValue * meanMomentumValue;
+
+                    const float3_X valueSpread2 = splittingStage == VoronoiSplittingStage::position
+                        ? meanPositionSquaredValue - meanValue2
+                        : meanMomentumSquaredValue - meanValue2;
+
+                    /* find component of most spread */
+                    component = 0;
+                    float_X maxValueSpread2 = valueSpread2[0];
+                    for(uint8_t i = 1; i < dimension; i++)
+                    {
+                        if(valueSpread2[i] > maxValueSpread2)
+                        {
+                            maxValueSpread2 = valueSpread2[i];
+                            component = i;
+                        }
+                    }
+
+                    return maxValueSpread2;
+                }
+
+                /** Calculate the maximum squared spread over the simDim position components
+                 *
+                 * The spread is taken from the quantity selected by the current splitting stage.
+                 *
+                 * @param[out] component index of the component of maximum spread
+                 * @return maximum squared spread
+                 */
+                HDINLINE
+                float_X getMaxPositionSpread2(uint8_t& component) const
+                {
+                    return getMaxValueSpread2(component, simDim);
+                }
+
+                /** Calculate the maximum squared spread over all three momentum components
+                 *
+                 * The spread is taken from the quantity selected by the current splitting stage.
+                 *
+                 * @param[out] component index of the component of maximum squared spread
+                 * @return maximum squared spread
+                 */
+                HDINLINE
+                float_X getMaxMomentumSpread2(uint8_t& component) const
+                {
+                    return getMaxValueSpread2(component, DIM3);
+                }
+
+                /** Invert the splitting stage: position becomes momentum and vice versa */
+                HDINLINE
+                void invertSplittingStage()
+                {
+                    if(splittingStage == VoronoiSplittingStage::position)
+                        splittingStage = VoronoiSplittingStage::momentum;
+                    else
+                        splittingStage = VoronoiSplittingStage::position;
+                }
+            };
+
+        } // namespace randomizedParticleMerger
+    } // namespace plugins
+} // namespace picongpu
diff --git a/include/picongpu/plugins/transitionRadiation/Calculator.hpp b/include/picongpu/plugins/transitionRadiation/Calculator.hpp
index 9dd452592e..3ccf0a522e 100644
--- a/include/picongpu/plugins/transitionRadiation/Calculator.hpp
+++ b/include/picongpu/plugins/transitionRadiation/Calculator.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera, Richard Pausch, Finn-Ole Carstens
+/* Copyright 2013-2021 Heiko Burau, Rene Widera, Richard Pausch, Finn-Ole Carstens
  *
  * This file is part of PIConGPU.
  *
@@ -24,217 +24,184 @@
 
 namespace picongpu
 {
-namespace plugins
-{
-namespace transitionRadiation
-{
-    using complex_X = pmacc::math::Complex< float_X >;
-    using complex_64 = pmacc::math::Complex< float_64 >;
-
-    /* Arbitrary margin which is necessary to prevent division by 0 error
-     * created by particles moving in the plane of the foil.
-     */
-    float_X const DIV_BY_ZERO_MINIMUM = 1.e-7;
-
-    /** Calculator class for calculation of transition radiation.
-     *
-     * @param particleSet transitionRadiation::Particle to compute transition radiation for
-     * @param lookDirection vector of observation direction
-     */
-    class Calculator
+    namespace plugins
     {
-
-    private:
-        transitionRadiation::Particle const & particle;
-        float3_X const & lookDirection;
-
-        float_X parMomSinTheta;
-        float_X parMomCosTheta;
-        float_X const parMomPhi;
-        float_X parMomSinPhi;
-        float_X parMomCosPhi;
-        float_X detectorSinTheta;
-        float_X detectorCosTheta;
-        float_X const detectorPhi;
-        float_X const uSquared;
-        float_X const parSqrtOnePlusUSquared;
-
-    public:
-        HDINLINE
-        Calculator(
-            transitionRadiation::Particle const & particleSet,
-            float3_X const & lookDirection
-        ) :
-            particle( particleSet ),
-            lookDirection( lookDirection ),
-            parMomPhi( particle.getMomPhi( ) ),
-            // one has to add pi to the polar angle, because phi is in the range of 0 to 2 \pi
-            detectorPhi(
-                picongpu::math::atan2(
-                    lookDirection.z( ),
-                    lookDirection.x( )
-                ) + picongpu::PI
-            ),
-            uSquared( particle.getU( ) * particle.getU( ) ),
-            parSqrtOnePlusUSquared(
-                picongpu::math::sqrt( 1 + uSquared )
-            )
-        {
-            // frequent calculations
-            // momentum Space for Particle:
-            picongpu::math::sincos(
-                particle.getMomTheta( ),
-                parMomSinTheta,
-                parMomCosTheta
-            );
-            picongpu::math::sincos(
-                parMomPhi - detectorPhi,
-                parMomSinPhi,
-                parMomCosPhi
-            );
-
-            // detector Position since lookDirection is normalized
-            float_X const detectorTheta = picongpu::math::acos( lookDirection.y( ) );
-
-            picongpu::math::sincos(
-                detectorTheta,
-                detectorSinTheta,
-                detectorCosTheta
-            );
-        }
-
-        /** Perpendicular part of normalized energy
-         *
-         * Calculates perpendicular part to movement direction of normalized energy
-         * determined by formula:
-         * @f[E_{perp} = (u^2 \cos{\psi} \sin{\psi} \sin{\phi} \cos{\theta}) /
-         *          ((\sqrt{1 + u^2} - u \sin{\psi} \cos{\phi} \sin{\theta})^2 - u^2 \cos{\phi}^2 \cos{\theta}^2)@f]
-         * where \psi is the azimuth angle of the particle momentum and \theta is
-         * the azimuth angle of the detector position to the movement direction y
-         *
-         * @return perpendicular part of normalized energy
-         */
-        HDINLINE
-        float_X
-        calcEnergyPerp( ) const
+        namespace transitionRadiation
         {
-            // a, x and y are temporary variables without an explicit physical meaning
-            float_X const a = uSquared * parMomCosTheta * parMomSinTheta *
-                parMomSinPhi * detectorCosTheta;
-
-            // Denominator
-            float_X const x = parSqrtOnePlusUSquared -
-                particle.getU( ) * parMomSinTheta * parMomCosPhi * detectorSinTheta;
-            float_X const y = particle.getU( ) * parMomCosTheta * detectorCosTheta;
-
-            float_X denominator = x * x - y * y;
-
-            // Preventing division by 0
-            if( math::abs( denominator ) < DIV_BY_ZERO_MINIMUM )
+            using complex_X = pmacc::math::Complex<float_X>;
+            using complex_64 = pmacc::math::Complex<float_64>;
+
+            /* Arbitrary margin which is necessary to prevent division by 0 error
+             * created by particles moving in the plane of the foil.
+             */
+            float_X const DIV_BY_ZERO_MINIMUM = 1.e-7;
+
+            /** Calculator class for calculation of transition radiation.
+             *
+             * @param particleSet transitionRadiation::Particle to compute transition radiation for
+             * @param lookDirection vector of observation direction
+             */
+            class Calculator
             {
-                if( denominator < 0.0 )
-                    denominator = -DIV_BY_ZERO_MINIMUM;
-                else
-                    denominator = DIV_BY_ZERO_MINIMUM;
-            }
-
-            return a / denominator;
-        }
-
-        /** Parallel part of normalized energy
-         *
-         * Calculates parallel part to movement direction of normalized energy
-         * determined by formula:
-         * @f[E_{para} = (u \cos{\psi} (u \sin{\psi} \cos{\phi} - \sqrt{1 + u^2} \sin{\theta}) /
-         *          ((\sqrt{1 + u^2} - u \sin{\psi} \cos{\phi} \sin{\theta})^2 - u^2 \cos{\phi}^2 \cos{\theta}^2)@f]
-         * where \psi is the azimuth angle of the particle momentum and \theta is
-         * the azimuth angle of the detector position to the movement direction y
-         *
-         * @return parallel part of normalized energy
-         */
-        HDINLINE
-        float_X
-        calcEnergyPara( ) const
-        {
-            // a, b, c, x and y are just temporary variables without an explicit physical meaning
-            float_X const a = particle.getU( ) * parMomCosTheta;
-            float_X const b = particle.getU( ) * parMomSinTheta * parMomCosPhi;
-            float_X const c = parSqrtOnePlusUSquared * detectorSinTheta;
-
-            // Denominator
-            float_X const x = parSqrtOnePlusUSquared -
-                particle.getU( ) * parMomSinTheta * parMomCosPhi * detectorSinTheta;
-            float_X const y = particle.getU( ) * parMomCosTheta * detectorCosTheta;
-
-            float_X denominator = x * x - y * y;
-
-            // Preventing division by 0
-            if( math::abs( denominator ) < DIV_BY_ZERO_MINIMUM )
+            private:
+                transitionRadiation::Particle const& particle;
+                float3_X const& lookDirection;
+
+                float_X parMomSinTheta;
+                float_X parMomCosTheta;
+                float_X const parMomPhi;
+                float_X parMomSinPhi;
+                float_X parMomCosPhi;
+                float_X detectorSinTheta;
+                float_X detectorCosTheta;
+                float_X const detectorPhi;
+                float_X const uSquared;
+                float_X const parSqrtOnePlusUSquared;
+
+            public:
+                HDINLINE
+                Calculator(transitionRadiation::Particle const& particleSet, float3_X const& lookDirection)
+                    : particle(particleSet)
+                    , lookDirection(lookDirection)
+                    , parMomPhi(particle.getMomPhi())
+                    ,
+                    // one has to add pi to the polar angle, because phi is in the range of 0 to 2 \pi
+                    detectorPhi(picongpu::math::atan2(lookDirection.z(), lookDirection.x()) + picongpu::PI)
+                    , uSquared(particle.getU() * particle.getU())
+                    , parSqrtOnePlusUSquared(picongpu::math::sqrt(1 + uSquared))
+                {
+                    // frequent calculations
+                    // momentum Space for Particle:
+                    pmacc::math::sincos(particle.getMomTheta(), parMomSinTheta, parMomCosTheta);
+                    pmacc::math::sincos(parMomPhi - detectorPhi, parMomSinPhi, parMomCosPhi);
+
+                    // detector Position since lookDirection is normalized
+                    float_X const detectorTheta = picongpu::math::acos(lookDirection.y());
+
+                    pmacc::math::sincos(detectorTheta, detectorSinTheta, detectorCosTheta);
+                }
+
+                /** Perpendicular part of normalized energy
+                 *
+                 * Calculates the part of the normalized energy perpendicular to the movement direction,
+                 * determined by the formula:
+                 * @f[E_{perp} = (u^2 \cos{\psi} \sin{\psi} \sin{\phi} \cos{\theta}) /
+                 *          ((\sqrt{1 + u^2} - u \sin{\psi} \cos{\phi} \sin{\theta})^2 - u^2 \cos{\psi}^2
+                 * \cos{\theta}^2)@f] where \psi is the azimuth angle of the particle momentum and \theta is the
+                 * azimuth angle of the detector position to the movement direction y; \phi is the polar angle
+                 * of the particle momentum minus the polar angle of the detector.
+                 * @return perpendicular part of normalized energy
+                 */
+                HDINLINE
+                float_X calcEnergyPerp() const
+                {
+                    // a is the numerator of the formula; x and y are helper terms of the denominator
+                    float_X const a = uSquared * parMomCosTheta * parMomSinTheta * parMomSinPhi * detectorCosTheta;
+
+                    // Denominator: x^2 - y^2, the two squared terms of the formula
+                    float_X const x
+                        = parSqrtOnePlusUSquared - particle.getU() * parMomSinTheta * parMomCosPhi * detectorSinTheta;
+                    float_X const y = particle.getU() * parMomCosTheta * detectorCosTheta;
+
+                    float_X denominator = x * x - y * y;
+
+                    // Preventing division by 0: clamp the magnitude to DIV_BY_ZERO_MINIMUM, keeping the sign
+                    if(math::abs(denominator) < DIV_BY_ZERO_MINIMUM)
+                    {
+                        if(denominator < 0.0)
+                            denominator = -DIV_BY_ZERO_MINIMUM;
+                        else
+                            denominator = DIV_BY_ZERO_MINIMUM;
+                    }
+
+                    return a / denominator;
+                }
+
+                /** Parallel part of normalized energy
+                 *
+                 * Calculates the part of the normalized energy parallel to the movement direction,
+                 * determined by the formula:
+                 * @f[E_{para} = (u \cos{\psi} (u \sin{\psi} \cos{\phi} - \sqrt{1 + u^2} \sin{\theta})) /
+                 *          ((\sqrt{1 + u^2} - u \sin{\psi} \cos{\phi} \sin{\theta})^2 - u^2 \cos{\psi}^2
+                 * \cos{\theta}^2)@f] where \psi is the azimuth angle of the particle momentum and \theta is the
+                 * azimuth angle of the detector position to the movement direction y; \phi is the polar angle
+                 * of the particle momentum minus the polar angle of the detector.
+                 * @return parallel part of normalized energy
+                 */
+                HDINLINE
+                float_X calcEnergyPara() const
+                {
+                    // a * (b - c) is the numerator of the formula; x and y are helper terms of the denominator
+                    float_X const a = particle.getU() * parMomCosTheta;
+                    float_X const b = particle.getU() * parMomSinTheta * parMomCosPhi;
+                    float_X const c = parSqrtOnePlusUSquared * detectorSinTheta;
+
+                    // Denominator: x^2 - y^2, the two squared terms of the formula
+                    float_X const x
+                        = parSqrtOnePlusUSquared - particle.getU() * parMomSinTheta * parMomCosPhi * detectorSinTheta;
+                    float_X const y = particle.getU() * parMomCosTheta * detectorCosTheta;
+
+                    float_X denominator = x * x - y * y;
+
+                    // Preventing division by 0: clamp the magnitude to DIV_BY_ZERO_MINIMUM, keeping the sign
+                    if(math::abs(denominator) < DIV_BY_ZERO_MINIMUM)
+                    {
+                        if(denominator < 0.0)
+                            denominator = -DIV_BY_ZERO_MINIMUM;
+                        else
+                            denominator = DIV_BY_ZERO_MINIMUM;
+                    }
+
+                    return a * (b - c) / denominator;
+                }
+
+                /** Exponent of form factor
+                 *
+                 * Calculates the exponent of the form factor divided by \omega.
+                 * It represents the phase of a single electron in the bunch, but it is mostly
+                 * calculated for performance reasons.
+                 * \f[ F_{exp} = - i z ( 1 / v - \sin{\theta} \sin{\psi} \cos{\phi_P - \phi_D} / c ) / \cos{\psi}
+                 *          - i \sin{\theta} \rho \cos{\phi_{pos} - \phi_D} \f]
+                 * @return imaginary exponent; the sentinel (-1, 0) if |\cos{\psi}| <= DIV_BY_ZERO_MINIMUM
+                 */
+                HDINLINE
+                complex_X calcFormFactorExponent() const
+                {
+                    // Sentinel (real part -1) when cos(psi) ~ 0 would blow up the division; calcFormFactor() maps it to 0
+                    if(math::abs(parMomCosTheta) <= DIV_BY_ZERO_MINIMUM)
+                        return complex_X(-1.0, 0.0);
+                    // parMomCosPhi caches cos(parMomPhi - detectorPhi) from the constructor; no need to recompute it
+                    float_X const a = detectorSinTheta * parMomSinTheta * parMomCosPhi;
+                    float_X const b
+                        = -(particle.getPosPara()) * (1 / particle.getVel() - a / SPEED_OF_LIGHT) / (parMomCosTheta);
+                    float_X const c
+                        = -detectorSinTheta * particle.getPosPerp() * math::cos(particle.getPosPhi() - detectorPhi);
+                    // b: parallel term (z = getPosPara), c: perpendicular term (rho = getPosPerp, phi_pos = getPosPhi)
+                    complex_X const fpara = complex_X(0.0, b);
+                    complex_X const fperp = complex_X(0.0, c);
+                    return fpara + fperp;
+                }
+            }; // class Calculator
+
+            /** Form factor
+             *
+             * Calculates the form factor of the electron bunch from the exponent calculated by the
+             * Calculator class.
+             *
+             * @f[F = \exp{ F_{exp} * \omega }@f]
+             *
+             * @param omega observed frequency
+             * @param exponent exponent from Calculator::calcFormFactorExponent(); real part -1 yields F = 0
+             */
+            HDINLINE
+            complex_X calcFormFactor(float_X const omega, complex_X const exponent)
             {
-                if( denominator < 0.0 )
-                    denominator = -DIV_BY_ZERO_MINIMUM;
-                else
-                    denominator = DIV_BY_ZERO_MINIMUM;
+                // branch-free select: the sentinel real part -1 (set by calcFormFactorExponent) gives 0
+                const bool longMovingParticle = exponent.get_real() == -1.0;
+                return float_X(longMovingParticle) * complex_X(0.0, 0.0)
+                    + float_X(!longMovingParticle) * complex_X(math::exp(exponent * omega));
             }
 
-            return a * ( b - c ) / denominator;
-        }
-
-        /** Exponent of form factor
-         *
-         * Calculates the exponent of the formfactor divided by \omega
-         * It represents the phase of a single electron in the bunch, but it is mostly
-         * calculated for performance reasons.
-         * \f[ F_exp = - i z ( 1 / v - \sin{\theta} \sin{\psi} \cos{\phi_P - \phi_D} / c ) / \cos{\phi}
-         *          - i \sin{\theta} \rho \cos{\phi_P - \phi_D} \f]
-         *
-         */
-        HDINLINE
-        complex_X
-        calcFormFactorExponent( ) const
-        {
-            // If case for longitudinal moving particles... leads to 0 later in the kernel
-            if ( math::abs( parMomCosTheta ) <= DIV_BY_ZERO_MINIMUM )
-                return complex_X( -1.0, 0.0 );
-
-            float_X const a = detectorSinTheta * parMomSinTheta * math::cos( parMomPhi - detectorPhi );
-            float_X const b = - ( particle.getPosPara( ) ) * ( 1 / particle.getVel( ) - a / SPEED_OF_LIGHT) / ( parMomCosTheta );
-            float_X const c = - detectorSinTheta * particle.getPosPerp( ) * math::cos( particle.getPosPhi( ) - detectorPhi );
-
-            complex_X const fpara = complex_X( 0.0, b );
-            complex_X const fperp = complex_X( 0.0, c );
-            return fpara + fperp;
-
-        }
-    }; // class Calculator
-
-    /** Formfactor
-     *
-     * Calculates of the electron bunch with the exponent calculated by the
-     * Calculator class.
-     *
-     * @f[F = \exp{ F_{exp} * \omega }@f]
-     *
-     * @param omega observed frequency
-     * @param exponent exponent of exponential function
-     */
-    HDINLINE
-    complex_X
-    calcFormFactor(
-        float_X const omega,
-        complex_X const exponent
-    )
-    {
-        // preventing division by 0
-        const bool longMovingParticle = exponent.get_real() == -1.0;
-        return float_X( longMovingParticle ) * complex_X( 0.0, 0.0 ) +
-            float_X( !longMovingParticle ) * complex_X(
-                math::exp(
-                    exponent * omega
-                )
-            );
-    }
-
-} // namespace transitionRadiation
-} // namespace plugins
+        } // namespace transitionRadiation
+    } // namespace plugins
 } // namespace picongpu
diff --git a/include/picongpu/plugins/transitionRadiation/ExecuteParticleFilter.hpp b/include/picongpu/plugins/transitionRadiation/ExecuteParticleFilter.hpp
index 7b967af8ce..1f7df26cd7 100644
--- a/include/picongpu/plugins/transitionRadiation/ExecuteParticleFilter.hpp
+++ b/include/picongpu/plugins/transitionRadiation/ExecuteParticleFilter.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2017-2020 Rene Widera, Finn-Ole Carstens
+/* Copyright 2017-2021 Rene Widera, Finn-Ole Carstens
  *
  * This file is part of PIConGPU.
  *
@@ -31,69 +31,65 @@
 
 namespace picongpu
 {
-namespace plugins
-{
-namespace transitionRadiation
-{
-
-    /** read the `transitionRadiationMask` of a species */
-    template< bool hasFilter >
-    struct ExecuteParticleFilter
+    namespace plugins
     {
-        /** get the attribute value of `transitionRadiationMask`
-         *
-         * @param species buffer
-         * @param currentStep current simulation time step
-         * @return value of the attribute `transitionRadiationMask`
-         */
-        template< typename T_Species >
-        void operator()( std::shared_ptr<T_Species> const &, const uint32_t currentStep )
+        namespace transitionRadiation
         {
-            particles::Manipulate<
-                picongpu::plugins::transitionRadiation::GammaFilter,
-                T_Species
-            >{ }( currentStep );
-        }
-    };
+            /** run the `GammaFilter` manipulator on a species (case: species has `transitionRadiationMask`) */
+            template<bool hasFilter>
+            struct ExecuteParticleFilter
+            {
+                /** invoke `particles::Manipulate<GammaFilter, T_Species>` for the given time step
+                 *
+                 * @tparam T_Species species type the manipulator is instantiated for
+                 * @param species buffer, not read here (unnamed parameter)
+                 * @param currentStep current simulation time step, forwarded to the manipulator
+                 */
+                template<typename T_Species>
+                void operator()(std::shared_ptr<T_Species> const&, const uint32_t currentStep)
+                {
+                    particles::Manipulate<picongpu::plugins::transitionRadiation::GammaFilter, T_Species>{}(
+                        currentStep);
+                }
+            };
 
-    /** specialization
-     *
-     * specialization for the case that the species does not have the attribute
-     * `transitionRadiationMask`
-     */
-    template< >
-    struct ExecuteParticleFilter< false >
-    {
-        /** get the attribute value of `transitionRadiationMask`
-         *
-         * @param particle to be used
-         * @return always true
-         */
-        template< typename T_Species >
-        void operator()( const std::shared_ptr<T_Species>, const uint32_t currentStep )
-        { }
-    };
+            /** specialization
+             *
+             * specialization for the case that the species does not have the attribute
+             * `transitionRadiationMask`
+             */
+            template<>
+            struct ExecuteParticleFilter<false>
+            {
+                /** intentionally empty: no filter is executed for such a species
+                 *
+                 * @param species buffer, ignored (unnamed parameter)
+                 * @param currentStep current simulation time step, ignored
+                 */
+                template<typename T_Species>
+                void operator()(const std::shared_ptr<T_Species>, const uint32_t currentStep)
+                {
+                }
+            };
 
-    /** execute the particle filter on a species
-     *
-     * It is **allowed** to call this function even if the species does not contain
-     * the attribute `transitionRadiationMask`.
-     * The filter is **not** executed if the species does not contain the attribute `transitionRadiationMask`.
-     *
-     * @tparam T_Species species type
-     * @param species species to be filtered
-     */
-    template< typename T_Species >
-    void executeParticleFilter( std::shared_ptr<T_Species>& species, const uint32_t currentStep )
-    {
-        constexpr bool hasRadiationFilter = pmacc::traits::HasIdentifier<
-            typename T_Species::FrameType,
-            transitionRadiationMask
-        >::type::value;
+            /** execute the particle filter on a species
+             *
+             * It is **allowed** to call this function even if the species does not contain the attribute
+             * `transitionRadiationMask`; in that case the filter is **not** executed (compile-time dispatch).
+             *
+             * @tparam T_Species species type
+             * @param species species to be filtered
+             * @param currentStep current simulation time step
+             */
+            template<typename T_Species>
+            void executeParticleFilter(std::shared_ptr<T_Species>& species, const uint32_t currentStep)
+            {
+                constexpr bool hasRadiationFilter = pmacc::traits::
+                    HasIdentifier<typename T_Species::FrameType, transitionRadiationMask>::type::value;
 
-        return ExecuteParticleFilter< hasRadiationFilter >{ }( species, currentStep );
-    }
+                return ExecuteParticleFilter<hasRadiationFilter>{}(species, currentStep);
+            }
 
-} // namespace transitionRadiation
-} // namespace plugins
+        } // namespace transitionRadiation
+    } // namespace plugins
 } // namespace picongpu
diff --git a/include/picongpu/plugins/transitionRadiation/GammaMask.hpp b/include/picongpu/plugins/transitionRadiation/GammaMask.hpp
index 06172d696c..c55b847fe8 100644
--- a/include/picongpu/plugins/transitionRadiation/GammaMask.hpp
+++ b/include/picongpu/plugins/transitionRadiation/GammaMask.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2017-2020 Rene Widera, Finn-Ole Carstens
+/* Copyright 2017-2021 Rene Widera, Finn-Ole Carstens
  *
  * This file is part of PIConGPU.
  *
@@ -26,65 +26,64 @@
 
 namespace picongpu
 {
-namespace plugins
-{
-namespace transitionRadiation
-{
-    /** read the `transitionRadiationMask` of a species */
-    template< bool hasTransitionRadiationMask >
-    struct GetTransitionRadiationMask
+    namespace plugins
     {
-        /** get the attribute value of `transitionRadiationMask`
-         *
-         * @param particle particle to be used
-         * @return value of the attribute `transitionRadiationMask`
-         */
-        template< typename T_Particle >
-        HDINLINE bool operator()( const T_Particle& particle ) const
+        namespace transitionRadiation
         {
-            return particle[ transitionRadiationMask_ ];
-        }
-    };
+            /** read the `transitionRadiationMask` attribute of a particle (case: the attribute exists) */
+            template<bool hasTransitionRadiationMask>
+            struct GetTransitionRadiationMask
+            {
+                /** get the attribute value of `transitionRadiationMask`
+                 *
+                 * @param particle particle whose `transitionRadiationMask_` entry is read
+                 * @return value of the attribute `transitionRadiationMask`, converted to bool
+                 */
+                template<typename T_Particle>
+                HDINLINE bool operator()(const T_Particle& particle) const
+                {
+                    return particle[transitionRadiationMask_];
+                }
+            };
 
-    /** specialization
-     *
-     * specialization for the case that the species not owns the attribute
-     * `transitionRadiationMask`
-     */
-    template< >
-    struct GetTransitionRadiationMask< false >
-    {
-        /** get the attribute value of `transitionRadiationMask`
-         *
-         * @param particle to be used
-         * @return always true
-         */
-        template< typename T_Particle >
-        HDINLINE bool operator()( const T_Particle& ) const
-        {
-            return true;
-        }
-    };
+            /** specialization
+             *
+             * specialization for the case that the species does not own the attribute
+             * `transitionRadiationMask`
+             */
+            template<>
+            struct GetTransitionRadiationMask<false>
+            {
+                /** stand-in for the missing attribute `transitionRadiationMask`
+                 *
+                 * @param particle ignored (unnamed parameter)
+                 * @return always true
+                 */
+                template<typename T_Particle>
+                HDINLINE bool operator()(const T_Particle&) const
+                {
+                    return true;
+                }
+            };
 
-    /** get the value of the particle attribute `transitionRadiationMask`
-     *
-     * Allow to read out the value of the attribute `transitionRadiationMask` also if
-     * it is not defined for the particle.
-     *
-     * @tparam T_Particle particle type
-     * @param particle valid particle
-     * @return particle attribute value `transitionRadiationMask`, always `true` if attribute `transitionRadiationMask` is not defined
-     */
-    template< typename T_Particle >
-    HDINLINE bool getTransitionRadiationMask( const T_Particle& particle )
-    {
-        constexpr bool hasTransitionRadiationMask = pmacc::traits::HasIdentifier<
-            typename T_Particle::FrameType,
-            transitionRadiationMask
-        >::type::value;
-        return GetTransitionRadiationMask< hasTransitionRadiationMask >{}( particle );
-    }
+            /** get the value of the particle attribute `transitionRadiationMask`
+             *
+             * Allows reading the value of the attribute `transitionRadiationMask` even if
+             * it is not defined for the particle (selected at compile time via `HasIdentifier`).
+             *
+             * @tparam T_Particle particle type
+             * @param particle valid particle
+             * @return particle attribute value `transitionRadiationMask`, always `true` if attribute
+             * `transitionRadiationMask` is not defined
+             */
+            template<typename T_Particle>
+            HDINLINE bool getTransitionRadiationMask(const T_Particle& particle)
+            {
+                constexpr bool hasTransitionRadiationMask = pmacc::traits::
+                    HasIdentifier<typename T_Particle::FrameType, transitionRadiationMask>::type::value;
+                return GetTransitionRadiationMask<hasTransitionRadiationMask>{}(particle);
+            }
 
-} // namespace transitionRadiation
-} // namespace plugins
+        } // namespace transitionRadiation
+    } // namespace plugins
 } // namespace picongpu
diff --git a/include/picongpu/plugins/transitionRadiation/Particle.hpp b/include/picongpu/plugins/transitionRadiation/Particle.hpp
index 14fad463dd..a26dd697a6 100644
--- a/include/picongpu/plugins/transitionRadiation/Particle.hpp
+++ b/include/picongpu/plugins/transitionRadiation/Particle.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera, Richard Pausch, Finn-Ole Carstens
+/* Copyright 2013-2021 Heiko Burau, Rene Widera, Richard Pausch, Finn-Ole Carstens
  *
  * This file is part of PIConGPU.
  *
@@ -21,164 +21,135 @@
 
 namespace picongpu
 {
-namespace plugins
-{
-namespace transitionRadiation
-{
-    /** Particle class for transition radiation calculation.
-     *
-     * @param locationSet global position of the macro-particle
-     * @param momentumSet momentum of macro-particle
-     * @param charge
-     */
-    class Particle
+    namespace plugins
     {
-    private:
-        float3_X const & momentum;
-        float_X const mass;
-        float3_X location;
-        float_X gamma;
-        float3_X beta;
-        float_X betaAbs;
-
-    public:
-        HDINLINE
-        Particle(
-            float3_X const & locationSet,
-            float3_X const & momentumSet,
-            float_X const massSet
-        ) :
-            location( locationSet ),
-            momentum( momentumSet ),
-            mass( massSet )
-        {
-            gamma = calcGamma( );
-            beta = calcBeta( );
-            betaAbs = math::sqrt( ( beta * beta ).sumOfComponents( ) );
-        }
-
-        //! @return momentum
-        HDINLINE
-        float3_X
-        getMomentum( ) const
-        {
-            return momentum;
-        }
-
-        //! @return normalized momentum
-        HDINLINE
-        float_X
-        getU( ) const
-        {
-            return gamma * betaAbs;
-        }
-
-        //! @return velocity v = beta * c
-        HDINLINE
-        float_X
-        getVel( ) const
-        {
-            return betaAbs * picongpu::SPEED_OF_LIGHT;
-        }
-
-        //! propagates the current window to the foil position
-        HDINLINE
-        void
-        propagate( const float_X & propagationDistance )
-        {
-            location += propagationDistance * beta;
-        }
-
-        //! @return polar angle phi of momentum
-        HDINLINE
-        float_X
-        getMomPhi( ) const
-        {
-            // add pi to atan2 function, because phi is in range from 0 to 2 pi
-            return picongpu::math::atan2(
-                momentum.x( ),
-                momentum.z( )
-            ) + picongpu::PI;
-        }
-
-        //! @return azimuth angle psi of momentum
-        HDINLINE
-        float_X
-        getMomTheta( ) const
-        {
-            //because of floating point precision x^2+y^2+z^2<y^2 for x,z<<y
-            float_X const momAbs = getMomAbs( );
-            if( momAbs * momAbs <= momentum.y( ) * momentum.y( ) )
-                return 0.0;
-            else
-                return picongpu::math::acos( momentum.y( ) * ( 1.0 / momAbs ) );
-        }
-
-        //! @return absolute value of momentum
-        HDINLINE
-        float_X
-        getMomAbs( ) const
-        {
-            return picongpu::math::sqrt(
-                momentum.x( ) * momentum.x( ) +
-                momentum.y( ) * momentum.y( ) +
-                momentum.z( ) * momentum.z( )
-            );
-        }
-
-        //! @return radial, perpendicular component of location in cylindrical coordinates
-        HDINLINE
-        float_X
-        getPosPerp( ) const
-        {
-            return picongpu::math::sqrt(
-                location.x( ) * location.x( ) +
-                location.z( ) * location.z( )
-            );
-        }
-
-        //! @return parallel component to y of location in cylindrical coordinates
-        HDINLINE
-        float_X
-        getPosPara( ) const
-        {
-            return location.y( );
-        }
-
-        //! @return polar angle of location in cylindrical coordinates
-        HDINLINE
-        float_X
-        getPosPhi( ) const
-        {
-            // add pi to atan2 function, because phi is in range from 0 to 2 pi
-            return picongpu::math::atan2(
-                location.x( ),
-                location.z( )
-            ) + picongpu::PI;
-        }
-
-    private:
-        // gamma has to be calculated before calling this function
-        //! @return beta=v/c
-        HDINLINE
-        float3_X
-        calcBeta( ) const
-        {
-            return momentum * (1.0 / (mass * picongpu::SPEED_OF_LIGHT * gamma));
-        }
-
-        //! @return gamma = E/(mc^2)
-        HDINLINE
-        float_X
-        calcGamma( ) const
+        namespace transitionRadiation
         {
-            float_X const massTimesC = mass * picongpu::SPEED_OF_LIGHT;
-            return picongpu::math::sqrt(
-                1.0 + ( momentum * momentum ).sumOfComponents( ) / ( massTimesC * massTimesC )
-            );
-        }
-    }; // class Particle
-
-} // namespace transitionRadiation
-} // namespace plugins
+            /** Particle class for transition radiation calculation.
+             *
+             * @param locationSet global position of the macro-particle
+             * @param momentumSet momentum of macro-particle
+             * @param massSet mass of the macro-particle
+             */
+            class Particle
+            {
+            private:
+                float3_X const momentum; // copy: a reference member could dangle on a temporary
+                float_X const mass;
+                float3_X location;
+                float_X gamma;
+                float3_X beta;
+                float_X betaAbs;
+
+            public:
+                HDINLINE
+                Particle(float3_X const& locationSet, float3_X const& momentumSet, float_X const massSet)
+                    : momentum(momentumSet)
+                    , mass(massSet)
+                    , location(locationSet)
+                {
+                    gamma = calcGamma();
+                    beta = calcBeta();
+                    betaAbs = math::sqrt((beta * beta).sumOfComponents());
+                }
+
+                //! @return momentum as passed to the constructor
+                HDINLINE
+                float3_X getMomentum() const
+                {
+                    return momentum;
+                }
+
+                //! @return normalized momentum u = gamma * |beta|
+                HDINLINE
+                float_X getU() const
+                {
+                    return gamma * betaAbs;
+                }
+
+                //! @return velocity v = beta * c
+                HDINLINE
+                float_X getVel() const
+                {
+                    return betaAbs * picongpu::SPEED_OF_LIGHT;
+                }
+
+                //! moves the location by propagationDistance * beta (propagation to the foil position)
+                HDINLINE
+                void propagate(const float_X& propagationDistance)
+                {
+                    location += propagationDistance * beta;
+                }
+
+                //! @return polar angle phi of momentum, computed as atan2(x, z) + pi
+                HDINLINE
+                float_X getMomPhi() const
+                {
+                    // add pi to atan2 function, because phi is in range from 0 to 2 pi
+                    return picongpu::math::atan2(momentum.x(), momentum.z()) + picongpu::PI;
+                }
+
+                //! @return azimuth angle psi of momentum (angle to the y axis)
+                HDINLINE
+                float_X getMomTheta() const
+                {
+                    // rounding may give |p|^2 <= y^2 for |x|,|z| << |y|; NOTE(review): -y momenta also return 0 here
+                    float_X const momAbs = getMomAbs();
+                    if(momAbs * momAbs <= momentum.y() * momentum.y())
+                        return 0.0;
+                    else
+                        return picongpu::math::acos(momentum.y() * (1.0 / momAbs));
+                }
+
+                //! @return absolute value of momentum
+                HDINLINE
+                float_X getMomAbs() const
+                {
+                    return picongpu::math::sqrt(
+                        momentum.x() * momentum.x() + momentum.y() * momentum.y() + momentum.z() * momentum.z());
+                }
+
+                //! @return radial, perpendicular component of location in cylindrical coordinates
+                HDINLINE
+                float_X getPosPerp() const
+                {
+                    return picongpu::math::sqrt(location.x() * location.x() + location.z() * location.z());
+                }
+
+                //! @return parallel component to y of location in cylindrical coordinates
+                HDINLINE
+                float_X getPosPara() const
+                {
+                    return location.y();
+                }
+
+                //! @return polar angle of location in cylindrical coordinates, computed as atan2(x, z) + pi
+                HDINLINE
+                float_X getPosPhi() const
+                {
+                    // add pi to atan2 function, because phi is in range from 0 to 2 pi
+                    return picongpu::math::atan2(location.x(), location.z()) + picongpu::PI;
+                }
+
+            private:
+                // gamma has to be calculated before calling this function
+                //! @return beta = v/c = momentum / (mass * c * gamma)
+                HDINLINE
+                float3_X calcBeta() const
+                {
+                    return momentum * (1.0 / (mass * picongpu::SPEED_OF_LIGHT * gamma));
+                }
+
+                //! @return gamma = E/(mc^2) = sqrt(1 + |momentum|^2 / (mass * c)^2)
+                HDINLINE
+                float_X calcGamma() const
+                {
+                    float_X const massTimesC = mass * picongpu::SPEED_OF_LIGHT;
+                    return picongpu::math::sqrt(
+                        1.0 + (momentum * momentum).sumOfComponents() / (massTimesC * massTimesC));
+                }
+            }; // class Particle
+
+        } // namespace transitionRadiation
+    } // namespace plugins
 } // namespace picongpu
diff --git a/include/picongpu/plugins/transitionRadiation/TransitionRadiation.hpp b/include/picongpu/plugins/transitionRadiation/TransitionRadiation.hpp
index 13fc0d32f2..bd52ec211d 100644
--- a/include/picongpu/plugins/transitionRadiation/TransitionRadiation.hpp
+++ b/include/picongpu/plugins/transitionRadiation/TransitionRadiation.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Heiko Burau, Rene Widera, Richard Pausch,
+/* Copyright 2013-2021 Axel Huebl, Heiko Burau, Rene Widera, Richard Pausch,
  *                     Klaus Steiniger, Felix Schmitt, Benjamin Worpitz
  *                     Finn-Ole Carstens
  *
@@ -58,599 +58,521 @@
 
 namespace picongpu
 {
-namespace plugins
-{
-namespace transitionRadiation
-{
-    using namespace pmacc;
-
-    namespace po = boost::program_options;
-    using complex_X = pmacc::math::Complex< float_X >;
-
-    /** Implementation of transition radiation for in situ calculation in PIConGPU
-     *
-     * The transition radiation implemented in this plugin is based on
-     * C. B. Schroeder, E. Esarey, J. van Tilborg, and W. P. Leemans:
-     * Theory of coherent transition radiation generated at a plasma-vacuum interface
-     * (DOI:https://doi.org/10.1103/PhysRevE.69.016501)
-     *
-     * Transition radiation is created by charged particles moving through an
-     * interface where one medium has a different diffraction index as the other
-     * medium. Since it is mostly used to analyze electron bunches, this plugin
-     * assumes that the analyzed particles have the mass and charge of electrons.
-     *
-     * @tparam T_ParticlesType particle type to compute transition radiation from
-     */
-    template<
-        typename T_ParticlesType
-    >
-    class TransitionRadiation : public ILightweightPlugin
+    namespace plugins
     {
-    private:
-
-        using SuperCellSize = MappingDesc::SuperCellSize;
-
-        using radLog = plugins::radiation::PIConGPUVerboseRadiation;
-
-        GridBuffer< float_X, DIM1 > * incTransRad = nullptr;
-        GridBuffer< complex_X, DIM1 > * cohTransRadPara = nullptr;
-        GridBuffer< complex_X, DIM1 > * cohTransRadPerp = nullptr;
-        GridBuffer< float_X, DIM1 > * numParticles = nullptr;
-
-        transitionRadiation::frequencies::InitFreqFunctor freqInit;
-        transitionRadiation::frequencies::FreqFunctor freqFkt;
-
-        float_X * tmpITR = nullptr;
-        complex_X * tmpCTRpara = nullptr;
-        complex_X * tmpCTRperp = nullptr;
-        float_X * tmpNum = nullptr;
-        float_X * theTransRad = nullptr;
-        MappingDesc * cellDescription = nullptr;
-        std::string notifyPeriod;
-        uint32_t timeStep;
-
-        std::string speciesName;
-        std::string pluginName;
-        std::string pluginPrefix;
-        std::string filenamePrefix;
-        std::string folderTransRad;
-
-        float3_X * detectorPositions = nullptr;
-        float_X * detectorFrequencies = nullptr;
-
-        bool isMaster = false;
-        uint32_t currentStep = 0;
-
-        mpi::MPIReduce reduce;
-
-    public:
-        //! Constructor
-        TransitionRadiation( ) :
-            pluginName( "TransitionRadiation: calculate transition radiation of species" ),
-            speciesName( T_ParticlesType::FrameType::getName( ) ),
-            pluginPrefix( speciesName + std::string( "_transRad" ) ),
-            folderTransRad( "transRad" ),
-            filenamePrefix( pluginPrefix )
-        {
-            Environment< >::get( ).PluginConnector( ).registerPlugin( this );
-        }
-
-        virtual
-        ~TransitionRadiation( )
-        { }
-
-        /** Plugin management
-         *
-         * Implementation of base class function. Calculates the transition radiation
-         * by calling the according function of the kernel file, writes data to a
-         * file and resets the buffers if transition radiation is calculated for
-         * multiple timesteps.
-         *
-         * @param currentStep current step of simulation
-         */
-        void
-        notify(
-            uint32_t currentStep
-        )
+        namespace transitionRadiation
         {
-            log< radLog::SIMULATION_STATE >( "Transition Radition (%1%): calculate time step %2% " ) % speciesName % currentStep;
+            using namespace pmacc;
+
+            namespace po = boost::program_options;
+            using complex_X = pmacc::math::Complex<float_X>;
+
+            /** Implementation of transition radiation for in situ calculation in PIConGPU
+             *
+             * The transition radiation implemented in this plugin is based on
+             * C. B. Schroeder, E. Esarey, J. van Tilborg, and W. P. Leemans:
+             * Theory of coherent transition radiation generated at a plasma-vacuum interface
+             * (DOI:https://doi.org/10.1103/PhysRevE.69.016501)
+             *
+             * Transition radiation is created by charged particles moving through an
+             * interface where one medium has a different refractive index than the other
+             * medium. Since it is mostly used to analyze electron bunches, this plugin
+             * assumes that the analyzed particles have the mass and charge of electrons.
+             *
+             * @tparam T_ParticlesType particle type to compute transition radiation from
+             */
+            template<typename T_ParticlesType>
+            class TransitionRadiation : public ILightweightPlugin
+            {
+            private:
+                using SuperCellSize = MappingDesc::SuperCellSize;
+
+                using radLog = plugins::radiation::PIConGPUVerboseRadiation;
+
+                GridBuffer<float_X, DIM1>* incTransRad = nullptr;
+                GridBuffer<complex_X, DIM1>* cohTransRadPara = nullptr;
+                GridBuffer<complex_X, DIM1>* cohTransRadPerp = nullptr;
+                GridBuffer<float_X, DIM1>* numParticles = nullptr;
+
+                transitionRadiation::frequencies::InitFreqFunctor freqInit;
+                transitionRadiation::frequencies::FreqFunctor freqFkt;
+
+                float_X* tmpITR = nullptr;
+                complex_X* tmpCTRpara = nullptr;
+                complex_X* tmpCTRperp = nullptr;
+                float_X* tmpNum = nullptr;
+                float_X* theTransRad = nullptr;
+                MappingDesc* cellDescription = nullptr;
+                std::string notifyPeriod;
+                uint32_t timeStep;
+
+                std::string speciesName;
+                std::string pluginName;
+                std::string pluginPrefix;
+                std::string filenamePrefix;
+                std::string folderTransRad;
+
+                float3_X* detectorPositions = nullptr;
+                float_X* detectorFrequencies = nullptr;
+
+                bool isMaster = false;
+                uint32_t currentStep = 0;
+
+                mpi::MPIReduce reduce;
+
+            public:
+                //! Constructor
+                TransitionRadiation()
+                    : pluginName("TransitionRadiation: calculate transition radiation of species")
+                    , speciesName(T_ParticlesType::FrameType::getName())
+                    , pluginPrefix(speciesName + std::string("_transRad"))
+                    , folderTransRad("transRad")
+                    , filenamePrefix(pluginPrefix)
+                {
+                    Environment<>::get().PluginConnector().registerPlugin(this);
+                }
 
-            resetBuffers( );
-            this->currentStep = currentStep;
+                virtual ~TransitionRadiation()
+                {
+                }
 
-            calculateTransitionRadiation( currentStep );
+                /** Plugin management
+                 *
+                 * Implementation of base class function. Calculates the transition radiation
+                 * by calling the corresponding function of the kernel file, writes data to a
+                 * file and resets the buffers if transition radiation is calculated for
+                 * multiple timesteps.
+                 *
+                 * @param currentStep current step of simulation
+                 */
+                void notify(uint32_t currentStep)
+                {
+                    log<radLog::SIMULATION_STATE>("Transition Radition (%1%): calculate time step %2% ") % speciesName
+                        % currentStep;
 
-            log< radLog::SIMULATION_STATE >( "Transition Radition (%1%): finished time step %2% " ) % speciesName % currentStep;
+                    resetBuffers();
+                    this->currentStep = currentStep;
 
-            collectDataGPUToMaster( );
-            writeTransRadToText( );
+                    calculateTransitionRadiation(currentStep);
 
-            log< radLog::SIMULATION_STATE >( "Transition Radition (%1%): printed to table %2% " ) % speciesName % currentStep;
-        }
+                    log<radLog::SIMULATION_STATE>("Transition Radition (%1%): finished time step %2% ") % speciesName
+                        % currentStep;
 
-        /** Implementation of base class function. Registers plugin options.
-         *
-         * @param desc boost::program_options description
-         */
-        void
-        pluginRegisterHelp(
-            po::options_description& desc
-        )
-        {
-            desc.add_options( )(
-                ( pluginPrefix + ".period" ).c_str( ),
-                po::value< std::string >( &notifyPeriod ),
-                "enable plugin [for each n-th step]"
-            );
-        }
-
-        /** Implementation of base class function.
-         *
-         * @return name of plugin
-         */
-        std::string
-        pluginGetName( ) const
-        {
-            return pluginName;
-        }
-
-        /** Implementation of base class function. Sets mapping description.
-         *
-         * @param cellDescription
-         */
-        void
-        setMappingDescription(
-            MappingDesc *cellDescription
-        )
-        {
-            this->cellDescription = cellDescription;
-        }
+                    collectDataGPUToMaster();
+                    writeTransRadToText();
 
-    private:
-        //! Resets buffers for multiple transition radiation calculation per simulation.
-        void
-        resetBuffers ( )
-        {
-            /* Resets all Databuffers and arrays for repeated calculation of the
-            * transition radiation
-            */
-            incTransRad->getDeviceBuffer( ).reset( false );
-            cohTransRadPara->getDeviceBuffer( ).reset( false );
-            cohTransRadPerp->getDeviceBuffer( ).reset( false );
-            numParticles->getDeviceBuffer( ).reset( false );
-
-            for( unsigned int i=0; i < elementsTransitionRadiation( ); ++i )
-            {
-                tmpITR[ i ] = 0;
-                tmpCTRpara[ i ] = 0;
-                tmpCTRperp[ i ] = 0;
-                tmpNum[ i ] = 0;
-                if( isMaster )
+                    log<radLog::SIMULATION_STATE>("Transition Radition (%1%): printed to table %2% ") % speciesName
+                        % currentStep;
+                }
+
+                /** Implementation of base class function. Registers plugin options.
+                 *
+                 * @param desc boost::program_options description
+                 */
+                void pluginRegisterHelp(po::options_description& desc)
                 {
-                    theTransRad[ i ] = 0;
+                    desc.add_options()(
+                        (pluginPrefix + ".period").c_str(),
+                        po::value<std::string>(&notifyPeriod),
+                        "enable plugin [for each n-th step]");
                 }
-            }
-        }
-
-        /** Create buffers and arrays
-         *
-         * Implementation of base class function. Create buffers and arrays for
-         * transition radiation calculation and create a folder for transition
-         * radiation storage.
-         */
-        void
-        pluginLoad( )
-        {
-            if( !notifyPeriod.empty( ) )
-            {
-                tmpITR = new float_X[ elementsTransitionRadiation( ) ];
-                tmpCTRpara = new complex_X[ elementsTransitionRadiation( ) ];
-                tmpCTRperp = new complex_X[ elementsTransitionRadiation( ) ];
-                tmpNum = new float_X[ elementsTransitionRadiation( ) ];
-
-                /*only rank 0 create a file*/
-                isMaster = reduce.hasResult( mpi::reduceMethods::Reduce( ) );
-                pmacc::Filesystem<simDim>& fs = Environment<simDim>::get( ).Filesystem( );
-
-                Environment<>::get( ).PluginConnector( ).setNotificationPeriod( this, notifyPeriod );
-
-                incTransRad = new GridBuffer< float_X, DIM1 >(
-                    DataSpace< DIM1 > ( elementsTransitionRadiation( ) ) );
-                cohTransRadPara = new GridBuffer< complex_X, DIM1 >(
-                    DataSpace< DIM1 > ( elementsTransitionRadiation( ) ) );
-                cohTransRadPerp = new GridBuffer< complex_X, DIM1 >(
-                    DataSpace< DIM1 > ( elementsTransitionRadiation( ) ) );
-                numParticles = new GridBuffer< float_X, DIM1 >(
-                    DataSpace< DIM1 > ( elementsTransitionRadiation( ) ) );
-
-                freqInit.Init( listFrequencies::listLocation );
-                freqFkt = freqInit.getFunctor( );
-
-                if ( isMaster )
+
+                /** Implementation of base class function.
+                 *
+                 * @return name of plugin
+                 */
+                std::string pluginGetName() const
                 {
-                    theTransRad = new float_X[ elementsTransitionRadiation( ) ];
-                    /* save detector position / observation direction */
-                    detectorPositions = new float3_X[ transitionRadiation::parameters::nObserver ];
-                    for(
-                        uint32_t detectorIndex=0;
-                        detectorIndex < transitionRadiation::parameters::nObserver;
-                        ++detectorIndex
-                    )
+                    return pluginName;
+                }
+
+                /** Implementation of base class function. Sets mapping description.
+                 *
+                 * @param cellDescription
+                 */
+                void setMappingDescription(MappingDesc* cellDescription)
+                {
+                    this->cellDescription = cellDescription;
+                }
+
+            private:
+                //! Resets buffers for multiple transition radiation calculation per simulation.
+                void resetBuffers()
+                {
+                    /* Resets all Databuffers and arrays for repeated calculation of the
+                     * transition radiation
+                     */
+                    incTransRad->getDeviceBuffer().reset(false);
+                    cohTransRadPara->getDeviceBuffer().reset(false);
+                    cohTransRadPerp->getDeviceBuffer().reset(false);
+                    numParticles->getDeviceBuffer().reset(false);
+
+                    for(unsigned int i = 0; i < elementsTransitionRadiation(); ++i)
                     {
-                        detectorPositions[ detectorIndex ] = transitionRadiation::observationDirection( detectorIndex );
+                        tmpITR[i] = 0;
+                        tmpCTRpara[i] = 0;
+                        tmpCTRperp[i] = 0;
+                        tmpNum[i] = 0;
+                        if(isMaster)
+                        {
+                            theTransRad[i] = 0;
+                        }
                     }
+                }
 
-                    /* save detector frequencies */
-                    detectorFrequencies = new float_X[ transitionRadiation::frequencies::nOmega ];
-                    for(
-                        uint32_t detectorIndex=0;
-                        detectorIndex < transitionRadiation::frequencies::nOmega;
-                        ++detectorIndex
-                    )
+                /** Create buffers and arrays
+                 *
+                 * Implementation of base class function. Create buffers and arrays for
+                 * transition radiation calculation and create a folder for transition
+                 * radiation storage.
+                 */
+                void pluginLoad()
+                {
+                    if(!notifyPeriod.empty())
                     {
-                        detectorFrequencies[ detectorIndex ] = freqFkt.get( detectorIndex );
+                        tmpITR = new float_X[elementsTransitionRadiation()];
+                        tmpCTRpara = new complex_X[elementsTransitionRadiation()];
+                        tmpCTRperp = new complex_X[elementsTransitionRadiation()];
+                        tmpNum = new float_X[elementsTransitionRadiation()];
+
+                        /* only rank 0 creates a file */
+                        isMaster = reduce.hasResult(mpi::reduceMethods::Reduce());
+                        pmacc::Filesystem<simDim>& fs = Environment<simDim>::get().Filesystem();
+
+                        Environment<>::get().PluginConnector().setNotificationPeriod(this, notifyPeriod);
+
+                        incTransRad = new GridBuffer<float_X, DIM1>(DataSpace<DIM1>(elementsTransitionRadiation()));
+                        cohTransRadPara
+                            = new GridBuffer<complex_X, DIM1>(DataSpace<DIM1>(elementsTransitionRadiation()));
+                        cohTransRadPerp
+                            = new GridBuffer<complex_X, DIM1>(DataSpace<DIM1>(elementsTransitionRadiation()));
+                        numParticles = new GridBuffer<float_X, DIM1>(DataSpace<DIM1>(elementsTransitionRadiation()));
+
+                        freqInit.Init(listFrequencies::listLocation);
+                        freqFkt = freqInit.getFunctor();
+
+                        if(isMaster)
+                        {
+                            theTransRad = new float_X[elementsTransitionRadiation()];
+                            /* save detector position / observation direction */
+                            detectorPositions = new float3_X[transitionRadiation::parameters::nObserver];
+                            for(uint32_t detectorIndex = 0; detectorIndex < transitionRadiation::parameters::nObserver;
+                                ++detectorIndex)
+                            {
+                                detectorPositions[detectorIndex]
+                                    = transitionRadiation::observationDirection(detectorIndex);
+                            }
+
+                            /* save detector frequencies */
+                            detectorFrequencies = new float_X[transitionRadiation::frequencies::nOmega];
+                            for(uint32_t detectorIndex = 0; detectorIndex < transitionRadiation::frequencies::nOmega;
+                                ++detectorIndex)
+                            {
+                                detectorFrequencies[detectorIndex] = freqFkt.get(detectorIndex);
+                            }
+
+                            for(unsigned int i = 0; i < elementsTransitionRadiation(); ++i)
+                            {
+                                theTransRad[i] = 0;
+                            }
+
+                            fs.createDirectory(folderTransRad);
+                            fs.setDirectoryPermissions(folderTransRad);
+                        }
                     }
+                }
 
-                    for ( unsigned int i=0; i< elementsTransitionRadiation( ); ++i )
+                //! Implementation of base class function. Deletes buffers and arrays.
+                void pluginUnload()
+                {
+                    if(!notifyPeriod.empty())
                     {
-                        theTransRad[ i ] = 0;
+                        if(isMaster)
+                        {
+                            __deleteArray(theTransRad);
+                        }
+                        CUDA_CHECK(cuplaGetLastError());
+                        __delete(incTransRad);
+                        __delete(cohTransRadPara);
+                        __delete(cohTransRadPerp);
+                        __delete(numParticles);
+                        __deleteArray(tmpITR);
+                        __deleteArray(tmpCTRpara);
+                        __deleteArray(tmpCTRperp);
+                        __deleteArray(tmpNum);
                     }
+                }
 
-                    fs.createDirectory( folderTransRad );
-                    fs.setDirectoryPermissions( folderTransRad );
+                //! Moves transition radiation data from GPUs to CPUs.
+                void copyRadiationDeviceToHost()
+                {
+                    incTransRad->deviceToHost();
+                    __getTransactionEvent().waitForFinished();
+                    cohTransRadPara->deviceToHost();
+                    __getTransactionEvent().waitForFinished();
+                    cohTransRadPerp->deviceToHost();
+                    __getTransactionEvent().waitForFinished();
+                    numParticles->deviceToHost();
+                    __getTransactionEvent().waitForFinished();
                 }
-            }
-        }
 
-        //! Implementation of base class function. Deletes buffers andf arrays.
-        void
-        pluginUnload( )
-        {
-            if( !notifyPeriod.empty( ) )
-            {
-                if( isMaster )
+                /** Number of transition radiation values
+                 *
+                 * Calculates the number of distinct transition radiation values
+                 * (nOmega * nObserver) that have to be computed.
+                 *
+                 * @return number of transition radiation values to be calculated
+                 */
+                static unsigned int elementsTransitionRadiation()
                 {
-                    __deleteArray( theTransRad );
+                    return transitionRadiation::frequencies::nOmega
+                        * transitionRadiation::parameters::nObserver; // storage for amplitude results on GPU
                 }
-                CUDA_CHECK( cudaGetLastError( ) );
-                __delete( incTransRad );
-                __delete( cohTransRadPara );
-                __delete( cohTransRadPerp );
-                __delete( numParticles );
-                __deleteArray( tmpITR );
-                __deleteArray( tmpCTRpara );
-                __deleteArray( tmpCTRperp );
-                __deleteArray( tmpNum );
-            }
-        }
-
-        //! Moves transition radiation data from GPUs to CPUs.
-        void
-        copyRadiationDeviceToHost( )
-        {
-            incTransRad->deviceToHost( );
-            __getTransactionEvent( ).waitForFinished( );
-            cohTransRadPara->deviceToHost( );
-            __getTransactionEvent( ).waitForFinished( );
-            cohTransRadPerp->deviceToHost( );
-            __getTransactionEvent( ).waitForFinished( );
-            numParticles->deviceToHost( );
-            __getTransactionEvent( ).waitForFinished( );
-        }
-
-        /** Amount of transition radiation values
-         *
-         * Calculates amount of different transition radiation values, which
-         * have to be computed.
-         *
-         * @return amount of transition radiation values to be calculated
-         */
-        static
-        unsigned int
-        elementsTransitionRadiation( )
-        {
-            return transitionRadiation::frequencies::nOmega * transitionRadiation::parameters::nObserver; // storage for amplitude results on GPU
-        }
-
-        /** Combine transition radiation data from each CPU and store result on master.
-         *
-         * @remark copyRadiationDeviceToHost( ) should be called before.
-         */
-        void
-        collectRadiationOnMaster( )
-        {
-            reduce(
-                nvidia::functors::Add( ),
-                tmpITR,
-                incTransRad->getHostBuffer( ).getBasePointer( ),
-                elementsTransitionRadiation( ),
-                mpi::reduceMethods::Reduce( )
-            );
-            reduce(
-                nvidia::functors::Add( ),
-                tmpCTRpara,
-                cohTransRadPara->getHostBuffer( ).getBasePointer( ),
-                elementsTransitionRadiation( ),
-                mpi::reduceMethods::Reduce( )
-            );
-            reduce(
-                nvidia::functors::Add( ),
-                tmpCTRperp,
-                cohTransRadPerp->getHostBuffer( ).getBasePointer( ),
-                elementsTransitionRadiation( ),
-                mpi::reduceMethods::Reduce( )
-            );
-            reduce(
-                nvidia::functors::Add( ),
-                tmpNum,
-                numParticles->getHostBuffer( ).getBasePointer( ),
-                elementsTransitionRadiation( ),
-                mpi::reduceMethods::Reduce( )
-            );
-        }
-
-        //! Write transition radiation data to file.
-        void
-        writeTransRadToText( )
-        {
-            // only the master rank writes data
-            if (isMaster)
-            {
-                // get time step as string
-                std::stringstream o_step;
-                o_step << currentStep;
 
-                // write totalRad data to txt
-                writeFile(theTransRad, folderTransRad + "/" + filenamePrefix + "_" + o_step.str( ) + ".dat");
-            }
-        }
+                /** Combine transition radiation data from each CPU and store result on master.
+                 *
+                 * @remark copyRadiationDeviceToHost( ) should be called before.
+                 */
+                void collectRadiationOnMaster()
+                {
+                    reduce(
+                        nvidia::functors::Add(),
+                        tmpITR,
+                        incTransRad->getHostBuffer().getBasePointer(),
+                        elementsTransitionRadiation(),
+                        mpi::reduceMethods::Reduce());
+                    reduce(
+                        nvidia::functors::Add(),
+                        tmpCTRpara,
+                        cohTransRadPara->getHostBuffer().getBasePointer(),
+                        elementsTransitionRadiation(),
+                        mpi::reduceMethods::Reduce());
+                    reduce(
+                        nvidia::functors::Add(),
+                        tmpCTRperp,
+                        cohTransRadPerp->getHostBuffer().getBasePointer(),
+                        elementsTransitionRadiation(),
+                        mpi::reduceMethods::Reduce());
+                    reduce(
+                        nvidia::functors::Add(),
+                        tmpNum,
+                        numParticles->getHostBuffer().getBasePointer(),
+                        elementsTransitionRadiation(),
+                        mpi::reduceMethods::Reduce());
+                }
+
+                //! Write transition radiation data to file.
+                void writeTransRadToText()
+                {
+                    // only the master rank writes data
+                    if(isMaster)
+                    {
+                        // get time step as string
+                        std::stringstream o_step;
+                        o_step << currentStep;
+
+                        // write totalRad data to txt
+                        writeFile(theTransRad, folderTransRad + "/" + filenamePrefix + "_" + o_step.str() + ".dat");
+                    }
+                }
 
 
-        //! perform all operations to get data from GPU to master
-        void
-        collectDataGPUToMaster( )
-        {
-            // collect data GPU -> CPU -> Master
-            copyRadiationDeviceToHost( );
-            collectRadiationOnMaster( );
-            sumTransitionRadiation( theTransRad, tmpITR, tmpCTRpara, tmpCTRperp, tmpNum );
-        }
-
-        /** Final transition radiation calculation on CPU side
-         *
-         * Calculate transition radiation integrals. This can't happen on the GPU
-         * since the absolute square of a sum can't be moved within a sum.
-         *
-         * @param targetArray array to store transition radiation in
-         * @param itrArray array of calculated incoherent transition radiation
-         * @param ctrParaArray array of complex values of the parallel part of the coherent transition radiation
-         * @param ctrPerpArray array of complex values of the perpendicular part of coherent transition radiation
-         * @param numArray array of amount of particles
-         */
-        void
-        sumTransitionRadiation(
-            float_X * targetArray,
-            float_X * itrArray,
-            complex_X * ctrParaArray,
-            complex_X * ctrPerpArray,
-            float_X * numArray
-        )
-        {
-            if (isMaster)
-            {
-                /************************************************************
-                 ******** Here happens the true physical calculation ********
-                ************************************************************/
-                for( unsigned int i = 0; i < elementsTransitionRadiation( ); ++i )
+                //! perform all operations to get data from GPU to master
+                void collectDataGPUToMaster()
                 {
-                    const float_X ctrPara = math::abs2( ctrParaArray[ i ] );
-                    const float_X ctrPerp = math::abs2( ctrPerpArray[ i ] );
-                    if (numArray[i] != 0.0)
+                    // collect data GPU -> CPU -> Master
+                    copyRadiationDeviceToHost();
+                    collectRadiationOnMaster();
+                    sumTransitionRadiation(theTransRad, tmpITR, tmpCTRpara, tmpCTRperp, tmpNum);
+                }
+
+                /** Final transition radiation calculation on CPU side
+                 *
+                 * Calculate transition radiation integrals. This can't happen on the GPU
+                 * since the absolute square of a sum can't be moved within a sum.
+                 *
+                 * @param targetArray array to store transition radiation in
+                 * @param itrArray array of calculated incoherent transition radiation
+                 * @param ctrParaArray array of complex values of the parallel part of the coherent transition
+                 * radiation
+                 * @param ctrPerpArray array of complex values of the perpendicular part of coherent transition
+                 * radiation
+                 * @param numArray array of amount of particles
+                 */
+                void sumTransitionRadiation(
+                    float_X* targetArray,
+                    float_X* itrArray,
+                    complex_X* ctrParaArray,
+                    complex_X* ctrPerpArray,
+                    float_X* numArray)
+                {
+                    if(isMaster)
                     {
-                        targetArray[ i ] = (
-                            itrArray[ i ] + ( numArray[ i ] - 1.0 ) * ( ctrPara + ctrPerp ) / numArray[i]
-                        );
+                        /************************************************************
+                         ******** Here happens the true physical calculation ********
+                         ************************************************************/
+                        for(unsigned int i = 0; i < elementsTransitionRadiation(); ++i)
+                        {
+                            const float_X ctrPara = pmacc::math::abs2(ctrParaArray[i]);
+                            const float_X ctrPerp = pmacc::math::abs2(ctrPerpArray[i]);
+                            if(numArray[i] != 0.0)
+                            {
+                                targetArray[i]
+                                    = (itrArray[i] + (numArray[i] - 1.0) * (ctrPara + ctrPerp) / numArray[i]);
+                            }
+                            else
+                                targetArray[i] = 0.0;
+                        }
                     }
-                    else
-                        targetArray[ i ] = 0.0;
                 }
-            }
-        }
-
-        /** Writes file with transition radiation data with the right units.
-         *
-         * @param values transition radiation values
-         * @param name name of file
-         */
-        void
-        writeFile(
-            float_X * values,
-            std::string name
-        )
-        {
-            std::ofstream outFile;
-            outFile.open(
-                name.c_str( ),
-                std::ofstream::out | std::ostream::trunc
-            );
-            if ( !outFile )
-            {
-                std::cerr << "Can't open file [" << name << "] for output, disable plugin output. " << std::endl;
-                isMaster = false; // no Master anymore -> no process is able to write
-            }
-            else
-            {
-                outFile << "# \t";
-                outFile << transitionRadiation::frequencies::getParameters( );
-                outFile << transitionRadiation::parameters::nPhi << "\t";
-                outFile << transitionRadiation::parameters::phiMin << "\t";
-                outFile << transitionRadiation::parameters::phiMax << "\t";
-                outFile << transitionRadiation::parameters::nTheta << "\t";
-                outFile << transitionRadiation::parameters::thetaMin << "\t";
-                outFile << transitionRadiation::parameters::thetaMax << "\t";
-                outFile << std::endl;
-
-                for (
-                    unsigned int index_direction = 0;
-                    index_direction < transitionRadiation::parameters::nObserver;
-                    ++index_direction
-                ) // over all directions
+
+                /** Writes file with transition radiation data with the right units.
+                 *
+                 * @param values transition radiation values
+                 * @param name name of file
+                 */
+                void writeFile(float_X* values, std::string name)
                 {
-                    for (
-                        unsigned index_omega = 0;
-                        index_omega < transitionRadiation::frequencies::nOmega;
-                        ++index_omega
-                    ) // over all frequencies
+                    std::ofstream outFile;
+                    outFile.open(name.c_str(), std::ofstream::out | std::ostream::trunc);
+                    if(!outFile)
                     {
-                        // Take Amplitude for one direction and frequency,
-                        // calculate the square of the absolute value
-                        // and write to file.
-                        constexpr float_X transRadUnit =
-                            SI::ELECTRON_CHARGE_SI * SI::ELECTRON_CHARGE_SI *
-                            ( 1.0 / ( 4 * PI * SI::EPS0_SI * PI * PI * SI::SPEED_OF_LIGHT_SI ) );
-                        outFile <<
-                            values[
-                                index_direction * transitionRadiation::frequencies::nOmega + index_omega
-                            ] * transRadUnit << "\t";
-
-                    } // for loop over all frequencies
-
-                    outFile << std::endl;
-                } // for loop over all frequencies
-
-                outFile.flush( );
-                outFile << std::endl; //now all data are written to file
-
-                if ( outFile.fail( ) )
-                    std::cerr << "Error on flushing file [" << name << "]. " << std::endl;
-
-                outFile.close( );
-            }
-        }
-
-        /** Kernel call
-         *
-         * Executes the particle filter and calls the transition radiation kernel
-         * of the kernel file.
-         *
-         * @param currentStep current simulation iteration step
-         */
-        void
-        calculateTransitionRadiation(
-            uint32_t currentStep
-        )
-        {
-            DataConnector &dc = Environment< >::get( ).DataConnector( );
-            auto particles = dc.get< T_ParticlesType >(
-                T_ParticlesType::FrameType::getName( ),
-                true
-            );
-
-            /* execute the particle filter */
-            transitionRadiation::executeParticleFilter( particles, currentStep );
-
-            const auto gridDim_rad = transitionRadiation::parameters::nObserver;
-
-            /* number of threads per block = number of cells in a super cell
-            *          = number of particles in a Frame
-            *          (THIS IS PIConGPU SPECIFIC)
-            * A Frame is the entity that stores particles.
-            * A super cell can have many Frames.
-            * Particles in a Frame can be accessed in parallel.
-            */
-
-            // Some funny things that make it possible for the kernel to calculate
-            // the absolute position of the particles
-            DataSpace< simDim > localSize( cellDescription->getGridLayout( ).getDataSpaceWithoutGuarding( ) );
-            const uint32_t numSlides = MovingWindow::getInstance( ).getSlideCounter( currentStep );
-            const SubGrid< simDim >& subGrid = Environment< simDim >::get( ).SubGrid( );
-            DataSpace< simDim > globalOffset( subGrid.getLocalDomain( ).offset );
-            globalOffset.y( ) += ( localSize.y( ) * numSlides );
-
-            constexpr uint32_t numWorkers = pmacc::traits::GetNumWorkers<
-                pmacc::math::CT::volume< SuperCellSize >::type::value
-            >::value;
-
-            // PIC-like kernel call of the radiation kernel
-            PMACC_KERNEL( KernelTransRadParticles<
-                numWorkers
-            >{ } )(
-                gridDim_rad,
-                numWorkers
-            )(
-                /*Pointer to particles memory on the device*/
-                particles->getDeviceParticlesBox( ),
-
-                /*Pointer to memory of radiated amplitude on the device*/
-                incTransRad->getDeviceBuffer( ).getDataBox( ),
-                cohTransRadPara->getDeviceBuffer( ).getDataBox( ),
-                cohTransRadPerp->getDeviceBuffer( ).getDataBox( ),
-                numParticles->getDeviceBuffer( ).getDataBox( ),
-                globalOffset,
-                *cellDescription,
-                freqFkt,
-                subGrid.getGlobalDomain( ).size
-            );
-
-            dc.releaseData( T_ParticlesType::FrameType::getName( ) );
-        }
-    };
-
-} // namespace transitionRadiation
-} // namespace plugins
-
-namespace particles
-{
-namespace traits
-{
-    template<
-        typename T_Species,
-        typename T_UnspecifiedSpecies
-    >
-    struct SpeciesEligibleForSolver<
-        T_Species,
-        plugins::transitionRadiation::TransitionRadiation< T_UnspecifiedSpecies >
-    >
+                        std::cerr << "Can't open file [" << name << "] for output, disable plugin output. "
+                                  << std::endl;
+                        isMaster = false; // no Master anymore -> no process is able to write
+                    }
+                    else
+                    {
+                        outFile << "# \t";
+                        outFile << transitionRadiation::frequencies::getParameters();
+                        outFile << transitionRadiation::parameters::nPhi << "\t";
+                        outFile << transitionRadiation::parameters::phiMin << "\t";
+                        outFile << transitionRadiation::parameters::phiMax << "\t";
+                        outFile << transitionRadiation::parameters::nTheta << "\t";
+                        outFile << transitionRadiation::parameters::thetaMin << "\t";
+                        outFile << transitionRadiation::parameters::thetaMax << "\t";
+                        outFile << std::endl;
+
+                        for(unsigned int index_direction = 0;
+                            index_direction < transitionRadiation::parameters::nObserver;
+                            ++index_direction) // over all directions
+                        {
+                            for(unsigned index_omega = 0; index_omega < transitionRadiation::frequencies::nOmega;
+                                ++index_omega) // over all frequencies
+                            {
+                                // Take Amplitude for one direction and frequency,
+                                // calculate the square of the absolute value
+                                // and write to file.
+                                constexpr float_X transRadUnit = SI::ELECTRON_CHARGE_SI * SI::ELECTRON_CHARGE_SI
+                                    * (1.0 / (4 * PI * SI::EPS0_SI * PI * PI * SI::SPEED_OF_LIGHT_SI));
+                                outFile
+                                    << values[index_direction * transitionRadiation::frequencies::nOmega + index_omega]
+                                        * transRadUnit
+                                    << "\t";
+
+                            } // for loop over all frequencies
+
+                            outFile << std::endl;
+                        } // for loop over all directions
+
+                        outFile.flush();
+                        outFile << std::endl; // now all data are written to file
+
+                        if(outFile.fail())
+                            std::cerr << "Error on flushing file [" << name << "]. " << std::endl;
+
+                        outFile.close();
+                    }
+                }
+
+                /** Launch the transition radiation kernel for the plugin's particle species.
+                 *
+                 * Executes the particle filter on the species and then starts KernelTransRadParticles
+                 * with the launch parameters (parameters::nObserver, numWorkers).
+                 *
+                 * @param currentStep current simulation iteration step
+                 */
+                void calculateTransitionRadiation(uint32_t currentStep)
+                {
+                    DataConnector& dc = Environment<>::get().DataConnector();
+                    auto particles = dc.get<T_ParticlesType>(T_ParticlesType::FrameType::getName(), true);
+
+                    /* execute the particle filter for this step before the kernel reads the particles */
+                    transitionRadiation::executeParticleFilter(particles, currentStep);
+
+                    const auto gridDim_rad = transitionRadiation::parameters::nObserver;
+
+                    /* number of threads per block = number of cells in a super cell
+                     *          = number of particles in a Frame
+                     *          (THIS IS PIConGPU SPECIFIC)
+                     * A Frame is the entity that stores particles.
+                     * A super cell can have many Frames.
+                     * Particles in a Frame can be accessed in parallel.
+                     */
+
+                    // Local domain offset, advanced in y by localSize.y() for every moving-window
+                    // slide, so the kernel can calculate the absolute position of the particles
+                    DataSpace<simDim> localSize(cellDescription->getGridLayout().getDataSpaceWithoutGuarding());
+                    const uint32_t numSlides = MovingWindow::getInstance().getSlideCounter(currentStep);
+                    const SubGrid<simDim>& subGrid = Environment<simDim>::get().SubGrid();
+                    DataSpace<simDim> globalOffset(subGrid.getLocalDomain().offset);
+                    globalOffset.y() += (localSize.y() * numSlides);
+
+                    constexpr uint32_t numWorkers
+                        = pmacc::traits::GetNumWorkers<pmacc::math::CT::volume<SuperCellSize>::type::value>::value;
+
+                    // PIC-like kernel call of the transition radiation kernel
+                    PMACC_KERNEL(KernelTransRadParticles<numWorkers>{})
+                    (gridDim_rad, numWorkers)(
+                        /* particle box of the species on the device */
+                        particles->getDeviceParticlesBox(),
+
+                        /* device data boxes: incoherent, coherent parallel/perpendicular, particle count */
+                        incTransRad->getDeviceBuffer().getDataBox(),
+                        cohTransRadPara->getDeviceBuffer().getDataBox(),
+                        cohTransRadPerp->getDeviceBuffer().getDataBox(),
+                        numParticles->getDeviceBuffer().getDataBox(),
+                        globalOffset,
+                        *cellDescription,
+                        freqFkt,
+                        subGrid.getGlobalDomain().size);
+
+                    dc.releaseData(T_ParticlesType::FrameType::getName());
+                }
+            };
+
+        } // namespace transitionRadiation
+    } // namespace plugins
+
+    namespace particles
     {
-        using FrameType = typename T_Species::FrameType;
-
-        // this plugin needs at least the weighting and momentum attributes
-        using RequiredIdentifiers = MakeSeq_t<
-            weighting,
-            momentum,
-            position< >
-        >;
-
-        using SpeciesHasIdentifiers = typename pmacc::traits::HasIdentifiers<
-            FrameType,
-            RequiredIdentifiers
-        >::type;
-
-        // this plugin needs a mass ratio for energy calculation from momentum
-        using SpeciesHasMass = typename pmacc::traits::HasFlag<
-            FrameType,
-            massRatio<>
-        >::type;
-
-        // transition radiation requires charged particles
-        using SpeciesHasCharge = typename pmacc::traits::HasFlag<
-            FrameType,
-            chargeRatio<>
-        >::type;
-
-        // this plugin needs the transitionRadiationMask flag
-        using SpeciesHasMask = typename pmacc::traits::HasIdentifier<
-            FrameType,
-            transitionRadiationMask
-        >::type;
-
-        using type = typename bmpl::and_<
-            SpeciesHasIdentifiers,
-            SpeciesHasMass,
-            SpeciesHasCharge,
-            SpeciesHasMask
-        >;
-    };
-} // namespace traits
-} // namespace particles
+        namespace traits
+        {
+            template<typename T_Species, typename T_UnspecifiedSpecies>
+            struct SpeciesEligibleForSolver<
+                T_Species,
+                plugins::transitionRadiation::TransitionRadiation<T_UnspecifiedSpecies>>
+            {
+                using FrameType = typename T_Species::FrameType;
+
+                // this plugin needs at least the weighting, momentum and position attributes
+                using RequiredIdentifiers = MakeSeq_t<weighting, momentum, position<>>;
+
+                using SpeciesHasIdentifiers =
+                    typename pmacc::traits::HasIdentifiers<FrameType, RequiredIdentifiers>::type;
+
+                // this plugin needs a mass ratio for energy calculation from momentum
+                using SpeciesHasMass = typename pmacc::traits::HasFlag<FrameType, massRatio<>>::type;
+
+                // transition radiation requires charged particles
+                using SpeciesHasCharge = typename pmacc::traits::HasFlag<FrameType, chargeRatio<>>::type;
+
+                // this plugin needs the transitionRadiationMask identifier (an attribute, not a flag)
+                using SpeciesHasMask = typename pmacc::traits::HasIdentifier<FrameType, transitionRadiationMask>::type;
+
+                using type =
+                    typename bmpl::and_<SpeciesHasIdentifiers, SpeciesHasMass, SpeciesHasCharge, SpeciesHasMask>;
+            };
+        } // namespace traits
+    } // namespace particles
 } // namespace picongpu
diff --git a/include/picongpu/plugins/transitionRadiation/TransitionRadiation.kernel b/include/picongpu/plugins/transitionRadiation/TransitionRadiation.kernel
index edb8706c0c..158a7952cb 100644
--- a/include/picongpu/plugins/transitionRadiation/TransitionRadiation.kernel
+++ b/include/picongpu/plugins/transitionRadiation/TransitionRadiation.kernel
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Heiko Burau, Rene Widera, Richard Pausch,
+/* Copyright 2013-2021 Axel Huebl, Heiko Burau, Rene Widera, Richard Pausch,
  *                     Klaus Steiniger, Felix Schmitt, Benjamin Worpitz,
  *                     Finn-Ole Carstens
  *
@@ -36,353 +36,302 @@
 #include <pmacc/mappings/kernel/AreaMapping.hpp>
 
 
-
 namespace picongpu
 {
-namespace plugins
-{
-namespace transitionRadiation
-{
-/** Kernel for computation of transition radiation on GPUs.
- *
- * @tparam T_numWorkers maximal CUDA threads
- * @tparam T_ParBox box with particles
- * @tparam T_DBox box with float data
- * @tparam T_DBoxComplex box with complex data
- * @tparam T_Mapping MappingDescription object
- * @tparam T_Acc alpaka accelerator
- * @param acc alpaka accelerator
- * @param incTransRad output array for storage incoherent transition radiation
- * @param cohTransRadPara output array for storage of parallel parts of coherent transition radiation
- * @param cohTransRadPerp output array for storage of perpendicular parts of coherent transition radiation
- * @param numParticles output array for amount of particles
- * @param globalOffset offset of simulation
- * @param mapper MappingDesction object
- * @param freqFkt frequency functor
- * @param simBoxSize size of simulation box
- */
-    template<
-        uint32_t T_numWorkers
-    >
-    struct KernelTransRadParticles
+    namespace plugins
     {
-        template<
-            typename T_ParBox,
-            typename T_DBox,
-            typename T_DBoxComplex, // Formfactor returns complex values
-            typename T_Mapping,
-            typename T_Acc
-        >
-        DINLINE
-        void operator( )(
-            T_Acc const & acc,
-            T_ParBox pb,
-            T_DBox incTransRad,
-            T_DBoxComplex cohTransRadPara,
-            T_DBoxComplex cohTransRadPerp,
-            T_DBox numParticles,
-            DataSpace< simDim > globalOffset,
-            T_Mapping mapper,
-            transitionRadiation::frequencies::FreqFunctor freqFkt,
-            DataSpace< simDim > simBoxSize
-        ) const
+        namespace transitionRadiation
         {
-            using namespace mappings::threads;
-            using complex_X = pmacc::math::Complex< float_X >;
-            using complex_64 = pmacc::math::Complex< float_64 >;
+            /** Kernel for computation of transition radiation on GPUs.
+             *
+             * @tparam T_numWorkers maximal CUDA threads
+             * @tparam T_ParBox box with particles
+             * @tparam T_DBox box with float data
+             * @tparam T_DBoxComplex box with complex data
+             * @tparam T_Mapping MappingDescription object
+             * @tparam T_Acc alpaka accelerator
+             * @param acc alpaka accelerator
+             * @param incTransRad output array for storage of incoherent transition radiation
+             * @param cohTransRadPara output array for storage of parallel parts of coherent transition radiation
+             * @param cohTransRadPerp output array for storage of perpendicular parts of coherent transition radiation
+             * @param numParticles output array for amount of particles
+             * @param globalOffset offset of simulation
+             * @param mapper MappingDescription object
+             * @param freqFkt frequency functor
+             * @param simBoxSize size of simulation box
+             */
+            template<uint32_t T_numWorkers>
+            struct KernelTransRadParticles
+            {
+                template<
+                    typename T_ParBox,
+                    typename T_DBox,
+                    typename T_DBoxComplex, // Formfactor returns complex values
+                    typename T_Mapping,
+                    typename T_Acc>
+                DINLINE void operator()(
+                    T_Acc const& acc,
+                    T_ParBox pb,
+                    T_DBox incTransRad,
+                    T_DBoxComplex cohTransRadPara,
+                    T_DBoxComplex cohTransRadPerp,
+                    T_DBox numParticles,
+                    DataSpace<simDim> globalOffset,
+                    T_Mapping mapper,
+                    transitionRadiation::frequencies::FreqFunctor freqFkt,
+                    DataSpace<simDim> simBoxSize) const
+                {
+                    using namespace mappings::threads;
+                    using complex_X = pmacc::math::Complex<float_X>;
+                    using complex_64 = pmacc::math::Complex<float_64>;
 
-            constexpr uint32_t frameSize = pmacc::math::CT::volume< SuperCellSize >::type::value;
-            constexpr uint32_t numWorker = T_numWorkers;
+                    constexpr uint32_t frameSize = pmacc::math::CT::volume<SuperCellSize>::type::value;
+                    constexpr uint32_t numWorker = T_numWorkers;
 
-            using FrameType = typename T_ParBox::FrameType;
-            using FramePtr = typename T_ParBox::FramePtr;
+                    using FrameType = typename T_ParBox::FrameType;
+                    using FramePtr = typename T_ParBox::FramePtr;
 
-            uint32_t const workerIdx = threadIdx.x;
+                    uint32_t const workerIdx = cupla::threadIdx(acc).x;
 
-            /* parallelized in 2 dimensions:
-            * looking direction (theta, phi)
-            * (not anymore data handling)
-            * create shared memory for particle data to reduce global memory calls
-            * every thread in a block loads one particle and every thread runs
-            * through all particles and calculates the radiation for one direction
-            * for all frequencies
-            */
-            constexpr int blockSize = pmacc::math::CT::volume< SuperCellSize >::type::value;
+                    /* parallelized in 2 dimensions:
+                     * looking direction (theta, phi)
+                     * (not anymore data handling)
+                     * create shared memory for particle data to reduce global memory calls
+                     * every thread in a block loads one particle and every thread runs
+                     * through all particles and calculates the radiation for one direction
+                     * for all frequencies
+                     */
+                    constexpr int blockSize = pmacc::math::CT::volume<SuperCellSize>::type::value;
 
-            // perpendicular part of normalized energy
-            PMACC_SMEM( acc, energyPerp_s, memory::Array< float_X, blockSize > );
+                    // perpendicular part of normalized energy
+                    PMACC_SMEM(acc, energyPerp_s, memory::Array<float_X, blockSize>);
 
-            // parallel part of normalized energy
-            PMACC_SMEM( acc, energyPara_s, memory::Array< float_X, blockSize > );
+                    // parallel part of normalized energy
+                    PMACC_SMEM(acc, energyPara_s, memory::Array<float_X, blockSize>);
 
-            // exponent of the form factor
-            PMACC_SMEM( acc, formfactorExponent_s, memory::Array< complex_X, blockSize > );
+                    // exponent of the form factor
+                    PMACC_SMEM(acc, formfactorExponent_s, memory::Array<complex_X, blockSize>);
 
-            // storage for macro particle weighting needed if
-            // the coherent and incoherent radiation of a single
-            // macro-particle needs to be considered
-            PMACC_SMEM( acc, radWeighting_s, memory::Array< float_X, blockSize > );
+                    // storage for macro particle weighting needed if
+                    // the coherent and incoherent radiation of a single
+                    // macro-particle needs to be considered
+                    PMACC_SMEM(acc, radWeighting_s, memory::Array<float_X, blockSize>);
 
-            // particle counter used if not all particles are considered for
-            // radiation calculation
-            PMACC_SMEM( acc, counter_s, int );
+                    // particle counter used if not all particles are considered for
+                    // radiation calculation
+                    PMACC_SMEM(acc, counter_s, int);
 
 
-            int const theta_idx = blockIdx.x; //blockIdx.x is used to determine theta
+                    int const theta_idx = cupla::blockIdx(acc).x; // cupla::blockIdx(acc).x is used to determine theta
 
-            // looking direction (needed for observer) used in the thread
-            float3_X const look = transitionRadiation::observationDirection( theta_idx );
+                    // looking direction (needed for observer) used in the thread
+                    float3_X const look = transitionRadiation::observationDirection(theta_idx);
 
-            // get extent of guarding super cells (needed to ignore them)
-            DataSpace< simDim > const guardingSuperCells = mapper.getGuardingSuperCells( );
+                    // get extent of guarding super cells (needed to ignore them)
+                    DataSpace<simDim> const guardingSuperCells = mapper.getGuardingSuperCells();
 
-            /* number of super cells on GPU per dimension (still including guard cells)
-            * remove both guards from count [later one sided guard needs to be added again]
-            */
-            DataSpace< simDim > const superCellsCount( mapper.getGridSuperCells( ) - 2 * guardingSuperCells );
+                    /* number of super cells on GPU per dimension (still including guard cells)
+                     * remove both guards from count [later one sided guard needs to be added again]
+                     */
+                    DataSpace<simDim> const superCellsCount(mapper.getGridSuperCells() - 2 * guardingSuperCells);
 
-            // get absolute number of relevant super cells
-            int const numSuperCells = superCellsCount.productOfComponents( );
+                    // get absolute number of relevant super cells
+                    int const numSuperCells = superCellsCount.productOfComponents();
 
-            // propagation distance for the particle bunch
-            float_X const propagationDistance = parameters::foilPosition - globalOffset[ 1 ];
+                    // propagation distance for the particle bunch
+                    float_X const propagationDistance = parameters::foilPosition - globalOffset[1];
 
 
-            /* go over all super cells on GPU
-            * but ignore all guarding supercells
-            */
-            for( int superCellIndex = 0; superCellIndex <= numSuperCells; ++superCellIndex )
-            {
-                // select SuperCell and add one sided guard again
-                DataSpace< simDim > const superCell = DataSpaceOperations< simDim >::map(
-                    superCellsCount,
-                    superCellIndex
-                ) + guardingSuperCells;
-
-
-                // -guardingSuperCells remove guarding block
-                DataSpace< simDim > const superCellOffset(
-                    globalOffset + (
-                        ( superCell - guardingSuperCells ) *
-                        SuperCellSize::toRT( )
-                    )
-                );
-
-                // pointer to  frame storing particles
-                FramePtr frame = pb.getLastFrame( superCell );
-
-                // number  of particles in current frame
-                lcellId_t particlesInFrame = pb.getSuperCell( superCell ).getSizeLastFrame( );
-
-                /* go to next supercell
-                *
-                * if "isValid" is false then there is no frame
-                * inside the superCell (anymore)
-                */
-                while( frame.isValid( ) )
-                {
-                    /* since a race condition can occur if "continue loop" is called,
-                    *  all threads must wait for the selection of a new frame
-                    *  until all threads have evaluated "isValid"
-                    */
-                    __syncthreads( );
-
-                    ForEachIdx<
-                        IdxConfig<
-                            1,
-                            numWorker
-                        >
-                    > onlyMaster{ workerIdx };
-
-                    /* The Master process (thread 0) in every thread block is in
-                    * charge of loading a frame from
-                    * the current super cell and evaluate the total number of
-                    * particles in this frame.
-                    */
-                    onlyMaster(
-                        [ & ](
-                            uint32_t const,
-                            uint32_t const
-                        )
-                        {
-                            counter_s = 0;
-                        }
-                    );
+                    /* go over all super cells on GPU
+                     * but ignore all guarding supercells
+                     */
+                    for(int superCellIndex = 0; superCellIndex <= numSuperCells; ++superCellIndex)
+                    {
+                        // select SuperCell and add one sided guard again
+                        DataSpace<simDim> const superCell
+                            = DataSpaceOperations<simDim>::map(superCellsCount, superCellIndex) + guardingSuperCells;
+
 
-                    __syncthreads( );
+                        // -guardingSuperCells remove guarding block
+                        DataSpace<simDim> const superCellOffset(
+                            globalOffset + ((superCell - guardingSuperCells) * SuperCellSize::toRT()));
 
-                    using ParticleDomCfg = IdxConfig<
-                        frameSize,
-                        numWorker
-                    >;
+                        // pointer to  frame storing particles
+                        FramePtr frame = pb.getLastFrame(superCell);
 
-                    // loop over all particles in the frame
-                    ForEachIdx< ParticleDomCfg > forEachParticle{ workerIdx };
+                        // number  of particles in current frame
+                        lcellId_t particlesInFrame = pb.getSuperCell(superCell).getSizeLastFrame();
 
-                    forEachParticle(
-                        [ & ](
-                            uint32_t const linearIdx,
-                            uint32_t const
-                        )
+                        /* go to next supercell
+                         *
+                         * if "isValid" is false then there is no frame
+                         * inside the superCell (anymore)
+                         */
+                        while(frame.isValid())
                         {
-                            // only threads with particles are running
-                            if( linearIdx < particlesInFrame )
-                            {
-                                auto par = frame[ linearIdx ];
-                                // get particle momenta
-                                float3_X const particleMomentum = par[ momentum_ ];
-                                /* initializes "saveParticleAt" flag with -1
-                                * because "counter_s" will never be -1
-                                * therefore, if a particle is saved, a value of counter
-                                * is stored in "saveParticleAt" != -1
-                                * THIS IS ACTUALLY ONLY NEEDED IF: the radiation flag was set
-                                * LATER: can this be optimized?
-                                */
-
-                                int saveParticleAt = -1;
-
-                                // only moving particles create transition radiation
-                                if( ( particleMomentum * particleMomentum ).sumOfComponents( ) > 0.0)
+                            /* since a race condition can occur if "continue loop" is called,
+                             *  all threads must wait for the selection of a new frame
+                             *  until all threads have evaluated "isValid"
+                             */
+                            cupla::__syncthreads(acc);
+
+                            ForEachIdx<IdxConfig<1, numWorker>> onlyMaster{workerIdx};
+
+                            /* A single worker (the "master") resets the counter of
+                             * selected particles ("counter_s") to zero before the
+                             * particles of the current frame are processed; the
+                             * barriers around this call order the reset for all workers.
+                             */
+                            onlyMaster([&](uint32_t const, uint32_t const) { counter_s = 0; });
+
+                            cupla::__syncthreads(acc);
+
+                            using ParticleDomCfg = IdxConfig<frameSize, numWorker>;
+
+                            // loop over all particles in the frame
+                            ForEachIdx<ParticleDomCfg> forEachParticle{workerIdx};
+
+                            forEachParticle([&](uint32_t const linearIdx, uint32_t const) {
+                                // only threads with particles are running
+                                if(linearIdx < particlesInFrame)
                                 {
-                                    if( transitionRadiation::getTransitionRadiationMask( par ) )
-                                        saveParticleAt = nvidia::atomicAllInc(
-                                            acc,
-                                            &counter_s,
-                                            ::alpaka::hierarchy::Threads{ }
-                                        );
-
-                                    /* for information:
-                                    *   atomicAdd returns an int with the previous
-                                    *   value of "counter_s" != -1
-                                    *   therefore, if a particle is selected
-                                    *   "saveParticleAt" != -1
-                                    */
-                                    // if a particle needs to be considered
-                                    if( saveParticleAt != -1 )
+                                    auto par = frame[linearIdx];
+                                    // get particle momenta
+                                    float3_X const particleMomentum = par[momentum_];
+                                    /* initializes "saveParticleAt" flag with -1
+                                     * because "counter_s" will never be -1
+                                     * therefore, if a particle is saved, a value of counter
+                                     * is stored in "saveParticleAt" != -1
+                                     * THIS IS ACTUALLY ONLY NEEDED IF: the radiation flag was set
+                                     * LATER: can this be optimized?
+                                     */
+
+                                    int saveParticleAt = -1;
+
+                                    // only moving particles create transition radiation
+                                    if((particleMomentum * particleMomentum).sumOfComponents() > 0.0)
                                     {
-                                        // calculate global position
-                                        lcellId_t const cellIdx = par[ localCellIdx_ ];
-
-                                        // position inside of the cell
-                                        floatD_X const pos = par[ position_ ];
-
-                                        // calculate global position of cell
-                                        DataSpace< simDim > const globalPos(
-                                            superCellOffset +
-                                            DataSpaceOperations< simDim >::
-                                                template map< SuperCellSize >( cellIdx )
-                                        );
-
-                                        // add global position of cell with local position of particle in cell
-                                        float3_X particleLocation;
-                                        // set z component to zero in case of simDim==DIM2
-                                        particleLocation[ 2 ] = 0.0;
-                                        // run over all components and compute gobal position
-                                        for( int i = 0; i < simDim; ++i )
-                                            particleLocation[ i ] = (
-                                                float_X( globalPos[ i ] ) + pos[ i ]
-                                            ) * cellSize[ i ];
-
-                                        /* get macro-particle weighting
-                                        *
-                                        * Info:
-                                        * the weighting is the number of real particles described
-                                        * by a macro-particle
-                                        */
-                                        float_X const weighting = par[ weighting_ ];
-                                        radWeighting_s[ saveParticleAt ] = weighting;
-
-                                        // mass of macro-particle
-                                        float_X const particleMass = attribute::getMass(
-                                            weighting,
-                                            par
-                                        );
-
-                                        // using transition radiation particle class
-                                        transitionRadiation::Particle particle(
-                                            particleLocation,
-                                            particleMomentum,
-                                            particleMass
-                                        );
-
-                                        // only propagate particles if it is set up in transitionRadiation.param
-                                        if(  parameters::foilPosition != 0.0 )
-                                            particle.propagate( propagationDistance );
-
-                                        // create calculator for TR calculations
-                                        transitionRadiation::Calculator const calculator =  transitionRadiation::Calculator(
-                                            particle,
-                                            look
-                                        );
-
-                                        // calculate values for transition radiation
-                                        energyPara_s[ saveParticleAt ] = calculator.calcEnergyPara( );
-
-                                        energyPerp_s[ saveParticleAt ] = calculator.calcEnergyPerp( );
-
-                                        formfactorExponent_s[ saveParticleAt ] = calculator.calcFormFactorExponent( );
-                                    }
-                                } // only moving particles
-                            } // only threads with particle
-                        }
-                    ); // for each particle
-                    __syncthreads( );
-
-                    // run over all  valid omegas for this thread
-                    for( int o = workerIdx; o < transitionRadiation::frequencies::nOmega; o += T_numWorkers )
-                    {
-                        float_X itrSum = 0.0;
-                        float_X totalParticles = 0.0;
-                        complex_X ctrSumPara = complex_X( 0.0, 0.0 );
-                        complex_X ctrSumPerp = complex_X( 0.0, 0.0 );
+                                        if(transitionRadiation::getTransitionRadiationMask(par))
+                                            saveParticleAt = nvidia::atomicAllInc(
+                                                acc,
+                                                &counter_s,
+                                                ::alpaka::hierarchy::Threads{});
+
+                                        /* for information:
+                                         *   the atomic increment above returns an int with the
+                                         *   previous value of "counter_s", which is never -1;
+                                         *   therefore, if a particle is selected
+                                         *   "saveParticleAt" != -1
+                                         */
+                                        // if a particle needs to be considered
+                                        if(saveParticleAt != -1)
+                                        {
+                                            // calculate global position
+                                            lcellId_t const cellIdx = par[localCellIdx_];
+
+                                            // position inside of the cell
+                                            floatD_X const pos = par[position_];
+
+                                            // calculate global position of cell
+                                            DataSpace<simDim> const globalPos(
+                                                superCellOffset
+                                                + DataSpaceOperations<simDim>::template map<SuperCellSize>(cellIdx));
+
+                                            // add global position of cell with local position of particle in cell
+                                            float3_X particleLocation;
+                                            // set z component to zero in case of simDim==DIM2
+                                            particleLocation[2] = 0.0;
+                                            // run over all components and compute global position
+                                            for(int i = 0; i < simDim; ++i)
+                                                particleLocation[i] = (float_X(globalPos[i]) + pos[i]) * cellSize[i];
+
+                                            /* get macro-particle weighting
+                                             *
+                                             * Info:
+                                             * the weighting is the number of real particles described
+                                             * by a macro-particle
+                                             */
+                                            float_X const weighting = par[weighting_];
+                                            radWeighting_s[saveParticleAt] = weighting;
+
+                                            // mass of macro-particle
+                                            float_X const particleMass = attribute::getMass(weighting, par);
+
+                                            // using transition radiation particle class
+                                            transitionRadiation::Particle particle(
+                                                particleLocation,
+                                                particleMomentum,
+                                                particleMass);
+
+                                            // only propagate particles if it is set up in transitionRadiation.param
+                                            if(parameters::foilPosition != 0.0)
+                                                particle.propagate(propagationDistance);
+
+                                            // create calculator for TR calculations
+                                            transitionRadiation::Calculator const calculator
+                                                = transitionRadiation::Calculator(particle, look);
+
+                                            // calculate values for transition radiation
+                                            energyPara_s[saveParticleAt] = calculator.calcEnergyPara();
+
+                                            energyPerp_s[saveParticleAt] = calculator.calcEnergyPerp();
+
+                                            formfactorExponent_s[saveParticleAt] = calculator.calcFormFactorExponent();
+                                        }
+                                    } // only moving particles
+                                } // only threads with particle
+                            }); // for each particle
+                            cupla::__syncthreads(acc);
+
+                            // run over all  valid omegas for this thread
+                            for(int o = workerIdx; o < transitionRadiation::frequencies::nOmega; o += T_numWorkers)
+                            {
+                                float_X itrSum = 0.0;
+                                float_X totalParticles = 0.0;
+                                complex_X ctrSumPara = complex_X(0.0, 0.0);
+                                complex_X ctrSumPerp = complex_X(0.0, 0.0);
 
-                        // create a form factor object for physical correct coherence effects within macro-particles
-                        macroParticleFormFactor::radFormFactor const macroParticleFormFactor{ };
+                                // create a form factor object for physically correct coherence effects within
+                                // macro-particles
+                                macroParticleFormFactor::radFormFactor const macroParticleFormFactor{};
 
-                        for( int j = 0; j < counter_s; ++j )
-                        {
-                            float_X const omega = freqFkt( o );
-                            complex_X const formfactor = transitionRadiation::calcFormFactor(
-                                omega,
-                                formfactorExponent_s[ j ]
-                            ) * macroParticleFormFactor(
-                                radWeighting_s[ j ],
-                                omega,
-                                look
-                            );
-
-                            itrSum += radWeighting_s[ j ] * (
-                                energyPerp_s[ j ] * energyPerp_s[ j ] +
-                                energyPara_s[ j ] * energyPara_s[ j ]
-                            );
-                            totalParticles += radWeighting_s[ j ];
-
-                            ctrSumPara += energyPara_s[ j ] * formfactor;
-                            ctrSumPerp += energyPerp_s[ j ] * formfactor;
-                        }
-
-                        int const index = theta_idx * transitionRadiation::frequencies::nOmega + o;
-                        incTransRad[ index ] += itrSum;
-                        numParticles[ index ] += totalParticles;
-                        cohTransRadPara[ index ] += ctrSumPara;
-                        cohTransRadPerp[ index ] += ctrSumPerp;
-                    }
-
-                    __syncthreads( );
-
-                    /* First threads starts loading next frame of the super-cell:
-                    *
-                    * Info:
-                    *   The calculation starts with the last SuperCell (must not be full filled)
-                    *   all previous SuperCells are full with particles
-                    */
-                    particlesInFrame = frameSize;
-                    frame = pb.getPreviousFrame( frame );
-                } // while frame is valid
-            } // for all supercells
-        }
-    }; // struct KernelTransRad
-
-} // namespace transitionRadiation
-} // namespace plugins
+                                for(int j = 0; j < counter_s; ++j)
+                                {
+                                    float_X const omega = freqFkt(o);
+                                    complex_X const formfactor
+                                        = transitionRadiation::calcFormFactor(omega, formfactorExponent_s[j])
+                                        * macroParticleFormFactor(radWeighting_s[j], omega, look);
+
+                                    itrSum += radWeighting_s[j]
+                                        * (energyPerp_s[j] * energyPerp_s[j] + energyPara_s[j] * energyPara_s[j]);
+                                    totalParticles += radWeighting_s[j];
+
+                                    ctrSumPara += energyPara_s[j] * formfactor;
+                                    ctrSumPerp += energyPerp_s[j] * formfactor;
+                                }
+
+                                int const index = theta_idx * transitionRadiation::frequencies::nOmega + o;
+                                incTransRad[index] += itrSum;
+                                numParticles[index] += totalParticles;
+                                cohTransRadPara[index] += ctrSumPara;
+                                cohTransRadPerp[index] += ctrSumPerp;
+                            }
+
+                            cupla::__syncthreads(acc);
+
+                            /* Move on to the previous frame of the super-cell:
+                             *
+                             * Info:
+                             *   The calculation starts with the last frame (which need not be completely
+                             *   filled); all previous frames are treated as full (frameSize particles)
+                             */
+                            particlesInFrame = frameSize;
+                            frame = pb.getPreviousFrame(frame);
+                        } // while frame is valid
+                    } // for all supercells
+                }
+            }; // struct KernelTransRad
+
+        } // namespace transitionRadiation
+    } // namespace plugins
 } // namespace picongpu
diff --git a/include/picongpu/plugins/transitionRadiation/frequencies/LinearFrequencies.hpp b/include/picongpu/plugins/transitionRadiation/frequencies/LinearFrequencies.hpp
index 022a0cb0d2..d598bb5f4c 100644
--- a/include/picongpu/plugins/transitionRadiation/frequencies/LinearFrequencies.hpp
+++ b/include/picongpu/plugins/transitionRadiation/frequencies/LinearFrequencies.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera, Richard Pausch, Finn-Ole Carstens
+/* Copyright 2013-2021 Heiko Burau, Rene Widera, Richard Pausch, Finn-Ole Carstens
  *
  * This file is part of PIConGPU.
  *
@@ -24,58 +24,60 @@
 
 namespace picongpu
 {
-namespace plugins
-{
-namespace transitionRadiation
-{
-namespace linearFrequencies
-{
-    class FreqFunctor
+    namespace plugins
     {
-    public:
-        FreqFunctor( void )
-        { }
-
-        HDINLINE float_X operator( )( const int ID )
+        namespace transitionRadiation
         {
-            return omegaMin + float_X( ID ) * deltaOmega;
-        }
+            namespace linearFrequencies
+            {
+                class FreqFunctor
+                {
+                public:
+                    FreqFunctor(void)
+                    {
+                    }
 
-        HINLINE float_X get( const int ID )
-        {
-            return operator( )( ID );
-        }
-    }; // FreqFunctor
+                    HDINLINE float_X operator()(const int ID)
+                    {
+                        return omegaMin + float_X(ID) * deltaOmega;
+                    }
 
-    class InitFreqFunctor
-    {
-        public:
-            InitFreqFunctor( void )
-            { }
+                    HINLINE float_X get(const int ID)
+                    {
+                        return operator()(ID);
+                    }
+                }; // FreqFunctor
 
-            HINLINE void Init( const std::string path )
-            { }
+                class InitFreqFunctor
+                {
+                public:
+                    InitFreqFunctor(void)
+                    {
+                    }
 
+                    HINLINE void Init(const std::string path)
+                    {
+                    }
 
-        HINLINE FreqFunctor getFunctor( void )
-        {
-            return FreqFunctor( );
-        }
-    }; // InitFreqFunctor
 
-    //! @return frequency params as string
-    HINLINE
-    std::string
-    getParameters( void )
-    {
-        std::string params = std::string( "lin\t" );
-        params += std::to_string( nOmega ) + "\t";
-        params += std::to_string( SI::omegaMin ) + "\t";
-        params += std::to_string( SI::omegaMax ) + "\t";
-        return params;
-    }
+                    HINLINE FreqFunctor getFunctor(void)
+                    {
+                        return FreqFunctor();
+                    }
+                }; // InitFreqFunctor
+
+                //! @return frequency params as string
+                HINLINE
+                std::string getParameters(void)
+                {
+                    std::string params = std::string("lin\t");
+                    params += std::to_string(nOmega) + "\t";
+                    params += std::to_string(SI::omegaMin) + "\t";
+                    params += std::to_string(SI::omegaMax) + "\t";
+                    return params;
+                }
 
-} // namespace linearFrequencies
-} // namespace transitionRadiation
-} // namespace plugins
+            } // namespace linearFrequencies
+        } // namespace transitionRadiation
+    } // namespace plugins
 } // namespace picongpu
diff --git a/include/picongpu/plugins/transitionRadiation/frequencies/ListFrequencies.hpp b/include/picongpu/plugins/transitionRadiation/frequencies/ListFrequencies.hpp
index 2a18c5617d..e98be46953 100644
--- a/include/picongpu/plugins/transitionRadiation/frequencies/ListFrequencies.hpp
+++ b/include/picongpu/plugins/transitionRadiation/frequencies/ListFrequencies.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera, Richard Pausch, Axel Huebl, Finn-Ole Carstens
+/* Copyright 2013-2021 Heiko Burau, Rene Widera, Richard Pausch, Axel Huebl, Finn-Ole Carstens
  *
  * This file is part of PIConGPU.
  *
@@ -27,115 +27,115 @@
 
 namespace picongpu
 {
-namespace plugins
-{
-namespace transitionRadiation
-{
-namespace listFrequencies
-{
-    class FreqFunctor
+    namespace plugins
     {
-    public:
-
-        typedef GridBuffer< float_X, DIM1 >::DataBoxType DBoxType;
-
-        FreqFunctor( void )
-        { }
-
-        template< typename T >
-        FreqFunctor( T frequencies_handed )
-        {
-            this->frequencies_dev = frequencies_handed->getDeviceBuffer( ).getDataBox( );
-            this->frequencies_host = frequencies_handed->getHostBuffer( ).getDataBox( );
-        }
-
-        HDINLINE float_X operator( )( const unsigned int ID )
-        {
-            return ( ID < frequencies::nOmega ) ?  frequencies_dev[ ID ] : 0.0;
-        }
-
-        HINLINE float_X get( const unsigned int ID )
-        {
-            return ( ID < frequencies::nOmega ) ?  frequencies_host[ ID ] : 0.0;
-        }
-
-    private:
-        DBoxType frequencies_dev;
-        DBoxType frequencies_host;
-    }; // FreqFunctor
-
-
-
-    class InitFreqFunctor
-    {
-    public:
-        InitFreqFunctor( void )
-        { }
-
-        ~InitFreqFunctor( void )
-        {
-            __delete( frequencyBuffer );
-        }
-
-        typedef GridBuffer< picongpu::float_X, DIM1 >::DataBoxType DBoxType;
-
-        HINLINE void Init( const std::string path )
+        namespace transitionRadiation
         {
-
-            frequencyBuffer = new GridBuffer< float_X, DIM1 >( DataSpace< DIM1 >( nOmega ) );
-
-
-            DBoxType frequencyDB = frequencyBuffer->getHostBuffer( ).getDataBox( );
-
-            std::ifstream freqListFile( path.c_str( ) );
-            unsigned int i;
-
-            printf( "freq: %s\n", path.c_str( ) );
-
-            if( !freqListFile )
-            {
-                throw std::runtime_error( std::string( "The radiation-frequency-file " ) +
-                    path + std::string( " could not be found.\n" ) );
-            }
-
-
-            for( i = 0; i < nOmega && !freqListFile.eof( ); ++i )
+            namespace listFrequencies
             {
-                freqListFile >> frequencyDB[ i ];
-                // verbose output of loaded frequencies if verbose level PHYSICS is set:
-                log< plugins::radiation::PIConGPUVerboseRadiation::PHYSICS >("freq: %1% \t %2%") % i % frequencyDB[ i ];
-                frequencyDB[ i ] *= UNIT_TIME;
-            }
-
-            if( i != nOmega )
-            {
-                throw std::runtime_error( std::string( "The number of frequencies in the list and the number of frequencies in the parameters differ.\n" ) );
-            }
-
-            frequencyBuffer->hostToDevice( );
-
-        }
-
-        FreqFunctor getFunctor( void )
-        {
-            return FreqFunctor( frequencyBuffer );
-        }
-
-    private:
-        GridBuffer< float_X, DIM1 > * frequencyBuffer = nullptr;
-    }; // InitFreqFunctor
-
-    //! @return frequency params as string
-    HINLINE
-    std::string
-    getParameters( )
-    {
-        std::string params = std::string( "list\t" );
-        params += std::string( listLocation ) + std::string( "\t" );
-        return params;
-    }
-
-} // namespace listFrequencies
-} // namespace transitionRadiation
-} // namespace plugins
+                class FreqFunctor
+                {
+                public:
+                    typedef GridBuffer<float_X, DIM1>::DataBoxType DBoxType;
+
+                    FreqFunctor(void)
+                    {
+                    }
+
+                    template<typename T>
+                    FreqFunctor(T frequencies_handed)
+                    {
+                        this->frequencies_dev = frequencies_handed->getDeviceBuffer().getDataBox();
+                        this->frequencies_host = frequencies_handed->getHostBuffer().getDataBox();
+                    }
+
+                    HDINLINE float_X operator()(const unsigned int ID)
+                    {
+                        return (ID < frequencies::nOmega) ? frequencies_dev[ID] : 0.0;
+                    }
+
+                    HINLINE float_X get(const unsigned int ID)
+                    {
+                        return (ID < frequencies::nOmega) ? frequencies_host[ID] : 0.0;
+                    }
+
+                private:
+                    DBoxType frequencies_dev;
+                    DBoxType frequencies_host;
+                }; // FreqFunctor
+
+
+                class InitFreqFunctor
+                {
+                public:
+                    InitFreqFunctor(void)
+                    {
+                    }
+
+                    ~InitFreqFunctor(void)
+                    {
+                        __delete(frequencyBuffer);
+                    }
+
+                    typedef GridBuffer<picongpu::float_X, DIM1>::DataBoxType DBoxType;
+
+                    HINLINE void Init(const std::string path)
+                    {
+                        frequencyBuffer = new GridBuffer<float_X, DIM1>(DataSpace<DIM1>(nOmega));
+
+
+                        DBoxType frequencyDB = frequencyBuffer->getHostBuffer().getDataBox();
+
+                        std::ifstream freqListFile(path.c_str());
+                        unsigned int i;
+
+                        printf("freq: %s\n", path.c_str());
+
+                        if(!freqListFile)
+                        {
+                            throw std::runtime_error(
+                                std::string("The radiation-frequency-file ") + path
+                                + std::string(" could not be found.\n"));
+                        }
+
+
+                        for(i = 0; i < nOmega && !freqListFile.eof(); ++i)
+                        {
+                            freqListFile >> frequencyDB[i];
+                            // verbose output of loaded frequencies if verbose level PHYSICS is set:
+                            log<plugins::radiation::PIConGPUVerboseRadiation::PHYSICS>("freq: %1% \t %2%") % i
+                                % frequencyDB[i];
+                            frequencyDB[i] *= UNIT_TIME;
+                        }
+
+                        if(i != nOmega)
+                        {
+                            throw std::runtime_error(std::string("The number of frequencies in the list and the "
+                                                                 "number of frequencies in the parameters differ.\n"));
+                        }
+
+                        frequencyBuffer->hostToDevice();
+                    }
+
+                    FreqFunctor getFunctor(void)
+                    {
+                        return FreqFunctor(frequencyBuffer);
+                    }
+
+                private:
+                    GridBuffer<float_X, DIM1>* frequencyBuffer = nullptr;
+                }; // InitFreqFunctor
+
+                //! @return frequency params as string
+                HINLINE
+                std::string getParameters()
+                {
+                    std::string params = std::string("list\t");
+                    params += std::string(listLocation) + std::string("\t");
+                    return params;
+                }
+
+            } // namespace listFrequencies
+        } // namespace transitionRadiation
+    } // namespace plugins
 } // namespace picongpu
diff --git a/include/picongpu/plugins/transitionRadiation/frequencies/LogFrequencies.hpp b/include/picongpu/plugins/transitionRadiation/frequencies/LogFrequencies.hpp
index c669aba896..2f6ac9d5c9 100644
--- a/include/picongpu/plugins/transitionRadiation/frequencies/LogFrequencies.hpp
+++ b/include/picongpu/plugins/transitionRadiation/frequencies/LogFrequencies.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera, Richard Pausch, Finn-Ole Carstens
+/* Copyright 2013-2021 Heiko Burau, Rene Widera, Richard Pausch, Finn-Ole Carstens
  *
  * This file is part of PIConGPU.
  *
@@ -24,67 +24,68 @@
 
 namespace picongpu
 {
-namespace plugins
-{
-namespace transitionRadiation
-{
-namespace logFrequencies
-{
-    class FreqFunctor
+    namespace plugins
     {
-    public:
-        FreqFunctor( void )
+        namespace transitionRadiation
         {
-            omega_log_min = math::log( omegaMin );
-            delta_omega_log = ( math::log( omegaMax ) - omega_log_min ) / float_X( nOmega - 1 );
-        }
+            namespace logFrequencies
+            {
+                class FreqFunctor
+                {
+                public:
+                    FreqFunctor(void)
+                    {
+                        omega_log_min = math::log(omegaMin);
+                        delta_omega_log = (math::log(omegaMax) - omega_log_min) / float_X(nOmega - 1);
+                    }
 
-        HDINLINE float_X operator( )( const int ID )
-        {
-            return  math::exp( omega_log_min + ( float_X( ID ) ) * delta_omega_log );
-        }
+                    HDINLINE float_X operator()(const int ID)
+                    {
+                        return math::exp(omega_log_min + (float_X(ID)) * delta_omega_log);
+                    }
 
-        HINLINE float_X get( const int ID )
-        {
-            return operator( )( ID );
-        }
+                    HINLINE float_X get(const int ID)
+                    {
+                        return operator()(ID);
+                    }
 
-    private:
-        float_X omega_log_min;
-        float_X delta_omega_log;
-    }; // FreqFunctor
+                private:
+                    float_X omega_log_min;
+                    float_X delta_omega_log;
+                }; // FreqFunctor
 
 
-    class InitFreqFunctor
-    {
-    public:
-        InitFreqFunctor( void )
-        { }
+                class InitFreqFunctor
+                {
+                public:
+                    InitFreqFunctor(void)
+                    {
+                    }
 
-        HINLINE void Init( const std::string path )
-        { }
+                    HINLINE void Init(const std::string path)
+                    {
+                    }
 
 
-        HINLINE FreqFunctor getFunctor( void )
-        {
-            return FreqFunctor( );
-        }
-    }; // InitFreqFunctor
+                    HINLINE FreqFunctor getFunctor(void)
+                    {
+                        return FreqFunctor();
+                    }
+                }; // InitFreqFunctor
 
 
-    //! @return frequency params as string
-    HINLINE
-    std::string
-    getParameters( void )
-    {
-        std::string params = std::string( "log\t" );
-        params += std::to_string( nOmega ) + "\t";
-        params += std::to_string( SI::omegaMin ) + "\t";
-        params += std::to_string( SI::omegaMax ) + "\t";
-        return params;
-    }
+                //! @return frequency params as string
+                HINLINE
+                std::string getParameters(void)
+                {
+                    std::string params = std::string("log\t");
+                    params += std::to_string(nOmega) + "\t";
+                    params += std::to_string(SI::omegaMin) + "\t";
+                    params += std::to_string(SI::omegaMax) + "\t";
+                    return params;
+                }
 
-} // namespace logFrequencies
-} // namespace transitionRadiation
-} // namespace plugins
+            } // namespace logFrequencies
+        } // namespace transitionRadiation
+    } // namespace plugins
 } // namespace picongpu
diff --git a/include/picongpu/plugins/xrayScattering/DetermineElectronDensitySolver.hpp b/include/picongpu/plugins/xrayScattering/DetermineElectronDensitySolver.hpp
new file mode 100644
index 0000000000..626962b6c1
--- /dev/null
+++ b/include/picongpu/plugins/xrayScattering/DetermineElectronDensitySolver.hpp
@@ -0,0 +1,65 @@
+/* Copyright 2020-2021 Pawel Ordyna
+ *
+ * This file is part of PIConGPU.
+ *
+ * PIConGPU is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * PIConGPU is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with PIConGPU.
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+#pragma once
+
+#include "picongpu/simulation_defines.hpp"
+#include <pmacc/traits/HasFlag.hpp>
+#include "picongpu/particles/particleToGrid/derivedAttributes/DerivedAttributes.def"
+
+namespace picongpu
+{
+    namespace plugins
+    {
+        namespace xrayScattering
+        {
+            using namespace particles::particleToGrid;
+
+            template<typename T_ParticleType>
+            struct IsIon // ::type is the HasFlag result for the boundElectrons flag on the species' frame
+            {
+                using FrameType = typename T_ParticleType::FrameType; // frame type of the queried species
+                using type = typename pmacc::traits::HasFlag<FrameType, boundElectrons>::type;
+            };
+
+
+            /** Choose an electron density solver for a given particle type.
+             *
+             * Switches between a bound electron number density solver for particles
+             * with the boundElectrons attribute (ions) and a particle number density
+             * solver for other particle types (electrons).
+             *
+             * @tparam T_ParticlesType Scattering particles
+             * @return ::type TmpField solver to be used (IonSolver if IsIon holds, else ElectronSolver)
+             */
+            template<typename T_ParticlesType>
+            struct DetermineElectronDensitySolver
+            {
+                using IonSolver =
+                    typename CreateFieldTmpOperation_t<T_ParticlesType, derivedAttributes::BoundElectronDensity>::
+                        Solver;
+
+                using ElectronSolver =
+                    typename CreateFieldTmpOperation_t<T_ParticlesType, derivedAttributes::Density>::Solver;
+
+                using type =
+                    typename boost::mpl::if_<typename IsIon<T_ParticlesType>::type, IonSolver, ElectronSolver>::type;
+            };
+        } // namespace xrayScattering
+    } // namespace plugins
+} // namespace picongpu
diff --git a/include/picongpu/plugins/xrayScattering/GetScatteringVector.hpp b/include/picongpu/plugins/xrayScattering/GetScatteringVector.hpp
new file mode 100644
index 0000000000..d9fcc8bc2e
--- /dev/null
+++ b/include/picongpu/plugins/xrayScattering/GetScatteringVector.hpp
@@ -0,0 +1,74 @@
+/* Copyright 2020-2021 Pawel Ordyna
+ *
+ * This file is part of PIConGPU.
+ *
+ * PIConGPU is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * PIConGPU is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with PIConGPU.
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "picongpu/simulation_defines.hpp"
+#include <pmacc/math/Vector.hpp>
+
+namespace picongpu
+{
+    namespace plugins
+    {
+        namespace xrayScattering
+        {
+            //! Maps a 1D output array index to the corresponding point in the q-space.
+            struct GetScatteringVector
+            {
+                /**
+                 * @param q_min Begin of the output range for all axes.
+                 * @param q_max End of the output range for all axes (stored, not used by operator[]).
+                 * @param q_step Output array grid spacing.
+                 * @param numVectors The output array size.
+                 * @param iterOffset Offset for an index shift.
+                 */
+                HDINLINE GetScatteringVector(
+                    float2_X const q_min,
+                    float2_X const q_max,
+                    float2_X const q_step,
+                    DataSpace<DIM2> const numVectors,
+                    uint32_t const iterOffset)
+                    : m_q_min(q_min)
+                    , m_q_max(q_max)
+                    , m_q_step(q_step)
+                    , m_numVectors(numVectors)
+                    , m_iterOffset(iterOffset)
+                {
+                }
+
+                //! @return q_min + q_step * (i_x, i_y) for the 1D index idx + iterOffset; i_y varies fastest.
+                HDINLINE float2_X operator[](const uint32_t& idx) const
+                {
+                    const uint32_t totalIdx = idx + m_iterOffset;
+                    const uint32_t i_y(totalIdx % m_numVectors.y());
+                    const uint32_t i_x(totalIdx / m_numVectors.y());
+                    return m_q_min + m_q_step * float2_X(i_x, i_y);
+                }
+
+            private:
+                // Pmacc struct members memory alignment for objects stored on devices.
+                PMACC_ALIGN(m_q_min, const float2_X);
+                PMACC_ALIGN(m_q_max, const float2_X);
+                PMACC_ALIGN(m_q_step, const float2_X);
+                PMACC_ALIGN(m_numVectors, const DataSpace<DIM2>);
+                PMACC_ALIGN(m_iterOffset, const uint32_t);
+            };
+        } // namespace xrayScattering
+    } // namespace plugins
+} // namespace picongpu
diff --git a/include/picongpu/plugins/xrayScattering/XrayScattering.hpp b/include/picongpu/plugins/xrayScattering/XrayScattering.hpp
new file mode 100644
index 0000000000..9bbcf08892
--- /dev/null
+++ b/include/picongpu/plugins/xrayScattering/XrayScattering.hpp
@@ -0,0 +1,691 @@
+/* Copyright 2013-2021 Axel Huebl, Heiko Burau, Rene Widera, Richard Pausch,
+ *                     Klaus Steiniger, Felix Schmitt, Benjamin Worpitz,
+ *                     Juncheng E, Pawel Ordyna
+ *
+ * This file is part of PIConGPU.
+ *
+ * PIConGPU is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * PIConGPU is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with PIConGPU.
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+#pragma once
+
+#include "picongpu/simulation_defines.hpp"
+#include "picongpu/particles/traits/SpeciesEligibleForSolver.hpp"
+#include "picongpu/plugins/ISimulationPlugin.hpp"
+#include "picongpu/plugins/common/stringHelpers.hpp"
+
+#include "picongpu/fields/FieldTmp.hpp"
+#include "picongpu/param/xrayScattering.param"
+#include "picongpu/plugins/xrayScattering/beam/XrayScatteringBeam.hpp"
+#include "picongpu/plugins/xrayScattering/XrayScattering.kernel"
+#include "picongpu/plugins/xrayScattering/XrayScatteringWriter.hpp"
+#include "picongpu/plugins/xrayScattering/xrayScatteringUtilities.hpp"
+#include "picongpu/plugins/xrayScattering/GetScatteringVector.hpp"
+#include "picongpu/plugins/xrayScattering/DetermineElectronDensitySolver.hpp"
+#include "picongpu/particles/particleToGrid/derivedAttributes/Density.def"
+
+#include <pmacc/dataManagement/DataConnector.hpp>
+#include <pmacc/dimensions/DataSpaceOperations.hpp>
+#include <pmacc/mappings/kernel/AreaMapping.hpp>
+#include <pmacc/mpi/MPIReduce.hpp>
+#include <pmacc/mpi/reduceMethods/Reduce.hpp>
+#include <pmacc/nvidia/functors/Add.hpp>
+#include <pmacc/traits/GetNumWorkers.hpp>
+#include <pmacc/traits/HasFlag.hpp>
+#include <pmacc/assert.hpp>
+
+#include <boost/filesystem.hpp>
+#include <boost/mpl/bool.hpp>
+
+#include <cstdlib>
+#include <fstream>
+#include <iostream>
+#include <string>
+#include <vector>
+#include <cstdint>
+#include <memory>
+#include <map>
+
+namespace picongpu
+{
+    namespace plugins
+    {
+        namespace xrayScattering
+        {
+            using namespace pmacc;
+            using namespace picongpu::SI;
+            namespace po = boost::program_options;
+            using complex_X = pmacc::math::Complex<float_X>;
+
+
+            /** xrayScattering plugin
+             * This plugin simulates the SAXS scattering amplitude
+             * from the particle number density.
+             *
+             * @tparam T_ParticlesType Scatterers
+             **/
+            template<typename T_ParticlesType>
+            class XrayScattering : public ISimulationPlugin
+            {
+            private:
+                using SuperCellSize = MappingDesc::SuperCellSize;
+
+                MappingDesc cellDescription;
+                uint32_t currentStep;
+
+                //! Probing beam characterization
+                std::unique_ptr<beam::XrayScatteringBeam> probingBeam;
+
+                // memory:
+                using ComplexBuffer = GridBuffer<complex_X, DIM1>;
+                std::unique_ptr<ComplexBuffer> amplitude;
+                // Needed as long as openPMD-api doesn't support complex values:
+                //! Storage for amplitude real part used when dumping data
+                std::vector<float_X> realPart;
+                //! Storage for amplitude imaginary part used when dumping data
+                std::vector<float_X> imgPart;
+                // Used only in the distributed mode:
+                //! Storage for receiving amplitude data from another node
+                std::vector<complex_X> amplitudeReceive;
+                //! Number of scattering vectors on the initially last rank
+                uint64_t resOfVectors;
+                // Used only in the mirrored mode:
+                std::vector<complex_X> amplitudeMaster;
+
+                // Variables for plugin options:
+                std::string notifyPeriod;
+                std::string speciesName;
+                std::string pluginName;
+                std::string pluginPrefix;
+                std::string fileName;
+                std::string fileExtension;
+                std::string compressionMethod;
+                std::string outputPeriod_s;
+                std::string memoryLayout;
+                //! Plugin functioning mode
+                OutputMemoryLayout outputLayout;
+                //! Time steps at which the output is dumped
+                using SeqOfTimeSlices = std::vector<pluginSystem::TimeSlice>;
+                SeqOfTimeSlices outputPeriod;
+
+                /** Range of scattering vector
+                 * The scattering vector here is defined as
+                 * 4*pi*sin(theta)/lambda, where 2 * theta is the angle between the
+                 * incoming k-vector and the scattered one.
+                 * See the definition in this paper https://doi.org/10.1063/1.5008289.
+                 **/
+                float2_X q_min, q_max, q_step;
+                //! Number of scattering vectors
+                DataSpace<DIM2> numVectors;
+
+                uint32_t totalSimulationCells;
+
+                // Needed to handle the parallelization over multiple hosts.
+                bool isMaster;
+                uint32_t mpiRank;
+                //! Total number of nodes
+                uint32_t countRanks;
+                //! Number of Times the distributed output was passed along
+                uint32_t accumulatedRotations;
+                mpi::MPIReduce reduce;
+
+                //! Output writer
+                std::unique_ptr<XrayScatteringWriter<float_X>> dataWriter;
+
+
+            public:
+                //! XrayScattering object initializer: sets the plugin name/prefix and registers the plugin.
+                XrayScattering()
+                    // Initializers are listed in member declaration order, which is the order
+                    // in which they are executed. cellDescription is bodged so it passes the
+                    // verification at MappingDescription.hpp:79
+                    : cellDescription(DataSpace<simDim>(SuperCellSize::toRT()))
+                    , currentStep(0)
+                    , speciesName(T_ParticlesType::FrameType::getName())
+                    , pluginName("xrayScattering: Calculate the SAXS scattering intensity of a "
+                                 "species.")
+                    , pluginPrefix(speciesName + std::string("_xrayScattering"))
+                    , isMaster(false)
+                    , accumulatedRotations(0)
+                {
+                    Environment<>::get().PluginConnector().registerPlugin(this);
+                }
+
+                //! XrayScattering object destructor.
+                //! No manual cleanup is performed; the members are destroyed by their
+                //! own destructors.
+                ~XrayScattering() override = default;
+
+
+                //! Adds the plugin's command line options (prefixed with pluginPrefix) and their descriptions to desc.
+                void pluginRegisterHelp(po::options_description& desc) override
+                {
+                    desc.add_options()(
+                        (pluginPrefix + ".period").c_str(),
+                        po::value<std::string>(&notifyPeriod),
+                        "enable plugin [for each n-th step]")(
+                        (pluginPrefix + ".outputPeriod").c_str(),
+                        po::value<std::string>(&outputPeriod_s)->default_value("1"),
+                        "dump amplitude [for each n-th step]")(
+                        (pluginPrefix + ".qx_max").c_str(),
+                        po::value<float_X>(&q_max[0])->default_value(5),
+                        "reciprocal space range qx_max (A^-1)")(
+                        (pluginPrefix + ".qy_max").c_str(),
+                        po::value<float_X>(&q_max[1])->default_value(5),
+                        "reciprocal space range qy_max (A^-1)")(
+                        (pluginPrefix + ".qx_min").c_str(),
+                        po::value<float_X>(&q_min[0])->default_value(-5),
+                        "reciprocal space range qx_min (A^-1)")(
+                        (pluginPrefix + ".qy_min").c_str(),
+                        po::value<float_X>(&q_min[1])->default_value(-5),
+                        "reciprocal space range qy_min (A^-1)")(
+                        (pluginPrefix + ".n_qx").c_str(),
+                        po::value<int>(&numVectors[0])->default_value(100),
+                        "number of qx")(
+                        (pluginPrefix + ".n_qy").c_str(),
+                        po::value<int>(&numVectors[1])->default_value(100),
+                        "number of qy")(
+                        (pluginPrefix + ".file").c_str(),
+                        po::value<std::string>(&fileName)->default_value(pluginName + "Output"),
+                        "output file name")(
+                        (pluginPrefix + ".ext").c_str(),
+                        po::value<std::string>(&fileExtension)->default_value("bp"),
+                        "openPMD filename extension (this controls the backend "
+                        "picked by the openPMD API)")(
+                        (pluginPrefix + ".compression").c_str(),
+                        po::value<std::string>(&compressionMethod)->default_value(""),
+                        "Backend-specific openPMD compression method, e.g., zlib "
+                        "(see `adios_config -m` for help)")(
+                        (pluginPrefix + ".memoryLayout").c_str(),
+                        po::value<std::string>(&memoryLayout)->default_value("mirror"),
+                        "Possible values: 'mirror' and 'distribute'. "
+                        "Output can be mirrored on all Host+Device pairs or"
+                        " uniformly distributed over all nodes. Distribute can be used "
+                        "when the output array is too big to store the complete "
+                        "computed q-space on one device.");
+                }
+
+
+                //! @return The descriptive plugin name stored in pluginName.
+                std::string pluginGetName() const override
+                {
+                    return pluginName;
+                }
+
+
+                //! Stores a copy of the mapping description cellDescriptionLoc points to (must be a valid pointer).
+                void setMappingDescription(MappingDesc* cellDescriptionLoc) override
+                {
+                    cellDescription = *cellDescriptionLoc;
+                }
+
+
+                void restart(uint32_t timeStep, const std::string restartDirectory) override
+                {
+                    // TODO: Support for restarting. Until then the arguments are ignored and a notice is logged.
+                    log<picLog::INPUT_OUTPUT>("XrayScattering : restart not "
+                                              "yet implemented - start with zero values");
+                }
+
+
+                void checkpoint(uint32_t timeStep, const std::string restartDirectory) override
+                {
+                    // TODO: Support for checkpointing. Until then the arguments are ignored,
+                    // nothing is written and only a notice is logged.
+                    log<picLog::INPUT_OUTPUT>("XrayScattering : checkpoint not "
+                                              "yet implemented - nothing was saved");
+                }
+
+
+            private:
+                //! Prepare the plugin in the simulation initialization phase (does nothing if no period was given).
+                void pluginLoad() override
+                {
+                    if(!notifyPeriod.empty())
+                    {
+                        /* The beam has to be initialized here rather than in the constructor:
+                         *   The value retrieved by getDomainSize in
+                         *   CoordinateTransform.hpp is still set to (0,0,0) when the
+                         *   XrayScattering object is initialized.
+                         */
+                        probingBeam = std::make_unique<beam::XrayScatteringBeam>();
+                        // Set the steps at which the xrayScattering amplitude is
+                        // calculated.
+                        Environment<>::get().PluginConnector().setNotificationPeriod(this, notifyPeriod);
+                        // Translate the memoryLayout option string into the layout enum.
+                        std::map<std::string, OutputMemoryLayout> layoutMap;
+                        layoutMap["mirror"] = OutputMemoryLayout::Mirror;
+                        layoutMap["distribute"] = OutputMemoryLayout::Distribute;
+                        outputLayout = layoutMap.at(memoryLayout); // std::map::at throws for any other value
+
+                        GridController<simDim>& gc = Environment<simDim>::get().GridController();
+                        mpiRank = gc.getGlobalRank();
+                        isMaster = (mpiRank == 0);
+
+                        // Prepare amplitude buffer:
+                        uint32_t bufferSize;
+                        auto totalNumVectors = numVectors.productOfComponents();
+                        if(outputLayout == OutputMemoryLayout::Mirror)
+                        {
+                            // All vectors are stored on every node.
+                            bufferSize = totalNumVectors;
+                            // Allocate the additional amplitude storage for the reduce
+                            // operation and fill it with zeros.
+                            amplitudeMaster.assign(totalNumVectors, complex_X(0.0));
+                        }
+                        else
+                        {
+                            countRanks = gc.getGpuNodes().productOfComponents();
+                            // Number of scattering vectors in all but last chunk.
+                            // (ceil integer division)
+                            bufferSize = totalNumVectors / countRanks + ((totalNumVectors % countRanks) != 0);
+                            // Vectors on the last chunk. NOTE(review): unsigned underflow possible for few vectors.
+                            resOfVectors = bufferSize - (bufferSize * countRanks - totalNumVectors);
+                            // Allocate the additional amplitude storage for receiving
+                            // data and fill it with zeros.
+                            amplitudeReceive.assign(bufferSize, complex_X(0.0));
+                        }
+                        // Allocate amplitude buffer.
+                        amplitude = std::make_unique<ComplexBuffer>(DataSpace<DIM1>(bufferSize));
+                        // Initialize, on device, its fields with zero.
+                        amplitude->getDeviceBuffer().setValue(0.0);
+
+                        // Go to PIC unit system (the q range options are given in A^-1).
+                        constexpr float_X invMeterToInvAngstrom = 1.0e10;
+                        q_min = q_min * invMeterToInvAngstrom * UNIT_LENGTH;
+                        q_max = q_max * invMeterToInvAngstrom * UNIT_LENGTH;
+                        // Set the q-space grid spacing.
+                        q_step = (q_max - q_min) / precisionCast<float_X>(numVectors);
+
+                        // Rank 0 creates the output directory.
+                        pmacc::Filesystem<simDim>& fs = Environment<simDim>::get().Filesystem();
+                        if(isMaster)
+                        {
+                            fs.createDirectory("xrayScatteringOutput");
+                            fs.setDirectoryPermissions("xrayScatteringOutput");
+                        }
+
+                        // Choose the solver for populating a TmpField with the electron
+                        // density (either the species density or the bound electron
+                        // density).
+                        using ElectronDensitySolver = typename DetermineElectronDensitySolver<T_ParticlesType>::type;
+                        // Output unit: solver unit times the SI cell extents and ELECTRON_RADIUS_SI.
+                        const float_64 amplitudeUnit
+                            = static_cast<float_64>(FieldTmp::getUnit<ElectronDensitySolver>()[0]) * CELL_WIDTH_SI
+                            * CELL_HEIGHT_SI * CELL_DEPTH_SI * ELECTRON_RADIUS_SI;
+
+                        // Set the total number of cells in the simulation.
+                        totalSimulationCells
+                            = Environment<simDim>::get().SubGrid().getGlobalDomain().size.productOfComponents();
+
+                        // Initialize an object responsible for output writing.
+                        dataWriter = std::make_unique<XrayScatteringWriter<float_X>>(
+                            pluginPrefix + "Output",
+                            fileExtension,
+                            "xrayScatteringOutput",
+                            outputLayout,
+                            compressionMethod,
+                            precisionCast<uint64_t>(numVectors),
+                            q_step,
+                            amplitudeUnit,
+                            totalSimulationCells);
+                        // Set the output period.
+                        outputPeriod = pluginSystem::toTimeSlice(outputPeriod_s);
+                    }
+                }
+
+                //! Nothing to do when the plugin is unloaded.
+                void pluginUnload() override
+                {
+                }
+
+
+                //! Copies the amplitude to the host and reduces it (Add) into amplitudeMaster on the master node.
+                void collectIntensityOnMaster()
+                {
+                    amplitude->deviceToHost();
+                    __getTransactionEvent().waitForFinished();
+                    auto&& localAmplitude = amplitude->getHostBuffer();
+                    reduce(
+                        nvidia::functors::Add(),
+                        amplitudeMaster.data(),
+                        localAmplitude.getBasePointer(),
+                        localAmplitude.getCurrentSize(),
+                        mpi::reduceMethods::Reduce());
+                }
+
+
+                //! Calculates the offset to the currently processed output chunk.
+                HINLINE uint32_t calcOffset(uint32_t const& step) const
+                {
+                    /* Chunks move with every "rotation" from left to the right (from
+                     * a smaller to a higher rank). So after one rotation the rank n
+                     * holds chunk n-1 (counted from 0), i.e.
+                     *     chunk = (rank - rotations) % countRanks
+                     * To keep the unsigned arithmetic from going negative, countRanks
+                     * is added first and only rotations % countRanks is subtracted.
+                     */
+                    uint32_t const rotations = (accumulatedRotations + step) % countRanks;
+                    uint32_t const chunkIdx = (mpiRank + countRanks - rotations) % countRanks;
+                    // Offset in elements: chunk index times the host buffer's current size.
+                    auto const chunkSize = amplitude->getHostBuffer().getCurrentSize();
+                    return chunkIdx * chunkSize;
+                }
+
+
+                //! Checks if this node has the last output part.
+                HINLINE bool hasLastChunk(uint32_t const& step) const
+                {
+                    uint32_t const lastChunkOwner = (countRanks - 1 + accumulatedRotations + step) % countRanks;
+                    return mpiRank == lastChunkOwner;
+                }
+
+
+                //! Writes amplitude data to disk.
+                HINLINE void writeOutput()
+                {
+                    if(outputLayout != OutputMemoryLayout::Distribute)
+                    {
+                        // Mirror mode: add up the contributions of all ranks; only the
+                        // master rank hands the result to the writer.
+                        collectIntensityOnMaster();
+                        if(isMaster)
+                        {
+                            realPart = extractReal(amplitudeMaster);
+                            imgPart = extractImag(amplitudeMaster);
+                            (*dataWriter)(currentStep, realPart, imgPart);
+                        }
+                        // reset amplitudes back to zero
+                        amplitudeMaster.assign(amplitudeMaster.size(), complex_X(0.0));
+                        return;
+                    }
+
+                    // Distribute mode: pass the chunk held by this rank to the writer.
+                    amplitude->deviceToHost();
+                    __getTransactionEvent().waitForFinished();
+                    auto&& hostAmplitude = amplitude->getHostBuffer();
+                    realPart = extractReal(hostAmplitude);
+                    imgPart = extractImag(hostAmplitude);
+
+                    uint32_t const lastStep = countRanks - 1;
+                    uint64_t offset = precisionCast<uint64_t>(calcOffset(lastStep));
+                    uint64_t extent = hasLastChunk(lastStep) ? resOfVectors : hostAmplitude.getCurrentSize();
+                    (*dataWriter)(currentStep, extent, offset, realPart, imgPart);
+                }
+
+
+                /** Passes output chunks from one device to another (ring: send to rank + 1, receive from rank - 1).
+                 *
+                 * @param step Current step in the loop over kernel runs, in the current
+                 *     simulation step. Step 0 is a no-op.
+                 */
+                HINLINE void communicationOnStep(uint32_t const& step)
+                {
+                    using namespace mpi;
+                    // No action is necessary on the first step.
+                    if(step == 0u)
+                        return;
+                    // Copy the data calculated on the GPU in the previous step to CPU memory.
+                    amplitude->deviceToHost();
+                    // Avoid deadlock between not finished pmacc tasks and mpi blocking
+                    // collectives.
+                    __getTransactionEvent().waitForFinished();
+                    // MPI asynchronous send & receive, message size in bytes:
+                    int bytesToSend = sizeof(complex_X) / sizeof(char);
+                    bytesToSend *= amplitude->getHostBuffer().getCurrentSize();
+
+                    // An mpi request to monitor a non blocking send transaction.
+                    GridController<simDim>& gc = Environment<simDim>::get().GridController();
+                    MPI_Request transactionRequest;
+                    // Pass data to the next node.
+                    MPI_CHECK(MPI_Isend(
+                        amplitude->getHostBuffer().getBasePointer(),
+                        bytesToSend,
+                        MPI_BYTE,
+                        (mpiRank + 1) % countRanks,
+                        0,
+                        gc.getCommunicator().getMPIComm(),
+                        &transactionRequest));
+                    // Receive from the preceding node (blocking transaction).
+                    const int receiveFrom = (mpiRank == 0u) ? countRanks - 1 : mpiRank - 1;
+                    MPI_CHECK(MPI_Recv(
+                        amplitudeReceive.data(),
+                        bytesToSend,
+                        MPI_BYTE,
+                        receiveFrom,
+                        0,
+                        gc.getCommunicator().getMPIComm(),
+                        MPI_STATUS_IGNORE));
+
+                    // Wait for the send transaction to end (checked like the other MPI calls).
+                    MPI_CHECK(MPI_Wait(&transactionRequest, MPI_STATUS_IGNORE));
+                    // Copy the received data to the host buffer.
+                    copyVectorToBuffer(amplitudeReceive, amplitude->getHostBuffer());
+                    // Copy the received data to the device so it can be used as
+                    // output in this step.
+                    amplitude->hostToDevice();
+                }
+
+
+                /** Fills a temporary field with the density computed by the chosen electron density solver.
+                 *
+                 * @param dc data connector
+                 * @param globalOffset offset from the global to the local domain (not used here)
+                 * @return device data box of the temporary field holding the calculated data.
+                 */
+                HINLINE FieldTmp::DataBoxType calculateDensity(DataConnector& dc, DataSpace<simDim>& globalOffset)
+                {
+                    // At least one FieldTmp slot is required as scratch storage.
+                    PMACC_CASSERT_MSG(_please_allocate_at_least_one_FieldTmp_in_memory_param, fieldTmpNumSlots > 0);
+                    // Solver used to populate the temporary field, selected by
+                    // DetermineElectronDensitySolver for this species.
+                    using DensitySolver = typename DetermineElectronDensitySolver<T_ParticlesType>::type;
+
+                    // Fetch the temporary field of slot 0 and zero it on the device.
+                    auto densityField = dc.get<FieldTmp>(FieldTmp::getUniqueId(0), true);
+                    auto&& deviceField = densityField->getGridBuffer().getDeviceBuffer();
+                    deviceField.setValue(FieldTmp::ValueType::create(0.0));
+
+                    // Run the solver on the species, then release the particle data.
+                    auto scatterers = dc.get<T_ParticlesType>(T_ParticlesType::FrameType::getName(), true);
+                    densityField->template computeValue<CORE + BORDER, DensitySolver>(*scatterers, currentStep);
+                    dc.releaseData(T_ParticlesType::FrameType::getName());
+
+                    // Hand out the data box of the device buffer that now holds
+                    // the result.
+                    return deviceField.getDataBox();
+                }
+
+
+                /** Launches the scattering kernel once per output part (distributed output).
+                 *
+                 * A single kernel run adds its result only to the output part that
+                 * currently resides on the node. Before every run the output parts
+                 * are passed along via communicationOnStep(); the loop performs
+                 * countRanks such passes.
+                 *
+                 * @param cellsGrid field grid, without GUARD, on one device
+                 * @param fieldTmpNoGuard field data
+                 * @param globalOffset offset from the global to the local domain
+                 * @param numBlocks number of virtual blocks used in a kernel run
+                 * @param fieldPos TmpField in cell position
+                 */
+                template<typename T_FieldPos>
+                HINLINE void runKernelInDistributeMode(
+                    DataSpace<simDim>& cellsGrid,
+                    FieldTmp::DataBoxType const& fieldTmpNoGuard,
+                    DataSpace<simDim>& globalOffset,
+                    uint32_t const& numBlocks,
+                    T_FieldPos const& fieldPos)
+                {
+                    // Virtual workers per block, known at compile time.
+                    constexpr uint32_t numWorkers
+                        = pmacc::traits::GetNumWorkers<pmacc::math::CT::volume<SuperCellSize>::type::value>::value;
+
+                    // One kernel run per pass.
+                    for(uint32_t pass = 0; pass < countRanks; ++pass)
+                    {
+                        // Pass along the data.
+                        communicationOnStep(pass);
+
+                        // 1D offset to the begin of the currently processed output
+                        // part.
+                        uint32_t const partOffset = calcOffset(pass);
+
+                        // Define scattering vectors for the output part.
+                        GetScatteringVector scatteringVectors{q_min, q_max, q_step, numVectors, partOffset};
+
+                        // The last output part possibly holds fewer vectors; every
+                        // other part uses the current size of the host buffer.
+                        uint32_t const countVectors = hasLastChunk(pass)
+                            ? static_cast<uint32_t>(resOfVectors)
+                            : static_cast<uint32_t>(amplitude->getHostBuffer().getCurrentSize());
+
+                        // Start the kernel.
+                        PMACC_KERNEL(KernelXrayScattering<numWorkers>{})
+                        (numBlocks, numWorkers)(
+                            cellsGrid,
+                            fieldTmpNoGuard,
+                            globalOffset,
+                            fieldPos,
+                            amplitude->getDeviceBuffer().getDataBox(),
+                            countVectors,
+                            scatteringVectors,
+                            *probingBeam,
+                            currentStep,
+                            totalSimulationCells);
+                    }
+                }
+
+                /** Runs xrayScattering kernel when the output is mirrored over nodes.
+                 *
+                 * A single kernel run per simulation time step computes the complete output.
+                 *
+                 * @param cellsGrid field grid, without GUARD, on one device
+                 * @param fieldTmpNoGuard field data
+                 * @param globalOffset offset from the global to the local domain
+                 * @param numBlocks number of virtual blocks used in a kernel run
+                 * @param fieldPos TmpField in cell position
+                 */
+                template<typename T_FieldPos>
+                HINLINE void runKernelInMirrorMode(
+                    DataSpace<simDim>& cellsGrid,
+                    FieldTmp::DataBoxType const& fieldTmpNoGuard,
+                    DataSpace<simDim>& globalOffset,
+                    uint32_t const& numBlocks,
+                    T_FieldPos const& fieldPos)
+                {
+                    // Get the available number of virtual workers.
+                    constexpr uint32_t numWorkers
+                        = pmacc::traits::GetNumWorkers<pmacc::math::CT::volume<SuperCellSize>::type::value>::value;
+                    // The scattering vectors start at the beginning of the output (offset 0).
+                    GetScatteringVector allVectors{q_min, q_max, q_step, numVectors, 0};
+                    // The complete output is processed, so the full buffer size is used.
+                    auto const totalVectors = amplitude->getHostBuffer().getCurrentSize();
+                    PMACC_KERNEL(KernelXrayScattering<numWorkers>{})
+                    (numBlocks, numWorkers)(
+                        cellsGrid,
+                        fieldTmpNoGuard,
+                        globalOffset,
+                        fieldPos,
+                        amplitude->getDeviceBuffer().getDataBox(),
+                        totalVectors,
+                        allVectors,
+                        *probingBeam,
+                        currentStep,
+                        totalSimulationCells);
+                }
+
+
+                /** Actions performed on every step included in the notify period.
+                 *
+                 * Computes the form factor density, runs the kernel in the mode
+                 * selected by outputLayout and, for steps in the output period,
+                 * writes the amplitude to disk.
+                 *
+                 * @param currentStep current simulation step
+                 **/
+                HINLINE void notify(uint32_t currentStep) override
+                {
+                    this->currentStep = currentStep;
+                    // Get the available number of virtual workers per block.
+                    constexpr uint32_t numWorkers
+                        = pmacc::traits::GetNumWorkers<pmacc::math::CT::volume<SuperCellSize>::type::value>::value;
+                    // Form factor density:
+                    // Get the offset to the local domain (this HOST + DEVICE pair).
+                    auto const& localGrid = Environment<simDim>::get().SubGrid();
+                    DataSpace<simDim> globalOffset(localGrid.getLocalDomain().offset);
+                    // Calculate the density and get a data box to access this TmpField.
+                    DataConnector& dc = Environment<>::get().DataConnector();
+                    FieldTmp::DataBoxType densityBox = calculateDensity(dc, globalOffset);
+                    // Get the in cell position of a TmpField. Could probably be removed
+                    // as it is the cell origin in all cell types.
+                    const picongpu::traits::FieldPosition<typename fields::CellType, FieldTmp> fieldPos;
+                    // Shift the density box to exclude the GUARD.
+                    DataSpace<simDim> guardSuperCells = cellDescription.getGuardingSuperCells();
+                    auto const fieldTmpNoGuard = densityBox.shift(guardSuperCells * SuperCellSize::toRT());
+                    // Get the field size on this rank (no GUARD).
+                    DataSpace<simDim> cellsGrid
+                        = (cellDescription.getGridSuperCells() - 2 * guardSuperCells) * SuperCellSize::toRT();
+                    // Number of virtual blocks: the local cells must split evenly over the workers.
+                    uint32_t const localCellCount = cellsGrid.productOfComponents();
+                    PMACC_ASSERT(localCellCount % numWorkers == 0);
+                    uint32_t const numBlocks = localCellCount / numWorkers;
+
+                    // Run Kernel.
+                    if(outputLayout != OutputMemoryLayout::Distribute)
+                    {
+                        runKernelInMirrorMode(cellsGrid, fieldTmpNoGuard, globalOffset, numBlocks, fieldPos);
+                    }
+                    else
+                    {
+                        runKernelInDistributeMode(cellsGrid, fieldTmpNoGuard, globalOffset, numBlocks, fieldPos);
+                    }
+                    // Release density data.
+                    dc.releaseData(FieldTmp::getUniqueId(0));
+                    // Write to disk.
+                    if(pluginSystem::containsStep(outputPeriod, currentStep))
+                    {
+                        writeOutput();
+                    }
+                    // Update the total number of rotations ( data passes ).
+                    if(outputLayout == OutputMemoryLayout::Distribute)
+                    {
+                        accumulatedRotations += countRanks - 1;
+                    }
+                }
+            };
+        } // namespace xrayScattering
+    } // namespace plugins
+    namespace particles
+    {
+        namespace traits
+        {
+            //! Eligibility trait: a species can be used with the XrayScattering plugin
+            //! if its frame provides the position and weighting identifiers.
+            template<typename T_Species, typename T_UnspecifiedSpecies>
+            struct SpeciesEligibleForSolver<T_Species, plugins::xrayScattering::XrayScattering<T_UnspecifiedSpecies>>
+            {
+                using FrameType = typename T_Species::FrameType;
+                // This plugin needs at least the position and weighting.
+                using RequiredIdentifiers = MakeSeq_t<position<>, weighting>;
+
+                // Result of the identifier check, exposed under both alias names.
+                using type = typename pmacc::traits::HasIdentifiers<FrameType, RequiredIdentifiers>::type;
+                using SpeciesHasIdentifiers = type;
+            };
+
+        } // namespace traits
+    } // namespace particles
+} // namespace picongpu
diff --git a/include/picongpu/plugins/xrayScattering/XrayScattering.kernel b/include/picongpu/plugins/xrayScattering/XrayScattering.kernel
new file mode 100644
index 0000000000..e28212a8c8
--- /dev/null
+++ b/include/picongpu/plugins/xrayScattering/XrayScattering.kernel
@@ -0,0 +1,163 @@
+/* Copyright 2020-2021 Pawel Ordyna
+ *
+ * This file is part of PIConGPU.
+ *
+ * PIConGPU is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * PIConGPU is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with PIConGPU.
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "picongpu/simulation_defines.hpp"
+#include "picongpu/plugins/xrayScattering/XrayScattering.hpp"
+
+#include <pmacc/mappings/kernel/AreaMapping.hpp>
+#include <pmacc/dimensions/DataSpaceOperations.hpp>
+#include <pmacc/mappings/threads/ForEachIdx.hpp>
+#include <pmacc/mappings/threads/IdxConfig.hpp>
+
+#include <cstdio>
+
+
+namespace picongpu
+{
+    namespace plugins
+    {
+        namespace xrayScattering
+        {
+            /** Kernel for xrayScattering calculation.
+             *
+             * @tparam T_numWorkers Number of virtual workers on block.
+             */
+            template<uint32_t T_numWorkers>
+            struct KernelXrayScattering
+            {
+                /** Kernel function.
+                 *
+                 * @param acc alpaka accelerator
+                 * @param cellsGrid Dimensions of BORDER + CORE in cells, not super
+                 *      cells.
+                 * @param densityBoxGPU Data box of the density device storage, shifted
+                 *      to exclude 1st GUARD.
+                 * @param globalOffset Offset from the global to the local domain.
+                 * @param fieldPos TmpField in cell position.
+                 * @param amplitudeBox Device side data box of the output Buffer.
+                 * @param totalNumVectors Number of scattering vectors to process.
+                 * @param scatteringVectors Scattering vectors to process.
+                 * @param probingBeam Probing beam characterization.
+                 * @param currentStep Current simulation step.
+                 * @param totalSimulationCells Divisor applied to every stored density value.
+                 */
+                template<
+                    typename T_Acc,
+                    typename T_DensityBoxGPU,
+                    typename T_FieldPos,
+                    typename T_DBox,
+                    typename T_TotalNumVectors,
+                    typename T_ScatteringVectors,
+                    typename T_ProbingBeam>
+                DINLINE void operator()(
+                    T_Acc const& acc,
+                    DataSpace<simDim> cellsGrid,
+                    T_DensityBoxGPU densityBoxGPU,
+                    DataSpace<simDim> globalOffset,
+                    T_FieldPos fieldPos,
+                    T_DBox amplitudeBox,
+                    T_TotalNumVectors const totalNumVectors,
+                    T_ScatteringVectors scatteringVectors,
+                    T_ProbingBeam probingBeam,
+                    uint32_t currentStep,
+                    uint32_t totalSimulationCells) const
+                {
+                    constexpr uint32_t blockSize
+                        = pmacc::traits::GetNumWorkers<pmacc::math::CT::volume<SuperCellSize>::type::value>::value;
+                    constexpr uint32_t numWorkers = T_numWorkers;
+                    uint32_t const workerIdx = cupla::threadIdx(acc).x;
+                    uint32_t const blockIdxLin = cupla::blockIdx(acc).x;
+
+                    using complex_X = pmacc::math::Complex<float_X>;
+                    using namespace pmacc::mappings::threads;
+                    using namespace pmacc;
+                    // Storage for positions in the beam coordinate system.
+                    PMACC_SMEM(acc, positions, memory::Array<float2_X, blockSize>);
+                    // Storage for (form factor density * beam intensity factor).
+                    PMACC_SMEM(acc, densities, memory::Array<float_X, blockSize>);
+
+                    uint32_t const linAccessBlockBegin = blockIdxLin * blockSize;
+
+                    ForEachIdx<IdxConfig<blockSize, numWorkers>>{workerIdx}(
+                        [&](uint32_t const linearIdx, uint32_t const) {
+                            // Each thread reads one field value and saves it together
+                            // with its position, in UNIT_LENGTH, in the global domain.
+                            DataSpace<simDim> const cellPosition(
+                                DataSpaceOperations<simDim>::map(cellsGrid, linearIdx + linAccessBlockBegin));
+
+                            DataSpace<simDim> const cellGlobalPosition(cellPosition + globalOffset);
+                            floatD_X fieldGlobalPosition = precisionCast<float_X>(cellGlobalPosition) + fieldPos()[0];
+                            fieldGlobalPosition *= cellSize.shrink<simDim>();
+
+                            float_X density = densityBoxGPU(cellPosition)[0];
+                            // Save the cell position in the beam coordinate system.
+                            float3_X position_b = probingBeam.coordinateTransform(currentStep, fieldGlobalPosition);
+                            float_X beamFactor = probingBeam(position_b);
+                            // Store the cell value of the function that has to be
+                            // Fourier transformed.
+                            densities[linearIdx] = density * beamFactor / totalSimulationCells;
+                            // Store position in the beam comoving system
+                            // 3rd component is not needed anymore since q_z = 0.
+                            positions[linearIdx] = position_b.shrink<DIM2>();
+                        } // end lambda function body
+                    );
+
+                    // Wait for all threads on the block to finish. The barrier sits outside
+                    // the per-index functor so that every worker reaches it exactly once.
+                    cupla::__syncthreads(acc);
+
+                    // Calculate the density fourier transform:
+                    // Loop over q-vectors in frequency space:
+                    // Each worker process every numWorkers vector.
+                    for(uint32_t qLoopIdx = workerIdx; qLoopIdx < totalNumVectors; qLoopIdx += numWorkers)
+                    {
+                        float2_X q = scatteringVectors[qLoopIdx];
+                        complex_X amplitude(0.0);
+                        // Loop over all previously loaded cells:
+                        // This is a volume integral over the local domain
+                        // in the beam coordinate system.
+                        for(uint32_t rLoopIdx = 0; rLoopIdx < blockSize; rLoopIdx++)
+                        {
+                            // Accumulate density * exp(-i * (q . r)) for this cell.
+                            float_X density = densities[rLoopIdx];
+                            float2_X position = positions[rLoopIdx];
+                            float_X dotqr = pmacc::math::dot(position, q);
+                            dotqr *= -1.0;
+                            amplitude += pmacc::math::euler(density, dotqr);
+                        } // end loop over positions
+                        // Add the super cell contribution to the output.
+                        // Avoid racing conditions between blocks.
+                        cupla::atomicAdd(
+                            acc,
+                            &(amplitudeBox[qLoopIdx].get_real()),
+                            amplitude.get_real(),
+                            ::alpaka::hierarchy::Blocks{});
+                        cupla::atomicAdd(
+                            acc,
+                            &(amplitudeBox[qLoopIdx].get_imag()),
+                            amplitude.get_imag(),
+                            ::alpaka::hierarchy::Blocks{});
+                    } // end loop over scattering directions
+                }
+            };
+        } // namespace xrayScattering
+    } // namespace plugins
+} // namespace picongpu
diff --git a/include/picongpu/plugins/xrayScattering/XrayScatteringWriter.hpp b/include/picongpu/plugins/xrayScattering/XrayScatteringWriter.hpp
new file mode 100644
index 0000000000..f7f6573423
--- /dev/null
+++ b/include/picongpu/plugins/xrayScattering/XrayScatteringWriter.hpp
@@ -0,0 +1,396 @@
+/* Copyright 2020-2021 Pawel Ordyna
+ *
+ * This file is part of PIConGPU.
+ *
+ * PIConGPU is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * PIConGPU is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with PIConGPU.
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "picongpu/simulation_defines.hpp"
+#include <pmacc/assert.hpp>
+#include <pmacc/dataManagement/DataConnector.hpp>
+#include <pmacc/math/Vector.hpp>
+#include <pmacc/static_assert.hpp>
+#include <pmacc/mappings/simulation/GridController.hpp>
+
+#include <openPMD/openPMD.hpp>
+
+#include <vector>
+#include <cstdint>
+
+namespace picongpu
+{
+    namespace plugins
+    {
+        namespace xrayScattering
+        {
+            //! Specifies plugin functioning mode. Mirrored or chunked output possible.
+            enum class OutputMemoryLayout
+            {
+                Mirror, //!< Mirrored output; the writer opens the openPMD Series for a serial write.
+                Distribute //!< Chunked output; the writer opens the Series with an MPI communicator.
+            };
+
+
+            //! Specifies complex number component.
+            enum class Component
+            {
+                Real, //!< Real part; prepareMRC() stores it as mesh record component "x".
+                Imag //!< Imaginary part; prepareMRC() stores it as mesh record component "y".
+            };
+
+
+            //! Splits a linear index (last axis fastest) into a 2D {x, y} position vector.
+            HINLINE std::vector<uint64_t> map2d(pmacc::math::Vector<uint64_t, DIM2> const& size, uint64_t pos)
+            {
+                uint64_t const rowLength = size.y();
+                // pos == x * rowLength + y
+                return std::vector<uint64_t>{pos / rowLength, pos % rowLength};
+            }
+
+
+            //! Copies the components of a pmacc Vector, in index order, into a std::vector.
+            template<unsigned DIM, typename T>
+            HINLINE std::vector<T> asStandardVector(pmacc::math::Vector<T, DIM> const& vec)
+            {
+                std::vector<T> components;
+                components.reserve(DIM);
+                for(unsigned idx = 0; idx != DIM; ++idx)
+                {
+                    components.emplace_back(vec[idx]);
+                }
+                return components;
+            }
+
+
+            /** Output writer for the xrayScattering plugin.
+             *
+             * Handles either a serial, in the mirrored output mode, or a parallel, in
+             * the distributed (chunked) mode, data writing. Data is saved in the
+             * openPMD standard using the openPMD API.
+             * @tparam T_ValueType Type of the values stored in the output.
+             */
+            template<typename T_ValueType>
+            struct XrayScatteringWriter
+            {
+            private:
+                //! A pointer to an openPMD API Series object
+                std::unique_ptr<::openPMD::Series> openPMDSeries;
+                //! MPI Communicator for the parallel data write
+                MPI_Comm mpiCommunicator;
+                std::string const fileName, fileExtension, dir;
+                std::string const compressionMethod;
+                //! Functioning mode
+                OutputMemoryLayout outputMemoryLayout;
+                //! Output dimensions
+                pmacc::math::UInt64<DIM2> const globalExtent;
+                //! OpenPMD type specifier for the ValueType
+                ::openPMD::Datatype datatype;
+                //! Output SI unit
+                const float_64 unit;
+                //! GridSpacing
+                float2_X const gridSpacing;
+
+
+            public:
+                /** Initializes a XrayScatteringWriter object and creates the output file.
+                 * @param fileName Output file name, without the extension.
+                 * @param fileExtension File extension, specifies the API backend.
+                 * @param dir Where to save the output file.
+                 * @param outputMemoryLayout Functioning mode.
+                 * @param compressionMethod Compression (ADIOS1: transform) set on the datasets.
+                 * @param globalExtent Output dimensions.
+                 * @param gridSpacing Grid spacing set on the amplitude mesh.
+                 * @param unit SI unit factor set on the record components.
+                 * @param totalSimulationCells Stored as a Series attribute of the same name.
+                 */
+                HINLINE XrayScatteringWriter(
+                    std::string const fileName,
+                    std::string const fileExtension,
+                    std::string const dir,
+                    OutputMemoryLayout outputMemoryLayout,
+                    std::string const compressionMethod,
+                    pmacc::math::UInt64<DIM2> const globalExtent,
+                    float2_X const gridSpacing,
+                    float_64 const unit,
+                    uint32_t const totalSimulationCells)
+                    : mpiCommunicator(MPI_COMM_NULL) // initializers follow the member declaration order
+                    , fileName(fileName)
+                    , fileExtension(fileExtension)
+                    , dir(dir)
+                    , compressionMethod(compressionMethod)
+                    , outputMemoryLayout(outputMemoryLayout)
+                    , globalExtent(globalExtent)
+                    , unit(unit)
+                    , gridSpacing(gridSpacing)
+                {
+                    if(outputMemoryLayout == OutputMemoryLayout::Distribute)
+                    {
+                        // Replace the null communicator by a duplicate of the grid communicator.
+                        GridController<simDim>& gc = Environment<simDim>::get().GridController();
+                        __getTransactionEvent().waitForFinished();
+                        MPI_CHECK(MPI_Comm_dup(gc.getCommunicator().getMPIComm(), &mpiCommunicator));
+                    }
+                    datatype = ::openPMD::determineDatatype<T_ValueType>();
+                    openSeries(::openPMD::Access::CREATE);
+                    openPMDSeries->setMeshesPath("scatteringData");
+                    openPMDSeries->setAttribute("totalSimulationCells", totalSimulationCells);
+                    closeSeries();
+                }
+
+                //! Frees the MPI communicator that was duplicated for the Distribute mode.
+                virtual ~XrayScatteringWriter()
+                {
+                    // Short-circuit: the communicator is only inspected in Distribute mode.
+                    bool const hasCommunicator
+                        = outputMemoryLayout == OutputMemoryLayout::Distribute && mpiCommunicator != MPI_COMM_NULL;
+                    if(!hasCommunicator)
+                        return;
+                    // avoid deadlock between not finished pmacc tasks and mpi
+                    // blocking collectives
+                    __getTransactionEvent().waitForFinished();
+                    MPI_CHECK_NO_EXCEPT(MPI_Comm_free(&mpiCommunicator));
+                }
+
+            private:
+                HINLINE bool isADIOS1() const
+                {
+#if !openPMD_HAVE_ADIOS1 || openPMD_HAVE_ADIOS2
+                    return false; // no ADIOS1 backend, or ADIOS2 is available as well
+#else
+                    return fileExtension == "bp"; // ADIOS1 is the only ADIOS backend
+#endif
+                }
+
+                /** Opens an openPMD Series in a given access mode.
+                 *
+                 * @param at OpenPMD API access type.
+                 * @throws std::runtime_error if a Series is still open.
+                 */
+                HINLINE void openSeries(::openPMD::Access at)
+                {
+                    // Only one Series may be active at a time.
+                    if(openPMDSeries)
+                    {
+                        throw std::runtime_error("XrayScatteringWriter: Tried opening a Series while old "
+                                                 "Series was still active.");
+                    }
+
+                    std::string const seriesPath = dir + '/' + fileName + "." + fileExtension;
+                    log<picLog::INPUT_OUTPUT>("XrayScatteringWriter: Opening file: %1%") % seriesPath;
+
+                    if(outputMemoryLayout == OutputMemoryLayout::Distribute)
+                    {
+                        // Open a series for a parallel write.
+                        openPMDSeries = std::make_unique<::openPMD::Series>(seriesPath, at, mpiCommunicator);
+                    }
+                    else
+                    {
+                        // Open a series for a serial write.
+                        openPMDSeries = std::make_unique<::openPMD::Series>(seriesPath, at);
+                    }
+
+                    log<picLog::INPUT_OUTPUT>("XrayScatteringWriter: Successfully opened file: %1%") % seriesPath;
+                }
+
+                /** Closes the active openPMD Series.
+                 *
+                 * @throws std::runtime_error if no Series is open.
+                 */
+                HINLINE void closeSeries()
+                {
+                    if(!openPMDSeries)
+                    {
+                        throw std::runtime_error("XrayScatteringWriter: Tried closing a Series that was not"
+                                                 " active.");
+                    }
+                    log<picLog::INPUT_OUTPUT>("XrayScatteringWriter: Closing file: %1%") % fileName;
+                    // Release the Series object; in Distribute mode all ranks meet at a barrier afterwards.
+                    openPMDSeries.reset();
+                    if(outputMemoryLayout == OutputMemoryLayout::Distribute)
+                    {
+                        MPI_Barrier(mpiCommunicator);
+                    }
+                    log<picLog::INPUT_OUTPUT>("XrayScatteringWriter: successfully closed file: %1%") % fileName;
+                }
+
+
+                /** Prepare an openPMD mesh for the amplitude.
+                 * @param currentStep Index of the Series iteration that holds the mesh.
+                 */
+                HINLINE ::openPMD::Mesh prepareMesh(uint32_t const currentStep)
+                {
+                    auto stepIteration = openPMDSeries->iterations[currentStep];
+                    auto amplitudeMesh = stepIteration.meshes["amplitude"];
+                    // Grid description of the 2D output.
+                    amplitudeMesh.setGridSpacing(asStandardVector<DIM2>(gridSpacing));
+                    amplitudeMesh.setGridUnitSI(1e10); // 1/angstrom to 1/meter conversion
+                    amplitudeMesh.setAxisLabels(std::vector<std::string>{"q_x", "q_y"});
+                    return amplitudeMesh;
+                }
+
+
+                /** Prepares the record component for one part of the complex amplitude.
+                 *
+                 * @param component Component to write, either real or imaginary
+                 * @param mesh Mesh that holds the record component
+                 */
+                HINLINE ::openPMD::MeshRecordComponent prepareMRC(Component component, ::openPMD::Mesh& mesh)
+                {
+                    // Record component names, indexed by the Component value.
+                    const std::string name_lookup_tpl[] = {"x", "y"};
+                    ::openPMD::MeshRecordComponent recordComponent = mesh[name_lookup_tpl[static_cast<int>(component)]];
+                    // The dataset spans the global output extent.
+                    ::openPMD::Dataset dataset{datatype, asStandardVector<DIM2>(globalExtent)};
+                    if(!isADIOS1())
+                    {
+                        dataset.compression = compressionMethod;
+                    }
+                    else
+                    {
+                        dataset.transform = compressionMethod;
+                    }
+                    recordComponent.resetDataset(std::move(dataset));
+                    recordComponent.setUnitSI(unit);
+                    return recordComponent;
+                }
+
+            public:
+                /** Write complex numbers to the whole output array.
+                 *
+                 * Each component is registered as a single chunk that starts at the
+                 * origin and covers the global extent.
+                 *
+                 * @param currentStep Current simulation step.
+                 * @param realVec Vector containing the real parts of the complex
+                 *      numbers.
+                 * @param imagVec Vector containing the imaginary parts of the
+                 *      complex numbers.
+                 */
+                HINLINE void operator()(
+                    uint32_t const currentStep,
+                    std::vector<T_ValueType>& realVec,
+                    std::vector<T_ValueType>& imagVec)
+                {
+                    openSeries(::openPMD::Access::READ_WRITE);
+
+                    ::openPMD::Mesh mesh = prepareMesh(currentStep);
+                    ::openPMD::MeshRecordComponent mrc_real = prepareMRC(Component::Real, mesh);
+                    ::openPMD::MeshRecordComponent mrc_imag = prepareMRC(Component::Imag, mesh);
+
+                    ::openPMD::Offset const origin(DIM2, 0u);
+                    auto const wholeOutput = asStandardVector<DIM2>(globalExtent);
+
+                    // Register one chunk per component, then flush the Series.
+                    mrc_real.storeChunk<T_ValueType>(::openPMD::shareRaw(realVec.data()), origin, wholeOutput);
+                    mrc_imag.storeChunk<T_ValueType>(::openPMD::shareRaw(imagVec.data()), origin, wholeOutput);
+                    openPMDSeries->flush();
+
+                    // Avoid deadlock between not finished pmacc tasks and mpi calls in
+                    // openPMD.
+                    __getTransactionEvent().waitForFinished();
+                    // Close openPMD Series, most likely the actual write point.
+                    closeSeries();
+                }
+
+
+                /** Write complex numbers to a part of the output array.
+                 *
+                 * The contiguous 1D range is split into up to three 2D chunks (a partial
+                 * first row, a block of full rows and a partial last row). Empty parts
+                 * are skipped, so a range lying within a single row is handled as well.
+                 *
+                 * @param currentStep Current simulation step.
+                 * @param extent1D The length of the contiguous part of the output
+                 *      that is the write destination (1D access).
+                 * @param offset1D The linear (1D access) offset to the first datum
+                 *      in the write destination.
+                 * @param realVec Vector containing the real parts of the complex numbers.
+                 * @param imagVec Vector containing the imaginary parts of the complex numbers.
+                 */
+                HINLINE void operator()(
+                    uint32_t const currentStep,
+                    uint64_t extent1D,
+                    uint64_t offset1D,
+                    std::vector<T_ValueType>& realVec,
+                    std::vector<T_ValueType>& imagVec)
+                {
+                    openSeries(::openPMD::Access::READ_WRITE);
+
+                    // Get openPMD mesh record components for the real and imaginary parts.
+                    ::openPMD::Mesh mesh = prepareMesh(currentStep);
+                    ::openPMD::MeshRecordComponent mrc_real = prepareMRC(Component::Real, mesh);
+                    ::openPMD::MeshRecordComponent mrc_imag = prepareMRC(Component::Imag, mesh);
+
+                    // Register chunks to write:
+                    // Since extent1D and offset1D are indices used in a linear access
+                    // to the array (along the last axis, C-order), they don't always
+                    // describe a rectangle in the 2D output space, but the API requires
+                    // a 2D offset and extent. The destination is therefore split into
+                    // up to 3 parts: a partial row at the beginning, a rectangular
+                    // chunk of full rows and a partial row at the end.
+                    std::vector<uint64_t> offset(2);
+                    std::vector<uint64_t> extent(2);
+
+                    // First line: map the beginning of the output chunk.
+                    offset = map2d(globalExtent, offset1D);
+                    // The first line ends at the row end or, if the whole chunk is
+                    // shorter than the rest of the row, after extent1D elements.
+                    // Without this clamp the unsigned subtractions below would wrap.
+                    uint64_t const rowRemainder = globalExtent[1] - offset[1];
+                    uint64_t const firstLineLength = extent1D < rowRemainder ? extent1D : rowRemainder;
+                    if(firstLineLength != 0)
+                    {
+                        extent = std::vector<uint64_t>{1, firstLineLength};
+                        // Register chunks for imag and real components.
+                        mrc_real.storeChunk<T_ValueType>(::openPMD::shareRaw(&realVec[0]), offset, extent);
+                        mrc_imag.storeChunk<T_ValueType>(::openPMD::shareRaw(&imagVec[0]), offset, extent);
+                    }
+
+                    // Middle chunk: these lines have the full length.
+                    uint64_t const numFullLines = (extent1D - firstLineLength) / globalExtent[1];
+                    uint64_t localOffset = firstLineLength;
+                    // Skip an empty middle chunk: realVec[localOffset] could be past the end.
+                    if(numFullLines != 0)
+                    {
+                        extent[0] = numFullLines;
+                        extent[1] = globalExtent[1];
+                        offset = map2d(globalExtent, offset1D + localOffset);
+                        mrc_real.storeChunk<T_ValueType>(::openPMD::shareRaw(&realVec[localOffset]), offset, extent);
+                        mrc_imag.storeChunk<T_ValueType>(::openPMD::shareRaw(&imagVec[localOffset]), offset, extent);
+                    }
+
+                    // Last line: the remainder of the 1D chunk.
+                    uint64_t const lastLineLength = extent1D - firstLineLength - numFullLines * globalExtent[1];
+                    if(lastLineLength != 0)
+                    {
+                        localOffset = firstLineLength + numFullLines * globalExtent[1];
+                        offset = map2d(globalExtent, offset1D + localOffset);
+                        extent[0] = 1;
+                        extent[1] = lastLineLength;
+                        mrc_real.storeChunk<T_ValueType>(::openPMD::shareRaw(&realVec[localOffset]), offset, extent);
+                        mrc_imag.storeChunk<T_ValueType>(::openPMD::shareRaw(&imagVec[localOffset]), offset, extent);
+                    }
+                    openPMDSeries->flush();
+                    // Avoid deadlock between unfinished pmacc tasks and MPI calls in openPMD.
+                    __getTransactionEvent().waitForFinished();
+                    // Close the openPMD Series, most likely the actual write point.
+                    closeSeries();
+                }
+            };
+        } // namespace xrayScattering
+    } // namespace plugins
+} // namespace picongpu
diff --git a/include/picongpu/plugins/xrayScattering/beam/AxisSwap.hpp b/include/picongpu/plugins/xrayScattering/beam/AxisSwap.hpp
new file mode 100644
index 0000000000..d4fed2e57b
--- /dev/null
+++ b/include/picongpu/plugins/xrayScattering/beam/AxisSwap.hpp
@@ -0,0 +1,71 @@
+/* Copyright 2020-2021 Pawel Ordyna
+ *
+ * This file is part of PIConGPU.
+ *
+ * PIConGPU is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * PIConGPU is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with PIConGPU.
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "picongpu/simulation_defines.hpp"
+
+namespace picongpu
+{
+    namespace plugins
+    {
+        namespace xrayScattering
+        {
+            namespace beam
+            {
+                /** Swaps vector axes and multiplies the result with an integer vector.
+                 *
+                 * When the integer vector contains only 1 and -1 values, this swap
+                 * corresponds to a vector rotation that consists only of right-angle
+                 * subrotations.
+                 *
+                 * @tparam axis0 Index (0, 1 or 2) of the old axis that becomes the new axis 0.
+                 * @tparam axis1 Index (0, 1 or 2) of the old axis that becomes the new axis 1.
+                 * @tparam axis2 Index (0, 1 or 2) of the old axis that becomes the new axis 2.
+                 * @tparam a0 Integer factor applied to the new first component.
+                 * @tparam a1 Integer factor applied to the new second component.
+                 * @tparam a2 Integer factor applied to the new third component.
+                 */
+                template<unsigned axis0, unsigned axis1, unsigned axis2, int a0, int a1, int a2>
+                struct AxisSwap
+                {
+                    //! Picks the permuted components of vec and scales each one by its factor.
+                    static HDINLINE float3_X rotate(float3_X const& vec)
+                    {
+                        float3_X const swapped(a0 * vec[axis0], a1 * vec[axis1], a2 * vec[axis2]);
+                        return swapped;
+                    }
+
+                    //! Undoes rotate(): divides by the factors and restores the axis order.
+                    static HDINLINE float3_X reverse(float3_X const& vec)
+                    {
+                        PMACC_ASSERT(a0 != 0);
+                        PMACC_ASSERT(a1 != 0);
+                        PMACC_ASSERT(a2 != 0);
+                        float3_X original;
+                        original[axis0] = vec[0] / a0;
+                        original[axis1] = vec[1] / a1;
+                        original[axis2] = vec[2] / a2;
+                        return original;
+                    }
+                };
+            } // namespace beam
+        } // namespace xrayScattering
+    } // namespace plugins
+} // namespace picongpu
diff --git a/include/picongpu/plugins/xrayScattering/beam/CoordinateTransform.hpp b/include/picongpu/plugins/xrayScattering/beam/CoordinateTransform.hpp
new file mode 100644
index 0000000000..806672a776
--- /dev/null
+++ b/include/picongpu/plugins/xrayScattering/beam/CoordinateTransform.hpp
@@ -0,0 +1,146 @@
+/* Copyright 2020-2021 Pawel Ordyna
+ *
+ * This file is part of PIConGPU.
+ *
+ * PIConGPU is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * PIConGPU is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with PIConGPU.
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "picongpu/simulation_defines.hpp"
+#include "picongpu/plugins/xrayScattering/beam/Side.hpp"
+#include "picongpu/plugins/xrayScattering/beam/SecondaryRotation.hpp"
+#include "picongpu/param/xrayScattering.param"
+
+namespace picongpu
+{
+    namespace plugins
+    {
+        namespace xrayScattering
+        {
+            namespace beam
+            {
+                //! Returns the global domain size as a 3D vector, for 3D as well as 2D simulations.
+                template<unsigned DIM>
+                HINLINE float3_X getDomainSize();
+
+                //! 3D specialization: all three extents are converted to float_X.
+                template<>
+                HINLINE float3_X getDomainSize<DIM3>()
+                {
+                    DataSpace<DIM3> const size3d = Environment<DIM3>::get().SubGrid().getGlobalDomain().size;
+                    return precisionCast<float_X>(size3d);
+                }
+
+                //! 2D specialization: the missing third component is set to zero.
+                template<>
+                HINLINE float3_X getDomainSize<DIM2>()
+                {
+                    auto const size2d = Environment<DIM2>::get().SubGrid().getGlobalDomain().size;
+                    return float3_X(size2d[0], size2d[1], 0.0);
+                }
+
+
+                /** Defines a coordinate transform from the PIC system into the beam system.
+                 *
+                 * @tparam T_Side Side from which the probing beam is shot at the target.
+                 * @tparam T_SecondaryRotation Rotation of the beam propagation direction.
+                 */
+                template<typename T_Side, typename T_SecondaryRotation>
+                struct CoordinateTransform
+                {
+                    using Side = T_Side;
+                    using SecondaryRotation = T_SecondaryRotation;
+
+                    //! Currently a no-op: the translation setup below is disabled (see TODO).
+                    HINLINE CoordinateTransform()
+                    {
+                        // TODO: Fix the translation in the coordinate transform. The
+                        //  position in the beam system is wrongly calculated.
+                        //  Orientation is correct.
+                        /*
+                        using namespace picongpu::plugins::xrayScattering::beam;
+                        // Find the coordinate system translation:
+                        // Starting in the beam coordinate system.
+                        // Transverse(to the beam propagation direction) offset from the
+                        // initial position (the middle of the simulation box side).
+                        float2_X offsetTrans_b
+                        {
+                            BEAM_OFFSET[ 0 ] / UNIT_LENGTH,
+                            BEAM_OFFSET[ 1 ] / UNIT_LENGTH
+                        };
+                        // Offset along the propagation direction, defined by the beam
+                        // delay.
+                        float_X offsetParallel_b = beamDelay_SI / UNIT_TIME *
+                            SPEED_OF_LIGHT;
+                        // Complete offset from the initial position.
+                        float3_X offsetFromMiddlePoint_b
+                            {
+                                offsetTrans_b[ 0 ],
+                                offsetTrans_b[ 1 ],
+                                -1 * offsetParallel_b
+                            };
+
+                         // Move to the PIC coordinate system.
+                         offsetFromMiddlePoint_b = SecondaryRotation::ReverseOperation::
+                            rotate( offsetFromMiddlePoint_b );
+                        float3_X offsetFromMiddlePoint_s = Side::FirstRotation::reverse(
+                            offsetFromMiddlePoint_b );
+
+                        // Find the initial position in the PIC coordinate system.
+                        float3_X toMiddlePoint_s = cellSize * getDomainSize< simDim >( );
+                        for ( uint32_t ii = 0; ii < 3; ii++ )
+                        {
+                            toMiddlePoint_s[ ii ] *= Side::beamStartPosition[ ii ];
+                        }
+                        // Combine both translations.
+                        translationVector_s =  toMiddlePoint_s + offsetFromMiddlePoint_s;
+                        */
+                    }
+
+                    /** Transforms a vector from the PIC system to the beam comoving system.
+                     *
+                     * @param currentStep Current simulation step (unused while the translation is disabled).
+                     * @param position_s A 3D vector in the PIC coordinate system.
+                     * @returns position_s after the first (side) and the secondary rotation.
+                     */
+                    HDINLINE float3_X operator()(uint32_t const& currentStep, float3_X const& position_s)
+                    {
+                        // TODO: Uncomment after fixing the translation.
+                        float3_X result = position_s; /* - translationVector_s;
+                        result[ 2 ] -= currentStep * DELTA_T * SPEED_OF_LIGHT;
+                        */
+                        result = Side::FirstRotation::rotate(result);
+                        result = SecondaryRotation::rotate(result);
+                        return result;
+                    }
+
+
+                    //! Wrapper for 2D vectors; the missing third component is set to zero.
+                    HDINLINE float3_X operator()(uint32_t const& currentStep, float2_X const& position_s)
+                    {
+                        float3_X pos{position_s[0], position_s[1], 0.0};
+                        return (*this)(currentStep, pos);
+                    }
+
+
+                private:
+                    // TODO: Uncomment after fixing the translation.
+                    // PMACC_ALIGN( translationVector_s, float3_X );
+                };
+            } // namespace beam
+        } // namespace xrayScattering
+    } // namespace plugins
+} // namespace picongpu
diff --git a/include/picongpu/plugins/xrayScattering/beam/ProbingBeam.hpp b/include/picongpu/plugins/xrayScattering/beam/ProbingBeam.hpp
new file mode 100644
index 0000000000..970199d2af
--- /dev/null
+++ b/include/picongpu/plugins/xrayScattering/beam/ProbingBeam.hpp
@@ -0,0 +1,67 @@
+/* Copyright 2020-2021 Pawel Ordyna
+ *
+ * This file is part of PIConGPU.
+ *
+ * PIConGPU is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * PIConGPU is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with PIConGPU.
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "picongpu/simulation_defines.hpp"
+#include "picongpu/plugins/xrayScattering/beam/CoordinateTransform.hpp"
+
+namespace picongpu
+{
+    namespace plugins
+    {
+        namespace xrayScattering
+        {
+            namespace beam
+            {
+                /** Describes the probing beam: transverse profile, temporal shape and orientation.
+                 *
+                 * @tparam T_BeamProfile Beam transverse profile.
+                 * @tparam T_BeamShape Beam temporal shape.
+                 * @tparam T_CoordinateTransform Coordinate transform from the PIC
+                 *      coordinate system to the beam coordinate system.
+                 */
+                template<typename T_BeamProfile, typename T_BeamShape, typename T_CoordinateTransform>
+                struct ProbingBeam
+                {
+                    using BeamProfile = T_BeamProfile;
+                    using BeamShape = T_BeamShape;
+                    PMACC_ALIGN(coordinateTransform, T_CoordinateTransform);
+
+                    HINLINE ProbingBeam() : coordinateTransform() {}
+
+                    /** Evaluates the probing amplitude scaling at a given position.
+                     *
+                     * @param position_b Position in the beam comoving coordinate system
+                     *      (x, y, z__at_t_0 - c*t).
+                     * @returns Product of the transverse profile and the temporal shape factors.
+                     */
+                    HDINLINE float_X operator()(float3_X const& position_b)
+                    {
+                        float_X const beamTime = position_b[2] / SPEED_OF_LIGHT;
+                        float_X const transverseFactor = BeamProfile::getFactor(position_b[0], position_b[1]);
+                        float_X const temporalFactor = BeamShape::getFactor(beamTime);
+
+                        return transverseFactor * temporalFactor;
+                    }
+                };
+            } // namespace beam
+        } // namespace xrayScattering
+    } // namespace plugins
+} // namespace picongpu
diff --git a/include/picongpu/plugins/xrayScattering/beam/SecondaryRotation.hpp b/include/picongpu/plugins/xrayScattering/beam/SecondaryRotation.hpp
new file mode 100644
index 0000000000..cd1a2c0f3f
--- /dev/null
+++ b/include/picongpu/plugins/xrayScattering/beam/SecondaryRotation.hpp
@@ -0,0 +1,101 @@
+/* Copyright 2020-2021 Pawel Ordyna
+ *
+ * This file is part of PIConGPU.
+ *
+ * PIConGPU is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * PIConGPU is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with PIConGPU.
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "picongpu/simulation_defines.hpp"
+
+#include <iostream>
+
+namespace picongpu
+{
+    namespace plugins
+    {
+        namespace xrayScattering
+        {
+            namespace beam
+            {
+                /** Defines a coordinate system rotation.
+                 *
+                 * The whole rotation consists of two rotations --- first by the yaw angle
+                 * and then by the pitch angle.
+                 *
+                 * @tparam T_ParamClass Param class defining the angles.
+                 * @tparam T_reversedOrder Apply the two axis rotations in swapped order. Set by
+                 *      ReverseOperation: an inverse needs negated angles and a reversed order.
+                 */
+                template<typename T_ParamClass, bool T_reversedOrder = false>
+                struct SecondaryRotation : T_ParamClass
+                {
+                    using Params = T_ParamClass;
+                    struct ReversedAngles
+                    {
+                        static constexpr float_X yawAngle = -1.0_X * Params::yawAngle;
+                        static constexpr float_X pitchAngle = -1.0_X * Params::pitchAngle;
+                    } reversedAngles;
+                    using ReverseOperation = SecondaryRotation<ReversedAngles, !T_reversedOrder>;
+
+                private:
+                    static constexpr float_X xAngle = Params::yawAngle;
+                    static constexpr float_X yAngle = Params::pitchAngle;
+
+                    //! X axis rotation (yaw angle).
+                    static HDINLINE void xRotation(float3_X& vec)
+                    {
+                        // Coordinates transform with the inverse of the basis rotation: use the negated angle.
+                        float_X sinValue;
+                        float_X cosValue;
+                        pmacc::math::sincos(-1.0_X * xAngle, sinValue, cosValue);
+                        float_X const y = vec[1] * cosValue - vec[2] * sinValue;
+                        vec[2] = vec[1] * sinValue + vec[2] * cosValue;
+                        vec[1] = y;
+                    }
+
+                    //! Y axis rotation (pitch angle).
+                    static HDINLINE void yRotation(float3_X& vec)
+                    {
+                        float_X sinValue;
+                        float_X cosValue;
+                        pmacc::math::sincos(-1.0_X * yAngle, sinValue, cosValue);
+                        float_X const x = vec[0] * cosValue + vec[2] * sinValue;
+                        vec[2] = -1.0_X * vec[0] * sinValue + vec[2] * cosValue;
+                        vec[0] = x;
+                    }
+
+                public:
+                    //! Coordinate transform into the rotated coordinate system.
+                    static HDINLINE float3_X rotate(float3_X const& vec)
+                    {
+                        float3_X result = vec;
+                        if(T_reversedOrder)
+                        {
+                            xRotation(result);
+                            yRotation(result);
+                            return result;
+                        }
+                        yRotation(result);
+                        xRotation(result);
+                        return result;
+                    }
+                };
+
+            } // namespace beam
+        } // namespace xrayScattering
+    } // namespace plugins
+} // namespace picongpu
diff --git a/include/picongpu/plugins/xrayScattering/beam/Side.hpp b/include/picongpu/plugins/xrayScattering/beam/Side.hpp
new file mode 100644
index 0000000000..94a5a73ef3
--- /dev/null
+++ b/include/picongpu/plugins/xrayScattering/beam/Side.hpp
@@ -0,0 +1,103 @@
+/* Copyright 2020-2021 Pawel Ordyna
+ *
+ * This file is part of PIConGPU.
+ *
+ * PIConGPU is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * PIConGPU is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with PIConGPU.
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "picongpu/simulation_defines.hpp"
+#include "picongpu/plugins/xrayScattering/beam/AxisSwap.hpp"
+
+namespace picongpu
+{
+    namespace plugins
+    {
+        namespace xrayScattering
+        {
+            namespace beam
+            {
+                /* This file defines the possible base beam orientations.
+                 *
+                 *  Example: X Side
+                 *      The beam propagates along the x axis ( PIC coordinate system).
+                 *      The base position of the beam coordinate system, i.e. its (0,0,0)
+                 *      point, is placed in the middle of the x_PIC=0 plane.
+                 *      That is at (0, 0.5 * Y, 0.5 * Z), where Y and Z are the lengths of
+                 *      the simulation box sides along y_PIC and z_PIC axes.
+                 *      Therefore beamStartPosition= ( 0.0, 0.5, 0.5 ) for the XSide.
+                 *
+                 *      AxisSwap defines the base rotation of the
+                 *      coordinate system. First three integers set how the 3 directions
+                 *      (x, y, z) in the PIC system correspond to the ones in the beam
+                 *      system. The last 3 numbers are the relative orientations. For XSide:
+                 *      AxisSwap< 2, 1, 0, -1, 1, 1 > says:
+                 *          * x_beam = - z_PIC,
+                 *          * y_beam = y_PIC,
+                 *          * z_beam = x_PIC,
+                 */
+
+                //! Probing along the PIC x basis vector; base position in the middle of the x_PIC=0 plane.
+                struct XSide
+                {
+                    using FirstRotation = AxisSwap<2, 1, 0, -1, 1, 1>;
+                    static constexpr float_X beamStartPosition[3] = {0.0, 0.5, 0.5};
+                };
+
+
+                //! Probing against the PIC x basis vector; base position on the opposite x_PIC side (1.0).
+                struct XRSide
+                {
+                    using FirstRotation = AxisSwap<2, 1, 0, -1, -1, -1>;
+                    static constexpr float_X beamStartPosition[3] = {1.0, 0.5, 0.5};
+                };
+
+
+                //! Probing along the PIC y basis vector; base position in the middle of the y_PIC=0 plane.
+                struct YSide
+                {
+                    using FirstRotation = AxisSwap<2, 0, 1, -1, -1, 1>;
+                    static constexpr float_X beamStartPosition[3] = {0.5, 0.0, 0.5};
+                };
+
+
+                //! Probing against the PIC y basis vector; base position on the opposite y_PIC side (1.0).
+                struct YRSide
+                {
+                    using FirstRotation = AxisSwap<2, 0, 1, -1, 1, -1>;
+                    static constexpr float_X beamStartPosition[3] = {0.5, 1.0, 0.5};
+                };
+
+
+                //! Probing along the PIC z basis vector; base position in the middle of the z_PIC=0 plane.
+                struct ZSide
+                {
+                    using FirstRotation = AxisSwap<1, 0, 2, -1, 1, 1>;
+                    static constexpr float_X beamStartPosition[3] = {0.5, 0.5, 0.0};
+                };
+
+
+                //! Probing against the PIC z basis vector; base position on the opposite z_PIC side (1.0).
+                struct ZRSide
+                {
+                    static constexpr float_X beamStartPosition[3] = {0.5, 0.5, 1.0};
+                    using FirstRotation = AxisSwap<1, 0, 2, -1, -1, -1>;
+                };
+
+            } // namespace beam
+        } // namespace xrayScattering
+    } // namespace plugins
+} // namespace picongpu
diff --git a/include/picongpu/plugins/xrayScattering/beam/XrayScatteringBeam.hpp b/include/picongpu/plugins/xrayScattering/beam/XrayScatteringBeam.hpp
new file mode 100644
index 0000000000..57c7042e85
--- /dev/null
+++ b/include/picongpu/plugins/xrayScattering/beam/XrayScatteringBeam.hpp
@@ -0,0 +1,50 @@
+/* Copyright 2020-2021 Pawel Ordyna
+ *
+ * This file is part of PIConGPU.
+ *
+ * PIConGPU is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * PIConGPU is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with PIConGPU.
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "picongpu/simulation_defines.hpp"
+#include "picongpu/plugins/xrayScattering/beam/CoordinateTransform.hpp"
+#include "picongpu/plugins/xrayScattering/beam/ProbingBeam.hpp"
+#include "picongpu/plugins/xrayScattering/beam/beamProfiles/profiles.hpp"
+#include "picongpu/plugins/xrayScattering/beam/beamShapes/shapes.hpp"
+#include "picongpu/param/xrayScattering.param"
+
+namespace picongpu
+{
+    namespace plugins
+    {
+        namespace xrayScattering
+        {
+            namespace beam
+            {
+                // TODO: Move these settings back to the param file after fixing the
+                // coordinate transform.
+                using BeamShape = beamShapes::ConstShape; //!< Temporal beam shape.
+                using BeamProfile = beamProfiles::ConstProfile; //!< Transverse beam profile.
+                constexpr float_X BEAM_DELAY_SI = 0.0; //!< Beam delay.
+                constexpr float_X BEAM_OFFSET[2] = {0.0, 0.0}; //!< Transverse beam offset.
+
+                using BeamCoordinates = CoordinateTransform<ProbingSide, SecondaryRotation<RotationParam>>;
+                using XrayScatteringBeam = ProbingBeam<BeamProfile, BeamShape, BeamCoordinates>;
+
+            } // namespace beam
+        } // namespace xrayScattering
+    } // namespace plugins
+} // namespace picongpu
diff --git a/include/picongpu/plugins/xrayScattering/beam/beamProfiles/ConstProfile.hpp b/include/picongpu/plugins/xrayScattering/beam/beamProfiles/ConstProfile.hpp
new file mode 100644
index 0000000000..9319d4ca5c
--- /dev/null
+++ b/include/picongpu/plugins/xrayScattering/beam/beamProfiles/ConstProfile.hpp
@@ -0,0 +1,46 @@
+/* Copyright 2020-2021 Pawel Ordyna
+ *
+ * This file is part of PIConGPU.
+ *
+ * PIConGPU is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * PIConGPU is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with PIConGPU.
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "picongpu/simulation_defines.hpp"
+
+namespace picongpu
+{
+    namespace plugins
+    {
+        namespace xrayScattering
+        {
+            namespace beam
+            {
+                namespace beamProfiles
+                {
+                    //! Uniform transverse profile: the factor is 1 regardless of the position.
+                    struct ConstProfile
+                    {
+                        static HDINLINE constexpr float_X getFactor(float_X const& positionX, float_X const& positionY)
+                        {
+                            return static_cast<float_X>(1.0);
+                        }
+                    };
+                } // namespace beamProfiles
+            } // namespace beam
+        } // namespace xrayScattering
+    } // namespace plugins
+} // namespace picongpu
diff --git a/include/picongpu/plugins/xrayScattering/beam/beamProfiles/GaussianProfile.hpp b/include/picongpu/plugins/xrayScattering/beam/beamProfiles/GaussianProfile.hpp
new file mode 100644
index 0000000000..f6bc2876ee
--- /dev/null
+++ b/include/picongpu/plugins/xrayScattering/beam/beamProfiles/GaussianProfile.hpp
@@ -0,0 +1,57 @@
+/* Copyright 2020-2021 Pawel Ordyna
+ *
+ * This file is part of PIConGPU.
+ *
+ * PIConGPU is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * PIConGPU is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with PIConGPU.
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "picongpu/simulation_defines.hpp"
+
+namespace picongpu
+{
+    namespace plugins
+    {
+        namespace xrayScattering
+        {
+            namespace beam
+            {
+                namespace beamProfiles
+                {
+                    /** Gaussian beam transverse profile.
+                     *
+                     * @tparam T_ParamClass Param class defining sigmaX_SI and sigmaY_SI (@f$ \sigma_x, \sigma_y @f$).
+                     */
+                    template<typename T_ParamClass>
+                    struct GaussianProfile : public T_ParamClass
+                    {
+                        using ParamClass = T_ParamClass;
+                        //! Returns exp(-0.5 * ((x / s_x)^2 + (y / s_y)^2)) for run-time x and y.
+                        static HDINLINE float_X getFactor(float_X const& x, float_X const& y)
+                        {
+                            constexpr float_X s_x = ParamClass::sigmaX_SI / UNIT_LENGTH;
+                            constexpr float_X s_y = ParamClass::sigmaY_SI / UNIT_LENGTH;
+                            float_X const tmp_x = x / s_x; // x is not a constant expression
+                            float_X const tmp_y = y / s_y; // y is not a constant expression
+                            float_X exponent = -0.5 * (tmp_x * tmp_x + tmp_y * tmp_y);
+                            return math::exp(exponent);
+                        }
+                    };
+                } // namespace beamProfiles
+            } // namespace beam
+        } // namespace xrayScattering
+    } // namespace plugins
+} // namespace picongpu
diff --git a/include/picongpu/plugins/xrayScattering/beam/beamProfiles/profiles.hpp b/include/picongpu/plugins/xrayScattering/beam/beamProfiles/profiles.hpp
new file mode 100644
index 0000000000..a11cea20c0
--- /dev/null
+++ b/include/picongpu/plugins/xrayScattering/beam/beamProfiles/profiles.hpp
@@ -0,0 +1,23 @@
+/* Copyright 2020-2021 Pawel Ordyna
+ *
+ * This file is part of PIConGPU.
+ *
+ * PIConGPU is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * PIConGPU is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with PIConGPU.
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "picongpu/plugins/xrayScattering/beam/beamProfiles/ConstProfile.hpp"
+#include "picongpu/plugins/xrayScattering/beam/beamProfiles/GaussianProfile.hpp"
diff --git a/include/picongpu/plugins/xrayScattering/beam/beamShapes/ConstShape.hpp b/include/picongpu/plugins/xrayScattering/beam/beamShapes/ConstShape.hpp
new file mode 100644
index 0000000000..fa4dbfc7d9
--- /dev/null
+++ b/include/picongpu/plugins/xrayScattering/beam/beamShapes/ConstShape.hpp
@@ -0,0 +1,46 @@
+/* Copyright 2020-2021 Pawel Ordyna
+ *
+ * This file is part of PIConGPU.
+ *
+ * PIConGPU is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * PIConGPU is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with PIConGPU.
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "picongpu/simulation_defines.hpp"
+
+namespace picongpu
+{
+    namespace plugins
+    {
+        namespace xrayScattering
+        {
+            namespace beam
+            {
+                namespace beamShapes
+                {
+                    //! Beam intensity homogeneous along the propagation direction: the factor is 1 for any time.
+                    struct ConstShape
+                    {
+                        static HDINLINE constexpr float_X getFactor(float_X const& /* time */)
+                        {
+                            return 1.0_X;
+                        }
+                    };
+                } // namespace beamShapes
+            } // namespace beam
+        } // namespace xrayScattering
+    } // namespace plugins
+} // namespace picongpu
diff --git a/include/picongpu/plugins/xrayScattering/beam/beamShapes/shapes.hpp b/include/picongpu/plugins/xrayScattering/beam/beamShapes/shapes.hpp
new file mode 100644
index 0000000000..f93aaa73cd
--- /dev/null
+++ b/include/picongpu/plugins/xrayScattering/beam/beamShapes/shapes.hpp
@@ -0,0 +1,22 @@
+/* Copyright 2020-2021 Pawel Ordyna
+ *
+ * This file is part of PIConGPU.
+ *
+ * PIConGPU is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * PIConGPU is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with PIConGPU.
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "picongpu/plugins/xrayScattering/beam/beamShapes/ConstShape.hpp"
diff --git a/include/picongpu/plugins/xrayScattering/xrayScatteringUtilities.hpp b/include/picongpu/plugins/xrayScattering/xrayScatteringUtilities.hpp
new file mode 100644
index 0000000000..6f61a27074
--- /dev/null
+++ b/include/picongpu/plugins/xrayScattering/xrayScatteringUtilities.hpp
@@ -0,0 +1,109 @@
+/* Copyright 2020-2021 Pawel Ordyna
+ *
+ * This file is part of PIConGPU.
+ *
+ * PIConGPU is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * PIConGPU is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with PIConGPU.
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "picongpu/simulation_defines.hpp"
+#include <pmacc/math/Vector.hpp>
+#include <pmacc/math/Complex.hpp>
+#include <pmacc/memory/buffers/Buffer.hpp>
+
+#include <algorithm>
+#include <vector>
+
+namespace picongpu
+{
+    namespace plugins
+    {
+        namespace xrayScattering
+        {
+            //! Copy the real part of each of the buffer's getCurrentSize() elements into a new vector.
+            template<typename T>
+            std::vector<T> extractReal(Buffer<pmacc::math::Complex<T>, DIM1>& complexBuffer)
+            {
+                auto const numElements = complexBuffer.getCurrentSize();
+                auto dataBox = complexBuffer.getDataBox();
+                std::vector<T> realValues;
+                realValues.reserve(numElements);
+                // size_t counter (as in copyVectorToBuffer): a 32 bit index could wrap before reaching the bound
+                for(std::size_t ii = 0; ii < numElements; ++ii)
+                    realValues.push_back(dataBox[ii].get_real());
+                return realValues;
+            }
+
+            //! Copy the imaginary part of each of the buffer's getCurrentSize() elements into a new vector.
+            template<typename T>
+            std::vector<T> extractImag(Buffer<pmacc::math::Complex<T>, DIM1>& complexBuffer)
+            {
+                auto const numElements = complexBuffer.getCurrentSize();
+                auto dataBox = complexBuffer.getDataBox();
+                std::vector<T> imagValues;
+                imagValues.reserve(numElements);
+                // size_t counter (as in copyVectorToBuffer): a 32 bit index could wrap before reaching the bound
+                for(std::size_t ii = 0; ii < numElements; ++ii)
+                    imagValues.push_back(dataBox[ii].get_imag());
+                return imagValues;
+            }
+
+            /** Collect the real parts of a vector of complex numbers.
+             *
+             * @param complexVec complex input values
+             * @return vector of the same length holding get_real() of each element, in input order
+             */
+            template<typename T>
+            std::vector<T> extractReal(std::vector<pmacc::math::Complex<T>> const& complexVec)
+            {
+                std::vector<T> realParts;
+                realParts.reserve(complexVec.size());
+                for(auto const& value : complexVec)
+                    realParts.push_back(value.get_real());
+                return realParts;
+            }
+
+            /** Collect the imaginary parts of a vector of complex numbers.
+             *
+             * @param complexVec complex input values
+             * @return vector of the same length holding get_imag() of each element, in input order
+             */
+            template<typename T>
+            std::vector<T> extractImag(std::vector<pmacc::math::Complex<T>> const& complexVec)
+            {
+                std::vector<T> imagParts;
+                imagParts.reserve(complexVec.size());
+                for(auto const& value : complexVec)
+                    imagParts.push_back(value.get_imag());
+                return imagParts;
+            }
+
+            /** Copy all elements of a vector into a 1D buffer of the same current size.
+             *
+             * @throws std::runtime_error if buffer.getCurrentSize() differs from vec.size()
+             */
+            template<typename T>
+            void copyVectorToBuffer(std::vector<T> const& vec, Buffer<T, DIM1>& buffer)
+            {
+                // guard: refuse to copy anything on a size mismatch
+                if(buffer.getCurrentSize() != vec.size())
+                    throw std::runtime_error("XrayScattering: Tried to copy a vector"
+                                             " to a Buffer of a different size");
+                auto box = buffer.getDataBox();
+                for(std::size_t idx = 0; idx < vec.size(); ++idx)
+                    box[idx] = vec[idx];
+            }
+        } // namespace xrayScattering
+    } // namespace plugins
+} // namespace picongpu
diff --git a/include/picongpu/pmacc_renamings.hpp b/include/picongpu/pmacc_renamings.hpp
index 98d6aefa50..49169388eb 100644
--- a/include/picongpu/pmacc_renamings.hpp
+++ b/include/picongpu/pmacc_renamings.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2014-2020 Rene Widera
+/* Copyright 2014-2021 Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -22,4 +22,4 @@
 
 #include <pmacc/math/ConstVector.hpp>
 
-#define CONST_VECTOR(type,dim,name,...) PMACC_CONST_VECTOR(type,dim,name,__VA_ARGS__)
+#define CONST_VECTOR(type, dim, name, ...) PMACC_CONST_VECTOR(type, dim, name, __VA_ARGS__)
diff --git a/include/picongpu/random/seed/ISeed.hpp b/include/picongpu/random/seed/ISeed.hpp
index 6a2ca4ff9c..e711751610 100644
--- a/include/picongpu/random/seed/ISeed.hpp
+++ b/include/picongpu/random/seed/ISeed.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2018-2020 Rene Widera
+/* Copyright 2018-2021 Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -26,26 +26,25 @@
 
 namespace picongpu
 {
-namespace random
-{
-namespace seed
-{
-    /** seed generator interface wrapper
-     *
-     * Generated seed is equal on all ranks and can be used together with an
-     * rank unique seed to initialize a random number generator.
-     * Depending of the generator T_SeedFunctor the seed is reproducible or
-     * or changed with each program execution.
-     */
-    template< typename T_SeedFunctor = seed::Value< 42 > >
-    struct ISeed
+    namespace random
     {
-        uint32_t
-        operator()() const
+        namespace seed
         {
-            return T_SeedFunctor{}();
-        }
-    };
-} // namespace seed
-} // namespace random
+            /** Seed generator interface wrapper
+             *
+             * The generated seed is equal on all ranks and can be used together with a
+             * rank-unique seed to initialize a random number generator.
+             * Depending on the generator T_SeedFunctor the seed is either reproducible
+             * or changes with each program execution.
+             */
+            template<typename T_SeedFunctor = seed::Value<42>>
+            struct ISeed
+            {
+                uint32_t operator()() const
+                {
+                    return T_SeedFunctor{}(); // forward to a default-constructed T_SeedFunctor
+                }
+            };
+        } // namespace seed
+    } // namespace random
 } // namespace picongpu
diff --git a/include/picongpu/random/seed/Seed.cpp b/include/picongpu/random/seed/Seed.cpp
index ce7c86171a..8c4c92deb6 100644
--- a/include/picongpu/random/seed/Seed.cpp
+++ b/include/picongpu/random/seed/Seed.cpp
@@ -1,4 +1,4 @@
-/* Copyright 2018-2020 Rene Widera
+/* Copyright 2018-2021 Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -27,42 +27,33 @@
 
 namespace picongpu
 {
-namespace random
-{
-namespace seed
-{
-
-    uint32_t
-    FromTime::operator()() const
+    namespace random
     {
-        auto now = std::chrono::system_clock::now();
-        uint32_t now_ms = std::chrono::time_point_cast< std::chrono::milliseconds >( now ).
-            time_since_epoch().count();
-
-        // receive time from rank zero
-        MPI_Bcast(
-            &now_ms,
-            1,
-            MPI_UINT32_T,
-            0,
-            MPI_COMM_WORLD
-        );
-
-        return now_ms;
-    }
-
-    uint32_t
-    FromEnvironment::operator()() const
-    {
-        char* seedStr = nullptr;
-        uint32_t seed = 0;
-        seedStr = std::getenv( "PIC_SEED" );
-        if( seedStr )
-            seed = std::stoi( seedStr );
-
-        return seed;
-    }
-
-} // namespace seed
-} // namespace random
+        namespace seed
+        {
+            uint32_t FromTime::operator()() const
+            {
+                auto now = std::chrono::system_clock::now();
+                uint32_t now_ms
+                    = std::chrono::time_point_cast<std::chrono::milliseconds>(now).time_since_epoch().count();
+                // the millisecond count since the clock's epoch is narrowed to 32 bit by this initialization
+                // receive time from rank zero
+                MPI_Bcast(&now_ms, 1, MPI_UINT32_T, 0, MPI_COMM_WORLD);
+                // all ranks return the value broadcast by rank zero
+                return now_ms;
+            }
+
+            //! Seed read from the environment variable `PIC_SEED`; zero if the variable is not defined.
+            uint32_t FromEnvironment::operator()() const
+            {
+                char const* const seedStr = std::getenv("PIC_SEED");
+                if(seedStr == nullptr)
+                    return 0u;
+                // stoul, not stoi: seeds above INT_MAX are valid uint32_t values and must not throw out_of_range;
+                // where unsigned long is wider than 32 bit, larger values wrap modulo 2^32 in the cast
+                return static_cast<uint32_t>(std::stoul(seedStr));
+            }
+
+        } // namespace seed
+    } // namespace random
 } // namespace picongpu
diff --git a/include/picongpu/random/seed/Seed.hpp b/include/picongpu/random/seed/Seed.hpp
index ff3a91b997..58f1948f61 100644
--- a/include/picongpu/random/seed/Seed.hpp
+++ b/include/picongpu/random/seed/Seed.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2018-2020 Rene Widera
+/* Copyright 2018-2021 Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -24,47 +24,43 @@
 
 namespace picongpu
 {
-namespace random
-{
-namespace seed
-{
-
-    /** constant seed
-     *
-     * The seed is equal on each program program start.
-     */
-    template< uint32_t T_constSeedValue >
-    struct Value
+    namespace random
     {
-        uint32_t
-        operator()() const
+        namespace seed
         {
-            return T_constSeedValue;
-        }
-    };
+            /** Constant seed
+             *
+             * The seed is T_constSeedValue, i.e. it is equal on each program start.
+             */
+            template<uint32_t T_constSeedValue>
+            struct Value
+            {
+                uint32_t operator()() const
+                {
+                    return T_constSeedValue;
+                }
+            };
 
-    /** time dependant seed
-     *
-     * The seed is derived from the current system time.
-     * The seed is different with each program start.
-     */
-    struct FromTime
-    {
-        uint32_t
-        operator()() const;
-    };
+            /** Time-dependent seed
+             *
+             * The seed is derived from the current system time.
+             * The seed is different with each program start.
+             */
+            struct FromTime
+            {
+                uint32_t operator()() const;
+            };
 
-    /** read the seed from the environment
-     *
-     * Read the seed from the environment variable `PIC_SEED`.
-     * If `PIC_SEED` is not defined zero will be returned.
-     */
-    struct FromEnvironment
-    {
-        uint32_t
-        operator()() const;
-    };
+            /** read the seed from the environment
+             *
+             * Read the seed from the environment variable `PIC_SEED`.
+             * If `PIC_SEED` is not defined zero will be returned.
+             */
+            struct FromEnvironment
+            {
+                uint32_t operator()() const;
+            };
 
-} // namespace seed
-} // namespace random
+        } // namespace seed
+    } // namespace random
 } // namespace picongpu
diff --git a/include/picongpu/simulation/control/DomainAdjuster.hpp b/include/picongpu/simulation/control/DomainAdjuster.hpp
index 7b9e33e13b..a621201d85 100644
--- a/include/picongpu/simulation/control/DomainAdjuster.hpp
+++ b/include/picongpu/simulation/control/DomainAdjuster.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2018-2020 Rene Widera
+/* Copyright 2018-2021 Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -45,7 +45,6 @@ namespace picongpu
      */
     class DomainAdjuster
     {
-
     public:
         /** constructor
          *
@@ -56,16 +55,15 @@ namespace picongpu
          * @param movingWindowEnabled if moving window is enabled
          */
         DomainAdjuster(
-            DataSpace< simDim > const & numDevices,
-            DataSpace< simDim > const & mpiPosition,
-            DataSpace< simDim > const & isPeriodic,
-            bool const movingWindowEnabled
-        ) :
-            m_numDevices( numDevices),
-            m_mpiPosition( mpiPosition ),
-            m_isPeriodic( isPeriodic ),
-            m_movingWindowEnabled( movingWindowEnabled ),
-            m_isMaster( mpiPosition == DataSpace< simDim >::create( 0 ) )
+            DataSpace<simDim> const& numDevices,
+            DataSpace<simDim> const& mpiPosition,
+            DataSpace<simDim> const& isPeriodic,
+            bool const movingWindowEnabled)
+            : m_numDevices(numDevices)
+            , m_mpiPosition(mpiPosition)
+            , m_isPeriodic(isPeriodic)
+            , m_movingWindowEnabled(movingWindowEnabled)
+            , m_isMaster(mpiPosition == DataSpace<simDim>::create(0))
         {
         }
 
@@ -78,29 +76,26 @@ namespace picongpu
          * @param[out] localDomainOffset local offset [in cells] relative to the origin of the global domain
          */
         void operator()(
-            DataSpace< simDim > & globalDomainSize,
-            DataSpace< simDim > & localDomainSize,
-            DataSpace< simDim > & localDomainOffset
-        )
+            DataSpace<simDim>& globalDomainSize,
+            DataSpace<simDim>& localDomainSize,
+            DataSpace<simDim>& localDomainOffset)
         {
             m_globalDomainSize = globalDomainSize;
             m_localDomainSize = localDomainSize;
 
-            for( uint32_t d = 0; d < simDim; ++d )
+            for(uint32_t d = 0; d < simDim; ++d)
             {
-                multipleOfSuperCell( d );
-                minThreeSuperCells( d );
-                greaterEqualThanAbsorber( d );
-                deriveGlobalDomainSize( d );
-                updateLocalDomainOffset( d );
+                multipleOfSuperCell(d);
+                minThreeSuperCells(d);
+                greaterEqualThanAbsorber(d);
+                deriveGlobalDomainSize(d);
+                updateLocalDomainOffset(d);
             }
 
-            if( globalDomainSize != m_globalDomainSize || localDomainSize != m_localDomainSize )
+            if(globalDomainSize != m_globalDomainSize || localDomainSize != m_localDomainSize)
             {
-                std::cout << " new grid size (global|local|offset): " <<
-                    m_globalDomainSize.toString() << "|" <<
-                    m_localDomainSize.toString() << "|" <<
-                    m_localDomainOffset.toString() << std::endl;
+                std::cout << " new grid size (global|local|offset): " << m_globalDomainSize.toString() << "|"
+                          << m_localDomainSize.toString() << "|" << m_localDomainOffset.toString() << std::endl;
             }
 
             // write results back
@@ -120,7 +115,6 @@ namespace picongpu
         }
 
     private:
-
         /** update local domain offset
          *
          * Share the local domain size with all MPI ranks and calculate the offset of the
@@ -128,76 +122,70 @@ namespace picongpu
          *
          * @param dim dimension to update
          */
-        void updateLocalDomainOffset( size_t const dim )
+        void updateLocalDomainOffset(size_t const dim)
         {
-            pmacc::GridController< simDim > & gc = pmacc::Environment< simDim >::get( ).GridController( );
+            pmacc::GridController<simDim>& gc = pmacc::Environment<simDim>::get().GridController();
 
-            int mpiPos( gc.getPosition( )[ dim ] );
+            int mpiPos(gc.getPosition()[dim]);
             int numMpiRanks = gc.getGlobalSize();
 
             // gather mpi position in the direction we are checking
-            std::vector< int > mpiPositions( numMpiRanks );
-            MPI_CHECK( MPI_Allgather(
+            std::vector<int> mpiPositions(numMpiRanks);
+            MPI_CHECK(MPI_Allgather(
                 &mpiPos,
                 1,
                 MPI_INT,
                 mpiPositions.data(),
                 1,
                 MPI_INT,
-                gc.getCommunicator().getMPIComm()
-            ));
+                gc.getCommunicator().getMPIComm()));
 
             // gather local sizes in the direction we are checking
-            std::vector< uint64_t > allLocalSizes( numMpiRanks );
-            uint64_t lSize = static_cast< uint64_t >( m_localDomainSize[ dim ] );
-            MPI_CHECK( MPI_Allgather(
+            std::vector<uint64_t> allLocalSizes(numMpiRanks);
+            uint64_t lSize = static_cast<uint64_t>(m_localDomainSize[dim]);
+            MPI_CHECK(MPI_Allgather(
                 &lSize,
                 1,
                 MPI_UINT64_T,
                 allLocalSizes.data(),
                 1,
                 MPI_UINT64_T,
-                gc.getCommunicator().getMPIComm()
-            ));
+                gc.getCommunicator().getMPIComm()));
 
             uint64_t offset = 0u;
-            for( size_t i = 0u; i < mpiPositions.size(); ++i )
+            for(size_t i = 0u; i < mpiPositions.size(); ++i)
             {
-                if( mpiPositions[ i ] < mpiPos )
-                    offset += allLocalSizes[ i ];
+                if(mpiPositions[i] < mpiPos)
+                    offset += allLocalSizes[i];
             }
 
             /* since we are not doing independent reduces per slice we need
              * to adjust the offset result by dividing with the number of
              * MPI ranks in all other dimensions.
              */
-            offset /= static_cast< uint64_t >( m_numDevices.productOfComponents() / m_numDevices[ dim ] );
-            m_localDomainOffset[ dim ] = static_cast< int >( offset );
-
+            offset /= static_cast<uint64_t>(m_numDevices.productOfComponents() / m_numDevices[dim]);
+            m_localDomainOffset[dim] = static_cast<int>(offset);
         }
 
         /** ensure that the local size is a multiple of the supercell size
          *
          * @param dim dimension to update
          */
-        void multipleOfSuperCell( size_t const dim )
+        void multipleOfSuperCell(size_t const dim)
         {
-            int const sCellSize = SuperCellSize::toRT()[ dim ];
+            int const sCellSize = SuperCellSize::toRT()[dim];
             // round up to full supercells
-            int const validLocalSize =
-                ( ( m_localDomainSize[ dim ] + sCellSize - 1 ) / sCellSize ) *
-                sCellSize;
+            int const validLocalSize = ((m_localDomainSize[dim] + sCellSize - 1) / sCellSize) * sCellSize;
 
-            if( validLocalSize != m_localDomainSize[ dim ] )
+            if(validLocalSize != m_localDomainSize[dim])
             {
                 showMessage(
                     dim,
                     "Local grid size is not a multiple of supercell size.",
-                    m_localDomainSize[ dim ],
-                    validLocalSize
-                );
+                    m_localDomainSize[dim],
+                    validLocalSize);
 
-                m_localDomainSize[ dim ] = validLocalSize;
+                m_localDomainSize[dim] = validLocalSize;
             }
         }
 
@@ -207,21 +195,20 @@ namespace picongpu
          *
          * @param dim dimension to update
          */
-        void minThreeSuperCells( size_t const  dim )
+        void minThreeSuperCells(size_t const dim)
         {
-            int numSuperCells = m_localDomainSize[ dim ] / SuperCellSize::toRT()[ dim ];
+            int numSuperCells = m_localDomainSize[dim] / SuperCellSize::toRT()[dim];
 
-            if( numSuperCells < 3 )
+            if(numSuperCells < 3)
             {
-                int newLocalDomainSize = 3 * SuperCellSize::toRT()[ dim ];
+                int newLocalDomainSize = 3 * SuperCellSize::toRT()[dim];
                 showMessage(
                     dim,
                     "Local grid size is not containing at least 3 supercells.",
-                    m_localDomainSize[ dim ],
-                    newLocalDomainSize
-                );
+                    m_localDomainSize[dim],
+                    newLocalDomainSize);
 
-                m_localDomainSize[ dim ] = newLocalDomainSize;
+                m_localDomainSize[dim] = newLocalDomainSize;
             }
         }
 
@@ -232,49 +219,42 @@ namespace picongpu
          *
          * @param dim dimension to update
          */
-        void greaterEqualThanAbsorber( size_t const dim )
+        void greaterEqualThanAbsorber(size_t const dim)
         {
-            int validLocalSize = m_localDomainSize[ dim ];
+            int validLocalSize = m_localDomainSize[dim];
 
-            bool const isAbsorberEnabled = !m_isPeriodic[ dim ];
-            bool const isBoundaryDevice = ( m_mpiPosition[ dim ] == 0 || m_mpiPosition[ dim ] == m_numDevices[ dim ] - 1 );
-            if( isAbsorberEnabled && isBoundaryDevice )
+            bool const isAbsorberEnabled = !m_isPeriodic[dim];
+            bool const isBoundaryDevice = (m_mpiPosition[dim] == 0 || m_mpiPosition[dim] == m_numDevices[dim] - 1);
+            if(isAbsorberEnabled && isBoundaryDevice)
             {
-                size_t boundary = m_mpiPosition[ dim ] == 0u ? 0u : 1u;
-                int maxAbsorberCells = fields::absorber::numCells[ dim ][ boundary ];
+                size_t boundary = m_mpiPosition[dim] == 0u ? 0u : 1u;
+                int maxAbsorberCells = fields::absorber::numCells[dim][boundary];
 
-                if( m_movingWindowEnabled && dim == 1u )
+                if(m_movingWindowEnabled && dim == 1u)
                 {
                     /* since the device changes their position during the simulation
                      * the negative and positive absorber cells must fit into the domain
                      */
-                    maxAbsorberCells = static_cast< int >(
-                        std::max(
-                            fields::absorber::numCells[ dim ][ 0 ],
-                            fields::absorber::numCells[ dim ][ 1 ]
-                        )
-                    );
+                    maxAbsorberCells = static_cast<int>(
+                        std::max(fields::absorber::numCells[dim][0], fields::absorber::numCells[dim][1]));
                 }
 
-                if( m_localDomainSize[ dim ] < maxAbsorberCells )
+                if(m_localDomainSize[dim] < maxAbsorberCells)
                 {
-                    int const sCellSize = SuperCellSize::toRT()[ dim ];
+                    int const sCellSize = SuperCellSize::toRT()[dim];
                     // round up to full supercells
-                    validLocalSize =
-                        ( ( maxAbsorberCells + sCellSize - 1 ) / sCellSize ) *
-                        sCellSize;
+                    validLocalSize = ((maxAbsorberCells + sCellSize - 1) / sCellSize) * sCellSize;
                 }
 
-                if( validLocalSize != m_localDomainSize[ dim ] )
+                if(validLocalSize != m_localDomainSize[dim])
                 {
                     showMessage(
                         dim,
                         "Local grid size must be greater or equal than the largest absorber.",
-                        m_localDomainSize[ dim ],
-                        validLocalSize
-                    );
+                        m_localDomainSize[dim],
+                        validLocalSize);
 
-                    m_localDomainSize[ dim ] = validLocalSize;
+                    m_localDomainSize[dim] = validLocalSize;
                 }
             }
         }
@@ -287,43 +267,39 @@ namespace picongpu
          *
          * @param dim dimension to update
          */
-        void deriveLocalDomainSize( size_t const dim )
+        void deriveLocalDomainSize(size_t const dim)
         {
-            if( m_movingWindowEnabled && dim == 1u )
+            if(m_movingWindowEnabled && dim == 1u)
             {
-
                 pmacc::mpi::MPIReduce mpiReduce;
 
                 int globalMax;
                 mpiReduce(
                     pmacc::nvidia::functors::Max(),
                     &globalMax,
-                    &m_localDomainSize[ dim ],
+                    &m_localDomainSize[dim],
                     1,
-                    pmacc::mpi::reduceMethods::AllReduce()
-                );
+                    pmacc::mpi::reduceMethods::AllReduce());
 
                 int globalMin;
                 mpiReduce(
                     pmacc::nvidia::functors::Min(),
                     &globalMin,
-                    &m_localDomainSize[ dim ],
+                    &m_localDomainSize[dim],
                     1,
-                    pmacc::mpi::reduceMethods::AllReduce()
-                );
+                    pmacc::mpi::reduceMethods::AllReduce());
 
                 // local size must be equal for all devices in y direction
-                if( m_isMaster && globalMax != globalMin )
+                if(m_isMaster && globalMax != globalMin)
                 {
                     showMessage(
                         dim,
                         "Local grid size must be equal for all devices because moving window is enabled.",
-                        m_localDomainSize[ dim ],
-                        globalMax
-                    );
+                        m_localDomainSize[dim],
+                        globalMax);
                 }
 
-                m_localDomainSize[ dim ] = globalMax;
+                m_localDomainSize[dim] = globalMax;
             }
         }
 
@@ -333,49 +309,40 @@ namespace picongpu
          *
          * @param dim dimension to update
          */
-        void deriveGlobalDomainSize( size_t const dim )
+        void deriveGlobalDomainSize(size_t const dim)
         {
             uint64_t validGlobalGridSize = 0u;
 
-            deriveLocalDomainSize( dim );
+            deriveLocalDomainSize(dim);
 
-            if( m_movingWindowEnabled && dim == 1u )
+            if(m_movingWindowEnabled && dim == 1u)
             {
                 // the local sizes in slide direction must be equal sized
-                validGlobalGridSize = static_cast< uint64_t >( m_localDomainSize[ dim ] * m_numDevices[ dim ] );
+                validGlobalGridSize = static_cast<uint64_t>(m_localDomainSize[dim] * m_numDevices[dim]);
             }
             else
             {
-                uint64_t localDomainSize = static_cast< uint64_t >( m_localDomainSize[ dim ] );
+                uint64_t localDomainSize = static_cast<uint64_t>(m_localDomainSize[dim]);
                 pmacc::mpi::MPIReduce mpiReduce;
                 mpiReduce(
                     pmacc::nvidia::functors::Add(),
                     &validGlobalGridSize,
                     &localDomainSize,
                     1,
-                    pmacc::mpi::reduceMethods::AllReduce()
-                );
+                    pmacc::mpi::reduceMethods::AllReduce());
                 /* since we are not doing independent reduces per slice we need
                  * to adjust the reduce result by dividing the sizes of all other dimensions
                  * we are not check within the method call
                  */
-                validGlobalGridSize /= static_cast< uint64_t >(
-                    m_numDevices.productOfComponents() / m_numDevices[ dim ]
-                );
-
+                validGlobalGridSize /= static_cast<uint64_t>(m_numDevices.productOfComponents() / m_numDevices[dim]);
             }
 
-            if( m_isMaster && validGlobalGridSize != static_cast< uint64_t >( m_globalDomainSize[ dim ] ) )
+            if(m_isMaster && validGlobalGridSize != static_cast<uint64_t>(m_globalDomainSize[dim]))
             {
-                showMessage(
-                    dim,
-                    "Invalid global grid size.",
-                    m_globalDomainSize[ dim ],
-                    validGlobalGridSize
-                );
+                showMessage(dim, "Invalid global grid size.", m_globalDomainSize[dim], validGlobalGridSize);
             }
 
-            m_globalDomainSize[ dim ] = static_cast< int >( validGlobalGridSize );
+            m_globalDomainSize[dim] = static_cast<int>(validGlobalGridSize);
         }
 
         /** print a message to the user
@@ -387,38 +354,30 @@ namespace picongpu
          * @param currentSize current domain size in the given direction
          * @param updatedSize updated/corrected domain size for the given dimension
          */
-        void showMessage(
-            size_t const dim,
-            std::string const & msg,
-            int const currentSize,
-            int const updatedSize
-        ) const
+        void showMessage(size_t const dim, std::string const& msg, int const currentSize, int const updatedSize) const
         {
             /**! lookup table to translate a dimension index into a name
              *
              * \warning `= { { ... } }` is not required by the c++11 standard but
              * is necessary for g++ 4.9
              */
-            std::array< char, 3 > const dimNames = { { 'x', 'y', 'z' } };
+            std::array<char, 3> const dimNames = {{'x', 'y', 'z'}};
 
-            if( m_validateOnly )
+            if(m_validateOnly)
                 throw std::runtime_error(
-                    std::string( "Dimension " ) + dimNames[ dim ] + ": " +
-                    msg + " Suggestion: set " + std::to_string( currentSize ) +
-                    " to " + std::to_string( updatedSize )
-                );
+                    std::string("Dimension ") + dimNames[dim] + ": " + msg + " Suggestion: set "
+                    + std::to_string(currentSize) + " to " + std::to_string(updatedSize));
             else
-                std::cout << "Dimension " << dimNames[ dim ] << ": " <<
-                    msg << " Auto adjust from " <<
-                    currentSize << " to " << updatedSize << std::endl;
+                std::cout << "Dimension " << dimNames[dim] << ": " << msg << " Auto adjust from " << currentSize
+                          << " to " << updatedSize << std::endl;
         }
 
-        DataSpace< simDim > m_globalDomainSize;
-        DataSpace< simDim > m_localDomainSize;
-        DataSpace< simDim > m_localDomainOffset;
-        DataSpace< simDim > const m_numDevices;
-        DataSpace< simDim > const m_mpiPosition;
-        DataSpace< simDim > const m_isPeriodic;
+        DataSpace<simDim> m_globalDomainSize;
+        DataSpace<simDim> m_localDomainSize;
+        DataSpace<simDim> m_localDomainOffset;
+        DataSpace<simDim> const m_numDevices;
+        DataSpace<simDim> const m_mpiPosition;
+        DataSpace<simDim> const m_isPeriodic;
         bool const m_movingWindowEnabled;
         bool const m_isMaster;
 
diff --git a/include/picongpu/simulation/control/ISimulationStarter.hpp b/include/picongpu/simulation/control/ISimulationStarter.hpp
index 4584f20bea..23b7021f2f 100644
--- a/include/picongpu/simulation/control/ISimulationStarter.hpp
+++ b/include/picongpu/simulation/control/ISimulationStarter.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Rene Widera
+/* Copyright 2013-2021 Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -32,7 +32,6 @@ namespace picongpu
     class ISimulationStarter : public IPlugin
     {
     public:
-
         virtual ~ISimulationStarter()
         {
         }
@@ -43,7 +42,7 @@ namespace picongpu
          *
          * @return true if no error else false
          */
-        virtual ArgsParser::Status parseConfigs(int argc, char **argv) = 0;
+        virtual ArgsParser::Status parseConfigs(int argc, char** argv) = 0;
 
         /*start simulation
          * is called after parsConfig and pluginLoad
@@ -60,4 +59,4 @@ namespace picongpu
             // nothing to do here
         }
     };
-}
+} // namespace picongpu
diff --git a/include/picongpu/simulation/control/MovingWindow.hpp b/include/picongpu/simulation/control/MovingWindow.hpp
index 0f73cef019..71fd0b4aee 100644
--- a/include/picongpu/simulation/control/MovingWindow.hpp
+++ b/include/picongpu/simulation/control/MovingWindow.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Heiko Burau, Rene Widera, Felix Schmitt, Alexander Debus
+/* Copyright 2013-2021 Axel Huebl, Heiko Burau, Rene Widera, Felix Schmitt, Alexander Debus
  *
  * This file is part of PIConGPU.
  *
@@ -26,424 +26,414 @@
 
 namespace picongpu
 {
-using namespace pmacc;
+    using namespace pmacc;
 
-/**
- * Singleton class managing the moving window, slides.
- * Can be used to create window views on the grid.
- */
-class MovingWindow
-{
-private:
-
-    MovingWindow() = default;
-
-    MovingWindow(MovingWindow& cc);
-
-    void getCurrentSlideInfo(uint32_t currentStep, bool *doSlide, float_64 *offsetFirstGPU)
+    /**
+     * Singleton class managing the moving window, slides.
+     * Can be used to create window views on the grid.
+     */
+    class MovingWindow
     {
-        if (doSlide)
-            *doSlide = false;
+    private:
+        MovingWindow() = default;
 
-        if (offsetFirstGPU)
-            *offsetFirstGPU = 0.0;
+        MovingWindow(MovingWindow& cc);
 
-        if (slidingWindowEnabled)
+        void getCurrentSlideInfo(uint32_t currentStep, bool* doSlide, float_64* offsetFirstGPU)
         {
-            /* Sliding stayed enabled but if we reach the end step where we should stop sliding
-             * the moving window is freezed.
-             * All offsets will stay constant until the end of the simulation.
-             */
-            if (currentStep >= endSlidingOnStep)
-                currentStep = endSlidingOnStep;
+            if(doSlide)
+                *doSlide = false;
 
-            const SubGrid<simDim>& subGrid = Environment<simDim>::get().SubGrid();
+            if(offsetFirstGPU)
+                *offsetFirstGPU = 0.0;
 
-            /* speed of the moving window */
-            const float_64 windowMovingSpeed = float_64(SPEED_OF_LIGHT);
+            if(slidingWindowEnabled)
+            {
+                /* Sliding stayed enabled but if we reach the end step where we should stop sliding
+                 * the moving window is frozen.
+                 * All offsets will stay constant until the end of the simulation.
+                 */
+                if(currentStep >= endSlidingOnStep)
+                    currentStep = endSlidingOnStep;
 
-            /* defines in which direction the window moves
-             *
-             * 0 == x,  1 == y , 2 == z direction
-             *
-             * note: currently only y direction is supported
-             */
-            const uint32_t moveDirection = 1;
+                const SubGrid<simDim>& subGrid = Environment<simDim>::get().SubGrid();
 
-            /* the moving window is smaller than the global domain by exactly one
-             * GPU (local domain size)
-             * \todo calculation of the globalWindowSizeInMoveDirection is constant should be
-             * only done once in it's own central object/api
-             */
-            const uint32_t globalWindowSizeInMoveDirection =
-                subGrid.getGlobalDomain().size[moveDirection] - subGrid.getLocalDomain().size[moveDirection];
+                /* speed of the moving window */
+                const float_64 windowMovingSpeed = float_64(SPEED_OF_LIGHT);
 
-            const uint32_t gpuNumberOfCellsInMoveDirection = subGrid.getLocalDomain().size[moveDirection];
+                /* defines in which direction the window moves
+                 *
+                 * 0 == x,  1 == y , 2 == z direction
+                 *
+                 * note: currently only y direction is supported
+                 */
+                const uint32_t moveDirection = 1;
 
-            /* unit PIConGPU length */
-            const float_64 cellSizeInMoveDirection = float_64(cellSize[moveDirection]);
+                /* the moving window is smaller than the global domain by exactly one
+                 * GPU (local domain size)
+                 * \todo calculation of the globalWindowSizeInMoveDirection is constant and should be
+                 * done only once in its own central object/api
+                 */
+                const uint32_t globalWindowSizeInMoveDirection
+                    = subGrid.getGlobalDomain().size[moveDirection] - subGrid.getLocalDomain().size[moveDirection];
 
-            const float_64 deltaWayPerStep = (windowMovingSpeed * float_64(DELTA_T));
+                const uint32_t gpuNumberOfCellsInMoveDirection = subGrid.getLocalDomain().size[moveDirection];
 
-            /* How many cells the virtual particle with speed of light is pushed forward
-             * at the begin of the simulation.
-             * The number of cells is round up thus we avoid window moves and slides
-             * depends on half cells.
-             */
-            const uint32_t virtualParticleInitialStartCell = math::ceil(
-                float_64(globalWindowSizeInMoveDirection) * (float_64(1.0) - movePoint)
-            );
+                /* unit PIConGPU length */
+                const float_64 cellSizeInMoveDirection = float_64(cellSize[moveDirection]);
 
-            /* Is the time step when the virtual particle **passed** the GPU next to the last
-             * in the current to the next step
-             */
-            const uint32_t firstSlideStep = math::ceil(
-                float_64(subGrid.getGlobalDomain().size[moveDirection] - virtualParticleInitialStartCell) *
-                cellSizeInMoveDirection / deltaWayPerStep
-            ) - 1;
-
-            /* way which the virtual particle must move before the window begins
-             * to move the first time [in pic length] */
-            const float_64 wayToFirstMove =
-                float_64(globalWindowSizeInMoveDirection - virtualParticleInitialStartCell) *
-                cellSizeInMoveDirection;
-            /* Is the time step when the virtual particle **passed** the moving window
-             * in the current to the next step
-             * Signed type of firstMoveStep to allow for edge case movePoint = 0.0
-             * for a moving window right from the start of the simulation.
-             */
-            const int32_t firstMoveStep = math::ceil(
-                wayToFirstMove / deltaWayPerStep
-            ) - 1;
+                const float_64 deltaWayPerStep = (windowMovingSpeed * float_64(DELTA_T));
 
-            if (firstMoveStep <= int32_t(currentStep) )
-            {
-                /* calculate the current position of the virtual particle */
-                const float_64 virtualParticleWayPassed =
-                    deltaWayPerStep * float_64(currentStep);
-                const uint32_t virtualParticleWayPassedInCells = uint32_t(
-                    math::floor(virtualParticleWayPassed / cellSizeInMoveDirection)
-                );
-                const uint32_t virtualParticlePositionInCells =
-                    virtualParticleWayPassedInCells + virtualParticleInitialStartCell;
-
-                /* calculate the position of the virtual particle after the current step is calculated */
-                const float_64 nextVirtualParticleWayPassed =
-                    deltaWayPerStep * float_64(currentStep + 1);
-                const uint32_t nextVirtualParticleWayPassedInCells =
-                    uint32_t(math::floor(nextVirtualParticleWayPassed / cellSizeInMoveDirection));
-                /* This position is used to detect the point in time where the virtual particle
-                 * moves over a GPU border.
+                /* How many cells the virtual particle with speed of light is pushed forward
+                 * at the beginning of the simulation.
+                 * The number of cells is rounded up so that window moves and slides
+                 * do not depend on half cells.
                  */
-                const uint32_t nextVirtualParticlePositionInCells =
-                    nextVirtualParticleWayPassedInCells + virtualParticleInitialStartCell;
+                const uint32_t virtualParticleInitialStartCell
+                    = math::ceil(float_64(globalWindowSizeInMoveDirection) * (float_64(1.0) - movePoint));
 
-                /* within the to be simulated time step (currentStep -> currentStep+1)
-                 * the virtual particle will have reached at least the position
-                 * of the cell behind the end of the initial global domain
-                 * (also true for all later time steps)
+                /* Is the time step when the virtual particle **passed** the GPU next to the last
+                 * in the current to the next step
                  */
-                const bool endOfInitialGlobalDomain = firstSlideStep <= currentStep;
-
-                /* virtual particle will pass a GPU border during the current
-                 * (to be simulated) time step
+                const uint32_t firstSlideStep
+                    = math::ceil(
+                          float_64(subGrid.getGlobalDomain().size[moveDirection] - virtualParticleInitialStartCell)
+                          * cellSizeInMoveDirection / deltaWayPerStep)
+                    - 1;
+
+                /* way which the virtual particle must move before the window begins
+                 * to move the first time [in pic length] */
+                const float_64 wayToFirstMove
+                    = float_64(globalWindowSizeInMoveDirection - virtualParticleInitialStartCell)
+                    * cellSizeInMoveDirection;
+                /* Is the time step when the virtual particle **passed** the moving window
+                 * in the current to the next step
+                 * Signed type of firstMoveStep to allow for edge case movePoint = 0.0
+                 * for a moving window right from the start of the simulation.
                  */
-                const bool virtualParticlePassesGPUBorder =
-                    (nextVirtualParticlePositionInCells % gpuNumberOfCellsInMoveDirection) <
-                    (virtualParticlePositionInCells % gpuNumberOfCellsInMoveDirection);
+                const int32_t firstMoveStep = math::ceil(wayToFirstMove / deltaWayPerStep) - 1;
 
-                if (endOfInitialGlobalDomain && virtualParticlePassesGPUBorder)
+                if(firstMoveStep <= int32_t(currentStep))
                 {
-                    incrementSlideCounter(currentStep);
-                    if (doSlide)
-                        *doSlide = true;
-                }
+                    /* calculate the current position of the virtual particle */
+                    const float_64 virtualParticleWayPassed = deltaWayPerStep * float_64(currentStep);
+                    const uint32_t virtualParticleWayPassedInCells
+                        = uint32_t(math::floor(virtualParticleWayPassed / cellSizeInMoveDirection));
+                    const uint32_t virtualParticlePositionInCells
+                        = virtualParticleWayPassedInCells + virtualParticleInitialStartCell;
+
+                    /* calculate the position of the virtual particle after the current step is calculated */
+                    const float_64 nextVirtualParticleWayPassed = deltaWayPerStep * float_64(currentStep + 1);
+                    const uint32_t nextVirtualParticleWayPassedInCells
+                        = uint32_t(math::floor(nextVirtualParticleWayPassed / cellSizeInMoveDirection));
+                    /* This position is used to detect the point in time where the virtual particle
+                     * moves over a GPU border.
+                     */
+                    const uint32_t nextVirtualParticlePositionInCells
+                        = nextVirtualParticleWayPassedInCells + virtualParticleInitialStartCell;
 
-                /* valid range for the offset is [0;GPU number of cells in move direction) */
-                if (offsetFirstGPU)
-                {
-                    /* since the moving window in PIConGPU always starts on the
-                     * first plane (3D) / row (2D) of GPUs in move direction, this
-                     * calculation is equal to the globalWindow.offset in move direction
-                     *
-                     * note: also works with windowMovingSpeed > c
+                    /* within the to be simulated time step (currentStep -> currentStep+1)
+                     * the virtual particle will have reached at least the position
+                     * of the cell behind the end of the initial global domain
+                     * (also true for all later time steps)
+                     */
+                    const bool endOfInitialGlobalDomain = firstSlideStep <= currentStep;
+
+                    /* virtual particle will pass a GPU border during the current
+                     * (to be simulated) time step
                      */
-                    *offsetFirstGPU = nextVirtualParticlePositionInCells % gpuNumberOfCellsInMoveDirection;
+                    const bool virtualParticlePassesGPUBorder
+                        = (nextVirtualParticlePositionInCells % gpuNumberOfCellsInMoveDirection)
+                        < (virtualParticlePositionInCells % gpuNumberOfCellsInMoveDirection);
+
+                    if(endOfInitialGlobalDomain && virtualParticlePassesGPUBorder)
+                    {
+                        incrementSlideCounter(currentStep);
+                        if(doSlide)
+                            *doSlide = true;
+                    }
+
+                    /* valid range for the offset is [0;GPU number of cells in move direction) */
+                    if(offsetFirstGPU)
+                    {
+                        /* since the moving window in PIConGPU always starts on the
+                         * first plane (3D) / row (2D) of GPUs in move direction, this
+                         * calculation is equal to the globalWindow.offset in move direction
+                         *
+                         * note: also works with windowMovingSpeed > c
+                         */
+                        *offsetFirstGPU = nextVirtualParticlePositionInCells % gpuNumberOfCellsInMoveDirection;
+                    }
                 }
             }
         }
 
-    }
-
-    /** increment slide counter
-     *
-     * It is allowed to call this function more than once per time step
-     * The function takes care that the counter is only incremented once
-     * per simulation step
-     *
-     * @param current simulation step
-     */
-    void incrementSlideCounter(const uint32_t currentStep)
-    {
-        // do not slide twice in one simulation step
-        if (isSlidingWindowActive( currentStep ) && lastSlideStep < currentStep)
+        /** increment slide counter
+         *
+         * It is allowed to call this function more than once per time step
+         * The function takes care that the counter is only incremented once
+         * per simulation step
+         *
+         * @param currentStep current simulation step
+         */
+        void incrementSlideCounter(const uint32_t currentStep)
         {
-            slideCounter++;
-            lastSlideStep = currentStep;
+            // do not slide twice in one simulation step
+            if(isSlidingWindowActive(currentStep) && lastSlideStep < currentStep)
+            {
+                slideCounter++;
+                lastSlideStep = currentStep;
+            }
         }
-    }
 
-    /** true is sliding window is activated
-     *
-     * How long the window is sliding is defined with endSlidingOnStep.
-     */
-    bool slidingWindowEnabled = false;
-
-    /** Defines when to start sliding the window
-     *
-     * A virtual photon starts at t=0 at the lower end (min y) of the global
-     * simulation box in the positive y direction. The window sliding starts at
-     * the moment of time when the particle covers the movePoint ratio of the
-     * global moving window size in the y direction.
-     *
-     * Note that with the moving window enabled, there is an additional "hidden"
-     * row of local domains (and devices simulating them) at the y-front.
-     * Therefore, the global moving window size in the y direction is the global
-     * domain size minus a local domain size (which is required to be the same
-     * for all domains).
-     *
-     * So, in short, the window starts sliding in time required to pass the
-     * distance of movePoint * (global window size in y) when moving with
-     * the speed of light.
-     *
-     * Setting movePoint to 0.0 makes the window start sliding at the start
-     * of a simulation, and setting it to 1.0 makes it start sliding when the
-     * virtual photon reaches the start of the "hidden" row of local domains.
-     * It is permitted to use values outside of the [0.0, 1.0] interval to
-     * achieve the effects of "pre-movement" and "delayed movement", however
-     * this might complicate the setup and so not recommended unless essential.
-     */
-    float_64 movePoint;
-
-    /** current number of slides since start of simulation */
-    uint32_t slideCounter = 0u;
-
-    /**
-     * last simulation step with slide
-     * used to prevent multiple slides per simulation step
-     */
-    uint32_t lastSlideStep = 0u;
+        /** true if sliding window is activated
+         *
+         * How long the window is sliding is defined with endSlidingOnStep.
+         */
+        bool slidingWindowEnabled = false;
 
-    //! time step where the sliding window is stopped
-    uint32_t endSlidingOnStep = 0u;
+        /** Defines when to start sliding the window
+         *
+         * A virtual photon starts at t=0 at the lower end (min y) of the global
+         * simulation box in the positive y direction. The window sliding starts at
+         * the moment of time when the particle covers the movePoint ratio of the
+         * global moving window size in the y direction.
+         *
+         * Note that with the moving window enabled, there is an additional "hidden"
+         * row of local domains (and devices simulating them) at the y-front.
+         * Therefore, the global moving window size in the y direction is the global
+         * domain size minus a local domain size (which is required to be the same
+         * for all domains).
+         *
+         * So, in short, the window starts sliding in time required to pass the
+         * distance of movePoint * (global window size in y) when moving with
+         * the speed of light.
+         *
+         * Setting movePoint to 0.0 makes the window start sliding at the start
+         * of a simulation, and setting it to 1.0 makes it start sliding when the
+         * virtual photon reaches the start of the "hidden" row of local domains.
+         * It is permitted to use values outside of the [0.0, 1.0] interval to
+         * achieve the effects of "pre-movement" and "delayed movement", however
+         * this might complicate the setup and so not recommended unless essential.
+         */
+        float_64 movePoint;
 
-public:
+        /** current number of slides since start of simulation */
+        uint32_t slideCounter = 0u;
 
-    /** Set window move point which defines when to start sliding the window
-     *
-     * See declaration of movePoint for a detailed explanation.
-     *
-     * @param point ratio of the global window size
-     */
-    void setMovePoint(float_64 const point)
-    {
-        movePoint = point;
-    }
+        /**
+         * last simulation step with slide
+         * used to prevent multiple slides per simulation step
+         */
+        uint32_t lastSlideStep = 0u;
 
-    /**
-     * Set step where the simulation stops the moving window
-     *
-     * @param step 0 means no sliding window, else sliding is enabled until step is reached.
-     */
-    void setEndSlideOnStep(int32_t step)
-    {
-        // maybe we have a underflow in the cast, this is fine because it results in a very large number
-        const uint32_t maxSlideStep = static_cast<uint32_t>(step);
-        if ( maxSlideStep < lastSlideStep)
-            throw std::runtime_error("It is not allowed to stop the moving window in the past.");
+        //! time step where the sliding window is stopped
+        uint32_t endSlidingOnStep = 0u;
 
-        endSlidingOnStep = maxSlideStep;
+    public:
+        /** Set window move point which defines when to start sliding the window
+         *
+         * See declaration of movePoint for a detailed explanation.
+         *
+         * @param point ratio of the global window size
+         */
+        void setMovePoint(float_64 const point)
+        {
+            movePoint = point;
+        }
 
-        static bool firstCall = true;
-        /* Disable or enable sliding window only in the first call.
-         * Later changes of step will not influence if the sliding window is activated.
+        /**
+         * Set step where the simulation stops the moving window
+         *
+         * @param step 0 means no sliding window, else sliding is enabled until step is reached.
          */
-        if (firstCall && endSlidingOnStep != 0u)
-            slidingWindowEnabled = true;
+        void setEndSlideOnStep(int32_t step)
+        {
+            // maybe we have an underflow in the cast, this is fine because it results in a very large number
+            const uint32_t maxSlideStep = static_cast<uint32_t>(step);
+            if(maxSlideStep < lastSlideStep)
+                throw std::runtime_error("It is not allowed to stop the moving window in the past.");
 
-        firstCall = false;
-    }
+            endSlidingOnStep = maxSlideStep;
 
-    /**
-     * Set the number of already performed moving window slides
-     *
-     * @param slides number of slides
-     * @param currentStep current simulation timestep
-     */
-    void setSlideCounter(uint32_t slides,uint32_t currentStep)
-    {
-        slideCounter = slides;
-        /* ensure that we will not change the slide counter with `incrementSlideCounter()`
-         * in the same time step again
-         */
-        lastSlideStep = currentStep;
-    }
+            static bool firstCall = true;
+            /* Disable or enable sliding window only in the first call.
+             * Later changes of step will not influence if the sliding window is activated.
+             */
+            if(firstCall && endSlidingOnStep != 0u)
+                slidingWindowEnabled = true;
 
-    /**
-     * Return the number of slides since start of simulation.
-     * If slide occurs in \p currentStep, it is included in the result.
-     *
-     * @param currentStep current simulation step
-     * @return number of slides
-     */
-    uint32_t getSlideCounter(uint32_t currentStep)
-    {
-        getCurrentSlideInfo(currentStep, nullptr, nullptr);
-        return slideCounter;
-    }
+            firstCall = false;
+        }
 
-    /**
-     * Returns if sliding window is enabled
-     *
-     * @return true if enabled, false otherwise
-     */
-    bool isEnabled() const
-    {
-        return slidingWindowEnabled;
-    }
+        /**
+         * Set the number of already performed moving window slides
+         *
+         * @param slides number of slides
+         * @param currentStep current simulation timestep
+         */
+        void setSlideCounter(uint32_t slides, uint32_t currentStep)
+        {
+            slideCounter = slides;
+            /* ensure that we will not change the slide counter with `incrementSlideCounter()`
+             * in the same time step again
+             */
+            lastSlideStep = currentStep;
+        }
 
-    /**
-     * Returns if the window can move in the current step
-     *
-     * @return false, if Moving window is activated (isEnabled() == true) but already stopped.
-     *         true if moving windows is enabled and simulation step is smaller than
-     */
-    bool isSlidingWindowActive(const uint32_t currenStep) const
-    {
-        return isEnabled() && currenStep < endSlidingOnStep;
-    }
+        /**
+         * Return the number of slides since start of simulation.
+         * If slide occurs in \p currentStep, it is included in the result.
+         *
+         * @param currentStep current simulation step
+         * @return number of slides
+         */
+        uint32_t getSlideCounter(uint32_t currentStep)
+        {
+            getCurrentSlideInfo(currentStep, nullptr, nullptr);
+            return slideCounter;
+        }
 
-    /**
-     * Return if a slide occurs in the current simulation step.
-     *
-     * @param currentStep current simulation step
-     * @return true if slide in current step, false otherwise
-     */
-    bool slideInCurrentStep(uint32_t currentStep)
-    {
-        bool doSlide = false;
+        /**
+         * Returns if sliding window is enabled
+         *
+         * @return true if enabled, false otherwise
+         */
+        bool isEnabled() const
+        {
+            return slidingWindowEnabled;
+        }
 
-        if (slidingWindowEnabled)
+        /**
+         * Returns if the window can move in the current step
+         *
+         * @return false, if Moving window is activated (isEnabled() == true) but already stopped.
+         *         true if moving window is enabled and simulation step is smaller than endSlidingOnStep
+         */
+        bool isSlidingWindowActive(const uint32_t currenStep) const
         {
-            getCurrentSlideInfo(currentStep, &doSlide, nullptr);
+            return isEnabled() && currenStep < endSlidingOnStep;
         }
 
-        return doSlide;
-    }
+        /**
+         * Return if a slide occurs in the current simulation step.
+         *
+         * @param currentStep current simulation step
+         * @return true if slide in current step, false otherwise
+         */
+        bool slideInCurrentStep(uint32_t currentStep)
+        {
+            bool doSlide = false;
 
-    /**
-     * Return true if this is a 'bottom' GPU (y position is y_size - 1), false otherwise
-     * only set if sliding window is active
-     */
-    bool isBottomGPU(void) const
-    {
-        Mask comm_mask = Environment<simDim>::get().GridController().getCommunicationMask();
-        return !comm_mask.isSet(BOTTOM);
-    }
+            if(slidingWindowEnabled)
+            {
+                getCurrentSlideInfo(currentStep, &doSlide, nullptr);
+            }
 
-    /**
-     * Returns an instance of MovingWindow
-     *
-     * @return an instance
-     */
-    static MovingWindow& getInstance()
-    {
-        static MovingWindow instance;
-        return instance;
-    }
+            return doSlide;
+        }
 
-    /**
-     * Return a window which describes the global and local moving window
-     *
-     * @param currentStep current simulation step
-     * @return moving window
-     */
-    Window getWindow(uint32_t currentStep)
-    {
-        const SubGrid<simDim>& subGrid = Environment<simDim>::get().SubGrid();
+        /**
+         * Return true if this is a 'bottom' GPU (y position is y_size - 1), false otherwise
+         * only set if sliding window is active
+         */
+        bool isBottomGPU(void) const
+        {
+            Mask comm_mask = Environment<simDim>::get().GridController().getCommunicationMask();
+            return !comm_mask.isSet(BOTTOM);
+        }
 
-        /* Without moving window, the selected window spans the whole global domain.
-         * \see https://github.com/ComputationalRadiationPhysics/picongpu/wiki/PIConGPU-domain-definitions
+        /**
+         * Returns an instance of MovingWindow
          *
-         * The window's global offset is therefore zero inside the global domain.
-         * The window's global and local size are equal to the SubGrid quantities.
-         * The local window offset is the offset within the global window which
-         * is equal to the local domain offset of the GPU.
+         * @return an instance
          */
-        Window window;
-        window.localDimensions = subGrid.getLocalDomain();
-        window.globalDimensions = Selection<simDim>(subGrid.getGlobalDomain().size);
-
-        /* moving window can only slide in y direction */
-        if (slidingWindowEnabled)
+        static MovingWindow& getInstance()
         {
-            /* the moving window is smaller than the global domain by exactly one
-             * GPU (local domain size) in moving (y) direction
-             */
-            window.globalDimensions.size.y() -= subGrid.getLocalDomain().size.y();
+            static MovingWindow instance;
+            return instance;
+        }
 
-            float_64 offsetFirstGPU = 0.0;
-            getCurrentSlideInfo(currentStep, nullptr, &offsetFirstGPU);
+        /**
+         * Return a window which describes the global and local moving window
+         *
+         * @param currentStep current simulation step
+         * @return moving window
+         */
+        Window getWindow(uint32_t currentStep)
+        {
+            const SubGrid<simDim>& subGrid = Environment<simDim>::get().SubGrid();
 
-            /* while moving, the windows global offset within the global domain is between 0
-             * and smaller than the local domain's size in y.
+            /* Without moving window, the selected window spans the whole global domain.
+             * \see https://github.com/ComputationalRadiationPhysics/picongpu/wiki/PIConGPU-domain-definitions
+             *
+             * The window's global offset is therefore zero inside the global domain.
+             * The window's global and local size are equal to the SubGrid quantities.
+             * The local window offset is the offset within the global window which
+             * is equal to the local domain offset of the GPU.
              */
-            window.globalDimensions.offset.y() = offsetFirstGPU;
-
-            /* set top/bottom if there are no communication partners
-             * for this GPU in the respective direction */
-            const Mask comm_mask = Environment<simDim>::get().GridController().getCommunicationMask();
-            const bool isTopGpu = !comm_mask.isSet(TOP);
-            const bool isBottomGpu = !comm_mask.isSet(BOTTOM);
+            Window window;
+            window.localDimensions = subGrid.getLocalDomain();
+            window.globalDimensions = Selection<simDim>(subGrid.getGlobalDomain().size);
 
-            if (isTopGpu)
+            /* moving window can only slide in y direction */
+            if(slidingWindowEnabled)
             {
-                /* the windows local offset within the global window is reduced
-                 * by the global window offset within the global domain
+                /* the moving window is smaller than the global domain by exactly one
+                 * GPU (local domain size) in moving (y) direction
                  */
-                window.localDimensions.size.y() -= offsetFirstGPU;
-            }
-            else
-            {
-                window.localDimensions.offset.y() = subGrid.getLocalDomain().offset.y() - offsetFirstGPU;
-                if (isBottomGpu)
+                window.globalDimensions.size.y() -= subGrid.getLocalDomain().size.y();
+
+                float_64 offsetFirstGPU = 0.0;
+                getCurrentSlideInfo(currentStep, nullptr, &offsetFirstGPU);
+
+                /* while moving, the window's global offset within the global domain is between 0
+                 * and smaller than the local domain's size in y.
+                 */
+                window.globalDimensions.offset.y() = offsetFirstGPU;
+
+                /* set top/bottom if there are no communication partners
+                 * for this GPU in the respective direction */
+                const Mask comm_mask = Environment<simDim>::get().GridController().getCommunicationMask();
+                const bool isTopGpu = !comm_mask.isSet(TOP);
+                const bool isBottomGpu = !comm_mask.isSet(BOTTOM);
+
+                if(isTopGpu)
                 {
-                    window.localDimensions.size.y() = offsetFirstGPU;
+                    /* the window's local size within the global window is reduced
+                     * by the global window offset within the global domain
+                     */
+                    window.localDimensions.size.y() -= offsetFirstGPU;
+                }
+                else
+                {
+                    window.localDimensions.offset.y() = subGrid.getLocalDomain().offset.y() - offsetFirstGPU;
+                    if(isBottomGpu)
+                    {
+                        window.localDimensions.size.y() = offsetFirstGPU;
+                    }
                 }
             }
-        }
-
-        return window;
-    }
-
-    /**
-     * Return a window which describes the global and local domain
-     *
-     * @param currentStep current simulation step
-     * @return window over global/local domain
-     */
-    Window getDomainAsWindow(uint32_t currentStep) const
-    {
-        const SubGrid<simDim>& subGrid = Environment<simDim>::get().SubGrid();
-        Window window;
 
-        window.localDimensions = subGrid.getLocalDomain();
-        window.globalDimensions = Selection<simDim>(subGrid.getGlobalDomain().size);
+            return window;
+        }
 
-        return window;
-    }
+        /**
+         * Return a window which describes the global and local domain
+         *
+         * @param currentStep current simulation step
+         * @return window over global/local domain
+         */
+        Window getDomainAsWindow(uint32_t currentStep) const
+        {
+            const SubGrid<simDim>& subGrid = Environment<simDim>::get().SubGrid();
+            Window window;
 
-};
+            window.localDimensions = subGrid.getLocalDomain();
+            window.globalDimensions = Selection<simDim>(subGrid.getGlobalDomain().size);
 
-} //namespace picongpu
+            return window;
+        }
+    };
 
+} // namespace picongpu
diff --git a/include/picongpu/simulation/control/MySimulation.hpp b/include/picongpu/simulation/control/MySimulation.hpp
deleted file mode 100644
index ff9b536d5a..0000000000
--- a/include/picongpu/simulation/control/MySimulation.hpp
+++ /dev/null
@@ -1,731 +0,0 @@
-/* Copyright 2013-2020 Axel Huebl, Felix Schmitt, Heiko Burau, Rene Widera,
- *                     Richard Pausch, Alexander Debus, Marco Garten,
- *                     Benjamin Worpitz, Alexander Grund, Sergei Bastrakov
- *
- * This file is part of PIConGPU.
- *
- * PIConGPU is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * PIConGPU is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with PIConGPU.
- * If not, see <http://www.gnu.org/licenses/>.
- */
-
-#pragma once
-
-#include <pmacc/verify.hpp>
-#include <pmacc/assert.hpp>
-
-#include <algorithm>
-#include <array>
-#include <string>
-#include <vector>
-#include <boost/lexical_cast.hpp>
-#include <boost/mpl/count.hpp>
-
-#include <pmacc/types.hpp>
-#include <pmacc/simulationControl/SimulationHelper.hpp>
-#include "picongpu/simulation_defines.hpp"
-#include "picongpu/versionFormat.hpp"
-#include "picongpu/random/seed/ISeed.hpp"
-
-#include <pmacc/eventSystem/EventSystem.hpp>
-#include <pmacc/dimensions/GridLayout.hpp>
-#include <pmacc/nvidia/memory/MemoryInfo.hpp>
-#include <pmacc/mappings/kernel/MappingDescription.hpp>
-#include "picongpu/simulation/control/MovingWindow.hpp"
-#include <pmacc/mappings/simulation/SubGrid.hpp>
-#include <pmacc/mappings/simulation/GridController.hpp>
-
-#include "picongpu/fields/FieldE.hpp"
-#include "picongpu/fields/FieldB.hpp"
-#include "picongpu/fields/FieldJ.hpp"
-#include "picongpu/fields/FieldTmp.hpp"
-#include "picongpu/fields/MaxwellSolver/Solvers.hpp"
-#include "picongpu/fields/MaxwellSolver/YeePML/Field.hpp"
-#include "picongpu/fields/background/cellwiseOperation.hpp"
-#include "picongpu/initialization/IInitPlugin.hpp"
-#include "picongpu/initialization/ParserGridDistribution.hpp"
-#include "picongpu/particles/Manipulate.hpp"
-#include "picongpu/particles/manipulators/manipulators.hpp"
-#include "picongpu/particles/filter/filter.hpp"
-#include "picongpu/particles/flylite/NonLTE.tpp"
-#include "picongpu/simulation/control/DomainAdjuster.hpp"
-#include "picongpu/simulation/stage/Bremsstrahlung.hpp"
-#include "picongpu/simulation/stage/CurrentBackground.hpp"
-#include "picongpu/simulation/stage/CurrentDeposition.hpp"
-#include "picongpu/simulation/stage/CurrentInterpolationAndAdditionToEMF.hpp"
-#include "picongpu/simulation/stage/CurrentReset.hpp"
-#include "picongpu/simulation/stage/FieldBackground.hpp"
-#include "picongpu/simulation/stage/MomentumBackup.hpp"
-#include "picongpu/simulation/stage/ParticleIonization.hpp"
-#include "picongpu/simulation/stage/ParticlePush.hpp"
-#include "picongpu/simulation/stage/PopulationKinetics.hpp"
-#include "picongpu/simulation/stage/SynchrotronRadiation.hpp"
-#include <pmacc/random/methods/methods.hpp>
-#include <pmacc/random/RNGProvider.hpp>
-
-#if( PMACC_CUDA_ENABLED == 1 )
-#   include "picongpu/particles/bremsstrahlung/ScaledSpectrum.hpp"
-#   include "picongpu/particles/bremsstrahlung/PhotonEmissionAngle.hpp"
-#endif
-
-#include "picongpu/particles/synchrotronPhotons/SynchrotronFunctions.hpp"
-
-#include <pmacc/nvidia/reduce/Reduce.hpp>
-#include <pmacc/memory/boxes/DataBoxDim1Access.hpp>
-#include <pmacc/nvidia/functors/Add.hpp>
-#include <pmacc/nvidia/functors/Sub.hpp>
-
-#include <pmacc/meta/conversion/SeqToMap.hpp>
-#include <pmacc/meta/conversion/TypeToPointerPair.hpp>
-
-#include <pmacc/meta/ForEach.hpp>
-#include "picongpu/particles/ParticlesFunctors.hpp"
-#include "picongpu/particles/InitFunctors.hpp"
-#if( PMACC_CUDA_ENABLED == 1 )
-#   include <pmacc/particles/memory/buffers/MallocMCBuffer.hpp>
-#endif
-#include <pmacc/particles/traits/FilterByFlag.hpp>
-#include <pmacc/particles/traits/FilterByIdentifier.hpp>
-#include "picongpu/particles/traits/HasIonizersWithRNG.hpp"
-#include <pmacc/particles/IdProvider.hpp>
-
-#include <boost/mpl/int.hpp>
-#include <memory>
-#include <functional>
-
-
-namespace picongpu
-{
-using namespace pmacc;
-
-/**
- * Global simulation controller class.
- *
- * Initialises simulation data and defines the simulation steps
- * for each iteration.
- *
- * @tparam DIM the dimension (2-3) for the simulation
- */
-class MySimulation : public SimulationHelper<simDim>
-{
-public:
-
-    /**
-     * Constructor
-     */
-    MySimulation() :
-    myFieldSolver(nullptr),
-    cellDescription(nullptr),
-    initialiserController(nullptr),
-    slidingWindow(false),
-    windowMovePoint(0.0),
-    endSlidingOnStep(-1),
-    showVersionOnce(false)
-    {
-    }
-
-    virtual void pluginRegisterHelp(po::options_description& desc)
-    {
-        SimulationHelper<simDim>::pluginRegisterHelp(desc);
-        desc.add_options()
-            ("versionOnce", po::value<bool>(&showVersionOnce)->zero_tokens(), "print version information once and start")
-
-            ("devices,d", po::value<std::vector<uint32_t> > (&devices)->multitoken(), "number of devices in each dimension")
-
-            ("grid,g", po::value<std::vector<uint32_t> > (&gridSize)->multitoken(),
-             "size of the simulation grid")
-
-            ("gridDist", po::value<std::vector<std::string> > (&gridDistribution)->multitoken(),
-             "Regex to describe the static distribution of the cells for each device,"
-             "default: equal distribution over all devices\n"
-             "  example:\n"
-             "    -d 2 4 1\n"
-             "    -g 128 192 12\n"
-             "    --gridDist \"64{2}\" \"64,32{2},64\"\n")
-
-            ("periodic", po::value<std::vector<uint32_t> > (&periodic)->multitoken(),
-             "specifying whether the grid is periodic (1) or not (0) in each dimension, default: no periodic dimensions")
-
-            ("moving,m", po::value<bool>(&slidingWindow)->zero_tokens(), "enable sliding/moving window")
-            /* For now we still use the compile-time movePoint variable to set
-             * the default value and provide backward compatibility
-             */
-            ("windowMovePoint", po::value<float_64>(&windowMovePoint)->default_value(movePoint),
-                "ratio of the global window size in y which defines when to "
-                "start sliding the window. "
-                "The window starts sliding at the time required to pass the "
-                "distance of windowMovePoint * (global window size in y) "
-                "when moving with the speed of light")
-            ("stopWindow", po::value<int32_t>(&endSlidingOnStep)->default_value(-1),
-                "stops the window at stimulation step, "
-                "-1 means that window is never stopping")
-            ("autoAdjustGrid", po::value<bool>(&autoAdjustGrid)->default_value(true),
-                "auto adjust the grid size if PIConGPU conditions are not fulfilled");
-    }
-
-    std::string pluginGetName() const
-    {
-        return "PIConGPU";
-    }
-
-    virtual void pluginLoad()
-    {
-        //fill periodic with 0
-        while (periodic.size() < 3)
-            periodic.push_back(0);
-
-        // check on correct number of devices. fill with default value 1 for missing dimensions
-        if (devices.size() > 3)
-        {
-            std::cerr << "Invalid number of devices.\nuse [-d dx=1 dy=1 dz=1]" << std::endl;
-        }
-        else
-            while (devices.size() < 3)
-                devices.push_back(1);
-
-        // check on correct grid size. fill with default grid size value 1 for missing 3. dimension
-        if (gridSize.size() < 2 || gridSize.size() > 3)
-        {
-            std::cerr << "Invalid or missing grid size.\nuse -g width height [depth=1]" << std::endl;
-        }
-        else
-            if (gridSize.size() == 2)
-            gridSize.push_back(1);
-
-        if (slidingWindow && devices[1] == 1)
-        {
-            std::cerr << "Invalid configuration. Can't use moving window with one device in Y direction" << std::endl;
-        }
-
-        DataSpace<simDim> gridSizeGlobal;
-        DataSpace<simDim> gpus;
-        DataSpace<simDim> isPeriodic;
-
-        for (uint32_t i = 0; i < simDim; ++i)
-        {
-            gridSizeGlobal[i] = gridSize[i];
-            gpus[i] = devices[i];
-            isPeriodic[i] = periodic[i];
-        }
-
-        Environment<simDim>::get().initDevices(gpus, isPeriodic);
-        pmacc::GridController< simDim > & gc = pmacc::Environment<simDim>::get().GridController();
-
-        DataSpace<simDim> myGPUpos(gc.getPosition());
-
-        if( gc.getGlobalRank() == 0 )
-        {
-            if( showVersionOnce )
-            {
-                void( getSoftwareVersions( std::cout ) );
-            }
-        }
-
-        // calculate the number of local grid cells and
-        // the local cell offset to the global box
-        for (uint32_t dim = 0; dim < gridDistribution.size() && dim < simDim; ++dim)
-        {
-            // parse string
-            ParserGridDistribution parserGD(gridDistribution.at(dim));
-
-            // verify number of blocks and devices in dimension match
-            parserGD.verifyDevices(gpus[dim]);
-
-            // calculate local grid points & offset
-            gridSizeLocal[dim] = parserGD.getLocalSize(myGPUpos[dim]);
-        }
-        // by default: use an equal distributed box for all omitted params
-        for (uint32_t dim = gridDistribution.size(); dim < simDim; ++dim)
-        {
-            gridSizeLocal[dim] = gridSizeGlobal[dim] / gpus[dim];
-        }
-
-        DataSpace<simDim> gridOffset;
-
-        DomainAdjuster domainAdjuster(
-            gpus,
-            myGPUpos,
-            isPeriodic,
-            slidingWindow
-        );
-
-        if(!autoAdjustGrid)
-            domainAdjuster.validateOnly();
-
-        domainAdjuster(gridSizeGlobal, gridSizeLocal, gridOffset);
-
-        Environment<simDim>::get().initGrids(gridSizeGlobal, gridSizeLocal, gridOffset);
-
-        if( !slidingWindow )
-        {
-            windowMovePoint = 0.0;
-            endSlidingOnStep = 0;
-        }
-        MovingWindow::getInstance().setMovePoint(windowMovePoint);
-        MovingWindow::getInstance().setEndSlideOnStep(endSlidingOnStep);
-
-        log<picLog::DOMAINS > ("rank %1%; localsize %2%; localoffset %3%;") %
-            myGPUpos.toString() % gridSizeLocal.toString() % gridOffset.toString();
-
-        SimulationHelper<simDim>::pluginLoad();
-
-        GridLayout<simDim> layout(gridSizeLocal, GuardSize::toRT() * SuperCellSize::toRT());
-        cellDescription = new MappingDesc(layout.getDataSpace(), DataSpace<simDim>(GuardSize::toRT()));
-
-        if (gc.getGlobalRank() == 0)
-        {
-            if (MovingWindow::getInstance().isEnabled())
-                log<picLog::PHYSICS > ("Sliding Window is ON");
-            else
-                log<picLog::PHYSICS > ("Sliding Window is OFF");
-        }
-    }
-
-    virtual void pluginUnload()
-    {
-        DataConnector &dc = Environment<>::get().DataConnector();
-
-        SimulationHelper<simDim>::pluginUnload();
-
-        __delete(myFieldSolver);
-
-        /** unshare all registered ISimulationData sets
-         *
-         * @todo can be removed as soon as our Environment learns to shutdown in
-         *       a distinct order, e.g. DataConnector before CUDA context
-         */
-        dc.clean();
-
-        __delete(cellDescription);
-    }
-
-    void notify(uint32_t)
-    {
-
-    }
-
-    virtual void init()
-    {
-        namespace nvmem = pmacc::nvidia::memory;
-
-        DataConnector &dc = Environment<>::get().DataConnector();
-        initFields(dc);
-
-        // create field solver
-        this->myFieldSolver = new fields::Solver(*cellDescription);
-
-        // Initialize random number generator and synchrotron functions, if there are synchrotron or bremsstrahlung Photons
-        using AllSynchrotronPhotonsSpecies = typename pmacc::particles::traits::FilterByFlag<
-            VectorAllSpecies,
-            synchrotronPhotons<>
-        >::type;
-        using AllBremsstrahlungPhotonsSpecies = typename pmacc::particles::traits::FilterByFlag<
-            VectorAllSpecies,
-            bremsstrahlungPhotons<>
-        >::type;
-
-        // create factory for the random number generator
-        const uint32_t userSeed = random::seed::ISeed< random::SeedGenerator >{}();
-        const uint32_t seed = std::hash<std::string>{}(
-            std::to_string( userSeed )
-        );
-
-        using RNGFactory = pmacc::random::RNGProvider< simDim, random::Generator >;
-        auto rngFactory = pmacc::memory::makeUnique< RNGFactory >(
-            Environment<simDim>::get().SubGrid().getLocalDomain().size
-        );
-        if (Environment<simDim>::get().GridController().getGlobalRank() == 0)
-        {
-            log<picLog::PHYSICS >("used Random Number Generator: %1% seed: %2%") %
-                rngFactory->getName() %
-                userSeed;
-        }
-
-        // init and share random number generator
-        pmacc::GridController<simDim>& gridCon = pmacc::Environment<simDim>::get().GridController();
-        rngFactory->init( gridCon.getScalarPosition() ^ seed );
-        dc.consume( std::move( rngFactory ) );
-
-        // Initialize synchrotron functions, if there are synchrotron photon species
-        if(!bmpl::empty<AllSynchrotronPhotonsSpecies>::value)
-        {
-            this->synchrotronFunctions.init();
-        }
-#if( PMACC_CUDA_ENABLED == 1 )
-        // Initialize bremsstrahlung lookup tables, if there are species containing bremsstrahlung photons
-        if(!bmpl::empty<AllBremsstrahlungPhotonsSpecies>::value)
-        {
-            meta::ForEach<
-                AllBremsstrahlungPhotonsSpecies,
-                particles::bremsstrahlung::FillScaledSpectrumMap< bmpl::_1 >
-            > fillScaledSpectrumMap;
-            fillScaledSpectrumMap(this->scaledBremsstrahlungSpectrumMap);
-
-            this->bremsstrahlungPhotonAngle.init();
-        }
-
-        /* Create an empty allocator. This one is resized after all exchanges
-         * for particles are created */
-        deviceHeap.reset(new DeviceHeap(0));
-#endif
-
-        /* Allocate helper fields for FLYlite population kinetics for atomic physics
-         * (histograms, rate matrix, etc.)
-         */
-        using AllFlyLiteIons = typename pmacc::particles::traits::FilterByFlag<
-            VectorAllSpecies,
-            populationKinetics<>
-        >::type;
-
-        meta::ForEach<
-            AllFlyLiteIons,
-            particles::CallPopulationKineticsInit< bmpl::_1 >,
-            bmpl::_1
-        > initPopulationKinetics;
-        initPopulationKinetics(
-            gridSizeLocal
-        );
-
-        // Allocate and initialize particle species with all left-over memory below
-        meta::ForEach< VectorAllSpecies, particles::CreateSpecies<bmpl::_1> > createSpeciesMemory;
-        createSpeciesMemory( deviceHeap, cellDescription );
-
-        size_t freeGpuMem(0);
-        Environment<>::get().MemoryInfo().getMemoryInfo(&freeGpuMem);
-        if(freeGpuMem < reservedGpuMemorySize)
-        {
-            pmacc::log< picLog::MEMORY > ("%1% MiB free memory < %2% MiB required reserved memory")
-                % (freeGpuMem / 1024 / 1024) % (reservedGpuMemorySize / 1024 / 1024) ;
-            std::stringstream msg;
-            msg << "Cannot reserve "
-                << (reservedGpuMemorySize / 1024 / 1024) << " MiB as there is only "
-                << (freeGpuMem / 1024 / 1024) << " MiB free device memory left";
-            throw std::runtime_error(msg.str());
-        }
-
-#if( PMACC_CUDA_ENABLED == 1 )
-        size_t heapSize = freeGpuMem - reservedGpuMemorySize;
-
-        if( Environment<>::get().MemoryInfo().isSharedMemoryPool() )
-        {
-            heapSize /= 2;
-            log<picLog::MEMORY > ("Shared RAM between GPU and host detected - using only half of the 'device' memory.");
-        }
-        else
-            log<picLog::MEMORY > ("RAM is NOT shared between GPU and host.");
-
-        // initializing the heap for particles
-        deviceHeap->destructiveResize(heapSize);
-        auto mallocMCBuffer = pmacc::memory::makeUnique< MallocMCBuffer<DeviceHeap> >( deviceHeap );
-        dc.consume( std::move( mallocMCBuffer ) );
-#endif
-        meta::ForEach< VectorAllSpecies, particles::LogMemoryStatisticsForSpecies<bmpl::_1> > logMemoryStatisticsForSpecies;
-        logMemoryStatisticsForSpecies( deviceHeap );
-
-        Environment<>::get().MemoryInfo().getMemoryInfo(&freeGpuMem);
-        log<picLog::MEMORY > ("free mem after all mem is allocated %1% MiB") % (freeGpuMem / 1024 / 1024);
-
-        IdProvider<simDim>::init();
-
-#if( PMACC_CUDA_ENABLED == 1 )
-        /* add CUDA streams to the StreamController for concurrent execution */
-        Environment<>::get().StreamController().addStreams(6);
-#endif
-    }
-
-    virtual uint32_t fillSimulation()
-    {
-        /* assume start (restart in initialiserController might change that) */
-        uint32_t step = 0;
-
-        /* set slideCounter properties for PIConGPU MovingWindow: assume start
-         * (restart in initialiserController might change this again)
-         */
-        MovingWindow::getInstance().setSlideCounter(0, 0);
-        /* Update MPI domain decomposition: will also update SubGrid domain
-         * information such as local offsets in y-direction
-         */
-        GridController<simDim> &gc = Environment<simDim>::get().GridController();
-        gc.setStateAfterSlides(0);
-
-        DataConnector &dc = Environment<>::get().DataConnector();
-        auto fieldE = dc.get< FieldE >( FieldE::getName(), true );
-        auto fieldB = dc.get< FieldB >( FieldB::getName(), true );
-
-        /* fill all objects registed in DataConnector */
-        if (initialiserController)
-        {
-            initialiserController->printInformation();
-            if (this->restartRequested)
-            {
-                /* we do not require '--checkpoint.restart.step' if a master checkpoint file is found */
-                if (this->restartStep < 0)
-                {
-                    std::vector<uint32_t> checkpoints = readCheckpointMasterFile();
-
-                    if (checkpoints.empty())
-                    {
-                        throw std::runtime_error(
-                            "Restart failed. You must provide the '--checkpoint.restart.step' argument. See picongpu --help."
-                        );
-                    } else
-                        this->restartStep = checkpoints.back();
-                }
-
-                initialiserController->restart((uint32_t)this->restartStep, this->restartDirectory);
-                step = this->restartStep;
-
-                /** restore background fields in GUARD
-                 *
-                 * loads the outer GUARDS of the global domain for absorbing/open boundary condtions
-                 *
-                 * @todo as soon as we add GUARD fields to the checkpoint data, e.g. for PML boundary
-                 *       conditions, this section needs to be removed
-                 */
-                cellwiseOperation::CellwiseOperation< GUARD > guardBGField( *cellDescription );
-                namespace nvfct = pmacc::nvidia::functors;
-                guardBGField( fieldE, nvfct::Add(), FieldBackgroundE( fieldE->getUnit() ),
-                              step, FieldBackgroundE::InfluenceParticlePusher );
-                guardBGField( fieldB, nvfct::Add(), FieldBackgroundB( fieldB->getUnit() ),
-                              step, FieldBackgroundB::InfluenceParticlePusher );
-            }
-            else
-            {
-                initialiserController->init();
-                meta::ForEach< particles::InitPipeline, particles::CallFunctor<bmpl::_1> > initSpecies;
-                initSpecies( step );
-            }
-        }
-
-        size_t freeGpuMem(0u);
-        Environment<>::get().MemoryInfo().getMemoryInfo(&freeGpuMem);
-        log<picLog::MEMORY > ("free mem after all particles are initialized %1% MiB") % (freeGpuMem / 1024 / 1024);
-
-        // generate valid GUARDS (overwrite)
-        EventTask eRfieldE = fieldE->asyncCommunication(__getTransactionEvent());
-        __setTransactionEvent(eRfieldE);
-        EventTask eRfieldB = fieldB->asyncCommunication(__getTransactionEvent());
-        __setTransactionEvent(eRfieldB);
-
-        dc.releaseData( FieldE::getName() );
-        dc.releaseData( FieldB::getName() );
-
-        return step;
-    }
-
-    /**
-     * Run one simulation step.
-     *
-     * @param currentStep iteration number of the current step
-     */
-    virtual void runOneStep(uint32_t currentStep)
-    {
-        using namespace simulation::stage;
-        MomentumBackup{ }( currentStep );
-        ParticleIonization{ *cellDescription }( currentStep );
-        PopulationKinetics{ }( currentStep );
-        SynchrotronRadiation{
-            *cellDescription,
-            synchrotronFunctions
-        }( currentStep );
-#if( PMACC_CUDA_ENABLED == 1 )
-        Bremsstrahlung{
-            *cellDescription,
-            scaledBremsstrahlungSpectrumMap,
-            bremsstrahlungPhotonAngle
-        }( currentStep );
-#endif
-        EventTask commEvent;
-        ParticlePush{ }( currentStep, commEvent );
-        FieldBackground{ *cellDescription }( currentStep, nvidia::functors::Sub( ) );
-        myFieldSolver->update_beforeCurrent( currentStep );
-        CurrentReset{ }( currentStep );
-        __setTransactionEvent( commEvent );
-        CurrentBackground{ *cellDescription }( currentStep );
-        CurrentDeposition{ }( currentStep );
-        CurrentInterpolationAndAdditionToEMF{ }( currentStep );
-        myFieldSolver->update_afterCurrent( currentStep );
-    }
-
-    virtual void movingWindowCheck(uint32_t currentStep)
-    {
-        if (MovingWindow::getInstance().slideInCurrentStep(currentStep))
-        {
-            slide(currentStep);
-        }
-
-        /* do not double-add background field on restarts
-         * (contained in checkpoint data)
-         */
-        bool addBgFields = true;
-        if( this->restartRequested )
-        {
-            if( this->restartStep == int32_t(currentStep) )
-                addBgFields = false;
-        }
-
-        if( addBgFields )
-        {
-            /** add background field: the movingWindowCheck is just at the start
-             * of a time step before all the plugins are called (and the step
-             * itself is performed for this time step).
-             * Hence the background field is visible for all plugins
-             * in between the time steps.
-             */
-            simulation::stage::FieldBackground{ *cellDescription }(
-                currentStep, nvidia::functors::Add( )
-            );
-        }
-    }
-
-    virtual void resetAll(uint32_t currentStep)
-    {
-        resetFields( currentStep );
-        meta::ForEach<
-            VectorAllSpecies,
-            particles::CallReset< bmpl::_1 >
-        > resetParticles;
-        resetParticles( currentStep );
-    }
-
-    void slide(uint32_t currentStep)
-    {
-        GridController<simDim>& gc = Environment<simDim>::get().GridController();
-
-        if (gc.slide())
-        {
-            log<picLog::SIMULATION_STATE > ("slide in step %1%") % currentStep;
-            resetAll(currentStep);
-            initialiserController->slide(currentStep);
-            meta::ForEach< particles::InitPipeline, particles::CallFunctor< bmpl::_1 > > initSpecies;
-            initSpecies( currentStep );
-        }
-    }
-
-    virtual void setInitController(IInitPlugin *initController)
-    {
-
-        PMACC_ASSERT(initController != nullptr);
-        this->initialiserController = initController;
-    }
-
-    MappingDesc* getMappingDescription()
-    {
-        return cellDescription;
-    }
-
-protected:
-
-    std::shared_ptr<DeviceHeap> deviceHeap;
-
-    fields::Solver* myFieldSolver;
-
-#if( PMACC_CUDA_ENABLED == 1 )
-    // creates lookup tables for the bremsstrahlung effect
-    // map<atomic number, scaled bremsstrahlung spectrum>
-    std::map<float_X, particles::bremsstrahlung::ScaledSpectrum> scaledBremsstrahlungSpectrumMap;
-    particles::bremsstrahlung::GetPhotonAngle bremsstrahlungPhotonAngle;
-#endif
-
-    // Synchrotron functions (used in synchrotronPhotons module)
-    particles::synchrotronPhotons::SynchrotronFunctions synchrotronFunctions;
-
-    // output classes
-
-    IInitPlugin* initialiserController;
-
-    MappingDesc* cellDescription;
-
-    // layout parameter
-    std::vector<uint32_t> devices;
-    std::vector<uint32_t> gridSize;
-    /** Without guards */
-    DataSpace<simDim> gridSizeLocal;
-    std::vector<uint32_t> periodic;
-
-    std::vector<std::string> gridDistribution;
-
-    bool slidingWindow;
-    int32_t endSlidingOnStep;
-    float_64 windowMovePoint;
-    bool showVersionOnce;
-    bool autoAdjustGrid = true;
-
-private:
-
-    void initFields( DataConnector& dataConnector )
-    {
-        using pmacc::memory::makeUnique;
-        auto fieldB = makeUnique< FieldB >( *cellDescription );
-        dataConnector.consume( std::move( fieldB ) );
-        auto fieldE = makeUnique< FieldE >( *cellDescription );
-        dataConnector.consume( std::move( fieldE ) );
-        auto fieldJ = makeUnique< FieldJ >( *cellDescription );
-        dataConnector.consume( std::move( fieldJ ) );
-        for( uint32_t slot = 0; slot < fieldTmpNumSlots; ++slot)
-        {
-            auto fieldTmp = makeUnique< FieldTmp >( *cellDescription, slot );
-            dataConnector.consume( std::move( fieldTmp ) );
-        }
-    }
-
-    /** Reset all fields
-     *
-     * @param currentStep iteration number of the current step
-     */
-    void resetFields( uint32_t const currentStep )
-    {
-        auto resetField = [currentStep]( std::string const name )
-        {
-            DataConnector & dc = Environment<>::get().DataConnector();
-            auto const fieldExists = dc.hasId( name );
-            if( fieldExists )
-            {
-                using FieldHelper = SimulationFieldHelper< MappingDesc >;
-                auto field = std::dynamic_pointer_cast< FieldHelper >(
-                    dc.get< ISimulationData >( name, true )
-                );
-                if( field )
-                    field->reset( currentStep );
-                dc.releaseData( name );
-            }
-        };
-
-        /* @todo for now the list of fields is hardcoded here, a more generic
-         * solution would require changes to design of DataConnector.
-         * FieldJ and FieldTmp are effectively cleared each time iteration and
-         * so do not need a reset.
-         */
-        std::array< std::string, 4 > const fieldNames{ {
-            FieldE::getName(),
-            FieldB::getName(),
-            fields::maxwellSolver::yeePML::FieldE::getName(),
-            fields::maxwellSolver::yeePML::FieldB::getName()
-        } };
-        std::for_each(
-            fieldNames.cbegin(),
-            fieldNames.cend(),
-            resetField
-        );
-    }
-
-};
-} /* namespace picongpu */
-
-#include "picongpu/fields/Fields.tpp"
-#include "picongpu/particles/synchrotronPhotons/SynchrotronFunctions.tpp"
-
-#if( PMACC_CUDA_ENABLED == 1 )
-#   include "picongpu/particles/bremsstrahlung/Bremsstrahlung.tpp"
-#   include "picongpu/particles/bremsstrahlung/ScaledSpectrum.tpp"
-#endif
diff --git a/include/picongpu/simulation/control/Simulation.hpp b/include/picongpu/simulation/control/Simulation.hpp
new file mode 100644
index 0000000000..e72140f204
--- /dev/null
+++ b/include/picongpu/simulation/control/Simulation.hpp
@@ -0,0 +1,719 @@
+/* Copyright 2013-2021 Axel Huebl, Felix Schmitt, Heiko Burau, Rene Widera,
+ *                     Richard Pausch, Alexander Debus, Marco Garten,
+ *                     Benjamin Worpitz, Alexander Grund, Sergei Bastrakov
+ *
+ * This file is part of PIConGPU.
+ *
+ * PIConGPU is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * PIConGPU is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with PIConGPU.
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include <pmacc/verify.hpp>
+#include <pmacc/assert.hpp>
+
+#include <algorithm>
+#include <array>
+#include <string>
+#include <vector>
+#include <boost/lexical_cast.hpp>
+#include <boost/mpl/count.hpp>
+
+#include <pmacc/types.hpp>
+#include <pmacc/simulationControl/SimulationHelper.hpp>
+#include "picongpu/simulation_defines.hpp"
+#include "picongpu/versionFormat.hpp"
+#include "picongpu/random/seed/ISeed.hpp"
+
+#include <pmacc/eventSystem/EventSystem.hpp>
+#include <pmacc/dimensions/GridLayout.hpp>
+#include <pmacc/nvidia/memory/MemoryInfo.hpp>
+#include <pmacc/mappings/kernel/MappingDescription.hpp>
+#include "picongpu/simulation/control/MovingWindow.hpp"
+#include <pmacc/mappings/simulation/SubGrid.hpp>
+#include <pmacc/mappings/simulation/GridController.hpp>
+
+#include "picongpu/fields/FieldE.hpp"
+#include "picongpu/fields/FieldB.hpp"
+#include "picongpu/fields/FieldJ.hpp"
+#include "picongpu/fields/FieldTmp.hpp"
+#include "picongpu/fields/MaxwellSolver/Solvers.hpp"
+#include "picongpu/fields/MaxwellSolver/YeePML/Field.hpp"
+#include "picongpu/fields/background/cellwiseOperation.hpp"
+#include "picongpu/initialization/IInitPlugin.hpp"
+#include "picongpu/initialization/ParserGridDistribution.hpp"
+#include "picongpu/particles/Manipulate.hpp"
+#include "picongpu/particles/manipulators/manipulators.hpp"
+#include "picongpu/particles/filter/filter.hpp"
+#include "picongpu/particles/flylite/NonLTE.tpp"
+#include "picongpu/simulation/control/DomainAdjuster.hpp"
+#include "picongpu/simulation/stage/Bremsstrahlung.hpp"
+#include "picongpu/simulation/stage/CurrentBackground.hpp"
+#include "picongpu/simulation/stage/CurrentDeposition.hpp"
+#include "picongpu/simulation/stage/CurrentInterpolationAndAdditionToEMF.hpp"
+#include "picongpu/simulation/stage/CurrentReset.hpp"
+#include "picongpu/simulation/stage/FieldBackground.hpp"
+#include "picongpu/simulation/stage/MomentumBackup.hpp"
+#include "picongpu/simulation/stage/ParticleIonization.hpp"
+#include "picongpu/simulation/stage/ParticlePush.hpp"
+#include "picongpu/simulation/stage/PopulationKinetics.hpp"
+#include "picongpu/simulation/stage/SynchrotronRadiation.hpp"
+#include <pmacc/random/methods/methods.hpp>
+#include <pmacc/random/RNGProvider.hpp>
+
+#if(PMACC_CUDA_ENABLED == 1)
+#    include "picongpu/particles/bremsstrahlung/ScaledSpectrum.hpp"
+#    include "picongpu/particles/bremsstrahlung/PhotonEmissionAngle.hpp"
+#endif
+
+#include "picongpu/particles/synchrotronPhotons/SynchrotronFunctions.hpp"
+
+#include <pmacc/nvidia/reduce/Reduce.hpp>
+#include <pmacc/memory/boxes/DataBoxDim1Access.hpp>
+#include <pmacc/nvidia/functors/Add.hpp>
+#include <pmacc/nvidia/functors/Sub.hpp>
+
+#include <pmacc/meta/conversion/SeqToMap.hpp>
+#include <pmacc/meta/conversion/TypeToPointerPair.hpp>
+
+#include <pmacc/meta/ForEach.hpp>
+#include "picongpu/particles/ParticlesFunctors.hpp"
+#include "picongpu/particles/InitFunctors.hpp"
+#include <pmacc/particles/memory/buffers/MallocMCBuffer.hpp>
+#include <pmacc/particles/traits/FilterByFlag.hpp>
+#include <pmacc/particles/traits/FilterByIdentifier.hpp>
+#include <pmacc/particles/IdProvider.hpp>
+
+#include <boost/mpl/int.hpp>
+#include <memory>
+#include <functional>
+
+
+namespace picongpu
+{
+    using namespace pmacc;
+
+    /**
+     * Global simulation controller class.
+     *
+     * Initialises simulation data and defines the simulation steps
+     * for each iteration.
+     *
+     * The simulation dimension (2-3) is fixed at compile time via simDim.
+     */
+    class Simulation : public SimulationHelper<simDim>
+    {
+    public:
+        /**
+         * Constructor
+         */
+        Simulation()
+            : myFieldSolver(nullptr)
+            , cellDescription(nullptr)
+            , initialiserController(nullptr)
+            , slidingWindow(false)
+            , windowMovePoint(0.0)
+            , endSlidingOnStep(-1)
+            , showVersionOnce(false)
+        {
+        }
+
+        virtual void pluginRegisterHelp(po::options_description& desc)
+        {
+            SimulationHelper<simDim>::pluginRegisterHelp(desc);
+            desc.add_options()(
+                "versionOnce",
+                po::value<bool>(&showVersionOnce)->zero_tokens(),
+                "print version information once and start")
+
+                ("devices,d",
+                 po::value<std::vector<uint32_t>>(&devices)->multitoken(),
+                 "number of devices in each dimension")
+
+                    ("grid,g",
+                     po::value<std::vector<uint32_t>>(&gridSize)->multitoken(),
+                     "size of the simulation grid")
+
+                        ("gridDist",
+                         po::value<std::vector<std::string>>(&gridDistribution)->multitoken(),
+                         "Regex to describe the static distribution of the cells for each device,"
+                         "default: equal distribution over all devices\n"
+                         "  example:\n"
+                         "    -d 2 4 1\n"
+                         "    -g 128 192 12\n"
+                         "    --gridDist \"64{2}\" \"64,32{2},64\"\n")
+
+                            ("periodic",
+                             po::value<std::vector<uint32_t>>(&periodic)->multitoken(),
+                             "specifying whether the grid is periodic (1) or not (0) in each dimension, default: no "
+                             "periodic dimensions")
+
+                                ("moving,m",
+                                 po::value<bool>(&slidingWindow)->zero_tokens(),
+                                 "enable sliding/moving window")
+                /* For now we still use the compile-time movePoint variable to set
+                 * the default value and provide backward compatibility
+                 */
+                ("windowMovePoint",
+                 po::value<float_64>(&windowMovePoint)->default_value(movePoint),
+                 "ratio of the global window size in y which defines when to "
+                 "start sliding the window. "
+                 "The window starts sliding at the time required to pass the "
+                 "distance of windowMovePoint * (global window size in y) "
+                 "when moving with the speed of light")(
+                    "stopWindow",
+                    po::value<int32_t>(&endSlidingOnStep)->default_value(-1),
+                    "stops the window at stimulation step, "
+                    "-1 means that window is never stopping")(
+                    "autoAdjustGrid",
+                    po::value<bool>(&autoAdjustGrid)->default_value(true),
+                    "auto adjust the grid size if PIConGPU conditions are not fulfilled");
+        }
+
+        std::string pluginGetName() const
+        {
+            return "PIConGPU";
+        }
+
+        virtual void pluginLoad()
+        {
+            // fill periodic with 0
+            while(periodic.size() < 3)
+                periodic.push_back(0);
+
+            // check on correct number of devices. fill with default value 1 for missing dimensions
+            if(devices.size() > 3)
+            {
+                std::cerr << "Invalid number of devices.\nuse [-d dx=1 dy=1 dz=1]" << std::endl;
+            }
+            else
+                while(devices.size() < 3)
+                    devices.push_back(1);
+
+            // check on correct grid size. fill with default grid size value 1 for missing 3. dimension
+            if(gridSize.size() < 2 || gridSize.size() > 3)
+            {
+                std::cerr << "Invalid or missing grid size.\nuse -g width height [depth=1]" << std::endl;
+            }
+            else if(gridSize.size() == 2)
+                gridSize.push_back(1);
+
+            if(slidingWindow && devices[1] == 1)
+            {
+                std::cerr << "Invalid configuration. Can't use moving window with one device in Y direction"
+                          << std::endl;
+            }
+
+            DataSpace<simDim> gridSizeGlobal;
+            DataSpace<simDim> gpus;
+            DataSpace<simDim> isPeriodic;
+
+            for(uint32_t i = 0; i < simDim; ++i)
+            {
+                gridSizeGlobal[i] = gridSize[i];
+                gpus[i] = devices[i];
+                isPeriodic[i] = periodic[i];
+            }
+
+            Environment<simDim>::get().initDevices(gpus, isPeriodic);
+            pmacc::GridController<simDim>& gc = pmacc::Environment<simDim>::get().GridController();
+
+            DataSpace<simDim> myGPUpos(gc.getPosition());
+
+            if(gc.getGlobalRank() == 0)
+            {
+                if(showVersionOnce)
+                {
+                    void(getSoftwareVersions(std::cout));
+                }
+            }
+
+            // calculate the number of local grid cells and
+            // the local cell offset to the global box
+            for(uint32_t dim = 0; dim < gridDistribution.size() && dim < simDim; ++dim)
+            {
+                // parse string
+                ParserGridDistribution parserGD(gridDistribution.at(dim));
+
+                // verify number of blocks and devices in dimension match
+                parserGD.verifyDevices(gpus[dim]);
+
+                // calculate local grid points & offset
+                gridSizeLocal[dim] = parserGD.getLocalSize(myGPUpos[dim]);
+            }
+            // by default: use an equally distributed box for all omitted params
+            for(uint32_t dim = gridDistribution.size(); dim < simDim; ++dim)
+            {
+                gridSizeLocal[dim] = gridSizeGlobal[dim] / gpus[dim];
+            }
+
+            DataSpace<simDim> gridOffset;
+
+            DomainAdjuster domainAdjuster(gpus, myGPUpos, isPeriodic, slidingWindow);
+
+            if(!autoAdjustGrid)
+                domainAdjuster.validateOnly();
+
+            domainAdjuster(gridSizeGlobal, gridSizeLocal, gridOffset);
+
+            Environment<simDim>::get().initGrids(gridSizeGlobal, gridSizeLocal, gridOffset);
+
+            if(!slidingWindow)
+            {
+                windowMovePoint = 0.0;
+                endSlidingOnStep = 0;
+            }
+            MovingWindow::getInstance().setMovePoint(windowMovePoint);
+            MovingWindow::getInstance().setEndSlideOnStep(endSlidingOnStep);
+
+            log<picLog::DOMAINS>("rank %1%; localsize %2%; localoffset %3%;") % myGPUpos.toString()
+                % gridSizeLocal.toString() % gridOffset.toString();
+
+            SimulationHelper<simDim>::pluginLoad();
+
+            GridLayout<simDim> layout(gridSizeLocal, GuardSize::toRT() * SuperCellSize::toRT());
+            cellDescription = new MappingDesc(layout.getDataSpace(), DataSpace<simDim>(GuardSize::toRT()));
+
+            if(gc.getGlobalRank() == 0)
+            {
+                if(MovingWindow::getInstance().isEnabled())
+                    log<picLog::PHYSICS>("Sliding Window is ON");
+                else
+                    log<picLog::PHYSICS>("Sliding Window is OFF");
+            }
+        }
+
+        virtual void pluginUnload()
+        {
+            DataConnector& dc = Environment<>::get().DataConnector();
+
+            SimulationHelper<simDim>::pluginUnload();
+
+            __delete(myFieldSolver);
+
+            /** unshare all registered ISimulationData sets
+             *
+             * @todo can be removed as soon as our Environment learns to shutdown in
+             *       a distinct order, e.g. DataConnector before CUDA context
+             */
+            dc.clean();
+
+            __delete(cellDescription);
+        }
+
+        void notify(uint32_t)
+        {
+        }
+
+        virtual void init()
+        {
+            namespace nvmem = pmacc::nvidia::memory;
+
+            // This has to be called before initFields()
+            currentInterpolationAndAdditionToEMF.init();
+
+            DataConnector& dc = Environment<>::get().DataConnector();
+            initFields(dc);
+
+            // create field solver
+            this->myFieldSolver = new fields::Solver(*cellDescription);
+
+            // Initialize random number generator and synchrotron functions, if there are synchrotron or bremsstrahlung
+            // Photons
+            using AllSynchrotronPhotonsSpecies =
+                typename pmacc::particles::traits::FilterByFlag<VectorAllSpecies, synchrotronPhotons<>>::type;
+            using AllBremsstrahlungPhotonsSpecies =
+                typename pmacc::particles::traits::FilterByFlag<VectorAllSpecies, bremsstrahlungPhotons<>>::type;
+
+            // create factory for the random number generator
+            const uint32_t userSeed = random::seed::ISeed<random::SeedGenerator>{}();
+            const uint32_t seed = std::hash<std::string>{}(std::to_string(userSeed));
+
+            using RNGFactory = pmacc::random::RNGProvider<simDim, random::Generator>;
+            auto rngFactory = std::make_unique<RNGFactory>(Environment<simDim>::get().SubGrid().getLocalDomain().size);
+            if(Environment<simDim>::get().GridController().getGlobalRank() == 0)
+            {
+                log<picLog::PHYSICS>("used Random Number Generator: %1% seed: %2%") % rngFactory->getName() % userSeed;
+            }
+
+            // init and share random number generator
+            pmacc::GridController<simDim>& gridCon = pmacc::Environment<simDim>::get().GridController();
+            rngFactory->init(gridCon.getScalarPosition() ^ seed);
+            dc.consume(std::move(rngFactory));
+
+            // Initialize synchrotron functions, if there are synchrotron photon species
+            if(!bmpl::empty<AllSynchrotronPhotonsSpecies>::value)
+            {
+                this->synchrotronFunctions.init();
+            }
+#if(PMACC_CUDA_ENABLED == 1)
+            // Initialize bremsstrahlung lookup tables, if there are species containing bremsstrahlung photons
+            if(!bmpl::empty<AllBremsstrahlungPhotonsSpecies>::value)
+            {
+                meta::ForEach<
+                    AllBremsstrahlungPhotonsSpecies,
+                    particles::bremsstrahlung::FillScaledSpectrumMap<bmpl::_1>>
+                    fillScaledSpectrumMap;
+                fillScaledSpectrumMap(this->scaledBremsstrahlungSpectrumMap);
+
+                this->bremsstrahlungPhotonAngle.init();
+            }
+#endif
+
+#if(BOOST_LANG_CUDA || BOOST_COMP_HIP)
+            auto nativeCudaStream = cupla::manager::Stream<cupla::AccDev, cupla::AccStream>::get().stream(0);
+            /* Create an empty allocator. This one is resized after all exchanges
+             * for particles are created */
+            deviceHeap.reset(
+
+                new DeviceHeap(cupla::manager::Device<cupla::AccDev>::get().current(), nativeCudaStream, 0u));
+            cuplaStreamSynchronize(0);
+#endif
+
+            /* Allocate helper fields for FLYlite population kinetics for atomic physics
+             * (histograms, rate matrix, etc.)
+             */
+            using AllFlyLiteIons =
+                typename pmacc::particles::traits::FilterByFlag<VectorAllSpecies, populationKinetics<>>::type;
+
+            meta::ForEach<AllFlyLiteIons, particles::CallPopulationKineticsInit<bmpl::_1>, bmpl::_1>
+                initPopulationKinetics;
+            initPopulationKinetics(gridSizeLocal);
+
+            // Allocate and initialize particle species with all left-over memory below
+            meta::ForEach<VectorAllSpecies, particles::CreateSpecies<bmpl::_1>> createSpeciesMemory;
+            createSpeciesMemory(deviceHeap, cellDescription);
+
+            size_t freeGpuMem(0);
+            Environment<>::get().MemoryInfo().getMemoryInfo(&freeGpuMem);
+            if(freeGpuMem < reservedGpuMemorySize)
+            {
+                pmacc::log<picLog::MEMORY>("%1% MiB free memory < %2% MiB required reserved memory")
+                    % (freeGpuMem / 1024 / 1024) % (reservedGpuMemorySize / 1024 / 1024);
+                std::stringstream msg;
+                msg << "Cannot reserve " << (reservedGpuMemorySize / 1024 / 1024) << " MiB as there is only "
+                    << (freeGpuMem / 1024 / 1024) << " MiB free device memory left";
+                throw std::runtime_error(msg.str());
+            }
+
+#if(BOOST_LANG_CUDA || BOOST_COMP_HIP)
+            size_t heapSize = freeGpuMem - reservedGpuMemorySize;
+
+            if(Environment<>::get().MemoryInfo().isSharedMemoryPool())
+            {
+                heapSize /= 2;
+                log<picLog::MEMORY>(
+                    "Shared RAM between GPU and host detected - using only half of the 'device' memory.");
+            }
+            else
+                log<picLog::MEMORY>("RAM is NOT shared between GPU and host.");
+
+            // initializing the heap for particles
+            deviceHeap->destructiveResize(
+                cupla::manager::Device<cupla::AccDev>::get().current(),
+                nativeCudaStream,
+                heapSize);
+            cuplaStreamSynchronize(0);
+
+            auto mallocMCBuffer = std::make_unique<MallocMCBuffer<DeviceHeap>>(deviceHeap);
+            dc.consume(std::move(mallocMCBuffer));
+
+#endif
+
+            meta::ForEach<VectorAllSpecies, particles::LogMemoryStatisticsForSpecies<bmpl::_1>>
+                logMemoryStatisticsForSpecies;
+            logMemoryStatisticsForSpecies(deviceHeap);
+
+            Environment<>::get().MemoryInfo().getMemoryInfo(&freeGpuMem);
+            log<picLog::MEMORY>("free mem after all mem is allocated %1% MiB") % (freeGpuMem / 1024 / 1024);
+
+            IdProvider<simDim>::init();
+
+#if(BOOST_LANG_CUDA || BOOST_COMP_HIP)
+            /* add CUDA streams to the StreamController for concurrent execution */
+            Environment<>::get().StreamController().addStreams(6);
+#endif
+        }
+
+        virtual uint32_t fillSimulation()
+        {
+            /* assume start (restart in initialiserController might change that) */
+            uint32_t step = 0;
+
+            /* set slideCounter properties for PIConGPU MovingWindow: assume start
+             * (restart in initialiserController might change this again)
+             */
+            MovingWindow::getInstance().setSlideCounter(0, 0);
+            /* Update MPI domain decomposition: will also update SubGrid domain
+             * information such as local offsets in y-direction
+             */
+            GridController<simDim>& gc = Environment<simDim>::get().GridController();
+            gc.setStateAfterSlides(0);
+
+            DataConnector& dc = Environment<>::get().DataConnector();
+            auto fieldE = dc.get<FieldE>(FieldE::getName(), true);
+            auto fieldB = dc.get<FieldB>(FieldB::getName(), true);
+
+            /* fill all objects registered in DataConnector */
+            if(initialiserController)
+            {
+                initialiserController->printInformation();
+                if(this->restartRequested)
+                {
+                    /* we do not require '--checkpoint.restart.step' if a master checkpoint file is found */
+                    if(this->restartStep < 0)
+                    {
+                        std::vector<uint32_t> checkpoints = readCheckpointMasterFile();
+
+                        if(checkpoints.empty())
+                        {
+                            throw std::runtime_error("Restart failed. You must provide the "
+                                                     "'--checkpoint.restart.step' argument. See picongpu --help.");
+                        }
+                        else
+                            this->restartStep = checkpoints.back();
+                    }
+
+                    initialiserController->restart((uint32_t) this->restartStep, this->restartDirectory);
+                    step = this->restartStep;
+
+                    /** restore background fields in GUARD
+                     *
+                     * loads the outer GUARDS of the global domain for absorbing/open boundary conditions
+                     *
+                     * @todo as soon as we add GUARD fields to the checkpoint data, e.g. for PML boundary
+                     *       conditions, this section needs to be removed
+                     */
+                    cellwiseOperation::CellwiseOperation<GUARD> guardBGField(*cellDescription);
+                    namespace nvfct = pmacc::nvidia::functors;
+                    guardBGField(
+                        fieldE,
+                        nvfct::Add(),
+                        FieldBackgroundE(fieldE->getUnit()),
+                        step,
+                        FieldBackgroundE::InfluenceParticlePusher);
+                    guardBGField(
+                        fieldB,
+                        nvfct::Add(),
+                        FieldBackgroundB(fieldB->getUnit()),
+                        step,
+                        FieldBackgroundB::InfluenceParticlePusher);
+                }
+                else
+                {
+                    initialiserController->init();
+                    meta::ForEach<particles::InitPipeline, particles::CallFunctor<bmpl::_1>> initSpecies;
+                    initSpecies(step);
+                }
+            }
+
+            size_t freeGpuMem(0u);
+            Environment<>::get().MemoryInfo().getMemoryInfo(&freeGpuMem);
+            log<picLog::MEMORY>("free mem after all particles are initialized %1% MiB") % (freeGpuMem / 1024 / 1024);
+
+            // generate valid GUARDS (overwrite)
+            EventTask eRfieldE = fieldE->asyncCommunication(__getTransactionEvent());
+            __setTransactionEvent(eRfieldE);
+            EventTask eRfieldB = fieldB->asyncCommunication(__getTransactionEvent());
+            __setTransactionEvent(eRfieldB);
+
+            dc.releaseData(FieldE::getName());
+            dc.releaseData(FieldB::getName());
+
+            return step;
+        }
+
+        /**
+         * Run one simulation step.
+         *
+         * @param currentStep iteration number of the current step
+         */
+        virtual void runOneStep(uint32_t currentStep)
+        {
+            using namespace simulation::stage;
+            MomentumBackup{}(currentStep);
+            CurrentReset{}(currentStep);
+            ParticleIonization{*cellDescription}(currentStep);
+            PopulationKinetics{}(currentStep);
+            SynchrotronRadiation{*cellDescription, synchrotronFunctions}(currentStep);
+#if(PMACC_CUDA_ENABLED == 1)
+            Bremsstrahlung{*cellDescription, scaledBremsstrahlungSpectrumMap, bremsstrahlungPhotonAngle}(currentStep);
+#endif
+            EventTask commEvent;
+            ParticlePush{}(currentStep, commEvent);
+            FieldBackground{*cellDescription}(currentStep, nvidia::functors::Sub());
+            myFieldSolver->update_beforeCurrent(currentStep);
+            __setTransactionEvent(commEvent);
+            CurrentBackground{*cellDescription}(currentStep);
+            CurrentDeposition{}(currentStep);
+            currentInterpolationAndAdditionToEMF(currentStep);
+            myFieldSolver->update_afterCurrent(currentStep);
+        }
+
+        virtual void movingWindowCheck(uint32_t currentStep)
+        {
+            if(MovingWindow::getInstance().slideInCurrentStep(currentStep))
+            {
+                slide(currentStep);
+            }
+
+            /* do not double-add background field on restarts
+             * (contained in checkpoint data)
+             */
+            bool addBgFields = true;
+            if(this->restartRequested)
+            {
+                if(this->restartStep == int32_t(currentStep))
+                    addBgFields = false;
+            }
+
+            if(addBgFields)
+            {
+                /** add background field: the movingWindowCheck is just at the start
+                 * of a time step before all the plugins are called (and the step
+                 * itself is performed for this time step).
+                 * Hence the background field is visible for all plugins
+                 * in between the time steps.
+                 */
+                simulation::stage::FieldBackground{*cellDescription}(currentStep, nvidia::functors::Add());
+            }
+        }
+
+        virtual void resetAll(uint32_t currentStep)
+        {
+            resetFields(currentStep);
+            meta::ForEach<VectorAllSpecies, particles::CallReset<bmpl::_1>> resetParticles;
+            resetParticles(currentStep);
+        }
+
+        void slide(uint32_t currentStep)
+        {
+            GridController<simDim>& gc = Environment<simDim>::get().GridController();
+
+            if(gc.slide())
+            {
+                log<picLog::SIMULATION_STATE>("slide in step %1%") % currentStep;
+                resetAll(currentStep);
+                initialiserController->slide(currentStep);
+                meta::ForEach<particles::InitPipeline, particles::CallFunctor<bmpl::_1>> initSpecies;
+                initSpecies(currentStep);
+            }
+        }
+
+        virtual void setInitController(IInitPlugin* initController)
+        {
+            PMACC_ASSERT(initController != nullptr);
+            this->initialiserController = initController;
+        }
+
+        MappingDesc* getMappingDescription()
+        {
+            return cellDescription;
+        }
+
+    protected:
+        std::shared_ptr<DeviceHeap> deviceHeap;
+
+        fields::Solver* myFieldSolver;
+        simulation::stage::CurrentInterpolationAndAdditionToEMF currentInterpolationAndAdditionToEMF;
+
+#if(PMACC_CUDA_ENABLED == 1)
+        // creates lookup tables for the bremsstrahlung effect
+        // map<atomic number, scaled bremsstrahlung spectrum>
+        std::map<float_X, particles::bremsstrahlung::ScaledSpectrum> scaledBremsstrahlungSpectrumMap;
+        particles::bremsstrahlung::GetPhotonAngle bremsstrahlungPhotonAngle;
+#endif
+
+        // Synchrotron functions (used in synchrotronPhotons module)
+        particles::synchrotronPhotons::SynchrotronFunctions synchrotronFunctions;
+
+        // output classes
+
+        IInitPlugin* initialiserController;
+
+        MappingDesc* cellDescription;
+
+        // layout parameter
+        std::vector<uint32_t> devices;
+        std::vector<uint32_t> gridSize;
+        /** Without guards */
+        DataSpace<simDim> gridSizeLocal;
+        std::vector<uint32_t> periodic;
+
+        std::vector<std::string> gridDistribution;
+
+        bool slidingWindow;
+        int32_t endSlidingOnStep;
+        float_64 windowMovePoint;
+        bool showVersionOnce;
+        bool autoAdjustGrid = true;
+
+    private:
+        void initFields(DataConnector& dataConnector)
+        {
+            auto fieldB = std::make_unique<FieldB>(*cellDescription);
+            dataConnector.consume(std::move(fieldB));
+            auto fieldE = std::make_unique<FieldE>(*cellDescription);
+            dataConnector.consume(std::move(fieldE));
+            auto fieldJ = std::make_unique<FieldJ>(*cellDescription);
+            dataConnector.consume(std::move(fieldJ));
+            for(uint32_t slot = 0; slot < fieldTmpNumSlots; ++slot)
+            {
+                auto fieldTmp = std::make_unique<FieldTmp>(*cellDescription, slot);
+                dataConnector.consume(std::move(fieldTmp));
+            }
+        }
+
+        /** Reset all fields
+         *
+         * @param currentStep iteration number of the current step
+         */
+        void resetFields(uint32_t const currentStep)
+        {
+            auto resetField = [currentStep](std::string const name) {
+                DataConnector& dc = Environment<>::get().DataConnector();
+                auto const fieldExists = dc.hasId(name);
+                if(fieldExists)
+                {
+                    using FieldHelper = SimulationFieldHelper<MappingDesc>;
+                    auto field = std::dynamic_pointer_cast<FieldHelper>(dc.get<ISimulationData>(name, true));
+                    if(field)
+                        field->reset(currentStep);
+                    dc.releaseData(name);
+                }
+            };
+
+            /* @todo for now the list of fields is hardcoded here, a more generic
+             * solution would require changes to design of DataConnector.
+             * FieldJ and FieldTmp are effectively cleared on every iteration and
+             * so do not need a reset.
+             */
+            std::array<std::string, 4> const fieldNames{
+                {FieldE::getName(),
+                 FieldB::getName(),
+                 fields::maxwellSolver::yeePML::FieldE::getName(),
+                 fields::maxwellSolver::yeePML::FieldB::getName()}};
+            std::for_each(fieldNames.cbegin(), fieldNames.cend(), resetField);
+        }
+    };
+} /* namespace picongpu */
+
+#include "picongpu/fields/Fields.tpp"
+#include "picongpu/particles/synchrotronPhotons/SynchrotronFunctions.tpp"
+
+#if(PMACC_CUDA_ENABLED == 1)
+#    include "picongpu/particles/bremsstrahlung/Bremsstrahlung.tpp"
+#    include "picongpu/particles/bremsstrahlung/ScaledSpectrum.tpp"
+#endif
diff --git a/include/picongpu/simulation/control/SimulationStarter.hpp b/include/picongpu/simulation/control/SimulationStarter.hpp
index 808d7b2c3f..a98bfe1371 100644
--- a/include/picongpu/simulation/control/SimulationStarter.hpp
+++ b/include/picongpu/simulation/control/SimulationStarter.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Rene Widera
+/* Copyright 2013-2021 Axel Huebl, Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -49,8 +49,8 @@ namespace picongpu
 
 
         MappingDesc* mappingDesc;
-    public:
 
+    public:
         SimulationStarter() : mappingDesc(nullptr)
         {
             simulationClass = new SimulationClass();
@@ -75,7 +75,7 @@ namespace picongpu
         {
             PluginConnector& pluginConnector = Environment<>::get().PluginConnector();
             pluginConnector.loadPlugins();
-            log<picLog::SIMULATION_STATE > ("Startup");
+            log<picLog::SIMULATION_STATE>("Startup");
             simulationClass->setInitController(initClass);
             simulationClass->startSimulation();
         }
@@ -88,7 +88,7 @@ namespace picongpu
         {
         }
 
-        ArgsParser::Status parseConfigs(int argc, char **argv)
+        ArgsParser::Status parseConfigs(int argc, char** argv)
         {
             ArgsParser& ap = ArgsParser::getInstance();
             PluginConnector& pluginConnector = Environment<>::get().PluginConnector();
@@ -108,8 +108,7 @@ namespace picongpu
             // setup all boost::program_options and add to ArgsParser
             BoostOptionsList options = pluginConnector.registerHelp();
 
-            for (BoostOptionsList::const_iterator iter = options.begin();
-                 iter != options.end(); ++iter)
+            for(BoostOptionsList::const_iterator iter = options.begin(); iter != options.end(); ++iter)
             {
                 ap.addOptions(*iter);
             }
@@ -117,8 +116,8 @@ namespace picongpu
             // parse environment variables, config files and command line
             return ap.parse(argc, argv);
         }
-    protected:
 
+    protected:
         void pluginLoad()
         {
             simulationClass->load();
@@ -135,17 +134,16 @@ namespace picongpu
             pluginClass->unload();
             simulationClass->unload();
         }
-    private:
 
-        void printStartParameters(int argc, char **argv)
+    private:
+        void printStartParameters(int argc, char** argv)
         {
             std::cout << "Start Parameters: ";
-            for (int i = 0; i < argc; ++i)
+            for(int i = 0; i < argc; ++i)
             {
                 std::cout << argv[i] << " ";
             }
             std::cout << std::endl;
         }
     };
-}
-
+} // namespace picongpu
diff --git a/include/picongpu/simulation/control/Window.hpp b/include/picongpu/simulation/control/Window.hpp
index 2f3c5354dd..fa43c20d92 100644
--- a/include/picongpu/simulation/control/Window.hpp
+++ b/include/picongpu/simulation/control/Window.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Rene Widera, Felix Schmitt
+/* Copyright 2013-2021 Rene Widera, Felix Schmitt
  *
  * This file is part of PIConGPU.
  *
@@ -24,21 +24,20 @@
 
 namespace picongpu
 {
-using namespace pmacc;
+    using namespace pmacc;
 
-/**
- * Window describes sizes and offsets.
- *
- * For a detailed description of windows, see the PIConGPU wiki page:
- * https://github.com/ComputationalRadiationPhysics/picongpu/wiki/PIConGPU-domain-definitions
- */
-struct Window
-{
-    /* Dimensions (size/offset) of the global virtual window over all GPUs */
-    Selection<simDim> globalDimensions;
-
-    /* Dimensions (size/offset) of the local virtual window on this GPU */
-    Selection<simDim> localDimensions;
-};
-}
+    /**
+     * Window describes sizes and offsets.
+     *
+     * For a detailed description of windows, see the PIConGPU wiki page:
+     * https://github.com/ComputationalRadiationPhysics/picongpu/wiki/PIConGPU-domain-definitions
+     */
+    struct Window
+    {
+        /* Dimensions (size/offset) of the global virtual window over all GPUs */
+        Selection<simDim> globalDimensions;
 
+        /* Dimensions (size/offset) of the local virtual window on this GPU */
+        Selection<simDim> localDimensions;
+    };
+} // namespace picongpu
diff --git a/include/picongpu/simulation/stage/Bremsstrahlung.hpp b/include/picongpu/simulation/stage/Bremsstrahlung.hpp
index b6d3d4e34a..25052afe46 100644
--- a/include/picongpu/simulation/stage/Bremsstrahlung.hpp
+++ b/include/picongpu/simulation/stage/Bremsstrahlung.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Felix Schmitt, Heiko Burau, Rene Widera,
+/* Copyright 2013-2021 Axel Huebl, Felix Schmitt, Heiko Burau, Rene Widera,
  *                     Richard Pausch, Alexander Debus, Marco Garten,
  *                     Benjamin Worpitz, Alexander Grund, Sergei Bastrakov
  *
@@ -22,97 +22,79 @@
 #pragma once
 
 // Bremsstrahlung is available only with CUDA
-#if( PMACC_CUDA_ENABLED == 1 )
+#if(PMACC_CUDA_ENABLED == 1)
 
-#include "picongpu/particles/bremsstrahlung/PhotonEmissionAngle.hpp"
-#include "picongpu/particles/bremsstrahlung/ScaledSpectrum.hpp"
-#include "picongpu/particles/ParticlesFunctors.hpp"
+#    include "picongpu/particles/bremsstrahlung/PhotonEmissionAngle.hpp"
+#    include "picongpu/particles/bremsstrahlung/ScaledSpectrum.hpp"
+#    include "picongpu/particles/ParticlesFunctors.hpp"
 
-#include <pmacc/meta/ForEach.hpp>
-#include <pmacc/particles/traits/FilterByFlag.hpp>
+#    include <pmacc/meta/ForEach.hpp>
+#    include <pmacc/particles/traits/FilterByFlag.hpp>
 
-#include <cstdint>
-#include <map>
+#    include <cstdint>
+#    include <map>
 
 
 namespace picongpu
 {
-namespace simulation
-{
-namespace stage
-{
-
-    /** Functor for the stage of the PIC loop computing Bremsstrahlung
-     *
-     * Only affects particle species with the bremsstrahlungIons attribute.
-     */
-    class Bremsstrahlung
+    namespace simulation
     {
-    public:
-
-        using ScaledSpectrumMap = std::map<
-            float_X,
-            particles::bremsstrahlung::ScaledSpectrum
-        >;
-
-        /** Create a Bremsstrahlung functor
-         *
-         * Having this in constructor is a temporary solution.
-         *
-         * @param cellDescription mapping for kernels
-         * @param scaledSpectrumMap initialized spectrum lookup table
-         * @param photonAngle initialized photon angle lookup table
-         */
-        Bremsstrahlung(
-            MappingDesc const cellDescription,
-            ScaledSpectrumMap & scaledSpectrumMap,
-            particles::bremsstrahlung::GetPhotonAngle & photonAngle
-        ):
-            cellDescription( cellDescription ),
-            scaledSpectrumMap( scaledSpectrumMap ),
-            photonAngle( photonAngle )
-        {
-        }
-
-        /** Ionize particles
-         *
-         * @param step index of time iteration
-         */
-        void operator( )( uint32_t const step ) const
+        namespace stage
         {
-            using pmacc::particles::traits::FilterByFlag;
-            using SpeciesWithBremsstrahlung = typename FilterByFlag
-            <
-                VectorAllSpecies,
-                bremsstrahlungIons< >
-            >::type;
-            pmacc::meta::ForEach<
-                SpeciesWithBremsstrahlung,
-                particles::CallBremsstrahlung< bmpl::_1 >
-            > particleBremsstrahlung;
-            particleBremsstrahlung(
-                cellDescription,
-                step,
-                scaledSpectrumMap,
-                photonAngle
-            );
-        }
-
-    private:
-
-        //! Mapping for kernels
-        MappingDesc cellDescription;
-
-        //! Loopup table: atomic number -> scaled bremsstrahlung spectrum
-        ScaledSpectrumMap & scaledSpectrumMap;
-
-        //! Loopup table for photon angle
-        particles::bremsstrahlung::GetPhotonAngle & photonAngle;
-
-    };
-
-} // namespace stage
-} // namespace simulation
+            /** Functor for the stage of the PIC loop computing Bremsstrahlung
+             *
+             * Only affects particle species with the bremsstrahlungIons attribute.
+             */
+            class Bremsstrahlung
+            {
+            public:
+                using ScaledSpectrumMap = std::map<float_X, particles::bremsstrahlung::ScaledSpectrum>;
+
+                /** Create a Bremsstrahlung functor
+                 *
+                 * Having this in constructor is a temporary solution.
+                 *
+                 * @param cellDescription mapping for kernels
+                 * @param scaledSpectrumMap initialized spectrum lookup table
+                 * @param photonAngle initialized photon angle lookup table
+                 */
+                Bremsstrahlung(
+                    MappingDesc const cellDescription,
+                    ScaledSpectrumMap& scaledSpectrumMap,
+                    particles::bremsstrahlung::GetPhotonAngle& photonAngle)
+                    : cellDescription(cellDescription)
+                    , scaledSpectrumMap(scaledSpectrumMap)
+                    , photonAngle(photonAngle)
+                {
+                }
+
+                /** Run the Bremsstrahlung functor for all species with the bremsstrahlungIons flag
+                 *
+                 * @param step index of time iteration
+                 */
+                void operator()(uint32_t const step) const
+                {
+                    using pmacc::particles::traits::FilterByFlag;
+                    using SpeciesWithBremsstrahlung =
+                        typename FilterByFlag<VectorAllSpecies, bremsstrahlungIons<>>::type;
+                    pmacc::meta::ForEach<SpeciesWithBremsstrahlung, particles::CallBremsstrahlung<bmpl::_1>>
+                        particleBremsstrahlung;
+                    particleBremsstrahlung(cellDescription, step, scaledSpectrumMap, photonAngle);
+                }
+
+            private:
+                //! Mapping for kernels
+                MappingDesc cellDescription;
+
+                //! Lookup table: atomic number -> scaled bremsstrahlung spectrum
+                ScaledSpectrumMap& scaledSpectrumMap;
+
+                //! Lookup table for photon angle
+                particles::bremsstrahlung::GetPhotonAngle& photonAngle;
+            };
+
+        } // namespace stage
+    } // namespace simulation
 } // namespace picongpu
 
 #endif // ( PMACC_CUDA_ENABLED == 1 )
diff --git a/include/picongpu/simulation/stage/CurrentBackground.hpp b/include/picongpu/simulation/stage/CurrentBackground.hpp
index 3d43e292a3..64abe3b7fe 100644
--- a/include/picongpu/simulation/stage/CurrentBackground.hpp
+++ b/include/picongpu/simulation/stage/CurrentBackground.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Felix Schmitt, Heiko Burau, Rene Widera,
+/* Copyright 2013-2021 Axel Huebl, Felix Schmitt, Heiko Burau, Rene Widera,
  *                     Richard Pausch, Alexander Debus, Marco Garten,
  *                     Benjamin Worpitz, Alexander Grund, Sergei Bastrakov
  *
@@ -34,57 +34,49 @@
 
 namespace picongpu
 {
-namespace simulation
-{
-namespace stage
-{
-
-    //! Functor for the stage of the PIC loop applying current background
-    class CurrentBackground
+    namespace simulation
     {
-    public:
-
-        /** Create a current background functor
-         *
-         * Having this in constructor is a temporary solution.
-         *
-         * @param cellDescription mapping for kernels
-         */
-        CurrentBackground( MappingDesc const cellDescription ):
-            cellDescription( cellDescription )
+        namespace stage
         {
-        }
-
-        /** Add the current background to the current density
-         *
-         * @param step index of time iteration
-         */
-        void operator( )( uint32_t const step ) const
-        {
-            using namespace pmacc;
-            DataConnector & dc = Environment< >::get( ).DataConnector( );
-            auto & fieldJ = *dc.get< FieldJ >( FieldJ::getName( ), true );
-            using CurrentBackground = cellwiseOperation::CellwiseOperation<
-                type::CORE + type::BORDER
-            >;
-            CurrentBackground currentBackground( cellDescription );
-            currentBackground(
-                &fieldJ,
-                nvidia::functors::Add( ),
-                FieldBackgroundJ( fieldJ.getUnit() ),
-                step,
-                FieldBackgroundJ::activated
-            );
-            dc.releaseData( FieldJ::getName( ) );
-        }
-
-    private:
+            //! Functor for the stage of the PIC loop applying current background
+            class CurrentBackground
+            {
+            public:
+                /** Create a current background functor
+                 *
+                 * Having this in constructor is a temporary solution.
+                 *
+                 * @param cellDescription mapping for kernels
+                 */
+                CurrentBackground(MappingDesc const cellDescription) : cellDescription(cellDescription)
+                {
+                }
 
-        //! Mapping for kernels
-        MappingDesc cellDescription;
+                /** Add the current background to the current density
+                 *
+                 * @param step index of time iteration
+                 */
+                void operator()(uint32_t const step) const
+                {
+                    using namespace pmacc;
+                    DataConnector& dc = Environment<>::get().DataConnector();
+                    auto& fieldJ = *dc.get<FieldJ>(FieldJ::getName(), true);
+                    using CurrentBackground = cellwiseOperation::CellwiseOperation<type::CORE + type::BORDER>;
+                    CurrentBackground currentBackground(cellDescription);
+                    currentBackground(
+                        &fieldJ,
+                        nvidia::functors::Add(),
+                        FieldBackgroundJ(fieldJ.getUnit()),
+                        step,
+                        FieldBackgroundJ::activated);
+                    dc.releaseData(FieldJ::getName());
+                }
 
-    };
+            private:
+                //! Mapping for kernels
+                MappingDesc cellDescription;
+            };
 
-} // namespace stage
-} // namespace simulation
+        } // namespace stage
+    } // namespace simulation
 } // namespace picongpu
diff --git a/include/picongpu/simulation/stage/CurrentDeposition.hpp b/include/picongpu/simulation/stage/CurrentDeposition.hpp
index 597d13ab81..7725072338 100644
--- a/include/picongpu/simulation/stage/CurrentDeposition.hpp
+++ b/include/picongpu/simulation/stage/CurrentDeposition.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Felix Schmitt, Heiko Burau, Rene Widera,
+/* Copyright 2013-2021 Axel Huebl, Felix Schmitt, Heiko Burau, Rene Widera,
  *                     Richard Pausch, Alexander Debus, Marco Garten,
  *                     Benjamin Worpitz, Alexander Grund, Sergei Bastrakov
  *
@@ -35,65 +35,52 @@
 
 namespace picongpu
 {
-namespace simulation
-{
-namespace stage
-{
-namespace detail
-{
-
-    template<
-        typename T_SpeciesType,
-        typename T_Area
-    >
-    struct CurrentDeposition
+    namespace simulation
     {
-        using SpeciesType = T_SpeciesType;
-        using FrameType = typename SpeciesType::FrameType;
-
-        HINLINE void operator( )(
-            const uint32_t currentStep,
-            FieldJ & fieldJ,
-            pmacc::DataConnector & dc
-        ) const
+        namespace stage
         {
-            auto species = dc.get< SpeciesType >( FrameType::getName(), true );
-            fieldJ.computeCurrent< T_Area::value, SpeciesType >( *species, currentStep );
-            dc.releaseData( FrameType::getName() );
-        }
-    };
+            namespace detail
+            {
+                template<typename T_SpeciesType, typename T_Area>
+                struct CurrentDeposition
+                {
+                    using SpeciesType = T_SpeciesType;
+                    using FrameType = typename SpeciesType::FrameType;
 
-} // namespace detail
+                    HINLINE void operator()(const uint32_t currentStep, FieldJ& fieldJ, pmacc::DataConnector& dc) const
+                    {
+                        auto species = dc.get<SpeciesType>(FrameType::getName(), true);
+                        fieldJ.computeCurrent<T_Area::value, SpeciesType>(*species, currentStep);
+                        dc.releaseData(FrameType::getName());
+                    }
+                };
 
-    //! Functor for the stage of the PIC loop performing current deposition
-    struct CurrentDeposition
-    {
-        /** Compute the current created by particles and add it to the current
-         *  density
-         *
-         * @param step index of time iteration
-         */
-        void operator( )( uint32_t const step ) const
-        {
-            using namespace pmacc;
-            DataConnector & dc = Environment< >::get( ).DataConnector( );
-            auto & fieldJ = *dc.get< FieldJ >( FieldJ::getName( ), true );
-            using SpeciesWithCurrentSolver = typename pmacc::particles::traits::FilterByFlag<
-                VectorAllSpecies,
-                current< >
-            >::type;
-            meta::ForEach<
-                SpeciesWithCurrentSolver,
-                detail::CurrentDeposition<
-                    bmpl::_1,
-                    bmpl::int_< type::CORE + type::BORDER >
-                >
-            > depositCurrent;
-            depositCurrent( step, fieldJ, dc );
-            dc.releaseData( FieldJ::getName( ) );
-        }
-    };
+            } // namespace detail
+
+            //! Functor for the stage of the PIC loop performing current deposition
+            struct CurrentDeposition
+            {
+                /** Compute the current created by particles and add it to the current
+                 *  density
+                 *
+                 * @param step index of time iteration
+                 */
+                void operator()(uint32_t const step) const
+                {
+                    using namespace pmacc;
+                    DataConnector& dc = Environment<>::get().DataConnector();
+                    auto& fieldJ = *dc.get<FieldJ>(FieldJ::getName(), true);
+                    using SpeciesWithCurrentSolver =
+                        typename pmacc::particles::traits::FilterByFlag<VectorAllSpecies, current<>>::type;
+                    meta::ForEach<
+                        SpeciesWithCurrentSolver,
+                        detail::CurrentDeposition<bmpl::_1, bmpl::int_<type::CORE + type::BORDER>>>
+                        depositCurrent;
+                    depositCurrent(step, fieldJ, dc);
+                    dc.releaseData(FieldJ::getName());
+                }
+            };
 
-} // namespace stage
-} // namespace simulation
+        } // namespace stage
+    } // namespace simulation
 } // namespace picongpu
diff --git a/include/picongpu/simulation/stage/CurrentInterpolationAndAdditionToEMF.hpp b/include/picongpu/simulation/stage/CurrentInterpolationAndAdditionToEMF.hpp
index 5a3a3c8b82..42ab2504a4 100644
--- a/include/picongpu/simulation/stage/CurrentInterpolationAndAdditionToEMF.hpp
+++ b/include/picongpu/simulation/stage/CurrentInterpolationAndAdditionToEMF.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Felix Schmitt, Heiko Burau, Rene Widera,
+/* Copyright 2013-2021 Axel Huebl, Felix Schmitt, Heiko Burau, Rene Widera,
  *                     Richard Pausch, Alexander Debus, Marco Garten,
  *                     Benjamin Worpitz, Alexander Grund, Sergei Bastrakov
  *
@@ -35,75 +35,108 @@
 #include <boost/mpl/count.hpp>
 
 #include <cstdint>
+#include <stdexcept>
+#include <type_traits>
 
 
 namespace picongpu
 {
-namespace simulation
-{
-namespace stage
-{
-
-    /** Functor for the stage of the PIC loop performing current interpolation
-     *  and addition to grid values of the electromagnetic field
-     */
-    struct CurrentInterpolationAndAdditionToEMF
+    namespace simulation
     {
-        /** Compute the current created by particles and add it to the current
-         *  density
-         *
-         * @param step index of time iteration
-         */
-        void operator( )( uint32_t const step ) const
+        namespace stage
         {
-            using namespace pmacc;
-            using SpeciesWithCurrentSolver = typename pmacc::particles::traits::FilterByFlag<
-                VectorAllSpecies,
-                current< >
-            >::type;
-            auto const numSpeciesWithCurrentSolver =
-                bmpl::size< SpeciesWithCurrentSolver >::type::value;
-            auto const existsCurrent = numSpeciesWithCurrentSolver > 0;
-            if( !existsCurrent )
-                return;
+            /** Functor for the stage of the PIC loop performing current interpolation
+             *  and addition to grid values of the electromagnetic field
+             */
+            class CurrentInterpolationAndAdditionToEMF
+            {
+            public:
+                /** Initialize the current interpolation stage
+                 *
+                 * This method has to be called during initialization of the simulation.
+                 * Before this method is called, the instance of CurrentInterpolation cannot be used safely.
+                 */
+                void init()
+                {
+                    // Convert compile-time current interpolation set in the field solver to a runtime value
+                    using namespace currentInterpolation;
+                    auto& interpolation = CurrentInterpolationInfo::get();
+                    if(std::is_same<fields::Solver::CurrentInterpolation, None>::value)
+                        interpolation.kind = CurrentInterpolationInfo::Kind::None;
+                    else if(std::is_same<fields::Solver::CurrentInterpolation, Binomial>::value)
+                        interpolation.kind = CurrentInterpolationInfo::Kind::Binomial;
+                    else
+                        throw std::runtime_error("Unsupported current interpolation type used in the field solver");
+                }
 
-            DataConnector & dc = Environment< >::get( ).DataConnector( );
-            auto & fieldJ = *dc.get< FieldJ >( FieldJ::getName( ), true );
-            auto eRecvCurrent = fieldJ.asyncCommunication( __getTransactionEvent() );
-            using CurrentInterpolation = fields::Solver::CurrentInterpolation;
-            CurrentInterpolation currentInterpolation;
-            using Margin = traits::GetMargin< CurrentInterpolation >;
-            DataSpace< simDim > const currentRecvLower( Margin::LowerMargin().toRT() );
-            DataSpace< simDim > const currentRecvUpper( Margin::UpperMargin().toRT() );
+                /** Interpolate the current density and add it to the electromagnetic field
+                 *
+                 * @param step index of time iteration
+                 */
+                void operator()(uint32_t const step) const
+                {
+                    using namespace pmacc;
+                    using SpeciesWithCurrentSolver =
+                        typename pmacc::particles::traits::FilterByFlag<VectorAllSpecies, current<>>::type;
+                    auto const numSpeciesWithCurrentSolver = bmpl::size<SpeciesWithCurrentSolver>::type::value;
+                    auto const existsCurrent = numSpeciesWithCurrentSolver > 0;
+                    if(!existsCurrent)
+                        return;
 
-            /* without interpolation, we do not need to access the FieldJ GUARD
-            * and can therefore overlap communication of GUARD->(ADD)BORDER & computation of CORE */
-            if( currentRecvLower == DataSpace< simDim >::create( 0 ) &&
-                currentRecvUpper == DataSpace< simDim >::create( 0 )
-            )
-            {
-                fieldJ.addCurrentToEMF< type::CORE >( currentInterpolation );
-                __setTransactionEvent( eRecvCurrent );
-                fieldJ.addCurrentToEMF< type::BORDER >( currentInterpolation );
-            }
-            else
-            {
-                /* in case we perform a current interpolation/filter, we need
-                * to access the BORDER area from the CORE (and the GUARD area
-                * from the BORDER)
-                * `fieldJ->asyncCommunication` first adds the neighbors' values
-                * to BORDER (send) and then updates the GUARD (receive)
-                * \todo split the last `receive` part in a separate method to
-                *       allow already a computation of CORE */
-                __setTransactionEvent( eRecvCurrent );
-                fieldJ.addCurrentToEMF<
-                    type::CORE + type::BORDER
-                >( currentInterpolation );
-            }
-            dc.releaseData( FieldJ::getName( ) );
-        }
-    };
+                    DataConnector& dc = Environment<>::get().DataConnector();
+                    auto& fieldJ = *dc.get<FieldJ>(FieldJ::getName(), true);
+                    auto eRecvCurrent = fieldJ.asyncCommunication(__getTransactionEvent());
+                    auto& interpolation = currentInterpolation::CurrentInterpolationInfo::get();
+                    auto const currentRecvLower = interpolation.getLowerMargin();
+                    auto const currentRecvUpper = interpolation.getUpperMargin();
+
+                    /* without interpolation, we do not need to access the FieldJ GUARD
+                     * and can therefore overlap communication of GUARD->(ADD)BORDER & computation of CORE
+                     */
+                    if(currentRecvLower == DataSpace<simDim>::create(0)
+                       && currentRecvUpper == DataSpace<simDim>::create(0))
+                    {
+                        addCurrentToEMF<type::CORE>(fieldJ);
+                        __setTransactionEvent(eRecvCurrent);
+                        addCurrentToEMF<type::BORDER>(fieldJ);
+                    }
+                    else
+                    {
+                        /* in case we perform a current interpolation/filter, we need
+                         * to access the BORDER area from the CORE (and the GUARD area
+                         * from the BORDER)
+                         * `fieldJ->asyncCommunication` first adds the neighbors' values
+                         * to BORDER (send) and then updates the GUARD (receive)
+                         * \todo split the last `receive` part in a separate method to
+                         *       allow already a computation of CORE */
+                        __setTransactionEvent(eRecvCurrent);
+                        addCurrentToEMF<type::CORE + type::BORDER>(fieldJ);
+                    }
+                    dc.releaseData(FieldJ::getName());
+                }
+
+            private:
+                /* Call addCurrentToEMF method of fieldJ for the given area
+                 *
+                 * This function performs a transition from the run-time realm of CurrentInterpolation into the
+                 * template realm of fieldJ.addCurrentToEMF() operating with interpolation functors.
+                 *
+                 * @tparam T_area area to operate on
+                 *
+                 * @param fieldJ object representing the current field
+                 */
+                template<std::uint32_t T_area>
+                void addCurrentToEMF(FieldJ& fieldJ) const
+                {
+                    using currentInterpolation::CurrentInterpolationInfo;
+                    auto const kind = CurrentInterpolationInfo::get().kind;
+                    if(kind == CurrentInterpolationInfo::Kind::None)
+                        fieldJ.addCurrentToEMF<T_area>(currentInterpolation::None{});
+                    else
+                        fieldJ.addCurrentToEMF<T_area>(currentInterpolation::Binomial{});
+                }
+            };
 
-} // namespace stage
-} // namespace simulation
+        } // namespace stage
+    } // namespace simulation
 } // namespace picongpu
diff --git a/include/picongpu/simulation/stage/CurrentReset.hpp b/include/picongpu/simulation/stage/CurrentReset.hpp
index e0e008b4aa..94d6913f21 100644
--- a/include/picongpu/simulation/stage/CurrentReset.hpp
+++ b/include/picongpu/simulation/stage/CurrentReset.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Felix Schmitt, Heiko Burau, Rene Widera,
+/* Copyright 2013-2021 Axel Huebl, Felix Schmitt, Heiko Burau, Rene Widera,
  *                     Richard Pausch, Alexander Debus, Marco Garten,
  *                     Benjamin Worpitz, Alexander Grund, Sergei Bastrakov
  *
@@ -31,29 +31,28 @@
 
 namespace picongpu
 {
-namespace simulation
-{
-namespace stage
-{
-
-    //! Functor for the stage of the PIC loop setting the current values to zero
-    struct CurrentReset
+    namespace simulation
     {
-        /** Set all current density values to zero
-         *
-         * @param step index of time iteration
-         */
-        void operator( )( uint32_t const ) const
+        namespace stage
         {
-            using namespace pmacc;
-            DataConnector & dc = Environment< >::get( ).DataConnector( );
-            auto & fieldJ = *dc.get< FieldJ >( FieldJ::getName( ), true );
-            FieldJ::ValueType zeroJ( FieldJ::ValueType::create( 0._X ) );
-            fieldJ.assign( zeroJ );
-            dc.releaseData( FieldJ::getName( ) );
-        }
-    };
-
-} // namespace stage
-} // namespace simulation
+            //! Functor for the stage of the PIC loop setting the current values to zero
+            struct CurrentReset
+            {
+                /** Set all current density values to zero
+                 *
+                 * @param step index of time iteration
+                 */
+                void operator()(uint32_t const) const
+                {
+                    using namespace pmacc;
+                    DataConnector& dc = Environment<>::get().DataConnector();
+                    auto& fieldJ = *dc.get<FieldJ>(FieldJ::getName(), true);
+                    FieldJ::ValueType zeroJ(FieldJ::ValueType::create(0._X));
+                    fieldJ.assign(zeroJ);
+                    dc.releaseData(FieldJ::getName());
+                }
+            };
+
+        } // namespace stage
+    } // namespace simulation
 } // namespace picongpu
diff --git a/include/picongpu/simulation/stage/FieldBackground.hpp b/include/picongpu/simulation/stage/FieldBackground.hpp
index 34454dedc3..cec8c49461 100644
--- a/include/picongpu/simulation/stage/FieldBackground.hpp
+++ b/include/picongpu/simulation/stage/FieldBackground.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Felix Schmitt, Heiko Burau, Rene Widera,
+/* Copyright 2013-2021 Axel Huebl, Felix Schmitt, Heiko Burau, Rene Widera,
  *                     Richard Pausch, Alexander Debus, Marco Garten,
  *                     Benjamin Worpitz, Alexander Grund, Sergei Bastrakov
  *
@@ -34,70 +34,61 @@
 
 namespace picongpu
 {
-namespace simulation
-{
-namespace stage
-{
-
-    //! Functor for the stage of the PIC loop applying field background
-    class FieldBackground
+    namespace simulation
     {
-    public:
-
-        /** Create a field background functor
-         *
-         * Having this in constructor is a temporary solution.
-         *
-         * @param cellDescription mapping for kernels
-         */
-        FieldBackground( MappingDesc const cellDescription ):
-            cellDescription( cellDescription )
+        namespace stage
         {
-        }
-
-        /** Add the field background to the current density
-         *
-         * @tparam T_Functor functor type compatible to nvidia::functors
-         *
-         * @param step index of time iteration
-         * @param functor functor to apply to the background
-         */
-        template< typename T_Functor >
-        void operator( )( uint32_t const step, T_Functor functor ) const
-        {
-            using namespace pmacc;
-            DataConnector & dc = Environment< >::get( ).DataConnector( );
-            auto fieldE = dc.get< FieldE >( FieldE::getName( ), true );
-            auto fieldB = dc.get< FieldB >( FieldB::getName( ), true );
-            using Background = cellwiseOperation::CellwiseOperation<
-                CORE + BORDER + GUARD
-            >;
-            Background background( cellDescription );
-            background(
-                fieldE,
-                functor,
-                FieldBackgroundE( fieldE->getUnit( ) ),
-                step,
-                FieldBackgroundE::InfluenceParticlePusher
-            );
-            background(
-                fieldB,
-                functor,
-                FieldBackgroundB( fieldB->getUnit( ) ),
-                step,
-                FieldBackgroundB::InfluenceParticlePusher
-            );
-            dc.releaseData( FieldE::getName( ) );
-            dc.releaseData( FieldB::getName( ) );
-        }
-
-    private:
+            //! Functor for the stage of the PIC loop applying field background
+            class FieldBackground
+            {
+            public:
+                /** Create a field background functor
+                 *
+                 * Having this in constructor is a temporary solution.
+                 *
+                 * @param cellDescription mapping for kernels
+                 */
+                FieldBackground(MappingDesc const cellDescription) : cellDescription(cellDescription)
+                {
+                }
 
-        //! Mapping for kernels
-        MappingDesc cellDescription;
+                /** Apply the field background to the electric and magnetic fields
+                 *
+                 * @tparam T_Functor functor type compatible to nvidia::functors
+                 *
+                 * @param step index of time iteration
+                 * @param functor functor to apply to the background
+                 */
+                template<typename T_Functor>
+                void operator()(uint32_t const step, T_Functor functor) const
+                {
+                    using namespace pmacc;
+                    DataConnector& dc = Environment<>::get().DataConnector();
+                    auto fieldE = dc.get<FieldE>(FieldE::getName(), true);
+                    auto fieldB = dc.get<FieldB>(FieldB::getName(), true);
+                    using Background = cellwiseOperation::CellwiseOperation<CORE + BORDER + GUARD>;
+                    Background background(cellDescription);
+                    background(
+                        fieldE,
+                        functor,
+                        FieldBackgroundE(fieldE->getUnit()),
+                        step,
+                        FieldBackgroundE::InfluenceParticlePusher);
+                    background(
+                        fieldB,
+                        functor,
+                        FieldBackgroundB(fieldB->getUnit()),
+                        step,
+                        FieldBackgroundB::InfluenceParticlePusher);
+                    dc.releaseData(FieldE::getName());
+                    dc.releaseData(FieldB::getName());
+                }
 
-    };
+            private:
+                //! Mapping for kernels
+                MappingDesc cellDescription;
+            };
 
-} // namespace stage
-} // namespace simulation
+        } // namespace stage
+    } // namespace simulation
 } // namespace picongpu
diff --git a/include/picongpu/simulation/stage/MomentumBackup.hpp b/include/picongpu/simulation/stage/MomentumBackup.hpp
index 615e2b31e6..75d0ec5609 100644
--- a/include/picongpu/simulation/stage/MomentumBackup.hpp
+++ b/include/picongpu/simulation/stage/MomentumBackup.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Felix Schmitt, Heiko Burau, Rene Widera,
+/* Copyright 2013-2021 Axel Huebl, Felix Schmitt, Heiko Burau, Rene Widera,
  *                     Richard Pausch, Alexander Debus, Marco Garten,
  *                     Benjamin Worpitz, Alexander Grund, Sergei Bastrakov
  *
@@ -30,40 +30,31 @@
 
 namespace picongpu
 {
-namespace simulation
-{
-namespace stage
-{
-
-    /** Functor for the stage of the PIC loop copying particles' momentums
-     *  to momentumPrev1
-     *
-     * Only affects particle species with the momentumPrev1 attribute.
-     */
-    struct MomentumBackup
+    namespace simulation
     {
-        /** Copy the momentums
-         *
-         * @param step index of time iteration
-         */
-        void operator( )( uint32_t const step ) const
+        namespace stage
         {
-            using pmacc::particles::traits::FilterByIdentifier;
-            using SpeciesWithMomentumPrev1 = typename FilterByIdentifier<
-                VectorAllSpecies,
-                momentumPrev1
-            >::type;
-            using CopyMomentum = particles::manipulators::unary::CopyAttribute<
-                momentumPrev1,
-                momentum
-            >;
-            particles::manipulate<
-                CopyMomentum,
-                SpeciesWithMomentumPrev1
-            >( step );
-        }
-    };
-
-} // namespace stage
-} // namespace simulation
+            /** Functor for the stage of the PIC loop copying particles' momentums
+             *  to momentumPrev1
+             *
+             * Only affects particle species with the momentumPrev1 attribute.
+             */
+            struct MomentumBackup
+            {
+                /** Copy the momentums
+                 *
+                 * @param step index of time iteration
+                 */
+                void operator()(uint32_t const step) const
+                {
+                    using pmacc::particles::traits::FilterByIdentifier;
+                    using SpeciesWithMomentumPrev1 =
+                        typename FilterByIdentifier<VectorAllSpecies, momentumPrev1>::type;
+                    using CopyMomentum = particles::manipulators::unary::CopyAttribute<momentumPrev1, momentum>;
+                    particles::manipulate<CopyMomentum, SpeciesWithMomentumPrev1>(step);
+                }
+            };
+
+        } // namespace stage
+    } // namespace simulation
 } // namespace picongpu
diff --git a/include/picongpu/simulation/stage/ParticleIonization.hpp b/include/picongpu/simulation/stage/ParticleIonization.hpp
index cf74d9b8fc..638af55f8c 100644
--- a/include/picongpu/simulation/stage/ParticleIonization.hpp
+++ b/include/picongpu/simulation/stage/ParticleIonization.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Felix Schmitt, Heiko Burau, Rene Widera,
+/* Copyright 2013-2021 Axel Huebl, Felix Schmitt, Heiko Burau, Rene Widera,
  *                     Richard Pausch, Alexander Debus, Marco Garten,
  *                     Benjamin Worpitz, Alexander Grund, Sergei Bastrakov
  *
@@ -29,55 +29,44 @@
 
 namespace picongpu
 {
-namespace simulation
-{
-namespace stage
-{
-
-    /** Functor for the stage of the PIC loop performing particle ionization
-     *
-     * Only affects particle species with the ionizers attribute.
-     */
-    class ParticleIonization
+    namespace simulation
     {
-    public:
-
-        /** Create a particle ionization functor
-         *
-         * Having this in constructor is a temporary solution.
-         *
-         * @param cellDescription mapping for kernels
-         */
-        ParticleIonization( MappingDesc const cellDescription ):
-            cellDescription( cellDescription )
+        namespace stage
         {
-        }
-
-        /** Ionize particles
-         *
-         * @param step index of time iteration
-         */
-        void operator( )( uint32_t const step ) const
-        {
-            using pmacc::particles::traits::FilterByFlag;
-            using SpeciesWithIonizers = typename FilterByFlag<
-                VectorAllSpecies,
-                ionizers< >
-            >::type;
-            pmacc::meta::ForEach<
-                SpeciesWithIonizers,
-                particles::CallIonization< bmpl::_1 >
-            > particleIonization;
-            particleIonization( cellDescription, step );
-        }
-
-    private:
+            /** Functor for the stage of the PIC loop performing particle ionization
+             *
+             * Only affects particle species with the ionizers attribute.
+             */
+            class ParticleIonization
+            {
+            public:
+                /** Create a particle ionization functor
+                 *
+                 * Having this in constructor is a temporary solution.
+                 *
+                 * @param cellDescription mapping for kernels
+                 */
+                ParticleIonization(MappingDesc const cellDescription) : cellDescription(cellDescription)
+                {
+                }
 
-        //! Mapping for kernels
-        MappingDesc cellDescription;
+                /** Ionize particles
+                 *
+                 * @param step index of time iteration
+                 */
+                void operator()(uint32_t const step) const
+                {
+                    using pmacc::particles::traits::FilterByFlag;
+                    using SpeciesWithIonizers = typename FilterByFlag<VectorAllSpecies, ionizers<>>::type;
+                    pmacc::meta::ForEach<SpeciesWithIonizers, particles::CallIonization<bmpl::_1>> particleIonization;
+                    particleIonization(cellDescription, step);
+                }
 
-    };
+            private:
+                //! Mapping for kernels
+                MappingDesc cellDescription;
+            };
 
-} // namespace stage
-} // namespace simulation
+        } // namespace stage
+    } // namespace simulation
 } // namespace picongpu
diff --git a/include/picongpu/simulation/stage/ParticlePush.hpp b/include/picongpu/simulation/stage/ParticlePush.hpp
index 51c5440f16..7d161200f8 100644
--- a/include/picongpu/simulation/stage/ParticlePush.hpp
+++ b/include/picongpu/simulation/stage/ParticlePush.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Felix Schmitt, Heiko Burau, Rene Widera,
+/* Copyright 2013-2021 Axel Huebl, Felix Schmitt, Heiko Burau, Rene Widera,
  *                     Richard Pausch, Alexander Debus, Marco Garten,
  *                     Benjamin Worpitz, Alexander Grund, Sergei Bastrakov
  *
@@ -30,33 +30,28 @@
 
 namespace picongpu
 {
-namespace simulation
-{
-namespace stage
-{
-
-    //! Functor for the stage of the PIC loop performing particle push
-    struct ParticlePush
+    namespace simulation
     {
-        /** Push all particle species
-         *
-         * @param step index of time iteration
-         * @param[out] commEvent particle communication event
-         */
-        void operator( )( uint32_t const step, pmacc::EventTask & commEvent ) const
+        namespace stage
         {
-            pmacc::EventTask initEvent = __getTransactionEvent( );
-            pmacc::EventTask updateEvent;
-            particles::PushAllSpecies pushAllSpecies;
-            pushAllSpecies(
-                step, initEvent,
-                updateEvent,
-                commEvent
-            );
-            __setTransactionEvent( updateEvent );
-        }
-    };
-
-} // namespace stage
-} // namespace simulation
+            //! Functor for the stage of the PIC loop performing particle push
+            struct ParticlePush
+            {
+                /** Push all particle species
+                 *
+                 * @param step index of time iteration
+                 * @param[out] commEvent particle communication event
+                 */
+                void operator()(uint32_t const step, pmacc::EventTask& commEvent) const
+                {
+                    pmacc::EventTask initEvent = __getTransactionEvent();
+                    pmacc::EventTask updateEvent;
+                    particles::PushAllSpecies pushAllSpecies;
+                    pushAllSpecies(step, initEvent, updateEvent, commEvent);
+                    __setTransactionEvent(updateEvent);
+                }
+            };
+
+        } // namespace stage
+    } // namespace simulation
 } // namespace picongpu
diff --git a/include/picongpu/simulation/stage/PopulationKinetics.hpp b/include/picongpu/simulation/stage/PopulationKinetics.hpp
index dcdbae0e9b..435b327eb6 100644
--- a/include/picongpu/simulation/stage/PopulationKinetics.hpp
+++ b/include/picongpu/simulation/stage/PopulationKinetics.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Felix Schmitt, Heiko Burau, Rene Widera,
+/* Copyright 2013-2021 Axel Huebl, Felix Schmitt, Heiko Burau, Rene Widera,
  *                     Richard Pausch, Alexander Debus, Marco Garten,
  *                     Benjamin Worpitz, Alexander Grund, Sergei Bastrakov
  *
@@ -29,38 +29,31 @@
 
 namespace picongpu
 {
-namespace simulation
-{
-namespace stage
-{
-
-    /** Functor for the stage of the PIC loop performing FLYlite population
-     *  kinetics for atomic physics
-     *
-     *  Only affects particle species with the populationKinetics attribute.
-     */
-    struct PopulationKinetics
+    namespace simulation
     {
-        /** Perform FLYlite population kinetics for atomic physics
-         *
-         * @param step index of time iteration
-         */
-        void operator( )( uint32_t const step ) const
+        namespace stage
         {
-            using pmacc::particles::traits::FilterByFlag;
-            using FlyLiteIons = typename FilterByFlag<
-                VectorAllSpecies,
-                populationKinetics< >
-            >::type;
-            pmacc::meta::ForEach<
-                FlyLiteIons,
-                particles::CallPopulationKinetics< bmpl::_1 >,
-                bmpl::_1
-            > populationKinetics;
-            populationKinetics( step );
-        }
-    };
+            /** Functor for the stage of the PIC loop performing FLYlite population
+             *  kinetics for atomic physics
+             *
+             *  Only affects particle species with the populationKinetics attribute.
+             */
+            struct PopulationKinetics
+            {
+                /** Perform FLYlite population kinetics for atomic physics
+                 *
+                 * @param step index of time iteration
+                 */
+                void operator()(uint32_t const step) const
+                {
+                    using pmacc::particles::traits::FilterByFlag;
+                    using FlyLiteIons = typename FilterByFlag<VectorAllSpecies, populationKinetics<>>::type;
+                    pmacc::meta::ForEach<FlyLiteIons, particles::CallPopulationKinetics<bmpl::_1>, bmpl::_1>
+                        populationKinetics;
+                    populationKinetics(step);
+                }
+            };
 
-} // namespace stage
-} // namespace simulation
+        } // namespace stage
+    } // namespace simulation
 } // namespace picongpu
diff --git a/include/picongpu/simulation/stage/SynchrotronRadiation.hpp b/include/picongpu/simulation/stage/SynchrotronRadiation.hpp
index 64c7a39a14..7617f4da75 100644
--- a/include/picongpu/simulation/stage/SynchrotronRadiation.hpp
+++ b/include/picongpu/simulation/stage/SynchrotronRadiation.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Felix Schmitt, Heiko Burau, Rene Widera,
+/* Copyright 2013-2021 Axel Huebl, Felix Schmitt, Heiko Burau, Rene Widera,
  *                     Richard Pausch, Alexander Debus, Marco Garten,
  *                     Benjamin Worpitz, Alexander Grund, Sergei Bastrakov
  *
@@ -31,67 +31,54 @@
 
 namespace picongpu
 {
-namespace simulation
-{
-namespace stage
-{
-
-    /** Functor for the stage of the PIC loop computing synchrotron radiation
-     *
-     * Only affects particle species with the synchrotronPhotons attribute.
-     */
-    class SynchrotronRadiation
+    namespace simulation
     {
-    public:
-
-        /** Create a synchrotron radiation functor
-         *
-         * Having this in constructor is a temporary solution.
-         *
-         * @param cellDescription mapping for kernels
-         * @param functions initialized synchrotron functions
-         */
-        SynchrotronRadiation(
-            MappingDesc const cellDescription,
-            particles::synchrotronPhotons::SynchrotronFunctions & functions
-        ):
-            cellDescription( cellDescription ),
-            functions( functions )
+        namespace stage
         {
-        }
-
-        /** Ionize particles
-         *
-         * @param step index of time iteration
-         */
-        void operator( )( uint32_t const step ) const
-        {
-            using pmacc::particles::traits::FilterByFlag;
-            using SynchrotronPhotonsSpecies = typename FilterByFlag<
-                VectorAllSpecies,
-                synchrotronPhotons< >
-            >::type;
-            pmacc::meta::ForEach<
-                SynchrotronPhotonsSpecies,
-                particles::CallSynchrotronPhotons< bmpl::_1 >
-            > synchrotronRadiation;
-            synchrotronRadiation(
-                cellDescription,
-                step,
-                functions
-            );
-        }
-
-    private:
+            /** Functor for the stage of the PIC loop computing synchrotron radiation
+             *
+             * Only affects particle species with the synchrotronPhotons attribute.
+             */
+            class SynchrotronRadiation
+            {
+            public:
+                /** Create a synchrotron radiation functor
+                 *
+                 * Having this in constructor is a temporary solution.
+                 *
+                 * @param cellDescription mapping for kernels
+                 * @param functions initialized synchrotron functions
+                 */
+                SynchrotronRadiation(
+                    MappingDesc const cellDescription,
+                    particles::synchrotronPhotons::SynchrotronFunctions& functions)
+                    : cellDescription(cellDescription)
+                    , functions(functions)
+                {
+                }
 
-        //! Mapping for kernels
-        MappingDesc cellDescription;
+                /** Compute synchrotron radiation
+                 *
+                 * @param step index of time iteration
+                 */
+                void operator()(uint32_t const step) const
+                {
+                    using pmacc::particles::traits::FilterByFlag;
+                    using SynchrotronPhotonsSpecies =
+                        typename FilterByFlag<VectorAllSpecies, synchrotronPhotons<>>::type;
+                    pmacc::meta::ForEach<SynchrotronPhotonsSpecies, particles::CallSynchrotronPhotons<bmpl::_1>>
+                        synchrotronRadiation;
+                    synchrotronRadiation(cellDescription, step, functions);
+                }
 
-        //! Initialized synchrotron functions
-        particles::synchrotronPhotons::SynchrotronFunctions & functions;
+            private:
+                //! Mapping for kernels
+                MappingDesc cellDescription;
 
-    };
+                //! Initialized synchrotron functions
+                particles::synchrotronPhotons::SynchrotronFunctions& functions;
+            };
 
-} // namespace stage
-} // namespace simulation
+        } // namespace stage
+    } // namespace simulation
 } // namespace picongpu
diff --git a/include/picongpu/simulation_classTypes.hpp b/include/picongpu/simulation_classTypes.hpp
index abf9921976..2606037340 100644
--- a/include/picongpu/simulation_classTypes.hpp
+++ b/include/picongpu/simulation_classTypes.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Felix Schmitt, Heiko Burau, Rene Widera
+/* Copyright 2013-2021 Axel Huebl, Felix Schmitt, Heiko Burau, Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -34,7 +34,7 @@ namespace picongpu
 {
     using namespace pmacc;
 
-    //short name for access verbose types of picongpu
+    // short name for accessing the verbose types of picongpu
     typedef PIConGPUVerbose picLog;
 
-} //namespace picongpu
+} // namespace picongpu
diff --git a/include/picongpu/simulation_defines.hpp b/include/picongpu/simulation_defines.hpp
index 1ab582d3fc..b0f5823f7f 100644
--- a/include/picongpu/simulation_defines.hpp
+++ b/include/picongpu/simulation_defines.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Rene Widera
+/* Copyright 2013-2021 Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -40,7 +40,7 @@ namespace picongpu
 #include <picongpu/_defaultParam.loader>
 #include <picongpu/extensionParam.loader>
 
-//load starter after all user extension
+// load starter after all user extensions
 #include <picongpu/param/starter.param>
 
 #include <picongpu/param/components.param>
@@ -49,5 +49,5 @@ namespace picongpu
 // ##### load unitless
 #include <picongpu/_defaultUnitless.loader>
 #include <picongpu/extensionUnitless.loader>
-//load starter after user extensions and all params are loaded
+// load starter after user extensions and all params are loaded
 #include <picongpu/unitless/starter.unitless>
diff --git a/include/picongpu/simulation_types.hpp b/include/picongpu/simulation_types.hpp
index ce46b4a3df..763dfbcd45 100644
--- a/include/picongpu/simulation_types.hpp
+++ b/include/picongpu/simulation_types.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Felix Schmitt, Heiko Burau, Rene Widera
+/* Copyright 2013-2021 Axel Huebl, Felix Schmitt, Heiko Burau, Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -25,6 +25,7 @@
 #include <pmacc/algorithms/PromoteType.hpp>
 #include <pmacc/meta/ForEach.hpp>
 #include <pmacc/algorithms/math.hpp>
+#include <cupla/device/math.hpp>
 #include <pmacc/traits/GetStringProperties.hpp>
 #include "picongpu/traits/GetMargin.hpp"
 #include <pmacc/traits/GetComponentsType.hpp>
@@ -33,41 +34,32 @@
 
 namespace picongpu
 {
+    //! define all elements which can send and receive
 
-//! define all elements which can send and resive
+    enum CommunicationTag
+    {
+        NO_COMMUNICATION = 0u,
+        FIELD_B = 1u,
+        FIELD_E = 2u,
+        FIELD_J = 3u,
+        FIELD_JRECV = 4u,
+        SPECIES_FIRSTTAG = 42u
+    };
 
-enum CommunicationTag
-{
-    NO_COMMUNICATION = 0u,
-    FIELD_B = 1u,
-    FIELD_E = 2u,
-    FIELD_J = 3u,
-    FIELD_JRECV = 4u,
-    SPECIES_FIRSTTAG = 42u
-};
-
-
-//! defines field types some various methods (e.g. Laser::manipulate)
-
-enum FieldType
-{
-    FIELD_TYPE_E, FIELD_TYPE_B, FIELD_TYPE_TMP
-};
-
-namespace precision32Bit
-{
-using precisionType = float;
-}
+    namespace precision32Bit
+    {
+        using precisionType = float;
+    }
 
-namespace precision64Bit
-{
-using precisionType = double;
-}
+    namespace precision64Bit
+    {
+        using precisionType = double;
+    }
 
-namespace math = pmacc::algorithms::math;
-using namespace pmacc::algorithms::precisionCast;
-using namespace pmacc::algorithms::promoteType;
-using namespace pmacc::traits;
-using namespace picongpu::traits;
+    namespace math = cupla::device::math;
+    using namespace pmacc::algorithms::precisionCast;
+    using namespace pmacc::algorithms::promoteType;
+    using namespace pmacc::traits;
+    using namespace picongpu::traits;
 
-}
+} // namespace picongpu
diff --git a/include/picongpu/traits/AdiosToPIC.hpp b/include/picongpu/traits/AdiosToPIC.hpp
index 8d313c507c..9069fe644a 100644
--- a/include/picongpu/traits/AdiosToPIC.hpp
+++ b/include/picongpu/traits/AdiosToPIC.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Felix Schmitt
+/* Copyright 2013-2021 Axel Huebl, Felix Schmitt
  *
  * This file is part of PIConGPU.
  *
@@ -23,20 +23,19 @@
 
 namespace picongpu
 {
+    namespace traits
+    {
+        /** Convert an Adios type to a PIConGPU Type
+         *
+         * implements a public type as result of the trait
+         *
+         * @tparam T_AdiosType Adios data type
+         */
+        template<typename T_AdiosType>
+        struct AdiosToPIC;
 
-namespace traits
-{
-    /** Convert an Adios type to a PIConGPU Type
-     *
-     * implements a public type as result of the trait
-     *
-     * @tparam T_AdiosType Adios data type
-     */
-    template<typename T_AdiosType>
-    struct AdiosToPIC;
-
-} //namespace traits
+    } // namespace traits
 
-}// namespace picongpu
+} // namespace picongpu
 
 #include "AdiosToPIC.tpp"
diff --git a/include/picongpu/traits/AdiosToPIC.tpp b/include/picongpu/traits/AdiosToPIC.tpp
index e82c6c8c3f..5274c7071b 100644
--- a/include/picongpu/traits/AdiosToPIC.tpp
+++ b/include/picongpu/traits/AdiosToPIC.tpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Felix Schmitt, Alexander Debus
+/* Copyright 2013-2021 Axel Huebl, Felix Schmitt, Alexander Debus
  *
  * This file is part of PIConGPU.
  *
@@ -19,67 +19,65 @@
 
 #pragma once
 
-#if (ENABLE_ADIOS==1)
-#include <adios.h>
+#if(ENABLE_ADIOS == 1)
+#    include <adios.h>
 
-#include "picongpu/simulation_defines.hpp"
+#    include "picongpu/simulation_defines.hpp"
 
 namespace picongpu
 {
-
-namespace traits
-{
-
-    template<>
-    struct AdiosToPIC<adios_short>
-    {
-        typedef int16_t type;
-    };
-
-    template<>
-    struct AdiosToPIC<adios_unsigned_short>
+    namespace traits
     {
-        typedef uint16_t type;
-    };
-
-    template<>
-    struct AdiosToPIC<adios_integer>
-    {
-        typedef int32_t type;
-    };
-
-    template<>
-    struct AdiosToPIC<adios_unsigned_integer>
-    {
-        typedef uint32_t type;
-    };
-
-    template<>
-    struct AdiosToPIC<adios_long>
-    {
-        typedef int64_t type;
-    };
-
-    template<>
-    struct AdiosToPIC<adios_unsigned_long>
-    {
-        typedef uint64_t type;
-    };
-
-    template<>
-    struct AdiosToPIC<adios_real>
-    {
-        typedef float_32 type;
-    };
-
-    template<>
-    struct AdiosToPIC<adios_double>
-    {
-        typedef float_64 type;
-    };
-
-} //namespace traits
-
-}// namespace picongpu
+        template<>
+        struct AdiosToPIC<adios_short>
+        {
+            typedef int16_t type;
+        };
+
+        template<>
+        struct AdiosToPIC<adios_unsigned_short>
+        {
+            typedef uint16_t type;
+        };
+
+        template<>
+        struct AdiosToPIC<adios_integer>
+        {
+            typedef int32_t type;
+        };
+
+        template<>
+        struct AdiosToPIC<adios_unsigned_integer>
+        {
+            typedef uint32_t type;
+        };
+
+        template<>
+        struct AdiosToPIC<adios_long>
+        {
+            typedef int64_t type;
+        };
+
+        template<>
+        struct AdiosToPIC<adios_unsigned_long>
+        {
+            typedef uint64_t type;
+        };
+
+        template<>
+        struct AdiosToPIC<adios_real>
+        {
+            typedef float_32 type;
+        };
+
+        template<>
+        struct AdiosToPIC<adios_double>
+        {
+            typedef float_64 type;
+        };
+
+    } // namespace traits
+
+} // namespace picongpu
 
 #endif // (ENABLE_ADIOS==1)
diff --git a/include/picongpu/traits/FieldPosition.hpp b/include/picongpu/traits/FieldPosition.hpp
index 6f8ae40bf8..d463db31a1 100644
--- a/include/picongpu/traits/FieldPosition.hpp
+++ b/include/picongpu/traits/FieldPosition.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Heiko Burau, Rene Widera
+/* Copyright 2013-2021 Axel Huebl, Heiko Burau, Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -24,14 +24,10 @@
 
 namespace picongpu
 {
-namespace traits
-{
-    template<
-        typename T_CellType,
-        typename T_Field,
-        uint32_t T_simDim = simDim
-    >
-    struct FieldPosition;
+    namespace traits
+    {
+        template<typename T_CellType, typename T_Field, uint32_t T_simDim = simDim>
+        struct FieldPosition;
 
-} // namespace traits
+    } // namespace traits
 } // namespace picongpu
diff --git a/include/picongpu/traits/GetCellType.hpp b/include/picongpu/traits/GetCellType.hpp
new file mode 100644
index 0000000000..a446c0ebbe
--- /dev/null
+++ b/include/picongpu/traits/GetCellType.hpp
@@ -0,0 +1,46 @@
+/* Copyright 2020-2021 Sergei Bastrakov
+ *
+ * This file is part of PIConGPU.
+ *
+ * PIConGPU is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * PIConGPU is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with PIConGPU.
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+
+namespace picongpu
+{
+    namespace traits
+    {
+        /** Trait for cell type of a field solver
+         *
+         * Defines the resulting type as ::type.
+         * By default falls back to T_FieldSolver::CellType.
+         *
+         * Note: it was originally intended to be put into a new namespace
+         * picongpu::fields::traits, but this was not possible due to conflicts
+         * with pmacc name lookup.
+         *
+         * @tparam T_FieldSolver field solver type
+         */
+        template<typename T_FieldSolver>
+        struct GetCellType
+        {
+            //! Cell type, one of fields::cellType:: types
+            using type = typename T_FieldSolver::CellType;
+        };
+
+    } // namespace traits
+} // namespace picongpu
diff --git a/include/picongpu/traits/GetDataBoxType.hpp b/include/picongpu/traits/GetDataBoxType.hpp
index 572feb8b3e..f94727257d 100644
--- a/include/picongpu/traits/GetDataBoxType.hpp
+++ b/include/picongpu/traits/GetDataBoxType.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2015-2020 Rene Widera
+/* Copyright 2015-2021 Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -22,15 +22,15 @@
 
 namespace picongpu
 {
-namespace traits
-{
-/** Get data box type of a buffer
- *
- * \tparam T_Type type from which you need the DataBoxType
- * \treturn ::type
- */
-template<typename T_Type>
-struct GetDataBoxType;
+    namespace traits
+    {
+        /** Get data box type of a buffer
+         *
+         * \tparam T_Type type from which you need the DataBoxType
+         * \treturn ::type
+         */
+        template<typename T_Type>
+        struct GetDataBoxType;
 
-} //namespace traits
-}// namespace picongpu
+    } // namespace traits
+} // namespace picongpu
diff --git a/include/picongpu/traits/GetMargin.hpp b/include/picongpu/traits/GetMargin.hpp
index ffa0cde79d..ec82fceee8 100644
--- a/include/picongpu/traits/GetMargin.hpp
+++ b/include/picongpu/traits/GetMargin.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Rene Widera
+/* Copyright 2013-2021 Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -23,37 +23,36 @@
 
 namespace picongpu
 {
-
-namespace traits
-{
-/**Get margin of a solver
- * class must define a LowerMargin and UpperMargin for any valid solver
- *
- * \tparam Solver solver which needs ghost cells for solving a problem
- *         if solver not define `LowerMargin` and `UpperMargin` this trait (GetMargin)
- *         must be specialized
- * \tparam SubSetName a optional name (id) if solver needs different ghost cells
- * for different objects
- */
-template<class Solver,unsigned int SubSetName=0>
-struct GetMargin
-{
-    using LowerMargin = typename Solver::LowerMargin;
-    using UpperMargin = typename Solver::UpperMargin;
-};
-
-template<typename T_Type>
-struct GetLowerMargin
-{
-    typedef typename traits::GetMargin<T_Type>::LowerMargin type;
-};
-
-template<typename T_Type>
-struct GetUpperMargin
-{
-    typedef typename traits::GetMargin<T_Type>::UpperMargin type;
-};
-
-} //namespace traits
-
-}// namespace picongpu
+    namespace traits
+    {
+        /** Get the margins of a solver
+         * Any valid solver class must define a LowerMargin and an UpperMargin
+         *
+         * \tparam Solver solver which needs ghost cells for solving a problem;
+         *         if the solver does not define `LowerMargin` and `UpperMargin`, this trait
+         *         (GetMargin) must be specialized
+         * \tparam SubSetName an optional name (id) if the solver needs different ghost cells
+         *         for different objects
+         */
+        template<class Solver, unsigned int SubSetName = 0>
+        struct GetMargin
+        {
+            using LowerMargin = typename Solver::LowerMargin;
+            using UpperMargin = typename Solver::UpperMargin;
+        };
+
+        template<typename T_Type>
+        struct GetLowerMargin
+        {
+            typedef typename traits::GetMargin<T_Type>::LowerMargin type;
+        };
+
+        template<typename T_Type>
+        struct GetUpperMargin
+        {
+            typedef typename traits::GetMargin<T_Type>::UpperMargin type;
+        };
+
+    } // namespace traits
+
+} // namespace picongpu
diff --git a/include/picongpu/traits/IsFieldDomainBound.hpp b/include/picongpu/traits/IsFieldDomainBound.hpp
index e26ff54a52..76199b797d 100644
--- a/include/picongpu/traits/IsFieldDomainBound.hpp
+++ b/include/picongpu/traits/IsFieldDomainBound.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2020 Sergei Bastrakov
+/* Copyright 2020-2021 Sergei Bastrakov
  *
  * This file is part of PIConGPU.
  *
@@ -26,20 +26,19 @@
 
 namespace picongpu
 {
-namespace traits
-{
-
-    /** Whether a field is geometrically bound to the domain decomposition
-     *  with respect to size, guard size, and offset
-     *
-     * Inherits std::true_type, std::false_type or a compatible type.
-     *
-     * @tparam T_Field field type
-     */
-    template< typename T_Field >
-    struct IsFieldDomainBound : std::true_type
+    namespace traits
     {
-    };
+        /** Whether a field is geometrically bound to the domain decomposition
+         *  with respect to size, guard size, and offset
+         *
+         * Inherits std::true_type, std::false_type or a compatible type.
+         *
+         * @tparam T_Field field type
+         */
+        template<typename T_Field>
+        struct IsFieldDomainBound : std::true_type
+        {
+        };
 
-} // namespace traits
+    } // namespace traits
 } // namespace picongpu
diff --git a/include/picongpu/traits/PICToAdios.hpp b/include/picongpu/traits/PICToAdios.hpp
index b0a2550686..fed4a6ee36 100644
--- a/include/picongpu/traits/PICToAdios.hpp
+++ b/include/picongpu/traits/PICToAdios.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Felix Schmitt
+/* Copyright 2013-2021 Axel Huebl, Felix Schmitt
  *
  * This file is part of PIConGPU.
  *
@@ -23,19 +23,18 @@
 
 namespace picongpu
 {
+    namespace traits
+    {
+        /** Convert a PIConGPU Type to an Adios data type
+         *
+         * \tparam T_Type Typename in PIConGPU
+         * \return \p ::type as public typedef of an Adios type
+         */
+        template<typename T_Type>
+        struct PICToAdios;
 
-namespace traits
-{
-    /** Convert a PIConGPU Type to an Adios data type
-     *
-     * \tparam T_Type Typename in PIConGPU
-     * \return \p ::type as public typedef of an Adios type
-     */
-    template<typename T_Type>
-    struct PICToAdios;
-
-} //namespace traits
+    } // namespace traits
 
-}// namespace picongpu
+} // namespace picongpu
 
 #include "PICToAdios.tpp"
diff --git a/include/picongpu/traits/PICToAdios.tpp b/include/picongpu/traits/PICToAdios.tpp
index 2da5b1db4d..2581459f44 100644
--- a/include/picongpu/traits/PICToAdios.tpp
+++ b/include/picongpu/traits/PICToAdios.tpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Felix Schmitt, Alexander Debus
+/* Copyright 2013-2021 Axel Huebl, Felix Schmitt, Alexander Debus
  *
  * This file is part of PIConGPU.
  *
@@ -19,124 +19,125 @@
 
 #pragma once
 
-#if (ENABLE_ADIOS==1)
-#include <adios.h>
+#if(ENABLE_ADIOS == 1)
+#    include <adios.h>
 
-#include "picongpu/simulation_defines.hpp"
-#include <boost/mpl/if.hpp>
-#include <boost/type_traits.hpp>
+#    include "picongpu/simulation_defines.hpp"
+#    include <boost/mpl/if.hpp>
+#    include <boost/type_traits.hpp>
 
 namespace picongpu
 {
-
-namespace traits
-{
-    template<>
-    struct PICToAdios<bool>
-    {
-        ADIOS_DATATYPES type;
-
-        PICToAdios() :
-        type(adios_unsigned_byte) {}
-
-        PMACC_STATIC_ASSERT_MSG(
-            sizeof(bool) == 1,
-            ADIOS_Plugin__Can_not_find_a_one_byte_representation_of_bool
-        );
-    };
-
-    template<>
-    struct PICToAdios<int16_t>
-    {
-        ADIOS_DATATYPES type;
-
-        PICToAdios() :
-        type(adios_short) {}
-    };
-
-    template<>
-    struct PICToAdios<uint16_t>
-    {
-        ADIOS_DATATYPES type;
-
-        PICToAdios() :
-        type(adios_unsigned_short) {}
-    };
-
-    template<>
-    struct PICToAdios<int32_t>
-    {
-        ADIOS_DATATYPES type;
-
-        PICToAdios() :
-        type(adios_integer) {}
-    };
-
-    template<>
-    struct PICToAdios<uint32_t>
-    {
-        ADIOS_DATATYPES type;
-
-        PICToAdios() :
-        type(adios_unsigned_integer) {}
-    };
-
-    template<>
-    struct PICToAdios<int64_t>
-    {
-        ADIOS_DATATYPES type;
-
-        PICToAdios() :
-        type(adios_long) {}
-    };
-
-    template<>
-    struct PICToAdios<uint64_t>
+    namespace traits
     {
-        ADIOS_DATATYPES type;
-
-        PICToAdios() :
-        type(adios_unsigned_long) {}
-    };
-
-    /** Specialization for uint64_cu.
-     *  If uint64_cu happens to be the same as uint64_t we use an unused dummy type
-     *  to avoid duplicate specialization
-     */
-    struct uint64_cu_unused_adios;
-    template<>
-    struct PICToAdios<
-                        typename bmpl::if_<
-                            typename bmpl::or_<
-                                boost::is_same<uint64_t, uint64_cu>,
-                                bmpl::bool_<sizeof(uint64_cu) != sizeof(uint64_t)>
-                            >::type,
-                            uint64_cu_unused_adios,
-                            uint64_cu
-                        >::type
-                     >: public PICToAdios<uint64_t>
-    {};
-
-    template<>
-    struct PICToAdios<float_32>
-    {
-        ADIOS_DATATYPES type;
-
-        PICToAdios() :
-        type(adios_real) {}
-    };
-
-    template<>
-    struct PICToAdios<float_64>
-    {
-        ADIOS_DATATYPES type;
-
-        PICToAdios() :
-        type(adios_double) {}
-    };
-
-} //namespace traits
-
-}// namespace picongpu
+        template<>
+        struct PICToAdios<bool>
+        {
+            ADIOS_DATATYPES type;
+
+            PICToAdios() : type(adios_unsigned_byte)
+            {
+            }
+
+            PMACC_STATIC_ASSERT_MSG(sizeof(bool) == 1, ADIOS_Plugin__Can_not_find_a_one_byte_representation_of_bool);
+        };
+
+        template<>
+        struct PICToAdios<int16_t>
+        {
+            ADIOS_DATATYPES type;
+
+            PICToAdios() : type(adios_short)
+            {
+            }
+        };
+
+        template<>
+        struct PICToAdios<uint16_t>
+        {
+            ADIOS_DATATYPES type;
+
+            PICToAdios() : type(adios_unsigned_short)
+            {
+            }
+        };
+
+        template<>
+        struct PICToAdios<int32_t>
+        {
+            ADIOS_DATATYPES type;
+
+            PICToAdios() : type(adios_integer)
+            {
+            }
+        };
+
+        template<>
+        struct PICToAdios<uint32_t>
+        {
+            ADIOS_DATATYPES type;
+
+            PICToAdios() : type(adios_unsigned_integer)
+            {
+            }
+        };
+
+        template<>
+        struct PICToAdios<int64_t>
+        {
+            ADIOS_DATATYPES type;
+
+            PICToAdios() : type(adios_long)
+            {
+            }
+        };
+
+        template<>
+        struct PICToAdios<uint64_t>
+        {
+            ADIOS_DATATYPES type;
+
+            PICToAdios() : type(adios_unsigned_long)
+            {
+            }
+        };
+
+        /** Specialization for uint64_cu.
+         *  If uint64_cu happens to be the same as uint64_t we use an unused dummy type
+         *  to avoid duplicate specialization
+         */
+        struct uint64_cu_unused_adios;
+        template<>
+        struct PICToAdios<typename bmpl::if_<
+            typename bmpl::
+                or_<boost::is_same<uint64_t, uint64_cu>, bmpl::bool_<sizeof(uint64_cu) != sizeof(uint64_t)>>::type,
+            uint64_cu_unused_adios,
+            uint64_cu>::type> : public PICToAdios<uint64_t>
+        {
+        };
+
+        template<>
+        struct PICToAdios<float_32>
+        {
+            ADIOS_DATATYPES type;
+
+            PICToAdios() : type(adios_real)
+            {
+            }
+        };
+
+        template<>
+        struct PICToAdios<float_64>
+        {
+            ADIOS_DATATYPES type;
+
+            PICToAdios() : type(adios_double)
+            {
+            }
+        };
+
+    } // namespace traits
+
+} // namespace picongpu
 
 #endif // (ENABLE_ADIOS==1)
diff --git a/include/picongpu/traits/PICToOpenPMD.hpp b/include/picongpu/traits/PICToOpenPMD.hpp
index 719347a7c0..35ca277eb5 100644
--- a/include/picongpu/traits/PICToOpenPMD.hpp
+++ b/include/picongpu/traits/PICToOpenPMD.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2016-2020 Axel Huebl
+/* Copyright 2016-2021 Axel Huebl
  *
  * This file is part of PIConGPU.
  *
@@ -30,23 +30,23 @@
 
 namespace picongpu
 {
-namespace traits
-{
-    /** Reinterpret attributes for openPMD
-     *
-     * Currently, this conversion tables are used to translate the PIConGPU
-     * totalCellIdx (unitless cell index) to the openPMD positionOffset (length)
-     */
-    template<typename T_Identifier>
-    struct OpenPMDName;
-
-    template<typename T_Identifier>
-    struct OpenPMDUnit;
-
-    template<typename T_Identifier>
-    struct OpenPMDUnitDimension;
-
-} // namespace traits
+    namespace traits
+    {
+        /** Reinterpret attributes for openPMD
+         *
+         * Currently, these conversion tables are used to translate the PIConGPU
+         * totalCellIdx (unitless cell index) to the openPMD positionOffset (length)
+         */
+        template<typename T_Identifier>
+        struct OpenPMDName;
+
+        template<typename T_Identifier>
+        struct OpenPMDUnit;
+
+        template<typename T_Identifier>
+        struct OpenPMDUnitDimension;
+
+    } // namespace traits
 } // namespace picongpu
 
 #include "PICToOpenPMD.tpp"
diff --git a/include/picongpu/traits/PICToOpenPMD.tpp b/include/picongpu/traits/PICToOpenPMD.tpp
index 3a0a780171..15e1ef3b33 100644
--- a/include/picongpu/traits/PICToOpenPMD.tpp
+++ b/include/picongpu/traits/PICToOpenPMD.tpp
@@ -1,4 +1,4 @@
-/* Copyright 2016-2020 Axel Huebl
+/* Copyright 2016-2021 Axel Huebl
  *
  * This file is part of PIConGPU.
  *
@@ -21,103 +21,103 @@
 
 namespace picongpu
 {
-namespace traits
-{
-    /** Forward names that are identical in PIConGPU & openPMD
-     */
-    template<typename T_Identifier>
-    struct OpenPMDName
+    namespace traits
     {
-        std::string operator()() const
+        /** Forward names that are identical in PIConGPU & openPMD
+         */
+        template<typename T_Identifier>
+        struct OpenPMDName
         {
-            return T_Identifier::getName();
-        }
-    };
+            std::string operator()() const
+            {
+                return T_Identifier::getName();
+            }
+        };
 
-    /** Translate the totalCellIdx (unitless index) into the openPMD
-     *  positionOffset (3D position vector, length)
-     */
-    template<>
-    struct OpenPMDName<totalCellIdx>
-    {
-        std::string operator()() const
+        /** Translate the totalCellIdx (unitless index) into the openPMD
+         *  positionOffset (3D position vector, length)
+         */
+        template<>
+        struct OpenPMDName<totalCellIdx>
         {
-            return std::string("positionOffset");
-        }
-    };
+            std::string operator()() const
+            {
+                return std::string("positionOffset");
+            }
+        };
 
-    /** Translate the particleId (unitless, global) into the openPMD
-     *  id (unitless, global)
-     */
-    template<>
-    struct OpenPMDName<particleId>
-    {
-        std::string operator()() const
+        /** Translate the particleId (unitless, global) into the openPMD
+         *  id (unitless, global)
+         */
+        template<>
+        struct OpenPMDName<particleId>
         {
-            return std::string("id");
-        }
-    };
+            std::string operator()() const
+            {
+                return std::string("id");
+            }
+        };
 
-    /** Forward units that are identical in PIConGPU & openPMD
-     */
-    template<typename T_Identifier>
-    struct OpenPMDUnit
-    {
-        std::vector<double> operator()() const
+        /** Forward units that are identical in PIConGPU & openPMD
+         */
+        template<typename T_Identifier>
+        struct OpenPMDUnit
         {
-            return Unit<T_Identifier>::get();
-        }
-    };
+            std::vector<double> operator()() const
+            {
+                return Unit<T_Identifier>::get();
+            }
+        };
 
-    /** the totalCellIdx can be converted into a positionOffset
-     *  until the beginning of the cell by multiplying with the component-wise
-     *  cell size in SI
-     */
-    template<>
-    struct OpenPMDUnit<totalCellIdx>
-    {
-        std::vector<double> operator()() const
+        /** the totalCellIdx can be converted into a positionOffset
+         *  until the beginning of the cell by multiplying with the component-wise
+         *  cell size in SI
+         */
+        template<>
+        struct OpenPMDUnit<totalCellIdx>
         {
-            std::vector<double> unit(simDim);
-            /* cell positionOffset needs two transformations to get to SI:
-               cell begin -> dimensionless scaling to grid -> SI */
-            for( uint32_t i=0; i < simDim; ++i )
-                unit[i] = cellSize[i] * UNIT_LENGTH;
+            std::vector<double> operator()() const
+            {
+                std::vector<double> unit(simDim);
+                /* cell positionOffset needs two transformations to get to SI:
+                   cell begin -> dimensionless scaling to grid -> SI */
+                for(uint32_t i = 0; i < simDim; ++i)
+                    unit[i] = cellSize[i] * UNIT_LENGTH;
 
-            return unit;
-        }
-    };
+                return unit;
+            }
+        };
 
-    /** Forward dimensionalities that are identical in PIConGPU & openPMD
-     */
-    template<typename T_Identifier>
-    struct OpenPMDUnitDimension
-    {
-        std::vector<float_64> operator()() const
+        /** Forward dimensionalities that are identical in PIConGPU & openPMD
+         */
+        template<typename T_Identifier>
+        struct OpenPMDUnitDimension
         {
-            return UnitDimension<T_Identifier>::get();
-        }
-    };
+            std::vector<float_64> operator()() const
+            {
+                return UnitDimension<T_Identifier>::get();
+            }
+        };
 
-    /** the openPMD positionOffset is an actual (vector) with a lengths that
-     *  is added to the position (vector) attribute
-     */
-    template<>
-    struct OpenPMDUnitDimension<totalCellIdx>
-    {
-        std::vector<float_64> operator()() const
+        /** the openPMD positionOffset is an actual (vector) with a length that
+         *  is added to the position (vector) attribute
+         */
+        template<>
+        struct OpenPMDUnitDimension<totalCellIdx>
         {
-            /* L, M, T, I, theta, N, J
-             *
-             * positionOffset is in meter: m
-             *   -> L
-             */
-            std::vector<float_64> unitDimension( NUnitDimension, 0.0 );
-            unitDimension.at(SIBaseUnits::length) = 1.0;
+            std::vector<float_64> operator()() const
+            {
+                /* L, M, T, I, theta, N, J
+                 *
+                 * positionOffset is in meter: m
+                 *   -> L
+                 */
+                std::vector<float_64> unitDimension(NUnitDimension, 0.0);
+                unitDimension.at(SIBaseUnits::length) = 1.0;
 
-            return unitDimension;
-        }
-    };
+                return unitDimension;
+            }
+        };
 
-} // namespace traits
+    } // namespace traits
 } // namespace picongpu
diff --git a/include/picongpu/traits/PICToSplash.hpp b/include/picongpu/traits/PICToSplash.hpp
index 11238bdff3..3829be1850 100644
--- a/include/picongpu/traits/PICToSplash.hpp
+++ b/include/picongpu/traits/PICToSplash.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl
+/* Copyright 2013-2021 Axel Huebl
  *
  * This file is part of PIConGPU.
  *
@@ -23,19 +23,18 @@
 
 namespace picongpu
 {
+    namespace traits
+    {
+        /** Convert a PIConGPU Type to a Splash CollectionType
+         *
+         * \tparam T_Type Typename in PIConGPU
+         * \return \p ::type as public typedef of a Splash CollectionType
+         */
+        template<typename T_Type>
+        struct PICToSplash;
 
-namespace traits
-{
-    /** Convert a PIConGPU Type to a Splash CollectionType
-     *
-     * \tparam T_Type Typename in PIConGPU
-     * \return \p ::type as public typedef of a Splash CollectionType
-     */
-    template<typename T_Type>
-    struct PICToSplash;
-
-} //namespace traits
+    } // namespace traits
 
-}// namespace picongpu
+} // namespace picongpu
 
 #include "PICToSplash.tpp"
diff --git a/include/picongpu/traits/PICToSplash.tpp b/include/picongpu/traits/PICToSplash.tpp
index a0e14055c0..ce7dec98c8 100644
--- a/include/picongpu/traits/PICToSplash.tpp
+++ b/include/picongpu/traits/PICToSplash.tpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl
+/* Copyright 2013-2021 Axel Huebl
  *
  * This file is part of PIConGPU.
  *
@@ -19,99 +19,93 @@
 
 #pragma once
 
-#if (ENABLE_HDF5==1)
-#include <splash/splash.h>
+#if(ENABLE_HDF5 == 1)
+#    include <splash/splash.h>
 
-#include "picongpu/simulation_defines.hpp"
-#include <boost/mpl/if.hpp>
-#include <boost/type_traits.hpp>
+#    include "picongpu/simulation_defines.hpp"
+#    include <boost/mpl/if.hpp>
+#    include <boost/type_traits.hpp>
 
 namespace picongpu
 {
-
-namespace traits
-{
-
-    template<>
-    struct PICToSplash<bool>
-    {
-        typedef splash::ColTypeBool type;
-    };
-
-    template<>
-    struct PICToSplash<float_32>
-    {
-        typedef splash::ColTypeFloat type;
-    };
-
-    template<>
-    struct PICToSplash<float_64>
-    {
-        typedef splash::ColTypeDouble type;
-    };
-
-    template<>
-    struct PICToSplash<int16_t>
-    {
-        typedef splash::ColTypeInt16 type;
-    };
-
-    template<>
-    struct PICToSplash<uint16_t>
+    namespace traits
     {
-        typedef splash::ColTypeUInt16 type;
-    };
-
-    template<>
-    struct PICToSplash<int32_t>
-    {
-        typedef splash::ColTypeInt32 type;
-    };
-
-    template<>
-    struct PICToSplash<uint32_t>
-    {
-        typedef splash::ColTypeUInt32 type;
-    };
-
-    template<>
-    struct PICToSplash<int64_t>
-    {
-        typedef splash::ColTypeInt64 type;
-    };
-
-    template<>
-    struct PICToSplash<uint64_t>
-    {
-        typedef splash::ColTypeUInt64 type;
-    };
-
-    /** Specialization for uint64_cu.
-     *  If uint64_cu happens to be the same as uint64_t we use an unused dummy type
-     *  to avoid duplicate specialization
-     */
-    struct uint64_cu_unused_splash;
-    template<>
-    struct PICToSplash<
-                        typename bmpl::if_<
-                            typename bmpl::or_<
-                                boost::is_same<uint64_t, uint64_cu>,
-                                bmpl::bool_<sizeof(uint64_cu) != sizeof(uint64_t)>
-                            >::type,
-                            uint64_cu_unused_splash,
-                            uint64_cu
-                        >::type
-                     >: public PICToSplash<uint64_t>
-    {};
-
-    template<>
-    struct PICToSplash<splash::Dimensions>
-    {
-        typedef splash::ColTypeDim type;
-    };
-
-} //namespace traits
-
-}// namespace picongpu
+        template<>
+        struct PICToSplash<bool>
+        {
+            typedef splash::ColTypeBool type;
+        };
+
+        template<>
+        struct PICToSplash<float_32>
+        {
+            typedef splash::ColTypeFloat type;
+        };
+
+        template<>
+        struct PICToSplash<float_64>
+        {
+            typedef splash::ColTypeDouble type;
+        };
+
+        template<>
+        struct PICToSplash<int16_t>
+        {
+            typedef splash::ColTypeInt16 type;
+        };
+
+        template<>
+        struct PICToSplash<uint16_t>
+        {
+            typedef splash::ColTypeUInt16 type;
+        };
+
+        template<>
+        struct PICToSplash<int32_t>
+        {
+            typedef splash::ColTypeInt32 type;
+        };
+
+        template<>
+        struct PICToSplash<uint32_t>
+        {
+            typedef splash::ColTypeUInt32 type;
+        };
+
+        template<>
+        struct PICToSplash<int64_t>
+        {
+            typedef splash::ColTypeInt64 type;
+        };
+
+        template<>
+        struct PICToSplash<uint64_t>
+        {
+            typedef splash::ColTypeUInt64 type;
+        };
+
+        /** Specialization for uint64_cu.
+         *  If uint64_cu happens to be the same as uint64_t we use an unused dummy type
+         *  to avoid duplicate specialization
+         */
+        struct uint64_cu_unused_splash;
+        template<>
+        struct PICToSplash<typename bmpl::if_<
+            typename bmpl::
+                or_<boost::is_same<uint64_t, uint64_cu>, bmpl::bool_<sizeof(uint64_cu) != sizeof(uint64_t)>>::type,
+            uint64_cu_unused_splash,
+            uint64_cu>::type> : public PICToSplash<uint64_t>
+        {
+        };
+
+        template<>
+        struct PICToSplash<splash::Dimensions>
+        {
+            typedef splash::ColTypeDim type;
+        };
+
+    } // namespace traits
+
+} // namespace picongpu
 
 #endif // (ENABLE_HDF5==1)
diff --git a/include/picongpu/traits/SIBaseUnits.hpp b/include/picongpu/traits/SIBaseUnits.hpp
index 126b7d57f7..da21557850 100644
--- a/include/picongpu/traits/SIBaseUnits.hpp
+++ b/include/picongpu/traits/SIBaseUnits.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2015-2020 Axel Huebl
+/* Copyright 2015-2021 Axel Huebl
  *
  * This file is part of PIConGPU.
  *
@@ -21,27 +21,27 @@
 
 namespace picongpu
 {
-namespace traits
-{
-
-    /* openPMD uses the powers of the 7 SI base measures to describe
-     * the unit of a record
-     * \see http://git.io/vROmP */
-    constexpr uint32_t NUnitDimension = 7;
-
-    // pre-C++11 "scoped enumerator" work-around
-    namespace SIBaseUnits {
-    enum SIBaseUnits_t
+    namespace traits
     {
-        length = 0,                   // L
-        mass = 1,                     // M
-        time = 2,                     // T
-        electricCurrent = 3,          // I
-        thermodynamicTemperature = 4, // theta
-        amountOfSubstance = 5,        // N
-        luminousIntensity = 6,        // J
-    };
-    }
+        /* openPMD uses the powers of the 7 SI base measures to describe
+         * the unit of a record
+         * \see http://git.io/vROmP */
+        constexpr uint32_t NUnitDimension = 7;
+
+        // pre-C++11 "scoped enumerator" work-around
+        namespace SIBaseUnits
+        {
+            enum SIBaseUnits_t
+            {
+                length = 0, // L
+                mass = 1, // M
+                time = 2, // T
+                electricCurrent = 3, // I
+                thermodynamicTemperature = 4, // theta
+                amountOfSubstance = 5, // N
+                luminousIntensity = 6, // J
+            };
+        }
 
-} // namespace traits
+    } // namespace traits
 } // namespace picongpu
diff --git a/include/picongpu/traits/SplashToPIC.hpp b/include/picongpu/traits/SplashToPIC.hpp
index f15ac07f07..f0bb259462 100644
--- a/include/picongpu/traits/SplashToPIC.hpp
+++ b/include/picongpu/traits/SplashToPIC.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl
+/* Copyright 2013-2021 Axel Huebl
  *
  * This file is part of PIConGPU.
  *
@@ -23,19 +23,18 @@
 
 namespace picongpu
 {
+    namespace traits
+    {
+        /** Convert a Splash CollectionType to a PIConGPU Type
+         *
+         * \tparam T_SplashType Splash CollectionType
+         * \return \p ::type as public typedef
+         */
+        template<typename T_SplashType>
+        struct SplashToPIC;
 
-namespace traits
-{
-    /** Convert a Splash CollectionType to a PIConGPU Type
-     *
-     * \tparam T_SplashType Splash CollectionType
-     * \return \p ::type as public typedef
-     */
-    template<typename T_SplashType>
-    struct SplashToPIC;
-
-} //namespace traits
+    } // namespace traits
 
-}// namespace picongpu
+} // namespace picongpu
 
 #include "SplashToPIC.tpp"
diff --git a/include/picongpu/traits/SplashToPIC.tpp b/include/picongpu/traits/SplashToPIC.tpp
index 05e9a3df8f..d9c1d3c520 100644
--- a/include/picongpu/traits/SplashToPIC.tpp
+++ b/include/picongpu/traits/SplashToPIC.tpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl
+/* Copyright 2013-2021 Axel Huebl
  *
  * This file is part of PIConGPU.
  *
@@ -19,73 +19,72 @@
 
 #pragma once
 
-#if (ENABLE_HDF5==1)
-#include <splash/splash.h>
+#if(ENABLE_HDF5 == 1)
+#    include <splash/splash.h>
 
-#include "picongpu/simulation_defines.hpp"
+#    include "picongpu/simulation_defines.hpp"
 
 namespace picongpu
 {
-
-namespace traits
-{
-    template<>
-    struct SplashToPIC<splash::ColTypeBool>
-    {
-        typedef bool type;
-    };
-
-    template<>
-    struct SplashToPIC<splash::ColTypeFloat>
-    {
-        typedef float_32 type;
-    };
-
-    template<>
-    struct SplashToPIC<splash::ColTypeDouble>
-    {
-        typedef float_64 type;
-    };
-
-    /** Native int */
-    template<>
-    struct SplashToPIC<splash::ColTypeInt>
-    {
-        typedef int type;
-    };
-
-    template<>
-    struct SplashToPIC<splash::ColTypeInt32>
+    namespace traits
     {
-        typedef int32_t type;
-    };
-
-    template<>
-    struct SplashToPIC<splash::ColTypeUInt32>
-    {
-        typedef uint32_t type;
-    };
-
-    template<>
-    struct SplashToPIC<splash::ColTypeInt64>
-    {
-        typedef int64_t type;
-    };
-
-    template<>
-    struct SplashToPIC<splash::ColTypeUInt64>
-    {
-        typedef uint64_t type;
-    };
-
-    template<>
-    struct SplashToPIC<splash::ColTypeDim>
-    {
-        typedef splash::Dimensions type;
-    };
-
-} //namespace traits
-
-}// namespace picongpu
+        template<>
+        struct SplashToPIC<splash::ColTypeBool>
+        {
+            typedef bool type;
+        };
+
+        template<>
+        struct SplashToPIC<splash::ColTypeFloat>
+        {
+            typedef float_32 type;
+        };
+
+        template<>
+        struct SplashToPIC<splash::ColTypeDouble>
+        {
+            typedef float_64 type;
+        };
+
+        /** Native int */
+        template<>
+        struct SplashToPIC<splash::ColTypeInt>
+        {
+            typedef int type;
+        };
+
+        template<>
+        struct SplashToPIC<splash::ColTypeInt32>
+        {
+            typedef int32_t type;
+        };
+
+        template<>
+        struct SplashToPIC<splash::ColTypeUInt32>
+        {
+            typedef uint32_t type;
+        };
+
+        template<>
+        struct SplashToPIC<splash::ColTypeInt64>
+        {
+            typedef int64_t type;
+        };
+
+        template<>
+        struct SplashToPIC<splash::ColTypeUInt64>
+        {
+            typedef uint64_t type;
+        };
+
+        template<>
+        struct SplashToPIC<splash::ColTypeDim>
+        {
+            typedef splash::Dimensions type;
+        };
+
+    } // namespace traits
+
+} // namespace picongpu
 
 #endif // (ENABLE_HDF5==1)
diff --git a/include/picongpu/traits/Unit.hpp b/include/picongpu/traits/Unit.hpp
index 39f5f7c316..bd96b354e6 100644
--- a/include/picongpu/traits/Unit.hpp
+++ b/include/picongpu/traits/Unit.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Rene Widera
+/* Copyright 2013-2021 Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -23,21 +23,20 @@
 
 namespace picongpu
 {
+    namespace traits
+    {
+        /** Get unit of a datum that is represented by an identifier
+         *
+         * \tparam T_Identifier any PIConGPU identifier
+         * \return \p std::vector<float_64> ::get() as static public method
+         *
+         * Unitless identifiers, see \UnitDimension, can still be scaled by a
+         * factor. If they are not scaled, implement the unit as 1.0;
+         * \see unitless/speciesAttributes.unitless
+         */
+        template<typename T_Identifier>
+        struct Unit;
 
-namespace traits
-{
-    /** Get unit of a date that is represented by an identifier
-     *
-     * \tparam T_Identifier any PIConGPU identifier
-     * \return \p std::vector<float_64> ::get() as static public method
-     *
-     * Unitless identifies, see \UnitDimension, can still be scaled by a
-     * factor. If they are not scaled, implement the unit as 1.0;
-     * \see unitless/speciesAttributes.unitless
-     */
-    template<typename T_Identifier>
-    struct Unit;
-
-} //namespace traits
+    } // namespace traits
 
-}// namespace picongpu
+} // namespace picongpu
diff --git a/include/picongpu/traits/UnitDimension.hpp b/include/picongpu/traits/UnitDimension.hpp
index d48ed14128..b0bbc4d1c5 100644
--- a/include/picongpu/traits/UnitDimension.hpp
+++ b/include/picongpu/traits/UnitDimension.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2015-2020 Axel Huebl
+/* Copyright 2015-2021 Axel Huebl
  *
  * This file is part of PIConGPU.
  *
@@ -23,26 +23,25 @@
 
 namespace picongpu
 {
+    namespace traits
+    {
+        /** Get powers of the seven SI base units of a datum that is represented by an identifier
+         *
+         * Definition must follow the openPMD `unitDimension` definition:
+         * length L, mass M, time T, electric current I, thermodynamic temperature
+         * theta, amount of substance N, luminous intensity J
+         *   \see http://www.openPMD.org
+         *   \see http://dx.doi.org/10.5281/zenodo.33624
+         * Must return a vector of size() == 7, for unitless attributes all
+         * elements are zero.
+         *
+         * \tparam T_Identifier any picongpu identifier
+         * \return \p std::vector<float_64> ::get() as static public method
+         *
+         */
+        template<typename T_Identifier>
+        struct UnitDimension;
 
-namespace traits
-{
-    /** Get power of seven SI base units of date that is represented by an identifier
-     *
-     * Definition must follow the openPMD `unitDimension` definition:
-     * length L, mass M, time T, electric current I, thermodynamic temperature
-     * theta, amount of substance N, luminous intensity J
-     *   \see http://www.openPMD.org
-     *   \see http://dx.doi.org/10.5281/zenodo.33624
-     * Must return a vector of size() == 7, for unitless attributes all
-     * elements are zero.
-     *
-     * \tparam T_Identifier any picongpu identifier
-     * \return \p std::vector<float_64> ::get() as static public method
-     *
-     */
-    template<typename T_Identifier>
-    struct UnitDimension;
-
-} /* namespace traits */
+    } /* namespace traits */
 
 } /* namespace picongpu */
diff --git a/include/picongpu/traits/UsesRNG.hpp b/include/picongpu/traits/UsesRNG.hpp
deleted file mode 100644
index 78ef500102..0000000000
--- a/include/picongpu/traits/UsesRNG.hpp
+++ /dev/null
@@ -1,42 +0,0 @@
-/* Copyright 2016-2020 Marco Garten, Rene Widera
- *
- * This file is part of PIConGPU.
- *
- * PIConGPU is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * PIConGPU is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with PIConGPU.
- * If not, see <http://www.gnu.org/licenses/>.
- */
-
-#pragma once
-
-#include <boost/type_traits/integral_constant.hpp>
-
-namespace picongpu
-{
-namespace traits
-{
-
-/** Checks if an object requires the RNG
- *
- * @tparam T_Object any object (class or typename)
- *
- * This struct must inherit from (boost::true_type/false_type)
- */
-template<typename T_Object>
-struct UsesRNG : public boost::false_type
-{
-};
-
-}// namespace traits
-
-}// namespace picongpu
diff --git a/include/picongpu/traits/attribute/GetCharge.hpp b/include/picongpu/traits/attribute/GetCharge.hpp
index 05a59c2cc3..3f0d341016 100644
--- a/include/picongpu/traits/attribute/GetCharge.hpp
+++ b/include/picongpu/traits/attribute/GetCharge.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2014-2020 Rene Widera, Axel Huebl
+/* Copyright 2014-2021 Rene Widera, Axel Huebl
  *
  * This file is part of PIConGPU.
  *
@@ -29,93 +29,82 @@
 
 namespace picongpu
 {
-namespace traits
-{
-namespace attribute
-{
-namespace detail
-{
-
-/** Calculate the real charge of a particle
- *
- * use attribute `boundElectrons` and the proton number from
- * flag `atomicNumbers` to calculate the charge
- *
- * \tparam T_HasBoundElectrons boolean that describes if species allows multiple charge states
- * due to bound electrons
- */
-template<bool T_HasBoundElectrons>
-struct LoadBoundElectrons
-{
-    /** Functor implementation
-     *
-     * \tparam T_Particle particle type
-     * \param weighting the particle's weighting
-     * \param particle particle reference
-     */
-    template<typename T_Particle>
-    HDINLINE float_X operator()(const float_X weighting, const T_Particle& particle)
+    namespace traits
     {
-        using HasAtomicNumbers = typename pmacc::traits::HasFlag<
-            T_Particle,
-            atomicNumbers<>
-        >::type;
-        PMACC_CASSERT_MSG_TYPE(
-            Having_boundElectrons_particle_attribute_requires_atomicNumbers_flag,
-            T_Particle,
-            HasAtomicNumbers::value
-        );
-        const float_X protonNumber = GetAtomicNumbers<T_Particle>::type::numberOfProtons;
+        namespace attribute
+        {
+            namespace detail
+            {
+                /** Calculate the real charge of a particle
+                 *
+                 * use attribute `boundElectrons` and the proton number from
+                 * flag `atomicNumbers` to calculate the charge
+                 *
+                 * \tparam T_HasBoundElectrons boolean that describes if species allows multiple charge states
+                 * due to bound electrons
+                 */
+                template<bool T_HasBoundElectrons>
+                struct LoadBoundElectrons
+                {
+                    /** Functor implementation
+                     *
+                     * \tparam T_Particle particle type
+                     * \param weighting the particle's weighting
+                     * \param particle particle reference
+                     */
+                    template<typename T_Particle>
+                    HDINLINE float_X operator()(const float_X weighting, const T_Particle& particle)
+                    {
+                        using HasAtomicNumbers = typename pmacc::traits::HasFlag<T_Particle, atomicNumbers<>>::type;
+                        PMACC_CASSERT_MSG_TYPE(
+                            Having_boundElectrons_particle_attribute_requires_atomicNumbers_flag,
+                            T_Particle,
+                            HasAtomicNumbers::value);
+                        const float_X protonNumber = GetAtomicNumbers<T_Particle>::type::numberOfProtons;
 
-        /* note: ELECTRON_CHARGE is negative and the second term is also negative
-         */
-        return
-            ELECTRON_CHARGE *
-            ( particle[boundElectrons_] - protonNumber ) *
-            weighting;
-    }
-};
+                        /* note: ELECTRON_CHARGE is negative and the second term is also negative
+                         */
+                        return ELECTRON_CHARGE * (particle[boundElectrons_] - protonNumber) * weighting;
+                    }
+                };
 
-/**  Calculate the real charge of a particle
- *
- * This is the fallback implementation if no `boundElectrons` are available for a particle
- */
-template<>
-struct LoadBoundElectrons<false>
-{
-    /** Functor implementation
-     *
-     * \tparam T_Particle particle type
-     * \param weighting the particle's weighting
-     * \param particle particle reference
-     */
-    template<typename T_Particle>
-    HDINLINE float_X operator()(const float_X weighting, const T_Particle&)
-    {
-        return frame::getCharge< typename T_Particle::FrameType >() * weighting;
-    }
-};
-} // namespace detail
+                /**  Calculate the real charge of a particle
+                 *
+                 * This is the fallback implementation if no `boundElectrons` are available for a particle
+                 */
+                template<>
+                struct LoadBoundElectrons<false>
+                {
+                    /** Functor implementation
+                     *
+                     * \tparam T_Particle particle type
+                     * \param weighting the particle's weighting
+                     * \param particle particle reference
+                     */
+                    template<typename T_Particle>
+                    HDINLINE float_X operator()(const float_X weighting, const T_Particle&)
+                    {
+                        return frame::getCharge<typename T_Particle::FrameType>() * weighting;
+                    }
+                };
+            } // namespace detail
 
-/** get the charge of a macro particle
- *
- * This function trait considers the `boundElectrons` attribute if it is set
- *
- * @param weighting weighting of the particle
- * @param particle a reference to a particle
- * @return charge of the macro particle
- */
-template<typename T_Particle>
-HDINLINE float_X getCharge(const float_X weighting, const T_Particle& particle)
-{
-    using ParticleType = T_Particle;
-    typedef typename pmacc::traits::HasIdentifier<ParticleType, boundElectrons>::type hasBoundElectrons;
-    return detail::LoadBoundElectrons<hasBoundElectrons::value >()(
-        weighting,
-        particle
-    );
-}
+            /** get the charge of a macro particle
+             *
+             * This function trait considers the `boundElectrons` attribute if it is set
+             *
+             * @param weighting weighting of the particle
+             * @param particle a reference to a particle
+             * @return charge of the macro particle
+             */
+            template<typename T_Particle>
+            HDINLINE float_X getCharge(const float_X weighting, const T_Particle& particle)
+            {
+                using ParticleType = T_Particle;
+                typedef typename pmacc::traits::HasIdentifier<ParticleType, boundElectrons>::type hasBoundElectrons;
+                return detail::LoadBoundElectrons<hasBoundElectrons::value>()(weighting, particle);
+            }
 
-}// namespace attribute
-}// namespace traits
-}// namespace picongpu
+        } // namespace attribute
+    } // namespace traits
+} // namespace picongpu
diff --git a/include/picongpu/traits/attribute/GetChargeState.hpp b/include/picongpu/traits/attribute/GetChargeState.hpp
index bb2340deba..fc99c429cf 100644
--- a/include/picongpu/traits/attribute/GetChargeState.hpp
+++ b/include/picongpu/traits/attribute/GetChargeState.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2014-2020 Marco Garten, Rene Widera
+/* Copyright 2014-2021 Marco Garten, Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -29,83 +29,77 @@
 
 namespace picongpu
 {
-namespace traits
-{
-namespace attribute
-{
-namespace detail
-{
-
-/** Calculate the charge state of an atom / ion
- *
- * use attribute `boundElectrons` to calculate the charge state
- */
-template<bool T_HasBoundElectrons>
-struct LoadChargeState
-{
-    /** Functor implementation
-     *
-     * \return chargeState = number of electrons in neutral atom - number of currently bound electrons
-     */
-    template<typename T_Particle>
-    HDINLINE float_X operator()(const T_Particle& particle)
+    namespace traits
     {
-        using HasAtomicNumbers = typename pmacc::traits::HasFlag<
-            T_Particle,
-            atomicNumbers<>
-        >::type;
-        PMACC_CASSERT_MSG_TYPE(
-            Having_boundElectrons_particle_attribute_requires_atomicNumbers_flag,
-            T_Particle,
-            HasAtomicNumbers::value
-        );
-        const float_X protonNumber = GetAtomicNumbers<T_Particle>::type::numberOfProtons;
-        return protonNumber - particle[boundElectrons_];
-    }
-};
-
-/**  Calculate charge state of an atom / ion
- *
- * This is the fallback implementation to throw an error if no `boundElectrons`
- * are available for a species.
- */
-template<>
-struct LoadChargeState<false>
-{
+        namespace attribute
+        {
+            namespace detail
+            {
+                /** Calculate the charge state of an atom / ion
+                 *
+                 * use attribute `boundElectrons` to calculate the charge state
+                 */
+                template<bool T_HasBoundElectrons>
+                struct LoadChargeState
+                {
+                    /** Functor implementation
+                     *
+                     * \return chargeState = number of electrons in neutral atom - number of currently bound electrons
+                     */
+                    template<typename T_Particle>
+                    HDINLINE float_X operator()(const T_Particle& particle)
+                    {
+                        using HasAtomicNumbers = typename pmacc::traits::HasFlag<T_Particle, atomicNumbers<>>::type;
+                        PMACC_CASSERT_MSG_TYPE(
+                            Having_boundElectrons_particle_attribute_requires_atomicNumbers_flag,
+                            T_Particle,
+                            HasAtomicNumbers::value);
+                        const float_X protonNumber = GetAtomicNumbers<T_Particle>::type::numberOfProtons;
+                        return protonNumber - particle[boundElectrons_];
+                    }
+                };
 
-    template<typename T_Particle>
-    HDINLINE void operator()(const T_Particle& particle)
-    {
-        /* The compiler is allowed to evaluate an expression that does not depend on a template parameter
-         * even if the class is never instantiated. In that case static assert is always
-         * evaluated (e.g. with clang), this results in an error if the condition is false.
-         * http://www.boost.org/doc/libs/1_60_0/doc/html/boost_staticassert.html
-         *
-         * A workaround is to add a template dependency to the expression.
-         * `sizeof(ANY_TYPE) != 0` is always true and defers the evaluation.
-         */
-        PMACC_CASSERT_MSG(This_species_has_only_one_charge_state,1==2 && (sizeof(T_Particle) != 0));
-    }
-};
-} // namespace detail
+                /**  Calculate charge state of an atom / ion
+                 *
+                 * This is the fallback implementation to throw an error if no `boundElectrons`
+                 * are available for a species.
+                 */
+                template<>
+                struct LoadChargeState<false>
+                {
+                    template<typename T_Particle>
+                    HDINLINE void operator()(const T_Particle& particle)
+                    {
+                        /* The compiler is allowed to evaluate an expression that does not depend on a template
+                         * parameter even if the class is never instantiated. In that case static assert is always
+                         * evaluated (e.g. with clang), this results in an error if the condition is false.
+                         * http://www.boost.org/doc/libs/1_60_0/doc/html/boost_staticassert.html
+                         *
+                         * A workaround is to add a template dependency to the expression.
+                         * `sizeof(ANY_TYPE) != 0` is always true and defers the evaluation.
+                         */
+                        PMACC_CASSERT_MSG(This_species_has_only_one_charge_state, 1 == 2 && (sizeof(T_Particle) != 0));
+                    }
+                };
+            } // namespace detail
 
-/** get the charge state of a macro particle
- *
- * This function trait considers the `boundElectrons` attribute if it is set.
- * Charge states do not add up and also the various particles in a macro particle
- * do NOT have different charge states where one would average over them.
- *
- * @param particle a reference to a particle
- * @return charge of the macro particle
- */
-template<typename T_Particle>
-HDINLINE float_X getChargeState(const T_Particle& particle)
-{
-    using ParticleType = T_Particle;
-    typedef typename pmacc::traits::HasIdentifier<ParticleType, boundElectrons>::type hasBoundElectrons;
-    return detail::LoadChargeState<hasBoundElectrons::value >()(particle);
-}
+            /** get the charge state of a macro particle
+             *
+             * This function trait considers the `boundElectrons` attribute if it is set.
+             * Charge states do not add up and also the various particles in a macro particle
+             * do NOT have different charge states where one would average over them.
+             *
+             * @param particle a reference to a particle
+             * @return charge state of the macro particle
+             */
+            template<typename T_Particle>
+            HDINLINE float_X getChargeState(const T_Particle& particle)
+            {
+                using ParticleType = T_Particle;
+                typedef typename pmacc::traits::HasIdentifier<ParticleType, boundElectrons>::type hasBoundElectrons;
+                return detail::LoadChargeState<hasBoundElectrons::value>()(particle);
+            }
 
-}// namespace attribute
-}// namespace traits
-}// namespace picongpu
+        } // namespace attribute
+    } // namespace traits
+} // namespace picongpu
diff --git a/include/picongpu/traits/attribute/GetMass.hpp b/include/picongpu/traits/attribute/GetMass.hpp
index 0261d04d1b..85875de12b 100644
--- a/include/picongpu/traits/attribute/GetMass.hpp
+++ b/include/picongpu/traits/attribute/GetMass.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2014-2020 Rene Widera
+/* Copyright 2014-2021 Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -25,24 +25,23 @@
 
 namespace picongpu
 {
-namespace traits
-{
-namespace attribute
-{
-
-/** get the mass of a makro particle
- *
- * @param weighting weighting of the particle
- * @param particle a reference to a particle
- * @return mass of the makro particle
- */
-template<typename T_Particle>
-HDINLINE float_X getMass(const float_X weighting, const T_Particle& particle)
-{
-    using ParticleType = T_Particle;
-    return frame::getMass<typename ParticleType::FrameType > () * weighting;
-}
+    namespace traits
+    {
+        namespace attribute
+        {
+            /** get the mass of a macro particle
+             *
+             * @param weighting weighting of the particle
+             * @param particle a reference to a particle
+             * @return mass of the macro particle
+             */
+            template<typename T_Particle>
+            HDINLINE float_X getMass(const float_X weighting, const T_Particle& particle)
+            {
+                using ParticleType = T_Particle;
+                return frame::getMass<typename ParticleType::FrameType>() * weighting;
+            }
 
-}// namespace attribute
-}// namespace traits
-}// namespace picongpu
+        } // namespace attribute
+    } // namespace traits
+} // namespace picongpu
diff --git a/include/picongpu/traits/frame/GetCharge.hpp b/include/picongpu/traits/frame/GetCharge.hpp
index 2ef07c747e..402b402ac3 100644
--- a/include/picongpu/traits/frame/GetCharge.hpp
+++ b/include/picongpu/traits/frame/GetCharge.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2014-2020 Rene Widera
+/* Copyright 2014-2021 Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -24,16 +24,15 @@
 
 namespace picongpu
 {
-namespace traits
-{
-namespace frame
-{
-
-/** get the charge value for a species frame
- */
-template<typename T_Frame>
-HDINLINE float_X getCharge();
+    namespace traits
+    {
+        namespace frame
+        {
+            /** get the charge value for a species frame
+             */
+            template<typename T_Frame>
+            HDINLINE float_X getCharge();
 
-}// namespace frame
-}// namespace traits
-}// namespace picongpu
+        } // namespace frame
+    } // namespace traits
+} // namespace picongpu
diff --git a/include/picongpu/traits/frame/GetMass.hpp b/include/picongpu/traits/frame/GetMass.hpp
index 5b412f6b08..a8f6ae82ab 100644
--- a/include/picongpu/traits/frame/GetMass.hpp
+++ b/include/picongpu/traits/frame/GetMass.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2014-2020 Rene Widera
+/* Copyright 2014-2021 Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -23,14 +23,13 @@
 
 namespace picongpu
 {
-namespace traits
-{
-namespace frame
-{
-
-template<typename T_Frame>
-HDINLINE float_X getMass();
+    namespace traits
+    {
+        namespace frame
+        {
+            template<typename T_Frame>
+            HDINLINE float_X getMass();
 
-}// namespace frame
-}// namespace traits
-}// namespace picongpu
+        } // namespace frame
+    } // namespace traits
+} // namespace picongpu
diff --git a/include/picongpu/unitless/bremsstrahlung.unitless b/include/picongpu/unitless/bremsstrahlung.unitless
index b7dfdee2bd..a8308e018f 100644
--- a/include/picongpu/unitless/bremsstrahlung.unitless
+++ b/include/picongpu/unitless/bremsstrahlung.unitless
@@ -1,4 +1,4 @@
-/* Copyright 2016-2020 Heiko Burau
+/* Copyright 2016-2021 Heiko Burau
  *
  * This file is part of PIConGPU.
  *
@@ -21,32 +21,29 @@
 
 namespace picongpu
 {
-namespace particles
-{
-namespace bremsstrahlung
-{
-
-namespace electron
-{
+    namespace particles
+    {
+        namespace bremsstrahlung
+        {
+            namespace electron
+            {
+                constexpr float_64 MIN_ENERGY_SI = MIN_ENERGY_MeV * 1.0e3 * UNITCONV_keV_to_Joule;
+                constexpr float_X MIN_ENERGY = MIN_ENERGY_SI / UNIT_ENERGY;
 
-constexpr float_64 MIN_ENERGY_SI = MIN_ENERGY_MeV * 1.0e3 * UNITCONV_keV_to_Joule;
-constexpr float_X MIN_ENERGY = MIN_ENERGY_SI / UNIT_ENERGY;
+                constexpr float_64 MAX_ENERGY_SI = MAX_ENERGY_MeV * 1.0e3 * UNITCONV_keV_to_Joule;
+                constexpr float_X MAX_ENERGY = MAX_ENERGY_SI / UNIT_ENERGY;
 
-constexpr float_64 MAX_ENERGY_SI = MAX_ENERGY_MeV * 1.0e3 * UNITCONV_keV_to_Joule;
-constexpr float_X MAX_ENERGY = MAX_ENERGY_SI / UNIT_ENERGY;
+                constexpr float_X NUM_STEPS_STOPPING_POWER_INTERGRAL = 1.0e3;
 
-constexpr float_X NUM_STEPS_STOPPING_POWER_INTERGRAL = 1.0e3;
-
-} // namespace electron
-
-namespace photon
-{
+            } // namespace electron
 
-constexpr float_64 SOFT_PHOTONS_CUTOFF_SI = SOFT_PHOTONS_CUTOFF_keV * UNITCONV_keV_to_Joule;
-constexpr float_X SOFT_PHOTONS_CUTOFF = SOFT_PHOTONS_CUTOFF_SI / UNIT_ENERGY;
+            namespace photon
+            {
+                constexpr float_64 SOFT_PHOTONS_CUTOFF_SI = SOFT_PHOTONS_CUTOFF_keV * UNITCONV_keV_to_Joule;
+                constexpr float_X SOFT_PHOTONS_CUTOFF = SOFT_PHOTONS_CUTOFF_SI / UNIT_ENERGY;
 
-} // namespace photon
+            } // namespace photon
 
-} // namespace bremsstrahlung
-} // namespace particles
+        } // namespace bremsstrahlung
+    } // namespace particles
 } // namespace picongpu
diff --git a/include/picongpu/unitless/checkpoints.unitless b/include/picongpu/unitless/checkpoints.unitless
index 2cb994db5e..44cc056e6a 100644
--- a/include/picongpu/unitless/checkpoints.unitless
+++ b/include/picongpu/unitless/checkpoints.unitless
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Rene Widera, Felix Schmitt, Benjamin Worpitz,
+/* Copyright 2013-2021 Axel Huebl, Rene Widera, Felix Schmitt, Benjamin Worpitz,
  *                     Sergei Bastrakov
  *
  * This file is part of PIConGPU.
@@ -28,50 +28,40 @@
 
 namespace picongpu
 {
-namespace detail
-{
-
+    namespace detail
+    {
         /** Additional fields for checkpointing
          *
          * @tparam T_FieldSolver field solver type
          */
-        template< typename T_FieldSolver >
+        template<typename T_FieldSolver>
         struct AdditionalCheckpointFields
         {
-            using type = MakeSeq_t< >;
+            using type = MakeSeq_t<>;
         };
 
-        //! Only the YeePML solver needs additional fields for checkpointing
-        template< typename ... T_Args >
-        struct AdditionalCheckpointFields<
-            fields::maxwellSolver::YeePML< T_Args ... >
-        >
+        /** Only the YeePML solver needs additional fields for checkpointing
+         *
+         * Currently LehePML is YeePML so automatically works for it as well.
+         */
+        template<typename... T_Args>
+        struct AdditionalCheckpointFields<fields::maxwellSolver::YeePML<T_Args...>>
         {
-            using type = MakeSeq_t<
-                fields::maxwellSolver::yeePML::FieldE,
-                fields::maxwellSolver::yeePML::FieldB
-            >;
+            using type = MakeSeq_t<fields::maxwellSolver::yeePML::FieldE, fields::maxwellSolver::yeePML::FieldB>;
         };
 
-} // namespace detail
+    } // namespace detail
 
     /** Note: we need at least FieldE and FieldB for restart
      *        capabilities!
      */
-    using NativeFileCheckpointFields = MakeSeq_t<
-        FieldE,
-        FieldB
-    >;
+    using NativeFileCheckpointFields = MakeSeq_t<FieldE, FieldB>;
 
-    using AdditionalFileCheckpointFields =
-        typename picongpu::detail::AdditionalCheckpointFields< fields::Solver >::type;
+    using AdditionalFileCheckpointFields = typename picongpu::detail::AdditionalCheckpointFields<fields::Solver>::type;
 
     /* List of particle species for checkpoint/restart */
     using FileCheckpointParticles = VectorAllSpecies;
 
     /**  List of fields for checkpoint/restart */
-    using FileCheckpointFields = MakeSeq_t<
-        NativeFileCheckpointFields,
-        AdditionalFileCheckpointFields
-    >;
-}
+    using FileCheckpointFields = MakeSeq_t<NativeFileCheckpointFields, AdditionalFileCheckpointFields>;
+} // namespace picongpu
diff --git a/include/picongpu/unitless/density.unitless b/include/picongpu/unitless/density.unitless
index 88ae0e0ea8..9dcc7baf14 100644
--- a/include/picongpu/unitless/density.unitless
+++ b/include/picongpu/unitless/density.unitless
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Rene Widera, Felix Schmitt,
+/* Copyright 2013-2021 Axel Huebl, Rene Widera, Felix Schmitt,
  *                     Richard Pausch
  *
  * This file is part of PIConGPU.
@@ -24,8 +24,7 @@
 namespace picongpu
 
 {
-    constexpr float_X BASE_DENSITY =
-        float_X( SI::BASE_DENSITY_SI * UNIT_LENGTH * UNIT_LENGTH * UNIT_LENGTH );
+    constexpr float_X BASE_DENSITY = float_X(SI::BASE_DENSITY_SI * UNIT_LENGTH * UNIT_LENGTH * UNIT_LENGTH);
 }
 
 #include "picongpu/particles/densityProfiles/profiles.hpp"
diff --git a/include/picongpu/unitless/fieldBackground.unitless b/include/picongpu/unitless/fieldBackground.unitless
index ba701b0503..32d7471c07 100644
--- a/include/picongpu/unitless/fieldBackground.unitless
+++ b/include/picongpu/unitless/fieldBackground.unitless
@@ -1,4 +1,4 @@
-/* Copyright 2014-2020 Axel Huebl
+/* Copyright 2014-2021 Axel Huebl
  *
  * This file is part of PIConGPU.
  *
@@ -22,3 +22,4 @@
 
 /** Load pre-defined templates (implementation) */
 #include "picongpu/fields/background/templates/TWTS/TWTS.tpp"
+#include "picongpu/fields/background/templates/twtsfast/twtsfast.tpp"
diff --git a/include/picongpu/unitless/fileOutput.unitless b/include/picongpu/unitless/fileOutput.unitless
index 8a078c6052..42881b7d72 100644
--- a/include/picongpu/unitless/fileOutput.unitless
+++ b/include/picongpu/unitless/fileOutput.unitless
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Rene Widera, Felix Schmitt
+/* Copyright 2013-2021 Axel Huebl, Rene Widera, Felix Schmitt
  *
  * This file is part of PIConGPU.
  *
diff --git a/include/picongpu/unitless/grid.unitless b/include/picongpu/unitless/grid.unitless
index 9b5ef60942..2a0d0328ee 100644
--- a/include/picongpu/unitless/grid.unitless
+++ b/include/picongpu/unitless/grid.unitless
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Felix Schmitt, Heiko Burau, Rene Widera,
+/* Copyright 2013-2021 Felix Schmitt, Heiko Burau, Rene Widera,
  *                     Benjamin Worpitz
  *
  * This file is part of PIConGPU.
@@ -19,7 +19,6 @@
  */
 
 
-
 #pragma once
 
 #include <pmacc/math/Vector.hpp>
@@ -27,28 +26,23 @@
 namespace picongpu
 {
     // normed grid parameter
-    constexpr float_X DELTA_T = float_X( SI::DELTA_T_SI / UNIT_TIME );
-    constexpr float_X CELL_WIDTH = float_X( SI::CELL_WIDTH_SI / UNIT_LENGTH );
-    constexpr float_X CELL_HEIGHT = float_X( SI::CELL_HEIGHT_SI / UNIT_LENGTH );
-    constexpr float_X CELL_DEPTH = float_X( SI::CELL_DEPTH_SI / UNIT_LENGTH );
-    CONST_VECTOR( float_X, DIM3, cellSize, CELL_WIDTH, CELL_HEIGHT, CELL_DEPTH );
+    constexpr float_X DELTA_T = float_X(SI::DELTA_T_SI / UNIT_TIME);
+    constexpr float_X CELL_WIDTH = float_X(SI::CELL_WIDTH_SI / UNIT_LENGTH);
+    constexpr float_X CELL_HEIGHT = float_X(SI::CELL_HEIGHT_SI / UNIT_LENGTH);
+    constexpr float_X CELL_DEPTH = float_X(SI::CELL_DEPTH_SI / UNIT_LENGTH);
+    CONST_VECTOR(float_X, DIM3, cellSize, CELL_WIDTH, CELL_HEIGHT, CELL_DEPTH);
 
     // always a 3D cell, even in 1D3V or 2D3V
     constexpr float_X CELL_VOLUME = CELL_WIDTH * CELL_HEIGHT * CELL_DEPTH;
 
     // only used for CFL checks
-#if (SIMDIM==DIM3)
-    constexpr float_X INV_CELL2_SUM =
-        1.0 / ( CELL_WIDTH  * CELL_WIDTH  ) +
-        1.0 / ( CELL_HEIGHT * CELL_HEIGHT ) +
-        1.0 / ( CELL_DEPTH  * CELL_DEPTH  );
-#elif(SIMDIM==DIM2)
-    constexpr float_X INV_CELL2_SUM =
-        1.0 / ( CELL_WIDTH  * CELL_WIDTH  ) +
-        1.0 / ( CELL_HEIGHT * CELL_HEIGHT );
+#if(SIMDIM == DIM3)
+    constexpr float_X INV_CELL2_SUM
+        = 1.0 / (CELL_WIDTH * CELL_WIDTH) + 1.0 / (CELL_HEIGHT * CELL_HEIGHT) + 1.0 / (CELL_DEPTH * CELL_DEPTH);
+#elif(SIMDIM == DIM2)
+    constexpr float_X INV_CELL2_SUM = 1.0 / (CELL_WIDTH * CELL_WIDTH) + 1.0 / (CELL_HEIGHT * CELL_HEIGHT);
 #else
-    constexpr float_X INV_CELL2_SUM =
-        1.0 / ( CELL_WIDTH  * CELL_WIDTH );
+    constexpr float_X INV_CELL2_SUM = 1.0 / (CELL_WIDTH * CELL_WIDTH);
 #endif
 
-}
+} // namespace picongpu
diff --git a/include/picongpu/unitless/ionizer.unitless b/include/picongpu/unitless/ionizer.unitless
index a0b5ef0030..be860e1442 100644
--- a/include/picongpu/unitless/ionizer.unitless
+++ b/include/picongpu/unitless/ionizer.unitless
@@ -1,4 +1,4 @@
-/* Copyright 2014-2020 Marco Garten
+/* Copyright 2014-2021 Marco Garten
  *
  * This file is part of PIConGPU.
  *
diff --git a/include/picongpu/unitless/particle.unitless b/include/picongpu/unitless/particle.unitless
index 7fc199ef76..6ecb65bd16 100644
--- a/include/picongpu/unitless/particle.unitless
+++ b/include/picongpu/unitless/particle.unitless
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Rene Widera
+/* Copyright 2013-2021 Axel Huebl, Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -18,7 +18,6 @@
  */
 
 
-
 #pragma once
 
 
diff --git a/include/picongpu/unitless/physicalConstants.unitless b/include/picongpu/unitless/physicalConstants.unitless
index d1865092d1..d10abe34d8 100644
--- a/include/picongpu/unitless/physicalConstants.unitless
+++ b/include/picongpu/unitless/physicalConstants.unitless
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Rene Widera, Marco Garten, Heiko Burau
+/* Copyright 2013-2021 Axel Huebl, Rene Widera, Marco Garten, Heiko Burau
  *
  * This file is part of PIConGPU.
  *
@@ -23,18 +23,18 @@
 namespace picongpu
 {
     //! reduced Planck constant
-    constexpr float_X HBAR = (float_X) (SI::HBAR_SI / UNIT_ENERGY / UNIT_TIME);
+    constexpr float_X HBAR = (float_X)(SI::HBAR_SI / UNIT_ENERGY / UNIT_TIME);
 
     //! Charge of electron
-    constexpr float_X ELECTRON_CHARGE = (float_X) (SI::ELECTRON_CHARGE_SI / UNIT_CHARGE);
+    constexpr float_X ELECTRON_CHARGE = (float_X)(SI::ELECTRON_CHARGE_SI / UNIT_CHARGE);
     //! Mass of electron
-    constexpr float_X ELECTRON_MASS = (float_X) (SI::ELECTRON_MASS_SI / UNIT_MASS);
+    constexpr float_X ELECTRON_MASS = (float_X)(SI::ELECTRON_MASS_SI / UNIT_MASS);
 
     //! magnetic constexprant must be double 3.92907e-39
-    constexpr float_X MUE0 = (float_X) (SI::MUE0_SI / UNIT_LENGTH / UNIT_MASS * UNIT_CHARGE * UNIT_CHARGE);
+    constexpr float_X MUE0 = (float_X)(SI::MUE0_SI / UNIT_LENGTH / UNIT_MASS * UNIT_CHARGE * UNIT_CHARGE);
 
     //! electric constexprant must be double 2.54513e+38
-    constexpr float_X EPS0 = (float_X) (1. / MUE0 / SPEED_OF_LIGHT / SPEED_OF_LIGHT);
+    constexpr float_X EPS0 = (float_X)(1. / MUE0 / SPEED_OF_LIGHT / SPEED_OF_LIGHT);
 
     // = 1/c^2
     constexpr float_X MUE0_EPS0 = float_X(1. / SPEED_OF_LIGHT / SPEED_OF_LIGHT);
@@ -48,4 +48,4 @@ namespace picongpu
     /* Atomic unit of time in PIC units */
     constexpr float_X ATOMIC_UNIT_TIME = float_X(SI::ATOMIC_UNIT_TIME / UNIT_TIME);
 
-} //namespace picongpu
+} // namespace picongpu
diff --git a/include/picongpu/unitless/pml.unitless b/include/picongpu/unitless/pml.unitless
index 4094eeeb99..82a5c716e4 100644
--- a/include/picongpu/unitless/pml.unitless
+++ b/include/picongpu/unitless/pml.unitless
@@ -1,4 +1,4 @@
-/* Copyright 2019-2020 Sergei Bastrakov
+/* Copyright 2019-2021 Sergei Bastrakov, Klaus Steiniger
  *
  * This file is part of PIConGPU.
  *
@@ -24,69 +24,82 @@
 
 namespace picongpu
 {
-namespace fields
-{
-namespace maxwellSolver
-{
-namespace yeePML
-{
-
-    // Assert parameters are in the valid ranges
-    PMACC_CASSERT_MSG( You_can_not_set_negative_grading_order_for_pml_kappa_and_sigma___change_pml_param, (SIGMA_KAPPA_GRADING_ORDER >= 0.0) );
-    PMACC_CASSERT_MSG( You_can_not_set_negative_value_pml_sigma_max_x___change_pml_param, (SIGMA_MAX_SI[ 0 ] >= 0.0) );
-    PMACC_CASSERT_MSG( You_can_not_set_negative_value_pml_sigma_max_y___change_pml_param, (SIGMA_MAX_SI[ 1 ] >= 0.0) );
-    PMACC_CASSERT_MSG( You_can_not_set_negative_value_pml_sigma_max_z___change_pml_param, (SIGMA_MAX_SI[ 2 ] >= 0.0) );
-    PMACC_CASSERT_MSG( You_can_not_set_pml_kappa_max_x_value_less_than_one___change_pml_param, (KAPPA_MAX[ 0 ] >= 1.0) );
-    PMACC_CASSERT_MSG( You_can_not_set_pml_kappa_max_y_value_less_than_one___change_pml_param, (KAPPA_MAX[ 1 ] >= 1.0) );
-    PMACC_CASSERT_MSG( You_can_not_set_pml_kappa_max_z_value_less_than_one___change_pml_param, (KAPPA_MAX[ 2 ] >= 1.0) );
-    PMACC_CASSERT_MSG( You_can_not_set_negative_grading_order_for_pml_alpha___change_pml_param, (ALPHA_GRADING_ORDER >= 0.0) );
-    PMACC_CASSERT_MSG( You_can_not_set_negative_pml_alpha_max_x___change_pml_param, (ALPHA_MAX_SI[ 0 ] >= 0.0) );
-    PMACC_CASSERT_MSG( You_can_not_set_negative_pml_alpha_max_y___change_pml_param, (ALPHA_MAX_SI[ 1 ] >= 0.0) );
-    PMACC_CASSERT_MSG( You_can_not_set_negative_pml_alpha_max_z___change_pml_param, (ALPHA_MAX_SI[ 2 ] >= 0.0) );
+    namespace fields
+    {
+        namespace maxwellSolver
+        {
+            namespace Pml
+            {
+                // Assert parameters are in the valid ranges
+                PMACC_CASSERT_MSG(
+                    You_can_not_set_negative_grading_order_for_pml_kappa_and_sigma___change_pml_param,
+                    (SIGMA_KAPPA_GRADING_ORDER >= 0.0));
+                PMACC_CASSERT_MSG(
+                    You_can_not_set_negative_value_pml_sigma_max_x___change_pml_param,
+                    (SIGMA_MAX_SI[0] >= 0.0));
+                PMACC_CASSERT_MSG(
+                    You_can_not_set_negative_value_pml_sigma_max_y___change_pml_param,
+                    (SIGMA_MAX_SI[1] >= 0.0));
+                PMACC_CASSERT_MSG(
+                    You_can_not_set_negative_value_pml_sigma_max_z___change_pml_param,
+                    (SIGMA_MAX_SI[2] >= 0.0));
+                PMACC_CASSERT_MSG(
+                    You_can_not_set_pml_kappa_max_x_value_less_than_one___change_pml_param,
+                    (KAPPA_MAX[0] >= 1.0));
+                PMACC_CASSERT_MSG(
+                    You_can_not_set_pml_kappa_max_y_value_less_than_one___change_pml_param,
+                    (KAPPA_MAX[1] >= 1.0));
+                PMACC_CASSERT_MSG(
+                    You_can_not_set_pml_kappa_max_z_value_less_than_one___change_pml_param,
+                    (KAPPA_MAX[2] >= 1.0));
+                PMACC_CASSERT_MSG(
+                    You_can_not_set_negative_grading_order_for_pml_alpha___change_pml_param,
+                    (ALPHA_GRADING_ORDER >= 0.0));
+                PMACC_CASSERT_MSG(
+                    You_can_not_set_negative_pml_alpha_max_x___change_pml_param,
+                    (ALPHA_MAX_SI[0] >= 0.0));
+                PMACC_CASSERT_MSG(
+                    You_can_not_set_negative_pml_alpha_max_y___change_pml_param,
+                    (ALPHA_MAX_SI[1] >= 0.0));
+                PMACC_CASSERT_MSG(
+                    You_can_not_set_negative_pml_alpha_max_z___change_pml_param,
+                    (ALPHA_MAX_SI[2] >= 0.0));
 
-    /* Normalize artificial conductivity by eps0, so that the result can be used
-     * for matching electric conductivity and magnetic permeability
-     * unit: 1 / s
-     */
-    constexpr float_64 NORMALIZED_SIGMA_MAX_SI[ 3 ] = {
-        SIGMA_MAX_SI[ 0 ] / SI::EPS0_SI,
-        SIGMA_MAX_SI[ 1 ] / SI::EPS0_SI,
-        SIGMA_MAX_SI[ 2 ] / SI::EPS0_SI
-    };
+                /* Normalize artificial conductivity by eps0, so that the result can be used
+                 * for matching electric conductivity and magnetic permeability
+                 * unit: 1 / s
+                 */
+                constexpr float_64 NORMALIZED_SIGMA_MAX_SI[3]
+                    = {SIGMA_MAX_SI[0] / SI::EPS0_SI, SIGMA_MAX_SI[1] / SI::EPS0_SI, SIGMA_MAX_SI[2] / SI::EPS0_SI};
 
-    /** Max value of normalized conductivity in PIC units
-     *
-     * unit: 1 / time
-     * (that is why we multiply by UNIT_TIME and not divide)
-     */
-    constexpr float_64 NORMALIZED_SIGMA_MAX[ 3 ] = {
-        NORMALIZED_SIGMA_MAX_SI[ 0 ] * UNIT_TIME,
-        NORMALIZED_SIGMA_MAX_SI[ 1 ] * UNIT_TIME,
-        NORMALIZED_SIGMA_MAX_SI[ 2 ] * UNIT_TIME
-    };
+                /** Max value of normalized conductivity in PIC units
+                 *
+                 * unit: 1 / time
+                 * (that is why we multiply by UNIT_TIME and not divide)
+                 */
+                constexpr float_64 NORMALIZED_SIGMA_MAX[3]
+                    = {NORMALIZED_SIGMA_MAX_SI[0] * UNIT_TIME,
+                       NORMALIZED_SIGMA_MAX_SI[1] * UNIT_TIME,
+                       NORMALIZED_SIGMA_MAX_SI[2] * UNIT_TIME};
 
-    /* Normalize complex frequency shift by eps0, so that the result can be used
-     * for matching electric conductivity and magnetic permeability
-     * unit: 1 / s
-     */
-    constexpr float_64 NORMALIZED_ALPHA_MAX_SI[ 3 ] = {
-        ALPHA_MAX_SI[ 0 ] / SI::EPS0_SI,
-        ALPHA_MAX_SI[ 1 ] / SI::EPS0_SI,
-        ALPHA_MAX_SI[ 2 ] / SI::EPS0_SI
-    };
+                /* Normalize complex frequency shift by eps0, so that the result can be used
+                 * for matching electric conductivity and magnetic permeability
+                 * unit: 1 / s
+                 */
+                constexpr float_64 NORMALIZED_ALPHA_MAX_SI[3]
+                    = {ALPHA_MAX_SI[0] / SI::EPS0_SI, ALPHA_MAX_SI[1] / SI::EPS0_SI, ALPHA_MAX_SI[2] / SI::EPS0_SI};
 
-    /** Max value of normalized complex frequency shift in PIC units
-     *
-     * unit: 1 / time
-     * (that is why we multiply by UNIT_TIME and not divide)
-     */
-    constexpr float_64 NORMALIZED_ALPHA_MAX[ 3 ] = {
-        NORMALIZED_ALPHA_MAX_SI[ 0 ] * UNIT_TIME,
-        NORMALIZED_ALPHA_MAX_SI[ 1 ] * UNIT_TIME,
-        NORMALIZED_ALPHA_MAX_SI[ 2 ] * UNIT_TIME
-    };
+                /** Max value of normalized complex frequency shift in PIC units
+                 *
+                 * unit: 1 / time
+                 * (that is why we multiply by UNIT_TIME and not divide)
+                 */
+                constexpr float_64 NORMALIZED_ALPHA_MAX[3]
+                    = {NORMALIZED_ALPHA_MAX_SI[0] * UNIT_TIME,
+                       NORMALIZED_ALPHA_MAX_SI[1] * UNIT_TIME,
+                       NORMALIZED_ALPHA_MAX_SI[2] * UNIT_TIME};
 
-} // namespace yeePML
-} // namespace maxwellSolver
-} // namespace fields
+            } // namespace Pml
+        } // namespace maxwellSolver
+    } // namespace fields
 } // namespace picongpu
diff --git a/include/picongpu/unitless/png.unitless b/include/picongpu/unitless/png.unitless
index 19e548d6ef..99d479c33e 100644
--- a/include/picongpu/unitless/png.unitless
+++ b/include/picongpu/unitless/png.unitless
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Rene Widera
+/* Copyright 2013-2021 Axel Huebl, Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -25,56 +25,70 @@
 
 namespace picongpu
 {
-namespace traits
-{
-    template< typename >
-    struct is_laser_none : std::false_type {};
+    namespace traits
+    {
+        template<typename>
+        struct is_laser_none : std::false_type
+        {
+        };
 
-    template< typename T >
-    struct is_laser_none< fields::laserProfiles::None< T > > : std::true_type {};
+        template<typename T>
+        struct is_laser_none<fields::laserProfiles::None<T>> : std::true_type
+        {
+        };
 
-    template< typename >
-    struct is_laser_planewave : std::false_type {};
+        template<typename>
+        struct is_laser_planewave : std::false_type
+        {
+        };
 
-    template< typename T >
-    struct is_laser_planewave< fields::laserProfiles::PlaneWave< T > > : std::true_type {};
-}
-    // asserts for wrong user configurations
-    //
-    // setting 1: Laser
-#if( EM_FIELD_SCALE_CHANNEL1 == 1 || EM_FIELD_SCALE_CHANNEL2 == 1 || EM_FIELD_SCALE_CHANNEL3 == 1 )
+        template<typename T>
+        struct is_laser_planewave<fields::laserProfiles::PlaneWave<T>> : std::true_type
+        {
+        };
+    } // namespace traits
+      // asserts for wrong user configurations
+      //
+      // setting 1: Laser
+#if(EM_FIELD_SCALE_CHANNEL1 == 1 || EM_FIELD_SCALE_CHANNEL2 == 1 || EM_FIELD_SCALE_CHANNEL3 == 1)
     PMACC_CASSERT_MSG(
         You_can_not_scale_your_preview_to_laser_without_using_a_laser___change_png_param,
-        !traits::is_laser_none< fields::laserProfiles::Selected >::value
-    );
+        !traits::is_laser_none<fields::laserProfiles::Selected>::value);
 #endif
 
     // setting 2: Drifting Plasma
-#if( EM_FIELD_SCALE_CHANNEL1 == 2 || EM_FIELD_SCALE_CHANNEL2 == 2 || EM_FIELD_SCALE_CHANNEL3 == 2 )
-    PMACC_CASSERT_MSG( You_can_not_scale_your_preview_to_drift_without_a_initially_drifting_plasma___change_png_param, ((PARTICLE_INIT_DRIFT_GAMMA)>1.0) );
+#if(EM_FIELD_SCALE_CHANNEL1 == 2 || EM_FIELD_SCALE_CHANNEL2 == 2 || EM_FIELD_SCALE_CHANNEL3 == 2)
+    PMACC_CASSERT_MSG(
+        You_can_not_scale_your_preview_to_drift_without_a_initially_drifting_plasma___change_png_param,
+        ((PARTICLE_INIT_DRIFT_GAMMA) > 1.0));
 #endif
 
     // setting 3: Plasma Wave
-#if( EM_FIELD_SCALE_CHANNEL1 == 3 || EM_FIELD_SCALE_CHANNEL2 == 3 || EM_FIELD_SCALE_CHANNEL3 == 3 )
-    PMACC_CASSERT_MSG( You_can_not_scale_your_preview_to_a_zero_plasma_density___change_png_param, (BASE_DENSITY>0.0) );
+#if(EM_FIELD_SCALE_CHANNEL1 == 3 || EM_FIELD_SCALE_CHANNEL2 == 3 || EM_FIELD_SCALE_CHANNEL3 == 3)
+    PMACC_CASSERT_MSG(
+        You_can_not_scale_your_preview_to_a_zero_plasma_density___change_png_param,
+        (BASE_DENSITY > 0.0));
 #endif
 
     // setting 4: Thermal Warm Plasma
-#if( EM_FIELD_SCALE_CHANNEL1 == 4 || EM_FIELD_SCALE_CHANNEL2 == 4 || EM_FIELD_SCALE_CHANNEL3 == 4 )
-    PMACC_CASSERT_MSG( You_can_not_scale_your_preview_to_a_zero_plasma_density___change_png_param, (BASE_DENSITY>0.0) );
-    PMACC_CASSERT_MSG( You_can_not_scale_your_preview_to_a_zero_electron_temperature___change_png_param, ((ELECTRON_TEMPERATURE)>0.0) );
+#if(EM_FIELD_SCALE_CHANNEL1 == 4 || EM_FIELD_SCALE_CHANNEL2 == 4 || EM_FIELD_SCALE_CHANNEL3 == 4)
+    PMACC_CASSERT_MSG(
+        You_can_not_scale_your_preview_to_a_zero_plasma_density___change_png_param,
+        (BASE_DENSITY > 0.0));
+    PMACC_CASSERT_MSG(
+        You_can_not_scale_your_preview_to_a_zero_electron_temperature___change_png_param,
+        ((ELECTRON_TEMPERATURE) > 0.0));
 #endif
 
     // setting 5: Blow Out
-#if( EM_FIELD_SCALE_CHANNEL1 == 5 || EM_FIELD_SCALE_CHANNEL2 == 5 || EM_FIELD_SCALE_CHANNEL3 == 5 )
-    //PMACC_CASSERT_MSG( You_can_not_scale_your_preview_to_a_zero_plasma_density___change_png_param, (BASE_DENSITY>0.0) );
+#if(EM_FIELD_SCALE_CHANNEL1 == 5 || EM_FIELD_SCALE_CHANNEL2 == 5 || EM_FIELD_SCALE_CHANNEL3 == 5)
+    // PMACC_CASSERT_MSG( You_can_not_scale_your_preview_to_a_zero_plasma_density___change_png_param,
+    // (BASE_DENSITY>0.0) );
     PMACC_CASSERT_MSG(
         You_can_not_scale_your_preview_to_blowout_without_a_laser___change_png_param,
-        !traits::is_laser_none< fields::laserProfiles::Selected >::value
-    );
+        !traits::is_laser_none<fields::laserProfiles::Selected>::value);
     PMACC_CASSERT_MSG(
         You_can_not_scale_your_preview_to_blowout_with_a_laser_without_beam_waist___change_png_param,
-        !traits::is_laser_planewave< fields::laserProfiles::Selected >::value
-    );
+        !traits::is_laser_planewave<fields::laserProfiles::Selected>::value);
 #endif
-}
+} // namespace picongpu
diff --git a/include/picongpu/unitless/precision.unitless b/include/picongpu/unitless/precision.unitless
index e45aca41e1..bfcf0a4131 100644
--- a/include/picongpu/unitless/precision.unitless
+++ b/include/picongpu/unitless/precision.unitless
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Heiko Burau, Rene Widera
+/* Copyright 2013-2021 Axel Huebl, Heiko Burau, Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -24,37 +24,36 @@
 
 namespace picongpu
 {
-
     using float_X = precisionPIConGPU::precisionType;
 
     namespace precision32Bit
     {
         using float_X = precisionType;
         /* 32 Bit defines */
-        using float1_X = ::pmacc::math::Vector< float_X, 1 >;
-        using float2_X = ::pmacc::math::Vector< float_X, 2 >;
-        using float3_X = ::pmacc::math::Vector< float_X, 3 >;
-        using floatD_X = ::pmacc::math::Vector< float_X, simDim >;
-    }
+        using float1_X = ::pmacc::math::Vector<float_X, 1>;
+        using float2_X = ::pmacc::math::Vector<float_X, 2>;
+        using float3_X = ::pmacc::math::Vector<float_X, 3>;
+        using floatD_X = ::pmacc::math::Vector<float_X, simDim>;
+    } // namespace precision32Bit
 
     namespace precision64Bit
     {
         using float_X = precisionType;
         /* 64 Bit defines */
-        using float1_X = ::pmacc::math::Vector< float_X, 1 >;
-        using float2_X = ::pmacc::math::Vector< float_X, 2 >;
-        using float3_X = ::pmacc::math::Vector< float_X, 3 >;
-        using floatD_X = ::pmacc::math::Vector< float_X, simDim >;
-    }
+        using float1_X = ::pmacc::math::Vector<float_X, 1>;
+        using float2_X = ::pmacc::math::Vector<float_X, 2>;
+        using float3_X = ::pmacc::math::Vector<float_X, 3>;
+        using floatD_X = ::pmacc::math::Vector<float_X, simDim>;
+    } // namespace precision64Bit
 
     using float_32 = precision32Bit::float_X;
     using float_64 = precision64Bit::float_X;
 
     /* variable precision defines */
-    using float1_X = ::pmacc::math::Vector< float_X, 1 >;
-    using float2_X = ::pmacc::math::Vector< float_X, 2 >;
-    using float3_X = ::pmacc::math::Vector< float_X, 3 >;
-    using floatD_X = ::pmacc::math::Vector< float_X, simDim >;
+    using float1_X = ::pmacc::math::Vector<float_X, 1>;
+    using float2_X = ::pmacc::math::Vector<float_X, 2>;
+    using float3_X = ::pmacc::math::Vector<float_X, 3>;
+    using floatD_X = ::pmacc::math::Vector<float_X, simDim>;
     /* 32 Bit defines */
     using float1_32 = precision32Bit::float1_X;
     using float2_32 = precision32Bit::float2_X;
@@ -67,10 +66,9 @@ namespace picongpu
     using floatD_64 = precision64Bit::floatD_X;
 
     // literals for short-hand notations
-    constexpr float_X
-    operator""_X( long double x )
+    constexpr float_X operator""_X(long double x)
     {
-        return float_X( x );
+        return float_X(x);
     }
 
     // special functions
diff --git a/include/picongpu/unitless/pusher.unitless b/include/picongpu/unitless/pusher.unitless
index c43aa7d2e4..707ac191cf 100644
--- a/include/picongpu/unitless/pusher.unitless
+++ b/include/picongpu/unitless/pusher.unitless
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Rene Widera, Richard Pausch
+/* Copyright 2013-2021 Axel Huebl, Rene Widera, Richard Pausch, Annegret Roeszler
  *
  * This file is part of PIConGPU.
  *
@@ -24,13 +24,15 @@
 
 #include "picongpu/particles/pusher/particlePusherAcceleration.hpp"
 #include "picongpu/particles/pusher/particlePusherBoris.hpp"
+#include "picongpu/particles/pusher/particlePusherComposite.hpp"
 #include "picongpu/particles/pusher/particlePusherVay.hpp"
+#include "picongpu/particles/pusher/particlePusherHigueraCary.hpp"
 #include "picongpu/particles/pusher/particlePusherFree.hpp"
 #include "picongpu/particles/pusher/particlePusherPhoton.hpp"
 #include "picongpu/particles/pusher/particlePusherProbe.hpp"
 #include "picongpu/particles/pusher/particlePusherReducedLandauLifshitz.hpp"
-#if(SIMDIM==DIM3)
-#include "picongpu/particles/pusher/particlePusherAxel.hpp"
+#if(SIMDIM == DIM3)
+#    include "picongpu/particles/pusher/particlePusherAxel.hpp"
 #endif
 
 #include <pmacc/nvidia/functors/Assign.hpp>
@@ -39,58 +41,61 @@
 
 namespace picongpu
 {
-
-namespace particles
-{
-namespace pusher
-{
-
-struct Acceleration :
-public particlePusherAcceleration::Push<Velocity, Gamma<> >
-{
-};
-
-#if(SIMDIM==DIM3)
-
-struct Axel :
-public particlePusherAxel::Push<Velocity, Gamma<> >
-{
-};
+    namespace particles
+    {
+        namespace pusher
+        {
+            struct Acceleration : public particlePusherAcceleration::Push<Velocity, Gamma<>>
+            {
+            };
+
+#if(SIMDIM == DIM3)
+
+            struct Axel : public particlePusherAxel::Push<Velocity, Gamma<>>
+            {
+            };
 #endif
 
-struct Boris :
-public particlePusherBoris::Push<Velocity, Gamma<> >
-{
-};
-
-struct Vay :
-public particlePusherVay::Push<Velocity, Gamma<> >
-{
-};
-
-struct Free :
-public particlePusherFree::Push<Velocity, Gamma<> >
-{
-};
-
-struct Photon :
-public particlePusherPhoton::Push<Velocity, Gamma<> >
-{
-};
-
-struct ReducedLandauLifshitz :
-public particlePusherReducedLandauLifshitz::Push<Velocity, Gamma<> >
-{
-};
-
-struct Probe :
-public particlePusherProbe::Push<
-    pmacc::nvidia::functors::Assign,
-    particlePusherProbe::ActualPusher
->
-{
-};
-
-} //namespace pusher
-} //namespace particles
-} //namespace picongpu
+            struct Boris : public particlePusherBoris::Push<Velocity, Gamma<>>
+            {
+            };
+
+            struct Vay : public particlePusherVay::Push<Velocity, Gamma<>>
+            {
+            };
+
+            struct HigueraCary : public particlePusherHigueraCary::Push<Velocity, Gamma<>>
+            {
+            };
+
+            struct Free : public particlePusherFree::Push<Velocity, Gamma<>>
+            {
+            };
+
+            struct Photon : public particlePusherPhoton::Push<Velocity, Gamma<>>
+            {
+            };
+
+            struct ReducedLandauLifshitz : public particlePusherReducedLandauLifshitz::Push<Velocity, Gamma<>>
+            {
+            };
+
+            struct Probe
+                : public particlePusherProbe::Push<pmacc::nvidia::functors::Assign, particlePusherProbe::ActualPusher>
+            {
+            };
+
+            template<typename T_FirstPusher, typename T_SecondPusher, typename T_ActivationFunctor>
+            struct Composite : public particlePusherComposite::Push<T_FirstPusher, T_SecondPusher, T_ActivationFunctor>
+            {
+            };
+
+            template<uint32_t T_switchTimeStep>
+            struct CompositeBinarySwitchActivationFunctor
+                : public particlePusherComposite::BinarySwitchActivationFunctor<T_switchTimeStep>
+            {
+            };
+
+        } // namespace pusher
+    } // namespace particles
+} // namespace picongpu
diff --git a/include/picongpu/unitless/radiation.unitless b/include/picongpu/unitless/radiation.unitless
index 58e3dd0b81..0ffbc55585 100644
--- a/include/picongpu/unitless/radiation.unitless
+++ b/include/picongpu/unitless/radiation.unitless
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Rene Widera, Richard Pausch
+/* Copyright 2013-2021 Rene Widera, Richard Pausch
  *
  * This file is part of PIConGPU.
  *
@@ -22,46 +22,54 @@
 #include <pmacc/static_assert.hpp>
 
 
-PMACC_CASSERT_MSG( The_Nyquist_limit_needs_to_be_below_one, ( picongpu::plugins::radiation::radiationNyquist::NyquistFactor < 1.0 ) );
-PMACC_CASSERT_MSG( The_Nyquist_limit_needs_to_be_larger_than_zero, ( picongpu::plugins::radiation::radiationNyquist::NyquistFactor > 0.0 ) );
+PMACC_CASSERT_MSG(
+    The_Nyquist_limit_needs_to_be_below_one,
+    (picongpu::plugins::radiation::radiationNyquist::NyquistFactor < 1.0));
+PMACC_CASSERT_MSG(
+    The_Nyquist_limit_needs_to_be_larger_than_zero,
+    (picongpu::plugins::radiation::radiationNyquist::NyquistFactor > 0.0));
 namespace picongpu
 {
-namespace plugins
-{
-namespace radiation
-{
-namespace linear_frequencies
-{
-    constexpr float_X omega_min = SI::omega_min*UNIT_TIME;
-    constexpr float_X omega_max = SI::omega_max*UNIT_TIME;
-    constexpr float_X delta_omega = (float_X) ((omega_max - omega_min) / (float_X) (N_omega - 1)); // difference beween two omega
+    namespace plugins
+    {
+        namespace radiation
+        {
+            namespace linear_frequencies
+            {
+                constexpr float_X omega_min = SI::omega_min * UNIT_TIME;
+                constexpr float_X omega_max = SI::omega_max * UNIT_TIME;
+                constexpr float_X delta_omega
+                    = (float_X)((omega_max - omega_min) / (float_X)(N_omega - 1)); // difference between two omega
 
-    constexpr unsigned int blocksize_omega = pmacc::math::CT::volume<typename MappingDesc::SuperCellSize>::type::value;
-    constexpr unsigned int gridsize_omega = N_omega / blocksize_omega; // size of grid (dim: x); radiation
-} // namespace linear_frequencies
+                constexpr unsigned int blocksize_omega
+                    = pmacc::math::CT::volume<typename MappingDesc::SuperCellSize>::type::value;
+                constexpr unsigned int gridsize_omega = N_omega / blocksize_omega; // size of grid (dim: x); radiation
+            } // namespace linear_frequencies
 
-namespace log_frequencies
-{
-    constexpr float_X omega_min = (SI::omega_min*UNIT_TIME);
-    constexpr float_X omega_max = (SI::omega_max*UNIT_TIME);
+            namespace log_frequencies
+            {
+                constexpr float_X omega_min = (SI::omega_min * UNIT_TIME);
+                constexpr float_X omega_max = (SI::omega_max * UNIT_TIME);
 
-    constexpr unsigned int blocksize_omega = pmacc::math::CT::volume<typename MappingDesc::SuperCellSize>::type::value;
-    constexpr unsigned int gridsize_omega = N_omega / blocksize_omega; // size of grid (dim: x); radiation
-} // namespace log_frequencies
+                constexpr unsigned int blocksize_omega
+                    = pmacc::math::CT::volume<typename MappingDesc::SuperCellSize>::type::value;
+                constexpr unsigned int gridsize_omega = N_omega / blocksize_omega; // size of grid (dim: x); radiation
+            } // namespace log_frequencies
 
-namespace frequencies_from_list
-{
-    constexpr unsigned int blocksize_omega = pmacc::math::CT::volume<typename MappingDesc::SuperCellSize>::type::value;
-    constexpr unsigned int gridsize_omega = N_omega / blocksize_omega; // size of grid (dim: x); radiation
-} // namespace frequencies_from_list
+            namespace frequencies_from_list
+            {
+                constexpr unsigned int blocksize_omega
+                    = pmacc::math::CT::volume<typename MappingDesc::SuperCellSize>::type::value;
+                constexpr unsigned int gridsize_omega = N_omega / blocksize_omega; // size of grid (dim: x); radiation
+            } // namespace frequencies_from_list
 
-namespace parameters
-{
-    constexpr unsigned int gridsize_theta = N_observer; // size of grid /dim: y); radiation
-} // namespace parameters
+            namespace parameters
+            {
+                constexpr unsigned int gridsize_theta = N_observer; // size of grid (dim: y); radiation
+            } // namespace parameters
 
-} // namespace radiation
-} // namespace plugins
+        } // namespace radiation
+    } // namespace plugins
 } // namespace picongpu
 
 #include "picongpu/plugins/radiation/frequencies/radiation_lin_freq.hpp"
diff --git a/include/picongpu/unitless/speciesAttributes.unitless b/include/picongpu/unitless/speciesAttributes.unitless
index 613103dcca..1877b81b7d 100644
--- a/include/picongpu/unitless/speciesAttributes.unitless
+++ b/include/picongpu/unitless/speciesAttributes.unitless
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Rene Widera, Felix Schmitt, Axel Huebl,
+/* Copyright 2013-2021 Rene Widera, Felix Schmitt, Axel Huebl,
  *                     Alexander Grund, Finn-Ole Carstens
  *
  * This file is part of PIConGPU.
@@ -53,583 +53,582 @@
  */
 namespace picongpu
 {
-namespace traits
-{
-
-template<typename T_Type>
-struct Unit<position<T_Type> >
-{
-    static std::vector<double> get()
-    {
-        std::vector<double> unit(simDim);
-        /* in-cell position needs two transformations to get to SI:
-           in-cell [0;1) -> dimensionless scaling to grid -> SI
-        */
-        for(uint32_t i=0;i<simDim;++i)
-            unit[i]=cellSize[i]*UNIT_LENGTH;
-
-        return unit;
-    }
-};
-template<typename T_Type>
-struct UnitDimension<position<T_Type> >
-{
-    static std::vector<float_64> get()
-    {
-        /* L, M, T, I, theta, N, J
-         *
-         * position is in meter: m
-         *   -> L
-         */
-        std::vector<float_64> unitDimension( NUnitDimension, 0.0 );
-        unitDimension.at(SIBaseUnits::length) = 1.0;
-
-        return unitDimension;
-    }
-};
-template<typename T_Type>
-struct MacroWeighted<position<T_Type> >
-{
-    // the position is identical and can not be scaled by weightings
-    static bool get()
-    {
-        return false;
-    }
-};
-template<typename T_Type>
-struct WeightingPower<position<T_Type> >
-{
-    // x * weighting^0 == x: same for real and macro particle
-    static float_64 get()
-    {
-        return 0.0;
-    }
-};
-
-template<>
-struct Unit<radiationMask>
-{
-    // unitless and not scaled by a factor: by convention 1.0
-    static std::vector<double> get()
-    {
-        std::vector<double> unit( 1, 1.0 );
-        return unit;
-    }
-};
-template<>
-struct UnitDimension<radiationMask>
-{
-    static std::vector<float_64> get()
-    {
-        // radiationMask is unitless
-        std::vector<float_64> unitDimension( NUnitDimension, 0.0 );
-
-        return unitDimension;
-    }
-};
-template<>
-struct MacroWeighted<radiationMask>
-{
-    // identical and can not be scaled by weightings
-    static bool get()
-    {
-        return false;
-    }
-};
-template<>
-struct WeightingPower<radiationMask>
-{
-    // flag * weighting^0 == flag: same for real and macro particle
-    static float_64 get()
-    {
-        return 0.0;
-    }
-};
-
-template<>
-struct Unit<transitionRadiationMask>
-{
-    // unitless and not scaled by a factor: by convention 1.0
-    static std::vector<double> get()
-    {
-        std::vector<double> unit( 1, 1.0 );
-        return unit;
-    }
-};
-template<>
-struct UnitDimension<transitionRadiationMask>
-{
-    static std::vector<float_64> get()
-    {
-        // transitionRadiationMask is unitless
-        std::vector<float_64> unitDimension( NUnitDimension, 0.0 );
-
-        return unitDimension;
-    }
-};
-template<>
-struct MacroWeighted<transitionRadiationMask>
-{
-    // identical and can not be scaled by weightings
-    static bool get()
-    {
-        return false;
-    }
-};
-template<>
-struct WeightingPower<transitionRadiationMask>
-{
-    // flag * weighting^0 == flag: same for real and macro particle
-    static float_64 get()
-    {
-        return 0.0;
-    }
-};
-
-template<>
-struct Unit<momentum>
-{
-    static std::vector<double> get()
-    {
-        const uint32_t components = GetNComponents<typename momentum::type>::value;
-
-        std::vector<double> unit(components);
-        for(uint32_t i=0;i<components;++i)
-            unit[i]=UNIT_MASS*UNIT_SPEED;
-
-        return unit;
-    }
-};
-template<>
-struct UnitDimension<momentum>
-{
-    static std::vector<float_64> get()
-    {
-        /* L, M, T, I, theta, N, J
-         *
-         * momentum is in mass times speed: kg * m / s
-         *   -> L * M * T^-1
-         */
-        std::vector<float_64> unitDimension( NUnitDimension, 0.0 );
-        unitDimension.at(SIBaseUnits::length) =  1.0;
-        unitDimension.at(SIBaseUnits::mass)   =  1.0;
-        unitDimension.at(SIBaseUnits::time)   = -1.0;
-
-        return unitDimension;
-    }
-};
-template<>
-struct MacroWeighted<momentum>
-{
-    // we currently push macro particle momentums
-    static bool get()
-    {
-        return true;
-    }
-};
-template<>
-struct WeightingPower<momentum>
-{
-    /* px * weighting^1 == px * weighting: momentum is contributed linearly
-     * in the macro-particle ensemble
-     */
-    static float_64 get()
-    {
-        return 1.0;
-    }
-};
-
-template<>
-struct Unit<momentumPrev1>
-{
-    static std::vector<double> get()
-    {
-        const uint32_t components = GetNComponents<typename momentumPrev1::type>::value;
-
-        std::vector<double> unit(components);
-        for(uint32_t i=0;i<components;++i)
-            unit[i]=UNIT_MASS*UNIT_SPEED;
-
-        return unit;
-    }
-};
-template<>
-struct UnitDimension<momentumPrev1>
-{
-    static std::vector<float_64> get()
-    {
-        /* L, M, T, I, theta, N, J
-         *
-         * momentum is in mass times speed: kg * m / s
-         *   -> L * M * T^-1
-         */
-        std::vector<float_64> unitDimension( NUnitDimension, 0.0 );
-        unitDimension.at(SIBaseUnits::length) =  1.0;
-        unitDimension.at(SIBaseUnits::mass)   =  1.0;
-        unitDimension.at(SIBaseUnits::time)   = -1.0;
-
-        return unitDimension;
-    }
-};
-template<>
-struct MacroWeighted<momentumPrev1>
-{
-    // we currently push macro particle momentums
-    static bool get()
-    {
-        return true;
-    }
-};
-template<>
-struct WeightingPower<momentumPrev1>
-{
-    /* px_real * weighting^1 == px_macro * weighting: momentum is contributed
-     * linearly in the macro-particle ensemble
-     */
-    static float_64 get()
-    {
-        return 1.0;
-    }
-};
-
-template<>
-struct Unit<weighting>
-{
-    // unitless and not scaled by a factor: 1.0
-    static std::vector<double> get()
-    {
-        std::vector<double> unit( 1, 1.0 );
-        return unit;
-    }
-};
-template<>
-struct UnitDimension<weighting>
-{
-    static std::vector<float_64> get()
-    {
-        // weighting is unitless
-        std::vector<float_64> unitDimension( NUnitDimension, 0.0 );
-
-        return unitDimension;
-    }
-};
-template<>
-struct MacroWeighted<weighting>
-{
-    // the weighting attribute is an attribute of the macro particle
-    static bool get()
-    {
-        return true;
-    }
-};
-template<>
-struct WeightingPower<weighting>
-{
-    /* 1 * weighting^1 == weighting: real particles contibute linearily
-     * to the macro particle weighting
-     */
-    static float_64 get()
-    {
-        return 1.0;
-    }
-};
-
-
-template<>
-struct Unit<voronoiCellId>
-{
-    // unitless and not scaled by a factor: by convention 1.0
-    static std::vector<double> get()
-    {
-        std::vector<double> unit( 1, 1.0 );
-        return unit;
-    }
-};
-template<>
-struct UnitDimension<voronoiCellId>
-{
-    static std::vector<float_64> get()
-    {
-        // voronoiCellId is unitless
-        std::vector<float_64> unitDimension( NUnitDimension, 0.0 );
-
-        return unitDimension;
-    }
-};
-template<>
-struct MacroWeighted<voronoiCellId>
-{
-    // the voronoiCellId attribute is not a physical parameter
-    static bool get()
-    {
-        return false;
-    }
-};
-template<>
-struct WeightingPower<voronoiCellId>
-{
-    // the voronoiCellId attribute is not a physical parameter
-    static float_64 get()
-    {
-        return 0.0;
-    }
-};
-
-template<>
-struct Unit<probeE>
-{
-    static std::vector<double> get()
-    {
-        uint32_t const components = 3u;
-
-        std::vector< double > const unit( components, UNIT_EFIELD);
-
-        return unit;
-    }
-};
-template<>
-struct UnitDimension<probeE>
-{
-    static std::vector<float_64> get()
-    {
-       /* L, M, T, I, theta, N, J
-        *
-        * E is in volts per meters: V / m = kg * m / (A * s^3)
-        *   -> L * M * T^-3 * I^-1
-        */
-       std::vector<float_64> unitDimension( 7, 0.0 );
-       unitDimension.at(SIBaseUnits::length) =  1.0;
-       unitDimension.at(SIBaseUnits::mass)   =  1.0;
-       unitDimension.at(SIBaseUnits::time)   = -3.0;
-       unitDimension.at(SIBaseUnits::electricCurrent) = -1.0;
-
-       return unitDimension;
-    }
-};
-template<>
-struct MacroWeighted<probeE>
-{
-    static bool get()
-    {
-        return false;
-    }
-};
-template<>
-struct WeightingPower<probeE>
-{
-    // local electric fields do not scale with weighting
-    static float_64 get()
-    {
-        return 0.0;
-    }
-};
-
-template<>
-struct Unit<probeB>
-{
-    static std::vector<double> get()
-    {
-        uint32_t const components = 3u;
-
-        std::vector< double > const unit( components, UNIT_BFIELD);
-
-        return unit;
-    }
-};
-template<>
-struct UnitDimension<probeB>
-{
-    static std::vector<float_64> get()
-    {
-       /* L, M, T, I, theta, N, J
-        *
-        * B is in Tesla : kg / (A * s^2)
-        *   -> M * T^-2 * I^-1
-        */
-       std::vector<float_64> unitDimension( 7, 0.0 );
-       unitDimension.at(SIBaseUnits::mass) =  1.0;
-       unitDimension.at(SIBaseUnits::time) = -2.0;
-       unitDimension.at(SIBaseUnits::electricCurrent) = -1.0;
-
-       return unitDimension;
-    }
-};
-template<>
-struct MacroWeighted<probeB>
-{
-    static bool get()
-    {
-        return false;
-    }
-};
-template<>
-struct WeightingPower<probeB>
-{
-    // local magnetic fields do not scale with weighting
-    static float_64 get()
-    {
-        return 0.0;
-    }
-};
-
-template<>
-struct Unit<particleId>
-{
-    // unitless and not scaled by a factor: by convention 1.0
-    static std::vector<double> get()
-    {
-        std::vector<double> unit( 1, 1.0 );
-        return unit;
-    }
-};
-template<>
-struct UnitDimension<particleId>
-{
-    static std::vector<float_64> get()
-    {
-        // unitless
-        return std::vector<float_64>( NUnitDimension, 0.0 );
-    }
-};
-template<>
-struct MacroWeighted<particleId>
-{
-    // we can only follow maro particles via ids
-    static bool get()
-    {
-        return false;
-    }
-};
-template<>
-struct WeightingPower<particleId>
-{
-    // particle ids do not scale with weighting
-    static float_64 get()
-    {
-        return 0.0;
-    }
-};
-
-template<>
-struct Unit<totalCellIdx>
-{
-    /* unitless index and not scaled by a factor: by convention 1.0 */
-    static std::vector<double> get()
-    {
-        std::vector<double> unit( simDim, 1.0 );
-        return unit;
-    }
-};
-template<>
-struct UnitDimension<totalCellIdx>
-{
-    static std::vector<float_64> get()
-    {
-        /* totalCellIdx is a cell index and therefore unitless
-         */
-        std::vector<float_64> unitDimension( NUnitDimension, 0.0 );
-
-        return unitDimension;
-    }
-};
-template<>
-struct MacroWeighted<totalCellIdx>
-{
-    // the cell idx is identical and can not be scaled by weightings
-    static bool get()
-    {
-        return false;
-    }
-};
-template<>
-struct WeightingPower<totalCellIdx>
-{
-    // idx * weighting^0 == idx: same for real and macro particle
-    static float_64 get()
-    {
-        return 0.0;
-    }
-};
-
-template<>
-struct Unit<boundElectrons>
-{
-    // unitless and not scaled by a factor: 1.0
-    static std::vector<double> get()
-    {
-        std::vector<double> unit( 1, 1.0 );
-        return unit;
-    }
-};
-template<>
-struct UnitDimension<boundElectrons>
-{
-    static std::vector<float_64> get()
-    {
-        // boundElectrons is unitless
-        std::vector<float_64> unitDimension( NUnitDimension, 0.0 );
-
-        return unitDimension;
-    }
-};
-template<>
-struct MacroWeighted<boundElectrons>
-{
-    // bound electrons are counted for a single real ion
-    static bool get()
-    {
-        return false;
-    }
-};
-template<>
-struct WeightingPower<boundElectrons>
-{
-    /* #e-_real * weighting^1 == #e-_macro: bound electrons are contributed
-     * linearly from the underlying real particles
-     */
-    static float_64 get()
-    {
-        return 1.0;
-    }
-};
-
-template<>
-struct Unit<superconfig>
-{
-    // unitless and not scaled by a factor: 1.0
-    static std::vector<double> get()
-    {
-        return std::vector<double>( picongpu::flylite::populations, 1.0 );
-    }
-};
-template<>
-struct UnitDimension<superconfig>
-{
-    static std::vector<float_64> get()
-    {
-        // superconfig is unitless
-        std::vector<float_64> unitDimension( NUnitDimension, 0.0 );
-
-        return unitDimension;
-    }
-};
-template<>
-struct MacroWeighted<superconfig>
-{
-    // represented by (1) or (weighted) ions???
-    static bool get()
-    {
-        return false;
-    }
-};
-template<>
-struct WeightingPower<superconfig>
-{
-    static float_64 get()
-    {
-        return 1.0;
-    }
-};
-
-} // namespace traits
+    namespace traits
+    {
+        template<typename T_Type>
+        struct Unit<position<T_Type>>
+        {
+            static std::vector<double> get()
+            {
+                std::vector<double> unit(simDim);
+                /* in-cell position needs two transformations to get to SI:
+                   in-cell [0;1) -> dimensionless scaling to grid -> SI
+                */
+                for(uint32_t i = 0; i < simDim; ++i)
+                    unit[i] = cellSize[i] * UNIT_LENGTH;
+
+                return unit;
+            }
+        };
+        template<typename T_Type>
+        struct UnitDimension<position<T_Type>>
+        {
+            static std::vector<float_64> get()
+            {
+                /* L, M, T, I, theta, N, J
+                 *
+                 * position is in meter: m
+                 *   -> L
+                 */
+                std::vector<float_64> unitDimension(NUnitDimension, 0.0);
+                unitDimension.at(SIBaseUnits::length) = 1.0;
+
+                return unitDimension;
+            }
+        };
+        template<typename T_Type>
+        struct MacroWeighted<position<T_Type>>
+        {
+            // the position is identical and can not be scaled by weightings
+            static bool get()
+            {
+                return false;
+            }
+        };
+        template<typename T_Type>
+        struct WeightingPower<position<T_Type>>
+        {
+            // x * weighting^0 == x: same for real and macro particle
+            static float_64 get()
+            {
+                return 0.0;
+            }
+        };
+
+        template<>
+        struct Unit<radiationMask>
+        {
+            // unitless and not scaled by a factor: by convention 1.0
+            static std::vector<double> get()
+            {
+                std::vector<double> unit(1, 1.0);
+                return unit;
+            }
+        };
+        template<>
+        struct UnitDimension<radiationMask>
+        {
+            static std::vector<float_64> get()
+            {
+                // radiationMask is unitless
+                std::vector<float_64> unitDimension(NUnitDimension, 0.0);
+
+                return unitDimension;
+            }
+        };
+        template<>
+        struct MacroWeighted<radiationMask>
+        {
+            // identical and can not be scaled by weightings
+            static bool get()
+            {
+                return false;
+            }
+        };
+        template<>
+        struct WeightingPower<radiationMask>
+        {
+            // flag * weighting^0 == flag: same for real and macro particle
+            static float_64 get()
+            {
+                return 0.0;
+            }
+        };
+
+        template<>
+        struct Unit<transitionRadiationMask>
+        {
+            // unitless and not scaled by a factor: by convention 1.0
+            static std::vector<double> get()
+            {
+                std::vector<double> unit(1, 1.0);
+                return unit;
+            }
+        };
+        template<>
+        struct UnitDimension<transitionRadiationMask>
+        {
+            static std::vector<float_64> get()
+            {
+                // transitionRadiationMask is unitless
+                std::vector<float_64> unitDimension(NUnitDimension, 0.0);
+
+                return unitDimension;
+            }
+        };
+        template<>
+        struct MacroWeighted<transitionRadiationMask>
+        {
+            // identical and can not be scaled by weightings
+            static bool get()
+            {
+                return false;
+            }
+        };
+        template<>
+        struct WeightingPower<transitionRadiationMask>
+        {
+            // flag * weighting^0 == flag: same for real and macro particle
+            static float_64 get()
+            {
+                return 0.0;
+            }
+        };
+
+        template<>
+        struct Unit<momentum>
+        {
+            static std::vector<double> get()
+            {
+                const uint32_t components = GetNComponents<typename momentum::type>::value;
+
+                std::vector<double> unit(components);
+                for(uint32_t i = 0; i < components; ++i)
+                    unit[i] = UNIT_MASS * UNIT_SPEED;
+
+                return unit;
+            }
+        };
+        template<>
+        struct UnitDimension<momentum>
+        {
+            static std::vector<float_64> get()
+            {
+                /* L, M, T, I, theta, N, J
+                 *
+                 * momentum is in mass times speed: kg * m / s
+                 *   -> L * M * T^-1
+                 */
+                std::vector<float_64> unitDimension(NUnitDimension, 0.0);
+                unitDimension.at(SIBaseUnits::length) = 1.0;
+                unitDimension.at(SIBaseUnits::mass) = 1.0;
+                unitDimension.at(SIBaseUnits::time) = -1.0;
+
+                return unitDimension;
+            }
+        };
+        template<>
+        struct MacroWeighted<momentum>
+        {
+            // we currently push macro particle momentums
+            static bool get()
+            {
+                return true;
+            }
+        };
+        template<>
+        struct WeightingPower<momentum>
+        {
+            /* px * weighting^1 == px * weighting: momentum is contributed linearly
+             * in the macro-particle ensemble
+             */
+            static float_64 get()
+            {
+                return 1.0;
+            }
+        };
+
+        template<>
+        struct Unit<momentumPrev1>
+        {
+            static std::vector<double> get()
+            {
+                const uint32_t components = GetNComponents<typename momentumPrev1::type>::value;
+
+                std::vector<double> unit(components);
+                for(uint32_t i = 0; i < components; ++i)
+                    unit[i] = UNIT_MASS * UNIT_SPEED;
+
+                return unit;
+            }
+        };
+        template<>
+        struct UnitDimension<momentumPrev1>
+        {
+            static std::vector<float_64> get()
+            {
+                /* L, M, T, I, theta, N, J
+                 *
+                 * momentum is in mass times speed: kg * m / s
+                 *   -> L * M * T^-1
+                 */
+                std::vector<float_64> unitDimension(NUnitDimension, 0.0);
+                unitDimension.at(SIBaseUnits::length) = 1.0;
+                unitDimension.at(SIBaseUnits::mass) = 1.0;
+                unitDimension.at(SIBaseUnits::time) = -1.0;
+
+                return unitDimension;
+            }
+        };
+        template<>
+        struct MacroWeighted<momentumPrev1>
+        {
+            // we currently push macro particle momentums
+            static bool get()
+            {
+                return true;
+            }
+        };
+        template<>
+        struct WeightingPower<momentumPrev1>
+        {
+            /* px_real * weighting^1 == px_macro * weighting: momentum is contributed
+             * linearly in the macro-particle ensemble
+             */
+            static float_64 get()
+            {
+                return 1.0;
+            }
+        };
+
+        template<>
+        struct Unit<weighting>
+        {
+            // unitless and not scaled by a factor: 1.0
+            static std::vector<double> get()
+            {
+                std::vector<double> unit(1, 1.0);
+                return unit;
+            }
+        };
+        template<>
+        struct UnitDimension<weighting>
+        {
+            static std::vector<float_64> get()
+            {
+                // weighting is unitless
+                std::vector<float_64> unitDimension(NUnitDimension, 0.0);
+
+                return unitDimension;
+            }
+        };
+        template<>
+        struct MacroWeighted<weighting>
+        {
+            // the weighting attribute is an attribute of the macro particle
+            static bool get()
+            {
+                return true;
+            }
+        };
+        template<>
+        struct WeightingPower<weighting>
+        {
+            /* 1 * weighting^1 == weighting: real particles contribute linearly
+             * to the macro particle weighting
+             */
+            static float_64 get()
+            {
+                return 1.0;
+            }
+        };
+
+
+        template<>
+        struct Unit<voronoiCellId>
+        {
+            // unitless and not scaled by a factor: by convention 1.0
+            static std::vector<double> get()
+            {
+                std::vector<double> unit(1, 1.0);
+                return unit;
+            }
+        };
+        template<>
+        struct UnitDimension<voronoiCellId>
+        {
+            static std::vector<float_64> get()
+            {
+                // voronoiCellId is unitless
+                std::vector<float_64> unitDimension(NUnitDimension, 0.0);
+
+                return unitDimension;
+            }
+        };
+        template<>
+        struct MacroWeighted<voronoiCellId>
+        {
+            // the voronoiCellId attribute is not a physical parameter
+            static bool get()
+            {
+                return false;
+            }
+        };
+        template<>
+        struct WeightingPower<voronoiCellId>
+        {
+            // the voronoiCellId attribute is not a physical parameter
+            static float_64 get()
+            {
+                return 0.0;
+            }
+        };
+
+        template<>
+        struct Unit<probeE>
+        {
+            static std::vector<double> get()
+            {
+                uint32_t const components = 3u;
+
+                std::vector<double> const unit(components, UNIT_EFIELD);
+
+                return unit;
+            }
+        };
+        template<>
+        struct UnitDimension<probeE>
+        {
+            static std::vector<float_64> get()
+            {
+                /* L, M, T, I, theta, N, J
+                 *
+                 * E is in volts per meter: V / m = kg * m / (A * s^3)
+                 *   -> L * M * T^-3 * I^-1
+                 */
+                std::vector<float_64> unitDimension(7, 0.0);
+                unitDimension.at(SIBaseUnits::length) = 1.0;
+                unitDimension.at(SIBaseUnits::mass) = 1.0;
+                unitDimension.at(SIBaseUnits::time) = -3.0;
+                unitDimension.at(SIBaseUnits::electricCurrent) = -1.0;
+
+                return unitDimension;
+            }
+        };
+        template<>
+        struct MacroWeighted<probeE>
+        {
+            static bool get()
+            {
+                return false;
+            }
+        };
+        template<>
+        struct WeightingPower<probeE>
+        {
+            // local electric fields do not scale with weighting
+            static float_64 get()
+            {
+                return 0.0;
+            }
+        };
+
+        template<>
+        struct Unit<probeB>
+        {
+            static std::vector<double> get()
+            {
+                uint32_t const components = 3u;
+
+                std::vector<double> const unit(components, UNIT_BFIELD);
+
+                return unit;
+            }
+        };
+        template<>
+        struct UnitDimension<probeB>
+        {
+            static std::vector<float_64> get()
+            {
+                /* L, M, T, I, theta, N, J
+                 *
+                 * B is in Tesla : kg / (A * s^2)
+                 *   -> M * T^-2 * I^-1
+                 */
+                std::vector<float_64> unitDimension(7, 0.0);
+                unitDimension.at(SIBaseUnits::mass) = 1.0;
+                unitDimension.at(SIBaseUnits::time) = -2.0;
+                unitDimension.at(SIBaseUnits::electricCurrent) = -1.0;
+
+                return unitDimension;
+            }
+        };
+        template<>
+        struct MacroWeighted<probeB>
+        {
+            static bool get()
+            {
+                return false;
+            }
+        };
+        template<>
+        struct WeightingPower<probeB>
+        {
+            // local magnetic fields do not scale with weighting
+            static float_64 get()
+            {
+                return 0.0;
+            }
+        };
+
+        template<>
+        struct Unit<particleId>
+        {
+            // unitless and not scaled by a factor: by convention 1.0
+            static std::vector<double> get()
+            {
+                std::vector<double> unit(1, 1.0);
+                return unit;
+            }
+        };
+        template<>
+        struct UnitDimension<particleId>
+        {
+            static std::vector<float_64> get()
+            {
+                // unitless
+                return std::vector<float_64>(NUnitDimension, 0.0);
+            }
+        };
+        template<>
+        struct MacroWeighted<particleId>
+        {
+            // we can only follow macro particles via ids
+            static bool get()
+            {
+                return false;
+            }
+        };
+        template<>
+        struct WeightingPower<particleId>
+        {
+            // particle ids do not scale with weighting
+            static float_64 get()
+            {
+                return 0.0;
+            }
+        };
+
+        template<>
+        struct Unit<totalCellIdx>
+        {
+            /* unitless index and not scaled by a factor: by convention 1.0 */
+            static std::vector<double> get()
+            {
+                std::vector<double> unit(simDim, 1.0);
+                return unit;
+            }
+        };
+        template<>
+        struct UnitDimension<totalCellIdx>
+        {
+            static std::vector<float_64> get()
+            {
+                /* totalCellIdx is a cell index and therefore unitless
+                 */
+                std::vector<float_64> unitDimension(NUnitDimension, 0.0);
+
+                return unitDimension;
+            }
+        };
+        template<>
+        struct MacroWeighted<totalCellIdx>
+        {
+            // the cell idx is identical and can not be scaled by weightings
+            static bool get()
+            {
+                return false;
+            }
+        };
+        template<>
+        struct WeightingPower<totalCellIdx>
+        {
+            // idx * weighting^0 == idx: same for real and macro particle
+            static float_64 get()
+            {
+                return 0.0;
+            }
+        };
+
+        template<>
+        struct Unit<boundElectrons>
+        {
+            // unitless and not scaled by a factor: 1.0
+            static std::vector<double> get()
+            {
+                std::vector<double> unit(1, 1.0);
+                return unit;
+            }
+        };
+        template<>
+        struct UnitDimension<boundElectrons>
+        {
+            static std::vector<float_64> get()
+            {
+                // boundElectrons is unitless
+                std::vector<float_64> unitDimension(NUnitDimension, 0.0);
+
+                return unitDimension;
+            }
+        };
+        template<>
+        struct MacroWeighted<boundElectrons>
+        {
+            // bound electrons are counted for a single real ion
+            static bool get()
+            {
+                return false;
+            }
+        };
+        template<>
+        struct WeightingPower<boundElectrons>
+        {
+            /* #e-_real * weighting^1 == #e-_macro: bound electrons are contributed
+             * linearly from the underlying real particles
+             */
+            static float_64 get()
+            {
+                return 1.0;
+            }
+        };
+
+        template<>
+        struct Unit<superconfig>
+        {
+            // unitless and not scaled by a factor: 1.0
+            static std::vector<double> get()
+            {
+                return std::vector<double>(picongpu::flylite::populations, 1.0);
+            }
+        };
+        template<>
+        struct UnitDimension<superconfig>
+        {
+            static std::vector<float_64> get()
+            {
+                // superconfig is unitless
+                std::vector<float_64> unitDimension(NUnitDimension, 0.0);
+
+                return unitDimension;
+            }
+        };
+        template<>
+        struct MacroWeighted<superconfig>
+        {
+            // represented by (1) or (weighted) ions???
+            static bool get()
+            {
+                return false;
+            }
+        };
+        template<>
+        struct WeightingPower<superconfig>
+        {
+            static float_64 get()
+            {
+                return 1.0;
+            }
+        };
+
+    } // namespace traits
 } // namespace picongpu
diff --git a/include/picongpu/unitless/speciesConstants.unitless b/include/picongpu/unitless/speciesConstants.unitless
index c7c8c631f4..659da93719 100644
--- a/include/picongpu/unitless/speciesConstants.unitless
+++ b/include/picongpu/unitless/speciesConstants.unitless
@@ -1,4 +1,4 @@
-/* Copyright 2015-2020 Rene Widera
+/* Copyright 2015-2021 Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -22,10 +22,9 @@
 
 namespace picongpu
 {
-
     //! Charge of base particle
-    constexpr float_X BASE_CHARGE = (float_X) (SI::BASE_CHARGE_SI / UNIT_CHARGE);
+    constexpr float_X BASE_CHARGE = (float_X)(SI::BASE_CHARGE_SI / UNIT_CHARGE);
     //! Mass of base particle
-    constexpr float_X BASE_MASS = (float_X) (SI::BASE_MASS_SI / UNIT_MASS);
+    constexpr float_X BASE_MASS = (float_X)(SI::BASE_MASS_SI / UNIT_MASS);
 
-} //namespace picongpu
+} // namespace picongpu
diff --git a/include/picongpu/unitless/speciesDefinition.unitless b/include/picongpu/unitless/speciesDefinition.unitless
index a385dcb466..da3ae56677 100644
--- a/include/picongpu/unitless/speciesDefinition.unitless
+++ b/include/picongpu/unitless/speciesDefinition.unitless
@@ -1,4 +1,4 @@
-/* Copyright 2014-2020 Rene Widera
+/* Copyright 2014-2021 Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -32,48 +32,39 @@
 
 namespace picongpu
 {
-namespace traits
-{
-namespace frame
-{
+    namespace traits
+    {
+        namespace frame
+        {
+            /** default `getMass()` specialization
+             *
+             * - the default mass is `BASE_MASS * massRatio<>`
+             * - massRatio<> is the user defined ratio which is pinned as flag to a species
+             */
+            template<typename T_Frame>
+            HDINLINE float_X getMass()
+            {
+                using MassRatioValue =
+                    typename pmacc::traits::Resolve<typename GetFlagType<T_Frame, massRatio<>>::type>::type;
 
-/** default `getMass()` specialization
- *
- * - the default mass is `BASE_MASS * massRatio<>`
- * - massRatio<> is the user defined ratio which is pinned as flag to a species
- */
-template<typename T_Frame>
-HDINLINE float_X getMass()
-{
-    using MassRatioValue = typename pmacc::traits::Resolve<
-        typename GetFlagType<
-            T_Frame,
-            massRatio<>
-        >::type
-    >::type;
+                return BASE_MASS * MassRatioValue::getValue();
+            }
 
-    return BASE_MASS * MassRatioValue::getValue();
-};
 
+            /** default `getCharge()` specialization
+             *
+             * - the default charge is `BASE_CHARGE * chargeRatio<>`
+             * - chargeRatio<> is the user defined ratio which is pinned as flag to a species
+             */
+            template<typename T_Frame>
+            HDINLINE float_X getCharge()
+            {
+                using ChargeRatioValue =
+                    typename pmacc::traits::Resolve<typename GetFlagType<T_Frame, chargeRatio<>>::type>::type;
 
-/** default `getCharge()` specialization
- *
- * - the default charge is `BASE_CHARGE * chargeRatio<>`
- * - chargeRatio<> is the user defined ratio which is pinned as flag to a species
- */
-template<typename T_Frame>
-HDINLINE float_X getCharge()
-{
-    using ChargeRatioValue = typename pmacc::traits::Resolve<
-        typename GetFlagType<
-            T_Frame,
-            chargeRatio<>
-        >::type
-    >::type;
-
-    return BASE_CHARGE * ChargeRatioValue::getValue();
-};
+                return BASE_CHARGE * ChargeRatioValue::getValue();
+            }
 
-} // namespace frame
-} // namespace traits
+        } // namespace frame
+    } // namespace traits
 } // namespace picongpu
diff --git a/include/picongpu/unitless/speciesInitialization.unitless b/include/picongpu/unitless/speciesInitialization.unitless
index 948ca2a390..3077b32226 100644
--- a/include/picongpu/unitless/speciesInitialization.unitless
+++ b/include/picongpu/unitless/speciesInitialization.unitless
@@ -1,4 +1,4 @@
-/* Copyright 2014-2020 Rene Widera
+/* Copyright 2014-2021 Rene Widera
  *
  * This file is part of PIConGPU.
  *
diff --git a/include/picongpu/unitless/starter.unitless b/include/picongpu/unitless/starter.unitless
index b4cffd2a07..d8214462c4 100644
--- a/include/picongpu/unitless/starter.unitless
+++ b/include/picongpu/unitless/starter.unitless
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Rene Widera
+/* Copyright 2013-2021 Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -21,25 +21,19 @@
 
 #include "picongpu/initialization/InitialiserController.hpp"
 #include "picongpu/plugins/PluginController.hpp"
-#include "picongpu/simulation/control/MySimulation.hpp"
+#include "picongpu/simulation/control/Simulation.hpp"
 #include "picongpu/simulation/control/SimulationStarter.hpp"
 
 
 namespace picongpu
 {
-
     namespace defaultPIConGPU
     {
         /* Define a starter for the simulation with the name "SimStarter"
          *
          * etc.: using SimStarter = MyOwnStarterClass;
          */
-        using SimStarter = ::picongpu::SimulationStarter<
-            ::picongpu::InitialiserController,
-            ::picongpu::PluginController,
-            ::picongpu::MySimulation
-        >;
-    }
-}
-
-
+        using SimStarter = ::picongpu::
+            SimulationStarter<::picongpu::InitialiserController, ::picongpu::PluginController, ::picongpu::Simulation>;
+    } // namespace defaultPIConGPU
+} // namespace picongpu
diff --git a/include/picongpu/unitless/synchrotronPhotons.unitless b/include/picongpu/unitless/synchrotronPhotons.unitless
index 7be2a760c4..beabc3277a 100644
--- a/include/picongpu/unitless/synchrotronPhotons.unitless
+++ b/include/picongpu/unitless/synchrotronPhotons.unitless
@@ -1,4 +1,4 @@
-/* Copyright 2015-2020 Heiko Burau
+/* Copyright 2015-2021 Heiko Burau
  *
  * This file is part of PIConGPU.
  *
@@ -24,24 +24,23 @@
 
 namespace picongpu
 {
-namespace particles
-{
-namespace synchrotronPhotons
-{
-
-/** Sample point stepping */
-constexpr float_64 SYNC_FUNCS_STEP_WIDTH =
-    SYNC_FUNCS_CUTOFF / static_cast<float_64>(SYNC_FUNCS_NUM_SAMPLES - 1u);
-
-/** In the definition of the first synchrotron function the bessel function is integrated
- * up to infinity but in fact it is sufficient to integrate up to this constant. */
-constexpr float_64 SYNC_FUNCS_F1_INTEGRAL_BOUND = 50.0;
-
-constexpr float_X SOFT_PHOTONS_CUTOFF_MOM = static_cast<float_X>(
-    HBAR * pmacc::algorithms::math::Pi<float_X>::doubleValue / SOFT_PHOTONS_CUTOFF_RATIO / DELTA_T / SPEED_OF_LIGHT);
-
-} // namespace synchrotronPhotons
-} // namespace particles
+    namespace particles
+    {
+        namespace synchrotronPhotons
+        {
+            /** Sample point stepping */
+            constexpr float_64 SYNC_FUNCS_STEP_WIDTH
+                = SYNC_FUNCS_CUTOFF / static_cast<float_64>(SYNC_FUNCS_NUM_SAMPLES - 1u);
+
+            /** In the definition of the first synchrotron function the Bessel function is integrated
+             * up to infinity but in fact it is sufficient to integrate up to this constant. */
+            constexpr float_64 SYNC_FUNCS_F1_INTEGRAL_BOUND = 50.0;
+
+            constexpr float_X SOFT_PHOTONS_CUTOFF_MOM = static_cast<float_X>(
+                HBAR * pmacc::math::Pi<float_X>::doubleValue / SOFT_PHOTONS_CUTOFF_RATIO / DELTA_T / SPEED_OF_LIGHT);
+
+        } // namespace synchrotronPhotons
+    } // namespace particles
 } // namespace picongpu
 
 #include "picongpu/particles/synchrotronPhotons/PhotonCreator.hpp"
diff --git a/include/picongpu/unitless/transitionRadiation.unitless b/include/picongpu/unitless/transitionRadiation.unitless
index b834b241f9..d6612489b4 100644
--- a/include/picongpu/unitless/transitionRadiation.unitless
+++ b/include/picongpu/unitless/transitionRadiation.unitless
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Rene Widera, Richard Pausch, Finn-Ole Carstens
+/* Copyright 2013-2021 Rene Widera, Richard Pausch, Finn-Ole Carstens
  *
  * This file is part of PIConGPU.
  *
@@ -23,46 +23,50 @@
 
 namespace picongpu
 {
-namespace plugins
-{
-namespace transitionRadiation
-{
-//! units for linear frequencies distribution for transition radiation plugin
-namespace linearFrequencies
-{
-    constexpr float_X omegaMin = SI::omegaMin*UNIT_TIME;
-    constexpr float_X omegaMax = SI::omegaMax*UNIT_TIME;
-    constexpr float_X deltaOmega = (float_X) ((omegaMax - omegaMin) / (float_X) (nOmega - 1)); // difference beween two omega
+    namespace plugins
+    {
+        namespace transitionRadiation
+        {
+            //! units for linear frequencies distribution for transition radiation plugin
+            namespace linearFrequencies
+            {
+                constexpr float_X omegaMin = SI::omegaMin * UNIT_TIME;
+                constexpr float_X omegaMax = SI::omegaMax * UNIT_TIME;
+                constexpr float_X deltaOmega
+                    = (float_X)((omegaMax - omegaMin) / (float_X)(nOmega - 1)); // difference between two omega
 
-    constexpr unsigned int blocksizeOmega = pmacc::math::CT::volume<typename MappingDesc::SuperCellSize>::type::value;
-    constexpr unsigned int gridsizeOmega = nOmega / blocksizeOmega; // size of grid (dim: x); radiation
-}
+                constexpr unsigned int blocksizeOmega
+                    = pmacc::math::CT::volume<typename MappingDesc::SuperCellSize>::type::value;
+                constexpr unsigned int gridsizeOmega = nOmega / blocksizeOmega; // size of grid (dim: x); radiation
+            } // namespace linearFrequencies
 
-//! units for logarithmic frequencies distribution for transition radiation plugin
-namespace logFrequencies
-{
-    constexpr float_X omegaMin = (SI::omegaMin*UNIT_TIME);
-    constexpr float_X omegaMax = (SI::omegaMax*UNIT_TIME);
+            //! units for logarithmic frequencies distribution for transition radiation plugin
+            namespace logFrequencies
+            {
+                constexpr float_X omegaMin = (SI::omegaMin * UNIT_TIME);
+                constexpr float_X omegaMax = (SI::omegaMax * UNIT_TIME);
 
-    constexpr unsigned int blocksizeOmega = pmacc::math::CT::volume<typename MappingDesc::SuperCellSize>::type::value;
-    constexpr unsigned int gridsizeOmega = nOmega / blocksizeOmega; // size of grid (dim: x); radiation
-}
+                constexpr unsigned int blocksizeOmega
+                    = pmacc::math::CT::volume<typename MappingDesc::SuperCellSize>::type::value;
+                constexpr unsigned int gridsizeOmega = nOmega / blocksizeOmega; // size of grid (dim: x); radiation
+            } // namespace logFrequencies
 
-//! units for frequencies from list for transition radiation calculation
-namespace listFrequencies
-{
-    constexpr unsigned int blocksizeOmega = pmacc::math::CT::volume<typename MappingDesc::SuperCellSize>::type::value;
-    constexpr unsigned int gridsizeOmega = nOmega / blocksizeOmega; // size of grid (dim: x); radiation
-}
+            //! units for frequencies from list for transition radiation calculation
+            namespace listFrequencies
+            {
+                constexpr unsigned int blocksizeOmega
+                    = pmacc::math::CT::volume<typename MappingDesc::SuperCellSize>::type::value;
+                constexpr unsigned int gridsizeOmega = nOmega / blocksizeOmega; // size of grid (dim: x); radiation
+            } // namespace listFrequencies
 
-//! unit for foil position
-namespace parameters
-{
-    constexpr float_X foilPosition = SI::foilPosition / UNIT_LENGTH;
-}
+            //! unit for foil position
+            namespace parameters
+            {
+                constexpr float_X foilPosition = SI::foilPosition / UNIT_LENGTH;
+            }
 
-} // namespace transitionRadiation
-} // namespace plugins
+        } // namespace transitionRadiation
+    } // namespace plugins
 } // namespace picongpu
 
 #include "picongpu/plugins/transitionRadiation/frequencies/LinearFrequencies.hpp"
diff --git a/include/picongpu/version.hpp b/include/picongpu/version.hpp
index dfa6a3d6b5..4543da54e1 100644
--- a/include/picongpu/version.hpp
+++ b/include/picongpu/version.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2015-2020 Axel Huebl
+/* Copyright 2015-2021 Axel Huebl
  *
  * This file is part of PIConGPU.
  *
@@ -20,6 +20,6 @@
 #pragma once
 
 #define PICONGPU_VERSION_MAJOR 0
-#define PICONGPU_VERSION_MINOR 5
+#define PICONGPU_VERSION_MINOR 6
 #define PICONGPU_VERSION_PATCH 0
-#define PICONGPU_VERSION_LABEL ""
+#define PICONGPU_VERSION_LABEL "dev"
diff --git a/include/picongpu/versionFormat.cpp b/include/picongpu/versionFormat.cpp
index 235f569a06..ea11b755a7 100644
--- a/include/picongpu/versionFormat.cpp
+++ b/include/picongpu/versionFormat.cpp
@@ -1,4 +1,4 @@
-/* Copyright 2015-2020 Axel Huebl
+/* Copyright 2015-2021 Axel Huebl, Franz Poeschel
  *
  * This file is part of PIConGPU.
  *
@@ -17,6 +17,7 @@
  * If not, see <http://www.gnu.org/licenses/>.
  */
 
+#include <pmacc/boost_workaround.hpp>
 #include "picongpu/versionFormat.hpp"
 
 #include <boost/version.hpp>
@@ -25,18 +26,21 @@
 #include <boost/preprocessor/stringize.hpp>
 
 #ifdef __CUDACC_VER_MAJOR__
-#include <cuda.h>
-#include <mallocMC/mallocMC.hpp>
+#    include <cuda.h>
+#    include <mallocMC/mallocMC.hpp>
 #endif
 #include <mpi.h>
-#if( ENABLE_HDF5 == 1 )
-#   include <splash/splash.h>
+#if(ENABLE_HDF5 == 1)
+#    include <splash/splash.h>
 #endif
-#if( ENABLE_ADIOS == 1 )
-#   include <adios.h>
+#if(ENABLE_ADIOS == 1)
+#    include <adios.h>
 #endif
-#if( PIC_ENABLE_PNG == 1 )
-#   include <pngwriter.h>
+#if(PIC_ENABLE_PNG == 1)
+#    include <pngwriter.h>
+#endif
+#if(ENABLE_OPENPMD == 1)
+#    include <openPMD/openPMD.hpp>
 #endif
 
 #include <sstream>
@@ -44,16 +48,13 @@
 
 namespace picongpu
 {
-    std::list< std::string >
-    getSoftwareVersions( std::ostream & cliText )
+    std::list<std::string> getSoftwareVersions(std::ostream& cliText)
     {
-        std::string const versionNotFound( "NOTFOUND" );
+        std::string const versionNotFound("NOTFOUND");
 
         std::stringstream picongpu;
-        picongpu << PICONGPU_VERSION_MAJOR << "."
-                 << PICONGPU_VERSION_MINOR << "."
-                 << PICONGPU_VERSION_PATCH;
-        if( std::string( PICONGPU_VERSION_LABEL ).size() > 0 )
+        picongpu << PICONGPU_VERSION_MAJOR << "." << PICONGPU_VERSION_MINOR << "." << PICONGPU_VERSION_PATCH;
+        if(std::string(PICONGPU_VERSION_LABEL).size() > 0)
             picongpu << "-" << PICONGPU_VERSION_LABEL;
 
         std::stringstream buildType;
@@ -74,32 +75,25 @@ namespace picongpu
 
 #ifdef __CUDACC_VER_MAJOR__
         std::stringstream cuda;
-        cuda << __CUDACC_VER_MAJOR__ << "."
-             << __CUDACC_VER_MINOR__ << "."
-             << __CUDACC_VER_BUILD__;
+        cuda << __CUDACC_VER_MAJOR__ << "." << __CUDACC_VER_MINOR__ << "." << __CUDACC_VER_BUILD__;
 
         std::stringstream mallocMC;
-        mallocMC << MALLOCMC_VERSION_MAJOR << "."
-                 << MALLOCMC_VERSION_MINOR << "."
-                 << MALLOCMC_VERSION_PATCH;
+        mallocMC << MALLOCMC_VERSION_MAJOR << "." << MALLOCMC_VERSION_MINOR << "." << MALLOCMC_VERSION_PATCH;
 #endif
 
         std::stringstream boost;
-        boost << int(BOOST_VERSION / 100000) << "."
-              << int(BOOST_VERSION / 100 % 1000) << "."
+        boost << int(BOOST_VERSION / 100000) << "." << int(BOOST_VERSION / 100 % 1000) << "."
               << int(BOOST_VERSION % 100);
 
         std::stringstream mpiStandard;
         std::stringstream mpiFlavor;
         std::stringstream mpiFlavorVersion;
         mpiStandard << MPI_VERSION << "." << MPI_SUBVERSION;
-#if defined( OMPI_MAJOR_VERSION )
+#if defined(OMPI_MAJOR_VERSION)
         // includes derivates such as Bullx MPI, Sun, ...
         mpiFlavor << "OpenMPI";
-        mpiFlavorVersion << OMPI_MAJOR_VERSION << "."
-                         << OMPI_MINOR_VERSION << "."
-                         << OMPI_RELEASE_VERSION;
-#elif defined( MPICH_VERSION )
+        mpiFlavorVersion << OMPI_MAJOR_VERSION << "." << OMPI_MINOR_VERSION << "." << OMPI_RELEASE_VERSION;
+#elif defined(MPICH_VERSION)
         /* includes MPICH2 and MPICH3 and
          * derivates such as IBM, Cray, MS, Intel, MVAPICH(2), ... */
         mpiFlavor << "MPICH";
@@ -110,43 +104,42 @@ namespace picongpu
 #endif
 
         std::stringstream pngwriter;
-#if( PIC_ENABLE_PNG == 1 )
-        pngwriter << PNGWRITER_VERSION_MAJOR << "."
-                  << PNGWRITER_VERSION_MINOR << "."
-                  << PNGWRITER_VERSION_PATCH;
+#if(PIC_ENABLE_PNG == 1)
+        pngwriter << PNGWRITER_VERSION_MAJOR << "." << PNGWRITER_VERSION_MINOR << "." << PNGWRITER_VERSION_PATCH;
 #else
         pngwriter << versionNotFound;
 #endif
 
         std::stringstream splash;
         std::stringstream splashFormat;
-#if( ENABLE_HDF5 == 1 )
-        splash << SPLASH_VERSION_MAJOR << "."
-               << SPLASH_VERSION_MINOR << "."
-               << SPLASH_VERSION_PATCH;
-        splashFormat << SPLASH_FILE_FORMAT_MAJOR << "."
-                     << SPLASH_FILE_FORMAT_MINOR;
+#if(ENABLE_HDF5 == 1)
+        splash << SPLASH_VERSION_MAJOR << "." << SPLASH_VERSION_MINOR << "." << SPLASH_VERSION_PATCH;
+        splashFormat << SPLASH_FILE_FORMAT_MAJOR << "." << SPLASH_FILE_FORMAT_MINOR;
 #else
         splash << versionNotFound;
         splashFormat << versionNotFound;
 #endif
 
         std::stringstream adios;
-#if( ENABLE_ADIOS == 1 )
+#if(ENABLE_ADIOS == 1)
         adios << ADIOS_VERSION;
 #else
         adios << versionNotFound;
 #endif
 
+#if(ENABLE_OPENPMD == 1)
+        std::string openPMD = openPMD::getVersion();
+#else
+        std::string openPMD = versionNotFound;
+#endif
+
         // CLI Formatting
         cliText << "PIConGPU: " << picongpu.str() << std::endl;
-        cliText << "  Build-Type: " << buildType.str() << std::endl
-                << std::endl;
+        cliText << "  Build-Type: " << buildType.str() << std::endl << std::endl;
         cliText << "Third party:" << std::endl;
         cliText << "  OS:         " << os.str() << std::endl;
         cliText << "  arch:       " << arch.str() << std::endl;
-        cliText << "  CXX:        " << cxx.str()
-                << " (" << cxxVersion.str() << ")" << std::endl;
+        cliText << "  CXX:        " << cxx.str() << " (" << cxxVersion.str() << ")" << std::endl;
         cliText << "  CMake:      " << cmake.str() << std::endl;
 #ifdef __CUDACC_VER_MAJOR__
         cliText << "  CUDA:       " << cuda.str() << std::endl;
@@ -155,32 +148,33 @@ namespace picongpu
         cliText << "  Boost:      " << boost.str() << std::endl;
         cliText << "  MPI:        " << std::endl
                 << "    standard: " << mpiStandard.str() << std::endl
-                << "    flavor:   " << mpiFlavor.str()
-                << " (" << mpiFlavorVersion.str() << ")" << std::endl;
+                << "    flavor:   " << mpiFlavor.str() << " (" << mpiFlavorVersion.str() << ")" << std::endl;
         cliText << "  PNGwriter:  " << pngwriter.str() << std::endl;
-        cliText << "  libSplash:  " << splash.str()
-                << " (Format " << splashFormat.str() << ")" << std::endl;
+        cliText << "  libSplash:  " << splash.str() << " (Format " << splashFormat.str() << ")" << std::endl;
         cliText << "  ADIOS:      " << adios.str() << std::endl;
+        cliText << "  openPMD:    " << openPMD << std::endl;
 
         // Module-like formatting of software only
-        std::list< std::string > software;
-        software.push_back( std::string( "PIConGPU/" ) + picongpu.str() );
-        software.push_back( cxx.str() + std::string( "/" ) + cxxVersion.str() );
-        software.push_back( std::string( "CMake/" ) + cmake.str() );
+        std::list<std::string> software;
+        software.push_back(std::string("PIConGPU/") + picongpu.str());
+        software.push_back(cxx.str() + std::string("/") + cxxVersion.str());
+        software.push_back(std::string("CMake/") + cmake.str());
 #ifdef __CUDACC_VER_MAJOR__
-        software.push_back( std::string( "CUDA/" ) + cuda.str() );
+        software.push_back(std::string("CUDA/") + cuda.str());
 #endif
-        software.push_back( std::string( "Boost/" ) + boost.str() );
-        software.push_back( mpiFlavor.str() + std::string( "/" ) + mpiFlavorVersion.str() );
+        software.push_back(std::string("Boost/") + boost.str());
+        software.push_back(mpiFlavor.str() + std::string("/") + mpiFlavorVersion.str());
 #ifdef __CUDACC_VER_MAJOR__
-        software.push_back( std::string( "mallocMC/" ) + mallocMC.str() );
+        software.push_back(std::string("mallocMC/") + mallocMC.str());
 #endif
-        if( pngwriter.str().compare( versionNotFound ) != 0 )
-            software.push_back( std::string( "PNGwriter/" ) + pngwriter.str() );
-        if( splash.str().compare( versionNotFound ) != 0 )
-            software.push_back( std::string( "libSplash/" ) + splash.str() );
-        if( adios.str().compare( versionNotFound ) != 0 )
-            software.push_back( std::string( "ADIOS/" ) + adios.str() );
+        if(pngwriter.str().compare(versionNotFound) != 0)
+            software.push_back(std::string("PNGwriter/") + pngwriter.str());
+        if(splash.str().compare(versionNotFound) != 0)
+            software.push_back(std::string("libSplash/") + splash.str());
+        if(adios.str().compare(versionNotFound) != 0)
+            software.push_back(std::string("ADIOS/") + adios.str());
+        if(openPMD.compare(versionNotFound) != 0)
+            software.push_back(std::string("openPMD/") + openPMD);
 
         return software;
     }
diff --git a/include/picongpu/versionFormat.hpp b/include/picongpu/versionFormat.hpp
index b33a9c1be0..51f5ee42c4 100644
--- a/include/picongpu/versionFormat.hpp
+++ b/include/picongpu/versionFormat.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2015-2020 Axel Huebl
+/* Copyright 2015-2021 Axel Huebl
  *
  * This file is part of PIConGPU.
  *
@@ -35,6 +35,5 @@ namespace picongpu
      * @param[out] cliText formatted table for output to a command line
      * @return a list of strings in the form software/version
      */
-    std::list< std::string >
-    getSoftwareVersions( std::ostream & cliText );
+    std::list<std::string> getSoftwareVersions(std::ostream& cliText);
 } // namespace picongpu
diff --git a/include/pmacc/CMakeLists.txt b/include/pmacc/CMakeLists.txt
index 5862a8d927..10fa730351 100644
--- a/include/pmacc/CMakeLists.txt
+++ b/include/pmacc/CMakeLists.txt
@@ -1,5 +1,5 @@
 #
-# Copyright 2015-2020 Erik Zenker, Alexander Grund
+# Copyright 2015-2021 Erik Zenker, Alexander Grund
 #
 # This file is part of PMacc.
 #
@@ -24,7 +24,7 @@
 # PMacc tests
 ################################################################################
 
-cmake_minimum_required(VERSION 3.11.4)
+cmake_minimum_required(VERSION 3.15.0)
 project(PMaccTest)
 
 # set helper pathes to find libraries and packages
@@ -53,10 +53,19 @@ endif()
 # Language Flags
 ###############################################################################
 
-# enforce C++11
+# enforce C++14
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
 set(CMAKE_CXX_EXTENSIONS OFF)
-set(CMAKE_CXX_STANDARD 11)
+set(CMAKE_CXX_STANDARD 14)
+
+
+################################################################################
+# Directory of this file.
+################################################################################
+set(PMACC_ROOT_DIR ${CMAKE_CURRENT_LIST_DIR})
+
+# Normalize the path (e.g. remove ../)
+get_filename_component(PMACC_ROOT_DIR "${PMACC_ROOT_DIR}" ABSOLUTE)
 
 ################################################################################
 # PMacc
@@ -69,16 +78,10 @@ add_definitions(${PMacc_DEFINITIONS})
 
 
 ###############################################################################
-# Boost.Test
+# Catch2
 ###############################################################################
 
-find_package(Boost 1.65.1 COMPONENTS unit_test_framework REQUIRED)
-if(TARGET Boost::unit_test_framework)
-    set(LIBS ${LIBS} Boost::boost Boost::unit_test_framework)
-else()
-    include_directories(SYSTEM ${Boost_INCLUDE_DIRS})
-    set(LIBS ${LIBS} ${Boost_LIBRARIES})
-endif()
+add_subdirectory(${PMACC_ROOT_DIR}/../../thirdParty/catch2/catch_main ${CMAKE_BINARY_DIR}/catch2)
 
 
 ################################################################################
@@ -89,13 +92,17 @@ find_package(MPI REQUIRED)
 include_directories(SYSTEM ${MPI_C_INCLUDE_PATH})
 set(LIBS ${LIBS} ${MPI_C_LIBRARIES})
 
+option(USE_MPI_AS_ROOT_USER "add --allow-run-as-root mpiexec used by ctest" OFF)
+
+if(USE_MPI_AS_ROOT_USER)
+    set(MPI_RUNTIME_FLAGS "--allow-run-as-root")
+endif()
 
 ###############################################################################
 # Targets
 ###############################################################################
 
 include_directories(${CMAKE_CURRENT_SOURCE_DIR}/test)
-add_definitions(-DBOOST_TEST_DYN_LINK)
 
 # CTest
 enable_testing()
@@ -108,10 +115,11 @@ foreach(dim 2 3)
         get_filename_component(testCaseFilename ${testCaseFilepath} NAME)
         string(REPLACE "UT.cpp" "" testCase ${testCaseFilename})
         set(testExe "${PROJECT_NAME}-${testCase}-${dim}D")
-        cupla_add_executable(${testExe} ${testCaseFilepath} ${CMAKE_CURRENT_SOURCE_DIR}/test/main.cpp)
+        cupla_add_executable(${testExe} ${testCaseFilepath})
         target_compile_definitions(${testExe} PRIVATE TEST_DIM=${dim})
         target_link_libraries(${testExe} PUBLIC ${LIBS})
-        add_test(NAME "${testCase}-${dim}D" COMMAND mpiexec -n 1 ./${testExe})
+        target_link_libraries(${testExe} PUBLIC CatchMain)
+        add_test(NAME "${testCase}-${dim}D" COMMAND mpiexec ${MPI_RUNTIME_FLAGS} -n 1 ./${testExe})
     endforeach()
     string(REPLACE "-DTEST_DIM=${dim}" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
 endforeach()
diff --git a/include/pmacc/Environment.def b/include/pmacc/Environment.def
index 5670c7a0f4..87b5fc969b 100644
--- a/include/pmacc/Environment.def
+++ b/include/pmacc/Environment.def
@@ -1,4 +1,4 @@
-/* Copyright 2015-2020 Alexander Grund
+/* Copyright 2015-2021 Alexander Grund
  *
  * This file is part of PMacc.
  *
@@ -25,13 +25,12 @@
 
 namespace pmacc
 {
-
-    template< uint32_t T_dim = DIM1 >
+    template<uint32_t T_dim = DIM1>
     class Environment;
 
-namespace detail
-{
-    struct Environment;
+    namespace detail
+    {
+        struct Environment;
 
-} // namespace detail
+    } // namespace detail
 } // namespace pmacc
diff --git a/include/pmacc/Environment.hpp b/include/pmacc/Environment.hpp
index 5bf9f88d47..8695306538 100644
--- a/include/pmacc/Environment.hpp
+++ b/include/pmacc/Environment.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2014-2020 Felix Schmitt, Conrad Schumann,
+/* Copyright 2014-2021 Felix Schmitt, Conrad Schumann,
  *                     Alexander Grund, Axel Huebl
  *
  * This file is part of PMacc.
@@ -43,507 +43,513 @@
 
 namespace pmacc
 {
-
-namespace detail
-{
-    /** collect state variables of the environment context
-     *
-     * This class handle the initialization and finalize of the
-     * MPI context and the selection of the GPU.
-     */
-    class EnvironmentContext
+    namespace detail
     {
+        /** collect state variables of the environment context
+         *
+         * This class handles the initialization and finalization of the
+         * MPI context and the selection of the GPU.
+         */
+        class EnvironmentContext
+        {
+            friend Environment;
 
-        friend Environment;
+            friend pmacc::Environment<DIM1>;
+            friend pmacc::Environment<DIM2>;
+            friend pmacc::Environment<DIM3>;
 
-        friend pmacc::Environment<DIM1>;
-        friend pmacc::Environment<DIM2>;
-        friend pmacc::Environment<DIM3>;
+            EnvironmentContext()
+                : m_isMpiInitialized(false)
+                , m_isDeviceSelected(false)
+                , m_isSubGridDefined(false)
+                , m_isMpiDirectEnabled(false)
+            {
+            }
 
-        EnvironmentContext( ) :
-            m_isMpiInitialized( false ),
-            m_isDeviceSelected( false ),
-            m_isSubGridDefined( false )
-        {
-        }
+            /** initialization state of MPI */
+            bool m_isMpiInitialized;
 
-        /** initialization state of MPI */
-        bool m_isMpiInitialized;
+            /** state if a computing device is selected */
+            bool m_isDeviceSelected;
 
-        /** state if a computing device is selected */
-        bool m_isDeviceSelected;
+            /** state if the SubGrid is defined */
+            bool m_isSubGridDefined;
 
-        /** state if the SubGrid is defined */
-        bool m_isSubGridDefined;
+            /** state shows if MPI direct is activated */
+            bool m_isMpiDirectEnabled;
 
-        /** get the singleton EnvironmentContext
-         *
-         * @return instance of EnvironmentContext
-         */
-        static EnvironmentContext& getInstance()
-        {
-            static EnvironmentContext instance;
-            return instance;
-        }
+            /** get the singleton EnvironmentContext
+             *
+             * @return instance of EnvironmentContext
+             */
+            static EnvironmentContext& getInstance()
+            {
+                static EnvironmentContext instance;
+                return instance;
+            }
 
-        /** state of the MPI context
-         *
-         * @return true if MPI is initialized else false
-         */
-        bool isMpiInitialized()
-        {
-            return m_isMpiInitialized;
-        }
+            /** state of the MPI context
+             *
+             * @return true if MPI is initialized else false
+             */
+            bool isMpiInitialized()
+            {
+                return m_isMpiInitialized;
+            }
 
-        /** is a computing device selected
-         *
-         * @return true if device is selected else false
-         */
-        bool isDeviceSelected()
-        {
-            return m_isDeviceSelected;
-        }
+            /** is a computing device selected
+             *
+             * @return true if device is selected else false
+             */
+            bool isDeviceSelected()
+            {
+                return m_isDeviceSelected;
+            }
+
+            /** is the SubGrid defined
+             *
+             * @return true if SubGrid is defined, else false
+             */
+            bool isSubGridDefined()
+            {
+                return m_isSubGridDefined;
+            }
+
+            /** initialize the environment
+             *
+             * After this call it is allowed to use MPI.
+             */
+            HINLINE void init();
+
+            /** cleanup the environment */
+            HINLINE void finalize();
 
-        /** is the SubGrid defined
+            /** select a computing device
+             *
+             * After this call it is allowed to use the computing device.
+             *
+             * @param deviceNumber number of the device
+             */
+            HINLINE void setDevice(int deviceNumber);
+
+            //! activate MPI direct usage
+            void enableMpiDirect()
+            {
+                m_isMpiDirectEnabled = true;
+            }
+
+            //! query if MPI direct support is activated
+            bool isMpiDirectEnabled() const
+            {
+                return m_isMpiDirectEnabled;
+            }
+        };
+
+        /** PMacc environment
          *
-         * @return true if SubGrid is defined, else false
+         * Get access to all PMacc singleton classes that do not depend on a dimension.
          */
-        bool isSubGridDefined()
+        struct Environment
         {
-            return m_isSubGridDefined;
-        }
+            Environment()
+            {
+            }
 
-        /** initialize the environment
-         *
-         * After this call it is allowed to use MPI.
-         */
-        HINLINE void init();
+            /** cleanup the environment */
+            void finalize()
+            {
+                EnvironmentContext::getInstance().finalize();
+            }
 
-        /** cleanup the environment */
-        HINLINE void finalize();
+            /** get the singleton StreamController
+             *
+             * @return instance of StreamController
+             */
+            pmacc::StreamController& StreamController()
+            {
+                PMACC_ASSERT_MSG(
+                    EnvironmentContext::getInstance().isDeviceSelected(),
+                    "Environment< DIM >::initDevices() must be called before this method!");
+                return StreamController::getInstance();
+            }
 
-        /** select a computing device
-         *
-         * After this call it is allowed to use the computing device.
-         *
-         * @param deviceNumber number of the device
-         */
-        HINLINE void setDevice(int deviceNumber);
+            /** get the singleton Manager
+             *
+             * @return instance of Manager
+             */
+            pmacc::Manager& Manager()
+            {
+                return Manager::getInstance();
+            }
 
-    };
+            /** get the singleton TransactionManager
+             *
+             * @return instance of TransactionManager
+             */
+            pmacc::TransactionManager& TransactionManager() const
+            {
+                PMACC_ASSERT_MSG(
+                    EnvironmentContext::getInstance().isDeviceSelected(),
+                    "Environment< DIM >::initDevices() must be called before this method!");
+                return TransactionManager::getInstance();
+            }
 
-    /** PMacc environment
-     *
-     * Get access to all PMacc singleton classes those not depend on a dimension.
-     */
-    struct Environment
-    {
-        Environment()
-        {
-        }
+            /** get the singleton EnvironmentController
+             *
+             * @return instance of EnvironmentController
+             */
+            pmacc::EnvironmentController& EnvironmentController()
+            {
+                PMACC_ASSERT_MSG(
+                    EnvironmentContext::getInstance().isMpiInitialized(),
+                    "Environment< DIM >::initDevices() must be called before this method!");
+                return EnvironmentController::getInstance();
+            }
 
-        /** cleanup the environment */
-        void finalize()
-        {
-            EnvironmentContext::getInstance().finalize();
-        }
+            /** get the singleton Factory
+             *
+             * @return instance of Factory
+             */
+            pmacc::Factory& Factory()
+            {
+                PMACC_ASSERT_MSG(
+                    EnvironmentContext::getInstance().isMpiInitialized()
+                        && EnvironmentContext::getInstance().isDeviceSelected(),
+                    "Environment< DIM >::initDevices() must be called before this method!");
+                return Factory::getInstance();
+            }
 
-        /** get the singleton StreamController
-         *
-         * @return instance of StreamController
-         */
-        pmacc::StreamController& StreamController()
-        {
-            PMACC_ASSERT_MSG(
-                EnvironmentContext::getInstance().isDeviceSelected(),
-                "Environment< DIM >::initDevices() must be called before this method!"
-            );
-            return StreamController::getInstance();
-        }
+            /** get the singleton EventPool
+             *
+             * @return instance of EventPool
+             */
+            pmacc::EventPool& EventPool()
+            {
+                PMACC_ASSERT_MSG(
+                    EnvironmentContext::getInstance().isDeviceSelected(),
+                    "Environment< DIM >::initDevices() must be called before this method!");
+                return EventPool::getInstance();
+            }
 
-        /** get the singleton Manager
-         *
-         * @return instance of Manager
-         */
-        pmacc::Manager& Manager()
-        {
-            return Manager::getInstance();
-        }
+            /** get the singleton ParticleFactory
+             *
+             * @return instance of ParticleFactory
+             */
+            pmacc::ParticleFactory& ParticleFactory()
+            {
+                return ParticleFactory::getInstance();
+            }
 
-        /** get the singleton TransactionManager
-         *
-         * @return instance of TransactionManager
-         */
-        pmacc::TransactionManager& TransactionManager() const
-        {
-            PMACC_ASSERT_MSG(
-                EnvironmentContext::getInstance().isDeviceSelected(),
-                "Environment< DIM >::initDevices() must be called before this method!"
-            );
-            return TransactionManager::getInstance();
-        }
+            /** get the singleton DataConnector
+             *
+             * @return instance of DataConnector
+             */
+            pmacc::DataConnector& DataConnector()
+            {
+                return DataConnector::getInstance();
+            }
 
-        /** get the singleton EnvironmentController
-         *
-         * @return instance of EnvironmentController
-         */
-        pmacc::EnvironmentController& EnvironmentController()
+            /** get the singleton PluginConnector
+             *
+             * @return instance of PluginConnector
+             */
+            pmacc::PluginConnector& PluginConnector()
+            {
+                return PluginConnector::getInstance();
+            }
+
+            /** get the singleton MemoryInfo
+             *
+             * @return instance of MemoryInfo
+             */
+            nvidia::memory::MemoryInfo& MemoryInfo()
+            {
+                PMACC_ASSERT_MSG(
+                    EnvironmentContext::getInstance().isDeviceSelected(),
+                    "Environment< DIM >::initDevices() must be called before this method!");
+                return nvidia::memory::MemoryInfo::getInstance();
+            }
+
+            /** get the singleton SimulationDescription
+             *
+             * @return instance of SimulationDescription
+             */
+            simulationControl::SimulationDescription& SimulationDescription()
+            {
+                return simulationControl::SimulationDescription::getInstance();
+            }
+        };
+    } // namespace detail
+
+    /** Global Environment singleton for PMacc
+     */
+    template<uint32_t T_dim>
+    class Environment : public detail::Environment
+    {
+    public:
+        void enableMpiDirect()
         {
-            PMACC_ASSERT_MSG(
-                EnvironmentContext::getInstance().isMpiInitialized(),
-                "Environment< DIM >::initDevices() must be called before this method!"
-            );
-            return EnvironmentController::getInstance();
+            detail::EnvironmentContext::getInstance().enableMpiDirect();
         }
 
-        /** get the singleton Factory
-         *
-         * @return instance of Factory
-         */
-        pmacc::Factory& Factory()
+        bool isMpiDirectEnabled() const
         {
-            PMACC_ASSERT_MSG(
-                EnvironmentContext::getInstance().isMpiInitialized() &&
-                EnvironmentContext::getInstance().isDeviceSelected(),
-                "Environment< DIM >::initDevices() must be called before this method!"
-            );
-            return Factory::getInstance();
+            return detail::EnvironmentContext::getInstance().isMpiDirectEnabled();
         }
 
-        /** get the singleton EventPool
+        /** get the singleton GridController
          *
-         * @return instance of EventPool
+         * @return instance of GridController
          */
-        pmacc::EventPool& EventPool()
+        pmacc::GridController<T_dim>& GridController()
         {
             PMACC_ASSERT_MSG(
-                EnvironmentContext::getInstance().isDeviceSelected(),
-                "Environment< DIM >::initDevices() must be called before this method!"
-            );
-            return EventPool::getInstance();
+                detail::EnvironmentContext::getInstance().isMpiInitialized(),
+                "Environment< DIM >::initDevices() must be called before this method!");
+            return pmacc::GridController<T_dim>::getInstance();
         }
 
-        /** get the singleton ParticleFactory
+        /** get the singleton SubGrid
          *
-         * @return instance of ParticleFactory
+         * @return instance of SubGrid
          */
-        pmacc::ParticleFactory& ParticleFactory()
+        pmacc::SubGrid<T_dim>& SubGrid()
         {
-            return ParticleFactory::getInstance();
+            PMACC_ASSERT_MSG(
+                detail::EnvironmentContext::getInstance().isSubGridDefined(),
+                "Environment< DIM >::initGrids() must be called before this method!");
+            return pmacc::SubGrid<T_dim>::getInstance();
         }
 
-        /** get the singleton DataConnector
+        /** get the singleton Filesystem
          *
-         * @return instance of DataConnector
+         * @return instance of Filesystem
          */
-        pmacc::DataConnector& DataConnector()
+        pmacc::Filesystem<T_dim>& Filesystem()
         {
-            return DataConnector::getInstance();
+            return pmacc::Filesystem<T_dim>::getInstance();
         }
 
-        /** get the singleton PluginConnector
+        /** get the singleton Environment< DIM >
          *
-         * @return instance of PluginConnector
+         * @return instance of Environment< DIM >
          */
-        pmacc::PluginConnector& PluginConnector()
+        static Environment<T_dim>& get()
         {
-            return PluginConnector::getInstance();
+            static Environment<T_dim> instance;
+            return instance;
         }
 
-        /** get the singleton MemoryInfo
+        /** create and initialize the environment of PMacc
          *
-         * @return instance of MemoryInfo
-         */
-        nvidia::memory::MemoryInfo& MemoryInfo()
-        {
-            PMACC_ASSERT_MSG(
-                EnvironmentContext::getInstance().isDeviceSelected(),
-                "Environment< DIM >::initDevices() must be called before this method!"
-            );
-            return nvidia::memory::MemoryInfo::getInstance();
-        }
-
-        /** get the singleton SimulationDescription
+         * MPI or device (accelerator) function calls are not allowed before this
+         * method has been called.
          *
-         * @return instance of SimulationDescription
+         * @param devices number of devices per simulation dimension
+         * @param periodic periodicity of each simulation dimension
+         *                 (0 == not periodic, 1 == periodic)
          */
-        simulationControl::SimulationDescription& SimulationDescription()
+        void initDevices(DataSpace<T_dim> devices, DataSpace<T_dim> periodic)
         {
-            return simulationControl::SimulationDescription::getInstance();
-        }
-    };
-} // namespace detail
+            // initialize the MPI context
+            detail::EnvironmentContext::getInstance().init();
 
-/** Global Environment singleton for PMacc
- */
-template< uint32_t T_dim >
-class Environment : public detail::Environment
-{
-public:
+            // create singleton instances
+            GridController().init(devices, periodic);
 
-    /** get the singleton GridController
-     *
-     * @return instance of GridController
-     */
-    pmacc::GridController< T_dim >& GridController()
-    {
-        PMACC_ASSERT_MSG(
-            detail::EnvironmentContext::getInstance().isMpiInitialized(),
-            "Environment< DIM >::initDevices() must be called before this method!"
-        );
-        return pmacc::GridController< T_dim >::getInstance();
-    }
-
-    /** get the singleton SubGrid
-     *
-     * @return instance of SubGrid
-     */
-    pmacc::SubGrid< T_dim >& SubGrid()
-    {
-        PMACC_ASSERT_MSG(
-            detail::EnvironmentContext::getInstance().isSubGridDefined(),
-            "Environment< DIM >::initGrids() must be called before this method!"
-        );
-        return pmacc::SubGrid< T_dim >::getInstance();
-    }
-
-    /** get the singleton Filesystem
-     *
-     * @return instance of Filesystem
-     */
-    pmacc::Filesystem< T_dim >& Filesystem()
-    {
-        return pmacc::Filesystem< T_dim >::getInstance();
-    }
+            EnvironmentController();
 
-    /** get the singleton Environment< DIM >
-     *
-     * @return instance of Environment<DIM >
-     */
-    static Environment< T_dim >& get()
-    {
-        static Environment< T_dim > instance;
-        return instance;
-    }
-
-    /** create and initialize the environment of PMacc
-     *
-     * Usage of MPI or device(accelerator) function calls before this method
-     * are not allowed.
-     *
-     * @param devices number of devices per simulation dimension
-     * @param periodic periodicity each simulation dimension
-     *                 (0 == not periodic, 1 == periodic)
-     */
-    void initDevices(
-        DataSpace< T_dim > devices,
-        DataSpace< T_dim > periodic
-    )
-    {
-        // initialize the MPI context
-        detail::EnvironmentContext::getInstance().init();
-
-        // create singleton instances
-        GridController().init( devices, periodic );
-
-        EnvironmentController();
+            Filesystem();
 
-        Filesystem();
+            detail::EnvironmentContext::getInstance().setDevice(static_cast<int>(GridController().getHostRank()));
 
-        detail::EnvironmentContext::getInstance().setDevice(
-            static_cast<int>( GridController().getHostRank() )
-        );
+            StreamController().activate();
 
-        StreamController().activate();
+            MemoryInfo();
 
-        MemoryInfo();
-
-        TransactionManager();
-
-        SimulationDescription();
-
-    }
-
-    /** initialize the computing domain information of PMacc
-     *
-     * @param globalDomainSize size of the global simulation domain [cells]
-     * @param localDomainSize size of the local simulation domain [cells]
-     * @param localDomainOffset local domain offset [cells]
-     */
-    void initGrids(
-        DataSpace< T_dim > globalDomainSize,
-        DataSpace< T_dim > localDomainSize,
-        DataSpace< T_dim > localDomainOffset
-    )
-    {
-        PMACC_ASSERT_MSG(
-            detail::EnvironmentContext::getInstance().isMpiInitialized(),
-            "Environment< DIM >::initDevices() must be called before this method!"
-        );
+            TransactionManager();
 
-        detail::EnvironmentContext::getInstance().m_isSubGridDefined = true;
-
-        // create singleton instances
-        SubGrid().init(
-            localDomainSize,
-            globalDomainSize,
-            localDomainOffset
-        );
-
-        DataConnector();
-
-        PluginConnector();
-    }
-
-    Environment(const Environment&) = delete;
+            SimulationDescription();
+        }
 
-    Environment& operator=(const Environment&) = delete;
+        /** initialize the computing domain information of PMacc
+         *
+         * @param globalDomainSize size of the global simulation domain [cells]
+         * @param localDomainSize size of the local simulation domain [cells]
+         * @param localDomainOffset local domain offset [cells]
+         */
+        void initGrids(
+            DataSpace<T_dim> globalDomainSize,
+            DataSpace<T_dim> localDomainSize,
+            DataSpace<T_dim> localDomainOffset)
+        {
+            PMACC_ASSERT_MSG(
+                detail::EnvironmentContext::getInstance().isMpiInitialized(),
+                "Environment< DIM >::initDevices() must be called before this method!");
 
-private:
+            detail::EnvironmentContext::getInstance().m_isSubGridDefined = true;
 
-    Environment()
-    {
-    }
+            // create singleton instances
+            SubGrid().init(localDomainSize, globalDomainSize, localDomainOffset);
 
-    ~Environment()
-    {
+            DataConnector();
 
-    }
+            PluginConnector();
+        }
 
-};
+        Environment(const Environment&) = delete;
 
-namespace detail
-{
+        Environment& operator=(const Environment&) = delete;
 
-    void EnvironmentContext::init()
-    {
-        m_isMpiInitialized = true;
-
-        // MPI_Init with NULL is allowed since MPI 2.0
-        MPI_CHECK(MPI_Init(NULL,NULL));
-    }
+    private:
+        Environment()
+        {
+        }
 
-    void EnvironmentContext::finalize()
-    {
-        if( m_isMpiInitialized )
+        ~Environment()
         {
-            pmacc::Environment<>::get().Manager().waitForAllTasks();
-            // Required by scorep for flushing the buffers
-            cudaDeviceSynchronize();
-            m_isMpiInitialized = false;
-            /* Free the MPI context.
-             * The gpu context is freed by the `StreamController`, because
-             * MPI and CUDA are independent.
-             */
-            MPI_CHECK(MPI_Finalize());
         }
-    }
+    };
 
-    void EnvironmentContext::setDevice(int deviceNumber)
+    namespace detail
     {
-        int num_gpus = 0; //number of gpus
-        cudaGetDeviceCount(&num_gpus);
-#if (PMACC_CUDA_ENABLED == 1)
-        //##ERROR handling
-        if (num_gpus < 1) //check if cuda device is found
+        void EnvironmentContext::init()
         {
-            throw std::runtime_error("no CUDA capable devices detected");
-        }
-#endif
+            m_isMpiInitialized = true;
 
-        int maxTries = num_gpus;
-        bool deviceSelectionSuccessful = false;
+            // MPI_Init with NULL is allowed since MPI 2.0
+            MPI_CHECK(MPI_Init(NULL, NULL));
+        }
 
-        cudaError rc;
+        void EnvironmentContext::finalize()
+        {
+            if(m_isMpiInitialized)
+            {
+                pmacc::Environment<>::get().Manager().waitForAllTasks();
+                // Required by scorep for flushing the buffers
+                cuplaDeviceSynchronize();
+                m_isMpiInitialized = false;
+                /* Free the MPI context.
+                 * The gpu context is freed by the `StreamController`, because
+                 * MPI and CUDA are independent.
+                 */
+                MPI_CHECK(MPI_Finalize());
+            }
+        }
 
-        // search the first selectable device in the compute node
-        for (int deviceOffset = 0; deviceOffset < maxTries; ++deviceOffset)
+        void EnvironmentContext::setDevice(int deviceNumber)
         {
-            /* Modulo 'num_gpus' avoids invalid device indices for systems where the environment variable
-             * `CUDA_VISIBLE_DEVICES` is used to pre-select a device.
-             */
-            const int tryDeviceId = (deviceOffset + deviceNumber) % num_gpus;
-
-            log<ggLog::CUDA_RT>("Trying to allocate device %1%.") % tryDeviceId;
-#if (PMACC_CUDA_ENABLED == 1)
-            cudaDeviceProp devProp;
-            CUDA_CHECK((cuplaError_t)cudaGetDeviceProperties(&devProp, tryDeviceId));
-
-            /* If the cuda gpu compute mode is 'default'
-             * (https://docs.nvidia.com/cuda/cuda-c-programming-guide/#compute-modes)
-             * then we try to get a device only once.
-             * The index used to select a device is based on the local MPI rank so
-             * that each rank tries a different device.
-             */
-            if (devProp.computeMode == cudaComputeModeDefault)
+            int num_gpus = 0; // number of gpus
+            cuplaGetDeviceCount(&num_gpus);
+#if(BOOST_LANG_CUDA || BOOST_COMP_HIP)
+            //##ERROR handling
+            if(num_gpus < 1) // check if cupla device is found
             {
-                maxTries = 1;
-                log<ggLog::CUDA_RT>("Device %1% is running in default mode.") % tryDeviceId;
+                throw std::runtime_error("no CUDA capable devices detected");
             }
 #endif
 
-            rc = cudaSetDevice(tryDeviceId);
+            int maxTries = num_gpus;
+            bool deviceSelectionSuccessful = false;
 
-            if(rc == cudaSuccess)
-            {
-               cudaStream_t stream;
-               /* \todo: Check if this workaround is needed
-                *
-                * - since NVIDIA change something in driver cudaSetDevice never
-                * return an error if another process already use the selected
-                * device if gpu compute mode is set "process exclusive"
-                * - create a dummy stream to check if the device is already used by
-                * an other process.
-                * - cudaStreamCreate fails if gpu is already in use
-                */
-               rc = cudaStreamCreate(&stream);
-            }
+            cuplaError rc;
 
-            if (rc == cudaSuccess)
+            // search the first selectable device in the compute node
+            for(int deviceOffset = 0; deviceOffset < maxTries; ++deviceOffset)
             {
-#if (PMACC_CUDA_ENABLED == 1)
-                cudaDeviceProp dprop;
-                CUDA_CHECK((cuplaError_t)cudaGetDeviceProperties(&dprop, tryDeviceId));
-                log<ggLog::CUDA_RT> ("Set device to %1%: %2%") % tryDeviceId % dprop.name;
-                if(cudaErrorSetOnActiveProcess == cudaSetDeviceFlags(cudaDeviceScheduleSpin))
+                /* Modulo 'num_gpus' avoids invalid device indices for systems where the environment variable
+                 * `CUDA_VISIBLE_DEVICES` is used to pre-select a device.
+                 */
+                const int tryDeviceId = (deviceOffset + deviceNumber) % num_gpus;
+
+                log<ggLog::CUDA_RT>("Trying to allocate device %1%.") % tryDeviceId;
+
+#if(BOOST_LANG_CUDA || BOOST_LANG_HIP)
+#    if(BOOST_LANG_CUDA)
+                cudaDeviceProp devProp;
+#    elif(BOOST_LANG_HIP)
+                hipDeviceProp_t devProp;
+#    endif
+
+                CUDA_CHECK((cuplaError_t) ALPAKA_API_PREFIX(GetDeviceProperties)(&devProp, tryDeviceId));
+
+                /* If the cuda gpu compute mode is 'default'
+                 * (https://docs.nvidia.com/cuda/cuda-c-programming-guide/#compute-modes)
+                 * then we try to get a device only once.
+                 * The index used to select a device is based on the local MPI rank so
+                 * that each rank tries a different device.
+                 */
+                if(devProp.computeMode == ALPAKA_API_PREFIX(ComputeModeDefault))
                 {
-                    cudaGetLastError(); //reset all errors
-                    /* - because of cudaStreamCreate was called cudaSetDeviceFlags crashed
-                     * - to set the flags reset the device and set flags again
+                    maxTries = 1;
+                    log<ggLog::CUDA_RT>("Device %1% is running in default mode.") % tryDeviceId;
+                }
+#endif
+
+                rc = cuplaSetDevice(tryDeviceId);
+
+                if(rc == cuplaSuccess)
+                {
+                    cuplaStream_t stream;
+                    /* \todo: Check if this workaround is needed
+                     *
+                     * - since NVIDIA changed something in the driver, cuplaSetDevice never
+                     * returns an error if another process already uses the selected
+                     * device when the gpu compute mode is set to "process exclusive"
+                     * - create a dummy stream to check if the device is already used by
+                     * another process.
+                     * - cuplaStreamCreate fails if the gpu is already in use
                      */
-                    CUDA_CHECK(cudaDeviceReset());
-                    CUDA_CHECK((cuplaError_t)cudaSetDeviceFlags(cudaDeviceScheduleSpin));
+                    rc = cuplaStreamCreate(&stream);
                 }
+
+                if(rc == cuplaSuccess)
+                {
+#if(BOOST_LANG_CUDA || BOOST_LANG_HIP)
+                    CUDA_CHECK((cuplaError_t) ALPAKA_API_PREFIX(GetDeviceProperties)(&devProp, tryDeviceId));
+                    log<ggLog::CUDA_RT>("Set device to %1%: %2%") % tryDeviceId % devProp.name;
+                    if(ALPAKA_API_PREFIX(ErrorSetOnActiveProcess)
+                       == ALPAKA_API_PREFIX(SetDeviceFlags)(ALPAKA_API_PREFIX(DeviceScheduleSpin)))
+                    {
+                        cuplaGetLastError(); // reset all errors
+                        /* - because cuplaStreamCreate was already called, cuplaSetDeviceFlags failed
+                         * - to set the flags, reset the device and set the flags again
+                         */
+                        CUDA_CHECK(cuplaDeviceReset());
+                        CUDA_CHECK(
+                            (cuplaError_t) ALPAKA_API_PREFIX(SetDeviceFlags)(ALPAKA_API_PREFIX(DeviceScheduleSpin)));
+                    }
 #endif
-                CUDA_CHECK(cudaGetLastError());
-                deviceSelectionSuccessful = true;
-                break;
-            }
-            else if (rc == cudaErrorDeviceAlreadyInUse
-#if (PMACC_CUDA_ENABLED == 1)
-                || rc==(cudaError)cudaErrorDevicesUnavailable
+                    CUDA_CHECK(cuplaGetLastError());
+                    deviceSelectionSuccessful = true;
+                    break;
+                }
+                else if(
+                    rc == cuplaErrorDeviceAlreadyInUse
+#if(PMACC_CUDA_ENABLED == 1)
+                    || rc == (cuplaError) cudaErrorDevicesUnavailable
 #endif
-            )
-            {
-                cudaGetLastError(); //reset all errors
-                log<ggLog::CUDA_RT > ("Device %1% already in use, try next.") % tryDeviceId;
-                continue;
+                )
+                {
+                    cuplaGetLastError(); // reset all errors
+                    log<ggLog::CUDA_RT>("Device %1% already in use, try next.") % tryDeviceId;
+                    continue;
+                }
+                else
+                {
+                    CUDA_CHECK(rc); /*error message*/
+                }
             }
-            else
+            if(!deviceSelectionSuccessful)
             {
-                CUDA_CHECK(rc); /*error message*/
+                std::cerr << "Failed to select one of the " << num_gpus << " devices." << std::endl;
+                throw std::runtime_error("Compute device selection failed.");
             }
-        }
-        if(!deviceSelectionSuccessful)
-        {
-            std::cerr << "Failed to select one of the " << num_gpus << " devices." << std::endl;
-            throw std::runtime_error("Compute device selection failed.");
-        }
 
-        m_isDeviceSelected = true;
-    }
+            m_isDeviceSelected = true;
+        }
 
-} // namespace detail
+    } // namespace detail
 } // namespace pmacc
 
 /* No namespace for macro defines */
@@ -559,7 +565,7 @@ namespace detail
  * depended on the opType this method is blocking
  *
  * @param opType place were the operation is running
- *               possible places are: `ITask::TASK_CUDA`, `ITask::TASK_MPI`, `ITask::TASK_HOST`
+ *               possible places are: `ITask::TASK_DEVICE`, `ITask::TASK_MPI`, `ITask::TASK_HOST`
  */
 #define __startOperation(opType) (pmacc::Environment<>::get().TransactionManager().startOperation(opType))
 
@@ -568,7 +574,7 @@ namespace detail
  * depended on the opType this method is blocking
  *
  * @param opType place were the operation is running
- *               possible places are: `ITask::TASK_CUDA`, `ITask::TASK_MPI`, `ITask::TASK_HOST`
+ *               possible places are: `ITask::TASK_DEVICE`, `ITask::TASK_MPI`, `ITask::TASK_HOST`
  */
 #define __getEventStream(opType) (pmacc::Environment<>::get().TransactionManager().getEventStream(opType))
 
diff --git a/include/pmacc/HandleGuardRegion.hpp b/include/pmacc/HandleGuardRegion.hpp
index d3387a7dae..a40eada221 100644
--- a/include/pmacc/HandleGuardRegion.hpp
+++ b/include/pmacc/HandleGuardRegion.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2015-2020 Alexander Grund
+/* Copyright 2015-2021 Alexander Grund
  *
  * This file is part of PMacc.
  *
@@ -21,8 +21,8 @@
 
 #pragma once
 
-namespace pmacc{
-
+namespace pmacc
+{
     /**
      * Property struct that exposes policies for handling data in the guard region
      * Each police must handle both sides of the (possible) exchange:
@@ -38,14 +38,11 @@ namespace pmacc{
      *         to/from any other rank, which is the case for the boundary of the total
      *         volume when non-periodic conditions are used
      */
-    template<
-    class T_HandleExchanged,
-    class T_HandleNotExchanged
-    >
+    template<class T_HandleExchanged, class T_HandleNotExchanged>
     struct HandleGuardRegion
     {
-        typedef T_HandleExchanged    HandleExchanged;
+        typedef T_HandleExchanged HandleExchanged;
         typedef T_HandleNotExchanged HandleNotExchanged;
     };
 
-}  // namespace pmacc
+} // namespace pmacc
diff --git a/include/pmacc/PMaccConfig.cmake b/include/pmacc/PMaccConfig.cmake
index 9d69a854bf..6924256fad 100644
--- a/include/pmacc/PMaccConfig.cmake
+++ b/include/pmacc/PMaccConfig.cmake
@@ -1,4 +1,4 @@
-# Copyright 2015-2020 Erik Zenker, Rene Widera, Axel Huebl
+# Copyright 2015-2021 Erik Zenker, Rene Widera, Axel Huebl
 #
 # This file is part of PMacc.
 #
@@ -29,7 +29,7 @@
 ###############################################################################
 # PMacc
 ###############################################################################
-cmake_minimum_required(VERSION 3.11.4)
+cmake_minimum_required(VERSION 3.15.0)
 
 # set helper pathes to find libraries and packages
 # Add specific hints
@@ -72,10 +72,10 @@ endif()
 # Language Flags
 ###############################################################################
 
-# enforce C++11
+# enforce C++14
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
 set(CMAKE_CXX_EXTENSIONS OFF)
-set(CMAKE_CXX_STANDARD 11)
+set(CMAKE_CXX_STANDARD 14)
 
 
 ###############################################################################
@@ -108,7 +108,7 @@ set_property(CACHE PMACC_ALPAKA_PROVIDER PROPERTY STRINGS "intern;extern")
 mark_as_advanced(PMACC_ALPAKA_PROVIDER)
 
 if(${PMACC_ALPAKA_PROVIDER} STREQUAL "intern")
-    list(INSERT CMAKE_MODULE_PATH 0 "${PMacc_DIR}/../../thirdParty/alpaka")
+    list(INSERT CMAKE_MODULE_PATH 0 "${PMacc_DIR}/../../thirdParty/cupla/alpaka")
 endif()
 
 
@@ -139,12 +139,6 @@ if(
         ON)
 endif()
 
-if(NOT cupla_ALPAKA_PROVIDER)
-    # force cupla to use third party alpaka version
-    set(cupla_ALPAKA_PROVIDER "extern" CACHE STRING "Select which alpaka is used")
-    set(alpaka_DIR "${PMacc_DIR}/../../thirdParty/alpaka" CACHE PATH "path to alpaka")
-endif()
-
 if(${PMACC_CUPLA_PROVIDER} STREQUAL "intern")
     find_package(cupla
         REQUIRED
@@ -349,8 +343,8 @@ if(ALPAKA_ACC_GPU_CUDA_ENABLE)
                             "(Found ${CUDA_VERSION})")
     endif()
     # Newer CUDA releases: probably troublesome, warn at least
-    if(CUDA_VERSION VERSION_GREATER 10.2)
-        message(WARNING "Untested CUDA release >10.2 (Found ${CUDA_VERSION})! "
+    if(CUDA_VERSION VERSION_GREATER 11.2)
+        message(WARNING "Untested CUDA release >11.2 (Found ${CUDA_VERSION})! "
                         "Maybe use a newer PIConGPU?")
     endif()
 endif()
@@ -360,9 +354,13 @@ endif()
 # Find OpenMP
 ################################################################################
 
-find_package(OpenMP)
-if(OPENMP_FOUND)
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
+if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang" AND ALPAKA_ACC_GPU_CUDA_ENABLE AND ALPAKA_CUDA_COMPILER MATCHES "clang")
+    message(WARNING "OpenMP host side acceleration is disabled: CUDA compilation with clang is not supporting OpenMP.")
+else()
+    find_package(OpenMP)
+    if(OPENMP_FOUND)
+        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
+    endif()
 endif()
 
 
@@ -370,13 +368,14 @@ endif()
 # Find mallocMC
 ################################################################################
 
-if(ALPAKA_ACC_GPU_CUDA_ENABLE)
-    find_package(mallocMC 2.3.0 QUIET)
+if(ALPAKA_ACC_GPU_CUDA_ENABLE OR ALPAKA_ACC_GPU_HIP_ENABLE)
+    set(mallocMC_ALPAKA_PROVIDER "extern" CACHE STRING "Select which alpaka is used for mallocMC")
+    find_package(mallocMC 2.5.0 QUIET)
 
     if(NOT mallocMC_FOUND)
         message(STATUS "Using mallocMC from thirdParty/ directory")
         set(MALLOCMC_ROOT "${PMacc_DIR}/../../thirdParty/mallocMC")
-        find_package(mallocMC 2.3.0 REQUIRED)
+        find_package(mallocMC 2.5.0 REQUIRED)
     endif(NOT mallocMC_FOUND)
 
     set(PMacc_INCLUDE_DIRS ${PMacc_INCLUDE_DIRS} ${mallocMC_INCLUDE_DIRS})
diff --git a/include/pmacc/algorithms/GlobalReduce.hpp b/include/pmacc/algorithms/GlobalReduce.hpp
index a551cda9a6..3775b9b820 100644
--- a/include/pmacc/algorithms/GlobalReduce.hpp
+++ b/include/pmacc/algorithms/GlobalReduce.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Rene Widera
+/* Copyright 2013-2021 Axel Huebl, Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -29,57 +29,52 @@
 
 namespace pmacc
 {
-namespace algorithms
-{
-
-/* Reduce values in GPU memory over all MPI instances
- */
-class GlobalReduce
-{
-public:
-
-    GlobalReduce(const uint32_t byte, const uint32_t sharedMemByte = 4 * 1024) : reduce(byte, sharedMemByte)
-    {
-    }
-
-    /* Activate participation for reduce algorithm.
-     * Must called from any mpi process. This function use global blocking mpi calls.
-     * Don't create a instance befor you have set you cuda device!
-     * @param isActive true if mpi rank should be part of reduce operation, else false
-     */
-    void participate(bool isActive)
-    {
-        mpi_reduce.participate(isActive);
-    }
-
-    /* Reduce elements in global gpu memeory
-     *
-     * @param func functor for reduce which takes two arguments, first argument is the source and get the new reduced value.
-     * Functor must specialize the function getMPI_Op.
-     * @param src a class or a pointer where the reduce algorithm can access the value by operator [] (one dimension access)
-     * @param n number of elements to reduce
-     *
-     * @return reduced value (same on every mpi instance)
-     */
-    template<class Functor, typename Src>
-    typename traits::GetValueType<Src>::ValueType operator()(Functor func,
-                                                           Src src,
-                                                           uint32_t n)
+    namespace algorithms
     {
-        typedef typename traits::GetValueType<Src>::ValueType Type;
+        /* Reduce values in GPU memory over all MPI instances
+         */
+        class GlobalReduce
+        {
+        public:
+            GlobalReduce(const uint32_t byte, const uint32_t sharedMemByte = 4 * 1024) : reduce(byte, sharedMemByte)
+            {
+            }
 
-        Type localResult = reduce(func, src, n);
-        Type globalResult;
+            /* Activate participation for the reduce algorithm.
+             * Must be called from every mpi process. This function uses global blocking mpi calls.
+             * Don't create an instance before you have set your cuda device!
+             * @param isActive true if the mpi rank should be part of the reduce operation, else false
+             */
+            void participate(bool isActive)
+            {
+                mpi_reduce.participate(isActive);
+            }
 
-        mpi_reduce(func, &globalResult, &localResult, 1);
-        return globalResult;
-    }
-private:
-    ::pmacc::nvidia::reduce::Reduce reduce;
-    ::pmacc::mpi::MPIReduce mpi_reduce;
-};
-}
-}
+            /* Reduce elements in global gpu memory
+             *
+             * @param func functor for reduce which takes two arguments, first argument is the source and get the new
+             * reduced value. Functor must specialize the function getMPI_Op.
+             * @param src a class or a pointer where the reduce algorithm can access the value by operator [] (one
+             * dimension access)
+             * @param n number of elements to reduce
+             *
+             * @return reduced value (same on every mpi instance)
+             */
+            template<class Functor, typename Src>
+            typename traits::GetValueType<Src>::ValueType operator()(Functor func, Src src, uint32_t n)
+            {
+                typedef typename traits::GetValueType<Src>::ValueType Type;
 
+                Type localResult = reduce(func, src, n);
+                Type globalResult;
 
+                mpi_reduce(func, &globalResult, &localResult, 1);
+                return globalResult;
+            }
 
+        private:
+            ::pmacc::nvidia::reduce::Reduce reduce;
+            ::pmacc::mpi::MPIReduce mpi_reduce;
+        };
+    } // namespace algorithms
+} // namespace pmacc
diff --git a/include/pmacc/algorithms/PromoteType.hpp b/include/pmacc/algorithms/PromoteType.hpp
index a54f61920d..ddd7284b6f 100644
--- a/include/pmacc/algorithms/PromoteType.hpp
+++ b/include/pmacc/algorithms/PromoteType.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Rene Widera
+/* Copyright 2013-2021 Axel Huebl, Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -23,24 +23,25 @@
 
 namespace pmacc
 {
-namespace algorithms
-{
-namespace promoteType
-{
-
-    // general: use first type
-    template<class T1, class T2>
-    struct promoteType {
-        typedef T1 type;
-    };
+    namespace algorithms
+    {
+        namespace promoteType
+        {
+            // general: use first type
+            template<class T1, class T2>
+            struct promoteType
+            {
+                typedef T1 type;
+            };
 
-    // special: promote float to double
-    template< >
-    struct promoteType<float, double> {
-        typedef double type;
-    };
+            // special: promote float to double
+            template<>
+            struct promoteType<float, double>
+            {
+                typedef double type;
+            };
 
 
-} //namespace promoteType
-} //namespace algorithms
-} //namespace pmacc
+        } // namespace promoteType
+    } // namespace algorithms
+} // namespace pmacc
diff --git a/include/pmacc/algorithms/TypeCast.hpp b/include/pmacc/algorithms/TypeCast.hpp
index 5d7975b72c..6b6ebea9fd 100644
--- a/include/pmacc/algorithms/TypeCast.hpp
+++ b/include/pmacc/algorithms/TypeCast.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera
+/* Copyright 2013-2021 Heiko Burau, Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -23,29 +23,28 @@
 
 namespace pmacc
 {
-namespace algorithms
-{
-namespace precisionCast
-{
-
-template<typename CastToType, typename Type>
-struct TypeCast
-{
-    typedef CastToType result;
-
-    HDINLINE result operator()(const Type& value) const
+    namespace algorithms
     {
-        return static_cast<result>(value);
-    }
-};
+        namespace precisionCast
+        {
+            template<typename CastToType, typename Type>
+            struct TypeCast
+            {
+                typedef CastToType result;
 
+                HDINLINE result operator()(const Type& value) const
+                {
+                    return static_cast<result>(value);
+                }
+            };
 
-template<typename CastToType, typename Type>
-HDINLINE typename TypeCast<CastToType, Type>::result precisionCast(const Type& value)
-{
-    return TypeCast<CastToType, Type > ()(value);
-}
 
-} //namespace precisionCast
-} //namespace algorithms
-}//namespace pmacc
+            template<typename CastToType, typename Type>
+            HDINLINE typename TypeCast<CastToType, Type>::result precisionCast(const Type& value)
+            {
+                return TypeCast<CastToType, Type>()(value);
+            }
+
+        } // namespace precisionCast
+    } // namespace algorithms
+} // namespace pmacc
diff --git a/include/pmacc/algorithms/math.hpp b/include/pmacc/algorithms/math.hpp
index 938dad66d3..c2c9304283 100644
--- a/include/pmacc/algorithms/math.hpp
+++ b/include/pmacc/algorithms/math.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Rene Widera, Benjamin Worpitz, Alexander Debus
+/* Copyright 2013-2021 Rene Widera, Benjamin Worpitz, Alexander Debus
  *
  * This file is part of PMacc.
  *
@@ -24,40 +24,28 @@
 #include "pmacc/types.hpp"
 
 #include "pmacc/algorithms/math/defines/abs.hpp"
-#include "pmacc/algorithms/math/defines/sqrt.hpp"
 #include "pmacc/algorithms/math/defines/exp.hpp"
-#include "pmacc/algorithms/math/defines/erf.hpp"
 #include "pmacc/algorithms/math/defines/trigo.hpp"
 #include "pmacc/algorithms/math/defines/cross.hpp"
 #include "pmacc/algorithms/math/defines/dot.hpp"
 #include "pmacc/algorithms/math/defines/comparison.hpp"
 #include "pmacc/algorithms/math/defines/floatingPoint.hpp"
-#include "pmacc/algorithms/math/defines/pow.hpp"
 #include "pmacc/algorithms/math/defines/modf.hpp"
-#include "pmacc/algorithms/math/defines/fmod.hpp"
 #include "pmacc/algorithms/math/defines/bessel.hpp"
 #include "pmacc/algorithms/math/defines/pi.hpp"
 
 #include "pmacc/algorithms/math/floatMath/abs.tpp"
-#include "pmacc/algorithms/math/floatMath/sqrt.tpp"
 #include "pmacc/algorithms/math/floatMath/exp.tpp"
-#include "pmacc/algorithms/math/floatMath/erf.tpp"
 #include "pmacc/algorithms/math/floatMath/trigo.tpp"
 #include "pmacc/algorithms/math/floatMath/comparison.tpp"
 #include "pmacc/algorithms/math/floatMath/floatingPoint.tpp"
-#include "pmacc/algorithms/math/floatMath/pow.tpp"
 #include "pmacc/algorithms/math/floatMath/modf.tpp"
-#include "pmacc/algorithms/math/floatMath/fmod.tpp"
 #include "pmacc/algorithms/math/floatMath/bessel.tpp"
 
 #include "pmacc/algorithms/math/doubleMath/abs.tpp"
-#include "pmacc/algorithms/math/doubleMath/sqrt.tpp"
 #include "pmacc/algorithms/math/doubleMath/exp.tpp"
-#include "pmacc/algorithms/math/doubleMath/erf.tpp"
 #include "pmacc/algorithms/math/doubleMath/trigo.tpp"
 #include "pmacc/algorithms/math/doubleMath/comparison.tpp"
 #include "pmacc/algorithms/math/doubleMath/floatingPoint.tpp"
-#include "pmacc/algorithms/math/doubleMath/pow.tpp"
 #include "pmacc/algorithms/math/doubleMath/modf.tpp"
-#include "pmacc/algorithms/math/doubleMath/fmod.tpp"
 #include "pmacc/algorithms/math/doubleMath/bessel.tpp"
diff --git a/include/pmacc/algorithms/math/defines/abs.hpp b/include/pmacc/algorithms/math/defines/abs.hpp
index 264d506156..98c8440480 100644
--- a/include/pmacc/algorithms/math/defines/abs.hpp
+++ b/include/pmacc/algorithms/math/defines/abs.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera
+/* Copyright 2013-2021 Heiko Burau, Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -23,31 +23,16 @@
 
 namespace pmacc
 {
-namespace algorithms
-{
-
-namespace math
-{
-
-template<typename Type>
-struct Abs;
-
-template<typename Type>
-struct Abs2;
-
-
-template<typename T1>
-HDINLINE typename Abs< T1>::result abs(T1 value)
-{
-    return Abs< T1 > ()(value);
-}
-
-template<typename T1>
-HDINLINE typename Abs2< T1 >::result abs2(const T1& value)
-{
-    return Abs2< T1 > ()(value);
-}
-
-} //namespace math
-} //namespace algorithms
-}//namespace pmacc
+    namespace math
+    {
+        template<typename Type>
+        struct Abs2;
+
+        template<typename T1>
+        HDINLINE typename Abs2<T1>::result abs2(const T1& value)
+        {
+            return Abs2<T1>()(value);
+        }
+
+    } // namespace math
+} // namespace pmacc
diff --git a/include/pmacc/algorithms/math/defines/bessel.hpp b/include/pmacc/algorithms/math/defines/bessel.hpp
index 4a11f258ad..31f8d05ec0 100644
--- a/include/pmacc/algorithms/math/defines/bessel.hpp
+++ b/include/pmacc/algorithms/math/defines/bessel.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2016-2020 Alexander Debus
+/* Copyright 2016-2021 Alexander Debus
  *
  * This file is part of PMacc.
  *
@@ -23,191 +23,147 @@
 
 namespace pmacc
 {
-namespace algorithms
-{
-namespace math
-{
-namespace bessel
-{
-
-    /** Modified cylindrical Bessel function of first kind of order 1
-     */
-    template< typename T_Type >
-    struct I1;
-
-    /** Modified cylindrical Bessel function of order 1
-     *
-     * @param x input value
-     * @return float value
-     */
-    template<typename T_Type >
-    HDINLINE typename I1<T_Type>::result i1( T_Type const & x )
-    {
-        return I1< T_Type >( )( x );
-    }
-
-    /** Modified cylindrical Bessel function of first kind of order 0.
-     */
-    template< typename T_Type >
-    struct I0;
-
-    /** Modified cylindrical Bessel function of first kind of order 0.
-     *
-     * @param x input argument
-     * @return float value
-     */
-    template< typename T_Type >
-    HDINLINE typename I0< T_Type >::result i0( T_Type const & x )
-    {
-        return I0< T_Type >( )( x );
-    }
-
-    /** Bessel function of first kind of order 0
-     */
-    template< typename T_Type >
-    struct J0;
-
-    /** Bessel function of first kind of order 0
-     *
-     * @param x input argument
-     * @return float value
-     */
-    template< typename T_Type >
-    HDINLINE typename J0< T_Type >::result j0( T_Type const & x )
-    {
-        return J0< T_Type >( )( x );
-    }
-
-    /** Bessel function of first kind of order 1
-     */
-    template< typename T_Type >
-    struct J1;
-
-    /** Bessel function of first kind of order 1
-     *
-     * @param x input value
-     * @return float value
-     */
-    template< typename T_Type >
-    HDINLINE typename J1< T_Type >::result j1( T_Type const & x )
-    {
-        return J1< T_Type >( )( x );
-    }
-
-    /** Bessel function of first kind of order n
-     */
-    template<
-        typename T_IntType,
-        typename T_FloatType
-    >
-    struct Jn;
-
-    /** Bessel function of first kind of order n
-     *
-     * Calculate the value of the Bessel function
-     * of first kind of order n for the input argument.
-     *
-     * @param n nth order
-     * @param x input argument
-     * @return float value
-     */
-    template<
-        typename T_IntType,
-        typename T_FloatType
-    >
-    HDINLINE
-    typename Jn<
-        T_IntType,
-        T_FloatType
-    >::result
-    jn(
-        T_IntType const & n,
-        T_FloatType const & x
-    )
-    {
-        return Jn<
-            T_IntType,
-            T_FloatType
-        >( )(
-            n,
-            x
-        );
-    }
-
-    /** Bessel function of second kind of order 0
-     *
-     */
-    template< typename T_Type >
-    struct Y0;
-
-    /**Bessel function of second kind of order 0
-     *
-     * @param x input argument
-     * @return float value
-     */
-    template< typename T_Type >
-    HDINLINE typename Y0< T_Type >::result y0( T_Type const & x )
-    {
-        return Y0< T_Type >( )( x );
-    }
-
-    /* Bessel function of second kind of order 1.
-     */
-    template< typename T_Type >
-    struct Y1;
-
-    /** Bessel function of second kind of order 1
-     *
-     * @param x input argument
-     * @return float value
-     */
-    template< typename T_Type >
-    HDINLINE typename Y1< T_Type >::result y1( T_Type const & x )
-    {
-        return Y1< T_Type >( )( x );
-    }
-
-    /** Bessel function of second kind of order n.
-     */
-    template<
-        typename T_IntType,
-        typename T_FloatType
-    >
-    struct Yn;
-
-    /** Bessel function of second kind of order n
-     *
-     * Calculate the value of the Bessel function
-     * of second kind of order n for the input argument.
-     *
-     * @param n nth order
-     * @param x input argument
-     * @return float value
-     */
-    template<
-        typename T_IntType,
-        typename T_FloatType
-    >
-    HDINLINE
-    typename Yn<
-        T_IntType,
-        T_FloatType
-    >::result
-    yn(
-        T_IntType const & n,
-        T_FloatType const & x
-    )
+    namespace math
     {
-        return Yn<
-            T_IntType,
-            T_FloatType
-        >( )(
-          n,
-          x
-        );
-    }
-
-} //namespace bessel
-} //namespace math
-} //namespace algorithms
-} //namespace pmacc
+        namespace bessel
+        {
+            /** Modified cylindrical Bessel function of first kind of order 1
+             */
+            template<typename T_Type>
+            struct I1;
+
+            /** Modified cylindrical Bessel function of first kind of order 1
+             *
+             * @param x input value
+             * @return float value
+             */
+            template<typename T_Type>
+            HDINLINE typename I1<T_Type>::result i1(T_Type const& x)
+            {
+                return I1<T_Type>()(x);
+            }
+
+            /** Modified cylindrical Bessel function of first kind of order 0.
+             */
+            template<typename T_Type>
+            struct I0;
+
+            /** Modified cylindrical Bessel function of first kind of order 0.
+             *
+             * @param x input argument
+             * @return float value
+             */
+            template<typename T_Type>
+            HDINLINE typename I0<T_Type>::result i0(T_Type const& x)
+            {
+                return I0<T_Type>()(x);
+            }
+
+            /** Bessel function of first kind of order 0
+             */
+            template<typename T_Type>
+            struct J0;
+
+            /** Bessel function of first kind of order 0
+             *
+             * @param x input argument
+             * @return float value
+             */
+            template<typename T_Type>
+            HDINLINE typename J0<T_Type>::result j0(T_Type const& x)
+            {
+                return J0<T_Type>()(x);
+            }
+
+            /** Bessel function of first kind of order 1
+             */
+            template<typename T_Type>
+            struct J1;
+
+            /** Bessel function of first kind of order 1
+             *
+             * @param x input value
+             * @return float value
+             */
+            template<typename T_Type>
+            HDINLINE typename J1<T_Type>::result j1(T_Type const& x)
+            {
+                return J1<T_Type>()(x);
+            }
+
+            /** Bessel function of first kind of order n
+             */
+            template<typename T_IntType, typename T_FloatType>
+            struct Jn;
+
+            /** Bessel function of first kind of order n
+             *
+             * Calculate the value of the Bessel function
+             * of first kind of order n for the input argument.
+             *
+             * @param n nth order
+             * @param x input argument
+             * @return float value
+             */
+            template<typename T_IntType, typename T_FloatType>
+            HDINLINE typename Jn<T_IntType, T_FloatType>::result jn(T_IntType const& n, T_FloatType const& x)
+            {
+                return Jn<T_IntType, T_FloatType>()(n, x);
+            }
+
+            /** Bessel function of second kind of order 0
+             *
+             */
+            template<typename T_Type>
+            struct Y0;
+
+            /** Bessel function of second kind of order 0
+             *
+             * @param x input argument
+             * @return float value
+             */
+            template<typename T_Type>
+            HDINLINE typename Y0<T_Type>::result y0(T_Type const& x)
+            {
+                return Y0<T_Type>()(x);
+            }
+
+            /** Bessel function of second kind of order 1.
+             */
+            template<typename T_Type>
+            struct Y1;
+
+            /** Bessel function of second kind of order 1
+             *
+             * @param x input argument
+             * @return float value
+             */
+            template<typename T_Type>
+            HDINLINE typename Y1<T_Type>::result y1(T_Type const& x)
+            {
+                return Y1<T_Type>()(x);
+            }
+
+            /** Bessel function of second kind of order n.
+             */
+            template<typename T_IntType, typename T_FloatType>
+            struct Yn;
+
+            /** Bessel function of second kind of order n
+             *
+             * Calculate the value of the Bessel function
+             * of second kind of order n for the input argument.
+             *
+             * @param n nth order
+             * @param x input argument
+             * @return float value
+             */
+            template<typename T_IntType, typename T_FloatType>
+            HDINLINE typename Yn<T_IntType, T_FloatType>::result yn(T_IntType const& n, T_FloatType const& x)
+            {
+                return Yn<T_IntType, T_FloatType>()(n, x);
+            }
+
+        } // namespace bessel
+    } // namespace math
+} // namespace pmacc
diff --git a/include/pmacc/algorithms/math/defines/comparison.hpp b/include/pmacc/algorithms/math/defines/comparison.hpp
index 2516a83cac..0f3986cf36 100644
--- a/include/pmacc/algorithms/math/defines/comparison.hpp
+++ b/include/pmacc/algorithms/math/defines/comparison.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera
+/* Copyright 2013-2021 Heiko Burau, Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -23,52 +23,47 @@
 
 namespace pmacc
 {
-namespace algorithms
-{
+    namespace math
+    {
+        template<typename T1, typename T2>
+        struct Max;
 
-namespace math
-{
+        template<typename T1, typename T2>
+        struct Min;
 
-template<typename T1,typename T2>
-struct Max;
+        template<typename T>
+        struct Max<T, T>
+        {
+            typedef T result;
 
-template<typename T1,typename T2>
-struct Min;
+            HDINLINE T operator()(T value1, T value2)
+            {
+                return value1 > value2 ? value1 : value2;
+            }
+        };
 
-template<typename T>
-struct Max<T,T>
-{
-    typedef T result;
+        template<typename T>
+        struct Min<T, T>
+        {
+            typedef T result;
 
-    HDINLINE T operator()(T value1, T value2)
-    {
-        return value1 > value2 ? value1 : value2;
-    }
-};
+            HDINLINE T operator()(T value1, T value2)
+            {
+                return value1 < value2 ? value1 : value2;
+            }
+        };
 
-template<typename T>
-struct Min<T,T>
-{
-    typedef T result;
+        template<typename T1, typename T2>
+        HDINLINE typename Min<T1, T2>::result min(const T1& value1, const T2& value2)
+        {
+            return Min<T1, T2>()(value1, value2);
+        }
 
-    HDINLINE T operator()(T value1, T value2)
-    {
-        return value1 < value2 ? value1 : value2;
-    }
-};
-
-template<typename T1,typename T2>
-HDINLINE typename Min< T1,T2>::result min(const T1& value1,const T2& value2)
-{
-    return Min< T1,T2 > ()(value1,value2);
-}
-
-template<typename T1,typename T2>
-HDINLINE typename Max< T1,T2 >::result max(const T1& value1,const T2& value2)
-{
-    return Max< T1,T2 > ()(value1,value2);
-}
+        template<typename T1, typename T2>
+        HDINLINE typename Max<T1, T2>::result max(const T1& value1, const T2& value2)
+        {
+            return Max<T1, T2>()(value1, value2);
+        }
 
-} //namespace math
-} //namespace algorithms
-}//namespace pmacc
+    } // namespace math
+} // namespace pmacc
diff --git a/include/pmacc/algorithms/math/defines/cross.hpp b/include/pmacc/algorithms/math/defines/cross.hpp
index 9b0d8e313c..06545acbd6 100644
--- a/include/pmacc/algorithms/math/defines/cross.hpp
+++ b/include/pmacc/algorithms/math/defines/cross.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera
+/* Copyright 2013-2021 Heiko Burau, Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -24,21 +24,16 @@
 
 namespace pmacc
 {
-namespace algorithms
-{
-
-namespace math
-{
+    namespace math
+    {
+        template<typename Type1, typename Type2>
+        struct Cross;
 
-template<typename Type1, typename Type2>
-struct Cross;
 
-
-template<typename T1, typename T2>
-HDINLINE typename Cross< T1, T2 >::result cross(const T1& value, const T2& value2)
-{
-    return Cross< T1, T2 > ()(value, value2);
-}
-} //namespace math
-} //namespace algorithms
-}//namespace pmacc
+        template<typename T1, typename T2>
+        HDINLINE typename Cross<T1, T2>::result cross(const T1& value, const T2& value2)
+        {
+            return Cross<T1, T2>()(value, value2);
+        }
+    } // namespace math
+} // namespace pmacc
diff --git a/include/pmacc/algorithms/math/defines/dot.hpp b/include/pmacc/algorithms/math/defines/dot.hpp
index e0c7a37b96..b03c66205d 100644
--- a/include/pmacc/algorithms/math/defines/dot.hpp
+++ b/include/pmacc/algorithms/math/defines/dot.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera
+/* Copyright 2013-2021 Heiko Burau, Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -23,20 +23,16 @@
 
 namespace pmacc
 {
-namespace algorithms
-{
-namespace math
-{
+    namespace math
+    {
+        template<typename Type1, typename Type2>
+        struct Dot;
 
-template<typename Type1, typename Type2>
-struct Dot;
 
-
-template<typename T1, typename T2>
-HDINLINE typename Dot< T1, T2 >::result dot(const T1& value, const T2& value2)
-{
-    return Dot< T1, T2 > ()(value, value2);
-}
-} //namespace math
-} //namespace algorithms
-}//namespace pmacc
+        template<typename T1, typename T2>
+        HDINLINE typename Dot<T1, T2>::result dot(const T1& value, const T2& value2)
+        {
+            return Dot<T1, T2>()(value, value2);
+        }
+    } // namespace math
+} // namespace pmacc
diff --git a/include/pmacc/algorithms/math/defines/erf.hpp b/include/pmacc/algorithms/math/defines/erf.hpp
deleted file mode 100644
index 09709a642d..0000000000
--- a/include/pmacc/algorithms/math/defines/erf.hpp
+++ /dev/null
@@ -1,42 +0,0 @@
-/* Copyright 2014-2020 Axel Huebl
- *
- * This file is part of PMacc.
- *
- * PMacc is free software: you can redistribute it and/or modify
- * it under the terms of either the GNU General Public License or
- * the GNU Lesser General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * PMacc is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License and the GNU Lesser General Public License
- * for more details.
- *
- * You should have received a copy of the GNU General Public License
- * and the GNU Lesser General Public License along with PMacc.
- * If not, see <http://www.gnu.org/licenses/>.
- */
-
-#pragma once
-
-namespace pmacc
-{
-namespace algorithms
-{
-namespace math
-{
-
-template<typename Type>
-struct Erf;
-
-template<typename T1>
-HDINLINE static typename Erf< T1 >::result erf(const T1& value)
-{
-    return Erf< T1 > ()(value);
-}
-
-} /* namespace math */
-} /* namespace algorithms */
-} /* namespace pmacc */
diff --git a/include/pmacc/algorithms/math/defines/exp.hpp b/include/pmacc/algorithms/math/defines/exp.hpp
index 39db92761a..b793a6ff35 100644
--- a/include/pmacc/algorithms/math/defines/exp.hpp
+++ b/include/pmacc/algorithms/math/defines/exp.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera, Richard Pausch
+/* Copyright 2013-2021 Heiko Burau, Rene Widera, Richard Pausch
  *
  * This file is part of PMacc.
  *
@@ -24,39 +24,16 @@
 
 namespace pmacc
 {
-namespace algorithms
-{
-namespace math
-{
-
-
-template<typename Type>
-struct Exp;
-
-template<typename Type>
-struct Log;
-
-template<typename Type>
-struct Log10;
-
-template<typename T1>
-HDINLINE typename Exp< T1 >::result exp(const T1& value)
-{
-    return Exp< T1 > ()(value);
-}
-
-template<typename T1>
-HDINLINE typename Log< T1 >::result log(const T1& value)
-{
-    return Log< T1 > ()(value);
-}
-
-template<typename T1>
-HDINLINE typename Log10< T1 >::result log10(const T1& value)
-{
-    return Log10< T1 > ()(value);
-}
-
-} //namespace math
-} //namespace algorithms
-}//namespace pmacc
+    namespace math
+    {
+        template<typename Type>
+        struct Log10;
+
+        template<typename T1>
+        HDINLINE typename Log10<T1>::result log10(const T1& value)
+        {
+            return Log10<T1>()(value);
+        }
+
+    } // namespace math
+} // namespace pmacc
diff --git a/include/pmacc/algorithms/math/defines/floatingPoint.hpp b/include/pmacc/algorithms/math/defines/floatingPoint.hpp
index b78e674fa9..2d1e462b59 100644
--- a/include/pmacc/algorithms/math/defines/floatingPoint.hpp
+++ b/include/pmacc/algorithms/math/defines/floatingPoint.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera, Alexander Grund
+/* Copyright 2013-2021 Heiko Burau, Rene Widera, Alexander Grund
  *
  * This file is part of PMacc.
  *
@@ -23,79 +23,50 @@
 
 namespace pmacc
 {
-namespace algorithms
-{
-
-namespace math
-{
-
-template<typename Type>
-struct Floor;
-
-template<typename Type>
-struct Ceil;
+    namespace math
+    {
+        template<typename Type>
+        struct Float2int_ru;
 
-template<typename Type>
-struct Float2int_ru;
-
-template<typename Type>
-struct Float2int_rd;
-
-template<typename Type>
-struct Float2int_rn;
-
-/**
- * Rounds down (towards -inf)
- */
-template<typename T1>
-HDINLINE typename Floor< T1>::result floor(T1 value)
-{
-    return Floor< T1 > ()(value);
-}
+        template<typename Type>
+        struct Float2int_rd;
 
-/**
- * Rounds up (towards +inf)
- */
-template<typename T1>
-HDINLINE typename Ceil< T1>::result ceil(T1 value)
-{
-    return Ceil< T1 > ()(value);
-}
+        template<typename Type>
+        struct Float2int_rn;
 
-/**
- * Returns the smallest int value that is at least as big as value
- * Note: Using values outside the range of an int is undefined
- * @return integer value
- */
-template<typename T1>
-HDINLINE typename Float2int_ru< T1>::result float2int_ru(T1 value)
-{
-    return Float2int_ru< T1 > ()(value);
-}
+        /**
+         * Returns the smallest int value that is at least as big as value
+         * Note: Using values outside the range of an int is undefined
+         * @return integer value
+         */
+        template<typename T1>
+        HDINLINE typename Float2int_ru<T1>::result float2int_ru(T1 value)
+        {
+            return Float2int_ru<T1>()(value);
+        }
 
-/**
- * Returns the largest int value that is not greater than value
- * Note: Using values outside the range of an int is undefined
- * @return integer value
- */
-template<typename T1>
-HDINLINE typename Float2int_rd< T1>::result float2int_rd(T1 value)
-{
-    return Float2int_rd< T1 > ()(value);
-}
+        /**
+         * Returns the largest int value that is not greater than value
+         * Note: Using values outside the range of an int is undefined
+         * @return integer value
+         */
+        template<typename T1>
+        HDINLINE typename Float2int_rd<T1>::result float2int_rd(T1 value)
+        {
+            return Float2int_rd<T1>()(value);
+        }
 
-/**
- * Rounds towards the nearest value returning an int
- * For the case of x.5 the even value is chosen from the 2 possible values
- * Note: Using values outside the range of an int is undefined
- * @return integer value
- */
-template<typename T1>
-HDINLINE typename Float2int_rn< T1>::result float2int_rn(T1 value)
-{
-    return Float2int_rn< T1 > ()(value);
-}
+        /**
+         * Rounds towards the nearest value returning an int
+         * For the case of x.5 the even value is chosen from the 2 possible values
+         * Note: Using values outside the range of an int is undefined
+         * @return integer value
+         */
+        template<typename T1>
+        HDINLINE typename Float2int_rn<T1>::result float2int_rn(T1 value)
+        {
+            return Float2int_rn<T1>()(value);
+        }
 
-} //namespace math
-} //namespace algorithms
-}//namespace pmacc
+    } // namespace math
+} // namespace pmacc
diff --git a/include/pmacc/algorithms/math/defines/fmod.hpp b/include/pmacc/algorithms/math/defines/fmod.hpp
deleted file mode 100644
index 604824d9a1..0000000000
--- a/include/pmacc/algorithms/math/defines/fmod.hpp
+++ /dev/null
@@ -1,51 +0,0 @@
-/* Copyright 2016-2020 Alexander Debus
- *
- * This file is part of PMacc.
- *
- * PMacc is free software: you can redistribute it and/or modify
- * it under the terms of either the GNU General Public License or
- * the GNU Lesser General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * PMacc is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License and the GNU Lesser General Public License
- * for more details.
- *
- * You should have received a copy of the GNU General Public License
- * and the GNU Lesser General Public License along with PMacc.
- * If not, see <http://www.gnu.org/licenses/>.
- */
-
-#pragma once
-
-namespace pmacc
-{
-namespace algorithms
-{
-namespace math
-{
-
-template<typename Type>
-struct Fmod;
-
-/**
- * Equivalent to the modulus-operator for float types
- * returns the floating-point remainder of x / y.
- * The functionality corresponds to the C++
- * math function fmod().
- * For details, see http://www.cplusplus.com/reference/cmath/fmod/ .
- * @return float value
- */
-template<typename T1>
-HDINLINE typename Fmod< T1>::result fmod(T1 x, T1 y)
-{
-    return Fmod< T1 > ()(x, y);
-}
-
-} //namespace math
-} //namespace algorithms
-} //namespace pmacc
-
diff --git a/include/pmacc/algorithms/math/defines/modf.hpp b/include/pmacc/algorithms/math/defines/modf.hpp
index 9eac4f19be..f1a8a1c1df 100644
--- a/include/pmacc/algorithms/math/defines/modf.hpp
+++ b/include/pmacc/algorithms/math/defines/modf.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2015-2020 Heiko Burau
+/* Copyright 2015-2021 Heiko Burau
  *
  * This file is part of PMacc.
  *
@@ -23,21 +23,16 @@
 
 namespace pmacc
 {
-namespace algorithms
-{
-
-namespace math
-{
+    namespace math
+    {
+        template<typename Type>
+        struct Modf;
 
-template<typename Type>
-struct Modf;
-
-template<typename T>
-HDINLINE typename Modf<T>::result modf(T value, T* intpart)
-{
-    return Modf<T>()(value, intpart);
-}
+        template<typename T>
+        HDINLINE typename Modf<T>::result modf(T value, T* intpart)
+        {
+            return Modf<T>()(value, intpart);
+        }
 
-} //namespace math
-} //namespace algorithms
-}//namespace pmacc
+    } // namespace math
+} // namespace pmacc
diff --git a/include/pmacc/algorithms/math/defines/pi.hpp b/include/pmacc/algorithms/math/defines/pi.hpp
index 72f4636698..20479eaa39 100644
--- a/include/pmacc/algorithms/math/defines/pi.hpp
+++ b/include/pmacc/algorithms/math/defines/pi.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2018-2020 Sergei Bastrakov
+/* Copyright 2018-2021 Sergei Bastrakov
  *
  * This file is part of PMacc.
  *
@@ -24,25 +24,19 @@
 
 namespace pmacc
 {
-namespace algorithms
-{
-namespace math
-{
-
-    /** Values of pi and related constants as T_Type
-     */
-    template< typename T_Type >
-    struct Pi
+    namespace math
     {
-        static constexpr T_Type value = static_cast< T_Type >(
-            3.141592653589793238462643383279502884197169399
-        );
-        static constexpr T_Type doubleValue = static_cast< T_Type >( 2.0 ) * value;
-        static constexpr T_Type halfValue = value / static_cast< T_Type >( 2.0 );
-        static constexpr T_Type quarterValue = value / static_cast< T_Type >( 4.0 );
-        static constexpr T_Type doubleReciprocalValue = static_cast< T_Type >( 2.0 ) / value;
-    };
+        /** Values of pi and related constants as T_Type
+         */
+        template<typename T_Type>
+        struct Pi
+        {
+            static constexpr T_Type value = static_cast<T_Type>(3.141592653589793238462643383279502884197169399);
+            static constexpr T_Type doubleValue = static_cast<T_Type>(2.0) * value;
+            static constexpr T_Type halfValue = value / static_cast<T_Type>(2.0);
+            static constexpr T_Type quarterValue = value / static_cast<T_Type>(4.0);
+            static constexpr T_Type doubleReciprocalValue = static_cast<T_Type>(2.0) / value;
+        };
 
-} // namespace math
-} // namespace algorithms
+    } // namespace math
 } // namespace pmacc
diff --git a/include/pmacc/algorithms/math/defines/pow.hpp b/include/pmacc/algorithms/math/defines/pow.hpp
deleted file mode 100644
index 0a26c66f72..0000000000
--- a/include/pmacc/algorithms/math/defines/pow.hpp
+++ /dev/null
@@ -1,50 +0,0 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera
- *
- * This file is part of PMacc.
- *
- * PMacc is free software: you can redistribute it and/or modify
- * it under the terms of either the GNU General Public License or
- * the GNU Lesser General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * PMacc is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License and the GNU Lesser General Public License
- * for more details.
- *
- * You should have received a copy of the GNU General Public License
- * and the GNU Lesser General Public License along with PMacc.
- * If not, see <http://www.gnu.org/licenses/>.
- */
-
-#pragma once
-
-namespace pmacc
-{
-namespace algorithms
-{
-
-namespace math
-{
-
-template<typename T1, typename T2>
-struct Pow;
-
-
-/** Raised the base to the power exponent
- *
- * @param base base value
- * @param exponent power exponent
- * @return base rased to the power exponent
- */
-template<typename T1, typename T2>
-HDINLINE typename Pow< T1, T2 >::result pow(const T1& base,const T2& exponent)
-{
-    return Pow< T1, T2 > ()(base, exponent);
-}
-
-} //namespace math
-} //namespace algorithms
-}//namespace pmacc
diff --git a/include/pmacc/algorithms/math/defines/sqrt.hpp b/include/pmacc/algorithms/math/defines/sqrt.hpp
deleted file mode 100644
index b380760d4d..0000000000
--- a/include/pmacc/algorithms/math/defines/sqrt.hpp
+++ /dev/null
@@ -1,53 +0,0 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera
- *
- * This file is part of PMacc.
- *
- * PMacc is free software: you can redistribute it and/or modify
- * it under the terms of either the GNU General Public License or
- * the GNU Lesser General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * PMacc is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License and the GNU Lesser General Public License
- * for more details.
- *
- * You should have received a copy of the GNU General Public License
- * and the GNU Lesser General Public License along with PMacc.
- * If not, see <http://www.gnu.org/licenses/>.
- */
-
-#pragma once
-
-namespace pmacc
-{
-namespace algorithms
-{
-namespace math
-{
-
-template<typename Type>
-struct Sqrt;
-
-template<typename Type>
-struct RSqrt;
-
-
-template<typename T1>
-HDINLINE typename Sqrt< T1 >::result sqrt(const T1& value)
-{
-    return Sqrt< T1 > ()(value);
-}
-
-template<typename T1>
-HDINLINE typename RSqrt< T1 >::result rsqrt(const T1& value)
-{
-    return RSqrt< T1 > ()(value);
-}
-
-} //namespace math
-} //namespace algorithms
-}//namespace pmacc
-
diff --git a/include/pmacc/algorithms/math/defines/trigo.hpp b/include/pmacc/algorithms/math/defines/trigo.hpp
index 41aadd105f..afb2ad4ed5 100644
--- a/include/pmacc/algorithms/math/defines/trigo.hpp
+++ b/include/pmacc/algorithms/math/defines/trigo.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera, Richard Pausch, Axel Huebl, Alexander Debus
+/* Copyright 2013-2021 Heiko Burau, Rene Widera, Richard Pausch, Axel Huebl, Alexander Debus
  *
  * This file is part of PMacc.
  *
@@ -23,112 +23,29 @@
 
 namespace pmacc
 {
-namespace algorithms
-{
-
-namespace math
-{
-
-template<typename Type>
-struct Sin;
-
-template<typename Type>
-struct ASin;
-
-template<typename Type>
-struct Cos;
-
-template<typename Type>
-struct ACos;
+    namespace math
+    {
+        template<typename ArgType, typename SinType, typename CosType>
+        struct SinCos;
 
-template<typename Type>
-struct Tan;
+        template<typename Type>
+        struct Sinc;
 
-template<typename Type>
-struct ATan;
 
-template<typename Type>
-struct Atan2;
-
-template<typename ArgType, typename SinType, typename CosType>
-struct SinCos;
-
-template<typename Type>
-struct Sinc;
-
-
-template<typename T1>
-HDINLINE
-typename Sin< T1 >::result
-sin(const T1& value)
-{
-    return Sin< T1 > ()(value);
-}
-
-template<typename T1>
-HDINLINE
-typename ASin< T1 >::result
-asin(const T1& value)
-{
-    return ASin< T1 > ()(value);
-}
+        template<typename ArgType, typename SinType, typename CosType>
+        HDINLINE typename SinCos<ArgType, SinType, CosType>::result sincos(
+            ArgType arg,
+            SinType& sinValue,
+            CosType& cosValue)
+        {
+            return SinCos<ArgType, SinType, CosType>()(arg, sinValue, cosValue);
+        }
 
-template<typename T1>
-HDINLINE
-typename Cos<T1>::result
-cos(const T1& value)
-{
-    return Cos< T1 > ()(value);
-}
-
-template<typename T1>
-HDINLINE
-typename ACos<T1>::result
-acos(const T1& value)
-{
-    return ACos< T1 > ()(value);
-}
-
-template<typename T1>
-HDINLINE
-typename Tan<T1>::result
-tan(const T1& value)
-{
-    return Tan< T1 > ()(value);
-}
-
-template<typename T1>
-HDINLINE
-typename ATan<T1>::result
-atan(const T1& value)
-{
-    return ATan< T1 > ()(value);
-}
-
-template<typename ArgType, typename SinType, typename CosType>
-HDINLINE
-typename SinCos< ArgType, SinType, CosType >::result
-sincos(ArgType arg, SinType& sinValue, CosType& cosValue)
-{
-    return SinCos< ArgType, SinType, CosType > ()(arg, sinValue, cosValue);
-}
-
-template<typename T1>
-HDINLINE
-typename Sinc<T1>::result
-sinc(const T1& value)
-{
-    return Sinc< T1 > ()(value);
-}
-
-template<typename T1>
-HDINLINE
-typename Atan2<T1>::result
-atan2(const T1& val1, const T1& val2)
-{
-    return Atan2< T1 > ()(val1, val2);
-}
+        template<typename T1>
+        HDINLINE typename Sinc<T1>::result sinc(const T1& value)
+        {
+            return Sinc<T1>()(value);
+        }
 
-} /* namespace math */
-} /* namespace algorithms */
+    } /* namespace math */
 } /* namespace pmacc */
diff --git a/include/pmacc/algorithms/math/doubleMath/abs.tpp b/include/pmacc/algorithms/math/doubleMath/abs.tpp
index ea9c2d72b3..429ae0e70e 100644
--- a/include/pmacc/algorithms/math/doubleMath/abs.tpp
+++ b/include/pmacc/algorithms/math/doubleMath/abs.tpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera, Richard Pausch
+/* Copyright 2013-2021 Heiko Burau, Rene Widera, Richard Pausch
  *
  * This file is part of PMacc.
  *
@@ -28,40 +28,18 @@
 
 namespace pmacc
 {
-namespace algorithms
-{
-namespace math
-{
-
-template<>
-struct Abs<double>
-{
-    typedef double result;
-
-    HDINLINE double operator( )(double value)
+    namespace math
     {
-#ifdef __CUDA_ARCH__
-      return ::fabs( value );
-#else
-      /* \bug on cpu `::abs(double)` always return zero -> maybe this is the
-       * integer version of `abs()`
-       */
-      return std::abs( value );
-#endif
-    }
-};
-
-template<>
-struct Abs2<double>
-{
-    typedef double result;
-
-    HDINLINE double operator( )(const double& value )
-    {
-        return value*value;
-    }
-};
-
-} //namespace math
-} //namespace algorithms
+        template<>
+        struct Abs2<double> // Abs2 functor specialised for double: squares its argument
+        {
+            typedef double result; // exposed as Abs2<double>::result; same type operator() returns
+
+            HDINLINE double operator()(const double& value)
+            {
+                return value * value; // product of the value with itself
+            }
+        };
+
+    } // namespace math
 } // namespace pmacc
diff --git a/include/pmacc/algorithms/math/doubleMath/bessel.tpp b/include/pmacc/algorithms/math/doubleMath/bessel.tpp
index 66ba789060..2e96ec556a 100644
--- a/include/pmacc/algorithms/math/doubleMath/bessel.tpp
+++ b/include/pmacc/algorithms/math/doubleMath/bessel.tpp
@@ -1,4 +1,4 @@
-/* Copyright 2016-2020 Alexander Debus
+/* Copyright 2016-2021 Alexander Debus
  *
  * This file is part of PMacc.
  *
@@ -27,176 +27,130 @@
 
 namespace pmacc
 {
-namespace algorithms
-{
-namespace math
-{
-namespace bessel
-{
-
-    template< >
-    struct I0< double >
+    namespace math
     {
-        using result = double;
-
-        HDINLINE result operator( )( result const & x )
+        namespace bessel
         {
-#if __CUDA_ARCH__
-            return ::cyl_bessel_i0( x );
+            template<>
+            struct I0<double> // modified Bessel function of the first kind, order 0, double precision
+            {
+                using result = double;
+
+                HDINLINE result operator()(result const& x)
+                {
+#if(CUPLA_DEVICE_COMPILE == 1) // device (GPU) compile pass: call the global-namespace device math function
+                    return ::cyl_bessel_i0(x);
 #else
-            return boost::math::cyl_bessel_i(
-                0,
-                x
-            );
+                    return boost::math::cyl_bessel_i(0, x); // otherwise: Boost.Math, order 0 as first argument
 #endif
-        }
-    };
-
-    template< >
-    struct I1< double >
-    {
-        using result = double;
-
-        HDINLINE result operator( )( result const & x )
-        {
-#if __CUDA_ARCH__
-            return ::cyl_bessel_i1( x );
+                }
+            };
+
+            template<>
+            struct I1<double> // modified Bessel function of the first kind, order 1, double precision
+            {
+                using result = double;
+
+                HDINLINE result operator()(result const& x)
+                {
+#if(CUPLA_DEVICE_COMPILE == 1) // device (GPU) compile pass: call the global-namespace device math function
+                    return ::cyl_bessel_i1(x);
 #else
-            return boost::math::cyl_bessel_i(
-                1,
-                x
-            );
+                    return boost::math::cyl_bessel_i(1, x); // otherwise: Boost.Math, order 1 as first argument
 #endif
-        }
-    };
-
-    template< >
-    struct J0< double >
-    {
-        using result = double;
-
-        HDINLINE result operator( )( result const & x )
-        {
-#if __CUDA_ARCH__
-            return ::j0( x );
+                }
+            };
+
+            template<>
+            struct J0<double> // Bessel function of the first kind, order 0, double precision
+            {
+                using result = double;
+
+                HDINLINE result operator()(result const& x)
+                {
+#if(CUPLA_DEVICE_COMPILE == 1) // device (GPU) compile pass: call the global-namespace device math function
+                    return ::j0(x);
 #else
-            return boost::math::cyl_bessel_j(
-                0,
-                x
-            );
+                    return boost::math::cyl_bessel_j(0, x); // otherwise: Boost.Math, order 0 as first argument
 #endif
-        }
-    };
-
-    template< >
-    struct J1< double >
-    {
-        using result = double;
-
-        HDINLINE result operator( )( result const & x )
-        {
-#if __CUDA_ARCH__
-            return ::j1( x );
+                }
+            };
+
+            template<>
+            struct J1<double> // Bessel function of the first kind, order 1, double precision
+            {
+                using result = double;
+
+                HDINLINE result operator()(result const& x)
+                {
+#if(CUPLA_DEVICE_COMPILE == 1) // device (GPU) compile pass: call the global-namespace device math function
+                    return ::j1(x);
 #else
-            return boost::math::cyl_bessel_j(
-                1,
-                x
-            );
+                    return boost::math::cyl_bessel_j(1, x); // otherwise: Boost.Math, order 1 as first argument
 #endif
-        }
-    };
-
-    template< >
-    struct Jn<
-        int,
-        double
-    >
-    {
-        using result = double;
-
-        HDINLINE result operator( )(
-            int const & n,
-            result const & x
-        )
-        {
-#if __CUDA_ARCH__
-            return ::jn(
-                n,
-                x
-            );
+                }
+            };
+
+            template<>
+            struct Jn<int, double> // Bessel function of the first kind, integer order n, double precision
+            {
+                using result = double;
+
+                HDINLINE result operator()(int const& n, result const& x) // n: order, x: argument
+                {
+#if(CUPLA_DEVICE_COMPILE == 1) // device (GPU) compile pass: call the global-namespace device math function
+                    return ::jn(n, x);
 #else
-            return boost::math::cyl_bessel_j(
-                n,
-                x
-            );
+                    return boost::math::cyl_bessel_j(n, x); // otherwise: Boost.Math with the same (order, x) pair
 #endif
-        }
-    };
-
-    template< >
-    struct Y0< double >
-    {
-        using result = double;
-
-        HDINLINE result operator( )( result const & x )
-        {
-#if __CUDA_ARCH__
-            return ::y0( x );
+                }
+            };
+
+            template<>
+            struct Y0<double> // Bessel function of the second kind (Neumann), order 0, double precision
+            {
+                using result = double;
+
+                HDINLINE result operator()(result const& x)
+                {
+#if(CUPLA_DEVICE_COMPILE == 1) // device (GPU) compile pass: call the global-namespace device math function
+                    return ::y0(x);
 #else
-            return boost::math::cyl_neumann(
-                0,
-                x
-            );
+                    return boost::math::cyl_neumann(0, x); // otherwise: Boost.Math, order 0 as first argument
 #endif
-        }
-    };
-
-    template< >
-    struct Y1< double >
-    {
-        using result = double;
-
-        HDINLINE result operator( )( result const & x )
-        {
-#if __CUDA_ARCH__
-            return ::y1( x );
+                }
+            };
+
+            template<>
+            struct Y1<double> // Bessel function of the second kind (Neumann), order 1, double precision
+            {
+                using result = double;
+
+                HDINLINE result operator()(result const& x)
+                {
+#if(CUPLA_DEVICE_COMPILE == 1) // device (GPU) compile pass: call the global-namespace device math function
+                    return ::y1(x);
 #else
-            return boost::math::cyl_neumann(
-                1,
-                x
-            );
+                    return boost::math::cyl_neumann(1, x); // otherwise: Boost.Math, order 1 as first argument
 #endif
-        }
-    };
-
-    template< >
-    struct Yn<
-        int,
-        double
-    >
-    {
-        using result = double;
-
-        HDINLINE result operator( )(
-            int const & n,
-            result const & x
-        )
-        {
-#if __CUDA_ARCH__
-            return ::yn(
-                n,
-                x
-            );
+                }
+            };
+
+            template<>
+            struct Yn<int, double> // Bessel function of the second kind (Neumann), integer order n, double
+            {
+                using result = double;
+
+                HDINLINE result operator()(int const& n, result const& x) // n: order, x: argument
+                {
+#if(CUPLA_DEVICE_COMPILE == 1) // device (GPU) compile pass: call the global-namespace device math function
+                    return ::yn(n, x);
 #else
-            return boost::math::cyl_neumann(
-                n,
-                x
-            );
+                    return boost::math::cyl_neumann(n, x); // otherwise: Boost.Math with the same (order, x) pair
 #endif
-        }
-    };
+                }
+            };
 
-} //namespace bessel
-} //namespace math
-} //namespace algorithms
-} //namespace pmacc
+        } // namespace bessel
+    } // namespace math
+} // namespace pmacc
diff --git a/include/pmacc/algorithms/math/doubleMath/comparison.tpp b/include/pmacc/algorithms/math/doubleMath/comparison.tpp
index 65529abcee..2771f1db22 100644
--- a/include/pmacc/algorithms/math/doubleMath/comparison.tpp
+++ b/include/pmacc/algorithms/math/doubleMath/comparison.tpp
@@ -1,4 +1,4 @@
-/* Copyright 2015-2020 Benjamin Worpitz, Richard Pausch
+/* Copyright 2015-2021 Benjamin Worpitz, Richard Pausch
  *
  * This file is part of PMacc.
  *
@@ -27,33 +27,29 @@
 
 namespace pmacc
 {
-namespace algorithms
-{
-namespace math
-{
-
-template<>
-struct Min<double, double>
-{
-    typedef double result;
-
-    HDINLINE double operator()(double value1, double value2)
+    namespace math
     {
-        return ::fmin(value1, value2);
-    }
-};
-
-template<>
-struct Max<double, double>
-{
-    typedef double result;
-
-    HDINLINE double operator()(double value1, double value2)
-    {
-        return ::fmax(value1, value2);
-    }
-};
-
-} //namespace math
-} //namespace algorithms
-} //namespace pmacc
+        template<>
+        struct Min<double, double> // Min functor specialised for two double operands
+        {
+            typedef double result; // exposed as Min<double, double>::result
+
+            HDINLINE double operator()(double value1, double value2)
+            {
+                return ::fmin(value1, value2); // delegate to the global-namespace fmin
+            }
+        };
+
+        template<>
+        struct Max<double, double> // Max functor specialised for two double operands
+        {
+            typedef double result; // exposed as Max<double, double>::result
+
+            HDINLINE double operator()(double value1, double value2)
+            {
+                return ::fmax(value1, value2); // delegate to the global-namespace fmax
+            }
+        };
+
+    } // namespace math
+} // namespace pmacc
diff --git a/include/pmacc/algorithms/math/doubleMath/erf.tpp b/include/pmacc/algorithms/math/doubleMath/erf.tpp
deleted file mode 100644
index 3e449de147..0000000000
--- a/include/pmacc/algorithms/math/doubleMath/erf.tpp
+++ /dev/null
@@ -1,48 +0,0 @@
-/* Copyright 2014-2020 Axel Huebl, Richard Pausch
- *
- * This file is part of PMacc.
- *
- * PMacc is free software: you can redistribute it and/or modify
- * it under the terms of either the GNU General Public License or
- * the GNU Lesser General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * PMacc is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License and the GNU Lesser General Public License
- * for more details.
- *
- * You should have received a copy of the GNU General Public License
- * and the GNU Lesser General Public License along with PMacc.
- * If not, see <http://www.gnu.org/licenses/>.
- */
-
-#pragma once
-
-#include "pmacc/types.hpp"
-#include <cmath>
-
-
-namespace pmacc
-{
-namespace algorithms
-{
-namespace math
-{
-
-template<>
-struct Erf<double>
-{
-    typedef double result;
-
-    HDINLINE double operator( )(const double& value )
-    {
-        return ::erf( value );
-    }
-};
-
-} /* namespace math */
-} /* namespace algorithms */
-} /* namespace pmacc */
diff --git a/include/pmacc/algorithms/math/doubleMath/exp.tpp b/include/pmacc/algorithms/math/doubleMath/exp.tpp
index db659d473f..4987ed38c4 100644
--- a/include/pmacc/algorithms/math/doubleMath/exp.tpp
+++ b/include/pmacc/algorithms/math/doubleMath/exp.tpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera, Richard Pausch
+/* Copyright 2013-2021 Heiko Burau, Rene Widera, Richard Pausch
  *
  * This file is part of PMacc.
  *
@@ -28,45 +28,19 @@
 
 namespace pmacc
 {
-namespace algorithms
-{
-namespace math
-{
-
-template<>
-struct Exp<double>
-{
-    typedef double result;
-
-    HDINLINE double operator( )(const double& value )
-    {
-        return ::exp( value );
-    }
-};
-
-template<>
-struct Log<double>
-{
-    typedef double result;
-
-    HDINLINE double operator( )(const double& value )
+    namespace math
     {
-        return ::log( value );
-    }
-};
+        template<>
+        struct Log10<double> // Log10 functor specialised for double
+        {
+            typedef double result; // exposed as Log10<double>::result
 
-template<>
-struct Log10<double>
-{
-    typedef double result;
-
-    HDINLINE double operator( )(const double& value)
-    {
-        return ::log10( value );
-    }
-};
+            HDINLINE double operator()(const double& value)
+            {
+                return ::log10(value); // delegate to the global-namespace log10
+            }
+        };
 
 
-} //namespace math
-} //namespace algorithms
+    } // namespace math
 } // namespace pmacc
diff --git a/include/pmacc/algorithms/math/doubleMath/floatingPoint.tpp b/include/pmacc/algorithms/math/doubleMath/floatingPoint.tpp
index 33cc6bb6a5..a1e3519a37 100644
--- a/include/pmacc/algorithms/math/doubleMath/floatingPoint.tpp
+++ b/include/pmacc/algorithms/math/doubleMath/floatingPoint.tpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera, Richard Pausch,
+/* Copyright 2013-2021 Heiko Burau, Rene Widera, Richard Pausch,
  *                     Alexander Grund
  *
  * This file is part of PMacc.
@@ -29,102 +29,76 @@
 
 namespace pmacc
 {
-namespace algorithms
-{
-namespace math
-{
-
-template<>
-struct Floor<double>
-{
-    typedef double result;
-
-    HDINLINE result operator( )(result value)
+    namespace math
     {
-        return ::floor( value );
-    }
-};
-
-template<>
-struct Ceil<double>
-{
-    typedef double result;
-
-    HDINLINE result operator( )(result value)
-    {
-        return ::ceil( value );
-    }
-};
-
-template<>
-struct Float2int_ru<double>
-{
-    typedef int result;
+        template<>
+        struct Float2int_ru<double> // double -> int conversion, rounding up
+        {
+            typedef int result; // the conversion yields an int
 
-    HDINLINE result operator( )(double value)
-    {
-#if __CUDA_ARCH__
-        return ::__double2int_ru( value );
+            HDINLINE result operator()(double value)
+            {
+#if(CUPLA_DEVICE_COMPILE == 1) // device (GPU) compile pass: use the round-up conversion intrinsic
+                return ::__double2int_ru(value);
 #else
-        return static_cast<int>(ceil(value));
+                return static_cast<int>(ceil(value)); // otherwise: take the ceiling, then narrow to int
 #endif
-    }
-};
+            }
+        };
 
-template<>
-struct Float2int_rd<double>
-{
-    typedef int result;
+        template<>
+        struct Float2int_rd<double> // double -> int conversion, rounding down
+        {
+            typedef int result; // the conversion yields an int
 
-    HDINLINE result operator( )(double value)
-    {
-#if __CUDA_ARCH__
-        return ::__double2int_rd( value );
+            HDINLINE result operator()(double value)
+            {
+#if(CUPLA_DEVICE_COMPILE == 1) // device (GPU) compile pass: use the round-down conversion intrinsic
+                return ::__double2int_rd(value);
 #else
-        return static_cast<int>(floor(value));
+                return static_cast<int>(floor(value)); // otherwise: take the floor, then narrow to int
 #endif
-    }
-};
+            }
+        };
 
-template<>
-struct Float2int_rn<double>
-{
-    typedef int result;
+        template<>
+        struct Float2int_rn<double> // double -> int conversion, rounding to nearest with ties to even
+        {
+            typedef int result; // the conversion yields an int
 
-    HDINLINE result operator( )(double value)
-    {
-#if __CUDA_ARCH__
-        return ::__double2int_rn( value );
+            HDINLINE result operator()(double value)
+            {
+#if(CUPLA_DEVICE_COMPILE == 1) // device (GPU) compile pass: use the round-to-nearest conversion intrinsic
+                return ::__double2int_rn(value);
 #else
-        if(value < 0.0)
-            return -(*this)(-value);
-        double intPart;
-        double fracPart = std::modf(value, &intPart);
-        result res = static_cast<int>(intPart);
-        /* epsilon in the following code is used to consider values
-         * "very close" to x.5 also as x.5
-         */
-        if(fracPart > 0.5 + std::numeric_limits<double>::epsilon())
-        {
-            /* >x.5 --> Round up */
-            res = res + 1;
-        }
-        else if(!(fracPart < 0.5 - std::numeric_limits<double>::epsilon()))
-        {
-            /* We are NOT >x.5 AND NOT <x.5 --> ==x.5 --> use x if x is even, else x+1
-             * The "&~1" cancels the last bit which results in an even value
-             * res is even -> res+1 is odd -> (res+1)&~1 = res
-             * res is odd -> res+1 is even -> (res+1)&~1 = res+1
-             */
-            res = (res + 1) & ~1;
-        }
-        /* else res = res (round down) */
-        return res;
+                if(value < 0.0)
+                    return -(*this)(-value); // negative input: round the magnitude, then negate the result
+                double intPart;
+                double fracPart = std::modf(value, &intPart); // split into integral and fractional parts
+                result res = static_cast<int>(intPart);
+                /* The epsilon tolerance below makes fractional parts that lie within
+                 * machine epsilon of 0.5 count as an exact tie (x.5) as well.
+                 */
+                if(fracPart > 0.5 + std::numeric_limits<double>::epsilon())
+                {
+                    /* Fraction is clearly above .5 --> round up */
+                    res = res + 1;
+                }
+                else if(!(fracPart < 0.5 - std::numeric_limits<double>::epsilon()))
+                {
+                    /* Neither clearly above nor clearly below .5 --> tie: keep x if x is even, else use x+1.
+                     * "& ~1" clears the lowest bit, which always yields an even value:
+                     * res even -> res+1 is odd  -> (res+1) & ~1 == res
+                     * res odd  -> res+1 is even -> (res+1) & ~1 == res+1
+                     */
+                    res = (res + 1) & ~1;
+                }
+                /* otherwise the fraction is clearly below .5: res stays as is (round down) */
+                return res;
 #endif
-    }
-};
+            }
+        };
 
 
-} //namespace math
-} //namespace algorithms
+    } // namespace math
 } // namespace pmacc
diff --git a/include/pmacc/algorithms/math/doubleMath/fmod.tpp b/include/pmacc/algorithms/math/doubleMath/fmod.tpp
deleted file mode 100644
index aa02abc8e3..0000000000
--- a/include/pmacc/algorithms/math/doubleMath/fmod.tpp
+++ /dev/null
@@ -1,52 +0,0 @@
-/* Copyright 2016-2020 Alexander Debus
- *
- * This file is part of PMacc.
- *
- * PMacc is free software: you can redistribute it and/or modify
- * it under the terms of either the GNU General Public License or
- * the GNU Lesser General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * PMacc is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License and the GNU Lesser General Public License
- * for more details.
- *
- * You should have received a copy of the GNU General Public License
- * and the GNU Lesser General Public License along with PMacc.
- * If not, see <http://www.gnu.org/licenses/>.
- */
-
-#pragma once
-
-#include "pmacc/types.hpp"
-#include <cmath>
-
-namespace pmacc
-{
-namespace algorithms
-{
-namespace math
-{
-
-template<>
-struct Fmod<double>
-{
-    typedef double result;
-
-    HDINLINE result operator( )(result x, result y)
-    {
-#if __CUDA_ARCH__
-        return ::fmod(x, y);
-#else
-        return std::fmod(x, y);
-#endif
-    }
-};
-
-} //namespace math
-} //namespace algorithms
-} //namespace pmacc
-
diff --git a/include/pmacc/algorithms/math/doubleMath/modf.tpp b/include/pmacc/algorithms/math/doubleMath/modf.tpp
index 10ec3c632d..89774db6f4 100644
--- a/include/pmacc/algorithms/math/doubleMath/modf.tpp
+++ b/include/pmacc/algorithms/math/doubleMath/modf.tpp
@@ -1,4 +1,4 @@
-/* Copyright 2015-2020 Heiko Burau
+/* Copyright 2015-2021 Heiko Burau
  *
  * This file is part of PMacc.
  *
@@ -26,26 +26,22 @@
 
 namespace pmacc
 {
-namespace algorithms
-{
-namespace math
-{
-
-template<>
-struct Modf<double>
-{
-    typedef double result;
-
-    HDINLINE double operator()(double value, double* intpart)
+    namespace math
     {
-#if __CUDA_ARCH__
-        return ::modf(value, intpart);
+        template<>
+        struct Modf<double> // Modf functor specialised for double
+        {
+            typedef double result; // exposed as Modf<double>::result
+
+            HDINLINE double operator()(double value, double* intpart) // intpart: output pointer passed on to modf
+            {
+#if(CUPLA_DEVICE_COMPILE == 1) // device (GPU) compile pass: call the global-namespace modf
+                return ::modf(value, intpart);
 #else
-        return std::modf(value, intpart);
+                return std::modf(value, intpart); // otherwise: use the standard-library overload
 #endif
-    }
-};
+            }
+        };
 
-} //namespace math
-} //namespace algorithms
+    } // namespace math
 } // namespace pmacc
diff --git a/include/pmacc/algorithms/math/doubleMath/pow.tpp b/include/pmacc/algorithms/math/doubleMath/pow.tpp
deleted file mode 100644
index 73e169a2fe..0000000000
--- a/include/pmacc/algorithms/math/doubleMath/pow.tpp
+++ /dev/null
@@ -1,69 +0,0 @@
-/* Copyright 2013-2020 Rene Widera, Alexander Grund
- *
- * This file is part of PMacc.
- *
- * PMacc is free software: you can redistribute it and/or modify
- * it under the terms of either the GNU General Public License or
- * the GNU Lesser General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * PMacc is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License and the GNU Lesser General Public License
- * for more details.
- *
- * You should have received a copy of the GNU General Public License
- * and the GNU Lesser General Public License along with PMacc.
- * If not, see <http://www.gnu.org/licenses/>.
- */
-
-
-#pragma once
-
-#include "pmacc/types.hpp"
-#include <cmath>
-
-namespace pmacc
-{
-namespace algorithms
-{
-namespace math
-{
-/*C++98 standard define a separate version for int and double exponent*/
-
-template<>
-struct Pow<double, double>
-{
-    typedef double result;
-
-    HDINLINE result operator()(const double& base, const double& exponent)
-    {
-        return ::pow(base, exponent);
-    }
-};
-
-template<>
-struct Pow<double, int>
-{
-    typedef double result;
-
-    HDINLINE result operator()(const double& base, const int& exponent)
-    {
-#ifdef __CUDA_ARCH__ /*device version*/
-        /* @todo: There is an incompatibility with C++11 + CUDA + GCC where no device function
-         *        pow(double, int) is defined: http://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cuda-tools-title-known
-         *        Use the pow(double, double) instead which reduces performance or implement an own (faster) version
-         */
-        return ::pow(base, static_cast<double>(exponent));
-#else
-        return ::pow(base, exponent);
-#endif
-    }
-};
-
-
-} //namespace math
-} //namespace algorithms
-} // namespace pmacc
diff --git a/include/pmacc/algorithms/math/doubleMath/sqrt.tpp b/include/pmacc/algorithms/math/doubleMath/sqrt.tpp
deleted file mode 100644
index f5a25da0fb..0000000000
--- a/include/pmacc/algorithms/math/doubleMath/sqrt.tpp
+++ /dev/null
@@ -1,64 +0,0 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera, Benjamin Worpitz,
- *                     Richard Pausch
- *
- * This file is part of PMacc.
- *
- * PMacc is free software: you can redistribute it and/or modify
- * it under the terms of either the GNU General Public License or
- * the GNU Lesser General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * PMacc is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License and the GNU Lesser General Public License
- * for more details.
- *
- * You should have received a copy of the GNU General Public License
- * and the GNU Lesser General Public License along with PMacc.
- * If not, see <http://www.gnu.org/licenses/>.
- */
-
-
-#pragma once
-
-#include "pmacc/types.hpp"
-#include <cmath>
-
-namespace pmacc
-{
-namespace algorithms
-{
-namespace math
-{
-
-template<>
-struct Sqrt<double>
-{
-    typedef double result;
-
-    HDINLINE double operator( )(const double& value )
-    {
-        return ::sqrt( value );
-    }
-};
-
-template<>
-struct RSqrt<double>
-{
-    typedef double result;
-
-    HDINLINE double operator( )(const double& value )
-    {
-#if !defined(__CUDACC__)
-        return 1.0/::sqrt(value);
-#else
-        return ::rsqrt(value);
-#endif
-    }
-};
-
-} //namespace math
-} //namespace algorithms
-} // namespace pmacc
diff --git a/include/pmacc/algorithms/math/doubleMath/trigo.tpp b/include/pmacc/algorithms/math/doubleMath/trigo.tpp
index 79e8e51826..acee91c2fc 100644
--- a/include/pmacc/algorithms/math/doubleMath/trigo.tpp
+++ b/include/pmacc/algorithms/math/doubleMath/trigo.tpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera, Richard Pausch,
+/* Copyright 2013-2021 Heiko Burau, Rene Widera, Richard Pausch,
  *                     Axel Huebl, Alexander Debus
  *
  * This file is part of PMacc.
@@ -28,119 +28,38 @@
 
 namespace pmacc
 {
-namespace algorithms
-{
-namespace math
-{
-
-template<>
-struct Sin<double>
-{
-    typedef double result;
-
-    HDINLINE double operator( )(const double& value )
-    {
-        return ::sin( value );
-    }
-};
-
-template<>
-struct ASin<double>
-{
-    typedef double result;
-
-    HDINLINE double operator( )(const double& value)
-    {
-        return ::asin( value );
-    }
-};
-
-template<>
-struct Cos<double>
-{
-    typedef double result;
-
-    HDINLINE double operator( )(const double& value )
-    {
-        return ::cos( value );
-    }
-};
-
-template<>
-struct ACos<double>
-{
-    typedef double result;
-
-    HDINLINE double operator( )(const double& value)
+    namespace math
     {
-        return ::acos( value );
-    }
-};
+        template<>
+        struct SinCos<double, double, double> // computes sine and cosine of one double argument together
+        {
+            typedef void result; // nothing is returned; both results go to the reference arguments
 
-template<>
-struct Tan<double>
-{
-    typedef double result;
-
-    HDINLINE double operator( )(const double& value )
-    {
-        return ::tan( value );
-    }
-};
-
-template<>
-struct ATan<double>
-{
-    typedef double result;
-
-    HDINLINE double operator( )(const double& value)
-    {
-        return ::atan( value );
-    }
-};
-
-template<>
-struct SinCos<double, double, double>
-{
-    typedef void result;
-
-    HDINLINE void operator( )(double arg, double& sinValue, double& cosValue )
-    {
+            HDINLINE void operator()(double arg, double& sinValue, double& cosValue)
+            {
 #if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
-        sinValue = ::sin(arg);
-        cosValue = ::cos(arg);
+                sinValue = cupla::math::sin(arg); // MSVC, not a CUDA device pass: two separate calls
+                cosValue = cupla::math::cos(arg); // NOTE(review): this guard still tests __CUDA_ARCH__ - intended?
 #else
-        ::sincos(arg, &sinValue, &cosValue);
+                ::sincos(arg, &sinValue, &cosValue); // otherwise: one call writes both results via pointers
 #endif
-    }
-};
-
-
-template<>
-struct Sinc<double>
-{
-    typedef double result;
+            }
+        };
 
-    HDINLINE double operator( )(const double& value )
-    {
-        if(pmacc::algorithms::math::abs(value) < DBL_EPSILON)
-            return 1.0;
-        else
-            return pmacc::algorithms::math::sin( value )/value;
-    }
-};
 
-template<>
-struct Atan2<double>
-{
-    typedef double result;
+        template<>
+        struct Sinc<double> // unnormalised sinc for double: sin(value) / value, with 1.0 near zero
+        {
+            typedef double result; // exposed as Sinc<double>::result
 
-    HDINLINE double operator( )(const double& val1, const double& val2 )
-    {
-        return ::atan2( val1, val2 );
-    }
-};
+            HDINLINE double operator()(const double& value)
+            {
+                if(cupla::math::abs(value) < DBL_EPSILON) // |value| tiny: return 1.0 instead of dividing
+                    return 1.0;
+                else
+                    return cupla::math::sin(value) / value;
+            }
+        };
 
-} //namespace math
-} //namespace algorithms
+    } // namespace math
 } // namespace pmacc
diff --git a/include/pmacc/algorithms/math/floatMath/abs.tpp b/include/pmacc/algorithms/math/floatMath/abs.tpp
index befec3d6db..82b92abc01 100644
--- a/include/pmacc/algorithms/math/floatMath/abs.tpp
+++ b/include/pmacc/algorithms/math/floatMath/abs.tpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera, Richard Pausch
+/* Copyright 2013-2021 Heiko Burau, Rene Widera, Richard Pausch
  *
  * This file is part of PMacc.
  *
@@ -27,34 +27,18 @@
 
 namespace pmacc
 {
-namespace algorithms
-{
-namespace math
-{
-
-
-template<>
-struct Abs<float>
-{
-    typedef float result;
-
-    HDINLINE float operator( )(float value)
+    namespace math
     {
-        return ::fabsf( value );
-    }
-};
-
-template<>
-struct Abs2<float>
-{
-    typedef float result;
-
-    HDINLINE float operator( )(const float& value )
-    {
-        return value*value;
-    }
-};
-
-} //namespace math
-} //namespace algorithms
+        template<>
+        struct Abs2<float> // Abs2 functor specialised for float: squares its argument
+        {
+            typedef float result; // exposed as Abs2<float>::result; same type operator() returns
+
+            HDINLINE float operator()(const float& value)
+            {
+                return value * value; // product of the value with itself
+            }
+        };
+
+    } // namespace math
 } // namespace pmacc
diff --git a/include/pmacc/algorithms/math/floatMath/bessel.tpp b/include/pmacc/algorithms/math/floatMath/bessel.tpp
index 2fede97e33..b0597e7d50 100644
--- a/include/pmacc/algorithms/math/floatMath/bessel.tpp
+++ b/include/pmacc/algorithms/math/floatMath/bessel.tpp
@@ -1,4 +1,4 @@
-/* Copyright 2016-2020 Alexander Debus
+/* Copyright 2016-2021 Alexander Debus
  *
  * This file is part of PMacc.
  *
@@ -27,176 +27,130 @@
 
 namespace pmacc
 {
-namespace algorithms
-{
-namespace math
-{
-namespace bessel
-{
-
-    template< >
-    struct I0< float >
+    namespace math
     {
-        using result = float;
-
-        HDINLINE result operator( )( result const & x )
+        namespace bessel
         {
-#if __CUDA_ARCH__
-            return ::cyl_bessel_i0f( x );
+            template<>
+            struct I0<float>
+            {
+                using result = float;
+
+                HDINLINE result operator()(result const& x)
+                {
+#if(CUPLA_DEVICE_COMPILE == 1) // we are on gpu
+                    return ::cyl_bessel_i0f(x);
 #else
-            return boost::math::cyl_bessel_i(
-                0,
-                x
-            );
+                    return boost::math::cyl_bessel_i(0, x);
 #endif
-        }
-    };
-
-    template< >
-    struct I1< float >
-    {
-        using result = float;
-
-        HDINLINE result operator( )( result const & x )
-        {
-#if __CUDA_ARCH__
-            return ::cyl_bessel_i1f( x );
+                }
+            };
+
+            template<>
+            struct I1<float>
+            {
+                using result = float;
+
+                HDINLINE result operator()(result const& x)
+                {
+#if(CUPLA_DEVICE_COMPILE == 1) // we are on gpu
+                    return ::cyl_bessel_i1f(x);
 #else
-            return boost::math::cyl_bessel_i(
-                1,
-                x
-            );
+                    return boost::math::cyl_bessel_i(1, x);
 #endif
-        }
-    };
-
-    template< >
-    struct J0< float >
-    {
-        using result = float;
-
-        HDINLINE result operator( )( result const & x )
-        {
-#if __CUDA_ARCH__
-            return ::j0f( x );
+                }
+            };
+
+            template<>
+            struct J0<float>
+            {
+                using result = float;
+
+                HDINLINE result operator()(result const& x)
+                {
+#if(CUPLA_DEVICE_COMPILE == 1) // we are on gpu
+                    return ::j0f(x);
 #else
-            return boost::math::cyl_bessel_j(
-                0,
-                x
-            );
+                    return boost::math::cyl_bessel_j(0, x);
 #endif
-        }
-    };
-
-    template< >
-    struct J1< float >
-    {
-        using result = float;
-
-        HDINLINE result operator( )( result const & x )
-        {
-#if __CUDA_ARCH__
-            return ::j1f( x );
+                }
+            };
+
+            template<>
+            struct J1<float>
+            {
+                using result = float;
+
+                HDINLINE result operator()(result const& x)
+                {
+#if(CUPLA_DEVICE_COMPILE == 1) // we are on gpu
+                    return ::j1f(x);
 #else
-            return boost::math::cyl_bessel_j(
-                1,
-                x
-            );
+                    return boost::math::cyl_bessel_j(1, x);
 #endif
-        }
-    };
-
-    template< >
-    struct Jn<
-        int,
-        float
-    >
-    {
-        using result = float;
-
-        HDINLINE result operator( )(
-            int const & n,
-            result const & x
-        )
-        {
-#if __CUDA_ARCH__
-            return ::jnf(
-                n,
-                x
-            );
+                }
+            };
+
+            template<>
+            struct Jn<int, float>
+            {
+                using result = float;
+
+                HDINLINE result operator()(int const& n, result const& x)
+                {
+#if(CUPLA_DEVICE_COMPILE == 1) // we are on gpu
+                    return ::jnf(n, x);
 #else
-            return boost::math::cyl_bessel_j(
-                n,
-                x
-            );
+                    return boost::math::cyl_bessel_j(n, x);
 #endif
-        }
-    };
-
-    template< >
-    struct Y0< float >
-    {
-        using result = float;
-
-        HDINLINE result operator( )( result const & x )
-        {
-#if __CUDA_ARCH__
-            return ::y0f( x );
+                }
+            };
+
+            template<>
+            struct Y0<float>
+            {
+                using result = float;
+
+                HDINLINE result operator()(result const& x)
+                {
+#if(CUPLA_DEVICE_COMPILE == 1) // we are on gpu
+                    return ::y0f(x);
 #else
-            return boost::math::cyl_neumann(
-                0,
-                x
-            );
+                    return boost::math::cyl_neumann(0, x);
 #endif
-        }
-    };
-
-    template< >
-    struct Y1< float >
-    {
-        using result = float;
-
-        HDINLINE result operator( )( result const & x )
-        {
-#if __CUDA_ARCH__
-            return ::y1f( x );
+                }
+            };
+
+            template<>
+            struct Y1<float>
+            {
+                using result = float;
+
+                HDINLINE result operator()(result const& x)
+                {
+#if(CUPLA_DEVICE_COMPILE == 1) // we are on gpu
+                    return ::y1f(x);
 #else
-            return boost::math::cyl_neumann(
-                1,
-                x
-            );
+                    return boost::math::cyl_neumann(1, x);
 #endif
-        }
-    };
-
-    template< >
-    struct Yn<
-        int,
-        float
-    >
-    {
-        using result = float;
-
-        HDINLINE result operator( )(
-            int const & n,
-            result const & x
-        )
-        {
-#if __CUDA_ARCH__
-            return ::ynf(
-                n,
-                x
-            );
+                }
+            };
+
+            template<>
+            struct Yn<int, float>
+            {
+                using result = float;
+
+                HDINLINE result operator()(int const& n, result const& x)
+                {
+#if(CUPLA_DEVICE_COMPILE == 1) // we are on gpu
+                    return ::ynf(n, x);
 #else
-            return boost::math::cyl_neumann(
-                n,
-                x
-            );
+                    return boost::math::cyl_neumann(n, x);
 #endif
-        }
-    };
+                }
+            };
 
-} //namespace bessel
-} //namespace math
-} //namespace algorithms
-} //namespace pmacc
+        } // namespace bessel
+    } // namespace math
+} // namespace pmacc
diff --git a/include/pmacc/algorithms/math/floatMath/comparison.tpp b/include/pmacc/algorithms/math/floatMath/comparison.tpp
index 0897fbe393..ca27ada2ea 100644
--- a/include/pmacc/algorithms/math/floatMath/comparison.tpp
+++ b/include/pmacc/algorithms/math/floatMath/comparison.tpp
@@ -1,4 +1,4 @@
-/* Copyright 2015-2020 Benjamin Worpitz, Richard Pausch
+/* Copyright 2015-2021 Benjamin Worpitz, Richard Pausch
  *
  * This file is part of PMacc.
  *
@@ -27,33 +27,29 @@
 
 namespace pmacc
 {
-namespace algorithms
-{
-namespace math
-{
-
-template<>
-struct Min<float, float>
-{
-    typedef float result;
-
-    HDINLINE float operator()(float value1, float value2)
+    namespace math
     {
-        return ::fminf(value1, value2);
-    }
-};
-
-template<>
-struct Max<float, float>
-{
-    typedef float result;
-
-    HDINLINE float operator()(float value1, float value2)
-    {
-        return ::fmaxf(value1, value2);
-    }
-};
-
-} //namespace math
-} //namespace algorithms
-} //namespace pmacc
+        template<>
+        struct Min<float, float>
+        {
+            typedef float result;
+
+            HDINLINE float operator()(float value1, float value2)
+            {
+                return ::fminf(value1, value2);
+            }
+        };
+
+        template<>
+        struct Max<float, float>
+        {
+            typedef float result;
+
+            HDINLINE float operator()(float value1, float value2)
+            {
+                return ::fmaxf(value1, value2);
+            }
+        };
+
+    } // namespace math
+} // namespace pmacc
diff --git a/include/pmacc/algorithms/math/floatMath/erf.tpp b/include/pmacc/algorithms/math/floatMath/erf.tpp
deleted file mode 100644
index 10535fdf4d..0000000000
--- a/include/pmacc/algorithms/math/floatMath/erf.tpp
+++ /dev/null
@@ -1,48 +0,0 @@
-/* Copyright 2014-2020 Axel Huebl, Richard Pausch
- *
- * This file is part of PMacc.
- *
- * PMacc is free software: you can redistribute it and/or modify
- * it under the terms of either the GNU General Public License or
- * the GNU Lesser General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * PMacc is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License and the GNU Lesser General Public License
- * for more details.
- *
- * You should have received a copy of the GNU General Public License
- * and the GNU Lesser General Public License along with PMacc.
- * If not, see <http://www.gnu.org/licenses/>.
- */
-
-#pragma once
-
-#include "pmacc/types.hpp"
-#include <cmath>
-
-
-namespace pmacc
-{
-namespace algorithms
-{
-namespace math
-{
-
-template<>
-struct Erf<float>
-{
-    typedef float result;
-
-    HDINLINE float operator( )(const float& value )
-    {
-        return ::erff( value );
-    }
-};
-
-} /* namespace math */
-} /* namespace algorithms */
-} /* namespace pmacc */
diff --git a/include/pmacc/algorithms/math/floatMath/exp.tpp b/include/pmacc/algorithms/math/floatMath/exp.tpp
index 19878a6202..d648fc8894 100644
--- a/include/pmacc/algorithms/math/floatMath/exp.tpp
+++ b/include/pmacc/algorithms/math/floatMath/exp.tpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera, Richard Pausch
+/* Copyright 2013-2021 Heiko Burau, Rene Widera, Richard Pausch
  *
  * This file is part of PMacc.
  *
@@ -28,48 +28,22 @@
 
 namespace pmacc
 {
-namespace algorithms
-{
-namespace math
-{
-
-template<>
-struct Exp<float>
-{
-    typedef float result;
-
-    HDINLINE float operator( )(const float& value )
-    {
-        return ::expf( value );
-    }
-};
-
-template<>
-struct Log<float>
-{
-    typedef float result;
-
-    HDINLINE float operator( )(const float& value )
-    {
-        return ::logf( value );
-    }
-};
-
-template<>
-struct Log10<float>
-{
-    typedef float result;
-
-    HDINLINE float operator( )(const float& value)
+    namespace math
     {
-#if __CUDA_ARCH__
-        return ::log10f( value );
+        template<>
+        struct Log10<float>
+        {
+            typedef float result;
+
+            HDINLINE float operator()(const float& value)
+            {
+#if(CUPLA_DEVICE_COMPILE == 1) // we are on gpu
+                return ::log10f(value);
 #else
-        return ::log10( value );
+                return ::log10(value);
 #endif
-    }
-};
+            }
+        };
 
-} //namespace math
-} //namespace algorithms
+    } // namespace math
 } // namespace pmacc
diff --git a/include/pmacc/algorithms/math/floatMath/floatingPoint.tpp b/include/pmacc/algorithms/math/floatMath/floatingPoint.tpp
index a6d41ec318..11e88907d9 100644
--- a/include/pmacc/algorithms/math/floatMath/floatingPoint.tpp
+++ b/include/pmacc/algorithms/math/floatMath/floatingPoint.tpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera, Richard Pausch,
+/* Copyright 2013-2021 Heiko Burau, Rene Widera, Richard Pausch,
  *                     Alexander Grund
  *
  * This file is part of PMacc.
@@ -29,101 +29,75 @@
 
 namespace pmacc
 {
-namespace algorithms
-{
-namespace math
-{
-
-template<>
-struct Floor<float>
-{
-    typedef float result;
-
-    HDINLINE result operator( )(result value)
+    namespace math
     {
-        return ::floorf( value );
-    }
-};
-
-template<>
-struct Ceil<float>
-{
-    typedef float result;
-
-    HDINLINE result operator( )(result value)
-    {
-        return ::ceil( value );
-    }
-};
-
-template<>
-struct Float2int_ru<float>
-{
-    typedef int result;
+        template<>
+        struct Float2int_ru<float>
+        {
+            typedef int result;
 
-    HDINLINE result operator( )(float value)
-    {
-#if __CUDA_ARCH__
-        return ::__float2int_ru( value );
+            HDINLINE result operator()(float value)
+            {
+#if(CUPLA_DEVICE_COMPILE == 1) // we are on gpu
+                return ::__float2int_ru(value);
 #else
-        return static_cast<int>(ceil(value));
+                return static_cast<int>(ceil(value));
 #endif
-    }
-};
+            }
+        };
 
-template<>
-struct Float2int_rd<float>
-{
-    typedef int result;
+        template<>
+        struct Float2int_rd<float>
+        {
+            typedef int result;
 
-    HDINLINE result operator( )(float value)
-    {
-#if __CUDA_ARCH__
-        return ::__float2int_rd( value );
+            HDINLINE result operator()(float value)
+            {
+#if(CUPLA_DEVICE_COMPILE == 1) // we are on gpu
+                return ::__float2int_rd(value);
 #else
-        return static_cast<int>(floor(value));
+                return static_cast<int>(floor(value));
 #endif
-    }
-};
+            }
+        };
 
-template<>
-struct Float2int_rn<float>
-{
-    typedef int result;
+        template<>
+        struct Float2int_rn<float>
+        {
+            typedef int result;
 
-    HDINLINE result operator( )(float value)
-    {
-#if __CUDA_ARCH__
-        return ::__float2int_rn( value );
+            HDINLINE result operator()(float value)
+            {
+#if(CUPLA_DEVICE_COMPILE == 1) // we are on gpu
+                return ::__float2int_rn(value);
 #else
-        if(value < 0.0f)
-            return -(*this)(-value);
-        float intPart;
-        float fracPart = std::modf(value, &intPart);
-        result res = static_cast<int>(intPart);
-        /* epsilon in the following code is used to consider values
-         * "very close" to x.5 also as x.5
-         */
-        if(fracPart > 0.5f + std::numeric_limits<float>::epsilon())
-        {
-            /* >x.5 --> Round up */
-            res = res + 1;
-        }
-        else if(!(fracPart < 0.5f - std::numeric_limits<float>::epsilon()))
-        {
-            /* We are NOT >x.5 AND NOT <x.5 --> ==x.5 --> use x if x is even, else x+1
-             * The "&~1" cancels the last bit which results in an even value
-             * res is even -> res+1 is odd -> (res+1)&~1 = res
-             * res is odd -> res+1 is even -> (res+1)&~1 = res+1
-             */
-            res = (res + 1) & ~1;
-        }
-        /* else res = res (round down) */
-        return res;
+                if(value < 0.0f)
+                    return -(*this)(-value);
+                float intPart;
+                float fracPart = std::modf(value, &intPart);
+                result res = static_cast<int>(intPart);
+                /* epsilon in the following code is used to consider values
+                 * "very close" to x.5 also as x.5
+                 */
+                if(fracPart > 0.5f + std::numeric_limits<float>::epsilon())
+                {
+                    /* >x.5 --> Round up */
+                    res = res + 1;
+                }
+                else if(!(fracPart < 0.5f - std::numeric_limits<float>::epsilon()))
+                {
+                    /* We are NOT >x.5 AND NOT <x.5 --> ==x.5 --> use x if x is even, else x+1
+                     * The "&~1" cancels the last bit which results in an even value
+                     * res is even -> res+1 is odd -> (res+1)&~1 = res
+                     * res is odd -> res+1 is even -> (res+1)&~1 = res+1
+                     */
+                    res = (res + 1) & ~1;
+                }
+                /* else res = res (round down) */
+                return res;
 #endif
-    }
-};
+            }
+        };
 
-} //namespace math
-} //namespace algorithms
+    } // namespace math
 } // namespace pmacc
diff --git a/include/pmacc/algorithms/math/floatMath/fmod.tpp b/include/pmacc/algorithms/math/floatMath/fmod.tpp
deleted file mode 100644
index 47ff15fc99..0000000000
--- a/include/pmacc/algorithms/math/floatMath/fmod.tpp
+++ /dev/null
@@ -1,52 +0,0 @@
-/* Copyright 2016-2020 Alexander Debus
- *
- * This file is part of PMacc.
- *
- * PMacc is free software: you can redistribute it and/or modify
- * it under the terms of either the GNU General Public License or
- * the GNU Lesser General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * PMacc is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License and the GNU Lesser General Public License
- * for more details.
- *
- * You should have received a copy of the GNU General Public License
- * and the GNU Lesser General Public License along with PMacc.
- * If not, see <http://www.gnu.org/licenses/>.
- */
-
-#pragma once
-
-#include "pmacc/types.hpp"
-#include <cmath>
-
-namespace pmacc
-{
-namespace algorithms
-{
-namespace math
-{
-
-template<>
-struct Fmod<float>
-{
-    typedef float result;
-
-    HDINLINE result operator( )(result x, result y)
-    {
-#if __CUDA_ARCH__
-        return ::fmodf(x, y);
-#else
-        return std::fmod(x, y);
-#endif
-    }
-};
-
-} //namespace math
-} //namespace algorithms
-} //namespace pmacc
-
diff --git a/include/pmacc/algorithms/math/floatMath/modf.tpp b/include/pmacc/algorithms/math/floatMath/modf.tpp
index afaa41412f..818298e028 100644
--- a/include/pmacc/algorithms/math/floatMath/modf.tpp
+++ b/include/pmacc/algorithms/math/floatMath/modf.tpp
@@ -1,4 +1,4 @@
-/* Copyright 2015-2020 Heiko Burau
+/* Copyright 2015-2021 Heiko Burau
  *
  * This file is part of PMacc.
  *
@@ -26,26 +26,22 @@
 
 namespace pmacc
 {
-namespace algorithms
-{
-namespace math
-{
-
-template<>
-struct Modf<float>
-{
-    typedef float result;
-
-    HDINLINE float operator()(float value, float* intpart)
+    namespace math
     {
-#if __CUDA_ARCH__
-        return ::modff(value, intpart);
+        template<>
+        struct Modf<float>
+        {
+            typedef float result;
+
+            HDINLINE float operator()(float value, float* intpart)
+            {
+#if(CUPLA_DEVICE_COMPILE == 1) // we are on gpu
+                return ::modff(value, intpart);
 #else
-        return std::modf(value, intpart);
+                return std::modf(value, intpart);
 #endif
-    }
-};
+            }
+        };
 
-} //namespace math
-} //namespace algorithms
+    } // namespace math
 } // namespace pmacc
diff --git a/include/pmacc/algorithms/math/floatMath/pow.tpp b/include/pmacc/algorithms/math/floatMath/pow.tpp
deleted file mode 100644
index 43b1433ea6..0000000000
--- a/include/pmacc/algorithms/math/floatMath/pow.tpp
+++ /dev/null
@@ -1,76 +0,0 @@
-/* Copyright 2013-2020 Rene Widera, Alexander Grund
- *
- * This file is part of PMacc.
- *
- * PMacc is free software: you can redistribute it and/or modify
- * it under the terms of either the GNU General Public License or
- * the GNU Lesser General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * PMacc is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License and the GNU Lesser General Public License
- * for more details.
- *
- * You should have received a copy of the GNU General Public License
- * and the GNU Lesser General Public License along with PMacc.
- * If not, see <http://www.gnu.org/licenses/>.
- */
-
-
-#pragma once
-
-#include "pmacc/types.hpp"
-#include <cmath>
-
-namespace pmacc
-{
-namespace algorithms
-{
-namespace math
-{
-
-/*C++98 standard define a separate version for int and float exponent*/
-
-template<>
-struct Pow<float, float>
-{
-    typedef float result;
-
-    HDINLINE result operator()(const float& base, const float& exponent)
-    {
-#ifdef __CUDA_ARCH__ /*device version*/
-        /* CUDA seems to have an optimized version for powf which is faster and (maybe) less accurate. */
-        return ::powf(base, exponent);
-#else
-        return ::pow(base, exponent);
-#endif
-
-    }
-};
-
-template<>
-struct Pow<float, int>
-{
-    typedef float result;
-
-    HDINLINE result operator()(const float& base, const int& exponent)
-    {
-#ifdef __CUDA_ARCH__ /*device version*/
-        /* @todo: There is an incompatibility with C++11 + CUDA + GCC where no device function
-         *        pow(float, int) is defined: http://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cuda-tools-title-known
-         *        Use the powf(float, float) instead which reduces performance or implement an own (faster) version
-         */
-        return ::powf(base, exponent);
-#else
-        return ::pow(base, exponent);
-#endif
-
-    }
-};
-
-} //namespace math
-} //namespace algorithms
-} // namespace pmacc
diff --git a/include/pmacc/algorithms/math/floatMath/sqrt.tpp b/include/pmacc/algorithms/math/floatMath/sqrt.tpp
deleted file mode 100644
index 4ababcb778..0000000000
--- a/include/pmacc/algorithms/math/floatMath/sqrt.tpp
+++ /dev/null
@@ -1,65 +0,0 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera, Benjamin Worpitz,
- *                     Richard Pausch
- *
- * This file is part of PMacc.
- *
- * PMacc is free software: you can redistribute it and/or modify
- * it under the terms of either the GNU General Public License or
- * the GNU Lesser General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * PMacc is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License and the GNU Lesser General Public License
- * for more details.
- *
- * You should have received a copy of the GNU General Public License
- * and the GNU Lesser General Public License along with PMacc.
- * If not, see <http://www.gnu.org/licenses/>.
- */
-
-
-#pragma once
-
-#include "pmacc/types.hpp"
-#include <cmath>
-
-
-namespace pmacc
-{
-namespace algorithms
-{
-namespace math
-{
-
-template<>
-struct Sqrt<float>
-{
-    typedef float result;
-
-    HDINLINE float operator( )(const float& value )
-    {
-        return ::sqrtf( value );
-    }
-};
-
-template<>
-struct RSqrt<float>
-{
-    typedef float result;
-
-    HDINLINE float operator( )(const float& value )
-    {
-#if !defined(__CUDACC__)
-        return 1.0f/::sqrtf(value);
-#else
-        return ::rsqrtf(value);
-#endif
-    }
-};
-
-} //namespace math
-} //namespace algorithms
-} // namespace pmacc
diff --git a/include/pmacc/algorithms/math/floatMath/trigo.tpp b/include/pmacc/algorithms/math/floatMath/trigo.tpp
index 543eb2c3e5..7b4151c1cb 100644
--- a/include/pmacc/algorithms/math/floatMath/trigo.tpp
+++ b/include/pmacc/algorithms/math/floatMath/trigo.tpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera, Richard Pausch,
+/* Copyright 2013-2021 Heiko Burau, Rene Widera, Richard Pausch,
  *                     Axel Huebl, Alexander Debus
  *
  * This file is part of PMacc.
@@ -29,128 +29,37 @@
 
 namespace pmacc
 {
-namespace algorithms
-{
-namespace math
-{
-
-template<>
-struct Sin<float>
-{
-    typedef float result;
-
-    HDINLINE float operator( )(const float& value )
-    {
-        return ::sinf( value );
-    }
-};
-
-template<>
-struct ASin<float>
-{
-    typedef float result;
-
-    HDINLINE float operator( )(const float& value)
-    {
-#if __CUDA_ARCH__
-        return ::asinf( value );
-#else
-        return ::asin( value );
-#endif
-    }
-};
-
-template<>
-struct Cos<float>
-{
-    typedef float result;
-
-    HDINLINE float operator( )(const float& value )
+    namespace math
     {
-        return ::cosf( value );
-    }
-};
-
-template<>
-struct ACos<float>
-{
-    typedef float result;
+        template<>
+        struct SinCos<float, float, float>
+        {
+            typedef void result;
 
-    HDINLINE float operator( )(const float& value)
-    {
-#if __CUDA_ARCH__
-        return ::acosf( value );
-#else
-        return ::acos( value );
-#endif
-    }
-};
-
-template<>
-struct Tan<float>
-{
-    typedef float result;
-
-    HDINLINE float operator( )(const float& value )
-    {
-        return ::tanf( value );
-    }
-};
-
-template<>
-struct ATan<float>
-{
-    typedef float result;
-
-    HDINLINE float operator( )(const float& value)
-    {
-        return ::atanf( value );
-    }
-};
-
-template<>
-struct SinCos<float, float, float>
-{
-    typedef void result;
-
-    HDINLINE void operator( )(float arg, float& sinValue, float& cosValue )
-    {
+            HDINLINE void operator()(float arg, float& sinValue, float& cosValue)
+            {
 #if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
-        sinValue = ::sinf(arg);
-        cosValue = ::cosf(arg);
+                sinValue = cupla::math::sin(arg);
+                cosValue = cupla::math::cos(arg);
 #else
-        ::sincosf( arg, &sinValue, &cosValue );
+                ::sincosf(arg, &sinValue, &cosValue);
 #endif
-    }
-};
-
-
-
-template<>
-struct Sinc<float>
-{
-    typedef float result;
-
-    HDINLINE float operator( )(const float& value )
-    {
-        if(pmacc::algorithms::math::abs(value) < FLT_EPSILON)
-            return 1.0;
-        else
-            return pmacc::algorithms::math::sin( value )/value;
-    }
-};
-
-template<>
-struct Atan2<float>
-{
-    typedef float result;
-
-    HDINLINE float operator( )(const float& val1, const float& val2 )
-    {
-        return ::atan2f( val1, val2 );
-    }
-};
-
-} //namespace math
-} //namespace algorithms
+            }
+        };
+
+        template<>
+        struct Sinc<float>
+        {
+            typedef float result;
+
+            HDINLINE float operator()(const float& value)
+            {
+                if(cupla::math::abs(value) < FLT_EPSILON)
+                    return 1.0f;
+                else
+                    return cupla::math::sin(value) / value;
+            }
+        };
+
+    } // namespace math
 } // namespace pmacc
diff --git a/include/pmacc/algorithms/reverseBits.hpp b/include/pmacc/algorithms/reverseBits.hpp
index 2724284eb6..ea4234545e 100644
--- a/include/pmacc/algorithms/reverseBits.hpp
+++ b/include/pmacc/algorithms/reverseBits.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2015-2020 Alexander Grund
+/* Copyright 2015-2021 Alexander Grund
  *
  * This file is part of PMacc.
  *
@@ -26,37 +26,37 @@
 #include <boost/type_traits.hpp>
 #include <climits>
 
-namespace pmacc{
-
-/**
- * Reverses the bit in an unsigned integral value
- *
- * Based on "Bit Twiddling Hacks" by Sean Eron Anderson
- * published in public domain. Retrieved on 13th of August, 2015 from
- * http://www.graphics.stanford.edu/~seander/bithacks.html
- *
- * @param value Value which should be reversed
- * @return Reversed value
- */
-template<typename T>
-T
-reverseBits(T value)
+namespace pmacc
 {
-    PMACC_STATIC_ASSERT_MSG( boost::is_integral<T>::value && boost::is_unsigned<T>::value,
-                             Only_allowed_for_unsigned_integral_types );
-    /* init with value (to get LSB) */
-    T result = value;
-    /* extra shift needed at end */
-    int s = sizeof(T) * CHAR_BIT - 1;
-    for (value >>= 1; value; value >>= 1)
+    /**
+     * Reverses the bits in an unsigned integral value
+     *
+     * Based on "Bit Twiddling Hacks" by Sean Eron Anderson
+     * published in public domain. Retrieved on 13th of August, 2015 from
+     * http://www.graphics.stanford.edu/~seander/bithacks.html
+     *
+     * @param value Value which should be reversed
+     * @return Reversed value
+     */
+    template<typename T>
+    T reverseBits(T value)
     {
-        result <<= 1;
-        result |= value & 1;
-        s--;
+        PMACC_STATIC_ASSERT_MSG(
+            boost::is_integral<T>::value && boost::is_unsigned<T>::value,
+            Only_allowed_for_unsigned_integral_types, );
+        /* init with value (to get LSB) */
+        T result = value;
+        /* extra shift needed at end */
+        int s = sizeof(T) * CHAR_BIT - 1;
+        for(value >>= 1; value; value >>= 1)
+        {
+            result <<= 1;
+            result |= value & 1;
+            s--;
+        }
+        /* shift when value's highest bits are zero */
+        result <<= s;
+        return result;
     }
-    /* shift when values highest bits are zero */
-    result <<= s;
-    return result;
-}
 
-}  // namespace pmacc
+} // namespace pmacc
diff --git a/include/pmacc/assert.hpp b/include/pmacc/assert.hpp
index 245701de0a..5b985f78eb 100644
--- a/include/pmacc/assert.hpp
+++ b/include/pmacc/assert.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2016-2020 Rene Widera
+/* Copyright 2016-2021 Rene Widera, Pawel Ordyna
  *
  * This file is part of PMacc.
  *
@@ -24,37 +24,70 @@
 
 #include "pmacc/debug/abortWithError.hpp"
 
-#ifdef NDEBUG
-    // debug mode is disabled
+#include <cassert>
 
-    /* `(void)0` force a semicolon after the macro function */
-#   define PMACC_ASSERT( expr ) ( (void) 0 )
+// disabled for no-debug mode or for the device compile path
+#if defined(NDEBUG) || (CUPLA_DEVICE_COMPILE == 1)
 
-    /* `(void)0` force a semicolon after the macro function */
-#   define PMACC_ASSERT_MSG( expr, msg ) ( (void) 0 )
+/* `(void)0` forces a semicolon after the macro function */
+#    define PMACC_ASSERT(expr) ((void) 0)
+
+/* `(void)0` forces a semicolon after the macro function */
+#    define PMACC_ASSERT_MSG(expr, msg) ((void) 0)
+
+#else
+
+/** assert check (host side only)
+ *
+ * if `NDEBUG` is defined: macro expands to (void)0
+ *
+ * @param expr expression to be evaluated
+ */
+#    define PMACC_ASSERT(expr) (!!(expr)) ? ((void) 0) : pmacc::abortWithError(#    expr, __FILE__, __LINE__)
+
+/** assert check with message (host side only)
+ *
+ * if `NDEBUG` is defined: macro expands to (void)0
+ *
+ * @param expr expression to be evaluated
+ * @param msg output message (of type `std::string`) which is printed if the
+ *            expression is evaluated to false
+ */
+#    define PMACC_ASSERT_MSG(expr, msg) (!!(expr)) ? ((void) 0) : pmacc::abortWithError(#    expr, __FILE__, __LINE__, msg)
+
+#endif
+
+// disabled for no-debug mode or for the host compile path
+#if defined(NDEBUG) || (CUPLA_DEVICE_COMPILE == 0)
+
+/* `(void)0` forces a semicolon after the macro function */
+#    define PMACC_DEVICE_ASSERT(expr) ((void) 0)
+
+// debug mode is disabled
+/* `(void)0` forces a semicolon after the macro function */
+#    define PMACC_DEVICE_ASSERT_MSG(expr, ...) ((void) 0)
 
 #else
 
-    // debug mode is enabled
-
-    /** assert check
-     *
-     * if `NDEBUG` is not defined: macro expands to (void)0
-     *
-     * @param expr expression to be evaluated
-     */
-#   define PMACC_ASSERT( expr )                                                \
-    ( !!(expr) ) ? ( (void) 0 ) : pmacc::abortWithError( #expr, __FILE__, __LINE__ )
-
-    /** assert check with message
-     *
-     * if `NDEBUG` is not defined: macro expands to (void)0
-     *
-     * @param expr expression to be evaluated
-     * @param msg output message (of type `std::string`) which is printed if the
-     *            expression is evaluated to false
-     */
-#   define PMACC_ASSERT_MSG( expr, msg )                                       \
-    ( !!(expr) ) ? ( (void) 0 ) : pmacc::abortWithError( #expr, __FILE__, __LINE__, msg )
+/** assert check for kernels (device side)
+ *
+ * if `NDEBUG` is defined: macro expands to (void)0
+ * @param expr expression to be evaluated
+ */
+#    define PMACC_DEVICE_ASSERT(expr) assert(expr)
 
+/** assert check with message (device side)
+ *
+ * if `NDEBUG` is defined: macro expands to (void)0
+ *
+ * Besides the usual assert message, an additional message is printed to stdout with `printf`.
+ * Pass your `printf` arguments after the evaluated expression, for example to print some local variables:
+ * @code{.cpp}
+ * PMACC_DEVICE_ASSERT_MSG((x > 0), "x was %e, a was %e", x, a);
+ * @endcode
+ *
+ * @param expr expression to be evaluated
+ * @param ... parameters passed to printf
+ */
+#    define PMACC_DEVICE_ASSERT_MSG(expr, ...) (!!(expr)) ? ((void) 0) : (printf(__VA_ARGS__), assert(expr))
 #endif
diff --git a/include/pmacc/attribute/Constexpr.hpp b/include/pmacc/attribute/Constexpr.hpp
index 92d1d48cbf..de241300ea 100644
--- a/include/pmacc/attribute/Constexpr.hpp
+++ b/include/pmacc/attribute/Constexpr.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Felix Schmitt, Heiko Burau, Rene Widera,
+/* Copyright 2013-2021 Felix Schmitt, Heiko Burau, Rene Widera,
  *                     Wolfgang Hoenig, Benjamin Worpitz,
  *                     Alexander Grund
  *
@@ -43,10 +43,10 @@
  * constexpr is captured, but also it has to remain constexpr inside a lambda.
  */
 #ifdef _MSC_VER
-#   define PMACC_CONSTEXPR_CAPTURE static constexpr
-#elif ( defined __GNUC__ ) && ( __GNUC__ > 7 )
+#    define PMACC_CONSTEXPR_CAPTURE static constexpr
+#elif(defined __GNUC__) && (__GNUC__ > 7)
 // workaround for GCC bug https://gcc.gnu.org/bugzilla/show_bug.cgi?id=91377
-#   define PMACC_CONSTEXPR_CAPTURE static constexpr
+#    define PMACC_CONSTEXPR_CAPTURE static constexpr
 #else
-#   define PMACC_CONSTEXPR_CAPTURE constexpr
+#    define PMACC_CONSTEXPR_CAPTURE constexpr
 #endif
diff --git a/include/pmacc/attribute/Fallthrough.hpp b/include/pmacc/attribute/Fallthrough.hpp
index 1d6056065d..2f48236191 100644
--- a/include/pmacc/attribute/Fallthrough.hpp
+++ b/include/pmacc/attribute/Fallthrough.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Felix Schmitt, Heiko Burau, Rene Widera,
+/* Copyright 2013-2021 Felix Schmitt, Heiko Burau, Rene Widera,
  *                     Wolfgang Hoenig, Benjamin Worpitz,
  *                     Alexander Grund
  *
@@ -36,10 +36,10 @@
  *
  * Use [[fallthrough]] in C++17
  */
-#if (BOOST_COMP_GNUC >= BOOST_VERSION_NUMBER(7,0,0))
-#   define PMACC_FALLTHROUGH [[gnu::fallthrough]]
+#if(BOOST_COMP_GNUC >= BOOST_VERSION_NUMBER(7, 0, 0))
+#    define PMACC_FALLTHROUGH [[gnu::fallthrough]]
 #elif BOOST_COMP_CLANG
-#   define PMACC_FALLTHROUGH [[clang::fallthrough]]
+#    define PMACC_FALLTHROUGH [[clang::fallthrough]]
 #else
-#   define PMACC_FALLTHROUGH ( (void)0 )
+#    define PMACC_FALLTHROUGH ((void) 0)
 #endif
diff --git a/include/pmacc/attribute/FunctionSpecifier.hpp b/include/pmacc/attribute/FunctionSpecifier.hpp
index 5b243f428c..753be0dd26 100644
--- a/include/pmacc/attribute/FunctionSpecifier.hpp
+++ b/include/pmacc/attribute/FunctionSpecifier.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Felix Schmitt, Heiko Burau, Rene Widera,
+/* Copyright 2013-2021 Felix Schmitt, Heiko Burau, Rene Widera,
  *                     Wolfgang Hoenig, Benjamin Worpitz,
  *                     Alexander Grund
  *
@@ -36,9 +36,9 @@
  * 0 for host compilation
  */
 #ifndef __CUDA_ARCH__
-#   define PMACC_CUDA_ARCH 0
+#    define PMACC_CUDA_ARCH 0
 #else
-#   define PMACC_CUDA_ARCH __CUDA_ARCH__
+#    define PMACC_CUDA_ARCH __CUDA_ARCH__
 #endif
 
 /** PMacc global identifier for CUDA kernel */
@@ -59,7 +59,7 @@
  * Most cases can solved by #ifdef __CUDA_ARCH__ or #ifdef __CUDACC__.
  */
 #if defined(__CUDACC__)
-#   define PMACC_NO_NVCC_HDWARNING _Pragma("hd_warning_disable")
+#    define PMACC_NO_NVCC_HDWARNING _Pragma("hd_warning_disable")
 #else
-#   define PMACC_NO_NVCC_HDWARNING
+#    define PMACC_NO_NVCC_HDWARNING
 #endif
diff --git a/include/pmacc/boost_workaround.hpp b/include/pmacc/boost_workaround.hpp
new file mode 100644
index 0000000000..ad755a32eb
--- /dev/null
+++ b/include/pmacc/boost_workaround.hpp
@@ -0,0 +1,44 @@
+/* Copyright 2020-2021 Rene Widera
+ *
+ * This file is part of PMacc.
+ *
+ * PMacc is free software: you can redistribute it and/or modify
+ * it under the terms of either the GNU General Public License or
+ * the GNU Lesser General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * PMacc is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License and the GNU Lesser General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * and the GNU Lesser General Public License along with PMacc.
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+#pragma once
+
+/** @file This file should be included in each `cpp`-file before any other boost include
+ * to work around compiler errors when compiling with clang-cuda and boost <1.69.0
+ *
+ * https://github.com/ComputationalRadiationPhysics/picongpu/issues/3294
+ */
+#include <boost/version.hpp>
+#if(BOOST_VERSION < 106900 && defined(__CUDACC__) && defined(__clang__))
+#    if defined(__CUDACC__)
+#        include <boost/config/compiler/nvcc.hpp>
+#    endif
+#    if(!defined(__ibmxl__))
+#        include <boost/config/compiler/clang.hpp>
+#    endif
+#    undef __CUDACC__
+#    include <boost/config/detail/select_compiler_config.hpp>
+#    define __CUDACC__
+#endif
+/* workaround for compile error with clang-cuda
+ * boost/type_traits/is_base_and_derived.hpp:142:25: error: invalid application of 'sizeof' to an incomplete type
+ * 'boost::in_place_factory_base' BOOST_STATIC_ASSERT(sizeof(B) != 0);
+ */
+#include <boost/optional/optional.hpp>
diff --git a/include/pmacc/communication/AsyncCommunication.hpp b/include/pmacc/communication/AsyncCommunication.hpp
index 545f1e8499..5998be388b 100644
--- a/include/pmacc/communication/AsyncCommunication.hpp
+++ b/include/pmacc/communication/AsyncCommunication.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2015-2020 Alexander Grund
+/* Copyright 2015-2021 Alexander Grund
  *
  * This file is part of PMacc.
  *
@@ -21,53 +21,54 @@
 
 #pragma once
 
-namespace pmacc{
-namespace communication {
-
-    /**
-     * Wrapper to convert a bool into a type
-     */
-    template<bool T_value>
-    struct Bool2Type;
+namespace pmacc
+{
+    namespace communication
+    {
+        /**
+         * Wrapper to convert a bool into a type
+         */
+        template<bool T_value>
+        struct Bool2Type;
 
-    /**
-     * Implementations of \see AsyncCommunication should specialize this,
-     * but it is not intended to be called directly. Use \see AsyncCommunication
-     *
-     * The 2nd template parameter can be used to check for conditions on
-     * templated implementations. E.g.:
-     *
-     *     template<typename T_Data>
-     *     struct AsyncCommunicationImpl<
-     *         T_Data,
-     *         Bool2Type< boost::is_integral<T_Data>::value >
-     *     >{...}
-     */
-    template<typename T_Data, typename T_IsSpecialized = Bool2Type<true> >
-    struct AsyncCommunicationImpl;
+        /**
+         * Implementations of \see AsyncCommunication should specialize this,
+         * but it is not intended to be called directly. Use \see AsyncCommunication
+         *
+         * The 2nd template parameter can be used to check for conditions on
+         * templated implementations. E.g.:
+         *
+         *     template<typename T_Data>
+         *     struct AsyncCommunicationImpl<
+         *         T_Data,
+         *         Bool2Type< boost::is_integral<T_Data>::value >
+         *     >{...}
+         */
+        template<typename T_Data, typename T_IsSpecialized = Bool2Type<true>>
+        struct AsyncCommunicationImpl;
 
-    /**
-     * This policy starts an asynchronous communication of the given data
-     * (e.g. a particle species)
-     *
-     * It must be a functor with signature EventTask(T_Data&, EventTask parentEvent)
-     * but can be templated (again) over T_Data to get the actual type. This
-     * is helpful for generic implementations that apply to T_Data and all
-     * derived classes but want to use the possibly more derived type
-     *
-     * For different T_Data types you can either specialize this or the more
-     * generic \see AsyncCommunicationImpl
-     */
-    template<typename T_Data>
-    struct AsyncCommunication: public AsyncCommunicationImpl<T_Data>
-    {};
+        /**
+         * This policy starts an asynchronous communication of the given data
+         * (e.g. a particle species)
+         *
+         * It must be a functor with signature EventTask(T_Data&, EventTask parentEvent)
+         * but can be templated (again) over T_Data to get the actual type. This
+         * is helpful for generic implementations that apply to T_Data and all
+         * derived classes but want to use the possibly more derived type
+         *
+         * For different T_Data types you can either specialize this or the more
+         * generic \see AsyncCommunicationImpl
+         */
+        template<typename T_Data>
+        struct AsyncCommunication : public AsyncCommunicationImpl<T_Data>
+        {
+        };
 
-    template<typename T_Data>
-    EventTask
-    asyncCommunication(T_Data& data, EventTask parent)
-    {
-        return AsyncCommunication<T_Data>()(data, parent);
-    }
+        template<typename T_Data>
+        EventTask asyncCommunication(T_Data& data, EventTask parent)
+        {
+            return AsyncCommunication<T_Data>()(data, parent);
+        }
 
-}  // namespace communication
-}  // namespace pmacc
+    } // namespace communication
+} // namespace pmacc
diff --git a/include/pmacc/communication/CommunicatorMPI.hpp b/include/pmacc/communication/CommunicatorMPI.hpp
index 416169a51c..676a3dbad3 100644
--- a/include/pmacc/communication/CommunicatorMPI.hpp
+++ b/include/pmacc/communication/CommunicatorMPI.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Felix Schmitt, Heiko Burau, Rene Widera,
+/* Copyright 2013-2021 Axel Huebl, Felix Schmitt, Heiko Burau, Rene Widera,
  *                     Wolfgang Hoenig, Benjamin Worpitz, Alexander Grund
  *
  * This file is part of PMacc.
@@ -36,409 +36,418 @@
 
 namespace pmacc
 {
-
-namespace detail
-{
-    template <unsigned T_DIM>
-    struct LogRankCoords;
-
-    template <>
-    struct LogRankCoords<DIM1>
+    namespace detail
     {
-        void operator()(int rank, const int (&coords)[DIM1]) const
+        template<unsigned T_DIM>
+        struct LogRankCoords;
+
+        template<>
+        struct LogRankCoords<DIM1>
         {
-            log<ggLog::MPI>("Rank: %1% ; coords %2%") % rank % coords[0];
-        }
-    };
-    template <>
-    struct LogRankCoords<DIM2>
-    {
-        void operator()(int rank, const int (&coords)[DIM2]) const
+            void operator()(int rank, const int (&coords)[DIM1]) const
+            {
+                log<ggLog::MPI>("Rank: %1% ; coords %2%") % rank % coords[0];
+            }
+        };
+        template<>
+        struct LogRankCoords<DIM2>
         {
-            log<ggLog::MPI>("Rank: %1% ; coords %2% %3%") % rank % coords[0] % coords[1];
-        }
-    };
-    template <>
-    struct LogRankCoords<DIM3>
-    {
-        void operator()(int rank, const int (&coords)[DIM3]) const
+            void operator()(int rank, const int (&coords)[DIM2]) const
+            {
+                log<ggLog::MPI>("Rank: %1% ; coords %2% %3%") % rank % coords[0] % coords[1];
+            }
+        };
+        template<>
+        struct LogRankCoords<DIM3>
         {
-            log<ggLog::MPI>("Rank: %1% ; coords %2% %3% %4%") % rank % coords[0] % coords[1] % coords[2];
-        }
-    };
-
-}
+            void operator()(int rank, const int (&coords)[DIM3]) const
+            {
+                log<ggLog::MPI>("Rank: %1% ; coords %2% %3% %4%") % rank % coords[0] % coords[1] % coords[2];
+            }
+        };
 
-/*! communication via MPI
- */
-template <unsigned DIM>
-class CommunicatorMPI : public ICommunicator
-{
-public:
+    } // namespace detail
 
-    /*! ctor
+    /*! communication via MPI
      */
-    CommunicatorMPI() : hostRank(0)
+    template<unsigned DIM>
+    class CommunicatorMPI : public ICommunicator
     {
-        //MPI_Init(nullptr, nullptr);
-    }
-
-    /*! dtor
-     *
-     * calls MPI_Finalize
-     */
-    virtual ~CommunicatorMPI()
-    {}
-
-    virtual int getRank()
-    {
-        return mpiRank;
-    }
-
-    virtual int getSize()
-    {
-        return mpiSize;
-    }
-
-    MPI_Comm getMPIComm() const
-    {
-        return topology;
-    }
+    public:
+        /*! ctor
+         */
+        CommunicatorMPI() : hostRank(0)
+        {
+            // MPI_Init(nullptr, nullptr);
+        }
 
-    MPI_Info getMPIInfo() const
-    {
-        return MPI_INFO_NULL;
-    }
+        /*! dtor
+         *
+         * the body is empty, MPI_Finalize is not called here
+         */
+        virtual ~CommunicatorMPI()
+        {
+        }
 
-    DataSpace<DIM3> getPeriodic() const
-    {
-        return this->periodic;
-    }
-
-    /*! initializes all processes to build a 3D-grid
-     *
-     * @param nodes number of GPU nodes in each dimension
-     * @param periodic specifying whether the grid is periodic (1) or not (0) in each dimension
-     *
-     * \warning throws invalid argument if cx*cy*cz != totalnodes
-     */
-    void init(DataSpace<DIM3> numberProcesses, DataSpace<DIM3> periodic)
-    {
-        this->periodic = periodic;
+        virtual int getRank()
+        {
+            return mpiRank;
+        }
 
-        //check if parameters are correct
-        MPI_CHECK(MPI_Comm_size(MPI_COMM_WORLD, &mpiSize));
+        virtual int getSize()
+        {
+            return mpiSize;
+        }
 
-        if (numberProcesses.productOfComponents() != mpiSize)
+        MPI_Comm getMPIComm() const
         {
-            throw std::invalid_argument("wrong parameters or wrong mpirun-call!");
+            return topology;
         }
 
-        //1. create Communicator (computing_comm) of computing nodes (ranks 0...n)
-        MPI_Comm computing_comm = MPI_COMM_WORLD;
+        MPI_Info getMPIInfo() const
+        {
+            return MPI_INFO_NULL;
+        }
 
-        yoffset = 0;
+        DataSpace<DIM3> getPeriodic() const
+        {
+            return this->periodic;
+        }
 
-        // 2. create topology
+        /*! initializes all processes to build a 3D-grid
+         *
+         * @param numberProcesses number of processes (GPU nodes) in each dimension
+         * @param periodic specifying whether the grid is periodic (1) or not (0) in each dimension
+         *
+         * \warning throws std::invalid_argument if the product of numberProcesses differs from the MPI world size
+         */
+        void init(DataSpace<DIM3> numberProcesses, DataSpace<DIM3> periodic)
+        {
+            this->periodic = periodic;
 
-        dims[0] = numberProcesses.x();
-        dims[1] = numberProcesses.y();
-        dims[2] = numberProcesses.z();
+            // check if parameters are correct
+            MPI_CHECK(MPI_Comm_size(MPI_COMM_WORLD, &mpiSize));
 
-        topology = MPI_COMM_NULL;
+            if(numberProcesses.productOfComponents() != mpiSize)
+            {
+                throw std::invalid_argument("wrong parameters or wrong mpirun-call!");
+            }
 
-        int periods[] = {periodic.x(), periodic.y(), periodic.z()};
+            // 1. create Communicator (computing_comm) of computing nodes (ranks 0...n)
+            MPI_Comm computing_comm = MPI_COMM_WORLD;
 
-        /*create new communicator based on cartesian coordinates*/
-        MPI_CHECK(MPI_Cart_create(computing_comm, DIM, dims, periods, 0, &topology));
+            yoffset = 0;
 
-        // 3. update Host rank
-        updateHostRank();
+            // 2. create topology
 
-        //4. update Coordinates
-        updateCoordinates();
-    }
+            dims[0] = numberProcesses.x();
+            dims[1] = numberProcesses.y();
+            dims[2] = numberProcesses.z();
 
-    /*! returns a rank number (0-n) for each host
-     *
-     * E.g. if 8 GPUs are on 2 Hosts (4 GPUs each), the GPUs on each host will get hostrank 0 to 3
-     *
-     */
-    uint32_t getHostRank()
-    {
-        return hostRank;
-    }
+            topology = MPI_COMM_NULL;
 
-    // description in ICommunicator
+            int periods[] = {periodic.x(), periodic.y(), periodic.z()};
 
-    virtual const Mask& getCommunicationMask() const
-    {
-        return communicationMask;
-    }
+            /*create new communicator based on cartesian coordinates*/
+            MPI_CHECK(MPI_Cart_create(computing_comm, DIM, dims, periods, 0, &topology));
 
-    /*! returns coordinate of this process in (via init) created grid
-     *
-     * Coordinates are between [0-cx, 0-cy, 0-cz]
-     *
-     */
-    const DataSpace<DIM> getCoordinates() const
-    {
-        return this->coordinates;
-    }
+            // 3. update Host rank
+            updateHostRank();
 
-    // description in ICommunicator
+            // 4. update Coordinates
+            updateCoordinates();
+        }
 
-    MPI_Request* startSend(uint32_t ex, const char *send_data, size_t send_data_count, uint32_t tag)
-    {
-        MPI_Request *request = new MPI_Request;
+        /*! returns a rank number (0-n) for each host
+         *
+         * E.g. if 8 GPUs are on 2 Hosts (4 GPUs each), the GPUs on each host will get hostrank 0 to 3
+         *
+         */
+        uint32_t getHostRank()
+        {
+            return hostRank;
+        }
 
-        MPI_CHECK(MPI_Isend(
-                            (void*) send_data,
-                            static_cast<int>(send_data_count),
-                            MPI_CHAR,
-                            ExchangeTypeToRank(ex),
-                            gridExchangeTag + tag,
-                            topology,
-                            request));
+        // description in ICommunicator
 
-        return request;
-    }
+        virtual const Mask& getCommunicationMask() const
+        {
+            return communicationMask;
+        }
 
-    // description in ICommunicator
+        /*! returns coordinate of this process in (via init) created grid
+         *
+         * Coordinates are between [0-cx, 0-cy, 0-cz]
+         *
+         */
+        const DataSpace<DIM> getCoordinates() const
+        {
+            return this->coordinates;
+        }
 
-    MPI_Request* startReceive(uint32_t ex, char *recv_data, size_t recv_data_max, uint32_t tag)
-    {
+        // description in ICommunicator
 
-        MPI_Request *request = new MPI_Request;
+        MPI_Request* startSend(uint32_t ex, const char* send_data, size_t send_data_count, uint32_t tag)
+        {
+            MPI_Request* request = new MPI_Request;
+
+            MPI_CHECK(MPI_Isend(
+                (void*) send_data,
+                static_cast<int>(send_data_count),
+                MPI_CHAR,
+                ExchangeTypeToRank(ex),
+                gridExchangeTag + tag,
+                topology,
+                request));
+
+            return request;
+        }
 
-        MPI_CHECK(MPI_Irecv(
-                            recv_data,
-                            static_cast<int>(recv_data_max),
-                            MPI_CHAR,
-                            ExchangeTypeToRank(ex),
-                            gridExchangeTag + tag,
-                            topology,
-                            request));
+        // description in ICommunicator
 
-        return request;
-    }
+        MPI_Request* startReceive(uint32_t ex, char* recv_data, size_t recv_data_max, uint32_t tag)
+        {
+            MPI_Request* request = new MPI_Request;
+
+            MPI_CHECK(MPI_Irecv(
+                recv_data,
+                static_cast<int>(recv_data_max),
+                MPI_CHAR,
+                ExchangeTypeToRank(ex),
+                gridExchangeTag + tag,
+                topology,
+                request));
+
+            return request;
+        }
 
-    // description in ICommunicator
+        // description in ICommunicator
 
-    bool slide()
-    {
-        // we can only slide in y direction right now
-        if(DIM < DIM2)
-            return false;
+        bool slide()
+        {
+            // we can only slide in y direction right now
+            if(DIM < DIM2)
+                return false;
 
-        // MPI_Barrier(topology);
-        yoffset--;
-        if (yoffset == -dims[1])
-            yoffset = 0;
+            // MPI_Barrier(topology);
+            yoffset--;
+            if(yoffset == -dims[1])
+                yoffset = 0;
 
-        updateCoordinates();
+            updateCoordinates();
 
-        return coordinates[1] == dims[1] - 1;
-    }
+            return coordinates[1] == dims[1] - 1;
+        }
 
-    bool setStateAfterSlides(size_t numSlides)
-    {
-        // nothing happens
-        if(numSlides == 0)
-            return false;
+        bool setStateAfterSlides(size_t numSlides)
+        {
+            // nothing happens
+            if(numSlides == 0)
+                return false;
 
-        // we can only slide in y direction right now
-        if(DIM < DIM2)
-            return false;
+            // we can only slide in y direction right now
+            if(DIM < DIM2)
+                return false;
 
-        bool result = false;
+            bool result = false;
 
-        // only need to apply (numSlides % num-gpus-y) slides
-        for (size_t i = 0; i < (numSlides % dims[1]); ++i)
-            result = slide();
+            // only need to apply (numSlides % num-gpus-y) slides
+            for(size_t i = 0; i < (numSlides % dims[1]); ++i)
+                result = slide();
 
-        return result;
-    }
+            return result;
+        }
 
 
-protected:
-    /* Set the first found non charactor or number to 0 (nullptr)
-     * name like p1223(Pid=1233) is than p1223
-     * in some MPI implementation /mpich) the hostname is unique
-     */
-    void cleanHostname(char* name)
-    {
-        for (int i = 0; i < MPI_MAX_PROCESSOR_NAME; ++i)
+    protected:
+        /* Set the first character that is not a letter, digit, '_' or '-' to 0 (string terminator)
+         * a name like p1223(Pid=1233) then becomes p1223
+         * in some MPI implementations (e.g. mpich) the hostname is unique
+         */
+        void cleanHostname(char* name)
         {
-            if (!(name[i] >= 'A' && name[i] <= 'Z') &&
-                !(name[i] >= 'a' && name[i] <= 'z') &&
-                !(name[i] >= '0' && name[i] <= '9') &&
-                !(name[i] == '_') &&
-                !(name[i] == '-') )
+            for(int i = 0; i < MPI_MAX_PROCESSOR_NAME; ++i)
             {
-                name[i] = 0;
-                return;
+                if(!(name[i] >= 'A' && name[i] <= 'Z') && !(name[i] >= 'a' && name[i] <= 'z')
+                   && !(name[i] >= '0' && name[i] <= '9') && !(name[i] == '_') && !(name[i] == '-'))
+                {
+                    name[i] = 0;
+                    return;
+                }
             }
         }
-    }
-
-    /*! gets hostRank
-     *
-     * process with MPI-rank 0 is the master and builds a map with hostname
-     * and number of already known processes on this host.
-     * Each rank will provide its hostname via MPISend and gets its HostRank
-     * from the master.
-     *
-     */
-    void updateHostRank()
-    {
-        char hostname[MPI_MAX_PROCESSOR_NAME];
-        int length;
 
-        MPI_CHECK(MPI_Get_processor_name(hostname, &length));
-        cleanHostname(hostname);
-        hostname[length++] = '\0';
-
-        MPI_CHECK(MPI_Comm_size(MPI_COMM_WORLD, &mpiSize));
-        MPI_CHECK(MPI_Comm_rank(MPI_COMM_WORLD, &mpiRank));
-
-        if (mpiRank == 0)
+        /*! gets hostRank
+         *
+         * process with MPI-rank 0 is the master and builds a map with hostname
+         * and number of already known processes on this host.
+         * Each rank will provide its hostname via MPISend and gets its HostRank
+         * from the master.
+         *
+         */
+        void updateHostRank()
         {
-            std::map<std::string, int> hosts;
-            hosts[hostname] = 0;
-            hostRank = 0;
-            for (int rank = 1; rank < mpiSize; ++rank)
-            {
-                MPI_CHECK(MPI_Recv(hostname, MPI_MAX_PROCESSOR_NAME, MPI_CHAR, rank, gridHostnameTag, MPI_COMM_WORLD, MPI_STATUS_IGNORE));
+            char hostname[MPI_MAX_PROCESSOR_NAME];
+            int length;
 
-                //printf("Hostname: %s\n", hostname);
-                int hostrank = 0;
-                if (hosts.count(hostname) > 0) hostrank = hosts[hostname] + 1;
+            MPI_CHECK(MPI_Get_processor_name(hostname, &length));
+            cleanHostname(hostname);
+            hostname[length++] = '\0';
 
-                MPI_CHECK(MPI_Send(&hostrank, 1, MPI_INT, rank, gridHostRankTag, MPI_COMM_WORLD));
+            MPI_CHECK(MPI_Comm_size(MPI_COMM_WORLD, &mpiSize));
+            MPI_CHECK(MPI_Comm_rank(MPI_COMM_WORLD, &mpiRank));
 
-                hosts[hostname] = hostrank;
+            if(mpiRank == 0)
+            {
+                std::map<std::string, int> hosts;
+                hosts[hostname] = 0;
+                hostRank = 0;
+                for(int rank = 1; rank < mpiSize; ++rank)
+                {
+                    MPI_CHECK(MPI_Recv(
+                        hostname,
+                        MPI_MAX_PROCESSOR_NAME,
+                        MPI_CHAR,
+                        rank,
+                        gridHostnameTag,
+                        MPI_COMM_WORLD,
+                        MPI_STATUS_IGNORE));
+
+                    // printf("Hostname: %s\n", hostname);
+                    int hostrank = 0;
+                    if(hosts.count(hostname) > 0)
+                        hostrank = hosts[hostname] + 1;
+
+                    MPI_CHECK(MPI_Send(&hostrank, 1, MPI_INT, rank, gridHostRankTag, MPI_COMM_WORLD));
+
+                    hosts[hostname] = hostrank;
+                }
+            }
+            else
+            {
+                MPI_CHECK(MPI_Send(hostname, length, MPI_CHAR, GridManagerRank, gridHostnameTag, MPI_COMM_WORLD));
+
+                MPI_CHECK(MPI_Recv(
+                    &hostRank,
+                    1,
+                    MPI_INT,
+                    GridManagerRank,
+                    gridHostRankTag,
+                    MPI_COMM_WORLD,
+                    MPI_STATUS_IGNORE));
+
+                // if(hostRank!=0) hostRank--; //!\todo fix mpi hostrank start with 1
             }
-
-        }
-        else
-        {
-            MPI_CHECK(MPI_Send(hostname, length, MPI_CHAR, GridManagerRank, gridHostnameTag, MPI_COMM_WORLD));
-
-            MPI_CHECK(MPI_Recv(&hostRank, 1, MPI_INT, GridManagerRank, gridHostRankTag, MPI_COMM_WORLD, MPI_STATUS_IGNORE));
-
-            // if(hostRank!=0) hostRank--; //!\todo fix mpi hostrank start with 1
-        }
-
-    }
-
-    /*! update coordinates \see getCoordinates
-     */
-    void updateCoordinates()
-    {
-        // get own coordinates
-        int coords[DIM];
-        int rank;
-
-        MPI_CHECK(MPI_Comm_rank(topology, &rank));
-        MPI_CHECK(MPI_Cart_coords(topology, rank, DIM, coords));
-
-        if (DIM >= DIM2)
-        {
-            if (dims[1] > 1)
-                coords[1] = (coords[1] + yoffset) % dims[1];
-
-            while (coords[1] < 0)
-                coords[1] += dims[1];
         }
 
-        detail::LogRankCoords<DIM>()(rank, coords);
-
-        for (uint32_t i = 0; i < DIM; ++i)
-            this->coordinates[i] = coords[i];
-
-        // init ranks of other hosts
-        int mcoords[3];
-
-        communicationMask = Mask();
-
-        for (int i = 1; i<-12 * (int) DIM + 6 * (int) DIM * (int) DIM + 9; i++)
+        /*! update coordinates \see getCoordinates
+         */
+        void updateCoordinates()
         {
-            for (uint32_t j = 0; j < DIM; j++)
-                mcoords[j] = coords[j];
+            // get own coordinates
+            int coords[DIM];
+            int rank;
 
-            Mask m(i);
-            if (m.containsExchangeType(LEFT))
-                mcoords[0]--;
-            if (m.containsExchangeType(RIGHT))
-                mcoords[0]++;
+            MPI_CHECK(MPI_Comm_rank(topology, &rank));
+            MPI_CHECK(MPI_Cart_coords(topology, rank, DIM, coords));
 
-            if (DIM >= DIM2)
+            if(DIM >= DIM2)
             {
-                if (m.containsExchangeType(TOP))
-                    mcoords[1]--;
-                if (m.containsExchangeType(BOTTOM))
-                    mcoords[1]++;
-            }
+                if(dims[1] > 1)
+                    coords[1] = (coords[1] + yoffset) % dims[1];
 
-            if (DIM == DIM3)
-            {
-                if (m.containsExchangeType(BACK))
-                    mcoords[2]++;
-                if (m.containsExchangeType(FRONT))
-                    mcoords[2]--;
+                while(coords[1] < 0)
+                    coords[1] += dims[1];
             }
 
-            bool ok = true;
-            for (uint32_t j = 0; j < DIM; j++)
-                if (periodic[j] == 0 && (mcoords[j] < 0 || mcoords[j] >= dims[j])) /*only check if no perodic for j dimension is set*/
-                    ok = false;
+            detail::LogRankCoords<DIM>()(rank, coords);
 
-            if (ok)
-            {
-                if (dims[1] > 1)
-                    mcoords[1] = (mcoords[1] - yoffset) % dims[1];
+            for(uint32_t i = 0; i < DIM; ++i)
+                this->coordinates[i] = coords[i];
 
-                MPI_CHECK(MPI_Cart_rank(topology, mcoords, &ranks[i]));
-                communicationMask = communicationMask + Mask(i);
-            }
-            else
+            // init ranks of other hosts
+            int mcoords[3];
+
+            communicationMask = Mask();
+
+            for(int i = 1; i < -12 * (int) DIM + 6 * (int) DIM * (int) DIM + 9; i++)
             {
-                ranks[i] = -1;
+                for(uint32_t j = 0; j < DIM; j++)
+                    mcoords[j] = coords[j];
+
+                Mask m(i);
+                if(m.containsExchangeType(LEFT))
+                    mcoords[0]--;
+                if(m.containsExchangeType(RIGHT))
+                    mcoords[0]++;
+
+                if(DIM >= DIM2)
+                {
+                    if(m.containsExchangeType(TOP))
+                        mcoords[1]--;
+                    if(m.containsExchangeType(BOTTOM))
+                        mcoords[1]++;
+                }
+
+                if(DIM == DIM3)
+                {
+                    if(m.containsExchangeType(BACK))
+                        mcoords[2]++;
+                    if(m.containsExchangeType(FRONT))
+                        mcoords[2]--;
+                }
+
+                bool ok = true;
+                for(uint32_t j = 0; j < DIM; j++)
+                    if(periodic[j] == 0
+                       && (mcoords[j] < 0
+                           || mcoords[j] >= dims[j])) /*only check if no periodic for j dimension is set*/
+                        ok = false;
+
+                if(ok)
+                {
+                    if(dims[1] > 1)
+                        mcoords[1] = (mcoords[1] - yoffset) % dims[1];
+
+                    MPI_CHECK(MPI_Cart_rank(topology, mcoords, &ranks[i]));
+                    communicationMask = communicationMask + Mask(i);
+                }
+                else
+                {
+                    ranks[i] = -1;
+                }
+
+                // std::cout << "rank: " << rank << " " << i << " : " << ranks[i] << std::endl;
             }
+        }
 
-            //std::cout << "rank: " << rank << " " << i << " : " << ranks[i] << std::endl;
-
+        /*! converts an exchangeType (e.g. RIGHT) to an MPI-rank
+         */
+        int ExchangeTypeToRank(uint32_t type)
+        {
+            return ranks[type];
         }
-    }
 
-    /*! converts an exchangeType (e.g. RIGHT) to an MPI-rank
-     */
-    int ExchangeTypeToRank(uint32_t type)
-    {
-        return ranks[type];
-    }
-
-private:
-    //! coordinates in GPU-Grid [0:cx-1,0:cy-1,0:cz-1]
-    DataSpace<DIM> coordinates;
-
-    DataSpace<DIM3> periodic;
-    //! MPI communicator (currently MPI_COMM_WORLD)
-    MPI_Comm topology;
-    //! array for exchangetype-to-rank conversion \see ExchangeTypeToRank
-    int ranks[27];
-    //! size of pmacc [cx,cy,cz]
-    int dims[3];
-    //! \see getCommunicationMask
-    Mask communicationMask;
-    //! rank of this process local to its host (node)
-    int hostRank;
-    //! offset for sliding window
-    int yoffset;
-
-    int mpiRank;
-    int mpiSize;
-};
-
-} //namespace pmacc
+    private:
+        //! coordinates in GPU-Grid [0:cx-1,0:cy-1,0:cz-1]
+        DataSpace<DIM> coordinates;
+
+        DataSpace<DIM3> periodic;
+        //! MPI communicator (currently MPI_COMM_WORLD)
+        MPI_Comm topology;
+        //! array for exchangetype-to-rank conversion \see ExchangeTypeToRank
+        int ranks[27];
+        //! size of pmacc [cx,cy,cz]
+        int dims[3];
+        //! \see getCommunicationMask
+        Mask communicationMask;
+        //! rank of this process local to its host (node)
+        int hostRank;
+        //! offset for sliding window
+        int yoffset;
+
+        int mpiRank;
+        int mpiSize;
+    };
+
+} // namespace pmacc
diff --git a/include/pmacc/communication/ICommunicator.hpp b/include/pmacc/communication/ICommunicator.hpp
index ae254b341d..c8bd436bec 100644
--- a/include/pmacc/communication/ICommunicator.hpp
+++ b/include/pmacc/communication/ICommunicator.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Rene Widera, Wolfgang Hoenig, Benjamin Worpitz
+/* Copyright 2013-2021 Rene Widera, Wolfgang Hoenig, Benjamin Worpitz
  *
  * This file is part of PMacc.
  *
@@ -28,63 +28,63 @@
 
 namespace pmacc
 {
-
-/*! Interface for communication
- */
-class ICommunicator
-{
-public:
-
-    /*! returns available communication partners
-     *
-     * returns a mask with neighbors, e.g. if there is a right neighbor result.isSet(RIGHT) returns true
+    /*! Interface for communication
      */
-    virtual const Mask& getCommunicationMask() const=0;
+    class ICommunicator
+    {
+    public:
+        /*! returns available communication partners
+         *
+         * returns a mask with neighbors, e.g. if there is a right neighbor result.isSet(RIGHT) returns true
+         */
+        virtual const Mask& getCommunicationMask() const = 0;
 
-    /*! moves all GPUs from top to bottom (y-coordinate)
-     *
-     * @return true if the position of gpu is switched to the end, else false
-     */
-    virtual bool slide() = 0;
+        /*! moves all GPUs from top to bottom (y-coordinate)
+         *
+         * @return true if the position of gpu is switched to the end, else false
+         */
+        virtual bool slide() = 0;
 
-    /*! slides multiple times
-     *
-     * @param[in] numSlides number of slides
-     * @return true if the position of gpu is switched to the end, else false
-     */
-    virtual bool setStateAfterSlides(size_t numSlides) = 0;
+        /*! slides multiple times
+         *
+         * @param[in] numSlides number of slides
+         * @return true if the position of gpu is switched to the end, else false
+         */
+        virtual bool setStateAfterSlides(size_t numSlides) = 0;
 
-    //!\todo Interface should not depend on MPI!
+        //!\todo Interface should not depend on MPI!
 
-    /*! starts sending via MPI (non-blocking)
-     *
-     * \param[in] ex                direction to send (enum ExchangeType)
-     * \param[in] send_data         pointer to data; should have at least send_data_count bytes
-     * \param[in] send_data_count   message size in bytes to sent
-     * \param[in] tag               user-defined tag; only message with the same tag can be exchanged (i.e. startSend and startReceive must use the same tag)
-     * \returns an request for testing if this operation has already finished
-     */
-    virtual MPI_Request* startSend(uint32_t ex, const char *send_data, size_t send_data_count, uint32_t tag) = 0;
+        /*! starts sending via MPI (non-blocking)
+         *
+         * \param[in] ex                direction to send (enum ExchangeType)
+         * \param[in] send_data         pointer to data; should have at least send_data_count bytes
+         * \param[in] send_data_count   message size in bytes to send
+         * \param[in] tag               user-defined tag; only messages with the same tag can be exchanged
+         *                              (i.e. startSend and startReceive must use the same tag)
+         * \returns a request for testing if this operation has already finished
+         */
+        virtual MPI_Request* startSend(uint32_t ex, const char* send_data, size_t send_data_count, uint32_t tag) = 0;
 
-    /*! starts receiving via MPI (non-blocking)
-     *
-     * If recv_data_max is less then send_data_count (on other host) multiple startReceive are needed!
-     *
-     * \param[in] ex                direction to send (enum ExchangeType)
-     * \param[in] recv_data         pointer to data; should have at least recv_data_max bytes
-     * \param[in] recv_data_max     maximum message size in bytes to receive
-     * \param[in] tag               user-defined tag; only message with the same tag can be exchanged (i.e. startSend and startReceive must use the same tag)
-     * \returns an request for testing if this operation has already finished
-     */
-    virtual MPI_Request* startReceive(uint32_t ex, char *recv_data, size_t recv_data_max, uint32_t tag) = 0;
+        /*! starts receiving via MPI (non-blocking)
+         *
+         * If recv_data_max is less than send_data_count (on other host), multiple startReceive calls are needed!
+         *
+         * \param[in] ex                direction to send (enum ExchangeType)
+         * \param[in] recv_data         pointer to data; should have at least recv_data_max bytes
+         * \param[in] recv_data_max     maximum message size in bytes to receive
+         * \param[in] tag               user-defined tag; only messages with the same tag can be exchanged
+         *                              (i.e. startSend and startReceive must use the same tag)
+         * \returns a request for testing if this operation has already finished
+         */
+        virtual MPI_Request* startReceive(uint32_t ex, char* recv_data, size_t recv_data_max, uint32_t tag) = 0;
 
-    virtual int getRank()=0;
+        virtual int getRank() = 0;
 
-    /*! Return which of the three directions are periodic
-     *
-     * \return for each direction a false (0) or true(1) value
-     */
-    virtual DataSpace<DIM3> getPeriodic() const = 0;
-};
+        /*! Return which of the three directions are periodic
+         *
+         * \return for each direction a false (0) or true(1) value
+         */
+        virtual DataSpace<DIM3> getPeriodic() const = 0;
+    };
 
-} //namespace pmacc
+} // namespace pmacc
diff --git a/include/pmacc/communication/manager_common.hpp b/include/pmacc/communication/manager_common.hpp
index 9aeb67eaea..71164cd55c 100644
--- a/include/pmacc/communication/manager_common.hpp
+++ b/include/pmacc/communication/manager_common.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Rene Widera, Wolfgang Hoenig, Axel Huebl
+/* Copyright 2013-2021 Rene Widera, Wolfgang Hoenig, Axel Huebl
  *
  * This file is part of PMacc.
  *
@@ -27,14 +27,30 @@
 
 const int GridManagerRank = 0;
 
-enum {
-  gridInitTag = 1,
-  gridHostnameTag = 2,
-  gridHostRankTag = 3,
-  gridExitTag = 4,
-  gridExchangeTag = 5
+enum
+{
+    gridInitTag = 1,
+    gridHostnameTag = 2,
+    gridHostRankTag = 3,
+    gridExitTag = 4,
+    gridExchangeTag = 5
 };
 
-#define MPI_CHECK(cmd) {int error = cmd; if(error!=MPI_SUCCESS){std::cerr << "<" << __FILE__ << ">:" << __LINE__; throw std::runtime_error(std::string("[MPI] Error"));}}
+#define MPI_CHECK(cmd)                                                                                                \
+    {                                                                                                                 \
+        int error = cmd;                                                                                              \
+        if(error != MPI_SUCCESS)                                                                                      \
+        {                                                                                                             \
+            std::cerr << "<" << __FILE__ << ">:" << __LINE__;                                                         \
+            throw std::runtime_error(std::string("[MPI] Error"));                                                     \
+        }                                                                                                             \
+    }
 
-#define MPI_CHECK_NO_EXCEPT(cmd) {int error = cmd; if(error!=MPI_SUCCESS){std::cerr << "[MPI] Error code " << error << " in <" << __FILE__ << ">:" << __LINE__;}}
+#define MPI_CHECK_NO_EXCEPT(cmd)                                                                                      \
+    {                                                                                                                 \
+        int error = cmd;                                                                                              \
+        if(error != MPI_SUCCESS)                                                                                      \
+        {                                                                                                             \
+            std::cerr << "[MPI] Error code " << error << " in <" << __FILE__ << ">:" << __LINE__;                     \
+        }                                                                                                             \
+    }
diff --git a/include/pmacc/cuSTL/algorithm/cudaBlock/Foreach.hpp b/include/pmacc/cuSTL/algorithm/cudaBlock/Foreach.hpp
deleted file mode 100644
index 0bc20fce24..0000000000
--- a/include/pmacc/cuSTL/algorithm/cudaBlock/Foreach.hpp
+++ /dev/null
@@ -1,98 +0,0 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera, Axel Huebl
- *
- * This file is part of PMacc.
- *
- * PMacc is free software: you can redistribute it and/or modify
- * it under the terms of either the GNU General Public License or
- * the GNU Lesser General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * PMacc is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License and the GNU Lesser General Public License
- * for more details.
- *
- * You should have received a copy of the GNU General Public License
- * and the GNU Lesser General Public License along with PMacc.
- * If not, see <http://www.gnu.org/licenses/>.
- */
-
-#pragma once
-
-#include "pmacc/types.hpp"
-#include "pmacc/algorithms/TypeCast.hpp"
-#include "pmacc/math/vector/Int.hpp"
-#include "pmacc/math/Vector.hpp"
-#include "pmacc/math/VectorOperations.hpp"
-
-namespace pmacc
-{
-namespace algorithm
-{
-namespace cudaBlock
-{
-
-#ifndef FOREACH_KERNEL_MAX_PARAMS
-#define FOREACH_KERNEL_MAX_PARAMS 4
-#endif
-
-#define SHIFTACCESS_CURSOR(Z, N, _) c ## N [pos]
-
-#define FOREACH_OPERATOR(Z, N, _)                                                  \
-    /*      <             , typename C0, ..., typename C(N-1)  ,              > */ \
-    template<typename Zone, BOOST_PP_ENUM_PARAMS(N, typename C), typename Functor, typename T_Acc> \
-    /*                     (      C0 c0, ..., C(N-1) c(N-1)           ,       ) */ \
-    DINLINE void operator()(T_Acc const & acc, Zone, BOOST_PP_ENUM_BINARY_PARAMS(N, C, c), const Functor& functor) \
-    {                                                                              \
-        const int dataVolume = math::CT::volume<typename Zone::Size>::type::value; \
-        const int blockVolume = math::CT::volume<BlockDim>::type::value;           \
-                                                                                   \
-        typedef typename math::Int<Zone::dim> PosType;                             \
-        using namespace pmacc::algorithms::precisionCast;                          \
-                                                                                   \
-        for(int i = this->linearThreadIdx; i < dataVolume; i += blockVolume)       \
-        {                                                                          \
-            PosType pos = Zone::Offset::toRT() +                                   \
-                          precisionCast<typename PosType::type>(                   \
-                            math::MapToPos<Zone::dim>()( typename Zone::Size(), i ) ); \
-            functor(acc, BOOST_PP_ENUM(N, SHIFTACCESS_CURSOR, _));                     \
-        }                                                                          \
-    }
-
-/** Foreach algorithm that is executed by one cuda thread block
- *
- * \tparam BlockDim 3D compile-time vector (pmacc::math::CT::Int) of the size of the cuda blockDim.
- *
- * BlockDim could also be obtained from cuda itself at runtime but
- * it is faster to know it at compile-time.
- */
-template<typename BlockDim>
-struct Foreach
-{
-private:
-    const int linearThreadIdx;
-public:
-
-    DINLINE Foreach(int linearThreadIdx) : linearThreadIdx(linearThreadIdx) {}
-
-    /* operator()(zone, cursor0, cursor1, ..., cursorN-1, functor or lambdaFun)
-     *
-     * \param zone compile-time zone object, see zone::CT::SphericZone. (e.g. ContainerType::Zone())
-     * \param cursorN cursor for the N-th data source (e.g. containerObj.origin())
-     * \param functor or lambdaFun either a functor with N arguments or a N-ary lambda function (e.g. _1 = _2)
-     *
-     * The functor or lambdaFun is called for each cell within the zone.
-     * It is called like functor(*cursor0(cellId), ..., *cursorN(cellId))
-     *
-     */
-    BOOST_PP_REPEAT_FROM_TO(1, BOOST_PP_INC(FOREACH_KERNEL_MAX_PARAMS), FOREACH_OPERATOR, _)
-};
-
-#undef SHIFTACCESS_CURSOR
-#undef FOREACH_OPERATOR
-
-} // cudaBlock
-} // algorithm
-} // pmacc
diff --git a/include/pmacc/cuSTL/algorithm/cuplaBlock/Foreach.hpp b/include/pmacc/cuSTL/algorithm/cuplaBlock/Foreach.hpp
new file mode 100644
index 0000000000..c840bc430a
--- /dev/null
+++ b/include/pmacc/cuSTL/algorithm/cuplaBlock/Foreach.hpp
@@ -0,0 +1,102 @@
+/* Copyright 2013-2021 Heiko Burau, Rene Widera, Axel Huebl
+ *
+ * This file is part of PMacc.
+ *
+ * PMacc is free software: you can redistribute it and/or modify
+ * it under the terms of either the GNU General Public License or
+ * the GNU Lesser General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * PMacc is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License and the GNU Lesser General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * and the GNU Lesser General Public License along with PMacc.
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "pmacc/types.hpp"
+#include "pmacc/algorithms/TypeCast.hpp"
+#include "pmacc/math/vector/Int.hpp"
+#include "pmacc/math/Vector.hpp"
+#include "pmacc/math/VectorOperations.hpp"
+
+namespace pmacc
+{
+    namespace algorithm
+    {
+        namespace cuplaBlock
+        {
+#ifndef FOREACH_KERNEL_MAX_PARAMS
+#    define FOREACH_KERNEL_MAX_PARAMS 4
+#endif
+
+#define SHIFTACCESS_CURSOR(Z, N, _) c##N[pos]
+
+#define FOREACH_OPERATOR(Z, N, _)                                                                                     \
+    /*      <             , typename C0, ..., typename C(N-1)  ,              > */                                    \
+    template<                                                                                                         \
+        typename Zone,                                                                                                \
+        BOOST_PP_ENUM_PARAMS(N, typename C),                                                                          \
+        typename Functor,                                                                                             \
+        typename T_Acc> /*                     (      C0 c0, ..., C(N-1) c(N-1)           ,       ) */                \
+    DINLINE void operator()(T_Acc const& acc, Zone, BOOST_PP_ENUM_BINARY_PARAMS(N, C, c), const Functor& functor)     \
+    {                                                                                                                 \
+        const int dataVolume = math::CT::volume<typename Zone::Size>::type::value;                                    \
+        const int blockVolume = math::CT::volume<BlockDim>::type::value;                                              \
+                                                                                                                      \
+        typedef typename math::Int<Zone::dim> PosType;                                                                \
+        using namespace pmacc::algorithms::precisionCast;                                                             \
+                                                                                                                      \
+        for(int i = this->linearThreadIdx; i < dataVolume; i += blockVolume)                                          \
+        {                                                                                                             \
+            PosType pos = Zone::Offset::toRT()                                                                        \
+                + precisionCast<typename PosType::type>(math::MapToPos<Zone::dim>()(typename Zone::Size(), i));       \
+            functor(acc, BOOST_PP_ENUM(N, SHIFTACCESS_CURSOR, _));                                                    \
+        }                                                                                                             \
+    }
+
+            /** Foreach algorithm that is executed by one cupla thread block
+             *
+             * \tparam BlockDim 3D compile-time vector (pmacc::math::CT::Int) of the size of the cupla blockDim.
+             *
+             * BlockDim could also be obtained from cupla itself at runtime but
+             * it is faster to know it at compile-time.
+             */
+            template<typename BlockDim>
+            struct Foreach
+            {
+            private:
+                const int linearThreadIdx;
+
+            public:
+                DINLINE Foreach(int linearThreadIdx) : linearThreadIdx(linearThreadIdx)
+                {
+                }
+
+                /* operator()(acc, zone, cursor0, cursor1, ..., cursorN-1, functor or lambdaFun)
+                 *
+                 * \param zone compile-time zone object, see zone::CT::SphericZone. (e.g. ContainerType::Zone())
+                 * \param cursorN cursor for the N-th data source (e.g. containerObj.origin())
+                 * \param functor or lambdaFun either a functor with N arguments or an N-ary lambda function
+                 * (e.g. _1 = _2)
+                 *
+                 * The functor or lambdaFun is called for each cell within the zone.
+                 * It is called like functor(acc, cursor0[cellId], ..., cursorN-1[cellId])
+                 *
+                 */
+                BOOST_PP_REPEAT_FROM_TO(1, BOOST_PP_INC(FOREACH_KERNEL_MAX_PARAMS), FOREACH_OPERATOR, _)
+            };
+
+#undef SHIFTACCESS_CURSOR
+#undef FOREACH_OPERATOR
+
+        } // namespace cuplaBlock
+    } // namespace algorithm
+} // namespace pmacc
diff --git a/include/pmacc/cuSTL/algorithm/functor/Add.hpp b/include/pmacc/cuSTL/algorithm/functor/Add.hpp
index 85b0dc825b..964f8c3214 100644
--- a/include/pmacc/cuSTL/algorithm/functor/Add.hpp
+++ b/include/pmacc/cuSTL/algorithm/functor/Add.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2017-2020 Heiko Burau
+/* Copyright 2017-2021 Heiko Burau
  *
  * This file is part of PMacc.
  *
@@ -26,38 +26,25 @@
 
 namespace pmacc
 {
-namespace algorithm
-{
-namespace functor
-{
-
-    struct Add
+    namespace algorithm
     {
-        template< typename T_Type >
-        HDINLINE T_Type
-        operator()(
-            T_Type const & first,
-            T_Type const & second
-        ) const
+        namespace functor
         {
-            return first + second;
-        }
+            struct Add
+            {
+                template<typename T_Type>
+                HDINLINE T_Type operator()(T_Type const& first, T_Type const& second) const
+                {
+                    return first + second;
+                }
 
-        template<
-            typename T_Type,
-            typename T_Acc
-        >
-        HDINLINE T_Type
-        operator()(
-            T_Acc const &,
-            T_Type const & first,
-            T_Type const & second
-        ) const
-        {
-            return first + second;
-        }
-    };
+                template<typename T_Type, typename T_Acc>
+                HDINLINE T_Type operator()(T_Acc const&, T_Type const& first, T_Type const& second) const
+                {
+                    return first + second;
+                }
+            };
 
-} // functor
-} // algorithm
-} // pmacc
+        } // namespace functor
+    } // namespace algorithm
+} // namespace pmacc
diff --git a/include/pmacc/cuSTL/algorithm/functor/AssignValue.hpp b/include/pmacc/cuSTL/algorithm/functor/AssignValue.hpp
index 710b88eaee..e8697448d3 100644
--- a/include/pmacc/cuSTL/algorithm/functor/AssignValue.hpp
+++ b/include/pmacc/cuSTL/algorithm/functor/AssignValue.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2017-2020 Heiko Burau
+/* Copyright 2017-2021 Heiko Burau
  *
  * This file is part of PMacc.
  *
@@ -26,39 +26,33 @@
 
 namespace pmacc
 {
-namespace algorithm
-{
-namespace functor
-{
-
-    template< typename T_Type>
-    struct AssignValue
+    namespace algorithm
     {
-        using Type = T_Type;
-        Type m_value;
-
-        HDINLINE
-        AssignValue( Type const & value ) :
-            m_value( value )
-        { }
-
-        HDINLINE void
-        operator()( Type & arg ) const
-        {
-            arg = m_value;
-        }
-
-        template< typename T_Acc >
-        HDINLINE void
-        operator()(
-            T_Acc const &,
-            Type & arg
-        ) const
+        namespace functor
         {
-            arg = m_value;
-        }
-    };
-
-} // functor
-} // algorithm
-} // pmacc
+            template<typename T_Type>
+            struct AssignValue
+            {
+                using Type = T_Type;
+                Type m_value;
+
+                HDINLINE
+                AssignValue(Type const& value) : m_value(value)
+                {
+                }
+
+                HDINLINE void operator()(Type& arg) const
+                {
+                    arg = m_value;
+                }
+
+                template<typename T_Acc>
+                HDINLINE void operator()(T_Acc const&, Type& arg) const
+                {
+                    arg = m_value;
+                }
+            };
+
+        } // namespace functor
+    } // namespace algorithm
+} // namespace pmacc
diff --git a/include/pmacc/cuSTL/algorithm/functor/GetComponent.hpp b/include/pmacc/cuSTL/algorithm/functor/GetComponent.hpp
index f566d0e9fb..329cdfbe4b 100644
--- a/include/pmacc/cuSTL/algorithm/functor/GetComponent.hpp
+++ b/include/pmacc/cuSTL/algorithm/functor/GetComponent.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2017-2020 Heiko Burau
+/* Copyright 2017-2021 Heiko Burau
  *
  * This file is part of PMacc.
  *
@@ -26,43 +26,34 @@
 
 namespace pmacc
 {
-namespace algorithm
-{
-namespace functor
-{
-
-    template< typename T_Type >
-    struct GetComponent
+    namespace algorithm
     {
-        using Type = T_Type;
-        using result_type = Type;
-        uint32_t m_component;
-
-        HDINLINE GetComponent( uint32_t const component ) :
-            m_component( component )
-        { }
-
-        template<
-            typename Array,
-            typename T_Acc
-        >
-        HDINLINE Type &
-        operator()(
-            T_Acc const &,
-            Array & array
-        ) const
-        {
-            return array[ m_component ];
-        }
-
-        template< typename Array >
-        HDINLINE Type &
-        operator()( Array & array ) const
+        namespace functor
         {
-            return array[ m_component ];
-        }
-    };
-
-} // functor
-} // algorithm
-} // pmacc
+            template<typename T_Type>
+            struct GetComponent
+            {
+                using Type = T_Type;
+                using result_type = Type;
+                uint32_t m_component;
+
+                HDINLINE GetComponent(uint32_t const component) : m_component(component)
+                {
+                }
+
+                template<typename Array, typename T_Acc>
+                HDINLINE Type& operator()(T_Acc const&, Array& array) const
+                {
+                    return array[m_component];
+                }
+
+                template<typename Array>
+                HDINLINE Type& operator()(Array& array) const
+                {
+                    return array[m_component];
+                }
+            };
+
+        } // namespace functor
+    } // namespace algorithm
+} // namespace pmacc
diff --git a/include/pmacc/cuSTL/algorithm/host/Foreach.hpp b/include/pmacc/cuSTL/algorithm/host/Foreach.hpp
index ab7f31029d..a192c35133 100644
--- a/include/pmacc/cuSTL/algorithm/host/Foreach.hpp
+++ b/include/pmacc/cuSTL/algorithm/host/Foreach.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera
+/* Copyright 2013-2021 Heiko Burau, Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -33,96 +33,98 @@
 
 namespace pmacc
 {
-namespace algorithm
-{
-namespace host
-{
-
+    namespace algorithm
+    {
+        namespace host
+        {
 #ifndef FOREACH_HOST_MAX_PARAMS
-#define FOREACH_HOST_MAX_PARAMS 4
+#    define FOREACH_HOST_MAX_PARAMS 4
 #endif
 
-#define SHIFT_CURSOR_ZONE(Z, N, _) C ## N c ## N ## _shifted = c ## N (p_zone.offset);
-#define SHIFTACCESS_SHIFTEDCURSOR(Z, N, _) c ## N ## _shifted [cellIndex]
+#define SHIFT_CURSOR_ZONE(Z, N, _) C##N c##N##_shifted = c##N(p_zone.offset);
+#define SHIFTACCESS_SHIFTEDCURSOR(Z, N, _) c##N##_shifted[cellIndex]
 
-namespace detail
-{
-    /** Return pseudo 3D-range of the zone as math::Int<dim> */
-    template< uint32_t dim >
-    struct GetRange;
+            namespace detail
+            {
+                /** Return pseudo 3D-range of the zone as math::Int<dim> */
+                template<uint32_t dim>
+                struct GetRange;
 
-    template<>
-    struct GetRange<3u>
-    {
-        template<typename Zone>
-        const math::Int<3u> operator()(const Zone p_zone) const
-        {
-            return math::Int<3u>(p_zone.size.x(), p_zone.size.y(), p_zone.size.z());
-        }
-    };
-    template<>
-    struct GetRange<2u>
-    {
-        template<typename Zone>
-        const math::Int<3u> operator()(const Zone p_zone) const
-        {
-            return math::Int<3u>(p_zone.size.x(), p_zone.size.y(), 1);
-        }
-    };
-    template<>
-    struct GetRange<1u>
-    {
-        template<typename Zone>
-        const math::Int<3u> operator()(const Zone p_zone) const
-        {
-            return math::Int<3u>(p_zone.size.x(), 1, 1);
-        }
-    };
-} // namespace detail
+                template<>
+                struct GetRange<3u>
+                {
+                    template<typename Zone>
+                    const math::Int<3u> operator()(const Zone p_zone) const
+                    {
+                        return math::Int<3u>(p_zone.size.x(), p_zone.size.y(), p_zone.size.z());
+                    }
+                };
+                template<>
+                struct GetRange<2u>
+                {
+                    template<typename Zone>
+                    const math::Int<3u> operator()(const Zone p_zone) const
+                    {
+                        return math::Int<3u>(p_zone.size.x(), p_zone.size.y(), 1);
+                    }
+                };
+                template<>
+                struct GetRange<1u>
+                {
+                    template<typename Zone>
+                    const math::Int<3u> operator()(const Zone p_zone) const
+                    {
+                        return math::Int<3u>(p_zone.size.x(), 1, 1);
+                    }
+                };
+            } // namespace detail
 
-#define FOREACH_OPERATOR(Z, N, _)                                              \
-    template<typename Zone, BOOST_PP_ENUM_PARAMS(N, typename C), typename Functor, typename T_Acc> \
-    void operator()(const T_Acc& acc, const Zone& p_zone, BOOST_PP_ENUM_BINARY_PARAMS(N, C, c), const Functor& functor) \
-    {                                                                          \
-        BOOST_PP_REPEAT(N, SHIFT_CURSOR_ZONE, _)                               \
-                                                                               \
-        detail::GetRange<Zone::dim> getRange;                                  \
-        for(int z = 0; z < getRange(p_zone).z(); z++)                           \
-        {                                                                      \
-            for(int y = 0; y < getRange(p_zone).y(); y++)                       \
-            {                                                                  \
-                for(int x = 0; x < getRange(p_zone).x(); x++)                   \
-                {                                                              \
-                    math::Int<Zone::dim> cellIndex =                           \
-                        math::Int<3u>(x, y, z).shrink<Zone::dim>();            \
-                    functor(acc, BOOST_PP_ENUM(N, SHIFTACCESS_SHIFTEDCURSOR, _));       \
-                }                                                              \
-            }                                                                  \
-        }                                                                      \
+#define FOREACH_OPERATOR(Z, N, _)                                                                                     \
+    template<typename Zone, BOOST_PP_ENUM_PARAMS(N, typename C), typename Functor, typename T_Acc>                    \
+    void operator()(                                                                                                  \
+        const T_Acc& acc,                                                                                             \
+        const Zone& p_zone,                                                                                           \
+        BOOST_PP_ENUM_BINARY_PARAMS(N, C, c),                                                                         \
+        const Functor& functor)                                                                                       \
+    {                                                                                                                 \
+        BOOST_PP_REPEAT(N, SHIFT_CURSOR_ZONE, _)                                                                      \
+                                                                                                                      \
+        detail::GetRange<Zone::dim> getRange;                                                                         \
+        for(int z = 0; z < getRange(p_zone).z(); z++)                                                                 \
+        {                                                                                                             \
+            for(int y = 0; y < getRange(p_zone).y(); y++)                                                             \
+            {                                                                                                         \
+                for(int x = 0; x < getRange(p_zone).x(); x++)                                                         \
+                {                                                                                                     \
+                    math::Int<Zone::dim> cellIndex = math::Int<3u>(x, y, z).shrink<Zone::dim>();                      \
+                    functor(acc, BOOST_PP_ENUM(N, SHIFTACCESS_SHIFTEDCURSOR, _));                                     \
+                }                                                                                                     \
+            }                                                                                                         \
+        }                                                                                                             \
     }
 
-/** Foreach algorithm (restricted to 3D)
- */
-struct Foreach
-{
-    /* operator()(zone, cursor0, cursor1, ..., cursorN-1, functor or lambdaFun)
-     *
-     * \param zone Accepts currently only a zone::SphericZone object (e.g. containerObj.zone())
-     * \param cursorN cursor for the N-th data source (e.g. containerObj.origin())
-     * \param functor or lambdaFun either a functor with N arguments or a N-ary lambda function (e.g. _1 = _2)
-     *
-     * The functor or lambdaFun is called for each cell within the zone.
-     * It is called like functor(*cursor0(cellId), ..., *cursorN(cellId))
-     *
-     */
-    BOOST_PP_REPEAT_FROM_TO(1, BOOST_PP_INC(FOREACH_HOST_MAX_PARAMS), FOREACH_OPERATOR, _)
-};
+            /** Foreach algorithm (restricted to 3D)
+             */
+            struct Foreach
+            {
+                /* operator()(acc, zone, cursor0, cursor1, ..., cursorN-1, functor or lambdaFun)
+                 *
+                 * \param zone Accepts currently only a zone::SphericZone object (e.g. containerObj.zone())
+                 * \param cursorN cursor for the N-th data source (e.g. containerObj.origin())
+                 * \param functor or lambdaFun either a functor with N arguments or an N-ary lambda function
+                 * (e.g. _1 = _2)
+                 *
+                 * The functor or lambdaFun is called for each cell within the zone.
+                 * It is called like functor(acc, cursor0[cellId], ..., cursorN-1[cellId])
+                 *
+                 */
+                BOOST_PP_REPEAT_FROM_TO(1, BOOST_PP_INC(FOREACH_HOST_MAX_PARAMS), FOREACH_OPERATOR, _)
+            };
 
 #undef FOREACH_OPERATOR
 #undef SHIFT_CURSOR_ZONE
 #undef SHIFTACCESS_SHIFTEDCURSOR
 
-} // host
-} // algorithm
-} // pmacc
-
+        } // namespace host
+    } // namespace algorithm
+} // namespace pmacc
diff --git a/include/pmacc/cuSTL/algorithm/kernel/FFT.hpp b/include/pmacc/cuSTL/algorithm/kernel/FFT.hpp
index 6f099c1921..35033cae0e 100644
--- a/include/pmacc/cuSTL/algorithm/kernel/FFT.hpp
+++ b/include/pmacc/cuSTL/algorithm/kernel/FFT.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera
+/* Copyright 2013-2021 Heiko Burau, Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -23,21 +23,19 @@
 
 namespace pmacc
 {
-namespace algorithm
-{
-namespace kernel
-{
+    namespace algorithm
+    {
+        namespace kernel
+        {
+            template<int dim>
+            struct FFT
+            {
+                template<typename Zone, typename DestCursor, typename SrcCursor>
+                void operator()(const Zone& p_zone, const DestCursor& destCursor, const SrcCursor& srcCursor);
+            };
 
-template<int dim>
-struct FFT
-{
-    template<typename Zone, typename DestCursor, typename SrcCursor>
-    void operator()(const Zone& p_zone, const DestCursor& destCursor, const SrcCursor& srcCursor);
-};
-
-} // kernel
-} // algorithm
-} // pmacc
+        } // namespace kernel
+    } // namespace algorithm
+} // namespace pmacc
 
 #include "FFT.tpp"
-
diff --git a/include/pmacc/cuSTL/algorithm/kernel/FFT.tpp b/include/pmacc/cuSTL/algorithm/kernel/FFT.tpp
index c4b88c2cb4..86ae996539 100644
--- a/include/pmacc/cuSTL/algorithm/kernel/FFT.tpp
+++ b/include/pmacc/cuSTL/algorithm/kernel/FFT.tpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera
+/* Copyright 2013-2021 Heiko Burau, Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -28,24 +28,25 @@
 
 namespace pmacc
 {
-namespace algorithm
-{
-namespace kernel
-{
-
-template<>
-template<typename Zone, typename DestCursor, typename SrcCursor>
-void FFT<2>::operator()(const Zone& p_zone, const DestCursor& destCursor, const SrcCursor& srcCursor)
-{
-    cufftHandle plan;
-    CUFFT_CHECK(cufftPlan2d(&plan, p_zone.size.x(), p_zone.size.y(), CUFFT_R2C));
+    namespace algorithm
+    {
+        namespace kernel
+        {
+            template<>
+            template<typename Zone, typename DestCursor, typename SrcCursor>
+            void FFT<2>::operator()(const Zone& p_zone, const DestCursor& destCursor, const SrcCursor& srcCursor)
+            {
+                cufftHandle plan;
+                CUFFT_CHECK(cufftPlan2d(&plan, p_zone.size.x(), p_zone.size.y(), CUFFT_R2C));
 
-    CUFFT_CHECK(cufftExecR2C(plan, (cufftReal*)&(*(srcCursor(p_zone.offset))),
-                        (cufftComplex*)&(*destCursor(p_zone.offset))));
+                CUFFT_CHECK(cufftExecR2C(
+                    plan,
+                    (cufftReal*) &(*(srcCursor(p_zone.offset))),
+                    (cufftComplex*) &(*destCursor(p_zone.offset))));
 
-    CUFFT_CHECK(cufftDestroy(plan));
-}
+                CUFFT_CHECK(cufftDestroy(plan));
+            }
 
-} // kernel
-} // algorithm
-} // pmacc
+        } // namespace kernel
+    } // namespace algorithm
+} // namespace pmacc
diff --git a/include/pmacc/cuSTL/algorithm/kernel/Foreach.hpp b/include/pmacc/cuSTL/algorithm/kernel/Foreach.hpp
index fee62439d2..a954e4b4db 100644
--- a/include/pmacc/cuSTL/algorithm/kernel/Foreach.hpp
+++ b/include/pmacc/cuSTL/algorithm/kernel/Foreach.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera
+/* Copyright 2013-2021 Heiko Burau, Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -39,110 +39,88 @@
 
 namespace pmacc
 {
-namespace algorithm
-{
-namespace kernel
-{
-
+    namespace algorithm
+    {
+        namespace kernel
+        {
 #ifndef FOREACH_KERNEL_MAX_PARAMS
-#define FOREACH_KERNEL_MAX_PARAMS 4
+#    define FOREACH_KERNEL_MAX_PARAMS 4
 #endif
-#define SHIFT_CURSOR_ZONE(Z, N, _) C ## N c ## N ## _shifted = c ## N (p_zone.offset);
-#define SHIFTED_CURSOR(Z, N, _) c ## N ## _shifted
+#define SHIFT_CURSOR_ZONE(Z, N, _) C##N c##N##_shifted = c##N(p_zone.offset);
+#define SHIFTED_CURSOR(Z, N, _) c##N##_shifted
 
-#define FOREACH_OPERATOR(Z, N, _)                                                                           \
-                         /* typename C0, typename C1, ... */                                                \
-    template<typename Zone, BOOST_PP_ENUM_PARAMS(N, typename C), typename Functor>                          \
-                                    /* C0 c0, C1 c1, ... */                                                 \
-    void operator()(const Zone& p_zone, BOOST_PP_ENUM_BINARY_PARAMS(N, C, c), const Functor& functor)        \
-    {                                                                                                       \
-        /* C0 c0_shifted = c0(p_zone.offset); */                                                             \
-        /* C1 c1_shifted = c1(p_zone.offset); */                                                             \
-        /* ... */                                                                                           \
-        BOOST_PP_REPEAT(N, SHIFT_CURSOR_ZONE, _)                                                            \
-                                                                                                            \
-        auto blockSize = BlockDim::toRT();                                                                   \
-        detail::SphericMapper<Zone::dim, BlockDim> mapper;                                                  \
-        using namespace pmacc;                                                                              \
-        PMACC_KERNEL(detail::KernelForeach{})(mapper.cudaGridDim(p_zone.size), blockSize)                    \
-                  /* c0_shifted, c1_shifted, ... */                                                         \
-            (mapper, BOOST_PP_ENUM(N, SHIFTED_CURSOR, _), functor);                   \
+#define FOREACH_OPERATOR(Z, N, _)                                                                                     \
+    /* typename C0, typename C1, ... */                                                                               \
+    template<typename Zone, BOOST_PP_ENUM_PARAMS(N, typename C), typename Functor> /* C0 c0, C1 c1, ... */            \
+    void operator()(const Zone& p_zone, BOOST_PP_ENUM_BINARY_PARAMS(N, C, c), const Functor& functor)                 \
+    {                                                                                                                 \
+        /* C0 c0_shifted = c0(p_zone.offset); */                                                                      \
+        /* C1 c1_shifted = c1(p_zone.offset); */                                                                      \
+        /* ... */                                                                                                     \
+        BOOST_PP_REPEAT(N, SHIFT_CURSOR_ZONE, _)                                                                      \
+                                                                                                                      \
+        auto blockSize = BlockDim::toRT();                                                                            \
+        detail::SphericMapper<Zone::dim, BlockDim> mapper;                                                            \
+        using namespace pmacc;                                                                                        \
+        PMACC_KERNEL(detail::KernelForeach{})                                                                         \
+        (mapper.cuplaGridDim(p_zone.size), blockSize) /* c0_shifted, c1_shifted, ... */                               \
+            (mapper, BOOST_PP_ENUM(N, SHIFTED_CURSOR, _), functor);                                                   \
     }
 
-/** Foreach algorithm that calls a cuda kernel
- *
- * \tparam BlockDim 3D compile-time vector (pmacc::math::CT::Int) of the size of the cuda blockDim.
- *
- * blockDim has to fit into the computing volume.
- * E.g. (8,8,4) fits into (256, 256, 256)
- */
-template<typename BlockDim>
-struct Foreach
-{
-    /* operator()(zone, cursor0, cursor1, ..., cursorN-1, functor or lambdaFun)
-     *
-     * \param zone Accepts currently only a zone::SphericZone object (e.g. containerObj.zone())
-     * \param cursorN cursor for the N-th data source (e.g. containerObj.origin())
-     * \param functor or lambdaFun either a functor with N arguments or a N-ary lambda function (e.g. _1 = _2)
-     *
-     * The functor or lambdaFun is called for each cell within the zone.
-     * It is called like functor(*cursor0(cellId), ..., *cursorN(cellId))
-     *
-     */
-    BOOST_PP_REPEAT_FROM_TO(1, BOOST_PP_INC(FOREACH_KERNEL_MAX_PARAMS), FOREACH_OPERATOR, _)
-};
+            /** Foreach algorithm that calls a cupla kernel
+             *
+             * \tparam BlockDim 3D compile-time vector (pmacc::math::CT::Int) of the size of the cupla blockDim.
+             *
+             * blockDim has to fit into the computing volume.
+             * E.g. (8,8,4) fits into (256, 256, 256)
+             */
+            template<typename BlockDim>
+            struct Foreach
+            {
+                /* operator()(zone, cursor0, cursor1, ..., cursorN-1, functor or lambdaFun)
+                 *
+                 * \param zone Accepts currently only a zone::SphericZone object (e.g. containerObj.zone())
+                 * \param cursorN cursor for the N-th data source (e.g. containerObj.origin())
+                 * \param functor or lambdaFun either a functor with N arguments or an N-ary lambda function
+                 * (e.g. _1 = _2)
+                 *
+                 * The functor or lambdaFun is called for each cell within the zone.
+                 * It is called like functor(*cursor0(cellId), ..., *cursorN(cellId))
+                 *
+                 */
+                BOOST_PP_REPEAT_FROM_TO(1, BOOST_PP_INC(FOREACH_KERNEL_MAX_PARAMS), FOREACH_OPERATOR, _)
+            };
 
 
 #undef FOREACH_OPERATOR
 #undef SHIFT_CURSOR_ZONE
 #undef SHIFTED_CURSOR
 
-template<
-    uint32_t T_numWorkers,
-    typename BlockDim
->
-struct ForeachLockstep
-{
-
-    /* operator()(zone, functor, cursor0, cursor1, ..., cursorN-1)
-     *
-     * @param zone Accepts currently only a zone::SphericZone object (e.g. containerObj.zone())
-     * @param functor either a functor with N arguments
-     * @param args cursor for the N-th data source (e.g. containerObj.origin())
-     *
-     * The functor is called for each worker within the zone.
-     * It is called like
-     * @code[.cpp}
-     * functor(*cursor0(cellBlockOffset), ..., *cursorN(cellBlockOffset))
-     * @endcode
-     */
-    template<
-        int T_dim,
-        typename T_Functor,
-        typename... T_Args
-    >
-    void operator()(
-        zone::SphericZone< T_dim > const & p_zone,
-        T_Functor & functor,
-        T_Args ... args
-    )
-    {
-        detail::SphericMapper<
-            T_dim,
-            BlockDim
-        > mapper;
+            template<uint32_t T_numWorkers, typename BlockDim>
+            struct ForeachLockstep
+            {
+                /* operator()(zone, functor, cursor0, cursor1, ..., cursorN-1)
+                 *
+                 * @param zone Accepts currently only a zone::SphericZone object (e.g. containerObj.zone())
+                 * @param functor a functor with N arguments
+                 * @param args cursors, one per data source (e.g. containerObj.origin())
+                 *
+                 * The functor is called for each worker within the zone.
+                 * It is called like
+                 * @code{.cpp}
+                 * functor(*cursor0(cellBlockOffset), ..., *cursorN(cellBlockOffset))
+                 * @endcode
+                 */
+                template<int T_dim, typename T_Functor, typename... T_Args>
+                void operator()(zone::SphericZone<T_dim> const& p_zone, T_Functor& functor, T_Args... args)
+                {
+                    detail::SphericMapper<T_dim, BlockDim> mapper;
 
-         PMACC_KERNEL( detail::KernelForeachLockstep{ } )(
-            mapper.cudaGridDim( p_zone.size ),
-            T_numWorkers
-        )(
-            mapper,
-            functor,
-            args( p_zone.offset )...
-        );
-    }
-};
+                    PMACC_KERNEL(detail::KernelForeachLockstep{})
+                    (mapper.cuplaGridDim(p_zone.size), T_numWorkers)(mapper, functor, args(p_zone.offset)...);
+                }
+            };
 
-} // kernel
-} // algorithm
-} // pmacc
+        } // namespace kernel
+    } // namespace algorithm
+} // namespace pmacc
diff --git a/include/pmacc/cuSTL/algorithm/kernel/ForeachBlock.hpp b/include/pmacc/cuSTL/algorithm/kernel/ForeachBlock.hpp
index c6f209fad0..3ee7e6362e 100644
--- a/include/pmacc/cuSTL/algorithm/kernel/ForeachBlock.hpp
+++ b/include/pmacc/cuSTL/algorithm/kernel/ForeachBlock.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera
+/* Copyright 2013-2021 Heiko Burau, Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -38,93 +38,92 @@
 
 namespace pmacc
 {
-namespace algorithm
-{
-namespace kernel
-{
-
+    namespace algorithm
+    {
+        namespace kernel
+        {
 #ifndef FOREACH_KERNEL_MAX_PARAMS
-#define FOREACH_KERNEL_MAX_PARAMS 4
+#    define FOREACH_KERNEL_MAX_PARAMS 4
 #endif
 
-namespace detail
-{
-
-#define SHIFTACCESS_CURSOR(Z, N, _) c ## N [cellIndex]
-
-#define KERNEL_FOREACH(Z, N, _)                                                                             \
-                        /* typename C0, typename C1, ... */                                                 \
-template<typename Mapper, BOOST_PP_ENUM_PARAMS(N, typename C), typename Functor, typename T_Acc>                            \
-                                                /* C0 c0, C1 c1, ... */                                     \
-DINLINE void operator()(T_Acc const & acc, Mapper mapper, BOOST_PP_ENUM_BINARY_PARAMS(N, C, c), Functor functor) const         \
-{                                                                                                           \
-    math::Int<Mapper::dim> cellIndex(mapper(acc, dim3(blockIdx)));                                               \
-         /* c0[cellIndex], c1[cellIndex], ... */                                                            \
-    functor(acc, BOOST_PP_ENUM(N, SHIFTACCESS_CURSOR, _));                                                       \
-}
-
-struct KernelForeachBlock
-{
-
-BOOST_PP_REPEAT_FROM_TO(1, BOOST_PP_INC(FOREACH_KERNEL_MAX_PARAMS), KERNEL_FOREACH, _)
+            namespace detail
+            {
+#define SHIFTACCESS_CURSOR(Z, N, _) c##N[cellIndex]
+
+#define KERNEL_FOREACH(Z, N, _)                                                                                       \
+    /* typename C0, typename C1, ... */                                                                               \
+    template<                                                                                                         \
+        typename Mapper,                                                                                              \
+        BOOST_PP_ENUM_PARAMS(N, typename C),                                                                          \
+        typename Functor,                                                                                             \
+        typename T_Acc> /* C0 c0, C1 c1, ... */                                                                       \
+    DINLINE void operator()(T_Acc const& acc, Mapper mapper, BOOST_PP_ENUM_BINARY_PARAMS(N, C, c), Functor functor)   \
+        const                                                                                                         \
+    {                                                                                                                 \
+        math::Int<Mapper::dim> cellIndex(mapper(acc, cupla::dim3(cupla::blockIdx(acc))));                             \
+        /* c0[cellIndex], c1[cellIndex], ... */                                                                       \
+        functor(acc, BOOST_PP_ENUM(N, SHIFTACCESS_CURSOR, _));                                                        \
+    }
 
-};
+                struct KernelForeachBlock
+                {
+                    BOOST_PP_REPEAT_FROM_TO(1, BOOST_PP_INC(FOREACH_KERNEL_MAX_PARAMS), KERNEL_FOREACH, _)
+                };
 #undef KERNEL_FOREACH
 #undef SHIFTACCESS_CURSOR
 
-}
-
-#define SHIFT_CURSOR_ZONE(Z, N, _) C ## N c ## N ## _shifted = c ## N (p_zone.offset);
-#define SHIFTED_CURSOR(Z, N, _) c ## N ## _shifted
-
-#define FOREACH_OPERATOR(Z, N, _)                                                                           \
-                         /* typename C0, typename C1, ... */                                                \
-    template<typename Zone, BOOST_PP_ENUM_PARAMS(N, typename C), typename Functor>                          \
-                                     /* C0 c0, C1 c1, ... */                                                \
-    void operator()(const Zone& p_zone, BOOST_PP_ENUM_BINARY_PARAMS(N, C, c), const Functor& functor)        \
-    {                                                                                                       \
-        /* C0 c0_shifted = c0(p_zone.offset); */                                                             \
-        /* C1 c1_shifted = c1(p_zone.offset); */                                                             \
-        /* ... */                                                                                           \
-        BOOST_PP_REPEAT(N, SHIFT_CURSOR_ZONE, _)                                                            \
-                                                                                                            \
-        auto blockDim = ThreadBlock::toRT();                                                                \
-        detail::SphericMapper<Zone::dim, BlockDim> mapper;                                                  \
-        using namespace pmacc;                                                                              \
-        PMACC_KERNEL(detail::KernelForeachBlock{})(mapper.cudaGridDim(p_zone.size), blockDim)               \
-                    /* c0_shifted, c1_shifted, ... */                                                       \
-            (mapper, BOOST_PP_ENUM(N, SHIFTED_CURSOR, _), functor);                   \
+            } // namespace detail
+
+#define SHIFT_CURSOR_ZONE(Z, N, _) C##N c##N##_shifted = c##N(p_zone.offset);
+#define SHIFTED_CURSOR(Z, N, _) c##N##_shifted
+
+#define FOREACH_OPERATOR(Z, N, _)                                                                                     \
+    /* typename C0, typename C1, ... */                                                                               \
+    template<typename Zone, BOOST_PP_ENUM_PARAMS(N, typename C), typename Functor> /* C0 c0, C1 c1, ... */            \
+    void operator()(const Zone& p_zone, BOOST_PP_ENUM_BINARY_PARAMS(N, C, c), const Functor& functor)                 \
+    {                                                                                                                 \
+        /* C0 c0_shifted = c0(p_zone.offset); */                                                                      \
+        /* C1 c1_shifted = c1(p_zone.offset); */                                                                      \
+        /* ... */                                                                                                     \
+        BOOST_PP_REPEAT(N, SHIFT_CURSOR_ZONE, _)                                                                      \
+                                                                                                                      \
+        auto blockDim = ThreadBlock::toRT();                                                                          \
+        detail::SphericMapper<Zone::dim, BlockDim> mapper;                                                            \
+        using namespace pmacc;                                                                                        \
+        PMACC_KERNEL(detail::KernelForeachBlock{})                                                                    \
+        (mapper.cuplaGridDim(p_zone.size), blockDim) /* c0_shifted, c1_shifted, ... */                                \
+            (mapper, BOOST_PP_ENUM(N, SHIFTED_CURSOR, _), functor);                                                   \
     }
 
-/** Special foreach algorithm that calls a cuda kernel
- *
- * Behaves like kernel::Foreach, except that is doesn't shift the cursors cell by cell, but
- * shifts them to the top left (front) corner cell of their corresponding cuda block.
- * So if BlockDim is 4x4x4 it shifts 64 cursors to (0,0,0), 64 to (4,0,0), 64 to (8,0,0), ...
- *
- * \tparam BlockDim 3D compile-time vector (pmacc::math::CT::Int) of the size of the cuda blockDim.
- * \tparam ThreadBlock ignored
- */
-template<typename BlockDim, typename ThreadBlock = BlockDim>
-struct ForeachBlock
-{
-    /* operator()(zone, cursor0, cursor1, ..., cursorN-1, functor or lambdaFun)
-     *
-     * \param zone Accepts currently only a zone::SphericZone object (e.g. containerObj.zone())
-     * \param cursorN cursor for the N-th data source (e.g. containerObj.origin())
-     * \param functor or lambdaFun either a functor with N arguments or a N-ary lambda function (e.g. _1 = _2)
-     *
-     * It is called like functor(*cursor0(cellId), ..., *cursorN(cellId))
-     *
-     */
-    BOOST_PP_REPEAT_FROM_TO(1, BOOST_PP_INC(FOREACH_KERNEL_MAX_PARAMS), FOREACH_OPERATOR, _)
-};
+            /** Special foreach algorithm that calls a cupla kernel
+             *
+             * Behaves like kernel::Foreach, except that it doesn't shift the cursors cell by cell, but
+             * shifts them to the top left (front) corner cell of their corresponding cupla block.
+             * So if BlockDim is 4x4x4 it shifts 64 cursors to (0,0,0), 64 to (4,0,0), 64 to (8,0,0), ...
+             *
+             * \tparam BlockDim 3D compile-time vector (pmacc::math::CT::Int) of the size of the cupla blockDim.
+             * \tparam ThreadBlock ignored
+             */
+            template<typename BlockDim, typename ThreadBlock = BlockDim>
+            struct ForeachBlock
+            {
+                /* operator()(zone, cursor0, cursor1, ..., cursorN-1, functor or lambdaFun)
+                 *
+                 * \param zone Accepts currently only a zone::SphericZone object (e.g. containerObj.zone())
+                 * \param cursorN cursor for the N-th data source (e.g. containerObj.origin())
+                 * \param functor or lambdaFun either a functor with N arguments or an N-ary lambda function
+                 * (e.g. _1 = _2)
+                 *
+                 * It is called like functor(*cursor0(cellId), ..., *cursorN(cellId))
+                 *
+                 */
+                BOOST_PP_REPEAT_FROM_TO(1, BOOST_PP_INC(FOREACH_KERNEL_MAX_PARAMS), FOREACH_OPERATOR, _)
+            };
 
 #undef FOREACH_OPERATOR
 #undef SHIFT_CURSOR_ZONE
 #undef SHIFTED_CURSOR
 
-} // kernel
-} // algorithm
-} // pmacc
-
+        } // namespace kernel
+    } // namespace algorithm
+} // namespace pmacc
diff --git a/include/pmacc/cuSTL/algorithm/kernel/Reduce.hpp b/include/pmacc/cuSTL/algorithm/kernel/Reduce.hpp
index 4912835076..77ddaf8a1b 100644
--- a/include/pmacc/cuSTL/algorithm/kernel/Reduce.hpp
+++ b/include/pmacc/cuSTL/algorithm/kernel/Reduce.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera
+/* Copyright 2013-2021 Heiko Burau, Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -28,38 +28,37 @@
 
 namespace pmacc
 {
-namespace algorithm
-{
-namespace kernel
-{
-
-/** Reduce algorithm that calls a cuda kernel
- *
- */
-struct Reduce
-{
-
-    /* \param srcCursor Cursor located at the origin of the area of reduce
-     * \param p_zone Zone of cells spanning the area of reduce
-     * \param functor Functor with two arguments which returns the result of the reduce operation.
-     */
-    template<typename SrcCursor, typename Zone, typename NVidiaFunctor>
-    typename SrcCursor::ValueType operator()(const SrcCursor& srcCursor, const Zone& p_zone, const NVidiaFunctor& functor)
+    namespace algorithm
     {
-        SrcCursor srcCursor_shifted = srcCursor(p_zone.offset);
-
-        cursor::MapTo1DNavigator<Zone::dim> myNavi(p_zone.size);
+        namespace kernel
+        {
+            /** Reduce algorithm that calls a cupla kernel
+             *
+             */
+            struct Reduce
+            {
+                /* \param srcCursor Cursor located at the origin of the area of reduce
+                 * \param p_zone Zone of cells spanning the area of reduce
+                 * \param functor Functor with two arguments which returns the result of the reduce operation.
+                 */
+                template<typename SrcCursor, typename Zone, typename NVidiaFunctor>
+                typename SrcCursor::ValueType operator()(
+                    const SrcCursor& srcCursor,
+                    const Zone& p_zone,
+                    const NVidiaFunctor& functor)
+                {
+                    SrcCursor srcCursor_shifted = srcCursor(p_zone.offset);
 
-        auto _srcCursor = cursor::make_Cursor(cursor::CursorAccessor<SrcCursor>(),
-                                                   myNavi,
-                                                   srcCursor_shifted);
+                    cursor::MapTo1DNavigator<Zone::dim> myNavi(p_zone.size);
 
-        pmacc::nvidia::reduce::Reduce reduce(1024);
-        return reduce(functor, _srcCursor, p_zone.size.productOfComponents());
-    }
+                    auto _srcCursor
+                        = cursor::make_Cursor(cursor::CursorAccessor<SrcCursor>(), myNavi, srcCursor_shifted);
 
-};
+                    pmacc::nvidia::reduce::Reduce reduce(1024);
+                    return reduce(functor, _srcCursor, p_zone.size.productOfComponents());
+                }
+            };
 
-} // kernel
-} // algorithm
-} // pmacc
+        } // namespace kernel
+    } // namespace algorithm
+} // namespace pmacc
diff --git a/include/pmacc/cuSTL/algorithm/kernel/detail/ForeachKernel.hpp b/include/pmacc/cuSTL/algorithm/kernel/detail/ForeachKernel.hpp
index 1d2bb38816..d7fc904316 100644
--- a/include/pmacc/cuSTL/algorithm/kernel/detail/ForeachKernel.hpp
+++ b/include/pmacc/cuSTL/algorithm/kernel/detail/ForeachKernel.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau
+/* Copyright 2013-2021 Heiko Burau
  *
  * This file is part of PMacc.
  *
@@ -29,148 +29,116 @@
 
 namespace pmacc
 {
-namespace algorithm
-{
-namespace kernel
-{
-
+    namespace algorithm
+    {
+        namespace kernel
+        {
 #ifndef FOREACH_KERNEL_MAX_PARAMS
-#define FOREACH_KERNEL_MAX_PARAMS 4
+#    define FOREACH_KERNEL_MAX_PARAMS 4
 #endif
 
-namespace detail
-{
-
-#define SHIFTACCESS_CURSOR(Z, N, _) c ## N [cellIndex]
-
-#define KERNEL_FOREACH(Z, N, _) \
-/*                        typename C0, ..., typename CN     */ \
-template<typename Mapper, BOOST_PP_ENUM_PARAMS(N, typename C), typename Functor, typename T_Acc> \
-/*                                          C0 c0, ..., CN cN   */ \
-DINLINE void operator()(T_Acc const & acc, Mapper mapper, BOOST_PP_ENUM_BINARY_PARAMS(N, C, c), Functor functor) const \
-{ \
-    math::Int<Mapper::dim> cellIndex(mapper(acc, dim3(blockIdx), dim3(threadIdx))); \
-/*          c0[cellIndex]), ..., cN[cellIndex]     */ \
-    functor(acc, BOOST_PP_ENUM(N, SHIFTACCESS_CURSOR, _)); \
-}
+            namespace detail
+            {
+#define SHIFTACCESS_CURSOR(Z, N, _) c##N[cellIndex]
+
+#define KERNEL_FOREACH(Z, N, _)                                                                                       \
+    /*                        typename C0, ..., typename CN     */                                                    \
+    template<                                                                                                         \
+        typename Mapper,                                                                                              \
+        BOOST_PP_ENUM_PARAMS(N, typename C),                                                                          \
+        typename Functor,                                                                                             \
+        typename T_Acc> /*                                          C0 c0, ..., CN cN   */                            \
+    DINLINE void operator()(T_Acc const& acc, Mapper mapper, BOOST_PP_ENUM_BINARY_PARAMS(N, C, c), Functor functor)   \
+        const                                                                                                         \
+    {                                                                                                                 \
+        math::Int<Mapper::dim> cellIndex(                                                                             \
+            mapper(acc, cupla::dim3(cupla::blockIdx(acc)), cupla::dim3(cupla::threadIdx(acc))));                      \
+        /*          c0[cellIndex], ..., cN[cellIndex]      */                                                         \
+        functor(acc, BOOST_PP_ENUM(N, SHIFTACCESS_CURSOR, _));                                                        \
+    }
 
-struct KernelForeach
-{
-BOOST_PP_REPEAT_FROM_TO(1, BOOST_PP_INC(FOREACH_KERNEL_MAX_PARAMS), KERNEL_FOREACH, _)
-};
+                struct KernelForeach
+                {
+                    BOOST_PP_REPEAT_FROM_TO(1, BOOST_PP_INC(FOREACH_KERNEL_MAX_PARAMS), KERNEL_FOREACH, _)
+                };
 #undef KERNEL_FOREACH
 #undef SHIFTACCESS_CURSOR
 
-struct KernelForeachLockstep
-{
-    /** call functor
-     *
-     * Each argument is shifted to the origin of the block before it is passed
-     * to the functor.
-     */
-    template<
-        typename T_Acc,
-        typename T_Mapper,
-        typename T_Functor,
-        typename... T_Args>
-    ALPAKA_FN_ACC void operator()(
-        T_Acc const & acc,
-        T_Mapper const mapper,
-        T_Functor functor,
-        T_Args ... args
-    ) const
-    {
-        // map to the origin of the block
-        math::Int<
-            T_Mapper::dim
-        > cellIndex(
-            mapper(
-                acc,
-                dim3( blockIdx ),
-                dim3(
-                    0,
-                    0,
-                    0
-                )
-            )
-        );
-
-        functor(
-            acc,
-            args[ cellIndex ]...
-        );
-    }
-};
-
-namespace RT
-{
-    /** Run a cuSTL KernelForeach
-     *
-     * Allow to run the cuSTL foreach with runtime block sizes.
-     * @warning collective functors which contain synchronization are not supported
-     */
-    struct KernelForeachLockstep
-    {
-        /** call functor
-         *
-         * Each argument is shifted to the origin of the block before it is passed
-         * to the functor.
-         */
-        template<
-            typename T_Acc,
-            typename T_Mapper,
-            typename T_BlockSize,
-            typename T_Functor,
-            typename... T_Args>
-        ALPAKA_FN_ACC void operator()(
-            T_Acc const & acc,
-            T_Mapper const mapper,
-            T_BlockSize const blockSize,
-            T_Functor functor,
-            T_Args ... args
-        ) const
-        {
-            /* KernelForeachLockstep is always called as kernel with three dimensions
-             * therefore we have to reduce the dimension if the mapper is only 2D or 1D.
-             */
-            auto const blockSizeShrinked = blockSize.template shrink< T_Mapper::dim >( );
-            uint32_t const domainElementCount = blockSizeShrinked.productOfComponents();
-            DataSpace< T_Mapper::dim > const domainSize( blockSizeShrinked );
-
-            // map to the origin of the block
-            math::Int<
-                T_Mapper::dim
-            > blockCellOffset(
-                mapper(
-                    acc,
-                    domainSize.toDim3(),
-                    dim3( blockIdx ),
-                    dim3(
-                        0,
-                        0,
-                        0
-                    )
-                )
-            );
-
-
-
-            for( uint32_t i = threadIdx.x; i < domainElementCount; i += blockDim.x )
-            {
-                auto const inBlockOffset = DataSpaceOperations< T_Mapper::dim >::map(
-                    domainSize,
-                    i
-                );
-                auto const cellOffset = blockCellOffset + inBlockOffset;
-                functor(
-                    acc,
-                    args[ cellOffset ]...
-                );
-            }
-        }
-    };
-} // namespace RT
-} // namespace detail
-} // namespace kernel
-} // namespace algorithm
+                struct KernelForeachLockstep
+                {
+                    /** Call the functor once with every argument indexed at the block origin.
+                     *
+                     * The mapper turns the block index (with a zero thread index) into a cell index;
+                     * each argument is accessed as args[cellIndex] before it is passed to the functor.
+                     */
+                    template<typename T_Acc, typename T_Mapper, typename T_Functor, typename... T_Args>
+                    ALPAKA_FN_ACC void operator()(
+                        T_Acc const& acc,
+                        T_Mapper const mapper,
+                        T_Functor functor,
+                        T_Args... args) const
+                    {
+                        // map the block index (thread index zero) to the cell index of the block origin
+                        math::Int<T_Mapper::dim> cellIndex(
+                            mapper(acc, cupla::dim3(cupla::blockIdx(acc)), cupla::dim3(0, 0, 0)));
+
+                        functor(acc, args[cellIndex]...);
+                    }
+                };
+
+                namespace RT
+                {
+                    /** Run a cuSTL KernelForeach.
+                     *
+                     * Allows running the cuSTL foreach with a block size that is only known at runtime.
+                     * @warning Collective functors which contain synchronization are not supported.
+                     */
+                    struct KernelForeachLockstep
+                    {
+                        /** Call the functor for every cell of the block's domain.
+                         *
+                         * Each argument is indexed with the block origin plus the in-block offset of the
+                         * cell before it is passed to the functor.
+                         */
+                        template<
+                            typename T_Acc,
+                            typename T_Mapper,
+                            typename T_BlockSize,
+                            typename T_Functor,
+                            typename... T_Args>
+                        ALPAKA_FN_ACC void operator()(
+                            T_Acc const& acc,
+                            T_Mapper const mapper,
+                            T_BlockSize const blockSize,
+                            T_Functor functor,
+                            T_Args... args) const
+                        {
+                            /* KernelForeachLockstep is always called as a kernel with three dimensions,
+                             * therefore the block size is reduced to the mapper's dimension (1D, 2D or 3D).
+                             */
+                            auto const blockSizeShrinked = blockSize.template shrink<T_Mapper::dim>();
+                            uint32_t const domainElementCount = blockSizeShrinked.productOfComponents();
+                            DataSpace<T_Mapper::dim> const domainSize(blockSizeShrinked);
+
+                            // map the block index (thread index zero) to the cell offset of the block origin
+                            math::Int<T_Mapper::dim> blockCellOffset(mapper(
+                                acc,
+                                domainSize.toDim3(),
+                                cupla::dim3(cupla::blockIdx(acc)),
+                                cupla::dim3(0, 0, 0)));
+
+                            // distribute the linear cell indices: start at threadIdx.x, advance by blockDim.x
+                            for(uint32_t i = cupla::threadIdx(acc).x; i < domainElementCount;
+                                i += cupla::blockDim(acc).x)
+                            {
+                                auto const inBlockOffset = DataSpaceOperations<T_Mapper::dim>::map(domainSize, i);
+                                auto const cellOffset = blockCellOffset + inBlockOffset;
+                                functor(acc, args[cellOffset]...);
+                            }
+                        }
+                    };
+                } // namespace RT
+            } // namespace detail
+        } // namespace kernel
+    } // namespace algorithm
 } // namespace pmacc
diff --git a/include/pmacc/cuSTL/algorithm/kernel/detail/SphericMapper.hpp b/include/pmacc/cuSTL/algorithm/kernel/detail/SphericMapper.hpp
index 658c0cdf07..1117df1906 100644
--- a/include/pmacc/cuSTL/algorithm/kernel/detail/SphericMapper.hpp
+++ b/include/pmacc/cuSTL/algorithm/kernel/detail/SphericMapper.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera, Benjamin Worpitz
+/* Copyright 2013-2021 Heiko Burau, Rene Widera, Benjamin Worpitz
  *
  * This file is part of PMacc.
  *
@@ -28,295 +28,246 @@
 
 namespace pmacc
 {
-namespace algorithm
-{
-namespace kernel
-{
-namespace detail
-{
-
-namespace mpl = boost::mpl;
-
-/** The SphericMapper maps from cuda blockIdx and/or threadIdx to the cell index
- * \tparam dim dimension
- * \tparam BlockSize compile-time vector of the cuda block size (optional)
- * \tparam dummy neccesary to implement the optional BlockSize parameter
- *
- * If BlockSize is given the cuda variable blockDim is not used which is faster.
- */
-template<int dim, typename BlockSize = mpl::void_, typename dummy = mpl::void_>
-struct SphericMapper;
-
-/* Compile-time BlockSize */
-
-template<typename BlockSize>
-struct SphericMapper<1, BlockSize>
-{
-    static constexpr int dim = 1;
-
-    typename math::Size_t<3>::BaseType
-    cudaGridDim(const math::Size_t<1>& size) const
-    {
-        return math::Size_t<3>(
-            size.x() / BlockSize::x::value,
-            1u,
-            1u
-        );
-    }
-
-    template< typename T_Acc >
-    HDINLINE
-    math::Int<1> operator()(
-        T_Acc const & acc,
-        const math::Int<1>& _blockIdx,
-        const math::Int<1>& _threadIdx
-    ) const
-    {
-        return _blockIdx.x() * BlockSize::x::value + _threadIdx.x();
-    }
-
-    template< typename T_Acc >
-    HDINLINE
-    math::Int<1> operator()(
-        T_Acc const & acc,
-        const dim3& _blockIdx,
-        const dim3& _threadIdx = dim3(0,0,0)
-    ) const
-    {
-        return operator()(
-            acc,
-            math::Int<1>((int)_blockIdx.x),
-            math::Int<1>((int)_threadIdx.x)
-            );
-    }
-};
-
-template<typename BlockSize>
-struct SphericMapper<2, BlockSize>
-{
-    static constexpr int dim = 2;
-
-    typename math::Size_t<3>::BaseType
-    cudaGridDim(const math::Size_t<2>& size) const
-    {
-        return math::Size_t<3>(
-            size.x() / BlockSize::x::value,
-            size.y() / BlockSize::y::value,
-            1u
-         );
-    }
-
-    template< typename T_Acc >
-    HDINLINE
-    math::Int<2> operator()(
-        T_Acc const & acc,
-        const math::Int<2>& _blockIdx,
-        const math::Int<2>& _threadIdx
-    ) const
-    {
-        return math::Int<2>( _blockIdx.x() * BlockSize::x::value + _threadIdx.x(),
-                             _blockIdx.y() * BlockSize::y::value + _threadIdx.y() );
-    }
-
-    template< typename T_Acc >
-    HDINLINE
-    math::Int<2> operator()(
-        T_Acc const & acc,
-        const dim3& _blockIdx,
-        const dim3& _threadIdx = dim3(0,0,0)
-    ) const
-    {
-        return operator()(
-            acc,
-            math::Int<2>(_blockIdx.x, _blockIdx.y),
-            math::Int<2>(_threadIdx.x, _threadIdx.y)
-        );
-    }
-};
-
-template<typename BlockSize>
-struct SphericMapper<3, BlockSize>
-{
-    static constexpr int dim = 3;
-
-    typename math::Size_t<3>::BaseType
-    cudaGridDim(const math::Size_t<3>& size) const
-    {
-        return math::Size_t<3>(
-            size.x() / BlockSize::x::value,
-            size.y() / BlockSize::y::value,
-            size.z() / BlockSize::z::value
-        );
-    }
-
-    template< typename T_Acc >
-    HDINLINE
-    math::Int<3> operator()(
-        T_Acc const & acc,
-        const math::Int<3>& _blockIdx,
-        const math::Int<3>& _threadIdx
-    ) const
-    {
-        return math::Int<3>( _blockIdx * (math::Int<3>)BlockSize().toRT() + _threadIdx );
-    }
-
-    template< typename T_Acc >
-    HDINLINE
-    math::Int<3> operator()(
-        T_Acc const & acc,
-        const dim3& _blockIdx,
-        const dim3& _threadIdx = dim3(0,0,0)
-    ) const
-    {
-        return operator()(
-            acc,
-            math::Int<3>(_blockIdx.x, _blockIdx.y, _blockIdx.z),
-            math::Int<3>(_threadIdx.x, _threadIdx.y, _threadIdx.z)
-        );
-    }
-};
-
-/* Runtime BlockSize */
-
-template<>
-struct SphericMapper<1, mpl::void_>
-{
-    static constexpr int dim = 1;
-
-    typename math::Size_t<3>::BaseType
-    cudaGridDim(const math::Size_t<1>& size, const math::Size_t<3>& blockSize) const
-    {
-        return math::Size_t<3>(
-            size.x() / blockSize.x(),
-            1u,
-            1u
-        );
-    }
-
-    template< typename T_Acc >
-    DINLINE
-    math::Int<1> operator()(
-        T_Acc const & acc,
-        const math::Int<1>& _blockDim,
-        const math::Int<1>& _blockIdx,
-        const math::Int<1>& _threadIdx
-    ) const
-    {
-        return _blockIdx.x() * _blockDim.x() + _threadIdx.x();
-    }
-
-    template< typename T_Acc >
-    DINLINE
-    math::Int<1> operator()(
-        T_Acc const & acc,
-        const dim3& _blockDim,
-        const dim3& _blockIdx,
-        const dim3& _threadIdx
-    ) const
-    {
-        return operator()(
-            acc,
-            math::Int<1>((int)_blockDim.x),
-            math::Int<1>((int)_blockIdx.x),
-            math::Int<1>((int)_threadIdx.x)
-        );
-    }
-};
-
-template<>
-struct SphericMapper<2, mpl::void_>
-{
-    static constexpr int dim = 2;
-
-    typename math::Size_t<3>::BaseType
-    cudaGridDim(const math::Size_t<2>& size, const math::Size_t<3>& blockSize) const
-    {
-        return math::Size_t<3>(
-            size.x() / blockSize.x(),
-            size.y() / blockSize.y(),
-            1
-        );
-    }
-
-    template< typename T_Acc >
-    DINLINE
-    math::Int<2> operator()(
-        T_Acc const & acc,
-        const math::Int<2>& _blockDim,
-        const math::Int<2>& _blockIdx,
-        const math::Int<2>& _threadIdx
-    ) const
-    {
-        return math::Int<2>( _blockIdx.x() * _blockDim.x() + _threadIdx.x(),
-                             _blockIdx.y() * _blockDim.y() + _threadIdx.y() );
-    }
-
-    template< typename T_Acc >
-    DINLINE
-    math::Int<2> operator()(
-        T_Acc const & acc,
-        const dim3& _blockDim,
-        const dim3& _blockIdx,
-        const dim3& _threadIdx
-    ) const
-    {
-        return operator()(
-            acc,
-            math::Int<2>(_blockDim.x, _blockDim.y),
-            math::Int<2>(_blockIdx.x, _blockIdx.y),
-            math::Int<2>(_threadIdx.x, _threadIdx.y)
-        );
-    }
-};
-
-template<>
-struct SphericMapper<3, mpl::void_>
-{
-    static constexpr int dim = 3;
-
-    typename math::Size_t<3>::BaseType
-    cudaGridDim(const math::Size_t<3>& size, const math::Size_t<3>& blockSize) const
-    {
-        return math::Size_t<3>(
-            size.x() / blockSize.x(),
-            size.y() / blockSize.y(),
-            size.z() / blockSize.z()
-        );
-    }
-
-    template< typename T_Acc >
-    DINLINE
-    math::Int<3> operator()(
-        T_Acc const & acc,
-        const math::Int<3>& _blockDim,
-        const math::Int<3>& _blockIdx,
-        const math::Int<3>& _threadIdx
-    ) const
-    {
-        return math::Int<3>( _blockIdx.x() * _blockDim.x() + _threadIdx.x(),
-                             _blockIdx.y() * _blockDim.y() + _threadIdx.y(),
-                             _blockIdx.z() * _blockDim.z() + _threadIdx.z() );
-    }
-
-    template< typename T_Acc >
-    DINLINE
-    math::Int<3> operator()(
-        T_Acc const & acc,
-        const dim3& _blockDim,
-        const dim3& _blockIdx,
-        const dim3& _threadIdx
-    ) const
+    namespace algorithm
     {
-        return operator()(
-            acc,
-            math::Int<3>(_blockDim.x, _blockDim.y, _blockDim.z),
-            math::Int<3>(_blockIdx.x, _blockIdx.y, _blockIdx.z),
-            math::Int<3>(_threadIdx.x, _threadIdx.y, _threadIdx.z)
-        );
-    }
-};
-
-} // detail
-} // kernel
-} // algorithm
-} // pmacc
+        namespace kernel
+        {
+            namespace detail
+            {
+                namespace mpl = boost::mpl;
+
+                /** The SphericMapper maps a cupla block index and/or thread index to a cell index.
+                 * \tparam dim number of dimensions of the cell index
+                 * \tparam BlockSize compile-time vector of the cupla block size (optional)
+                 * \tparam dummy necessary to implement the optional BlockSize parameter
+                 *
+                 * If BlockSize is given, the cupla variable blockDim is not used, which is faster.
+                 */
+                template<int dim, typename BlockSize = mpl::void_, typename dummy = mpl::void_>
+                struct SphericMapper;
+
+                /* Specializations for a BlockSize known at compile time */
+
+                template<typename BlockSize>
+                struct SphericMapper<1, BlockSize>
+                {
+                    static constexpr int dim = 1;
+                    // Blocks per grid: size.x() / BlockSize::x::value in x, 1 in y and z.
+                    typename math::Size_t<3>::BaseType cuplaGridDim(const math::Size_t<1>& size) const
+                    {
+                        return math::Size_t<3>(size.x() / BlockSize::x::value, 1u, 1u);
+                    }
+                    // Cell index = block index * BlockSize::x::value + thread index; acc is unused.
+                    template<typename T_Acc>
+                    HDINLINE math::Int<1> operator()(
+                        T_Acc const& acc,
+                        const math::Int<1>& _blockIdx,
+                        const math::Int<1>& _threadIdx) const
+                    {
+                        return _blockIdx.x() * BlockSize::x::value + _threadIdx.x();
+                    }
+                    // cupla::dim3 overload: forwards the x components; _threadIdx defaults to (0, 0, 0).
+                    template<typename T_Acc>
+                    HDINLINE math::Int<1> operator()(
+                        T_Acc const& acc,
+                        const cupla::dim3& _blockIdx,
+                        const cupla::dim3& _threadIdx = cupla::dim3(0, 0, 0)) const
+                    {
+                        return operator()(acc, math::Int<1>((int) _blockIdx.x), math::Int<1>((int) _threadIdx.x));
+                    }
+                };
+
+                template<typename BlockSize>
+                struct SphericMapper<2, BlockSize>
+                {
+                    static constexpr int dim = 2;
+                    // Blocks per grid: size / BlockSize component-wise in x and y, 1 in z.
+                    typename math::Size_t<3>::BaseType cuplaGridDim(const math::Size_t<2>& size) const
+                    {
+                        return math::Size_t<3>(size.x() / BlockSize::x::value, size.y() / BlockSize::y::value, 1u);
+                    }
+                    // Cell index = block index * BlockSize + thread index, per component; acc is unused.
+                    template<typename T_Acc>
+                    HDINLINE math::Int<2> operator()(
+                        T_Acc const& acc,
+                        const math::Int<2>& _blockIdx,
+                        const math::Int<2>& _threadIdx) const
+                    {
+                        return math::Int<2>(
+                            _blockIdx.x() * BlockSize::x::value + _threadIdx.x(),
+                            _blockIdx.y() * BlockSize::y::value + _threadIdx.y());
+                    }
+                    // cupla::dim3 overload: forwards the x and y components; _threadIdx defaults to (0, 0, 0).
+                    template<typename T_Acc>
+                    HDINLINE math::Int<2> operator()(
+                        T_Acc const& acc,
+                        const cupla::dim3& _blockIdx,
+                        const cupla::dim3& _threadIdx = cupla::dim3(0, 0, 0)) const
+                    {
+                        return operator()(
+                            acc,
+                            math::Int<2>(_blockIdx.x, _blockIdx.y),
+                            math::Int<2>(_threadIdx.x, _threadIdx.y));
+                    }
+                };
+
+                template<typename BlockSize>
+                struct SphericMapper<3, BlockSize>
+                {
+                    static constexpr int dim = 3;
+                    // Blocks per grid: size / BlockSize component-wise in x, y and z.
+                    typename math::Size_t<3>::BaseType cuplaGridDim(const math::Size_t<3>& size) const
+                    {
+                        return math::Size_t<3>(
+                            size.x() / BlockSize::x::value,
+                            size.y() / BlockSize::y::value,
+                            size.z() / BlockSize::z::value);
+                    }
+                    // Cell index = block index * BlockSize (converted via toRT()) + thread index; acc is unused.
+                    template<typename T_Acc>
+                    HDINLINE math::Int<3> operator()(
+                        T_Acc const& acc,
+                        const math::Int<3>& _blockIdx,
+                        const math::Int<3>& _threadIdx) const
+                    {
+                        return math::Int<3>(_blockIdx * (math::Int<3>) BlockSize().toRT() + _threadIdx);
+                    }
+                    // cupla::dim3 overload: forwards x, y and z; _threadIdx defaults to (0, 0, 0).
+                    template<typename T_Acc>
+                    HDINLINE math::Int<3> operator()(
+                        T_Acc const& acc,
+                        const cupla::dim3& _blockIdx,
+                        const cupla::dim3& _threadIdx = cupla::dim3(0, 0, 0)) const
+                    {
+                        return operator()(
+                            acc,
+                            math::Int<3>(_blockIdx.x, _blockIdx.y, _blockIdx.z),
+                            math::Int<3>(_threadIdx.x, _threadIdx.y, _threadIdx.z));
+                    }
+                };
+
+                /* Specializations for a block size passed at runtime (BlockSize = mpl::void_) */
+
+                template<>
+                struct SphericMapper<1, mpl::void_>
+                {
+                    static constexpr int dim = 1;
+                    // Blocks per grid: size.x() / blockSize.x() in x, 1 in y and z.
+                    typename math::Size_t<3>::BaseType cuplaGridDim(
+                        const math::Size_t<1>& size,
+                        const math::Size_t<3>& blockSize) const
+                    {
+                        return math::Size_t<3>(size.x() / blockSize.x(), 1u, 1u);
+                    }
+                    // Cell index = block index * _blockDim + thread index; acc is unused.
+                    template<typename T_Acc>
+                    DINLINE math::Int<1> operator()(
+                        T_Acc const& acc,
+                        const math::Int<1>& _blockDim,
+                        const math::Int<1>& _blockIdx,
+                        const math::Int<1>& _threadIdx) const
+                    {
+                        return _blockIdx.x() * _blockDim.x() + _threadIdx.x();
+                    }
+                    // cupla::dim3 overload: forwards the x components (cast to int) to the overload above.
+                    template<typename T_Acc>
+                    DINLINE math::Int<1> operator()(
+                        T_Acc const& acc,
+                        const cupla::dim3& _blockDim,
+                        const cupla::dim3& _blockIdx,
+                        const cupla::dim3& _threadIdx) const
+                    {
+                        return operator()(
+                            acc,
+                            math::Int<1>((int) _blockDim.x),
+                            math::Int<1>((int) _blockIdx.x),
+                            math::Int<1>((int) _threadIdx.x));
+                    }
+                };
+
+                template<>
+                struct SphericMapper<2, mpl::void_>
+                {
+                    static constexpr int dim = 2;
+                    // Blocks per grid: size / blockSize in x and y, 1 in z.
+                    typename math::Size_t<3>::BaseType cuplaGridDim(
+                        const math::Size_t<2>& size,
+                        const math::Size_t<3>& blockSize) const
+                    {
+                        return math::Size_t<3>(size.x() / blockSize.x(), size.y() / blockSize.y(), 1);
+                    }
+                    // Cell index = block index * _blockDim + thread index, per component; acc is unused.
+                    template<typename T_Acc>
+                    DINLINE math::Int<2> operator()(
+                        T_Acc const& acc,
+                        const math::Int<2>& _blockDim,
+                        const math::Int<2>& _blockIdx,
+                        const math::Int<2>& _threadIdx) const
+                    {
+                        return math::Int<2>(
+                            _blockIdx.x() * _blockDim.x() + _threadIdx.x(),
+                            _blockIdx.y() * _blockDim.y() + _threadIdx.y());
+                    }
+                    // cupla::dim3 overload: forwards the x and y components to the overload above.
+                    template<typename T_Acc>
+                    DINLINE math::Int<2> operator()(
+                        T_Acc const& acc,
+                        const cupla::dim3& _blockDim,
+                        const cupla::dim3& _blockIdx,
+                        const cupla::dim3& _threadIdx) const
+                    {
+                        return operator()(
+                            acc,
+                            math::Int<2>(_blockDim.x, _blockDim.y),
+                            math::Int<2>(_blockIdx.x, _blockIdx.y),
+                            math::Int<2>(_threadIdx.x, _threadIdx.y));
+                    }
+                };
+
+                template<>
+                struct SphericMapper<3, mpl::void_>
+                {
+                    static constexpr int dim = 3;
+                    // Blocks per grid: size / blockSize in x, y and z.
+                    typename math::Size_t<3>::BaseType cuplaGridDim(
+                        const math::Size_t<3>& size,
+                        const math::Size_t<3>& blockSize) const
+                    {
+                        return math::Size_t<3>(
+                            size.x() / blockSize.x(),
+                            size.y() / blockSize.y(),
+                            size.z() / blockSize.z());
+                    }
+                    // Cell index = block index * _blockDim + thread index, per component; acc is unused.
+                    template<typename T_Acc>
+                    DINLINE math::Int<3> operator()(
+                        T_Acc const& acc,
+                        const math::Int<3>& _blockDim,
+                        const math::Int<3>& _blockIdx,
+                        const math::Int<3>& _threadIdx) const
+                    {
+                        return math::Int<3>(
+                            _blockIdx.x() * _blockDim.x() + _threadIdx.x(),
+                            _blockIdx.y() * _blockDim.y() + _threadIdx.y(),
+                            _blockIdx.z() * _blockDim.z() + _threadIdx.z());
+                    }
+                    // cupla::dim3 overload: forwards the x, y and z components to the overload above.
+                    template<typename T_Acc>
+                    DINLINE math::Int<3> operator()(
+                        T_Acc const& acc,
+                        const cupla::dim3& _blockDim,
+                        const cupla::dim3& _blockIdx,
+                        const cupla::dim3& _threadIdx) const
+                    {
+                        return operator()(
+                            acc,
+                            math::Int<3>(_blockDim.x, _blockDim.y, _blockDim.z),
+                            math::Int<3>(_blockIdx.x, _blockIdx.y, _blockIdx.z),
+                            math::Int<3>(_threadIdx.x, _threadIdx.y, _threadIdx.z));
+                    }
+                };
+
+            } // namespace detail
+        } // namespace kernel
+    } // namespace algorithm
+} // namespace pmacc
diff --git a/include/pmacc/cuSTL/algorithm/kernel/run-time/Foreach.hpp b/include/pmacc/cuSTL/algorithm/kernel/run-time/Foreach.hpp
index be0604789d..34015e212d 100644
--- a/include/pmacc/cuSTL/algorithm/kernel/run-time/Foreach.hpp
+++ b/include/pmacc/cuSTL/algorithm/kernel/run-time/Foreach.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera, Alexander Grund
+/* Copyright 2013-2021 Heiko Burau, Rene Widera, Alexander Grund
  *
  * This file is part of PMacc.
  *
@@ -45,152 +45,155 @@
 
 namespace pmacc
 {
-namespace algorithm
-{
-namespace kernel
-{
-namespace RT
-{
-
-/** Heuristic maximum threads per block and per axis
- * in agreement to sm_2.x - sm_5.3
- *
- * These values don't fully exploit the limits from the cuda specification
- * but they give reasonable speed.
- */
-template<int dim>
-struct MaxCudaBlockDim;
-
-template<>
-struct MaxCudaBlockDim<DIM1>
-{
-    typedef math::CT::Size_t<1024, 1, 1> type;
-};
-
-template<>
-struct MaxCudaBlockDim<DIM2>
-{
-    typedef math::CT::Size_t<32, 32, 1> type;
-};
-
-template<>
-struct MaxCudaBlockDim<DIM3>
-{
-    typedef math::CT::Size_t<8, 8, 8> type;
-};
-
-/** Check if MaxCudaBlockDim holds the cuda specification limits
- *
- * @cond
- */
-PMACC_CASSERT_MSG(_cuda_blockDim_exceeds_maximum_number_of_threads_per_block,
-    math::CT::volume<typename MaxCudaBlockDim<DIM1>::type >::type::value <= cudaSpecs::maxNumThreadsPerBlock);
-PMACC_CASSERT_MSG(_cuda_blockDim_exceeds_maximum_number_of_threads_per_block,
-    math::CT::volume<typename MaxCudaBlockDim<DIM2>::type >::type::value <= cudaSpecs::maxNumThreadsPerBlock);
-PMACC_CASSERT_MSG(_cuda_blockDim_exceeds_maximum_number_of_threads_per_block,
-    math::CT::volume<typename MaxCudaBlockDim<DIM3>::type >::type::value <= cudaSpecs::maxNumThreadsPerBlock);
-/** @endcond */
-
-/** Return a suitable cuda blockDim for a given gridDimension.
- *
- * @param gridDimension 1D, 2D or 3D grid size
- * @return cuda blockDim
- */
-template<int dim>
-math::Size_t<DIM3> getBestCudaBlockDim(const math::Size_t<dim> gridDimension)
-{
-    math::Size_t<DIM3> result = math::Size_t<DIM3>::create(1);
-
-    /* The greatest common divisor of each component of the volume size
-     * and a certain power of two value yield the best suitable block size */
-    const math::Size_t<DIM3> maxThreads =
-        MaxCudaBlockDim<dim>::type::toRT(); /* max threads per axis */
-    for(int i = 0; i < dim; i++)
+    namespace algorithm
     {
-        result[i] = boost::integer::gcd(gridDimension[i], maxThreads[i]);
-    }
-
-    return result;
-}
+        namespace kernel
+        {
+            namespace RT
+            {
+                /** Heuristic maximum number of threads per block and per axis,
+                 * in agreement with sm_2.x - sm_5.3.
+                 *
+                 * These values do not fully exploit the limits of the cupla specification,
+                 * but they give reasonable speed.
+                 */
+                template<int dim>
+                struct MaxCudaBlockDim;
+
+                template<>
+                struct MaxCudaBlockDim<DIM1>
+                {
+                    typedef math::CT::Size_t<1024, 1, 1> type;
+                };
+
+                template<>
+                struct MaxCudaBlockDim<DIM2>
+                {
+                    typedef math::CT::Size_t<32, 32, 1> type;
+                };
+
+                template<>
+                struct MaxCudaBlockDim<DIM3>
+                {
+                    typedef math::CT::Size_t<8, 8, 8> type;
+                };
+
+                /** Check that the volume of each MaxCudaBlockDim does not exceed cudaSpecs::maxNumThreadsPerBlock.
+                 *
+                 * @cond
+                 */
+                PMACC_CASSERT_MSG(
+                    _cupla_blockDim_exceeds_maximum_number_of_threads_per_block,
+                    math::CT::volume<typename MaxCudaBlockDim<DIM1>::type>::type::value
+                        <= cudaSpecs::maxNumThreadsPerBlock);
+                PMACC_CASSERT_MSG(
+                    _cupla_blockDim_exceeds_maximum_number_of_threads_per_block,
+                    math::CT::volume<typename MaxCudaBlockDim<DIM2>::type>::type::value
+                        <= cudaSpecs::maxNumThreadsPerBlock);
+                PMACC_CASSERT_MSG(
+                    _cupla_blockDim_exceeds_maximum_number_of_threads_per_block,
+                    math::CT::volume<typename MaxCudaBlockDim<DIM3>::type>::type::value
+                        <= cudaSpecs::maxNumThreadsPerBlock);
+                /** @endcond */
+
+                /** Return a suitable cupla blockDim for a given gridDimension.
+                 *
+                 * @param gridDimension 1D, 2D or 3D grid size
+                 * @return cupla blockDim (always three components)
+                 */
+                template<int dim>
+                math::Size_t<DIM3> getBestCudaBlockDim(const math::Size_t<dim> gridDimension)
+                {
+                    math::Size_t<DIM3> result = math::Size_t<DIM3>::create(1);
+
+                    /* Per axis, the greatest common divisor of the grid size and the per-axis thread
+                     * limit of MaxCudaBlockDim is used, so the block size divides the grid size evenly */
+                    const math::Size_t<DIM3> maxThreads
+                        = MaxCudaBlockDim<dim>::type::toRT(); /* max threads per axis */
+                    for(int i = 0; i < dim; i++)
+                    {
+                        result[i] = boost::integer::gcd(gridDimension[i], maxThreads[i]);
+                    }
+
+                    return result;
+                }
 
 #ifndef FOREACH_KERNEL_MAX_PARAMS
-#define FOREACH_KERNEL_MAX_PARAMS 4
+#    define FOREACH_KERNEL_MAX_PARAMS 4
 #endif
 
-#define SHIFT_CURSOR_ZONE(Z, N, _) C ## N c ## N ## _shifted = c ## N (p_zone.offset);
-#define SHIFTED_CURSOR(Z, N, _) c ## N ## _shifted
+#define SHIFT_CURSOR_ZONE(Z, N, _) C##N c##N##_shifted = c##N(p_zone.offset); /* declares cN_shifted */
+#define SHIFTED_CURSOR(Z, N, _) c##N##_shifted /* names cN_shifted */
 
-#define FOREACH_OPERATOR(Z, N, _)                                                                                   \
-    /*                      typename C0, ..., typename CN            */                                             \
-    template<typename Zone, BOOST_PP_ENUM_PARAMS(N, typename C), typename Functor>                                  \
-                                /*     C0 c0, ..., CN cN  */                                                        \
-    void operator()(const Zone& p_zone, BOOST_PP_ENUM_BINARY_PARAMS(N, C, c), const Functor& functor)                \
-    {                                                                                                               \
+#define FOREACH_OPERATOR(Z, N, _)                                                                                     \
+    /*                      typename C0, ..., typename CN            */                                               \
+    template<typename Zone, BOOST_PP_ENUM_PARAMS(N, typename C), typename Functor> /*     C0 c0, ..., CN cN  */       \
+    void operator()(const Zone& p_zone, BOOST_PP_ENUM_BINARY_PARAMS(N, C, c), const Functor& functor)                 \
+    {                                                                                                                 \
         /* C0 c0_shifted = c0(p_zone.offset); ...; CN cN_shifted = cN(p_zone.offset); */                              \
-        BOOST_PP_REPEAT(N, SHIFT_CURSOR_ZONE, _)                                                                    \
-                                                                                                                    \
-        if(this->_blockDim == math::Size_t<DIM3>::create(0))                                                        \
-            this->_blockDim = getBestCudaBlockDim(p_zone.size);                                                     \
-                                                                                                                    \
-        PMACC_VERIFY(this->_blockDim.productOfComponents() <= cudaSpecs::maxNumThreadsPerBlock);                    \
-        PMACC_VERIFY(this->_blockDim.x() <= cudaSpecs::MaxNumThreadsPerBlockDim::x::value);                         \
-        PMACC_VERIFY(this->_blockDim.y() <= cudaSpecs::MaxNumThreadsPerBlockDim::y::value);                         \
-        PMACC_VERIFY(this->_blockDim.z() <= cudaSpecs::MaxNumThreadsPerBlockDim::z::value);                         \
-                                                                                                                    \
-        typename math::Size_t<3>::BaseType blockSize(                                                                \
-            this->_blockDim.x(),                                                                                    \
-            this->_blockDim.y(),                                                                                    \
-            this->_blockDim.z()                                                                                     \
-        );                                                                                                          \
-        uint32_t numWorkers = traits::GetNumWorkers< cudaSpecs::MaxNumThreadsPerBlockDim::x::value >::value;        \
-        if( numWorkers > blockSize.productOfComponents() )                                                          \
-            numWorkers = blockSize.productOfComponents();                                                           \
-        kernel::detail::SphericMapper<Zone::dim> mapper;                                                            \
-        using namespace pmacc;                                                                                      \
-        PMACC_KERNEL(kernel::detail::RT::KernelForeachLockstep{})(mapper.cudaGridDim(p_zone.size, this->_blockDim), numWorkers) \
-                /*   c0_shifted, ..., cN_shifted    */                                                              \
-            (mapper, blockSize, functor, BOOST_PP_ENUM(N, SHIFTED_CURSOR, _));                           \
+        BOOST_PP_REPEAT(N, SHIFT_CURSOR_ZONE, _)                                                                      \
+                                                                                                                      \
+        if(this->_blockDim == math::Size_t<DIM3>::create(0))                                                          \
+            this->_blockDim = getBestCudaBlockDim(p_zone.size);                                                       \
+                                                                                                                      \
+        PMACC_VERIFY(this->_blockDim.productOfComponents() <= cudaSpecs::maxNumThreadsPerBlock);                      \
+        PMACC_VERIFY(this->_blockDim.x() <= cudaSpecs::MaxNumThreadsPerBlockDim::x::value);                           \
+        PMACC_VERIFY(this->_blockDim.y() <= cudaSpecs::MaxNumThreadsPerBlockDim::y::value);                           \
+        PMACC_VERIFY(this->_blockDim.z() <= cudaSpecs::MaxNumThreadsPerBlockDim::z::value);                           \
+                                                                                                                      \
+        typename math::Size_t<3>::BaseType blockSize(this->_blockDim.x(), this->_blockDim.y(), this->_blockDim.z());  \
+        uint32_t numWorkers = traits::GetNumWorkers<cudaSpecs::MaxNumThreadsPerBlockDim::x::value>::value;            \
+        if(numWorkers > blockSize.productOfComponents())                                                              \
+            numWorkers = blockSize.productOfComponents();                                                             \
+        kernel::detail::SphericMapper<Zone::dim> mapper;                                                              \
+        using namespace pmacc;                                                                                        \
+        PMACC_KERNEL(kernel::detail::RT::KernelForeachLockstep{})                                                     \
+        (mapper.cuplaGridDim(p_zone.size, this->_blockDim), numWorkers) /*   c0_shifted, ..., cN_shifted    */        \
+            (mapper, blockSize, functor, BOOST_PP_ENUM(N, SHIFTED_CURSOR, _));                                        \
     }
 
-/** Foreach algorithm that calls a cuda kernel
- *
- * This is the run-time version of kernel::Foreach where the
- * cuda blockDim is specified in the constructor
- *
- * @warning collective functors (containing synchronization) are not supported
- */
-struct Foreach
-{
-    math::Size_t<DIM3> _blockDim;
-
-    /* \param _blockDim size of the cuda blockDim.
-     *
-     * blockDim has to fit into the computing volume.
-     * E.g. (8,8,4) fits into (256, 256, 256)
-     *
-     * If no argument is given, the blockDim will be computed heuristically.
-     *
-     */
-    Foreach(math::Size_t<DIM3> _blockDim = math::Size_t<DIM3>::create(0)) : _blockDim(_blockDim) {}
-
-    /* operator()(zone, cursor0, cursor1, ..., cursorN-1, functor or lambdaFun)
-     *
-     * \param zone Accepts currently only a zone::SphericZone object (e.g. containerObj.zone())
-     * \param cursorN cursor for the N-th data source (e.g. containerObj.origin())
-     * \param functor or lambdaFun either a functor with N arguments or a N-ary lambda function (e.g. _1 = _2)
-     *
-     * The functor or lambdaFun is called for each cell within the zone.
-     * It is called like functor(*cursor0(cellId), ..., *cursorN(cellId))
-     *
-     */
-    BOOST_PP_REPEAT_FROM_TO(1, BOOST_PP_INC(FOREACH_KERNEL_MAX_PARAMS), FOREACH_OPERATOR, _)
-};
+                /** Foreach algorithm that calls a cupla kernel
+                 *
+                 * This is the run-time version of kernel::Foreach where the
+                 * cupla blockDim is specified in the constructor.
+                 *
+                 * @warning collective functors (containing synchronization) are not supported
+                 */
+                struct Foreach
+                {
+                    math::Size_t<DIM3> _blockDim; // all components zero means: not chosen yet
+
+                    /* \param _blockDim size of the cupla blockDim.
+                     *
+                     * blockDim has to fit into the computing volume.
+                     * E.g. (8,8,4) fits into (256, 256, 256)
+                     *
+                     * If no argument is given, the blockDim will be computed heuristically.
+                     * (operator() then stores getBestCudaBlockDim(zone.size) in _blockDim on its first call)
+                     */
+                    Foreach(math::Size_t<DIM3> _blockDim = math::Size_t<DIM3>::create(0)) : _blockDim(_blockDim)
+                    {
+                    }
+
+                    /* operator()(zone, cursor0, cursor1, ..., cursorN-1, functor or lambdaFun)
+                     *
+                     * \param zone currently accepts only a zone::SphericZone object (e.g. containerObj.zone())
+                     * \param cursorN cursor for the N-th data source (e.g. containerObj.origin())
+                     * \param functor or lambdaFun either a functor with N arguments or an N-ary lambda function (e.g.
+                     * _1 = _2)
+                     *
+                     * The functor or lambdaFun is called for each cell within the zone.
+                     * It is called like functor(*cursor0(cellId), ..., *cursorN(cellId))
+                     * _blockDim is checked with PMACC_VERIFY against the cudaSpecs thread limits before the launch.
+                     */
+                    BOOST_PP_REPEAT_FROM_TO(1, BOOST_PP_INC(FOREACH_KERNEL_MAX_PARAMS), FOREACH_OPERATOR, _)
+                };
 
 #undef FOREACH_OPERATOR
 #undef SHIFT_CURSOR_ZONE
 #undef SHIFTED_CURSOR
 
-} // namespace RT
-} // namespace kernel
-} // namespace algorithm
+            } // namespace RT
+        } // namespace kernel
+    } // namespace algorithm
 } // namespace pmacc
diff --git a/include/pmacc/cuSTL/algorithm/mpi/Gather.hpp b/include/pmacc/cuSTL/algorithm/mpi/Gather.hpp
index c49830c37b..b045a7f7b0 100644
--- a/include/pmacc/cuSTL/algorithm/mpi/Gather.hpp
+++ b/include/pmacc/cuSTL/algorithm/mpi/Gather.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau
+/* Copyright 2013-2021 Heiko Burau
  *
  * This file is part of PMacc.
  *
@@ -29,48 +29,60 @@
 
 namespace pmacc
 {
-namespace algorithm
-{
-namespace mpi
-{
-
-/**
- */
-template<int dim>
-class Gather
-{
-private:
-    MPI_Comm comm;
-    std::vector<math::Int<dim> > positions;
-    bool m_participate;
-
-    struct CopyToDest
+    namespace algorithm
     {
-        template<typename Type, int memDim, class T_Alloc, class T_Copy, class T_Assign>
-        void operator()(const Gather<dim>& gather,
+        namespace mpi
+        {
+            /**
+             */
+            template<int dim>
+            class Gather
+            {
+            private:
+                MPI_Comm comm;
+                std::vector<math::Int<dim>> positions;
+                bool m_participate;
+
+                struct CopyToDest
+                {
+                    template<typename Type, int memDim, class T_Alloc, class T_Copy, class T_Assign>
+                    void operator()(
+                        const Gather<dim>& gather,
                         container::CartBuffer<Type, memDim, T_Alloc, T_Copy, T_Assign>& dest,
                         std::vector<Type>& tmpDest,
                         int dir,
-                        const std::vector<math::Size_t<memDim> >& srcSizes,
+                        const std::vector<math::Size_t<memDim>>& srcSizes,
                         const std::vector<size_t>& srcOffsets) const;
-    };
+                };
 
-public:
-    Gather(const zone::SphericZone<dim>& p_zone);
-    ~Gather();
+            public:
+                Gather(const zone::SphericZone<dim>& p_zone);
+                ~Gather();
 
-    template<typename Type, int memDim, class T_Alloc, class T_Copy, class T_Assign, class T_Alloc2, class T_Copy2, class T_Assign2>
-    void operator()(container::CartBuffer<Type, memDim, T_Alloc, T_Copy, T_Assign>& dest,
+                template<
+                    typename Type,
+                    int memDim,
+                    class T_Alloc,
+                    class T_Copy,
+                    class T_Assign,
+                    class T_Alloc2,
+                    class T_Copy2,
+                    class T_Assign2>
+                void operator()(
+                    container::CartBuffer<Type, memDim, T_Alloc, T_Copy, T_Assign>& dest,
                     container::CartBuffer<Type, memDim, T_Alloc2, T_Copy2, T_Assign2>& source,
                     int dir = -1) const;
 
-    inline bool participate() const {return m_participate;}
-    inline bool root() const;
-    inline int rank() const;
-};
+                inline bool participate() const
+                {
+                    return m_participate;
+                }
+                inline bool root() const;
+                inline int rank() const;
+            };
 
-} // mpi
-} // algorithm
-} // pmacc
+        } // namespace mpi
+    } // namespace algorithm
+} // namespace pmacc
 
 #include "Gather.tpp"
diff --git a/include/pmacc/cuSTL/algorithm/mpi/Gather.tpp b/include/pmacc/cuSTL/algorithm/mpi/Gather.tpp
index 3f4e92ce02..5a6ebaaffd 100644
--- a/include/pmacc/cuSTL/algorithm/mpi/Gather.tpp
+++ b/include/pmacc/cuSTL/algorithm/mpi/Gather.tpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Benjamin Worpitz, Alexander Grund
+/* Copyright 2013-2021 Heiko Burau, Benjamin Worpitz, Alexander Grund
  *
  * This file is part of PMacc.
  *
@@ -27,262 +27,288 @@
 #include "pmacc/communication/manager_common.hpp"
 
 #include <iostream>
-#include <numeric>      // std::partial_sum
-#include <algorithm>    // std::copy
+#include <numeric> // std::partial_sum
+#include <algorithm> // std::copy
 
 
 namespace pmacc
 {
-namespace algorithm
-{
-namespace mpi
-{
-
-namespace GatherHelper
-{
-
-template<int dim, typename Type>
-struct ContiguousPitch
-{
-    math::Size_t<dim-1> operator()(const math::Size_t<dim>& size)
-    {
-        math::Size_t<dim-1> pitch;
-
-        pitch[0] = size[0] * sizeof(Type);
-        for(int axis = 1; axis < dim-1; axis++)
-            pitch[axis] = pitch[axis-1] * size[axis];
-
-        return pitch;
-    }
-};
-
-template<typename Type>
-struct ContiguousPitch<DIM1, Type>
-{
-    math::Size_t<0> operator()(const math::Size_t<DIM1>&)
-    {
-        return math::Size_t<0>();
-    }
-};
-
-} // namespace GatherHelper
-
-template<int dim>
-Gather<dim>::Gather(const zone::SphericZone<dim>& p_zone) : comm(MPI_COMM_NULL)
-{
-    using namespace pmacc::math;
-
-    pmacc::GridController<dim>& con = pmacc::Environment<dim>::get().GridController();
-    Int<dim> pos = con.getPosition();
-
-    int numWorldRanks; MPI_Comm_size(MPI_COMM_WORLD, &numWorldRanks);
-    std::vector<Int<dim> > allPositions(numWorldRanks);
-
-    // avoid deadlock between not finished pmacc tasks and mpi blocking collectives
-    __getTransactionEvent().waitForFinished();
-    MPI_CHECK(MPI_Allgather(static_cast<void*>(&pos), sizeof(Int<dim>), MPI_CHAR,
-                  static_cast<void*>(allPositions.data()), sizeof(Int<dim>), MPI_CHAR,
-                  MPI_COMM_WORLD));
-
-    std::vector<int> new_ranks;
-    int myWorldId; MPI_Comm_rank(MPI_COMM_WORLD, &myWorldId);
-
-    this->m_participate = false;
-    for(int i = 0; i < static_cast<int>(allPositions.size()); i++)
-    {
-        Int<dim> pos = allPositions[i];
-        if(!p_zone.within(pos)) continue;
-
-        new_ranks.push_back(i);
-        this->positions.push_back(allPositions[i]);
-        if(i == myWorldId) this->m_participate = true;
-    }
-    MPI_Group world_group, new_group;
-
-    // avoid deadlock between not finished pmacc tasks and mpi blocking collectives
-    __getTransactionEvent().waitForFinished();
-    MPI_CHECK(MPI_Comm_group(MPI_COMM_WORLD, &world_group));
-    MPI_CHECK(MPI_Group_incl(world_group, new_ranks.size(), new_ranks.data(), &new_group));
-    MPI_CHECK(MPI_Comm_create(MPI_COMM_WORLD, new_group, &this->comm));
-    MPI_CHECK(MPI_Group_free(&new_group));
-}
-
-template<int dim>
-Gather<dim>::~Gather()
-{
-    if(this->comm != MPI_COMM_NULL)
-    {
-        // avoid deadlock between not finished pmacc tasks and mpi blocking collectives
-        __getTransactionEvent().waitForFinished();
-        MPI_CHECK_NO_EXCEPT(MPI_Comm_free(&this->comm));
-    }
-}
-
-template<int dim>
-bool Gather<dim>::root() const
-{
-    if(!this->m_participate)
-    {
-        std::cerr << "error[mpi::Gather::root()]: this process does not participate in gathering.\n";
-        return false;
-    }
-    int myId; MPI_Comm_rank(this->comm, &myId);
-    return myId == 0;
-}
-
-template<int dim>
-int Gather<dim>::rank() const
-{
-    if(!this->m_participate)
-    {
-        std::cerr << "error[mpi::Gather::rank()]: this process does not participate in gathering.\n";
-        return -1;
-    }
-    int myId; MPI_Comm_rank(this->comm, &myId);
-    return myId;
-}
-
-template<int dim>
-template<typename Type, int memDim, class T_Alloc, class T_Copy, class T_Assign>
-void Gather<dim>::CopyToDest::operator()(
-                        const Gather<dim>& gather,
-                        container::CartBuffer<Type, memDim, T_Alloc, T_Copy, T_Assign>& dest,
-                        std::vector<Type>& tmpDest,
-                        int dir,
-                        const std::vector<math::Size_t<memDim> >& srcSizes,
-                        const std::vector<size_t>& srcOffsets1D) const
-{
-    using namespace math;
-
-    int numRanks = static_cast<int>(gather.positions.size());
-
-    // calculate sizes per axis in destination buffer
-    std::vector<size_t> sizesPerAxis[memDim];
-
-    // sizes per axis
-    for(int i = 0; i < numRanks; i++)
-    {
-        Int<dim> pos = gather.positions[i];
-        Int<memDim> posInMem = pos.template shrink<memDim>(dir+1);
-        for(int axis = 0; axis < memDim; axis++)
-        {
-            size_t posOnAxis = static_cast<size_t>(posInMem[axis]);
-            if(posOnAxis >= sizesPerAxis[axis].size())
-                sizesPerAxis[axis].resize(posOnAxis + 1);
-            sizesPerAxis[axis][posOnAxis] = srcSizes[i][axis];
-        }
-    }
-
-    // calculate offsets per axis in destination buffer
-    std::vector<size_t> offsetsPerAxis[memDim];
-
-    // offsets per axis
-    for(int axis = 0; axis < memDim; axis++)
-    {
-        offsetsPerAxis[axis].resize(sizesPerAxis[axis].size());
-        std::vector<size_t> partialSum(offsetsPerAxis[axis].size());
-        std::partial_sum(sizesPerAxis[axis].begin(), sizesPerAxis[axis].end(), partialSum.begin());
-        offsetsPerAxis[axis][0] = 0;
-        std::copy(partialSum.begin(), partialSum.end()-1, offsetsPerAxis[axis].begin()+1);
-    }
-
-    // copy from one dimensional mpi buffer to n dimensional destination buffer
-    for(int i = 0; i < numRanks; i++)
-    {
-        Int<dim> pos = gather.positions[i];
-        Int<memDim> posInMem = pos.template shrink<memDim>(dir+1);
-        Int<memDim> ndim_offset;
-        for(int axis = 0; axis < memDim; axis++)
-            ndim_offset[axis] = offsetsPerAxis[axis][posInMem[axis]];
-
-        // calculate srcPitch (contiguous memory)
-        Size_t<memDim-1> srcPitch = GatherHelper::ContiguousPitch<memDim, Type>()(srcSizes[i]);
-
-        cudaWrapper::Memcopy<memDim>()(
-            &(*dest.origin()(ndim_offset)),
-            dest.getPitch(),
-            tmpDest.data() + srcOffsets1D[i],
-            srcPitch,
-            srcSizes[i],
-            cudaWrapper::flags::Memcopy::hostToHost);
-    }
-}
-
-template<int dim>
-template<typename Type, int memDim, class T_Alloc, class T_Copy, class T_Assign, class T_Alloc2, class T_Copy2, class T_Assign2>
-void Gather<dim>::operator()(container::CartBuffer<Type, memDim, T_Alloc, T_Copy, T_Assign>& dest,
-                             container::CartBuffer<Type, memDim, T_Alloc2, T_Copy2, T_Assign2>& source, int dir) const
-{
-    using namespace pmacc::math;
-
-    if(!this->m_participate) return;
-    typedef container::CartBuffer<Type, memDim, T_Alloc, T_Copy, T_Assign> DestBuffer;
-    typedef container::CartBuffer<Type, memDim, T_Alloc2, T_Copy2, T_Assign2> SrcBuffer;
-    PMACC_CASSERT_MSG(
-            Can_Only_Gather_Host_Memory,
-            boost::is_same<typename DestBuffer::memoryTag, allocator::tag::host>::value &&
-            boost::is_same<typename SrcBuffer::memoryTag, allocator::tag::host>::value);
-
-    const bool useTmpSrc = source.isContigousMemory();
-    int numRanks; MPI_Comm_size(this->comm, &numRanks);
-    std::vector<Type> tmpDest(root() ? numRanks * source.size().productOfComponents() : 0);
-    container::HostBuffer<Type, memDim> tmpSrc(useTmpSrc ? source.size() : math::Size_t<memDim>::create(0));
-    if(useTmpSrc)
-        tmpSrc = source; /* Mem copy */
-
-    // Get number of elements for each source buffer
-    std::vector<Size_t<memDim> > srcBufferSizes(numRanks);
-    Size_t<memDim> srcBufferSize = source.size();
-    // avoid deadlock between not finished pmacc tasks and mpi blocking collectives
-    __getTransactionEvent().waitForFinished();
-    MPI_CHECK(MPI_Gather(
-        static_cast<void*>(&srcBufferSize),
-        sizeof(Size_t<memDim>),
-        MPI_CHAR,
-        static_cast<void*>(srcBufferSizes.data()),
-        sizeof(Size_t<memDim>),
-        MPI_CHAR,
-        0, this->comm));
-
-    // 1D offsets in destination buffer
-    std::vector<size_t> srcBufferOffsets1D(numRanks);
-    std::vector<size_t> srcBufferSizes1D(numRanks);
-    std::vector<int> srcBufferOffsets1D_char(numRanks); // `MPI_Gatherv` demands `int*`
-    std::vector<int> srcBufferSizes1D_char(numRanks);
-
-    if(this->root())
+    namespace algorithm
     {
-        for(int i = 0; i < numRanks; i++)
-            srcBufferSizes1D[i] = srcBufferSizes[i].productOfComponents();
-        std::vector<size_t> partialSum(numRanks);
-        std::partial_sum(srcBufferSizes1D.begin(), srcBufferSizes1D.end(), partialSum.begin());
-        srcBufferOffsets1D[0] = 0;
-        std::copy(partialSum.begin(), partialSum.end()-1, srcBufferOffsets1D.begin()+1);
-
-        for(int i = 0; i < numRanks; i++)
+        namespace mpi
         {
-            srcBufferOffsets1D_char[i] = static_cast<int>(srcBufferOffsets1D[i]) * sizeof(Type);
-            srcBufferSizes1D_char[i] = static_cast<int>(srcBufferSizes1D[i]) * sizeof(Type);
-        }
-    }
-
-    // avoid deadlock between not finished pmacc tasks and mpi blocking collectives
-    __getTransactionEvent().waitForFinished();
-    // gather
-    MPI_CHECK(MPI_Gatherv(
-               useTmpSrc ? static_cast<void*>(tmpSrc.getDataPointer()) : static_cast<void*>(source.getDataPointer()),
-               source.size().productOfComponents() * sizeof(Type),
-               MPI_CHAR,
-               root() ? static_cast<void*>(tmpDest.data()) : nullptr,
-               srcBufferSizes1D_char.data(),
-               srcBufferOffsets1D_char.data(),
-               MPI_CHAR,
-               0, this->comm));
-    if(!root()) return;
-
-    CopyToDest()(*this, dest, tmpDest, dir, srcBufferSizes, srcBufferOffsets1D);
-}
-
-} // mpi
-} // algorithm
-} // pmacc
+            namespace GatherHelper
+            {
+                template<int dim, typename Type>
+                struct ContiguousPitch // byte pitches of a densely packed dim-dimensional block of Type
+                {
+                    math::Size_t<dim - 1> operator()(const math::Size_t<dim>& size)
+                    {
+                        math::Size_t<dim - 1> pitch;
+
+                        pitch[0] = size[0] * sizeof(Type); // bytes per line along axis 0
+                        for(int axis = 1; axis < dim - 1; axis++)
+                            pitch[axis] = pitch[axis - 1] * size[axis]; // previous pitch times extent of this axis
+
+                        return pitch;
+                    }
+                };
+
+                template<typename Type>
+                struct ContiguousPitch<DIM1, Type> // DIM1 specialization: there is no pitch to compute
+                {
+                    math::Size_t<0> operator()(const math::Size_t<DIM1>&)
+                    {
+                        return math::Size_t<0>();
+                    }
+                };
+
+            } // namespace GatherHelper
+
+            template<int dim> // collects every rank's position and builds `comm` from the ranks inside p_zone
+            Gather<dim>::Gather(const zone::SphericZone<dim>& p_zone) : comm(MPI_COMM_NULL)
+            {
+                using namespace pmacc::math;
+
+                pmacc::GridController<dim>& con = pmacc::Environment<dim>::get().GridController();
+                Int<dim> pos = con.getPosition(); // this rank's position as reported by the grid controller
+
+                int numWorldRanks;
+                MPI_Comm_size(MPI_COMM_WORLD, &numWorldRanks); // NOTE(review): result not passed to MPI_CHECK
+                std::vector<Int<dim>> allPositions(numWorldRanks);
+
+                // avoid deadlock between unfinished pmacc tasks and blocking MPI collectives
+                __getTransactionEvent().waitForFinished();
+                MPI_CHECK(MPI_Allgather(
+                    static_cast<void*>(&pos),
+                    sizeof(Int<dim>),
+                    MPI_CHAR, // positions are exchanged as raw bytes
+                    static_cast<void*>(allPositions.data()),
+                    sizeof(Int<dim>),
+                    MPI_CHAR,
+                    MPI_COMM_WORLD));
+
+                std::vector<int> new_ranks; // world ranks that take part in the gather
+                int myWorldId;
+                MPI_Comm_rank(MPI_COMM_WORLD, &myWorldId); // NOTE(review): result not passed to MPI_CHECK
+
+                this->m_participate = false;
+                for(int i = 0; i < static_cast<int>(allPositions.size()); i++)
+                {
+                    Int<dim> pos = allPositions[i]; // shadows the outer `pos`
+                    if(!p_zone.within(pos))
+                        continue;
+
+                    new_ranks.push_back(i); // index in allPositions is the rank in MPI_COMM_WORLD
+                    this->positions.push_back(allPositions[i]);
+                    if(i == myWorldId)
+                        this->m_participate = true;
+                }
+                MPI_Group world_group, new_group;
+
+                // avoid deadlock between unfinished pmacc tasks and blocking MPI collectives
+                __getTransactionEvent().waitForFinished();
+                MPI_CHECK(MPI_Comm_group(MPI_COMM_WORLD, &world_group)); // NOTE(review): world_group is never freed
+                MPI_CHECK(MPI_Group_incl(world_group, new_ranks.size(), new_ranks.data(), &new_group));
+                MPI_CHECK(MPI_Comm_create(MPI_COMM_WORLD, new_group, &this->comm)); // collective over the world
+                MPI_CHECK(MPI_Group_free(&new_group));
+            }
+
+            template<int dim> // frees `comm` unless it is MPI_COMM_NULL
+            Gather<dim>::~Gather()
+            {
+                if(this->comm != MPI_COMM_NULL)
+                {
+                    // avoid deadlock between unfinished pmacc tasks and blocking MPI collectives
+                    __getTransactionEvent().waitForFinished();
+                    MPI_CHECK_NO_EXCEPT(MPI_Comm_free(&this->comm));
+                }
+            }
+
+            template<int dim> // true iff this process has rank 0 in `comm`
+            bool Gather<dim>::root() const
+            {
+                if(!this->m_participate) // non-participants get a message on stderr and `false`
+                {
+                    std::cerr << "error[mpi::Gather::root()]: this process does not participate in gathering.\n";
+                    return false;
+                }
+                int myId;
+                MPI_Comm_rank(this->comm, &myId); // NOTE(review): result not passed to MPI_CHECK
+                return myId == 0;
+            }
+
+            template<int dim> // rank of this process within `comm`
+            int Gather<dim>::rank() const
+            {
+                if(!this->m_participate) // non-participants get a message on stderr and -1
+                {
+                    std::cerr << "error[mpi::Gather::rank()]: this process does not participate in gathering.\n";
+                    return -1;
+                }
+                int myId;
+                MPI_Comm_rank(this->comm, &myId); // NOTE(review): result not passed to MPI_CHECK
+                return myId;
+            }
+
+            template<int dim> // scatters the flat receive buffer `tmpDest` into the n-dimensional buffer `dest`
+            template<typename Type, int memDim, class T_Alloc, class T_Copy, class T_Assign>
+            void Gather<dim>::CopyToDest::operator()(
+                const Gather<dim>& gather, // supplies the positions of all participating ranks
+                container::CartBuffer<Type, memDim, T_Alloc, T_Copy, T_Assign>& dest, // n-dimensional target
+                std::vector<Type>& tmpDest, // flat buffer holding the blocks of all ranks
+                int dir, // dir + 1 is handed to shrink<memDim>() below
+                const std::vector<math::Size_t<memDim>>& srcSizes, // extent of each rank's block
+                const std::vector<size_t>& srcOffsets1D) const // start of each rank's block in tmpDest (elements)
+            {
+                using namespace math;
+
+                int numRanks = static_cast<int>(gather.positions.size());
+
+                // calculate sizes per axis in destination buffer
+                std::vector<size_t> sizesPerAxis[memDim];
+
+                // sizes per axis: entry [axis][p] is the extent along `axis` of a block at coordinate p
+                for(int i = 0; i < numRanks; i++)
+                {
+                    Int<dim> pos = gather.positions[i];
+                    Int<memDim> posInMem = pos.template shrink<memDim>(dir + 1);
+                    for(int axis = 0; axis < memDim; axis++)
+                    {
+                        size_t posOnAxis = static_cast<size_t>(posInMem[axis]);
+                        if(posOnAxis >= sizesPerAxis[axis].size())
+                            sizesPerAxis[axis].resize(posOnAxis + 1);
+                        sizesPerAxis[axis][posOnAxis] = srcSizes[i][axis]; // last rank at this coordinate wins
+                    }
+                }
+
+                // calculate offsets per axis in destination buffer
+                std::vector<size_t> offsetsPerAxis[memDim];
+
+                // offsets per axis: exclusive prefix sum of the sizes (written assuming at least one entry)
+                for(int axis = 0; axis < memDim; axis++)
+                {
+                    offsetsPerAxis[axis].resize(sizesPerAxis[axis].size());
+                    std::vector<size_t> partialSum(offsetsPerAxis[axis].size());
+                    std::partial_sum(sizesPerAxis[axis].begin(), sizesPerAxis[axis].end(), partialSum.begin());
+                    offsetsPerAxis[axis][0] = 0; // NOTE(review): out of bounds if no rank participates
+                    std::copy(partialSum.begin(), partialSum.end() - 1, offsetsPerAxis[axis].begin() + 1);
+                }
+
+                // copy from one dimensional mpi buffer to n dimensional destination buffer
+                for(int i = 0; i < numRanks; i++)
+                {
+                    Int<dim> pos = gather.positions[i];
+                    Int<memDim> posInMem = pos.template shrink<memDim>(dir + 1);
+                    Int<memDim> ndim_offset; // where this rank's block starts inside dest
+                    for(int axis = 0; axis < memDim; axis++)
+                        ndim_offset[axis] = offsetsPerAxis[axis][posInMem[axis]];
+
+                    // calculate srcPitch (the block is stored contiguously inside tmpDest)
+                    Size_t<memDim - 1> srcPitch = GatherHelper::ContiguousPitch<memDim, Type>()(srcSizes[i]);
+
+                    cuplaWrapper::Memcopy<memDim>()(
+                        &(*dest.origin()(ndim_offset)),
+                        dest.getPitch(),
+                        tmpDest.data() + srcOffsets1D[i], // pointer arithmetic in units of Type
+                        srcPitch,
+                        srcSizes[i],
+                        cuplaWrapper::flags::Memcopy::hostToHost);
+                }
+            }
+
+            template<int dim> // gathers every participant's `source` into `dest` on rank 0 of `comm` (host only)
+            template<
+                typename Type,
+                int memDim,
+                class T_Alloc,
+                class T_Copy,
+                class T_Assign,
+                class T_Alloc2,
+                class T_Copy2,
+                class T_Assign2>
+            void Gather<dim>::operator()(
+                container::CartBuffer<Type, memDim, T_Alloc, T_Copy, T_Assign>& dest, // written on the root only
+                container::CartBuffer<Type, memDim, T_Alloc2, T_Copy2, T_Assign2>& source, // this rank's data
+                int dir) const // forwarded unchanged to CopyToDest
+            {
+                using namespace pmacc::math;
+
+                if(!this->m_participate) // ranks outside the zone do nothing
+                    return;
+                typedef container::CartBuffer<Type, memDim, T_Alloc, T_Copy, T_Assign> DestBuffer;
+                typedef container::CartBuffer<Type, memDim, T_Alloc2, T_Copy2, T_Assign2> SrcBuffer;
+                PMACC_CASSERT_MSG(
+                    Can_Only_Gather_Host_Memory,
+                    boost::is_same<typename DestBuffer::memoryTag, allocator::tag::host>::value
+                        && boost::is_same<typename SrcBuffer::memoryTag, allocator::tag::host>::value);
+
+                const bool useTmpSrc = source.isContigousMemory(); // NOTE(review): looks inverted - confirm
+                int numRanks;
+                MPI_Comm_size(this->comm, &numRanks); // NOTE(review): result not passed to MPI_CHECK
+                std::vector<Type> tmpDest(root() ? numRanks * source.size().productOfComponents() : 0);
+                container::HostBuffer<Type, memDim> tmpSrc(
+                    useTmpSrc ? source.size() : math::Size_t<memDim>::create(0));
+                if(useTmpSrc)
+                    tmpSrc = source; /* Mem copy */
+
+                // Get each rank's buffer extent; NOTE(review): tmpDest above assumes all equal source.size()
+                std::vector<Size_t<memDim>> srcBufferSizes(numRanks);
+                Size_t<memDim> srcBufferSize = source.size();
+                // avoid deadlock between unfinished pmacc tasks and blocking MPI collectives
+                __getTransactionEvent().waitForFinished();
+                MPI_CHECK(MPI_Gather(
+                    static_cast<void*>(&srcBufferSize),
+                    sizeof(Size_t<memDim>),
+                    MPI_CHAR, // extents are exchanged as raw bytes
+                    static_cast<void*>(srcBufferSizes.data()),
+                    sizeof(Size_t<memDim>),
+                    MPI_CHAR,
+                    0, // root rank within `comm`
+                    this->comm));
+
+                // 1D offsets in destination buffer (in elements; the *_char vectors hold the same in bytes)
+                std::vector<size_t> srcBufferOffsets1D(numRanks);
+                std::vector<size_t> srcBufferSizes1D(numRanks);
+                std::vector<int> srcBufferOffsets1D_char(numRanks); // `MPI_Gatherv` demands `int*`
+                std::vector<int> srcBufferSizes1D_char(numRanks);
+
+                if(this->root()) // the gathered extents are only filled in on the root
+                {
+                    for(int i = 0; i < numRanks; i++)
+                        srcBufferSizes1D[i] = srcBufferSizes[i].productOfComponents();
+                    std::vector<size_t> partialSum(numRanks);
+                    std::partial_sum(srcBufferSizes1D.begin(), srcBufferSizes1D.end(), partialSum.begin());
+                    srcBufferOffsets1D[0] = 0; // offsets are the exclusive prefix sum of the sizes
+                    std::copy(partialSum.begin(), partialSum.end() - 1, srcBufferOffsets1D.begin() + 1);
+
+                    for(int i = 0; i < numRanks; i++) // NOTE(review): cast to int happens before scaling to bytes
+                    {
+                        srcBufferOffsets1D_char[i] = static_cast<int>(srcBufferOffsets1D[i]) * sizeof(Type);
+                        srcBufferSizes1D_char[i] = static_cast<int>(srcBufferSizes1D[i]) * sizeof(Type);
+                    }
+                }
+
+                // avoid deadlock between unfinished pmacc tasks and blocking MPI collectives
+                __getTransactionEvent().waitForFinished();
+                // gather the raw bytes of all source buffers into tmpDest on the root
+                MPI_CHECK(MPI_Gatherv(
+                    useTmpSrc ? static_cast<void*>(tmpSrc.getDataPointer())
+                              : static_cast<void*>(source.getDataPointer()),
+                    source.size().productOfComponents() * sizeof(Type),
+                    MPI_CHAR,
+                    root() ? static_cast<void*>(tmpDest.data()) : nullptr,
+                    srcBufferSizes1D_char.data(),
+                    srcBufferOffsets1D_char.data(),
+                    MPI_CHAR,
+                    0,
+                    this->comm));
+                if(!root())
+                    return;
+
+                CopyToDest()(*this, dest, tmpDest, dir, srcBufferSizes, srcBufferOffsets1D);
+            }
+
+        } // namespace mpi
+    } // namespace algorithm
+} // namespace pmacc
diff --git a/include/pmacc/cuSTL/algorithm/mpi/Reduce.hpp b/include/pmacc/cuSTL/algorithm/mpi/Reduce.hpp
index 6037987eaa..88a0409e04 100644
--- a/include/pmacc/cuSTL/algorithm/mpi/Reduce.hpp
+++ b/include/pmacc/cuSTL/algorithm/mpi/Reduce.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau
+/* Copyright 2013-2021 Heiko Burau
  *
  * This file is part of PMacc.
  *
@@ -29,66 +29,70 @@
 
 namespace pmacc
 {
-namespace algorithm
-{
-namespace mpi
-{
+    namespace algorithm
+    {
+        namespace mpi
+        {
+            /** Reduce algorithm for mpi
+             *
+             * \tparam dim dimension of the mpi node volume which has to be reduced.
+             *
+             * This algorithm reduces node-wise. For each node you pass a data container as source
+             * and another container of the same size as destination. The result is stored in
+             * the destination container of the root node.
+             *
+             * The data values of the container are reduced independently of each other.
+             *
+             * The dimension of the container need not be the same as dim.
+             *
+             */
+            template<int dim>
+            class Reduce
+            {
+            private:
+                MPI_Comm comm;
+                bool m_participate;
 
-/** Reduce algorithm for mpi
- *
- * \tparam dim dimension of the mpi node volume which has to be reduced.
- *
- * This algorithm reduces node-wise. For each node you pass a data container as source
- * and another container of the same size as destination. The result is stored in
- * the destination container of the root node.
- *
- * The data values of the container are reduced independently of each other.
- *
- * The dimension of the container need not be the same as dim.
- *
- */
-template<int dim>
-class Reduce
-{
-private:
-    MPI_Comm comm;
-    bool m_participate;
-public:
-    /** constructor
-     *
-     * \param zone The zone specifies which mpi-nodes participate in the reduce operation.
-     * \param setThisAsRoot Set this node explicitly as root. May only be true for one node.
-     *
-     * if setThisAsRoot is not set mpi chooses the root node.
-     *
-     */
-    Reduce(const zone::SphericZone<dim>& zone, bool setThisAsRoot = false);
-    ~Reduce();
+            public:
+                /** constructor
+                 *
+                 * \param zone The zone specifies which mpi-nodes participate in the reduce operation.
+                 * \param setThisAsRoot Set this node explicitly as root. May only be true for one node.
+                 *
+                 * if setThisAsRoot is not set mpi chooses the root node.
+                 *
+                 */
+                Reduce(const zone::SphericZone<dim>& zone, bool setThisAsRoot = false);
+                ~Reduce();
 
-    /* execute the algorithm
-     *
-     * \param dest destination container
-     * \param src source container
-     * \param ExprOrFunctor functor with two arguments which returns the result of the reduce operation.
-     *
-     * Since only the functor's type is given, the functor must have a standart constructor.
-     *
-     */
-    template<typename Type, int conDim, typename ExprOrFunctor>
-    void operator()(container::HostBuffer<Type, conDim>& dest,
+                /* execute the algorithm
+                 *
+                 * \param dest destination container
+                 * \param src source container
+                 * \param ExprOrFunctor functor with two arguments which returns the result of the reduce operation.
+                 *
+                 * Since only the functor's type is given, the functor must have a default constructor.
+                 *
+                 */
+                template<typename Type, int conDim, typename ExprOrFunctor>
+                void operator()(
+                    container::HostBuffer<Type, conDim>& dest,
                     const container::HostBuffer<Type, conDim>& src,
                     ExprOrFunctor) const;
 
-    // Returns whether this node is within the zone.
-    inline bool participate() const {return m_participate;}
-    // Returns whether this node is the root node.
-    inline bool root() const;
-    // Returns the mpi rank of this node.
-    inline int rank() const;
-};
+                // Returns whether this node is within the zone.
+                inline bool participate() const
+                {
+                    return m_participate;
+                }
+                // Returns whether this node is the root node.
+                inline bool root() const;
+                // Returns the mpi rank of this node.
+                inline int rank() const;
+            };
 
-} // mpi
-} // algorithm
-} // pmacc
+        } // namespace mpi
+    } // namespace algorithm
+} // namespace pmacc
 
 #include "pmacc/cuSTL/algorithm/mpi/Reduce.tpp"
diff --git a/include/pmacc/cuSTL/algorithm/mpi/Reduce.tpp b/include/pmacc/cuSTL/algorithm/mpi/Reduce.tpp
index 5afee1e020..f82afcd54e 100644
--- a/include/pmacc/cuSTL/algorithm/mpi/Reduce.tpp
+++ b/include/pmacc/cuSTL/algorithm/mpi/Reduce.tpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Axel Huebl
+/* Copyright 2013-2021 Heiko Burau, Axel Huebl
  *
  * This file is part of PMacc.
  *
@@ -33,138 +33,157 @@
 
 namespace pmacc
 {
-namespace algorithm
-{
-namespace mpi
-{
-
-template<int dim>
-Reduce<dim>::Reduce(const zone::SphericZone<dim>& p_zone, bool setThisAsRoot) : comm(MPI_COMM_NULL)
-{
-    using namespace math;
-
-    auto& con = Environment<dim>::get().GridController();
-
-    typedef std::pair<Int<dim>, bool> PosFlag;
-    PosFlag posFlag;
-    posFlag.first = (Int<dim>)con.getPosition();
-    posFlag.second = setThisAsRoot;
-
-    int numWorldRanks; MPI_Comm_size(MPI_COMM_WORLD, &numWorldRanks);
-    std::vector<PosFlag> allPositionsFlags(numWorldRanks);
-
-    // avoid deadlock between not finished pmacc tasks and mpi blocking collectives
-    __getTransactionEvent().waitForFinished();
-    MPI_CHECK(MPI_Allgather((void*)&posFlag, sizeof(PosFlag), MPI_CHAR,
-                  (void*)allPositionsFlags.data(), sizeof(PosFlag), MPI_CHAR,
-                  MPI_COMM_WORLD));
-
-    std::vector<int> new_ranks;
-    int myWorldId; MPI_Comm_rank(MPI_COMM_WORLD, &myWorldId);
-
-    this->m_participate = false;
-    for(int i = 0; i < (int)allPositionsFlags.size(); i++)
-    {
-        Int<dim> pos = allPositionsFlags[i].first;
-        bool flag = allPositionsFlags[i].second;
-        if(!p_zone.within(pos)) continue;
-
-        new_ranks.push_back(i);
-        //if rank i is supposed to be the new root put him at the front
-        if(flag) std::swap(new_ranks.front(), new_ranks.back());
-        if(i == myWorldId) this->m_participate = true;
-    }
-
-    MPI_Group world_group = MPI_GROUP_NULL;
-    MPI_Group new_group = MPI_GROUP_NULL;
-
-    // avoid deadlock between not finished pmacc tasks and mpi blocking collectives
-    __getTransactionEvent().waitForFinished();
-    MPI_CHECK(MPI_Comm_group(MPI_COMM_WORLD, &world_group));
-    MPI_CHECK(MPI_Group_incl(world_group, new_ranks.size(), &(new_ranks.front()), &new_group));
-    MPI_CHECK(MPI_Comm_create(MPI_COMM_WORLD, new_group, &this->comm));
-    MPI_CHECK(MPI_Group_free(&new_group));
-    MPI_CHECK(MPI_Group_free(&world_group));
-}
-
-template<int dim>
-Reduce<dim>::~Reduce()
-{
-    if(this->comm != MPI_COMM_NULL)
-    {
-        // avoid deadlock between not finished pmacc tasks and mpi blocking collectives
-        __getTransactionEvent().waitForFinished();
-        MPI_CHECK_NO_EXCEPT(MPI_Comm_free(&this->comm));
-    }
-}
-
-template<int dim>
-bool Reduce<dim>::root() const
-{
-    if(!this->m_participate)
+    namespace algorithm
     {
-        std::cerr << "error[mpi::Reduce::root()]: this process does not participate in reducing.\n";
-        return false;
-    }
-    int myId; MPI_Comm_rank(this->comm, &myId);
-    return myId == 0;
-}
-
-template<int dim>
-int Reduce<dim>::rank() const
-{
-    if(!this->m_participate)
-    {
-        std::cerr << "error[mpi::Reduce::rank()]: this process does not participate in reducing.\n";
-        return -1;
-    }
-    int myId; MPI_Comm_rank(this->comm, &myId);
-    return myId;
-}
-
-namespace detail
-{
-
-template<typename Functor, typename type>
-struct MPI_User_Op
-{
-    static void callback(void* invec, void* inoutvec, int *len, MPI_Datatype*)
-    {
-        Functor functor;
-        type* inoutvec_t = (type*)inoutvec;
-        type* invec_t = (type*)invec;
-
-        int size = (*len)/sizeof(type);
-        for(int i = 0; i < size; i++)
+        namespace mpi
         {
-            inoutvec_t[i] = functor(inoutvec_t[i], invec_t[i]);
-        }
-    }
-};
-
-} // detail
-
-template<int dim>
-template<typename Type, int conDim, typename Functor>
-void Reduce<dim>::operator()
-                   (container::HostBuffer<Type, conDim>& dest,
-                    const container::HostBuffer<Type, conDim>& src,
-                    Functor) const
-{
-    if(!this->m_participate) return;
-
-    // avoid deadlock between not finished pmacc tasks and mpi blocking collectives
-    __getTransactionEvent().waitForFinished();
-
-    MPI_Op user_op;
-    MPI_CHECK(MPI_Op_create(&detail::MPI_User_Op<Functor, Type>::callback, 1, &user_op));
-
-    MPI_CHECK(MPI_Reduce(&(*src.origin()), &(*dest.origin()), sizeof(Type) * dest.size().productOfComponents(),
-        MPI_CHAR, user_op, 0, this->comm));
-
-    MPI_CHECK(MPI_Op_free(&user_op));
-}
-
-} // mpi
-} // algorithm
-} // pmacc
+            template<int dim>
+            Reduce<dim>::Reduce(const zone::SphericZone<dim>& p_zone, bool setThisAsRoot) : comm(MPI_COMM_NULL)
+            {
+                using namespace math;
+
+                auto& con = Environment<dim>::get().GridController();
+                // value exchanged between all ranks: own grid position plus the root request flag
+                typedef std::pair<Int<dim>, bool> PosFlag;
+                PosFlag posFlag;
+                posFlag.first = (Int<dim>) con.getPosition();
+                posFlag.second = setThisAsRoot;
+
+                int numWorldRanks;
+                MPI_Comm_size(MPI_COMM_WORLD, &numWorldRanks);
+                std::vector<PosFlag> allPositionsFlags(numWorldRanks);
+
+                // avoid deadlock between not finished pmacc tasks and mpi blocking collectives
+                __getTransactionEvent().waitForFinished();
+                MPI_CHECK(MPI_Allgather(
+                    (void*) &posFlag,
+                    sizeof(PosFlag),
+                    MPI_CHAR,
+                    (void*) allPositionsFlags.data(),
+                    sizeof(PosFlag),
+                    MPI_CHAR,
+                    MPI_COMM_WORLD));
+                // every rank now knows all positions and builds the same list of world ranks inside the zone
+                std::vector<int> new_ranks;
+                int myWorldId;
+                MPI_Comm_rank(MPI_COMM_WORLD, &myWorldId);
+
+                this->m_participate = false;
+                for(int i = 0; i < (int) allPositionsFlags.size(); i++)
+                {
+                    Int<dim> pos = allPositionsFlags[i].first;
+                    bool flag = allPositionsFlags[i].second;
+                    if(!p_zone.within(pos))
+                        continue;
+
+                    new_ranks.push_back(i);
+                    // a rank that requested to be root is swapped to the front (becomes rank 0 of the new group)
+                    if(flag)
+                        std::swap(new_ranks.front(), new_ranks.back());
+                    if(i == myWorldId)
+                        this->m_participate = true;
+                }
+
+                // avoid deadlock between not finished pmacc tasks and mpi blocking collectives
+                __getTransactionEvent().waitForFinished();
+                if(new_ranks.size()) // empty zone: comm stays MPI_COMM_NULL
+                {
+                    MPI_Group world_group = MPI_GROUP_NULL;
+                    MPI_Group new_group = MPI_GROUP_NULL;
+                    MPI_CHECK(MPI_Comm_group(MPI_COMM_WORLD, &world_group));
+                    MPI_CHECK(MPI_Group_incl(world_group, new_ranks.size(), &(new_ranks.front()), &new_group));
+                    MPI_CHECK(MPI_Comm_create(MPI_COMM_WORLD, new_group, &this->comm));
+                    MPI_CHECK(MPI_Group_free(&new_group));
+                    MPI_CHECK(MPI_Group_free(&world_group));
+                }
+            }
+
+            template<int dim>
+            Reduce<dim>::~Reduce()
+            {
+                // nothing to release when no communicator was created for this object
+                if(this->comm == MPI_COMM_NULL)
+                    return;
+                // avoid deadlock between not finished pmacc tasks and mpi blocking collectives
+                __getTransactionEvent().waitForFinished();
+                MPI_CHECK_NO_EXCEPT(MPI_Comm_free(&this->comm));
+            }
+
+            template<int dim>
+            bool Reduce<dim>::root() const
+            {
+                if(this->m_participate)
+                {
+                    int commRank;
+                    MPI_Comm_rank(this->comm, &commRank);
+                    return commRank == 0; // rank 0 of the zone communicator is the root
+                }
+                std::cerr << "error[mpi::Reduce::root()]: this process does not participate in reducing.\n";
+                return false;
+            }
+
+            template<int dim>
+            int Reduce<dim>::rank() const
+            {
+                if(this->m_participate)
+                {
+                    int commRank;
+                    MPI_Comm_rank(this->comm, &commRank);
+                    return commRank; // rank within the zone communicator, not within MPI_COMM_WORLD
+                }
+                std::cerr << "error[mpi::Reduce::rank()]: this process does not participate in reducing.\n";
+                return -1;
+            }
+
+            namespace detail
+            {
+                /** Adapter exposing a default-constructed Functor as an MPI user reduction callback. */
+                template<typename Functor, typename type>
+                struct MPI_User_Op
+                {
+                    static void callback(void* invec, void* inoutvec, int* len, MPI_Datatype*)
+                    {
+                        // Reduce::operator() reduces MPI_CHAR data, so *len is a byte count
+                        const int numElements = (*len) / sizeof(type);
+                        type* const accumulated = (type*) inoutvec;
+                        type* const incoming = (type*) invec;
+
+                        Functor combine;
+                        for(int idx = 0; idx < numElements; ++idx)
+                            accumulated[idx] = combine(accumulated[idx], incoming[idx]);
+                    }
+                };
+
+            } // namespace detail
+
+            template<int dim>
+            template<typename Type, int conDim, typename Functor>
+            void Reduce<dim>::operator()(
+                container::HostBuffer<Type, conDim>& dest,
+                const container::HostBuffer<Type, conDim>& src,
+                Functor) const
+            {
+                if(!this->m_participate)
+                    return;
+
+                // avoid deadlock between not finished pmacc tasks and mpi blocking collectives
+                __getTransactionEvent().waitForFinished();
+                // expose the functor type as an MPI operation; the flag 1 marks it as commutative
+                MPI_Op user_op;
+                MPI_CHECK(MPI_Op_create(&detail::MPI_User_Op<Functor, Type>::callback, 1, &user_op));
+                // reduce the raw bytes (MPI_CHAR) of src into dest on rank 0 of the zone communicator
+                MPI_CHECK(MPI_Reduce(
+                    &(*src.origin()),
+                    &(*dest.origin()),
+                    sizeof(Type) * dest.size().productOfComponents(),
+                    MPI_CHAR,
+                    user_op,
+                    0,
+                    this->comm));
+
+                MPI_CHECK(MPI_Op_free(&user_op));
+            }
+
+        } // namespace mpi
+    } // namespace algorithm
+} // namespace pmacc
diff --git a/include/pmacc/cuSTL/container/CartBuffer.hpp b/include/pmacc/cuSTL/container/CartBuffer.hpp
index 0dcfbdb2b0..be158e66a5 100644
--- a/include/pmacc/cuSTL/container/CartBuffer.hpp
+++ b/include/pmacc/cuSTL/container/CartBuffer.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera, Benjamin Worpitz
+/* Copyright 2013-2021 Heiko Burau, Rene Widera, Benjamin Worpitz
  *
  * This file is part of PMacc.
  *
@@ -42,98 +42,111 @@
 
 namespace pmacc
 {
-namespace container
-{
+    namespace container
+    {
+        namespace bmpl = boost::mpl;
 
-namespace bmpl = boost::mpl;
+        /** Implementation of a box-shaped (cartesian) container type.
+         * Holds a reference counter so one can have several containers sharing one buffer.
+         * Is designed to be an RAII class, but does not fully obey the RAII rules (see copy-ctor).
+         * The way memory gets allocated, copied and assigned is
+         * fully controlled by three policy classes.
+         * \tparam Type type of a single value
+         * \tparam T_dim dimension of the container
+         * \tparam Allocator allocates and releases memory
+         * \tparam Copier copies one memory buffer to another
+         * \tparam Assigner assigns a value to every datum of a memory buffer
+         *
+         * Assigner policy has to support `apply2`: Assigner<Dim, CartBuffer>
+         *
+         */
+        template<
+            typename Type,
+            int T_dim,
+            typename Allocator = allocator::EmptyAllocator,
+            typename Copier = mpl::void_,
+            typename Assigner = bmpl::vector<bmpl::_1, bmpl::_2>>
+        class CartBuffer
+            : public
+              /* "Curiously recurring template pattern" */
+              bmpl::apply<Assigner, bmpl::int_<T_dim>, CartBuffer<Type, T_dim, Allocator, Copier, Assigner>>::type
+        {
+        public:
+            typedef Type type;
+            static constexpr int dim = T_dim;
+            typedef cursor::BufferCursor<Type, T_dim> Cursor;
+            typedef typename Allocator::tag memoryTag;
+            typedef math::Size_t<T_dim> SizeType;
+            typedef math::Size_t<T_dim - 1> PitchType;
 
-/** Implementation of a box-shaped (cartesian) container type.
- * Holds a reference counter so one can have several containers sharing one buffer.
- * Is designed to be an RAII class, but does not fully obey the RAII rules (see copy-ctor).
- * The way memory gets allocated, copied and assigned is
- * fully controlled by three policy classes.
- * \tparam Type type of a single value
- * \tparam T_dim dimension of the container
- * \tparam Allocator allocates and releases memory
- * \tparam Copier copies one memory buffer to another
- * \tparam Assigner assigns a value to every datum of a memory buffer
- *
- * Assigner policy has to support `apply2`: Assigner<Dim, CartBuffer>
- *
- */
-template<typename Type, int T_dim, typename Allocator = allocator::EmptyAllocator,
-                                  typename Copier = mpl::void_,
-                                  typename Assigner = bmpl::vector<bmpl::_1, bmpl::_2> >
-class CartBuffer : public
-    /* "Curiously recurring template pattern" */
-    bmpl::apply<Assigner, bmpl::int_<T_dim>, CartBuffer<Type, T_dim, Allocator, Copier, Assigner> >::type
-{
-public:
-    typedef Type type;
-    static constexpr int dim = T_dim;
-    typedef cursor::BufferCursor<Type, T_dim> Cursor;
-    typedef typename Allocator::tag memoryTag;
-    typedef math::Size_t<T_dim> SizeType;
-    typedef math::Size_t<T_dim-1> PitchType;
-public:
-    Type* dataPointer;
-    int* refCount;
-    SizeType _size;
-    PitchType pitch;
-    HDINLINE void init();
-    HDINLINE void exit();
-    HDINLINE CartBuffer() : refCount(nullptr) {}
+        public:
+            Type* dataPointer;
+            int* refCount;
+            SizeType _size;
+            PitchType pitch;
+            HDINLINE void init();
+            HDINLINE void exit();
+            HDINLINE CartBuffer() : refCount(nullptr)
+            {
+            }
 
-public:
-    HDINLINE CartBuffer(const math::Size_t<T_dim>& size);
-    HDINLINE CartBuffer(size_t x);
-    HDINLINE CartBuffer(size_t x, size_t y);
-    HDINLINE CartBuffer(size_t x, size_t y, size_t z);
-    /* the copy constructor just increments the reference counter but does not copy memory */
-    HDINLINE CartBuffer(const CartBuffer& other);
-    HDINLINE CartBuffer(CartBuffer&& other);
-    HDINLINE ~CartBuffer();
+        public:
+            HDINLINE CartBuffer(const math::Size_t<T_dim>& size);
+            HDINLINE CartBuffer(size_t x);
+            HDINLINE CartBuffer(size_t x, size_t y);
+            HDINLINE CartBuffer(size_t x, size_t y, size_t z);
+            /* the copy constructor just increments the reference counter but does not copy memory */
+            HDINLINE CartBuffer(const CartBuffer& other);
+            HDINLINE CartBuffer(CartBuffer&& other);
+            HDINLINE ~CartBuffer();
 
-    /* copy another container into this one (hard data copy) */
-    HDINLINE CartBuffer&
-    operator=(const CartBuffer& rhs);
-    /* use the memory from another container and increment the reference counter */
-    HDINLINE CartBuffer&
-    operator=(CartBuffer&& rhs);
+            /* copy another container into this one (hard data copy) */
+            HDINLINE CartBuffer& operator=(const CartBuffer& rhs);
+            /* use the memory from another container and increment the reference counter */
+            HDINLINE CartBuffer& operator=(CartBuffer&& rhs);
 
-    /* get a view. Views represent a clipped area of the container.
-     * \param a Top left corner of the view, inside the view.
-     * Negative values are remapped, e.g. Int<2>(-1,-2) == Int<2>(width-1, height-2)
-     * \param b Bottom right corner of the view, outside the view.
-     * Values are remapped, so that Int<2>(0,0) == Int<2>(width, height)
-     */
-    HDINLINE View<CartBuffer>
-        view(math::Int<T_dim> a = math::Int<T_dim>(0),
-             math::Int<T_dim> b = math::Int<T_dim>(0)) const;
+            /* get a view. Views represent a clipped area of the container.
+             * \param a Top left corner of the view, inside the view.
+             * Negative values are remapped, e.g. Int<2>(-1,-2) == Int<2>(width-1, height-2)
+             * \param b Bottom right corner of the view, outside the view.
+             * Values are remapped, so that Int<2>(0,0) == Int<2>(width, height)
+             */
+            HDINLINE View<CartBuffer> view(
+                math::Int<T_dim> a = math::Int<T_dim>(0),
+                math::Int<T_dim> b = math::Int<T_dim>(0)) const;
 
-    /* get a cursor at the container's origin cell */
-    HDINLINE cursor::BufferCursor<Type, T_dim> origin() const;
-    /* get a safe cursor at the container's origin cell */
-    HDINLINE cursor::SafeCursor<cursor::BufferCursor<Type, T_dim> > originSafe() const;
-    /* get a component-twisted cursor at the container's origin cell
-     * \param axes x-axis -> axes[0], y-axis -> axes[1], ...
-     * */
-    HDINLINE cursor::Cursor<cursor::PointerAccessor<Type>, cursor::CartNavigator<T_dim>, char*>
-    originCustomAxes(const math::UInt32<T_dim>& axes) const;
+            /* get a cursor at the container's origin cell */
+            HDINLINE cursor::BufferCursor<Type, T_dim> origin() const;
+            /* get a safe cursor at the container's origin cell */
+            HDINLINE cursor::SafeCursor<cursor::BufferCursor<Type, T_dim>> originSafe() const;
+            /* get a component-twisted cursor at the container's origin cell
+             * \param axes x-axis -> axes[0], y-axis -> axes[1], ...
+             * */
+            HDINLINE cursor::Cursor<cursor::PointerAccessor<Type>, cursor::CartNavigator<T_dim>, char*>
+            originCustomAxes(const math::UInt32<T_dim>& axes) const;
 
-    /* get a zone spanning the whole container */
-    HDINLINE zone::SphericZone<T_dim> zone() const;
+            /* get a zone spanning the whole container */
+            HDINLINE zone::SphericZone<T_dim> zone() const;
 
-    HDINLINE Type* getDataPointer() const {return dataPointer;}
-    HDINLINE math::Size_t<T_dim> size() const {return this->_size;}
-    HDINLINE math::Size_t<T_dim-1> getPitch() const {return this->pitch;}
-    /** Returns whether the buffer has no additional pitches
-     * The expected pitches are: 2D: size.x, 3D: size.x/size.x*size.y
-     */
-    HDINLINE bool isContigousMemory() const;
-};
+            HDINLINE Type* getDataPointer() const
+            {
+                return dataPointer;
+            }
+            HDINLINE math::Size_t<T_dim> size() const
+            {
+                return this->_size;
+            }
+            HDINLINE math::Size_t<T_dim - 1> getPitch() const
+            {
+                return this->pitch;
+            }
+            /** Returns whether the buffer has no additional pitches
+             * The expected pitches are: 2D: (size.x), 3D: (size.x, size.x * size.y)
+             */
+            HDINLINE bool isContigousMemory() const;
+        };
 
-} // container
-} // pmacc
+    } // namespace container
+} // namespace pmacc
 
 #include "CartBuffer.tpp"
diff --git a/include/pmacc/cuSTL/container/CartBuffer.tpp b/include/pmacc/cuSTL/container/CartBuffer.tpp
index 0e08bad4b2..faa8190dbd 100644
--- a/include/pmacc/cuSTL/container/CartBuffer.tpp
+++ b/include/pmacc/cuSTL/container/CartBuffer.tpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera, Benjamin Worpitz,
+/* Copyright 2013-2021 Heiko Burau, Rene Widera, Benjamin Worpitz,
  *                     Alexander Grund
  *
  * This file is part of PMacc.
@@ -34,326 +34,325 @@
 
 namespace pmacc
 {
-namespace container
-{
-
-namespace detail
-{
-    template<int dim>
-    struct PitchHelper;
-
-    template<>
-    struct PitchHelper<1>
+    namespace container
     {
-        template<typename TCursor>
-        HDINLINE math::Size_t<0u> operator()(const TCursor&) {return math::Size_t<0u>();}
+        namespace detail
+        {
+            template<int dim>
+            struct PitchHelper;
+            // PitchHelper<dim> yields a pitch vector with dim - 1 components, from a cursor or from a size.
+            template<>
+            struct PitchHelper<1>
+            {
+                template<typename TCursor>
+                HDINLINE math::Size_t<0u> operator()(const TCursor&)
+                {
+                    return math::Size_t<0u>();
+                }
+                // 1D: there is no pitch, so both overloads return a zero-component vector
+                HDINLINE math::Size_t<0u> operator()(const math::Size_t<1u>&)
+                {
+                    return math::Size_t<0u>();
+                }
+            };
+            template<>
+            struct PitchHelper<2>
+            {
+                template<typename TCursor>
+                HDINLINE math::Size_t<1> operator()(const TCursor& cursor)
+                {
+                    return math::Size_t<1>(size_t((char*) cursor(0, 1).getMarker() - (char*) cursor.getMarker()));
+                }
+                // size-based variant: size.x (the cursor-based variant above measures a byte distance)
+                HDINLINE math::Size_t<1> operator()(const math::Size_t<2>& size)
+                {
+                    return math::Size_t<1>(size.x());
+                }
+            };
+            template<>
+            struct PitchHelper<3>
+            {
+                template<typename TCursor>
+                HDINLINE math::Size_t<2> operator()(const TCursor& cursor)
+                {
+                    return math::Size_t<2>(
+                        (size_t)((char*) cursor(0, 1, 0).getMarker() - (char*) cursor.getMarker()),
+                        (size_t)((char*) cursor(0, 0, 1).getMarker() - (char*) cursor.getMarker()));
+                }
+                // size-based variant: (size.x, size.x * size.y); the cursor-based variant measures bytes
+                HDINLINE math::Size_t<2> operator()(const math::Size_t<3>& size)
+                {
+                    return math::Size_t<2>(size.x(), size.x() * size.y());
+                }
+            };
+
+            template<typename MemoryTag>
+            HDINLINE void notifyEventSystem()
+            { // generic case: memory tags without a specialization trigger nothing
+            }
+            // device memory: start a TASK_DEVICE operation unless compiling device code (__CUDA_ARCH__)
+            template<>
+            HDINLINE void notifyEventSystem<allocator::tag::device>()
+            {
+#ifndef __CUDA_ARCH__
+                using namespace pmacc;
+                __startOperation(ITask::TASK_DEVICE);
+#endif
+            }
 
-        HDINLINE math::Size_t<0u> operator()(const math::Size_t<1u>&) {return math::Size_t<0u>();}
-    };
-    template<>
-    struct PitchHelper<2>
-    {
-        template<typename TCursor>
-        HDINLINE math::Size_t<1> operator()(const TCursor& cursor)
+            template<>
+            HDINLINE void notifyEventSystem<allocator::tag::host>()
+            { // host memory: start a TASK_HOST operation unless compiling device code (__CUDA_ARCH__)
+#ifndef __CUDA_ARCH__
+                using namespace pmacc;
+                __startOperation(ITask::TASK_HOST);
+#endif
+            }
+        } // namespace detail
+
+        template<typename Type, int T_dim, typename Allocator, typename Copier, typename Assigner>
+        HDINLINE CartBuffer<Type, T_dim, Allocator, Copier, Assigner>::CartBuffer(const math::Size_t<T_dim>& _size)
+            : refCount(nullptr)
         {
-            return math::Size_t<1>(size_t((char*)cursor(0, 1).getMarker() - (char*)cursor.getMarker()));
+            this->_size = _size;
+            init();
         }
 
-        HDINLINE math::Size_t<1> operator()(const math::Size_t<2>& size)
+        template<typename Type, int T_dim, typename Allocator, typename Copier, typename Assigner>
+        HDINLINE CartBuffer<Type, T_dim, Allocator, Copier, Assigner>::CartBuffer(size_t x) : refCount(nullptr)
         {
-            return math::Size_t<1>(size.x());
+            this->_size = math::Size_t<1>(x);
+            init();
         }
-    };
-    template<>
-    struct PitchHelper<3>
-    {
-        template<typename TCursor>
-        HDINLINE math::Size_t<2> operator()(const TCursor& cursor)
+
+        template<typename Type, int T_dim, typename Allocator, typename Copier, typename Assigner>
+        HDINLINE CartBuffer<Type, T_dim, Allocator, Copier, Assigner>::CartBuffer(size_t x, size_t y)
+            : refCount(nullptr)
         {
-            return math::Size_t<2>((size_t)((char*)cursor(0, 1, 0).getMarker() - (char*)cursor.getMarker()),
-                                   (size_t)((char*)cursor(0, 0, 1).getMarker() - (char*)cursor.getMarker()));
+            this->_size = math::Size_t<2>(x, y);
+            init();
         }
 
-        HDINLINE math::Size_t<2> operator()(const math::Size_t<3>& size)
+        template<typename Type, int T_dim, typename Allocator, typename Copier, typename Assigner>
+        HDINLINE CartBuffer<Type, T_dim, Allocator, Copier, Assigner>::CartBuffer(size_t x, size_t y, size_t z)
+            : refCount(nullptr)
         {
-            return math::Size_t<2>(size.x(), size.x() * size.y());
+            this->_size = math::Size_t<3>(x, y, z);
+            init();
         }
-    };
-
-    template<typename MemoryTag>
-    HDINLINE void notifyEventSystem() {}
-
-    template<>
-    HDINLINE void notifyEventSystem<allocator::tag::device>()
-    {
-#ifndef __CUDA_ARCH__
-        using namespace pmacc;
-        __startOperation(ITask::TASK_CUDA);
-#endif
-    }
-
-    template<>
-    HDINLINE void notifyEventSystem<allocator::tag::host>()
-    {
-#ifndef __CUDA_ARCH__
-        using namespace pmacc;
-        __startOperation(ITask::TASK_HOST);
-#endif
-    }
-}
-
-template<typename Type, int T_dim, typename Allocator, typename Copier, typename Assigner>
-HDINLINE
-CartBuffer<Type, T_dim, Allocator, Copier, Assigner>::CartBuffer
-(const math::Size_t<T_dim>& _size) : refCount(nullptr)
-{
-    this->_size = _size;
-    init();
-}
-
-template<typename Type, int T_dim, typename Allocator, typename Copier, typename Assigner>
-HDINLINE
-CartBuffer<Type, T_dim, Allocator, Copier, Assigner>::CartBuffer
-(size_t x) : refCount(nullptr)
-{
-    this->_size = math::Size_t<1>(x); init();
-}
 
-template<typename Type, int T_dim, typename Allocator, typename Copier, typename Assigner>
-HDINLINE
-CartBuffer<Type, T_dim, Allocator, Copier, Assigner>::CartBuffer
-(size_t x, size_t y) : refCount(nullptr)
-{
-    this->_size = math::Size_t<2>(x, y); init();
-}
+        template<typename Type, int T_dim, typename Allocator, typename Copier, typename Assigner>
+        HDINLINE CartBuffer<Type, T_dim, Allocator, Copier, Assigner>::CartBuffer(
+            const CartBuffer<Type, T_dim, Allocator, Copier, Assigner>& other)
+            : refCount(other.refCount) // share the counter of the source, no data is copied
+        {
+            this->dataPointer = other.dataPointer;
+            if(this->refCount != nullptr) // default-constructed or moved-from source: nothing to count
+                (*this->refCount)++;
+            this->_size = other._size;
+            this->pitch = other.pitch;
+        }
 
-template<typename Type, int T_dim, typename Allocator, typename Copier, typename Assigner>
-HDINLINE
-CartBuffer<Type, T_dim, Allocator, Copier, Assigner>::CartBuffer
-(size_t x, size_t y, size_t z) : refCount(nullptr)
-{
-    this->_size = math::Size_t<3>(x, y, z); init();
-}
+        template<typename Type, int T_dim, typename Allocator, typename Copier, typename Assigner>
+        HDINLINE CartBuffer<Type, T_dim, Allocator, Copier, Assigner>::CartBuffer(
+            CartBuffer<Type, T_dim, Allocator, Copier, Assigner>&& other)
+            : refCount(nullptr)
+        {
+            this->dataPointer = other.dataPointer;
+            this->refCount = other.refCount;
+            this->_size = other._size;
+            this->pitch = other.pitch;
+            other.dataPointer = nullptr;
+            other.refCount = nullptr;
+        }
 
-template<typename Type, int T_dim, typename Allocator, typename Copier, typename Assigner>
-HDINLINE
-CartBuffer<Type, T_dim, Allocator, Copier, Assigner>::CartBuffer
-(const CartBuffer<Type, T_dim, Allocator, Copier, Assigner>& other) : refCount(nullptr)
-{
-    this->dataPointer = other.dataPointer;
-    this->refCount = other.refCount;
-    (*this->refCount)++;
-    this->_size = other._size;
-    this->pitch = other.pitch;
-}
-
-template<typename Type, int T_dim, typename Allocator, typename Copier, typename Assigner>
-HDINLINE
-CartBuffer<Type, T_dim, Allocator, Copier, Assigner>::CartBuffer
-(CartBuffer<Type, T_dim, Allocator, Copier, Assigner>&& other) : refCount(nullptr)
-{
-    this->dataPointer = other.dataPointer;
-    this->refCount = other.refCount;
-    this->_size = other._size;
-    this->pitch = other.pitch;
-    other.dataPointer = nullptr;
-    other.refCount = nullptr;
-}
-
-template<typename Type, int T_dim, typename Allocator, typename Copier, typename Assigner>
-HDINLINE
-void CartBuffer<Type, T_dim, Allocator, Copier, Assigner>::init()
-{
-    typename Allocator::Cursor cursor = Allocator::allocate(this->_size);
-    this->dataPointer = cursor.getMarker();
+        template<typename Type, int T_dim, typename Allocator, typename Copier, typename Assigner>
+        HDINLINE void CartBuffer<Type, T_dim, Allocator, Copier, Assigner>::init()
+        {
+            typename Allocator::Cursor cursor = Allocator::allocate(this->_size);
+            this->dataPointer = cursor.getMarker();
 #ifndef __CUDA_ARCH__
-    this->refCount = new int;
+            this->refCount = new int;
+            *this->refCount = 1;
 #endif
-    *this->refCount = 1;
-    this->pitch = detail::PitchHelper<T_dim>()(cursor);
-}
+            this->pitch = detail::PitchHelper<T_dim>()(cursor);
+        }
 
-template<typename Type, int T_dim, typename Allocator, typename Copier, typename Assigner>
-HDINLINE
-CartBuffer<Type, T_dim, Allocator, Copier, Assigner>::~CartBuffer()
-{
-    exit();
-}
+        template<typename Type, int T_dim, typename Allocator, typename Copier, typename Assigner>
+        HDINLINE CartBuffer<Type, T_dim, Allocator, Copier, Assigner>::~CartBuffer()
+        {
+            exit();
+        }
 
-template<typename Type, int T_dim, typename Allocator, typename Copier, typename Assigner>
-HDINLINE
-void CartBuffer<Type, T_dim, Allocator, Copier, Assigner>::exit()
-{
-    if(!this->refCount) return;
-    (*(this->refCount))--;
-    if(*(this->refCount) > 0)
-        return;
-    Allocator::deallocate(origin());
-    this->dataPointer = nullptr;
+        template<typename Type, int T_dim, typename Allocator, typename Copier, typename Assigner>
+        HDINLINE void CartBuffer<Type, T_dim, Allocator, Copier, Assigner>::exit()
+        {
+            if(!this->refCount)
+                return;
+            (*(this->refCount))--;
+            if(*(this->refCount) > 0)
+                return;
+            Allocator::deallocate(origin());
+            this->dataPointer = nullptr;
 #ifndef __CUDA_ARCH__
-    delete this->refCount;
-    this->refCount = 0;
+            delete this->refCount;
+            this->refCount = 0;
 #endif
-}
+        }
 
-template<typename Type, int T_dim, typename Allocator, typename Copier, typename Assigner>
-HDINLINE
-CartBuffer<Type, T_dim, Allocator, Copier, Assigner>&
-CartBuffer<Type, T_dim, Allocator, Copier, Assigner>::operator=
-(const CartBuffer& rhs)
-{
+        template<typename Type, int T_dim, typename Allocator, typename Copier, typename Assigner>
+        HDINLINE CartBuffer<Type, T_dim, Allocator, Copier, Assigner>&
+        CartBuffer<Type, T_dim, Allocator, Copier, Assigner>::operator=(const CartBuffer& rhs)
+        {
 #ifndef __CUDA_ARCH__
-    if(rhs.size() != this->size())
-        throw std::invalid_argument(static_cast<std::stringstream&>(
-            std::stringstream() << "Assignment: Sizes of buffers do not match: "
-                << this->size() << " <-> " << rhs.size() << std::endl).str());
+            if(rhs.size() != this->size())
+                throw std::invalid_argument(static_cast<std::stringstream&>(
+                                                std::stringstream()
+                                                << "Assignment: Sizes of buffers do not match: " << this->size()
+                                                << " <-> " << rhs.size() << std::endl)
+                                                .str());
 #else
-    assert(rhs.size() == this->size());
+            assert(rhs.size() == this->size());
 #endif
 
-    if(this->dataPointer == rhs.dataPointer) return *this;
-    Copier::copy(this->dataPointer, this->pitch, rhs.dataPointer, rhs.pitch, rhs._size);
-    return *this;
-}
+            if(this->dataPointer == rhs.dataPointer)
+                return *this;
+            Copier::copy(this->dataPointer, this->pitch, rhs.dataPointer, rhs.pitch, rhs._size);
+            return *this;
+        }
 
-template<typename Type, int T_dim, typename Allocator, typename Copier, typename Assigner>
-HDINLINE
-CartBuffer<Type, T_dim, Allocator, Copier, Assigner>&
-CartBuffer<Type, T_dim, Allocator, Copier, Assigner>::operator=
-(CartBuffer&& rhs)
-{
+        template<typename Type, int T_dim, typename Allocator, typename Copier, typename Assigner>
+        HDINLINE CartBuffer<Type, T_dim, Allocator, Copier, Assigner>&
+        CartBuffer<Type, T_dim, Allocator, Copier, Assigner>::operator=(CartBuffer&& rhs)
+        {
 #ifndef __CUDA_ARCH__
-    if(rhs.size() != this->size())
-        throw std::invalid_argument(static_cast<std::stringstream&>(
-            std::stringstream() << "Assignment: Sizes of buffers do not match: "
-                << this->size() << " <-> " << rhs.size() << std::endl).str());
+            if(rhs.size() != this->size())
+                throw std::invalid_argument(static_cast<std::stringstream&>(
+                                                std::stringstream()
+                                                << "Assignment: Sizes of buffers do not match: " << this->size()
+                                                << " <-> " << rhs.size() << std::endl)
+                                                .str());
 #else
-    assert(rhs.size() == this->size());
+            assert(rhs.size() == this->size());
 #endif
-    if(this->dataPointer == rhs.dataPointer) return *this;
-
-    exit();
-    this->dataPointer = rhs.dataPointer;
-    this->refCount = rhs.refCount;
-    this->_size = rhs._size;
-    this->pitch = rhs.pitch;
-    rhs.dataPointer = nullptr;
-    rhs.refCount = nullptr;
-    return *this;
-}
-
-template<typename Type, int T_dim, typename Allocator, typename Copier, typename Assigner>
-HDINLINE
-View<CartBuffer<Type, T_dim, Allocator, Copier, Assigner> >
-CartBuffer<Type, T_dim, Allocator, Copier, Assigner>::view
-(math::Int<T_dim> a, math::Int<T_dim> b) const
-{
-    a = (a + (math::Int<T_dim>)this->size()) % (math::Int<T_dim>)this->size();
-    b = (b + (math::Int<T_dim>)this->size())
-            % ((math::Int<T_dim>)this->size() + math::Int<T_dim>::create(1));
-
-    View<CartBuffer<Type, T_dim, Allocator, Copier, Assigner> > result;
-
-    result.dataPointer = &(*origin()(a));
-    result._size = (math::Size_t<T_dim>)(b - a);
-    result.pitch = this->pitch;
-    result.refCount = this->refCount;
-    return result;
-}
-
-template<typename Type, int T_dim, typename Allocator, typename Copier, typename Assigner>
-HDINLINE
-cursor::BufferCursor<Type, T_dim> CartBuffer<Type, T_dim, Allocator, Copier, Assigner>::origin() const
-{
-    detail::notifyEventSystem<typename Allocator::tag>();
-    return cursor::BufferCursor<Type, T_dim>(this->dataPointer, this->pitch);
-}
-
-template<typename Type, int T_dim, typename Allocator, typename Copier, typename Assigner>
-HDINLINE
-cursor::SafeCursor<cursor::BufferCursor<Type, T_dim> >
-CartBuffer<Type, T_dim, Allocator, Copier, Assigner>::originSafe() const
-{
-    return cursor::make_SafeCursor(this->origin(),
-                                   math::Int<T_dim>::create(0),
-                                   math::Int<T_dim>(size()));
-}
-
-template<typename Type, int T_dim, typename Allocator, typename Copier, typename Assigner>
-HDINLINE
-cursor::Cursor<cursor::PointerAccessor<Type>, cursor::CartNavigator<T_dim>, char*>
-CartBuffer<Type, T_dim, Allocator, Copier, Assigner>::originCustomAxes(const math::UInt32<T_dim>& axes) const
-{
-    math::Size_t<dim> factor;
-    factor[0] = sizeof(Type);
-    if(dim > 1) factor[1] = this->pitch[0];
-    if(dim > 2) factor[2] = this->pitch[1];
-    //\todo: is the conversation from size_t to int32_t allowed?
-    math::Int<dim> customFactor;
-    for(int i = 0; i < dim; i++)
-        customFactor[i] = (int)factor[axes[i]];
-    cursor::CartNavigator<dim> navi(customFactor);
-
-    detail::notifyEventSystem<typename Allocator::tag>();
-
-    return cursor::Cursor<cursor::PointerAccessor<Type>, cursor::CartNavigator<dim>, char*>
-            (cursor::PointerAccessor<Type>(), navi, (char*)this->dataPointer);
-}
-
-template<typename Type, int T_dim, typename Allocator, typename Copier, typename Assigner>
-HDINLINE
-zone::SphericZone<T_dim>
-CartBuffer<Type, T_dim, Allocator, Copier, Assigner>::zone() const
-{
-    zone::SphericZone<T_dim> myZone;
-    myZone.offset = math::Int<T_dim>::create(0);
-    myZone.size = this->_size;
-    return myZone;
-}
-
-template<typename Type, int T_dim, typename Allocator, typename Copier, typename Assigner>
-HDINLINE
-bool
-CartBuffer<Type, T_dim, Allocator, Copier, Assigner>::isContigousMemory() const
-{
-    return this->pitch == detail::PitchHelper<dim>()(this->_size);
-}
+            if(this->dataPointer == rhs.dataPointer)
+                return *this;
+
+            exit();
+            this->dataPointer = rhs.dataPointer;
+            this->refCount = rhs.refCount;
+            this->_size = rhs._size;
+            this->pitch = rhs.pitch;
+            rhs.dataPointer = nullptr;
+            rhs.refCount = nullptr;
+            return *this;
+        }
 
-template<typename Type, typename Allocator, typename Copier, typename Assigner>
-std::ostream& operator<<(std::ostream& s, const CartBuffer<Type, 1, Allocator, Copier, Assigner>& con)
-{
-    for(size_t x = 0; x < con.size().x(); x++)
-        s << con.origin()[x] << " ";
-    return s << std::endl;
-}
+        template<typename Type, int T_dim, typename Allocator, typename Copier, typename Assigner>
+        HDINLINE View<CartBuffer<Type, T_dim, Allocator, Copier, Assigner>>
+        CartBuffer<Type, T_dim, Allocator, Copier, Assigner>::view(math::Int<T_dim> a, math::Int<T_dim> b) const
+        {
+            a = (a + (math::Int<T_dim>) this->size()) % (math::Int<T_dim>) this->size();
+            b = (b + (math::Int<T_dim>) this->size())
+                % ((math::Int<T_dim>) this->size() + math::Int<T_dim>::create(1));
 
-template<typename Type, typename Allocator, typename Copier, typename Assigner>
-std::ostream& operator<<(std::ostream& s, const CartBuffer<Type, 2, Allocator, Copier, Assigner>& con)
-{
-    for(size_t y = 0; y < con.size().y(); y++)
-    {
-        for(size_t x = 0; x < con.size().x(); x++)
-            s << *con.origin()(x,y) << " ";
-        s << std::endl;
-    }
-    return s << std::endl;
-}
-
-template<typename Type, typename Allocator, typename Copier, typename Assigner>
-std::ostream& operator<<(std::ostream& s, const CartBuffer<Type, 3, Allocator, Copier, Assigner>& con)
-{
-    for(size_t z = 0; z < con.size().z(); z++)
-    {
-        for(size_t y = 0; y < con.size().y(); y++)
+            View<CartBuffer<Type, T_dim, Allocator, Copier, Assigner>> result;
+
+            result.dataPointer = &(*origin()(a));
+            result._size = (math::Size_t<T_dim>) (b - a);
+            result.pitch = this->pitch;
+            result.refCount = this->refCount;
+            return result;
+        }
+
+        template<typename Type, int T_dim, typename Allocator, typename Copier, typename Assigner>
+        HDINLINE cursor::BufferCursor<Type, T_dim> CartBuffer<Type, T_dim, Allocator, Copier, Assigner>::origin() const
+        {
+            detail::notifyEventSystem<typename Allocator::tag>();
+            return cursor::BufferCursor<Type, T_dim>(this->dataPointer, this->pitch);
+        }
+
+        template<typename Type, int T_dim, typename Allocator, typename Copier, typename Assigner>
+        HDINLINE cursor::SafeCursor<cursor::BufferCursor<Type, T_dim>>
+        CartBuffer<Type, T_dim, Allocator, Copier, Assigner>::originSafe() const
+        {
+            return cursor::make_SafeCursor(this->origin(), math::Int<T_dim>::create(0), math::Int<T_dim>(size()));
+        }
+
+        template<typename Type, int T_dim, typename Allocator, typename Copier, typename Assigner>
+        HDINLINE cursor::Cursor<cursor::PointerAccessor<Type>, cursor::CartNavigator<T_dim>, char*>
+        CartBuffer<Type, T_dim, Allocator, Copier, Assigner>::originCustomAxes(const math::UInt32<T_dim>& axes) const
+        {
+            math::Size_t<dim> factor;
+            factor[0] = sizeof(Type);
+            if(dim > 1)
+                factor[1] = this->pitch[0];
+            if(dim > 2)
+                factor[2] = this->pitch[1];
+            //\todo: is the conversion from size_t to int32_t allowed?
+            math::Int<dim> customFactor;
+            for(int i = 0; i < dim; i++)
+                customFactor[i] = (int) factor[axes[i]];
+            cursor::CartNavigator<dim> navi(customFactor);
+
+            detail::notifyEventSystem<typename Allocator::tag>();
+
+            return cursor::Cursor<cursor::PointerAccessor<Type>, cursor::CartNavigator<dim>, char*>(
+                cursor::PointerAccessor<Type>(),
+                navi,
+                (char*) this->dataPointer);
+        }
+
+        template<typename Type, int T_dim, typename Allocator, typename Copier, typename Assigner>
+        HDINLINE zone::SphericZone<T_dim> CartBuffer<Type, T_dim, Allocator, Copier, Assigner>::zone() const
+        {
+            zone::SphericZone<T_dim> myZone;
+            myZone.offset = math::Int<T_dim>::create(0);
+            myZone.size = this->_size;
+            return myZone;
+        }
+
+        template<typename Type, int T_dim, typename Allocator, typename Copier, typename Assigner>
+        HDINLINE bool CartBuffer<Type, T_dim, Allocator, Copier, Assigner>::isContigousMemory() const
+        {
+            return this->pitch == detail::PitchHelper<dim>()(this->_size);
+        }
+
+        template<typename Type, typename Allocator, typename Copier, typename Assigner>
+        std::ostream& operator<<(std::ostream& s, const CartBuffer<Type, 1, Allocator, Copier, Assigner>& con)
         {
             for(size_t x = 0; x < con.size().x(); x++)
-                s << *con.origin()(x,y,z) << " ";
-            s << std::endl;
+                s << con.origin()[x] << " ";
+            return s << std::endl;
+        }
+
+        template<typename Type, typename Allocator, typename Copier, typename Assigner>
+        std::ostream& operator<<(std::ostream& s, const CartBuffer<Type, 2, Allocator, Copier, Assigner>& con)
+        {
+            for(size_t y = 0; y < con.size().y(); y++)
+            {
+                for(size_t x = 0; x < con.size().x(); x++)
+                    s << *con.origin()(x, y) << " ";
+                s << std::endl;
+            }
+            return s << std::endl;
+        }
+
+        template<typename Type, typename Allocator, typename Copier, typename Assigner>
+        std::ostream& operator<<(std::ostream& s, const CartBuffer<Type, 3, Allocator, Copier, Assigner>& con)
+        {
+            for(size_t z = 0; z < con.size().z(); z++)
+            {
+                for(size_t y = 0; y < con.size().y(); y++)
+                {
+                    for(size_t x = 0; x < con.size().x(); x++)
+                        s << *con.origin()(x, y, z) << " ";
+                    s << std::endl;
+                }
+                s << std::endl;
+            }
+            return s << std::endl;
         }
-        s << std::endl;
-    }
-    return s << std::endl;
-}
 
-} // container
-} // pmacc
+    } // namespace container
+} // namespace pmacc
diff --git a/include/pmacc/cuSTL/container/DeviceBuffer.hpp b/include/pmacc/cuSTL/container/DeviceBuffer.hpp
index aef988a7f8..911e371f6b 100644
--- a/include/pmacc/cuSTL/container/DeviceBuffer.hpp
+++ b/include/pmacc/cuSTL/container/DeviceBuffer.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera, Benjamin Worpitz,
+/* Copyright 2013-2021 Heiko Burau, Rene Widera, Benjamin Worpitz,
  *                     Alexander Grund
  *
  * This file is part of PMacc.
@@ -40,108 +40,134 @@
 
 namespace pmacc
 {
-namespace container
-{
-
-/** typedef version of a CartBuffer for a GPU.
- * Additional feature: Able to copy data from a HostBuffer
- * \tparam Type type of a single datum
- * \tparam T_dim Dimension of the container
- */
-template<typename Type, int T_dim>
-class DeviceBuffer
- : public CartBuffer<Type, T_dim, allocator::DeviceMemAllocator<Type, T_dim>,
-                                copier::D2DCopier<T_dim>,
-                                assigner::DeviceMemAssigner<> >
-{
-private:
-    typedef CartBuffer<Type, T_dim, allocator::DeviceMemAllocator<Type, T_dim>,
-                                  copier::D2DCopier<T_dim>,
-                                  assigner::DeviceMemAssigner<> > Base;
-
-protected:
-    HDINLINE DeviceBuffer() {}
-
-public:
-    typedef typename Base::PitchType PitchType;
-
-    /* constructors
-     *
-     * \param _size size of the container
-     *
-     * \param x,y,z convenient wrapper
-     *
-     */
-    HDINLINE DeviceBuffer(const math::Size_t<T_dim>& size) : Base(size) {}
-    HDINLINE DeviceBuffer(size_t x) : Base(x) {}
-    HDINLINE DeviceBuffer(size_t x, size_t y) : Base(x, y) {}
-    HDINLINE DeviceBuffer(size_t x, size_t y, size_t z) : Base(x, y, z) {}
-    /**
-     * Creates a device buffer from a pointer with a size. Assumes dense layout (no padding)
-     *
-     * @param ptr Pointer to the first element
-     * @param size Size of the buffer
-     * @param ownMemory Set to false if the memory is only a reference and managed outside of this class
-     *                  Ignored for device side creation!y
-     * @param pitch Pitch in bytes (number of bytes in the lower dimensions)
-     */
-    HDINLINE DeviceBuffer(Type* ptr, const math::Size_t<T_dim>& size, bool ownMemory, PitchType pitch = PitchType::create(0))
+    namespace container
     {
-        this->dataPointer = ptr;
-        this->_size = size;
-        if(T_dim >= 2)
-            this->pitch[0] = (pitch[0]) ? pitch[0] : size.x() * sizeof(Type);
-        if(T_dim == 3)
-            this->pitch[1] = (pitch[1]) ? pitch[1] : this->pitch[0] * size.y();
+        /** typedef version of a CartBuffer for a GPU.
+         * Additional feature: Able to copy data from a HostBuffer
+         * \tparam Type type of a single datum
+         * \tparam T_dim Dimension of the container
+         */
+        template<typename Type, int T_dim>
+        class DeviceBuffer
+            : public CartBuffer<
+                  Type,
+                  T_dim,
+                  allocator::DeviceMemAllocator<Type, T_dim>,
+                  copier::D2DCopier<T_dim>,
+                  assigner::DeviceMemAssigner<>>
+        {
+        private:
+            typedef CartBuffer<
+                Type,
+                T_dim,
+                allocator::DeviceMemAllocator<Type, T_dim>,
+                copier::D2DCopier<T_dim>,
+                assigner::DeviceMemAssigner<>>
+                Base;
+
+        protected:
+            HDINLINE DeviceBuffer()
+            {
+            }
+
+        public:
+            typedef typename Base::PitchType PitchType;
+
+            /* constructors
+             *
+             * \param size size of the container
+             *
+             * \param x,y,z convenient wrapper
+             *
+             */
+            HDINLINE DeviceBuffer(const math::Size_t<T_dim>& size) : Base(size)
+            {
+            }
+            HDINLINE DeviceBuffer(size_t x) : Base(x)
+            {
+            }
+            HDINLINE DeviceBuffer(size_t x, size_t y) : Base(x, y)
+            {
+            }
+            HDINLINE DeviceBuffer(size_t x, size_t y, size_t z) : Base(x, y, z)
+            {
+            }
+            /**
+             * Creates a device buffer from a pointer with a size. Assumes dense layout (no padding)
+             *
+             * @param ptr Pointer to the first element
+             * @param size Size of the buffer
+             * @param ownMemory Set to false if the memory is only a reference and managed outside of this class
+             *                  Ignored for device side creation!
+             * @param pitch Pitch in bytes (number of bytes in the lower dimensions)
+             */
+            HDINLINE DeviceBuffer(
+                Type* ptr,
+                const math::Size_t<T_dim>& size,
+                bool ownMemory,
+                PitchType pitch = PitchType::create(0))
+            {
+                this->dataPointer = ptr;
+                this->_size = size;
+                if(T_dim >= 2)
+                    this->pitch[0] = (pitch[0]) ? pitch[0] : size.x() * sizeof(Type);
+                if(T_dim == 3)
+                    this->pitch[1] = (pitch[1]) ? pitch[1] : this->pitch[0] * size.y();
 #ifndef __CUDA_ARCH__
-        this->refCount = new int;
-        *this->refCount = (ownMemory) ? 1 : 2;
+                this->refCount = new int;
+                *this->refCount = (ownMemory) ? 1 : 2;
 #endif
-    }
-    HDINLINE DeviceBuffer(const Base& base) : Base(base) {}
-    HDINLINE DeviceBuffer(DeviceBuffer&& obj): Base(std::move(static_cast<Base&>(obj))) {}
-
-    HDINLINE DeviceBuffer&
-    operator=(DeviceBuffer&& rhs)
-    {
-        Base::operator=(std::move(static_cast<Base&>(rhs)));
-        return *this;
-    }
-
-    template<typename HBuffer>
-    HINLINE
-    typename boost::enable_if<
-        boost::is_same<typename HBuffer::memoryTag, allocator::tag::host>,
-        DeviceBuffer&
-        >::type
-    operator=(const HBuffer& rhs)
-    {
-        BOOST_STATIC_ASSERT((boost::is_same<typename HBuffer::type, Type>::value));
-        BOOST_STATIC_ASSERT(HBuffer::dim == T_dim);
-        if(rhs.size() != this->size())
-            throw std::invalid_argument(static_cast<std::stringstream&>(
-                std::stringstream() << "Assignment: Sizes of buffers do not match: "
-                    << this->size() << " <-> " << rhs.size() << std::endl).str());
-
-        cudaWrapper::Memcopy<T_dim>()(this->dataPointer, this->pitch, rhs.getDataPointer(), rhs.getPitch(),
-                                this->_size, cudaWrapper::flags::Memcopy::hostToDevice);
-
-        return *this;
-    }
-
-    HINLINE DeviceBuffer& operator=(const Base& rhs)
-    {
-        Base::operator=(rhs);
-        return *this;
-    }
-
-    HINLINE DeviceBuffer& operator=(const DeviceBuffer& rhs)
-    {
-        Base::operator=(rhs);
-        return *this;
-    }
-};
-
-} // container
-} // pmacc
-
+            }
+            HDINLINE DeviceBuffer(const Base& base) : Base(base)
+            {
+            }
+            HDINLINE DeviceBuffer(DeviceBuffer&& obj) : Base(std::move(static_cast<Base&>(obj)))
+            {
+            }
+
+            HDINLINE DeviceBuffer& operator=(DeviceBuffer&& rhs)
+            {
+                Base::operator=(std::move(static_cast<Base&>(rhs)));
+                return *this;
+            }
+
+            template<typename HBuffer>
+            HINLINE typename boost::
+                enable_if<boost::is_same<typename HBuffer::memoryTag, allocator::tag::host>, DeviceBuffer&>::type
+                operator=(const HBuffer& rhs)
+            {
+                BOOST_STATIC_ASSERT((boost::is_same<typename HBuffer::type, Type>::value));
+                BOOST_STATIC_ASSERT(HBuffer::dim == T_dim);
+                if(rhs.size() != this->size())
+                    throw std::invalid_argument(static_cast<std::stringstream&>(
+                                                    std::stringstream()
+                                                    << "Assignment: Sizes of buffers do not match: " << this->size()
+                                                    << " <-> " << rhs.size() << std::endl)
+                                                    .str());
+
+                cuplaWrapper::Memcopy<T_dim>()(
+                    this->dataPointer,
+                    this->pitch,
+                    rhs.getDataPointer(),
+                    rhs.getPitch(),
+                    this->_size,
+                    cuplaWrapper::flags::Memcopy::hostToDevice);
+
+                return *this;
+            }
+
+            HINLINE DeviceBuffer& operator=(const Base& rhs)
+            {
+                Base::operator=(rhs);
+                return *this;
+            }
+
+            HINLINE DeviceBuffer& operator=(const DeviceBuffer& rhs)
+            {
+                Base::operator=(rhs);
+                return *this;
+            }
+        };
+
+    } // namespace container
+} // namespace pmacc
diff --git a/include/pmacc/cuSTL/container/HostBuffer.hpp b/include/pmacc/cuSTL/container/HostBuffer.hpp
index 72d2a86690..0d1b10643c 100644
--- a/include/pmacc/cuSTL/container/HostBuffer.hpp
+++ b/include/pmacc/cuSTL/container/HostBuffer.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera, Alexander Grund
+/* Copyright 2013-2021 Heiko Burau, Rene Widera, Alexander Grund
  *
  * This file is part of PMacc.
  *
@@ -39,118 +39,149 @@
 
 namespace pmacc
 {
-namespace container
-{
-
-/** typedef version of a CartBuffer for a CPU.
- * Additional feature: Able to copy data from a DeviceBuffer
- * \tparam Type type of a single datum
- * \tparam T_dim Dimension of the container
- */
-template<typename Type, int T_dim>
-class HostBuffer
- : public CartBuffer<Type, T_dim, allocator::HostMemAllocator<Type, T_dim>,
-                                copier::H2HCopier<T_dim>,
-                                assigner::HostMemAssigner<> >
-{
-private:
-    using Base = CartBuffer<Type, T_dim, allocator::HostMemAllocator<Type, T_dim>,
-                                  copier::H2HCopier<T_dim>,
-                                  assigner::HostMemAssigner<> >;
-protected:
-    HostBuffer() {}
-public:
-    using PitchType = typename Base::PitchType;
-
-    /* constructors
-     *
-     * \param _size size of the container
-     *
-     * \param x,y,z convenient wrapper
-     *
-     */
-    HINLINE HostBuffer(const math::Size_t<T_dim>& size) : Base(size) {}
-    HINLINE HostBuffer(size_t x) : Base(x) {}
-    HINLINE HostBuffer(size_t x, size_t y) : Base(x, y) {}
-    HINLINE HostBuffer(size_t x, size_t y, size_t z) : Base(x, y, z) {}
-    /**
-     * Creates a host buffer from a pointer with a size. Assumes dense layout (no padding)
-     *
-     * @param ptr Pointer to the first element
-     * @param size Size of the buffer
-     * @param ownMemory Set to false if the memory is only a reference and managed outside of this class
-     * @param pitch Pitch in bytes (number of bytes in the lower dimensions)
-     */
-    HINLINE HostBuffer(Type* ptr, const math::Size_t<3>& size, bool ownMemory, math::Size_t<2> pitch = math::Size_t<2>::create(0) )
-    {
-        this->dataPointer = ptr;
-        this->_size = size;
-        this->pitch[0] = (pitch[0]) ? pitch[0] : size.x() * sizeof(Type);
-        this->pitch[1] = (pitch[1]) ? pitch[1] : this->pitch[0] * size.y();
-        this->refCount = new int;
-        *this->refCount = (ownMemory) ? 1 : 2;
-    }
-    HINLINE HostBuffer(Type* ptr, const math::Size_t<2>& size, bool ownMemory, math::Size_t<1> pitch = math::Size_t<1>::create(0) )
-    {
-        this->dataPointer = ptr;
-        this->_size = size;
-        this->pitch[0] = (pitch[0]) ? pitch[0] : size.x() * sizeof(Type);
-        this->refCount = new int;
-        *this->refCount = (ownMemory) ? 1 : 2;
-    }
-    HINLINE HostBuffer(Type* ptr, const math::Size_t<1>& size, bool ownMemory)
-    {
-        this->dataPointer = ptr;
-        this->_size = size;
-        // intentionally uninitialized and not RT accessible via []
-        // this->pitch = pitch;
-        this->refCount = new int;
-        *this->refCount = (ownMemory) ? 1 : 2;
-    }
-    HINLINE HostBuffer(const Base& base) : Base(base) {}
-    HINLINE HostBuffer(HostBuffer&& obj): Base(std::move(static_cast<Base&>(obj))) {}
-
-    HINLINE HostBuffer&
-    operator=(HostBuffer&& rhs)
-    {
-        Base::operator=(std::move(static_cast<Base&>(rhs)));
-        return *this;
-    }
-
-    template<typename DBuffer>
-    HINLINE
-    typename boost::enable_if<
-        boost::is_same<typename DBuffer::memoryTag, allocator::tag::device>,
-        HostBuffer&
-        >::type
-    operator=(const DBuffer& rhs)
+    namespace container
     {
-        BOOST_STATIC_ASSERT((boost::is_same<typename DBuffer::type, Type>::value));
-        BOOST_STATIC_ASSERT(DBuffer::dim == T_dim);
-        if(rhs.size() != this->size())
-            throw std::invalid_argument(static_cast<std::stringstream&>(
-                std::stringstream() << "Assignment: Sizes of buffers do not match: "
-                    << this->size() << " <-> " << rhs.size() << std::endl).str());
-
-        cudaWrapper::Memcopy<T_dim>()(this->dataPointer, this->pitch, rhs.getDataPointer(), rhs.getPitch(),
-                                this->_size, cudaWrapper::flags::Memcopy::deviceToHost);
-
-        return *this;
-    }
-
-    HINLINE HostBuffer& operator=(const Base& rhs)
-    {
-        Base::operator=(rhs);
-        return *this;
-    }
-
-    HINLINE HostBuffer& operator=(const HostBuffer& rhs)
-    {
-        Base::operator=(rhs);
-        return *this;
-    }
-};
-
-} // container
-} // pmacc
-
+        /** typedef version of a CartBuffer for a CPU.
+         * Additional feature: Able to copy data from a DeviceBuffer
+         * \tparam Type type of a single datum
+         * \tparam T_dim Dimension of the container
+         */
+        template<typename Type, int T_dim>
+        class HostBuffer
+            : public CartBuffer<
+                  Type,
+                  T_dim,
+                  allocator::HostMemAllocator<Type, T_dim>,
+                  copier::H2HCopier<T_dim>,
+                  assigner::HostMemAssigner<>>
+        {
+        private:
+            using Base = CartBuffer<
+                Type,
+                T_dim,
+                allocator::HostMemAllocator<Type, T_dim>,
+                copier::H2HCopier<T_dim>,
+                assigner::HostMemAssigner<>>;
+
+        protected:
+            HostBuffer()
+            {
+            }
+
+        public:
+            using PitchType = typename Base::PitchType;
+
+            /* constructors
+             *
+             * \param size size of the container
+             *
+             * \param x,y,z convenient wrapper
+             *
+             */
+            HINLINE HostBuffer(const math::Size_t<T_dim>& size) : Base(size)
+            {
+            }
+            HINLINE HostBuffer(size_t x) : Base(x)
+            {
+            }
+            HINLINE HostBuffer(size_t x, size_t y) : Base(x, y)
+            {
+            }
+            HINLINE HostBuffer(size_t x, size_t y, size_t z) : Base(x, y, z)
+            {
+            }
+            /**
+             * Creates a host buffer from a pointer with a size. Uses a dense layout (no padding) if pitch is 0.
+             *
+             * @param ptr Pointer to the first element
+             * @param size Size of the buffer
+             * @param ownMemory Set to false if the memory is only a reference and managed outside of this class
+             * @param pitch Pitch in bytes (number of bytes in the lower dimensions)
+             */
+            HINLINE HostBuffer(
+                Type* ptr,
+                const math::Size_t<3>& size,
+                bool ownMemory,
+                math::Size_t<2> pitch = math::Size_t<2>::create(0))
+            {
+                this->dataPointer = ptr;
+                this->_size = size;
+                this->pitch[0] = (pitch[0]) ? pitch[0] : size.x() * sizeof(Type);
+                this->pitch[1] = (pitch[1]) ? pitch[1] : this->pitch[0] * size.y();
+                this->refCount = new int;
+                *this->refCount = (ownMemory) ? 1 : 2;
+            }
+            HINLINE HostBuffer(
+                Type* ptr,
+                const math::Size_t<2>& size,
+                bool ownMemory,
+                math::Size_t<1> pitch = math::Size_t<1>::create(0))
+            {
+                this->dataPointer = ptr;
+                this->_size = size;
+                this->pitch[0] = (pitch[0]) ? pitch[0] : size.x() * sizeof(Type);
+                this->refCount = new int;
+                *this->refCount = (ownMemory) ? 1 : 2;
+            }
+            HINLINE HostBuffer(Type* ptr, const math::Size_t<1>& size, bool ownMemory)
+            {
+                this->dataPointer = ptr;
+                this->_size = size;
+                // pitch is intentionally left uninitialized; it is not accessible at runtime via []
+                // this->pitch = pitch;
+                this->refCount = new int;
+                *this->refCount = (ownMemory) ? 1 : 2;
+            }
+            HINLINE HostBuffer(const Base& base) : Base(base)
+            {
+            }
+            HINLINE HostBuffer(HostBuffer&& obj) : Base(std::move(static_cast<Base&>(obj)))
+            {
+            }
+
+            HINLINE HostBuffer& operator=(HostBuffer&& rhs)
+            {
+                Base::operator=(std::move(static_cast<Base&>(rhs)));
+                return *this;
+            }
+
+            template<typename DBuffer>
+            HINLINE typename boost::
+                enable_if<boost::is_same<typename DBuffer::memoryTag, allocator::tag::device>, HostBuffer&>::type
+                operator=(const DBuffer& rhs)
+            {
+                BOOST_STATIC_ASSERT((boost::is_same<typename DBuffer::type, Type>::value));
+                BOOST_STATIC_ASSERT(DBuffer::dim == T_dim);
+                if(rhs.size() != this->size())
+                    throw std::invalid_argument(static_cast<std::stringstream&>(
+                                                    std::stringstream()
+                                                    << "Assignment: Sizes of buffers do not match: " << this->size()
+                                                    << " <-> " << rhs.size() << std::endl)
+                                                    .str());
+
+                cuplaWrapper::Memcopy<T_dim>()(
+                    this->dataPointer,
+                    this->pitch,
+                    rhs.getDataPointer(),
+                    rhs.getPitch(),
+                    this->_size,
+                    cuplaWrapper::flags::Memcopy::deviceToHost);
+
+                return *this;
+            }
+
+            HINLINE HostBuffer& operator=(const Base& rhs)
+            {
+                Base::operator=(rhs);
+                return *this;
+            }
+
+            HINLINE HostBuffer& operator=(const HostBuffer& rhs)
+            {
+                Base::operator=(rhs);
+                return *this;
+            }
+        };
+
+    } // namespace container
+} // namespace pmacc
diff --git a/include/pmacc/cuSTL/container/IndexBuffer.hpp b/include/pmacc/cuSTL/container/IndexBuffer.hpp
index 0265c2084a..a297cf17ff 100644
--- a/include/pmacc/cuSTL/container/IndexBuffer.hpp
+++ b/include/pmacc/cuSTL/container/IndexBuffer.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera
+/* Copyright 2013-2021 Heiko Burau, Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -32,63 +32,66 @@
 
 namespace pmacc
 {
-namespace container
-{
-
-template<int dim>
-class IndexBuffer
-{
-private:
-    math::UInt32<dim> _size;
-public:
-    IndexBuffer(const math::UInt32<dim>& _size) : _size(_size) {}
-    IndexBuffer(uint32_t x) : _size(x) {}
-    IndexBuffer(uint32_t x, uint32_t y) : _size(x,y) {}
-    IndexBuffer(uint32_t x, uint32_t y, uint32_t z) : _size(x,y,z) {}
-
-    inline
-    cursor::Cursor<cursor::MarkerAccessor<math::Int<dim> >,
-                   cursor::CartNavigator<dim>,
-                   math::Int<dim> >
-    origin() const
+    namespace container
     {
-        math::Int<dim> factor;
-        factor[0] = 1; factor[1] = this->_size.x();
-        if(dim == 3) factor[2] = this->_size.x() * this->_size.y();
+        template<int dim>
+        class IndexBuffer
+        {
+        private:
+            math::UInt32<dim> _size;
 
-        return cursor::Cursor<cursor::MarkerAccessor<math::Int<dim> >,
-                              cursor::CartNavigator<dim>,
-                              math::Int<dim> >
-                              (cursor::MarkerAccessor<math::Int<dim> >(),
-                               cursor::CartNavigator<dim>(factor),
-                               math::Int<dim>(0));
-    }
-    inline
-    cursor::Cursor<cursor::MarkerAccessor<math::Int<dim> >,
-                   cursor::CartNavigator<dim>,
-                   math::Int<dim> >
-    originCustomAxes(const math::UInt32<dim>& axes) const
-    {
-        math::Int<dim> factor;
-        factor[0] = 1; factor[1] = this->_size.x();
-        if(dim == 3) factor[2] = this->_size.x() * this->_size.y();
-        math::Int<dim> customFactor;
-        for(uint32_t i = 0; i < dim; i++)
-            customFactor[i] = factor[axes[i]];
+        public:
+            IndexBuffer(const math::UInt32<dim>& _size) : _size(_size)
+            {
+            }
+            IndexBuffer(uint32_t x) : _size(x)
+            {
+            }
+            IndexBuffer(uint32_t x, uint32_t y) : _size(x, y)
+            {
+            }
+            IndexBuffer(uint32_t x, uint32_t y, uint32_t z) : _size(x, y, z)
+            {
+            }
 
-        return cursor::Cursor<cursor::MarkerAccessor<math::Int<dim> >,
-                              cursor::CartNavigator<dim>,
-                              math::Int<dim> >
-                              (cursor::MarkerAccessor<math::Int<dim> >(),
-                               cursor::CartNavigator<dim>(customFactor),
-                               math::Int<dim>(0));
-    }
-    inline zone::SphericZone<dim> zone() const
-    {
-        return zone::SphericZone<dim>((math::Size_t<dim>)this->_size);
-    }
-};
+            inline cursor::Cursor<cursor::MarkerAccessor<math::Int<dim>>, cursor::CartNavigator<dim>, math::Int<dim>>
+            origin() const
+            {
+                math::Int<dim> factor;
+                factor[0] = 1;
+                factor[1] = this->_size.x();
+                if(dim == 3)
+                    factor[2] = this->_size.x() * this->_size.y();
+
+                return cursor::
+                    Cursor<cursor::MarkerAccessor<math::Int<dim>>, cursor::CartNavigator<dim>, math::Int<dim>>(
+                        cursor::MarkerAccessor<math::Int<dim>>(),
+                        cursor::CartNavigator<dim>(factor),
+                        math::Int<dim>(0));
+            }
+            inline cursor::Cursor<cursor::MarkerAccessor<math::Int<dim>>, cursor::CartNavigator<dim>, math::Int<dim>>
+            originCustomAxes(const math::UInt32<dim>& axes) const
+            {
+                math::Int<dim> factor;
+                factor[0] = 1;
+                factor[1] = this->_size.x();
+                if(dim == 3)
+                    factor[2] = this->_size.x() * this->_size.y();
+                math::Int<dim> customFactor;
+                for(uint32_t i = 0; i < dim; i++)
+                    customFactor[i] = factor[axes[i]];
 
-} // container
-} // pmacc
+                return cursor::
+                    Cursor<cursor::MarkerAccessor<math::Int<dim>>, cursor::CartNavigator<dim>, math::Int<dim>>(
+                        cursor::MarkerAccessor<math::Int<dim>>(),
+                        cursor::CartNavigator<dim>(customFactor),
+                        math::Int<dim>(0));
+            }
+            inline zone::SphericZone<dim> zone() const
+            {
+                return zone::SphericZone<dim>((math::Size_t<dim>) this->_size);
+            }
+        };
 
+    } // namespace container
+} // namespace pmacc
diff --git a/include/pmacc/cuSTL/container/PNGBuffer.hpp b/include/pmacc/cuSTL/container/PNGBuffer.hpp
index 0658c2186e..9fa88c0d0f 100644
--- a/include/pmacc/cuSTL/container/PNGBuffer.hpp
+++ b/include/pmacc/cuSTL/container/PNGBuffer.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera
+/* Copyright 2013-2021 Heiko Burau, Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -32,70 +32,86 @@
 
 namespace pmacc
 {
-namespace container
-{
-
-/** Think of a container being a PNG-image
- * offers only write-only access
- */
-class PNGBuffer
-{
-private:
-    class Plotter
+    namespace container
     {
-    private:
-        pngwriter& png;
-        math::Int<2> pos;
-    public:
-        Plotter(pngwriter& png) : png(png) {}
-        inline Plotter& operator=(const math::Float<3>& color)
-        {
-            png.plot(pos.x()+1, pos.y()+1, (double)color.x(), (double)color.y(), (double)color.z());
-            return *this;
-        }
-        void setPos(const math::Int<2>& pos)
+        /** A container that represents a PNG image.
+         * It offers write-only access.
+         */
+        class PNGBuffer
         {
-            this->pos = pos;
-        }
-    };
-    struct Accessor
-    {
-        typedef Plotter& type;
-        pngwriter& png;
-        Plotter plotter;
-        Accessor(pngwriter& png) : png(png), plotter(png) {}
-        inline type operator()(math::Int<2>& index)
-        {
-            plotter.setPos(index);
-            return this->plotter;
-        }
-    };
-    pngwriter png;
-    math::Size_t<2> size;
-public:
-    typedef cursor::Cursor<PNGBuffer::Accessor, cursor::MultiIndexNavigator<2>, math::Int<2> > Cursor;
+        private:
+            class Plotter
+            {
+            private:
+                pngwriter& png;
+                math::Int<2> pos;
 
-    /* constructor
-     * \param x width of png image
-     * \param y height of png image
-     * \name name of png file
-     */
-    PNGBuffer(int x, int y, const std::string& name) : png(x, y, 0.0, name.data()), size(x,y) {}
-    PNGBuffer(math::Size_t<2> size, const std::string& name) : png(size.x(), size.y(), 0.0, name.data()), size(size) {}
-    ~PNGBuffer() {png.close();}
+            public:
+                Plotter(pngwriter& png) : png(png)
+                {
+                }
+                inline Plotter& operator=(const math::Float<3>& color)
+                {
+                    png.plot(pos.x() + 1, pos.y() + 1, (double) color.x(), (double) color.y(), (double) color.z());
+                    return *this;
+                }
+                void setPos(const math::Int<2>& pos)
+                {
+                    this->pos = pos;
+                }
+            };
+            struct Accessor
+            {
+                typedef Plotter& type;
+                pngwriter& png;
+                Plotter plotter;
+                Accessor(pngwriter& png) : png(png), plotter(png)
+                {
+                }
+                inline type operator()(math::Int<2>& index)
+                {
+                    plotter.setPos(index);
+                    return this->plotter;
+                }
+            };
+            pngwriter png;
+            math::Size_t<2> size;
 
-    /* get a cursor at the top left pixel
-     * access via a Float<3> reference
-     */
-    inline Cursor origin()
-    {
-        return Cursor(Accessor(this->png), cursor::MultiIndexNavigator<2>(), math::Int<2>(0));
-    }
+        public:
+            typedef cursor::Cursor<PNGBuffer::Accessor, cursor::MultiIndexNavigator<2>, math::Int<2>> Cursor;
+
+            /* constructor
+             * \param x width of png image
+             * \param y height of png image
+             * \param name name of png file
+             */
+            PNGBuffer(int x, int y, const std::string& name) : png(x, y, 0.0, name.data()), size(x, y)
+            {
+            }
+            PNGBuffer(math::Size_t<2> size, const std::string& name)
+                : png(size.x(), size.y(), 0.0, name.data())
+                , size(size)
+            {
+            }
+            ~PNGBuffer()
+            {
+                png.close();
+            }
 
-    /* get a zone spanning the whole container */
-    inline zone::SphericZone<2> zone() const {return zone::SphericZone<2>(this->size);}
-};
+            /* get a cursor at the top left pixel
+             * access via a Float<3> reference
+             */
+            inline Cursor origin()
+            {
+                return Cursor(Accessor(this->png), cursor::MultiIndexNavigator<2>(), math::Int<2>(0));
+            }
 
-} // container
-} // pmacc
+            /* get a zone spanning the whole container */
+            inline zone::SphericZone<2> zone() const
+            {
+                return zone::SphericZone<2>(this->size);
+            }
+        };
 
+    } // namespace container
+} // namespace pmacc
diff --git a/include/pmacc/cuSTL/container/PseudoBuffer.hpp b/include/pmacc/cuSTL/container/PseudoBuffer.hpp
index 348e2cff3b..b02675b41f 100644
--- a/include/pmacc/cuSTL/container/PseudoBuffer.hpp
+++ b/include/pmacc/cuSTL/container/PseudoBuffer.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera
+/* Copyright 2013-2021 Heiko Burau, Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -27,20 +27,18 @@
 
 namespace pmacc
 {
-namespace container
-{
-
-template<typename Type, int dim>
-struct PseudoBuffer : public container::CartBuffer<Type, dim>
-{
-    template<typename _Type>
-    PseudoBuffer(pmacc::DeviceBuffer<_Type, dim>& devBuffer);
-    template<typename _Type>
-    PseudoBuffer(pmacc::HostBuffer<_Type, dim>& hostBuffer);
-};
+    namespace container
+    {
+        template<typename Type, int dim>
+        struct PseudoBuffer : public container::CartBuffer<Type, dim>
+        {
+            template<typename _Type>
+            PseudoBuffer(pmacc::DeviceBuffer<_Type, dim>& devBuffer);
+            template<typename _Type>
+            PseudoBuffer(pmacc::HostBuffer<_Type, dim>& hostBuffer);
+        };
 
-} // container
-} // pmacc
+    } // namespace container
+} // namespace pmacc
 
 #include "PseudoBuffer.tpp"
-
diff --git a/include/pmacc/cuSTL/container/PseudoBuffer.tpp b/include/pmacc/cuSTL/container/PseudoBuffer.tpp
index 53365ad561..c70372e71e 100644
--- a/include/pmacc/cuSTL/container/PseudoBuffer.tpp
+++ b/include/pmacc/cuSTL/container/PseudoBuffer.tpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera
+/* Copyright 2013-2021 Heiko Burau, Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -23,37 +23,38 @@
 
 namespace pmacc
 {
-namespace container
-{
-
-template<typename Type, int dim>
-template<typename _Type>
-PseudoBuffer<Type, dim>::PseudoBuffer(pmacc::DeviceBuffer<_Type, dim>& devBuffer)
-{
-    cudaPitchedPtr cudaData = devBuffer.getCudaPitched();
-    this->dataPointer = (Type*)cudaData.ptr;
-    this->_size = (math::Size_t<dim>)devBuffer.getDataSpace();
-    if(dim == 2) this->pitch[0] = cudaData.pitch;
-    if(dim == 3)
+    namespace container
     {
-        this->pitch[0] = cudaData.pitch;
-        this->pitch[1] = cudaData.pitch * this->_size.y();
-    }
-}
+        template<typename Type, int dim>
+        template<typename _Type>
+        PseudoBuffer<Type, dim>::PseudoBuffer(pmacc::DeviceBuffer<_Type, dim>& devBuffer)
+        {
+            cuplaPitchedPtr cuplaData = devBuffer.getCudaPitched();
+            this->dataPointer = (Type*) cuplaData.ptr;
+            this->_size = (math::Size_t<dim>) devBuffer.getDataSpace();
+            if(dim == 2)
+                this->pitch[0] = cuplaData.pitch;
+            if(dim == 3)
+            {
+                this->pitch[0] = cuplaData.pitch;
+                this->pitch[1] = cuplaData.pitch * this->_size.y();
+            }
+        }
 
-template<typename Type, int dim>
-template<typename _Type>
-PseudoBuffer<Type, dim>::PseudoBuffer(pmacc::HostBuffer<_Type, dim>& hostBuffer)
-{
-    this->dataPointer = (Type*)hostBuffer.getBasePointer();
-    this->_size = (math::Size_t<dim>)hostBuffer.getDataSpace();
-    if(dim == 2) this->pitch[0] = sizeof(Type) * this->_size[0];
-    if(dim == 3)
-    {
-        this->pitch[0] = sizeof(Type) * this->_size[0];
-        this->pitch[1] = this->pitch[0] * this->_size[1];
-    }
-}
+        template<typename Type, int dim>
+        template<typename _Type>
+        PseudoBuffer<Type, dim>::PseudoBuffer(pmacc::HostBuffer<_Type, dim>& hostBuffer)
+        {
+            this->dataPointer = (Type*) hostBuffer.getBasePointer();
+            this->_size = (math::Size_t<dim>) hostBuffer.getDataSpace();
+            if(dim == 2)
+                this->pitch[0] = sizeof(Type) * this->_size[0];
+            if(dim == 3)
+            {
+                this->pitch[0] = sizeof(Type) * this->_size[0];
+                this->pitch[1] = this->pitch[0] * this->_size[1];
+            }
+        }
 
-} // container
-} // pmacc
+    } // namespace container
+} // namespace pmacc
diff --git a/include/pmacc/cuSTL/container/allocator/DeviceMemAllocator.hpp b/include/pmacc/cuSTL/container/allocator/DeviceMemAllocator.hpp
index 8f2b601092..17aa2025f5 100644
--- a/include/pmacc/cuSTL/container/allocator/DeviceMemAllocator.hpp
+++ b/include/pmacc/cuSTL/container/allocator/DeviceMemAllocator.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera
+/* Copyright 2013-2021 Heiko Burau, Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -29,41 +29,37 @@
 
 namespace pmacc
 {
-namespace allocator
-{
+    namespace allocator
+    {
+        template<typename Type, int T_dim>
+        struct DeviceMemAllocator
+        {
+            typedef Type type;
+            static constexpr int dim = T_dim;
+            typedef cursor::BufferCursor<type, dim> Cursor;
+            typedef allocator::tag::device tag;
 
-template<typename Type, int T_dim>
-struct DeviceMemAllocator
-{
-    typedef Type type;
-    static constexpr int dim = T_dim;
-    typedef cursor::BufferCursor<type, dim> Cursor;
-    typedef allocator::tag::device tag;
+            HDINLINE
+            static cursor::BufferCursor<type, T_dim> allocate(const math::Size_t<T_dim>& size);
+            template<typename TCursor>
+            HDINLINE static void deallocate(const TCursor& cursor);
+        };
 
-    HDINLINE
-    static cursor::BufferCursor<type, T_dim> allocate(const math::Size_t<T_dim>& size);
-    template<typename TCursor>
-    HDINLINE
-    static void deallocate(const TCursor& cursor);
-};
+        template<typename Type>
+        struct DeviceMemAllocator<Type, 1>
+        {
+            typedef Type type;
+            static constexpr int dim = 1;
+            typedef cursor::BufferCursor<type, 1> Cursor;
+            typedef allocator::tag::device tag;
 
-template<typename Type>
-struct DeviceMemAllocator<Type, 1>
-{
-    typedef Type type;
-    static constexpr int dim = 1;
-    typedef cursor::BufferCursor<type, 1> Cursor;
-    typedef allocator::tag::device tag;
+            HDINLINE
+            static cursor::BufferCursor<type, 1> allocate(const math::Size_t<1>& size);
+            template<typename TCursor>
+            HDINLINE static void deallocate(const TCursor& cursor);
+        };
 
-    HDINLINE
-    static cursor::BufferCursor<type, 1> allocate(const math::Size_t<1>& size);
-    template<typename TCursor>
-    HDINLINE
-    static void deallocate(const TCursor& cursor);
-};
-
-} // allocator
-} // pmacc
+    } // namespace allocator
+} // namespace pmacc
 
 #include "DeviceMemAllocator.tpp"
-
diff --git a/include/pmacc/cuSTL/container/allocator/DeviceMemAllocator.tpp b/include/pmacc/cuSTL/container/allocator/DeviceMemAllocator.tpp
index 07a8f70aac..939c1a317f 100644
--- a/include/pmacc/cuSTL/container/allocator/DeviceMemAllocator.tpp
+++ b/include/pmacc/cuSTL/container/allocator/DeviceMemAllocator.tpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera, Alexander Grund
+/* Copyright 2013-2021 Heiko Burau, Rene Widera, Alexander Grund
  *
  * This file is part of PMacc.
  *
@@ -23,94 +23,88 @@
 
 namespace pmacc
 {
-namespace allocator
-{
-
-template<typename Type, int T_dim>
-HDINLINE
-cursor::BufferCursor<Type, T_dim>
-DeviceMemAllocator<Type, T_dim>::allocate(const math::Size_t<T_dim>& size)
-{
+    namespace allocator
+    {
+        template<typename Type, int T_dim>
+        HDINLINE cursor::BufferCursor<Type, T_dim> DeviceMemAllocator<Type, T_dim>::allocate(
+            const math::Size_t<T_dim>& size)
+        {
 #ifndef __CUDA_ARCH__
-    Type* dataPointer;
-    math::Size_t<T_dim-1> pitch;
-    cudaPitchedPtr cudaData;
+            Type* dataPointer;
+            math::Size_t<T_dim - 1> pitch;
+            cuplaPitchedPtr cuplaData;
 
-    cudaData.ptr = nullptr;
-    cudaData.pitch = 1;
-    cudaData.xsize = size[0] * sizeof (Type);
-    cudaData.ysize = 1;
+            cuplaData.ptr = nullptr;
+            cuplaData.pitch = 1;
+            cuplaData.xsize = size[0] * sizeof(Type);
+            cuplaData.ysize = 1;
 
-    if (dim == 2u)
-    {
-        cudaData.xsize = size[0] * sizeof (Type);
-        cudaData.ysize = size[1];
-        if(size.productOfComponents())
-            CUDA_CHECK(cudaMallocPitch(&cudaData.ptr, &cudaData.pitch, cudaData.xsize, cudaData.ysize));
-        pitch[0] = cudaData.pitch;
-    }
-    else if (dim == 3u)
-    {
-        cudaExtent extent;
-        extent.width = size[0] * sizeof (Type);
-        extent.height = size[1];
-        extent.depth = size[2];
-        if(size.productOfComponents())
-            CUDA_CHECK(cudaMalloc3D(&cudaData, extent));
-        pitch[0] = cudaData.pitch;
-        pitch[1] = cudaData.pitch * size[1];
-    }
-    dataPointer = (Type*)cudaData.ptr;
+            if(dim == 2u)
+            {
+                cuplaData.xsize = size[0] * sizeof(Type);
+                cuplaData.ysize = size[1];
+                if(size.productOfComponents())
+                    CUDA_CHECK(cuplaMallocPitch(&cuplaData.ptr, &cuplaData.pitch, cuplaData.xsize, cuplaData.ysize));
+                pitch[0] = cuplaData.pitch;
+            }
+            else if(dim == 3u)
+            {
+                cuplaExtent extent;
+                extent.width = size[0] * sizeof(Type);
+                extent.height = size[1];
+                extent.depth = size[2];
+                if(size.productOfComponents())
+                    CUDA_CHECK(cuplaMalloc3D(&cuplaData, extent));
+                pitch[0] = cuplaData.pitch;
+                pitch[1] = cuplaData.pitch * size[1];
+            }
+            dataPointer = (Type*) cuplaData.ptr;
 
-    return cursor::BufferCursor<Type, T_dim>(dataPointer, pitch);
+            return cursor::BufferCursor<Type, T_dim>(dataPointer, pitch);
 #endif
 
 #ifdef __CUDA_ARCH__
-    Type* dataPointer = nullptr;
-    math::Size_t<T_dim-1> pitch;
-    return cursor::BufferCursor<Type, T_dim>(dataPointer, pitch);
+            Type* dataPointer = nullptr;
+            math::Size_t<T_dim - 1> pitch;
+            return cursor::BufferCursor<Type, T_dim>(dataPointer, pitch);
 #endif
-}
+        }
 
-template<typename Type>
-HDINLINE
-cursor::BufferCursor<Type, 1>
-DeviceMemAllocator<Type, 1>::allocate(const math::Size_t<1>& size)
-{
+        template<typename Type>
+        HDINLINE cursor::BufferCursor<Type, 1> DeviceMemAllocator<Type, 1>::allocate(const math::Size_t<1>& size)
+        {
 #ifndef __CUDA_ARCH__
-    Type* dataPointer = nullptr;
+            Type* dataPointer = nullptr;
 
-    if(size[0])
-        CUDA_CHECK(cudaMalloc((void**)&dataPointer, size[0] * sizeof(Type)));
+            if(size[0])
+                CUDA_CHECK(cuplaMalloc((void**) &dataPointer, size[0] * sizeof(Type)));
 
-    return cursor::BufferCursor<Type, 1>(dataPointer, math::Size_t<0>());
+            return cursor::BufferCursor<Type, 1>(dataPointer, math::Size_t<0>());
 #endif
 
 #ifdef __CUDA_ARCH__
-    Type* dataPointer = nullptr;
-    return cursor::BufferCursor<Type, 1>(dataPointer, math::Size_t<0>());
+            Type* dataPointer = nullptr;
+            return cursor::BufferCursor<Type, 1>(dataPointer, math::Size_t<0>());
 #endif
-}
+        }
 
-template<typename Type, int T_dim>
-template<typename TCursor>
-HDINLINE
-void DeviceMemAllocator<Type, T_dim>::deallocate(const TCursor& cursor)
-{
+        template<typename Type, int T_dim>
+        template<typename TCursor>
+        HDINLINE void DeviceMemAllocator<Type, T_dim>::deallocate(const TCursor& cursor)
+        {
 #ifndef __CUDA_ARCH__
-    CUDA_CHECK(cudaFree(cursor.getMarker()));
+            CUDA_CHECK(cuplaFree(cursor.getMarker()));
 #endif
-}
+        }
 
-template<typename Type>
-template<typename TCursor>
-HDINLINE
-void DeviceMemAllocator<Type, 1>::deallocate(const TCursor& cursor)
-{
+        template<typename Type>
+        template<typename TCursor>
+        HDINLINE void DeviceMemAllocator<Type, 1>::deallocate(const TCursor& cursor)
+        {
 #ifndef __CUDA_ARCH__
-    CUDA_CHECK(cudaFree(cursor.getMarker()));
+            CUDA_CHECK(cuplaFree(cursor.getMarker()));
 #endif
-}
+        }
 
-} // allocator
-} // pmacc
+    } // namespace allocator
+} // namespace pmacc
diff --git a/include/pmacc/cuSTL/container/allocator/DeviceMemEvenPitchAllocator.hpp b/include/pmacc/cuSTL/container/allocator/DeviceMemEvenPitchAllocator.hpp
index d13fde5400..ea86594d9f 100644
--- a/include/pmacc/cuSTL/container/allocator/DeviceMemEvenPitchAllocator.hpp
+++ b/include/pmacc/cuSTL/container/allocator/DeviceMemEvenPitchAllocator.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera
+/* Copyright 2013-2021 Heiko Burau, Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -29,37 +29,35 @@
 
 namespace pmacc
 {
-namespace allocator
-{
-
-template<typename Type, int T_dim>
-struct DeviceMemEvenPitch
-{
-    typedef Type type;
-    static constexpr int dim = T_dim;
-    typedef cursor::BufferCursor<type, dim> Cursor;
-    typedef allocator::tag::device tag;
-
-    static cursor::BufferCursor<type, T_dim> allocate(const math::Size_t<T_dim>& size);
-    template<typename TCursor>
-    static void deallocate(const TCursor& cursor);
-};
-
-template<typename Type>
-struct DeviceMemEvenPitch<Type, 1>
-{
-    typedef Type type;
-    static constexpr int dim = 1;
-    typedef cursor::BufferCursor<type, 1> Cursor;
-    typedef allocator::tag::device tag;
-
-    static cursor::BufferCursor<type, 1> allocate(const math::Size_t<1>& size);
-    template<typename TCursor>
-    static void deallocate(const TCursor& cursor);
-};
-
-} // allocator
-} // pmacc
+    namespace allocator
+    {
+        template<typename Type, int T_dim>
+        struct DeviceMemEvenPitch
+        {
+            typedef Type type;
+            static constexpr int dim = T_dim;
+            typedef cursor::BufferCursor<type, dim> Cursor;
+            typedef allocator::tag::device tag;
+
+            static cursor::BufferCursor<type, T_dim> allocate(const math::Size_t<T_dim>& size);
+            template<typename TCursor>
+            static void deallocate(const TCursor& cursor);
+        };
+
+        template<typename Type>
+        struct DeviceMemEvenPitch<Type, 1>
+        {
+            typedef Type type;
+            static constexpr int dim = 1;
+            typedef cursor::BufferCursor<type, 1> Cursor;
+            typedef allocator::tag::device tag;
+
+            static cursor::BufferCursor<type, 1> allocate(const math::Size_t<1>& size);
+            template<typename TCursor>
+            static void deallocate(const TCursor& cursor);
+        };
+
+    } // namespace allocator
+} // namespace pmacc
 
 #include "DeviceMemEvenPitchAllocator.tpp"
-
diff --git a/include/pmacc/cuSTL/container/allocator/DeviceMemEvenPitchAllocator.tpp b/include/pmacc/cuSTL/container/allocator/DeviceMemEvenPitchAllocator.tpp
index 49fd3d88a9..d3de542aef 100644
--- a/include/pmacc/cuSTL/container/allocator/DeviceMemEvenPitchAllocator.tpp
+++ b/include/pmacc/cuSTL/container/allocator/DeviceMemEvenPitchAllocator.tpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera, Alexander Grund
+/* Copyright 2013-2021 Heiko Burau, Rene Widera, Alexander Grund
  *
  * This file is part of PMacc.
  *
@@ -23,57 +23,54 @@
 
 namespace pmacc
 {
-namespace allocator
-{
+    namespace allocator
+    {
+        template<typename Type, int T_dim>
+        cursor::BufferCursor<Type, T_dim> DeviceMemEvenPitch<Type, T_dim>::allocate(const math::Size_t<T_dim>& size)
+        {
+            Type* dataPointer = nullptr;
+            math::Size_t<T_dim - 1> pitch;
 
-template<typename Type, int T_dim>
-cursor::BufferCursor<Type, T_dim>
-DeviceMemEvenPitch<Type, T_dim>::allocate(const math::Size_t<T_dim>& size)
-{
-    Type* dataPointer = nullptr;
-    math::Size_t<T_dim-1> pitch;
+            if(size.productOfComponents())
+                CUDA_CHECK(cuplaMalloc((void**) &dataPointer, sizeof(Type) * size.productOfComponents()));
 
-    if(size.productOfComponents())
-        CUDA_CHECK(cudaMalloc((void**)&dataPointer, sizeof(Type) * size.productOfComponents()));
+            if(dim == 2u)
+            {
+                pitch[0] = sizeof(Type) * size[0];
+            }
+            else if(dim == 3u)
+            {
+                pitch[0] = sizeof(Type) * size[0];
+                pitch[1] = pitch[0] * size[1];
+            }
 
-    if (dim == 2u)
-    {
-        pitch[0] = sizeof(Type) * size[0];
-    }
-    else if (dim == 3u)
-    {
-        pitch[0] = sizeof(Type) * size[0];
-        pitch[1] = pitch[0] * size[1];
-    }
+            return cursor::BufferCursor<Type, T_dim>(dataPointer, pitch);
+        }
 
-    return cursor::BufferCursor<Type, T_dim>(dataPointer, pitch);
-}
+        template<typename Type>
+        cursor::BufferCursor<Type, 1> DeviceMemEvenPitch<Type, 1>::allocate(const math::Size_t<1>& size)
+        {
+            Type* dataPointer = nullptr;
 
-template<typename Type>
-cursor::BufferCursor<Type, 1>
-DeviceMemEvenPitch<Type, 1>::allocate(const math::Size_t<1>& size)
-{
-    Type* dataPointer = nullptr;
-
-    if(size.productOfComponents())
-        CUDA_CHECK(cudaMalloc((void**)&dataPointer, size[0] * sizeof(Type)));
+            if(size.productOfComponents())
+                CUDA_CHECK(cuplaMalloc((void**) &dataPointer, size[0] * sizeof(Type)));
 
-    return cursor::BufferCursor<Type, 1>(dataPointer, math::Size_t<0>());
-}
+            return cursor::BufferCursor<Type, 1>(dataPointer, math::Size_t<0>());
+        }
 
-template<typename Type, int T_dim>
-template<typename TCursor>
-void DeviceMemEvenPitch<Type, T_dim>::deallocate(const TCursor& cursor)
-{
-    CUDA_CHECK(cudaFree(cursor.getMarker()));
-}
+        template<typename Type, int T_dim>
+        template<typename TCursor>
+        void DeviceMemEvenPitch<Type, T_dim>::deallocate(const TCursor& cursor)
+        {
+            CUDA_CHECK(cuplaFree(cursor.getMarker()));
+        }
 
-template<typename Type>
-template<typename TCursor>
-void DeviceMemEvenPitch<Type, 1>::deallocate(const TCursor& cursor)
-{
-    CUDA_CHECK(cudaFree(cursor.getMarker()));
-}
+        template<typename Type>
+        template<typename TCursor>
+        void DeviceMemEvenPitch<Type, 1>::deallocate(const TCursor& cursor)
+        {
+            CUDA_CHECK(cuplaFree(cursor.getMarker()));
+        }
 
-} // allocator
-} // pmacc
+    } // namespace allocator
+} // namespace pmacc
diff --git a/include/pmacc/cuSTL/container/allocator/EmptyAllocator.hpp b/include/pmacc/cuSTL/container/allocator/EmptyAllocator.hpp
index 4bc0bfea95..4c0879201d 100644
--- a/include/pmacc/cuSTL/container/allocator/EmptyAllocator.hpp
+++ b/include/pmacc/cuSTL/container/allocator/EmptyAllocator.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera
+/* Copyright 2013-2021 Heiko Burau, Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -28,18 +28,17 @@
 
 namespace pmacc
 {
-namespace allocator
-{
-
-struct EmptyAllocator
-{
-    typedef allocator::tag::unspecified tag;
-
-    template<typename TCursor>
-    HDINLINE
-    static void deallocate(const TCursor&) {}
-};
-
-} // allocator
-} // pmacc
-
+    namespace allocator
+    {
+        struct EmptyAllocator
+        {
+            typedef allocator::tag::unspecified tag;
+
+            template<typename TCursor>
+            HDINLINE static void deallocate(const TCursor&)
+            {
+            }
+        };
+
+    } // namespace allocator
+} // namespace pmacc
diff --git a/include/pmacc/cuSTL/container/allocator/HostMemAllocator.hpp b/include/pmacc/cuSTL/container/allocator/HostMemAllocator.hpp
index 6a19edff95..1d54069392 100644
--- a/include/pmacc/cuSTL/container/allocator/HostMemAllocator.hpp
+++ b/include/pmacc/cuSTL/container/allocator/HostMemAllocator.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera
+/* Copyright 2013-2021 Heiko Burau, Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -31,41 +31,37 @@
 
 namespace pmacc
 {
-namespace allocator
-{
+    namespace allocator
+    {
+        template<typename Type, int T_dim>
+        struct HostMemAllocator
+        {
+            typedef Type type;
+            static constexpr int dim = T_dim;
+            typedef cursor::BufferCursor<type, T_dim> Cursor;
+            typedef allocator::tag::host tag;
 
-template<typename Type, int T_dim>
-struct HostMemAllocator
-{
-    typedef Type type;
-    static constexpr int dim = T_dim;
-    typedef cursor::BufferCursor<type, T_dim> Cursor;
-    typedef allocator::tag::host tag;
+            HDINLINE
+            static cursor::BufferCursor<type, T_dim> allocate(const math::Size_t<T_dim>& size);
+            template<typename TCursor>
+            HDINLINE static void deallocate(const TCursor& cursor);
+        };
 
-    HDINLINE
-    static cursor::BufferCursor<type, T_dim> allocate(const math::Size_t<T_dim>& size);
-    template<typename TCursor>
-    HDINLINE
-    static void deallocate(const TCursor& cursor);
-};
+        template<typename Type>
+        struct HostMemAllocator<Type, 1>
+        {
+            typedef Type type;
+            static constexpr int dim = 1;
+            typedef cursor::BufferCursor<type, 1> Cursor;
+            typedef allocator::tag::host tag;
 
-template<typename Type>
-struct HostMemAllocator<Type, 1>
-{
-    typedef Type type;
-    static constexpr int dim = 1;
-    typedef cursor::BufferCursor<type, 1> Cursor;
-    typedef allocator::tag::host tag;
+            HDINLINE
+            static cursor::BufferCursor<type, 1> allocate(const math::Size_t<1>& size);
+            template<typename TCursor>
+            HDINLINE static void deallocate(const TCursor& cursor);
+        };
 
-    HDINLINE
-    static cursor::BufferCursor<type, 1> allocate(const math::Size_t<1>& size);
-    template<typename TCursor>
-    HDINLINE
-    static void deallocate(const TCursor& cursor);
-};
-
-} // allocator
-} // pmacc
+    } // namespace allocator
+} // namespace pmacc
 
 #include "HostMemAllocator.tpp"
-
diff --git a/include/pmacc/cuSTL/container/allocator/HostMemAllocator.tpp b/include/pmacc/cuSTL/container/allocator/HostMemAllocator.tpp
index be90462acb..2a9fdbb7b8 100644
--- a/include/pmacc/cuSTL/container/allocator/HostMemAllocator.tpp
+++ b/include/pmacc/cuSTL/container/allocator/HostMemAllocator.tpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera, Alexander Grund
+/* Copyright 2013-2021 Heiko Burau, Rene Widera, Alexander Grund
  *
  * This file is part of PMacc.
  *
@@ -23,81 +23,75 @@
 
 namespace pmacc
 {
-namespace allocator
-{
-
-template<typename Type, int T_dim>
-HDINLINE
-cursor::BufferCursor<Type, T_dim>
-HostMemAllocator<Type, T_dim>::allocate(const math::Size_t<T_dim>& size)
-{
+    namespace allocator
+    {
+        template<typename Type, int T_dim>
+        HDINLINE cursor::BufferCursor<Type, T_dim> HostMemAllocator<Type, T_dim>::allocate(
+            const math::Size_t<T_dim>& size)
+        {
 #ifndef __CUDA_ARCH__
-    Type* dataPointer = nullptr;
-    math::Size_t<T_dim-1> pitch;
+            Type* dataPointer = nullptr;
+            math::Size_t<T_dim - 1> pitch;
 
-    if(size.productOfComponents())
-        CUDA_CHECK(cudaMallocHost((void**)&dataPointer, sizeof(Type) * size.productOfComponents()));
-    if(dim == 2u)
-    {
-        pitch[0] = size[0] * sizeof(Type);
-    }
-    else if(dim == 3u)
-    {
-        pitch[0] = size[0] * sizeof(Type);
-        pitch[1] = pitch[0] * size[1];
-    }
+            if(size.productOfComponents())
+                CUDA_CHECK(cuplaMallocHost((void**) &dataPointer, sizeof(Type) * size.productOfComponents()));
+            if(dim == 2u)
+            {
+                pitch[0] = size[0] * sizeof(Type);
+            }
+            else if(dim == 3u)
+            {
+                pitch[0] = size[0] * sizeof(Type);
+                pitch[1] = pitch[0] * size[1];
+            }
 
-    return cursor::BufferCursor<Type, T_dim>(dataPointer, pitch);
+            return cursor::BufferCursor<Type, T_dim>(dataPointer, pitch);
 #endif
 
 #ifdef __CUDA_ARCH__
-    Type* dataPointer = nullptr;
-    math::Size_t<T_dim-1> pitch;
-    return cursor::BufferCursor<Type, T_dim>(dataPointer, pitch);
+            Type* dataPointer = nullptr;
+            math::Size_t<T_dim - 1> pitch;
+            return cursor::BufferCursor<Type, T_dim>(dataPointer, pitch);
 #endif
-}
+        }
 
-template<typename Type>
-HDINLINE
-cursor::BufferCursor<Type, 1>
-HostMemAllocator<Type, 1>::allocate(const math::Size_t<1>& size)
-{
+        template<typename Type>
+        HDINLINE cursor::BufferCursor<Type, 1> HostMemAllocator<Type, 1>::allocate(const math::Size_t<1>& size)
+        {
 #ifndef __CUDA_ARCH__
-    Type* dataPointer = nullptr;
-    math::Size_t<0> pitch;
+            Type* dataPointer = nullptr;
+            math::Size_t<0> pitch;
 
-    if(size.productOfComponents())
-        CUDA_CHECK(cudaMallocHost((void**)&dataPointer, sizeof(Type) * size.productOfComponents()));
+            if(size.productOfComponents())
+                CUDA_CHECK(cuplaMallocHost((void**) &dataPointer, sizeof(Type) * size.productOfComponents()));
 
-    return cursor::BufferCursor<Type, 1>(dataPointer, pitch);
+            return cursor::BufferCursor<Type, 1>(dataPointer, pitch);
 #endif
 
 #ifdef __CUDA_ARCH__
-    Type* dataPointer = nullptr;
-    math::Size_t<0> pitch;
-    return cursor::BufferCursor<Type, 1>(dataPointer, pitch);
+            Type* dataPointer = nullptr;
+            math::Size_t<0> pitch;
+            return cursor::BufferCursor<Type, 1>(dataPointer, pitch);
 #endif
-}
+        }
 
-template<typename Type, int T_dim>
-template<typename TCursor>
-HDINLINE
-void HostMemAllocator<Type, T_dim>::deallocate(const TCursor& cursor)
-{
+        template<typename Type, int T_dim>
+        template<typename TCursor>
+        HDINLINE void HostMemAllocator<Type, T_dim>::deallocate(const TCursor& cursor)
+        {
 #ifndef __CUDA_ARCH__
-    CUDA_CHECK(cudaFreeHost(cursor.getMarker()));
+            CUDA_CHECK(cuplaFreeHost(cursor.getMarker()));
 #endif
-}
+        }
 
-template<typename Type>
-template<typename TCursor>
-HDINLINE
-void HostMemAllocator<Type, 1>::deallocate(const TCursor& cursor)
-{
+        template<typename Type>
+        template<typename TCursor>
+        HDINLINE void HostMemAllocator<Type, 1>::deallocate(const TCursor& cursor)
+        {
 #ifndef __CUDA_ARCH__
-    CUDA_CHECK(cudaFreeHost(cursor.getMarker()));
+            CUDA_CHECK(cuplaFreeHost(cursor.getMarker()));
 #endif
-}
+        }
 
-} // allocator
-} // pmacc
+    } // namespace allocator
+} // namespace pmacc
diff --git a/include/pmacc/cuSTL/container/allocator/compile-time/SharedMemAllocator.hpp b/include/pmacc/cuSTL/container/allocator/compile-time/SharedMemAllocator.hpp
index b408f37907..c42308973a 100644
--- a/include/pmacc/cuSTL/container/allocator/compile-time/SharedMemAllocator.hpp
+++ b/include/pmacc/cuSTL/container/allocator/compile-time/SharedMemAllocator.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera
+/* Copyright 2013-2021 Heiko Burau, Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -30,81 +30,65 @@
 
 namespace pmacc
 {
-namespace allocator
-{
-namespace CT
-{
-template<typename Type, typename Size, int dim = Size::dim, int uid = 0>
-struct SharedMemAllocator;
-
-template<typename Type, typename Size, int uid>
-struct SharedMemAllocator<Type, Size, 1, uid>
-{
-    typedef Type type;
-    typedef math::CT::UInt32<> Pitch;
-    static constexpr int dim = 1;
-    typedef cursor::CT::BufferCursor<type, math::CT::UInt32<> > Cursor;
-
-    template< typename T_Acc >
-    DEVICEONLY static Cursor allocate( T_Acc const & acc )
+    namespace allocator
     {
-        auto& shMem = pmacc::memory::shared::allocate<
-            uid,
-            memory::Array<
-                Type,
-                math::CT::volume< Size >::type::value
-            >
-        >( acc );
-        return Cursor(shMem.data());
-    }
-};
+        namespace CT
+        {
+            template<typename Type, typename Size, int dim = Size::dim, int uid = 0>
+            struct SharedMemAllocator;
 
-template<typename Type, typename Size, int uid>
-struct SharedMemAllocator<Type, Size, 2, uid>
-{
-    typedef Type type;
-    typedef math::CT::UInt32<sizeof(Type) * Size::x::value> Pitch;
-    static constexpr int dim = 2;
-    typedef cursor::CT::BufferCursor<type, Pitch> Cursor;
+            template<typename Type, typename Size, int uid>
+            struct SharedMemAllocator<Type, Size, 1, uid>
+            {
+                typedef Type type;
+                typedef math::CT::UInt32<> Pitch;
+                static constexpr int dim = 1;
+                typedef cursor::CT::BufferCursor<type, math::CT::UInt32<>> Cursor;
 
-    template< typename T_Acc >
-    DEVICEONLY static Cursor allocate( T_Acc const & acc )
-    {
-        auto& shMem = pmacc::memory::shared::allocate<
-            uid,
-            memory::Array<
-                Type,
-                math::CT::volume< Size >::type::value
-            >
-        >( acc );
-        return Cursor(shMem.data());
-    }
-};
+                template<typename T_Acc>
+                DINLINE static Cursor allocate(T_Acc const& acc)
+                {
+                    auto& shMem = pmacc::memory::shared::
+                        allocate<uid, memory::Array<Type, math::CT::volume<Size>::type::value>>(acc);
+                    return Cursor(shMem.data());
+                }
+            };
 
-template<typename Type, typename Size, int uid>
-struct SharedMemAllocator<Type, Size, 3, uid>
-{
-    typedef Type type;
-    typedef math::CT::UInt32<sizeof(Type) * Size::x::value,
-                             sizeof(Type) * Size::x::value * Size::y::value> Pitch;
-    static constexpr int dim = 3;
-    typedef cursor::CT::BufferCursor<type, Pitch> Cursor;
+            template<typename Type, typename Size, int uid>
+            struct SharedMemAllocator<Type, Size, 2, uid>
+            {
+                typedef Type type;
+                typedef math::CT::UInt32<sizeof(Type) * Size::x::value> Pitch;
+                static constexpr int dim = 2;
+                typedef cursor::CT::BufferCursor<type, Pitch> Cursor;
 
-    template< typename T_Acc >
-    DEVICEONLY static Cursor allocate( T_Acc const & acc )
-    {
-        auto& shMem = pmacc::memory::shared::allocate<
-            uid,
-            memory::Array<
-                Type,
-                math::CT::volume< Size >::type::value
-            >
-        >( acc );
-        return Cursor(shMem.data());
-    }
-};
+                template<typename T_Acc>
+                DINLINE static Cursor allocate(T_Acc const& acc)
+                {
+                    auto& shMem = pmacc::memory::shared::
+                        allocate<uid, memory::Array<Type, math::CT::volume<Size>::type::value>>(acc);
+                    return Cursor(shMem.data());
+                }
+            };
+
+            template<typename Type, typename Size, int uid>
+            struct SharedMemAllocator<Type, Size, 3, uid>
+            {
+                typedef Type type;
+                typedef math::CT::UInt32<sizeof(Type) * Size::x::value, sizeof(Type) * Size::x::value * Size::y::value>
+                    Pitch;
+                static constexpr int dim = 3;
+                typedef cursor::CT::BufferCursor<type, Pitch> Cursor;
 
-} // CT
-} // allocator
-} // pmacc
+                template<typename T_Acc>
+                DINLINE static Cursor allocate(T_Acc const& acc)
+                {
+                    auto& shMem = pmacc::memory::shared::
+                        allocate<uid, memory::Array<Type, math::CT::volume<Size>::type::value>>(acc);
+                    return Cursor(shMem.data());
+                }
+            };
 
+        } // namespace CT
+    } // namespace allocator
+} // namespace pmacc
diff --git a/include/pmacc/cuSTL/container/allocator/tag.hpp b/include/pmacc/cuSTL/container/allocator/tag.hpp
index b5c3ce63a3..482adb9c01 100644
--- a/include/pmacc/cuSTL/container/allocator/tag.hpp
+++ b/include/pmacc/cuSTL/container/allocator/tag.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera
+/* Copyright 2013-2021 Heiko Burau, Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -24,15 +24,15 @@
 
 namespace pmacc
 {
-namespace allocator
-{
-namespace tag
-{
-struct host;
-struct device;
-struct unspecified;
-} // tag
-} // allocator
-} // pmacc
+    namespace allocator
+    {
+        namespace tag
+        {
+            struct host;
+            struct device;
+            struct unspecified;
+        } // namespace tag
+    } // namespace allocator
+} // namespace pmacc
 
 #endif // ALLOCATOR_TAG_H
diff --git a/include/pmacc/cuSTL/container/assigner/DeviceMemAssigner.hpp b/include/pmacc/cuSTL/container/assigner/DeviceMemAssigner.hpp
index 23ab1aa4df..6440401328 100644
--- a/include/pmacc/cuSTL/container/assigner/DeviceMemAssigner.hpp
+++ b/include/pmacc/cuSTL/container/assigner/DeviceMemAssigner.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera, Benjamin Worpitz,
+/* Copyright 2013-2021 Heiko Burau, Rene Widera, Benjamin Worpitz,
  *                     Alexander Grund
  *
  * This file is part of PMacc.
@@ -38,42 +38,42 @@
 
 namespace pmacc
 {
-namespace assigner
-{
-
-namespace bmpl = boost::mpl;
+    namespace assigner
+    {
+        namespace bmpl = boost::mpl;
 
-template<typename T_Dim = bmpl::_1, typename T_CartBuffer = bmpl::_2>
-struct DeviceMemAssigner
-{
-    static constexpr int dim = T_Dim::value;
-    typedef T_CartBuffer CartBuffer;
+        template<typename T_Dim = bmpl::_1, typename T_CartBuffer = bmpl::_2>
+        struct DeviceMemAssigner
+        {
+            static constexpr int dim = T_Dim::value;
+            typedef T_CartBuffer CartBuffer;
 
-    template<typename Type>
-    HINLINE void assign(const Type& value)
-    {
-        // "Curiously recurring template pattern"
-        CartBuffer* buffer = static_cast<CartBuffer*>(this);
+            template<typename Type>
+            HINLINE void assign(const Type& value)
+            {
+                // "Curiously recurring template pattern"
+                CartBuffer* buffer = static_cast<CartBuffer*>(this);
 
-        zone::SphericZone<dim> myZone(buffer->size());
-        cursor::BufferCursor<Type, dim> cursor(buffer->dataPointer, buffer->pitch);
+                zone::SphericZone<dim> myZone(buffer->size());
+                cursor::BufferCursor<Type, dim> cursor(buffer->dataPointer, buffer->pitch);
 
-        /* The greatest common divisor of each component of the volume size
-         * and a certain power of two value gives the best suitable block size */
-        math::Size_t<3> blockSize(math::Size_t<3>::create(1));
-        size_t maxValues[] = {16, 16, 4}; // maximum values for each dimension
-        for(int i = 0; i < dim; i++)
-        {
-            blockSize[i] = boost::integer::gcd(buffer->size()[i], maxValues[dim-1]);
-        }
-        /* the maximum number of threads per block for devices with
-         * compute capability > 2.0 is 1024 */
-        PMACC_VERIFY(blockSize.productOfComponents() <= 1024);
+                /* The greatest common divisor of each component of the volume size
+                 * and a certain power-of-two value gives the most suitable block size */
+                math::Size_t<3> blockSize(math::Size_t<3>::create(1));
+                size_t maxValues[] = {16, 16, 4}; // maximum values for each dimension
+                for(int i = 0; i < dim; i++)
+                {
+                    blockSize[i] = boost::integer::gcd(buffer->size()[i], maxValues[dim - 1]);
+                }
+                /* the maximum number of threads per block for devices with
+                 * compute capability >= 2.0 is 1024 */
+                PMACC_VERIFY(blockSize.productOfComponents() <= 1024);
 
-        algorithm::kernel::RT::Foreach foreach(blockSize);
-        foreach(myZone, cursor, pmacc::algorithm::functor::AssignValue<Type>(value));
-    }
-};
+                algorithm::kernel::RT::Foreach foreach(blockSize);
+                foreach(myZone, cursor, pmacc::algorithm::functor::AssignValue<Type>(value))
+                    ;
+            }
+        };
 
-} // assigner
-} // pmacc
+    } // namespace assigner
+} // namespace pmacc
diff --git a/include/pmacc/cuSTL/container/assigner/HostMemAssigner.hpp b/include/pmacc/cuSTL/container/assigner/HostMemAssigner.hpp
index a296247f76..c7a52f6845 100644
--- a/include/pmacc/cuSTL/container/assigner/HostMemAssigner.hpp
+++ b/include/pmacc/cuSTL/container/assigner/HostMemAssigner.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera
+/* Copyright 2013-2021 Heiko Burau, Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -33,31 +33,30 @@
 
 namespace pmacc
 {
-namespace assigner
-{
-
-namespace bmpl = boost::mpl;
-
-template<typename T_Dim = bmpl::_1, typename T_CartBuffer = bmpl::_2>
-struct HostMemAssigner
-{
-    static constexpr int dim = T_Dim::value;
-    typedef T_CartBuffer CartBuffer;
-
-    template<typename Type>
-    HINLINE void assign(const Type& value)
+    namespace assigner
     {
-        // "Curiously recurring template pattern"
-        CartBuffer* buffer = static_cast<CartBuffer*>(this);
-
-        // get a host accelerator
-        auto hostDev = cupla::manager::Device< cupla::AccHost >::get().device( );
-
-        algorithm::host::Foreach foreach;
-        foreach(hostDev, buffer->zone(), buffer->origin(), pmacc::algorithm::functor::AssignValue<Type>(value));
-    }
-};
-
-} // assigner
-} // pmacc
-
+        namespace bmpl = boost::mpl;
+
+        template<typename T_Dim = bmpl::_1, typename T_CartBuffer = bmpl::_2>
+        struct HostMemAssigner
+        {
+            static constexpr int dim = T_Dim::value;
+            typedef T_CartBuffer CartBuffer;
+
+            template<typename Type>
+            HINLINE void assign(const Type& value)
+            {
+                // "Curiously recurring template pattern"
+                CartBuffer* buffer = static_cast<CartBuffer*>(this);
+
+                // get a host accelerator
+                auto hostDev = cupla::manager::Device<cupla::AccHost>::get().device();
+
+                algorithm::host::Foreach foreach;
+                foreach(hostDev, buffer->zone(), buffer->origin(), pmacc::algorithm::functor::AssignValue<Type>(value))
+                    ;
+            }
+        };
+
+    } // namespace assigner
+} // namespace pmacc
diff --git a/include/pmacc/cuSTL/container/compile-time/CartBuffer.hpp b/include/pmacc/cuSTL/container/compile-time/CartBuffer.hpp
index 8858722587..cab81514e9 100644
--- a/include/pmacc/cuSTL/container/compile-time/CartBuffer.hpp
+++ b/include/pmacc/cuSTL/container/compile-time/CartBuffer.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera
+/* Copyright 2013-2021 Heiko Burau, Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -30,51 +30,59 @@
 
 namespace pmacc
 {
-namespace container
-{
-namespace CT
-{
+    namespace container
+    {
+        namespace CT
+        {
+            /** compile-time version of container::CartBuffer
+             * \tparam _Size compile-time vector specifying the size of the container
+             */
+            template<typename Type, typename _Size, typename Allocator, typename Copier, typename Assigner>
+            class CartBuffer
+            {
+            public:
+                typedef Type type;
+                typedef _Size Size;
+                typedef typename Allocator::Pitch Pitch;
+                typedef cursor::CT::BufferCursor<Type, Pitch> Cursor;
+                static constexpr int dim = Size::dim;
+                typedef zone::CT::SphericZone<_Size, typename math::CT::make_Int<dim, 0>::type> Zone;
 
-/** compile-time version of container::CartBuffer
- * \tparam _Size compile-time vector specifying the size of the container
- */
-template<typename Type, typename _Size, typename Allocator, typename Copier, typename Assigner>
-class CartBuffer
-{
-public:
-    typedef Type type;
-    typedef _Size Size;
-    typedef typename Allocator::Pitch Pitch;
-    typedef cursor::CT::BufferCursor<Type, Pitch> Cursor;
-    static constexpr int dim = Size::dim;
-    typedef zone::CT::SphericZone<_Size, typename math::CT::make_Int<dim, 0>::type> Zone;
-private:
-    Type* dataPointer;
-    //HDINLINE void init();
-public:
-    template< typename T_Acc >
-    DINLINE CartBuffer( T_Acc const & acc );
-    DINLINE CartBuffer(const CT::CartBuffer<Type, Size, Allocator, Copier, Assigner>& other);
+            private:
+                Type* dataPointer;
+                // HDINLINE void init();
+            public:
+                template<typename T_Acc>
+                DINLINE CartBuffer(T_Acc const& acc);
+                DINLINE CartBuffer(const CT::CartBuffer<Type, Size, Allocator, Copier, Assigner>& other);
 
-    DINLINE CT::CartBuffer<Type, Size, Allocator, Copier, Assigner>&
-    operator=(const CT::CartBuffer<Type, Size, Allocator, Copier, Assigner>& rhs);
+                DINLINE CT::CartBuffer<Type, Size, Allocator, Copier, Assigner>& operator=(
+                    const CT::CartBuffer<Type, Size, Allocator, Copier, Assigner>& rhs);
 
-    DINLINE void assign(const Type& value);
-    DINLINE Type* getDataPointer() const {return dataPointer;}
+                DINLINE void assign(const Type& value);
+                DINLINE Type* getDataPointer() const
+                {
+                    return dataPointer;
+                }
 
-    DINLINE cursor::CT::BufferCursor<Type, Pitch> origin() const;
-    /*
-    HDINLINE Cursor<PointerAccessor<Type>, CartNavigator<dim>, char*>
-    originCustomAxes(const math::UInt32<dim>& axes) const;
-    */
-    DINLINE math::Size_t<dim> size() const {return math::Size_t<dim>(Size());}
+                DINLINE cursor::CT::BufferCursor<Type, Pitch> origin() const;
+                /*
+                HDINLINE Cursor<PointerAccessor<Type>, CartNavigator<dim>, char*>
+                originCustomAxes(const math::UInt32<dim>& axes) const;
+                */
+                DINLINE math::Size_t<dim> size() const
+                {
+                    return math::Size_t<dim>(Size());
+                }
 
-    DINLINE Zone zone() const { return Zone(); }
-};
+                DINLINE Zone zone() const
+                {
+                    return Zone();
+                }
+            };
 
-} // CT
-} // container
-} // pmacc
+        } // namespace CT
+    } // namespace container
+} // namespace pmacc
 
 #include "CartBuffer.tpp"
-
diff --git a/include/pmacc/cuSTL/container/compile-time/CartBuffer.tpp b/include/pmacc/cuSTL/container/compile-time/CartBuffer.tpp
index 8561570cbc..2007ffe994 100644
--- a/include/pmacc/cuSTL/container/compile-time/CartBuffer.tpp
+++ b/include/pmacc/cuSTL/container/compile-time/CartBuffer.tpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera
+/* Copyright 2013-2021 Heiko Burau, Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -23,26 +23,24 @@
 
 namespace pmacc
 {
-namespace container
-{
-namespace CT
-{
+    namespace container
+    {
+        namespace CT
+        {
+            template<typename Type, typename _Size, typename Allocator, typename Copier, typename Assigner>
+            template<typename T_Acc>
+            DINLINE CartBuffer<Type, _Size, Allocator, Copier, Assigner>::CartBuffer(T_Acc const& acc)
+            {
+                this->dataPointer = Allocator::allocate(acc).getMarker();
+            }
 
-template<typename Type, typename _Size, typename Allocator, typename Copier, typename Assigner>
-template< typename T_Acc >
-DINLINE CartBuffer<Type, _Size, Allocator, Copier, Assigner>::CartBuffer( T_Acc const & acc )
-{
-    this->dataPointer = Allocator::allocate( acc ).getMarker();
-}
-
-template<typename Type, typename _Size, typename Allocator, typename Copier, typename Assigner>
-DINLINE
-cursor::CT::BufferCursor<Type, typename Allocator::Pitch>
-CartBuffer<Type, _Size, Allocator, Copier, Assigner>::origin() const
-{
-    return cursor::CT::BufferCursor<Type, Pitch>(this->dataPointer);
-}
+            template<typename Type, typename _Size, typename Allocator, typename Copier, typename Assigner>
+            DINLINE cursor::CT::BufferCursor<Type, typename Allocator::Pitch>
+            CartBuffer<Type, _Size, Allocator, Copier, Assigner>::origin() const
+            {
+                return cursor::CT::BufferCursor<Type, Pitch>(this->dataPointer);
+            }
 
-} // CT
-} // container
-} // pmacc
+        } // namespace CT
+    } // namespace container
+} // namespace pmacc
diff --git a/include/pmacc/cuSTL/container/compile-time/SharedBuffer.hpp b/include/pmacc/cuSTL/container/compile-time/SharedBuffer.hpp
index 80f43f321c..7263813343 100644
--- a/include/pmacc/cuSTL/container/compile-time/SharedBuffer.hpp
+++ b/include/pmacc/cuSTL/container/compile-time/SharedBuffer.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera, Benjamin Worpitz
+/* Copyright 2013-2021 Heiko Burau, Rene Widera, Benjamin Worpitz
  *
  * This file is part of PMacc.
  *
@@ -26,19 +26,18 @@
 
 namespace pmacc
 {
-namespace container
-{
-namespace CT
-{
-
-/* typedef version of container::CT::CartBuffer for shared mem on a GPU inside a cuda kernel.
- * \param uid If two containers in one kernel have the same Type and Size,
- * uid has to be different. This is due to a nvcc bug.
- */
-template<typename Type, typename Size, int uid = 0>
-using SharedBuffer = CT::CartBuffer<Type, Size,
-                         allocator::CT::SharedMemAllocator<Type, Size, Size::dim, uid>, void, void>;
+    namespace container
+    {
+        namespace CT
+        {
+            /* Alias template of container::CT::CartBuffer for shared memory on a GPU inside a cupla kernel.
+             * \tparam uid If two containers in one kernel have the same Type and Size,
+             * their uid values have to differ. This is due to an nvcc bug.
+             */
+            template<typename Type, typename Size, int uid = 0>
+            using SharedBuffer = CT::
+                CartBuffer<Type, Size, allocator::CT::SharedMemAllocator<Type, Size, Size::dim, uid>, void, void>;
 
-} // CT
-} // container
-} // pmacc
+        } // namespace CT
+    } // namespace container
+} // namespace pmacc
diff --git a/include/pmacc/cuSTL/container/copier/D2DCopier.hpp b/include/pmacc/cuSTL/container/copier/D2DCopier.hpp
index d27191d5ff..f9714ff5e1 100644
--- a/include/pmacc/cuSTL/container/copier/D2DCopier.hpp
+++ b/include/pmacc/cuSTL/container/copier/D2DCopier.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera, Benjamin Worpitz,
+/* Copyright 2013-2021 Heiko Burau, Rene Widera, Benjamin Worpitz,
  *                     Alexander Grund
  *
  * This file is part of PMacc.
@@ -30,43 +30,46 @@
 
 namespace pmacc
 {
-namespace copier
-{
-
-template<int T_dim>
-struct D2DCopier
-{
-    static constexpr int dim = T_dim;
-
-    PMACC_NO_NVCC_HDWARNING /* Handled via CUDA_ARCH */
-    template<typename Type>
-    HDINLINE static void copy(Type* dest, const math::Size_t<dim-1>& pitchDest,
-         Type* source, const math::Size_t<dim-1>& pitchSource,
-         const math::Size_t<dim>& size)
+    namespace copier
     {
+        template<int T_dim>
+        struct D2DCopier
+        {
+            static constexpr int dim = T_dim;
+
+            PMACC_NO_NVCC_HDWARNING /* Handled via CUDA_ARCH */
+                template<typename Type>
+                HDINLINE static void copy(
+                    Type* dest,
+                    const math::Size_t<dim - 1>& pitchDest,
+                    Type* source,
+                    const math::Size_t<dim - 1>& pitchSource,
+                    const math::Size_t<dim>& size)
+            {
 #ifdef __CUDA_ARCH__
-        typedef cursor::BufferCursor<Type, dim> Cursor;
-        Cursor bufCursorDest(dest, pitchDest);
-        Cursor bufCursorSrc(source, pitchSource);
-        cursor::MapTo1DNavigator<dim> myNavi(size);
+                typedef cursor::BufferCursor<Type, dim> Cursor;
+                Cursor bufCursorDest(dest, pitchDest);
+                Cursor bufCursorSrc(source, pitchSource);
+                cursor::MapTo1DNavigator<dim> myNavi(size);
 
-        auto srcCursor = cursor::make_Cursor(cursor::CursorAccessor<Cursor>(),
-                                                  myNavi,
-                                                  bufCursorSrc);
-        auto destCursor = cursor::make_Cursor(cursor::CursorAccessor<Cursor>(),
-                                                   myNavi,
-                                                   bufCursorDest);
-        size_t sizeProd = size.productOfComponents();
-        for(size_t i = 0; i < sizeProd; i++)
-        {
-            destCursor[i] = srcCursor[i];
-        }
+                auto srcCursor = cursor::make_Cursor(cursor::CursorAccessor<Cursor>(), myNavi, bufCursorSrc);
+                auto destCursor = cursor::make_Cursor(cursor::CursorAccessor<Cursor>(), myNavi, bufCursorDest);
+                size_t sizeProd = size.productOfComponents();
+                for(size_t i = 0; i < sizeProd; i++)
+                {
+                    destCursor[i] = srcCursor[i];
+                }
 #else
-        cudaWrapper::Memcopy<dim>()(dest, pitchDest, source, pitchSource,
-                                    size, cudaWrapper::flags::Memcopy::deviceToDevice);
+                cuplaWrapper::Memcopy<dim>()(
+                    dest,
+                    pitchDest,
+                    source,
+                    pitchSource,
+                    size,
+                    cuplaWrapper::flags::Memcopy::deviceToDevice);
 #endif
-    }
-};
+            }
+        };
 
-} // copier
-} // pmacc
+    } // namespace copier
+} // namespace pmacc
diff --git a/include/pmacc/cuSTL/container/copier/H2HCopier.hpp b/include/pmacc/cuSTL/container/copier/H2HCopier.hpp
index 666c51fc2f..7ce5ef286e 100644
--- a/include/pmacc/cuSTL/container/copier/H2HCopier.hpp
+++ b/include/pmacc/cuSTL/container/copier/H2HCopier.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera, Benjamin Worpitz,
+/* Copyright 2013-2021 Heiko Burau, Rene Widera, Benjamin Worpitz,
  *                     Alexander Grund
  *
  * This file is part of PMacc.
@@ -27,24 +27,31 @@
 
 namespace pmacc
 {
-namespace copier
-{
-
-template<int T_dim>
-struct H2HCopier
-{
-    static constexpr int dim = T_dim;
-
-    PMACC_NO_NVCC_HDWARNING /* Should never be called from device functions */
-    template<typename Type>
-    HDINLINE static void copy(Type* dest, const math::Size_t<dim-1>& pitchDest,
-         Type* source, const math::Size_t<dim-1>& pitchSource,
-         const math::Size_t<dim>& size)
+    namespace copier
     {
-        cudaWrapper::Memcopy<dim>()(dest, pitchDest, source, pitchSource,
-                                    size, cudaWrapper::flags::Memcopy::hostToHost);
-    }
-};
+        template<int T_dim>
+        struct H2HCopier
+        {
+            static constexpr int dim = T_dim;
+
+            PMACC_NO_NVCC_HDWARNING /* Should never be called from device functions */
+                template<typename Type>
+                HDINLINE static void copy(
+                    Type* dest,
+                    const math::Size_t<dim - 1>& pitchDest,
+                    Type* source,
+                    const math::Size_t<dim - 1>& pitchSource,
+                    const math::Size_t<dim>& size)
+            {
+                cuplaWrapper::Memcopy<dim>()(
+                    dest,
+                    pitchDest,
+                    source,
+                    pitchSource,
+                    size,
+                    cuplaWrapper::flags::Memcopy::hostToHost);
+            }
+        };
 
-} // copier
-} // pmacc
+    } // namespace copier
+} // namespace pmacc
diff --git a/include/pmacc/cuSTL/container/copier/Memcopy.hpp b/include/pmacc/cuSTL/container/copier/Memcopy.hpp
index ebeb161230..6075b21586 100644
--- a/include/pmacc/cuSTL/container/copier/Memcopy.hpp
+++ b/include/pmacc/cuSTL/container/copier/Memcopy.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera, Benjamin Worpitz
+/* Copyright 2013-2021 Heiko Burau, Rene Widera, Benjamin Worpitz
  *
  * This file is part of PMacc.
  *
@@ -26,82 +26,116 @@
 
 namespace pmacc
 {
-namespace cudaWrapper
-{
+    namespace cuplaWrapper
+    {
+        namespace flags
+        {
+            struct Memcopy
+            {
+                enum Direction
+                {
+                    hostToDevice = 0,
+                    deviceToHost,
+                    hostToHost,
+                    deviceToDevice
+                };
+            };
+        } // namespace flags
 
-namespace flags
-{
-struct Memcopy
-{
-    enum Direction {hostToDevice = 0, deviceToHost, hostToHost, deviceToDevice};
-};
-}
+        template<int dim>
+        struct Memcopy;
 
-template<int dim>
-struct Memcopy;
+        template<>
+        struct Memcopy<1>
+        {
+            template<typename Type>
+            void operator()(
+                Type* dest,
+                const math::Size_t<0>,
+                const Type* source,
+                const math::Size_t<0>,
+                const math::Size_t<1>& size,
+                flags::Memcopy::Direction direction)
+            {
+                const cuplaMemcpyKind kind[]
+                    = {cuplaMemcpyHostToDevice,
+                       cuplaMemcpyDeviceToHost,
+                       cuplaMemcpyHostToHost,
+                       cuplaMemcpyDeviceToDevice};
+                CUDA_CHECK(cuplaMemcpy(dest, source, sizeof(Type) * size.x(), kind[direction]));
+            }
+        };
 
-template<>
-struct Memcopy<1>
-{
-    template<typename Type>
-    void operator()(Type* dest, const math::Size_t<0>,
-                    const Type* source, const math::Size_t<0>, const math::Size_t<1>& size,
-                    flags::Memcopy::Direction direction)
-    {
-            const cudaMemcpyKind kind[] = {cudaMemcpyHostToDevice, cudaMemcpyDeviceToHost,
-                                     cudaMemcpyHostToHost, cudaMemcpyDeviceToDevice};
-            CUDA_CHECK(cudaMemcpy(dest, source, sizeof(Type) * size.x(), kind[direction]));
-    }
-};
+        template<>
+        struct Memcopy<2u>
+        {
+            template<typename Type>
+            void operator()(
+                Type* dest,
+                const math::Size_t<1> pitchDest,
+                const Type* source,
+                const math::Size_t<1> pitchSource,
+                const math::Size_t<2u>& size,
+                flags::Memcopy::Direction direction)
+            {
+                const cuplaMemcpyKind kind[]
+                    = {cuplaMemcpyHostToDevice,
+                       cuplaMemcpyDeviceToHost,
+                       cuplaMemcpyHostToHost,
+                       cuplaMemcpyDeviceToDevice};
 
-template<>
-struct Memcopy<2u>
-{
-    template<typename Type>
-    void operator()(Type* dest, const math::Size_t<1> pitchDest,
-                    const Type* source, const math::Size_t<1> pitchSource, const math::Size_t<2u>& size,
-                    flags::Memcopy::Direction direction)
-    {
-            const cudaMemcpyKind kind[] = {cudaMemcpyHostToDevice, cudaMemcpyDeviceToHost,
-                                     cudaMemcpyHostToHost, cudaMemcpyDeviceToDevice};
-
-            CUDA_CHECK(cudaMemcpy2D(dest, pitchDest.x(), source, pitchSource.x(), sizeof(Type) * size.x(), size.y(),
-                         kind[direction]));
-    }
-};
+                CUDA_CHECK(cuplaMemcpy2D(
+                    dest,
+                    pitchDest.x(),
+                    source,
+                    pitchSource.x(),
+                    sizeof(Type) * size.x(),
+                    size.y(),
+                    kind[direction]));
+            }
+        };
 
-template<>
-struct Memcopy<3>
-{
-    template<typename Type>
-    void operator()(Type* dest, const math::Size_t<2u> pitchDest,
-                    Type* source, const math::Size_t<2u> pitchSource, const math::Size_t<3>& size,
-                    flags::Memcopy::Direction direction)
-    {
-            const cudaMemcpyKind kind[] = {cudaMemcpyHostToDevice, cudaMemcpyDeviceToHost,
-                                     cudaMemcpyHostToHost, cudaMemcpyDeviceToDevice};
+        template<>
+        struct Memcopy<3>
+        {
+            template<typename Type>
+            void operator()(
+                Type* dest,
+                const math::Size_t<2u> pitchDest,
+                Type* source,
+                const math::Size_t<2u> pitchSource,
+                const math::Size_t<3>& size,
+                flags::Memcopy::Direction direction)
+            {
+                const cuplaMemcpyKind kind[]
+                    = {cuplaMemcpyHostToDevice,
+                       cuplaMemcpyDeviceToHost,
+                       cuplaMemcpyHostToHost,
+                       cuplaMemcpyDeviceToDevice};
 
-            cudaPitchedPtr pitchedPtrDest;
-            pitchedPtrDest.pitch = pitchDest.x(); pitchedPtrDest.ptr = dest;
-            pitchedPtrDest.xsize = size.x() * sizeof (Type);
-            pitchedPtrDest.ysize = size.y();
-            cudaPitchedPtr pitchedPtrSource;
-            pitchedPtrSource.pitch = pitchSource.x(); pitchedPtrSource.ptr = source;
-            pitchedPtrSource.xsize = size.x() * sizeof (Type);
-            pitchedPtrSource.ysize = size.y();
+                cuplaPitchedPtr pitchedPtrDest;
+                pitchedPtrDest.pitch = pitchDest.x();
+                pitchedPtrDest.ptr = dest;
+                pitchedPtrDest.xsize = size.x() * sizeof(Type);
+                pitchedPtrDest.ysize = size.y();
+                cuplaPitchedPtr pitchedPtrSource;
+                pitchedPtrSource.pitch = pitchSource.x();
+                pitchedPtrSource.ptr = source;
+                pitchedPtrSource.xsize = size.x() * sizeof(Type);
+                pitchedPtrSource.ysize = size.y();
 
-            cudaMemcpy3DParms params;
-            params.srcArray = nullptr;
-            params.srcPos = make_cudaPos(0,0,0);
-            params.srcPtr = pitchedPtrSource;
-            params.dstArray = nullptr;
-            params.dstPos = make_cudaPos(0,0,0);
-            params.dstPtr = pitchedPtrDest;
-            params.extent = make_cudaExtent(size.x() * sizeof(Type), size.y(), size.z());
-            params.kind = kind[direction];
-            CUDA_CHECK(cudaMemcpy3D(&params));
-    }
-};
+                cuplaMemcpy3DParms params;
+                params.srcArray = nullptr;
+                params.srcPos = make_cuplaPos(0, 0, 0);
+                params.srcPtr = pitchedPtrSource;
+                params.dstArray = nullptr;
+                params.dstPos = make_cuplaPos(0, 0, 0);
+                params.dstPtr = pitchedPtrDest;
+                params.extent = make_cuplaExtent(size.x() * sizeof(Type), size.y(), size.z());
+                params.kind = kind[direction];
+                CUDA_CHECK(cuplaMemcpy3D(&params));
+            }
+        };
 
-} // cudaWrapper
-} // pmacc
+    } // namespace cuplaWrapper
+} // namespace pmacc
diff --git a/include/pmacc/cuSTL/container/tag.hpp b/include/pmacc/cuSTL/container/tag.hpp
index a1f493c831..6577f63093 100644
--- a/include/pmacc/cuSTL/container/tag.hpp
+++ b/include/pmacc/cuSTL/container/tag.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera
+/* Copyright 2013-2021 Heiko Burau, Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -24,16 +24,15 @@
 
 namespace pmacc
 {
-namespace container
-{
-
-namespace tag
-{
-struct HostBuffer;
-struct DeviceBuffer;
-}
+    namespace container
+    {
+        namespace tag
+        {
+            struct HostBuffer;
+            struct DeviceBuffer;
+        } // namespace tag
 
-} // container
-} // pmacc
+    } // namespace container
+} // namespace pmacc
 
 #endif // CONTAINER_TAG_H
diff --git a/include/pmacc/cuSTL/container/view/View.hpp b/include/pmacc/cuSTL/container/view/View.hpp
index 79bfb7d308..5909e6f4de 100644
--- a/include/pmacc/cuSTL/container/view/View.hpp
+++ b/include/pmacc/cuSTL/container/view/View.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera
+/* Copyright 2013-2021 Heiko Burau, Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -23,53 +23,53 @@
 
 namespace pmacc
 {
-namespace container
-{
-
-/** Represents a clipped area of its inherited container.
- *
- * View are not designed to do hard data copies.
- * Views don't take care of reference counters. So if the corresponding
- * container dies, all views become invalid.
- * Usual way to contruct a view goes with container.view(...);
- * \tparam Buffer Corresponding container type
- */
-template<typename Buffer>
-struct View : public Buffer
-{
-    HDINLINE View() {}
-
-    template<typename TBuffer>
-    HDINLINE View(const View<TBuffer>& other)
+    namespace container
     {
-        *this = other;
-    }
-
-    HDINLINE ~View()
-    {
-        /* increment the reference counter because the container's destructor decrements it.
-         * We want to compensate this.
+        /** Represents a clipped area of its inherited container.
+         *
+         * Views are not designed to do hard data copies.
+         * Views don't take care of reference counters. So if the corresponding
+         * container dies, all views become invalid.
+         * The usual way to construct a view is container.view(...);
+         * \tparam Buffer Corresponding container type
          */
-        (*this->refCount)++;
-    }
+        template<typename Buffer>
+        struct View : public Buffer
+        {
+            HDINLINE View()
+            {
+            }
 
-    template<typename TBuffer>
-    HDINLINE View& operator=(const View<TBuffer>& other)
-    {
-        this->dataPointer = other.dataPointer;
-        this->_size = other._size;
-        this->pitch = other.pitch;
-        this->refCount = other.refCount;
+            template<typename TBuffer>
+            HDINLINE View(const View<TBuffer>& other)
+            {
+                *this = other;
+            }
+
+            HDINLINE ~View()
+            {
+                /* increment the reference counter because the container's destructor decrements it.
+                 * We want to compensate this.
+                 */
+                (*this->refCount)++;
+            }
+
+            template<typename TBuffer>
+            HDINLINE View& operator=(const View<TBuffer>& other)
+            {
+                this->dataPointer = other.dataPointer;
+                this->_size = other._size;
+                this->pitch = other.pitch;
+                this->refCount = other.refCount;
 
-        return *this;
-    }
+                return *this;
+            }
 
-private:
-    // forbid view = container
-    HDINLINE Buffer&
-    operator=(const Buffer& rhs);
-};
+        private:
+            // forbid view = container
+            HDINLINE Buffer& operator=(const Buffer& rhs);
+        };
 
 
-} // container
-} // pmacc
+    } // namespace container
+} // namespace pmacc
diff --git a/include/pmacc/cuSTL/cursor/BufferCursor.hpp b/include/pmacc/cuSTL/cursor/BufferCursor.hpp
index 65dff84f08..4d06c04586 100644
--- a/include/pmacc/cuSTL/cursor/BufferCursor.hpp
+++ b/include/pmacc/cuSTL/cursor/BufferCursor.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera
+/* Copyright 2013-2021 Heiko Burau, Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -31,55 +31,51 @@
 
 namespace pmacc
 {
-namespace cursor
-{
-
-/** The most common cursor typedef
- *
- * BufferCursor does access and jumping on a cartesian memory buffer.
- *
- * \tparam T_Type type of a single datum
- * \tparam T_dim dimension of the memory buffer
- */
-template<
-    typename T_Type,
-    int T_dim
->
-struct BufferCursor
- : public Cursor< PointerAccessor< T_Type >, BufferNavigator< T_dim >, T_Type * >
-{
-    /* \param pointer data pointer
-     * \param pitch pitch of the memory buffer
-     * pitch is a Size_t vector with one dimension less than dim
-     * pitch[0] is the distance in bytes to the incremented y-coordinate
-     * pitch[1] is the distance in bytes to the incremented z-coordiante
-     */
-    HDINLINE
-    BufferCursor( T_Type * pointer, math::Size_t< T_dim - 1 > pitch )
-     : Cursor< PointerAccessor< T_Type >, BufferNavigator< T_dim >, T_Type * >
-            ( PointerAccessor< T_Type >(), BufferNavigator< T_dim >( pitch ), pointer ) {}
+    namespace cursor
+    {
+        /** The most common cursor typedef
+         *
+         * BufferCursor does access and jumping on a cartesian memory buffer.
+         *
+         * \tparam T_Type type of a single datum
+         * \tparam T_dim dimension of the memory buffer
+         */
+        template<typename T_Type, int T_dim>
+        struct BufferCursor : public Cursor<PointerAccessor<T_Type>, BufferNavigator<T_dim>, T_Type*>
+        {
+            /* \param pointer data pointer
+             * \param pitch pitch of the memory buffer
+             * pitch is a Size_t vector with one dimension less than dim
+             * pitch[0] is the distance in bytes to the incremented y-coordinate
+             * pitch[1] is the distance in bytes to the incremented z-coordinate
+             */
+            HDINLINE
+            BufferCursor(T_Type* pointer, math::Size_t<T_dim - 1> pitch)
+                : Cursor<PointerAccessor<T_Type>, BufferNavigator<T_dim>, T_Type*>(
+                    PointerAccessor<T_Type>(),
+                    BufferNavigator<T_dim>(pitch),
+                    pointer)
+            {
+            }
 
-    HDINLINE
-    BufferCursor( const Cursor< PointerAccessor< T_Type >, BufferNavigator< T_dim >, T_Type * > & other )
-     : Cursor<PointerAccessor< T_Type >, BufferNavigator< T_dim >, T_Type * >( other ) {}
-};
+            HDINLINE
+            BufferCursor(const Cursor<PointerAccessor<T_Type>, BufferNavigator<T_dim>, T_Type*>& other)
+                : Cursor<PointerAccessor<T_Type>, BufferNavigator<T_dim>, T_Type*>(other)
+            {
+            }
+        };
 
-namespace traits
-{
+        namespace traits
+        {
+            /* type trait to get the BufferCursor's dimension if it has one */
+            template<typename T_Type, int T_dim>
+            struct dim<BufferCursor<T_Type, T_dim>>
+            {
+                static constexpr int value = pmacc::cursor::traits::dim<
+                    Cursor<PointerAccessor<T_Type>, BufferNavigator<T_dim>, T_Type*>>::value;
+            };
 
-/* type trait to get the BufferCursor's dimension if it has one */
-template<
-    typename T_Type,
-    int T_dim
->
-struct dim< BufferCursor< T_Type, T_dim > >
-{
-    static constexpr int value = pmacc::cursor::traits::dim<
-        Cursor< PointerAccessor< T_Type >, BufferNavigator< T_dim >, T_Type * > >::value;
-};
+        } // namespace traits
 
-} // namespace traits
-
-} // namespace cursor
+    } // namespace cursor
 } // namespace pmacc
-
diff --git a/include/pmacc/cuSTL/cursor/Cursor.hpp b/include/pmacc/cuSTL/cursor/Cursor.hpp
index 377689d590..8d69b755a1 100644
--- a/include/pmacc/cuSTL/cursor/Cursor.hpp
+++ b/include/pmacc/cuSTL/cursor/Cursor.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera
+/* Copyright 2013-2021 Heiko Burau, Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -32,145 +32,169 @@ namespace mpl = boost::mpl;
 
 namespace pmacc
 {
-namespace cursor
-{
-
-/** A cursor is used to access a single datum and to jump to another one.
- * It is always located at a certain datum. Think of a generalized iterator.
- * \tparam _Accessor Policy functor class that is called inside operator*().
- * It typically returns a reference to the current selected datum.
- * \tparam _Navigator Policy functor class that is called inside operator()().
- * It jumps to another datum.
- * \tparam _Marker Runtime data that is used by the accessor and the navigator.
- * This is typically a data pointer.
- */
-template<typename _Accessor, typename _Navigator, typename _Marker>
-class Cursor : private _Accessor, _Navigator
-{
-public:
-    typedef typename _Accessor::type type;
-    typedef typename boost::remove_reference<type>::type ValueType;
-    typedef _Accessor Accessor;
-    typedef _Navigator Navigator;
-    typedef _Marker Marker;
-    typedef Cursor<Accessor, Navigator, Marker> This;
-    typedef This result_type;
-protected:
-    Marker marker;
-public:
-    HDINLINE
-    Cursor(const Accessor& accessor,
-             const Navigator& navigator,
-             const Marker& marker)
-                : Accessor(accessor), Navigator(navigator), marker(marker) {}
-
-    /** access
-     * \return Accessor's return type.
-     * Typically a reference to the current selected single datum.
-     */
-    HDINLINE
-    type operator*()
-    {
-        return Accessor::operator()(this->marker);
-    }
-
-    /* This is a const method which is called for a const cursor object.
-     * A const cursor object does *not* mean that the data it points to
-     * is neccessarily constant too. This is why here the return type is
-     * the same as for the non-const method above.
-     */
-    HDINLINE
-    type operator*() const
-    {
-        return Accessor::operator()(this->marker);
-    }
-
-    /** jumping
-     * \param jump Specifies a jump relative to the current selected datum.
-     * This is usually a int vector but may be any type that navigator accepts.
-     * \return A new cursor, which has jumped according to the jump param.
-     */
-    template<typename Jump>
-    HDINLINE This operator()(const Jump& jump) const
-    {
-        Navigator newNavigator(getNavigator());
-        Marker newMarker = newNavigator(this->marker, jump);
-        return This(getAccessor(), newNavigator, newMarker);
-    }
-
-    /* convenient method which is available if the navigator accepts a Int<1> */
-    HDINLINE This operator()(int x) const
-    {
-        return (*this)(math::Int<1>(x));
-    }
-
-    /* convenient method which is available if the navigator accepts a Int<2> */
-    HDINLINE This operator()(int x, int y) const
-    {
-        return (*this)(math::Int<2u>(x, y));
-    }
-
-    /* convenient method which is available if the navigator accepts a Int<3> */
-    HDINLINE This operator()(int x, int y, int z) const
+    namespace cursor
     {
-        return (*this)(math::Int<3>(x, y, z));
-    }
-
-    /* convenient method which is available if the navigator implements operator++ */
-    HDINLINE void operator++() {Navigator::operator++;}
-    /* convenient method which is available if the navigator implements operator-- */
-    HDINLINE void operator--() {Navigator::operator--;}
-
-    /* jump and access in one call */
-    template<typename Jump>
-    HDINLINE
-    type operator[](const Jump& jump)
-    {
-        return *((*this)(jump));
-    }
-
-    template<typename Jump>
-    HDINLINE
-    type operator[](const Jump& jump) const
-    {
-        return *((*this)(jump));
-    }
-
-    /* This is a dirty workaround to enable and disable safe-cursor checks.*/
-    /** \todo: Can be substituted by ordinary functions instead of methods.*/
-    HDINLINE void enableChecking() {this->marker.enableChecking();}
-    HDINLINE void disableChecking() {this->marker.disableChecking();}
-
-    /* getters */
-    HDINLINE
-    const _Accessor& getAccessor() const {return *this;}
-    HDINLINE
-    const _Navigator& getNavigator() const {return *this;}
-    HDINLINE
-    const Marker& getMarker() const {return this->marker;}
-};
-
-/* convenient function to construct a cursor by passing its constructor arguments */
-template<typename Accessor, typename Navigator, typename Marker>
-HDINLINE Cursor<Accessor, Navigator, Marker> make_Cursor
-(const Accessor& accessor, const Navigator& navigator, const Marker& marker)
-{
-    return Cursor<Accessor, Navigator, Marker>(accessor, navigator, marker);
-}
-
-namespace traits
-{
-
-/* type trait to get the cursor's dimension if it has one */
-template<typename _Accessor, typename _Navigator, typename _Marker>
-struct dim< pmacc::cursor::Cursor<_Accessor, _Navigator, _Marker> >
-{
-    static constexpr int value = pmacc::cursor::traits::dim<typename Cursor<_Accessor, _Navigator, _Marker>::Navigator >::value;
-};
-
-} // traits
-
-} // cursor
-} // pmacc
-
-
+        /** A cursor is used to access a single datum and to jump to another one.
+         * It is always located at a certain datum. Think of a generalized iterator.
+         * \tparam _Accessor Policy functor class that is called inside operator*().
+         * It typically returns a reference to the current selected datum.
+         * \tparam _Navigator Policy functor class that is called inside operator()().
+         * It jumps to another datum.
+         * \tparam _Marker Runtime data that is used by the accessor and the navigator.
+         * This is typically a data pointer.
+         */
+        template<typename _Accessor, typename _Navigator, typename _Marker>
+        class Cursor
+            : private _Accessor
+            , _Navigator
+        {
+        public:
+            typedef typename _Accessor::type type;
+            typedef typename boost::remove_reference<type>::type ValueType;
+            typedef _Accessor Accessor;
+            typedef _Navigator Navigator;
+            typedef _Marker Marker;
+            typedef Cursor<Accessor, Navigator, Marker> This;
+            typedef This result_type;
+
+        protected:
+            Marker marker;
+
+        public:
+            HDINLINE
+            Cursor(const Accessor& accessor, const Navigator& navigator, const Marker& marker)
+                : Accessor(accessor)
+                , Navigator(navigator)
+                , marker(marker)
+            {
+            }
+
+            /** access
+             * \return Accessor's return type.
+             * Typically a reference to the current selected single datum.
+             */
+            HDINLINE
+            type operator*()
+            {
+                return Accessor::operator()(this->marker);
+            }
+
+            /* This is a const method which is called for a const cursor object.
+             * A const cursor object does *not* mean that the data it points to
+             * is necessarily constant too. This is why here the return type is
+             * the same as for the non-const method above.
+             */
+            HDINLINE
+            type operator*() const
+            {
+                return Accessor::operator()(this->marker);
+            }
+
+            /** jumping
+             * \param jump Specifies a jump relative to the current selected datum.
+             * This is usually an int vector but may be any type that navigator accepts.
+             * \return A new cursor, which has jumped according to the jump param.
+             */
+            template<typename Jump>
+            HDINLINE This operator()(const Jump& jump) const
+            {
+                Navigator newNavigator(getNavigator());
+                Marker newMarker = newNavigator(this->marker, jump);
+                return This(getAccessor(), newNavigator, newMarker);
+            }
+
+            /* convenient method which is available if the navigator accepts a Int<1> */
+            HDINLINE This operator()(int x) const
+            {
+                return (*this)(math::Int<1>(x));
+            }
+
+            /* convenient method which is available if the navigator accepts a Int<2> */
+            HDINLINE This operator()(int x, int y) const
+            {
+                return (*this)(math::Int<2u>(x, y));
+            }
+
+            /* convenient method which is available if the navigator accepts a Int<3> */
+            HDINLINE This operator()(int x, int y, int z) const
+            {
+                return (*this)(math::Int<3>(x, y, z));
+            }
+
+            /* prefix increment: invokes the navigator's operator++(); only usable if the navigator implements it */
+            HDINLINE void operator++()
+            {
+                Navigator::operator++();
+            }
+            /* prefix decrement: invokes the navigator's operator--(); only usable if the navigator implements it */
+            HDINLINE void operator--()
+            {
+                Navigator::operator--();
+            }
+
+            /* jump and access in one call */
+            template<typename Jump>
+            HDINLINE type operator[](const Jump& jump)
+            {
+                return *((*this)(jump));
+            }
+
+            template<typename Jump>
+            HDINLINE type operator[](const Jump& jump) const
+            {
+                return *((*this)(jump));
+            }
+
+            /* This is a dirty workaround to enable and disable safe-cursor checks.*/
+            /** \todo: Can be substituted by ordinary functions instead of methods.*/
+            HDINLINE void enableChecking()
+            {
+                this->marker.enableChecking();
+            }
+            HDINLINE void disableChecking()
+            {
+                this->marker.disableChecking();
+            }
+
+            /* getters */
+            HDINLINE
+            const _Accessor& getAccessor() const
+            {
+                return *this;
+            }
+            HDINLINE
+            const _Navigator& getNavigator() const
+            {
+                return *this;
+            }
+            HDINLINE
+            const Marker& getMarker() const
+            {
+                return this->marker;
+            }
+        };
+
+        /* convenient function to construct a cursor by passing its constructor arguments */
+        template<typename Accessor, typename Navigator, typename Marker>
+        HDINLINE Cursor<Accessor, Navigator, Marker> make_Cursor(
+            const Accessor& accessor,
+            const Navigator& navigator,
+            const Marker& marker)
+        {
+            return Cursor<Accessor, Navigator, Marker>(accessor, navigator, marker);
+        }
+
+        namespace traits
+        {
+            /* type trait to get the cursor's dimension if it has one */
+            template<typename _Accessor, typename _Navigator, typename _Marker>
+            struct dim<pmacc::cursor::Cursor<_Accessor, _Navigator, _Marker>>
+            {
+                static constexpr int value
+                    = pmacc::cursor::traits::dim<typename Cursor<_Accessor, _Navigator, _Marker>::Navigator>::value;
+            };
+
+        } // namespace traits
+
+    } // namespace cursor
+} // namespace pmacc
diff --git a/include/pmacc/cuSTL/cursor/FunctorCursor.hpp b/include/pmacc/cuSTL/cursor/FunctorCursor.hpp
index b4575a91b7..f5d8d2fb76 100644
--- a/include/pmacc/cuSTL/cursor/FunctorCursor.hpp
+++ b/include/pmacc/cuSTL/cursor/FunctorCursor.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera
+/* Copyright 2013-2021 Heiko Burau, Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -28,33 +28,28 @@
 
 namespace pmacc
 {
-namespace cursor
-{
-
-/** wraps a cursor into a new cursor
- *
- * On each access of the new cursor the result of the nested cursor access
- * is filtered through a user-defined functor.
- *
- * \param cursor Cursor to be wrapped
- * \param functor User functor acting as a filter.
- */
-template<typename TCursor, typename Functor>
-HDINLINE
-Cursor<FunctorAccessor<Functor,
-    typename boost::remove_reference<typename TCursor::type>::type>,
-    CursorNavigator, TCursor> make_FunctorCursor(const TCursor& cursor, const Functor& functor)
-{
-    return make_Cursor(
-        FunctorAccessor<
-            Functor,
-            typename TCursor::ValueType
-        >(functor),
-        CursorNavigator(),
-        cursor
-    );
-}
-
-} // cursor
-} // pmacc
+    namespace cursor
+    {
+        /** wraps a cursor into a new cursor
+         *
+         * On each access of the new cursor the result of the nested cursor access
+         * is filtered through a user-defined functor.
+         *
+         * \param cursor Cursor to be wrapped
+         * \param functor User functor acting as a filter.
+         */
+        template<typename TCursor, typename Functor>
+        HDINLINE Cursor<
+            FunctorAccessor<Functor, typename boost::remove_reference<typename TCursor::type>::type>,
+            CursorNavigator,
+            TCursor>
+        make_FunctorCursor(const TCursor& cursor, const Functor& functor)
+        {
+            return make_Cursor(
+                FunctorAccessor<Functor, typename TCursor::ValueType>(functor),
+                CursorNavigator(),
+                cursor);
+        }
 
+    } // namespace cursor
+} // namespace pmacc
diff --git a/include/pmacc/cuSTL/cursor/MultiIndexCursor.hpp b/include/pmacc/cuSTL/cursor/MultiIndexCursor.hpp
index f5fbd2914f..9af083c0b4 100644
--- a/include/pmacc/cuSTL/cursor/MultiIndexCursor.hpp
+++ b/include/pmacc/cuSTL/cursor/MultiIndexCursor.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera
+/* Copyright 2013-2021 Heiko Burau, Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -28,26 +28,20 @@
 
 namespace pmacc
 {
-namespace cursor
-{
-
-/** construct a cursor where accessing means getting the current position
- * in terms of an 2D, 3D, ... index. Usefull to obtain for example the current cell index.
- * \tparam dim Dimension of the index (say: int-vector)
- * \param idx Initial index value
- * \return cursor with the behavior mentioned above
- */
-template<int dim>
-HDINLINE
-cursor::Cursor<cursor::MarkerAccessor<math::Int<dim> >, MultiIndexNavigator<dim>,
-               math::Int<dim> >
-               make_MultiIndexCursor(const math::Int<dim>& idx = math::Int<dim>::create(0))
-{
-    return make_Cursor(cursor::MarkerAccessor<math::Int<dim> >(),
-                       MultiIndexNavigator<dim>(),
-                       idx);
-}
-
-} // cursor
-} // pmacc
+    namespace cursor
+    {
+        /** construct a cursor where accessing means getting the current position
+         * in terms of a 2D, 3D, ... index. Useful to obtain, for example, the current cell index.
+         * \tparam dim Dimension of the index (say: int-vector)
+         * \param idx Initial index value
+         * \return cursor with the behavior mentioned above
+         */
+        template<int dim>
+        HDINLINE cursor::Cursor<cursor::MarkerAccessor<math::Int<dim>>, MultiIndexNavigator<dim>, math::Int<dim>>
+        make_MultiIndexCursor(const math::Int<dim>& idx = math::Int<dim>::create(0))
+        {
+            return make_Cursor(cursor::MarkerAccessor<math::Int<dim>>(), MultiIndexNavigator<dim>(), idx);
+        }
 
+    } // namespace cursor
+} // namespace pmacc
diff --git a/include/pmacc/cuSTL/cursor/NestedCursor.hpp b/include/pmacc/cuSTL/cursor/NestedCursor.hpp
index 9c6e1e69de..971f29419b 100644
--- a/include/pmacc/cuSTL/cursor/NestedCursor.hpp
+++ b/include/pmacc/cuSTL/cursor/NestedCursor.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera
+/* Copyright 2013-2021 Heiko Burau, Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -27,21 +27,21 @@
 
 namespace pmacc
 {
-namespace cursor
-{
-
-/** wraps a cursor into a new cursor in a way that accessing on the new cursor
- * means getting the nested cursor and jumping means jumping on the nested cursor.
- * \param cursor Cursor to be wrapped
- * \return A new cursor which wraps the given cursor
- */
-template<typename TCursor>
-HDINLINE
-Cursor<MarkerAccessor<TCursor>, CursorNavigator, TCursor> make_NestedCursor(const TCursor& cursor)
-{
-    return Cursor<MarkerAccessor<TCursor>, CursorNavigator, TCursor>(MarkerAccessor<TCursor>(), CursorNavigator(), cursor);
-}
-
-} // cursor
-} // pmacc
+    namespace cursor
+    {
+        /** wraps a cursor into a new cursor in a way that accessing on the new cursor
+         * means getting the nested cursor and jumping means jumping on the nested cursor.
+         * \param cursor Cursor to be wrapped
+         * \return A new cursor which wraps the given cursor
+         */
+        template<typename TCursor>
+        HDINLINE Cursor<MarkerAccessor<TCursor>, CursorNavigator, TCursor> make_NestedCursor(const TCursor& cursor)
+        {
+            return Cursor<MarkerAccessor<TCursor>, CursorNavigator, TCursor>(
+                MarkerAccessor<TCursor>(),
+                CursorNavigator(),
+                cursor);
+        }
 
+    } // namespace cursor
+} // namespace pmacc
diff --git a/include/pmacc/cuSTL/cursor/SafeCursor.hpp b/include/pmacc/cuSTL/cursor/SafeCursor.hpp
index f510e20485..a52c8c28ee 100644
--- a/include/pmacc/cuSTL/cursor/SafeCursor.hpp
+++ b/include/pmacc/cuSTL/cursor/SafeCursor.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera
+/* Copyright 2013-2021 Heiko Burau, Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -27,138 +27,153 @@
 
 namespace pmacc
 {
-namespace cursor
-{
-
-/** A SafeCursor is like a cursor, except that it checks its validity before each access.
- */
-template<typename Cursor>
-class SafeCursor : public Cursor
-{
-public:
-    static constexpr int dim = pmacc::cursor::traits::dim<Cursor>::value;
-private:
-    /* \todo: Use a zone instead of lowerExtent and UpperExtent */
-    const math::Int<dim> lowerExtent;
-    const math::Int<dim> upperExtent;
-    math::Int<dim> offset;
-    bool enabled;
-public:
-    /**
-     * \param cursor Base cursor
-     * \param lowerExtent Top left corner of valid range, inside the range.
-     * \param upperExtent Bottom right corner of valid range, inside the range.
-     */
-    HDINLINE SafeCursor(const Cursor& cursor,
-                        const math::Int<dim>& lowerExtent,
-                        const math::Int<dim>& upperExtent)
-        : Cursor(cursor),
-          lowerExtent(lowerExtent),
-          upperExtent(upperExtent),
-          offset(math::Int<dim>(0)),
-          enabled(true)
-    {}
-
-    HDINLINE void enableChecking() {this->enabled = true;}
-    HDINLINE void disableChecking() {this->enabled = false;}
-
-    HDINLINE
-    typename Cursor::type operator*()
-    {
-        checkValidity();
-        return Cursor::operator*();
-    }
-
-    HDINLINE
-    typename boost::add_const<typename Cursor::type>::type operator*() const
-    {
-        checkValidity();
-        return Cursor::operator*();
-    }
-
-    template<typename Jump>
-    HDINLINE
-    SafeCursor<Cursor> operator()(const Jump& jump) const
+    namespace cursor
     {
-        SafeCursor<Cursor> result(Cursor::operator()(jump),
-                                  this->lowerExtent,
-                                  this->upperExtent);
-        result.offset = this->offset + jump;
-        result.enabled = this->enabled;
-        return result;
-    }
-
-    HDINLINE
-    SafeCursor<Cursor> operator()(int x) const
-    {
-        return (*this)(math::Int<1>(x));
-    }
-
-    HDINLINE
-    SafeCursor<Cursor> operator()(int x, int y) const
-    {
-        return (*this)(math::Int<2>(x, y));
-    }
-
-    HDINLINE
-    SafeCursor<Cursor> operator()(int x, int y, int z) const
-    {
-        return (*this)(math::Int<3>(x, y, z));
-    }
-
-    HDINLINE void operator++() {this->jump[0]++; Cursor::operator++;}
-    HDINLINE void operator--() {this->jump[0]--; Cursor::operator--;}
-
-    template<typename Jump>
-    HDINLINE
-    typename Cursor::type operator[](const Jump& jump)
-    {
-        return *((*this)(jump));
-    }
-
-    template<typename Jump>
-    HDINLINE
-    typename Cursor::type operator[](const Jump& jump) const
-    {
-        return *((*this)(jump));
-    }
-private:
-    HDINLINE void checkValidity() const
-    {
-        if(!this->enabled) return;
-        #pragma unroll
-        for(int i = 0; i < dim; i++)
+        /** A SafeCursor is like a cursor, except that it checks its validity before each access.
+         */
+        template<typename Cursor>
+        class SafeCursor : public Cursor
         {
-            if(this->offset[i] < this->lowerExtent[i] ||
-               this->offset[i] > this->upperExtent[i])
-                printf("error[cursor]: index %d out of range: %d is not within [%d, %d]\n",
-                    i, this->offset[i], this->lowerExtent[i], this->upperExtent[i]);
+        public:
+            static constexpr int dim = pmacc::cursor::traits::dim<Cursor>::value;
+
+        private:
+            /* \todo: Use a zone instead of lowerExtent and UpperExtent */
+            const math::Int<dim> lowerExtent;
+            const math::Int<dim> upperExtent;
+            math::Int<dim> offset;
+            bool enabled;
+
+        public:
+            /**
+             * \param cursor Base cursor
+             * \param lowerExtent Top left corner of valid range, inside the range.
+             * \param upperExtent Bottom right corner of valid range, inside the range.
+             */
+            HDINLINE SafeCursor(
+                const Cursor& cursor,
+                const math::Int<dim>& lowerExtent,
+                const math::Int<dim>& upperExtent)
+                : Cursor(cursor)
+                , lowerExtent(lowerExtent)
+                , upperExtent(upperExtent)
+                , offset(math::Int<dim>(0))
+                , enabled(true)
+            {
+            }
+
+            HDINLINE void enableChecking()
+            {
+                this->enabled = true;
+            }
+            HDINLINE void disableChecking()
+            {
+                this->enabled = false;
+            }
+
+            HDINLINE
+            typename Cursor::type operator*()
+            {
+                checkValidity();
+                return Cursor::operator*();
+            }
+
+            HDINLINE
+            typename boost::add_const<typename Cursor::type>::type operator*() const
+            {
+                checkValidity();
+                return Cursor::operator*();
+            }
+
+            template<typename Jump>
+            HDINLINE SafeCursor<Cursor> operator()(const Jump& jump) const
+            {
+                SafeCursor<Cursor> result(Cursor::operator()(jump), this->lowerExtent, this->upperExtent);
+                result.offset = this->offset + jump;
+                result.enabled = this->enabled;
+                return result;
+            }
+
+            HDINLINE
+            SafeCursor<Cursor> operator()(int x) const
+            {
+                return (*this)(math::Int<1>(x));
+            }
+
+            HDINLINE
+            SafeCursor<Cursor> operator()(int x, int y) const
+            {
+                return (*this)(math::Int<2>(x, y));
+            }
+
+            HDINLINE
+            SafeCursor<Cursor> operator()(int x, int y, int z) const
+            {
+                return (*this)(math::Int<3>(x, y, z));
+            }
+
+            HDINLINE void operator++()
+            {
+                this->offset[0]++; // keep the offset inspected by checkValidity() in sync with the step
+                Cursor::operator++(); // invoke (not merely name) the base cursor's increment
+            }
+            HDINLINE void operator--()
+            {
+                this->offset[0]--; // keep the offset inspected by checkValidity() in sync with the step
+                Cursor::operator--(); // invoke (not merely name) the base cursor's decrement
+            }
+
+            template<typename Jump>
+            HDINLINE typename Cursor::type operator[](const Jump& jump)
+            {
+                return *((*this)(jump));
+            }
+
+            template<typename Jump>
+            HDINLINE typename Cursor::type operator[](const Jump& jump) const
+            {
+                return *((*this)(jump));
+            }
+
+        private:
+            HDINLINE void checkValidity() const
+            {
+                if(!this->enabled)
+                    return;
+#pragma unroll
+                for(int i = 0; i < dim; i++)
+                {
+                    if(this->offset[i] < this->lowerExtent[i] || this->offset[i] > this->upperExtent[i])
+                        printf(
+                            "error[cursor]: index %d out of range: %d is not within [%d, %d]\n",
+                            i,
+                            this->offset[i],
+                            this->lowerExtent[i],
+                            this->upperExtent[i]);
+                }
+            }
+        };
+
+        namespace traits
+        {
+            /* type trait to get the safe-cursor's dimension if it has one */
+            template<typename Cursor>
+            struct dim<SafeCursor<Cursor>>
+            {
+                static constexpr int value = SafeCursor<Cursor>::dim;
+            };
+
+        } // namespace traits
+
+        /* convenient function to construct a safe-cursor by passing its constructor arguments */
+        template<typename Cursor>
+        HDINLINE SafeCursor<Cursor> make_SafeCursor(
+            const Cursor& cursor,
+            const math::Int<traits::dim<SafeCursor<Cursor>>::value>& lowerExtent,
+            const math::Int<traits::dim<SafeCursor<Cursor>>::value>& upperExtent)
+        {
+            return SafeCursor<Cursor>(cursor, lowerExtent, upperExtent);
         }
-    }
-};
-
-namespace traits
-{
-
-/* type trait to get the safe-cursor's dimension if it has one */
-template<typename Cursor>
-struct dim<SafeCursor<Cursor> >
-{
-    static constexpr int value = SafeCursor<Cursor>::dim;
-};
-
-} // traits
-
-/* convenient function to construct a safe-cursor by passing its constructor arguments */
-template<typename Cursor>
-HDINLINE SafeCursor<Cursor> make_SafeCursor(
-    const Cursor& cursor,
-    const math::Int<traits::dim<SafeCursor<Cursor> >::value>& lowerExtent,
-    const math::Int<traits::dim<SafeCursor<Cursor> >::value>& upperExtent)
-{
-    return SafeCursor<Cursor>(cursor, lowerExtent, upperExtent);
-}
-
-} // cursor
-} // pmacc
 
+    } // namespace cursor
+} // namespace pmacc
diff --git a/include/pmacc/cuSTL/cursor/accessor/CursorAccessor.hpp b/include/pmacc/cuSTL/cursor/accessor/CursorAccessor.hpp
index 5075b464ed..b4a1d95102 100644
--- a/include/pmacc/cuSTL/cursor/accessor/CursorAccessor.hpp
+++ b/include/pmacc/cuSTL/cursor/accessor/CursorAccessor.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera
+/* Copyright 2013-2021 Heiko Burau, Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -25,22 +25,20 @@
 
 namespace pmacc
 {
-namespace cursor
-{
-
-template<typename TCursor>
-struct CursorAccessor
-{
-    typedef typename TCursor::type type;
-
-    HDINLINE type operator()(TCursor& cursor)
+    namespace cursor
     {
-        return *cursor;
-    }
+        template<typename TCursor>
+        struct CursorAccessor
+        {
+            typedef typename TCursor::type type;
 
-    ///\todo: implement const method here with a const TCursor& argument and 'type' as return type.
-};
+            HDINLINE type operator()(TCursor& cursor)
+            {
+                return *cursor;
+            }
 
-} // cursor
-} // pmacc
+            ///\todo: implement const method here with a const TCursor& argument and 'type' as return type.
+        };
 
+    } // namespace cursor
+} // namespace pmacc
diff --git a/include/pmacc/cuSTL/cursor/accessor/FunctorAccessor.hpp b/include/pmacc/cuSTL/cursor/accessor/FunctorAccessor.hpp
index 5a464d95d1..8925b27346 100644
--- a/include/pmacc/cuSTL/cursor/accessor/FunctorAccessor.hpp
+++ b/include/pmacc/cuSTL/cursor/accessor/FunctorAccessor.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera
+/* Copyright 2013-2021 Heiko Burau, Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -26,27 +26,27 @@
 
 namespace pmacc
 {
-namespace cursor
-{
-
-template<typename _Functor, typename ArgType>
-struct FunctorAccessor
-{
-    _Functor functor;
-
-    typedef typename ::pmacc::result_of::Functor<_Functor, ArgType>::type type;
+    namespace cursor
+    {
+        template<typename _Functor, typename ArgType>
+        struct FunctorAccessor
+        {
+            _Functor functor;
 
-    HDINLINE FunctorAccessor(const _Functor& functor) : functor(functor) {}
+            typedef typename ::pmacc::result_of::Functor<_Functor, ArgType>::type type;
 
-    template<typename TCursor>
-    HDINLINE type operator()(TCursor& cursor)
-    {
-        return this->functor(*cursor);
-    }
+            HDINLINE FunctorAccessor(const _Functor& functor) : functor(functor)
+            {
+            }
 
-    ///\todo: implement const method here with a const TCursor& argument and 'type' as return type.
-};
+            template<typename TCursor>
+            HDINLINE type operator()(TCursor& cursor)
+            {
+                return this->functor(*cursor);
+            }
 
-} // cursor
-} // pmacc
+            ///\todo: implement const method here with a const TCursor& argument and 'type' as return type.
+        };
 
+    } // namespace cursor
+} // namespace pmacc
diff --git a/include/pmacc/cuSTL/cursor/accessor/LinearInterpAccessor.hpp b/include/pmacc/cuSTL/cursor/accessor/LinearInterpAccessor.hpp
index 20953c2826..13868e9543 100644
--- a/include/pmacc/cuSTL/cursor/accessor/LinearInterpAccessor.hpp
+++ b/include/pmacc/cuSTL/cursor/accessor/LinearInterpAccessor.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2015-2020 Heiko Burau
+/* Copyright 2015-2021 Heiko Burau
  *
  * This file is part of PMacc.
  *
@@ -29,144 +29,152 @@
 
 namespace pmacc
 {
-namespace cursor
-{
+    namespace cursor
+    {
+        /** Performs a 1D, 2D or 3D, linear interpolation on access.
+         *
+         * \tparam T_Cursor input data
+         */
+        template<typename T_Cursor, int dim = cursor::traits::dim<T_Cursor>::value>
+        struct LinearInterpAccessor;
+
+        template<typename T_Cursor>
+        struct LinearInterpAccessor<T_Cursor, DIM1>
+        {
+            typedef T_Cursor Cursor;
+            typedef typename Cursor::ValueType type;
 
-/** Performs a 1D, 2D or 3D, linear interpolation on access.
- *
- * \tparam T_Cursor input data
- */
-template<typename T_Cursor, int dim = cursor::traits::dim<T_Cursor>::value>
-struct LinearInterpAccessor;
+            Cursor cursor;
 
-template<typename T_Cursor>
-struct LinearInterpAccessor<T_Cursor, DIM1>
-{
-    typedef T_Cursor Cursor;
-    typedef typename Cursor::ValueType type;
+            /**
+             * @param cursor input data
+             */
+            HDINLINE LinearInterpAccessor(const Cursor& cursor) : cursor(cursor)
+            {
+            }
 
-    Cursor cursor;
+            template<typename T_Position>
+            HDINLINE type operator()(const T_Position pos) const
+            {
+                BOOST_STATIC_ASSERT(T_Position::dim == DIM1);
 
-    /**
-     * @param cursor input data
-     */
-    HDINLINE LinearInterpAccessor(const Cursor& cursor) : cursor(cursor) {}
+                T_Position intPart;
+                T_Position fracPart;
 
-    template<typename T_Position>
-    HDINLINE type operator()(const T_Position pos) const
-    {
-        BOOST_STATIC_ASSERT(T_Position::dim == DIM1);
+                fracPart[0] = pmacc::math::modf(pos[0], &(intPart[0]));
 
-        T_Position intPart;
-        T_Position fracPart;
+                const math::Int<DIM1> idx1D(static_cast<int>(intPart[0]));
 
-        fracPart[0] = pmacc::algorithms::math::modf(pos[0], &(intPart[0]));
+                type result = pmacc::traits::GetInitializedInstance<type>()(0.0);
+                typedef typename T_Position::type PositionComp;
+                for(int i = 0; i < 2; i++)
+                {
+                    const PositionComp weighting1D = (i == 0 ? (PositionComp(1.0) - fracPart[0]) : fracPart[0]);
+                    result += static_cast<type>(weighting1D * this->cursor[idx1D + math::Int<DIM1>(i)]);
+                }
 
-        const math::Int<DIM1> idx1D(static_cast<int>(intPart[0]));
+                return result;
+            }
+        };
 
-        type result = pmacc::traits::GetInitializedInstance<type>()(0.0);
-        typedef typename T_Position::type PositionComp;
-        for(int i = 0; i < 2; i++)
+        template<typename T_Cursor>
+        struct LinearInterpAccessor<T_Cursor, DIM2>
         {
-            const PositionComp weighting1D = (i == 0 ? (PositionComp(1.0) - fracPart[0]) : fracPart[0]);
-            result += static_cast<type>(weighting1D * this->cursor[idx1D + math::Int<DIM1>(i)]);
-        }
+            typedef T_Cursor Cursor;
+            typedef typename T_Cursor::ValueType type;
 
-        return result;
-    }
-};
+            Cursor cursor;
 
-template<typename T_Cursor>
-struct LinearInterpAccessor<T_Cursor, DIM2>
-{
-    typedef T_Cursor Cursor;
-    typedef typename T_Cursor::ValueType type;
-
-    Cursor cursor;
+            /**
+             * @param cursor input data
+             */
+            HDINLINE LinearInterpAccessor(const Cursor& cursor) : cursor(cursor)
+            {
+            }
 
-    /**
-     * @param cursor input data
-     */
-    HDINLINE LinearInterpAccessor(const Cursor& cursor) : cursor(cursor) {}
+            template<typename T_Position>
+            HDINLINE type operator()(const T_Position pos) const
+            {
+                BOOST_STATIC_ASSERT(T_Position::dim == DIM2);
 
-    template<typename T_Position>
-    HDINLINE type operator()(const T_Position pos) const
-    {
-        BOOST_STATIC_ASSERT(T_Position::dim == DIM2);
+                T_Position intPart;
+                T_Position fracPart;
 
-        T_Position intPart;
-        T_Position fracPart;
+                fracPart[0] = pmacc::math::modf(pos[0], &(intPart[0]));
+                fracPart[1] = pmacc::math::modf(pos[1], &(intPart[1]));
 
-        fracPart[0] = pmacc::algorithms::math::modf(pos[0], &(intPart[0]));
-        fracPart[1] = pmacc::algorithms::math::modf(pos[1], &(intPart[1]));
+                const math::Int<DIM2> idx2D(static_cast<int>(intPart[0]), static_cast<int>(intPart[1]));
 
-        const math::Int<DIM2> idx2D(static_cast<int>(intPart[0]),
-                                    static_cast<int>(intPart[1]));
+                type result = pmacc::traits::GetInitializedInstance<type>()(0.0);
+                typedef typename T_Position::type PositionComp;
+                for(int i = 0; i < 2; i++)
+                {
+                    const PositionComp weighting1D = (i == 0 ? (PositionComp(1.0) - fracPart[0]) : fracPart[0]);
+                    for(int j = 0; j < 2; j++)
+                    {
+                        const PositionComp weighting2D
+                            = weighting1D * (j == 0 ? (PositionComp(1.0) - fracPart[1]) : fracPart[1]);
+                        result += static_cast<type>(weighting2D * this->cursor[idx2D + math::Int<DIM2>(i, j)]);
+                    }
+                }
 
-        type result = pmacc::traits::GetInitializedInstance<type>()(0.0);
-        typedef typename T_Position::type PositionComp;
-        for(int i = 0; i < 2; i++)
-        {
-            const PositionComp weighting1D = (i == 0 ? (PositionComp(1.0) - fracPart[0]) : fracPart[0]);
-            for(int j = 0; j < 2; j++)
-            {
-                const PositionComp weighting2D = weighting1D * (j == 0 ? (PositionComp(1.0) - fracPart[1]) : fracPart[1]);
-                result += static_cast<type>(weighting2D * this->cursor[idx2D + math::Int<DIM2>(i, j)]);
+                return result;
             }
-        }
-
-        return result;
-    }
-};
+        };
 
-template<typename T_Cursor>
-struct LinearInterpAccessor<T_Cursor, DIM3>
-{
-    typedef T_Cursor Cursor;
-    typedef typename T_Cursor::ValueType type;
+        template<typename T_Cursor>
+        struct LinearInterpAccessor<T_Cursor, DIM3>
+        {
+            typedef T_Cursor Cursor;
+            typedef typename T_Cursor::ValueType type;
 
-    Cursor cursor;
+            Cursor cursor;
 
-    /**
-     * @param cursor input data
-     */
-    HDINLINE LinearInterpAccessor(const Cursor& cursor) : cursor(cursor) {}
+            /**
+             * @param cursor input data
+             */
+            HDINLINE LinearInterpAccessor(const Cursor& cursor) : cursor(cursor)
+            {
+            }
 
-    template<typename T_Position>
-    HDINLINE type operator()(const T_Position pos) const
-    {
-        BOOST_STATIC_ASSERT(T_Position::dim == DIM3);
+            template<typename T_Position>
+            HDINLINE type operator()(const T_Position pos) const
+            {
+                BOOST_STATIC_ASSERT(T_Position::dim == DIM3);
 
-        T_Position intPart;
-        T_Position fracPart;
+                T_Position intPart;
+                T_Position fracPart;
 
-        fracPart[0] = pmacc::algorithms::math::modf(pos[0], &(intPart[0]));
-        fracPart[1] = pmacc::algorithms::math::modf(pos[1], &(intPart[1]));
-        fracPart[2] = pmacc::algorithms::math::modf(pos[2], &(intPart[2]));
+                fracPart[0] = pmacc::math::modf(pos[0], &(intPart[0]));
+                fracPart[1] = pmacc::math::modf(pos[1], &(intPart[1]));
+                fracPart[2] = pmacc::math::modf(pos[2], &(intPart[2]));
 
-        const math::Int<DIM3> idx3D(static_cast<int>(intPart[0]),
-                                    static_cast<int>(intPart[1]),
-                                    static_cast<int>(intPart[2]));
+                const math::Int<DIM3> idx3D(
+                    static_cast<int>(intPart[0]),
+                    static_cast<int>(intPart[1]),
+                    static_cast<int>(intPart[2]));
 
-        type result = pmacc::traits::GetInitializedInstance<type>()(0.0);
-        typedef typename T_Position::type PositionComp;
-        for(int i = 0; i < 2; i++)
-        {
-            const PositionComp weighting1D = (i == 0 ? (PositionComp(1.0) - fracPart[0]) : fracPart[0]);
-            for(int j = 0; j < 2; j++)
-            {
-                const PositionComp weighting2D = weighting1D * (j == 0 ? (PositionComp(1.0) - fracPart[1]) : fracPart[1]);
-                for(int k = 0; k < 2; k++)
+                type result = pmacc::traits::GetInitializedInstance<type>()(0.0);
+                typedef typename T_Position::type PositionComp;
+                for(int i = 0; i < 2; i++)
                 {
-                    const PositionComp weighting3D = weighting2D * (k == 0 ? (PositionComp(1.0) - fracPart[2]) : fracPart[2]);
-                    result += static_cast<type>(weighting3D * this->cursor[idx3D + math::Int<DIM3>(i, j, k)]);
+                    const PositionComp weighting1D = (i == 0 ? (PositionComp(1.0) - fracPart[0]) : fracPart[0]);
+                    for(int j = 0; j < 2; j++)
+                    {
+                        const PositionComp weighting2D
+                            = weighting1D * (j == 0 ? (PositionComp(1.0) - fracPart[1]) : fracPart[1]);
+                        for(int k = 0; k < 2; k++)
+                        {
+                            const PositionComp weighting3D
+                                = weighting2D * (k == 0 ? (PositionComp(1.0) - fracPart[2]) : fracPart[2]);
+                            result += static_cast<type>(weighting3D * this->cursor[idx3D + math::Int<DIM3>(i, j, k)]);
+                        }
+                    }
                 }
-            }
-        }
 
-        return result;
-    }
-};
+                return result;
+            }
+        };
 
-} // namespace cursor
+    } // namespace cursor
 } // namespace pmacc
diff --git a/include/pmacc/cuSTL/cursor/accessor/MarkerAccessor.hpp b/include/pmacc/cuSTL/cursor/accessor/MarkerAccessor.hpp
index f70b95fe1c..06d497f80e 100644
--- a/include/pmacc/cuSTL/cursor/accessor/MarkerAccessor.hpp
+++ b/include/pmacc/cuSTL/cursor/accessor/MarkerAccessor.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera
+/* Copyright 2013-2021 Heiko Burau, Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -23,27 +23,25 @@
 
 namespace pmacc
 {
-namespace cursor
-{
-
-template<typename Marker>
-struct MarkerAccessor
-{
-    typedef const Marker type;
-    /** returns the cursor's marker.
-     *
-     * Here a copy of marker is returned because the cursor object
-     * could be a temporary object. Therefore any reference or const-reference
-     * of marker is dangerous. If you want to have a reference to marker use e.g.
-     * FunctorAccessor or Cursor::getMarker().
-     */
-    HDINLINE
-    type operator()(const Marker& marker) const
+    namespace cursor
     {
-        return marker;
-    }
-};
-
-} // cursor
-} // pmacc
+        template<typename Marker>
+        struct MarkerAccessor
+        {
+            typedef const Marker type;
+            /** returns the cursor's marker.
+             *
+             * Here a copy of marker is returned because the cursor object
+             * could be a temporary object. Therefore any reference or const-reference
+             * of marker is dangerous. If you want to have a reference to marker use e.g.
+             * FunctorAccessor or Cursor::getMarker().
+             */
+            HDINLINE
+            type operator()(const Marker& marker) const
+            {
+                return marker;
+            }
+        };
 
+    } // namespace cursor
+} // namespace pmacc
diff --git a/include/pmacc/cuSTL/cursor/accessor/PointerAccessor.hpp b/include/pmacc/cuSTL/cursor/accessor/PointerAccessor.hpp
index aaa57cd0cb..f0b59b2cc3 100644
--- a/include/pmacc/cuSTL/cursor/accessor/PointerAccessor.hpp
+++ b/include/pmacc/cuSTL/cursor/accessor/PointerAccessor.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera
+/* Copyright 2013-2021 Heiko Burau, Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -23,28 +23,25 @@
 
 namespace pmacc
 {
-namespace cursor
-{
-
-template<typename Type>
-struct PointerAccessor
-{
-    typedef Type& type;
-
-    /** Returns the dereferenced pointer of type 'Type'
-     *
-     * Here a reference is returned because one expects a reference
-     * if an ordinary c++ pointer is dereferenced too.
-     * There is no danger if the cursor object is temporary.
-     */
-    template<typename Data>
-    HDINLINE
-    type operator()(Data& data) const
+    namespace cursor
     {
-        return *((Type*)data);
-    }
-};
+        template<typename Type>
+        struct PointerAccessor
+        {
+            typedef Type& type;
 
-} // cursor
-} // pmacc
+            /** Returns the dereferenced pointer of type 'Type'
+             *
+             * Here a reference is returned because one expects a reference
+             * if an ordinary c++ pointer is dereferenced too.
+             * There is no danger if the cursor object is temporary.
+             */
+            template<typename Data>
+            HDINLINE type operator()(Data& data) const
+            {
+                return *((Type*) data);
+            }
+        };
 
+    } // namespace cursor
+} // namespace pmacc
diff --git a/include/pmacc/cuSTL/cursor/accessor/TwistAxesAccessor.hpp b/include/pmacc/cuSTL/cursor/accessor/TwistAxesAccessor.hpp
index b28ee6f28c..674b2d48a0 100644
--- a/include/pmacc/cuSTL/cursor/accessor/TwistAxesAccessor.hpp
+++ b/include/pmacc/cuSTL/cursor/accessor/TwistAxesAccessor.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera
+/* Copyright 2013-2021 Heiko Burau, Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -26,27 +26,25 @@
 
 namespace pmacc
 {
-namespace cursor
-{
-
-template<typename TCursor, typename Axes>
-struct TwistAxesAccessor
-{
-    typedef typename math::result_of::TwistComponents<
-        Axes, typename TCursor::ValueType>::type type;
-
-    /** Returns a reference to the result of '*cursor' (with twisted axes).
-     *
-     * Be aware that the underlying cursor must not be a temporary object if '*cursor'
-     * refers to something inside the cursor.
-     */
-    HDINLINE type operator()(TCursor& cursor)
+    namespace cursor
     {
-        return math::twistComponents<Axes>(*cursor);
-    }
+        template<typename TCursor, typename Axes>
+        struct TwistAxesAccessor
+        {
+            typedef typename math::result_of::TwistComponents<Axes, typename TCursor::ValueType>::type type;
+
+            /** Returns a reference to the result of '*cursor' (with twisted axes).
+             *
+             * Be aware that the underlying cursor must not be a temporary object if '*cursor'
+             * refers to something inside the cursor.
+             */
+            HDINLINE type operator()(TCursor& cursor)
+            {
+                return math::twistComponents<Axes>(*cursor);
+            }
 
-    ///\todo: implement const method here with a const TCursor& argument and 'type' as return type.
-};
+            ///\todo: implement const method here with a const TCursor& argument and 'type' as return type.
+        };
 
-} // cursor
-} // pmacc
+    } // namespace cursor
+} // namespace pmacc
diff --git a/include/pmacc/cuSTL/cursor/compile-time/BufferCursor.hpp b/include/pmacc/cuSTL/cursor/compile-time/BufferCursor.hpp
index 8f02f29913..4983b56b22 100644
--- a/include/pmacc/cuSTL/cursor/compile-time/BufferCursor.hpp
+++ b/include/pmacc/cuSTL/cursor/compile-time/BufferCursor.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera
+/* Copyright 2013-2021 Heiko Burau, Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -28,39 +28,40 @@
 
 namespace pmacc
 {
-namespace cursor
-{
-namespace CT
-{
-
-/** Compile-time version of cursor::BufferCursor where pitch is a compile-time vector
- */
-template<typename Type, typename Pitch>
-struct BufferCursor : public Cursor<PointerAccessor<Type>,
-                                   CT::BufferNavigator<Pitch>, Type*>
-{
-    HDINLINE BufferCursor(Type* pointer)
-        : Cursor<PointerAccessor<Type>, CT::BufferNavigator<Pitch>, Type*>
-            (PointerAccessor<Type>(), CT::BufferNavigator<Pitch>(), pointer) {}
+    namespace cursor
+    {
+        namespace CT
+        {
+            /** Compile-time version of cursor::BufferCursor where pitch is a compile-time vector
+             */
+            template<typename Type, typename Pitch>
+            struct BufferCursor : public Cursor<PointerAccessor<Type>, CT::BufferNavigator<Pitch>, Type*>
+            {
+                HDINLINE BufferCursor(Type* pointer)
+                    : Cursor<PointerAccessor<Type>, CT::BufferNavigator<Pitch>, Type*>(
+                        PointerAccessor<Type>(),
+                        CT::BufferNavigator<Pitch>(),
+                        pointer)
+                {
+                }
 
-    HDINLINE BufferCursor(const Cursor<PointerAccessor<Type>,
-                                   CT::BufferNavigator<Pitch>, Type*>& cur)
-        : Cursor<PointerAccessor<Type>, CT::BufferNavigator<Pitch>, Type*>(cur) {}
-};
+                HDINLINE BufferCursor(const Cursor<PointerAccessor<Type>, CT::BufferNavigator<Pitch>, Type*>& cur)
+                    : Cursor<PointerAccessor<Type>, CT::BufferNavigator<Pitch>, Type*>(cur)
+                {
+                }
+            };
 
-} // CT
-
-namespace traits
-{
-
-template<typename Type, typename Pitch>
-struct dim<CT::BufferCursor<Type, Pitch> >
-{
-    const static int value = Pitch::dim + 1;
-};
+        } // namespace CT
 
-} // traits
+        namespace traits
+        {
+            template<typename Type, typename Pitch>
+            struct dim<CT::BufferCursor<Type, Pitch>>
+            {
+                static constexpr int value = Pitch::dim + 1; // cursor dimension: one more than the pitch vector's
+            };
 
-} // cursor
-} // pmacc
+        } // namespace traits
 
+    } // namespace cursor
+} // namespace pmacc
diff --git a/include/pmacc/cuSTL/cursor/compile-time/SafeCursor.hpp b/include/pmacc/cuSTL/cursor/compile-time/SafeCursor.hpp
index a01145f72d..2a88860c0d 100644
--- a/include/pmacc/cuSTL/cursor/compile-time/SafeCursor.hpp
+++ b/include/pmacc/cuSTL/cursor/compile-time/SafeCursor.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera
+/* Copyright 2013-2021 Heiko Burau, Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -26,105 +26,115 @@
 
 namespace pmacc
 {
-namespace cursor
-{
-namespace CT
-{
-
-/** Compile-time version of cursor::SafeCursor where LowerExtent and UpperExtent are
- * compile-time vectors.
- */
-template<typename Cursor, typename LowerExtent, typename UpperExtent>
-class SafeCursor : public Cursor
-{
-private:
-    typedef SafeCursor<Cursor, LowerExtent, UpperExtent> This;
-    static constexpr int dim = pmacc::cursor::traits::dim<Cursor>::value;
-    math::Int<dim> offset;
-public:
-    HDINLINE SafeCursor(const Cursor& cursor)
-        : Cursor(cursor), offset(math::Int<dim>(0))
-    {}
-
-    HDINLINE
-    typename Cursor::type operator*()
-    {
-        checkValidity();
-        return Cursor::operator*();
-    }
-
-    HDINLINE
-    typename boost::add_const<typename Cursor::type>::type operator*() const
-    {
-        checkValidity();
-        return Cursor::operator*();
-    }
-
-    template<typename Jump>
-    HDINLINE
-    This operator()(const Jump& jump) const
-    {
-        This result(Cursor::operator()(jump));
-        result.offset = this->offset + jump;
-        return result;
-    }
-
-    HDINLINE
-    This operator()(int x) const
-    {
-        return (*this)(math::Int<1>(x));
-    }
-
-    HDINLINE
-    This operator()(int x, int y) const
-    {
-        return (*this)(math::Int<2>(x, y));
-    }
-
-    HDINLINE
-    This operator()(int x, int y, int z) const
-    {
-        return (*this)(math::Int<3>(x, y, z));
-    }
-
-    HDINLINE void operator++() {this->jump[0]++; Cursor::operator++;}
-    HDINLINE void operator--() {this->jump[0]--; Cursor::operator--;}
-
-    template<typename Jump>
-    HDINLINE
-    typename Cursor::type operator[](const Jump& jump)
+    namespace cursor
     {
-        return *((*this)(jump));
-    }
-
-    template<typename Jump>
-    HDINLINE
-    typename Cursor::type operator[](const Jump& jump) const
-    {
-        return *((*this)(jump));
-    }
-private:
-    HDINLINE void checkValidity() const
-    {
-        #pragma unroll
-        for(int i = 0; i < dim; i++)
+        namespace CT
         {
-            if(this->offset[i] < LowerExtent().toRT()[i] ||
-               this->offset[i] > UpperExtent().toRT()[i])
-                printf("error[cursor]: index %d out of range: %d is not within [%d, %d]\n",
-                    i, this->offset[i], LowerExtent().toRT()[i], UpperExtent().toRT()[i]);
-        }
-    }
-};
-
-template<typename Cursor, typename LowerExtent, typename UpperExtent>
-HDINLINE SafeCursor<Cursor, LowerExtent, UpperExtent>
-make_SafeCursor(const Cursor& cursor, LowerExtent, UpperExtent)
-{
-    return SafeCursor<Cursor, LowerExtent, UpperExtent>(cursor);
-}
-
-} // CT
-} // cursor
-} // pmacc
-
+            /** Bounds-checking cursor wrapper; compile-time version of cursor::SafeCursor where LowerExtent and
+             * UpperExtent are compile-time vectors. Tracks the accumulated jump and reports out-of-range accesses.
+             */
+            template<typename Cursor, typename LowerExtent, typename UpperExtent>
+            class SafeCursor : public Cursor
+            {
+            private:
+                typedef SafeCursor<Cursor, LowerExtent, UpperExtent> This;
+                static constexpr int dim = pmacc::cursor::traits::dim<Cursor>::value;
+                math::Int<dim> offset; // accumulated jump relative to the cursor this object was built from
+
+            public:
+                HDINLINE SafeCursor(const Cursor& cursor) : Cursor(cursor), offset(math::Int<dim>(0))
+                {
+                }
+
+                HDINLINE
+                typename Cursor::type operator*()
+                {
+                    checkValidity(); // only reports a violation; the access below happens regardless
+                    return Cursor::operator*();
+                }
+
+                HDINLINE
+                typename boost::add_const<typename Cursor::type>::type operator*() const
+                {
+                    checkValidity(); // only reports a violation; the access below happens regardless
+                    return Cursor::operator*();
+                }
+
+                template<typename Jump>
+                HDINLINE This operator()(const Jump& jump) const
+                {
+                    This result(Cursor::operator()(jump)); // wrapping resets the offset to zero ...
+                    result.offset = this->offset + jump; // ... so carry the accumulated offset over explicitly
+                    return result;
+                }
+
+                HDINLINE
+                This operator()(int x) const
+                {
+                    return (*this)(math::Int<1>(x));
+                }
+
+                HDINLINE
+                This operator()(int x, int y) const
+                {
+                    return (*this)(math::Int<2>(x, y));
+                }
+
+                HDINLINE
+                This operator()(int x, int y, int z) const
+                {
+                    return (*this)(math::Int<3>(x, y, z));
+                }
+
+                HDINLINE void operator++()
+                {
+                    this->offset[0]++; // keep the tracked offset in sync with the step in dimension 0
+                    Cursor::operator++(); // invoke the base operator (naming it without `()` is not a call)
+                }
+                HDINLINE void operator--()
+                {
+                    this->offset[0]--; // keep the tracked offset in sync with the step in dimension 0
+                    Cursor::operator--(); // invoke the base operator (naming it without `()` is not a call)
+                }
+
+                template<typename Jump>
+                HDINLINE typename Cursor::type operator[](const Jump& jump)
+                {
+                    return *((*this)(jump)); // dereferences the jumped SafeCursor, so the check is applied
+                }
+
+                template<typename Jump>
+                HDINLINE typename Cursor::type operator[](const Jump& jump) const
+                {
+                    return *((*this)(jump)); // dereferences the jumped SafeCursor, so the check is applied
+                }
+
+            private:
+                HDINLINE void checkValidity() const
+                {
+#pragma unroll
+                    for(int i = 0; i < dim; i++)
+                    {
+                        if(this->offset[i] < LowerExtent().toRT()[i] || this->offset[i] > UpperExtent().toRT()[i])
+                            printf(
+                                "error[cursor]: index %d out of range: %d is not within [%d, %d]\n",
+                                i,
+                                this->offset[i],
+                                LowerExtent().toRT()[i],
+                                UpperExtent().toRT()[i]);
+                    }
+                }
+            };
+
+            template<typename Cursor, typename LowerExtent, typename UpperExtent>
+            HDINLINE SafeCursor<Cursor, LowerExtent, UpperExtent> make_SafeCursor(
+                const Cursor& cursor,
+                LowerExtent, // unnamed: only the type is used, to select the SafeCursor specialization
+                UpperExtent) // unnamed: only the type is used, to select the SafeCursor specialization
+            {
+                return SafeCursor<Cursor, LowerExtent, UpperExtent>(cursor);
+            }
+
+        } // namespace CT
+    } // namespace cursor
+} // namespace pmacc
diff --git a/include/pmacc/cuSTL/cursor/navigator/BufferNavigator.hpp b/include/pmacc/cuSTL/cursor/navigator/BufferNavigator.hpp
index 3446eeea56..ba4cb74de8 100644
--- a/include/pmacc/cuSTL/cursor/navigator/BufferNavigator.hpp
+++ b/include/pmacc/cuSTL/cursor/navigator/BufferNavigator.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Heiko Burau, Rene Widera
+/* Copyright 2013-2021 Axel Huebl, Heiko Burau, Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -32,68 +32,72 @@
 
 namespace pmacc
 {
-namespace cursor
-{
-
-template<int T_dim>
-class BufferNavigator
-{
-public:
-    typedef tag::BufferNavigator tag;
-    static constexpr int dim = T_dim;
-private:
-    math::Size_t<dim-1> pitch;
-public:
-    HDINLINE
-    BufferNavigator(math::Size_t<dim-1> pitch) : pitch(pitch) {}
-
-    template<typename Data>
-    HDINLINE Data
-    operator()(const Data& data, const math::Int<dim>& jump) const
-    {
-        char* result = (char*)data;
-        result += jump.x() * sizeof(typename boost::remove_pointer<Data>::type);
-        for(int i = 1; i < dim; i++)
-            result += jump[i] * this->pitch[i-1];
-        return (Data)result;
-    }
-
-    HDINLINE
-    const math::Size_t<dim-1>& getPitch() const {return pitch;}
-};
-
-template<>
-class BufferNavigator<1>
-{
-public:
-    typedef tag::BufferNavigator tag;
-    static constexpr int dim = 1;
-
-public:
-    HDINLINE
-    BufferNavigator(math::Size_t<dim-1>) {}
-
-    template<typename Data>
-    HDINLINE Data
-    operator()(const Data& data, const math::Int<dim>& jump) const
+    namespace cursor
     {
-        char* result = (char*)data;
-        result += jump.x() * sizeof(typename boost::remove_pointer<Data>::type);
-        return (Data)result;
-    }
-};
-
-namespace traits
-{
-
-template<int T_dim>
-struct dim<BufferNavigator<T_dim> >
-{
-    static constexpr int value = T_dim;
-};
-
-} // traits
-
-} //cursor
-} // pmacc
-
+        template<int T_dim>
+        class BufferNavigator
+        {
+        public:
+            typedef tag::BufferNavigator tag;
+            static constexpr int dim = T_dim;
+
+        private:
+            math::Size_t<dim - 1> pitch; // byte strides for dimensions 1..dim-1 (added to a char pointer below)
+
+        public:
+            HDINLINE
+            BufferNavigator(math::Size_t<dim - 1> pitch) : pitch(pitch)
+            {
+            }
+
+            template<typename Data>
+            HDINLINE Data operator()(const Data& data, const math::Int<dim>& jump) const
+            {
+                char* bytePtr = (char*) data; // byte-wise arithmetic: x steps by sizeof(element), others by pitch
+                bytePtr += jump.x() * sizeof(typename boost::remove_pointer<Data>::type);
+                for(int d = 1; d < dim; ++d)
+                    bytePtr += jump[d] * pitch[d - 1];
+                return (Data) bytePtr;
+            }
+
+            HDINLINE
+            const math::Size_t<dim - 1>& getPitch() const
+            {
+                return this->pitch;
+            }
+        };
+
+        template<>
+        class BufferNavigator<1>
+        {
+        public:
+            typedef tag::BufferNavigator tag;
+            static constexpr int dim = 1;
+
+        public:
+            HDINLINE
+            BufferNavigator(math::Size_t<dim - 1>) // the zero-length pitch is accepted and ignored
+            {
+            }
+
+            template<typename Data>
+            HDINLINE Data operator()(const Data& data, const math::Int<dim>& jump) const
+            {
+                char* bytePtr = (char*) data; // 1D: only the x step, scaled by the element size
+                bytePtr += jump.x() * sizeof(typename boost::remove_pointer<Data>::type);
+                return (Data) bytePtr;
+            }
+        };
+
+        namespace traits
+        {
+            template<int T_dim>
+            struct dim<BufferNavigator<T_dim>>
+            {
+                static constexpr int value = T_dim; // dimensionality is the navigator's template argument
+            };
+
+        } // namespace traits
+
+    } // namespace cursor
+} // namespace pmacc
diff --git a/include/pmacc/cuSTL/cursor/navigator/CartNavigator.hpp b/include/pmacc/cuSTL/cursor/navigator/CartNavigator.hpp
index 41f93ea8ce..560efdf124 100644
--- a/include/pmacc/cuSTL/cursor/navigator/CartNavigator.hpp
+++ b/include/pmacc/cuSTL/cursor/navigator/CartNavigator.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera
+/* Copyright 2013-2021 Heiko Burau, Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -29,47 +29,50 @@
 
 namespace pmacc
 {
-namespace cursor
-{
-
-template<int T_dim>
-class CartNavigator
-{
-public:
-    typedef tag::CartNavigator tag;
-    static constexpr int dim = T_dim;
-private:
-    math::Int<dim> factor;
-public:
-    HDINLINE
-    CartNavigator(math::Int<dim> factor) : factor(factor) {}
-
-    template<typename Data>
-    HDINLINE
-    Data operator()(const Data& data, const math::Int<dim>& jump) const
+    namespace cursor
     {
-        char* result = (char*)data;
-        result += algorithms::math::dot(
-            static_cast<typename math::Int<dim>::BaseType>(jump),
-            static_cast<typename math::Int<dim>::BaseType>(this->factor));
-        return (Data)result;
-    }
+        template<int T_dim>
+        class CartNavigator
+        {
+        public:
+            typedef tag::CartNavigator tag;
+            static constexpr int dim = T_dim;
 
-    HDINLINE
-    const math::Int<dim>& getFactor() const {return factor;}
-};
+        private:
+            math::Int<dim> factor;
 
-namespace traits
-{
+        public:
+            HDINLINE
+            CartNavigator(math::Int<dim> factor) : factor(factor)
+            { // `factor` is later dotted with the jump and the result added to a char pointer in operator()
+            }
 
-template<int T_dim>
-struct dim<CartNavigator<T_dim> >
-{
-    static constexpr int value = T_dim;
-};
+            template<typename Data>
+            HDINLINE Data operator()(const Data& data, const math::Int<dim>& jump) const
+            {
+                // Reduce both vectors to their common base type before forming the dot product.
+                using BaseVec = typename math::Int<dim>::BaseType;
+                char* bytePtr = (char*) data;
+                bytePtr += pmacc::math::dot(static_cast<BaseVec>(jump), static_cast<BaseVec>(this->factor));
+                return (Data) bytePtr;
+            }
+
+            HDINLINE
+            const math::Int<dim>& getFactor() const
+            {
+                return factor; // read-only access to the per-dimension factors given at construction
+            }
+        };
 
-} // traits
+        namespace traits
+        {
+            template<int T_dim>
+            struct dim<CartNavigator<T_dim>>
+            {
+                static constexpr int value = T_dim; // dimensionality is the navigator's template argument
+            };
 
-} // cursor
-} // pmacc
+        } // namespace traits
 
+    } // namespace cursor
+} // namespace pmacc
diff --git a/include/pmacc/cuSTL/cursor/navigator/CursorNavigator.hpp b/include/pmacc/cuSTL/cursor/navigator/CursorNavigator.hpp
index ecae903673..6371033a07 100644
--- a/include/pmacc/cuSTL/cursor/navigator/CursorNavigator.hpp
+++ b/include/pmacc/cuSTL/cursor/navigator/CursorNavigator.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera
+/* Copyright 2013-2021 Heiko Burau, Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -25,19 +25,16 @@
 
 namespace pmacc
 {
-namespace cursor
-{
-
-struct CursorNavigator
-{
-    template<typename Cursor, typename Jump>
-    HDINLINE
-    Cursor operator()(const Cursor& cursor, const Jump& jump) const
+    namespace cursor
     {
-        return cursor(jump);
-    }
-};
-
-} // cursor
-} // pmacc
+        struct CursorNavigator // navigator that moves by calling the given cursor with the jump
+        {
+            template<typename Cursor, typename Jump>
+            HDINLINE Cursor operator()(const Cursor& cursor, const Jump& jump) const
+            {
+                return cursor(jump); // the cursor's own operator() produces the shifted cursor
+            }
+        };
 
+    } // namespace cursor
+} // namespace pmacc
diff --git a/include/pmacc/cuSTL/cursor/navigator/EmptyNavigator.hpp b/include/pmacc/cuSTL/cursor/navigator/EmptyNavigator.hpp
index 85127b480b..9006251cdc 100644
--- a/include/pmacc/cuSTL/cursor/navigator/EmptyNavigator.hpp
+++ b/include/pmacc/cuSTL/cursor/navigator/EmptyNavigator.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera
+/* Copyright 2013-2021 Heiko Burau, Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -23,19 +23,16 @@
 
 namespace pmacc
 {
-namespace cursor
-{
-
-struct EmptyNavigator
-{
-    template<typename Marker, typename Jump>
-    HDINLINE
-    Marker operator()(const Marker& marker, Jump) const
+    namespace cursor
     {
-        return marker;
-    }
-};
-
-} // cursor
-} // pmacc
+        struct EmptyNavigator // navigator that ignores every jump
+        {
+            template<typename Marker, typename Jump>
+            HDINLINE Marker operator()(const Marker& marker, Jump) const
+            {
+                return marker; // the jump argument is unnamed and unused; the marker is returned unchanged
+            }
+        };
 
+    } // namespace cursor
+} // namespace pmacc
diff --git a/include/pmacc/cuSTL/cursor/navigator/MapTo1DNavigator.hpp b/include/pmacc/cuSTL/cursor/navigator/MapTo1DNavigator.hpp
index 7b2b676449..8116f39003 100644
--- a/include/pmacc/cuSTL/cursor/navigator/MapTo1DNavigator.hpp
+++ b/include/pmacc/cuSTL/cursor/navigator/MapTo1DNavigator.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2015-2020 Heiko Burau
+/* Copyright 2015-2021 Heiko Burau
  *
  * This file is part of PMacc.
  *
@@ -23,55 +23,55 @@
 
 namespace pmacc
 {
-namespace cursor
-{
-
-/**
- * Use this navigator to wrap a ndim-cursor into a 1D cursor.
- */
-template<int T_dim>
-class MapTo1DNavigator
-{
-public:
-    static constexpr int dim = T_dim;
-private:
-    math::Size_t<dim> shape;
-    int pos;
-
-    HDINLINE
-    math::Int<dim> toNdim(int idx) const
+    namespace cursor
     {
-        math::Int<dim> result;
-        int volume = 1;
-        for(int i = 0; i < dim; i++)
+        /**
+         * Use this navigator to wrap a ndim-cursor into a 1D cursor.
+         */
+        template<int T_dim>
+        class MapTo1DNavigator
         {
-            result[i] = (idx / volume) % this->shape[i];
-            volume *= this->shape[i];
-        }
-        return result;
-    }
-public:
-    /**
-     * @param shape area to map the 1D index to.
-     */
-    HDINLINE
-    MapTo1DNavigator(math::Size_t<dim> shape)
-     : shape(shape), pos(0) {}
+        public:
+            static constexpr int dim = T_dim;
 
-    template<typename Cursor>
-    HDINLINE
-    Cursor operator()(const Cursor& cursor, math::Int<1> jump)
-    {
-        math::Int<dim> ndstart = toNdim(this->pos);
-        this->pos += jump.x();
-        math::Int<dim> ndend = toNdim(this->pos);
+        private:
+            math::Size_t<dim> shape;
+            int pos;
+
+            HDINLINE
+            math::Int<dim> toNdim(int idx) const
+            {
+                math::Int<dim> ndIndex;
+                int stride = 1; // product of the extents of all dimensions handled so far
+                for(int d = 0; d < dim; ++d)
+                {
+                    ndIndex[d] = (idx / stride) % this->shape[d];
+                    stride *= this->shape[d];
+                }
+                return ndIndex;
+            }
+
+        public:
+            /** Create a navigator whose linear position starts at 0.
+             * @param shape extent of the n-dimensional area that the 1D index is mapped onto
+             */
+            HDINLINE
+            MapTo1DNavigator(math::Size_t<dim> shape) : shape(shape), pos(0)
+            {
+            }
 
-        math::Int<dim> ndjump = ndend - ndstart;
+            template<typename Cursor>
+            HDINLINE Cursor operator()(const Cursor& cursor, math::Int<1> jump)
+            {
+                math::Int<dim> ndstart = toNdim(this->pos); // n-dim index before the move
+                this->pos += jump.x(); // stateful: the navigator remembers the linear position (non-const call)
+                math::Int<dim> ndend = toNdim(this->pos); // n-dim index after the move
 
-        return cursor(ndjump);
-    }
+                math::Int<dim> ndjump = ndend - ndstart;
 
-};
+                return cursor(ndjump);
+            }
+        };
 
-} // namespace cursor
+    } // namespace cursor
 } // namespace pmacc
diff --git a/include/pmacc/cuSTL/cursor/navigator/MultiIndexNavigator.hpp b/include/pmacc/cuSTL/cursor/navigator/MultiIndexNavigator.hpp
index a2607f5f24..a32438321a 100644
--- a/include/pmacc/cuSTL/cursor/navigator/MultiIndexNavigator.hpp
+++ b/include/pmacc/cuSTL/cursor/navigator/MultiIndexNavigator.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera
+/* Copyright 2013-2021 Heiko Burau, Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -28,34 +28,30 @@
 
 namespace pmacc
 {
-namespace cursor
-{
-
-template<int T_dim>
-struct MultiIndexNavigator
-{
-    typedef tag::MultiIndexNavigator tag;
-    static constexpr int dim = T_dim;
-
-    template<typename MultiIndex>
-    HDINLINE
-    MultiIndex operator()(const MultiIndex& index, const math::Int<dim>& jump) const
+    namespace cursor
     {
-        return index + jump;
-    }
-};
-
-namespace traits
-{
-
-template<int T_dim>
-struct dim<MultiIndexNavigator<T_dim> >
-{
-    static constexpr int value = T_dim;
-};
-
-}
-
-} // cursor
-} // pmacc
-
+        template<int T_dim>
+        struct MultiIndexNavigator // navigator that shifts a multi-index by adding the jump vector
+        {
+            typedef tag::MultiIndexNavigator tag;
+            static constexpr int dim = T_dim;
+
+            template<typename MultiIndex>
+            HDINLINE MultiIndex operator()(const MultiIndex& index, const math::Int<dim>& jump) const
+            {
+                return index + jump; // relies on MultiIndex providing operator+ with math::Int<dim>
+            }
+        };
+
+        namespace traits
+        {
+            template<int T_dim>
+            struct dim<MultiIndexNavigator<T_dim>>
+            {
+                static constexpr int value = T_dim; // dimensionality is the navigator's template argument
+            };
+
+        } // namespace traits
+
+    } // namespace cursor
+} // namespace pmacc
diff --git a/include/pmacc/cuSTL/cursor/navigator/PlusNavigator.hpp b/include/pmacc/cuSTL/cursor/navigator/PlusNavigator.hpp
index cf3d33b6cc..8211169792 100644
--- a/include/pmacc/cuSTL/cursor/navigator/PlusNavigator.hpp
+++ b/include/pmacc/cuSTL/cursor/navigator/PlusNavigator.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2015-2020 Heiko Burau
+/* Copyright 2015-2021 Heiko Burau
  *
  * This file is part of PMacc.
  *
@@ -23,21 +23,18 @@
 
 namespace pmacc
 {
-namespace cursor
-{
-
-/** Navigator which combines jump and marker of any type by using the +operator.
- */
-struct PlusNavigator
-{
-    template<typename Marker, typename Jump>
-    HDINLINE Marker
-    operator()(const Marker& marker, const Jump& jump) const
+    namespace cursor
     {
-        return marker + jump;
-    }
-};
+        /** Navigator that advances a marker of any type by adding the jump to it with operator+.
+         */
+        struct PlusNavigator
+        {
+            template<typename Marker, typename Jump>
+            HDINLINE Marker operator()(const Marker& marker, const Jump& jump) const
+            {
+                return marker + jump; // the sum is converted to Marker on return
+            }
+        };
 
-} // namespace cursor
+    } // namespace cursor
 } // namespace pmacc
-
diff --git a/include/pmacc/cuSTL/cursor/navigator/compile-time/BufferNavigator.hpp b/include/pmacc/cuSTL/cursor/navigator/compile-time/BufferNavigator.hpp
index b4d5b05a11..8ea1855233 100644
--- a/include/pmacc/cuSTL/cursor/navigator/compile-time/BufferNavigator.hpp
+++ b/include/pmacc/cuSTL/cursor/navigator/compile-time/BufferNavigator.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera
+/* Copyright 2013-2021 Heiko Burau, Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -26,63 +26,57 @@
 
 namespace pmacc
 {
-namespace cursor
-{
-namespace CT
-{
-
-template<typename Pitch, int dim = Pitch::dim + 1>
-struct BufferNavigator;
-
-template<typename Pitch>
-struct BufferNavigator<Pitch, 1>
-{
-    static constexpr int dim = 1;
-
-    template<typename Data>
-    HDINLINE
-    Data operator()(const Data& data, const math::Int<dim>& jump) const
+    namespace cursor
     {
-        char* result = (char*)data;
-        result += jump.x() * sizeof(typename boost::remove_pointer<Data>::type);
-        return (Data)result;
-    }
-};
+        namespace CT
+        {
+            template<typename Pitch, int dim = Pitch::dim + 1>
+            struct BufferNavigator;
 
-template<typename Pitch>
-struct BufferNavigator<Pitch, 2>
-{
-    static constexpr int dim = 2;
+            template<typename Pitch>
+            struct BufferNavigator<Pitch, 1>
+            {
+                static constexpr int dim = 1;
 
-    template<typename Data>
-    HDINLINE
-    Data operator()(const Data& data, const math::Int<dim>& jump) const
-    {
-        char* result = (char*)data;
-        result += jump.x() * sizeof(typename boost::remove_pointer<Data>::type)
-               + jump.y() * Pitch::x::value;
-        return (Data)result;
-    }
-};
+                template<typename Data>
+                HDINLINE Data operator()(const Data& data, const math::Int<dim>& jump) const
+                {
+                    using Elem = typename boost::remove_pointer<Data>::type; // pointee type of the marker
+                    char* bytePtr = (char*) data + jump.x() * sizeof(Elem); // 1D: x step scaled by element size
+                    return (Data) bytePtr;
+                }
+            };
 
-template<typename Pitch>
-struct BufferNavigator<Pitch, 3>
-{
-    static constexpr int dim = 3;
+            template<typename Pitch>
+            struct BufferNavigator<Pitch, 2>
+            {
+                static constexpr int dim = 2;
 
-    template<typename Data>
-    HDINLINE
-    Data operator()(const Data& data, const math::Int<dim>& jump) const
-    {
-        char* result = (char*)data;
-        result += jump.x() * sizeof(typename boost::remove_pointer<Data>::type)
-               + jump.y() * Pitch::x::value
-               + jump.z() * Pitch::y::value;
-        return (Data)result;
-    }
-};
+                template<typename Data>
+                HDINLINE Data operator()(const Data& data, const math::Int<dim>& jump) const
+                {
+                    using Elem = typename boost::remove_pointer<Data>::type; // pointee type of the marker
+                    char* bytePtr = (char*) data;
+                    bytePtr += jump.x() * sizeof(Elem) + jump.y() * Pitch::x::value; // y steps by the row pitch
+                    return (Data) bytePtr;
+                }
+            };
+
+            template<typename Pitch>
+            struct BufferNavigator<Pitch, 3>
+            {
+                static constexpr int dim = 3;
 
-} // CT
-} // cursor
-} // pmacc
+                template<typename Data>
+                HDINLINE Data operator()(const Data& data, const math::Int<dim>& jump) const
+                {
+                    using Elem = typename boost::remove_pointer<Data>::type; // pointee type of the marker
+                    char* bytePtr = (char*) data;
+                    bytePtr += jump.x() * sizeof(Elem) + jump.y() * Pitch::x::value + jump.z() * Pitch::y::value;
+                    return (Data) bytePtr;
+                }
+            };
 
+        } // namespace CT
+    } // namespace cursor
+} // namespace pmacc
diff --git a/include/pmacc/cuSTL/cursor/navigator/compile-time/TwistAxesNavigator.hpp b/include/pmacc/cuSTL/cursor/navigator/compile-time/TwistAxesNavigator.hpp
index bdf91f3da4..aa5c3b0be9 100644
--- a/include/pmacc/cuSTL/cursor/navigator/compile-time/TwistAxesNavigator.hpp
+++ b/include/pmacc/cuSTL/cursor/navigator/compile-time/TwistAxesNavigator.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera
+/* Copyright 2013-2021 Heiko Burau, Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -25,48 +25,44 @@
 
 namespace pmacc
 {
-namespace cursor
-{
-namespace CT
-{
-
-template<typename Axes, int dim = Axes::dim>
-struct TwistAxesNavigator;
-
-template<typename Axes>
-struct TwistAxesNavigator<Axes, 2>
-{
-    static constexpr int dim = 2;
-
-    template<typename TCursor>
-    HDINLINE
-    TCursor operator()(const TCursor& cursor, const math::Int<2>& jump) const
+    namespace cursor
     {
-        math::Int<2> twistedJump;
-        twistedJump[Axes::x::value] = jump.x();
-        twistedJump[Axes::y::value] = jump.y();
-        return cursor(twistedJump);
-    }
-};
+        namespace CT
+        {
+            template<typename Axes, int dim = Axes::dim>
+            struct TwistAxesNavigator;
 
-template<typename Axes>
-struct TwistAxesNavigator<Axes, 3>
-{
-    static constexpr int dim = 3;
+            template<typename Axes>
+            struct TwistAxesNavigator<Axes, 2>
+            {
+                static constexpr int dim = 2;
 
-    template<typename TCursor>
-    HDINLINE
-    TCursor operator()(const TCursor& cursor, const math::Int<3>& jump) const
-    {
-        math::Int<3> twistedJump;
-        twistedJump[Axes::x::value] = jump.x();
-        twistedJump[Axes::y::value] = jump.y();
-        twistedJump[Axes::z::value] = jump.z();
-        return cursor(twistedJump);
-    }
-};
+                template<typename TCursor>
+                HDINLINE TCursor operator()(const TCursor& cursor, const math::Int<2>& jump) const
+                {
+                    math::Int<2> permuted; // jump with its components moved to the axes named by Axes
+                    permuted[Axes::y::value] = jump.y();
+                    permuted[Axes::x::value] = jump.x();
+                    return cursor(permuted);
+                }
+            };
+
+            template<typename Axes>
+            struct TwistAxesNavigator<Axes, 3>
+            {
+                static constexpr int dim = 3;
 
-} // CT
-} // cursor
-} // pmacc
+                template<typename TCursor>
+                HDINLINE TCursor operator()(const TCursor& cursor, const math::Int<3>& jump) const
+                {
+                    math::Int<3> permuted; // jump with its components moved to the axes named by Axes
+                    permuted[Axes::z::value] = jump.z();
+                    permuted[Axes::y::value] = jump.y();
+                    permuted[Axes::x::value] = jump.x();
+                    return cursor(permuted);
+                }
+            };
 
+        } // namespace CT
+    } // namespace cursor
+} // namespace pmacc
diff --git a/include/pmacc/cuSTL/cursor/navigator/compile-time/TwistedAxesNavigator.hpp b/include/pmacc/cuSTL/cursor/navigator/compile-time/TwistedAxesNavigator.hpp
index a94c93ae93..3537c421ca 100644
--- a/include/pmacc/cuSTL/cursor/navigator/compile-time/TwistedAxesNavigator.hpp
+++ b/include/pmacc/cuSTL/cursor/navigator/compile-time/TwistedAxesNavigator.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera
+/* Copyright 2013-2021 Heiko Burau, Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -25,48 +25,44 @@
 
 namespace pmacc
 {
-namespace cursor
-{
-namespace CT
-{
-
-template<typename Axes, int dim = Axes::dim>
-struct TwistedAxesNavigator;
-
-template<typename Axes>
-struct TwistedAxesNavigator<Axes, 2>
-{
-    static constexpr int dim = 2;
-
-    template<typename TCursor>
-    HDINLINE
-    TCursor operator()(const TCursor& cursor, const math::Int<2>& jump) const
+    namespace cursor
     {
-        math::Int<2> twistedJump;
-        twistedJump[Axes::x::value] = jump.x();
-        twistedJump[Axes::y::value] = jump.y();
-        return cursor(twistedJump);
-    }
-};
+        namespace CT
+        {
+            template<typename Axes, int dim = Axes::dim>
+            struct TwistedAxesNavigator;
 
-template<typename Axes>
-struct TwistedAxesNavigator<Axes, 3>
-{
-    static constexpr int dim = 3;
+            template<typename Axes>
+            struct TwistedAxesNavigator<Axes, 2>
+            {
+                static constexpr int dim = 2;
 
-    template<typename TCursor>
-    HDINLINE
-    TCursor operator()(const TCursor& cursor, const math::Int<3>& jump) const
-    {
-        math::Int<3> twistedJump;
-        twistedJump[Axes::x::value] = jump.x();
-        twistedJump[Axes::y::value] = jump.y();
-        twistedJump[Axes::z::value] = jump.z();
-        return cursor(twistedJump);
-    }
-};
+                template<typename TCursor>
+                HDINLINE TCursor operator()(const TCursor& cursor, const math::Int<2>& jump) const
+                {
+                    math::Int<2> permuted; // jump with its components moved to the axes named by Axes
+                    permuted[Axes::y::value] = jump.y();
+                    permuted[Axes::x::value] = jump.x();
+                    return cursor(permuted);
+                }
+            };
+
+            template<typename Axes>
+            struct TwistedAxesNavigator<Axes, 3>
+            {
+                static constexpr int dim = 3;
 
-} // CT
-} // cursor
-} // pmacc
+                template<typename TCursor>
+                HDINLINE TCursor operator()(const TCursor& cursor, const math::Int<3>& jump) const
+                {
+                    math::Int<3> permuted; // jump with its components moved to the axes named by Axes
+                    permuted[Axes::z::value] = jump.z();
+                    permuted[Axes::y::value] = jump.y();
+                    permuted[Axes::x::value] = jump.x();
+                    return cursor(permuted);
+                }
+            };
 
+        } // namespace CT
+    } // namespace cursor
+} // namespace pmacc
diff --git a/include/pmacc/cuSTL/cursor/navigator/tag.hpp b/include/pmacc/cuSTL/cursor/navigator/tag.hpp
index b20b6310a7..a29605afe1 100644
--- a/include/pmacc/cuSTL/cursor/navigator/tag.hpp
+++ b/include/pmacc/cuSTL/cursor/navigator/tag.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera
+/* Copyright 2013-2021 Heiko Burau, Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -24,15 +24,15 @@
 
 namespace pmacc
 {
-namespace cursor
-{
-namespace tag
-{
-struct BufferNavigator;
-struct CartNavigator;
-struct MultiIndexNavigator;
-} // tag
-} // cursor
-} // pmacc
+    namespace cursor
+    {
+        namespace tag
+        {
+            struct BufferNavigator;
+            struct CartNavigator;
+            struct MultiIndexNavigator;
+        } // namespace tag
+    } // namespace cursor
+} // namespace pmacc
 
 #endif // CURSOR_NAVIGATOR_TAG_H
diff --git a/include/pmacc/cuSTL/cursor/tools/LinearInterp.hpp b/include/pmacc/cuSTL/cursor/tools/LinearInterp.hpp
index 4c4739eb5f..4409514bb5 100644
--- a/include/pmacc/cuSTL/cursor/tools/LinearInterp.hpp
+++ b/include/pmacc/cuSTL/cursor/tools/LinearInterp.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2015-2020 Heiko Burau
+/* Copyright 2015-2021 Heiko Burau
  *
  * This file is part of PMacc.
  *
@@ -31,62 +31,46 @@
 
 namespace pmacc
 {
-namespace cursor
-{
-namespace tools
-{
-
-/** Return a cursor that does 1D, 2D or 3D, linear interpolation on input data.
- *
- * \tparam T_PositionComp integral type of the weighting factor
- */
-template<typename T_PositionComp = float>
-struct LinearInterp
-{
-    template<typename T_Cursor>
-    Cursor<
-        LinearInterpAccessor<T_Cursor>,
-        PlusNavigator,
-        pmacc::math::Vector<
-            T_PositionComp,
-            pmacc::cursor::traits::dim<
-                T_Cursor>::value
-            >
-        >
-    HDINLINE
-    operator()(const T_Cursor& cur)
+    namespace cursor
     {
-        return make_Cursor(
-            LinearInterpAccessor<T_Cursor>(cur),
-            PlusNavigator(),
-            pmacc::math::Vector<
-                T_PositionComp,
-                pmacc::cursor::traits::dim<T_Cursor>::value>::create(0.0));
-    }
-};
+        namespace tools
+        {
+            /** Return a cursor that does 1D, 2D or 3D, linear interpolation on input data.
+             *
+             * \tparam T_PositionComp arithmetic type of the weighting factor (defaults to float)
+             */
+            template<typename T_PositionComp = float>
+            struct LinearInterp
+            {
+                template<typename T_Cursor>
+                Cursor<
+                    LinearInterpAccessor<T_Cursor>,
+                    PlusNavigator,
+                    pmacc::math::Vector<T_PositionComp, pmacc::cursor::traits::dim<T_Cursor>::value>>
+                    HDINLINE operator()(const T_Cursor& cur)
+                {
+                    return make_Cursor(
+                        LinearInterpAccessor<T_Cursor>(cur),
+                        PlusNavigator(),
+                        pmacc::math::Vector<T_PositionComp, pmacc::cursor::traits::dim<T_Cursor>::value>::create(0.0));
+                }
+            };
 
-} // namespace tools
-} // namespace cursor
+        } // namespace tools
+    } // namespace cursor
 
-namespace result_of
-{
-
-template<typename T_Cursor, typename T_PositionComp>
-struct Functor<cursor::tools::LinearInterp<T_PositionComp>, T_Cursor>
-{
-    typedef pmacc::cursor::Cursor<
-        cursor::LinearInterpAccessor<T_Cursor>,
-        cursor::PlusNavigator,
-        pmacc::math::Vector<
-            T_PositionComp,
-            pmacc::cursor::traits::dim<
-                T_Cursor
-            >::value
-        >
-    > type;
-};
+    namespace result_of
+    {
+        template<typename T_Cursor, typename T_PositionComp>
+        struct Functor<cursor::tools::LinearInterp<T_PositionComp>, T_Cursor>
+        {
+            typedef pmacc::cursor::Cursor<
+                cursor::LinearInterpAccessor<T_Cursor>,
+                cursor::PlusNavigator,
+                pmacc::math::Vector<T_PositionComp, pmacc::cursor::traits::dim<T_Cursor>::value>>
+                type;
+        };
 
-} // namespace result_of
+    } // namespace result_of
 
 } // namespace pmacc
-
diff --git a/include/pmacc/cuSTL/cursor/tools/slice.hpp b/include/pmacc/cuSTL/cursor/tools/slice.hpp
index d566af14e1..275427b816 100644
--- a/include/pmacc/cuSTL/cursor/tools/slice.hpp
+++ b/include/pmacc/cuSTL/cursor/tools/slice.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Heiko Burau, Rene Widera
+/* Copyright 2013-2021 Axel Huebl, Heiko Burau, Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -28,79 +28,79 @@
 
 namespace pmacc
 {
-namespace cursor
-{
-namespace tools
-{
-namespace detail
-{
-template<typename TCursor, typename Tag>
-struct SliceResult;
+    namespace cursor
+    {
+        namespace tools
+        {
+            namespace detail
+            {
+                template<typename TCursor, typename Tag>
+                struct SliceResult;
 
-template<typename TCursor>
-struct SliceResult<TCursor, tag::BufferNavigator>
-{
-    typedef Cursor<
-        typename TCursor::Accessor,
-        BufferNavigator<TCursor::Navigator::dim-1>,
-        typename TCursor::Marker> type;
-};
+                template<typename TCursor>
+                struct SliceResult<TCursor, tag::BufferNavigator>
+                {
+                    typedef Cursor<
+                        typename TCursor::Accessor,
+                        BufferNavigator<TCursor::Navigator::dim - 1>,
+                        typename TCursor::Marker>
+                        type;
+                };
 
-template<typename TCursor>
-struct SliceResult<TCursor, tag::CartNavigator>
-{
-    typedef Cursor<
-        typename TCursor::Accessor,
-        CartNavigator<TCursor::Navigator::dim-1>,
-        typename TCursor::Marker> type;
-};
+                template<typename TCursor>
+                struct SliceResult<TCursor, tag::CartNavigator>
+                {
+                    typedef Cursor<
+                        typename TCursor::Accessor,
+                        CartNavigator<TCursor::Navigator::dim - 1>,
+                        typename TCursor::Marker>
+                        type;
+                };
 
-template<typename Navi, typename NaviTag>
-struct Slice_helper;
+                template<typename Navi, typename NaviTag>
+                struct Slice_helper;
 
-template<typename Navi>
-struct Slice_helper<Navi, tag::BufferNavigator>
-{
-    HDINLINE
-    BufferNavigator<Navi::dim-1> operator()(const Navi& navi)
-    {
-        math::Size_t<Navi::dim-2> pitch;
-        for(int i = 0; i < Navi::dim-2; i++)
-            pitch[i] = navi.getPitch()[i];
-        return BufferNavigator<Navi::dim-1>(pitch);
-    }
-};
-
-template<typename Navi>
-struct Slice_helper<Navi, tag::CartNavigator>
-{
-    HDINLINE
-    CartNavigator<Navi::dim-1> operator()(const Navi& navi)
-    {
-        math::Int<Navi::dim-1> factor;
-        for(uint32_t i = 0; i < Navi::dim-1; i++)
-            factor[i] = navi.getFactor()[i];
-        return CartNavigator<Navi::dim-1>(factor);
-    }
-};
+                template<typename Navi>
+                struct Slice_helper<Navi, tag::BufferNavigator>
+                {
+                    HDINLINE
+                    BufferNavigator<Navi::dim - 1> operator()(const Navi& navi)
+                    {
+                        math::Size_t<Navi::dim - 2> pitch;
+                        for(int i = 0; i < Navi::dim - 2; i++)
+                            pitch[i] = navi.getPitch()[i];
+                        return BufferNavigator<Navi::dim - 1>(pitch);
+                    }
+                };
 
-} // detail
+                template<typename Navi>
+                struct Slice_helper<Navi, tag::CartNavigator>
+                {
+                    HDINLINE
+                    CartNavigator<Navi::dim - 1> operator()(const Navi& navi)
+                    {
+                        math::Int<Navi::dim - 1> factor;
+                        for(uint32_t i = 0; i < Navi::dim - 1; i++)
+                            factor[i] = navi.getFactor()[i];
+                        return CartNavigator<Navi::dim - 1>(factor);
+                    }
+                };
 
-/** makes a 2D cursor of a 3D vector by dropping the z-component
- */
-template<typename TCursor>
-HDINLINE
-typename detail::SliceResult<TCursor, typename TCursor::Navigator::tag>::type
-slice(const TCursor& cur)
-{
-    detail::Slice_helper<typename TCursor::Navigator, typename TCursor::Navigator::tag> slice_helper;
-    return typename detail::SliceResult<TCursor, typename TCursor::Navigator::tag>::type
-            (cur.getAccessor(),
-             slice_helper(cur.getNavigator()),
-             cur.getMarker());
-}
+            } // namespace detail
 
-} // tools
-} // cursor
-} // pmacc
+            /** makes a 2D cursor from a 3D cursor by dropping the z-component
+             */
+            template<typename TCursor>
+            HDINLINE typename detail::SliceResult<TCursor, typename TCursor::Navigator::tag>::type slice(
+                const TCursor& cur)
+            {
+                detail::Slice_helper<typename TCursor::Navigator, typename TCursor::Navigator::tag> slice_helper;
+                return typename detail::SliceResult<TCursor, typename TCursor::Navigator::tag>::type(
+                    cur.getAccessor(),
+                    slice_helper(cur.getNavigator()),
+                    cur.getMarker());
+            }
 
+        } // namespace tools
+    } // namespace cursor
+} // namespace pmacc
diff --git a/include/pmacc/cuSTL/cursor/tools/twistAxes.hpp b/include/pmacc/cuSTL/cursor/tools/twistAxes.hpp
index cd5ddbfcb4..bb14bac837 100644
--- a/include/pmacc/cuSTL/cursor/tools/twistAxes.hpp
+++ b/include/pmacc/cuSTL/cursor/tools/twistAxes.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera
+/* Copyright 2013-2021 Heiko Burau, Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -27,30 +27,29 @@
 
 namespace pmacc
 {
-namespace cursor
-{
-namespace tools
-{
-
-/** Returns a new cursor which looks like a rotated version of the one passed.
- *
- * The new cursor wraps the one that is passed. In the new cursor's navigator
- * the components of the passed int-vector are reordered according to the Axes
- * parameter and then passed to the nested cursor.
- *
- * \tparam Axes compile-time vector (pmacc::math::CT::Int) that descripes the mapping.
- * x-axis -> Axes::at<0>, y-axis -> Axes::at<1>, ...
- */
-template<typename Axes, typename TCursor>
-HDINLINE
-Cursor<CursorAccessor<TCursor>, CT::TwistAxesNavigator<Axes>, TCursor>
-twistAxes(const TCursor& cursor)
-{
-    return Cursor<CursorAccessor<TCursor>, CT::TwistAxesNavigator<Axes>, TCursor>
-        (CursorAccessor<TCursor>(), CT::TwistAxesNavigator<Axes>(), cursor);
-}
-
-} // tools
-} // cursor
-} // pmacc
+    namespace cursor
+    {
+        namespace tools
+        {
+            /** Returns a new cursor which looks like a rotated version of the one passed.
+             *
+             * The new cursor wraps the one that is passed. In the new cursor's navigator
+             * the components of the passed int-vector are reordered according to the Axes
+             * parameter and then passed to the nested cursor.
+             *
+             * \tparam Axes compile-time vector (pmacc::math::CT::Int) that describes the mapping.
+             * x-axis -> Axes::at<0>, y-axis -> Axes::at<1>, ...
+             */
+            template<typename Axes, typename TCursor>
+            HDINLINE Cursor<CursorAccessor<TCursor>, CT::TwistAxesNavigator<Axes>, TCursor> twistAxes(
+                const TCursor& cursor)
+            {
+                return Cursor<CursorAccessor<TCursor>, CT::TwistAxesNavigator<Axes>, TCursor>(
+                    CursorAccessor<TCursor>(),
+                    CT::TwistAxesNavigator<Axes>(),
+                    cursor);
+            }
 
+        } // namespace tools
+    } // namespace cursor
+} // namespace pmacc
diff --git a/include/pmacc/cuSTL/cursor/tools/twistVectorFieldAxes.hpp b/include/pmacc/cuSTL/cursor/tools/twistVectorFieldAxes.hpp
index 917560c609..75bb4a5e5e 100644
--- a/include/pmacc/cuSTL/cursor/tools/twistVectorFieldAxes.hpp
+++ b/include/pmacc/cuSTL/cursor/tools/twistVectorFieldAxes.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera
+/* Copyright 2013-2021 Heiko Burau, Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -27,95 +27,93 @@
 
 namespace pmacc
 {
-namespace cursor
-{
-namespace tools
-{
+    namespace cursor
+    {
+        namespace tools
+        {
+            namespace result_of
+            {
+                /** result for TwistVectorFieldAxes
+                 *
+                 * \tparam T_NavigatorPerm permutation vector for navigator
+                 * \tparam T_AccessorPerm permutation vector for the accessor
+                 * \tparam T_Cursor cursor to permute
+                 */
+                template<typename T_NavigatorPerm, typename T_AccessorPerm, typename T_Cursor>
+                struct TwistVectorFieldAxes
+                {
+                    typedef Cursor<
+                        TwistAxesAccessor<T_Cursor, T_AccessorPerm>,
+                        pmacc::cursor::CT::TwistAxesNavigator<T_NavigatorPerm>,
+                        T_Cursor>
+                        type;
+                };
 
-namespace result_of
-{
+            } // namespace result_of
 
-/** result for TwistVectorFieldAxes
- *
- * \tparam T_NavigatorPerm permutation vector for navigator
- * \tparam T_AccessorPerm permutation vector for the accessor
- * \tparam T_Cursor cursor to permute
- */
-template<typename T_NavigatorPerm, typename T_AccessorPerm, typename T_Cursor>
-struct TwistVectorFieldAxes
-{
-    typedef Cursor<TwistAxesAccessor<T_Cursor, T_AccessorPerm>,
-                   pmacc::cursor::CT::TwistAxesNavigator<T_NavigatorPerm>,
-                   T_Cursor> type;
-};
-
-} // result_of
-
-/** Returns a new cursor which looks like a vector field rotated version of the one passed
- *
- * When rotating a vector field in physics the coordinate system and the vectors themselves
- * have to be rotated. This is the idea behind this function. It is assuming that the cursor
- * which is passed returns in its access call a vector type of the same dimension as in
- * the jumping call. In other words, the field and the vector have the same dimension.
- *
- * e.g.: new_cur = twistVectorFieldAxes<math::CT::Int<1,2,0> >(cur); // x -> y, y -> z, z -> x
- *
- * \tparam T_Permutation compile-time vector (pmacc::math::CT::Int) that describes the mapping.
- * x-axis -> T_Permutation::at<0>, y-axis -> T_Permutation::at<1>, ...
- *
- */
-template<typename T_Permutation, typename T_Cursor>
-HDINLINE
-typename result_of::TwistVectorFieldAxes<T_Permutation, T_Permutation, T_Cursor>::type
-twistVectorFieldAxes(const T_Cursor& cursor)
-{
-    return typename result_of::TwistVectorFieldAxes<T_Permutation, T_Permutation, T_Cursor>::type
-        (TwistAxesAccessor<T_Cursor, T_Permutation>(),
-        pmacc::cursor::CT::TwistAxesNavigator<T_Permutation>(),
-        cursor);
-}
+            /** Returns a new cursor which looks like a vector field rotated version of the one passed
+             *
+             * When rotating a vector field in physics the coordinate system and the vectors themselves
+             * have to be rotated. This is the idea behind this function. It is assuming that the cursor
+             * which is passed returns in its access call a vector type of the same dimension as in
+             * the jumping call. In other words, the field and the vector have the same dimension.
+             *
+             * e.g.: new_cur = twistVectorFieldAxes<math::CT::Int<1,2,0> >(cur); // x -> y, y -> z, z -> x
+             *
+             * \tparam T_Permutation compile-time vector (pmacc::math::CT::Int) that describes the mapping.
+             * x-axis -> T_Permutation::at<0>, y-axis -> T_Permutation::at<1>, ...
+             *
+             */
+            template<typename T_Permutation, typename T_Cursor>
+            HDINLINE typename result_of::TwistVectorFieldAxes<T_Permutation, T_Permutation, T_Cursor>::type
+            twistVectorFieldAxes(const T_Cursor& cursor)
+            {
+                return typename result_of::TwistVectorFieldAxes<T_Permutation, T_Permutation, T_Cursor>::type(
+                    TwistAxesAccessor<T_Cursor, T_Permutation>(),
+                    pmacc::cursor::CT::TwistAxesNavigator<T_Permutation>(),
+                    cursor);
+            }
 
-/** permute navigation and access of a cursor
- *
- * use same permutation for accessor and navigator
- *
- * \tparam T_Permutation permutation vector
- * \tparam T_Cursor permutation vector
- * \param cursor cursor to permute
- * \param permutation cursor to permute
- */
-template<typename T_Cursor, typename T_Permutation>
-HDINLINE
-typename result_of::TwistVectorFieldAxes<T_Permutation, T_Permutation, T_Cursor>::type
-twistVectorFieldAxes(const T_Cursor& cursor, const T_Permutation& /*permutation*/)
-{
-    return typename result_of::TwistVectorFieldAxes<T_Permutation, T_Permutation, T_Cursor>::type
-        (TwistAxesAccessor<T_Cursor, T_Permutation>(),
-        pmacc::cursor::CT::TwistAxesNavigator<T_Permutation>(),
-        cursor);
-}
+            /** permute navigation and access of a cursor
+             *
+             * use same permutation for accessor and navigator
+             *
+             * \tparam T_Permutation permutation vector
+             * \tparam T_Cursor type of the cursor to permute
+             * \param cursor cursor to permute
+             * \param permutation permutation vector; only its type is used
+             */
+            template<typename T_Cursor, typename T_Permutation>
+            HDINLINE typename result_of::TwistVectorFieldAxes<T_Permutation, T_Permutation, T_Cursor>::type
+            twistVectorFieldAxes(const T_Cursor& cursor, const T_Permutation& /*permutation*/)
+            {
+                return typename result_of::TwistVectorFieldAxes<T_Permutation, T_Permutation, T_Cursor>::type(
+                    TwistAxesAccessor<T_Cursor, T_Permutation>(),
+                    pmacc::cursor::CT::TwistAxesNavigator<T_Permutation>(),
+                    cursor);
+            }
 
-/** permute navigation and access of a cursor
- *
- * different dimensions for the accessor and navigator permutation vector are allowed
- *
- * \param cursor cursor to permute
- * \param navigatorPermutation compile time permutation vector for the navigator
- * \param accessorPermutation compile time permutation vector for the accessor
- */
-template<typename T_Cursor, typename T_NavigatorPerm, typename T_AccessorPerm>
-HDINLINE
-typename result_of::TwistVectorFieldAxes<T_NavigatorPerm, T_AccessorPerm, T_Cursor>::type
-twistVectorFieldAxes(const T_Cursor& cursor,
-                       const T_NavigatorPerm& /*navigatorPermutation*/,
-                       const T_AccessorPerm& /*accessorPermutation*/)
-{
-    return typename result_of::TwistVectorFieldAxes<T_NavigatorPerm, T_AccessorPerm, T_Cursor>::type
-        (TwistAxesAccessor<T_Cursor, T_AccessorPerm>(),
-        pmacc::cursor::CT::TwistAxesNavigator<T_NavigatorPerm>(),
-        cursor);
-}
+            /** permute navigation and access of a cursor
+             *
+             * different dimensions for the accessor and navigator permutation vector are allowed
+             *
+             * \param cursor cursor to permute
+             * \param navigatorPermutation compile time permutation vector for the navigator
+             * \param accessorPermutation compile time permutation vector for the accessor
+             */
+            template<typename T_Cursor, typename T_NavigatorPerm, typename T_AccessorPerm>
+            HDINLINE typename result_of::TwistVectorFieldAxes<T_NavigatorPerm, T_AccessorPerm, T_Cursor>::type
+            twistVectorFieldAxes(
+                const T_Cursor& cursor,
+                const T_NavigatorPerm& /*navigatorPermutation*/,
+                const T_AccessorPerm& /*accessorPermutation*/)
+            {
+                return typename result_of::TwistVectorFieldAxes<T_NavigatorPerm, T_AccessorPerm, T_Cursor>::type(
+                    TwistAxesAccessor<T_Cursor, T_AccessorPerm>(),
+                    pmacc::cursor::CT::TwistAxesNavigator<T_NavigatorPerm>(),
+                    cursor);
+            }
 
-} // tools
-} // cursor
-} // pmacc
+        } // namespace tools
+    } // namespace cursor
+} // namespace pmacc
diff --git a/include/pmacc/cuSTL/cursor/traits.hpp b/include/pmacc/cuSTL/cursor/traits.hpp
index 8855965023..3ee70eeb5a 100644
--- a/include/pmacc/cuSTL/cursor/traits.hpp
+++ b/include/pmacc/cuSTL/cursor/traits.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera
+/* Copyright 2013-2021 Heiko Burau, Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -23,15 +23,13 @@
 
 namespace pmacc
 {
-namespace cursor
-{
-namespace traits
-{
-
-template<typename type>
-struct dim;
-
-} // traits
-} // cursor
-} // pmacc
+    namespace cursor
+    {
+        namespace traits
+        {
+            template<typename type>
+            struct dim;
 
+        } // namespace traits
+    } // namespace cursor
+} // namespace pmacc
diff --git a/include/pmacc/cuSTL/zone/SphericZone.hpp b/include/pmacc/cuSTL/zone/SphericZone.hpp
index 6ad4166fa7..37d68bd520 100644
--- a/include/pmacc/cuSTL/zone/SphericZone.hpp
+++ b/include/pmacc/cuSTL/zone/SphericZone.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera
+/* Copyright 2013-2021 Heiko Burau, Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -27,44 +27,52 @@
 
 namespace pmacc
 {
-namespace zone
-{
-
-namespace tag
-{
-struct SphericZone {};
-}
-
-/* spheric (no holes), cartesian zone
- *
- * \tparam T_dim dimension of the zone
- *
- * This is a zone which is simply described by a size and a offset.
- *
- */
-template<int T_dim>
-struct SphericZone
-{
-    typedef tag::SphericZone tag;
-    static constexpr int dim = T_dim;
-    math::Size_t<dim> size;
-    math::Int<dim> offset;
+    namespace zone
+    {
+        namespace tag
+        {
+            struct SphericZone
+            {
+            };
+        } // namespace tag
 
-    HDINLINE SphericZone() {}
-    HDINLINE SphericZone(const math::Size_t<dim>& size) : size(size), offset(math::Int<dim>::create(0)) {}
-    HDINLINE SphericZone(const math::Size_t<dim>& size,
-                         const math::Int<dim>& offset) : size(size), offset(offset) {}
+        /* spheric (no holes), cartesian zone
+         *
+         * \tparam T_dim dimension of the zone
+         *
+         * This is a zone which is simply described by a size and an offset.
+         *
+         */
+        template<int T_dim>
+        struct SphericZone
+        {
+            typedef tag::SphericZone tag;
+            static constexpr int dim = T_dim;
+            math::Size_t<dim> size;
+            math::Int<dim> offset;
 
-    /* Returns whether pos is within the zone */
-    HDINLINE bool within(const pmacc::math::Int<T_dim>& pos) const
-    {
-        bool result = true;
-        for(int i = 0; i < T_dim; i++)
-            if((pos[i] < offset[i]) || (pos[i] >= offset[i] + (int)size[i])) result = false;
-        return result;
-    }
-};
+            HDINLINE SphericZone()
+            {
+            }
+            HDINLINE SphericZone(const math::Size_t<dim>& size) : size(size), offset(math::Int<dim>::create(0))
+            {
+            }
+            HDINLINE SphericZone(const math::Size_t<dim>& size, const math::Int<dim>& offset)
+                : size(size)
+                , offset(offset)
+            {
+            }
 
-} // zone
-} // pmacc
+            /* Returns whether pos is within the zone */
+            HDINLINE bool within(const pmacc::math::Int<T_dim>& pos) const
+            {
+                bool result = true;
+                for(int i = 0; i < T_dim; i++)
+                    if((pos[i] < offset[i]) || (pos[i] >= offset[i] + (int) size[i]))
+                        result = false;
+                return result;
+            }
+        };
 
+    } // namespace zone
+} // namespace pmacc
diff --git a/include/pmacc/cuSTL/zone/StaggeredZone.hpp b/include/pmacc/cuSTL/zone/StaggeredZone.hpp
index 755c4d4843..40255e67f8 100644
--- a/include/pmacc/cuSTL/zone/StaggeredZone.hpp
+++ b/include/pmacc/cuSTL/zone/StaggeredZone.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera
+/* Copyright 2013-2021 Heiko Burau, Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -27,21 +27,22 @@
 
 namespace pmacc
 {
-namespace zone
-{
-namespace tag
-{
-struct StaggeredZone {};
-}
-
-template<int T_dim>
-struct StaggeredZone : public SphericZone<T_dim>
-{
-    typedef tag::StaggeredZone tag;
-    math::UInt32<dim> staggered;
-    math::UInt32<dim> staggeredOffset;
-};
+    namespace zone
+    {
+        namespace tag
+        {
+            struct StaggeredZone
+            {
+            };
+        } // namespace tag
 
-} // zone
-} // pmacc
+        template<int T_dim>
+        struct StaggeredZone : public SphericZone<T_dim>
+        {
+            typedef tag::StaggeredZone tag;
+            math::UInt32<dim> staggered;
+            math::UInt32<dim> staggeredOffset;
+        };
 
+    } // namespace zone
+} // namespace pmacc
diff --git a/include/pmacc/cuSTL/zone/ToricZone.hpp b/include/pmacc/cuSTL/zone/ToricZone.hpp
index fae3acdd90..8d84a7dca4 100644
--- a/include/pmacc/cuSTL/zone/ToricZone.hpp
+++ b/include/pmacc/cuSTL/zone/ToricZone.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera
+/* Copyright 2013-2021 Heiko Burau, Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -26,23 +26,24 @@
 
 namespace pmacc
 {
-namespace zone
-{
-namespace tag
-{
-struct ToricZone {};
-}
-
-template<int T_dim>
-struct ToricZone
-{
-    typedef tag::ToricZone tag;
-    static constexpr int dim = T_dim;
-    math::Size_t<dim> offset;
-    math::Size_t<dim> size;
-    uint32_t thickness;
-};
+    namespace zone
+    {
+        namespace tag
+        {
+            struct ToricZone
+            {
+            };
+        } // namespace tag
 
-} // zone
-} // pmacc
+        template<int T_dim>
+        struct ToricZone
+        {
+            typedef tag::ToricZone tag;
+            static constexpr int dim = T_dim;
+            math::Size_t<dim> offset;
+            math::Size_t<dim> size;
+            uint32_t thickness;
+        };
 
+    } // namespace zone
+} // namespace pmacc
diff --git a/include/pmacc/cuSTL/zone/compile-time/SphericZone.hpp b/include/pmacc/cuSTL/zone/compile-time/SphericZone.hpp
index 862b9f728f..30a148832e 100644
--- a/include/pmacc/cuSTL/zone/compile-time/SphericZone.hpp
+++ b/include/pmacc/cuSTL/zone/compile-time/SphericZone.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera, Axel Huebl
+/* Copyright 2013-2021 Heiko Burau, Rene Widera, Axel Huebl
  *
  * This file is part of PMacc.
  *
@@ -25,29 +25,29 @@
 
 namespace pmacc
 {
-namespace zone
-{
-namespace CT
-{
-
-/* spheric (no holes), cartesian, compile-time zone
- *
- * \tparam _Size compile-time vector (pmacc::math::CT::Size_t) of the zone's size.
- * \tparam _Offset compile-time vector (pmacc::math::CT::Size_t) of the zone's offset. default is a zero vector.
- *
- * This is a zone which is simply described by a size and a offset.
- *
- * Compile-time version of zone::SphericZone
- *
- */
-template<typename _Size, typename _Offset = typename math::CT::make_Int<_Size::dim, 0>::type>
-struct SphericZone
-{
-    typedef _Size Size;
-    typedef _Offset Offset;
-    static constexpr int dim = Size::dim;
-};
+    namespace zone
+    {
+        namespace CT
+        {
+            /* spheric (no holes), cartesian, compile-time zone
+             *
+             * \tparam _Size compile-time vector (pmacc::math::CT::Size_t) of the zone's size.
+             * \tparam _Offset compile-time vector of the zone's offset. Default is the zero vector produced by
+             * math::CT::make_Int<_Size::dim, 0>.
+             *
+             * This is a zone which is simply described by a size and an offset.
+             *
+             * Compile-time version of zone::SphericZone
+             *
+             */
+            template<typename _Size, typename _Offset = typename math::CT::make_Int<_Size::dim, 0>::type>
+            struct SphericZone
+            {
+                typedef _Size Size;
+                typedef _Offset Offset;
+                static constexpr int dim = Size::dim;
+            };
 
-} // CT
-} // zone
-} // pmacc
+        } // namespace CT
+    } // namespace zone
+} // namespace pmacc
diff --git a/include/pmacc/cudaSpecs.hpp b/include/pmacc/cudaSpecs.hpp
index d8e7d507af..a72e719682 100644
--- a/include/pmacc/cudaSpecs.hpp
+++ b/include/pmacc/cudaSpecs.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2015-2020 Heiko Burau
+/* Copyright 2015-2021 Heiko Burau
  *
  * This file is part of PMacc.
  *
@@ -26,20 +26,19 @@
 
 namespace pmacc
 {
-namespace cudaSpecs
-{
-
-/* Various hardware specific numerical limits taken from the
- * *CUDA C Programming Guide* Section: G.1. Features and Technical Specifications.
- *
- * Valid for sm_2.x - sm_5.3
- */
+    namespace cudaSpecs
+    {
+        /* Various hardware specific numerical limits taken from the
+         * *CUDA C Programming Guide* Section: G.1. Features and Technical Specifications.
+         *
+         * Valid for sm_2.x - sm_5.3
+         */
 
-/** maximum number of threads per block */
-constexpr uint32_t maxNumThreadsPerBlock = 1024;
+        /** maximum number of threads per block */
+        constexpr uint32_t maxNumThreadsPerBlock = 1024;
 
-/** maximum number of threads per axis of a block */
-typedef math::CT::Size_t<1024, 1024, 64> MaxNumThreadsPerBlockDim;
+        /** maximum number of threads per axis of a block */
+        typedef math::CT::Size_t<1024, 1024, 64> MaxNumThreadsPerBlockDim;
 
-} // namespace cudaSpecs
+    } // namespace cudaSpecs
 } // namespace pmacc
diff --git a/include/pmacc/cuplaHelper/ValidateCall.hpp b/include/pmacc/cuplaHelper/ValidateCall.hpp
index 18f283ca19..383c651095 100644
--- a/include/pmacc/cuplaHelper/ValidateCall.hpp
+++ b/include/pmacc/cuplaHelper/ValidateCall.hpp
@@ -1,6 +1,6 @@
-/* Copyright 2013-2020 Felix Schmitt, Heiko Burau, Rene Widera,
+/* Copyright 2013-2021 Felix Schmitt, Heiko Burau, Rene Widera,
  *                     Wolfgang Hoenig, Benjamin Worpitz,
- *                     Alexander Grund
+ *                     Alexander Grund, Sergei Bastrakov
  *
  * This file is part of PMacc.
  *
@@ -23,35 +23,70 @@
 
 #pragma once
 
-#include <cuda_to_cupla.hpp>
+#include <cupla.hpp>
+
 #include <iostream>
 #include <stdexcept>
 
-namespace pmacc
-{
 
 /**
- * Print a cuda error message including file/line info to stderr
+ * Print a cupla error message including file/line info to stderr
  */
-#define PMACC_PRINT_CUDA_ERROR(msg) \
-    std::cerr << "[CUDA] Error: <" << __FILE__ << ">:" << __LINE__ << " " << msg << std::endl
+#define PMACC_PRINT_CUPLA_ERROR(msg)                                                                                  \
+    std::cerr << "[cupla] Error: <" << __FILE__ << ">:" << __LINE__ << " " << msg << std::endl
 
 /**
- * Print a cuda error message including file/line info to stderr and raises an exception
+ * Print a cupla error message including file/line info to stderr and raises an exception
  */
-#define PMACC_PRINT_CUDA_ERROR_AND_THROW(cudaError, msg) \
-    PMACC_PRINT_CUDA_ERROR(msg);                         \
-    throw std::runtime_error(std::string("[CUDA] Error: ") + std::string(cudaGetErrorString(cudaError)))
+#define PMACC_PRINT_CUPLA_ERROR_AND_THROW(cuplaError, msg)                                                            \
+    PMACC_PRINT_CUPLA_ERROR(msg);                                                                                     \
+    throw std::runtime_error(std::string("[cupla] Error: ") + std::string(cuplaGetErrorString(cuplaError)))
 
 /**
  * Captures CUDA errors and prints messages to stdout, including line number and file.
  *
- * @param cmd command with cudaError_t return value to check
+ * @param cmd command with cuplaError_t return value to check
  */
-#define CUDA_CHECK(cmd) {cudaError_t error = cmd; if(error!=cudaSuccess){ PMACC_PRINT_CUDA_ERROR_AND_THROW(error, ""); }}
-
-#define CUDA_CHECK_MSG(cmd,msg) {cudaError_t error = cmd; if(error!=cudaSuccess){ PMACC_PRINT_CUDA_ERROR_AND_THROW(error, msg); }}
+#define CUDA_CHECK(cmd)                                                                                               \
+    {                                                                                                                 \
+        cuplaError_t error = cmd;                                                                                     \
+        if(error != cuplaSuccess)                                                                                     \
+        {                                                                                                             \
+            PMACC_PRINT_CUPLA_ERROR_AND_THROW(error, "");                                                             \
+        }                                                                                                             \
+    }
 
-#define CUDA_CHECK_NO_EXCEPT(cmd) {cudaError_t error = cmd; if(error!=cudaSuccess){ PMACC_PRINT_CUDA_ERROR(""); }}
+/** Capture error, report and throw
+ *
+ * This macro is only used when PMACC_SYNC_KERNEL == 1 to wrap all
+ * kernel calls. Since alpaka may throw inside cmd, everything is
+ * wrapped up in another try-catch level.
+ *
+ * This macro will always throw in case of an error, either by
+ * producing a new exception or propagating an existing one
+ */
+#define CUDA_CHECK_MSG(cmd, msg)                                                                                      \
+    {                                                                                                                 \
+        try                                                                                                           \
+        {                                                                                                             \
+            cuplaError_t error = cmd;                                                                                 \
+            if(error != cuplaSuccess)                                                                                 \
+            {                                                                                                         \
+                PMACC_PRINT_CUPLA_ERROR_AND_THROW(error, msg);                                                        \
+            }                                                                                                         \
+        }                                                                                                             \
+        catch(...)                                                                                                    \
+        {                                                                                                             \
+            PMACC_PRINT_CUPLA_ERROR(msg);                                                                             \
+            throw;                                                                                                    \
+        }                                                                                                             \
+    }
 
-} // namespace pmacc
+#define CUDA_CHECK_NO_EXCEPT(cmd)                                                                                     \
+    {                                                                                                                 \
+        cuplaError_t error = cmd;                                                                                     \
+        if(error != cuplaSuccess)                                                                                     \
+        {                                                                                                             \
+            PMACC_PRINT_CUPLA_ERROR("");                                                                              \
+        }                                                                                                             \
+    }
diff --git a/include/pmacc/dataManagement/AbstractInitialiser.hpp b/include/pmacc/dataManagement/AbstractInitialiser.hpp
index 5da2e3009e..519bd3d1f5 100644
--- a/include/pmacc/dataManagement/AbstractInitialiser.hpp
+++ b/include/pmacc/dataManagement/AbstractInitialiser.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Rene Widera, Felix Schmitt, Benjamin Worpitz
+/* Copyright 2013-2021 Rene Widera, Felix Schmitt, Benjamin Worpitz
  *
  * This file is part of PMacc.
  *
@@ -26,7 +26,6 @@
 
 namespace pmacc
 {
-
     /**
      * Abstract base class for initialising simulation data (ISimulationData).
      */
@@ -39,13 +38,16 @@ namespace pmacc
          *
          * @return the next timestep
          */
-        virtual uint32_t setup() { return 0;};
+        virtual uint32_t setup()
+        {
+            return 0;
+        };
 
         /**
          * Tears down this initialiser.
          * Called after any init.
          */
-        virtual void teardown() {};
+        virtual void teardown(){};
 
         /**
          * Initialises simulation data (concrete type of data is described by id).
@@ -56,4 +58,4 @@ namespace pmacc
         virtual void init(ISimulationData& data, uint32_t currentStep) = 0;
     };
 
-}
+} // namespace pmacc
diff --git a/include/pmacc/dataManagement/DataConnector.hpp b/include/pmacc/dataManagement/DataConnector.hpp
index e0c2ba065f..737fc019e1 100644
--- a/include/pmacc/dataManagement/DataConnector.hpp
+++ b/include/pmacc/dataManagement/DataConnector.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Rene Widera, Felix Schmitt, Axel Huebl, Sergei Bastrakov
+/* Copyright 2013-2021 Rene Widera, Felix Schmitt, Axel Huebl, Sergei Bastrakov
  *
  * This file is part of PMacc.
  *
@@ -44,17 +44,12 @@ namespace pmacc
     class DataConnector
     {
     private:
-        std::list< std::shared_ptr< ISimulationData > >::iterator
-        findId( SimulationDataId id )
+        std::list<std::shared_ptr<ISimulationData>>::iterator findId(SimulationDataId id)
         {
             return std::find_if(
                 datasets.begin(),
                 datasets.end(),
-                [ & id ]( std::shared_ptr< ISimulationData > data ) -> bool
-                {
-                    return data->getUniqueId() == id;
-                }
-            );
+                [&id](std::shared_ptr<ISimulationData> data) -> bool { return data->getUniqueId() == id; });
         }
 
     public:
@@ -63,10 +58,9 @@ namespace pmacc
          * @param id id of the Dataset to query
          * @return if dataset with id is registered
          */
-        bool
-        hasId( SimulationDataId id )
+        bool hasId(SimulationDataId id)
         {
-            return findId( id ) != datasets.end();
+            return findId(id) != datasets.end();
         }
 
         /**
@@ -76,20 +70,13 @@ namespace pmacc
          * @param initialiser class used for initialising Datasets
          * @param currentStep current simulation step
          */
-        void
-        initialise(
-            AbstractInitialiser& initialiser,
-            uint32_t currentStep
-        )
+        void initialise(AbstractInitialiser& initialiser, uint32_t currentStep)
         {
             currentStep = initialiser.setup();
 
-            for( auto & data : datasets )
+            for(auto& data : datasets)
             {
-                initialiser.init(
-                    *data,
-                    currentStep
-                );
+                initialiser.init(*data, currentStep);
             }
 
             initialiser.teardown();
@@ -102,24 +89,18 @@ namespace pmacc
          *
          * @param data simulation data to share ownership
          */
-        void
-        share( const std::shared_ptr< ISimulationData > & data )
+        void share(const std::shared_ptr<ISimulationData>& data)
         {
-            PMACC_ASSERT( data != nullptr );
+            PMACC_ASSERT(data != nullptr);
 
             SimulationDataId id = data->getUniqueId();
 
-            log< ggLog::MEMORY >( "DataConnector: data shared '%1%'" ) % id;
+            log<ggLog::MEMORY>("DataConnector: data shared '%1%'") % id;
 
-            if( hasId( id ) )
-                throw std::runtime_error(
-                    getExceptionStringForID(
-                        "dataset ID already exists",
-                        id
-                    )
-                );
+            if(hasId(id))
+                throw std::runtime_error(getExceptionStringForID("dataset ID already exists", id));
 
-            datasets.push_back( data );
+            datasets.push_back(data);
         }
 
         /** Register a new Dataset and transfer its ownership.
@@ -130,50 +111,40 @@ namespace pmacc
          *
          * @param data simulation data to transfer ownership
          */
-        void
-        consume( std::unique_ptr< ISimulationData > data )
+        void consume(std::unique_ptr<ISimulationData> data)
         {
-            std::shared_ptr< ISimulationData > newOwner( std::move( data ) );
-            share( newOwner );
+            std::shared_ptr<ISimulationData> newOwner(std::move(data));
+            share(newOwner);
         }
 
         /** End sharing a dataset with identifier id
          *
          * @param id id of the dataset to remove
          */
-        void
-        deregister( SimulationDataId id )
+        void deregister(SimulationDataId id)
         {
-            const auto it = findId( id );
+            const auto it = findId(id);
 
-            if( it == datasets.end() )
-                throw std::runtime_error(
-                    getExceptionStringForID(
-                        "dataset not found",
-                        id
-                    )
-                );
+            if(it == datasets.end())
+                throw std::runtime_error(getExceptionStringForID("dataset not found", id));
 
-            log< ggLog::MEMORY >( "DataConnector: unshared '%1%' (%2% uses left)" ) %
-                                id % ( it->use_count() - 1 );
+            log<ggLog::MEMORY>("DataConnector: unshared '%1%' (%2% uses left)") % id % (it->use_count() - 1);
 
-            datasets.erase( it );
+            datasets.erase(it);
         }
 
         /** Unshare all associated datasets
          */
-        void
-        clean()
+        void clean()
         {
-            log< ggLog::MEMORY >( "DataConnector: being cleaned (%1% datasets left to unshare)" ) %
-                                datasets.size();
+            log<ggLog::MEMORY>("DataConnector: being cleaned (%1% datasets left to unshare)") % datasets.size();
 
             // verbose version of: datasets.clear();
-            while( ! datasets.empty() )
+            while(!datasets.empty())
             {
                 auto it = datasets.rbegin();
-                log< ggLog::MEMORY >( "DataConnector: unshared '%1%' (%2% uses left)" ) %
-                                    (*it)->getUniqueId() % ( it->use_count() - 1 );
+                log<ggLog::MEMORY>("DataConnector: unshared '%1%' (%2% uses left)") % (*it)->getUniqueId()
+                    % (it->use_count() - 1);
                 datasets.pop_back();
             }
         }
@@ -191,32 +162,25 @@ namespace pmacc
          * @param noSync indicates that no synchronization should be performed, regardless of dataset status
          * @return returns a reference to the data of type TYPE
          */
-        template< class TYPE >
-        std::shared_ptr< TYPE >
-        get(
+        template<class TYPE>
+        std::shared_ptr<TYPE> get(
             SimulationDataId id,
             bool noSync = false // @todo invert!
         )
         {
-            auto it = findId( id );
+            auto it = findId(id);
 
-            if( it == datasets.end() )
-                throw std::runtime_error(
-                    getExceptionStringForID(
-                        "Invalid dataset ID",
-                        id
-                    )
-                );
+            if(it == datasets.end())
+                throw std::runtime_error(getExceptionStringForID("Invalid dataset ID", id));
 
-            log< ggLog::MEMORY >( "DataConnector: sharing access to '%1%' (%2% uses)" ) %
-                                id % ( it->use_count() );
+            log<ggLog::MEMORY>("DataConnector: sharing access to '%1%' (%2% uses)") % id % (it->use_count());
 
-            if( !noSync )
+            if(!noSync)
             {
                 (*it)->synchronize();
             }
 
-            return std::static_pointer_cast< TYPE >( *it );
+            return std::static_pointer_cast<TYPE>(*it);
         }
 
         /** Indicate a data set gotten temporarily via @see getData is not used anymore
@@ -225,41 +189,30 @@ namespace pmacc
          *
          * @param id id for the dataset previously acquired using getData()
          */
-        void
-        releaseData( SimulationDataId )
+        void releaseData(SimulationDataId)
         {
         }
 
     private:
-
         friend struct detail::Environment;
 
-        static DataConnector&
-        getInstance()
+        static DataConnector& getInstance()
         {
             static DataConnector instance;
             return instance;
         }
 
-        std::list< std::shared_ptr< ISimulationData > > datasets;
+        std::list<std::shared_ptr<ISimulationData>> datasets;
 
-        DataConnector()
-        {
-        };
+        DataConnector(){};
 
-        virtual
-        ~DataConnector()
+        virtual ~DataConnector()
         {
-            log< ggLog::MEMORY >( "DataConnector: being destroyed (%1% datasets left to destroy)" ) %
-                                datasets.size();
+            log<ggLog::MEMORY>("DataConnector: being destroyed (%1% datasets left to destroy)") % datasets.size();
             clean();
         }
 
-        std::string
-        getExceptionStringForID(
-            const char *msg,
-            SimulationDataId id
-        )
+        std::string getExceptionStringForID(const char* msg, SimulationDataId id)
         {
             std::stringstream stream;
             stream << "DataConnector: " << msg << " (" << id << ")";
@@ -267,5 +220,4 @@ namespace pmacc
         }
     };
 
-}
-
+} // namespace pmacc
diff --git a/include/pmacc/dataManagement/ISimulationData.hpp b/include/pmacc/dataManagement/ISimulationData.hpp
index a5ed822a94..82c335ee68 100644
--- a/include/pmacc/dataManagement/ISimulationData.hpp
+++ b/include/pmacc/dataManagement/ISimulationData.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Rene Widera, Felix Schmitt, Benjamin Worpitz,
+/* Copyright 2013-2021 Rene Widera, Felix Schmitt, Benjamin Worpitz,
  *                     Alexander Grund
  *
  * This file is part of PMacc.
@@ -35,7 +35,9 @@ namespace pmacc
     class ISimulationData
     {
     public:
-        virtual ~ISimulationData(){}
+        virtual ~ISimulationData()
+        {
+        }
         /**
          * Synchronizes simulation data, meaning accessing (host side) data
          * will return up-to-date values.
@@ -48,6 +50,5 @@ namespace pmacc
          * @return globally unique identifier
          */
         virtual SimulationDataId getUniqueId() = 0;
-
     };
-}
+} // namespace pmacc
diff --git a/include/pmacc/debug/DebugBuffers.hpp b/include/pmacc/debug/DebugBuffers.hpp
index e9bacc6a09..0ae0ec7e8b 100644
--- a/include/pmacc/debug/DebugBuffers.hpp
+++ b/include/pmacc/debug/DebugBuffers.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Felix Schmitt, Heiko Burau, Rene Widera, Benjamin Worpitz
+/* Copyright 2013-2021 Felix Schmitt, Heiko Burau, Rene Widera, Benjamin Worpitz
  *
  * This file is part of PMacc.
  *
@@ -29,13 +29,12 @@
 
 namespace pmacc
 {
-
     /**
      * Helper class for debugging buffers
      *
      * @tparam DIM dimension of the buffer to debug.
      */
-    template <unsigned DIM>
+    template<unsigned DIM>
     class DebugBuffers
     {
     public:
@@ -46,25 +45,24 @@ namespace pmacc
          * @param hostBuffer the HostBuffer to convert to a string
          * @return a string representing the buffer
          */
-        template <class TYPE>
+        template<class TYPE>
         static std::string bufferToStr(HostBuffer<TYPE, DIM>& hostBuffer);
     };
 
-    template <>
+    template<>
     class DebugBuffers<DIM2>
     {
     public:
-
-        template <class TYPE>
+        template<class TYPE>
         static std::string bufferToStr(HostBuffer<TYPE, DIM2>& hostBuffer)
         {
             std::stringstream stream;
 
             typename HostBuffer<TYPE, DIM2>::DataBoxType db = hostBuffer.getDataBox();
 
-            for (size_t y = 0; y < hostBuffer.getDataSpace().y(); y++)
+            for(size_t y = 0; y < hostBuffer.getDataSpace().y(); y++)
             {
-                for (size_t x = 0; x < hostBuffer.getDataSpace().x(); x++)
+                for(size_t x = 0; x < hostBuffer.getDataSpace().x(); x++)
                     stream << db[y][x] << " ";
 
                 stream << std::endl;
@@ -74,33 +72,31 @@ namespace pmacc
         }
     };
 
-    template <>
+    template<>
     class DebugBuffers<DIM3>
     {
     public:
-
-        template <class TYPE>
+        template<class TYPE>
         static std::string bufferToStr(HostBuffer<TYPE, DIM3>& hostBuffer)
         {
             std::stringstream stream;
 
             typename HostBuffer<TYPE, DIM3>::DataBoxType db = hostBuffer.getDataBox();
 
-            for (size_t z = 0; z < hostBuffer.getDataSpace().z(); z++)
+            for(size_t z = 0; z < hostBuffer.getDataSpace().z(); z++)
             {
                 stream << "z = " << z << std::endl;
 
-                for (size_t y = 0; y < hostBuffer.getDataSpace().y(); y++)
+                for(size_t y = 0; y < hostBuffer.getDataSpace().y(); y++)
                 {
-                    for (size_t x = 0; x < hostBuffer.getDataSpace().x(); x++)
+                    for(size_t x = 0; x < hostBuffer.getDataSpace().x(); x++)
                         stream << db[z][y][x] << " ";
 
                     stream << std::endl;
                 }
-
             }
 
             return stream.str();
         }
     };
-}
+} // namespace pmacc
diff --git a/include/pmacc/debug/DebugDataSpace.hpp b/include/pmacc/debug/DebugDataSpace.hpp
index c10b3b6a96..c657e9bcb1 100644
--- a/include/pmacc/debug/DebugDataSpace.hpp
+++ b/include/pmacc/debug/DebugDataSpace.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Felix Schmitt, Heiko Burau, Rene Widera,
+/* Copyright 2013-2021 Felix Schmitt, Heiko Burau, Rene Widera,
  *                     Benjamin Worpitz
  *
  * This file is part of PMacc.
@@ -29,20 +29,19 @@
 
 namespace pmacc
 {
-
     /**
      * Helper class for debugging DataSpaces
      *
      * @tparam DIM dimension of the DataSpace to debug.
      */
-    template <unsigned DIM>
+    template<unsigned DIM>
     class DebugDataSpace
     {
     public:
         static std::string dspToStr(DataSpace<DIM>& dsp);
     };
 
-    template <>
+    template<>
     class DebugDataSpace<DIM2>
     {
     public:
@@ -56,7 +55,7 @@ namespace pmacc
         }
     };
 
-    template <>
+    template<>
     class DebugDataSpace<DIM3>
     {
     public:
@@ -70,4 +69,4 @@ namespace pmacc
         }
     };
 
-}
+} // namespace pmacc
diff --git a/include/pmacc/debug/DebugExchangeTypes.hpp b/include/pmacc/debug/DebugExchangeTypes.hpp
index 806e2cb243..692011f910 100644
--- a/include/pmacc/debug/DebugExchangeTypes.hpp
+++ b/include/pmacc/debug/DebugExchangeTypes.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Felix Schmitt, Rene Widera, Benjamin Worpitz
+/* Copyright 2013-2021 Felix Schmitt, Rene Widera, Benjamin Worpitz
  *
  * This file is part of PMacc.
  *
@@ -29,7 +29,6 @@
 
 namespace pmacc
 {
-
     /**
      * Helper class for debugging exchange types.
      *
@@ -37,7 +36,6 @@ namespace pmacc
     class DebugExchangeTypes
     {
     public:
-
         /**
          * Converts an exchange type to a string for debugging.
          *
@@ -51,22 +49,22 @@ namespace pmacc
             std::stringstream stream;
             stream << "[";
 
-            if (mask.containsExchangeType(LEFT))
+            if(mask.containsExchangeType(LEFT))
                 stream << "LEFT ";
 
-            if (mask.containsExchangeType(RIGHT))
+            if(mask.containsExchangeType(RIGHT))
                 stream << "RIGHT ";
 
-            if (mask.containsExchangeType(TOP))
+            if(mask.containsExchangeType(TOP))
                 stream << "TOP ";
 
-            if (mask.containsExchangeType(BOTTOM))
+            if(mask.containsExchangeType(BOTTOM))
                 stream << "BOTTOM ";
 
-            if (mask.containsExchangeType(FRONT))
+            if(mask.containsExchangeType(FRONT))
                 stream << "FRONT ";
 
-            if (mask.containsExchangeType(BACK))
+            if(mask.containsExchangeType(BACK))
                 stream << "BACK ";
 
             stream << "]";
@@ -75,4 +73,4 @@ namespace pmacc
         }
     };
 
-}
+} // namespace pmacc
diff --git a/include/pmacc/debug/PMaccVerbose.hpp b/include/pmacc/debug/PMaccVerbose.hpp
index f4a9013306..cd598db554 100644
--- a/include/pmacc/debug/PMaccVerbose.hpp
+++ b/include/pmacc/debug/PMaccVerbose.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Rene Widera
+/* Copyright 2013-2021 Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -26,33 +26,25 @@
 #include <stdint.h>
 
 #ifndef PMACC_VERBOSE_LVL
-#define PMACC_VERBOSE_LVL 0
+#    define PMACC_VERBOSE_LVL 0
 #endif
 
 namespace pmacc
 {
-
     /*create verbose class*/
     DEFINE_VERBOSE_CLASS(PMaccVerbose)
     (
         /* define log lvl for later use
          * e.g. log<pmaccLogLvl::NOTHING>("TEXT");*/
-        DEFINE_LOGLVL(0,NOTHING);
-        DEFINE_LOGLVL(1,MEMORY);
-        DEFINE_LOGLVL(2,INFO);
-        DEFINE_LOGLVL(4,CRITICAL);
-        DEFINE_LOGLVL(8,MPI);
-        DEFINE_LOGLVL(16,CUDA_RT);
-        DEFINE_LOGLVL(32,COMMUNICATION);
-        DEFINE_LOGLVL(64,EVENT);
-    )
-    /*set default verbose lvl (integer number)*/
-    (NOTHING::lvl|PMACC_VERBOSE_LVL);
-
-    //short name for access verbose types of PMacc
+        DEFINE_LOGLVL(0, NOTHING); DEFINE_LOGLVL(1, MEMORY); DEFINE_LOGLVL(2, INFO); DEFINE_LOGLVL(4, CRITICAL);
+        DEFINE_LOGLVL(8, MPI);
+        DEFINE_LOGLVL(16, CUDA_RT);
+        DEFINE_LOGLVL(32, COMMUNICATION);
+        DEFINE_LOGLVL(64, EVENT);)
+        /*set default verbose lvl (integer number)*/
+        (NOTHING::lvl | PMACC_VERBOSE_LVL);
+
+    // short name for accessing the verbose types of PMacc
     using ggLog = PMaccVerbose;
 
-}
-
-
-
+} // namespace pmacc
diff --git a/include/pmacc/debug/VerboseLog.hpp b/include/pmacc/debug/VerboseLog.hpp
index 7b3c8148bd..690191ccc4 100644
--- a/include/pmacc/debug/VerboseLog.hpp
+++ b/include/pmacc/debug/VerboseLog.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Rene Widera, Benjamin Worpitz, Alexander Grund
+/* Copyright 2013-2021 Rene Widera, Benjamin Worpitz, Alexander Grund
  *
  * This file is part of PMacc.
  *
@@ -32,123 +32,115 @@
 
 namespace pmacc
 {
-
-
-/** get the name of a verbose lvl
- *
- * this function is defined as friend function for every log lvl
- * @param dummy instance of LogClass to find name
- * @return name of LogClass
- */
-template<class LogClass>
-std::string getLogName(const LogClass& dummy)
-{
-    return std::string("UNDEFINED_LVL");
-}
-
-
-namespace verboseLog_detail
-{
-
-template<typename X, typename Y>
-struct IsSameClassType
-{
-    static constexpr bool result = false;
-};
-
-template<typename X>
-struct IsSameClassType<X, X>
-{
-    static constexpr bool result = true;
-};
-
-} //namespace verboseLog_detail
-
-template<uint64_t lvl_, class membership_>
-struct LogLvl
-{
-    typedef membership_ Parent;
-    static constexpr uint64_t lvl = lvl_;
-
-    /* This operation is only allowed for LogLvl with the same Parent type.
-     * Create a LogLvl that contains two levels. At least one lvl has to be true
+    /** get the name of a verbose lvl
+     *
+     * this function is defined as friend function for every log lvl
+     * @param dummy instance of LogClass to find name
+     * @return name of LogClass
      */
-    template<class OtherLogLvl >
-    LogLvl < (OtherLogLvl::lvl | lvl), membership_> operator+(const OtherLogLvl&)
+    template<class LogClass>
+    std::string getLogName(const LogClass& dummy)
     {
-        return LogLvl < (OtherLogLvl::lvl | lvl), membership_ > ();
+        return std::string("UNDEFINED_LVL");
     }
 
-};
 
-namespace verboseLog_detail
-{
+    namespace verboseLog_detail
+    {
+        template<typename X, typename Y>
+        struct IsSameClassType
+        {
+            static constexpr bool result = false;
+        };
 
-template<class LogLevel>
-class VerboseLog
-{
-private:
-    typedef typename LogLevel::Parent LogParent;
-    static constexpr uint64_t logLvl = LogLevel::lvl;
-public:
+        template<typename X>
+        struct IsSameClassType<X, X>
+        {
+            static constexpr bool result = true;
+        };
 
-    VerboseLog(const char* msg) : fmt(msg)
-    {
-    }
+    } // namespace verboseLog_detail
 
-    ~VerboseLog()
+    template<uint64_t lvl_, class membership_>
+    struct LogLvl
     {
-        typedef LogLvl<(logLvl & LogParent::log_level), LogParent> LogClass;
-        /* check if a bit in the mask is set
-         * If you get an linker error in the next two lines you have not used
-         * DEFINE_LOGLVL makro to define a named logLvl
+        typedef membership_ Parent;
+        static constexpr uint64_t lvl = lvl_;
+
+        /* This operation is only allowed for LogLvl with the same Parent type.
+         * Create a LogLvl that combines both levels (bitwise OR); at least one of them has to be enabled to log
          */
-        if (logLvl & LogParent::log_level) /*compile-time check*/
+        template<class OtherLogLvl>
+        LogLvl<(OtherLogLvl::lvl | lvl), membership_> operator+(const OtherLogLvl&)
         {
-            std::cout << LogParent::getName() << " " << getLogName(LogClass()) <<
-            "(" << (logLvl & LogParent::log_level) << ")" << " | " << fmt << std::endl;
+            return LogLvl<(OtherLogLvl::lvl | lvl), membership_>();
         }
-    }
+    };
 
-    template <typename T>
-    VerboseLog& operator %(T value)
+    namespace verboseLog_detail
+    {
+        template<class LogLevel>
+        class VerboseLog
+        {
+        private:
+            typedef typename LogLevel::Parent LogParent;
+            static constexpr uint64_t logLvl = LogLevel::lvl;
+
+        public:
+            VerboseLog(const char* msg) : fmt(msg)
+            {
+            }
+
+            ~VerboseLog()
+            {
+                typedef LogLvl<(logLvl & LogParent::log_level), LogParent> LogClass;
+                /* check if a bit in the mask is set
+                 * If you get a linker error in the next two lines, you have not used the
+                 * DEFINE_LOGLVL macro to define a named logLvl
+                 */
+                if(logLvl & LogParent::log_level) /*compile-time check*/
+                {
+                    std::cout << LogParent::getName() << " " << getLogName(LogClass()) << "("
+                              << (logLvl & LogParent::log_level) << ")"
+                              << " | " << fmt << std::endl;
+                }
+            }
+
+            template<typename T>
+            VerboseLog& operator%(T value)
+            {
+                if(logLvl & LogParent::log_level) /*compile-time check*/
+                    fmt % value;
+                return *this;
+            }
+
+        protected:
+            boost::format fmt;
+        };
+
+    } // namespace verboseLog_detail
+
+    /*
+     * example call:
+     * log<MYLOGLEVELS::CRITICAL>("printf %2% stream %1%, number example %3%.") % "messages" % "style" % 5;
+     * output of example: 4 | printf style stream messages, number example 5
+     */
+    template<class LogLvl>
+    verboseLog_detail::VerboseLog<LogLvl> log(const char* msg)
     {
-        if (logLvl & LogParent::log_level) /*compile-time check*/
-            fmt % value;
-        return *this;
+        return verboseLog_detail::VerboseLog<LogLvl>(msg);
     }
 
-protected:
-    boost::format fmt;
-};
-
-}//namespace verboseLog_detail
-
-/*
- * example call:
- * log<MYLOGLEVELS::CRITICAL>("printf %2% stream %1%, number example %3%.") % "messages" % "style" % 5;
- * output of example: 4 | printf style stream messages, number example 5
- */
-template <class LogLvl>
-verboseLog_detail::VerboseLog<LogLvl>
-log(const char* msg)
-{
-    return verboseLog_detail::VerboseLog<LogLvl > (msg);
-}
-
-/* version that allows to combine error levels
- * example call:
- * log(MYLOGLEVELS::CRITICAL+MYLOGLEVELS::MEMORY,"printf %2% stream %1%, number example %3%.") % "messages" % "style" % 5
- */
-template <class LogLvl>
-verboseLog_detail::VerboseLog<LogLvl>
-log(const LogLvl, const char* msg)
-{
-    return verboseLog_detail::VerboseLog<LogLvl > (msg);
-}
-
-
-
-} //namespace pmacc
+    /* overload that allows combining several log levels
+     * example call:
+     * log(MYLOGLEVELS::CRITICAL+MYLOGLEVELS::MEMORY,"printf %2% stream %1%, number example %3%.") % "messages" %
+     * "style" % 5
+     */
+    template<class LogLvl>
+    verboseLog_detail::VerboseLog<LogLvl> log(const LogLvl, const char* msg)
+    {
+        return verboseLog_detail::VerboseLog<LogLvl>(msg);
+    }
 
 
+} // namespace pmacc
diff --git a/include/pmacc/debug/VerboseLogMakros.hpp b/include/pmacc/debug/VerboseLogMakros.hpp
index c329153149..4a914d2bb2 100644
--- a/include/pmacc/debug/VerboseLogMakros.hpp
+++ b/include/pmacc/debug/VerboseLogMakros.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Rene Widera, Alexander Grund
+/* Copyright 2013-2021 Rene Widera, Alexander Grund
  *
  * This file is part of PMacc.
  *
@@ -28,37 +28,39 @@
  * @param code integer which represent a bit in a 64bit bitmask
  * @param name name of the log lvl, name is needet later to call log<name>(...)
  */
-#define DEFINE_LOGLVL(code,name)                            \
-    typedef pmacc::LogLvl < code, thisClass > name;         \
-    friend inline std::string getLogName(const name)        \
-    {                                                       \
-        return std::string(#name);                          \
+#define DEFINE_LOGLVL(code, name)                                                                                     \
+    typedef pmacc::LogLvl<code, thisClass> name;                                                                      \
+    friend inline std::string getLogName(const name)                                                                  \
+    {                                                                                                                 \
+        return std::string(#name);                                                                                    \
     }
 
 /** set a default value for a verbose class
  * @param default_lvl must be a integer which represent a defined log lvl
  */
-#define __DEFINE_VERBOSE_CLASS_DEFAULT_LVL(default_lvl) \
-    static constexpr uint64_t log_level = default_lvl;      \
+#define __DEFINE_VERBOSE_CLASS_DEFAULT_LVL(default_lvl)                                                               \
+    static constexpr uint64_t log_level = default_lvl;                                                                \
     }
 
 /** helper for define log lvl inside of DEFINE_VERBOSE_CLASS
  */
-#define __DEFINE_VERBOSE_CLASS_LVLS(...)    \
-    __VA_ARGS__                             \
+#define __DEFINE_VERBOSE_CLASS_LVLS(...)                                                                              \
+    __VA_ARGS__                                                                                                       \
     __DEFINE_VERBOSE_CLASS_DEFAULT_LVL
 
 /** create a struct which represent a verbose container
  * @param structName name of the container(struct)
  */
-#define DEFINE_VERBOSE_CLASS(structName)        \
-    struct structName                           \
-    {                                           \
-        static std::string getName()            \
-        {                                       \
-            return std::string(#structName);    \
-        }                                       \
-    private:                                    \
-        typedef structName thisClass;           \
-    public:                                     \
-    __DEFINE_VERBOSE_CLASS_LVLS
+#define DEFINE_VERBOSE_CLASS(structName)                                                                              \
+    struct structName                                                                                                 \
+    {                                                                                                                 \
+        static std::string getName()                                                                                  \
+        {                                                                                                             \
+            return std::string(#structName);                                                                          \
+        }                                                                                                             \
+                                                                                                                      \
+    private:                                                                                                          \
+        typedef structName thisClass;                                                                                 \
+                                                                                                                      \
+    public:                                                                                                           \
+        __DEFINE_VERBOSE_CLASS_LVLS
diff --git a/include/pmacc/debug/abortWithError.hpp b/include/pmacc/debug/abortWithError.hpp
index 140ee49e71..c8714d9788 100644
--- a/include/pmacc/debug/abortWithError.hpp
+++ b/include/pmacc/debug/abortWithError.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2016-2020 Rene Widera
+/* Copyright 2016-2021 Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -28,33 +28,28 @@
 
 namespace pmacc
 {
-namespace{
-    /** abort program with an exception
-     *
-     * This function always throws a `runtime_error`.
-     *
-     * @param exp evaluated expression
-     * @param filename name of the broken file
-     * @param lineNumber line in file
-     * @param msg user defined error message
-     */
-    void abortWithError(
-        const std::string exp,
-        const std::string filename,
-        const uint32_t lineNumber,
-        const std::string msg = std::string()
-    )
+    namespace
     {
-        std::stringstream line;
-        line << lineNumber;
+        /** abort program with an exception
+         *
+         * This function always throws a `runtime_error`.
+         *
+         * @param exp evaluated expression
+         * @param filename name of the broken file
+         * @param lineNumber line in file
+         * @param msg user defined error message
+         */
+        void abortWithError(
+            const std::string exp,
+            const std::string filename,
+            const uint32_t lineNumber,
+            const std::string msg = std::string())
+        {
+            std::stringstream line;
+            line << lineNumber;
 
-        throw std::runtime_error(
-            "expression (" +
-            exp +
-            ") failed in file (" +
-            filename + ":" + line.str() + ") : " +
-            msg
-        );
-    }
-}
-}
+            throw std::runtime_error(
+                "expression (" + exp + ") failed in file (" + filename + ":" + line.str() + ") : " + msg);
+        }
+    } // namespace
+} // namespace pmacc
diff --git a/include/pmacc/dimensions/DataSpace.hpp b/include/pmacc/dimensions/DataSpace.hpp
index a46a87dc81..8c37ac02af 100644
--- a/include/pmacc/dimensions/DataSpace.hpp
+++ b/include/pmacc/dimensions/DataSpace.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Felix Schmitt, Heiko Burau, Rene Widera,
+/* Copyright 2013-2021 Felix Schmitt, Heiko Burau, Rene Widera,
  *                     Wolfgang Hoenig, Benjamin Worpitz, Alexander Grund
  *
  * This file is part of PMacc.
@@ -27,7 +27,6 @@
 
 namespace pmacc
 {
-
     /**
      * A T_Dim-dimensional data space.
      *
@@ -36,12 +35,11 @@ namespace pmacc
      *
      * @tparam T_Dim dimension (1-3) of the dataspace
      */
-    template <unsigned T_Dim>
-    class DataSpace : public math::Vector<int,T_Dim>
+    template<unsigned T_Dim>
+    class DataSpace : public math::Vector<int, T_Dim>
     {
     public:
-
-        static constexpr int Dim=T_Dim;
+        static constexpr int Dim = T_Dim;
         using BaseType = math::Vector<int, T_Dim>;
 
         /**
@@ -50,19 +48,21 @@ namespace pmacc
          */
         HDINLINE DataSpace()
         {
-            for (uint32_t i = 0; i < T_Dim; ++i)
+            for(uint32_t i = 0; i < T_Dim; ++i)
             {
                 (*this)[i] = 0;
             }
         }
 
+        constexpr HDINLINE DataSpace& operator=(const DataSpace&) = default;
+
         /**
          * constructor.
          * Sets size of all dimensions from cuda dim3.
          */
-        HDINLINE explicit DataSpace(dim3 value)
+        HDINLINE explicit DataSpace(cupla::dim3 value)
         {
-            for (uint32_t i = 0; i < T_Dim; ++i)
+            for(uint32_t i = 0; i < T_Dim; ++i)
             {
                 (*this)[i] = *(&(value.x) + i);
             }
@@ -70,11 +70,11 @@ namespace pmacc
 
         /**
          * constructor.
-         * Sets size of all dimensions from cuda uint3 (e.g. threadIdx/blockIdx)
+         * Sets size of all dimensions from cupla uint3 (e.g. cupla::threadIdx(acc)/cupla::blockIdx(acc))
          */
-        HDINLINE explicit DataSpace(uint3 value)
+        HDINLINE DataSpace(cupla::uint3 value)
         {
-            for (uint32_t i = 0; i < T_Dim; ++i)
+            for(uint32_t i = 0; i < T_Dim; ++i)
             {
                 (*this)[i] = *(&(value.x) + i);
             }
@@ -120,7 +120,7 @@ namespace pmacc
 
         HDINLINE DataSpace(const math::Size_t<T_Dim>& vec)
         {
-            for (uint32_t i = 0; i < T_Dim; ++i)
+            for(uint32_t i = 0; i < T_Dim; ++i)
             {
                 (*this)[i] = vec[i];
             }
@@ -135,7 +135,7 @@ namespace pmacc
         HDINLINE static DataSpace<T_Dim> create(int value = 1)
         {
             DataSpace<T_Dim> tmp;
-            for (uint32_t i = 0; i < T_Dim; ++i)
+            for(uint32_t i = 0; i < T_Dim; ++i)
             {
                 tmp[i] = value;
             }
@@ -160,9 +160,9 @@ namespace pmacc
          */
         HINLINE bool isOneDimensionGreaterThan(const DataSpace<T_Dim>& other) const
         {
-            for (uint32_t i = 0; i < T_Dim; ++i)
+            for(uint32_t i = 0; i < T_Dim; ++i)
             {
-                if ((*this)[i] > other[i])
+                if((*this)[i] > other[i])
                     return true;
             }
             return false;
@@ -171,18 +171,17 @@ namespace pmacc
         HDINLINE operator math::Size_t<T_Dim>() const
         {
             math::Size_t<T_Dim> result;
-            for (uint32_t i = 0; i < T_Dim; i++)
+            for(uint32_t i = 0; i < T_Dim; i++)
                 result[i] = static_cast<size_t>((*this)[i]);
             return result;
         }
 
-        HDINLINE explicit operator dim3() const
+        HDINLINE operator cupla::dim3() const
         {
             return this->toDim3();
         }
-
     };
 
-} //namespace pmacc
+} // namespace pmacc
 
 #include "pmacc/dimensions/DataSpace.tpp"
diff --git a/include/pmacc/dimensions/DataSpace.tpp b/include/pmacc/dimensions/DataSpace.tpp
index cf8793aec3..3c9ca4d588 100644
--- a/include/pmacc/dimensions/DataSpace.tpp
+++ b/include/pmacc/dimensions/DataSpace.tpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Rene Widera, Benjamin Worpitz
+/* Copyright 2013-2021 Rene Widera, Benjamin Worpitz
  *
  * This file is part of PMacc.
  *
@@ -32,53 +32,50 @@
 
 namespace pmacc
 {
-
-namespace traits
-{
-
-template<unsigned DIM>
-struct GetComponentsType<DataSpace<DIM>, false >
-{
-    typedef typename DataSpace<DIM>::type type;
-};
-
-/** Trait for float_X */
-template<unsigned DIM>
-struct GetNComponents<DataSpace<DIM>,false >
-{
-    static constexpr uint32_t value=DIM;
-};
-
-}// namespace traits
-
-namespace algorithms
-{
-namespace precisionCast
-{
-
-template<unsigned T_Dim>
-struct TypeCast<int, pmacc::DataSpace<T_Dim> >
-{
-    typedef const pmacc::DataSpace<T_Dim>& result;
-
-    HDINLINE result operator( )(const pmacc::DataSpace<T_Dim>& vector ) const
+    namespace traits
     {
-        return vector;
-    }
-};
-
-template<typename T_CastToType, unsigned T_Dim>
-struct TypeCast<T_CastToType, pmacc::DataSpace<T_Dim>  >
-{
-    typedef ::pmacc::math::Vector<T_CastToType, T_Dim> result;
-
-    HDINLINE result operator( )(const pmacc::DataSpace<T_Dim>& vector ) const
+        template<unsigned DIM>
+        struct GetComponentsType<DataSpace<DIM>, false>
+        {
+            typedef typename DataSpace<DIM>::type type;
+        };
+
+        /** Trait providing the number of components of a DataSpace (equal to DIM) */
+        template<unsigned DIM>
+        struct GetNComponents<DataSpace<DIM>, false>
+        {
+            static constexpr uint32_t value = DIM;
+        };
+
+    } // namespace traits
+
+    namespace algorithms
     {
-        return result( vector );
-    }
-};
-
-} //namespace typecast
-} //namespace algorithms
-
-} //namespace pmacc
+        namespace precisionCast
+        {
+            template<unsigned T_Dim>
+            struct TypeCast<int, pmacc::DataSpace<T_Dim>>
+            {
+                typedef const pmacc::DataSpace<T_Dim>& result;
+
+                HDINLINE result operator()(const pmacc::DataSpace<T_Dim>& vector) const
+                {
+                    return vector;
+                }
+            };
+
+            template<typename T_CastToType, unsigned T_Dim>
+            struct TypeCast<T_CastToType, pmacc::DataSpace<T_Dim>>
+            {
+                typedef ::pmacc::math::Vector<T_CastToType, T_Dim> result;
+
+                HDINLINE result operator()(const pmacc::DataSpace<T_Dim>& vector) const
+                {
+                    return result(vector);
+                }
+            };
+
+        } // namespace precisionCast
+    } // namespace algorithms
+
+} // namespace pmacc
diff --git a/include/pmacc/dimensions/DataSpaceOperations.hpp b/include/pmacc/dimensions/DataSpaceOperations.hpp
index 6e57113910..57176142df 100644
--- a/include/pmacc/dimensions/DataSpaceOperations.hpp
+++ b/include/pmacc/dimensions/DataSpaceOperations.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Felix Schmitt, Heiko Burau, Rene Widera,
+/* Copyright 2013-2021 Felix Schmitt, Heiko Burau, Rene Widera,
  *                     Benjamin Worpitz
  *
  * This file is part of PMacc.
@@ -29,7 +29,6 @@
 
 namespace pmacc
 {
-
     /**
      * Implements operations on DataSpace objects such as reduce and extend.
      *
@@ -62,7 +61,7 @@ namespace pmacc
          * @param ex exchange direction for reduction
          * @return reduced DataSpace with dimension DIM-1
          */
-        static HDINLINE DataSpace<DIM - 1 > reduce(DataSpace<DIM> ds, uint32_t ex);
+        static HDINLINE DataSpace<DIM - 1> reduce(DataSpace<DIM> ds, uint32_t ex);
 
         /**
          * Extends the DataSpace object ds of dimension DIM to a DataSpace object of dimension DIM+1.
@@ -78,19 +77,21 @@ namespace pmacc
          * @param offset DataSpace describing size of target grid's offset
          * @return extended DataSpace with dimension DIM+1
          */
-        static HDINLINE DataSpace<DIM + 1 > extend(DataSpace<DIM> ds, uint32_t ex,
-                                                  DataSpace<DIM + 1 > target, DataSpace<DIM + 1 > offset);
+        static HDINLINE DataSpace<DIM + 1> extend(
+            DataSpace<DIM> ds,
+            uint32_t ex,
+            DataSpace<DIM + 1> target,
+            DataSpace<DIM + 1> offset);
     };
 
     template<>
     class DataSpaceOperations<DIM1>
     {
     public:
-
         template<class TVEC>
         static HDINLINE DataSpace<DIM1> map(uint32_t pos)
         {
-            return DataSpace<DIM1 > (pos);
+            return DataSpace<DIM1>(pos);
         }
 
         template<class TVEC>
@@ -101,66 +102,64 @@ namespace pmacc
 
         static HDINLINE DataSpace<DIM1> map(const DataSpace<DIM1>& size, uint32_t pos)
         {
-            return DataSpace<DIM1 > (pos);
+            return DataSpace<DIM1>(pos);
         }
 
-        static HDINLINE DataSpace<DIM2> extend(DataSpace<DIM1> ds, uint32_t ex,
-                                              DataSpace<DIM2> target, DataSpace<DIM2> offset)
+        static HDINLINE DataSpace<DIM2> extend(
+            DataSpace<DIM1> ds,
+            uint32_t ex,
+            DataSpace<DIM2> target,
+            DataSpace<DIM2> offset)
         {
-            DataSpace<DIM2> directions = Mask::getRelativeDirections<DIM2 > (ex);
+            DataSpace<DIM2> directions = Mask::getRelativeDirections<DIM2>(ex);
 
             DataSpace<DIM2> result(ds[0], ds[0]);
 
             // RIGHT
-            if (directions.x() == 1)
+            if(directions.x() == 1)
             {
                 result.x() = target.x() - offset.x() - 1;
             }
 
             // LEFT
-            if (directions.x() == -1)
+            if(directions.x() == -1)
             {
                 result.x() = offset.x();
             }
 
             // TOP
-            if (directions.y() == 1)
+            if(directions.y() == 1)
             {
                 result.y() = target.y() - offset.y() - 1;
             }
 
             // BOTTOM
-            if (directions.y() == -1)
+            if(directions.y() == -1)
             {
                 result.y() = offset.y();
             }
 
             return result;
-
         }
-
     };
 
     template<>
     class DataSpaceOperations<DIM2>
     {
     public:
-
         template<class TVEC>
         static HDINLINE DataSpace<DIM2> map(uint32_t pos)
         {
             auto const y = pos / TVEC::x::value;
             auto const x = pos - y * TVEC::x::value;
 
-            return DataSpace< DIM2 >( x , y );
+            return DataSpace<DIM2>(x, y);
         }
 
         template<class TVEC>
         static HDINLINE uint32_t map(const DataSpace<DIM2>& pos)
         {
-            return
-                pos.y() * TVEC::x::value +
-                pos.x();
+            return pos.y() * TVEC::x::value + pos.x();
         }
 
         static HDINLINE DataSpace<DIM2> map(const DataSpace<DIM2>& size, uint32_t pos)
@@ -168,33 +167,34 @@ namespace pmacc
             auto const y = pos / size.x();
             auto const x = pos - y * size.x();
 
-            return DataSpace< DIM2 >( x , y );
+            return DataSpace<DIM2>(x, y);
         }
 
         static HDINLINE uint32_t map(const DataSpace<DIM2>& size, const DataSpace<DIM2>& pos)
         {
-            return
-                pos.y() * size.x() +
-                pos.x();
+            return pos.y() * size.x() + pos.x();
         }
 
         static HDINLINE DataSpace<DIM1> reduce(DataSpace<DIM2> ds, uint32_t ex)
         {
-            DataSpace<DIM2> directions = Mask::getRelativeDirections<DIM2 > (ex);
+            DataSpace<DIM2> directions = Mask::getRelativeDirections<DIM2>(ex);
 
-            if (directions.x() != 0)
-                return DataSpace<DIM1 > (ds.y());
+            if(directions.x() != 0)
+                return DataSpace<DIM1>(ds.y());
 
-            if (directions.y() != 0)
-                return DataSpace<DIM1 > (ds.x());
+            if(directions.y() != 0)
+                return DataSpace<DIM1>(ds.x());
 
-            return DataSpace<DIM1 > (0);
+            return DataSpace<DIM1>(0);
         }
 
-        static HDINLINE DataSpace<DIM3> extend(DataSpace<DIM2> ds, uint32_t ex,
-                                              DataSpace<DIM3> target, DataSpace<DIM3> offset)
+        static HDINLINE DataSpace<DIM3> extend(
+            DataSpace<DIM2> ds,
+            uint32_t ex,
+            DataSpace<DIM3> target,
+            DataSpace<DIM3> offset)
         {
-            DataSpace<DIM3> directions = Mask::getRelativeDirections<DIM3 > (ex);
+            DataSpace<DIM3> directions = Mask::getRelativeDirections<DIM3>(ex);
 
             DataSpace<DIM3> result;
 
@@ -202,7 +202,7 @@ namespace pmacc
             const uint32_t z_entry(1);
             uint32_t y_entry(1);
 
-            switch (directions.x())
+            switch(directions.x())
             {
                 // RIGHT
             case 1:
@@ -219,7 +219,7 @@ namespace pmacc
                 break;
             }
 
-            switch (directions.z())
+            switch(directions.z())
             {
                 // BACK
             case 1:
@@ -234,7 +234,7 @@ namespace pmacc
                 break;
             }
 
-            switch (directions.y())
+            switch(directions.y())
             {
                 // BOTTOM
             case 1:
@@ -245,8 +245,8 @@ namespace pmacc
                 result.y() = offset.y();
                 break;
             case 0:
-                //thsi if fiy lmem usage (old wars result.y()=ds[y_entry] )
-                if (y_entry == 0)
+                // this `if` fixes lmem usage (old code was: result.y() = ds[y_entry])
+                if(y_entry == 0)
                     result.y() = ds.x();
                 else
                     result.y() = ds.y();
@@ -255,14 +255,12 @@ namespace pmacc
 
             return result;
         }
-
     };
 
     template<>
     class DataSpaceOperations<DIM3>
     {
     public:
-
         template<class TVEC>
         static HDINLINE DataSpace<DIM3> map(uint32_t pos)
         {
@@ -272,7 +270,7 @@ namespace pmacc
             auto const y = pos / TVEC::x::value;
             auto const x = pos - y * TVEC::x::value;
 
-            return DataSpace< DIM3 >( x , y, z );
+            return DataSpace<DIM3>(x, y, z);
         }
 
         static HDINLINE DataSpace<DIM3> map(const DataSpace<DIM3>& size, uint32_t pos)
@@ -283,41 +281,35 @@ namespace pmacc
             auto const y = pos / size.x();
             auto const x = pos - y * size.x();
 
-            return DataSpace< DIM3 >( x , y, z );
+            return DataSpace<DIM3>(x, y, z);
         }
 
         template<class TVEC>
         static HDINLINE uint32_t map(const DataSpace<DIM3>& pos)
         {
-            return
-                pos.z() * ( TVEC::x::value * TVEC::y::value ) +
-                pos.y() * TVEC::x::value +
-                pos.x();
+            return pos.z() * (TVEC::x::value * TVEC::y::value) + pos.y() * TVEC::x::value + pos.x();
         }
 
         static HDINLINE uint32_t map(const DataSpace<DIM3>& size, const DataSpace<DIM3>& pos)
         {
-            return
-                pos.z() * size.x() * size.y() +
-                pos.y() * size.x() +
-                pos.x();
+            return pos.z() * size.x() * size.y() + pos.y() * size.x() + pos.x();
         }
 
         static HDINLINE DataSpace<DIM2> reduce(DataSpace<DIM3> ds, uint32_t ex)
         {
-            DataSpace<DIM3> directions = Mask::getRelativeDirections<DIM3 > (ex);
+            DataSpace<DIM3> directions = Mask::getRelativeDirections<DIM3>(ex);
 
-            if (directions.x() != 0)
-                return DataSpace<DIM2 > (ds.y(), ds.z());
+            if(directions.x() != 0)
+                return DataSpace<DIM2>(ds.y(), ds.z());
 
-            if (directions.z() != 0)
-                return DataSpace<DIM2 > (ds.x(), ds.y());
+            if(directions.z() != 0)
+                return DataSpace<DIM2>(ds.x(), ds.y());
 
-            if (directions.y() != 0)
-                return DataSpace<DIM2 > (ds.x(), ds.z());
+            if(directions.y() != 0)
+                return DataSpace<DIM2>(ds.x(), ds.z());
 
 
-            return DataSpace<DIM2 > (0, 0);
+            return DataSpace<DIM2>(0, 0);
         }
     };
-}
+} // namespace pmacc
diff --git a/include/pmacc/dimensions/Definition.hpp b/include/pmacc/dimensions/Definition.hpp
index 0236567bf9..9c6ef1071f 100644
--- a/include/pmacc/dimensions/Definition.hpp
+++ b/include/pmacc/dimensions/Definition.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2019-2020 Rene Widera
+/* Copyright 2019-2021 Rene Widera
  *
  * This file is part of PMacc.
  *
diff --git a/include/pmacc/dimensions/GridLayout.hpp b/include/pmacc/dimensions/GridLayout.hpp
index 2b2e390a8b..9a525c76d4 100644
--- a/include/pmacc/dimensions/GridLayout.hpp
+++ b/include/pmacc/dimensions/GridLayout.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Felix Schmitt, Heiko Burau, Rene Widera, Wolfgang Hoenig,
+/* Copyright 2013-2021 Felix Schmitt, Heiko Burau, Rene Widera, Wolfgang Hoenig,
  *                     Benjamin Worpitz
  *
  * This file is part of PMacc.
@@ -26,33 +26,29 @@
 
 namespace pmacc
 {
-
     /**
      * Describes layout of a DIM-dimensional data grid including the actual grid and optional guards.
      *
      * @tparam DIM dimension of the grid
      */
-    template <unsigned DIM>
+    template<unsigned DIM>
     class GridLayout
     {
     public:
-
-        HDINLINE GridLayout() :
-        dataSpace(DataSpace<DIM>::create(1)),
-        guard(DataSpace<DIM>::create(0))
+        HDINLINE GridLayout() : dataSpace(DataSpace<DIM>::create(1)), guard(DataSpace<DIM>::create(0))
         {
         }
 
         /**
          * constructor
          * @param dataSpace DataSpace defining size of the layout (native loacal simulation area whithout any guarding)
-         * @param guard DataSpace defining size of the guard cells. Guard is added to actual grid (dataSpace). Will be initialized to 0.
+         * @param guard DataSpace defining size of the guard cells. Guard is added to actual grid (dataSpace). Will be
+         * initialized to 0.
          */
-        HDINLINE GridLayout(const DataSpace<DIM> &dataSpace, DataSpace<DIM> guard = DataSpace<DIM>()) :
-        dataSpace(dataSpace),
-        guard(guard)
+        HDINLINE GridLayout(const DataSpace<DIM>& dataSpace, DataSpace<DIM> guard = DataSpace<DIM>())
+            : dataSpace(dataSpace)
+            , guard(guard)
         {
-
         }
 
         /**
@@ -82,7 +78,6 @@ namespace pmacc
     private:
         DataSpace<DIM> dataSpace;
         DataSpace<DIM> guard;
-
     };
 
-} //namespace pmacc
+} // namespace pmacc
diff --git a/include/pmacc/dimensions/SuperCellDescription.hpp b/include/pmacc/dimensions/SuperCellDescription.hpp
index e4f23557f8..5f0799b250 100644
--- a/include/pmacc/dimensions/SuperCellDescription.hpp
+++ b/include/pmacc/dimensions/SuperCellDescription.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Rene Widera
+/* Copyright 2013-2021 Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -37,12 +37,12 @@ namespace pmacc
      * @tparam T_OffsetOrigin compile time size of the guard relative to origin (positive value)
      * @tparam T_OffsetEnd compile time size of the guard relative to end of SuperCell (positive value)
      */
-    template< class T_SuperCellSize,
-    class T_OffsetOrigin = typename math::CT::make_Int<T_SuperCellSize::dim, 0>::type,
-    class T_OffsetEnd = typename math::CT::make_Int<T_SuperCellSize::dim, 0>::type >
+    template<
+        class T_SuperCellSize,
+        class T_OffsetOrigin = typename math::CT::make_Int<T_SuperCellSize::dim, 0>::type,
+        class T_OffsetEnd = typename math::CT::make_Int<T_SuperCellSize::dim, 0>::type>
     struct SuperCellDescription
     {
-
         enum
         {
             Dim = T_SuperCellSize::dim
@@ -52,8 +52,8 @@ namespace pmacc
         typedef T_OffsetEnd OffsetEnd;
         typedef SuperCellDescription<SuperCellSize, OffsetOrigin, OffsetEnd> Type;
 
-        typedef typename ct::add<OffsetOrigin,SuperCellSize>::type AddFirst;
-        typedef typename ct::add<AddFirst,OffsetEnd>::type FullSuperCellSize;
+        typedef typename ct::add<OffsetOrigin, SuperCellSize>::type AddFirst;
+        typedef typename ct::add<AddFirst, OffsetEnd>::type FullSuperCellSize;
     };
 
-}//namespace
+} // namespace pmacc
diff --git a/include/pmacc/eventSystem/EventSystem.hpp b/include/pmacc/eventSystem/EventSystem.hpp
index 90bba8b606..9f2d4397e6 100644
--- a/include/pmacc/eventSystem/EventSystem.hpp
+++ b/include/pmacc/eventSystem/EventSystem.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera, Alexander Grund
+/* Copyright 2013-2021 Heiko Burau, Rene Widera, Alexander Grund
  *
  * This file is part of PMacc.
  *
diff --git a/include/pmacc/eventSystem/EventSystem.tpp b/include/pmacc/eventSystem/EventSystem.tpp
index 7e53b8c5d6..dd01952790 100644
--- a/include/pmacc/eventSystem/EventSystem.tpp
+++ b/include/pmacc/eventSystem/EventSystem.tpp
@@ -1,4 +1,4 @@
-/* Copyright 2015-2020 Alexander Grund
+/* Copyright 2015-2021 Alexander Grund
  *
  * This file is part of PMacc.
  *
diff --git a/include/pmacc/eventSystem/EventType.hpp b/include/pmacc/eventSystem/EventType.hpp
index 3de4257d14..0fa9dc717c 100644
--- a/include/pmacc/eventSystem/EventType.hpp
+++ b/include/pmacc/eventSystem/EventType.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Felix Schmitt, Heiko Burau, Rene Widera,
+/* Copyright 2013-2021 Felix Schmitt, Heiko Burau, Rene Widera,
  *                     Wolfgang Hoenig, Benjamin Worpitz,
  *                     Alexander Grund
  *
@@ -28,27 +28,26 @@
 
 namespace pmacc
 {
-namespace eventSystem
-{
-
-    /**
-     * Internal event/task type used for notifications in the event system.
-     */
-    enum EventType
+    namespace eventSystem
     {
-        FINISHED,
-        COPYHOST2DEVICE,
-        COPYDEVICE2HOST,
-        COPYDEVICE2DEVICE,
-        SENDFINISHED,
-        RECVFINISHED,
-        LOGICALAND,
-        SETVALUE,
-        GETVALUE,
-        KERNEL
-    };
+        /**
+         * Internal event/task type used for notifications in the event system.
+         */
+        enum EventType
+        {
+            FINISHED,
+            COPYHOST2DEVICE,
+            COPYDEVICE2HOST,
+            COPYDEVICE2DEVICE,
+            SENDFINISHED,
+            RECVFINISHED,
+            LOGICALAND,
+            SETVALUE,
+            GETVALUE,
+            KERNEL
+        };
 
-} // namespace type
+    } // namespace eventSystem
 
     // for backward compatibility pull all definitions into the pmacc namespace
     using namespace eventSystem;
diff --git a/include/pmacc/eventSystem/Manager.hpp b/include/pmacc/eventSystem/Manager.hpp
index 09d67ea434..3ead7f8c87 100644
--- a/include/pmacc/eventSystem/Manager.hpp
+++ b/include/pmacc/eventSystem/Manager.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Felix Schmitt, Rene Widera, Wolfgang Hoenig,
+/* Copyright 2013-2021 Felix Schmitt, Rene Widera, Wolfgang Hoenig,
  *                     Benjamin Worpitz, Alexander Grund
  *
  * This file is part of PMacc.
@@ -67,15 +67,14 @@ namespace pmacc
          * adds an ITask to the manager and returns an EventTask for it
          * @param task task to add to the manager
          */
-        void addTask(ITask *task);
+        void addTask(ITask* task);
 
-        void addPassiveTask(ITask *task);
+        void addPassiveTask(ITask* task);
 
 
         std::size_t getCount();
 
     private:
-
         friend struct detail::Environment;
 
         inline ITask* getPassiveITaskIfNotFinished(id_t taskId) const;
@@ -98,4 +97,4 @@ namespace pmacc
         TaskMap passiveTasks;
     };
 
-} //namespace pmacc
+} // namespace pmacc
diff --git a/include/pmacc/eventSystem/Manager.tpp b/include/pmacc/eventSystem/Manager.tpp
index 1e11df0154..cddff3666a 100644
--- a/include/pmacc/eventSystem/Manager.tpp
+++ b/include/pmacc/eventSystem/Manager.tpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Felix Schmitt, Rene Widera, Benjamin Worpitz
+/* Copyright 2013-2021 Felix Schmitt, Rene Widera, Benjamin Worpitz
  *
  * This file is part of PMacc.
  *
@@ -35,179 +35,176 @@
 
 namespace pmacc
 {
+    inline Manager::~Manager()
+    {
+        CUDA_CHECK_NO_EXCEPT(cuplaGetLastError());
+        waitForAllTasks();
+        CUDA_CHECK_NO_EXCEPT(cuplaGetLastError());
+    }
 
-inline Manager::~Manager( )
-{
-    CUDA_CHECK_NO_EXCEPT(cudaGetLastError( ));
-    waitForAllTasks( );
-    CUDA_CHECK_NO_EXCEPT(cudaGetLastError( ));
-}
-
-inline bool Manager::execute( id_t taskToWait )
-{
+    inline bool Manager::execute(id_t taskToWait)
+    {
 #ifdef DEBUG_EVENTS
-    static int old_max = 0;
-    static int deep = -1;
-    static int counter = 0;
-    ++counter;
+        static int old_max = 0;
+        static int deep = -1;
+        static int counter = 0;
+        ++counter;
 
-    deep++;
-    if ( deep > old_max )
-    {
-        old_max = deep;
-    }
+        deep++;
+        if(deep > old_max)
+        {
+            old_max = deep;
+        }
 #endif
 
-    static TaskMap::iterator iter = tasks.begin( );
+        static TaskMap::iterator iter = tasks.begin();
 
-    if ( iter == tasks.end( ) )
-        iter = tasks.begin( );
+        if(iter == tasks.end())
+            iter = tasks.begin();
 
-    // this is the slow but very save variant to delete tasks in a map
-    while ( iter != tasks.end( ) )
-    {
-        id_t id = iter->first;
-        ITask* taskPtr = iter->second;
-        PMACC_ASSERT( taskPtr != nullptr );
-        ++iter;
+        // this is the slow but very safe variant to delete tasks in a map
+        while(iter != tasks.end())
+        {
+            id_t id = iter->first;
+            ITask* taskPtr = iter->second;
+            PMACC_ASSERT(taskPtr != nullptr);
+            ++iter;
 #ifdef DEBUG_EVENTS
-        if ( counter == 500000 )
-            std::cout << taskPtr->toString( ) << " " << passiveTasks.size( ) << std::endl;
+            if(counter == 500000)
+                std::cout << taskPtr->toString() << " " << passiveTasks.size() << std::endl;
 #endif
-        if ( taskPtr->execute( ) )
-        {
-            /*test if task is deleted by other stackdeep*/
-            if ( getActiveITaskIfNotFinished( id ) == taskPtr )
+            if(taskPtr->execute())
             {
-                tasks.erase( id );
-                __delete(taskPtr);
-            }
+                /* test if the task was already deleted at another stack depth */
+                if(getActiveITaskIfNotFinished(id) == taskPtr)
+                {
+                    tasks.erase(id);
+                    __delete(taskPtr);
+                }
 #ifdef DEBUG_EVENTS
-            counter = 0;
+                counter = 0;
 #endif
 
-            if ( taskToWait == id )
-            {
-                iter = tasks.end( );
+                if(taskToWait == id)
+                {
+                    iter = tasks.end();
 #ifdef DEBUG_EVENTS
-                --deep;
+                    --deep;
 #endif
-                return true; //jump out because searched task is finished
+                    return true; // jump out because searched task is finished
+                }
             }
         }
-    }
 
 #ifdef DEBUG_EVENTS
-    --deep;
+        --deep;
 #endif
 
-    return false;
-}
+        return false;
+    }
 
-inline void Manager::event( id_t eventId, EventType, IEventData* )
-{
-    passiveTasks.erase( eventId );
-}
+    inline void Manager::event(id_t eventId, EventType, IEventData*)
+    {
+        passiveTasks.erase(eventId);
+    }
 
-inline ITask* Manager::getITaskIfNotFinished( id_t taskId ) const
-{
-    if( taskId == 0 )
-        return nullptr;
-    ITask* passiveTask = getPassiveITaskIfNotFinished( taskId );
-    if ( passiveTask != nullptr )
-        return passiveTask;
+    inline ITask* Manager::getITaskIfNotFinished(id_t taskId) const
+    {
+        if(taskId == 0)
+            return nullptr;
+        ITask* passiveTask = getPassiveITaskIfNotFinished(taskId);
+        if(passiveTask != nullptr)
+            return passiveTask;
 
-    return getActiveITaskIfNotFinished( taskId );
-}
+        return getActiveITaskIfNotFinished(taskId);
+    }
 
-inline ITask* Manager::getPassiveITaskIfNotFinished( id_t taskId ) const
-{
-    TaskMap::const_iterator itPassive = passiveTasks.find( taskId );
-    if ( itPassive != passiveTasks.end( ) )
-        return itPassive->second;
-    return nullptr;
-}
+    inline ITask* Manager::getPassiveITaskIfNotFinished(id_t taskId) const
+    {
+        TaskMap::const_iterator itPassive = passiveTasks.find(taskId);
+        if(itPassive != passiveTasks.end())
+            return itPassive->second;
+        return nullptr;
+    }
 
-inline ITask* Manager::getActiveITaskIfNotFinished( id_t taskId ) const
-{
-    TaskMap::const_iterator it = tasks.find( taskId );
-    if ( it != tasks.end( ) )
-        return it->second;
-    return nullptr;
-}
+    inline ITask* Manager::getActiveITaskIfNotFinished(id_t taskId) const
+    {
+        TaskMap::const_iterator it = tasks.find(taskId);
+        if(it != tasks.end())
+            return it->second;
+        return nullptr;
+    }
 
-inline void Manager::waitForFinished( id_t taskId )
-{
-    if( taskId == 0 )
-        return;
-    //check if task is passive and wait on it
-    ITask* task = getPassiveITaskIfNotFinished( taskId );
-    if ( task != nullptr )
+    inline void Manager::waitForFinished(id_t taskId)
     {
-        do
+        if(taskId == 0)
+            return;
+        // check if task is passive and wait on it
+        ITask* task = getPassiveITaskIfNotFinished(taskId);
+        if(task != nullptr)
         {
-            this->execute( );
+            do
+            {
+                this->execute();
+            } while(getPassiveITaskIfNotFinished(taskId) != nullptr);
+
+            return; // we can jump out because task is passive task
         }
-        while ( getPassiveITaskIfNotFinished( taskId ) != nullptr );
 
-        return; //we can jump out because task is passive task
+        // check if task is active and wait on it
+        task = getActiveITaskIfNotFinished(taskId);
+        if(task != nullptr)
+        {
+            do
+            {
+                if(this->execute(taskId))
+                    return; // jump out because task is finished
+            } while(getActiveITaskIfNotFinished(taskId) != nullptr);
+        }
     }
 
-    //check if task is  active and wait on it
-    task = getActiveITaskIfNotFinished( taskId );
-    if ( task != nullptr )
+    inline void Manager::waitForAllTasks()
     {
-        do
+        while(tasks.size() != 0 || passiveTasks.size() != 0)
         {
-            if ( this->execute( taskId ) )
-                return; //jump out because task is finished
+            this->execute();
         }
-        while ( getActiveITaskIfNotFinished( taskId ) != nullptr );
+        PMACC_ASSERT(tasks.size() == 0);
     }
-}
 
-inline void Manager::waitForAllTasks( )
-{
-    while ( tasks.size( ) != 0 || passiveTasks.size( ) != 0 )
+    inline void Manager::addTask(ITask* task)
     {
-        this->execute( );
+        PMACC_ASSERT(task != nullptr);
+        tasks[task->getId()] = task;
     }
-    PMACC_ASSERT( tasks.size( ) == 0 );
-}
-
-inline void Manager::addTask( ITask *task )
-{
-    PMACC_ASSERT( task != nullptr );
-    tasks[task->getId( )] = task;
-}
 
-inline void Manager::addPassiveTask( ITask *task )
-{
-    PMACC_ASSERT( task != nullptr );
+    inline void Manager::addPassiveTask(ITask* task)
+    {
+        PMACC_ASSERT(task != nullptr);
 
-    task->addObserver( this );
-    passiveTasks[task->getId( )] = task;
-}
+        task->addObserver(this);
+        passiveTasks[task->getId()] = task;
+    }
 
-inline Manager::Manager( )
-{
-}
+    inline Manager::Manager()
+    {
+    }
 
-inline Manager::Manager( const Manager& )
-{
-}
+    inline Manager::Manager(const Manager&)
+    {
+    }
 
 
-inline std::size_t Manager::getCount( )
-{
-    for ( TaskMap::iterator iter = tasks.begin( ); iter != tasks.end( ); ++iter )
+    inline std::size_t Manager::getCount()
     {
-        if ( iter->second != nullptr )
+        for(TaskMap::iterator iter = tasks.begin(); iter != tasks.end(); ++iter)
         {
-            std::cout << iter->first << " = " << iter->second->toString( ) << std::endl;
+            if(iter->second != nullptr)
+            {
+                std::cout << iter->first << " = " << iter->second->toString() << std::endl;
+            }
         }
+        return tasks.size();
     }
-    return tasks.size( );
-}
 
-}
+} // namespace pmacc
diff --git a/include/pmacc/eventSystem/events/CudaEvent.def b/include/pmacc/eventSystem/events/CudaEvent.def
index f56fca9c6f..d046812c77 100644
--- a/include/pmacc/eventSystem/events/CudaEvent.def
+++ b/include/pmacc/eventSystem/events/CudaEvent.def
@@ -1,4 +1,4 @@
-/* Copyright 2016-2020 Rene Widera
+/* Copyright 2016-2021 Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -25,84 +25,79 @@
 #include "pmacc/assert.hpp"
 
 
-
 namespace pmacc
 {
-
-/** Wrapper for cudaEvent_t
- *
- * This class follows the RAII rules
- */
-class CudaEvent
-{
-private:
-
-    /** native cuda event */
-    cudaEvent_t event;
-    /** native cuda stream where the event is recorded
-     *
-     *  only valid if isRecorded is true
-     */
-    cudaStream_t stream;
-    /** state if event is recorded */
-    bool isRecorded;
-    /** state if a recorded event is finished
+    /** Wrapper for cuplaEvent_t
      *
-     * avoid cuda driver calls after `isFinished()` returns the first time true
+     * This class follows the RAII rules
      */
-    bool finished;
-
-    /** number of CudaEventHandle's to the instance */
-    uint32_t refCounter;
-
-
-public:
-
-    /** Constructor
-     *
-     * if called before the cuda device is initialized the behavior is undefined
-     */
-    HINLINE CudaEvent( );
-
-    /** Destructor */
-    HINLINE ~CudaEvent( );
-
-    /** register a existing handle to a event instance */
-    HINLINE void registerHandle( );
-
-    /** free a registered handle */
-    HINLINE void releaseHandle( );
-
-    /** get native cudaEvent_t object
-     *
-     * @return native cuda event
-     */
-    cudaEvent_t operator*( ) const
-    {
-        return event;
-    }
-
-    /** get stream in which this event is recorded
-     *
-     * @return native cuda stream
-     */
-    cudaStream_t getStream( ) const
+    class CudaEvent
     {
-        assert( isRecorded );
-        return stream;
-    }
-
-    /** check whether the event is finished
-     *
-     * @return true if event is finished else false
-     */
-    HINLINE bool isFinished( );
-
-    /** record event in a device stream
-     *
-     * @param stream native cuda stream
-     */
-    HINLINE void recordEvent( cudaStream_t stream );
-
-};
-}
+    private:
+        /** native cupla event */
+        cuplaEvent_t event;
+        /** native cupla stream where the event is recorded
+         *
+         *  only valid if isRecorded is true
+         */
+        cuplaStream_t stream;
+        /** state if event is recorded */
+        bool isRecorded;
+        /** state if a recorded event is finished
+         *
+         * avoids cupla driver calls after `isFinished()` has returned true for the first time
+         */
+        bool finished;
+
+        /** number of CudaEventHandle objects referring to this instance */
+        uint32_t refCounter;
+
+
+    public:
+        /** Constructor
+         *
+         * if called before the cupla device is initialized the behavior is undefined
+         */
+        HINLINE CudaEvent();
+
+        /** Destructor */
+        HINLINE ~CudaEvent();
+
+        /** register an existing handle to an event instance */
+        HINLINE void registerHandle();
+
+        /** free a registered handle */
+        HINLINE void releaseHandle();
+
+        /** get native cuplaEvent_t object
+         *
+         * @return native cupla event
+         */
+        cuplaEvent_t operator*() const
+        {
+            return event;
+        }
+
+        /** get stream in which this event is recorded
+         *
+         * @return native cupla stream
+         */
+        cuplaStream_t getStream() const
+        {
+            assert(isRecorded);
+            return stream;
+        }
+
+        /** check whether the event is finished
+         *
+         * @return true if event is finished else false
+         */
+        HINLINE bool isFinished();
+
+        /** record event in a device stream
+         *
+         * @param stream native cupla stream
+         */
+        HINLINE void recordEvent(cuplaStream_t stream);
+    };
+} // namespace pmacc
diff --git a/include/pmacc/eventSystem/events/CudaEvent.hpp b/include/pmacc/eventSystem/events/CudaEvent.hpp
index e205962508..6446315f37 100644
--- a/include/pmacc/eventSystem/events/CudaEvent.hpp
+++ b/include/pmacc/eventSystem/events/CudaEvent.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2016-2020 Rene Widera
+/* Copyright 2016-2021 Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -26,25 +26,22 @@
 #include "pmacc/types.hpp"
 
 
-
-
 namespace pmacc
 {
-    CudaEvent::CudaEvent( ) : isRecorded( false ), finished( true ), refCounter( 0u )
+    CudaEvent::CudaEvent() : isRecorded(false), finished(true), refCounter(0u)
     {
-        log( ggLog::CUDA_RT()+ggLog::EVENT(), "create event" );
-        CUDA_CHECK( cudaEventCreateWithFlags( &event, cudaEventDisableTiming ) );
+        log(ggLog::CUDA_RT() + ggLog::EVENT(), "create event");
+        CUDA_CHECK(cuplaEventCreateWithFlags(&event, cuplaEventDisableTiming));
     }
 
 
-    CudaEvent::~CudaEvent( )
+    CudaEvent::~CudaEvent()
     {
-        PMACC_ASSERT( refCounter == 0u );
-        log( ggLog::CUDA_RT()+ggLog::EVENT(), "sync and delete event" );
-        // free cuda event
-        CUDA_CHECK_NO_EXCEPT(cudaEventSynchronize( event ));
-        CUDA_CHECK_NO_EXCEPT(cudaEventDestroy( event ));
-
+        PMACC_ASSERT(refCounter == 0u);
+        log(ggLog::CUDA_RT() + ggLog::EVENT(), "sync and delete event");
+        // free cupla event
+        CUDA_CHECK_NO_EXCEPT(cuplaEventSynchronize(event));
+        CUDA_CHECK_NO_EXCEPT(cuplaEventDestroy(event));
     }
 
     void CudaEvent::registerHandle()
@@ -54,49 +51,49 @@ namespace pmacc
 
     void CudaEvent::releaseHandle()
     {
-        assert( refCounter != 0u );
+        assert(refCounter != 0u);
         // get old value and decrement
         uint32_t oldCounter = refCounter--;
-        if( oldCounter == 1u )
+        if(oldCounter == 1u)
         {
             // reset event meta data
             isRecorded = false;
             finished = true;
 
-            Environment<>::get().EventPool( ).push( this );
+            Environment<>::get().EventPool().push(this);
         }
     }
 
 
     bool CudaEvent::isFinished()
     {
-        // avoid cuda driver calls if event is already finished
-        if( finished )
+        // avoid cupla driver calls if event is already finished
+        if(finished)
             return true;
-        assert( isRecorded );
+        assert(isRecorded);
 
-        cudaError_t rc = cudaEventQuery(event);
+        cuplaError_t rc = cuplaEventQuery(event);
 
-        if(rc == cudaSuccess)
+        if(rc == cuplaSuccess)
         {
             finished = true;
             return true;
         }
-        else if(rc == cudaErrorNotReady)
+        else if(rc == cuplaErrorNotReady)
             return false;
         else
-            PMACC_PRINT_CUDA_ERROR_AND_THROW(rc, "Event query failed");
+            PMACC_PRINT_CUPLA_ERROR_AND_THROW(rc, "Event query failed");
     }
 
 
-    void CudaEvent::recordEvent(cudaStream_t stream)
+    void CudaEvent::recordEvent(cuplaStream_t stream)
     {
         /* disallow double recording */
         assert(isRecorded == false);
         isRecorded = true;
         finished = false;
         this->stream = stream;
-        CUDA_CHECK(cudaEventRecord(event, stream));
+        CUDA_CHECK(cuplaEventRecord(event, stream));
     }
 
-} // namepsace pmacc
+} // namespace pmacc
diff --git a/include/pmacc/eventSystem/events/CudaEventHandle.hpp b/include/pmacc/eventSystem/events/CudaEventHandle.hpp
index c105240a42..34fd1854c3 100644
--- a/include/pmacc/eventSystem/events/CudaEventHandle.hpp
+++ b/include/pmacc/eventSystem/events/CudaEventHandle.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2014-2020 Rene Widera
+/* Copyright 2014-2021 Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -29,106 +29,101 @@
 
 namespace pmacc
 {
-
-/** handle to CudaEvent */
-class CudaEventHandle
-{
-private:
-
-    /** pointer to the CudaEvent */
-    CudaEvent* event;
-
-public:
-
-    /** create invalid handle  */
-    CudaEventHandle( ) : event( nullptr )
-    {
-
-    }
-
-    /** create a handle to a valid CudaEvent
-     *
-     * @param evPointer pointer to a CudaEvent
-     */
-    CudaEventHandle( CudaEvent* const evPointer ) : event( evPointer )
-    {
-        event->registerHandle();
-    }
-
-    CudaEventHandle( const CudaEventHandle& other ) : event( nullptr )
-    {
-        /* register and release handle is done by the assign operator */
-        *this = other;
-    }
-
-    /** assign an event handle
-     *
-     * undefined behavior if the other event handle is equal to this instance
-     *
-     * @param other event handle
-     * @return this handle
-     */
-    CudaEventHandle&
-    operator=( const CudaEventHandle& other )
-    {
-        /* check if an old event is overwritten */
-        if( event )
-            event->releaseHandle( );
-        event = other.event;
-        /* check that new event pointer is not nullptr */
-        if( event )
-            event->registerHandle( );
-        return *this;
-    }
-
-    /** Destructor */
-    ~CudaEventHandle( )
-    {
-        if( event )
-            event->releaseHandle( );
-        event = nullptr;
-    }
-
-    /**
-     * get native cuda event
-     *
-     * @return native cuda event
-     */
-    cudaEvent_t operator*( ) const
-    {
-        assert( event );
-        return **event;
-    }
-
-    /** check whether the event is finished
-     *
-     * @return true if event is finished else false
-     */
-    bool isFinished( )
-    {
-        PMACC_ASSERT( event );
-        return event->isFinished( );
-    }
-
-
-    /** get stream in which this event is recorded
-     *
-     * @return native cuda stream
-     */
-    cudaStream_t getStream( ) const
-    {
-        PMACC_ASSERT( event );
-        return event->getStream( );
-    }
-
-    /** record event in a device stream
-     *
-     * @param stream native cuda stream
-     */
-    void recordEvent( cudaStream_t stream )
+    /** handle to CudaEvent */
+    class CudaEventHandle
     {
-        PMACC_ASSERT( event );
-        event->recordEvent( stream );
-    }
-};
-}
+    private:
+        /** pointer to the CudaEvent */
+        CudaEvent* event;
+
+    public:
+        /** create invalid handle  */
+        CudaEventHandle() : event(nullptr)
+        {
+        }
+
+        /** create a handle to a valid CudaEvent
+         *
+         * @param evPointer pointer to a CudaEvent
+         */
+        CudaEventHandle(CudaEvent* const evPointer) : event(evPointer)
+        {
+            event->registerHandle();
+        }
+
+        CudaEventHandle(const CudaEventHandle& other) : event(nullptr)
+        {
+            /* register and release handle is done by the assign operator */
+            *this = other;
+        }
+
+        /** assign an event handle
+         *
+         * undefined behavior if the other event handle is equal to this instance
+         *
+         * @param other event handle
+         * @return this handle
+         */
+        CudaEventHandle& operator=(const CudaEventHandle& other)
+        {
+            /* check if an old event is overwritten */
+            if(event)
+                event->releaseHandle();
+            event = other.event;
+            /* check that new event pointer is not nullptr */
+            if(event)
+                event->registerHandle();
+            return *this;
+        }
+
+        /** Destructor */
+        ~CudaEventHandle()
+        {
+            if(event)
+                event->releaseHandle();
+            event = nullptr;
+        }
+
+        /**
+         * get native cupla event
+         *
+         * @return native cupla event
+         */
+        cuplaEvent_t operator*() const
+        {
+            assert(event);
+            return **event;
+        }
+
+        /** check whether the event is finished
+         *
+         * @return true if event is finished else false
+         */
+        bool isFinished()
+        {
+            PMACC_ASSERT(event);
+            return event->isFinished();
+        }
+
+
+        /** get stream in which this event is recorded
+         *
+         * @return native cupla stream
+         */
+        cuplaStream_t getStream() const
+        {
+            PMACC_ASSERT(event);
+            return event->getStream();
+        }
+
+        /** record event in a device stream
+         *
+         * @param stream native cupla stream
+         */
+        void recordEvent(cuplaStream_t stream)
+        {
+            PMACC_ASSERT(event);
+            event->recordEvent(stream);
+        }
+    };
+} // namespace pmacc
diff --git a/include/pmacc/eventSystem/events/EventDataReceive.hpp b/include/pmacc/eventSystem/events/EventDataReceive.hpp
index d6cb1bbd96..91381a9577 100644
--- a/include/pmacc/eventSystem/events/EventDataReceive.hpp
+++ b/include/pmacc/eventSystem/events/EventDataReceive.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Felix Schmitt, Rene Widera, Wolfgang Hoenig,
+/* Copyright 2013-2021 Felix Schmitt, Rene Widera, Wolfgang Hoenig,
  *                     Benjamin Worpitz
  *
  * This file is part of PMacc.
@@ -26,14 +26,12 @@
 
 namespace pmacc
 {
-
     class EventDataReceive : public IEventData
     {
     public:
-        EventDataReceive(EventNotify *task, size_t recv_count) :
-        IEventData(task),
-        recv_count(recv_count)
-        {}
+        EventDataReceive(EventNotify* task, size_t recv_count) : IEventData(task), recv_count(recv_count)
+        {
+        }
 
         size_t getReceivedCount() const
         {
@@ -42,7 +40,6 @@ namespace pmacc
 
     private:
         size_t recv_count;
-
     };
 
-} //namespace pmacc
+} // namespace pmacc
diff --git a/include/pmacc/eventSystem/events/EventNotify.hpp b/include/pmacc/eventSystem/events/EventNotify.hpp
index 84b5236d53..efeabaa71b 100644
--- a/include/pmacc/eventSystem/events/EventNotify.hpp
+++ b/include/pmacc/eventSystem/events/EventNotify.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Felix Schmitt, Rene Widera, Wolfgang Hoenig,
+/* Copyright 2013-2021 Felix Schmitt, Rene Widera, Wolfgang Hoenig,
  *                     Benjamin Worpitz
  *
  * This file is part of PMacc.
@@ -28,7 +28,6 @@
 
 namespace pmacc
 {
-
     class IEventData;
     class IEvent;
 
@@ -38,7 +37,6 @@ namespace pmacc
     class EventNotify
     {
     public:
-
         virtual ~EventNotify()
         {
         }
@@ -67,12 +65,10 @@ namespace pmacc
          * @param type the type of this notification
          * @param data data passed to observers
          */
-        void notify(id_t eventId, EventType type, IEventData *data);
+        void notify(id_t eventId, EventType type, IEventData* data);
 
     private:
         std::set<IEvent*> observers;
-
     };
 
-} //namespace pmacc
-
+} // namespace pmacc
diff --git a/include/pmacc/eventSystem/events/EventNotify.tpp b/include/pmacc/eventSystem/events/EventNotify.tpp
index b8bb0f4517..3da67a684e 100644
--- a/include/pmacc/eventSystem/events/EventNotify.tpp
+++ b/include/pmacc/eventSystem/events/EventNotify.tpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Rene Widera, Benjamin Worpitz
+/* Copyright 2013-2021 Rene Widera, Benjamin Worpitz
  *
  * This file is part of PMacc.
  *
@@ -30,27 +30,25 @@
 
 namespace pmacc
 {
-
-        inline void EventNotify::notify( id_t eventId, EventType type, IEventData *data )
+    inline void EventNotify::notify(id_t eventId, EventType type, IEventData* data)
+    {
+        std::set<IEvent*>::iterator iter = observers.begin();
+        for(; iter != observers.end(); iter++)
         {
-            std::set<IEvent*>::iterator iter = observers.begin( );
-            for (; iter != observers.end( ); iter++ )
-            {
-                if ( *iter != nullptr )
-                    ( *iter )->event( eventId, type, data );
-            }
-            /* if notify is not called from destructor
-             * other tasks can register after this call.
-             * But any ITask must call this function in destrctor again"
-             */
-            observers.clear( );
-
-            /**
-             * \TODO are we sure that data won't be deleted anywhere else?
-             * if (data != nullptr)
-             *  delete data;
-             **/
-
+            if(*iter != nullptr)
+                (*iter)->event(eventId, type, data);
         }
-
-} //namespace pmacc
+        /* If notify is not called from a destructor,
+         * other tasks can register after this call.
+         * Therefore any ITask must call this function again in its destructor.
+         */
+        observers.clear();
+
+        /**
+         * \TODO are we sure that data won't be deleted anywhere else?
+         * if (data != nullptr)
+         *  delete data;
+         **/
+    }
+
+} // namespace pmacc
diff --git a/include/pmacc/eventSystem/events/EventPool.hpp b/include/pmacc/eventSystem/events/EventPool.hpp
index 87c03ffd9f..3340d552cf 100644
--- a/include/pmacc/eventSystem/events/EventPool.hpp
+++ b/include/pmacc/eventSystem/events/EventPool.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Felix Schmitt, Rene Widera, Benjamin Worpitz
+/* Copyright 2013-2021 Felix Schmitt, Rene Widera, Benjamin Worpitz
  *
  * This file is part of PMacc.
  *
@@ -33,26 +33,24 @@
 
 namespace pmacc
 {
-
-    /** Manages a pool of cudaEvent_t objects and gives access to them. */
+    /** Manages a pool of cuplaEvent_t objects and gives access to them. */
     class EventPool
     {
     public:
-
-        /** Returns a free cuda event
+        /** Returns a free cupla event
          *
-         * @return free cuda event
+         * @return free cupla event
          */
-        CudaEventHandle pop( )
+        CudaEventHandle pop()
         {
-            if( freeEvents.size( ) != 0 )
+            if(freeEvents.size() != 0)
             {
-                CudaEventHandle result = freeEvents.front( );
-                freeEvents.pop_front( );
+                CudaEventHandle result = freeEvents.front();
+                freeEvents.pop_front();
                 return result;
             }
-            createEvents( );
-            return pop( );
+            createEvents();
+            return pop();
         }
 
 
@@ -62,67 +60,66 @@ namespace pmacc
          *
          * @param ev pointer to CudaEvent
          */
-        void push( CudaEvent* const ev )
+        void push(CudaEvent* const ev)
         {
             /* Guard that no event is added during the pool is closed (shutdown phase).
              * This method is also called during the evaluation of the destructor.
              */
-            if( !isClosed )
-                freeEvents.push_back( CudaEventHandle(ev) );
+            if(!isClosed)
+                freeEvents.push_back(CudaEventHandle(ev));
         }
 
-        /** create and add cuda events to the pool
+        /** create and add cupla events to the pool
          *
-         * @param count number of cuda events to add
+         * @param count number of cupla events to add
          */
-        void createEvents( size_t count = 1u )
+        void createEvents(size_t count = 1u)
         {
-            for( size_t i = 0u; i < count; i++ )
+            for(size_t i = 0u; i < count; i++)
             {
-                CudaEvent* nativeEvent = new CudaEvent( );
-                events.push_back( nativeEvent );
-                push( nativeEvent );
+                CudaEvent* nativeEvent = new CudaEvent();
+                events.push_back(nativeEvent);
+                push(nativeEvent);
             }
         }
 
-        /** Returns the number of cuda events in the pool.
+        /** Returns the number of cupla events in the pool.
          *
-         * @return number of cuda events
+         * @return number of cupla events
          */
-        size_t getEventsCount( )
+        size_t getEventsCount()
         {
-            return events.size( );
+            return events.size();
         }
 
     private:
-
         friend struct detail::Environment;
 
-        static EventPool& getInstance( )
+        static EventPool& getInstance()
         {
             static EventPool instance;
             return instance;
         }
 
         /** Constructor */
-        EventPool( ) : isClosed( false )
+        EventPool() : isClosed(false)
         {
         }
 
         /** Destructor
          *
-         * destroys all cuda events in the pool
+         * destroys all cupla events in the pool
          */
         ~EventPool()
         {
-            log( ggLog::CUDA_RT( )+ggLog::EVENT( ), "shutdown EventPool with %1% events" ) % getEventsCount( );
+            log(ggLog::CUDA_RT() + ggLog::EVENT(), "shutdown EventPool with %1% events") % getEventsCount();
             isClosed = true;
-            freeEvents.clear( );
-            for( std::vector<CudaEvent*>::const_iterator iter = events.begin(); iter != events.end(); ++iter )
+            freeEvents.clear();
+            for(std::vector<CudaEvent*>::const_iterator iter = events.begin(); iter != events.end(); ++iter)
             {
                 delete *iter;
             }
-            events.clear( );
+            events.clear();
         }
 
         //! hold all CudaEvents
@@ -137,4 +134,4 @@ namespace pmacc
          */
         bool isClosed;
     };
-}
+} // namespace pmacc
diff --git a/include/pmacc/eventSystem/events/EventTask.hpp b/include/pmacc/eventSystem/events/EventTask.hpp
index 20987b15e5..bc4c6abaa6 100644
--- a/include/pmacc/eventSystem/events/EventTask.hpp
+++ b/include/pmacc/eventSystem/events/EventTask.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Rene Widera, Benjamin Worpitz
+/* Copyright 2013-2021 Rene Widera, Benjamin Worpitz
  *
  * This file is part of PMacc.
  *
@@ -27,7 +27,6 @@
 
 namespace pmacc
 {
-
     /**
      * EventTask is used for task-synchronization in the event system.
      *
@@ -37,7 +36,6 @@ namespace pmacc
     class EventTask
     {
     public:
-
         /**
          * Constructor.
          *
@@ -45,6 +43,8 @@ namespace pmacc
          */
         EventTask(id_t taskId);
 
+        constexpr EventTask(const pmacc::EventTask&) = default;
+
         /**
          * Constructor.
          */
@@ -78,7 +78,7 @@ namespace pmacc
          *
          * @param other EventTask to add to this task
          */
-        EventTask operator+(const EventTask & other);
+        EventTask operator+(const EventTask& other);
 
         /**
          * Adds two tasks (this task and other) and creates
@@ -86,22 +86,19 @@ namespace pmacc
          *
          * @param other EventTask to add to this task
          */
-        EventTask& operator+=(const EventTask & other);
+        EventTask& operator+=(const EventTask& other);
 
         /**
          * Copies attributes from other to this task.
          *
          * This task effectively becomes other.
          */
-        EventTask & operator=(const EventTask & other);
+        EventTask& operator=(const EventTask& other);
 
         std::string toString();
 
     private:
-
         id_t taskId;
     };
 
-} //namespace pmacc
-
-
+} // namespace pmacc
diff --git a/include/pmacc/eventSystem/events/EventTask.tpp b/include/pmacc/eventSystem/events/EventTask.tpp
index 01793454f7..55c0cea8ba 100644
--- a/include/pmacc/eventSystem/events/EventTask.tpp
+++ b/include/pmacc/eventSystem/events/EventTask.tpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Rene Widera, Benjamin Worpitz
+/* Copyright 2013-2021 Rene Widera, Benjamin Worpitz
  *
  * This file is part of PMacc.
  *
@@ -27,19 +27,18 @@
 
 namespace pmacc
 {
+    inline EventTask::EventTask(id_t taskId) : taskId(taskId)
+    {
+    }
 
-    inline EventTask::EventTask(id_t taskId) :
-        taskId(taskId)
-    {}
-
-    inline EventTask::EventTask() :
-        taskId(0)
-    {}
+    inline EventTask::EventTask() : taskId(0)
+    {
+    }
 
     inline std::string EventTask::toString()
     {
-        ITask* task=Environment<>::get().Manager().getITaskIfNotFinished(taskId);
-        if(task!=nullptr)
+        ITask* task = Environment<>::get().Manager().getITaskIfNotFinished(taskId);
+        if(task != nullptr)
             return task->toString();
 
         return std::string();
@@ -60,48 +59,46 @@ namespace pmacc
         Environment<>::get().Manager().waitForFinished(taskId);
     }
 
-    inline EventTask EventTask::operator+(const EventTask & other)
+    inline EventTask EventTask::operator+(const EventTask& other)
     {
-        EventTask tmp=*this;
-        return tmp+=other;
+        EventTask tmp = *this;
+        return tmp += other;
     }
 
-    inline EventTask& EventTask::operator+=(const EventTask & other)
+    inline EventTask& EventTask::operator+=(const EventTask& other)
     {
         // If one of the two tasks is already finished, the other task is returned.
         // Otherwise, a TaskLogicalAnd is created and added to the Manager's queue.
         Manager& manager = Environment<>::get().Manager();
 
-        if(this->taskId==other.taskId)
+        if(this->taskId == other.taskId)
             return *this;
 
         ITask* myTask = manager.getITaskIfNotFinished(this->taskId);
-        if(myTask==nullptr)
+        if(myTask == nullptr)
         {
-            this->taskId=other.taskId;
+            this->taskId = other.taskId;
             return *this;
         }
 
         ITask* otherTask = manager.getITaskIfNotFinished(other.taskId);
-        if(otherTask==nullptr)
+        if(otherTask == nullptr)
         {
             return *this;
         }
 
-        TaskLogicalAnd *taskAnd = new TaskLogicalAnd(myTask,
-                                                     otherTask);
-        this->taskId=taskAnd->getId();
+        TaskLogicalAnd* taskAnd = new TaskLogicalAnd(myTask, otherTask);
+        this->taskId = taskAnd->getId();
         manager.addPassiveTask(taskAnd);
 
         return *this;
     }
 
-    inline EventTask& EventTask::operator=(const EventTask & other)
+    inline EventTask& EventTask::operator=(const EventTask& other)
     {
-        //this is faster than a copy constructor
+        // this is faster than a copy constructor
         taskId = other.taskId;
         return *this;
     }
 
-}
-
+} // namespace pmacc
diff --git a/include/pmacc/eventSystem/events/IEvent.hpp b/include/pmacc/eventSystem/events/IEvent.hpp
index 8c6d703a7a..6885ff01f6 100644
--- a/include/pmacc/eventSystem/events/IEvent.hpp
+++ b/include/pmacc/eventSystem/events/IEvent.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Felix Schmitt, Rene Widera, Wolfgang Hoenig,
+/* Copyright 2013-2021 Felix Schmitt, Rene Widera, Wolfgang Hoenig,
  *                     Benjamin Worpitz
  *
  * This file is part of PMacc.
@@ -26,7 +26,6 @@
 
 namespace pmacc
 {
-
     class IEventData;
 
     /**
@@ -35,7 +34,6 @@ namespace pmacc
     class IEvent
     {
     public:
-
         /**
          * Destructor.
          */
@@ -53,4 +51,4 @@ namespace pmacc
         virtual void event(id_t eventId, EventType type, IEventData* data) = 0;
     };
 
-} //namespace pmacc
+} // namespace pmacc
diff --git a/include/pmacc/eventSystem/events/IEventData.hpp b/include/pmacc/eventSystem/events/IEventData.hpp
index 773c820745..0accf82b3f 100644
--- a/include/pmacc/eventSystem/events/IEventData.hpp
+++ b/include/pmacc/eventSystem/events/IEventData.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Felix Schmitt, Rene Widera, Wolfgang Hoenig,
+/* Copyright 2013-2021 Felix Schmitt, Rene Widera, Wolfgang Hoenig,
  *                     Benjamin Worpitz
  *
  * This file is part of PMacc.
@@ -35,13 +35,13 @@ namespace pmacc
     class IEventData
     {
     public:
-
-        IEventData(EventNotify *task) :
-        task(task)
-        {}
+        IEventData(EventNotify* task) : task(task)
+        {
+        }
 
         virtual ~IEventData()
-        {}
+        {
+        }
 
         EventNotify* getEventNotify()
         {
@@ -49,8 +49,7 @@ namespace pmacc
         }
 
     protected:
-        EventNotify *task;
-
+        EventNotify* task;
     };
 
-} //namespace pmacc
+} // namespace pmacc
diff --git a/include/pmacc/eventSystem/events/kernelEvents.hpp b/include/pmacc/eventSystem/events/kernelEvents.hpp
index 9c72393922..445ef77a16 100644
--- a/include/pmacc/eventSystem/events/kernelEvents.hpp
+++ b/include/pmacc/eventSystem/events/kernelEvents.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Felix Schmitt, Rene Widera, Benjamin Worpitz
+/* Copyright 2013-2021 Felix Schmitt, Rene Widera, Benjamin Worpitz
  *
  * This file is part of PMacc.
  *
@@ -32,241 +32,176 @@
 #include <string>
 
 
-
 /* No namespace in this file since we only declare macro defines */
 
 /*if this flag is defined all kernel calls would be checked and synchronize
  * this flag must set by the compiler or inside of the Makefile
  */
-#if( PMACC_SYNC_KERNEL  == 1 )
-#   define CUDA_CHECK_KERNEL_MSG(...)  CUDA_CHECK_MSG(__VA_ARGS__)
+#if(PMACC_SYNC_KERNEL == 1)
+#    define CUDA_CHECK_KERNEL_MSG(...) CUDA_CHECK_MSG(__VA_ARGS__)
 #else
-    /*no synchronize and check of kernel calls*/
-#   define CUDA_CHECK_KERNEL_MSG(...)  ;
+/* kernel calls are neither synchronized nor checked */
+#    define CUDA_CHECK_KERNEL_MSG(...) ;
 #endif
 
 
 namespace pmacc
 {
-namespace exec
-{
-    /** configured kernel object
-     *
-     * this objects contains the functor and the starting parameter
-     *
-     * @tparam T_Kernel pmacc Kernel object
-     * @tparam T_VectorGrid type which defines the grid extents (type must be castable to CUDA dim3)
-     * @tparam T_VectorBlock type which defines the block extents (type must be castable to CUDA dim3)
-     */
-    template<
-        typename T_Kernel,
-        typename T_VectorGrid,
-        typename T_VectorBlock
-    >
-    struct KernelStarter;
-
-    /** wrapper for the user kernel functor
-     *
-     * contains debug information like filename and line of the kernel call
-     */
-    template< typename T_KernelFunctor >
-    struct Kernel
+    namespace exec
     {
-        using KernelType = T_KernelFunctor;
-        /** functor */
-        T_KernelFunctor const m_kernelFunctor;
-        /** file name from where the kernel is called */
-        std::string const m_file;
-        /** line number in the file */
-        size_t const m_line;
-
-        /**
-         *
-         * @param gridExtent grid extent configuration for the kernel
-         * @param blockExtent block extent configuration for the kernel
-         * @param sharedMemByte dynamic shared memory used by the kernel (in byte )
-         * @return
-         */
-        HINLINE Kernel(
-            T_KernelFunctor const & kernelFunctor,
-            std::string const & file = std::string(),
-            size_t const line = 0
-        ) :
-            m_kernelFunctor( kernelFunctor ),
-            m_file( file ),
-            m_line( line )
-        {
-
-        }
-
         /** configured kernel object
          *
          * this objects contains the functor and the starting parameter
          *
-         * @tparam T_VectorGrid type which defines the grid extents (type must be castable to CUDA dim3)
-         * @tparam T_VectorBlock type which defines the block extents (type must be castable to CUDA dim3)
-         *
-         * @param gridExtent grid extent configuration for the kernel
-         * @param blockExtent block extent configuration for the kernel
-         * @param sharedMemByte dynamic shared memory used by the kernel (in byte)
+         * @tparam T_Kernel pmacc Kernel object
+         * @tparam T_VectorGrid type which defines the grid extents (type must be castable to cupla dim3)
+         * @tparam T_VectorBlock type which defines the block extents (type must be castable to cupla dim3)
          */
-        template<
-            typename T_VectorGrid,
-            typename T_VectorBlock
-        >
-        HINLINE
-        auto
-        operator()(
-            T_VectorGrid const & gridExtent,
-            T_VectorBlock const & blockExtent,
-            size_t const sharedMemByte = 0
-        ) const
-        -> KernelStarter<
-            Kernel,
-            T_VectorGrid,
-            T_VectorBlock
-        >;
-    };
-
-
-    template<
-        typename T_Kernel,
-        typename T_VectorGrid,
-        typename T_VectorBlock
-   >
-    struct KernelStarter
-    {
-        /** kernel functor */
-        T_Kernel const m_kernel;
-        /** grid extents for the kernel */
-        T_VectorGrid const m_gridExtent;
-        /** block extents for the kernel */
-        T_VectorBlock const m_blockExtent;
-        /** dynamic shared memory consumed by the kernel (in byte) */
-        size_t const m_sharedMemByte;
+        template<typename T_Kernel, typename T_VectorGrid, typename T_VectorBlock>
+        struct KernelStarter;
 
-        /** kernel starter object
+        /** wrapper for the user kernel functor
          *
-         * @param kernel pmacc Kernel
+         * contains debug information like filename and line of the kernel call
          */
-        HINLINE KernelStarter(
-            T_Kernel const & kernel,
-            T_VectorGrid const & gridExtent,
-            T_VectorBlock const & blockExtent,
-            size_t const sharedMemByte
-        ) :
-            m_kernel( kernel ),
-            m_gridExtent( gridExtent ),
-            m_blockExtent( blockExtent ),
-            m_sharedMemByte( sharedMemByte )
+        template<typename T_KernelFunctor>
+        struct Kernel
         {
-
-        }
-
-        /** execute the kernel functor
-         *
-         * @tparam T_Args types of the arguments
-         * @param args arguments for the kernel functor
+            using KernelType = T_KernelFunctor;
+            /** functor */
+            T_KernelFunctor const m_kernelFunctor;
+            /** file name from where the kernel is called */
+            std::string const m_file;
+            /** line number in the file */
+            size_t const m_line;
+
+            /** store the functor together with the location of the kernel call
+             *
+             * @param kernelFunctor functor which is copied into the wrapper
+             * @param file file name from where the kernel is called
+             * @param line line number in the file
+             * @note file and line default to an empty string and zero
+             */
+            HINLINE Kernel(
+                T_KernelFunctor const& kernelFunctor,
+                std::string const& file = std::string(),
+                size_t const line = 0)
+                : m_kernelFunctor(kernelFunctor)
+                , m_file(file)
+                , m_line(line)
+            {
+            }
+
+            /** create a configured kernel object
+             *
+             * the returned object contains the functor and the starting parameter
+             *
+             * @tparam T_VectorGrid type which defines the grid extents (type must be castable to cupla dim3)
+             * @tparam T_VectorBlock type which defines the block extents (type must be castable to cupla dim3)
+             * @param gridExtent grid extent configuration for the kernel
+             * @param blockExtent block extent configuration for the kernel
+             * @param sharedMemByte dynamic shared memory used by the kernel (in byte)
+             * @return kernel starter holding this kernel and the given configuration
+             */
+            template<typename T_VectorGrid, typename T_VectorBlock>
+            HINLINE auto operator()(
+                T_VectorGrid const& gridExtent,
+                T_VectorBlock const& blockExtent,
+                size_t const sharedMemByte = 0) const -> KernelStarter<Kernel, T_VectorGrid, T_VectorBlock>;
+        };
+
+
+        template<typename T_Kernel, typename T_VectorGrid, typename T_VectorBlock>
+        struct KernelStarter
+        {
+            /** kernel functor */
+            T_Kernel const m_kernel;
+            /** grid extents for the kernel */
+            T_VectorGrid const m_gridExtent;
+            /** block extents for the kernel */
+            T_VectorBlock const m_blockExtent;
+            /** dynamic shared memory consumed by the kernel (in byte) */
+            size_t const m_sharedMemByte;
+
+            /** kernel starter object
+             *
+             * @param kernel pmacc Kernel
+             * @param gridExtent grid extents for the kernel
+             * @param blockExtent block extents for the kernel
+             * @param sharedMemByte dynamic shared memory consumed by the kernel (in byte)
+             */
+            HINLINE KernelStarter(
+                T_Kernel const& kernel,
+                T_VectorGrid const& gridExtent,
+                T_VectorBlock const& blockExtent,
+                size_t const sharedMemByte)
+                : m_kernel(kernel)
+                , m_gridExtent(gridExtent)
+                , m_blockExtent(blockExtent)
+                , m_sharedMemByte(sharedMemByte)
+            {
+            }
+
+            /** execute the kernel functor
+             *
+             * @tparam T_Args types of the arguments
+             * @param args arguments for the kernel functor
+             * @{
+             */
+            template<typename... T_Args>
+            HINLINE void operator()(T_Args const&... args) const
+            {
+                std::string const kernelName = typeid(m_kernel.m_kernelFunctor).name();
+                std::string const kernelInfo = kernelName + std::string(" [") + m_kernel.m_file + std::string(":")
+                    + std::to_string(m_kernel.m_line) + std::string(" ]");
+
+                CUDA_CHECK_KERNEL_MSG(cuplaDeviceSynchronize(), std::string("Crash before kernel call ") + kernelInfo);
+
+                pmacc::TaskKernel* taskKernel = pmacc::Environment<>::get().Factory().createTaskKernel(kernelName);
+
+                DataSpace<traits::GetNComponents<T_VectorGrid>::value> gridExtent(m_gridExtent);
+                DataSpace<traits::GetNComponents<T_VectorBlock>::value> blockExtent(m_blockExtent);
+
+                CUPLA_KERNEL(typename T_Kernel::KernelType)
+                (gridExtent.toDim3(), blockExtent.toDim3(), m_sharedMemByte, taskKernel->getCudaStream())(args...);
+                CUDA_CHECK_KERNEL_MSG(
+                    cuplaGetLastError(),
+                    std::string("Last error after kernel launch ") + kernelInfo);
+                CUDA_CHECK_KERNEL_MSG(
+                    cuplaDeviceSynchronize(),
+                    std::string("Crash after kernel launch ") + kernelInfo);
+                taskKernel->activateChecks();
+                CUDA_CHECK_KERNEL_MSG(
+                    cuplaDeviceSynchronize(),
+                    std::string("Crash after kernel activation ") + kernelInfo);
+            }
+
+            template<typename... T_Args>
+            HINLINE void operator()(T_Args const&... args)
+            {
+                return static_cast<const KernelStarter&>(*this)(args...);
+            }
+
+            /** @} */
+        };
+
+
+        /** creates a kernel object
          *
-         * @{
+         * @tparam T_KernelFunctor type of the kernel functor
+         * @param kernelFunctor instance of the functor
+         * @param file file name (for debug)
+         * @param line line number in the file (for debug)
          */
-        template<
-            typename ... T_Args
-        >
-        HINLINE
-        void
-        operator()(
-            T_Args const & ... args
-        ) const
-        {
-
-            std::string const kernelName = typeid( m_kernel.m_kernelFunctor ).name();
-            std::string const kernelInfo = kernelName +
-                std::string( " [" ) + m_kernel.m_file + std::string( ":" ) +
-                std::to_string( m_kernel.m_line ) + std::string( " ]" );
-
-            CUDA_CHECK_KERNEL_MSG(
-                cudaDeviceSynchronize( ),
-                std::string( "Crash before kernel call " ) + kernelInfo
-            );
-
-            pmacc::TaskKernel* taskKernel = pmacc::Environment<>::get().Factory().createTaskKernel(
-                typeid( kernelName ).name()
-            );
-
-            DataSpace<
-                traits::GetNComponents<
-                    T_VectorGrid
-                >::value
-            > gridExtent( m_gridExtent );
-
-            DataSpace<
-                traits::GetNComponents<
-                    T_VectorBlock
-                >::value
-            > blockExtent( m_blockExtent );
-
-            CUPLA_KERNEL( typename T_Kernel::KernelType )(
-                gridExtent.toDim3(),
-                blockExtent.toDim3(),
-                m_sharedMemByte,
-                taskKernel->getCudaStream()
-            )(
-                args ...
-            );
-            CUDA_CHECK_KERNEL_MSG(
-                cudaGetLastError( ),
-                std::string( "Last error after kernel launch " ) + kernelInfo
-            );
-            CUDA_CHECK_KERNEL_MSG(
-                cudaDeviceSynchronize( ),
-                std::string( "Crash after kernel launch " ) + kernelInfo
-            );
-            taskKernel->activateChecks( );
-            CUDA_CHECK_KERNEL_MSG(
-                cudaDeviceSynchronize( ),
-                std::string(  "Crash after kernel activation" ) + kernelInfo
-            );
-        }
-
-        template<
-            typename ... T_Args
-        >
-        HINLINE
-        void
-        operator()(
-            T_Args const &... args
-        )
+        template<typename T_KernelFunctor>
+        auto kernel(
+            T_KernelFunctor const& kernelFunctor,
+            std::string const& file = std::string(),
+            size_t const line = 0) -> Kernel<T_KernelFunctor>
         {
-            return static_cast< const KernelStarter & >(*this)( args ... );
+            return Kernel<T_KernelFunctor>(kernelFunctor, file, line);
         }
-
-        /** @} */
-
-    };
-
-
-    /** creates a kernel object
-     *
-     * @tparam T_KernelFunctor type of the kernel functor
-     * @param kernelFunctor instance of the functor
-     * @param file file name (for debug)
-     * @param line line number in the file (for debug)
-     */
-    template< typename T_KernelFunctor >
-    auto kernel(
-        T_KernelFunctor const & kernelFunctor,
-        std::string const & file = std::string(),
-        size_t const line = 0
-    ) -> Kernel< T_KernelFunctor >
-    {
-        return Kernel< T_KernelFunctor >(
-            kernelFunctor,
-            file,
-            line
-        );
-    }
-} // namespace exec
+    } // namespace exec
 } // namespace pmacc
 
 
@@ -276,7 +211,7 @@ namespace exec
  *
  * @param ... instance of kernel functor
  */
-#define PMACC_KERNEL( ... ) ::pmacc::exec::kernel( __VA_ARGS__, __FILE__,  static_cast< size_t >( __LINE__ ) )
+#define PMACC_KERNEL(...) ::pmacc::exec::kernel(__VA_ARGS__, __FILE__, static_cast<size_t>(__LINE__))
 
 
 #include "pmacc/eventSystem/events/kernelEvents.tpp"
diff --git a/include/pmacc/eventSystem/events/kernelEvents.tpp b/include/pmacc/eventSystem/events/kernelEvents.tpp
index 9efdd38e2c..30a4fd6c88 100644
--- a/include/pmacc/eventSystem/events/kernelEvents.tpp
+++ b/include/pmacc/eventSystem/events/kernelEvents.tpp
@@ -1,4 +1,4 @@
-/* Copyright 2016-2020 Rene Widera
+/* Copyright 2016-2021 Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -28,36 +28,16 @@
 
 namespace pmacc
 {
-namespace exec
-{
-    template< typename T_KernelFunctor >
-    template<
-        typename T_VectorGrid,
-        typename T_VectorBlock
-    >
-    HINLINE
-    auto
-    Kernel< T_KernelFunctor >::operator()(
-        T_VectorGrid const & gridExtent,
-        T_VectorBlock const & blockExtent,
-        size_t const sharedMemByte
-    ) const
-    -> KernelStarter<
-        Kernel,
-        T_VectorGrid,
-        T_VectorBlock
-    >
+    namespace exec
     {
-        return KernelStarter<
-            Kernel,
-            T_VectorGrid,
-            T_VectorBlock
-        >(
-            *this,
-            gridExtent,
-            blockExtent,
-            sharedMemByte
-        );
-    }
-} // namespace exec
+        template<typename T_KernelFunctor>
+        template<typename T_VectorGrid, typename T_VectorBlock>
+        HINLINE auto Kernel<T_KernelFunctor>::operator()(
+            T_VectorGrid const& gridExtent,
+            T_VectorBlock const& blockExtent,
+            size_t const sharedMemByte) const -> KernelStarter<Kernel, T_VectorGrid, T_VectorBlock>
+        {
+            return KernelStarter<Kernel, T_VectorGrid, T_VectorBlock>(*this, gridExtent, blockExtent, sharedMemByte);
+        }
+    } // namespace exec
 } // namespace pmacc
diff --git a/include/pmacc/eventSystem/streams/EventStream.hpp b/include/pmacc/eventSystem/streams/EventStream.hpp
index 4919badec5..784b5bd1ab 100644
--- a/include/pmacc/eventSystem/streams/EventStream.hpp
+++ b/include/pmacc/eventSystem/streams/EventStream.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Felix Schmitt, Rene Widera, Benjamin Worpitz
+/* Copyright 2013-2021 Felix Schmitt, Rene Widera, Benjamin Worpitz
  *
  * This file is part of PMacc.
  *
@@ -25,57 +25,54 @@
 #include "pmacc/types.hpp"
 
 
-
 namespace pmacc
 {
-
-/**
- * Wrapper for a single cuda stream.
- * Allows recording cuda events on the stream.
- */
-class EventStream
-{
-public:
-
     /**
-     * Constructor.
-     * Creates the cudaStream_t object.
+     * Wrapper for a single cupla stream.
+     * Allows recording cupla events on the stream.
      */
-    EventStream() : stream(nullptr)
+    class EventStream
     {
-        CUDA_CHECK(cudaStreamCreate(&stream));
-    }
+    public:
+        /**
+         * Constructor.
+         * Creates the cuplaStream_t object.
+         */
+        EventStream() : stream(nullptr)
+        {
+            CUDA_CHECK(cuplaStreamCreate(&stream));
+        }
 
-    /**
-     * Destructor.
-     * Waits for the stream to finish and destroys it.
-     */
-    virtual ~EventStream()
-    {
-        // wait for all kernels in stream to finish
-        CUDA_CHECK_NO_EXCEPT(cudaStreamSynchronize(stream));
-        CUDA_CHECK_NO_EXCEPT(cudaStreamDestroy(stream));
-    }
+        /**
+         * Destructor.
+         * Waits for the stream to finish and destroys it.
+         */
+        virtual ~EventStream()
+        {
+            // wait for all kernels in stream to finish
+            CUDA_CHECK_NO_EXCEPT(cuplaStreamSynchronize(stream));
+            CUDA_CHECK_NO_EXCEPT(cuplaStreamDestroy(stream));
+        }
 
-    /**
-     * Returns the cudaStream_t object associated with this EventStream.
-     * @return the internal cuda stream object
-     */
-    cudaStream_t getCudaStream() const
-    {
-        return stream;
-    }
+        /**
+         * Returns the cuplaStream_t object associated with this EventStream.
+         * @return the internal cupla stream object
+         */
+        cuplaStream_t getCudaStream() const
+        {
+            return stream;
+        }
 
-    void waitOn(const CudaEventHandle& ev)
-    {
-        if (this->stream != ev.getStream())
+        void waitOn(const CudaEventHandle& ev)
         {
-            CUDA_CHECK(cudaStreamWaitEvent(this->getCudaStream(), *ev, 0));
+            if(this->stream != ev.getStream())
+            {
+                CUDA_CHECK(cuplaStreamWaitEvent(this->getCudaStream(), *ev, 0));
+            }
         }
-    }
 
-private:
-    cudaStream_t stream;
-};
+    private:
+        cuplaStream_t stream;
+    };
 
-}
+} // namespace pmacc
diff --git a/include/pmacc/eventSystem/streams/StreamController.hpp b/include/pmacc/eventSystem/streams/StreamController.hpp
index 2148eacda9..be6c654055 100644
--- a/include/pmacc/eventSystem/streams/StreamController.hpp
+++ b/include/pmacc/eventSystem/streams/StreamController.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Felix Schmitt, Rene Widera, Wolfgang Hoenig,
+/* Copyright 2013-2021 Felix Schmitt, Rene Widera, Wolfgang Hoenig,
  *                     Benjamin Worpitz
  *
  * This file is part of PMacc.
@@ -28,7 +28,6 @@
 #include "pmacc/Environment.def"
 
 
-
 #include <string>
 #include <stdexcept>
 #include <vector>
@@ -42,7 +41,6 @@ namespace pmacc
     class StreamController
     {
     public:
-
         /**
          * Returns a pointer to the next EventStream in the controller's queue.
          * @return pointer to next EventStream
@@ -50,10 +48,11 @@ namespace pmacc
         EventStream* getNextStream()
         {
             if(!isActivated)
-                throw std::runtime_error(std::string("StreamController is not activated but getNextStream() was called"));
+                throw std::runtime_error(
+                    std::string("StreamController is not activated but getNextStream() was called"));
             size_t oldIndex = currentStreamIndex;
             currentStreamIndex++;
-            if (currentStreamIndex == streams.size())
+            if(currentStreamIndex == streams.size())
                 currentStreamIndex = 0;
 
             return streams[oldIndex];
@@ -65,8 +64,7 @@ namespace pmacc
          */
         virtual ~StreamController()
         {
-
-            for (size_t i = 0; i < streams.size(); i++)
+            for(size_t i = 0; i < streams.size(); i++)
             {
                 __delete(streams[i]);
             }
@@ -74,8 +72,8 @@ namespace pmacc
 
             /* This is the single point in PIC where ALL CUDA work must be finished. */
             /* Accessing CUDA objects after this point may fail! */
-            CUDA_CHECK_NO_EXCEPT(cudaDeviceSynchronize());
-            CUDA_CHECK_NO_EXCEPT(cudaDeviceReset());
+            CUDA_CHECK_NO_EXCEPT(cuplaDeviceSynchronize());
+            CUDA_CHECK_NO_EXCEPT(cuplaDeviceReset());
         }
 
         /**
@@ -84,7 +82,7 @@ namespace pmacc
          */
         void addStreams(size_t count)
         {
-            for (size_t i = 0; i < count; i++)
+            for(size_t i = 0; i < count; i++)
             {
                 streams.push_back(new EventStream());
             }
@@ -97,7 +95,7 @@ namespace pmacc
         void activate()
         {
             addStreams(1);
-            isActivated=true;
+            isActivated = true;
         }
 
         /**
@@ -110,13 +108,12 @@ namespace pmacc
         }
 
     private:
-
         friend struct detail::Environment;
 
         /**
          * Constructor.
          */
-        StreamController() : isActivated(false),currentStreamIndex(0)
+        StreamController() : isActivated(false), currentStreamIndex(0)
         {
         }
 
@@ -134,7 +131,6 @@ namespace pmacc
         std::vector<EventStream*> streams;
         size_t currentStreamIndex;
         bool isActivated;
-
     };
 
-} //namespace pmacc
+} // namespace pmacc
diff --git a/include/pmacc/eventSystem/tasks/Factory.hpp b/include/pmacc/eventSystem/tasks/Factory.hpp
index 22440e8564..b4a5a44a48 100644
--- a/include/pmacc/eventSystem/tasks/Factory.hpp
+++ b/include/pmacc/eventSystem/tasks/Factory.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Felix Schmitt, Rene Widera, Wolfgang Hoenig,
+/* Copyright 2013-2021 Felix Schmitt, Rene Widera, Wolfgang Hoenig,
  *                     Benjamin Worpitz
  *
  * This file is part of PMacc.
@@ -30,13 +30,13 @@
 
 namespace pmacc
 {
-    template <class TYPE, unsigned DIM>
+    template<class TYPE, unsigned DIM>
     class HostBuffer;
 
-    template <class TYPE, unsigned DIM>
+    template<class TYPE, unsigned DIM>
     class DeviceBuffer;
 
-    template <class TYPE, unsigned DIM>
+    template<class TYPE, unsigned DIM>
     class Exchange;
 
     class TaskKernel;
@@ -49,110 +49,126 @@ namespace pmacc
     class Factory
     {
     public:
-
         /**
          * creates a TaskCopyHostToDevice
          * @param src HostBuffer to copy data from
          * @param dst DeviceBuffer to copy data to
-         * @param registeringTask optional pointer to an ITask which should be registered at the new task as an observer
+         * @param registeringTask optional pointer to an ITask which should be registered at the new task as an
+         * observer
          */
-        template <class TYPE, unsigned DIM>
-        EventTask createTaskCopyHostToDevice(HostBuffer<TYPE, DIM>& src, DeviceBuffer<TYPE, DIM>& dst,
-        ITask *registeringTask = nullptr);
+        template<class TYPE, unsigned DIM>
+        EventTask createTaskCopyHostToDevice(
+            HostBuffer<TYPE, DIM>& src,
+            DeviceBuffer<TYPE, DIM>& dst,
+            ITask* registeringTask = nullptr);
 
         /**
          * creates a TaskCopyDeviceToHost
          * @param src DeviceBuffer to copy data from
          * @param dst HostBuffer to copy data to
-         * @param registeringTask optional pointer to an ITask which should be registered at the new task as an observer
+         * @param registeringTask optional pointer to an ITask which should be registered at the new task as an
+         * observer
          */
-        template <class TYPE, unsigned DIM>
-        EventTask createTaskCopyDeviceToHost(DeviceBuffer<TYPE, DIM>& src,
-        HostBuffer<TYPE, DIM>& dst,
-        ITask *registeringTask = nullptr);
+        template<class TYPE, unsigned DIM>
+        EventTask createTaskCopyDeviceToHost(
+            DeviceBuffer<TYPE, DIM>& src,
+            HostBuffer<TYPE, DIM>& dst,
+            ITask* registeringTask = nullptr);
 
         /**
          * creates a TaskCopyDeviceToDevice
          * @param src DeviceBuffer to copy data from
          * @param dst DeviceBuffer to copy data to
-         * @param registeringTask optional pointer to an ITask which should be registered at the new task as an observer
+         * @param registeringTask optional pointer to an ITask which should be registered at the new task as an
+         * observer
          */
-        template <class TYPE, unsigned DIM>
-        EventTask createTaskCopyDeviceToDevice( DeviceBuffer<TYPE, DIM>& src, DeviceBuffer<TYPE, DIM>& dst,
-        ITask *registeringTask = nullptr);
+        template<class TYPE, unsigned DIM>
+        EventTask createTaskCopyDeviceToDevice(
+            DeviceBuffer<TYPE, DIM>& src,
+            DeviceBuffer<TYPE, DIM>& dst,
+            ITask* registeringTask = nullptr);
 
         /**
          * Creates a TaskReceive.
          * @param ex Exchange to create new TaskReceive with
-         * @param registeringTask optional pointer to an ITask which should be registered at the new task as an observer
+         * @param registeringTask optional pointer to an ITask which should be registered at the new task as an
+         * observer
          */
-        template <class TYPE, unsigned DIM>
-        EventTask createTaskReceive(Exchange<TYPE, DIM> &ex,
-        ITask *registeringTask = nullptr);
+        template<class TYPE, unsigned DIM>
+        EventTask createTaskReceive(Exchange<TYPE, DIM>& ex, ITask* registeringTask = nullptr);
 
         /**
          * Creates a TaskSend.
          * @param ex Exchange to create new TaskSend with
-         * @param registeringTask optional pointer to an ITask which should be registered at the new task as an observer
+         * @param registeringTask optional pointer to an ITask which should be registered at the new task as an
+         * observer
          */
-        template <class TYPE, unsigned DIM>
-        EventTask createTaskSend(Exchange<TYPE, DIM> &ex,
-        ITask *registeringTask = nullptr);
+        template<class TYPE, unsigned DIM>
+        EventTask createTaskSend(Exchange<TYPE, DIM>& ex, ITask* registeringTask = nullptr);
 
         /**
          * Creates a TaskSendMPI.
          * @param exchange Exchange to create new TaskSendMPI with
-         * @param registeringTask optional pointer to an ITask which should be registered at the new task as an observer
+         * @param registeringTask optional pointer to an ITask which should be registered at the new task as an
+         * observer
          */
-        template <class TYPE, unsigned DIM>
-        EventTask createTaskSendMPI(Exchange<TYPE, DIM> *ex,
-        ITask *registeringTask = nullptr);
+        template<class TYPE, unsigned DIM>
+        EventTask createTaskSendMPI(Exchange<TYPE, DIM>* ex, ITask* registeringTask = nullptr);
 
         /**
          * Creates a TaskReceiveMPI.
          * @param ex Exchange to create new TaskReceiveMPI with
-         * @param registeringTask optional pointer to an ITask which should be registered at the new task as an observer
+         * @param registeringTask optional pointer to an ITask which should be registered at the new task as an
+         * observer
          */
-        template <class TYPE, unsigned DIM>
-        EventTask createTaskReceiveMPI(Exchange<TYPE, DIM> *ex,
-        ITask *registeringTask = nullptr);
+        template<class TYPE, unsigned DIM>
+        EventTask createTaskReceiveMPI(Exchange<TYPE, DIM>* ex, ITask* registeringTask = nullptr);
 
         /**
          * Creates a new TaskSetValue.
          * @param dst destination DeviceBuffer to set value on
          * @param value value to be set in the DeviceBuffer
-         * @param registeringTask optional pointer to an ITask which should be registered at the new task as an observer
+         * @param registeringTask optional pointer to an ITask which should be registered at the new task as an
+         * observer
          */
-        template <class TYPE, unsigned DIM>
-        EventTask createTaskSetValue(DeviceBuffer<TYPE, DIM>& dst, const TYPE& value,
-        ITask *registeringTask = nullptr);
+        template<class TYPE, unsigned DIM>
+        EventTask createTaskSetValue(
+            DeviceBuffer<TYPE, DIM>& dst,
+            const TYPE& value,
+            ITask* registeringTask = nullptr);
 
         /**
          * Creates a new TaskSetCurrentSizeOnDevice.
          * @param dst destination DeviceBuffer to set current size on
          * @param size size to be set on DeviceBuffer
-         * @param registeringTask optional pointer to an ITask which should be registered at the new task as an observer
+         * @param registeringTask optional pointer to an ITask which should be registered at the new task as an
+         * observer
          */
-        template <class TYPE, unsigned DIM>
-        EventTask createTaskSetCurrentSizeOnDevice(DeviceBuffer<TYPE, DIM>& dst, size_t size,
-        ITask *registeringTask = nullptr);
+        template<class TYPE, unsigned DIM>
+        EventTask createTaskSetCurrentSizeOnDevice(
+            DeviceBuffer<TYPE, DIM>& dst,
+            size_t size,
+            ITask* registeringTask = nullptr);
 
         /**
          * Creates a new TaskGetCurrentSizeFromDevic.
          * @param buffer DeviceBuffer to get current size from
-         * @param registeringTask optional pointer to an ITask which should be registered at the new task as an observer
+         * @param registeringTask optional pointer to an ITask which should be registered at the new task as an
+         * observer
          */
-        template <class TYPE, unsigned DIM>
-        EventTask createTaskGetCurrentSizeFromDevice(DeviceBuffer<TYPE, DIM>& buffer,
-        ITask *registeringTask = nullptr);
+        template<class TYPE, unsigned DIM>
+        EventTask createTaskGetCurrentSizeFromDevice(
+            DeviceBuffer<TYPE, DIM>& buffer,
+            ITask* registeringTask = nullptr);
 
         /**
          * Creates a new TaskKernel.
          * @param kernelname name of the kernel which should be called
-         * @param registeringTask optional pointer to an ITask which should be registered at the new task as an observer
+         * @param registeringTask optional pointer to an ITask which should be registered at the new task as an
+         * observer
          * @return the newly created TaskKernel
          */
-        TaskKernel* createTaskKernel(std::string kernelname, ITask *registeringTask = nullptr);
+        TaskKernel* createTaskKernel(std::string kernelname, ITask* registeringTask = nullptr);
 
         /**
          * Starts a task by initialising it and adding it to the Manager's queue.
@@ -160,23 +176,20 @@ namespace pmacc
          * @param task the ITask to start
          * @param registeringTask optional task which can be registered as an observer for task
          */
-        EventTask startTask(ITask& task, ITask *registeringTask);
+        EventTask startTask(ITask& task, ITask* registeringTask);
 
     private:
-
         friend struct detail::Environment;
 
-        Factory() {};
+        Factory(){};
 
-        Factory(const Factory&) { };
+        Factory(const Factory&){};
 
         static Factory& getInstance()
         {
             static Factory instance;
             return instance;
         }
-
     };
 
-} //namespace pmacc
-
+} // namespace pmacc
diff --git a/include/pmacc/eventSystem/tasks/Factory.tpp b/include/pmacc/eventSystem/tasks/Factory.tpp
index 428e99c9cc..07e97f8538 100644
--- a/include/pmacc/eventSystem/tasks/Factory.tpp
+++ b/include/pmacc/eventSystem/tasks/Factory.tpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Rene Widera, Benjamin Worpitz
+/* Copyright 2013-2021 Rene Widera, Benjamin Worpitz
  *
  * This file is part of PMacc.
  *
@@ -43,19 +43,19 @@
 
 namespace pmacc
 {
-
     /**
      * creates a TaskCopyHostToDevice
      * @param src HostBuffer to copy data from
      * @param dst DeviceBuffer to copy data to
      * @param registeringTask optional pointer to an ITask which should be registered at the new task as an observer
      */
-    template <class TYPE, unsigned DIM>
-    inline EventTask Factory::createTaskCopyHostToDevice(HostBuffer<TYPE, DIM>& src, DeviceBuffer<TYPE, DIM>& dst,
-    ITask *registeringTask)
+    template<class TYPE, unsigned DIM>
+    inline EventTask Factory::createTaskCopyHostToDevice(
+        HostBuffer<TYPE, DIM>& src,
+        DeviceBuffer<TYPE, DIM>& dst,
+        ITask* registeringTask)
     {
-
-        TaskCopyHostToDevice<TYPE, DIM>* task = new TaskCopyHostToDevice<TYPE, DIM > (src, dst);
+        TaskCopyHostToDevice<TYPE, DIM>* task = new TaskCopyHostToDevice<TYPE, DIM>(src, dst);
 
         return startTask(*task, registeringTask);
     }
@@ -66,12 +66,13 @@ namespace pmacc
      * @param dst HostBuffer to copy data to
      * @param registeringTask optional pointer to an ITask which should be registered at the new task as an observer
      */
-    template <class TYPE, unsigned DIM>
-    inline EventTask Factory::createTaskCopyDeviceToHost(DeviceBuffer<TYPE, DIM>& src,
-    HostBuffer<TYPE, DIM>& dst,
-    ITask *registeringTask)
+    template<class TYPE, unsigned DIM>
+    inline EventTask Factory::createTaskCopyDeviceToHost(
+        DeviceBuffer<TYPE, DIM>& src,
+        HostBuffer<TYPE, DIM>& dst,
+        ITask* registeringTask)
     {
-        TaskCopyDeviceToHost<TYPE, DIM>* task = new TaskCopyDeviceToHost<TYPE, DIM > (src, dst);
+        TaskCopyDeviceToHost<TYPE, DIM>* task = new TaskCopyDeviceToHost<TYPE, DIM>(src, dst);
 
         return startTask(*task, registeringTask);
     }
@@ -82,11 +83,13 @@ namespace pmacc
      * @param dst DeviceBuffer to copy data to
      * @param registeringTask optional pointer to an ITask which should be registered at the new task as an observer
      */
-    template <class TYPE, unsigned DIM>
-    inline EventTask Factory::createTaskCopyDeviceToDevice( DeviceBuffer<TYPE, DIM>& src, DeviceBuffer<TYPE, DIM>& dst,
-    ITask *registeringTask)
+    template<class TYPE, unsigned DIM>
+    inline EventTask Factory::createTaskCopyDeviceToDevice(
+        DeviceBuffer<TYPE, DIM>& src,
+        DeviceBuffer<TYPE, DIM>& dst,
+        ITask* registeringTask)
     {
-        TaskCopyDeviceToDevice<TYPE, DIM>* task = new TaskCopyDeviceToDevice<TYPE, DIM > (src, dst);
+        TaskCopyDeviceToDevice<TYPE, DIM>* task = new TaskCopyDeviceToDevice<TYPE, DIM>(src, dst);
 
         return startTask(*task, registeringTask);
     }
@@ -96,11 +99,10 @@ namespace pmacc
      * @param ex Exchange to create new TaskReceive with
      * @param registeringTask optional pointer to an ITask which should be registered at the new task as an observer
      */
-    template <class TYPE, unsigned DIM>
-    inline EventTask Factory::createTaskReceive(Exchange<TYPE, DIM> &ex,
-    ITask *registeringTask)
+    template<class TYPE, unsigned DIM>
+    inline EventTask Factory::createTaskReceive(Exchange<TYPE, DIM>& ex, ITask* registeringTask)
     {
-        TaskReceive<TYPE, DIM>* task = new TaskReceive<TYPE, DIM > (ex);
+        TaskReceive<TYPE, DIM>* task = new TaskReceive<TYPE, DIM>(ex);
 
         return startTask(*task, registeringTask);
     }
@@ -110,11 +112,10 @@ namespace pmacc
      * @param ex Exchange to create new TaskSend with
      * @param registeringTask optional pointer to an ITask which should be registered at the new task as an observer
      */
-    template <class TYPE, unsigned DIM>
-    inline EventTask Factory::createTaskSend(Exchange<TYPE, DIM> &ex,
-    ITask *registeringTask)
+    template<class TYPE, unsigned DIM>
+    inline EventTask Factory::createTaskSend(Exchange<TYPE, DIM>& ex, ITask* registeringTask)
     {
-        TaskSend<TYPE, DIM>* task = new TaskSend<TYPE, DIM > (ex);
+        TaskSend<TYPE, DIM>* task = new TaskSend<TYPE, DIM>(ex);
 
         return startTask(*task, registeringTask);
     }
@@ -124,11 +125,10 @@ namespace pmacc
      * @param exchange Exchange to create new TaskSendMPI with
      * @param registeringTask optional pointer to an ITask which should be registered at the new task as an observer
      */
-    template <class TYPE, unsigned DIM>
-    inline EventTask Factory::createTaskSendMPI(Exchange<TYPE, DIM> *ex,
-    ITask *registeringTask)
+    template<class TYPE, unsigned DIM>
+    inline EventTask Factory::createTaskSendMPI(Exchange<TYPE, DIM>* ex, ITask* registeringTask)
     {
-        TaskSendMPI<TYPE, DIM>* task = new TaskSendMPI<TYPE, DIM > (ex);
+        TaskSendMPI<TYPE, DIM>* task = new TaskSendMPI<TYPE, DIM>(ex);
 
         return startTask(*task, registeringTask);
     }
@@ -138,11 +138,10 @@ namespace pmacc
      * @param ex Exchange to create new TaskReceiveMPI with
      * @param registeringTask optional pointer to an ITask which should be registered at the new task as an observer
      */
-    template <class TYPE, unsigned DIM>
-    inline EventTask Factory::createTaskReceiveMPI(Exchange<TYPE, DIM> *ex,
-    ITask *registeringTask)
+    template<class TYPE, unsigned DIM>
+    inline EventTask Factory::createTaskReceiveMPI(Exchange<TYPE, DIM>* ex, ITask* registeringTask)
     {
-        TaskReceiveMPI<TYPE, DIM>* task = new TaskReceiveMPI<TYPE, DIM > (ex);
+        TaskReceiveMPI<TYPE, DIM>* task = new TaskReceiveMPI<TYPE, DIM>(ex);
 
         return startTask(*task, registeringTask);
     }
@@ -153,20 +152,21 @@ namespace pmacc
      * @param value value to be set in the DeviceBuffer
      * @param registeringTask optional pointer to an ITask which should be registered at the new task as an observer
      */
-    template <class TYPE, unsigned DIM>
-    inline EventTask Factory::createTaskSetValue(DeviceBuffer<TYPE, DIM>& dst,const TYPE& value,
-    ITask *registeringTask)
+    template<class TYPE, unsigned DIM>
+    inline EventTask Factory::createTaskSetValue(
+        DeviceBuffer<TYPE, DIM>& dst,
+        const TYPE& value,
+        ITask* registeringTask)
     {
-
         /* sizeof(TYPE)<256 use fast set method for small data and slow method for big data
          * the rest of 256bytes are reserved for other kernel parameter
          */
         enum
         {
-            isSmall = (sizeof (TYPE) <= 128)
-        }; //if we use const variable the compiler create warnings
+            isSmall = (sizeof(TYPE) <= 128)
+        }; // if we use a const variable the compiler creates warnings
 
-        TaskSetValue<TYPE, DIM, isSmall >* task = new TaskSetValue<TYPE, DIM, isSmall > (dst, value);
+        TaskSetValue<TYPE, DIM, isSmall>* task = new TaskSetValue<TYPE, DIM, isSmall>(dst, value);
 
         return startTask(*task, registeringTask);
     }
@@ -177,11 +177,13 @@ namespace pmacc
      * @param size size to be set on DeviceBuffer
      * @param registeringTask optional pointer to an ITask which should be registered at the new task as an observer
      */
-    template <class TYPE, unsigned DIM>
-    inline EventTask Factory::createTaskSetCurrentSizeOnDevice(DeviceBuffer<TYPE, DIM>& dst, size_t size,
-    ITask *registeringTask)
+    template<class TYPE, unsigned DIM>
+    inline EventTask Factory::createTaskSetCurrentSizeOnDevice(
+        DeviceBuffer<TYPE, DIM>& dst,
+        size_t size,
+        ITask* registeringTask)
     {
-        TaskSetCurrentSizeOnDevice<TYPE, DIM>* task = new TaskSetCurrentSizeOnDevice<TYPE, DIM > (dst, size);
+        TaskSetCurrentSizeOnDevice<TYPE, DIM>* task = new TaskSetCurrentSizeOnDevice<TYPE, DIM>(dst, size);
 
         return startTask(*task, registeringTask);
     }
@@ -191,11 +193,12 @@ namespace pmacc
      * @param buffer DeviceBuffer to get current size from
      * @param registeringTask optional pointer to an ITask which should be registered at the new task as an observer
      */
-    template <class TYPE, unsigned DIM>
-    inline EventTask Factory::createTaskGetCurrentSizeFromDevice(DeviceBuffer<TYPE, DIM>& buffer,
-    ITask *registeringTask)
+    template<class TYPE, unsigned DIM>
+    inline EventTask Factory::createTaskGetCurrentSizeFromDevice(
+        DeviceBuffer<TYPE, DIM>& buffer,
+        ITask* registeringTask)
     {
-        TaskGetCurrentSizeFromDevice<TYPE, DIM>* task = new TaskGetCurrentSizeFromDevice<TYPE, DIM > (buffer);
+        TaskGetCurrentSizeFromDevice<TYPE, DIM>* task = new TaskGetCurrentSizeFromDevice<TYPE, DIM>(buffer);
 
         return startTask(*task, registeringTask);
     }
@@ -206,20 +209,21 @@ namespace pmacc
      * @param registeringTask optional pointer to an ITask which should be registered at the new task as an observer
      * @return the newly created TaskKernel
      */
-    inline TaskKernel* Factory::createTaskKernel(std::string kernelname, ITask *registeringTask)
+    inline TaskKernel* Factory::createTaskKernel(std::string kernelname, ITask* registeringTask)
     {
         TaskKernel* task = new TaskKernel(kernelname);
 
-        if (registeringTask != nullptr)
+        if(registeringTask != nullptr)
             task->addObserver(registeringTask);
 
         return task;
     }
 
 
-    inline EventTask Factory::startTask(ITask& task, ITask *registeringTask )
+    inline EventTask Factory::startTask(ITask& task, ITask* registeringTask)
     {
-        if (registeringTask != nullptr){
+        if(registeringTask != nullptr)
+        {
             task.addObserver(registeringTask);
         }
         EventTask event(task.getId());
@@ -232,7 +236,4 @@ namespace pmacc
     }
 
 
-} //namespace pmacc
-
-
-
+} // namespace pmacc
diff --git a/include/pmacc/eventSystem/tasks/ITask.hpp b/include/pmacc/eventSystem/tasks/ITask.hpp
index 0f4f1eed80..e8b28402a8 100644
--- a/include/pmacc/eventSystem/tasks/ITask.hpp
+++ b/include/pmacc/eventSystem/tasks/ITask.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Felix Schmitt, Rene Widera, Wolfgang Hoenig,
+/* Copyright 2013-2021 Felix Schmitt, Rene Widera, Wolfgang Hoenig,
  *                     Benjamin Worpitz
  *
  * This file is part of PMacc.
@@ -36,19 +36,23 @@ namespace pmacc
     /**
      * Abstract base class for all tasks.
      */
-    class ITask : public EventNotify, public IEvent
+    class ITask
+        : public EventNotify
+        , public IEvent
     {
     public:
-
         enum TaskType
         {
-            TASK_UNKNOWN, TASK_CUDA, TASK_MPI, TASK_HOST
+            TASK_UNKNOWN,
+            TASK_DEVICE,
+            TASK_MPI,
+            TASK_HOST
         };
 
         /**
          * constructor
          */
-        ITask(): myType(ITask::TASK_UNKNOWN)
+        ITask() : myType(ITask::TASK_UNKNOWN)
         {
             // task id 0 is reserved for invalid
             static id_t globalId = 1;
@@ -69,7 +73,7 @@ namespace pmacc
          */
         bool execute()
         {
-            //std::cout << "execute: " << toString() << std::endl;
+            // std::cout << "execute: " << toString() << std::endl;
             return executeIntern();
         }
 
@@ -77,7 +81,7 @@ namespace pmacc
          * Initializes the task.
          * Must be called before adding the task to the Manager's queue.
          */
-        virtual void init()=0;
+        virtual void init() = 0;
 
         /**
          * Returns the unique id of this task.
@@ -116,6 +120,7 @@ namespace pmacc
          * @return a string naming this task
          */
         virtual std::string toString() = 0;
+
     protected:
         virtual bool executeIntern() = 0;
 
@@ -123,4 +128,4 @@ namespace pmacc
         TaskType myType;
     };
 
-} //namespace pmacc
+} // namespace pmacc
diff --git a/include/pmacc/eventSystem/tasks/MPITask.hpp b/include/pmacc/eventSystem/tasks/MPITask.hpp
index 542a5fcf2a..af5c7f9574 100644
--- a/include/pmacc/eventSystem/tasks/MPITask.hpp
+++ b/include/pmacc/eventSystem/tasks/MPITask.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Felix Schmitt, Rene Widera, Benjamin Worpitz
+/* Copyright 2013-2021 Felix Schmitt, Rene Widera, Benjamin Worpitz
  *
  * This file is part of PMacc.
  *
@@ -27,21 +27,17 @@
 
 namespace pmacc
 {
-
     /**
      * Abstract base class for all tasks which depend on MPI communication.
      */
     class MPITask : public ITask
     {
     public:
-
         /**
          * Constructor.
          * Starts a MPI operation on the transaction system.
          */
-        MPITask() :
-        ITask(),
-        finished(false)
+        MPITask() : ITask(), finished(false)
         {
             this->setTaskType(ITask::TASK_MPI);
         }
@@ -54,7 +50,6 @@ namespace pmacc
         }
 
     protected:
-
         /**
          * Returns if the task is finished.
          *
@@ -72,7 +67,8 @@ namespace pmacc
         {
             finished = true;
         }
+
     private:
         bool finished;
     };
-}
+} // namespace pmacc
diff --git a/include/pmacc/eventSystem/tasks/StreamTask.hpp b/include/pmacc/eventSystem/tasks/StreamTask.hpp
index dce83d07ed..a816972a64 100644
--- a/include/pmacc/eventSystem/tasks/StreamTask.hpp
+++ b/include/pmacc/eventSystem/tasks/StreamTask.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Felix Schmitt, Rene Widera
+/* Copyright 2013-2021 Felix Schmitt, Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -29,12 +29,11 @@ namespace pmacc
     class EventStream;
 
     /**
-     * Abstract base class for all tasks which depend on cuda streams.
+     * Abstract base class for all tasks which depend on cupla streams.
      */
     class StreamTask : public ITask
     {
     public:
-
         /**
          * Constructor
          *
@@ -50,19 +49,19 @@ namespace pmacc
         }
 
         /**
-         * Returns the cuda event associated with this task.
+         * Returns the cupla event associated with this task.
          * An event has to be recorded or set before calling this.
          *
-         * @return the task's cuda event
+         * @return the task's cupla event
          */
         CudaEventHandle getCudaEventHandle() const;
 
         /**
          * Sets the
          *
-         * @param cudaEvent
+         * @param cuplaEvent event handle to store as this task's event
          */
-        void setCudaEventHandle(const CudaEventHandle& cudaEvent);
+        void setCudaEventHandle(const CudaEventHandle& cuplaEvent);
 
         /**
          * Returns if this task is finished.
@@ -86,25 +85,24 @@ namespace pmacc
         void setEventStream(EventStream* newStream);
 
         /**
-         * Returns the cuda stream of the underlying EventStream.
+         * Returns the cupla stream of the underlying EventStream.
          *
-         * @return the associated cuda stream
+         * @return the associated cupla stream
          */
-        cudaStream_t getCudaStream();
+        cuplaStream_t getCudaStream();
 
 
     protected:
-
         /**
          * Activates this task by recording an event on its stream.
          */
         inline void activate();
 
 
-        EventStream *stream;
-        CudaEventHandle cudaEvent;
+        EventStream* stream;
+        CudaEventHandle cuplaEvent;
         bool hasCudaEventHandle;
         bool alwaysFinished;
     };
 
-} //namespace pmacc
+} // namespace pmacc
diff --git a/include/pmacc/eventSystem/tasks/StreamTask.tpp b/include/pmacc/eventSystem/tasks/StreamTask.tpp
index 2a36779238..c328a28636 100644
--- a/include/pmacc/eventSystem/tasks/StreamTask.tpp
+++ b/include/pmacc/eventSystem/tasks/StreamTask.tpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Rene Widera, Benjamin Worpitz
+/* Copyright 2013-2021 Rene Widera, Benjamin Worpitz
  *
  * This file is part of PMacc.
  *
@@ -29,69 +29,64 @@
 
 namespace pmacc
 {
+    inline StreamTask::StreamTask() : ITask(), stream(nullptr), hasCudaEventHandle(false), alwaysFinished(false)
+    {
+        this->setTaskType(ITask::TASK_DEVICE);
+    }
 
-inline StreamTask::StreamTask( ) :
-ITask( ),
-stream( nullptr ),
-hasCudaEventHandle( false ),
-alwaysFinished( false )
-{
-    this->setTaskType( ITask::TASK_CUDA );
-}
-
-inline CudaEventHandle StreamTask::getCudaEventHandle( ) const
-{
-    PMACC_ASSERT( hasCudaEventHandle );
-    return cudaEvent;
-}
+    inline CudaEventHandle StreamTask::getCudaEventHandle() const
+    {
+        PMACC_ASSERT(hasCudaEventHandle);
+        return cuplaEvent;
+    }
 
-inline void StreamTask::setCudaEventHandle(const CudaEventHandle& cudaEvent )
-{
-    this->hasCudaEventHandle = true;
-    this->cudaEvent = cudaEvent;
-}
+    inline void StreamTask::setCudaEventHandle(const CudaEventHandle& cuplaEvent)
+    {
+        this->hasCudaEventHandle = true;
+        this->cuplaEvent = cuplaEvent;
+    }
 
-inline bool StreamTask::isFinished( )
-{
-    if ( alwaysFinished )
-        return true;
-    if ( hasCudaEventHandle )
+    inline bool StreamTask::isFinished()
     {
-        if ( cudaEvent.isFinished( ) )
-        {
-            alwaysFinished = true;
+        if(alwaysFinished)
             return true;
+        if(hasCudaEventHandle)
+        {
+            if(cuplaEvent.isFinished())
+            {
+                alwaysFinished = true;
+                return true;
+            }
         }
+        return false;
     }
-    return false;
-}
 
-inline EventStream* StreamTask::getEventStream( )
-{
-    if ( stream == nullptr )
-        stream = __getEventStream( TASK_CUDA );
-    return stream;
-}
+    inline EventStream* StreamTask::getEventStream()
+    {
+        if(stream == nullptr)
+            stream = __getEventStream(TASK_DEVICE);
+        return stream;
+    }
 
-inline void StreamTask::setEventStream( EventStream* newStream )
-{
-    PMACC_ASSERT( newStream != nullptr );
-    PMACC_ASSERT( stream == nullptr ); //it is only allowed to set a stream if no stream is set before
-    this->stream = newStream;
-}
+    inline void StreamTask::setEventStream(EventStream* newStream)
+    {
+        PMACC_ASSERT(newStream != nullptr);
+        PMACC_ASSERT(stream == nullptr); // a stream may only be set if none has been set before
+        this->stream = newStream;
+    }
 
-inline cudaStream_t StreamTask::getCudaStream( )
-{
-    if ( stream == nullptr )
-        stream = Environment<>::get( ).TransactionManager( ).getEventStream( TASK_CUDA );
-    return stream->getCudaStream( );
-}
+    inline cuplaStream_t StreamTask::getCudaStream()
+    {
+        if(stream == nullptr)
+            stream = Environment<>::get().TransactionManager().getEventStream(TASK_DEVICE);
+        return stream->getCudaStream();
+    }
 
-inline void StreamTask::activate( )
-{
-    cudaEvent = Environment<>::get().EventPool( ).pop( );
-    cudaEvent.recordEvent( getCudaStream( ) );
-    hasCudaEventHandle = true;
-}
+    inline void StreamTask::activate()
+    {
+        cuplaEvent = Environment<>::get().EventPool().pop();
+        cuplaEvent.recordEvent(getCudaStream());
+        hasCudaEventHandle = true;
+    }
 
-} //namespace pmacc
+} // namespace pmacc
diff --git a/include/pmacc/eventSystem/tasks/TaskCopyDeviceToDevice.hpp b/include/pmacc/eventSystem/tasks/TaskCopyDeviceToDevice.hpp
index fe64d8517c..c39a224459 100644
--- a/include/pmacc/eventSystem/tasks/TaskCopyDeviceToDevice.hpp
+++ b/include/pmacc/eventSystem/tasks/TaskCopyDeviceToDevice.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Felix Schmitt, Rene Widera, Wolfgang Hoenig,
+/* Copyright 2013-2021 Felix Schmitt, Rene Widera, Wolfgang Hoenig,
  *                     Benjamin Worpitz
  *
  * This file is part of PMacc.
@@ -28,23 +28,19 @@
 #include "pmacc/types.hpp"
 
 
-
 namespace pmacc
 {
-
-    template <class TYPE, unsigned DIM>
+    template<class TYPE, unsigned DIM>
     class DeviceBuffer;
 
-    template <class TYPE, unsigned DIM>
+    template<class TYPE, unsigned DIM>
     class TaskCopyDeviceToDeviceBase : public StreamTask
     {
     public:
-
-        TaskCopyDeviceToDeviceBase( DeviceBuffer<TYPE, DIM>& src, DeviceBuffer<TYPE, DIM>& dst) :
-        StreamTask()
+        TaskCopyDeviceToDeviceBase(DeviceBuffer<TYPE, DIM>& src, DeviceBuffer<TYPE, DIM>& dst) : StreamTask()
         {
-            this->source = & src;
-            this->destination =  & dst;
+            this->source = &src;
+            this->destination = &dst;
         }
 
         virtual ~TaskCopyDeviceToDeviceBase()
@@ -59,7 +55,6 @@ namespace pmacc
 
         void event(id_t, EventType, IEventData*)
         {
-
         }
 
         virtual void init()
@@ -67,7 +62,7 @@ namespace pmacc
             size_t current_size = source->getCurrentSize();
             destination->setCurrentSize(current_size);
             DataSpace<DIM> devCurrentSize = source->getCurrentDataSpace(current_size);
-            if (source->is1D() && destination->is1D())
+            if(source->is1D() && destination->is1D())
                 fastCopy(source->getPointer(), destination->getPointer(), devCurrentSize.productOfComponents());
             else
                 copy(devCurrentSize);
@@ -81,114 +76,99 @@ namespace pmacc
         }
 
     protected:
-
-        virtual void copy(DataSpace<DIM> &devCurrentSize) = 0;
+        virtual void copy(DataSpace<DIM>& devCurrentSize) = 0;
 
         void fastCopy(TYPE* src, TYPE* dst, size_t size)
         {
-            CUDA_CHECK(cudaMemcpyAsync(dst,
-                                       src,
-                                       size * sizeof (TYPE), cudaMemcpyDeviceToDevice,
-                                       this->getCudaStream()));
+            CUDA_CHECK(
+                cuplaMemcpyAsync(dst, src, size * sizeof(TYPE), cuplaMemcpyDeviceToDevice, this->getCudaStream()));
         }
 
-        DeviceBuffer<TYPE, DIM> *source;
-        DeviceBuffer<TYPE, DIM> *destination;
+        DeviceBuffer<TYPE, DIM>* source;
+        DeviceBuffer<TYPE, DIM>* destination;
     };
 
 
-    template <class TYPE, unsigned DIM>
+    template<class TYPE, unsigned DIM>
     class TaskCopyDeviceToDevice;
 
-    template <class TYPE>
+    template<class TYPE>
     class TaskCopyDeviceToDevice<TYPE, DIM1> : public TaskCopyDeviceToDeviceBase<TYPE, DIM1>
     {
     public:
-
-        TaskCopyDeviceToDevice(DeviceBuffer<TYPE, DIM1>& src, DeviceBuffer<TYPE, DIM1>& dst) :
-        TaskCopyDeviceToDeviceBase<TYPE, DIM1>(src, dst)
+        TaskCopyDeviceToDevice(DeviceBuffer<TYPE, DIM1>& src, DeviceBuffer<TYPE, DIM1>& dst)
+            : TaskCopyDeviceToDeviceBase<TYPE, DIM1>(src, dst)
         {
         }
 
     private:
-
-        virtual void copy(DataSpace<DIM1> &devCurrentSize)
+        virtual void copy(DataSpace<DIM1>& devCurrentSize)
         {
-
-            CUDA_CHECK(cudaMemcpyAsync(this->destination->getPointer(),
-                                       this->source->getPointer(),
-                                       devCurrentSize[0] * sizeof (TYPE), cudaMemcpyDeviceToDevice,
-                                       this->getCudaStream()));
+            CUDA_CHECK(cuplaMemcpyAsync(
+                this->destination->getPointer(),
+                this->source->getPointer(),
+                devCurrentSize[0] * sizeof(TYPE),
+                cuplaMemcpyDeviceToDevice,
+                this->getCudaStream()));
         }
-
     };
 
-    template <class TYPE>
+    template<class TYPE>
     class TaskCopyDeviceToDevice<TYPE, DIM2> : public TaskCopyDeviceToDeviceBase<TYPE, DIM2>
     {
     public:
-
-        TaskCopyDeviceToDevice( DeviceBuffer<TYPE, DIM2>& src, DeviceBuffer<TYPE, DIM2>& dst) :
-        TaskCopyDeviceToDeviceBase<TYPE, DIM2>(src, dst)
+        TaskCopyDeviceToDevice(DeviceBuffer<TYPE, DIM2>& src, DeviceBuffer<TYPE, DIM2>& dst)
+            : TaskCopyDeviceToDeviceBase<TYPE, DIM2>(src, dst)
         {
         }
 
     private:
-
-        virtual void copy(DataSpace<DIM2> &devCurrentSize)
+        virtual void copy(DataSpace<DIM2>& devCurrentSize)
         {
-            CUDA_CHECK(cudaMemcpy2DAsync(this->destination->getPointer(),
-                                         this->destination->getPitch(),
-                                         this->source->getPointer(),
-                                         this->source->getPitch(),
-                                         devCurrentSize[0] * sizeof (TYPE),
-                                         devCurrentSize[1],
-                                         cudaMemcpyDeviceToDevice,
-                                         this->getCudaStream()));
-
+            CUDA_CHECK(cuplaMemcpy2DAsync(
+                this->destination->getPointer(),
+                this->destination->getPitch(),
+                this->source->getPointer(),
+                this->source->getPitch(),
+                devCurrentSize[0] * sizeof(TYPE),
+                devCurrentSize[1],
+                cuplaMemcpyDeviceToDevice,
+                this->getCudaStream()));
         }
-
     };
 
-    template <class TYPE>
+    template<class TYPE>
     class TaskCopyDeviceToDevice<TYPE, DIM3> : public TaskCopyDeviceToDeviceBase<TYPE, DIM3>
     {
     public:
-
-        TaskCopyDeviceToDevice( DeviceBuffer<TYPE, DIM3>& src, DeviceBuffer<TYPE, DIM3>& dst) :
-        TaskCopyDeviceToDeviceBase<TYPE, DIM3>(src, dst)
+        TaskCopyDeviceToDevice(DeviceBuffer<TYPE, DIM3>& src, DeviceBuffer<TYPE, DIM3>& dst)
+            : TaskCopyDeviceToDeviceBase<TYPE, DIM3>(src, dst)
         {
         }
 
     private:
-
-        virtual void copy(DataSpace<DIM3> &devCurrentSize)
+        virtual void copy(DataSpace<DIM3>& devCurrentSize)
         {
-
-            cudaMemcpy3DParms params;
+            cuplaMemcpy3DParms params;
             params.srcArray = nullptr;
-            params.srcPos = make_cudaPos(
-                                         this->source->getOffset()[0] * sizeof (TYPE),
-                                         this->source->getOffset()[1],
-                                         this->source->getOffset()[2]);
+            params.srcPos = make_cuplaPos(
+                this->source->getOffset()[0] * sizeof(TYPE),
+                this->source->getOffset()[1],
+                this->source->getOffset()[2]);
             params.srcPtr = this->source->getCudaPitched();
 
             params.dstArray = nullptr;
-            params.dstPos = make_cudaPos(
-                                         this->destination->getOffset()[0] * sizeof (TYPE),
-                                         this->destination->getOffset()[1],
-                                         this->destination->getOffset()[2]);
+            params.dstPos = make_cuplaPos(
+                this->destination->getOffset()[0] * sizeof(TYPE),
+                this->destination->getOffset()[1],
+                this->destination->getOffset()[2]);
             ;
             params.dstPtr = this->destination->getCudaPitched();
 
-            params.extent = make_cudaExtent(
-                                            devCurrentSize[0] * sizeof (TYPE),
-                                            devCurrentSize[1],
-                                            devCurrentSize[2]);
-            params.kind = cudaMemcpyDeviceToDevice;
-            CUDA_CHECK(cudaMemcpy3DAsync(&params, this->getCudaStream()));
+            params.extent = make_cuplaExtent(devCurrentSize[0] * sizeof(TYPE), devCurrentSize[1], devCurrentSize[2]);
+            params.kind = cuplaMemcpyDeviceToDevice;
+            CUDA_CHECK(cuplaMemcpy3DAsync(&params, this->getCudaStream()));
         }
-
     };
 
-} //namespace pmacc
+} // namespace pmacc
diff --git a/include/pmacc/eventSystem/tasks/TaskCopyDeviceToHost.hpp b/include/pmacc/eventSystem/tasks/TaskCopyDeviceToHost.hpp
index 4ddd908e57..0e7bb501cd 100644
--- a/include/pmacc/eventSystem/tasks/TaskCopyDeviceToHost.hpp
+++ b/include/pmacc/eventSystem/tasks/TaskCopyDeviceToHost.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Felix Schmitt, Rene Widera, Wolfgang Hoenig,
+/* Copyright 2013-2021 Felix Schmitt, Rene Widera, Wolfgang Hoenig,
  *                     Benjamin Worpitz
  *
  * This file is part of PMacc.
@@ -27,27 +27,23 @@
 #include "pmacc/eventSystem/tasks/StreamTask.hpp"
 
 
-
 #include <iomanip>
 
 namespace pmacc
 {
-
-    template <class TYPE, unsigned DIM>
+    template<class TYPE, unsigned DIM>
     class HostBuffer;
-    template <class TYPE, unsigned DIM>
+    template<class TYPE, unsigned DIM>
     class DeviceBuffer;
 
-    template <class TYPE, unsigned DIM>
+    template<class TYPE, unsigned DIM>
     class TaskCopyDeviceToHostBase : public StreamTask
     {
     public:
-
-        TaskCopyDeviceToHostBase( DeviceBuffer<TYPE, DIM>& src, HostBuffer<TYPE, DIM>& dst) :
-        StreamTask()
+        TaskCopyDeviceToHostBase(DeviceBuffer<TYPE, DIM>& src, HostBuffer<TYPE, DIM>& dst) : StreamTask()
         {
-            this->host =  & dst;
-            this->device =  & src;
+            this->host = &dst;
+            this->device = &src;
         }
 
         virtual ~TaskCopyDeviceToHostBase()
@@ -74,8 +70,8 @@ namespace pmacc
             size_t current_size = device->getCurrentSize();
             host->setCurrentSize(current_size);
             DataSpace<DIM> devCurrentSize = device->getCurrentDataSpace(current_size);
-            if (host->is1D() && device->is1D())
-                fastCopy(device->getPointer(),host->getPointer(),  devCurrentSize.productOfComponents());
+            if(host->is1D() && device->is1D())
+                fastCopy(device->getPointer(), host->getPointer(), devCurrentSize.productOfComponents());
             else
                 copy(devCurrentSize);
 
@@ -83,118 +79,101 @@ namespace pmacc
         }
 
     protected:
+        virtual void copy(DataSpace<DIM>& devCurrentSize) = 0;
 
-        virtual void copy(DataSpace<DIM> &devCurrentSize) = 0;
-
-        void fastCopy(TYPE* src,TYPE* dst,  size_t size)
+        void fastCopy(TYPE* src, TYPE* dst, size_t size)
         {
-            CUDA_CHECK(cudaMemcpyAsync(dst,
-                                       src,
-                                       size * sizeof (TYPE),
-                                       cudaMemcpyDeviceToHost,
-                                       this->getCudaStream()));
+            CUDA_CHECK(
+                cuplaMemcpyAsync(dst, src, size * sizeof(TYPE), cuplaMemcpyDeviceToHost, this->getCudaStream()));
         }
 
-        HostBuffer<TYPE, DIM> *host;
-        DeviceBuffer<TYPE, DIM> *device;
+        HostBuffer<TYPE, DIM>* host;
+        DeviceBuffer<TYPE, DIM>* device;
     };
 
-    template <class TYPE, unsigned DIM>
+    template<class TYPE, unsigned DIM>
     class TaskCopyDeviceToHost;
 
-    template <class TYPE>
+    template<class TYPE>
     class TaskCopyDeviceToHost<TYPE, DIM1> : public TaskCopyDeviceToHostBase<TYPE, DIM1>
     {
     public:
-
-        TaskCopyDeviceToHost( DeviceBuffer<TYPE, DIM1>& src, HostBuffer<TYPE, DIM1>& dst) :
-        TaskCopyDeviceToHostBase<TYPE, DIM1>(src, dst)
+        TaskCopyDeviceToHost(DeviceBuffer<TYPE, DIM1>& src, HostBuffer<TYPE, DIM1>& dst)
+            : TaskCopyDeviceToHostBase<TYPE, DIM1>(src, dst)
         {
         }
 
     private:
-
-        virtual void copy(DataSpace<DIM1> &devCurrentSize)
+        virtual void copy(DataSpace<DIM1>& devCurrentSize)
         {
-
-            CUDA_CHECK(cudaMemcpyAsync(this->host->getBasePointer(),
-                                       this->device->getPointer(),
-                                       devCurrentSize[0] * sizeof (TYPE),
-                                       cudaMemcpyDeviceToHost,
-                                       this->getCudaStream()));
-
+            CUDA_CHECK(cuplaMemcpyAsync(
+                this->host->getBasePointer(),
+                this->device->getPointer(),
+                devCurrentSize[0] * sizeof(TYPE),
+                cuplaMemcpyDeviceToHost,
+                this->getCudaStream()));
         }
-
     };
 
-    template <class TYPE>
+    template<class TYPE>
     class TaskCopyDeviceToHost<TYPE, DIM2> : public TaskCopyDeviceToHostBase<TYPE, DIM2>
     {
     public:
-
-        TaskCopyDeviceToHost(DeviceBuffer<TYPE, DIM2>& src, HostBuffer<TYPE, DIM2>& dst) :
-        TaskCopyDeviceToHostBase<TYPE, DIM2>(src, dst)
+        TaskCopyDeviceToHost(DeviceBuffer<TYPE, DIM2>& src, HostBuffer<TYPE, DIM2>& dst)
+            : TaskCopyDeviceToHostBase<TYPE, DIM2>(src, dst)
         {
         }
 
     private:
-
-        virtual void copy(DataSpace<DIM2> &devCurrentSize)
+        virtual void copy(DataSpace<DIM2>& devCurrentSize)
         {
-            CUDA_CHECK(cudaMemcpy2DAsync(this->host->getBasePointer(),
-                                         this->host->getDataSpace()[0] * sizeof (TYPE), /*this is pitch*/
-                                         this->device->getPointer(),
-                                         this->device->getPitch(), /*this is pitch*/
-                                         devCurrentSize[0] * sizeof (TYPE),
-                                         devCurrentSize[1],
-                                         cudaMemcpyDeviceToHost,
-                                         this->getCudaStream()));
-
+            CUDA_CHECK(cuplaMemcpy2DAsync(
+                this->host->getBasePointer(),
+                this->host->getDataSpace()[0] * sizeof(TYPE), /*this is pitch*/
+                this->device->getPointer(),
+                this->device->getPitch(), /*this is pitch*/
+                devCurrentSize[0] * sizeof(TYPE),
+                devCurrentSize[1],
+                cuplaMemcpyDeviceToHost,
+                this->getCudaStream()));
         }
-
     };
 
-    template <class TYPE>
+    template<class TYPE>
     class TaskCopyDeviceToHost<TYPE, DIM3> : public TaskCopyDeviceToHostBase<TYPE, DIM3>
     {
     public:
-
-        TaskCopyDeviceToHost( DeviceBuffer<TYPE, DIM3>& src, HostBuffer<TYPE, DIM3>& dst) :
-        TaskCopyDeviceToHostBase<TYPE, DIM3>(src, dst)
+        TaskCopyDeviceToHost(DeviceBuffer<TYPE, DIM3>& src, HostBuffer<TYPE, DIM3>& dst)
+            : TaskCopyDeviceToHostBase<TYPE, DIM3>(src, dst)
         {
         }
 
     private:
-
-        virtual void copy(DataSpace<DIM3> &devCurrentSize)
+        virtual void copy(DataSpace<DIM3>& devCurrentSize)
         {
-            cudaPitchedPtr hostPtr;
-            hostPtr.pitch = this->host->getDataSpace()[0] * sizeof (TYPE);
+            cuplaPitchedPtr hostPtr;
+            hostPtr.pitch = this->host->getDataSpace()[0] * sizeof(TYPE);
             hostPtr.ptr = this->host->getBasePointer();
-            hostPtr.xsize = this->host->getDataSpace()[0] * sizeof (TYPE);
+            hostPtr.xsize = this->host->getDataSpace()[0] * sizeof(TYPE);
             hostPtr.ysize = this->host->getDataSpace()[1];
 
-            cudaMemcpy3DParms params;
+            cuplaMemcpy3DParms params;
             params.srcArray = nullptr;
-            params.srcPos = make_cudaPos(this->device->getOffset()[0] * sizeof (TYPE),
-                                         this->device->getOffset()[1],
-                                         this->device->getOffset()[2]);
+            params.srcPos = make_cuplaPos(
+                this->device->getOffset()[0] * sizeof(TYPE),
+                this->device->getOffset()[1],
+                this->device->getOffset()[2]);
             params.srcPtr = this->device->getCudaPitched();
 
             params.dstArray = nullptr;
-            params.dstPos = make_cudaPos(0, 0, 0);
+            params.dstPos = make_cuplaPos(0, 0, 0);
             params.dstPtr = hostPtr;
 
-            params.extent = make_cudaExtent(
-                                            devCurrentSize[0] * sizeof (TYPE),
-                                            devCurrentSize[1],
-                                            devCurrentSize[2]);
-            params.kind = cudaMemcpyDeviceToHost;
-
-            CUDA_CHECK(cudaMemcpy3DAsync(&params, this->getCudaStream()));
+            params.extent = make_cuplaExtent(devCurrentSize[0] * sizeof(TYPE), devCurrentSize[1], devCurrentSize[2]);
+            params.kind = cuplaMemcpyDeviceToHost;
 
+            CUDA_CHECK(cuplaMemcpy3DAsync(&params, this->getCudaStream()));
         }
-
     };
 
-} //namespace pmacc
+} // namespace pmacc
diff --git a/include/pmacc/eventSystem/tasks/TaskCopyHostToDevice.hpp b/include/pmacc/eventSystem/tasks/TaskCopyHostToDevice.hpp
index 716efb5ea3..9c34bb15d6 100644
--- a/include/pmacc/eventSystem/tasks/TaskCopyHostToDevice.hpp
+++ b/include/pmacc/eventSystem/tasks/TaskCopyHostToDevice.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Felix Schmitt, Rene Widera, Wolfgang Hoenig,
+/* Copyright 2013-2021 Felix Schmitt, Rene Widera, Wolfgang Hoenig,
  *                     Benjamin Worpitz
  *
  * This file is part of PMacc.
@@ -27,25 +27,21 @@
 #include "pmacc/eventSystem/tasks/StreamTask.hpp"
 
 
-
 namespace pmacc
 {
-
-    template <class TYPE, unsigned DIM>
+    template<class TYPE, unsigned DIM>
     class HostBuffer;
-    template <class TYPE, unsigned DIM>
+    template<class TYPE, unsigned DIM>
     class DeviceBuffer;
 
-    template <class TYPE, unsigned DIM>
+    template<class TYPE, unsigned DIM>
     class TaskCopyHostToDeviceBase : public StreamTask
     {
     public:
-
-        TaskCopyHostToDeviceBase(HostBuffer<TYPE, DIM>& src, DeviceBuffer<TYPE, DIM>& dst) :
-        StreamTask()
+        TaskCopyHostToDeviceBase(HostBuffer<TYPE, DIM>& src, DeviceBuffer<TYPE, DIM>& dst) : StreamTask()
         {
-            this->host =  & src;
-            this->device =  & dst;
+            this->host = &src;
+            this->device = &dst;
         }
 
         virtual ~TaskCopyHostToDeviceBase()
@@ -66,13 +62,13 @@ namespace pmacc
         {
             size_t current_size = host->getCurrentSize();
             DataSpace<DIM> hostCurrentSize = host->getCurrentDataSpace(current_size);
-            /* IMPORTENT: `setCurrentSize()` must be called before the native cuda memcopy
+            /* IMPORTANT: `setCurrentSize()` must be called before the native cupla memcopy
              * is called else `setCurrentSize()` is not handled as part of this task.
              * The reason for that is that the native memcopy calls `this->getCudaStream()`
              * but not register an task before this `init()` is finished.
              */
             device->setCurrentSize(current_size);
-            if (host->is1D() && device->is1D())
+            if(host->is1D() && device->is1D())
                 fastCopy(host->getPointer(), device->getPointer(), hostCurrentSize.productOfComponents());
             else
                 copy(hostCurrentSize);
@@ -87,110 +83,104 @@ namespace pmacc
 
 
     protected:
-
-        virtual void copy(DataSpace<DIM> &hostCurrentSize) = 0;
+        virtual void copy(DataSpace<DIM>& hostCurrentSize) = 0;
 
         void fastCopy(TYPE* src, TYPE* dst, size_t size)
         {
-            CUDA_CHECK(cudaMemcpyAsync(dst,
-                                       src,
-                                       size * sizeof (TYPE),
-                                       cudaMemcpyHostToDevice,
-                                       this->getCudaStream()));
+            CUDA_CHECK(
+                cuplaMemcpyAsync(dst, src, size * sizeof(TYPE), cuplaMemcpyHostToDevice, this->getCudaStream()));
         }
 
 
-        HostBuffer<TYPE, DIM> *host;
-        DeviceBuffer<TYPE, DIM> *device;
-
+        HostBuffer<TYPE, DIM>* host;
+        DeviceBuffer<TYPE, DIM>* device;
     };
 
-    template <class TYPE, unsigned DIM>
+    template<class TYPE, unsigned DIM>
     class TaskCopyHostToDevice;
 
-    template <class TYPE>
+    template<class TYPE>
     class TaskCopyHostToDevice<TYPE, DIM1> : public TaskCopyHostToDeviceBase<TYPE, DIM1>
     {
     public:
-
-        TaskCopyHostToDevice(HostBuffer<TYPE, DIM1>& src, DeviceBuffer<TYPE, DIM1>& dst) :
-        TaskCopyHostToDeviceBase<TYPE, DIM1>(src, dst)
+        TaskCopyHostToDevice(HostBuffer<TYPE, DIM1>& src, DeviceBuffer<TYPE, DIM1>& dst)
+            : TaskCopyHostToDeviceBase<TYPE, DIM1>(src, dst)
         {
         }
-    private:
 
-        virtual void copy(DataSpace<DIM1> &hostCurrentSize)
+    private:
+        virtual void copy(DataSpace<DIM1>& hostCurrentSize)
         {
-            CUDA_CHECK(cudaMemcpyAsync(this->device->getPointer(), /*pointer include X offset*/
-                                       this->host->getBasePointer(),
-                                       hostCurrentSize[0] * sizeof (TYPE), cudaMemcpyHostToDevice,
-                                       this->getCudaStream()));
+            CUDA_CHECK(cuplaMemcpyAsync(
+                this->device->getPointer(), /*pointer include X offset*/
+                this->host->getBasePointer(),
+                hostCurrentSize[0] * sizeof(TYPE),
+                cuplaMemcpyHostToDevice,
+                this->getCudaStream()));
         }
     };
 
-    template <class TYPE>
+    template<class TYPE>
     class TaskCopyHostToDevice<TYPE, DIM2> : public TaskCopyHostToDeviceBase<TYPE, DIM2>
     {
     public:
-
-        TaskCopyHostToDevice( HostBuffer<TYPE, DIM2>& src, DeviceBuffer<TYPE, DIM2>& dst) :
-        TaskCopyHostToDeviceBase<TYPE, DIM2>(src, dst)
+        TaskCopyHostToDevice(HostBuffer<TYPE, DIM2>& src, DeviceBuffer<TYPE, DIM2>& dst)
+            : TaskCopyHostToDeviceBase<TYPE, DIM2>(src, dst)
         {
         }
-    private:
 
-        virtual void copy(DataSpace<DIM2> &hostCurrentSize)
+    private:
+        virtual void copy(DataSpace<DIM2>& hostCurrentSize)
         {
-            CUDA_CHECK(cudaMemcpy2DAsync(this->device->getPointer(),
-                                         this->device->getPitch(), /*this is pitch*/
-                                         this->host->getBasePointer(),
-                                         this->host->getDataSpace()[0] * sizeof (TYPE), /*this is pitch*/
-                                         hostCurrentSize[0] * sizeof (TYPE),
-                                         hostCurrentSize[1],
-                                         cudaMemcpyHostToDevice,
-                                         this->getCudaStream()));
+            CUDA_CHECK(cuplaMemcpy2DAsync(
+                this->device->getPointer(),
+                this->device->getPitch(), /*this is pitch*/
+                this->host->getBasePointer(),
+                this->host->getDataSpace()[0] * sizeof(TYPE), /*this is pitch*/
+                hostCurrentSize[0] * sizeof(TYPE),
+                hostCurrentSize[1],
+                cuplaMemcpyHostToDevice,
+                this->getCudaStream()));
         }
     };
 
-    template <class TYPE>
+    template<class TYPE>
     class TaskCopyHostToDevice<TYPE, DIM3> : public TaskCopyHostToDeviceBase<TYPE, DIM3>
     {
     public:
-
-        TaskCopyHostToDevice( HostBuffer<TYPE, DIM3>& src, DeviceBuffer<TYPE, DIM3>& dst) :
-        TaskCopyHostToDeviceBase<TYPE, DIM3>(src, dst)
+        TaskCopyHostToDevice(HostBuffer<TYPE, DIM3>& src, DeviceBuffer<TYPE, DIM3>& dst)
+            : TaskCopyHostToDeviceBase<TYPE, DIM3>(src, dst)
         {
         }
-    private:
 
-        virtual void copy(DataSpace<DIM3> &hostCurrentSize)
+    private:
+        virtual void copy(DataSpace<DIM3>& hostCurrentSize)
         {
-            cudaPitchedPtr hostPtr;
-            hostPtr.pitch = this->host->getDataSpace()[0] * sizeof (TYPE);
+            cuplaPitchedPtr hostPtr;
+            hostPtr.pitch = this->host->getDataSpace()[0] * sizeof(TYPE);
             hostPtr.ptr = this->host->getBasePointer();
-            hostPtr.xsize = this->host->getDataSpace()[0] * sizeof (TYPE);
+            hostPtr.xsize = this->host->getDataSpace()[0] * sizeof(TYPE);
             hostPtr.ysize = this->host->getDataSpace()[1];
 
-            cudaMemcpy3DParms params;
+            cuplaMemcpy3DParms params;
             params.dstArray = nullptr;
-            params.dstPos = make_cudaPos(this->device->getOffset()[0] * sizeof (TYPE),
-                                         this->device->getOffset()[1],
-                                         this->device->getOffset()[2]);
+            params.dstPos = make_cuplaPos(
+                this->device->getOffset()[0] * sizeof(TYPE),
+                this->device->getOffset()[1],
+                this->device->getOffset()[2]);
             params.dstPtr = this->device->getCudaPitched();
 
             params.srcArray = nullptr;
-            params.srcPos = make_cudaPos(0, 0, 0);
+            params.srcPos = make_cuplaPos(0, 0, 0);
             params.srcPtr = hostPtr;
 
-            params.extent = make_cudaExtent(
-                                            hostCurrentSize[0] * sizeof (TYPE),
-                                            hostCurrentSize[1],
-                                            hostCurrentSize[2]);
-            params.kind = cudaMemcpyHostToDevice;
+            params.extent
+                = make_cuplaExtent(hostCurrentSize[0] * sizeof(TYPE), hostCurrentSize[1], hostCurrentSize[2]);
+            params.kind = cuplaMemcpyHostToDevice;
 
-            CUDA_CHECK(cudaMemcpy3DAsync(&params, this->getCudaStream()));
+            CUDA_CHECK(cuplaMemcpy3DAsync(&params, this->getCudaStream()));
         }
     };
 
 
-} //namespace pmacc
+} // namespace pmacc
diff --git a/include/pmacc/eventSystem/tasks/TaskGetCurrentSizeFromDevice.hpp b/include/pmacc/eventSystem/tasks/TaskGetCurrentSizeFromDevice.hpp
index 1d9117e727..15d67a140e 100644
--- a/include/pmacc/eventSystem/tasks/TaskGetCurrentSizeFromDevice.hpp
+++ b/include/pmacc/eventSystem/tasks/TaskGetCurrentSizeFromDevice.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Felix Schmitt, Rene Widera, Benjamin Worpitz,
+/* Copyright 2013-2021 Felix Schmitt, Rene Widera, Benjamin Worpitz,
  *                     Alexander Grund
  *
  * This file is part of PMacc.
@@ -29,58 +29,52 @@
 #include "pmacc/types.hpp"
 
 
-
-
 namespace pmacc
 {
+    template<class TYPE, unsigned DIM>
+    class DeviceBuffer;
 
-
-template <class TYPE, unsigned DIM>
-class DeviceBuffer;
-
-template <class TYPE, unsigned DIM>
-class TaskGetCurrentSizeFromDevice : public StreamTask
-{
-public:
-
-    TaskGetCurrentSizeFromDevice(DeviceBuffer<TYPE,DIM>& buffer):
-    StreamTask()
-    {
-        this->buffer =  & buffer;
-    }
-
-    virtual ~TaskGetCurrentSizeFromDevice()
-    {
-        notify(this->myId,GETVALUE, nullptr);
-    }
-
-    bool executeIntern()
+    template<class TYPE, unsigned DIM>
+    class TaskGetCurrentSizeFromDevice : public StreamTask
     {
-        return isFinished();
-    }
-
-    void event(id_t, EventType, IEventData*)
-    {
-    }
-
-    virtual void init()
-    {
-        CUDA_CHECK(cudaMemcpyAsync((void*) buffer->getCurrentSizeHostSidePointer(),
-                                   buffer->getCurrentSizeOnDevicePointer(),
-                                   sizeof (size_t),
-                                   cudaMemcpyDeviceToHost,
-                                   this->getCudaStream()));
-        this->activate();
-    }
-
-    virtual std::string toString()
-    {
-        return "TaskGetCurrentSizeFromDevice";
-    }
-
-private:
-
-    DeviceBuffer<TYPE, DIM> *buffer;
-};
-
-} //namespace pmacc
+    public:
+        TaskGetCurrentSizeFromDevice(DeviceBuffer<TYPE, DIM>& buffer) : StreamTask()
+        {
+            this->buffer = &buffer;
+        }
+
+        virtual ~TaskGetCurrentSizeFromDevice()
+        {
+            notify(this->myId, GETVALUE, nullptr);
+        }
+
+        bool executeIntern()
+        {
+            return isFinished();
+        }
+
+        void event(id_t, EventType, IEventData*)
+        {
+        }
+
+        virtual void init()
+        {
+            CUDA_CHECK(cuplaMemcpyAsync(
+                (void*) buffer->getCurrentSizeHostSidePointer(),
+                buffer->getCurrentSizeOnDevicePointer(),
+                sizeof(size_t),
+                cuplaMemcpyDeviceToHost,
+                this->getCudaStream()));
+            this->activate();
+        }
+
+        virtual std::string toString()
+        {
+            return "TaskGetCurrentSizeFromDevice";
+        }
+
+    private:
+        DeviceBuffer<TYPE, DIM>* buffer;
+    };
+
+} // namespace pmacc
diff --git a/include/pmacc/eventSystem/tasks/TaskKernel.hpp b/include/pmacc/eventSystem/tasks/TaskKernel.hpp
index 2c330b5729..44af6cab5d 100644
--- a/include/pmacc/eventSystem/tasks/TaskKernel.hpp
+++ b/include/pmacc/eventSystem/tasks/TaskKernel.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Felix Schmitt, Rene Widera, Benjamin Worpitz,
+/* Copyright 2013-2021 Felix Schmitt, Rene Widera, Benjamin Worpitz,
  *                     Alexander Grund
  *
  * This file is part of PMacc.
@@ -28,15 +28,10 @@
 
 namespace pmacc
 {
-
     class TaskKernel : public StreamTask
     {
     public:
-
-        TaskKernel(std::string kernelName) :
-        StreamTask(),
-        kernelName(kernelName),
-        canBeChecked(false)
+        TaskKernel(std::string kernelName) : StreamTask(), kernelName(kernelName), canBeChecked(false)
         {
         }
 
@@ -74,5 +69,4 @@ namespace pmacc
         std::string kernelName;
     };
 
-} //namespace pmacc
-
+} // namespace pmacc
diff --git a/include/pmacc/eventSystem/tasks/TaskKernel.tpp b/include/pmacc/eventSystem/tasks/TaskKernel.tpp
index 04c476a0e2..d4aadf7336 100644
--- a/include/pmacc/eventSystem/tasks/TaskKernel.tpp
+++ b/include/pmacc/eventSystem/tasks/TaskKernel.tpp
@@ -1,4 +1,4 @@
-/* Copyright 2015-2020 Alexander Grund
+/* Copyright 2015-2021 Alexander Grund
  *
  * This file is part of PMacc.
  *
@@ -25,8 +25,8 @@
 #include "pmacc/eventSystem/tasks/TaskKernel.hpp"
 #include "pmacc/Environment.hpp"
 
-namespace pmacc{
-
+namespace pmacc
+{
     void TaskKernel::activateChecks()
     {
         canBeChecked = true;
@@ -35,4 +35,4 @@ namespace pmacc{
         Environment<>::get().Manager().addTask(this);
         __setTransactionEvent(EventTask(this->getId()));
     }
-} //namespace pmacc
+} // namespace pmacc
diff --git a/include/pmacc/eventSystem/tasks/TaskLogicalAnd.hpp b/include/pmacc/eventSystem/tasks/TaskLogicalAnd.hpp
index 609f87e7c9..92d17aa8ab 100644
--- a/include/pmacc/eventSystem/tasks/TaskLogicalAnd.hpp
+++ b/include/pmacc/eventSystem/tasks/TaskLogicalAnd.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Felix Schmitt, Rene Widera, Wolfgang Hoenig,
+/* Copyright 2013-2021 Felix Schmitt, Rene Widera, Wolfgang Hoenig,
  *                     Benjamin Worpitz
  *
  * This file is part of PMacc.
@@ -29,22 +29,17 @@
 
 namespace pmacc
 {
-
     /**
      * TaskLogicalAnd AND-connects tasks to a new single task
      */
     class TaskLogicalAnd : public StreamTask
     {
     public:
-
         /**
          * s1 and s1 must be a valid IStreamTask
          * constructor
          */
-        TaskLogicalAnd(ITask* s1, ITask* s2) :
-        StreamTask(),
-        task1(s1->getId()),
-        task2(s2->getId())
+        TaskLogicalAnd(ITask* s1, ITask* s2) : StreamTask(), task1(s1->getId()), task2(s2->getId())
         {
             combine(s1, s2);
         }
@@ -54,13 +49,11 @@ namespace pmacc
          */
         virtual ~TaskLogicalAnd()
         {
-
             notify(this->myId, LOGICALAND, nullptr);
         }
 
         void init()
         {
-
         }
 
         bool executeIntern()
@@ -81,16 +74,16 @@ namespace pmacc
                 if(task != nullptr)
                 {
                     ITask::TaskType type = task->getTaskType();
-                    if (type == ITask::TASK_CUDA )
+                    if(type == ITask::TASK_DEVICE)
                     {
                         this->stream = static_cast<StreamTask*>(task)->getEventStream();
-                        this->setTaskType(ITask::TASK_CUDA);
-                        this->cudaEvent = static_cast<StreamTask*>(task)->getCudaEventHandle();
+                        this->setTaskType(ITask::TASK_DEVICE);
+                        this->cuplaEvent = static_cast<StreamTask*>(task)->getCudaEventHandle();
                         this->hasCudaEventHandle = true;
                     }
                 }
             }
-            else if (task2 == eventId)
+            else if(task2 == eventId)
             {
                 task2 = 0;
 
@@ -98,11 +91,11 @@ namespace pmacc
                 if(task != nullptr)
                 {
                     ITask::TaskType type = task->getTaskType();
-                    if (type == ITask::TASK_CUDA )
+                    if(type == ITask::TASK_DEVICE)
                     {
                         this->stream = static_cast<StreamTask*>(task)->getEventStream();
-                        this->setTaskType(ITask::TASK_CUDA);
-                        this->cudaEvent = static_cast<StreamTask*>(task)->getCudaEventHandle();
+                        this->setTaskType(ITask::TASK_DEVICE);
+                        this->cuplaEvent = static_cast<StreamTask*>(task)->getCudaEventHandle();
                         this->hasCudaEventHandle = true;
                     }
                 }
@@ -118,32 +111,28 @@ namespace pmacc
 
         std::string toString()
         {
-            return std::string("TaskLogicalAnd (") +
-                EventTask(task1).toString() +
-                std::string(" - ") +
-                EventTask(task2).toString() +
-                std::string(" )");
+            return std::string("TaskLogicalAnd (") + EventTask(task1).toString() + std::string(" - ")
+                + EventTask(task2).toString() + std::string(" )");
         }
 
     private:
-
         inline void combine(ITask* s1, ITask* s2)
         {
             s1->addObserver(this);
             s2->addObserver(this);
-            if(s1->getTaskType() == ITask::TASK_CUDA && s2->getTaskType() == ITask::TASK_CUDA)
+            if(s1->getTaskType() == ITask::TASK_DEVICE && s2->getTaskType() == ITask::TASK_DEVICE)
             {
-                this->setTaskType(ITask::TASK_CUDA);
-                this->setEventStream(static_cast<StreamTask*> (s2)->getEventStream());
-                if(static_cast<StreamTask*> (s1)->getEventStream() != static_cast<StreamTask*> (s2)->getEventStream())
-                    this->getEventStream()->waitOn(static_cast<StreamTask*> (s1)->getCudaEventHandle());
+                this->setTaskType(ITask::TASK_DEVICE);
+                this->setEventStream(static_cast<StreamTask*>(s2)->getEventStream());
+                if(static_cast<StreamTask*>(s1)->getEventStream() != static_cast<StreamTask*>(s2)->getEventStream())
+                    this->getEventStream()->waitOn(static_cast<StreamTask*>(s1)->getCudaEventHandle());
                 this->activate();
             }
-            else if(s1->getTaskType() == ITask::TASK_MPI && s2->getTaskType() == ITask::TASK_CUDA)
+            else if(s1->getTaskType() == ITask::TASK_MPI && s2->getTaskType() == ITask::TASK_DEVICE)
             {
                 this->setTaskType(ITask::TASK_MPI);
             }
-            else if(s2->getTaskType() == ITask::TASK_MPI && s1->getTaskType() == ITask::TASK_CUDA)
+            else if(s2->getTaskType() == ITask::TASK_MPI && s1->getTaskType() == ITask::TASK_DEVICE)
             {
                 this->setTaskType(ITask::TASK_MPI);
             }
@@ -157,5 +146,4 @@ namespace pmacc
         id_t task2;
     };
 
-} //namespace pmacc
-
+} // namespace pmacc
diff --git a/include/pmacc/eventSystem/tasks/TaskReceive.hpp b/include/pmacc/eventSystem/tasks/TaskReceive.hpp
index 95d216d288..d89bb2f295 100644
--- a/include/pmacc/eventSystem/tasks/TaskReceive.hpp
+++ b/include/pmacc/eventSystem/tasks/TaskReceive.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Felix Schmitt, Rene Widera, Wolfgang Hoenig,
+/* Copyright 2013-2021 Felix Schmitt, Rene Widera, Wolfgang Hoenig,
  *                     Benjamin Worpitz
  *
  * This file is part of PMacc.
@@ -32,16 +32,11 @@
 
 namespace pmacc
 {
-
-
-    template <class TYPE, unsigned DIM>
+    template<class TYPE, unsigned DIM>
     class TaskReceive : public MPITask
     {
     public:
-
-        TaskReceive(Exchange<TYPE, DIM> &ex) :
-        exchange(&ex),
-        state(Constructor)
+        TaskReceive(Exchange<TYPE, DIM>& ex) : exchange(&ex), state(Constructor)
         {
         }
 
@@ -53,38 +48,75 @@ namespace pmacc
 
         bool executeIntern()
         {
-            switch (state)
+            switch(state)
             {
-                case WaitForReceived:
-                    break;
-                case RunCopy:
-                    state = WaitForFinish;
-                   __startTransaction();
-                    exchange->getHostBuffer().setCurrentSize(newBufferSize);
-                    if (exchange->hasDeviceDoubleBuffer())
+            case WaitForReceived:
+                break;
+            case RunCopy:
+                state = WaitForFinish;
+                __startTransaction();
+
+                /* If MPI direct is enabled
+                 *   - we do not have any host representation of an exchange
+                 *   - MPI will write directly into the device buffer
+                 *     or double buffer when available.
+                 */
+                if(exchange->hasDeviceDoubleBuffer())
+                {
+                    if(Environment<>::get().isMpiDirectEnabled())
                     {
-
-                        Environment<>::get().Factory().createTaskCopyHostToDevice(exchange->getHostBuffer(),
-                                                                                     exchange->getDeviceDoubleBuffer());
-                        Environment<>::get().Factory().createTaskCopyDeviceToDevice(exchange->getDeviceDoubleBuffer(),
-                                                                                       exchange->getDeviceBuffer(),
-                                                                                       this);
+                        exchange->getDeviceDoubleBuffer().setCurrentSize(newBufferSize);
                     }
                     else
                     {
+                        exchange->getHostBuffer().setCurrentSize(newBufferSize);
+                        Environment<>::get().Factory().createTaskCopyHostToDevice(
+                            exchange->getHostBuffer(),
+                            exchange->getDeviceDoubleBuffer());
+                    }
 
-                        Environment<>::get().Factory().createTaskCopyHostToDevice(exchange->getHostBuffer(),
-                                                                                     exchange->getDeviceBuffer(),
-                                                                                     this);
+                    Environment<>::get().Factory().createTaskCopyDeviceToDevice(
+                        exchange->getDeviceDoubleBuffer(),
+                        exchange->getDeviceBuffer(),
+                        this);
+                }
+                else
+                {
+                    if(Environment<>::get().isMpiDirectEnabled())
+                    {
+                        exchange->getDeviceBuffer().setCurrentSize(newBufferSize);
+                        /* We cannot be notified by setCurrentSize(), therefore
+                         * we need to wait until the current event has finished.
+                         */
+                        setSizeEvent = __getTransactionEvent();
+                        state = WaitForSetSize;
                     }
-                    __endTransaction();
-                    break;
-                case WaitForFinish:
-                    break;
-                case Finish:
+                    else
+                    {
+                        exchange->getHostBuffer().setCurrentSize(newBufferSize);
+                        Environment<>::get().Factory().createTaskCopyHostToDevice(
+                            exchange->getHostBuffer(),
+                            exchange->getDeviceBuffer(),
+                            this);
+                    }
+                }
+
+                __endTransaction();
+                break;
+            case WaitForSetSize:
+                // this state is only reached if MPI direct is enabled and there is no device double buffer
+                if(nullptr == Environment<>::get().Manager().getITaskIfNotFinished(setSizeEvent.getTaskId()))
+                {
+                    state = Finish;
                     return true;
-                default:
-                    return false;
+                }
+                break;
+            case WaitForFinish:
+                break;
+            case Finish:
+                return true;
+            default:
+                return false;
             }
 
             return false;
@@ -97,51 +129,51 @@ namespace pmacc
 
         void event(id_t, EventType type, IEventData* data)
         {
-            switch (type)
+            switch(type)
             {
-                case RECVFINISHED:
-                    if (data != nullptr)
-                    {
-                        EventDataReceive *rdata = static_cast<EventDataReceive*> (data);
-                        // std::cout<<" data rec "<<rdata->getReceivedCount()/sizeof(TYPE)<<std::endl;
-                        newBufferSize = rdata->getReceivedCount() / sizeof (TYPE);
-                        state = RunCopy;
-                        executeIntern();
-                    }
-                    break;
-                case COPYHOST2DEVICE:
-                case COPYDEVICE2DEVICE:
-                    state = Finish;
-                    break;
-                default:
-                    return;
+            case RECVFINISHED:
+                if(data != nullptr)
+                {
+                    EventDataReceive* rdata = static_cast<EventDataReceive*>(data);
+                    // std::cout<<" data rec "<<rdata->getReceivedCount()/sizeof(TYPE)<<std::endl;
+                    newBufferSize = rdata->getReceivedCount() / sizeof(TYPE);
+                    state = RunCopy;
+                    executeIntern();
+                }
+                break;
+            case COPYHOST2DEVICE:
+            case COPYDEVICE2DEVICE:
+                state = Finish;
+                break;
+            default:
+                return;
             }
         }
 
         std::string toString()
         {
             std::stringstream ss;
-            ss<<state;
-            return std::string("TaskReceive ")+ ss.str();
+            ss << state;
+            return std::string("TaskReceive ") + ss.str();
         }
 
     private:
-
         enum state_t
         {
             Constructor,
             WaitForReceived,
             RunCopy,
+            WaitForSetSize,
             WaitForFinish,
             Finish
 
         };
 
 
-        Exchange<TYPE, DIM> *exchange;
+        Exchange<TYPE, DIM>* exchange;
         state_t state;
         size_t newBufferSize;
+        EventTask setSizeEvent;
     };
 
-} //namespace pmacc
-
+} // namespace pmacc
diff --git a/include/pmacc/eventSystem/tasks/TaskReceiveMPI.hpp b/include/pmacc/eventSystem/tasks/TaskReceiveMPI.hpp
index ec23fdca60..313b449f98 100644
--- a/include/pmacc/eventSystem/tasks/TaskReceiveMPI.hpp
+++ b/include/pmacc/eventSystem/tasks/TaskReceiveMPI.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Felix Schmitt, Rene Widera, Wolfgang Hoenig,
+/* Copyright 2013-2021 Felix Schmitt, Rene Widera, Wolfgang Hoenig,
  *                     Benjamin Worpitz
  *
  * This file is part of PMacc.
@@ -31,80 +31,72 @@
 
 namespace pmacc
 {
-
-template <class TYPE, unsigned DIM>
-class TaskReceiveMPI : public MPITask
-{
-public:
-
-    TaskReceiveMPI(Exchange<TYPE, DIM> *exchange) :
-    MPITask(),
-    exchange(exchange)
-    {
-
-    }
-
-    virtual void init()
+    template<class TYPE, unsigned DIM>
+    class TaskReceiveMPI : public MPITask
     {
-        this->request = Environment<DIM>::get().EnvironmentController()
-                .getCommunicator().startReceive(
-                                                exchange->getExchangeType(),
-                                                (char*) exchange->getHostBuffer().getBasePointer(),
-                                                exchange->getHostBuffer().getDataSpace().productOfComponents() * sizeof (TYPE),
-                                                exchange->getCommunicationTag());
-    }
-
-    bool executeIntern()
-    {
-        if (this->isFinished())
-            return true;
-
-        if (this->request == nullptr)
-            throw std::runtime_error("request was nullptr (call executeIntern after freed");
-
-        int flag=0;
-        MPI_CHECK(MPI_Test(this->request, &flag, &(this->status)));
-
-        if (flag) //finished
+    public:
+        TaskReceiveMPI(Exchange<TYPE, DIM>* exchange) : MPITask(), exchange(exchange)
         {
-            delete this->request;
-            this->request = nullptr;
-            setFinished();
-            return true;
         }
-        return false;
-    }
-
-    virtual ~TaskReceiveMPI()
-    {
-        //! \todo this make problems because we send bytes and not combined types
-        int recv_data_count;
-        MPI_CHECK_NO_EXCEPT(MPI_Get_count(&(this->status), MPI_CHAR, &recv_data_count));
 
+        virtual void init()
+        {
+            Buffer<TYPE, DIM>* dst = exchange->getCommunicationBuffer();
 
-        IEventData *edata = new EventDataReceive(nullptr, recv_data_count);
+            this->request = Environment<DIM>::get().EnvironmentController().getCommunicator().startReceive(
+                exchange->getExchangeType(),
+                reinterpret_cast<char*>(dst->getPointer()),
+                dst->getDataSpace().productOfComponents() * sizeof(TYPE),
+                exchange->getCommunicationTag());
+        }
 
-        notify(this->myId, RECVFINISHED, edata); /*add notify her*/
-        __delete(edata);
+        bool executeIntern()
+        {
+            if(this->isFinished())
+                return true;
+
+            if(this->request == nullptr)
+                throw std::runtime_error("request was nullptr (call executeIntern after freed");
+
+            int flag = 0;
+            MPI_CHECK(MPI_Test(this->request, &flag, &(this->status)));
+
+            if(flag) // finished
+            {
+                delete this->request;
+                this->request = nullptr;
+                setFinished();
+                return true;
+            }
+            return false;
+        }
 
-    }
+        virtual ~TaskReceiveMPI()
+        {
+            //! \todo this causes problems because we send bytes and not combined types
+            int recv_data_count;
+            MPI_CHECK_NO_EXCEPT(MPI_Get_count(&(this->status), MPI_CHAR, &recv_data_count));
 
-    void event(id_t, EventType, IEventData*)
-    {
 
+            IEventData* edata = new EventDataReceive(nullptr, recv_data_count);
 
-    }
+            notify(this->myId, RECVFINISHED, edata); /*add notify here*/
+            __delete(edata);
+        }
 
-    std::string toString()
-    {
-        return "TaskReceiveMPI";
-    }
+        void event(id_t, EventType, IEventData*)
+        {
+        }
 
-private:
-    Exchange<TYPE, DIM> *exchange;
-    MPI_Request *request;
-    MPI_Status status;
-};
+        std::string toString()
+        {
+            return std::string("TaskReceiveMPI exchange type=") + std::to_string(exchange->getExchangeType());
+        }
 
-} //namespace pmacc
+    private:
+        Exchange<TYPE, DIM>* exchange;
+        MPI_Request* request;
+        MPI_Status status;
+    };
 
+} // namespace pmacc
diff --git a/include/pmacc/eventSystem/tasks/TaskSend.hpp b/include/pmacc/eventSystem/tasks/TaskSend.hpp
index 9b5ac8c19d..21b7b1ae3c 100644
--- a/include/pmacc/eventSystem/tasks/TaskSend.hpp
+++ b/include/pmacc/eventSystem/tasks/TaskSend.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Felix Schmitt, Rene Widera, Wolfgang Hoenig,
+/* Copyright 2013-2021 Felix Schmitt, Rene Widera, Wolfgang Hoenig,
  *                     Benjamin Worpitz
  *
  * This file is part of PMacc.
@@ -32,59 +32,72 @@
 
 namespace pmacc
 {
-
-
-
-    template <class TYPE, unsigned DIM>
+    template<class TYPE, unsigned DIM>
     class TaskSend : public MPITask
     {
     public:
-
-        TaskSend(Exchange<TYPE, DIM> &ex) :
-        exchange(&ex),
-        state(Constructor)
+        TaskSend(Exchange<TYPE, DIM>& ex) : exchange(&ex), state(Constructor)
         {
         }
 
         virtual void init()
         {
             state = InitDone;
-            if (exchange->hasDeviceDoubleBuffer())
+            if(exchange->hasDeviceDoubleBuffer())
             {
-                Environment<>::get().Factory().createTaskCopyDeviceToDevice(exchange->getDeviceBuffer(),
-                                                                            exchange->getDeviceDoubleBuffer()
-                                                                            );
-                Environment<>::get().Factory().createTaskCopyDeviceToHost(exchange->getDeviceDoubleBuffer(),
-                                                                          exchange->getHostBuffer(),
-                                                                          this);
+                if(Environment<>::get().isMpiDirectEnabled())
+                    Environment<>::get().Factory().createTaskCopyDeviceToDevice(
+                        exchange->getDeviceBuffer(),
+                        exchange->getDeviceDoubleBuffer(),
+                        this);
+                else
+                {
+                    Environment<>::get().Factory().createTaskCopyDeviceToDevice(
+                        exchange->getDeviceBuffer(),
+                        exchange->getDeviceDoubleBuffer());
+
+                    Environment<>::get().Factory().createTaskCopyDeviceToHost(
+                        exchange->getDeviceDoubleBuffer(),
+                        exchange->getHostBuffer(),
+                        this);
+                }
             }
             else
             {
-                Environment<>::get().Factory().createTaskCopyDeviceToHost(exchange->getDeviceBuffer(),
-                                                                          exchange->getHostBuffer(),
-                                                                          this);
+                if(Environment<>::get().isMpiDirectEnabled())
+                {
+                    /* Wait to be sure that all device work is finished before MPI is triggered.
+                     * MPI will not wait for work in our device streams
+                     */
+                    __getTransactionEvent().waitForFinished();
+                    state = ReadyForMPISend;
+                }
+                else
+                    Environment<>::get().Factory().createTaskCopyDeviceToHost(
+                        exchange->getDeviceBuffer(),
+                        exchange->getHostBuffer(),
+                        this);
             }
-
         }
 
         bool executeIntern()
         {
-            switch (state)
+            switch(state)
             {
-                case InitDone:
-                    break;
-                case DeviceToHostFinished:
-                    state = SendDone;
-                    __startTransaction();
-                    Environment<>::get().Factory().createTaskSendMPI(exchange, this);
-                    __endTransaction();
-                    break;
-                case SendDone:
-                    break;
-                case Finish:
-                    return true;
-                default:
-                    return false;
+            case InitDone:
+                break;
+            case ReadyForMPISend:
+                state = SendDone;
+                __startTransaction();
+                Environment<>::get().Factory().createTaskSendMPI(exchange, this);
+                __endTransaction();
+                break;
+            case SendDone:
+                break;
+            case Finish:
+                return true;
+            default:
+                return false;
             }
 
             return false;
@@ -97,41 +110,37 @@ namespace pmacc
 
         void event(id_t, EventType type, IEventData*)
         {
-            if (type == COPYDEVICE2HOST)
+            if(type == COPYDEVICE2HOST || type == COPYDEVICE2DEVICE)
             {
-                state = DeviceToHostFinished;
+                state = ReadyForMPISend;
                 executeIntern();
-
             }
 
-            if (type == SENDFINISHED)
+            if(type == SENDFINISHED)
             {
                 state = Finish;
             }
-
         }
 
         std::string toString()
         {
             std::stringstream ss;
-            ss<<state;
-            return std::string("TaskSend ")+ ss.str();
+            ss << state;
+            return std::string("TaskSend ") + ss.str();
         }
 
     private:
-
         enum state_t
         {
             Constructor,
             InitDone,
-            DeviceToHostFinished,
+            ReadyForMPISend,
             SendDone,
             Finish
         };
 
-        Exchange<TYPE, DIM> *exchange;
+        Exchange<TYPE, DIM>* exchange;
         state_t state;
     };
 
-} //namespace pmacc
-
+} // namespace pmacc
diff --git a/include/pmacc/eventSystem/tasks/TaskSendMPI.hpp b/include/pmacc/eventSystem/tasks/TaskSendMPI.hpp
index b4a82cfc1a..a36633ff72 100644
--- a/include/pmacc/eventSystem/tasks/TaskSendMPI.hpp
+++ b/include/pmacc/eventSystem/tasks/TaskSendMPI.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Felix Schmitt, Rene Widera, Wolfgang Hoenig,
+/* Copyright 2013-2021 Felix Schmitt, Rene Widera, Wolfgang Hoenig,
  *                     Benjamin Worpitz
  *
  * This file is part of PMacc.
@@ -31,70 +31,64 @@
 
 namespace pmacc
 {
-
-template <class TYPE, unsigned DIM>
-class TaskSendMPI : public MPITask
-{
-public:
-
-    TaskSendMPI(Exchange<TYPE, DIM> *exchange) :
-    MPITask(),
-    exchange(exchange)
-    {
-
-    }
-
-    virtual void init()
+    template<class TYPE, unsigned DIM>
+    class TaskSendMPI : public MPITask
     {
-        this->request = Environment<DIM>::get().EnvironmentController()
-                .getCommunicator().startSend(
-                                             exchange->getExchangeType(),
-                                             (char*) exchange->getHostBuffer().getPointer(),
-                                             exchange->getHostBuffer().getCurrentSize() * sizeof (TYPE),
-                                             exchange->getCommunicationTag());
-    }
-
-    bool executeIntern()
-    {
-        if (this->isFinished())
-            return true;
+    public:
+        TaskSendMPI(Exchange<TYPE, DIM>* exchange) : MPITask(), exchange(exchange)
+        {
+        }
 
-        if (this->request == nullptr)
-            throw std::runtime_error("request was nullptr (call executeIntern after freed");
+        virtual void init()
+        {
+            Buffer<TYPE, DIM>* src = exchange->getCommunicationBuffer();
 
-        int flag=0;
-        MPI_CHECK(MPI_Test(this->request, &flag, &(this->status)));
+            this->request = Environment<DIM>::get().EnvironmentController().getCommunicator().startSend(
+                exchange->getExchangeType(),
+                reinterpret_cast<char*>(src->getPointer()),
+                src->getCurrentSize() * sizeof(TYPE),
+                exchange->getCommunicationTag());
+        }
 
-        if (flag) //finished
+        bool executeIntern()
         {
-            delete this->request;
-            this->request = nullptr;
-            this->setFinished();
-            return true;
+            if(this->isFinished())
+                return true;
+
+            if(this->request == nullptr)
+                throw std::runtime_error("request was nullptr (call executeIntern after freed");
+
+            int flag = 0;
+            MPI_CHECK(MPI_Test(this->request, &flag, &(this->status)));
+
+            if(flag) // finished
+            {
+                delete this->request;
+                this->request = nullptr;
+                this->setFinished();
+                return true;
+            }
+            return false;
         }
-        return false;
-    }
 
-    virtual ~TaskSendMPI()
-    {
-        notify(this->myId, SENDFINISHED, nullptr);
-    }
-
-    void event(id_t, EventType, IEventData*)
-    {
-
-    }
+        virtual ~TaskSendMPI()
+        {
+            notify(this->myId, SENDFINISHED, nullptr);
+        }
 
-    std::string toString()
-    {
-        return "TaskSendMPI";
-    }
+        void event(id_t, EventType, IEventData*)
+        {
+        }
 
-private:
-    Exchange<TYPE, DIM> *exchange;
-    MPI_Request *request;
-    MPI_Status status;
-};
+        std::string toString()
+        {
+            return std::string("TaskSendMPI exchange type=") + std::to_string(exchange->getExchangeType());
+        }
 
-} //namespace pmacc
+    private:
+        Exchange<TYPE, DIM>* exchange;
+        MPI_Request* request;
+        MPI_Status status;
+    };
 
+} // namespace pmacc
diff --git a/include/pmacc/eventSystem/tasks/TaskSetCurrentSizeOnDevice.hpp b/include/pmacc/eventSystem/tasks/TaskSetCurrentSizeOnDevice.hpp
index 3e3a9a3c42..59e7e2ca80 100644
--- a/include/pmacc/eventSystem/tasks/TaskSetCurrentSizeOnDevice.hpp
+++ b/include/pmacc/eventSystem/tasks/TaskSetCurrentSizeOnDevice.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Felix Schmitt, Rene Widera, Benjamin Worpitz,
+/* Copyright 2013-2021 Felix Schmitt, Rene Widera, Benjamin Worpitz,
  *                     Alexander Grund
  *
  * This file is part of PMacc.
@@ -32,76 +32,62 @@
 
 namespace pmacc
 {
-
-struct KernelSetValueOnDeviceMemory
-{
-    template< typename T_Acc >
-    DINLINE void operator()(const T_Acc&, size_t* pointer, const size_t size) const
-    {
-        *pointer = size;
-    }
-};
-
-template <class TYPE, unsigned DIM>
-class DeviceBuffer;
-
-template <class TYPE, unsigned DIM>
-class TaskSetCurrentSizeOnDevice : public StreamTask
-{
-public:
-
-    TaskSetCurrentSizeOnDevice(DeviceBuffer<TYPE, DIM>& dst, size_t size) :
-    StreamTask(),
-    size(size)
-    {
-        this->destination = & dst;
-    }
-
-    virtual ~TaskSetCurrentSizeOnDevice()
+    struct KernelSetValueOnDeviceMemory
     {
-        notify(this->myId, SETVALUE, nullptr);
-    }
-
-    virtual void init()
-    {
-        setSize();
-    }
-
-    bool executeIntern()
+        template<typename T_Acc>
+        DINLINE void operator()(const T_Acc&, size_t* pointer, const size_t size) const
+        {
+            *pointer = size;
+        }
+    };
+
+    template<class TYPE, unsigned DIM>
+    class DeviceBuffer;
+
+    template<class TYPE, unsigned DIM>
+    class TaskSetCurrentSizeOnDevice : public StreamTask
     {
-        return isFinished();
-    }
-
-    void event(id_t, EventType, IEventData*)
-    {
-    }
-
-    std::string toString()
-    {
-        return "TaskSetCurrentSizeOnDevice";
-    }
-
-private:
-
-    void setSize()
-    {
-        auto sizePtr = destination->getCurrentSizeOnDevicePointer();
-        CUPLA_KERNEL( KernelSetValueOnDeviceMemory )(
-            1,
-            1,
-            0,
-            this->getCudaStream()
-        )(
-            sizePtr,
-            size
-        );
-
-        activate();
-    }
-
-    DeviceBuffer<TYPE, DIM> *destination;
-    const size_t size;
-};
-
-} //namespace pmacc
-
+    public:
+        TaskSetCurrentSizeOnDevice(DeviceBuffer<TYPE, DIM>& dst, size_t size) : StreamTask(), size(size)
+        {
+            this->destination = &dst;
+        }
+
+        virtual ~TaskSetCurrentSizeOnDevice()
+        {
+            notify(this->myId, SETVALUE, nullptr);
+        }
+
+        virtual void init()
+        {
+            setSize();
+        }
+
+        bool executeIntern()
+        {
+            return isFinished();
+        }
+
+        void event(id_t, EventType, IEventData*)
+        {
+        }
+
+        std::string toString()
+        {
+            return "TaskSetCurrentSizeOnDevice";
+        }
+
+    private:
+        void setSize()
+        {
+            auto sizePtr = destination->getCurrentSizeOnDevicePointer();
+            CUPLA_KERNEL(KernelSetValueOnDeviceMemory)(1, 1, 0, this->getCudaStream())(sizePtr, size);
+
+            activate();
+        }
+
+        DeviceBuffer<TYPE, DIM>* destination;
+        const size_t size;
+    };
+
+} // namespace pmacc
diff --git a/include/pmacc/eventSystem/tasks/TaskSetValue.hpp b/include/pmacc/eventSystem/tasks/TaskSetValue.hpp
index a69bc2a193..4e97483724 100644
--- a/include/pmacc/eventSystem/tasks/TaskSetValue.hpp
+++ b/include/pmacc/eventSystem/tasks/TaskSetValue.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Felix Schmitt, Heiko Burau, Rene Widera,
+/* Copyright 2013-2021 Felix Schmitt, Heiko Burau, Rene Widera,
  *                     Benjamin Worpitz
  *
  * This file is part of PMacc.
@@ -37,329 +37,260 @@
 #include <boost/type_traits.hpp>
 
 
-
-
 namespace pmacc
 {
-namespace taskSetValueHelper
-{
-
-/** define access operation for non-pointer types
- */
-template<typename T_Type, bool isPointer>
-struct Value
-{
-    typedef const T_Type type;
-
-    HDINLINE type& operator()(type& v) const
+    namespace taskSetValueHelper
     {
-        return v;
-    }
-};
+        /** define access operation for non-pointer types
+         */
+        template<typename T_Type, bool isPointer>
+        struct Value
+        {
+            typedef const T_Type type;
 
-/** define access operation for pointer types
- *
- * access first element of a pointer
- */
-template<typename T_Type>
-struct Value<T_Type, true>
-{
-    typedef const T_Type PtrType;
-    typedef const typename boost::remove_pointer<PtrType>::type type;
+            HDINLINE type& operator()(type& v) const
+            {
+                return v;
+            }
+        };
+
+        /** define access operation for pointer types
+         *
+         * access first element of a pointer
+         */
+        template<typename T_Type>
+        struct Value<T_Type, true>
+        {
+            typedef const T_Type PtrType;
+            typedef const typename boost::remove_pointer<PtrType>::type type;
 
-    HDINLINE type& operator()(PtrType v) const
-    {
-        return *v;
-    }
-};
+            HDINLINE type& operator()(PtrType v) const
+            {
+                return *v;
+            }
+        };
 
-/** Get access to a value from a pointer or reference with the same method
- */
-template<typename T_Type>
-HDINLINE typename Value<T_Type, boost::is_pointer<T_Type>::value >::type&
-getValue(T_Type& value)
-{
-    typedef Value<T_Type, boost::is_pointer<T_Type>::value > Functor;
-    return Functor()(value);
-}
+        /** Get access to a value from a pointer or reference with the same method
+         */
+        template<typename T_Type>
+        HDINLINE typename Value<T_Type, boost::is_pointer<T_Type>::value>::type& getValue(T_Type& value)
+        {
+            typedef Value<T_Type, boost::is_pointer<T_Type>::value> Functor;
+            return Functor()(value);
+        }
 
-}
+    } // namespace taskSetValueHelper
 
-/** set a value to all elements of a box
- *
- * @tparam T_numWorkers number of workers
- * @tparam T_xChunkSize number of elements in x direction to prepare with one cuda block
- */
-template<
-    uint32_t T_numWorkers,
-    uint32_t T_xChunkSize
->
-struct KernelSetValue
-{
-    /** set value to all elements
+    /** set a value to all elements of a box
      *
-     * @tparam T_DataBox pmacc::DataBox, type of the memory box
-     * @tparam T_ValueType type of the value
-     * @tparam T_SizeVecType pmacc::math::Vector, index type
-     * @tparam T_Acc alpaka accelerator type
-     *
-     * @param memBox box of which all elements shall be set to value
-     * @param value value to set to all elements of memBox
-     * @param size extents of memBox
+     * @tparam T_numWorkers number of workers
+     * @tparam T_xChunkSize number of elements in x direction to prepare with one cupla block
      */
-    template<
-        typename T_DataBox,
-        typename T_ValueType,
-        typename T_SizeVecType,
-        typename T_Acc
-    >
-    DINLINE void
-    operator()(
-        T_Acc const & acc,
-        T_DataBox & memBox,
-        T_ValueType const & value,
-        T_SizeVecType const & size
-    ) const
+    template<uint32_t T_numWorkers, uint32_t T_xChunkSize>
+    struct KernelSetValue
     {
-        using namespace mappings::threads;
-        using SizeVecType = T_SizeVecType;
-
-        SizeVecType const blockIndex( blockIdx );
-        SizeVecType blockSize( SizeVecType::create( 1 ) );
-        blockSize.x( ) = T_xChunkSize;
-
-        constexpr uint32_t numWorkers = T_numWorkers;
-        uint32_t const workerIdx = threadIdx.x;
-
-        ForEachIdx<
-            IdxConfig<
-                T_xChunkSize,
-                numWorkers
-            >
-        >{ workerIdx }(
-            [&](
-                uint32_t const linearIdx,
-                uint32_t const
-            )
-            {
-                auto virtualWorkerIdx( SizeVecType::create( 0 ) );
-                virtualWorkerIdx.x( ) = linearIdx;
+        /** set value to all elements
+         *
+         * @tparam T_DataBox pmacc::DataBox, type of the memory box
+         * @tparam T_ValueType type of the value
+         * @tparam T_SizeVecType pmacc::math::Vector, index type
+         * @tparam T_Acc alpaka accelerator type
+         *
+         * @param memBox box of which all elements shall be set to value
+         * @param value value to set to all elements of memBox
+         * @param size extents of memBox
+         */
+        template<typename T_DataBox, typename T_ValueType, typename T_SizeVecType, typename T_Acc>
+        DINLINE void operator()(
+            T_Acc const& acc,
+            T_DataBox& memBox,
+            T_ValueType const& value,
+            T_SizeVecType const& size) const
+        {
+            using namespace mappings::threads;
+            using SizeVecType = T_SizeVecType;
 
-                SizeVecType const idx( blockSize * blockIndex + virtualWorkerIdx );
-                if( idx.x() < size.x() )
-                    memBox( idx ) = taskSetValueHelper::getValue( value );
-            }
-        );
-    }
-};
+            SizeVecType const blockIndex(cupla::blockIdx(acc));
+            SizeVecType blockSize(SizeVecType::create(1));
+            blockSize.x() = T_xChunkSize;
 
-template <class TYPE, unsigned DIM>
-class DeviceBuffer;
+            constexpr uint32_t numWorkers = T_numWorkers;
+            uint32_t const workerIdx = cupla::threadIdx(acc).x;
 
-/** Set all cells of a GridBuffer on the device to a given value
- *
- * T_ValueType  = data type (e.g. float, float2)
- * T_dim   = dimension of the GridBuffer
- * T_isSmallValue = true if T_ValueType can be send via kernel parameter (on cuda T_ValueType must be smaller than 256 byte)
- */
-template <class T_ValueType, unsigned T_dim, bool T_isSmallValue>
-class TaskSetValue;
+            ForEachIdx<IdxConfig<T_xChunkSize, numWorkers>>{workerIdx}([&](uint32_t const linearIdx, uint32_t const) {
+                auto virtualWorkerIdx(SizeVecType::create(0));
+                virtualWorkerIdx.x() = linearIdx;
 
-template <class T_ValueType, unsigned T_dim>
-class TaskSetValueBase : public StreamTask
-{
-public:
-    typedef T_ValueType ValueType;
-    static constexpr uint32_t dim = T_dim;
+                SizeVecType const idx(blockSize * blockIndex + virtualWorkerIdx);
+                if(idx.x() < size.x())
+                    memBox(idx) = taskSetValueHelper::getValue(value);
+            });
+        }
+    };
 
-    TaskSetValueBase(DeviceBuffer<ValueType, dim>& dst, const ValueType& value) :
-    StreamTask(),
-    value(value)
-    {
-        this->destination = &dst;
-    }
+    template<class TYPE, unsigned DIM>
+    class DeviceBuffer;
 
-    virtual ~TaskSetValueBase()
+    /** Set all cells of a GridBuffer on the device to a given value
+     *
+     * T_ValueType  = data type (e.g. float, float2)
+     * T_dim   = dimension of the GridBuffer
+     * T_isSmallValue = true if T_ValueType can be sent via kernel parameter (on cupla T_ValueType must be smaller than
+     * 256 bytes)
+     */
+    template<class T_ValueType, unsigned T_dim, bool T_isSmallValue>
+    class TaskSetValue;
+
+    template<class T_ValueType, unsigned T_dim>
+    class TaskSetValueBase : public StreamTask
     {
-        notify(this->myId, SETVALUE, nullptr);
+    public:
+        typedef T_ValueType ValueType;
+        static constexpr uint32_t dim = T_dim;
 
-    }
+        TaskSetValueBase(DeviceBuffer<ValueType, dim>& dst, const ValueType& value) : StreamTask(), value(value)
+        {
+            this->destination = &dst;
+        }
 
-    virtual void init() = 0;
+        virtual ~TaskSetValueBase()
+        {
+            notify(this->myId, SETVALUE, nullptr);
+        }
 
-    bool executeIntern()
-    {
-        return isFinished();
-    }
+        virtual void init() = 0;
 
-    void event(id_t, EventType, IEventData*)
-    {
-    }
+        bool executeIntern()
+        {
+            return isFinished();
+        }
 
-protected:
+        void event(id_t, EventType, IEventData*)
+        {
+        }
 
-    std::string toString()
+    protected:
+        std::string toString()
+        {
+            return "TaskSetValue";
+        }
+
+        DeviceBuffer<ValueType, dim>* destination;
+        ValueType value;
+    };
+
+    /** implementation for small values (<= 256byte)
+     */
+    template<class T_ValueType, unsigned T_dim>
+    class TaskSetValue<T_ValueType, T_dim, true> : public TaskSetValueBase<T_ValueType, T_dim>
     {
-        return "TaskSetValue";
-    }
+    public:
+        typedef T_ValueType ValueType;
+        static constexpr uint32_t dim = T_dim;
 
-    DeviceBuffer<ValueType, dim> *destination;
-    ValueType value;
-};
+        TaskSetValue(DeviceBuffer<ValueType, dim>& dst, const ValueType& value)
+            : TaskSetValueBase<ValueType, dim>(dst, value)
+        {
+        }
 
-/** implementation for small values (<= 256byte)
- */
-template <class T_ValueType, unsigned T_dim>
-class TaskSetValue<T_ValueType, T_dim, true> : public TaskSetValueBase<T_ValueType, T_dim>
-{
-public:
-    typedef T_ValueType ValueType;
-    static constexpr uint32_t dim = T_dim;
+        virtual ~TaskSetValue()
+        {
+        }
 
-    TaskSetValue(DeviceBuffer<ValueType, dim>& dst, const ValueType& value) :
-    TaskSetValueBase<ValueType, dim>(dst, value)
-    {
-    }
+        virtual void init()
+        {
+            // number of elements in destination
+            size_t const current_size = this->destination->getCurrentSize();
+            // n-dimensional size of destination based on `current_size`
+            DataSpace<dim> const area_size(this->destination->getCurrentDataSpace(current_size));
 
-    virtual ~TaskSetValue()
-    {
+            if(area_size.productOfComponents() != 0)
+            {
+                auto gridSize = area_size;
 
-    }
+                /* number of elements in x direction used to chunk the destination buffer
+                 * for block parallel processing
+                 */
+                constexpr uint32_t xChunkSize = 256;
+                constexpr uint32_t numWorkers = traits::GetNumWorkers<xChunkSize>::value;
 
-    virtual void init()
-    {
-        // number of elements in destination
-        size_t const current_size = this->destination->getCurrentSize( );
-        // n-dimensional size of destination based on `current_size`
-        DataSpace< dim > const area_size( this->destination->getCurrentDataSpace( current_size ) );
+                // number of blocks in x direction
+                gridSize.x() = ceil(static_cast<double>(gridSize.x()) / static_cast<double>(xChunkSize));
 
-        if( area_size.productOfComponents() != 0 )
-        {
-            auto gridSize = area_size;
-
-            /* number of elements in x direction used to chunk the destination buffer
-             * for block parallel processing
-             */
-            constexpr uint32_t xChunkSize = 256;
-            constexpr uint32_t numWorkers = traits::GetNumWorkers<
-                xChunkSize
-            >::value;
-
-            // number of blocks in x direction
-            gridSize.x() = ceil(
-                static_cast< double >( gridSize.x( ) ) /
-                static_cast< double >( xChunkSize )
-           );
-
-            auto destBox = this->destination->getDataBox( );
-            CUPLA_KERNEL(
-                KernelSetValue<
-                    numWorkers,
-                    xChunkSize
-                >
-            )(
-                gridSize.toDim3(),
-                numWorkers,
-                0,
-                this->getCudaStream( )
-            )(
-                destBox,
-                this->value,
-                area_size
-            );
+                auto destBox = this->destination->getDataBox();
+                CUPLA_KERNEL(KernelSetValue<numWorkers, xChunkSize>)
+                (gridSize.toDim3(), numWorkers, 0, this->getCudaStream())(destBox, this->value, area_size);
+            }
+            this->activate();
         }
-        this->activate( );
-    }
-};
-
-/** implementation for big values (>256 byte)
- *
- * This class uses CUDA memcopy to copy an instance of T_ValueType to the GPU
- * and runs a kernel which assigns this value to all cells.
- */
-template <class T_ValueType, unsigned T_dim>
-class TaskSetValue<T_ValueType, T_dim, false> : public TaskSetValueBase<T_ValueType, T_dim>
-{
-public:
-    typedef T_ValueType ValueType;
-    static constexpr uint32_t dim = T_dim;
+    };
 
-    TaskSetValue(DeviceBuffer<ValueType, dim>& dst, const ValueType& value) :
-    TaskSetValueBase<ValueType, dim>(dst, value), valuePointer_host(nullptr)
+    /** implementation for big values (>256 byte)
+     *
+     * This class uses CUDA memcopy to copy an instance of T_ValueType to the GPU
+     * and runs a kernel which assigns this value to all cells.
+     */
+    template<class T_ValueType, unsigned T_dim>
+    class TaskSetValue<T_ValueType, T_dim, false> : public TaskSetValueBase<T_ValueType, T_dim>
     {
-    }
+    public:
+        typedef T_ValueType ValueType;
+        static constexpr uint32_t dim = T_dim;
 
-    virtual ~TaskSetValue()
-    {
-        if (valuePointer_host != nullptr)
+        TaskSetValue(DeviceBuffer<ValueType, dim>& dst, const ValueType& value)
+            : TaskSetValueBase<ValueType, dim>(dst, value)
+            , valuePointer_host(nullptr)
         {
-            CUDA_CHECK_NO_EXCEPT(cudaFreeHost(valuePointer_host));
-            valuePointer_host = nullptr;
         }
-    }
 
-    void init()
-    {
-        size_t current_size = this->destination->getCurrentSize();
-        const DataSpace<dim> area_size(this->destination->getCurrentDataSpace(current_size));
-        if(area_size.productOfComponents() != 0)
+        virtual ~TaskSetValue()
         {
-            auto gridSize = area_size;
-
-            /* number of elements in x direction used to chunk the destination buffer
-             * for block parallel processing
-             */
-            constexpr int xChunkSize = 256;
-            constexpr uint32_t numWorkers = traits::GetNumWorkers<
-                xChunkSize
-            >::value;
-
-            // number of blocks in x direction
-            gridSize.x() = ceil(
-                static_cast< double >( gridSize.x( ) ) /
-                static_cast< double >( xChunkSize )
-            );
-
-            ValueType* devicePtr = this->destination->getPointer();
-
-            CUDA_CHECK( cudaMallocHost(
-                (void**)&valuePointer_host,
-                sizeof( ValueType )
-            ));
-            *valuePointer_host = this->value; //copy value to new place
-
-            CUDA_CHECK( cudaMemcpyAsync(
-                devicePtr,
-                valuePointer_host,
-                sizeof( ValueType ),
-                cudaMemcpyHostToDevice,
-                this->getCudaStream( )
-            ));
-
-            auto destBox = this->destination->getDataBox( );
-            CUPLA_KERNEL(
-                KernelSetValue<
-                    numWorkers,
-                    xChunkSize
-                >
-            )(
-                gridSize.toDim3(),
-                numWorkers,
-                0,
-                this->getCudaStream()
-            )(
-                destBox,
-                devicePtr,
-                area_size
-            );
+            if(valuePointer_host != nullptr)
+            {
+                CUDA_CHECK_NO_EXCEPT(cuplaFreeHost(valuePointer_host));
+                valuePointer_host = nullptr;
+            }
         }
 
-        this->activate();
-    }
+        void init()
+        {
+            size_t current_size = this->destination->getCurrentSize();
+            const DataSpace<dim> area_size(this->destination->getCurrentDataSpace(current_size));
+            if(area_size.productOfComponents() != 0)
+            {
+                auto gridSize = area_size;
+
+                /* number of elements in x direction used to chunk the destination buffer
+                 * for block parallel processing
+                 */
+                constexpr int xChunkSize = 256;
+                constexpr uint32_t numWorkers = traits::GetNumWorkers<xChunkSize>::value;
+
+                // number of blocks in x direction
+                gridSize.x() = ceil(static_cast<double>(gridSize.x()) / static_cast<double>(xChunkSize));
+
+                ValueType* devicePtr = this->destination->getPointer();
 
-private:
-    ValueType *valuePointer_host;
+                CUDA_CHECK(cuplaMallocHost((void**) &valuePointer_host, sizeof(ValueType)));
+                *valuePointer_host = this->value; // copy value to new place
+
+                CUDA_CHECK(cuplaMemcpyAsync(
+                    devicePtr,
+                    valuePointer_host,
+                    sizeof(ValueType),
+                    cuplaMemcpyHostToDevice,
+                    this->getCudaStream()));
+
+                auto destBox = this->destination->getDataBox();
+                CUPLA_KERNEL(KernelSetValue<numWorkers, xChunkSize>)
+                (gridSize.toDim3(), numWorkers, 0, this->getCudaStream())(destBox, devicePtr, area_size);
+            }
+
+            this->activate();
+        }
 
-};
+    private:
+        ValueType* valuePointer_host;
+    };
 
-} //namespace pmacc
+} // namespace pmacc
diff --git a/include/pmacc/eventSystem/transactions/Transaction.hpp b/include/pmacc/eventSystem/transactions/Transaction.hpp
index 37ab193467..33aa581f2b 100644
--- a/include/pmacc/eventSystem/transactions/Transaction.hpp
+++ b/include/pmacc/eventSystem/transactions/Transaction.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Felix Schmitt, Rene Widera, Benjamin Worpitz,
+/* Copyright 2013-2021 Felix Schmitt, Rene Widera, Benjamin Worpitz,
  *                     Alexander Grund
  *
  * This file is part of PMacc.
@@ -26,54 +26,51 @@
 
 namespace pmacc
 {
-
-class EventStream;
-
-/**
- * Represents a single transaction in the task/event synchronization system.
- */
-class Transaction
-{
-public:
-
-    /**
-     * Constructor.
-     *
-     * @param event initial EventTask for base event
-     */
-    HINLINE Transaction(EventTask event);
+    class EventStream;
 
     /**
-     * Adds event to the base event of this transaction.
-     *
-     * @param event EventTask to add to base event
-     * @return new base event
+     * Represents a single transaction in the task/event synchronization system.
      */
-    HINLINE EventTask setTransactionEvent(const EventTask& event);
+    class Transaction
+    {
+    public:
+        /**
+         * Constructor.
+         *
+         * @param event initial EventTask for base event
+         */
+        HINLINE Transaction(EventTask event);
 
-    /**
-     * Returns the current base event.
-     *
-     * @return current base event
-     */
-    HINLINE EventTask getTransactionEvent();
+        /**
+         * Adds event to the base event of this transaction.
+         *
+         * @param event EventTask to add to base event
+         * @return new base event
+         */
+        HINLINE EventTask setTransactionEvent(const EventTask& event);
 
-    /**
-     * Performs an operation on the transaction which leads to synchronization.
-     *
-     * @param operation type of operation to perform, defines resulting synchronization.
-     */
-    HINLINE void operation(ITask::TaskType operation);
+        /**
+         * Returns the current base event.
+         *
+         * @return current base event
+         */
+        HINLINE EventTask getTransactionEvent();
 
-    /* Get a EventStream which include all dependencies
-     * @param operation type of operation to perform
-     * @return EventStream with solved dependencies
-     */
-    HINLINE EventStream* getEventStream(ITask::TaskType operation);
+        /**
+         * Performs an operation on the transaction which leads to synchronization.
+         *
+         * @param operation type of operation to perform, defines resulting synchronization.
+         */
+        HINLINE void operation(ITask::TaskType operation);
 
-private:
-    EventTask baseEvent;
-};
+        /* Get an EventStream which includes all dependencies
+         * @param operation type of operation to perform
+         * @return EventStream with solved dependencies
+         */
+        HINLINE EventStream* getEventStream(ITask::TaskType operation);
 
-}
+    private:
+        EventTask baseEvent;
+    };
 
+} // namespace pmacc
diff --git a/include/pmacc/eventSystem/transactions/Transaction.tpp b/include/pmacc/eventSystem/transactions/Transaction.tpp
index 51738a2e04..1f1f0c6d10 100644
--- a/include/pmacc/eventSystem/transactions/Transaction.tpp
+++ b/include/pmacc/eventSystem/transactions/Transaction.tpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Rene Widera, Benjamin Worpitz
+/* Copyright 2013-2021 Rene Widera, Benjamin Worpitz
  *
  * This file is part of PMacc.
  *
@@ -29,60 +29,58 @@
 
 namespace pmacc
 {
+    Transaction::Transaction(EventTask event) : baseEvent(event)
+    {
+    }
 
-Transaction::Transaction( EventTask event ) : baseEvent( event )
-{
-
-}
-
-EventTask Transaction::setTransactionEvent( const EventTask& event )
-{
-    baseEvent += event;
-    return baseEvent;
-}
-
-EventTask Transaction::getTransactionEvent( )
-{
-    return baseEvent;
-}
+    EventTask Transaction::setTransactionEvent(const EventTask& event)
+    {
+        baseEvent += event;
+        return baseEvent;
+    }
 
-void Transaction::operation( ITask::TaskType operation )
-{
-    if ( operation == ITask::TASK_CUDA )
+    EventTask Transaction::getTransactionEvent()
     {
-        Manager &manager = Environment<>::get( ).Manager( );
+        return baseEvent;
+    }
 
-        ITask* baseTask = manager.getITaskIfNotFinished( this->baseEvent.getTaskId( ) );
-        if ( baseTask != nullptr )
+    void Transaction::operation(ITask::TaskType operation)
+    {
+        if(operation == ITask::TASK_DEVICE)
         {
-            if ( baseTask->getTaskType( ) == ITask::TASK_CUDA )
+            Manager& manager = Environment<>::get().Manager();
+
+            ITask* baseTask = manager.getITaskIfNotFinished(this->baseEvent.getTaskId());
+            if(baseTask != nullptr)
             {
-                /* no blocking is needed */
-                return;
+                if(baseTask->getTaskType() == ITask::TASK_DEVICE)
+                {
+                    /* no blocking is needed */
+                    return;
+                }
             }
         }
+        baseEvent.waitForFinished();
     }
-    baseEvent.waitForFinished( );
-}
 
-EventStream* Transaction::getEventStream( ITask::TaskType )
-{
-    Manager &manager = Environment<>::get( ).Manager( );
-    ITask* baseTask = manager.getITaskIfNotFinished( this->baseEvent.getTaskId( ) );
-
-    if ( baseTask != nullptr )
+    EventStream* Transaction::getEventStream(ITask::TaskType)
     {
-        if ( baseTask->getTaskType( ) == ITask::TASK_CUDA )
+        Manager& manager = Environment<>::get().Manager();
+        ITask* baseTask = manager.getITaskIfNotFinished(this->baseEvent.getTaskId());
+
+        if(baseTask != nullptr)
         {
-            /* `StreamTask` from previous task must be reused to guarantee
-             * that the dependency chain not brake
-             */
-            StreamTask* task = static_cast<StreamTask*> ( baseTask );
-            return task->getEventStream( );
+            if(baseTask->getTaskType() == ITask::TASK_DEVICE)
+            {
+                /* `StreamTask` from previous task must be reused to guarantee
+                 * that the dependency chain does not break
+                 */
+                StreamTask* task = static_cast<StreamTask*>(baseTask);
+                return task->getEventStream();
+            }
+            baseEvent.waitForFinished();
         }
-        baseEvent.waitForFinished( );
+        return Environment<>::get().StreamController().getNextStream();
     }
-    return Environment<>::get( ).StreamController( ).getNextStream( );
-}
 
-} //namespace pmacc
+} // namespace pmacc
diff --git a/include/pmacc/eventSystem/transactions/TransactionManager.hpp b/include/pmacc/eventSystem/transactions/TransactionManager.hpp
index 1ce012b40c..ed7f6e6a77 100644
--- a/include/pmacc/eventSystem/transactions/TransactionManager.hpp
+++ b/include/pmacc/eventSystem/transactions/TransactionManager.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Felix Schmitt, Rene Widera, Benjamin Worpitz
+/* Copyright 2013-2021 Felix Schmitt, Rene Widera, Benjamin Worpitz
  *
  * This file is part of PMacc.
  *
@@ -28,78 +28,74 @@
 
 namespace pmacc
 {
-// forward declaration
-template<unsigned DIM>
-class Environment;
+    // forward declaration
+    template<unsigned DIM>
+    class Environment;
 
-class EventStream;
-
-/**
- * Manages the task/event synchronization system using task 'transactions'.
- * Transactions are grouped on a stack.
- */
-class TransactionManager
-{
-public:
-    /**
-     * Destructor.
-     */
-    virtual ~TransactionManager() /*noexcept(false)*/;
-
-    /**
-     * Adds a new transaction to the stack.
-     *
-     * @param serialEvent initial base event for new transaction
-     */
-    void startTransaction(EventTask serialEvent = EventTask());
+    class EventStream;
 
     /**
-     * Removes the top-most transaction from the stack.
-     *
-     * @return the base event of the removed transaction
+     * Manages the task/event synchronization system using task 'transactions'.
+     * Transactions are grouped on a stack.
      */
-    EventTask endTransaction();
-
-    /**
-     * Synchronizes a blocking operation with events on the top-most transaction.
-     *
-     * @param op operation type for synchronization
-     * @return an EventStream which can be used for StreamTasks
-     */
-    void startOperation(ITask::TaskType op);
-
-    /**
-     * Adds event to the base event of the top-most transaction.
-     *
-     * @param event event to add to base event
-     * @return new base event
-     */
-    EventTask setTransactionEvent(const EventTask& event);
-
-    /**
-     * Returns the base event of the top-most transaction.
-     *
-     * @return base event
-     */
-    EventTask getTransactionEvent();
-
-    EventStream* getEventStream(ITask::TaskType op);
-
-private:
-
-    friend struct detail::Environment;
-
-    TransactionManager();
-
-    TransactionManager(const TransactionManager& cc);
-
-    static TransactionManager& getInstance();
-
-    std::stack<Transaction> transactions;
-};
-
-
-}
-
-
-
+    class TransactionManager
+    {
+    public:
+        /**
+         * Destructor.
+         */
+        virtual ~TransactionManager() /*noexcept(false)*/;
+
+        /**
+         * Adds a new transaction to the stack.
+         *
+         * @param serialEvent initial base event for new transaction
+         */
+        void startTransaction(EventTask serialEvent = EventTask());
+
+        /**
+         * Removes the top-most transaction from the stack.
+         *
+         * @return the base event of the removed transaction
+         */
+        EventTask endTransaction();
+
+        /**
+         * Synchronizes a blocking operation with events on the top-most transaction.
+         *
+         * @param op operation type for synchronization
+         * @note returns nothing; an EventStream for StreamTasks is obtained via getEventStream()
+         */
+        void startOperation(ITask::TaskType op);
+
+        /**
+         * Adds event to the base event of the top-most transaction.
+         *
+         * @param event event to add to base event
+         * @return new base event
+         */
+        EventTask setTransactionEvent(const EventTask& event);
+
+        /**
+         * Returns the base event of the top-most transaction.
+         *
+         * @return base event
+         */
+        EventTask getTransactionEvent();
+
+        EventStream* getEventStream(ITask::TaskType op);
+
+    private:
+        friend struct detail::Environment;
+
+        TransactionManager();
+
+        TransactionManager(const TransactionManager& cc);
+
+        static TransactionManager& getInstance();
+
+        std::stack<Transaction> transactions;
+    };
+
+
+} // namespace pmacc
diff --git a/include/pmacc/eventSystem/transactions/TransactionManager.tpp b/include/pmacc/eventSystem/transactions/TransactionManager.tpp
index 32e33f18f6..6cbc0bc881 100644
--- a/include/pmacc/eventSystem/transactions/TransactionManager.tpp
+++ b/include/pmacc/eventSystem/transactions/TransactionManager.tpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Felix Schmitt, Rene Widera, Benjamin Worpitz
+/* Copyright 2013-2021 Felix Schmitt, Rene Widera, Benjamin Worpitz
  *
  * This file is part of PMacc.
  *
@@ -28,80 +28,78 @@
 
 namespace pmacc
 {
-
-inline TransactionManager::~TransactionManager() /*noexcept(false)*/
-{
-    if(transactions.size() == 0)
-        std::cerr << "[PMacc] [TransactionManager] "
-                  << "Missing transaction on the stack!" << std::endl;
-    else if(transactions.size() > 1)
-        std::cerr << "[PMacc] [TransactionManager] "
-                  << "Unfinished transactions on the stack" << std::endl;
-    transactions.pop( );
-}
-
-inline TransactionManager::TransactionManager( )
-{
-    startTransaction( EventTask( ) );
-}
-
-inline TransactionManager::TransactionManager( const TransactionManager& )
-{
-
-}
-
-inline void TransactionManager::startTransaction( EventTask serialEvent )
-{
-    transactions.push( Transaction( serialEvent ) );
-}
-
-inline EventTask TransactionManager::endTransaction( )
-{
-    if ( transactions.size( ) == 0 )
-        throw std::runtime_error( "Calling endTransaction on empty transaction stack is not allowed" );
-
-    EventTask event = transactions.top( ).getTransactionEvent( );
-    transactions.pop( );
-    return event;
-}
-
-inline void TransactionManager::startOperation( ITask::TaskType op )
-{
-    if ( transactions.size( ) == 0 )
-        throw std::runtime_error( "Calling startOperation on empty transaction stack is not allowed" );
-
-    transactions.top( ).operation( op );
-}
-
-inline EventStream* TransactionManager::getEventStream( ITask::TaskType op )
-{
-    if ( transactions.size( ) == 0 )
-        throw std::runtime_error( "Calling startOperation on empty transaction stack is not allowed" );
-
-    return transactions.top( ).getEventStream( op );
-}
-
-inline EventTask TransactionManager::setTransactionEvent( const EventTask& event )
-{
-    if ( transactions.size( ) == 0 )
-        throw std::runtime_error( "Calling setTransactionEvent on empty transaction stack is not allowed" );
-
-    return transactions.top( ).setTransactionEvent( event );
-}
-
-inline EventTask TransactionManager::getTransactionEvent( )
-{
-    if ( transactions.size( ) == 0 )
-        throw std::runtime_error( "Calling getTransactionEvent on empty transaction stack is not allowed" );
-
-    return transactions.top( ).getTransactionEvent( );
-}
-
-inline TransactionManager& TransactionManager::getInstance( )
-{
-    static TransactionManager instance;
-    return instance;
-}
-
-
-}
+    inline TransactionManager::~TransactionManager() /*noexcept(false)*/
+    {
+        if(transactions.empty())
+            std::cerr << "[PMacc] [TransactionManager] Missing transaction on the stack!" << std::endl;
+        else if(transactions.size() > 1)
+            std::cerr << "[PMacc] [TransactionManager] Unfinished transactions on the stack" << std::endl;
+        // pop() on an empty std::stack is undefined behavior, so only drop an existing transaction
+        if(!transactions.empty())
+            transactions.pop();
+    }
+
+    inline TransactionManager::TransactionManager()
+    {
+        startTransaction(EventTask());
+    }
+
+    inline TransactionManager::TransactionManager(const TransactionManager&)
+    {
+    }
+
+    inline void TransactionManager::startTransaction(EventTask serialEvent)
+    {
+        transactions.push(Transaction(serialEvent));
+    }
+
+    inline EventTask TransactionManager::endTransaction()
+    {
+        if(transactions.size() == 0)
+            throw std::runtime_error("Calling endTransaction on empty transaction stack is not allowed");
+
+        EventTask event = transactions.top().getTransactionEvent();
+        transactions.pop();
+        return event;
+    }
+
+    inline void TransactionManager::startOperation(ITask::TaskType op)
+    {
+        if(transactions.size() == 0)
+            throw std::runtime_error("Calling startOperation on empty transaction stack is not allowed");
+
+        transactions.top().operation(op);
+    }
+
+    inline EventStream* TransactionManager::getEventStream(ITask::TaskType op)
+    {
+        if(transactions.size() == 0)
+            throw std::runtime_error("Calling getEventStream on empty transaction stack is not allowed");
+        // forward the request to the top-most transaction on the stack
+        return transactions.top().getEventStream(op);
+    }
+
+    inline EventTask TransactionManager::setTransactionEvent(const EventTask& event)
+    {
+        if(transactions.size() == 0)
+            throw std::runtime_error("Calling setTransactionEvent on empty transaction stack is not allowed");
+
+        return transactions.top().setTransactionEvent(event);
+    }
+
+    inline EventTask TransactionManager::getTransactionEvent()
+    {
+        if(transactions.size() == 0)
+            throw std::runtime_error("Calling getTransactionEvent on empty transaction stack is not allowed");
+
+        return transactions.top().getTransactionEvent();
+    }
+
+    inline TransactionManager& TransactionManager::getInstance()
+    {
+        static TransactionManager instance;
+        return instance;
+    }
+
+
+} // namespace pmacc
diff --git a/include/pmacc/fields/SimulationFieldHelper.hpp b/include/pmacc/fields/SimulationFieldHelper.hpp
index a074ae3cc0..7ea42322a5 100644
--- a/include/pmacc/fields/SimulationFieldHelper.hpp
+++ b/include/pmacc/fields/SimulationFieldHelper.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Felix Schmitt, Rene Widera, Benjamin Worpitz,
+/* Copyright 2013-2021 Felix Schmitt, Rene Widera, Benjamin Worpitz,
  *                     Alexander Grund
  *
  * This file is part of PMacc.
@@ -28,37 +28,37 @@
 
 namespace pmacc
 {
+    template<class CellDescription>
+    class SimulationFieldHelper
+    {
+    public:
+        typedef CellDescription MappingDesc;
 
-template<class CellDescription>
-class SimulationFieldHelper
-{
-public:
-
-    typedef CellDescription MappingDesc;
-
-    SimulationFieldHelper(CellDescription description) :
-    cellDescription(description)
-    {}
+        SimulationFieldHelper(CellDescription description) : cellDescription(description)
+        {
+        }
 
-    virtual ~SimulationFieldHelper(){}
+        virtual ~SimulationFieldHelper()
+        {
+        }
 
-    /**
-     * Reset is as well used for init.
-     */
-    virtual void reset(uint32_t currentStep) = 0;
+        /**
+         * Reset is as well used for init.
+         */
+        virtual void reset(uint32_t currentStep) = 0;
 
-    /**
-     * Synchronize data from host to device.
-     */
-    virtual void syncToDevice() = 0;
+        /**
+         * Synchronize data from host to device.
+         */
+        virtual void syncToDevice() = 0;
 
-    CellDescription getCellDescription() const
-    {
-        return cellDescription;
-    }
+        CellDescription getCellDescription() const
+        {
+            return cellDescription;
+        }
 
-protected:
-    CellDescription cellDescription;
-};
+    protected:
+        CellDescription cellDescription;
+    };
 
-} //namespace pmacc
+} // namespace pmacc
diff --git a/include/pmacc/fields/operations/AddExchangeToBorder.hpp b/include/pmacc/fields/operations/AddExchangeToBorder.hpp
index 30088c2887..c1349af50e 100644
--- a/include/pmacc/fields/operations/AddExchangeToBorder.hpp
+++ b/include/pmacc/fields/operations/AddExchangeToBorder.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Heiko Burau, Rene Widera, Marco Garten,
+/* Copyright 2013-2021 Axel Huebl, Heiko Burau, Rene Widera, Marco Garten,
  *                     Benjamin Worpitz
  *
  * This file is part of PMacc.
@@ -34,208 +34,167 @@
 
 namespace pmacc
 {
-namespace fields
-{
-namespace operations
-{
-
-    /** Add field values from a received temporary buffer (exchange) to the local box (border)
-     *
-     * @tparam T_numWorkers number of workers
-     */
-    template< uint32_t T_numWorkers >
-    struct KernelAddExchangeToBorder
+    namespace fields
     {
-        /** add intermediate box to the border of the local box
-         *
-         * The `template< typename T> operator+( T const & rhs )` must be defined for
-         * the value type of exchangeBox and destBox.
-         *
-         * @tparam T_DestBox pmacc::DataBox, type of the local box
-         * @tparam T_ExchangeBox pmacc::ExchangeBox, type of the intermediate box
-         * @tparam T_Extent pmacc::DataSpace, type to describe n-dimensional sizes
-         * @tparam T_Mapping mapper functor type
-         *
-         * @param destBox box to a local field
-         * @param exchangeBox exchange box with guard data from the neighboring GPU
-         * @param exchangeSize dimensions of exchangeBox
-         * @param direction the direction of exchangeBox
-         * @param mapper functor to map a CUDA block to a supercell
-         */
-        template<
-            typename T_DestBox,
-            typename T_ExchangeBox,
-            typename T_Extent,
-            typename T_Mapping,
-            typename T_Acc
-        >
-        DINLINE void operator()(
-            T_Acc const & acc,
-            T_DestBox & destBox,
-            T_ExchangeBox const & exchangeBox,
-            T_Extent const & exchangeSize,
-            T_Extent const & direction,
-            T_Mapping const & mapper
-        ) const
+        namespace operations
         {
-            using namespace mappings::threads;
-
-            using SuperCellSize = typename T_Mapping::SuperCellSize;
-
-            // number of cells in a superCell
-            constexpr uint32_t numCells = pmacc::math::CT::volume< SuperCellSize >::type::value;
-            constexpr uint32_t numWorkers = T_numWorkers;
-            PMACC_CONSTEXPR_CAPTURE int dim = T_Mapping::Dim;
-
-            uint32_t const workerIdx = threadIdx.x;
-
-            DataSpace< dim > const blockCell(
-                mapper.getSuperCellIndex( DataSpace< dim >( blockIdx ) )
-                    * SuperCellSize::toRT()
-            );
-
-            // origin in area from local GPU
-            DataSpace< dim > nullSourceCell(
-                mapper.getSuperCellIndex( DataSpace< dim > () )
-                * SuperCellSize::toRT()
-            );
-
-            auto const numGuardSuperCells = mapper.getGuardingSuperCells();
-
-            ForEachIdx<
-                IdxConfig<
-                    numCells,
-                    numWorkers
-                >
-            >{ workerIdx }(
-                [&](
-                    uint32_t const linearIdx,
-                    uint32_t const
-                )
+            /** Add field values from a received temporary buffer (exchange) to the local box (border)
+             *
+             * @tparam T_numWorkers number of workers
+             */
+            template<uint32_t T_numWorkers>
+            struct KernelAddExchangeToBorder
+            {
+                /** add intermediate box to the border of the local box
+                 *
+                 * The `template< typename T> operator+( T const & rhs )` must be defined for
+                 * the value type of exchangeBox and destBox.
+                 *
+                 * @tparam T_DestBox pmacc::DataBox, type of the local box
+                 * @tparam T_ExchangeBox pmacc::ExchangeBox, type of the intermediate box
+                 * @tparam T_Extent pmacc::DataSpace, type to describe n-dimensional sizes
+                 * @tparam T_Mapping mapper functor type
+                 *
+                 * @param destBox box to a local field
+                 * @param exchangeBox exchange box with guard data from the neighboring GPU
+                 * @param exchangeSize dimensions of exchangeBox
+                 * @param direction the direction of exchangeBox
+                 * @param mapper functor to map a CUDA block to a supercell
+                 */
+                template<
+                    typename T_DestBox,
+                    typename T_ExchangeBox,
+                    typename T_Extent,
+                    typename T_Mapping,
+                    typename T_Acc>
+                DINLINE void operator()(
+                    T_Acc const& acc,
+                    T_DestBox& destBox,
+                    T_ExchangeBox const& exchangeBox,
+                    T_Extent const& exchangeSize,
+                    T_Extent const& direction,
+                    T_Mapping const& mapper) const
                 {
-                    // cell index within the superCell
-                    DataSpace< dim > const cellIdx = DataSpaceOperations< dim >::template map< SuperCellSize >( linearIdx );
-                    DataSpace< dim > targetCell( blockCell + cellIdx );
-                    DataSpace< dim > sourceCell( targetCell - nullSourceCell );
+                    using namespace mappings::threads;
 
-                    // supercell offset relative to the guard origin (in cells)
-                    DataSpace< dim > superCellOffsetInGuard( ( sourceCell / SuperCellSize::toRT() ) * SuperCellSize::toRT() );
+                    using SuperCellSize = typename T_Mapping::SuperCellSize;
 
-                    /* defines if the virtual worker needs to add the value from
-                     * the exchange box to the cell in the border
-                     */
-                    bool addValue = true;
+                    // number of cells in a superCell
+                    constexpr uint32_t numCells = pmacc::math::CT::volume<SuperCellSize>::type::value;
+                    constexpr uint32_t numWorkers = T_numWorkers;
+                    PMACC_CONSTEXPR_CAPTURE int dim = T_Mapping::Dim;
 
-                    for( uint32_t d = 0; d < dim; ++d )
-                    {
-                        if( direction[ d ] == 1 )
-                        {
-                            if(
-                                superCellOffsetInGuard[ d ] + cellIdx[ d ] <
-                                numGuardSuperCells[ d ] * SuperCellSize::toRT()[ d ] - exchangeSize[ d ]
-                            )
-                                addValue = false;
-                            sourceCell[ d ] -= numGuardSuperCells[ d ] * SuperCellSize::toRT()[ d ] - exchangeSize[ d ];
-                            targetCell[ d ] -= numGuardSuperCells[ d ] * SuperCellSize::toRT()[ d ];
-                        }
-                        else if( direction[ d ] == -1 )
+                    uint32_t const workerIdx = cupla::threadIdx(acc).x;
+
+                    DataSpace<dim> const blockCell(
+                        mapper.getSuperCellIndex(DataSpace<dim>(cupla::blockIdx(acc))) * SuperCellSize::toRT());
+
+                    // origin in area from local GPU
+                    DataSpace<dim> nullSourceCell(mapper.getSuperCellIndex(DataSpace<dim>()) * SuperCellSize::toRT());
+
+                    auto const numGuardSuperCells = mapper.getGuardingSuperCells();
+
+                    ForEachIdx<IdxConfig<numCells, numWorkers>>{
+                        workerIdx}([&](uint32_t const linearIdx, uint32_t const) {
+                        // cell index within the superCell
+                        DataSpace<dim> const cellIdx
+                            = DataSpaceOperations<dim>::template map<SuperCellSize>(linearIdx);
+                        DataSpace<dim> targetCell(blockCell + cellIdx);
+                        DataSpace<dim> sourceCell(targetCell - nullSourceCell);
+
+                        // supercell offset relative to the guard origin (in cells)
+                        DataSpace<dim> superCellOffsetInGuard(
+                            (sourceCell / SuperCellSize::toRT()) * SuperCellSize::toRT());
+
+                        /* defines if the virtual worker needs to add the value from
+                         * the exchange box to the cell in the border
+                         */
+                        bool addValue = true;
+
+                        for(uint32_t d = 0; d < dim; ++d)
                         {
-                            if( superCellOffsetInGuard[ d ] + cellIdx[ d ] >= exchangeSize[ d ] )
-                                addValue = false;
-                            targetCell[ d ] += numGuardSuperCells[ d ] * SuperCellSize::toRT()[ d ];
+                            if(direction[d] == 1)
+                            {
+                                if(superCellOffsetInGuard[d] + cellIdx[d]
+                                   < numGuardSuperCells[d] * SuperCellSize::toRT()[d] - exchangeSize[d])
+                                    addValue = false;
+                                sourceCell[d] -= numGuardSuperCells[d] * SuperCellSize::toRT()[d] - exchangeSize[d];
+                                targetCell[d] -= numGuardSuperCells[d] * SuperCellSize::toRT()[d];
+                            }
+                            else if(direction[d] == -1)
+                            {
+                                if(superCellOffsetInGuard[d] + cellIdx[d] >= exchangeSize[d])
+                                    addValue = false;
+                                targetCell[d] += numGuardSuperCells[d] * SuperCellSize::toRT()[d];
+                            }
                         }
-                    }
-                    if( addValue )
-                        destBox( targetCell ) += exchangeBox( sourceCell );
+                        if(addValue)
+                            destBox(targetCell) += exchangeBox(sourceCell);
+                    });
                 }
-            );
-        }
-    };
+            };
 
 
-    /** add a exchange buffer to the border of the local buffer
-     *
-     * CopyGuardToExchange is the opposite operation for the neighboring
-     * device to create an exchange which can be added with this functor.
-     */
-    struct AddExchangeToBorder
-    {
-        /** add exchange to border of the local buffer
-         *
-         * Add data cell-wise from the exchange to the border of the local buffer.
-         * The `template< typename T> operator+( T const & rhs )` must be defined for
-         * the value type of the buffer.
-         *
-         * @tparam T_DestBuffer pmacc::GridBuffer, type of the used buffer
-         * @tparam T_SuperCellSize pmacc::math::CT::vector, size of the supercell in each direction
-         *
-         * @param destBuffer destination buffer with exchanges
-         * @param superCellSize compile time supercell size
-         * @param exchangeType the exchange direction which needs to be copied
-         */
-        template<
-            typename T_DestBuffer,
-            typename T_SuperCellSize
-        >
-        void operator()(
-            T_DestBuffer & destBuffer,
-            T_SuperCellSize const & superCellSize,
-            uint32_t const exchangeType
-        ) const
-        {
-            boost::ignore_unused( superCellSize );
+            /** add an exchange buffer to the border of the local buffer
+             *
+             * CopyGuardToExchange is the opposite operation for the neighboring
+             * device to create an exchange which can be added with this functor.
+             */
+            struct AddExchangeToBorder
+            {
+                /** add exchange to border of the local buffer
+                 *
+                 * Add data cell-wise from the exchange to the border of the local buffer.
+                 * The `template< typename T> operator+=( T const & rhs )` must be defined for
+                 * the value type of the buffer.
+                 *
+                 * @tparam T_DestBuffer pmacc::GridBuffer, type of the used buffer
+                 * @tparam T_SuperCellSize pmacc::math::CT::vector, size of the supercell in each direction
+                 *
+                 * @param destBuffer destination buffer with exchanges
+                 * @param superCellSize compile time supercell size
+                 * @param exchangeType the exchange direction which needs to be copied
+                 */
+                template<typename T_DestBuffer, typename T_SuperCellSize>
+                void operator()(
+                    T_DestBuffer& destBuffer,
+                    T_SuperCellSize const& superCellSize,
+                    uint32_t const exchangeType) const
+                {
+                    boost::ignore_unused(superCellSize);
 
-            using SuperCellSize = T_SuperCellSize;
+                    using SuperCellSize = T_SuperCellSize;
 
-            constexpr int dim = T_SuperCellSize::dim;
+                    constexpr int dim = T_SuperCellSize::dim;
 
-            using MappingDesc = MappingDescription<
-                dim,
-                SuperCellSize
-            >;
+                    using MappingDesc = MappingDescription<dim, SuperCellSize>;
 
-            /* use only the x dimension to determine the number of supercells in the GUARD
-             *
-             * @warning pmacc restriction: all dimension must have the some number of guarding
-             * supercells
-             */
-            auto const numGuardSuperCells = destBuffer.getGridLayout().getGuard() /
-                SuperCellSize::toRT();
-
-            MappingDesc const mappingDesc(
-                destBuffer.getGridLayout().getDataSpace(),
-                numGuardSuperCells
-            );
-
-            ExchangeMapping<
-                GUARD,
-                MappingDesc
-            > mapper(
-                mappingDesc,
-                exchangeType
-            );
-
-            constexpr uint32_t numWorkers = pmacc::traits::GetNumWorkers<
-                pmacc::math::CT::volume< SuperCellSize >::type::value
-            >::value;
-
-            const DataSpace< dim > direction = Mask::getRelativeDirections< dim >(
-                mapper.getExchangeType( )
-            );
-
-            PMACC_KERNEL( KernelAddExchangeToBorder< numWorkers >{ } )(
-                mapper.getGridDim( ),
-                numWorkers
-            )(
-                destBuffer.getDeviceBuffer( ).getDataBox( ),
-                destBuffer.getReceiveExchange( exchangeType ).getDeviceBuffer( ).getDataBox( ),
-                destBuffer.getReceiveExchange( exchangeType ).getDeviceBuffer( ).getDataSpace( ),
-                direction,
-                mapper
-            );
-        }
-    };
-
-} // namespace operations
-} // namespace fields
+                    /* use only the x dimension to determine the number of supercells in the GUARD
+                     *
+                     * @warning pmacc restriction: all dimensions must have the same number of guarding
+                     * supercells
+                     */
+                    auto const numGuardSuperCells = destBuffer.getGridLayout().getGuard() / SuperCellSize::toRT();
+
+                    MappingDesc const mappingDesc(destBuffer.getGridLayout().getDataSpace(), numGuardSuperCells);
+
+                    ExchangeMapping<GUARD, MappingDesc> mapper(mappingDesc, exchangeType);
+
+                    constexpr uint32_t numWorkers
+                        = pmacc::traits::GetNumWorkers<pmacc::math::CT::volume<SuperCellSize>::type::value>::value;
+
+                    const DataSpace<dim> direction = Mask::getRelativeDirections<dim>(mapper.getExchangeType());
+
+                    PMACC_KERNEL(KernelAddExchangeToBorder<numWorkers>{})
+                    (mapper.getGridDim(), numWorkers)(
+                        destBuffer.getDeviceBuffer().getDataBox(),
+                        destBuffer.getReceiveExchange(exchangeType).getDeviceBuffer().getDataBox(),
+                        destBuffer.getReceiveExchange(exchangeType).getDeviceBuffer().getDataSpace(),
+                        direction,
+                        mapper);
+                }
+            };
+
+        } // namespace operations
+    } // namespace fields
 } // namespace pmacc
diff --git a/include/pmacc/fields/operations/CopyGuardToExchange.hpp b/include/pmacc/fields/operations/CopyGuardToExchange.hpp
index 374cf82310..9d1e56797e 100644
--- a/include/pmacc/fields/operations/CopyGuardToExchange.hpp
+++ b/include/pmacc/fields/operations/CopyGuardToExchange.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Heiko Burau, Rene Widera, Marco Garten,
+/* Copyright 2013-2021 Axel Huebl, Heiko Burau, Rene Widera, Marco Garten,
  *                     Benjamin Worpitz
  *
  * This file is part of PMacc.
@@ -34,198 +34,157 @@
 
 namespace pmacc
 {
-namespace fields
-{
-namespace operations
-{
-
-    /** copy guarding cells to an intermediate buffer
-     *
-     * @tparam T_numWorkers number of workers
-     */
-    template< uint32_t T_numWorkers >
-    struct KernelCopyGuardToExchange
+    namespace fields
     {
-        /** copy guarding cells to an intermediate box
-         *
-         * @tparam T_ExchangeBox pmacc::ExchangeBox, type of the intermediate box
-         * @tparam T_SrcBox pmacc::DataBox, type of the local box
-         * @tparam T_Extent pmacc::DataSpace, type to describe n-dimensional sizes
-         * @tparam T_Mapping mapper functor type
-         *
-         * @param exchangeBox exchange box for the guard data of the local GPU
-         * @param srcBox box to a local field
-         * @param exchangeSize dimensions of exchangeBox
-         * @param direction the direction of exchangeBox
-         * @param mapper functor to map a CUDA block to a supercell
-         */
-        template<
-            typename T_ExchangeBox,
-            typename T_SrcBox,
-            typename T_Extent,
-            typename T_Mapping,
-            typename T_Acc
-        >
-        DINLINE void operator()(
-            T_Acc const & acc,
-            T_ExchangeBox & exchangeBox,
-            T_SrcBox const & srcBox,
-            T_Extent const & exchangeSize,
-            T_Extent const & direction,
-            T_Mapping const & mapper
-        ) const
+        namespace operations
         {
-            using namespace mappings::threads;
-
-            using SuperCellSize = typename T_Mapping::SuperCellSize;
-
-            // number of cells in a superCell
-            constexpr uint32_t numCells = pmacc::math::CT::volume< SuperCellSize >::type::value;
-            constexpr uint32_t numWorkers = T_numWorkers;
-            PMACC_CONSTEXPR_CAPTURE int dim = T_Mapping::Dim;
-
-            uint32_t const workerIdx = threadIdx.x;
-
-            DataSpace< dim > const blockCell(
-                mapper.getSuperCellIndex( DataSpace< dim >( blockIdx ) ) *
-                SuperCellSize::toRT()
-            );
-
-            // origin in area from local GPU
-            DataSpace< dim > nullSourceCell(
-                mapper.getSuperCellIndex( DataSpace< dim > () ) *
-                SuperCellSize::toRT()
-            );
-
-            auto const numGuardSuperCells = mapper.getGuardingSuperCells();
-
-            ForEachIdx<
-                IdxConfig<
-                    numCells,
-                    numWorkers
-                >
-            >{ workerIdx }(
-                [&](
-                    uint32_t const linearIdx,
-                    uint32_t const
-                )
+            /** copy guarding cells to an intermediate buffer
+             *
+             * @tparam T_numWorkers number of workers
+             */
+            template<uint32_t T_numWorkers>
+            struct KernelCopyGuardToExchange
+            {
+                /** copy guarding cells to an intermediate box
+                 *
+                 * @tparam T_ExchangeBox pmacc::ExchangeBox, type of the intermediate box
+                 * @tparam T_SrcBox pmacc::DataBox, type of the local box
+                 * @tparam T_Extent pmacc::DataSpace, type to describe n-dimensional sizes
+                 * @tparam T_Mapping mapper functor type
+                 *
+                 * @param exchangeBox exchange box for the guard data of the local GPU
+                 * @param srcBox box to a local field
+                 * @param exchangeSize dimensions of exchangeBox
+                 * @param direction the direction of exchangeBox
+                 * @param mapper functor to map a CUDA block to a supercell
+                 */
+                template<
+                    typename T_ExchangeBox,
+                    typename T_SrcBox,
+                    typename T_Extent,
+                    typename T_Mapping,
+                    typename T_Acc>
+                DINLINE void operator()(
+                    T_Acc const& acc,
+                    T_ExchangeBox& exchangeBox,
+                    T_SrcBox const& srcBox,
+                    T_Extent const& exchangeSize,
+                    T_Extent const& direction,
+                    T_Mapping const& mapper) const
                 {
-                    // cell index within the superCell
-                    DataSpace< dim > const cellIdx = DataSpaceOperations< dim >::template map< SuperCellSize >( linearIdx );
+                    using namespace mappings::threads;
 
-                    DataSpace< T_Mapping::Dim > const sourceCell( blockCell + cellIdx );
-                    DataSpace< dim > targetCell( sourceCell - nullSourceCell );
+                    using SuperCellSize = typename T_Mapping::SuperCellSize;
 
-                    // supercell offset relative to the guard origin (in cells)
-                    DataSpace< dim > superCellOffsetInGuard( ( targetCell / SuperCellSize::toRT() ) * SuperCellSize::toRT() );
+                    // number of cells in a superCell
+                    constexpr uint32_t numCells = pmacc::math::CT::volume<SuperCellSize>::type::value;
+                    constexpr uint32_t numWorkers = T_numWorkers;
+                    PMACC_CONSTEXPR_CAPTURE int dim = T_Mapping::Dim;
 
-                    /* defines if the virtual worker needs to copy the value of
-                     * the cell to to the exchange box
-                     */
-                    bool copyValue = true;
+                    uint32_t const workerIdx = cupla::threadIdx(acc).x;
+
+                    DataSpace<dim> const blockCell(
+                        mapper.getSuperCellIndex(DataSpace<dim>(cupla::blockIdx(acc))) * SuperCellSize::toRT());
+
+                    // origin in area from local GPU
+                    DataSpace<dim> nullSourceCell(mapper.getSuperCellIndex(DataSpace<dim>()) * SuperCellSize::toRT());
 
-                    for( uint32_t d = 0; d < dim; ++d )
-                    {
-                        if( direction[ d ] == -1 )
+                    auto const numGuardSuperCells = mapper.getGuardingSuperCells();
+
+                    ForEachIdx<IdxConfig<numCells, numWorkers>>{
+                        workerIdx}([&](uint32_t const linearIdx, uint32_t const) {
+                        // cell index within the superCell
+                        DataSpace<dim> const cellIdx
+                            = DataSpaceOperations<dim>::template map<SuperCellSize>(linearIdx);
+
+                        DataSpace<T_Mapping::Dim> const sourceCell(blockCell + cellIdx);
+                        DataSpace<dim> targetCell(sourceCell - nullSourceCell);
+
+                        // supercell offset relative to the guard origin (in cells)
+                        DataSpace<dim> superCellOffsetInGuard(
+                            (targetCell / SuperCellSize::toRT()) * SuperCellSize::toRT());
+
+                        /* defines if the virtual worker needs to copy the value of
+                         * the cell to the exchange box
+                         */
+                        bool copyValue = true;
+
+                        for(uint32_t d = 0; d < dim; ++d)
                         {
-                            if(
-                                superCellOffsetInGuard[ d ] + cellIdx[ d ] <
-                                numGuardSuperCells[ d ] * SuperCellSize::toRT()[ d ] - exchangeSize[ d ]
-                            )
+                            if(direction[d] == -1)
+                            {
+                                if(superCellOffsetInGuard[d] + cellIdx[d]
+                                   < numGuardSuperCells[d] * SuperCellSize::toRT()[d] - exchangeSize[d])
+                                    copyValue = false;
+                                targetCell[d] -= numGuardSuperCells[d] * SuperCellSize::toRT()[d] - exchangeSize[d];
+                            }
+                            else if(direction[d] == 1 && superCellOffsetInGuard[d] + cellIdx[d] >= exchangeSize[d])
                                 copyValue = false;
-                            targetCell[ d ] -= numGuardSuperCells[ d ] * SuperCellSize::toRT()[ d ] - exchangeSize[ d ];
                         }
-                        else if(
-                            direction[d] == 1 && superCellOffsetInGuard[ d ] + cellIdx[ d ] >=
-                            exchangeSize[d]
-                        )
-                            copyValue = false;
-                    }
-
-                    if( copyValue )
-                        exchangeBox( targetCell ) = srcBox( sourceCell );
+
+                        if(copyValue)
+                            exchangeBox(targetCell) = srcBox(sourceCell);
+                    });
                 }
-            );
-        }
-    };
-
-    /** copy guard of the local buffer to the exchange buffer
-     *
-     * AddExchangeToBorder is the opposite operation for the neighboring
-     * device to add the exchange buffer to the local field.
-     */
-    struct CopyGuardToExchange
-    {
-        /** copy local guard to exchange buffer
-         *
-         * Copy data cell-wise from the guard of the local to the exchange buffer.
-         *
-         * @tparam T_SrcBuffer pmacc::GridBuffer, type of the used buffer
-         * @tparam T_SuperCellSize pmacc::math::CT::vector, size of the supercell in each direction
-         *
-         * @param srcBuffer source buffer with exchanges
-         * @param superCellSize compile time supercell size
-         * @param exchangeType the exchange direction which needs to be copied
-         */
-        template<
-            typename T_SrcBuffer,
-            typename T_SuperCellSize
-        >
-        void operator()(
-            T_SrcBuffer & srcBuffer,
-            T_SuperCellSize const & superCellSize,
-            uint32_t const exchangeType
-        ) const
-        {
-            boost::ignore_unused( superCellSize );
+            };
 
-            using SuperCellSize = T_SuperCellSize;
+            /** copy guard of the local buffer to the exchange buffer
+             *
+             * AddExchangeToBorder is the opposite operation for the neighboring
+             * device to add the exchange buffer to the local field.
+             */
+            struct CopyGuardToExchange
+            {
+                /** copy local guard to exchange buffer
+                 *
+                 * Copy data cell-wise from the guard of the local to the exchange buffer.
+                 *
+                 * @tparam T_SrcBuffer pmacc::GridBuffer, type of the used buffer
+                 * @tparam T_SuperCellSize pmacc::math::CT::vector, size of the supercell in each direction
+                 *
+                 * @param srcBuffer source buffer with exchanges
+                 * @param superCellSize compile time supercell size
+                 * @param exchangeType the exchange direction which needs to be copied
+                 */
+                template<typename T_SrcBuffer, typename T_SuperCellSize>
+                void operator()(
+                    T_SrcBuffer& srcBuffer,
+                    T_SuperCellSize const& superCellSize,
+                    uint32_t const exchangeType) const
+                {
+                    boost::ignore_unused(superCellSize);
 
-            constexpr int dim = T_SuperCellSize::dim;
+                    using SuperCellSize = T_SuperCellSize;
 
-            using MappingDesc = MappingDescription<
-                dim,
-                SuperCellSize
-            >;
+                    constexpr int dim = T_SuperCellSize::dim;
 
-            /* use only the x dimension to determine the number of supercells in the guard
-             * pmacc restriction: all dimension must have the some number of guarding
-             * supercells.
-             */
-            auto const numGuardSuperCells = srcBuffer.getGridLayout().getGuard() /
-                SuperCellSize::toRT();
-
-            MappingDesc const mappingDesc(
-                srcBuffer.getGridLayout().getDataSpace(),
-                numGuardSuperCells
-            );
-
-            ExchangeMapping<
-                GUARD,
-                MappingDesc
-            > mapper( mappingDesc, exchangeType );
-
-            DataSpace< dim > const direction = Mask::getRelativeDirections< dim >(
-                mapper.getExchangeType( )
-            );
-
-            constexpr uint32_t numWorkers = pmacc::traits::GetNumWorkers<
-                pmacc::math::CT::volume< SuperCellSize >::type::value
-            >::value;
-
-            PMACC_KERNEL( KernelCopyGuardToExchange< numWorkers >{ } )(
-                mapper.getGridDim( ),
-                numWorkers
-            )(
-                srcBuffer.getSendExchange( exchangeType ).getDeviceBuffer( ).getDataBox( ),
-                srcBuffer.getDeviceBuffer( ).getDataBox( ),
-                srcBuffer.getSendExchange( exchangeType ).getDeviceBuffer( ).getDataSpace( ),
-                direction,
-                mapper
-            );
-        }
-    };
-
-} // namespace operations
-} // namespace fields
+                    using MappingDesc = MappingDescription<dim, SuperCellSize>;
+
+                    /* use only the x dimension to determine the number of supercells in the guard
+                     * pmacc restriction: all dimensions must have the same number of guarding
+                     * supercells.
+                     */
+                    auto const numGuardSuperCells = srcBuffer.getGridLayout().getGuard() / SuperCellSize::toRT();
+
+                    MappingDesc const mappingDesc(srcBuffer.getGridLayout().getDataSpace(), numGuardSuperCells);
+
+                    ExchangeMapping<GUARD, MappingDesc> mapper(mappingDesc, exchangeType);
+
+                    DataSpace<dim> const direction = Mask::getRelativeDirections<dim>(mapper.getExchangeType());
+
+                    constexpr uint32_t numWorkers
+                        = pmacc::traits::GetNumWorkers<pmacc::math::CT::volume<SuperCellSize>::type::value>::value;
+
+                    PMACC_KERNEL(KernelCopyGuardToExchange<numWorkers>{})
+                    (mapper.getGridDim(), numWorkers)(
+                        srcBuffer.getSendExchange(exchangeType).getDeviceBuffer().getDataBox(),
+                        srcBuffer.getDeviceBuffer().getDataBox(),
+                        srcBuffer.getSendExchange(exchangeType).getDeviceBuffer().getDataSpace(),
+                        direction,
+                        mapper);
+                }
+            };
+
+        } // namespace operations
+    } // namespace fields
 } // namespace pmacc
diff --git a/include/pmacc/fields/tasks/FieldFactory.hpp b/include/pmacc/fields/tasks/FieldFactory.hpp
index 7d7df44659..a2c6804c5a 100644
--- a/include/pmacc/fields/tasks/FieldFactory.hpp
+++ b/include/pmacc/fields/tasks/FieldFactory.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Rene Widera
+/* Copyright 2013-2021 Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -28,7 +28,6 @@
 
 namespace pmacc
 {
-
     /**
      * Singleton Factory-pattern class for creation of several types of EventTasks.
      * Tasks are not actually 'returned' but immediately initialised and
@@ -37,34 +36,34 @@ namespace pmacc
     class FieldFactory
     {
     public:
-
         /**
          * Creates a TaskReceive.
          * @param ex Exchange to create new TaskReceive with
          * @param task_out returns the newly created task
-         * @param registeringTask optional pointer to an ITask which should be registered at the new task as an observer
+         * @param registeringTask optional pointer to an ITask which should be registered at the new task as an
+         * observer
          */
         template<class Field>
-        EventTask createTaskFieldReceiveAndInsert(Field &buffer,
-        ITask *registeringTask = nullptr);
+        EventTask createTaskFieldReceiveAndInsert(Field& buffer, ITask* registeringTask = nullptr);
 
         template<class Field>
-        EventTask createTaskFieldReceiveAndInsertExchange(Field &buffer, uint32_t exchange,
-        ITask *registeringTask = nullptr);
+        EventTask createTaskFieldReceiveAndInsertExchange(
+            Field& buffer,
+            uint32_t exchange,
+            ITask* registeringTask = nullptr);
 
         /**
          * Creates a TaskSend.
          * @param ex Exchange to create new TaskSend with
          * @param task_in TaskReceive to register at new TaskSend
-         * @param registeringTask optional pointer to an ITask which should be registered at the new task as an observer
+         * @param registeringTask optional pointer to an ITask which should be registered at the new task as an
+         * observer
          */
         template<class Field>
-        EventTask createTaskFieldSend(Field &buffer,
-        ITask *registeringTask = nullptr);
+        EventTask createTaskFieldSend(Field& buffer, ITask* registeringTask = nullptr);
 
         template<class Field>
-        EventTask createTaskFieldSendExchange(Field &buffer, uint32_t exchange,
-        ITask *registeringTask = nullptr);
+        EventTask createTaskFieldSendExchange(Field& buffer, uint32_t exchange, ITask* registeringTask = nullptr);
 
         /**
          * returns the instance of this factory
@@ -77,14 +76,11 @@ namespace pmacc
         }
 
     private:
+        FieldFactory(){};
 
-        FieldFactory() { };
-
-        FieldFactory(const FieldFactory&) { };
-
+        FieldFactory(const FieldFactory&){};
     };
 
-} //namespace pmacc
+} // namespace pmacc
 
 #include "pmacc/fields/tasks/FieldFactory.tpp"
-
diff --git a/include/pmacc/fields/tasks/FieldFactory.tpp b/include/pmacc/fields/tasks/FieldFactory.tpp
index 6c27b90a29..cc4a68c0e4 100644
--- a/include/pmacc/fields/tasks/FieldFactory.tpp
+++ b/include/pmacc/fields/tasks/FieldFactory.tpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Rene Widera
+/* Copyright 2013-2021 Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -32,47 +32,44 @@
 
 namespace pmacc
 {
-
     template<class Field>
-    inline EventTask FieldFactory::createTaskFieldReceiveAndInsert(Field &buffer,
-                                                                   ITask *registeringTask)
+    inline EventTask FieldFactory::createTaskFieldReceiveAndInsert(Field& buffer, ITask* registeringTask)
     {
-        TaskFieldReceiveAndInsert<Field>* task = new TaskFieldReceiveAndInsert<Field > (buffer);
+        TaskFieldReceiveAndInsert<Field>* task = new TaskFieldReceiveAndInsert<Field>(buffer);
 
         return Environment<>::get().Factory().startTask(*task, registeringTask);
     }
 
     template<class Field>
-    inline EventTask FieldFactory::createTaskFieldReceiveAndInsertExchange(Field &buffer, uint32_t exchange,
-                                                                           ITask *registeringTask)
+    inline EventTask FieldFactory::createTaskFieldReceiveAndInsertExchange(
+        Field& buffer,
+        uint32_t exchange,
+        ITask* registeringTask)
     {
-        TaskFieldReceiveAndInsertExchange<Field>* task = new TaskFieldReceiveAndInsertExchange<Field > (buffer, exchange);
+        TaskFieldReceiveAndInsertExchange<Field>* task
+            = new TaskFieldReceiveAndInsertExchange<Field>(buffer, exchange);
 
         return Environment<>::get().Factory().startTask(*task, registeringTask);
     }
 
     template<class Field>
-    inline EventTask FieldFactory::createTaskFieldSend(Field &buffer,
-                                                       ITask *registeringTask)
+    inline EventTask FieldFactory::createTaskFieldSend(Field& buffer, ITask* registeringTask)
     {
-        TaskFieldSend<Field>* task = new TaskFieldSend<Field > (buffer);
+        TaskFieldSend<Field>* task = new TaskFieldSend<Field>(buffer);
 
         return Environment<>::get().Factory().startTask(*task, registeringTask);
     }
 
     template<class Field>
-    inline EventTask FieldFactory::createTaskFieldSendExchange(Field &buffer, uint32_t exchange,
-                                                               ITask *registeringTask)
+    inline EventTask FieldFactory::createTaskFieldSendExchange(
+        Field& buffer,
+        uint32_t exchange,
+        ITask* registeringTask)
     {
-        TaskFieldSendExchange<Field>* task = new TaskFieldSendExchange<Field > (buffer, exchange);
+        TaskFieldSendExchange<Field>* task = new TaskFieldSendExchange<Field>(buffer, exchange);
 
         return Environment<>::get().Factory().startTask(*task, registeringTask);
     }
 
 
-
-} //namespace pmacc
-
-
-
-
+} // namespace pmacc
diff --git a/include/pmacc/fields/tasks/TaskFieldReceiveAndInsert.hpp b/include/pmacc/fields/tasks/TaskFieldReceiveAndInsert.hpp
index 02b57946ec..fd929c9c01 100644
--- a/include/pmacc/fields/tasks/TaskFieldReceiveAndInsert.hpp
+++ b/include/pmacc/fields/tasks/TaskFieldReceiveAndInsert.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Rene Widera
+/* Copyright 2013-2021 Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -32,114 +32,107 @@
 
 namespace pmacc
 {
-
-template<class Field>
-class TaskFieldReceiveAndInsert : public MPITask
-{
-public:
-
-
-    static constexpr uint32_t Dim = picongpu::simDim;
-
-    TaskFieldReceiveAndInsert(Field &buffer) :
-    m_buffer(buffer),
-    m_state(Constructor)
+    template<class Field>
+    class TaskFieldReceiveAndInsert : public MPITask
     {
-    }
+    public:
+        static constexpr uint32_t Dim = picongpu::simDim;
 
-    virtual void init()
-    {
-        m_state = Init;
-        EventTask serialEvent = __getTransactionEvent();
+        TaskFieldReceiveAndInsert(Field& buffer) : m_buffer(buffer), m_state(Constructor)
+        {
+        }
 
-        for (uint32_t i = 1; i < traits::NumberOfExchanges<Dim>::value; ++i)
+        virtual void init()
         {
-            if (m_buffer.getGridBuffer().hasReceiveExchange(i))
+            m_state = Init;
+            EventTask serialEvent = __getTransactionEvent();
+
+            for(uint32_t i = 1; i < traits::NumberOfExchanges<Dim>::value; ++i)
             {
-                __startTransaction(serialEvent);
-                FieldFactory::getInstance().createTaskFieldReceiveAndInsertExchange(m_buffer, i);
-                m_tmpEvent += __endTransaction();
+                if(m_buffer.getGridBuffer().hasReceiveExchange(i))
+                {
+                    __startTransaction(serialEvent);
+                    FieldFactory::getInstance().createTaskFieldReceiveAndInsertExchange(m_buffer, i);
+                    m_tmpEvent += __endTransaction();
+                }
             }
+            m_state = WaitForReceived;
         }
-        m_state = WaitForReceived;
-    }
 
-    bool executeIntern()
-    {
-        switch (m_state)
+        bool executeIntern()
         {
-        case Init:
-            break;
-        case WaitForReceived:
-            if (nullptr == Environment<>::get().Manager().getITaskIfNotFinished(m_tmpEvent.getTaskId()))
-            {
-                m_state = Insert;
-            }
-            break;
-        case Insert:
-            m_state = Wait;
-            __startTransaction();
-            for (uint32_t i = 1; i < traits::NumberOfExchanges<Dim>::value; ++i)
+            switch(m_state)
             {
-                if (m_buffer.getGridBuffer().hasReceiveExchange(i))
+            case Init:
+                break;
+            case WaitForReceived:
+                if(nullptr == Environment<>::get().Manager().getITaskIfNotFinished(m_tmpEvent.getTaskId()))
                 {
-                    m_buffer.insertField(i);
+                    m_state = Insert;
                 }
-            }
-            m_tmpEvent = __endTransaction();
-            m_state = WaitInsertFinished;
-            break;
-        case Wait:
-            break;
-        case WaitInsertFinished:
-            if (nullptr == Environment<>::get().Manager().getITaskIfNotFinished(m_tmpEvent.getTaskId()))
-            {
-                m_state = Finish;
+                break;
+            case Insert:
+                m_state = Wait;
+                __startTransaction();
+                for(uint32_t i = 1; i < traits::NumberOfExchanges<Dim>::value; ++i)
+                {
+                    if(m_buffer.getGridBuffer().hasReceiveExchange(i))
+                    {
+                        m_buffer.insertField(i);
+                    }
+                }
+                m_tmpEvent = __endTransaction();
+                m_state = WaitInsertFinished;
+                break;
+            case Wait:
+                break;
+            case WaitInsertFinished:
+                if(nullptr == Environment<>::get().Manager().getITaskIfNotFinished(m_tmpEvent.getTaskId()))
+                {
+                    m_state = Finish;
+                    return true;
+                }
+                break;
+            case Finish:
                 return true;
+            default:
+                return false;
             }
-            break;
-        case Finish:
-            return true;
-        default:
+
             return false;
         }
 
-        return false;
-    }
+        virtual ~TaskFieldReceiveAndInsert()
+        {
+            notify(this->myId, RECVFINISHED, nullptr);
+        }
 
-    virtual ~TaskFieldReceiveAndInsert()
-    {
-        notify(this->myId, RECVFINISHED, nullptr);
-    }
+        void event(id_t, EventType, IEventData*)
+        {
+        }
 
-    void event(id_t, EventType, IEventData*)
-    {
-    }
+        std::string toString()
+        {
+            return "TaskFieldReceiveAndInsert";
+        }
 
-    std::string toString()
-    {
-        return "TaskFieldReceiveAndInsert";
-    }
+    private:
+        enum state_t
+        {
+            Constructor,
+            Init,
+            Wait,
+            Insert,
+            WaitInsertFinished,
+            WaitForReceived,
+            Finish
 
-private:
+        };
 
-    enum state_t
-    {
-        Constructor,
-        Init,
-        Wait,
-        Insert,
-        WaitInsertFinished,
-        WaitForReceived,
-        Finish
 
+        Field& m_buffer;
+        state_t m_state;
+        EventTask m_tmpEvent;
     };
 
-
-    Field& m_buffer;
-    state_t m_state;
-    EventTask m_tmpEvent;
-
-};
-
-} //namespace pmacc
+} // namespace pmacc
diff --git a/include/pmacc/fields/tasks/TaskFieldReceiveAndInsertExchange.hpp b/include/pmacc/fields/tasks/TaskFieldReceiveAndInsertExchange.hpp
index 8151e77d38..248a7d5c68 100644
--- a/include/pmacc/fields/tasks/TaskFieldReceiveAndInsertExchange.hpp
+++ b/include/pmacc/fields/tasks/TaskFieldReceiveAndInsertExchange.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Rene Widera
+/* Copyright 2013-2021 Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -28,89 +28,81 @@
 #include "pmacc/eventSystem/events/EventDataReceive.hpp"
 
 
-
 namespace pmacc
 {
-
-template<class Field>
-class TaskFieldReceiveAndInsertExchange : public MPITask
-{
-public:
-
-    TaskFieldReceiveAndInsertExchange(Field &buffer, uint32_t exchange) :
-    m_buffer(buffer),
-    m_exchange(exchange),
-    m_state(Constructor),
-    initDependency(__getTransactionEvent())
+    template<class Field>
+    class TaskFieldReceiveAndInsertExchange : public MPITask
     {
-    }
+    public:
+        TaskFieldReceiveAndInsertExchange(Field& buffer, uint32_t exchange)
+            : m_buffer(buffer)
+            , m_exchange(exchange)
+            , m_state(Constructor)
+            , initDependency(__getTransactionEvent())
+        {
+        }
 
-    virtual void init()
-    {
-        m_state = Init;
-        initDependency = m_buffer.getGridBuffer().asyncReceive(initDependency, m_exchange);
-        m_state = WaitForReceive;
-    }
+        virtual void init()
+        {
+            m_state = Init;
+            initDependency = m_buffer.getGridBuffer().asyncReceive(initDependency, m_exchange);
+            m_state = WaitForReceive;
+        }
 
-    bool executeIntern()
-    {
-        switch (m_state)
+        bool executeIntern()
         {
-        case Init:
-            break;
-        case WaitForReceive:
-            if (nullptr == Environment<>::get().Manager().getITaskIfNotFinished(initDependency.getTaskId()))
+            switch(m_state)
             {
-                m_state = Finished;
+            case Init:
+                break;
+            case WaitForReceive:
+                if(nullptr == Environment<>::get().Manager().getITaskIfNotFinished(initDependency.getTaskId()))
+                {
+                    m_state = Finished;
+                    return true;
+                }
+                break;
+            case Finished:
                 return true;
+            default:
+                return false;
             }
-            break;
-        case Finished:
-            return true;
-        default:
+
             return false;
         }
 
-        return false;
-    }
+        virtual ~TaskFieldReceiveAndInsertExchange()
+        {
+            notify(this->myId, RECVFINISHED, nullptr);
+        }
 
-    virtual ~TaskFieldReceiveAndInsertExchange()
-    {
-        notify(this->myId, RECVFINISHED, nullptr);
-    }
+        void event(id_t, EventType, IEventData*)
+        {
+        }
 
-    void event(id_t, EventType, IEventData*)
-    {
-    }
+        std::string toString()
+        {
+            std::ostringstream stateNumber;
+            stateNumber << m_state;
+            return std::string("TaskFieldReceiveAndInsertExchange/") + stateNumber.str();
+        }
 
-    std::string toString()
-    {
-        std::ostringstream stateNumber;
-        stateNumber << m_state;
-        return std::string("TaskFieldReceiveAndInsertExchange/") + stateNumber.str();
-    }
+    private:
+        enum state_t
+        {
+            Constructor,
+            Init,
+            WaitForReceive,
+            Finished
 
-private:
+        };
 
-    enum state_t
-    {
-        Constructor,
-        Init,
-        WaitForReceive,
-        Finished
 
+        Field& m_buffer;
+        state_t m_state;
+        EventTask insertEvent;
+        EventTask initDependency;
+        uint32_t m_exchange;
     };
 
-
-
-
-    Field& m_buffer;
-    state_t m_state;
-    EventTask insertEvent;
-    EventTask initDependency;
-    uint32_t m_exchange;
-};
-
-} //namespace pmacc
-
-
+} // namespace pmacc
diff --git a/include/pmacc/fields/tasks/TaskFieldSend.hpp b/include/pmacc/fields/tasks/TaskFieldSend.hpp
index a132d4f6fd..adb951e91c 100644
--- a/include/pmacc/fields/tasks/TaskFieldSend.hpp
+++ b/include/pmacc/fields/tasks/TaskFieldSend.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Rene Widera
+/* Copyright 2013-2021 Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -31,29 +31,27 @@
 
 namespace pmacc
 {
-
     template<class Field>
     class TaskFieldSend : public MPITask
     {
     public:
-
         enum
         {
             Dim = picongpu::simDim
         };
 
-        TaskFieldSend(Field &buffer) :
-        m_buffer(buffer),
-        m_state(Constructor) { }
+        TaskFieldSend(Field& buffer) : m_buffer(buffer), m_state(Constructor)
+        {
+        }
 
         virtual void init()
         {
             m_state = Init;
             EventTask serialEvent = __getTransactionEvent();
 
-            for (uint32_t i = 1; i < traits::NumberOfExchanges<Dim>::value; ++i)
+            for(uint32_t i = 1; i < traits::NumberOfExchanges<Dim>::value; ++i)
             {
-                if (m_buffer.getGridBuffer().hasSendExchange(i))
+                if(m_buffer.getGridBuffer().hasSendExchange(i))
                 {
                     __startTransaction(serialEvent);
                     FieldFactory::getInstance().createTaskFieldSendExchange(m_buffer, i);
@@ -65,14 +63,14 @@ namespace pmacc
 
         bool executeIntern()
         {
-            switch (m_state)
+            switch(m_state)
             {
-                case Init:
-                    break;
-                case WaitForSend:
-                    return nullptr == Environment<>::get().Manager().getITaskIfNotFinished(tmpEvent.getTaskId());
-                default:
-                    return false;
+            case Init:
+                break;
+            case WaitForSend:
+                return nullptr == Environment<>::get().Manager().getITaskIfNotFinished(tmpEvent.getTaskId());
+            default:
+                return false;
             }
 
             return false;
@@ -83,7 +81,9 @@ namespace pmacc
             notify(this->myId, SENDFINISHED, nullptr);
         }
 
-        void event(id_t, EventType, IEventData*) { }
+        void event(id_t, EventType, IEventData*)
+        {
+        }
 
         std::string toString()
         {
@@ -91,7 +91,6 @@ namespace pmacc
         }
 
     private:
-
         enum state_t
         {
             Constructor,
@@ -106,4 +105,4 @@ namespace pmacc
         EventTask tmpEvent;
     };
 
-} //namespace pmacc
+} // namespace pmacc
diff --git a/include/pmacc/fields/tasks/TaskFieldSendExchange.hpp b/include/pmacc/fields/tasks/TaskFieldSendExchange.hpp
index b13bf3af25..bb4feb7f3f 100644
--- a/include/pmacc/fields/tasks/TaskFieldSendExchange.hpp
+++ b/include/pmacc/fields/tasks/TaskFieldSendExchange.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Rene Widera
+/* Copyright 2013-2021 Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -28,20 +28,17 @@
 #include "pmacc/eventSystem/events/EventDataReceive.hpp"
 
 
-
 namespace pmacc
 {
-
     template<class Field>
     class TaskFieldSendExchange : public MPITask
     {
     public:
-
-        TaskFieldSendExchange(Field &buffer, uint32_t exchange) :
-        m_buffer(buffer),
-        m_exchange(exchange),
-        m_state(Constructor),
-        m_initDependency(__getTransactionEvent())
+        TaskFieldSendExchange(Field& buffer, uint32_t exchange)
+            : m_buffer(buffer)
+            , m_exchange(exchange)
+            , m_state(Constructor)
+            , m_initDependency(__getTransactionEvent())
         {
         }
 
@@ -56,13 +53,13 @@ namespace pmacc
 
         bool executeIntern()
         {
-            switch (m_state)
+            switch(m_state)
             {
             case Init:
                 break;
             case WaitForBash:
 
-                if (nullptr == Environment<>::get().Manager().getITaskIfNotFinished(m_initDependency.getTaskId()) )
+                if(nullptr == Environment<>::get().Manager().getITaskIfNotFinished(m_initDependency.getTaskId()))
                 {
                     m_state = InitSend;
                     m_sendEvent = m_buffer.getGridBuffer().asyncSend(EventTask(), m_exchange);
@@ -74,7 +71,7 @@ namespace pmacc
             case InitSend:
                 break;
             case WaitForSendEnd:
-                if (nullptr == Environment<>::get().Manager().getITaskIfNotFinished(m_sendEvent.getTaskId()))
+                if(nullptr == Environment<>::get().Manager().getITaskIfNotFinished(m_sendEvent.getTaskId()))
                 {
                     m_state = Finished;
                     return true;
@@ -104,7 +101,6 @@ namespace pmacc
         }
 
     private:
-
         enum state_t
         {
             Constructor,
@@ -124,5 +120,4 @@ namespace pmacc
         uint32_t m_exchange;
     };
 
-} //namespace pmacc
-
+} // namespace pmacc
diff --git a/include/pmacc/filter/Interface.hpp b/include/pmacc/filter/Interface.hpp
index c7e6c50760..7e6d832eed 100644
--- a/include/pmacc/filter/Interface.hpp
+++ b/include/pmacc/filter/Interface.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2017-2020 Rene Widera
+/* Copyright 2017-2021 Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -27,28 +27,20 @@
 
 namespace pmacc
 {
-namespace filter
-{
-
-    /** Interface for a filter
-     *
-     * A filter is a functor which is evaluated to true or false depending
-     * on the input parameters.
-     * A filter can be used to decide e.g. if a particle is located in a user
-     * defined area or if an attribute is above a threshold.
-     *
-     * @tparam T_UserFunctor pmacc::functor::Interface, type of the functor (filter rule)
-     * @tparam T_numArguments number of arguments which must be supported by T_UserFunctor
-     */
-    template<
-        typename T_UserFunctor,
-        uint32_t T_numArguments
-    >
-    using Interface = pmacc::functor::Interface<
-        T_UserFunctor,
-        T_numArguments,
-        bool
-    >;
+    namespace filter
+    {
+        /** Interface for a filter
+         *
+         * A filter is a functor which is evaluated to true or false depending
+         * on the input parameters.
+         * A filter can be used to decide e.g. if a particle is located in a user
+         * defined area or if an attribute is above a threshold.
+         *
+         * @tparam T_UserFunctor pmacc::functor::Interface, type of the functor (filter rule)
+         * @tparam T_numArguments number of arguments which must be supported by T_UserFunctor
+         */
+        template<typename T_UserFunctor, uint32_t T_numArguments>
+        using Interface = pmacc::functor::Interface<T_UserFunctor, T_numArguments, bool>;
 
-} // namespace filter
+    } // namespace filter
 } // namespace pmacc
diff --git a/include/pmacc/filter/operators/And.hpp b/include/pmacc/filter/operators/And.hpp
index 719f907810..107680e8f7 100644
--- a/include/pmacc/filter/operators/And.hpp
+++ b/include/pmacc/filter/operators/And.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2017-2020 Rene Widera
+/* Copyright 2017-2021 Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -24,42 +24,36 @@
 
 namespace pmacc
 {
-namespace filter
-{
-namespace operators
-{
-
-    //! combine all arguments by AND `&&`
-    struct And
+    namespace filter
     {
-        /** return a
-         *
-         * @param a a boolean value
-         * @return the input argument
-         */
-        template< typename T_Arg >
-        HDINLINE bool
-        operator()( T_Arg const a ) const
+        namespace operators
         {
-            return a;
-        }
+            //! combine all arguments by AND `&&`
+            struct And
+            {
+                /** return a
+                 *
+                 * @param a a boolean value
+                 * @return the input argument
+                 */
+                template<typename T_Arg>
+                HDINLINE bool operator()(T_Arg const a) const
+                {
+                    return a;
+                }
 
-        /** get AND combined result
-         *
-         * @param args arguments to combine
-         * @return AND combination of all arguments
-         */
-        template<
-            typename T_Arg1,
-            typename ... T_Args
-        >
-        HDINLINE bool
-        operator()( T_Arg1 const a, T_Args const ... args ) const
-        {
-            return a && And{}( args ... );
-        }
-    };
+                /** get AND combined result
+                 *
+                 * @param args arguments to combine
+                 * @return AND combination of all arguments
+                 */
+                template<typename T_Arg1, typename... T_Args>
+                HDINLINE bool operator()(T_Arg1 const a, T_Args const... args) const
+                {
+                    return a && And{}(args...);
+                }
+            };
 
-} // namespace operators
-} // namespace filter
+        } // namespace operators
+    } // namespace filter
 } // namespace pmacc
diff --git a/include/pmacc/filter/operators/Or.hpp b/include/pmacc/filter/operators/Or.hpp
index 13532b4ca9..fb6bca211c 100644
--- a/include/pmacc/filter/operators/Or.hpp
+++ b/include/pmacc/filter/operators/Or.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2017-2020 Rene Widera
+/* Copyright 2017-2021 Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -24,42 +24,36 @@
 
 namespace pmacc
 {
-namespace filter
-{
-namespace operators
-{
-
-    //! combine all arguments by OR `||`
-    struct Or
+    namespace filter
     {
-        /** return a
-         *
-         * @param a a boolean value
-         * @return the input argument
-         */
-        template< typename T_Arg >
-        HDINLINE bool
-        operator()( T_Arg const a ) const
+        namespace operators
         {
-            return a;
-        }
+            //! combine all arguments by OR `||`
+            struct Or
+            {
+                /** return a
+                 *
+                 * @param a a boolean value
+                 * @return the input argument
+                 */
+                template<typename T_Arg>
+                HDINLINE bool operator()(T_Arg const a) const
+                {
+                    return a;
+                }
 
-        /** get OR combined result
-         *
-         * @param args arguments to combine
-         * @return OR combination of all arguments
-         */
-        template<
-            typename T_Arg1,
-            typename ... T_Args
-        >
-        HDINLINE bool
-        operator()( T_Arg1 const a, T_Args const ... args ) const
-        {
-            return a || Or{}( args ... );
-        }
-    };
+                /** get OR combined result
+                 *
+                 * @param args arguments to combine
+                 * @return OR combination of all arguments
+                 */
+                template<typename T_Arg1, typename... T_Args>
+                HDINLINE bool operator()(T_Arg1 const a, T_Args const... args) const
+                {
+                    return a || Or{}(args...);
+                }
+            };
 
-} // namespace operators
-} // namespace filter
+        } // namespace operators
+    } // namespace filter
 } // namespace pmacc
diff --git a/include/pmacc/functor/Filtered.hpp b/include/pmacc/functor/Filtered.hpp
index 8e4503f06b..b4ff6644e6 100644
--- a/include/pmacc/functor/Filtered.hpp
+++ b/include/pmacc/functor/Filtered.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2017-2020 Rene Widera
+/* Copyright 2017-2021 Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -30,243 +30,145 @@
 
 namespace pmacc
 {
-namespace functor
-{
-namespace acc
-{
-
-
-    /** interface to combine a filter and a functor on the accelerator
-     *
-     * @tparam T_FilterOperator pmacc::filter::operators, type concatenate the
-     *                          results of the filter
-     * @tparam T_Filter pmacc::filter::Interface, type of the filter
-     * @tparam T_Functor pmacc::functor::Interface, type of the functor
-     */
-    template<
-        typename T_FilterOperator,
-        typename T_Filter,
-        typename T_Functor
-    >
-    struct Filtered :
-        private T_Filter,
-        public T_Functor
+    namespace functor
     {
-        using Filter = T_Filter;
-        using Functor = T_Functor;
-
-        HDINLINE Filtered(
-            Filter const & filter,
-            Functor const & functor
-        ) :
-            Filter( filter ),
-            Functor( functor )
+        namespace acc
         {
-
-        }
-
-        /** execute the functor depending of the filter result
+            /** interface to combine a filter and a functor on the accelerator
+             *
+             * @tparam T_FilterOperator pmacc::filter::operators, type used to concatenate
+             *                          the results of the filter
+             * @tparam T_Filter pmacc::filter::Interface, type of the filter
+             * @tparam T_Functor pmacc::functor::Interface, type of the functor
+             */
+            template<typename T_FilterOperator, typename T_Filter, typename T_Functor>
+            struct Filtered
+                : private T_Filter
+                , public T_Functor
+            {
+                using Filter = T_Filter;
+                using Functor = T_Functor;
+
+                HDINLINE Filtered(Filter const& filter, Functor const& functor) : Filter(filter), Functor(functor)
+                {
+                }
+
+                /** execute the functor depending on the filter result
+                 *
+                 * Call the filter for each argument. If the combined result is true
+                 * the user functor is called.
+                 *
+                 * @param args arguments passed to the functor if the filter results of
+                 *             each argument evaluate to true when combined
+                 */
+                template<typename T_Acc, typename... T_Args>
+                HDINLINE auto operator()(T_Acc const& acc, T_Args&&... args) -> void
+                {
+                    // call the filter on each argument and combine the results
+                    bool const combinedResult = T_FilterOperator{}((*static_cast<Filter*>(this))(acc, args)...);
+
+                    if(combinedResult)
+                        (*static_cast<Functor*>(this))(acc, args...);
+                }
+            };
+
+        } // namespace acc
+
+        /** combine a filter and a functor
          *
-         * Call the filter for each argument. If the combined result is true
-         * the user functor is called.
+         * Creates a functor where each argument which is passed to
+         * the accelerator instance is evaluated by the filter and if the
+         * combined result is true the functor is executed.
          *
-         * @param args arguments passed to the functor if the filter results of
-         *             each argument evaluate to true when combined
+         * @tparam T_FilterOperator pmacc::filter::operators, type used to concatenate
+         *                          the results of the filter
+         * @tparam T_Filter pmacc::filter::Interface, type of the filter
+         * @tparam T_Functor pmacc::functor::Interface, type of the functor
          */
-        template<
-            typename T_Acc,
-            typename ... T_Args
-        >
-        HDINLINE auto operator( )(
-            T_Acc const & acc,
-            T_Args && ... args
-        )
-        -> void
-        {
-            // call the filter on each argument and combine the results
-            bool const combinedResult = T_FilterOperator{ }(
-                ( *static_cast< Filter * >( this ) )( acc, args ) ...
-            );
-
-            if( combinedResult )
-                ( *static_cast< Functor * >( this ) )( acc, args ... );
-        }
-    };
-
-} // namespace acc
-
-    /** combine a filter and a functor
-     *
-     * Creates a functor where each argument which is passed to
-     * the accelerator instance is evaluated by the filter and if the
-     * combined result is true the functor is executed.
-     *
-     * @tparam T_FilterOperator pmacc::filter::operators, type concatenate the
-     *                          results of the filter
-     * @tparam T_Filter pmacc::filter::Interface, type of the filter
-     * @tparam T_Functor pmacc::functor::Interface, type of the functor
-     */
-    template<
-        typename T_FilterOperator,
-        typename T_Filter,
-        typename T_Functor
-    >
-    struct Filtered;
-
-    /** specialization of Filtered (with unary filter)
-     *
-     * This specialization can only be used if T_Filter is of the type pmacc::filter::Interface
-     * and T_Functor is of the type pmacc::functor::Interface.
-     * A unary filters means that each argument can only pass the same filter
-     * check before its results are combined.
-     */
-    template<
-        typename T_FilterOperator,
-        typename T_Filter,
-        typename T_Functor,
-        uint32_t T_numFunctorArguments
-
-    >
-    struct Filtered<
-        T_FilterOperator,
-        filter::Interface<
-            T_Filter,
-            1u
-        >,
-        Interface<
-            T_Functor,
-            T_numFunctorArguments,
-            void
-        >
-    > :
-        private filter::Interface<
-            T_Filter,
-            1u
-        >,
-        Interface<
-            T_Functor,
-            T_numFunctorArguments,
-            void
-        >
-    {
+        template<typename T_FilterOperator, typename T_Filter, typename T_Functor>
+        struct Filtered;
 
-        template< typename ... T_Params >
-        struct apply
-        {
-            using type = Filtered<
-                T_FilterOperator,
-                typename boost::mpl::apply<
-                    T_Filter,
-                    T_Params ...
-                >::type,
-                typename boost::mpl::apply<
-                    T_Functor,
-                    T_Params ...
-                >::type
-            >;
-        };
-
-        using Filter = filter::Interface<
-            T_Filter,
-            1u
-        >;
-        using Functor = Interface<
-            T_Functor,
-            T_numFunctorArguments,
-            void
-        >;
-
-        template< typename DeferFunctor = Functor >
-        HINLINE Filtered( uint32_t const currentStep ) :
-            Filter( currentStep ),
-            Functor( currentStep )
-        {
-        }
-
-
-        /** create a filtered functor which can be used on the accelerator
+        /** specialization of Filtered (with unary filter)
          *
-         * @tparam T_OffsetType type to describe the size of a domain
-         * @tparam T_numWorkers number of workers
-         * @tparam T_Acc alpaka accelerator type
-         *
-         * @param alpaka accelerator
-         * @param domainOffset offset to the origin of the local domain
-         *                     This can be e.g a supercell or cell offset and depends
-         *                     of the context where the interface is specialized.
-         * @param workerCfg configuration of the worker
-         * @return accelerator instance of the filtered functor
+         * This specialization can only be used if T_Filter is of the type pmacc::filter::Interface
+         * and T_Functor is of the type pmacc::functor::Interface.
+         * A unary filter means that each argument can only pass the same filter
+         * check before its results are combined.
          */
         template<
-            typename T_OffsetType,
-            uint32_t T_numWorkers,
-            typename T_Acc
-        >
-        HDINLINE auto
-        operator( )(
-            T_Acc const & acc,
-            T_OffsetType const & domainOffset,
-            mappings::threads::WorkerCfg< T_numWorkers > const & workerCfg
-        ) const
-        -> acc::Filtered<
-            T_FilterOperator,
-            decltype(
-                alpaka::core::declval< Filter >( )(
-                    acc,
-                    domainOffset,
-                    workerCfg
-                )
-            ),
-            decltype(
-                alpaka::core::declval< Functor >( )(
-                    acc,
-                    domainOffset,
-                    workerCfg
-                )
-            )
-        >
-        {
-            return acc::Filtered<
-                T_FilterOperator,
-                decltype(
-                    alpaka::core::declval< Filter >( )(
-                        acc,
-                        domainOffset,
-                        workerCfg
-                    )
-                ),
-                decltype(
-                    alpaka::core::declval< Functor >( )(
-                        acc,
-                        domainOffset,
-                        workerCfg
-                    )
-                )
-            >(
-                ( *static_cast< Filter const * >( this ) )(
-                    acc,
-                    domainOffset,
-                    workerCfg
-                ),
-                ( *static_cast< Functor const * >( this ) )(
-                    acc,
-                    domainOffset,
-                    workerCfg
-                )
-            );
-        }
+            typename T_FilterOperator,
+            typename T_Filter,
+            typename T_Functor,
+            uint32_t T_numFunctorArguments
 
-        /** get name the of the filtered functor
-         *
-         * @return combination of the filter and functor name, the names are
-         *         separated by an underscore `_`
-         */
-        HINLINE std::string
-        getName( ) const
+            >
+        struct Filtered<
+            T_FilterOperator,
+            filter::Interface<T_Filter, 1u>,
+            Interface<T_Functor, T_numFunctorArguments, void>>
+            : private filter::Interface<T_Filter, 1u>
+            , Interface<T_Functor, T_numFunctorArguments, void>
         {
-            return Filter::getName( ) + std::string("_") + Functor::getName( );
-        }
-    };
+            template<typename... T_Params>
+            struct apply
+            {
+                using type = Filtered<
+                    T_FilterOperator,
+                    typename boost::mpl::apply<T_Filter, T_Params...>::type,
+                    typename boost::mpl::apply<T_Functor, T_Params...>::type>;
+            };
+
+            using Filter = filter::Interface<T_Filter, 1u>;
+            using Functor = Interface<T_Functor, T_numFunctorArguments, void>;
+
+            template<typename DeferFunctor = Functor>
+            HINLINE Filtered(uint32_t const currentStep) : Filter(currentStep)
+                                                         , Functor(currentStep)
+            {
+            }
+
+
+            /** create a filtered functor which can be used on the accelerator
+             *
+             * @tparam T_OffsetType type to describe the size of a domain
+             * @tparam T_numWorkers number of workers
+             * @tparam T_Acc alpaka accelerator type
+             *
+             * @param acc alpaka accelerator
+             * @param domainOffset offset to the origin of the local domain
+             *                     This can be e.g. a supercell or cell offset and depends
+             *                     on the context where the interface is specialized.
+             * @param workerCfg configuration of the worker
+             * @return accelerator instance of the filtered functor
+             */
+            template<typename T_OffsetType, uint32_t T_numWorkers, typename T_Acc>
+            HDINLINE auto operator()(
+                T_Acc const& acc,
+                T_OffsetType const& domainOffset,
+                mappings::threads::WorkerCfg<T_numWorkers> const& workerCfg) const
+                -> acc::Filtered<
+                    T_FilterOperator,
+                    decltype(alpaka::core::declval<Filter>()(acc, domainOffset, workerCfg)),
+                    decltype(alpaka::core::declval<Functor>()(acc, domainOffset, workerCfg))>
+            {
+                return acc::Filtered<
+                    T_FilterOperator,
+                    decltype(alpaka::core::declval<Filter>()(acc, domainOffset, workerCfg)),
+                    decltype(alpaka::core::declval<Functor>()(acc, domainOffset, workerCfg))>(
+                    (*static_cast<Filter const*>(this))(acc, domainOffset, workerCfg),
+                    (*static_cast<Functor const*>(this))(acc, domainOffset, workerCfg));
+            }
+
+            /** get the name of the filtered functor
+             *
+             * @return combination of the filter and functor name, the names are
+             *         separated by an underscore `_`
+             */
+            HINLINE std::string getName() const
+            {
+                return Filter::getName() + std::string("_") + Functor::getName();
+            }
+        };
 
-} // namespace functor
+    } // namespace functor
 } // namespace pmacc
diff --git a/include/pmacc/functor/Interface.hpp b/include/pmacc/functor/Interface.hpp
index 785a6d6c5b..491ff66358 100644
--- a/include/pmacc/functor/Interface.hpp
+++ b/include/pmacc/functor/Interface.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2017-2020 Rene Widera
+/* Copyright 2017-2021 Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -31,219 +31,170 @@
 
 namespace pmacc
 {
-namespace functor
-{
-namespace acc
-{
-namespace detail
-{
-    /** Helper class to compare void with void
-     *
-     * std::is_same does not allow to use void as type. By wrapping the type before
-     * comparing, we can workaround this limitation.
-     *
-     * @tparam T_Type type to be wrapped
-     */
-    template< typename T_Type >
-    struct VoidWrapper
-    {
-
-    };
-} // namespace detail
-
-    /** functor interface used on the accelerator side
-     *
-     * The user functor of the type T_UserFunctor must contain
-     * - the `operator()` with T_numArguments arguments and a return type T_ReturnType.
-     * - a copy constructor
-     * This interface is used to wrap the user functor to make sure that
-     * the required interface is fulfilled.
-     *
-     * @tparam T_UserFunctor user functor type
-     * @tparam T_numArguments number which must be supported by T_UserFunctor
-     * @tparam T_ReturnType required return type of T_UserFunctor
-     */
-    template<
-        typename T_UserFunctor,
-        uint32_t T_numArguments,
-        typename T_ReturnType
-    >
-    struct Interface : public T_UserFunctor
+    namespace functor
     {
-        //! type of the user functor
-        using UserFunctor = T_UserFunctor;
-
-        /** constructor
-         *
-         * @param functor user functor instance
-         */
-        HDINLINE Interface( UserFunctor const & functor ) :
-            UserFunctor( functor )
-        {
-        }
-
-        /** execute the functor
-         *
-         * The number of arguments and the return type of the user functor are
-         * evaluated at compile-time and must be equal to the interface description.
-         *
-         * @tparam T_Args type of the arguments passed to the user functor
-         *
-         * @param args arguments passed to the user functor
-         * @return T_ReturnType
-         */
-        template<
-            typename T_Acc,
-            typename ... T_Args
-        >
-        HDINLINE auto operator( )(
-            T_Acc const & acc,
-            T_Args && ... args )
-        -> T_ReturnType
+        namespace acc
         {
-            /* check if the current used number of arguments to execute the
-             * functor is equal to the interface requirements
+            namespace detail
+            {
+                /** Helper class to compare void with void
+                 *
+                 * std::is_same does not allow using void as a type. By wrapping the type before
+                 * comparing, we can work around this limitation.
+                 *
+                 * @tparam T_Type type to be wrapped
+                 */
+                template<typename T_Type>
+                struct VoidWrapper
+                {
+                };
+            } // namespace detail
+
+            /** functor interface used on the accelerator side
+             *
+             * The user functor of the type T_UserFunctor must contain
+             * - the `operator()` with T_numArguments arguments and a return type T_ReturnType.
+             * - a copy constructor
+             * This interface is used to wrap the user functor to make sure that
+             * the required interface is fulfilled.
+             *
+             * @tparam T_UserFunctor user functor type
+             * @tparam T_numArguments number of arguments which must be supported by T_UserFunctor
+             * @tparam T_ReturnType required return type of T_UserFunctor
              */
-            PMACC_CASSERT_MSG_TYPE(
-                __user_functor_has_wrong_number_of_arguments,
-                UserFunctor,
-                T_numArguments == sizeof...( args )
-            );
-
-            // get the return type of the user functor
-            using UserFunctorReturnType = decltype(
-                alpaka::core::declval< UserFunctor >( )( acc, args ... )
-            );
-
-            // compare user functor return type with the interface requirements
-            PMACC_CASSERT_MSG(
-                __wrong_user_functor_return_type,
-                std::is_same<
-                    detail::VoidWrapper< UserFunctorReturnType >,
-                    detail::VoidWrapper< T_ReturnType >
-                >::value
-            );
-            return ( *static_cast< UserFunctor * >( this ) )( acc, args ... );
-        }
-    };
-
-} // namespace acc
-
-    /** Interface for a user functor
-     *
-     * @tparam T_UserFunctor user functor type
-     * @tparam T_numArguments number of arguments which must be supported by T_UserFunctor
-     * @tparam T_ReturnType required return type of T_UserFunctor
-     */
-    template<
-        typename T_UserFunctor,
-        uint32_t T_numArguments,
-        typename T_ReturnType
-    >
-    struct Interface : private T_UserFunctor
-    {
-
-        //! type of the user functor
-        using UserFunctor = T_UserFunctor;
-
-        /** constructor
-         *
-         * This constructor is only compiled if the user functor has
-         * a host side constructor with one (uint32_t) argument.
+            template<typename T_UserFunctor, uint32_t T_numArguments, typename T_ReturnType>
+            struct Interface : public T_UserFunctor
+            {
+                //! type of the user functor
+                using UserFunctor = T_UserFunctor;
+
+                /** constructor
+                 *
+                 * @param functor user functor instance, copied into the base class
+                 */
+                HDINLINE Interface(UserFunctor const& functor) : UserFunctor(functor)
+                {
+                }
+
+                /** execute the functor
+                 *
+                 * The number of arguments and the return type of the user functor are
+                 * checked at compile time and must be equal to the interface description.
+                 *
+                 * @tparam T_Args type of the arguments passed to the user functor
+                 * @param acc accelerator, passed as first argument to the user functor
+                 * @param args arguments passed to the user functor
+                 * @return result of the user functor call (T_ReturnType)
+                 */
+                template<typename T_Acc, typename... T_Args>
+                HDINLINE auto operator()(T_Acc const& acc, T_Args&&... args) -> T_ReturnType
+                {
+                    /* check if the number of arguments used to execute the functor
+                     * (not counting `acc`) is equal to the interface requirements
+                     */
+                    PMACC_CASSERT_MSG_TYPE(
+                        __user_functor_has_wrong_number_of_arguments,
+                        UserFunctor,
+                        T_numArguments == sizeof...(args));
+
+                    // get the return type of the user functor
+                    using UserFunctorReturnType = decltype(alpaka::core::declval<UserFunctor>()(acc, args...));
+
+                    // compare user functor return type with the interface requirements
+                    PMACC_CASSERT_MSG(
+                        __wrong_user_functor_return_type,
+                        std::is_same<detail::VoidWrapper<UserFunctorReturnType>, detail::VoidWrapper<T_ReturnType>>::
+                            value);
+                    return (*static_cast<UserFunctor*>(this))(acc, args...);
+                }
+            };
+
+        } // namespace acc
+
+        /** Interface for a user functor
          *
-         * @tparam DeferFunctor is used to defer the functor type evaluation to enable/disable
-         *                      the constructor
-         * @param currentStep current simulation time step
-         * @param is used to enable/disable the constructor (do not pass any value to this parameter)
+         * @tparam T_UserFunctor user functor type
+         * @tparam T_numArguments number of arguments which must be supported by T_UserFunctor
+         * @tparam T_ReturnType required return type of T_UserFunctor
          */
-        template< typename DeferFunctor = UserFunctor >
-        HINLINE Interface(
-            uint32_t const currentStep,
-            typename std::enable_if<
-                std::is_constructible<
-                    DeferFunctor,
-                    uint32_t
-                >::value
-            >::type* = 0
-        ) : UserFunctor( currentStep )
+        template<typename T_UserFunctor, uint32_t T_numArguments, typename T_ReturnType>
+        struct Interface : private T_UserFunctor
         {
-        }
-
-        /** constructor
-         *
-         * This constructor is only compiled if the user functor has a default constructor.
-         *
-         * @tparam DeferFunctor is used to defer the functor type evaluation to enable/disable
-         *                      the constructor
-         * @param currentStep simulation time step
-         * @param is used to enable/disable the constructor (do not pass any value to this parameter)
-         */
-        template< typename DeferFunctor = UserFunctor >
-        HINLINE Interface(
-            uint32_t const currentStep,
-            typename std::enable_if<
-                std::is_constructible< DeferFunctor >::value
-            >::type* = 0
-        ) : UserFunctor( )
-        {
-            boost::ignore_unused( currentStep );
-        }
-
-        /** create a functor which can be used on the accelerator
-         *
-         * @tparam T_OffsetType type to describe the size of a domain
-         * @tparam T_numWorkers number of workers
-         * @tparam T_Acc alpaka accelerator type
-         *
-         * @param alpaka accelerator
-         * @param domainOffset offset to the origin of the local domain
-         *                     This can be e.g a supercell or cell offset and depends
-         *                     of the context where the interface is specialized.
-         * @param workerCfg configuration of the worker
-         * @return an instance of the user functor wrapped by the accelerator
-         *         functor interface
-         */
-        template<
-            typename T_OffsetType,
-            uint32_t T_numWorkers,
-            typename T_Acc
-        >
-        HDINLINE auto
-        operator( )(
-            T_Acc const & acc,
-            T_OffsetType const & domainOffset,
-            mappings::threads::WorkerCfg< T_numWorkers > const & workerCfg
-        ) const
-        -> acc::Interface<
-            decltype(
-                alpaka::core::declval< UserFunctor >( )(
-                    acc,
-                    domainOffset,
-                    workerCfg
-                )
-            ),
-            T_numArguments,
-            T_ReturnType
-        >
-        {
-            return ( *static_cast< UserFunctor const * >( this ) )(
-                acc,
-                domainOffset,
-                workerCfg
-            );
-        }
-
-        /** get name of the user functor
-         *
-         * @return name to identify the functor
-         */
-        static
-        HINLINE std::string
-        getName( )
-        {
-            return UserFunctor::getName( );
-        }
-    };
+            //! type of the user functor
+            using UserFunctor = T_UserFunctor;
+
+            /** constructor
+             *
+             * This constructor is only enabled if the user functor has
+             * a host side constructor with one (uint32_t) argument.
+             *
+             * @tparam DeferFunctor is used to defer the functor type evaluation to enable/disable
+             *                      the constructor
+             * @param currentStep current simulation time step, passed to the user functor constructor
+             * @param (unnamed) only used to enable/disable the constructor (do not pass any value to this parameter)
+             */
+            template<typename DeferFunctor = UserFunctor>
+            HINLINE Interface(
+                uint32_t const currentStep,
+                typename std::enable_if<std::is_constructible<DeferFunctor, uint32_t>::value>::type* = 0)
+                : UserFunctor(currentStep)
+            {
+            }
+
+            /** constructor
+             *
+             * This constructor is only enabled if the user functor has a default constructor.
+             *
+             * @tparam DeferFunctor is used to defer the functor type evaluation to enable/disable
+             *                      the constructor
+             * @param currentStep simulation time step (ignored, the user functor is default constructed)
+             * @param (unnamed) only used to enable/disable the constructor (do not pass any value to this parameter)
+             */
+            template<typename DeferFunctor = UserFunctor>
+            HINLINE Interface(
+                uint32_t const currentStep,
+                typename std::enable_if<std::is_constructible<DeferFunctor>::value>::type* = 0)
+                : UserFunctor()
+            {
+                boost::ignore_unused(currentStep);
+            }
+
+            /** create a functor which can be used on the accelerator
+             *
+             * @tparam T_OffsetType type to describe the size of a domain
+             * @tparam T_numWorkers number of workers
+             * @tparam T_Acc alpaka accelerator type
+             *
+             * @param acc alpaka accelerator
+             * @param domainOffset offset to the origin of the local domain
+             *                     This can be e.g. a supercell or cell offset and depends
+             *                     on the context where the interface is specialized.
+             * @param workerCfg configuration of the worker
+             * @return an instance of the user functor wrapped by the accelerator
+             *         functor interface
+             */
+            template<typename T_OffsetType, uint32_t T_numWorkers, typename T_Acc>
+            HDINLINE auto operator()(
+                T_Acc const& acc,
+                T_OffsetType const& domainOffset,
+                mappings::threads::WorkerCfg<T_numWorkers> const& workerCfg) const
+                -> acc::Interface<
+                    decltype(alpaka::core::declval<UserFunctor>()(acc, domainOffset, workerCfg)),
+                    T_numArguments,
+                    T_ReturnType>
+            {
+                return (*static_cast<UserFunctor const*>(this))(acc, domainOffset, workerCfg);
+            }
+
+            /** get the name of the user functor
+             *
+             * @return name to identify the functor, as reported by UserFunctor::getName()
+             */
+            static HINLINE std::string getName()
+            {
+                return UserFunctor::getName();
+            }
+        };
 
-} // namespace functor
+    } // namespace functor
 } // namespace pmacc
diff --git a/include/pmacc/identifier/alias.hpp b/include/pmacc/identifier/alias.hpp
index 7d430262b0..299b8a64c3 100644
--- a/include/pmacc/identifier/alias.hpp
+++ b/include/pmacc/identifier/alias.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Rene Widera, Felix Schmitt, Benjamin Worpitz,
+/* Copyright 2013-2021 Rene Widera, Felix Schmitt, Benjamin Worpitz,
  *                     Alexander Grund
  *
  * This file is part of PMacc.
@@ -32,47 +32,9 @@
 
 namespace pmacc
 {
-identifier(pmacc_void);
-identifier(pmacc_isAlias);
-} //namespace pmacc
-
-#ifdef __CUDACC__
-#   define PMACC_alias_CUDA(name,id)                                          \
-        namespace PMACC_JOIN(device_placeholder,id){                          \
-            /* This variable exists only for template parameter deduction, its
-             * value is never used. So in this case it is fine to have a
-             * separate version in each translation unit due to static.
-             */                                                               \
-            static __constant__ PMACC_JOIN(placeholder_definition,id)::name<> \
-                PMACC_JOIN(name,_);                                           \
-        }
-#else
-#   define PMACC_alias_CUDA(name,id)
-#endif
-
-/*define special makros for creating classes which are only used as identifer*/
-#define PMACC_alias(name,id)                                                   \
-    namespace PMACC_JOIN(placeholder_definition,id) {                          \
-        template<typename T_Type=pmacc::pmacc_void,typename T_IsAlias=pmacc::pmacc_isAlias> \
-        struct name                                                            \
-        {                                                                      \
-            static std::string getName()                                       \
-            {                                                                  \
-                return std::string(#name);                                     \
-            }                                                                  \
-        };                                                                     \
-    }                                                                          \
-    using namespace PMACC_JOIN(placeholder_definition,id);                     \
-    namespace PMACC_JOIN(host_placeholder,id){                                 \
-        /* This variable exists only for template parameter deduction, its value
-         * is never used. So in this case it is fine to have a separate version
-         * in each translation unit due to static.
-         */                                                                    \
-        static PMACC_JOIN(placeholder_definition,id)::name<>                   \
-            PMACC_JOIN(name,_);                                                \
-    }                                                                          \
-    PMACC_alias_CUDA(name,id);                                                 \
-    PMACC_PLACEHOLDER(id);
+    identifier(pmacc_void, );
+    identifier(pmacc_isAlias, );
+} // namespace pmacc
 
 
 /** create an alias
@@ -90,23 +52,30 @@ identifier(pmacc_isAlias);
  * get type which is represented by the alias
  *      typedef typename traits::Resolve<name>::type resolved_type;
  */
-#define alias(name) PMACC_alias(name,__COUNTER__)
+#define alias(name)                                                                                                   \
+    template<typename T_Type = pmacc::pmacc_void, typename T_IsAlias = pmacc::pmacc_isAlias>                          \
+    struct name                                                                                                       \
+    {                                                                                                                 \
+        static std::string getName()                                                                                  \
+        {                                                                                                             \
+            return std::string(#name);                                                                                \
+        }                                                                                                             \
+    };                                                                                                                \
+    constexpr name<> PMACC_JOIN(name, _)
 
 namespace pmacc
 {
-namespace traits
-{
-
-template<template<typename,typename> class T_Object, typename T_AnyType>
-struct Resolve<T_Object<T_AnyType,pmacc::pmacc_isAlias> >
-{
-    /*solve recursive if alias is nested*/
-    typedef typename  bmpl::if_<
-        boost::is_same<T_AnyType,typename Resolve<T_AnyType>::type >,
-        T_AnyType,
-        typename Resolve<T_AnyType>::type
-    >::type type;
-};
+    namespace traits
+    {
+        template<template<typename, typename> class T_Object, typename T_AnyType>
+        struct Resolve<T_Object<T_AnyType, pmacc::pmacc_isAlias>>
+        {
+            /* resolve recursively if the alias is nested */
+            typedef typename bmpl::if_<
+                boost::is_same<T_AnyType, typename Resolve<T_AnyType>::type>,
+                T_AnyType,
+                typename Resolve<T_AnyType>::type>::type type;
+        };
 
-} //namespace traits
-} //namespace pmacc
+    } // namespace traits
+} // namespace pmacc
diff --git a/include/pmacc/identifier/identifier.hpp b/include/pmacc/identifier/identifier.hpp
index 76e9eaeed8..a35adb12a4 100644
--- a/include/pmacc/identifier/identifier.hpp
+++ b/include/pmacc/identifier/identifier.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Rene Widera, Benjamin Worpitz, Alexander Grund
+/* Copyright 2013-2021 Rene Widera, Benjamin Worpitz, Alexander Grund
  *
  * This file is part of PMacc.
  *
@@ -24,46 +24,6 @@
 #include "pmacc/types.hpp"
 #include "pmacc/ppFunctions.hpp"
 
-/* No namespace is needed because we only have defines*/
-
-#ifdef __CUDA_ARCH__ //we are on gpu
-#   define PMACC_PLACEHOLDER(id) using namespace PMACC_JOIN(device_placeholder,id)
-#else
-#   define PMACC_PLACEHOLDER(id) using namespace PMACC_JOIN(host_placeholder,id)
-#endif
-
-#ifdef __CUDACC__
-#   define PMACC_identifier_CUDA(name,id)                                         \
-        namespace PMACC_JOIN(device_placeholder,id){                               \
-            /* This variable exists only for template parameter deduction, its value
-             * is never used. So in this case it is fine to have a separate version
-             * in each translation unit due to static.
-             */                                                                    \
-            static __constant__ PMACC_JOIN(placeholder_definition,id)::name PMACC_JOIN(name,_); \
-        }
-#else
-#   define PMACC_identifier_CUDA(name,id)
-#endif
-
-/*define special macros for creating classes which are only used as identifier*/
-#define PMACC_identifier(name,id,...)                                          \
-    namespace PMACC_JOIN(placeholder_definition,id) {                          \
-        struct name{                                                           \
-            __VA_ARGS__                                                        \
-        };                                                                     \
-    }                                                                          \
-    using namespace PMACC_JOIN(placeholder_definition,id);                     \
-    namespace PMACC_JOIN(host_placeholder,id){                                 \
-        /* This variable exists only for template parameter deduction, its value
-         * is never used. So in this case it is fine to have a separate version
-         * in each translation unit due to static.
-         */                                                                    \
-        static PMACC_JOIN(placeholder_definition,id)::name PMACC_JOIN(name,_); \
-    }                                                                          \
-    PMACC_identifier_CUDA(name,id);                                            \
-    PMACC_PLACEHOLDER(id);
-
-
 /** create an identifier (identifier with arbitrary code as second parameter
  * !! second parameter is optional and can be any C++ code one can add inside a class
  *
@@ -74,4 +34,9 @@
  * to create an instance of this identifier you can use:
  *      varname();   or varname_
  */
-#define identifier(name,...) PMACC_identifier(name,__COUNTER__,__VA_ARGS__)
+#define identifier(name, ...)                                                                                         \
+    struct name                                                                                                       \
+    {                                                                                                                 \
+        __VA_ARGS__                                                                                                   \
+    };                                                                                                                \
+    constexpr name PMACC_JOIN(name, _)
diff --git a/include/pmacc/identifier/named_type.hpp b/include/pmacc/identifier/named_type.hpp
index 4b3401c9f6..4abd70855c 100644
--- a/include/pmacc/identifier/named_type.hpp
+++ b/include/pmacc/identifier/named_type.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2014-2020 Rene Widera
+/* Copyright 2014-2021 Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -43,12 +43,6 @@
  *      length();   or length_
  *
  */
-#define named_type(in_type,name,...)                                           \
-        identifier(name,                                                       \
-        typedef in_type type;                                                  \
-        static std::string getName()                                           \
-        {                                                                      \
-                return std::string(#name);                                     \
-        }                                                                      \
-        __VA_ARGS__                                                            \
-    )
+#define named_type(in_type, name, ...)                                                                                \
+    identifier(                                                                                                       \
+        name, typedef in_type type; static std::string getName() { return std::string(#name); } __VA_ARGS__)
diff --git a/include/pmacc/identifier/value_identifier.hpp b/include/pmacc/identifier/value_identifier.hpp
index 5a80fed5e0..234b5e5951 100644
--- a/include/pmacc/identifier/value_identifier.hpp
+++ b/include/pmacc/identifier/value_identifier.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Rene Widera
+/* Copyright 2013-2021 Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -45,15 +45,8 @@
  * to create a instance of this value_identifier you can use:
  *      `length()` or `length_`
  */
-#define value_identifier(in_type,name,in_default)                              \
-        identifier(name,                                                       \
-        typedef in_type type;                                                  \
-        static HDINLINE type getValue()                                        \
-        {                                                                      \
-                return in_default;                                             \
-        }                                                                      \
-        static std::string getName()                                           \
-        {                                                                      \
-                return std::string(#name);                                     \
-        }                                                                      \
-    )
+#define value_identifier(in_type, name, in_default)                                                                   \
+    identifier(                                                                                                       \
+        name, typedef in_type type; static HDINLINE type getValue() {                                                 \
+            return in_default;                                                                                        \
+        } static std::string getName() { return std::string(#name); })
diff --git a/include/pmacc/mappings/kernel/AreaMapping.hpp b/include/pmacc/mappings/kernel/AreaMapping.hpp
index 7136a3b336..612c8b1d47 100644
--- a/include/pmacc/mappings/kernel/AreaMapping.hpp
+++ b/include/pmacc/mappings/kernel/AreaMapping.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Felix Schmitt, Heiko Burau, Rene Widera
+/* Copyright 2013-2021 Felix Schmitt, Heiko Burau, Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -28,24 +28,19 @@
 
 namespace pmacc
 {
-
     template<uint32_t areaType, class baseClass>
     class AreaMapping;
 
-    template<
-    uint32_t areaType,
-    template<unsigned, class> class baseClass,
-    unsigned DIM,
-    class SuperCellSize_
-    >
-    class AreaMapping<areaType, baseClass<DIM, SuperCellSize_> > : public baseClass<DIM, SuperCellSize_>
+    template<uint32_t areaType, template<unsigned, class> class baseClass, unsigned DIM, class SuperCellSize_>
+    class AreaMapping<areaType, baseClass<DIM, SuperCellSize_>> : public baseClass<DIM, SuperCellSize_>
     {
     public:
         typedef baseClass<DIM, SuperCellSize_> BaseClass;
 
         enum
         {
-            AreaType = areaType, Dim = BaseClass::Dim
+            AreaType = areaType,
+            Dim = BaseClass::Dim
         };
 
 
@@ -62,8 +57,7 @@ namespace pmacc
          */
         HINLINE DataSpace<DIM> getGridDim() const
         {
-            return AreaMappingMethods<areaType, DIM>::getGridDim(*this,
-                                                                 this->getGridSuperCells());
+            return AreaMappingMethods<areaType, DIM>::getGridDim(*this, this->getGridSuperCells());
         }
 
         /**
@@ -74,11 +68,11 @@ namespace pmacc
          */
         HDINLINE DataSpace<DIM> getSuperCellIndex(const DataSpace<DIM>& realSuperCellIdx) const
         {
-            return AreaMappingMethods<areaType, DIM>::getBlockIndex(*this,
-                                                                    this->getGridSuperCells(),
-                                                                    realSuperCellIdx);
+            return AreaMappingMethods<areaType, DIM>::getBlockIndex(
+                *this,
+                this->getGridSuperCells(),
+                realSuperCellIdx);
         }
-
     };
 
 } // namespace pmacc
diff --git a/include/pmacc/mappings/kernel/AreaMappingMethods.hpp b/include/pmacc/mappings/kernel/AreaMappingMethods.hpp
index 802cbf820a..d26f8314eb 100644
--- a/include/pmacc/mappings/kernel/AreaMappingMethods.hpp
+++ b/include/pmacc/mappings/kernel/AreaMappingMethods.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Felix Schmitt, Heiko Burau, Rene Widera
+/* Copyright 2013-2021 Felix Schmitt, Heiko Burau, Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -26,7 +26,6 @@
 
 namespace pmacc
 {
-
     /**
      * Helper class for AreaMapping.
      * Provides methods called by AreaMapping using template specialization.
@@ -37,70 +36,70 @@ namespace pmacc
     template<uint32_t areaType, unsigned DIM>
     class AreaMappingMethods;
 
-    //CORE + BORDER + GUARD
+    // CORE + BORDER + GUARD
 
     template<unsigned DIM>
     class AreaMappingMethods<CORE + BORDER + GUARD, DIM>
     {
     public:
-
         template<class Base>
-        HINLINE static DataSpace<DIM> getGridDim(const Base&, const DataSpace<DIM> &gBlocks)
+        HINLINE static DataSpace<DIM> getGridDim(const Base&, const DataSpace<DIM>& gBlocks)
         {
             return gBlocks;
         }
 
         template<class Base>
-        HDINLINE static DataSpace<DIM> getBlockIndex(const Base&,
-        const DataSpace<DIM>&,
-        const DataSpace<DIM>& _blockIdx)
+        HDINLINE static DataSpace<DIM> getBlockIndex(
+            const Base&,
+            const DataSpace<DIM>&,
+            const DataSpace<DIM>& _blockIdx)
         {
             return _blockIdx;
         }
     };
 
-    //CORE
+    // CORE
 
     template<unsigned DIM>
     class AreaMappingMethods<CORE, DIM>
     {
     public:
-
         template<class Base>
-        HINLINE static DataSpace<DIM> getGridDim(const Base &base, const DataSpace<DIM> &gBlocks)
+        HINLINE static DataSpace<DIM> getGridDim(const Base& base, const DataSpace<DIM>& gBlocks)
         {
             // skip 2 x (border + guard) == 4 x guard
             return gBlocks - 4 * base.getGuardingSuperCells();
         }
 
         template<class Base>
-        HDINLINE static DataSpace<DIM> getBlockIndex(const Base &base,
-        const DataSpace<DIM> &gBlocks,
-        const DataSpace<DIM>& _blockIdx)
+        HDINLINE static DataSpace<DIM> getBlockIndex(
+            const Base& base,
+            const DataSpace<DIM>& gBlocks,
+            const DataSpace<DIM>& _blockIdx)
         {
             // skip guard + border == 2 x guard
             return _blockIdx + 2 * base.getGuardingSuperCells();
         }
     };
 
-    //CORE+BORDER
+    // CORE+BORDER
 
     template<unsigned DIM>
     class AreaMappingMethods<CORE + BORDER, DIM>
     {
     public:
-
         template<class Base>
-        HINLINE static DataSpace<DIM> getGridDim(const Base &base, const DataSpace<DIM> &gBlocks)
+        HINLINE static DataSpace<DIM> getGridDim(const Base& base, const DataSpace<DIM>& gBlocks)
         {
             // remove guard + border == 2 x guard
             return gBlocks - 2 * base.getGuardingSuperCells();
         }
 
         template<class Base>
-        HDINLINE static DataSpace<DIM> getBlockIndex(const Base &base,
-        const DataSpace<DIM> &gBlocks,
-        const DataSpace<DIM>& _blockIdx)
+        HDINLINE static DataSpace<DIM> getBlockIndex(
+            const Base& base,
+            const DataSpace<DIM>& gBlocks,
+            const DataSpace<DIM>& _blockIdx)
         {
             // skip guarding supercells
             return _blockIdx + base.getGuardingSuperCells();
@@ -108,20 +107,16 @@ namespace pmacc
     };
 
 
-    //dim 2D
+    // dim 2D
 
-    //GUARD
+    // GUARD
 
     template<>
     class AreaMappingMethods<GUARD, DIM2>
     {
     public:
-
         template<class Base>
-        HINLINE static DataSpace< DIM2 > getGridDim(
-            const Base &base,
-            const DataSpace< DIM2 >& gBlocks
-        )
+        HINLINE static DataSpace<DIM2> getGridDim(const Base& base, const DataSpace<DIM2>& gBlocks)
         {
             const int x = gBlocks.x();
             const int y_ = gBlocks.y() - 2 * base.getGuardingSuperCells().y();
@@ -129,18 +124,14 @@ namespace pmacc
             const int xArea = x * base.getGuardingSuperCells().y();
             const int y_Area = y_ * base.getGuardingSuperCells().x();
 
-            return DataSpace< DIM2 >(
-                xArea + y_Area,
-                2
-            );
+            return DataSpace<DIM2>(xArea + y_Area, 2);
         }
 
         template<class Base>
-        HDINLINE static DataSpace< DIM2 > getBlockIndex(
-            const Base &base,
-            const DataSpace< DIM2 >& gBlocks,
-            const DataSpace< DIM2 >& _blockIdx
-        )
+        HDINLINE static DataSpace<DIM2> getBlockIndex(
+            const Base& base,
+            const DataSpace<DIM2>& gBlocks,
+            const DataSpace<DIM2>& _blockIdx)
         {
             const int x = gBlocks.x();
 
@@ -149,63 +140,51 @@ namespace pmacc
             if(_blockIdx.x() < xArea)
             {
                 const int tmp_x = _blockIdx.x();
-                return DataSpace< DIM2 >(
+                return DataSpace<DIM2>(
                     tmp_x % x,
                     tmp_x / x +
-                    // if _blockIdx.y() == 1 means bottom plane
-                    _blockIdx.y() * (gBlocks.y() - base.getGuardingSuperCells().y())
-                );
+                        // if _blockIdx.y() == 1 means bottom plane
+                        _blockIdx.y() * (gBlocks.y() - base.getGuardingSuperCells().y()));
             }
             else
             {
                 const int tmp_x = _blockIdx.x() - xArea;
-                return DataSpace< DIM2 >(
+                return DataSpace<DIM2>(
                     tmp_x % base.getGuardingSuperCells().x() +
-                    // if _blockIdx.y() == 1 means right plane
-                    _blockIdx.y() * (gBlocks.x() - base.getGuardingSuperCells().x()),
-                    tmp_x / base.getGuardingSuperCells().x() + base.getGuardingSuperCells().y()
-                );
+                        // if _blockIdx.y() == 1 means right plane
+                        _blockIdx.y() * (gBlocks.x() - base.getGuardingSuperCells().x()),
+                    tmp_x / base.getGuardingSuperCells().x() + base.getGuardingSuperCells().y());
             }
         }
     };
 
-    //BORDER
+    // BORDER
 
     template<>
     class AreaMappingMethods<BORDER, DIM2>
     {
     public:
-
         template<class Base>
-        HINLINE static DataSpace< DIM2 > getGridDim(
-            const Base& base,
-            const DataSpace< DIM2 >& gBlocks
-        )
+        HINLINE static DataSpace<DIM2> getGridDim(const Base& base, const DataSpace<DIM2>& gBlocks)
         {
             // removes the guard, than BORDER is the new GUARD and we can reuse the GUARD mapper
-            const DataSpace< DIM2 > sizeWithoutGuard(gBlocks - 2 * base.getGuardingSuperCells());
+            const DataSpace<DIM2> sizeWithoutGuard(gBlocks - 2 * base.getGuardingSuperCells());
 
             return AreaMappingMethods<GUARD, DIM2>{}.getGridDim(base, sizeWithoutGuard);
         }
 
         template<class Base>
-        HDINLINE static DataSpace< DIM2 > getBlockIndex(
+        HDINLINE static DataSpace<DIM2> getBlockIndex(
             const Base& base,
-            const DataSpace< DIM2 >& gBlocks,
-            const DataSpace< DIM2 >& _blockIdx
-        )
+            const DataSpace<DIM2>& gBlocks,
+            const DataSpace<DIM2>& _blockIdx)
         {
             // removes the guard, than BORDER is the new GUARD and we can reuse the GUARD mapper
-            const DataSpace< DIM2 > sizeWithoutGuard(gBlocks - 2 * base.getGuardingSuperCells());
+            const DataSpace<DIM2> sizeWithoutGuard(gBlocks - 2 * base.getGuardingSuperCells());
 
             // use result of the shrinked domain and skip guarding supercells
-            return
-                AreaMappingMethods<GUARD,  DIM2 >{}.getBlockIndex(
-                    base,
-                    sizeWithoutGuard,
-                    _blockIdx
-                ) +
-                base.getGuardingSuperCells();
+            return AreaMappingMethods<GUARD, DIM2>{}.getBlockIndex(base, sizeWithoutGuard, _blockIdx)
+                + base.getGuardingSuperCells();
         }
     };
 
@@ -213,9 +192,8 @@ namespace pmacc
     class AreaMappingMethods<GUARD, DIM3>
     {
     public:
-
         template<class Base>
-        HINLINE static DataSpace< DIM3 > getGridDim(const Base &base, const DataSpace< DIM3 > &gBlocks)
+        HINLINE static DataSpace<DIM3> getGridDim(const Base& base, const DataSpace<DIM3>& gBlocks)
         {
             const int x = gBlocks.x();
             const int x_ = gBlocks.x() - 2 * base.getGuardingSuperCells().x();
@@ -226,19 +204,14 @@ namespace pmacc
             const int z_yVolume = z_ * y * base.getGuardingSuperCells().x();
             const int z_x_Volume = z_ * x_ * base.getGuardingSuperCells().y();
 
-            return DataSpace< DIM3 >(
-                xyVolume + z_x_Volume + z_yVolume,
-                2,
-                1
-            );
+            return DataSpace<DIM3>(xyVolume + z_x_Volume + z_yVolume, 2, 1);
         }
 
         template<class Base>
-        HDINLINE static DataSpace< DIM3 > getBlockIndex(
-            const Base &base,
-            const DataSpace< DIM3 >& gBlocks,
-            const DataSpace< DIM3 >& _blockIdx
-        )
+        HDINLINE static DataSpace<DIM3> getBlockIndex(
+            const Base& base,
+            const DataSpace<DIM3>& gBlocks,
+            const DataSpace<DIM3>& _blockIdx)
         {
             const int x = gBlocks.x();
             const int x_ = gBlocks.x() - 2 * base.getGuardingSuperCells().x();
@@ -258,9 +231,8 @@ namespace pmacc
                     tmp_x % x,
                     tmp_x / x % y,
                     tmp_x / xyPlane +
-                    // if _blockIdx.y() == 1 means back plane
-                    _blockIdx.y() * (gBlocks.z() - base.getGuardingSuperCells().z())
-                );
+                        // if _blockIdx.y() == 1 means back plane
+                        _blockIdx.y() * (gBlocks.z() - base.getGuardingSuperCells().z()));
             }
             else if(_blockIdx.x() >= xyVolume && _blockIdx.x() < xyVolume + z_yVolume)
             {
@@ -268,13 +240,12 @@ namespace pmacc
                 const int z_yPlane = z_ * y;
                 const int tmp_x = _blockIdx.x() - xyVolume;
 
-                return DataSpace< DIM3 >(
+                return DataSpace<DIM3>(
                     tmp_x / z_yPlane +
-                    // if _blockIdx.y() == 1 means right plane
-                    _blockIdx.y() * (gBlocks.x() - base.getGuardingSuperCells().x()),
+                        // if _blockIdx.y() == 1 means right plane
+                        _blockIdx.y() * (gBlocks.x() - base.getGuardingSuperCells().x()),
                     tmp_x % y,
-                    tmp_x / y % z_ + base.getGuardingSuperCells().z()
-                );
+                    tmp_x / y % z_ + base.getGuardingSuperCells().z());
             }
             else
             {
@@ -284,10 +255,9 @@ namespace pmacc
                 return DataSpace<DIM3>(
                     (tmp_x % x_) + base.getGuardingSuperCells().x(),
                     tmp_x / x_z_Plane +
-                    // if _blockIdx.y() == 1 means bottom plane
-                    _blockIdx.y() * (gBlocks.y() - base.getGuardingSuperCells().y()),
-                    tmp_x / x_ % z_ + base.getGuardingSuperCells().z()
-                );
+                        // if _blockIdx.y() == 1 means bottom plane
+                        _blockIdx.y() * (gBlocks.y() - base.getGuardingSuperCells().y()),
+                    tmp_x / x_ % z_ + base.getGuardingSuperCells().z());
             }
         }
     };
@@ -296,38 +266,28 @@ namespace pmacc
     class AreaMappingMethods<BORDER, DIM3>
     {
     public:
-
         template<class Base>
-        HINLINE static DataSpace< DIM3 > getGridDim(
-            const Base &base,
-            const DataSpace< DIM3 > &gBlocks
-        )
+        HINLINE static DataSpace<DIM3> getGridDim(const Base& base, const DataSpace<DIM3>& gBlocks)
         {
             // removes the guard, than BORDER is the new GUARD and we can reuse the GUARD mapper
-            const DataSpace< DIM3 > sizeWithoutGuard(gBlocks - 2 * base.getGuardingSuperCells());
+            const DataSpace<DIM3> sizeWithoutGuard(gBlocks - 2 * base.getGuardingSuperCells());
 
             return AreaMappingMethods<GUARD, DIM3>{}.getGridDim(base, sizeWithoutGuard);
         }
 
         template<class Base>
-        HDINLINE static DataSpace< DIM3 > getBlockIndex(
-            const Base &base,
-            const DataSpace< DIM3 >& gBlocks,
-            const DataSpace< DIM3 >& _blockIdx
-        )
+        HDINLINE static DataSpace<DIM3> getBlockIndex(
+            const Base& base,
+            const DataSpace<DIM3>& gBlocks,
+            const DataSpace<DIM3>& _blockIdx)
         {
             // removes the guard, than BORDER is the new GUARD and we can reuse the GUARD mapper
             const DataSpace<DIM3> sizeWithoutGuard(gBlocks - 2 * base.getGuardingSuperCells());
 
             // use result of the shrinked domain and skip guarding supercells
-            return
-                AreaMappingMethods<GUARD, DIM3>{}.getBlockIndex(
-                    base,
-                    sizeWithoutGuard,
-                    _blockIdx
-                ) +
-                base.getGuardingSuperCells();
+            return AreaMappingMethods<GUARD, DIM3>{}.getBlockIndex(base, sizeWithoutGuard, _blockIdx)
+                + base.getGuardingSuperCells();
         }
     };
 
-} //namespace pmacc
+} // namespace pmacc
diff --git a/include/pmacc/mappings/kernel/BorderMapping.hpp b/include/pmacc/mappings/kernel/BorderMapping.hpp
index 76fb498914..7426fbed45 100644
--- a/include/pmacc/mappings/kernel/BorderMapping.hpp
+++ b/include/pmacc/mappings/kernel/BorderMapping.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Alexander Grund
+/* Copyright 2013-2021 Alexander Grund
  *
  * This file is part of PMacc.
  *
@@ -29,7 +29,6 @@
 
 namespace pmacc
 {
-
     /**
      * This maps onto the border to 1 exchange direction (e.g. TOP, BOTTOM, TOP + LEFT, ...)
      * Choosing multiple directions defines an intersection [1] in mathematical set theory.
@@ -47,12 +46,8 @@ namespace pmacc
     template<class T_BaseClass>
     class BorderMapping;
 
-    template<
-        template<unsigned, class> class T_BaseClass,
-        unsigned T_dim,
-        class T_SuperCellSize
-    >
-    class BorderMapping<T_BaseClass<T_dim, T_SuperCellSize> >: public T_BaseClass<T_dim, T_SuperCellSize>
+    template<template<unsigned, class> class T_BaseClass, unsigned T_dim, class T_SuperCellSize>
+    class BorderMapping<T_BaseClass<T_dim, T_SuperCellSize>> : public T_BaseClass<T_dim, T_SuperCellSize>
     {
     public:
         typedef T_BaseClass<T_dim, T_SuperCellSize> BaseClass;
@@ -72,7 +67,9 @@ namespace pmacc
          * @param base object of base class baseClass (see template parameters)
          * @param direction exchange direction to map to
          */
-        HINLINE BorderMapping(const BaseClass& base, pmacc::ExchangeType direction): BaseClass(base), m_direction(direction)
+        HINLINE BorderMapping(const BaseClass& base, pmacc::ExchangeType direction)
+            : BaseClass(base)
+            , m_direction(direction)
         {
             PMACC_ASSERT(direction != 0);
         }
@@ -80,8 +77,7 @@ namespace pmacc
         /**
          * Returns the exchange direction used by this mapper
          */
-        HDINLINE pmacc::ExchangeType
-        getDirection() const
+        HDINLINE pmacc::ExchangeType getDirection() const
         {
             return m_direction;
         }
@@ -99,7 +95,7 @@ namespace pmacc
 
             for(int i = 0; i < Dim; i++)
             {
-                if (directions[i] != 0)
+                if(directions[i] != 0)
                     result[i] = this->getGuardingSuperCells()[i];
             }
 
@@ -120,7 +116,7 @@ namespace pmacc
 
             for(int i = 0; i < Dim; i++)
             {
-                if (directions[i] == 1)
+                if(directions[i] == 1)
                     result[i] += this->getGridSuperCells()[i] - 2 * this->getGuardingSuperCells()[i];
                 else
                     result[i] += this->getGuardingSuperCells()[i];
@@ -128,6 +124,7 @@ namespace pmacc
 
             return result;
         }
+
     private:
         PMACC_ALIGN(m_direction, const pmacc::ExchangeType);
     };
diff --git a/include/pmacc/mappings/kernel/ExchangeMapping.hpp b/include/pmacc/mappings/kernel/ExchangeMapping.hpp
index 37dd8d5ce7..2bc5ea1d55 100644
--- a/include/pmacc/mappings/kernel/ExchangeMapping.hpp
+++ b/include/pmacc/mappings/kernel/ExchangeMapping.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Felix Schmitt, Heiko Burau, Rene Widera
+/* Copyright 2013-2021 Felix Schmitt, Heiko Burau, Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -28,7 +28,6 @@
 
 namespace pmacc
 {
-
     template<uint32_t areaType, class baseClass>
     class ExchangeMapping;
 
@@ -39,16 +38,12 @@ namespace pmacc
      * @tparam areaType are to map to
      * @tparam baseClass base class for mapping, should be MappingDescription
      */
-    template<
-    uint32_t areaType,
-    template<unsigned, class> class baseClass,
-    unsigned DIM,
-    class SuperCellSize_
-    >
-    class ExchangeMapping<areaType, baseClass<DIM, SuperCellSize_> > : public baseClass<DIM, SuperCellSize_>
+    template<uint32_t areaType, template<unsigned, class> class baseClass, unsigned DIM, class SuperCellSize_>
+    class ExchangeMapping<areaType, baseClass<DIM, SuperCellSize_>> : public baseClass<DIM, SuperCellSize_>
     {
     private:
         uint32_t exchangeType;
+
     public:
         typedef baseClass<DIM, SuperCellSize_> BaseClass;
 
@@ -66,9 +61,7 @@ namespace pmacc
          * @param base object of base class baseClass (see template parameters)
          * @param exchangeType exchange type for mapping
          */
-        HINLINE ExchangeMapping(BaseClass base, uint32_t exchangeType) :
-        BaseClass(base),
-        exchangeType(exchangeType)
+        HINLINE ExchangeMapping(BaseClass base, uint32_t exchangeType) : BaseClass(base), exchangeType(exchangeType)
         {
         }
 
@@ -98,12 +91,8 @@ namespace pmacc
          */
         HDINLINE DataSpace<DIM> getSuperCellIndex(const DataSpace<DIM>& realSuperCellIdx) const
         {
-            return ExchangeMappingMethods<areaType, DIM>::getBlockIndex(
-                                                                        *this,
-                                                                        realSuperCellIdx,
-                                                                        exchangeType);
+            return ExchangeMappingMethods<areaType, DIM>::getBlockIndex(*this, realSuperCellIdx, exchangeType);
         }
-
     };
 
 } // namespace pmacc
diff --git a/include/pmacc/mappings/kernel/ExchangeMappingMethods.hpp b/include/pmacc/mappings/kernel/ExchangeMappingMethods.hpp
index f9186bff46..360343b788 100644
--- a/include/pmacc/mappings/kernel/ExchangeMappingMethods.hpp
+++ b/include/pmacc/mappings/kernel/ExchangeMappingMethods.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Felix Schmitt, Heiko Burau, Rene Widera
+/* Copyright 2013-2021 Felix Schmitt, Heiko Burau, Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -28,7 +28,6 @@
 
 namespace pmacc
 {
-
     /**
      * Helper class for ExchangeMapping.
      * Provides methods called by ExchangeMapping using template specialization.
@@ -40,16 +39,17 @@ namespace pmacc
     class ExchangeMappingMethods
     {
     public:
-
         template<class Base>
-        HINLINE static DataSpace<DIM> getGridDim(const Base &base, uint32_t exchangeType)
+        HINLINE static DataSpace<DIM> getGridDim(const Base& base, uint32_t exchangeType)
         {
             return base.getGridSuperCells();
         }
 
         template<class Base>
-        HDINLINE static DataSpace<DIM> getBlockIndex(const Base &base,
-        const DataSpace<DIM>& _blockIdx, uint32_t exchangeType)
+        HDINLINE static DataSpace<DIM> getBlockIndex(
+            const Base& base,
+            const DataSpace<DIM>& _blockIdx,
+            uint32_t exchangeType)
         {
             return _blockIdx;
         }
@@ -61,40 +61,40 @@ namespace pmacc
     class ExchangeMappingMethods<GUARD, DIM>
     {
     public:
-
         template<class Base>
-        HINLINE static DataSpace< DIM > getGridDim(const Base &base, uint32_t exchangeType)
+        HINLINE static DataSpace<DIM> getGridDim(const Base& base, uint32_t exchangeType)
         {
-            const DataSpace< DIM > guardingSupercells = base.getGuardingSuperCells();
-            DataSpace< DIM > result(base.getGridSuperCells() - 2 * guardingSupercells);
+            const DataSpace<DIM> guardingSupercells = base.getGuardingSuperCells();
+            DataSpace<DIM> result(base.getGridSuperCells() - 2 * guardingSupercells);
 
-            const DataSpace< DIM > directions = Mask::getRelativeDirections< DIM > (exchangeType);
+            const DataSpace<DIM> directions = Mask::getRelativeDirections<DIM>(exchangeType);
 
-            for( uint32_t d = 0; d < DIM; ++d )
+            for(uint32_t d = 0; d < DIM; ++d)
             {
-                if (directions[ d ] != 0)
-                    result[ d ] = guardingSupercells[ d ];
+                if(directions[d] != 0)
+                    result[d] = guardingSupercells[d];
             }
 
             return result;
         }
 
         template<class Base>
-        HDINLINE static DataSpace< DIM > getBlockIndex(const Base &base,
-        const DataSpace< DIM >& _blockIdx, uint32_t exchangeType)
+        HDINLINE static DataSpace<DIM> getBlockIndex(
+            const Base& base,
+            const DataSpace<DIM>& _blockIdx,
+            uint32_t exchangeType)
         {
-            DataSpace< DIM > result(_blockIdx);
+            DataSpace<DIM> result(_blockIdx);
 
-            const DataSpace< DIM > directions = Mask::getRelativeDirections< DIM > (exchangeType);
-            const DataSpace< DIM > guardingSupercells = base.getGuardingSuperCells();
+            const DataSpace<DIM> directions = Mask::getRelativeDirections<DIM>(exchangeType);
+            const DataSpace<DIM> guardingSupercells = base.getGuardingSuperCells();
 
-            for( uint32_t d = 0; d < DIM; ++d )
+            for(uint32_t d = 0; d < DIM; ++d)
             {
-                if (directions[ d ] == 0)
-                    result[ d ] += guardingSupercells[ d ];
-                else
-                    if (directions[ d ] == 1)
-                    result[ d ] += base.getGridSuperCells()[ d ] - guardingSupercells[ d ];
+                if(directions[d] == 0)
+                    result[d] += guardingSupercells[d];
+                else if(directions[d] == 1)
+                    result[d] += base.getGridSuperCells()[d] - guardingSupercells[d];
             }
 
             return result;
@@ -104,51 +104,52 @@ namespace pmacc
 
     // areaType == BORDER
 
-    template< unsigned DIM >
+    template<unsigned DIM>
     class ExchangeMappingMethods<BORDER, DIM>
     {
     public:
-
         template<class Base>
-        HINLINE static DataSpace< DIM > getGridDim(const Base &base, uint32_t exchangeType)
+        HINLINE static DataSpace<DIM> getGridDim(const Base& base, uint32_t exchangeType)
         {
             // skip 2 x (border + guard) == 4 x guard
-            DataSpace< DIM > result(base.getGridSuperCells() - 4 * base.getGuardingSuperCells());
+            DataSpace<DIM> result(base.getGridSuperCells() - 4 * base.getGuardingSuperCells());
 
-            DataSpace< DIM > directions = Mask::getRelativeDirections< DIM > (exchangeType);
+            DataSpace<DIM> directions = Mask::getRelativeDirections<DIM>(exchangeType);
 
-            for( uint32_t d = 0; d < DIM; ++d )
+            for(uint32_t d = 0; d < DIM; ++d)
             {
-                if (directions[ d ] != 0)
-                    result[ d ] = base.getGuardingSuperCells()[ d ];
+                if(directions[d] != 0)
+                    result[d] = base.getGuardingSuperCells()[d];
             }
 
             return result;
         }
 
         template<class Base>
-        HDINLINE static DataSpace< DIM > getBlockIndex(const Base &base,
-        const DataSpace< DIM >& _blockIdx, uint32_t exchangeType)
+        HDINLINE static DataSpace<DIM> getBlockIndex(
+            const Base& base,
+            const DataSpace<DIM>& _blockIdx,
+            uint32_t exchangeType)
         {
-            DataSpace< DIM > result(_blockIdx);
+            DataSpace<DIM> result(_blockIdx);
 
-            DataSpace< DIM > directions = Mask::getRelativeDirections< DIM > (exchangeType);
+            DataSpace<DIM> directions = Mask::getRelativeDirections<DIM>(exchangeType);
 
-            DataSpace< DIM > guardingBlocks = base.getGuardingSuperCells();
+            DataSpace<DIM> guardingBlocks = base.getGuardingSuperCells();
 
-            for( uint32_t d = 0; d <  DIM; ++d )
+            for(uint32_t d = 0; d < DIM; ++d)
             {
-                switch (directions[ d ])
+                switch(directions[d])
                 {
-                    case 0:
-                        result[ d ] += 2 * guardingBlocks[ d ];
-                        break;
-                    case -1:
-                        result[ d ] += guardingBlocks[ d ];
-                        break;
-                    case 1:
-                        result[ d ] += base.getGridSuperCells()[ d ] - 2 * guardingBlocks[ d ];
-                        break;
+                case 0:
+                    result[d] += 2 * guardingBlocks[d];
+                    break;
+                case -1:
+                    result[d] += guardingBlocks[d];
+                    break;
+                case 1:
+                    result[d] += base.getGridSuperCells()[d] - 2 * guardingBlocks[d];
+                    break;
                 }
             }
 
@@ -156,4 +157,4 @@ namespace pmacc
         }
     };
 
-}//namespace pmacc
+} // namespace pmacc
diff --git a/include/pmacc/mappings/kernel/MappingDescription.hpp b/include/pmacc/mappings/kernel/MappingDescription.hpp
index a490ef8030..bb58e5af25 100644
--- a/include/pmacc/mappings/kernel/MappingDescription.hpp
+++ b/include/pmacc/mappings/kernel/MappingDescription.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Felix Schmitt, Heiko Burau, Rene Widera
+/* Copyright 2013-2021 Felix Schmitt, Heiko Burau, Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -31,105 +31,102 @@
 
 namespace pmacc
 {
+    /**
+     * Abstracts logical block information from block variables.
+     *
+     * @tparam DIM dimension for grid/blocks
+     * @tparam SuperCellSize mapper class for logical grid information
+     */
 
-/**
- * Abstracts logical block information from block variables.
- *
- * @tparam DIM dimension for grid/blocks
- * @tparam SuperCellSize mapper class for logical grid information
- */
-
-template<unsigned DIM, class SuperCellSize_>
-class MappingDescription
-{
-public:
-
-    enum
+    template<unsigned DIM, class SuperCellSize_>
+    class MappingDescription
     {
-        Dim = DIM
-    };
+    public:
+        enum
+        {
+            Dim = DIM
+        };
 
-    typedef SuperCellSize_ SuperCellSize;
+        typedef SuperCellSize_ SuperCellSize;
 
-    /** constructor
-     *
-     * @param localGridCells number of cells in the local value (including guarding cells)
-     * @param guardingSuperCells number of **supercells** within the guard
-     */
-    MappingDescription(
-        DataSpace<DIM> localGridCells,
-        DataSpace<DIM> guardingSuperCells = DataSpace<DIM>::create(0)
-    ) :
-        gridSuperCells(localGridCells / SuperCellSize::toRT()), /*block count per dimension*/
-        guardingSuperCells(guardingSuperCells)
-    {
-        /* each dimension needs at least one supercell for the core and 2 * guardingSuperCells
-         * (one supercell for the border and one for the guard) or it has no guarding and border
-         * and contains only a core (this is allowed for local arrays which can not sync the
-         * outer supercells with there neighbor MPI ranks.
+        /** constructor
+         *
+         * @param localGridCells number of cells in the local value (including guarding cells)
+         * @param guardingSuperCells number of **supercells** within the guard
          */
-        for( uint32_t d = 0; d < DIM; ++d )
+        MappingDescription(
+            DataSpace<DIM> localGridCells,
+            DataSpace<DIM> guardingSuperCells = DataSpace<DIM>::create(0))
+            : gridSuperCells(localGridCells / SuperCellSize::toRT())
+            , /*block count per dimension*/
+            guardingSuperCells(guardingSuperCells)
         {
-            /*minimal 3 blocks are needed if we have guarding blocks*/
-            int minBlock = std::min(gridSuperCells.x(), gridSuperCells.y());
-            if (DIM == DIM3)
+            /* each dimension needs at least one supercell for the core and 2 * guardingSuperCells
+             * (one supercell for the border and one for the guard) or it has no guarding and border
+             * and contains only a core (this is allowed for local arrays which can not sync the
+             * outer supercells with their neighbor MPI ranks).
+             */
+            for(uint32_t d = 0; d < DIM; ++d)
             {
-                minBlock = std::min(minBlock, gridSuperCells[2]);
+                /*minimal 3 blocks are needed if we have guarding blocks*/
+                int minBlock = std::min(gridSuperCells.x(), gridSuperCells.y());
+                if(DIM == DIM3)
+                {
+                    minBlock = std::min(minBlock, gridSuperCells[2]);
+                }
+                PMACC_VERIFY(
+                    (guardingSuperCells[d] == 0 && gridSuperCells[d] >= 1)
+                    || gridSuperCells[d] >= 2 * guardingSuperCells[d] + 1);
             }
-            PMACC_VERIFY(
-                ( guardingSuperCells[ d ] == 0 && gridSuperCells[ d ] >= 1) ||
-                gridSuperCells[ d ] >= 2 * guardingSuperCells[ d ] + 1
-            );
         }
-    }
 
-    HDINLINE DataSpace<DIM> getGridSuperCells() const
-    {
-        return this->gridSuperCells;
-    }
-
-    HDINLINE DataSpace<DIM> getGuardingSuperCells() const
-    {
-        return guardingSuperCells;
-    }
-
-    HDINLINE void setGridSuperCells(DataSpace<DIM> superCellsCount)
-    {
-        gridSuperCells = superCellsCount;
-    }
+        HDINLINE DataSpace<DIM> getGridSuperCells() const
+        {
+            return this->gridSuperCells;
+        }
 
-    /*! get the Coordinate of the root supercell in the hole simulation area
-     * * root supercell in 2D LEFT+TOP | in 3D LEFT+TOP+FRONT
-     * @param globaOffset cells
-     * @return global index of the root supercell
-     */
-    HINLINE DataSpace<DIM> getRootSuperCellCoordinate(const DataSpace<DIM> globalOffset) const
-    {
-        return globalOffset/SuperCellSize::toRT();
-    }
+        HDINLINE DataSpace<DIM> getGuardingSuperCells() const
+        {
+            return guardingSuperCells;
+        }
 
-    HDINLINE DataSpace<DIM> getSuperCellSize() const
-    {
-        return SuperCellSize::toRT();
-    }
+        HDINLINE void setGridSuperCells(DataSpace<DIM> superCellsCount)
+        {
+            gridSuperCells = superCellsCount;
+        }
 
-    HDINLINE GridLayout<DIM> getGridLayout() const
-    {
-        return GridLayout<DIM > (SuperCellSize::toRT()*(gridSuperCells - 2 * guardingSuperCells), SuperCellSize::toRT() * guardingSuperCells);
-    }
+        /*! get the Coordinate of the root supercell in the whole simulation area
+         * * root supercell in 2D LEFT+TOP | in 3D LEFT+TOP+FRONT
+         * @param globalOffset offset in cells
+         * @return global index of the root supercell
+         */
+        HINLINE DataSpace<DIM> getRootSuperCellCoordinate(const DataSpace<DIM> globalOffset) const
+        {
+            return globalOffset / SuperCellSize::toRT();
+        }
 
-    HINLINE DataSpace<DIM> getGlobalSuperCells() const
-    {
-        return Environment<DIM>::get().GridController().getGpuNodes() * (gridSuperCells - 2 * guardingSuperCells);
-    }
+        HDINLINE DataSpace<DIM> getSuperCellSize() const
+        {
+            return SuperCellSize::toRT();
+        }
 
+        HDINLINE GridLayout<DIM> getGridLayout() const
+        {
+            return GridLayout<DIM>(
+                SuperCellSize::toRT() * (gridSuperCells - 2 * guardingSuperCells),
+                SuperCellSize::toRT() * guardingSuperCells);
+        }
 
-protected:
+        HINLINE DataSpace<DIM> getGlobalSuperCells() const
+        {
+            return Environment<DIM>::get().GridController().getGpuNodes() * (gridSuperCells - 2 * guardingSuperCells);
+        }
 
-    //\todo: keine Eigenschaft einer Zelle
-    PMACC_ALIGN(gridSuperCells, DataSpace<DIM>);
-    PMACC_ALIGN(guardingSuperCells, DataSpace<DIM>);
 
-};
+    protected:
+        //\todo: not a property of a cell
+        PMACC_ALIGN(gridSuperCells, DataSpace<DIM>);
+        PMACC_ALIGN(guardingSuperCells, DataSpace<DIM>);
+    };
 
 } // namespace pmacc
diff --git a/include/pmacc/mappings/kernel/StrideMapping.hpp b/include/pmacc/mappings/kernel/StrideMapping.hpp
index 38b3e52d00..6b407ea515 100644
--- a/include/pmacc/mappings/kernel/StrideMapping.hpp
+++ b/include/pmacc/mappings/kernel/StrideMapping.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Felix Schmitt, Heiko Burau, Rene Widera
+/* Copyright 2013-2021 Felix Schmitt, Heiko Burau, Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -29,82 +29,82 @@
 
 namespace pmacc
 {
-
-template<uint32_t areaType, uint32_t stride, class baseClass>
-class StrideMapping;
-
-template<
-uint32_t areaType,
-uint32_t stride,
-template<unsigned, class> class baseClass,
-unsigned DIM,
-class SuperCellSize_
->
-class StrideMapping<areaType, stride, baseClass<DIM, SuperCellSize_> > : public baseClass<DIM, SuperCellSize_>
-{
-public:
-    typedef baseClass<DIM, SuperCellSize_> BaseClass;
-
-    enum
+    template<uint32_t areaType, uint32_t stride, class baseClass>
+    class StrideMapping;
+
+    template<
+        uint32_t areaType,
+        uint32_t stride,
+        template<unsigned, class>
+        class baseClass,
+        unsigned DIM,
+        class SuperCellSize_>
+    class StrideMapping<areaType, stride, baseClass<DIM, SuperCellSize_>> : public baseClass<DIM, SuperCellSize_>
     {
-        AreaType = areaType, Dim = BaseClass::Dim, Stride = stride
+    public:
+        typedef baseClass<DIM, SuperCellSize_> BaseClass;
+
+        enum
+        {
+            AreaType = areaType,
+            Dim = BaseClass::Dim,
+            Stride = stride
+        };
+
+
+        typedef typename BaseClass::SuperCellSize SuperCellSize;
+
+        HINLINE StrideMapping(BaseClass base) : BaseClass(base), offset()
+        {
+        }
+
+        /**
+         * Generate grid dimension information for kernel calls
+         *
+         * @return size of the grid
+         */
+        HINLINE DataSpace<DIM> getGridDim() const
+        {
+            return (StrideMappingMethods<areaType, DIM>::getGridDim(*this) - offset + (int) Stride - 1) / (int) Stride;
+        }
+
+        /**
+         * Returns index of current logical block
+         *
+         * @param realSuperCellIdx current SuperCell index (block index)
+         * @return mapped SuperCell index
+         */
+        HDINLINE DataSpace<DIM> getSuperCellIndex(const DataSpace<DIM>& realSuperCellIdx) const
+        {
+            const DataSpace<DIM> blockId((realSuperCellIdx * (int) Stride) + offset);
+            return StrideMappingMethods<areaType, DIM>::shift(*this, blockId);
+        }
+
+        HDINLINE DataSpace<DIM> getOffset() const
+        {
+            return offset;
+        }
+
+        HDINLINE void setOffset(const DataSpace<DIM> offset)
+        {
+            this->offset = offset;
+        }
+
+        /** Set mapper to the next domain by incrementing the linearized offset by one.
+         *
+         * @return true if domain is valid (linear offset < create(stride).productOfComponents()), else false
+         */
+        HINLINE bool next()
+        {
+            int linearOffset = DataSpaceOperations<Dim>::map(DataSpace<DIM>::create(stride), offset);
+            linearOffset++;
+            offset = DataSpaceOperations<Dim>::map(DataSpace<DIM>::create(stride), linearOffset);
+
+            return linearOffset < DataSpace<DIM>::create(stride).productOfComponents();
+        }
+
+    private:
+        PMACC_ALIGN(offset, DataSpace<DIM>);
     };
 
-
-    typedef typename BaseClass::SuperCellSize SuperCellSize;
-
-    HINLINE StrideMapping(BaseClass base) : BaseClass(base), offset()
-    {
-    }
-
-    /**
-     * Generate grid dimension information for kernel calls
-     *
-     * @return size of the grid
-     */
-    HINLINE DataSpace<DIM> getGridDim() const
-    {
-        return (StrideMappingMethods<areaType, DIM>::getGridDim(*this) - offset + (int)Stride - 1) / (int)Stride;
-    }
-
-    /**
-     * Returns index of current logical block
-     *
-     * @param realSuperCellIdx current SuperCell index (block index)
-     * @return mapped SuperCell index
-     */
-    HDINLINE DataSpace<DIM> getSuperCellIndex(const DataSpace<DIM>& realSuperCellIdx) const
-    {
-        const DataSpace<DIM> blockId((realSuperCellIdx * (int)Stride) + offset);
-        return StrideMappingMethods<areaType, DIM>::shift(*this, blockId);
-    }
-
-    HDINLINE DataSpace<DIM> getOffset() const
-    {
-        return offset;
-    }
-
-    HDINLINE void setOffset(const DataSpace<DIM> offset)
-    {
-        this->offset = offset;
-    }
-
-    /** set mapper to next domain
-     *
-     * @return true if domain is valid, else false
-     */
-    HINLINE bool next()
-    {
-        int linearOffset = DataSpaceOperations<Dim>::map(DataSpace<DIM>::create(stride), offset);
-        linearOffset++;
-        offset = DataSpaceOperations<Dim>::map(DataSpace<DIM>::create(stride), linearOffset);
-
-        return linearOffset < DataSpace<DIM>::create(stride).productOfComponents();
-    }
-
-private:
-    PMACC_ALIGN(offset, DataSpace<DIM>);
-
-};
-
 } // namespace pmacc
diff --git a/include/pmacc/mappings/kernel/StrideMappingMethods.hpp b/include/pmacc/mappings/kernel/StrideMappingMethods.hpp
index 4c6912607f..c42dec3282 100644
--- a/include/pmacc/mappings/kernel/StrideMappingMethods.hpp
+++ b/include/pmacc/mappings/kernel/StrideMappingMethods.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Felix Schmitt, Heiko Burau, Rene Widera
+/* Copyright 2013-2021 Felix Schmitt, Heiko Burau, Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -27,7 +27,6 @@
 
 namespace pmacc
 {
-
     /**
      * Helper class for StrideMapping.
      * Provides methods called by StrideMapping using template specialization.
@@ -38,67 +37,63 @@ namespace pmacc
     template<uint32_t areaType, unsigned DIM>
     class StrideMappingMethods;
 
-    //CORE + BORDER + GUARD
+    // CORE + BORDER + GUARD
 
     template<unsigned DIM>
     class StrideMappingMethods<CORE + BORDER + GUARD, DIM>
     {
     public:
-
         template<class Base>
-        HINLINE static DataSpace<DIM> getGridDim(const Base &base)
+        HINLINE static DataSpace<DIM> getGridDim(const Base& base)
         {
             return base.getGridSuperCells();
         }
 
         template<class Base>
-        HDINLINE static DataSpace<DIM> shift(const Base &base, const DataSpace<DIM>& value)
+        HDINLINE static DataSpace<DIM> shift(const Base& base, const DataSpace<DIM>& value)
         {
             return value;
         }
     };
 
-    //CORE
+    // CORE
 
     template<unsigned DIM>
     class StrideMappingMethods<CORE, DIM>
     {
     public:
-
         template<class Base>
-        HINLINE static DataSpace<DIM> getGridDim(const Base &base)
+        HINLINE static DataSpace<DIM> getGridDim(const Base& base)
         {
             // skip 2 x (border + guard) == 4 x guard
             return base.getGridSuperCells() - 4 * base.getGuardingSuperCells();
         }
 
         template<class Base>
-        HDINLINE static DataSpace<DIM> shift(const Base &base, const DataSpace<DIM>& value)
+        HDINLINE static DataSpace<DIM> shift(const Base& base, const DataSpace<DIM>& value)
         {
             // skip guard + border == 2 x guard
             return value + 2 * base.getGuardingSuperCells();
         }
-
     };
 
-    //CORE+BORDER
+    // CORE+BORDER
 
     template<unsigned DIM>
     class StrideMappingMethods<CORE + BORDER, DIM>
     {
     public:
-
         template<class Base>
-        HINLINE static DataSpace<DIM> getGridDim(const Base &base)
+        HINLINE static DataSpace<DIM> getGridDim(const Base& base)
         {
             return base.getGridSuperCells() - 2 * base.getGuardingSuperCells();
         }
 
         template<class Base>
-        HDINLINE static DataSpace<DIM> shift(const Base &base, const DataSpace<DIM>& value)
+        HDINLINE static DataSpace<DIM> shift(const Base& base, const DataSpace<DIM>& value)
         {
             return value + base.getGuardingSuperCells();
         }
     };
 
-} //namespace pmacc
+} // namespace pmacc
diff --git a/include/pmacc/mappings/simulation/EnvironmentController.hpp b/include/pmacc/mappings/simulation/EnvironmentController.hpp
index ec652bf7ea..f3a29fef46 100644
--- a/include/pmacc/mappings/simulation/EnvironmentController.hpp
+++ b/include/pmacc/mappings/simulation/EnvironmentController.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Rene Widera, Wolfgang Hoenig, Benjamin Worpitz
+/* Copyright 2013-2021 Rene Widera, Wolfgang Hoenig, Benjamin Worpitz
  *
  * This file is part of PMacc.
  *
@@ -28,58 +28,54 @@
 
 namespace pmacc
 {
-
-class EnvironmentController
-{
-public:
-
-    /*! Get communicator
-     * @return Communicator for MPI
-     */
-    ICommunicator& getCommunicator() const
-    {
-        return *comm;
-    }
-
-
-
-    /*! Get Mask with all GPU neighbar
-     * @return Mask with neighbar
-     */
-    const Mask& getCommunicationMask() const
-    {
-        return comm->getCommunicationMask();
-    }
-
-
-    /*! Set MPI communicator
-     * @param comm A instance of ICommunicator
-     */
-    void setCommunicator(ICommunicator& comm)
+    class EnvironmentController
     {
-        this->comm = &comm;
-    }
-
-private:
-
-    friend struct detail::Environment;
-
-    /*! Default constructor.
-     */
-    EnvironmentController() {}
-
-    static EnvironmentController& getInstance()
-    {
-        static EnvironmentController instance;
-        return instance;
-    }
-
-private:
-
-    /*! Pointer to MPI communicator.
-     */
-    ICommunicator* comm;
-
-};
-
-} //namespace pmacc
+    public:
+        /*! Get communicator
+         * @return Communicator for MPI
+         */
+        ICommunicator& getCommunicator() const
+        {
+            return *comm;
+        }
+
+
+        /*! Get the mask with all GPU neighbors
+         * @return mask with the neighbors, as reported by the communicator
+         */
+        const Mask& getCommunicationMask() const
+        {
+            return comm->getCommunicationMask();
+        }
+
+
+        /*! Set MPI communicator
+         * @param comm An instance of ICommunicator; only its address is stored (no copy is made)
+         */
+        void setCommunicator(ICommunicator& comm)
+        {
+            this->comm = &comm;
+        }
+
+    private:
+        friend struct detail::Environment;
+
+        /*! Default constructor.
+         */
+        EnvironmentController()
+        {
+        }
+
+        static EnvironmentController& getInstance()
+        {
+            static EnvironmentController instance;
+            return instance;
+        }
+
+    private:
+        /*! Pointer to MPI communicator.
+         */
+        ICommunicator* comm;
+    };
+
+} // namespace pmacc
diff --git a/include/pmacc/mappings/simulation/Filesystem.hpp b/include/pmacc/mappings/simulation/Filesystem.hpp
index 152d900ecb..282f82d811 100644
--- a/include/pmacc/mappings/simulation/Filesystem.hpp
+++ b/include/pmacc/mappings/simulation/Filesystem.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2014-2020 Felix Schmitt
+/* Copyright 2014-2021 Felix Schmitt
  *
  * This file is part of PMacc.
  *
@@ -27,109 +27,96 @@
 
 namespace pmacc
 {
-
+    /**
+     * Singleton class providing common filesystem operations.
+     *
+     * @tparam DIM number of dimensions of the simulation
+     */
+    template<unsigned DIM>
+    class Filesystem
+    {
+    public:
         /**
-         * Singleton class providing common filesystem operations.
+         * Create directory with default permissions
          *
-         * @tparam DIM number of dimensions of the simulation
+         * @param dir name of directory
          */
-        template<unsigned DIM>
-        class Filesystem
+        void createDirectory(const std::string dir) const
         {
-        public:
-
-            /**
-             * Create directory with default permissions
-             *
-             * @param dir name of directory
-             */
-            void
-            createDirectory( const std::string dir ) const
-            {
-                /* does not throw if the directory exists or has been created */
-                bfs::create_directories(dir);
-            }
+            /* does not throw if the directory exists or has been created */
+            bfs::create_directories(dir);
+        }
 
-            /**
-             * Set 755 permissions for a directory
-             *
-             * @param dir name of directory
-             */
-            void
-            setDirectoryPermissions( const std::string dir )  const
-            {
-                /* set permissions */
-                bfs::permissions(dir,
-                                 bfs::owner_all |
-                                 bfs::group_read |
-                                 bfs::group_exe |
-                                 bfs::others_read |
-                                 bfs::others_exe);
-            }
-
-            /**
-             * Create directory and set 755 permissions by root rank.
-             *
-             * @param dir name of directory
-             */
-            void
-            createDirectoryWithPermissions( const std::string dir ) const
-            {
-                GridController<DIM>& gc = Environment<DIM>::get().GridController();
+        /**
+         * Set 755 permissions for a directory
+         *
+         * @param dir name of directory
+         */
+        void setDirectoryPermissions(const std::string dir) const
+        {
+            /* set permissions */
+            bfs::permissions(
+                dir,
+                bfs::owner_all | bfs::group_read | bfs::group_exe | bfs::others_read | bfs::others_exe);
+        }
 
-                createDirectory(dir);
+        /**
+         * Create directory and set 755 permissions by root rank.
+         *
+         * @param dir name of directory
+         */
+        void createDirectoryWithPermissions(const std::string dir) const
+        {
+            GridController<DIM>& gc = Environment<DIM>::get().GridController();
 
-                if (gc.getGlobalRank() == 0)
-                {
-                    /* must be set by only one process to avoid races */
-                    setDirectoryPermissions(dir);
-                }
-            }
+            createDirectory(dir);
 
-            /**
-             * Strip path from absolute or relative paths to filenames
-             *
-             * @param path and filename
-             */
-            std::string
-            basename( const std::string pathFilename ) const
+            if(gc.getGlobalRank() == 0)
             {
-                return bfs::path( pathFilename ).filename().string();
+                /* must be set by only one process to avoid races */
+                setDirectoryPermissions(dir);
             }
+        }
 
-        private:
-
-            friend class Environment<DIM>;
-
-            /**
-             * Constructor
-             */
-            Filesystem()
-            {
-
-            }
+        /**
+         * Strip path from absolute or relative paths to filenames
+         * @param pathFilename path and filename
+         * @return the filename component, i.e. bfs::path(pathFilename).filename() as string
+         */
+        std::string basename(const std::string pathFilename) const
+        {
+            return bfs::path(pathFilename).filename().string();
+        }
 
-            /**
-             * Constructor
-             */
-            Filesystem(const Filesystem& fs)
-            {
+    private:
+        friend class Environment<DIM>;
 
-            }
+        /**
+         * Constructor
+         */
+        Filesystem()
+        {
+        }
 
-            /**
-             * Returns the instance of the filesystem class.
-             *
-             * This class is a singleton class.
-             *
-             * @return a filesystem instance
-             */
-            static Filesystem<DIM>& getInstance()
-            {
-                static Filesystem<DIM> instance;
-                return instance;
-            }
-        };
+        /**
+         * Copy constructor (private; Filesystem is a singleton)
+         */
+        Filesystem(const Filesystem& fs)
+        {
+        }
 
-} //namespace pmacc
+        /**
+         * Returns the instance of the filesystem class.
+         *
+         * This class is a singleton class.
+         *
+         * @return a filesystem instance
+         */
+        static Filesystem<DIM>& getInstance()
+        {
+            static Filesystem<DIM> instance;
+            return instance;
+        }
+    };
 
+} // namespace pmacc
diff --git a/include/pmacc/mappings/simulation/GridController.hpp b/include/pmacc/mappings/simulation/GridController.hpp
index d5306c421b..15b2a46a73 100644
--- a/include/pmacc/mappings/simulation/GridController.hpp
+++ b/include/pmacc/mappings/simulation/GridController.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Felix Schmitt, Rene Widera,
+/* Copyright 2013-2021 Axel Huebl, Felix Schmitt, Rene Widera,
  *                     Wolfgang Hoenig, Benjamin Worpitz
  *
  * This file is part of PMacc.
@@ -30,272 +30,267 @@
 
 namespace pmacc
 {
-
+    /**
+     * GridController manages grid information.
+     *
+     * GridController provides information for a DIM-dimensional grid
+     * such as the number of GPU nodes and the current node's position in the grid
+     * and manages sliding window.
+     * GridController is a singleton.
+     *
+     * @tparam DIM dimension of the controlled grid
+     */
+    template<unsigned DIM>
+    class GridController
+    {
+    public:
         /**
-         * GridController manages grid information.
+         * Initialisation of the controller.
          *
-         * GridController provides information for a DIM-dimensional grid
-         * such as the number of GPU nodes and the current node's position in the grid
-         * and manages sliding window.
-         * GridController is a singleton.
+         * This method must be called before any subgrids or buffers are used.
          *
-         * @tparam DIM dimension of the controlled grid
+         * @param nodes number of GPU nodes in each dimension
+         * @param periodic specifying whether the grid is periodic (1) or not (0) in each dimension
          */
-        template <unsigned DIM>
-        class GridController
+        void init(DataSpace<DIM> nodes, DataSpace<DIM> periodic = DataSpace<DIM>())
         {
-        public:
-
-            /**
-             * Initialisation of the controller.
-             *
-             * This methode must be called before any subgrids or buffers are used.
-             *
-             * @param nodes number of GPU nodes in each dimension
-             * @param periodic specifying whether the grid is periodic (1) or not (0) in each dimension
-             */
-            void init(DataSpace<DIM> nodes, DataSpace<DIM> periodic = DataSpace<DIM>())
+            static bool commIsInit = false;
+            if(!commIsInit)
             {
-                static bool commIsInit = false;
-                if (!commIsInit)
-                {
-                    gpuNodes = nodes;
+                gpuNodes = nodes;
 
-                    DataSpace<DIM3> tmp;
-                    DataSpace<DIM3> periodicTmp;
-                    tmp[0] = nodes[0];
-                    periodicTmp[0] = periodic[0];
-                    if (DIM < DIM2)
-                    {
-                        tmp[1] = 1;
-                        periodicTmp[1] = 1;
-                    }
-                    else
-                    {
-                        tmp[1] = nodes[1];
-                        periodicTmp[1] = periodic[1];
-                    }
-
-                    if (DIM < DIM3)
-                    {
-                        tmp[2] = 1;
-                        periodicTmp[2] = 1;
-                    }
-                    else
-                    {
-                        tmp[2] = nodes[2];
-                        periodicTmp[2] = periodic[2];
-                    }
-
-                    comm.init(tmp, periodicTmp);
-                    commIsInit = true;
-
-                    Environment<DIM>::get().EnvironmentController().setCommunicator(comm);
+                DataSpace<DIM3> tmp;
+                DataSpace<DIM3> periodicTmp;
+                tmp[0] = nodes[0];
+                periodicTmp[0] = periodic[0];
+                if(DIM < DIM2)
+                {
+                    tmp[1] = 1;
+                    periodicTmp[1] = 1;
+                }
+                else
+                {
+                    tmp[1] = nodes[1];
+                    periodicTmp[1] = periodic[1];
                 }
-            }
 
-            /**
-             * Returns the number of GPU nodes in each dimension.
-             *
-             * @return number of nodes
-             */
-            const DataSpace<DIM> getGpuNodes() const
-            {
-                return gpuNodes;
-            }
+                if(DIM < DIM3)
+                {
+                    tmp[2] = 1;
+                    periodicTmp[2] = 1;
+                }
+                else
+                {
+                    tmp[2] = nodes[2];
+                    periodicTmp[2] = periodic[2];
+                }
 
-            /**
-             * Returns the position of the calling process' GPU in the grid.
-             *
-             * @return current GPU position
-             * */
-            const DataSpace<DIM> getPosition() const
-            {
-                return comm.getCoordinates();
-            }
+                comm.init(tmp, periodicTmp);
+                commIsInit = true;
 
-            /**
-             * Returns the scalar position (rank) of this GPU,
-             * depending on its current grid position
-             *
-             * @return current grid position as scalar value
-             */
-            uint32_t getScalarPosition() const
-            {
-                return DataSpaceOperations<DIM>::map(getGpuNodes(), getPosition());
+                Environment<DIM>::get().EnvironmentController().setCommunicator(comm);
             }
+        }
 
-            /**
-             * Returns the local rank of the caller on the current host.
-             *
-             * return local rank on host
-             */
-            uint32_t getHostRank() const
-            {
-                return comm.getHostRank();
-            }
+        /**
+         * Returns the number of GPU nodes in each dimension.
+         *
+         * @return number of nodes
+         */
+        const DataSpace<DIM> getGpuNodes() const
+        {
+            return gpuNodes;
+        }
 
-            /**
-             * Returns the global MPI rank of the caller among all hosts.
-             *
-             * @return global MPI rank
-             */
-            uint32_t getGlobalRank() const
-            {
-                return comm.getRank();
-            }
+        /**
+         * Returns the position of the calling process' GPU in the grid.
+         *
+         * @return current GPU position
+         * */
+        const DataSpace<DIM> getPosition() const
+        {
+            return comm.getCoordinates();
+        }
 
-            /**
-             * Returns the global MPI size.
-             *
-             * @return global number of MPI ranks
-             */
-            uint32_t getGlobalSize() const
-            {
-                return comm.getSize();
-            }
+        /**
+         * Returns the scalar position (rank) of this GPU,
+         * depending on its current grid position
+         *
+         * @return current grid position as scalar value
+         */
+        uint32_t getScalarPosition() const
+        {
+            return DataSpaceOperations<DIM>::map(getGpuNodes(), getPosition());
+        }
 
-            /**
-             * Initialises a slide of the simulation area.
-             *
-             * Starts a slide of the simulation area. In the process, GPU nodes are
-             * reassigned to new grid positions to enable large simulation areas
-             * to be computed.
-             * All nodes in the simulation must call this function at the same iteration.
-             *
-             * @return true if the position of the calling GPU is switched to the end, false otherwise
-             */
-            bool slide()
-            {
-               /* wait that all tasks are finished */
-               Environment<DIM>::get().Manager().waitForAllTasks();//
+        /**
+         * Returns the local rank of the caller on the current host.
+         *
+         * @return local rank on host
+         */
+        uint32_t getHostRank() const
+        {
+            return comm.getHostRank();
+        }
 
-               bool result = comm.slide();
+        /**
+         * Returns the global MPI rank of the caller among all hosts.
+         *
+         * @return global MPI rank
+         */
+        uint32_t getGlobalRank() const
+        {
+            return comm.getRank();
+        }
 
-               updateDomainOffset();
+        /**
+         * Returns the global MPI size.
+         *
+         * @return global number of MPI ranks
+         */
+        uint32_t getGlobalSize() const
+        {
+            return comm.getSize();
+        }
 
-               return result;
-            }
+        /**
+         * Initialises a slide of the simulation area.
+         *
+         * Starts a slide of the simulation area. In the process, GPU nodes are
+         * reassigned to new grid positions to enable large simulation areas
+         * to be computed.
+         * All nodes in the simulation must call this function at the same iteration.
+         *
+         * @return true if the position of the calling GPU is switched to the end, false otherwise
+         */
+        bool slide()
+        {
+            /* wait until all tasks are finished */
+            Environment<DIM>::get().Manager().waitForAllTasks(); //
 
-            /**
-             * Slides multiple times.
-             *
-             * Restores the state of the communicator and the domain offsets as
-             * if the simulation has been slided for numSlides times.
-             *
-             * \warning you are not allowed to call this method if moving
-             *          the simulation does not use a moving window,
-             *          else static load balancing will break in y-direction
-             *
-             * @param[in] numSlides number of slides to slide
-             * @return true if the position of gpu is switched to the end, else false
-             */
-            bool setStateAfterSlides(size_t numSlides)
-            {
-                // nothing to do, nothing to change
-                // note: prevents destroying static load balancing in y for
-                //       non-moving window simulations
-                if( numSlides == 0 )
-                    return false;
+            bool result = comm.slide();
 
-                bool result = comm.setStateAfterSlides(numSlides);
-                updateDomainOffset(numSlides);
-                return result;
-            }
+            updateDomainOffset();
 
-            /**
-             * Returns a Mask which describes all neighbouring GPU nodes.
-             *
-             * @return Mask with all neighbors
-             */
-            const Mask& getCommunicationMask() const
-            {
-                return Environment<DIM>::get().EnvironmentController().getCommunicationMask();
-            }
+            return result;
+        }
 
-            /**
-             * Returns the MPI communicator class
-             *
-             * @return current CommunicatorMPI
-             */
-            CommunicatorMPI<DIM>& getCommunicator()
-            {
-                return comm;
-            }
+        /**
+         * Slides multiple times.
+         *
+         * Restores the state of the communicator and the domain offsets as
+         * if the simulation had slid numSlides times.
+         *
+         * \warning you are not allowed to call this method if
+         *          the simulation does not use a moving window,
+         *          else static load balancing will break in y-direction
+         *
+         * @param[in] numSlides number of slides to slide
+         * @return true if the position of gpu is switched to the end, else false
+         */
+        bool setStateAfterSlides(size_t numSlides)
+        {
+            // nothing to do, nothing to change
+            // note: prevents destroying static load balancing in y for
+            //       non-moving window simulations
+            if(numSlides == 0)
+                return false;
 
-        private:
+            bool result = comm.setStateAfterSlides(numSlides);
+            updateDomainOffset(numSlides);
+            return result;
+        }
 
-            friend class Environment<DIM>;
-            /**
-             * Constructor
-             */
-            GridController() : gpuNodes(DataSpace<DIM>())
-            {
+        /**
+         * Returns a Mask which describes all neighbouring GPU nodes.
+         *
+         * @return Mask with all neighbors
+         */
+        const Mask& getCommunicationMask() const
+        {
+            return Environment<DIM>::get().EnvironmentController().getCommunicationMask();
+        }
 
-            }
+        /**
+         * Returns the MPI communicator class
+         *
+         * @return current CommunicatorMPI
+         */
+        CommunicatorMPI<DIM>& getCommunicator()
+        {
+            return comm;
+        }
 
-            /**
-             * Constructor
-             */
-            GridController(const GridController& gc)
-            {
+    private:
+        friend class Environment<DIM>;
+        /**
+         * Constructor
+         */
+        GridController() : gpuNodes(DataSpace<DIM>())
+        {
+        }
 
-            }
+        /**
+         * Copy constructor (private; the empty body does not copy gpuNodes from gc)
+         */
+        GridController(const GridController& gc)
+        {
+        }
 
-            /**
-             * Sets globalDomain.offset & localDomain.offset using the current position.
-             *
-             * (This function is idempotent)
-             *
-             * @param[in] numSlides number of slides to slide
-             *
-             * \warning the implementation of this method is not compatible with
-             *          static load balancing in y-direction
+        /**
+         * Sets globalDomain.offset & localDomain.offset using the current position.
+         *
+         * (NOTE(review): the local offset update is idempotent; the global offset grows with every call)
+         *
+         * @param[in] numSlides number of slides to slide
+         *
+         * \warning the implementation of this method is not compatible with
+         *          static load balancing in y-direction
+         */
+        void updateDomainOffset(size_t numSlides = 1)
+        {
+            /* if we slide we must change our localDomain.offset of the simulation
+             * (only change slide direction Y)
              */
-            void updateDomainOffset(size_t numSlides = 1)
-            {
-                /* if we slide we must change our localDomain.offset of the simulation
-                 * (only change slide direction Y)
-                 */
-                int gpuOffset_y = this->getPosition().y();
-                const SubGrid<DIM>& subGrid = Environment<DIM>::get().SubGrid();
-                DataSpace<DIM> localDomainOffset(subGrid.getLocalDomain().offset);
-                DataSpace<DIM> globalDomainOffset(subGrid.getGlobalDomain().offset);
-                /* this is allowed in the case that we use sliding window
-                 * because size in Y direction is the same for all gpus domains
-                 */
-                localDomainOffset.y() = gpuOffset_y * subGrid.getLocalDomain().size.y();
-                globalDomainOffset.y() += numSlides * subGrid.getLocalDomain().size.y();
+            int gpuOffset_y = this->getPosition().y();
+            const SubGrid<DIM>& subGrid = Environment<DIM>::get().SubGrid();
+            DataSpace<DIM> localDomainOffset(subGrid.getLocalDomain().offset);
+            DataSpace<DIM> globalDomainOffset(subGrid.getGlobalDomain().offset);
+            /* this is allowed in the case that we use sliding window
+             * because size in Y direction is the same for all gpus domains
+             */
+            localDomainOffset.y() = gpuOffset_y * subGrid.getLocalDomain().size.y();
+            globalDomainOffset.y() += numSlides * subGrid.getLocalDomain().size.y();
 
-                Environment<DIM>::get().SubGrid().setLocalDomainOffset(localDomainOffset);
-                Environment<DIM>::get().SubGrid().setGlobalDomainOffset(globalDomainOffset);
-            }
+            Environment<DIM>::get().SubGrid().setLocalDomainOffset(localDomainOffset);
+            Environment<DIM>::get().SubGrid().setGlobalDomainOffset(globalDomainOffset);
+        }
 
-            /**
-             * Returns the instance of the controller.
-             *
-             * This class is a singleton class.
-             *
-             * @return a controller instance
-             */
-            static GridController<DIM>& getInstance()
-            {
-                static GridController<DIM> instance;
-                return instance;
-            }
+        /**
+         * Returns the instance of the controller.
+         *
+         * This class is a singleton class.
+         *
+         * @return a controller instance
+         */
+        static GridController<DIM>& getInstance()
+        {
+            static GridController<DIM> instance;
+            return instance;
+        }
 
-            /**
-             * Communicator for MPI
-             */
-            static CommunicatorMPI<DIM> comm;
+        /**
+         * Communicator for MPI
+         */
+        static CommunicatorMPI<DIM> comm;
 
-            /**
-             * number of GPU nodes for each direction
-             */
-            DataSpace<DIM> gpuNodes;
-        };
+        /**
+         * number of GPU nodes for each direction
+         */
+        DataSpace<DIM> gpuNodes;
+    };
 
-        template <unsigned DIM>
-        CommunicatorMPI<DIM> GridController<DIM>::comm;
+    template<unsigned DIM>
+    CommunicatorMPI<DIM> GridController<DIM>::comm;
 
-} //namespace pmacc
+} // namespace pmacc
diff --git a/include/pmacc/mappings/simulation/ResourceMonitor.hpp b/include/pmacc/mappings/simulation/ResourceMonitor.hpp
index e56815eaa3..534ed6e405 100644
--- a/include/pmacc/mappings/simulation/ResourceMonitor.hpp
+++ b/include/pmacc/mappings/simulation/ResourceMonitor.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2016-2020 Erik Zenker
+/* Copyright 2016-2021 Erik Zenker
  *
  * This file is part of PMacc.
  *
@@ -20,22 +20,20 @@
  */
 
 #pragma once
-#include <vector>  /* std::vector */
+#include <vector> /* std::vector */
 #include <cstdlib> /* std::size_t */
 
 namespace pmacc
 {
-
     /**
      * Provides ressource information of the current subgrid
      *
      * @tparam T_DIM number of dimensions of the simulation
      */
-    template <unsigned T_DIM>
+    template<unsigned T_DIM>
     class ResourceMonitor
     {
     public:
-
         /**
          * Constructor
          */
@@ -49,10 +47,8 @@ namespace pmacc
         /**
          * Returns the number of particles per species on the device
          */
-        template <typename T_Species, typename T_MappingDesc, typename T_ParticleFilter>
-        std::vector<std::size_t> getParticleCounts(T_MappingDesc &cellDescription, T_ParticleFilter & parFilter);
-
+        template<typename T_Species, typename T_MappingDesc, typename T_ParticleFilter>
+        std::vector<std::size_t> getParticleCounts(T_MappingDesc& cellDescription, T_ParticleFilter& parFilter);
     };
 
-} //namespace pmacc
-
+} // namespace pmacc
diff --git a/include/pmacc/mappings/simulation/ResourceMonitor.tpp b/include/pmacc/mappings/simulation/ResourceMonitor.tpp
index 73ccc132c6..0fa624715b 100644
--- a/include/pmacc/mappings/simulation/ResourceMonitor.tpp
+++ b/include/pmacc/mappings/simulation/ResourceMonitor.tpp
@@ -1,4 +1,4 @@
-/* Copyright 2016-2020 Erik Zenker
+/* Copyright 2016-2021 Erik Zenker
  *
  * This file is part of PMacc.
  *
@@ -34,21 +34,21 @@ namespace pmacc
     template<typename T_DIM, typename T_Species>
     struct MyCountParticles
     {
-        template <typename T_Vector, typename T_MappingDesc, typename T_ParticleFilter>
-        void operator()(T_Vector & particleCounts, T_MappingDesc & cellDescription,  T_ParticleFilter & parFilter)
+        template<typename T_Vector, typename T_MappingDesc, typename T_ParticleFilter>
+        void operator()(T_Vector& particleCounts, T_MappingDesc& cellDescription, T_ParticleFilter& parFilter)
         {
-            DataConnector & dc = Environment<>::get().DataConnector();
+            DataConnector& dc = Environment<>::get().DataConnector();
 
-            const SubGrid<T_DIM::value> & subGrid = Environment<T_DIM::value>::get().SubGrid();
+            const SubGrid<T_DIM::value>& subGrid = Environment<T_DIM::value>::get().SubGrid();
             const DataSpace<T_DIM::value> localSize(subGrid.getLocalDomain().size);
 
             uint64_cu totalNumParticles = 0;
-            totalNumParticles = pmacc::CountParticles::countOnDevice < CORE + BORDER > (
-                    *dc.get<T_Species >(T_Species::FrameType::getName(), true),
-                    cellDescription,
-                    DataSpace<T_DIM::value>(),
-                    localSize,
-                    parFilter);
+            totalNumParticles = pmacc::CountParticles::countOnDevice<CORE + BORDER>(
+                *dc.get<T_Species>(T_Species::FrameType::getName(), true),
+                cellDescription,
+                DataSpace<T_DIM::value>(),
+                localSize,
+                parFilter);
             particleCounts.push_back(totalNumParticles);
         }
     };
@@ -56,7 +56,6 @@ namespace pmacc
     template<unsigned T_DIM>
     ResourceMonitor<T_DIM>::ResourceMonitor()
     {
-
     }
 
     template<unsigned T_DIM>
@@ -66,14 +65,16 @@ namespace pmacc
     }
 
     template<unsigned T_DIM>
-    template <typename T_Species, typename T_MappingDesc, typename T_ParticleFilter>
-    std::vector<size_t> ResourceMonitor<T_DIM>::getParticleCounts(T_MappingDesc &cellDescription, T_ParticleFilter & parFilter)
+    template<typename T_Species, typename T_MappingDesc, typename T_ParticleFilter>
+    std::vector<size_t> ResourceMonitor<T_DIM>::getParticleCounts(
+        T_MappingDesc& cellDescription,
+        T_ParticleFilter& parFilter)
     {
         typedef bmpl::integral_c<unsigned, T_DIM> dim;
         std::vector<size_t> particleCounts;
-        meta::ForEach<T_Species, MyCountParticles<dim, bmpl::_1> > countParticles;
+        meta::ForEach<T_Species, MyCountParticles<dim, bmpl::_1>> countParticles;
         countParticles(particleCounts, cellDescription, parFilter);
         return particleCounts;
     }
 
-} //namespace pmacc
+} // namespace pmacc
diff --git a/include/pmacc/mappings/simulation/Selection.hpp b/include/pmacc/mappings/simulation/Selection.hpp
index 9ab6cf11fe..930d91e38e 100644
--- a/include/pmacc/mappings/simulation/Selection.hpp
+++ b/include/pmacc/mappings/simulation/Selection.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2014-2020 Felix Schmitt
+/* Copyright 2014-2021 Felix Schmitt
  *
  * This file is part of PMacc.
  *
@@ -28,86 +28,72 @@
 
 namespace pmacc
 {
-
-/**
- * Any DIM-dimensional selection of a simulation volume with a size and offset.
- *
- * @tparam DIM number of dimensions
- */
-template <unsigned DIM>
-class Selection
-{
-public:
-
     /**
-     * Constructor
-     * Size and offset initialized to 0 (empty selection)
+     * Any DIM-dimensional selection of a simulation volume with a size and offset.
+     *
+     * @tparam DIM number of dimensions
      */
-    Selection(void)
+    template<unsigned DIM>
+    class Selection
     {
-        for (uint32_t i = 0; i < DIM; ++i)
+    public:
+        /**
+         * Constructor
+         * Size and offset initialized to 0 (empty selection)
+         */
+        Selection(void)
         {
-            size[i] = 0;
-            offset[i] = 0;
+            for(uint32_t i = 0; i < DIM; ++i)
+            {
+                size[i] = 0;
+                offset[i] = 0;
+            }
         }
-    }
-
-    /**
-     * Copy constructor
-     *
-     * @param other Selection to copy information from
-     */
-    Selection(const Selection<DIM>& other) :
-    size(other.size),
-    offset(other.offset)
-    {
 
-    }
+        /**
+         * Copy constructor
+         */
+        constexpr Selection(const Selection&) = default;
 
-    /**
-     * Constructor
-     * Offset is initialized to 0.
-     *
-     * @param size DataSpace for selection size
-     */
-    Selection(DataSpace<DIM> size) :
-    size(size)
-    {
-        for (uint32_t i = 0; i < DIM; ++i)
+        /**
+         * Constructor
+         * Offset is initialized to 0.
+         *
+         * @param size DataSpace for selection size
+         */
+        Selection(DataSpace<DIM> size) : size(size)
         {
-            offset[i] = 0;
+            for(uint32_t i = 0; i < DIM; ++i)
+            {
+                offset[i] = 0;
+            }
         }
-    }
 
-    /**
-     * Constructor
-     *
-     * @param size DataSpace for selection size
-     * @param offset DataSpace for selection offset
-     */
-    Selection(DataSpace<DIM> size, DataSpace<DIM> offset) :
-    size(size),
-    offset(offset)
-    {
-
-    }
+        /**
+         * Constructor
+         *
+         * @param size DataSpace for selection size
+         * @param offset DataSpace for selection offset
+         */
+        Selection(DataSpace<DIM> size, DataSpace<DIM> offset) : size(size), offset(offset)
+        {
+        }
 
-    /**
-     * Return a string representation
-     *
-     * @return string representation
-     */
-    HINLINE const std::string toString(void) const
-    {
-        std::stringstream str;
-        str << "{ size = " << size.toString() <<
-               " offset = " << offset.toString() << " }";
-        return str.str();
-    }
+        /**
+         * Return a string representation
+         *
+         * @return string representation
+         */
+        HINLINE const std::string toString(void) const
+        {
+            std::stringstream str;
+            str << "{ size = " << size.toString() << " offset = " << offset.toString() << " }";
+            return str.str();
+        }
 
-    DataSpace<DIM> size;
+        DataSpace<DIM> size;
 
-    DataSpace<DIM> offset;
-};
+        DataSpace<DIM> offset;
+    };
 
-} // namespace picongpu
+} // namespace pmacc
diff --git a/include/pmacc/mappings/simulation/SubGrid.hpp b/include/pmacc/mappings/simulation/SubGrid.hpp
index c5d9cf8dc8..5ceb6372d4 100644
--- a/include/pmacc/mappings/simulation/SubGrid.hpp
+++ b/include/pmacc/mappings/simulation/SubGrid.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Felix Schmitt, Heiko Burau, Rene Widera, Wolfgang Hoenig
+/* Copyright 2013-2021 Felix Schmitt, Heiko Burau, Rene Widera, Wolfgang Hoenig
  *
  * This file is part of PMacc.
  *
@@ -29,18 +29,19 @@
 namespace pmacc
 {
     /**
-      * Groups local, global and total domain information.
-      *
-      * For a detailed description of domains, see the PIConGPU wiki page:
-      * https://github.com/ComputationalRadiationPhysics/picongpu/wiki/PIConGPU-domain-definitions
-      */
-    template <unsigned DIM>
+     * Groups local, global and total domain information.
+     *
+     * For a detailed description of domains, see the PIConGPU wiki page:
+     * https://github.com/ComputationalRadiationPhysics/picongpu/wiki/PIConGPU-domain-definitions
+     */
+    template<unsigned DIM>
     class SubGrid
     {
     public:
-
         typedef DataSpace<DIM> Size;
 
+        constexpr SubGrid& operator=(const SubGrid&) = default;
+
         /**
          * Initialize SubGrid instance
          *
@@ -48,9 +49,7 @@ namespace pmacc
          * @param globalSize global domain size
          * @param localOffset local domain offset (formerly 'globalOffset')
          */
-        void init(const Size& localSize,
-                  const Size& globalSize,
-                  const Size& localOffset)
+        void init(const Size& localSize, const Size& globalSize, const Size& localOffset)
         {
             totalDomain = Selection<DIM>(globalSize);
             globalDomain = Selection<DIM>(globalSize);
@@ -114,7 +113,6 @@ namespace pmacc
         }
 
     private:
-
         friend class Environment<DIM>;
 
         /** total simulation volume, including active and inactive subvolumes */
@@ -131,7 +129,6 @@ namespace pmacc
          */
         SubGrid()
         {
-
         }
 
         static SubGrid<DIM>& getInstance()
@@ -149,12 +146,8 @@ namespace pmacc
          */
         SubGrid(const SubGrid& gc)
         {
-
         }
     };
 
 
-} //namespace pmacc
-
-
-
+} // namespace pmacc
diff --git a/include/pmacc/mappings/threads/ForEachIdx.hpp b/include/pmacc/mappings/threads/ForEachIdx.hpp
index 8e9d5b8bba..be6fdcf043 100644
--- a/include/pmacc/mappings/threads/ForEachIdx.hpp
+++ b/include/pmacc/mappings/threads/ForEachIdx.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2017-2020 Rene Widera
+/* Copyright 2017-2021 Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -29,100 +29,75 @@
 
 namespace pmacc
 {
-namespace mappings
-{
-namespace threads
-{
-
-    /** execute a functor for each index
-     *
-     * Distribute the indices even over all worker and execute a user defined functor.
-     * There is no guarantee in which order the indices will be processed.
-     *
-     * @tparam T_IdxConfig index domain description
-     */
-    template<
-        typename T_IdxConfig
-    >
-    struct ForEachIdx : public T_IdxConfig
+    namespace mappings
     {
-        using T_IdxConfig::domainSize;
-        using T_IdxConfig::workerSize;
-        using T_IdxConfig::simdSize;
-        using T_IdxConfig::numCollIter;
+        namespace threads
+        {
+            /** execute a functor for each index
+             *
+             * Distribute the indices evenly over all workers and execute a user-defined functor.
+             * There is no guarantee in which order the indices will be processed.
+             *
+             * @tparam T_IdxConfig index domain description
+             */
+            template<typename T_IdxConfig>
+            struct ForEachIdx : public T_IdxConfig
+            {
+                using T_IdxConfig::domainSize;
+                using T_IdxConfig::numCollIter;
+                using T_IdxConfig::simdSize;
+                using T_IdxConfig::workerSize;
 
-        uint32_t const m_workerIdx;
+                uint32_t const m_workerIdx;
 
-        static constexpr bool outerLoopCondition =
-            ( domainSize % (simdSize * workerSize) ) == 0u ||
-            ( simdSize * workerSize == 1u );
+                static constexpr bool outerLoopCondition
+                    = (domainSize % (simdSize * workerSize)) == 0u || (simdSize * workerSize == 1u);
 
-        static constexpr bool innerLoopCondition =
-            ( domainSize % simdSize ) == 0u ||
-            ( simdSize == 1u );
+                static constexpr bool innerLoopCondition = (domainSize % simdSize) == 0u || (simdSize == 1u);
 
-        /** constructor
-         *
-         * @param workerIdx index of the worker: range [0;workerSize)
-         */
-        HDINLINE
-        ForEachIdx( uint32_t const workerIdx ) : m_workerIdx( workerIdx )
-        {
-        }
+                /** constructor
+                 *
+                 * @param workerIdx index of the worker: range [0;workerSize)
+                 */
+                HDINLINE
+                ForEachIdx(uint32_t const workerIdx) : m_workerIdx(workerIdx)
+                {
+                }
 
-        /** execute a functor
-         *
-         * @param functor is called for each index which is mapped to the worker
-         *
-         * The functor must fulfill the following interface:
-         * @code
-         * template< typename ... T_Args >
-         * void operator()( uint32_t const linearIdx, uint32_t const idx, T_Args && ... );
-         * @endcode
-         *
-         * @{
-         */
-        template<
-            typename T_Functor,
-            typename ... T_Args
-        >
-        HDINLINE void
-        operator()(
-            T_Functor && functor,
-            T_Args && ... args
-        ) const
-        {
-            for( uint32_t i = 0u; i < numCollIter; ++i )
-            {
-                uint32_t const beginWorker = i * simdSize;
-                uint32_t const beginIdx = beginWorker * workerSize + simdSize * m_workerIdx;
-                if(
-                    outerLoopCondition ||
-                    !innerLoopCondition ||
-                    beginIdx < domainSize
-                )
+                /** execute a functor
+                 *
+                 * @param functor is called for each index which is mapped to the worker
+                 *
+                 * The functor must fulfill the following interface:
+                 * @code
+                 * template< typename ... T_Args >
+                 * void operator()( uint32_t const linearIdx, uint32_t const idx, T_Args && ... );
+                 * @endcode
+                 *
+                 * @{
+                 */
+                template<typename T_Functor, typename... T_Args>
+                HDINLINE void operator()(T_Functor&& functor, T_Args&&... args) const
                 {
-                    for( uint32_t j = 0u; j < simdSize; ++j )
+                    for(uint32_t i = 0u; i < numCollIter; ++i)
                     {
-                        uint32_t const localIdx = beginIdx + j;
-                        if(
-                            innerLoopCondition ||
-                            localIdx < domainSize
-                        )
-                            functor(
-                                localIdx,
-                                beginWorker + j,
-                                std::forward< T_Args >( args ) ...
-                            );
+                        uint32_t const beginWorker = i * simdSize;
+                        uint32_t const beginIdx = beginWorker * workerSize + simdSize * m_workerIdx;
+                        if(outerLoopCondition || !innerLoopCondition || beginIdx < domainSize)
+                        {
+                            for(uint32_t j = 0u; j < simdSize; ++j)
+                            {
+                                uint32_t const localIdx = beginIdx + j;
+                                if(innerLoopCondition || localIdx < domainSize)
+                                    functor(localIdx, beginWorker + j, std::forward<T_Args>(args)...);
+                            }
+                        }
                     }
                 }
-            }
-        }
-
-        /** @} */
 
-    };
+                /** @} */
+            };
 
-} // namespace threads
-} // namespace mappings
+        } // namespace threads
+    } // namespace mappings
 } // namespace pmacc
diff --git a/include/pmacc/mappings/threads/IdxConfig.hpp b/include/pmacc/mappings/threads/IdxConfig.hpp
index 785b98a251..fc6a600c94 100644
--- a/include/pmacc/mappings/threads/IdxConfig.hpp
+++ b/include/pmacc/mappings/threads/IdxConfig.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2017-2020 Rene Widera
+/* Copyright 2017-2021 Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -26,38 +26,33 @@
 
 namespace pmacc
 {
-namespace mappings
-{
-namespace threads
-{
-
-    /** describe a constant index domain
-     *
-     * describe the size of the index domain and the number of workers to operate on the domain
-     *
-     * @tparam T_domainSize number of indices in the domain
-     * @tparam T_workerSize number of worker working on @p T_domainSize
-     * @tparam T_simdSize SIMD width
-     */
-    template<
-        uint32_t T_domainSize,
-        uint32_t T_workerSize,
-        uint32_t T_simdSize = 1u
-    >
-    struct IdxConfig
+    namespace mappings
     {
-        /** number of indices within the domain */
-        static constexpr uint32_t domainSize = T_domainSize;
-        /** number of worker (threads) working on @p domainSize */
-        static constexpr uint32_t workerSize = T_workerSize;
-        /** SIMD width */
-        static constexpr uint32_t simdSize = T_simdSize;
+        namespace threads
+        {
+            /** describe a constant index domain
+             *
+             * describe the size of the index domain and the number of workers to operate on the domain
+             *
+             * @tparam T_domainSize number of indices in the domain
+             * @tparam T_workerSize number of workers working on @p T_domainSize
+             * @tparam T_simdSize SIMD width
+             */
+            template<uint32_t T_domainSize, uint32_t T_workerSize, uint32_t T_simdSize = 1u>
+            struct IdxConfig
+            {
+                /** number of indices within the domain */
+                static constexpr uint32_t domainSize = T_domainSize;
+                /** number of workers (threads) working on @p domainSize */
+                static constexpr uint32_t workerSize = T_workerSize;
+                /** SIMD width */
+                static constexpr uint32_t simdSize = T_simdSize;
 
-        /** number of collective iterations needed to address all indices */
-        static constexpr uint32_t numCollIter =
-            ( domainSize + simdSize * workerSize - 1u ) / ( simdSize * workerSize);
-    };
+                /** number of collective iterations needed to address all indices */
+                static constexpr uint32_t numCollIter
+                    = (domainSize + simdSize * workerSize - 1u) / (simdSize * workerSize);
+            };
 
-} // namespace threads
-} // namespace mappings
+        } // namespace threads
+    } // namespace mappings
 } // namespace pmacc
diff --git a/include/pmacc/mappings/threads/ThreadCollective.hpp b/include/pmacc/mappings/threads/ThreadCollective.hpp
index e6c1153356..ce56f2bd57 100644
--- a/include/pmacc/mappings/threads/ThreadCollective.hpp
+++ b/include/pmacc/mappings/threads/ThreadCollective.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera, Benjamin Worpitz
+/* Copyright 2013-2021 Heiko Burau, Rene Widera, Benjamin Worpitz
  *
  * This file is part of PMacc.
  *
@@ -31,97 +31,68 @@
 
 namespace pmacc
 {
-
-/** execute a functor for each cell of a domain
- *
- * the user functor is executed on each elements of the full domain (GUARD +CORE)
- *
- * @tparam T_DataDomain pmacc::SuperCellDescription, compile time data domain
- *                      description with a CORE and GUARD
- * @tparam T_numWorkers number of workers
- */
-template<
-    typename T_DataDomain,
-    uint32_t T_numWorkers
->
-class ThreadCollective
-{
-private:
-    // size of the CORE (in elements per dimension)
-    using CoreDomainSize = typename T_DataDomain::SuperCellSize;
-    // full size of the domain including the GUARD (in elements per dimension)
-    using DomainSize = typename T_DataDomain::FullSuperCellSize;
-    // offset (in elements per dimension) from the GUARD origin to the CORE
-    using OffsetOrigin = typename T_DataDomain::OffsetOrigin;
-
-    static constexpr uint32_t numWorkers = T_numWorkers;
-    static constexpr uint32_t dim = T_DataDomain::Dim;
-
-    PMACC_ALIGN(
-        m_workerIdx,
-        const uint32_t
-    );
-
-public:
-
-    /** constructor
+    /** execute a functor for each cell of a domain
      *
-     * @param workerIdx index of the worker
-     */
-    DINLINE ThreadCollective( uint32_t const workerIdx ) :
-        m_workerIdx( workerIdx )
-    {
-    }
-
-    /** execute the user functor for each element in the full domain
-     *
-     * @tparam T_Functor type of the user functor, must have a `void operator()`
-     *                   with as many arguments as args contains
-     * @tparam T_Args type of the arguments, each type must implement an operator
-     *                 `template<typename T, typename R> R operator(T)`
+     * the user functor is executed on each element of the full domain (GUARD + CORE)
      *
-     * @param functor user defined functor
-     * @param args arguments passed to the functor
-     *             The method `template<typename T, typename R> R operator(T)`
-     *             is called for each argument, the result is passed to the
-     *             functor `functor::operator()`.
-     *             `T` is a N-dimensional vector of an index relative to the origin
-     *             of data domain GUARD
+     * @tparam T_DataDomain pmacc::SuperCellDescription, compile time data domain
+     *                      description with a CORE and GUARD
+     * @tparam T_numWorkers number of workers
      */
-    template<
-        typename T_Functor,
-        typename ... T_Args,
-        typename T_Acc
-    >
-    DINLINE void operator()(
-        T_Acc const & acc,
-        T_Functor & functor,
-        T_Args && ... args
-    )
+    template<typename T_DataDomain, uint32_t T_numWorkers>
+    class ThreadCollective
     {
-        using namespace mappings::threads;
-        ForEachIdx<
-            IdxConfig<
-                math::CT::volume< DomainSize >::type::value,
-                numWorkers
-            >
-        >{ m_workerIdx }(
-            [&](
-                uint32_t const linearIdx,
-                uint32_t const
-            )
-            {
-                /* offset (in elements) of the current processed element relative
-                 * to the origin of the core domain
-                 */
-                DataSpace< dim > const offset(
-                    DataSpaceOperations< dim >::template map< DomainSize >( linearIdx ) -
-                    OffsetOrigin::toRT( )
-                );
-                functor( acc, args( offset ) ... );
-            }
-        );
-    }
-};
+    private:
+        // size of the CORE (in elements per dimension)
+        using CoreDomainSize = typename T_DataDomain::SuperCellSize;
+        // full size of the domain including the GUARD (in elements per dimension)
+        using DomainSize = typename T_DataDomain::FullSuperCellSize;
+        // offset (in elements per dimension) from the GUARD origin to the CORE
+        using OffsetOrigin = typename T_DataDomain::OffsetOrigin;
+
+        static constexpr uint32_t numWorkers = T_numWorkers;
+        static constexpr uint32_t dim = T_DataDomain::Dim;
+
+        PMACC_ALIGN(m_workerIdx, const uint32_t);
+
+    public:
+        /** constructor
+         *
+         * @param workerIdx index of the worker
+         */
+        DINLINE ThreadCollective(uint32_t const workerIdx) : m_workerIdx(workerIdx)
+        {
+        }
+
+        /** execute the user functor for each element in the full domain
+         *
+         * @tparam T_Functor type of the user functor, must have a `void operator()`
+         *                   with as many arguments as args contains
+         * @tparam T_Args type of the arguments, each type must implement an operator
+         *                 `template<typename T, typename R> R operator(T)`
+         *
+         * @param functor user defined functor
+         * @param args arguments passed to the functor
+         *             The method `template<typename T, typename R> R operator(T)`
+         *             is called for each argument, the result is passed to the
+         *             functor `functor::operator()`.
+         *             `T` is a N-dimensional vector of an index relative to the origin
+         *             of data domain GUARD
+         */
+        template<typename T_Functor, typename... T_Args, typename T_Acc>
+        DINLINE void operator()(T_Acc const& acc, T_Functor& functor, T_Args&&... args)
+        {
+            using namespace mappings::threads;
+            ForEachIdx<IdxConfig<math::CT::volume<DomainSize>::type::value, numWorkers>>{m_workerIdx}(
+                [&](uint32_t const linearIdx, uint32_t const) {
+                    /* offset (in elements) of the currently processed element relative
+                     * to the origin of the core domain
+                     */
+                    DataSpace<dim> const offset(
+                        DataSpaceOperations<dim>::template map<DomainSize>(linearIdx) - OffsetOrigin::toRT());
+                    functor(acc, args(offset)...);
+                });
+        }
+    };
 
-}//namespace pmacc
+} // namespace pmacc
diff --git a/include/pmacc/mappings/threads/WorkerCfg.hpp b/include/pmacc/mappings/threads/WorkerCfg.hpp
index 73f658e50a..17a2812aa7 100644
--- a/include/pmacc/mappings/threads/WorkerCfg.hpp
+++ b/include/pmacc/mappings/threads/WorkerCfg.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2017-2020 Rene Widera
+/* Copyright 2017-2021 Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -26,58 +26,54 @@
 
 namespace pmacc
 {
-namespace mappings
-{
-namespace threads
-{
-
-/** holds a worker configuration
- *
- * collection of the compile time number of workers and the runtime worker index
- *
- * @tparam T_numWorkers number of workers which are used to execute this functor
- */
-template< uint32_t T_numWorkers >
-class WorkerCfg
-{
-private:
-
-    //! index of the worker: range [0;T_numWorkers) */
-    PMACC_ALIGN( m_workerIdx, uint32_t const );
-
-public:
+    namespace mappings
+    {
+        namespace threads
+        {
+            /** holds a worker configuration
+             *
+             * collection of the compile time number of workers and the runtime worker index
+             *
+             * @tparam T_numWorkers number of workers which are used to execute this functor
+             */
+            template<uint32_t T_numWorkers>
+            class WorkerCfg
+            {
+            private:
+                //! index of the worker: range [0;T_numWorkers)
+                PMACC_ALIGN(m_workerIdx, uint32_t const);
 
-    //! number of workers
-    static constexpr uint32_t numWorkers = T_numWorkers;
+            public:
+                //! number of workers
+                static constexpr uint32_t numWorkers = T_numWorkers;
 
-    /** constructor
-     *
-     * @param workerIdx worker index
-     */
-    HDINLINE WorkerCfg( uint32_t const workerIdx ) :
-        m_workerIdx( workerIdx )
-    { }
+                /** constructor
+                 *
+                 * @param workerIdx worker index
+                 */
+                HDINLINE WorkerCfg(uint32_t const workerIdx) : m_workerIdx(workerIdx)
+                {
+                }
 
-    /** get the worker index
-     *
-     * @return index of the worker
-     */
-    HDINLINE uint32_t getWorkerIdx( ) const
-    {
-        return m_workerIdx;
-    }
+                /** get the worker index
+                 *
+                 * @return index of the worker
+                 */
+                HDINLINE uint32_t getWorkerIdx() const
+                {
+                    return m_workerIdx;
+                }
 
-    /** get the number of workers
-     *
-     * @return number of workers
-     */
-    HDINLINE static
-    constexpr uint32_t getNumWorkers( )
-    {
-        return T_numWorkers;
-    }
-};
+                /** get the number of workers
+                 *
+                 * @return number of workers
+                 */
+                HDINLINE static constexpr uint32_t getNumWorkers()
+                {
+                    return T_numWorkers;
+                }
+            };
 
-} // namespace threads
-} // namespace mappings
+        } // namespace threads
+    } // namespace mappings
 } // namespace pmacc
diff --git a/include/pmacc/math/Complex.hpp b/include/pmacc/math/Complex.hpp
index 84fb29cee1..ca457a449c 100644
--- a/include/pmacc/math/Complex.hpp
+++ b/include/pmacc/math/Complex.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2015-2020 Alexander Debus
+/* Copyright 2015-2021 Alexander Debus
  *
  * This file is part of PMacc.
  *
diff --git a/include/pmacc/math/ConstVector.hpp b/include/pmacc/math/ConstVector.hpp
index 256a2f3afd..4e57c6e043 100644
--- a/include/pmacc/math/ConstVector.hpp
+++ b/include/pmacc/math/ConstVector.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2014-2020 Rene Widera, Benjamin Worpitz
+/* Copyright 2014-2021 Rene Widera, Benjamin Worpitz
  *
  * This file is part of PMacc.
  *
@@ -26,21 +26,43 @@
 #include "pmacc/types.hpp"
 
 /* select namespace depending on __CUDA_ARCH__ compiler flag*/
-#ifdef __CUDA_ARCH__ //we are on gpu
-#   define PMACC_USING_STATIC_CONST_VECTOR_NAMESPACE(id) using namespace PMACC_JOIN(pmacc_static_const_vector_device,id)
+#if(CUPLA_DEVICE_COMPILE == 1) // we are on gpu
+#    define PMACC_USING_STATIC_CONST_VECTOR_NAMESPACE(id)                                                             \
+        using namespace PMACC_JOIN(pmacc_static_const_vector_device, id)
 #else
-#   define PMACC_USING_STATIC_CONST_VECTOR_NAMESPACE(id) using namespace PMACC_JOIN(pmacc_static_const_vector_host,id)
+#    define PMACC_USING_STATIC_CONST_VECTOR_NAMESPACE(id)                                                             \
+        using namespace PMACC_JOIN(pmacc_static_const_vector_host, id)
 #endif
 
-#ifdef __CUDACC__
-#   define PMACC_STATIC_CONST_VECTOR_DIM_DEF_CUDA(id,Name,Type,...)                \
-        namespace PMACC_JOIN(pmacc_static_const_vector_device,id)                  \
-        {                                                                          \
-           /* store all values in a const C array on device*/                      \
-            __constant__ const Type PMACC_JOIN(Name, _data)[]={__VA_ARGS__};       \
+#if defined(__CUDACC__) || BOOST_COMP_HIP
+#    define PMACC_STATIC_CONST_VECTOR_DIM_DEF_CUDA(id, Name, Type, ...)                                               \
+        namespace PMACC_JOIN(pmacc_static_const_vector_device, id)                                                    \
+        {                                                                                                             \
+            /* store all values in a const C array on device*/                                                        \
+            __constant__ const Type PMACC_JOIN(Name, _data)[] = {__VA_ARGS__};                                        \
         } /*namespace pmacc_static_const_vector_device + id */
 #else
-#   define PMACC_STATIC_CONST_VECTOR_DIM_DEF_CUDA(id,Name,Type,...)
+#    define PMACC_STATIC_CONST_VECTOR_DIM_DEF_CUDA(id, Name, Type, ...)
+#endif
+
+#define PMACC_PRAGMA_QUOTE(x) _Pragma(#x)
+#define PMACC_PRAGMA_OACC_DECLARE_ARRAY(name, count)
+#define PMACC_PRAGMA_OMP_TARGET_BEGIN_DECLARE
+#define PMACC_PRAGMA_OMP_TARGET_END_DECLARE
+#define PMACC_TARGET_CONSTEXPR constexpr
+
+#ifdef ALPAKA_ACC_ANY_BT_OACC_ENABLED
+#    undef PMACC_PRAGMA_OACC_DECLARE_ARRAY // #undef takes the bare macro name, no parameter list
+#    undef PMACC_TARGET_CONSTEXPR
+// might need to remove parentheses from macro argument count to clean up copyin clause, but works with NVHPC
+#    define PMACC_PRAGMA_OACC_DECLARE_ARRAY(name, count) PMACC_PRAGMA_QUOTE(acc declare copyin(name))
+#    define PMACC_TARGET_CONSTEXPR // expands to nothing for OpenACC builds (the default above is constexpr)
+#elif defined ALPAKA_ACC_ANY_BT_OMP5_ENABLED
+#    undef PMACC_PRAGMA_OMP_TARGET_BEGIN_DECLARE
+#    undef PMACC_PRAGMA_OMP_TARGET_END_DECLARE
+// the single-pragma declare (more like the OpenACC version above) does not work with clang 11
+#    define PMACC_PRAGMA_OMP_TARGET_BEGIN_DECLARE _Pragma("omp declare target")
+#    define PMACC_PRAGMA_OMP_TARGET_END_DECLARE _Pragma("omp end declare target")
+#endif
 
 /** define a const vector
@@ -50,75 +72,75 @@
  *
  * @param id unique precompiler id to create unique namespaces
  */
-#define PMACC_STATIC_CONST_VECTOR_DIM_DEF(id,Name,Type,Dim,count,...)          \
-namespace PMACC_JOIN(pmacc_static_const_storage,id)                            \
-{                                                                              \
-    PMACC_STATIC_CONST_VECTOR_DIM_DEF_CUDA(id,Name,Type,__VA_ARGS__);          \
-    namespace PMACC_JOIN(pmacc_static_const_vector_host,id)                    \
-    {                                                                          \
-        /* store all values in a const C array on host*/                       \
-        const Type PMACC_JOIN(Name,_data)[]={__VA_ARGS__};                     \
-    } /* namespace pmacc_static_const_vector_host + id  */                     \
-    /* select host or device namespace depending on __CUDA_ARCH__ compiler flag*/ \
-    PMACC_USING_STATIC_CONST_VECTOR_NAMESPACE(id);                             \
-    template<typename T_Type, int T_Dim>                                       \
-    struct ConstArrayStorage                                                   \
-    {                                                                          \
-        PMACC_CASSERT_MSG(                                                     \
-            __PMACC_CONST_VECTOR_dimension_needs_to_be_less_than_or_equal_to_the_number_of_arguments__, \
-            Dim <= count );                                                    \
-        static constexpr bool isConst = true;                                  \
-        typedef T_Type type;                                                   \
-        static constexpr int dim = T_Dim;                                      \
-                                                                               \
-        HDINLINE const type& operator[](const int idx) const                   \
-        {                                                                      \
-            /*access const C array with the name of array*/                    \
-            return PMACC_JOIN(Name,_data)[idx];                                \
-        }                                                                      \
-    };                                                                         \
-    /*define a const vector type, ConstArrayStorage is used as Storage policy*/\
-    typedef const pmacc::math::Vector<                                         \
-        Type,                                                                  \
-        Dim,                                                                   \
-        pmacc::math::StandardAccessor,                                         \
-        pmacc::math::StandardNavigator,                                        \
-        ConstArrayStorage > PMACC_JOIN(Name,_t);                               \
-} /* namespace pmacc_static_const_storage + id */                              \
-using namespace PMACC_JOIN(pmacc_static_const_storage,id)
+#define PMACC_STATIC_CONST_VECTOR_DIM_DEF(id, Name, Type, Dim, count, ...)                                            \
+    namespace PMACC_JOIN(pmacc_static_const_storage, id)                                                              \
+    {                                                                                                                 \
+        PMACC_STATIC_CONST_VECTOR_DIM_DEF_CUDA(id, Name, Type, __VA_ARGS__);                                          \
+        namespace PMACC_JOIN(pmacc_static_const_vector_host, id)                                                      \
+        {                                                                                                             \
+            /* store all values in a const C array on host*/                                                          \
+            PMACC_PRAGMA_OMP_TARGET_BEGIN_DECLARE                                                                     \
+            PMACC_TARGET_CONSTEXPR Type PMACC_JOIN(Name, _data)[] = {__VA_ARGS__};                                    \
+            PMACC_PRAGMA_OMP_TARGET_END_DECLARE                                                                       \
+            PMACC_PRAGMA_OACC_DECLARE_ARRAY(PMACC_JOIN(Name, _data), count)                                           \
+        } /* namespace pmacc_static_const_vector_host + id  */                                                        \
+        /* select host or device namespace depending on __CUDA_ARCH__ compiler flag*/                                 \
+        PMACC_USING_STATIC_CONST_VECTOR_NAMESPACE(id);                                                                \
+        template<typename T_Type, int T_Dim>                                                                          \
+        struct ConstArrayStorage                                                                                      \
+        {                                                                                                             \
+            PMACC_CASSERT_MSG(                                                                                        \
+                __PMACC_CONST_VECTOR_dimension_needs_to_be_less_than_or_equal_to_the_number_of_arguments__,           \
+                Dim <= count);                                                                                        \
+            static constexpr bool isConst = true;                                                                     \
+            typedef T_Type type;                                                                                      \
+            static constexpr int dim = T_Dim;                                                                         \
+                                                                                                                      \
+            HDINLINE const type& operator[](const int idx) const                                                      \
+            {                                                                                                         \
+                /*access const C array with the name of array*/                                                       \
+                return PMACC_JOIN(Name, _data)[idx];                                                                  \
+            }                                                                                                         \
+        };                                                                                                            \
+        /*define a const vector type, ConstArrayStorage is used as Storage policy*/                                   \
+        typedef const pmacc::math::                                                                                   \
+            Vector<Type, Dim, pmacc::math::StandardAccessor, pmacc::math::StandardNavigator, ConstArrayStorage>       \
+                PMACC_JOIN(Name, _t);                                                                                 \
+    } /* namespace pmacc_static_const_storage + id */                                                                 \
+    using namespace PMACC_JOIN(pmacc_static_const_storage, id)
 
-#ifdef __CUDACC__
-#   define PMACC_STATIC_CONST_VECTOR_DIM_INSTANCE_CUDA(Name,id)                \
-        namespace PMACC_JOIN(pmacc_static_const_vector_device,id)              \
-        {                                                                      \
-            /* create const instance on device */                              \
-            __constant__ const PMACC_JOIN(Name,_t) Name;                       \
+#if defined(__CUDACC__) || BOOST_COMP_HIP
+#    define PMACC_STATIC_CONST_VECTOR_DIM_INSTANCE_CUDA(Name, id)                                                     \
+        namespace PMACC_JOIN(pmacc_static_const_vector_device, id)                                                    \
+        {                                                                                                             \
+            /* create const instance on device */                                                                     \
+            __constant__ const PMACC_JOIN(Name, _t) Name;                                                             \
         } /* namespace pmacc_static_const_vector_device + id */
 #else
-#   define PMACC_STATIC_CONST_VECTOR_DIM_INSTANCE_CUDA(Name,id)
+#    define PMACC_STATIC_CONST_VECTOR_DIM_INSTANCE_CUDA(Name, id)
 #endif
 
 /** create a instance of type `Name_t` with the name `Name`
  */
-#define PMACC_STATIC_CONST_VECTOR_DIM_INSTANCE(id,Name,Type,Dim,count,...)     \
-namespace PMACC_JOIN(pmacc_static_const_storage,id)                            \
-{                                                                              \
-    /* Conditionally define the instance on CUDA devices */                    \
-    PMACC_STATIC_CONST_VECTOR_DIM_INSTANCE_CUDA(Name,id)                       \
-    namespace PMACC_JOIN(pmacc_static_const_vector_host,id)                    \
-    {                                                                          \
-        /* create const instance on host*/                                     \
-        const PMACC_JOIN(Name,_t) Name;                                        \
-    } /* namespace pmacc_static_const_vector_host + id  */                     \
-} /* namespace pmacc_static_const_storage + id */
+#define PMACC_STATIC_CONST_VECTOR_DIM_INSTANCE(id, Name, Type, Dim, count, ...)                                       \
+    namespace PMACC_JOIN(pmacc_static_const_storage, id)                                                              \
+    {                                                                                                                 \
+        /* Conditionally define the instance on CUDA devices */                                                       \
+        PMACC_STATIC_CONST_VECTOR_DIM_INSTANCE_CUDA(Name, id)                                                         \
+        namespace PMACC_JOIN(pmacc_static_const_vector_host, id)                                                      \
+        {                                                                                                             \
+            /* create const instance on host*/                                                                        \
+            constexpr PMACC_JOIN(Name, _t) Name;                                                                      \
+        } /* namespace pmacc_static_const_vector_host + id  */                                                        \
+    } /* namespace pmacc_static_const_storage + id */
 
 /** @see PMACC_CONST_VECTOR documentation, only unique "id" is added
  *
  * @param id unique precompiler id to create unique namespaces
  */
-#define PMACC_STATIC_CONST_VECTOR_DIM(id,Name,Type,Dim,count,...)              \
-    PMACC_STATIC_CONST_VECTOR_DIM_DEF(id,Name,Type,Dim,count,__VA_ARGS__);     \
-    PMACC_STATIC_CONST_VECTOR_DIM_INSTANCE(id,Name,Type,Dim,count,__VA_ARGS__)
+#define PMACC_STATIC_CONST_VECTOR_DIM(id, Name, Type, Dim, count, ...)                                                \
+    PMACC_STATIC_CONST_VECTOR_DIM_DEF(id, Name, Type, Dim, count, __VA_ARGS__);                                       \
+    PMACC_STATIC_CONST_VECTOR_DIM_INSTANCE(id, Name, Type, Dim, count, __VA_ARGS__)
 
 
 /** define a const vector
@@ -127,8 +149,8 @@ namespace PMACC_JOIN(pmacc_static_const_storage,id)                            \
  *
  * create type definition `name_t`
  */
-#define PMACC_CONST_VECTOR_DEF(type,dim,name,...)                              \
-    PMACC_STATIC_CONST_VECTOR_DIM_DEF(__COUNTER__,name,type,dim,PMACC_COUNT_ARGS(type,__VA_ARGS__),__VA_ARGS__)
+#define PMACC_CONST_VECTOR_DEF(type, dim, name, ...)                                                                  \
+    PMACC_STATIC_CONST_VECTOR_DIM_DEF(__COUNTER__, name, type, dim, PMACC_COUNT_ARGS(type, __VA_ARGS__), __VA_ARGS__)
 
 /** Create global constant math::Vector with compile time values which can be
  *  used on device and host
@@ -146,5 +168,5 @@ namespace PMACC_JOIN(pmacc_static_const_storage,id)                            \
  *      create math:Vector<float,2> myVector(2.1,4.2); //as global const vector
  *      The type of the created vector is "name_t" -> in this case "myVector_t"
  */
-#define PMACC_CONST_VECTOR(type,dim,name,...)                                   \
-    PMACC_STATIC_CONST_VECTOR_DIM(__COUNTER__,name,type,dim,PMACC_COUNT_ARGS(type,__VA_ARGS__),__VA_ARGS__)
+#define PMACC_CONST_VECTOR(type, dim, name, ...)                                                                      \
+    PMACC_STATIC_CONST_VECTOR_DIM(__COUNTER__, name, type, dim, PMACC_COUNT_ARGS(type, __VA_ARGS__), __VA_ARGS__)
diff --git a/include/pmacc/math/MapTuple.hpp b/include/pmacc/math/MapTuple.hpp
index e085e66ea8..644427fdd0 100644
--- a/include/pmacc/math/MapTuple.hpp
+++ b/include/pmacc/math/MapTuple.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera
+/* Copyright 2013-2021 Heiko Burau, Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -32,220 +32,138 @@
 
 namespace pmacc
 {
-namespace math
-{
-
-    namespace bmpl = boost::mpl;
-
-    /** wrap a datum
-     *
-     * align the data structure with `PMACC_ALIGN`
-     *
-     * @tparam T_Pair boost mpl pair< key, type of the value >
-     */
-    template< typename T_Pair >
-    struct AlignedData
+    namespace math
     {
-        typedef typename T_Pair::first Key;
-        typedef typename T_Pair::second ValueType;
-
-        PMACC_ALIGN( value, ValueType );
+        namespace bmpl = boost::mpl;
 
-        HDINLINE AlignedData( )
+        /** wrap a datum
+         *
+         * align the data structure with `PMACC_ALIGN`
+         *
+         * @tparam T_Pair boost mpl pair< key, type of the value >
+         */
+        template<typename T_Pair>
+        struct AlignedData
         {
-        }
+            typedef typename T_Pair::first Key;
+            typedef typename T_Pair::second ValueType;
 
-        HDINLINE AlignedData( const ValueType& value ) : value( value )
-        {
-        }
+            PMACC_ALIGN(value, ValueType);
 
-        HDINLINE ValueType& operator[]( const Key& )
-        {
-            return value;
-        }
+            HDINLINE AlignedData()
+            {
+            }
 
-        HDINLINE const ValueType& operator[]( const Key& ) const
-        {
-            return value;
-        }
-    };
-
-    /** wrap a datum
-     *
-     * @tparam T_Pair boost mpl pair< key, type of the value >
-     */
-    template< typename T_Pair >
-    struct NativeData
-    {
-        typedef typename T_Pair::first Key;
-        typedef typename T_Pair::second ValueType;
+            HDINLINE AlignedData(const ValueType& value) : value(value)
+            {
+            }
 
-        ValueType value;
+            HDINLINE ValueType& operator[](const Key&)
+            {
+                return value;
+            }
 
-        HDINLINE NativeData( )
-        {
-        }
+            HDINLINE const ValueType& operator[](const Key&) const
+            {
+                return value;
+            }
+        };
 
-        HDINLINE NativeData( const ValueType& value ) : value( value )
+        /** wrap a datum
+         *
+         * @tparam T_Pair boost mpl pair< key, type of the value >
+         */
+        template<typename T_Pair>
+        struct NativeData
         {
-        }
+            typedef typename T_Pair::first Key;
+            typedef typename T_Pair::second ValueType;
 
-        HDINLINE ValueType& operator[]( const Key& )
-        {
-            return value;
-        }
+            ValueType value;
 
-        HDINLINE const ValueType& operator[]( const Key& ) const
-        {
-            return value;
-        }
-    };
-
-    template<
-        typename T_Map,
-        template< typename > class T_PodType = NativeData
-    >
-    struct MapTuple :
-        protected InheritLinearly<
-            T_Map,
-            T_PodType
-        >
-    {
+            HDINLINE NativeData()
+            {
+            }
 
-        typedef T_Map Map;
-        static constexpr int dim = bmpl::size< Map >::type::value;
-        typedef InheritLinearly<
-            T_Map,
-            T_PodType
-        > Base;
+            HDINLINE NativeData(const ValueType& value) : value(value)
+            {
+            }
 
-        template< class > struct result;
+            HDINLINE ValueType& operator[](const Key&)
+            {
+                return value;
+            }
 
-        template<
-            class T_F,
-            class T_Key
-        >
-        struct result< T_F( T_Key ) >
-        {
-            typedef typename bmpl::at<
-                Map,
-                T_Key
-            >::type& type;
+            HDINLINE const ValueType& operator[](const Key&) const
+            {
+                return value;
+            }
         };
 
-        template<
-            class T_F,
-            class T_Key
-        >
-        struct result< const T_F( T_Key ) >
+        template<typename T_Map, template<typename> class T_PodType = NativeData>
+        struct MapTuple : protected InheritLinearly<T_Map, T_PodType>
         {
-            typedef const typename bmpl::at<
-                Map,
-                T_Key
-            >::type& type;
+            typedef T_Map Map;
+            static constexpr int dim = bmpl::size<Map>::type::value;
+            typedef InheritLinearly<T_Map, T_PodType> Base;
+
+            template<class>
+            struct result;
+
+            template<class T_F, class T_Key>
+            struct result<T_F(T_Key)>
+            {
+                typedef typename bmpl::at<Map, T_Key>::type& type;
+            };
+
+            template<class T_F, class T_Key>
+            struct result<const T_F(T_Key)>
+            {
+                typedef const typename bmpl::at<Map, T_Key>::type& type;
+            };
+
+            /** access a datum with a key
+             *
+             * @tparam T_Key key type
+             *
+             * @{
+             */
+            template<typename T_Key>
+            HDINLINE typename boost::result_of<MapTuple(T_Key)>::type operator[](const T_Key& key)
+            {
+                return (*(static_cast<T_PodType<bmpl::pair<T_Key, typename bmpl::at<Map, T_Key>::type>>*>(this)))[key];
+            }
+
+            template<typename T_Key>
+            HDINLINE typename boost::result_of<const MapTuple(T_Key)>::type operator[](const T_Key& key) const
+            {
+                return (*(
+                    static_cast<const T_PodType<bmpl::pair<T_Key, typename bmpl::at<Map, T_Key>::type>>*>(this)))[key];
+            }
+            /** @} */
+
+            /** access a datum with an index
+             *
+             * @tparam T_i the index of tuple's i-th element
+             *
+             * @{
+             */
+            template<int T_i>
+            HDINLINE typename boost::result_of<MapTuple(typename bmpl::at<Map, bmpl::int_<T_i>>::type::first)>::type
+            at()
+            {
+                return (*this)[typename bmpl::at<Map, bmpl::int_<T_i>>::type::first()];
+            }
+
+            template<int T_i>
+            HDINLINE
+                typename boost::result_of<const MapTuple(typename bmpl::at<Map, bmpl::int_<T_i>>::type::first)>::type
+                at() const
+            {
+                return (*this)[typename bmpl::at<Map, bmpl::int_<T_i>>::type::first()];
+            }
+            /** @} */
         };
 
-        /** access a datum with a key
-         *
-         * @tparam T_Key key type
-         *
-         * @{
-         */
-        template< typename T_Key >
-        HDINLINE
-        typename boost::result_of<
-            MapTuple( T_Key )
-        >::type
-        operator[]( const T_Key& key )
-        {
-            return
-            (
-                *( static_cast<
-                    T_PodType<
-                        bmpl::pair<
-                            T_Key,
-                            typename bmpl::at<
-                                Map,
-                                T_Key
-                            >::type
-                        >
-                    >*
-                >( this ) )
-            )[key];
-        }
-
-        template< typename T_Key >
-        HDINLINE
-        typename boost::result_of<
-            const MapTuple( T_Key )
-        >::type
-        operator[]( const T_Key& key ) const
-        {
-            return (
-                *(
-                    static_cast<
-                        const T_PodType<
-                            bmpl::pair<
-                                T_Key,
-                                typename bmpl::at<
-                                    Map,
-                                    T_Key
-                                >::type
-                            >
-                        >*
-                    >( this )
-                )
-            )[key];
-        }
-        /** @} */
-
-        /** access a datum with an index
-         *
-         * @tparam T_i the index of tuple's i-th element
-         *
-         * @{
-         */
-        template< int T_i >
-        HDINLINE
-        typename boost::result_of<
-            MapTuple(
-                typename bmpl::at<
-                    Map,
-                    bmpl::int_< T_i >
-                >::type::first
-            )
-        >::type
-        at( )
-        {
-            return ( *this )[
-                typename bmpl::at<
-                    Map,
-                    bmpl::int_< T_i >
-                >::type::first( )
-            ];
-        }
-
-        template< int T_i >
-        HDINLINE
-        typename boost::result_of<
-            const MapTuple(
-                typename bmpl::at<
-                    Map,
-                    bmpl::int_< T_i >
-                >::type::first
-            )
-        >::type
-        at( ) const
-        {
-            return ( *this )[
-                typename bmpl::at<
-                    Map,
-                    bmpl::int_< T_i >
-                >::type::first( )
-            ];
-        }
-        /** @} */
-    };
-
-} // math
-} // PMacc
+    } // namespace math
+} // namespace pmacc
diff --git a/include/pmacc/math/RungeKutta.hpp b/include/pmacc/math/RungeKutta.hpp
index a7e59e2035..bf5764778e 100644
--- a/include/pmacc/math/RungeKutta.hpp
+++ b/include/pmacc/math/RungeKutta.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2015-2020 Richard Pausch
+/* Copyright 2015-2021 Richard Pausch
  *
  * This file is part of PMacc.
  *
diff --git a/include/pmacc/math/RungeKutta/RungeKutta4.hpp b/include/pmacc/math/RungeKutta/RungeKutta4.hpp
index 373876ad0b..99836f44d3 100644
--- a/include/pmacc/math/RungeKutta/RungeKutta4.hpp
+++ b/include/pmacc/math/RungeKutta/RungeKutta4.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2015-2020 Richard Pausch
+/* Copyright 2015-2021 Richard Pausch
  *
  * This file is part of PMacc.
  *
@@ -23,55 +23,48 @@
 
 namespace pmacc
 {
-namespace math
-{
-
-struct RungeKutta4
-{
-    /** Runge Kutta solver 4th order
-     *
-     *  Calculate next time step based on the Runge Kutta
-     *  algorithm and return next variable
-     *
-     *  @param diffEq functor with first argument time and second variables
-     *  @param var variables of type T_Variable (can be vector type)
-     *  @param time current time
-     *  @param deltaTime time step
-     *  @return var for the consecutive time step
-     */
-    template<typename T_Functor, typename T_Variable, typename T_Time>
-    HDINLINE T_Variable operator()(const T_Functor diffEq,
-                                   const T_Variable var,
-                                   const T_Time time,
-                                   const T_Time deltaTime)
+    namespace math
     {
-      // use typenames instead of template types
-      typedef T_Functor FunctorType;
-      typedef T_Variable VariableType;
-      typedef T_Time TimeType;
+        struct RungeKutta4
+        {
+            /** Runge Kutta solver 4th order
+             *
+             *  Calculate next time step based on the Runge Kutta
+             *  algorithm and return next variable
+             *
+             *  @param diffEq functor with first argument time and second variables
+             *  @param var variables of type T_Variable (can be vector type)
+             *  @param time current time
+             *  @param deltaTime time step
+             *  @return var for the consecutive time step
+             */
+            template<typename T_Functor, typename T_Variable, typename T_Time>
+            HDINLINE T_Variable
+            operator()(const T_Functor diffEq, const T_Variable var, const T_Time time, const T_Time deltaTime)
+            {
+                // use typenames instead of template types
+                typedef T_Functor FunctorType;
+                typedef T_Variable VariableType;
+                typedef T_Time TimeType;
 
-      // calculate all 4 steps of the Runge Kutta 4th order
-      const VariableType k_1 = diffEq(time,
-                                      var);
-      const VariableType k_2 = diffEq(time + TimeType(0.5) * deltaTime,
-                                      var + (TimeType(0.5) * deltaTime) * k_1);
-      const VariableType k_3 = diffEq(time + TimeType(0.5) * deltaTime,
-                                      var + (TimeType(0.5) * deltaTime) * k_2);
-      const VariableType k_4 = diffEq(time + deltaTime,
-                                      var + deltaTime * k_3);
+                // calculate all 4 steps of the Runge Kutta 4th order
+                const VariableType k_1 = diffEq(time, var);
+                const VariableType k_2
+                    = diffEq(time + TimeType(0.5) * deltaTime, var + (TimeType(0.5) * deltaTime) * k_1);
+                const VariableType k_3
+                    = diffEq(time + TimeType(0.5) * deltaTime, var + (TimeType(0.5) * deltaTime) * k_2);
+                const VariableType k_4 = diffEq(time + deltaTime, var + deltaTime * k_3);
 
-      // combine all 4 steps
-      const VariableType diff = deltaTime/TimeType(6.) * (k_1
-                                                          + TimeType(2.) * k_2
-                                                          + TimeType(2.) * k_3
-                                                          +  k_4);
+                // combine all 4 steps
+                const VariableType diff
+                    = deltaTime / TimeType(6.) * (k_1 + TimeType(2.) * k_2 + TimeType(2.) * k_3 + k_4);
 
-      // current var + difference = new var
-      const VariableType out = var + diff;
-      return out;
-    }
-};
+                // current var + difference = new var
+                const VariableType out = var + diff;
+                return out;
+            }
+        };
 
 
-} //namespace math
-} //namespace pmacc
+    } // namespace math
+} // namespace pmacc
diff --git a/include/pmacc/math/Tuple.hpp b/include/pmacc/math/Tuple.hpp
index 808d8235f5..d6751e9627 100644
--- a/include/pmacc/math/Tuple.hpp
+++ b/include/pmacc/math/Tuple.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera
+/* Copyright 2013-2021 Heiko Burau, Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -40,129 +40,122 @@
 
 namespace pmacc
 {
-namespace math
-{
-
+    namespace math
+    {
 #ifndef TUPLE_MAX_DIM
-#define TUPLE_MAX_DIM 8
+#    define TUPLE_MAX_DIM 8
 #endif
 
-#define CONSTRUCTOR(Z, N, _)                                \
-    template<BOOST_PP_ENUM_PARAMS(N, typename Arg)>         \
-    HDINLINE                                                \
-    Tuple(BOOST_PP_ENUM_BINARY_PARAMS(N, const Arg, &arg))  \
-    : value(arg0),                                          \
-      base(BOOST_PP_ENUM_SHIFTED_PARAMS(N, arg))            \
-    {                                                       \
-        BOOST_STATIC_ASSERT(dim == N);                      \
+#define CONSTRUCTOR(Z, N, _)                                                                                          \
+    template<BOOST_PP_ENUM_PARAMS(N, typename Arg)>                                                                   \
+    HDINLINE Tuple(BOOST_PP_ENUM_BINARY_PARAMS(N, const Arg, &arg))                                                   \
+        : value(arg0)                                                                                                 \
+        , base(BOOST_PP_ENUM_SHIFTED_PARAMS(N, arg))                                                                  \
+    {                                                                                                                 \
+        BOOST_STATIC_ASSERT(dim == N);                                                                                \
     }
 
-namespace mpl = boost::mpl;
-
-template<typename TypeList, bool ListEmpty = mpl::empty<TypeList>::type::value>
-class Tuple;
-
-template<typename TypeList>
-class Tuple<TypeList, true> {};
-
-template<typename TypeList>
-class Tuple<TypeList, false>
-    : public Tuple<typename mpl::pop_front<TypeList>::type>
-{
-public:
-    static constexpr int dim = mpl::size<TypeList>::type::value;
-    typedef TypeList TypeList_;
-private:
-    typedef Tuple<typename mpl::pop_front<TypeList>::type> base;
-
-    typedef typename mpl::front<TypeList>::type Value;
-    typedef typename boost::remove_reference<Value>::type pureValue;
-
-    Value value;
-public:
-    HDINLINE Tuple() {}
-
-    HDINLINE Tuple(Value arg0) : value(arg0)
-    {
-        BOOST_STATIC_ASSERT(dim == 1);
-    }
-
-    BOOST_PP_REPEAT_FROM_TO(2, BOOST_PP_INC(TUPLE_MAX_DIM), CONSTRUCTOR, _)
-
-    template<int i>
-    HDINLINE
-    typename mpl::at_c<TypeList, i>::type&
-    at_c()
-    {
-        return this->at(mpl::int_<i>());
-    }
-    template<int i>
-    HDINLINE
-    const typename mpl::at_c<TypeList, i>::type&
-    at_c() const
-    {
-        return this->at(mpl::int_<i>());
-    }
-
-    HDINLINE Value& at(mpl::int_<0>)
-    {
-        return value;
-    }
-    HDINLINE Value& at(mpl::integral_c<int, 0>)
-    {
-        return value;
-    }
-
-    HDINLINE const Value& at(mpl::int_<0>) const
-    {
-        return value;
-    }
-    HDINLINE const Value& at(mpl::integral_c<int, 0>) const
-    {
-        return value;
-    }
-
-    template<typename Idx>
-    HDINLINE
-    typename mpl::at<TypeList, Idx>::type&
-    at(Idx)
-    {
-        return base::at(typename mpl::minus<Idx, mpl::int_<1> >::type());
-    }
-
-    template<typename Idx>
-    HDINLINE
-    const typename mpl::at<TypeList, Idx>::type&
-    at(Idx) const
-    {
-        return base::at(typename mpl::minus<Idx, mpl::int_<1> >::type());
-    }
-};
+        namespace mpl = boost::mpl;
+
+        template<typename TypeList, bool ListEmpty = mpl::empty<TypeList>::type::value>
+        class Tuple;
+
+        template<typename TypeList>
+        class Tuple<TypeList, true>
+        {
+        };
+
+        template<typename TypeList>
+        class Tuple<TypeList, false> : public Tuple<typename mpl::pop_front<TypeList>::type>
+        {
+        public:
+            static constexpr int dim = mpl::size<TypeList>::type::value;
+            typedef TypeList TypeList_;
+
+        private:
+            typedef Tuple<typename mpl::pop_front<TypeList>::type> base;
+
+            typedef typename mpl::front<TypeList>::type Value;
+            typedef typename boost::remove_reference<Value>::type pureValue;
+
+            Value value;
+
+        public:
+            HDINLINE Tuple()
+            {
+            }
+
+            HDINLINE Tuple(Value arg0) : value(arg0)
+            {
+                BOOST_STATIC_ASSERT(dim == 1);
+            }
+
+            BOOST_PP_REPEAT_FROM_TO(2, BOOST_PP_INC(TUPLE_MAX_DIM), CONSTRUCTOR, _)
+
+            template<int i>
+            HDINLINE typename mpl::at_c<TypeList, i>::type& at_c()
+            {
+                return this->at(mpl::int_<i>());
+            }
+            template<int i>
+            HDINLINE const typename mpl::at_c<TypeList, i>::type& at_c() const
+            {
+                return this->at(mpl::int_<i>());
+            }
+
+            HDINLINE Value& at(mpl::int_<0>)
+            {
+                return value;
+            }
+            HDINLINE Value& at(mpl::integral_c<int, 0>)
+            {
+                return value;
+            }
+
+            HDINLINE const Value& at(mpl::int_<0>) const
+            {
+                return value;
+            }
+            HDINLINE const Value& at(mpl::integral_c<int, 0>) const
+            {
+                return value;
+            }
+
+            template<typename Idx>
+            HDINLINE typename mpl::at<TypeList, Idx>::type& at(Idx)
+            {
+                return base::at(typename mpl::minus<Idx, mpl::int_<1>>::type());
+            }
+
+            template<typename Idx>
+            HDINLINE const typename mpl::at<TypeList, Idx>::type& at(Idx) const
+            {
+                return base::at(typename mpl::minus<Idx, mpl::int_<1>>::type());
+            }
+        };
 
 #undef CONSTRUCTOR
 
-#define MAKE_TUPLE(Z, N, _) \
-    template<BOOST_PP_ENUM_PARAMS(N, typename Value)> \
-    HDINLINE \
-    Tuple<mpl::vector<BOOST_PP_ENUM_PARAMS(N, Value)> > \
-    make_Tuple(BOOST_PP_ENUM_BINARY_PARAMS(N, Value, value)) \
-    { \
-        return Tuple<mpl::vector<BOOST_PP_ENUM_PARAMS(N, Value)> > \
-            (BOOST_PP_ENUM_PARAMS(N, value)); \
+#define MAKE_TUPLE(Z, N, _)                                                                                           \
+    template<BOOST_PP_ENUM_PARAMS(N, typename Value)>                                                                 \
+    HDINLINE Tuple<mpl::vector<BOOST_PP_ENUM_PARAMS(N, Value)>> make_Tuple(                                           \
+        BOOST_PP_ENUM_BINARY_PARAMS(N, Value, value))                                                                 \
+    {                                                                                                                 \
+        return Tuple<mpl::vector<BOOST_PP_ENUM_PARAMS(N, Value)>>(BOOST_PP_ENUM_PARAMS(N, value));                    \
     }
 
-BOOST_PP_REPEAT_FROM_TO(1, BOOST_PP_INC(TUPLE_MAX_DIM), MAKE_TUPLE, _)
+        BOOST_PP_REPEAT_FROM_TO(1, BOOST_PP_INC(TUPLE_MAX_DIM), MAKE_TUPLE, _)
 
 #undef MAKE_TUPLE
 
-namespace result_of
-{
-template<typename TTuple, int i>
-struct at_c
-{
-    typedef typename mpl::at_c<typename TTuple::TypeList_, i>::type type;
-};
-} // result_of
-
-} // math
-} // PMacc
+        namespace result_of
+        {
+            template<typename TTuple, int i>
+            struct at_c
+            {
+                typedef typename mpl::at_c<typename TTuple::TypeList_, i>::type type;
+            };
+        } // namespace result_of
+
+    } // namespace math
+} // namespace pmacc
diff --git a/include/pmacc/math/Vector.hpp b/include/pmacc/math/Vector.hpp
index 6580efb57f..78f0310902 100644
--- a/include/pmacc/math/Vector.hpp
+++ b/include/pmacc/math/Vector.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera
+/* Copyright 2013-2021 Heiko Burau, Rene Widera
  *
  * This file is part of PMacc.
  *
diff --git a/include/pmacc/math/VectorOperations.hpp b/include/pmacc/math/VectorOperations.hpp
index e14797c7b8..445858b442 100644
--- a/include/pmacc/math/VectorOperations.hpp
+++ b/include/pmacc/math/VectorOperations.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2014-2020 Axel Huebl
+/* Copyright 2014-2021 Axel Huebl
  *
  * This file is part of PMacc.
  *
@@ -26,64 +26,56 @@
 
 namespace pmacc
 {
-namespace math
-{
-    /** Map a runtime linear index to a N dimensional position
-     *
-     *  The size of the space to map the index to must be know at compile time
-     *
-     * \tparam T_Dim dimension of the position to map to
-     */
-    template<uint32_t T_Dim>
-    struct MapToPos;
-
-    template<>
-    struct MapToPos<3>
+    namespace math
     {
-        /** Functor
+        /** Map a runtime linear index to a N dimensional position
+         *
+         *  The size of the space to map the index to must be know at compile time
          *
-         *  \tparam T_ctVec math::CT::vector type like \see math::CT::Int
-         *  \param math::CT::vector with spatial size to map the index to
-         *  \param linearIndex linear index to be mapped
-         *  \return runtime math::vector of dimension T_Dim
+         * \tparam T_Dim dimension of the position to map to
          */
-        template<typename T_ctVec>
-        DINLINE
-        typename T_ctVec::RT_type
-        operator()( T_ctVec, const int linearIndex )
+        template<uint32_t T_Dim>
+        struct MapToPos;
+
+        template<>
+        struct MapToPos<3>
         {
-            return typename T_ctVec::RT_type(
-                (linearIndex  % T_ctVec::x::value),
-                ((linearIndex % (T_ctVec::x::value * T_ctVec::y::value)) / T_ctVec::x::value),
-                (linearIndex  / (T_ctVec::x::value * T_ctVec::y::value)));
-        }
-    };
+            /** Functor
+             *
+             *  \tparam T_ctVec math::CT::vector type like \see math::CT::Int
+             *  \param math::CT::vector with spatial size to map the index to
+             *  \param linearIndex linear index to be mapped
+             *  \return runtime math::vector of dimension T_Dim
+             */
+            template<typename T_ctVec>
+            DINLINE typename T_ctVec::RT_type operator()(T_ctVec, const int linearIndex)
+            {
+                return typename T_ctVec::RT_type(
+                    (linearIndex % T_ctVec::x::value),
+                    ((linearIndex % (T_ctVec::x::value * T_ctVec::y::value)) / T_ctVec::x::value),
+                    (linearIndex / (T_ctVec::x::value * T_ctVec::y::value)));
+            }
+        };
 
-    template<>
-    struct MapToPos<2>
-    {
-        template<typename T_ctVec>
-        DINLINE
-        typename T_ctVec::RT_type
-        operator()( T_ctVec, const int linearIndex )
+        template<>
+        struct MapToPos<2>
         {
-            return typename T_ctVec::RT_type(
-                (linearIndex % T_ctVec::x::value),
-                (linearIndex / T_ctVec::x::value));
-        }
-    };
+            template<typename T_ctVec>
+            DINLINE typename T_ctVec::RT_type operator()(T_ctVec, const int linearIndex)
+            {
+                return typename T_ctVec::RT_type((linearIndex % T_ctVec::x::value), (linearIndex / T_ctVec::x::value));
+            }
+        };
 
-    template<>
-    struct MapToPos<1>
-    {
-        template<typename T_ctVec>
-        DINLINE
-        typename T_ctVec::RT_type
-        operator()( T_ctVec, const int linearIndex )
+        template<>
+        struct MapToPos<1>
         {
-            return typename T_ctVec::RT_type( linearIndex );
-        }
-    };
+            template<typename T_ctVec>
+            DINLINE typename T_ctVec::RT_type operator()(T_ctVec, const int linearIndex)
+            {
+                return typename T_ctVec::RT_type(linearIndex);
+            }
+        };
 
-} /* namespace math */
+    } /* namespace math */
 } /* namespace pmacc */
diff --git a/include/pmacc/math/complex/Bessel.hpp b/include/pmacc/math/complex/Bessel.hpp
index 95b16eb27d..c9b691539b 100644
--- a/include/pmacc/math/complex/Bessel.hpp
+++ b/include/pmacc/math/complex/Bessel.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2003-2020 Alexander Debus, C. Bond
+/* Copyright 2003-2021 Alexander Debus, C. Bond
  *
  * This file is part of PMacc.
  *
@@ -67,180 +67,179 @@
 
 namespace pmacc
 {
-namespace algorithms
-{
-namespace math
-{
-namespace bessel
-{
-    template<
-        typename T_Type,
-        typename T_TableA,
-        typename T_TableB,
-        typename T_TableA1,
-        typename T_TableB1
-    >
-    struct Cbesselj0Base;
-
-    template< typename T_Type >
-    HDINLINE typename J0< pmacc::math::Complex< T_Type > >::result
-    j0( pmacc::math::Complex< T_Type > const & z )
-    {
-        return J0< pmacc::math::Complex< T_Type > >( )( z );
-    }
-
-    template<
-        typename T_Type,
-        typename T_TableA,
-        typename T_TableB,
-        typename T_TableA1,
-        typename T_TableB1
-    >
-    struct Cbesselj1Base;
-
-    template< typename T_Type >
-    HDINLINE typename J1< pmacc::math::Complex< T_Type > >::result
-    j1( pmacc::math::Complex< T_Type > const & z)
+    namespace math
     {
-        return J1< pmacc::math::Complex< T_Type > >( )( z );
-    }
-
-    PMACC_CONST_VECTOR( double, 14, aDouble,
-        -7.03125e-2,
-         0.112152099609375,
-        -0.5725014209747314,
-         6.074042001273483,
-        -1.100171402692467e2,
-         3.038090510922384e3,
-        -1.188384262567832e5,
-         6.252951493434797e6,
-        -4.259392165047669e8,
-         3.646840080706556e10,
-        -3.833534661393944e12,
-         4.854014686852901e14,
-        -7.286857349377656e16,
-         1.279721941975975e19
-    );
-
-    PMACC_CONST_VECTOR( double, 14, bDouble,
-         7.32421875e-2,
-        -0.2271080017089844,
-         1.727727502584457,
-        -2.438052969955606e1,
-         5.513358961220206e2,
-        -1.825775547429318e4,
-         8.328593040162893e5,
-        -5.006958953198893e7,
-         3.836255180230433e9,
-        -3.649010818849833e11,
-         4.218971570284096e13,
-        -5.827244631566907e15,
-         9.476288099260110e17,
-        -1.792162323051699e20
-    );
-
-    PMACC_CONST_VECTOR( double, 14, a1Double,
-         0.1171875,
-        -0.1441955566406250,
-         0.6765925884246826,
-        -6.883914268109947,
-         1.215978918765359e2,
-        -3.302272294480852e3,
-         1.276412726461746e5,
-        -6.656367718817688e6,
-         4.502786003050393e8,
-        -3.833857520742790e10,
-         4.011838599133198e12,
-        -5.060568503314727e14,
-         7.572616461117958e16,
-        -1.326257285320556e19
-    );
-
-    PMACC_CONST_VECTOR( double, 14, b1Double,
-        -0.1025390625,
-         0.2775764465332031,
-        -1.993531733751297,
-         2.724882731126854e1,
-        -6.038440767050702e2,
-         1.971837591223663e4,
-        -8.902978767070678e5,
-         5.310411010968522e7,
-        -4.043620325107754e9,
-         3.827011346598605e11,
-        -4.406481417852278e13,
-         6.065091351222699e15,
-        -9.833883876590679e17,
-         1.855045211579828e20
-    );
-
-    PMACC_CONST_VECTOR( float, 14, aFloat,
-        -7.03125e-2,
-         0.112152099609375,
-        -0.5725014209747314,
-         6.074042001273483,
-        -1.100171402692467e2,
-         3.038090510922384e3,
-        -1.188384262567832e5,
-         6.252951493434797e6,
-        -4.259392165047669e8,
-         3.646840080706556e10,
-        -3.833534661393944e12,
-         4.854014686852901e14,
-        -7.286857349377656e16,
-         1.279721941975975e19
-    );
-
-    PMACC_CONST_VECTOR( float, 14, bFloat,
-         7.32421875e-2,
-        -0.2271080017089844,
-         1.727727502584457,
-        -2.438052969955606e1,
-         5.513358961220206e2,
-        -1.825775547429318e4,
-         8.328593040162893e5,
-        -5.006958953198893e7,
-         3.836255180230433e9,
-        -3.649010818849833e11,
-         4.218971570284096e13,
-        -5.827244631566907e15,
-         9.476288099260110e17,
-        -1.792162323051699e20
-    );
-
-    PMACC_CONST_VECTOR(float, 14, a1Float,
-         0.1171875,
-        -0.1441955566406250,
-         0.6765925884246826,
-        -6.883914268109947,
-         1.215978918765359e2,
-        -3.302272294480852e3,
-         1.276412726461746e5,
-        -6.656367718817688e6,
-         4.502786003050393e8,
-        -3.833857520742790e10,
-         4.011838599133198e12,
-        -5.060568503314727e14,
-         7.572616461117958e16,
-        -1.326257285320556e19
-    );
-
-    PMACC_CONST_VECTOR( float, 14, b1Float,
-        -0.1025390625,
-         0.2775764465332031,
-        -1.993531733751297,
-         2.724882731126854e1,
-        -6.038440767050702e2,
-         1.971837591223663e4,
-        -8.902978767070678e5,
-         5.310411010968522e7,
-        -4.043620325107754e9,
-         3.827011346598605e11,
-        -4.406481417852278e13,
-         6.065091351222699e15,
-        -9.833883876590679e17,
-         1.855045211579828e20
-    );
-} //namespace bessel
-} //namespace math
-} //namespace algorithms
-} //namespace pmacc
+        namespace bessel
+        {
+            template<typename T_Type, typename T_TableA, typename T_TableB, typename T_TableA1, typename T_TableB1>
+            struct Cbesselj0Base;
+
+            template<typename T_Type>
+            HDINLINE typename J0<pmacc::math::Complex<T_Type>>::result j0(pmacc::math::Complex<T_Type> const& z)
+            {
+                return J0<pmacc::math::Complex<T_Type>>()(z);
+            }
+
+            template<typename T_Type, typename T_TableA, typename T_TableB, typename T_TableA1, typename T_TableB1>
+            struct Cbesselj1Base;
+
+            template<typename T_Type>
+            HDINLINE typename J1<pmacc::math::Complex<T_Type>>::result j1(pmacc::math::Complex<T_Type> const& z)
+            {
+                return J1<pmacc::math::Complex<T_Type>>()(z);
+            }
+
+            PMACC_CONST_VECTOR(
+                double,
+                14,
+                aDouble,
+                -7.03125e-2,
+                0.112152099609375,
+                -0.5725014209747314,
+                6.074042001273483,
+                -1.100171402692467e2,
+                3.038090510922384e3,
+                -1.188384262567832e5,
+                6.252951493434797e6,
+                -4.259392165047669e8,
+                3.646840080706556e10,
+                -3.833534661393944e12,
+                4.854014686852901e14,
+                -7.286857349377656e16,
+                1.279721941975975e19);
+
+            PMACC_CONST_VECTOR(
+                double,
+                14,
+                bDouble,
+                7.32421875e-2,
+                -0.2271080017089844,
+                1.727727502584457,
+                -2.438052969955606e1,
+                5.513358961220206e2,
+                -1.825775547429318e4,
+                8.328593040162893e5,
+                -5.006958953198893e7,
+                3.836255180230433e9,
+                -3.649010818849833e11,
+                4.218971570284096e13,
+                -5.827244631566907e15,
+                9.476288099260110e17,
+                -1.792162323051699e20);
+
+            PMACC_CONST_VECTOR(
+                double,
+                14,
+                a1Double,
+                0.1171875,
+                -0.1441955566406250,
+                0.6765925884246826,
+                -6.883914268109947,
+                1.215978918765359e2,
+                -3.302272294480852e3,
+                1.276412726461746e5,
+                -6.656367718817688e6,
+                4.502786003050393e8,
+                -3.833857520742790e10,
+                4.011838599133198e12,
+                -5.060568503314727e14,
+                7.572616461117958e16,
+                -1.326257285320556e19);
+
+            PMACC_CONST_VECTOR(
+                double,
+                14,
+                b1Double,
+                -0.1025390625,
+                0.2775764465332031,
+                -1.993531733751297,
+                2.724882731126854e1,
+                -6.038440767050702e2,
+                1.971837591223663e4,
+                -8.902978767070678e5,
+                5.310411010968522e7,
+                -4.043620325107754e9,
+                3.827011346598605e11,
+                -4.406481417852278e13,
+                6.065091351222699e15,
+                -9.833883876590679e17,
+                1.855045211579828e20);
+
+            PMACC_CONST_VECTOR(
+                float,
+                14,
+                aFloat,
+                -7.03125e-2,
+                0.112152099609375,
+                -0.5725014209747314,
+                6.074042001273483,
+                -1.100171402692467e2,
+                3.038090510922384e3,
+                -1.188384262567832e5,
+                6.252951493434797e6,
+                -4.259392165047669e8,
+                3.646840080706556e10,
+                -3.833534661393944e12,
+                4.854014686852901e14,
+                -7.286857349377656e16,
+                1.279721941975975e19);
+
+            PMACC_CONST_VECTOR(
+                float,
+                14,
+                bFloat,
+                7.32421875e-2,
+                -0.2271080017089844,
+                1.727727502584457,
+                -2.438052969955606e1,
+                5.513358961220206e2,
+                -1.825775547429318e4,
+                8.328593040162893e5,
+                -5.006958953198893e7,
+                3.836255180230433e9,
+                -3.649010818849833e11,
+                4.218971570284096e13,
+                -5.827244631566907e15,
+                9.476288099260110e17,
+                -1.792162323051699e20);
+
+            PMACC_CONST_VECTOR(
+                float,
+                14,
+                a1Float,
+                0.1171875,
+                -0.1441955566406250,
+                0.6765925884246826,
+                -6.883914268109947,
+                1.215978918765359e2,
+                -3.302272294480852e3,
+                1.276412726461746e5,
+                -6.656367718817688e6,
+                4.502786003050393e8,
+                -3.833857520742790e10,
+                4.011838599133198e12,
+                -5.060568503314727e14,
+                7.572616461117958e16,
+                -1.326257285320556e19);
+
+            PMACC_CONST_VECTOR(
+                float,
+                14,
+                b1Float,
+                -0.1025390625,
+                0.2775764465332031,
+                -1.993531733751297,
+                2.724882731126854e1,
+                -6.038440767050702e2,
+                1.971837591223663e4,
+                -8.902978767070678e5,
+                5.310411010968522e7,
+                -4.043620325107754e9,
+                3.827011346598605e11,
+                -4.406481417852278e13,
+                6.065091351222699e15,
+                -9.833883876590679e17,
+                1.855045211579828e20);
+        } // namespace bessel
+    } // namespace math
+} // namespace pmacc
diff --git a/include/pmacc/math/complex/Bessel.tpp b/include/pmacc/math/complex/Bessel.tpp
index 9437a50a25..449c714ec6 100644
--- a/include/pmacc/math/complex/Bessel.tpp
+++ b/include/pmacc/math/complex/Bessel.tpp
@@ -1,4 +1,4 @@
-/* Copyright 2003-2020 Alexander Debus, C. Bond
+/* Copyright 2003-2021 Alexander Debus, C. Bond
  *
  * This file is part of PMacc.
  *
@@ -69,218 +69,207 @@
 
 namespace pmacc
 {
-namespace algorithms
-{
-namespace math
-{
-namespace bessel
-{
-    namespace pmMath = pmacc::algorithms::math;
-
-    template<
-        typename T_Type,
-        typename T_TableA,
-        typename T_TableB,
-        typename T_TableA1,
-        typename T_TableB1
-    >
-    struct Cbesselj0Base
+    namespace math
     {
-        using Result = pmacc::math::Complex< T_Type >;
-        using complex_T = pmacc::math::Complex< T_Type >;
-        using float_T = T_Type;
-
-        HDINLINE Result operator( )( complex_T const & z )
+        namespace bessel
         {
-            T_TableA a;
-            T_TableB b;
-            T_TableA1 a1;
-            T_TableB1 b1;
-            Result cj0;
-            /* The target rel. accuracy goal eps is chosen according to the original implementation
-             * of C. Bond, where for double-precision the accuracy goal is 1.0e-15. Here the accuracy
-             * goal value is the same 4.5 * DBL_EPSILON = 1.0e-15 for double-precision, but is similarly
-             * defined for float-precision.
-             */
-            float_T const eps = float_T( 4.5 ) * std::numeric_limits< float_T >::epsilon( );
-
-            complex_T const cii = complex_T( 0, 1 );
-            complex_T const cone = complex_T( 1, 0 );
-            complex_T const czero = complex_T( 0, 0 );
-
-            float_T const a0 = pmMath::abs( z );
-            complex_T const z2 = z * z;
-            complex_T z1 = z;
-            if( a0 == float_T( 0.0 ) )
-            {
-                cj0 = cone;
-                return cj0;
-            }
-            if( z.get_real() < float_T( 0.0 ) )
-                z1 = float_T( -1.0 ) * z;
-            if( a0 <= float_T( 12.0 ) )
+            template<typename T_Type, typename T_TableA, typename T_TableB, typename T_TableA1, typename T_TableB1>
+            struct Cbesselj0Base
             {
-                cj0 = cone;
-                complex_T cr = cone;
-                for ( uint32_t k = 1u; k <= 40u; k++ )
-                {
-                    cr *= float_T( -0.25 ) * z2 / float_T( k * k );
-                    cj0 += cr;
-                    if( pmMath::abs( cr ) < pmMath::abs( cj0 ) * eps ) break;
-                }
-            }
-            else {
-                uint32_t kz;
-                if( a0 >= float_T( 50.0 ) ) kz = 8u;       // can be changed to 10
-                else if( a0 >= float_T( 35.0 ) ) kz = 10u; //   "      "     "  12
-                else kz = 12u;                             //   "      "     "  14
-                complex_T ct1 = z1 - Pi< float_T >::quarterValue;
-                complex_T cp0 = cone;
-                for ( uint32_t k = 0u; k < kz; k++ )
-                {
-                    cp0 += a[ k ] * pow(
-                        z1,
-                        float_T( -2.0 ) * k - float_T( 2.0 )
-                    );
-                }
-                complex_T cq0 = float_T( -0.125 ) / z1;
-                for ( uint32_t k = 0; k < kz; k++ )
+                using Result = pmacc::math::Complex<T_Type>;
+                using complex_T = pmacc::math::Complex<T_Type>;
+                using float_T = T_Type;
+
+                HDINLINE Result operator()(complex_T const& z)
                 {
-                    cq0 += b[ k ] * pmMath::pow(
-                        z1,
-                        float_T( -2.0 ) * k - float_T( 3.0 )
-                    );
-                }
-                complex_T const cu = pmMath::sqrt( Pi< float_T >::doubleReciprocalValue / z1 );
-                cj0 = cu * ( cp0 * pmMath::cos( ct1 ) - cq0 * pmMath::sin( ct1 ) );
-            }
-            return cj0;
-        }
-    };
+                    T_TableA a;
+                    T_TableB b;
+                    T_TableA1 a1;
+                    T_TableB1 b1;
+                    Result cj0;
+                    /* The target rel. accuracy goal eps is chosen according to the original implementation
+                     * of C. Bond, where for double-precision the accuracy goal is 1.0e-15. Here the accuracy
+                     * goal value is the same 4.5 * DBL_EPSILON = 1.0e-15 for double-precision, but is similarly
+                     * defined for float-precision.
+                     */
+                    float_T const eps = float_T(4.5) * std::numeric_limits<float_T>::epsilon();
 
-    template<
-        typename T_Type,
-        typename T_TableA,
-        typename T_TableB,
-        typename T_TableA1,
-        typename T_TableB1
-    >
-    struct Cbesselj1Base
-    {
-        using Result = pmacc::math::Complex< T_Type >;
-        using complex_T = pmacc::math::Complex< T_Type >;
-        using float_T = T_Type;
+                    complex_T const cii = complex_T(0, 1);
+                    complex_T const cone = complex_T(1, 0);
+                    complex_T const czero = complex_T(0, 0);
 
-        HDINLINE Result operator( )( complex_T const & z )
-        {
-            T_TableA a;
-            T_TableB b;
-            T_TableA1 a1;
-            T_TableB1 b1;
-            Result cj1;
-            /* The target rel. accuracy goal eps is chosen according to the original implementation
-             * of C. Bond, where for double-precision the accuracy goal is 1.0e-15. Here the accuracy
-             * goal value is the same 4.5 * DBL_EPSILON = 1.0e-15 for double-precision, but is similarly
-             * defined for float-precision.
-             */
-            float_T const eps = float_T( 4.5 ) * std::numeric_limits< float_T >::epsilon( );
-
-            complex_T const cii = complex_T( 0, 1 );
-            complex_T const cone = complex_T( 1, 0 );
-            complex_T const czero = complex_T( 0, 0 );
+                    float_T const a0 = cupla::math::abs(z);
+                    complex_T const z2 = z * z;
+                    complex_T z1 = z;
+                    if(a0 == float_T(0.0))
+                    {
+                        cj0 = cone;
+                        return cj0;
+                    }
+                    if(z.get_real() < float_T(0.0))
+                        z1 = float_T(-1.0) * z;
+                    if(a0 <= float_T(12.0))
+                    {
+                        cj0 = cone;
+                        complex_T cr = cone;
+                        for(uint32_t k = 1u; k <= 40u; k++)
+                        {
+                            cr *= float_T(-0.25) * z2 / float_T(k * k);
+                            cj0 += cr;
+                            if(cupla::math::abs(cr) < cupla::math::abs(cj0) * eps)
+                                break;
+                        }
+                    }
+                    else
+                    {
+                        uint32_t kz;
+                        if(a0 >= float_T(50.0))
+                            kz = 8u; // can be changed to 10
+                        else if(a0 >= float_T(35.0))
+                            kz = 10u; //   "      "     "  12
+                        else
+                            kz = 12u; //   "      "     "  14
+                        complex_T ct1 = z1 - Pi<float_T>::quarterValue;
+                        complex_T cp0 = cone;
+                        for(uint32_t k = 0u; k < kz; k++)
+                        {
+                            cp0 += a[k] * pow(z1, float_T(-2.0) * k - float_T(2.0));
+                        }
+                        complex_T cq0 = float_T(-0.125) / z1;
+                        for(uint32_t k = 0; k < kz; k++)
+                        {
+                            cq0 += b[k] * cupla::pow(z1, float_T(-2.0) * k - float_T(3.0));
+                        }
+                        complex_T const cu = cupla::math::sqrt(Pi<float_T>::doubleReciprocalValue / z1);
+                        cj0 = cu * (cp0 * cupla::math::cos(ct1) - cq0 * cupla::math::sin(ct1));
+                    }
+                    return cj0;
+                }
+            };
 
-            float_T const a0 = pmMath::abs( z );
-            complex_T const z2 = z * z;
-            complex_T z1 = z;
-            if( a0 == float_T( 0.0 ) )
+            template<typename T_Type, typename T_TableA, typename T_TableB, typename T_TableA1, typename T_TableB1>
+            struct Cbesselj1Base
             {
-                cj1 = czero;
-                return cj1;
-            }
-            if( z.get_real() < float_T( 0.0 ) )
-                z1 = float_T( -1.0 ) * z;
-            if( a0 <= float_T( 12.0 ) )
-            {
-                cj1 = cone;
-                complex_T cr = cone;
-                for ( uint32_t k = 1u; k <= 40u; k++ )
-                {
-                    cr *= float_T( -0.25 ) * z2 / ( k * ( k + float_T( 1.0 ) ) );
-                    cj1 += cr;
-                    if ( pmMath::abs( cr ) < pmMath::abs( cj1 ) * eps ) break;
-                }
-                cj1 *= float_T( 0.5 ) * z1;
-            }
-            else {
-                uint32_t kz;
-                if( a0 >= float_T( 50.0 ) ) kz = 8u;        // can be changed to 10
-                else if ( a0 >= float_T( 35.0 ) ) kz = 10u; //   "      "     "  12
-                else kz = 12u;                              //   "      "     "  14
-                complex_T const cu = pmMath::sqrt( Pi< float_T >::doubleReciprocalValue / z1 );
-                complex_T const ct2 = z1 - float_T( 0.75 ) * Pi< float_T >::value;
-                complex_T cp1 = cone;
-                for ( uint32_t k = 0u; k < kz; k++ )
-                {
-                    cp1 += a1[ k ] * pmMath::pow(
-                        z1,
-                        float_T( -2.0 ) * k - float_T( 2.0 )
-                    );
-                }
-                complex_T cq1 = float_T( 0.375 ) / z1;
-                for ( uint32_t k = 0u; k < kz; k++ )
+                using Result = pmacc::math::Complex<T_Type>;
+                using complex_T = pmacc::math::Complex<T_Type>;
+                using float_T = T_Type;
+
+                HDINLINE Result operator()(complex_T const& z)
                 {
-                    cq1 += b1[ k ] * pmMath::pow(
-                        z1,
-                        float_T( -2.0 ) * k - float_T( 3.0 )
-                    );
+                    T_TableA a;
+                    T_TableB b;
+                    T_TableA1 a1;
+                    T_TableB1 b1;
+                    Result cj1;
+                    /* The target rel. accuracy goal eps is chosen according to the original implementation
+                     * of C. Bond, where for double-precision the accuracy goal is 1.0e-15. Here the accuracy
+                     * goal value is the same 4.5 * DBL_EPSILON = 1.0e-15 for double-precision, but is similarly
+                     * defined for float-precision.
+                     */
+                    float_T const eps = float_T(4.5) * std::numeric_limits<float_T>::epsilon();
+
+                    complex_T const cii = complex_T(0, 1);
+                    complex_T const cone = complex_T(1, 0);
+                    complex_T const czero = complex_T(0, 0);
+
+                    float_T const a0 = cupla::math::abs(z);
+                    complex_T const z2 = z * z;
+                    complex_T z1 = z;
+                    if(a0 == float_T(0.0))
+                    {
+                        cj1 = czero;
+                        return cj1;
+                    }
+                    if(z.get_real() < float_T(0.0))
+                        z1 = float_T(-1.0) * z;
+                    if(a0 <= float_T(12.0))
+                    {
+                        cj1 = cone;
+                        complex_T cr = cone;
+                        for(uint32_t k = 1u; k <= 40u; k++)
+                        {
+                            cr *= float_T(-0.25) * z2 / (k * (k + float_T(1.0)));
+                            cj1 += cr;
+                            if(cupla::math::abs(cr) < cupla::math::abs(cj1) * eps)
+                                break;
+                        }
+                        cj1 *= float_T(0.5) * z1;
+                    }
+                    else
+                    {
+                        uint32_t kz;
+                        if(a0 >= float_T(50.0))
+                            kz = 8u; // can be changed to 10
+                        else if(a0 >= float_T(35.0))
+                            kz = 10u; //   "      "     "  12
+                        else
+                            kz = 12u; //   "      "     "  14
+                        complex_T const cu = cupla::math::sqrt(Pi<float_T>::doubleReciprocalValue / z1);
+                        complex_T const ct2 = z1 - float_T(0.75) * Pi<float_T>::value;
+                        complex_T cp1 = cone;
+                        for(uint32_t k = 0u; k < kz; k++)
+                        {
+                            cp1 += a1[k] * cupla::pow(z1, float_T(-2.0) * k - float_T(2.0));
+                        }
+                        complex_T cq1 = float_T(0.375) / z1;
+                        for(uint32_t k = 0u; k < kz; k++)
+                        {
+                            cq1 += b1[k] * cupla::pow(z1, float_T(-2.0) * k - float_T(3.0));
+                        }
+                        cj1 = cu * (cp1 * cupla::math::cos(ct2) - cq1 * cupla::math::sin(ct2));
+                    }
+                    if(z.get_real() < float_T(0.0))
+                    {
+                        cj1 = float_T(-1.0) * cj1;
+                    }
+                    return cj1;
                 }
-                cj1 = cu * ( cp1 * pmMath::cos( ct2 ) - cq1 * pmMath::sin( ct2 ) );
-            }
-            if( z.get_real( ) < float_T( 0.0 ) )
-            {
-                cj1 = float_T( -1.0 ) * cj1;
-            }
-            return cj1;
-        }
-    };
+            };
 
-    template< >
-    struct J0< pmacc::math::Complex< double > > : public Cbesselj0Base<
-        double,
-        pmacc::algorithms::math::bessel::aDouble_t,
-        pmacc::algorithms::math::bessel::bDouble_t,
-        pmacc::algorithms::math::bessel::a1Double_t,
-        pmacc::algorithms::math::bessel::b1Double_t
-    >{ };
+            template<>
+            struct J0<pmacc::math::Complex<double>>
+                : public Cbesselj0Base<
+                      double,
+                      pmacc::math::bessel::aDouble_t,
+                      pmacc::math::bessel::bDouble_t,
+                      pmacc::math::bessel::a1Double_t,
+                      pmacc::math::bessel::b1Double_t>
+            {
+            };
 
-    template< >
-    struct J0< pmacc::math::Complex< float > > : public Cbesselj0Base<
-        float,
-        pmacc::algorithms::math::bessel::aFloat_t,
-        pmacc::algorithms::math::bessel::bFloat_t,
-        pmacc::algorithms::math::bessel::a1Float_t,
-        pmacc::algorithms::math::bessel::b1Float_t
-    >{ };
+            template<>
+            struct J0<pmacc::math::Complex<float>>
+                : public Cbesselj0Base<
+                      float,
+                      pmacc::math::bessel::aFloat_t,
+                      pmacc::math::bessel::bFloat_t,
+                      pmacc::math::bessel::a1Float_t,
+                      pmacc::math::bessel::b1Float_t>
+            {
+            };
 
-    template< >
-    struct J1< pmacc::math::Complex< double > > : public Cbesselj1Base<
-        double,
-        pmacc::algorithms::math::bessel::aDouble_t,
-        pmacc::algorithms::math::bessel::bDouble_t,
-        pmacc::algorithms::math::bessel::a1Double_t,
-        pmacc::algorithms::math::bessel::b1Double_t
-    >{ };
+            template<>
+            struct J1<pmacc::math::Complex<double>>
+                : public Cbesselj1Base<
+                      double,
+                      pmacc::math::bessel::aDouble_t,
+                      pmacc::math::bessel::bDouble_t,
+                      pmacc::math::bessel::a1Double_t,
+                      pmacc::math::bessel::b1Double_t>
+            {
+            };
 
-    template< >
-    struct J1< pmacc::math::Complex< float > > : public Cbesselj1Base<
-        float,
-        pmacc::algorithms::math::bessel::aFloat_t,
-        pmacc::algorithms::math::bessel::bFloat_t,
-        pmacc::algorithms::math::bessel::a1Float_t,
-        pmacc::algorithms::math::bessel::b1Float_t
-    >{ };
+            template<>
+            struct J1<pmacc::math::Complex<float>>
+                : public Cbesselj1Base<
+                      float,
+                      pmacc::math::bessel::aFloat_t,
+                      pmacc::math::bessel::bFloat_t,
+                      pmacc::math::bessel::a1Float_t,
+                      pmacc::math::bessel::b1Float_t>
+            {
+            };
 
-} //namespace bessel
-} //namespace math
-} //namespace algorithms
-} //namespace pmacc
+        } // namespace bessel
+    } // namespace math
+} // namespace pmacc
diff --git a/include/pmacc/math/complex/Complex.hpp b/include/pmacc/math/complex/Complex.hpp
index 0aec12bf56..292de6bbb3 100644
--- a/include/pmacc/math/complex/Complex.hpp
+++ b/include/pmacc/math/complex/Complex.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera, Richard Pausch, Alexander Debus
+/* Copyright 2013-2021 Heiko Burau, Rene Widera, Richard Pausch, Alexander Debus
  *
  * This file is part of PMacc.
  *
@@ -24,199 +24,195 @@
 
 namespace pmacc
 {
-namespace math
-{
-
-/** A complex number class */
-template<typename T_Type>
-struct Complex
-{
-
-public:
-
-    typedef T_Type type;
-
-    // constructor (real, imaginary)
-    HDINLINE Complex(T_Type real, T_Type imaginary = type(0.0) ) : real(real), imaginary(imaginary) { }
-
-    // constructor (Complex<T_OtherType>)
-    template<typename T_OtherType>
-    HDINLINE explicit Complex(const Complex<T_OtherType >& other) :
-                        real( static_cast<T_Type> (other.get_real()) ),
-                        imaginary( static_cast<T_Type> (other.get_imag()) ) { }
-
-    // default constructor ( ! no initialization of data ! )
-    HDINLINE Complex(void) { }
-
-    // Conversion from scalar (assignment)
-    HDINLINE Complex& operator=(const T_Type& other)
-    {
-        real = other;
-        imaginary = type(0.0);
-        return *this;
-    }
-
-    // Assignment operator
-    HDINLINE Complex& operator=(const Complex& other)
-    {
-        real = other.get_real();
-        imaginary = other.get_imag();
-        return *this;
-    }
-
-    // assign addition
-    HDINLINE Complex& operator+=(const Complex& other)
-    {
-        real += other.get_real();
-        imaginary += other.get_imag();
-        return *this;
-    }
-
-    // assign difference
-    HDINLINE Complex& operator-=(const Complex& other)
+    namespace math
     {
-        real -= other.get_real();
-        imaginary -= other.get_imag();
-        return *this;
-    }
-
-    // assign multiplication
-    HDINLINE Complex& operator *=(const Complex& other)
-    {
-        *this = *this * other;
-        return *this;
-    }
-
-    // real part
-    HDINLINE T_Type& get_real()
-    {
-        return real;
-    }
-
-    // real part
-    HDINLINE T_Type get_real(void) const
-    {
-        return real;
-    }
-
-    // imaginary part
-    HDINLINE T_Type& get_imag()
-    {
-        return imaginary;
-    }
-
-    // imaginary part
-    HDINLINE T_Type get_imag(void) const
-    {
-        return imaginary;
-    }
-
-    // complex zero
-    HDINLINE static Complex<T_Type> zero(void)
-    {
-        return Complex<T_Type>( type(0.0) , type(0.0) );
-    }
-
-private:
-    PMACC_ALIGN(real,T_Type); // real part
-    PMACC_ALIGN(imaginary,T_Type); // imaginary part
-
-};
-
-/** Addition operators */
-
-template<typename T_Type>
-HDINLINE Complex<T_Type>
-operator+(const Complex<T_Type>& lhs, const Complex<T_Type>& rhs)
-{
-    return Complex<T_Type>(lhs.get_real() + rhs.get_real(), lhs.get_imag() + rhs.get_imag());
-}
-
-template<typename T_Type>
-HDINLINE Complex<T_Type>
-operator+(const Complex<T_Type>& lhs, const T_Type& rhs)
-{
-    return Complex<T_Type>(lhs.get_real() + rhs, lhs.get_imag());
-}
-
-template<typename T_Type>
-HDINLINE Complex<T_Type>
-operator+(const T_Type& lhs, const Complex<T_Type>& rhs)
-{
-    return Complex<T_Type>(lhs + rhs.get_real(), rhs.get_imag());
-}
-
-/** Subtraction operators */
-
-template<typename T_Type>
-HDINLINE Complex<T_Type>
-operator-(const Complex<T_Type>& lhs, const Complex<T_Type>& rhs)
-{
-    return Complex<T_Type>(lhs.get_real() - rhs.get_real(), lhs.get_imag() - rhs.get_imag());
-}
-
-template<typename T_Type>
-HDINLINE Complex<T_Type>
-operator-(const Complex<T_Type>& lhs, const T_Type& rhs)
-{
-    return Complex<T_Type>(lhs.get_real() - rhs, lhs.get_imag());
-}
-
-template<typename T_Type>
-HDINLINE Complex<T_Type>
-operator-(const T_Type& lhs, const Complex<T_Type>& rhs)
-{
-    return Complex<T_Type>(lhs - rhs.get_real(), -rhs.get_imag());
-}
-
-/** Multiplication operators */
-
-template<typename T_Type>
-HDINLINE Complex<T_Type>
-operator*(const Complex<T_Type>& lhs, const Complex<T_Type>& rhs)
-{
-    return Complex<T_Type>(lhs.get_real() * rhs.get_real() - lhs.get_imag() * rhs.get_imag(),
-                     lhs.get_imag() * rhs.get_real() + lhs.get_real() * rhs.get_imag());
-}
-
-template<typename T_Type>
-HDINLINE Complex<T_Type>
-operator*(const Complex<T_Type>& lhs, const T_Type& rhs)
-{
-    return Complex<T_Type>(lhs.get_real() * rhs, lhs.get_imag() * rhs);
-}
-
-template<typename T_Type>
-HDINLINE Complex<T_Type>
-operator*(const T_Type& lhs, const Complex<T_Type>& rhs)
-{
-    return Complex<T_Type>(lhs * rhs.get_real(), lhs * rhs.get_imag());
-}
-
-/** Division operators */
-
-template<typename T_Type>
-HDINLINE Complex<T_Type>
-operator/(const Complex<T_Type>& lhs, const T_Type& rhs)
-{
-    return Complex<T_Type>(lhs.get_real() / rhs, lhs.get_imag() / rhs);
-}
-
-template<typename T_Type>
-HDINLINE Complex<T_Type>
-operator/(const T_Type& lhs, const Complex<T_Type>& rhs)
-{
-    return Complex<T_Type>(lhs * rhs.get_real()/(rhs.get_real()*rhs.get_real()+rhs.get_imag()*rhs.get_imag()),
-                     -lhs * rhs.get_imag()/( rhs.get_real()*rhs.get_real()+rhs.get_imag()*rhs.get_imag() ));
-}
-
-template<typename T_Type>
-HDINLINE Complex<T_Type>
-operator/(const Complex<T_Type>& lhs, const Complex<T_Type>& rhs)
-{
-    return lhs*Complex<T_Type>(rhs.get_real()/(rhs.get_real()*rhs.get_real()+rhs.get_imag()*rhs.get_imag()),
-                        -rhs.get_imag()/( rhs.get_real()*rhs.get_real()+rhs.get_imag()*rhs.get_imag() ));
-}
-
-} //namespace math
-} //namespace pmacc
+        /** Complex number that stores a real and an imaginary part, both of type T_Type */
+        template<typename T_Type>
+        struct Complex
+        {
+        public:
+            typedef T_Type type;
+
+            // construct from (real, imaginary); imaginary defaults to 0, so a scalar converts implicitly
+            HDINLINE Complex(T_Type real, T_Type imaginary = type(0.0)) : real(real), imaginary(imaginary)
+            {
+            }
+
+            constexpr HDINLINE Complex(const Complex& other) = default; // member-wise copy
+
+            // explicit conversion from a Complex with another component type (static_cast per component)
+            template<typename T_OtherType>
+            HDINLINE explicit Complex(const Complex<T_OtherType>& other)
+                : real(static_cast<T_Type>(other.get_real()))
+                , imaginary(static_cast<T_Type>(other.get_imag()))
+            {
+            }
+
+            // default constructor ( ! real and imaginary are NOT initialized ! )
+            HDINLINE Complex(void)
+            {
+            }
+
+            // assign a scalar: sets the real part and resets the imaginary part to zero
+            HDINLINE Complex& operator=(const T_Type& other)
+            {
+                real = other;
+                imaginary = type(0.0);
+                return *this;
+            }
+
+            // copy assignment: copies both components from other
+            HDINLINE Complex& operator=(const Complex& other)
+            {
+                real = other.get_real();
+                imaginary = other.get_imag();
+                return *this;
+            }
+
+            // in-place addition, component by component
+            HDINLINE Complex& operator+=(const Complex& other)
+            {
+                real += other.get_real();
+                imaginary += other.get_imag();
+                return *this;
+            }
+
+            // in-place subtraction, component by component
+            HDINLINE Complex& operator-=(const Complex& other)
+            {
+                real -= other.get_real();
+                imaginary -= other.get_imag();
+                return *this;
+            }
+
+            // in-place complex multiplication, delegating to the binary operator*
+            HDINLINE Complex& operator*=(const Complex& other)
+            {
+                *this = *this * other;
+                return *this;
+            }
+
+            // writable reference to the real part
+            HDINLINE T_Type& get_real()
+            {
+                return real;
+            }
+
+            // copy of the real part (const access)
+            HDINLINE T_Type get_real(void) const
+            {
+                return real;
+            }
+
+            // writable reference to the imaginary part
+            HDINLINE T_Type& get_imag()
+            {
+                return imaginary;
+            }
+
+            // copy of the imaginary part (const access)
+            HDINLINE T_Type get_imag(void) const
+            {
+                return imaginary;
+            }
+
+            // the complex number 0 + 0i
+            HDINLINE static Complex<T_Type> zero(void)
+            {
+                return Complex<T_Type>(type(0.0), type(0.0));
+            }
+
+        private:
+            PMACC_ALIGN(real, T_Type); // real part
+            PMACC_ALIGN(imaginary, T_Type); // imaginary part
+        };
+
+        /** Addition operators: complex + complex, complex + scalar and scalar + complex */
+
+        template<typename T_Type>
+        HDINLINE Complex<T_Type> operator+(const Complex<T_Type>& lhs, const Complex<T_Type>& rhs)
+        {
+            return Complex<T_Type>(lhs.get_real() + rhs.get_real(), lhs.get_imag() + rhs.get_imag());
+        }
+
+        template<typename T_Type>
+        HDINLINE Complex<T_Type> operator+(const Complex<T_Type>& lhs, const T_Type& rhs)
+        {
+            return Complex<T_Type>(lhs.get_real() + rhs, lhs.get_imag()); // scalar only changes the real part
+        }
+
+        template<typename T_Type>
+        HDINLINE Complex<T_Type> operator+(const T_Type& lhs, const Complex<T_Type>& rhs)
+        {
+            return Complex<T_Type>(lhs + rhs.get_real(), rhs.get_imag()); // scalar only changes the real part
+        }
+
+        /** Subtraction operators: complex - complex, complex - scalar and scalar - complex */
+
+        template<typename T_Type>
+        HDINLINE Complex<T_Type> operator-(const Complex<T_Type>& lhs, const Complex<T_Type>& rhs)
+        {
+            return Complex<T_Type>(lhs.get_real() - rhs.get_real(), lhs.get_imag() - rhs.get_imag());
+        }
+
+        template<typename T_Type>
+        HDINLINE Complex<T_Type> operator-(const Complex<T_Type>& lhs, const T_Type& rhs)
+        {
+            return Complex<T_Type>(lhs.get_real() - rhs, lhs.get_imag()); // scalar only changes the real part
+        }
+
+        template<typename T_Type>
+        HDINLINE Complex<T_Type> operator-(const T_Type& lhs, const Complex<T_Type>& rhs)
+        {
+            return Complex<T_Type>(lhs - rhs.get_real(), -rhs.get_imag()); // imaginary part of rhs is negated
+        }
+
+        /** Multiplication operators: complex * complex, complex * scalar and scalar * complex */
+
+        template<typename T_Type>
+        HDINLINE Complex<T_Type> operator*(const Complex<T_Type>& lhs, const Complex<T_Type>& rhs)
+        { // (a + bi) * (c + di) = (ac - bd) + (bc + ad)i
+            return Complex<T_Type>(
+                lhs.get_real() * rhs.get_real() - lhs.get_imag() * rhs.get_imag(),
+                lhs.get_imag() * rhs.get_real() + lhs.get_real() * rhs.get_imag());
+        }
+
+        template<typename T_Type>
+        HDINLINE Complex<T_Type> operator*(const Complex<T_Type>& lhs, const T_Type& rhs)
+        {
+            return Complex<T_Type>(lhs.get_real() * rhs, lhs.get_imag() * rhs); // scales both components
+        }
+
+        template<typename T_Type>
+        HDINLINE Complex<T_Type> operator*(const T_Type& lhs, const Complex<T_Type>& rhs)
+        {
+            return Complex<T_Type>(lhs * rhs.get_real(), lhs * rhs.get_imag()); // scales both components
+        }
+
+        /** Division operators: complex / scalar, scalar / complex and complex / complex (no zero-divisor check) */
+
+        template<typename T_Type>
+        HDINLINE Complex<T_Type> operator/(const Complex<T_Type>& lhs, const T_Type& rhs)
+        {
+            return Complex<T_Type>(lhs.get_real() / rhs, lhs.get_imag() / rhs); // divides both components
+        }
+
+        template<typename T_Type>
+        HDINLINE Complex<T_Type> operator/(const T_Type& lhs, const Complex<T_Type>& rhs)
+        { // lhs / (c + di) = lhs * (c - di) / (c^2 + d^2)
+            return Complex<T_Type>(
+                lhs * rhs.get_real() / (rhs.get_real() * rhs.get_real() + rhs.get_imag() * rhs.get_imag()),
+                -lhs * rhs.get_imag() / (rhs.get_real() * rhs.get_real() + rhs.get_imag() * rhs.get_imag()));
+        }
+
+        template<typename T_Type>
+        HDINLINE Complex<T_Type> operator/(const Complex<T_Type>& lhs, const Complex<T_Type>& rhs)
+        { // lhs multiplied by the reciprocal of rhs: (c - di) / (c^2 + d^2)
+            return lhs
+                * Complex<T_Type>(
+                       rhs.get_real() / (rhs.get_real() * rhs.get_real() + rhs.get_imag() * rhs.get_imag()),
+                       -rhs.get_imag() / (rhs.get_real() * rhs.get_real() + rhs.get_imag() * rhs.get_imag()));
+        }
+
+    } // namespace math
+} // namespace pmacc
diff --git a/include/pmacc/math/complex/Complex.tpp b/include/pmacc/math/complex/Complex.tpp
index 5034c116d2..47d781d7ae 100644
--- a/include/pmacc/math/complex/Complex.tpp
+++ b/include/pmacc/math/complex/Complex.tpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera, Richard Pausch,
+/* Copyright 2013-2021 Heiko Burau, Rene Widera, Richard Pausch,
  *                     Alexander Debus, Benjamin Worpitz, Finn-Ole Carstens
  *
  * This file is part of PMacc.
@@ -33,254 +33,261 @@
 
 namespace pmacc
 {
-namespace algorithms
-{
-namespace math
-{
-
-namespace pmMath = pmacc::algorithms::math;
-
-/*  Set primary template and subsequent specialization for returning a complex number
-    by using Euler's formula. */
-
-template<typename T_Type>
-struct Euler;
-
-template<typename T_Type>
-HDINLINE typename Euler< T_Type >::result euler(const T_Type& magnitude, const T_Type& phase)
-{
-    return Euler< T_Type > ()(magnitude, phase);
-}
-
-template<typename T_Type>
-HDINLINE typename Euler< T_Type >::result euler(const T_Type& magnitude, const T_Type& sinValue,
-                                                const T_Type& cosValue)
-{
-    return Euler< T_Type > ()(magnitude, sinValue, cosValue);
-}
-
-template<typename T_Type>
-struct Euler
-{
-    typedef typename ::pmacc::math::Complex<T_Type> result;
-
-    HDINLINE result operator( )(const T_Type &magnitude, const T_Type &phase)
-    {
-        return result(magnitude * pmMath::cos(phase),magnitude * pmMath::sin(phase));
-    }
-
-    HDINLINE result operator( )(const T_Type &magnitude,
-                                const T_Type &sinValue, const T_Type &cosValue)
+    namespace math
     {
-        return result(magnitude * cosValue, magnitude * sinValue);
-    }
-};
+        /*  Set primary template and subsequent specialization for returning a complex number
+            by using Euler's formula. */
 
-/* Specialize sqrt() for complex numbers. */
+        template<typename T_Type>
+        struct Euler;
 
-template<typename T_Type>
-struct Sqrt< ::pmacc::math::Complex<T_Type> >
-{
-    typedef typename ::pmacc::math::Complex<T_Type> result;
-    typedef T_Type type;
-
-    HDINLINE result operator( )(const ::pmacc::math::Complex<T_Type>& other)
-    {
-        if (other.get_real()<=type(0.0) && other.get_imag()==type(0.0) ) {
-            return ::pmacc::math::Complex<T_Type>(type(0.0), pmMath::sqrt( -other.get_real() ) );
-        }
-        else {
-            return pmMath::sqrt( pmMath::abs(other) )*(other+pmMath::abs(other))
-                /pmMath::abs(other+pmMath::abs(other));
-        }
-    }
-};
-
-/* Specialize exp() for complex numbers. */
-
-template<typename T_Type>
-struct Exp< ::pmacc::math::Complex<T_Type> >
-{
-    typedef typename ::pmacc::math::Complex<T_Type> result;
-    typedef T_Type type;
-
-    HDINLINE result operator( )(const ::pmacc::math::Complex<T_Type>& other)
-    {
-        return pmMath::euler(type(1.0),other.get_imag())*pmMath::exp(other.get_real());
-    }
-};
-
-/*  Set primary template and subsequent specialization of arg() for retrieving
- *  the phase of a complex number (Note: Branchcut running from -infinity to 0).
- */
-template<typename T_Type>
-struct Arg;
-
-template<typename T_Type>
-HDINLINE typename Arg< T_Type >::result arg(const T_Type& val)
-{
-    return Arg< T_Type > ()(val);
-}
-
-template<typename T_Type>
-struct Arg< ::pmacc::math::Complex<T_Type> >
-{
-    typedef typename ::pmacc::math::Complex<T_Type>::type result;
-    typedef T_Type type;
-
-    HDINLINE result operator( )(const ::pmacc::math::Complex<T_Type>& other)
-    {
-        if ( other.get_real()==type(0.0) && other.get_imag()==type(0.0) )
-            return type(0.0);
-        else if ( other.get_real()==type(0.0) && other.get_imag()>type(0.0) )
-            return Pi< type >::halfValue;
-        else if ( other.get_real()==type(0.0) && other.get_imag()<type(0.0) )
-            return -Pi< type >::halfValue;
-        else if ( other.get_real()<type(0.0) && other.get_imag()==type(0.0) )
-            return Pi< type >::value;
-        else
-            return pmMath::atan2(other.get_imag(),other.get_real());
-    }
-};
-
-/*  Specialize pow() for complex numbers. */
-template<typename T_Type>
-struct Pow< ::pmacc::math::Complex<T_Type>, T_Type >
-{
-    typedef typename ::pmacc::math::Complex<T_Type> result;
-    typedef T_Type type;
-
-    HDINLINE result operator( )(const ::pmacc::math::Complex<T_Type>& other,
-                                const T_Type& exponent)
-    {
-        return pmMath::pow( pmMath::abs(other),exponent )
-                *pmMath::exp( ::pmacc::math::Complex<T_Type>(type(0.),type(1.) )
-                *pmMath::arg(other)*exponent );
-    }
-};
-
-/*  Specialize abs() for complex numbers. */
-template<typename T_Type>
-struct Abs< ::pmacc::math::Complex<T_Type> >
-{
-    typedef typename ::pmacc::math::Complex<T_Type>::type result;
-
-    HDINLINE result operator( )(const ::pmacc::math::Complex<T_Type>& other)
-    {
-        return pmMath::sqrt( pmMath::abs2(other.get_real()) + pmMath::abs2(other.get_imag()) );
-    }
-};
-
-/*  Specialize abs2() for complex numbers. */
-template<typename T_Type>
-struct Abs2< ::pmacc::math::Complex<T_Type> >
-{
-    typedef typename ::pmacc::math::Complex<T_Type>::type result;
-
-    HDINLINE result operator( )(const ::pmacc::math::Complex<T_Type>& other)
-    {
-        return pmMath::abs2(other.get_real()) + pmMath::abs2(other.get_imag());
-    }
-};
-
-    /*  Specialize log() for complex numbers. */
-    template< typename T_Type >
-    struct Log< ::pmacc::math::Complex< T_Type > >
-    {
-        using type = T_Type;
-        using result = typename ::pmacc::math::Complex< type >::type;
-
-        HDINLINE result operator( )( ::pmacc::math::Complex< T_Type > const & other )
+        template<typename T_Type>
+        HDINLINE typename Euler<T_Type>::result euler(const T_Type& magnitude, const T_Type& phase)
         {
-            return pmMath::log( pmMath::abs( other ) ) +
-                ::pmacc::math::Complex< T_Type >(
-                    type( 0. ),
-                    type( 1. )
-                ) * pmMath::arg( other );
+            return Euler<T_Type>()(magnitude, phase);
         }
-    };
-
-    /*  Specialize sin( ) for complex numbers. */
-    template< typename T_Type >
-    struct Sin< ::pmacc::math::Complex< T_Type > >
-    {
-        using result = typename ::pmacc::math::Complex< T_Type >;
-        using type = T_Type;
 
-        HDINLINE result operator( )( const ::pmacc::math::Complex< T_Type > & other )
+        template<typename T_Type>
+        HDINLINE typename Euler<T_Type>::result euler(
+            const T_Type& magnitude,
+            const T_Type& sinValue,
+            const T_Type& cosValue)
         {
-            return ( pmMath::exp( ::pmacc::math::Complex< T_Type >( type( 0. ), type( 1. ) ) * other ) -
-                   pmMath::exp( ::pmacc::math::Complex< T_Type >( type( 0. ), type( -1. ) ) * other ) ) /
-                   ::pmacc::math::Complex< T_Type >( type( 0. ), type( 2. ) );
+            return Euler<T_Type>()(magnitude, sinValue, cosValue);
         }
-    };
-
-    /*  Specialize cos( ) for complex numbers. */
-    template< typename T_Type >
-    struct Cos< ::pmacc::math::Complex< T_Type > >
-    {
-        using result = typename ::pmacc::math::Complex< T_Type >;
-        using type = T_Type;
 
-        HDINLINE result operator( )( const ::pmacc::math::Complex< T_Type >& other )
+        template<typename T_Type>
+        struct Euler // functor that builds a complex number from a magnitude and a phase
+        {
+            typedef typename ::pmacc::math::Complex<T_Type> result;
+
+            HDINLINE result operator()(const T_Type& magnitude, const T_Type& phase)
+            { // magnitude * (cos(phase) + i * sin(phase))
+                return result(magnitude * cupla::math::cos(phase), magnitude * cupla::math::sin(phase));
+            }
+
+            HDINLINE result operator()(const T_Type& magnitude, const T_Type& sinValue, const T_Type& cosValue)
+            { // same result from caller-supplied sin/cos; note that sinValue is passed BEFORE cosValue
+                return result(magnitude * cosValue, magnitude * sinValue);
+            }
+        };
+
+        /*  Set primary template and subsequent specialization of arg() for retrieving
+         *  the phase of a complex number (Note: Branchcut running from -infinity to 0).
+         */
+        template<typename T_Type>
+        struct Arg;
+
+        template<typename T_Type>
+        HDINLINE typename Arg<T_Type>::result arg(const T_Type& val)
         {
-            return ( pmMath::exp( ::pmacc::math::Complex< T_Type >( type( 0. ), type( 1. ) ) * other ) +
-                   pmMath::exp( ::pmacc::math::Complex< T_Type >( type( 0. ), type( -1. ) ) * other ) ) /
-                   type( 2.0 );
+            return Arg<T_Type>()(val);
         }
-    };
-
-} //namespace math
-} //namespace algorithms
-} //namespace pmacc
-
-namespace pmacc
-{
-namespace algorithms
-{
-namespace precisionCast
-{
 
-/*  Specialize precisionCast-operators for complex numbers. */
+        template<typename T_Type>
+        struct Arg<::pmacc::math::Complex<T_Type>>
+        {
+            typedef typename ::pmacc::math::Complex<T_Type>::type result; // phase has the component type
+            typedef T_Type type;
+
+            HDINLINE result operator()(const ::pmacc::math::Complex<T_Type>& other)
+            { // the axis cases are answered explicitly, only the remaining inputs reach atan2
+                if(other.get_real() == type(0.0) && other.get_imag() == type(0.0))
+                    return type(0.0); // origin: the phase is reported as 0
+                else if(other.get_real() == type(0.0) && other.get_imag() > type(0.0))
+                    return Pi<type>::halfValue; // positive imaginary axis
+                else if(other.get_real() == type(0.0) && other.get_imag() < type(0.0))
+                    return -Pi<type>::halfValue; // negative imaginary axis
+                else if(other.get_real() < type(0.0) && other.get_imag() == type(0.0))
+                    return Pi<type>::value; // negative real axis
+                else
+                    return cupla::math::atan2(other.get_imag(), other.get_real()); // all other inputs
+            }
+        };
+
+        /** Specialize abs2() for complex numbers.
+         *
+         * Note: Abs is specialized in alpaka::math below
+         */
+        template<typename T_Type>
+        struct Abs2<::pmacc::math::Complex<T_Type>>
+        {
+            typedef typename ::pmacc::math::Complex<T_Type>::type result;
 
-template<typename T_CastToType>
-struct TypeCast<T_CastToType, ::pmacc::math::Complex<T_CastToType> >
-{
-    typedef const ::pmacc::math::Complex<T_CastToType>& result;
+            HDINLINE result operator()(const ::pmacc::math::Complex<T_Type>& other)
+            {
+                return pmacc::math::abs2(other.get_real()) + pmacc::math::abs2(other.get_imag());
+            }
+        };
 
-    HDINLINE result operator( )(const ::pmacc::math::Complex<T_CastToType>& complexNumber ) const
-    {
-        return complexNumber;
-    }
-};
+    } // namespace math
+} // namespace pmacc
 
-template<typename T_CastToType, typename T_OldType>
-struct TypeCast<T_CastToType, ::pmacc::math::Complex<T_OldType> >
+namespace alpaka
 {
-    typedef ::pmacc::math::Complex<T_CastToType> result;
-
-    HDINLINE result operator( )(const ::pmacc::math::Complex<T_OldType>& complexNumber ) const
+    namespace math
     {
-        return result( complexNumber );
-    }
-};
+        namespace traits
+        {
+            template<typename T_Ctx, typename T_Type>
+            struct Pow<T_Ctx, ::pmacc::math::Complex<T_Type>, T_Type, void> // complex base, T_Type exponent
+            {
+                ALPAKA_FN_HOST_ACC static auto pow(
+                    T_Ctx const& mathConcept, // not used by this implementation
+                    ::pmacc::math::Complex<T_Type> const& other,
+                    T_Type const& exponent) -> ::pmacc::math::Complex<T_Type>
+                { // |z|^e * exp(i * arg(z) * e); NOTE(review): cupla::pow, not cupla::math::pow - confirm
+                    return cupla::pow(cupla::math::abs(other), exponent)
+                        * cupla::math::exp(
+                               ::pmacc::math::Complex<T_Type>(T_Type(0.), T_Type(1.)) * pmacc::math::arg(other)
+                               * exponent);
+                }
+            };
+
+            template<typename T_Ctx, typename T_Type>
+            struct Sqrt<T_Ctx, ::pmacc::math::Complex<T_Type>, void> // square root of a complex argument
+            {
+                ALPAKA_FN_HOST_ACC static auto sqrt(
+                    T_Ctx const& mathConcept,
+                    ::pmacc::math::Complex<T_Type> const& other) -> ::pmacc::math::Complex<T_Type>
+                {
+                    using type = T_Type;
+                    if(other.get_real() <= type(0.0) && other.get_imag() == type(0.0))
+                    { // real part <= 0 with zero imaginary part: the result is i * sqrt(-real)
+                        return ::pmacc::math::Complex<T_Type>(
+                            type(0.0),
+                            alpaka::math::sqrt(mathConcept, -other.get_real()));
+                    }
+                    else
+                    { // every other input: sqrt(|z|) * (z + |z|) / |z + |z||
+                        return alpaka::math::sqrt(mathConcept, cupla::math::abs(other))
+                            * (other + cupla::math::abs(other)) / cupla::math::abs(other + cupla::math::abs(other));
+                    }
+                }
+            };
+
+            template<typename T_Ctx, typename T_Type>
+            struct Exp<T_Ctx, ::pmacc::math::Complex<T_Type>, void> // exponential of a complex argument
+            {
+                ALPAKA_FN_HOST_ACC static auto exp(
+                    T_Ctx const& mathConcept,
+                    ::pmacc::math::Complex<T_Type> const& other) -> ::pmacc::math::Complex<T_Type>
+                { // euler(1, imag) scaled by the real-valued exp(real)
+                    using type = T_Type;
+                    return pmacc::math::euler(type(1.0), other.get_imag())
+                        * alpaka::math::exp(mathConcept, other.get_real());
+                }
+            };
+
+            template<typename T_Ctx, typename T_Type>
+            struct Abs<T_Ctx, ::pmacc::math::Complex<T_Type>, void> // returns T_Type, not a complex number
+            {
+                ALPAKA_FN_HOST_ACC static auto abs(
+                    T_Ctx const& mathConcept,
+                    ::pmacc::math::Complex<T_Type> const& other) -> T_Type
+                {
+                    /* |z| = sqrt(abs2(z)). alpaka::math::sqrt( mathConcept, ... ) cannot be used
+                     * here, as the mathConcept would not match, so the call goes through cupla instead
+                     */
+                    return cupla::math::sqrt(pmacc::math::abs2(other));
+                }
+            };
+
+            template<typename T_Ctx, typename T_Type>
+            struct Log<T_Ctx, ::pmacc::math::Complex<T_Type>, void> // logarithm of a complex argument
+            {
+                ALPAKA_FN_HOST_ACC static auto log(
+                    T_Ctx const& mathConcept,
+                    ::pmacc::math::Complex<T_Type> const& other) -> ::pmacc::math::Complex<T_Type>
+                { // log(|z|) + i * arg(z)
+                    using type = T_Type;
+                    return alpaka::math::log(mathConcept, cupla::math::abs(other))
+                        + ::pmacc::math::Complex<T_Type>(type(0.), type(1.)) * pmacc::math::arg(other);
+                }
+            };
+
+            template<typename T_Ctx, typename T_Type>
+            struct Cos<T_Ctx, ::pmacc::math::Complex<T_Type>, void> // cosine of a complex argument
+            {
+                ALPAKA_FN_HOST_ACC static auto cos(
+                    T_Ctx const& mathConcept,
+                    ::pmacc::math::Complex<T_Type> const& other) -> ::pmacc::math::Complex<T_Type>
+                { // (exp(i * z) + exp(-i * z)) / 2
+                    using type = T_Type;
+                    return (alpaka::math::exp(mathConcept, ::pmacc::math::Complex<T_Type>(type(0.), type(1.)) * other)
+                            + alpaka::math::exp(
+                                mathConcept,
+                                ::pmacc::math::Complex<T_Type>(type(0.), type(-1.)) * other))
+                        / type(2.0);
+                }
+            };
+
+            template<typename T_Ctx, typename T_Type>
+            struct Sin<T_Ctx, ::pmacc::math::Complex<T_Type>, void> // sine of a complex argument
+            {
+                ALPAKA_FN_HOST_ACC static auto sin(
+                    T_Ctx const& mathConcept,
+                    ::pmacc::math::Complex<T_Type> const& other) -> ::pmacc::math::Complex<T_Type>
+                { // (exp(i * z) - exp(-i * z)) / (2i)
+                    using type = T_Type;
+
+                    return (alpaka::math::exp(mathConcept, ::pmacc::math::Complex<T_Type>(type(0.), type(1.)) * other)
+                            - alpaka::math::exp(
+                                mathConcept,
+                                ::pmacc::math::Complex<T_Type>(type(0.), type(-1.)) * other))
+                        / ::pmacc::math::Complex<T_Type>(type(0.), type(2.));
+                }
+            };
+        } // namespace traits
+    } // namespace math
+} // namespace alpaka
 
-} //namespace typecast
-} //namespace algorithms
 
-namespace mpi
+namespace pmacc
 {
-
-    using complex_X = pmacc::math::Complex< picongpu::float_X >;
-
-    // Specialize complex type grid buffer for MPI
-    template<>
-    MPI_StructAsArray getMPI_StructAsArray< pmacc::math::Complex<picongpu::float_X> >()
+    namespace algorithms
     {
-        MPI_StructAsArray result = getMPI_StructAsArray< complex_X::type > ();
-        result.sizeMultiplier *= uint32_t(sizeof(complex_X) / sizeof(typename complex_X::type));
-        return result;
-    };
+        namespace precisionCast
+        {
+            /*  Specialize precisionCast-operators for complex numbers. */
+
+            template<typename T_CastToType>
+            struct TypeCast<T_CastToType, ::pmacc::math::Complex<T_CastToType>> // component type already matches
+            {
+                typedef const ::pmacc::math::Complex<T_CastToType>& result;
+
+                HDINLINE result operator()(const ::pmacc::math::Complex<T_CastToType>& complexNumber) const
+                { // nothing to convert: the argument is handed back by const reference
+                    return complexNumber;
+                }
+            };
+
+            template<typename T_CastToType, typename T_OldType>
+            struct TypeCast<T_CastToType, ::pmacc::math::Complex<T_OldType>> // component type is converted
+            {
+                typedef ::pmacc::math::Complex<T_CastToType> result;
+
+                HDINLINE result operator()(const ::pmacc::math::Complex<T_OldType>& complexNumber) const
+                { // returns a new Complex<T_CastToType> by value, constructed from the argument
+                    return result(complexNumber);
+                }
+            };
+
+        } // namespace precisionCast
+    } // namespace algorithms
+
+    namespace mpi
+    {
+        using complex_X = pmacc::math::Complex<picongpu::float_X>;
+
+        // MPI description of a complex float_X value: the component description, scaled by sizeof ratio
+        template<> // declared inline: a full specialization defined in a header must not violate the ODR
+        inline MPI_StructAsArray getMPI_StructAsArray<pmacc::math::Complex<picongpu::float_X>>()
+        {
+            MPI_StructAsArray result = getMPI_StructAsArray<complex_X::type>();
+            result.sizeMultiplier *= uint32_t(sizeof(complex_X) / sizeof(typename complex_X::type));
+            return result;
+        }
 
-} //namespace mpi
-} //namespace pmacc
+    } // namespace mpi
+} // namespace pmacc
diff --git a/include/pmacc/math/vector/Float.hpp b/include/pmacc/math/vector/Float.hpp
index 5e29beb0f4..b807091e6b 100644
--- a/include/pmacc/math/vector/Float.hpp
+++ b/include/pmacc/math/vector/Float.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera, Benjamin Worpitz
+/* Copyright 2013-2021 Heiko Burau, Rene Widera, Benjamin Worpitz
  *
  * This file is part of PMacc.
  *
@@ -25,46 +25,46 @@
 
 namespace pmacc
 {
-namespace math
-{
-
-template<int dim>
-struct Float : public Vector<float, dim>
-{
-    using BaseType = Vector<float, dim>;
-
-    HDINLINE Float()
+    namespace math
     {
-    }
+        template<int dim>
+        struct Float : public Vector<float, dim>
+        {
+            using BaseType = Vector<float, dim>;
 
-    HDINLINE Float(float x) : BaseType(x)
-    {
-    }
+            HDINLINE Float()
+            {
+            }
 
-    HDINLINE Float(float x, float y) : BaseType(x, y)
-    {
-    }
+            HDINLINE Float(float x) : BaseType(x)
+            {
+            }
 
-    HDINLINE Float(float x, float y, float z) : BaseType(x, y, z)
-    {
-    }
+            HDINLINE Float(float x, float y) : BaseType(x, y)
+            {
+            }
 
-    /*! only allow explicit cast*/
-    template<
-    typename T_OtherType,
-    typename T_OtherAccessor,
-    typename T_OtherNavigator,
-    template <typename, int> class T_OtherStorage>
-    HDINLINE explicit Float(const Vector<T_OtherType, dim, T_OtherAccessor, T_OtherNavigator, T_OtherStorage>& vec) :
-    BaseType(vec)
-    {
-    }
+            HDINLINE Float(float x, float y, float z) : BaseType(x, y, z)
+            {
+            }
 
-    HDINLINE Float(const BaseType& vec) :
-    BaseType(vec)
-    {
-    }
-};
+            /*! only allow explicit cast*/
+            template<
+                typename T_OtherType,
+                typename T_OtherAccessor,
+                typename T_OtherNavigator,
+                template<typename, int>
+                class T_OtherStorage>
+            HDINLINE explicit Float(
+                const Vector<T_OtherType, dim, T_OtherAccessor, T_OtherNavigator, T_OtherStorage>& vec)
+                : BaseType(vec)
+            {
+            }
+
+            HDINLINE Float(const BaseType& vec) : BaseType(vec)
+            {
+            }
+        };
 
-} // math
-} // PMacc
+    } // namespace math
+} // namespace pmacc
diff --git a/include/pmacc/math/vector/Int.hpp b/include/pmacc/math/vector/Int.hpp
index dc502c333f..8b4aca725c 100644
--- a/include/pmacc/math/vector/Int.hpp
+++ b/include/pmacc/math/vector/Int.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera
+/* Copyright 2013-2021 Heiko Burau, Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -25,46 +25,46 @@
 
 namespace pmacc
 {
-namespace math
-{
-
-template<int dim>
-struct Int : public Vector<int, dim>
-{
-    using BaseType = Vector<int, dim>;
-
-    HDINLINE Int()
+    namespace math
     {
-    }
+        template<int dim>
+        struct Int : public Vector<int, dim>
+        {
+            using BaseType = Vector<int, dim>;
 
-    HDINLINE Int(int x) : BaseType(x)
-    {
-    }
+            HDINLINE Int()
+            {
+            }
 
-    HDINLINE Int(int x, int y) : BaseType(x, y)
-    {
-    }
+            HDINLINE Int(int x) : BaseType(x)
+            {
+            }
 
-    HDINLINE Int(int x, int y, int z) : BaseType(x, y, z)
-    {
-    }
+            HDINLINE Int(int x, int y) : BaseType(x, y)
+            {
+            }
 
-    /*! only allow explicit cast*/
-    template<
-    typename T_OtherType,
-    typename T_OtherAccessor,
-    typename T_OtherNavigator,
-    template <typename, int> class T_OtherStorage>
-    HDINLINE explicit Int(const Vector<T_OtherType, dim, T_OtherAccessor, T_OtherNavigator, T_OtherStorage>& vec) :
-    BaseType(vec)
-    {
-    }
+            HDINLINE Int(int x, int y, int z) : BaseType(x, y, z)
+            {
+            }
 
-    HDINLINE Int(const BaseType& vec) :
-    BaseType(vec)
-    {
-    }
-};
+            /*! only allow explicit cast*/
+            template<
+                typename T_OtherType,
+                typename T_OtherAccessor,
+                typename T_OtherNavigator,
+                template<typename, int>
+                class T_OtherStorage>
+            HDINLINE explicit Int(
+                const Vector<T_OtherType, dim, T_OtherAccessor, T_OtherNavigator, T_OtherStorage>& vec)
+                : BaseType(vec)
+            {
+            }
+
+            HDINLINE Int(const BaseType& vec) : BaseType(vec)
+            {
+            }
+        };
 
-} // math
-} // PMacc
+    } // namespace math
+} // namespace pmacc
diff --git a/include/pmacc/math/vector/Size_t.hpp b/include/pmacc/math/vector/Size_t.hpp
index c59872e5e5..9d5625cbf5 100644
--- a/include/pmacc/math/vector/Size_t.hpp
+++ b/include/pmacc/math/vector/Size_t.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera
+/* Copyright 2013-2021 Heiko Burau, Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -25,46 +25,46 @@
 
 namespace pmacc
 {
-namespace math
-{
-
-template<int dim>
-struct Size_t : public Vector<size_t, dim>
-{
-    using BaseType = Vector<size_t, dim>;
-
-    HDINLINE Size_t()
+    namespace math
     {
-    }
+        template<int dim>
+        struct Size_t : public Vector<size_t, dim>
+        {
+            using BaseType = Vector<size_t, dim>;
 
-    HDINLINE Size_t(size_t x) : BaseType(x)
-    {
-    }
+            HDINLINE Size_t()
+            {
+            }
 
-    HDINLINE Size_t(size_t x, size_t y) : BaseType(x, y)
-    {
-    }
+            HDINLINE Size_t(size_t x) : BaseType(x)
+            {
+            }
 
-    HDINLINE Size_t(size_t x, size_t y, size_t z) : BaseType(x, y, z)
-    {
-    }
+            HDINLINE Size_t(size_t x, size_t y) : BaseType(x, y)
+            {
+            }
 
-    /*! only allow explicit cast*/
-    template<
-    typename T_OtherType,
-    typename T_OtherAccessor,
-    typename T_OtherNavigator,
-    template <typename, int> class T_OtherStorage>
-    HDINLINE explicit Size_t(const Vector<T_OtherType, dim, T_OtherAccessor, T_OtherNavigator, T_OtherStorage>& vec) :
-    BaseType(vec)
-    {
-    }
+            HDINLINE Size_t(size_t x, size_t y, size_t z) : BaseType(x, y, z)
+            {
+            }
 
-    HDINLINE Size_t(const BaseType& vec) :
-    BaseType(vec)
-    {
-    }
-};
+            /*! only allow explicit cast*/
+            template<
+                typename T_OtherType,
+                typename T_OtherAccessor,
+                typename T_OtherNavigator,
+                template<typename, int>
+                class T_OtherStorage>
+            HDINLINE explicit Size_t(
+                const Vector<T_OtherType, dim, T_OtherAccessor, T_OtherNavigator, T_OtherStorage>& vec)
+                : BaseType(vec)
+            {
+            }
+
+            HDINLINE Size_t(const BaseType& vec) : BaseType(vec)
+            {
+            }
+        };
 
-} // math
-} // PMacc
+    } // namespace math
+} // namespace pmacc
diff --git a/include/pmacc/math/vector/TwistComponents.hpp b/include/pmacc/math/vector/TwistComponents.hpp
index 3dcb454d23..f471eb7275 100644
--- a/include/pmacc/math/vector/TwistComponents.hpp
+++ b/include/pmacc/math/vector/TwistComponents.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera
+/* Copyright 2013-2021 Heiko Burau, Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -26,65 +26,56 @@
 
 namespace pmacc
 {
-namespace math
-{
-
-namespace result_of
-{
-
-template<typename T_Axes,
-typename T_Vector>
-struct TwistComponents
-{
-    using type = typename TwistComponents<
-        T_Axes,
-        typename T_Vector::This
-    >::type;
-};
+    namespace math
+    {
+        namespace result_of
+        {
+            template<typename T_Axes, typename T_Vector>
+            struct TwistComponents
+            {
+                using type = typename TwistComponents<T_Axes, typename T_Vector::This>::type;
+            };
 
-template<typename T_Axes,
-typename T_Type, int T_Dim,
-typename T_Accessor,
-typename T_Navigator,
-template <typename, int> class T_Storage>
-struct TwistComponents<T_Axes,math::Vector<T_Type,T_Dim,T_Accessor,T_Navigator,T_Storage> >
-{
-    using type = math::Vector<
-        T_Type,
-        T_Dim,
-        T_Accessor,
-        math::StackedNavigator<
-            T_Navigator,
-            math::PermutedNavigator<T_Axes>
-        >,
-        T_Storage
-    >&;
-};
+            template<
+                typename T_Axes,
+                typename T_Type,
+                int T_Dim,
+                typename T_Accessor,
+                typename T_Navigator,
+                template<typename, int>
+                class T_Storage>
+            struct TwistComponents<T_Axes, math::Vector<T_Type, T_Dim, T_Accessor, T_Navigator, T_Storage>>
+            {
+                using type = math::Vector<
+                    T_Type,
+                    T_Dim,
+                    T_Accessor,
+                    math::StackedNavigator<T_Navigator, math::PermutedNavigator<T_Axes>>,
+                    T_Storage>&;
+            };
 
-} // result_of
+        } // namespace result_of
 
-/** Returns a reference of vector with twisted axes.
- *
- * The axes twist operation is done in place. This means that the result refers to the
- * memory of the input vector. The input vector's navigator policy is replaced by
- * a new navigator which merely consists of the old navigator plus a twisting navigator.
- * This new navigator does not use any real memory.
- *
- * \tparam T_Axes Mapped indices
- * \tparam T_Vector type of vector to be twisted
- * \param vector vector to be twisted
- * \return reference of the input vector with twisted axes.
- */
-template<typename T_Axes, typename T_Vector>
-HDINLINE
-typename result_of::TwistComponents<T_Axes, T_Vector>::type
-twistComponents(T_Vector& vector)
-{
-    /* The reinterpret_cast is valid because the target type is the same as the
-     * input type except its navigator policy which does not occupy any memory though.
-     */
-    return reinterpret_cast<typename result_of::TwistComponents<T_Axes, T_Vector>::type>(vector);
-}
+        /** Returns a reference to the input vector with twisted axes.
+         *
+         * The axes twist operation is done in place. This means that the result refers to the
+         * memory of the input vector. The input vector's navigator policy is replaced by
+         * a new navigator which merely consists of the old navigator plus a twisting navigator.
+         * This new navigator does not use any real memory.
+         *
+         * \tparam T_Axes Mapped indices
+         * \tparam T_Vector type of vector to be twisted
+         * \param vector vector to be twisted
+         * \return reference to the input vector with twisted axes.
+         */
+        template<typename T_Axes, typename T_Vector>
+        HDINLINE typename result_of::TwistComponents<T_Axes, T_Vector>::type twistComponents(T_Vector& vector)
+        {
+            /* The reinterpret_cast is valid because the target type is the same as the input
+             * type except for its navigator policy, which does not occupy any memory.
+             */
+            return reinterpret_cast<typename result_of::TwistComponents<T_Axes, T_Vector>::type>(vector);
+        }
 
-} // math
-} // PMacc
+    } // namespace math
+} // namespace pmacc
diff --git a/include/pmacc/math/vector/UInt32.hpp b/include/pmacc/math/vector/UInt32.hpp
index b224a00497..5b333d9e8b 100644
--- a/include/pmacc/math/vector/UInt32.hpp
+++ b/include/pmacc/math/vector/UInt32.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera
+/* Copyright 2013-2021 Heiko Burau, Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -25,46 +25,46 @@
 
 namespace pmacc
 {
-namespace math
-{
-
-template<int dim>
-struct UInt32 : public Vector<uint32_t, dim>
-{
-    using BaseType = Vector<uint32_t, dim>;
-
-    HDINLINE UInt32()
+    namespace math
     {
-    }
+        template<int dim>
+        struct UInt32 : public Vector<uint32_t, dim>
+        {
+            using BaseType = Vector<uint32_t, dim>;
 
-    HDINLINE UInt32(uint32_t x) : BaseType(x)
-    {
-    }
+            HDINLINE UInt32()
+            {
+            }
 
-    HDINLINE UInt32(uint32_t x, uint32_t y) : BaseType(x, y)
-    {
-    }
+            HDINLINE UInt32(uint32_t x) : BaseType(x)
+            {
+            }
 
-    HDINLINE UInt32(uint32_t x, uint32_t y, uint32_t z) : BaseType(x, y, z)
-    {
-    }
+            HDINLINE UInt32(uint32_t x, uint32_t y) : BaseType(x, y)
+            {
+            }
 
-    /*! only allow explicit cast*/
-    template<
-    typename T_OtherType,
-    typename T_OtherAccessor,
-    typename T_OtherNavigator,
-    template <typename, int> class T_OtherStorage>
-    HDINLINE explicit UInt32(const Vector<T_OtherType, dim, T_OtherAccessor, T_OtherNavigator, T_OtherStorage>& vec) :
-    BaseType(vec)
-    {
-    }
+            HDINLINE UInt32(uint32_t x, uint32_t y, uint32_t z) : BaseType(x, y, z)
+            {
+            }
 
-    HDINLINE UInt32(const BaseType& vec) :
-    BaseType(vec)
-    {
-    }
-};
+            /*! only allow explicit cast*/
+            template<
+                typename T_OtherType,
+                typename T_OtherAccessor,
+                typename T_OtherNavigator,
+                template<typename, int>
+                class T_OtherStorage>
+            HDINLINE explicit UInt32(
+                const Vector<T_OtherType, dim, T_OtherAccessor, T_OtherNavigator, T_OtherStorage>& vec)
+                : BaseType(vec)
+            {
+            }
+
+            HDINLINE UInt32(const BaseType& vec) : BaseType(vec)
+            {
+            }
+        };
 
-} // math
-} // PMacc
+    } // namespace math
+} // namespace pmacc
diff --git a/include/pmacc/math/vector/UInt64.hpp b/include/pmacc/math/vector/UInt64.hpp
index fff31a4168..31e7543353 100644
--- a/include/pmacc/math/vector/UInt64.hpp
+++ b/include/pmacc/math/vector/UInt64.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera, Axel Huebl
+/* Copyright 2013-2021 Heiko Burau, Rene Widera, Axel Huebl
  *
  * This file is part of PMacc.
  *
@@ -25,46 +25,46 @@
 
 namespace pmacc
 {
-namespace math
-{
-
-template<int dim>
-struct UInt64 : public Vector<uint64_t, dim>
-{
-    using BaseType = Vector<uint64_t, dim>;
-
-    HDINLINE UInt64()
+    namespace math
     {
-    }
+        template<int dim>
+        struct UInt64 : public Vector<uint64_t, dim>
+        {
+            using BaseType = Vector<uint64_t, dim>;
 
-    HDINLINE UInt64(uint64_t x) : BaseType(x)
-    {
-    }
+            HDINLINE UInt64()
+            {
+            }
 
-    HDINLINE UInt64(uint64_t x, uint64_t y) : BaseType(x, y)
-    {
-    }
+            HDINLINE UInt64(uint64_t x) : BaseType(x)
+            {
+            }
 
-    HDINLINE UInt64(uint64_t x, uint64_t y, uint64_t z) : BaseType(x, y, z)
-    {
-    }
+            HDINLINE UInt64(uint64_t x, uint64_t y) : BaseType(x, y)
+            {
+            }
 
-    /*! only allow explicit cast*/
-    template<
-    typename T_OtherType,
-    typename T_OtherAccessor,
-    typename T_OtherNavigator,
-    template <typename, int> class T_OtherStorage>
-    HDINLINE explicit UInt64(const Vector<T_OtherType, dim, T_OtherAccessor, T_OtherNavigator, T_OtherStorage>& vec) :
-    BaseType(vec)
-    {
-    }
+            HDINLINE UInt64(uint64_t x, uint64_t y, uint64_t z) : BaseType(x, y, z)
+            {
+            }
 
-    HDINLINE UInt64(const BaseType& vec) :
-    BaseType(vec)
-    {
-    }
-};
+            /*! only allow explicit cast*/
+            template<
+                typename T_OtherType,
+                typename T_OtherAccessor,
+                typename T_OtherNavigator,
+                template<typename, int>
+                class T_OtherStorage>
+            HDINLINE explicit UInt64(
+                const Vector<T_OtherType, dim, T_OtherAccessor, T_OtherNavigator, T_OtherStorage>& vec)
+                : BaseType(vec)
+            {
+            }
+
+            HDINLINE UInt64(const BaseType& vec) : BaseType(vec)
+            {
+            }
+        };
 
-} // math
-} // PMacc
+    } // namespace math
+} // namespace pmacc
diff --git a/include/pmacc/math/vector/Vector.hpp b/include/pmacc/math/vector/Vector.hpp
index 0f816d34eb..115ee23b5b 100644
--- a/include/pmacc/math/vector/Vector.hpp
+++ b/include/pmacc/math/vector/Vector.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera, Benjamin Worpitz,
+/* Copyright 2013-2021 Heiko Burau, Rene Widera, Benjamin Worpitz,
  *                     Alexander Grund, Axel Huebl
  *
  * This file is part of PMacc.
@@ -36,830 +36,822 @@
 
 namespace pmacc
 {
-namespace math
-{
-namespace detail
-{
-
-template<typename T_Type, int T_Dim>
-struct Vector_components
-{
-    static constexpr bool isConst = false;
-    static constexpr int dim = T_Dim;
-    using type = T_Type;
-
-    /*align full vector*/
-    PMACC_ALIGN(v[dim], type);
-
-    HDINLINE
-    type& operator[](const int idx)
-    {
-        return v[idx];
-    }
-
-    HDINLINE
-    const type& operator[](const int idx) const
-    {
-        return v[idx];
-    }
-};
-
-
-/** functor to copy a object element-wise
- *
- * @tparam isDestConst define if destination is const (not copyable) object
- */
-template<bool isDestConst>
-struct CopyElementWise
-{
-    /** copy object element-wise
-     *
-     * @tparam T_Dest destination object type
-     * @tparam T_Src source object type
-     */
-    template<typename T_Dest,typename T_Src>
-    HDINLINE void operator()(T_Dest& dest,const T_Src& src) const
-    {
-        PMACC_CASSERT_MSG(CopyElementWise_destination_and_source_had_different_dimension,
-                          T_Dest::dim == T_Src::dim);
-        for (int d = 0; d < T_Dest::dim; d++)
-            dest[d] = src[d];
-    }
-};
-
-/** specialization for constant destination
- *
- * the constant storage is already available and set in the destination
- */
-template<>
-struct CopyElementWise<true>
-{
-    template<typename T_Dest,typename T_Src>
-    HDINLINE void operator()(T_Dest& dest,const T_Src& src) const
-    {
-    }
-};
-
-} //namespace detail
-
-namespace tag
-{
-    struct Vector;
-}
-
-template<typename T_Type, int T_dim,
-typename T_Accessor = StandardAccessor,
-typename T_Navigator = StandardNavigator,
-template <typename, int> class T_Storage = detail::Vector_components>
-struct Vector : private T_Storage<T_Type, T_dim>, protected T_Accessor, protected T_Navigator
-{
-    using Storage = T_Storage<T_Type, T_dim>;
-    using type = typename Storage::type;
-    static constexpr int dim = Storage::dim;
-    using tag = tag::Vector;
-    using Accessor = T_Accessor;
-    using Navigator = T_Navigator;
-    using This = Vector<type, dim, Accessor, Navigator, T_Storage>;
-    using ParamType = typename boost::call_traits<type>::param_type;
-
-    /*Vectors without elements are not allowed*/
-    PMACC_CASSERT_MSG(math_Vector__with_DIM_0_is_not_allowed,dim > 0);
-
-    template<class> struct result;
-
-    template<class F, typename T>
-    struct result < F(T)>
-    {
-        using type = typename F::type&;
-    };
-
-    template<class F, typename T>
-    struct result < const F(T)>
-    {
-        using type = const typename F::type&;
-    };
-
-    HDINLINE Vector()
-    {}
-
-    HDINLINE
-    Vector(const type x)
-    {
-        PMACC_CASSERT_MSG(math_Vector__constructor_is_only_allowed_for_DIM1,dim == 1);
-        (*this)[0] = x;
-    }
-
-    HDINLINE
-    Vector(const type x, const type y)
-    {
-        PMACC_CASSERT_MSG(math_Vector__constructor_is_only_allowed_for_DIM2,dim == 2);
-        (*this)[0] = x;
-        (*this)[1] = y;
-    }
-
-    HDINLINE
-    Vector(const type x, const type y, const type z)
-    {
-        PMACC_CASSERT_MSG(math_Vector__constructor_is_only_allowed_for_DIM3,dim == 3);
-        (*this)[0] = x;
-        (*this)[1] = y;
-        (*this)[2] = z;
-    }
-
-    HDINLINE Vector(const This& other)
-    {
-        detail::CopyElementWise<Storage::isConst>()(*this,other);
-    }
-
-    template<
-    typename T_OtherType,
-    typename T_OtherAccessor,
-    typename T_OtherNavigator,
-    template <typename, int> class T_OtherStorage>
-    HDINLINE explicit Vector(const Vector<
-                             T_OtherType,
-                             dim,
-                             T_OtherAccessor,
-                             T_OtherNavigator,
-                             T_OtherStorage
-                             >&
-                             other)
-    {
-        for (int i = 0; i < dim; i++)
-            (*this)[i] = static_cast<type> (other[i]);
-    }
-
-    /** Allow static_cast / explicit cast to member type for 1D vector */
-    template<
-        int T_deferDim = T_dim,
-        typename = typename std::enable_if< T_deferDim == 1 >::type
-    >
-    HDINLINE
-    explicit
-    operator type()
+    namespace math
     {
-        return (*this)[0];
-    }
-
-    /**
-     * Creates a Vector where all dimensions are set to the same value
-     *
-     * @param value Value which is set for all dimensions
-     * @return new Vector<...>
-     */
-    HDINLINE
-    static This create(ParamType value)
-    {
-        This result;
-        for (int i = 0; i < dim; i++)
-            result[i] = value;
-
-        return result;
-    }
-
-    HDINLINE const This& toRT() const
-    {
-        return *this;
-    }
-
-    HDINLINE This& toRT()
-    {
-        return *this;
-    }
-
-    HDINLINE This revert()
-    {
-        This invertedVector;
-        for (int i = 0; i < dim; i++)
-            invertedVector[dim-1-i] = (*this)[i];
-
-        return invertedVector;
-    }
-
-    template<
-    typename T_OtherAccessor,
-    typename T_OtherNavigator,
-    template <typename, int> class T_OtherStorage>
-    HDINLINE This&
-    operator=(const Vector<type, dim, T_OtherAccessor, T_OtherNavigator, T_OtherStorage>& rhs)
-    {
-        for (int i = 0; i < dim; i++)
-            (*this)[i] = rhs[i];
-        return *this;
-    }
-
-    HDINLINE
-    type& operator[](const int idx)
-    {
-        return Accessor::operator()(Storage::operator[](Navigator::operator()(idx)));
-    }
-
-    HDINLINE
-    const type& operator[](const int idx) const
-    {
-        return Accessor::operator()(Storage::operator[](Navigator::operator()(idx)));
-    }
-
-    HDINLINE type & x()
-    {
-        return (*this)[0];
-    }
-
-    HDINLINE type & y()
-    {
-        PMACC_CASSERT_MSG(math_Vector__access_to_y_is_not_allowed_for_DIM_lesser_than_2,dim >= 2);
-        return (*this)[1];
-    }
-
-    HDINLINE type & z()
-    {
-        PMACC_CASSERT_MSG(math_Vector__access_to_z_is_not_allowed_for_DIM_lesser_than_3,dim >= 3);
-        return (*this)[2];
-    }
-
-    HDINLINE const type & x() const
-    {
-        return (*this)[0];
-    }
-
-    HDINLINE const type & y() const
-    {
-        PMACC_CASSERT_MSG(math_Vector__access_to_y_is_not_allowed_for_DIM_lesser_than_2,dim >= 2);
-        return (*this)[1];
-    }
+        namespace detail
+        {
+            template<typename T_Type, int T_Dim>
+            struct Vector_components
+            {
+                static constexpr bool isConst = false;
+                static constexpr int dim = T_Dim;
+                using type = T_Type;
+
+                HDINLINE
+                constexpr Vector_components()
+                {
+                }
+
+                constexpr Vector_components& operator=(const Vector_components&) = default;
+
+                /*align full vector*/
+                PMACC_ALIGN(v[dim], type);
+
+                HDINLINE
+                type& operator[](const int idx)
+                {
+                    return v[idx];
+                }
+
+                HDINLINE
+                const type& operator[](const int idx) const
+                {
+                    return v[idx];
+                }
+            };
+
+
+            /** functor to copy an object element-wise
+             *
+             * @tparam isDestConst true if the destination is a const object that must not be written to
+             */
+            template<bool isDestConst>
+            struct CopyElementWise
+            {
+                /** copy object element-wise
+                 *
+                 * @tparam T_Dest destination object type
+                 * @tparam T_Src source object type
+                 */
+                template<typename T_Dest, typename T_Src>
+                HDINLINE void operator()(T_Dest& dest, const T_Src& src) const
+                {
+                    PMACC_CASSERT_MSG(
+                        CopyElementWise_destination_and_source_had_different_dimension,
+                        T_Dest::dim == T_Src::dim);
+                    for(int d = 0; d < T_Dest::dim; d++)
+                        dest[d] = src[d];
+                }
+            };
+
+            /** specialization for constant destination
+             *
+             * the constant storage is already available and set in the destination
+             */
+            template<>
+            struct CopyElementWise<true>
+            {
+                template<typename T_Dest, typename T_Src>
+                HDINLINE void operator()(T_Dest& dest, const T_Src& src) const
+                {
+                }
+            };
+
+        } // namespace detail
+
+        namespace tag
+        {
+            struct Vector;
+        }
 
-    HDINLINE const type & z() const
-    {
-        PMACC_CASSERT_MSG(math_Vector__access_to_z_is_not_allowed_for_DIM_lesser_than_3,dim >= 3);
-        return (*this)[2];
-    }
+        template<
+            typename T_Type,
+            int T_dim,
+            typename T_Accessor = StandardAccessor,
+            typename T_Navigator = StandardNavigator,
+            template<typename, int> class T_Storage = detail::Vector_components>
+        struct Vector
+            : private T_Storage<T_Type, T_dim>
+            , protected T_Accessor
+            , protected T_Navigator
+        {
+            using Storage = T_Storage<T_Type, T_dim>;
+            using type = typename Storage::type;
+            static constexpr int dim = Storage::dim;
+            using tag = tag::Vector;
+            using Accessor = T_Accessor;
+            using Navigator = T_Navigator;
+            using This = Vector<type, dim, Accessor, Navigator, T_Storage>;
+            using ParamType = typename boost::call_traits<type>::param_type;
+
+            /*Vectors without elements are not allowed*/
+            PMACC_CASSERT_MSG(math_Vector__with_DIM_0_is_not_allowed, dim > 0);
+
+            template<class>
+            struct result;
+
+            template<class F, typename T>
+            struct result<F(T)>
+            {
+                using type = typename F::type&;
+            };
+
+            template<class F, typename T>
+            struct result<const F(T)>
+            {
+                using type = const typename F::type&;
+            };
+
+            HDINLINE
+            constexpr Vector()
+            {
+            }
+
+            HDINLINE
+            constexpr Vector(const type x)
+            {
+                PMACC_CASSERT_MSG(math_Vector__constructor_is_only_allowed_for_DIM1, dim == 1);
+                (*this)[0] = x;
+            }
+
+            HDINLINE
+            constexpr Vector(const type x, const type y)
+            {
+                PMACC_CASSERT_MSG(math_Vector__constructor_is_only_allowed_for_DIM2, dim == 2);
+                (*this)[0] = x;
+                (*this)[1] = y;
+            }
+
+            HDINLINE
+            constexpr Vector(const type x, const type y, const type z)
+            {
+                PMACC_CASSERT_MSG(math_Vector__constructor_is_only_allowed_for_DIM3, dim == 3);
+                (*this)[0] = x;
+                (*this)[1] = y;
+                (*this)[2] = z;
+            }
+
+            HDINLINE
+            constexpr Vector(const This& other)
+            {
+                detail::CopyElementWise<Storage::isConst>()(*this, other);
+            }
+
+            template<
+                typename T_OtherType,
+                typename T_OtherAccessor,
+                typename T_OtherNavigator,
+                template<typename, int>
+                class T_OtherStorage>
+            HDINLINE explicit Vector(
+                const Vector<T_OtherType, dim, T_OtherAccessor, T_OtherNavigator, T_OtherStorage>& other)
+            {
+                for(int i = 0; i < dim; i++)
+                    (*this)[i] = static_cast<type>(other[i]);
+            }
+
+            /** Allow static_cast / explicit cast to member type for 1D vector */
+            template<int T_deferDim = T_dim, typename = typename std::enable_if<T_deferDim == 1>::type>
+            HDINLINE explicit operator type()
+            {
+                return (*this)[0];
+            }
+
+            /**
+             * Creates a Vector where all dimensions are set to the same value
+             *
+             * @param value Value which is set for all dimensions
+             * @return new Vector<...>
+             */
+            HDINLINE
+            static This create(ParamType value)
+            {
+                This result;
+                for(int i = 0; i < dim; i++)
+                    result[i] = value;
+
+                return result;
+            }
+
+            HDINLINE const This& toRT() const
+            {
+                return *this;
+            }
+
+            HDINLINE This& toRT()
+            {
+                return *this;
+            }
+
+            HDINLINE This revert()
+            {
+                This invertedVector;
+                for(int i = 0; i < dim; i++)
+                    invertedVector[dim - 1 - i] = (*this)[i];
+
+                return invertedVector;
+            }
+
+            constexpr HDINLINE Vector& operator=(const Vector&) = default;
+
+            template<typename T_OtherAccessor, typename T_OtherNavigator, template<typename, int> class T_OtherStorage>
+            HDINLINE This& operator=(const Vector<type, dim, T_OtherAccessor, T_OtherNavigator, T_OtherStorage>& rhs)
+            {
+                for(int i = 0; i < dim; i++)
+                    (*this)[i] = rhs[i];
+                return *this;
+            }
+
+            HDINLINE
+            type& operator[](const int idx)
+            {
+                return Accessor::operator()(Storage::operator[](Navigator::operator()(idx)));
+            }
+
+            HDINLINE
+            const type& operator[](const int idx) const
+            {
+                return Accessor::operator()(Storage::operator[](Navigator::operator()(idx)));
+            }
+
+            HDINLINE type& x()
+            {
+                return (*this)[0];
+            }
+
+            HDINLINE type& y()
+            {
+                PMACC_CASSERT_MSG(math_Vector__access_to_y_is_not_allowed_for_DIM_lesser_than_2, dim >= 2);
+                return (*this)[1];
+            }
+
+            HDINLINE type& z()
+            {
+                PMACC_CASSERT_MSG(math_Vector__access_to_z_is_not_allowed_for_DIM_lesser_than_3, dim >= 3);
+                return (*this)[2];
+            }
+
+            HDINLINE const type& x() const
+            {
+                return (*this)[0];
+            }
+
+            HDINLINE const type& y() const
+            {
+                PMACC_CASSERT_MSG(math_Vector__access_to_y_is_not_allowed_for_DIM_lesser_than_2, dim >= 2);
+                return (*this)[1];
+            }
+
+            HDINLINE const type& z() const
+            {
+                PMACC_CASSERT_MSG(math_Vector__access_to_z_is_not_allowed_for_DIM_lesser_than_3, dim >= 3);
+                return (*this)[2];
+            }
+
+            template<int shrinkedDim>
+            HDINLINE Vector<type, shrinkedDim, Accessor, Navigator> shrink(const int startIdx = 0) const
+            {
+                PMACC_CASSERT_MSG(
+                    math_Vector__shrinkedDim_DIM_must_be_lesser_or_equal_to_Vector_DIM,
+                    shrinkedDim <= dim);
+                Vector<type, shrinkedDim, Accessor, Navigator> shortened;
+                for(int target = 0; target < shrinkedDim; ++target)
+                    shortened[target] = (*this)[(startIdx + target) % dim]; // source index wraps around at dim
+                return shortened;
+            }
+
+            /** Create a copy of this vector without one of its components
+             *
+             * Must not be called on a one-dimensional vector (asserted via dim > 1).
+             *
+             * @tparam dimToRemove index of the component to drop; range: [ 0; dim - 1 ]
+             * @return vector with the remaining `dim - 1` components in their original order
+             */
+            template<int dimToRemove>
+            HDINLINE Vector<type, dim - 1, Accessor, Navigator> remove() const
+            {
+                PMACC_CASSERT_MSG(__math_Vector__dim_must_be_greater_than_1__, dim > 1);
+                PMACC_CASSERT_MSG(__math_Vector__dimToRemove_must_be_lesser_than_dim__, dimToRemove < dim);
+                Vector<type, dim - 1, Accessor, Navigator> reduced;
+                for(int target = 0; target < dim - 1; ++target)
+                {
+                    // components at or behind dimToRemove are read from one position further back
+                    const int shift = target < dimToRemove ? 0 : 1;
+                    reduced[target] = (*this)[target + shift];
+                }
+                return reduced;
+            }
+
+            /** Multiply all components with each other.
+             *
+             * @return component 0 multiplied by every further component
+             */
+            HDINLINE type productOfComponents() const
+            {
+                type product = (*this)[0];
+                for(int d = 1; d < dim; ++d)
+                    product *= (*this)[d];
+                return product;
+            }
+
+            /** Add up all components.
+             *
+             * @return component 0 plus every further component
+             */
+            HDINLINE type sumOfComponents() const
+            {
+                type total = (*this)[0];
+                for(int d = 1; d < dim; ++d)
+                    total += (*this)[d];
+                return total;
+            }
+
+            /*! Component-wise in-place addition
+             * @param other vector with the same value type and dimension; its policies may differ
+             * @return reference to this instance after the update
+             */
+            template<typename T_OtherAccessor, typename T_OtherNavigator, template<typename, int> class T_OtherStorage>
+            HDINLINE This& operator+=(
+                const Vector<type, dim, T_OtherAccessor, T_OtherNavigator, T_OtherStorage>& other)
+            {
+                for(int d = 0; d < dim; ++d)
+                    (*this)[d] += other[d];
+                return *this;
+            }
+
+            /*! Component-wise in-place subtraction
+             * @param other vector with the same value type and dimension; its policies may differ
+             * @return reference to this instance after the update
+             */
+            template<typename T_OtherAccessor, typename T_OtherNavigator, template<typename, int> class T_OtherStorage>
+            HDINLINE This& operator-=(
+                const Vector<type, dim, T_OtherAccessor, T_OtherNavigator, T_OtherStorage>& other)
+            {
+                for(int d = 0; d < dim; ++d)
+                    (*this)[d] -= other[d];
+                return *this;
+            }
+
+            /*! Component-wise in-place multiplication
+             * @param other vector with the same value type and dimension; its policies may differ
+             * @return reference to this instance after the update
+             */
+            template<typename T_OtherAccessor, typename T_OtherNavigator, template<typename, int> class T_OtherStorage>
+            HDINLINE This& operator*=(
+                const Vector<type, dim, T_OtherAccessor, T_OtherNavigator, T_OtherStorage>& other)
+            {
+                for(int d = 0; d < dim; ++d)
+                    (*this)[d] *= other[d];
+                return *this;
+            }
+
+            /*! Component-wise in-place division
+             * @param other vector with the same value type and dimension; its policies may differ
+             * @return reference to this instance after the update
+             */
+            template<typename T_OtherAccessor, typename T_OtherNavigator, template<typename, int> class T_OtherStorage>
+            HDINLINE This& operator/=(
+                const Vector<type, dim, T_OtherAccessor, T_OtherNavigator, T_OtherStorage>& other)
+            {
+                for(int d = 0; d < dim; ++d)
+                    (*this)[d] /= other[d];
+                return *this;
+            }
+
+            HDINLINE This& operator+=(ParamType other) // adds other to every component
+            {
+                for(int d = 0; d < dim; ++d)
+                    (*this)[d] += other;
+                return *this;
+            }
+
+            HDINLINE This& operator-=(ParamType other) // subtracts other from every component
+            {
+                for(int d = 0; d < dim; ++d)
+                    (*this)[d] -= other;
+                return *this;
+            }
+
+            HDINLINE This& operator*=(ParamType other) // multiplies every component by other
+            {
+                for(int d = 0; d < dim; ++d)
+                    (*this)[d] *= other;
+                return *this;
+            }
+
+            HDINLINE This& operator/=(ParamType other) // divides every component by other
+            {
+                for(int d = 0; d < dim; ++d)
+                    (*this)[d] /= other;
+                return *this;
+            }
+
+            /**
+             * == comparison operator.
+             *
+             * Compares this vector component by component with another one.
+             *
+             * @param rhs Vector to compare to
+             * @return true if all components in both vectors are equal, else false
+             */
+            HDINLINE bool operator==(const This& rhs) const
+            {
+                bool allEqual = true;
+                for(int d = 0; d < dim; ++d)
+                    allEqual = allEqual && ((*this)[d] == rhs[d]);
+                return allEqual;
+            }
+
+            /**
+             * != comparison operator.
+             *
+             * Negation of operator==.
+             *
+             * @param rhs Vector to compare to
+             * @return true if at least one component differs between both vectors, else false
+             */
+            HDINLINE bool operator!=(const This& rhs) const
+            {
+                return !this->operator==(rhs);
+            }
+
+            /** Format the components of the vector as text
+             *
+             * @param separator text written between two neighboring components
+             * @param enclosings characters placed around the component list
+             *                   size == 0: nothing is placed around the list
+             *                   size == 1: the same character opens and closes the list
+             *                   size >= 2: enclosings[0] opens the list,
+             *                              enclosings[1] closes it, further characters are ignored
+             *
+             * Examples:
+             * .toString(";","|")     -> |x;...;z|
+             * .toString(",","[]")    -> [x,...,z]
+             */
+            std::string toString(const std::string separator = ",", const std::string enclosings = "{}") const
+            {
+                std::string opening;
+                std::string closing;
+                const size_t numEnclosings = enclosings.size();
+
+                if(numEnclosings != 0)
+                {
+                    /* a single symbol serves as both opening and closing character */
+                    opening = enclosings[0];
+                    closing = enclosings[numEnclosings > 1 ? 1 : 0];
+                }
+
+                std::stringstream out;
+                out << opening << (*this)[0];
+
+                for(int d = 1; d < dim; ++d)
+                    out << separator << (*this)[d];
+                out << closing;
+                return out.str();
+            }
+
+            HDINLINE cupla::dim3 toDim3() const // cupla::dim3 offers x, y and z only, hence dim <= 3 is required
+            {
+                PMACC_CASSERT_MSG(math_Vector__toDim3_is_not_allowed_for_DIM_greater_than_3, dim <= 3);
+                cupla::dim3 result;
+                for(int d = 0; d < dim; ++d)
+                    (&result.x)[d] = (*this)[d];
+                return result;
+            }
+        };
+
+        /** Specialization for zero components: stores nothing, all instances compare equal. */
+        template<typename Type>
+        struct Vector<Type, 0>
+        {
+            using type = Type;
+            static constexpr int dim = 0;
+
+            template<typename OtherType>
+            HDINLINE operator Vector<OtherType, 0>() const
+            {
+                return Vector<OtherType, 0>();
+            }
+
+            /**
+             * == comparison operator.
+             *
+             * Returns always true
+             */
+            HDINLINE bool operator==(const Vector& rhs) const
+            {
+                return true;
+            }
+
+            /**
+             * != comparison operator.
+             *
+             * Returns always false
+             */
+            HDINLINE bool operator!=(const Vector& rhs) const
+            {
+                return false;
+            }
+
+            HDINLINE static Vector create(Type)
+            {
+                /* must never be instantiated (the assertion below fails then); it exists only for
+                 * Visual Studio to handle pmacc::math::Size_t< 0 > */
+                PMACC_CASSERT_MSG(Vector_dim_0_create_cannot_be_called, sizeof(Type) != 0 && false);
+                return Vector(); // unreachable in valid programs, avoids flowing off a non-void function
+            }
+        };
+
+        template<typename Type, int dim, typename Accessor, typename Navigator>
+        std::ostream& operator<<(std::ostream& os, const Vector<Type, dim, Accessor, Navigator>& vec)
+        {
+            return os << vec.toString(); // toString() with its default separator and enclosings
+        }
 
-    template<int shrinkedDim >
-    HDINLINE Vector<type, shrinkedDim, Accessor, Navigator> shrink(const int startIdx = 0) const
-    {
-        PMACC_CASSERT_MSG(math_Vector__shrinkedDim_DIM_must_be_lesser_or_equal_to_Vector_DIM,shrinkedDim <= dim);
-        Vector<type, shrinkedDim, Accessor, Navigator> result;
-        for (int i = 0; i < shrinkedDim; i++)
-            result[i] = (*this)[(startIdx + i) % dim];
-        return result;
-    }
-
-    /** Removes a component
-     *
-     * It is not allowed to call this method on a vector with the dimensionality of one.
-     *
-     * @tparam dimToRemove index which shall be removed; range: [ 0; dim - 1 ]
-     * @return vector with `dim - 1` elements
-     */
-    template<int dimToRemove>
-    HDINLINE Vector<type, dim - 1, Accessor, Navigator> remove() const
-    {
-        PMACC_CASSERT_MSG(__math_Vector__dim_must_be_greater_than_1__, dim > 1);
-        PMACC_CASSERT_MSG(__math_Vector__dimToRemove_must_be_lesser_than_dim__, dimToRemove < dim);
-        Vector<type, dim - 1, Accessor, Navigator> result;
-        for (int i = 0; i < dim - 1; ++i)
+        template<
+            typename T_Type,
+            int T_Dim,
+            typename T_Accessor,
+            typename T_Navigator,
+            template<typename, int>
+            class T_Storage,
+            typename T_OtherAccessor,
+            typename T_OtherNavigator,
+            template<typename, int>
+            class T_OtherStorage>
+        HDINLINE Vector<T_Type, T_Dim> operator+(
+            const Vector<T_Type, T_Dim, T_Accessor, T_Navigator, T_Storage>& lhs,
+            const Vector<T_Type, T_Dim, T_OtherAccessor, T_OtherNavigator, T_OtherStorage>& rhs)
         {
-            // skip component which must be deleted
-            const int sourceIdx = i >= dimToRemove ? i + 1 : i;
-            result[i] = (*this)[sourceIdx];
+            /* component-wise sum of lhs and rhs; it is returned as a vector with
+             * default policies to avoid allocation side effects */
+            Vector<T_Type, T_Dim> result(lhs);
+            result += rhs;
+            return result;
         }
-        return result;
-    }
-
-    /** Returns product of all components.
-     *
-     * @return product of components
-     */
-    HDINLINE type productOfComponents() const
-    {
-        type result = (*this)[0];
-        for (int i = 1; i < dim; i++)
-            result *= (*this)[i];
-        return result;
-    }
-
-    /** Returns sum of all components.
-     *
-     * @return sum of components
-     */
-    HDINLINE type sumOfComponents() const
-    {
-        type result = (*this)[0];
-        for (int i = 1; i < dim; i++)
-            result += (*this)[i];
-        return result;
-    }
-
-    /*! += operator
-     * @param other instance with same type and dimension like the left instance
-     * @return reference to manipulated left instance
-     */
-    template<
-    typename T_OtherAccessor,
-    typename T_OtherNavigator,
-    template <typename, int> class T_OtherStorage>
-    HDINLINE This&
-    operator+=(const Vector<
-               type, dim,
-               T_OtherAccessor, T_OtherNavigator, T_OtherStorage>&
-               other)
-    {
-        for (int i = 0; i < dim; i++)
-            (*this)[i] += other[i];
-        return *this;
-    }
-
-    /*! -= operator
-     * @param other instance with same type and dimension like the left instance
-     * @return reference to manipulated left instance
-     */
-    template<
-    typename T_OtherAccessor,
-    typename T_OtherNavigator,
-    template <typename, int> class T_OtherStorage>
-    HDINLINE This&
-    operator-=(const Vector<
-               type, dim,
-               T_OtherAccessor, T_OtherNavigator, T_OtherStorage>&
-               other)
-    {
-        for (int i = 0; i < dim; i++)
-            (*this)[i] -= other[i];
-        return *this;
-    }
-
-    /*! *= operator
-     * @param other instance with same type and dimension like the left instance
-     * @return reference to manipulated left instance
-     */
-    template<
-    typename T_OtherAccessor,
-    typename T_OtherNavigator,
-    template <typename, int> class T_OtherStorage>
-    HDINLINE This&
-    operator*=(const Vector<
-               type, dim,
-               T_OtherAccessor, T_OtherNavigator, T_OtherStorage>&
-               other)
-    {
 
-        for (int i = 0; i < dim; i++)
-            (*this)[i] *= other[i];
-        return *this;
-    }
-
-    /*! /= operator
-     * @param other instance with same type and dimension like the left instance
-     * @return reference to manipulated left instance
-     */
-    template<
-    typename T_OtherAccessor,
-    typename T_OtherNavigator,
-    template <typename, int> class T_OtherStorage>
-    HDINLINE This&
-    operator/=(const Vector<
-               type, dim,
-               T_OtherAccessor, T_OtherNavigator, T_OtherStorage>&
-               other)
-    {
+        template<
+            typename T_Type,
+            int T_Dim,
+            typename T_Accessor,
+            typename T_Navigator,
+            template<typename, int>
+            class T_Storage>
+        HDINLINE Vector<T_Type, T_Dim> operator+(
+            const Vector<T_Type, T_Dim, T_Accessor, T_Navigator, T_Storage>& lhs,
+            typename Vector<T_Type, T_Dim, T_Accessor, T_Navigator, T_Storage>::ParamType rhs)
+        {
+            /* rhs is added to every component of a copy of lhs; the copy uses
+             * default policies to avoid allocation side effects */
+            Vector<T_Type, T_Dim> sum(lhs);
+            sum += rhs;
+            return sum;
+        }
 
-        for (int i = 0; i < dim; i++)
-            (*this)[i] /= other[i];
-        return *this;
-    }
+        template<
+            typename T_Type,
+            int T_Dim,
+            typename T_Accessor,
+            typename T_Navigator,
+            template<typename, int>
+            class T_Storage,
+            typename T_OtherAccessor,
+            typename T_OtherNavigator,
+            template<typename, int>
+            class T_OtherStorage>
+        HDINLINE Vector<T_Type, T_Dim> operator-(
+            const Vector<T_Type, T_Dim, T_Accessor, T_Navigator, T_Storage>& lhs,
+            const Vector<T_Type, T_Dim, T_OtherAccessor, T_OtherNavigator, T_OtherStorage>& rhs)
+        {
+            /* component-wise difference of lhs and rhs; it is returned as a vector
+             * with default policies to avoid allocation side effects */
+            Vector<T_Type, T_Dim> difference(lhs);
+            difference -= rhs;
+            return difference;
+        }
 
-    HDINLINE This& operator+=(ParamType other)
-    {
+        template<
+            typename T_Type,
+            int T_Dim,
+            typename T_Accessor,
+            typename T_Navigator,
+            template<typename, int>
+            class T_Storage>
+        HDINLINE Vector<T_Type, T_Dim> operator-(
+            const Vector<T_Type, T_Dim, T_Accessor, T_Navigator, T_Storage>& lhs,
+            typename Vector<T_Type, T_Dim, T_Accessor, T_Navigator, T_Storage>::ParamType rhs)
+        {
+            /* rhs is subtracted from every component of a copy of lhs; the copy
+             * uses default policies to avoid allocation side effects */
+            Vector<T_Type, T_Dim> difference(lhs);
+            difference -= rhs;
+            return difference;
+        }
 
-        for (int i = 0; i < dim; i++)
-            (*this)[i] += other;
-        return *this;
-    }
+        template<
+            typename T_Type,
+            int T_Dim,
+            typename T_Accessor,
+            typename T_Navigator,
+            template<typename, int>
+            class T_Storage,
+            typename T_OtherAccessor,
+            typename T_OtherNavigator,
+            template<typename, int>
+            class T_OtherStorage>
+        HDINLINE Vector<T_Type, T_Dim> operator*(
+            const Vector<T_Type, T_Dim, T_Accessor, T_Navigator, T_Storage>& lhs,
+            const Vector<T_Type, T_Dim, T_OtherAccessor, T_OtherNavigator, T_OtherStorage>& rhs)
+        {
+            /* component-wise product of lhs and rhs; it is returned as a vector
+             * with default policies to avoid allocation side effects */
+            Vector<T_Type, T_Dim> product(lhs);
+            product *= rhs;
+            return product;
+        }
 
-    HDINLINE This& operator-=(ParamType other)
-    {
+        template<
+            typename T_Type,
+            int T_Dim,
+            typename T_Accessor,
+            typename T_Navigator,
+            template<typename, int>
+            class T_Storage,
+            typename T_OtherAccessor,
+            typename T_OtherNavigator,
+            template<typename, int>
+            class T_OtherStorage>
+        HDINLINE Vector<T_Type, T_Dim> operator/(
+            const Vector<T_Type, T_Dim, T_Accessor, T_Navigator, T_Storage>& lhs,
+            const Vector<T_Type, T_Dim, T_OtherAccessor, T_OtherNavigator, T_OtherStorage>& rhs)
+        {
+            /* component-wise quotient of lhs and rhs; it is returned as a vector
+             * with default policies to avoid allocation side effects */
+            Vector<T_Type, T_Dim> quotient(lhs);
+            quotient /= rhs;
+            return quotient;
+        }
 
-        for (int i = 0; i < dim; i++)
-            (*this)[i] -= other;
-        return *this;
-    }
+        template<
+            typename T_Type,
+            int T_Dim,
+            typename T_Accessor,
+            typename T_Navigator,
+            template<typename, int>
+            class T_Storage>
+        HDINLINE Vector<T_Type, T_Dim> operator*(
+            const Vector<T_Type, T_Dim, T_Accessor, T_Navigator, T_Storage>& lhs,
+            typename Vector<T_Type, T_Dim, T_Accessor, T_Navigator, T_Storage>::ParamType rhs)
+        {
+            /* every component of a copy of lhs is multiplied by rhs; the copy
+             * uses default policies to avoid allocation side effects */
+            Vector<T_Type, T_Dim> scaled(lhs);
+            scaled *= rhs;
+            return scaled;
+        }
 
-    HDINLINE This& operator*=(ParamType other)
-    {
+        template<
+            typename T_Type,
+            int T_Dim,
+            typename T_Accessor,
+            typename T_Navigator,
+            template<typename, int>
+            class T_Storage>
+        HDINLINE Vector<T_Type, T_Dim> operator*(
+            typename boost::call_traits<T_Type>::param_type lhs,
+            const Vector<T_Type, T_Dim, T_Accessor, T_Navigator, T_Storage>& rhs)
+        {
+            /* every component of a copy of rhs is multiplied by lhs; the copy
+             * uses default policies to avoid allocation side effects */
+            Vector<T_Type, T_Dim> scaled(rhs);
+            scaled *= lhs;
+            return scaled;
+        }
 
-        for (int i = 0; i < dim; i++)
-            (*this)[i] *= other;
-        return *this;
-    }
+        template<
+            typename T_Type,
+            int T_Dim,
+            typename T_Accessor,
+            typename T_Navigator,
+            template<typename, int>
+            class T_Storage>
+        HDINLINE Vector<T_Type, T_Dim> operator/(
+            const Vector<T_Type, T_Dim, T_Accessor, T_Navigator, T_Storage>& lhs,
+            typename Vector<T_Type, T_Dim, T_Accessor, T_Navigator, T_Storage>::ParamType rhs)
+        {
+            /* every component of a copy of lhs is divided by rhs; the copy
+             * uses default policies to avoid allocation side effects */
+            Vector<T_Type, T_Dim> quotient(lhs);
+            quotient /= rhs;
+            return quotient;
+        }
 
-    HDINLINE This& operator/=(ParamType other)
-    {
+        template<
+            typename T_Type,
+            int T_Dim,
+            typename T_Accessor,
+            typename T_Navigator,
+            template<typename, int>
+            class T_Storage>
+        HDINLINE Vector<T_Type, T_Dim> operator-(const Vector<T_Type, T_Dim, T_Accessor, T_Navigator, T_Storage>& vec)
+        {
+            /* every component is negated on a copy of vec; the copy uses default
+             * policies to avoid allocation side effects */
+            Vector<T_Type, T_Dim> result(vec);
 
-        for (int i = 0; i < dim; i++)
-            (*this)[i] /= other;
-        return *this;
-    }
-
-    /**
-     * == comparison operator.
-     *
-     * Compares sizes of two DataSpaces.
-     *
-     * @param other Vector to compare to
-     * @return true if all components in both vectors are equal, else false
-     */
-    HDINLINE bool operator==(const This& rhs) const
-    {
-        bool result = true;
-        for (int i = 0; i < dim; i++)
-            result = result && ((*this)[i] == rhs[i]);
-        return result;
-    }
-
-    /**
-     * != comparison operator.
-     *
-     * Compares sizes of two DataSpaces.
-     *
-     * @param other Vector to compare to
-     * @return true if one component in both vectors are not equal, else false
-     */
-    HDINLINE bool operator!=(const This& rhs) const
-    {
+            for(int i = 0; i < T_Dim; i++)
+                result[i] = -result[i];
+            return result;
+        }
 
-        return !((*this) == rhs);
-    }
-
-    /** create string out of the vector
-     *
-     * @param separator string to separate components of the vector
-     * @param enclosings string with size 2 to enclose vector
-     *                   size == 0 ? no enclose symbols
-     *                   size == 1 ? means enclose symbol begin and end are equal
-     *                   size >= 2 ? letter[0] = begin enclose symbol
-     *                               letter[1] = end enclose symbol
-     *
-     * example:
-     * .toString(";","|")     -> |x;...;z|
-     * .toString(",","[]")    -> [x,...,z]
-     */
-    std::string toString(const std::string separator = ",", const std::string enclosings = "{}") const
-    {
-        std::string locale_enclosing_begin;
-        std::string locale_enclosing_end;
-        size_t enclosing_size=enclosings.size();
+        template<
+            typename T_Type,
+            int T_Dim,
+            typename T_Accessor,
+            typename T_Navigator,
+            template<typename, int>
+            class T_Storage,
+            typename T_OtherAccessor,
+            typename T_OtherNavigator,
+            template<typename, int>
+            class T_OtherStorage>
+        HDINLINE Vector<bool, T_Dim> operator>=(
+            const Vector<T_Type, T_Dim, T_Accessor, T_Navigator, T_Storage>& lhs,
+            const Vector<T_Type, T_Dim, T_OtherAccessor, T_OtherNavigator, T_OtherStorage>& rhs)
+        {
+            /* one flag per component (lhs >= rhs); the flags are returned in a
+             * vector with default policies to avoid allocation side effects */
+            Vector<bool, T_Dim> flags;
+            for(int d = 0; d < T_Dim; ++d)
+                flags[d] = (lhs[d] >= rhs[d]);
+            return flags;
+        }
 
-        if(enclosing_size > 0)
+        template<
+            typename T_Type,
+            typename T_Accessor,
+            typename T_Navigator,
+            template<typename, int>
+            class T_Storage,
+            typename T_OtherAccessor,
+            typename T_OtherNavigator,
+            template<typename, int>
+            class T_OtherStorage>
+        HDINLINE T_Type linearize(
+            const Vector<T_Type, 1, T_Accessor, T_Navigator, T_Storage>& size,
+            const Vector<T_Type, 2, T_OtherAccessor, T_OtherNavigator, T_OtherStorage>& pos)
         {
-            /* % avoid out of memory access */
-            locale_enclosing_begin=enclosings[0%enclosing_size];
-            locale_enclosing_end=enclosings[1%enclosing_size];
+            return pos.y() * size.x() + pos.x(); // 2D position -> linear index, x varies fastest
         }
 
-        std::stringstream stream;
-        stream << locale_enclosing_begin << (*this)[0];
+        template<
+            typename T_Type,
+            typename T_Accessor,
+            typename T_Navigator,
+            template<typename, int>
+            class T_Storage,
+            typename T_OtherAccessor,
+            typename T_OtherNavigator,
+            template<typename, int>
+            class T_OtherStorage>
+        HDINLINE T_Type linearize(
+            const Vector<T_Type, 2, T_Accessor, T_Navigator, T_Storage>& size,
+            const Vector<T_Type, 3, T_OtherAccessor, T_OtherNavigator, T_OtherStorage>& pos)
+        {
+            return pos.z() * size.x() * size.y() + pos.y() * size.x() + pos.x(); // linear index: x fastest, z slowest
+        }
 
-        for (int i = 1; i < dim; ++i)
-            stream << separator << (*this)[i];
-        stream << locale_enclosing_end;
-        return stream.str();
-    }
 
-    HDINLINE dim3 toDim3() const
-    {
-        dim3 result;
-        unsigned int* ptr = &result.x;
-        for (int d = 0; d < dim; ++d)
-            ptr[d] = (*this)[d];
-        return result;
-    }
-};
-
-template<typename Type>
-struct Vector<Type, 0 >
-{
-    using type = Type;
-    static constexpr int dim = 0;
+        template<typename Lhs, typename Rhs>
+        HDINLINE Lhs operator%(const Lhs& lhs, const Rhs& rhs) // applies % per component for i in [0, Lhs::dim)
+        {
+            Lhs result;
 
-    template<typename OtherType >
-    HDINLINE operator Vector<OtherType, 0 > () const
-    {
-        return Vector<OtherType, 0 > ();
-    }
-
-    /**
-     * == comparison operator.
-     *
-     * Returns always true
-     */
-    HDINLINE bool operator==(const Vector& rhs) const
-    {
-        return true;
-    }
-
-    /**
-     * != comparison operator.
-     *
-     * Returns always false
-     */
-    HDINLINE bool operator!=(const Vector& rhs) const
-    {
-        return false;
-    }
+            for(int i = 0; i < Lhs::dim; i++)
+                result[i] = lhs[i] % rhs[i]; // rhs is indexed up to Lhs::dim - 1 as well
+            return result;
+        }
 
-    HDINLINE
-    static Vector create(Type)
-    {
-        /* this method should never be actually called,
-         * it exists only for Visual Studio to handle pmacc::math::Size_t< 0 >
+        struct Abs // functor that forwards a vector to cupla::math::abs
+        {
+            template<typename Type, int dim>
+            HDINLINE Type operator()(const Vector<Type, dim>& vec)
+            {
+                return cupla::math::abs(vec); // NOTE(review): presumably the vector length - confirm
+            }
+        };
+
+        /** Get the unit basis vector of the given type along the given direction
+         *
+         * In case 0 <= T_direction < T_Vector::dim, return the basis vector with value
+         * 1 in component T_direction and 0 in other components, otherwise return the
+         * zero vector.
+         *
+         * @tparam T_Vector result type
+         * @tparam T_direction index of the basis vector direction
          */
-        PMACC_CASSERT_MSG(Vector_dim_0_create_cannot_be_called, false);
-    }
-};
-
-template<typename Type, int dim, typename Accessor, typename Navigator>
-std::ostream& operator<<(std::ostream& s, const Vector<Type, dim, Accessor, Navigator>& vec)
-{
-
-    return s << vec.toString();
-}
-
-template<typename T_Type, int T_Dim,
-typename T_Accessor,
-typename T_Navigator,
-template <typename, int> class T_Storage,
-typename T_OtherAccessor,
-typename T_OtherNavigator,
-template <typename, int> class T_OtherStorage
->
-HDINLINE Vector<T_Type, T_Dim>
-operator+(const Vector<T_Type, T_Dim, T_Accessor, T_Navigator, T_Storage>& lhs,
-          const Vector<T_Type, T_Dim, T_OtherAccessor, T_OtherNavigator, T_OtherStorage>& rhs)
-{
-    /* to avoid allocation side effects the result is always a vector
-     * with default policies*/
-    Vector<T_Type, T_Dim> result(lhs);
-    result += rhs;
-    return result;
-}
-
-template<typename T_Type, int T_Dim,
-typename T_Accessor,
-typename T_Navigator,
-template <typename, int> class T_Storage
->
-HDINLINE Vector<T_Type, T_Dim>
-operator+(const Vector<T_Type, T_Dim, T_Accessor, T_Navigator, T_Storage>& lhs,
-          typename Vector<T_Type, T_Dim, T_Accessor, T_Navigator, T_Storage>::ParamType rhs)
-{
-    /* to avoid allocation side effects the result is always a vector
-     * with default policies*/
-    Vector<T_Type, T_Dim> result(lhs);
-    result += rhs;
-    return result;
-}
-
-template<typename T_Type, int T_Dim,
-typename T_Accessor,
-typename T_Navigator,
-template <typename, int> class T_Storage,
-typename T_OtherAccessor,
-typename T_OtherNavigator,
-template <typename, int> class T_OtherStorage
->
-HDINLINE Vector<T_Type, T_Dim>
-operator-(const Vector<T_Type, T_Dim, T_Accessor, T_Navigator, T_Storage>& lhs,
-          const Vector<T_Type, T_Dim, T_OtherAccessor, T_OtherNavigator, T_OtherStorage>& rhs)
-{
-    /* to avoid allocation side effects the result is always a vector
-     * with default policies*/
-    Vector<T_Type, T_Dim> result(lhs);
-    result -= rhs;
-    return result;
-}
-
-template<typename T_Type, int T_Dim,
-typename T_Accessor,
-typename T_Navigator,
-template <typename, int> class T_Storage
->
-HDINLINE Vector<T_Type, T_Dim>
-operator-(const Vector<T_Type, T_Dim, T_Accessor, T_Navigator, T_Storage>& lhs,
-          typename Vector<T_Type, T_Dim, T_Accessor, T_Navigator, T_Storage>::ParamType rhs)
-{
-    /* to avoid allocation side effects the result is always a vector
-     * with default policies*/
-    Vector<T_Type, T_Dim> result(lhs);
-    result -= rhs;
-    return result;
-}
-
-template<typename T_Type, int T_Dim,
-typename T_Accessor,
-typename T_Navigator,
-template <typename, int> class T_Storage,
-typename T_OtherAccessor,
-typename T_OtherNavigator,
-template <typename, int> class T_OtherStorage
->
-HDINLINE Vector<T_Type, T_Dim>
-operator*(const Vector<T_Type, T_Dim, T_Accessor, T_Navigator, T_Storage>& lhs,
-          const Vector<T_Type, T_Dim, T_OtherAccessor, T_OtherNavigator, T_OtherStorage>& rhs)
-{
-    /* to avoid allocation side effects the result is always a vector
-     * with default policies*/
-    Vector<T_Type, T_Dim> result(lhs);
-    result *= rhs;
-    return result;
-}
-
-template<
-typename T_Type, int T_Dim,
-typename T_Accessor,
-typename T_Navigator,
-template <typename, int> class T_Storage,
-typename T_OtherAccessor,
-typename T_OtherNavigator,
-template <typename, int> class T_OtherStorage
->
-HDINLINE Vector<T_Type, T_Dim>
-operator/(const Vector<T_Type, T_Dim, T_Accessor, T_Navigator, T_Storage>& lhs,
-          const Vector<T_Type, T_Dim, T_OtherAccessor, T_OtherNavigator, T_OtherStorage>& rhs)
-{
-    /* to avoid allocation side effects the result is always a vector
-     * with default policies*/
-    Vector<T_Type, T_Dim> result(lhs);
-    result /= rhs;
-    return result;
-}
-
-template<
-typename T_Type, int T_Dim,
-typename T_Accessor,
-typename T_Navigator,
-template <typename, int> class T_Storage
->
-HDINLINE Vector<T_Type, T_Dim>
-operator*(const Vector<T_Type, T_Dim, T_Accessor, T_Navigator, T_Storage>& lhs,
-          typename Vector<T_Type, T_Dim, T_Accessor, T_Navigator, T_Storage>::ParamType rhs)
-{
-    /* to avoid allocation side effects the result is always a vector
-     * with default policies*/
-    Vector<T_Type, T_Dim> result(lhs);
-    result *= rhs;
-    return result;
-}
-
-template<
-typename T_Type, int T_Dim,
-typename T_Accessor,
-typename T_Navigator,
-template <typename, int> class T_Storage
->
-HDINLINE Vector<T_Type, T_Dim>
-operator*(typename boost::call_traits<T_Type>::param_type lhs,
-          const Vector<T_Type, T_Dim, T_Accessor, T_Navigator, T_Storage>& rhs)
-{
-    /* to avoid allocation side effects the result is always a vector
-     * with default policies*/
-    Vector<T_Type, T_Dim> result(rhs);
-    result *= lhs;
-    return result;
-}
-
-template<
-typename T_Type, int T_Dim,
-typename T_Accessor,
-typename T_Navigator,
-template <typename, int> class T_Storage
->
-HDINLINE Vector<T_Type, T_Dim>
-operator/(const Vector<T_Type, T_Dim, T_Accessor, T_Navigator, T_Storage>& lhs,
-          typename Vector<T_Type, T_Dim, T_Accessor, T_Navigator, T_Storage>::ParamType rhs)
-{
-    /* to avoid allocation side effects the result is always a vector
-     * with default policies*/
-    Vector<T_Type, T_Dim> result(lhs);
-    result /= rhs;
-    return result;
-}
-
-template<
-typename T_Type, int T_Dim,
-typename T_Accessor,
-typename T_Navigator,
-template <typename, int> class T_Storage
->
-HDINLINE Vector<T_Type, T_Dim>
-operator-(const Vector<T_Type, T_Dim, T_Accessor, T_Navigator, T_Storage>& vec)
-{
-    /* to avoid allocation side effects the result is always a vector
-     * with default policies*/
-    Vector<T_Type, T_Dim> result(vec);
-
-    for (int i = 0; i < T_Dim; i++)
-        result[i] = -result[i];
-    return result;
-}
-
-template<
-typename T_Type, int T_Dim,
-typename T_Accessor,
-typename T_Navigator,
-template <typename, int> class T_Storage,
-typename T_OtherAccessor,
-typename T_OtherNavigator,
-template <typename, int> class T_OtherStorage
->
-HDINLINE Vector<bool, T_Dim>
-operator>=(const Vector<T_Type, T_Dim, T_Accessor, T_Navigator, T_Storage>& lhs,
-           const Vector<T_Type, T_Dim, T_OtherAccessor, T_OtherNavigator, T_OtherStorage>& rhs)
-{
-    /* to avoid allocation side effects the result is always a vector
-     * with default policies*/
-    Vector<bool, T_Dim > result;
-    for (int i = 0; i < T_Dim; ++i)
-        result[i] = (lhs[i] >= rhs[i]);
-    return result;
-}
-
-template<
-typename T_Type,
-typename T_Accessor,
-typename T_Navigator,
-template <typename, int> class T_Storage,
-typename T_OtherAccessor,
-typename T_OtherNavigator,
-template <typename, int> class T_OtherStorage
->
-HDINLINE T_Type
-linearize(const Vector<T_Type, 1, T_Accessor, T_Navigator, T_Storage >& size,
-          const Vector<T_Type, 2, T_OtherAccessor, T_OtherNavigator, T_OtherStorage>& pos)
-{
-    return pos.y() * size.x() + pos.x();
-}
-
-template<
-typename T_Type,
-typename T_Accessor,
-typename T_Navigator,
-template <typename, int> class T_Storage,
-typename T_OtherAccessor,
-typename T_OtherNavigator,
-template <typename, int> class T_OtherStorage
->
-HDINLINE T_Type
-linearize(const Vector<T_Type, 2, T_Accessor, T_Navigator, T_Storage >& size,
-          const Vector<T_Type, 3, T_OtherAccessor, T_OtherNavigator, T_OtherStorage>& pos)
-{
-    return pos.z() * size.x() * size.y() + pos.y() * size.x() + pos.x();
-}
+        template<typename T_Vector, uint32_t T_direction>
+        HDINLINE T_Vector basisVector();
 
+    } // namespace math
 
-template<typename Lhs, typename Rhs>
-HDINLINE Lhs operator%(const Lhs& lhs, const Rhs& rhs)
-{
-    Lhs result;
-
-    for (int i = 0; i < Lhs::dim; i++)
-        result[i] = lhs[i] % rhs[i];
-    return result;
-}
-
-struct Abs2
-{
-    template<typename Type, int dim >
-    HDINLINE Type operator()(const Vector<Type, dim>& vec)
-    {
-        return pmacc::algorithms::math::abs2(vec);
-    }
-};
-
-struct Abs
-{
-    template<typename Type, int dim >
-    HDINLINE Type operator()(const Vector<Type, dim>& vec)
+    namespace result_of
     {
-        return pmacc::algorithms::math::abs(vec);
-    }
-};
-
-} //namespace math
-
-namespace result_of
-{
-
-template<typename TVector>
-struct Functor<math::Abs2, TVector>
-{
-    using type = typename TVector::type;
-};
-
-template<typename TVector>
-struct Functor<math::Abs, TVector>
-{
-    using type = typename TVector::type;
-};
+        template<typename TVector>
+        struct Functor<math::Abs, TVector>
+        {
+            using type = typename TVector::type;
+        };
 
-} //namespace result_of
-} //namespace pmacc
+    } // namespace result_of
+} // namespace pmacc
diff --git a/include/pmacc/math/vector/Vector.tpp b/include/pmacc/math/vector/Vector.tpp
index c22243fabd..70a059f943 100644
--- a/include/pmacc/math/vector/Vector.tpp
+++ b/include/pmacc/math/vector/Vector.tpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Heiko Burau, Rene Widera, Benjamin Worpitz,
+/* Copyright 2013-2021 Axel Huebl, Heiko Burau, Rene Widera, Benjamin Worpitz,
  *                     Sergei Bastrakov
  *
  * This file is part of PMacc.
@@ -25,6 +25,7 @@
 
 
 #include "pmacc/math/Vector.hpp"
+#include "pmacc/math/vector/compile-time/Vector.hpp"
 #include "pmacc/algorithms/math.hpp"
 #include "pmacc/algorithms/TypeCast.hpp"
 #include "pmacc/algorithms/PromoteType.hpp"
@@ -33,332 +34,329 @@
 #include "pmacc/traits/GetNComponents.hpp"
 #include "pmacc/traits/GetInitializedInstance.hpp"
 
-namespace pmacc
-{
-namespace traits
-{
-
-template<typename T_DataType, int T_Dim>
-struct GetComponentsType<pmacc::math::Vector<T_DataType, T_Dim>, false >
-{
-    using type = typename pmacc::math::Vector<T_DataType, T_Dim>::type;
-};
-
-template<typename T_DataType, int T_Dim>
-struct GetNComponents<pmacc::math::Vector<T_DataType, T_Dim>,false >
-{
-    static constexpr uint32_t value = (uint32_t) pmacc::math::Vector<T_DataType, T_Dim>::dim;
-};
+#include <utility>
 
-template<typename T_Type, int T_dim, typename T_Accessor, typename T_Navigator, template<typename, int> class T_Storage>
-struct GetInitializedInstance<math::Vector<T_Type, T_dim, T_Accessor, T_Navigator, T_Storage> >
+namespace pmacc
 {
-    using Type = math::Vector<T_Type, T_dim, T_Accessor, T_Navigator, T_Storage>;
-    using ValueType = typename Type::type;
-
-    HDINLINE Type operator()(const ValueType value) const
+    namespace traits
     {
-        return Type::create(value);
-    }
-};
-
-} // namespace traits
+        template<typename T_DataType, int T_Dim>
+        struct GetComponentsType<pmacc::math::Vector<T_DataType, T_Dim>, false>
+        {
+            using type = typename pmacc::math::Vector<T_DataType, T_Dim>::type;
+        };
+
+        template<typename T_DataType, int T_Dim>
+        struct GetNComponents<pmacc::math::Vector<T_DataType, T_Dim>, false>
+        {
+            static constexpr uint32_t value = (uint32_t) pmacc::math::Vector<T_DataType, T_Dim>::dim;
+        };
+
+        template<
+            typename T_Type,
+            int T_dim,
+            typename T_Accessor,
+            typename T_Navigator,
+            template<typename, int>
+            class T_Storage>
+        struct GetInitializedInstance<math::Vector<T_Type, T_dim, T_Accessor, T_Navigator, T_Storage>>
+        {
+            using Type = math::Vector<T_Type, T_dim, T_Accessor, T_Navigator, T_Storage>;
+            using ValueType = typename Type::type;
+
+            HDINLINE Type operator()(const ValueType value) const
+            {
+                return Type::create(value);
+            }
+        };
+
+    } // namespace traits
 } // namespace pmacc
 
 
 namespace pmacc
 {
-namespace algorithms
-{
-namespace math
-{
-
-/*#### comparison ############################################################*/
-
-/*specialize max algorithm*/
-template<typename Type, int dim>
-struct Max< ::pmacc::math::Vector<Type, dim>, ::pmacc::math::Vector<Type, dim> >
-{
-    using result = ::pmacc::math::Vector<Type, dim>;
-
-    HDINLINE result operator( )(const ::pmacc::math::Vector<Type, dim> &vector1, const ::pmacc::math::Vector<Type, dim> &vector2 )
+    namespace math
     {
-        result tmp;
-        for ( int i = 0; i < dim; ++i )
-            tmp[i] = pmacc::algorithms::math::max( vector1[i], vector2[i] );
-        return tmp;
-    }
-};
-
-/*specialize max algorithm*/
-template<typename Type, int dim>
-struct Min< ::pmacc::math::Vector<Type, dim>, ::pmacc::math::Vector<Type, dim> >
-{
-    using result = ::pmacc::math::Vector<Type, dim>;
-
-    HDINLINE result operator( )(const ::pmacc::math::Vector<Type, dim> &vector1, const ::pmacc::math::Vector<Type, dim> &vector2 )
-    {
-        result tmp;
-        for ( int i = 0; i < dim; ++i )
-            tmp[i] = pmacc::algorithms::math::min( vector1[i], vector2[i] );
-        return tmp;
-    }
-};
-
-/*#### abs ###################################################################*/
-
-/*specialize abs2 algorithm*/
-template<typename Type, int dim>
-struct Abs2< ::pmacc::math::Vector<Type, dim> >
-{
-    using result = typename ::pmacc::math::Vector<Type, dim>::type;
-
-    HDINLINE result operator( )(const ::pmacc::math::Vector<Type, dim> &vector )
-    {
-        result tmp = pmacc::algorithms::math::abs2( vector.x( ) );
-        for ( int i = 1; i < dim; ++i )
-            tmp += pmacc::algorithms::math::abs2( vector[i] );
-        return tmp;
-    }
-};
-
-/*specialize abs algorithm*/
-template<typename Type, int dim>
-struct Abs< ::pmacc::math::Vector<Type, dim> >
-{
-    using result = typename ::pmacc::math::Vector<Type, dim>::type;
-
-    HDINLINE result operator( )( ::pmacc::math::Vector<Type, dim> vector )
-    {
-        const result tmp = pmacc::algorithms::math::abs2( vector );
-        return pmacc::algorithms::math::sqrt( tmp );
-    }
-};
-
-/*#### cross #################################################################*/
-
-template<typename Type>
-struct Cross< ::pmacc::math::Vector<Type, DIM3>, ::pmacc::math::Vector<Type, DIM3> >
-{
-    using myType = ::pmacc::math::Vector<Type, DIM3>;
-    using result = myType;
-
-    HDINLINE myType operator( )(const myType& lhs, const myType & rhs )
-    {
-        return myType( lhs.y( ) * rhs.z( ) - lhs.z( ) * rhs.y( ),
-                       lhs.z( ) * rhs.x( ) - lhs.x( ) * rhs.z( ),
-                       lhs.x( ) * rhs.y( ) - lhs.y( ) * rhs.x( ) );
-    }
-};
-
-/*#### dot ###################################################################*/
-
-template<typename Type, int dim>
-struct Dot< ::pmacc::math::Vector<Type, dim>, ::pmacc::math::Vector<Type, dim> >
-{
-    using myType = ::pmacc::math::Vector<Type, dim>;
-    using result = Type;
-
-    HDINLINE result operator( )(const myType& a, const myType & b )
-    {
-        BOOST_STATIC_ASSERT( dim > 0 );
-        result tmp = a.x( ) * b.x( );
-        for ( int i = 1; i < dim; i++ )
-            tmp += a[i] * b[i];
-        return tmp;
-    }
-};
-
-/*#### exp ###################################################################*/
-
-/*! Specialization of exp where power is a vector
- *
- * Compute exp separately for every component of the vector.
- *
- * @param power vector with power values
- */
-template<typename T1, int dim>
-struct Exp< ::pmacc::math::Vector<T1, dim> >
-{
-    using Vector1 = ::pmacc::math::Vector<T1, dim>;
-    using result = Vector1;
-
-    HDINLINE result operator( )(const Vector1& power )
-    {
-        BOOST_STATIC_ASSERT( dim > 0 );
-        result tmp;
-        for ( int i = 0; i < dim; ++i )
-            tmp[i] = pmacc::algorithms::math::exp( power[i] );
-        return tmp;
-    }
-};
+        /*! Specialisation of Max for two vectors: component-wise maximum */
+        template<typename Type, int dim>
+        struct Max<::pmacc::math::Vector<Type, dim>, ::pmacc::math::Vector<Type, dim>>
+        {
+            using result = ::pmacc::math::Vector<Type, dim>;
+
+            HDINLINE result operator()(
+                const ::pmacc::math::Vector<Type, dim>& vector1,
+                const ::pmacc::math::Vector<Type, dim>& vector2)
+            {
+                result tmp;
+                for(int i = 0; i < dim; ++i)
+                    tmp[i] = pmacc::math::max(vector1[i], vector2[i]);
+                return tmp;
+            }
+        };
+
+        /*! Specialisation of Min for two vectors: component-wise minimum */
+        template<typename Type, int dim>
+        struct Min<::pmacc::math::Vector<Type, dim>, ::pmacc::math::Vector<Type, dim>>
+        {
+            using result = ::pmacc::math::Vector<Type, dim>;
+
+            HDINLINE result operator()(
+                const ::pmacc::math::Vector<Type, dim>& vector1,
+                const ::pmacc::math::Vector<Type, dim>& vector2)
+            {
+                result tmp;
+                for(int i = 0; i < dim; ++i)
+                    tmp[i] = pmacc::math::min(vector1[i], vector2[i]);
+                return tmp;
+            }
+        };
+
+        /*! Specialisation of Cross for two vectors with three components each */
+        template<typename Type>
+        struct Cross<::pmacc::math::Vector<Type, DIM3>, ::pmacc::math::Vector<Type, DIM3>>
+        {
+            using myType = ::pmacc::math::Vector<Type, DIM3>;
+            using result = myType;
+
+            HDINLINE myType operator()(const myType& lhs, const myType& rhs)
+            {
+                return myType(
+                    lhs.y() * rhs.z() - lhs.z() * rhs.y(),
+                    lhs.z() * rhs.x() - lhs.x() * rhs.z(),
+                    lhs.x() * rhs.y() - lhs.y() * rhs.x());
+            }
+        };
+
+        /*! Specialisation of Dot for two vectors of the same type and dimension */
+        template<typename Type, int dim>
+        struct Dot<::pmacc::math::Vector<Type, dim>, ::pmacc::math::Vector<Type, dim>>
+        {
+            using myType = ::pmacc::math::Vector<Type, dim>;
+            using result = Type;
+
+            HDINLINE result operator()(const myType& a, const myType& b)
+            {
+                PMACC_CASSERT(dim > 0);
+                result tmp = a.x() * b.x();
+                for(int i = 1; i < dim; i++)
+                    tmp += a[i] * b[i];
+                return tmp;
+            }
+        };
+
+        /*! Specialisation of Abs2 for a vector: sum of abs2 over all components */
+        template<typename Type, int dim>
+        struct Abs2<::pmacc::math::Vector<Type, dim>>
+        {
+            using result = typename ::pmacc::math::Vector<Type, dim>::type;
+
+            HDINLINE result operator()(const ::pmacc::math::Vector<Type, dim>& vector)
+            {
+                result tmp = pmacc::math::abs2(vector.x());
+                for(int i = 1; i < dim; ++i)
+                    tmp += pmacc::math::abs2(vector[i]);
+                return tmp;
+            }
+        };
+
+        template<typename T_Vector, uint32_t T_direction>
+        HDINLINE T_Vector basisVector()
+        {
+            using Result = typename CT::make_BasisVector<T_Vector::dim, T_direction, typename T_Vector::type>::type;
+            return Result::toRT();
+        }
+
+    } // namespace math
+} // namespace pmacc
 
-/*#### pow ###################################################################*/
 
-/*! Specialisation of pow where base is a vector and exponent is a scalar
- *
- * Create pow separatley for every component of the vector.
- *
- * @prama base vector with base values
- * @param exponent scalar with exponent value
+/* Using the free alpaka functions `alpaka::math::*` results in `__host__ __device__`
+ * errors, therefore the alpaka math trait must be used.
  */
-template<typename T1, typename T2, int dim>
-struct Pow< ::pmacc::math::Vector<T1, dim>, T2 >
-{
-    using Vector1 = ::pmacc::math::Vector<T1, dim>;
-    using result = Vector1;
-
-    HDINLINE result operator( )(const Vector1& base, const T2 & exponent )
-    {
-        BOOST_STATIC_ASSERT( dim > 0 );
-        result tmp;
-        for ( int i = 0; i < dim; ++i )
-            tmp[i] = pmacc::algorithms::math::pow( base[i], exponent );
-        return tmp;
+#define PMACC_UNARY_APAKA_MATH_SPECIALIZATION(functionName, alpakaMathTrait)                                          \
+    template<typename T_Ctx, typename T_ScalarType, int T_dim>                                                        \
+    struct alpakaMathTrait<T_Ctx, ::pmacc::math::Vector<T_ScalarType, T_dim>, void>                                   \
+    {                                                                                                                 \
+        using ResultType = ::pmacc::math::Vector<T_ScalarType, T_dim>;                                                \
+                                                                                                                      \
+        ALPAKA_FN_ACC static auto functionName(                                                                       \
+            T_Ctx const& mathConcept,                                                                                 \
+            ::pmacc::math::Vector<T_ScalarType, T_dim> const& vector) -> ResultType                                   \
+        {                                                                                                             \
+            PMACC_CASSERT(T_dim > 0);                                                                                 \
+                                                                                                                      \
+            ResultType tmp;                                                                                           \
+            for(int i = 0; i < T_dim; ++i)                                                                            \
+                tmp[i] = alpaka::math::functionName(mathConcept, vector[i]);                                          \
+            return tmp;                                                                                               \
+        }                                                                                                             \
     }
-};
-
-/*#### floor #################################################################*/
 
-/*specialize floor algorithm*/
-template<typename Type, int dim>
-struct Floor< ::pmacc::math::Vector<Type, dim> >
+namespace alpaka
 {
-    using result = ::pmacc::math::Vector<Type, dim>;
-
-    HDINLINE result operator( )( ::pmacc::math::Vector<Type, dim> &vector )
+    namespace math
     {
-        result tmp;
-        for ( int i = 0; i < dim; ++i )
-            tmp[i] = pmacc::algorithms::math::floor( vector[i] );
-        return tmp;
-    }
-};
-
-
-} // namespace math
-} // namespace algorithms
-} // namespace pmacc
+        namespace traits
+        {
+            /*! Specialisation of pow where base is a vector and exponent is a scalar
+             *
+             * Computes pow separately for every component; the result is a vector, not a scalar.
+             */
+            template<typename T_Ctx, typename T_ScalarType, int T_dim>
+            struct Pow<T_Ctx, ::pmacc::math::Vector<T_ScalarType, T_dim>, T_ScalarType, void>
+            {
+                using ResultType = ::pmacc::math::Vector<T_ScalarType, T_dim>;
+
+                ALPAKA_FN_HOST_ACC static auto pow(
+                    T_Ctx const& mathConcept,
+                    ::pmacc::math::Vector<T_ScalarType, T_dim> const& vector,
+                    T_ScalarType const& exponent) -> ResultType
+                {
+                    PMACC_CASSERT(T_dim > 0);
+                    ResultType tmp;
+                    for(int i = 0; i < T_dim; ++i)
+                        tmp[i] = cupla::pow(vector[i], exponent);
+                    return tmp;
+                }
+            };
+
+            // Exp specialization
+            PMACC_UNARY_APAKA_MATH_SPECIALIZATION(exp, Exp);
+
+            // Floor specialization
+            PMACC_UNARY_APAKA_MATH_SPECIALIZATION(floor, Floor);
+
+            /* Abs specialization
+             *
+             * Returns the length of the vector to fit the old implementation.
+             * @todo implement a math function magnitude instead of using abs to get the length of the vector.
+             */
+            template<typename T_Ctx, typename T_ScalarType, int T_dim>
+            struct Abs<T_Ctx, ::pmacc::math::Vector<T_ScalarType, T_dim>, void>
+            {
+                using ResultType = typename ::pmacc::math::Vector<T_ScalarType, T_dim>::type;
+
+                ALPAKA_FN_HOST_ACC static auto abs(
+                    T_Ctx const& mathConcept,
+                    ::pmacc::math::Vector<T_ScalarType, T_dim> const& vector) -> ResultType
+                {
+                    PMACC_CASSERT(T_dim > 0);
+
+                    ResultType const tmp = pmacc::math::abs2(vector);
+                    return cupla::math::sqrt(tmp);
+                }
+            };
+
+        } // namespace traits
+    } // namespace math
+} // namespace alpaka
 
 namespace pmacc
 {
-namespace algorithms
-{
-namespace precisionCast
-{
-
-template<typename CastToType,
-int dim,
-typename T_Accessor,
-typename T_Navigator,
-template <typename, int> class T_Storage>
-struct TypeCast<
-    CastToType,
-    ::pmacc::math::Vector<CastToType, dim, T_Accessor, T_Navigator, T_Storage>
->
-{
-    using result = const ::pmacc::math::Vector<
-        CastToType,
-        dim,
-        T_Accessor,
-        T_Navigator,
-        T_Storage>&;
-
-    HDINLINE result operator( )( result vector ) const
+    namespace algorithms
     {
-        return vector;
-    }
-};
-
-template<typename CastToType,
-typename OldType,
-int dim,
-typename T_Accessor,
-typename T_Navigator,
-template <typename, int> class T_Storage>
-struct TypeCast<
-    CastToType,
-    ::pmacc::math::Vector<OldType, dim, T_Accessor, T_Navigator, T_Storage>
->
-{
-    using result = ::pmacc::math::Vector<CastToType, dim>;
-    using ParamType = ::pmacc::math::Vector<OldType, dim, T_Accessor, T_Navigator, T_Storage>;
-
-    HDINLINE result operator( )(const ParamType& vector ) const
-    {
-        return result( vector );
-    }
-};
-
-} // namespace typecast
-} // namespace algorithms
+        namespace precisionCast
+        {
+            template<
+                typename CastToType,
+                int dim,
+                typename T_Accessor,
+                typename T_Navigator,
+                template<typename, int>
+                class T_Storage>
+            struct TypeCast<CastToType, ::pmacc::math::Vector<CastToType, dim, T_Accessor, T_Navigator, T_Storage>>
+            {
+                using result = const ::pmacc::math::Vector<CastToType, dim, T_Accessor, T_Navigator, T_Storage>&;
+
+                HDINLINE result operator()(result vector) const
+                {
+                    return vector;
+                }
+            };
+
+            template<
+                typename CastToType,
+                typename OldType,
+                int dim,
+                typename T_Accessor,
+                typename T_Navigator,
+                template<typename, int>
+                class T_Storage>
+            struct TypeCast<CastToType, ::pmacc::math::Vector<OldType, dim, T_Accessor, T_Navigator, T_Storage>>
+            {
+                using result = ::pmacc::math::Vector<CastToType, dim>;
+                using ParamType = ::pmacc::math::Vector<OldType, dim, T_Accessor, T_Navigator, T_Storage>;
+
+                HDINLINE result operator()(const ParamType& vector) const
+                {
+                    return result(vector);
+                }
+            };
+
+        } // namespace precisionCast
+    } // namespace algorithms
 } // namespace pmacc
 
 namespace pmacc
 {
-namespace algorithms
-{
-namespace promoteType
-{
-
-template<typename PromoteToType, typename OldType, int dim>
-struct promoteType<PromoteToType, ::pmacc::math::Vector<OldType, dim> >
-{
-    using PartType = typename promoteType<OldType, PromoteToType>::type;
-    using type = ::pmacc::math::Vector<PartType, dim>;
-};
-
-} // namespace promoteType
-} // namespace algorithms
+    namespace algorithms
+    {
+        namespace promoteType
+        {
+            template<typename PromoteToType, typename OldType, int dim>
+            struct promoteType<PromoteToType, ::pmacc::math::Vector<OldType, dim>>
+            {
+                using PartType = typename promoteType<OldType, PromoteToType>::type;
+                using type = ::pmacc::math::Vector<PartType, dim>;
+            };
+
+        } // namespace promoteType
+    } // namespace algorithms
 } // namespace pmacc
 
 namespace pmacc
 {
-namespace mpi
-{
-namespace def
-{
-
-template<int T_dim>
-struct GetMPI_StructAsArray< ::pmacc::math::Vector<float, T_dim> >
-{
-
-    MPI_StructAsArray operator( )( ) const
-    {
-        return MPI_StructAsArray( MPI_FLOAT, T_dim );
-    }
-};
-
-template<int T_dim, int T_N>
-struct GetMPI_StructAsArray< ::pmacc::math::Vector<float, T_dim>[T_N] >
-{
-
-    MPI_StructAsArray operator( )( ) const
+    namespace mpi
     {
-        return MPI_StructAsArray( MPI_FLOAT, T_dim * T_N );
-    }
-};
-
-template<int T_dim>
-struct GetMPI_StructAsArray< ::pmacc::math::Vector<double, T_dim> >
-{
-
-    MPI_StructAsArray operator( )( ) const
-    {
-        return MPI_StructAsArray( MPI_DOUBLE, T_dim );
-    }
-};
-
-template<int T_dim, int T_N>
-struct GetMPI_StructAsArray< ::pmacc::math::Vector<double, T_dim>[T_N] >
-{
-
-    MPI_StructAsArray operator( )( ) const
-    {
-        return MPI_StructAsArray( MPI_DOUBLE, T_dim * T_N  );
-    }
-};
-
-} // namespace def
-} // namespace mpi
+        namespace def
+        {
+            template<int T_dim>
+            struct GetMPI_StructAsArray<::pmacc::math::Vector<float, T_dim>>
+            {
+                MPI_StructAsArray operator()() const
+                {
+                    return MPI_StructAsArray(MPI_FLOAT, T_dim);
+                }
+            };
+
+            template<int T_dim, int T_N>
+            struct GetMPI_StructAsArray<::pmacc::math::Vector<float, T_dim>[T_N]>
+            {
+                MPI_StructAsArray operator()() const
+                {
+                    return MPI_StructAsArray(MPI_FLOAT, T_dim * T_N);
+                }
+            };
+
+            template<int T_dim>
+            struct GetMPI_StructAsArray<::pmacc::math::Vector<double, T_dim>>
+            {
+                MPI_StructAsArray operator()() const
+                {
+                    return MPI_StructAsArray(MPI_DOUBLE, T_dim);
+                }
+            };
+
+            template<int T_dim, int T_N>
+            struct GetMPI_StructAsArray<::pmacc::math::Vector<double, T_dim>[T_N]>
+            {
+                MPI_StructAsArray operator()() const
+                {
+                    return MPI_StructAsArray(MPI_DOUBLE, T_dim * T_N);
+                }
+            };
+
+        } // namespace def
+    } // namespace mpi
 } // namespace pmacc
diff --git a/include/pmacc/math/vector/accessor/StandardAccessor.hpp b/include/pmacc/math/vector/accessor/StandardAccessor.hpp
index 81484f330a..1030b50b87 100644
--- a/include/pmacc/math/vector/accessor/StandardAccessor.hpp
+++ b/include/pmacc/math/vector/accessor/StandardAccessor.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera, Benjamin Worpitz
+/* Copyright 2013-2021 Heiko Burau, Rene Widera, Benjamin Worpitz
  *
  * This file is part of PMacc.
  *
@@ -25,25 +25,23 @@
 
 namespace pmacc
 {
-namespace math
-{
-
-/** \todo rename this class to AccessorIdentity*/
-struct StandardAccessor
-{
-template<typename Data>
-HDINLINE Data& operator()(Data& data) const
-{
-    return data;
-}
-
-template<typename Data>
-HDINLINE const Data& operator()(const Data& data) const
-{
-    return data;
-}
+    namespace math
+    {
+        /** \todo rename this class to AccessorIdentity*/
+        struct StandardAccessor
+        {
+            template<typename Data>
+            HDINLINE Data& operator()(Data& data) const
+            {
+                return data;
+            }
 
-};
+            template<typename Data>
+            HDINLINE const Data& operator()(const Data& data) const
+            {
+                return data;
+            }
+        };
 
-} // math
-} // pmacc
+    } // namespace math
+} // namespace pmacc
diff --git a/include/pmacc/math/vector/compile-time/Float.hpp b/include/pmacc/math/vector/compile-time/Float.hpp
index 74edca93f7..3278d7ad27 100644
--- a/include/pmacc/math/vector/compile-time/Float.hpp
+++ b/include/pmacc/math/vector/compile-time/Float.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera
+/* Copyright 2013-2021 Heiko Burau, Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -28,43 +28,42 @@ namespace mpl = boost::mpl;
 
 namespace pmacc
 {
-namespace math
-{
-namespace CT
-{
-
-template<typename X = mpl::void_,
-         typename Y = mpl::void_,
-         typename Z = mpl::void_>
-struct Float
-{
-    using x = X;
-    using y = Y;
-    using z = Z;
+    namespace math
+    {
+        namespace CT
+        {
+            template<typename X = mpl::void_, typename Y = mpl::void_, typename Z = mpl::void_>
+            struct Float
+            {
+                using x = X;
+                using y = Y;
+                using z = Z;
 
-    static constexpr int dim = 3;
-};
+                static constexpr int dim = 3;
+            };
 
-template<>
-struct Float<> {};
+            template<>
+            struct Float<>
+            {
+            };
 
-template<typename X>
-struct Float<X>
-{
-    using x = X;
+            template<typename X>
+            struct Float<X>
+            {
+                using x = X;
 
-    static constexpr int dim = 1;
-};
+                static constexpr int dim = 1;
+            };
 
-template<typename X, typename Y>
-struct Float<X, Y>
-{
-    using x = X;
-    using y = Y;
+            template<typename X, typename Y>
+            struct Float<X, Y>
+            {
+                using x = X;
+                using y = Y;
 
-    static constexpr int dim = 2u;
-};
+                static constexpr int dim = 2u;
+            };
 
-} // CT
-} // math
-} // pmacc
+        } // namespace CT
+    } // namespace math
+} // namespace pmacc
diff --git a/include/pmacc/math/vector/compile-time/Int.hpp b/include/pmacc/math/vector/compile-time/Int.hpp
index 4334ff2366..324e9ee611 100644
--- a/include/pmacc/math/vector/compile-time/Int.hpp
+++ b/include/pmacc/math/vector/compile-time/Int.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera
+/* Copyright 2013-2021 Heiko Burau, Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -28,65 +28,65 @@
 
 namespace pmacc
 {
-namespace math
-{
-namespace CT
-{
+    namespace math
+    {
+        namespace CT
+        {
+            /** Compile time int vector
+             *
+             *
+             * @tparam x value for x allowed range [INT_MIN;INT_MAX-1]
+             * @tparam y value for y allowed range [INT_MIN;INT_MAX-1]
+             * @tparam z value for z allowed range [INT_MIN;INT_MAX-1]
+             *
+             * The default traits::limits::Max<int>::value marks a component as not set by the user,
+             * which is why that value is excluded from the allowed ranges above.
+             */
+            template<
+                int x = traits::limits::Max<int>::value,
+                int y = traits::limits::Max<int>::value,
+                int z = traits::limits::Max<int>::value>
+            struct Int : public CT::Vector<mpl::integral_c<int, x>, mpl::integral_c<int, y>, mpl::integral_c<int, z>>
+            {
+            };
 
-/** Compile time int vector
- *
- *
- * @tparam x value for x allowed range [INT_MIN;INT_MAX-1]
- * @tparam y value for y allowed range [INT_MIN;INT_MAX-1]
- * @tparam z value for z allowed range [INT_MIN;INT_MAX-1]
- *
- * default parameter is used to distinguish between values given by
- * the user and unset values.
- */
-template<int x = traits::limits::Max<int>::value,
-         int y = traits::limits::Max<int>::value,
-         int z = traits::limits::Max<int>::value>
-struct Int: public CT::Vector<mpl::integral_c<int, x>,
-                              mpl::integral_c<int, y>,
-                              mpl::integral_c<int, z> >
-{};
+            template<>
+            struct Int<> : public CT::Vector<>
+            {
+            };
 
-template<>
-struct Int<> : public CT::Vector<>
-{};
+            template<int x>
+            struct Int<x> : public CT::Vector<mpl::integral_c<int, x>>
+            {
+            };
 
-template<int x>
-struct Int<x> : public CT::Vector<mpl::integral_c<int, x> >
-{};
+            template<int x, int y>
+            struct Int<x, y> : public CT::Vector<mpl::integral_c<int, x>, mpl::integral_c<int, y>>
+            {
+            };
 
-template<int x, int y>
-struct Int<x, y> : public CT::Vector<mpl::integral_c<int, x>,
-                                     mpl::integral_c<int, y> >
-{};
 
+            template<int dim, int val>
+            struct make_Int;
 
+            template<int val>
+            struct make_Int<1, val>
+            {
+                using type = Int<val>;
+            };
 
-template<int dim, int val>
-struct make_Int;
+            template<int val>
+            struct make_Int<2, val>
+            {
+                using type = Int<val, val>;
+            };
 
-template<int val>
-struct make_Int<1, val>
-{
-    using type = Int<val>;
-};
-
-template<int val>
-struct make_Int<2, val>
-{
-    using type = Int<val, val>;
-};
-
-template<int val>
-struct make_Int<3, val>
-{
-    using type = Int<val, val, val>;
-};
+            template<int val>
+            struct make_Int<3, val>
+            {
+                using type = Int<val, val, val>;
+            };
 
-} // CT
-} // math
-} // pmacc
+        } // namespace CT
+    } // namespace math
+} // namespace pmacc
diff --git a/include/pmacc/math/vector/compile-time/Size_t.hpp b/include/pmacc/math/vector/compile-time/Size_t.hpp
index 307ee62e48..bb7d844fcd 100644
--- a/include/pmacc/math/vector/compile-time/Size_t.hpp
+++ b/include/pmacc/math/vector/compile-time/Size_t.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera
+/* Copyright 2013-2021 Heiko Burau, Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -28,42 +28,44 @@
 
 namespace pmacc
 {
-namespace math
-{
-namespace CT
-{
-
-/** Compile time size_t vector
- *
- *
- * @tparam x value for x allowed range [0;max size_t value -1]
- * @tparam y value for y allowed range [0;max size_t value -1]
- * @tparam z value for z allowed range [0;max size_t value -1]
- *
- * default parameter is used to distinguish between values given by
- * the user and unset values.
- */
-template<size_t x = traits::limits::Max<size_t>::value,
-         size_t y = traits::limits::Max<size_t>::value,
-         size_t z = traits::limits::Max<size_t>::value>
-struct Size_t : public CT::Vector<mpl::integral_c<size_t, x>,
-                              mpl::integral_c<size_t, y>,
-                              mpl::integral_c<size_t, z> >
-{};
+    namespace math
+    {
+        namespace CT
+        {
+            /** Compile time size_t vector
+             *
+             *
+             * @tparam x value for x allowed range [0;max size_t value -1]
+             * @tparam y value for y allowed range [0;max size_t value -1]
+             * @tparam z value for z allowed range [0;max size_t value -1]
+             *
+             * The default traits::limits::Max<size_t>::value marks a component as not set by the user,
+             * which is why that value is excluded from the allowed ranges above.
+             */
+            template<
+                size_t x = traits::limits::Max<size_t>::value,
+                size_t y = traits::limits::Max<size_t>::value,
+                size_t z = traits::limits::Max<size_t>::value>
+            struct Size_t
+                : public CT::Vector<mpl::integral_c<size_t, x>, mpl::integral_c<size_t, y>, mpl::integral_c<size_t, z>>
+            {
+            };
 
-template<>
-struct Size_t<> : public CT::Vector<>
-{};
+            template<>
+            struct Size_t<> : public CT::Vector<>
+            {
+            };
 
-template<size_t x>
-struct Size_t<x> : public CT::Vector<mpl::integral_c<size_t, x> >
-{};
+            template<size_t x>
+            struct Size_t<x> : public CT::Vector<mpl::integral_c<size_t, x>>
+            {
+            };
 
-template<size_t x, size_t y>
-struct Size_t<x, y> : public CT::Vector<mpl::integral_c<size_t, x>,
-                                    mpl::integral_c<size_t, y> >
-{};
+            template<size_t x, size_t y>
+            struct Size_t<x, y> : public CT::Vector<mpl::integral_c<size_t, x>, mpl::integral_c<size_t, y>>
+            {
+            };
 
-} // CT
-} // math
-} // pmacc
+        } // namespace CT
+    } // namespace math
+} // namespace pmacc
diff --git a/include/pmacc/math/vector/compile-time/TwistComponents.hpp b/include/pmacc/math/vector/compile-time/TwistComponents.hpp
index 69a7d1b8d7..00be3d31bd 100644
--- a/include/pmacc/math/vector/compile-time/TwistComponents.hpp
+++ b/include/pmacc/math/vector/compile-time/TwistComponents.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2015-2020 Heiko Burau
+/* Copyright 2015-2021 Heiko Burau
  *
  * This file is part of PMacc.
  *
@@ -25,42 +25,41 @@
 
 namespace pmacc
 {
-namespace math
-{
-namespace CT
-{
-
-/**
- * @class TwistComponents
- * @brief Twists axes of a compile-time vector.
- * @tparam Vec compile-time vector to be twisted
- * @tparam Axes compile-time vector containing new axes
- *
- * Example:
- *
- * using Orientation_Y = pmacc::math::CT::Int<1,2,0>;
- * using TwistedBlockDim = typename pmacc::math::CT::TwistComponents<BlockDim, Orientation_Y>::type;
- */
-template<typename Vec, typename Axes, int dim=Vec::dim>
-struct TwistComponents;
+    namespace math
+    {
+        namespace CT
+        {
+            /**
+             * @class TwistComponents
+             * @brief Twists axes of a compile-time vector.
+             * @tparam Vec compile-time vector to be twisted
+             * @tparam Axes compile-time vector containing new axes
+             *
+             * Example:
+             *
+             * using Orientation_Y = pmacc::math::CT::Int<1,2,0>;
+             * using TwistedBlockDim = typename pmacc::math::CT::TwistComponents<BlockDim, Orientation_Y>::type;
+             */
+            template<typename Vec, typename Axes, int dim = Vec::dim>
+            struct TwistComponents;
 
-template<typename Vec, typename Axes>
-struct TwistComponents<Vec, Axes, DIM2>
-{
-    using type = math::CT::Vector<
-        typename Vec::template at<Axes::x::value>::type,
-        typename Vec::template at<Axes::y::value>::type>;
-};
+            template<typename Vec, typename Axes>
+            struct TwistComponents<Vec, Axes, DIM2>
+            {
+                using type = math::CT::Vector<
+                    typename Vec::template at<Axes::x::value>::type,
+                    typename Vec::template at<Axes::y::value>::type>;
+            };
 
-template<typename Vec, typename Axes>
-struct TwistComponents<Vec, Axes, DIM3>
-{
-    using type = math::CT::Vector<
-        typename Vec::template at<Axes::x::value>::type,
-        typename Vec::template at<Axes::y::value>::type,
-        typename Vec::template at<Axes::z::value>::type>;
-};
+            template<typename Vec, typename Axes>
+            struct TwistComponents<Vec, Axes, DIM3>
+            {
+                using type = math::CT::Vector<
+                    typename Vec::template at<Axes::x::value>::type,
+                    typename Vec::template at<Axes::y::value>::type,
+                    typename Vec::template at<Axes::z::value>::type>;
+            };
 
-} // namespace CT
-} // namespace math
+        } // namespace CT
+    } // namespace math
 } // namespace pmacc
diff --git a/include/pmacc/math/vector/compile-time/UInt32.hpp b/include/pmacc/math/vector/compile-time/UInt32.hpp
index 0e797aa1e5..dcd813a61f 100644
--- a/include/pmacc/math/vector/compile-time/UInt32.hpp
+++ b/include/pmacc/math/vector/compile-time/UInt32.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera
+/* Copyright 2013-2021 Heiko Burau, Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -28,44 +28,46 @@
 
 namespace pmacc
 {
-namespace math
-{
-namespace CT
-{
-
-/** Compile time uint vector
- *
- *
- * @tparam x value for x allowed range [0;max uint32_t value -1]
- * @tparam y value for y allowed range [0;max uint32_t value -1]
- * @tparam z value for z allowed range [0;max uint32_t value -1]
- *
- * default parameter is used to distinguish between values given by
- * the user and unset values.
- */
-template<uint32_t x = traits::limits::Max<uint32_t>::value,
-         uint32_t y = traits::limits::Max<uint32_t>::value,
-         uint32_t z = traits::limits::Max<uint32_t>::value>
-struct UInt32 : public CT::Vector<mpl::integral_c<uint32_t, x>,
-                                                   mpl::integral_c<uint32_t, y>,
-                                                   mpl::integral_c<uint32_t, z> >
-{};
-
-template<>
-struct UInt32<> : public CT::Vector<>
-{};
+    namespace math
+    {
+        namespace CT
+        {
+            /** Compile time uint32_t vector
+             *
+             *
+             * @tparam x value for x allowed range [0;max uint32_t value -1]
+             * @tparam y value for y allowed range [0;max uint32_t value -1]
+             * @tparam z value for z allowed range [0;max uint32_t value -1]
+             *
+             * The default traits::limits::Max<uint32_t>::value marks a component as not set by the user,
+             * which is why that value is excluded from the allowed ranges above.
+             */
+            template<
+                uint32_t x = traits::limits::Max<uint32_t>::value,
+                uint32_t y = traits::limits::Max<uint32_t>::value,
+                uint32_t z = traits::limits::Max<uint32_t>::value>
+            struct UInt32
+                : public CT::
+                      Vector<mpl::integral_c<uint32_t, x>, mpl::integral_c<uint32_t, y>, mpl::integral_c<uint32_t, z>>
+            {
+            };
 
-template<uint32_t x>
-struct UInt32<x> : public CT::Vector< mpl::integral_c<uint32_t, x> >
-{};
+            template<>
+            struct UInt32<> : public CT::Vector<>
+            {
+            };
 
-template<uint32_t x, uint32_t y>
-struct UInt32<x, y> : public CT::Vector<mpl::integral_c<uint32_t, x>,
-                                                mpl::integral_c<uint32_t, y> >
-{};
+            template<uint32_t x>
+            struct UInt32<x> : public CT::Vector<mpl::integral_c<uint32_t, x>>
+            {
+            };
 
+            template<uint32_t x, uint32_t y>
+            struct UInt32<x, y> : public CT::Vector<mpl::integral_c<uint32_t, x>, mpl::integral_c<uint32_t, y>>
+            {
+            };
 
 
-} // CT
-} // math
-} // pmacc
+        } // namespace CT
+    } // namespace math
+} // namespace pmacc
diff --git a/include/pmacc/math/vector/compile-time/UInt64.hpp b/include/pmacc/math/vector/compile-time/UInt64.hpp
index 23800ae137..193195d2f3 100644
--- a/include/pmacc/math/vector/compile-time/UInt64.hpp
+++ b/include/pmacc/math/vector/compile-time/UInt64.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera, Axel Huebl
+/* Copyright 2013-2021 Heiko Burau, Rene Widera, Axel Huebl
  *
  * This file is part of PMacc.
  *
@@ -28,44 +28,46 @@
 
 namespace pmacc
 {
-namespace math
-{
-namespace CT
-{
-
-/** Compile time uint vector
- *
- *
- * @tparam x value for x allowed range [0;max uint64_t value -1]
- * @tparam y value for y allowed range [0;max uint64_t value -1]
- * @tparam z value for z allowed range [0;max uint64_t value -1]
- *
- * default parameter is used to distinguish between values given by
- * the user and unset values.
- */
-template<uint64_t x = traits::limits::Max<uint64_t>::value,
-         uint64_t y = traits::limits::Max<uint64_t>::value,
-         uint64_t z = traits::limits::Max<uint64_t>::value>
-struct UInt64 : public CT::Vector<mpl::integral_c<uint64_t, x>,
-                                  mpl::integral_c<uint64_t, y>,
-                                  mpl::integral_c<uint64_t, z> >
-{};
-
-template<>
-struct UInt64<> : public CT::Vector<>
-{};
+    namespace math
+    {
+        namespace CT
+        {
+            /** Compile time uint64_t vector
+             *
+             *
+             * @tparam x value for x allowed range [0;max uint64_t value -1]
+             * @tparam y value for y allowed range [0;max uint64_t value -1]
+             * @tparam z value for z allowed range [0;max uint64_t value -1]
+             *
+             * The default traits::limits::Max<uint64_t>::value marks a component as not set by the user,
+             * which is why that value is excluded from the allowed ranges above.
+             */
+            template<
+                uint64_t x = traits::limits::Max<uint64_t>::value,
+                uint64_t y = traits::limits::Max<uint64_t>::value,
+                uint64_t z = traits::limits::Max<uint64_t>::value>
+            struct UInt64
+                : public CT::
+                      Vector<mpl::integral_c<uint64_t, x>, mpl::integral_c<uint64_t, y>, mpl::integral_c<uint64_t, z>>
+            {
+            };
 
-template<uint64_t x>
-struct UInt64<x> : public CT::Vector< mpl::integral_c<uint64_t, x> >
-{};
+            template<>
+            struct UInt64<> : public CT::Vector<>
+            {
+            };
 
-template<uint64_t x, uint64_t y>
-struct UInt64<x, y> : public CT::Vector<mpl::integral_c<uint64_t, x>,
-                                        mpl::integral_c<uint64_t, y> >
-{};
+            template<uint64_t x>
+            struct UInt64<x> : public CT::Vector<mpl::integral_c<uint64_t, x>>
+            {
+            };
 
+            template<uint64_t x, uint64_t y>
+            struct UInt64<x, y> : public CT::Vector<mpl::integral_c<uint64_t, x>, mpl::integral_c<uint64_t, y>>
+            {
+            };
 
 
-} // CT
-} // math
-} // pmacc
+        } // namespace CT
+    } // namespace math
+} // namespace pmacc
diff --git a/include/pmacc/math/vector/compile-time/Vector.hpp b/include/pmacc/math/vector/compile-time/Vector.hpp
index d440093187..b2e144ba5a 100644
--- a/include/pmacc/math/vector/compile-time/Vector.hpp
+++ b/include/pmacc/math/vector/compile-time/Vector.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera, Benjamin Worpitz
+/* Copyright 2013-2021 Heiko Burau, Rene Widera, Benjamin Worpitz
  *
  * This file is part of PMacc.
  *
@@ -40,455 +40,445 @@
 
 namespace pmacc
 {
-namespace math
-{
-namespace CT
-{
-
-namespace mpl = boost::mpl;
-
-namespace detail
-{
-template<int dim>
-struct VectorFromCT;
-
-template<>
-struct VectorFromCT<1>
-{
-
-    template<typename Vec, typename CTVec>
-    HDINLINE void operator()(Vec& vec, CTVec) const
-    {
-        BOOST_STATIC_ASSERT(Vec::dim == 1);
-        BOOST_STATIC_ASSERT(CTVec::dim == 1);
-        vec[0] = (typename Vec::type)CTVec::x::value;
-    }
-};
-
-template<>
-struct VectorFromCT<2>
-{
-
-    template<typename Vec, typename CTVec>
-    HDINLINE void operator()(Vec& vec, CTVec) const
-    {
-        BOOST_STATIC_ASSERT(Vec::dim == 2);
-        BOOST_STATIC_ASSERT(CTVec::dim == 2);
-        vec[0] = (typename Vec::type)CTVec::x::value;
-        vec[1] = (typename Vec::type)CTVec::y::value;
-    }
-};
-
-template<>
-struct VectorFromCT<3>
-{
-
-    template<typename Vec, typename CTVec>
-    HDINLINE void operator()(Vec& vec, CTVec) const
-    {
-        BOOST_STATIC_ASSERT(Vec::dim == 3);
-        BOOST_STATIC_ASSERT(CTVec::dim == 3);
-        vec[0] = (typename Vec::type)CTVec::x::value;
-        vec[1] = (typename Vec::type)CTVec::y::value;
-        vec[2] = (typename Vec::type)CTVec::z::value;
-    }
-};
-
-template<typename Arg0>
-struct TypeSelector
-{
-    using type = Arg0;
-};
-
-/** get integral type*/
-template<typename T, T value>
-struct TypeSelector<mpl::integral_c<T, value > >
-{
-    using type = T;
-};
-
-template<>
-struct TypeSelector<mpl::na>
-{
-    using type = mpl::int_<0>;
-};
-
-}
-
-namespace mpl = boost::mpl;
-
-template<typename Arg0 = mpl::na,
-typename Arg1 = mpl::na,
-typename Arg2 = mpl::na>
-struct Vector
-{
-    using x = Arg0;
-    using y = Arg1;
-    using z = Arg2;
-
-    using mplVector = mpl::vector<x, y, z>;
-
-    template<int element>
-    struct at
-    {
-        using type = typename mpl::at_c<mplVector, element>::type;
-    };
-
-    static constexpr int dim = mpl::size<mplVector >::type::value;
-
-    using type = typename detail::TypeSelector<x>::type;
-    using This = Vector<x, y, z>;
-    using RT_type = math::Vector<type, dim>;
-    using vector_type = This;
-
-    template<typename OtherType>
-    HDINLINE
-    operator math::Vector<OtherType, dim>() const
-    {
-        math::Vector<OtherType, dim> result;
-        math::CT::detail::VectorFromCT<dim>()(result, *this);
-        return result;
-    }
-
-    /** Create a runtime Vector
-     *
-     *  Creates the corresponding runtime vector object.
-     *
-     *  \return RT_type runtime vector with same value type
-     */
-    static HDINLINE RT_type toRT()
+    namespace math
     {
-        math::Vector<type, dim> result;
-        math::CT::detail::VectorFromCT<dim>()(result, This());
-        return result;
-    }
-};
-
-//*********************************************************
-
-//________________________OperatorBase____________________________
-
-template<typename Lhs, typename Rhs, typename T_BinaryOperator>
-struct applyOperator
-{
-    using type = typename applyOperator<
-        typename Lhs::vector_type,
-        typename Rhs::vector_type,
-        T_BinaryOperator
-    >::type;
-};
-
-template<typename T_TypeA,
-typename T_TypeB,
-typename T_BinaryOperator>
-struct applyOperator<CT::Vector<T_TypeA>, CT::Vector<T_TypeB>, T_BinaryOperator>
-{
-    using OpResult = typename mpl::apply<T_BinaryOperator, T_TypeA, T_TypeB>::type;
-    using type = CT::Vector<OpResult>;
-};
-
-template<typename T_TypeA0, typename T_TypeA1,
-typename T_TypeB0, typename T_TypeB1,
-typename T_BinaryOperator>
-struct applyOperator<CT::Vector<T_TypeA0, T_TypeA1>,
-CT::Vector<T_TypeB0, T_TypeB1>,
-T_BinaryOperator>
-{
-    using OpResult0 = typename mpl::apply<T_BinaryOperator, T_TypeA0, T_TypeB0>::type;
-    using OpResult1 = typename mpl::apply<T_BinaryOperator, T_TypeA1, T_TypeB1>::type;
-    using type = CT::Vector<OpResult0, OpResult1>;
-};
-
-template<typename T_TypeA0, typename T_TypeA1, typename T_TypeA2,
-typename T_TypeB0, typename T_TypeB1, typename T_TypeB2,
-typename T_BinaryOperator>
-struct applyOperator<CT::Vector<T_TypeA0, T_TypeA1, T_TypeA2>,
-CT::Vector<T_TypeB0, T_TypeB1, T_TypeB2>,
-T_BinaryOperator>
-{
-    using OpResult0 = typename mpl::apply<T_BinaryOperator, T_TypeA0, T_TypeB0>::type;
-    using OpResult1 = typename mpl::apply<T_BinaryOperator, T_TypeA1, T_TypeB1>::type;
-    using OpResult2 = typename mpl::apply<T_BinaryOperator, T_TypeA2, T_TypeB2>::type;
-    using type = CT::Vector<OpResult0, OpResult1, OpResult2>;
-};
-
-//________________________A D D____________________________
-
-template<typename Lhs, typename Rhs>
-struct add
-{
-    using type = typename applyOperator<
-        typename Lhs::vector_type,
-        typename Rhs::vector_type,
-        mpl::plus<mpl::_1, mpl::_2>
-    >::type;
-};
-
-//________________________M U L____________________________
-
-template<typename Lhs, typename Rhs>
-struct mul
-{
-    using type = typename applyOperator<
-        typename Lhs::vector_type,
-        typename Rhs::vector_type,
-        mpl::times<mpl::_1, mpl::_2>
-    >::type;
-};
-
-//________________________M A X____________________________
-
-/** maximum value
- *
- * @tparam Lhs input vector
- * @tparam Rhs input vector
- * @return ::type if Rhs is not given - maximum value in elements of Lhs else
- *         vector with point-wise maximum value per component
- */
-template<typename Lhs, typename Rhs = void>
-struct max
-{
-    using type = typename applyOperator<
-        typename Lhs::vector_type,
-        typename Rhs::vector_type,
-        mpl::max<mpl::_1, mpl::_2>
-    >::type;
-};
-
-
-/** get element with maximum value
- *
- * @tparam T_Vec input vector
- * @return ::type maximum value in elements of T_Vec
- */
-template<typename T_Vec>
-struct max<
-    T_Vec,
-    void
->
-{
-    using type = typename mpl::accumulate<
-        typename T_Vec::mplVector,
-        typename T_Vec::x,
-        mpl::max<
-            mpl::_1,
-            mpl::_2
-        >
-    >::type;
-};
-
-//________________________M I N____________________________
-
-
-/** minimum value
- *
- * @tparam Lhs input vector
- * @tparam Rhs input vector
- * @return ::type if Rhs is not given - minimum value in elements of Lhs else
- *         vector with point-wise minimum value per component
- */
-template<typename Lhs, typename Rhs = void>
-struct min
-{
-    using type = typename applyOperator<
-        typename Lhs::vector_type,
-        typename Rhs::vector_type,
-        mpl::min<mpl::_1, mpl::_2>
-    >::type;
-};
-
-/** get element with minimum value
- *
- * @tparam T_Vec input vector
- * @return ::type minimum value in elements of T_Vec
- */
-template<typename T_Vec>
-struct min<
-    T_Vec,
-    void
->
-{
-    using type = typename mpl::accumulate<
-        typename T_Vec::mplVector,
-        typename T_Vec::x,
-        mpl::min<
-            mpl::_1,
-            mpl::_2
-        >
-    >::type;
-};
-
-//________________________D O T____________________________
-
-template<typename Lhs, typename Rhs>
-struct dot
-{
-    using MulResult = typename mul<Lhs, Rhs>::type;
-    using type = typename mpl::accumulate<
-        typename MulResult::mplVector,
-        mpl::int_<0>,
-        mpl::plus<mpl::_1, mpl::_2>
-    >::type;
-};
-
-//________________________V O L U M E____________________________
-
-template<typename T_Vec>
-struct volume
-{
-    using type = typename mpl::accumulate<
-        typename T_Vec::mplVector,
-        mpl::int_<1>,
-        mpl::times<mpl::_1, mpl::_2>
-    >::type;
-};
-
-//________________________S H R I N K T O________________________
-
-/** shrink CT vector to given component count (dimension)
- *
- * This operation is designed to handle vectors with up to 3 components
- *
- * @tparam T_Vec vector to shrink
- * @tparam T_dim target component count
- * @treturn ::type new shrinked vector
- */
-template<typename T_Vec, uint32_t T_dim>
-struct shrinkTo;
-
-template<typename T_Vec>
-struct shrinkTo<T_Vec, DIM3>
-{
-    using Vec = T_Vec;
-    using type = CT::Vector<typename Vec::x, typename Vec::y, typename Vec::z>;
-};
-
-template<typename T_Vec>
-struct shrinkTo<T_Vec, DIM2>
-{
-    using Vec = T_Vec;
-    using type = CT::Vector<typename Vec::x, typename Vec::y, mpl::na>;
-};
-
-template<typename T_Vec>
-struct shrinkTo<T_Vec, DIM1>
-{
-    using Vec = T_Vec;
-    using type = CT::Vector<typename Vec::x, mpl::na, mpl::na>;
-};
-
-//________________________A S S I G N________________________
-
-/** Assign a type to a given component in the CT::Vector
- *
- * defines a public type as result
- *
- * @tparam T_Vec math::CT::Vector which should be changed
- * @tparam T_ComponentPos number of component to changed (type must be bmpl::integral_c<anyType,X>)
- * @tparam T_Value new value
- */
-template<typename T_Vec, typename T_ComponentPos, typename T_Value>
-struct Assign;
-
-template<typename T_Value, typename T_0, typename T_1, typename T_2, typename T_IntegralType>
-struct Assign<pmacc::math::CT::Vector<T_0, T_1, T_2>, bmpl::integral_c<T_IntegralType,0> , T_Value>
-{
-    using type = pmacc::math::CT::Vector<T_Value, T_1, T_2>;
-};
-
-template<typename T_Value, typename T_0, typename T_1, typename T_2, typename T_IntegralType>
-struct Assign<pmacc::math::CT::Vector<T_0, T_1, T_2>, bmpl::integral_c<T_IntegralType,1>, T_Value>
-{
-    using type = pmacc::math::CT::Vector<T_0, T_Value, T_2>;
-};
-
-template<typename T_Value, typename T_0, typename T_1, typename T_2, typename T_IntegralType>
-struct Assign<pmacc::math::CT::Vector<T_0, T_1, T_2>, bmpl::integral_c<T_IntegralType,2>, T_Value>
-{
-    using type = pmacc::math::CT::Vector<T_0, T_1, T_Value>;
-};
-
-/** Assign a type to a given component in the CT::Vector if position is not out of range
- *
- * if T_ComponentPos < T_Vec::dim ? T_Value is assigned to component T_ComponentPos
- * else nothing is done.
- * defines a public type as result
- *
- * @tparam T_Vec math::CT::Vector which should be changed
- * @tparam T_ComponentPos number of component to changed (type must be bmpl::integral_c<anyType,X>)
- * @tparam T_Value new value
- */
-template<typename T_Vec, typename T_ComponentPos, typename T_Value>
-struct AssignIfInRange
-{
-    using VectorDim = bmpl::integral_c<size_t,T_Vec::dim>;
-    using type = typename bmpl::if_<
-        bmpl::less<T_ComponentPos, VectorDim>,
-        typename pmacc::math::CT::Assign<T_Vec,T_ComponentPos,T_Value>::type,
-        T_Vec
-    >::type;
-};
-
-//________________________At_c____________________________
-
-/** get element from a CT::Vector
- *
- * defines a public type as result
- *
- * @tparam T_Vec input CT::Vector
- * @tparam T_idx integral index of the component
- */
-template<typename T_Vec,size_t T_idx>
-struct At_c
-{
-    using type = typename mpl::at_c<typename T_Vec::mplVector,T_idx>::type;
-};
-
-//________________________At____________________________
-
-/** get element from a CT::Vector
- *
- * defines a public type as result
- *
- * @tparam T_Vec input CT::Vector
- * @tparam T_Idx integral type index of the component (e.g. boost::mpl::int_<2>)
- */
-template<typename T_Vec, typename T_Idx>
-struct At
-{
-    using type = typename mpl::at<typename T_Vec::mplVector,T_Idx>::type;
-};
-
-//________________________make_Vector___________________
-
-/** create CT::Vector with equal elements
- *
- * defines a public type as result
- *
- * @tparam T_dim count of components
- * @tparam T_Type type which is assigned to all components
- */
-template<int T_dim, typename T_Type>
-struct make_Vector;
-
-template<typename T_Type>
-struct make_Vector<1, T_Type>
-{
-    using type = pmacc::math::CT::Vector<T_Type>;
-};
-
-template<typename T_Type>
-struct make_Vector<2, T_Type>
-{
-    using type = pmacc::math::CT::Vector<T_Type, T_Type>;
-};
-
-template<typename T_Type>
-struct make_Vector<3, T_Type>
-{
-    using type = pmacc::math::CT::Vector<T_Type, T_Type, T_Type>;
-};
-
-} // CT
-} // math
-} // pmacc
+        namespace CT
+        {
+            namespace mpl = boost::mpl;
+
+            namespace detail
+            {
+                template<int dim>
+                struct VectorFromCT;
+
+                template<>
+                struct VectorFromCT<1> // fills a 1-component runtime vector from a compile-time vector
+                {
+                    template<typename Vec, typename CTVec>
+                    HDINLINE void operator()(Vec& vec, CTVec) const // CTVec is passed by type only
+                    {
+                        BOOST_STATIC_ASSERT(Vec::dim == 1);
+                        BOOST_STATIC_ASSERT(CTVec::dim == 1);
+                        vec[0] = static_cast<typename Vec::type>(CTVec::x::value);
+                    }
+                };
+
+                template<>
+                struct VectorFromCT<2> // fills a 2-component runtime vector from a compile-time vector
+                {
+                    template<typename Vec, typename CTVec>
+                    HDINLINE void operator()(Vec& vec, CTVec) const // CTVec is passed by type only
+                    {
+                        BOOST_STATIC_ASSERT(Vec::dim == 2);
+                        BOOST_STATIC_ASSERT(CTVec::dim == 2);
+                        vec[0] = static_cast<typename Vec::type>(CTVec::x::value);
+                        vec[1] = static_cast<typename Vec::type>(CTVec::y::value);
+                    }
+                };
+
+                template<>
+                struct VectorFromCT<3> // fills a 3-component runtime vector from a compile-time vector
+                {
+                    template<typename Vec, typename CTVec>
+                    HDINLINE void operator()(Vec& vec, CTVec) const // CTVec is passed by type only
+                    {
+                        BOOST_STATIC_ASSERT(Vec::dim == 3);
+                        BOOST_STATIC_ASSERT(CTVec::dim == 3);
+                        vec[0] = static_cast<typename Vec::type>(CTVec::x::value);
+                        vec[1] = static_cast<typename Vec::type>(CTVec::y::value);
+                        vec[2] = static_cast<typename Vec::type>(CTVec::z::value);
+                    }
+                };
+
+                template<typename Arg0>
+                struct TypeSelector
+                {
+                    using type = Arg0;
+                };
+
+                /** Select the value type T of an mpl::integral_c<T, value> component */
+                template<typename T, T value>
+                struct TypeSelector<mpl::integral_c<T, value>>
+                {
+                    using type = T;
+                };
+
+                template<>
+                struct TypeSelector<mpl::na>
+                {
+                    using type = mpl::int_<0>;
+                };
+
+            } // namespace detail
+
+            namespace mpl = boost::mpl;
+
+            template<typename Arg0 = mpl::na, typename Arg1 = mpl::na, typename Arg2 = mpl::na>
+            struct Vector
+            {
+                using x = Arg0;
+                using y = Arg1;
+                using z = Arg2;
+
+                using mplVector = mpl::vector<x, y, z>;
+
+                template<int element>
+                struct at
+                {
+                    using type = typename mpl::at_c<mplVector, element>::type;
+                };
+
+                static constexpr int dim = mpl::size<mplVector>::type::value;
+
+                using type = typename detail::TypeSelector<x>::type;
+                using This = Vector<x, y, z>;
+                using RT_type = math::Vector<type, dim>;
+                using vector_type = This;
+
+                template<typename OtherType>
+                HDINLINE operator math::Vector<OtherType, dim>() const
+                {
+                    math::Vector<OtherType, dim> result;
+                    math::CT::detail::VectorFromCT<dim>()(result, *this);
+                    return result;
+                }
+
+                /** Create a runtime Vector
+                 *
+                 *  Creates the corresponding runtime vector object.
+                 *
+                 *  \return RT_type runtime vector with same value type
+                 */
+                static HDINLINE RT_type toRT()
+                {
+                    math::Vector<type, dim> result;
+                    math::CT::detail::VectorFromCT<dim>()(result, This());
+                    return result;
+                }
+            };
+
+            //*********************************************************
+
+            //________________________OperatorBase____________________________
+            /** Apply a binary MPL operator to two CT vectors; this primary template forwards to ::vector_type */
+            template<typename Lhs, typename Rhs, typename T_BinaryOperator>
+            struct applyOperator
+            {
+                using type =
+                    typename applyOperator<typename Lhs::vector_type, typename Rhs::vector_type, T_BinaryOperator>::
+                        type;
+            };
+            /** One component: ::type is a CT::Vector holding T_BinaryOperator applied to the x components */
+            template<typename T_TypeA, typename T_TypeB, typename T_BinaryOperator>
+            struct applyOperator<CT::Vector<T_TypeA>, CT::Vector<T_TypeB>, T_BinaryOperator>
+            {
+                using OpResult = typename mpl::apply<T_BinaryOperator, T_TypeA, T_TypeB>::type;
+                using type = CT::Vector<OpResult>;
+            };
+            /** Two components: T_BinaryOperator is applied separately to the x and the y components */
+            template<
+                typename T_TypeA0,
+                typename T_TypeA1,
+                typename T_TypeB0,
+                typename T_TypeB1,
+                typename T_BinaryOperator>
+            struct applyOperator<CT::Vector<T_TypeA0, T_TypeA1>, CT::Vector<T_TypeB0, T_TypeB1>, T_BinaryOperator>
+            {
+                using OpResult0 = typename mpl::apply<T_BinaryOperator, T_TypeA0, T_TypeB0>::type;
+                using OpResult1 = typename mpl::apply<T_BinaryOperator, T_TypeA1, T_TypeB1>::type;
+                using type = CT::Vector<OpResult0, OpResult1>;
+            };
+            /** Three components: T_BinaryOperator is applied separately to the x, y and z components */
+            template<
+                typename T_TypeA0,
+                typename T_TypeA1,
+                typename T_TypeA2,
+                typename T_TypeB0,
+                typename T_TypeB1,
+                typename T_TypeB2,
+                typename T_BinaryOperator>
+            struct applyOperator<
+                CT::Vector<T_TypeA0, T_TypeA1, T_TypeA2>,
+                CT::Vector<T_TypeB0, T_TypeB1, T_TypeB2>,
+                T_BinaryOperator>
+            {
+                using OpResult0 = typename mpl::apply<T_BinaryOperator, T_TypeA0, T_TypeB0>::type;
+                using OpResult1 = typename mpl::apply<T_BinaryOperator, T_TypeA1, T_TypeB1>::type;
+                using OpResult2 = typename mpl::apply<T_BinaryOperator, T_TypeA2, T_TypeB2>::type;
+                using type = CT::Vector<OpResult0, OpResult1, OpResult2>;
+            };
+
+            //________________________A D D____________________________
+            /** Component-wise sum of two CT vectors (applyOperator with mpl::plus), result in ::type */
+            template<typename Lhs, typename Rhs>
+            struct add
+            {
+                using type = typename applyOperator<
+                    typename Lhs::vector_type,
+                    typename Rhs::vector_type,
+                    mpl::plus<mpl::_1, mpl::_2>>::type;
+            };
+
+            //________________________M U L____________________________
+            /** Component-wise product of two CT vectors (applyOperator with mpl::times), result in ::type */
+            template<typename Lhs, typename Rhs>
+            struct mul
+            {
+                using type = typename applyOperator<
+                    typename Lhs::vector_type,
+                    typename Rhs::vector_type,
+                    mpl::times<mpl::_1, mpl::_2>>::type;
+            };
+
+            //________________________M A X____________________________
+
+            /** maximum value
+             *
+             * @tparam Lhs input vector
+             * @tparam Rhs second input vector (optional, defaults to void)
+             * @return ::type vector with the component-wise maximum of Lhs and Rhs; if Rhs is
+             *         not given, the specialization max<T_Vec, void> yields the maximum element of Lhs
+             */
+            template<typename Lhs, typename Rhs = void>
+            struct max
+            {
+                using type = typename applyOperator<
+                    typename Lhs::vector_type,
+                    typename Rhs::vector_type,
+                    mpl::max<mpl::_1, mpl::_2>>::type;
+            };
+
+
+            /** get element with maximum value
+             *
+             * @tparam T_Vec input vector
+             * @return ::type maximum over all elements of T_Vec (mpl::accumulate with mpl::max, seeded with T_Vec::x)
+             */
+            template<typename T_Vec>
+            struct max<T_Vec, void>
+            {
+                using type = typename mpl::
+                    accumulate<typename T_Vec::mplVector, typename T_Vec::x, mpl::max<mpl::_1, mpl::_2>>::type;
+            };
+
+            //________________________M I N____________________________
+
+
+            /** minimum value
+             *
+             * @tparam Lhs input vector
+             * @tparam Rhs second input vector (optional, defaults to void)
+             * @return ::type vector with the component-wise minimum of Lhs and Rhs; if Rhs is
+             *         not given, the specialization min<T_Vec, void> yields the minimum element of Lhs
+             */
+            template<typename Lhs, typename Rhs = void>
+            struct min
+            {
+                using type = typename applyOperator<
+                    typename Lhs::vector_type,
+                    typename Rhs::vector_type,
+                    mpl::min<mpl::_1, mpl::_2>>::type;
+            };
+
+            /** get element with minimum value
+             *
+             * @tparam T_Vec input vector
+             * @return ::type minimum over all elements of T_Vec (mpl::accumulate with mpl::min, seeded with T_Vec::x)
+             */
+            template<typename T_Vec>
+            struct min<T_Vec, void>
+            {
+                using type = typename mpl::
+                    accumulate<typename T_Vec::mplVector, typename T_Vec::x, mpl::min<mpl::_1, mpl::_2>>::type;
+            };
+
+            //________________________D O T____________________________
+            /** Scalar product: components of mul<Lhs, Rhs> summed with mpl::plus, starting from mpl::int_<0> */
+            template<typename Lhs, typename Rhs>
+            struct dot
+            {
+                using MulResult = typename mul<Lhs, Rhs>::type;
+                using type = typename mpl::
+                    accumulate<typename MulResult::mplVector, mpl::int_<0>, mpl::plus<mpl::_1, mpl::_2>>::type;
+            };
+
+            //________________________V O L U M E____________________________
+            /** Product of all components of T_Vec (mpl::times), starting from mpl::int_<1> */
+            template<typename T_Vec>
+            struct volume
+            {
+                using type = typename mpl::
+                    accumulate<typename T_Vec::mplVector, mpl::int_<1>, mpl::times<mpl::_1, mpl::_2>>::type;
+            };
+
+            //________________________S H R I N K T O________________________
+
+            /** shrink CT vector to given component count (dimension)
+             *
+             * This operation is designed to handle vectors with up to 3 components
+             *
+             * @tparam T_Vec vector to shrink
+             * @tparam T_dim target component count (specializations exist for DIM1, DIM2 and DIM3)
+             * @return ::type new vector keeping x (DIM1), x and y (DIM2) or x, y and z (DIM3) of T_Vec
+             */
+            template<typename T_Vec, uint32_t T_dim>
+            struct shrinkTo;
+
+            template<typename T_Vec>
+            struct shrinkTo<T_Vec, DIM3>
+            {
+                using Vec = T_Vec;
+                using type = CT::Vector<typename Vec::x, typename Vec::y, typename Vec::z>;
+            };
+
+            template<typename T_Vec>
+            struct shrinkTo<T_Vec, DIM2>
+            {
+                using Vec = T_Vec;
+                using type = CT::Vector<typename Vec::x, typename Vec::y, mpl::na>;
+            };
+
+            template<typename T_Vec>
+            struct shrinkTo<T_Vec, DIM1>
+            {
+                using Vec = T_Vec;
+                using type = CT::Vector<typename Vec::x, mpl::na, mpl::na>;
+            };
+
+            //________________________A S S I G N________________________
+
+            /** Assign a type to a given component in the CT::Vector
+             *
+             * defines the public member `type` as result; specializations are provided for positions 0, 1 and 2
+             *
+             * @tparam T_Vec math::CT::Vector which should be changed
+             * @tparam T_ComponentPos index of the component to change (type must be bmpl::integral_c<anyType,X>)
+             * @tparam T_Value new value
+             */
+            template<typename T_Vec, typename T_ComponentPos, typename T_Value>
+            struct Assign;
+
+            template<typename T_Value, typename T_0, typename T_1, typename T_2, typename T_IntegralType>
+            struct Assign<pmacc::math::CT::Vector<T_0, T_1, T_2>, bmpl::integral_c<T_IntegralType, 0>, T_Value>
+            {
+                using type = pmacc::math::CT::Vector<T_Value, T_1, T_2>;
+            };
+
+            template<typename T_Value, typename T_0, typename T_1, typename T_2, typename T_IntegralType>
+            struct Assign<pmacc::math::CT::Vector<T_0, T_1, T_2>, bmpl::integral_c<T_IntegralType, 1>, T_Value>
+            {
+                using type = pmacc::math::CT::Vector<T_0, T_Value, T_2>;
+            };
+
+            template<typename T_Value, typename T_0, typename T_1, typename T_2, typename T_IntegralType>
+            struct Assign<pmacc::math::CT::Vector<T_0, T_1, T_2>, bmpl::integral_c<T_IntegralType, 2>, T_Value>
+            {
+                using type = pmacc::math::CT::Vector<T_0, T_1, T_Value>;
+            };
+
+            /** Assign a type to a given component in the CT::Vector if position is not out of range
+             *
+             * if T_ComponentPos < T_Vec::dim, T_Value is assigned to component T_ComponentPos, else T_Vec is
+             * the result. NOTE(review): Assign<...>::type is named in both cases, so T_ComponentPos presumably
+             * has to be 0, 1 or 2 to compile - confirm. Defines the public member `type` as result.
+             *
+             * @tparam T_Vec math::CT::Vector which should be changed
+             * @tparam T_ComponentPos index of the component to change (type must be bmpl::integral_c<anyType,X>)
+             * @tparam T_Value new value
+             */
+            template<typename T_Vec, typename T_ComponentPos, typename T_Value>
+            struct AssignIfInRange
+            {
+                using VectorDim = bmpl::integral_c<size_t, T_Vec::dim>;
+                using type = typename bmpl::if_<
+                    bmpl::less<T_ComponentPos, VectorDim>,
+                    typename pmacc::math::CT::Assign<T_Vec, T_ComponentPos, T_Value>::type,
+                    T_Vec>::type;
+            };
+
+            //________________________At_c____________________________
+
+            /** get element from a CT::Vector by integral index
+             *
+             * defines the public member `type` as result (mpl::at_c lookup in T_Vec::mplVector)
+             *
+             * @tparam T_Vec input CT::Vector
+             * @tparam T_idx integral index of the component
+             */
+            template<typename T_Vec, size_t T_idx>
+            struct At_c
+            {
+                using type = typename mpl::at_c<typename T_Vec::mplVector, T_idx>::type;
+            };
+
+            //________________________At____________________________
+
+            /** get element from a CT::Vector by integral type index
+             *
+             * defines the public member `type` as result (mpl::at lookup in T_Vec::mplVector)
+             *
+             * @tparam T_Vec input CT::Vector
+             * @tparam T_Idx integral type index of the component (e.g. boost::mpl::int_<2>)
+             */
+            template<typename T_Vec, typename T_Idx>
+            struct At
+            {
+                using type = typename mpl::at<typename T_Vec::mplVector, T_Idx>::type;
+            };
+
+            //________________________make_Vector___________________
+
+            /** create CT::Vector with equal elements
+             *
+             * defines the public member `type` as result
+             *
+             * @tparam T_dim count of components (specializations exist for 1, 2 and 3)
+             * @tparam T_Type type which is assigned to all components
+             */
+            template<int T_dim, typename T_Type>
+            struct make_Vector;
+
+            template<typename T_Type>
+            struct make_Vector<1, T_Type>
+            {
+                using type = pmacc::math::CT::Vector<T_Type>;
+            };
+
+            template<typename T_Type>
+            struct make_Vector<2, T_Type>
+            {
+                using type = pmacc::math::CT::Vector<T_Type, T_Type>;
+            };
+
+            template<typename T_Type>
+            struct make_Vector<3, T_Type>
+            {
+                using type = pmacc::math::CT::Vector<T_Type, T_Type, T_Type>;
+            };
+
+            //________________________make_BasisVector___________________
+
+            /** Create CT::Vector that is the unit basis vector along the given direction
+             *
+             * Defines the public member `type` as result.
+             * If T_direction < T_dim, the result has value 1 in component T_direction and 0 elsewhere,
+             * otherwise the zero vector type is the result. NOTE(review): this goes through AssignIfInRange,
+             * which presumably only compiles for T_direction <= 2 - confirm.
+             *
+             * @tparam T_dim count of components
+             * @tparam T_direction index of the basis vector direction
+             * @tparam T_ValueType value type of the vector
+             */
+            template<uint32_t T_dim, uint32_t T_direction, typename T_ValueType = int>
+            struct make_BasisVector
+            {
+                using Zeroes = typename make_Vector<T_dim, bmpl::integral_c<T_ValueType, 0>>::type;
+                using type = typename AssignIfInRange<
+                    Zeroes,
+                    bmpl::integral_c<size_t, T_direction>,
+                    bmpl::integral_c<T_ValueType, 1>>::type;
+            };
+
+        } // namespace CT
+    } // namespace math
+} // namespace pmacc
diff --git a/include/pmacc/math/vector/math_functor/abs.hpp b/include/pmacc/math/vector/math_functor/abs.hpp
deleted file mode 100644
index 2c656d0bf8..0000000000
--- a/include/pmacc/math/vector/math_functor/abs.hpp
+++ /dev/null
@@ -1,57 +0,0 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera, Richard Pausch
- *
- * This file is part of PMacc.
- *
- * PMacc is free software: you can redistribute it and/or modify
- * it under the terms of either the GNU General Public License or
- * the GNU Lesser General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * PMacc is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License and the GNU Lesser General Public License
- * for more details.
- *
- * You should have received a copy of the GNU General Public License
- * and the GNU Lesser General Public License along with PMacc.
- * If not, see <http://www.gnu.org/licenses/>.
- */
-
-#pragma once
-
-#include "pmacc/types.hpp"
-#include "pmacc/algorithms/math/defines/abs.hpp"
-
-namespace pmacc
-{
-namespace math
-{
-namespace math_functor
-{
-
-struct Abs
-{
-    template<typename Type>
-    HDINLINE
-    Type operator()(const Type& x) const
-    {
-        return algorithms::math::abs(x);
-    }
-};
-
-} // math_vector
-} // math
-
-namespace result_of
-{
-
-template<typename Type>
-struct Functor<pmacc::math::math_functor::Abs, Type>
-{
-    using type = Type;
-};
-
-} // result_of
-} // pmacc
diff --git a/include/pmacc/math/vector/math_functor/cosf.hpp b/include/pmacc/math/vector/math_functor/cosf.hpp
deleted file mode 100644
index 87a75f231f..0000000000
--- a/include/pmacc/math/vector/math_functor/cosf.hpp
+++ /dev/null
@@ -1,47 +0,0 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera, Richard Pausch
- *
- * This file is part of PMacc.
- *
- * PMacc is free software: you can redistribute it and/or modify
- * it under the terms of either the GNU General Public License or
- * the GNU Lesser General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * PMacc is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License and the GNU Lesser General Public License
- * for more details.
- *
- * You should have received a copy of the GNU General Public License
- * and the GNU Lesser General Public License along with PMacc.
- * If not, see <http://www.gnu.org/licenses/>.
- */
-
-#pragma once
-
-#include "pmacc/types.hpp"
-#include "pmacc/algorithms/math/defines/trigo.hpp"
-
-namespace pmacc
-{
-namespace math
-{
-namespace math_functor
-{
-
-struct Cosf
-{
-    using result_type = float;
-
-    DINLINE result_type operator()(const result_type& value) const
-    {
-        return algorithms::math::cos(value);
-    }
-};
-
-} // math_functor
-} // math
-} // pmacc
-
diff --git a/include/pmacc/math/vector/math_functor/max.hpp b/include/pmacc/math/vector/math_functor/max.hpp
deleted file mode 100644
index db58d67813..0000000000
--- a/include/pmacc/math/vector/math_functor/max.hpp
+++ /dev/null
@@ -1,57 +0,0 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera
- *
- * This file is part of PMacc.
- *
- * PMacc is free software: you can redistribute it and/or modify
- * it under the terms of either the GNU General Public License or
- * the GNU Lesser General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * PMacc is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License and the GNU Lesser General Public License
- * for more details.
- *
- * You should have received a copy of the GNU General Public License
- * and the GNU Lesser General Public License along with PMacc.
- * If not, see <http://www.gnu.org/licenses/>.
- */
-
-#pragma once
-
-#include "pmacc/types.hpp"
-#include "pmacc/algorithms/math/defines/comparison.hpp"
-
-namespace pmacc
-{
-namespace math
-{
-namespace math_functor
-{
-
-struct Max
-{
-    template<typename Type>
-    HDINLINE
-    Type operator()(const Type& a, const Type& b) const
-    {
-        return algorithms::math::max(a,b);
-    }
-};
-
-} // math_vector
-} // math
-
-namespace result_of
-{
-
-template<typename Type>
-struct Functor<math::math_functor::Max, Type, Type>
-{
-    using type = Type;
-};
-
-} // result_of
-} // pmacc
diff --git a/include/pmacc/math/vector/math_functor/min.hpp b/include/pmacc/math/vector/math_functor/min.hpp
deleted file mode 100644
index da593802d1..0000000000
--- a/include/pmacc/math/vector/math_functor/min.hpp
+++ /dev/null
@@ -1,60 +0,0 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera, Richard Pausch
- *
- * This file is part of PMacc.
- *
- * PMacc is free software: you can redistribute it and/or modify
- * it under the terms of either the GNU General Public License or
- * the GNU Lesser General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * PMacc is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License and the GNU Lesser General Public License
- * for more details.
- *
- * You should have received a copy of the GNU General Public License
- * and the GNU Lesser General Public License along with PMacc.
- * If not, see <http://www.gnu.org/licenses/>.
- */
-
-
-#pragma once
-
-#include "pmacc/types.hpp"
-#include "pmacc/algorithms/math/defines/comparison.hpp"
-
-namespace pmacc
-{
-namespace math
-{
-namespace math_functor
-{
-
-struct Min
-{
-    template<typename Type>
-    HDINLINE
-    Type operator()(const Type& a, const Type& b) const
-    {
-        return algorithms::math::min(a,b);
-    }
-};
-
-} // math_functor
-} // math
-
-namespace result_of
-{
-
-template<typename Type>
-struct Functor<math::math_functor::Min, Type, Type>
-{
-    using type = Type;
-};
-
-}
-
-} // pmacc
-
diff --git a/include/pmacc/math/vector/math_functor/sin.hpp b/include/pmacc/math/vector/math_functor/sin.hpp
deleted file mode 100644
index 1e34b499d7..0000000000
--- a/include/pmacc/math/vector/math_functor/sin.hpp
+++ /dev/null
@@ -1,48 +0,0 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera, Richard Pausch
- *
- * This file is part of PMacc.
- *
- * PMacc is free software: you can redistribute it and/or modify
- * it under the terms of either the GNU General Public License or
- * the GNU Lesser General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * PMacc is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License and the GNU Lesser General Public License
- * for more details.
- *
- * You should have received a copy of the GNU General Public License
- * and the GNU Lesser General Public License along with PMacc.
- * If not, see <http://www.gnu.org/licenses/>.
- */
-
-#pragma once
-
-#include "pmacc/types.hpp"
-#include "pmacc/algorithms/math/defines/trigo.hpp"
-
-namespace pmacc
-{
-namespace math
-{
-namespace math_functor
-{
-
-template<typename T_Type>
-struct Sin
-{
-    using result_type = T_Type;
-
-    DINLINE result_type operator()(const result_type& value) const
-    {
-        return algorithms::math::sin(value);
-    }
-};
-
-} // math_functor
-} // math
-} // pmacc
-
diff --git a/include/pmacc/math/vector/math_functor/sqrtf.hpp b/include/pmacc/math/vector/math_functor/sqrtf.hpp
deleted file mode 100644
index 8810c40d3c..0000000000
--- a/include/pmacc/math/vector/math_functor/sqrtf.hpp
+++ /dev/null
@@ -1,47 +0,0 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera, Richard Pausch
- *
- * This file is part of PMacc.
- *
- * PMacc is free software: you can redistribute it and/or modify
- * it under the terms of either the GNU General Public License or
- * the GNU Lesser General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * PMacc is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License and the GNU Lesser General Public License
- * for more details.
- *
- * You should have received a copy of the GNU General Public License
- * and the GNU Lesser General Public License along with PMacc.
- * If not, see <http://www.gnu.org/licenses/>.
- */
-
-#pragma once
-
-#include "pmacc/types.hpp"
-#include "pmacc/algorithms/math/defines/sqrt.hpp"
-
-namespace pmacc
-{
-namespace math
-{
-namespace math_functor
-{
-
-struct Sqrtf
-{
-    using result_type = float;
-
-    HDINLINE result_type operator()(const result_type& value) const
-    {
-        return algorithms::math::sqrt(value);
-    }
-};
-
-} // math_functor
-} // math
-} // PMacc
-
diff --git a/include/pmacc/math/vector/navigator/PermutedNavigator.hpp b/include/pmacc/math/vector/navigator/PermutedNavigator.hpp
index e9547ff521..b993e1b565 100644
--- a/include/pmacc/math/vector/navigator/PermutedNavigator.hpp
+++ b/include/pmacc/math/vector/navigator/PermutedNavigator.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera, Benjamin Worpitz
+/* Copyright 2013-2021 Heiko Burau, Rene Widera, Benjamin Worpitz
  *
  * This file is part of PMacc.
  *
@@ -25,17 +25,16 @@
 
 namespace pmacc
 {
-namespace math
-{
-
-template<typename Permutation>
-struct PermutedNavigator
-{
-    HDINLINE int operator()(int component) const
+    namespace math
     {
-        return Permutation().toRT()[component];
-    }
-};
+        template<typename Permutation>
+        struct PermutedNavigator
+        {
+            HDINLINE int operator()(int component) const
+            {
+                return Permutation().toRT()[component];
+            }
+        };
 
-} // math
-} // PMacc
+    } // namespace math
+} // namespace pmacc
diff --git a/include/pmacc/math/vector/navigator/StackedNavigator.hpp b/include/pmacc/math/vector/navigator/StackedNavigator.hpp
index 03e2dfc64b..fa99da708b 100644
--- a/include/pmacc/math/vector/navigator/StackedNavigator.hpp
+++ b/include/pmacc/math/vector/navigator/StackedNavigator.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2014-2020 Heiko Burau, Rene Widera, Benjamin Worpitz
+/* Copyright 2014-2021 Heiko Burau, Rene Widera, Benjamin Worpitz
  *
  * This file is part of PMacc.
  *
@@ -25,22 +25,21 @@
 
 namespace pmacc
 {
-namespace math
-{
-
-/* Sticks two navigators together resulting in a new navigator.
- *
- * \tparam NaviA first navigator to be called
- * \tparam NaviB second navigator to be called
- */
-template<typename NaviA, typename NaviB>
-struct StackedNavigator
-{
-    HDINLINE int operator()(int component) const
+    namespace math
     {
-        return NaviB()(NaviA()(component));
-    }
-};
+        /* Sticks two navigators together resulting in a new navigator.
+         *
+         * \tparam NaviA first navigator to be called
+         * \tparam NaviB second navigator to be called
+         */
+        template<typename NaviA, typename NaviB>
+        struct StackedNavigator
+        {
+            HDINLINE int operator()(int component) const
+            {
+                return NaviB()(NaviA()(component));
+            }
+        };
 
-} // math
-} // PMacc
+    } // namespace math
+} // namespace pmacc
diff --git a/include/pmacc/math/vector/navigator/StandardNavigator.hpp b/include/pmacc/math/vector/navigator/StandardNavigator.hpp
index 9650ff6d9b..f8ce1e337d 100644
--- a/include/pmacc/math/vector/navigator/StandardNavigator.hpp
+++ b/include/pmacc/math/vector/navigator/StandardNavigator.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera, Benjamin Worpitz
+/* Copyright 2013-2021 Heiko Burau, Rene Widera, Benjamin Worpitz
  *
  * This file is part of PMacc.
  *
@@ -25,17 +25,16 @@
 
 namespace pmacc
 {
-namespace math
-{
-
-/** \todo rename this class to NavigatorIdentity*/
-struct StandardNavigator
-{
-    HDINLINE int operator()(int component) const
+    namespace math
     {
-        return component;
-    }
-};
+        /** \todo rename this class to NavigatorIdentity*/
+        struct StandardNavigator
+        {
+            HDINLINE int operator()(int component) const
+            {
+                return component;
+            }
+        };
 
-} // math
-} // PMacc
+    } // namespace math
+} // namespace pmacc
diff --git a/include/pmacc/memory/Align.hpp b/include/pmacc/memory/Align.hpp
index c12089f026..43949229d7 100644
--- a/include/pmacc/memory/Align.hpp
+++ b/include/pmacc/memory/Align.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Felix Schmitt, Heiko Burau, Rene Widera,
+/* Copyright 2013-2021 Felix Schmitt, Heiko Burau, Rene Widera,
  *                     Wolfgang Hoenig, Benjamin Worpitz,
  *                     Alexander Grund
  *
@@ -26,17 +26,15 @@
 #include "pmacc/ppFunctions.hpp"
 
 /** calculate and set the optimal alignment for data
-  *
-  * you must align all arrays and structs that are used on the device
-  * @param byte size of data in bytes
-  */
-#define __optimal_align__(byte)                                                \
-    alignas(                                                                 \
-        /** \bug avoid bug if alignment is >16 byte                            \
-         * https://github.com/ComputationalRadiationPhysics/picongpu/issues/1563 \
-         */                                                                    \
-        PMACC_MIN(PMACC_ROUND_UP_NEXT_POW2(byte),16)                           \
-    )
+ *
+ * you must align all arrays and structs that are used on the device
+ * @param byte size of data in bytes
+ */
+#define __optimal_align__(byte)                                                                                       \
+    alignas(/** \bug avoid bug if alignment is >16 byte                                                               \
+             * https://github.com/ComputationalRadiationPhysics/picongpu/issues/1563                                  \
+             */                                                                                                       \
+            PMACC_MIN(PMACC_ROUND_UP_NEXT_POW2(byte), 16))
 
-#define PMACC_ALIGN( var, ... ) __optimal_align__( sizeof( __VA_ARGS__ ) ) __VA_ARGS__ var
-#define PMACC_ALIGN8( var, ... ) alignas( 8 ) __VA_ARGS__ var
+#define PMACC_ALIGN(var, ...) __optimal_align__(sizeof(__VA_ARGS__)) __VA_ARGS__ var
+#define PMACC_ALIGN8(var, ...) alignas(8) __VA_ARGS__ var
diff --git a/include/pmacc/memory/Array.hpp b/include/pmacc/memory/Array.hpp
index a7df616ffc..55d3a67818 100644
--- a/include/pmacc/memory/Array.hpp
+++ b/include/pmacc/memory/Array.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2016-2020 Rene Widera
+/* Copyright 2016-2021 Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -26,108 +26,101 @@
 
 namespace pmacc
 {
-namespace memory
-{
-    /** static sized array
-     *
-     * mimic the most parts of the `std::array`
-     */
-    template<
-        typename T_Type,
-        size_t T_size
-    >
-    struct Array
+    namespace memory
     {
-        using value_type = T_Type;
-        using size_type = size_t;
-        using reference = value_type &;
-        using const_reference = value_type const &;
-        using pointer = value_type *;
-        using const_pointer = value_type const *;
-
-        /** get number of elements */
-        HDINLINE
-        constexpr size_type size( ) const
-        {
-            return T_size;
-        }
-
-        /** get maximum number of elements */
-        HDINLINE
-        constexpr size_type max_size( ) const
-        {
-            return T_size;
-        }
-
-        /** get the direct access to the internal data
+        /** static sized array
          *
-         * @{
+         * mimics most parts of `std::array`
          */
-        HDINLINE
-        pointer data( )
+        template<typename T_Type, size_t T_size>
+        struct Array
         {
-            return reinterpret_cast< pointer >( m_data );
-        }
+            using value_type = T_Type;
+            using size_type = size_t;
+            using reference = value_type&;
+            using const_reference = value_type const&;
+            using pointer = value_type*;
+            using const_pointer = value_type const*;
 
-        HDINLINE
-        const_pointer data( ) const
-        {
-            return reinterpret_cast< const_pointer >( m_data );
-        }
-        /** @} */
+            /** get number of elements */
+            HDINLINE
+            constexpr size_type size() const
+            {
+                return T_size;
+            }
 
-        /** default constructor
-         *
-         * all members are uninitialized
-         */
-        Array() = default;
+            /** get maximum number of elements */
+            HDINLINE
+            constexpr size_type max_size() const
+            {
+                return T_size;
+            }
 
-        /** constructor
-         *
-         * initialize each member with the given value
-         *
-         * @param value element assigned to each member
-         */
-        HDINLINE Array( T_Type const & value )
-        {
-            for( size_type i = 0; i < size(); ++i )
-                reinterpret_cast< T_Type* >( m_data )[ i ] = value;
-        }
+            /** get the direct access to the internal data
+             *
+             * @{
+             */
+            HDINLINE
+            pointer data()
+            {
+                return reinterpret_cast<pointer>(m_data);
+            }
 
-        /** get N-th value
-         *
-         * @tparam T_Idx any type which can be implicit casted to an integral type
-         * @param idx index within the array
-         *
-         * @{
-         */
-        template< typename T_Idx >
-        HDINLINE
-        const_reference
-        operator[]( T_Idx const idx ) const
-        {
-            return reinterpret_cast< T_Type const * >( m_data )[ idx ];
-        }
+            HDINLINE
+            const_pointer data() const
+            {
+                return reinterpret_cast<const_pointer>(m_data);
+            }
+            /** @} */
 
-        template< typename T_Idx >
-        HDINLINE
-        reference
-        operator[]( T_Idx const idx )
-        {
-            return reinterpret_cast< T_Type* >( m_data )[ idx ];
-        }
-        /** @} */
+            /** default constructor
+             *
+             * all members are uninitialized
+             */
+            Array() = default;
 
-    private:
-        /** data storage
-         *
-         * std::array is a so-called "aggregate" which does not default-initialize
-         * its members. In order to allow arbitrary types to skip implementing
-         * a default constructur, this member is not stored as
-         * `value_type m_data[ T_size ]` but as type-size aligned Byte type.
-         */
-        uint8_t m_data alignas( alignof( T_Type ) ) [ T_size * sizeof( T_Type ) ];
-    };
+            /** constructor
+             *
+             * initialize each member with the given value
+             *
+             * @param value element assigned to each member
+             */
+            HDINLINE Array(T_Type const& value)
+            {
+                for(size_type i = 0; i < size(); ++i)
+                    reinterpret_cast<T_Type*>(m_data)[i] = value;
+            }
+
+            /** get N-th value
+             *
+             * @tparam T_Idx any type which can be implicitly converted to an integral type
+             * @param idx index within the array
+             *
+             * @{
+             */
+            template<typename T_Idx>
+            HDINLINE const_reference operator[](T_Idx const idx) const
+            {
+                return reinterpret_cast<T_Type const*>(m_data)[idx];
+            }
+
+            template<typename T_Idx>
+            HDINLINE reference operator[](T_Idx const idx)
+            {
+                return reinterpret_cast<T_Type*>(m_data)[idx];
+            }
+            /** @} */
+
+        private:
+            /** data storage
+             *
+             * std::array is a so-called "aggregate" which does not default-initialize
+             * its members. In order to allow arbitrary types to skip implementing
+             * a default constructor, this member is not stored as
+             * `value_type m_data[ T_size ]` but as type-size aligned Byte type.
+             */
+            uint8_t m_data alignas(alignof(T_Type))[T_size * sizeof(T_Type)];
+        };
 
-} // namespace memory
+    } // namespace memory
 } // namespace pmacc
diff --git a/include/pmacc/memory/CtxArray.hpp b/include/pmacc/memory/CtxArray.hpp
index bcffda24a3..4733891359 100644
--- a/include/pmacc/memory/CtxArray.hpp
+++ b/include/pmacc/memory/CtxArray.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2017-2020 Rene Widera
+/* Copyright 2017-2021 Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -31,87 +31,70 @@
 
 namespace pmacc
 {
-namespace memory
-{
-    /** Static sized array for a local variable
-     *
-     * The array is designed to hold context variables in lock step
-     * programming. A context variable is just a local variable of a virtual
-     * worker. Allocating and using a context array allows to propagate
-     * virtual worker states over subsequent lock steps. A context array
-     * for a set of virtual workers is owned by their (physical) worker.
-     *
-     * The number of elements depends on the index domain size and the number
-     * of workers to process the indices.
-     */
-    template<
-        typename T_Type,
-        typename T_IdxConfig
-    >
-    struct CtxArray :
-        public Array<
-            T_Type,
-            T_IdxConfig::numCollIter * T_IdxConfig::simdSize
-        >,
-        T_IdxConfig
+    namespace memory
     {
-
-        using T_IdxConfig::domainSize;
-        using T_IdxConfig::workerSize;
-        using T_IdxConfig::simdSize;
-        using T_IdxConfig::numCollIter;
-
-        using BaseArray = Array<
-            T_Type,
-            T_IdxConfig::numCollIter * T_IdxConfig::simdSize
-        >;
-
-        /** default constructor
-         *
-         * data member are uninitialized
-         */
-        CtxArray() = default;
-
-        /** constructor
+        /** Static sized array for a local variable
          *
-         * initialize each member with the given value
+         * The array is designed to hold context variables in lock step
+         * programming. A context variable is just a local variable of a virtual
+         * worker. Allocating and using a context array allows to propagate
+         * virtual worker states over subsequent lock steps. A context array
+         * for a set of virtual workers is owned by their (physical) worker.
          *
-         * @param value element assigned to each member
+         * The number of elements depends on the index domain size and the number
+         * of workers to process the indices.
          */
-        HDINLINE explicit CtxArray( T_Type const & value ) : BaseArray( value )
+        template<typename T_Type, typename T_IdxConfig>
+        struct CtxArray
+            : public Array<T_Type, T_IdxConfig::numCollIter * T_IdxConfig::simdSize>
+            , T_IdxConfig
         {
-        }
+            using T_IdxConfig::domainSize;
+            using T_IdxConfig::numCollIter;
+            using T_IdxConfig::simdSize;
+            using T_IdxConfig::workerSize;
 
-        /** disable copy constructor
-         */
-        HDINLINE CtxArray( CtxArray const & ) = delete;
+            using BaseArray = Array<T_Type, T_IdxConfig::numCollIter * T_IdxConfig::simdSize>;
 
-        /** constructor
-         *
-         * initialize each member with the result of the given functor
-         *
-         * @tparam T_Functor type of the user functor
-         * @tparam T_Args type of user parameters
-         * @param workerIdx number of worker range: [0;workerSize)
-         * @param functor functor to initialize the member ( need to implement `::operator(size_type idx)`)
-         * @param args user defined arguments those should forwarded to the functor
-         */
-        template<
-            typename T_Functor,
-            typename ... T_Args
-        >
-        HDINLINE explicit CtxArray( uint32_t const workerIdx, T_Functor const & functor, T_Args const && ... args )
-        {
-            mappings::threads::ForEachIdx< T_IdxConfig >
-            { workerIdx }(
-                [&,this]( uint32_t const linearIdx, uint32_t const idx )
-                {
-                    (*this)[idx] = functor( linearIdx, idx, std::forward< T_Args >( args ) ... );
-                }
-            );
-        }
+            /** default constructor
+             *
+             * the data members are left uninitialized
+             */
+            CtxArray() = default;
+
+            /** Fill constructor
+             *
+             * Delegates to BaseArray(value) to initialize each member with the given value.
+             *
+             * @param value element assigned to each member
+             */
+            HDINLINE explicit CtxArray(T_Type const& value) : BaseArray(value)
+            {
+            }
+
+            /** copy construction is disabled
+             */
+            HDINLINE CtxArray(CtxArray const&) = delete;
 
-    };
+            /** constructor
+             *
+             * initialize each member with the result of the given functor
+             * NOTE(review): `T_Args const&&` is not a forwarding reference - verify this with non-empty `args`
+             * @tparam T_Functor type of the user functor
+             * @tparam T_Args types of the user parameters
+             * @param workerIdx index of the worker, range: [0;workerSize)
+             * @param functor functor to initialize the members, invoked as `functor(linearIdx, idx, args...)`
+             * @param args user defined arguments which are forwarded to the functor
+             */
+            template<typename T_Functor, typename... T_Args>
+            HDINLINE explicit CtxArray(uint32_t const workerIdx, T_Functor const& functor, T_Args const&&... args)
+            {
+                mappings::threads::ForEachIdx<T_IdxConfig>{workerIdx}(
+                    [&, this](uint32_t const linearIdx, uint32_t const idx) {
+                        (*this)[idx] = functor(linearIdx, idx, std::forward<T_Args>(args)...);
+                    });
+            }
+        };
 
-} // namespace memory
+    } // namespace memory
 } // namespace pmacc
diff --git a/include/pmacc/memory/Delete.hpp b/include/pmacc/memory/Delete.hpp
index 894490404a..45480c5614 100644
--- a/include/pmacc/memory/Delete.hpp
+++ b/include/pmacc/memory/Delete.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Felix Schmitt, Heiko Burau, Rene Widera,
+/* Copyright 2013-2021 Felix Schmitt, Heiko Burau, Rene Widera,
  *                     Wolfgang Hoenig, Benjamin Worpitz,
  *                     Alexander Grund
  *
@@ -24,5 +24,15 @@
 #pragma once
 
 
-#define __delete( var ) if( ( var ) ) { delete( var ); ( var ) = nullptr; }
-#define __deleteArray( var ) if( ( var ) ) { delete[ ]( var ); ( var ) = nullptr; }
+#define __delete(var)                                                                                                 \
+    if((var))                                                                                                         \
+    {                                                                                                                 \
+        delete(var);                                                                                                  \
+        (var) = nullptr;                                                                                              \
+    }
+#define __deleteArray(var)                                                                                            \
+    if((var))                                                                                                         \
+    {                                                                                                                 \
+        delete[](var);                                                                                                \
+        (var) = nullptr;                                                                                              \
+    }
diff --git a/include/pmacc/memory/IndexPool.hpp b/include/pmacc/memory/IndexPool.hpp
index fbaf2b4a99..bf49606a06 100644
--- a/include/pmacc/memory/IndexPool.hpp
+++ b/include/pmacc/memory/IndexPool.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2017-2020 Heiko Burau
+/* Copyright 2017-2021 Heiko Burau
  *
  * This file is part of PMacc.
  *
@@ -29,159 +29,142 @@
 
 namespace pmacc
 {
-namespace memory
-{
-
-    /** A memory pool of dynamic size containing indices.
-     *
-     * At initial state the pool consists of consecutive indices according to
-     * the `size` parameter. A new index is created by calling `get()`.
-     * If the user releases an index, by calling
-     * `release()`, it will be recycled at the next `get()` call.
-     * Therefore the initial ordering is not preserved.
-     * This pool provides `begin()` and `end()` methods. The iteration is done
-     * reversely, allowing for additions and removal of the current element while
-     * iterating.
-     *
-     * Scalings:
-     *  `<constructor>` ~ O(N)
-     *  `get()`         ~ O(1)
-     *  `release()`     ~ O(N)
-     *  `<iterating>`   ~ O(N) ~ std::array
-     *
-     * @warning: This class is not thread-safe!
-     *
-     * @tparam T_Index type of index
-     * @tparam T_maxSize maximum number of indices
-     */
-    template<
-        typename T_Index,
-        size_t T_maxSize
-    >
-    struct IndexPool
+    namespace memory
     {
-    private:
-
-        /** Reverse-iterator of the memory pool. The pool is iterated reversely
-         * to ensure removal of the current element while iterating.
+        /** A memory pool of dynamic size containing indices.
+         *
+         * At initial state the pool consists of consecutive indices according to
+         * the `size` parameter. A new index is created by calling `get()`.
+         * If the user releases an index, by calling
+         * `release()`, it will be recycled at the next `get()` call.
+         * Therefore the initial ordering is not preserved.
+         * This pool provides `begin()` and `end()` methods. The iteration is done
+         * reversely, allowing for additions and removal of the current element while
+         * iterating.
+         *
+         * Scalings:
+         *  `<constructor>` ~ O(N)
+         *  `get()`         ~ O(1)
+         *  `release()`     ~ O(N)
+         *  `<iterating>`   ~ O(N) ~ std::array
+         *
+         * @warning: This class is not thread-safe!
+         *
+         * @tparam T_Index type of index
+         * @tparam T_maxSize maximum number of indices
          */
-        struct ReverseIterator
+        template<typename T_Index, size_t T_maxSize>
+        struct IndexPool
         {
-            T_Index* pointer;
+        private:
+            /** Reverse-iterator of the memory pool. The pool is iterated reversely
+             * to ensure removal of the current element while iterating.
+             */
+            struct ReverseIterator
+            {
+                T_Index* pointer;
 
-            HDINLINE
-            ReverseIterator( T_Index* const pointer ) : pointer( pointer )
-            {}
+                HDINLINE
+                ReverseIterator(T_Index* const pointer) : pointer(pointer)
+                {
+                }
 
-            HDINLINE
-            void operator++()
-            {
-                this->pointer--;
-            }
+                HDINLINE
+                void operator++()
+                {
+                    this->pointer--;
+                }
 
+                HDINLINE
+                T_Index& operator*()
+                {
+                    return *(this->pointer);
+                }
+
+                HDINLINE
+                bool operator!=(ReverseIterator const& other) const
+                {
+                    return this->pointer != other.pointer;
+                }
+            };
+
+            size_t m_size;
+            Array<T_Index, T_maxSize> listIds;
+
+        public:
+            using Index = T_Index;
+
+            PMACC_STATIC_ASSERT_MSG(std::numeric_limits<Index>::is_integer, _Index_type_must_be_an_integer_type);
+            PMACC_STATIC_ASSERT_MSG(std::numeric_limits<Index>::is_signed, _Index_type_must_be_a_signed_type);
+            PMACC_STATIC_ASSERT_MSG(T_maxSize > 0u, _maxSize_has_to_be_greater_than_zero);
+
+            /** init pool with consecutive indices
+             *
+             * @param size initial number of indices
+             */
             HDINLINE
-            T_Index& operator*()
+            IndexPool(const Index size = 0) : m_size(size)
             {
-                return *(this->pointer);
+                /* TODO: parallelize */
+                for(size_t i = 0; i < T_maxSize; i++)
+                    this->listIds[i] = static_cast<Index>(i);
             }
 
+            /** get a new index, or Index(-1) once m_size has reached T_maxSize - 1 */
             HDINLINE
-            bool operator!=( ReverseIterator const & other ) const
+            Index get()
             {
-                return this->pointer != other.pointer;
-            }
-        };
+                if(this->m_size == T_maxSize - 1u)
+                    return Index(-1);
 
-        size_t m_size;
-        Array<
-            T_Index,
-            T_maxSize
-        > listIds;
-
-    public:
-
-        using Index = T_Index;
-
-        PMACC_STATIC_ASSERT_MSG(
-            std::numeric_limits< Index >::is_integer,
-            _Index_type_must_be_an_integer_type
-        );
-        PMACC_STATIC_ASSERT_MSG(
-            std::numeric_limits< Index >::is_signed,
-            _Index_type_must_be_a_signed_type
-        );
-        PMACC_STATIC_ASSERT_MSG(
-            T_maxSize > 0u,
-            _maxSize_has_to_be_greater_than_zero
-        );
-
-        /** init pool with consecutive indices
-         *
-         * @param size initial number of indices
-         */
-        HDINLINE
-        IndexPool( const Index size = 0 ) : m_size( size )
-        {
-            /* TODO: parallelize */
-            for( size_t i = 0; i < T_maxSize; i++ )
-                this->listIds[i] = static_cast< Index >( i );
-        }
-
-        /** get a new index */
-        HDINLINE
-        Index get()
-        {
-            if( this->m_size == T_maxSize - 1u )
-                return Index(-1);
-
-            return this->listIds[this->m_size++];
-        }
+                return this->listIds[this->m_size++];
+            }
 
-        /** release an index */
-        HDINLINE
-        void release( const Index idx )
-        {
-            /* find position of `idx` */
-            size_t pos;
-            for( size_t i = 0; i < this->m_size; i++ )
+            /** release an index; idx must be among the first m_size entries, else `pos` is read uninitialized */
+            HDINLINE
+            void release(const Index idx)
             {
-                if( this->listIds[i] == idx )
+                /* find position of `idx` */
+                size_t pos;
+                for(size_t i = 0; i < this->m_size; i++)
                 {
-                    pos = i;
-                    break;
+                    if(this->listIds[i] == idx)
+                    {
+                        pos = i;
+                        break;
+                    }
                 }
-            }
 
-            this->listIds[pos] = this->listIds[--this->m_size];
-            this->listIds[this->m_size] = idx;
-        }
+                this->listIds[pos] = this->listIds[--this->m_size];
+                this->listIds[this->m_size] = idx;
+            }
 
-        /** get number of indices within pool */
-        HDINLINE
-        size_t size( ) const
-        {
-            return this->m_size;
-        }
+            /** get number of indices within pool */
+            HDINLINE
+            size_t size() const
+            {
+                return this->m_size;
+            }
 
-        /** get maximum number of indices within pool */
-        HDINLINE
-        constexpr size_t max_size( ) const
-        {
-            return T_maxSize;
-        }
+            /** get maximum number of indices within pool */
+            HDINLINE
+            constexpr size_t max_size() const
+            {
+                return T_maxSize;
+            }
 
-        HDINLINE
-        ReverseIterator begin()
-        {
-            return ReverseIterator( this->listIds.data() + this->m_size - 1u );
-        }
+            HDINLINE
+            ReverseIterator begin()
+            {
+                return ReverseIterator(this->listIds.data() + this->m_size - 1u);
+            }
 
-        HDINLINE
-        ReverseIterator end()
-        {
-            return ReverseIterator( this->listIds.data() - 1u );
-        }
-    };
+            HDINLINE
+            ReverseIterator end()
+            {
+                return ReverseIterator(this->listIds.data() - 1u);
+            }
+        };
 
-} // namespace memory
+    } // namespace memory
 } // namespace pmacc
diff --git a/include/pmacc/memory/MakeUnique.hpp b/include/pmacc/memory/MakeUnique.hpp
deleted file mode 100644
index 38d92ad6bf..0000000000
--- a/include/pmacc/memory/MakeUnique.hpp
+++ /dev/null
@@ -1,48 +0,0 @@
-/* Copyright 2019-2020 Sergei Bastrakov
- *
- * This file is part of PMacc.
- *
- * PMacc is free software: you can redistribute it and/or modify
- * it under the terms of either the GNU General Public License or
- * the GNU Lesser General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * PMacc is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License and the GNU Lesser General Public License
- * for more details.
- *
- * You should have received a copy of the GNU General Public License
- * and the GNU Lesser General Public License along with PMacc.
- * If not, see <http://www.gnu.org/licenses/>.
- */
-
-#pragma once
-
-#include <memory>
-#include <utility>
-
-
-namespace pmacc
-{
-namespace memory
-{
-
-    /*
-     * Analogue of std::make_unique for C++11, except not disabled for arrays.
-     * Implementation is taken from
-     * https://en.cppreference.com/w/cpp/memory/unique_ptr/make_unique
-     */
-    template<
-        typename T,
-        typename ... T_Args
-    >
-    inline std::unique_ptr< T > makeUnique( T_Args && ... args )
-    {
-        return std::unique_ptr< T >( new T( std::forward< T_Args >( args ) ... ) );
-    }
-
-} // namespace memory
-} // namespace pmacc
diff --git a/include/pmacc/memory/boxes/CachedBox.hpp b/include/pmacc/memory/boxes/CachedBox.hpp
index fc9833e4e2..ec164a902b 100644
--- a/include/pmacc/memory/boxes/CachedBox.hpp
+++ b/include/pmacc/memory/boxes/CachedBox.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera
+/* Copyright 2013-2021 Heiko Burau, Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -31,49 +31,49 @@ namespace pmacc
 {
     namespace intern
     {
-
-        template< typename T_ValueType, class T_BlockDescription, uint32_t T_Id>
+        template<typename T_ValueType, class T_BlockDescription, uint32_t T_Id>
         class CachedBox
         {
         public:
             typedef T_BlockDescription BlockDescription;
             typedef T_ValueType ValueType;
+
         private:
             typedef typename BlockDescription::SuperCellSize SuperCellSize;
             typedef typename BlockDescription::FullSuperCellSize FullSuperCellSize;
             typedef typename BlockDescription::OffsetOrigin OffsetOrigin;
 
         public:
-            typedef DataBox<SharedBox<ValueType, FullSuperCellSize,T_Id> > Type;
+            typedef DataBox<SharedBox<ValueType, FullSuperCellSize, T_Id>> Type;
 
-            template< typename T_Acc >
-            HDINLINE static Type create( T_Acc const & acc )
+            template<typename T_Acc>
+            HDINLINE static Type create(T_Acc const& acc)
             {
                 DataSpace<OffsetOrigin::dim> offset(OffsetOrigin::toRT());
-                Type c_box(Type::init( acc ));
+                Type c_box(Type::init(acc));
                 return c_box.shift(offset);
             }
-
         };
-    }
+    } // namespace intern
 
     struct CachedBox
     {
-
-        template<uint32_t Id_, typename ValueType_, class BlockDescription_, typename T_Acc >
-        DINLINE static typename intern::CachedBox<ValueType_, BlockDescription_, Id_ >::Type
-        create( T_Acc const & acc, const ValueType_& value, const BlockDescription_ block )
+        template<uint32_t Id_, typename ValueType_, class BlockDescription_, typename T_Acc>
+        DINLINE static typename intern::CachedBox<ValueType_, BlockDescription_, Id_>::Type create(
+            T_Acc const& acc,
+            const ValueType_& value,
+            const BlockDescription_ block)
         {
-            return intern::CachedBox<ValueType_, BlockDescription_, Id_>::create( acc );
+            return intern::CachedBox<ValueType_, BlockDescription_, Id_>::create(acc);
         }
 
-        template< uint32_t Id_, typename ValueType_, class BlockDescription_, typename T_Acc >
-        DINLINE static typename intern::CachedBox<ValueType_, BlockDescription_, Id_ >::Type
-        create( T_Acc const & acc, const BlockDescription_ block )
+        template<uint32_t Id_, typename ValueType_, class BlockDescription_, typename T_Acc>
+        DINLINE static typename intern::CachedBox<ValueType_, BlockDescription_, Id_>::Type create(
+            T_Acc const& acc,
+            const BlockDescription_ block)
         {
-            return intern::CachedBox<ValueType_, BlockDescription_, Id_>::create( acc );
+            return intern::CachedBox<ValueType_, BlockDescription_, Id_>::create(acc);
         }
-
     };
 
-}
+} // namespace pmacc
diff --git a/include/pmacc/memory/boxes/DataBox.hpp b/include/pmacc/memory/boxes/DataBox.hpp
index bd8fe3a20e..042773e5e0 100644
--- a/include/pmacc/memory/boxes/DataBox.hpp
+++ b/include/pmacc/memory/boxes/DataBox.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Felix Schmitt, Heiko Burau, Rene Widera,
+/* Copyright 2013-2021 Felix Schmitt, Heiko Burau, Rene Widera,
  *                     Wolfgang Hoenig, Benjamin Worpitz
  *
  * This file is part of PMacc.
@@ -33,10 +33,9 @@ namespace pmacc
         class Box;
 
         template<class Base>
-        class Box< DIM1, Base> : public Base
+        class Box<DIM1, Base> : public Base
         {
         public:
-
             enum
             {
                 Dim = DIM1
@@ -44,12 +43,12 @@ namespace pmacc
             typedef typename Base::ValueType ValueType;
             typedef typename Base::RefValueType RefValueType;
 
-            HDINLINE RefValueType operator()(const DataSpace<DIM1> &idx = DataSpace<DIM1>()) const
+            HDINLINE RefValueType operator()(const DataSpace<DIM1>& idx = DataSpace<DIM1>()) const
             {
                 return Base::operator[](idx.x());
             }
 
-            HDINLINE RefValueType operator()(const DataSpace<DIM1> &idx = DataSpace<DIM1>())
+            HDINLINE RefValueType operator()(const DataSpace<DIM1>& idx = DataSpace<DIM1>())
             {
                 return Base::operator[](idx.x());
             }
@@ -63,24 +62,23 @@ namespace pmacc
             }
         };
 
-        template< class Base>
-        class Box< DIM2, Base> : public Base
+        template<class Base>
+        class Box<DIM2, Base> : public Base
         {
         public:
-
             enum
             {
                 Dim = DIM2
             };
             typedef typename Base::ValueType ValueType;
-             typedef typename Base::RefValueType RefValueType;
+            typedef typename Base::RefValueType RefValueType;
 
-            HDINLINE RefValueType operator()(const DataSpace<DIM2> &idx = DataSpace<DIM2>()) const
+            HDINLINE RefValueType operator()(const DataSpace<DIM2>& idx = DataSpace<DIM2>()) const
             {
                 return (Base::operator[](idx.y()))[idx.x()];
             }
 
-            HDINLINE RefValueType operator()(const DataSpace<DIM2> &idx = DataSpace<DIM2>())
+            HDINLINE RefValueType operator()(const DataSpace<DIM2>& idx = DataSpace<DIM2>())
             {
                 return (Base::operator[](idx.y()))[idx.x()];
             }
@@ -92,14 +90,12 @@ namespace pmacc
             HDINLINE Box() : Base()
             {
             }
-
         };
 
         template<class Base>
         class Box<DIM3, Base> : public Base
         {
         public:
-
             enum
             {
                 Dim = DIM3
@@ -107,12 +103,12 @@ namespace pmacc
             typedef typename Base::ValueType ValueType;
             typedef typename Base::RefValueType RefValueType;
 
-            HDINLINE RefValueType operator()(const DataSpace<DIM3> &idx = DataSpace<DIM3>()) const
+            HDINLINE RefValueType operator()(const DataSpace<DIM3>& idx = DataSpace<DIM3>()) const
             {
                 return (Base::operator[](idx.z()))[idx.y()][idx.x()];
             }
 
-            HDINLINE RefValueType operator()(const DataSpace<DIM3> &idx = DataSpace<DIM3>())
+            HDINLINE RefValueType operator()(const DataSpace<DIM3>& idx = DataSpace<DIM3>())
             {
                 return (Base::operator[](idx.z()))[idx.y()][idx.x()];
             }
@@ -124,18 +120,15 @@ namespace pmacc
             HDINLINE Box() : Base()
             {
             }
-
         };
 
 
-
-    }
+    } // namespace private_Box
 
     template<class Base>
     class DataBox : public private_Box::Box<Base::Dim, Base>
     {
     public:
-
         typedef typename Base::ValueType ValueType;
         typedef DataBox<Base> Type;
         typedef typename Base::RefValueType RefValueType;
@@ -157,8 +150,8 @@ namespace pmacc
 
         HDINLINE DataBox<typename Base::ReducedType> reduceZ(const int zOffset) const
         {
-            return DataBox<typename Base::ReducedType > (Base::reduceZ(zOffset));
+            return DataBox<typename Base::ReducedType>(Base::reduceZ(zOffset));
         }
     };
 
-}
+} // namespace pmacc
diff --git a/include/pmacc/memory/boxes/DataBoxDim1Access.hpp b/include/pmacc/memory/boxes/DataBoxDim1Access.hpp
index 5d3ac59cb2..cbe28612c4 100644
--- a/include/pmacc/memory/boxes/DataBoxDim1Access.hpp
+++ b/include/pmacc/memory/boxes/DataBoxDim1Access.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Heiko Burau, Rene Widera
+/* Copyright 2013-2021 Axel Huebl, Heiko Burau, Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -28,54 +28,54 @@
 
 namespace pmacc
 {
-
-template<class T_Base>
-class DataBoxDim1Access : protected T_Base
-{
-public:
-
-    typedef T_Base Base;
-    static constexpr uint32_t Dim = Base::Dim;
-
-
-    typedef typename Base::ValueType ValueType;
-    typedef typename Base::RefValueType RefValueType;
-
-
-    HDINLINE RefValueType operator()(const pmacc::DataSpace<DIM1> &idx = pmacc::DataSpace<DIM1>()) const
+    template<class T_Base>
+    class DataBoxDim1Access : protected T_Base
     {
-        const pmacc::DataSpace<Dim> real_idx(DataSpaceOperations<Dim>::map(originalSize, idx.x()));
-        return Base::operator()(real_idx);
-    }
-
-    HDINLINE RefValueType operator()(const pmacc::DataSpace<DIM1> &idx = pmacc::DataSpace<DIM1>())
-    {
-        const pmacc::DataSpace<Dim> real_idx(DataSpaceOperations<Dim>::map(originalSize, idx.x()));
-        return Base::operator()(real_idx);
-    }
-
-    HDINLINE RefValueType operator[](const int idx) const
-    {
-        const pmacc::DataSpace<Dim> real_idx(DataSpaceOperations<Dim>::map(originalSize, idx));
-        return Base::operator()(real_idx);
-    }
-
-    HDINLINE RefValueType operator[](const int idx)
-    {
-        const pmacc::DataSpace<Dim> real_idx(DataSpaceOperations<Dim>::map(originalSize, idx));
-        return Base::operator()(real_idx);
-    }
-
-    HDINLINE DataBoxDim1Access(const Base base, const pmacc::DataSpace<Dim> originalSize) : Base(base), originalSize(originalSize)
-    {
-    }
-
-    HDINLINE DataBoxDim1Access(const pmacc::DataSpace<Dim> originalSize) : Base(), originalSize(originalSize)
-    {
-    }
-private:
-    PMACC_ALIGN(originalSize, const pmacc::DataSpace<Dim>);
-
-};
-
-} //namespace
+    public:
+        typedef T_Base Base;
+        static constexpr uint32_t Dim = Base::Dim;
+
+
+        typedef typename Base::ValueType ValueType;
+        typedef typename Base::RefValueType RefValueType;
+
+
+        HDINLINE RefValueType operator()(const pmacc::DataSpace<DIM1>& idx = pmacc::DataSpace<DIM1>()) const
+        {
+            const pmacc::DataSpace<Dim> real_idx(DataSpaceOperations<Dim>::map(originalSize, idx.x()));
+            return Base::operator()(real_idx);
+        }
+
+        HDINLINE RefValueType operator()(const pmacc::DataSpace<DIM1>& idx = pmacc::DataSpace<DIM1>())
+        {
+            const pmacc::DataSpace<Dim> real_idx(DataSpaceOperations<Dim>::map(originalSize, idx.x()));
+            return Base::operator()(real_idx);
+        }
+
+        HDINLINE RefValueType operator[](const int idx) const
+        {
+            const pmacc::DataSpace<Dim> real_idx(DataSpaceOperations<Dim>::map(originalSize, idx));
+            return Base::operator()(real_idx);
+        }
+
+        HDINLINE RefValueType operator[](const int idx)
+        {
+            const pmacc::DataSpace<Dim> real_idx(DataSpaceOperations<Dim>::map(originalSize, idx));
+            return Base::operator()(real_idx);
+        }
+
+        HDINLINE DataBoxDim1Access(const Base base, const pmacc::DataSpace<Dim> originalSize)
+            : Base(base)
+            , originalSize(originalSize)
+        {
+        }
+
+        HDINLINE DataBoxDim1Access(const pmacc::DataSpace<Dim> originalSize) : Base(), originalSize(originalSize)
+        {
+        }
+
+    private:
+        PMACC_ALIGN(originalSize, const pmacc::DataSpace<Dim>);
+    };
+
+} // namespace pmacc
diff --git a/include/pmacc/memory/boxes/DataBoxUnaryTransform.hpp b/include/pmacc/memory/boxes/DataBoxUnaryTransform.hpp
index b0d6394e30..1a14a20e16 100644
--- a/include/pmacc/memory/boxes/DataBoxUnaryTransform.hpp
+++ b/include/pmacc/memory/boxes/DataBoxUnaryTransform.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2014-2020 Rene Widera
+/* Copyright 2014-2021 Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -27,60 +27,57 @@
 
 namespace pmacc
 {
-
-/** DataBox which apply a unary functor on every operator () and [] access
- *
- * @tparam T_Base base class to inherit from
- * @tparam T_UnaryFunctor unary functor which is applied on every access
- *         - template parameter of functor is the input type for the functor
- *         - functor must have defined the result type as ::result
- */
-template<class T_Base, template<typename> class T_UnaryFunctor>
-class DataBoxUnaryTransform : public T_Base
-{
-public:
-
-    typedef T_Base Base;
-    typedef typename Base::ValueType BaseValueType;
-
-    typedef T_UnaryFunctor<BaseValueType> UnaryFunctor;
-
-    typedef typename UnaryFunctor::result ValueType;
-    typedef ValueType RefValueType;
-    static constexpr uint32_t Dim = Base::Dim;
-
-    HDINLINE DataBoxUnaryTransform(const Base& base) : Base(base)
-    {
-    }
-
-    HDINLINE DataBoxUnaryTransform() : Base()
+    /** DataBox which applies a unary functor on every operator() and operator[] access
+     *
+     * @tparam T_Base base class to inherit from
+     * @tparam T_UnaryFunctor unary functor which is applied on every access
+     *         - template parameter of functor is the input type for the functor
+     *         - functor must have defined the result type as ::result
+     */
+    template<class T_Base, template<typename> class T_UnaryFunctor>
+    class DataBoxUnaryTransform : public T_Base
     {
-    }
-
-    template<typename T_Index>
-    HDINLINE ValueType operator()(const T_Index &idx) const
-    {
-        return UnaryFunctor()(Base::operator()(idx));
-    }
-
-    template<typename T_Index>
-    HDINLINE ValueType operator()(const T_Index &idx)
-    {
-        return UnaryFunctor()(Base::operator()(idx));
-    }
-
-    template<typename T_Index>
-    HDINLINE ValueType operator[](const T_Index idx)
-    {
-        return UnaryFunctor()(Base::operator[](idx));
-    }
-
-    template<typename T_Index>
-    HDINLINE ValueType operator[](const T_Index idx) const
-    {
-        return UnaryFunctor()(Base::operator[](idx));
-    }
-
-};
-
-} //namespace pmacc
+    public:
+        typedef T_Base Base;
+        typedef typename Base::ValueType BaseValueType;
+
+        typedef T_UnaryFunctor<BaseValueType> UnaryFunctor;
+
+        typedef typename UnaryFunctor::result ValueType;
+        typedef ValueType RefValueType;
+        static constexpr uint32_t Dim = Base::Dim;
+
+        HDINLINE DataBoxUnaryTransform(const Base& base) : Base(base)
+        {
+        }
+
+        HDINLINE DataBoxUnaryTransform() : Base()
+        {
+        }
+
+        template<typename T_Index>
+        HDINLINE ValueType operator()(const T_Index& idx) const
+        {
+            return UnaryFunctor()(Base::operator()(idx));
+        }
+
+        template<typename T_Index>
+        HDINLINE ValueType operator()(const T_Index& idx)
+        {
+            return UnaryFunctor()(Base::operator()(idx));
+        }
+
+        template<typename T_Index>
+        HDINLINE ValueType operator[](const T_Index idx)
+        {
+            return UnaryFunctor()(Base::operator[](idx));
+        }
+
+        template<typename T_Index>
+        HDINLINE ValueType operator[](const T_Index idx) const
+        {
+            return UnaryFunctor()(Base::operator[](idx));
+        }
+    };
+
+} // namespace pmacc
diff --git a/include/pmacc/memory/boxes/MultiBox.hpp b/include/pmacc/memory/boxes/MultiBox.hpp
index 10444dbef7..7b40e945b1 100644
--- a/include/pmacc/memory/boxes/MultiBox.hpp
+++ b/include/pmacc/memory/boxes/MultiBox.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera, Benjamin Worpitz
+/* Copyright 2013-2021 Heiko Burau, Rene Widera, Benjamin Worpitz
  *
  * This file is part of PMacc.
  *
@@ -29,258 +29,265 @@
 
 namespace pmacc
 {
-
-namespace mutiBoxAccess
-{
-
-template<typename Type>
-class MutiBoxAccess
-{
-public:
-    typedef Type ValueType;
-    typedef ValueType& RefValueType;
-
-    HDINLINE MutiBoxAccess(ValueType* ptr, const size_t offset) :
-    offset(offset), ptr((char*) ptr)
-    {
-    }
-
-    HDINLINE RefValueType operator[](const uint32_t idx)
-    {
-        return *((ValueType*) (ptr + (idx * offset)));
-    }
-
-    HDINLINE RefValueType operator[](const uint32_t idx) const
-    {
-        return *((ValueType*) (ptr + (idx * offset)));
-    }
-
-private:
-    PMACC_ALIGN(offset, const size_t);
-    PMACC_ALIGN(ptr, const char*);
-};
-
-}//namespace MutiBoxAccass
-
-template<typename Type, unsigned DIM>
-class MultiBox;
-
-template<typename Type>
-class MultiBox<Type, DIM1>
-{
-private:
-    typedef DataBox<PitchedBox<Type, DIM1 > > DataBoxType;
-public:
-
-    enum
-    {
-        Dim = DIM1
+    namespace mutiBoxAccess
+    {
+        template<typename Type>
+        class MutiBoxAccess
+        {
+        public:
+            typedef Type ValueType;
+            typedef ValueType& RefValueType;
+
+            HDINLINE MutiBoxAccess(ValueType* ptr, const size_t offset) : offset(offset), ptr((char*) ptr)
+            {
+            }
+
+            HDINLINE RefValueType operator[](const uint32_t idx)
+            {
+                return *((ValueType*) (ptr + (idx * offset)));
+            }
+
+            HDINLINE RefValueType operator[](const uint32_t idx) const
+            {
+                return *((ValueType*) (ptr + (idx * offset)));
+            }
+
+        private:
+            PMACC_ALIGN(offset, const size_t);
+            PMACC_ALIGN(ptr, const char*);
+        };
+
+    } // namespace mutiBoxAccess
+
+    template<typename Type, unsigned DIM>
+    class MultiBox;
+
+    template<typename Type>
+    class MultiBox<Type, DIM1>
+    {
+    private:
+        typedef DataBox<PitchedBox<Type, DIM1>> DataBoxType;
+
+    public:
+        enum
+        {
+            Dim = DIM1
+        };
+        typedef mutiBoxAccess::MutiBoxAccess<Type> ValueType;
+        typedef mutiBoxAccess::MutiBoxAccess<Type> RefValueType;
+        typedef MultiBox<Type, DIM1> ReducedType;
+
+        HDINLINE DataBoxType getDataBox(uint32_t nameId)
+        {
+            return DataBoxType(PitchedBox<Type, DIM1>((Type*) ((char*) fixedPointer + attributePitch * nameId)));
+        }
+
+        HDINLINE RefValueType operator[](const int idx)
+        {
+            return RefValueType(fixedPointer + idx, attributePitch);
+        }
+
+        HDINLINE RefValueType operator[](const int idx) const
+        {
+            return RefValueType(fixedPointer + idx, attributePitch);
+        }
+
+        HDINLINE MultiBox(Type* pointer, const DataSpace<DIM1>& offset, const DataSpace<DIM1>&, const size_t pitch)
+            : attributePitch(pitch)
+            , fixedPointer(pointer + offset[0])
+        {
+        }
+
+        HDINLINE MultiBox(Type* pointer, const size_t attributePitch)
+            : attributePitch(attributePitch)
+            , fixedPointer(pointer)
+        {
+        }
+
+        /* Default constructor: the object must be initialized by copying a valid instance before use. */
+        HDINLINE MultiBox()
+        {
+        }
+
+        /*!return the first value in the box (list)
+         * @return first value
+         */
+        HDINLINE RefValueType operator*()
+        {
+            return RefValueType(fixedPointer, attributePitch);
+        }
+
+        HDINLINE Type const* getPointer() const
+        {
+            return fixedPointer;
+        }
+        HDINLINE Type* getPointer()
+        {
+            return fixedPointer;
+        }
+
+
+    protected:
+        PMACC_ALIGN(attributePitch, size_t);
+        PMACC_ALIGN(fixedPointer, Type*);
     };
-    typedef mutiBoxAccess::MutiBoxAccess<Type> ValueType;
-    typedef mutiBoxAccess::MutiBoxAccess<Type> RefValueType;
-    typedef MultiBox<Type, DIM1> ReducedType;
-
-    HDINLINE DataBoxType getDataBox(uint32_t nameId)
-    {
-        return DataBoxType(PitchedBox<Type, DIM1 > ((Type*) ((char*) fixedPointer + attributePitch * nameId)));
-    }
-
-    HDINLINE RefValueType operator[](const int idx)
-    {
-        return RefValueType(fixedPointer + idx, attributePitch);
-    }
 
-    HDINLINE RefValueType operator[](const int idx) const
-    {
-        return RefValueType(fixedPointer + idx, attributePitch);
-    }
-
-    HDINLINE MultiBox(Type* pointer, const DataSpace<DIM1> &offset, const DataSpace<DIM1>&, const size_t pitch) :
-    attributePitch(pitch), fixedPointer(pointer + offset[0])
-    {
-    }
-
-    HDINLINE MultiBox(Type* pointer, const size_t attributePitch) :
-    attributePitch(attributePitch), fixedPointer(pointer)
-    {
-    }
-
-    /*Object must init by copy a valid instance*/
-    HDINLINE MultiBox()
-    {
-    }
-
-    /*!return the first value in the box (list)
-     * @return first value
-     */
-    HDINLINE RefValueType operator*()
-    {
-        return RefValueType(fixedPointer, attributePitch);
-    }
-
-    HDINLINE Type const * getPointer() const
-    {
-        return fixedPointer;
-    }
-    HDINLINE Type* getPointer()
-    {
-        return fixedPointer;
-    }
-
-
-protected:
-
-    PMACC_ALIGN(attributePitch, size_t);
-    PMACC_ALIGN(fixedPointer, Type*);
-};
-
-template<typename Type>
-class MultiBox<Type, DIM2>
-{
-private:
-    typedef DataBox<PitchedBox<Type, DIM2 > > DataBoxType;
-public:
-
-    enum
-    {
-        Dim = DIM2
+    template<typename Type>
+    class MultiBox<Type, DIM2>
+    {
+    private:
+        typedef DataBox<PitchedBox<Type, DIM2>> DataBoxType;
+
+    public:
+        enum
+        {
+            Dim = DIM2
+        };
+        typedef mutiBoxAccess::MutiBoxAccess<Type> ValueType;
+        typedef mutiBoxAccess::MutiBoxAccess<Type> RefValueType;
+        typedef MultiBox<Type, DIM1> ReducedType;
+
+        HDINLINE DataBoxType getDataBox(uint32_t nameId)
+        {
+            return DataBoxType(
+                PitchedBox<Type, DIM2>((Type*) ((char*) fixedPointer + attributePitch * nameId), pitch));
+        }
+
+        HDINLINE MultiBox(
+            Type* pointer,
+            const DataSpace<DIM2>& offset,
+            const DataSpace<DIM2>& memSize,
+            const size_t pitch)
+            : pitch(pitch)
+            , attributePitch(pitch * memSize.y())
+            , fixedPointer((Type*) ((char*) pointer + offset[1] * pitch) + offset[0])
+        {
+        }
+
+        /* Default constructor: the object must be initialized by copying a valid instance before use. */
+        HDINLINE MultiBox()
+        {
+        }
+
+        HDINLINE ReducedType operator[](const int idx)
+        {
+            return ReducedType((Type*) ((char*) this->fixedPointer + idx * pitch), attributePitch);
+        }
+
+        HDINLINE ReducedType operator[](const int idx) const
+        {
+            return ReducedType((Type*) ((char*) this->fixedPointer + idx * pitch), attributePitch);
+        }
+
+        HDINLINE MultiBox(Type* pointer, size_t pitch, size_t attributePitch)
+            : pitch(pitch)
+            , attributePitch(attributePitch)
+            , fixedPointer(pointer)
+        {
+        }
+
+        /*!return the first value in the box (list)
+         * @return first value
+         */
+        HDINLINE RefValueType operator*()
+        {
+            return RefValueType(fixedPointer, attributePitch);
+        }
+
+        HDINLINE Type const* getPointer() const
+        {
+            return fixedPointer;
+        }
+        HDINLINE Type* getPointer()
+        {
+            return fixedPointer;
+        }
+
+    protected:
+        PMACC_ALIGN(pitch, size_t);
+        PMACC_ALIGN(attributePitch, size_t);
+        PMACC_ALIGN(fixedPointer, Type*);
     };
-    typedef mutiBoxAccess::MutiBoxAccess<Type> ValueType;
-    typedef mutiBoxAccess::MutiBoxAccess<Type> RefValueType;
-    typedef MultiBox<Type, DIM1> ReducedType;
 
-    HDINLINE DataBoxType getDataBox(uint32_t nameId)
-    {
-        return DataBoxType(PitchedBox<Type, DIM2 > ((Type*) ((char*) fixedPointer + attributePitch * nameId), pitch));
-    }
-
-    HDINLINE MultiBox(Type* pointer, const DataSpace<DIM2> &offset, const DataSpace<DIM2> &memSize, const size_t pitch) :
-    pitch(pitch),
-    attributePitch(pitch*memSize.y()),
-    fixedPointer((Type*) ((char*) pointer + offset[1] * pitch) + offset[0])
-    {
-    }
-
-    /*Object must init by copy a valid instance*/
-    HDINLINE MultiBox()
-    {
-    }
-
-    HDINLINE ReducedType operator[](const int idx)
-    {
-        return ReducedType((Type*) ((char*) this->fixedPointer + idx * pitch), attributePitch);
-    }
-
-    HDINLINE ReducedType operator[](const int idx) const
-    {
-        return ReducedType((Type*) ((char*) this->fixedPointer + idx * pitch), attributePitch);
-    }
-
-    HDINLINE MultiBox(Type* pointer, size_t pitch, size_t attributePitch) :
-    pitch(pitch),
-    attributePitch(attributePitch),
-    fixedPointer(pointer)
-    {
-    }
-
-    /*!return the first value in the box (list)
-     * @return first value
-     */
-    HDINLINE RefValueType operator*()
-    {
-        return RefValueType(fixedPointer, attributePitch);
-    }
-
-    HDINLINE Type const * getPointer() const
-    {
-        return fixedPointer;
-    }
-    HDINLINE Type* getPointer()
-    {
-        return fixedPointer;
-    }
-
-protected:
-
-    PMACC_ALIGN(pitch, size_t);
-    PMACC_ALIGN(attributePitch, size_t);
-    PMACC_ALIGN(fixedPointer, Type*);
-};
-
-template<typename Type>
-class MultiBox<Type, DIM3>
-{
-private:
-    typedef DataBox<PitchedBox<Type, DIM3 > > DataBoxType;
-public:
-
-    enum
-    {
-        Dim = DIM3
+    template<typename Type>
+    class MultiBox<Type, DIM3>
+    {
+    private:
+        typedef DataBox<PitchedBox<Type, DIM3>> DataBoxType;
+
+    public:
+        enum
+        {
+            Dim = DIM3
+        };
+        typedef mutiBoxAccess::MutiBoxAccess<Type> ValueType;
+        typedef mutiBoxAccess::MutiBoxAccess<Type> RefValueType;
+        typedef MultiBox<Type, DIM2> ReducedType;
+
+        HDINLINE DataBoxType getDataBox(uint32_t nameId)
+        {
+            return DataBoxType(
+                PitchedBox<Type, DIM3>((Type*) ((char*) fixedPointer + attributePitch * nameId), pitch, pitch2D));
+        }
+
+        HDINLINE ReducedType operator[](const int idx)
+        {
+            return ReducedType((Type*) ((char*) (this->fixedPointer) + idx * pitch2D), pitch, attributePitch);
+        }
+
+        HDINLINE ReducedType operator[](const int idx) const
+        {
+            return ReducedType((Type*) ((char*) (this->fixedPointer) + idx * pitch2D), pitch, attributePitch);
+        }
+
+        /** Construct a box that views pitched 3D memory starting at the given element offset.
+         *
+         * @param pointer pointer to the origin of the physical memory
+         * @param offset offset of the first viewed element (in elements)
+         * @param memSize size of the physical memory (in elements); its y and z extents define the 2D/attribute pitch
+         * @param pitch number of bytes in one line (first dimension)
+         */
+        HDINLINE MultiBox(
+            Type* pointer,
+            const DataSpace<DIM3>& offset,
+            const DataSpace<DIM3>& memSize,
+            const size_t pitch)
+            : pitch(pitch)
+            , pitch2D(memSize.y() * pitch)
+            , attributePitch((memSize.y() * pitch) * memSize.z()) // was `size.z()`: `size` is not declared here
+            , fixedPointer(
+                  (Type*) ((char*) pointer + offset[2] * (memSize.y() * pitch) + offset[1] * pitch) + offset[0])
+        {
+        }
+
+        /* Default constructor: the object must be initialized by copying a valid instance before use. */
+        HDINLINE MultiBox()
+        {
+        }
+
+        /*!return the first value in the box (list)
+         * @return first value
+         */
+        HDINLINE RefValueType operator*()
+        {
+            return RefValueType(fixedPointer, attributePitch);
+        }
+
+        HDINLINE Type const* getPointer() const
+        {
+            return fixedPointer;
+        }
+        HDINLINE Type* getPointer()
+        {
+            return fixedPointer;
+        }
+
+
+        PMACC_ALIGN(pitch, size_t);
+        PMACC_ALIGN(pitch2D, size_t);
+        PMACC_ALIGN(attributePitch, size_t);
+        PMACC_ALIGN(fixedPointer, Type*);
     };
-    typedef mutiBoxAccess::MutiBoxAccess<Type> ValueType;
-    typedef mutiBoxAccess::MutiBoxAccess<Type> RefValueType;
-    typedef MultiBox<Type, DIM2> ReducedType;
-
-    HDINLINE DataBoxType getDataBox(uint32_t nameId)
-    {
-        return DataBoxType(PitchedBox<Type, DIM3 > ((Type*) ((char*) fixedPointer + attributePitch * nameId), pitch, pitch2D));
-    }
-
-    HDINLINE ReducedType operator[](const int idx)
-    {
-        return ReducedType((Type*) ((char*) (this->fixedPointer) + idx * pitch2D), pitch, attributePitch);
-    }
-
-    HDINLINE ReducedType operator[](const int idx) const
-    {
-        return ReducedType((Type*) ((char*) (this->fixedPointer) + idx * pitch2D), pitch, attributePitch);
-    }
-
-    /** constructor
-     *
-     * @param pointer pointer to the origin of the physical memory
-     * @param offset offset (in elements)
-     * @param memSize size of the physical memory (in elements)
-     * @param pitch number of bytes in one line (first dimension)
-     */
-    HDINLINE MultiBox(Type* pointer, const DataSpace<DIM3> &offset, const DataSpace<DIM3> &memSize, const size_t pitch) :
-    pitch(pitch), pitch2D(memSize.y() * pitch), attributePitch((memSize.y() * pitch) * size.z()),
-    fixedPointer((Type*) ((char*) pointer + offset[2] * (memSize.y() * pitch) + offset[1] * pitch) + offset[0])
-    {
-    }
-
-    /*Object must init by copy a valid instance*/
-    HDINLINE MultiBox()
-    {
-    }
-
-    /*!return the first value in the box (list)
-     * @return first value
-     */
-    HDINLINE RefValueType operator*()
-    {
-        return RefValueType(fixedPointer, attributePitch);
-    }
-
-    HDINLINE Type const * getPointer() const
-    {
-        return fixedPointer;
-    }
-    HDINLINE Type* getPointer()
-    {
-        return fixedPointer;
-    }
-
-
-    PMACC_ALIGN(pitch, size_t);
-    PMACC_ALIGN(pitch2D, size_t);
-    PMACC_ALIGN(attributePitch, size_t);
-    PMACC_ALIGN(fixedPointer, Type*);
-
-};
-
-
-}
 
 
+} // namespace pmacc
diff --git a/include/pmacc/memory/boxes/PitchedBox.hpp b/include/pmacc/memory/boxes/PitchedBox.hpp
index 44e2dc0976..b61b915647 100644
--- a/include/pmacc/memory/boxes/PitchedBox.hpp
+++ b/include/pmacc/memory/boxes/PitchedBox.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera, Benjamin Worpitz
+/* Copyright 2013-2021 Heiko Burau, Rene Widera, Benjamin Worpitz
  *
  * This file is part of PMacc.
  *
@@ -28,235 +28,224 @@
 
 namespace pmacc
 {
-
-template<typename TYPE, unsigned DIM>
-class PitchedBox;
-
-template<typename TYPE>
-class PitchedBox<TYPE, DIM1>
-{
-public:
-
-    enum
-    {
-        Dim = DIM1
+    template<typename TYPE, unsigned DIM>
+    class PitchedBox;
+
+    template<typename TYPE>
+    class PitchedBox<TYPE, DIM1>
+    {
+    public:
+        enum
+        {
+            Dim = DIM1
+        };
+        typedef TYPE ValueType;
+        typedef ValueType& RefValueType;
+        typedef PitchedBox<TYPE, DIM1> ReducedType;
+
+        HDINLINE RefValueType operator[](const int idx)
+        {
+            return fixedPointer[idx];
+        }
+
+        HDINLINE RefValueType operator[](const int idx) const
+        {
+            return fixedPointer[idx];
+        }
+
+        HDINLINE PitchedBox(TYPE* pointer, const DataSpace<DIM1>& offset, const DataSpace<DIM1>&, const size_t)
+            : fixedPointer(pointer + offset[0])
+        {
+        }
+
+        HDINLINE PitchedBox(TYPE* pointer, const DataSpace<DIM1>& offset) : fixedPointer(pointer + offset[0])
+        {
+        }
+
+        HDINLINE PitchedBox(TYPE* pointer) : fixedPointer(pointer)
+        {
+        }
+
+        /* Default constructor: the object must be initialized by copying a valid instance before use. */
+        HDINLINE PitchedBox()
+        {
+        }
+
+        /*!return the first value in the box (list)
+         * @return first value
+         */
+        HDINLINE RefValueType operator*()
+        {
+            return *(fixedPointer);
+        }
+
+        HDINLINE TYPE const* getPointer() const
+        {
+            return fixedPointer;
+        }
+        HDINLINE TYPE* getPointer()
+        {
+            return fixedPointer;
+        }
+
+
+    protected:
+        PMACC_ALIGN(fixedPointer, TYPE*);
     };
-    typedef TYPE ValueType;
-    typedef ValueType& RefValueType;
-    typedef PitchedBox<TYPE, DIM1> ReducedType;
-
-    HDINLINE RefValueType operator[](const int idx)
-    {
-        return fixedPointer[idx];
-    }
-
-    HDINLINE RefValueType operator[](const int idx) const
-    {
-        return fixedPointer[idx];
-    }
-
-    HDINLINE PitchedBox(TYPE* pointer, const DataSpace<DIM1> &offset, const DataSpace<DIM1>&, const size_t) :
-    fixedPointer(pointer + offset[0])
-    {
-    }
-
-    HDINLINE PitchedBox(TYPE* pointer, const DataSpace<DIM1> &offset) :
-    fixedPointer(pointer + offset[0])
-    {
-    }
-
-    HDINLINE PitchedBox(TYPE* pointer) :
-    fixedPointer(pointer)
-    {
-    }
-
-    /*Object must init by copy a valid instance*/
-    HDINLINE PitchedBox()
-    {
-    }
-
-    /*!return the first value in the box (list)
-     * @return first value
-     */
-    HDINLINE RefValueType operator*()
-    {
-        return *(fixedPointer);
-    }
 
-    HDINLINE TYPE const * getPointer() const
-    {
-        return fixedPointer;
-    }
-    HDINLINE TYPE* getPointer()
-    {
-        return fixedPointer;
-    }
-
-
-protected:
-
-    PMACC_ALIGN(fixedPointer, TYPE*);
-};
-
-template<typename TYPE>
-class PitchedBox<TYPE, DIM2>
-{
-public:
-
-    enum
-    {
-        Dim = DIM2
+    template<typename TYPE>
+    class PitchedBox<TYPE, DIM2>
+    {
+    public:
+        enum
+        {
+            Dim = DIM2
+        };
+        typedef TYPE ValueType;
+        typedef ValueType& RefValueType;
+        typedef PitchedBox<TYPE, DIM1> ReducedType;
+
+        HDINLINE PitchedBox(TYPE* pointer, const DataSpace<DIM2>& offset, const DataSpace<DIM2>&, const size_t pitch)
+            : pitch(pitch)
+            , fixedPointer((TYPE*) ((char*) pointer + offset[1] * pitch) + offset[0])
+        {
+        }
+
+        HDINLINE PitchedBox(TYPE* pointer, size_t pitch) : pitch(pitch), fixedPointer(pointer)
+        {
+        }
+
+        /* Default constructor: the object must be initialized by copying a valid instance before use. */
+        HDINLINE PitchedBox()
+        {
+        }
+
+        HDINLINE ReducedType operator[](const int idx)
+        {
+            return ReducedType((TYPE*) ((char*) this->fixedPointer + idx * pitch));
+        }
+
+        HDINLINE ReducedType operator[](const int idx) const
+        {
+            return ReducedType((TYPE*) ((char*) this->fixedPointer + idx * pitch));
+        }
+
+        HDINLINE PitchedBox(TYPE* pointer, const DataSpace<DIM2>& offset, size_t pitch)
+            : pitch(pitch)
+            , fixedPointer((TYPE*) ((char*) pointer + offset[1] * pitch) + offset[0])
+        {
+        }
+
+        /*!return the first value in the box (list)
+         * @return first value
+         */
+        HDINLINE RefValueType operator*()
+        {
+            return *((TYPE*) fixedPointer);
+        }
+
+        HDINLINE TYPE const* getPointer() const
+        {
+            return fixedPointer;
+        }
+        HDINLINE TYPE* getPointer()
+        {
+            return fixedPointer;
+        }
+
+    protected:
+        PMACC_ALIGN(pitch, size_t);
+        PMACC_ALIGN(fixedPointer, TYPE*);
     };
-    typedef TYPE ValueType;
-    typedef ValueType& RefValueType;
-    typedef PitchedBox<TYPE, DIM1> ReducedType;
-
-    HDINLINE PitchedBox(TYPE* pointer, const DataSpace<DIM2> &offset, const DataSpace<DIM2>&, const size_t pitch) :
-    pitch(pitch),
-    fixedPointer((TYPE*) ((char*) pointer + offset[1] * pitch) + offset[0])
-    {
-    }
-
-    HDINLINE PitchedBox(TYPE* pointer, size_t pitch) :
-    pitch(pitch),
-    fixedPointer(pointer)
-    {
-    }
-
-    /*Object must init by copy a valid instance*/
-    HDINLINE PitchedBox()
-    {
-    }
-
-    HDINLINE ReducedType operator[](const int idx)
-    {
-        return ReducedType((TYPE*) ((char*) this->fixedPointer + idx * pitch));
-    }
-
-    HDINLINE ReducedType operator[](const int idx) const
-    {
-        return ReducedType((TYPE*) ((char*) this->fixedPointer + idx * pitch));
-    }
 
-    HDINLINE PitchedBox(TYPE* pointer, const DataSpace<DIM2>& offset, size_t pitch) :
-    pitch(pitch),
-    fixedPointer((TYPE*) ((char*) pointer + offset[1] * pitch) + offset[0])
-    {
-    }
-
-    /*!return the first value in the box (list)
-     * @return first value
-     */
-    HDINLINE RefValueType operator*()
-    {
-        return *((TYPE*) fixedPointer);
-    }
-
-    HDINLINE TYPE const * getPointer() const
-    {
-        return fixedPointer;
-    }
-    HDINLINE TYPE* getPointer()
-    {
-        return fixedPointer;
-    }
-
-protected:
-
-    PMACC_ALIGN(pitch, size_t);
-    PMACC_ALIGN(fixedPointer, TYPE*);
-
-};
-
-template<typename TYPE>
-class PitchedBox<TYPE, DIM3>
-{
-public:
-
-    enum
-    {
-        Dim = DIM3
+    template<typename TYPE>
+    class PitchedBox<TYPE, DIM3>
+    {
+    public:
+        enum
+        {
+            Dim = DIM3
+        };
+        typedef TYPE ValueType;
+        typedef ValueType& RefValueType;
+        typedef PitchedBox<TYPE, DIM2> ReducedType;
+
+        HDINLINE ReducedType operator[](const int idx)
+        {
+            return ReducedType((TYPE*) ((char*) (this->fixedPointer) + idx * pitch2D), pitch);
+        }
+
+        HDINLINE ReducedType operator[](const int idx) const
+        {
+            return ReducedType((TYPE*) ((char*) (this->fixedPointer) + idx * pitch2D), pitch);
+        }
+
+        /** constructor
+         *
+         * @param pointer pointer to the origin of the physical memory
+         * @param offset offset (in elements)
+         * @param memSize size of the physical memory (in elements)
+         * @param pitch number of bytes in one line (first dimension)
+         */
+        HDINLINE PitchedBox(
+            TYPE* pointer,
+            const DataSpace<DIM3>& offset,
+            const DataSpace<DIM3>& memSize,
+            const size_t pitch)
+            : pitch(pitch)
+            , pitch2D(memSize[1] * pitch)
+            , fixedPointer(
+                  (TYPE*) ((char*) pointer + offset[2] * (memSize[1] * pitch) + offset[1] * pitch) + offset[0])
+        {
+        }
+
+        HDINLINE PitchedBox(TYPE* pointer, const size_t pitch, const size_t pitch2D)
+            : pitch(pitch)
+            , pitch2D(pitch2D)
+            , fixedPointer(pointer)
+        {
+        }
+
+        /* Default constructor: the object must be initialized by copying a valid instance before use. */
+        HDINLINE PitchedBox()
+        {
+        }
+
+        /*!return the first value in the box (list)
+         * @return first value
+         */
+        HDINLINE RefValueType operator*()
+        {
+            return *(fixedPointer);
+        }
+
+        HDINLINE TYPE const* getPointer() const
+        {
+            return fixedPointer;
+        }
+        HDINLINE TYPE* getPointer()
+        {
+            return fixedPointer;
+        }
+
+        HDINLINE pmacc::cursor::BufferCursor<TYPE, DIM3> toCursor() const
+        {
+            return pmacc::cursor::BufferCursor<TYPE, DIM3>(
+                (TYPE*) fixedPointer,
+                ::pmacc::math::Size_t<2>(pitch, pitch2D));
+        }
+
+    protected:
+        HDINLINE PitchedBox<TYPE, DIM2> reduceZ(const int zOffset) const
+        {
+            return PitchedBox<TYPE, DIM2>((TYPE*) ((char*) (this->fixedPointer) + pitch2D * zOffset), pitch);
+        }
+
+
+        PMACC_ALIGN(pitch, size_t);
+        PMACC_ALIGN(pitch2D, size_t);
+        PMACC_ALIGN(fixedPointer, TYPE*);
     };
-    typedef TYPE ValueType;
-    typedef ValueType& RefValueType;
-    typedef PitchedBox<TYPE, DIM2> ReducedType;
-
-    HDINLINE ReducedType operator[](const int idx)
-    {
-        return ReducedType((TYPE*) ((char*) (this->fixedPointer) + idx * pitch2D), pitch);
-    }
-
-    HDINLINE ReducedType operator[](const int idx) const
-    {
-        return ReducedType((TYPE*) ((char*) (this->fixedPointer) + idx * pitch2D), pitch);
-    }
-
-    /** constructor
-     *
-     * @param pointer pointer to the origin of the physical memory
-     * @param offset offset (in elements)
-     * @param memSize size of the physical memory (in elements)
-     * @param pitch number of bytes in one line (first dimension)
-     */
-    HDINLINE PitchedBox(TYPE* pointer, const DataSpace<DIM3> &offset, const DataSpace<DIM3> &memSize, const size_t pitch) :
-    pitch(pitch), pitch2D(memSize[1] * pitch),
-    fixedPointer((TYPE*) ((char*) pointer + offset[2] * (memSize[1] * pitch) + offset[1] * pitch) + offset[0])
-    {
-    }
-
-    HDINLINE PitchedBox(TYPE* pointer, const size_t pitch, const size_t pitch2D) :
-    pitch(pitch), pitch2D(pitch2D),
-    fixedPointer(pointer)
-    {
-    }
-
-    /*Object must init by copy a valid instance*/
-    HDINLINE PitchedBox()
-    {
-    }
-
-    /*!return the first value in the box (list)
-     * @return first value
-     */
-    HDINLINE RefValueType operator*()
-    {
-        return *(fixedPointer);
-    }
-
-    HDINLINE TYPE const * getPointer() const
-    {
-        return fixedPointer;
-    }
-    HDINLINE TYPE* getPointer()
-    {
-        return fixedPointer;
-    }
-
-    HDINLINE pmacc::cursor::BufferCursor<TYPE, DIM3>
-    toCursor() const
-    {
-        return pmacc::cursor::BufferCursor<TYPE, DIM3>
-            ((TYPE*)fixedPointer, ::pmacc::math::Size_t<2>(pitch, pitch2D));
-    }
-
-protected:
-
-    HDINLINE PitchedBox<TYPE, DIM2> reduceZ(const int zOffset) const
-    {
-        return PitchedBox<TYPE, DIM2 > (
-                                        (TYPE*) ((char*) (this->fixedPointer) + pitch2D * zOffset),
-                                        pitch
-                                        );
-    }
-
-
-    PMACC_ALIGN(pitch, size_t);
-    PMACC_ALIGN(pitch2D, size_t);
-    PMACC_ALIGN(fixedPointer, TYPE*);
-
-};
-
-
-}
 
 
+} // namespace pmacc
diff --git a/include/pmacc/memory/boxes/SharedBox.hpp b/include/pmacc/memory/boxes/SharedBox.hpp
index db4660701b..0cddec71a8 100644
--- a/include/pmacc/memory/boxes/SharedBox.hpp
+++ b/include/pmacc/memory/boxes/SharedBox.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera, Benjamin Worpitz
+/* Copyright 2013-2021 Heiko Burau, Rene Widera, Benjamin Worpitz
  *
  * This file is part of PMacc.
  *
@@ -32,253 +32,230 @@
 
 namespace pmacc
 {
-
-/** create shared memory on gpu
- *
- * @tparam T_TYPE type of memory objects
- * @tparam T_Vector CT::Vector with size description (per dimension)
- * @tparam T_id unique id for this object
- *              (is needed if more than one instance of shared memory in one kernel is used)
- * @tparam T_dim dimension of the memory (supports DIM1,DIM2 and DIM3)
- */
-template<typename T_TYPE, class T_Vector, uint32_t T_id=0, uint32_t T_dim=T_Vector::dim>
-class SharedBox;
-
-template<typename T_TYPE, class T_Vector, uint32_t T_id>
-class SharedBox<T_TYPE, T_Vector, T_id, DIM1>
-{
-public:
-
-    enum
-    {
-        Dim = DIM1
-    };
-    typedef T_TYPE ValueType;
-    typedef ValueType& RefValueType;
-    typedef T_Vector Size;
-    typedef SharedBox<ValueType, math::CT::Int<Size::x::value>, T_id> ReducedType;
-    typedef SharedBox<ValueType, T_Vector, T_id, DIM1> This;
-
-    HDINLINE RefValueType operator[](const int idx)
-    {
-        return fixedPointer[idx];
-    }
-
-    HDINLINE RefValueType operator[](const int idx) const
-    {
-        return fixedPointer[idx];
-    }
-
-    HDINLINE SharedBox(ValueType* pointer) :
-    fixedPointer(pointer)
-    {
-    }
-
-    DINLINE SharedBox() :
-    fixedPointer(nullptr)
-    {
-    }
-
-    /*!return the first value in the box (list)
-     * @return first value
-     */
-    HDINLINE RefValueType operator*()
-    {
-        return *(fixedPointer);
-    }
-
-    HDINLINE ValueType const * getPointer() const
-    {
-        return fixedPointer;
-    }
-    HDINLINE ValueType* getPointer()
-    {
-        return fixedPointer;
-    }
-
-    /** create a shared memory box
+    /** create shared memory on gpu
      *
-     * This call synchronizes a block and must be called from all threads and
-     * not inside a if clauses
+     * @tparam T_TYPE type of memory objects
+     * @tparam T_Vector CT::Vector with size description (per dimension)
+     * @tparam T_id unique id for this object
+     *              (is needed if more than one instance of shared memory in one kernel is used)
+     * @tparam T_dim dimension of the memory (supports DIM1,DIM2 and DIM3)
      */
-    template< typename T_Acc >
-    static DINLINE SharedBox
-    init( T_Acc const & acc )
-    {
-        auto& mem_sh = pmacc::memory::shared::allocate<
-            T_id,
-            memory::Array<
-                ValueType,
-                math::CT::volume< Size >::type::value
-            >
-        >( acc );
-        return SharedBox( mem_sh.data() );
-    }
-
-protected:
-
-    PMACC_ALIGN(fixedPointer, ValueType*);
-};
+    template<typename T_TYPE, class T_Vector, uint32_t T_id = 0, uint32_t T_dim = T_Vector::dim>
+    class SharedBox;
 
-template<typename T_TYPE, class T_Vector, uint32_t T_id>
-class SharedBox<T_TYPE, T_Vector,T_id, DIM2 >
-{
-public:
-
-    enum
+    template<typename T_TYPE, class T_Vector, uint32_t T_id>
+    class SharedBox<T_TYPE, T_Vector, T_id, DIM1>
     {
-        Dim = DIM2
+    public:
+        enum
+        {
+            Dim = DIM1
+        };
+        typedef T_TYPE ValueType;
+        typedef ValueType& RefValueType;
+        typedef T_Vector Size;
+        typedef SharedBox<ValueType, math::CT::Int<Size::x::value>, T_id> ReducedType;
+        typedef SharedBox<ValueType, T_Vector, T_id, DIM1> This;
+
+        HDINLINE RefValueType operator[](const int idx)
+        {
+            return fixedPointer[idx];
+        }
+
+        HDINLINE RefValueType operator[](const int idx) const
+        {
+            return fixedPointer[idx];
+        }
+
+        HDINLINE SharedBox(ValueType* pointer) : fixedPointer(pointer)
+        {
+        }
+
+        DINLINE SharedBox() : fixedPointer(nullptr)
+        {
+        }
+
+        /*!return the first value in the box (list)
+         * @return first value
+         */
+        HDINLINE RefValueType operator*()
+        {
+            return *(fixedPointer);
+        }
+
+        HDINLINE ValueType const* getPointer() const
+        {
+            return fixedPointer;
+        }
+        HDINLINE ValueType* getPointer()
+        {
+            return fixedPointer;
+        }
+
+        /** Create a box over a shared-memory array of math::CT::volume<Size> elements (allocation id T_id).
+         *
+         * This call synchronizes a block and must be called from all threads,
+         * never from inside an if clause.
+         */
+        template<typename T_Acc>
+        static DINLINE SharedBox init(T_Acc const& acc)
+        {
+            auto& mem_sh
+                = pmacc::memory::shared::allocate<T_id, memory::Array<ValueType, math::CT::volume<Size>::type::value>>(
+                    acc);
+            return SharedBox(mem_sh.data());
+        }
+
+    protected:
+        PMACC_ALIGN(fixedPointer, ValueType*);
     };
-    typedef T_TYPE ValueType;
-    typedef ValueType& RefValueType;
-    typedef T_Vector Size;
-    typedef SharedBox<ValueType, math::CT::Int<Size::x::value>, T_id > ReducedType;
-    typedef SharedBox<ValueType, T_Vector, T_id, DIM2 > This;
-
-    HDINLINE SharedBox(ValueType* pointer = nullptr) :
-    fixedPointer(pointer)
-    {
-    }
-
-    HDINLINE ReducedType operator[](const int idx)
-    {
-        return ReducedType(this->fixedPointer + idx * Size::x::value);
-    }
 
-    HDINLINE ReducedType operator[](const int idx) const
+    template<typename T_TYPE, class T_Vector, uint32_t T_id>
+    class SharedBox<T_TYPE, T_Vector, T_id, DIM2>
     {
-        return ReducedType(this->fixedPointer + idx * Size::x::value);
-    }
-
-    /*!return the first value in the box (list)
-     * @return first value
-     */
-    HDINLINE RefValueType operator*()
-    {
-        return *((ValueType*) fixedPointer);
-    }
-
-    HDINLINE ValueType const * getPointer() const
-    {
-        return fixedPointer;
-    }
-    HDINLINE ValueType* getPointer()
-    {
-        return fixedPointer;
-    }
-
-    /** create a shared memory box
-     *
-     * This call synchronizes a block and must be called from all threads and
-     * not inside a if clauses
-     */
-    template< typename T_Acc >
-    static DINLINE SharedBox
-    init( T_Acc const & acc )
-    {
-        auto& mem_sh = pmacc::memory::shared::allocate<
-            T_id,
-            memory::Array<
-                ValueType,
-                math::CT::volume< Size >::type::value
-            >
-        >( acc );
-        return SharedBox( mem_sh.data() );
-    }
-
-    HDINLINE pmacc::cursor::CT::BufferCursor<ValueType, ::pmacc::math::CT::Int<sizeof (ValueType) * Size::x::value> >
-    toCursor() const
-    {
-        return pmacc::cursor::CT::BufferCursor<ValueType, ::pmacc::math::CT::Int<sizeof (ValueType) * Size::x::value> >
-            ((ValueType*) fixedPointer);
-    }
-
-protected:
-
-    PMACC_ALIGN(fixedPointer, ValueType*);
-};
-
-template<typename T_TYPE, class T_Vector, uint32_t T_id>
-class SharedBox<T_TYPE, T_Vector, T_id, DIM3>
-{
-public:
-
-    enum
-    {
-        Dim = DIM3
+    public:
+        enum
+        {
+            Dim = DIM2
+        };
+        typedef T_TYPE ValueType;
+        typedef ValueType& RefValueType;
+        typedef T_Vector Size;
+        typedef SharedBox<ValueType, math::CT::Int<Size::x::value>, T_id> ReducedType;
+        typedef SharedBox<ValueType, T_Vector, T_id, DIM2> This;
+
+        HDINLINE SharedBox(ValueType* pointer = nullptr) : fixedPointer(pointer)
+        {
+        }
+
+        HDINLINE ReducedType operator[](const int idx)
+        {
+            return ReducedType(this->fixedPointer + idx * Size::x::value);
+        }
+
+        HDINLINE ReducedType operator[](const int idx) const
+        {
+            return ReducedType(this->fixedPointer + idx * Size::x::value);
+        }
+
+        /*!return the first value in the box (list)
+         * @return first value
+         */
+        HDINLINE RefValueType operator*()
+        {
+            return *((ValueType*) fixedPointer);
+        }
+
+        HDINLINE ValueType const* getPointer() const
+        {
+            return fixedPointer;
+        }
+        HDINLINE ValueType* getPointer()
+        {
+            return fixedPointer;
+        }
+
+        /** create a shared memory box
+         *
+         * This call synchronizes a block and must be called from all threads and
+         * not inside an if clause
+         */
+        template<typename T_Acc>
+        static DINLINE SharedBox init(T_Acc const& acc)
+        {
+            auto& mem_sh
+                = pmacc::memory::shared::allocate<T_id, memory::Array<ValueType, math::CT::volume<Size>::type::value>>(
+                    acc);
+            return SharedBox(mem_sh.data());
+        }
+
+        HDINLINE pmacc::cursor::CT::BufferCursor<ValueType, ::pmacc::math::CT::Int<sizeof(ValueType) * Size::x::value>>
+        toCursor() const
+        {
+            return pmacc::cursor::CT::
+                BufferCursor<ValueType, ::pmacc::math::CT::Int<sizeof(ValueType) * Size::x::value>>(
+                    (ValueType*) fixedPointer);
+        }
+
+    protected:
+        PMACC_ALIGN(fixedPointer, ValueType*);
     };
-    typedef T_TYPE ValueType;
-    typedef ValueType& RefValueType;
-    typedef T_Vector Size;
-    typedef SharedBox<ValueType, math::CT::Int<Size::x::value, Size::y::value>, T_id > ReducedType;
-    typedef SharedBox<ValueType, T_Vector, T_id, DIM3 > This;
-
-    HDINLINE ReducedType operator[](const int idx)
-    {
-        return ReducedType(this->fixedPointer + idx *  (Size::x::value * Size::y::value));
-    }
-
-    HDINLINE ReducedType operator[](const int idx) const
-    {
-        return ReducedType(this->fixedPointer + idx *  (Size::x::value *Size::y::value));
-    }
 
-    HDINLINE SharedBox(ValueType* pointer = nullptr) :
-    fixedPointer(pointer)
+    template<typename T_TYPE, class T_Vector, uint32_t T_id>
+    class SharedBox<T_TYPE, T_Vector, T_id, DIM3>
     {
-    }
-
-    /*!return the first value in the box (list)
-     * @return first value
-     */
-    HDINLINE RefValueType operator*()
-    {
-        return *(fixedPointer);
-    }
-
-    HDINLINE ValueType const * getPointer() const
-    {
-        return fixedPointer;
-    }
-    HDINLINE ValueType* getPointer()
-    {
-        return fixedPointer;
-    }
-
-    HDINLINE pmacc::cursor::CT::BufferCursor<ValueType, ::pmacc::math::CT::Int<sizeof (ValueType) * Size::x::value,
-    sizeof (ValueType) * Size::x::value * Size::y::value> >
-    toCursor() const
-    {
-        return pmacc::cursor::CT::BufferCursor<ValueType, ::pmacc::math::CT::Int<sizeof (ValueType) * Size::x::value,
-            sizeof (ValueType) * Size::x::value * Size::y::value> >
-            ((ValueType*)fixedPointer);
-    }
-
-    /** create a shared memory box
-     *
-     * This call synchronizes a block and must be called from all threads and
-     * not inside a if clauses
-     */
-    template< typename T_Acc >
-    static DINLINE SharedBox
-    init( T_Acc const & acc )
-    {
-        auto& mem_sh = pmacc::memory::shared::allocate<
-            T_id,
-            memory::Array<
+    public:
+        enum
+        {
+            Dim = DIM3
+        };
+        typedef T_TYPE ValueType;
+        typedef ValueType& RefValueType;
+        typedef T_Vector Size;
+        typedef SharedBox<ValueType, math::CT::Int<Size::x::value, Size::y::value>, T_id> ReducedType;
+        typedef SharedBox<ValueType, T_Vector, T_id, DIM3> This;
+
+        HDINLINE ReducedType operator[](const int idx)
+        {
+            return ReducedType(this->fixedPointer + idx * (Size::x::value * Size::y::value));
+        }
+
+        HDINLINE ReducedType operator[](const int idx) const
+        {
+            return ReducedType(this->fixedPointer + idx * (Size::x::value * Size::y::value));
+        }
+
+        HDINLINE SharedBox(ValueType* pointer = nullptr) : fixedPointer(pointer)
+        {
+        }
+
+        /*!return the first value in the box (list)
+         * @return first value
+         */
+        HDINLINE RefValueType operator*()
+        {
+            return *(fixedPointer);
+        }
+
+        HDINLINE ValueType const* getPointer() const
+        {
+            return fixedPointer;
+        }
+        HDINLINE ValueType* getPointer()
+        {
+            return fixedPointer;
+        }
+
+        HDINLINE pmacc::cursor::CT::BufferCursor<
+            ValueType,
+            ::pmacc::math::CT::
+                Int<sizeof(ValueType) * Size::x::value, sizeof(ValueType) * Size::x::value * Size::y::value>>
+        toCursor() const
+        {
+            return pmacc::cursor::CT::BufferCursor<
                 ValueType,
-                math::CT::volume< Size >::type::value
-            >
-        >( acc );
-        return SharedBox( mem_sh.data() );
-    }
-
-protected:
-
-    PMACC_ALIGN(fixedPointer, ValueType*);
-
-};
-
+                ::pmacc::math::CT::
+                    Int<sizeof(ValueType) * Size::x::value, sizeof(ValueType) * Size::x::value * Size::y::value>>(
+                (ValueType*) fixedPointer);
+        }
+
+        /** create a shared memory box
+         *
+         * This call synchronizes a block and must be called from all threads and
+         * not inside an if clause
+         */
+        template<typename T_Acc>
+        static DINLINE SharedBox init(T_Acc const& acc)
+        {
+            auto& mem_sh
+                = pmacc::memory::shared::allocate<T_id, memory::Array<ValueType, math::CT::volume<Size>::type::value>>(
+                    acc);
+            return SharedBox(mem_sh.data());
+        }
+
+    protected:
+        PMACC_ALIGN(fixedPointer, ValueType*);
+    };
 
-}
 
+} // namespace pmacc
diff --git a/include/pmacc/memory/buffers/Buffer.hpp b/include/pmacc/memory/buffers/Buffer.hpp
index 26fae41b6a..7916336a7b 100644
--- a/include/pmacc/memory/buffers/Buffer.hpp
+++ b/include/pmacc/memory/buffers/Buffer.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera, Benjamin Worpitz
+/* Copyright 2013-2021 Heiko Burau, Rene Widera, Benjamin Worpitz
  *
  * This file is part of PMacc.
  *
@@ -32,19 +32,17 @@
 
 namespace pmacc
 {
-
     /**
      * Minimal function description of a buffer,
      *
      * @tparam TYPE data type stored in the buffer
      * @tparam DIM dimension of the buffer (1-3)
      */
-    template <class TYPE, unsigned DIM>
+    template<class TYPE, unsigned DIM>
     class Buffer
     {
     public:
-
-        typedef DataBox<PitchedBox<TYPE, DIM> > DataBoxType;
+        typedef DataBox<PitchedBox<TYPE, DIM>> DataBoxType;
 
         /** constructor
          *
@@ -53,10 +51,13 @@ namespace pmacc
          *             can be less than `physicalMemorySize`
          * @param physicalMemorySize size of the physical memory (in elements)
          */
-        Buffer(DataSpace<DIM> size, DataSpace<DIM> physicalMemorySize) :
-        data_space(size), data1D(true), current_size(nullptr), m_physicalMemorySize(physicalMemorySize)
+        Buffer(DataSpace<DIM> size, DataSpace<DIM> physicalMemorySize)
+            : data_space(size)
+            , data1D(true)
+            , current_size(nullptr)
+            , m_physicalMemorySize(physicalMemorySize)
         {
-            CUDA_CHECK(cudaMallocHost((void**)&current_size, sizeof (size_t)));
+            CUDA_CHECK(cuplaMallocHost((void**) &current_size, sizeof(size_t)));
             *current_size = size.productOfComponents();
         }
 
@@ -65,7 +66,7 @@ namespace pmacc
          */
         virtual ~Buffer()
         {
-            CUDA_CHECK_NO_EXCEPT(cudaFreeHost(current_size));
+            CUDA_CHECK_NO_EXCEPT(cuplaFreeHost(current_size));
         }
 
         /*! Get base pointer to memory
@@ -110,41 +111,44 @@ namespace pmacc
             int64_t current_size = static_cast<int64_t>(currentSize);
 
             //!\todo: current size can be changed if it is a DeviceBuffer and current size is on device
-            //call first get current size (but const not allow this)
+            // get the current size first (but const does not allow this)
 
-            if (DIM == DIM1)
+            if(DIM == DIM1)
             {
                 tmp[0] = current_size;
             }
-            if (DIM == DIM2)
+            if(DIM == DIM2)
             {
-                if (current_size <= data_space[0])
+                if(current_size <= data_space[0])
                 {
                     tmp[0] = current_size;
                     tmp[1] = 1;
-                } else
+                }
+                else
                 {
                     tmp[0] = data_space[0];
-                    tmp[1] = (current_size+data_space[0]-1) / data_space[0];
+                    tmp[1] = (current_size + data_space[0] - 1) / data_space[0];
                 }
             }
-            if (DIM == DIM3)
+            if(DIM == DIM3)
             {
-                if (current_size <= data_space[0])
+                if(current_size <= data_space[0])
                 {
                     tmp[0] = current_size;
                     tmp[1] = 1;
                     tmp[2] = 1;
-                } else if (current_size <= (data_space[0] * data_space[1]))
+                }
+                else if(current_size <= (data_space[0] * data_space[1]))
                 {
                     tmp[0] = data_space[0];
-                    tmp[1] = (current_size+data_space[0]-1) / data_space[0];
+                    tmp[1] = (current_size + data_space[0] - 1) / data_space[0];
                     tmp[2] = 1;
-                } else
+                }
+                else
                 {
                     tmp[0] = data_space[0];
                     tmp[1] = data_space[1];
-                    tmp[2] = (current_size+(data_space[0] * data_space[1])-1) / (data_space[0] * data_space[1]);
+                    tmp[2] = (current_size + (data_space[0] * data_space[1]) - 1) / (data_space[0] * data_space[1]);
                 }
             }
 
@@ -170,11 +174,11 @@ namespace pmacc
             *current_size = newsize;
         }
 
-        virtual void reset(bool preserveData = false)=0;
+        virtual void reset(bool preserveData = false) = 0;
 
-        virtual void setValue(const TYPE& value)=0;
+        virtual void setValue(const TYPE& value) = 0;
 
-        virtual DataBox<PitchedBox<TYPE,DIM> > getDataBox()=0;
+        virtual DataBox<PitchedBox<TYPE, DIM>> getDataBox() = 0;
 
         inline bool is1D()
         {
@@ -182,7 +186,6 @@ namespace pmacc
         }
 
     protected:
-
         /*! Check if my DataSpace is greater than other.
          * @param other other DataSpace
          * @return true if my DataSpace (one dimension) is greater than other, false otherwise
@@ -195,10 +198,9 @@ namespace pmacc
         DataSpace<DIM> data_space;
         DataSpace<DIM> m_physicalMemorySize;
 
-        size_t *current_size;
+        size_t* current_size;
 
         bool data1D;
-
     };
 
-} //namespace pmacc
+} // namespace pmacc
diff --git a/include/pmacc/memory/buffers/DeviceBuffer.hpp b/include/pmacc/memory/buffers/DeviceBuffer.hpp
index 5ad1c73eb9..1a2f344070 100644
--- a/include/pmacc/memory/buffers/DeviceBuffer.hpp
+++ b/include/pmacc/memory/buffers/DeviceBuffer.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera, Benjamin Worpitz
+/* Copyright 2013-2021 Heiko Burau, Rene Widera, Benjamin Worpitz
  *                     Alexander Grund
  *
  * This file is part of PMacc.
@@ -31,18 +31,16 @@
 #include "pmacc/types.hpp"
 
 
-
-
 #include <stdexcept>
 
 namespace pmacc
 {
     class EventTask;
 
-    template <class TYPE, unsigned DIM>
+    template<class TYPE, unsigned DIM>
     class HostBuffer;
 
-    template <class TYPE, unsigned DIM>
+    template<class TYPE, unsigned DIM>
     class Buffer;
 
     /**
@@ -51,11 +49,10 @@ namespace pmacc
      * @tparam TYPE datatype of the buffer
      * @tparam DIM dimension of the buffer
      */
-    template <class TYPE, unsigned DIM>
+    template<class TYPE, unsigned DIM>
     class DeviceBuffer : public Buffer<TYPE, DIM>
     {
     protected:
-
         /** constructor
          *
          * @param size extent for each dimension (in elements)
@@ -63,36 +60,35 @@ namespace pmacc
          *             can be less than `physicalMemorySize`
          * @param physicalMemorySize size of the physical memory (in elements)
          */
-        DeviceBuffer(DataSpace<DIM> size, DataSpace<DIM> physicalMemorySize) :
-        Buffer<TYPE, DIM>(size, physicalMemorySize)
+        DeviceBuffer(DataSpace<DIM> size, DataSpace<DIM> physicalMemorySize)
+            : Buffer<TYPE, DIM>(size, physicalMemorySize)
         {
-
         }
 
     public:
-
         using Buffer<TYPE, DIM>::setCurrentSize; //!\todo :this function was hidden, I don't know why.
 
         /**
          * Destructor.
          */
-        virtual ~DeviceBuffer()
-        {
-        };
+        virtual ~DeviceBuffer(){};
 
         HINLINE
-        container::CartBuffer<TYPE, DIM, allocator::DeviceMemAllocator<TYPE, DIM>,
-                                copier::D2DCopier<DIM>,
-                                assigner::DeviceMemAssigner<> >
+        container::CartBuffer<
+            TYPE,
+            DIM,
+            allocator::DeviceMemAllocator<TYPE, DIM>,
+            copier::D2DCopier<DIM>,
+            assigner::DeviceMemAssigner<>>
         cartBuffer() const
         {
-            cudaPitchedPtr cudaData = this->getCudaPitched();
+            cuplaPitchedPtr cuplaData = this->getCudaPitched();
             math::Size_t<DIM - 1> pitch;
             if(DIM >= 2)
-                pitch[0] = cudaData.pitch;
+                pitch[0] = cuplaData.pitch;
             if(DIM == 3)
                 pitch[1] = pitch[0] * this->getPhysicalMemorySize()[1];
-            container::DeviceBuffer<TYPE, DIM> result((TYPE*)cudaData.ptr, this->getDataSpace(), false, pitch);
+            container::DeviceBuffer<TYPE, DIM> result((TYPE*) cuplaData.ptr, this->getDataSpace(), false, pitch);
             return result;
         }
 
@@ -121,7 +117,7 @@ namespace pmacc
          *
          * @return pointer to stored value on host side
          */
-        virtual size_t* getCurrentSizeHostSidePointer()=0;
+        virtual size_t* getCurrentSizeHostSidePointer() = 0;
 
         /**
          * Sets current size of any dimension.
@@ -135,11 +131,11 @@ namespace pmacc
         virtual void setCurrentSize(const size_t size) = 0;
 
         /**
-         * Returns the internal pitched cuda pointer.
+         * Returns the internal pitched cupla pointer.
          *
-         * @return internal pitched cuda pointer
+         * @return internal pitched cupla pointer
          */
-        virtual const cudaPitchedPtr getCudaPitched() const = 0;
+        virtual const cuplaPitchedPtr getCudaPitched() const = 0;
 
         /** get line pitch of memory in byte
          *
@@ -160,7 +156,6 @@ namespace pmacc
          * @param other the DeviceBuffer to copy from
          */
         virtual void copyFrom(DeviceBuffer<TYPE, DIM>& other) = 0;
-
     };
 
-} //namespace pmacc
+} // namespace pmacc
diff --git a/include/pmacc/memory/buffers/DeviceBufferIntern.hpp b/include/pmacc/memory/buffers/DeviceBufferIntern.hpp
index f056b3ddac..b5618b41bd 100644
--- a/include/pmacc/memory/buffers/DeviceBufferIntern.hpp
+++ b/include/pmacc/memory/buffers/DeviceBufferIntern.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Heiko Burau, Rene Widera, Benjamin Worpitz,
+/* Copyright 2013-2021 Axel Huebl, Heiko Burau, Rene Widera, Benjamin Worpitz,
  *                     Alexander Grund
  *
  * This file is part of PMacc.
@@ -26,293 +26,289 @@
 #include "pmacc/eventSystem/tasks/Factory.hpp"
 #include "pmacc/memory/buffers/DeviceBuffer.hpp"
 #include "pmacc/memory/boxes/DataBox.hpp"
+#include "pmacc/memory/Array.hpp"
 #include "pmacc/assert.hpp"
 
 namespace pmacc
 {
-
-/**
- * Internal device buffer implementation.
- */
-template <class TYPE, unsigned DIM>
-class DeviceBufferIntern : public DeviceBuffer<TYPE, DIM>
-{
-public:
-
-    typedef typename DeviceBuffer<TYPE, DIM>::DataBoxType DataBoxType;
-
-    /*! create device buffer
-     * @param size extent for each dimension (in elements)
-     * @param sizeOnDevice memory with the current size of the grid is stored on device
-     * @param useVectorAsBase use a vector as base of the array (is not lined pitched)
-     *                      if true size on device is atomaticly set to false
+    /**
+     * Internal device buffer implementation.
      */
-    DeviceBufferIntern(DataSpace<DIM> size, bool sizeOnDevice = false, bool useVectorAsBase = false) :
-    DeviceBuffer<TYPE, DIM>(size, size),
-    sizeOnDevice(sizeOnDevice),
-    useOtherMemory(false),
-    offset(DataSpace<DIM>())
+    template<class TYPE, unsigned DIM>
+    class DeviceBufferIntern : public DeviceBuffer<TYPE, DIM>
     {
-        //create size on device before any use of setCurrentSize
-        if (useVectorAsBase)
+    public:
+        typedef typename DeviceBuffer<TYPE, DIM>::DataBoxType DataBoxType;
+
+        /*! create device buffer
+         * @param size extent for each dimension (in elements)
+         * @param sizeOnDevice memory with the current size of the grid is stored on device
+         * @param useVectorAsBase use a vector as base of the array (is not lined pitched)
+         *                      if true, size on device is automatically set to false
+         */
+        DeviceBufferIntern(DataSpace<DIM> size, bool sizeOnDevice = false, bool useVectorAsBase = false)
+            : DeviceBuffer<TYPE, DIM>(size, size)
+            , sizeOnDevice(sizeOnDevice)
+            , useOtherMemory(false)
+            , offset(DataSpace<DIM>())
         {
-            this->sizeOnDevice = false;
-            createSizeOnDevice(this->sizeOnDevice);
-            createFakeData();
-            this->data1D = true;
+            // create size on device before any use of setCurrentSize
+            if(useVectorAsBase)
+            {
+                this->sizeOnDevice = false;
+                createSizeOnDevice(this->sizeOnDevice);
+                createFakeData();
+                this->data1D = true;
+            }
+            else
+            {
+                createSizeOnDevice(this->sizeOnDevice);
+                createData();
+                this->data1D = false;
+            }
         }
-        else
+
+        DeviceBufferIntern(
+            DeviceBuffer<TYPE, DIM>& source,
+            DataSpace<DIM> size,
+            DataSpace<DIM> offset,
+            bool sizeOnDevice = false)
+            : DeviceBuffer<TYPE, DIM>(size, source.getPhysicalMemorySize())
+            , sizeOnDevice(sizeOnDevice)
+            , offset(offset + source.getOffset())
+            , data(source.getCudaPitched())
+            , useOtherMemory(true)
         {
-            createSizeOnDevice(this->sizeOnDevice);
-            createData();
+            createSizeOnDevice(sizeOnDevice);
             this->data1D = false;
         }
 
-    }
-
-    DeviceBufferIntern(DeviceBuffer<TYPE, DIM>& source, DataSpace<DIM> size, DataSpace<DIM> offset, bool sizeOnDevice = false) :
-    DeviceBuffer<TYPE, DIM>(size, source.getPhysicalMemorySize()),
-    sizeOnDevice(sizeOnDevice),
-    offset(offset + source.getOffset()),
-    data(source.getCudaPitched()),
-    useOtherMemory(true)
-    {
-        createSizeOnDevice(sizeOnDevice);
-        this->data1D = false;
-    }
-
-    virtual ~DeviceBufferIntern()
-    {
-        __startOperation(ITask::TASK_CUDA);
-
-        if (sizeOnDevice)
-        {
-            CUDA_CHECK_NO_EXCEPT(cudaFree(sizeOnDevicePtr));
-        }
-        if (!useOtherMemory)
+        virtual ~DeviceBufferIntern()
         {
-            CUDA_CHECK_NO_EXCEPT(cudaFree(data.ptr));
+            __startOperation(ITask::TASK_DEVICE);
 
+            if(sizeOnDevice)
+            {
+                CUDA_CHECK_NO_EXCEPT(cuplaFree(sizeOnDevicePtr));
+            }
+            if(!useOtherMemory)
+            {
+                CUDA_CHECK_NO_EXCEPT(cuplaFree(data.ptr));
+            }
         }
-    }
-
-    void reset(bool preserveData = true)
-    {
-        this->setCurrentSize(Buffer<TYPE, DIM>::getDataSpace().productOfComponents());
 
-        __startOperation(ITask::TASK_CUDA);
-        if (!preserveData)
+        void reset(bool preserveData = true)
         {
-            TYPE value;
-            /* using `uint8_t` for byte-wise looping through tmp var value of `TYPE` */
-            uint8_t* valuePtr = reinterpret_cast<uint8_t*>(&value);
-            for( size_t b = 0; b < sizeof(TYPE); ++b)
+            this->setCurrentSize(Buffer<TYPE, DIM>::getDataSpace().productOfComponents());
+
+            __startOperation(ITask::TASK_DEVICE);
+            if(!preserveData)
             {
-                valuePtr[b] = static_cast<uint8_t>(0);
+                // Using Array is a workaround for types without default constructor
+                memory::Array<TYPE, 1> tmp;
+                memset(reinterpret_cast<void*>(tmp.data()), 0, sizeof(tmp));
+                // use first element to avoid issue because Array is aligned (sizeof can be larger than component type)
+                setValue(tmp[0]);
             }
-            /* set value with zero-ed `TYPE` */
-            setValue(value);
         }
-    }
-
-    DataBoxType getDataBox()
-    {
-        __startOperation(ITask::TASK_CUDA);
-        return DataBoxType(PitchedBox<TYPE, DIM > ((TYPE*) data.ptr, offset,
-                                                   this->getPhysicalMemorySize(), data.pitch));
-    }
 
-    TYPE* getPointer()
-    {
-        __startOperation(ITask::TASK_CUDA);
-
-        if (DIM == DIM1)
+        DataBoxType getDataBox()
         {
-            return (TYPE*) (data.ptr) + this->offset[0];
+            __startOperation(ITask::TASK_DEVICE);
+            return DataBoxType(
+                PitchedBox<TYPE, DIM>((TYPE*) data.ptr, offset, this->getPhysicalMemorySize(), data.pitch));
         }
-        else if (DIM == DIM2)
+
+        TYPE* getPointer()
         {
-            return (TYPE*) ((char*) data.ptr + this->offset[1] * this->data.pitch) + this->offset[0];
+            __startOperation(ITask::TASK_DEVICE);
+
+            if(DIM == DIM1)
+            {
+                return (TYPE*) (data.ptr) + this->offset[0];
+            }
+            else if(DIM == DIM2)
+            {
+                return (TYPE*) ((char*) data.ptr + this->offset[1] * this->data.pitch) + this->offset[0];
+            }
+            else
+            {
+                const size_t offsetY = this->offset[1] * this->data.pitch;
+                const size_t sizePlaneXY = this->getPhysicalMemorySize()[1] * this->data.pitch;
+                return (TYPE*) ((char*) data.ptr + this->offset[2] * sizePlaneXY + offsetY) + this->offset[0];
+            }
         }
-        else
+
+        DataSpace<DIM> getOffset() const
         {
-            const size_t offsetY = this->offset[1] * this->data.pitch;
-            const size_t sizePlaneXY = this->getPhysicalMemorySize()[1] * this->data.pitch;
-            return (TYPE*) ((char*) data.ptr + this->offset[2] * sizePlaneXY + offsetY) + this->offset[0];
+            return offset;
         }
-    }
-
-    DataSpace<DIM> getOffset() const
-    {
-        return offset;
-    }
 
-    bool hasCurrentSizeOnDevice() const
-    {
-        return sizeOnDevice;
-    }
-
-    size_t* getCurrentSizeOnDevicePointer()
-    {
-        __startOperation(ITask::TASK_CUDA);
-        if (!sizeOnDevice)
+        bool hasCurrentSizeOnDevice() const
         {
-            throw std::runtime_error("Buffer has no size on device!, currentSize is only stored on host side.");
+            return sizeOnDevice;
         }
-        return sizeOnDevicePtr;
-    }
-
-    size_t* getCurrentSizeHostSidePointer()
-    {
-        __startOperation(ITask::TASK_HOST);
-        return this->current_size;
-    }
-
-    TYPE* getBasePointer()
-    {
-        __startOperation(ITask::TASK_CUDA);
-        return (TYPE*) data.ptr;
-    }
 
-    /*! Get current size of any dimension
-     * @return count of current elements per dimension
-     */
-    virtual size_t getCurrentSize()
-    {
-        if (sizeOnDevice)
+        size_t* getCurrentSizeOnDevicePointer()
         {
-            __startTransaction(__getTransactionEvent());
-            Environment<>::get().Factory().createTaskGetCurrentSizeFromDevice(*this);
-            __endTransaction().waitForFinished();
+            __startOperation(ITask::TASK_DEVICE);
+            if(!sizeOnDevice)
+            {
+                throw std::runtime_error("Buffer has no size on device!, currentSize is only stored on host side.");
+            }
+            return sizeOnDevicePtr;
         }
 
-        return DeviceBuffer<TYPE, DIM>::getCurrentSize();
-    }
-
-    virtual void setCurrentSize(const size_t size)
-    {
-        Buffer<TYPE, DIM>::setCurrentSize(size);
-
-        if (sizeOnDevice)
+        size_t* getCurrentSizeHostSidePointer()
         {
-            Environment<>::get().Factory().createTaskSetCurrentSizeOnDevice(
-                                                                            *this, size);
+            __startOperation(ITask::TASK_HOST);
+            return this->current_size;
         }
-    }
-
-    void copyFrom(HostBuffer<TYPE, DIM>& other)
-    {
-
-        PMACC_ASSERT(this->isMyDataSpaceGreaterThan(other.getCurrentDataSpace()));
-        Environment<>::get().Factory().createTaskCopyHostToDevice(other, *this);
-
-    }
-
-    void copyFrom(DeviceBuffer<TYPE, DIM>& other)
-    {
 
-        PMACC_ASSERT(this->isMyDataSpaceGreaterThan(other.getCurrentDataSpace()));
-        Environment<>::get().Factory().createTaskCopyDeviceToDevice(other, *this);
-
-    }
-
-    const cudaPitchedPtr getCudaPitched() const
-    {
-        __startOperation(ITask::TASK_CUDA);
-        return data;
-    }
+        TYPE* getBasePointer()
+        {
+            __startOperation(ITask::TASK_DEVICE);
+            return (TYPE*) data.ptr;
+        }
 
-    size_t getPitch() const
-    {
-        return data.pitch;
-    }
+        /*! Get current size of any dimension
+         * @return count of current elements per dimension
+         */
+        virtual size_t getCurrentSize()
+        {
+            if(sizeOnDevice)
+            {
+                __startTransaction(__getTransactionEvent());
+                Environment<>::get().Factory().createTaskGetCurrentSizeFromDevice(*this);
+                __endTransaction().waitForFinished();
+            }
 
-    virtual void setValue(const TYPE& value)
-    {
-        Environment<>::get().Factory().createTaskSetValue(*this, value);
-    };
+            return DeviceBuffer<TYPE, DIM>::getCurrentSize();
+        }
 
-private:
+        virtual void setCurrentSize(const size_t size)
+        {
+            Buffer<TYPE, DIM>::setCurrentSize(size);
 
-    /*! create native array with pitched lines
-     */
-    void createData()
-    {
-        __startOperation(ITask::TASK_CUDA);
-        data.ptr = nullptr;
-        data.pitch = 1;
-        data.xsize = this->getDataSpace()[0] * sizeof (TYPE);
-        data.ysize = 1;
+            if(sizeOnDevice)
+            {
+                Environment<>::get().Factory().createTaskSetCurrentSizeOnDevice(*this, size);
+            }
+        }
 
-        if (DIM == DIM1)
+        void copyFrom(HostBuffer<TYPE, DIM>& other)
         {
-            log<ggLog::MEMORY >("Create device 1D data: %1% MiB") % (data.xsize / 1024 / 1024);
-            CUDA_CHECK(cudaMallocPitch(&data.ptr, &data.pitch, data.xsize, 1));
+            PMACC_ASSERT(this->isMyDataSpaceGreaterThan(other.getCurrentDataSpace()));
+            Environment<>::get().Factory().createTaskCopyHostToDevice(other, *this);
         }
-        if (DIM == DIM2)
-        {
-            data.ysize = this->getDataSpace()[1];
-            log<ggLog::MEMORY >("Create device 2D data: %1% MiB") % (data.xsize * data.ysize / 1024 / 1024);
-            CUDA_CHECK(cudaMallocPitch(&data.ptr, &data.pitch, data.xsize, data.ysize));
 
-        }
-        if (DIM == DIM3)
+        void copyFrom(DeviceBuffer<TYPE, DIM>& other)
         {
-            cudaExtent extent;
-            extent.width = this->getDataSpace()[0] * sizeof (TYPE);
-            extent.height = this->getDataSpace()[1];
-            extent.depth = this->getDataSpace()[2];
+            PMACC_ASSERT(this->isMyDataSpaceGreaterThan(other.getCurrentDataSpace()));
+            Environment<>::get().Factory().createTaskCopyDeviceToDevice(other, *this);
+        }
 
-            log<ggLog::MEMORY >("Create device 3D data: %1% MiB") % (this->getDataSpace().productOfComponents() * sizeof (TYPE) / 1024 / 1024);
-            CUDA_CHECK(cudaMalloc3D(&data, extent));
+        const cuplaPitchedPtr getCudaPitched() const
+        {
+            __startOperation(ITask::TASK_DEVICE);
+            return data;
         }
 
-        reset(false);
-    }
+        size_t getPitch() const
+        {
+            return data.pitch;
+        }
 
-    /*!create 1D, 2D, 3D Array which use only a vector as base
-     */
-    void createFakeData()
-    {
-        __startOperation(ITask::TASK_CUDA);
-        data.ptr = nullptr;
-        data.pitch = 1;
-        data.xsize = this->getDataSpace()[0] * sizeof (TYPE);
-        data.ysize = 1;
+        virtual void setValue(const TYPE& value)
+        {
+            Environment<>::get().Factory().createTaskSetValue(*this, value); // delegated to the task factory
+        }
 
-        log<ggLog::MEMORY >("Create device fake data: %1% MiB") % (this->getDataSpace().productOfComponents() * sizeof (TYPE) / 1024 / 1024);
-        CUDA_CHECK(cudaMallocPitch(&data.ptr, &data.pitch, this->getDataSpace().productOfComponents() * sizeof (TYPE), 1));
+    private:
+        /*! create native array with pitched lines
+         */
+        void createData()
+        {
+            __startOperation(ITask::TASK_DEVICE);
+            data.ptr = nullptr;
+            data.pitch = 1;
+            data.xsize = this->getDataSpace()[0] * sizeof(TYPE);
+            data.ysize = 1;
 
-        //fake the pitch, thus we can use this 1D Buffer as 2D or 3D
-        data.pitch = this->getDataSpace()[0] * sizeof (TYPE);
+            if(DIM == DIM1)
+            {
+                log<ggLog::MEMORY>("Create device 1D data: %1% MiB") % (data.xsize / 1024 / 1024);
+                CUDA_CHECK(cuplaMallocPitch(&data.ptr, &data.pitch, data.xsize, 1));
+            }
+            if(DIM == DIM2)
+            {
+                data.ysize = this->getDataSpace()[1];
+                log<ggLog::MEMORY>("Create device 2D data: %1% MiB") % (data.xsize * data.ysize / 1024 / 1024);
+                CUDA_CHECK(cuplaMallocPitch(&data.ptr, &data.pitch, data.xsize, data.ysize));
+            }
+            if(DIM == DIM3)
+            {
+                cuplaExtent extent;
+                extent.width = this->getDataSpace()[0] * sizeof(TYPE);
+                extent.height = this->getDataSpace()[1];
+                extent.depth = this->getDataSpace()[2];
+
+                log<ggLog::MEMORY>("Create device 3D data: %1% MiB")
+                    % (this->getDataSpace().productOfComponents() * sizeof(TYPE) / 1024 / 1024);
+                CUDA_CHECK(cuplaMalloc3D(&data, extent));
+            }
 
-        if (DIM > DIM1)
-        {
-            data.ysize = this->getDataSpace()[1];
+            reset(false);
         }
 
-        reset(false);
-    }
+        /*! create a 1D, 2D or 3D array which uses only a vector as base
+         */
+        void createFakeData()
+        {
+            __startOperation(ITask::TASK_DEVICE);
+            data.ptr = nullptr;
+            data.pitch = 1;
+            data.xsize = this->getDataSpace()[0] * sizeof(TYPE);
+            data.ysize = 1;
+
+            log<ggLog::MEMORY>("Create device fake data: %1% MiB")
+                % (this->getDataSpace().productOfComponents() * sizeof(TYPE) / 1024 / 1024);
+            CUDA_CHECK(cuplaMallocPitch(
+                &data.ptr,
+                &data.pitch,
+                this->getDataSpace().productOfComponents() * sizeof(TYPE),
+                1));
+
+            // fake the pitch, thus we can use this 1D Buffer as 2D or 3D
+            data.pitch = this->getDataSpace()[0] * sizeof(TYPE);
+
+            if(DIM > DIM1)
+            {
+                data.ysize = this->getDataSpace()[1];
+            }
 
-    void createSizeOnDevice(bool sizeOnDevice)
-    {
-        __startOperation(ITask::TASK_HOST);
-        sizeOnDevicePtr = nullptr;
+            reset(false);
+        }
 
-        if (sizeOnDevice)
+        void createSizeOnDevice(bool sizeOnDevice)
         {
-            CUDA_CHECK(cudaMalloc((void**)&sizeOnDevicePtr, sizeof (size_t)));
+            __startOperation(ITask::TASK_HOST);
+            sizeOnDevicePtr = nullptr;
+
+            if(sizeOnDevice)
+            {
+                CUDA_CHECK(cuplaMalloc((void**) &sizeOnDevicePtr, sizeof(size_t)));
+            }
+            setCurrentSize(this->getDataSpace().productOfComponents());
         }
-        setCurrentSize(this->getDataSpace().productOfComponents());
-    }
 
-private:
-    DataSpace<DIM> offset;
+    private:
+        DataSpace<DIM> offset;
 
-    bool sizeOnDevice;
-    size_t* sizeOnDevicePtr;
-    cudaPitchedPtr data;
-    bool useOtherMemory;
-};
+        bool sizeOnDevice;
+        size_t* sizeOnDevicePtr;
+        cuplaPitchedPtr data;
+        bool useOtherMemory;
+    };
 
-} //namespace pmacc
+} // namespace pmacc
diff --git a/include/pmacc/memory/buffers/Exchange.hpp b/include/pmacc/memory/buffers/Exchange.hpp
index 0ee14142e1..e7243b1841 100644
--- a/include/pmacc/memory/buffers/Exchange.hpp
+++ b/include/pmacc/memory/buffers/Exchange.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Rene Widera, Benjamin Worpitz
+/* Copyright 2013-2021 Rene Widera, Benjamin Worpitz
  *
  * This file is part of PMacc.
  *
@@ -26,7 +26,6 @@
 
 namespace pmacc
 {
-
     /**
      * Interface for a DIM-dimensional buffer used for data exchange.
      *
@@ -38,24 +37,23 @@ namespace pmacc
      * @tparam TYPE the datatype for internal buffers
      * @tparam DIM the dimension of the internal buffers
      */
-    template <class TYPE, unsigned DIM>
+    template<class TYPE, unsigned DIM>
     class Exchange
     {
     public:
-
         /**
          * Returns the exchange buffer on the device.
          *
          * @return Exchange buffer on device
          */
-        virtual DeviceBuffer<TYPE, DIM> &getDeviceBuffer() = 0;
+        virtual DeviceBuffer<TYPE, DIM>& getDeviceBuffer() = 0;
 
         /**
          * Returns the exchange buffer on the host.
          *
          * @return Exchange buffer on host
          */
-        virtual HostBuffer <TYPE, DIM> &getHostBuffer() = 0;
+        virtual HostBuffer<TYPE, DIM>& getHostBuffer() = 0;
 
         /**
          * Returns the type describing exchange directions
@@ -77,21 +75,24 @@ namespace pmacc
             return communicationTag;
         }
 
-        virtual bool hasDeviceDoubleBuffer()=0;
+        /**
+         * Return the buffer which can be used for data exchange with MPI
+         *
+         * The buffer can point to device or host memory.
+         */
+        virtual Buffer<TYPE, DIM>* getCommunicationBuffer() = 0;
+
+        virtual bool hasDeviceDoubleBuffer() = 0;
 
-        virtual DeviceBuffer<TYPE, DIM>& getDeviceDoubleBuffer()=0;
+        virtual DeviceBuffer<TYPE, DIM>& getDeviceDoubleBuffer() = 0;
 
     protected:
-
-        Exchange(uint32_t extype, uint32_t tag) :
-        exchange(extype),
-        communicationTag(tag)
+        Exchange(uint32_t extype, uint32_t tag) : exchange(extype), communicationTag(tag)
         {
-
         }
 
         uint32_t exchange;
         uint32_t communicationTag;
     };
 
-}
+} // namespace pmacc
diff --git a/include/pmacc/memory/buffers/ExchangeIntern.hpp b/include/pmacc/memory/buffers/ExchangeIntern.hpp
index abf0360c99..ac561a3656 100644
--- a/include/pmacc/memory/buffers/ExchangeIntern.hpp
+++ b/include/pmacc/memory/buffers/ExchangeIntern.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Rene Widera, Benjamin Worpitz
+/* Copyright 2013-2021 Rene Widera, Benjamin Worpitz
  *
  * This file is part of PMacc.
  *
@@ -27,7 +27,6 @@
 #include "pmacc/memory/dataTypes/Mask.hpp"
 #include "pmacc/memory/buffers/DeviceBufferIntern.hpp"
 #include "pmacc/memory/buffers/HostBufferIntern.hpp"
-#include "pmacc/memory/MakeUnique.hpp"
 
 #include "pmacc/eventSystem/tasks/Factory.hpp"
 #include "pmacc/eventSystem/tasks/TaskReceive.hpp"
@@ -39,20 +38,26 @@
 
 namespace pmacc
 {
-
-    /**
-     * Internal Exchange implementation.
+    /** Internal Exchange implementation.
+     *
+     * There will be no host double buffer available if MPI direct for PMacc is enabled.
      */
-    template <class TYPE, unsigned DIM>
+    template<class TYPE, unsigned DIM>
     class ExchangeIntern : public Exchange<TYPE, DIM>
     {
     public:
-
-        ExchangeIntern(DeviceBuffer<TYPE, DIM>& source, GridLayout<DIM> memoryLayout, DataSpace<DIM> guardingCells, uint32_t exchange,
-                       uint32_t communicationTag, uint32_t area = BORDER, bool sizeOnDevice = false) :
-        Exchange<TYPE, DIM>(exchange, communicationTag)
+        ExchangeIntern(
+            DeviceBuffer<TYPE, DIM>& source,
+            GridLayout<DIM> memoryLayout,
+            DataSpace<DIM> guardingCells,
+            uint32_t exchange,
+            uint32_t communicationTag,
+            uint32_t area = BORDER,
+            bool sizeOnDevice = false)
+            : Exchange<TYPE, DIM>(exchange, communicationTag)
+            , hostBuffer(nullptr) // listed in declaration order; stays nullptr if MPI direct is enabled
+            , deviceDoubleBuffer(nullptr) // allocated in the body only for DIM > DIM1
         {
-
             PMACC_ASSERT(!guardingCells.isOneDimensionGreaterThan(memoryLayout.getGuard()));
 
             DataSpace<DIM> tmp_size = memoryLayout.getDataSpaceWithoutGuarding();
@@ -62,62 +67,56 @@ namespace pmacc
 
             DataSpace<DIM> exchangeDimensions = exchangeTypeToDim(exchange);
 
-            for (uint32_t dim = 0; dim < DIM; dim++)
+            for(uint32_t dim = 0; dim < DIM; dim++)
             {
-                if (DIM > dim && exchangeDimensions[dim] == 1)
+                if(DIM > dim && exchangeDimensions[dim] == 1)
                     tmp_size[dim] = guardingCells[dim];
             }
 
             /*This is only a pointer to other device data
              */
             using DeviceBuffer = DeviceBufferIntern<TYPE, DIM>;
-            deviceBuffer = memory::makeUnique<DeviceBuffer>(
+            deviceBuffer = std::make_unique<DeviceBuffer>(
                 source,
                 tmp_size,
-                exchangeTypeToOffset(
-                    exchange,
-                    memoryLayout,
-                    guardingCells,
-                    area
-                ),
-                sizeOnDevice
-            );
-            if (DIM > DIM1)
+                exchangeTypeToOffset(exchange, memoryLayout, guardingCells, area),
+                sizeOnDevice);
+            if(DIM > DIM1)
             {
                 /*create double buffer on gpu for faster memory transfers*/
-                deviceDoubleBuffer = memory::makeUnique<DeviceBuffer>(
-                    tmp_size,
-                    false,
-                    true
-                );
+                deviceDoubleBuffer = std::make_unique<DeviceBuffer>(tmp_size, false, true);
             }
 
-            using HostBuffer = HostBufferIntern<TYPE, DIM>;
-            hostBuffer = memory::makeUnique<HostBuffer>(tmp_size);
+            if(!Environment<>::get().isMpiDirectEnabled())
+            {
+                using HostBuffer = HostBufferIntern<TYPE, DIM>;
+                hostBuffer = std::make_unique<HostBuffer>(tmp_size);
+            }
         }
 
-        ExchangeIntern(DataSpace<DIM> exchangeDataSpace, uint32_t exchange,
-                       uint32_t communicationTag, bool sizeOnDevice = false) :
-        Exchange<TYPE, DIM>(exchange, communicationTag)
+        ExchangeIntern(
+            DataSpace<DIM> exchangeDataSpace,
+            uint32_t exchange,
+            uint32_t communicationTag,
+            bool sizeOnDevice = false)
+            : Exchange<TYPE, DIM>(exchange, communicationTag)
+            , hostBuffer(nullptr) // listed in declaration order; stays nullptr if MPI direct is enabled
+            , deviceDoubleBuffer(nullptr) // allocated in the body only for DIM > DIM1
         {
-            using DeviceBuffer = DeviceBufferIntern<TYPE, DIM >;
-            deviceBuffer = memory::makeUnique<DeviceBuffer>(
-                exchangeDataSpace,
-                sizeOnDevice
-            );
+            using DeviceBuffer = DeviceBufferIntern<TYPE, DIM>;
+            deviceBuffer = std::make_unique<DeviceBuffer>(exchangeDataSpace, sizeOnDevice);
             //  this->deviceBuffer = new DeviceBufferIntern<TYPE, DIM > (exchangeDataSpace, sizeOnDevice,true);
-            if (DIM > DIM1)
+            if(DIM > DIM1)
             {
                 /*create double buffer on gpu for faster memory transfers*/
-                deviceDoubleBuffer = memory::makeUnique<DeviceBuffer>(
-                    exchangeDataSpace,
-                    false,
-                    true
-                );
+                deviceDoubleBuffer = std::make_unique<DeviceBuffer>(exchangeDataSpace, false, true);
             }
 
-            using HostBuffer = HostBufferIntern<TYPE, DIM >;
-            hostBuffer = memory::makeUnique<HostBuffer>(exchangeDataSpace);
+            if(!Environment<>::get().isMpiDirectEnabled())
+            {
+                using HostBuffer = HostBufferIntern<TYPE, DIM>;
+                hostBuffer = std::make_unique<HostBuffer>(exchangeDataSpace);
+            }
         }
 
         /**
@@ -131,13 +130,13 @@ namespace pmacc
 
             Mask exchangeMask(exchange);
 
-            if (exchangeMask.containsExchangeType(LEFT) || exchangeMask.containsExchangeType(RIGHT))
+            if(exchangeMask.containsExchangeType(LEFT) || exchangeMask.containsExchangeType(RIGHT))
                 result[0] = 1;
 
-            if (DIM > DIM1 && (exchangeMask.containsExchangeType(TOP) || exchangeMask.containsExchangeType(BOTTOM)))
+            if(DIM > DIM1 && (exchangeMask.containsExchangeType(TOP) || exchangeMask.containsExchangeType(BOTTOM)))
                 result[1] = 1;
 
-            if (DIM > DIM2 && (exchangeMask.containsExchangeType(FRONT) || exchangeMask.containsExchangeType(BACK)))
+            if(DIM > DIM2 && (exchangeMask.containsExchangeType(FRONT) || exchangeMask.containsExchangeType(BACK)))
                 result[2] = 1;
 
             return result;
@@ -145,19 +144,22 @@ namespace pmacc
 
         virtual ~ExchangeIntern() = default;
 
-        DataSpace<DIM> exchangeTypeToOffset(uint32_t exchange, GridLayout<DIM> &memoryLayout,
-                                            DataSpace<DIM> guardingCells, uint32_t area) const
+        DataSpace<DIM> exchangeTypeToOffset(
+            uint32_t exchange,
+            GridLayout<DIM>& memoryLayout,
+            DataSpace<DIM> guardingCells,
+            uint32_t area) const
         {
             DataSpace<DIM> size = memoryLayout.getDataSpace();
             DataSpace<DIM> border = memoryLayout.getGuard();
             Mask mask(exchange);
             DataSpace<DIM> tmp_offset;
-            if (DIM >= DIM1)
+            if(DIM >= DIM1)
             {
-                if (mask.containsExchangeType(RIGHT))
+                if(mask.containsExchangeType(RIGHT))
                 {
                     tmp_offset[0] = size[0] - border[0] - guardingCells[0];
-                    if (area == GUARD)
+                    if(area == GUARD)
                     {
                         tmp_offset[0] += guardingCells[0];
                     }
@@ -166,18 +168,18 @@ namespace pmacc
                 else
                 {
                     tmp_offset[0] = border[0];
-                    if (area == GUARD && mask.containsExchangeType(LEFT))
+                    if(area == GUARD && mask.containsExchangeType(LEFT))
                     {
                         tmp_offset[0] -= guardingCells[0];
                     }
                 }
             }
-            if (DIM >= DIM2)
+            if(DIM >= DIM2)
             {
-                if (mask.containsExchangeType(BOTTOM))
+                if(mask.containsExchangeType(BOTTOM))
                 {
                     tmp_offset[1] = size[1] - border[1] - guardingCells[1];
-                    if (area == GUARD)
+                    if(area == GUARD)
                     {
                         tmp_offset[1] += guardingCells[1];
                     }
@@ -185,18 +187,18 @@ namespace pmacc
                 else
                 {
                     tmp_offset[1] = border[1];
-                    if (area == GUARD && mask.containsExchangeType(TOP))
+                    if(area == GUARD && mask.containsExchangeType(TOP))
                     {
                         tmp_offset[1] -= guardingCells[1];
                     }
                 }
             }
-            if (DIM == DIM3)
+            if(DIM == DIM3)
             {
-                if (mask.containsExchangeType(BACK))
+                if(mask.containsExchangeType(BACK))
                 {
                     tmp_offset[2] = size[2] - border[2] - guardingCells[2];
-                    if (area == GUARD)
+                    if(area == GUARD)
                     {
                         tmp_offset[2] += guardingCells[2];
                     }
@@ -204,7 +206,7 @@ namespace pmacc
                 else /*all other begin from front*/
                 {
                     tmp_offset[2] = border[2];
-                    if (area == GUARD && mask.containsExchangeType(FRONT))
+                    if(area == GUARD && mask.containsExchangeType(FRONT))
                     {
                         tmp_offset[2] -= guardingCells[2];
                     }
@@ -212,26 +214,28 @@ namespace pmacc
             }
 
             return tmp_offset;
-
         }
 
-        virtual HostBuffer<TYPE, DIM>& getHostBuffer()
+        HostBuffer<TYPE, DIM>& getHostBuffer() override
         {
+            PMACC_ASSERT(hostBuffer != nullptr);
             return *hostBuffer;
         }
 
-        virtual DeviceBuffer<TYPE, DIM>& getDeviceBuffer()
+        DeviceBuffer<TYPE, DIM>& getDeviceBuffer() override
         {
+            PMACC_ASSERT(deviceBuffer != nullptr);
             return *deviceBuffer;
         }
 
-        virtual bool hasDeviceDoubleBuffer()
+        bool hasDeviceDoubleBuffer() override
         {
             return deviceDoubleBuffer != nullptr;
         }
 
-        virtual DeviceBuffer<TYPE, DIM>& getDeviceDoubleBuffer()
+        DeviceBuffer<TYPE, DIM>& getDeviceDoubleBuffer() override
         {
+            PMACC_ASSERT(deviceDoubleBuffer != nullptr);
             return *deviceDoubleBuffer;
         }
 
@@ -245,13 +249,29 @@ namespace pmacc
             return Environment<>::get().Factory().createTaskReceive(*this);
         }
 
+        Buffer<TYPE, DIM>* getCommunicationBuffer() override
+        {
+            if(Environment<>::get().isMpiDirectEnabled()) // MPI direct: hand out a device-side buffer
+            {
+                if(hasDeviceDoubleBuffer())
+                    return &(getDeviceDoubleBuffer()); // prefer the double buffer when it was allocated
+                else
+                    return &(getDeviceBuffer());
+            }
+            // without MPI direct the host buffer is returned (it is only allocated in that case)
+            return &(getHostBuffer());
+        }
+
     protected:
-        std::unique_ptr< HostBufferIntern<TYPE, DIM> > hostBuffer;
+        /** host double buffer of the exchange data
+         *
+         * Is always a nullptr if MPI direct is used
+         */
+        std::unique_ptr<HostBufferIntern<TYPE, DIM>> hostBuffer;
 
         //! This buffer is a vector which is used as message buffer for faster memcopy
-        std::unique_ptr< DeviceBufferIntern<TYPE, DIM> > deviceDoubleBuffer;
-        std::unique_ptr< DeviceBufferIntern<TYPE, DIM> > deviceBuffer;
-
+        std::unique_ptr<DeviceBufferIntern<TYPE, DIM>> deviceDoubleBuffer;
+        std::unique_ptr<DeviceBufferIntern<TYPE, DIM>> deviceBuffer;
     };
 
-}
+} // namespace pmacc
diff --git a/include/pmacc/memory/buffers/GridBuffer.hpp b/include/pmacc/memory/buffers/GridBuffer.hpp
index 8e86644d4b..0e737d505b 100644
--- a/include/pmacc/memory/buffers/GridBuffer.hpp
+++ b/include/pmacc/memory/buffers/GridBuffer.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Rene Widera, Benjamin Worpitz, Alexander Grund
+/* Copyright 2013-2021 Rene Widera, Benjamin Worpitz, Alexander Grund
  *
  * This file is part of PMacc.
  *
@@ -35,502 +35,533 @@
 
 namespace pmacc
 {
-namespace privateGridBuffer
-{
-
-class UniquTag
-{
-public:
-
-    static UniquTag& getInstance()
-    {
-        static UniquTag instance;
-        return instance;
-    }
-
-    bool isTagUniqu(uint32_t tag)
-    {
-        bool isUniqu = tags.find(tag) == tags.end();
-        if (isUniqu)
-            tags.insert(tag);
-        return isUniqu;
-    }
-private:
-
-    UniquTag()
-    {
-    }
-
-    /**
-     * Constructor
-     */
-    UniquTag(const UniquTag&)
+    namespace privateGridBuffer
     {
+        class UniquTag
+        {
+        public:
+            static UniquTag& getInstance()
+            {
+                static UniquTag instance;
+                return instance;
+            }
 
-    }
-
-    std::set<uint32_t> tags;
-};
+            bool isTagUniqu(uint32_t tag)
+            {
+                bool isUniqu = tags.find(tag) == tags.end();
+                if(isUniqu)
+                    tags.insert(tag);
+                return isUniqu;
+            }
 
-}//end namespace privateGridBuffer
+        private:
+            UniquTag()
+            {
+            }
 
-/**
- * GridBuffer represents a DIM-dimensional buffer which exists on the host as well as on the device.
- *
- * GridBuffer combines a HostBuffer and a DeviceBuffer with equal sizes.
- * Additionally, it allows sending data from and receiving data to these buffers.
- * Buffers consist of core data which may be surrounded by border data.
- *
- * @tparam TYPE datatype for internal Host- and DeviceBuffer
- * @tparam DIM dimension of the buffers
- * @tparam BORDERTYPE optional type for border data in the buffers. TYPE is used by default.
- */
-template <class TYPE, unsigned DIM, class BORDERTYPE = TYPE>
-class GridBuffer: public HostDeviceBuffer<TYPE, DIM>
-{
-    typedef HostDeviceBuffer<TYPE, DIM> Parent;
-public:
+            /**
+             * Copy constructor, kept private so the instance returned by getInstance() cannot be copied
+             */
+            UniquTag(const UniquTag&)
+            {
+            }
 
-    typedef typename Parent::DataBoxType DataBoxType;
+            std::set<uint32_t> tags;
+        };
 
-    /**
-     * Constructor.
-     *
-     * @param gridLayout layout of the buffers, including border-cells
-     * @param sizeOnDevice if true, size information exists on device, too.
-     */
-    GridBuffer(const GridLayout<DIM>& gridLayout, bool sizeOnDevice = false) :
-    Parent(gridLayout.getDataSpace(), sizeOnDevice),
-    gridLayout(gridLayout),
-    hasOneExchange(false),
-    maxExchange(0)
-    {
-        init();
-    }
+    } // end namespace privateGridBuffer
 
     /**
-     * Constructor.
+     * GridBuffer represents a DIM-dimensional buffer which exists on the host as well as on the device.
      *
-     * @param dataSpace DataSpace representing buffer size without border-cells
-     * @param sizeOnDevice if true, internal buffers must store their
-     *        size additionally on the device
-     *        (as we keep this information coherent with the host, it influences
-     *        performance on host-device copies, but some algorithms on the device
-     *        might need to know the size of the buffer)
-     */
-    GridBuffer(const DataSpace<DIM>& dataSpace, bool sizeOnDevice = false) :
-    Parent(dataSpace, sizeOnDevice),
-    gridLayout(dataSpace),
-    hasOneExchange(false),
-    maxExchange(0)
-    {
-        init();
-    }
-
-    /**
-     * Constructor.
+     * GridBuffer combines a HostBuffer and a DeviceBuffer with equal sizes.
+     * Additionally, it allows sending data from and receiving data to these buffers.
+     * Buffers consist of core data which may be surrounded by border data.
      *
-     * @param otherDeviceBuffer DeviceBuffer which should be used instead of creating own DeviceBuffer
-     * @param gridLayout layout of the buffers, including border-cells
-     * @param sizeOnDevice if true, internal buffers must store their
-     *        size additionally on the device
-     *        (as we keep this information coherent with the host, it influences
-     *        performance on host-device copies, but some algorithms on the device
-     *        might need to know the size of the buffer)
-     */
-    GridBuffer(DeviceBuffer<TYPE, DIM>& otherDeviceBuffer, const GridLayout<DIM>& gridLayout, bool sizeOnDevice = false) :
-    Parent(otherDeviceBuffer, gridLayout.getDataSpace(), sizeOnDevice),
-    gridLayout(gridLayout),
-    hasOneExchange(false),
-    maxExchange(0)
-    {
-        init();
-    }
-
-    GridBuffer(
-               HostBuffer<TYPE, DIM>& otherHostBuffer,
-               const DataSpace<DIM>& offsetHost,
-               DeviceBuffer<TYPE, DIM>& otherDeviceBuffer,
-               const DataSpace<DIM>& offsetDevice,
-               const GridLayout<DIM>& gridLayout,
-               bool sizeOnDevice = false) :
-    Parent(otherHostBuffer, offsetHost, otherDeviceBuffer, offsetDevice, gridLayout.getDataSpace(), sizeOnDevice),
-    gridLayout(gridLayout),
-    hasOneExchange(false),
-    maxExchange(0)
-    {
-        init();
-    }
-
-    /**
-     * Destructor.
+     * @tparam TYPE datatype for internal Host- and DeviceBuffer
+     * @tparam DIM dimension of the buffers
+     * @tparam BORDERTYPE optional type for border data in the buffers. TYPE is used by default.
      */
-    virtual ~GridBuffer()
+    template<class TYPE, unsigned DIM, class BORDERTYPE = TYPE>
+    class GridBuffer : public HostDeviceBuffer<TYPE, DIM>
     {
-        for (uint32_t i = 0; i < 27; ++i)
+        typedef HostDeviceBuffer<TYPE, DIM> Parent;
+
+    public:
+        typedef typename Parent::DataBoxType DataBoxType;
+
+        /**
+         * Constructor.
+         *
+         * @param gridLayout layout of the buffers, including border-cells
+         * @param sizeOnDevice if true, size information exists on device, too.
+         */
+        GridBuffer(const GridLayout<DIM>& gridLayout, bool sizeOnDevice = false)
+            : Parent(gridLayout.getDataSpace(), sizeOnDevice)
+            , gridLayout(gridLayout)
+            , hasOneExchange(false)
+            , maxExchange(0)
         {
-            __delete(sendExchanges[i]);
-            __delete(receiveExchanges[i]);
-        };
-    }
-
-    /**
-     * Add Exchange in GridBuffer memory space.
-     *
-     * An Exchange is added to this GridBuffer. The exchange buffers use
-     * the same memory as this GridBuffer.
-     *
-     * @param dataPlace place where received data is stored [GUARD | BORDER]
-     *        if dataPlace=GUARD than copy other BORDER to my GUARD
-     *        if dataPlace=BORDER than copy other GUARD to my BORDER
-     * @param receive a Mask which describes the directions for the exchange
-     * @param guardingCells number of guarding cells in each dimension
-     * @param communicationTag unique tag/id for communication
-     * @param sizeOnDeviceSend if true, internal send buffers must store their
-     *        size additionally on the device
-     *        (as we keep this information coherent with the host, it influences
-     *        performance on host-device copies, but some algorithms on the device
-     *        might need to know the size of the buffer)
-     * @param sizeOnDeviceReceive if true, internal receive buffers must store their
-     *        size additionally on the device
-     */
-    void addExchange(uint32_t dataPlace, const Mask &receive, DataSpace<DIM> guardingCells, uint32_t communicationTag, bool sizeOnDeviceSend, bool sizeOnDeviceReceive )
-    {
-
-        if (hasOneExchange && (communicationTag != lastUsedCommunicationTag))
-            throw std::runtime_error("It is not allowed to give the same GridBuffer different communicationTags");
-
-        lastUsedCommunicationTag = communicationTag;
+            init();
+        }
 
-        receiveMask = receiveMask + receive;
-        sendMask = this->receiveMask.getMirroredMask();
-        Mask send = receive.getMirroredMask();
+        /**
+         * Constructor.
+         *
+         * @param dataSpace DataSpace representing buffer size without border-cells
+         * @param sizeOnDevice if true, internal buffers must store their
+         *        size additionally on the device
+         *        (as we keep this information coherent with the host, it influences
+         *        performance on host-device copies, but some algorithms on the device
+         *        might need to know the size of the buffer)
+         */
+        GridBuffer(const DataSpace<DIM>& dataSpace, bool sizeOnDevice = false)
+            : Parent(dataSpace, sizeOnDevice)
+            , gridLayout(dataSpace)
+            , hasOneExchange(false)
+            , maxExchange(0)
+        {
+            init();
+        }
 
+        /**
+         * Constructor.
+         *
+         * @param otherDeviceBuffer DeviceBuffer which should be used instead of creating own DeviceBuffer
+         * @param gridLayout layout of the buffers, including border-cells
+         * @param sizeOnDevice if true, internal buffers must store their
+         *        size additionally on the device
+         *        (as we keep this information coherent with the host, it influences
+         *        performance on host-device copies, but some algorithms on the device
+         *        might need to know the size of the buffer)
+         */
+        GridBuffer(
+            DeviceBuffer<TYPE, DIM>& otherDeviceBuffer,
+            const GridLayout<DIM>& gridLayout,
+            bool sizeOnDevice = false)
+            : Parent(otherDeviceBuffer, gridLayout.getDataSpace(), sizeOnDevice)
+            , gridLayout(gridLayout)
+            , hasOneExchange(false)
+            , maxExchange(0)
+        {
+            init();
+        }
 
+        GridBuffer(
+            HostBuffer<TYPE, DIM>& otherHostBuffer,
+            const DataSpace<DIM>& offsetHost,
+            DeviceBuffer<TYPE, DIM>& otherDeviceBuffer,
+            const DataSpace<DIM>& offsetDevice,
+            const GridLayout<DIM>& gridLayout,
+            bool sizeOnDevice = false)
+            : Parent(
+                otherHostBuffer,
+                offsetHost,
+                otherDeviceBuffer,
+                offsetDevice,
+                gridLayout.getDataSpace(),
+                sizeOnDevice)
+            , gridLayout(gridLayout)
+            , hasOneExchange(false)
+            , maxExchange(0)
+        {
+            init();
+        }
 
-        for (uint32_t ex = 1; ex< -12 * (int) DIM + 6 * (int) DIM * (int) DIM + 9; ++ex)
+        /**
+         * Destructor.
+         */
+        virtual ~GridBuffer()
         {
-            if (send.isSet(ex))
+            for(uint32_t i = 0; i < 27; ++i)
             {
-                uint32_t uniqCommunicationTag = (communicationTag << 5) | ex;
-
-                if (!hasOneExchange && !privateGridBuffer::UniquTag::getInstance().isTagUniqu(uniqCommunicationTag))
-                {
-                    std::stringstream message;
-                    message << "unique exchange communication tag ("
-                        << uniqCommunicationTag << ") which is created from communicationTag ("
-                        << communicationTag << ") already used for other GridBuffer exchange";
-                    throw std::runtime_error(message.str());
-                }
-                hasOneExchange = true;
-
-                if (sendExchanges[ex] != nullptr)
-                {
-                    throw std::runtime_error("Exchange already added!");
-                }
-
-                maxExchange = std::max(maxExchange, ex + 1u);
-                sendExchanges[ex] = new ExchangeIntern<BORDERTYPE, DIM > (this->getDeviceBuffer(), gridLayout, guardingCells,
-                                                                          (ExchangeType) ex, uniqCommunicationTag,
-                                                                          dataPlace == GUARD ? BORDER : GUARD, sizeOnDeviceSend);
-                ExchangeType recvex = Mask::getMirroredExchangeType(ex);
-                maxExchange = std::max(maxExchange, recvex + 1u);
-                receiveExchanges[recvex] =
-                    new ExchangeIntern<BORDERTYPE, DIM > (
-                                                          this->getDeviceBuffer(),
-                                                          gridLayout,
-                                                          guardingCells,
-                                                          recvex,
-                                                          uniqCommunicationTag,
-                                                          dataPlace == GUARD ? GUARD : BORDER,
-                                                          sizeOnDeviceReceive);
-            }
+                __delete(sendExchanges[i]);
+                __delete(receiveExchanges[i]);
+            };
         }
-    }
 
-    /**
-     * Add Exchange in GridBuffer memory space.
-     *
-     * An Exchange is added to this GridBuffer. The exchange buffers use
-     * the same memory as this GridBuffer.
-     *
-     * @param dataPlace place where received data is stored [GUARD | BORDER]
-     *        if dataPlace=GUARD than copy other BORDER to my GUARD
-     *        if dataPlace=BORDER than copy other GUARD to my BORDER
-     * @param receive a Mask which describes the directions for the exchange
-     * @param guardingCells number of guarding cells in each dimension
-     * @param communicationTag unique tag/id for communication
-     * @param sizeOnDevice if true, internal buffers must store their
-     *        size additionally on the device
-     *        (as we keep this information coherent with the host, it influences
-     *        performance on host-device copies, but some algorithms on the device
-     *        might need to know the size of the buffer)
-     */
-    void addExchange(uint32_t dataPlace, const Mask &receive, DataSpace<DIM> guardingCells, uint32_t communicationTag, bool sizeOnDevice = false)
-    {
-        addExchange( dataPlace, receive, guardingCells, communicationTag, sizeOnDevice, sizeOnDevice );
-    }
-
-    /**
-     * Add Exchange in dedicated memory space.
-     *
-     * An Exchange is added to this GridBuffer. The exchange buffers use
-     * the their own memory instead of using the GridBuffer's memory space.
-     *
-     * @param receive a Mask which describes the directions for the exchange
-     * @param dataSpace size of the newly created exchange buffer in each dimension
-     * @param communicationTag unique tag/id for communication
-     * @param sizeOnDeviceSend if true, internal send buffers must store their
-     *        size additionally on the device
-     *        (as we keep this information coherent with the host, it influences
-     *        performance on host-device copies, but some algorithms on the device
-     *        might need to know the size of the buffer)
-     * @param sizeOnDeviceReceive if true, internal receive buffers must store their
-     *        size additionally on the device
-     */
-    void addExchangeBuffer(const Mask &receive, const DataSpace<DIM> &dataSpace, uint32_t communicationTag, bool sizeOnDeviceSend, bool sizeOnDeviceReceive )
-    {
-
-        if (hasOneExchange && (communicationTag != lastUsedCommunicationTag))
-            throw std::runtime_error("It is not allowed to give the same GridBuffer different communicationTags");
-        lastUsedCommunicationTag = communicationTag;
+        /**
+         * Add Exchange in GridBuffer memory space.
+         *
+         * An Exchange is added to this GridBuffer. The exchange buffers use
+         * the same memory as this GridBuffer.
+         *
+         * @param dataPlace place where received data is stored [GUARD | BORDER]
+         *        if dataPlace=GUARD then copy other BORDER to my GUARD
+         *        if dataPlace=BORDER then copy other GUARD to my BORDER
+         * @param receive a Mask which describes the directions for the exchange
+         * @param guardingCells number of guarding cells in each dimension
+         * @param communicationTag unique tag/id for communication
+         * @param sizeOnDeviceSend if true, internal send buffers must store their
+         *        size additionally on the device
+         *        (as we keep this information coherent with the host, it influences
+         *        performance on host-device copies, but some algorithms on the device
+         *        might need to know the size of the buffer)
+         * @param sizeOnDeviceReceive if true, internal receive buffers must store their
+         *        size additionally on the device
+         */
+        void addExchange(
+            uint32_t dataPlace,
+            const Mask& receive,
+            DataSpace<DIM> guardingCells,
+            uint32_t communicationTag,
+            bool sizeOnDeviceSend,
+            bool sizeOnDeviceReceive)
+        {
+            if(hasOneExchange && (communicationTag != lastUsedCommunicationTag))
+                throw std::runtime_error("It is not allowed to give the same GridBuffer different communicationTags");
 
+            lastUsedCommunicationTag = communicationTag;
 
-        /*don't create buffer with 0 (zero) elements*/
-        if (dataSpace.productOfComponents() != 0)
-        {
             receiveMask = receiveMask + receive;
             sendMask = this->receiveMask.getMirroredMask();
             Mask send = receive.getMirroredMask();
-            for (uint32_t ex = 1; ex < 27; ++ex)
+
+
+            for(uint32_t ex = 1; ex < -12 * (int) DIM + 6 * (int) DIM * (int) DIM + 9; ++ex)
             {
-                if (send.isSet(ex))
+                if(send.isSet(ex))
                 {
                     uint32_t uniqCommunicationTag = (communicationTag << 5) | ex;
-                    if (!hasOneExchange && !privateGridBuffer::UniquTag::getInstance().isTagUniqu(uniqCommunicationTag))
+
+                    if(!hasOneExchange && !privateGridBuffer::UniquTag::getInstance().isTagUniqu(uniqCommunicationTag))
                     {
                         std::stringstream message;
-                        message << "unique exchange communication tag ("
-                            << uniqCommunicationTag << ") which is created from communicationTag ("
-                            << communicationTag << ") already used for other GridBuffer exchange";
+                        message << "unique exchange communication tag (" << uniqCommunicationTag
+                                << ") which is created from communicationTag (" << communicationTag
+                                << ") already used for other GridBuffer exchange";
                         throw std::runtime_error(message.str());
                     }
                     hasOneExchange = true;
 
-                    if (sendExchanges[ex] != nullptr)
+                    if(sendExchanges[ex] != nullptr)
                     {
                         throw std::runtime_error("Exchange already added!");
                     }
 
-                    //GridLayout<DIM> memoryLayout(size);
                     maxExchange = std::max(maxExchange, ex + 1u);
-                    sendExchanges[ex] = new ExchangeIntern<BORDERTYPE, DIM > (/*memoryLayout*/ dataSpace,
-                                                                              ex, uniqCommunicationTag, sizeOnDeviceSend);
-
+                    sendExchanges[ex] = new ExchangeIntern<BORDERTYPE, DIM>(
+                        this->getDeviceBuffer(),
+                        gridLayout,
+                        guardingCells,
+                        (ExchangeType) ex,
+                        uniqCommunicationTag,
+                        dataPlace == GUARD ? BORDER : GUARD,
+                        sizeOnDeviceSend);
                     ExchangeType recvex = Mask::getMirroredExchangeType(ex);
                     maxExchange = std::max(maxExchange, recvex + 1u);
-                    receiveExchanges[recvex] = new ExchangeIntern<BORDERTYPE, DIM > (/*memoryLayout*/ dataSpace,
-                                                                                     recvex, uniqCommunicationTag, sizeOnDeviceReceive);
+                    receiveExchanges[recvex] = new ExchangeIntern<BORDERTYPE, DIM>(
+                        this->getDeviceBuffer(),
+                        gridLayout,
+                        guardingCells,
+                        recvex,
+                        uniqCommunicationTag,
+                        dataPlace == GUARD ? GUARD : BORDER,
+                        sizeOnDeviceReceive);
                 }
             }
         }
-    }
 
-    /**
-     * Add Exchange in dedicated memory space.
-     *
-     * An Exchange is added to this GridBuffer. The exchange buffers use
-     * the their own memory instead of using the GridBuffer's memory space.
-     *
-     * @param receive a Mask which describes the directions for the exchange
-     * @param dataSpace size of the newly created exchange buffer in each dimension
-     * @param communicationTag unique tag/id for communication
-     * @param sizeOnDevice if true, internal buffers must store their
-     *        size additionally on the device
-     *        (as we keep this information coherent with the host, it influences
-     *        performance on host-device copies, but some algorithms on the device
-     *        might need to know the size of the buffer)
-     */
-    void addExchangeBuffer(const Mask &receive, const DataSpace<DIM> &dataSpace, uint32_t communicationTag, bool sizeOnDevice = false )
-    {
-        addExchangeBuffer( receive, dataSpace, communicationTag, sizeOnDevice, sizeOnDevice );
-    }
+        /**
+         * Add Exchange in GridBuffer memory space.
+         *
+         * An Exchange is added to this GridBuffer. The exchange buffers use
+         * the same memory as this GridBuffer.
+         *
+         * @param dataPlace place where received data is stored [GUARD | BORDER]
+         *        if dataPlace=GUARD then copy other BORDER to my GUARD
+         *        if dataPlace=BORDER then copy other GUARD to my BORDER
+         * @param receive a Mask which describes the directions for the exchange
+         * @param guardingCells number of guarding cells in each dimension
+         * @param communicationTag unique tag/id for communication
+         * @param sizeOnDevice if true, internal buffers must store their
+         *        size additionally on the device
+         *        (as we keep this information coherent with the host, it influences
+         *        performance on host-device copies, but some algorithms on the device
+         *        might need to know the size of the buffer)
+         */
+        void addExchange(
+            uint32_t dataPlace,
+            const Mask& receive,
+            DataSpace<DIM> guardingCells,
+            uint32_t communicationTag,
+            bool sizeOnDevice = false)
+        {
+            addExchange(dataPlace, receive, guardingCells, communicationTag, sizeOnDevice, sizeOnDevice);
+        }
 
-    /**
-     * Returns whether this GridBuffer has an Exchange for sending in ex direction.
-     *
-     * @param ex exchange direction to query
-     * @return true if send exchanges with ex direction exist, otherwise false
-     */
-    bool hasSendExchange(uint32_t ex) const
-    {
-        return ( (sendExchanges[ex] != nullptr) && (getSendMask().isSet(ex)));
-    }
+        /**
+         * Add Exchange in dedicated memory space.
+         *
+         * An Exchange is added to this GridBuffer. The exchange buffers use
+         * their own memory instead of using the GridBuffer's memory space.
+         *
+         * @param receive a Mask which describes the directions for the exchange
+         * @param dataSpace size of the newly created exchange buffer in each dimension
+         * @param communicationTag unique tag/id for communication
+         * @param sizeOnDeviceSend if true, internal send buffers must store their
+         *        size additionally on the device
+         *        (as we keep this information coherent with the host, it influences
+         *        performance on host-device copies, but some algorithms on the device
+         *        might need to know the size of the buffer)
+         * @param sizeOnDeviceReceive if true, internal receive buffers must store their
+         *        size additionally on the device
+         */
+        void addExchangeBuffer(
+            const Mask& receive,
+            const DataSpace<DIM>& dataSpace,
+            uint32_t communicationTag,
+            bool sizeOnDeviceSend,
+            bool sizeOnDeviceReceive)
+        {
+            if(hasOneExchange && (communicationTag != lastUsedCommunicationTag))
+                throw std::runtime_error("It is not allowed to give the same GridBuffer different communicationTags");
+            lastUsedCommunicationTag = communicationTag;
 
-    /**
-     * Returns whether this GridBuffer has an Exchange for receiving from ex direction.
-     *
-     * @param ex exchange direction to query
-     * @return true if receive exchanges with ex direction exist, otherwise false
-     */
-    bool hasReceiveExchange(uint32_t ex) const
-    {
-        return ( (receiveExchanges[ex] != nullptr) && (getReceiveMask().isSet(ex)));
-    }
 
-    /**
-     * Returns the Exchange for sending data in ex direction.
-     *
-     * Returns an Exchange which for sending data from
-     * this GridBuffer in the direction described by ex.
-     *
-     * @param ex the direction to query
-     * @return the Exchange for sending data
-     */
-    Exchange<BORDERTYPE, DIM>& getSendExchange(uint32_t ex) const
-    {
-        return *sendExchanges[ex];
-    }
+            /*don't create buffer with 0 (zero) elements*/
+            if(dataSpace.productOfComponents() != 0)
+            {
+                receiveMask = receiveMask + receive;
+                sendMask = this->receiveMask.getMirroredMask();
+                Mask send = receive.getMirroredMask();
+                for(uint32_t ex = 1; ex < 27; ++ex)
+                {
+                    if(send.isSet(ex))
+                    {
+                        uint32_t uniqCommunicationTag = (communicationTag << 5) | ex;
+                        if(!hasOneExchange
+                           && !privateGridBuffer::UniquTag::getInstance().isTagUniqu(uniqCommunicationTag))
+                        {
+                            std::stringstream message;
+                            message << "unique exchange communication tag (" << uniqCommunicationTag
+                                    << ") which is created from communicationTag (" << communicationTag
+                                    << ") already used for other GridBuffer exchange";
+                            throw std::runtime_error(message.str());
+                        }
+                        hasOneExchange = true;
+
+                        if(sendExchanges[ex] != nullptr)
+                        {
+                            throw std::runtime_error("Exchange already added!");
+                        }
+
+                        // GridLayout<DIM> memoryLayout(size);
+                        maxExchange = std::max(maxExchange, ex + 1u);
+                        sendExchanges[ex] = new ExchangeIntern<BORDERTYPE, DIM>(
+                            /*memoryLayout*/ dataSpace,
+                            ex,
+                            uniqCommunicationTag,
+                            sizeOnDeviceSend);
+
+                        ExchangeType recvex = Mask::getMirroredExchangeType(ex);
+                        maxExchange = std::max(maxExchange, recvex + 1u);
+                        receiveExchanges[recvex] = new ExchangeIntern<BORDERTYPE, DIM>(
+                            /*memoryLayout*/ dataSpace,
+                            recvex,
+                            uniqCommunicationTag,
+                            sizeOnDeviceReceive);
+                    }
+                }
+            }
+        }
 
-    /**
-     * Returns the Exchange for receiving data from ex direction.
-     *
-     * Returns an Exchange which for receiving data to
-     * this GridBuffer from the direction described by ex.
-     *
-     * @param ex the direction to query
-     * @return the Exchange for receiving data
-     */
-    Exchange<BORDERTYPE, DIM>& getReceiveExchange(uint32_t ex) const
-    {
-        return *receiveExchanges[ex];
-    }
+        /**
+         * Add Exchange in dedicated memory space.
+         *
+         * An Exchange is added to this GridBuffer. The exchange buffers use
+         * their own memory instead of using the GridBuffer's memory space.
+         *
+         * @param receive a Mask which describes the directions for the exchange
+         * @param dataSpace size of the newly created exchange buffer in each dimension
+         * @param communicationTag unique tag/id for communication
+         * @param sizeOnDevice if true, internal buffers must store their
+         *        size additionally on the device
+         *        (as we keep this information coherent with the host, it influences
+         *        performance on host-device copies, but some algorithms on the device
+         *        might need to know the size of the buffer)
+         */
+        void addExchangeBuffer(
+            const Mask& receive,
+            const DataSpace<DIM>& dataSpace,
+            uint32_t communicationTag,
+            bool sizeOnDevice = false)
+        {
+            addExchangeBuffer(receive, dataSpace, communicationTag, sizeOnDevice, sizeOnDevice);
+        }
 
-    /**
-     * Returns the Mask describing send exchanges
-     *
-     * @return Mask for send exchanges
-     */
-    Mask getSendMask() const
-    {
-        return (Environment<DIM>::get().EnvironmentController().getCommunicationMask() & sendMask);
-    }
+        /**
+         * Returns whether this GridBuffer has an Exchange for sending in ex direction.
+         *
+         * @param ex exchange direction to query
+         * @return true if send exchanges with ex direction exist, otherwise false
+         */
+        bool hasSendExchange(uint32_t ex) const
+        {
+            return ((sendExchanges[ex] != nullptr) && (getSendMask().isSet(ex)));
+        }
 
-    /**
-     * Returns the Mask describing receive exchanges
-     *
-     * @return Mask for receive exchanges
-     */
-    Mask getReceiveMask() const
-    {
-        return (Environment<DIM>::get().EnvironmentController().getCommunicationMask() & receiveMask);
-    }
+        /**
+         * Returns whether this GridBuffer has an Exchange for receiving from ex direction.
+         *
+         * @param ex exchange direction to query
+         * @return true if receive exchanges with ex direction exist, otherwise false
+         */
+        bool hasReceiveExchange(uint32_t ex) const
+        {
+            return ((receiveExchanges[ex] != nullptr) && (getReceiveMask().isSet(ex)));
+        }
 
-    /**
-     * Starts sync data from own device buffer to neighbor device buffer.
-     *
-     * Asynchronously starts synchronization data from internal DeviceBuffer using added
-     * Exchange buffers.
-     * This operation runs sequential to other code but intern asynchronous
-     *
-     */
-    EventTask communication()
-    {
-        EventTask ev = this->asyncCommunication(__getTransactionEvent());
-        __setTransactionEvent(ev);
-        return ev;
-    }
+        /**
+         * Returns the Exchange for sending data in ex direction.
+         *
+         * Returns an Exchange for sending data from
+         * this GridBuffer in the direction described by ex.
+         *
+         * @param ex the direction to query
+         * @return the Exchange for sending data
+         */
+        Exchange<BORDERTYPE, DIM>& getSendExchange(uint32_t ex) const
+        {
+            return *sendExchanges[ex];
+        }
 
-    /**
-     * Starts sync data from own device buffer to neighbor device buffer.
-     *
-     * Asynchronously starts synchronization data from internal DeviceBuffer using added
-     * Exchange buffers.
-     *
-     */
-    EventTask asyncCommunication(EventTask serialEvent)
-    {
-        EventTask evR;
-        for (uint32_t i = 0; i < maxExchange; ++i)
+        /**
+         * Returns the Exchange for receiving data from ex direction.
+         *
+         * Returns an Exchange for receiving data to
+         * this GridBuffer from the direction described by ex.
+         *
+         * @param ex the direction to query
+         * @return the Exchange for receiving data
+         */
+        Exchange<BORDERTYPE, DIM>& getReceiveExchange(uint32_t ex) const
+        {
+            return *receiveExchanges[ex];
+        }
+
+        /**
+         * Returns the Mask describing send exchanges
+         *
+         * @return Mask for send exchanges
+         */
+        Mask getSendMask() const
         {
+            return (Environment<DIM>::get().EnvironmentController().getCommunicationMask() & sendMask);
+        }
+
+        /**
+         * Returns the Mask describing receive exchanges
+         *
+         * @return Mask for receive exchanges
+         */
+        Mask getReceiveMask() const
+        {
+            return (Environment<DIM>::get().EnvironmentController().getCommunicationMask() & receiveMask);
+        }
 
-            evR += asyncReceive(serialEvent, i);
+        /**
+         * Starts sync data from own device buffer to neighbor device buffer.
+         *
+         * Asynchronously starts synchronizing data from the internal DeviceBuffer using the added
+         * Exchange buffers.
+         * This operation runs sequentially with respect to other code but is asynchronous internally
+         *
+         */
+        EventTask communication()
+        {
+            EventTask ev = this->asyncCommunication(__getTransactionEvent());
+            __setTransactionEvent(ev);
+            return ev;
+        }
 
-            ExchangeType sendEx = Mask::getMirroredExchangeType(i);
+        /**
+         * Starts sync data from own device buffer to neighbor device buffer.
+         *
+         * Asynchronously starts synchronizing data from the internal DeviceBuffer using the added
+         * Exchange buffers.
+         *
+         */
+        EventTask asyncCommunication(EventTask serialEvent)
+        {
+            EventTask evR;
+            for(uint32_t i = 0; i < maxExchange; ++i)
+            {
+                evR += asyncReceive(serialEvent, i);
 
-            evR += asyncSend(serialEvent, sendEx);
+                ExchangeType sendEx = Mask::getMirroredExchangeType(i);
 
+                evR += asyncSend(serialEvent, sendEx);
+            }
+            return evR;
         }
-        return evR;
-    }
 
-    EventTask asyncSend(EventTask serialEvent, uint32_t sendEx)
-    {
-        if (hasSendExchange(sendEx))
+        EventTask asyncSend(EventTask serialEvent, uint32_t sendEx)
         {
-            __startTransaction(serialEvent + sendEvents[sendEx]);
-            sendEvents[sendEx] = sendExchanges[sendEx]->startSend();
-            __endTransaction();
-            return sendEvents[sendEx];
+            if(hasSendExchange(sendEx))
+            {
+                __startTransaction(serialEvent + sendEvents[sendEx]);
+                sendEvents[sendEx] = sendExchanges[sendEx]->startSend();
+                __endTransaction();
+                return sendEvents[sendEx];
+            }
+            return EventTask();
         }
-        return EventTask();
-    }
 
-    EventTask asyncReceive(EventTask serialEvent, uint32_t recvEx)
-    {
-        if (hasReceiveExchange(recvEx))
+        EventTask asyncReceive(EventTask serialEvent, uint32_t recvEx)
         {
-            __startTransaction(serialEvent + receiveEvents[recvEx]);
-            receiveEvents[recvEx] = receiveExchanges[recvEx]->startReceive();
+            if(hasReceiveExchange(recvEx))
+            {
+                __startTransaction(serialEvent + receiveEvents[recvEx]);
+                receiveEvents[recvEx] = receiveExchanges[recvEx]->startReceive();
 
-            __endTransaction();
-            return receiveEvents[recvEx];
+                __endTransaction();
+                return receiveEvents[recvEx];
+            }
+            return EventTask();
         }
-        return EventTask();
-    }
 
-    /**
-     * Returns the GridLayout describing this GridBuffer.
-     *
-     * @return the layout of this buffer
-     */
-    GridLayout<DIM> getGridLayout()
-    {
-        return gridLayout;
-    }
-
-private:
+        /**
+         * Returns the GridLayout describing this GridBuffer.
+         *
+         * @return the layout of this buffer
+         */
+        GridLayout<DIM> getGridLayout()
+        {
+            return gridLayout;
+        }
 
-    friend class Environment<DIM>;
+    private:
+        friend class Environment<DIM>;
 
-    void init()
-    {
-        for (uint32_t i = 0; i < 27; ++i)
+        void init()
         {
-            sendExchanges[i] = nullptr;
-            receiveExchanges[i] = nullptr;
-            /* fill array with valid empty events to avoid side effects if
-             * array is accessed without calling hasExchange() before usage */
-            receiveEvents[i] = EventTask();
-            sendEvents[i] = EventTask();
+            for(uint32_t i = 0; i < 27; ++i)
+            {
+                sendExchanges[i] = nullptr;
+                receiveExchanges[i] = nullptr;
+                /* fill array with valid empty events to avoid side effects if
+                 * array is accessed without calling hasExchange() before usage */
+                receiveEvents[i] = EventTask();
+                sendEvents[i] = EventTask();
+            }
         }
-    }
 
-protected:
-    /*if we have one exchange we don't check if communicationTag has been used before*/
-    bool hasOneExchange;
-    uint32_t lastUsedCommunicationTag;
-    GridLayout<DIM> gridLayout;
+    protected:
+        /*if we have one exchange we don't check if communicationTag has been used before*/
+        bool hasOneExchange;
+        uint32_t lastUsedCommunicationTag;
+        GridLayout<DIM> gridLayout;
 
-    Mask sendMask;
-    Mask receiveMask;
+        Mask sendMask;
+        Mask receiveMask;
 
-    ExchangeIntern<BORDERTYPE, DIM>* sendExchanges[27];
-    ExchangeIntern<BORDERTYPE, DIM>* receiveExchanges[27];
-    EventTask receiveEvents[27];
-    EventTask sendEvents[27];
+        ExchangeIntern<BORDERTYPE, DIM>* sendExchanges[27];
+        ExchangeIntern<BORDERTYPE, DIM>* receiveExchanges[27];
+        EventTask receiveEvents[27];
+        EventTask sendEvents[27];
 
-    uint32_t maxExchange; //use max exchanges and run over the array is faster as use set from stl
-};
+        uint32_t maxExchange; // use max exchanges and run over the array is faster as use set from stl
+    };
 
-}
+} // namespace pmacc
diff --git a/include/pmacc/memory/buffers/HostBuffer.hpp b/include/pmacc/memory/buffers/HostBuffer.hpp
index 903dea0845..853c6b170a 100644
--- a/include/pmacc/memory/buffers/HostBuffer.hpp
+++ b/include/pmacc/memory/buffers/HostBuffer.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Rene Widera, Benjamin Worpitz, Alexander Grund
+/* Copyright 2013-2021 Rene Widera, Benjamin Worpitz, Alexander Grund
  *
  * This file is part of PMacc.
  *
@@ -28,43 +28,40 @@
 
 namespace pmacc
 {
-    template <class TYPE, unsigned DIM>
+    template<class TYPE, unsigned DIM>
     class HostBuffer;
 
-namespace detail
-{
-    template< class TYPE >
-    container::HostBuffer< TYPE, 1u >
-    make_CartBuffer( HostBuffer<TYPE, 1u> & hb )
+    namespace detail
     {
-        return container::HostBuffer<TYPE, 1u>(hb.getBasePointer(), hb.getDataSpace(), false);
-    }
+        template<class TYPE>
+        container::HostBuffer<TYPE, 1u> make_CartBuffer(HostBuffer<TYPE, 1u>& hb)
+        {
+            return container::HostBuffer<TYPE, 1u>(hb.getBasePointer(), hb.getDataSpace(), false);
+        }
 
-    template< class TYPE >
-    container::HostBuffer< TYPE, 2u >
-    make_CartBuffer( HostBuffer<TYPE, 2u> & hb )
-    {
-        math::Size_t<2u - 1u> pitch;
-        pitch[0] = hb.getPhysicalMemorySize()[0] * sizeof(TYPE);
-        return container::HostBuffer<TYPE, 2u>(hb.getBasePointer(), hb.getDataSpace(), false, pitch);
-    }
+        template<class TYPE>
+        container::HostBuffer<TYPE, 2u> make_CartBuffer(HostBuffer<TYPE, 2u>& hb)
+        {
+            math::Size_t<2u - 1u> pitch;
+            pitch[0] = hb.getPhysicalMemorySize()[0] * sizeof(TYPE);
+            return container::HostBuffer<TYPE, 2u>(hb.getBasePointer(), hb.getDataSpace(), false, pitch);
+        }
 
-    template< class TYPE >
-    container::HostBuffer< TYPE, 3u >
-    make_CartBuffer( HostBuffer<TYPE, 3u> & hb )
-    {
-        math::Size_t<3u - 1u> pitch;
-        pitch[0] = hb.getPhysicalMemorySize()[0] * sizeof(TYPE);
-        pitch[1] = pitch[0] * hb.getPhysicalMemorySize()[1];
-        return container::HostBuffer<TYPE, 3u>(hb.getBasePointer(), hb.getDataSpace(), false, pitch);
-    }
-}
+        template<class TYPE>
+        container::HostBuffer<TYPE, 3u> make_CartBuffer(HostBuffer<TYPE, 3u>& hb)
+        {
+            math::Size_t<3u - 1u> pitch;
+            pitch[0] = hb.getPhysicalMemorySize()[0] * sizeof(TYPE);
+            pitch[1] = pitch[0] * hb.getPhysicalMemorySize()[1];
+            return container::HostBuffer<TYPE, 3u>(hb.getBasePointer(), hb.getDataSpace(), false, pitch);
+        }
+    } // namespace detail
     class EventTask;
 
-    template <class TYPE, unsigned DIM>
+    template<class TYPE, unsigned DIM>
     class DeviceBuffer;
 
-    template <class TYPE, unsigned DIM>
+    template<class TYPE, unsigned DIM>
     class Buffer;
 
     /**
@@ -73,7 +70,7 @@ namespace detail
      * @tparam TYPE datatype for buffer data
      * @tparam DIM dimension of the buffer
      */
-    template <class TYPE, unsigned DIM>
+    template<class TYPE, unsigned DIM>
     class HostBuffer : public Buffer<TYPE, DIM>
     {
     public:
@@ -98,9 +95,7 @@ namespace detail
         /**
          * Destructor.
          */
-        virtual ~HostBuffer()
-        {
-        };
+        virtual ~HostBuffer(){};
 
         /**
          * Conversion to cuSTL HostBuffer.
@@ -108,14 +103,12 @@ namespace detail
          * Returns a cuSTL HostBuffer with reference to the same data.
          */
         HINLINE
-        container::HostBuffer<TYPE, DIM>
-        cartBuffer()
+        container::HostBuffer<TYPE, DIM> cartBuffer()
         {
-            return detail::make_CartBuffer( *this );
+            return detail::make_CartBuffer(*this);
         }
 
     protected:
-
         /** Constructor.
          *
          * @param size extent for each dimension (in elements)
@@ -123,11 +116,10 @@ namespace detail
          *             can be less than `physicalMemorySize`
          * @param physicalMemorySize size of the physical memory (in elements)
          */
-        HostBuffer(DataSpace<DIM> size, DataSpace<DIM> physicalMemorySize) :
-        Buffer<TYPE, DIM>(size, physicalMemorySize)
+        HostBuffer(DataSpace<DIM> size, DataSpace<DIM> physicalMemorySize)
+            : Buffer<TYPE, DIM>(size, physicalMemorySize)
         {
-
         }
     };
 
-} //namespace pmacc
+} // namespace pmacc
diff --git a/include/pmacc/memory/buffers/HostBufferIntern.hpp b/include/pmacc/memory/buffers/HostBufferIntern.hpp
index ff867610b7..d89706600e 100644
--- a/include/pmacc/memory/buffers/HostBufferIntern.hpp
+++ b/include/pmacc/memory/buffers/HostBufferIntern.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Rene Widera, Benjamin Worpitz,
+/* Copyright 2013-2021 Rene Widera, Benjamin Worpitz,
  *                     Alexander Grund
  *
  * This file is part of PMacc.
@@ -26,125 +26,126 @@
 #include "pmacc/eventSystem/tasks/Factory.hpp"
 #include "pmacc/eventSystem/EventSystem.hpp"
 #include "pmacc/memory/boxes/DataBoxDim1Access.hpp"
+#include "pmacc/memory/Array.hpp"
 #include "pmacc/assert.hpp"
 
 namespace pmacc
 {
-
-/**
- * Internal implementation of the HostBuffer interface.
- */
-template <class TYPE, unsigned DIM>
-class HostBufferIntern : public HostBuffer<TYPE, DIM>
-{
-public:
-
-    typedef typename HostBuffer<TYPE, DIM>::DataBoxType DataBoxType;
-
-    /** constructor
-     *
-     * @param size extent for each dimension (in elements)
-     */
-    HostBufferIntern(DataSpace<DIM> size) :
-    HostBuffer<TYPE, DIM>(size, size),
-    pointer(nullptr),ownPointer(true)
-    {
-        CUDA_CHECK(cudaMallocHost((void**)&pointer, size.productOfComponents() * sizeof (TYPE)));
-        reset(false);
-    }
-
-    HostBufferIntern(HostBufferIntern& source, DataSpace<DIM> size, DataSpace<DIM> offset=DataSpace<DIM>()) :
-    HostBuffer<TYPE, DIM>(size, source.getPhysicalMemorySize()),
-    pointer(nullptr),ownPointer(false)
-    {
-        pointer=&(source.getDataBox()(offset));/*fix me, this is a bad way*/
-        reset(true);
-    }
-
     /**
-     * destructor
+     * Internal implementation of the HostBuffer interface.
      */
-    virtual ~HostBufferIntern()
+    template<class TYPE, unsigned DIM>
+    class HostBufferIntern : public HostBuffer<TYPE, DIM>
     {
-        __startOperation(ITask::TASK_HOST);
+    public:
+        typedef typename HostBuffer<TYPE, DIM>::DataBoxType DataBoxType;
+
+        /** constructor
+         *
+         * @param size extent for each dimension (in elements)
+         */
+        HostBufferIntern(DataSpace<DIM> size) : HostBuffer<TYPE, DIM>(size, size), pointer(nullptr), ownPointer(true)
+        {
+            CUDA_CHECK(cuplaMallocHost((void**) &pointer, size.productOfComponents() * sizeof(TYPE)));
+            reset(false);
+        }
 
-        if (pointer && ownPointer)
+        HostBufferIntern(HostBufferIntern& source, DataSpace<DIM> size, DataSpace<DIM> offset = DataSpace<DIM>())
+            : HostBuffer<TYPE, DIM>(size, source.getPhysicalMemorySize())
+            , pointer(nullptr)
+            , ownPointer(false)
         {
-            CUDA_CHECK_NO_EXCEPT(cudaFreeHost(pointer));
+            pointer = &(source.getDataBox()(offset)); /*fix me, this is a bad way*/
+            reset(true);
         }
-    }
 
-    /*! Get pointer of memory
-     * @return pointer to memory
-     */
-    TYPE* getBasePointer()
-    {
-        __startOperation(ITask::TASK_HOST);
-        return pointer;
-    }
+        /**
+         * destructor
+         */
+        virtual ~HostBufferIntern()
+        {
+            __startOperation(ITask::TASK_HOST);
 
-    TYPE* getPointer()
-    {
-        __startOperation(ITask::TASK_HOST);
-        return pointer;
-    }
+            if(pointer && ownPointer)
+            {
+                CUDA_CHECK_NO_EXCEPT(cuplaFreeHost(pointer));
+            }
+        }
 
-    void copyFrom(DeviceBuffer<TYPE, DIM>& other)
-    {
-        PMACC_ASSERT(this->isMyDataSpaceGreaterThan(other.getCurrentDataSpace()));
-        Environment<>::get().Factory().createTaskCopyDeviceToHost(other, *this);
-    }
+        /*! Get pointer of memory
+         * @return pointer to memory
+         */
+        TYPE* getBasePointer()
+        {
+            __startOperation(ITask::TASK_HOST);
+            return pointer;
+        }
 
-    void reset(bool preserveData = true)
-    {
-        __startOperation(ITask::TASK_HOST);
-        this->setCurrentSize(this->getDataSpace().productOfComponents());
-        if (!preserveData)
+        TYPE* getPointer()
         {
-            /* if it is a pointer out of other memory we can not assume that
-             * that the physical memory is contiguous
-             */
-            if(ownPointer)
-                memset(pointer, 0, this->getDataSpace().productOfComponents() * sizeof (TYPE));
-            else
+            __startOperation(ITask::TASK_HOST);
+            return pointer;
+        }
+
+        void copyFrom(DeviceBuffer<TYPE, DIM>& other)
+        {
+            PMACC_ASSERT(this->isMyDataSpaceGreaterThan(other.getCurrentDataSpace()));
+            Environment<>::get().Factory().createTaskCopyDeviceToHost(other, *this);
+        }
+
+        void reset(bool preserveData = true)
+        {
+            __startOperation(ITask::TASK_HOST);
+            this->setCurrentSize(this->getDataSpace().productOfComponents());
+            if(!preserveData)
             {
-                TYPE value;
-                /* using `uint8_t` for byte-wise looping through tmp var value of `TYPE` */
-                uint8_t* valuePtr = (uint8_t*)&value;
-                for( size_t b = 0; b < sizeof(TYPE); ++b)
+                /* if it is a pointer out of other memory we can not assume
+                 * that the physical memory is contiguous
+                 */
+                if(ownPointer)
+                    memset(
+                        reinterpret_cast<void*>(pointer),
+                        0,
+                        this->getDataSpace().productOfComponents() * sizeof(TYPE));
+                else
                 {
-                    valuePtr[b] = static_cast<uint8_t>(0);
+                    // Using Array is a workaround for types without default constructor
+                    memory::Array<TYPE, 1> tmp;
+                    memset(reinterpret_cast<void*>(tmp.data()), 0, sizeof(tmp));
+                    // use first element to avoid issue because Array is aligned (sizeof can be larger than component
+                    // type)
+                    setValue(tmp[0]);
                 }
-                /* set value with zero-ed `TYPE` */
-                setValue(value);
             }
         }
-    }
 
-    void setValue(const TYPE& value)
-    {
-        __startOperation(ITask::TASK_HOST);
-        int64_t current_size = static_cast< int64_t >(this->getCurrentSize());
-        auto memBox = getDataBox();
-        typedef DataBoxDim1Access<DataBoxType > D1Box;
-        D1Box d1Box(memBox, this->getDataSpace());
-        #pragma omp parallel for
-        for (int64_t i = 0; i < current_size; i++)
+        void setValue(const TYPE& value)
         {
-            d1Box[i] = value;
+            __startOperation(ITask::TASK_HOST);
+            int64_t current_size = static_cast<int64_t>(this->getCurrentSize());
+            auto memBox = getDataBox();
+            typedef DataBoxDim1Access<DataBoxType> D1Box;
+            D1Box d1Box(memBox, this->getDataSpace());
+#pragma omp parallel for
+            for(int64_t i = 0; i < current_size; i++)
+            {
+                d1Box[i] = value;
+            }
         }
-    }
 
-    DataBoxType getDataBox()
-    {
-        __startOperation(ITask::TASK_HOST);
-        return DataBoxType(PitchedBox<TYPE, DIM > (pointer, DataSpace<DIM > (),
-                                                   this->getPhysicalMemorySize(), this->getPhysicalMemorySize()[0] * sizeof (TYPE)));
-    }
+        DataBoxType getDataBox()
+        {
+            __startOperation(ITask::TASK_HOST);
+            return DataBoxType(PitchedBox<TYPE, DIM>(
+                pointer,
+                DataSpace<DIM>(),
+                this->getPhysicalMemorySize(),
+                this->getPhysicalMemorySize()[0] * sizeof(TYPE)));
+        }
 
-private:
-    TYPE* pointer;
-    bool ownPointer;
-};
+    private:
+        TYPE* pointer;
+        bool ownPointer;
+    };
 
-}
+} // namespace pmacc
diff --git a/include/pmacc/memory/buffers/HostDeviceBuffer.hpp b/include/pmacc/memory/buffers/HostDeviceBuffer.hpp
index 852c92bee1..21ef71c0fd 100644
--- a/include/pmacc/memory/buffers/HostDeviceBuffer.hpp
+++ b/include/pmacc/memory/buffers/HostDeviceBuffer.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2016-2020 Alexander Grund
+/* Copyright 2016-2021 Alexander Grund
  *
  * This file is part of PMacc.
  *
@@ -29,20 +29,23 @@
 #include <boost/type_traits.hpp>
 
 
-namespace pmacc{
-
+namespace pmacc
+{
     /** Buffer that contains a host and device buffer and allows synchronizing those 2 */
     template<typename T_Type, unsigned T_dim>
     class HostDeviceBuffer
     {
         typedef HostBufferIntern<T_Type, T_dim> HostBufferType;
         typedef DeviceBufferIntern<T_Type, T_dim> DeviceBufferType;
+
     public:
         using ValueType = T_Type;
         typedef HostBuffer<T_Type, T_dim> HBuffer;
         typedef DeviceBuffer<T_Type, T_dim> DBuffer;
         typedef typename HostBufferType::DataBoxType DataBoxType;
-        PMACC_CASSERT_MSG(DataBoxTypes_must_match, boost::is_same<DataBoxType, typename DeviceBufferType::DataBoxType>::value);
+        PMACC_CASSERT_MSG(
+            DataBoxTypes_must_match,
+            boost::is_same<DataBoxType, typename DeviceBufferType::DataBoxType>::value);
 
         /**
          * Constructor that creates the buffers with the given size
@@ -69,12 +72,12 @@ namespace pmacc{
          * Passing a size bigger than the buffer (minus the offset) is undefined.
          */
         HostDeviceBuffer(
-                   HBuffer& otherHostBuffer,
-                   const DataSpace<T_dim>& offsetHost,
-                   DBuffer& otherDeviceBuffer,
-                   const DataSpace<T_dim>& offsetDevice,
-                   const GridLayout<T_dim> size,
-                   bool sizeOnDevice = false);
+            HBuffer& otherHostBuffer,
+            const DataSpace<T_dim>& offsetHost,
+            DBuffer& otherDeviceBuffer,
+            const DataSpace<T_dim>& offsetDevice,
+            const GridLayout<T_dim> size,
+            bool sizeOnDevice = false);
 
         HINLINE virtual ~HostDeviceBuffer();
 
@@ -111,12 +114,12 @@ namespace pmacc{
          * Asynchronously copies data from internal device to internal host buffer.
          */
         HINLINE void deviceToHost();
+
     private:
         HBuffer* hostBuffer;
         DBuffer* deviceBuffer;
-
     };
 
-}  // namespace pmacc
+} // namespace pmacc
 
 #include "pmacc/memory/buffers/HostDeviceBuffer.tpp"
diff --git a/include/pmacc/memory/buffers/HostDeviceBuffer.tpp b/include/pmacc/memory/buffers/HostDeviceBuffer.tpp
index 8d677c2538..a4c43c850b 100644
--- a/include/pmacc/memory/buffers/HostDeviceBuffer.tpp
+++ b/include/pmacc/memory/buffers/HostDeviceBuffer.tpp
@@ -1,4 +1,4 @@
-/* Copyright 2016-2020 Alexander Grund
+/* Copyright 2016-2021 Alexander Grund
  *
  * This file is part of PMacc.
  *
@@ -24,37 +24,37 @@
 #include "HostDeviceBuffer.hpp"
 
 
-namespace pmacc{
-
+namespace pmacc
+{
     template<typename T_Type, unsigned T_dim>
     HostDeviceBuffer<T_Type, T_dim>::HostDeviceBuffer(const DataSpace<T_dim>& size, bool sizeOnDevice)
     {
-        hostBuffer   = new HostBufferIntern<T_Type, T_dim>(size);
+        hostBuffer = new HostBufferIntern<T_Type, T_dim>(size);
         deviceBuffer = new DeviceBufferIntern<T_Type, T_dim>(size, sizeOnDevice);
     }
 
     template<typename T_Type, unsigned T_dim>
     HostDeviceBuffer<T_Type, T_dim>::HostDeviceBuffer(
-            DBuffer& otherDeviceBuffer,
-            const DataSpace<T_dim>& size,
-            bool sizeOnDevice)
+        DBuffer& otherDeviceBuffer,
+        const DataSpace<T_dim>& size,
+        bool sizeOnDevice)
     {
-        hostBuffer   = new HostBufferIntern<T_Type, T_dim>(size);
+        hostBuffer = new HostBufferIntern<T_Type, T_dim>(size);
         deviceBuffer = new DeviceBufferType(otherDeviceBuffer, size, DataSpace<T_dim>(), sizeOnDevice);
     }
 
     template<typename T_Type, unsigned T_dim>
     HostDeviceBuffer<T_Type, T_dim>::HostDeviceBuffer(
-               HBuffer& otherHostBuffer,
-               const DataSpace<T_dim>& offsetHost,
-               DBuffer& otherDeviceBuffer,
-               const DataSpace<T_dim>& offsetDevice,
-               const GridLayout<T_dim> size,
-               bool sizeOnDevice)
-   {
-        hostBuffer   = new HostBufferType(otherHostBuffer, size, offsetHost);
+        HBuffer& otherHostBuffer,
+        const DataSpace<T_dim>& offsetHost,
+        DBuffer& otherDeviceBuffer,
+        const DataSpace<T_dim>& offsetDevice,
+        const GridLayout<T_dim> size,
+        bool sizeOnDevice)
+    {
+        hostBuffer = new HostBufferType(otherHostBuffer, size, offsetHost);
         deviceBuffer = new DeviceBufferType(otherDeviceBuffer, size, offsetDevice, sizeOnDevice);
-   }
+    }
 
     template<typename T_Type, unsigned T_dim>
     HostDeviceBuffer<T_Type, T_dim>::~HostDeviceBuffer()
@@ -94,4 +94,4 @@ namespace pmacc{
         hostBuffer->copyFrom(*deviceBuffer);
     }
 
-}  // namespace pmacc
+} // namespace pmacc
diff --git a/include/pmacc/memory/buffers/MappedBufferIntern.hpp b/include/pmacc/memory/buffers/MappedBufferIntern.hpp
index 5254250830..657b945a6e 100644
--- a/include/pmacc/memory/buffers/MappedBufferIntern.hpp
+++ b/include/pmacc/memory/buffers/MappedBufferIntern.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2014-2020 Rene Widera, Axel Huebl, Benjamin Worpitz,
+/* Copyright 2014-2021 Rene Widera, Axel Huebl, Benjamin Worpitz,
  *                     Alexander Grund
  *
  * This file is part of PMacc.
@@ -30,50 +30,50 @@
 
 namespace pmacc
 {
-
-/** Implementation of the DeviceBuffer interface for cuda mapped memory
- *
- * For all pmacc tasks and functions this buffer looks like native device buffer
- * but in real it is stored in host memory.
- */
-template <class TYPE, unsigned DIM>
-class MappedBufferIntern : public DeviceBuffer<TYPE, DIM>
-{
-    /** IMPORTANT: if someone implements that a MappedBufferIntern can points to an other
-     * mapped buffer then `getDataSpace()` in `getHostDataBox()` and `getDeviceDataBox`
-     * must be changed to `getPhysicalMemorySize`
-     */
-public:
-
-    typedef typename DeviceBuffer<TYPE, DIM>::DataBoxType DataBoxType;
-
-    /** constructor
+    /** Implementation of the DeviceBuffer interface for cuda mapped memory
      *
-     * @param size extent for each dimension (in elements)
+     * For all pmacc tasks and functions this buffer looks like a native device buffer
+     * but in reality it is stored in host memory.
      */
-    MappedBufferIntern(DataSpace<DIM> size):
-    DeviceBuffer<TYPE, DIM>(size, size),
-    pointer(nullptr), ownPointer(true)
+    template<class TYPE, unsigned DIM>
+    class MappedBufferIntern : public DeviceBuffer<TYPE, DIM>
     {
-#if( PMACC_CUDA_ENABLED == 1 )
-        CUDA_CHECK((cuplaError_t)cudaHostAlloc(&pointer, size.productOfComponents() * sizeof (TYPE), cudaHostAllocMapped));
+        /** IMPORTANT: if someone implements that a MappedBufferIntern can point to another
+         * mapped buffer then `getDataSpace()` in `getHostDataBox()` and `getDeviceDataBox`
+         * must be changed to `getPhysicalMemorySize`
+         */
+    public:
+        typedef typename DeviceBuffer<TYPE, DIM>::DataBoxType DataBoxType;
+
+        /** constructor
+         *
+         * @param size extent for each dimension (in elements)
+         */
+        MappedBufferIntern(DataSpace<DIM> size)
+            : DeviceBuffer<TYPE, DIM>(size, size)
+            , pointer(nullptr)
+            , ownPointer(true)
+        {
+#if(PMACC_CUDA_ENABLED == 1)
+            CUDA_CHECK((
+                cuplaError_t) cudaHostAlloc(&pointer, size.productOfComponents() * sizeof(TYPE), cudaHostAllocMapped));
 #else
-        pointer = new TYPE[size.productOfComponents()];
+            pointer = new TYPE[size.productOfComponents()];
 #endif
-        reset(false);
-    }
-
-    /**
-     * destructor
-     */
-    virtual ~MappedBufferIntern()
-    {
-        __startOperation(ITask::TASK_CUDA);
-        __startOperation(ITask::TASK_HOST);
+            reset(false);
+        }
 
-        if (pointer && ownPointer)
+        /**
+         * destructor
+         */
+        virtual ~MappedBufferIntern()
         {
-#if( PMACC_CUDA_ENABLED == 1 )
+            __startOperation(ITask::TASK_DEVICE);
+            __startOperation(ITask::TASK_HOST);
+
+            if(pointer && ownPointer)
+            {
+#if(PMACC_CUDA_ENABLED == 1)
 /* cupla 0.2.0 does not support the function cudaHostAlloc to create mapped memory.
  * Therefore we need to call the native CUDA function cudaFreeHost to free memory.
  * Due to the renaming of cuda functions with cupla via macros we need to remove
@@ -85,137 +85,139 @@ class MappedBufferIntern : public DeviceBuffer<TYPE, DIM>
  *   https://github.com/ComputationalRadiationPhysics/alpaka/issues/296
  *   https://github.com/ComputationalRadiationPhysics/alpaka/issues/612
  */
-#   undef cudaFreeHost
-            CUDA_CHECK((cuplaError_t)cudaFreeHost(pointer));
+#    undef cudaFreeHost
+                CUDA_CHECK((cuplaError_t) cudaFreeHost(pointer));
 // re-introduce the cupla macro
-#   define cudaFreeHost(...) cuplaFreeHost(__VA_ARGS__)
+#    define cudaFreeHost(...) cuplaFreeHost(__VA_ARGS__)
 #else
-            __deleteArray(pointer);
+                __deleteArray(pointer);
 #endif
+            }
         }
-    }
 
-    /*! Get unchanged device pointer of memory
-     * @return device pointer to memory
-     */
-    TYPE* getBasePointer()
-    {
-        __startOperation(ITask::TASK_HOST);
-        return (TYPE*) this->getCudaPitched().ptr;
-    }
+        /*! Get unchanged device pointer of memory
+         * @return device pointer to memory
+         */
+        TYPE* getBasePointer()
+        {
+            __startOperation(ITask::TASK_HOST);
+            return (TYPE*) this->getCudaPitched().ptr;
+        }
 
-    /*! Get device pointer of memory
-     *
-     * This pointer is shifted by the offset, if this buffer points to other
-     * existing buffer
-     *
-     * @return device pointer to memory
-     */
-    TYPE* getPointer()
-    {
-        __startOperation(ITask::TASK_HOST);
-        return (TYPE*) this->getCudaPitched().ptr;
-    }
+        /*! Get device pointer of memory
+         *
+         * This pointer is shifted by the offset, if this buffer points to other
+         * existing buffer
+         *
+         * @return device pointer to memory
+         */
+        TYPE* getPointer()
+        {
+            __startOperation(ITask::TASK_HOST);
+            return (TYPE*) this->getCudaPitched().ptr;
+        }
 
-    void copyFrom(HostBuffer<TYPE, DIM>& other)
-    {
-        PMACC_ASSERT(this->isMyDataSpaceGreaterThan(other.getCurrentDataSpace()));
-        Environment<>::get().Factory().createTaskCopyHostToDevice(other, *this);
-    }
+        void copyFrom(HostBuffer<TYPE, DIM>& other)
+        {
+            PMACC_ASSERT(this->isMyDataSpaceGreaterThan(other.getCurrentDataSpace()));
+            Environment<>::get().Factory().createTaskCopyHostToDevice(other, *this);
+        }
 
-    void copyFrom(DeviceBuffer<TYPE, DIM>& other)
-    {
-        PMACC_ASSERT(this->isMyDataSpaceGreaterThan(other.getCurrentDataSpace()));
-        Environment<>::get().Factory().createTaskCopyDeviceToDevice(other, *this);
-    }
+        void copyFrom(DeviceBuffer<TYPE, DIM>& other)
+        {
+            PMACC_ASSERT(this->isMyDataSpaceGreaterThan(other.getCurrentDataSpace()));
+            Environment<>::get().Factory().createTaskCopyDeviceToDevice(other, *this);
+        }
 
-    void reset(bool preserveData = true)
-    {
-        __startOperation(ITask::TASK_HOST);
-        this->setCurrentSize(this->getDataSpace().productOfComponents());
-        if (!preserveData)
-            memset(pointer, 0, this->getDataSpace().productOfComponents() * sizeof (TYPE));
-    }
+        void reset(bool preserveData = true)
+        {
+            __startOperation(ITask::TASK_HOST);
+            this->setCurrentSize(this->getDataSpace().productOfComponents());
+            if(!preserveData)
+                memset(pointer, 0, this->getDataSpace().productOfComponents() * sizeof(TYPE));
+        }
 
-    void setValue(const TYPE& value)
-    {
-        __startOperation(ITask::TASK_HOST);
-        size_t current_size = this->getCurrentSize();
-        for (size_t i = 0; i < current_size; i++)
+        void setValue(const TYPE& value)
         {
-            pointer[i] = value;
+            __startOperation(ITask::TASK_HOST);
+            size_t current_size = this->getCurrentSize();
+            for(size_t i = 0; i < current_size; i++)
+            {
+                pointer[i] = value;
+            }
         }
-    }
 
-    bool hasCurrentSizeOnDevice() const
-    {
-        return false;
-    }
+        bool hasCurrentSizeOnDevice() const
+        {
+            return false;
+        }
 
-    virtual size_t* getCurrentSizeHostSidePointer()
-    {
-        return this->current_size;
-    }
+        virtual size_t* getCurrentSizeHostSidePointer()
+        {
+            return this->current_size;
+        }
 
-    size_t* getCurrentSizeOnDevicePointer()
-    {
-        return nullptr;
-    }
+        size_t* getCurrentSizeOnDevicePointer()
+        {
+            return nullptr;
+        }
 
-    DataSpace<DIM> getOffset() const
-    {
-        return DataSpace<DIM>();
-    }
+        DataSpace<DIM> getOffset() const
+        {
+            return DataSpace<DIM>();
+        }
 
-    void setCurrentSize(const size_t size)
-    {
-        Buffer<TYPE, DIM>::setCurrentSize(size);
-    }
+        void setCurrentSize(const size_t size)
+        {
+            Buffer<TYPE, DIM>::setCurrentSize(size);
+        }
 
-    const cudaPitchedPtr getCudaPitched() const
-    {
-        __startOperation(ITask::TASK_CUDA);
-        TYPE* dPointer;
-        cudaHostGetDevicePointer(&dPointer, pointer, 0);
+        const cuplaPitchedPtr getCudaPitched() const
+        {
+            __startOperation(ITask::TASK_DEVICE);
+            TYPE* dPointer;
+            cuplaHostGetDevicePointer(&dPointer, pointer, 0);
+
+            /* on 1D memory we have no size for y, therefore we set y to 1 to
+             * get a valid cuplaPitchedPtr
+             */
+            int size_y = 1;
+            if(DIM > DIM1)
+                size_y = this->data_space[1];
+
+            return make_cuplaPitchedPtr(dPointer, this->data_space.x() * sizeof(TYPE), this->data_space.x(), size_y);
+        }
 
-        /* on 1D memory we have no size for y, therefore we set y to 1 to
-         * get a valid cudaPitchedPtr
-         */
-        int size_y=1;
-        if(DIM>DIM1)
-            size_y= this->data_space[1];
-
-        return make_cudaPitchedPtr(dPointer,
-                                   this->data_space.x() * sizeof (TYPE),
-                                   this->data_space.x(),
-                                   size_y
-                                   );
-    }
-
-    size_t getPitch() const
-    {
-        return this->data_space.x() * sizeof (TYPE);
-    }
+        size_t getPitch() const
+        {
+            return this->data_space.x() * sizeof(TYPE);
+        }
 
-    DataBoxType getHostDataBox()
-    {
-        __startOperation(ITask::TASK_HOST);
-        return DataBoxType(PitchedBox<TYPE, DIM > (pointer, DataSpace<DIM > (),
-                                                   this->data_space, this->data_space[0] * sizeof (TYPE)));
-    }
+        DataBoxType getHostDataBox()
+        {
+            __startOperation(ITask::TASK_HOST);
+            return DataBoxType(PitchedBox<TYPE, DIM>(
+                pointer,
+                DataSpace<DIM>(),
+                this->data_space,
+                this->data_space[0] * sizeof(TYPE)));
+        }
 
-    DataBoxType getDataBox()
-    {
-        __startOperation(ITask::TASK_CUDA);
-        TYPE* dPointer;
-        cudaHostGetDevicePointer(&dPointer, pointer, 0);
-        return DataBoxType(PitchedBox<TYPE, DIM > (dPointer, DataSpace<DIM > (),
-                                                   this->data_space, this->data_space[0] * sizeof (TYPE)));
-    }
-
-private:
-    TYPE* pointer;
-    bool ownPointer;
-};
-
-}
+        DataBoxType getDataBox()
+        {
+            __startOperation(ITask::TASK_DEVICE);
+            TYPE* dPointer;
+            cuplaHostGetDevicePointer(&dPointer, pointer, 0);
+            return DataBoxType(PitchedBox<TYPE, DIM>(
+                dPointer,
+                DataSpace<DIM>(),
+                this->data_space,
+                this->data_space[0] * sizeof(TYPE)));
+        }
+
+    private:
+        TYPE* pointer;
+        bool ownPointer;
+    };
+
+} // namespace pmacc
diff --git a/include/pmacc/memory/buffers/MultiGridBuffer.hpp b/include/pmacc/memory/buffers/MultiGridBuffer.hpp
index 22f46f41d5..f62dd4b0b3 100644
--- a/include/pmacc/memory/buffers/MultiGridBuffer.hpp
+++ b/include/pmacc/memory/buffers/MultiGridBuffer.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera, Benjamin Worpitz
+/* Copyright 2013-2021 Heiko Burau, Rene Widera, Benjamin Worpitz
  *
  * This file is part of PMacc.
  *
@@ -37,242 +37,241 @@
 
 namespace pmacc
 {
-
-template<typename Type_, uint32_t communicationTag_ = 0, bool sizeOnDevice_ = false >
-        struct TypeDescriptionElement
-{
-    typedef Type_ Type;
-    static constexpr uint32_t communicationTag = communicationTag_;
-    static constexpr bool sizeOnDevice = sizeOnDevice_;
-
-
-};
-
-/**
- * GridBuffer represents a DIM-dimensional buffer which exists on the host as well as on the device.
- *
- * GridBuffer combines a HostBuffer and a DeviceBuffer with equal sizes.
- * Additionally, it allows sending data from and receiving data to these buffers.
- * Buffers consist of core data which may be surrounded by border data.
- *
- * @tparam Type_ datatype for internal Host- and DeviceBuffer
- * @tparam DIM dimension of the buffers
- * @tparam BufferNames a class with a enum with the name "Names" and member with the name "Count" with number of elements in Names
- * etc.:
- *  struct Mem
- *  {
- *    enum Names{VALUE1,VALUE2};
- *    static constexpr uint32_t Count=2;
- *  };
- * @tparam BORDERTYPE optional type for border data in the buffers. TYPE is used by default.
- */
-template <
-typename Type_,
-unsigned DIM,
-class BufferNames,
-class BORDERTYPE = Type_>
-class MultiGridBuffer
-{
-public:
-
-    typedef Type_ Type;
-    typedef DataBox<MultiBox<Type, DIM> > DataBoxType;
-    typedef GridBuffer<Type, DIM> GridBufferType;
-    typedef typename BufferNames::Names NameType;
-
-    /**
-     * Constructor.
-     *
-     * @param gridLayout layout of the buffers, including border-cells
-     * @param firstCommunicationTag optional value which can be used to tag ('name') this buffer in communications
-     * @param sizeOnDevice if true, size information exists on device, too.
-     */
-    MultiGridBuffer(const GridLayout<DIM>& gridLayout, bool sizeOnDevice = false) : blobDeviceBuffer(nullptr),blobHostBuffer(nullptr)
+    template<typename Type_, uint32_t communicationTag_ = 0, bool sizeOnDevice_ = false>
+    struct TypeDescriptionElement
     {
-        init(gridLayout, sizeOnDevice);
-    }
+        typedef Type_ Type;
+        static constexpr uint32_t communicationTag = communicationTag_;
+        static constexpr bool sizeOnDevice = sizeOnDevice_;
+    };
 
     /**
-     * Constructor.
+     * GridBuffer represents a DIM-dimensional buffer which exists on the host as well as on the device.
      *
-     * @param dataSpace DataSpace representing buffer size without border-cells
-     * @param firstCommunicationTag optional value which can be used to tag ('name') this buffer in communications
-     * @param sizeOnDevice if true, size information exists on device, too.
-     */
-    MultiGridBuffer(DataSpace<DIM>& dataSpace, bool sizeOnDevice = false) : blobDeviceBuffer(nullptr),blobHostBuffer(nullptr)
-    {
-        init(GridLayout<DIM > (dataSpace), sizeOnDevice);
-    }
-
-    /**
-     * Add Exchange in MultiGridBuffer memory space.
+     * GridBuffer combines a HostBuffer and a DeviceBuffer with equal sizes.
+     * Additionally, it allows sending data from and receiving data to these buffers.
+     * Buffers consist of core data which may be surrounded by border data.
      *
-     * An Exchange is added to this MultiGridBuffer. The exchange buffers use
-     * the same memory as this MultiGridBuffer.
-     *
-     * @param dataPlace place where received data are stored [GUARD | BORDER]
-     *        if dataPlace=GUARD than copy other BORDER to my GUARD
-     *        if dataPlace=BORDER than copy other GUARD to my BORDER
-     * @param receive a Mask which describes the directions for the exchange
-     * @param guardingCells number of guarding cells in each dimension
-     * @param firstCommunicationTag a object unique number to connect same objects from different nodes
-     * (MultiGridBuffer reserves all tags from [firstCommunicationTag;firstCommunicationTag+BufferNames::Count]
-     * @param sizeOnDevice if true, internal buffers have their size information on the device, too
+     * @tparam Type_ datatype for internal Host- and DeviceBuffer
+     * @tparam DIM dimension of the buffers
+     * @tparam BufferNames a class with an enum named "Names" and a member named "Count" holding the number of
+     * elements in Names, e.g.: struct Mem
+     *  {
+     *    enum Names{VALUE1,VALUE2};
+     *    static constexpr uint32_t Count=2;
+     *  };
+     * @tparam BORDERTYPE optional type for border data in the buffers. Type_ is used by default.
      */
-    void addExchange(uint32_t dataPlace, const Mask &receive, DataSpace<DIM> guardingCells, uint32_t firstCommunicationTag, bool sizeOnDevice = false)
+    template<typename Type_, unsigned DIM, class BufferNames, class BORDERTYPE = Type_>
+    class MultiGridBuffer
     {
-        for (uint32_t i = 0; i < BufferNames::Count; ++i)
+    public:
+        typedef Type_ Type;
+        typedef DataBox<MultiBox<Type, DIM>> DataBoxType;
+        typedef GridBuffer<Type, DIM> GridBufferType;
+        typedef typename BufferNames::Names NameType;
+
+        /**
+         * Constructor.
+         *
+         * @param gridLayout layout of the buffers, including border-cells
+         * @param firstCommunicationTag optional value which can be used to tag ('name') this buffer in communications
+         * @param sizeOnDevice if true, size information exists on device, too.
+         */
+        MultiGridBuffer(const GridLayout<DIM>& gridLayout, bool sizeOnDevice = false)
+            : blobDeviceBuffer(nullptr)
+            , blobHostBuffer(nullptr)
         {
-            getGridBuffer(static_cast<NameType> (i)).addExchange(dataPlace, receive, guardingCells, firstCommunicationTag + i, sizeOnDevice);
+            init(gridLayout, sizeOnDevice);
         }
-    }
 
-    /**
-     * Destructor.
-     */
-    virtual ~MultiGridBuffer()
-    {
-        for (uint32_t i = 0; i < BufferNames::Count; ++i)
+        /**
+         * Constructor.
+         *
+         * @param dataSpace DataSpace representing buffer size without border-cells
+         * @param firstCommunicationTag optional value which can be used to tag ('name') this buffer in communications
+         * @param sizeOnDevice if true, size information exists on device, too.
+         */
+        MultiGridBuffer(DataSpace<DIM>& dataSpace, bool sizeOnDevice = false)
+            : blobDeviceBuffer(nullptr)
+            , blobHostBuffer(nullptr)
         {
-            __delete(gridBuffers[i]);
+            init(GridLayout<DIM>(dataSpace), sizeOnDevice);
         }
-        __delete(blobDeviceBuffer);
-        __delete(blobHostBuffer);
-    }
 
-    /**
-     * Resets both internal buffers.
-     *
-     * See DeviceBuffer::reset and HostBuffer::reset for details.
-     *
-     * @param preserveData determines if data on internal buffers should not be erased
-     */
-    void reset(bool preserveData = true)
-    {
-        for (uint32_t i = 0; i < BufferNames::Count; ++i)
+        /**
+         * Add Exchange in MultiGridBuffer memory space.
+         *
+         * An Exchange is added to this MultiGridBuffer. The exchange buffers use
+         * the same memory as this MultiGridBuffer.
+         *
+         * @param dataPlace place where received data are stored [GUARD | BORDER]
+         *        if dataPlace=GUARD then copy other BORDER to my GUARD
+         *        if dataPlace=BORDER then copy other GUARD to my BORDER
+         * @param receive a Mask which describes the directions for the exchange
+         * @param guardingCells number of guarding cells in each dimension
+         * @param firstCommunicationTag an object-unique number to connect same objects from different nodes
+         * (MultiGridBuffer reserves the tags firstCommunicationTag ... firstCommunicationTag+BufferNames::Count-1)
+         * @param sizeOnDevice if true, internal buffers have their size information on the device, too
+         */
+        void addExchange(
+            uint32_t dataPlace,
+            const Mask& receive,
+            DataSpace<DIM> guardingCells,
+            uint32_t firstCommunicationTag,
+            bool sizeOnDevice = false)
         {
-            getGridBuffer(static_cast<NameType> (i)).reset(preserveData);
+            for(uint32_t i = 0; i < BufferNames::Count; ++i)
+            {
+                getGridBuffer(static_cast<NameType>(i))
+                    .addExchange(dataPlace, receive, guardingCells, firstCommunicationTag + i, sizeOnDevice);
+            }
         }
-    }
 
-    /**
-     * Starts sync data from own device buffer to neighboring device buffer.
-     *
-     * Asynchronously starts synchronization of data from internal DeviceBuffer using added
-     * Exchange buffers.
-     *
-     */
-    EventTask asyncCommunication(EventTask serialEvent)
-    {
-        EventTask ev;
-
-        for (uint32_t i = 0; i < BufferNames::Count; ++i)
+        /**
+         * Destructor.
+         */
+        virtual ~MultiGridBuffer()
         {
-            ev += getGridBuffer(static_cast<NameType> (i)).asyncCommunication(serialEvent);
+            for(uint32_t i = 0; i < BufferNames::Count; ++i)
+            {
+                __delete(gridBuffers[i]);
+            }
+            __delete(blobDeviceBuffer);
+            __delete(blobHostBuffer);
         }
-        return ev;
-    }
 
-    /**
-     * Starts sync data from own device buffer to neighboring device buffer.
-     *
-     * Asynchronously starts synchronization of data from internal DeviceBuffer using added
-     * Exchange buffers.
-     * This operation runs sequentially to other code but uses asynchronous operations internally.
-     *
-     */
-    EventTask communication()
-    {
-        EventTask ev;
-        EventTask serialEvent = __getTransactionEvent();
-
-        for (uint32_t i = 0; i < BufferNames::Count; ++i)
+        /**
+         * Resets both internal buffers.
+         *
+         * See DeviceBuffer::reset and HostBuffer::reset for details.
+         *
+         * @param preserveData determines if data on internal buffers should not be erased
+         */
+        void reset(bool preserveData = true)
         {
-            ev += getGridBuffer(static_cast<NameType> (i)).asyncCommunication(serialEvent);
+            for(uint32_t i = 0; i < BufferNames::Count; ++i)
+            {
+                getGridBuffer(static_cast<NameType>(i)).reset(preserveData);
+            }
         }
-        __setTransactionEvent(ev);
-        return ev;
-    }
-
-    /**
-     * Asynchronously copies data from internal host to internal device buffer.
-     *
-     */
-    void hostToDevice()
-    {
 
-        for (uint32_t i = 0; i < BufferNames::Count; ++i)
+        /**
+         * Starts sync data from own device buffer to neighboring device buffer.
+         *
+         * Asynchronously starts synchronization of data from internal DeviceBuffer using added
+         * Exchange buffers.
+         *
+         */
+        EventTask asyncCommunication(EventTask serialEvent)
         {
-            getGridBuffer(static_cast<NameType> (i)).hostToDevice();
+            EventTask ev;
+
+            for(uint32_t i = 0; i < BufferNames::Count; ++i)
+            {
+                ev += getGridBuffer(static_cast<NameType>(i)).asyncCommunication(serialEvent);
+            }
+            return ev;
         }
-    }
 
-    /**
-     * Asynchronously copies data from internal device to internal host buffer.
-     */
-    void deviceToHost()
-    {
-        for (uint32_t i = 0; i < BufferNames::Count; ++i)
+        /**
+         * Starts sync data from own device buffer to neighboring device buffer.
+         *
+         * Asynchronously starts synchronization of data from internal DeviceBuffer using added
+         * Exchange buffers.
+         * This operation runs sequentially to other code but uses asynchronous operations internally.
+         *
+         */
+        EventTask communication()
         {
-            getGridBuffer(static_cast<NameType> (i)).deviceToHost();
+            EventTask ev;
+            EventTask serialEvent = __getTransactionEvent();
+
+            for(uint32_t i = 0; i < BufferNames::Count; ++i)
+            {
+                ev += getGridBuffer(static_cast<NameType>(i)).asyncCommunication(serialEvent);
+            }
+            __setTransactionEvent(ev);
+            return ev;
         }
-    }
-
-    GridBuffer<Type, DIM>& getGridBuffer(typename BufferNames::Names name)
-    {
-        PMACC_ASSERT(name >= 0 && name < BufferNames::Count);
-        return *gridBuffers[name];
-    }
-
-    DataBoxType getHostDataBox()
-    {
-        __startOperation(ITask::TASK_HOST);
-        return DataBoxType(MultiBox<Type, DIM > (getGridBuffer(static_cast<NameType> (0)).getHostBuffer().getBasePointer(),
-                                                 DataSpace<DIM > (),
-                                                 getGridBuffer(static_cast<NameType> (0)).getHostBuffer().getPhysicalMemorySize(),
-                                                 getGridBuffer(static_cast<NameType> (0)).getHostBuffer().getPhysicalMemorySize().x() * sizeof (Type)));
-    }
 
-    DataBoxType getDeviceDataBox()
-    {
-        __startOperation(ITask::TASK_CUDA);
-        return DataBoxType(MultiBox<Type, DIM > (getGridBuffer(static_cast<NameType> (0)).getDeviceBuffer().getBasePointer(),
-                                                 getGridBuffer(static_cast<NameType> (0)).getDeviceBuffer().getOffset(),
-                                                 getGridBuffer(static_cast<NameType> (0)).getDeviceBuffer().getPhysicalMemorySize(),
-                                                 getGridBuffer(static_cast<NameType> (0)).getDeviceBuffer().getCudaPitched().pitch));
-    }
-
-private:
-
-    void init(GridLayout<DIM> gridLayout, bool sizeOnDevice)
-    {
-        DataSpace<DIM> blobOffset;
-        blobOffset[DIM - 1] = gridLayout.getDataSpace()[DIM - 1];
-
-        DataSpace<DIM> blobSize = gridLayout.getDataSpace() + blobOffset * (BufferNames::Count - 1);
-
-        blobDeviceBuffer = new DeviceBufferIntern<Type_, DIM > (blobSize, false);
-        blobHostBuffer = new HostBufferIntern<Type_, DIM > (blobSize);
-
-        for (uint32_t i = 0; i < BufferNames::Count; ++i)
+        /**
+         * Asynchronously copies data from internal host to internal device buffer.
+         *
+         */
+        void hostToDevice()
         {
-            DataSpace<DIM> offset = blobOffset*i;
-            gridBuffers[i] = new GridBuffer<Type, DIM > (
-                                                         *blobHostBuffer, offset,
-                                                         *blobDeviceBuffer, offset,
-                                                         gridLayout, sizeOnDevice);
+            for(uint32_t i = 0; i < BufferNames::Count; ++i)
+            {
+                getGridBuffer(static_cast<NameType>(i)).hostToDevice();
+            }
         }
-    }
 
+        /**
+         * Asynchronously copies data from internal device to internal host buffer.
+         */
+        void deviceToHost()
+        {
+            for(uint32_t i = 0; i < BufferNames::Count; ++i)
+            {
+                getGridBuffer(static_cast<NameType>(i)).deviceToHost();
+            }
+        }
 
+        GridBuffer<Type, DIM>& getGridBuffer(typename BufferNames::Names name)
+        {
+            PMACC_ASSERT(name >= 0 && name < BufferNames::Count);
+            return *gridBuffers[name];
+        }
 
-protected:
+        DataBoxType getHostDataBox()
+        {
+            __startOperation(ITask::TASK_HOST);
+            return DataBoxType(MultiBox<Type, DIM>(
+                getGridBuffer(static_cast<NameType>(0)).getHostBuffer().getBasePointer(),
+                DataSpace<DIM>(),
+                getGridBuffer(static_cast<NameType>(0)).getHostBuffer().getPhysicalMemorySize(),
+                getGridBuffer(static_cast<NameType>(0)).getHostBuffer().getPhysicalMemorySize().x() * sizeof(Type)));
+        }
 
-    DeviceBufferIntern<Type, DIM>* blobDeviceBuffer;
-    HostBufferIntern<Type, DIM>* blobHostBuffer;
-    GridBufferType* gridBuffers[BufferNames::Count];
+        DataBoxType getDeviceDataBox()
+        {
+            __startOperation(ITask::TASK_DEVICE);
+            return DataBoxType(MultiBox<Type, DIM>(
+                getGridBuffer(static_cast<NameType>(0)).getDeviceBuffer().getBasePointer(),
+                getGridBuffer(static_cast<NameType>(0)).getDeviceBuffer().getOffset(),
+                getGridBuffer(static_cast<NameType>(0)).getDeviceBuffer().getPhysicalMemorySize(),
+                getGridBuffer(static_cast<NameType>(0)).getDeviceBuffer().getCudaPitched().pitch));
+        }
 
-};
-}
+    private:
+        void init(GridLayout<DIM> gridLayout, bool sizeOnDevice)
+        {
+            DataSpace<DIM> blobOffset;
+            blobOffset[DIM - 1] = gridLayout.getDataSpace()[DIM - 1];
+
+            DataSpace<DIM> blobSize = gridLayout.getDataSpace() + blobOffset * (BufferNames::Count - 1);
+
+            blobDeviceBuffer = new DeviceBufferIntern<Type_, DIM>(blobSize, false);
+            blobHostBuffer = new HostBufferIntern<Type_, DIM>(blobSize);
+
+            for(uint32_t i = 0; i < BufferNames::Count; ++i)
+            {
+                DataSpace<DIM> offset = blobOffset * i;
+                gridBuffers[i] = new GridBuffer<Type, DIM>(
+                    *blobHostBuffer,
+                    offset,
+                    *blobDeviceBuffer,
+                    offset,
+                    gridLayout,
+                    sizeOnDevice);
+            }
+        }
 
 
+    protected:
+        DeviceBufferIntern<Type, DIM>* blobDeviceBuffer;
+        HostBufferIntern<Type, DIM>* blobHostBuffer;
+        GridBufferType* gridBuffers[BufferNames::Count];
+    };
+} // namespace pmacc
diff --git a/include/pmacc/memory/dataTypes/Mask.hpp b/include/pmacc/memory/dataTypes/Mask.hpp
index 7da82d5693..8c2431579a 100644
--- a/include/pmacc/memory/dataTypes/Mask.hpp
+++ b/include/pmacc/memory/dataTypes/Mask.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Felix Schmitt, Heiko Burau, Rene Widera, Wolfgang Hoenig,
+/* Copyright 2013-2021 Felix Schmitt, Heiko Burau, Rene Widera, Wolfgang Hoenig,
  *                     Alexander Grund
  *
  * This file is part of PMacc.
@@ -29,7 +29,6 @@
 
 namespace pmacc
 {
-
     /**
      * Mask is used to describe in which directions data must be
      * sent/received or where a grid node has neighbors.
@@ -37,14 +36,12 @@ namespace pmacc
     class Mask
     {
     public:
-
         /**
          * Constructor.
          *
          * Sets this mask to 0 (nothing).
          */
-        Mask() :
-        bitMask(0u)
+        Mask() : bitMask(0u)
         {
         }
 
@@ -55,8 +52,7 @@ namespace pmacc
          *
          * @param ex directions for this mask
          */
-        Mask(ExchangeType ex) :
-        bitMask(1u << ex)
+        Mask(ExchangeType ex) : bitMask(1u << ex)
         {
         }
 
@@ -67,8 +63,7 @@ namespace pmacc
          *
          * @param ex directions for this mask
          */
-        Mask(uint32_t ex) :
-        bitMask(1u << ex)
+        Mask(uint32_t ex) : bitMask(1u << ex)
         {
         }
 
@@ -90,7 +85,7 @@ namespace pmacc
         /**
          * Gives uint32_t value of this mask.
          */
-        Mask & operator=(uint32_t other)
+        Mask& operator=(uint32_t other)
         {
             bitMask = other;
             return *this;
@@ -104,7 +99,7 @@ namespace pmacc
          * @param other Mask with directions to join
          * @return the newly created mask
          */
-        Mask operator+(const Mask &other) const
+        Mask operator+(const Mask& other) const
         {
             Mask result;
             result.bitMask = bitMask | other.bitMask;
@@ -119,7 +114,7 @@ namespace pmacc
          * @param other Mask with directions to intersect with
          * @return the newly created mask
          */
-        Mask operator&(const Mask &other) const
+        Mask operator&(const Mask& other) const
         {
             Mask result;
             result.bitMask = bitMask & other.bitMask;
@@ -139,18 +134,18 @@ namespace pmacc
          */
         HDINLINE bool containsExchangeType(uint32_t ex) const
         {
-            for (uint32_t i = 1; i < 27; i++) //first bit in mask is 1u<<RIGHT
+            for(uint32_t i = 1; i < 27; i++) // first bit in mask is 1u<<RIGHT
             {
-                if (isSet(i))
+                if(isSet(i))
                 {
                     uint32_t tmp = i;
                     uint32_t tmp_ex = ex;
-                    while (tmp_ex >= 3)
+                    while(tmp_ex >= 3)
                     {
                         tmp_ex /= 3;
                         tmp /= 3;
                     }
-                    if (tmp % 3 == tmp_ex)
+                    if(tmp % 3 == tmp_ex)
                         return true;
                 }
             }
@@ -182,9 +177,9 @@ namespace pmacc
         Mask getMirroredMask() const
         {
             uint32_t tmp = 0;
-            for (uint32_t i = 1; i < 27; i++) //first bit in mask is 1u<<RIGHT
+            for(uint32_t i = 1; i < 27; i++) // first bit in mask is 1u<<RIGHT
             {
-                if (isSet((ExchangeType) i))
+                if(isSet((ExchangeType) i))
                 {
                     tmp |= (1u << getMirroredExchangeType((ExchangeType) i));
                 }
@@ -203,22 +198,22 @@ namespace pmacc
          */
         static ExchangeType getMirroredExchangeType(uint32_t ex)
         {
-            if (ex >= traits::NumberOfExchanges<DIM3>::value)
+            if(ex >= traits::NumberOfExchanges<DIM3>::value)
                 throw std::runtime_error("parameter exceeds allowed maximum");
 
             Mask mask(ex);
             uint32_t tmp = 0;
-            if (mask.containsExchangeType(RIGHT))
+            if(mask.containsExchangeType(RIGHT))
                 tmp += LEFT;
-            if (mask.containsExchangeType(LEFT))
+            if(mask.containsExchangeType(LEFT))
                 tmp += RIGHT;
-            if (mask.containsExchangeType(BOTTOM))
+            if(mask.containsExchangeType(BOTTOM))
                 tmp += TOP;
-            if (mask.containsExchangeType(TOP))
+            if(mask.containsExchangeType(TOP))
                 tmp += BOTTOM;
-            if (mask.containsExchangeType(FRONT))
+            if(mask.containsExchangeType(FRONT))
                 tmp += BACK;
-            if (mask.containsExchangeType(BACK))
+            if(mask.containsExchangeType(BACK))
                 tmp += FRONT;
 
             return (ExchangeType) tmp;
@@ -237,11 +232,11 @@ namespace pmacc
          * @return DataSpace with relative offsets
          */
         template<unsigned DIM>
-        static HDINLINE DataSpace<DIM> getRelativeDirections( uint32_t direction)
+        static HDINLINE DataSpace<DIM> getRelativeDirections(uint32_t direction)
         {
             DataSpace<DIM> tmp;
 
-            for( uint32_t d = 0; d < DIM; ++d )
+            for(uint32_t d = 0; d < DIM; ++d)
             {
                 const int dim_direction(direction % 3);
                 tmp[d] = (dim_direction == 2 ? -1 : dim_direction);
@@ -251,12 +246,10 @@ namespace pmacc
         }
 
     protected:
-
         /**
          * mask which is a combination of the type \see ExchangeType
          */
         uint32_t bitMask;
-
     };
 
     /** special implementation for `DIM1`
@@ -264,9 +257,9 @@ namespace pmacc
      * optimization: no modulo is used
      */
     template<>
-    HDINLINE DataSpace<DIM1> Mask::getRelativeDirections( uint32_t direction)
+    HDINLINE DataSpace<DIM1> Mask::getRelativeDirections(uint32_t direction)
     {
-        return (direction == 2 ? DataSpace<DIM1 > (-1) : DataSpace<DIM1 > (direction));
+        return (direction == 2 ? DataSpace<DIM1>(-1) : DataSpace<DIM1>(direction));
     }
 
-}
+} // namespace pmacc
diff --git a/include/pmacc/memory/shared/Allocate.hpp b/include/pmacc/memory/shared/Allocate.hpp
index d60c5e8b00..a8ef472773 100644
--- a/include/pmacc/memory/shared/Allocate.hpp
+++ b/include/pmacc/memory/shared/Allocate.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2016-2020 Rene Widera
+/* Copyright 2016-2021 Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -28,87 +28,60 @@
 
 namespace pmacc
 {
-namespace memory
-{
-namespace shared
-{
-
-    /** allocate shared memory
-     *
-     * shared memory is always uninitialized
-     *
-     * @tparam T_uniqueId unique id for this object
-     *          (is needed if more than one instance of shared memory in one kernel is used)
-     * @tparam T_Type type of the stored object
-     */
-    template<
-        uint32_t T_uniqueId,
-        typename T_Type
-    >
-    struct Allocate
+    namespace memory
     {
-        /** get a shared memory
-         *
-         * @return reference to shared memory
-         */
-        template< typename T_Acc >
-        static DINLINE T_Type &
-        get( T_Acc const & acc )
+        namespace shared
         {
-            auto& smem = ::alpaka::block::shared::st::allocVar<
-                T_Type,
-                T_uniqueId
-            >( acc );
-            return smem;
-        }
-    };
+            /** allocate shared memory
+             *
+             * shared memory is always uninitialized
+             *
+             * @tparam T_uniqueId unique id for this object
+             *          (is needed if more than one instance of shared memory in one kernel is used)
+             * @tparam T_Type type of the stored object
+             */
+            template<uint32_t T_uniqueId, typename T_Type>
+            struct Allocate
+            {
+                /** get the shared memory variable declared for T_Type / T_uniqueId
+                 * @param acc accelerator object handed to ::alpaka::declareSharedVar
+                 * @return reference to shared memory
+                 */
+                template<typename T_Acc>
+                static DINLINE T_Type& get(T_Acc const& acc)
+                {
+                    auto& smem = ::alpaka::declareSharedVar<T_Type, T_uniqueId>(acc);
+                    return smem;
+                }
+            };
 
-    /** allocate shared memory
-     *
-     * shared memory is always uninitialized
-     *
-     * @tparam T_uniqueId unique id for this object
-     *          (is needed if more than one instance of shared memory in one kernel is used)
-     * @tparam T_Type type of the stored object
-     * @return reference to shared memory
-     *
-     * @{
-     */
-    template<
-        uint32_t T_uniqueId,
-        typename T_Type,
-        typename T_Acc
-    >
-    DINLINE T_Type&
-    allocate( T_Acc const & acc )
-    {
-        return Allocate<
-            T_uniqueId,
-            T_Type
-        >::get( acc );
-    }
+            /** allocate shared memory
+             *
+             * shared memory is always uninitialized
+             *
+             * @tparam T_uniqueId unique id for this object
+             *          (is needed if more than one instance of shared memory in one kernel is used)
+             * @tparam T_Type type of the stored object
+             * @param acc accelerator object forwarded to Allocate<T_uniqueId, T_Type>::get()
+             * @return reference to shared memory
+             * @{
+             */
+            template<uint32_t T_uniqueId, typename T_Type, typename T_Acc>
+            DINLINE T_Type& allocate(T_Acc const& acc)
+            {
+                return Allocate<T_uniqueId, T_Type>::get(acc);
+            }
 
-    /* @param instance of the type to store (is not to initialize the shared memory) */
-    template<
-        uint32_t T_uniqueId,
-        typename T_Type,
-        typename T_Acc
-    >
-    DINLINE T_Type&
-    allocate(
-        T_Acc const & acc,
-        T_Type const &
-    )
-    {
-        return Allocate<
-            T_uniqueId,
-            T_Type
-        >::get( );
-    }
-    /** @} */
+            /* the unnamed instance argument only carries T_Type; it does not initialize the shared memory */
+            template<uint32_t T_uniqueId, typename T_Type, typename T_Acc>
+            DINLINE T_Type& allocate(T_Acc const& acc, T_Type const&)
+            {
+                return Allocate<T_uniqueId, T_Type>::get(acc);
+            }
+            /** @} */
 
-} // namespace shared
-} // namespace memory
+        } // namespace shared
+    } // namespace memory
 } // namespace pmacc
 
 /** allocate shared memory
@@ -149,4 +122,4 @@ namespace shared
  * @param varName name of the variable
  * @param ... type of the variable
  */
-#define PMACC_SMEM( acc, varName, ... ) auto & varName = pmacc::memory::shared::allocate< __COUNTER__, __VA_ARGS__ >( acc )
+#define PMACC_SMEM(acc, varName, ...) auto& varName = pmacc::memory::shared::allocate<__COUNTER__, __VA_ARGS__>(acc)
diff --git a/include/pmacc/meta/AllCombinations.hpp b/include/pmacc/meta/AllCombinations.hpp
index 68879715a9..21399063c0 100644
--- a/include/pmacc/meta/AllCombinations.hpp
+++ b/include/pmacc/meta/AllCombinations.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2014-2020 Rene Widera, Benjamin Worpitz
+/* Copyright 2014-2021 Rene Widera, Benjamin Worpitz
  *
  * This file is part of PMacc.
  *
@@ -38,148 +38,135 @@
 
 namespace pmacc
 {
-namespace bmpl = boost::mpl;
+    namespace bmpl = boost::mpl;
 
-namespace detail
-{
-/** Create tuples out of the elements of N sequences
- *
- * Combines all elements of N given sequences in T_MplSeq into N-tuples.
- * If the number of elements in each sequence is S0, S1, ... S(N-1)
- * than the resulting sequence will contain S0 * S1 * ... S(N-1) tuples.
- *
- * @tparam T_MplSeq sequence of input sequences
- * @tparam T_TmpResult temporary result
- * @tparam T_isEmpty true if T_MplSeq is empty else false
- */
-template<typename T_MplSeq,
-typename T_TmpResult = bmpl::vector0<>,
-bool T_isEmpty = bmpl::empty<T_MplSeq>::value
->
-struct AllCombinations;
-
-/** implementation for inner recursive creation
- */
-template<typename T_MplSeq, typename T_TmpResult>
-struct AllCombinations<T_MplSeq, T_TmpResult, false >
-{
-    typedef T_MplSeq MplSeq;
-    typedef T_TmpResult TmpResult;
-
-    static constexpr uint32_t rangeVectorSize = bmpl::size<MplSeq>::value;
-    typedef typename bmpl::at<MplSeq, bmpl::integral_c<uint32_t, rangeVectorSize - 1 > > ::type LastElement;
-    typedef bmpl::empty<LastElement> IsLastElementEmpty;
-    typedef typename MakeSeq<LastElement>::type LastElementAsSequence;
-    typedef typename bmpl::pop_back<MplSeq>::type ShrinkedRangeVector;
-
-    /* copy last given sequence to a mpl::vector to be sure that we can later on
-     * call mpl::transform even if the input sequence is mpl::range_c
+    namespace detail
+    {
+        /** Create tuples out of the elements of N sequences
+         *
+         * Combines all elements of N given sequences in T_MplSeq into N-tuples.
+         * If the number of elements in each sequence is S0, S1, ... S(N-1)
+         * then the resulting sequence will contain S0 * S1 * ... S(N-1) tuples.
+         *
+         * @tparam T_MplSeq sequence of input sequences
+         * @tparam T_TmpResult temporary result
+         * @tparam T_isEmpty true if T_MplSeq is empty else false
+         */
+        template<
+            typename T_MplSeq,
+            typename T_TmpResult = bmpl::vector0<>,
+            bool T_isEmpty = bmpl::empty<T_MplSeq>::value>
+        struct AllCombinations;
+
+        /** implementation for inner recursive creation
+         */
+        template<typename T_MplSeq, typename T_TmpResult>
+        struct AllCombinations<T_MplSeq, T_TmpResult, false>
+        {
+            typedef T_MplSeq MplSeq;
+            typedef T_TmpResult TmpResult;
+
+            static constexpr uint32_t rangeVectorSize = bmpl::size<MplSeq>::value;
+            typedef typename bmpl::at<MplSeq, bmpl::integral_c<uint32_t, rangeVectorSize - 1>>::type LastElement;
+            typedef bmpl::empty<LastElement> IsLastElementEmpty;
+            typedef typename MakeSeq<LastElement>::type LastElementAsSequence;
+            typedef typename bmpl::pop_back<MplSeq>::type ShrinkedRangeVector;
+
+            /* copy last given sequence to a mpl::vector to be sure that we can later on
+             * call mpl::transform even if the input sequence is mpl::range_c
+             */
+            typedef typename bmpl::copy<LastElementAsSequence, bmpl::back_inserter<bmpl::vector0<>>>::type TmpVector;
+
+            /** Assign to each element in a sequence of CT::Vector(s) a type at a given
+             *  component position
+             *
+             * @tparam T_ComponentPos position of the component to be changed (type must be
+             * bmpl::integral_c<uint32_t,X>)
+             * @tparam T_Element value (type) which should replace the component at position T_ComponentPos
+             *                   in the CT::Vector elements
+             */
+            template<typename T_ComponentPos, typename T_Element>
+            struct AssignToAnyElementInVector
+            {
+                typedef TmpResult InVector;
+                typedef T_Element Element;
+
+                typedef typename bmpl::
+                    transform<InVector, pmacc::math::CT::Assign<bmpl::_1, T_ComponentPos, Element>>::type type;
+            };
+
+            typedef typename bmpl::transform<
+                TmpVector,
+                AssignToAnyElementInVector<bmpl::integral_c<uint32_t, rangeVectorSize - 1>, bmpl::_1>>::type NestedSeq;
+
+            typedef typename MakeSeqFromNestedSeq<NestedSeq>::type OneSeq;
+
+            typedef typename detail::AllCombinations<ShrinkedRangeVector, OneSeq>::type ResultIfNotEmpty;
+            typedef typename bmpl::if_<IsLastElementEmpty, bmpl::vector0<>, ResultIfNotEmpty>::type type;
+        };
+
+        /** recursive end implementation
+         */
+        template<typename T_MplSeq, typename T_TmpResult>
+        struct AllCombinations<T_MplSeq, T_TmpResult, true>
+        {
+            typedef T_TmpResult type;
+        };
+
+    } // namespace detail
+
+
+    /** Create tuples out of the elements of N sequences
+     *
+     * Combines all elements of N given sequences in T_MplSeq into N-tuples.
+     * If the number of elements in each sequence is S0, S1, ... S(N-1)
+     * then the resulting sequence will contain S0 * S1 * ... S(N-1) tuples.
+     *
+     * example:
+     *
+     * sequence  == [ ]
+     * tuple     == ( )
+     *
+     * T_MplSeq = [[1,2],[1],[4,3]]
+     * combined to
+     * AllCombinations<T_MplSeq>::type = [(1,1,4),(1,1,3),(2,1,4),(2,1,3)]
+     *
+     * @tparam T_MplSeq N-dimensional sequence with input values
+     *                  or single type (e.g. `bmpl::integral_c<uint32_t,5>`)
+     *                  (if `T_MplSeq` is only one type it will be transformed to a sequence)
+     * @typedef AllCombinations<T_MplSeq>::type
+     *          MplSequence of N-tuples
      */
-    typedef typename bmpl::copy<LastElementAsSequence, bmpl::back_inserter< bmpl::vector0<> > >::type TmpVector;
-
-    /** Assign to each element in a sequence of CT::Vector(s) a type at a given
-    *  component position
-    *
-    * @tparam T_ComponentPos position of the component to be changed (type must be bmpl::integral_c<uint32_t,X>)
-    * @tparam T_Element value (type) which should replace the component at position T_Component
-    *                   in the CT::Vector elements
-    */
-    template<
-        typename T_ComponentPos,
-        typename T_Element
-    >
-    struct AssignToAnyElementInVector
+    template<typename T_MplSeq>
+    struct AllCombinations
     {
-        typedef TmpResult InVector;
-        typedef T_Element Element;
-
-        typedef typename bmpl::transform<
-                InVector,
-                pmacc::math::CT::Assign<
-                    bmpl::_1,
-                    T_ComponentPos,
-                    Element
-                >
-            >::type type;
-    };
-
-    typedef typename bmpl::transform<
-        TmpVector,
-        AssignToAnyElementInVector<
-            bmpl::integral_c<uint32_t, rangeVectorSize - 1 >,
-            bmpl::_1
-        >
-    >::type NestedSeq;
+        /* if T_MplSeq is no sequence it is a single type, we put this type in
+         * a sequence because all next algorithms can only work with sequences */
+        typedef typename MakeSeq<T_MplSeq>::type MplSeq;
 
-    typedef typename MakeSeqFromNestedSeq<NestedSeq>::type OneSeq;
+        static constexpr uint32_t rangeVectorSize = bmpl::size<MplSeq>::value;
+        typedef typename bmpl::at<MplSeq, bmpl::integral_c<uint32_t, rangeVectorSize - 1>>::type LastElement;
+        typedef bmpl::empty<LastElement> IsLastElementEmpty;
+        typedef typename MakeSeq<LastElement>::type LastElementAsSequence;
 
-    typedef typename detail::AllCombinations<ShrinkedRangeVector, OneSeq>::type ResultIfNotEmpty;
-    typedef typename bmpl::if_<IsLastElementEmpty,bmpl::vector0<>,ResultIfNotEmpty>::type type;
-};
+        typedef typename bmpl::pop_back<MplSeq>::type ShrinkedRangeVector;
+        /* copy last given sequence to a mpl::vector to be sure that we can later on
+         * call mpl::transform even if the input sequence is mpl::range_c
+         */
+        typedef typename bmpl::copy<LastElementAsSequence, bmpl::back_inserter<bmpl::vector0<>>>::type TmpVector;
 
-/** recursive end implementation
- */
-template<typename T_MplSeq, typename T_TmpResult>
-struct AllCombinations<T_MplSeq, T_TmpResult, true >
-{
-    typedef T_TmpResult type;
-};
 
-} //detail
-
-
-/** Create tuples out of the elements of N sequences
- *
- * Combines all elements of N given sequences in T_MplSeq into N-tuples.
- * If the number of elements in each sequence is S0, S1, ... S(N-1)
- * than the resulting sequence will contain S0 * S1 * ... S(N-1) tuples.
- *
- * example:
- *
- * sequence  == [ ]
- * tuple     == ( )
- *
- * T_MplSeq = [[1,2],[1],[4,3]]
- * combined to
- * AllCombinations<T_MplSeq>::type = [(1,1,4),(1,1,3),(2,1,4),(2,1,3)]
- *
- * @tparam T_MplSeq N-dimensional sequence with input values
- *                  or single type (e.g. `bmpl::integral_c<uint32_t,5>`)
- *                  (if `T_MplSeq` is only one type it will be transformed to a sequence)
- * @typedef AllCombinations<T_MplSeq>::type
- *          MplSequence of N-tuples
- */
-template<typename T_MplSeq>
-struct AllCombinations
-{
-    /* if T_MplSeq is no sequence it is a single type, we put this type in
-     * a sequence because all next algorithms can only work with sequences */
-    typedef typename MakeSeq<T_MplSeq>::type MplSeq;
-
-    static constexpr uint32_t rangeVectorSize = bmpl::size<MplSeq>::value;
-    typedef typename bmpl::at<MplSeq, bmpl::integral_c<uint32_t, rangeVectorSize - 1 > > ::type LastElement;
-    typedef bmpl::empty<LastElement> IsLastElementEmpty;
-    typedef typename MakeSeq<LastElement>::type LastElementAsSequence;
-
-    typedef typename bmpl::pop_back<MplSeq>::type ShrinkedRangeVector;
-    /* copy last given sequence to a mpl::vector to be sure that we can later on
-     * call mpl::transform even if the input sequence is mpl::range_c
-     */
-    typedef typename bmpl::copy<LastElementAsSequence, bmpl::back_inserter< bmpl::vector0<> > >::type TmpVector;
-
-
-
-    /* transform all elements in the vector to math::CT::vector<> */
-    typedef math::CT::Vector<> EmptyVector;
-    typedef typename bmpl::transform<
-    TmpVector,
-    pmacc::math::CT::Assign<EmptyVector, bmpl::integral_c<uint32_t, rangeVectorSize - 1 >, bmpl::_1>
-    >::type FirstList;
+        /* transform all elements in the vector to math::CT::vector<> */
+        typedef math::CT::Vector<> EmptyVector;
+        typedef typename bmpl::transform<
+            TmpVector,
+            pmacc::math::CT::Assign<EmptyVector, bmpl::integral_c<uint32_t, rangeVectorSize - 1>, bmpl::_1>>::type
+            FirstList;
 
-    /* result type: MplSequence of N-tuples */
-    typedef typename detail::AllCombinations<ShrinkedRangeVector, FirstList>::type ResultIfNotEmpty;
-    typedef typename bmpl::if_<IsLastElementEmpty,bmpl::vector0<>,ResultIfNotEmpty>::type type;
-};
+        /* result type: MplSequence of N-tuples */
+        typedef typename detail::AllCombinations<ShrinkedRangeVector, FirstList>::type ResultIfNotEmpty;
+        typedef typename bmpl::if_<IsLastElementEmpty, bmpl::vector0<>, ResultIfNotEmpty>::type type;
+    };
 
 
-}//namespace pmacc
+} // namespace pmacc
diff --git a/include/pmacc/meta/ForEach.hpp b/include/pmacc/meta/ForEach.hpp
index 72a6468301..2236ab3a4a 100644
--- a/include/pmacc/meta/ForEach.hpp
+++ b/include/pmacc/meta/ForEach.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Heiko Burau, Rene Widera, Benjamin Worpitz
+/* Copyright 2013-2021 Axel Huebl, Heiko Burau, Rene Widera, Benjamin Worpitz
  *
  * This file is part of PMacc.
  *
@@ -34,158 +34,114 @@
 
 namespace pmacc
 {
-namespace meta
-{
-namespace detail
-{
-    /** call the functor were itBegin points to
-     *
-     *  \tparam itBegin iterator to an element in a mpl sequence
-     *  \tparam itEnd iterator to the end of a mpl sequence
-     *  \tparam isEnd true if itBegin == itEnd, else false
-     */
-    template<
-        typename itBegin,
-        typename itEnd,
-        bool isEnd = boost::is_same<
-            itBegin,
-            itEnd
-        >::value
-    >
-    struct CallFunctorOfIterator
+    namespace meta
     {
-        typedef typename boost::mpl::next< itBegin >::type nextIt;
-        typedef typename boost::mpl::deref< itBegin >::type Functor;
-        typedef CallFunctorOfIterator<
-            nextIt,
-            itEnd
-        > NextCall;
-
-        PMACC_NO_NVCC_HDWARNING
-        template< typename ... T_Types >
-        HDINLINE void
-        operator( )( T_Types && ... ts ) const
+        namespace detail
         {
-            Functor( )( std::forward< T_Types >( ts ) ... );
-            NextCall( )( ts ... );
-        }
-
-        PMACC_NO_NVCC_HDWARNING
-        template< typename... T_Types >
-        HDINLINE void
-        operator( )( T_Types && ... ts )
-        {
-            Functor( )( std::forward< T_Types >( ts ) ... );
-            NextCall( )( ts ... );
-        }
-    };
-
-    /** Recursion end of ForEach */
-    template<
-    typename itBegin,
-    typename itEnd>
-    struct CallFunctorOfIterator<
-        itBegin,
-        itEnd,
-        true
-    >
-    {
-        PMACC_NO_NVCC_HDWARNING
-        template< typename ... T_Types >
-        HDINLINE void
-        operator()( T_Types && ... ) const
-        {
-
-        }
-
-        PMACC_NO_NVCC_HDWARNING
-        template< typename ... T_Types >
-        HDINLINE void
-        operator()( T_Types && ... )
+            /** call the functor where itBegin points to
+             *
+             *  \tparam itBegin iterator to an element in a mpl sequence
+             *  \tparam itEnd iterator to the end of a mpl sequence
+             *  \tparam isEnd true if itBegin == itEnd, else false
+             */
+            template<typename itBegin, typename itEnd, bool isEnd = boost::is_same<itBegin, itEnd>::value>
+            struct CallFunctorOfIterator
+            {
+                typedef typename boost::mpl::next<itBegin>::type nextIt;
+                typedef typename boost::mpl::deref<itBegin>::type Functor;
+                typedef CallFunctorOfIterator<nextIt, itEnd> NextCall;
+
+                PMACC_NO_NVCC_HDWARNING
+                template<typename... T_Types>
+                HDINLINE void operator()(T_Types&&... ts) const
+                {
+                    Functor()(std::forward<T_Types>(ts)...);
+                    NextCall()(ts...);
+                }
+
+                PMACC_NO_NVCC_HDWARNING
+                template<typename... T_Types>
+                HDINLINE void operator()(T_Types&&... ts)
+                {
+                    Functor()(std::forward<T_Types>(ts)...);
+                    NextCall()(ts...);
+                }
+            };
+
+            /** Recursion end of ForEach */
+            template<typename itBegin, typename itEnd>
+            struct CallFunctorOfIterator<itBegin, itEnd, true>
+            {
+                PMACC_NO_NVCC_HDWARNING
+                template<typename... T_Types>
+                HDINLINE void operator()(T_Types&&...) const
+                {
+                }
+
+                PMACC_NO_NVCC_HDWARNING
+                template<typename... T_Types>
+                HDINLINE void operator()(T_Types&&...)
+                {
+                }
+            };
+
+        } // namespace detail
+
+        /** Compile-Time for each for Boost::MPL Type Lists
+         *
+         *  \tparam T_MPLSeq A mpl sequence that can be accessed by mpl::begin, mpl::end, mpl::next
+         *  \tparam T_Functor An unary lambda functor with a HDINLINE void operator()(...) method
+         *          _1 is substituted by Accessor's result using boost::mpl::apply with elements from T_MPLSeq.
+         *          The maximum number of parameters for the operator() is limited by
+         *          PMACC_MAX_FUNCTOR_OPERATOR_PARAMS
+         *  \tparam T_Accessor An unary lambda operation
+         *
+         * Example:
+         *      MPLSeq = boost::mpl::vector<int,float>
+         *      Functor = any unary lambda functor
+         *      Accessor = lambda operation identity
+         *
+         *      definition: F(X) means boost::mpl::apply<F,X>
+         *
+         *      call:   ForEach<MPLSeq,Functor,Accessor>()(42);
+         *      unrolled code: Functor(Accessor(int))(42);
+         *                     Functor(Accessor(float))(42);
+         */
+        template<typename T_MPLSeq, typename T_Functor, typename T_Accessor = meta::accessors::Identity<>>
+        struct ForEach
         {
+            template<typename X>
+            struct ReplacePlaceholder : bmpl::apply1<T_Functor, typename bmpl::apply1<T_Accessor, X>::type>
+            {
+            };
 
-        }
-    };
-
-} // namespace detail
-
-    /** Compile-Time for each for Boost::MPL Type Lists
-     *
-     *  \tparam T_MPLSeq A mpl sequence that can be accessed by mpl::begin, mpl::end, mpl::next
-     *  \tparam T_Functor An unary lambda functor with a HDINLINE void operator()(...) method
-     *          _1 is substituted by Accessor's result using boost::mpl::apply with elements from T_MPLSeq.
-     *          The maximum number of parameters for the operator() is limited by
-     *          PMACC_MAX_FUNCTOR_OPERATOR_PARAMS
-     *  \tparam T_Accessor An unary lambda operation
-     *
-     * Example:
-     *      MPLSeq = boost::mpl::vector<int,float>
-     *      Functor = any unary lambda functor
-     *      Accessor = lambda operation identity
-     *
-     *      definition: F(X) means boost::apply<F,X>
-     *
-     *      call:   ForEach<MPLSeq,Functor,Accessor>()(42);
-     *      unrolled code: Functor(Accessor(int))(42);
-     *                     Functor(Accessor(float))(42);
-     */
-    template<
-        typename T_MPLSeq,
-        typename T_Functor,
-        typename T_Accessor = meta::accessors::Identity< >
-    >
-    struct ForEach
-    {
-
-        template< typename X >
-        struct ReplacePlaceholder : bmpl::apply1<
-            T_Functor,
-            typename bmpl::apply1<
-                T_Accessor,
-                X
-            >::type
-        >
-        {
-        };
+            typedef typename bmpl::transform<T_MPLSeq, ReplacePlaceholder<bmpl::_1>>::type SolvedFunctors;
 
-        typedef typename bmpl::transform<
-            T_MPLSeq,
-            ReplacePlaceholder< bmpl::_1 >
-        >::type SolvedFunctors;
+            typedef typename boost::mpl::begin<SolvedFunctors>::type begin;
+            typedef typename boost::mpl::end<SolvedFunctors>::type end;
 
-        typedef typename boost::mpl::begin< SolvedFunctors >::type begin;
-        typedef typename boost::mpl::end< SolvedFunctors >::type end;
 
+            typedef detail::CallFunctorOfIterator<begin, end> NextCall;
 
-        typedef detail::CallFunctorOfIterator<
-            begin,
-            end
-        > NextCall;
+            /* this functor does nothing */
+            typedef detail::CallFunctorOfIterator<end, end> Functor;
 
-        /* this functor does nothing */
-        typedef detail::CallFunctorOfIterator<
-            end,
-            end
-        > Functor;
+            PMACC_NO_NVCC_HDWARNING
+            template<typename... T_Types>
+            HDINLINE void operator()(T_Types&&... ts) const
+            {
+                Functor()(std::forward<T_Types>(ts)...);
+                NextCall()(ts...);
+            }
 
-        PMACC_NO_NVCC_HDWARNING
-        template< typename ... T_Types >
-        HDINLINE void
-        operator( )( T_Types && ... ts ) const
-        {
-            Functor()( std::forward< T_Types >( ts ) ... );
-            NextCall()( ts ... );
-        }
-
-        PMACC_NO_NVCC_HDWARNING
-        template< typename ... T_Types >
-        HDINLINE void
-        operator( )( T_Types && ... ts )
-        {
-            Functor( )( std::forward< T_Types >( ts ) ... );
-            NextCall( )( ts ... );
-        }
-    };
+            PMACC_NO_NVCC_HDWARNING
+            template<typename... T_Types>
+            HDINLINE void operator()(T_Types&&... ts)
+            {
+                Functor()(std::forward<T_Types>(ts)...);
+                NextCall()(ts...);
+            }
+        };
 
-} // namespace meta
+    } // namespace meta
 } // namespace pmacc
diff --git a/include/pmacc/meta/GetKeyFromAlias.hpp b/include/pmacc/meta/GetKeyFromAlias.hpp
index 1a9c5341d0..af52d63867 100644
--- a/include/pmacc/meta/GetKeyFromAlias.hpp
+++ b/include/pmacc/meta/GetKeyFromAlias.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Rene Widera, Benjamin Worpitz, Alexander Grund
+/* Copyright 2013-2021 Rene Widera, Benjamin Worpitz, Alexander Grund
  *
  * This file is part of PMacc.
  *
@@ -33,44 +33,38 @@
 
 namespace pmacc
 {
-
-/**
- * Returns the key type from an alias
- *
- * \tparam T_MPLSeq Sequence of keys to search
- * \tparam T_Key Key or alias of a key in the sequence
- * \tparam T_KeyNotFoundPolicy Binary meta-function that is called like (T_MPLSeq, T_Key)
- *         when T_Key is not found in the sequence. Default is to return bmpl::void_
- */
-template<typename T_MPLSeq,
-         typename T_Key,
-         typename T_KeyNotFoundPolicy = errorHandlerPolicies::ReturnType<>
->
-struct GetKeyFromAlias
-{
-private:
-    typedef T_KeyNotFoundPolicy KeyNotFoundPolicy;
-    /*create a map where Key is a undeclared alias and value is real type*/
-    typedef typename SeqToMap<T_MPLSeq, TypeToAliasPair<bmpl::_1> >::type AliasMap;
-    /*create a map where Key and value is real type*/
-    typedef typename SeqToMap<T_MPLSeq, TypeToPair<bmpl::_1> >::type KeyMap;
-    /*combine both maps*/
-    typedef bmpl::inserter< KeyMap, bmpl::insert<bmpl::_1, bmpl::_2> > Map_inserter;
-    typedef typename bmpl::copy<
-        AliasMap,
-        Map_inserter
-        >::type FullMap;
-    /* search for given key,
-     * - we get the real type if key found
-     * - else we get boost::mpl::void_
+    /**
+     * Returns the key type from an alias
+     *
+     * \tparam T_MPLSeq Sequence of keys to search
+     * \tparam T_Key Key or alias of a key in the sequence
+     * \tparam T_KeyNotFoundPolicy Binary meta-function that is called like (T_MPLSeq, T_Key)
+     *         when T_Key is not found in the sequence. Default is to return bmpl::void_
      */
-    typedef typename bmpl::at<FullMap, T_Key>::type MapType;
-public:
-    /* Check for KeyNotFound and calculate final type. (Uses lazy evaluation) */
-    typedef typename bmpl::if_<
-        boost::is_same<MapType, bmpl::void_>,
-        bmpl::apply<KeyNotFoundPolicy, T_MPLSeq, T_Key>,
-        bmpl::identity<MapType> >::type::type type;
-};
+    template<typename T_MPLSeq, typename T_Key, typename T_KeyNotFoundPolicy = errorHandlerPolicies::ReturnType<>>
+    struct GetKeyFromAlias
+    {
+    private:
+        typedef T_KeyNotFoundPolicy KeyNotFoundPolicy;
+        /*create a map where Key is an undeclared alias and value is the real type*/
+        typedef typename SeqToMap<T_MPLSeq, TypeToAliasPair<bmpl::_1>>::type AliasMap;
+        /*create a map where both Key and value are the real type*/
+        typedef typename SeqToMap<T_MPLSeq, TypeToPair<bmpl::_1>>::type KeyMap;
+        /*combine both maps*/
+        typedef bmpl::inserter<KeyMap, bmpl::insert<bmpl::_1, bmpl::_2>> Map_inserter;
+        typedef typename bmpl::copy<AliasMap, Map_inserter>::type FullMap;
+        /* search for given key,
+         * - we get the real type if key found
+         * - else we get boost::mpl::void_
+         */
+        typedef typename bmpl::at<FullMap, T_Key>::type MapType;
+
+    public:
+        /* Check for KeyNotFound and calculate final type. (Uses lazy evaluation) */
+        typedef typename bmpl::if_<
+            boost::is_same<MapType, bmpl::void_>,
+            bmpl::apply<KeyNotFoundPolicy, T_MPLSeq, T_Key>,
+            bmpl::identity<MapType>>::type::type type;
+    };
 
-}//namespace pmacc
+} // namespace pmacc
diff --git a/include/pmacc/meta/String.hpp b/include/pmacc/meta/String.hpp
index f0f285e7c8..a70f8f9714 100644
--- a/include/pmacc/meta/String.hpp
+++ b/include/pmacc/meta/String.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2018-2020 Rene Widera
+/* Copyright 2018-2021 Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -26,92 +26,72 @@
 
 namespace pmacc
 {
-namespace meta
-{
-    /** get character of an C-string
-     *
-     * @tparam T_len length of the string
-     *
-     * @param cstr input string
-     * @param idx index of the character
-     * @return if x < T_len character at index idx, else '0'
-     */
-    template<
-        int T_len
-    >
-    constexpr auto
-    elem_at(
-        char const ( & cstr )[ T_len ],
-        size_t const idx
-    )
-    -> char
-    {
-        return idx < T_len ? cstr[ idx ] : 0;
-    }
-
-    /** compile time string
-     *
-     * The size of the instance is 1 byte.
-     */
-    template< char ... T_c >
-    struct String
+    namespace meta
     {
-        /** get stored string */
-        static auto
-        str()
-        -> std::string
+        /** get a character of a C-string
+         *
+         * @tparam T_len length of the string
+         *
+         * @param cstr input string
+         * @param idx index of the character
+         * @return character at index idx if idx < T_len, else 0 (the NUL character)
+         */
+        template<int T_len>
+        constexpr auto elem_at(char const (&cstr)[T_len], size_t const idx) -> char
         {
-            return std::string(
-                std::array<
-                    char,
-                    sizeof...( T_c ) + 1
-                >( {
-                    T_c ...,
-                    // at terminal zero to support empty strings
-                    0
-                } ).data( )
-            );
+            return idx < T_len ? cstr[idx] : 0;
         }
-    };
+
+        /** compile time string
+         *
+         * The size of the instance is 1 byte.
+         */
+        template<char... T_c>
+        struct String
+        {
+            /** get stored string */
+            static auto str() -> std::string
+            {
+                return std::string(std::array<char, sizeof...(T_c) + 1>({T_c...,
+                                                                         // at terminal zero to support empty strings
+                                                                         0})
+                                       .data());
+            }
+        };
 
 
-#define PMACC_CHAR_AT_N(z, n, name ) pmacc::meta::elem_at< sizeof(name) >( name, n ),
+#define PMACC_CHAR_AT_N(z, n, name) pmacc::meta::elem_at<sizeof(name)>(name, n),
 
-/** create a compile time string type
- *
- * Support strings with up to 64 characters.
- * Longer strings are cropped to 64 characters.
- *
- * usage example:
- * @code{.cpp}
- * // create an instance of the compile time string
- * auto particleName = PMACC_CSTRING( "electrons" ){};
- * // create a C++ type (can be used as template parameter)
- * using Electrons = PMACC_CSTRING( "electrons" );
- * @endcode
- */
+        /** create a compile time string type
+         *
+         * Support strings with up to 64 characters.
+         * Longer strings are cropped to 64 characters.
+         *
+         * usage example:
+         * @code{.cpp}
+         * // create an instance of the compile time string
+         * auto particleName = PMACC_CSTRING( "electrons" ){};
+         * // create a C++ type (can be used as template parameter)
+         * using Electrons = PMACC_CSTRING( "electrons" );
+         * @endcode
+         */
 
-#define PMACC_CSTRING( str )                                                   \
-    /* // PMACC_CSTRING("example") is transformed in                           \
-     * pmacc::meta::String<                                             \
-     *     pmacc::meta::elem_at< sizeof("example") >( sizeof("example", 0 ), \
-     *     pmacc::meta::elem_at< sizeof("example") >( sizeof("example", 1 ), \
-     *     ...                                                                 \
-     *     pmacc::meta::elem_at< sizeof("example") >( sizeof("example", 63 ), \
-     *     0                                                                   \
-     * >                                                                       \
-     */                                                                        \
-    pmacc::meta::String<                                                \
-        BOOST_PP_REPEAT_FROM_TO(                                               \
-            0,                                                                 \
-            /* support up to 64 charactres */                                  \
-            64,                                                                \
-            PMACC_CHAR_AT_N,                                                   \
-            str                                                                \
-        )                                                                      \
-        /* add a end zero because PMACC_CHAR_AT_N end with a comma */          \
-        0                                                                      \
-    >
+#define PMACC_CSTRING(str)                                                                                            \
+    /* // PMACC_CSTRING("example") is transformed into                                                                \
+     * pmacc::meta::String<                                                                                           \
+     *     pmacc::meta::elem_at< sizeof("example") >( "example", 0 ),                                                 \
+     *     pmacc::meta::elem_at< sizeof("example") >( "example", 1 ),                                                 \
+     *     ...                                                                                                        \
+     *     pmacc::meta::elem_at< sizeof("example") >( "example", 63 ),                                                \
+     *     0                                                                                                          \
+     * >                                                                                                              \
+     */                                                                                                               \
+    pmacc::meta::String<BOOST_PP_REPEAT_FROM_TO(                                                                      \
+        0, /* support up to 64 characters */                                                                          \
+        64,                                                                                                           \
+        PMACC_CHAR_AT_N,                                                                                              \
+        str) /* add an end zero because PMACC_CHAR_AT_N ends with a comma */                                          \
+                        0>
 
-} // namespace meta
+    } // namespace meta
 } // namespace pmacc
diff --git a/include/pmacc/meta/accessors/First.hpp b/include/pmacc/meta/accessors/First.hpp
index de971482cb..2cf253999c 100644
--- a/include/pmacc/meta/accessors/First.hpp
+++ b/include/pmacc/meta/accessors/First.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Rene Widera
+/* Copyright 2013-2021 Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -26,26 +26,24 @@
 
 namespace pmacc
 {
-namespace meta
-{
-
-namespace accessors
-{
-
-/** Get first type of the given type
- *
- * \tparam T type from which we return the first held type
- *
- * T must have defined ::first
- */
-template<typename T>
-struct First
-{
-    typedef typename T::first type;
-};
-
-}//namespace accessors
-
-}//namespace meta
-
-}//namespace  pmacc
+    namespace meta
+    {
+        namespace accessors
+        {
+            /** Get first type of the given type
+             *
+             * \tparam T type from which we return the first held type
+             *
+             * T must have defined ::first
+             */
+            template<typename T>
+            struct First
+            {
+                typedef typename T::first type;
+            };
+
+        } // namespace accessors
+
+    } // namespace meta
+
+} // namespace  pmacc
diff --git a/include/pmacc/meta/accessors/Identity.hpp b/include/pmacc/meta/accessors/Identity.hpp
index 12dbbbcee9..f7c1f6c64c 100644
--- a/include/pmacc/meta/accessors/Identity.hpp
+++ b/include/pmacc/meta/accessors/Identity.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2014-2020 Rene Widera
+/* Copyright 2014-2021 Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -27,25 +27,22 @@
 
 namespace pmacc
 {
-namespace meta
-{
-
-namespace accessors
-{
-
-/** Get the type of a given type without changes
- *
- * \tparam T in type
- *
- */
-template<typename T=bmpl::_1>
-struct Identity : bmpl::identity<T>
-{
-
-};
-
-}//namespace accessors
-
-}//namespace meta
-
-}//namespace  pmacc
+    namespace meta
+    {
+        namespace accessors
+        {
+            /** Get the type of a given type without changes
+             *
+             * \tparam T in type
+             *
+             */
+            template<typename T = bmpl::_1>
+            struct Identity : bmpl::identity<T>
+            {
+            };
+
+        } // namespace accessors
+
+    } // namespace meta
+
+} // namespace  pmacc
diff --git a/include/pmacc/meta/accessors/Second.hpp b/include/pmacc/meta/accessors/Second.hpp
index 2b870f4a7a..fa2d1c8b93 100644
--- a/include/pmacc/meta/accessors/Second.hpp
+++ b/include/pmacc/meta/accessors/Second.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Rene Widera
+/* Copyright 2013-2021 Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -26,26 +26,24 @@
 
 namespace pmacc
 {
-namespace meta
-{
-
-namespace accessors
-{
-
-/** Get second type of the given type
- *
- * \tparam T type from which we return the second held type
- *
- * T must have defined ::second
- */
-template<typename T>
-struct Second
-{
-    typedef typename T::second type;
-};
-
-}//namespace accessors
-
-}//namespace meta
-
-}//namespace  pmacc
+    namespace meta
+    {
+        namespace accessors
+        {
+            /** Get second type of the given type
+             *
+             * \tparam T type from which we return the second held type
+             *
+             * T must have defined ::second
+             */
+            template<typename T>
+            struct Second
+            {
+                typedef typename T::second type;
+            };
+
+        } // namespace accessors
+
+    } // namespace meta
+
+} // namespace  pmacc
diff --git a/include/pmacc/meta/accessors/Type.hpp b/include/pmacc/meta/accessors/Type.hpp
index 0870ceb3f3..8a30460431 100644
--- a/include/pmacc/meta/accessors/Type.hpp
+++ b/include/pmacc/meta/accessors/Type.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2017-2020 Axel Huebl
+/* Copyright 2017-2021 Axel Huebl
  *
  * This file is part of PMacc.
  *
@@ -27,22 +27,22 @@
 
 namespace pmacc
 {
-namespace meta
-{
-namespace accessors
-{
-    /** Get ::type member of the given type
-     *
-     * @tparam T type from which we return the type held in ::type
-     *
-     * T must have defined ::type
-     */
-    template< typename T = bmpl::_1 >
-    struct Type
+    namespace meta
     {
-        using type = typename T::type;
-    };
+        namespace accessors
+        {
+            /** Get ::type member of the given type
+             *
+             * @tparam T type from which we return the type held in ::type
+             *
+             * T must have defined ::type
+             */
+            template<typename T = bmpl::_1>
+            struct Type
+            {
+                using type = typename T::type;
+            };
 
-} // namespace accessors
-} // namespace meta
+        } // namespace accessors
+    } // namespace meta
 } // namespace pmacc
diff --git a/include/pmacc/meta/conversion/JoinToSeq.hpp b/include/pmacc/meta/conversion/JoinToSeq.hpp
index bbe8bcdd19..fe9ad36fe0 100644
--- a/include/pmacc/meta/conversion/JoinToSeq.hpp
+++ b/include/pmacc/meta/conversion/JoinToSeq.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Rene Widera
+/* Copyright 2013-2021 Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -30,24 +30,21 @@
 
 namespace pmacc
 {
+    /** Join both input types to one boost mpl sequence
+     *
+     * @tparam T_1 a boost mpl sequence or single type
+     * @tparam T_2 a boost mpl sequence or single type
+     */
 
-/** Join both input types to one boost mpl sequence
- *
- * @tparam T_1 a boost mpl sequence or single type
- * @tparam T_2 a boost mpl sequence or single type
- */
+    template<typename T_1, typename T_2 = bmpl::vector0<>>
+    struct JoinToSeq
+    {
+    private:
+        typedef typename ToSeq<T_1>::type Seq1;
+        typedef typename ToSeq<T_2>::type Seq2;
 
-template<typename T_1, typename T_2 = bmpl::vector0<> >
-struct JoinToSeq
-{
-private:
-    typedef typename ToSeq<T_1 >::type Seq1;
-    typedef typename ToSeq<T_2 >::type Seq2;
-public:
-    typedef typename bmpl::copy<
-    Seq2,
-    bmpl::back_inserter< Seq1>
-    >::type type;
-};
+    public:
+        typedef typename bmpl::copy<Seq2, bmpl::back_inserter<Seq1>>::type type;
+    };
 
-} //namespace pmacc
+} // namespace pmacc
diff --git a/include/pmacc/meta/conversion/MakeSeq.hpp b/include/pmacc/meta/conversion/MakeSeq.hpp
index 9724fb5eaf..fd5b9c98b5 100644
--- a/include/pmacc/meta/conversion/MakeSeq.hpp
+++ b/include/pmacc/meta/conversion/MakeSeq.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Rene Widera
+/* Copyright 2013-2021 Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -28,30 +28,27 @@
 
 namespace pmacc
 {
-
-/** combine all input types to one sequence
- *
- * Note: if the input type is a sequence itself, its elements will be unfolded
- *       and added separately
- *
- * @tparam T_Args a boost mpl sequence or single type
- *
- * @code
- * using MyType = typename MakeSeq< A, B >::type
- * using MyType2 = typename MakeSeq< boost::mpl::vector<A, B>, C >::type
- * @endcode
- *
- */
-template< typename... T_Args >
-struct MakeSeq
-{
-    typedef typename MakeSeqFromNestedSeq<
-        bmpl::vector< T_Args... >
-    >::type type;
-};
-
-/** short hand definition for @see MakeSeq<> */
-template< typename... T_Args >
-using MakeSeq_t = typename MakeSeq< T_Args... >::type;
-
-} //namespace pmacc
+    /** combine all input types to one sequence
+     *
+     * Note: if the input type is a sequence itself, its elements will be unfolded
+     *       and added separately
+     *
+     * @tparam T_Args a boost mpl sequence or single type
+     *
+     * @code
+     * using MyType = typename MakeSeq< A, B >::type
+     * using MyType2 = typename MakeSeq< boost::mpl::vector<A, B>, C >::type
+     * @endcode
+     *
+     */
+    template<typename... T_Args>
+    struct MakeSeq
+    {
+        typedef typename MakeSeqFromNestedSeq<bmpl::vector<T_Args...>>::type type;
+    };
+
+    /** short hand definition for @see MakeSeq<> */
+    template<typename... T_Args>
+    using MakeSeq_t = typename MakeSeq<T_Args...>::type;
+
+} // namespace pmacc
diff --git a/include/pmacc/meta/conversion/MakeSeqFromNestedSeq.hpp b/include/pmacc/meta/conversion/MakeSeqFromNestedSeq.hpp
index 4bcc3431d3..2d15997fb7 100644
--- a/include/pmacc/meta/conversion/MakeSeqFromNestedSeq.hpp
+++ b/include/pmacc/meta/conversion/MakeSeqFromNestedSeq.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Rene Widera
+/* Copyright 2013-2021 Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -29,26 +29,21 @@
 
 namespace pmacc
 {
+    /** combine all elements of the input type to a single vector
+     *
+     * If elements of the input sequence are sequences themselves, all of their
+     * elements will be added to the resulting sequence
+     *
+     * @tparam T_In a boost mpl sequence or single type
+     */
+    template<typename T_In>
+    struct MakeSeqFromNestedSeq
+    {
+    private:
+        typedef typename ToSeq<T_In>::type Seq;
 
-/** combine all elements of the input type to a single vector
- *
- * If elements of the input sequence are a sequence themself, all of their
- * elements will be added to the resulting sequence
- *
- * @tparam T_In a boost mpl sequence or single type
- */
-template<typename T_In>
-struct MakeSeqFromNestedSeq
-{
-private:
-    typedef typename ToSeq<T_In >::type Seq;
-
-public:
-    typedef typename bmpl::fold<
-      Seq,
-      bmpl::vector0<>,
-      JoinToSeq<bmpl::_1,bmpl::_2>
-    >::type type;
-};
+    public:
+        typedef typename bmpl::fold<Seq, bmpl::vector0<>, JoinToSeq<bmpl::_1, bmpl::_2>>::type type;
+    };
 
-} //namespace pmacc
+} // namespace pmacc
diff --git a/include/pmacc/meta/conversion/OperateOnSeq.hpp b/include/pmacc/meta/conversion/OperateOnSeq.hpp
index 565d07564a..ccc7a308b7 100644
--- a/include/pmacc/meta/conversion/OperateOnSeq.hpp
+++ b/include/pmacc/meta/conversion/OperateOnSeq.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2015-2020 Rene Widera
+/* Copyright 2015-2021 Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -34,35 +34,26 @@
 
 namespace pmacc
 {
-
-/** run an unary operator on each element of a sequence
- *
- * @tparam T_MPLSeq any boost mpl sequence
- * @tparam T_UnaryOperator unary operator to translate type from the sequence
- * to a mpl pair
- * @tparam T_Accessor an unary lambda operator that is used before the type
- * from the sequence is passed to T_UnaryOperator
- * @return ::type bmpl::vector
- */
-template<typename T_MPLSeq,
-typename T_UnaryOperator,
-typename T_Accessor = meta::accessors::Identity<>
->
-struct OperateOnSeq
-{
-
-    template<typename X>
-    struct Op :bmpl::apply1<T_UnaryOperator, typename bmpl::apply1<T_Accessor,X>::type >
+    /** run an unary operator on each element of a sequence
+     *
+     * @tparam T_MPLSeq any boost mpl sequence
+     * @tparam T_UnaryOperator unary operator to translate type from the sequence
+     * to a mpl pair
+     * @tparam T_Accessor an unary lambda operator that is used before the type
+     * from the sequence is passed to T_UnaryOperator
+     * @return ::type bmpl::vector
+     */
+    template<typename T_MPLSeq, typename T_UnaryOperator, typename T_Accessor = meta::accessors::Identity<>>
+    struct OperateOnSeq
     {
+        template<typename X>
+        struct Op : bmpl::apply1<T_UnaryOperator, typename bmpl::apply1<T_Accessor, X>::type>
+        {
+        };
+
+        typedef T_MPLSeq MPLSeq;
+        typedef bmpl::back_inserter<bmpl::vector<>> Inserter;
+        typedef typename bmpl::transform<MPLSeq, Op<bmpl::_1>, Inserter>::type type;
     };
 
-    typedef T_MPLSeq MPLSeq;
-    typedef bmpl::back_inserter< bmpl::vector<> > Inserter;
-    typedef typename bmpl::transform<
-            MPLSeq,
-            Op<bmpl::_1>,
-            Inserter
-            >::type type;
-};
-
-}//namespace pmacc
+} // namespace pmacc
diff --git a/include/pmacc/meta/conversion/RemoveFromSeq.hpp b/include/pmacc/meta/conversion/RemoveFromSeq.hpp
index 8f63d88dc3..8f16641137 100644
--- a/include/pmacc/meta/conversion/RemoveFromSeq.hpp
+++ b/include/pmacc/meta/conversion/RemoveFromSeq.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Rene Widera
+/* Copyright 2013-2021 Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -30,25 +30,21 @@
 
 namespace pmacc
 {
-
-/* remove types from a sequence
- *
- * @tparam T_MPLSeqSrc source sequence from were we delete types
- * @tparam T_MPLSeqObjectsToRemove sequence with types which shuld be deleted
- */
-template<
-typename T_MPLSeqSrc,
-typename T_MPLSeqObjectsToRemove
->
-struct RemoveFromSeq
-{
-    template<typename T_Value>
-    struct hasId
+    /* remove types from a sequence
+     *
+     * @tparam T_MPLSeqSrc source sequence from which types are removed
+     * @tparam T_MPLSeqObjectsToRemove sequence with types which should be removed
+     */
+    template<typename T_MPLSeqSrc, typename T_MPLSeqObjectsToRemove>
+    struct RemoveFromSeq
     {
-        typedef bmpl::contains<T_MPLSeqObjectsToRemove,T_Value> type;
-    };
+        template<typename T_Value>
+        struct hasId
+        {
+            typedef bmpl::contains<T_MPLSeqObjectsToRemove, T_Value> type;
+        };
 
-    typedef typename bmpl::remove_if< T_MPLSeqSrc, hasId<bmpl::_> >::type type;
-};
+        typedef typename bmpl::remove_if<T_MPLSeqSrc, hasId<bmpl::_>>::type type;
+    };
 
-}//namespace pmacc
+} // namespace pmacc
diff --git a/include/pmacc/meta/conversion/ResolveAliases.hpp b/include/pmacc/meta/conversion/ResolveAliases.hpp
index 7586c218c4..3db857941b 100644
--- a/include/pmacc/meta/conversion/ResolveAliases.hpp
+++ b/include/pmacc/meta/conversion/ResolveAliases.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Rene Widera, Felix Schmitt, Alexander Grund
+/* Copyright 2013-2021 Rene Widera, Felix Schmitt, Alexander Grund
  *
  * This file is part of PMacc.
  *
@@ -32,37 +32,32 @@
 
 namespace pmacc
 {
-
-/** Translate all pmacc alias types to full specialized types
- *
- * Use lookup sequence to translate types
- * The policy is used if the type from T_MPLSeq is not in T_MPLSeqLookup a compile time error is triggered
- *
- * @tparam T_MPLSeq source sequence with types to translate
- * @tparam T_MPLSeqLookup lookup sequence to translate aliases
- */
-template<
-    typename T_MPLSeq,
-    typename T_MPLSeqLookup,
-    typename T_AliasNotFoundPolicy = errorHandlerPolicies::ThrowValueNotFound
->
-struct ResolveAliases
-{
-    typedef T_MPLSeq MPLSeq;
-    typedef T_MPLSeqLookup MPLSeqLookup;
-    typedef T_AliasNotFoundPolicy AliasNotFoundPolicy;
-    typedef bmpl::back_inserter< bmpl::vector<> > Inserter;
-
-    template<typename T_Identifier>
-    struct GetKeyFromAliasAccessor
+    /** Translate all pmacc alias types to full specialized types
+     *
+     * Use lookup sequence to translate types
+     * If a type from T_MPLSeq is not in T_MPLSeqLookup, T_AliasNotFoundPolicy is used (default: compile time error)
+     *
+     * @tparam T_MPLSeq source sequence with types to translate
+     * @tparam T_MPLSeqLookup lookup sequence to translate aliases
+     */
+    template<
+        typename T_MPLSeq,
+        typename T_MPLSeqLookup,
+        typename T_AliasNotFoundPolicy = errorHandlerPolicies::ThrowValueNotFound>
+    struct ResolveAliases
     {
-        typedef typename GetKeyFromAlias<MPLSeqLookup, T_Identifier, AliasNotFoundPolicy>::type type;
+        typedef T_MPLSeq MPLSeq;
+        typedef T_MPLSeqLookup MPLSeqLookup;
+        typedef T_AliasNotFoundPolicy AliasNotFoundPolicy;
+        typedef bmpl::back_inserter<bmpl::vector<>> Inserter;
+
+        template<typename T_Identifier>
+        struct GetKeyFromAliasAccessor
+        {
+            typedef typename GetKeyFromAlias<MPLSeqLookup, T_Identifier, AliasNotFoundPolicy>::type type;
+        };
+
+        typedef typename bmpl::transform<MPLSeq, GetKeyFromAliasAccessor<bmpl::_1>>::type type;
     };
 
-    typedef typename bmpl::transform<
-        MPLSeq,
-        GetKeyFromAliasAccessor<bmpl::_1>
-    >::type type;
-};
-
-}//namespace pmacc
+} // namespace pmacc
diff --git a/include/pmacc/meta/conversion/ResolveAndRemoveFromSeq.hpp b/include/pmacc/meta/conversion/ResolveAndRemoveFromSeq.hpp
index 92713a8c25..f181e18b6e 100644
--- a/include/pmacc/meta/conversion/ResolveAndRemoveFromSeq.hpp
+++ b/include/pmacc/meta/conversion/ResolveAndRemoveFromSeq.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2014-2020 Rene Widera, Alexander Grund
+/* Copyright 2014-2021 Rene Widera, Alexander Grund
  *
  * This file is part of PMacc.
  *
@@ -29,22 +29,19 @@
 
 namespace pmacc
 {
+    /** Resolve aliases and remove the resulting types from a sequence
+     *
+     * @tparam T_MPLSeqSrc source sequence from which types are removed
+     * @tparam T_MPLSeqObjectsToRemove sequence with types which should be removed (pmacc aliases are allowed)
+     */
+    template<typename T_MPLSeqSrc, typename T_MPLSeqObjectsToRemove>
+    struct ResolveAndRemoveFromSeq
+    {
+        typedef T_MPLSeqSrc MPLSeqSrc;
+        typedef T_MPLSeqObjectsToRemove MPLSeqObjectsToRemove;
+        typedef typename ResolveAliases<MPLSeqObjectsToRemove, MPLSeqSrc, errorHandlerPolicies::ReturnValue>::type
+            ResolvedSeqWithObjectsToRemove;
+        typedef typename RemoveFromSeq<MPLSeqSrc, ResolvedSeqWithObjectsToRemove>::type type;
+    };
 
-/** Resolve and remove types from a sequence
- *
- * @tparam T_MPLSeqSrc source sequence from were we delete types
- * @tparam T_MPLSeqObjectsToRemove sequence with types which should be deleted (pmacc aliases are allowed)
- */
-template<
-typename T_MPLSeqSrc,
-typename T_MPLSeqObjectsToRemove
->
-struct ResolveAndRemoveFromSeq
-{
-    typedef T_MPLSeqSrc MPLSeqSrc;
-    typedef T_MPLSeqObjectsToRemove MPLSeqObjectsToRemove;
-    typedef typename ResolveAliases<MPLSeqObjectsToRemove, MPLSeqSrc, errorHandlerPolicies::ReturnValue>::type ResolvedSeqWithObjectsToRemove;
-    typedef typename RemoveFromSeq<MPLSeqSrc, ResolvedSeqWithObjectsToRemove>::type type;
-};
-
-}//namespace pmacc
+} // namespace pmacc
diff --git a/include/pmacc/meta/conversion/SeqToMap.hpp b/include/pmacc/meta/conversion/SeqToMap.hpp
index a42f9fbd3c..80889b9cc5 100644
--- a/include/pmacc/meta/conversion/SeqToMap.hpp
+++ b/include/pmacc/meta/conversion/SeqToMap.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Rene Widera
+/* Copyright 2013-2021 Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -37,35 +37,26 @@
 
 namespace pmacc
 {
-
-/** convert boost mpl sequence to a mpl map
- *
- * @tparam T_MPLSeq any boost mpl sequence
- * @tparam T_UnaryOperator unary operator to translate type from the sequence
- * to a mpl pair
- * @tparam T_Accessor An unary lambda operator which is used before the type
- * from the sequence is passed to T_UnaryOperator
- * @return ::type mpl map
- */
-template<typename T_MPLSeq,
-typename T_UnaryOperator,
-typename T_Accessor = meta::accessors::Identity<>
->
-struct SeqToMap
-{
-
-    template<typename X>
-    struct Op :bmpl::apply1<T_UnaryOperator, typename bmpl::apply1<T_Accessor,X>::type >
+    /** convert a boost mpl sequence to a mpl map
+     *
+     * @tparam T_MPLSeq any boost mpl sequence
+     * @tparam T_UnaryOperator unary operator to translate a type from the sequence
+     * to a mpl pair
+     * @tparam T_Accessor a unary lambda operator which is applied to each type
+     * from the sequence before the result is passed to T_UnaryOperator
+     * @return ::type bmpl::map into which the result of T_UnaryOperator is inserted for each type
+     */
+    template<typename T_MPLSeq, typename T_UnaryOperator, typename T_Accessor = meta::accessors::Identity<>>
+    struct SeqToMap
     {
+        template<typename X>
+        struct Op : bmpl::apply1<T_UnaryOperator, typename bmpl::apply1<T_Accessor, X>::type>
+        {
+        };
+
+        typedef T_MPLSeq MPLSeq;
+        typedef bmpl::inserter<bmpl::map<>, bmpl::insert<bmpl::_1, bmpl::_2>> Map_inserter;
+        typedef typename bmpl::transform<MPLSeq, Op<bmpl::_1>, Map_inserter>::type type;
     };
 
-    typedef T_MPLSeq MPLSeq;
-    typedef bmpl::inserter< bmpl::map<>, bmpl::insert<bmpl::_1, bmpl::_2> > Map_inserter;
-    typedef typename bmpl::transform<
-            MPLSeq,
-            Op<bmpl::_1> ,
-            Map_inserter
-            >::type type;
-};
-
-}//namespace pmacc
+} // namespace pmacc
diff --git a/include/pmacc/meta/conversion/ToSeq.hpp b/include/pmacc/meta/conversion/ToSeq.hpp
index d40c324891..dbd397ce68 100644
--- a/include/pmacc/meta/conversion/ToSeq.hpp
+++ b/include/pmacc/meta/conversion/ToSeq.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Rene Widera
+/* Copyright 2013-2021 Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -29,15 +29,14 @@
 
 namespace pmacc
 {
+    /** cast type to boost mpl sequence
+     * @return ::type T_Type itself if T_Type is a sequence (bmpl::is_sequence),
+     *                else bmpl::vector1<T_Type>
+     */
+    template<typename T_Type>
+    struct ToSeq
+    {
+        typedef typename bmpl::if_<bmpl::is_sequence<T_Type>, T_Type, bmpl::vector1<T_Type>>::type type;
+    };
 
-/** cast type to boost mpl vector
- * @return ::type if T_Type is sequence then identity of T_Type
- *                else boost::mpl::vector<T_Type>
- */
-template<typename T_Type>
-struct ToSeq
-{
-    typedef typename bmpl::if_<bmpl::is_sequence< T_Type >,T_Type,bmpl::vector1<T_Type> >::type type;
-};
-
-}//namespace pmacc
+} // namespace pmacc
diff --git a/include/pmacc/meta/conversion/TypeToAliasPair.hpp b/include/pmacc/meta/conversion/TypeToAliasPair.hpp
index bc5cc8f7be..e96f558595 100644
--- a/include/pmacc/meta/conversion/TypeToAliasPair.hpp
+++ b/include/pmacc/meta/conversion/TypeToAliasPair.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Rene Widera
+/* Copyright 2013-2021 Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -28,31 +28,27 @@
 
 namespace pmacc
 {
-
-/** create boost mpl pair
- *
- * If T_Type is a pmacc alias than first is set to anonym alias name
- * and second is set to T_Type.
- * If T_Type is no alias than TypeToPair is used.
- *
- * @tparam T_Type any type
- * @resturn ::type
- */
-template<typename T_Type>
-struct TypeToAliasPair
-{
-    typedef typename TypeToPair<T_Type>::type type;
-};
-
-/** specialisation if T_Type is a pmacc alias*/
-template<template<typename,typename> class T_Alias,typename T_Type>
-struct TypeToAliasPair< T_Alias<T_Type,pmacc::pmacc_isAlias> >
-{
-    typedef
-    bmpl::pair< T_Alias<pmacc_void,pmacc::pmacc_isAlias> ,
-            T_Alias<T_Type,pmacc::pmacc_isAlias> >
-            type;
-};
-
-
-}//namespace pmacc
+    /** create boost mpl pair
+     *
+     * If T_Type is a pmacc alias then first is set to the anonymous alias
+     * (the alias instantiated with pmacc_void) and second is set to T_Type.
+     * If T_Type is not an alias then TypeToPair is used.
+     *
+     * @tparam T_Type any type
+     * @return ::type boost mpl pair
+     */
+    template<typename T_Type>
+    struct TypeToAliasPair
+    {
+        typedef typename TypeToPair<T_Type>::type type;
+    };
+
+    /** specialisation if T_Type is a pmacc alias: first is T_Alias<pmacc_void, ...>, second is the given alias */
+    template<template<typename, typename> class T_Alias, typename T_Type>
+    struct TypeToAliasPair<T_Alias<T_Type, pmacc::pmacc_isAlias>>
+    {
+        typedef bmpl::pair<T_Alias<pmacc_void, pmacc::pmacc_isAlias>, T_Alias<T_Type, pmacc::pmacc_isAlias>> type;
+    };
+
+
+} // namespace pmacc
diff --git a/include/pmacc/meta/conversion/TypeToPair.hpp b/include/pmacc/meta/conversion/TypeToPair.hpp
index 9265d9742e..f2dc965b81 100644
--- a/include/pmacc/meta/conversion/TypeToPair.hpp
+++ b/include/pmacc/meta/conversion/TypeToPair.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Rene Widera
+/* Copyright 2013-2021 Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -27,23 +27,16 @@
 
 namespace pmacc
 {
-
-
-
-/** create boost mpl pair
- *
- * @tparam T_Type any type
- * @resturn ::type boost mpl pair where first and second is set to T_Type
- */
-template<typename T_Type>
-struct TypeToPair
-{
-    typedef
-    bmpl::pair< T_Type,
-            T_Type >
-            type;
-};
-
-
-
-}//namespace pmacc
+    /** create boost mpl pair
+     *
+     * @tparam T_Type any type
+     * @return ::type boost mpl pair where first and second are both set to T_Type
+     */
+    template<typename T_Type>
+    struct TypeToPair
+    {
+        typedef bmpl::pair<T_Type, T_Type> type;
+    };
+
+
+} // namespace pmacc
diff --git a/include/pmacc/meta/conversion/TypeToPointerPair.hpp b/include/pmacc/meta/conversion/TypeToPointerPair.hpp
index c1fe30a994..910a142b52 100644
--- a/include/pmacc/meta/conversion/TypeToPointerPair.hpp
+++ b/include/pmacc/meta/conversion/TypeToPointerPair.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Rene Widera
+/* Copyright 2013-2021 Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -28,48 +28,47 @@
 
 namespace pmacc
 {
+    /** Wrapper to use any type as identifier
+     *
+     * Wraps a type so that the default constructor can be called for every class.
+     * This is needed so that any type can be used as identifier for math::MapTuple.
+     */
+    template<typename T_Type>
+    struct TypeAsIdentifier
+    {
+        typedef T_Type type;
+    };
 
-/** Wrapper to use any type as identifier
- *
- * Wrap a type thus we can call default constructor on every class
- * This is needed to support that any type can used as identifier in for math::MapTuple
- */
-template<typename T_Type>
-struct TypeAsIdentifier
-{
-    typedef T_Type type;
-};
+    /** Unary functor to wrap any type with TypeAsIdentifier
+     *
+     * @tparam T_Type type to wrap
+     */
+    template<typename T_Type>
+    struct MakeIdentifier
+    {
+        typedef TypeAsIdentifier<T_Type> type;
+    };
 
-/** Unary functor to wrap any type with TypeAsIdentifier
- *
- * @tparam T_Type to to wrap
- */
-template<typename T_Type>
-struct MakeIdentifier
-{
-    typedef TypeAsIdentifier<T_Type> type;
-};
+    /** Pass through of an already existing identifier (specialization for TypeAsIdentifier)
+     *
+     * Avoids double-wrapping of an identifier: ::type is the given TypeAsIdentifier<T_Type>
+     */
+    template<typename T_Type>
+    struct MakeIdentifier<TypeAsIdentifier<T_Type>>
+    {
+        typedef TypeAsIdentifier<T_Type> type;
+    };
 
-/** Pass through of an already existing Identifier
- *
- * Avoids double-wrapping of an Identifier
- */
-template<typename T_Type>
-struct MakeIdentifier<TypeAsIdentifier<T_Type> >
-{
-    typedef TypeAsIdentifier<T_Type> type;
-};
-
-/** create boost mpl pair <TypeAsIdentifier<Type>,PointerOfType>
- *
- * @tparam T_Type any type
- * @return ::type boost::mpl::pair<TypeAsIdentifier<Type>,PointerOfType>
- */
-template<typename T_Type>
-struct TypeToPointerPair
-{
-    typedef T_Type* TypePtr;
-    typedef bmpl::pair< typename MakeIdentifier<T_Type>::type , TypePtr > type;
-};
+    /** create boost mpl pair <identifier of T_Type, pointer to T_Type>
+     *
+     * @tparam T_Type any type
+     * @return ::type bmpl::pair<MakeIdentifier<T_Type>::type, T_Type*>
+     */
+    template<typename T_Type>
+    struct TypeToPointerPair
+    {
+        typedef T_Type* TypePtr;
+        typedef bmpl::pair<typename MakeIdentifier<T_Type>::type, TypePtr> type;
+    };
 
-}//namespace pmacc
+} // namespace pmacc
diff --git a/include/pmacc/meta/errorHandlerPolicies/ReturnType.hpp b/include/pmacc/meta/errorHandlerPolicies/ReturnType.hpp
index f3bfaa8b51..d39aac572d 100644
--- a/include/pmacc/meta/errorHandlerPolicies/ReturnType.hpp
+++ b/include/pmacc/meta/errorHandlerPolicies/ReturnType.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2015-2020 Alexander Grund
+/* Copyright 2015-2021 Alexander Grund
  *
  * This file is part of PMacc.
  *
@@ -25,21 +25,20 @@
 
 namespace pmacc
 {
-namespace errorHandlerPolicies
-{
-
-/** Returns the given type
- *  Binary meta function that takes any boost mpl sequence and a type
- */
-template<typename T_ReturnType = bmpl::void_>
-struct ReturnType
-{
-    template<typename T_MPLSeq, typename T_Value>
-    struct apply
+    namespace errorHandlerPolicies
     {
-        typedef T_ReturnType type;
-    };
-};
+        /** Returns the given type T_ReturnType (default: bmpl::void_)
+         *  apply is a binary meta function that takes any boost mpl sequence and a type; both are ignored
+         */
+        template<typename T_ReturnType = bmpl::void_>
+        struct ReturnType
+        {
+            template<typename T_MPLSeq, typename T_Value>
+            struct apply
+            {
+                typedef T_ReturnType type;
+            };
+        };
 
-} // namespace errorHandlerPolicies
+    } // namespace errorHandlerPolicies
 } // namespace pmacc
diff --git a/include/pmacc/meta/errorHandlerPolicies/ReturnValue.hpp b/include/pmacc/meta/errorHandlerPolicies/ReturnValue.hpp
index cefda11bda..88c98581f3 100644
--- a/include/pmacc/meta/errorHandlerPolicies/ReturnValue.hpp
+++ b/include/pmacc/meta/errorHandlerPolicies/ReturnValue.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2015-2020 Alexander Grund
+/* Copyright 2015-2021 Alexander Grund
  *
  * This file is part of PMacc.
  *
@@ -25,20 +25,19 @@
 
 namespace pmacc
 {
-namespace errorHandlerPolicies
-{
-
-/** Returns the second parameter (normally the value that the sequence was searched for
- *  Binary meta function that takes any boost mpl sequence and a type
- */
-struct ReturnValue
-{
-    template<typename T_MPLSeq, typename T_Value>
-    struct apply
+    namespace errorHandlerPolicies
     {
-        typedef T_Value type;
-    };
-};
+        /** Returns the second parameter (normally the value that the sequence was searched for)
+         *  apply is a binary meta function that takes any boost mpl sequence and a type
+         */
+        struct ReturnValue
+        {
+            template<typename T_MPLSeq, typename T_Value>
+            struct apply
+            {
+                typedef T_Value type;
+            };
+        };
 
-} // namespace errorHandlerPolicies
+    } // namespace errorHandlerPolicies
 } // namespace pmacc
diff --git a/include/pmacc/meta/errorHandlerPolicies/ThrowValueNotFound.hpp b/include/pmacc/meta/errorHandlerPolicies/ThrowValueNotFound.hpp
index fdf14d7d41..649e48c919 100644
--- a/include/pmacc/meta/errorHandlerPolicies/ThrowValueNotFound.hpp
+++ b/include/pmacc/meta/errorHandlerPolicies/ThrowValueNotFound.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2015-2020 Rene Widera
+/* Copyright 2015-2021 Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -26,29 +26,28 @@
 
 namespace pmacc
 {
-namespace errorHandlerPolicies
-{
-
-/** Throws an assertion that the value was not found in the sequence
- *  Binary meta function that takes any boost mpl sequence and a type
- */
-struct ThrowValueNotFound
-{
-    template<typename T_MPLSeq, typename T_Value>
-    struct apply
+    namespace errorHandlerPolicies
     {
-        /* The compiler is allowed to evaluate an expression that does not depend on a template parameter
-         * even if the class is never instantiated. In that case static assert is always
-         * evaluated (e.g. with clang), this results in an error if the condition is false.
-         * http://www.boost.org/doc/libs/1_60_0/doc/html/boost_staticassert.html
-         *
-         * A workaround is to add a template dependency to the expression.
-         * `sizeof(ANY_TYPE) != 0` is always true and defers the evaluation.
+        /** Throws an assertion that the value was not found in the sequence
+         *  Binary meta function that takes any boost mpl sequence and a type
          */
-        PMACC_CASSERT_MSG_TYPE(value_not_found_in_seq, T_Value, false && ( sizeof(T_MPLSeq) != 0 ) );
-        typedef bmpl::void_ type;
-    };
-};
+        struct ThrowValueNotFound
+        {
+            template<typename T_MPLSeq, typename T_Value>
+            struct apply
+            {
+                /* The compiler is allowed to evaluate an expression that does not depend on a template parameter
+                 * even if the class is never instantiated. In that case static assert is always
+                 * evaluated (e.g. with clang), this results in an error if the condition is false.
+                 * http://www.boost.org/doc/libs/1_60_0/doc/html/boost_staticassert.html
+                 *
+                 * A workaround is to add a template dependency to the expression.
+                 * `sizeof(ANY_TYPE) != 0` is always true and defers the evaluation.
+                 */
+                PMACC_CASSERT_MSG_TYPE(value_not_found_in_seq, T_Value, false && (sizeof(T_MPLSeq) != 0));
+                typedef bmpl::void_ type;
+            };
+        };
 
-} // namespace errorHandlerPolicies
+    } // namespace errorHandlerPolicies
 } // namespace pmacc
diff --git a/include/pmacc/misc/splitString.hpp b/include/pmacc/misc/splitString.hpp
index fe59a0ec3a..dd964d3f03 100644
--- a/include/pmacc/misc/splitString.hpp
+++ b/include/pmacc/misc/splitString.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2017-2020 Rene Widera
+/* Copyright 2017-2021 Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -28,37 +28,26 @@
 
 namespace pmacc
 {
-namespace misc
-{
-    /** split a string in a vector of strings
-     *
-     * Based on Stack Overflow post:
-     *   source: https://stackoverflow.com/a/28142357
-     *   author: Marcin
-     *   date: Jan 25 '15
-     *
-     * @param input string to split
-     * @param regex separator between two elements
-     */
-    HINLINE std::vector< std::string > splitString(
-        std::string const & input,
-        std::string const & delimiter = ","
-    )
+    namespace misc
     {
-        std::regex re( delimiter );
-        // passing -1 as the submatch index parameter performs splitting
-        std::sregex_token_iterator first{
-            input.begin(),
-            input.end(),
-            re,
-            -1
-        };
-        std::sregex_token_iterator last;
+        /** split a string in a vector of strings
+         *
+         * Based on Stack Overflow post:
+         *   source: https://stackoverflow.com/a/28142357
+         *   author: Marcin
+         *   date: Jan 25 '15
+         *
+         * @param input string to split
+         * @param delimiter regular expression that separates two elements
+         */
+        HINLINE std::vector<std::string> splitString(std::string const& input, std::string const& delimiter = ",")
+        {
+            std::regex re(delimiter);
+            // passing -1 as the submatch index parameter performs splitting
+            std::sregex_token_iterator first{input.begin(), input.end(), re, -1};
+            std::sregex_token_iterator last;
 
-        return {
-            first,
-            last
-        };
-    }
-} // namespace misc
+            return {first, last};
+        }
+    } // namespace misc
 } // namespace pmacc
diff --git a/include/pmacc/mpi/GetMPI_Op.hpp b/include/pmacc/mpi/GetMPI_Op.hpp
index 8baf3b789f..5d49369795 100644
--- a/include/pmacc/mpi/GetMPI_Op.hpp
+++ b/include/pmacc/mpi/GetMPI_Op.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Rene Widera, Benjamin Worpitz
+/* Copyright 2013-2021 Rene Widera, Benjamin Worpitz
  *
  * This file is part of PMacc.
  *
@@ -28,8 +28,7 @@ namespace pmacc
 {
     namespace mpi
     {
-
         template<class Functor>
         MPI_Op getMPI_Op();
     }
-}
+} // namespace pmacc
diff --git a/include/pmacc/mpi/GetMPI_StructAsArray.hpp b/include/pmacc/mpi/GetMPI_StructAsArray.hpp
index a63db93b19..93c5fd830a 100644
--- a/include/pmacc/mpi/GetMPI_StructAsArray.hpp
+++ b/include/pmacc/mpi/GetMPI_StructAsArray.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Rene Widera
+/* Copyright 2013-2021 Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -26,26 +26,23 @@
 
 namespace pmacc
 {
-namespace mpi
-{
-namespace def
-{
-
-template<typename Type>
-struct GetMPI_StructAsArray;
+    namespace mpi
+    {
+        namespace def
+        {
+            template<typename Type>
+            struct GetMPI_StructAsArray;
 
-}//namespace intern
+        } // namespace def
 
-template<typename Type>
-pmacc::mpi::MPI_StructAsArray getMPI_StructAsArray()
-{
-    return def::GetMPI_StructAsArray<Type > ()();
-}
+        template<typename Type>
+        pmacc::mpi::MPI_StructAsArray getMPI_StructAsArray()
+        {
+            return def::GetMPI_StructAsArray<Type>()();
+        }
 
-} //namespace mpi
+    } // namespace mpi
 
-}//namespace pmacc
+} // namespace pmacc
 
 #include "pmacc/mpi/GetMPI_StructAsArray.tpp"
-
-
diff --git a/include/pmacc/mpi/GetMPI_StructAsArray.tpp b/include/pmacc/mpi/GetMPI_StructAsArray.tpp
index 1c05c140f0..d4f6c8d836 100644
--- a/include/pmacc/mpi/GetMPI_StructAsArray.tpp
+++ b/include/pmacc/mpi/GetMPI_StructAsArray.tpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Rene Widera, Benjamin Worpitz, Alexander Grund
+/* Copyright 2013-2021 Rene Widera, Benjamin Worpitz, Alexander Grund
  *
  * This file is part of PMacc.
  *
@@ -25,93 +25,83 @@
 
 namespace pmacc
 {
-namespace mpi
-{
-namespace def
-{
-
-template<>
-struct GetMPI_StructAsArray<int >
-{
-
-    MPI_StructAsArray operator()() const
-    {
-        return MPI_StructAsArray(MPI_INT, 1);
-    }
-};
-
-template<>
-struct GetMPI_StructAsArray<unsigned >
-{
-
-    MPI_StructAsArray operator()() const
-    {
-        return MPI_StructAsArray(MPI_UNSIGNED, 1);
-    }
-};
-
-template<>
-struct GetMPI_StructAsArray<long >
-{
-
-    MPI_StructAsArray operator()() const
-    {
-        return MPI_StructAsArray(MPI_LONG, 1);
-    }
-};
-
-template<>
-struct GetMPI_StructAsArray<unsigned long >
-{
-
-    MPI_StructAsArray operator()() const
-    {
-        return MPI_StructAsArray(MPI_UNSIGNED_LONG, 1);
-    }
-};
-
-template<>
-struct GetMPI_StructAsArray<long long >
-{
-
-    MPI_StructAsArray operator()() const
-    {
-        return MPI_StructAsArray(MPI_LONG_LONG, 1);
-    }
-};
-
-template<>
-struct GetMPI_StructAsArray<unsigned long long >
-{
-
-    MPI_StructAsArray operator()() const
-    {
-        return MPI_StructAsArray(MPI_UNSIGNED_LONG_LONG, 1);
-    }
-};
-
-template<>
-struct GetMPI_StructAsArray<float >
-{
-
-    MPI_StructAsArray operator()() const
+    namespace mpi
     {
-        return MPI_StructAsArray(MPI_FLOAT, 1);
-    }
-};
-
-template<>
-struct GetMPI_StructAsArray<double >
-{
-
-    MPI_StructAsArray operator()() const
-    {
-        return MPI_StructAsArray(MPI_DOUBLE, 1);
-    }
-};
-
-} //namespace def
-}//namespace mpi
-
-}//namespace pmacc
-
+        namespace def
+        {
+            template<>
+            struct GetMPI_StructAsArray<int>
+            {
+                MPI_StructAsArray operator()() const
+                {
+                    return MPI_StructAsArray(MPI_INT, 1);
+                }
+            };
+
+            template<>
+            struct GetMPI_StructAsArray<unsigned>
+            {
+                MPI_StructAsArray operator()() const
+                {
+                    return MPI_StructAsArray(MPI_UNSIGNED, 1);
+                }
+            };
+
+            template<>
+            struct GetMPI_StructAsArray<long>
+            {
+                MPI_StructAsArray operator()() const
+                {
+                    return MPI_StructAsArray(MPI_LONG, 1);
+                }
+            };
+
+            template<>
+            struct GetMPI_StructAsArray<unsigned long>
+            {
+                MPI_StructAsArray operator()() const
+                {
+                    return MPI_StructAsArray(MPI_UNSIGNED_LONG, 1);
+                }
+            };
+
+            template<>
+            struct GetMPI_StructAsArray<long long>
+            {
+                MPI_StructAsArray operator()() const
+                {
+                    return MPI_StructAsArray(MPI_LONG_LONG, 1);
+                }
+            };
+
+            template<>
+            struct GetMPI_StructAsArray<unsigned long long>
+            {
+                MPI_StructAsArray operator()() const
+                {
+                    return MPI_StructAsArray(MPI_UNSIGNED_LONG_LONG, 1);
+                }
+            };
+
+            template<>
+            struct GetMPI_StructAsArray<float>
+            {
+                MPI_StructAsArray operator()() const
+                {
+                    return MPI_StructAsArray(MPI_FLOAT, 1);
+                }
+            };
+
+            template<>
+            struct GetMPI_StructAsArray<double>
+            {
+                MPI_StructAsArray operator()() const
+                {
+                    return MPI_StructAsArray(MPI_DOUBLE, 1);
+                }
+            };
+
+        } // namespace def
+    } // namespace mpi
+
+} // namespace pmacc
diff --git a/include/pmacc/mpi/MPIReduce.hpp b/include/pmacc/mpi/MPIReduce.hpp
index 4d05b6a5fe..bc0cd19fae 100644
--- a/include/pmacc/mpi/MPIReduce.hpp
+++ b/include/pmacc/mpi/MPIReduce.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Heiko Burau, Rene Widera, Benjamin Worpitz
+/* Copyright 2013-2021 Axel Huebl, Heiko Burau, Rene Widera, Benjamin Worpitz
  *
  * This file is part of PMacc.
  *
@@ -33,166 +33,158 @@
 
 namespace pmacc
 {
-namespace mpi
-{
-
-/** reduce data over selected mpi ranks */
-struct MPIReduce
-{
-
-    MPIReduce() : mpiRank(-1), numRanks(0), comm(MPI_COMM_NULL), isMPICommInitialized(false)
+    namespace mpi
     {
+        /** reduce data over selected mpi ranks */
+        struct MPIReduce
+        {
+            MPIReduce() : comm(MPI_COMM_NULL), mpiRank(-1), numRanks(0), isMPICommInitialized(false)
+            { // initializer list matches the member declaration order (the order C++ actually uses)
+            }
 
-    }
+            virtual ~MPIReduce() // frees the communicator created by participate(), if there is one
+            {
+                if(isMPICommInitialized) // only set once participate() obtained a communicator for this rank
+                {
+                    MPI_CHECK_NO_EXCEPT(MPI_Comm_free(&comm));
+                }
+            }
 
-    virtual ~MPIReduce()
-    {
-        if (isMPICommInitialized)
-        {
-            MPI_CHECK_NO_EXCEPT(MPI_Comm_free(&comm));
-        }
-    }
-
-    /* defines if the result of the MPI operation is valid
-     *
-     * @tparam MPIMethod type of the reduction method
-     * @param method used reduction method e.g.,
-     *                reduceMethods::AllReduce, reduceMethods::Reduce
-     * @return if resut of operator() is valid*/
-    template<class MPIMethod>
-    bool hasResult(const MPIMethod & method)
-    {
-        if (!isMPICommInitialized)
-            participate(true);
-        return method.hasResult(mpiRank);
-    }
-
-    /** defines if the result of the MPI operation is valid
-     *
-     * The reduction method reduceMethods::Reduce is used.
-     *
-     * @return if result of operator() is valid
-     */
-    bool hasResult()
-    {
-        if (!isMPICommInitialized)
-            participate(true);
-        return this->hasResult(::pmacc::mpi::reduceMethods::AllReduce());
-    }
-
-    /* Activate participation for reduce algorithm.
-     * Must called from any mpi process. This function use global blocking mpi calls.
-     * @param isActive true if mpi rank should be part of reduce operation, else false
-     */
-    void participate(bool isActive)
-    {
-        /*free old communicator of init is called again*/
-        if (isMPICommInitialized)
-        {
-            MPI_CHECK(MPI_Comm_free(&comm));
-            mpiRank = -1;
-            numRanks = 0;
-            isMPICommInitialized = false;
-        }
-
-        int countRanks;
-        MPI_CHECK(MPI_Comm_size(MPI_COMM_WORLD, &countRanks));
-        std::vector<int> reduceRank(countRanks);
-        std::vector<int> groupRanks(countRanks);
-        MPI_CHECK(MPI_Comm_rank(MPI_COMM_WORLD, &mpiRank));
-
-        if (!isActive)
-            mpiRank = -1;
-
-        // avoid deadlock between not finished pmacc tasks and mpi blocking collectives
-        __getTransactionEvent().waitForFinished();
-        MPI_CHECK(MPI_Allgather(&mpiRank, 1, MPI_INT, &reduceRank[0], 1, MPI_INT, MPI_COMM_WORLD));
-
-        for (int i = 0; i < countRanks; ++i)
-        {
-            if (reduceRank[i] != -1)
+            /** Check whether this rank holds a valid result of the MPI operation.
+             *
+             * @tparam MPIMethod type of the reduction method
+             * @param method reduction method in use, e.g.,
+             *                reduceMethods::AllReduce, reduceMethods::Reduce
+             * @return true if the result of operator() is valid on this rank */
+            template<class MPIMethod>
+            bool hasResult(const MPIMethod& method)
             {
-                groupRanks[numRanks] = reduceRank[i];
-                numRanks++ ;
+                if(!isMPICommInitialized)
+                    participate(true);
+                return method.hasResult(mpiRank);
             }
-        }
 
-        MPI_Group group = MPI_GROUP_NULL;
-        MPI_Group newgroup = MPI_GROUP_NULL;
-        MPI_CHECK(MPI_Comm_group(MPI_COMM_WORLD, &group));
-        MPI_CHECK(MPI_Group_incl(group, numRanks, &groupRanks[0], &newgroup));
+            /** Check whether this rank holds a valid result of the MPI operation.
+             *
+             * The reduction method reduceMethods::AllReduce is used.
+             *
+             * @return true if the result of operator() is valid on this rank
+             */
+            bool hasResult()
+            {
+                if(!isMPICommInitialized)
+                    participate(true);
+                return this->hasResult(::pmacc::mpi::reduceMethods::AllReduce());
+            }
 
-        MPI_CHECK(MPI_Comm_create(MPI_COMM_WORLD, newgroup, &comm));
+            /** Select the MPI ranks that take part in subsequent reductions.
+             * Must be called by every MPI process: it uses blocking collectives on MPI_COMM_WORLD.
+             * @param isActive true if this rank should be part of the reduce operation, else false
+             */
+            void participate(bool isActive)
+            {
+                /* free the old communicator if participate is called again */
+                if(isMPICommInitialized)
+                {
+                    MPI_CHECK(MPI_Comm_free(&comm));
+                    mpiRank = -1;
+                    isMPICommInitialized = false;
+                }
+                numRanks = 0; // reset unconditionally: a rank that was inactive before skips the branch above
+
+                int countRanks;
+                MPI_CHECK(MPI_Comm_size(MPI_COMM_WORLD, &countRanks));
+                std::vector<int> reduceRank(countRanks);
+                std::vector<int> groupRanks(countRanks);
+                MPI_CHECK(MPI_Comm_rank(MPI_COMM_WORLD, &mpiRank));
+
+                if(!isActive)
+                    mpiRank = -1; // -1 marks this rank as non-participating in the gathered list
+
+                // avoid deadlock between not finished pmacc tasks and mpi blocking collectives
+                __getTransactionEvent().waitForFinished();
+                MPI_CHECK(MPI_Allgather(&mpiRank, 1, MPI_INT, &reduceRank[0], 1, MPI_INT, MPI_COMM_WORLD));
+
+                for(int i = 0; i < countRanks; ++i) // compact the world ranks of all active processes
+                {
+                    if(reduceRank[i] != -1)
+                    {
+                        groupRanks[numRanks] = reduceRank[i];
+                        numRanks++;
+                    }
+                }
+
+                MPI_Group group = MPI_GROUP_NULL;
+                MPI_Group newgroup = MPI_GROUP_NULL;
+                MPI_CHECK(MPI_Comm_group(MPI_COMM_WORLD, &group));
+                MPI_CHECK(MPI_Group_incl(group, numRanks, &groupRanks[0], &newgroup));
+
+                MPI_CHECK(MPI_Comm_create(MPI_COMM_WORLD, newgroup, &comm)); // MPI_COMM_NULL on non-members
+
+                if(mpiRank != -1)
+                {
+                    MPI_CHECK(MPI_Comm_rank(comm, &mpiRank)); // from now on: rank inside comm
+                    isMPICommInitialized = true;
+                }
+                MPI_CHECK(MPI_Group_free(&group));
+                MPI_CHECK(MPI_Group_free(&newgroup));
+            }
 
-        if (mpiRank != -1)
-        {
-            MPI_CHECK(MPI_Comm_rank(comm, &mpiRank));
-            isMPICommInitialized = true;
-        }
-        MPI_CHECK(MPI_Group_free(&group));
-        MPI_CHECK(MPI_Group_free(&newgroup));
-    }
-
-    /* Reduce elements on cpu memory
-     * call hasResult to see if returned value is valid
-     *
-     * @param func binary functor for reduce which takes two arguments, first argument is the source and get the new reduced value.
-     * Functor must specialize the function getMPI_Op.
-     * @param dest buffer for result data
-     * @param src a class or a pointer where the reduce algorithm can access the value by operator [] (one dimension access)
-     * @param n number of elements to reduce
-     * @param method mpi method for reduce
-     *
-     */
-    template<class Functor, typename Type, class ReduceMethod >
-    HINLINE void operator()(Functor func,
-                             Type* dest,
-                             Type* src,
-                             const size_t n,
-                             const ReduceMethod method)
-    {
-        if (!isMPICommInitialized)
-            participate(true);
-        typedef Type ValueType;
-
-        method(func,
-               dest,
-               src,
-               n * ::pmacc::mpi::getMPI_StructAsArray<ValueType > ().sizeMultiplier,
-               ::pmacc::mpi::getMPI_StructAsArray<ValueType > ().dataType,
-               ::pmacc::mpi::getMPI_Op<Functor > (),
-               comm);
-    }
-
-    /* Reduce elements on cpu memory
-     * the default reduce method is allReduce which means that any host get the reduced value back
-     *
-     * @param func binary functor for reduce which takes two arguments, first argument is the source and get the new reduced value.
-     * Functor must specialize the function getMPI_Op.
-     * @param dest buffer for result data
-     * @param src a class or a pointer where the reduce algorithm can access the value by operator [] (one dimension access)
-     * @param n number of elements to reduce
-     *
-     * @return reduced value
-     */
-    template<class Functor, typename Type >
-    HINLINE void operator()(Functor func,
-                             Type* dest,
-                             Type* src,
-                             const size_t n)
-    {
-        if (!isMPICommInitialized)
-            participate(true);
-        this->operator ()(func, dest, src, n, ::pmacc::mpi::reduceMethods::AllReduce());
-    }
+            /** Reduce elements located in host (CPU) memory.
+             * Call hasResult() to find out whether dest holds a valid result on this rank.
+             *
+             * @param func binary reduce functor; it is forwarded to method, and its type selects the
+             *             MPI operation via getMPI_Op (which must be specialized for Functor)
+             * @param dest buffer for the result data
+             * @param src pointer to the input elements
+             * @param n number of elements to reduce
+             * @param method MPI reduce method, e.g. reduceMethods::AllReduce or reduceMethods::Reduce
+             *
+             * The MPI element count is n times the sizeMultiplier of Type's MPI_StructAsArray description.
+             */
+            template<class Functor, typename Type, class ReduceMethod>
+            HINLINE void operator()(Functor func, Type* dest, Type* src, const size_t n, const ReduceMethod method)
+            {
+                if(!isMPICommInitialized)
+                    participate(true);
+                typedef Type ValueType;
+
+                method(
+                    func,
+                    dest,
+                    src,
+                    n * ::pmacc::mpi::getMPI_StructAsArray<ValueType>().sizeMultiplier,
+                    ::pmacc::mpi::getMPI_StructAsArray<ValueType>().dataType,
+                    ::pmacc::mpi::getMPI_Op<Functor>(),
+                    comm);
+            }
 
+            /** Reduce elements located in host (CPU) memory with the default method.
+             * The default reduce method is AllReduce, i.e. every participating rank gets the reduced value back.
+             *
+             * @param func binary reduce functor; forwarded to the five-argument overload, which selects the
+             *             MPI operation via getMPI_Op (must be specialized for Functor)
+             * @param dest buffer for the result data
+             * @param src pointer to the input elements
+             * @param n number of elements to reduce
+             *
+             * Nothing is returned; the reduced values are written to dest.
+             *
+             */
+            template<class Functor, typename Type>
+            HINLINE void operator()(Functor func, Type* dest, Type* src, const size_t n)
+            {
+                if(!isMPICommInitialized)
+                    participate(true);
+                this->operator()(func, dest, src, n, ::pmacc::mpi::reduceMethods::AllReduce());
+            }
 
-private:
 
-    MPI_Comm comm;
-    int mpiRank;
-    int numRanks;
-    bool isMPICommInitialized;
-};
-} // namespace mpi
+        private:
+            MPI_Comm comm;
+            int mpiRank;
+            int numRanks;
+            bool isMPICommInitialized;
+        };
+    } // namespace mpi
 } // namespace pmacc
diff --git a/include/pmacc/mpi/MPI_StructAsArray.hpp b/include/pmacc/mpi/MPI_StructAsArray.hpp
index c5af978b04..94c0b59fb3 100644
--- a/include/pmacc/mpi/MPI_StructAsArray.hpp
+++ b/include/pmacc/mpi/MPI_StructAsArray.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Rene Widera, Benjamin Worpitz
+/* Copyright 2013-2021 Rene Widera, Benjamin Worpitz
  *
  * This file is part of PMacc.
  *
@@ -31,12 +31,11 @@ namespace pmacc
     {
         struct MPI_StructAsArray
         {
-
             MPI_StructAsArray(MPI_Datatype type, uint32_t factor) : dataType(type), sizeMultiplier(factor)
             {
             }
             MPI_Datatype dataType;
             uint32_t sizeMultiplier;
         };
-    }
-}
+    } // namespace mpi
+} // namespace pmacc
diff --git a/include/pmacc/mpi/SeedPerRank.hpp b/include/pmacc/mpi/SeedPerRank.hpp
index d4804ab0a1..6e7acf6591 100644
--- a/include/pmacc/mpi/SeedPerRank.hpp
+++ b/include/pmacc/mpi/SeedPerRank.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2014-2020 Axel Huebl, Alexander Grund
+/* Copyright 2014-2021 Axel Huebl, Alexander Grund
  *
  * This file is part of PMacc.
  *
@@ -28,53 +28,52 @@
 
 namespace pmacc
 {
-namespace mpi
-{
-    /** Calculate a Seed per Rank
-     *
-     * This functor derives a unique seed for each MPI rank (or GPU) from
-     * a given global seed in a deterministic manner.
-     *
-     * \tparam T_DIM Dimensionality of the simulation (1-3 D)
-     */
-    template <unsigned T_DIM>
-    struct SeedPerRank
+    namespace mpi
     {
-        /** Functor implementation
+        /** Calculate a Seed per Rank
          *
-         * This method provides a guaranteed unique number per MPI rank
-         * (or GPU). When a (only locally unique) localSeed parameter is used
-         * it is furthermore guaranteed that this number does not collide
-         * with an other seed.
+         * This functor derives a unique seed for each MPI rank (or GPU) from
+         * a given global seed in a deterministic manner.
          *
-         * \param localSeed Initial seed to vary two identical simulations
-         *                  can have been xor'ed with e.g. a unique species id
-         *                  to get an unique seed per species
-         * \return uint32_t seed
+         * \tparam T_DIM Dimensionality of the simulation (1-3 D)
          */
-        uint32_t
-        operator()( uint32_t localSeed )
+        template<unsigned T_DIM>
+        struct SeedPerRank
         {
-            auto& gc = pmacc::Environment<T_DIM>::get().GridController();
-
-            uint32_t rank = gc.getGlobalRank( );
-            /* We put the rank into the upper bits to allow values which start
-             * from zero (e.g. cellIdxs, time steps) to be used as additional seed contributors
-             * Those would then write to the lower bits leaving the upper bits alone
-             * which still results in globally unique seeds
-             */
-            uint32_t globalUniqueSeed = reverseBits(rank);
-            /* localSeed often contains a counted number, so we rotate it by some bits to not "destroy"
-             * the counted rank that is already there. Also it is not reversed to get a different pattern
+            /** Functor implementation
+             *
+             * This method provides a guaranteed unique number per MPI rank
+             * (or GPU). When a (only locally unique) localSeed parameter is used
+             * it is furthermore guaranteed that this number does not collide
+             * with another seed.
+             *
+             * \param localSeed Initial seed to vary two otherwise identical simulations;
+             *                  may have been xor'ed with e.g. a unique species id
+             *                  to get a unique seed per species
+             * \return uint32_t seed
              */
-            localSeed = (localSeed << 16) | (localSeed >> (sizeof(uint32_t) * CHAR_BIT - 16));
-            globalUniqueSeed ^= localSeed;
-            /* For any globally constant localSeed globalUniqueSeed is now guaranteed
-             * to be globally unique
-             */
-            return globalUniqueSeed;
-        }
-    };
+            uint32_t operator()(uint32_t localSeed)
+            {
+                auto& gc = pmacc::Environment<T_DIM>::get().GridController();
+
+                uint32_t rank = gc.getGlobalRank();
+                /* We put the rank into the upper bits to allow values which start
+                 * from zero (e.g. cellIdxs, time steps) to be used as additional seed contributors.
+                 * Those would then write to the lower bits, leaving the upper bits alone,
+                 * which still results in globally unique seeds.
+                 */
+                uint32_t globalUniqueSeed = reverseBits(rank);
+                /* localSeed often contains a counted number, so we rotate it by 16 bits to not "destroy"
+                 * the counted rank that is already there. Also it is not reversed to get a different pattern.
+                 */
+                localSeed = (localSeed << 16) | (localSeed >> (sizeof(uint32_t) * CHAR_BIT - 16));
+                globalUniqueSeed ^= localSeed;
+                /* For any globally constant localSeed, globalUniqueSeed is now guaranteed
+                 * to be globally unique.
+                 */
+                return globalUniqueSeed;
+            }
+        };
 
-} /* namespace mpi */
-} /* namespace picongpu */
+    } /* namespace mpi */
+} // namespace pmacc
diff --git a/include/pmacc/mpi/reduceMethods/AllReduce.hpp b/include/pmacc/mpi/reduceMethods/AllReduce.hpp
index bbd298f251..4c91bb481b 100644
--- a/include/pmacc/mpi/reduceMethods/AllReduce.hpp
+++ b/include/pmacc/mpi/reduceMethods/AllReduce.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera
+/* Copyright 2013-2021 Heiko Burau, Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -27,37 +27,35 @@
 
 namespace pmacc
 {
-namespace mpi
-{
-
-namespace reduceMethods
-{
-
-struct AllReduce
-{
-
-    HINLINE bool hasResult(int mpiRank) const
+    namespace mpi
     {
-        return mpiRank != -1;
-    }
-
-    template<class Functor, typename Type >
-    HINLINE void operator()(Functor, Type* dest, Type* src, const size_t count, MPI_Datatype type, MPI_Op op, MPI_Comm comm) const
-    {
-        // avoid deadlock between not finished pmacc tasks and mpi blocking collectives
-        __getTransactionEvent().waitForFinished();
-        MPI_CHECK(MPI_Allreduce((void*) src,
-                                (void*) dest,
-                                count,
-                                type,
-                                op, comm));
-    }
-};
-
-} /*namespace reduceMethods*/
-
-} /*namespace mpi*/
+        namespace reduceMethods
+        {
+            struct AllReduce // reduce method built on MPI_Allreduce
+            {
+                HINLINE bool hasResult(int mpiRank) const
+                {
+                    return mpiRank != -1; // valid for every rank except the marker value -1
+                }
+
+                template<class Functor, typename Type>
+                HINLINE void operator()(
+                    Functor, // unnamed: the functor object itself is not used here
+                    Type* dest, // receive buffer
+                    Type* src, // send buffer
+                    const size_t count, // NOTE(review): MPI_Allreduce takes an int count; confirm it cannot overflow
+                    MPI_Datatype type,
+                    MPI_Op op,
+                    MPI_Comm comm) const
+                {
+                    // avoid deadlock between not finished pmacc tasks and mpi blocking collectives
+                    __getTransactionEvent().waitForFinished();
+                    MPI_CHECK(MPI_Allreduce((void*) src, (void*) dest, count, type, op, comm));
+                }
+            };
+
+        } /*namespace reduceMethods*/
+
+    } /*namespace mpi*/
 
 } /*namespace pmacc*/
-
-
diff --git a/include/pmacc/mpi/reduceMethods/Reduce.hpp b/include/pmacc/mpi/reduceMethods/Reduce.hpp
index 542c908fd9..3d8ad894ac 100644
--- a/include/pmacc/mpi/reduceMethods/Reduce.hpp
+++ b/include/pmacc/mpi/reduceMethods/Reduce.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera
+/* Copyright 2013-2021 Heiko Burau, Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -27,38 +27,36 @@
 
 namespace pmacc
 {
-namespace mpi
-{
-
-namespace reduceMethods
-{
-
-struct Reduce
-{
-
-    HINLINE bool hasResult(int mpiRank) const
+    namespace mpi
     {
-        return mpiRank == 0;
-    }
-
-    template<class Functor, typename Type >
-    HINLINE void operator()(Functor, Type* dest, Type* src, const size_t count, MPI_Datatype type, MPI_Op op, MPI_Comm comm) const
-    {
-        // avoid deadlock between not finished pmacc tasks and mpi blocking collectives
-        __getTransactionEvent().waitForFinished();
-
-        MPI_CHECK(MPI_Reduce((void*) src,
-                             (void*) dest,
-                             count,
-                             type,
-                             op, 0, comm));
-    }
-};
-
-} /*namespace reduceMethods*/
-
-} /*namespace mpi*/
+        namespace reduceMethods
+        {
+            struct Reduce // reduce method built on MPI_Reduce with root rank 0
+            {
+                HINLINE bool hasResult(int mpiRank) const
+                {
+                    return mpiRank == 0; // 0 is the root passed to MPI_Reduce in operator()
+                }
+
+                template<class Functor, typename Type>
+                HINLINE void operator()(
+                    Functor, // unnamed: the functor object itself is not used here
+                    Type* dest, // receive buffer
+                    Type* src, // send buffer
+                    const size_t count, // NOTE(review): MPI_Reduce takes an int count; confirm it cannot overflow
+                    MPI_Datatype type,
+                    MPI_Op op,
+                    MPI_Comm comm) const
+                {
+                    // avoid deadlock between not finished pmacc tasks and mpi blocking collectives
+                    __getTransactionEvent().waitForFinished();
+
+                    MPI_CHECK(MPI_Reduce((void*) src, (void*) dest, count, type, op, 0, comm));
+                }
+            };
+
+        } /*namespace reduceMethods*/
+
+    } /*namespace mpi*/
 
 } /*namespace pmacc*/
-
-
diff --git a/include/pmacc/nvidia/atomic.hpp b/include/pmacc/nvidia/atomic.hpp
index 134f7989c6..f7673538dc 100644
--- a/include/pmacc/nvidia/atomic.hpp
+++ b/include/pmacc/nvidia/atomic.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2015-2020 Rene Widera, Alexander Grund
+/* Copyright 2015-2021 Rene Widera, Alexander Grund
  *
  * This file is part of PMacc.
  *
@@ -21,182 +21,246 @@
 
 #pragma once
 
-
 #include "pmacc/types.hpp"
-#if( PMACC_CUDA_ENABLED == 1 )
-#   include "pmacc/nvidia/warp.hpp"
-#endif
+#include "pmacc/memory/Array.hpp"
+#include "pmacc/nvidia/warp.hpp"
+
+#include <alpaka/intrinsic/Traits.hpp>
+#include <alpaka/warp/Traits.hpp>
+
 #include <boost/type_traits.hpp>
+
+#include <type_traits>
 #include <climits>
 
 
 namespace pmacc
 {
-namespace nvidia
-{
+    namespace nvidia
+    {
+        namespace detail
+        {
+            /** atomic operation whose return value is discarded
+             *
+             * Forwards to ::alpaka::atomicOp; specializations for some backends use optimized intrinsics instead.
+             *
+             * @tparam T_Op atomic alpaka operation type
+             * @tparam T_Acc alpaka accelerator context type
+             * @tparam T_Type value type
+             * @tparam T_Hierarchy alpaka hierarchy type of the atomic operation
+             */
+            template<typename T_Op, typename T_Acc, typename T_Type, typename T_Hierarchy>
+            struct AtomicOpNoRet
+            {
+                /** perform the atomic operation and drop the value returned by alpaka
+                 *
+                 * @param acc alpaka accelerator context
+                 * @param ptr pointer to destination memory
+                 * @param value input value
+                 * @param hierarchy alpaka hierarchy scope for atomics
+                 */
+                DINLINE void operator()(
+                    T_Acc const& acc,
+                    T_Type* ptr,
+                    T_Type const value,
+                    T_Hierarchy const& hierarchy)
+                {
+                    ::alpaka::atomicOp<T_Op>(acc, ptr, value, hierarchy);
+                }
+            };
 
-    namespace detail {
+#if(!defined(__CUDA__) && ALPAKA_ACC_GPU_HIP_ENABLED == 1)
+            /** HIP backend specialization for atomic add of float values
+             *
+             * Uses the intrinsic atomicAddNoRet, which is available for AMD GPUs only (not with HIP-nvcc).
+             * NOTE(review): acc and hierarchy are unused; confirm the intrinsic is valid for every hierarchy.
+             */
+            template<typename T_Hierarchy, typename... T_AccArgs>
+            struct AtomicOpNoRet<::alpaka::AtomicAdd, alpaka::AccGpuHipRt<T_AccArgs...>, float, T_Hierarchy>
+            {
+                DINLINE void operator()(
+                    alpaka::AccGpuHipRt<T_AccArgs...> const& acc,
+                    float* ptr,
+                    float const value,
+                    T_Hierarchy const& hierarchy)
+                {
+                    ::atomicAddNoRet(ptr, value);
+                }
+            };
+#endif
 
-        template<typename T_Type, bool T_isKepler>
-        struct AtomicAllInc
-        {
-            template< typename T_Acc, typename T_Hierarchy >
-            HDINLINE T_Type
-            operator()(const T_Acc& acc, T_Type* ptr, const T_Hierarchy& hierarchy)
+            template<typename T_Type, bool T_isKepler>
+            struct AtomicAllInc
             {
-                return ::alpaka::atomic::atomicOp<::alpaka::atomic::op::Add>(acc, ptr, T_Type(1), hierarchy);
-            }
-        };
-
-#if PMACC_CUDA_ARCH >= 300
-       /**
-         * Trait that returns whether an optimized version of AtomicAllInc
-         * exists for Kepler architectures (and up)
-         */
-        template<typename T>
-        struct AtomicAllIncIsOptimized
-        {
-            enum{
-                value = boost::is_same<T,          int>::value ||
-                        boost::is_same<T, unsigned int>::value ||
-                        boost::is_same<T,          long long int>::value ||
-                        boost::is_same<T, unsigned long long int>::value ||
-                        boost::is_same<T, float>::value
+                template<typename T_Acc, typename T_Hierarchy>
+                HDINLINE T_Type operator()(const T_Acc& acc, T_Type* ptr, const T_Hierarchy& hierarchy)
+                {
+                    return ::alpaka::atomicOp<::alpaka::AtomicAdd>(acc, ptr, T_Type(1), hierarchy);
+                }
             };
-        };
 
-        /**
-         * AtomicAllInc for Kepler and up
-         * Defaults to unoptimized version for unsupported types
-         */
-        template<typename T_Type, bool T_UseOptimized = AtomicAllIncIsOptimized<T_Type>::value>
-        struct AtomicAllIncKepler: public AtomicAllInc<T_Type, false>
-        {};
+#if CUPLA_DEVICE_COMPILE == 1
+            /**
+             * Trait that tells whether an optimized (warp-aggregated) version of AtomicAllInc
+             * exists for type T: true for int, unsigned int, (unsigned) long long int and float
+             */
+            template<typename T>
+            struct AtomicAllIncIsOptimized
+            {
+                enum
+                {
+                    value = boost::is_same<T, int>::value || boost::is_same<T, unsigned int>::value
+                        || boost::is_same<T, long long int>::value || boost::is_same<T, unsigned long long int>::value
+                        || boost::is_same<T, float>::value
+                };
+            };
 
-        /**
-         * Optimized version
-         *
-         * This warp aggregated atomic increment implementation based on nvidia parallel forall example
-         * http://devblogs.nvidia.com/parallelforall/cuda-pro-tip-optimized-filtering-warp-aggregated-atomics/
-         * (author: Andrew Adinetz, date: October 1th, 2014)
-         *
-         */
-        template<typename T_Type>
-        struct AtomicAllIncKepler<T_Type, true>
-        {
-            template< typename T_Acc, typename T_Hierarchy >
-            HDINLINE T_Type
-            operator()(const T_Acc& acc,T_Type* ptr, const T_Hierarchy& hierarchy)
+            /**
+             * AtomicAllInc for Kepler and up
+             * Defaults to unoptimized version for unsupported types
+             */
+            template<typename T_Type, bool T_UseOptimized = AtomicAllIncIsOptimized<T_Type>::value>
+            struct AtomicAllIncKepler : public AtomicAllInc<T_Type, false>
             {
-                /* Get a bitmask with 1 for each thread in the warp, that executes this */
-#if(__CUDACC_VER_MAJOR__ >= 9)
-                const int mask = __activemask();
-#else
-                const int mask = __ballot(1);
-#endif
-                /* select the leader */
-                const int leader = __ffs(mask) - 1;
-                T_Type result;
-                const int laneId = getLaneId();
-                /* Get the start value for this warp */
-                if (laneId == leader)
-                    result = ::alpaka::atomic::atomicOp<::alpaka::atomic::op::Add>(acc,ptr, static_cast<T_Type>(__popc(mask)), hierarchy);
-                result = warpBroadcast(result, leader);
-                /* Add offset per thread */
-                return result + static_cast<T_Type>(__popc(mask & ((1 << laneId) - 1)));
-            }
-        };
-
-        /**
-         * Optimized version for int64.
-         * As CUDA atomicAdd does not support int64 directly we just cast it
-         * and call the uint64 implementation
-         */
-        template<>
-        struct AtomicAllIncKepler<long long int, true>
-        {
-            template< typename T_Acc, typename T_Hierarchy >
-            HDINLINE long long int
-            operator()(const T_Acc& acc, long long int* ptr, const T_Hierarchy&, const T_Hierarchy& hierarchy )
+            };
+
+            /**
+             * Optimized version
+             *
+             * This warp-aggregated atomic increment implementation is based on the NVIDIA Parallel Forall example
+             * http://devblogs.nvidia.com/parallelforall/cuda-pro-tip-optimized-filtering-warp-aggregated-atomics/
+             * (author: Andrew Adinetz, date: October 1st, 2014)
+             *
+             */
+            template<typename T_Type>
+            struct AtomicAllIncKepler<T_Type, true>
             {
-                return static_cast<long long int>(
-                        AtomicAllIncKepler<unsigned long long int>()(
+                template<typename T_Acc, typename T_Hierarchy>
+                HDINLINE T_Type operator()(const T_Acc& acc, T_Type* ptr, const T_Hierarchy& hierarchy)
+                {
+                    const auto mask = alpaka::warp::activemask(acc);
+                    const auto leader = alpaka::ffs(acc, static_cast<std::make_signed_t<decltype(mask)>>(mask)) - 1;
+
+                    T_Type result;
+                    const int laneId = getLaneId();
+                    /* Get the start value for this warp */
+                    if(laneId == leader)
+                        result = ::alpaka::atomicOp<::alpaka::AtomicAdd>(
                             acc,
-                            reinterpret_cast<unsigned long long int*>(ptr),
-                            hierarchy
-                        )
-                );
-            }
-        };
+                            ptr,
+                            static_cast<T_Type>(alpaka::popcount(acc, mask)),
+                            hierarchy);
+                    result = warpBroadcast(result, leader);
+                    /* Add offset per thread */
+                    return result
+                        + static_cast<T_Type>(
+                               alpaka::popcount(acc, mask & ((static_cast<decltype(mask)>(1u) << laneId) - 1u)));
+                }
+            };
 
-        template<typename T_Type>
-        struct AtomicAllInc<T_Type, true>: public AtomicAllIncKepler<T_Type>
-        {};
-#endif /* PMACC_CUDA_ARCH >= 300 */
+            /**
+             * Optimized version for int64.
+             * As CUDA atomicAdd does not support int64 directly we just cast it
+             * and call the uint64 implementation
+             *
+             * The call operator takes (acc, ptr, hierarchy) like every other AtomicAllInc functor.
+             */
+            template<>
+            struct AtomicAllIncKepler<long long int, true>
+            {
+                template<typename T_Acc, typename T_Hierarchy>
+                HDINLINE long long int operator()(const T_Acc& acc, long long int* ptr, const T_Hierarchy& hierarchy)
+                {
+                    // view the counter as unsigned long long int and reuse that implementation;
+                    // the value it returns is converted back to the signed type
+                    return static_cast<long long int>(AtomicAllIncKepler<unsigned long long int>()(
+                        acc,
+                        reinterpret_cast<unsigned long long int*>(ptr),
+                        hierarchy));
+                }
+            };
 
-    }  // namespace detail
+            template<typename T_Type>
+            struct AtomicAllInc<T_Type, true> : public AtomicAllIncKepler<T_Type>
+            {
+            };
+#endif // CUPLA_DEVICE_COMPILE == 1
 
-/** optimized atomic increment
- *
- * - only optimized if PTX ISA >=3.0
- * - this atomic uses warp aggregation to speedup the operation compared to cuda `atomicInc()`
- * - cuda `atomicAdd()` is used if the compute architecture does not support warp aggregation
- * - all participate threads must change the same pointer (ptr) else the result is unspecified
- *
- * @param ptr pointer to memory (must be the same address for all threads in a block)
- *
- */
-template<typename T, typename T_Acc, typename T_Hierarchy>
-HDINLINE
-T atomicAllInc(const T_Acc& acc, T *ptr, const T_Hierarchy& hierarchy)
-{
-    return detail::AtomicAllInc<T, (PMACC_CUDA_ARCH >= 300) >()(acc, ptr, hierarchy);
-}
+        } // namespace detail
 
-template<typename T>
-HDINLINE
-T atomicAllInc(T *ptr)
-{
-#ifdef __CUDA_ARCH__
-    return atomicAllInc(alpaka::atomic::AtomicCudaBuiltIn(), ptr, ::alpaka::hierarchy::Grids());
-#else
-    // assume that we can use the standard library atomics if we are not on gpu
-    return atomicAllInc(alpaka::atomic::AtomicStdLibLock<16>(), ptr, ::alpaka::hierarchy::Grids());
-#endif
-}
+        /** optimized atomic increment
+         *
+         * - uses warp aggregation (if PTX ISA >=3.0 or compiled with HIP) to speed up `atomicInc()`
+         * - cuda `atomicAdd()` is used if the compute architecture does not support warp aggregation
+         * - all participating threads must change the same pointer (ptr) else the result is unspecified
+         *
+         * @param acc alpaka accelerator context
+         * @param ptr pointer to memory (must be the same address for all threads in a block)
+         * @param hierarchy alpaka hierarchy scope for atomics
+         */
+        template<typename T, typename T_Acc, typename T_Hierarchy>
+        HDINLINE T atomicAllInc(const T_Acc& acc, T* ptr, const T_Hierarchy& hierarchy)
+        {
+            return detail::AtomicAllInc<T, (PMACC_CUDA_ARCH >= 300 || BOOST_COMP_HIP)>()(acc, ptr, hierarchy);
+        }
 
-/** optimized atomic value exchange
- *
- * - only optimized if PTX ISA >=2.0
- * - this atomic uses warp vote function to speedup the operation
- *   compared to cuda `atomicExch()`
- * - cuda `atomicExch()` is used if the compute architecture not supports
- *   warps vote functions
- * - all participate threads must change the same
- *   pointer (ptr) and set the same value, else the
- *   result is unspecified
- *
- * @param ptr pointer to memory (must be the same address for all threads in a block)
- * @param value new value (must be the same for all threads in a block)
- */
-template<typename T_Type, typename T_Acc, typename T_Hierarchy>
-DINLINE void
-atomicAllExch(const T_Acc& acc, T_Type* ptr, const T_Type value, const T_Hierarchy& hierarchy)
-{
-#if (__CUDA_ARCH__ >= 200)
-#   if(__CUDACC_VER_MAJOR__ >= 9)
-    const int mask = __activemask();
-#   else
-    const int mask = __ballot(1);
-#   endif
-    // select the leader
-    const int leader = __ffs(mask) - 1;
-    // leader does the update
-    if (getLaneId() == leader)
-#endif
-        ::alpaka::atomic::atomicOp<::alpaka::atomic::op::Exch>(acc, ptr, value, hierarchy);
-}
+        template<typename T>
+        HDINLINE T atomicAllInc(T* ptr)
+        {
+            /* Dirty hack to call an alpaka accelerator based function without having an accelerator.
+             * Members of the fakeAcc will be uninitialized and must not be accessed.
+             * The increment is forwarded with the atomic scope ::alpaka::hierarchy::Grids.
+             * The id provider for particles is the only code where atomicAllInc is used without an accelerator.
+             * @todo remove the unsafe faked accelerator
+             */
+            pmacc::memory::Array<cupla::AccThreadSeq, 1> fakeAcc;
+            return atomicAllInc(fakeAcc[0], ptr, ::alpaka::hierarchy::Grids());
+        }
 
+        /** optimized atomic value exchange
+         *
+         * - in device code (`CUPLA_DEVICE_COMPILE == 1`) only the leader lane of the active warp mask
+         *   performs the exchange, else every calling thread performs it
+         * - this atomic uses warp vote functions to speed up the operation compared to cuda `atomicExch()`
+         * - all participating threads must change the same pointer (ptr) and set the same value,
+         *   else the result is unspecified
+         *
+         * @param acc alpaka accelerator context
+         * @param ptr pointer to memory (must be the same address for all threads in a block)
+         * @param value new value (must be the same for all threads in a block)
+         * @param hierarchy alpaka hierarchy scope for atomics
+         */
+        template<typename T_Type, typename T_Acc, typename T_Hierarchy>
+        DINLINE void atomicAllExch(const T_Acc& acc, T_Type* ptr, const T_Type value, const T_Hierarchy& hierarchy)
+        {
+            const auto mask = alpaka::warp::activemask(acc);
+            // select the leader: ffs gives the 1-based position of the lowest set bit, therefore -1
+            const auto leader = alpaka::ffs(acc, static_cast<std::make_signed_t<decltype(mask)>>(mask)) - 1;
+
+#if CUPLA_DEVICE_COMPILE == 1
+            if(getLaneId() == leader)
+#endif
+                ::alpaka::atomicOp<::alpaka::AtomicExch>(acc, ptr, value, hierarchy);
+        }
 
-} //namespace nvidia
-} //namespace pmacc
+        /** optimized atomic operation without return value
+         *
+         * Executes an alpaka atomic operation but without giving the old value back.
+         * For some backends PMacc is using optimized intrinsics to perform this operation.
+         *
+         * @tparam T_Op atomic alpaka operation type
+         * @tparam T_Acc alpaka accelerator context type
+         * @tparam T_Type value type
+         * @tparam T_Hierarchy alpaka hierarchy type of the atomic operation
+         * @param acc alpaka accelerator context
+         * @param ptr pointer to memory
+         * @param value input value
+         * @param hierarchy alpaka hierarchy scope for atomics
+         */
+        template<typename T_Op, typename T_Acc, typename T_Type, typename T_Hierarchy>
+        DINLINE void atomicOpNoRet(T_Acc const& acc, T_Type* ptr, T_Type const value, T_Hierarchy const& hierarchy)
+        {
+            return detail::AtomicOpNoRet<T_Op, T_Acc, T_Type, T_Hierarchy>{}(acc, ptr, value, hierarchy);
+        }
+    } // namespace nvidia
+} // namespace pmacc
diff --git a/include/pmacc/nvidia/functors/Add.hpp b/include/pmacc/nvidia/functors/Add.hpp
index ee4b391857..52a0d13b58 100644
--- a/include/pmacc/nvidia/functors/Add.hpp
+++ b/include/pmacc/nvidia/functors/Add.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera, Benjamin Worpitz
+/* Copyright 2013-2021 Heiko Burau, Rene Widera, Benjamin Worpitz
  *
  * This file is part of PMacc.
  *
@@ -26,36 +26,36 @@
 
 namespace pmacc
 {
-namespace nvidia
-{
-namespace functors
-{
-    struct Add
+    namespace nvidia
     {
-        template<typename Dst, typename Src >
-        HDINLINE void operator()(Dst & dst, const Src & src) const
+        namespace functors
         {
-            dst += src;
-        }
+            struct Add
+            {
+                template<typename Dst, typename Src>
+                HDINLINE void operator()(Dst& dst, const Src& src) const
+                {
+                    dst += src;
+                }
 
-        template<typename Dst, typename Src, typename T_Acc >
-        HDINLINE void operator()(const T_Acc &, Dst & dst, const Src & src) const
-        {
-            dst += src;
-        }
-    };
-} // namespace functors
-} // namespace nvidia
+                template<typename Dst, typename Src, typename T_Acc>
+                HDINLINE void operator()(const T_Acc&, Dst& dst, const Src& src) const
+                {
+                    dst += src;
+                }
+            };
+        } // namespace functors
+    } // namespace nvidia
 } // namespace pmacc
 
 namespace pmacc
 {
-namespace mpi
-{
-    template<>
-    HINLINE MPI_Op getMPI_Op<pmacc::nvidia::functors::Add>()
+    namespace mpi
     {
-        return MPI_SUM;
-    }
-} // namespace mpi
+        template<>
+        HINLINE MPI_Op getMPI_Op<pmacc::nvidia::functors::Add>()
+        {
+            return MPI_SUM;
+        }
+    } // namespace mpi
 } // namespace pmacc
diff --git a/include/pmacc/nvidia/functors/Assign.hpp b/include/pmacc/nvidia/functors/Assign.hpp
index 796e02c65c..18cc959784 100644
--- a/include/pmacc/nvidia/functors/Assign.hpp
+++ b/include/pmacc/nvidia/functors/Assign.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera, Benjamin Worpitz
+/* Copyright 2013-2021 Heiko Burau, Rene Widera, Benjamin Worpitz
  *
  * This file is part of PMacc.
  *
@@ -25,24 +25,24 @@
 
 namespace pmacc
 {
-namespace nvidia
-{
-namespace functors
-{
-    struct Assign
+    namespace nvidia
     {
-        template<typename Dst, typename Src >
-        HDINLINE void operator()(Dst & dst, const Src & src) const
+        namespace functors
         {
-            dst = src;
-        }
+            struct Assign
+            {
+                template<typename Dst, typename Src>
+                HDINLINE void operator()(Dst& dst, const Src& src) const
+                {
+                    dst = src;
+                }
 
-        template<typename Dst, typename Src, typename T_Acc >
-        HDINLINE void operator()(const T_Acc &, Dst & dst, const Src & src) const
-        {
-            dst = src;
-        }
-    };
-} // namespace functors
-} // namespace nvidia
+                template<typename Dst, typename Src, typename T_Acc>
+                HDINLINE void operator()(const T_Acc&, Dst& dst, const Src& src) const
+                {
+                    dst = src;
+                }
+            };
+        } // namespace functors
+    } // namespace nvidia
 } // namespace pmacc
diff --git a/include/pmacc/nvidia/functors/Atomic.hpp b/include/pmacc/nvidia/functors/Atomic.hpp
new file mode 100644
index 0000000000..7969fdb1cc
--- /dev/null
+++ b/include/pmacc/nvidia/functors/Atomic.hpp
@@ -0,0 +1,74 @@
+/* Copyright 2020-2021 Rene Widera
+ *
+ * This file is part of PMacc.
+ *
+ * PMacc is free software: you can redistribute it and/or modify
+ * it under the terms of either the GNU General Public License or
+ * the GNU Lesser General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * PMacc is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License and the GNU Lesser General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * and the GNU Lesser General Public License along with PMacc.
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "pmacc/types.hpp"
+
+#include "pmacc/nvidia/atomic.hpp"
+
+namespace pmacc
+{
+    namespace nvidia
+    {
+        namespace functors
+        {
+            /** Generic atomic operation on a destination and a source value
+             *
+             * @tparam T_AlpakaOperation alpaka atomic operation [::alpaka::op]
+             * @tparam T_AlpakaHierarchy alpaka atomic hierarchy [::alpaka::hierarchy]
+             */
+            template<typename T_AlpakaOperation, typename T_AlpakaHierarchy = ::alpaka::hierarchy::Grids>
+            struct Atomic
+            {
+                /** Execute generic atomic operation: dst is updated with src, the old value is not returned */
+                template<typename T_Acc, typename T_Dst, typename T_Src>
+                HDINLINE void operator()(T_Acc const& acc, T_Dst& dst, T_Src const& src) const
+                {
+                    atomicOpNoRet<T_AlpakaOperation>(acc, &dst, src, T_AlpakaHierarchy{});
+                }
+
+                /** Execute the atomic operation component-wise for pmacc::math::Vector */
+                template<
+                    typename T_Acc,
+                    typename T_Type,
+                    int T_dim,
+                    typename T_DstAccessor,
+                    typename T_DstNavigator,
+                    template<typename, int>
+                    class T_DstStorage,
+                    typename T_SrcAccessor,
+                    typename T_SrcNavigator,
+                    template<typename, int>
+                    class T_SrcStorage>
+                HDINLINE void operator()(
+                    T_Acc const& acc,
+                    pmacc::math::Vector<T_Type, T_dim, T_DstAccessor, T_DstNavigator, T_DstStorage>& dst,
+                    pmacc::math::Vector<T_Type, T_dim, T_SrcAccessor, T_SrcNavigator, T_SrcStorage> const& src) const
+                {
+                    for(int i = 0; i < T_dim; ++i)
+                        atomicOpNoRet<T_AlpakaOperation>(acc, &dst[i], src[i], T_AlpakaHierarchy{});
+                }
+            };
+
+        } // namespace functors
+    } // namespace nvidia
+} // namespace pmacc
diff --git a/include/pmacc/nvidia/functors/Max.hpp b/include/pmacc/nvidia/functors/Max.hpp
index 355fd8ddc1..47dd4c5cbd 100644
--- a/include/pmacc/nvidia/functors/Max.hpp
+++ b/include/pmacc/nvidia/functors/Max.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera, Benjamin Worpitz
+/* Copyright 2013-2021 Heiko Burau, Rene Widera, Benjamin Worpitz
  *
  * This file is part of PMacc.
  *
@@ -28,36 +28,36 @@
 
 namespace pmacc
 {
-namespace nvidia
-{
-namespace functors
-{
-    struct Max
+    namespace nvidia
     {
-        template<typename Dst, typename Src >
-        DINLINE void operator()(Dst & dst, const Src & src) const
+        namespace functors
         {
-            dst = algorithms::math::max(dst, src);
-        }
+            struct Max
+            {
+                template<typename Dst, typename Src>
+                DINLINE void operator()(Dst& dst, const Src& src) const
+                {
+                    dst = math::max(dst, src);
+                }
 
-        template<typename Dst, typename Src, typename T_Acc >
-        DINLINE void operator()(const T_Acc &, Dst & dst, const Src & src) const
-        {
-            dst = algorithms::math::max(dst, src);
-        }
-    };
-} // namespace functors
-} // namespace nvidia
+                template<typename Dst, typename Src, typename T_Acc>
+                DINLINE void operator()(const T_Acc&, Dst& dst, const Src& src) const
+                {
+                    dst = math::max(dst, src);
+                }
+            };
+        } // namespace functors
+    } // namespace nvidia
 } // namespace pmacc
 
 namespace pmacc
 {
-namespace mpi
-{
-    template<>
-    HINLINE MPI_Op getMPI_Op<pmacc::nvidia::functors::Max>()
+    namespace mpi
     {
-        return MPI_MAX;
-    }
-} // namespace mpi
+        template<>
+        HINLINE MPI_Op getMPI_Op<pmacc::nvidia::functors::Max>()
+        {
+            return MPI_MAX;
+        }
+    } // namespace mpi
 } // namespace pmacc
diff --git a/include/pmacc/nvidia/functors/Min.hpp b/include/pmacc/nvidia/functors/Min.hpp
index 6431843a55..dfec74251e 100644
--- a/include/pmacc/nvidia/functors/Min.hpp
+++ b/include/pmacc/nvidia/functors/Min.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera, Benjamin Worpitz
+/* Copyright 2013-2021 Heiko Burau, Rene Widera, Benjamin Worpitz
  *
  * This file is part of PMacc.
  *
@@ -29,36 +29,36 @@
 
 namespace pmacc
 {
-namespace nvidia
-{
-namespace functors
-{
-    struct Min
+    namespace nvidia
     {
-        template<typename Dst, typename Src >
-        DINLINE void operator()(Dst & dst, const Src & src) const
+        namespace functors
         {
-            dst = algorithms::math::max(dst, src);
-        }
+            struct Min
+            {
+                template<typename Dst, typename Src>
+                DINLINE void operator()(Dst& dst, const Src& src) const
+                {
+                    dst = math::min(dst, src);
+                }
 
-        template<typename Dst, typename Src, typename T_Acc >
-        DINLINE void operator()(const T_Acc &, Dst & dst, const Src & src) const
-        {
-            dst = algorithms::math::max(dst, src);
-        }
-    };
-} // namespace functors
-} // namespace nvidia
+                template<typename Dst, typename Src, typename T_Acc>
+                DINLINE void operator()(const T_Acc&, Dst& dst, const Src& src) const
+                {
+                    dst = math::min(dst, src);
+                }
+            };
+        } // namespace functors
+    } // namespace nvidia
 } // namespace pmacc
 
 namespace pmacc
 {
-namespace mpi
-{
-    template<>
-    HINLINE MPI_Op getMPI_Op<pmacc::nvidia::functors::Min>()
+    namespace mpi
     {
-        return MPI_MIN;
-    }
-} // namespace mpi
+        template<>
+        HINLINE MPI_Op getMPI_Op<pmacc::nvidia::functors::Min>()
+        {
+            return MPI_MIN;
+        }
+    } // namespace mpi
 } // namespace pmacc
diff --git a/include/pmacc/nvidia/functors/Mul.hpp b/include/pmacc/nvidia/functors/Mul.hpp
index 8a67aa192b..8ccafd42ee 100644
--- a/include/pmacc/nvidia/functors/Mul.hpp
+++ b/include/pmacc/nvidia/functors/Mul.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2014-2020 Axel Huebl
+/* Copyright 2014-2021 Axel Huebl
  *
  * This file is part of PMacc.
  *
@@ -26,38 +26,36 @@
 
 namespace pmacc
 {
-namespace nvidia
-{
-namespace functors
-{
-    struct Mul
+    namespace nvidia
     {
-        template<typename Dst, typename Src>
-        HDINLINE void
-        operator()( Dst& dst, const Src& src ) const
+        namespace functors
         {
-            dst *= src;
-        }
+            struct Mul
+            {
+                template<typename Dst, typename Src>
+                HDINLINE void operator()(Dst& dst, const Src& src) const
+                {
+                    dst *= src;
+                }
 
-        template<typename Dst, typename Src, typename T_Acc>
-        HDINLINE void
-        operator()( const T_Acc &, Dst& dst, const Src& src ) const
-        {
-            dst *= src;
-        }
-    };
-} // namespace functors
-} // namespace nvidia
+                template<typename Dst, typename Src, typename T_Acc>
+                HDINLINE void operator()(const T_Acc&, Dst& dst, const Src& src) const
+                {
+                    dst *= src;
+                }
+            };
+        } // namespace functors
+    } // namespace nvidia
 } // namespace pmacc
 
 namespace pmacc
 {
-namespace mpi
-{
-    template<>
-    HINLINE MPI_Op getMPI_Op<pmacc::nvidia::functors::Mul>()
+    namespace mpi
     {
-        return MPI_PROD;
-    }
-} // namespace mpi
+        template<>
+        HINLINE MPI_Op getMPI_Op<pmacc::nvidia::functors::Mul>()
+        {
+            return MPI_PROD;
+        }
+    } // namespace mpi
 } // namespace pmacc
diff --git a/include/pmacc/nvidia/functors/Sub.hpp b/include/pmacc/nvidia/functors/Sub.hpp
index 9b62ad0c86..cc0822efe4 100644
--- a/include/pmacc/nvidia/functors/Sub.hpp
+++ b/include/pmacc/nvidia/functors/Sub.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2014-2020 Axel Huebl
+/* Copyright 2014-2021 Axel Huebl
  *
  * This file is part of PMacc.
  *
@@ -25,26 +25,24 @@
 
 namespace pmacc
 {
-namespace nvidia
-{
-namespace functors
-{
-    struct Sub
+    namespace nvidia
     {
-        template<typename Dst, typename Src>
-        HDINLINE void
-        operator()( Dst& dst, const Src& src ) const
+        namespace functors
         {
-            dst -= src;
-        }
+            struct Sub
+            {
+                template<typename Dst, typename Src>
+                HDINLINE void operator()(Dst& dst, const Src& src) const
+                {
+                    dst -= src;
+                }
 
-        template<typename Dst, typename Src, typename T_Acc>
-        HDINLINE void
-        operator()( const T_Acc &, Dst& dst, const Src& src ) const
-        {
-            dst -= src;
-        }
-    };
-} // namespace functors
-} // namespace nvidia
+                template<typename Dst, typename Src, typename T_Acc>
+                HDINLINE void operator()(const T_Acc&, Dst& dst, const Src& src) const
+                {
+                    dst -= src;
+                }
+            };
+        } // namespace functors
+    } // namespace nvidia
 } // namespace pmacc
diff --git a/include/pmacc/nvidia/gpuEntryFunction.hpp b/include/pmacc/nvidia/gpuEntryFunction.hpp
index e5b44a133a..716f490b97 100644
--- a/include/pmacc/nvidia/gpuEntryFunction.hpp
+++ b/include/pmacc/nvidia/gpuEntryFunction.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2016-2020 Felix Rene Widera
+/* Copyright 2016-2021 Felix Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -28,32 +28,25 @@
 
 namespace pmacc
 {
-namespace nvidia
-{
-
-    /**
-     *
-     * @tparam T_KernelFunctor type of the functor for device execution
-    */
-    template< typename T_KernelFunctor >
-    struct PMaccKernel
+    namespace nvidia
     {
         /**
          *
-         * @param acc functor for device execution
-         * @param args arguments for the functor
+         * @tparam T_KernelFunctor type of the functor for device execution
          */
-        template<
-            typename T_Acc,
-            typename ... T_Args
-        >
-        DINLINE void operator()(
-            T_Acc const acc,
-            T_Args ... args
-        ) const
+        template<typename T_KernelFunctor>
+        struct PMaccKernel
         {
-            T_KernelFunctor{}( acc, args ... );
-        }
-    };
-} //namespace nvidia
-} //namespace pmacc
+            /** Call a default constructed T_KernelFunctor with the given arguments
+             *
+             * @param acc accelerator (T_Acc), passed as first argument to the functor
+             * @param args arguments for the functor
+             */
+            template<typename T_Acc, typename... T_Args>
+            DINLINE void operator()(T_Acc const acc, T_Args... args) const
+            {
+                T_KernelFunctor{}(acc, args...);
+            }
+        };
+    } // namespace nvidia
+} // namespace pmacc
diff --git a/include/pmacc/nvidia/memory/MemoryInfo.hpp b/include/pmacc/nvidia/memory/MemoryInfo.hpp
index c87152f95a..a4af3ee54c 100644
--- a/include/pmacc/nvidia/memory/MemoryInfo.hpp
+++ b/include/pmacc/nvidia/memory/MemoryInfo.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Felix Schmitt, Rene Widera
+/* Copyright 2013-2021 Felix Schmitt, Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -28,106 +28,99 @@
 
 namespace pmacc
 {
-
-namespace nvidia
-{
-namespace memory
-{
-
-/**
- * Provides convenience methods for querying memory information.
- * Singleton class.
- */
-class MemoryInfo
-{
-public:
-    /**
-     * Returns information about device memory.
-     *
-     * @param free amount of free memory in bytes. can be nullptr
-     * @param total total amount of memory in bytes. can be nullptr. (nullptr by default)
-     */
-    void getMemoryInfo(size_t *free, size_t *total = nullptr)
+    namespace nvidia
     {
-        size_t freeInternal = 0;
-        size_t totalInternal = 0;
-
-        CUDA_CHECK(cudaMemGetInfo(&freeInternal, &totalInternal));
-
-        if (free != nullptr)
-        {
-            if (reservedMem > freeInternal)
-                freeInternal = 0;
-            else
-                freeInternal -= reservedMem;
-
-            *free = freeInternal;
-        }
-        if (total != nullptr)
+        namespace memory
         {
-            if (reservedMem > totalInternal)
-                totalInternal = 0;
-            else
-                totalInternal -= reservedMem;
-
-            *total = totalInternal;
-        }
-    }
-
-    /** Returns true if the memory pool is shared by host and device */
-    bool isSharedMemoryPool()
-    {
-#if( PMACC_CUDA_ENABLED != 1 )
-        return true;
+            /**
+             * Provides convenience methods for querying memory information.
+             * Singleton class.
+             */
+            class MemoryInfo
+            {
+            public:
+                /**
+                 * Returns information about device memory, reduced by the reserved memory (clamped at zero).
+                 *
+                 * @param[out] free amount of free memory in bytes. can be nullptr
+                 * @param[out] total total amount of memory in bytes. can be nullptr. (nullptr by default)
+                 */
+                void getMemoryInfo(size_t* free, size_t* total = nullptr)
+                {
+                    size_t freeInternal = 0;
+                    size_t totalInternal = 0;
+
+                    CUDA_CHECK(cuplaMemGetInfo(&freeInternal, &totalInternal));
+
+                    if(free != nullptr)
+                    {
+                        if(reservedMem > freeInternal)
+                            freeInternal = 0;
+                        else
+                            freeInternal -= reservedMem;
+
+                        *free = freeInternal;
+                    }
+                    if(total != nullptr)
+                    {
+                        if(reservedMem > totalInternal)
+                            totalInternal = 0;
+                        else
+                            totalInternal -= reservedMem;
+
+                        *total = totalInternal;
+                    }
+                }
+
+                /** Returns true if the memory pool is shared by host and device */
+                bool isSharedMemoryPool()
+                {
+#if(PMACC_CUDA_ENABLED != 1 && ALPAKA_ACC_GPU_HIP_ENABLED != 1)
+                    return true;
 #else
-        size_t freeInternal = 0;
-        size_t freeAtStart = 0;
+                    size_t freeInternal = 0;
+                    size_t freeAtStart = 0;
 
-        getMemoryInfo(&freeAtStart);
+                    getMemoryInfo(&freeAtStart);
 
-        /* alloc 90%, since allocating 100% is a bit risky on a SoC-like device */
-        size_t allocSth = size_t( 0.9 * double(freeAtStart) );
-        uint8_t* c = new uint8_t[allocSth];
-        memset(c, 0, allocSth);
+                    /* alloc 90%, since allocating 100% is a bit risky on a SoC-like device */
+                    size_t allocSth = size_t(0.9 * double(freeAtStart));
+                    uint8_t* c = new uint8_t[allocSth];
+                    memset(c, 0, allocSth);
 
-        getMemoryInfo(&freeInternal);
-        delete [] c;
+                    getMemoryInfo(&freeInternal);
+                    delete[] c;
 
-        /* if we allocated 90% of available mem, we should have "lost" more
-         * than 50% of memory, even with fluctuations from the OS */
-        if( double(freeInternal)/double(freeAtStart) < 0.5 )
-            return true;
+                    /* if we allocated 90% of available mem, we should have "lost" more
+                     * than 50% of memory, even with fluctuations from the OS */
+                    if(double(freeInternal) / double(freeAtStart) < 0.5)
+                        return true;
 
-        return false;
+                    return false;
 #endif
-    }
-
-    void setReservedMemory(size_t reservedMem)
-    {
-        this->reservedMem = reservedMem;
-    }
-
-protected:
-    size_t reservedMem;
-
-private:
-
-    friend struct detail::Environment;
-
-    static MemoryInfo& getInstance()
-    {
-        static MemoryInfo instance;
-        return instance;
-    }
-
-    MemoryInfo() :
-    reservedMem(0)
-    {
-
-    }
-};
-} //namespace memory
-} //namespace nvidia
-} //namespace pmacc
-
-
+                }
+
+                void setReservedMemory(size_t reservedMem)
+                {
+                    this->reservedMem = reservedMem;
+                }
+
+            protected:
+                size_t reservedMem;
+
+            private:
+                friend struct detail::Environment;
+
+                static MemoryInfo& getInstance()
+                {
+                    static MemoryInfo instance;
+                    return instance;
+                }
+
+                MemoryInfo() : reservedMem(0)
+                {
+                }
+            };
+        } // namespace memory
+    } // namespace nvidia
+} // namespace pmacc
diff --git a/include/pmacc/nvidia/reduce/Reduce.hpp b/include/pmacc/nvidia/reduce/Reduce.hpp
index 1b3448aef7..4d6fcc0a38 100644
--- a/include/pmacc/nvidia/reduce/Reduce.hpp
+++ b/include/pmacc/nvidia/reduce/Reduce.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Heiko Burau, Rene Widera, Benjamin Worpitz,
+/* Copyright 2013-2021 Axel Huebl, Heiko Burau, Rene Widera, Benjamin Worpitz,
  *                     Alexander Grund
  *
  * This file is part of PMacc.
@@ -37,551 +37,440 @@
 
 namespace pmacc
 {
-namespace nvidia
-{
-namespace reduce
-{
-
-namespace kernel
-{
-    /** reduce elements within a buffer
-     *
-     * @tparam type element type within the buffer
-     * @tparam T_blockSize minimum number of elements which will be reduced
-     *                     within a CUDA block
-     * @tparam T_numWorkers number of workers
-     */
-    template<
-        typename Type,
-        uint32_t T_blockSize,
-        uint32_t T_numWorkers
-    >
-    struct Reduce
+    namespace nvidia
     {
-
-        /** reduce buffer
-         *
-         * This method can be used to reduce a chunk of an array.
-         * This method is a **collective** method and needs to be called by all
-         * threads within a CUDA block.
-         *
-         * @tparam T_SrcBuffer type of the buffer
-         * @tparam T_DestBuffer type of result buffer
-         * @tparam T_Functor type of the binary functor to reduce two elements to the intermediate buffer
-         * @tparam T_DestFunctor type of the binary functor to reduce two elements to @destBuffer
-         * @tparam T_Acc alpaka accelerator type
-         *
-         * @param acc alpaka accelerator
-         * @param srcBuffer a class or a pointer with the `operator[](size_t)` (one dimensional access)
-         * @param bufferSize number of elements in @p srcBuffer
-         * @param destBuffer a class or a pointer with the `operator[](size_t)` (one dimensional access),
-         *        number of elements within the buffer must be at least one
-         * @param func binary functor for reduce which takes two arguments,
-         *        first argument is the source and get the new reduced value.
-         * @param destFunc binary functor for reduce which takes two arguments,
-         *        first argument is the source and get the new reduced value.
-         *
-         * @result void intermediate results are stored in @destBuffer,
-         *         the final result is stored in the first slot of @destBuffer
-         *         if the operator is called with one CUDA block
-         */
-        template<
-            typename T_SrcBuffer,
-            typename T_DestBuffer,
-            typename T_Functor,
-            typename T_DestFunctor,
-            typename T_Acc
-        >
-        DINLINE void operator()(
-            T_Acc const & acc,
-            T_SrcBuffer const & srcBuffer,
-            uint32_t const bufferSize,
-            T_DestBuffer destBuffer,
-            T_Functor func,
-            T_DestFunctor destFunc
-        ) const
-        {
-            using namespace mappings::threads;
-
-            constexpr uint32_t numWorkers = T_numWorkers;
-            uint32_t const workerIdx = threadIdx.x;
-
-            uint32_t const numGlobalVirtualThreadCount = gridDim.x * T_blockSize;
-            WorkerCfg< numWorkers > workerCfg( workerIdx );
-
-            sharedMemExtern(s_mem,Type);
-
-            this->operator()(
-                acc,
-                workerCfg,
-                numGlobalVirtualThreadCount,
-                srcBuffer,
-                bufferSize,
-                func,
-                s_mem,
-                blockIdx.x
-            );
-
-            using MasterOnly = IdxConfig<
-                1,
-                numWorkers
-            >;
-
-            ForEachIdx< MasterOnly >{ workerIdx }(
-                [&](
-                    uint32_t const,
-                    uint32_t const
-                )
-                {
-                    destFunc(
-                        acc,
-                        destBuffer[ blockIdx.x ],
-                        s_mem[ 0 ]
-                    );
-                }
-            );
-        }
-
-        /** reduce a buffer
-         *
-         * This method can be used to reduce a chunk of an array.
-         * This method is a **collective** method and needs to be called by all
-         * threads within a cuda block.
-         *
-         * @tparam T_SrcBuffer type of the buffer
-         * @tparam T_Functor type of the binary functor to reduce two elements
-         * @tparam T_SharedBuffer type of the shared memory buffer
-         * @tparam T_WorkerCfg worker configuration type
-         * @tparam T_Acc alpaka accelerator type
-         *
-         * @param acc alpaka accelerator
-         * @param workerCfg lockstep worker configuration
-         * @param numReduceThreads Number of threads which working together to reduce the array.
-         *                         For a reduction within a block the value must be equal to T_blockSize
-         * @param srcBuffer a class or a pointer with the `operator[](size_t)` (one dimensional access)
-         * @param bufferSize number of elements in @p srcBuffer
-         * @param func binary functor for reduce which takes two arguments,
-         *        first argument is the source and get the new reduced value.
-         * @param sharedMem shared memory buffer with storage for `linearThreadIdxInBlock` elements,
-         *        buffer must implement `operator[](size_t)` (one dimensional access)
-         * @param blockIndex index of the cuda block,
-         *                   for a global reduce: `blockIdx.x`,
-         *                   for a reduce within a block: `0`
-         *
-         * @result void the result is stored in the first slot of @p sharedMem
-         */
-        template<
-            typename T_SrcBuffer,
-            typename T_Functor,
-            typename T_SharedBuffer,
-            typename T_WorkerCfg,
-            typename T_Acc
-        >
-        DINLINE void
-        operator()(
-            T_Acc const & acc,
-            T_WorkerCfg const workerCfg,
-            size_t const numReduceThreads,
-            T_SrcBuffer const & srcBuffer,
-            size_t const bufferSize,
-            T_Functor const & func,
-            T_SharedBuffer & sharedMem,
-            size_t const blockIndex = 0u
-        ) const
+        namespace reduce
         {
-            using namespace mappings::threads;
-
-            using VirtualWorkerCfg = IdxConfig<
-                T_blockSize,
-                T_WorkerCfg::numWorkers
-            >;
-
-            pmacc::memory::CtxArray<
-                uint32_t,
-                VirtualWorkerCfg
-            >
-            linearReduceThreadIdxCtx(
-                workerCfg.getWorkerIdx( ),
-                [&](
-                    uint32_t const linearIdx,
-                    uint32_t const
-                )
-                {
-                    return blockIndex * T_blockSize  + linearIdx;
-                }
-            );
-
-            pmacc::memory::CtxArray<
-                bool,
-                VirtualWorkerCfg
-            >
-            isActiveCtx(
-                workerCfg.getWorkerIdx(),
-                [&](
-                    uint32_t const,
-                    uint32_t const idx
-                )
+            namespace kernel
+            {
+                /** reduce elements within a buffer
+                 *
+                 * @tparam Type element type within the buffer
+                 * @tparam T_blockSize minimum number of elements which will be reduced
+                 *                     within a CUDA block
+                 * @tparam T_numWorkers number of workers
+                 */
+                template<typename Type, uint32_t T_blockSize, uint32_t T_numWorkers>
+                struct Reduce
                 {
-                    return linearReduceThreadIdxCtx[ idx ] < bufferSize;
-                }
-            );
+                    /** reduce buffer
+                     *
+                     * This method can be used to reduce a chunk of an array.
+                     * This method is a **collective** method and needs to be called by all
+                     * threads within a CUDA block.
+                     *
+                     * @tparam T_SrcBuffer type of the buffer
+                     * @tparam T_DestBuffer type of result buffer
+                     * @tparam T_Functor type of the binary functor to reduce two elements to the intermediate buffer
+                     * @tparam T_DestFunctor type of the binary functor to reduce two elements to @p destBuffer
+                     * @tparam T_Acc alpaka accelerator type
+                     *
+                     * @param acc alpaka accelerator
+                     * @param srcBuffer a class or a pointer with the `operator[](size_t)` (one dimensional access)
+                     * @param bufferSize number of elements in @p srcBuffer
+                     * @param destBuffer a class or a pointer with the `operator[](size_t)` (one dimensional access),
+                     *        number of elements within the buffer must be at least one
+                     * @param func binary functor for reduce which takes two arguments,
+                     *        first argument is the source and gets the new reduced value.
+                     * @param destFunc binary functor for reduce which takes two arguments,
+                     *        first argument is the source and gets the new reduced value.
+                     *
+                     * @result void intermediate results are stored in @p destBuffer,
+                     *         the final result is stored in the first slot of @p destBuffer
+                     *         if the operator is called with one CUDA block
+                     */
+                    template<
+                        typename T_SrcBuffer,
+                        typename T_DestBuffer,
+                        typename T_Functor,
+                        typename T_DestFunctor,
+                        typename T_Acc>
+                    DINLINE void operator()(
+                        T_Acc const& acc,
+                        T_SrcBuffer const& srcBuffer,
+                        uint32_t const bufferSize,
+                        T_DestBuffer destBuffer,
+                        T_Functor func,
+                        T_DestFunctor destFunc) const
+                    {
+                        using namespace mappings::threads;
 
-            ForEachIdx< VirtualWorkerCfg > forEachVirtualThread( workerCfg.getWorkerIdx() );
+                        constexpr uint32_t numWorkers = T_numWorkers;
+                        uint32_t const workerIdx = cupla::threadIdx(acc).x;
 
-            forEachVirtualThread(
-                [&](
-                    uint32_t const linearIdx,
-                    uint32_t const idx
-                )
-                {
-                    if( isActiveCtx[ idx ] )
+                        uint32_t const numGlobalVirtualThreadCount = cupla::gridDim(acc).x * T_blockSize;
+                        WorkerCfg<numWorkers> workerCfg(workerIdx);
+
+                        sharedMemExtern(s_mem, Type);
+
+                        this->operator()(
+                            acc,
+                            workerCfg,
+                            numGlobalVirtualThreadCount,
+                            srcBuffer,
+                            bufferSize,
+                            func,
+                            s_mem,
+                            cupla::blockIdx(acc).x);
+
+                        using MasterOnly = IdxConfig<1, numWorkers>;
+
+                        ForEachIdx<MasterOnly>{workerIdx}([&](uint32_t const, uint32_t const) {
+                            destFunc(acc, destBuffer[cupla::blockIdx(acc).x], s_mem[0]);
+                        });
+                    }
+
+                    /** reduce a buffer
+                     *
+                     * This method can be used to reduce a chunk of an array.
+                     * This method is a **collective** method and needs to be called by all
+                     * threads within a cupla block.
+                     *
+                     * @tparam T_SrcBuffer type of the buffer
+                     * @tparam T_Functor type of the binary functor to reduce two elements
+                     * @tparam T_SharedBuffer type of the shared memory buffer
+                     * @tparam T_WorkerCfg worker configuration type
+                     * @tparam T_Acc alpaka accelerator type
+                     *
+                     * @param acc alpaka accelerator
+                     * @param workerCfg lockstep worker configuration
+                     * @param numReduceThreads Number of threads which work together to reduce the array.
+                     *                         For a reduction within a block the value must be equal to T_blockSize
+                     * @param srcBuffer a class or a pointer with the `operator[](size_t)` (one dimensional access)
+                     * @param bufferSize number of elements in @p srcBuffer
+                     * @param func binary functor for reduce which takes two arguments,
+                     *        first argument is the source and gets the new reduced value.
+                     * @param sharedMem shared memory buffer with storage for `linearThreadIdxInBlock` elements,
+                     *        buffer must implement `operator[](size_t)` (one dimensional access)
+                     * @param blockIndex index of the cupla block,
+                     *                   for a global reduce: `cupla::blockIdx(acc).x`,
+                     *                   for a reduce within a block: `0`
+                     *
+                     * @result void the result is stored in the first slot of @p sharedMem
+                     */
+                    template<
+                        typename T_SrcBuffer,
+                        typename T_Functor,
+                        typename T_SharedBuffer,
+                        typename T_WorkerCfg,
+                        typename T_Acc>
+                    DINLINE void operator()(
+                        T_Acc const& acc,
+                        T_WorkerCfg const workerCfg,
+                        size_t const numReduceThreads,
+                        T_SrcBuffer const& srcBuffer,
+                        size_t const bufferSize,
+                        T_Functor const& func,
+                        T_SharedBuffer& sharedMem,
+                        size_t const blockIndex = 0u) const
                     {
-                        /*fill shared mem*/
-                        Type r_value = srcBuffer[ linearReduceThreadIdxCtx[ idx ] ];
-                        /*reduce not read global memory to shared*/
-                        uint32_t i = linearReduceThreadIdxCtx[ idx ] + numReduceThreads;
-                        while( i < bufferSize )
+                        using namespace mappings::threads;
+
+                        using VirtualWorkerCfg = IdxConfig<T_blockSize, T_WorkerCfg::numWorkers>;
+
+                        pmacc::memory::CtxArray<uint32_t, VirtualWorkerCfg> linearReduceThreadIdxCtx(
+                            workerCfg.getWorkerIdx(),
+                            [&](uint32_t const linearIdx, uint32_t const) {
+                                return blockIndex * T_blockSize + linearIdx;
+                            });
+
+                        pmacc::memory::CtxArray<bool, VirtualWorkerCfg> isActiveCtx(
+                            workerCfg.getWorkerIdx(),
+                            [&](uint32_t const, uint32_t const idx) {
+                                return linearReduceThreadIdxCtx[idx] < bufferSize;
+                            });
+
+                        ForEachIdx<VirtualWorkerCfg> forEachVirtualThread(workerCfg.getWorkerIdx());
+
+                        forEachVirtualThread([&](uint32_t const linearIdx, uint32_t const idx) {
+                            if(isActiveCtx[idx])
+                            {
+                                /*fill shared mem*/
+                                Type r_value = srcBuffer[linearReduceThreadIdxCtx[idx]];
+                                /*fold the remaining, not yet read global memory elements into r_value*/
+                                uint32_t i = linearReduceThreadIdxCtx[idx] + numReduceThreads;
+                                while(i < bufferSize)
+                                {
+                                    func(acc, r_value, srcBuffer[i]);
+                                    i += numReduceThreads;
+                                }
+                                sharedMem[linearIdx] = r_value;
+                            }
+                        });
+
+                        cupla::__syncthreads(acc);
+                        /*now reduce shared memory*/
+                        uint32_t chunk_count = T_blockSize;
+
+                        while(chunk_count != 1u)
                         {
-                            func(
-                                acc,
-                                r_value,
-                                srcBuffer[ i ]
-                            );
-                            i += numReduceThreads;
+                            /* Half number of chunks (rounded down) */
+                            uint32_t active_threads = chunk_count / 2u;
+
+                            /* New chunks is half number of chunks rounded up for uneven counts
+                             * --> linearThreadIdxInBlock == 0 will reduce the single element for
+                             * an odd number of values at the end
+                             */
+                            chunk_count = (chunk_count + 1u) / 2u;
+
+                            forEachVirtualThread([&](uint32_t const linearIdx, uint32_t const idx) {
+                                isActiveCtx[idx] = (linearReduceThreadIdxCtx[idx] < bufferSize)
+                                    && !(linearIdx != 0u && linearIdx >= active_threads);
+                                if(isActiveCtx[idx])
+                                    func(acc, sharedMem[linearIdx], sharedMem[linearIdx + chunk_count]);
+
+                                cupla::__syncthreads(acc);
+                            });
                         }
-                        sharedMem[ linearIdx ] = r_value;
                     }
-                }
-            );
-
-            __syncthreads( );
-            /*now reduce shared memory*/
-            uint32_t chunk_count = T_blockSize;
+                };
+            } // namespace kernel
 
-            while( chunk_count != 1u )
+            class Reduce
             {
-                /* Half number of chunks (rounded down) */
-                uint32_t active_threads = chunk_count / 2u;
-
-                /* New chunks is half number of chunks rounded up for uneven counts
-                 * --> linearThreadIdxInBlock == 0 will reduce the single element for
-                 * an odd number of values at the end
+            public:
+                /* Constructor
+                 * Don't create an instance before you have set your cupla device!
+                 * @param byte how many bytes in global gpu memory can be reserved for the reduce algorithm
+                 * @param sharedMemByte limit the usage of shared memory per block on gpu
                  */
-                chunk_count = ( chunk_count + 1u ) / 2u;
+                HINLINE Reduce(const uint32_t byte, const uint32_t sharedMemByte = 4 * 1024)
+                    : byte(byte)
+                    , sharedMemByte(sharedMemByte)
+                    , reduceBuffer(nullptr)
+                {
+                    reduceBuffer = new GridBuffer<char, DIM1>(DataSpace<DIM1>(byte));
+                }
 
-                forEachVirtualThread(
-                    [&](
-                        uint32_t const linearIdx,
-                        uint32_t const idx
-                    )
+                /* Reduce elements in global gpu memory
+                 *
+                 * @param func binary functor for reduce which takes two arguments, first argument is the source and
+                 * gets the new reduced value. Functor must specialize the function getMPI_Op.
+                 * @param src a class or a pointer where the reduce algorithm can access the value by operator [] (one
+                 * dimensional access)
+                 * @param n number of elements to reduce
+                 *
+                 * @return reduced value
+                 */
+                template<class Functor, typename Src>
+                HINLINE typename traits::GetValueType<Src>::ValueType operator()(Functor func, Src src, uint32_t n)
+                {
+                    /* - the result of a functor can be a reference or a const value
+                     * - it is not allowed to create const or reference memory
+                     *   thus we remove `references` and `const` qualifiers */
+                    typedef typename boost::remove_const<
+                        typename boost::remove_reference<typename traits::GetValueType<Src>::ValueType>::type>::type
+                        Type;
+
+                    uint32_t blockcount = optimalThreadsPerBlock(n, sizeof(Type));
+
+                    uint32_t n_buffer = byte / sizeof(Type);
+
+                    uint32_t threads = n_buffer * blockcount
+                        * 2; /* x2 is used thus we can use all byte in Buffer, after we calculate threads/2 */
+
+
+                    if(threads > n)
+                        threads = n;
+                    Type* dest = (Type*) reduceBuffer->getDeviceBuffer().getBasePointer();
+
+                    uint32_t blocks = threads / 2 / blockcount;
+                    if(blocks == 0)
+                        blocks = 1;
+                    callReduceKernel<Type>(
+                        blocks,
+                        blockcount,
+                        blockcount * sizeof(Type),
+                        src,
+                        n,
+                        dest,
+                        func,
+                        pmacc::nvidia::functors::Assign());
+                    n = blocks;
+                    blockcount = optimalThreadsPerBlock(n, sizeof(Type));
+                    blocks = n / 2 / blockcount;
+                    if(blocks == 0 && n > 1)
+                        blocks = 1;
+
+
+                    while(blocks != 0)
                     {
-                        isActiveCtx[ idx ] = ( linearReduceThreadIdxCtx[ idx ] < bufferSize ) &&
-                            !(
-                                linearIdx != 0u &&
-                                linearIdx >= active_threads
-                            );
-                        if( isActiveCtx[ idx ] )
-                            func(
-                                acc,
-                                sharedMem[ linearIdx ],
-                                sharedMem[ linearIdx + chunk_count ]
-                            );
-
-                        __syncthreads();
-                    }
-                );
-            }
-        }
-    };
-} // namespace kernel
-
-    class Reduce
-    {
-    public:
-
-        /* Constructor
-         * Don't create a instance before you have set you cuda device!
-         * @param byte how many bytes in global gpu memory can reserved for the reduce algorithm
-         * @param sharedMemByte limit the usage of shared memory per block on gpu
-         */
-        HINLINE Reduce(const uint32_t byte, const uint32_t sharedMemByte = 4 * 1024) :
-        byte(byte), sharedMemByte(sharedMemByte), reduceBuffer(nullptr)
-        {
-
-            reduceBuffer = new GridBuffer<char, DIM1 > (DataSpace<DIM1 > (byte));
-        }
-
-        /* Reduce elements in global gpu memory
-         *
-         * @param func binary functor for reduce which takes two arguments, first argument is the source and get the new reduced value.
-         * Functor must specialize the function getMPI_Op.
-         * @param src a class or a pointer where the reduce algorithm can access the value by operator [] (one dimensional access)
-         * @param n number of elements to reduce
-         *
-         * @return reduced value
-         */
-        template<class Functor, typename Src>
-        HINLINE typename traits::GetValueType<Src>::ValueType operator()(Functor func, Src src, uint32_t n)
-        {
-           /* - the result of a functor can be a reference or a const value
-            * - it is not allowed to create const or reference memory
-            *   thus we remove `references` and `const` qualifiers */
-           typedef typename boost::remove_const<
-                       typename boost::remove_reference<
-                           typename traits::GetValueType<Src>::ValueType
-                       >::type
-                   >::type Type;
-
-            uint32_t blockcount = optimalThreadsPerBlock(n, sizeof (Type));
-
-            uint32_t n_buffer = byte / sizeof (Type);
-
-            uint32_t threads = n_buffer * blockcount * 2; /* x2 is used thus we can use all byte in Buffer, after we calculate threads/2 */
-
-
-
-            if (threads > n) threads = n;
-            Type* dest = (Type*) reduceBuffer->getDeviceBuffer().getBasePointer();
+                        if(blocks > 1)
+                        {
+                            uint32_t blockOffset = ceil((double) blocks / blockcount);
+                            uint32_t useBlocks = blocks - blockOffset;
+                            uint32_t problemSize = n - (blockOffset * blockcount);
+                            Type* srcPtr = dest + (blockOffset * blockcount);
+
+                            callReduceKernel<Type>(
+                                useBlocks,
+                                blockcount,
+                                blockcount * sizeof(Type),
+                                srcPtr,
+                                problemSize,
+                                dest,
+                                func,
+                                func);
+                            blocks = blockOffset * blockcount;
+                        }
+                        else
+                        {
+                            callReduceKernel<Type>(
+                                blocks,
+                                blockcount,
+                                blockcount * sizeof(Type),
+                                dest,
+                                n,
+                                dest,
+                                func,
+                                pmacc::nvidia::functors::Assign());
+                        }
 
-            uint32_t blocks = threads / 2 / blockcount;
-            if (blocks == 0) blocks = 1;
-            callReduceKernel< Type >(blocks, blockcount, blockcount * sizeof (Type),
-                src, n, dest, func, pmacc::nvidia::functors::Assign());
-            n = blocks;
-            blockcount = optimalThreadsPerBlock(n, sizeof (Type));
-            blocks = n / 2 / blockcount;
-            if (blocks == 0 && n > 1) blocks = 1;
+                        n = blocks;
+                        blockcount = optimalThreadsPerBlock(n, sizeof(Type));
+                        blocks = n / 2 / blockcount;
+                        if(blocks == 0 && n > 1)
+                            blocks = 1;
+                    }
 
+                    reduceBuffer->deviceToHost();
+                    __getTransactionEvent().waitForFinished();
+                    return *((Type*) (reduceBuffer->getHostBuffer().getBasePointer()));
+                }
 
-            while (blocks != 0)
-            {
-                if (blocks > 1)
+                virtual ~Reduce()
                 {
-                    uint32_t blockOffset = ceil((double) blocks / blockcount);
-                    uint32_t useBlocks = blocks - blockOffset;
-                    uint32_t problemSize = n - (blockOffset * blockcount);
-                    Type* srcPtr = dest + (blockOffset * blockcount);
-
-                    callReduceKernel< Type >(useBlocks, blockcount, blockcount * sizeof (Type),
-                        srcPtr, problemSize, dest, func, func);
-                    blocks = blockOffset*blockcount;
+                    __delete(reduceBuffer);
                 }
-                else
-                {
 
-                    callReduceKernel< Type >(blocks, blockcount, blockcount * sizeof (Type),
-                        dest, n, dest, func, pmacc::nvidia::functors::Assign());
+            private:
+                /* calculate number of threads per block
+                 * @param threads maximal number of threads per block
+                 * @return number of threads per block
+                 */
+                HINLINE uint32_t getThreadsPerBlock(uint32_t threads)
+                {
+                    /// \todo this list is not complete
+                    ///        extend it and maybe check for sm_version
+                    ///        and add possible threads accordingly.
+                    ///        maybe this function should be exported
+                    ///        to a more general nvidia class, too.
+                    if(threads >= 512)
+                        return 512;
+                    if(threads >= 256)
+                        return 256;
+                    if(threads >= 128)
+                        return 128;
+                    if(threads >= 64)
+                        return 64;
+                    if(threads >= 32)
+                        return 32;
+                    if(threads >= 16)
+                        return 16;
+                    if(threads >= 8)
+                        return 8;
+                    if(threads >= 4)
+                        return 4;
+                    if(threads >= 2)
+                        return 2;
+
+                    return 1;
                 }
 
-                n = blocks;
-                blockcount = optimalThreadsPerBlock(n, sizeof (Type));
-                blocks = n / 2 / blockcount;
-                if (blocks == 0 && n > 1) blocks = 1;
-            }
-
-            reduceBuffer->deviceToHost();
-            __getTransactionEvent().waitForFinished();
-            return *((Type*) (reduceBuffer->getHostBuffer().getBasePointer()));
 
-        }
+                /* Start the reduce kernel.
+                 * The minimal number of elements reduced within a CUDA block is chosen at compile time.
+                 * `threads` is rounded down to a power of two in [1, 512] to select the kernel::Reduce variant;
+                 * the launch passes traits::GetNumWorkers<N>::value, not `threads`, as its second argument.
+                 */
+                template<typename Type, typename... T_Args>
+                HINLINE void callReduceKernel(
+                    uint32_t blocks,
+                    uint32_t threads,
+                    uint32_t sharedMemSize,
+                    T_Args&&... args)
+                {
+                    if(threads >= 512u)
+                    {
+                        constexpr uint32_t numWorkers = traits::GetNumWorkers<512u>::value;
+                        PMACC_KERNEL(kernel::Reduce<Type, 512u, numWorkers>{})
+                        (blocks, numWorkers, sharedMemSize)(args...);
+                    }
+                    else if(threads >= 256u)
+                    {
+                        constexpr uint32_t numWorkers = traits::GetNumWorkers<256u>::value;
+                        PMACC_KERNEL(kernel::Reduce<Type, 256u, numWorkers>{})
+                        (blocks, numWorkers, sharedMemSize)(args...);
+                    }
+                    else if(threads >= 128u)
+                    {
+                        constexpr uint32_t numWorkers = traits::GetNumWorkers<128u>::value;
+                        PMACC_KERNEL(kernel::Reduce<Type, 128u, numWorkers>{})
+                        (blocks, numWorkers, sharedMemSize)(args...);
+                    }
+                    else if(threads >= 64u)
+                    {
+                        constexpr uint32_t numWorkers = traits::GetNumWorkers<64u>::value;
+                        PMACC_KERNEL(kernel::Reduce<Type, 64u, numWorkers>{})
+                        (blocks, numWorkers, sharedMemSize)(args...);
+                    }
+                    else if(threads >= 32u)
+                    {
+                        constexpr uint32_t numWorkers = traits::GetNumWorkers<32u>::value;
+                        PMACC_KERNEL(kernel::Reduce<Type, 32u, numWorkers>{})
+                        (blocks, numWorkers, sharedMemSize)(args...);
+                    }
+                    else if(threads >= 16u)
+                    {
+                        constexpr uint32_t numWorkers = traits::GetNumWorkers<16u>::value;
+                        PMACC_KERNEL(kernel::Reduce<Type, 16u, numWorkers>{})
+                        (blocks, numWorkers, sharedMemSize)(args...);
+                    }
+                    else if(threads >= 8u)
+                    {
+                        constexpr uint32_t numWorkers = traits::GetNumWorkers<8u>::value;
+                        PMACC_KERNEL(kernel::Reduce<Type, 8u, numWorkers>{})
+                        (blocks, numWorkers, sharedMemSize)(args...);
+                    }
+                    else if(threads >= 4u)
+                    {
+                        constexpr uint32_t numWorkers = traits::GetNumWorkers<4u>::value;
+                        PMACC_KERNEL(kernel::Reduce<Type, 4u, numWorkers>{})
+                        (blocks, numWorkers, sharedMemSize)(args...);
+                    }
+                    else if(threads >= 2u)
+                    {
+                        constexpr uint32_t numWorkers = traits::GetNumWorkers<2u>::value;
+                        PMACC_KERNEL(kernel::Reduce<Type, 2u, numWorkers>{})
+                        (blocks, numWorkers, sharedMemSize)(args...);
+                    }
+                    else
+                    {
+                        constexpr uint32_t numWorkers = traits::GetNumWorkers<1u>::value;
+                        PMACC_KERNEL(kernel::Reduce<Type, 1u, numWorkers>{})
+                        (blocks, numWorkers, sharedMemSize)(args...);
+                    }
+                }
 
-        virtual ~Reduce()
-        {
-            __delete(reduceBuffer);
-        }
 
-    private:
+                /* Calculate the number of threads per block, limited by the available shared memory.
+                 * @param n number of elements to reduce
+                 * @param sizePerElement size in bytes per element; must be non-zero (used as divisor)
+                 * @return getThreadsPerBlock() applied to min(sharedMemByte / sizePerElement, n)
+                 */
+                HINLINE uint32_t optimalThreadsPerBlock(uint32_t n, uint32_t sizePerElement)
+                {
+                    uint32_t const sharedBorder = sharedMemByte / sizePerElement;
+                    return getThreadsPerBlock(std::min(sharedBorder, n));
+                }
 
-        /* calculate number of threads per block
-         * @param threads maximal number of threads per block
-         * @return number of threads per block
-         */
-        HINLINE uint32_t getThreadsPerBlock(uint32_t threads)
-        {
-            /// \todo this list is not complete
-            ///        extend it and maybe check for sm_version
-            ///        and add possible threads accordingly.
-            ///        maybe this function should be exported
-            ///        to a more general nvidia class, too.
-            if (threads >= 512) return 512;
-            if (threads >= 256) return 256;
-            if (threads >= 128) return 128;
-            if (threads >= 64) return 64;
-            if (threads >= 32) return 32;
-            if (threads >= 16) return 16;
-            if (threads >= 8) return 8;
-            if (threads >= 4) return 4;
-            if (threads >= 2) return 2;
-
-            return 1;
-        }
-
-
-        /* start the reduce kernel
-         *
-         * The minimal number of elements reduced within a CUDA block is chosen at
-         * compile time.
-         */
-        template< typename Type, typename ... T_Args >
-        HINLINE void callReduceKernel(
-            uint32_t blocks,
-            uint32_t threads,
-            uint32_t sharedMemSize,
-            T_Args && ... args
-        )
-        {
-            if(threads >= 512u)
-            {
-                constexpr uint32_t numWorkers = traits::GetNumWorkers<
-                    512u
-                >::value;
-                PMACC_KERNEL( kernel::Reduce< Type, 512u, numWorkers >{ } )(
-                    blocks,
-                    numWorkers,
-                    sharedMemSize
-                )(
-                    args ...
-                );
-            }
-            else if(threads >= 256u)
-            {
-                constexpr uint32_t numWorkers = traits::GetNumWorkers<
-                    256u
-                >::value;
-                PMACC_KERNEL( kernel::Reduce< Type, 256u, numWorkers >{ } )(
-                    blocks,
-                    numWorkers,
-                    sharedMemSize
-                )(
-                    args ...
-                );
-            }
-            else if(threads >= 128u)
-            {
-                constexpr uint32_t numWorkers = traits::GetNumWorkers<
-                    128u
-                >::value;
-                PMACC_KERNEL( kernel::Reduce< Type, 128u, numWorkers >{ } )(
-                    blocks,
-                    numWorkers,
-                    sharedMemSize
-                )(
-                    args ...
-                );
-            }
-            else if(threads >= 64u)
-            {
-                constexpr uint32_t numWorkers = traits::GetNumWorkers<
-                    64u
-                >::value;
-                PMACC_KERNEL( kernel::Reduce< Type, 64u, numWorkers >{ } )(
-                    blocks,
-                    numWorkers,
-                    sharedMemSize
-                )(
-                    args ...
-                );
-            }
-            else if(threads >= 32u)
-            {
-                constexpr uint32_t numWorkers = traits::GetNumWorkers<
-                    32u
-                >::value;
-                PMACC_KERNEL( kernel::Reduce< Type, 32u, numWorkers >{ } )(
-                    blocks,
-                    numWorkers,
-                    sharedMemSize
-                )(
-                    args ...
-                );
-            }
-            else if(threads >= 16u)
-            {
-                constexpr uint32_t numWorkers = traits::GetNumWorkers<
-                    16u
-                >::value;
-                PMACC_KERNEL( kernel::Reduce< Type, 16u, numWorkers >{ } )(
-                    blocks,
-                    numWorkers,
-                    sharedMemSize
-                )(
-                    args ...
-                );
-            }
-            else if(threads >= 8u)
-            {
-                constexpr uint32_t numWorkers = traits::GetNumWorkers<
-                    8u
-                >::value;
-                PMACC_KERNEL( kernel::Reduce< Type, 8u, numWorkers >{ } )(
-                    blocks,
-                    numWorkers,
-                    sharedMemSize
-                )(
-                    args ...
-                );
-            }
-            else if(threads >= 4u)
-            {
-                constexpr uint32_t numWorkers = traits::GetNumWorkers<
-                    4u
-                >::value;
-                PMACC_KERNEL( kernel::Reduce< Type, 4u, numWorkers >{ } )(
-                    blocks,
-                    numWorkers,
-                    sharedMemSize
-                )(
-                    args ...
-                );
-            }
-            else if(threads >= 2u)
-            {
-                constexpr uint32_t numWorkers = traits::GetNumWorkers<
-                    2u
-                >::value;
-                PMACC_KERNEL( kernel::Reduce< Type, 2u, numWorkers >{ } )(
-                    blocks,
-                    numWorkers,
-                    sharedMemSize
-                )(
-                    args ...
-                );
-            }
-            else
-            {
-                constexpr uint32_t numWorkers = traits::GetNumWorkers<
-                    1u
-                >::value;
-                PMACC_KERNEL( kernel::Reduce< Type, 1u, numWorkers >{ } )(
-                    blocks,
-                    numWorkers,
-                    sharedMemSize
-                )(
-                    args ...
-                );
-            }
-        }
-
-
-        /*calculate optimal number of threads per block with respect to shared memory limitations
-         * @param n number of elements to reduce
-         * @param sizePerElement size in bytes per elements
-         * @return optimal count of threads per block to solve the problem
-         */
-        HINLINE uint32_t optimalThreadsPerBlock(uint32_t n, uint32_t sizePerElement)
-        {
-            uint32_t const sharedBorder = sharedMemByte / sizePerElement;
-            return getThreadsPerBlock(std::min(sharedBorder, n));
-        }
-
-        /*global gpu buffer for reduce steps*/
-        GridBuffer<char, DIM1 > *reduceBuffer;
-        /*buffer size limit in bytes on gpu*/
-        uint32_t byte;
-        /*shared memory limit in byte for one block*/
-        uint32_t sharedMemByte;
-
-    };
-}
-}
-}
+                /*global gpu buffer for reduce steps*/
+                GridBuffer<char, DIM1>* reduceBuffer;
+                /*buffer size limit in bytes on gpu*/
+                uint32_t byte;
+                /*shared memory limit in byte for one block*/
+                uint32_t sharedMemByte;
+            };
+        } // namespace reduce
+    } // namespace nvidia
+} // namespace pmacc
diff --git a/include/pmacc/nvidia/rng/RNG.hpp b/include/pmacc/nvidia/rng/RNG.hpp
deleted file mode 100644
index 2a47f857c9..0000000000
--- a/include/pmacc/nvidia/rng/RNG.hpp
+++ /dev/null
@@ -1,94 +0,0 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera
- *
- * This file is part of PMacc.
- *
- * PMacc is free software: you can redistribute it and/or modify
- * it under the terms of either the GNU General Public License or
- * the GNU Lesser General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * PMacc is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License and the GNU Lesser General Public License
- * for more details.
- *
- * You should have received a copy of the GNU General Public License
- * and the GNU Lesser General Public License along with PMacc.
- * If not, see <http://www.gnu.org/licenses/>.
- */
-
-
-#pragma once
-
-#include "pmacc/types.hpp"
-
-namespace pmacc
-{
-namespace nvidia
-{
-namespace rng
-{
-
-    /* create a random number generator on gpu
-     * \tparam RngMethod method to generate random number
-     * \tparam Distribution functor for distribution
-     */
-    template<class RNGMethod, class Distribution>
-    class RNG : public RNGMethod
-    {
-    public:
-
-        typedef RNGMethod MethodType;
-        typedef Distribution DistributionType;
-        typedef RNG<RNGMethod, Distribution> This;
-
-        HDINLINE RNG()
-        {
-        }
-
-        /*
-         * \param rngMethod instance of generator
-         * \param distribution instance of distribution functor
-         */
-        DINLINE RNG(const RNGMethod& rng_method, const Distribution& rng_operation) :
-        RNGMethod(rng_method), op(rng_operation)
-        {
-        }
-
-        HDINLINE RNG(const This& other) :
-        RNGMethod(static_cast<RNGMethod>(other)), op(other.op)
-        {
-        }
-
-        /* default method to generate a random number
-         * @return random number
-         */
-        DINLINE typename Distribution::Type operator()()
-        {
-            return this->op(this->getState());
-        }
-
-    private:
-        PMACC_ALIGN(op, Distribution);
-    };
-
-    /* create a random number generator on gpu
-     * \tparam RngMethod method to generate random number
-     * \tparam Distribution functor for distribution
-     *
-     * \param rngMethod instance of generator
-     * \param distribution instance of distribution functor
-     * \return class which can used to generate random numbers
-     */
-    template<class RngMethod, class Distribution>
-    DINLINE typename pmacc::nvidia::rng::RNG<RngMethod, Distribution> create(const RngMethod & rngMethod,
-                                                                             const Distribution & distribution)
-    {
-        return pmacc::nvidia::rng::RNG<RngMethod, Distribution >(rngMethod, distribution);
-    }
-
-} // namespace rng
-} // namespace nvidia
-} // namespace pmacc
diff --git a/include/pmacc/nvidia/rng/distributions/Normal_float.hpp b/include/pmacc/nvidia/rng/distributions/Normal_float.hpp
deleted file mode 100644
index 2f09df70a7..0000000000
--- a/include/pmacc/nvidia/rng/distributions/Normal_float.hpp
+++ /dev/null
@@ -1,80 +0,0 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera
- *
- * This file is part of PMacc.
- *
- * PMacc is free software: you can redistribute it and/or modify
- * it under the terms of either the GNU General Public License or
- * the GNU Lesser General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * PMacc is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License and the GNU Lesser General Public License
- * for more details.
- *
- * You should have received a copy of the GNU General Public License
- * and the GNU Lesser General Public License along with PMacc.
- * If not, see <http://www.gnu.org/licenses/>.
- */
-
-
-#pragma once
-
-#include "pmacc/types.hpp"
-
-namespace pmacc
-{
-namespace nvidia
-{
-namespace rng
-{
-namespace distributions
-{
-namespace detail
-{
-    /*Return normally distributed floats with mean 0.0f and standard deviation 1.0f
-     */
-    template< typename T_Acc>
-    class Normal_float
-    {
-    public:
-        typedef float Type;
-    private:
-        using Dist =
-            decltype(
-                ::alpaka::rand::distribution::createNormalReal<Type>(
-                    alpaka::core::declval<T_Acc const &>()));
-        PMACC_ALIGN(dist, Dist);
-    public:
-        HDINLINE Normal_float()
-        {
-        }
-
-        HDINLINE Normal_float(const T_Acc& acc) : dist(::alpaka::rand::distribution::createNormalReal<Type>(acc))
-        {
-        }
-
-        template<class RNGState>
-        DINLINE Type operator()(RNGState& state)
-        {
-            return dist(state);
-        }
-
-    };
-} // namespace detail
-
-    struct Normal_float
-    {
-        template< typename T_Acc>
-        static HDINLINE detail::Normal_float< T_Acc >
-        get( T_Acc const & acc)
-        {
-            return detail::Normal_float< T_Acc >( acc );
-        }
-    };
-} // namespace distributions
-} // namespace rng
-} // namespace nvidia
-} // namespace pmacc
diff --git a/include/pmacc/nvidia/rng/distributions/Uniform_float.hpp b/include/pmacc/nvidia/rng/distributions/Uniform_float.hpp
deleted file mode 100644
index b757c9f04a..0000000000
--- a/include/pmacc/nvidia/rng/distributions/Uniform_float.hpp
+++ /dev/null
@@ -1,93 +0,0 @@
-/* Copyright 2013-2020 Axel Huebl, Heiko Burau, Rene Widera
- *
- * This file is part of PMacc.
- *
- * PMacc is free software: you can redistribute it and/or modify
- * it under the terms of either the GNU General Public License or
- * the GNU Lesser General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * PMacc is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License and the GNU Lesser General Public License
- * for more details.
- *
- * You should have received a copy of the GNU General Public License
- * and the GNU Lesser General Public License along with PMacc.
- * If not, see <http://www.gnu.org/licenses/>.
- */
-
-
-#pragma once
-
-#include "pmacc/types.hpp"
-
-namespace pmacc
-{
-namespace nvidia
-{
-namespace rng
-{
-namespace distributions
-{
-namespace detail
-{
-    /*create a random float number from [0.0,1.0)
-     */
-    template< typename T_Acc>
-    class Uniform_float
-    {
-    public:
-        typedef float Type;
-    private:
-        using Dist =
-            decltype(
-                ::alpaka::rand::distribution::createUniformReal<Type>(
-                    alpaka::core::declval<T_Acc const &>()));
-        PMACC_ALIGN(dist, Dist);
-    public:
-
-        HDINLINE Uniform_float()
-        {
-        }
-
-        HDINLINE Uniform_float(const T_Acc& acc) : dist(::alpaka::rand::distribution::createUniformReal<Type>(acc))
-        {
-        }
-
-        template<class RNGState>
-        DINLINE Type operator()(RNGState& state)
-        {
-            // (0.f, 1.0f]
-            const Type raw = dist(state);
-
-            /// \warn hack, are is that really ok? I say, yes, since
-            /// it shifts just exactly one number. Axel
-            ///
-            ///   Note: (1.0f - raw) does not work, since
-            ///         nvidia seems to return denormalized
-            ///         floats around 0.f (thats not as they
-            ///         state it out in their documentation)
-            // [0.f, 1.0f)
-            const Type r = raw * static_cast<float>( raw != Type(1.0) );
-            return r;
-        }
-
-    };
-} // namespace detail
-
-    struct Uniform_float
-    {
-        template< typename T_Acc>
-        static HDINLINE detail::Uniform_float< T_Acc >
-        get( T_Acc const & acc)
-        {
-            return detail::Uniform_float< T_Acc >( acc );
-        }
-    };
-} // namespace distributions
-} // namespace rng
-} // namespace nvidia
-} // namespace pmacc
diff --git a/include/pmacc/nvidia/rng/distributions/Uniform_int32.hpp b/include/pmacc/nvidia/rng/distributions/Uniform_int32.hpp
deleted file mode 100644
index e0569ecb26..0000000000
--- a/include/pmacc/nvidia/rng/distributions/Uniform_int32.hpp
+++ /dev/null
@@ -1,83 +0,0 @@
-/* Copyright 2013-2020 Axel Huebl, Heiko Burau, Rene Widera
- *
- * This file is part of PMacc.
- *
- * PMacc is free software: you can redistribute it and/or modify
- * it under the terms of either the GNU General Public License or
- * the GNU Lesser General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * PMacc is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License and the GNU Lesser General Public License
- * for more details.
- *
- * You should have received a copy of the GNU General Public License
- * and the GNU Lesser General Public License along with PMacc.
- * If not, see <http://www.gnu.org/licenses/>.
- */
-
-
-#pragma once
-
-#include "pmacc/types.hpp"
-
-namespace pmacc
-{
-namespace nvidia
-{
-namespace rng
-{
-namespace distributions
-{
-namespace detail
-{
-    /*create a 32Bit random int number
-     * Range: [INT_MIN,INT_MAX]
-     */
-    template< typename T_Acc>
-    class Uniform_int32
-    {
-    public:
-        typedef int32_t Type;
-
-    private:
-        typedef uint32_t RngType;
-        using Dist =
-            decltype(
-                ::alpaka::rand::distribution::createUniformUint<RngType>(
-                    alpaka::core::declval<T_Acc const &>()));
-        PMACC_ALIGN(dist, Dist);
-    public:
-        HDINLINE Uniform_int()
-        {
-        }
-
-        HDINLINE Uniform_int(const T_Acc& acc) : dist(::alpaka::rand::distribution::createUniformUint<RngType>(acc))
-        {
-        }
-
-        template<class RNGState>
-        DINLINE Type operator()(RNGState& state)
-        {
-            /*curand create a random 32Bit int value*/
-            return static_cast<Type>(dist(state));
-        }
-    };
-} // namespace detail
-
-    struct Normal_float
-    {
-        template< typename T_Acc>
-        static HDINLINE detail::Uniform_int32< T_Acc >
-        get( T_Acc const & acc)
-        {
-            return detail::Uniform_int32< T_Acc >( acc );
-        }
-    };
-} // namespace distributions
-} // namespace rng
-} // namespace nvidia
-} // namespace pmacc
diff --git a/include/pmacc/nvidia/rng/methods/Xor.hpp b/include/pmacc/nvidia/rng/methods/Xor.hpp
deleted file mode 100644
index f48e6c3714..0000000000
--- a/include/pmacc/nvidia/rng/methods/Xor.hpp
+++ /dev/null
@@ -1,75 +0,0 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera
- *
- * This file is part of PMacc.
- *
- * PMacc is free software: you can redistribute it and/or modify
- * it under the terms of either the GNU General Public License or
- * the GNU Lesser General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * PMacc is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License and the GNU Lesser General Public License
- * for more details.
- *
- * You should have received a copy of the GNU General Public License
- * and the GNU Lesser General Public License along with PMacc.
- * If not, see <http://www.gnu.org/licenses/>.
- */
-
-
-#pragma once
-
-#include "pmacc/types.hpp"
-
-namespace pmacc
-{
-namespace nvidia
-{
-namespace rng
-{
-namespace methods
-{
-
-    template< typename T_Acc >
-    class Xor
-    {
-    private:
-         using Gen =
-            decltype(
-                ::alpaka::rand::generator::createDefault(
-                    alpaka::core::declval<T_Acc const &>(),
-                    alpaka::core::declval<uint32_t &>(),
-                    alpaka::core::declval<uint32_t &>()));
-        PMACC_ALIGN(gen, Gen);
-    public:
-        typedef Gen StateType;
-        typedef T_Acc Acc;
-
-        HDINLINE Xor() : gen (0)
-        {
-        }
-
-        DINLINE Xor(const T_Acc& acc, uint32_t seed, uint32_t subsequence = 0)
-        {
-            gen = ::alpaka::rand::generator::createDefault(acc, seed, subsequence);
-        }
-
-        HDINLINE Xor(const Xor& other): gen(other.gen)
-        {
-
-        }
-
-    protected:
-
-        DINLINE StateType& getState()
-        {
-            return gen;
-        }
-    };
-} // namespace methods
-} // namespace rng
-} // namespace nvidia
-} // namespace pmacc
diff --git a/include/pmacc/nvidia/warp.hpp b/include/pmacc/nvidia/warp.hpp
index 48c58dbad6..d15bd93f2f 100644
--- a/include/pmacc/nvidia/warp.hpp
+++ b/include/pmacc/nvidia/warp.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2015-2020 Rene Widera, Alexander Grund
+/* Copyright 2015-2021 Rene Widera, Alexander Grund
  *
  * This file is part of PMacc.
  *
@@ -21,96 +21,99 @@
 
 #pragma once
 
+#if(BOOST_LANG_CUDA || BOOST_COMP_HIP)
 
-#include "pmacc/types.hpp"
+#    include "pmacc/types.hpp"
 
 
 namespace pmacc
 {
-namespace nvidia
-{
-
+    namespace nvidia
+    {
 /** get lane id of a thread within a warp
  *
  * id is in range [0,WAPRSIZE-1]
  * required PTX ISA >=1.3
  */
-#if (__CUDA_ARCH__ >= 130)
-DINLINE uint32_t getLaneId()
-{
-    uint32_t id;
-    asm("mov.u32 %0, %%laneid;" : "=r" (id));
-    return id;
-}
-#endif
+#    if(__CUDA_ARCH__ >= 130)
+        DINLINE uint32_t getLaneId()
+        {
+            uint32_t id;
+            asm("mov.u32 %0, %%laneid;" : "=r"(id));
+            return id;
+        }
+#    elif BOOST_COMP_HIP
+        DINLINE uint32_t getLaneId()
+        {
+            return __lane_id();
+        }
+#    endif
 
 
-#if (__CUDA_ARCH__ >= 300)
-/** broadcast data within a warp
- *
- * required PTX ISA >=3.0
- *
- * @param data value to broadcast
- * @param srcLaneId lane id of the source thread
- * @return value send by the source thread
- *
- * \{
- */
-//! broadcast a 32bit integer
-DINLINE int32_t warpBroadcast(const int32_t data, const int32_t srcLaneId)
-{
-#if(__CUDACC_VER_MAJOR__ >= 9)
-    return  __shfl_sync(__activemask(), data, srcLaneId);
-#else
-    return  __shfl(data, srcLaneId);
-#endif
-}
+#    if(__CUDA_ARCH__ >= 300 || BOOST_COMP_HIP)
+        /** broadcast data within a warp
+         *
+         * required PTX ISA >=3.0
+         *
+         * @param data value to broadcast
+         * @param srcLaneId lane id of the source thread
+         * @return value sent by the source thread
+         *
+         * \{
+         */
+        //! broadcast a 32bit integer
+        DINLINE int32_t warpBroadcast(const int32_t data, const int32_t srcLaneId)
+        {
+#        if(__CUDACC_VER_MAJOR__ >= 9)
+            return __shfl_sync(__activemask(), data, srcLaneId);
+#        else
+            return __shfl(data, srcLaneId);
+#        endif
+        }
 
-//! Broadcast a 64bit integer by using 2 32bit broadcasts
-DINLINE int64_cu warpBroadcast(int64_cu data, const int32_t srcLaneId)
-{
-    int32_t* const pData = reinterpret_cast<int32_t*>(&data);
-    pData[0] = warpBroadcast(pData[0], srcLaneId);
-    pData[1] = warpBroadcast(pData[1], srcLaneId);
-    return data;
-}
+        //! Broadcast a 64bit integer by using 2 32bit broadcasts
+        DINLINE int64_cu warpBroadcast(int64_cu data, const int32_t srcLaneId)
+        {
+            int32_t* const pData = reinterpret_cast<int32_t*>(&data);
+            pData[0] = warpBroadcast(pData[0], srcLaneId);
+            pData[1] = warpBroadcast(pData[1], srcLaneId);
+            return data;
+        }
 
-//! Broadcast a 32bit unsigned int
-DINLINE uint32_t warpBroadcast(const uint32_t data, const int32_t srcLaneId)
-{
-    return static_cast<uint32_t>(
-        warpBroadcast(static_cast<int32_t>(data), srcLaneId)
-    );
-}
+        //! Broadcast a 32bit unsigned int
+        DINLINE uint32_t warpBroadcast(const uint32_t data, const int32_t srcLaneId)
+        {
+            return static_cast<uint32_t>(warpBroadcast(static_cast<int32_t>(data), srcLaneId));
+        }
 
-//!Broadcast a 64bit unsigned int
-DINLINE uint64_cu warpBroadcast(const uint64_cu data, const int32_t srcLaneId)
-{
-    return static_cast<uint64_cu>(
-        warpBroadcast(static_cast<int64_cu>(data), srcLaneId)
-    );
-}
+        //! Broadcast a 64bit unsigned int
+        DINLINE uint64_cu warpBroadcast(const uint64_cu data, const int32_t srcLaneId)
+        {
+            return static_cast<uint64_cu>(warpBroadcast(static_cast<int64_cu>(data), srcLaneId));
+        }
 
-//! Broadcast a 32bit float
-DINLINE float warpBroadcast(const float data, const int32_t srcLaneId)
-{
-#if(__CUDACC_VER_MAJOR__ >= 9)
-    return  __shfl_sync(__activemask(), data, srcLaneId);
-#else
-    return  __shfl(data, srcLaneId);
-#endif
-}
+        //! Broadcast a 32bit float
+        DINLINE float warpBroadcast(const float data, const int32_t srcLaneId)
+        {
+#        if(__CUDACC_VER_MAJOR__ >= 9)
+            return __shfl_sync(__activemask(), data, srcLaneId);
+#        else
+            return __shfl(data, srcLaneId);
+#        endif
+        }
 
-//! Broadcast a 64bit float by using 2 32bit broadcasts
-DINLINE double warpBroadcast(double data, const int32_t srcLaneId)
-{
-    float* const pData = reinterpret_cast<float*>(&data);
-    pData[0] = warpBroadcast(pData[0], srcLaneId);
-    pData[1] = warpBroadcast(pData[1], srcLaneId);
-    return data;
-}
+        //! Broadcast a 64bit float by using 2 32bit broadcasts
+        DINLINE double warpBroadcast(double data, const int32_t srcLaneId)
+        {
+            float* const pData = reinterpret_cast<float*>(&data);
+            pData[0] = warpBroadcast(pData[0], srcLaneId);
+            pData[1] = warpBroadcast(pData[1], srcLaneId);
+            return data;
+        }
 //! @}
-#endif
+#    endif
 
-} //namespace nvidia
-} //namespace pmacc
+    } // namespace nvidia
+} // namespace pmacc
+
+#endif
diff --git a/include/pmacc/particles/AsyncCommunicationImpl.hpp b/include/pmacc/particles/AsyncCommunicationImpl.hpp
index 8edbc0ed81..eaa2b3d3bd 100644
--- a/include/pmacc/particles/AsyncCommunicationImpl.hpp
+++ b/include/pmacc/particles/AsyncCommunicationImpl.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera, Alexander Grund
+/* Copyright 2013-2021 Heiko Burau, Rene Widera, Alexander Grund
  *
  * This file is part of PMacc.
  *
@@ -27,25 +27,27 @@
 #include "pmacc/particles/ParticlesBase.hpp"
 #include <boost/type_traits.hpp>
 
-namespace pmacc{
-
+namespace pmacc
+{
     /**
      * Trait that should return true if T is a particle species
      */
     template<typename T>
     struct IsParticleSpecies
     {
-        enum{ value = boost::is_same<typename T::SimulationDataTag, ParticlesTag>::value };
+        enum
+        {
+            value = boost::is_same<typename T::SimulationDataTag, ParticlesTag>::value
+        };
     };
 
-    namespace communication {
-
+    namespace communication
+    {
         template<typename T_Data>
-        struct AsyncCommunicationImpl<T_Data, Bool2Type<IsParticleSpecies<T_Data>::value> >
+        struct AsyncCommunicationImpl<T_Data, Bool2Type<IsParticleSpecies<T_Data>::value>>
         {
             template<class T_Particles>
-            EventTask
-            operator()(T_Particles& par, EventTask event) const
+            EventTask operator()(T_Particles& par, EventTask event) const
             {
                 EventTask ret;
                 __startTransaction(event);
@@ -59,5 +61,5 @@ namespace pmacc{
             }
         };
 
-    }  // namespace communication
-}  // namespace pmacc
+    } // namespace communication
+} // namespace pmacc
diff --git a/include/pmacc/particles/IdProvider.def b/include/pmacc/particles/IdProvider.def
index 25392d8a88..1e25e08559 100644
--- a/include/pmacc/particles/IdProvider.def
+++ b/include/pmacc/particles/IdProvider.def
@@ -1,4 +1,4 @@
-/* Copyright 2016-2020 Alexander Grund
+/* Copyright 2016-2021 Alexander Grund
  *
  * This file is part of PMacc.
  *
@@ -26,7 +26,6 @@
 
 namespace pmacc
 {
-
     /**
      * Provider for globally unique ids (even across ranks)
      * Implemented for use in static contexts which allows e.g. calling from CUDA kernels
@@ -35,7 +34,8 @@ namespace pmacc
     class IdProvider
     {
     public:
-        struct State{
+        struct State
+        {
             /** Next id to be returned */
             uint64_t nextId;
             /** First id used */
diff --git a/include/pmacc/particles/IdProvider.hpp b/include/pmacc/particles/IdProvider.hpp
index 07bd569f1b..5f17c70631 100644
--- a/include/pmacc/particles/IdProvider.hpp
+++ b/include/pmacc/particles/IdProvider.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2016-2020 Alexander Grund
+/* Copyright 2016-2021 Alexander Grund
  *
  * This file is part of PMacc.
  *
@@ -31,10 +31,15 @@
 
 namespace pmacc
 {
-
-     namespace idDetail {
-
+    namespace idDetail
+    {
         DEVICEONLY uint64_cu nextId;
+#ifdef ALPAKA_ACC_ANY_BT_OACC_ENABLED
+#    pragma acc declare device_resident(::pmacc::idDetail::nextId)
+#endif
+#ifdef ALPAKA_ACC_ANY_BT_OMP5_ENABLED
+#    pragma omp declare target(::pmacc::idDetail::nextId)
+#endif
 
         struct KernelSetNextId
         {
@@ -63,7 +68,7 @@ namespace pmacc
             }
         };
 
-    }  // namespace idDetail
+    } // namespace idDetail
 
     template<unsigned T_dim>
     uint64_t IdProvider<T_dim>::m_maxNumProc;
@@ -94,9 +99,8 @@ namespace pmacc
         m_startId = state.startId;
         if(m_maxNumProc < state.maxNumProc)
             m_maxNumProc = state.maxNumProc;
-        log<ggLog::INFO>("(Re-)Initialized IdProvider with id=%1%/%2% and maxNumProc=%3%/%4%")
-                % state.nextId % state.startId
-                % state.maxNumProc % m_maxNumProc;
+        log<ggLog::INFO>("(Re-)Initialized IdProvider with id=%1%/%2% and maxNumProc=%3%/%4%") % state.nextId
+            % state.startId % state.maxNumProc % m_maxNumProc;
     }
 
     template<unsigned T_dim>
@@ -127,7 +131,8 @@ namespace pmacc
          * when counting the bits from 1 = right most bit
          * So first we calculate n, then remove the lowest bits of the next id so we have only the n upper bits
          * If any of them is non-zero, it is an overflow and we can have duplicate ids.
-         * If not, then all ids are probably unique (still a chance, the id is overflown so much, that detection is impossible)
+         * If not, then all ids are probably unique (there is still a chance that the id has overflowed so far that
+         * detection is impossible)
          */
         uint64_t tmp = curState.maxNumProc - 1;
         int32_t bitsToCheck = 0;
@@ -182,4 +187,4 @@ namespace pmacc
         return static_cast<uint64_t>(newIdBuf.getHostBuffer().getDataBox()(0));
     }
 
-}  // namespace pmacc
+} // namespace pmacc
diff --git a/include/pmacc/particles/Identifier.hpp b/include/pmacc/particles/Identifier.hpp
index 5de6032581..3f6ccb0e75 100644
--- a/include/pmacc/particles/Identifier.hpp
+++ b/include/pmacc/particles/Identifier.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Rene Widera, Alexander Grund, Axel Huebl
+/* Copyright 2013-2021 Rene Widera, Alexander Grund, Axel Huebl
  *
  * This file is part of PMacc.
  *
diff --git a/include/pmacc/particles/ParticleDescription.hpp b/include/pmacc/particles/ParticleDescription.hpp
index ae7aefc421..801088db09 100644
--- a/include/pmacc/particles/ParticleDescription.hpp
+++ b/include/pmacc/particles/ParticleDescription.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2014-2020 Rene Widera
+/* Copyright 2014-2021 Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -29,99 +29,97 @@
 
 namespace pmacc
 {
-
-/** ParticleDescription defines attributes, methods and flags of a particle
- *
- * This class holds no runtime data.
- * The class holds information about the name, attributes, flags and methods of a
- * particle.
- *
- * @tparam T_Name name of described particle (e.g. electron, ion)
- *                type must be a boost::mpl::string
- * @tparam T_SuperCellSize compile time size of a super cell
- * @tparam T_ValueTypeSeq sequence or single type with value_identifier
- * @tparam T_Flags sequence or single type with identifier to add flags on a frame
- * @tparam T_MethodsList sequence or single class with particle methods
- *                       (e.g. calculate mass, gamma, ...)
- *                       (e.g. useSolverXY, calcRadiation, ...)
- * @tparam T_FrameExtensionList sequence or single class with frame extensions
- *                    - extension must be an unary template class that supports bmpl::apply1<>
- *                    - type of the final frame is applied to each extension class
- *                      (this allows pointers and references to a frame itself)
- *                    - the final frame that uses ParticleDescription inherits from all
- *                      extension classes
- */
-template<
-typename T_Name,
-typename T_SuperCellSize,
-typename T_ValueTypeSeq,
-typename T_Flags = bmpl::vector0<>,
-typename T_HandleGuardRegion = HandleGuardRegion<particles::policies::ExchangeParticles, particles::policies::DeleteParticles>,
-typename T_MethodsList = bmpl::vector0<>,
-typename T_FrameExtensionList = bmpl::vector0<>
->
-struct ParticleDescription
-{
-    typedef T_Name Name;
-    typedef T_SuperCellSize SuperCellSize;
-    typedef typename ToSeq<T_ValueTypeSeq>::type ValueTypeSeq;
-    typedef typename ToSeq<T_Flags>::type FlagsList;
-    typedef T_HandleGuardRegion HandleGuardRegion;
-    typedef typename ToSeq<T_MethodsList>::type MethodsList;
-    typedef typename ToSeq<T_FrameExtensionList>::type FrameExtensionList;
-    typedef ParticleDescription<
-        Name,
-        SuperCellSize,
-        ValueTypeSeq,
-        FlagsList,
-        HandleGuardRegion,
-        MethodsList,
-        FrameExtensionList
-    > ThisType;
-
-};
+    /** ParticleDescription defines attributes, methods and flags of a particle
+     *
+     * This class holds no runtime data.
+     * The class holds information about the name, attributes, flags and methods of a
+     * particle.
+     *
+     * @tparam T_Name name of described particle (e.g. electron, ion)
+     *                type must be a boost::mpl::string
+     * @tparam T_SuperCellSize compile time size of a super cell
+     * @tparam T_ValueTypeSeq sequence or single type with value_identifier
+     * @tparam T_Flags sequence or single type with identifier to add flags on a frame
+     * @tparam T_MethodsList sequence or single class with particle methods
+     *                       (e.g. calculate mass, gamma, ...)
+     *                       (e.g. useSolverXY, calcRadiation, ...)
+     * @tparam T_FrameExtensionList sequence or single class with frame extensions
+     *                    - extension must be a unary template class that supports bmpl::apply1<>
+     *                    - type of the final frame is applied to each extension class
+     *                      (this allows pointers and references to a frame itself)
+     *                    - the final frame that uses ParticleDescription inherits from all
+     *                      extension classes
+     */
+    template<
+        typename T_Name,
+        typename T_SuperCellSize,
+        typename T_ValueTypeSeq,
+        typename T_Flags = bmpl::vector0<>,
+        typename T_HandleGuardRegion
+        = HandleGuardRegion<particles::policies::ExchangeParticles, particles::policies::DeleteParticles>,
+        typename T_MethodsList = bmpl::vector0<>,
+        typename T_FrameExtensionList = bmpl::vector0<>>
+    struct ParticleDescription
+    {
+        typedef T_Name Name;
+        typedef T_SuperCellSize SuperCellSize;
+        typedef typename ToSeq<T_ValueTypeSeq>::type ValueTypeSeq;
+        typedef typename ToSeq<T_Flags>::type FlagsList;
+        typedef T_HandleGuardRegion HandleGuardRegion;
+        typedef typename ToSeq<T_MethodsList>::type MethodsList;
+        typedef typename ToSeq<T_FrameExtensionList>::type FrameExtensionList;
+        typedef ParticleDescription<
+            Name,
+            SuperCellSize,
+            ValueTypeSeq,
+            FlagsList,
+            HandleGuardRegion,
+            MethodsList,
+            FrameExtensionList>
+            ThisType;
+    };
 
 
-/** Get ParticleDescription with a new ValueTypeSeq
- *
- * @tparam T_OldParticleDescription base description
- * @tparam T_NewValueTypeSeq new boost mpl sequence with value types
- * @treturn ::type new ParticleDescription
- */
-template<typename T_OldParticleDescription, typename T_NewValueTypeSeq>
-struct ReplaceValueTypeSeq
-{
-    typedef T_OldParticleDescription OldParticleDescription;
-    typedef ParticleDescription<
-    typename OldParticleDescription::Name,
-    typename OldParticleDescription::SuperCellSize,
-    typename ToSeq<T_NewValueTypeSeq>::type,
-    typename OldParticleDescription::FlagsList,
-    typename OldParticleDescription::HandleGuardRegion,
-    typename OldParticleDescription::MethodsList,
-    typename OldParticleDescription::FrameExtensionList
-    > type;
-};
+    /** Get ParticleDescription with a new ValueTypeSeq
+     *
+     * @tparam T_OldParticleDescription base description
+     * @tparam T_NewValueTypeSeq new boost mpl sequence with value types
+     * @treturn ::type new ParticleDescription
+     */
+    template<typename T_OldParticleDescription, typename T_NewValueTypeSeq>
+    struct ReplaceValueTypeSeq
+    {
+        typedef T_OldParticleDescription OldParticleDescription;
+        typedef ParticleDescription<
+            typename OldParticleDescription::Name,
+            typename OldParticleDescription::SuperCellSize,
+            typename ToSeq<T_NewValueTypeSeq>::type,
+            typename OldParticleDescription::FlagsList,
+            typename OldParticleDescription::HandleGuardRegion,
+            typename OldParticleDescription::MethodsList,
+            typename OldParticleDescription::FrameExtensionList>
+            type;
+    };
 
-/** Get ParticleDescription with a new FrameExtensionSeq
- *
- * @tparam T_OldParticleDescription base description
- * @tparam T_FrameExtensionSeq new boost mpl sequence with value types
- * @treturn ::type new ParticleDescription
- */
-template<typename T_OldParticleDescription, typename T_FrameExtensionSeq>
-struct ReplaceFrameExtensionSeq
-{
-    typedef T_OldParticleDescription OldParticleDescription;
-    typedef ParticleDescription<
-    typename OldParticleDescription::Name,
-    typename OldParticleDescription::SuperCellSize,
-    typename OldParticleDescription::ValueTypeSeq,
-    typename OldParticleDescription::FlagsList,
-    typename OldParticleDescription::HandleGuardRegion,
-    typename OldParticleDescription::MethodsList,
-    typename ToSeq<T_FrameExtensionSeq>::type
-    > type;
-};
+    /** Get ParticleDescription with a new FrameExtensionSeq
+     *
+     * @tparam T_OldParticleDescription base description
+     * @tparam T_FrameExtensionSeq new boost mpl sequence with frame extensions
+     * @treturn ::type new ParticleDescription
+     */
+    template<typename T_OldParticleDescription, typename T_FrameExtensionSeq>
+    struct ReplaceFrameExtensionSeq
+    {
+        typedef T_OldParticleDescription OldParticleDescription;
+        typedef ParticleDescription<
+            typename OldParticleDescription::Name,
+            typename OldParticleDescription::SuperCellSize,
+            typename OldParticleDescription::ValueTypeSeq,
+            typename OldParticleDescription::FlagsList,
+            typename OldParticleDescription::HandleGuardRegion,
+            typename OldParticleDescription::MethodsList,
+            typename ToSeq<T_FrameExtensionSeq>::type>
+            type;
+    };
 
-} //namespace pmacc
+} // namespace pmacc
diff --git a/include/pmacc/particles/ParticlesBase.hpp b/include/pmacc/particles/ParticlesBase.hpp
index 3275a88db8..45ab0cd59e 100644
--- a/include/pmacc/particles/ParticlesBase.hpp
+++ b/include/pmacc/particles/ParticlesBase.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Felix Schmitt, Rene Widera, Benjamin Worpitz,
+/* Copyright 2013-2021 Felix Schmitt, Rene Widera, Benjamin Worpitz,
  *                     Alexander Grund
  *
  * This file is part of PMacc.
@@ -39,174 +39,164 @@
 
 namespace pmacc
 {
+    /* Tag used for marking particle types */
+    struct ParticlesTag;
 
-/* Tag used for marking particle types */
-struct ParticlesTag;
-
-template<typename T_ParticleDescription, class T_MappingDesc, typename T_DeviceHeap>
-class ParticlesBase : public SimulationFieldHelper<T_MappingDesc>
-{
-    typedef T_ParticleDescription ParticleDescription;
-    typedef T_MappingDesc MappingDesc;
-
-public:
-
-    /* Type of used particles buffer
-     */
-    typedef ParticlesBuffer<ParticleDescription, typename MappingDesc::SuperCellSize, T_DeviceHeap, MappingDesc::Dim> BufferType;
-
-    /* Type of frame in particles buffer
-     */
-    typedef typename BufferType::FrameType FrameType;
-    /* Type of border frame in a particle buffer
-     */
-    typedef typename BufferType::FrameTypeBorder FrameTypeBorder;
-
-    /* Type of the particle box which particle buffer create
-     */
-    typedef typename BufferType::ParticlesBoxType ParticlesBoxType;
-
-    /* Policies for handling particles in guard cells */
-    typedef typename ParticleDescription::HandleGuardRegion HandleGuardRegion;
-
-    enum
+    template<typename T_ParticleDescription, class T_MappingDesc, typename T_DeviceHeap>
+    class ParticlesBase : public SimulationFieldHelper<T_MappingDesc>
     {
-        Dim = MappingDesc::Dim,
-        Exchanges = traits::NumberOfExchanges<Dim>::value,
-        TileSize = math::CT::volume<typename MappingDesc::SuperCellSize>::type::value
-    };
-
-    /* Mark this simulation data as a particle type */
-    typedef ParticlesTag SimulationDataTag;
+        typedef T_ParticleDescription ParticleDescription;
+        typedef T_MappingDesc MappingDesc;
+
+    public:
+        /* Type of used particles buffer
+         */
+        typedef ParticlesBuffer<
+            ParticleDescription,
+            typename MappingDesc::SuperCellSize,
+            T_DeviceHeap,
+            MappingDesc::Dim>
+            BufferType;
+
+        /* Type of frame in particles buffer
+         */
+        typedef typename BufferType::FrameType FrameType;
+        /* Type of border frame in a particle buffer
+         */
+        typedef typename BufferType::FrameTypeBorder FrameTypeBorder;
+
+        /* Type of the particle box that the particle buffer creates
+         */
+        typedef typename BufferType::ParticlesBoxType ParticlesBoxType;
+
+        /* Policies for handling particles in guard cells */
+        typedef typename ParticleDescription::HandleGuardRegion HandleGuardRegion;
+
+        enum
+        {
+            Dim = MappingDesc::Dim,
+            Exchanges = traits::NumberOfExchanges<Dim>::value,
+            TileSize = math::CT::volume<typename MappingDesc::SuperCellSize>::type::value
+        };
 
-protected:
+        /* Mark this simulation data as a particle type */
+        typedef ParticlesTag SimulationDataTag;
 
-    BufferType *particlesBuffer;
+    protected:
+        BufferType* particlesBuffer;
 
-    ParticlesBase(
-        const std::shared_ptr<T_DeviceHeap>& deviceHeap,
-        MappingDesc description
-    ) :
-        SimulationFieldHelper<MappingDesc>(description),
-        particlesBuffer(NULL)
-    {
-        particlesBuffer = new BufferType(
-            deviceHeap,
-            description.getGridLayout().getDataSpace(),
-            MappingDesc::SuperCellSize::toRT()
-        );
-    }
-
-    virtual ~ParticlesBase()
-    {
-        delete this->particlesBuffer;
-    }
-
-    /* Shift all particle in a AREA
-     * @tparam AREA area which is used (CORE,BORDER,GUARD or a combination)
-     */
-    template<uint32_t AREA>
-    void shiftParticles()
-    {
-        StrideMapping<AREA, 3, MappingDesc> mapper(this->cellDescription);
-        ParticlesBoxType pBox = particlesBuffer->getDeviceParticleBox();
-
-        constexpr uint32_t numWorkers = traits::GetNumWorkers<
-            math::CT::volume<typename FrameType::SuperCellSize>::type::value
-        >::value;
-        __startTransaction(__getTransactionEvent());
-        do
+        ParticlesBase(const std::shared_ptr<T_DeviceHeap>& deviceHeap, MappingDesc description)
+            : SimulationFieldHelper<MappingDesc>(description)
+            , particlesBuffer(NULL)
         {
-            PMACC_KERNEL(KernelShiftParticles< numWorkers >{})
-                (mapper.getGridDim(), numWorkers)
-                (pBox, mapper);
+            particlesBuffer = new BufferType(
+                deviceHeap,
+                description.getGridLayout().getDataSpace(),
+                MappingDesc::SuperCellSize::toRT());
         }
-        while (mapper.next());
-
-        __setTransactionEvent(__endTransaction());
 
-    }
+        virtual ~ParticlesBase()
+        {
+            delete this->particlesBuffer;
+        }
 
-    /* fill gaps in a AREA
-     * @tparam AREA area which is used (CORE,BORDER,GUARD or a combination)
-     */
-    template<uint32_t AREA>
-    void fillGaps()
-    {
-        AreaMapping<AREA, MappingDesc> mapper(this->cellDescription);
+        /* Shift all particles in an AREA
+         * @tparam AREA area which is used (CORE,BORDER,GUARD or a combination)
+         */
+        template<uint32_t AREA>
+        void shiftParticles()
+        {
+            StrideMapping<AREA, 3, MappingDesc> mapper(this->cellDescription);
+            ParticlesBoxType pBox = particlesBuffer->getDeviceParticleBox();
+
+            constexpr uint32_t numWorkers
+                = traits::GetNumWorkers<math::CT::volume<typename FrameType::SuperCellSize>::type::value>::value;
+            __startTransaction(__getTransactionEvent());
+            do
+            {
+                PMACC_KERNEL(KernelShiftParticles<numWorkers>{})
+                (mapper.getGridDim(), numWorkers)(pBox, mapper);
+            } while(mapper.next());
+
+            __setTransactionEvent(__endTransaction());
+        }
 
-        constexpr uint32_t numWorkers = traits::GetNumWorkers<
-            math::CT::volume<typename FrameType::SuperCellSize>::type::value
-        >::value;
+        /* fill gaps in an AREA
+         * @tparam AREA area which is used (CORE,BORDER,GUARD or a combination)
+         */
+        template<uint32_t AREA>
+        void fillGaps()
+        {
+            AreaMapping<AREA, MappingDesc> mapper(this->cellDescription);
 
-        PMACC_KERNEL(KernelFillGaps< numWorkers >{})
-            (mapper.getGridDim(), numWorkers)
-            (particlesBuffer->getDeviceParticleBox(), mapper);
-    }
+            constexpr uint32_t numWorkers
+                = traits::GetNumWorkers<math::CT::volume<typename FrameType::SuperCellSize>::type::value>::value;
 
+            PMACC_KERNEL(KernelFillGaps<numWorkers>{})
+            (mapper.getGridDim(), numWorkers)(particlesBuffer->getDeviceParticleBox(), mapper);
+        }
 
-public:
 
-    /* fill gaps in a the complete simulation area (include GUARD)
-     */
-    void fillAllGaps()
-    {
-        this->fillGaps < CORE + BORDER + GUARD > ();
-    }
+    public:
+        /* fill gaps in the complete simulation area (including GUARD)
+         */
+        void fillAllGaps()
+        {
+            this->fillGaps<CORE + BORDER + GUARD>();
+        }
 
-    /* fill all gaps in the border of the simulation
-     */
-    void fillBorderGaps()
-    {
-        this->fillGaps < BORDER > ();
-    }
-
-    /* Delete all particles in GUARD for one direction.
-     */
-    void deleteGuardParticles(uint32_t exchangeType);
-
-    /* Delete all particle in an area*/
-    template<uint32_t T_area>
-    void deleteParticlesInArea();
-
-    /** copy guard particles to intermediate exchange buffer
-     *
-     * Copy all particles from the guard of a direction to the device exchange buffer.
-     * @warning This method resets the number of particles in the processed supercells even
-     * if there are particles left in the supercell and does not guarantee that the last frame is
-     * contiguous filled.
-     * Call fillAllGaps afterwards if you need a valid number of particles
-     * and a contiguously filled last frame.
-     */
-    void copyGuardToExchange(uint32_t exchangeType);
-
-    /* Insert all particles which are in device exchange buffer
-     */
-    void insertParticles(uint32_t exchangeType);
-
-    ParticlesBoxType getDeviceParticlesBox()
-    {
-        return particlesBuffer->getDeviceParticleBox();
-    }
+        /* fill all gaps in the border of the simulation
+         */
+        void fillBorderGaps()
+        {
+            this->fillGaps<BORDER>();
+        }
 
-    ParticlesBoxType getHostParticlesBox(const int64_t memoryOffset)
-    {
-        return particlesBuffer->getHostParticleBox(memoryOffset);
-    }
+        /* Delete all particles in GUARD for one direction.
+         */
+        void deleteGuardParticles(uint32_t exchangeType);
+
+        /* Delete all particles in an area */
+        template<uint32_t T_area>
+        void deleteParticlesInArea();
+
+        /** copy guard particles to intermediate exchange buffer
+         *
+         * Copy all particles from the guard of a direction to the device exchange buffer.
+         * @warning This method resets the number of particles in the processed supercells even
+         * if there are particles left in the supercell and does not guarantee that the last frame is
+         * contiguously filled.
+         * Call fillAllGaps afterwards if you need a valid number of particles
+         * and a contiguously filled last frame.
+         */
+        void copyGuardToExchange(uint32_t exchangeType);
+
+        /* Insert all particles which are in device exchange buffer
+         */
+        void insertParticles(uint32_t exchangeType);
+
+        ParticlesBoxType getDeviceParticlesBox()
+        {
+            return particlesBuffer->getDeviceParticleBox();
+        }
 
-    /* Get the particles buffer which is used for the particles.
-     */
-    BufferType& getParticlesBuffer()
-    {
-        PMACC_ASSERT(particlesBuffer != nullptr);
-        return *particlesBuffer;
-    }
+        ParticlesBoxType getHostParticlesBox(const int64_t memoryOffset)
+        {
+            return particlesBuffer->getHostParticleBox(memoryOffset);
+        }
 
-    /* set all internal objects to initial state*/
-    virtual void reset(uint32_t currentStep);
+        /* Get the particles buffer which is used for the particles.
+         */
+        BufferType& getParticlesBuffer()
+        {
+            PMACC_ASSERT(particlesBuffer != nullptr);
+            return *particlesBuffer;
+        }
 
-};
+        /* set all internal objects to initial state*/
+        virtual void reset(uint32_t currentStep);
+    };
 
-} //namespace pmacc
+} // namespace pmacc
 
 #include "pmacc/particles/ParticlesBase.tpp"
diff --git a/include/pmacc/particles/ParticlesBase.kernel b/include/pmacc/particles/ParticlesBase.kernel
index a493e7926a..5304e61804 100644
--- a/include/pmacc/particles/ParticlesBase.kernel
+++ b/include/pmacc/particles/ParticlesBase.kernel
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Felix Schmitt, Heiko Burau, Rene Widera
+/* Copyright 2013-2021 Felix Schmitt, Heiko Burau, Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -40,682 +40,423 @@
 
 namespace pmacc
 {
+    template<typename T_ParticleBox, typename T_SuperCellIdxType, typename T_Acc>
+    DINLINE typename T_ParticleBox::FramePtr getPreviousFrameAndRemoveLastFrame(
+        const T_Acc& acc,
+        const typename T_ParticleBox::FramePtr& frame,
+        T_ParticleBox& pb,
+        const T_SuperCellIdxType& superCellIdx)
+    {
+        typename T_ParticleBox::FramePtr result = pb.getPreviousFrame(frame);
+        pb.removeLastFrame(acc, superCellIdx);
+        return result;
+    }
 
-template<typename T_ParticleBox, typename T_SuperCellIdxType>
-DINLINE typename T_ParticleBox::FramePtr
-getPreviousFrameAndRemoveLastFrame( const typename T_ParticleBox::FramePtr& frame,
-                                    T_ParticleBox& pb,
-                                    const T_SuperCellIdxType& superCellIdx )
-{
-    typename T_ParticleBox::FramePtr result = pb.getPreviousFrame( frame );
-    pb.removeLastFrame( superCellIdx );
-    return result;
-}
-
-/** fill particle gaps in the last frame
- *
- * Copy all particles in a frame to the storage places at the frame's beginning.
- * This leaves the frame with a contiguous number of valid particles at
- * the beginning and a subsequent, contiguous gap at the end.
- *
- * @tparam T_numWorkers number of workers
- */
-template< uint32_t T_numWorkers >
-struct KernelFillGapsLastFrame
-{
-    /** fill particle gaps
+    /** fill particle gaps in the last frame
      *
-     * @tparam T_ParBox pmacc::ParticlesBox, particle box type
-     * @tparam T_Mapping mapper functor type
+     * Copy all particles in a frame to the storage places at the frame's beginning.
+     * This leaves the frame with a contiguous number of valid particles at
+     * the beginning and a subsequent, contiguous gap at the end.
      *
-     * @param boxPar particle memory
-     * @param mapper functor to map a block to a supercell
+     * @tparam T_numWorkers number of workers
      */
-    template<
-        typename T_ParBox,
-        typename T_Mapping,
-        typename T_Acc
-    >
-    DINLINE void operator()(
-        T_Acc const & acc,
-        T_ParBox pb,
-        T_Mapping mapper
-    ) const
+    template<uint32_t T_numWorkers>
+    struct KernelFillGapsLastFrame
     {
-        using namespace particles::operations;
-        using namespace mappings::threads;
-
-        constexpr uint32_t frameSize = math::CT::volume<typename T_Mapping::SuperCellSize>::type::value;
-        constexpr uint32_t dim = T_Mapping::Dim;
-        constexpr uint32_t numWorkers = T_numWorkers;
-
-        using FramePtr = typename T_ParBox::FramePtr;
-
-        DataSpace< dim > const superCellIdx = mapper.getSuperCellIndex( DataSpace< dim > ( blockIdx ) );
-
-        PMACC_SMEM(
-            acc,
-            lastFrame,
-            FramePtr
-        );
-        PMACC_SMEM(
-            acc,
-            gapIndices_sh,
-            memory::Array<
-                int,
-                frameSize
-            >
-        );
-        PMACC_SMEM(
-            acc,
-            numGaps,
-            int
-        );
-        PMACC_SMEM(
-            acc,
-            numParticles,
-            int
-        );
-        PMACC_SMEM(
-            acc,
-            srcGap,
-            int
-        );
-
-        uint32_t const workerIdx = threadIdx.x;
-
-        using MasterOnly = IdxConfig<
-            1,
-            numWorkers
-        >;
-
-        ForEachIdx< MasterOnly >{ workerIdx }(
-            [&](
-                uint32_t const,
-                uint32_t const
-            )
-            {
-                lastFrame = pb.getLastFrame( superCellIdx );
+        /** fill particle gaps
+         *
+         * @tparam T_ParBox pmacc::ParticlesBox, particle box type
+         * @tparam T_Mapping mapper functor type
+         *
+         * @param pb particle memory
+         * @param mapper functor to map a block to a supercell
+         */
+        template<typename T_ParBox, typename T_Mapping, typename T_Acc>
+        DINLINE void operator()(T_Acc const& acc, T_ParBox pb, T_Mapping mapper) const
+        {
+            using namespace particles::operations;
+            using namespace mappings::threads;
+
+            constexpr uint32_t frameSize = math::CT::volume<typename T_Mapping::SuperCellSize>::type::value;
+            constexpr uint32_t dim = T_Mapping::Dim;
+            constexpr uint32_t numWorkers = T_numWorkers;
+
+            using FramePtr = typename T_ParBox::FramePtr;
+
+            DataSpace<dim> const superCellIdx = mapper.getSuperCellIndex(DataSpace<dim>(cupla::blockIdx(acc)));
+
+            PMACC_SMEM(acc, lastFrame, FramePtr);
+            PMACC_SMEM(acc, gapIndices_sh, memory::Array<int, frameSize>);
+            PMACC_SMEM(acc, numGaps, int);
+            PMACC_SMEM(acc, numParticles, int);
+            PMACC_SMEM(acc, srcGap, int);
+
+            uint32_t const workerIdx = cupla::threadIdx(acc).x;
+
+            using MasterOnly = IdxConfig<1, numWorkers>;
+
+            ForEachIdx<MasterOnly>{workerIdx}([&](uint32_t const, uint32_t const) {
+                lastFrame = pb.getLastFrame(superCellIdx);
                 numGaps = 0;
                 numParticles = 0;
                 srcGap = 0;
-            }
-        );
+            });
 
-        __syncthreads( );
+            cupla::__syncthreads(acc);
 
-        if ( lastFrame.isValid( ) )
-        {
-            using ParticleDomCfg = IdxConfig<
-                frameSize,
-                numWorkers
-            >;
-
-            /* context if an element within the frame is a particle */
-            memory::CtxArray<
-                bool,
-                ParticleDomCfg
-            >
-            isParticleCtx(
-                workerIdx,
-                [&](
-                    uint32_t const linearIdx,
-                    uint32_t const
-                )
-                {
-                    return lastFrame[ linearIdx ][ multiMask_ ];
-                }
-            );
+            if(lastFrame.isValid())
+            {
+                using ParticleDomCfg = IdxConfig<frameSize, numWorkers>;
 
-            /* loop over all particles in the frame */
-            ForEachIdx< ParticleDomCfg > forEachParticle( workerIdx );
+                /* context if an element within the frame is a particle */
+                memory::CtxArray<bool, ParticleDomCfg> isParticleCtx(
+                    workerIdx,
+                    [&](uint32_t const linearIdx, uint32_t const) { return lastFrame[linearIdx][multiMask_]; });
 
-            // count particles in last frame
-            forEachParticle(
-                [&](
-                    uint32_t const,
-                    uint32_t const idx
-                )
-                {
-                    if( isParticleCtx[ idx ] )
-                        nvidia::atomicAllInc( acc, &numParticles, ::alpaka::hierarchy::Threads{} );
-                }
-            );
+                /* loop over all particles in the frame */
+                ForEachIdx<ParticleDomCfg> forEachParticle(workerIdx);
 
-            __syncthreads( );
+                // count particles in last frame
+                forEachParticle([&](uint32_t const, uint32_t const idx) {
+                    if(isParticleCtx[idx])
+                        nvidia::atomicAllInc(acc, &numParticles, ::alpaka::hierarchy::Threads{});
+                });
 
-            forEachParticle(
-                [&](
-                    uint32_t const linearIdx,
-                    uint32_t const idx
-                )
-                {
-                    if ( linearIdx < numParticles && isParticleCtx[ idx ] == false )
+                cupla::__syncthreads(acc);
+
+                forEachParticle([&](uint32_t const linearIdx, uint32_t const idx) {
+                    if(linearIdx < numParticles && isParticleCtx[idx] == false)
                     {
-                        int const localGapIdx = nvidia::atomicAllInc( acc, &numGaps, ::alpaka::hierarchy::Threads{} );
-                        gapIndices_sh[ localGapIdx ] = linearIdx;
+                        int const localGapIdx = nvidia::atomicAllInc(acc, &numGaps, ::alpaka::hierarchy::Threads{});
+                        gapIndices_sh[localGapIdx] = linearIdx;
                     }
-                }
-            );
-            __syncthreads( );
-            forEachParticle(
-                [&](
-                    uint32_t const linearIdx,
-                    uint32_t const idx
-                )
-                {
-                    if ( linearIdx >= numParticles && isParticleCtx[ idx ] )
+                });
+                cupla::__syncthreads(acc);
+                forEachParticle([&](uint32_t const linearIdx, uint32_t const idx) {
+                    if(linearIdx >= numParticles && isParticleCtx[idx])
                     {
                         // any particle search a gap
-                        int const srcGapIdx = nvidia::atomicAllInc( acc, &srcGap, ::alpaka::hierarchy::Threads{} );
-                        int const gapIdx = gapIndices_sh[ srcGapIdx ];
-                        auto parDestFull = lastFrame[ gapIdx ];
+                        int const srcGapIdx = nvidia::atomicAllInc(acc, &srcGap, ::alpaka::hierarchy::Threads{});
+                        int const gapIdx = gapIndices_sh[srcGapIdx];
+                        auto parDestFull = lastFrame[gapIdx];
                         /* enable particle */
-                        parDestFull[ multiMask_ ] = 1;
+                        parDestFull[multiMask_] = 1;
                         /* we do not update the multiMask because copying from mem to mem is too slow
                          * we have to enabled particles explicitly
                          */
-                        auto parDest = deselect< multiMask >( parDestFull );
-                        auto parSrc = ( lastFrame[ linearIdx ] );
-                        assign( parDest, parSrc );
+                        auto parDest = deselect<multiMask>(parDestFull);
+                        auto parSrc = (lastFrame[linearIdx]);
+                        assign(parDest, parSrc);
                         parSrc[multiMask_] = 0; // delete old particle
                     }
-                }
-            );
-        }
-        ForEachIdx< MasterOnly >{ workerIdx }(
-            [&](
-                uint32_t const,
-                uint32_t const
-            )
-            {
+                });
+            }
+            ForEachIdx<MasterOnly>{workerIdx}([&](uint32_t const, uint32_t const) {
                 // there is no need to add a zero to the global memory
-                if( numParticles != 0 )
+                if(numParticles != 0)
                 {
-                    auto & superCell = pb.getSuperCell( superCellIdx );
-                    superCell.setNumParticles(
-                        superCell.getNumParticles() + numParticles
-                    );
+                    auto& superCell = pb.getSuperCell(superCellIdx);
+                    superCell.setNumParticles(superCell.getNumParticles() + numParticles);
                 }
                 else
                 {
                     /* The last frame is empty therefore it must be removed.
                      * It is save to call this method even if there is no last frame.
                      */
-                    pb.removeLastFrame( superCellIdx );
+                    pb.removeLastFrame(acc, superCellIdx);
                 }
-            }
-        );
-    }
-};
+            });
+        }
+    };
 
-/** fill particle gaps in all frames
- *
- * Copy all particles from the end to the gaps at the beginning of the frame list.
- * The functor fulfills the restriction that the last frame must be hold a contiguous
- * number of valid particles at the beginning and a subsequent, contiguous gap at the end.
- *
- * @tparam T_numWorkers number of workers
- */
-template< uint32_t T_numWorkers >
-struct KernelFillGaps
-{
-    /** fill particle gaps
+    /** fill particle gaps in all frames
      *
-     * @tparam T_ParBox pmacc::ParticlesBox, particle box type
-     * @tparam T_Mapping mapper functor type
+     * Copy all particles from the end to the gaps at the beginning of the frame list.
+     * The functor fulfills the restriction that the last frame must hold a contiguous
+     * number of valid particles at the beginning and a subsequent, contiguous gap at the end.
      *
-     * @param pb particle memory
-     * @param mapper functor to map a block to a supercell
+     * @tparam T_numWorkers number of workers
      */
-    template<
-        typename T_ParBox,
-        typename T_Mapping,
-        typename T_Acc
-    >
-    DINLINE void operator()(
-        T_Acc const & acc,
-        T_ParBox pb,
-        T_Mapping const mapper
-    ) const
+    template<uint32_t T_numWorkers>
+    struct KernelFillGaps
     {
-        using namespace particles::operations;
-        using namespace mappings::threads;
-
-        using FramePtr = typename T_ParBox::FramePtr;
-
-        constexpr uint32_t frameSize = math::CT::volume< typename T_ParBox::FrameType::SuperCellSize >::type::value;
-        constexpr uint32_t dim = T_Mapping::Dim;
-        constexpr uint32_t numWorkers = T_numWorkers;
-
-        uint32_t const workerIdx = threadIdx.x;
-
-        DataSpace< dim > const superCellIdx( mapper.getSuperCellIndex( DataSpace< dim >( blockIdx ) ) );
-
-        // data copied from right (last) to left (first)
-        PMACC_SMEM(
-            acc,
-            firstFrame,
-            FramePtr
-        );
-        PMACC_SMEM(
-            acc,
-            lastFrame,
-            FramePtr
-        );
-
-        PMACC_SMEM(
-            acc,
-            particleIndices_sh,
-            memory::Array<
-                int,
-                frameSize
-            >
-        );
-        // number of gaps in firstFrame frame
-        PMACC_SMEM(
-            acc,
-            numGaps,
-            int
-        );
-        // number of particles in the lastFrame
-        PMACC_SMEM(
-            acc,
-            numParticles,
-            int
-        );
-
-        uint32_t numParticlesPerSuperCell = 0u;
-
-        ForEachIdx<
-            IdxConfig<
-                1,
-                numWorkers
-            >
-        > onlyMaster{ workerIdx };
-
-        onlyMaster(
-            [&](
-                uint32_t const,
-                uint32_t const
-            )
-            {
-                firstFrame = pb.getFirstFrame( superCellIdx );
-                lastFrame = pb.getLastFrame( superCellIdx );
-            }
-        );
+        /** fill particle gaps
+         *
+         * @tparam T_ParBox pmacc::ParticlesBox, particle box type
+         * @tparam T_Mapping mapper functor type
+         *
+         * @param pb particle memory
+         * @param mapper functor to map a block to a supercell
+         */
+        template<typename T_ParBox, typename T_Mapping, typename T_Acc>
+        DINLINE void operator()(T_Acc const& acc, T_ParBox pb, T_Mapping const mapper) const
+        {
+            using namespace particles::operations;
+            using namespace mappings::threads;
 
-        __syncthreads( );
+            using FramePtr = typename T_ParBox::FramePtr;
 
-        while ( firstFrame.isValid( ) && firstFrame != lastFrame )
-        {
-            onlyMaster(
-                [&](
-                    uint32_t const,
-                    uint32_t const
-                )
-                {
+            constexpr uint32_t frameSize = math::CT::volume<typename T_ParBox::FrameType::SuperCellSize>::type::value;
+            constexpr uint32_t dim = T_Mapping::Dim;
+            constexpr uint32_t numWorkers = T_numWorkers;
+
+            uint32_t const workerIdx = cupla::threadIdx(acc).x;
+
+            DataSpace<dim> const superCellIdx(mapper.getSuperCellIndex(DataSpace<dim>(cupla::blockIdx(acc))));
+
+            // data copied from right (last) to left (first)
+            PMACC_SMEM(acc, firstFrame, FramePtr);
+            PMACC_SMEM(acc, lastFrame, FramePtr);
+
+            PMACC_SMEM(acc, particleIndices_sh, memory::Array<int, frameSize>);
+            // number of gaps in firstFrame frame
+            PMACC_SMEM(acc, numGaps, int);
+            // number of particles in the lastFrame
+            PMACC_SMEM(acc, numParticles, int);
+
+            uint32_t numParticlesPerSuperCell = 0u;
+
+            ForEachIdx<IdxConfig<1, numWorkers>> onlyMaster{workerIdx};
+
+            onlyMaster([&](uint32_t const, uint32_t const) {
+                firstFrame = pb.getFirstFrame(superCellIdx);
+                lastFrame = pb.getLastFrame(superCellIdx);
+            });
+
+            cupla::__syncthreads(acc);
+
+            while(firstFrame.isValid() && firstFrame != lastFrame)
+            {
+                onlyMaster([&](uint32_t const, uint32_t const) {
                     numGaps = 0;
                     numParticles = 0;
-                }
-            );
+                });
 
-            __syncthreads( );
+                cupla::__syncthreads(acc);
 
-            using ParticleDomCfg = IdxConfig<
-                frameSize,
-                numWorkers
-            >;
-            // loop over all particles in the frame
-            ForEachIdx< ParticleDomCfg > forEachParticle( workerIdx );
-
-            memory::CtxArray<
-                int,
-                ParticleDomCfg
-            > localGapIdxCtx( INV_LOC_IDX );
-
-            // find gaps in firstFrame
-            forEachParticle(
-                [&](
-                    uint32_t const linearIdx,
-                    uint32_t const idx
-                )
-                {
-                    if( firstFrame[ linearIdx ][ multiMask_ ] == 0 )
+                using ParticleDomCfg = IdxConfig<frameSize, numWorkers>;
+                // loop over all particles in the frame
+                ForEachIdx<ParticleDomCfg> forEachParticle(workerIdx);
+
+                memory::CtxArray<int, ParticleDomCfg> localGapIdxCtx(INV_LOC_IDX);
+
+                // find gaps in firstFrame
+                forEachParticle([&](uint32_t const linearIdx, uint32_t const idx) {
+                    if(firstFrame[linearIdx][multiMask_] == 0)
                     {
-                        localGapIdxCtx[ idx ] = nvidia::atomicAllInc( acc, &numGaps, ::alpaka::hierarchy::Threads{} );
+                        localGapIdxCtx[idx] = nvidia::atomicAllInc(acc, &numGaps, ::alpaka::hierarchy::Threads{});
                     }
-                }
-            );
+                });
 
-            __syncthreads( );
+                cupla::__syncthreads(acc);
 
-            if( numGaps != 0 )
-            {
-                // count particles in lastFrame
-                forEachParticle(
-                    [&](
-                        uint32_t const linearIdx,
-                        uint32_t const idx
-                    )
-                    {
+                if(numGaps != 0)
+                {
+                    // count particles in lastFrame
+                    forEachParticle([&](uint32_t const linearIdx, uint32_t const idx) {
                         // search particles for gaps
-                        if( lastFrame[ linearIdx ][ multiMask_ ] == 1 )
+                        if(lastFrame[linearIdx][multiMask_] == 1)
                         {
-                            int const localParticleIdx = nvidia::atomicAllInc( acc, &numParticles, ::alpaka::hierarchy::Threads{} );
-                            particleIndices_sh[ localParticleIdx ] = linearIdx;
+                            int const localParticleIdx
+                                = nvidia::atomicAllInc(acc, &numParticles, ::alpaka::hierarchy::Threads{});
+                            particleIndices_sh[localParticleIdx] = linearIdx;
                         }
-                    }
-                );
+                    });
 
-                __syncthreads( );
+                    cupla::__syncthreads(acc);
 
-                // copy particles from lastFrame to the gaps in firstFrame
-                forEachParticle(
-                    [&](
-                        uint32_t const linearIdx,
-                        uint32_t const idx
-                    )
-                    {
-                        if( localGapIdxCtx[ idx ] < numParticles )
+                    // copy particles from lastFrame to the gaps in firstFrame
+                    forEachParticle([&](uint32_t const linearIdx, uint32_t const idx) {
+                        if(localGapIdxCtx[idx] < numParticles)
                         {
-                            int const parIdx = particleIndices_sh[ localGapIdxCtx[ idx ] ];
-                            auto parDestFull = firstFrame[ linearIdx ];
+                            int const parIdx = particleIndices_sh[localGapIdxCtx[idx]];
+                            auto parDestFull = firstFrame[linearIdx];
                             // enable particle
-                            parDestFull[ multiMask_ ] = 1;
+                            parDestFull[multiMask_] = 1;
                             /* we not update multiMask because copy from mem to mem is to slow
                              * we have enabled particle explicit
                              */
-                            auto parDest = deselect< multiMask >( parDestFull );
-                            auto parSrc = lastFrame[ parIdx ];
-                            assign( parDest, parSrc );
-                            parSrc[ multiMask_ ] = 0;
+                            auto parDest = deselect<multiMask>(parDestFull);
+                            auto parSrc = lastFrame[parIdx];
+                            assign(parDest, parSrc);
+                            parSrc[multiMask_] = 0;
                         }
-                    }
-                );
+                    });
 
-                __syncthreads( );
+                    cupla::__syncthreads(acc);
 
-                onlyMaster(
-                    [&](
-                        uint32_t const,
-                        uint32_t const
-                    )
-                    {
-                        if( numGaps < numParticles )
+                    onlyMaster([&](uint32_t const, uint32_t const) {
+                        if(numGaps < numParticles)
                         {
                             numParticlesPerSuperCell += frameSize;
                             // any gap in the first frame is filled
-                            firstFrame = pb.getNextFrame( firstFrame );
+                            firstFrame = pb.getNextFrame(firstFrame);
                         }
-                        else if( numGaps > numParticles )
+                        else if(numGaps > numParticles)
                         {
                             // we need more particles
-                            lastFrame = getPreviousFrameAndRemoveLastFrame(
-                                lastFrame,
-                                pb,
-                                superCellIdx
-                            );
+                            lastFrame = getPreviousFrameAndRemoveLastFrame(acc, lastFrame, pb, superCellIdx);
                         }
-                        else if( numGaps == numParticles )
+                        else if(numGaps == numParticles)
                         {
                             // update lastFrame and firstFrame
-                            lastFrame = getPreviousFrameAndRemoveLastFrame(
-                                lastFrame,
-                                pb,
-                                superCellIdx
-                            );
-                            if( lastFrame.isValid( ) && lastFrame != firstFrame )
+                            lastFrame = getPreviousFrameAndRemoveLastFrame(acc, lastFrame, pb, superCellIdx);
+                            if(lastFrame.isValid() && lastFrame != firstFrame)
                             {
                                 numParticlesPerSuperCell += frameSize;
-                                firstFrame = pb.getNextFrame( firstFrame );
+                                firstFrame = pb.getNextFrame(firstFrame);
                             }
                         }
-                    }
-                );
-            }
-            else
-            {
-                // there are no gaps in firstFrame, goto to next frame
-                onlyMaster(
-                    [&](
-                        uint32_t const,
-                        uint32_t const
-                    )
-                    {
+                    });
+                }
+                else
+                {
+                    // there are no gaps in firstFrame, go to the next frame
+                    onlyMaster([&](uint32_t const, uint32_t const) {
                         numParticlesPerSuperCell += frameSize;
-                        firstFrame = pb.getNextFrame( firstFrame );
-                    }
-                );
-            }
-
-            __syncthreads( );
+                        firstFrame = pb.getNextFrame(firstFrame);
+                    });
+                }
 
-        }
+                cupla::__syncthreads(acc);
+            }
 
-        onlyMaster(
-            [&](
-                uint32_t const,
-                uint32_t const
-            )
-            {
+            onlyMaster([&](uint32_t const, uint32_t const) {
                 /* numParticlesPerSuperCell is the number of particles in the
                  * supercell except the particles in the last frame
                  */
-                auto & superCell = pb.getSuperCell( superCellIdx );
-                superCell.setNumParticles( numParticlesPerSuperCell );
-            }
-        );
-
-        // fill all gaps in the last frame of the supercell
-        KernelFillGapsLastFrame< numWorkers >{ }(
-            acc,
-            pb,
-            mapper
-        );
-    }
-};
+                auto& superCell = pb.getSuperCell(superCellIdx);
+                superCell.setNumParticles(numParticlesPerSuperCell);
+            });
 
-/** shift particles leaving the supercell
- *
- * The functor fulfills the restriction that all frames except the last
- * must be fully filled with particles as can be stored in a frame.
- *
- * @tparam T_numWorkers number of workers
- */
-template< uint32_t T_numWorkers >
-struct KernelShiftParticles
-{
-    /** This kernel moves particles to the next supercell
+            // fill all gaps in the last frame of the supercell
+            KernelFillGapsLastFrame<numWorkers>{}(acc, pb, mapper);
+        }
+    };
+
+    /** shift particles leaving the supercell
      *
-     * @warning this kernel can only run with a double checker board
+     * The functor fulfills the restriction that all frames except the last
+     * must be filled with as many particles as can be stored in a frame.
+     *
+     * @tparam T_numWorkers number of workers
      */
-    template<
-        typename T_ParBox,
-        typename Mapping,
-        typename T_Acc
-    >
-    DINLINE void operator()(
-        T_Acc const & acc,
-        T_ParBox pb,
-        Mapping mapper
-    ) const
+    template<uint32_t T_numWorkers>
+    struct KernelShiftParticles
     {
-        using ParBox = T_ParBox;
-        using FrameType = typename ParBox::FrameType;
-        using FramePtr = typename ParBox::FramePtr;
-
-        PMACC_CONSTEXPR_CAPTURE uint32_t dim = Mapping::Dim;
-        constexpr uint32_t frameSize = math::CT::volume< typename FrameType::SuperCellSize >::type::value;
-        /* number exchanges in 2D=9 and in 3D=27 */
-        constexpr uint32_t numExchanges = traits::NumberOfExchanges< dim >::value;
-        constexpr uint32_t numWorkers = T_numWorkers;
-
-        /* define memory for two times Exchanges
-         * index range [0,numExchanges-1] are being referred to as `low frames`
-         * index range [numExchanges,2*numExchanges-1] are being referred to as `high frames`
+        /** This kernel moves particles to the next supercell
+         *
+         * @warning this kernel can only run with a double checker board
          */
-        PMACC_SMEM(
-            acc,
-            destFrames,
-            memory::Array<
-                FramePtr,
-                numExchanges * 2
-            >
-        );
-        //count particles per frame
-        PMACC_SMEM(
-            acc,
-            destFramesCounter,
-            memory::Array<
-                int,
-                numExchanges
-            >
-        );
-
-        PMACC_SMEM(
-            acc,
-            frame,
-            FramePtr
-        );
-        PMACC_SMEM(
-            acc,
-            mustShift,
-            bool
-        );
-
-        DataSpace< dim > superCellIdx = mapper.getSuperCellIndex( DataSpace< dim >( blockIdx ) );
-        uint32_t const workerIdx = threadIdx.x;
-
-        using namespace mappings::threads;
-
-        ForEachIdx<
-            IdxConfig<
-                1,
-                numWorkers
-            >
-        >{ workerIdx }(
-            [&](
-                uint32_t const,
-                uint32_t const
-            )
-            {
-                mustShift = pb.getSuperCell( superCellIdx ).mustShift( );
-                if ( mustShift )
+        template<typename T_ParBox, typename Mapping, typename T_Acc>
+        DINLINE void operator()(T_Acc const& acc, T_ParBox pb, Mapping mapper) const
+        {
+            using ParBox = T_ParBox;
+            using FrameType = typename ParBox::FrameType;
+            using FramePtr = typename ParBox::FramePtr;
+
+            PMACC_CONSTEXPR_CAPTURE uint32_t dim = Mapping::Dim;
+            constexpr uint32_t frameSize = math::CT::volume<typename FrameType::SuperCellSize>::type::value;
+            /* number of exchanges: 9 in 2D and 27 in 3D */
+            constexpr uint32_t numExchanges = traits::NumberOfExchanges<dim>::value;
+            constexpr uint32_t numWorkers = T_numWorkers;
+
+            /* define memory for twice the number of exchanges
+             * index range [0,numExchanges-1] are being referred to as `low frames`
+             * index range [numExchanges,2*numExchanges-1] are being referred to as `high frames`
+             */
+            PMACC_SMEM(acc, destFrames, memory::Array<FramePtr, numExchanges * 2>);
+            // count particles per frame
+            PMACC_SMEM(acc, destFramesCounter, memory::Array<int, numExchanges>);
+
+            PMACC_SMEM(acc, frame, FramePtr);
+            PMACC_SMEM(acc, mustShift, bool);
+
+            DataSpace<dim> superCellIdx = mapper.getSuperCellIndex(DataSpace<dim>(cupla::blockIdx(acc)));
+            uint32_t const workerIdx = cupla::threadIdx(acc).x;
+
+            using namespace mappings::threads;
+
+            ForEachIdx<IdxConfig<1, numWorkers>>{workerIdx}([&](uint32_t const, uint32_t const) {
+                mustShift = pb.getSuperCell(superCellIdx).mustShift();
+                if(mustShift)
                 {
-                    pb.getSuperCell( superCellIdx ).setMustShift( false );
-                    frame = pb.getFirstFrame( superCellIdx );
+                    pb.getSuperCell(superCellIdx).setMustShift(false);
+                    frame = pb.getFirstFrame(superCellIdx);
                 }
-            }
-        );
-
-        __syncthreads( );
-        if ( !mustShift || !frame.isValid( ) ) return;
-
-        using ExchangeDomCfg = IdxConfig<
-            numExchanges,
-            numWorkers
-        >;
-
-        memory::CtxArray<
-            int32_t,
-            ExchangeDomCfg
-        > newParticleInFrame( 0 );
-
-        memory::CtxArray<
-            DataSpace< dim >,
-            ExchangeDomCfg
-        > relativeCtx(
-            workerIdx,
-            [&](
-                uint32_t const linearIdx,
-                uint32_t const
-            )
-            -> DataSpace< dim >
-            {
-                return superCellIdx + Mask::getRelativeDirections< dim > ( linearIdx + 1);
-            }
-        );
+            });
 
-        ForEachIdx< ExchangeDomCfg > forEachExchange( workerIdx );
+            cupla::__syncthreads(acc);
+            if(!mustShift || !frame.isValid())
+                return;
 
-        /* if a partially filled last frame exists for the neighboring supercell,
-         * each master thread (one master per direction) will load it
-         */
-        forEachExchange(
-            [&](
-                uint32_t const linearIdx,
-                uint32_t const idx
-            )
-            {
-                destFramesCounter[ linearIdx ] = 0;
-                destFrames[ linearIdx ] = FramePtr();
-                destFrames[ linearIdx + numExchanges ] = FramePtr();
+            using ExchangeDomCfg = IdxConfig<numExchanges, numWorkers>;
+
+            memory::CtxArray<int32_t, ExchangeDomCfg> newParticleInFrame(0);
+
+            memory::CtxArray<DataSpace<dim>, ExchangeDomCfg> relativeCtx(
+                workerIdx,
+                [&](uint32_t const linearIdx, uint32_t const) -> DataSpace<dim> {
+                    return superCellIdx + Mask::getRelativeDirections<dim>(linearIdx + 1);
+                });
+
+            ForEachIdx<ExchangeDomCfg> forEachExchange(workerIdx);
+
+            /* if a partially filled last frame exists for the neighboring supercell,
+             * each master thread (one master per direction) will load it
+             */
+            forEachExchange([&](uint32_t const linearIdx, uint32_t const idx) {
+                destFramesCounter[linearIdx] = 0;
+                destFrames[linearIdx] = FramePtr();
+                destFrames[linearIdx + numExchanges] = FramePtr();
                 /* load last frame of neighboring supercell */
-                FramePtr tmpFrame( pb.getLastFrame( relativeCtx[ idx ] ) );
+                FramePtr tmpFrame(pb.getLastFrame(relativeCtx[idx]));
 
-                if ( tmpFrame.isValid() )
+                if(tmpFrame.isValid())
                 {
-                    int32_t const particlesInFrame = pb.getSuperCell( relativeCtx[ idx ] ).getSizeLastFrame( );
+                    int32_t const particlesInFrame = pb.getSuperCell(relativeCtx[idx]).getSizeLastFrame();
                     // do not use the neighbor's last frame if it is full
-                    if ( particlesInFrame < frameSize )
+                    if(particlesInFrame < frameSize)
                     {
-                        newParticleInFrame[ idx ] = -particlesInFrame;
-                        destFrames[ linearIdx ] = tmpFrame;
-                        destFramesCounter[ linearIdx ] = particlesInFrame;
+                        newParticleInFrame[idx] = -particlesInFrame;
+                        destFrames[linearIdx] = tmpFrame;
+                        destFramesCounter[linearIdx] = particlesInFrame;
                     }
                 }
-            }
-        );
+            });
 
-        __syncthreads( );
+            cupla::__syncthreads(acc);
 
-        /* iterate over the frame list of the current supercell */
-        while ( frame.isValid( ) )
-        {
-            using ParticleDomCfg = IdxConfig<
-                frameSize,
-                numWorkers
-            >;
-
-            ForEachIdx< ParticleDomCfg > forEachParticle( workerIdx );
-
-            memory::CtxArray<
-                lcellId_t,
-                ParticleDomCfg
-            > destParticleIdxCtx( INV_LOC_IDX );
-            memory::CtxArray<
-                int,
-                ParticleDomCfg
-            > directionCtx;
-
-            forEachParticle(
-                [&](
-                    uint32_t const linearIdx,
-                    uint32_t const idx
-                )
-                {
+            /* iterate over the frame list of the current supercell */
+            while(frame.isValid())
+            {
+                using ParticleDomCfg = IdxConfig<frameSize, numWorkers>;
+
+                ForEachIdx<ParticleDomCfg> forEachParticle(workerIdx);
+
+                memory::CtxArray<lcellId_t, ParticleDomCfg> destParticleIdxCtx(INV_LOC_IDX);
+                memory::CtxArray<int, ParticleDomCfg> directionCtx;
+
+                forEachParticle([&](uint32_t const linearIdx, uint32_t const idx) {
                     /* set to value to of multiMask to a value in range [-2, EXCHANGES - 1]
                      * -2 is no particle
                      * -1 is particle but it is not shifted (stays in supercell)
                      * >=0 particle moves in a certain direction
                      *     (@see ExchangeType in types.h)
                      */
-                    directionCtx[ idx ] = frame[ linearIdx ][ multiMask_ ] - 2;
-                    if ( directionCtx[ idx ] >= 0 )
+                    directionCtx[idx] = frame[linearIdx][multiMask_] - 2;
+                    if(directionCtx[idx] >= 0)
                     {
-                        destParticleIdxCtx[ idx ] = atomicAdd( &(destFramesCounter[ directionCtx[ idx ] ]), 1, ::alpaka::hierarchy::Threads{} );
+                        destParticleIdxCtx[idx] = cupla::atomicAdd(
+                            acc,
+                            &(destFramesCounter[directionCtx[idx]]),
+                            1,
+                            ::alpaka::hierarchy::Threads{});
                     }
-                }
-            );
-            __syncthreads( );
-
-            forEachExchange(
-                [&](
-                    uint32_t const linearIdx,
-                    uint32_t const idx
-                )
-                {
+                });
+                cupla::__syncthreads(acc);
+
+                forEachExchange([&](uint32_t const linearIdx, uint32_t const idx) {
                     /* If the master thread (responsible for a certain direction) did not
                      * obtain a `low frame` from the neighboring super cell before the loop,
                      * it will create one now.
@@ -724,41 +465,27 @@ struct KernelShiftParticles
                      * supercell fit into the `low frame`, a second frame is created to
                      * contain further particles, the `high frame` (default: invalid).
                      */
-                    if ( destFramesCounter[ linearIdx ] > 0 )
+                    if(destFramesCounter[linearIdx] > 0)
                     {
                         /* if we had no `low frame` we load a new empty one */
-                        if ( !destFrames[ linearIdx ].isValid( ) )
+                        if(!destFrames[linearIdx].isValid())
                         {
-                            FramePtr tmpFrame( pb.getEmptyFrame( ) );
-                            destFrames[ linearIdx ] = tmpFrame;
-                            pb.setAsLastFrame(
-                                acc,
-                                tmpFrame,
-                                relativeCtx[ idx ]
-                            );
+                            FramePtr tmpFrame(pb.getEmptyFrame(acc));
+                            destFrames[linearIdx] = tmpFrame;
+                            pb.setAsLastFrame(acc, tmpFrame, relativeCtx[idx]);
                         }
                         /* check if a `high frame` is needed */
-                        if ( destFramesCounter[ linearIdx ] > frameSize )
+                        if(destFramesCounter[linearIdx] > frameSize)
                         {
-                            FramePtr tmpFrame( pb.getEmptyFrame( ) );
-                            destFrames[ linearIdx + numExchanges ] = tmpFrame;
-                            pb.setAsLastFrame(
-                                acc,
-                                tmpFrame,
-                                relativeCtx[ idx ]
-                            );
+                            FramePtr tmpFrame(pb.getEmptyFrame(acc));
+                            destFrames[linearIdx + numExchanges] = tmpFrame;
+                            pb.setAsLastFrame(acc, tmpFrame, relativeCtx[idx]);
                         }
                     }
-                }
-            );
-            __syncthreads( );
-
-            forEachParticle(
-                [&](
-                    uint32_t const linearIdx,
-                    uint32_t const idx
-                )
-                {
+                });
+                cupla::__syncthreads(acc);
+
+                forEachParticle([&](uint32_t const linearIdx, uint32_t const idx) {
                     /* All threads with a valid index in the neighbor's frame, valid index
                      * range is [0, frameSize * 2-1], will copy their particle to the new
                      * frame.
@@ -766,588 +493,366 @@ struct KernelShiftParticles
                      * The default value for indexes (in the destination frame) is
                      * above this range (INV_LOC_IDX) for all particles that are not shifted.
                      */
-                    if ( destParticleIdxCtx[ idx ] < frameSize * 2 )
+                    if(destParticleIdxCtx[idx] < frameSize * 2)
                     {
-                        if ( destParticleIdxCtx[ idx ] >= frameSize )
+                        if(destParticleIdxCtx[idx] >= frameSize)
                         {
                             /* use `high frame` */
-                            directionCtx[ idx ] += numExchanges;
-                            destParticleIdxCtx[ idx ] -= frameSize;
+                            directionCtx[idx] += numExchanges;
+                            destParticleIdxCtx[idx] -= frameSize;
                         }
-                        auto dstParticle = destFrames[ directionCtx[ idx ] ][ destParticleIdxCtx[ idx ] ];
-                        auto srcParticle = frame[ linearIdx ];
-                        dstParticle[ multiMask_ ] = 1;
-                        srcParticle[ multiMask_ ] = 0;
-                        auto dstFilteredParticle =
-                            particles::operations::deselect< multiMask >( dstParticle );
-                        particles::operations::assign(
-                            dstFilteredParticle,
-                            srcParticle
-                        );
+                        auto dstParticle = destFrames[directionCtx[idx]][destParticleIdxCtx[idx]];
+                        auto srcParticle = frame[linearIdx];
+                        dstParticle[multiMask_] = 1;
+                        srcParticle[multiMask_] = 0;
+                        auto dstFilteredParticle = particles::operations::deselect<multiMask>(dstParticle);
+                        particles::operations::assign(dstFilteredParticle, srcParticle);
                     }
-                }
-            );
-            __syncthreads( );
-
-            forEachExchange(
-                [&](
-                    uint32_t const linearIdx,
-                    uint32_t const idx
-                )
-                {
+                });
+                cupla::__syncthreads(acc);
+
+                forEachExchange([&](uint32_t const linearIdx, uint32_t const idx) {
                     /* if the `low frame` is full, each master thread
                      * uses the `high frame` (is invalid, if still empty) as the next
                      * `low frame` for the following iteration of the loop
                      */
-                    if ( destFramesCounter[ linearIdx ] >= frameSize )
+                    if(destFramesCounter[linearIdx] >= frameSize)
                     {
-                        newParticleInFrame[ idx ] += frameSize;
-                        destFramesCounter[ linearIdx ] -= frameSize;
-                        destFrames[ linearIdx ] = destFrames[ linearIdx + numExchanges ];
-                        destFrames[ linearIdx + numExchanges ] = FramePtr( );
+                        newParticleInFrame[idx] += frameSize;
+                        destFramesCounter[linearIdx] -= frameSize;
+                        destFrames[linearIdx] = destFrames[linearIdx + numExchanges];
+                        destFrames[linearIdx + numExchanges] = FramePtr();
                     }
-                    if ( linearIdx == 0 )
+                    if(linearIdx == 0)
                     {
-                        frame = pb.getNextFrame( frame );
+                        frame = pb.getNextFrame(frame);
                     }
-                }
-            );
-            __syncthreads( );
-        }
+                });
+                cupla::__syncthreads(acc);
+            }
 
-        forEachExchange(
-            [&](
-                uint32_t const linearIdx,
-                uint32_t const idx
-            )
-            {
-                newParticleInFrame[ idx ] += destFramesCounter[ linearIdx ];
-                if( newParticleInFrame[ idx ] > 0 )
+            forEachExchange([&](uint32_t const linearIdx, uint32_t const idx) {
+                newParticleInFrame[idx] += destFramesCounter[linearIdx];
+                if(newParticleInFrame[idx] > 0)
                 {
                     /* Each master thread updates the number of particles
                      * for the neighbor frame. The number of particles in the neighbor
                      * frame must be correct because fill gaps is only called on the
                      * current used supercell.
                      */
-                    auto & superCell = pb.getSuperCell( relativeCtx[ idx ] );
-                    superCell.setNumParticles(
-                        superCell.getNumParticles() + newParticleInFrame[ idx ]
-                    );
+                    auto& superCell = pb.getSuperCell(relativeCtx[idx]);
+                    superCell.setNumParticles(superCell.getNumParticles() + newParticleInFrame[idx]);
                 }
-            }
-        );
-
-        // fill all gaps in the frame list of the supercell
-        KernelFillGaps< numWorkers >{ }(
-            acc,
-            pb,
-            mapper
-        );
-    }
-};
+            });
 
-/** deletes all particles within an AREA
- *
- * @tparam T_numWorkers number of workers
- */
-template< uint32_t T_numWorkers >
-struct KernelDeleteParticles
-{
-    /** deletes all particles
-     *
-     * @warning the particle memory of the particle is not byte-wise zeroed
-     *
-     * @tparam T_ParticleBox pmacc::ParticlesBox, particle box type
-     * @tparam T_Mapping mapper functor type
+            // fill all gaps in the frame list of the supercell
+            KernelFillGaps<numWorkers>{}(acc, pb, mapper);
+        }
+    };
+
+    /** deletes all particles within an AREA
      *
-     * @param pb particle memory
-     * @param mapper functor to map a block to a supercell
+     * @tparam T_numWorkers number of workers
      */
-    template<
-        typename T_ParticleBox,
-        typename T_Mapping,
-        typename T_Acc
-    >
-    DINLINE void operator()(
-        T_Acc const & acc,
-        T_ParticleBox pb,
-        T_Mapping const mapper
-    ) const
+    template<uint32_t T_numWorkers>
+    struct KernelDeleteParticles
     {
-        using namespace particles::operations;
-        using namespace mappings::threads;
-
-        using ParticleBox = T_ParticleBox;
-        using FrameType = typename ParticleBox::FrameType;
-        using FramePtr = typename ParticleBox::FramePtr;
-
-        constexpr uint32_t dim = T_Mapping::Dim;
-        constexpr uint32_t frameSize = math::CT::volume< typename FrameType::SuperCellSize >::type::value;
-        constexpr uint32_t numWorkers = T_numWorkers;
-
-        DataSpace< dim > const superCellIdx = mapper.getSuperCellIndex( DataSpace< dim >( blockIdx ) );
-        uint32_t const workerIdx = threadIdx.x;
-
-        PMACC_SMEM(
-            acc,
-            frame,
-            FramePtr
-        );
-
-        ForEachIdx<
-            IdxConfig<
-                1,
-                numWorkers
-            >
-        > onlyMaster{ workerIdx };
-
-        onlyMaster(
-            [&](
-                uint32_t const,
-                uint32_t const
-            )
-            {
-                frame = pb.getLastFrame( superCellIdx );
-            }
-        );
+        /** deletes all particles
+         *
+         * @warning the particle memory of the particle is not byte-wise zeroed
+         *
+         * @tparam T_ParticleBox pmacc::ParticlesBox, particle box type
+         * @tparam T_Mapping mapper functor type
+         *
+         * @param pb particle memory
+         * @param mapper functor to map a block to a supercell
+         */
+        template<typename T_ParticleBox, typename T_Mapping, typename T_Acc>
+        DINLINE void operator()(T_Acc const& acc, T_ParticleBox pb, T_Mapping const mapper) const
+        {
+            using namespace particles::operations;
+            using namespace mappings::threads;
 
-        __syncthreads( );
+            using ParticleBox = T_ParticleBox;
+            using FrameType = typename ParticleBox::FrameType;
+            using FramePtr = typename ParticleBox::FramePtr;
 
-        while( frame.isValid( ) )
-        {
-            using ParticleDomCfg = IdxConfig<
-                frameSize,
-                numWorkers
-            >;
-            // loop over all particles in the frame
-            ForEachIdx< ParticleDomCfg > forEachParticle( workerIdx );
+            constexpr uint32_t dim = T_Mapping::Dim;
+            constexpr uint32_t frameSize = math::CT::volume<typename FrameType::SuperCellSize>::type::value;
+            constexpr uint32_t numWorkers = T_numWorkers;
 
-            forEachParticle(
-                [&](
-                    uint32_t const linearIdx,
-                    uint32_t const
-                )
-                {
-                    auto particle = ( frame[ linearIdx ] );
-                    particle[ multiMask_ ] = 0; // delete particle
-                }
-            );
+            DataSpace<dim> const superCellIdx = mapper.getSuperCellIndex(DataSpace<dim>(cupla::blockIdx(acc)));
+            uint32_t const workerIdx = cupla::threadIdx(acc).x;
 
-            __syncthreads( );
+            PMACC_SMEM(acc, frame, FramePtr);
 
-            onlyMaster(
-                [&](
-                    uint32_t const,
-                    uint32_t const
-                )
-                {
-                    // always remove the last frame
-                    frame = getPreviousFrameAndRemoveLastFrame(
-                        frame,
-                        pb,
-                        superCellIdx
-                    );
-                }
-            );
-            __syncthreads( );
-        }
+            ForEachIdx<IdxConfig<1, numWorkers>> onlyMaster{workerIdx};
+
+            onlyMaster([&](uint32_t const, uint32_t const) { frame = pb.getLastFrame(superCellIdx); });
 
-        onlyMaster(
-            [&](
-                uint32_t const,
-                uint32_t const
-            )
+            cupla::__syncthreads(acc);
+
+            while(frame.isValid())
             {
-                // all frames and particles are removed
-                pb.getSuperCell( superCellIdx ).setNumParticles( 0 );
+                using ParticleDomCfg = IdxConfig<frameSize, numWorkers>;
+                // loop over all particles in the frame
+                ForEachIdx<ParticleDomCfg> forEachParticle(workerIdx);
+
+                forEachParticle([&](uint32_t const linearIdx, uint32_t const) {
+                    auto particle = (frame[linearIdx]);
+                    particle[multiMask_] = 0; // delete particle
+                });
+
+                cupla::__syncthreads(acc);
+
+                onlyMaster([&](uint32_t const, uint32_t const) {
+                    // always remove the last frame
+                    frame = getPreviousFrameAndRemoveLastFrame(acc, frame, pb, superCellIdx);
+                });
+                cupla::__syncthreads(acc);
             }
-        );
-    }
-};
 
-/** copy particles from the guard to an exchange buffer
- *
- * @warning This kernel resets the number of particles in the processed supercells even
- * if there are particles left in the supercell and does not guarantee that the last frame is
- * contiguous filled.
- * Call KernelFillGaps afterwards if you need a valid number of particles
- * and a contiguously filled last frame.
- *
- * @tparam T_numWorkers number of workers
- */
-template< uint32_t T_numWorkers >
-struct KernelCopyGuardToExchange
-{
-    /** copy guard particles to an exchange buffer
+            onlyMaster([&](uint32_t const, uint32_t const) {
+                // all frames and particles are removed
+                pb.getSuperCell(superCellIdx).setNumParticles(0);
+            });
+        }
+    };
+
+    /** copy particles from the guard to an exchange buffer
      *
-     * @tparam T_ParBox pmacc::ParticlesBox, particle box type
-     * @tparam T_ExchangeValueType frame type of the exchange buffer
-     * @tparam T_Mapping mapper functor type
+     * @warning This kernel resets the number of particles in the processed supercells even
+     * if there are particles left in the supercell and does not guarantee that the last frame is
+     * contiguously filled.
+     * Call KernelFillGaps afterwards if you need a valid number of particles
+     * and a contiguously filled last frame.
      *
-     * @param pb particle memory
-     * @param exchangeBox exchange buffer for particles
-     * @param mapper functor to map a block to a supercell
+     * @tparam T_numWorkers number of workers
      */
-    template<
-        typename T_ParBox,
-        typename T_ExchangeValueType,
-        typename T_Mapping,
-        typename T_Acc
-    >
-    DINLINE void operator()(
-        T_Acc const & acc,
-        T_ParBox pb,
-        ExchangePushDataBox<
-            vint_t,
-            T_ExchangeValueType,
-            T_Mapping::Dim - 1
-        > exchangeBox,
-        T_Mapping const mapper
-    ) const
+    template<uint32_t T_numWorkers>
+    struct KernelCopyGuardToExchange
     {
-        using namespace particles::operations;
-        using namespace mappings::threads;
-
-        PMACC_CONSTEXPR_CAPTURE uint32_t dim = T_Mapping::Dim;
-        constexpr uint32_t frameSize = math::CT::volume< typename T_ParBox::FrameType::SuperCellSize >::type::value;
-        constexpr uint32_t numWorkers = T_numWorkers;
-
-        using FramePtr = typename T_ParBox::FramePtr;
-
-        DataSpace< dim > const superCellIdx = mapper.getSuperCellIndex( DataSpace< dim >( blockIdx ) );
-        uint32_t const workerIdx = threadIdx.x;
-
-        // number of particles in the current handled frame
-        PMACC_SMEM(
-            acc,
-            numParticles,
-            int
-        );
-        PMACC_SMEM(
-            acc,
-            frame,
-            FramePtr
-        );
-
-        /* `exchangeChunk` is a view to a chunk of the memory in the exchange-
-         * The chunk contains between 0 and `numParticles` particles
-         * and is updated for each frame.
-         */
-        PMACC_SMEM(
-            acc,
-            exchangeChunk,
-            TileDataBox< T_ExchangeValueType >
-        );
-
-        /* flag: define if all particles from the current frame are copied to the
-         * exchange buffer
+        /** copy guard particles to an exchange buffer
+         *
+         * @tparam T_ParBox pmacc::ParticlesBox, particle box type
+         * @tparam T_ExchangeValueType frame type of the exchange buffer
+         * @tparam T_Mapping mapper functor type
          *
-         * `true` if all particles are copied, else `false`
+         * @param pb particle memory
+         * @param exchangeBox exchange buffer for particles
+         * @param mapper functor to map a block to a supercell
          */
-        PMACC_SMEM(
-            acc,
-            allParticlesCopied,
-            bool
-        );
-
-        ForEachIdx<
-            IdxConfig<
-                1,
-                numWorkers
-            >
-        > onlyMaster{ workerIdx };
-
-        onlyMaster(
-            [&](
-                uint32_t const,
-                uint32_t const
-            )
-            {
-                allParticlesCopied = true;
-                frame = pb.getLastFrame( superCellIdx );
-            }
-        );
+        template<typename T_ParBox, typename T_ExchangeValueType, typename T_Mapping, typename T_Acc>
+        DINLINE void operator()(
+            T_Acc const& acc,
+            T_ParBox pb,
+            ExchangePushDataBox<vint_t, T_ExchangeValueType, T_Mapping::Dim - 1> exchangeBox,
+            T_Mapping const mapper) const
+        {
+            using namespace particles::operations;
+            using namespace mappings::threads;
 
-        __syncthreads( );
+            PMACC_CONSTEXPR_CAPTURE uint32_t dim = T_Mapping::Dim;
+            constexpr uint32_t frameSize = math::CT::volume<typename T_ParBox::FrameType::SuperCellSize>::type::value;
+            constexpr uint32_t numWorkers = T_numWorkers;
 
-        while ( frame.isValid( ) && allParticlesCopied )
-        {
-            using ParticleDomCfg = IdxConfig<
-                frameSize,
-                numWorkers
-            >;
+            using FramePtr = typename T_ParBox::FramePtr;
+
+            DataSpace<dim> const superCellIdx = mapper.getSuperCellIndex(DataSpace<dim>(cupla::blockIdx(acc)));
+            uint32_t const workerIdx = cupla::threadIdx(acc).x;
 
-            /* the index of the gap in the exchange box where the particle
-             * is copied to
+            // number of particles in the current handled frame
+            PMACC_SMEM(acc, numParticles, int);
+            PMACC_SMEM(acc, frame, FramePtr);
+
+            /* `exchangeChunk` is a view to a chunk of the memory in the exchange buffer.
+             * The chunk contains between 0 and `numParticles` particles
+             * and is updated for each frame.
              */
-            memory::CtxArray<
-                lcellId_t,
-                ParticleDomCfg
-            >
-            exchangeGapIdxCtx( INV_LOC_IDX );
-
-            onlyMaster(
-                [&](
-                    uint32_t const,
-                    uint32_t const
-                )
-                {
-                    numParticles = 0;
-                }
-            );
+            PMACC_SMEM(acc, exchangeChunk, TileDataBox<T_ExchangeValueType>);
 
-            __syncthreads( );
+            /* flag: define if all particles from the current frame are copied to the
+             * exchange buffer
+             *
+             * `true` if all particles are copied, else `false`
+             */
+            PMACC_SMEM(acc, allParticlesCopied, bool);
 
-             // loop over all particles in the frame
-            ForEachIdx< ParticleDomCfg > forEachParticle( workerIdx );
+            ForEachIdx<IdxConfig<1, numWorkers>> onlyMaster{workerIdx};
 
-            forEachParticle(
-                [&](
-                    uint32_t const linearIdx,
-                    uint32_t const idx
-                )
-                {
-                    if ( frame[ linearIdx ][ multiMask_ ] == 1 )
-                    {
-                        exchangeGapIdxCtx[ idx ] = nvidia::atomicAllInc( acc, &numParticles, ::alpaka::hierarchy::Threads{} );
-                    }
-                }
-            );
-            __syncthreads( );
+            onlyMaster([&](uint32_t const, uint32_t const) {
+                allParticlesCopied = true;
+                frame = pb.getLastFrame(superCellIdx);
+            });
+
+            cupla::__syncthreads(acc);
 
-            if( numParticles > 0 )
+            while(frame.isValid() && allParticlesCopied)
             {
+                using ParticleDomCfg = IdxConfig<frameSize, numWorkers>;
 
-                onlyMaster(
-                    [&](
-                        uint32_t const,
-                        uint32_t const
-                    )
+                /* the index of the gap in the exchange box where the particle
+                 * is copied to
+                 */
+                memory::CtxArray<lcellId_t, ParticleDomCfg> exchangeGapIdxCtx(INV_LOC_IDX);
+
+                onlyMaster([&](uint32_t const, uint32_t const) { numParticles = 0; });
+
+                cupla::__syncthreads(acc);
+
+                // loop over all particles in the frame
+                ForEachIdx<ParticleDomCfg> forEachParticle(workerIdx);
+
+                forEachParticle([&](uint32_t const linearIdx, uint32_t const idx) {
+                    if(frame[linearIdx][multiMask_] == 1)
                     {
+                        exchangeGapIdxCtx[idx]
+                            = nvidia::atomicAllInc(acc, &numParticles, ::alpaka::hierarchy::Threads{});
+                    }
+                });
+                cupla::__syncthreads(acc);
+
+                if(numParticles > 0)
+                {
+                    onlyMaster([&](uint32_t const, uint32_t const) {
                         // try to get as many memory as particles in the current frame
                         exchangeChunk = exchangeBox.pushN(
                             acc,
                             numParticles,
                             // Compute the target supercell depending on the exchangeType
-                            DataSpaceOperations< dim >::reduce(
-                                superCellIdx,
-                                mapper.getExchangeType( )
-                            ),
-                            ::alpaka::hierarchy::Blocks{}
-                        );
-                        if( exchangeChunk.getSize( ) < numParticles )
+                            DataSpaceOperations<dim>::reduce(superCellIdx, mapper.getExchangeType()),
+                            ::alpaka::hierarchy::Blocks{});
+                        if(exchangeChunk.getSize() < numParticles)
                             allParticlesCopied = false;
-                    }
-                );
+                    });
 
-                __syncthreads( );
+                    cupla::__syncthreads(acc);
 
-                forEachParticle(
-                    [&](
-                        uint32_t const linearIdx,
-                        uint32_t const idx
-                    )
-                    {
-                        if( exchangeGapIdxCtx[ idx ] != INV_LOC_IDX && exchangeGapIdxCtx[ idx ] < exchangeChunk.getSize( ) )
+                    forEachParticle([&](uint32_t const linearIdx, uint32_t const idx) {
+                        if(exchangeGapIdxCtx[idx] != INV_LOC_IDX && exchangeGapIdxCtx[idx] < exchangeChunk.getSize())
                         {
-                            auto parDest = exchangeChunk[ exchangeGapIdxCtx[ idx ] ][ 0 ];
-                            auto parSrc = frame[ linearIdx ];
-                            assign( parDest, parSrc );
-                            parSrc[ multiMask_ ] = 0;
+                            auto parDest = exchangeChunk[exchangeGapIdxCtx[idx]][0];
+                            auto parSrc = frame[linearIdx];
+                            assign(parDest, parSrc);
+                            parSrc[multiMask_] = 0;
                         }
-                    }
-                );
-                __syncthreads( );
-            }
+                    });
+                    cupla::__syncthreads(acc);
+                }
 
-            onlyMaster(
-                [&](
-                    uint32_t const,
-                    uint32_t const
-                )
-                {
+                onlyMaster([&](uint32_t const, uint32_t const) {
                     /* do not remove the frame if we had not copied
                      * all particles from the current frame to the exchange buffer
                      */
-                    if ( allParticlesCopied )
-                        frame = getPreviousFrameAndRemoveLastFrame( frame, pb, superCellIdx );
-                }
-            );
+                    if(allParticlesCopied)
+                        frame = getPreviousFrameAndRemoveLastFrame(acc, frame, pb, superCellIdx);
+                });
 
-            __syncthreads( );
-        }
-        onlyMaster(
-            [&](
-                uint32_t const,
-                uint32_t const
-            )
-            {
+                cupla::__syncthreads(acc);
+            }
+            onlyMaster([&](uint32_t const, uint32_t const) {
                 /* Mark supercell as empty even if there are particles left.
                  * This kernel not depends on the correct number particles in the supercell.
                  */
-                pb.getSuperCell( superCellIdx ).setNumParticles( 0 );
-            }
-        );
-
-    }
-};
+                pb.getSuperCell(superCellIdx).setNumParticles(0);
+            });
+        }
+    };
 
-/** copy particles from exchange buffer into the border of the simulation
- *
- * @tparam T_numWorkers number of workers
- */
-template< uint32_t T_numWorkers >
-struct KernelInsertParticles
-{
     /** copy particles from exchange buffer into the border of the simulation
      *
-     * @tparam T_ParBox pmacc::ParticlesBox, particle box type
-     * @tparam T_ExchangeValueType frame type of the exchange buffer
-     * @tparam T_Mapping mapper functor type
-     *
-     * @param pb particle memory
-     * @param exchangeBox exchange box for particles
-     * @param mapper functor to map a block to a supercell
+     * @tparam T_numWorkers number of workers
      */
-    template<
-        typename T_ParBox,
-        typename T_ExchangeValueType,
-        typename T_Mapping,
-        typename T_Acc
-    >
-    DINLINE void operator()(
-        T_Acc const & acc,
-        T_ParBox pb,
-        ExchangePopDataBox<
-            vint_t,
-            T_ExchangeValueType,
-            T_Mapping::Dim - 1
-        > exchangeBox,
-        T_Mapping const mapper
-    ) const
+    template<uint32_t T_numWorkers>
+    struct KernelInsertParticles
     {
-        using namespace particles::operations;
-        using namespace mappings::threads;
-
-        PMACC_CONSTEXPR_CAPTURE uint32_t dim = T_Mapping::Dim;
-        constexpr uint32_t frameSize = math::CT::volume< typename T_ParBox::FrameType::SuperCellSize >::type::value;
-        constexpr uint32_t numWorkers = T_numWorkers;
-
-        uint32_t const workerIdx = threadIdx.x;
-
-        using FramePtr = typename T_ParBox::FramePtr;
-
-        PMACC_SMEM(
-            acc,
-            frame,
-            FramePtr
-        );
-        PMACC_SMEM(
-            acc,
-            elementCount,
-            int
-        );
-        PMACC_SMEM(
-            acc,
-            exchangeChunk,
-            TileDataBox< T_ExchangeValueType >
-        );
-
-        using MasterOnly = IdxConfig<
-            1,
-            numWorkers
-        >;
-
-        /* compressed index of the the supercell
-         * can be uncompressed with `DataSpaceOperations< >::extend()`
+        /** copy particles from exchange buffer into the border of the simulation
+         *
+         * @tparam T_ParBox pmacc::ParticlesBox, particle box type
+         * @tparam T_ExchangeValueType frame type of the exchange buffer
+         * @tparam T_Mapping mapper functor type
+         *
+         * @param pb particle memory
+         * @param exchangeBox exchange box for particles
+         * @param mapper functor to map a block to a supercell
          */
-        memory::CtxArray<
-            DataSpace< dim - 1 >,
-            MasterOnly
-        > compressedSuperCellIdxCtx{ };
-
-        ForEachIdx<
-            MasterOnly
-        > onlyMaster{ workerIdx };
-
-        onlyMaster(
-            [&](
-                uint32_t const,
-                uint32_t const idx
-            )
-            {
-                exchangeChunk = exchangeBox.get(
-                    blockIdx.x,
-                    compressedSuperCellIdxCtx[ idx ]
-                );
-                elementCount = exchangeChunk.getSize( );
-                if ( elementCount > 0 )
+        template<typename T_ParBox, typename T_ExchangeValueType, typename T_Mapping, typename T_Acc>
+        DINLINE void operator()(
+            T_Acc const& acc,
+            T_ParBox pb,
+            ExchangePopDataBox<vint_t, T_ExchangeValueType, T_Mapping::Dim - 1> exchangeBox,
+            T_Mapping const mapper) const
+        {
+            using namespace particles::operations;
+            using namespace mappings::threads;
+
+            PMACC_CONSTEXPR_CAPTURE uint32_t dim = T_Mapping::Dim;
+            constexpr uint32_t frameSize = math::CT::volume<typename T_ParBox::FrameType::SuperCellSize>::type::value;
+            constexpr uint32_t numWorkers = T_numWorkers;
+
+            uint32_t const workerIdx = cupla::threadIdx(acc).x;
+
+            using FramePtr = typename T_ParBox::FramePtr;
+
+            PMACC_SMEM(acc, frame, FramePtr);
+            PMACC_SMEM(acc, elementCount, int);
+            PMACC_SMEM(acc, exchangeChunk, TileDataBox<T_ExchangeValueType>);
+
+            using MasterOnly = IdxConfig<1, numWorkers>;
+
+            /* compressed index of the supercell
+             * can be uncompressed with `DataSpaceOperations< >::extend()`
+             */
+            memory::CtxArray<DataSpace<dim - 1>, MasterOnly> compressedSuperCellIdxCtx{};
+
+            ForEachIdx<MasterOnly> onlyMaster{workerIdx};
+
+            onlyMaster([&](uint32_t const, uint32_t const idx) {
+                exchangeChunk = exchangeBox.get(cupla::blockIdx(acc).x, compressedSuperCellIdxCtx[idx]);
+                elementCount = exchangeChunk.getSize();
+                if(elementCount > 0)
                 {
-                    frame = pb.getEmptyFrame( );
+                    frame = pb.getEmptyFrame(acc);
                 }
-            }
-        );
-
-        __syncthreads( );
-
-        // loop over all particles in the frame
-        ForEachIdx<
-            IdxConfig<
-                frameSize,
-                numWorkers
-            >
-        > forEachParticle{ workerIdx };
-
-        forEachParticle(
-            [&](
-                uint32_t const linearIdx,
-                uint32_t const
-            )
-            {
-                if( linearIdx < elementCount )
+            });
+
+            cupla::__syncthreads(acc);
+
+            // loop over all particles in the frame
+            ForEachIdx<IdxConfig<frameSize, numWorkers>> forEachParticle{workerIdx};
+
+            forEachParticle([&](uint32_t const linearIdx, uint32_t const) {
+                if(linearIdx < elementCount)
                 {
-                    auto parDestFull = frame[ linearIdx ];
-                    parDestFull[ multiMask_ ] = 1;
-                    auto parSrc = exchangeChunk[ linearIdx ][ 0 ];
+                    auto parDestFull = frame[linearIdx];
+                    parDestFull[multiMask_] = 1;
+                    auto parSrc = exchangeChunk[linearIdx][0];
                     /*we know that source has no multiMask*/
-                    auto parDest = deselect<multiMask>( parDestFull );
-                    assign( parDest, parSrc );
+                    auto parDest = deselect<multiMask>(parDestFull);
+                    assign(parDest, parSrc);
                 }
-            }
-        );
+            });
 
-        /** @bug This synchronize fixes a kernel crash in special cases,
-         * psychocoderHPC: I can't tell why.
-         */
-        __syncthreads( );
+            /** @bug This synchronize fixes a kernel crash in special cases,
+             * psychocoderHPC: I can't tell why.
+             */
+            cupla::__syncthreads(acc);
 
-        onlyMaster(
-            [&](
-                uint32_t const,
-                uint32_t const idx
-            )
-            {
-                if( elementCount > 0 )
+            onlyMaster([&](uint32_t const, uint32_t const idx) {
+                if(elementCount > 0)
                 {
                     // compute the super cell position in target frame to insert into
                     //! @todo: offset == simulation border should be passed to this func instead of being created here
-                    DataSpace< dim > dstSuperCell = DataSpaceOperations < dim - 1 > ::extend(
-                        compressedSuperCellIdxCtx[ idx ],
-                        mapper.getExchangeType( ),
-                        mapper.getGridSuperCells( ),
-                        mapper.getGuardingSuperCells( )
-                    );
-
-                    pb.setAsLastFrame(
-                        acc,
-                        frame,
-                        dstSuperCell
-                    );
-                }
-            }
-        );
+                    DataSpace<dim> dstSuperCell = DataSpaceOperations<dim - 1>::extend(
+                        compressedSuperCellIdxCtx[idx],
+                        mapper.getExchangeType(),
+                        mapper.getGridSuperCells(),
+                        mapper.getGuardingSuperCells());
 
-    }
-};
+                    pb.setAsLastFrame(acc, frame, dstSuperCell);
+                }
+            });
+        }
+    };
 
-} //namespace pmacc
+} // namespace pmacc
diff --git a/include/pmacc/particles/ParticlesBase.tpp b/include/pmacc/particles/ParticlesBase.tpp
index bf7c3fa7ba..fc32b20fcb 100644
--- a/include/pmacc/particles/ParticlesBase.tpp
+++ b/include/pmacc/particles/ParticlesBase.tpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera
+/* Copyright 2013-2021 Heiko Burau, Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -37,111 +37,82 @@ namespace pmacc
     template<typename T_ParticleDescription, class MappingDesc, typename T_DeviceHeap>
     void ParticlesBase<T_ParticleDescription, MappingDesc, T_DeviceHeap>::deleteGuardParticles(uint32_t exchangeType)
     {
-
         ExchangeMapping<GUARD, MappingDesc> mapper(this->cellDescription, exchangeType);
 
-        constexpr uint32_t numWorkers = traits::GetNumWorkers<
-            math::CT::volume< typename FrameType::SuperCellSize >::type::value
-        >::value;
-
-        PMACC_KERNEL( KernelDeleteParticles< numWorkers >{ } )(
-            mapper.getGridDim( ),
-            numWorkers
-        )(
-            particlesBuffer->getDeviceParticleBox( ),
-            mapper
-        );
+        constexpr uint32_t numWorkers
+            = traits::GetNumWorkers<math::CT::volume<typename FrameType::SuperCellSize>::type::value>::value;
+
+        PMACC_KERNEL(KernelDeleteParticles<numWorkers>{})
+        (mapper.getGridDim(), numWorkers)(particlesBuffer->getDeviceParticleBox(), mapper);
     }
 
     template<typename T_ParticleDescription, class MappingDesc, typename T_DeviceHeap>
     template<uint32_t T_area>
     void ParticlesBase<T_ParticleDescription, MappingDesc, T_DeviceHeap>::deleteParticlesInArea()
     {
-
         AreaMapping<T_area, MappingDesc> mapper(this->cellDescription);
 
-        constexpr uint32_t numWorkers = traits::GetNumWorkers<
-            math::CT::volume< typename FrameType::SuperCellSize >::type::value
-        >::value;
-
-        PMACC_KERNEL( KernelDeleteParticles< numWorkers >{ } )(
-            mapper.getGridDim( ),
-            numWorkers
-        )(
-            particlesBuffer->getDeviceParticleBox( ),
-            mapper
-        );
+        constexpr uint32_t numWorkers
+            = traits::GetNumWorkers<math::CT::volume<typename FrameType::SuperCellSize>::type::value>::value;
+
+        PMACC_KERNEL(KernelDeleteParticles<numWorkers>{})
+        (mapper.getGridDim(), numWorkers)(particlesBuffer->getDeviceParticleBox(), mapper);
     }
 
     template<typename T_ParticleDescription, class MappingDesc, typename T_DeviceHeap>
-    void ParticlesBase<T_ParticleDescription, MappingDesc, T_DeviceHeap>::reset(uint32_t )
+    void ParticlesBase<T_ParticleDescription, MappingDesc, T_DeviceHeap>::reset(uint32_t)
     {
-        deleteParticlesInArea<CORE+BORDER+GUARD>();
-        particlesBuffer->reset( );
+        deleteParticlesInArea<CORE + BORDER + GUARD>();
+        particlesBuffer->reset();
     }
 
     template<typename T_ParticleDescription, class MappingDesc, typename T_DeviceHeap>
-    void ParticlesBase<T_ParticleDescription, MappingDesc, T_DeviceHeap>::copyGuardToExchange( uint32_t exchangeType )
+    void ParticlesBase<T_ParticleDescription, MappingDesc, T_DeviceHeap>::copyGuardToExchange(uint32_t exchangeType)
     {
-        if( particlesBuffer->hasSendExchange( exchangeType ) )
+        if(particlesBuffer->hasSendExchange(exchangeType))
         {
-            ExchangeMapping<
-                GUARD,
-                MappingDesc
-            > mapper(
-                this->cellDescription,
-                exchangeType
-            );
-
-            particlesBuffer->getSendExchangeStack( exchangeType ).setCurrentSize( 0 );
-
-            constexpr uint32_t numWorkers = traits::GetNumWorkers<
-                math::CT::volume< typename FrameType::SuperCellSize >::type::value
-            >::value;
-
-            PMACC_KERNEL( KernelCopyGuardToExchange< numWorkers >{ } )(
-                mapper.getGridDim( ),
-                numWorkers
-            )(
-                particlesBuffer->getDeviceParticleBox( ),
-                particlesBuffer->getSendExchangeStack( exchangeType ).getDeviceExchangePushDataBox( ),
-                mapper
-            );
+            ExchangeMapping<GUARD, MappingDesc> mapper(this->cellDescription, exchangeType);
+
+            particlesBuffer->getSendExchangeStack(exchangeType).setCurrentSize(0);
+
+            constexpr uint32_t numWorkers
+                = traits::GetNumWorkers<math::CT::volume<typename FrameType::SuperCellSize>::type::value>::value;
+
+            PMACC_KERNEL(KernelCopyGuardToExchange<numWorkers>{})
+            (mapper.getGridDim(), numWorkers)(
+                particlesBuffer->getDeviceParticleBox(),
+                particlesBuffer->getSendExchangeStack(exchangeType).getDeviceExchangePushDataBox(),
+                mapper);
         }
     }
 
     template<typename T_ParticleDescription, class MappingDesc, typename T_DeviceHeap>
     void ParticlesBase<T_ParticleDescription, MappingDesc, T_DeviceHeap>::insertParticles(uint32_t exchangeType)
     {
-        if( particlesBuffer->hasReceiveExchange( exchangeType ) )
+        if(particlesBuffer->hasReceiveExchange(exchangeType))
         {
-            size_t grid( particlesBuffer->getReceiveExchangeStack( exchangeType ).getHostCurrentSize( ) );
-            if( grid != 0u )
+            size_t numParticles = 0u;
+            if(Environment<>::get().isMpiDirectEnabled())
+                numParticles = particlesBuffer->getReceiveExchangeStack(exchangeType).getDeviceCurrentSize();
+            else
+                numParticles = particlesBuffer->getReceiveExchangeStack(exchangeType).getHostCurrentSize();
+
+            if(numParticles != 0u)
             {
-                ExchangeMapping<
-                    GUARD,
-                    MappingDesc
-                > mapper(
-                    this->cellDescription,
-                    exchangeType
-                );
-
-                constexpr uint32_t numWorkers = traits::GetNumWorkers<
-                    math::CT::volume< typename FrameType::SuperCellSize >::type::value
-                >::value;
-
-                PMACC_KERNEL( KernelInsertParticles< numWorkers >{ } )(
-                    grid,
-                    numWorkers
-                )(
-                    particlesBuffer->getDeviceParticleBox( ),
-                    particlesBuffer->getReceiveExchangeStack( exchangeType ).getDeviceExchangePopDataBox( ),
-                    mapper
-                );
+                ExchangeMapping<GUARD, MappingDesc> mapper(this->cellDescription, exchangeType);
+
+                constexpr uint32_t numWorkers
+                    = traits::GetNumWorkers<math::CT::volume<typename FrameType::SuperCellSize>::type::value>::value;
+
+                PMACC_KERNEL(KernelInsertParticles<numWorkers>{})
+                (numParticles, numWorkers)(
+                    particlesBuffer->getDeviceParticleBox(),
+                    particlesBuffer->getReceiveExchangeStack(exchangeType).getDeviceExchangePopDataBox(),
+                    mapper);
             }
         }
     }
 
-} //namespace pmacc
+} // namespace pmacc
 
 #include "pmacc/particles/AsyncCommunicationImpl.hpp"
diff --git a/include/pmacc/particles/algorithm/CallForEach.hpp b/include/pmacc/particles/algorithm/CallForEach.hpp
index c2fbb9f497..4ea1155cc3 100644
--- a/include/pmacc/particles/algorithm/CallForEach.hpp
+++ b/include/pmacc/particles/algorithm/CallForEach.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2019-2020 Rene Widera
+/* Copyright 2019-2021 Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -30,56 +30,41 @@
 
 namespace pmacc
 {
-namespace particles
-{
-namespace algorithm
-{
-
-    /** Functor to execute an operation on all particles
-     *
-     * @tparam T_SpeciesOperator an operator to create the used species
-     *                           with the species type as ::type
-     * @tparam T_FunctorOperator an operator to create a particle functor
-     *                           with the functor type as ::type
-     */
-    template<
-        typename T_SpeciesOperator,
-        typename T_FunctorOperator
-    >
-    struct CallForEach
+    namespace particles
     {
-        /** Operate on the domain CORE and BORDER
-         *
-         * @param currentStep current simulation time step
-         */
-        HINLINE void
-        operator()( uint32_t const currentStep )
+        namespace algorithm
         {
-            using Species = typename T_SpeciesOperator::type;
-            using FrameType = typename Species::FrameType;
+            /** Functor to execute an operation on all particles
+             *
+             * @tparam T_SpeciesOperator an operator to create the used species
+             *                           with the species type as ::type
+             * @tparam T_FunctorOperator an operator to create a particle functor
+             *                           with the functor type as ::type
+             */
+            template<typename T_SpeciesOperator, typename T_FunctorOperator>
+            struct CallForEach
+            {
+                /** Operate on the domain CORE and BORDER
+                 *
+                 * @param currentStep current simulation time step
+                 */
+                HINLINE void operator()(uint32_t const currentStep)
+                {
+                    using Species = typename T_SpeciesOperator::type;
+                    using FrameType = typename Species::FrameType;
 
-            // be sure the species functor follows the pmacc functor interface
-            using UnaryFunctor = pmacc::functor::Interface<
-                typename T_FunctorOperator::type,
-                1u,
-                void
-            >;
+                    // be sure the species functor follows the pmacc functor interface
+                    using UnaryFunctor = pmacc::functor::Interface<typename T_FunctorOperator::type, 1u, void>;
 
-            DataConnector &dc = Environment<>::get().DataConnector();
-            auto species = dc.get< Species >(
-                FrameType::getName(),
-                true
-            );
+                    DataConnector& dc = Environment<>::get().DataConnector();
+                    auto species = dc.get<Species>(FrameType::getName(), true);
 
-            forEach(
-                *species,
-                UnaryFunctor( currentStep )
-            );
+                    forEach(*species, UnaryFunctor(currentStep));
 
-            dc.releaseData( FrameType::getName() );
-        }
-    };
+                    dc.releaseData(FrameType::getName());
+                }
+            };
 
-} // namespace algorithm
-} // namespace particles
+        } // namespace algorithm
+    } // namespace particles
 } // namespace pmacc
diff --git a/include/pmacc/particles/algorithm/ForEach.hpp b/include/pmacc/particles/algorithm/ForEach.hpp
index 68a0c1d284..764936252d 100644
--- a/include/pmacc/particles/algorithm/ForEach.hpp
+++ b/include/pmacc/particles/algorithm/ForEach.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2017-2020 Axel Huebl, Rene Widera
+/* Copyright 2017-2021 Axel Huebl, Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -33,164 +33,123 @@
 
 namespace pmacc
 {
-namespace particles
-{
-namespace algorithm
-{
-namespace acc
-{
-namespace detail
-{
-
-    /** operate on particles of a species
-     *
-     * @tparam T_numWorkers number of workers
-     */
-    template< uint32_t T_numWorkers >
-    struct ForEachParticle
+    namespace particles
     {
-        /** operate on particles
-         *
-         * @tparam T_Acc alpaka accelerator type
-         * @tparam T_Functor type of the functor to operate on a particle
-         * @tparam T_Mapping mapping functor type
-         * @tparam T_ParBox pmacc::ParticlesBox, type of the species box
-         *
-         * @param acc alpaka accelerator
-         * @param functor functor to operate on a particle
-         *                must fulfill the interface pmacc::functor::Interface<F, 1u, void>
-         * @param mapper functor to map a block to a supercell
-         * @param pb particles species box
-         */
-        template<
-            typename T_Acc,
-            typename T_Functor,
-            typename T_Mapping,
-            typename T_ParBox
-        >
-        DINLINE void operator()(
-            T_Acc const & acc,
-            T_Functor functor,
-            T_Mapping const mapper,
-            T_ParBox pb
-        ) const
+        namespace algorithm
         {
-            using namespace mappings::threads;
-
-            using SuperCellSize = typename T_ParBox::FrameType::SuperCellSize;
-            constexpr uint32_t dim = SuperCellSize::dim;
-            constexpr uint32_t frameSize = pmacc::math::CT::volume< SuperCellSize >::type::value;
-            constexpr uint32_t numWorkers = T_numWorkers;
-
-            uint32_t const workerIdx = threadIdx.x;
-
-            DataSpace< dim > const superCellIdx(
-                mapper.getSuperCellIndex( DataSpace< dim >( blockIdx ) )
-            );
-
-            auto const & superCell = pb.getSuperCell( superCellIdx );
-            uint32_t const numPartcilesInSupercell = superCell.getNumParticles();
-
-
-            // end kernel if we have no particles
-            if( numPartcilesInSupercell == 0 )
-                return;
-
-            using FramePtr = typename T_ParBox::FramePtr;
-            FramePtr frame = pb.getFirstFrame( superCellIdx );
+            namespace acc
+            {
+                namespace detail
+                {
+                    /** operate on particles of a species
+                     *
+                     * @tparam T_numWorkers number of workers
+                     */
+                    template<uint32_t T_numWorkers>
+                    struct ForEachParticle
+                    {
+                        /** operate on particles
+                         *
+                         * @tparam T_Acc alpaka accelerator type
+                         * @tparam T_Functor type of the functor to operate on a particle
+                         * @tparam T_Mapping mapping functor type
+                         * @tparam T_ParBox pmacc::ParticlesBox, type of the species box
+                         *
+                         * @param acc alpaka accelerator
+                         * @param functor functor to operate on a particle
+                         *                must fulfill the interface pmacc::functor::Interface<F, 1u, void>
+                         * @param mapper functor to map a block to a supercell
+                         * @param pb particles species box
+                         */
+                        template<typename T_Acc, typename T_Functor, typename T_Mapping, typename T_ParBox>
+                        DINLINE void operator()(
+                            T_Acc const& acc,
+                            T_Functor functor,
+                            T_Mapping const mapper,
+                            T_ParBox pb) const
+                        {
+                            using namespace mappings::threads;
+
+                            using SuperCellSize = typename T_ParBox::FrameType::SuperCellSize;
+                            constexpr uint32_t dim = SuperCellSize::dim;
+                            constexpr uint32_t frameSize = pmacc::math::CT::volume<SuperCellSize>::type::value;
+                            constexpr uint32_t numWorkers = T_numWorkers;
+
+                            uint32_t const workerIdx = cupla::threadIdx(acc).x;
+
+                            DataSpace<dim> const superCellIdx(
+                                mapper.getSuperCellIndex(DataSpace<dim>(cupla::blockIdx(acc))));
+
+                            auto const& superCell = pb.getSuperCell(superCellIdx);
+                            uint32_t const numPartcilesInSupercell = superCell.getNumParticles();
+
+
+                            // end kernel if we have no particles
+                            if(numPartcilesInSupercell == 0)
+                                return;
+
+                            using FramePtr = typename T_ParBox::FramePtr;
+                            FramePtr frame = pb.getFirstFrame(superCellIdx);
+
+                            // offset of the superCell (in cells, without any guards) to the origin of the local domain
+                            DataSpace<dim> const localSuperCellOffset = superCellIdx - mapper.getGuardingSuperCells();
+
+                            auto accFunctor = functor(acc, localSuperCellOffset, WorkerCfg<T_numWorkers>{workerIdx});
+
+                            for(uint32_t parOffset = 0; parOffset < numPartcilesInSupercell; parOffset += frameSize)
+                            {
+                                using ParticleDomCfg = IdxConfig<frameSize, numWorkers>;
+
+                                // loop over all particles in the frame
+                                ForEachIdx<ParticleDomCfg>{workerIdx}([&](uint32_t const linearIdx, uint32_t const) {
+                                    // particle index within the supercell
+                                    uint32_t parIdx = parOffset + linearIdx;
+                                    auto particle = frame[linearIdx];
+
+                                    bool const isPar = parIdx < numPartcilesInSupercell;
+                                    if(isPar)
+                                        accFunctor(acc, particle);
+                                });
+
+                                frame = pb.getNextFrame(frame);
+                            }
+                        }
+                    };
+
+                } // namespace detail
+            } // namespace acc
+
+            /** Run a unary functor for each particle of a species
+             *
+             * @warning Does NOT fill gaps automatically! If the
+             *          operation deactivates particles or creates "gaps" in any
+             *          other way, CallFillAllGaps needs to be called for the
+             *          species manually afterwards!
+             *
+             * Operates on the domain CORE and BORDER
+             *
+             * @tparam T_Species type of the species
+             * @tparam T_Functor unary particle functor type which follows the interface of
+             *                   pmacc::functor::Interface<F, 1u, void>
+             *
+             * @param species species to operate on
+             * @param functor operation which is applied to each particle of the species
+             */
+            template<typename T_Species, typename T_Functor>
+            void forEach(T_Species&& species, T_Functor functor)
+            {
+                using MappingDesc = decltype(species.getCellDescription());
+                AreaMapping<CORE + BORDER, MappingDesc> mapper(species.getCellDescription());
 
-            // offset of the superCell (in cells, without any guards) to the origin of the local domain
-            DataSpace< dim > const localSuperCellOffset =
-                superCellIdx - mapper.getGuardingSuperCells( );
+                using SuperCellSize = typename MappingDesc::SuperCellSize;
 
-            auto accFunctor = functor(
-                acc,
-                localSuperCellOffset,
-                WorkerCfg< T_numWorkers >{ workerIdx }
-            );
+                constexpr uint32_t numWorkers
+                    = pmacc::traits::GetNumWorkers<pmacc::math::CT::volume<SuperCellSize>::type::value>::value;
 
-            for( uint32_t parOffset = 0; parOffset < numPartcilesInSupercell; parOffset += frameSize)
-            {
-                using ParticleDomCfg = IdxConfig<
-                    frameSize,
-                    numWorkers
-                >;
-
-                // loop over all particles in the frame
-                ForEachIdx< ParticleDomCfg >{ workerIdx }(
-                    [&](
-                        uint32_t const linearIdx,
-                        uint32_t const
-                    )
-                    {
-                        // particle index within the supercell
-                        uint32_t parIdx = parOffset + linearIdx;
-                        auto particle = frame[ linearIdx ];
-
-                        bool const isPar = parIdx < numPartcilesInSupercell;
-                        if( isPar )
-                            accFunctor(
-                                acc,
-                                particle
-                            );
-                    }
-                );
-
-                frame = pb.getNextFrame( frame );
+                PMACC_KERNEL(acc::detail::ForEachParticle<numWorkers>{})
+                (mapper.getGridDim(), numWorkers)(std::move(functor), mapper, species.getDeviceParticlesBox());
             }
-        }
-    };
-
-} //namespace detail
-} //namespace acc
-
-    /** Run a unary functor for each particle of a species
-     *
-     * @warning Does NOT fill gaps automatically! If the
-     *          operation deactivates particles or creates "gaps" in any
-     *          other way, CallFillAllGaps needs to be called for the
-     *          species manually afterwards!
-     *
-     * Operates on the domain CORE and BORDER
-     *
-     * @tparam T_Species type of the species
-     * @tparam T_Functor unary particle functor type which follows the interface of
-     *                   pmacc::functor::Interface<F, 1u, void>
-     *
-     * @param species species to operate on
-     * @param functor operation which is applied to each particle of the species
-     */
-    template<
-        typename T_Species,
-        typename T_Functor
-    >
-    void forEach(
-        T_Species && species,
-        T_Functor functor
-    )
-    {
-        using MappingDesc = decltype(species.getCellDescription());
-        AreaMapping<
-            CORE + BORDER,
-            MappingDesc
-        > mapper( species.getCellDescription() );
-
-        using SuperCellSize = typename MappingDesc::SuperCellSize;
-
-        constexpr uint32_t numWorkers = pmacc::traits::GetNumWorkers<
-            pmacc::math::CT::volume< SuperCellSize >::type::value
-        >::value;
-
-        PMACC_KERNEL( acc::detail::ForEachParticle< numWorkers >{ } )(
-            mapper.getGridDim(),
-            numWorkers
-        )(
-            std::move(functor),
-            mapper,
-            species.getDeviceParticlesBox( )
-        );
-    }
-
-} // namespace algorithm
-} // namespace particles
+
+        } // namespace algorithm
+    } // namespace particles
 } // namespace pmacc
diff --git a/include/pmacc/particles/boostExtension/InheritGenerators.hpp b/include/pmacc/particles/boostExtension/InheritGenerators.hpp
index c2f171ce92..0fe3e627a0 100644
--- a/include/pmacc/particles/boostExtension/InheritGenerators.hpp
+++ b/include/pmacc/particles/boostExtension/InheritGenerators.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Rene Widera, Benjamin Worpitz
+/* Copyright 2013-2021 Rene Widera, Benjamin Worpitz
  *
  * This file is part of PMacc.
  *
@@ -37,68 +37,65 @@
 
 namespace pmacc
 {
-
-template <class list_>
-struct LinearInherit;
-
-template <class Base1, class Base2>
-class LinearInheritFork : public Base1, public Base2
-{
-};
-
-
-/** Rule if head is a class without Base template parameter
- *
- * Create a fork and inherit from head and combined classes from Vec
- */
-template <class Head, class Vec,bool isVectorEmpty=bmpl::empty<Vec>::value>
-struct TypelistLinearInherit;
-
-template <class Head, class Vec>
-struct TypelistLinearInherit<Head,Vec,false>
-{
-    typedef LinearInheritFork<Head, typename LinearInherit<Vec>::type > type;
-};
-
-
-
-/** Rule if head is a class which can inherit from other class
- */
-template < template<class> class Head, class Vec>
-struct TypelistLinearInherit<Head<pmacc::NullFrame>, Vec ,false>
-{
-    typedef Head<typename LinearInherit<Vec>::type > type;
-};
-
-
-/** Rule if Vec is empty but Head is valid
- *
- * This is the recursive end rule
- */
-template <class Head,class Vec>
-struct TypelistLinearInherit<Head, Vec ,true>
-{
-    typedef Head type;
-};
-
-
-
-/** Create a data structure which inherit linearly
- * \tparam vec_ boost mpl vector with classes
- *
- * class A<pmacc::NullFrame>;
- * LinearInherit<mpl::vector<A<>,B> >::type return
- *
- * typedef A<B> type;
- */
-template <typename vec_>
-struct LinearInherit
-{
-    typedef typename TypelistLinearInherit <
-        typename bmpl::front<vec_>::type,
-        typename bmpl::pop_front<vec_>::type >::type type;
-};
-
-}
-
-
+    template<class list_>
+    struct LinearInherit;
+
+    template<class Base1, class Base2>
+    class LinearInheritFork
+        : public Base1
+        , public Base2
+    {
+    };
+
+
+    /** Rule if head is a class without Base template parameter
+     *
+     * Create a fork and inherit from head and combined classes from Vec
+     */
+    template<class Head, class Vec, bool isVectorEmpty = bmpl::empty<Vec>::value>
+    struct TypelistLinearInherit;
+
+    template<class Head, class Vec>
+    struct TypelistLinearInherit<Head, Vec, false>
+    {
+        typedef LinearInheritFork<Head, typename LinearInherit<Vec>::type> type;
+    };
+
+
+    /** Rule if head is a class which can inherit from other class
+     */
+    template<template<class> class Head, class Vec>
+    struct TypelistLinearInherit<Head<pmacc::NullFrame>, Vec, false>
+    {
+        typedef Head<typename LinearInherit<Vec>::type> type;
+    };
+
+
+    /** Rule if Vec is empty but Head is valid
+     *
+     * This is the recursive end rule
+     */
+    template<class Head, class Vec>
+    struct TypelistLinearInherit<Head, Vec, true>
+    {
+        typedef Head type;
+    };
+
+
+    /** Create a data structure which inherit linearly
+     * \tparam vec_ boost mpl vector with classes
+     *
+     * class A<pmacc::NullFrame>;
+     * LinearInherit<mpl::vector<A<>,B> >::type return
+     *
+     * typedef A<B> type;
+     */
+    template<typename vec_>
+    struct LinearInherit
+    {
+        typedef
+            typename TypelistLinearInherit<typename bmpl::front<vec_>::type, typename bmpl::pop_front<vec_>::type>::
+                type type;
+    };
+
+} // namespace pmacc
diff --git a/include/pmacc/particles/boostExtension/InheritLinearly.hpp b/include/pmacc/particles/boostExtension/InheritLinearly.hpp
index feeab93ab6..bb7485ddda 100644
--- a/include/pmacc/particles/boostExtension/InheritLinearly.hpp
+++ b/include/pmacc/particles/boostExtension/InheritLinearly.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Rene Widera
+/* Copyright 2013-2021 Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -31,43 +31,27 @@
 
 namespace pmacc
 {
-namespace detail
-{
-
-    /** get combined type which inherit from a boost mpl sequence
-     *
-     * @tparam T_Sequence boost mpl sequence with classes
-     * @tparam T_Accessor unary operator to transform each element of the sequence
-     */
-    template<
-        typename T_Sequence,
-        template< typename > class T_Accessor = meta::accessors::Identity
-    >
-    using InheritLinearly =
-        typename bmpl::inherit_linearly<
-            T_Sequence,
-            bmpl::inherit<
-                bmpl::_1,
-                T_Accessor< bmpl::_2 >
-            >
-        >::type;
+    namespace detail
+    {
+        /** get combined type which inherit from a boost mpl sequence
+         *
+         * @tparam T_Sequence boost mpl sequence with classes
+         * @tparam T_Accessor unary operator to transform each element of the sequence
+         */
+        template<typename T_Sequence, template<typename> class T_Accessor = meta::accessors::Identity>
+        using InheritLinearly =
+            typename bmpl::inherit_linearly<T_Sequence, bmpl::inherit<bmpl::_1, T_Accessor<bmpl::_2>>>::type;
 
-} //namespace detail
+    } // namespace detail
 
     /** type which inherits from multiple classes
      *
      * @tparam T_Sequence boost mpl sequence with classes
      * @tparam T_Accessor unary operator to transform each element of the sequence
      */
-    template<
-        typename T_Sequence,
-        template< typename > class T_Accessor = meta::accessors::Identity
-    >
-    struct InheritLinearly : detail::InheritLinearly<
-        T_Sequence,
-        T_Accessor
-    >
+    template<typename T_Sequence, template<typename> class T_Accessor = meta::accessors::Identity>
+    struct InheritLinearly : detail::InheritLinearly<T_Sequence, T_Accessor>
     {
     };
 
-} //namespace pmacc
+} // namespace pmacc
diff --git a/include/pmacc/particles/frame_types.hpp b/include/pmacc/particles/frame_types.hpp
index c0254388bd..c2d994a420 100644
--- a/include/pmacc/particles/frame_types.hpp
+++ b/include/pmacc/particles/frame_types.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Felix Schmitt, Rene Widera, Benjamin Worpitz
+/* Copyright 2013-2021 Felix Schmitt, Rene Widera, Benjamin Worpitz
  *
  * This file is part of PMacc.
  *
@@ -23,10 +23,10 @@
 
 #include "pmacc/types.hpp"
 
-//define which index means that the index is invalid
+// define which index means that the index is invalid
 #define INV_IDX 0xFFFFFFFF
 
-//define which index means that a local cell index is invalid
+// define which index means that a local cell index is invalid
 #define INV_LOC_IDX 0xFFFF
 
 namespace pmacc
@@ -45,5 +45,10 @@ namespace pmacc
     /**
      * Describes type of a frame (core, border)
      */
-    enum FrameType { CORE_FRAME = 0u, BORDER_FRAME =1u , BIG_FRAME=2u};
-}
+    enum FrameType
+    {
+        CORE_FRAME = 0u,
+        BORDER_FRAME = 1u,
+        BIG_FRAME = 2u
+    };
+} // namespace pmacc
diff --git a/include/pmacc/particles/memory/boxes/ExchangePopDataBox.hpp b/include/pmacc/particles/memory/boxes/ExchangePopDataBox.hpp
index c8dbf9e2c1..869bd02cb0 100644
--- a/include/pmacc/particles/memory/boxes/ExchangePopDataBox.hpp
+++ b/include/pmacc/particles/memory/boxes/ExchangePopDataBox.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera, Benjamin Worpitz
+/* Copyright 2013-2021 Heiko Burau, Rene Widera, Benjamin Worpitz
  *
  * This file is part of PMacc.
  *
@@ -29,36 +29,31 @@
 
 namespace pmacc
 {
-
-
-template<class TYPE, class VALUE, unsigned DIM>
-class ExchangePopDataBox : public DataBox<PitchedBox<VALUE, DIM1> >
-{
-public:
-    typedef ExchangeMemoryIndex<TYPE, DIM> PopType;
-
-    HDINLINE ExchangePopDataBox(DataBox<PitchedBox<VALUE, DIM1> > data,
-                                DataBox<PitchedBox<PopType, DIM1> > virtualMemory
-                               ) :
-                                  DataBox<PitchedBox<VALUE, DIM1> >(data),
-                                  virtualMemory(virtualMemory)
-    {
-
-    }
-
-    HDINLINE
-    TileDataBox<VALUE> get(TYPE idx, DataSpace<DIM> &superCell)
+    template<class TYPE, class VALUE, unsigned DIM>
+    class ExchangePopDataBox : public DataBox<PitchedBox<VALUE, DIM1>>
     {
-        PopType tmp = virtualMemory[idx];
-
-        superCell = tmp.getSuperCell();
-        return TileDataBox<VALUE > (this->fixedPointer,
-                                    DataSpace<DIM1 > (tmp.getStartIndex()),
-                                    tmp.getCount());
-    }
-
-protected:
-    PMACC_ALIGN8(virtualMemory, DataBox<PitchedBox<PopType, DIM1> >);
-};
-
-}
+    public:
+        typedef ExchangeMemoryIndex<TYPE, DIM> PopType;
+
+        HDINLINE ExchangePopDataBox(
+            DataBox<PitchedBox<VALUE, DIM1>> data,
+            DataBox<PitchedBox<PopType, DIM1>> virtualMemory)
+            : DataBox<PitchedBox<VALUE, DIM1>>(data)
+            , virtualMemory(virtualMemory)
+        {
+        }
+
+        HDINLINE
+        TileDataBox<VALUE> get(TYPE idx, DataSpace<DIM>& superCell)
+        {
+            PopType tmp = virtualMemory[idx];
+
+            superCell = tmp.getSuperCell();
+            return TileDataBox<VALUE>(this->fixedPointer, DataSpace<DIM1>(tmp.getStartIndex()), tmp.getCount());
+        }
+
+    protected:
+        PMACC_ALIGN8(virtualMemory, DataBox<PitchedBox<PopType, DIM1>>);
+    };
+
+} // namespace pmacc
diff --git a/include/pmacc/particles/memory/boxes/ExchangePushDataBox.hpp b/include/pmacc/particles/memory/boxes/ExchangePushDataBox.hpp
index 9160d271ab..6091d5bd7a 100644
--- a/include/pmacc/particles/memory/boxes/ExchangePushDataBox.hpp
+++ b/include/pmacc/particles/memory/boxes/ExchangePushDataBox.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera, Benjamin Worpitz
+/* Copyright 2013-2021 Heiko Burau, Rene Widera, Benjamin Worpitz
  *
  * This file is part of PMacc.
  *
@@ -31,82 +31,76 @@
 
 namespace pmacc
 {
-
-
-/**
- * @tparam TYPE type for addresses
- * @tparam VALUE type for actual data
- * @tparam DIM dimension
- */
-template<class TYPE, class VALUE, unsigned DIM>
-class ExchangePushDataBox : public DataBox<PitchedBox<VALUE, DIM1> >
-{
-public:
-
-    typedef ExchangeMemoryIndex<TYPE, DIM> PushType;
-
-    HDINLINE ExchangePushDataBox(VALUE *data, TYPE *currentSizePointer, TYPE maxSize,
-                                PushDataBox<TYPE, PushType > virtualMemory) :
-    DataBox<PitchedBox<VALUE, DIM1> >(PitchedBox<VALUE, DIM1>(data, DataSpace<DIM1>())),
-    currentSizePointer(currentSizePointer),
-    maxSize(maxSize),
-    virtualMemory(virtualMemory)
-    {
-    }
-
-    /** give access to push N elements into the memory
-     *
-     * The method is threadsave within the given alpaka hierarchy.
-     *
-     * @tparam T_Acc type of the alpaka accelerator
-     * @tparam T_Hierarchy alpaka::hierarchy type of the hierarchy
-     *
-     * @param acc alpaka accelerator
-     * @param count number of elements to increase stack with
-     * @param superCell offset of the supercell relative to the local domain
-     * @param hierarchy alpaka parallelism hierarchy levels guarantee valid
-     *                  concurrency access to the memory
-     *
-     * @return a TileDataBox of size count pointing to the new stack elements
+    /**
+     * @tparam TYPE type for addresses
+     * @tparam VALUE type for actual data
+     * @tparam DIM dimension
      */
-    template< typename T_Acc, typename T_Hierarchy >
-    HDINLINE TileDataBox<VALUE> pushN(
-        T_Acc const & acc,
-        TYPE count,
-        DataSpace<DIM> const &superCell,
-        T_Hierarchy const & hierarchy
-    )
+    template<class TYPE, class VALUE, unsigned DIM>
+    class ExchangePushDataBox : public DataBox<PitchedBox<VALUE, DIM1>>
     {
-        TYPE oldSize = atomicAdd(currentSizePointer, count, hierarchy); //get count VALUEs
+    public:
+        typedef ExchangeMemoryIndex<TYPE, DIM> PushType;
 
-        if (oldSize + count > maxSize)
+        HDINLINE ExchangePushDataBox(
+            VALUE* data,
+            TYPE* currentSizePointer,
+            TYPE maxSize,
+            PushDataBox<TYPE, PushType> virtualMemory)
+            : DataBox<PitchedBox<VALUE, DIM1>>(PitchedBox<VALUE, DIM1>(data, DataSpace<DIM1>()))
+            , currentSizePointer(currentSizePointer)
+            , maxSize(maxSize)
+            , virtualMemory(virtualMemory)
         {
-            atomicExch(currentSizePointer, maxSize, hierarchy); //reset size to maxsize
-            if (oldSize >= maxSize)
-            {
-                return TileDataBox<VALUE > (nullptr,
-                                            DataSpace<DIM1 > (0),
-                                            0);
-            }
-            else
-                count = maxSize - oldSize;
         }
 
-        TileDataBox<PushType> tmp = virtualMemory.pushN(acc, 1, hierarchy);
-        tmp[0].setSuperCell(superCell);
-        tmp[0].setCount(count);
-        tmp[0].setStartIndex(oldSize);
-        return TileDataBox<VALUE > (this->fixedPointer,
-                                    DataSpace<DIM1 > (oldSize),
-                                    count);
-    }
+        /** Reserve space for up to `count` elements and record the reservation as one entry in virtualMemory.
+         *
+         * The method is thread-safe within the given alpaka hierarchy.
+         *
+         * @tparam T_Acc type of the alpaka accelerator
+         * @tparam T_Hierarchy alpaka::hierarchy type of the hierarchy
+         *
+         * @param acc alpaka accelerator
+         * @param count number of elements to increase stack with
+         * @param superCell offset of the supercell relative to the local domain
+         * @param hierarchy alpaka parallelism hierarchy levels guarantee valid
+         *                  concurrency access to the memory
+         * @return a TileDataBox pointing to the new stack elements; its size is `count`, clipped to the
+         *         space left below maxSize, or an empty box (nullptr, size 0) if the stack was already full
+         */
+        template<typename T_Acc, typename T_Hierarchy>
+        HDINLINE TileDataBox<VALUE> pushN(
+            T_Acc const& acc,
+            TYPE count,
+            DataSpace<DIM> const& superCell,
+            T_Hierarchy const& hierarchy)
+        {
+            TYPE oldSize = cupla::atomicAdd(acc, currentSizePointer, count, hierarchy); // reserve count VALUEs
 
+            if(oldSize + count > maxSize)
+            {
+                cupla::atomicExch(acc, currentSizePointer, maxSize, hierarchy); // overflow: clamp size to maxSize
+                if(oldSize >= maxSize)
+                {
+                    return TileDataBox<VALUE>(nullptr, DataSpace<DIM1>(0), 0);
+                }
+                else
+                    count = maxSize - oldSize;
+            }
+
+            TileDataBox<PushType> tmp = virtualMemory.pushN(acc, 1, hierarchy);
+            tmp[0].setSuperCell(superCell);
+            tmp[0].setCount(count);
+            tmp[0].setStartIndex(oldSize);
+            return TileDataBox<VALUE>(this->fixedPointer, DataSpace<DIM1>(oldSize), count);
+        }
 
 
-protected:
-    PMACC_ALIGN8(virtualMemory, PushDataBox<TYPE, PushType >);
-    PMACC_ALIGN(maxSize, TYPE);
-    PMACC_ALIGN(currentSizePointer, TYPE*);
-};
+    protected:
+        PMACC_ALIGN8(virtualMemory, PushDataBox<TYPE, PushType>);
+        PMACC_ALIGN(maxSize, TYPE);
+        PMACC_ALIGN(currentSizePointer, TYPE*);
+    };
 
-}
+} // namespace pmacc
diff --git a/include/pmacc/particles/memory/boxes/ParticlesBox.hpp b/include/pmacc/particles/memory/boxes/ParticlesBox.hpp
index 24532ffa6f..d6630e606e 100644
--- a/include/pmacc/particles/memory/boxes/ParticlesBox.hpp
+++ b/include/pmacc/particles/memory/boxes/ParticlesBox.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Felix Schmitt, Heiko Burau, Rene Widera,
+/* Copyright 2013-2021 Felix Schmitt, Heiko Burau, Rene Widera,
  *                     Alexander Grund
  *
  * This file is part of PMacc.
@@ -22,8 +22,8 @@
 
 #pragma once
 
-#if( PMACC_CUDA_ENABLED == 1 )
-#   include <mallocMC/mallocMC.hpp>
+#if(BOOST_LANG_CUDA || BOOST_COMP_HIP)
+#    include <mallocMC/mallocMC.hpp>
 #endif
 #include "pmacc/particles/frame_types.hpp"
 #include "pmacc/dimensions/DataSpace.hpp"
@@ -33,300 +33,283 @@
 
 namespace pmacc
 {
-
-/**
- * A DIM-dimensional Box holding frames with particle data.
- *
- * @tparam FRAME datatype for frames
- * @tparam DIM dimension of data (1-3)
- */
-template<class T_Frame, typename T_DeviceHeapHandle, unsigned DIM>
-class ParticlesBox : protected DataBox<PitchedBox<SuperCell<T_Frame>, DIM> >
-{
-private:
-    PMACC_ALIGN( m_deviceHeapHandle, T_DeviceHeapHandle );
-    PMACC_ALIGN( hostMemoryOffset, int64_t );
-public:
-
-    typedef T_Frame FrameType;
-    typedef FramePointer<FrameType> FramePtr;
-    typedef SuperCell<FrameType> SuperCellType;
-    typedef DataBox<PitchedBox<SuperCell<FrameType>, DIM> > BaseType;
-    typedef T_DeviceHeapHandle DeviceHeapHandle;
-
-    static constexpr uint32_t Dim = DIM;
-
-    /** default constructor
+    /**
+     * A DIM-dimensional Box holding frames with particle data.
      *
-     * \warning after this call the object is in a invalid state and must be
-     * initialized with an assignment of a valid ParticleBox
+     * @tparam FRAME datatype for frames
+     * @tparam DIM dimension of data (1-3)
      */
-    HDINLINE ParticlesBox( ) : hostMemoryOffset( 0 )
-    {
-
-    }
-
-    HDINLINE ParticlesBox(
-        const DataBox<PitchedBox<SuperCellType, DIM> >& superCells,
-        const DeviceHeapHandle&  deviceHeapHandle
-    ) :
-        BaseType( superCells ), m_deviceHeapHandle(deviceHeapHandle), hostMemoryOffset( 0 )
+    template<class T_Frame, typename T_DeviceHeapHandle, unsigned DIM>
+    class ParticlesBox : protected DataBox<PitchedBox<SuperCell<T_Frame>, DIM>>
     {
+    private:
+        PMACC_ALIGN(m_deviceHeapHandle, T_DeviceHeapHandle);
+        PMACC_ALIGN(hostMemoryOffset, int64_t);
+
+    public:
+        typedef T_Frame FrameType;
+        typedef FramePointer<FrameType> FramePtr;
+        typedef SuperCell<FrameType> SuperCellType;
+        typedef DataBox<PitchedBox<SuperCell<FrameType>, DIM>> BaseType;
+        typedef T_DeviceHeapHandle DeviceHeapHandle;
+
+        static constexpr uint32_t Dim = DIM;
+
+        /** default constructor
+         *
+         * \warning after this call the object is in a invalid state and must be
+         * initialized with an assignment of a valid ParticleBox
+         */
+        HDINLINE ParticlesBox() : hostMemoryOffset(0)
+        {
+        }
 
-    }
-
-    HDINLINE ParticlesBox(
-        const DataBox<PitchedBox<SuperCellType, DIM> > &superCells,
-        const DeviceHeapHandle&  deviceHeapHandle,
-        int64_t memoryOffset
-    ) :
-        BaseType( superCells ), m_deviceHeapHandle(deviceHeapHandle), hostMemoryOffset( memoryOffset )
-    {
+        HDINLINE ParticlesBox(
+            const DataBox<PitchedBox<SuperCellType, DIM>>& superCells,
+            const DeviceHeapHandle& deviceHeapHandle)
+            : BaseType(superCells)
+            , m_deviceHeapHandle(deviceHeapHandle)
+            , hostMemoryOffset(0)
+        {
+        }
 
-    }
+        HDINLINE ParticlesBox(
+            const DataBox<PitchedBox<SuperCellType, DIM>>& superCells,
+            const DeviceHeapHandle& deviceHeapHandle,
+            int64_t memoryOffset)
+            : BaseType(superCells)
+            , m_deviceHeapHandle(deviceHeapHandle)
+            , hostMemoryOffset(memoryOffset)
+        {
+        }
 
-    /**
-     * Returns an empty frame from data heap.
-     *
-     * @return an empty frame
-     */
-    DINLINE FramePtr getEmptyFrame( )
-    {
-        FrameType* tmp = nullptr;
-        const int maxTries = 13; //magic number is not performance critical
-        for ( int numTries = 0; numTries < maxTries; ++numTries )
+        /** Allocate a frame from the data heap with all particle slots disabled (up to 13 attempts).
+         *
+         * @param acc accelerator, passed to the device heap allocator on CUDA/HIP builds
+         * @return the new frame; the pointer wraps nullptr if no attempt returned memory
+         */
+        template<typename T_Acc>
+        DINLINE FramePtr getEmptyFrame(const T_Acc& acc)
         {
-#if( PMACC_CUDA_ENABLED == 1 )
-            tmp = (FrameType*) m_deviceHeapHandle.malloc( sizeof (FrameType) );
+            FrameType* tmp = nullptr;
+            const int maxTries = 13; // magic number is not performance critical
+            for(int numTries = 0; numTries < maxTries; ++numTries)
+            {
+#if(BOOST_LANG_CUDA || BOOST_COMP_HIP)
+                tmp = (FrameType*) m_deviceHeapHandle.malloc(acc, sizeof(FrameType));
 #else
-            tmp = new FrameType;
+                tmp = new FrameType;
 #endif
-            if ( tmp != nullptr )
-            {
-                /* disable all particles since we can not assume that newly allocated memory contains zeros */
-                for ( int i = 0; i < (int) math::CT::volume<typename FrameType::SuperCellSize>::type::value; ++i )
-                    ( *tmp )[i][multiMask_] = 0;
-#if( PMACC_CUDA_ENABLED == 1 )
-                /* takes care that changed values are visible to all threads inside this block*/
-                __threadfence_block( );
+                if(tmp != nullptr)
+                {
+                    /* disable all particles since we can not assume that newly allocated memory contains zeros */
+                    for(int i = 0; i < (int) math::CT::volume<typename FrameType::SuperCellSize>::type::value; ++i)
+                        (*tmp)[i][multiMask_] = 0;
+#if(BOOST_LANG_CUDA || BOOST_COMP_HIP)
+                    /* takes care that changed values are visible to all threads inside this block*/
+                    __threadfence_block();
 #endif
-                break;
-            }
-            else
-            {
-                printf( "%s: mallocMC out of memory (try %i of %i)\n",
+                    break;
+                }
+                else
+                {
+#if(!BOOST_COMP_HIP) // by value, like the guards above; `#ifndef` is false once the macro is defined as 0
+                    printf(
+                        "%s: mallocMC out of memory (try %i of %i)\n",
                         (numTries + 1) == maxTries ? "ERROR" : "WARNING",
                         numTries + 1,
-                        maxTries );
+                        maxTries);
+#endif
+                }
             }
-        }
 
-        return FramePtr( tmp );
-    }
+            return FramePtr(tmp);
+        }
 
-    /**
-     * Removes frame from heap data heap.
-     *
-     * @param frame frame to remove
-     */
-    DINLINE void removeFrame( FramePtr& frame )
-    {
-#if( PMACC_CUDA_ENABLED == 1 )
-        m_deviceHeapHandle.free( (void*) frame.ptr );
+        /** Frees the memory of a frame and resets the caller's pointer.
+         *
+         * @param acc accelerator, passed to the device heap on CUDA/HIP builds
+         * @param frame frame to remove; frame.ptr is set to nullptr afterwards
+         */
+        template<typename T_Acc>
+        DINLINE void removeFrame(const T_Acc& acc, FramePtr& frame)
+        {
+#if(BOOST_LANG_CUDA || BOOST_COMP_HIP)
+            m_deviceHeapHandle.free(acc, (void*) frame.ptr);
 #else
-        delete(frame.ptr);
+            delete(frame.ptr);
 #endif
-        frame.ptr = nullptr;
-    }
+            frame.ptr = nullptr;
+        }
 
-    HDINLINE
-    FramePtr mapPtr( const FramePtr& devPtr ) const
-    {
-#ifndef __CUDA_ARCH__
-        int64_t useOffset = hostMemoryOffset * static_cast<int64_t> (devPtr.ptr != 0);
-        return FramePtr( reinterpret_cast<FrameType*> (
-                                                       reinterpret_cast<char*> (devPtr.ptr) - useOffset
-                                                       )
-                        );
+        HDINLINE
+        FramePtr mapPtr(const FramePtr& devPtr) const
+        {
+#if(CUPLA_DEVICE_COMPILE == 1)
+            return devPtr;
 #else
-        return devPtr;
+            int64_t useOffset = hostMemoryOffset * static_cast<int64_t>(devPtr.ptr != 0);
+            return FramePtr(reinterpret_cast<FrameType*>(reinterpret_cast<char*>(devPtr.ptr) - useOffset));
 #endif
-    }
-
-    /**
-     * Returns the next frame in the linked list.
-     *
-     * @param frame the active frame
-     * @return the next frame in the list
-     */
-    HDINLINE FramePtr getNextFrame( const FramePtr& frame ) const
-    {
-        return mapPtr( frame->nextFrame.ptr );
-    }
+        }
 
-    /**
-     * Returns the previous frame in the linked list.
-     *
-     * @param frame the active frame
-     * @return the previous frame in the list
-     */
-    HDINLINE FramePtr getPreviousFrame( const FramePtr& frame ) const
-    {
-        return mapPtr( frame->previousFrame.ptr );
-    }
+        /**
+         * Returns the next frame in the linked list.
+         *
+         * @param frame the active frame
+         * @return the next frame in the list
+         */
+        HDINLINE FramePtr getNextFrame(const FramePtr& frame) const
+        {
+            return mapPtr(frame->nextFrame.ptr);
+        }
 
-    /**
-     * Returns the last frame of a supercell.
-     *
-     * @param idx position of supercell
-     * @return the last frame of the linked list from supercell
-     */
-    HDINLINE FramePtr getLastFrame( const DataSpace<DIM> &idx ) const
-    {
-        return mapPtr( getSuperCell( idx ).LastFramePtr( ) );
-    }
+        /**
+         * Returns the previous frame in the linked list.
+         *
+         * @param frame the active frame
+         * @return the previous frame in the list
+         */
+        HDINLINE FramePtr getPreviousFrame(const FramePtr& frame) const
+        {
+            return mapPtr(frame->previousFrame.ptr);
+        }
 
-    /**
-     * Returns the first frame of a supercell.
-     *
-     * @param idx position of supercell
-     * @return the first frame of the linked list from supercell
-     */
-    HDINLINE FramePtr getFirstFrame( const DataSpace<DIM> &idx ) const
-    {
-        return mapPtr( getSuperCell( idx ).FirstFramePtr( ) );
-    }
+        /**
+         * Returns the last frame of a supercell.
+         *
+         * @param idx position of supercell
+         * @return the last frame of the linked list from supercell
+         */
+        HDINLINE FramePtr getLastFrame(const DataSpace<DIM>& idx) const
+        {
+            return mapPtr(getSuperCell(idx).LastFramePtr());
+        }
 
-    /**
-     * Sets frame as the first frame of a supercell.
-     *
-     * @param frame frame to set as first frame
-     * @param idx position of supercell
-     */
-    template<
-        typename T_Acc
-    >
-    DINLINE void setAsFirstFrame(
-        T_Acc const & acc,
-        FramePtr & frame,
-        DataSpace< DIM > const &idx
-    )
-    {
-        FrameType** firstFrameNativPtr = &(getSuperCell( idx ).firstFramePtr);
+        /**
+         * Returns the first frame of a supercell.
+         *
+         * @param idx position of supercell
+         * @return the first frame of the linked list from supercell
+         */
+        HDINLINE FramePtr getFirstFrame(const DataSpace<DIM>& idx) const
+        {
+            return mapPtr(getSuperCell(idx).FirstFramePtr());
+        }
 
-        frame->previousFrame = FramePtr( );
-        frame->nextFrame = FramePtr( *firstFrameNativPtr );
-#if( PMACC_CUDA_ENABLED == 1 )
-        /* - takes care that `next[index]` is visible to all threads on the gpu
-         * - this is needed because later on in this method we change `previous`
-         *   of an other frame, this must be done in order!
+        /** Sets frame as the first frame of a supercell.
+         *
+         * @param acc alpaka accelerator, used for the atomic pointer exchange
+         * @param frame frame to set as first frame
+         * @param idx position of supercell
          */
-        __threadfence( );
+        template<typename T_Acc>
+        DINLINE void setAsFirstFrame(T_Acc const& acc, FramePtr& frame, DataSpace<DIM> const& idx)
+        {
+            FrameType** firstFrameNativPtr = &(getSuperCell(idx).firstFramePtr);
+
+            frame->previousFrame = FramePtr();
+            frame->nextFrame = FramePtr(*firstFrameNativPtr);
+#if(BOOST_LANG_CUDA || BOOST_COMP_HIP)
+            /* - takes care that `next[index]` is visible to all threads on the gpu
+             * - this is needed because later on in this method we change `previous`
+             *   of an other frame, this must be done in order!
+             */
+            __threadfence();
 #endif
-        FramePtr oldFirstFramePtr(
-            (FrameType*) atomicExch(
+            FramePtr oldFirstFramePtr((FrameType*) cupla::atomicExch(
+                acc,
                 (unsigned long long int*) firstFrameNativPtr,
                 (unsigned long long int) frame.ptr,
-                ::alpaka::hierarchy::Grids{}
-            )
-        );
+                ::alpaka::hierarchy::Grids{}));
 
-        frame->nextFrame = oldFirstFramePtr;
-        if ( oldFirstFramePtr.isValid( ) )
-        {
-            oldFirstFramePtr->previousFrame = frame;
-        }
-        else
-        {
-            //we add the first frame in supercell
-            getSuperCell( idx ).lastFramePtr = frame.ptr;
+            frame->nextFrame = oldFirstFramePtr;
+            if(oldFirstFramePtr.isValid())
+            {
+                oldFirstFramePtr->previousFrame = frame;
+            }
+            else
+            {
+                // we add the first frame in supercell
+                getSuperCell(idx).lastFramePtr = frame.ptr;
+            }
         }
-    }
-
-    /**
-     * Sets frame as the last frame of a supercell.
-     *
-     * @param frame frame to set as last frame
-     * @param idx position of supercell
-     */
-    template<
-        typename T_Acc
-    >
-    DINLINE void setAsLastFrame(
-        T_Acc const & acc,
-        FramePointer<
-            FrameType
-        >& frame,
-        DataSpace< DIM > const &idx
-    )
-    {
-        FrameType** lastFrameNativPtr = &(getSuperCell( idx ).lastFramePtr);
 
-        frame->nextFrame = FramePtr( );
-        frame->previousFrame = FramePtr( *lastFrameNativPtr );
-#if( PMACC_CUDA_ENABLED == 1 )
-        /* - takes care that `next[index]` is visible to all threads on the gpu
-         * - this is needed because later on in this method we change `next`
-         *   of an other frame, this must be done in order!
+        /** Sets frame as the last frame of a supercell.
+         *
+         * @param acc alpaka accelerator, used for the atomic pointer exchange
+         * @param frame frame to set as last frame
+         * @param idx position of supercell
          */
-        __threadfence( );
+        template<typename T_Acc>
+        DINLINE void setAsLastFrame(T_Acc const& acc, FramePointer<FrameType>& frame, DataSpace<DIM> const& idx)
+        {
+            FrameType** lastFrameNativPtr = &(getSuperCell(idx).lastFramePtr);
+
+            frame->nextFrame = FramePtr();
+            frame->previousFrame = FramePtr(*lastFrameNativPtr);
+#if(BOOST_LANG_CUDA || BOOST_COMP_HIP)
+            /* - takes care that `next[index]` is visible to all threads on the gpu
+             * - this is needed because later on in this method we change `next`
+             *   of an other frame, this must be done in order!
+             */
+            __threadfence();
 #endif
-        FramePtr oldLastFramePtr(
-            (FrameType*) atomicExch(
+            FramePtr oldLastFramePtr((FrameType*) cupla::atomicExch(
+                acc,
                 (unsigned long long int*) lastFrameNativPtr,
                 (unsigned long long int) frame.ptr,
-                ::alpaka::hierarchy::Grids{}
-            )
-        );
+                ::alpaka::hierarchy::Grids{}));
 
-        frame->previousFrame = oldLastFramePtr;
-        if ( oldLastFramePtr.isValid( ) )
-        {
-            oldLastFramePtr->nextFrame = frame;
-        }
-        else
-        {
-            //we add the first frame in supercell
-            getSuperCell( idx ).firstFramePtr = frame.ptr;
+            frame->previousFrame = oldLastFramePtr;
+            if(oldLastFramePtr.isValid())
+            {
+                oldLastFramePtr->nextFrame = frame;
+            }
+            else
+            {
+                // we add the first frame in supercell
+                getSuperCell(idx).firstFramePtr = frame.ptr;
+            }
         }
-    }
 
-    /**
-     * Removes the last frame of a supercell.
-     * This call is not threadsave, only one thread from a supercell may call this function.
-     * @param idx position of supercell
-     * @return true if more frames in list, else false
-     */
-    DINLINE bool removeLastFrame( const DataSpace<DIM> &idx )
-    {
-        //!\todo this is not thread save
-        FrameType** lastFrameNativPtr = &(getSuperCell( idx ).lastFramePtr);
-
-        FramePtr last( *lastFrameNativPtr );
-        if ( last.isValid( ) )
+        /** Removes the last frame of a supercell.
+         * This call is not thread-safe, only one thread from a supercell may call this function.
+         * @param acc accelerator, forwarded to removeFrame()
+         * @param idx position of supercell
+         * @return true if a frame was removed and the list is still non-empty, else false
+         */
+        template<typename T_Acc>
+        DINLINE bool removeLastFrame(const T_Acc& acc, const DataSpace<DIM>& idx)
         {
-            FramePtr prev( last->previousFrame );
+            //!\todo this is not thread-safe
+            FrameType** lastFrameNativPtr = &(getSuperCell(idx).lastFramePtr);
 
-            if ( prev.isValid( ) )
+            FramePtr last(*lastFrameNativPtr);
+            if(last.isValid())
             {
-                prev->nextFrame = FramePtr( ); //set to invalid frame
-                *lastFrameNativPtr = prev.ptr; //set new last frame
-                removeFrame( last );
-                return true;
+                FramePtr prev(last->previousFrame);
+
+                if(prev.isValid())
+                {
+                    prev->nextFrame = FramePtr(); // set to invalid frame
+                    *lastFrameNativPtr = prev.ptr; // set new last frame
+                    removeFrame(acc, last);
+                    return true;
+                }
+                // `last` was the only frame: the supercell's list becomes empty
+                getSuperCell(idx).firstFramePtr = nullptr;
+                getSuperCell(idx).lastFramePtr = nullptr;
+
+                removeFrame(acc, last);
             }
-            //remove last frame of supercell
-            getSuperCell( idx ).firstFramePtr = nullptr;
-            getSuperCell( idx ).lastFramePtr = nullptr;
-
-            removeFrame( last );
+            return false;
         }
-        return false;
-    }
 
-    HDINLINE SuperCellType& getSuperCell( DataSpace<DIM> idx ) const
-    {
-        return BaseType::operator()(idx);
-    }
-};
+        HDINLINE SuperCellType& getSuperCell(DataSpace<DIM> idx) const
+        {
+            return BaseType::operator()(idx);
+        }
+    };
 
-}
+} // namespace pmacc
diff --git a/include/pmacc/particles/memory/boxes/PushDataBox.hpp b/include/pmacc/particles/memory/boxes/PushDataBox.hpp
index c5041727a4..e503268610 100644
--- a/include/pmacc/particles/memory/boxes/PushDataBox.hpp
+++ b/include/pmacc/particles/memory/boxes/PushDataBox.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Felix Schmitt, Heiko Burau, Rene Widera,
+/* Copyright 2013-2021 Felix Schmitt, Heiko Burau, Rene Widera,
  *                     Benjamin Worpitz
  *
  * This file is part of PMacc.
@@ -21,7 +21,6 @@
  */
 
 
-
 #pragma once
 
 
@@ -32,8 +31,6 @@
 
 namespace pmacc
 {
-
-
     /**
      * Implements a Box to which elements can only be added, using atomic operations.
      *
@@ -41,10 +38,9 @@ namespace pmacc
      * @tparam VALUE datatype for values addresses point to
      */
     template<class TYPE, class VALUE>
-    class PushDataBox : public DataBox<PitchedBox<VALUE, DIM1> >
+    class PushDataBox : public DataBox<PitchedBox<VALUE, DIM1>>
     {
     public:
-
         /**
          * Constructor.
          *
@@ -52,11 +48,11 @@ namespace pmacc
          * @param offset relative offset to pointer start address
          * @param currentSize size of the buffer data points to
          */
-        HDINLINE PushDataBox(VALUE *data, TYPE *currentSize, DataSpace<DIM1> offset=DataSpace<DIM1>(0)) :
-        DataBox<PitchedBox<VALUE, DIM1> >(PitchedBox<VALUE,DIM1> ( data, offset)),
-        currentSize(currentSize),maxSize(0) /*\todo implement max size*/
+        HDINLINE PushDataBox(VALUE* data, TYPE* currentSize, DataSpace<DIM1> offset = DataSpace<DIM1>(0))
+            : DataBox<PitchedBox<VALUE, DIM1>>(PitchedBox<VALUE, DIM1>(data, offset))
+            , currentSize(currentSize)
+            , maxSize(0) /*\todo implement max size*/
         {
-
         }
 
         /** Increases the size of the stack with count elements in an atomic operation
@@ -73,11 +69,11 @@ namespace pmacc
          *
          * @return a TileDataBox of size count pointing to the new stack elements
          */
-        template< typename T_Acc, typename T_Hierarchy >
-        HDINLINE TileDataBox<VALUE> pushN(T_Acc const & acc, TYPE count, T_Hierarchy const & hierarchy)
+        template<typename T_Acc, typename T_Hierarchy>
+        HDINLINE TileDataBox<VALUE> pushN(T_Acc const& acc, TYPE count, T_Hierarchy const& hierarchy)
         {
-            TYPE old_addr = atomicAdd(currentSize, count, hierarchy);
-            return TileDataBox<VALUE > (this->fixedPointer, DataSpace<DIM1>(old_addr));
+            TYPE old_addr = cupla::atomicAdd(acc, currentSize, count, hierarchy);
+            return TileDataBox<VALUE>(this->fixedPointer, DataSpace<DIM1>(old_addr));
         }
 
         /** Adds a value to the stack in an atomic operation.
@@ -94,15 +90,15 @@ namespace pmacc
          *
          * @return a TileDataBox of size count pointing to the new stack elements
          */
-        template< typename T_Acc, typename T_Hierarchy >
-        HDINLINE void push(T_Acc const & acc, VALUE val, T_Hierarchy const & hierarchy)
+        template<typename T_Acc, typename T_Hierarchy>
+        HDINLINE void push(T_Acc const& acc, VALUE val, T_Hierarchy const& hierarchy)
         {
-            TYPE old_addr = atomicAdd(currentSize, 1, hierarchy);
+            TYPE old_addr = cupla::atomicAdd(acc, currentSize, 1, hierarchy);
             (*this)[old_addr] = val;
         }
 
     protected:
-        PMACC_ALIGN(maxSize,TYPE);
-        PMACC_ALIGN(currentSize,TYPE*);
+        PMACC_ALIGN(maxSize, TYPE);
+        PMACC_ALIGN(currentSize, TYPE*);
     };
-}
+} // namespace pmacc
diff --git a/include/pmacc/particles/memory/boxes/TileDataBox.hpp b/include/pmacc/particles/memory/boxes/TileDataBox.hpp
index 049ba03b01..bff337df03 100644
--- a/include/pmacc/particles/memory/boxes/TileDataBox.hpp
+++ b/include/pmacc/particles/memory/boxes/TileDataBox.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Felix Schmitt, Heiko Burau, Rene Widera
+/* Copyright 2013-2021 Felix Schmitt, Heiko Burau, Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -28,81 +28,74 @@
 
 namespace pmacc
 {
-
-template<class TYPE>
-class VectorDataBox : public DataBox<PitchedBox<TYPE, DIM1> >
-{
-public:
-    typedef DataBox<PitchedBox<TYPE, DIM1> > BaseType;
-    typedef TYPE type;
-
-    template<class> struct result;
-
-    template<class F, typename T>
-    struct result < F(T)>
-    {
-        typedef  TYPE& type;
-    };
-
-    template<class F, typename T>
-    struct result < const F(T)>
+    template<class TYPE>
+    class VectorDataBox : public DataBox<PitchedBox<TYPE, DIM1>>
     {
-        typedef const  TYPE& type;
+    public:
+        typedef DataBox<PitchedBox<TYPE, DIM1>> BaseType;
+        typedef TYPE type;
+
+        template<class>
+        struct result;
+
+        template<class F, typename T>
+        struct result<F(T)>
+        {
+            typedef TYPE& type;
+        };
+
+        template<class F, typename T>
+        struct result<const F(T)>
+        {
+            typedef const TYPE& type;
+        };
+
+        HDINLINE VectorDataBox(TYPE* pointer, const DataSpace<DIM1>& offset = DataSpace<DIM1>(0))
+            : BaseType(PitchedBox<TYPE, DIM1>(pointer, offset))
+        {
+        }
+
+        HDINLINE VectorDataBox()
+        {
+        }
     };
 
-    HDINLINE VectorDataBox(TYPE* pointer,
-                           const DataSpace<DIM1> &offset = DataSpace<DIM1>(0)) :
-    BaseType(PitchedBox<TYPE, DIM1>(pointer, offset))
-    {
-    }
-
-    HDINLINE VectorDataBox()
-    {
-    }
-
-
-};
-
-/**
- * Specifies a one-dimensional DataBox for more convenient usage.
- *
- * @tparam TYPE type of data represented by the DataBox
- */
-template<class TYPE>
-class TileDataBox : public VectorDataBox<TYPE>
-{
-public:
-    typedef VectorDataBox<TYPE> BaseType;
-
-    HDINLINE TileDataBox(TYPE* pointer,
-                         const DataSpace<DIM1> &offset = DataSpace<DIM1>(0),
-                         uint32_t size = 0) :
-    BaseType(pointer, offset), size(size)
-    {
-    }
-
     /**
-     * Returns  size of the Box.
+     * Specifies a one-dimensional DataBox for more convenient usage.
      *
-     * @return size of this TileDataBox
+     * @tparam TYPE type of data represented by the DataBox
      */
-    HDINLINE int getSize()
-    {
-        return size;
-    }
-
-    /*object is not  initialized valid, copy a valid instance to this object to get a valid instance*/
-    HDINLINE TileDataBox()
+    template<class TYPE>
+    class TileDataBox : public VectorDataBox<TYPE>
     {
-    }
-
-
-protected:
-
-    PMACC_ALIGN(size, size_t);
-
-};
-
+    public:
+        typedef VectorDataBox<TYPE> BaseType;
+
+        HDINLINE TileDataBox(TYPE* pointer, const DataSpace<DIM1>& offset = DataSpace<DIM1>(0), uint32_t size = 0)
+            : BaseType(pointer, offset)
+            , size(size)
+        {
+        }
+
+        /**
+         * Returns the size of the box.
+         *
+         * @return size of this TileDataBox
+         */
+        HDINLINE int getSize()
+        {
+            return size;
+        }
+
+        /* object is not validly initialized; assign a valid instance to this object before using it */
+        HDINLINE TileDataBox()
+        {
+        }
+
+
+    protected:
+        PMACC_ALIGN(size, size_t);
+    };
 
 
-}
+} // namespace pmacc
diff --git a/include/pmacc/particles/memory/buffers/MallocMCBuffer.hpp b/include/pmacc/particles/memory/buffers/MallocMCBuffer.hpp
index 896c69e13a..3af7745493 100644
--- a/include/pmacc/particles/memory/buffers/MallocMCBuffer.hpp
+++ b/include/pmacc/particles/memory/buffers/MallocMCBuffer.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2015-2020 Rene Widera
+/* Copyright 2015-2021 Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -21,24 +21,25 @@
 
 #pragma once
 
-
 #include "pmacc/dataManagement/ISimulationData.hpp"
 
-#include <mallocMC/mallocMC.hpp>
-
 #include <string>
-#include <memory>
+#include <cstdint>
+#include <memory>
+
+#if(PMACC_CUDA_ENABLED == 1 || ALPAKA_ACC_GPU_HIP_ENABLED == 1)
+
+#    include <mallocMC/mallocMC.hpp>
 
 namespace pmacc
 {
-
-    template< typename T_DeviceHeap >
+    template<typename T_DeviceHeap>
     class MallocMCBuffer : public ISimulationData
     {
     public:
         using DeviceHeap = T_DeviceHeap;
 
-        MallocMCBuffer( const std::shared_ptr<DeviceHeap>& deviceHeap );
+        MallocMCBuffer(const std::shared_ptr<DeviceHeap>& deviceHeap);
 
         virtual ~MallocMCBuffer();
 
@@ -60,7 +61,6 @@ namespace pmacc
         void synchronize() override;
 
     private:
-
         char* hostPtr;
         int64_t hostBufferOffset;
         mallocMC::HeapInfo deviceHeapInfo;
@@ -69,4 +69,39 @@ namespace pmacc
 
 } // namespace pmacc
 
-#include "pmacc/particles/memory/buffers/MallocMCBuffer.tpp"
+#    include "pmacc/particles/memory/buffers/MallocMCBuffer.tpp"
+
+#else
+
+namespace pmacc
+{
+    template<typename T_DeviceHeap>
+    class MallocMCBuffer : public ISimulationData // stand-in compiled when the CUDA/HIP condition above is false
+    {
+    public:
+        MallocMCBuffer(const std::shared_ptr<T_DeviceHeap>&); // NOTE(review): declared only in this branch - confirm
+
+        virtual ~MallocMCBuffer() = default;
+        /// The unique id of this data set is the static name returned by getName().
+        SimulationDataId getUniqueId() override
+        {
+            return getName();
+        }
+        /// Fixed identifier string of this class.
+        static std::string getName()
+        {
+            return std::string("MallocMCBuffer");
+        }
+        /// This stand-in keeps no host copy and always reports an offset of zero.
+        int64_t getOffset()
+        {
+            return 0u;
+        }
+        /// Nothing is copied in this stand-in.
+        void synchronize() override
+        {
+        }
+    };
+
+} // namespace pmacc
+#endif
diff --git a/include/pmacc/particles/memory/buffers/MallocMCBuffer.tpp b/include/pmacc/particles/memory/buffers/MallocMCBuffer.tpp
index 7c6c916f45..7003ac35c8 100644
--- a/include/pmacc/particles/memory/buffers/MallocMCBuffer.tpp
+++ b/include/pmacc/particles/memory/buffers/MallocMCBuffer.tpp
@@ -1,4 +1,4 @@
-/* Copyright 2015-2020 Rene Widera, Alexander Grund
+/* Copyright 2015-2021 Rene Widera, Alexander Grund
  *
  * This file is part of PMacc.
  *
@@ -21,58 +21,71 @@
 
 #pragma once
 
-#include "pmacc/particles/memory/buffers/MallocMCBuffer.hpp"
-#include "pmacc/types.hpp"
-#include "pmacc/eventSystem/EventSystem.hpp"
+#if(PMACC_CUDA_ENABLED == 1 || ALPAKA_ACC_GPU_HIP_ENABLED == 1)
 
-#include <memory>
+#    include "pmacc/particles/memory/buffers/MallocMCBuffer.hpp"
+#    include "pmacc/types.hpp"
+#    include "pmacc/eventSystem/EventSystem.hpp"
 
+#    include <memory>
 
-namespace pmacc
-{
-template< typename T_DeviceHeap >
-MallocMCBuffer< T_DeviceHeap >::MallocMCBuffer( const std::shared_ptr<DeviceHeap>& deviceHeap ) :
-    hostPtr( nullptr ),
-    /* currently mallocMC has only one heap */
-    deviceHeapInfo( deviceHeap->getHeapLocations( )[ 0 ] ),
-    hostBufferOffset( 0 )
-{
-}
 
-template< typename T_DeviceHeap >
-MallocMCBuffer< T_DeviceHeap >::~MallocMCBuffer( )
+namespace pmacc
 {
-    if ( hostPtr != nullptr )
-        cudaHostUnregister(hostPtr);
-
-    __deleteArray(hostPtr);
+    /** Records the location of the device heap; the host copy is allocated on the first synchronize() call. */
+    template<typename T_DeviceHeap>
+    MallocMCBuffer<T_DeviceHeap>::MallocMCBuffer(const std::shared_ptr<DeviceHeap>& deviceHeap)
+        : hostPtr(nullptr)
+        , hostBufferOffset(0) // initializers are listed in member declaration order
+        /* currently mallocMC has only one heap */
+        , deviceHeapInfo(deviceHeap->getHeapLocations()[0])
+    {
+    }
 
-}
+    template<typename T_DeviceHeap>
+    MallocMCBuffer<T_DeviceHeap>::~MallocMCBuffer() // releases the page-locked host copy, if one was created
+    {
+        if(hostPtr != nullptr)
+        {
+#    if(PMACC_CUDA_ENABLED == 1)
+            cudaHostUnregister(hostPtr); // undo the cudaHostRegister() pinning before freeing the array
+            __deleteArray(hostPtr);
+#    else
+            CUDA_CHECK_NO_EXCEPT((cuplaError_t) hipHostFree(hostPtr)); // matching release for hipHostMalloc()
+#    endif
+        }
+    }
 
-template< typename T_DeviceHeap >
-void MallocMCBuffer< T_DeviceHeap >::synchronize( )
-{
-    /** \todo: we had no abstraction to create a host buffer and a pseudo
-     *         device buffer (out of the mallocMC ptr) and copy both with our event
-     *         system.
-     *         WORKAROUND: use native cuda calls :-(
-     */
-    if ( hostPtr == nullptr )
+    template<typename T_DeviceHeap>
+    void MallocMCBuffer<T_DeviceHeap>::synchronize()
     {
-        /* use `new` and than `cudaHostRegister` is faster than `cudaMallocHost`
-         * but with the some result (create page-locked memory)
+        /** \todo: we had no abstraction to create a host buffer and a pseudo
+         *         device buffer (out of the mallocMC ptr) and copy both with our event
+         *         system.
+         *         WORKAROUND: use native CUDA/HIP calls :-(
          */
-        hostPtr = new char[deviceHeapInfo.size];
-        CUDA_CHECK((cuplaError_t)cudaHostRegister(hostPtr, deviceHeapInfo.size, cudaHostRegisterDefault));
-
+        if(hostPtr == nullptr)
+        {
+#    if(PMACC_CUDA_ENABLED == 1)
+            /* using `new` and then `cudaHostRegister` is faster than `cudaMallocHost`
+             * but with the same result (creates page-locked memory)
+             */
+            hostPtr = new char[deviceHeapInfo.size];
+            CUDA_CHECK((cuplaError_t) cudaHostRegister(hostPtr, deviceHeapInfo.size, cudaHostRegisterDefault));
+#    else
+            // we do not use hipHostRegister because this would require a strict alignment
+            // https://github.com/alpaka-group/alpaka/pull/896
+            CUDA_CHECK((cuplaError_t) hipHostMalloc((void**) &hostPtr, deviceHeapInfo.size, hipHostMallocDefault));
+#    endif
 
-        this->hostBufferOffset = static_cast<int64_t>(reinterpret_cast<char*>(deviceHeapInfo.p) - hostPtr);
+            this->hostBufferOffset = static_cast<int64_t>(reinterpret_cast<char*>(deviceHeapInfo.p) - hostPtr);
+        }
+        /* add event system hints */
+        __startOperation(ITask::TASK_DEVICE);
+        __startOperation(ITask::TASK_HOST);
+        CUDA_CHECK(cuplaMemcpy(hostPtr, deviceHeapInfo.p, deviceHeapInfo.size, cuplaMemcpyDeviceToHost));
     }
-    /* add event system hints */
-    __startOperation(ITask::TASK_CUDA);
-    __startOperation(ITask::TASK_HOST);
-    CUDA_CHECK(cudaMemcpy(hostPtr, deviceHeapInfo.p, deviceHeapInfo.size, cudaMemcpyDeviceToHost));
 
-}
+} // namespace pmacc
 
-} //namespace pmacc
+#endif
diff --git a/include/pmacc/particles/memory/buffers/ParticlesBuffer.hpp b/include/pmacc/particles/memory/buffers/ParticlesBuffer.hpp
index f5721b26a2..ef702e9259 100644
--- a/include/pmacc/particles/memory/buffers/ParticlesBuffer.hpp
+++ b/include/pmacc/particles/memory/buffers/ParticlesBuffer.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Felix Schmitt, Rene Widera, Benjamin Worpitz
+/* Copyright 2013-2021 Axel Huebl, Felix Schmitt, Rene Widera, Benjamin Worpitz
  *
  * This file is part of PMacc.
  *
@@ -52,331 +52,293 @@
 
 namespace pmacc
 {
-
-/**
- * Describes DIM-dimensional buffer for particles data on the host.
- *
- * @tParam T_ParticleDescription Object which describe a frame @see ParticleDescription.hpp
- * @tparam SuperCellSize_ TVec which descripe size of a superce
- * @tparam DIM dimension of the buffer (1-3)
- */
-template<typename T_ParticleDescription, class SuperCellSize_, typename T_DeviceHeap, unsigned DIM>
-class ParticlesBuffer
-{
-public:
-
-    /** create static array
-     */
-    template< uint32_t T_size >
-    struct OperatorCreatePairStaticArray
-    {
-
-        template<typename X>
-        struct apply
-        {
-            typedef bmpl::pair<
-                X,
-                StaticArray<
-                    typename traits::Resolve<X>::type::type,
-                    bmpl::integral_c<uint32_t, T_size>
-                >
-            > type;
-        };
-    };
-
-    /** type of the border frame management object
-     *
-     * contains:
-     *   - superCell position of the border frames inside a given range
-     *   - start position inside the exchange stack for frames
-     *   - number of frames corresponding to the superCell position
-     */
-    typedef ExchangeMemoryIndex<
-        vint_t,
-        DIM - 1
-    > BorderFrameIndex;
-
-    typedef SuperCellSize_ SuperCellSize;
-
-    typedef typename MakeSeq<
-        typename T_ParticleDescription::ValueTypeSeq,
-        localCellIdx,
-        multiMask
-    >::type ParticleAttributeList;
-
-    typedef typename MakeSeq<
-        typename T_ParticleDescription::ValueTypeSeq,
-        localCellIdx
-    >::type ParticleAttributeListBorder;
-
-    typedef
-    typename ReplaceValueTypeSeq<
-        T_ParticleDescription,
-        ParticleAttributeList
-    >::type FrameDescriptionWithManagementAttributes;
-
-    /** double linked list pointer */
-    typedef
-    typename MakeSeq<
-        PreviousFramePtr<>,
-        NextFramePtr<>
-    >::type LinkedListPointer;
-
-    /* extent particle description with pointer to a frame*/
-    typedef typename ReplaceFrameExtensionSeq<
-        FrameDescriptionWithManagementAttributes,
-        LinkedListPointer
-    >::type FrameDescription;
-
-    /** frame definition
-     *
-     * a group of particles is stored as frame
-     */
-    typedef Frame<
-        OperatorCreatePairStaticArray<
-            pmacc::math::CT::volume< SuperCellSize >::type::value
-        >,
-        FrameDescription
-    > FrameType;
-
-    typedef typename ReplaceValueTypeSeq<
-        T_ParticleDescription,
-        ParticleAttributeListBorder
-    >::type FrameDescriptionBorder;
-
-    /** frame which is used to communicate particles to neighbors
-     *
-     * - each frame contains only one particle
-     * - local administration attributes of a particle are removed
-     */
-    typedef Frame<
-        OperatorCreatePairStaticArray< 1u >,
-        FrameDescriptionBorder
-    > FrameTypeBorder;
-
-    typedef SuperCell<FrameType> SuperCellType;
-
-    typedef T_DeviceHeap DeviceHeap;
-    /* Type of the particle box which particle buffer create */
-    typedef ParticlesBox< FrameType, typename DeviceHeap::AllocatorHandle, DIM> ParticlesBoxType;
-
-private:
-
-    /* this enum is used only for internal calculations */
-    enum
-    {
-        SizeOfOneBorderElement = (sizeof (FrameTypeBorder) + sizeof (BorderFrameIndex))
-    };
-
-public:
-
     /**
-     * Constructor.
+     * Describes DIM-dimensional buffer for particles data on the host.
      *
-     * @param deviceHeap device heap memory allocator
-     * @param layout number of cell per dimension
-     * @param superCellSize size of one super cell
-     * @param gpuMemory how many memory on device is used for this instance (in byte)
+     * @tparam T_ParticleDescription object which describes a frame, @see ParticleDescription.hpp
+     * @tparam SuperCellSize_ TVec which describes the size of a supercell
+     * @tparam DIM dimension of the buffer (1-3)
      */
-    ParticlesBuffer(const std::shared_ptr<DeviceHeap>& deviceHeap, DataSpace<DIM> layout, DataSpace<DIM> superCellSize) :
-        m_deviceHeap(deviceHeap), superCellSize(superCellSize), gridSize(layout), framesExchanges(nullptr)
+    template<typename T_ParticleDescription, class SuperCellSize_, typename T_DeviceHeap, unsigned DIM>
+    class ParticlesBuffer
     {
+    public:
+        /** create static array
+         */
+        template<uint32_t T_size>
+        struct OperatorCreatePairStaticArray
+        {
+            template<typename X>
+            struct apply
+            {
+                typedef bmpl::
+                    pair<X, StaticArray<typename traits::Resolve<X>::type::type, bmpl::integral_c<uint32_t, T_size>>>
+                        type;
+            };
+        };
 
-        exchangeMemoryIndexer = new GridBuffer<BorderFrameIndex, DIM1 > (DataSpace<DIM1 > (0));
-        framesExchanges = new GridBuffer< FrameType, DIM1, FrameTypeBorder > (DataSpace<DIM1 > (0));
-
-        DataSpace<DIM> superCellsCount = gridSize / superCellSize;
-
-        superCells = new GridBuffer<SuperCellType, DIM > (superCellsCount);
-
-        reset();
-    }
-
-    /**
-     * Destructor.
-     */
-    virtual ~ParticlesBuffer()
-    {
-        __delete(superCells);
-        __delete(framesExchanges);
-        __delete(exchangeMemoryIndexer);
-    }
-
-    /**
-     * Resets all internal buffers.
-     */
-    void reset()
-    {
-
-        superCells->getDeviceBuffer().setValue(SuperCellType ());
-        superCells->getHostBuffer().setValue(SuperCellType ());
-    }
-
-    /**
-     * Adds an exchange buffer to frames.
-     *
-     * @param receive Mask describing receive directions
-     * @param usedMemory memory to be used for this exchange
-     */
-    void addExchange(Mask receive, size_t usedMemory, uint32_t communicationTag)
-    {
-
-        size_t numFrameTypeBorders = usedMemory / SizeOfOneBorderElement;
-
-        framesExchanges->addExchangeBuffer(receive, DataSpace<DIM1 > (numFrameTypeBorders), communicationTag, true, false);
-
-        exchangeMemoryIndexer->addExchangeBuffer(receive, DataSpace<DIM1 > (numFrameTypeBorders), communicationTag | (1u << (20 - 5)), true, false);
-    }
-
-    /**
-     * Returns a ParticlesBox for device frame data.
-     *
-     * @return device frames ParticlesBox
-     */
-    ParticlesBoxType getDeviceParticleBox()
-    {
-
-        return ParticlesBoxType(
-            superCells->getDeviceBuffer().getDataBox(),
-            m_deviceHeap->getAllocatorHandle()
-        );
-    }
-
-    /**
-     * Returns a ParticlesBox for host frame data.
-     *
-     * @return host frames ParticlesBox
-     */
-    ParticlesBoxType getHostParticleBox(int64_t memoryOffset)
-    {
-
-        return ParticlesBoxType (
-            superCells->getHostBuffer().getDataBox(),
-            m_deviceHeap->getAllocatorHandle(),
-            memoryOffset
-        );
-    }
-
-    /**
-     * Returns if the buffer has a send exchange in ex direction.
-     *
-     * @param ex direction to query
-     * @return true if buffer has send exchange for ex
-     */
-    bool hasSendExchange(uint32_t ex)
-    {
-
-        return framesExchanges->hasSendExchange(ex);
-    }
-
-    /**
-     * Returns if the buffer has a receive exchange in ex direction.
-     *
-     * @param ex direction to query
-     * @return true if buffer has receive exchange for ex
-     */
-    bool hasReceiveExchange(uint32_t ex)
-    {
-
-        return framesExchanges->hasReceiveExchange(ex);
-    }
-
-    StackExchangeBuffer<FrameTypeBorder, BorderFrameIndex, DIM - 1 > getSendExchangeStack(uint32_t ex)
-    {
-
-        return StackExchangeBuffer<FrameTypeBorder, BorderFrameIndex, DIM - 1 >
-            (framesExchanges->getSendExchange(ex), exchangeMemoryIndexer->getSendExchange(ex));
-    }
-
-    StackExchangeBuffer<FrameTypeBorder, BorderFrameIndex, DIM - 1 > getReceiveExchangeStack(uint32_t ex)
-    {
-
-        return StackExchangeBuffer<FrameTypeBorder, BorderFrameIndex, DIM - 1 >
-            (framesExchanges->getReceiveExchange(ex), exchangeMemoryIndexer->getReceiveExchange(ex));
-    }
-
-    /**
-     * Starts sync data from own device buffer to neighbor device buffer.
-     *
-     * GridBuffer
-     *
-     */
-    EventTask asyncCommunication(EventTask serialEvent)
-    {
-
-        return framesExchanges->asyncCommunication(serialEvent) +
-               exchangeMemoryIndexer->asyncCommunication(serialEvent);
-    }
-
-    EventTask asyncSendParticles(EventTask serialEvent, uint32_t ex)
-    {
-        /* store each gpu-free event separately to avoid race conditions */
-        EventTask framesExchangesGPUEvent;
-        EventTask exchangeMemoryIndexerGPUEvent;
-        EventTask returnEvent = framesExchanges->asyncSend(serialEvent, ex) +
-            exchangeMemoryIndexer->asyncSend(serialEvent, ex);
+        /** type of the border frame management object
+         *
+         * contains:
+         *   - superCell position of the border frames inside a given range
+         *   - start position inside the exchange stack for frames
+         *   - number of frames corresponding to the superCell position
+         */
+        typedef ExchangeMemoryIndex<vint_t, DIM - 1> BorderFrameIndex;
+
+        typedef SuperCellSize_ SuperCellSize;
+
+        typedef typename MakeSeq<typename T_ParticleDescription::ValueTypeSeq, localCellIdx, multiMask>::type
+            ParticleAttributeList;
+
+        typedef typename MakeSeq<typename T_ParticleDescription::ValueTypeSeq, localCellIdx>::type
+            ParticleAttributeListBorder;
+
+        typedef typename ReplaceValueTypeSeq<T_ParticleDescription, ParticleAttributeList>::type
+            FrameDescriptionWithManagementAttributes;
+
+        /** double linked list pointer */
+        typedef typename MakeSeq<PreviousFramePtr<>, NextFramePtr<>>::type LinkedListPointer;
+
+        /* extent particle description with pointer to a frame*/
+        typedef typename ReplaceFrameExtensionSeq<FrameDescriptionWithManagementAttributes, LinkedListPointer>::type
+            FrameDescription;
+
+        /** frame definition
+         *
+         * a group of particles is stored as frame
+         */
+        typedef Frame<
+            OperatorCreatePairStaticArray<pmacc::math::CT::volume<SuperCellSize>::type::value>,
+            FrameDescription>
+            FrameType;
+
+        typedef typename ReplaceValueTypeSeq<T_ParticleDescription, ParticleAttributeListBorder>::type
+            FrameDescriptionBorder;
+
+        /** frame which is used to communicate particles to neighbors
+         *
+         * - each frame contains only one particle
+         * - local administration attributes of a particle are removed
+         */
+        typedef Frame<OperatorCreatePairStaticArray<1u>, FrameDescriptionBorder> FrameTypeBorder;
+
+        typedef SuperCell<FrameType> SuperCellType;
+
+        typedef T_DeviceHeap DeviceHeap;
+        /* Type of the particle box which particle buffer create */
+        typedef ParticlesBox<FrameType, typename DeviceHeap::AllocatorHandle, DIM> ParticlesBoxType;
+
+    private:
+        /* this enum is used only for internal calculations */
+        enum
+        {
+            SizeOfOneBorderElement = (sizeof(FrameTypeBorder) + sizeof(BorderFrameIndex))
+        };
 
-        return returnEvent;
-    }
+    public:
+        /**
+         * Constructor: allocates the exchange and supercell buffers and resets the supercells.
+         * Both exchange GridBuffers are created with `DataSpace<DIM1>(0)`; see addExchange().
+         *
+         * @param deviceHeap device heap memory allocator
+         * @param layout number of cells per dimension
+         * @param superCellSize size of one supercell; `layout / superCellSize` gives the supercell count
+         */
+        ParticlesBuffer(
+            const std::shared_ptr<DeviceHeap>& deviceHeap,
+            DataSpace<DIM> layout,
+            DataSpace<DIM> superCellSize)
+            : framesExchanges(nullptr) // initializers are listed in member declaration order
+            , superCellSize(superCellSize)
+            , gridSize(layout)
+            , m_deviceHeap(deviceHeap)
+        {
+            exchangeMemoryIndexer = new GridBuffer<BorderFrameIndex, DIM1>(DataSpace<DIM1>(0));
+            framesExchanges = new GridBuffer<FrameType, DIM1, FrameTypeBorder>(DataSpace<DIM1>(0));
 
-    EventTask asyncReceiveParticles(EventTask serialEvent, uint32_t ex)
-    {
+            DataSpace<DIM> superCellsCount = gridSize / superCellSize;
 
-        return framesExchanges->asyncReceive(serialEvent, ex) +
-            exchangeMemoryIndexer->asyncReceive(serialEvent, ex);
-    }
+            superCells = new GridBuffer<SuperCellType, DIM>(superCellsCount);
 
-    /**
-     * Returns number of supercells in each dimension.
-     *
-     * @return number of supercells
-     */
-    DataSpace<DIM> getSuperCellsCount()
-    {
+            reset();
+        }
 
-        PMACC_ASSERT(superCells != nullptr);
-        return superCells->getGridLayout().getDataSpace();
-    }
+        /**
+         * Destructor.
+         */
+        virtual ~ParticlesBuffer()
+        {
+            __delete(superCells);
+            __delete(framesExchanges);
+            __delete(exchangeMemoryIndexer);
+        }
+
+        /**
+         * Resets all internal buffers.
+         */
+        void reset()
+        {
+            superCells->getDeviceBuffer().setValue(SuperCellType());
+            superCells->getHostBuffer().setValue(SuperCellType());
+        }
+
+        /**
+         * Adds an exchange buffer to the frame buffer and a matching one to the index buffer.
+         * @param receive Mask describing receive directions
+         * @param usedMemory memory to be used for this exchange; divided by SizeOfOneBorderElement to size both buffers
+         * @param communicationTag tag for the frame exchange; the index exchange uses it with bit 15 set
+         */
+        void addExchange(Mask receive, size_t usedMemory, uint32_t communicationTag)
+        {
+            size_t numFrameTypeBorders = usedMemory / SizeOfOneBorderElement;
+
+            framesExchanges
+                ->addExchangeBuffer(receive, DataSpace<DIM1>(numFrameTypeBorders), communicationTag, true, false);
+
+            exchangeMemoryIndexer->addExchangeBuffer(
+                receive,
+                DataSpace<DIM1>(numFrameTypeBorders),
+                communicationTag | (1u << (20 - 5)), // sets bit 15; presumably keeps both tags distinct - confirm
+                true,
+                false);
+        }
+
+        /**
+         * Returns a ParticlesBox for device frame data.
+         *
+         * @return device frames ParticlesBox
+         */
+        ParticlesBoxType getDeviceParticleBox()
+        {
+            return ParticlesBoxType(superCells->getDeviceBuffer().getDataBox(), m_deviceHeap->getAllocatorHandle());
+        }
+
+        /**
+         * Returns a ParticlesBox for host frame data.
+         *
+         * @return host frames ParticlesBox
+         */
+        ParticlesBoxType getHostParticleBox(int64_t memoryOffset)
+        {
+            return ParticlesBoxType(
+                superCells->getHostBuffer().getDataBox(),
+                m_deviceHeap->getAllocatorHandle(),
+                memoryOffset);
+        }
+
+        /**
+         * Returns if the buffer has a send exchange in ex direction.
+         *
+         * @param ex direction to query
+         * @return true if buffer has send exchange for ex
+         */
+        bool hasSendExchange(uint32_t ex)
+        {
+            return framesExchanges->hasSendExchange(ex);
+        }
+
+        /**
+         * Returns if the buffer has a receive exchange in ex direction.
+         *
+         * @param ex direction to query
+         * @return true if buffer has receive exchange for ex
+         */
+        bool hasReceiveExchange(uint32_t ex)
+        {
+            return framesExchanges->hasReceiveExchange(ex);
+        }
 
-    /**
-     * Returns number of supercells in each dimension.
-     *
-     * @return number of supercells
-     */
-    GridLayout<DIM> getSuperCellsLayout()
-    {
+        StackExchangeBuffer<FrameTypeBorder, BorderFrameIndex, DIM - 1> getSendExchangeStack(uint32_t ex)
+        {
+            return StackExchangeBuffer<FrameTypeBorder, BorderFrameIndex, DIM - 1>(
+                framesExchanges->getSendExchange(ex),
+                exchangeMemoryIndexer->getSendExchange(ex));
+        }
 
-        PMACC_ASSERT(superCells != nullptr);
-        return superCells->getGridLayout();
-    }
+        StackExchangeBuffer<FrameTypeBorder, BorderFrameIndex, DIM - 1> getReceiveExchangeStack(uint32_t ex)
+        {
+            return StackExchangeBuffer<FrameTypeBorder, BorderFrameIndex, DIM - 1>(
+                framesExchanges->getReceiveExchange(ex),
+                exchangeMemoryIndexer->getReceiveExchange(ex));
+        }
+
+        /**
+         * Starts sync data from own device buffer to neighbor device buffer.
+         *
+         * GridBuffer
+         *
+         */
+        EventTask asyncCommunication(EventTask serialEvent)
+        {
+            return framesExchanges->asyncCommunication(serialEvent)
+                + exchangeMemoryIndexer->asyncCommunication(serialEvent);
+        }
 
-    /**
-     * Returns size of supercells in each dimension.
-     *
-     * @return size of supercells
-     */
-    DataSpace<DIM> getSuperCellSize()
-    {
+        EventTask asyncSendParticles(EventTask serialEvent, uint32_t ex)
+        {
+            /* store each gpu-free event separately to avoid race conditions */
+            EventTask framesExchangesGPUEvent;
+            EventTask exchangeMemoryIndexerGPUEvent;
+            EventTask returnEvent
+                = framesExchanges->asyncSend(serialEvent, ex) + exchangeMemoryIndexer->asyncSend(serialEvent, ex);
 
-        return superCellSize;
-    }
+            return returnEvent;
+        }
 
-    void deviceToHost()
-    {
-        superCells->deviceToHost();
-    }
+        EventTask asyncReceiveParticles(EventTask serialEvent, uint32_t ex)
+        {
+            return framesExchanges->asyncReceive(serialEvent, ex)
+                + exchangeMemoryIndexer->asyncReceive(serialEvent, ex);
+        }
+
+        /**
+         * Returns number of supercells in each dimension.
+         *
+         * @return number of supercells
+         */
+        DataSpace<DIM> getSuperCellsCount()
+        {
+            PMACC_ASSERT(superCells != nullptr);
+            return superCells->getGridLayout().getDataSpace();
+        }
+
+        /**
+         * Returns the grid layout of the supercell buffer.
+         *
+         * @return GridLayout reported by the supercell GridBuffer
+         */
+        GridLayout<DIM> getSuperCellsLayout()
+        {
+            PMACC_ASSERT(superCells != nullptr);
+            return superCells->getGridLayout();
+        }
+
+        /**
+         * Returns size of supercells in each dimension.
+         *
+         * @return size of supercells
+         */
+        DataSpace<DIM> getSuperCellSize()
+        {
+            return superCellSize;
+        }
 
+        void deviceToHost()
+        {
+            superCells->deviceToHost();
+        }
 
-private:
-    GridBuffer<BorderFrameIndex, DIM1> *exchangeMemoryIndexer;
 
-    GridBuffer<SuperCellType, DIM> *superCells;
-    /*GridBuffer for hold borderFrames, we need a own buffer to create first exchanges without core memory*/
-    GridBuffer< FrameType, DIM1, FrameTypeBorder> *framesExchanges;
+    private:
+        GridBuffer<BorderFrameIndex, DIM1>* exchangeMemoryIndexer;
 
-    DataSpace<DIM> superCellSize;
-    DataSpace<DIM> gridSize;
-    std::shared_ptr<DeviceHeap> m_deviceHeap;
+        GridBuffer<SuperCellType, DIM>* superCells;
+        /*GridBuffer for hold borderFrames, we need a own buffer to create first exchanges without core memory*/
+        GridBuffer<FrameType, DIM1, FrameTypeBorder>* framesExchanges;
 
-};
-}
+        DataSpace<DIM> superCellSize;
+        DataSpace<DIM> gridSize;
+        std::shared_ptr<DeviceHeap> m_deviceHeap;
+    };
+} // namespace pmacc
diff --git a/include/pmacc/particles/memory/buffers/StackExchangeBuffer.hpp b/include/pmacc/particles/memory/buffers/StackExchangeBuffer.hpp
index f1f2db8903..30e7639743 100644
--- a/include/pmacc/particles/memory/buffers/StackExchangeBuffer.hpp
+++ b/include/pmacc/particles/memory/buffers/StackExchangeBuffer.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Felix Schmitt, Rene Widera, Benjamin Worpitz
+/* Copyright 2013-2021 Felix Schmitt, Rene Widera, Benjamin Worpitz
  *
  * This file is part of PMacc.
  *
@@ -28,9 +28,6 @@
 
 namespace pmacc
 {
-
-
-
     /**
      * Can be used for creating several DataBox types from an Exchange.
      *
@@ -40,7 +37,6 @@ namespace pmacc
     class StackExchangeBuffer
     {
     public:
-
         /**
          * Create a stack from any ExchangeBuffer<FRAME,DIM>.
          *
@@ -48,10 +44,10 @@ namespace pmacc
          *
          * @param stack Exchange
          */
-        StackExchangeBuffer(Exchange<FRAME, DIM1> &stack, Exchange<FRAMEINDEX, DIM1> &stackIndexer) :
-        stack(stack), stackIndexer(stackIndexer)
+        StackExchangeBuffer(Exchange<FRAME, DIM1>& stack, Exchange<FRAMEINDEX, DIM1>& stackIndexer)
+            : stack(stack)
+            , stackIndexer(stackIndexer)
         {
-
         }
 
         /**
@@ -61,13 +57,13 @@ namespace pmacc
          */
         ExchangePushDataBox<vint_t, FRAME, DIM> getHostExchangePushDataBox()
         {
-            return ExchangePushDataBox<vint_t, FRAME, DIM > (
-                                                             stack.getHostBuffer().getBasePointer(),
-                                                             stack.getHostBuffer().getCurrentSizePointer(),
-                                                             stack.getHostBuffer().getDataSpace().productOfComponents(),
-                                                             PushDataBox<vint_t, FRAMEINDEX > (
-                                                                                               stackIndexer.getHostBuffer().getBasePointer(),
-                                                                                               stackIndexer.getHostBuffer().getCurrentSizePointer()));
+            return ExchangePushDataBox<vint_t, FRAME, DIM>(
+                stack.getHostBuffer().getBasePointer(),
+                stack.getHostBuffer().getCurrentSizePointer(),
+                stack.getHostBuffer().getDataSpace().productOfComponents(),
+                PushDataBox<vint_t, FRAMEINDEX>(
+                    stackIndexer.getHostBuffer().getBasePointer(),
+                    stackIndexer.getHostBuffer().getCurrentSizePointer()));
         }
 
         /**
@@ -77,10 +73,9 @@ namespace pmacc
          */
         ExchangePopDataBox<vint_t, FRAME, DIM> getHostExchangePopDataBox()
         {
-            return ExchangePopDataBox<vint_t, FRAME, DIM > (
-                                                            stack.getHostBuffer().getDataBox(),
-                                                            stackIndexer.getHostBuffer().getDataBox()
-                                                           );
+            return ExchangePopDataBox<vint_t, FRAME, DIM>(
+                stack.getHostBuffer().getDataBox(),
+                stackIndexer.getHostBuffer().getDataBox());
         }
 
         /**
@@ -92,13 +87,13 @@ namespace pmacc
         {
             PMACC_ASSERT(stack.getDeviceBuffer().hasCurrentSizeOnDevice() == true);
             PMACC_ASSERT(stackIndexer.getDeviceBuffer().hasCurrentSizeOnDevice() == true);
-            return ExchangePushDataBox<vint_t, FRAME, DIM > (
-                                                             stack.getDeviceBuffer().getBasePointer(),
-                                                             (vint_t*) stack.getDeviceBuffer().getCurrentSizeOnDevicePointer(),
-                                                             stack.getDeviceBuffer().getDataSpace().productOfComponents(),
-                                                             PushDataBox<vint_t, FRAMEINDEX > (
-                                                                                               stackIndexer.getDeviceBuffer().getBasePointer(),
-                                                                                               (vint_t*) stackIndexer.getDeviceBuffer().getCurrentSizeOnDevicePointer()));
+            return ExchangePushDataBox<vint_t, FRAME, DIM>(
+                stack.getDeviceBuffer().getBasePointer(),
+                (vint_t*) stack.getDeviceBuffer().getCurrentSizeOnDevicePointer(),
+                stack.getDeviceBuffer().getDataSpace().productOfComponents(),
+                PushDataBox<vint_t, FRAMEINDEX>(
+                    stackIndexer.getDeviceBuffer().getBasePointer(),
+                    (vint_t*) stackIndexer.getDeviceBuffer().getCurrentSizeOnDevicePointer()));
         }
 
         /**
@@ -108,20 +103,24 @@ namespace pmacc
          */
         ExchangePopDataBox<vint_t, FRAME, DIM> getDeviceExchangePopDataBox()
         {
-            return ExchangePopDataBox<vint_t, FRAME, DIM > (
-                                                            stack.getDeviceBuffer().getDataBox(),
-                                                            stackIndexer.getDeviceBuffer().getDataBox()
-                                                           );
+            return ExchangePopDataBox<vint_t, FRAME, DIM>(
+                stack.getDeviceBuffer().getDataBox(),
+                stackIndexer.getDeviceBuffer().getDataBox());
         }
 
         void setCurrentSize(const size_t size)
         {
             // do host and device setCurrentSize parallel
             EventTask split = __getTransactionEvent();
-            __startTransaction(split);
-            stackIndexer.getHostBuffer().setCurrentSize(size);
-            stack.getHostBuffer().setCurrentSize(size);
-            EventTask e1 = __endTransaction();
+            EventTask e1;
+
+            if(!Environment<>::get().isMpiDirectEnabled())
+            {
+                __startTransaction(split);
+                stackIndexer.getHostBuffer().setCurrentSize(size);
+                stack.getHostBuffer().setCurrentSize(size);
+                e1 = __endTransaction();
+            }
 
             __startTransaction(split);
             stackIndexer.getDeviceBuffer().setCurrentSize(size);
@@ -135,7 +134,13 @@ namespace pmacc
 
         size_t getHostCurrentSize()
         {
-            return stackIndexer.getHostBuffer().getCurrentSize();
+            size_t result = 0u;
+            if(Environment<>::get().isMpiDirectEnabled())
+                result = stackIndexer.getDeviceBuffer().getCurrentSize();
+            else
+                result = stackIndexer.getHostBuffer().getCurrentSize();
+
+            return result;
         }
 
         size_t getDeviceCurrentSize()
@@ -150,17 +155,25 @@ namespace pmacc
 
         size_t getHostParticlesCurrentSize()
         {
+            if(Environment<>::get().isMpiDirectEnabled())
+                return stack.getDeviceBuffer().getCurrentSize();
+
             return stack.getHostBuffer().getCurrentSize();
         }
 
         size_t getMaxParticlesCount()
         {
-            return stack.getHostBuffer().getDataSpace().productOfComponents();
+            size_t result = 0u;
+            if(Environment<>::get().isMpiDirectEnabled())
+                result = stack.getDeviceBuffer().getDataSpace().productOfComponents();
+            else
+                result = stack.getHostBuffer().getDataSpace().productOfComponents();
+
+            return result;
         }
 
     private:
-
-        Exchange<FRAME, DIM1> &getExchangeBuffer()
+        Exchange<FRAME, DIM1>& getExchangeBuffer()
         {
             return stack;
         }
@@ -168,4 +181,4 @@ namespace pmacc
         Exchange<FRAME, DIM1>& stack;
         Exchange<FRAMEINDEX, DIM1>& stackIndexer;
     };
-}
+} // namespace pmacc
diff --git a/include/pmacc/particles/memory/dataTypes/ExchangeMemoryIndex.hpp b/include/pmacc/particles/memory/dataTypes/ExchangeMemoryIndex.hpp
index 259c1c7800..bb64465e1b 100644
--- a/include/pmacc/particles/memory/dataTypes/ExchangeMemoryIndex.hpp
+++ b/include/pmacc/particles/memory/dataTypes/ExchangeMemoryIndex.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera, Benjamin Worpitz
+/* Copyright 2013-2021 Heiko Burau, Rene Widera, Benjamin Worpitz
  *
  * This file is part of PMacc.
  *
@@ -26,49 +26,47 @@
 
 namespace pmacc
 {
-
-template<class TYPE, unsigned DIM>
-class ExchangeMemoryIndex
-{
-public:
-
-    HDINLINE ExchangeMemoryIndex() : startIdx(0), count(0)
+    template<class TYPE, unsigned DIM>
+    class ExchangeMemoryIndex
     {
-    }
+    public:
+        HDINLINE ExchangeMemoryIndex() : startIdx(0), count(0)
+        {
+        }
 
-    HDINLINE void setStartIndex(TYPE startIdx)
-    {
-        this->startIdx = startIdx;
-    }
+        HDINLINE void setStartIndex(TYPE startIdx)
+        {
+            this->startIdx = startIdx;
+        }
 
-    HDINLINE void setCount(TYPE count)
-    {
-        this->count = count;
-    }
+        HDINLINE void setCount(TYPE count)
+        {
+            this->count = count;
+        }
 
-    HDINLINE void setSuperCell(DataSpace<DIM> superCell)
-    {
-        this->superCell = superCell;
-    }
+        HDINLINE void setSuperCell(DataSpace<DIM> superCell)
+        {
+            this->superCell = superCell;
+        }
 
-    HDINLINE TYPE getStartIndex()
-    {
-        return startIdx;
-    }
+        HDINLINE TYPE getStartIndex()
+        {
+            return startIdx;
+        }
 
-    HDINLINE TYPE getCount()
-    {
-        return count;
-    }
+        HDINLINE TYPE getCount()
+        {
+            return count;
+        }
 
-    HDINLINE DataSpace<DIM> getSuperCell()
-    {
-        return superCell;
-    }
-private:
+        HDINLINE DataSpace<DIM> getSuperCell()
+        {
+            return superCell;
+        }
 
-    PMACC_ALIGN(superCell, DataSpace<DIM>);
-    PMACC_ALIGN(startIdx, TYPE);
-    PMACC_ALIGN(count, TYPE);
-};
-}
+    private:
+        PMACC_ALIGN(superCell, DataSpace<DIM>);
+        PMACC_ALIGN(startIdx, TYPE);
+        PMACC_ALIGN(count, TYPE);
+    };
+} // namespace pmacc
diff --git a/include/pmacc/particles/memory/dataTypes/FramePointer.hpp b/include/pmacc/particles/memory/dataTypes/FramePointer.hpp
index 3c5a88ed19..dc9fc9cfeb 100644
--- a/include/pmacc/particles/memory/dataTypes/FramePointer.hpp
+++ b/include/pmacc/particles/memory/dataTypes/FramePointer.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2015-2020 Rene Widera
+/* Copyright 2015-2021 Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -26,68 +26,67 @@
 
 namespace pmacc
 {
-
-/** Wrapper for a raw pointer a PMacc frame
- *
- * @tparam T_Type type of the pointed object
- */
-template< typename T_Type >
-class FramePointer : public Pointer< T_Type >
-{
-private:
-    using Base = Pointer< T_Type >;
-public:
-    using type = typename Base::type;
-    using PtrType = typename Base::PtrType;
-
-    /** default constructor
+    /** Wrapper for a raw pointer to a PMacc frame
      *
-     * the default pointer points to invalid memory
+     * @tparam T_Type type of the pointed object
      */
-    HDINLINE FramePointer( ) : Base( )
+    template<typename T_Type>
+    class FramePointer : public Pointer<T_Type>
     {
-    }
+    private:
+        using Base = Pointer<T_Type>;
 
-    HDINLINE FramePointer( PtrType const ptrIn ) : Base( ptrIn )
-    {
-    }
+    public:
+        using type = typename Base::type;
+        using PtrType = typename Base::PtrType;
 
-    HDINLINE FramePointer( const Base& other ) : Base( other )
-    {
-    }
+        /** default constructor
+         *
+         * the default pointer points to invalid memory
+         */
+        HDINLINE FramePointer() : Base()
+        {
+        }
 
-    HDINLINE FramePointer( const FramePointer& other ) : Base( other )
-    {
-    }
+        HDINLINE FramePointer(PtrType const ptrIn) : Base(ptrIn)
+        {
+        }
 
-    HDINLINE FramePointer& operator=(const FramePointer& other)
-    {
-        Base::operator=(other);
-        return *this;
-    }
+        HDINLINE FramePointer(const Base& other) : Base(other)
+        {
+        }
 
-    /** access the Nth particle
-     *
-     * it is not checked whether `FramePointer` points to valid memory
-     *
-     * @param idx particle index in the frame
-     */
-    HDINLINE typename type::ParticleType operator[](const uint32_t idx)
-    {
-        return (*Base::ptr)[idx];
-    }
+        HDINLINE FramePointer(const FramePointer& other) : Base(other)
+        {
+        }
 
-    /** access the Nth particle
-     *
-     * it is not checked whether `FramePointer` points to valid memory
-     *
-     * @param idx particle index in the frame
-     */
-    HDINLINE const typename type::ParticleType operator[](const uint32_t idx) const
-    {
-        return (*Base::ptr)[idx];
-    }
+        HDINLINE FramePointer& operator=(const FramePointer& other)
+        {
+            Base::operator=(other);
+            return *this;
+        }
+
+        /** access the Nth particle
+         *
+         * it is not checked whether `FramePointer` points to valid memory
+         *
+         * @param idx particle index in the frame
+         */
+        HDINLINE typename type::ParticleType operator[](const uint32_t idx)
+        {
+            return (*Base::ptr)[idx];
+        }
 
-};
+        /** access the Nth particle
+         *
+         * it is not checked whether `FramePointer` points to valid memory
+         *
+         * @param idx particle index in the frame
+         */
+        HDINLINE const typename type::ParticleType operator[](const uint32_t idx) const
+        {
+            return (*Base::ptr)[idx];
+        }
+    };
 
-} //namespace pmacc
+} // namespace pmacc
diff --git a/include/pmacc/particles/memory/dataTypes/ListPointer.hpp b/include/pmacc/particles/memory/dataTypes/ListPointer.hpp
index b2d3d29325..d24172052f 100644
--- a/include/pmacc/particles/memory/dataTypes/ListPointer.hpp
+++ b/include/pmacc/particles/memory/dataTypes/ListPointer.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2015-2020 Rene Widera
+/* Copyright 2015-2021 Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -27,17 +27,16 @@
 
 namespace pmacc
 {
+    template<typename T_Type = bmpl::_1>
+    struct PreviousFramePtr
+    {
+        PMACC_ALIGN(previousFrame, Pointer<T_Type>);
+    };
 
-template<typename T_Type = bmpl::_1>
-struct PreviousFramePtr
-{
-    PMACC_ALIGN(previousFrame, Pointer<T_Type>);
-};
-
-template<typename T_Type = bmpl::_1>
-struct NextFramePtr
-{
-    PMACC_ALIGN(nextFrame, Pointer<T_Type>);
-};
+    template<typename T_Type = bmpl::_1>
+    struct NextFramePtr
+    {
+        PMACC_ALIGN(nextFrame, Pointer<T_Type>);
+    };
 
-} //namespace pmacc
+} // namespace pmacc
diff --git a/include/pmacc/particles/memory/dataTypes/Particle.hpp b/include/pmacc/particles/memory/dataTypes/Particle.hpp
index b19354cd28..26333a84a0 100644
--- a/include/pmacc/particles/memory/dataTypes/Particle.hpp
+++ b/include/pmacc/particles/memory/dataTypes/Particle.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Rene Widera
+/* Copyright 2013-2021 Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -49,295 +49,246 @@
 
 namespace pmacc
 {
+    namespace pmath = pmacc::math;
 
-namespace pmath = pmacc::math;
-
-/** A single particle of a @see Frame
- *
- * A instance of this Particle is a representation ("pointer") to the memory
- * where the frame is stored.
- *
- * @tparam T_FrameType type of the parent frame
- * @tparam T_ValueTypeSeq sequence with all attribute identifiers
- *                        (can be a subset of T_FrameType::ValueTypeSeq)
- */
-template<typename T_FrameType, typename T_ValueTypeSeq = typename T_FrameType::ValueTypeSeq>
-struct Particle : public InheritLinearly<typename T_FrameType::MethodsList>
-{
-    typedef T_FrameType FrameType;
-    typedef T_ValueTypeSeq ValueTypeSeq;
-    typedef typename FrameType::Name Name;
-    typedef typename FrameType::SuperCellSize SuperCellSize;
-    typedef Particle<FrameType, ValueTypeSeq> ThisType;
-    typedef typename FrameType::MethodsList MethodsList;
-
-    /** index of particle inside the Frame*/
-    PMACC_ALIGN(idx, uint32_t);
-
-    /** pointer to parent frame where this particle is from
+    /** A single particle of a @see Frame
      *
-     * ATTENTION: The pointer must be the last member to avoid local memory usage
-     *            https://github.com/ComputationalRadiationPhysics/picongpu/pull/762
-     */
-    PMACC_ALIGN(frame, FrameType*);
-
-    /** set particle handle to invalid
+     * An instance of this Particle is a representation ("pointer") to the memory
+     * where the frame is stored.
      *
-     * This method sets the particle handle to invalid. It is possible to test with
-     * the method isHandleValid if the particle is valid.
-     * If the particle is set to invalid it is not allowed to call any method other
-     * than isHandleValid or setHandleInvalid, but it does not mean the particle is
-     * deactivated outside of this instance.
+     * @tparam T_FrameType type of the parent frame
+     * @tparam T_ValueTypeSeq sequence with all attribute identifiers
+     *                        (can be a subset of T_FrameType::ValueTypeSeq)
      */
-    HDINLINE void setHandleInvalid()
+    template<typename T_FrameType, typename T_ValueTypeSeq = typename T_FrameType::ValueTypeSeq>
+    struct Particle : public InheritLinearly<typename T_FrameType::MethodsList>
     {
-        frame = nullptr;
-    }
-
-    /** check if particle handle is valid
-     *
-     * A valid particle handle means that the memory behind the handle can be used
-     * savely. A valid handle does not mean that the particle's multiMask is valid (>=1).
-     *
-     * @return true if the particle handle is valid, else false
-     */
-    HDINLINE bool isHandleValid() const
-    {
-        return frame != nullptr;
-    }
-
-    /** create particle
-     *
-     * @param frame reference to parent frame
-     * @param idx index of particle inside the frame
-     */
-    HDINLINE Particle(FrameType& frame, uint32_t idx) : frame(&frame), idx(idx)
-    {
-    }
-
-    template<typename T_OtherParticle >
-    HDINLINE Particle(const T_OtherParticle& other) : frame(other.frame), idx(other.idx)
-    {
-    }
-
-    /** access attribute with a identifier
-     *
-     * @param T_Key instance of identifier type
-     *              (can be an alias, value_identifier or any other class)
-     * @return result of operator[] of the Frame
-     */
-    template<typename T_Key >
-    HDINLINE
-    typename boost::result_of<
-    typename boost::remove_reference<
-    typename boost::result_of < FrameType(T_Key)>::type
-    >::type(uint32_t)
-    >::type
-    operator[](const T_Key key)
-    {
-        PMACC_CASSERT_MSG_TYPE(
-            key_not_available,
-            T_Key,
-            traits::HasIdentifier< Particle, T_Key >::type::value
-        );
-
-        return frame->getIdentifier(key)[idx];
-    }
-
-    /** const version of method operator(const T_Key) */
-    template<typename T_Key >
-    HDINLINE
-    typename boost::result_of<
-    typename boost::remove_reference<
-    typename boost::result_of <const FrameType(T_Key)>::type
-    >::type(uint32_t)
-    >::type
-    operator[](const T_Key key) const
-    {
-        PMACC_CASSERT_MSG_TYPE(
-            key_not_available,
-            T_Key,
-            traits::HasIdentifier< Particle, T_Key >::type::value
-        );
-
-        return frame->getIdentifier(key)[idx];
-    }
-
-    HDINLINE
-    ThisType& operator=(const ThisType& other) = default;
-
-private:
-    /* we disallow to assign this class*/
-    template<typename T_OtherParticle >
-    HDINLINE
-    ThisType& operator=(const T_OtherParticle& other);
-
-};
-
-namespace traits
-{
-
-template<
-    typename T_Key,
-    typename T_FrameType,
-    typename T_ValueTypeSeq
->
-struct HasIdentifier<
-    pmacc::Particle< T_FrameType, T_ValueTypeSeq >,
-    T_Key
->
-{
-private:
-    typedef pmacc::Particle<T_FrameType, T_ValueTypeSeq> ParticleType;
-    typedef typename ParticleType::ValueTypeSeq ValueTypeSeq;
-public:
-    /* If T_Key can not be found in the T_ValueTypeSeq of this Particle class,
-     * SolvedAliasName will be void_.
-     * Look-up is also valid if T_Key is an alias.
-     */
-    typedef typename GetKeyFromAlias<
-        ValueTypeSeq,
-        T_Key
-    >::type SolvedAliasName;
-
-    typedef bmpl::contains<ValueTypeSeq, SolvedAliasName> type;
-};
-
-template<
-    typename T_Key,
-    typename T_FrameType,
-    typename T_ValueTypeSeq
->
-struct HasFlag<
-    pmacc::Particle<T_FrameType, T_ValueTypeSeq>,
-    T_Key
->: public HasFlag<T_FrameType, T_Key>
-{};
-
-template<
-    typename T_Key,
-    typename T_FrameType,
-    typename T_ValueTypeSeq
->
-struct GetFlagType<
-    pmacc::Particle<T_FrameType, T_ValueTypeSeq>,
-    T_Key
->: public GetFlagType<T_FrameType, T_Key>
-{};
-
-} //namespace traits
-
-namespace particles
-{
-namespace operations
-{
-namespace detail
-{
-
-/** Assign common attributes of two particle species
- *
- * Assigns all attributes in ValueTypeSeq1 that also exist in T_ValueTypeSeq2
- * from T_FrameType1 to T_FrameType2.
- */
-template<
-typename T_FrameType1, typename T_ValueTypeSeq1,
-typename T_FrameType2, typename T_ValueTypeSeq2
->
-struct Assign
-<
-pmacc::Particle<T_FrameType1, T_ValueTypeSeq1>,
-pmacc::Particle<T_FrameType2, T_ValueTypeSeq2>
->
-{
-    typedef pmacc::Particle<T_FrameType1, T_ValueTypeSeq1> Dest;
-    typedef pmacc::Particle<T_FrameType2, T_ValueTypeSeq2> Src;
-
-    typedef typename Dest::ValueTypeSeq DestTypeSeq;
-    typedef typename Src::ValueTypeSeq SrcTypeSeq;
-
-    /* create attribute list with a subset of common attributes in two sequences
-     * bmpl::contains has lower complexity than traits::HasIdentifier
-     * and was used for this reason
-     */
-    typedef typename bmpl::copy_if<
-            DestTypeSeq,
-            bmpl::contains<SrcTypeSeq, bmpl::_1>,
-            bmpl::back_inserter< bmpl::vector0<> >
-            >::type CommonTypeSeq;
-
-    /* create sequences with disjunct attributes from `DestTypeSeq` */
-    typedef typename bmpl::copy_if<
-            DestTypeSeq,
-            bmpl::not_<bmpl::contains<SrcTypeSeq, bmpl::_1> >,
-            bmpl::back_inserter< bmpl::vector0<> >
-            >::type UniqueInDestTypeSeq;
-
-    /** Assign particle attributes
-     *
-     * The common subset of the attribute lists from both particles is
-     * used to set the attributes in dest with the corresponding ones from src.
-     * The remaining attributes that only exist in dest (UniqueInDestTypeSeq)
-     * are simply set to their default values.
-     *
-     * @param dest destination particle that shall be initialized/assigned with values from src
-     * @param src source particle were attributes are loaded from
-     */
-    HDINLINE
-    void operator()(Dest& dest, const Src& src)
-    {
-        using pmacc::meta::ForEach;
-        /* assign attributes from src to dest*/
-        ForEach<CommonTypeSeq,
-            CopyIdentifier<bmpl::_1> > copy;
-        copy(dest, src);
-
-        /* set all attributes which are not in src to their default value*/
-        ForEach<UniqueInDestTypeSeq,
-            SetAttributeToDefault<bmpl::_1> > setAttributeToDefault;
-        setAttributeToDefault(dest);
-
-    };
-};
-
-template<
-typename T_MPLSeqWithObjectsToRemove,
-typename T_FrameType, typename T_ValueTypeSeq
->
-struct Deselect
-<
-T_MPLSeqWithObjectsToRemove,
-pmacc::Particle<T_FrameType, T_ValueTypeSeq>
->
-{
-    typedef T_FrameType FrameType;
-    typedef T_ValueTypeSeq ValueTypeSeq;
-    typedef pmacc::Particle<FrameType, ValueTypeSeq> ParticleType;
-    typedef T_MPLSeqWithObjectsToRemove MPLSeqWithObjectsToRemove;
-
-    /* translate aliases to full specialized identifier*/
-    typedef typename ResolveAliases<MPLSeqWithObjectsToRemove, ValueTypeSeq, errorHandlerPolicies::ReturnValue>::type ResolvedSeqWithObjectsToRemove;
-    /* remove types from original particle attribute list*/
-    typedef typename RemoveFromSeq<ValueTypeSeq, ResolvedSeqWithObjectsToRemove>::type NewValueTypeSeq;
-    /* new particle type*/
-    typedef pmacc::Particle<FrameType, NewValueTypeSeq> ResultType;
-
-    template<class> struct result;
-
-    template<class F, class T_Obj>
-    struct result< F(T_Obj)>
-    {
-        typedef ResultType type;
+        typedef T_FrameType FrameType;
+        typedef T_ValueTypeSeq ValueTypeSeq;
+        typedef typename FrameType::Name Name;
+        typedef typename FrameType::SuperCellSize SuperCellSize;
+        typedef Particle<FrameType, ValueTypeSeq> ThisType;
+        typedef typename FrameType::MethodsList MethodsList;
+
+        /** index of particle inside the Frame*/
+        PMACC_ALIGN(idx, uint32_t);
+
+        /** pointer to parent frame where this particle is from
+         *
+         * ATTENTION: The pointer must be the last member to avoid local memory usage
+         *            https://github.com/ComputationalRadiationPhysics/picongpu/pull/762
+         */
+        PMACC_ALIGN(frame, FrameType*);
+
+        /** set particle handle to invalid
+         *
+         * This method sets the particle handle to invalid. It is possible to test with
+         * the method isHandleValid if the particle is valid.
+         * If the particle is set to invalid it is not allowed to call any method other
+         * than isHandleValid or setHandleInvalid, but it does not mean the particle is
+         * deactivated outside of this instance.
+         */
+        HDINLINE void setHandleInvalid()
+        {
+            frame = nullptr;
+        }
+
+        /** check if particle handle is valid
+         *
+         * A valid particle handle means that the memory behind the handle can be used
+         * safely. A valid handle does not mean that the particle's multiMask is valid (>=1).
+         *
+         * @return true if the particle handle is valid, else false
+         */
+        HDINLINE bool isHandleValid() const
+        {
+            return frame != nullptr;
+        }
+
+        /** create particle
+         *
+         * @param frame reference to parent frame
+         * @param idx index of particle inside the frame
+         */
+        HDINLINE Particle(FrameType& frame, uint32_t idx) : frame(&frame), idx(idx)
+        {
+        }
+
+        template<typename T_OtherParticle>
+        HDINLINE Particle(const T_OtherParticle& other) : frame(other.frame)
+                                                        , idx(other.idx)
+        {
+        }
+
+        /** access attribute with an identifier
+         *
+         * @param T_Key instance of identifier type
+         *              (can be an alias, value_identifier or any other class)
+         * @return result of operator[] of the Frame
+         */
+        template<typename T_Key>
+        HDINLINE typename boost::result_of<
+            typename boost::remove_reference<typename boost::result_of<FrameType(T_Key)>::type>::type(uint32_t)>::type
+        operator[](const T_Key key)
+        {
+            PMACC_CASSERT_MSG_TYPE(key_not_available, T_Key, traits::HasIdentifier<Particle, T_Key>::type::value);
+
+            return frame->getIdentifier(key)[idx];
+        }
+
+        /** const version of method operator[](const T_Key) */
+        template<typename T_Key>
+        HDINLINE typename boost::result_of<typename boost::remove_reference<
+            typename boost::result_of<const FrameType(T_Key)>::type>::type(uint32_t)>::type
+        operator[](const T_Key key) const
+        {
+            PMACC_CASSERT_MSG_TYPE(key_not_available, T_Key, traits::HasIdentifier<Particle, T_Key>::type::value);
+
+            return frame->getIdentifier(key)[idx];
+        }
+
+        HDINLINE
+        ThisType& operator=(const ThisType& other) = default;
+
+    private:
+        /* assignment from any other particle type is disallowed (private declaration) */
+        template<typename T_OtherParticle>
+        HDINLINE ThisType& operator=(const T_OtherParticle& other);
     };
 
-    HDINLINE
-    ResultType operator()(const ParticleType& particle)
+    namespace traits
     {
-        return ResultType(particle);
-    };
-};
-
-} //namespace detail
-} //namespace operations
-} //namespace particles
-
-} //namespace pmacc
+        template<typename T_Key, typename T_FrameType, typename T_ValueTypeSeq>
+        struct HasIdentifier<pmacc::Particle<T_FrameType, T_ValueTypeSeq>, T_Key>
+        {
+        private:
+            typedef pmacc::Particle<T_FrameType, T_ValueTypeSeq> ParticleType;
+            typedef typename ParticleType::ValueTypeSeq ValueTypeSeq;
+
+        public:
+            /* If T_Key can not be found in the T_ValueTypeSeq of this Particle class,
+             * SolvedAliasName will be void_.
+             * Look-up is also valid if T_Key is an alias.
+             */
+            typedef typename GetKeyFromAlias<ValueTypeSeq, T_Key>::type SolvedAliasName;
+
+            typedef bmpl::contains<ValueTypeSeq, SolvedAliasName> type;
+        };
+
+        template<typename T_Key, typename T_FrameType, typename T_ValueTypeSeq>
+        struct HasFlag<pmacc::Particle<T_FrameType, T_ValueTypeSeq>, T_Key> : public HasFlag<T_FrameType, T_Key>
+        {
+        };
+
+        template<typename T_Key, typename T_FrameType, typename T_ValueTypeSeq>
+        struct GetFlagType<pmacc::Particle<T_FrameType, T_ValueTypeSeq>, T_Key>
+            : public GetFlagType<T_FrameType, T_Key>
+        {
+        };
+
+    } // namespace traits
+
+    namespace particles
+    {
+        namespace operations
+        {
+            namespace detail
+            {
+                /** Assign common attributes of two particle species
+                 *
+                 * Assigns all attributes in T_ValueTypeSeq1 that also exist in T_ValueTypeSeq2
+                 * from the source particle (T_FrameType2) to the destination particle (T_FrameType1).
+                 */
+                template<
+                    typename T_FrameType1,
+                    typename T_ValueTypeSeq1,
+                    typename T_FrameType2,
+                    typename T_ValueTypeSeq2>
+                struct Assign<
+                    pmacc::Particle<T_FrameType1, T_ValueTypeSeq1>,
+                    pmacc::Particle<T_FrameType2, T_ValueTypeSeq2>>
+                {
+                    typedef pmacc::Particle<T_FrameType1, T_ValueTypeSeq1> Dest;
+                    typedef pmacc::Particle<T_FrameType2, T_ValueTypeSeq2> Src;
+
+                    typedef typename Dest::ValueTypeSeq DestTypeSeq;
+                    typedef typename Src::ValueTypeSeq SrcTypeSeq;
+
+                    /* create attribute list with a subset of common attributes in two sequences
+                     * bmpl::contains has lower complexity than traits::HasIdentifier
+                     * and was used for this reason
+                     */
+                    typedef typename bmpl::copy_if<
+                        DestTypeSeq,
+                        bmpl::contains<SrcTypeSeq, bmpl::_1>,
+                        bmpl::back_inserter<bmpl::vector0<>>>::type CommonTypeSeq;
+
+                    /* create sequences with disjunct attributes from `DestTypeSeq` */
+                    typedef typename bmpl::copy_if<
+                        DestTypeSeq,
+                        bmpl::not_<bmpl::contains<SrcTypeSeq, bmpl::_1>>,
+                        bmpl::back_inserter<bmpl::vector0<>>>::type UniqueInDestTypeSeq;
+
+                    /** Assign particle attributes
+                     *
+                     * The common subset of the attribute lists from both particles is
+                     * used to set the attributes in dest with the corresponding ones from src.
+                     * The remaining attributes that only exist in dest (UniqueInDestTypeSeq)
+                     * are simply set to their default values.
+                     *
+                     * @param dest destination particle that shall be initialized/assigned with values from src
+                     * @param src source particle where attributes are loaded from
+                     */
+                    HDINLINE
+                    void operator()(Dest& dest, const Src& src)
+                    {
+                        using pmacc::meta::ForEach;
+                        /* assign attributes from src to dest*/
+                        ForEach<CommonTypeSeq, CopyIdentifier<bmpl::_1>> copy;
+                        copy(dest, src);
+
+                        /* set all attributes which are not in src to their default value*/
+                        ForEach<UniqueInDestTypeSeq, SetAttributeToDefault<bmpl::_1>> setAttributeToDefault;
+                        setAttributeToDefault(dest);
+                    };
+                };
+
+                template<typename T_MPLSeqWithObjectsToRemove, typename T_FrameType, typename T_ValueTypeSeq>
+                struct Deselect<T_MPLSeqWithObjectsToRemove, pmacc::Particle<T_FrameType, T_ValueTypeSeq>>
+                {
+                    typedef T_FrameType FrameType;
+                    typedef T_ValueTypeSeq ValueTypeSeq;
+                    typedef pmacc::Particle<FrameType, ValueTypeSeq> ParticleType;
+                    typedef T_MPLSeqWithObjectsToRemove MPLSeqWithObjectsToRemove;
+
+                    /* translate aliases to full specialized identifier*/
+                    typedef typename ResolveAliases<
+                        MPLSeqWithObjectsToRemove,
+                        ValueTypeSeq,
+                        errorHandlerPolicies::ReturnValue>::type ResolvedSeqWithObjectsToRemove;
+                    /* remove types from original particle attribute list*/
+                    typedef typename RemoveFromSeq<ValueTypeSeq, ResolvedSeqWithObjectsToRemove>::type NewValueTypeSeq;
+                    /* new particle type*/
+                    typedef pmacc::Particle<FrameType, NewValueTypeSeq> ResultType;
+
+                    template<class>
+                    struct result;
+
+                    template<class F, class T_Obj>
+                    struct result<F(T_Obj)>
+                    {
+                        typedef ResultType type;
+                    };
+
+                    HDINLINE
+                    ResultType operator()(const ParticleType& particle)
+                    {
+                        return ResultType(particle);
+                    };
+                };
+
+            } // namespace detail
+        } // namespace operations
+    } // namespace particles
+
+} // namespace pmacc
diff --git a/include/pmacc/particles/memory/dataTypes/Pointer.hpp b/include/pmacc/particles/memory/dataTypes/Pointer.hpp
index 27cfbf2a98..c4eb169c6d 100644
--- a/include/pmacc/particles/memory/dataTypes/Pointer.hpp
+++ b/include/pmacc/particles/memory/dataTypes/Pointer.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2014-2020  Rene Widera
+/* Copyright 2014-2021  Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -25,84 +25,81 @@
 
 namespace pmacc
 {
-
-/** Wrapper for a raw pointer
- *
- * @tparam T_Type type of the pointed object
- */
-template< typename T_Type >
-class Pointer
-{
-public:
-
-    using type = T_Type;
-    using PtrType = type*;
-    using ConstPtrType = const type*;
-
-    HDINLINE Pointer( ):
-        ptr{ nullptr }
-    {
-    }
-
-    HDINLINE Pointer( PtrType const ptrIn ) : ptr( ptrIn )
-    {
-    }
-
-    HDINLINE Pointer( const Pointer& other ) : ptr( other.ptr )
-    {
-    }
-
-    HDINLINE Pointer& operator=(const Pointer& other)
-    {
-        ptr = other.ptr;
-        return *this;
-    }
-
-    /** dereference the pointer*/
-    HDINLINE type& operator*()
-    {
-        return *ptr;
-    }
-
-    /** dereference the pointer*/
-    HDINLINE const type& operator*() const
-    {
-        return *ptr;
-    }
-
-    /** access member*/
-    HDINLINE PtrType operator->()
-    {
-        return ptr;
-    }
-
-    /** access member*/
-    HDINLINE ConstPtrType operator->() const
-    {
-        return ptr;
-    }
-
-    /** compare if two pointers point to the same memory address*/
-    HDINLINE bool operator==(const Pointer<type>& other) const
-    {
-        return ptr == other.ptr;
-    }
-
-    /** check if the memory address of two pointers are different*/
-    HDINLINE bool operator!=(const Pointer<type>& other) const
-    {
-        return ptr != other.ptr;
-    }
-
-    /** check if the memory pointed to has a valid address
-     * @return false if memory adress is nullptr else true
+    /** Wrapper for a raw pointer
+     *
+     * @tparam T_Type type of the pointed object
      */
-    HDINLINE bool isValid( ) const
+    template<typename T_Type>
+    class Pointer
     {
-        return ptr != nullptr;
-    }
-
-    PMACC_ALIGN( ptr, PtrType );
-};
-
-} //namespace pmacc
+    public:
+        using type = T_Type;
+        using PtrType = type*;
+        using ConstPtrType = const type*;
+
+        HDINLINE Pointer() : ptr{nullptr} // default state: null pointer, isValid() returns false
+        {
+        }
+
+        HDINLINE Pointer(PtrType const ptrIn) : ptr(ptrIn) // wrap a raw pointer, only the address is stored
+        {
+        }
+
+        HDINLINE Pointer(const Pointer& other) : ptr(other.ptr) // copies the address, not the pointed object
+        {
+        }
+
+        HDINLINE Pointer& operator=(const Pointer& other)
+        {
+            ptr = other.ptr; // take over the address, the pointed object is not copied
+            return *this;
+        }
+
+        /** dereference the pointer (mutable access), the address is not checked for nullptr */
+        HDINLINE type& operator*()
+        {
+            return *ptr;
+        }
+
+        /** dereference the pointer (read-only access), the address is not checked for nullptr */
+        HDINLINE const type& operator*() const
+        {
+            return *ptr;
+        }
+
+        /** access a member of the pointed object (mutable access), returns the stored raw pointer */
+        HDINLINE PtrType operator->()
+        {
+            return ptr;
+        }
+
+        /** access a member of the pointed object (read-only access), returns the stored raw pointer */
+        HDINLINE ConstPtrType operator->() const
+        {
+            return ptr;
+        }
+
+        /** check if two pointers point to the same memory address (the pointed objects are not compared) */
+        HDINLINE bool operator==(const Pointer<type>& other) const
+        {
+            return ptr == other.ptr;
+        }
+
+        /** check if two pointers point to different memory addresses (the pointed objects are not compared) */
+        HDINLINE bool operator!=(const Pointer<type>& other) const
+        {
+            return ptr != other.ptr;
+        }
+
+        /** check if the pointer holds a non-null address (no further validation of the memory is done)
+         * @return false if the stored address is nullptr, else true
+         */
+        HDINLINE bool isValid() const
+        {
+            return ptr != nullptr;
+        }
+
+        PMACC_ALIGN(ptr, PtrType);
+    };
+
+} // namespace pmacc
diff --git a/include/pmacc/particles/memory/dataTypes/StaticArray.hpp b/include/pmacc/particles/memory/dataTypes/StaticArray.hpp
index 27e46794d0..9b68e7c6f2 100644
--- a/include/pmacc/particles/memory/dataTypes/StaticArray.hpp
+++ b/include/pmacc/particles/memory/dataTypes/StaticArray.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Rene Widera
+/* Copyright 2013-2021 Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -26,44 +26,45 @@
 
 namespace pmacc
 {
+    namespace pmath = pmacc::math;
 
-namespace pmath = pmacc::math;
+    template<typename T_Type, typename T_size>
+    class StaticArray
+    {
+    public:
+        static constexpr uint32_t size = T_size::value;
+        typedef T_Type Type;
 
-template<typename T_Type, typename T_size>
-class StaticArray
-{
-public:
-    static constexpr uint32_t size = T_size::value;
-    typedef T_Type Type;
-private:
-    Type data[size];
-public:
+    private:
+        Type data[size];
 
-    template<class> struct result;
+    public:
+        template<class>
+        struct result;
 
-    template<class F, typename TKey>
-    struct result<F(TKey)>
-    {
-        typedef Type& type;
-    };
+        template<class F, typename TKey>
+        struct result<F(TKey)>
+        {
+            typedef Type& type; // same type as returned by the non-const operator[]
+        };
 
-    template<class F, typename TKey>
-    struct result<const F(TKey)>
-    {
-        typedef const Type& type;
-    };
+        template<class F, typename TKey>
+        struct result<const F(TKey)>
+        {
+            typedef const Type& type; // same type as returned by the const operator[]
+        };
 
-    HDINLINE
-    Type& operator[](const int idx)
-    {
-        return data[idx];
-    }
+        HDINLINE
+        Type& operator[](const int idx)
+        {
+            return data[idx]; // mutable element access, idx is not checked against size
+        }
 
-    HDINLINE
-    const Type& operator[](const int idx) const
-    {
-        return data[idx];
-    }
-};
+        HDINLINE
+        const Type& operator[](const int idx) const
+        {
+            return data[idx]; // read-only element access, idx is not checked against size
+        }
+    };
 
-} //namespace pmacc
+} // namespace pmacc
diff --git a/include/pmacc/particles/memory/dataTypes/SuperCell.hpp b/include/pmacc/particles/memory/dataTypes/SuperCell.hpp
index cf0a9aabe4..ac79628368 100644
--- a/include/pmacc/particles/memory/dataTypes/SuperCell.hpp
+++ b/include/pmacc/particles/memory/dataTypes/SuperCell.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera
+/* Copyright 2013-2021 Heiko Burau, Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -27,36 +27,30 @@
 
 namespace pmacc
 {
-
-    template< class T_FrameType >
+    template<class T_FrameType>
     class SuperCell
     {
     public:
-
-        HDINLINE SuperCell() :
-            firstFramePtr( nullptr ),
-            lastFramePtr( nullptr ),
-            numParticles( 0 ),
-            mustShiftVal( false )
+        HDINLINE SuperCell() : firstFramePtr(nullptr), lastFramePtr(nullptr), numParticles(0), mustShiftVal(false)
         {
         }
 
-        HDINLINE T_FrameType * FirstFramePtr()
+        HDINLINE T_FrameType* FirstFramePtr()
         {
             return firstFramePtr;
         }
 
-        HDINLINE T_FrameType * LastFramePtr()
+        HDINLINE T_FrameType* LastFramePtr()
         {
             return lastFramePtr;
         }
 
-        HDINLINE T_FrameType const * FirstFramePtr() const
+        HDINLINE T_FrameType const* FirstFramePtr() const
         {
             return firstFramePtr;
         }
 
-        HDINLINE T_FrameType const *  LastFramePtr() const
+        HDINLINE T_FrameType const* LastFramePtr() const
         {
             return lastFramePtr;
         }
@@ -66,17 +60,15 @@ namespace pmacc
             return mustShiftVal;
         }
 
-        HDINLINE void setMustShift( bool const value )
+        HDINLINE void setMustShift(bool const value)
         {
             mustShiftVal = value;
         }
 
         HDINLINE uint32_t getSizeLastFrame() const
         {
-            constexpr uint32_t frameSize = math::CT::volume<
-                typename T_FrameType::SuperCellSize
-            >::type::value;
-            return numParticles ? ( ( numParticles - 1u ) % frameSize + 1u ) : 0u;
+            constexpr uint32_t frameSize = math::CT::volume<typename T_FrameType::SuperCellSize>::type::value;
+            return numParticles ? ((numParticles - 1u) % frameSize + 1u) : 0u;
         }
 
         HDINLINE uint32_t getNumParticles() const
@@ -84,29 +76,18 @@ namespace pmacc
             return numParticles;
         }
 
-        HDINLINE void setNumParticles( uint32_t const size )
+        HDINLINE void setNumParticles(uint32_t const size)
         {
             numParticles = size;
         }
 
     public:
-        PMACC_ALIGN(
-            firstFramePtr,
-            T_FrameType*
-        );
-        PMACC_ALIGN(
-            lastFramePtr,
-            T_FrameType*
-        );
+        PMACC_ALIGN(firstFramePtr, T_FrameType*);
+        PMACC_ALIGN(lastFramePtr, T_FrameType*);
+
     private:
-        PMACC_ALIGN(
-            numParticles,
-            uint32_t
-        );
-        PMACC_ALIGN(
-            mustShiftVal,
-            bool
-        );
+        PMACC_ALIGN(numParticles, uint32_t);
+        PMACC_ALIGN(mustShiftVal, bool);
     };
 
-} //namespace pmacc
+} // namespace pmacc
diff --git a/include/pmacc/particles/memory/frames/Frame.hpp b/include/pmacc/particles/memory/frames/Frame.hpp
index e45959a475..ca59b7b43c 100644
--- a/include/pmacc/particles/memory/frames/Frame.hpp
+++ b/include/pmacc/particles/memory/frames/Frame.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Rene Widera, Alexander Grund
+/* Copyright 2013-2021 Rene Widera, Alexander Grund
  *
  * This file is part of PMacc.
  *
@@ -50,171 +50,150 @@
 
 namespace pmacc
 {
+    namespace pmath = pmacc::math;
 
-namespace pmath = pmacc::math;
-
-/** Frame is a storage for arbitrary number >0 of Particles with attributes
- *
- * @tparam T_CreatePairOperator unary template operator to create a boost pair
- *                              from single type ( pair<name,dataType> )
- *                              @see MapTupel
- * @tparam T_ValueTypeSeq sequence with value_identifier
- * @tparam T_MethodsList sequence of classes with particle methods
- *                       (e.g. calculate mass, gamma, ...)
- * @tparam T_Flags sequence with identifiers to add flags on a frame
- *                 (e.g. useSolverXY, calcRadiation, ...)
- */
-template<typename T_CreatePairOperator,
-typename T_ParticleDescription >
-struct Frame;
-
-template<typename T_CreatePairOperator,
-typename T_ParticleDescription >
-struct Frame :
-public InheritLinearly<typename T_ParticleDescription::MethodsList>,
-protected pmath::MapTuple<typename SeqToMap<typename T_ParticleDescription::ValueTypeSeq, T_CreatePairOperator>::type, pmath::AlignedData>,
-public InheritLinearly<
-    typename OperateOnSeq<
-        typename T_ParticleDescription::FrameExtensionList,
-        bmpl::apply1<bmpl::_1, Frame<T_CreatePairOperator,T_ParticleDescription> >
-    >::type
->
-{
-    typedef T_ParticleDescription ParticleDescription;
-    typedef typename ParticleDescription::Name Name;
-    typedef typename ParticleDescription::SuperCellSize SuperCellSize;
-    typedef typename ParticleDescription::ValueTypeSeq ValueTypeSeq;
-    typedef typename ParticleDescription::MethodsList MethodsList;
-    typedef typename ParticleDescription::FlagsList FlagList;
-    typedef typename ParticleDescription::FrameExtensionList FrameExtensionList;
-    typedef Frame<T_CreatePairOperator, ParticleDescription> ThisType;
-    /* definition of the MapTupel where we inherit from*/
-    typedef pmath::MapTuple<typename SeqToMap<ValueTypeSeq, T_CreatePairOperator>::type, pmath::AlignedData> BaseType;
-
-    /* type of a single particle*/
-    typedef pmacc::Particle<ThisType> ParticleType;
-
-    /* define boost result_of results
-     * normaly result_of defines operator() result, in this case we define the result for
-     * operator[]
-     */
-    template<class> struct result;
-
-    /* const operator[]*/
-    template<class F, class TKey>
-    struct result<const F(TKey)>
-    {
-        typedef typename GetKeyFromAlias<ValueTypeSeq, TKey, errorHandlerPolicies::ThrowValueNotFound>::type Key;
-        typedef typename boost::result_of<const BaseType(Key)>::type type;
-    };
-
-    /* non const operator[]*/
-    template<class F, class TKey>
-    struct result< F(TKey)>
-    {
-        typedef typename GetKeyFromAlias<ValueTypeSeq, TKey, errorHandlerPolicies::ThrowValueNotFound>::type Key;
-        typedef typename boost::result_of< BaseType(Key)>::type type;
-    };
-
-    /** access the Nth particle*/
-    HDINLINE ParticleType operator[](const uint32_t idx)
-    {
-        return ParticleType(*this, idx);
-    }
-
-    /** access the Nth particle*/
-    HDINLINE const ParticleType operator[](const uint32_t idx) const
-    {
-        return ParticleType(*this, idx);
-    }
-
-    /** access attribute with a identifier
+    /** Frame is a storage for arbitrary number >0 of Particles with attributes
      *
-     * @param T_Key instance of identifier type
-     *              (can be an alias, value_identifier or any other class)
-     * @return result of operator[] of MapTupel
+     * @tparam T_CreatePairOperator unary template operator to create a boost pair
+     *                              from a single type ( pair<name,dataType> )
+     *                              @see MapTuple
+     * @tparam T_ParticleDescription description of the particles stored in the frame, must provide
+     *                               (among others) the member types
+     *                               - ValueTypeSeq: sequence with value_identifier
+     *                               - MethodsList: classes with particle methods (e.g. calculate mass, gamma, ...)
+     *                               - FlagsList: flags of a frame (e.g. useSolverXY, calcRadiation, ...)
      */
-    template<typename T_Key >
-    HDINLINE
-    typename boost::result_of < ThisType(T_Key)>::type
-    getIdentifier(const T_Key)
+    template<typename T_CreatePairOperator, typename T_ParticleDescription>
+    struct Frame;
+
+    template<typename T_CreatePairOperator, typename T_ParticleDescription>
+    struct Frame
+        : public InheritLinearly<typename T_ParticleDescription::MethodsList>
+        , protected pmath::MapTuple<
+              typename SeqToMap<typename T_ParticleDescription::ValueTypeSeq, T_CreatePairOperator>::type,
+              pmath::AlignedData>
+        , public InheritLinearly<typename OperateOnSeq<
+              typename T_ParticleDescription::FrameExtensionList,
+              bmpl::apply1<bmpl::_1, Frame<T_CreatePairOperator, T_ParticleDescription>>>::type>
     {
-        typedef typename GetKeyFromAlias<ValueTypeSeq, T_Key>::type Key;
-        return BaseType::operator[](Key());
-    }
-
-    /** const version of method getIdentifier(const T_Key) */
-    template<typename T_Key >
-    HDINLINE
-    typename boost::result_of < const ThisType(T_Key)>::type
-    getIdentifier(const T_Key) const
-    {
-        typedef typename GetKeyFromAlias<ValueTypeSeq, T_Key>::type Key;
-        return BaseType::operator[](Key());
-    }
+        typedef T_ParticleDescription ParticleDescription;
+        typedef typename ParticleDescription::Name Name;
+        typedef typename ParticleDescription::SuperCellSize SuperCellSize;
+        typedef typename ParticleDescription::ValueTypeSeq ValueTypeSeq;
+        typedef typename ParticleDescription::MethodsList MethodsList;
+        typedef typename ParticleDescription::FlagsList FlagList;
+        typedef typename ParticleDescription::FrameExtensionList FrameExtensionList;
+        typedef Frame<T_CreatePairOperator, ParticleDescription> ThisType;
+        /* definition of the MapTuple we inherit from */
+        typedef pmath::MapTuple<typename SeqToMap<ValueTypeSeq, T_CreatePairOperator>::type, pmath::AlignedData>
+            BaseType;
+
+        /* type of a single particle*/
+        typedef pmacc::Particle<ThisType> ParticleType;
+
+        /* define boost result_of results
+         * normally result_of describes the result of operator(); in this case it describes the result of
+         * the attribute access BaseType::operator[] that getIdentifier() forwards to
+         */
+        template<class>
+        struct result;
+
+        /* result type for a const frame: TKey is resolved to its key first, then the const BaseType is asked */
+        template<class F, class TKey>
+        struct result<const F(TKey)>
+        {
+            typedef typename GetKeyFromAlias<ValueTypeSeq, TKey, errorHandlerPolicies::ThrowValueNotFound>::type Key;
+            typedef typename boost::result_of<const BaseType(Key)>::type type;
+        };
+
+        /* result type for a non-const frame: TKey is resolved to its key first, then BaseType is asked */
+        template<class F, class TKey>
+        struct result<F(TKey)>
+        {
+            typedef typename GetKeyFromAlias<ValueTypeSeq, TKey, errorHandlerPolicies::ThrowValueNotFound>::type Key;
+            typedef typename boost::result_of<BaseType(Key)>::type type;
+        };
+
+        /** access the particle with index idx, returns a ParticleType built from this frame and idx */
+        HDINLINE ParticleType operator[](const uint32_t idx)
+        {
+            return ParticleType(*this, idx);
+        }
+
+        /** access the particle with index idx of a const frame, returns a const ParticleType by value */
+        HDINLINE const ParticleType operator[](const uint32_t idx) const
+        {
+            return ParticleType(*this, idx);
+        }
+
+        /** access an attribute via an identifier
+         *
+         * @tparam T_Key identifier type, only the type of the unnamed argument is used
+         *               (can be an alias, value_identifier or any other class)
+         * @return result of operator[] of the MapTuple base for the key resolved from T_Key
+         */
+        template<typename T_Key>
+        HDINLINE typename boost::result_of<ThisType(T_Key)>::type getIdentifier(const T_Key)
+        {
+            typedef typename GetKeyFromAlias<ValueTypeSeq, T_Key>::type Key;
+            return BaseType::operator[](Key());
+        }
+
+        /** const version of getIdentifier(const T_Key): same key lookup, result type taken for a const frame */
+        template<typename T_Key>
+        HDINLINE typename boost::result_of<const ThisType(T_Key)>::type getIdentifier(const T_Key) const
+        {
+            typedef typename GetKeyFromAlias<ValueTypeSeq, T_Key>::type Key;
+            return BaseType::operator[](Key());
+        }
+
+        HINLINE static std::string getName()
+        {
+            return Name::str(); // Name is the member type ParticleDescription::Name
+        }
+    };
 
-    HINLINE static std::string getName()
+    namespace traits
     {
-        return Name::str();
-    }
-
-};
-
-namespace traits
-{
-
-template<typename T_IdentifierName,
-typename T_CreatePairOperator,
-typename T_ParticleDescription
->
-struct HasIdentifier<
-pmacc::Frame<T_CreatePairOperator, T_ParticleDescription>,
-T_IdentifierName
->
-{
-private:
-    typedef pmacc::Frame<T_CreatePairOperator, T_ParticleDescription> FrameType;
-public:
-    typedef typename FrameType::ValueTypeSeq ValueTypeSeq;
-    /* if T_IdentifierName is void_ than we have no T_IdentifierName in our Sequence.
-     * check is also valid if T_Key is a alias
-     */
-    typedef typename GetKeyFromAlias<ValueTypeSeq, T_IdentifierName>::type SolvedAliasName;
-
-    typedef bmpl::contains<ValueTypeSeq, SolvedAliasName> type;
-};
-
-template<typename T_IdentifierName,
-typename T_CreatePairOperator,
-typename T_ParticleDescription
->
-struct HasFlag<
-pmacc::Frame<T_CreatePairOperator, T_ParticleDescription>, T_IdentifierName>
-{
-private:
-    typedef pmacc::Frame<T_CreatePairOperator, T_ParticleDescription> FrameType;
-    typedef typename GetFlagType<FrameType, T_IdentifierName>::type SolvedAliasName;
-    typedef typename FrameType::FlagList FlagList;
-public:
-
-    typedef bmpl::contains<FlagList, SolvedAliasName> type;
-};
-
-template<typename T_IdentifierName,
-typename T_CreatePairOperator,
-typename T_ParticleDescription
->
-struct GetFlagType<
-pmacc::Frame<T_CreatePairOperator, T_ParticleDescription>, T_IdentifierName>
-{
-private:
-    typedef pmacc::Frame<T_CreatePairOperator, T_ParticleDescription> FrameType;
-    typedef typename FrameType::FlagList FlagList;
-public:
-
-    typedef typename GetKeyFromAlias<FlagList, T_IdentifierName>::type type;
-};
-
-} //namespace traits
-
-}//namespace pmacc
+        template<typename T_IdentifierName, typename T_CreatePairOperator, typename T_ParticleDescription>
+        struct HasIdentifier<pmacc::Frame<T_CreatePairOperator, T_ParticleDescription>, T_IdentifierName>
+        {
+        private:
+            typedef pmacc::Frame<T_CreatePairOperator, T_ParticleDescription> FrameType;
+
+        public:
+            typedef typename FrameType::ValueTypeSeq ValueTypeSeq;
+            /* resolve T_IdentifierName to its key, the check is also valid if T_IdentifierName is an alias.
+             * NOTE(review): presumably the result is void_ if the name is not in the sequence - confirm
+             */
+            typedef typename GetKeyFromAlias<ValueTypeSeq, T_IdentifierName>::type SolvedAliasName;
+
+            typedef bmpl::contains<ValueTypeSeq, SolvedAliasName> type;
+        };
+
+        template<typename T_IdentifierName, typename T_CreatePairOperator, typename T_ParticleDescription>
+        struct HasFlag<pmacc::Frame<T_CreatePairOperator, T_ParticleDescription>, T_IdentifierName>
+        {
+        private:
+            typedef pmacc::Frame<T_CreatePairOperator, T_ParticleDescription> FrameType;
+            typedef typename GetFlagType<FrameType, T_IdentifierName>::type SolvedAliasName;
+            typedef typename FrameType::FlagList FlagList;
+
+        public:
+            typedef bmpl::contains<FlagList, SolvedAliasName> type; // is the resolved flag part of FlagList?
+        };
+
+        template<typename T_IdentifierName, typename T_CreatePairOperator, typename T_ParticleDescription>
+        struct GetFlagType<pmacc::Frame<T_CreatePairOperator, T_ParticleDescription>, T_IdentifierName>
+        {
+        private:
+            typedef pmacc::Frame<T_CreatePairOperator, T_ParticleDescription> FrameType;
+            typedef typename FrameType::FlagList FlagList;
+
+        public:
+            typedef typename GetKeyFromAlias<FlagList, T_IdentifierName>::type type; // lookup within FlagList
+        };
+
+    } // namespace traits
+
+} // namespace pmacc
diff --git a/include/pmacc/particles/memory/frames/NullFrame.hpp b/include/pmacc/particles/memory/frames/NullFrame.hpp
index 30cd3070b7..508ec3a46d 100644
--- a/include/pmacc/particles/memory/frames/NullFrame.hpp
+++ b/include/pmacc/particles/memory/frames/NullFrame.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Rene Widera, Benjamin Worpitz
+/* Copyright 2013-2021 Rene Widera, Benjamin Worpitz
  *
  * This file is part of PMacc.
  *
@@ -26,16 +26,14 @@
 
 namespace pmacc
 {
-
-
     class NullFrame
     {
     public:
-
         enum
         {
-            tileSize = 0, dim = DIM3
+            tileSize = 0,
+            dim = DIM3
         };
     };
 
-}//namespace
+} // namespace pmacc
diff --git a/include/pmacc/particles/meta/FindByNameOrType.hpp b/include/pmacc/particles/meta/FindByNameOrType.hpp
index 628bde237b..5bb51d55c5 100644
--- a/include/pmacc/particles/meta/FindByNameOrType.hpp
+++ b/include/pmacc/particles/meta/FindByNameOrType.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2018-2020 Rene Widera
+/* Copyright 2018-2021 Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -35,70 +35,48 @@
 
 namespace pmacc
 {
-namespace particles
-{
-namespace meta
-{
-
-    /* find a type within a sequence by name or the type itself
-     *
-     * pmacc::traits::GetCTName is used to translate each element of
-     * T_MPLSeq into a name.
-     *
-     * @tparam T_MPLSeq source sequence where we search T_Identifier
-     * @tparam T_Identifier name or type to search
-     */
-    template<
-        typename T_MPLSeq,
-        typename T_Identifier,
-        typename T_KeyNotFoundPolicy = pmacc::errorHandlerPolicies::ThrowValueNotFound
-    >
-    struct FindByNameOrType
+    namespace particles
     {
-        using KeyNotFoundPolicy = T_KeyNotFoundPolicy;
-
-        template< typename T_Value >
-        struct HasTypeOrName
+        namespace meta
         {
-            using type = bmpl::or_<
-                boost::is_same<
-                    T_Identifier,
-                    T_Value
-                >,
-                boost::is_same<
-                    pmacc::traits::GetCTName_t< T_Value >,
-                    T_Identifier
-                >
-            >;
-        };
+            /* find a type within a sequence by name or by the type itself
+             *
+             * pmacc::traits::GetCTName_t translates each element of T_MPLSeq into a name; the first match is used.
+             *
+             * @tparam T_MPLSeq source sequence where we search T_Identifier
+             * @tparam T_Identifier name or type to search
+             * @tparam T_KeyNotFoundPolicy policy applied to T_MPLSeq and T_Identifier if no element matches
+             */
+            template<
+                typename T_MPLSeq,
+                typename T_Identifier,
+                typename T_KeyNotFoundPolicy = pmacc::errorHandlerPolicies::ThrowValueNotFound>
+            struct FindByNameOrType
+            {
+                using KeyNotFoundPolicy = T_KeyNotFoundPolicy;
+
+                template<typename T_Value>
+                struct HasTypeOrName
+                {
+                    using type = bmpl::or_<
+                        boost::is_same<T_Identifier, T_Value>,
+                        boost::is_same<pmacc::traits::GetCTName_t<T_Value>, T_Identifier>>;
+                };
 
-        using FilteredSeq = typename bmpl::copy_if<
-            T_MPLSeq,
-            HasTypeOrName< bmpl::_1 >
-        >::type;
+                using FilteredSeq = typename bmpl::copy_if<T_MPLSeq, HasTypeOrName<bmpl::_1>>::type;
 
-        using type = typename bmpl::if_<
-            bmpl::empty< FilteredSeq >,
-            bmpl::apply<
-                KeyNotFoundPolicy,
-                T_MPLSeq,
-                T_Identifier
-            >,
-            bmpl::front< FilteredSeq >
-        >::type::type;
-    };
+                using type = typename bmpl::if_<
+                    bmpl::empty<FilteredSeq>,
+                    bmpl::apply<KeyNotFoundPolicy, T_MPLSeq, T_Identifier>,
+                    bmpl::front<FilteredSeq>>::type::type;
+            };
 
-    template<
-        typename T_MPLSeq,
-        typename T_Identifier,
-        typename T_KeyNotFoundPolicy = pmacc::errorHandlerPolicies::ThrowValueNotFound
-    >
-    using FindByNameOrType_t = typename FindByNameOrType<
-        T_MPLSeq,
-        T_Identifier,
-        T_KeyNotFoundPolicy
-    >::type;
+            template<
+                typename T_MPLSeq,
+                typename T_Identifier,
+                typename T_KeyNotFoundPolicy = pmacc::errorHandlerPolicies::ThrowValueNotFound>
+            using FindByNameOrType_t = typename FindByNameOrType<T_MPLSeq, T_Identifier, T_KeyNotFoundPolicy>::type;
 
-} // namespace meta
-} // namespace particles
+        } // namespace meta
+    } // namespace particles
 } // namespace pmacc
diff --git a/include/pmacc/particles/operations/Assign.hpp b/include/pmacc/particles/operations/Assign.hpp
index 1da74ced9f..200260f872 100644
--- a/include/pmacc/particles/operations/Assign.hpp
+++ b/include/pmacc/particles/operations/Assign.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Rene Widera
+/* Copyright 2013-2021 Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -26,24 +26,23 @@
 
 namespace pmacc
 {
-namespace particles
-{
-namespace operations
-{
-
-namespace detail
-{
-template<typename T_Dest,typename T_Src>
-struct Assign;
-
-}//namespace detail
-
-template<typename T_Dest,typename T_Src>
-HDINLINE void assign(T_Dest& dest,const T_Src& src)
-{
-    detail::Assign<T_Dest,T_Src>()(dest,src);
-}
-
-}//operators
-}//namespace particles
-} //namespace pmacc
+    namespace particles
+    {
+        namespace operations
+        {
+            namespace detail
+            {
+                template<typename T_Dest, typename T_Src>
+                struct Assign;
+
+            } // namespace detail
+
+            template<typename T_Dest, typename T_Src>
+            HDINLINE void assign(T_Dest& dest, const T_Src& src)
+            {
+                detail::Assign<T_Dest, T_Src>()(dest, src); // call the functor specialized for both types
+            }
+
+        } // namespace operations
+    } // namespace particles
+} // namespace pmacc
diff --git a/include/pmacc/particles/operations/ConcatListOfFrames.hpp b/include/pmacc/particles/operations/ConcatListOfFrames.hpp
index bc26bd8134..d113cc3996 100644
--- a/include/pmacc/particles/operations/ConcatListOfFrames.hpp
+++ b/include/pmacc/particles/operations/ConcatListOfFrames.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Rene Widera, Felix Schmitt, Alexander Grund
+/* Copyright 2013-2021 Rene Widera, Felix Schmitt, Alexander Grund
  *
  * This file is part of PMacc.
  *
@@ -29,143 +29,141 @@
 
 namespace pmacc
 {
-namespace particles
-{
-namespace operations
-{
-
-/** Copy Particles to a Single Frame
- *
- * - copy particle data that was stored in a linked list of frames for each
- *   super-cell on the GPU to a single frame on the CPU RAM
- * - the deep on-GPU hierarchy must be copied to the CPU beforehand
- * - remove species attributes `multiMask` and `localCellIdx`
- * - add new cellIdx attribute relative to a user-defined domain
- */
-template<unsigned T_dim>
-struct ConcatListOfFrames
-{
-    DataSpace<T_dim> m_gridSize;
-
-    ConcatListOfFrames(const DataSpace<T_dim>& gridSize) :
-    m_gridSize(gridSize)
-    {
-
-    }
-
-    /** concatenate list of frames to single frame
-     *
-     * @param counter[in,out] scalar offset in `destFrame`
-     * @param destFrame single frame were all particles are copied in
-     * @param srcBox particle box were particles are read from
-     * @param particleFilter filter to select particles
-     * @param domainOffset offset to a user-defined domain. Can, e.g. be used to
-     *                     calculate a totalCellIdx, adding
-     *                     globalDomain.offset + localDomain.offset
-     * @param domainCellIdxIdentifier the identifier for the particle cellIdx
-     *                                that is calculated with respect to
-     *                                domainOffset
-     * @param mapper mapper which describes the area where particles are copied from
-     * @param parFilter particle filter method, must fulfill the interface of pmacc::filter::Interface
-     *                  The working domain for the filter is supercells.
-     */
-    template<class T_DestFrame, class T_SrcBox, class T_Filter, class T_Space, class T_Identifier, class T_Mapping, typename T_ParticleFilter>
-    void operator()(
-        int& counter,
-        T_DestFrame destFrame,
-        T_SrcBox srcBox,
-        const T_Filter particleFilter,
-        const T_Space domainOffset,
-        const T_Identifier domainCellIdxIdentifier,
-        const T_Mapping mapper,
-        T_ParticleFilter & parFilter
-    )
+    namespace particles
     {
-        #pragma omp parallel for
-        for (int linearBlockIdx = 0;
-             linearBlockIdx < m_gridSize.productOfComponents();
-             ++linearBlockIdx
-             )
+        namespace operations
         {
-            // local copy for each omp thread
-            T_Filter filter = particleFilter;
-            DataSpace<T_dim> blockIndex(DataSpaceOperations<T_dim>::map(m_gridSize, linearBlockIdx));
-
-            using namespace pmacc::particles::operations;
-            using namespace mappings::threads;
-
-            typedef T_DestFrame DestFrameType;
-            typedef typename T_SrcBox::FrameType SrcFrameType;
-            typedef typename T_SrcBox::FramePtr SrcFramePtr;
-
-            typedef T_Mapping Mapping;
-            typedef typename Mapping::SuperCellSize SuperCellSize;
-
-
-            const int particlesPerFrame = pmacc::math::CT::volume<SuperCellSize>::type::value;
-            int localIdxs[particlesPerFrame];
-
-            const DataSpace<Mapping::Dim> superCellIdx = mapper.getSuperCellIndex(blockIndex);
-            const DataSpace<Mapping::Dim> superCellPosition((superCellIdx - mapper.getGuardingSuperCells()) * mapper.getSuperCellSize());
-            filter.setSuperCellPosition(superCellPosition);
-            auto accParFilter = parFilter(
-                1, /* @todo this is a hack, please add a alpaka accelerator here*/
-                superCellIdx - mapper.getGuardingSuperCells( ),
-                WorkerCfg< 1 >{ 0 } /* @todo this is a workaround because we use no alpaka*/
-            );
-
-            SrcFramePtr srcFramePtr = srcBox.getFirstFrame(superCellIdx);
-
-            /* Loop over all frames in current super cell */
-            while (srcFramePtr.isValid())
+            /** Copy Particles to a Single Frame
+             *
+             * - copy particle data that was stored in a linked list of frames for each
+             *   super-cell on the GPU to a single frame on the CPU RAM
+             * - the deep on-GPU hierarchy must be copied to the CPU beforehand
+             * - remove species attributes `multiMask` and `localCellIdx`
+             * - add new cellIdx attribute relative to a user-defined domain
+             */
+            template<unsigned T_dim>
+            struct ConcatListOfFrames
             {
-                /* Count number of particles in current frame and init its indices */
-                int curNumParticles = 0;
-                for (int particleIdx = 0; particleIdx < particlesPerFrame; ++particleIdx)
-                {
-                    localIdxs[particleIdx] = -1;
-                    auto parSrc = (srcFramePtr[particleIdx]);
-                    /* Check if particle exists and is not filtered */
-                    if (parSrc[multiMask_] == 1 && filter(*srcFramePtr, particleIdx))
-                        if(
-                            accParFilter(
-                                1, /* @todo this is a hack, please add a alpaka accelerator here*/
-                                parSrc
-                            )
-                        )
-                            localIdxs[particleIdx] = curNumParticles++;
-                }
+                DataSpace<T_dim> m_gridSize;
 
-                int globalOffset;
-                /* atomic update with openmp*/
-                #pragma omp critical
+                ConcatListOfFrames(const DataSpace<T_dim>& gridSize) : m_gridSize(gridSize)
                 {
-                    globalOffset = counter;
-                    counter += curNumParticles;
                 }
 
-                for (int particleIdx = 0; particleIdx < particlesPerFrame; ++particleIdx)
+                /** concatenate list of frames to single frame
+                 *
+                 * @param[in,out] counter scalar offset in `destFrame`
+                 * @param destFrame single frame where all particles are copied to
+                 * @param srcBox particle box where particles are read from
+                 * @param particleFilter filter to select particles
+                 * @param domainOffset offset to a user-defined domain. Can, e.g. be used to
+                 *                     calculate a totalCellIdx, adding
+                 *                     globalDomain.offset + localDomain.offset
+                 * @param domainCellIdxIdentifier the identifier for the particle cellIdx
+                 *                                that is calculated with respect to
+                 *                                domainOffset
+                 * @param mapper mapper which describes the area where particles are copied from
+                 * @param parFilter particle filter method, must fulfill the interface of pmacc::filter::Interface
+                 *                  The working domain for the filter is supercells.
+                 */
+                template<
+                    class T_DestFrame,
+                    class T_SrcBox,
+                    class T_Filter,
+                    class T_Space,
+                    class T_Identifier,
+                    class T_Mapping,
+                    typename T_ParticleFilter>
+                void operator()(
+                    int& counter,
+                    T_DestFrame destFrame,
+                    T_SrcBox srcBox,
+                    const T_Filter particleFilter,
+                    const T_Space domainOffset,
+                    const T_Identifier domainCellIdxIdentifier,
+                    const T_Mapping mapper,
+                    T_ParticleFilter& parFilter)
                 {
-                    if (localIdxs[particleIdx] != -1)
+#pragma omp parallel for
+                    for(int linearBlockIdx = 0; linearBlockIdx < m_gridSize.productOfComponents(); ++linearBlockIdx)
                     {
-                        auto parSrc = (srcFramePtr[particleIdx]);
-                        auto parDest = destFrame[globalOffset + localIdxs[particleIdx]];
-                        auto parDestNoDomainIdx = deselect<T_Identifier>(parDest);
-                        assign(parDestNoDomainIdx, parSrc);
-                        /* calculate cell index for user-defined domain */
-                        DataSpace<Mapping::Dim> localCellIdx(DataSpaceOperations<Mapping::Dim>::template map<SuperCellSize>(parSrc[localCellIdx_]));
-                        parDest[domainCellIdxIdentifier] = domainOffset + superCellPosition + localCellIdx;
+                        // local copy for each omp thread
+                        T_Filter filter = particleFilter;
+                        DataSpace<T_dim> blockIndex(DataSpaceOperations<T_dim>::map(m_gridSize, linearBlockIdx));
+
+                        using namespace pmacc::particles::operations;
+                        using namespace mappings::threads;
+
+                        typedef T_DestFrame DestFrameType;
+                        typedef typename T_SrcBox::FrameType SrcFrameType;
+                        typedef typename T_SrcBox::FramePtr SrcFramePtr;
+
+                        typedef T_Mapping Mapping;
+                        typedef typename Mapping::SuperCellSize SuperCellSize;
+
+
+                        const int particlesPerFrame = pmacc::math::CT::volume<SuperCellSize>::type::value;
+                        int localIdxs[particlesPerFrame];
+
+                        const DataSpace<Mapping::Dim> superCellIdx = mapper.getSuperCellIndex(blockIndex);
+                        const DataSpace<Mapping::Dim> superCellPosition(
+                            (superCellIdx - mapper.getGuardingSuperCells()) * mapper.getSuperCellSize());
+                        filter.setSuperCellPosition(superCellPosition);
+                        auto accParFilter = parFilter(
+                            1, /* @todo this is a hack, please add a alpaka accelerator here*/
+                            superCellIdx - mapper.getGuardingSuperCells(),
+                            WorkerCfg<1>{0} /* @todo this is a workaround because we use no alpaka*/
+                        );
+
+                        SrcFramePtr srcFramePtr = srcBox.getFirstFrame(superCellIdx);
+
+                        /* Loop over all frames in current super cell */
+                        while(srcFramePtr.isValid())
+                        {
+                            /* Count number of particles in current frame and init its indices */
+                            int curNumParticles = 0;
+                            for(int particleIdx = 0; particleIdx < particlesPerFrame; ++particleIdx)
+                            {
+                                localIdxs[particleIdx] = -1;
+                                auto parSrc = (srcFramePtr[particleIdx]);
+                                /* Check if particle exists and is not filtered */
+                                if(parSrc[multiMask_] == 1 && filter(*srcFramePtr, particleIdx))
+                                    if(accParFilter(
+                                           1, /* @todo this is a hack, please add a alpaka accelerator here*/
+                                           parSrc))
+                                        localIdxs[particleIdx] = curNumParticles++;
+                            }
+
+                            int globalOffset;
+/* atomic update with openmp*/
+#pragma omp critical
+                            {
+                                globalOffset = counter;
+                                counter += curNumParticles;
+                            }
+
+                            for(int particleIdx = 0; particleIdx < particlesPerFrame; ++particleIdx)
+                            {
+                                if(localIdxs[particleIdx] != -1)
+                                {
+                                    auto parSrc = (srcFramePtr[particleIdx]);
+                                    auto parDest = destFrame[globalOffset + localIdxs[particleIdx]];
+                                    auto parDestNoDomainIdx = deselect<T_Identifier>(parDest);
+                                    assign(parDestNoDomainIdx, parSrc);
+                                    /* calculate cell index for user-defined domain */
+                                    DataSpace<Mapping::Dim> localCellIdx(
+                                        DataSpaceOperations<Mapping::Dim>::template map<SuperCellSize>(
+                                            parSrc[localCellIdx_]));
+                                    parDest[domainCellIdxIdentifier] = domainOffset + superCellPosition + localCellIdx;
+                                }
+                            }
+                            /*get next frame in supercell*/
+                            srcFramePtr = srcBox.getNextFrame(srcFramePtr);
+                        }
                     }
                 }
-                /*get next frame in supercell*/
-                srcFramePtr = srcBox.getNextFrame(srcFramePtr);
-
-            }
-        }
-    }
-
-};
+            };
 
-} //namespace operations
-} //namespace particles
-} //namespace pmacc
+        } // namespace operations
+    } // namespace particles
+} // namespace pmacc
diff --git a/include/pmacc/particles/operations/CopyIdentifier.hpp b/include/pmacc/particles/operations/CopyIdentifier.hpp
index 9b0a33929d..a855ad5967 100644
--- a/include/pmacc/particles/operations/CopyIdentifier.hpp
+++ b/include/pmacc/particles/operations/CopyIdentifier.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Rene Widera
+/* Copyright 2013-2021 Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -27,21 +27,17 @@
 
 namespace pmacc
 {
+    namespace pmath = pmacc::math;
 
-namespace pmath = pmacc::math;
 
-
-template<typename T_Key>
-struct CopyIdentifier
-{
-    template<typename T_T1,typename T_T2>
-    HDINLINE
-    void operator()(T_T1& dest, const T_T2& src)
+    template<typename T_Key>
+    struct CopyIdentifier
     {
-        dest[T_Key()]=src[T_Key()];
-    }
-
-
-};
-
-}//namespace pmacc
+        template<typename T_T1, typename T_T2>
+        HDINLINE void operator()(T_T1& dest, const T_T2& src)
+        {
+            dest[T_Key()] = src[T_Key()];
+        }
+    };
+
+} // namespace pmacc
diff --git a/include/pmacc/particles/operations/CountParticles.hpp b/include/pmacc/particles/operations/CountParticles.hpp
index b75bdc42a1..ccbc94c8e6 100644
--- a/include/pmacc/particles/operations/CountParticles.hpp
+++ b/include/pmacc/particles/operations/CountParticles.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Rene Widera, Erik Zenker
+/* Copyright 2013-2021 Rene Widera, Erik Zenker
  *
  * This file is part of PMacc.
  *
@@ -37,281 +37,222 @@
 
 namespace pmacc
 {
-
-/* count particles
- *
- * it is allowed to call this kernel on frames with holes (without calling fillAllGAps before)
- *
- * @tparam T_numWorkers number of workers
- */
-template< uint32_t T_numWorkers >
-struct KernelCountParticles
-{
-    /** count particles
+    /* count particles
      *
-     * @tparam T_PBox pmacc::ParticlesBox, particle box type
-     * @tparam T_Filter functor to filter particles
-     * @tparam T_Mapping supercell mapper functor type
-     * @tparam T_ParticleFilter pmacc::filter::Interface, type of the particle filter
-     * @tparam T_Acc type of the alpaka accelerator
+     * it is allowed to call this kernel on frames with holes (without calling fillAllGaps before)
      *
-     * @param pb particle memory
-     * @param gCounter pointer for the result
-     * @param filter functor to filter particles those should be counted
-     * @param mapper functor to map a block to a supercell
-     * @param parFilter particle filter method, the working domain for the filter is supercells
+     * @tparam T_numWorkers number of workers
      */
-    template<
-        typename T_PBox,
-        typename T_Filter,
-        typename T_Mapping,
-        typename T_ParticleFilter,
-        typename T_Acc
-    >
-    DINLINE void operator( )(
-        T_Acc const & acc,
-        T_PBox pb,
-        uint64_cu* gCounter,
-        T_Filter filter,
-        T_Mapping const mapper,
-        T_ParticleFilter parFilter
-    ) const
+    template<uint32_t T_numWorkers>
+    struct KernelCountParticles
     {
-        using namespace mappings::threads;
+        /** count particles
+         *
+         * @tparam T_PBox pmacc::ParticlesBox, particle box type
+         * @tparam T_Filter functor to filter particles
+         * @tparam T_Mapping supercell mapper functor type
+         * @tparam T_ParticleFilter pmacc::filter::Interface, type of the particle filter
+         * @tparam T_Acc type of the alpaka accelerator
+         *
+         * @param pb particle memory
+         * @param gCounter pointer for the result
+         * @param filter functor to filter particles that should be counted
+         * @param mapper functor to map a block to a supercell
+         * @param parFilter particle filter method, the working domain for the filter is supercells
+         */
+        template<typename T_PBox, typename T_Filter, typename T_Mapping, typename T_ParticleFilter, typename T_Acc>
+        DINLINE void operator()(
+            T_Acc const& acc,
+            T_PBox pb,
+            uint64_cu* gCounter,
+            T_Filter filter,
+            T_Mapping const mapper,
+            T_ParticleFilter parFilter) const
+        {
+            using namespace mappings::threads;
 
-        using Frame = typename T_PBox::FrameType;
-        using FramePtr = typename T_PBox::FramePtr;
-        constexpr uint32_t dim = T_Mapping::Dim;
-        constexpr uint32_t frameSize = math::CT::volume< typename Frame::SuperCellSize >::type::value;
-        constexpr uint32_t numWorkers = T_numWorkers;
+            using Frame = typename T_PBox::FrameType;
+            using FramePtr = typename T_PBox::FramePtr;
+            constexpr uint32_t dim = T_Mapping::Dim;
+            constexpr uint32_t frameSize = math::CT::volume<typename Frame::SuperCellSize>::type::value;
+            constexpr uint32_t numWorkers = T_numWorkers;
 
-        PMACC_SMEM(
-            acc,
-            frame,
-            FramePtr
-        );
-        PMACC_SMEM(
-            acc,
-            counter,
-            int
-        );
-        PMACC_SMEM(
-            acc,
-            particlesInSuperCell,
-            lcellId_t
-        );
+            PMACC_SMEM(acc, frame, FramePtr);
+            PMACC_SMEM(acc, counter, int);
+            PMACC_SMEM(acc, particlesInSuperCell, lcellId_t);
 
-        using SuperCellSize = typename T_Mapping::SuperCellSize;
+            using SuperCellSize = typename T_Mapping::SuperCellSize;
 
-        DataSpace< dim > const threadIndex( threadIdx );
-        uint32_t const workerIdx = static_cast< uint32_t >(
-            DataSpaceOperations< dim >::template map< SuperCellSize >( threadIndex )
-        );
+            DataSpace<dim> const threadIndex(cupla::threadIdx(acc));
+            uint32_t const workerIdx
+                = static_cast<uint32_t>(DataSpaceOperations<dim>::template map<SuperCellSize>(threadIndex));
 
-        DataSpace< dim > const superCellIdx( mapper.getSuperCellIndex( DataSpace< dim >( blockIdx ) ) );
+            DataSpace<dim> const superCellIdx(mapper.getSuperCellIndex(DataSpace<dim>(cupla::blockIdx(acc))));
 
-        ForEachIdx<
-            IdxConfig<
-                1,
-                numWorkers
-            >
-        > onlyMaster{ workerIdx };
+            ForEachIdx<IdxConfig<1, numWorkers>> onlyMaster{workerIdx};
 
-        onlyMaster(
-            [&](
-                uint32_t const,
-                uint32_t const
-            )
-            {
-                frame = pb.getLastFrame( superCellIdx );
-                particlesInSuperCell = pb.getSuperCell( superCellIdx ).getSizeLastFrame( );
+            onlyMaster([&](uint32_t const, uint32_t const) {
+                frame = pb.getLastFrame(superCellIdx);
+                particlesInSuperCell = pb.getSuperCell(superCellIdx).getSizeLastFrame();
                 counter = 0;
-            }
-        );
+            });
 
-        __syncthreads( );
+            cupla::__syncthreads(acc);
 
-        if( !frame.isValid() )
-            return; //end kernel if we have no frames
-        filter.setSuperCellPosition(
-            ( superCellIdx - mapper.getGuardingSuperCells( ) ) *
-            mapper.getSuperCellSize( )
-        );
+            if(!frame.isValid())
+                return; // end kernel if we have no frames
+            filter.setSuperCellPosition((superCellIdx - mapper.getGuardingSuperCells()) * mapper.getSuperCellSize());
 
-        auto accParFilter = parFilter(
-            acc,
-            superCellIdx - mapper.getGuardingSuperCells( ),
-            WorkerCfg< numWorkers >{ workerIdx }
-        );
+            auto accParFilter
+                = parFilter(acc, superCellIdx - mapper.getGuardingSuperCells(), WorkerCfg<numWorkers>{workerIdx});
 
-        ForEachIdx<
-            IdxConfig<
-                frameSize,
-                numWorkers
-            >
-        > forEachParticle( workerIdx );
+            ForEachIdx<IdxConfig<frameSize, numWorkers>> forEachParticle(workerIdx);
 
-        while( frame.isValid( ) )
-        {
-            forEachParticle(
-                [&](
-                    uint32_t const linearIdx,
-                    uint32_t const idx
-                )
-                {
-                    if( linearIdx < particlesInSuperCell )
+            while(frame.isValid())
+            {
+                forEachParticle([&](uint32_t const linearIdx, uint32_t const idx) {
+                    if(linearIdx < particlesInSuperCell)
                     {
-                        bool const useParticle = filter(
-                            *frame,
-                            linearIdx
-                        );
-                        if( useParticle )
+                        bool const useParticle = filter(*frame, linearIdx);
+                        if(useParticle)
                         {
-                            auto parSrc = ( frame[ linearIdx ] );
-                            if(
-                                accParFilter(
-                                    acc,
-                                    parSrc
-                                )
-                            )
-                                nvidia::atomicAllInc( acc, &counter, ::alpaka::hierarchy::Threads{} );
+                            auto parSrc = (frame[linearIdx]);
+                            if(accParFilter(acc, parSrc))
+                                nvidia::atomicAllInc(acc, &counter, ::alpaka::hierarchy::Threads{});
                         }
                     }
-                }
-            );
+                });
 
-            __syncthreads( );
+                cupla::__syncthreads(acc);
 
-            onlyMaster(
-                [&](
-                    uint32_t const,
-                    uint32_t const
-                )
-                {
-                    frame = pb.getPreviousFrame( frame );
+                onlyMaster([&](uint32_t const, uint32_t const) {
+                    frame = pb.getPreviousFrame(frame);
                     particlesInSuperCell = frameSize;
-                }
-            );
+                });
 
-            __syncthreads( );
-        }
-
-        onlyMaster(
-            [&](
-                uint32_t const,
-                uint32_t const
-            )
-            {
-
-                atomicAdd(
-                    gCounter,
-                    static_cast< uint64_cu >( counter ),
-                    ::alpaka::hierarchy::Blocks{}
-                );
+                cupla::__syncthreads(acc);
             }
-        );
-    }
-};
-
-struct CountParticles
-{
-
-    /** Get particle count
-     *
-     * @tparam AREA area were particles are counted (CORE, BORDER, GUARD)
-     *
-     * @param buffer source particle buffer
-     * @param cellDescription instance of MappingDesction
-     * @param filter filter instance which must inharid from PositionFilter
-     * @param parFilter particle filter method, must fulfill the interface of pmacc::filter::Interface
-     *                  The working domain for the filter is supercells.
-     * @return number of particles in defined area
-     */
-    template<uint32_t AREA, class PBuffer, class Filter, class CellDesc, typename T_ParticleFilter>
-    static uint64_cu countOnDevice( PBuffer& buffer, CellDesc cellDescription, Filter filter, T_ParticleFilter & parFilter )
-    {
-        GridBuffer<
-            uint64_cu,
-            DIM1
-        > counter( DataSpace< DIM1 >( 1 ) );
 
-        AreaMapping<
-            AREA,
-            CellDesc
-        > mapper( cellDescription );
-        constexpr uint32_t numWorkers = traits::GetNumWorkers<
-            math::CT::volume< typename CellDesc::SuperCellSize >::type::value
-        >::value;
-
-        PMACC_KERNEL( KernelCountParticles< numWorkers >{ } )(
-            mapper.getGridDim( ),
-            numWorkers
-        )(
-            buffer.getDeviceParticlesBox( ),
-            counter.getDeviceBuffer( ).getBasePointer( ),
-            filter,
-            mapper,
-            parFilter
-        );
-
-        counter.deviceToHost( );
-        return *( counter.getHostBuffer( ).getDataBox( ) );
-    }
+            onlyMaster([&](uint32_t const, uint32_t const) {
+                cupla::atomicAdd(acc, gCounter, static_cast<uint64_cu>(counter), ::alpaka::hierarchy::Blocks{});
+            });
+        }
+    };
 
-    /** Get particle count
-     *
-     * @param buffer source particle buffer
-     * @param cellDescription instance of MappingDesction
-     * @param filter filter instance which must inharid from PositionFilter
-     * @param parFilter particle filter method, must fulfill the interface of pmacc::filter::Interface
-     *                  The working domain for the filter is supercells.
-     * @return number of particles in defined area
-     */
-    template< class PBuffer, class Filter, class CellDesc, typename T_ParticleFilter>
-    static uint64_cu countOnDevice(PBuffer& buffer, CellDesc cellDescription, Filter filter, T_ParticleFilter & parFilter)
+    struct CountParticles
     {
-        return pmacc::CountParticles::countOnDevice < CORE + BORDER + GUARD > (buffer, cellDescription, filter, parFilter);
-    }
+        /** Get particle count
+         *
+         * @tparam AREA area where particles are counted (CORE, BORDER, GUARD)
+         *
+         * @param buffer source particle buffer
+         * @param cellDescription instance of MappingDescription
+         * @param filter filter instance which must inherit from PositionFilter
+         * @param parFilter particle filter method, must fulfill the interface of pmacc::filter::Interface
+         *                  The working domain for the filter is supercells.
+         * @return number of particles in defined area
+         */
+        template<uint32_t AREA, class PBuffer, class Filter, class CellDesc, typename T_ParticleFilter>
+        static uint64_cu countOnDevice(
+            PBuffer& buffer,
+            CellDesc cellDescription,
+            Filter filter,
+            T_ParticleFilter& parFilter)
+        {
+            GridBuffer<uint64_cu, DIM1> counter(DataSpace<DIM1>(1));
+
+            AreaMapping<AREA, CellDesc> mapper(cellDescription);
+            constexpr uint32_t numWorkers
+                = traits::GetNumWorkers<math::CT::volume<typename CellDesc::SuperCellSize>::type::value>::value;
+
+            PMACC_KERNEL(KernelCountParticles<numWorkers>{})
+            (mapper.getGridDim(), numWorkers)(
+                buffer.getDeviceParticlesBox(),
+                counter.getDeviceBuffer().getBasePointer(),
+                filter,
+                mapper,
+                parFilter);
+
+            counter.deviceToHost();
+            return *(counter.getHostBuffer().getDataBox());
+        }
 
-    /** Get particle count
-     *
-     * @tparam AREA area were particles are counted (CORE, BORDER, GUARD)
-     *
-     * @param buffer source particle buffer
-     * @param cellDescription instance of MappingDesction
-     * @param origin local cell position (can be negative)
-     * @param size local size in cells for checked volume
-     * @param parFilter particle filter method, must fulfill the interface of pmacc::filter::Interface
-     *                  The working domain for the filter is supercells.
-     * @return number of particles in defined area
-     */
-    template<uint32_t AREA, class PBuffer, class CellDesc, class Space, typename T_ParticleFilter>
-    static uint64_cu countOnDevice(PBuffer& buffer, CellDesc cellDescription, const Space& origin, const Space& size, T_ParticleFilter & parFilter)
-    {
-        typedef bmpl::vector< typename GetPositionFilter<Space::Dim>::type > usedFilters;
-        typedef typename FilterFactory<usedFilters>::FilterType MyParticleFilter;
-        MyParticleFilter filter;
-        filter.setStatus(true); /*activeate filter pipline*/
-        filter.setWindowPosition(origin, size);
-        return pmacc::CountParticles::countOnDevice<AREA>(buffer, cellDescription, filter, parFilter);
-    }
+        /** Get particle count
+         *
+         * @param buffer source particle buffer
+         * @param cellDescription instance of MappingDescription
+         * @param filter filter instance which must inherit from PositionFilter
+         * @param parFilter particle filter method, must fulfill the interface of pmacc::filter::Interface
+         *                  The working domain for the filter is supercells.
+         * @return number of particles in defined area
+         */
+        template<class PBuffer, class Filter, class CellDesc, typename T_ParticleFilter>
+        static uint64_cu countOnDevice(
+            PBuffer& buffer,
+            CellDesc cellDescription,
+            Filter filter,
+            T_ParticleFilter& parFilter)
+        {
+            return pmacc::CountParticles::countOnDevice<CORE + BORDER + GUARD>(
+                buffer,
+                cellDescription,
+                filter,
+                parFilter);
+        }
 
-    /** Get particle count
-     *
-     * @param buffer source particle buffer
-     * @param cellDescription instance of MappingDesction
-     * @param origin local cell position (can be negative)
-     * @param size local size in cells for checked volume
-     * @param parFilter particle filter method, must fulfill the interface of pmacc::filter::Interface
-     *                  The working domain for the filter is supercells.
-     * @return number of particles in defined area
-     */
-    template< class PBuffer, class Filter, class CellDesc, class Space, typename T_ParticleFilter>
-    static uint64_cu countOnDevice(PBuffer& buffer, CellDesc cellDescription, const Space& origin, const Space& size, T_ParticleFilter & parFilter)
-    {
-        return pmacc::CountParticles::countOnDevice < CORE + BORDER + GUARD > (buffer, cellDescription, origin, size, parFilter);
-    }
+        /** Get particle count
+         *
+         * @tparam AREA area where particles are counted (CORE, BORDER, GUARD)
+         *
+         * @param buffer source particle buffer
+         * @param cellDescription instance of MappingDescription
+         * @param origin local cell position (can be negative)
+         * @param size local size in cells for checked volume
+         * @param parFilter particle filter method, must fulfill the interface of pmacc::filter::Interface
+         *                  The working domain for the filter is supercells.
+         * @return number of particles in defined area
+         */
+        template<uint32_t AREA, class PBuffer, class CellDesc, class Space, typename T_ParticleFilter>
+        static uint64_cu countOnDevice(
+            PBuffer& buffer,
+            CellDesc cellDescription,
+            const Space& origin,
+            const Space& size,
+            T_ParticleFilter& parFilter)
+        {
+            typedef bmpl::vector<typename GetPositionFilter<Space::Dim>::type> usedFilters;
+            typedef typename FilterFactory<usedFilters>::FilterType MyParticleFilter;
+            MyParticleFilter filter;
+            filter.setStatus(true); /*activate filter pipeline*/
+            filter.setWindowPosition(origin, size);
+            return pmacc::CountParticles::countOnDevice<AREA>(buffer, cellDescription, filter, parFilter);
+        }
 
-};
+        /** Get particle count
+         *
+         * @param buffer source particle buffer
+         * @param cellDescription instance of MappingDescription
+         * @param origin local cell position (can be negative)
+         * @param size local size in cells for checked volume
+         * @param parFilter particle filter method, must fulfill the interface of pmacc::filter::Interface
+         *                  The working domain for the filter is supercells.
+         * @return number of particles in defined area
+         */
+        template<class PBuffer, class Filter, class CellDesc, class Space, typename T_ParticleFilter>
+        static uint64_cu countOnDevice(
+            PBuffer& buffer,
+            CellDesc cellDescription,
+            const Space& origin,
+            const Space& size,
+            T_ParticleFilter& parFilter)
+        {
+            return pmacc::CountParticles::countOnDevice<CORE + BORDER + GUARD>(
+                buffer,
+                cellDescription,
+                origin,
+                size,
+                parFilter);
+        }
+    };
 
-} //namespace pmacc
+} // namespace pmacc
diff --git a/include/pmacc/particles/operations/Deselect.hpp b/include/pmacc/particles/operations/Deselect.hpp
index 91c5228ef7..10b730d53e 100644
--- a/include/pmacc/particles/operations/Deselect.hpp
+++ b/include/pmacc/particles/operations/Deselect.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Rene Widera
+/* Copyright 2013-2021 Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -30,38 +30,36 @@
 
 namespace pmacc
 {
-namespace particles
-{
-namespace operations
-{
+    namespace particles
+    {
+        namespace operations
+        {
+            namespace detail
+            {
+                /* functor to deselect attributes of an object
+                 *
+                 * - must be boost result_of compatible
+                 * - must define an operator()(T_Object)
+                 *
+                 * @tparam T_Sequence any boost mpl sequence
+                 * @tparam T_Object a type from which attributes can be deselected
+                 */
+                template<typename T_Sequence, typename T_Object>
+                struct Deselect;
 
-namespace detail
-{
+            } // namespace detail
 
-/* functor for deselect attributes of an object
- *
- * - must be boost result_of compatible
- * - must define a operator()(T_Object)
- *
- * @tparam T_Sequence any boost mpl sequence
- * @tparam T_Object a type were we can deselect attributes from
- */
-template<typename T_Sequence, typename T_Object>
-struct Deselect;
-
-} //namespace detail
-
-template<typename T_Exclude, typename T_Object>
-HDINLINE
-typename boost::result_of < detail::Deselect<typename ToSeq<T_Exclude>::type,T_Object>(T_Object)>::type
-deselect(T_Object& object)
-{
-    typedef typename ToSeq< T_Exclude >::type DeselectSeq;
-    typedef detail::Deselect<DeselectSeq, T_Object> BaseType;
+            template<typename T_Exclude, typename T_Object>
+            HDINLINE
+                typename boost::result_of<detail::Deselect<typename ToSeq<T_Exclude>::type, T_Object>(T_Object)>::type
+                deselect(T_Object& object)
+            {
+                typedef typename ToSeq<T_Exclude>::type DeselectSeq;
+                typedef detail::Deselect<DeselectSeq, T_Object> BaseType;
 
-    return BaseType()(object);
-}
+                return BaseType()(object);
+            }
 
-}//operators
-}//namespace particles
-} //namespace pmacc
+        } // namespace operations
+    } // namespace particles
+} // namespace pmacc
diff --git a/include/pmacc/particles/operations/SetAttributeToDefault.hpp b/include/pmacc/particles/operations/SetAttributeToDefault.hpp
index 0f99859487..089463ddd2 100644
--- a/include/pmacc/particles/operations/SetAttributeToDefault.hpp
+++ b/include/pmacc/particles/operations/SetAttributeToDefault.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2014-2020 Rene Widera
+/* Copyright 2014-2021 Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -26,29 +26,27 @@
 
 namespace pmacc
 {
-
-/** set an attribute of a particle to its default value
- *
- * @tparam  T_Attribute value_identifier or alias which is a value_identifier
- */
-template<typename T_Attribute>
-struct SetAttributeToDefault
-{
-    typedef T_Attribute Attribute;
-
-    /** set an attribute to their default value
+    /** set an attribute of a particle to its default value
      *
-     * @tparam T_Partcile particle type
+     * @tparam  T_Attribute value_identifier or alias which is a value_identifier
      */
-    template<typename T_Particle>
-    HDINLINE
-    void operator()(T_Particle& particle)
+    template<typename T_Attribute>
+    struct SetAttributeToDefault
     {
-        typedef typename pmacc::traits::Resolve<Attribute>::type ResolvedAttr;
-        /* set attribute to it's user defined default value */
-        particle[Attribute()] = ResolvedAttr::getValue();
-    }
-};
+        typedef T_Attribute Attribute;
+
+        /** set the attribute of the given particle to its default value
+         *
+         * @tparam T_Particle particle type
+         */
+        template<typename T_Particle>
+        HDINLINE void operator()(T_Particle& particle)
+        {
+            typedef typename pmacc::traits::Resolve<Attribute>::type ResolvedAttr;
+            /* set attribute to its user-defined default value */
+            particle[Attribute()] = ResolvedAttr::getValue();
+        }
+    };
 
 
-}//namespace pmacc
+} // namespace pmacc
diff --git a/include/pmacc/particles/operations/splitIntoListOfFrames.kernel b/include/pmacc/particles/operations/splitIntoListOfFrames.kernel
index dab6bf82e6..aa97f5fc36 100644
--- a/include/pmacc/particles/operations/splitIntoListOfFrames.kernel
+++ b/include/pmacc/particles/operations/splitIntoListOfFrames.kernel
@@ -1,4 +1,4 @@
-/* Copyright 2014-2020 Rene Widera, Alexander Grund
+/* Copyright 2014-2021 Rene Widera, Alexander Grund
  *
  * This file is part of PMacc.
  *
@@ -37,440 +37,298 @@
 
 namespace pmacc
 {
-namespace particles
-{
-namespace operations
-{
-namespace kernel
-{
-    /** transform a large frame into a list of small frames
-     *
-     * @tparam T_numWorkers number of workers
-     */
-    template< uint32_t T_numWorkers >
-    struct SplitIntoListOfFrames
+    namespace particles
     {
-        /** Copy particles from big frame to PMacc frame structure
-         *  (Opposite to ConcatListOfFrames)
-         *
-         * - convert a user-defined domainCellIdx to localCellIdx
-         * - processed particles per block <= number of cells per superCell
-         *
-         * @tparam T_CounterBox pmacc:DataBox, type of buffer for the statistics counter
-         * @tparam T_DestBox pmacc:ParticlesBox, type of the destination particle box
-         * @tparam T_SrcFrame pmacc:Frame, type of the source frame
-         * @tparam T_Space pmacc::DataSpace, type for indicies and offsets within the domain
-         * @tparam T_Identifier Identifier, type of the identifier for the total domain offset
-         * @tparam T_CellDescription pmacc::MappingDescription, type of the domain description
-         * @tparam T_Acc alpaka accelerator type
-         *
-         * @param acc alpaka accelerator
-         * @param counter box with three integers [sharedSrcParticleOffset, numLoadedParticles, numUsedFrames]
-         * @param destBox particle box where all particles are copied to (destination)
-         * @param srcFrame frame with particles (is used as source)
-         * @param maxParticles number of particles in srcFrame
-         * @param localDomainCellOffset offset in cells to user-defined domain (@see wiki PIConGPU domain definitions)
-         * @param domainCellIdxIdentifier the identifier for the particle domain cellIdx
-         *                                that is calculated back to the local domain
-         *                                with respect to localDomainCellOffset
-         * @param cellDesc supercell domain description
-         */
-        template<
-            typename T_CounterBox,
-            typename T_DestBox,
-            typename T_SrcFrame,
-            typename T_Space,
-            typename T_Identifier,
-            typename T_CellDescription,
-            typename T_Acc
-        >
-        DINLINE void operator()(
-            T_Acc const & acc,
-            T_CounterBox counter,
-            T_DestBox destBox,
-            T_SrcFrame srcFrame,
-            int const maxParticles,
-            T_Space const localDomainCellOffset,
-            T_Identifier const domainCellIdxIdentifier,
-            T_CellDescription const cellDesc
-        ) const
+        namespace operations
         {
-            using namespace pmacc::particles::operations;
-            using namespace mappings::threads;
-
-            using SrcFrameType = T_SrcFrame;
-            using DestFrameType = typename T_DestBox::FrameType;
-            using DestFramePtr = typename T_DestBox::FramePtr;
-            using SuperCellSize = typename DestFrameType::SuperCellSize;
-
-            constexpr uint32_t numWorkers = T_numWorkers;
-            constexpr uint32_t numDims = T_DestBox::Dim;
-            constexpr uint32_t particlesPerFrame = math::CT::volume< SuperCellSize >::type::value;
-
-            PMACC_SMEM(
-                acc,
-                destFramePtr,
-                memory::Array<
-                    DestFramePtr,
-                    particlesPerFrame
-                >
-            );
-            PMACC_SMEM(
-                acc,
-                sharedLinearSuperCellIds,
-                memory::Array<
-                    int,
-                    particlesPerFrame
-                >
-            );
-            PMACC_SMEM(
-                acc,
-                sharedSrcParticleOffset,
-                int
-            );
-
-            uint32_t const workerIdx = threadIdx.x;
-
-            DataSpace< numDims > const numSuperCells(
-                cellDesc.getGridSuperCells( ) - cellDesc.getGuardingSuperCells( ) * 2
-            );
-
-            ForEachIdx<
-                IdxConfig<
-                    1,
-                    numWorkers
-                >
-            > onlyMaster{ workerIdx };
-
-            onlyMaster(
-                [&](
-                    uint32_t const,
-                    uint32_t const
-                )
+            namespace kernel
+            {
+                /** transform a large frame into a list of small frames
+                 *
+                 * @tparam T_numWorkers number of workers
+                 */
+                template<uint32_t T_numWorkers>
+                struct SplitIntoListOfFrames
                 {
-                    /* apply for work for the full block, counter[0] contains the
-                     * offset in srcFrame to load N particles
+                    /** Copy particles from big frame to PMacc frame structure
+                     *  (Opposite to ConcatListOfFrames)
+                     *
+                     * - convert a user-defined domainCellIdx to localCellIdx
+                     * - processed particles per block <= number of cells per superCell
+                     *
+                     * @tparam T_CounterBox pmacc::DataBox, type of buffer for the statistics counter
+                     * @tparam T_DestBox pmacc::ParticlesBox, type of the destination particle box
+                     * @tparam T_SrcFrame pmacc::Frame, type of the source frame
+                     * @tparam T_Space pmacc::DataSpace, type for indices and offsets within the domain
+                     * @tparam T_Identifier Identifier, type of the identifier for the total domain offset
+                     * @tparam T_CellDescription pmacc::MappingDescription, type of the domain description
+                     * @tparam T_Acc alpaka accelerator type
+                     *
+                     * @param acc alpaka accelerator
+                     * @param counter box with three integers [sharedSrcParticleOffset, numLoadedParticles,
+                     * numUsedFrames]
+                     * @param destBox particle box where all particles are copied to (destination)
+                     * @param srcFrame frame with particles (is used as source)
+                     * @param maxParticles number of particles in srcFrame
+                     * @param localDomainCellOffset offset in cells to user-defined domain (@see wiki PIConGPU domain
+                     * definitions)
+                     * @param domainCellIdxIdentifier the identifier for the particle domain cellIdx
+                     *                                that is calculated back to the local domain
+                     *                                with respect to localDomainCellOffset
+                     * @param cellDesc supercell domain description
                      */
-                    sharedSrcParticleOffset = atomicAdd(
-                        &( counter[ 0 ] ),
-                        particlesPerFrame,
-                        ::alpaka::hierarchy::Blocks{}
-                    );
-                }
-            );
-
-            __syncthreads();
-
-            using ParticleDomCfg = IdxConfig<
-                particlesPerFrame,
-                numWorkers
-            >;
-
-            memory::CtxArray<
-                int,
-                ParticleDomCfg
-            >
-            srcParticleIdxCtx{ };
-
-            memory::CtxArray<
-                bool,
-                ParticleDomCfg
-            >
-            hasValidParticleCtx{ };
-
-            // loop over all particles in the frame
-            ForEachIdx< ParticleDomCfg > forEachParticle( workerIdx );
-
-            forEachParticle(
-                [&](
-                    uint32_t const linearIdx,
-                    uint32_t const idx
-                )
-                {
-                    destFramePtr[ linearIdx ] = DestFramePtr{ };
-                    sharedLinearSuperCellIds[ linearIdx ] = -1;
-
-                    srcParticleIdxCtx[ idx ] = sharedSrcParticleOffset + linearIdx;
-                    hasValidParticleCtx[ idx ] = srcParticleIdxCtx[ idx ] < maxParticles;
-                }
-            );
-
-            __syncthreads();
-
-            // supercell index of the particle relative to the origin of the local domain
-            memory::CtxArray<
-                DataSpace< numDims >,
-                ParticleDomCfg
-            >
-            particlesSuperCellCtx{ };
-
-            // linear cell index of the particle within the destination frame
-            memory::CtxArray<
-                lcellId_t,
-                ParticleDomCfg
-            >
-            lCellIdxCtx( INV_LOC_IDX );
-
-            memory::CtxArray<
-                int,
-                ParticleDomCfg
-            >
-            linearParticlesSuperCellCtx( -1 );
-
-            forEachParticle(
-                [&](
-                    uint32_t const linearIdx,
-                    uint32_t const idx
-                )
-                {
-                    if( hasValidParticleCtx[ idx ] )
+                    template<
+                        typename T_CounterBox,
+                        typename T_DestBox,
+                        typename T_SrcFrame,
+                        typename T_Space,
+                        typename T_Identifier,
+                        typename T_CellDescription,
+                        typename T_Acc>
+                    DINLINE void operator()(
+                        T_Acc const& acc,
+                        T_CounterBox counter,
+                        T_DestBox destBox,
+                        T_SrcFrame srcFrame,
+                        int const maxParticles,
+                        T_Space const localDomainCellOffset,
+                        T_Identifier const domainCellIdxIdentifier,
+                        T_CellDescription const cellDesc) const
                     {
-                        // offset of the particle relative to the origin of the local domain
-                        DataSpace< numDims > const particleCellOffset =
-                            srcFrame[ srcParticleIdxCtx[ idx ] ][ domainCellIdxIdentifier ] -
-                            localDomainCellOffset;
-                        particlesSuperCellCtx[ idx ] = particleCellOffset / SuperCellSize::toRT( );
-                        linearParticlesSuperCellCtx[ idx ] =
-                            DataSpaceOperations< numDims >::map(
-                                numSuperCells,
-                                particlesSuperCellCtx[ idx ]
-                            );
-                        sharedLinearSuperCellIds[ linearIdx ] = linearParticlesSuperCellCtx[ idx ];
-                        DataSpace< numDims > const localCellIdx(
-                            particleCellOffset -
-                            particlesSuperCellCtx[ idx ] * SuperCellSize::toRT()
-                        );
-                        lCellIdxCtx[ idx ] =
-                            DataSpaceOperations< numDims >::template map< SuperCellSize >( localCellIdx );
-                    }
-                }
-            );
-
-            __syncthreads();
-
-            memory::CtxArray<
-                int,
-                ParticleDomCfg
-            >
-            masterVirtualThreadIdxCtx(
-                workerIdx,
-                [&](
-                    uint32_t const linearIdx,
-                    uint32_t const
-                )
-                {
-                    return linearIdx - 1;
-                }
-            );
+                        using namespace pmacc::particles::operations;
+                        using namespace mappings::threads;
 
-            forEachParticle(
-                [&](
-                    uint32_t const linearIdx,
-                    uint32_t const idx
-                )
-                {
-                    if( hasValidParticleCtx[ idx ] )
-                    {
-                        auto & vThreadMasterIdx = masterVirtualThreadIdxCtx[ idx ];
-                        /* search master thread index */
-                        while( vThreadMasterIdx >= 0 )
-                        {
-                            if(
-                                linearParticlesSuperCellCtx[ idx ] !=
-                                sharedLinearSuperCellIds[ vThreadMasterIdx ]
-                            )
-                                break;
-
-                            --vThreadMasterIdx;
-                        }
-                        ++vThreadMasterIdx;
-
-                        // load empty frame if virtual thread is the master
-                        if( vThreadMasterIdx == linearIdx )
-                        {
-                            /* counter[2] -> number of used frames */
-                            nvidia::atomicAllInc(
-                            acc,
-                                &( counter[ 2 ] ),
-                                ::alpaka::hierarchy::Blocks{}
-                            );
-                            DestFramePtr tmpFrame = destBox.getEmptyFrame( );
-                            destFramePtr[ linearIdx ] = tmpFrame;
-                            destBox.setAsFirstFrame(
+                        using SrcFrameType = T_SrcFrame;
+                        using DestFrameType = typename T_DestBox::FrameType;
+                        using DestFramePtr = typename T_DestBox::FramePtr;
+                        using SuperCellSize = typename DestFrameType::SuperCellSize;
+
+                        constexpr uint32_t numWorkers = T_numWorkers;
+                        constexpr uint32_t numDims = T_DestBox::Dim;
+                        constexpr uint32_t particlesPerFrame = math::CT::volume<SuperCellSize>::type::value;
+
+                        PMACC_SMEM(acc, destFramePtr, memory::Array<DestFramePtr, particlesPerFrame>);
+                        PMACC_SMEM(acc, sharedLinearSuperCellIds, memory::Array<int, particlesPerFrame>);
+                        PMACC_SMEM(acc, sharedSrcParticleOffset, int);
+
+                        uint32_t const workerIdx = cupla::threadIdx(acc).x;
+
+                        DataSpace<numDims> const numSuperCells(
+                            cellDesc.getGridSuperCells() - cellDesc.getGuardingSuperCells() * 2);
+
+                        ForEachIdx<IdxConfig<1, numWorkers>> onlyMaster{workerIdx};
+
+                        onlyMaster([&](uint32_t const, uint32_t const) {
+                            /* apply for work for the full block, counter[0] contains the
+                             * offset in srcFrame to load N particles
+                             */
+                            sharedSrcParticleOffset = cupla::atomicAdd(
                                 acc,
-                                tmpFrame,
-                                particlesSuperCellCtx[ idx ] + cellDesc.getGuardingSuperCells( )
-                            );
-                        }
+                                &(counter[0]),
+                                particlesPerFrame,
+                                ::alpaka::hierarchy::Blocks{});
+                        });
+
+                        cupla::__syncthreads(acc);
+
+                        using ParticleDomCfg = IdxConfig<particlesPerFrame, numWorkers>;
+
+                        memory::CtxArray<int, ParticleDomCfg> srcParticleIdxCtx{};
+
+                        memory::CtxArray<bool, ParticleDomCfg> hasValidParticleCtx{};
+
+                        // loop over all particles in the frame
+                        ForEachIdx<ParticleDomCfg> forEachParticle(workerIdx);
+
+                        forEachParticle([&](uint32_t const linearIdx, uint32_t const idx) {
+                            destFramePtr[linearIdx] = DestFramePtr{};
+                            sharedLinearSuperCellIds[linearIdx] = -1;
+
+                            srcParticleIdxCtx[idx] = sharedSrcParticleOffset + linearIdx;
+                            hasValidParticleCtx[idx] = srcParticleIdxCtx[idx] < maxParticles;
+                        });
+
+                        cupla::__syncthreads(acc);
+
+                        // supercell index of the particle relative to the origin of the local domain
+                        memory::CtxArray<DataSpace<numDims>, ParticleDomCfg> particlesSuperCellCtx{};
+
+                        // linear cell index of the particle within the destination frame
+                        memory::CtxArray<lcellId_t, ParticleDomCfg> lCellIdxCtx(INV_LOC_IDX);
+
+                        memory::CtxArray<int, ParticleDomCfg> linearParticlesSuperCellCtx(-1);
+
+                        forEachParticle([&](uint32_t const linearIdx, uint32_t const idx) {
+                            if(hasValidParticleCtx[idx])
+                            {
+                                // offset of the particle relative to the origin of the local domain
+                                DataSpace<numDims> const particleCellOffset
+                                    = srcFrame[srcParticleIdxCtx[idx]][domainCellIdxIdentifier]
+                                    - localDomainCellOffset;
+                                particlesSuperCellCtx[idx] = particleCellOffset / SuperCellSize::toRT();
+                                linearParticlesSuperCellCtx[idx]
+                                    = DataSpaceOperations<numDims>::map(numSuperCells, particlesSuperCellCtx[idx]);
+                                sharedLinearSuperCellIds[linearIdx] = linearParticlesSuperCellCtx[idx];
+                                DataSpace<numDims> const localCellIdx(
+                                    particleCellOffset - particlesSuperCellCtx[idx] * SuperCellSize::toRT());
+                                lCellIdxCtx[idx]
+                                    = DataSpaceOperations<numDims>::template map<SuperCellSize>(localCellIdx);
+                            }
+                        });
+
+                        cupla::__syncthreads(acc);
+
+                        memory::CtxArray<int, ParticleDomCfg> masterVirtualThreadIdxCtx(
+                            workerIdx,
+                            [&](uint32_t const linearIdx, uint32_t const) { return linearIdx - 1; });
+
+                        forEachParticle([&](uint32_t const linearIdx, uint32_t const idx) {
+                            if(hasValidParticleCtx[idx])
+                            {
+                                auto& vThreadMasterIdx = masterVirtualThreadIdxCtx[idx];
+                                /* search master thread index */
+                                while(vThreadMasterIdx >= 0)
+                                {
+                                    if(linearParticlesSuperCellCtx[idx] != sharedLinearSuperCellIds[vThreadMasterIdx])
+                                        break;
+
+                                    --vThreadMasterIdx;
+                                }
+                                ++vThreadMasterIdx;
+
+                                // load empty frame if virtual thread is the master
+                                if(vThreadMasterIdx == linearIdx)
+                                {
+                                    /* counter[2] -> number of used frames */
+                                    nvidia::atomicAllInc(acc, &(counter[2]), ::alpaka::hierarchy::Blocks{});
+                                    DestFramePtr tmpFrame = destBox.getEmptyFrame(acc);
+                                    destFramePtr[linearIdx] = tmpFrame;
+                                    destBox.setAsFirstFrame(
+                                        acc,
+                                        tmpFrame,
+                                        particlesSuperCellCtx[idx] + cellDesc.getGuardingSuperCells());
+                                }
+                            }
+                        });
+
+                        cupla::__syncthreads(acc);
+
+                        forEachParticle([&](uint32_t const linearIdx, uint32_t const idx) {
+                            if(hasValidParticleCtx[idx])
+                            {
+                                /* copy attributes and activate particle*/
+                                auto parDest = destFramePtr[masterVirtualThreadIdxCtx[idx]][linearIdx];
+                                auto parDestDeselect = deselect<bmpl::vector2<localCellIdx, multiMask>>(parDest);
+
+                                assign(parDestDeselect, srcFrame[srcParticleIdxCtx[idx]]);
+                                parDest[localCellIdx_] = lCellIdxCtx[idx];
+                                parDest[multiMask_] = 1;
+                                /* counter[1] -> number of loaded particles
+                                 * this counter is evaluated on host side
+                                 * (check that loaded particles by this kernel == loaded particles from HDF5 file)*/
+                                nvidia::atomicAllInc(acc, &(counter[1]), ::alpaka::hierarchy::Blocks{});
+                            }
+                        });
                     }
+                };
+            } // namespace kernel
+
+            /** Copy particles from big frame to PMacc frame structure
+             *  (Opposite to ConcatListOfFrames)
+             *
+             * - convert a user-defined domainCellIdx to localCellIdx
+             * - processed particles per block <= number of cells per superCell
+             *
+             * @tparam T_LogLvl type of the log level for debugging output
+             * @tparam T_DestSpecies pmacc::ParticlesBase, type of the destination species
+             * @tparam T_SrcFrame pmacc::ParticlesBox, type of the source particle frame
+             * @tparam T_Space pmacc::DataSpace, type for indices and offsets within the domain
+             * @tparam T_Identifier Identifier, type of the identifier for the total domain offset
+             * @tparam T_CellDescription pmacc::MappingDescription, type of the domain description
+             *
+             * @param destSpecies particle species instance whose deviceBuffer is written
+             * @param srcFrame device frame with particles (is used as source)
+             * @param numParticles number of particles in srcFrame
+             * @param chunkSize number of particles to process in one kernel call
+             * @param localDomainCellOffset offset in cells to user-defined domain (@see wiki PIConGPU domain
+             * definitions)
+             * @param domainCellIdxIdentifier the identifier for the particle domain cellIdx
+             *                                that is calculated back to the local domain
+             *                                with respect to localDomainCellOffset
+             * @param cellDesc supercell domain description
+             * @param logLvl Log level used for information logging
+             */
+            template<
+                typename T_LogLvl,
+                typename T_DestSpecies,
+                typename T_SrcFrame,
+                typename T_Space,
+                typename T_Identifier,
+                typename T_CellDescription>
+            HINLINE void splitIntoListOfFrames(
+                T_DestSpecies& destSpecies,
+                T_SrcFrame srcFrame,
+                uint32_t numParticles,
+                uint32_t const chunkSize,
+                T_Space const& localDomainCellOffset,
+                T_Identifier const domainCellIdxIdentifier,
+                T_CellDescription const& cellDesc,
+                T_LogLvl const& logLvl = T_LogLvl())
+            {
+                using SuperCellSize = typename T_CellDescription::SuperCellSize;
+                uint32_t const cellsInSuperCell = pmacc::math::CT::volume<SuperCellSize>::type::value;
+
+                /* counter is used to apply for work, count used frames and count loaded particles
+                 * [0] -> offset for loading particles
+                 * [1] -> number of loaded particles
+                 * [2] -> number of used frames
+                 *
+                 * all values are zero after initialization
+                 */
+                GridBuffer<uint32_t, DIM1> counterBuffer(DataSpace<DIM1>(3));
+
+                uint32_t const iterationsForLoad
+                    = math::float2int_ru(static_cast<double>(numParticles) / static_cast<double>(chunkSize));
+                uint32_t leftOverParticles = numParticles;
+
+                for(uint32_t i = 0; i < iterationsForLoad; ++i)
+                {
+                    /* only load a chunk of particles per iteration to avoid blow up of frame usage */
+                    uint32_t currentChunkSize = std::min(leftOverParticles, chunkSize);
+                    log(logLvl, "load particles on device chunk offset=%1%; chunk size=%2%; left particles %3%")
+                        % (i * chunkSize) % currentChunkSize % leftOverParticles;
+
+                    constexpr uint32_t numWorkers
+                        = pmacc::traits::GetNumWorkers<pmacc::math::CT::volume<SuperCellSize>::type::value>::value;
+
+                    PMACC_KERNEL(kernel::SplitIntoListOfFrames<numWorkers>{})
+                    (math::float2int_ru(double(currentChunkSize) / double(cellsInSuperCell)), numWorkers)(
+                        counterBuffer.getDeviceBuffer().getDataBox(),
+                        destSpecies.getDeviceParticlesBox(),
+                        srcFrame,
+                        static_cast<int>(numParticles),
+                        localDomainCellOffset,
+                        domainCellIdxIdentifier,
+                        cellDesc);
+                    destSpecies.fillAllGaps();
+                    leftOverParticles -= currentChunkSize;
                 }
-            );
 
-            __syncthreads();
+                counterBuffer.deviceToHost();
+                log(logLvl, "wait for last processed chunk: %1%") % T_SrcFrame::getName();
+
+                __getTransactionEvent().waitForFinished();
 
-            forEachParticle(
-                [&](
-                    uint32_t const linearIdx,
-                    uint32_t const idx
-                )
+                log(logLvl, "used frames to load particles: %1%") % counterBuffer.getHostBuffer().getDataBox()[2];
+
+                if(static_cast<uint64_t>(counterBuffer.getHostBuffer().getDataBox()[1]) != numParticles)
                 {
-                    if( hasValidParticleCtx[ idx ] )
-                    {
-                        /* copy attributes and activate particle*/
-                        auto parDest = destFramePtr[ masterVirtualThreadIdxCtx[ idx ] ][ linearIdx ];
-                        auto parDestDeselect = deselect<
-                            bmpl::vector2<
-                                localCellIdx,
-                                multiMask
-                            >
-                        >( parDest );
-
-                        assign(
-                            parDestDeselect,
-                            srcFrame[ srcParticleIdxCtx[ idx ] ]
-                        );
-                        parDest[ localCellIdx_ ] = lCellIdxCtx[ idx ];
-                        parDest[ multiMask_ ] = 1;
-                        /* counter[1] -> number of loaded particles
-                         * this counter is evaluated on host side
-                         * (check that loaded particles by this kernel == loaded particles from HDF5 file)*/
-                        nvidia::atomicAllInc(
-                            acc,
-                            &( counter[ 1 ] ),
-                            ::alpaka::hierarchy::Blocks{}
-                        );
-                    }
+                    log(logLvl, "error load species | counter is %1% but should %2%")
+                        % counterBuffer.getHostBuffer().getDataBox()[1] % numParticles;
+                    throw std::runtime_error("Failed to load expected number of particles to GPU.");
                 }
-            );
-        }
-    };
-} // namespace kernel
-
-    /** Copy particles from big frame to PMacc frame structure
-     *  (Opposite to ConcatListOfFrames)
-     *
-     * - convert a user-defined domainCellIdx to localCellIdx
-     * - processed particles per block <= number of cells per superCell
-     *
-     * @tparam T_LogLvl type of the loc level for debuging output
-     * @tparam T_DestSpecies pmacc:ParticlesBase, type of the destination species
-     * @tparam T_SrcFrame pmacc:ParticlesBox, type of the source particle frame
-     * @tparam T_Space pmacc::DataSpace, type for indicies and offsets within the domain
-     * @tparam T_Identifier Identifier, type of the identifier for the total domain offset
-     * @tparam T_CellDescription pmacc::MappingDescription, type of the domain description
-     *
-     * @param destSpecies particle species instance whose deviceBuffer is written
-     * @param srcFrame device frame with particles (is used as source)
-     * @param numParticles number of particles in srcFrame
-     * @param chunkSize number of particles to process in one kernel call
-     * @param localDomainCellOffset offset in cells to user-defined domain (@see wiki PIConGPU domain definitions)
-     * @param domainCellIdxIdentifier the identifier for the particle domain cellIdx
-     *                                that is calculated back to the local domain
-     *                                with respect to localDomainCellOffset
-     * @param cellDesc supercell domain description
-     * @param logLvl Log level used for information logging
-     */
-    template<
-        typename T_LogLvl,
-        typename T_DestSpecies,
-        typename T_SrcFrame,
-        typename T_Space,
-        typename T_Identifier,
-        typename T_CellDescription
-    >
-    HINLINE void splitIntoListOfFrames(
-        T_DestSpecies & destSpecies,
-        T_SrcFrame srcFrame,
-        uint32_t numParticles,
-        uint32_t const chunkSize,
-        T_Space const & localDomainCellOffset,
-        T_Identifier const domainCellIdxIdentifier,
-        T_CellDescription const & cellDesc,
-        T_LogLvl const & logLvl = T_LogLvl( )
-    )
-    {
-        using SuperCellSize = typename T_CellDescription::SuperCellSize;
-        uint32_t const cellsInSuperCell = pmacc::math::CT::volume< SuperCellSize >::type::value;
-
-        /* counter is used to apply for work, count used frames and count loaded particles
-         * [0] -> offset for loading particles
-         * [1] -> number of loaded particles
-         * [2] -> number of used frames
-         *
-         * all values are zero after initialization
-         */
-        GridBuffer<
-            uint32_t,
-            DIM1
-        > counterBuffer( DataSpace<DIM1>( 3 ) );
-
-        uint32_t const iterationsForLoad = algorithms::math::float2int_ru(
-            static_cast< double >( numParticles ) /
-            static_cast< double >( chunkSize )
-        );
-        uint32_t leftOverParticles = numParticles;
-
-        for( uint32_t i = 0; i < iterationsForLoad; ++i )
-        {
-            /* only load a chunk of particles per iteration to avoid blow up of frame usage */
-            uint32_t currentChunkSize = std::min(
-                leftOverParticles,
-                chunkSize
-            );
-            log(
-                logLvl,
-                "load particles on device chunk offset=%1%; chunk size=%2%; left particles %3%"
-            ) % ( i * chunkSize ) %
-                currentChunkSize %
-                leftOverParticles;
-
-            constexpr uint32_t numWorkers = pmacc::traits::GetNumWorkers<
-                pmacc::math::CT::volume< SuperCellSize >::type::value
-            >::value;
-
-            PMACC_KERNEL( kernel::SplitIntoListOfFrames< numWorkers >{ } )(
-                algorithms::math::float2int_ru( double( currentChunkSize ) / double( cellsInSuperCell ) ),
-                numWorkers
-            )(
-                counterBuffer.getDeviceBuffer( ).getDataBox( ),
-                destSpecies.getDeviceParticlesBox( ),
-                srcFrame,
-                static_cast< int >( numParticles ),
-                localDomainCellOffset,
-                domainCellIdxIdentifier,
-                cellDesc
-            );
-            destSpecies.fillAllGaps( );
-            leftOverParticles -= currentChunkSize;
-        }
-
-        counterBuffer.deviceToHost( );
-        log(
-            logLvl,
-            "wait for last processed chunk: %1%"
-        ) % T_SrcFrame::getName( );
-
-        __getTransactionEvent( ).waitForFinished( );
-
-        log(
-            logLvl,
-            "used frames to load particles: %1%"
-        ) % counterBuffer.getHostBuffer( ).getDataBox( )[ 2 ];
-
-        if(
-            static_cast<uint64_t>( counterBuffer.getHostBuffer().getDataBox( )[ 1 ] ) !=
-            numParticles
-        )
-        {
-            log(
-                logLvl,
-                "error load species | counter is %1% but should %2%"
-            ) % counterBuffer.getHostBuffer( ).getDataBox( )[ 1 ] %
-                numParticles;
-            throw std::runtime_error( "Failed to load expected number of particles to GPU." );
-        }
-    }
-
-} // namespace operations
-} // namespace particles
+            }
+
+        } // namespace operations
+    } // namespace particles
 } // namespace pmacc
diff --git a/include/pmacc/particles/particleFilter/FilterFactory.hpp b/include/pmacc/particles/particleFilter/FilterFactory.hpp
index 92b3237b3d..649d8465e8 100644
--- a/include/pmacc/particles/particleFilter/FilterFactory.hpp
+++ b/include/pmacc/particles/particleFilter/FilterFactory.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Rene Widera
+/* Copyright 2013-2021 Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -37,27 +37,12 @@
 
 namespace pmacc
 {
-
-
-
-template<typename UserTypeList = bmpl::vector<NullFrame> >
+    template<typename UserTypeList = bmpl::vector<NullFrame>>
     class FilterFactory
-{
-public:
-
-    typedef
-    typename LinearInherit
-    <
-        typename MakeSeq<
-           DefaultFilter<> ,
-           UserTypeList,
-           TrueFilter
-        >::type
-    >::type FilterType;
-
-};
-
-}//namespace pmacc
-
-
+    {
+    public:
+        typedef
+            typename LinearInherit<typename MakeSeq<DefaultFilter<>, UserTypeList, TrueFilter>::type>::type FilterType;
+    };
 
+} // namespace pmacc
diff --git a/include/pmacc/particles/particleFilter/PositionFilter.hpp b/include/pmacc/particles/particleFilter/PositionFilter.hpp
index 40ecf1eb83..47abf22426 100644
--- a/include/pmacc/particles/particleFilter/PositionFilter.hpp
+++ b/include/pmacc/particles/particleFilter/PositionFilter.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera
+/* Copyright 2013-2021 Heiko Burau, Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -29,87 +29,82 @@
 
 namespace pmacc
 {
-
-
-namespace privatePositionFilter
-{
-
-template<unsigned T_dim, class Base = NullFrame>
-class PositionFilter : public Base
-{
-public:
-    static constexpr uint32_t dim = T_dim;
-protected:
-    DataSpace<dim> offset;
-    DataSpace<dim> max;
-    DataSpace<dim> superCellIdx;
-
-public:
-
-    HDINLINE PositionFilter()
+    namespace privatePositionFilter
     {
-    }
-
-    HDINLINE void setWindowPosition(DataSpace<dim> offset, DataSpace<dim> size)
+        template<unsigned T_dim, class Base = NullFrame>
+        class PositionFilter : public Base
+        {
+        public:
+            static constexpr uint32_t dim = T_dim;
+
+        protected:
+            DataSpace<dim> offset;
+            DataSpace<dim> max;
+            DataSpace<dim> superCellIdx;
+
+        public:
+            HDINLINE PositionFilter()
+            {
+            }
+
+            HDINLINE void setWindowPosition(DataSpace<dim> offset, DataSpace<dim> size)
+            {
+                this->offset = offset;
+                this->max = offset + size;
+            }
+
+            HDINLINE void setSuperCellPosition(DataSpace<dim> superCellIdx)
+            {
+                this->superCellIdx = superCellIdx;
+            }
+
+            HDINLINE DataSpace<dim> getOffset()
+            {
+                return offset;
+            }
+
+            template<class FRAME>
+            HDINLINE bool operator()(FRAME& frame, lcellId_t id)
+            {
+                DataSpace<dim> localCellIdx = DataSpaceOperations<dim>::template map<typename FRAME::SuperCellSize>(
+                    (uint32_t)(frame[id][localCellIdx_]));
+                DataSpace<dim> pos = this->superCellIdx + localCellIdx;
+                bool result = true;
+                for(uint32_t d = 0; d < dim; ++d)
+                    result = result && (this->offset[d] <= pos[d]) && (pos[d] < this->max[d]);
+                return Base::operator()(frame, id) && result;
+            }
+        };
+
+    } // namespace privatePositionFilter
+
+    /** This wrapper class is needed because for filters we are only allowed to
+     * define one template parameter "base" (it is a constrain from FilterFactory)
+     */
+    template<class Base = NullFrame>
+    class PositionFilter3D : public privatePositionFilter::PositionFilter<DIM3, Base>
     {
-        this->offset = offset;
-        this->max = offset + size;
-    }
+    };
 
-    HDINLINE void setSuperCellPosition(DataSpace<dim> superCellIdx)
+    template<class Base = NullFrame>
+    class PositionFilter2D : public privatePositionFilter::PositionFilter<DIM2, Base>
     {
-        this->superCellIdx = superCellIdx;
-    }
+    };
 
-    HDINLINE DataSpace<dim> getOffset()
-    {
-        return offset;
-    }
+    template<unsigned dim>
+    struct GetPositionFilter;
 
-    template<class FRAME>
-    HDINLINE bool operator()(FRAME & frame, lcellId_t id)
+    template<>
+    struct GetPositionFilter<DIM3>
     {
-        DataSpace<dim> localCellIdx = DataSpaceOperations<dim>::template map<
-            typename FRAME::SuperCellSize
-            > ((uint32_t) (frame[id][localCellIdx_]));
-        DataSpace<dim> pos = this->superCellIdx + localCellIdx;
-        bool result = true;
-        for (uint32_t d = 0; d < dim; ++d)
-            result= result && (this->offset[d] <= pos[d]) && (pos[d]<this->max[d]);
-        return Base::operator() (frame, id) && result;
-    }
-
-};
-
-} //namespace privatePositionFilter
-
-/** This wrapper class is needed because for filters we are only allowed to
- * define one template parameter "base" (it is a constrain from FilterFactory)
- */
-template<class Base = NullFrame>
-class PositionFilter3D : public privatePositionFilter::PositionFilter<DIM3, Base>
-{
-};
-
-template<class Base = NullFrame>
-class PositionFilter2D : public privatePositionFilter::PositionFilter<DIM2, Base>
-{
-};
-
-template<unsigned dim>
-struct GetPositionFilter;
+        typedef PositionFilter3D<> type;
+    };
 
-template<>
-struct GetPositionFilter<DIM3>
-{
-    typedef PositionFilter3D<> type;
-};
-
-template<>
-struct GetPositionFilter<DIM2>
-{
-    typedef PositionFilter2D<> type;
-};
+    template<>
+    struct GetPositionFilter<DIM2>
+    {
+        typedef PositionFilter2D<> type;
+    };
 
 
-} //namespace pmacc
+} // namespace pmacc
diff --git a/include/pmacc/particles/particleFilter/system/DefaultFilter.hpp b/include/pmacc/particles/particleFilter/system/DefaultFilter.hpp
index 1825772ec4..6e588a2a52 100644
--- a/include/pmacc/particles/particleFilter/system/DefaultFilter.hpp
+++ b/include/pmacc/particles/particleFilter/system/DefaultFilter.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Rene Widera, Benjamin Worpitz
+/* Copyright 2013-2021 Rene Widera, Benjamin Worpitz
  *
  * This file is part of PMacc.
  *
@@ -27,22 +27,21 @@
 
 namespace pmacc
 {
-
-
-template<class Base = NullFrame>
-class DefaultFilter : public Base
-{
+    template<class Base = NullFrame>
+    class DefaultFilter : public Base
+    {
     private:
         bool filterActive;
-    public:
 
+    public:
         HDINLINE DefaultFilter() : filterActive(false)
-        {}
+        {
+        }
 
         template<class FRAME>
-        HDINLINE bool operator()(FRAME & frame,lcellId_t id)
+        HDINLINE bool operator()(FRAME& frame, lcellId_t id)
         {
-            return (!filterActive)||Base::operator() (frame,id);
+            return (!filterActive) || Base::operator()(frame, id);
         }
 
         /*disable or enable filter
@@ -51,41 +50,42 @@ class DefaultFilter : public Base
          */
         HDINLINE void setStatus(bool active)
         {
-            filterActive=active;
+            filterActive = active;
         }
 
         HDINLINE bool getStatus()
         {
             return filterActive;
         }
-};
+    };
 
-template<>
-class DefaultFilter<NullFrame>
-{
+    template<>
+    class DefaultFilter<NullFrame>
+    {
     private:
         bool alwaysTrue;
-    public:
 
+    public:
         HDINLINE DefaultFilter() : alwaysTrue(true)
-        {}
+        {
+        }
 
         template<class FRAME>
-        HDINLINE bool operator()(FRAME & frame,lcellId_t id)
+        HDINLINE bool operator()(FRAME& frame, lcellId_t id)
         {
             return alwaysTrue;
         }
 
         HDINLINE void setDefault(bool value)
         {
-            alwaysTrue=value;
+            alwaysTrue = value;
         }
 
         HDINLINE bool getDefault()
         {
             return alwaysTrue;
         }
-};
+    };
 
 
-} //namespace Frame
+} // namespace pmacc
diff --git a/include/pmacc/particles/particleFilter/system/FalseFilter.hpp b/include/pmacc/particles/particleFilter/system/FalseFilter.hpp
index dcde30aace..10c1dfa2cd 100644
--- a/include/pmacc/particles/particleFilter/system/FalseFilter.hpp
+++ b/include/pmacc/particles/particleFilter/system/FalseFilter.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Rene Widera, Benjamin Worpitz
+/* Copyright 2013-2021 Rene Widera, Benjamin Worpitz
  *
  * This file is part of PMacc.
  *
@@ -27,12 +27,9 @@
 
 namespace pmacc
 {
-
     class FalseFilter
     {
-
     public:
-
         FalseFilter()
         {
         }
@@ -42,10 +39,10 @@ namespace pmacc
         }
 
         template<class FRAME>
-        bool operator()(FRAME & frame, lcellId_t id)
+        bool operator()(FRAME& frame, lcellId_t id)
         {
             return false;
         }
     };
 
-} //namespace Frame
+} // namespace pmacc
diff --git a/include/pmacc/particles/particleFilter/system/TrueFilter.hpp b/include/pmacc/particles/particleFilter/system/TrueFilter.hpp
index 4d0f82f20e..6bd93eeaf9 100644
--- a/include/pmacc/particles/particleFilter/system/TrueFilter.hpp
+++ b/include/pmacc/particles/particleFilter/system/TrueFilter.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Rene Widera, Benjamin Worpitz
+/* Copyright 2013-2021 Rene Widera, Benjamin Worpitz
  *
  * This file is part of PMacc.
  *
@@ -27,12 +27,9 @@
 
 namespace pmacc
 {
-
     class TrueFilter
     {
-
     public:
-
         HDINLINE TrueFilter()
         {
         }
@@ -44,4 +41,4 @@ namespace pmacc
         }
     };
 
-} //namespace Frame
+} // namespace pmacc
diff --git a/include/pmacc/particles/policies/DeleteParticles.hpp b/include/pmacc/particles/policies/DeleteParticles.hpp
index 7924dd3fda..d08cecdbd0 100644
--- a/include/pmacc/particles/policies/DeleteParticles.hpp
+++ b/include/pmacc/particles/policies/DeleteParticles.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2015-2020 Alexander Grund
+/* Copyright 2015-2021 Alexander Grund
  *
  * This file is part of PMacc.
  *
@@ -21,28 +21,29 @@
 
 #pragma once
 
-namespace pmacc{
-namespace particles {
-namespace policies {
-
-    /**
-     * Policy for HandleGuardParticles that removes all particles from guard cells
-     */
-    struct DeleteParticles
+namespace pmacc
+{
+    namespace particles
     {
-        template< class T_Particles >
-        void
-        handleOutgoing(T_Particles& par, int32_t direction) const
+        namespace policies
         {
-            par.deleteGuardParticles(direction);
-        }
+            /**
+             * Policy for HandleGuardParticles that removes all particles from guard cells
+             */
+            struct DeleteParticles
+            {
+                template<class T_Particles>
+                void handleOutgoing(T_Particles& par, int32_t direction) const
+                {
+                    par.deleteGuardParticles(direction);
+                }
 
-        template< class T_Particles >
-        void
-        handleIncoming(T_Particles& par, int32_t direction) const
-        {}
-    };
+                template<class T_Particles>
+                void handleIncoming(T_Particles& par, int32_t direction) const
+                {
+                }
+            };
 
-}  // namespace policies
-}  // namespace particles
-}  // namespace pmacc
+        } // namespace policies
+    } // namespace particles
+} // namespace pmacc
diff --git a/include/pmacc/particles/policies/ExchangeParticles.hpp b/include/pmacc/particles/policies/ExchangeParticles.hpp
index 1a431131e1..e92506bb37 100644
--- a/include/pmacc/particles/policies/ExchangeParticles.hpp
+++ b/include/pmacc/particles/policies/ExchangeParticles.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2015-2020 Alexander Grund
+/* Copyright 2015-2021 Alexander Grund
  *
  * This file is part of PMacc.
  *
@@ -24,31 +24,31 @@
 #include "pmacc/types.hpp"
 #include "pmacc/Environment.hpp"
 
-namespace pmacc{
-namespace particles {
-namespace policies {
-
-    /**
-     * Policy for \see HandleGuardRegion that moves particles from guard cells to exchange buffers
-     * and sends those to the correct neighbors
-     */
-    struct ExchangeParticles
+namespace pmacc
+{
+    namespace particles
     {
-        template< class T_Particles >
-        void
-        handleOutgoing(T_Particles& par, int32_t direction) const
+        namespace policies
         {
-            Environment<>::get().ParticleFactory().createTaskSendParticlesExchange(par, direction);
-        }
+            /**
+             * Policy for \see HandleGuardRegion that moves particles from guard cells to exchange buffers
+             * and sends those to the correct neighbors
+             */
+            struct ExchangeParticles
+            {
+                template<class T_Particles>
+                void handleOutgoing(T_Particles& par, int32_t direction) const
+                {
+                    Environment<>::get().ParticleFactory().createTaskSendParticlesExchange(par, direction);
+                }
 
-        template< class T_Particles >
-        void
-        handleIncoming(T_Particles& par, int32_t direction) const
-        {
-            Environment<>::get().ParticleFactory().createTaskReceiveParticlesExchange(par, direction);
-        }
-    };
+                template<class T_Particles>
+                void handleIncoming(T_Particles& par, int32_t direction) const
+                {
+                    Environment<>::get().ParticleFactory().createTaskReceiveParticlesExchange(par, direction);
+                }
+            };
 
-}  // namespace policies
-}  // namespace particles
-}  // namespace pmacc
+        } // namespace policies
+    } // namespace particles
+} // namespace pmacc
diff --git a/include/pmacc/particles/tasks/ParticleFactory.hpp b/include/pmacc/particles/tasks/ParticleFactory.hpp
index 8b30832b54..4f71881cd1 100644
--- a/include/pmacc/particles/tasks/ParticleFactory.hpp
+++ b/include/pmacc/particles/tasks/ParticleFactory.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Rene Widera
+/* Copyright 2013-2021 Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -27,7 +27,6 @@
 
 namespace pmacc
 {
-
     /**
      * Singleton Factory-pattern class for creation of several types of EventTasks.
      * Tasks are not actually 'returned' but immediately initialised and
@@ -36,38 +35,40 @@ namespace pmacc
     class ParticleFactory
     {
     public:
-
         /**
          * Creates a TaskReceive.
          * @param ex Exchange to create new TaskReceive with
          * @param task_out returns the newly created task
-         * @param registeringTask optional pointer to an ITask which should be registered at the new task as an observer
+         * @param registeringTask optional pointer to an ITask which should be registered at the new task as an
+         * observer
          */
         template<class ParBase>
-        EventTask createTaskParticlesReceive(ParBase &parBuffer,
-        ITask *registeringTask = nullptr);
+        EventTask createTaskParticlesReceive(ParBase& parBuffer, ITask* registeringTask = nullptr);
 
         template<class ParBase>
-        EventTask createTaskReceiveParticlesExchange(ParBase &parBase, uint32_t exchange,
-        ITask *registeringTask = nullptr);
+        EventTask createTaskReceiveParticlesExchange(
+            ParBase& parBase,
+            uint32_t exchange,
+            ITask* registeringTask = nullptr);
 
         /**
          * Creates a TaskSend.
          * @param ex Exchange to create new TaskSend with
          * @param task_in TaskReceive to register at new TaskSend
-         * @param registeringTask optional pointer to an ITask which should be registered at the new task as an observer
+         * @param registeringTask optional pointer to an ITask which should be registered at the new task as an
+         * observer
          */
         template<class ParBase>
-        EventTask createTaskParticlesSend(ParBase &parBase,
-        ITask *registeringTask = nullptr);
+        EventTask createTaskParticlesSend(ParBase& parBase, ITask* registeringTask = nullptr);
 
         template<class ParBase>
-        EventTask createTaskSendParticlesExchange(ParBase &parBase, uint32_t exchange,
-        ITask *registeringTask = nullptr);
+        EventTask createTaskSendParticlesExchange(
+            ParBase& parBase,
+            uint32_t exchange,
+            ITask* registeringTask = nullptr);
 
 
     private:
-
         friend struct detail::Environment;
 
         /**
@@ -80,10 +81,9 @@ namespace pmacc
             return instance;
         }
 
-        ParticleFactory() { };
-
-        ParticleFactory(const ParticleFactory&) { };
+        ParticleFactory(){};
 
+        ParticleFactory(const ParticleFactory&){};
     };
 
-} //namespace pmacc
+} // namespace pmacc
diff --git a/include/pmacc/particles/tasks/ParticleFactory.tpp b/include/pmacc/particles/tasks/ParticleFactory.tpp
index 7698dc3772..23228f4681 100644
--- a/include/pmacc/particles/tasks/ParticleFactory.tpp
+++ b/include/pmacc/particles/tasks/ParticleFactory.tpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Rene Widera
+/* Copyright 2013-2021 Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -33,47 +33,43 @@
 
 namespace pmacc
 {
-
     template<class ParBase>
-    inline EventTask ParticleFactory::createTaskParticlesReceive(ParBase &parBase,
-    ITask *registeringTask)
+    inline EventTask ParticleFactory::createTaskParticlesReceive(ParBase& parBase, ITask* registeringTask)
     {
-        TaskParticlesReceive<ParBase>* task = new TaskParticlesReceive<ParBase > (parBase);
+        TaskParticlesReceive<ParBase>* task = new TaskParticlesReceive<ParBase>(parBase);
 
         return Environment<>::get().Factory().startTask(*task, registeringTask);
     }
 
     template<class ParBase>
-    inline EventTask ParticleFactory::createTaskReceiveParticlesExchange(ParBase &parBase, uint32_t exchange,
-    ITask *registeringTask)
+    inline EventTask ParticleFactory::createTaskReceiveParticlesExchange(
+        ParBase& parBase,
+        uint32_t exchange,
+        ITask* registeringTask)
     {
-        TaskReceiveParticlesExchange<ParBase>* task = new TaskReceiveParticlesExchange<ParBase > (parBase, exchange);
+        TaskReceiveParticlesExchange<ParBase>* task = new TaskReceiveParticlesExchange<ParBase>(parBase, exchange);
 
         return Environment<>::get().Factory().startTask(*task, registeringTask);
     }
 
     template<class ParBase>
-    inline EventTask ParticleFactory::createTaskParticlesSend(ParBase &parBase,
-    ITask *registeringTask)
+    inline EventTask ParticleFactory::createTaskParticlesSend(ParBase& parBase, ITask* registeringTask)
     {
-        TaskParticlesSend<ParBase>* task = new TaskParticlesSend<ParBase > (parBase);
+        TaskParticlesSend<ParBase>* task = new TaskParticlesSend<ParBase>(parBase);
 
         return Environment<>::get().Factory().startTask(*task, registeringTask);
     }
 
     template<class ParBase>
-    inline EventTask ParticleFactory::createTaskSendParticlesExchange(ParBase &parBase, uint32_t exchange,
-    ITask *registeringTask)
+    inline EventTask ParticleFactory::createTaskSendParticlesExchange(
+        ParBase& parBase,
+        uint32_t exchange,
+        ITask* registeringTask)
     {
-        TaskSendParticlesExchange<ParBase>* task = new TaskSendParticlesExchange<ParBase > (parBase, exchange);
+        TaskSendParticlesExchange<ParBase>* task = new TaskSendParticlesExchange<ParBase>(parBase, exchange);
 
         return Environment<>::get().Factory().startTask(*task, registeringTask);
     }
 
 
-
-} //namespace pmacc
-
-
-
-
+} // namespace pmacc
diff --git a/include/pmacc/particles/tasks/TaskParticlesReceive.hpp b/include/pmacc/particles/tasks/TaskParticlesReceive.hpp
index ee96923fec..c604bfd604 100644
--- a/include/pmacc/particles/tasks/TaskParticlesReceive.hpp
+++ b/include/pmacc/particles/tasks/TaskParticlesReceive.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Rene Widera
+/* Copyright 2013-2021 Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -28,12 +28,10 @@
 
 namespace pmacc
 {
-
     template<class T_Particles>
     class TaskParticlesReceive : public MPITask
     {
     public:
-
         typedef T_Particles Particles;
         typedef typename Particles::HandleGuardRegion HandleGuardRegion;
         typedef typename HandleGuardRegion::HandleExchanged HandleExchanged;
@@ -45,9 +43,9 @@ namespace pmacc
             Exchanges = traits::NumberOfExchanges<Dim>::value
         };
 
-        TaskParticlesReceive(Particles &parBase) :
-        parBase(parBase),
-        state(Constructor){ }
+        TaskParticlesReceive(Particles& parBase) : parBase(parBase), state(Constructor)
+        {
+        }
 
         virtual void init()
         {
@@ -56,13 +54,13 @@ namespace pmacc
             HandleExchanged handleExchanged;
             HandleNotExchanged handleNotExchanged;
 
-            for (int i = 1; i < Exchanges; ++i)
+            for(int i = 1; i < Exchanges; ++i)
             {
                 /* Start new transaction */
                 __startTransaction(serialEvent);
 
                 /* Handle particles */
-                if (parBase.getParticlesBuffer().hasReceiveExchange(i))
+                if(parBase.getParticlesBuffer().hasReceiveExchange(i))
                     handleExchanged.handleIncoming(parBase, i);
                 else
                     handleNotExchanged.handleIncoming(parBase, i);
@@ -76,27 +74,27 @@ namespace pmacc
 
         bool executeIntern()
         {
-            switch (state)
+            switch(state)
             {
-                case Init:
-                    break;
-                case WaitForReceived:
-                    if (nullptr == Environment<>::get().Manager().getITaskIfNotFinished(tmpEvent.getTaskId()))
-                        state = CallFillGaps;
-                    break;
-                case CallFillGaps:
-                    state = WaitForFillGaps;
-                    __startTransaction();
-                    parBase.fillBorderGaps();
-                    tmpEvent = __endTransaction();
-                    state = Finish;
-                    break;
-                case WaitForFillGaps:
-                    break;
-                case Finish:
-                    return nullptr == Environment<>::get().Manager().getITaskIfNotFinished(tmpEvent.getTaskId());
-                default:
-                    return false;
+            case Init:
+                break;
+            case WaitForReceived:
+                if(nullptr == Environment<>::get().Manager().getITaskIfNotFinished(tmpEvent.getTaskId()))
+                    state = CallFillGaps;
+                break;
+            case CallFillGaps:
+                state = WaitForFillGaps;
+                __startTransaction();
+                parBase.fillBorderGaps();
+                tmpEvent = __endTransaction();
+                state = Finish;
+                break;
+            case WaitForFillGaps:
+                break;
+            case Finish:
+                return nullptr == Environment<>::get().Manager().getITaskIfNotFinished(tmpEvent.getTaskId());
+            default:
+                return false;
             }
 
             return false;
@@ -107,7 +105,9 @@ namespace pmacc
             notify(this->myId, RECVFINISHED, nullptr);
         }
 
-        void event(id_t, EventType, IEventData*) { }
+        void event(id_t, EventType, IEventData*)
+        {
+        }
 
         std::string toString()
         {
@@ -115,7 +115,6 @@ namespace pmacc
         }
 
     private:
-
         enum state_t
         {
             Constructor,
@@ -131,7 +130,6 @@ namespace pmacc
         Particles& parBase;
         state_t state;
         EventTask tmpEvent;
-
     };
 
-} //namespace pmacc
+} // namespace pmacc
diff --git a/include/pmacc/particles/tasks/TaskParticlesSend.hpp b/include/pmacc/particles/tasks/TaskParticlesSend.hpp
index 706be03e62..cbff5240ed 100644
--- a/include/pmacc/particles/tasks/TaskParticlesSend.hpp
+++ b/include/pmacc/particles/tasks/TaskParticlesSend.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Rene Widera
+/* Copyright 2013-2021 Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -27,97 +27,92 @@
 
 namespace pmacc
 {
-
-template<class T_Particles>
-class TaskParticlesSend : public MPITask
-{
-public:
-
-    typedef T_Particles Particles;
-    typedef typename Particles::HandleGuardRegion HandleGuardRegion;
-    typedef typename HandleGuardRegion::HandleExchanged HandleExchanged;
-    typedef typename HandleGuardRegion::HandleNotExchanged HandleNotExchanged;
-
-    enum
-    {
-        Dim = Particles::Dim,
-        Exchanges = traits::NumberOfExchanges<Dim>::value
-    };
-
-    TaskParticlesSend(Particles &parBase) :
-    parBase(parBase),
-    state(Constructor)
+    template<class T_Particles>
+    class TaskParticlesSend : public MPITask
     {
-    }
+    public:
+        typedef T_Particles Particles;
+        typedef typename Particles::HandleGuardRegion HandleGuardRegion;
+        typedef typename HandleGuardRegion::HandleExchanged HandleExchanged;
+        typedef typename HandleGuardRegion::HandleNotExchanged HandleNotExchanged;
 
-    virtual void init()
-    {
-        state = Init;
-        EventTask serialEvent = __getTransactionEvent();
-        HandleExchanged handleExchanged;
-        HandleNotExchanged handleNotExchanged;
-
-        for (int i = 1; i < Exchanges; ++i)
+        enum
         {
-            /* Start new transaction */
-            __startTransaction(serialEvent);
+            Dim = Particles::Dim,
+            Exchanges = traits::NumberOfExchanges<Dim>::value
+        };
 
-            /* Handle particles */
-            if (parBase.getParticlesBuffer().hasSendExchange(i))
-                handleExchanged.handleOutgoing(parBase, i);
-            else
-                handleNotExchanged.handleOutgoing(parBase, i);
-
-            /* End transaction */
-            tmpEvent += __endTransaction();
+        TaskParticlesSend(Particles& parBase) : parBase(parBase), state(Constructor)
+        {
         }
 
-        state = WaitForSend;
-    }
+        virtual void init()
+        {
+            state = Init;
+            EventTask serialEvent = __getTransactionEvent();
+            HandleExchanged handleExchanged;
+            HandleNotExchanged handleNotExchanged;
+
+            for(int i = 1; i < Exchanges; ++i)
+            {
+                /* Start new transaction */
+                __startTransaction(serialEvent);
+
+                /* Handle particles */
+                if(parBase.getParticlesBuffer().hasSendExchange(i))
+                    handleExchanged.handleOutgoing(parBase, i);
+                else
+                    handleNotExchanged.handleOutgoing(parBase, i);
+
+                /* End transaction */
+                tmpEvent += __endTransaction();
+            }
+
+            state = WaitForSend;
+        }
 
-    bool executeIntern()
-    {
-        switch (state)
+        bool executeIntern()
         {
-        case Init:
-            break;
-        case WaitForSend:
-            return nullptr == Environment<>::get().Manager().getITaskIfNotFinished(tmpEvent.getTaskId());
-        default:
+            switch(state)
+            {
+            case Init:
+                break;
+            case WaitForSend:
+                return nullptr == Environment<>::get().Manager().getITaskIfNotFinished(tmpEvent.getTaskId());
+            default:
+                return false;
+            }
+
             return false;
         }
 
-        return false;
-    }
+        virtual ~TaskParticlesSend()
+        {
+            notify(this->myId, RECVFINISHED, nullptr);
+        }
 
-    virtual ~TaskParticlesSend()
-    {
-        notify(this->myId, RECVFINISHED, nullptr);
-    }
+        void event(id_t, EventType, IEventData*)
+        {
+        }
 
-    void event(id_t, EventType, IEventData*)
-    {
-    }
+        std::string toString()
+        {
+            return "TaskParticlesSend";
+        }
 
-    std::string toString()
-    {
-        return "TaskParticlesSend";
-    }
+    private:
+        enum state_t
+        {
+            Constructor,
+            Init,
+            WaitForSend
 
-private:
+        };
 
-    enum state_t
-    {
-        Constructor,
-        Init,
-        WaitForSend
 
+        Particles& parBase;
+        state_t state;
+        EventTask tmpEvent;
     };
 
-
-    Particles& parBase;
-    state_t state;
-    EventTask tmpEvent;
-};
-
-} //namespace pmacc
+} // namespace pmacc
diff --git a/include/pmacc/particles/tasks/TaskReceiveParticlesExchange.hpp b/include/pmacc/particles/tasks/TaskReceiveParticlesExchange.hpp
index d583dbadc9..76285c0fec 100644
--- a/include/pmacc/particles/tasks/TaskReceiveParticlesExchange.hpp
+++ b/include/pmacc/particles/tasks/TaskReceiveParticlesExchange.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Rene Widera
+/* Copyright 2013-2021 Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -28,25 +28,25 @@
 
 namespace pmacc
 {
-
     template<class ParBase>
     class TaskReceiveParticlesExchange : public MPITask
     {
     public:
-
         enum
         {
             Dim = ParBase::Dim,
             Exchanges = traits::NumberOfExchanges<Dim>::value
         };
 
-        TaskReceiveParticlesExchange(ParBase &parBase, uint32_t exchange) :
-        parBase(parBase),
-        exchange(exchange),
-        state(Constructor),
-        maxSize(parBase.getParticlesBuffer().getReceiveExchangeStack(exchange).getMaxParticlesCount()),
-        initDependency(__getTransactionEvent()),
-        lastSize(0) { }
+        TaskReceiveParticlesExchange(ParBase& parBase, uint32_t exchange)
+            : parBase(parBase)
+            , exchange(exchange)
+            , state(Constructor)
+            , maxSize(parBase.getParticlesBuffer().getReceiveExchangeStack(exchange).getMaxParticlesCount())
+            , initDependency(__getTransactionEvent())
+            , lastSize(0)
+        {
+        }
 
         virtual void init()
         {
@@ -58,48 +58,49 @@ namespace pmacc
 
         bool executeIntern()
         {
-            switch (state)
+            switch(state)
             {
-                case Init:
-                    break;
-                case WaitForReceive:
-
-                    if (nullptr == Environment<>::get().Manager().getITaskIfNotFinished(lastReceiveEvent.getTaskId()))
+            case Init:
+                break;
+            case WaitForReceive:
+
+                if(nullptr == Environment<>::get().Manager().getITaskIfNotFinished(lastReceiveEvent.getTaskId()))
+                {
+                    state = InitInsert;
+                    // bash is finished
+                    __startTransaction();
+                    lastSize
+                        = parBase.getParticlesBuffer().getReceiveExchangeStack(exchange).getHostParticlesCurrentSize();
+                    parBase.insertParticles(exchange);
+                    tmpEvent = __endTransaction();
+                    initDependency = tmpEvent;
+                    state = WaitForInsert;
+                }
+
+                break;
+            case InitInsert:
+                break;
+            case WaitForInsert:
+                if(nullptr == Environment<>::get().Manager().getITaskIfNotFinished(tmpEvent.getTaskId()))
+                {
+                    state = Wait;
+                    PMACC_ASSERT(lastSize <= maxSize);
+                    // check for next bash round
+                    if(lastSize == maxSize)
+                        init(); // call init and run a full send cycle
+                    else
                     {
-                        state = InitInsert;
-                        //bash is finished
-                        __startTransaction();
-                        lastSize = parBase.getParticlesBuffer().getReceiveExchangeStack(exchange).getHostParticlesCurrentSize();
-                        parBase.insertParticles(exchange);
-                        tmpEvent = __endTransaction();
-                        initDependency = tmpEvent;
-                        state = WaitForInsert;
+                        state = Finished;
+                        return true;
                     }
-
-                    break;
-                case InitInsert:
-                    break;
-                case WaitForInsert:
-                    if (nullptr == Environment<>::get().Manager().getITaskIfNotFinished(tmpEvent.getTaskId()))
-                    {
-                        state=Wait;
-                        PMACC_ASSERT(lastSize <= maxSize);
-                        //check for next bash round
-                        if (lastSize == maxSize)
-                            init(); //call init and run a full send cycle
-                        else
-                        {
-                            state = Finished;
-                            return true;
-                        }
-                    }
-                    break;
-                case Wait:
-                    break;
-                case Finished:
-                    return true;
-                default:
-                    return false;
+                }
+                break;
+            case Wait:
+                break;
+            case Finished:
+                return true;
+            default:
+                return false;
             }
 
             return false;
@@ -110,7 +111,9 @@ namespace pmacc
             notify(this->myId, RECVFINISHED, nullptr);
         }
 
-        void event(id_t, EventType, IEventData*) { }
+        void event(id_t, EventType, IEventData*)
+        {
+        }
 
         std::string toString()
         {
@@ -118,7 +121,6 @@ namespace pmacc
         }
 
     private:
-
         enum state_t
         {
             Constructor,
@@ -132,8 +134,6 @@ namespace pmacc
         };
 
 
-
-
         ParBase& parBase;
         state_t state;
         EventTask tmpEvent;
@@ -144,4 +144,4 @@ namespace pmacc
         size_t lastSize;
     };
 
-} //namespace pmacc
+} // namespace pmacc
diff --git a/include/pmacc/particles/tasks/TaskSendParticlesExchange.hpp b/include/pmacc/particles/tasks/TaskSendParticlesExchange.hpp
index a2fba34afc..830a186ca9 100644
--- a/include/pmacc/particles/tasks/TaskSendParticlesExchange.hpp
+++ b/include/pmacc/particles/tasks/TaskSendParticlesExchange.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Rene Widera
+/* Copyright 2013-2021 Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -28,24 +28,26 @@
 
 namespace pmacc
 {
-
     template<class ParBase>
     class TaskSendParticlesExchange : public MPITask
     {
     public:
-
         enum
         {
             Dim = ParBase::Dim,
         };
 
-        TaskSendParticlesExchange(ParBase &parBase, uint32_t exchange) :
-        parBase(parBase),
-        exchange(exchange),
-        state(Constructor),
-        maxSize(parBase.getParticlesBuffer().getSendExchangeStack(exchange).getMaxParticlesCount()),
-        initDependency(__getTransactionEvent()),
-        lastSize(0),lastSendEvent(EventTask()),retryCounter(0){ }
+        TaskSendParticlesExchange(ParBase& parBase, uint32_t exchange)
+            : parBase(parBase)
+            , exchange(exchange)
+            , state(Constructor)
+            , maxSize(parBase.getParticlesBuffer().getSendExchangeStack(exchange).getMaxParticlesCount())
+            , initDependency(__getTransactionEvent())
+            , lastSize(0)
+            , lastSendEvent(EventTask())
+            , retryCounter(0)
+        {
+        }
 
         virtual void init()
         {
@@ -58,54 +60,54 @@ namespace pmacc
 
         bool executeIntern()
         {
-            switch (state)
+            switch(state)
             {
-                case Init:
-                    break;
-                case WaitForBash:
-
-                    if (nullptr == Environment<>::get().Manager().getITaskIfNotFinished(tmpEvent.getTaskId()) &&
-                        nullptr == Environment<>::get().Manager().getITaskIfNotFinished(lastSendEvent.getTaskId()))
+            case Init:
+                break;
+            case WaitForBash:
+
+                if(nullptr == Environment<>::get().Manager().getITaskIfNotFinished(tmpEvent.getTaskId())
+                   && nullptr == Environment<>::get().Manager().getITaskIfNotFinished(lastSendEvent.getTaskId()))
+                {
+                    state = InitSend;
+                    // bash is finished
+                    __startTransaction();
+                    lastSize
+                        = parBase.getParticlesBuffer().getSendExchangeStack(exchange).getDeviceParticlesCurrentSize();
+                    lastSendEvent = parBase.getParticlesBuffer().asyncSendParticles(__getTransactionEvent(), exchange);
+                    initDependency = lastSendEvent;
+                    __endTransaction();
+                    state = WaitForSend;
+                }
+
+                break;
+            case InitSend:
+                break;
+            case WaitForSend:
+                if(nullptr == Environment<>::get().Manager().getITaskIfNotFinished(tmpEvent.getTaskId()))
+                {
+                    PMACC_ASSERT(lastSize <= maxSize);
+                    // check for next bash round
+                    if(lastSize == maxSize)
                     {
-                        state = InitSend;
-                        //bash is finished
-                        __startTransaction();
-                        lastSize = parBase.getParticlesBuffer().getSendExchangeStack(exchange).getDeviceParticlesCurrentSize();
-                        lastSendEvent = parBase.getParticlesBuffer().asyncSendParticles(__getTransactionEvent(), exchange);
-                        initDependency = lastSendEvent;
-                        __endTransaction();
-                        state = WaitForSend;
+                        ++retryCounter;
+                        init(); // call init and run a full send cycle
                     }
-
-                    break;
-                case InitSend:
-                    break;
-                case WaitForSend:
-                    if (nullptr == Environment<>::get().Manager().getITaskIfNotFinished(tmpEvent.getTaskId()))
-                    {
-                        PMACC_ASSERT(lastSize <= maxSize);
-                        //check for next bash round
-                        if (lastSize == maxSize)
-                        {
-                            ++retryCounter;
-                            init(); //call init and run a full send cycle
-
-                        }
-                        else
-                            state = WaitForSendEnd;
-                    }
-                    break;
-                case WaitForSendEnd:
-                    if (nullptr == Environment<>::get().Manager().getITaskIfNotFinished(lastSendEvent.getTaskId()))
-                    {
-                        state = Finished;
-                        return true;
-                    }
-                    break;
-                case Finished:
+                    else
+                        state = WaitForSendEnd;
+                }
+                break;
+            case WaitForSendEnd:
+                if(nullptr == Environment<>::get().Manager().getITaskIfNotFinished(lastSendEvent.getTaskId()))
+                {
+                    state = Finished;
                     return true;
-                default:
-                    return false;
+                }
+                break;
+            case Finished:
+                return true;
+            default:
+                return false;
             }
 
             return false;
@@ -116,16 +118,16 @@ namespace pmacc
             notify(this->myId, RECVFINISHED, nullptr);
             if(retryCounter != 0)
             {
-                std::cerr << "Send/receive buffer for species " <<
-                    ParBase::FrameType::getName() <<
-                    " is too small (max: " << maxSize <<
-                    ", direction: " << exchange << " '" << ExchangeTypeNames{}[exchange] << "'" <<
-                    ", retries: " << retryCounter <<
-                    ")" << std::endl;
+                std::cerr << "Send/receive buffer for species " << ParBase::FrameType::getName()
+                          << " is too small (max: " << maxSize << ", direction: " << exchange << " '"
+                          << ExchangeTypeNames{}[exchange] << "'"
+                          << ", retries: " << retryCounter << ")" << std::endl;
             }
         }
 
-        void event(id_t, EventType, IEventData*) { }
+        void event(id_t, EventType, IEventData*)
+        {
+        }
 
         std::string toString()
         {
@@ -133,7 +135,6 @@ namespace pmacc
         }
 
     private:
-
         enum state_t
         {
             Constructor,
@@ -158,4 +159,4 @@ namespace pmacc
         size_t retryCounter;
     };
 
-} //namespace pmacc
+} // namespace pmacc
diff --git a/include/pmacc/particles/traits/FilterByFlag.hpp b/include/pmacc/particles/traits/FilterByFlag.hpp
index 5d5c2ca045..5f008b3f32 100644
--- a/include/pmacc/particles/traits/FilterByFlag.hpp
+++ b/include/pmacc/particles/traits/FilterByFlag.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2015-2020 Heiko Burau
+/* Copyright 2015-2021 Heiko Burau
  *
  * This file is part of PMacc.
  *
@@ -28,33 +28,30 @@
 
 namespace pmacc
 {
-namespace particles
-{
-namespace traits
-{
-
-/** Return a new sequence of particle species carrying flag.
- *
- * @tparam T_MPLSeq sequence of particle species
- * @tparam T_Flag flag to be filtered
- */
-template<typename T_MPLSeq, typename T_Flag>
-struct FilterByFlag
-{
-    typedef T_MPLSeq MPLSeq;
-    typedef T_Flag Flag;
-
-    template<typename T_Species>
-    struct HasFlag
+    namespace particles
     {
-        typedef typename ::pmacc::traits::HasFlag<
-            typename T_Species::FrameType,
-            Flag>::type type;
-    };
+        namespace traits
+        {
+            /** Return a new sequence of particle species carrying flag.
+             *
+             * @tparam T_MPLSeq sequence of particle species
+             * @tparam T_Flag flag to be filtered
+             */
+            template<typename T_MPLSeq, typename T_Flag>
+            struct FilterByFlag
+            {
+                typedef T_MPLSeq MPLSeq;
+                typedef T_Flag Flag;
+
+                template<typename T_Species>
+                struct HasFlag
+                {
+                    typedef typename ::pmacc::traits::HasFlag<typename T_Species::FrameType, Flag>::type type;
+                };
 
-    typedef typename bmpl::copy_if<MPLSeq, HasFlag<bmpl::_> >::type type;
-};
+                typedef typename bmpl::copy_if<MPLSeq, HasFlag<bmpl::_>>::type type;
+            };
 
-}//namespace traits
-}//namespace particles
-}//namespace pmacc
+        } // namespace traits
+    } // namespace particles
+} // namespace pmacc
diff --git a/include/pmacc/particles/traits/FilterByIdentifier.hpp b/include/pmacc/particles/traits/FilterByIdentifier.hpp
index da20b3e675..772555d121 100644
--- a/include/pmacc/particles/traits/FilterByIdentifier.hpp
+++ b/include/pmacc/particles/traits/FilterByIdentifier.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2015-2020 Heiko Burau, Rene Widera
+/* Copyright 2015-2021 Heiko Burau, Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -29,42 +29,33 @@
 
 namespace pmacc
 {
-namespace particles
-{
-namespace traits
-{
-
-    /** Return a new sequence of species which carry the identifier.
-     *
-     * @tparam T_MPLSeq sequence of particle species
-     * @tparam T_Identifier identifier to be filtered
-     *
-     * @typedef type boost mpl forward sequence
-     */
-    template<
-        typename T_MPLSeq,
-        typename T_Identifier
-    >
-    struct FilterByIdentifier
+    namespace particles
     {
-        using MPLSeq = T_MPLSeq;
-        using Identifier = T_Identifier;
-
-        template< typename T_Species >
-        struct HasIdentifier
+        namespace traits
         {
-            using type = typename ::pmacc::traits::HasIdentifier<
-                typename T_Species::FrameType,
-                Identifier
-            >::type;
-        };
-
-        using type = typename bmpl::copy_if<
-            MPLSeq,
-            HasIdentifier< bmpl::_ >
-        >::type;
-    };
-
-}//namespace traits
-}//namespace particles
-}//namespace pmacc
+            /** Return a new sequence of species which carry the identifier.
+             *
+             * @tparam T_MPLSeq sequence of particle species
+             * @tparam T_Identifier identifier to be filtered
+             *
+             * @typedef type boost mpl forward sequence
+             */
+            template<typename T_MPLSeq, typename T_Identifier>
+            struct FilterByIdentifier
+            {
+                using MPLSeq = T_MPLSeq;
+                using Identifier = T_Identifier;
+
+                template<typename T_Species>
+                struct HasIdentifier
+                {
+                    using type =
+                        typename ::pmacc::traits::HasIdentifier<typename T_Species::FrameType, Identifier>::type;
+                };
+
+                using type = typename bmpl::copy_if<MPLSeq, HasIdentifier<bmpl::_>>::type;
+            };
+
+        } // namespace traits
+    } // namespace particles
+} // namespace pmacc
diff --git a/include/pmacc/particles/traits/ResolveAliasFromSpecies.hpp b/include/pmacc/particles/traits/ResolveAliasFromSpecies.hpp
index 8d1e080634..1301854890 100644
--- a/include/pmacc/particles/traits/ResolveAliasFromSpecies.hpp
+++ b/include/pmacc/particles/traits/ResolveAliasFromSpecies.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2016-2020 Heiko Burau
+/* Copyright 2016-2021 Heiko Burau
  *
  * This file is part of PMacc.
  *
@@ -29,59 +29,58 @@
 
 namespace pmacc
 {
-namespace particles
-{
-namespace traits
-{
-
-/** Resolves a custom alias in the flag list of a particle species.
- *
- * Example:
- *
- * \code{.cpp}
- * typedef bmpl::vector<
- *   particlePusher<UsedParticlePusher>,
- *   shape<UsedParticleShape>,
- *   interpolation<UsedField2Particle>,
- *   current<UsedParticleCurrentSolver>,
- *   massRatio<MassRatioElectrons>,
- *   chargeRatio<ChargeRatioElectrons>,
- *   synchrotronPhotons<PIC_Photons>
- * > ParticleFlagsElectrons;
- *
- * typedef picongpu::Particles<
- *     PMACC_CSTRING( "e" ),
- *     ParticleFlagsElectrons,
- *     DefaultAttributesSeq
- * > PIC_Electrons;
- *
- * typedef typename ResolveAliasFromSpecies<
- *      PIC_Electrons,
- *      synchrotronPhotons<>
- * >::type PhotonSpecies;
- * boost::static_assert(boost::is_same<PhotonsSpecies, PIC_Photons>::value);
- * \endcode
- *
- * \tparam T_SpeciesType particle species
- * \tparam T_Alias alias
- */
-template<typename T_SpeciesType, typename T_Alias>
-struct ResolveAliasFromSpecies;
+    namespace particles
+    {
+        namespace traits
+        {
+            /** Resolves a custom alias in the flag list of a particle species.
+             *
+             * Example:
+             *
+             * \code{.cpp}
+             * typedef bmpl::vector<
+             *   particlePusher<UsedParticlePusher>,
+             *   shape<UsedParticleShape>,
+             *   interpolation<UsedField2Particle>,
+             *   current<UsedParticleCurrentSolver>,
+             *   massRatio<MassRatioElectrons>,
+             *   chargeRatio<ChargeRatioElectrons>,
+             *   synchrotronPhotons<PIC_Photons>
+             * > ParticleFlagsElectrons;
+             *
+             * typedef picongpu::Particles<
+             *     PMACC_CSTRING( "e" ),
+             *     ParticleFlagsElectrons,
+             *     DefaultAttributesSeq
+             * > PIC_Electrons;
+             *
+             * typedef typename ResolveAliasFromSpecies<
+             *      PIC_Electrons,
+             *      synchrotronPhotons<>
+             * >::type PhotonSpecies;
+             * boost::static_assert(boost::is_same<PhotonsSpecies, PIC_Photons>::value);
+             * \endcode
+             *
+             * \tparam T_SpeciesType particle species
+             * \tparam T_Alias alias
+             */
+            template<typename T_SpeciesType, typename T_Alias>
+            struct ResolveAliasFromSpecies;
 
-template<typename T_SpeciesType, template<typename,typename> class T_Object, typename T_AnyType>
-struct ResolveAliasFromSpecies<T_SpeciesType, T_Object<T_AnyType,pmacc::pmacc_isAlias> >
-{
-    typedef T_SpeciesType SpeciesType;
-    typedef T_Object<T_AnyType,pmacc::pmacc_isAlias> Alias;
-    typedef typename SpeciesType::FrameType FrameType;
+            template<typename T_SpeciesType, template<typename, typename> class T_Object, typename T_AnyType>
+            struct ResolveAliasFromSpecies<T_SpeciesType, T_Object<T_AnyType, pmacc::pmacc_isAlias>>
+            {
+                typedef T_SpeciesType SpeciesType;
+                typedef T_Object<T_AnyType, pmacc::pmacc_isAlias> Alias;
+                typedef typename SpeciesType::FrameType FrameType;
 
-    /* The following line only fetches the alias */
-    typedef typename pmacc::traits::GetFlagType<FrameType, Alias >::type FoundAlias;
+                /* The following line only fetches the alias */
+                typedef typename pmacc::traits::GetFlagType<FrameType, Alias>::type FoundAlias;
 
-    /* This now resolves the alias into the actual object type */
-    typedef typename pmacc::traits::Resolve<FoundAlias>::type type;
-}; // struct ResolveAliasFromSpecies
+                /* This now resolves the alias into the actual object type */
+                typedef typename pmacc::traits::Resolve<FoundAlias>::type type;
+            }; // struct ResolveAliasFromSpecies
 
-} // namespace traits
-} // namespace particles
+        } // namespace traits
+    } // namespace particles
 } // namespace pmacc
diff --git a/include/pmacc/pluginSystem/INotify.hpp b/include/pmacc/pluginSystem/INotify.hpp
index 92be33c262..268bce9623 100644
--- a/include/pmacc/pluginSystem/INotify.hpp
+++ b/include/pmacc/pluginSystem/INotify.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Rene Widera, Felix Schmitt, Axel Huebl,
+/* Copyright 2013-2021 Rene Widera, Felix Schmitt, Axel Huebl,
  *                     Richard Pausch
  *
  * This file is part of PMacc.
@@ -34,7 +34,6 @@ namespace pmacc
         uint32_t lastNotify;
 
     public:
-
         INotify() : lastNotify(0)
         {
         }
@@ -50,7 +49,7 @@ namespace pmacc
          *
          * @param currentStep current simulation iteration step
          */
-        virtual void notify( uint32_t currentStep ) = 0;
+        virtual void notify(uint32_t currentStep) = 0;
 
         /** When was the plugin notified last?
          *
@@ -65,10 +64,9 @@ namespace pmacc
          *
          * @param currentStep current simulation iteration step
          */
-        void setLastNotify( uint32_t currentStep )
+        void setLastNotify(uint32_t currentStep)
         {
             lastNotify = currentStep;
         }
-
     };
-}
+} // namespace pmacc
diff --git a/include/pmacc/pluginSystem/IPlugin.hpp b/include/pmacc/pluginSystem/IPlugin.hpp
index cdb4671834..d4ce16b2b2 100644
--- a/include/pmacc/pluginSystem/IPlugin.hpp
+++ b/include/pmacc/pluginSystem/IPlugin.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Rene Widera, Felix Schmitt, Richard Pausch
+/* Copyright 2013-2021 Rene Widera, Felix Schmitt, Richard Pausch
  *
  * This file is part of PMacc.
  *
@@ -39,7 +39,6 @@ namespace pmacc
     class PluginException : public std::runtime_error
     {
     public:
-
         PluginException(const char* message) : std::runtime_error(message)
         {
         }
@@ -55,11 +54,8 @@ namespace pmacc
     class IPlugin : public INotify
     {
     public:
-
-        IPlugin() :
-        loaded(false), lastCheckpoint(0)
+        IPlugin() : loaded(false), lastCheckpoint(0)
         {
-
         }
 
         virtual ~IPlugin()
@@ -128,7 +124,8 @@ namespace pmacc
          * \param direction the direction the particles are leaving the simulation
          */
         virtual void onParticleLeave(const std::string& /*speciesName*/, const int32_t /*direction*/)
-        {}
+        {
+        }
 
         /** When was the plugin checkpointed last?
          *
@@ -143,7 +140,7 @@ namespace pmacc
          *
          * @param currentStep current simulation iteration step
          */
-        void setLastCheckpoint( uint32_t currentStep )
+        void setLastCheckpoint(uint32_t currentStep)
         {
             lastCheckpoint = currentStep;
         }
@@ -162,4 +159,4 @@ namespace pmacc
         bool loaded;
         uint32_t lastCheckpoint;
     };
-}
+} // namespace pmacc
diff --git a/include/pmacc/pluginSystem/PluginConnector.hpp b/include/pmacc/pluginSystem/PluginConnector.hpp
index d163715d4c..17f36206b6 100644
--- a/include/pmacc/pluginSystem/PluginConnector.hpp
+++ b/include/pmacc/pluginSystem/PluginConnector.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Rene Widera, Felix Schmitt, Axel Huebl, Benjamin Worpitz,
+/* Copyright 2013-2021 Rene Widera, Felix Schmitt, Axel Huebl, Benjamin Worpitz,
  *                     Heiko Burau
  *
  * This file is part of PMacc.
@@ -43,15 +43,11 @@ namespace pmacc
     class PluginConnector
     {
     private:
-        using SeqOfTimeSlices = std::vector< pluginSystem::TimeSlice >;
-        using PluginPair = std::pair<
-            INotify*,
-            SeqOfTimeSlices
-        >;
-        using NotificationList = std::list< PluginPair >;
+        using SeqOfTimeSlices = std::vector<pluginSystem::TimeSlice>;
+        using PluginPair = std::pair<INotify*, SeqOfTimeSlices>;
+        using NotificationList = std::list<PluginPair>;
 
     public:
-
         /** Register a plugin for loading/unloading and notifications
          *
          * Plugins are loaded in the order they are registered and unloaded in reverse order.
@@ -60,9 +56,9 @@ namespace pmacc
          *
          * @param plugin plugin to register
          */
-        void registerPlugin(IPlugin *plugin)
+        void registerPlugin(IPlugin* plugin)
         {
-            if (plugin != nullptr)
+            if(plugin != nullptr)
             {
                 plugins.push_back(plugin);
             }
@@ -76,10 +72,9 @@ namespace pmacc
         void loadPlugins()
         {
             // load all plugins
-            for (std::list<IPlugin*>::iterator iter = plugins.begin();
-                 iter != plugins.end(); ++iter)
+            for(std::list<IPlugin*>::iterator iter = plugins.begin(); iter != plugins.end(); ++iter)
             {
-                if (!(*iter)->isLoaded())
+                if(!(*iter)->isLoaded())
                 {
                     (*iter)->load();
                 }
@@ -92,10 +87,9 @@ namespace pmacc
         void unloadPlugins()
         {
             // unload all plugins
-            for (std::list<IPlugin*>::reverse_iterator iter = plugins.rbegin();
-                 iter != plugins.rend(); ++iter)
+            for(std::list<IPlugin*>::reverse_iterator iter = plugins.rbegin(); iter != plugins.rend(); ++iter)
             {
-                if ((*iter)->isLoaded())
+                if((*iter)->isLoaded())
                 {
                     (*iter)->unload();
                 }
@@ -111,8 +105,7 @@ namespace pmacc
         {
             std::list<po::options_description> help_options;
 
-            for (std::list<IPlugin*>::iterator iter = plugins.begin();
-                 iter != plugins.end(); ++iter)
+            for(std::list<IPlugin*>::iterator iter = plugins.begin(); iter != plugins.end(); ++iter)
             {
                 // create a new help options section for this plugin,
                 // fill it and add to list of options
@@ -129,17 +122,14 @@ namespace pmacc
          * @param notifiedObj the object to notify, e.g. an IPlugin instance
          * @param period notification period
          */
-        void setNotificationPeriod(INotify* notifiedObj, std::string const & period)
+        void setNotificationPeriod(INotify* notifiedObj, std::string const& period)
         {
-            if (notifiedObj != nullptr)
+            if(notifiedObj != nullptr)
             {
-                if( !period.empty() )
+                if(!period.empty())
                 {
-                    SeqOfTimeSlices seqTimeSlices = pluginSystem::toTimeSlice( period );
-                    notificationList.push_back( std::make_pair(
-                        notifiedObj,
-                        seqTimeSlices
-                    ) );
+                    SeqOfTimeSlices seqTimeSlices = pluginSystem::toTimeSlice(period);
+                    notificationList.push_back(std::make_pair(notifiedObj, seqTimeSlices));
                 }
             }
             else
@@ -153,15 +143,9 @@ namespace pmacc
          */
         void notifyPlugins(uint32_t currentStep)
         {
-            for (NotificationList::iterator iter = notificationList.begin();
-                    iter != notificationList.end(); ++iter)
+            for(NotificationList::iterator iter = notificationList.begin(); iter != notificationList.end(); ++iter)
             {
-                if(
-                    containsStep(
-                        (*iter).second,
-                        currentStep
-                    )
-                )
+                if(containsStep((*iter).second, currentStep))
                 {
                     INotify* notifiedObj = iter->first;
                     notifiedObj->notify(currentStep);
@@ -178,8 +162,7 @@ namespace pmacc
          */
         void checkpointPlugins(uint32_t currentStep, const std::string checkpointDirectory)
         {
-            for (std::list<IPlugin*>::iterator iter = plugins.begin();
-                    iter != plugins.end(); ++iter)
+            for(std::list<IPlugin*>::iterator iter = plugins.begin(); iter != plugins.end(); ++iter)
             {
                 (*iter)->checkpoint(currentStep, checkpointDirectory);
                 (*iter)->setLastCheckpoint(currentStep);
@@ -194,8 +177,7 @@ namespace pmacc
          */
         void restartPlugins(uint32_t restartStep, const std::string restartDirectory)
         {
-            for (std::list<IPlugin*>::iterator iter = plugins.begin();
-                    iter != plugins.end(); ++iter)
+            for(std::list<IPlugin*>::iterator iter = plugins.begin(); iter != plugins.end(); ++iter)
             {
                 (*iter)->restart(restartStep, restartDirectory);
             }
@@ -211,9 +193,7 @@ namespace pmacc
         std::vector<Plugin*> getPluginsFromType()
         {
             std::vector<Plugin*> result;
-            for(std::list<IPlugin*>::iterator iter = plugins.begin();
-                iter != plugins.end();
-                iter++)
+            for(std::list<IPlugin*>::iterator iter = plugins.begin(); iter != plugins.end(); iter++)
             {
                 Plugin* plugin = dynamic_cast<Plugin*>(*iter);
                 if(plugin != nullptr)
@@ -232,7 +212,6 @@ namespace pmacc
         }
 
     private:
-
         friend struct detail::Environment;
 
         static PluginConnector& getInstance()
@@ -243,15 +222,13 @@ namespace pmacc
 
         PluginConnector()
         {
-
         }
 
         virtual ~PluginConnector()
         {
-
         }
 
         std::list<IPlugin*> plugins;
         NotificationList notificationList;
     };
-}
+} // namespace pmacc
diff --git a/include/pmacc/pluginSystem/TimeSlice.hpp b/include/pmacc/pluginSystem/TimeSlice.hpp
index e101248824..86da516aba 100644
--- a/include/pmacc/pluginSystem/TimeSlice.hpp
+++ b/include/pmacc/pluginSystem/TimeSlice.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2018-2020 Rene Widera
+/* Copyright 2018-2021 Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -30,54 +30,50 @@
 
 namespace pmacc
 {
-namespace pluginSystem
-{
-    struct TimeSlice
+    namespace pluginSystem
     {
-        /** time slice configuration
-         *
-         * 0 = begin of the interval
-         * 1 = end of the interval
-         * 2 = period
-         */
-        std::array< uint32_t, 3 > values;
-
-        std::string toString() const
+        struct TimeSlice
         {
-            std::string result;
-            result = std::to_string(values[0]) + ":" +
-                std::to_string(values[1]) + ":" +
-                std::to_string(values[2]);
-            return result;
-        }
+            /** time slice configuration
+             *
+             * 0 = begin of the interval
+             * 1 = end of the interval
+             * 2 = period
+             */
+            std::array<uint32_t, 3> values;
 
-        /** set the value
-         *
-         * if str is empty the default value for the given index is selected
-         *
-         * @param idx index to set, range [0,3)
-         * @param str value to set, can be empty
-         */
-        void setValue(uint32_t const idx, std::string const & str)
-        {
-            if(!str.empty())
+            std::string toString() const
             {
-                uint32_t value = std::stoul( str );
-                PMACC_VERIFY_MSG(
-                    !( idx == 2 && value == 0 ),
-                    "Zero is not a valid period"
-                );
-                values.at( idx )  = value;
+                std::string result;
+                result = std::to_string(values[0]) + ":" + std::to_string(values[1]) + ":" + std::to_string(values[2]);
+                return result;
             }
-        }
 
-        //! create a time slice instance
-        TimeSlice() :
-            /* default: start:end:period
-             * -1 stored as unsigned is the highest available unsigned integer
+            /** set the value
+             *
+             * if str is empty the value at the given index is left unchanged (initially its default)
+             *
+             * @param idx index to set, range [0,3)
+             * @param str value to set, can be empty
              */
-            values( { 0, uint32_t( -1 ), 1 } )
-        { }
-    };
-} // namespace pluginSystem
+            void setValue(uint32_t const idx, std::string const& str)
+            {
+                if(!str.empty())
+                {
+                    uint32_t value = std::stoul(str);
+                    PMACC_VERIFY_MSG(!(idx == 2 && value == 0), "Zero is not a valid period");
+                    values.at(idx) = value;
+                }
+            }
+
+            //! create a time slice instance
+            TimeSlice()
+                : /* default: start:end:period
+                   * -1 stored as unsigned is the highest available unsigned integer
+                   */
+                values({0, uint32_t(-1), 1})
+            {
+            }
+        };
+    } // namespace pluginSystem
 } // namespace pmacc
diff --git a/include/pmacc/pluginSystem/containsStep.hpp b/include/pmacc/pluginSystem/containsStep.hpp
index cbd473bed8..b82fa46c21 100644
--- a/include/pmacc/pluginSystem/containsStep.hpp
+++ b/include/pmacc/pluginSystem/containsStep.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2018-2020 Rene Widera
+/* Copyright 2018-2021 Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -28,32 +28,26 @@
 
 namespace pmacc
 {
-namespace pluginSystem
-{
-    /** check if a given step is within an interval list
-     *
-     * @param seqTimeSlices vector with time intervals
-     * @param timeStep simulation time step to check
-     * @return true if step is included in the interval list else false
-     */
-    HINLINE bool containsStep(
-        std::vector< pluginSystem::TimeSlice > const & seqTimeSlices,
-        uint32_t const timeStep
-    )
+    namespace pluginSystem
     {
-        for(auto const & timeSlice : seqTimeSlices)
+        /** check if a given step is within an interval list
+         *
+         * @param seqTimeSlices vector with time intervals
+         * @param timeStep simulation time step to check
+         * @return true if step is included in the interval list else false
+         */
+        HINLINE bool containsStep(std::vector<pluginSystem::TimeSlice> const& seqTimeSlices, uint32_t const timeStep)
         {
-            if(
-                timeStep >= timeSlice.values[ 0 ] &&
-                timeStep <= timeSlice.values[ 1 ]
-            )
+            for(auto const& timeSlice : seqTimeSlices)
             {
-                uint32_t const timeRelativeToStart = timeStep - timeSlice.values[ 0 ];
-                if( timeRelativeToStart % timeSlice.values[ 2 ] == 0 )
-                    return true;
+                if(timeStep >= timeSlice.values[0] && timeStep <= timeSlice.values[1])
+                {
+                    uint32_t const timeRelativeToStart = timeStep - timeSlice.values[0];
+                    if(timeRelativeToStart % timeSlice.values[2] == 0)
+                        return true;
+                }
             }
+            return false;
         }
-        return false;
-    }
-} // namespace pluginSystem
+    } // namespace pluginSystem
 } // namespace pmacc
diff --git a/include/pmacc/pluginSystem/toTimeSlice.hpp b/include/pmacc/pluginSystem/toTimeSlice.hpp
index e1eac66114..76b21242f0 100644
--- a/include/pmacc/pluginSystem/toTimeSlice.hpp
+++ b/include/pmacc/pluginSystem/toTimeSlice.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2018-2020 Rene Widera
+/* Copyright 2018-2021 Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -36,73 +36,56 @@
 
 namespace pmacc
 {
-namespace pluginSystem
-{
-namespace detail
-{
-    /** check if string contains only digits
-     *
-     * @param str string to check
-     * @return true if str contains only digits else false
-     */
-    HINLINE bool is_number( std::string const & str )
+    namespace pluginSystem
     {
-        return std::all_of(
-            str.begin(),
-            str.end(),
-            ::isdigit
-        );
-    }
-} // namespace detail
-
-    /** create a TimeSlice out of an string
-     *
-     * Parse a comma separated list of time slices and creates a vector of TimeSlices.
-     * TimeSlice Syntax:
-     *   - `start:stop:period`
-     *   - a number ``N is equal to `::N`
-     */
-    HINLINE std::vector< TimeSlice > toTimeSlice( std::string const & str )
-    {
-        std::vector< TimeSlice > result;
-        auto const seqOfSlices = misc::splitString(
-            str,
-            ","
-        );
-        for( auto const & slice : seqOfSlices )
+        namespace detail
         {
-            auto const sliceComponents = misc::splitString(
-                slice,
-                ":"
-            );
-            PMACC_VERIFY_MSG(
-                !sliceComponents.empty( ),
-                std::string( "time slice without a defined element is not allowed" ) + str
-            );
+            /** check if string contains only digits
+             *
+             * @param str string to check
+             * @return true if str contains only digits else false
+             */
+            HINLINE bool is_number(std::string const& str)
+            {
+                return std::all_of(str.begin(), str.end(), ::isdigit);
+            }
+        } // namespace detail
 
-            // id of the component
-            size_t n = 0;
-            bool const hasOnlyPeriod = sliceComponents.size() == 1u;
-            TimeSlice timeSlice;
-            for( auto& component : sliceComponents )
+        /** create a vector of TimeSlices out of a string
+         *
+         * Parses a comma-separated list of time slices and creates a vector of TimeSlices.
+         * TimeSlice Syntax:
+         *   - `start:stop:period`
+         *   - a number `N` is equal to `::N`
+         */
+        HINLINE std::vector<TimeSlice> toTimeSlice(std::string const& str)
+        {
+            std::vector<TimeSlice> result;
+            auto const seqOfSlices = misc::splitString(str, ",");
+            for(auto const& slice : seqOfSlices)
             {
-                // be sure that component it is a number or empty
+                auto const sliceComponents = misc::splitString(slice, ":");
                 PMACC_VERIFY_MSG(
-                    component.empty() || detail::is_number( component ),
-                    std::string("value") + component +
-                        " in " + str + "is not a number"
-                );
+                    !sliceComponents.empty(),
+                    std::string("time slice without a defined element is not allowed") + str);
 
-                timeSlice.setValue(
-                    hasOnlyPeriod ? 2 : n,
-                    component
-                );
-                n++;
-            }
-            result.push_back( timeSlice );
+                // id of the component
+                size_t n = 0;
+                bool const hasOnlyPeriod = sliceComponents.size() == 1u;
+                TimeSlice timeSlice;
+                for(auto& component : sliceComponents)
+                {
+                    // be sure that the component is a number or empty
+                    PMACC_VERIFY_MSG(
+                        component.empty() || detail::is_number(component),
+                        std::string("value") + component + " in " + str + "is not a number");
 
+                    timeSlice.setValue(hasOnlyPeriod ? 2 : n, component);
+                    n++;
+                }
+                result.push_back(timeSlice);
+            }
+            return result;
         }
-        return result;
-    }
-} // namespace pluginSystem
+    } // namespace pluginSystem
 } // namespace pmacc
diff --git a/include/pmacc/ppFunctions.hpp b/include/pmacc/ppFunctions.hpp
index 1ed6980142..fa0d1a4a13 100644
--- a/include/pmacc/ppFunctions.hpp
+++ b/include/pmacc/ppFunctions.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Rene Widera
+/* Copyright 2013-2021 Axel Huebl, Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -26,22 +26,22 @@
 #include <pmacc/preprocessor/size.hpp>
 
 
-#define PMACC_MIN(x,y) (((x)<=(y))?x:y)
-#define PMACC_MAX(x,y) (((x)>(y))?x:y)
+#define PMACC_MIN(x, y) (((x) <= (y)) ? x : y)
+#define PMACC_MAX(x, y) (((x) > (y)) ? x : y)
 
 
-#define PMACC_JOIN_DO(x,y) x##y
-#define PMACC_JOIN(x,y) PMACC_JOIN_DO(x,y)
+#define PMACC_JOIN_DO(x, y) x##y
+#define PMACC_JOIN(x, y) PMACC_JOIN_DO(x, y)
 
-#define PMACC_MAX_DO(what,x,y) (((x)>(y))?x what:y what)
-#define PMACC_MIN_DO(what,x,y) (((x)<(y))?x what:y what)
+#define PMACC_MAX_DO(what, x, y) (((x) > (y)) ? x what : y what)
+#define PMACC_MIN_DO(what, x, y) (((x) < (y)) ? x what : y what)
 
 
 #ifdef PMACC_PP_VARIADIC_SIZE
-#   define PMACC_COUNT_ARGS_DEF(type,...) (PMACC_PP_VARIADIC_SIZE(__VA_ARGS__))
+#    define PMACC_COUNT_ARGS_DEF(type, ...) (PMACC_PP_VARIADIC_SIZE(__VA_ARGS__))
 #else
-    // A fallback implementation using compound literals, supported by some compilers
-#   define PMACC_COUNT_ARGS_DEF(type,...) (sizeof((type[]){type{}, ##__VA_ARGS__})/sizeof(type)-1u)
+// A fallback implementation using compound literals, supported by some compilers
+#    define PMACC_COUNT_ARGS_DEF(type, ...) (sizeof((type[]){type{}, ##__VA_ARGS__}) / sizeof(type) - 1u)
 #endif
 
 /**
@@ -50,7 +50,7 @@
  * @param type type of the arguments in ...
  * @param ... arguments
  */
-#define PMACC_COUNT_ARGS(type,...) PMACC_COUNT_ARGS_DEF(type,__VA_ARGS__)
+#define PMACC_COUNT_ARGS(type, ...) PMACC_COUNT_ARGS_DEF(type, __VA_ARGS__)
 
 /**
  * Check if ... has arguments or not
@@ -60,7 +60,7 @@
  * @param ... arguments
  * @return false if no arguments are given, else true
  */
-#define PMACC_HAS_ARGS(...)  (PMACC_COUNT_ARGS(int,__VA_ARGS__)>0)
+#define PMACC_HAS_ARGS(...) (PMACC_COUNT_ARGS(int, __VA_ARGS__) > 0)
 
 /** round up to next higher pow 2 value
  *
@@ -71,12 +71,12 @@
  * @param value integral number between [1,Inf]
  * @return next higher pow 2 value
  */
-#define PMACC_ROUND_UP_NEXT_POW2(value) \
-        ((value)==1?1:                  \
-        ((value)<=2?2:                  \
-        ((value)<=4?4:                  \
-        ((value)<=8?8:                  \
-        ((value)<=16?16:                \
-        ((value)<=32?32:                \
-        ((value)<=64?64:128             \
-        )))))))
+#define PMACC_ROUND_UP_NEXT_POW2(value)                                                                               \
+    ((value) == 1                                                                                                     \
+         ? 1                                                                                                          \
+         : ((value) <= 2                                                                                              \
+                ? 2                                                                                                   \
+                : ((value) <= 4                                                                                       \
+                       ? 4                                                                                            \
+                       : ((value) <= 8 ? 8                                                                            \
+                                       : ((value) <= 16 ? 16 : ((value) <= 32 ? 32 : ((value) <= 64 ? 64 : 128)))))))
diff --git a/include/pmacc/preprocessor/facilities.hpp b/include/pmacc/preprocessor/facilities.hpp
index 04ee2ba3ec..1906c20069 100644
--- a/include/pmacc/preprocessor/facilities.hpp
+++ b/include/pmacc/preprocessor/facilities.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2015-2020 Rene Widera
+/* Copyright 2015-2021 Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -28,14 +28,14 @@
 #define PMACC_PP_DEFER_ECHO() PMACC_PP_ECHO
 
 /** get the first element of a preprocessor pair */
-#define PMACC_PP_FIRST(first,second) first
+#define PMACC_PP_FIRST(first, second) first
 
 /** get the first element of a preprocessor pair with delay */
 #define PMACC_PP_DEFER_FIRST() PMACC_PP_FIRST
 
 
 /** get the second element of a preprocessor pair */
-#define PMACC_PP_SECOND(first,second) second
+#define PMACC_PP_SECOND(first, second) second
 
 /** get the second element of a preprocessor pair with delay */
 #define PMACC_PP_DEFER_SECOND() PMACC_PP_SECOND
@@ -52,4 +52,4 @@
 /** call the given macro with the given argument.
  * can be used as a helper for expanding arguments that are lists
  */
-#define PMACC_PP_CALL(macro,argument) macro argument
+#define PMACC_PP_CALL(macro, argument) macro argument
diff --git a/include/pmacc/preprocessor/size.hpp b/include/pmacc/preprocessor/size.hpp
index 72ba10e57c..666ef07fb5 100644
--- a/include/pmacc/preprocessor/size.hpp
+++ b/include/pmacc/preprocessor/size.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2018-2020 Sergei Bastrakov
+/* Copyright 2018-2021 Sergei Bastrakov
  *
  * This file is part of PMacc.
  *
@@ -32,51 +32,382 @@
  * for usage check if PMACC_PP_VARIADIC_SIZE is defined.
  * Implementation is essentially the same as BOOST_PP_VARIADIC_SIZE,
  * but supports up to 120 arguments instead of 64.
- * The implementation uses the “paired, sliding arg list” trick
+ * The implementation uses the "paired, sliding arg list" trick
  * explained in https://codecraft.co/2014/11/25/variadic-macros-tricks/
  */
-#if( BOOST_PP_VARIADICS == 1 )
-#   define PMACC_PP_VARIADIC_SIZE_I(                                          \
-        e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15, \
-        e16, e17, e18, e19, e20, e21, e22, e23, e24, e25, e26, e27, e28, e29, \
-        e30, e31, e32, e33, e34, e35, e36, e37, e38, e39, e40, e41, e42, e43, \
-        e44, e45, e46, e47, e48, e49, e50, e51, e52, e53, e54, e55, e56, e57, \
-        e58, e59, e60, e61, e62, e63, e64, e65, e66, e67, e68, e69, e70, e71, \
-        e72, e73, e74, e75, e76, e77, e78, e79, e80, e81, e82, e83, e84, e85, \
-        e86, e87, e88, e89, e90, e91, e92, e93, e94, e95, e96, e97, e98, e99, \
-        e100, e101, e102, e103, e104, e105, e106, e107, e108, e109, e110,     \
-        e111, e112, e113, e114, e115, e116, e117, e118, e119,                 \
-        size, ...                                                             \
-    ) size
-#   if BOOST_PP_VARIADICS_MSVC
-#       define PMACC_PP_VARIADIC_SIZE(...)                                  \
-            BOOST_PP_CAT(                                                   \
-                PMACC_PP_VARIADIC_SIZE_I(                                   \
-                    __VA_ARGS__,                                            \
-                    120, 119, 118, 117, 116, 115, 114, 113, 112, 111, 110,  \
-                    109, 108, 107, 106, 105, 104, 103, 102, 101, 100, 99,   \
-                    98, 97, 96, 95, 94, 93, 92, 91, 90, 89, 88, 87, 86, 85, \
-                    84, 83, 82, 81, 80, 79, 78, 77, 76, 75, 74, 73, 72, 71, \
-                    70, 69, 68, 67, 66, 65, 64, 63, 62, 61, 60, 59, 58, 57, \
-                    56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, \
-                    42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, \
-                    28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, \
-                    14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1,          \
-                ),                                                          \
-            )
-#   else
-#       define PMACC_PP_VARIADIC_SIZE(...)                                   \
-            PMACC_PP_VARIADIC_SIZE_I(                                        \
-                __VA_ARGS__,                                                 \
-                120, 119, 118, 117, 116, 115, 114, 113, 112, 111, 110, 109,  \
-                108, 107, 106, 105, 104, 103, 102, 101, 100, 99, 98, 97, 96, \
-                95, 94, 93, 92, 91, 90, 89, 88, 87, 86, 85, 84, 83, 82, 81,  \
-                80, 79, 78, 77, 76, 75, 74, 73, 72, 71, 70, 69, 68, 67, 66,  \
-                65, 64, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51,  \
-                50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36,  \
-                35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21,  \
-                20, 19, 18, 17, 16,  15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5,  \
-                4, 3, 2, 1,                                                  \
-            )
-#   endif
+#if(BOOST_PP_VARIADICS == 1)
+#    define PMACC_PP_VARIADIC_SIZE_I(                                                                                 \
+        e0,                                                                                                           \
+        e1,                                                                                                           \
+        e2,                                                                                                           \
+        e3,                                                                                                           \
+        e4,                                                                                                           \
+        e5,                                                                                                           \
+        e6,                                                                                                           \
+        e7,                                                                                                           \
+        e8,                                                                                                           \
+        e9,                                                                                                           \
+        e10,                                                                                                          \
+        e11,                                                                                                          \
+        e12,                                                                                                          \
+        e13,                                                                                                          \
+        e14,                                                                                                          \
+        e15,                                                                                                          \
+        e16,                                                                                                          \
+        e17,                                                                                                          \
+        e18,                                                                                                          \
+        e19,                                                                                                          \
+        e20,                                                                                                          \
+        e21,                                                                                                          \
+        e22,                                                                                                          \
+        e23,                                                                                                          \
+        e24,                                                                                                          \
+        e25,                                                                                                          \
+        e26,                                                                                                          \
+        e27,                                                                                                          \
+        e28,                                                                                                          \
+        e29,                                                                                                          \
+        e30,                                                                                                          \
+        e31,                                                                                                          \
+        e32,                                                                                                          \
+        e33,                                                                                                          \
+        e34,                                                                                                          \
+        e35,                                                                                                          \
+        e36,                                                                                                          \
+        e37,                                                                                                          \
+        e38,                                                                                                          \
+        e39,                                                                                                          \
+        e40,                                                                                                          \
+        e41,                                                                                                          \
+        e42,                                                                                                          \
+        e43,                                                                                                          \
+        e44,                                                                                                          \
+        e45,                                                                                                          \
+        e46,                                                                                                          \
+        e47,                                                                                                          \
+        e48,                                                                                                          \
+        e49,                                                                                                          \
+        e50,                                                                                                          \
+        e51,                                                                                                          \
+        e52,                                                                                                          \
+        e53,                                                                                                          \
+        e54,                                                                                                          \
+        e55,                                                                                                          \
+        e56,                                                                                                          \
+        e57,                                                                                                          \
+        e58,                                                                                                          \
+        e59,                                                                                                          \
+        e60,                                                                                                          \
+        e61,                                                                                                          \
+        e62,                                                                                                          \
+        e63,                                                                                                          \
+        e64,                                                                                                          \
+        e65,                                                                                                          \
+        e66,                                                                                                          \
+        e67,                                                                                                          \
+        e68,                                                                                                          \
+        e69,                                                                                                          \
+        e70,                                                                                                          \
+        e71,                                                                                                          \
+        e72,                                                                                                          \
+        e73,                                                                                                          \
+        e74,                                                                                                          \
+        e75,                                                                                                          \
+        e76,                                                                                                          \
+        e77,                                                                                                          \
+        e78,                                                                                                          \
+        e79,                                                                                                          \
+        e80,                                                                                                          \
+        e81,                                                                                                          \
+        e82,                                                                                                          \
+        e83,                                                                                                          \
+        e84,                                                                                                          \
+        e85,                                                                                                          \
+        e86,                                                                                                          \
+        e87,                                                                                                          \
+        e88,                                                                                                          \
+        e89,                                                                                                          \
+        e90,                                                                                                          \
+        e91,                                                                                                          \
+        e92,                                                                                                          \
+        e93,                                                                                                          \
+        e94,                                                                                                          \
+        e95,                                                                                                          \
+        e96,                                                                                                          \
+        e97,                                                                                                          \
+        e98,                                                                                                          \
+        e99,                                                                                                          \
+        e100,                                                                                                         \
+        e101,                                                                                                         \
+        e102,                                                                                                         \
+        e103,                                                                                                         \
+        e104,                                                                                                         \
+        e105,                                                                                                         \
+        e106,                                                                                                         \
+        e107,                                                                                                         \
+        e108,                                                                                                         \
+        e109,                                                                                                         \
+        e110,                                                                                                         \
+        e111,                                                                                                         \
+        e112,                                                                                                         \
+        e113,                                                                                                         \
+        e114,                                                                                                         \
+        e115,                                                                                                         \
+        e116,                                                                                                         \
+        e117,                                                                                                         \
+        e118,                                                                                                         \
+        e119,                                                                                                         \
+        size,                                                                                                         \
+        ...)                                                                                                          \
+        size
+#    if BOOST_PP_VARIADICS_MSVC
+#        define PMACC_PP_VARIADIC_SIZE(...)                                                                           \
+            BOOST_PP_CAT(                                                                                             \
+                PMACC_PP_VARIADIC_SIZE_I(                                                                             \
+                    __VA_ARGS__,                                                                                      \
+                    120,                                                                                              \
+                    119,                                                                                              \
+                    118,                                                                                              \
+                    117,                                                                                              \
+                    116,                                                                                              \
+                    115,                                                                                              \
+                    114,                                                                                              \
+                    113,                                                                                              \
+                    112,                                                                                              \
+                    111,                                                                                              \
+                    110,                                                                                              \
+                    109,                                                                                              \
+                    108,                                                                                              \
+                    107,                                                                                              \
+                    106,                                                                                              \
+                    105,                                                                                              \
+                    104,                                                                                              \
+                    103,                                                                                              \
+                    102,                                                                                              \
+                    101,                                                                                              \
+                    100,                                                                                              \
+                    99,                                                                                               \
+                    98,                                                                                               \
+                    97,                                                                                               \
+                    96,                                                                                               \
+                    95,                                                                                               \
+                    94,                                                                                               \
+                    93,                                                                                               \
+                    92,                                                                                               \
+                    91,                                                                                               \
+                    90,                                                                                               \
+                    89,                                                                                               \
+                    88,                                                                                               \
+                    87,                                                                                               \
+                    86,                                                                                               \
+                    85,                                                                                               \
+                    84,                                                                                               \
+                    83,                                                                                               \
+                    82,                                                                                               \
+                    81,                                                                                               \
+                    80,                                                                                               \
+                    79,                                                                                               \
+                    78,                                                                                               \
+                    77,                                                                                               \
+                    76,                                                                                               \
+                    75,                                                                                               \
+                    74,                                                                                               \
+                    73,                                                                                               \
+                    72,                                                                                               \
+                    71,                                                                                               \
+                    70,                                                                                               \
+                    69,                                                                                               \
+                    68,                                                                                               \
+                    67,                                                                                               \
+                    66,                                                                                               \
+                    65,                                                                                               \
+                    64,                                                                                               \
+                    63,                                                                                               \
+                    62,                                                                                               \
+                    61,                                                                                               \
+                    60,                                                                                               \
+                    59,                                                                                               \
+                    58,                                                                                               \
+                    57,                                                                                               \
+                    56,                                                                                               \
+                    55,                                                                                               \
+                    54,                                                                                               \
+                    53,                                                                                               \
+                    52,                                                                                               \
+                    51,                                                                                               \
+                    50,                                                                                               \
+                    49,                                                                                               \
+                    48,                                                                                               \
+                    47,                                                                                               \
+                    46,                                                                                               \
+                    45,                                                                                               \
+                    44,                                                                                               \
+                    43,                                                                                               \
+                    42,                                                                                               \
+                    41,                                                                                               \
+                    40,                                                                                               \
+                    39,                                                                                               \
+                    38,                                                                                               \
+                    37,                                                                                               \
+                    36,                                                                                               \
+                    35,                                                                                               \
+                    34,                                                                                               \
+                    33,                                                                                               \
+                    32,                                                                                               \
+                    31,                                                                                               \
+                    30,                                                                                               \
+                    29,                                                                                               \
+                    28,                                                                                               \
+                    27,                                                                                               \
+                    26,                                                                                               \
+                    25,                                                                                               \
+                    24,                                                                                               \
+                    23,                                                                                               \
+                    22,                                                                                               \
+                    21,                                                                                               \
+                    20,                                                                                               \
+                    19,                                                                                               \
+                    18,                                                                                               \
+                    17,                                                                                               \
+                    16,                                                                                               \
+                    15,                                                                                               \
+                    14,                                                                                               \
+                    13,                                                                                               \
+                    12,                                                                                               \
+                    11,                                                                                               \
+                    10,                                                                                               \
+                    9,                                                                                                \
+                    8,                                                                                                \
+                    7,                                                                                                \
+                    6,                                                                                                \
+                    5,                                                                                                \
+                    4,                                                                                                \
+                    3,                                                                                                \
+                    2,                                                                                                \
+                    1, ), )
+#    else
+#        define PMACC_PP_VARIADIC_SIZE(...)                                                                           \
+            PMACC_PP_VARIADIC_SIZE_I(                                                                                 \
+                __VA_ARGS__,                                                                                          \
+                120,                                                                                                  \
+                119,                                                                                                  \
+                118,                                                                                                  \
+                117,                                                                                                  \
+                116,                                                                                                  \
+                115,                                                                                                  \
+                114,                                                                                                  \
+                113,                                                                                                  \
+                112,                                                                                                  \
+                111,                                                                                                  \
+                110,                                                                                                  \
+                109,                                                                                                  \
+                108,                                                                                                  \
+                107,                                                                                                  \
+                106,                                                                                                  \
+                105,                                                                                                  \
+                104,                                                                                                  \
+                103,                                                                                                  \
+                102,                                                                                                  \
+                101,                                                                                                  \
+                100,                                                                                                  \
+                99,                                                                                                   \
+                98,                                                                                                   \
+                97,                                                                                                   \
+                96,                                                                                                   \
+                95,                                                                                                   \
+                94,                                                                                                   \
+                93,                                                                                                   \
+                92,                                                                                                   \
+                91,                                                                                                   \
+                90,                                                                                                   \
+                89,                                                                                                   \
+                88,                                                                                                   \
+                87,                                                                                                   \
+                86,                                                                                                   \
+                85,                                                                                                   \
+                84,                                                                                                   \
+                83,                                                                                                   \
+                82,                                                                                                   \
+                81,                                                                                                   \
+                80,                                                                                                   \
+                79,                                                                                                   \
+                78,                                                                                                   \
+                77,                                                                                                   \
+                76,                                                                                                   \
+                75,                                                                                                   \
+                74,                                                                                                   \
+                73,                                                                                                   \
+                72,                                                                                                   \
+                71,                                                                                                   \
+                70,                                                                                                   \
+                69,                                                                                                   \
+                68,                                                                                                   \
+                67,                                                                                                   \
+                66,                                                                                                   \
+                65,                                                                                                   \
+                64,                                                                                                   \
+                63,                                                                                                   \
+                62,                                                                                                   \
+                61,                                                                                                   \
+                60,                                                                                                   \
+                59,                                                                                                   \
+                58,                                                                                                   \
+                57,                                                                                                   \
+                56,                                                                                                   \
+                55,                                                                                                   \
+                54,                                                                                                   \
+                53,                                                                                                   \
+                52,                                                                                                   \
+                51,                                                                                                   \
+                50,                                                                                                   \
+                49,                                                                                                   \
+                48,                                                                                                   \
+                47,                                                                                                   \
+                46,                                                                                                   \
+                45,                                                                                                   \
+                44,                                                                                                   \
+                43,                                                                                                   \
+                42,                                                                                                   \
+                41,                                                                                                   \
+                40,                                                                                                   \
+                39,                                                                                                   \
+                38,                                                                                                   \
+                37,                                                                                                   \
+                36,                                                                                                   \
+                35,                                                                                                   \
+                34,                                                                                                   \
+                33,                                                                                                   \
+                32,                                                                                                   \
+                31,                                                                                                   \
+                30,                                                                                                   \
+                29,                                                                                                   \
+                28,                                                                                                   \
+                27,                                                                                                   \
+                26,                                                                                                   \
+                25,                                                                                                   \
+                24,                                                                                                   \
+                23,                                                                                                   \
+                22,                                                                                                   \
+                21,                                                                                                   \
+                20,                                                                                                   \
+                19,                                                                                                   \
+                18,                                                                                                   \
+                17,                                                                                                   \
+                16,                                                                                                   \
+                15,                                                                                                   \
+                14,                                                                                                   \
+                13,                                                                                                   \
+                12,                                                                                                   \
+                11,                                                                                                   \
+                10,                                                                                                   \
+                9,                                                                                                    \
+                8,                                                                                                    \
+                7,                                                                                                    \
+                6,                                                                                                    \
+                5,                                                                                                    \
+                4,                                                                                                    \
+                3,                                                                                                    \
+                2,                                                                                                    \
+                1, )
+#    endif
 #endif
diff --git a/include/pmacc/preprocessor/struct.hpp b/include/pmacc/preprocessor/struct.hpp
index 72a430f011..04a9bc6a13 100644
--- a/include/pmacc/preprocessor/struct.hpp
+++ b/include/pmacc/preprocessor/struct.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2015-2020 Rene Widera
+/* Copyright 2015-2021 Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -60,10 +60,12 @@
  *     static const float2_64 center_SI = float2_64(1.134e-5, 1.134e-5);
  *   @endcode
  */
-#define PMACC_C_VECTOR(type,name,...) (0,(typename pmacc::traits::GetValueType<type>::type, \
-                                          name,                                             \
-                                          pmacc::traits::GetNComponents<type>::value,       \
-                                          __VA_ARGS__))
+#define PMACC_C_VECTOR(type, name, ...)                                                                               \
+    (0, /* type id 0: selected by PMACC_PP_CREATE_C_VECTOR_DEF and PMACC_PP_CREATE_C_VECTOR_VARIABLE */               \
+     (typename pmacc::traits::GetValueType<type>::type,                                                               \
+      name,                                                                                                           \
+      pmacc::traits::GetNComponents<type>::value,                                                                     \
+      __VA_ARGS__)) // payload order: (type, name, dim, values...)
 
 
 /** create static const member vector that needs no memory inside of the struct
@@ -79,7 +81,7 @@
  *     static const Vector<float_64,simDim> center_SI = Vector<float_64,simDim>(1.134e-5, 1.134e-5, 1.134e-5);
  *   @endcode
  */
-#define PMACC_C_VECTOR_DIM(type,dim,name,...) (0,(type,name,dim,__VA_ARGS__))
+#define PMACC_C_VECTOR_DIM(type, dim, name, ...) (0, (type, name, dim, __VA_ARGS__)) // type id 0; dim goes after name
 
 /** create static constexpr member
  *
@@ -93,7 +95,7 @@
  *     static constexpr float_64 power_SI = float_64(2.0);
  *   @endcode
  */
-#define PMACC_C_VALUE(type,name,value) (1,(type,name,value))
+#define PMACC_C_VALUE(type, name, value) (1, (type, name, value)) // type id 1: static constexpr member
 
 /** create changeable member
  *
@@ -107,7 +109,7 @@
  *     float_64 power_SI(2.0);
  *   @endcode
  */
-#define PMACC_VALUE(type,name,initValue) (2,(type,name,initValue))
+#define PMACC_VALUE(type, name, initValue) (2, (type, name, initValue)) // type id 2: member plus ctor initializer
 
 
 /** create changeable member vector
@@ -122,7 +124,7 @@
  *     float2_64 center_SI(1.134e-5, 1.134e-5);
  *   @endcode
  */
-#define PMACC_VECTOR(type,name,...) (5,(type,name, type(__VA_ARGS__) ))
+#define PMACC_VECTOR(type, name, ...) (5, (type, name, type(__VA_ARGS__))) // type id 5: changeable vector member
 
 /** create changeable member vector
  *
@@ -137,14 +139,8 @@
  *     Vector<float_64,3> center_SI(1.134e-5, 1.134e-5, 1.134e-5);
  *   @endcode
  */
-#define PMACC_VECTOR_DIM(type,dim,name,...)                                    \
-        (5,                                                                    \
-         (                                                                     \
-          (pmacc::math::Vector<type,dim>),                                     \
-          name,                                                                \
-          pmacc::math::Vector<type,dim>(__VA_ARGS__)                           \
-         )                                                                     \
-        )
+#define PMACC_VECTOR_DIM(type, dim, name, ...) /* type id 5; the type is parenthesized to protect its comma */        \
+    (5, ((pmacc::math::Vector<type, dim>), name, pmacc::math::Vector<type, dim>(__VA_ARGS__)))
 
 /** create static const character string
  *
@@ -157,7 +153,7 @@
  *     static const char* filename = (char*)"fooFile.txt";
  *   @endcode
  */
-#define PMACC_C_STRING(name,initValue) (3,(_,name,initValue))
+#define PMACC_C_STRING(name, initValue) (3, (_, name, initValue)) // type id 3; _ fills the unused type slot
 
 /** create any code extension
  *
@@ -169,8 +165,7 @@
  *     typedef float FooFloat;
  *   @endcode
  */
-#define PMACC_EXTENT(...) (4,(_,_,__VA_ARGS__))
-
+#define PMACC_EXTENT(...) (4, (_, _, __VA_ARGS__)) // type id 4; both _ fill the unused type and name slots
 
 
 /** select member description
@@ -182,8 +177,8 @@
  * @return result of `(op def)` if `selectTypeID == typeID`
  *                   `( )`      else
  */
-#define PMACC_PP_X_SELECT_TYPEID(selectTypeID,op,typeID,def)                   \
-    BOOST_PP_IF( BOOST_PP_EQUAL(typeID,selectTypeID), (op def) , () )
+#define PMACC_PP_X_SELECT_TYPEID(selectTypeID, op, typeID, def)                                                       \
+    BOOST_PP_IF(BOOST_PP_EQUAL(typeID, selectTypeID), (op def), ()) // def is a parenthesized tuple: op def calls op
 
 /** select member description of a TypeMemberPair for a specific type id
  *
@@ -193,8 +188,8 @@
  * @return result of `op(secound(...))` if type is selected
  *                   `( )`              else
  */
-#define PMACC_PP_SELECT_TYPEID(typeID,op,...)                                  \
-    PMACC_PP_X_SELECT_TYPEID( typeID,op,PMACC_PP_DEFER_FIRST() __VA_ARGS__ ,PMACC_PP_DEFER_SECOND() __VA_ARGS__ )
+#define PMACC_PP_SELECT_TYPEID(typeID, op, ...) /* first expansion feeds typeID, second one feeds def */              \
+    PMACC_PP_X_SELECT_TYPEID(typeID, op, PMACC_PP_DEFER_FIRST() __VA_ARGS__, PMACC_PP_DEFER_SECOND() __VA_ARGS__)
 
 
 /** run macro which calls accessor on the given element
@@ -208,52 +203,47 @@
  *
  * @{
  */
-#define PMACC_PP_SEQ_MACRO_WITH_ACCESSOR(r,accessor,elem) PMACC_PP_REMOVE_PAREN( accessor(elem))
+#define PMACC_PP_SEQ_MACRO_WITH_ACCESSOR(r, accessor, elem) PMACC_PP_REMOVE_PAREN(accessor(elem)) // r is unused
 
-#define PMACC_PP_X_CREATE_C_VECTOR_DEF(data,type,name,dim,...) PMACC_CONST_VECTOR_DEF(type,dim,name,__VA_ARGS__);
-#define PMACC_PP_CREATE_C_VECTOR_DEF(elem)                                     \
-    PMACC_PP_SELECT_TYPEID( 0,PMACC_PP_X_CREATE_C_VECTOR_DEF, elem )
+#define PMACC_PP_X_CREATE_C_VECTOR_DEF(data, type, name, dim, ...)                                                    \
+    PMACC_CONST_VECTOR_DEF(type, dim, name, __VA_ARGS__); // data is unused; dim is passed before name
+#define PMACC_PP_CREATE_C_VECTOR_DEF(elem) PMACC_PP_SELECT_TYPEID(0, PMACC_PP_X_CREATE_C_VECTOR_DEF, elem)
 
-#define PMACC_PP_X_CREATE_C_VECTOR_VARIABLE(data,type,name,dim,...) const BOOST_PP_CAT(name,_t) name;
-#define PMACC_PP_CREATE_C_VECTOR_VARIABLE(elem)                                \
-    PMACC_PP_SELECT_TYPEID( 0,PMACC_PP_X_CREATE_C_VECTOR_VARIABLE, elem )
+#define PMACC_PP_X_CREATE_C_VECTOR_VARIABLE(data, type, name, dim, ...) const BOOST_PP_CAT(name, _t) name;
+#define PMACC_PP_CREATE_C_VECTOR_VARIABLE(elem) PMACC_PP_SELECT_TYPEID(0, PMACC_PP_X_CREATE_C_VECTOR_VARIABLE, elem)
 
-#define PMACC_PP_X_CREATE_VALUE_VARIABLE(data,type,name,...) type name;
-#define PMACC_PP_CREATE_VALUE_VARIABLE(elem)                                   \
-    PMACC_PP_SELECT_TYPEID( 2,PMACC_PP_X_CREATE_VALUE_VARIABLE, elem )
+#define PMACC_PP_X_CREATE_VALUE_VARIABLE(data, type, name, ...) type name; // declares the member; data is unused
+#define PMACC_PP_CREATE_VALUE_VARIABLE(elem) PMACC_PP_SELECT_TYPEID(2, PMACC_PP_X_CREATE_VALUE_VARIABLE, elem)
 
-#define PMACC_PP_X_CREATE_VALUE_VARIABLE_WITH_PAREN(data,type,name,...) PMACC_PP_REMOVE_PAREN(type) name;
-#define PMACC_PP_CREATE_VALUE_VARIABLE_WITH_PAREN(elem)                        \
-    PMACC_PP_SELECT_TYPEID( 5,PMACC_PP_X_CREATE_VALUE_VARIABLE_WITH_PAREN, elem )
+#define PMACC_PP_X_CREATE_VALUE_VARIABLE_WITH_PAREN(data, type, name, ...) PMACC_PP_REMOVE_PAREN(type) name;
+#define PMACC_PP_CREATE_VALUE_VARIABLE_WITH_PAREN(elem) /* type id 5: PMACC_VECTOR and PMACC_VECTOR_DIM entries */    \
+    PMACC_PP_SELECT_TYPEID(5, PMACC_PP_X_CREATE_VALUE_VARIABLE_WITH_PAREN, elem)
 
-#define PMACC_PP_X_CREATE_C_VALUE_VARIABLE(data,type,name,...) static constexpr type name = __VA_ARGS__;
-#define PMACC_PP_CREATE_C_VALUE_VARIABLE(elem)                                 \
-    PMACC_PP_SELECT_TYPEID( 1,PMACC_PP_X_CREATE_C_VALUE_VARIABLE,elem )
+#define PMACC_PP_X_CREATE_C_VALUE_VARIABLE(data, type, name, ...) static constexpr type name = __VA_ARGS__;
+#define PMACC_PP_CREATE_C_VALUE_VARIABLE(elem) PMACC_PP_SELECT_TYPEID(1, PMACC_PP_X_CREATE_C_VALUE_VARIABLE, elem)
 
 
-#define PMACC_PP_X1_INIT_VALUE_VARIABLE(data,type,name,...) (name(__VA_ARGS__))
-#define PMACC_PP_X_INIT_VALUE_VARIABLE(elem)                                   \
-    PMACC_PP_SELECT_TYPEID( 2,PMACC_PP_X1_INIT_VALUE_VARIABLE,elem )
+#define PMACC_PP_X1_INIT_VALUE_VARIABLE(data, type, name, ...) (name(__VA_ARGS__)) // one constructor initializer
+#define PMACC_PP_X_INIT_VALUE_VARIABLE(elem) PMACC_PP_SELECT_TYPEID(2, PMACC_PP_X1_INIT_VALUE_VARIABLE, elem)
 
-#define PMACC_PP_X_INIT_VALUE_VARIABLE_WITH_PAREN(elem)                        \
-    PMACC_PP_SELECT_TYPEID( 5,PMACC_PP_X1_INIT_VALUE_VARIABLE,elem )
+#define PMACC_PP_X_INIT_VALUE_VARIABLE_WITH_PAREN(elem)                                                               \
+    PMACC_PP_SELECT_TYPEID(5, PMACC_PP_X1_INIT_VALUE_VARIABLE, elem) // reuses the type id 2 initializer generator
 
-#define PMACC_PP_X_CREATE_C_STRING_VARIABLE(data,type,name,...) static constexpr const char* name = __VA_ARGS__;
-#define PMACC_PP_CREATE_C_STRING_VARIABLE(elem)                                \
-    PMACC_PP_SELECT_TYPEID( 3,PMACC_PP_X_CREATE_C_STRING_VARIABLE, elem )
+#define PMACC_PP_X_CREATE_C_STRING_VARIABLE(data, type, name, ...) static constexpr const char* name = __VA_ARGS__;
+#define PMACC_PP_CREATE_C_STRING_VARIABLE(elem) PMACC_PP_SELECT_TYPEID(3, PMACC_PP_X_CREATE_C_STRING_VARIABLE, elem)
 
-#define PMACC_PP_X_CREATE_EXTENT(data,type,name,...) __VA_ARGS__
-#define PMACC_PP_CREATE_EXTENT(elem)                                           \
-    PMACC_PP_SELECT_TYPEID( 4,PMACC_PP_X_CREATE_EXTENT,elem )
+#define PMACC_PP_X_CREATE_EXTENT(data, type, name, ...) __VA_ARGS__ // pastes the user code unchanged
+#define PMACC_PP_CREATE_EXTENT(elem) PMACC_PP_SELECT_TYPEID(4, PMACC_PP_X_CREATE_EXTENT, elem)
 
-#define PMACC_PP_X1_ADD_DATA_TO_TYPEDESCRIPTION_MACRO(data,first,second) ((first,(data,PMACC_PP_REMOVE_PAREN(second))))
-#define PMACC_PP_X_ADD_DATA_TO_TYPEDESCRIPTION_MACRO(data,value) \
-    PMACC_PP_CALL(PMACC_PP_X1_ADD_DATA_TO_TYPEDESCRIPTION_MACRO,(data,value))
+#define PMACC_PP_X1_ADD_DATA_TO_TYPEDESCRIPTION_MACRO(data, first, second)                                            \
+    ((first, (data, PMACC_PP_REMOVE_PAREN(second)))) // data becomes the leading field of the payload tuple
+#define PMACC_PP_X_ADD_DATA_TO_TYPEDESCRIPTION_MACRO(data, value)                                                     \
+    PMACC_PP_CALL(PMACC_PP_X1_ADD_DATA_TO_TYPEDESCRIPTION_MACRO, (data, value))
 
 /** @} */
 
-#define PMACC_PP_ADD_DATA_TO_TYPEDESCRIPTION_MACRO(r,data,elem)                \
-    PMACC_PP_X_ADD_DATA_TO_TYPEDESCRIPTION_MACRO(data,PMACC_PP_REMOVE_PAREN(elem))
+#define PMACC_PP_ADD_DATA_TO_TYPEDESCRIPTION_MACRO(r, data, elem)                                                     \
+    PMACC_PP_X_ADD_DATA_TO_TYPEDESCRIPTION_MACRO(data, PMACC_PP_REMOVE_PAREN(elem)) // r is unused
 
 /** create constructor initialization of non static variables
  *
@@ -261,14 +251,10 @@
  *
  * @param ... preprocessor sequence with TypeMemberPair's to inherit from
  */
-#define PMACC_PP_INIT_VALUE_VARIABLES(op,emptyStruct,...)                                  \
-    PMACC_PP_DEFER_REMOVE_PAREN() (                                            \
-        BOOST_PP_EXPAND(                                                       \
-          BOOST_PP_SEQ_TO_TUPLE (                                              \
-            BOOST_PP_SEQ_FOR_EACH(PMACC_PP_SEQ_MACRO_WITH_ACCESSOR,op,__VA_ARGS__ emptyStruct) \
-          )                                                                    \
-        )                                                                      \
-    )
+#define PMACC_PP_INIT_VALUE_VARIABLES(op, emptyStruct, ...) /* emptyStruct is appended as last sequence element */    \
+    PMACC_PP_DEFER_REMOVE_PAREN()                                                                                     \
+    (BOOST_PP_EXPAND(                                                                                                 \
+        BOOST_PP_SEQ_TO_TUPLE(BOOST_PP_SEQ_FOR_EACH(PMACC_PP_SEQ_MACRO_WITH_ACCESSOR, op, __VA_ARGS__ emptyStruct))))
 
 /** generate the definition of a struct
  *
@@ -276,26 +262,44 @@
  * @param name name of the struct
  * @param ... preprocessor sequence with TypeMemberPair's
  */
-#define PMACC_PP_STRUCT_DEF(namespace_name,name,...)                           \
-namespace namespace_name{                                                      \
-    BOOST_PP_SEQ_FOR_EACH(PMACC_PP_SEQ_MACRO_WITH_ACCESSOR,PMACC_PP_CREATE_C_VECTOR_DEF,__VA_ARGS__) \
-    struct EmptyStruct{};                                                      \
-    struct EmptyStruct2{};                                                     \
-    struct name : private EmptyStruct, private EmptyStruct2 {                  \
-        name():                                                                \
-        PMACC_PP_INIT_VALUE_VARIABLES(PMACC_PP_X_INIT_VALUE_VARIABLE,((2,(a,b,EmptyStruct))),__VA_ARGS__),            \
-        PMACC_PP_INIT_VALUE_VARIABLES(PMACC_PP_X_INIT_VALUE_VARIABLE_WITH_PAREN,((5,(a,b,EmptyStruct2))),__VA_ARGS__) \
-        {}                                                                     \
-                                                                               \
-        BOOST_PP_SEQ_FOR_EACH(PMACC_PP_SEQ_MACRO_WITH_ACCESSOR,PMACC_PP_CREATE_C_VALUE_VARIABLE,__VA_ARGS__)  \
-        BOOST_PP_SEQ_FOR_EACH(PMACC_PP_SEQ_MACRO_WITH_ACCESSOR,PMACC_PP_CREATE_VALUE_VARIABLE,__VA_ARGS__)    \
-        BOOST_PP_SEQ_FOR_EACH(PMACC_PP_SEQ_MACRO_WITH_ACCESSOR,PMACC_PP_CREATE_C_VECTOR_VARIABLE,__VA_ARGS__) \
-        BOOST_PP_SEQ_FOR_EACH(PMACC_PP_SEQ_MACRO_WITH_ACCESSOR,PMACC_PP_CREATE_C_STRING_VARIABLE,__VA_ARGS__) \
-        BOOST_PP_SEQ_FOR_EACH(PMACC_PP_SEQ_MACRO_WITH_ACCESSOR,PMACC_PP_CREATE_EXTENT,__VA_ARGS__)            \
-        BOOST_PP_SEQ_FOR_EACH(PMACC_PP_SEQ_MACRO_WITH_ACCESSOR,PMACC_PP_CREATE_VALUE_VARIABLE_WITH_PAREN,__VA_ARGS__) \
-        };                                                                     \
-}  /*namespace*/                                                               \
-using namespace_name::name
+#define PMACC_PP_STRUCT_DEF(namespace_name, name, ...)                                                                \
+    namespace namespace_name                                                                                          \
+    {                                                                                                                 \
+        BOOST_PP_SEQ_FOR_EACH(PMACC_PP_SEQ_MACRO_WITH_ACCESSOR, PMACC_PP_CREATE_C_VECTOR_DEF, __VA_ARGS__)            \
+        struct EmptyStruct /* sentinel base: guarantees one entry in the type id 2 initializer list */                \
+        {                                                                                                             \
+        };                                                                                                            \
+        struct EmptyStruct2 /* sentinel base: guarantees one entry in the type id 5 initializer list */               \
+        {                                                                                                             \
+        };                                                                                                            \
+        struct name                                                                                                   \
+            : private EmptyStruct                                                                                     \
+            , private EmptyStruct2                                                                                    \
+        {                                                                                                             \
+            name()                                                                                                    \
+                : PMACC_PP_INIT_VALUE_VARIABLES(                                                                      \
+                    PMACC_PP_X_INIT_VALUE_VARIABLE,                                                                   \
+                    ((2, (a, b, EmptyStruct))), /* sentinel entry; a and b are unused placeholders */                 \
+                    __VA_ARGS__)                                                                                      \
+                , PMACC_PP_INIT_VALUE_VARIABLES(                                                                      \
+                      PMACC_PP_X_INIT_VALUE_VARIABLE_WITH_PAREN,                                                      \
+                      ((5, (a, b, EmptyStruct2))),                                                                    \
+                      __VA_ARGS__)                                                                                    \
+            {                                                                                                         \
+            }                                                                                                         \
+                                                                                                                      \
+            BOOST_PP_SEQ_FOR_EACH(PMACC_PP_SEQ_MACRO_WITH_ACCESSOR, PMACC_PP_CREATE_C_VALUE_VARIABLE, __VA_ARGS__)    \
+            BOOST_PP_SEQ_FOR_EACH(PMACC_PP_SEQ_MACRO_WITH_ACCESSOR, PMACC_PP_CREATE_VALUE_VARIABLE, __VA_ARGS__)      \
+            BOOST_PP_SEQ_FOR_EACH(PMACC_PP_SEQ_MACRO_WITH_ACCESSOR, PMACC_PP_CREATE_C_VECTOR_VARIABLE, __VA_ARGS__)   \
+            BOOST_PP_SEQ_FOR_EACH(PMACC_PP_SEQ_MACRO_WITH_ACCESSOR, PMACC_PP_CREATE_C_STRING_VARIABLE, __VA_ARGS__)   \
+            BOOST_PP_SEQ_FOR_EACH(PMACC_PP_SEQ_MACRO_WITH_ACCESSOR, PMACC_PP_CREATE_EXTENT, __VA_ARGS__)              \
+            BOOST_PP_SEQ_FOR_EACH(                                                                                    \
+                PMACC_PP_SEQ_MACRO_WITH_ACCESSOR,                                                                     \
+                PMACC_PP_CREATE_VALUE_VARIABLE_WITH_PAREN,                                                            \
+                __VA_ARGS__)                                                                                          \
+        };                                                                                                            \
+    } /* namespace namespace_name */                                                                                  \
+    using namespace_name::name // no trailing semicolon here; the PMACC_STRUCT(...) invocation supplies it
 
 
 /** add data to TypeMemberPair's
@@ -305,7 +309,8 @@ using namespace_name::name
  * @param data any data which should be added to the TypeMemberPair's
  * @param ... preprocessor sequence with TypeMemberPair's
  */
-#define PMACC_PP_ADD_DATA_TO_TYPEDESCRIPTION(data,...) BOOST_PP_SEQ_FOR_EACH(PMACC_PP_ADD_DATA_TO_TYPEDESCRIPTION_MACRO,data,__VA_ARGS__)
+#define PMACC_PP_ADD_DATA_TO_TYPEDESCRIPTION(data, ...)                                                               \
+    BOOST_PP_SEQ_FOR_EACH(PMACC_PP_ADD_DATA_TO_TYPEDESCRIPTION_MACRO, data, __VA_ARGS__)
 
 /** generate a struct with static and dynamic members
  *
@@ -334,5 +339,8 @@ using namespace_name::name
  * );
  * @endcode
  */
-#define PMACC_STRUCT(name,...)                                                 \
-    PMACC_PP_STRUCT_DEF(BOOST_PP_CAT(BOOST_PP_CAT(pmacc_,name),__COUNTER__),name,PMACC_PP_ADD_DATA_TO_TYPEDESCRIPTION(name,__VA_ARGS__))
+#define PMACC_STRUCT(name, ...)                                                                                       \
+    PMACC_PP_STRUCT_DEF(                                                                                              \
+        BOOST_PP_CAT(BOOST_PP_CAT(pmacc_, name), __COUNTER__),                                                        \
+        name,                                                                                                         \
+        PMACC_PP_ADD_DATA_TO_TYPEDESCRIPTION(name, __VA_ARGS__))
diff --git a/include/pmacc/random/RNGHandle.hpp b/include/pmacc/random/RNGHandle.hpp
index bbf2cd9403..a788f00349 100644
--- a/include/pmacc/random/RNGHandle.hpp
+++ b/include/pmacc/random/RNGHandle.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2015-2020 Alexander Grund
+/* Copyright 2015-2021 Alexander Grund
  *
  * This file is part of PMacc.
  *
@@ -27,76 +27,71 @@
 
 namespace pmacc
 {
-namespace random
-{
-
-    /**
-     * A reference to a state of a RNG provider
-     */
-    template<class T_RNGProvider>
-    struct RNGHandle
+    namespace random
     {
-        typedef T_RNGProvider RNGProvider;
-        static constexpr uint32_t rngDim = RNGProvider::dim;
-        typedef typename RNGProvider::DataBoxType RNGBox;
-        typedef typename RNGProvider::RNGMethod RNGMethod;
-        typedef typename RNGMethod::StateType RNGState;
-        typedef pmacc::DataSpace<rngDim> RNGSpace;
-
-        template<class T_Distribution>
-        struct GetRandomType
-        {
-            typedef typename T_Distribution::template applyMethod<RNGMethod>::type Distribution;
-            typedef Random<Distribution, RNGMethod, RNGState*> type;
-        };
-
-        /**
-         * Creates an instance of the functor
-         *
-         * @param rngBox Databox of the RNG provider
-         */
-        RNGHandle(const RNGBox& rngBox): m_rngBox(rngBox)
-        {}
-
         /**
-         * Initializes this instance
-         *
-         * \param cellIdx index into the underlying RNG provider
+         * A reference to a state of a RNG provider
          */
-        HDINLINE void
-        init(const RNGSpace& cellIdx)
+        template<class T_RNGProvider>
+        struct RNGHandle
         {
-            m_rngBox = m_rngBox.shift(cellIdx);
-        }
+            typedef T_RNGProvider RNGProvider;
+            static constexpr uint32_t rngDim = RNGProvider::dim;
+            typedef typename RNGProvider::DataBoxType RNGBox;
+            typedef typename RNGProvider::RNGMethod RNGMethod;
+            typedef typename RNGMethod::StateType RNGState;
+            typedef pmacc::DataSpace<rngDim> RNGSpace;
 
-        HDINLINE RNGState&
-        getState()
-        {
-            return m_rngBox(RNGSpace::create(0));
-        }
+            template<class T_Distribution>
+            struct GetRandomType
+            {
+                typedef typename T_Distribution::template applyMethod<RNGMethod>::type Distribution;
+                typedef Random<Distribution, RNGMethod, RNGState*> type;
+            };
 
-        HDINLINE RNGState&
-        operator*()
-        {
-            return m_rngBox(RNGSpace::create(0));
-        }
+            /**
+             * Creates an instance of the functor
+             *
+             * @param rngBox Databox of the RNG provider
+             */
+            RNGHandle(const RNGBox& rngBox) : m_rngBox(rngBox)
+            {
+            }
 
-        HDINLINE RNGState&
-        operator->()
-        {
-            return m_rngBox(RNGSpace::create(0));
-        }
+            /**
+             * Initializes this instance
+             *
+             * \param cellIdx index into the underlying RNG provider
+             */
+            HDINLINE void init(const RNGSpace& cellIdx)
+            {
+                m_rngBox = m_rngBox.shift(cellIdx);
+            }
 
-        template<class T_Distribution>
-        HDINLINE typename GetRandomType<T_Distribution>::type
-        applyDistribution()
-        {
-            return typename GetRandomType<T_Distribution>::type(&getState());
-        }
+            HDINLINE RNGState& getState()
+            {
+                return m_rngBox(RNGSpace::create(0));
+            }
 
-    protected:
-        PMACC_ALIGN8(m_rngBox, RNGBox);
-    };
+            HDINLINE RNGState& operator*()
+            {
+                return m_rngBox(RNGSpace::create(0));
+            }
+
+            HDINLINE RNGState& operator->()
+            {
+                return m_rngBox(RNGSpace::create(0));
+            }
+
+            template<class T_Distribution>
+            HDINLINE typename GetRandomType<T_Distribution>::type applyDistribution()
+            {
+                return typename GetRandomType<T_Distribution>::type(&getState());
+            }
+
+        protected:
+            PMACC_ALIGN8(m_rngBox, RNGBox);
+        };
 
-}  // namespace random
-}  // namespace pmacc
+    } // namespace random
+} // namespace pmacc
diff --git a/include/pmacc/random/RNGProvider.hpp b/include/pmacc/random/RNGProvider.hpp
index 14cc7e75e1..744b98c9b9 100644
--- a/include/pmacc/random/RNGProvider.hpp
+++ b/include/pmacc/random/RNGProvider.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2015-2020 Alexander Grund
+/* Copyright 2015-2021 Alexander Grund
  *
  * This file is part of PMacc.
  *
@@ -29,101 +29,99 @@
 
 namespace pmacc
 {
-namespace random
-{
-
-    /**
-     * Provider of a per cell random number generator
-     *
-     * \tparam T_dim Number of dimensions of the grid
-     * \tparam T_RNGMethod Method to use for random number generation
-     */
-    template<uint32_t T_dim, class T_RNGMethod>
-    class RNGProvider : public ISimulationData
+    namespace random
     {
-    public:
-        static constexpr uint32_t dim = T_dim;
-        typedef T_RNGMethod RNGMethod;
-        typedef DataSpace<dim> Space;
-
-    private:
-        typedef typename RNGMethod::StateType RNGState;
-
-    public:
-        typedef HostDeviceBuffer< RNGState, dim > Buffer;
-        typedef typename Buffer::DataBoxType DataBoxType;
-        typedef RNGHandle<RNGProvider> Handle;
-
-        template<class T_Distribution>
-        struct GetRandomType
-        {
-            typedef typename T_Distribution::template applyMethod<RNGMethod>::type Distribution;
-            typedef Random<Distribution, RNGMethod, Handle> type;
-        };
-
         /**
-         * Create the RNGProvider and allocate memory for the given size
+         * Provider of a per cell random number generator
          *
-         * @param size Size of the grid for which RNGs should be provided
-         * @param uniqueId Unique ID for this instance. If none is given the default
-         *          (as returned by \ref getName()) is used
+         * \tparam T_dim Number of dimensions of the grid
+         * \tparam T_RNGMethod Method to use for random number generation
          */
-        RNGProvider(const Space& size, const std::string& uniqueId = "");
-        virtual ~RNGProvider()
+        template<uint32_t T_dim, class T_RNGMethod>
+        class RNGProvider : public ISimulationData
         {
-            __delete(buffer)
-        }
-        /**
-         * Initializes the random number generators
-         * Must be called before usage
-         * @param seed Base seed to be used
-         */
-        void init(uint32_t seed);
+        public:
+            static constexpr uint32_t dim = T_dim;
+            typedef T_RNGMethod RNGMethod;
+            typedef DataSpace<dim> Space;
 
-        /**
-         * Factory method
-         * Creates a handle to a state that can be used to create actual RNGs
-         *
-         * @param id SimulationDataId of the RNGProvider to use. Defaults to the default Id of the type
-         */
-        static Handle
-        createHandle(const std::string& id = getName());
+        private:
+            typedef typename RNGMethod::StateType RNGState;
 
-        /**
-         * Factory method
-         * Creates functor that creates random numbers with a given distribution
-         * Similar to the Handle but can be used directly
-         *
-         * @param id SimulationDataId of the RNGProvider to use. Defaults to the default Id of the type
-         */
-        template<class T_Distribution>
-        static typename GetRandomType<T_Distribution>::type
-        createRandom(const std::string& id = getName());
+        public:
+            typedef HostDeviceBuffer<RNGState, dim> Buffer;
+            typedef typename Buffer::DataBoxType DataBoxType;
+            typedef RNGHandle<RNGProvider> Handle;
 
-        /**
-         * Returns the default id for this type
-         */
-        static std::string getName();
-        SimulationDataId getUniqueId() override;
-        void synchronize() override;
+            template<class T_Distribution>
+            struct GetRandomType
+            {
+                typedef typename T_Distribution::template applyMethod<RNGMethod>::type Distribution;
+                typedef Random<Distribution, RNGMethod, Handle> type;
+            };
 
-        /**
-         * Return a reference to the buffer containing the states
-         * Note: This buffer might be empty
-         */
-        Buffer& getStateBuffer();
-    private:
-        /**
-         * Gets the device data box
-         */
-        DataBoxType getDeviceDataBox();
+            /**
+             * Create the RNGProvider and allocate memory for the given size
+             *
+             * @param size Size of the grid for which RNGs should be provided
+             * @param uniqueId Unique ID for this instance. If none is given the default
+             *          (as returned by \ref getName()) is used
+             */
+            RNGProvider(const Space& size, const std::string& uniqueId = "");
+            virtual ~RNGProvider()
+            {
+                __delete(buffer)
+            }
+            /**
+             * Initializes the random number generators
+             * Must be called before usage
+             * @param seed Base seed to be used
+             */
+            void init(uint32_t seed);
+
+            /**
+             * Factory method
+             * Creates a handle to a state that can be used to create actual RNGs
+             *
+             * @param id SimulationDataId of the RNGProvider to use. Defaults to the default Id of the type
+             */
+            static Handle createHandle(const std::string& id = getName());
+
+            /**
+             * Factory method
+             * Creates functor that creates random numbers with a given distribution
+             * Similar to the Handle but can be used directly
+             *
+             * @param id SimulationDataId of the RNGProvider to use. Defaults to the default Id of the type
+             */
+            template<class T_Distribution>
+            static typename GetRandomType<T_Distribution>::type createRandom(const std::string& id = getName());
+
+            /**
+             * Returns the default id for this type
+             */
+            static std::string getName();
+            SimulationDataId getUniqueId() override;
+            void synchronize() override;
 
-        const Space m_size;
-        Buffer* buffer;
-        const std::string m_uniqueId;
-    };
+            /**
+             * Return a reference to the buffer containing the states
+             * Note: This buffer might be empty
+             */
+            Buffer& getStateBuffer();
+
+        private:
+            /**
+             * Gets the device data box
+             */
+            DataBoxType getDeviceDataBox();
+
+            const Space m_size;
+            Buffer* buffer;
+            const std::string m_uniqueId;
+        };
 
-}  // namespace random
-}  // namespace pmacc
+    } // namespace random
+} // namespace pmacc
 
 #include "pmacc/random/RNGProvider.tpp"
diff --git a/include/pmacc/random/RNGProvider.tpp b/include/pmacc/random/RNGProvider.tpp
index 11dbae9992..0da2d080a3 100644
--- a/include/pmacc/random/RNGProvider.tpp
+++ b/include/pmacc/random/RNGProvider.tpp
@@ -1,4 +1,4 @@
-/* Copyright 2015-2020 Alexander Grund
+/* Copyright 2015-2021 Alexander Grund
  *
  * This file is part of PMacc.
  *
@@ -33,163 +33,115 @@
 
 namespace pmacc
 {
-namespace random
-{
-
-    namespace kernel {
-
-        template<
-            uint32_t T_numWorkers,
-            uint32_t T_blockSize,
-            typename T_RNGMethod
-        >
-        struct InitRNGProvider
+    namespace random
+    {
+        namespace kernel
         {
-            template<
-                typename T_RNGBox,
-                typename T_Space,
-                typename T_Acc
-            >
-            DINLINE void
-            operator()(
-                T_Acc const & acc,
-                T_RNGBox rngBox,
-                uint32_t seed,
-                const T_Space size
-            ) const
+            template<uint32_t T_numWorkers, uint32_t T_blockSize, typename T_RNGMethod>
+            struct InitRNGProvider
             {
-                using namespace mappings::threads;
-
-                constexpr uint32_t numWorkers = T_numWorkers;
-                uint32_t const workerIdx = threadIdx.x;
-
-                using SupercellDomCfg = IdxConfig<
-                    T_blockSize,
-                    numWorkers
-                >;
-
-                // each virtual worker initialize one rng state
-                ForEachIdx< SupercellDomCfg > forEachCell( workerIdx );
-
-                forEachCell(
-                    [&](
-                        uint32_t const linearIdx,
-                        uint32_t const
-                    )
-                    {
-                        uint32_t const linearTid = blockIdx.x * T_blockSize + linearIdx;
-                        if( linearTid >= size.productOfComponents() )
+                template<typename T_RNGBox, typename T_Space, typename T_Acc>
+                DINLINE void operator()(T_Acc const& acc, T_RNGBox rngBox, uint32_t seed, const T_Space size) const
+                {
+                    using namespace mappings::threads;
+
+                    constexpr uint32_t numWorkers = T_numWorkers;
+                    uint32_t const workerIdx = cupla::threadIdx(acc).x;
+
+                    using SupercellDomCfg = IdxConfig<T_blockSize, numWorkers>;
+
+                    // each virtual worker initializes one RNG state
+                    ForEachIdx<SupercellDomCfg> forEachCell(workerIdx);
+
+                    forEachCell([&](uint32_t const linearIdx, uint32_t const) {
+                        uint32_t const linearTid = cupla::blockIdx(acc).x * T_blockSize + linearIdx;
+                        if(linearTid >= size.productOfComponents())
                             return;
 
-                        T_Space const cellIdx = DataSpaceOperations< T_Space::dim >::map(size, linearTid);
-                        T_RNGMethod().init(
-                            acc,
-                            rngBox( cellIdx ),
-                            seed,
-                            linearTid
-                        );
-                    }
-                );
-            }
-        };
-
-    }  // namespace kernel
-
-    template<uint32_t T_dim, class T_RNGMethod>
-    RNGProvider<T_dim, T_RNGMethod>::RNGProvider(const Space& size, const std::string& uniqueId):
-            m_size(size), m_uniqueId(uniqueId.empty() ? getName() : uniqueId),
-            buffer(new Buffer(size))
-    {
-        if(m_size.productOfComponents() == 0)
-            throw std::invalid_argument("Cannot create RNGProvider with zero size");
-    }
+                        T_Space const cellIdx = DataSpaceOperations<T_Space::dim>::map(size, linearTid);
+                        T_RNGMethod().init(acc, rngBox(cellIdx), seed, linearTid);
+                    });
+                }
+            };
 
-    template<uint32_t T_dim, class T_RNGMethod>
-    void RNGProvider<T_dim, T_RNGMethod>::init(uint32_t seed)
-    {
+        } // namespace kernel
 
-        const uint32_t blockSize = 256;
-
-        constexpr uint32_t numWorkers = pmacc::traits::GetNumWorkers<
-            blockSize
-        >::value;
-
-        const uint32_t gridSize = (m_size.productOfComponents() + blockSize - 1u) / blockSize; // Round up
-
-        auto bufferBox = buffer->getDeviceBuffer().getDataBox();
-
-        PMACC_KERNEL(
-            kernel::InitRNGProvider<
-                numWorkers,
-                blockSize,
-                RNGMethod>{}
-        )(
-            gridSize,
-            numWorkers
-        )(
-            bufferBox,
-            seed,
-            m_size
-        );
-    }
-
-    template<uint32_t T_dim, class T_RNGMethod>
-    typename RNGProvider<T_dim, T_RNGMethod>::Handle
-    RNGProvider<T_dim, T_RNGMethod>::createHandle(const std::string& id)
-    {
-        auto provider =
-            Environment<>::get().DataConnector().get< RNGProvider >( id, true );
-        Handle result( provider->getDeviceDataBox() );
-        Environment<>::get().DataConnector().releaseData( id );
-        return result;
-    }
-
-    template<uint32_t T_dim, class T_RNGMethod>
-    template<class T_Distribution>
-    typename RNGProvider<T_dim, T_RNGMethod>::template GetRandomType<T_Distribution>::type
-    RNGProvider<T_dim, T_RNGMethod>::createRandom(const std::string& id)
-    {
-        typedef typename GetRandomType<T_Distribution>::type ResultType;
-        return ResultType(createHandle());
-    }
+        template<uint32_t T_dim, class T_RNGMethod>
+        RNGProvider<T_dim, T_RNGMethod>::RNGProvider(const Space& size, const std::string& uniqueId)
+            : m_size(size)
+            , buffer(new Buffer(size)) // initializers follow the member declaration order
+            , m_uniqueId(uniqueId.empty() ? getName() : uniqueId)
+        {
+            if(m_size.productOfComponents() == 0)
+                throw std::invalid_argument("Cannot create RNGProvider with zero size");
+        }
 
-    template<uint32_t T_dim, class T_RNGMethod>
-    typename RNGProvider<T_dim, T_RNGMethod>::Buffer&
-    RNGProvider<T_dim, T_RNGMethod>::getStateBuffer()
-    {
-        return *buffer;
-    }
+        template<uint32_t T_dim, class T_RNGMethod>
+        void RNGProvider<T_dim, T_RNGMethod>::init(uint32_t seed)
+        {
+            const uint32_t blockSize = 256;
 
-    template<uint32_t T_dim, class T_RNGMethod>
-    typename RNGProvider<T_dim, T_RNGMethod>::DataBoxType
-    RNGProvider<T_dim, T_RNGMethod>::getDeviceDataBox()
-    {
-        return buffer->getDeviceBuffer().getDataBox();
-    }
+            constexpr uint32_t numWorkers = pmacc::traits::GetNumWorkers<blockSize>::value;
 
-    template<uint32_t T_dim, class T_RNGMethod>
-    std::string
-    RNGProvider<T_dim, T_RNGMethod>::getName()
-    {
-        /* generate a unique name (for this type!) to use as a default ID */
-        return std::string("RNGProvider")
-                + char('0' + dim) /* valid for 0..9 */
+            const uint32_t gridSize = (m_size.productOfComponents() + blockSize - 1u) / blockSize; // Round up
+
+            auto bufferBox = buffer->getDeviceBuffer().getDataBox();
+
+            PMACC_KERNEL(kernel::InitRNGProvider<numWorkers, blockSize, RNGMethod>{})
+            (gridSize, numWorkers)(bufferBox, seed, m_size);
+        }
+
+        template<uint32_t T_dim, class T_RNGMethod>
+        typename RNGProvider<T_dim, T_RNGMethod>::Handle RNGProvider<T_dim, T_RNGMethod>::createHandle(
+            const std::string& id)
+        {
+            auto provider = Environment<>::get().DataConnector().get<RNGProvider>(id, true);
+            Handle result(provider->getDeviceDataBox());
+            Environment<>::get().DataConnector().releaseData(id);
+            return result;
+        }
+
+        template<uint32_t T_dim, class T_RNGMethod>
+        template<class T_Distribution>
+        typename RNGProvider<T_dim, T_RNGMethod>::template GetRandomType<T_Distribution>::type RNGProvider<
+            T_dim,
+            T_RNGMethod>::createRandom(const std::string& id)
+        {
+            // Forward `id` so the handle comes from the requested provider, not always the default one.
+            return typename GetRandomType<T_Distribution>::type(createHandle(id));
+        }
+
+        template<uint32_t T_dim, class T_RNGMethod>
+        typename RNGProvider<T_dim, T_RNGMethod>::Buffer& RNGProvider<T_dim, T_RNGMethod>::getStateBuffer()
+        {
+            return *buffer;
+        }
+
+        template<uint32_t T_dim, class T_RNGMethod>
+        typename RNGProvider<T_dim, T_RNGMethod>::DataBoxType RNGProvider<T_dim, T_RNGMethod>::getDeviceDataBox()
+        {
+            return buffer->getDeviceBuffer().getDataBox();
+        }
+
+        template<uint32_t T_dim, class T_RNGMethod>
+        std::string RNGProvider<T_dim, T_RNGMethod>::getName()
+        {
+            /* generate a unique name (for this type!) to use as a default ID */
+            return std::string("RNGProvider") + char('0' + dim) /* valid for 0..9 */
                 + RNGMethod::getName();
-    }
+        }
 
-    template<uint32_t T_dim, class T_RNGMethod>
-    SimulationDataId
-    RNGProvider<T_dim, T_RNGMethod>::getUniqueId()
-    {
-        return m_uniqueId;
-    }
+        template<uint32_t T_dim, class T_RNGMethod>
+        SimulationDataId RNGProvider<T_dim, T_RNGMethod>::getUniqueId()
+        {
+            return m_uniqueId;
+        }
 
-    template<uint32_t T_dim, class T_RNGMethod>
-    void
-    RNGProvider<T_dim, T_RNGMethod>::synchronize()
-    {
-        buffer->deviceToHost();
-    }
+        template<uint32_t T_dim, class T_RNGMethod>
+        void RNGProvider<T_dim, T_RNGMethod>::synchronize()
+        {
+            buffer->deviceToHost();
+        }
 
-}  // namespace random
-}  // namespace pmacc
+    } // namespace random
+} // namespace pmacc
diff --git a/include/pmacc/random/RNGState.hpp b/include/pmacc/random/RNGState.hpp
index 7be9137f10..8416aa1c51 100644
--- a/include/pmacc/random/RNGState.hpp
+++ b/include/pmacc/random/RNGState.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2015-2020 Alexander Grund
+/* Copyright 2015-2021 Alexander Grund
  *
  * This file is part of PMacc.
  *
@@ -25,34 +25,35 @@
 
 namespace pmacc
 {
-namespace random
-{
-
-    /**
-     * Wrapper class for a state of a random number generator
-     * Can be used for aligned storing of states
-     */
-    template<class T_RNGMethod>
-    class RNGState
+    namespace random
     {
-    public:
-        typedef T_RNGMethod RNGMethod;
-        typedef typename RNGMethod::StateType StateType;
+        /**
+         * Wrapper class for a state of a random number generator
+         * Can be used for aligned storing of states
+         */
+        template<class T_RNGMethod>
+        class RNGState
+        {
+        public:
+            typedef T_RNGMethod RNGMethod;
+            typedef typename RNGMethod::StateType StateType;
 
-        HDINLINE RNGState()
-        {}
+            HDINLINE RNGState()
+            {
+            }
 
-        HDINLINE RNGState(const StateType& other): state(other)
-        {}
+            HDINLINE RNGState(const StateType& other) : state(other)
+            {
+            }
 
-        HDINLINE StateType&
-        getState()
-        {
-            return state;
-        }
-    private:
-        PMACC_ALIGN8(StateType) state;
-    };
-
-}  // namespace random
-}  // namespace pmacc
+            HDINLINE StateType& getState()
+            {
+                return state;
+            }
+
+        private:
+            PMACC_ALIGN8(state, StateType); // (name, type) order, as in RNGHandle's PMACC_ALIGN8(m_rngBox, RNGBox)
+        };
+
+    } // namespace random
+} // namespace pmacc
diff --git a/include/pmacc/random/Random.hpp b/include/pmacc/random/Random.hpp
index 3451628876..b59ddf7007 100644
--- a/include/pmacc/random/Random.hpp
+++ b/include/pmacc/random/Random.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2015-2020 Alexander Grund
+/* Copyright 2015-2021 Alexander Grund
  *
  * This file is part of PMacc.
  *
@@ -27,91 +27,81 @@
 
 namespace pmacc
 {
-namespace random
-{
-
-    /**
-     * Random Number Generator. Functor that returns a random number per call
-     *
-     * Default implementation assumes a RNGHandle
-     */
-    template<
-        class T_Distribution,
-        class T_RNGMethod,
-        class T_RNGStatePtrOrHandle = typename T_RNGMethod::StateType*
-    >
-    struct Random: private T_Distribution, private T_RNGStatePtrOrHandle
+    namespace random
     {
-        typedef T_RNGMethod RNGMethod;
-        /* RNGHandle assumed */
-        typedef T_RNGStatePtrOrHandle RNGHandle;
-        typedef T_Distribution Distribution;
-        typedef typename boost::result_of<Distribution(typename RNGHandle::RNGState&)>::type result_type;
-
-        /** This can be constructed with either the RNGBox (like the RNGHandle) or from an RNGHandle instance */
-        template<class T_RNGBoxOrHandle>
-        explicit HINLINE Random(const T_RNGBoxOrHandle& rngBox): RNGHandle(rngBox)
-        {}
-
         /**
-         * Initializes this instance
+         * Random Number Generator. Functor that returns a random number per call
          *
-         * \param cellIdx index into the underlying RNG Provider
+         * Default implementation assumes a RNGHandle
          */
-        template<typename T_Offset>
-        HDINLINE void
-        init(const T_Offset& cellIdx)
-        {
-            RNGHandle::init(cellIdx);
-        }
-
-        /** Returns a new random number advancing the state */
-        template< typename T_Acc >
-        DINLINE result_type
-        operator()( T_Acc const & acc )
+        template<
+            class T_Distribution,
+            class T_RNGMethod,
+            class T_RNGStatePtrOrHandle = typename T_RNGMethod::StateType*>
+        struct Random
+            : private T_Distribution
+            , private T_RNGStatePtrOrHandle
         {
-            return Distribution::operator()(
-                acc,
-                RNGHandle::getState()
-            );
-        }
-    };
+            typedef T_RNGMethod RNGMethod;
+            /* RNGHandle assumed */
+            typedef T_RNGStatePtrOrHandle RNGHandle;
+            typedef T_Distribution Distribution;
+            typedef typename boost::result_of<Distribution(typename RNGHandle::RNGState&)>::type result_type;
 
-    /**
-     * Specialization when the state is a pointer
-     */
-    template<
-        class T_Distribution,
-        class T_RNGMethod,
-        class T_RNGState
-    >
-    struct Random<T_Distribution, T_RNGMethod, T_RNGState*>: private T_Distribution
-    {
-        typedef T_RNGMethod RNGMethod;
-        typedef T_RNGState RNGState;
-        typedef T_Distribution Distribution;
-        typedef typename boost::result_of<Distribution(RNGState&)>::type result_type;
+            /** This can be constructed with either the RNGBox (like the RNGHandle) or from an RNGHandle instance */
+            template<class T_RNGBoxOrHandle>
+            explicit HINLINE Random(const T_RNGBoxOrHandle& rngBox) : RNGHandle(rngBox)
+            {
+            }
 
-        HDINLINE Random(): m_rngState(nullptr)
-        {}
+            /**
+             * Initializes this instance
+             *
+             * \param cellIdx index into the underlying RNG Provider
+             */
+            template<typename T_Offset>
+            HDINLINE void init(const T_Offset& cellIdx)
+            {
+                RNGHandle::init(cellIdx);
+            }
 
-        HDINLINE Random(RNGState* m_rngState): m_rngState(m_rngState)
-        {}
+            /** Returns a new random number advancing the state */
+            template<typename T_Acc>
+            DINLINE result_type operator()(T_Acc const& acc)
+            {
+                return Distribution::operator()(acc, RNGHandle::getState());
+            }
+        };
 
-        /** Returns a new random number advancing the state */
-        template< typename T_Acc >
-        DINLINE result_type
-        operator()( T_Acc const & acc )
+        /**
+         * Specialization when the state is a pointer
+         */
+        template<class T_Distribution, class T_RNGMethod, class T_RNGState>
+        struct Random<T_Distribution, T_RNGMethod, T_RNGState*> : private T_Distribution
         {
-            return Distribution::operator()(
-                acc,
-                *m_rngState
-            );
-        }
+            typedef T_RNGMethod RNGMethod;
+            typedef T_RNGState RNGState;
+            typedef T_Distribution Distribution;
+            typedef typename boost::result_of<Distribution(RNGState&)>::type result_type;
+
+            HDINLINE Random() : m_rngState(nullptr)
+            {
+            }
+
+            HDINLINE Random(RNGState* m_rngState) : m_rngState(m_rngState)
+            {
+            }
+
+            /** Returns a new random number advancing the state */
+            template<typename T_Acc>
+            DINLINE result_type operator()(T_Acc const& acc)
+            {
+                return Distribution::operator()(acc, *m_rngState);
+            }
 
-    protected:
-        PMACC_ALIGN(m_rngState, RNGState*);
-    };
+        protected:
+            PMACC_ALIGN(m_rngState, RNGState*);
+        };
 
-}  // namespace random
-}  // namespace pmacc
+    } // namespace random
+} // namespace pmacc
diff --git a/include/pmacc/random/distributions/Normal.hpp b/include/pmacc/random/distributions/Normal.hpp
index f5e2b74fbc..6f34bc707e 100644
--- a/include/pmacc/random/distributions/Normal.hpp
+++ b/include/pmacc/random/distributions/Normal.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2015-2020 Alexander Grund
+/* Copyright 2015-2021 Alexander Grund
  *
  * This file is part of PMacc.
  *
@@ -26,36 +26,33 @@
 
 namespace pmacc
 {
-namespace random
-{
-namespace distributions
-{
-    namespace detail
+    namespace random
     {
-        /** Only this must be specialized for different types */
-        template<typename T_Type, class T_RNGMethod, class T_SFINAE = void>
-        class Normal;
-    }
-
-    /**
-     * Returns a random, normal distributed value of the given type
-     */
-    template<typename T_Type, class T_RNGMethod = methods::RngPlaceholder>
-    struct Normal: public detail::Normal<T_Type, T_RNGMethod>
-    {
-        template< typename T_Method >
-        struct applyMethod
+        namespace distributions
         {
-            using type = Normal<
-                T_Type,
-                T_Method
-            >;
-        };
-    };
+            namespace detail
+            {
+                /** Only this must be specialized for different types */
+                template<typename T_Type, class T_RNGMethod, class T_SFINAE = void>
+                class Normal;
+            } // namespace detail
+
+            /**
+             * Returns a random, normal distributed value of the given type
+             */
+            template<typename T_Type, class T_RNGMethod = methods::RngPlaceholder>
+            struct Normal : public detail::Normal<T_Type, T_RNGMethod>
+            {
+                template<typename T_Method>
+                struct applyMethod
+                {
+                    using type = Normal<T_Type, T_Method>;
+                };
+            };
 
-}  // namespace distributions
-}  // namespace random
-}  // namespace pmacc
+        } // namespace distributions
+    } // namespace random
+} // namespace pmacc
 
 #include "pmacc/random/distributions/normal/Normal_generic.hpp"
 #include "pmacc/random/distributions/normal/Normal_float.hpp"
diff --git a/include/pmacc/random/distributions/Uniform.hpp b/include/pmacc/random/distributions/Uniform.hpp
index 1f7ee392f1..cc1c095b4b 100644
--- a/include/pmacc/random/distributions/Uniform.hpp
+++ b/include/pmacc/random/distributions/Uniform.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2015-2020 Alexander Grund
+/* Copyright 2015-2021 Alexander Grund
  *
  * This file is part of PMacc.
  *
@@ -30,45 +30,42 @@
 
 namespace pmacc
 {
-namespace random
-{
-namespace distributions
-{
-namespace detail {
-
-    /** Only this must be specialized for different types */
-    template<typename T_Type, class T_RNGMethod, class T_SFINAE = void>
-    class Uniform;
-
-}  // namespace detail
-
-    /**
-     * Returns a random, uniformly distributed value of the given type
-     *
-     * @tparam T_Type the result type or a range description @see uniform/Range.hpp
-     * \code
-     * Uniform<uniform::ExcludeOne<float>::Reduced> UniformReducedDistribution; //default
-     * Uniform<float> UniformDefaultDistribution; //equal to line one
-     * Uniform<uniform::ExcludeZero<float> > UniformNoZeroDistribution;
-     * \endcode
-     * @tparam T_RNGMethod method to create a random number
-     */
-    template<typename T_Type, class T_RNGMethod = methods::RngPlaceholder>
-    struct Uniform: public detail::Uniform<T_Type, T_RNGMethod>
+    namespace random
     {
-        template< typename T_Method >
-        struct applyMethod
+        namespace distributions
         {
-            using type = Uniform<
-                T_Type,
-                T_Method
-            >;
-        };
-    };
+            namespace detail
+            {
+                /** Only this must be specialized for different types */
+                template<typename T_Type, class T_RNGMethod, class T_SFINAE = void>
+                class Uniform;
+
+            } // namespace detail
+
+            /**
+             * Returns a random, uniformly distributed value of the given type
+             *
+             * @tparam T_Type the result type or a range description @see uniform/Range.hpp
+             * \code
+             * Uniform<uniform::ExcludeOne<float>::Reduced> UniformReducedDistribution; //default
+             * Uniform<float> UniformDefaultDistribution; //equal to line one
+             * Uniform<uniform::ExcludeZero<float> > UniformNoZeroDistribution;
+             * \endcode
+             * @tparam T_RNGMethod method to create a random number
+             */
+            template<typename T_Type, class T_RNGMethod = methods::RngPlaceholder>
+            struct Uniform : public detail::Uniform<T_Type, T_RNGMethod>
+            {
+                template<typename T_Method>
+                struct applyMethod
+                {
+                    using type = Uniform<T_Type, T_Method>;
+                };
+            };
 
-}  // namespace distributions
-}  // namespace random
-}  // namespace pmacc
+        } // namespace distributions
+    } // namespace random
+} // namespace pmacc
 
 #include "pmacc/random/distributions/uniform/Uniform_float.hpp"
 #include "pmacc/random/distributions/uniform/Uniform_double.hpp"
diff --git a/include/pmacc/random/distributions/distributions.hpp b/include/pmacc/random/distributions/distributions.hpp
index b64d5d34b2..d08fbd3604 100644
--- a/include/pmacc/random/distributions/distributions.hpp
+++ b/include/pmacc/random/distributions/distributions.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2018-2020 Rene Widera
+/* Copyright 2018-2021 Rene Widera
  *
  * This file is part of PMacc.
  *
diff --git a/include/pmacc/random/distributions/misc/MullerBox.hpp b/include/pmacc/random/distributions/misc/MullerBox.hpp
index c3f79da7ee..bbf0b5dec8 100644
--- a/include/pmacc/random/distributions/misc/MullerBox.hpp
+++ b/include/pmacc/random/distributions/misc/MullerBox.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2017-2020 Rene Widera
+/* Copyright 2017-2021 Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -28,115 +28,84 @@
 
 namespace pmacc
 {
-namespace random
-{
-namespace distributions
-{
-
-    /** create a normal distributed random number
-     *
-     * Create a random number with mean 0 and standard deviation 1.
-     * The implementation based on the Wikipedia article:
-     *    - source: https://en.wikipedia.org/wiki/Box%E2%80%93Muller_transform
-     *    - date: 01/12/2017
-     */
-    template<
-        typename T_Type,
-        typename T_RNGMethod
-    >
-    class MullerBox :
-        Uniform<
-            uniform::ExcludeZero< T_Type >,
-            T_RNGMethod
-        >
+    namespace random
     {
-        /** The muller box is creating two random number, each second time
-         * this number is valid and can be used.
-         */
-        T_Type secondRngNumber;
-        //! true if secondRngNumber is valid else false
-        bool hasSecondRngNumber = false;
+        namespace distributions
+        {
+            /** create a normally distributed random number
+             *
+             * Create a random number with mean 0 and standard deviation 1.
+             * The implementation is based on the Wikipedia article:
+             *    - source: https://en.wikipedia.org/wiki/Box%E2%80%93Muller_transform
+             *    - date: 01/12/2017
+             */
+            template<typename T_Type, typename T_RNGMethod>
+            class MullerBox : Uniform<uniform::ExcludeZero<T_Type>, T_RNGMethod>
+            {
+                /** The Box-Muller transform creates two random numbers per evaluation;
+                 * the second one is stored here and is valid only on every second call.
+                 */
+                T_Type secondRngNumber;
+                //! true if secondRngNumber is valid else false
+                bool hasSecondRngNumber = false;
 
-        using RNGMethod = T_RNGMethod;
-        using UniformRng = Uniform<
-            uniform::ExcludeZero< T_Type >,
-            RNGMethod
-        >;
-        using StateType = typename RNGMethod::StateType;
+                using RNGMethod = T_RNGMethod;
+                using UniformRng = Uniform<uniform::ExcludeZero<T_Type>, RNGMethod>;
+                using StateType = typename RNGMethod::StateType;
 
-        /** generate a normal distributed random number
-         *
-         * @param acc alpaka accelerator
-         * @param state the state of an pmacc random number generator
-         */
-        template< typename T_Acc >
-        DINLINE T_Type getNormal(
-            T_Acc const & acc,
-            StateType& state
-        )
-        {
-            constexpr T_Type valueTwoPI = 6.2831853071795860;
+                /** generate a normally distributed random number
+                 *
+                 * @param acc alpaka accelerator
+                 * @param state the state of a pmacc random number generator
+                 */
+                template<typename T_Acc>
+                DINLINE T_Type getNormal(T_Acc const& acc, StateType& state)
+                {
+                    constexpr T_Type valueTwoPI = 6.2831853071795860;
 
-            T_Type u1 = UniformRng::operator()(
-                acc,
-                state
-            );
-            T_Type u2 = UniformRng::operator()(
-                acc,
-                state
-            ) * valueTwoPI;
+                    T_Type u1 = UniformRng::operator()(acc, state);
+                    T_Type u2 = UniformRng::operator()(acc, state) * valueTwoPI;
 
-            T_Type s = algorithms::math::sqrt( T_Type( -2.0 ) * algorithms::math::log( u1 ) );
+                    T_Type s = cupla::math::sqrt(T_Type(-2.0) * cupla::math::log(u1));
 
-            T_Type firstRngNumber;
-            algorithms::math::sincos(
-                u2,
-                firstRngNumber,
-                secondRngNumber
-            );
+                    T_Type firstRngNumber;
+                    pmacc::math::sincos(u2, firstRngNumber, secondRngNumber);
 
-            firstRngNumber *= s;
-            secondRngNumber *= s;
-            hasSecondRngNumber = true;
-            return firstRngNumber;
-        }
+                    firstRngNumber *= s;
+                    secondRngNumber *= s;
+                    hasSecondRngNumber = true;
+                    return firstRngNumber;
+                }
 
-    public:
-        //! result type of the random number
-        using result_type = T_Type;
+            public:
+                //! result type of the random number
+                using result_type = T_Type;
 
-        /** generate a normal distributed random number
-         *
-         * Generates two random numbers with the first call, each second call
-         * the precomputed random number is returned.
-         *
-         * @param acc alpaka accelerator
-         * @param state the state of an pmacc random number generator
-         */
-        template< typename T_Acc >
-        DINLINE result_type
-        operator()(
-            T_Acc const & acc,
-            StateType& state
-        )
-        {
-            T_Type result;
-            if( hasSecondRngNumber )
-            {
-                result = secondRngNumber;
-                hasSecondRngNumber = false;
-            }
-            else
-            {
-                result = getNormal(
-                    acc,
-                    state
-                );
-            }
-            return result;
-        }
-    };
+                /** generate a normally distributed random number
+                 *
+                 * Generates two random numbers on the first call; every second call
+                 * returns the precomputed random number.
+                 *
+                 * @param acc alpaka accelerator
+                 * @param state the state of a pmacc random number generator
+                 */
+                template<typename T_Acc>
+                DINLINE result_type operator()(T_Acc const& acc, StateType& state)
+                {
+                    T_Type result;
+                    if(hasSecondRngNumber)
+                    {
+                        result = secondRngNumber;
+                        hasSecondRngNumber = false;
+                    }
+                    else
+                    {
+                        result = getNormal(acc, state);
+                    }
+                    return result;
+                }
+            };
 
-}  // namespace distributions
-}  // namespace random
-}  // namespace pmacc
+        } // namespace distributions
+    } // namespace random
+} // namespace pmacc
diff --git a/include/pmacc/random/distributions/normal/Normal_double.hpp b/include/pmacc/random/distributions/normal/Normal_double.hpp
index 40337e07fe..2cd7570167 100644
--- a/include/pmacc/random/distributions/normal/Normal_double.hpp
+++ b/include/pmacc/random/distributions/normal/Normal_double.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2015-2020 Alexander Grund, Rene Widera
+/* Copyright 2015-2021 Alexander Grund, Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -34,51 +34,30 @@
 
 namespace pmacc
 {
-namespace random
-{
-namespace distributions
-{
-namespace detail
-{
+    namespace random
+    {
+        namespace distributions
+        {
+            namespace detail
+            {
 /* XorMin and MRG32k3aMin uses the alpaka RNG as fallback for CPU accelerators
  * therefore we are not allowed to add a specialization for those RNG methods
  */
-#if( PMACC_CUDA_ENABLED == 1 )
-    //! specialization for XorMin
-    template<
-        typename T_Acc
-    >
-    struct Normal<
-        double,
-        methods::XorMin< T_Acc >,
-        void
-    > :
-        public MullerBox<
-            double,
-            methods::XorMin< T_Acc >
-        >
-    {
-
-    };
-
-    //! specialization for MRG32k3aMin
-    template<
-        typename T_Acc
-    >
-    struct Normal<
-        double,
-        methods::MRG32k3aMin< T_Acc >,
-        void
-    > :
-        public MullerBox<
-            double,
-            methods::MRG32k3aMin< T_Acc >
-        >
-    {
-
-    };
+#if(PMACC_CUDA_ENABLED == 1 || ALPAKA_ACC_GPU_HIP_ENABLED == 1)
+                //! specialization for XorMin: double precision normal numbers are generated by MullerBox
+                template<typename T_Acc>
+                struct Normal<double, methods::XorMin<T_Acc>, void> : public MullerBox<double, methods::XorMin<T_Acc>>
+                {
+                };
+
+                //! specialization for MRG32k3aMin: double precision normal numbers are generated by MullerBox
+                template<typename T_Acc>
+                struct Normal<double, methods::MRG32k3aMin<T_Acc>, void>
+                    : public MullerBox<double, methods::MRG32k3aMin<T_Acc>>
+                {
+                };
 #endif
-}  // namespace detail
-}  // namespace distributions
-}  // namespace random
-}  // namespace pmacc
+            } // namespace detail
+        } // namespace distributions
+    } // namespace random
+} // namespace pmacc
diff --git a/include/pmacc/random/distributions/normal/Normal_float.hpp b/include/pmacc/random/distributions/normal/Normal_float.hpp
index 7b434806e0..bf4a89b34d 100644
--- a/include/pmacc/random/distributions/normal/Normal_float.hpp
+++ b/include/pmacc/random/distributions/normal/Normal_float.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2015-2020 Alexander Grund, Rene Widera
+/* Copyright 2015-2021 Alexander Grund, Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -34,51 +34,30 @@
 
 namespace pmacc
 {
-namespace random
-{
-namespace distributions
-{
-namespace detail
-{
+    namespace random
+    {
+        namespace distributions
+        {
+            namespace detail
+            {
 /* XorMin and MRG32k3aMin uses the alpaka RNG as fallback for CPU accelerators
  * therefore we are not allowed to add a specialization for those RNG methods
  */
-#if( PMACC_CUDA_ENABLED == 1 )
-    //! specialization for XorMin
-    template<
-        typename T_Acc
-    >
-    struct Normal<
-        float,
-        methods::XorMin< T_Acc >,
-        void
-    > :
-        public MullerBox<
-            float,
-            methods::XorMin< T_Acc >
-        >
-    {
-
-    };
-
-    //! specialization for MRG32k3aMin
-    template<
-        typename T_Acc
-    >
-    struct Normal<
-        float,
-        methods::MRG32k3aMin< T_Acc >,
-        void
-    > :
-        public MullerBox<
-            float,
-            methods::MRG32k3aMin< T_Acc >
-        >
-    {
-
-    };
+#if(PMACC_CUDA_ENABLED == 1 || ALPAKA_ACC_GPU_HIP_ENABLED == 1)
+                //! specialization for XorMin: single precision normal numbers are generated by MullerBox
+                template<typename T_Acc>
+                struct Normal<float, methods::XorMin<T_Acc>, void> : public MullerBox<float, methods::XorMin<T_Acc>>
+                {
+                };
+
+                //! specialization for MRG32k3aMin: single precision normal numbers are generated by MullerBox
+                template<typename T_Acc>
+                struct Normal<float, methods::MRG32k3aMin<T_Acc>, void>
+                    : public MullerBox<float, methods::MRG32k3aMin<T_Acc>>
+                {
+                };
 #endif
-}  // namespace detail
-}  // namespace distributions
-}  // namespace random
-}  // namespace pmacc
+            } // namespace detail
+        } // namespace distributions
+    } // namespace random
+} // namespace pmacc
diff --git a/include/pmacc/random/distributions/normal/Normal_generic.hpp b/include/pmacc/random/distributions/normal/Normal_generic.hpp
index 5be4bba79e..dffbc6805d 100644
--- a/include/pmacc/random/distributions/normal/Normal_generic.hpp
+++ b/include/pmacc/random/distributions/normal/Normal_generic.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2015-2020 Alexander Grund, Rene Widera
+/* Copyright 2015-2021 Alexander Grund, Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -29,43 +29,30 @@
 
 namespace pmacc
 {
-namespace random
-{
-namespace distributions
-{
-namespace detail
-{
-
-    //!Returns a normally distributed floating point with value with mean 0.0 and standard deviation 1.0
-    template<
-        typename T_Type,
-        typename T_RNGMethod
-    >
-    class Normal<
-        T_Type,
-        T_RNGMethod,
-        void
-    >
+    namespace random
     {
-        using RNGMethod = T_RNGMethod;
-        using StateType = typename RNGMethod::StateType;
-    public:
-        using result_type = T_Type;
-
-        template< typename T_Acc >
-        DINLINE result_type
-        operator()(
-            T_Acc const & acc,
-            StateType& state
-        )
+        namespace distributions
         {
-            return ::alpaka::rand::distribution::createNormalReal< T_Type >(
-                acc
-            )( state );
-        }
-    };
-
-}  // namespace detail
-}  // namespace distributions
-}  // namespace random
-}  // namespace pmacc
+            namespace detail
+            {
+                //! Returns a normally distributed floating point value with mean 0.0 and standard deviation 1.0
+                template<typename T_Type, typename T_RNGMethod>
+                class Normal<T_Type, T_RNGMethod, void>
+                {
+                    using RNGMethod = T_RNGMethod;
+                    using StateType = typename RNGMethod::StateType;
+
+                public:
+                    using result_type = T_Type;
+
+                    template<typename T_Acc>
+                    DINLINE result_type operator()(T_Acc const& acc, StateType& state)
+                    {
+                        return ::alpaka::rand::distribution::createNormalReal<T_Type>(acc)(state);
+                    }
+                };
+
+            } // namespace detail
+        } // namespace distributions
+    } // namespace random
+} // namespace pmacc
diff --git a/include/pmacc/random/distributions/uniform/Range.hpp b/include/pmacc/random/distributions/uniform/Range.hpp
index 52c34f0ce4..140c2633d5 100644
--- a/include/pmacc/random/distributions/uniform/Range.hpp
+++ b/include/pmacc/random/distributions/uniform/Range.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2016-2020 Rene Widera
+/* Copyright 2016-2021 Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -26,58 +26,60 @@
 
 namespace pmacc
 {
-namespace random
-{
-namespace distributions
-{
-namespace uniform
-{
-    /** floating point number in the range (0,1]
-     *
-     * @tparam T_Type type of the result
-     * @return value in the range (0,1]
-     */
-    template<typename T_Type>
-    struct ExcludeZero
-    {};
-
-    /**  floating point number in the range [0,1)
-     *
-     * @tparam T_Type type of the result
-     */
-    template<typename T_Type>
-    struct ExcludeOne
+    namespace random
     {
+        namespace distributions
+        {
+            namespace uniform
+            {
+                /** floating point number in the range (0,1]
+                 *
+                 * @tparam T_Type type of the result
+                 * @return value in the range (0,1]
+                 */
+                template<typename T_Type>
+                struct ExcludeZero
+                {
+                };
 
-        /** Reduce the random range
-         *
-         * number of unique random numbers for
-         *   - `float` is `2^24`
-         *   - `double` is `2^53`
-         *
-         * Creates intervals with the width of epsilon/2.
-         */
-        struct Reduced
-        {};
-
-        /** Loops until a random value inside the defined range is created
-         *
-         * The runtime of this method is not deterministic.
-         * @warning zero is excluded which results in a range (0,1)
-         */
-        struct Repeat
-        {};
+                /**  floating point number in the range [0,1)
+                 *
+                 * @tparam T_Type type of the result
+                 */
+                template<typename T_Type>
+                struct ExcludeOne
+                {
+                    /** Reduce the random range
+                     *
+                     * number of unique random numbers for
+                     *   - `float` is `2^24`
+                     *   - `double` is `2^53`
+                     *
+                     * Creates intervals with the width of epsilon/2.
+                     */
+                    struct Reduced
+                    {
+                    };
 
-        /** Swap the value one to zero
-         *
-         * This method creates a small error in uniform distribution
-         */
-        struct SwapOneToZero
-        {};
+                    /** Loops until a random value inside the defined range is created
+                     *
+                     * The runtime of this method is not deterministic.
+                     * @warning zero is excluded which results in a range (0,1)
+                     */
+                    struct Repeat
+                    {
+                    };
 
-    };
+                    /** Swap the value one to zero
+                     *
+                     * This method creates a small error in uniform distribution
+                     */
+                    struct SwapOneToZero
+                    {
+                    };
+                };
 
-}  // namespace uniform
-}  // namespace distributions
-}  // namespace random
-}  // namespace pmacc
+            } // namespace uniform
+        } // namespace distributions
+    } // namespace random
+} // namespace pmacc
diff --git a/include/pmacc/random/distributions/uniform/Uniform_Integral32Bit.hpp b/include/pmacc/random/distributions/uniform/Uniform_Integral32Bit.hpp
index 52c3b5be0d..688274c8e3 100644
--- a/include/pmacc/random/distributions/uniform/Uniform_Integral32Bit.hpp
+++ b/include/pmacc/random/distributions/uniform/Uniform_Integral32Bit.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2015-2020 Alexander Grund
+/* Copyright 2015-2021 Alexander Grund
  *
  * This file is part of PMacc.
  *
@@ -27,44 +27,35 @@
 
 namespace pmacc
 {
-namespace random
-{
-namespace distributions
-{
-namespace detail
-{
-
-    /**
-     * Returns a random, uniformly distributed (up to) 32 bit integral value
-     */
-    template<typename T_Type, class T_RNGMethod>
-    class Uniform<
-        T_Type,
-        T_RNGMethod,
-        typename bmpl::if_c<
-            boost::is_integral<T_Type>::value && sizeof(T_Type) <= 4,
-            void,
-            T_Type
-        >::type
-    >
+    namespace random
     {
-        typedef T_RNGMethod RNGMethod;
-        typedef typename RNGMethod::StateType StateType;
-    public:
-        typedef T_Type result_type;
-
-        template< typename T_Acc >
-        DINLINE result_type
-        operator()(
-            T_Acc const & acc,
-            StateType& state
-        )
+        namespace distributions
         {
-            return static_cast<result_type>(RNGMethod().get32Bits(acc, state));
-        }
-    };
+            namespace detail
+            {
+                /**
+                 * Returns a random, uniformly distributed (up to) 32 bit integral value
+                 */
+                template<typename T_Type, class T_RNGMethod>
+                class Uniform<
+                    T_Type,
+                    T_RNGMethod,
+                    typename bmpl::if_c<boost::is_integral<T_Type>::value && sizeof(T_Type) <= 4, void, T_Type>::type>
+                {
+                    typedef T_RNGMethod RNGMethod;
+                    typedef typename RNGMethod::StateType StateType;
+
+                public:
+                    typedef T_Type result_type;
+
+                    template<typename T_Acc>
+                    DINLINE result_type operator()(T_Acc const& acc, StateType& state)
+                    {
+                        return static_cast<result_type>(RNGMethod().get32Bits(acc, state));
+                    }
+                };
 
-}  // namespace detail
-}  // namespace distributions
-}  // namespace random
-}  // namespace pmacc
+            } // namespace detail
+        } // namespace distributions
+    } // namespace random
+} // namespace pmacc
diff --git a/include/pmacc/random/distributions/uniform/Uniform_Integral64Bit.hpp b/include/pmacc/random/distributions/uniform/Uniform_Integral64Bit.hpp
index bfa1a6cbe4..51d0671af5 100644
--- a/include/pmacc/random/distributions/uniform/Uniform_Integral64Bit.hpp
+++ b/include/pmacc/random/distributions/uniform/Uniform_Integral64Bit.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2015-2020 Alexander Grund, Rene Widera
+/* Copyright 2015-2021 Alexander Grund, Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -29,50 +29,35 @@
 
 namespace pmacc
 {
-namespace random
-{
-namespace distributions
-{
-namespace detail
-{
-
-    /**
-     * Returns a random, uniformly distributed (up to) 64 bit integral value
-     */
-    template<
-        typename T_Type,
-        class T_RNGMethod
-    >
-    class Uniform<
-        T_Type,
-        T_RNGMethod,
-        typename bmpl::if_c<
-            boost::is_integral< T_Type >::value && sizeof( T_Type ) == 8,
-            void,
-            T_Type
-        >::type
-    >
+    namespace random
     {
-        typedef T_RNGMethod RNGMethod;
-        typedef typename RNGMethod::StateType StateType;
-    public:
-        typedef T_Type result_type;
-
-        template< typename T_Acc >
-        DINLINE result_type
-        operator()(
-            T_Acc const & acc,
-            StateType& state
-        )
+        namespace distributions
         {
-            return static_cast< result_type >( RNGMethod().get64Bits(
-                acc,
-                state
-            ) );
-        }
-    };
-
-}  // namespace detail
-}  // namespace distributions
-}  // namespace random
-}  // namespace pmacc
+            namespace detail
+            {
+                /**
+                 * Returns a random, uniformly distributed (up to) 64 bit integral value
+                 */
+                template<typename T_Type, class T_RNGMethod>
+                class Uniform<
+                    T_Type,
+                    T_RNGMethod,
+                    typename bmpl::if_c<boost::is_integral<T_Type>::value && sizeof(T_Type) == 8, void, T_Type>::type>
+                {
+                    typedef T_RNGMethod RNGMethod;
+                    typedef typename RNGMethod::StateType StateType;
+
+                public:
+                    typedef T_Type result_type;
+
+                    template<typename T_Acc>
+                    DINLINE result_type operator()(T_Acc const& acc, StateType& state)
+                    {
+                        return static_cast<result_type>(RNGMethod().get64Bits(acc, state));
+                    }
+                };
+
+            } // namespace detail
+        } // namespace distributions
+    } // namespace random
+} // namespace pmacc
diff --git a/include/pmacc/random/distributions/uniform/Uniform_double.hpp b/include/pmacc/random/distributions/uniform/Uniform_double.hpp
index 6c3b7a5b60..441b1ac957 100644
--- a/include/pmacc/random/distributions/uniform/Uniform_double.hpp
+++ b/include/pmacc/random/distributions/uniform/Uniform_double.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2015-2020 Alexander Grund, Rene Widera
+/* Copyright 2015-2021 Alexander Grund, Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -27,151 +27,109 @@
 
 namespace pmacc
 {
-namespace random
-{
-namespace distributions
-{
-namespace detail
-{
-
-    /** Returns a random double value uniformly distributed in (0,1]
-     *
-     * The smallest created value is `2^-65` (~ `2.710505431213761*10^-20`)
-     */
-    template<class T_RNGMethod>
-    class Uniform<
-        uniform::ExcludeZero<double>,
-        T_RNGMethod,
-        void
-    >
+    namespace random
     {
-    public:
-        typedef T_RNGMethod RNGMethod;
-        typedef typename RNGMethod::StateType StateType;
-        typedef double result_type;
-
-        template< typename T_Acc >
-        DINLINE double
-        operator()(
-            T_Acc const & acc,
-            StateType& state
-        ) const
+        namespace distributions
         {
-            double const value2pow64Inv = 5.421010862427522e-20;
-            uint64_t const random = RNGMethod().get64Bits(
-                acc,
-                state
-            );
-            return static_cast< double >( random ) * value2pow64Inv +
-                ( value2pow64Inv / 2.0 );
-        }
-    };
+            namespace detail
+            {
+                /** Returns a random double value uniformly distributed in (0,1]
+                 *
+                 * The smallest created value is `2^-65` (~ `2.710505431213761*10^-20`)
+                 */
+                template<class T_RNGMethod>
+                class Uniform<uniform::ExcludeZero<double>, T_RNGMethod, void>
+                {
+                public:
+                    typedef T_RNGMethod RNGMethod;
+                    typedef typename RNGMethod::StateType StateType;
+                    typedef double result_type;
 
-    /** Returns a random double value uniformly distributed in [0,1)
-     *
-     * Swap the value one to zero (creates a small error in uniform distribution)
-     */
-    template<class T_RNGMethod>
-    class Uniform<
-        uniform::ExcludeOne< double >::SwapOneToZero,
-        T_RNGMethod,
-        void
-    >
-    {
-    public:
-        typedef T_RNGMethod RNGMethod;
-        typedef typename RNGMethod::StateType StateType;
-        typedef double result_type;
+                    template<typename T_Acc>
+                    DINLINE double operator()(T_Acc const& acc, StateType& state) const
+                    {
+                        double const value2pow64Inv = 5.421010862427522e-20;
+                        uint64_t const random = RNGMethod().get64Bits(acc, state);
+                        return static_cast<double>(random) * value2pow64Inv + (value2pow64Inv / 2.0);
+                    }
+                };
 
-        template< typename T_Acc >
-        DINLINE double
-        operator()(
-            T_Acc const & acc,
-            StateType& state
-        ) const
-        {
-            double const randomValue =
-                pmacc::random::distributions::Uniform<
-                    uniform::ExcludeZero< double >,
-                    RNGMethod
-            >()(acc, state);
-            return randomValue == 1.0 ? 0.0 : randomValue;
-        }
-    };
+                /** Returns a random double value uniformly distributed in [0,1)
+                 *
+                 * Swap the value one to zero (creates a small error in uniform distribution)
+                 */
+                template<class T_RNGMethod>
+                class Uniform<uniform::ExcludeOne<double>::SwapOneToZero, T_RNGMethod, void>
+                {
+                public:
+                    typedef T_RNGMethod RNGMethod;
+                    typedef typename RNGMethod::StateType StateType;
+                    typedef double result_type;
 
-    /** Returns a random double value uniformly distributed in [0,1)
-     *
-     * Number of unique random numbers is reduced to `2^53`.
-     * Uses a uniform distance of `2^-53` (`epsilon/2`) between each possible
-     * random number.
-     */
-    template<class T_RNGMethod>
-    class Uniform<
-        uniform::ExcludeOne< double >::Reduced,
-        T_RNGMethod,
-        void
-    >
-    {
-    public:
-        typedef T_RNGMethod RNGMethod;
-        typedef typename RNGMethod::StateType StateType;
-        typedef double result_type;
+                    template<typename T_Acc>
+                    DINLINE double operator()(T_Acc const& acc, StateType& state) const
+                    {
+                        double const randomValue
+                            = pmacc::random::distributions::Uniform<uniform::ExcludeZero<double>, RNGMethod>()(
+                                acc,
+                                state);
+                        return randomValue == 1.0 ? 0.0 : randomValue;
+                    }
+                };
 
-        template< typename T_Acc >
-        DINLINE double
-        operator()(
-            T_Acc const & acc,
-            StateType& state
-        ) const
-        {
-            double const value2pow53Inv = 1.1102230246251565e-16;
-            double const randomValue53Bit = RNGMethod().get64Bits( acc, state ) >> 11;
-            return randomValue53Bit * value2pow53Inv;
-        }
-    };
+                /** Returns a random double value uniformly distributed in [0,1)
+                 *
+                 * Number of unique random numbers is reduced to `2^53`.
+                 * Uses a uniform distance of `2^-53` (`epsilon/2`) between each possible
+                 * random number.
+                 */
+                template<class T_RNGMethod>
+                class Uniform<uniform::ExcludeOne<double>::Reduced, T_RNGMethod, void>
+                {
+                public:
+                    typedef T_RNGMethod RNGMethod;
+                    typedef typename RNGMethod::StateType StateType;
+                    typedef double result_type;
 
-    /** Returns a random double value uniformly distributed in (0,1)
-     *
-     * Loops until a random value inside the defined range is created.
-     * The runtime of this method is not deterministic.
-     */
-    template<
-        class T_RNGMethod
-    >
-    class Uniform<
-        typename uniform::ExcludeOne< double >::Repeat,
-        T_RNGMethod,
-        void
-    >
-    {
-    public:
-        typedef T_RNGMethod RNGMethod;
-        typedef typename RNGMethod::StateType StateType;
-        typedef double  result_type;
+                    template<typename T_Acc>
+                    DINLINE double operator()(T_Acc const& acc, StateType& state) const
+                    {
+                        double const value2pow53Inv = 1.1102230246251565e-16;
+                        double const randomValue53Bit = RNGMethod().get64Bits(acc, state) >> 11;
+                        return randomValue53Bit * value2pow53Inv;
+                    }
+                };
 
-        template< typename T_Acc >
-        DINLINE result_type
-        operator()(
-            T_Acc const & acc,
-            StateType& state
-        ) const
-        {
-            do
-            {
-                const double randomValue =
-                    pmacc::random::distributions::Uniform<
-                        uniform::ExcludeZero< double >,
-                        RNGMethod
-                    >()(acc, state);
+                /** Returns a random double value uniformly distributed in (0,1)
+                 *
+                 * Loops until a random value inside the defined range is created.
+                 * The runtime of this method is not deterministic.
+                 */
+                template<class T_RNGMethod>
+                class Uniform<typename uniform::ExcludeOne<double>::Repeat, T_RNGMethod, void>
+                {
+                public:
+                    typedef T_RNGMethod RNGMethod;
+                    typedef typename RNGMethod::StateType StateType;
+                    typedef double result_type;
+
+                    template<typename T_Acc>
+                    DINLINE result_type operator()(T_Acc const& acc, StateType& state) const
+                    {
+                        do
+                        {
+                            const double randomValue
+                                = pmacc::random::distributions::Uniform<uniform::ExcludeZero<double>, RNGMethod>()(
+                                    acc,
+                                    state);
 
-                if( randomValue != 1.0 )
-                    return randomValue;
-            }
-            while(true);
-        }
-    };
+                            if(randomValue != 1.0)
+                                return randomValue;
+                        } while(true);
+                    }
+                };
 
-}  // namespace detail
-}  // namespace distributions
-}  // namespace random
-}  // namespace pmacc
+            } // namespace detail
+        } // namespace distributions
+    } // namespace random
+} // namespace pmacc
diff --git a/include/pmacc/random/distributions/uniform/Uniform_float.hpp b/include/pmacc/random/distributions/uniform/Uniform_float.hpp
index 4477f6ba35..707c17cc0a 100644
--- a/include/pmacc/random/distributions/uniform/Uniform_float.hpp
+++ b/include/pmacc/random/distributions/uniform/Uniform_float.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2015-2020 Alexander Grund, Rene Widera
+/* Copyright 2015-2021 Alexander Grund, Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -27,146 +27,109 @@
 
 namespace pmacc
 {
-namespace random
-{
-namespace distributions
-{
-namespace detail
-{
-
-    /** Returns a random float value uniformly distributed in (0,1]
-     *
-     * The smallest created value is `2^-33` (~ `1.164*10^-10`)
-     */
-    template<class T_RNGMethod>
-    class Uniform<
-        uniform::ExcludeZero<float>,
-        T_RNGMethod,
-        void
-    >
+    namespace random
     {
-    public:
-        typedef T_RNGMethod RNGMethod;
-        typedef typename RNGMethod::StateType StateType;
-        typedef float result_type;
-
-        template< typename T_Acc >
-        DINLINE float
-        operator()(
-            T_Acc const & acc,
-            StateType& state
-        ) const
+        namespace distributions
         {
-            const float value2pow32Inv = 2.3283064e-10f;
-            const uint32_t random = RNGMethod().get32Bits(acc, state);
-            return static_cast<float>( random ) * value2pow32Inv +
-                ( value2pow32Inv / 2.0f );
-        }
-    };
+            namespace detail
+            {
+                /** Returns a random float value uniformly distributed in (0,1]
+                 *
+                 * The smallest created value is `2^-33` (~ `1.164*10^-10`)
+                 */
+                template<class T_RNGMethod>
+                class Uniform<uniform::ExcludeZero<float>, T_RNGMethod, void>
+                {
+                public:
+                    typedef T_RNGMethod RNGMethod;
+                    typedef typename RNGMethod::StateType StateType;
+                    typedef float result_type;
 
-    /** Returns a random float value uniformly distributed in [0,1)
-     *
-     * Swap the value one to zero (creates a small error in uniform distribution)
-     */
-    template<class T_RNGMethod>
-    class Uniform<
-        uniform::ExcludeOne<float>::SwapOneToZero,
-        T_RNGMethod,
-        void
-    >
-    {
-    public:
-        typedef T_RNGMethod RNGMethod;
-        typedef typename RNGMethod::StateType StateType;
-        typedef float result_type;
+                    template<typename T_Acc>
+                    DINLINE float operator()(T_Acc const& acc, StateType& state) const
+                    {
+                        const float value2pow32Inv = 2.3283064e-10f;
+                        const uint32_t random = RNGMethod().get32Bits(acc, state);
+                        return static_cast<float>(random) * value2pow32Inv + (value2pow32Inv / 2.0f);
+                    }
+                };
 
-        template< typename T_Acc >
-        DINLINE float
-        operator()(
-            T_Acc const & acc,
-            StateType& state
-        ) const
-        {
-            const float randomValue =
-                pmacc::random::distributions::Uniform<
-                    uniform::ExcludeZero<float>,
-                    RNGMethod
-            >()(acc, state);
-            return randomValue == 1.0f ? 0.0f : randomValue;
-        }
-    };
+                /** Returns a random float value uniformly distributed in [0,1)
+                 *
+                 * Swap the value one to zero (creates a small error in uniform distribution)
+                 */
+                template<class T_RNGMethod>
+                class Uniform<uniform::ExcludeOne<float>::SwapOneToZero, T_RNGMethod, void>
+                {
+                public:
+                    typedef T_RNGMethod RNGMethod;
+                    typedef typename RNGMethod::StateType StateType;
+                    typedef float result_type;
 
-    /** Returns a random float value uniformly distributed in [0,1)
-     *
-     * Number of unique random numbers is reduced to `2^24`.
-     * Uses a uniform distance of `2^-24` (`epsilon/2`) between each possible
-     * random number.
-     */
-    template<class T_RNGMethod>
-    class Uniform<
-        uniform::ExcludeOne<float>::Reduced,
-        T_RNGMethod,
-        void
-    >
-    {
-    public:
-        typedef T_RNGMethod RNGMethod;
-        typedef typename RNGMethod::StateType StateType;
-        typedef float result_type;
+                    template<typename T_Acc>
+                    DINLINE float operator()(T_Acc const& acc, StateType& state) const
+                    {
+                        const float randomValue
+                            = pmacc::random::distributions::Uniform<uniform::ExcludeZero<float>, RNGMethod>()(
+                                acc,
+                                state);
+                        return randomValue == 1.0f ? 0.0f : randomValue;
+                    }
+                };
 
-        template< typename T_Acc >
-        DINLINE float
-        operator()(
-            T_Acc const & acc,
-            StateType& state
-        ) const
-        {
-            const float value2pow24Inv = 5.9604645e-08f;
-            const float randomValue24Bit = RNGMethod().get32Bits(acc, state) >> 8;
-            return static_cast<float>( randomValue24Bit ) * value2pow24Inv;
-        }
-    };
+                /** Returns a random float value uniformly distributed in [0,1)
+                 *
+                 * Number of unique random numbers is reduced to `2^24`.
+                 * Uses a uniform distance of `2^-24` (`epsilon/2`) between each possible
+                 * random number.
+                 */
+                template<class T_RNGMethod>
+                class Uniform<uniform::ExcludeOne<float>::Reduced, T_RNGMethod, void>
+                {
+                public:
+                    typedef T_RNGMethod RNGMethod;
+                    typedef typename RNGMethod::StateType StateType;
+                    typedef float result_type;
 
-    /** Returns a random float value uniformly distributed in (0,1)
-     *
-     * Loops until a random value inside the defined range is created.
-     * The runtime of this method is not deterministic.
-     */
-    template<class T_RNGMethod>
-    class Uniform<
-        typename uniform::ExcludeOne<float>::Repeat,
-        T_RNGMethod,
-        void
-    >
-    {
-    public:
-        typedef T_RNGMethod RNGMethod;
-        typedef typename RNGMethod::StateType StateType;
-        typedef float result_type;
+                    template<typename T_Acc>
+                    DINLINE float operator()(T_Acc const& acc, StateType& state) const
+                    {
+                        const float value2pow24Inv = 5.9604645e-08f;
+                        const float randomValue24Bit = RNGMethod().get32Bits(acc, state) >> 8;
+                        return static_cast<float>(randomValue24Bit) * value2pow24Inv;
+                    }
+                };
 
-        template< typename T_Acc >
-        DINLINE float
-        operator()(
-            T_Acc const & acc,
-            StateType& state
-        ) const
-        {
-            do
-            {
-                const float randomValue =
-                    pmacc::random::distributions::Uniform<
-                        uniform::ExcludeZero<float>,
-                        RNGMethod
-                    >()(acc, state);
+                /** Returns a random float value uniformly distributed in (0,1)
+                 *
+                 * Loops until a random value inside the defined range is created.
+                 * The runtime of this method is not deterministic.
+                 */
+                template<class T_RNGMethod>
+                class Uniform<typename uniform::ExcludeOne<float>::Repeat, T_RNGMethod, void>
+                {
+                public:
+                    typedef T_RNGMethod RNGMethod;
+                    typedef typename RNGMethod::StateType StateType;
+                    typedef float result_type;
+
+                    template<typename T_Acc>
+                    DINLINE float operator()(T_Acc const& acc, StateType& state) const
+                    {
+                        do
+                        {
+                            const float randomValue
+                                = pmacc::random::distributions::Uniform<uniform::ExcludeZero<float>, RNGMethod>()(
+                                    acc,
+                                    state);
 
-                if( randomValue != 1.0f )
-                    return randomValue;
-            }
-            while(true);
-        }
-    };
+                            if(randomValue != 1.0f)
+                                return randomValue;
+                        } while(true);
+                    }
+                };
 
-}  // namespace detail
-}  // namespace distributions
-}  // namespace random
-}  // namespace pmacc
+            } // namespace detail
+        } // namespace distributions
+    } // namespace random
+} // namespace pmacc
diff --git a/include/pmacc/random/distributions/uniform/Uniform_generic.hpp b/include/pmacc/random/distributions/uniform/Uniform_generic.hpp
index 64b551444f..5b9e8269c1 100644
--- a/include/pmacc/random/distributions/uniform/Uniform_generic.hpp
+++ b/include/pmacc/random/distributions/uniform/Uniform_generic.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2015-2020 Alexander Grund, Rene Widera
+/* Copyright 2015-2021 Alexander Grund, Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -29,57 +29,40 @@
 
 namespace pmacc
 {
-namespace random
-{
-namespace distributions
-{
-namespace detail
-{
-
-    /** Returns a random floating point value uniformly distributed in [0,1)
-     *
-     * Equivalent to uniform::ExcludeOne< T_Type >::Reduced
-     */
-    template<
-        typename T_Type,
-        class T_RNGMethod
-    >
-    class Uniform<
-        T_Type,
-        T_RNGMethod,
-        typename std::enable_if<
-            std::is_floating_point< T_Type >::value
-        >::type
-    > :
-        public pmacc::random::distributions::Uniform<
-            typename uniform::ExcludeOne< T_Type >::Reduced,
-            T_RNGMethod
-        >
+    namespace random
     {
-    };
+        namespace distributions
+        {
+            namespace detail
+            {
+                /** Returns a random floating point value uniformly distributed in [0,1)
+                 *
+                 * Equivalent to uniform::ExcludeOne< T_Type >::Reduced
+                 */
+                template<typename T_Type, class T_RNGMethod>
+                class Uniform<
+                    T_Type,
+                    T_RNGMethod,
+                    typename std::enable_if<std::is_floating_point<T_Type>::value>::type>
+                    : public pmacc::random::distributions::
+                          Uniform<typename uniform::ExcludeOne<T_Type>::Reduced, T_RNGMethod>
+                {
+                };
 
-    /** Returns a random floating point value uniformly distributed in [0,1)
-     *
-     * Equivalent to uniform::ExcludeOne< T_Type >::Reduced
-     */
-    template<
-        typename T_Type,
-        class T_RNGMethod
-    >
-    class Uniform<
-        uniform::ExcludeOne< T_Type>,
-        T_RNGMethod,
-        typename std::enable_if<
-            std::is_floating_point< T_Type >::value
-        >::type
-    > :
-        public pmacc::random::distributions::Uniform<
-            typename uniform::ExcludeOne< T_Type >::Reduced,
-            T_RNGMethod
-        >
-    {
-    };
-}  // namespace detail
-}  // namespace distributions
-}  // namespace random
-}  // namespace pmacc
+                /** Returns a random floating point value uniformly distributed in [0,1)
+                 *
+                 * Equivalent to uniform::ExcludeOne< T_Type >::Reduced
+                 */
+                template<typename T_Type, class T_RNGMethod>
+                class Uniform<
+                    uniform::ExcludeOne<T_Type>,
+                    T_RNGMethod,
+                    typename std::enable_if<std::is_floating_point<T_Type>::value>::type>
+                    : public pmacc::random::distributions::
+                          Uniform<typename uniform::ExcludeOne<T_Type>::Reduced, T_RNGMethod>
+                {
+                };
+            } // namespace detail
+        } // namespace distributions
+    } // namespace random
+} // namespace pmacc
diff --git a/include/pmacc/random/methods/AlpakaRand.hpp b/include/pmacc/random/methods/AlpakaRand.hpp
index efe66c50d6..4a273952c4 100644
--- a/include/pmacc/random/methods/AlpakaRand.hpp
+++ b/include/pmacc/random/methods/AlpakaRand.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2015-2020 Alexander Grund, Rene Widera
+/* Copyright 2015-2021 Alexander Grund, Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -25,72 +25,46 @@
 
 namespace pmacc
 {
-namespace random
-{
-namespace methods
-{
-
-    template< typename T_Acc = cupla::Acc>
-    class AlpakaRand
+    namespace random
     {
-    public:
-        using StateType =
-            decltype(
-                ::alpaka::rand::generator::createDefault(
-                    alpaka::core::declval<T_Acc const &>(),
-                    alpaka::core::declval<uint32_t &>(),
-                    alpaka::core::declval<uint32_t &>()
-                )
-            );
-
-        DINLINE void
-        init(
-            T_Acc const & acc,
-            StateType& state,
-            uint32_t seed,
-            uint32_t subsequence = 0
-        ) const
+        namespace methods
         {
-            state = ::alpaka::rand::generator::createDefault(
-                acc,
-                seed,
-                subsequence
-            );
-        }
+            template<typename T_Acc = cupla::Acc>
+            class AlpakaRand
+            {
+            public:
+                using StateType = decltype(::alpaka::rand::generator::createDefault(
+                    alpaka::core::declval<T_Acc const&>(),
+                    alpaka::core::declval<uint32_t&>(),
+                    alpaka::core::declval<uint32_t&>()));
 
-        DINLINE uint32_t
-        get32Bits(
-            T_Acc const & acc,
-            StateType& state
-        ) const
-        {
-            return ::alpaka::rand::distribution::createUniformUint< uint32_t >(
-                acc
-            )( state );
-        }
+                DINLINE void init(T_Acc const& acc, StateType& state, uint32_t seed, uint32_t subsequence = 0) const
+                {
+                    state = ::alpaka::rand::generator::createDefault(acc, seed, subsequence);
+                }
 
-        DINLINE uint64_t
-        get64Bits(
-            T_Acc const & acc,
-            StateType& state
-        ) const
-        {
-            /* Two 32bit values are packed into a 64bit value because alpaka is not
-             * supporting 64bit integer random numbers
-             */
-            uint64_t result = get32Bits( acc, state);
-            result <<= 32;
-            result ^= get32Bits( acc, state);
-            return result;
-        }
+                DINLINE uint32_t get32Bits(T_Acc const& acc, StateType& state) const
+                {
+                    return ::alpaka::rand::distribution::createUniformUint<uint32_t>(acc)(state);
+                }
 
-        static std::string
-        getName()
-        {
-            return "AlpakaRand";
-        }
-    };
+                DINLINE uint64_t get64Bits(T_Acc const& acc, StateType& state) const
+                {
+                    /* Two 32bit values are packed into a 64bit value because alpaka does not
+                     * support 64bit integer random numbers
+                     */
+                    uint64_t result = get32Bits(acc, state);
+                    result <<= 32;
+                    result ^= get32Bits(acc, state);
+                    return result;
+                }
+
+                static std::string getName()
+                {
+                    return "AlpakaRand";
+                }
+            };
 
-}  // namespace methods
-}  // namespace random
-}  // namespace pmacc
+        } // namespace methods
+    } // namespace random
+} // namespace pmacc
diff --git a/include/pmacc/random/methods/MRG32k3aMin.hpp b/include/pmacc/random/methods/MRG32k3aMin.hpp
index 62a9768b6b..01694f61d7 100644
--- a/include/pmacc/random/methods/MRG32k3aMin.hpp
+++ b/include/pmacc/random/methods/MRG32k3aMin.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2016-2020 Alexander Grund, Rene Widera
+/* Copyright 2016-2021 Alexander Grund, Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -24,112 +24,81 @@
 #include "pmacc/types.hpp"
 #include "pmacc/static_assert.hpp"
 
-#if( PMACC_CUDA_ENABLED != 1 )
-#   include "pmacc/random/methods/AlpakaRand.hpp"
+#if(PMACC_CUDA_ENABLED != 1)
+#    include "pmacc/random/methods/AlpakaRand.hpp"
 #else
-#   include <curand_kernel.h>
+#    include <curand_kernel.h>
 #endif
 
 
 namespace pmacc
 {
-namespace random
-{
-namespace methods
-{
-
-#if( PMACC_CUDA_ENABLED != 1 )
-    //! fallback to alpaka RNG if a cpu accelerator is used
-    template< typename T_Acc = cupla::Acc>
-    using MRG32k3aMin = AlpakaRand< T_Acc >;
-#else
-    //! Mersenne-Twister random number generator with a reduced state
-    template< typename T_Acc = cupla::Acc>
-    class MRG32k3aMin
+    namespace random
     {
-    public:
-        struct StateType
+        namespace methods
         {
-            double s1[3];
-            double s2[3];
-        };
+#if(PMACC_CUDA_ENABLED != 1)
+            //! fallback to alpaka RNG if a cpu accelerator is used
+            template<typename T_Acc = cupla::Acc>
+            using MRG32k3aMin = AlpakaRand<T_Acc>;
+#else
+            //! MRG32k3a (combined multiple recursive) random number generator with a reduced state
+            template<typename T_Acc = cupla::Acc>
+            class MRG32k3aMin
+            {
+            public:
+                struct StateType
+                {
+                    double s1[3];
+                    double s2[3];
+                };
 
-        DINLINE void
-        init(
-            T_Acc const & acc,
-            StateType & state,
-            uint32_t seed,
-            uint32_t subsequence = 0
-        ) const
-        {
-            curandStateMRG32k3a tmpState;
-            curand_init(
-                seed,
-                subsequence,
-                0,
-                &tmpState
-            );
-            AssignState(state, tmpState);
-        }
+                DINLINE void init(T_Acc const& acc, StateType& state, uint32_t seed, uint32_t subsequence = 0) const
+                {
+                    curandStateMRG32k3a tmpState;
+                    curand_init(seed, subsequence, 0, &tmpState);
+                    AssignState(state, tmpState);
+                }
 
-        DINLINE uint32_t
-        get32Bits(
-            T_Acc const & acc,
-            StateType& state
-        ) const
-        {
-            /* We can do this cast if: 1) Only state data is used and
-             *                         2) Data is aligned and positioned the same way
-             */
-            return curand( reinterpret_cast< curandStateMRG32k3a* >( &state ) );
-        }
+                DINLINE uint32_t get32Bits(T_Acc const& acc, StateType& state) const
+                {
+                    /* We can do this cast if: 1) Only state data is used and
+                     *                         2) Data is aligned and positioned the same way
+                     */
+                    return curand(reinterpret_cast<curandStateMRG32k3a*>(&state));
+                }
 
-        DINLINE uint64_t
-        get64Bits(
-            T_Acc const & acc,
-            StateType& state
-        ) const
-        {
-            // two 32bit values are packed into a 64bit value
-            uint64_t result = get32Bits( acc, state);
-            result <<= 32;
-            result ^= get32Bits( acc, state);
-            return result;
-        }
+                DINLINE uint64_t get64Bits(T_Acc const& acc, StateType& state) const
+                {
+                    // two 32bit values are packed into a 64bit value
+                    uint64_t result = get32Bits(acc, state);
+                    result <<= 32;
+                    result ^= get32Bits(acc, state);
+                    return result;
+                }
 
-        static std::string
-        getName()
-        {
-            return "MRG32k3aMin";
-        }
+                static std::string getName()
+                {
+                    return "MRG32k3aMin";
+                }
 
-    private:
-        // Sizes must match
-        PMACC_STATIC_ASSERT_MSG(
-            sizeof( StateType::s1 ) == sizeof( curandStateMRG32k3a::s1 ),
-            Unexpected_sizes
-        );
-        PMACC_STATIC_ASSERT_MSG(
-            sizeof( StateType::s2 ) == sizeof( curandStateMRG32k3a::s2 ),
-            Unexpected_sizes
-        );
-        // Offsets must match
-        PMACC_STATIC_ASSERT_MSG(
-            offsetof( StateType, s1 ) == offsetof( curandStateMRG32k3a, s1 ) &&
-            offsetof( StateType, s2 ) == offsetof( curandStateMRG32k3a, s2 ),
-            Incompatible_structs
-        );
+            private:
+                // Sizes must match
+                PMACC_STATIC_ASSERT_MSG(sizeof(StateType::s1) == sizeof(curandStateMRG32k3a::s1), Unexpected_sizes);
+                PMACC_STATIC_ASSERT_MSG(sizeof(StateType::s2) == sizeof(curandStateMRG32k3a::s2), Unexpected_sizes);
+                // Offsets must match
+                PMACC_STATIC_ASSERT_MSG(
+                    offsetof(StateType, s1) == offsetof(curandStateMRG32k3a, s1)
+                        && offsetof(StateType, s2) == offsetof(curandStateMRG32k3a, s2),
+                    Incompatible_structs);
 
-        static DINLINE void AssignState(
-            StateType& dest,
-            curandStateMRG32k3a const & src
-        )
-        {
-            // Check if we can do this cast
-            dest = reinterpret_cast< StateType const & >( src );
-        }
-    };
+                static DINLINE void AssignState(StateType& dest, curandStateMRG32k3a const& src)
+                {
+                    // this cast relies on the layout checks above (matching sizes and offsets of s1/s2)
+                    dest = reinterpret_cast<StateType const&>(src);
+                }
+            };
 #endif
-}  // namespace methods
-}  // namespace random
-}  // namespace pmacc
+        } // namespace methods
+    } // namespace random
+} // namespace pmacc
diff --git a/include/pmacc/random/methods/RngPlaceholder.hpp b/include/pmacc/random/methods/RngPlaceholder.hpp
index 22a1a427ed..5bf94b1ae4 100644
--- a/include/pmacc/random/methods/RngPlaceholder.hpp
+++ b/include/pmacc/random/methods/RngPlaceholder.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2015-2020 Alexander Grund, Rene Widera
+/* Copyright 2015-2021 Alexander Grund, Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -26,17 +26,16 @@
 
 namespace pmacc
 {
-namespace random
-{
-namespace methods
-{
-
-    //! placeholder for the rng method
-    struct RngPlaceholder
+    namespace random
     {
-        using StateType = int;
-    };
+        namespace methods
+        {
+            //! placeholder for the rng method
+            struct RngPlaceholder
+            {
+                using StateType = int;
+            };
 
-}  // namespace methods
-}  // namespace random
-}  // namespace pmacc
+        } // namespace methods
+    } // namespace random
+} // namespace pmacc
diff --git a/include/pmacc/random/methods/XorMin.hpp b/include/pmacc/random/methods/XorMin.hpp
index 10850befcf..dcea1c86b9 100644
--- a/include/pmacc/random/methods/XorMin.hpp
+++ b/include/pmacc/random/methods/XorMin.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2015-2020 Alexander Grund, Rene Widera
+/* Copyright 2015-2021 Alexander Grund, Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -24,115 +24,115 @@
 #include "pmacc/types.hpp"
 #include "pmacc/static_assert.hpp"
 
-#if( PMACC_CUDA_ENABLED != 1 )
-#   include "pmacc/random/methods/AlpakaRand.hpp"
+#if(BOOST_LANG_CUDA)
+#    include <curand_kernel.h>
+#elif(BOOST_LANG_HIP)
+#    include <hiprand_kernel.h>
 #else
-#   include <curand_kernel.h>
+#    include "pmacc/random/methods/AlpakaRand.hpp"
 #endif
 
 
 namespace pmacc
 {
-namespace random
-{
-namespace methods
-{
-
-#if( PMACC_CUDA_ENABLED != 1 )
-    //! fallback to alpaka RNG if a cpu accelerator is used
-    template< typename T_Acc = cupla::Acc>
-    using XorMin = AlpakaRand< T_Acc >;
-#else
-    //! Uses the CUDA XORWOW RNG but does not store state members required for normal distribution
-    template< typename T_Acc = cupla::Acc>
-    class XorMin
+    namespace random
     {
-    public:
-        class StateType
+        namespace methods
         {
-        public:
-            PMACC_ALIGN(
-                d,
-                unsigned int
-            );
-            PMACC_ALIGN(
-                v[ 5 ],
-                unsigned int
-            );
-
-            HDINLINE StateType( )
-            { }
-
-            DINLINE StateType( curandStateXORWOW_t const & other ): d( other.d )
+#if(ALPAKA_ACC_GPU_CUDA_ENABLED || ALPAKA_ACC_GPU_HIP_ENABLED)
+            //! Uses the CUDA/HIP XORWOW RNG but does not store state members required for normal distribution
+            template<typename T_Acc = cupla::Acc>
+            class XorMin
             {
-                PMACC_STATIC_ASSERT_MSG(
-                    sizeof( v ) == sizeof( other.v ),
-                    Unexpected_sizes
-                );
-                for( unsigned i = 0; i < sizeof( v ) / sizeof( v[ 0 ] ); i++ )
-                    v[ i ] = other.v[ i ];
-            }
-        };
-
-        DINLINE void
-        init(
-            T_Acc const & acc,
-            StateType & state,
-            uint32_t seed,
-            uint32_t subsequence = 0
-        ) const
-        {
-            curandStateXORWOW_t tmpState;
-            curand_init(
-                seed,
-                subsequence,
-                0,
-                &tmpState
-            );
-            state = tmpState;
-        }
-
-        DINLINE uint32_t
-        get32Bits(
-            T_Acc const & acc,
-            StateType & state
-        ) const
-        {
-            /* This generator uses the xorwow formula of
-             * www.jstatsoft.org/v08/i14/paper page 5
-             * Has period 2^192 - 2^32.
-             */
-            uint32_t t;
-            t = ( state.v[ 0 ] ^ ( state.v[ 0 ] >> 2 ) );
-            state.v[ 0 ] = state.v[ 1 ];
-            state.v[ 1 ] = state.v[ 2 ];
-            state.v[ 2 ] = state.v[ 3 ];
-            state.v[ 3 ] = state.v[ 4 ];
-            state.v[ 4 ] = ( state.v[ 4 ] ^ ( state.v[ 4 ] << 4 ) ) ^ ( t ^ ( t << 1 ) );
-            state.d += 362437;
-            return state.v[ 4 ] + state.d;
-        }
-
-        DINLINE uint64_t
-        get64Bits(
-            T_Acc const & acc,
-            StateType& state
-        ) const
-        {
-            // two 32bit values are packed into a 64bit value
-            uint64_t result = get32Bits( acc, state);
-            result <<= 32;
-            result ^= get32Bits( acc, state);
-            return result;
-        }
-
-        static std::string
-        getName( )
-        {
-            return "XorMin";
-        }
-    };
+#    if(BOOST_LANG_HIP)
+                using NativeStateType = hiprandStateXORWOW_t;
+#    elif(BOOST_LANG_CUDA)
+                using NativeStateType = curandStateXORWOW_t;
+#    endif
+
+            public:
+                class StateType
+                {
+                public:
+                    PMACC_ALIGN(d, unsigned int);
+                    PMACC_ALIGN(v[5], unsigned int);
+
+                    HDINLINE StateType()
+                    {
+                    }
+
+                    DINLINE StateType(NativeStateType const& other)
+                    {
+#    if(BOOST_LANG_HIP)
+                        // @todo avoid using pointer casts to copy the rng state
+                        auto baseObjectPtr
+                            = reinterpret_cast<typename NativeStateType::xorwow_state const* const>(&other);
+                        d = baseObjectPtr->d;
+                        auto const* nativeStateArray = baseObjectPtr->x;
+                        PMACC_STATIC_ASSERT_MSG(sizeof(v) == sizeof(baseObjectPtr->x), Unexpected_sizes);
+#    elif(BOOST_LANG_CUDA)
+                        d = other.d;
+                        auto const* nativeStateArray = other.v;
+                        PMACC_STATIC_ASSERT_MSG(sizeof(v) == sizeof(other.v), Unexpected_sizes);
+#    endif
+                        for(unsigned i = 0; i < sizeof(v) / sizeof(v[0]); i++)
+                            v[i] = nativeStateArray[i];
+                    }
+                };
+
+                DINLINE void init(T_Acc const& acc, StateType& state, uint32_t seed, uint32_t subsequence = 0) const
+                {
+                    NativeStateType tmpState;
+
+#    if(ALPAKA_ACC_GPU_HIP_ENABLED == 1)
+#        define PMACC_RNG_INIT_FN hiprand_init
+#    elif(ALPAKA_ACC_GPU_CUDA_ENABLED == 1)
+#        define PMACC_RNG_INIT_FN curand_init
+#    endif
+
+                    PMACC_RNG_INIT_FN(seed, subsequence, 0, &tmpState);
+
+#    undef PMACC_RNG_INIT_FN
+
+                    state = tmpState;
+                }
+
+                DINLINE uint32_t get32Bits(T_Acc const& acc, StateType& state) const
+                {
+                    /* This generator uses the xorwow formula of
+                     * www.jstatsoft.org/v08/i14/paper page 5
+                     * Has period 2^192 - 2^32.
+                     */
+                    uint32_t t;
+                    t = (state.v[0] ^ (state.v[0] >> 2));
+                    state.v[0] = state.v[1];
+                    state.v[1] = state.v[2];
+                    state.v[2] = state.v[3];
+                    state.v[3] = state.v[4];
+                    state.v[4] = (state.v[4] ^ (state.v[4] << 4)) ^ (t ^ (t << 1));
+                    state.d += 362437;
+                    return state.v[4] + state.d;
+                }
+
+                DINLINE uint64_t get64Bits(T_Acc const& acc, StateType& state) const
+                {
+                    // two 32bit values are packed into a 64bit value
+                    uint64_t result = get32Bits(acc, state);
+                    result <<= 32;
+                    result ^= get32Bits(acc, state);
+                    return result;
+                }
+
+                static std::string getName()
+                {
+                    return "XorMin";
+                }
+            };
+#else
+            //! fallback to alpaka RNG if a cpu accelerator is used
+            template<typename T_Acc = cupla::Acc>
+            using XorMin = AlpakaRand<T_Acc>;
 #endif
-}  // namespace methods
-}  // namespace random
-}  // namespace pmacc
+        } // namespace methods
+    } // namespace random
+} // namespace pmacc
diff --git a/include/pmacc/random/methods/methods.hpp b/include/pmacc/random/methods/methods.hpp
index 419a7634b2..e8cc891408 100644
--- a/include/pmacc/random/methods/methods.hpp
+++ b/include/pmacc/random/methods/methods.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2018-2020 Rene Widera
+/* Copyright 2018-2021 Rene Widera
  *
  * This file is part of PMacc.
  *
diff --git a/include/pmacc/result_of_Functor.hpp b/include/pmacc/result_of_Functor.hpp
index 818f195701..62a1241440 100644
--- a/include/pmacc/result_of_Functor.hpp
+++ b/include/pmacc/result_of_Functor.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera
+/* Copyright 2013-2021 Heiko Burau, Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -27,26 +27,28 @@ namespace mpl = boost::mpl;
 
 namespace pmacc
 {
-namespace result_of
-{
-template<typename _Functor, typename Arg0 = mpl::void_,
-                            typename Arg1 = mpl::void_,
-                            typename Arg2 = mpl::void_,
-                            typename Arg3 = mpl::void_,
-                            typename Arg4 = mpl::void_,
-                            typename Arg5 = mpl::void_,
-                            typename Arg6 = mpl::void_,
-                            typename Arg7 = mpl::void_,
-                            typename Arg8 = mpl::void_,
-                            typename Arg9 = mpl::void_,
-                            typename Arg10 = mpl::void_,
-                            typename Arg11 = mpl::void_,
-                            typename Arg12 = mpl::void_,
-                            typename dummy = mpl::void_>
-struct Functor
-{
-    typedef typename _Functor::result_type type;
-};
+    namespace result_of
+    {
+        template<
+            typename _Functor,
+            typename Arg0 = mpl::void_,
+            typename Arg1 = mpl::void_,
+            typename Arg2 = mpl::void_,
+            typename Arg3 = mpl::void_,
+            typename Arg4 = mpl::void_,
+            typename Arg5 = mpl::void_,
+            typename Arg6 = mpl::void_,
+            typename Arg7 = mpl::void_,
+            typename Arg8 = mpl::void_,
+            typename Arg9 = mpl::void_,
+            typename Arg10 = mpl::void_,
+            typename Arg11 = mpl::void_,
+            typename Arg12 = mpl::void_,
+            typename dummy = mpl::void_>
+        struct Functor
+        {
+            typedef typename _Functor::result_type type;
+        };
 
-} // result_of
-} // PMacc
+    } // namespace result_of
+} // namespace pmacc
diff --git a/include/pmacc/simulationControl/SimulationDescription.hpp b/include/pmacc/simulationControl/SimulationDescription.hpp
index 31fd6afb65..5f8133ee8c 100644
--- a/include/pmacc/simulationControl/SimulationDescription.hpp
+++ b/include/pmacc/simulationControl/SimulationDescription.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2015-2020 Axel Huebl
+/* Copyright 2015-2021 Axel Huebl
  *
  * This file is part of PMacc.
  *
@@ -27,106 +27,101 @@
 
 namespace pmacc
 {
-namespace simulationControl
-{
-
-/**
- * Provides convenience methods for querying general simulation information.
- * Singleton class.
- */
-class SimulationDescription
-{
-public:
-    /** Return author of the simulation setup.
-     *
-     * The author that runs the simulation and is responsible for created
-     * output files.
-     *
-     * @return std::string with author name, can be empty
-     */
-    std::string getAuthor()
-    {
-        return author;
-    }
-
-    /** Set author
-     *
-     * @see getAuthor
-     *
-     * @param[in] std::string setAuthor
-     */
-    void setAuthor( const std::string setAuthor )
-    {
-        this->author = setAuthor;
-    }
-
-    /** Return last time step of simulation
-     *
-     * @return uint32_t last step of the simulation to run to
-     */
-    uint32_t getRunSteps()
-    {
-        return runSteps;
-    }
-
-    /** Set last time step of simulation
-     *
-     * @see getRunSteps
-     *
-     * @param[in] uint32_t setRunSteps
-     */
-    void setRunSteps( const uint32_t setRunSteps )
-    {
-        runSteps = setRunSteps;
-    }
-
-    /** Returns the current time step of the simulation
-     *
-     * \return uint32_t current time step
-     */
-    uint32_t getCurrentStep()
-    {
-        return currentStep;
-    }
-
-    /** Set the current time step
-     *
-     * @see getCurrentStep
-     *
-     * @param[in] uint32_t setCurrentStep
-     */
-    void setCurrentStep( const uint32_t setCurrentStep )
+    namespace simulationControl
     {
-        currentStep = setCurrentStep;
-    }
-
-protected:
-    /** author that runs the simulation */
-    std::string author;
-
-    /** maximum step to run this simulation to */
-    uint32_t runSteps;
-
-    /** current time step of simulation */
-    uint32_t currentStep;
-
-private:
-
-    friend struct detail::Environment;
-
-    static SimulationDescription& getInstance()
-    {
-        static SimulationDescription instance;
-        return instance;
-    }
-
-    SimulationDescription() :
-    author(""),
-    runSteps(0),
-    currentStep(0)
-    {
-    }
-};
-
-} // namespace simulationControl
+        /**
+         * Stores general simulation information (author, run steps, current step) behind getters/setters.
+         * Singleton class: the instance is only reachable through the private getInstance().
+         */
+        class SimulationDescription
+        {
+        public:
+            /** Return author of the simulation setup.
+             *
+             * The author that runs the simulation and is responsible for created
+             * output files.
+             *
+             * @return std::string with author name, can be empty
+             */
+            std::string getAuthor()
+            {
+                return author;
+            }
+
+            /** Set author
+             *
+             * @see getAuthor
+             *
+             * @param[in] setAuthor author name to store
+             */
+            void setAuthor(const std::string setAuthor)
+            {
+                this->author = setAuthor;
+            }
+
+            /** Return last time step of simulation
+             *
+             * @return uint32_t last step of the simulation to run to
+             */
+            uint32_t getRunSteps()
+            {
+                return runSteps;
+            }
+
+            /** Set last time step of simulation
+             *
+             * @see getRunSteps
+             *
+             * @param[in] setRunSteps last step of the simulation to run to
+             */
+            void setRunSteps(const uint32_t setRunSteps)
+            {
+                runSteps = setRunSteps;
+            }
+
+            /** Returns the current time step of the simulation
+             *
+             * @return uint32_t current time step
+             */
+            uint32_t getCurrentStep()
+            {
+                return currentStep;
+            }
+
+            /** Set the current time step
+             *
+             * @see getCurrentStep
+             *
+             * @param[in] setCurrentStep current time step of the simulation
+             */
+            void setCurrentStep(const uint32_t setCurrentStep)
+            {
+                currentStep = setCurrentStep;
+            }
+
+        protected:
+            /** author that runs the simulation */
+            std::string author;
+
+            /** maximum step to run this simulation to */
+            uint32_t runSteps;
+
+            /** current time step of simulation */
+            uint32_t currentStep;
+
+        private:
+            friend struct detail::Environment;
+
+            static SimulationDescription& getInstance()
+            {
+                static SimulationDescription instance;
+                return instance;
+            }
+
+            SimulationDescription() : author(""), runSteps(0), currentStep(0)
+            {
+            }
+        };
+
+    } // namespace simulationControl
 } // namespace pmacc
diff --git a/include/pmacc/simulationControl/SimulationHelper.hpp b/include/pmacc/simulationControl/SimulationHelper.hpp
index e6a3625118..34bbdf5d54 100644
--- a/include/pmacc/simulationControl/SimulationHelper.hpp
+++ b/include/pmacc/simulationControl/SimulationHelper.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Felix Schmitt, Rene Widera, Alexander Debus,
+/* Copyright 2013-2021 Axel Huebl, Felix Schmitt, Rene Widera, Alexander Debus,
  *                     Benjamin Worpitz, Alexander Grund
  *
  * This file is part of PMacc.
@@ -44,433 +44,432 @@
 
 namespace pmacc
 {
-
-/**
- * Abstract base class for simulations.
- *
- * Use this helper class to write your own concrete simulations
- * by binding pure virtual methods.
- *
- * @tparam DIM base dimension for the simulation (2-3)
- */
-template<unsigned DIM>
-class SimulationHelper : public IPlugin
-{
-public:
-
-    using SeqOfTimeSlices = std::vector< pluginSystem::TimeSlice >;
-
     /**
-     * Constructor
+     * Abstract base class for simulations.
      *
+     * Use this helper class to write your own concrete simulations
+     * by binding pure virtual methods.
+     *
+     * @tparam DIM base dimension for the simulation (2-3)
      */
-    SimulationHelper() :
-    runSteps(0),
-    checkpointDirectory("checkpoints"),
-    numCheckpoints(0),
-    restartStep(-1),
-    restartDirectory("checkpoints"),
-    restartRequested(false),
-    CHECKPOINT_MASTER_FILE("checkpoints.txt"),
-    author("")
-    {
-        tSimulation.toggleStart();
-        tInit.toggleStart();
-    }
-
-    virtual ~SimulationHelper()
+    template<unsigned DIM>
+    class SimulationHelper : public IPlugin
     {
-        tSimulation.toggleEnd();
-        if (output)
+    public:
+        using SeqOfTimeSlices = std::vector<pluginSystem::TimeSlice>;
+
+        /**
+         * Constructor
+         *
+         */
+        SimulationHelper()
+            : runSteps(0)
+            , checkpointDirectory("checkpoints")
+            , numCheckpoints(0)
+            , restartStep(-1)
+            , restartDirectory("checkpoints")
+            , restartRequested(false)
+            , CHECKPOINT_MASTER_FILE("checkpoints.txt")
+            , author("")
+            , useMpiDirect(false)
         {
-            std::cout << "full simulation time: " <<
-                tSimulation.printInterval() << " = " <<
-                (uint64_t) (tSimulation.getInterval() / 1000.) << " sec" << std::endl;
+            tSimulation.toggleStart();
+            tInit.toggleStart();
         }
-    }
-
-    /**
-     * Must describe one iteration (step).
-     *
-     * This function is called automatically.
-     */
-    virtual void runOneStep(uint32_t currentStep) = 0;
 
-    /**
-     * Initialize simulation
-     *
-     * Does hardware selections/reservations, memory allocations and
-     * initializes data structures as empty.
-     */
-    virtual void init() = 0;
-
-    /**
-     * Fills simulation with initial data after init()
-     *
-     * @return returns the first step of the simulation
-     *         (can be >0 for, e.g., restarts from checkpoints)
-     */
-    virtual uint32_t fillSimulation() = 0;
-
-    /**
-     * Reset the simulation to a state such as it was after
-     * init() but for a specific time step.
-     * Can be used to call fillSimulation() again.
-     */
-    virtual void resetAll(uint32_t currentStep) = 0;
-
-    /**
-     * Check if moving window work must do
-     *
-     * If no moving window is needed the implementation of this function can be empty
-     *
-     * @param currentStep simulation step
-     */
-    virtual void movingWindowCheck(uint32_t currentStep) = 0;
-
-    /**
-     * Notifies registered output classes.
-     *
-     * This function is called automatically.
-     *
-     *  @param currentStep simulation step
-     */
-    virtual void dumpOneStep(uint32_t currentStep)
-    {
-        /* trigger notification */
-        Environment<DIM>::get().PluginConnector().notifyPlugins(currentStep);
-
-        /* trigger checkpoint notification */
-        if(
-            !checkpointPeriod.empty() &&
-            pluginSystem::containsStep(
-                seqCheckpointPeriod,
-                currentStep
-            )
-        )
+        virtual ~SimulationHelper()
         {
-            /* first synchronize: if something failed, we can spare the time
-             * for the checkpoint writing */
-            CUDA_CHECK(cudaDeviceSynchronize());
-            CUDA_CHECK(cudaGetLastError());
-
-            // avoid deadlock between not finished PMacc tasks and MPI_Barrier
-            __getTransactionEvent().waitForFinished();
-
-            GridController<DIM> &gc = Environment<DIM>::get().GridController();
-            /* can be spared for better scalings, but allows to spare the
-             * time for checkpointing if some ranks died */
-            MPI_CHECK(MPI_Barrier(gc.getCommunicator().getMPIComm()));
-
-            /* create directory containing checkpoints  */
-            if (numCheckpoints == 0)
+            tSimulation.toggleEnd();
+            if(output)
             {
-                Environment<DIM>::get().Filesystem().createDirectoryWithPermissions(checkpointDirectory);
+                std::cout << "full simulation time: " << tSimulation.printInterval() << " = "
+                          << (uint64_t)(tSimulation.getInterval() / 1000.) << " sec" << std::endl;
             }
+        }
 
-            Environment<DIM>::get().PluginConnector().checkpointPlugins(currentStep,
-                                                                        checkpointDirectory);
-
-            /* important synchronize: only if no errors occured until this
-             * point guarantees that a checkpoint is usable */
-            CUDA_CHECK(cudaDeviceSynchronize());
-            CUDA_CHECK(cudaGetLastError());
-
-            /* avoid deadlock between not finished PMacc tasks and MPI_Barrier */
-            __getTransactionEvent().waitForFinished();
-
-            /* \todo in an ideal world with MPI-3, this would be an
-             * MPI_Ibarrier call and this function would return a MPI_Request
-             * that could be checked */
-            MPI_CHECK(MPI_Barrier(gc.getCommunicator().getMPIComm()));
+        /**
+         * Must describe one iteration (step).
+         *
+         * This function is called automatically.
+         */
+        virtual void runOneStep(uint32_t currentStep) = 0;
+
+        /**
+         * Initialize simulation
+         *
+         * Does hardware selections/reservations, memory allocations and
+         * initializes data structures as empty.
+         */
+        virtual void init() = 0;
+
+        /**
+         * Fills simulation with initial data after init()
+         *
+         * @return returns the first step of the simulation
+         *         (can be >0 for, e.g., restarts from checkpoints)
+         */
+        virtual uint32_t fillSimulation() = 0;
+
+        /**
+         * Reset the simulation to a state such as it was after
+         * init() but for a specific time step.
+         * Can be used to call fillSimulation() again.
+         */
+        virtual void resetAll(uint32_t currentStep) = 0;
+
+        /**
+         * Check whether moving window work must be done
+         *
+         * If no moving window is needed the implementation of this function can be empty
+         *
+         * @param currentStep simulation step
+         */
+        virtual void movingWindowCheck(uint32_t currentStep) = 0;
+
+        /**
+         * Notifies registered output classes.
+         *
+         * This function is called automatically.
+         *
+         *  @param currentStep simulation step
+         */
+        virtual void dumpOneStep(uint32_t currentStep)
+        {
+            /* trigger notification */
+            Environment<DIM>::get().PluginConnector().notifyPlugins(currentStep);
 
-            if (gc.getGlobalRank() == 0)
+            /* trigger checkpoint notification */
+            if(!checkpointPeriod.empty() && pluginSystem::containsStep(seqCheckpointPeriod, currentStep))
             {
-                writeCheckpointStep(currentStep);
+                /* first synchronize: if something failed, we can spare the time
+                 * for the checkpoint writing */
+                CUDA_CHECK(cuplaDeviceSynchronize());
+                CUDA_CHECK(cuplaGetLastError());
+
+                // avoid deadlock between not finished PMacc tasks and MPI_Barrier
+                __getTransactionEvent().waitForFinished();
+
+                GridController<DIM>& gc = Environment<DIM>::get().GridController();
+                /* could be omitted for better scaling, but allows us to skip the
+                 * time for checkpointing if some ranks died */
+                MPI_CHECK(MPI_Barrier(gc.getCommunicator().getMPIComm()));
+
+                /* create directory containing checkpoints  */
+                if(numCheckpoints == 0)
+                {
+                    Environment<DIM>::get().Filesystem().createDirectoryWithPermissions(checkpointDirectory);
+                }
+
+                Environment<DIM>::get().PluginConnector().checkpointPlugins(currentStep, checkpointDirectory);
+
+                /* important synchronize: only if no errors occurred up to this
+                 * point is the checkpoint guaranteed to be usable */
+                CUDA_CHECK(cuplaDeviceSynchronize());
+                CUDA_CHECK(cuplaGetLastError());
+
+                /* avoid deadlock between not finished PMacc tasks and MPI_Barrier */
+                __getTransactionEvent().waitForFinished();
+
+                /* \todo in an ideal world with MPI-3, this would be an
+                 * MPI_Ibarrier call and this function would return a MPI_Request
+                 * that could be checked */
+                MPI_CHECK(MPI_Barrier(gc.getCommunicator().getMPIComm()));
+
+                if(gc.getGlobalRank() == 0)
+                {
+                    writeCheckpointStep(currentStep);
+                }
+                numCheckpoints++;
             }
-            numCheckpoints++;
         }
-    }
-
-    GridController<DIM> & getGridController()
-    {
-        return Environment<DIM>::get().GridController();
-    }
 
-    void dumpTimes(TimeIntervall &tSimCalculation, TimeIntervall&, double& roundAvg, uint32_t currentStep)
-    {
-        /*dump 100% after simulation*/
-        if (output && progress && (currentStep % showProgressAnyStep) == 0)
+        GridController<DIM>& getGridController()
         {
-            tSimCalculation.toggleEnd();
-            std::cout << std::setw(3) <<
-                uint16_t(
-                    double( currentStep ) /
-                    double( Environment<>::get().SimulationDescription().getRunSteps() ) *
-                    100.
-                ) <<
-                " % = " << std::setw(8) << currentStep <<
-                " | time elapsed:" <<
-                std::setw(25) << tSimCalculation.printInterval() << " | avg time per step: " <<
-                TimeIntervall::printeTime(roundAvg / (double) showProgressAnyStep) << std::endl;
-            std::cout.flush();
-
-            roundAvg = 0.0; //clear round avg timer
+            return Environment<DIM>::get().GridController();
         }
 
-    }
-
-    /**
-     * Begin the simulation.
-     */
-    void startSimulation()
-    {
-        init();
-
-        // translate checkpointPeriod string into checkpoint intervals
-        seqCheckpointPeriod = pluginSystem::toTimeSlice( checkpointPeriod );
-
-        for (uint32_t nthSoftRestart = 0; nthSoftRestart <= softRestarts; ++nthSoftRestart)
+        void dumpTimes(TimeIntervall& tSimCalculation, TimeIntervall&, double& roundAvg, uint32_t currentStep)
         {
-            resetAll(0);
-            uint32_t currentStep = fillSimulation();
-            Environment<>::get().SimulationDescription().setCurrentStep( currentStep );
-
-            tInit.toggleEnd();
-            if (output)
+            /*dump 100% after simulation*/
+            if(output && progress && (currentStep % showProgressAnyStep) == 0)
             {
-                std::cout << "initialization time: " << tInit.printInterval() <<
-                    " = " <<
-                    (int) (tInit.getInterval() / 1000.) << " sec" << std::endl;
+                tSimCalculation.toggleEnd();
+                std::cout << std::setw(3)
+                          << uint16_t(
+                                 double(currentStep)
+                                 / double(Environment<>::get().SimulationDescription().getRunSteps()) * 100.)
+                          << " % = " << std::setw(8) << currentStep << " | time elapsed:" << std::setw(25)
+                          << tSimCalculation.printInterval() << " | avg time per step: "
+                          << TimeIntervall::printeTime(roundAvg / (double) showProgressAnyStep) << std::endl;
+                std::cout.flush();
+
+                roundAvg = 0.0; // clear round avg timer
             }
+        }
 
-            TimeIntervall tSimCalculation;
-            TimeIntervall tRound;
-            double roundAvg = 0.0;
-
-            /* Since in the main loop movingWindow is called always before the dump, we also call it here for consistency.
-             * This becomes only important, if movingWindowCheck does more than merely checking for a slide.
-             * TO DO in a new feature: Turn this into a general hook for pre-checks (window slides are just one possible action).
-             */
-            movingWindowCheck(currentStep);
-
-            /* dump initial step if simulation starts without restart */
-            if (!restartRequested)
-            {
-                dumpOneStep(currentStep);
-            }
+        /**
+         * Begin the simulation.
+         */
+        void startSimulation()
+        {
+            if(useMpiDirect)
+                Environment<>::get().enableMpiDirect();
 
-            /* dump 0% output */
-            dumpTimes(tSimCalculation, tRound, roundAvg, currentStep);
+            init();
 
+            // translate checkpointPeriod string into checkpoint intervals
+            seqCheckpointPeriod = pluginSystem::toTimeSlice(checkpointPeriod);
 
-            /** \todo currently we assume this is the only point in the simulation
-             *        that is allowed to manipulate `currentStep`. Else, one needs to
-             *        add and act on changed values via
-             *        `SimulationDescription().getCurrentStep()` in this loop
-             */
-            while (currentStep < Environment<>::get().SimulationDescription().getRunSteps())
+            for(uint32_t nthSoftRestart = 0; nthSoftRestart <= softRestarts; ++nthSoftRestart)
             {
-                tRound.toggleStart();
-                runOneStep(currentStep);
-                tRound.toggleEnd();
-                roundAvg += tRound.getInterval();
-
-                /* NEXT TIMESTEP STARTS HERE */
-                currentStep++;
-                Environment<>::get().SimulationDescription().setCurrentStep( currentStep );
-                /* output times after a round */
-                dumpTimes(tSimCalculation, tRound, roundAvg, currentStep);
-
+                resetAll(0);
+                uint32_t currentStep = fillSimulation();
+                Environment<>::get().SimulationDescription().setCurrentStep(currentStep);
+
+                tInit.toggleEnd();
+                if(output)
+                {
+                    std::cout << "initialization time: " << tInit.printInterval() << " = "
+                              << (int) (tInit.getInterval() / 1000.) << " sec" << std::endl;
+                }
+
+                TimeIntervall tSimCalculation;
+                TimeIntervall tRound;
+                double roundAvg = 0.0;
+
+                /* In the main loop movingWindowCheck is always called before the dump, so we also call it here for
+                 * consistency. This only matters if movingWindowCheck does more than merely checking for a
+                 * slide. TODO in a new feature: turn this into a general hook for pre-checks (window slides are just
+                 * one possible action).
+                 */
                 movingWindowCheck(currentStep);
-                /* dump at the beginning of the simulated step */
-                dumpOneStep(currentStep);
-            }
 
-            // simulatation end
-            Environment<>::get().Manager().waitForAllTasks();
+                /* dump initial step if simulation starts without restart */
+                if(!restartRequested)
+                {
+                    dumpOneStep(currentStep);
+                }
 
-            tSimCalculation.toggleEnd();
+                /* dump 0% output */
+                dumpTimes(tSimCalculation, tRound, roundAvg, currentStep);
 
-            if (output)
-            {
-                std::cout << "calculation  simulation time: " <<
-                   tSimCalculation.printInterval() << " = " <<
-                   (int) (tSimCalculation.getInterval() / 1000.) << " sec" << std::endl;
-            }
 
-        } // softRestarts loop
-    }
+                /** \todo currently we assume this is the only point in the simulation
+                 *        that is allowed to manipulate `currentStep`. Else, one needs to
+                 *        add and act on changed values via
+                 *        `SimulationDescription().getCurrentStep()` in this loop
+                 */
+                while(currentStep < Environment<>::get().SimulationDescription().getRunSteps())
+                {
+                    tRound.toggleStart();
+                    runOneStep(currentStep);
+                    tRound.toggleEnd();
+                    roundAvg += tRound.getInterval();
+
+                    /* NEXT TIMESTEP STARTS HERE */
+                    currentStep++;
+                    Environment<>::get().SimulationDescription().setCurrentStep(currentStep);
+                    /* output times after a round */
+                    dumpTimes(tSimCalculation, tRound, roundAvg, currentStep);
+
+                    movingWindowCheck(currentStep);
+                    /* dump at the beginning of the simulated step */
+                    dumpOneStep(currentStep);
+                }
+
+                // simulation end
+                Environment<>::get().Manager().waitForAllTasks();
+
+                tSimCalculation.toggleEnd();
+
+                if(output)
+                {
+                    std::cout << "calculation  simulation time: " << tSimCalculation.printInterval() << " = "
+                              << (int) (tSimCalculation.getInterval() / 1000.) << " sec" << std::endl;
+                }
+
+            } // softRestarts loop
+        }
 
-    virtual void pluginRegisterHelp(po::options_description& desc)
-    {
-        desc.add_options()
-            ("steps,s", po::value<uint32_t > (&runSteps), "Simulation steps")
-            ("checkpoint.restart.loop", po::value<uint32_t > (&softRestarts)->default_value(0),
-             "Number of times to restart the simulation after simulation has finished (for presentations). "
-             "Note: does not yet work with all plugins, see issue #1305")
-            ("percent,p", po::value<uint16_t > (&progress)->default_value(5),
-             "Print time statistics after p percent to stdout")
-            ("checkpoint.restart", po::value<bool>(&restartRequested)->zero_tokens(), "Restart simulation")
-            ("checkpoint.restart.directory", po::value<std::string>(&restartDirectory)->default_value(restartDirectory),
-             "Directory containing checkpoints for a restart")
-            ("checkpoint.restart.step", po::value<int32_t>(&restartStep), "Checkpoint step to restart from")
-            ("checkpoint.period", po::value<std::string>(&checkpointPeriod), "Period for checkpoint creation")
-            ("checkpoint.directory", po::value<std::string>(&checkpointDirectory)->default_value(checkpointDirectory),
-             "Directory for checkpoints")
-            ("author", po::value<std::string>(&author)->default_value(std::string("")),
-             "The author that runs the simulation and is responsible for created output files");
-    }
-
-    std::string pluginGetName() const
-    {
-        return "SimulationHelper";
-    }
+        virtual void pluginRegisterHelp(po::options_description& desc)
+        {
+            desc.add_options()("steps,s", po::value<uint32_t>(&runSteps), "Simulation steps")(
+                "checkpoint.restart.loop",
+                po::value<uint32_t>(&softRestarts)->default_value(0),
+                "Number of times to restart the simulation after simulation has finished (for presentations). "
+                "Note: does not yet work with all plugins, see issue #1305")(
+                "percent,p",
+                po::value<uint16_t>(&progress)->default_value(5),
+                "Print time statistics after p percent to stdout")(
+                "checkpoint.restart",
+                po::value<bool>(&restartRequested)->zero_tokens(),
+                "Restart simulation")(
+                "checkpoint.restart.directory",
+                po::value<std::string>(&restartDirectory)->default_value(restartDirectory),
+                "Directory containing checkpoints for a restart")(
+                "checkpoint.restart.step",
+                po::value<int32_t>(&restartStep),
+                "Checkpoint step to restart from")(
+                "checkpoint.period",
+                po::value<std::string>(&checkpointPeriod),
+                "Period for checkpoint creation")(
+                "checkpoint.directory",
+                po::value<std::string>(&checkpointDirectory)->default_value(checkpointDirectory),
+                "Directory for checkpoints")(
+                "author",
+                po::value<std::string>(&author)->default_value(std::string("")),
+                "The author that runs the simulation and is responsible for created output files")(
+                "mpiDirect",
+                po::value<bool>(&useMpiDirect)->zero_tokens(),
+                "use device direct for MPI communication e.g. GPU direct");
+        }
 
-    void pluginLoad()
-    {
-        Environment<>::get().SimulationDescription().setRunSteps(runSteps);
-        Environment<>::get().SimulationDescription().setAuthor(author);
+        std::string pluginGetName() const
+        {
+            return "SimulationHelper";
+        }
 
-        calcProgress();
+        void pluginLoad()
+        {
+            Environment<>::get().SimulationDescription().setRunSteps(runSteps);
+            Environment<>::get().SimulationDescription().setAuthor(author);
 
-        output = (getGridController().getGlobalRank() == 0);
-    }
+            calcProgress();
 
-    void pluginUnload()
-    {
-    }
+            output = (getGridController().getGlobalRank() == 0);
+        }
 
-    void restart(uint32_t, const std::string)
-    {
-    }
+        void pluginUnload()
+        {
+        }
 
-    void checkpoint(uint32_t, const std::string)
-    {
-    }
+        void restart(uint32_t, const std::string)
+        {
+        }
 
-protected:
-    /* number of simulation steps to compute */
-    uint32_t runSteps;
+        void checkpoint(uint32_t, const std::string)
+        {
+        }
 
-    /** Presentations: loop the whole simulation `softRestarts` times from
-     *                 initial step to runSteps */
-    uint32_t softRestarts;
+    protected:
+        /* number of simulation steps to compute */
+        uint32_t runSteps;
 
-    /* period for checkpoint creation */
-    std::string checkpointPeriod;
+        /** Presentations: loop the whole simulation `softRestarts` times from
+         *                 initial step to runSteps */
+        uint32_t softRestarts;
 
-    /* checkpoint intervals */
-    SeqOfTimeSlices seqCheckpointPeriod;
+        /* period for checkpoint creation */
+        std::string checkpointPeriod;
 
-    /* common directory for checkpoints */
-    std::string checkpointDirectory;
+        /* checkpoint intervals */
+        SeqOfTimeSlices seqCheckpointPeriod;
 
-    /* number of checkpoints written */
-    uint32_t numCheckpoints;
+        /* common directory for checkpoints */
+        std::string checkpointDirectory;
 
-    /* checkpoint step to restart from */
-    int32_t restartStep;
+        /* number of checkpoints written */
+        uint32_t numCheckpoints;
 
-    /* common directory for restarts */
-    std::string restartDirectory;
+        /* checkpoint step to restart from */
+        int32_t restartStep;
 
-    /* restart requested */
-    bool restartRequested;
+        /* common directory for restarts */
+        std::string restartDirectory;
 
-    /* filename for checkpoint master file with all checkpoint timesteps */
-    const std::string CHECKPOINT_MASTER_FILE;
+        /* restart requested */
+        bool restartRequested;
 
-    /* author that runs the simulation */
-    std::string author;
+        /* filename for checkpoint master file with all checkpoint timesteps */
+        const std::string CHECKPOINT_MASTER_FILE;
 
-private:
+        /* author that runs the simulation */
+        std::string author;
 
-    /**
-     * Set how often the elapsed time is printed.
-     *
-     * @param percent percentage difference for printing
-     */
-    void calcProgress()
-    {
-        if (progress == 0 || progress > 100)
-            progress = 100;
+        //! enable MPI gpu direct
+        bool useMpiDirect;
 
-        showProgressAnyStep = uint32_t(
-            double( Environment<>::get().SimulationDescription().getRunSteps() ) /
-            100. * double( progress )
-        );
-        if (showProgressAnyStep == 0)
-            showProgressAnyStep = 1;
-    }
+    private:
+        /**
+         * Set how often the elapsed time is printed.
+         *
+         * Uses the member `progress`: percentage difference (of the run steps) between printouts.
+         */
+        void calcProgress()
+        {
+            if(progress == 0 || progress > 100)
+                progress = 100;
 
-    /**
-     * Append \p checkpointStep to the master checkpoint file
-     *
-     * @param checkpointStep current checkpoint step
-     */
-    void writeCheckpointStep(const uint32_t checkpointStep)
-    {
-        std::ofstream file;
-        const std::string checkpointMasterFile =
-            checkpointDirectory + std::string("/") + CHECKPOINT_MASTER_FILE;
+            showProgressAnyStep = uint32_t(
+                double(Environment<>::get().SimulationDescription().getRunSteps()) / 100. * double(progress));
+            if(showProgressAnyStep == 0)
+                showProgressAnyStep = 1;
+        }
 
-        file.open(checkpointMasterFile.c_str(), std::ofstream::app);
+        /**
+         * Append \p checkpointStep to the master checkpoint file
+         *
+         * @param checkpointStep current checkpoint step
+         */
+        void writeCheckpointStep(const uint32_t checkpointStep)
+        {
+            std::ofstream file;
+            const std::string checkpointMasterFile = checkpointDirectory + std::string("/") + CHECKPOINT_MASTER_FILE;
 
-        if (!file)
-            throw std::runtime_error("Failed to write checkpoint master file");
+            file.open(checkpointMasterFile.c_str(), std::ofstream::app);
 
-        file << checkpointStep << std::endl;
-        file.close();
-    }
+            if(!file)
+                throw std::runtime_error("Failed to write checkpoint master file");
 
-protected:
-    /**
-     * Reads the checkpoint master file if any and returns all found checkpoint steps
-     *
-     * @return vector of found checkpoints steps in order they appear in the file
-     */
-    std::vector<uint32_t> readCheckpointMasterFile()
-    {
-        std::vector<uint32_t> checkpoints;
+            file << checkpointStep << std::endl;
+            file.close();
+        }
 
-        const std::string checkpointMasterFile =
-            this->restartDirectory + std::string("/") + this->CHECKPOINT_MASTER_FILE;
+    protected:
+        /**
+         * Reads the checkpoint master file if any and returns all found checkpoint steps
+         *
+         * @return vector of found checkpoint steps in the order they appear in the file
+         */
+        std::vector<uint32_t> readCheckpointMasterFile()
+        {
+            std::vector<uint32_t> checkpoints;
 
-        if (!boost::filesystem::exists(checkpointMasterFile))
-            return checkpoints;
+            const std::string checkpointMasterFile
+                = this->restartDirectory + std::string("/") + this->CHECKPOINT_MASTER_FILE;
 
-        std::ifstream file(checkpointMasterFile.c_str());
+            if(!boost::filesystem::exists(checkpointMasterFile))
+                return checkpoints;
 
-        /* read each line */
-        std::string line;
-        while (std::getline(file, line))
-        {
-            if (line.empty())
-                continue;
-            try
-            {
-                checkpoints.push_back(boost::lexical_cast<uint32_t>(line));
-            }
-            catch (boost::bad_lexical_cast const&)
+            std::ifstream file(checkpointMasterFile.c_str());
+
+            /* read each line */
+            std::string line;
+            while(std::getline(file, line))
             {
-                std::cerr << "Warning: checkpoint master file contains invalid data ("
-                    << line << ")" << std::endl;
+                if(line.empty())
+                    continue;
+                try
+                {
+                    checkpoints.push_back(boost::lexical_cast<uint32_t>(line));
+                }
+                catch(boost::bad_lexical_cast const&)
+                {
+                    std::cerr << "Warning: checkpoint master file contains invalid data (" << line << ")" << std::endl;
+                }
             }
-        }
 
-        return checkpoints;
-    }
-private:
-
-    bool output;
+            return checkpoints;
+        }
 
-    uint16_t progress;
-    uint32_t showProgressAnyStep;
+    private:
+        bool output = false;
 
-    TimeIntervall tSimulation;
-    TimeIntervall tInit;
+        uint16_t progress;
+        uint32_t showProgressAnyStep;
 
-};
+        TimeIntervall tSimulation;
+        TimeIntervall tInit;
+    };
 
 } // namespace pmacc
diff --git a/include/pmacc/simulationControl/TimeInterval.hpp b/include/pmacc/simulationControl/TimeInterval.hpp
index 3df27d47bc..4be6c0c853 100644
--- a/include/pmacc/simulationControl/TimeInterval.hpp
+++ b/include/pmacc/simulationControl/TimeInterval.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Rene Widera, Benjamin Worpitz
+/* Copyright 2013-2021 Rene Widera, Benjamin Worpitz
  *
  * This file is part of PMacc.
  *
@@ -31,14 +31,12 @@
 
 namespace pmacc
 {
-
     class TimeIntervall
     {
     public:
-
         TimeIntervall()
         {
-            start=end=getTime();
+            start = end = getTime();
         }
 
         /*! Get the timestamp in msec
@@ -46,24 +44,24 @@ namespace pmacc
          */
         static double getTime()
         {
-            auto time( Clock::now().time_since_epoch() );
-            auto timestamp = std::chrono::duration_cast< Milliseconds >( time ).count();
-            return static_cast< double >(timestamp);
+            auto time(Clock::now().time_since_epoch());
+            auto timestamp = std::chrono::duration_cast<Milliseconds>(time).count();
+            return static_cast<double>(timestamp);
         }
 
         double toggleStart()
         {
-            return start=getTime();
+            return start = getTime();
         }
 
         double toggleEnd()
         {
-            return end=getTime();
+            return end = getTime();
         }
 
         double getInterval()
         {
-            return end-start;
+            return end - start;
         }
 
         std::string printInterval()
@@ -78,53 +76,52 @@ namespace pmacc
 
             int p_time;
 
-            bool write_all=false;
-            if(time/(3600.*1000.)>1.)
+            bool write_all = false;
+            if(time / (3600. * 1000.) > 1.)
             {
-                p_time=time/(3600.*1000.);
-                time=time-3600.*1000.*p_time;
-                outstr<<std::setw(2)<<p_time<<"h ";
-                write_all=true;
+                p_time = time / (3600. * 1000.);
+                time = time - 3600. * 1000. * p_time;
+                outstr << std::setw(2) << p_time << "h ";
+                write_all = true;
             }
 
 
-            if(write_all || time/(60*1000)>1.)
+            if(write_all || time / (60 * 1000) > 1.)
             {
-                p_time=time/(60.*1000.);
-                time=time-60.*1000.*p_time;
-                outstr<<std::setw(2)<<p_time<<"min ";
-                write_all=true;
+                p_time = time / (60. * 1000.);
+                time = time - 60. * 1000. * p_time;
+                outstr << std::setw(2) << p_time << "min ";
+                write_all = true;
             }
 
 
-            if(write_all || time/1000.>1.)
+            if(write_all || time / 1000. > 1.)
             {
-                p_time=time/1000.;
-                time=time-1000.*p_time;
-                outstr<<std::setw(2)<<p_time<<"sec ";
-                write_all=true;
+                p_time = time / 1000.;
+                time = time - 1000. * p_time;
+                outstr << std::setw(2) << p_time << "sec ";
+                write_all = true;
             }
 
 
-            if(write_all || time>1.)
+            if(write_all || time > 1.)
             {
-                outstr<<std::setw(3)<<(int)time<<"msec";
+                outstr << std::setw(3) << (int) time << "msec";
             }
 
             if(outstr.str().empty())
-                outstr<<"  0msec";
+                outstr << "  0msec";
 
             return outstr.str();
         }
 
     private:
-
         using Clock = std::chrono::high_resolution_clock;
-        template< class Duration >
-        using TimePoint = std::chrono::time_point< Clock, Duration >;
+        template<class Duration>
+        using TimePoint = std::chrono::time_point<Clock, Duration>;
         using Milliseconds = std::chrono::milliseconds;
 
         double start;
         double end;
     };
-} //namespace pmacc
+} // namespace pmacc
diff --git a/include/pmacc/static_assert.hpp b/include/pmacc/static_assert.hpp
index 7f279572e0..41a95d0b5e 100644
--- a/include/pmacc/static_assert.hpp
+++ b/include/pmacc/static_assert.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Felix Schmitt, Rene Widera
+/* Copyright 2013-2021 Axel Huebl, Felix Schmitt, Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -32,12 +32,12 @@ namespace pmacc
     {
     };
 
-    template<typename T_Type=StaticAssertError>
+    template<typename T_Type = StaticAssertError>
     struct GetStaticAssertInfoType
     {
         typedef T_Type type;
     };
-}
+} // namespace pmacc
 
 /** call BOOST_MPL_ASSERT_MSG and add unique id to message
  * @param pmacc_cond an integral constant expression
@@ -45,46 +45,54 @@ namespace pmacc
  * @param pmacc_unique_id pre compiler unique id
  * @param pmacc_typeInfo a type that is shown in error message
  */
-#if BOOST_LANG_CUDA && BOOST_COMP_CLANG_CUDA
+#if BOOST_LANG_CUDA && BOOST_COMP_CLANG_CUDA || BOOST_COMP_HIP
 /* device compile with clang: boost static assert can not be used
  * error is: calling a `__host__` function from `__device__`
  * Therefore C++11 `static_assert` is used
  */
-#   define PMACC_STATIC_ASSERT_MSG_DO2(pmacc_cond, pmacc_msg, pmacc_unique_id, pmacc_typeInfo) \
-        static_assert(pmacc_cond,#pmacc_msg)
+#    define PMACC_STATIC_ASSERT_MSG_DO2(pmacc_cond, pmacc_msg, pmacc_unique_id, pmacc_typeInfo)                       \
+        static_assert(pmacc_cond, #pmacc_msg)
 #else
-#   define PMACC_STATIC_ASSERT_MSG_DO2(pmacc_cond, pmacc_msg, pmacc_unique_id, pmacc_typeInfo) \
-        BOOST_MPL_ASSERT_MSG(pmacc_cond,PMACC_JOIN(pmacc_msg,PMACC_JOIN(_________,pmacc_unique_id)),(pmacc_typeInfo))
+#    define PMACC_STATIC_ASSERT_MSG_DO2(pmacc_cond, pmacc_msg, pmacc_unique_id, pmacc_typeInfo)                       \
+        BOOST_MPL_ASSERT_MSG(                                                                                         \
+            pmacc_cond,                                                                                               \
+            PMACC_JOIN(pmacc_msg, PMACC_JOIN(_________, pmacc_unique_id)),                                            \
+            (pmacc_typeInfo))
 #endif
 
 /*! static assert with error message
  * @param pmacc_cond A condition which return true or false.
- * @param pmacc_msg A message which is shown if the condition is false. Msg must a valid c++ variable name (etc. _only_human_make_mistakes)
+ * @param pmacc_msg A message which is shown if the condition is false. Msg must be a valid C++ identifier (e.g.
+ * _only_human_make_mistakes)
  * @param ... (optional) a type that is shown in error message
  */
-#define PMACC_STATIC_ASSERT_MSG(pmacc_cond,pmacc_msg,...)                      \
-    PMACC_STATIC_ASSERT_MSG_DO2(pmacc_cond,pmacc_msg,__COUNTER__,typename pmacc::GetStaticAssertInfoType<__VA_ARGS__>::type)
+#define PMACC_STATIC_ASSERT_MSG(pmacc_cond, pmacc_msg, ...)                                                           \
+    PMACC_STATIC_ASSERT_MSG_DO2(                                                                                      \
+        pmacc_cond,                                                                                                   \
+        pmacc_msg,                                                                                                    \
+        __COUNTER__,                                                                                                  \
+        typename pmacc::GetStaticAssertInfoType<__VA_ARGS__>::type)
 
 /*! static assert
  * @param pmacc_cond A condition which return true or false.
  */
-#define PMACC_STATIC_ASSERT(pmacc_cond)                                        \
-    PMACC_STATIC_ASSERT_MSG(pmacc_cond,STATIC_ASSERTION_FAILURE)
+#define PMACC_STATIC_ASSERT(pmacc_cond) PMACC_STATIC_ASSERT_MSG(pmacc_cond, STATIC_ASSERTION_FAILURE, )
 
 /*! static assert wrapper which is easier to use than \see PMACC_STATIC_ASSERT_MSG
- * @param pmacc_msg A message which is shown if the condition is false. Msg must a valid c++ variable name (etc. _only_human_make_mistakes)
+ * @param pmacc_msg A message which is shown if the condition is false. Msg must be a valid C++ identifier (e.g.
+ * _only_human_make_mistakes)
  * @param pmacc_typeInfo a type that is shown in error message
  * @param ... A condition which return true or false.
  */
-#define PMACC_CASSERT_MSG_TYPE(pmacc_msg,pmacc_typeInfo,...)                   \
-    PMACC_STATIC_ASSERT_MSG((__VA_ARGS__),pmacc_msg,pmacc_typeInfo)
+#define PMACC_CASSERT_MSG_TYPE(pmacc_msg, pmacc_typeInfo, ...)                                                        \
+    PMACC_STATIC_ASSERT_MSG((__VA_ARGS__), pmacc_msg, pmacc_typeInfo)
 
 /*! static assert wrapper which is easier to use than \see PMACC_STATIC_ASSERT_MSG
- * @param pmacc_msg A message which is shown if the condition is false. Msg must a valid c++ variable name (etc. _only_human_make_mistakes)
+ * @param pmacc_msg A message which is shown if the condition is false. Msg must be a valid C++ identifier (e.g.
+ * _only_human_make_mistakes)
  * @param ... A condition which return true or false.
  */
-#define PMACC_CASSERT_MSG(pmacc_msg,...)                                       \
-    PMACC_STATIC_ASSERT_MSG((__VA_ARGS__),pmacc_msg)
+#define PMACC_CASSERT_MSG(pmacc_msg, ...) PMACC_STATIC_ASSERT_MSG((__VA_ARGS__), pmacc_msg, )
 
 /*! static assert
  * @param ... A condition which return true or false.
@@ -98,17 +106,18 @@ namespace pmacc
  * @param nmspace The name of the namespace
  * @param var The variable to look for.
  */
-#define PMACC_DEF_IN_NAMESPACE_MSG(pmacc_msg,nmspace,var)                      \
-  namespace pmacc_msg {                                                        \
-    using nmspace::var;                                                        \
-    namespace fallback                                                         \
-    {                                                                          \
-      struct var                                                               \
-      {                                                                        \
-        double d[9999];                                                        \
-        char   c;                                                              \
-      };                                                                       \
-    }                                                                          \
-    using fallback::var;                                                       \
-  }                                                                            \
-  PMACC_CASSERT_MSG( pmacc_msg, ((sizeof(pmacc_msg::var))!=(sizeof(pmacc_msg::fallback::var))) );
+#define PMACC_DEF_IN_NAMESPACE_MSG(pmacc_msg, nmspace, var)                                                           \
+    namespace pmacc_msg                                                                                               \
+    {                                                                                                                 \
+        using nmspace::var;                                                                                           \
+        namespace fallback                                                                                            \
+        {                                                                                                             \
+            struct var                                                                                                \
+            {                                                                                                         \
+                double d[9999];                                                                                       \
+                char c;                                                                                               \
+            };                                                                                                        \
+        }                                                                                                             \
+        using fallback::var;                                                                                          \
+    }                                                                                                                 \
+    PMACC_CASSERT_MSG(pmacc_msg, ((sizeof(pmacc_msg::var)) != (sizeof(pmacc_msg::fallback::var))));
diff --git a/include/pmacc/test/PMaccFixture.hpp b/include/pmacc/test/PMaccFixture.hpp
index 3bbae4c491..a02037ce6b 100644
--- a/include/pmacc/test/PMaccFixture.hpp
+++ b/include/pmacc/test/PMaccFixture.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2016-2020 Alexander Grund
+/* Copyright 2016-2021 Alexander Grund
  *
  * This file is part of PMacc.
  *
@@ -28,29 +28,28 @@
 
 namespace pmacc
 {
-namespace test
-{
-
-/** Fixture that initializes PMacc for a given dimensionality */
-template<unsigned T_dim>
-struct PMaccFixture
-{
-    PMaccFixture()
+    namespace test
     {
-        const pmacc::DataSpace<T_dim> devices = pmacc::DataSpace<T_dim>::create(1);
-        const pmacc::DataSpace<T_dim> periodic = pmacc::DataSpace<T_dim>::create(1);
-        pmacc::Environment<T_dim>::get().initDevices(devices, periodic);
-    }
-
-    ~PMaccFixture()
-    {
-        /* finalize the PMacc context */
-        pmacc::Environment<>::get().finalize();
-    }
-};
-
-using PMaccFixture2D = PMaccFixture< 2 >;
-using PMaccFixture3D = PMaccFixture< 3 >;
-
-} // namespace test
+        /** Fixture that initializes PMacc for a given dimensionality */
+        template<unsigned T_dim>
+        struct PMaccFixture
+        {
+            PMaccFixture()
+            {
+                const pmacc::DataSpace<T_dim> devices = pmacc::DataSpace<T_dim>::create(1);
+                const pmacc::DataSpace<T_dim> periodic = pmacc::DataSpace<T_dim>::create(1);
+                pmacc::Environment<T_dim>::get().initDevices(devices, periodic);
+            }
+
+            ~PMaccFixture()
+            {
+                /* finalize the PMacc context */
+                pmacc::Environment<>::get().finalize();
+            }
+        };
+
+        using PMaccFixture2D = PMaccFixture<2>;
+        using PMaccFixture3D = PMaccFixture<3>;
+
+    } // namespace test
 } // namespace pmacc
diff --git a/include/pmacc/test/TemplateUT.cpp b/include/pmacc/test/TemplateUT.cpp
deleted file mode 100644
index 4156ce4131..0000000000
--- a/include/pmacc/test/TemplateUT.cpp
+++ /dev/null
@@ -1,54 +0,0 @@
-/* Copyright 2015-2020 Erik Zenker
- *
- * This file is part of PMacc.
- *
- * PMacc is free software: you can redistribute it and/or modify
- * it under the terms of either the GNU General Public License or
- * the GNU Lesser General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * PMacc is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License and the GNU Lesser General Public License
- * for more details.
- *
- * You should have received a copy of the GNU General Public License
- * and the GNU Lesser General Public License along with PMacc.
- * If not, see <http://www.gnu.org/licenses/>.
- */
-
-// STL
-#include <stdint.h> /* uint8_t */
-
-// BOOST
-#include <boost/test/unit_test.hpp>
-
-// Boost.Test documentation: http://www.boost.org/doc/libs/1_59_0/libs/test/doc/html/index.html
-
-/*******************************************************************************
- * Configuration
- ******************************************************************************/
-
-// Nothing to configure, but here could be
-// placed global variables, typedefs, classes.
-
-/*******************************************************************************
- * Test Suite
- ******************************************************************************/
-BOOST_AUTO_TEST_SUITE( template_unit_test )
-
-
-/***************************************************************************
- * Test Cases
- ****************************************************************************/
-
-// Normal test case
-BOOST_AUTO_TEST_CASE( first ){
-    BOOST_CHECK_EQUAL( sizeof(uint8_t), 1u );
-
-}
-
-
-BOOST_AUTO_TEST_SUITE_END()
diff --git a/include/pmacc/test/main.cpp b/include/pmacc/test/main.cpp
deleted file mode 100644
index bb1a505249..0000000000
--- a/include/pmacc/test/main.cpp
+++ /dev/null
@@ -1,32 +0,0 @@
-/* Copyright 2015-2020 Erik Zenker, Alexander Grund
- *
- * This file is part of PMacc.
- *
- * PMacc is free software: you can redistribute it and/or modify
- * it under the terms of either the GNU General Public License or
- * the GNU Lesser General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * PMacc is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License and the GNU Lesser General Public License
- * for more details.
- *
- * You should have received a copy of the GNU General Public License
- * and the GNU Lesser General Public License along with PMacc.
- * If not, see <http://www.gnu.org/licenses/>.
- */
-
-#define BOOST_TEST_MODULE "PMacc Unit Tests"
-#define BOOST_TEST_NO_MAIN
-#include <boost/test/unit_test.hpp>
-
-
-int main(int argc, char* argv[], char* envp[])
-{
-    int result = boost::unit_test::unit_test_main(&init_unit_test, argc, argv);
-
-    return result;
-}
diff --git a/include/pmacc/test/memory/HostBufferIntern/copyFrom.hpp b/include/pmacc/test/memory/HostBufferIntern/copyFrom.hpp
index cf1f932c53..97ffdfc3b7 100644
--- a/include/pmacc/test/memory/HostBufferIntern/copyFrom.hpp
+++ b/include/pmacc/test/memory/HostBufferIntern/copyFrom.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2015-2020 Erik Zenker
+/* Copyright 2015-2021 Erik Zenker
  *
  * This file is part of PMacc.
  *
@@ -26,70 +26,73 @@
 
 namespace pmacc
 {
-namespace test
-{
-namespace memory
-{
-namespace HostBufferIntern
-{
-
-/**
- * Checks if data is copied correctly from device to
- * host.
- */
-struct CopyFromTest {
-
-    template<typename T_Dim>
-    void exec(T_Dim)
+    namespace test
     {
-        using Data = uint8_t ;
-        using Extents = size_t;
-
-        using ::pmacc::test::memory::getElementsPerDim;
-
-        std::vector<size_t> nElementsPerDim = getElementsPerDim<T_Dim>();
-
-        for(unsigned i = 0; i < nElementsPerDim.size(); ++i){
-            ::pmacc::DataSpace<T_Dim::value> const dataSpace = ::pmacc::DataSpace<T_Dim::value>::create(nElementsPerDim[i]);
-            ::pmacc::HostBuffer<Data, T_Dim::value>* hostBufferIntern = new ::pmacc::HostBufferIntern<Data, T_Dim::value>(dataSpace);
-            ::pmacc::DeviceBuffer<Data, T_Dim::value>* deviceBufferIntern = new ::pmacc::DeviceBufferIntern<Data, T_Dim::value>(dataSpace);
-
-            hostBufferIntern->reset();
-
-            for(size_t i = 0; i < static_cast<size_t>(dataSpace.productOfComponents()); ++i){
-                hostBufferIntern->getPointer()[i] = static_cast<Data>(i);
-            }
-
-            deviceBufferIntern->copyFrom(*hostBufferIntern);
-            hostBufferIntern->reset();
-            hostBufferIntern->copyFrom(*deviceBufferIntern);
-
-            for(size_t i = 0; i < static_cast<size_t>(dataSpace.productOfComponents()); ++i){
-                BOOST_CHECK_EQUAL(hostBufferIntern->getPointer()[i], static_cast<Data>(i));
-            }
-
-            delete hostBufferIntern;
-            delete deviceBufferIntern;
-
-        }
-
-    }
-
-    PMACC_NO_NVCC_HDWARNING
-    template<typename T_Dim>
-    HDINLINE void operator()(T_Dim dim)
-    {
-        exec(dim);
-    }
-};
-
-} // namespace HostBufferIntern
-} // namespace memory
-} // namespace test
+        namespace memory
+        {
+            namespace HostBufferIntern
+            {
+                /**
+                 * Checks if data is copied correctly from device to
+                 * host.
+                 */
+                struct CopyFromTest
+                {
+                    template<typename T_Dim>
+                    void exec(T_Dim)
+                    {
+                        using Data = uint8_t;
+                        using Extents = size_t;
+
+                        using ::pmacc::test::memory::getElementsPerDim;
+
+                        std::vector<size_t> nElementsPerDim = getElementsPerDim<T_Dim>();
+
+                        for(unsigned i = 0; i < nElementsPerDim.size(); ++i)
+                        {
+                            ::pmacc::DataSpace<T_Dim::value> const dataSpace
+                                = ::pmacc::DataSpace<T_Dim::value>::create(nElementsPerDim[i]);
+                            ::pmacc::HostBuffer<Data, T_Dim::value>* hostBufferIntern
+                                = new ::pmacc::HostBufferIntern<Data, T_Dim::value>(dataSpace);
+                            ::pmacc::DeviceBuffer<Data, T_Dim::value>* deviceBufferIntern
+                                = new ::pmacc::DeviceBufferIntern<Data, T_Dim::value>(dataSpace);
+
+                            hostBufferIntern->reset();
+
+                            for(size_t i = 0; i < static_cast<size_t>(dataSpace.productOfComponents()); ++i)
+                            {
+                                hostBufferIntern->getPointer()[i] = static_cast<Data>(i);
+                            }
+
+                            deviceBufferIntern->copyFrom(*hostBufferIntern);
+                            hostBufferIntern->reset();
+                            hostBufferIntern->copyFrom(*deviceBufferIntern);
+
+                            for(size_t i = 0; i < static_cast<size_t>(dataSpace.productOfComponents()); ++i)
+                            {
+                                REQUIRE(hostBufferIntern->getPointer()[i] == static_cast<Data>(i));
+                            }
+
+                            delete hostBufferIntern;
+                            delete deviceBufferIntern;
+                        }
+                    }
+
+                    PMACC_NO_NVCC_HDWARNING
+                    template<typename T_Dim>
+                    HDINLINE void operator()(T_Dim dim)
+                    {
+                        exec(dim);
+                    }
+                };
+
+            } // namespace HostBufferIntern
+        } // namespace memory
+    } // namespace test
 } // namespace pmacc
 
-BOOST_AUTO_TEST_CASE( copyFrom )
+TEST_CASE("HostBufferIntern::copyFrom", "[copyFrom]")
 {
     using namespace pmacc::test::memory::HostBufferIntern;
-    ::boost::mpl::for_each< Dims >( CopyFromTest() );
+    ::boost::mpl::for_each<Dims>(CopyFromTest());
 }
diff --git a/include/pmacc/test/memory/HostBufferIntern/reset.hpp b/include/pmacc/test/memory/HostBufferIntern/reset.hpp
index 49f9f55715..d4cb235a3d 100644
--- a/include/pmacc/test/memory/HostBufferIntern/reset.hpp
+++ b/include/pmacc/test/memory/HostBufferIntern/reset.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2015-2020 Erik Zenker
+/* Copyright 2015-2021 Erik Zenker
  *
  * This file is part of PMacc.
  *
@@ -26,58 +26,57 @@
 
 namespace pmacc
 {
-namespace test
-{
-namespace memory
-{
-namespace HostBufferIntern
-{
-
-/**
- * Checks if the HostBufferIntern is reseted correctly to zero.
- */
-struct ResetTest {
-
-    template<typename T_Dim>
-    void exec(T_Dim)
+    namespace test
     {
-        using Data = uint8_t ;
-        using Extents = size_t;
-
-        using ::pmacc::test::memory::getElementsPerDim;
-
-        std::vector<size_t> nElementsPerDim = getElementsPerDim<T_Dim>();
-
-        for(unsigned i = 0; i < nElementsPerDim.size(); ++i)
+        namespace memory
         {
-            ::pmacc::DataSpace<T_Dim::value> const dataSpace = ::pmacc::DataSpace<T_Dim::value>::create(nElementsPerDim[i]);
-            ::pmacc::HostBufferIntern<Data, T_Dim::value> hostBufferIntern(dataSpace);
-
-            hostBufferIntern.reset();
-
-            for(size_t i = 0; i < static_cast<size_t>(dataSpace.productOfComponents()); ++i){
-                BOOST_CHECK_EQUAL( hostBufferIntern.getPointer()[i], 0 );
-            }
-
-        }
-
-    }
-
-    PMACC_NO_NVCC_HDWARNING
-    template<typename T_Dim>
-    HDINLINE void operator()(T_Dim dim)
-    {
-        exec(dim);
-    }
-};
-
-} // namespace HostBufferIntern
-} // namespace memory
-} // namespace test
+            namespace HostBufferIntern
+            {
+                /**
+                 * Checks if the HostBufferIntern is reset correctly to zero.
+                 */
+                struct ResetTest
+                {
+                    template<typename T_Dim>
+                    void exec(T_Dim)
+                    {
+                        using Data = uint8_t;
+                        using Extents = size_t;
+
+                        using ::pmacc::test::memory::getElementsPerDim;
+
+                        std::vector<size_t> nElementsPerDim = getElementsPerDim<T_Dim>();
+
+                        for(unsigned i = 0; i < nElementsPerDim.size(); ++i)
+                        {
+                            ::pmacc::DataSpace<T_Dim::value> const dataSpace
+                                = ::pmacc::DataSpace<T_Dim::value>::create(nElementsPerDim[i]);
+                            ::pmacc::HostBufferIntern<Data, T_Dim::value> hostBufferIntern(dataSpace);
+
+                            hostBufferIntern.reset();
+
+                            for(size_t i = 0; i < static_cast<size_t>(dataSpace.productOfComponents()); ++i)
+                            {
+                                REQUIRE(hostBufferIntern.getPointer()[i] == 0);
+                            }
+                        }
+                    }
+
+                    PMACC_NO_NVCC_HDWARNING
+                    template<typename T_Dim>
+                    HDINLINE void operator()(T_Dim dim)
+                    {
+                        exec(dim);
+                    }
+                };
+
+            } // namespace HostBufferIntern
+        } // namespace memory
+    } // namespace test
 } // namespace pmacc
 
-BOOST_AUTO_TEST_CASE( reset )
+TEST_CASE("HostBufferIntern::reset", "[reset]")
 {
     using namespace pmacc::test::memory::HostBufferIntern;
-    ::boost::mpl::for_each< Dims >( ResetTest() );
+    ::boost::mpl::for_each<Dims>(ResetTest());
 }
diff --git a/include/pmacc/test/memory/HostBufferIntern/setValue.hpp b/include/pmacc/test/memory/HostBufferIntern/setValue.hpp
index c2cd58537e..cd72c5cf67 100644
--- a/include/pmacc/test/memory/HostBufferIntern/setValue.hpp
+++ b/include/pmacc/test/memory/HostBufferIntern/setValue.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2015-2020 Erik Zenker
+/* Copyright 2015-2021 Erik Zenker
  *
  * This file is part of PMacc.
  *
@@ -26,62 +26,59 @@
 
 namespace pmacc
 {
-namespace test
-{
-namespace memory
-{
-namespace HostBufferIntern
-{
-
-/**
- * Checks if the HostBufferIntern is set to a constant value.
- */
-struct setValueTest
-{
-
-    template<typename T_Dim>
-    void exec(T_Dim)
+    namespace test
     {
-        using Data = uint8_t ;
-        using Extents = size_t;
-
-        using ::pmacc::test::memory::getElementsPerDim;
-
-        std::vector<size_t> nElementsPerDim = getElementsPerDim<T_Dim>();
-
-        for(size_t i = 0; i < nElementsPerDim.size(); ++i)
+        namespace memory
         {
-            ::pmacc::DataSpace<T_Dim::value> const dataSpace = ::pmacc::DataSpace<T_Dim::value>::create(nElementsPerDim[i]);
-            ::pmacc::HostBufferIntern<Data, T_Dim::value> hostBufferIntern(dataSpace);
-
-            const Data value = 255;
-            hostBufferIntern.setValue(value);
-
-            auto ptr = hostBufferIntern.getPointer( );
-            for(size_t j = 0; j < static_cast<size_t>(dataSpace.productOfComponents()); ++j)
+            namespace HostBufferIntern
             {
-                BOOST_CHECK_EQUAL( ptr[j], value );
-            }
-
-        }
-
-    }
-
-    PMACC_NO_NVCC_HDWARNING
-    template<typename T_Dim>
-    HDINLINE void operator()(T_Dim dim)
-    {
-        exec(dim);
-    }
-};
-
-} // namespace HostBufferIntern
-} // namespace memory
-} // namespace test
+                /**
+                 * Checks if the HostBufferIntern is set to a constant value.
+                 */
+                struct setValueTest
+                {
+                    /** Fill a host buffer of every configured size with 255 and compare each element. */
+                    template<typename T_Dim>
+                    void exec(T_Dim)
+                    {
+                        using Data = uint8_t;
+                        using ::pmacc::test::memory::getElementsPerDim;
+
+                        std::vector<size_t> const nElementsPerDim = getElementsPerDim<T_Dim>();
+
+                        for(size_t i = 0; i < nElementsPerDim.size(); ++i)
+                        {
+                            ::pmacc::DataSpace<T_Dim::value> const dataSpace
+                                = ::pmacc::DataSpace<T_Dim::value>::create(nElementsPerDim[i]);
+                            ::pmacc::HostBufferIntern<Data, T_Dim::value> hostBufferIntern(dataSpace);
+                            // the element count does not change while checking, evaluate it only once
+                            size_t const numElements = static_cast<size_t>(dataSpace.productOfComponents());
+
+                            const Data value = 255;
+                            hostBufferIntern.setValue(value);
+                            auto ptr = hostBufferIntern.getPointer();
+                            for(size_t j = 0; j < numElements; ++j)
+                            {
+                                REQUIRE(ptr[j] == value);
+                            }
+                        }
+                    }
+
+                    PMACC_NO_NVCC_HDWARNING
+                    template<typename T_Dim>
+                    HDINLINE void operator()(T_Dim dim)
+                    {
+                        exec(dim);
+                    }
+                };
+
+            } // namespace HostBufferIntern
+        } // namespace memory
+    } // namespace test
 } // namespace pmacc
 
-BOOST_AUTO_TEST_CASE( setValue )
+TEST_CASE("HostBufferIntern::setValue", "[setValue]")
 {
     using namespace pmacc::test::memory::HostBufferIntern;
-    ::boost::mpl::for_each< Dims >( setValueTest() );
+    ::boost::mpl::for_each<Dims>(setValueTest());
 }
diff --git a/include/pmacc/test/memory/memoryUT.cpp b/include/pmacc/test/memory/memoryUT.cpp
index 731ac104e4..a4a814e8c1 100644
--- a/include/pmacc/test/memory/memoryUT.cpp
+++ b/include/pmacc/test/memory/memoryUT.cpp
@@ -1,4 +1,4 @@
-/* Copyright 2015-2020 Erik Zenker, Alexander Grund
+/* Copyright 2015-2021 Erik Zenker, Alexander Grund
  *
  * This file is part of PMacc.
  *
@@ -19,15 +19,17 @@
  * If not, see <http://www.gnu.org/licenses/>.
  */
 
-#include "pmacc/test/PMaccFixture.hpp"
+#include <pmacc/boost_workaround.hpp>
+#include <pmacc/test/PMaccFixture.hpp>
 
 // STL
 #include <stdint.h> /* uint8_t */
 #include <iostream> /* cout, endl */
 #include <string>
 
+#include <catch2/catch.hpp>
+
 // BOOST
-#include <boost/test/unit_test.hpp>
 #include <boost/mpl/list.hpp>
 #include <boost/mpl/for_each.hpp>
 #include <boost/mpl/int.hpp>
@@ -48,41 +50,42 @@
 
 namespace pmacc
 {
-namespace test
-{
-namespace memory
-{
-
-/*******************************************************************************
- * Configuration
- ******************************************************************************/
-
-/**
- * Defines for which numbers of elements a
- * test should be verfied e.g. the size
- * of a host or device buffer.
- */
-template<typename T_Dim>
-std::vector<size_t> getElementsPerDim(){
-    std::vector<size_t> nElements;
-    std::vector<size_t> nElementsPerDim;
-
-    // Elements total
-    nElements.push_back(1);
-    nElements.push_back(1 * 1000);
-    nElements.push_back(1 * 1000 * 1000);
-    nElements.push_back(1 * 1000 * 1000 * 10);
-
-    // Elements per dimension
-    for(size_t i = 0; i < nElements.size(); ++i){
-        nElementsPerDim.push_back(std::pow(nElements[i], static_cast<double>(1)/static_cast<double>(T_Dim::value)));
-
-    }
-    return nElementsPerDim;
-}
-
-} // namespace memory
-} // namespace test
+    namespace test
+    {
+        namespace memory
+        {
+            /*******************************************************************************
+             * Configuration
+             ******************************************************************************/
+
+            /**
+             * Defines for which numbers of elements a test should be verified,
+             * e.g. the size of a host or device buffer.
+             *
+             * @tparam T_Dim dimension tag, T_Dim::value is the number of dimensions
+             * @return for each total size the largest extent whose T_Dim::value-th power fits into it
+             */
+            template<typename T_Dim>
+            std::vector<size_t> getElementsPerDim()
+            {
+                // Elements total
+                std::vector<size_t> const nElements{1, 1000, 1000 * 1000, 1000 * 1000 * 10};
+                std::vector<size_t> nElementsPerDim;
+                double const dim = static_cast<double>(T_Dim::value);
+
+                // Elements per dimension
+                for(size_t const n : nElements)
+                {
+                    // pow() can yield e.g. 9.99... for the cube root of 1000; a plain cast would truncate it to 9
+                    auto const root = static_cast<size_t>(std::llround(std::pow(static_cast<double>(n), 1.0 / dim)));
+                    // rounding may overshoot (sqrt(1000) rounds to 32), so step back to the floor of the exact root
+                    nElementsPerDim.push_back(std::pow(static_cast<double>(root), dim) > n ? root - 1 : root);
+                }
+                return nElementsPerDim;
+            }
+
+        } // namespace memory
+    } // namespace test
 } // namespace pmacc
 
 /**
@@ -91,23 +94,15 @@ std::vector<size_t> getElementsPerDim(){
  * each dimension setup automatically. For this
  * purpose boost::mpl::for_each is used.
  */
-using Dims = ::boost::mpl::list< boost::mpl::int_< DIM1 >,
-                                 boost::mpl::int_< DIM2 >,
-                                 boost::mpl::int_< DIM3 > >;
+using Dims = ::boost::mpl::list<boost::mpl::int_<DIM1>, boost::mpl::int_<DIM2>, boost::mpl::int_<DIM3>>;
 
 /*******************************************************************************
  * Test Suites
  ******************************************************************************/
-using MyPMaccFixture = pmacc::test::PMaccFixture< TEST_DIM >;
-
-BOOST_GLOBAL_FIXTURE( MyPMaccFixture );
-
-BOOST_AUTO_TEST_SUITE( memory )
+using MyPMaccFixture = pmacc::test::PMaccFixture<TEST_DIM>;
 
-  BOOST_AUTO_TEST_SUITE( HostBufferIntern )
-#   include "HostBufferIntern/copyFrom.hpp"
-#   include "HostBufferIntern/reset.hpp"
-#   include "HostBufferIntern/setValue.hpp"
-  BOOST_AUTO_TEST_SUITE_END()
+static MyPMaccFixture fixture;
 
-BOOST_AUTO_TEST_SUITE_END()
+#include "HostBufferIntern/copyFrom.hpp"
+#include "HostBufferIntern/reset.hpp"
+#include "HostBufferIntern/setValue.hpp"
diff --git a/include/pmacc/test/particles/IdProvider.hpp b/include/pmacc/test/particles/IdProvider.hpp
index d41ad891f5..d42aab1971 100644
--- a/include/pmacc/test/particles/IdProvider.hpp
+++ b/include/pmacc/test/particles/IdProvider.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2016-2020 Alexander Grund
+/* Copyright 2016-2021 Alexander Grund
  *
  * This file is part of PMacc.
  *
@@ -32,163 +32,136 @@
 #include <boost/mpl/list.hpp>
 #include <boost/mpl/for_each.hpp>
 #include <boost/mpl/int.hpp>
-#include <boost/test/unit_test.hpp>
+
+#include <catch2/catch.hpp>
+
 #include <set>
 #include <algorithm>
 #include <stdint.h>
 
-BOOST_AUTO_TEST_SUITE( particles )
-
 
 namespace pmacc
 {
-namespace test
-{
-namespace particles
-{
-    namespace bmpl = boost::mpl;
-
-    template<
-        uint32_t T_numWorkers,
-        uint32_t T_numIdsPerBlock,
-        typename T_IdProvider
-    >
-    struct GenerateIds
+    namespace test
     {
-        template<class T_Box, typename T_Acc>
-        HDINLINE void operator()(const T_Acc & acc, T_Box outputbox, uint32_t numThreads, uint32_t numIdsPerThread) const
+        namespace particles
         {
-            using namespace ::pmacc;
-            using namespace mappings::threads;
-
-            constexpr uint32_t numWorkers = T_numWorkers;
-
-            uint32_t const workerIdx = threadIdx.x;
-
-            uint32_t const blockId = blockIdx.x * T_numIdsPerBlock;
-            ForEachIdx<
-                IdxConfig<
-                    T_numIdsPerBlock,
-                    numWorkers
-                >
-            >{ workerIdx }(
-                [&](
-                    uint32_t const linearId,
-                    uint32_t const
-                )
+            namespace bmpl = boost::mpl;
+
+            template<uint32_t T_numWorkers, uint32_t T_numIdsPerBlock, typename T_IdProvider>
+            struct GenerateIds
+            {
+                template<class T_Box, typename T_Acc>
+                HDINLINE void operator()(
+                    const T_Acc& acc,
+                    T_Box outputbox,
+                    uint32_t numThreads,
+                    uint32_t numIdsPerThread) const
                 {
-                    uint32_t const localId = blockId + linearId;
-                    if( localId < numThreads )
-                    {
-                        for( uint32_t i = 0u; i < numIdsPerThread; i++ )
-                            outputbox( i * numThreads + localId ) = T_IdProvider::getNewId( );
-                    }
+                    using namespace ::pmacc;
+                    using namespace mappings::threads;
+
+                    constexpr uint32_t numWorkers = T_numWorkers;
+
+                    uint32_t const workerIdx = cupla::threadIdx(acc).x;
+
+                    uint32_t const blockId = cupla::blockIdx(acc).x * T_numIdsPerBlock;
+                    ForEachIdx<IdxConfig<T_numIdsPerBlock, numWorkers>>{workerIdx}(
+                        [&](uint32_t const linearId, uint32_t const) {
+                            uint32_t const localId = blockId + linearId;
+                            if(localId < numThreads)
+                            {
+                                for(uint32_t i = 0u; i < numIdsPerThread; i++)
+                                    outputbox(i * numThreads + localId) = T_IdProvider::getNewId();
+                            }
+                        });
+                }
+            };
+
+            /** Check whether the presence of a value in a collection matches the expectation
+             *
+             * Use like: REQUIRE(checkDuplicate(col, value, true|false));
+             * @param col Container to be searched
+             * @param value Value to search for
+             * @param shouldFind Whether the value is expected in the collection or not
+             * @return false (after printing the collection to std::cout) if the value is missing although
+             *         shouldFind is true or present although shouldFind is false, otherwise true
+             */
+            template<class T_Collection, typename T>
+            bool checkDuplicate(const T_Collection& col, const T& value, bool shouldFind)
+            {
+                bool const found = std::find(col.begin(), col.end(), value) != col.end();
+                if(found != shouldFind)
+                {
+                    if(shouldFind)
+                        std::cout << "Value not found: ";
+                    else
+                        std::cout << "Duplicate found: ";
+                    std::cout << value << ". Values=[";
+                    for(auto const& entry : col)
+                        std::cout << entry << ",";
+                    std::cout << "]" << std::endl;
+                    return false;
                 }
-            );
-
-
-        }
-    };
 
-/**
- * Boost.Test compatible function that checks if a value is in a collection
- * Use like: BOOST_REQUIRE(checkDuplicate(col, value, true|false));
- * @param col Container to be searched
- * @param value Value to search for
- * @param shouldFind Whether the value is expected in the collection or not
- * @return Error-Value, if the value is not found and shouldFind is true or
- *         the value is found and shouldFind is false, otherwise a True-Value
- */
-template<class T_Collection, typename T>
-boost::test_tools::predicate_result
-checkDuplicate(const T_Collection& col, const T& value, bool shouldFind)
-{
-    if((std::find(col.begin(), col.end(), value) != col.end()) != shouldFind)
-    {
-        boost::test_tools::predicate_result res(false);
-        if(shouldFind)
-            res.message() << "Value not found found: ";
-        else
-            res.message() << "Duplicate found: ";
-        res.message() << value << ". Values=[";
-        for(typename T_Collection::const_iterator it = col.begin(); it != col.end(); ++it)
-            res.message() << *it << ",";
-        res.message() << "]";
-        return res;
-    }
-
-    return true;
-}
+                return true;
+            }
 
 
-template<unsigned T_dim>
-struct IdProviderTest
-{
-    void operator()()
-    {
-        using namespace ::pmacc;
-
-        constexpr uint32_t numBlocks = 4;
-        constexpr uint32_t numIdsPerBlock = 64;
-        constexpr uint32_t numThreads = numBlocks * numIdsPerBlock;
-        constexpr uint32_t numIdsPerThread = 2;
-        constexpr uint32_t numIds = numThreads * numIdsPerThread;
-
-        using IdProvider = IdProvider< T_dim >;
-        IdProvider::init();
-        // Check initial state
-        typename IdProvider::State state = IdProvider::getState();
-        BOOST_REQUIRE_EQUAL(state.startId, state.nextId);
-        BOOST_REQUIRE_EQUAL(state.maxNumProc, 1u);
-        BOOST_REQUIRE(!IdProvider::isOverflown());
-        std::set<uint64_t> ids;
-        BOOST_REQUIRE_EQUAL(IdProvider::getNewIdHost(), state.nextId);
-        // Generate some IDs using the function
-        for(int i=0; i<numIds; i++)
-        {
-            const uint64_t newId = IdProvider::getNewIdHost();
-            BOOST_REQUIRE(checkDuplicate(ids, newId, false));
-            ids.insert(newId);
-        }
-        // Reset the state
-        IdProvider::setState(state);
-        BOOST_REQUIRE_EQUAL(IdProvider::getNewIdHost(), state.nextId);
-        // Generate the same IDs on the device
-        HostDeviceBuffer< uint64_t, 1 > idBuf(numIds);
-        constexpr uint32_t numWorkers = traits::GetNumWorkers<
-            numIdsPerBlock
-        >::value;
-        PMACC_KERNEL( GenerateIds<
-            numWorkers,
-            numIdsPerBlock,
-            IdProvider
-        >{  })(
-            numBlocks,
-            numWorkers
-        )(
-            idBuf.getDeviceBuffer().getDataBox(),
-            numThreads,
-            numIdsPerThread
-        );
-        idBuf.deviceToHost();
-        BOOST_REQUIRE_EQUAL(numIds, ids.size());
-        auto hostBox = idBuf.getHostBuffer().getDataBox();
-        // Make sure they are the same
-        for(uint32_t i=0; i<numIds; i++)
-        {
-            BOOST_REQUIRE(checkDuplicate(ids, hostBox(i), true));
-        }
-    }
-};
+            /** Checks that host generated IDs are unique and that the device reproduces the same IDs. */
+            template<unsigned T_dim>
+            struct IdProviderTest
+            {
+                void operator()()
+                {
+                    using namespace ::pmacc;
+                    constexpr uint32_t numBlocks = 4;
+                    constexpr uint32_t numIdsPerBlock = 64;
+                    constexpr uint32_t numThreads = numBlocks * numIdsPerBlock;
+                    constexpr uint32_t numIdsPerThread = 2;
+                    constexpr uint32_t numIds = numThreads * numIdsPerThread;
+
+                    using IdProvider = IdProvider<T_dim>;
+                    IdProvider::init();
+                    // Check initial state
+                    typename IdProvider::State state = IdProvider::getState();
+                    REQUIRE(state.startId == state.nextId);
+                    REQUIRE(state.maxNumProc == 1u);
+                    REQUIRE(!IdProvider::isOverflown());
+                    std::set<uint64_t> ids;
+                    REQUIRE(IdProvider::getNewIdHost() == state.nextId);
+                    // Generate some IDs using the function (unsigned index: numIds is unsigned as well)
+                    for(uint32_t i = 0u; i < numIds; i++)
+                    {
+                        const uint64_t newId = IdProvider::getNewIdHost();
+                        REQUIRE(checkDuplicate(ids, newId, false));
+                        ids.insert(newId);
+                    }
+                    // Reset the state: the provider must hand out the same first ID again
+                    IdProvider::setState(state);
+                    REQUIRE(IdProvider::getNewIdHost() == state.nextId);
+                    // Generate the same IDs on the device
+                    HostDeviceBuffer<uint64_t, 1> idBuf(numIds);
+                    constexpr uint32_t numWorkers = traits::GetNumWorkers<numIdsPerBlock>::value;
+                    PMACC_KERNEL(GenerateIds<numWorkers, numIdsPerBlock, IdProvider>{})
+                    (numBlocks, numWorkers)(idBuf.getDeviceBuffer().getDataBox(), numThreads, numIdsPerThread);
+                    idBuf.deviceToHost();
+                    REQUIRE(numIds == ids.size());
+                    auto hostBox = idBuf.getHostBuffer().getDataBox();
+                    // Make sure they are the same
+                    for(uint32_t i = 0u; i < numIds; i++)
+                    {
+                        REQUIRE(checkDuplicate(ids, hostBox(i), true));
+                    }
+                }
+            };
 
-} // namespace particles
-} // namespace test
+        } // namespace particles
+    } // namespace test
 } // namespace pmacc
 
-BOOST_AUTO_TEST_CASE(IdProvider)
+TEST_CASE("particles::IDProvider", "[IDProvider]")
 {
     using namespace pmacc::test::particles;
     IdProviderTest<TEST_DIM>()();
 }
-
-BOOST_AUTO_TEST_SUITE_END()
diff --git a/include/pmacc/test/particles/memory/SuperCell.hpp b/include/pmacc/test/particles/memory/SuperCell.hpp
index d552424de9..40994ae451 100644
--- a/include/pmacc/test/particles/memory/SuperCell.hpp
+++ b/include/pmacc/test/particles/memory/SuperCell.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2018-2020 Rene Widera
+/* Copyright 2018-2021 Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -26,90 +26,70 @@
 
 namespace pmacc
 {
-namespace test
-{
-namespace particles
-{
-namespace memory
-{
-
-template< typename T_SuperCell >
-struct TestNumParticlesLastFrame
-{
-    struct FrameTypeDummy
+    namespace test
     {
-        using SuperCellSize = T_SuperCell;
-    };
+        namespace particles
+        {
+            namespace memory
+            {
+                template<typename T_SuperCell>
+                struct TestNumParticlesLastFrame
+                {
+                    struct FrameTypeDummy
+                    {
+                        using SuperCellSize = T_SuperCell;
+                    };
 
-    /** test a combination
-     *
-     * @param numParticlesPerCell number of particles within the test supercell
-     * @param particleLastFrame the assumed result with the given number of particles
-     *                          and T_SuperCell
-     */
-    HDINLINE void operator()(
-        uint32_t numParticlesPerCell,
-        uint32_t particleLastFrame
-    )
-    {
-        pmacc::SuperCell< FrameTypeDummy > superCell;
-        superCell.setNumParticles( numParticlesPerCell );
+                    /** test a combination
+                     *
+                     * @param numParticlesPerCell number of particles within the test supercell
+                     * @param particleLastFrame the assumed result with the given number of particles
+                     *                          and T_SuperCell
+                     */
+                    HINLINE void operator()(uint32_t numParticlesPerCell, uint32_t particleLastFrame)
+                    {
+                        pmacc::SuperCell<FrameTypeDummy> superCell;
+                        superCell.setNumParticles(numParticlesPerCell);
 
-        BOOST_CHECK_EQUAL(
-            superCell.getSizeLastFrame(),
-            particleLastFrame
-        );
-    }
-};
+                        REQUIRE(superCell.getSizeLastFrame() == particleLastFrame);
+                    }
+                };
 
-} // namespace memory
-} // namespace particles
-} // namespace test
+            } // namespace memory
+        } // namespace particles
+    } // namespace test
 } // namespace pmacc
 
 /* The supercell test is always performed with a 3 dimensional supercell
  * because the supercell is agnostic about the number of dimensions.
  */
-BOOST_AUTO_TEST_CASE( copyFrom )
+TEST_CASE("particles::SuperCell", "[SuperCell]")
 {
     using namespace pmacc::test::particles::memory;
-    TestNumParticlesLastFrame<
-        pmacc::math::CT::Int<
-            8,
-            8,
-            4
-        >
-    > cell256{};
+    TestNumParticlesLastFrame<pmacc::math::CT::Int<8, 8, 4>> cell256{};
 
     // no particles in the supercell
-    cell256( 0u, 0u );
+    cell256(0u, 0u);
     // one full frame
-    cell256( 256u, 256u );
+    cell256(256u, 256u);
     // two full frames
-    cell256( 512u, 256u );
+    cell256(512u, 256u);
     // edge cases
-    cell256( 255u, 255u );
-    cell256( 257u, 1u );
-    cell256( 1u, 1u );
+    cell256(255u, 255u);
+    cell256(257u, 1u);
+    cell256(1u, 1u);
 
     using namespace pmacc::test::particles::memory;
-    TestNumParticlesLastFrame<
-        pmacc::math::CT::Int<
-            3,
-            3,
-            3
-        >
-    > cell27{};
+    TestNumParticlesLastFrame<pmacc::math::CT::Int<3, 3, 3>> cell27{};
 
     // no particles in the supercell
-    cell27( 0u, 0u );
+    cell27(0u, 0u);
     // one full frame
-    cell27( 27u, 27u );
+    cell27(27u, 27u);
     // two full frames
-    cell27( 54u, 27u );
+    cell27(54u, 27u);
     // edge cases
-    cell27( 26u, 26u );
-    cell27( 28u, 1u );
-    cell27( 1u, 1u );
-
+    cell27(26u, 26u);
+    cell27(28u, 1u);
+    cell27(1u, 1u);
 }
diff --git a/include/pmacc/test/particles/particlesUT.cpp b/include/pmacc/test/particles/particlesUT.cpp
index 8ce68203a0..bad91a48aa 100644
--- a/include/pmacc/test/particles/particlesUT.cpp
+++ b/include/pmacc/test/particles/particlesUT.cpp
@@ -1,4 +1,4 @@
-/* Copyright 2016-2020 Alexander Grund
+/* Copyright 2016-2021 Alexander Grund
  *
  * This file is part of PMacc.
  *
@@ -19,17 +19,18 @@
  * If not, see <http://www.gnu.org/licenses/>.
  */
 
-#include "pmacc/test/PMaccFixture.hpp"
+#include <pmacc/boost_workaround.hpp>
+#include <pmacc/test/PMaccFixture.hpp>
 
-#include <boost/test/unit_test.hpp>
+#include <catch2/catch.hpp>
 
 
 #if TEST_DIM == 2
-    using pmacc::test::PMaccFixture2D;
-    BOOST_GLOBAL_FIXTURE( PMaccFixture2D );
+using pmacc::test::PMaccFixture2D;
+static PMaccFixture2D fixture;
 #else
-    using pmacc::test::PMaccFixture3D;
-    BOOST_GLOBAL_FIXTURE( PMaccFixture3D );
+using pmacc::test::PMaccFixture3D;
+static PMaccFixture3D fixture;
 #endif
 
 #include "IdProvider.hpp"
diff --git a/include/pmacc/test/random/2DDistribution.cpp b/include/pmacc/test/random/2DDistribution.cpp
index bbee7d68e9..cafe63aac4 100644
--- a/include/pmacc/test/random/2DDistribution.cpp
+++ b/include/pmacc/test/random/2DDistribution.cpp
@@ -1,4 +1,4 @@
-/* Copyright 2016-2020 Alexander Grund
+/* Copyright 2016-2021 Alexander Grund
  *
  * This file is part of PMacc.
  *
@@ -19,19 +19,20 @@
  * If not, see <http://www.gnu.org/licenses/>.
  */
 
-#include "pmacc/types.hpp"
-#include "pmacc/memory/buffers/HostDeviceBuffer.hpp"
-#include "pmacc/random/RNGProvider.hpp"
-#include "pmacc/random/distributions/Uniform.hpp"
-#include "pmacc/random/methods/AlpakaRand.hpp"
-#include "pmacc/dimensions/DataSpace.hpp"
-#include "pmacc/assert.hpp"
-#include "pmacc/mappings/threads/ForEachIdx.hpp"
-#include "pmacc/mappings/threads/IdxConfig.hpp"
-#include "pmacc/traits/GetNumWorkers.hpp"
-#include "pmacc/dataManagement/ISimulationData.hpp"
-#include "pmacc/Environment.hpp"
-#include "pmacc/eventSystem/tasks/ITask.hpp"
+#include <pmacc/boost_workaround.hpp>
+#include <pmacc/types.hpp>
+#include <pmacc/memory/buffers/HostDeviceBuffer.hpp>
+#include <pmacc/random/RNGProvider.hpp>
+#include <pmacc/random/distributions/Uniform.hpp>
+#include <pmacc/random/methods/AlpakaRand.hpp>
+#include <pmacc/dimensions/DataSpace.hpp>
+#include <pmacc/assert.hpp>
+#include <pmacc/mappings/threads/ForEachIdx.hpp>
+#include <pmacc/mappings/threads/IdxConfig.hpp>
+#include <pmacc/traits/GetNumWorkers.hpp>
+#include <pmacc/dataManagement/ISimulationData.hpp>
+#include <pmacc/Environment.hpp>
+#include <pmacc/eventSystem/tasks/ITask.hpp>
 
 #include <stdint.h>
 #include <iostream>
@@ -41,295 +42,261 @@
 
 namespace pmacc
 {
-namespace test
-{
-namespace random
-{
+    namespace test
+    {
+        namespace random
+        {
+            using Space2D = pmacc::DataSpace<DIM2>;
+            using Space3D = pmacc::DataSpace<DIM3>;
 
-using Space2D = pmacc::DataSpace< DIM2 >;
-using Space3D = pmacc::DataSpace< DIM3 >;
+            template<uint32_t T_numWorkers, uint32_t T_blockSize>
+            struct RandomFiller
+            {
+                template<typename T_DataBox, typename T_Random, typename T_Acc>
+                DINLINE void operator()(
+                    T_Acc const& acc,
+                    T_DataBox box,
+                    Space2D const boxSize,
+                    T_Random const rand,
+                    uint32_t const numSamples) const
+                {
+                    using namespace pmacc::mappings::threads;
 
-template<
-    uint32_t T_numWorkers,
-    uint32_t T_blockSize
->
-struct RandomFiller
-{
-    template<
-        typename T_DataBox,
-        typename T_Random,
-        typename T_Acc
-    >
-    DINLINE void operator()(
-        T_Acc const & acc,
-        T_DataBox box,
-        Space2D const boxSize,
-        T_Random const rand,
-        uint32_t const numSamples
-    ) const
-    {
-        using namespace pmacc::mappings::threads;
+                    constexpr uint32_t numWorkers = T_numWorkers;
+                    uint32_t const workerIdx = cupla::threadIdx(acc).x;
+
+                    using SupercellDomCfg = IdxConfig<T_blockSize, numWorkers>;
 
-        constexpr uint32_t numWorkers = T_numWorkers;
-        uint32_t const workerIdx = threadIdx.x;
+                    // each virtual worker initialize one rng state
+                    ForEachIdx<SupercellDomCfg> forEachCell(workerIdx);
 
-        using SupercellDomCfg = IdxConfig<
-            T_blockSize,
-            numWorkers
-        >;
+                    forEachCell([&](uint32_t const linearIdx, uint32_t const) {
+                        uint32_t const linearTid = cupla::blockIdx(acc).x * T_blockSize + linearIdx;
 
-        // each virtual worker initialize one rng state
-        ForEachIdx< SupercellDomCfg > forEachCell( workerIdx );
+                        if(linearTid >= boxSize.productOfComponents())
+                            return;
+
+                        Space2D const ownIdx = pmacc::DataSpaceOperations<Space2D::dim>::map(boxSize, linearTid);
+                        // each virtual worker needs an own instance of rand
+                        T_Random vWorkerRand = rand;
+                        vWorkerRand.init(ownIdx);
+                        for(uint32_t i = 0u; i < numSamples; i++)
+                        {
+                            Space2D idx = vWorkerRand(acc, boxSize);
+                            cupla::atomicAdd(acc, &box(idx), 1u, ::alpaka::hierarchy::Blocks{});
+                        }
+                    });
+                }
+            };
 
-        forEachCell(
-            [&](
-                uint32_t const linearIdx,
-                uint32_t const
-            )
+            template<class T_RNGProvider>
+            struct GetRandomIdx
             {
-                uint32_t const linearTid = blockIdx.x * T_blockSize + linearIdx;
-
-                if( linearTid >= boxSize.productOfComponents() )
-                    return;
-
-                Space2D const ownIdx = pmacc::DataSpaceOperations< Space2D::dim >::map(
-                    boxSize,
-                    linearTid
-                );
-                // each virtual worker needs an own instance of rand
-                T_Random vWorkerRand = rand;
-                vWorkerRand.init( ownIdx );
-                for( uint32_t i = 0u; i < numSamples; i++ )
+                typedef pmacc::random::distributions::Uniform<float> Distribution;
+                typedef typename T_RNGProvider::template GetRandomType<Distribution>::type Random;
+
+                HINLINE GetRandomIdx() : rand(T_RNGProvider::template createRandom<Distribution>())
                 {
-                    Space2D idx = vWorkerRand(
-                        acc,
-                        boxSize
-                    );
-                    atomicAdd(&box(idx), 1u, ::alpaka::hierarchy::Blocks{});
+                }
+
+                /** initialize the random generator
+                 *
+                 * @warning: it is not allowed to call this method twice on an instance
+                 */
+                DINLINE void init(Space2D globalCellIdx)
+                {
+                    rand.init(globalCellIdx);
+                }
+
+                template<typename T_Acc>
+                DINLINE Space2D operator()(T_Acc const& acc, Space2D size)
+                {
+                    using pmacc::math::float2int_rd;
+                    return Space2D(float2int_rd(rand(acc) * size.x()), float2int_rd(rand(acc) * size.y()));
+                }
+
+            private:
+                PMACC_ALIGN8(rand, Random);
+            };
+
+            /** Write a 2D buffer as binary PGM ("P5") grayscale image (easy to read/interpret)
+             *
+             * @param filePath path of the file to create
+             * @param buffer buffer providing getDataSpace() and getDataBox(); values above 65535 are clipped
+             */
+            template<class T_Buffer>
+            void writePGM(const std::string& filePath, T_Buffer& buffer)
+            {
+                const Space2D size = buffer.getDataSpace();
+                uint32_t maxVal = 1; // PGM requires a maximum value greater than zero
+                for(int y = 0; y < size.y(); y++)
+                {
+                    for(int x = 0; x < size.x(); x++)
+                    {
+                        uint32_t val = buffer.getDataBox()(Space2D(x, y));
+                        if(val > maxVal)
+                            maxVal = val;
+                    }
+                }
+                // One byte per value is standard (0-255); an extension allows two bytes (0-65535)
+                if(maxVal > std::numeric_limits<uint16_t>::max())
+                    maxVal = std::numeric_limits<uint16_t>::max();
+                const bool isTwoByteFormat = maxVal > std::numeric_limits<uint8_t>::max();
+
+                // "P5" is the binary PGM variant: binary mode prevents newline translation of the pixel bytes
+                std::ofstream outFile(filePath.c_str(), std::ios::binary);
+                outFile << "P5\n"; // TAG
+                // Size and maximum value (at most 65535 which is 2 bytes per value)
+                outFile << size.x() << " " << size.y() << " " << maxVal << "\n";
+                for(int y = 0; y < size.y(); y++)
+                {
+                    for(int x = 0; x < size.x(); x++)
+                    {
+                        uint32_t val = buffer.getDataBox()(Space2D(x, y));
+                        if(val > maxVal) // Clip value
+                            val = maxVal;
+                        // Write first byte (higher order bits) if file is in 2 byte format
+                        if(isTwoByteFormat)
+                            outFile << uint8_t(val >> 8);
+                        outFile << uint8_t(val); // Write remaining byte
+                    }
                 }
             }
-        );
-    }
-};
 
-template<class T_RNGProvider>
-struct GetRandomIdx
-{
-    typedef pmacc::random::distributions::Uniform<float> Distribution;
-    typedef typename T_RNGProvider::template GetRandomType<Distribution>::type Random;
-
-    HINLINE GetRandomIdx(): rand(T_RNGProvider::template createRandom<Distribution>())
-    {}
-
-    /** initialize the random generator
-     *
-     * @warning: it is not allowed to call this method twice on an instance
-     */
-    DINLINE void
-    init(Space2D globalCellIdx)
-    {
-        rand.init(globalCellIdx);
-    }
-
-    template< typename T_Acc >
-    DINLINE Space2D
-    operator()(
-        T_Acc const & acc,
-        Space2D size
-    )
-    {
-        using pmacc::algorithms::math::float2int_rd;
-        return Space2D(
-            float2int_rd( rand( acc ) * size.x() ),
-            float2int_rd( rand( acc ) * size.y() )
-        );
-    }
-private:
-    PMACC_ALIGN8(rand, Random);
-};
-
-/** Write in PGM grayscale file format (easy to read/interpret) */
-template<class T_Buffer>
-void writePGM(const std::string& filePath, T_Buffer& buffer)
-{
-    const Space2D size = buffer.getDataSpace();
-    uint32_t maxVal = 0;
-    for(int y=0; y<size.y(); y++)
-    {
-        for(int x=0; x<size.x(); x++)
-        {
-            uint32_t val = buffer.getDataBox()(Space2D(x, y));
-            if(val > maxVal)
-                maxVal = val;
-        }
-    }
-
-    // Standard format is single byte per value which limits the range to 0-255
-    // An extension allows 2 bytes so 0-65536)
-    if(maxVal > std::numeric_limits<uint16_t>::max())
-        maxVal = std::numeric_limits<uint16_t>::max();
-    const bool isTwoByteFormat = maxVal > std::numeric_limits<uint8_t>::max();
-
-    std::ofstream outFile(filePath.c_str());
-    // TAG
-    outFile << "P5\n";
-    // Size and maximum value (at most 65536 which is 2 bytes per value)
-    outFile << size.x() << " " << size.y() << " " << maxVal << "\n";
-    for(int y=0; y<size.y(); y++)
-    {
-        for(int x=0; x<size.x(); x++)
-        {
-            uint32_t val = buffer.getDataBox()(Space2D(x, y));
-            // Clip value
-            if(val > maxVal)
-                val = maxVal;
-            // Write first byte (higher order bits) if file is in 2 byte format
-            if(isTwoByteFormat)
-                outFile << uint8_t(val >> 8);
-            // Write remaining bytze
-            outFile << uint8_t(val);
-        }
-    }
-}
+            template<class T_DeviceBuffer, class T_Random>
+            void generateRandomNumbers(
+                const Space2D& rngSize, // its number of cells determines the grid size of the kernel launch
+                uint32_t numSamples, // forwarded unchanged to the RandomFiller kernel
+                T_DeviceBuffer& buffer, // its data box and data space are passed to the kernel
+                const T_Random& rand) // forwarded unchanged to the RandomFiller kernel
+            {
+                cuplaEvent_t start, stop; // events recorded before and after the kernel launch for timing
+                CUDA_CHECK(cuplaEventCreate(&start));
+                CUDA_CHECK(cuplaEventCreate(&stop));
+
+                constexpr uint32_t blockSize = 256;
+                // number of workers per block as provided by the GetNumWorkers trait for this block size
+                constexpr uint32_t numWorkers = pmacc::traits::GetNumWorkers<blockSize>::value;
+                // number of blocks: rngSize.productOfComponents() divided by blockSize, rounded up
+                uint32_t gridSize = (rngSize.productOfComponents() + blockSize - 1u) / blockSize;
+                // record start, launch the RandomFiller kernel, record stop, then print the elapsed time
+                CUDA_CHECK(cuplaEventRecord(
+                    start,
+                    /* pass the device event stream explicitly, otherwise the event could be
+                     * recorded in an empty or wrong stream
+                     */
+                    pmacc::Environment<>::get()
+                        .TransactionManager()
+                        .getEventStream(pmacc::ITask::TASK_DEVICE)
+                        ->getCudaStream()));
+                PMACC_KERNEL(RandomFiller<numWorkers, blockSize>{})
+                (gridSize, numWorkers)(buffer.getDataBox(), buffer.getDataSpace(), rand, numSamples);
+
+                CUDA_CHECK(cuplaEventRecord(
+                    stop,
+                    /* pass the device event stream explicitly, otherwise the event could be
+                     * recorded in an empty or wrong stream
+                     */
+                    pmacc::Environment<>::get()
+                        .TransactionManager()
+                        .getEventStream(pmacc::ITask::TASK_DEVICE)
+                        ->getCudaStream()));
+                CUDA_CHECK(cuplaEventSynchronize(stop)); // wait until the stop event has completed
+                float milliseconds = 0;
+                CUDA_CHECK(cuplaEventElapsedTime(&milliseconds, start, stop));
+                std::cout << "Done in " << milliseconds << "ms" << std::endl;
+                CUDA_CHECK(cuplaEventDestroy(start));
+                CUDA_CHECK(cuplaEventDestroy(stop));
+            }
 
-template<class T_DeviceBuffer, class T_Random>
-void generateRandomNumbers(const Space2D& rngSize, uint32_t numSamples, T_DeviceBuffer& buffer, const T_Random& rand)
-{
-    cudaEvent_t start, stop;
-    CUDA_CHECK(cudaEventCreate(&start));
-    CUDA_CHECK(cudaEventCreate(&stop));
-
-    constexpr uint32_t blockSize = 256;
-
-    constexpr uint32_t numWorkers = pmacc::traits::GetNumWorkers<
-            blockSize
-        >::value;
-
-    uint32_t gridSize = ( rngSize.productOfComponents() + blockSize - 1u ) / blockSize;
-
-    CUDA_CHECK(cudaEventRecord(
-        start,
-        /* we need to pass a stream to avoid that we record the event in
-         * an empty or wrong stream
-         */
-        pmacc::Environment<>::get( ).TransactionManager( ).
-            getEventStream( pmacc::ITask::TASK_CUDA )->getCudaStream()
-    ));
-    PMACC_KERNEL(
-        RandomFiller<
-            numWorkers,
-            blockSize
-        >{}
-    )(
-        gridSize,
-        numWorkers
-    )(
-        buffer.getDataBox(),
-        buffer.getDataSpace(),
-        rand,
-        numSamples
-    );
-
-    CUDA_CHECK(cudaEventRecord(
-        stop,
-        /* we need to pass a stream to avoid that we record the event in
-         * an empty or wrong stream
-         */
-        pmacc::Environment<>::get( ).TransactionManager( ).
-            getEventStream( pmacc::ITask::TASK_CUDA )->getCudaStream()
-    ));
-    CUDA_CHECK(cudaEventSynchronize(stop));
-    float milliseconds = 0;
-    CUDA_CHECK(cudaEventElapsedTime(&milliseconds, start, stop));
-    std::cout << "Done in " << milliseconds << "ms" << std::endl;
-    CUDA_CHECK(cudaEventDestroy(start));
-    CUDA_CHECK(cudaEventDestroy(stop));
-}
+            template<class T_Method>
+            void runTest(uint32_t numSamples)
+            {
+                typedef pmacc::random::RNGProvider<2, T_Method> RNGProvider;
+
+                const std::string rngName = RNGProvider::RNGMethod::getName();
+                std::cout << std::endl
+                          << "Running test for " << rngName << " with " << numSamples << " samples per cell"
+                          << std::endl;
+                // Size of the detector
+                const Space2D size(256, 256);
+                // Size of the rng provider (= number of states used)
+                const Space2D rngSize(256, 256);
+
+                pmacc::HostDeviceBuffer<uint32_t, 2> detector(size);
+                auto rngProvider = new RNGProvider(rngSize);
+
+                pmacc::Environment<>::get().DataConnector().share(
+                    std::shared_ptr<pmacc::ISimulationData>(rngProvider));
+                rngProvider->init(0x42133742);
+
+                generateRandomNumbers(rngSize, numSamples, detector.getDeviceBuffer(), GetRandomIdx<RNGProvider>());
+
+                detector.deviceToHost();
+                auto box = detector.getHostBuffer().getDataBox();
+                // Write data to file
+                std::ofstream dataFile((rngName + "_data.txt").c_str());
+                for(int y = 0; y < size.y(); y++)
+                {
+                    for(int x = 0; x < size.x(); x++)
+                        dataFile << box(Space2D(x, y)) << ",";
+                }
+                writePGM(rngName + "_img.pgm", detector.getHostBuffer());
 
-template<class T_Method>
-void runTest(uint32_t numSamples)
-{
-    typedef pmacc::random::RNGProvider<2, T_Method> RNGProvider;
-
-    const std::string rngName = RNGProvider::RNGMethod::getName();
-    std::cout << std::endl << "Running test for " << rngName
-              << " with " << numSamples << " samples per cell"
-              << std::endl;
-    // Size of the detector
-    const Space2D size(256, 256);
-    // Size of the rng provider (= number of states used)
-    const Space2D rngSize(256, 256);
-
-    pmacc::HostDeviceBuffer<uint32_t, 2> detector(size);
-    auto rngProvider = new RNGProvider(rngSize);
-
-    pmacc::Environment<>::get().DataConnector().share( std::shared_ptr< pmacc::ISimulationData >( rngProvider ) );
-    rngProvider->init(0x42133742);
-
-    generateRandomNumbers(rngSize, numSamples, detector.getDeviceBuffer(), GetRandomIdx<RNGProvider>());
-
-    detector.deviceToHost();
-    auto box = detector.getHostBuffer().getDataBox();
-    // Write data to file
-    std::ofstream dataFile((rngName + "_data.txt").c_str());
-    for(int y=0; y<size.y(); y++)
-    {
-        for(int x=0; x<size.x(); x++)
-            dataFile << box(Space2D(x, y)) << ",";
-    }
-    writePGM(rngName + "_img.pgm", detector.getHostBuffer());
-
-    uint64_t totalNumSamples = 0;
-    double mean = 0;
-    uint32_t maxVal = 0;
-    uint32_t minVal = static_cast<uint32_t>(-1);
-    for(int y=0; y<size.y(); y++)
-    {
-        for(int x=0; x<size.x(); x++)
-        {
-            Space2D idx(x, y);
-            uint32_t val = box(idx);
-            if(val > maxVal)
-                maxVal = val;
-            if(val < minVal)
-                minVal = val;
-            totalNumSamples += val;
-            mean += pmacc::math::linearize(size.shrink<1>(1), idx) * static_cast<uint64_t>(val);
-        }
-    }
-    PMACC_ASSERT(totalNumSamples == uint64_t(rngSize.productOfComponents()) * uint64_t(numSamples));
-    // Expected value: (n-1)/2
-    double Ex = (size.productOfComponents() - 1) / 2.;
-    // Variance: (n^2 - 1) / 12
-    double var = (pmacc::algorithms::math::pow<double>(size.productOfComponents(), 2) - 1.) / 12.;
-    // Mean value
-    mean /= totalNumSamples;
-    double errSq = 0;
-    // Calc standard derivation
-    for(int y=0; y<size.y(); y++)
-    {
-        for(int x=0; x<size.x(); x++)
-        {
-            Space2D idx(x, y);
-            uint32_t val = box(idx);
-            errSq += val * pmacc::algorithms::math::pow<double>(pmacc::math::linearize(size.shrink<1>(1), idx) - mean, 2);
-        }
-    }
-    double stdDev = sqrt(errSq/(totalNumSamples - 1));
-
-    uint64_t avg = totalNumSamples/size.productOfComponents();
-    std::cout << "  Samples: " << totalNumSamples << std::endl;
-    std::cout << "      Min: " << minVal << std::endl;
-    std::cout << "      Max: " << maxVal << std::endl;
-    std::cout << " Avg/cell: " << avg << std::endl;
-    std::cout << "     E(x): " << Ex << std::endl;
-    std::cout << "     mean: " << mean << std::endl;
-    std::cout << "   dev(x): " << sqrt(var) << std::endl;
-    std::cout << " std. dev: " << stdDev << std::endl;
-}
+                uint64_t totalNumSamples = 0;
+                double mean = 0;
+                uint32_t maxVal = 0;
+                uint32_t minVal = static_cast<uint32_t>(-1);
+                for(int y = 0; y < size.y(); y++)
+                {
+                    for(int x = 0; x < size.x(); x++)
+                    {
+                        Space2D idx(x, y);
+                        uint32_t val = box(idx);
+                        if(val > maxVal)
+                            maxVal = val;
+                        if(val < minVal)
+                            minVal = val;
+                        totalNumSamples += val;
+                        mean += pmacc::math::linearize(size.shrink<1>(1), idx) * static_cast<uint64_t>(val);
+                    }
+                }
+                PMACC_ASSERT(totalNumSamples == uint64_t(rngSize.productOfComponents()) * uint64_t(numSamples));
+                // Expected value: (n-1)/2
+                double Ex = (size.productOfComponents() - 1) / 2.;
+                // Variance: (n^2 - 1) / 12
+                double var = (cupla::pow(static_cast<double>(size.productOfComponents()), 2.0) - 1.) / 12.;
+                // Mean value
+                mean /= totalNumSamples;
+                double errSq = 0;
+                // Calculate standard deviation
+                for(int y = 0; y < size.y(); y++)
+                {
+                    for(int x = 0; x < size.x(); x++)
+                    {
+                        Space2D idx(x, y);
+                        uint32_t val = box(idx);
+                        errSq += val
+                            * cupla::pow(
+                                     static_cast<double>(pmacc::math::linearize(size.shrink<1>(1), idx) - mean),
+                                     2.0);
+                    }
+                }
+                double stdDev = sqrt(errSq / (totalNumSamples - 1));
+
+                uint64_t avg = totalNumSamples / size.productOfComponents();
+                std::cout << "  Samples: " << totalNumSamples << std::endl;
+                std::cout << "      Min: " << minVal << std::endl;
+                std::cout << "      Max: " << maxVal << std::endl;
+                std::cout << " Avg/cell: " << avg << std::endl;
+                std::cout << "     E(x): " << Ex << std::endl;
+                std::cout << "     mean: " << mean << std::endl;
+                std::cout << "   dev(x): " << sqrt(var) << std::endl;
+                std::cout << " std. dev: " << stdDev << std::endl;
+            }
 
-} // namespace random
-} // namespace test
+        } // namespace random
+    } // namespace test
 } // namespace pmacc
 
 int main(int argc, char** argv)
@@ -341,7 +308,7 @@ int main(int argc, char** argv)
 
     const uint32_t numSamples = (argc > 1) ? atoi(argv[1]) : 100;
 
-    runTest< random::methods::AlpakaRand< cupla::Acc> >(numSamples);
+    runTest<random::methods::AlpakaRand<cupla::Acc>>(numSamples);
 
     /* finalize the pmacc context */
     Environment<>::get().finalize();
diff --git a/include/pmacc/test/random/CMakeLists.txt b/include/pmacc/test/random/CMakeLists.txt
index 0a137c59be..9003f1eea8 100644
--- a/include/pmacc/test/random/CMakeLists.txt
+++ b/include/pmacc/test/random/CMakeLists.txt
@@ -1,4 +1,4 @@
-# Copyright 2016-2020 Alexander Grund
+# Copyright 2016-2021 Alexander Grund
 #
 # This file is part of PMacc.
 #
@@ -19,7 +19,7 @@
 # If not, see <http://www.gnu.org/licenses/>.
 #
 
-cmake_minimum_required(VERSION 3.11.4)
+cmake_minimum_required(VERSION 3.15.0)
 project("TestRandomGenerators")
 
 set(CMAKE_PREFIX_PATH ${CMAKE_PREFIX_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/../..")
diff --git a/include/pmacc/traits/GetCTName.hpp b/include/pmacc/traits/GetCTName.hpp
index 74c87e606f..13f3ccaeb9 100644
--- a/include/pmacc/traits/GetCTName.hpp
+++ b/include/pmacc/traits/GetCTName.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2018-2020 Rene Widera
+/* Copyright 2018-2021 Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -26,24 +26,23 @@
 
 namespace pmacc
 {
-namespace traits
-{
-
-    /** Return the compile time name
-     *
-     * @tparam T_Type type of the object where the name is queried
-     * @return ::type name of the object as pmacc::meta::String,
-     *         empty string is returned if the trait is not specified for
-     *         T_Type
-     */
-    template< typename T_Type >
-    struct GetCTName
+    namespace traits
     {
-        using type = pmacc::meta::String< >;
-    };
+        /** Trait returning the compile time name of a type
+         *
+         * @tparam T_Type type of the object where the name is queried
+         * @return ::type name of the object as pmacc::meta::String,
+         *         empty string is returned if the trait is not specialized for
+         *         T_Type
+         */
+        template<typename T_Type>
+        struct GetCTName
+        {
+            using type = pmacc::meta::String<>; // fallback: empty compile time string
+        };
 
-    template< typename T_Type >
-    using GetCTName_t = typename GetCTName< T_Type >::type;
+        template<typename T_Type>
+        using GetCTName_t = typename GetCTName<T_Type>::type; // alias for the ::type result of GetCTName
 
-} // namespace traits
+    } // namespace traits
 } // namespace pmacc
diff --git a/include/pmacc/traits/GetComponentsType.hpp b/include/pmacc/traits/GetComponentsType.hpp
index 7ce698b51f..488633290c 100644
--- a/include/pmacc/traits/GetComponentsType.hpp
+++ b/include/pmacc/traits/GetComponentsType.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Rene Widera
+/* Copyright 2013-2021 Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -25,27 +25,25 @@
 
 namespace pmacc
 {
-
-namespace traits
-{
-    /** Get component type of an object
-     *
-     * \tparam T_Type any type
-     * \return \p ::type get result type
-     *            If T_Type is fundamental c++ type, the identity is returned
-     *
-     * Attention: do not defines this trait for structs with different attributes inside
-     */
-    template<typename T_Type,bool T_IsFundamental=boost::is_fundamental<T_Type>::value >
-    struct GetComponentsType;
-
-    template<typename T_Type>
-    struct GetComponentsType<T_Type,true>
+    namespace traits
     {
-        typedef T_Type type;
-    };
-
-} //namespace traits
-
-}// namespace pmacc
-
+        /** Get component type of an object
+         *
+         * \tparam T_Type any type
+         * \return \p ::type get result type
+         *            If T_Type is a fundamental C++ type, the identity is returned
+         *
+         * Attention: do not define this trait for structs with different attributes inside
+         */
+        template<typename T_Type, bool T_IsFundamental = boost::is_fundamental<T_Type>::value>
+        struct GetComponentsType;
+        //! specialization for fundamental types: the component type is T_Type itself
+        template<typename T_Type>
+        struct GetComponentsType<T_Type, true>
+        {
+            typedef T_Type type;
+        };
+
+    } // namespace traits
+
+} // namespace pmacc
diff --git a/include/pmacc/traits/GetFlagType.hpp b/include/pmacc/traits/GetFlagType.hpp
index 3d1b0f578b..b728d9f9ed 100644
--- a/include/pmacc/traits/GetFlagType.hpp
+++ b/include/pmacc/traits/GetFlagType.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2014-2020 Rene Widera
+/* Copyright 2014-2021 Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -24,20 +24,19 @@
 
 namespace pmacc
 {
-namespace traits
-{
-
-/** Get Flag of an Object
- *
- * @tparam T_Object any object (class or typename)
- * @tparam T_Key a class which is used as identifier
- *
- * @treturn ::type
- */
-template<typename T_Object, typename T_Key>
-struct GetFlagType;
+    namespace traits
+    {
+        /** Get Flag of an Object
+         *
+         * @tparam T_Object any object (class or typename)
+         * @tparam T_Key a class which is used as identifier
+         * Only declared here, so ::type has to be provided by a specialization.
+         * @treturn ::type
+         */
+        template<typename T_Object, typename T_Key>
+        struct GetFlagType;
 
 
-}//namespace traits
+    } // namespace traits
 
-}//namespace pmacc
+} // namespace pmacc
diff --git a/include/pmacc/traits/GetInitializedInstance.hpp b/include/pmacc/traits/GetInitializedInstance.hpp
index a66bf02ddb..201671aec9 100644
--- a/include/pmacc/traits/GetInitializedInstance.hpp
+++ b/include/pmacc/traits/GetInitializedInstance.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2016-2020 Heiko Burau
+/* Copyright 2016-2021 Heiko Burau
  *
  * This file is part of PMacc.
  *
@@ -26,27 +26,26 @@
 
 namespace pmacc
 {
-namespace traits
-{
-
-/** Return an initialized instance. Expects a single parameter.
- *
- * The main reason to use this is for templated types where it's unknown
- * if they are fundamental or vector-like.
- *
- * \tparam T_Type type of object
- */
-template<typename T_Type>
-struct GetInitializedInstance
-{
-    typedef T_Type Type;
-
-    template<typename ValueType>
-    HDINLINE Type operator()(const ValueType& value) const
+    namespace traits
     {
-        return Type(value);
-    }
-};
+        /** Functor returning an initialized instance. Expects a single parameter.
+         *
+         * The main reason to use this is for templated types where it's unknown
+         * if they are fundamental or vector-like.
+         *
+         * \tparam T_Type type of object
+         */
+        template<typename T_Type>
+        struct GetInitializedInstance
+        {
+            typedef T_Type Type;
+            //! \return an object of type Type constructed from value, i.e. Type(value)
+            template<typename ValueType>
+            HDINLINE Type operator()(const ValueType& value) const
+            {
+                return Type(value);
+            }
+        };
 
-} // traits
-} // PMacc
+    } // namespace traits
+} // namespace pmacc
diff --git a/include/pmacc/traits/GetNComponents.hpp b/include/pmacc/traits/GetNComponents.hpp
index 3ced6165ff..7a42d36fb8 100644
--- a/include/pmacc/traits/GetNComponents.hpp
+++ b/include/pmacc/traits/GetNComponents.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Rene Widera
+/* Copyright 2013-2021 Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -25,38 +25,39 @@
 
 namespace pmacc
 {
-
-namespace traits
-{
-/** C
- *
- * \tparam T_Type any type
- * \return \p ::value as public with number of components (uint32_t)
- */
-template<typename T_Type, bool T_IsFundamental = boost::is_fundamental<T_Type>::value>
-struct GetNComponents
-{
-    /* The compiler is allowed to evaluate an expression that does not depend on a template parameter
-     * even if the class is never instantiated. In that case static assert is always
-     * evaluated (e.g. with clang), this results in an error if the condition is false.
-     * http://www.boost.org/doc/libs/1_60_0/doc/html/boost_staticassert.html
-     *
-     * A workaround is to add a template dependency to the expression.
-     * `sizeof(ANY_TYPE) != 0` is always true and defers the evaluation.
-     */
-    PMACC_CASSERT_MSG_TYPE( __GetNComponents_is_not_defined_for_this_type, T_Type, false && ( sizeof(T_Type) != 0 ) );
-    static constexpr uint32_t value = 0;
-};
-
-/** return value=1 for al fundamental c++ types
- */
-template<typename T_Type>
-struct GetNComponents<T_Type, true>
-{
-    static constexpr uint32_t value=1;
-};
-
-} //namespace traits
-
-}// namespace Pmacc
-
+    namespace traits
+    {
+        /** Get the number of components of a type
+         *
+         * \tparam T_Type any type
+         * \return \p ::value as public with number of components (uint32_t)
+         */
+        template<typename T_Type, bool T_IsFundamental = boost::is_fundamental<T_Type>::value>
+        struct GetNComponents
+        {
+            /* The compiler is allowed to evaluate an expression that does not depend on a template parameter
+             * even if the class is never instantiated. In that case static assert is always
+             * evaluated (e.g. with clang), this results in an error if the condition is false.
+             * http://www.boost.org/doc/libs/1_60_0/doc/html/boost_staticassert.html
+             *
+             * A workaround is to add a template dependency to the expression.
+             * `sizeof(ANY_TYPE) != 0` is always true and defers the evaluation.
+             */
+            PMACC_CASSERT_MSG_TYPE(
+                __GetNComponents_is_not_defined_for_this_type,
+                T_Type,
+                false && (sizeof(T_Type) != 0));
+            static constexpr uint32_t value = 0;
+        };
+
+        /** return value=1 for all fundamental c++ types
+         */
+        template<typename T_Type>
+        struct GetNComponents<T_Type, true>
+        {
+            static constexpr uint32_t value = 1;
+        };
+
+    } // namespace traits
+
+} // namespace pmacc
diff --git a/include/pmacc/traits/GetNumWorkers.hpp b/include/pmacc/traits/GetNumWorkers.hpp
index 10a3ab19b7..71284ce1ed 100644
--- a/include/pmacc/traits/GetNumWorkers.hpp
+++ b/include/pmacc/traits/GetNumWorkers.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2017-2020 Rene Widera
+/* Copyright 2017-2021 Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -27,63 +27,60 @@
 
 namespace pmacc
 {
-namespace traits
-{
-    /** Get number of workers
-     *
-     * the number of workers for a kernel depending on the used accelerator
-     *
-     * @tparam T_maxWorkers the maximum number of workers
-     * @tparam T_Acc the accelerator type
-     * @return @p ::value number of workers
-     */
-    template<
-        uint32_t T_maxWorkers,
-        typename T_Acc = cupla::AccThreadSeq
-    >
-    struct GetNumWorkers
+    namespace traits
     {
-        static constexpr uint32_t value = T_maxWorkers;
-    };
+        /** Get number of workers
+         *
+         * the number of workers for a kernel depending on the used accelerator
+         * (this primary template returns T_maxWorkers unchanged)
+         * @tparam T_maxWorkers the maximum number of workers
+         * @tparam T_Acc the accelerator type
+         * @return @p ::value number of workers
+         */
+        template<uint32_t T_maxWorkers, typename T_Acc = cupla::AccThreadSeq>
+        struct GetNumWorkers
+        {
+            static constexpr uint32_t value = T_maxWorkers;
+        };
 
-#if( ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLED == 1 )
-    template<
-        uint32_t T_maxWorkers,
-        typename ... T_Args
-    >
-    struct GetNumWorkers<
-        T_maxWorkers,
-        alpaka::acc::AccCpuOmp2Blocks< T_Args... >
-    >
-    {
-        static constexpr uint32_t value = 1u;
-    };
+#if(ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLED == 1)
+        template<uint32_t T_maxWorkers, typename... T_Args>
+        struct GetNumWorkers<T_maxWorkers, alpaka::AccCpuOmp2Blocks<T_Args...>>
+        {
+            static constexpr uint32_t value = 1u; // always one worker, T_maxWorkers is ignored
+        };
 #endif
-#if( ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED == 1 )
-    template<
-        uint32_t T_maxWorkers,
-        typename ... T_Args
-    >
-    struct GetNumWorkers<
-        T_maxWorkers,
-        alpaka::acc::AccCpuSerial< T_Args... >
-    >
-    {
-        static constexpr uint32_t value = 1u;
-    };
+#if(ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED == 1)
+        template<uint32_t T_maxWorkers, typename... T_Args>
+        struct GetNumWorkers<T_maxWorkers, alpaka::AccCpuSerial<T_Args...>>
+        {
+            static constexpr uint32_t value = 1u; // always one worker, T_maxWorkers is ignored
+        };
 #endif
-#if( ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED == 1 )
-    template<
-        uint32_t T_maxWorkers,
-        typename ... T_Args
-    >
-    struct GetNumWorkers<
-        T_maxWorkers,
-        alpaka::acc::AccCpuTbbBlocks< T_Args... >
-    >
-    {
-        static constexpr uint32_t value = 1u;
-    };
+#if(ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED == 1)
+        template<uint32_t T_maxWorkers, typename... T_Args>
+        struct GetNumWorkers<T_maxWorkers, alpaka::AccCpuTbbBlocks<T_Args...>>
+        {
+            static constexpr uint32_t value = 1u; // always one worker, T_maxWorkers is ignored
+        };
+#endif
+#if(ALPAKA_ACC_ANY_BT_OMP5_ENABLED == 1) && defined ALPAKA_OFFLOAD_MAX_BLOCK_SIZE && ALPAKA_OFFLOAD_MAX_BLOCK_SIZE > 0
+        template<uint32_t T_maxWorkers, typename... T_Args>
+        struct GetNumWorkers<T_maxWorkers, alpaka::AccOmp5<T_Args...>>
+        {
+            static constexpr uint32_t value = ALPAKA_OFFLOAD_MAX_BLOCK_SIZE; // T_maxWorkers is ignored
+        };
+#endif
+#if(ALPAKA_ACC_ANY_BT_OACC_ENABLED == 1)
+        template<uint32_t T_maxWorkers, typename... T_Args>
+        struct GetNumWorkers<T_maxWorkers, alpaka::AccOacc<T_Args...>>
+        {
+#    ifdef ALPAKA_OFFLOAD_MAX_BLOCK_SIZE
+            static constexpr uint32_t value = ALPAKA_OFFLOAD_MAX_BLOCK_SIZE; // T_maxWorkers is ignored
+#    else
+            static constexpr uint32_t value = 1; // one worker if no offload block size is defined
+#    endif
+        };
 #endif
-} // namespace traits
+    } // namespace traits
 } // namespace pmacc
diff --git a/include/pmacc/traits/GetStringProperties.hpp b/include/pmacc/traits/GetStringProperties.hpp
index f9b5aa586a..125f97bd65 100644
--- a/include/pmacc/traits/GetStringProperties.hpp
+++ b/include/pmacc/traits/GetStringProperties.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2016-2020 Rene Widera
+/* Copyright 2016-2021 Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -27,108 +27,108 @@
 
 namespace pmacc
 {
-namespace traits
-{
-
-    /** a property with sub properties
-     *
-     * This class inherit from `std::map`.
-     * If the `operator[]` is used to access a not existing key an empty StringProperty
-     * with the given key is inserted (default behavior of `std::map`)
-     */
-    struct StringProperty : public std::map< std::string, StringProperty >
+    namespace traits
     {
-        typedef std::map< std::string, StringProperty > StringPropertyMap;
-
-        //! empty constructor
-        StringProperty(
-        )
-        {}
-
-        /** constructor
+        /** a property with sub properties
          *
-         * creates a property with one key value
+         * This class inherits from `std::map`.
+         * If the `operator[]` is used to access a non-existing key, an empty StringProperty
+         * with the given key is inserted (default behavior of `std::map`)
          *
-         * \param key name of the key
-         * \param propertyValue value of the property
+         * Key naming convention:
+         *     "name" for name, openPMD-compatible when possible
+         *     "param" for additional parameters, corresponding to openPMD
+         *             ...Parameters attribute
          */
-        StringProperty(
-            const std::string& key,
-            const std::string& propertyValue
-        ) : value(propertyValue)
+        struct StringProperty : public std::map<std::string, StringProperty>
         {
-            (*this)[key] = propertyValue;
-        }
+            typedef std::map<std::string, StringProperty> StringPropertyMap;
 
-        /** overwrite the value from a property
-         *
-         * \param propertyValue new value
-         * \return the property itself
+            //! empty constructor
+            StringProperty()
+            {
+            }
+
+            /** constructor
+             *
+             * creates a property with one key value
+             *
+             * \param key name of the key
+             * \param propertyValue value of the property
+             */
+            StringProperty(const std::string& key, const std::string& propertyValue) : value(propertyValue)
+            {
+                (*this)[key] = propertyValue;
+            }
+
+            /** overwrite the value from a property
+             *
+             * \param propertyValue new value
+             * \return the property itself
+             */
+            StringProperty& operator=(const std::string& propertyValue)
+            {
+                value = propertyValue;
+                return *this;
+            }
+
+            //! stores a property value
+            std::string value;
+        };
+
+        /** stream operator for a StringProperty
          */
-        StringProperty& operator=( const std::string& propertyValue )
+        HINLINE std::ostream& operator<<(std::ostream& out, const StringProperty& property)
         {
-            value = propertyValue;
-            return *this;
+            out << property.value;
+            return out;
         }
 
-        //! stores a property value
-        std::string value;
-    };
+        /** Get a property tree of an object
+         *
+         * specialize this struct including the static method `StringProperty get()`
+         * to define a property for an object without the method `getStringProperties()`
+         *
+         * \tparam T_Type any type
+         * \return \p T_Type::getStringProperties() if trait `StringProperties<>` is not specialized
+         */
+        template<typename T_Type>
+        struct StringProperties
+        {
+            static StringProperty get()
+            {
+                return T_Type::getStringProperties();
+            }
+        };
 
-    /** stream operator for a StringProperty
-     */
-    HINLINE std::ostream& operator<<( std::ostream& out, const StringProperty& property )
-    {
-        out << property.value;
-        return out;
-    }
 
-    /** Get a property tree of an object
-     *
-     * specialize this struct including the static method `StringProperty get()`
-     * to define a property for an object without the method `getStringProperties()`
-     *
-     * \tparam T_Type any type
-     * \return \p T_Type::getStringProperties() if trait `GetStringProperties<>` is not specialized
-     */
-    template< typename T_Type >
-    struct StringProperties
-    {
-        static StringProperty get()
+        /** get the properties of an object
+         *
+         * The struct `StringProperties<>` needs to be specialized to change the result
+         * of this trait for a user defined type.
+         * If there is no user defined specialization available this trait inherits from
+         * the result of `::getStringProperties()` from the queried type.
+         */
+        template<typename T_Type>
+        struct GetStringProperties : public StringProperty
         {
-            return T_Type::getStringProperties();
-        }
-    };
-
+            GetStringProperties() : StringProperty(StringProperties<T_Type>::get())
+            {
+            }
+        };
 
-    /** get the properties of an object
-     *
-     * The struct `StringProperties<>` needs to be specialized to change the result
-     * of this trait for a user defined type.
-     * If there is no user defined specialization available this trait inherits from
-     * the result of `::getStringProperties()` from the queried type.
-     */
-    template< typename T_Type >
-    struct GetStringProperties : public StringProperty
-    {
-        GetStringProperties() : StringProperty( StringProperties< T_Type >::get() )
+        /** get the properties of an object instance
+         *
+         * same as `GetStringProperties<>` but accepts an instance instead of a type
+         *
+         * \param an instance that shall be queried
+         * \return StringProperty of the given instance
+         */
+        template<typename T_Type>
+        HINLINE StringProperty getStringProperties(const T_Type&)
         {
+            return GetStringProperties<T_Type>(); // converted to its StringProperty base on return
         }
-    };
-
-    /** get the properties of an object instance
-     *
-     * same as `GetStringProperties<>` but accepts an instance instead a type
-     *
-     * \param an instance that shall be queried
-     * \return StringProperty of the given instance
-     */
-    template< typename T_Type >
-    HINLINE StringProperty
-    getStringProperties( const T_Type& )
-    {
-        return GetStringProperties<T_Type>()();
-    };
 
-} // namespace traits
+    } // namespace traits
 } // namespace pmacc
diff --git a/include/pmacc/traits/GetUniqueTypeId.hpp b/include/pmacc/traits/GetUniqueTypeId.hpp
index a963c7ca1e..6ff75859f6 100644
--- a/include/pmacc/traits/GetUniqueTypeId.hpp
+++ b/include/pmacc/traits/GetUniqueTypeId.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2015-2020 Rene Widera, Sergei Bastrakov
+/* Copyright 2015-2021 Rene Widera, Sergei Bastrakov
  *
  * This file is part of PMacc.
  *
@@ -30,98 +30,92 @@
 
 namespace pmacc
 {
-namespace traits
-{
-
-/** Get next available type id
- *
- * Warning: is not thread-safe.
- */
-inline uint64_t getNextId( );
-
-namespace detail
-{
-
-/** Global counter for type ids
- */
-inline uint64_t & counter()
-{
-    static uint64_t value = 0;
-    return value;
-}
-
-/** Unique id for a given type
- *
- * @tparam T_Type type
- */
-template<typename T_Type>
-struct TypeId
-{
-    static const uint64_t id;
-};
-
-/** These id values are generated during the startup for all types that cause
- *  instantiation of GetUniqueTypeId<T_Type>::uid().
- *
- * The order of calls to GetUniqueTypeId<T_Type>::uid() does not affect the id
- * generation, which guarantees the ids are matching for all processes even when
- * the run-time access is not.
- */
-template<typename T_Type>
-const uint64_t TypeId<T_Type>::id = getNextId( );
-
-} //namespace detail
-
-/** Get next available type id
- *
- * Warning: is not thread-safe.
- */
-uint64_t getNextId( )
-{
-    return ++detail::counter( );
-}
-
-/** Get a unique id of a type
- *
- * - get a unique id of a type at runtime
- * - the id of a type is equal on each instance of a process
- *
- * @tparam T_Type any object (class or typename)
- * @tparam T_ResultType result type
- */
-template<typename T_Type, typename T_ResultType = uint64_t>
-struct GetUniqueTypeId
-{
-    typedef T_ResultType ResultType;
-    typedef T_Type Type;
-
-    /** create unique id
-     *
-     * @param maxValue largest allowed id
-     */
-    static const ResultType uid(uint64_t maxValue = boost::numeric::bounds<ResultType>::highest())
+    namespace traits
     {
+        /** Get next available type id
+         *
+         * Warning: is not thread-safe.
+         */
+        inline uint64_t getNextId();
 
-        const uint64_t id = detail::TypeId<Type>::id;
-
-        /* if `id` is out of range than throw an error */
-        if (id > maxValue)
+        namespace detail
         {
-            std::stringstream sId;
-            sId << id;
-            std::stringstream sMax;
-            sMax << maxValue;
-            throw std::runtime_error("generated id is out of range [ id = " +
-                                     sId.str() +
-                                     std::string(", largest allowed  id = ") +
-                                     sMax.str() +
-                                     std::string(" ]"));
+            /** Global counter for type ids
+             */
+            inline uint64_t& counter()
+            {
+                static uint64_t value = 0;
+                return value;
+            }
+
+            /** Unique id for a given type
+             *
+             * @tparam T_Type type
+             */
+            template<typename T_Type>
+            struct TypeId
+            {
+                static const uint64_t id;
+            };
+
+            /** These id values are generated during the startup for all types that cause
+             *  instantiation of GetUniqueTypeId<T_Type>::uid().
+             *
+             * The order of calls to GetUniqueTypeId<T_Type>::uid() does not affect the id
+             * generation, which guarantees the ids are matching for all processes even when
+             * the run-time access is not.
+             */
+            template<typename T_Type>
+            const uint64_t TypeId<T_Type>::id = getNextId();
+
+        } // namespace detail
+
+        /** Get next available type id
+         *
+         * Warning: is not thread-safe.
+         */
+        uint64_t getNextId()
+        {
+            return ++detail::counter();
         }
-        return static_cast<ResultType> (id);
-    }
-
-};
 
-}//namespace traits
-
-}//namespace pmacc
+        /** Get a unique id of a type
+         *
+         * - get a unique id of a type at runtime
+         * - the id of a type is equal on each instance of a process
+         *
+         * @tparam T_Type any object (class or typename)
+         * @tparam T_ResultType result type
+         */
+        template<typename T_Type, typename T_ResultType = uint64_t>
+        struct GetUniqueTypeId
+        {
+            typedef T_ResultType ResultType;
+            typedef T_Type Type;
+
+            /** create unique id
+             *
+             * @param maxValue largest allowed id
+             */
+            static const ResultType uid(uint64_t maxValue = boost::numeric::bounds<ResultType>::highest())
+            {
+                const uint64_t id = detail::TypeId<Type>::id;
+
+                /* if `id` is out of range then throw an error */
+                if(id > maxValue)
+                {
+                    std::stringstream sId;
+                    sId << id;
+                    std::stringstream sMax;
+                    sMax << maxValue;
+                    throw std::runtime_error(
+                        "generated id is out of range [ id = " + sId.str() + std::string(", largest allowed  id = ")
+                        + sMax.str() + std::string(" ]"));
+                }
+                return static_cast<ResultType>(id);
+            }
+        };
+
+    } // namespace traits
+
+} // namespace pmacc
diff --git a/include/pmacc/traits/GetValueType.hpp b/include/pmacc/traits/GetValueType.hpp
index 1076c3b8fc..f2312848df 100644
--- a/include/pmacc/traits/GetValueType.hpp
+++ b/include/pmacc/traits/GetValueType.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Rene Widera
+/* Copyright 2013-2021 Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -30,7 +30,7 @@ namespace pmacc
         {
             typedef typename T::ValueType ValueType;
         };
-    }
-}
+    } // namespace traits
+} // namespace pmacc
 
 #include "GetValueType.tpp"
diff --git a/include/pmacc/traits/GetValueType.tpp b/include/pmacc/traits/GetValueType.tpp
index b2bd9d4bc3..211643618f 100644
--- a/include/pmacc/traits/GetValueType.tpp
+++ b/include/pmacc/traits/GetValueType.tpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Rene Widera
+/* Copyright 2013-2021 Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -30,8 +30,5 @@ namespace pmacc
         {
             typedef Type ValueType;
         };
-    }
-}
-
-
-
+    } // namespace traits
+} // namespace pmacc
diff --git a/include/pmacc/traits/HasFlag.hpp b/include/pmacc/traits/HasFlag.hpp
index fed45d0caf..4fdf1b2f26 100644
--- a/include/pmacc/traits/HasFlag.hpp
+++ b/include/pmacc/traits/HasFlag.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2014-2020 Rene Widera
+/* Copyright 2014-2021 Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -24,26 +24,25 @@
 
 namespace pmacc
 {
-namespace traits
-{
-
-/** Checks if a Objects has an flag
- *
- * @tparam T_Object any object (class or typename)
- * @tparam T_Key a class which is used as identifier
- *
- * This struct must define
- * ::type (boost::mpl::bool_<>)
- */
-template<typename T_Object, typename T_Key>
-struct HasFlag;
+    namespace traits
+    {
+        /** Checks if an object has a flag
+         *
+         * @tparam T_Object any object (class or typename)
+         * @tparam T_Key a class which is used as identifier
+         *
+         * This struct must define
+         * ::type (boost::mpl::bool_<>)
+         */
+        template<typename T_Object, typename T_Key>
+        struct HasFlag;
 
-template<typename T_Object, typename T_Key>
-bool hasFlag(const T_Object& obj,const T_Key& key)
-{
-    return HasFlag<T_Object,T_Key>::type::value;
-}
+        template<typename T_Object, typename T_Key>
+        bool hasFlag(const T_Object& obj, const T_Key& key)
+        {
+            return HasFlag<T_Object, T_Key>::type::value;
+        }
 
-}//namespace traits
+    } // namespace traits
 
-}//namespace pmacc
+} // namespace pmacc
diff --git a/include/pmacc/traits/HasIdentifier.hpp b/include/pmacc/traits/HasIdentifier.hpp
index 4ad2a6b206..9ea941266c 100644
--- a/include/pmacc/traits/HasIdentifier.hpp
+++ b/include/pmacc/traits/HasIdentifier.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Rene Widera
+/* Copyright 2013-2021 Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -26,41 +26,39 @@
 
 namespace pmacc
 {
-namespace traits
-{
+    namespace traits
+    {
+        /** Checks if an object has an identifier
+         *
+         * @tparam T_Object any object (class or typename)
+         * @tparam T_Key a class which is used as identifier
+         *
+         * This struct must define
+         * ::type (boost::mpl::bool_<>)
+         */
+        template<typename T_Object, typename T_Key>
+        struct HasIdentifier
+        {
+            /* The compiler is allowed to evaluate an expression that does not depend on a template parameter
+             * even if the class is never instantiated. In that case static assert is always
+             * evaluated (e.g. with clang), this results in an error if the condition is false.
+             * http://www.boost.org/doc/libs/1_60_0/doc/html/boost_staticassert.html
+             *
+             * A workaround is to add a template dependency to the expression.
+             * `sizeof(ANY_TYPE) != 0` is always true and defers the evaluation.
+             */
+            PMACC_CASSERT_MSG_TYPE(
+                ___HasIdentifier_is_not_specialized_for_T_Object,
+                T_Object,
+                false && (sizeof(T_Object) != 0));
+        };
 
-/** Checks if a Objects has an identifier
- *
- * @tparam T_Object any object (class or typename)
- * @tparam T_Key a class which is used as identifier
- *
- * This struct must define
- * ::type (boost::mpl::bool_<>)
- */
-template<typename T_Object, typename T_Key>
-struct HasIdentifier
-{
-    /* The compiler is allowed to evaluate an expression that does not depend on a template parameter
-     * even if the class is never instantiated. In that case static assert is always
-     * evaluated (e.g. with clang), this results in an error if the condition is false.
-     * http://www.boost.org/doc/libs/1_60_0/doc/html/boost_staticassert.html
-     *
-     * A workaround is to add a template dependency to the expression.
-     * `sizeof(ANY_TYPE) != 0` is always true and defers the evaluation.
-     */
-    PMACC_CASSERT_MSG_TYPE(
-        ___HasIdentifier_is_not_specialized_for_T_Object,
-        T_Object,
-        false && ( sizeof(T_Object) != 0 )
-    );
-};
-
-template<typename T_Object, typename T_Key>
-bool hasIdentifier(const T_Object& obj,const T_Key& key)
-{
-    return HasIdentifier<T_Object,T_Key>::type::value;
-}
+        template<typename T_Object, typename T_Key>
+        bool hasIdentifier(const T_Object& obj, const T_Key& key)
+        {
+            return HasIdentifier<T_Object, T_Key>::type::value;
+        }
 
-}//namespace traits
+    } // namespace traits
 
-}//namespace pmacc
+} // namespace pmacc
diff --git a/include/pmacc/traits/HasIdentifiers.hpp b/include/pmacc/traits/HasIdentifiers.hpp
index 8569b7cd6a..1bdb182902 100644
--- a/include/pmacc/traits/HasIdentifiers.hpp
+++ b/include/pmacc/traits/HasIdentifiers.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2017-2020 Axel Huebl
+/* Copyright 2017-2021 Axel Huebl
  *
  * This file is part of PMacc.
  *
@@ -29,58 +29,33 @@
 
 namespace pmacc
 {
-namespace traits
-{
-
-    /** Checks if an object has all specified identifiers
-     *
-     * Individual identifiers checks are logically connected via
-     * boost::mpl::and_ .
-     *
-     * @tparam T_Object any object (class or typename)
-     * @tparam T_SeqKeys a sequence of identifiers
-     *
-     * This struct must define
-     * ::type (boost::mpl::bool_<>)
-     */
-    template<
-        typename T_Object,
-        typename T_SeqKeys
-    >
-    struct HasIdentifiers
-    {
-        using SeqHasIdentifiers = typename bmpl::transform<
-            T_SeqKeys,
-            HasIdentifier<
-                T_Object,
-                bmpl::_1
-            >
-        >::type;
-
-        using type = typename bmpl::accumulate<
-            SeqHasIdentifiers,
-            bmpl::bool_< true >,
-            bmpl::and_<
-                bmpl::_1,
-                bmpl::_2
-            >
-        >::type;
-    };
-
-    template<
-        typename T_Object,
-        typename T_SeqKeys
-    >
-    bool hasIdentifiers(
-        T_Object const &,
-        T_SeqKeys const &
-    )
+    namespace traits
     {
-        return HasIdentifiers<
-            T_Object,
-            T_SeqKeys
-        >::type::value;
-    }
-
-} // namespace traits
+        /** Checks if an object has all specified identifiers
+         *
+         * Individual identifiers checks are logically connected via
+         * boost::mpl::and_ .
+         *
+         * @tparam T_Object any object (class or typename)
+         * @tparam T_SeqKeys a sequence of identifiers
+         *
+         * This struct must define
+         * ::type (boost::mpl::bool_<>)
+         */
+        template<typename T_Object, typename T_SeqKeys>
+        struct HasIdentifiers
+        {
+            using SeqHasIdentifiers = typename bmpl::transform<T_SeqKeys, HasIdentifier<T_Object, bmpl::_1>>::type;
+
+            using type =
+                typename bmpl::accumulate<SeqHasIdentifiers, bmpl::bool_<true>, bmpl::and_<bmpl::_1, bmpl::_2>>::type;
+        };
+
+        template<typename T_Object, typename T_SeqKeys>
+        bool hasIdentifiers(T_Object const&, T_SeqKeys const&)
+        {
+            return HasIdentifiers<T_Object, T_SeqKeys>::type::value;
+        }
+
+    } // namespace traits
 } // namespace pmacc
diff --git a/include/pmacc/traits/IsBaseTemplateOf.hpp b/include/pmacc/traits/IsBaseTemplateOf.hpp
new file mode 100644
index 0000000000..f8f565aad9
--- /dev/null
+++ b/include/pmacc/traits/IsBaseTemplateOf.hpp
@@ -0,0 +1,62 @@
+/* Copyright 2020-2021 Sergei Bastrakov
+ *
+ * This file is part of PMacc.
+ *
+ * PMacc is free software: you can redistribute it and/or modify
+ * it under the terms of either the GNU General Public License or
+ * the GNU Lesser General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * PMacc is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License and the GNU Lesser General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * and the GNU Lesser General Public License along with PMacc.
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include <type_traits>
+
+
+namespace pmacc
+{
+    namespace traits
+    {
+        /** Check if a type inherits the given class template (with any arguments)
+         *
+         * This is basically a version of std::is_base_of but for class template as base.
+         * Based on Stack Overflow post:
+         *   source: https://stackoverflow.com/a/34672753
+         *   author: rmawatson
+         *   date: Aug 23 '18
+         *
+         * @tparam T_Base base template (itself, without arguments)
+         * @tparam T_Derived derived type to check
+         * @treturn ::type std::true_type or std::false_type
+         */
+        template<template<typename...> class T_Base, typename T_Derived>
+        struct IsBaseTemplateOf
+        {
+            template<typename... T_Args>
+            static constexpr std::true_type test(const T_Base<T_Args...>*);
+            static constexpr std::false_type test(...);
+            using type = decltype(test(std::declval<T_Derived*>()));
+        };
+
+        /** Helper alias for IsBaseTemplateOf<...>::type
+         *
+         * @tparam T_Base base template (itself, without arguments)
+         * @tparam T_Derived derived type to check
+         * @treturn std::true_type or std::false_type
+         */
+        template<template<typename...> class T_Base, typename T_Derived>
+        using IsBaseTemplateOf_t = typename IsBaseTemplateOf<T_Base, T_Derived>::type;
+
+    } // namespace traits
+} // namespace pmacc
diff --git a/include/pmacc/traits/Limits.hpp b/include/pmacc/traits/Limits.hpp
index 679f32b45b..e6952c5173 100644
--- a/include/pmacc/traits/Limits.hpp
+++ b/include/pmacc/traits/Limits.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2014-2020 Rene Widera
+/* Copyright 2014-2021 Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -26,28 +26,28 @@
 
 namespace pmacc
 {
-namespace traits
-{
-namespace limits
-{
-/** get maximum finite value
- *
- * @tparam T_Type any type
- * @result ::value
- */
-template<typename T_Type>
-struct Max;
+    namespace traits
+    {
+        namespace limits
+        {
+            /** get maximum finite value
+             *
+             * @tparam T_Type any type
+             * @result ::value
+             */
+            template<typename T_Type>
+            struct Max;
 
-/** get minimum finite value
- *
- * @tparam T_Type any type
- * @result ::value
- */
-template<typename T_Type>
-struct Min;
+            /** get minimum finite value
+             *
+             * @tparam T_Type any type
+             * @result ::value
+             */
+            template<typename T_Type>
+            struct Min;
 
-} //namespace limits
-} //namespace traits
-} //namespace pmacc
+        } // namespace limits
+    } // namespace traits
+} // namespace pmacc
 
 #include "pmacc/traits/Limits.tpp"
diff --git a/include/pmacc/traits/Limits.tpp b/include/pmacc/traits/Limits.tpp
index 06117e2638..2cd165ae86 100644
--- a/include/pmacc/traits/Limits.tpp
+++ b/include/pmacc/traits/Limits.tpp
@@ -1,4 +1,4 @@
-/* Copyright 2014-2020 Rene Widera
+/* Copyright 2014-2021 Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -28,29 +28,28 @@
 
 namespace pmacc
 {
-namespace traits
-{
-namespace limits
-{
-
-template<>
-struct Max<int>
-{
-    static constexpr int value=INT_MAX;
-};
-
-template<>
-struct Max<uint32_t>
-{
-    static constexpr uint32_t value=static_cast<uint32_t>(-1);
-};
-
-template<>
-struct Max<uint64_t>
-{
-    static constexpr uint64_t value=static_cast<uint64_t>(-1);
-};
-
-} //namespace limits
-} //namespace traits
-} //namespace pmacc
+    namespace traits
+    {
+        namespace limits
+        {
+            template<>
+            struct Max<int>
+            {
+                static constexpr int value = INT_MAX;
+            };
+
+            template<>
+            struct Max<uint32_t>
+            {
+                static constexpr uint32_t value = static_cast<uint32_t>(-1);
+            };
+
+            template<>
+            struct Max<uint64_t>
+            {
+                static constexpr uint64_t value = static_cast<uint64_t>(-1);
+            };
+
+        } // namespace limits
+    } // namespace traits
+} // namespace pmacc
diff --git a/include/pmacc/traits/NumberOfExchanges.hpp b/include/pmacc/traits/NumberOfExchanges.hpp
index 4fef38085d..3fd4dcd21d 100644
--- a/include/pmacc/traits/NumberOfExchanges.hpp
+++ b/include/pmacc/traits/NumberOfExchanges.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2014-2020 Rene Widera
+/* Copyright 2014-2021 Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -25,37 +25,35 @@
 
 namespace pmacc
 {
-
-namespace traits
-{
-/** Get number of possible exchanges
- *
- * \tparam T_dim dimension of the simulation
- * \return \p ::value number of possible exchanges
- *              (is number neighbors + myself)
- */
-template<uint32_t T_dim >
-struct NumberOfExchanges;
-
-template<>
-struct NumberOfExchanges<DIM1>
-{
-    static constexpr uint32_t value = LEFT + RIGHT;
-};
-
-template<>
-struct NumberOfExchanges<DIM2>
-{
-    static constexpr uint32_t value = TOP + BOTTOM;
-};
-
-template<>
-struct NumberOfExchanges<DIM3>
-{
-    static constexpr uint32_t value = BACK + FRONT;
-};
-
-} //namespace traits
-
-}// namespace pmacc
-
+    namespace traits
+    {
+        /** Get number of possible exchanges
+         *
+         * \tparam T_dim dimension of the simulation
+         * \return \p ::value number of possible exchanges
+         *              (is number neighbors + myself)
+         */
+        template<uint32_t T_dim>
+        struct NumberOfExchanges;
+
+        template<>
+        struct NumberOfExchanges<DIM1>
+        {
+            static constexpr uint32_t value = LEFT + RIGHT;
+        };
+
+        template<>
+        struct NumberOfExchanges<DIM2>
+        {
+            static constexpr uint32_t value = TOP + BOTTOM;
+        };
+
+        template<>
+        struct NumberOfExchanges<DIM3>
+        {
+            static constexpr uint32_t value = BACK + FRONT;
+        };
+
+    } // namespace traits
+
+} // namespace pmacc
diff --git a/include/pmacc/traits/Resolve.hpp b/include/pmacc/traits/Resolve.hpp
index de984fb3f2..8fc9b19d32 100644
--- a/include/pmacc/traits/Resolve.hpp
+++ b/include/pmacc/traits/Resolve.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2014-2020 Rene Widera
+/* Copyright 2014-2021 Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -24,25 +24,24 @@
 
 namespace pmacc
 {
-namespace traits
-{
-
-/** Get resolved type
- *
- * Explicitly resolve the type of a synonym type, e.g., resolve the type of an PMacc alias.
- * A synonym type is wrapper type (class) around an other type.
- * If this trait is not defined for the given type the result is the identity of the given type.
- *
- * @tparam T_Object any object (class or typename)
- *
- * @treturn ::type
- */
-template<typename T_Object>
-struct Resolve
-{
-    typedef T_Object type;
-};
+    namespace traits
+    {
+        /** Get resolved type
+         *
+         * Explicitly resolve the type of a synonym type, e.g., resolve the type of a PMacc alias.
+         * A synonym type is a wrapper type (class) around another type.
+         * If this trait is not defined for the given type the result is the identity of the given type.
+         *
+         * @tparam T_Object any object (class or typename)
+         *
+         * @treturn ::type
+         */
+        template<typename T_Object>
+        struct Resolve
+        {
+            typedef T_Object type;
+        };
 
-}//namespace traits
+    } // namespace traits
 
-}//namespace pmacc
+} // namespace pmacc
diff --git a/include/pmacc/type/Area.hpp b/include/pmacc/type/Area.hpp
index 57def5afe1..03aa3c6d38 100644
--- a/include/pmacc/type/Area.hpp
+++ b/include/pmacc/type/Area.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Felix Schmitt, Heiko Burau, Rene Widera,
+/* Copyright 2013-2021 Felix Schmitt, Heiko Burau, Rene Widera,
  *                     Wolfgang Hoenig, Benjamin Worpitz,
  *                     Alexander Grund
  *
@@ -26,22 +26,21 @@
 
 namespace pmacc
 {
-namespace type
-{
-
-    /*! area which is calculated
-     *
-     * CORE is the inner area of a grid
-     * BORDER is the border of a grid (my own border, not the neighbor part)
-     */
-    enum AreaType
+    namespace type
     {
-        CORE = 1u,
-        BORDER = 2u,
-        GUARD = 4u
-    };
+        /*! area which is calculated
+         *
+         * CORE is the inner area of a grid
+         * BORDER is the border of a grid (my own border, not the neighbor part)
+         */
+        enum AreaType
+        {
+            CORE = 1u,
+            BORDER = 2u,
+            GUARD = 4u
+        };
 
-} // namespace type
+    } // namespace type
 
     // for backward compatibility pull all definitions into the pmacc namespace
     using namespace type;
diff --git a/include/pmacc/type/Exchange.hpp b/include/pmacc/type/Exchange.hpp
index 6b27a2ed4f..c9e57ee0a8 100644
--- a/include/pmacc/type/Exchange.hpp
+++ b/include/pmacc/type/Exchange.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Felix Schmitt, Heiko Burau, Rene Widera,
+/* Copyright 2013-2021 Felix Schmitt, Heiko Burau, Rene Widera,
  *                     Wolfgang Hoenig, Benjamin Worpitz,
  *                     Alexander Grund
  *
@@ -28,58 +28,70 @@
 
 namespace pmacc
 {
-namespace type
-{
-
-    /**
-     * Bitmask which describes the direction of communication.
-     *
-     * Bitmasks may be combined logically, e.g. LEFT+TOP = TOPLEFT.
-     * It is not possible to combine complementary masks (e.g. FRONT and BACK),
-     * as a bitmask always defines one direction of communication (send or receive).
-     *
-     * Axis index relation:
-     *   right & left are in X
-     *   bottom & top are in Y
-     *   back & front are in Z
-     */
-    enum ExchangeType
+    namespace type
     {
-        RIGHT = 1u,
-        LEFT = 2u,
-        BOTTOM = 3u,
-        TOP = 6u,
-        BACK = 9u,
-        FRONT = 18u // 3er-System
-    };
+        /**
+         * Bitmask which describes the direction of communication.
+         *
+         * Bitmasks may be combined logically, e.g. LEFT+TOP = TOPLEFT.
+         * It is not possible to combine complementary masks (e.g. FRONT and BACK),
+         * as a bitmask always defines one direction of communication (send or receive).
+         *
+         * Axis index relation:
+         *   right & left are in X
+         *   bottom & top are in Y
+         *   back & front are in Z
+         */
+        enum ExchangeType
+        {
+            RIGHT = 1u,
+            LEFT = 2u,
+            BOTTOM = 3u,
+            TOP = 6u,
+            BACK = 9u,
+            FRONT = 18u // base-3 (ternary) encoding: one digit per axis
+        };
 
-    struct ExchangeTypeNames
-    {
-        std::string operator[]( const uint32_t exchange ) const
+        struct ExchangeTypeNames
         {
-            if( exchange >= 27 )
-                return std::string("unknown exchange type: ") + std::to_string(exchange);
+            std::string operator[](const uint32_t exchange) const
+            {
+                if(exchange >= 27)
+                    return std::string("unknown exchange type: ") + std::to_string(exchange);
 
-            const char* names[27] = {
-                "none",
-                "right", "left", "bottom",
-                "right-bottom", "left-bottom",
-                "top",
-                "right-top", "left-top",
-                "back",
-                "right-back", "left-back",
-                "bottom-back", "right-bottom-back", "left-bottom-back",
-                "top-back", "right-top-back", "left-top-back",
-                "front",
-                "right-front", "left-front",
-                "bottom-front", "right-bottom-front", "left-bottom-front",
-                "top-front", "right-top-front", "left-top-front"
-            };
-            return names[exchange];
-        }
-    };
+                const char* names[27]
+                    = {"none",
+                       "right",
+                       "left",
+                       "bottom",
+                       "right-bottom",
+                       "left-bottom",
+                       "top",
+                       "right-top",
+                       "left-top",
+                       "back",
+                       "right-back",
+                       "left-back",
+                       "bottom-back",
+                       "right-bottom-back",
+                       "left-bottom-back",
+                       "top-back",
+                       "right-top-back",
+                       "left-top-back",
+                       "front",
+                       "right-front",
+                       "left-front",
+                       "bottom-front",
+                       "right-bottom-front",
+                       "left-bottom-front",
+                       "top-front",
+                       "right-top-front",
+                       "left-top-front"};
+                return names[exchange];
+            }
+        };
 
-} // namespace type
+    } // namespace type
 
     // for backward compatibility pull all definitions into the pmacc namespace
     using namespace type;
diff --git a/include/pmacc/type/Integral.hpp b/include/pmacc/type/Integral.hpp
index 85533edc8f..8c5c039756 100644
--- a/include/pmacc/type/Integral.hpp
+++ b/include/pmacc/type/Integral.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Felix Schmitt, Heiko Burau, Rene Widera,
+/* Copyright 2013-2021 Felix Schmitt, Heiko Burau, Rene Widera,
  *                     Wolfgang Hoenig, Benjamin Worpitz,
  *                     Alexander Grund
  *
@@ -28,14 +28,13 @@
 
 namespace pmacc
 {
-namespace type
-{
-
-    using id_t = uint64_t;
-    using uint64_cu = unsigned long long int;
-    using int64_cu = long long int;
+    namespace type
+    {
+        using id_t = uint64_t;
+        using uint64_cu = unsigned long long int;
+        using int64_cu = long long int;
 
-} // namespace type
+    } // namespace type
 
     // for backward compatibility pull all definitions into the pmacc namespace
     using namespace type;
diff --git a/include/pmacc/types.hpp b/include/pmacc/types.hpp
index 03d05ed716..7dc8a16344 100644
--- a/include/pmacc/types.hpp
+++ b/include/pmacc/types.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Felix Schmitt, Heiko Burau, Rene Widera,
+/* Copyright 2013-2021 Felix Schmitt, Heiko Burau, Rene Widera,
  *                     Wolfgang Hoenig, Benjamin Worpitz,
  *                     Alexander Grund
  *
@@ -30,34 +30,18 @@
 #include <cupla/types.hpp>
 
 #ifndef PMACC_CUDA_ENABLED
-#   define PMACC_CUDA_ENABLED ALPAKA_ACC_GPU_CUDA_ENABLED
+#    define PMACC_CUDA_ENABLED ALPAKA_ACC_GPU_CUDA_ENABLED
 #endif
 
-#if( PMACC_CUDA_ENABLED == 1 )
+#if(BOOST_LANG_CUDA || BOOST_COMP_HIP)
 /* include mallocMC before cupla renaming is activated, else we need the variable acc
  * to call atomic cuda functions
  */
-#   include <mallocMC/mallocMC.hpp>
+#    include <mallocMC/mallocMC.hpp>
 #endif
 
 
-#include <cuda_to_cupla.hpp>
-
-#if( PMACC_CUDA_ENABLED == 1 )
-/** @todo please remove this workaround
- * This workaround allows to use native CUDA on the CUDA device without
- * passing the variable `acc` to each function. This is only needed during the
- * porting phase to allow the full feature set of the plain PMacc and PIConGPU
- * CUDA version if the accelerator is CUDA.
- */
-#   undef blockIdx
-#   undef __syncthreads
-#   undef threadIdx
-#   undef gridDim
-#   undef blockDim
-#   undef uint3
-
-#endif
+#include <cupla.hpp>
 
 #include "pmacc/debug/PMaccVerbose.hpp"
 #include "pmacc/ppFunctions.hpp"
@@ -81,8 +65,7 @@
 
 namespace pmacc
 {
+    namespace bmpl = boost::mpl;
+    namespace bfs = boost::filesystem;
 
-namespace bmpl = boost::mpl;
-namespace bfs = boost::filesystem;
-
-} //namespace pmacc
+} // namespace pmacc
diff --git a/include/pmacc/verify.hpp b/include/pmacc/verify.hpp
index d221aaf91b..c3f59e7e43 100644
--- a/include/pmacc/verify.hpp
+++ b/include/pmacc/verify.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2016-2020 Rene Widera
+/* Copyright 2016-2021 Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -30,8 +30,7 @@
  *
  * @param expr expression to be evaluated
  */
-#define PMACC_VERIFY( expr )                                                   \
-    ( !!(expr) ) ? ( (void) 0 ) : pmacc::abortWithError( #expr, __FILE__, __LINE__ )
+#define PMACC_VERIFY(expr) (!!(expr)) ? ((void) 0) : pmacc::abortWithError(#expr, __FILE__, __LINE__)
 
 /** verify expression with message
  *
@@ -41,5 +40,4 @@
  * @param msg output message (of type `std::string`) which is printed if the
  *            expression is evaluated to false
  */
-#define PMACC_VERIFY_MSG( expr, msg )                                          \
-    ( !!(expr) ) ? ( (void) 0 ) : pmacc::abortWithError( #expr, __FILE__, __LINE__, msg )
+#define PMACC_VERIFY_MSG(expr, msg) (!!(expr)) ? ((void) 0) : pmacc::abortWithError(#expr, __FILE__, __LINE__, msg)
diff --git a/lib/python/picongpu/input/parameters.py b/lib/python/picongpu/input/parameters.py
index fd554c5d52..8dc174dd66 100755
--- a/lib/python/picongpu/input/parameters.py
+++ b/lib/python/picongpu/input/parameters.py
@@ -1,7 +1,7 @@
 """
 This file is part of PIConGPU.
 
-Copyright 2017-2020 PIConGPU contributors
+Copyright 2017-2021 PIConGPU contributors
 Authors: Sebastian Starke, Jeffrey Kelling
 License: GPLv3+
 """
diff --git a/lib/python/picongpu/plugins/data/XrayScatteringData.py b/lib/python/picongpu/plugins/data/XrayScatteringData.py
new file mode 100644
index 0000000000..087d2a4ae8
--- /dev/null
+++ b/lib/python/picongpu/plugins/data/XrayScatteringData.py
@@ -0,0 +1,101 @@
+"""
+This file is part of the PIConGPU.
+
+Copyright 2017-2021 PIConGPU contributors
+Authors: Pawel Ordyna
+License: GPLv3+
+"""
+from .base_reader import DataReader
+
+from os import path
+import numpy as np
+import openpmd_api as api
+
+
+class XrayScatteringData(DataReader):
+    """ Data reader for the xrayScattering plugin. """
+
+    def __init__(self, run_directory, species, file_extension='bp',
+                 file_name_base='Output'):
+        """
+        Parameters
+        ----------
+        run_directory : string
+            path to the run directory of PIConGPU
+            (the path before ``simOutput/``)
+        species : string
+            Species for which the plugin output should be loaded. It's the
+            string defined in `speciesDefinition.param`.
+        file_extension : string
+            file extension of the xrayScattering output file.
+            Default is "bp".
+        file_name_base : string
+            String name set in the xrayScattering command line parameter
+            fileName. Default is "Output".
+            The full file name is
+
+            ::
+                `<species>_xrayScattering<file_name_base>.<file_extension>`
+        """
+
+        super().__init__(run_directory)
+
+        self.full_file_name = (species + "_xrayScattering" + file_name_base +
+                               "." + file_extension)
+
+        self.full_path = path.join(self.run_directory,
+                                   "simOutput/xrayScatteringOutput")
+        self.full_path = path.join(self.full_path, self.full_file_name)
+        # openPMD series
+        self.series = api.Series(self.full_path, api.Access_Type.read_only)
+        self.total_simulation_cells = self.series.get_attribute(
+            "totalSimulationCells")
+
+    def get_data_path(self, **kwargs):
+        """
+        Returns
+        -------
+        A string with the path to the underlying data file.
+        """
+        return self.full_path
+
+    def get_iterations(self, **kwargs):
+        """
+        Returns
+        -------
+        An array with unsigned integers of iterations for which
+        data is available.
+        """
+        return np.array(list(self.series.iterations))
+
+    def _get_for_iteration(self, iteration, **kwargs):
+        """ Get the data for a given iteration in PIC units.
+
+        Call `get_unit` method to get the conversion factor (to SI).
+
+        Returns
+        -------
+        The complex scattering amplitude in PIC units.
+        """
+
+        i = self.series.iterations[iteration]
+        amplitude = i.meshes['amplitude']
+        mrc_real, mrc_imag = amplitude['x'], amplitude['y']
+        real = mrc_real.load_chunk()
+        imag = mrc_imag.load_chunk()
+        self.series.flush()
+        if mrc_imag.dtype.type is np.float32:
+            dtype = np.complex64
+        elif mrc_imag.dtype.type is np.float64:
+            dtype = np.complex128
+        else:
+            raise TypeError
+        result = (real + 1j * imag) * self.total_simulation_cells
+        return result.astype(dtype)
+
+    def get_unit(self):
+        """ Get the amplitude unit. """
+        i = self.series.iterations[self.get_iterations()[0]]
+        amplitude = i.meshes['amplitude']
+        mrc_real = amplitude['x']
+        return mrc_real.unit_SI
diff --git a/lib/python/picongpu/plugins/data/__init__.py b/lib/python/picongpu/plugins/data/__init__.py
index 3c85f5f986..d9d5d006c4 100644
--- a/lib/python/picongpu/plugins/data/__init__.py
+++ b/lib/python/picongpu/plugins/data/__init__.py
@@ -5,6 +5,7 @@
 from .sliceFieldReader import FieldSliceData
 from .emittance import EmittanceData
 from .transitionradiation import TransitionRadiationData
+from .XrayScatteringData import XrayScatteringData
 
 __all__ = [
     "EnergyHistogramData",
@@ -14,4 +15,5 @@
     "FieldSliceData",
     "EmittanceData",
     "TransitionRadiationData",
+    "XrayScatteringData"
 ]
diff --git a/lib/python/picongpu/plugins/data/base_reader.py b/lib/python/picongpu/plugins/data/base_reader.py
index 4d869e1189..39c8a19bf4 100644
--- a/lib/python/picongpu/plugins/data/base_reader.py
+++ b/lib/python/picongpu/plugins/data/base_reader.py
@@ -1,7 +1,7 @@
 """
 This file is part of the PIConGPU.
 
-Copyright 2017-2020 PIConGPU contributors
+Copyright 2017-2021 PIConGPU contributors
 Authors: Sebastian Starke
 License: GPLv3+
 """
@@ -38,7 +38,7 @@ def get_dt(self):
         """
         return self.find_time.get_dt()
 
-    def get_times(self, **kwargs):
+    def get_times(self, *args, **kwargs):
         """
         Returns
         -------
@@ -46,10 +46,10 @@ def get_times(self, **kwargs):
         data is available
         """
 
-        iterations = np.array(self.get_iterations(**kwargs))
+        iterations = np.array(self.get_iterations(*args, **kwargs))
         return self.find_time.get_time(iterations)
 
-    def get_data_path(self, **kwargs):
+    def get_data_path(self, *args, **kwargs):
         """
         Returns
         -------
@@ -57,7 +57,7 @@ def get_data_path(self, **kwargs):
         """
         raise NotImplementedError
 
-    def get_iterations(self, **kwargs):
+    def get_iterations(self, *args, **kwargs):
         """
         Returns
         -------
@@ -66,7 +66,7 @@ def get_iterations(self, **kwargs):
         """
         raise NotImplementedError
 
-    def get(self, **kwargs):
+    def get(self, *args, **kwargs):
         """
         Parameters
         ----------
@@ -74,6 +74,10 @@ def get(self, **kwargs):
         If both are given, the 'time' argument is converted to
         an iteration and data for the iteration matching the time
         is returned.
+        For other valid args and kwargs, please look at the
+        documentation of the '_get_for_iteration' methods
+        of the derived classes since the parameters are passed
+        on to that function.
 
         time: float or np.array of float or None.
             If None, data for all available times is returned.
@@ -103,15 +107,15 @@ def get(self, **kwargs):
             time = kwargs.pop('time')
             if time is None:
                 # use all times that are available, i.e. all iterations
-                iteration = self.get_iterations(**kwargs)
+                iteration = self.get_iterations(*args, **kwargs)
             else:
                 iteration = self.find_time.get_iteration(
                     time, method='closest')
             # print("got 'time'=", time, ", converted to iter", iteration)
 
-        return self._get_for_iteration(iteration, **kwargs)
+        return self._get_for_iteration(iteration, *args, **kwargs)
 
-    def _get_for_iteration(self, iteration, **kwargs):
+    def _get_for_iteration(self, iteration, *args, **kwargs):
         """
         Get the data for a given iteration.
 
diff --git a/lib/python/picongpu/plugins/data/emittance.py b/lib/python/picongpu/plugins/data/emittance.py
index 830b556a74..2ab0778a42 100644
--- a/lib/python/picongpu/plugins/data/emittance.py
+++ b/lib/python/picongpu/plugins/data/emittance.py
@@ -1,7 +1,7 @@
 """
 This file is part of the PIConGPU.
 
-Copyright 2017-2020 PIConGPU contributors
+Copyright 2017-2021 PIConGPU contributors
 Authors: Sophie Rudat, Axel Huebl
 License: GPLv3+
 """
diff --git a/lib/python/picongpu/plugins/data/energy_histogram.py b/lib/python/picongpu/plugins/data/energy_histogram.py
index ac6e29baff..0ea0d23265 100644
--- a/lib/python/picongpu/plugins/data/energy_histogram.py
+++ b/lib/python/picongpu/plugins/data/energy_histogram.py
@@ -1,7 +1,7 @@
 """
 This file is part of the PIConGPU.
 
-Copyright 2017-2020 PIConGPU contributors
+Copyright 2017-2021 PIConGPU contributors
 Authors: Axel Huebl
 License: GPLv3+
 """
diff --git a/lib/python/picongpu/plugins/data/phase_space.py b/lib/python/picongpu/plugins/data/phase_space.py
index b8f6334835..5bad6fcf15 100644
--- a/lib/python/picongpu/plugins/data/phase_space.py
+++ b/lib/python/picongpu/plugins/data/phase_space.py
@@ -1,7 +1,7 @@
 """
 This file is part of the PIConGPU.
 
-Copyright 2017-2020 PIConGPU contributors
+Copyright 2017-2021 PIConGPU contributors
 Authors: Axel Huebl
 License: GPLv3+
 """
@@ -10,9 +10,7 @@
 import collections
 import numpy as np
 import os
-import glob
-import re
-import h5py as h5
+import openpmd_api as io
 
 
 class PhaseSpaceMeta(object):
@@ -71,10 +69,9 @@ def __init__(self, run_directory):
         super().__init__(run_directory)
 
         self.data_file_prefix = "PhaseSpace_{0}_{1}_{2}_{3}"
-        self.data_file_suffix = ".h5"
         self.data_hdf5_path = "/data/{0}/{1}"
 
-    def get_data_path(self, ps, species, species_filter="all", iteration=None):
+    def get_data_path(self, ps, species, species_filter="all", file_ext="h5"):
         """
         Return the path to the underlying data file.
 
@@ -89,19 +86,16 @@ def get_data_path(self, ps, species, species_filter="all", iteration=None):
         species_filter: string
             name of the particle species filter, default is 'all'
             (defined in ``particleFilters.param``)
-        iteration : (unsigned) int or list of int [unitless]
-            The iteration at which to read the data.
-            If 'None', a regular expression string matching
-            all iterations will be returned.
+        file_ext: string
+            filename extension for openPMD backend
+            default is 'h5' for the HDF5 backend
 
         Returns
         -------
-        A string with a file path and a string with a in-file HDF5 path if
-        iteration is a single value or a list of length one.
-        If iteration is a list of length > 1, a list of paths is returned.
-        If iteration is None, only the first string is returned and contains a
-        regex-* for the position iteration.
+        A string with the full openPMD file path pattern for loading from
+        a file-based iteration layout.
         """
+        # @todo different file extensions?
         if species is None:
             raise ValueError('The species parameter can not be None!')
         if species_filter is None:
@@ -123,45 +117,17 @@ def get_data_path(self, ps, species, species_filter="all", iteration=None):
                           'Did the simulation already run?'
                           .format(self.run_directory))
 
-        if iteration is not None:
-            if not isinstance(iteration, collections.Iterable):
-                iteration = [iteration]
-
-            ret = []
-            for it in iteration:
-                data_file_name = self.data_file_prefix.format(
-                    species,
-                    species_filter,
-                    ps,
-                    str(it)) + self.data_file_suffix
-                data_file_path = os.path.join(output_dir, data_file_name)
-
-                if not os.path.isfile(data_file_path):
-                    raise IOError('The file {} does not exist.\n'
-                                  'Did the simulation already run?'
-                                  .format(data_file_path))
-
-                data_hdf5_name = self.data_hdf5_path.format(
-                    it,
-                    ps)
-
-                ret.append((data_file_path, data_hdf5_name))
-            if len(iteration) == 1:
-                return ret[0]
-            else:
-                return ret
-        else:
-            iteration_str = "*"
-
-            data_file_name = self.data_file_prefix.format(
-                species,
-                species_filter,
-                ps,
-                iteration_str
-            ) + self.data_file_suffix
-            return os.path.join(output_dir, data_file_name)
-
-    def get_iterations(self, ps, species, species_filter='all'):
+        iteration_str = "%T"
+        data_file_name = self.data_file_prefix.format(
+            species,
+            species_filter,
+            ps,
+            iteration_str
+        ) + '.' + file_ext
+        return os.path.join(output_dir, data_file_name)
+
+    def get_iterations(self, ps, species, species_filter='all',
+                       file_ext="h5"):
         """
         Return an array of iterations with available data.
 
@@ -176,32 +142,25 @@ def get_iterations(self, ps, species, species_filter='all'):
         species_filter: string
             name of the particle species filter, default is 'all'
             (defined in ``particleFilters.param``)
+        file_ext: string
+            filename extension for openPMD backend
+            default is 'h5' for the HDF5 backend
 
         Returns
         -------
         An array with unsigned integers.
         """
         # get the regular expression matching all available files
-        data_file_path = self.get_data_path(ps, species, species_filter)
-
-        matching_files = glob.glob(data_file_path)
-        re_it = re.compile(data_file_path.replace("*", "([0-9]+)"))
-
-        iterations = np.array(
-            sorted(
-                map(
-                    lambda file_path:
-                    np.uint64(re_it.match(file_path).group(1)),
-                    matching_files
-                )
-            ),
-            dtype=np.uint64
-        )
+        data_file_path = self.get_data_path(ps, species, species_filter,
+                                            file_ext=file_ext)
+
+        series = io.Series(data_file_path, io.Access.read_only)
+        iterations = [key for key, _ in series.iterations.items()]
 
         return iterations
 
     def _get_for_iteration(self, iteration, ps, species, species_filter='all',
-                           **kwargs):
+                           file_ext="h5", **kwargs):
         """
         Get a phase space histogram.
 
@@ -219,6 +178,9 @@ def _get_for_iteration(self, iteration, ps, species, species_filter='all',
         species_filter: string
             name of the particle species filter, default is 'all'
             (defined in ``particleFilters.param``)
+        file_ext: string
+            filename extension for openPMD backend
+            default is 'h5' for the HDF5 backend
 
         Returns
         -------
@@ -231,8 +193,11 @@ def _get_for_iteration(self, iteration, ps, species, species_filter='all',
         containing ps and ps_meta for each requested iteration.
         If a single iteration is requested, return the tuple (ps, ps_meta).
         """
-        available_iterations = self.get_iterations(
-            ps, species, species_filter)
+
+        data_file_path = self.get_data_path(ps, species, species_filter,
+                                            file_ext=file_ext)
+        series = io.Series(data_file_path, io.Access.read_only)
+        available_iterations = [key for key, _ in series.iterations.items()]
 
         if iteration is not None:
             if not isinstance(iteration, collections.Iterable):
@@ -247,28 +212,25 @@ def _get_for_iteration(self, iteration, ps, species, species_filter='all',
             iteration = available_iterations
 
         ret = []
-        for it in iteration:
-            data_file_path, data_hdf5_name = self.get_data_path(
-                ps,
-                species,
-                species_filter,
-                it)
-
-            f = h5.File(data_file_path, 'r')
-            ps_data = f[data_hdf5_name]
+        for index in iteration:
+            it = series.iterations[index]
+            dataset_name = "{}_{}_{}".format(species, species_filter, ps)
+            mesh = it.meshes[dataset_name]
+            ps_data = mesh[io.Mesh_Record_Component.SCALAR]
 
             # all in SI
-            dV = ps_data.attrs['dV'] * ps_data.attrs['dr_unit']**3
-            unitSI = ps_data.attrs['sim_unit']
-            p_range = ps_data.attrs['p_unit'] * \
-                np.array([ps_data.attrs['p_min'], ps_data.attrs['p_max']])
-
-            mv_start = ps_data.attrs['movingWindowOffset']
-            mv_end = mv_start + ps_data.attrs['movingWindowSize']
+            dV = mesh.get_attribute('dV') * mesh.get_attribute('dr')**3
+            unitSI = mesh.get_attribute('sim_unit')
+            p_range = mesh.get_attribute('p_unit') * \
+                np.array(
+                    [mesh.get_attribute('p_min'), mesh.get_attribute('p_max')])
+
+            mv_start = mesh.get_attribute('movingWindowOffset')
+            mv_end = mv_start + mesh.get_attribute('movingWindowSize')
             #                2D histogram:         0 (r_i); 1 (p_i)
-            spatial_offset = ps_data.attrs['_global_start'][1]
+            spatial_offset = mesh.get_attribute('_global_start')[0]
 
-            dr = ps_data.attrs['dr'] * ps_data.attrs['dr_unit']
+            dr = mesh.get_attribute('dr') * mesh.get_attribute('dr_unit')
 
             r_range_cells = np.array([mv_start, mv_end]) + spatial_offset
             r_range = r_range_cells * dr
@@ -278,7 +240,7 @@ def _get_for_iteration(self, iteration, ps, species, species_filter='all',
             # cut out the current window & scale by unitSI
             ps_cut = ps_data[mv_start:mv_end, :] * unitSI
 
-            f.close()
+            it.close()
 
             ps_meta = PhaseSpaceMeta(
                 species, species_filter, ps, ps_cut.shape, extent, dV)
diff --git a/lib/python/picongpu/plugins/data/png.py b/lib/python/picongpu/plugins/data/png.py
index 79e41ef17f..f391f5e972 100644
--- a/lib/python/picongpu/plugins/data/png.py
+++ b/lib/python/picongpu/plugins/data/png.py
@@ -1,7 +1,7 @@
 """
 This file is part of the PIConGPU.
 
-Copyright 2017-2020 PIConGPU contributors
+Copyright 2017-2021 PIConGPU contributors
 Authors: Sebastian Starke
 License: GPLv3+
 """
@@ -9,8 +9,9 @@
 
 import numpy as np
 import os
-from scipy import misc
 import collections
+from imageio import imread
+
 
 SPECIES_LONG_NAMES = {
     'e': 'Electrons'
@@ -196,7 +197,7 @@ def _get_for_iteration(self, iteration, species, species_filter='all',
             # iteration is None, so we use all available data
             iteration = available_iterations
 
-        imgs = [misc.imread(
+        imgs = [imread(
             self.get_data_path(species, species_filter, axis,
                                slice_point, it)) for it in iteration]
 
diff --git a/lib/python/picongpu/plugins/data/radiation.py b/lib/python/picongpu/plugins/data/radiation.py
index 988e300f62..639a7f1dbd 100644
--- a/lib/python/picongpu/plugins/data/radiation.py
+++ b/lib/python/picongpu/plugins/data/radiation.py
@@ -1,4 +1,4 @@
-# Copyright 2016-2020 Richard Pausch
+# Copyright 2016-2021 Richard Pausch
 #
 # This file is part of PIConGPU.
 #
diff --git a/lib/python/picongpu/plugins/data/requirements.txt b/lib/python/picongpu/plugins/data/requirements.txt
index 515602f5a6..305e86a761 100644
--- a/lib/python/picongpu/plugins/data/requirements.txt
+++ b/lib/python/picongpu/plugins/data/requirements.txt
@@ -2,4 +2,5 @@ numpy
 pandas>=0.21.0
 h5py
 pillow
-scipy
+imageio
+openPMD-api>=0.10.3
diff --git a/lib/python/picongpu/plugins/data/sliceFieldReader.py b/lib/python/picongpu/plugins/data/sliceFieldReader.py
index 68b526f85b..da658a504d 100644
--- a/lib/python/picongpu/plugins/data/sliceFieldReader.py
+++ b/lib/python/picongpu/plugins/data/sliceFieldReader.py
@@ -1,4 +1,4 @@
-# Copyright 2014-2020 Richard Pausch, Klaus Steiniger
+# Copyright 2014-2021 Richard Pausch, Klaus Steiniger
 #
 # This file is part of PIConGPU.
 #
diff --git a/lib/python/picongpu/plugins/data/transitionradiation.py b/lib/python/picongpu/plugins/data/transitionradiation.py
index b77b6b0c7f..db88d9e28d 100644
--- a/lib/python/picongpu/plugins/data/transitionradiation.py
+++ b/lib/python/picongpu/plugins/data/transitionradiation.py
@@ -1,7 +1,7 @@
 """
 This file is part of the PIConGPU.
 
-Copyright 2017-2020 PIConGPU contributors
+Copyright 2017-2021 PIConGPU contributors
 Authors: Axel Huebl, Finn-Ole Carstens
 License: GPLv3+
 """
diff --git a/lib/python/picongpu/plugins/jupyter_widgets/base_widget.py b/lib/python/picongpu/plugins/jupyter_widgets/base_widget.py
index 903035db01..75c9fd2e0e 100644
--- a/lib/python/picongpu/plugins/jupyter_widgets/base_widget.py
+++ b/lib/python/picongpu/plugins/jupyter_widgets/base_widget.py
@@ -1,7 +1,7 @@
 """
 This file is part of the PIConGPU.
 
-Copyright 2017-2020 PIConGPU contributors
+Copyright 2017-2021 PIConGPU contributors
 Authors: Sebastian Starke
 License: GPLv3+
 """
@@ -166,6 +166,14 @@ def _show_run_dir_options_in_dropdown(self):
             self._handle_run_dir_selection_callback, names='value')
         # set the UI
         self.sim_drop.options = sim_options
+        # don't select a value yet but leave it to the user.
+        # this needs to be handled differently
+        # for single and multi selection
+        if isinstance(self.sim_drop, widgets.Dropdown):
+            self.sim_drop.value = None
+        else:
+            # we assume widgets.SelectMultiple instance here
+            self.sim_drop.value = ()
         # re-enable the callback functions
         self.sim_drop.observe(
             self._handle_run_dir_selection_callback, names='value')
@@ -187,6 +195,20 @@ def set_run_dir_options(self, run_dir_options):
         # set the options in the dropdown
         self._show_run_dir_options_in_dropdown()
 
+        # clear the ax (this is done by the current plot_mpl instance)
+        self._clean_ax()
+
+        # create a fresh plot_mpl object since the old
+        # one had some run directories which are outdated now
+        plot_mpl_class = type(self.plot_mpl)
+        self.plot_mpl = plot_mpl_class(
+            run_directories=None,
+            ax=self.ax)
+
+        # the user has not yet chosen any simulation
+        # so we do not know yet which times are available
+        self.sim_time_slider.options = ('',)
+
     def _init_fig_and_ax(self, fig, **kwargs):
         """
         Creates the figure and the ax as members.
@@ -381,10 +403,6 @@ def visualize(self, **kwargs):
         if time is None or time == "":
             return
 
-        # print("{} called visualize for time {} and run_dirs {}".format(
-        #     type(self), time,
-        #     [reader.run_directory for reader in self.plot_mpl.data_reader]))
-
         vis_params = self._get_widget_args()
         try:
             self.plot_mpl.visualize(time=time,
@@ -396,13 +414,16 @@ def visualize(self, **kwargs):
         # since interactive mode should be turned off, we have
         # to update the figure explicitely
         try:
-            self.fig.canvas.draw()
-            self.fig.canvas.flush_events()
+            self.update_plot()
         except ValueError as e:
             warn("{}: drawing the plot failed! Reason: {}".format(
                 type(self), e))
             # raise e
 
+    def update_plot(self):
+        self.fig.canvas.draw()
+        self.fig.canvas.flush_events()
+
     def _make_drop_val_compatible(self, val):
         """
         Depending on the type of self.sim_drop we have to
@@ -482,3 +503,5 @@ def _use_options_from_other(self, other):
     @capture_output
     def _clean_ax(self):
         self.plot_mpl._clean_ax()
+        # refresh the figure since we are not in interactive mode
+        self.update_plot()
diff --git a/lib/python/picongpu/plugins/jupyter_widgets/energy_histogram_widget.py b/lib/python/picongpu/plugins/jupyter_widgets/energy_histogram_widget.py
index 73e3581e84..2749733666 100644
--- a/lib/python/picongpu/plugins/jupyter_widgets/energy_histogram_widget.py
+++ b/lib/python/picongpu/plugins/jupyter_widgets/energy_histogram_widget.py
@@ -1,7 +1,7 @@
 """
 This file is part of the PIConGPU.
 
-Copyright 2017-2020 PIConGPU contributors
+Copyright 2017-2021 PIConGPU contributors
 Authors: Sebastian Starke
 License: GPLv3+
 """
diff --git a/lib/python/picongpu/plugins/jupyter_widgets/phase_space_widget.py b/lib/python/picongpu/plugins/jupyter_widgets/phase_space_widget.py
index 457406fd29..e8081acbfd 100644
--- a/lib/python/picongpu/plugins/jupyter_widgets/phase_space_widget.py
+++ b/lib/python/picongpu/plugins/jupyter_widgets/phase_space_widget.py
@@ -1,7 +1,7 @@
 """
 This file is part of the PIConGPU.
 
-Copyright 2017-2020 PIConGPU contributors
+Copyright 2017-2021 PIConGPU contributors
 Authors: Sebastian Starke
 License: GPLv3+
 """
diff --git a/lib/python/picongpu/plugins/jupyter_widgets/png_widget.py b/lib/python/picongpu/plugins/jupyter_widgets/png_widget.py
index 79ec875114..9d8f0787ac 100644
--- a/lib/python/picongpu/plugins/jupyter_widgets/png_widget.py
+++ b/lib/python/picongpu/plugins/jupyter_widgets/png_widget.py
@@ -1,7 +1,7 @@
 """
 This file is part of the PIConGPU.
 
-Copyright 2017-2020 PIConGPU contributors
+Copyright 2017-2021 PIConGPU contributors
 Authors: Sebastian Starke
 License: GPLv3+
 """
diff --git a/lib/python/picongpu/plugins/jupyter_widgets/utils.py b/lib/python/picongpu/plugins/jupyter_widgets/utils.py
index df2a6ade42..2137b61aa5 100644
--- a/lib/python/picongpu/plugins/jupyter_widgets/utils.py
+++ b/lib/python/picongpu/plugins/jupyter_widgets/utils.py
@@ -1,7 +1,7 @@
 """
 This file is part of the PIConGPU.
 
-Copyright 2017-2020 PIConGPU contributors
+Copyright 2017-2021 PIConGPU contributors
 Authors: Sebastian Starke
 License: GPLv3+
 """
diff --git a/lib/python/picongpu/plugins/plot_mpl/base_visualizer.py b/lib/python/picongpu/plugins/plot_mpl/base_visualizer.py
index 184e5392aa..70efd494e4 100644
--- a/lib/python/picongpu/plugins/plot_mpl/base_visualizer.py
+++ b/lib/python/picongpu/plugins/plot_mpl/base_visualizer.py
@@ -1,7 +1,7 @@
 """
 This file is part of the PIConGPU.
 
-Copyright 2017-2020 PIConGPU contributors
+Copyright 2017-2021 PIConGPU contributors
 Authors: Sebastian Starke
 License: GPLv3+
 """
diff --git a/lib/python/picongpu/plugins/plot_mpl/emittance_evolution_visualizer.py b/lib/python/picongpu/plugins/plot_mpl/emittance_evolution_visualizer.py
index 5ba8cc7227..f96eacf29a 100644
--- a/lib/python/picongpu/plugins/plot_mpl/emittance_evolution_visualizer.py
+++ b/lib/python/picongpu/plugins/plot_mpl/emittance_evolution_visualizer.py
@@ -1,7 +1,7 @@
 """
 This file is part of the PIConGPU.
 
-Copyright 2017-2020 PIConGPU contributors
+Copyright 2017-2021 PIConGPU contributors
 Authors: Sophie Rudat, Sebastian Starke
 License: GPLv3+
 """
diff --git a/lib/python/picongpu/plugins/plot_mpl/energy_histogram_visualizer.py b/lib/python/picongpu/plugins/plot_mpl/energy_histogram_visualizer.py
index 80a3941493..7c105e1270 100644
--- a/lib/python/picongpu/plugins/plot_mpl/energy_histogram_visualizer.py
+++ b/lib/python/picongpu/plugins/plot_mpl/energy_histogram_visualizer.py
@@ -1,14 +1,16 @@
 """
 This file is part of the PIConGPU.
 
-Copyright 2017-2020 PIConGPU contributors
+Copyright 2017-2021 PIConGPU contributors
 Authors: Sebastian Starke
 License: GPLv3+
 """
+import numpy as np
 
 from picongpu.plugins.data import EnergyHistogramData
 from picongpu.plugins.plot_mpl.base_visualizer import Visualizer as\
     BaseVisualizer
+from warnings import warn
 
 
 class Visualizer(BaseVisualizer):
@@ -44,6 +46,12 @@ def _create_plt_obj(self, idx):
 
         counts, bins, iteration, dt = self.data[idx]
         label = self.sim_labels[idx]
+
+        if np.all(counts == 0.):
+            warn("All counts were 0 for {}. ".format(label) +
+                 "No log-plot can be created!")
+            return
+
         self.plt_obj[idx] = self.ax.semilogy(
             bins, counts, nonposy='clip', label=label,
             color=self.colors[idx])[0]
@@ -53,6 +61,13 @@ def _update_plt_obj(self, idx):
         Implementation of base class function.
         """
         counts, bins, iteration, dt = self.data[idx]
+        label = self.sim_labels[idx]
+
+        if np.all(counts == 0.):
+            warn("All counts were 0 for {}. ".format(label) +
+                 "Log-plot will not be updated!")
+            return
+
         self.plt_obj[idx].set_data(bins, counts)
 
     def visualize(self, **kwargs):
diff --git a/lib/python/picongpu/plugins/plot_mpl/energy_waterfall_visualizer.py b/lib/python/picongpu/plugins/plot_mpl/energy_waterfall_visualizer.py
index 51ccbf1076..9627bb6ab7 100644
--- a/lib/python/picongpu/plugins/plot_mpl/energy_waterfall_visualizer.py
+++ b/lib/python/picongpu/plugins/plot_mpl/energy_waterfall_visualizer.py
@@ -1,7 +1,7 @@
 """
 This file is part of the PIConGPU.
 
-Copyright 2017-2020 PIConGPU contributors
+Copyright 2017-2021 PIConGPU contributors
 Authors: Sophie Rudat, Sebastian Starke
 License: GPLv3+
 """
diff --git a/lib/python/picongpu/plugins/plot_mpl/phase_space_visualizer.py b/lib/python/picongpu/plugins/plot_mpl/phase_space_visualizer.py
index 344d209d1c..4c5cc163c1 100644
--- a/lib/python/picongpu/plugins/plot_mpl/phase_space_visualizer.py
+++ b/lib/python/picongpu/plugins/plot_mpl/phase_space_visualizer.py
@@ -1,7 +1,7 @@
 """
 This file is part of the PIConGPU.
 
-Copyright 2017-2020 PIConGPU contributors
+Copyright 2017-2021 PIConGPU contributors
 Authors: Sebastian Starke
 License: GPLv3+
 """
@@ -191,6 +191,9 @@ def visualize(self, **kwargs):
             ps : string
                 phase space selection in order: spatial, momentum component,
                 e.g. 'ypy' or 'ypx'
+            file_ext: string
+                filename extension for openPMD backend
+                default is 'h5' for the HDF5 backend
         """
         super().visualize(**kwargs)
 
diff --git a/lib/python/picongpu/plugins/plot_mpl/png_visualizer.py b/lib/python/picongpu/plugins/plot_mpl/png_visualizer.py
index a410970f18..bae668838d 100644
--- a/lib/python/picongpu/plugins/plot_mpl/png_visualizer.py
+++ b/lib/python/picongpu/plugins/plot_mpl/png_visualizer.py
@@ -1,7 +1,7 @@
 """
 This file is part of the PIConGPU.
 
-Copyright 2017-2020 PIConGPU contributors
+Copyright 2017-2021 PIConGPU contributors
 Authors: Sebastian Starke
 License: GPLv3+
 """
diff --git a/lib/python/picongpu/plugins/plot_mpl/slice_emittance_visualizer.py b/lib/python/picongpu/plugins/plot_mpl/slice_emittance_visualizer.py
index 6cf06586f6..635109bf16 100644
--- a/lib/python/picongpu/plugins/plot_mpl/slice_emittance_visualizer.py
+++ b/lib/python/picongpu/plugins/plot_mpl/slice_emittance_visualizer.py
@@ -1,7 +1,7 @@
 """
 This file is part of the PIConGPU.
 
-Copyright 2017-2020 PIConGPU contributors
+Copyright 2017-2021 PIConGPU contributors
 Authors: Sophie Rudat, Sebastian Starke
 License: GPLv3+
 """
diff --git a/lib/python/picongpu/plugins/plot_mpl/slice_emittance_waterfall_visualizer.py b/lib/python/picongpu/plugins/plot_mpl/slice_emittance_waterfall_visualizer.py
index 911f491320..84d58f48ee 100644
--- a/lib/python/picongpu/plugins/plot_mpl/slice_emittance_waterfall_visualizer.py
+++ b/lib/python/picongpu/plugins/plot_mpl/slice_emittance_waterfall_visualizer.py
@@ -1,7 +1,7 @@
 """
 This file is part of the PIConGPU.
 
-Copyright 2017-2020 PIConGPU contributors
+Copyright 2017-2021 PIConGPU contributors
 Authors: Sophie Rudat, Sebastian Starke
 License: GPLv3+
 """
diff --git a/lib/python/picongpu/utils/field_ionization.py b/lib/python/picongpu/utils/field_ionization.py
index 54c9d2a286..e966c194d8 100755
--- a/lib/python/picongpu/utils/field_ionization.py
+++ b/lib/python/picongpu/utils/field_ionization.py
@@ -1,7 +1,7 @@
 """Field ionization models implemented in PIConGPU.
 
 This file is part of the PIConGPU.
-Copyright 2019-2020 PIConGPU contributors
+Copyright 2019-2021 PIConGPU contributors
 Authors: Marco Garten
 License: GPLv3+
 """
diff --git a/lib/python/picongpu/utils/find_time.py b/lib/python/picongpu/utils/find_time.py
index 042ffa8b24..6a46d58059 100644
--- a/lib/python/picongpu/utils/find_time.py
+++ b/lib/python/picongpu/utils/find_time.py
@@ -1,7 +1,7 @@
 """
 This file is part of the PIConGPU.
 
-Copyright 2017-2020 PIConGPU contributors
+Copyright 2017-2021 PIConGPU contributors
 Authors: Axel Huebl
 License: GPLv3+
 """
diff --git a/lib/python/picongpu/utils/memory_calculator.py b/lib/python/picongpu/utils/memory_calculator.py
index 3e97e4f554..e647d7b6c5 100644
--- a/lib/python/picongpu/utils/memory_calculator.py
+++ b/lib/python/picongpu/utils/memory_calculator.py
@@ -6,7 +6,7 @@
 It is supposed to give an estimate for the memory requirement of a PIConGPU
 simulation per device.
 
-Copyright 2018-2020 PIConGPU contributors
+Copyright 2018-2021 PIConGPU contributors
 Authors: Marco Garten, Sergei Bastrakov
 License: GPLv3+
 """
diff --git a/lib/python/picongpu/utils/param_parser.py b/lib/python/picongpu/utils/param_parser.py
index f0d972e371..77dbb70bc0 100644
--- a/lib/python/picongpu/utils/param_parser.py
+++ b/lib/python/picongpu/utils/param_parser.py
@@ -2,7 +2,7 @@
 """
 This file is part of the PIConGPU.
 
-Copyright 2017-2020 PIConGPU contributors
+Copyright 2017-2021 PIConGPU contributors
 Authors: Sebastian Starke
 License: GPLv3+
 """
diff --git a/libraryDependencies.png b/libraryDependencies.png
index 2f04308fc6..7904d86e71 100644
Binary files a/libraryDependencies.png and b/libraryDependencies.png differ
diff --git a/share/ci/bash.profile b/share/ci/bash.profile
new file mode 100755
index 0000000000..80443b74b5
--- /dev/null
+++ b/share/ci/bash.profile
@@ -0,0 +1,50 @@
+#!/bin/bash
+
+# setup dependencies for PIConGPU for CMake and runtime usage
+
+set -e
+set -o pipefail
+
+if [ -d "/opt/pngwriter" ] ; then
+  export PNGWRITER_ROOT=/opt/pngwriter/0.7.0
+else
+  # pngwriter is currently installed under / instead of /opt
+  export PNGWRITER_ROOT=/pngwriter/0.7.0
+fi
+export CMAKE_PREFIX_PATH=$PNGWRITER_ROOT:$CMAKE_PREFIX_PATH
+export LD_LIBRARY_PATH=$PNGWRITER_ROOT/lib:$LD_LIBRARY_PATH
+
+export HDF5_ROOT=/opt/hdf5/1.8.20/
+export LD_LIBRARY_PATH=$HDF5_ROOT/lib:$LD_LIBRARY_PATH
+
+export SPLASH_ROOT=/opt/libsplash/1.7.0
+export CMAKE_PREFIX_PATH=$SPLASH_ROOT:$CMAKE_PREFIX_PATH
+export LD_LIBRARY_PATH=$SPLASH_ROOT/lib:$LD_LIBRARY_PATH
+
+export ADIOS1_ROOT=/opt/adios/1.13.1
+export CMAKE_PREFIX_PATH=$ADIOS1_ROOT:$CMAKE_PREFIX_PATH
+export PATH=$ADIOS1_ROOT/bin:$PATH
+export LD_LIBRARY_PATH=$ADIOS1_ROOT/lib:$LD_LIBRARY_PATH
+
+export ADIOS2_ROOT=/opt/adios/2.6.0
+export CMAKE_PREFIX_PATH=$ADIOS2_ROOT:$CMAKE_PREFIX_PATH
+export PATH=$ADIOS2_ROOT/bin:$PATH
+export LD_LIBRARY_PATH=$ADIOS2_ROOT/lib:$LD_LIBRARY_PATH
+
+if [ -z "$DISABLE_ISAAC" ] ; then
+  export ICET_ROOT=/opt/icet/2.9.0
+  export CMAKE_PREFIX_PATH=$ICET_ROOT/lib:$CMAKE_PREFIX_PATH
+  export LD_LIBRARY_PATH=$ICET_ROOT/lib:$LD_LIBRARY_PATH
+
+  export JANSSON_ROOT=/opt/jansson/2.9.0/
+  export CMAKE_PREFIX_PATH=$JANSSON_ROOT/lib/cmake:$CMAKE_PREFIX_PATH
+  export LD_LIBRARY_PATH=$JANSSON_ROOT/lib:$LD_LIBRARY_PATH
+
+  export ISAAC_ROOT=/opt/isaac/1.6.0-dev
+  export CMAKE_PREFIX_PATH=$ISAAC_ROOT:$CMAKE_PREFIX_PATH
+  export LD_LIBRARY_PATH=$ISAAC_ROOT/lib:$LD_LIBRARY_PATH
+fi
+
+export OPENPMD_ROOT=/opt/openPMD-api/0.12.0-dev
+export CMAKE_PREFIX_PATH=$OPENPMD_ROOT:$CMAKE_PREFIX_PATH
+export LD_LIBRARY_PATH=$OPENPMD_ROOT/lib:$LD_LIBRARY_PATH
diff --git a/share/ci/check_cpp_code_style.sh b/share/ci/check_cpp_code_style.sh
new file mode 100755
index 0000000000..d86e802172
--- /dev/null
+++ b/share/ci/check_cpp_code_style.sh
@@ -0,0 +1,58 @@
+#!/bin/bash
+
+set -e
+set -o pipefail
+
+cd "$CI_PROJECT_DIR"
+
+# check code style with clang format; NUL-separated names, no run on empty input
+find include/ share/picongpu/ share/pmacc \( -iname "*.def" \
+  -o -iname "*.h" -o -iname "*.cpp" -o -iname "*.cu" \
+  -o -iname "*.hpp" -o -iname "*.tpp" -o -iname "*.kernel" \
+  -o -iname "*.loader" -o -iname "*.param" -o -iname "*.unitless" \
+  \) -print0 | xargs -0 -r clang-format-11 --dry-run --Werror
+
+#############################################################################
+# Conformance with Alpaka: Do not write __global__ CUDA kernels directly    #
+#############################################################################
+test/hasCudaGlobalKeyword include/pmacc
+test/hasCudaGlobalKeyword share/pmacc/examples
+test/hasCudaGlobalKeyword include/picongpu
+test/hasCudaGlobalKeyword share/picongpu/examples
+
+#############################################################################
+# Disallow end-of-line (EOL) white spaces                                   #
+#############################################################################
+test/hasEOLwhiteSpace
+
+#############################################################################
+# Disallow TABs, use white spaces                                           #
+#############################################################################
+test/hasTabs
+
+#############################################################################
+# Disallow non-ASCII in source files and scripts                            #
+#############################################################################
+test/hasNonASCII
+
+#############################################################################
+# Disallow spaces before pre-compiler macros                                #
+#############################################################################
+test/hasSpaceBeforePrecompiler
+
+#############################################################################
+# Enforce angle brackets <...> for includes of external library files       #
+#############################################################################
+test/hasExtLibIncludeBrackets include boost
+test/hasExtLibIncludeBrackets include alpaka
+test/hasExtLibIncludeBrackets include cupla
+test/hasExtLibIncludeBrackets include splash
+test/hasExtLibIncludeBrackets include mallocMC
+test/hasExtLibIncludeBrackets include/picongpu pmacc
+test/hasExtLibIncludeBrackets share/picongpu/examples pmacc
+test/hasExtLibIncludeBrackets share/picongpu/examples boost
+test/hasExtLibIncludeBrackets share/picongpu/examples alpaka
+test/hasExtLibIncludeBrackets share/picongpu/examples cupla
+test/hasExtLibIncludeBrackets share/picongpu/examples splash
+test/hasExtLibIncludeBrackets share/picongpu/examples mallocMC
+test/hasExtLibIncludeBrackets share/pmacc/examples pmacc
diff --git a/share/ci/compiler_clang.yml b/share/ci/compiler_clang.yml
new file mode 100644
index 0000000000..da3754a0b1
--- /dev/null
+++ b/share/ci/compiler_clang.yml
@@ -0,0 +1,17 @@
+################################################################################
+#   [clang++-X] : X = {4.0, 5.0, 6.0, 7, 8, 9, 10, 11}
+
+.base_clang:
+  image: registry.gitlab.com/hzdr/crp/alpaka-group-container/alpaka-ci-clang-pic:1.2
+  variables:
+    GIT_SUBMODULE_STRATEGY: normal
+  script:
+    - apt update
+    - apt install -y curl libjpeg-dev
+    - $CI_PROJECT_DIR/share/ci/git_merge.sh
+    - $CI_PROJECT_DIR/share/ci/bash.profile
+    - $CI_PROJECT_DIR/share/ci/run_pmacc_tests.sh
+    - $CI_PROJECT_DIR/share/ci/run_picongpu_tests.sh
+  # x86_64 tag is used to get a multi-core CPU for the tests
+  tags:
+    - x86_64
diff --git a/share/ci/compiler_clang_cuda.yml b/share/ci/compiler_clang_cuda.yml
new file mode 100644
index 0000000000..58caf4477d
--- /dev/null
+++ b/share/ci/compiler_clang_cuda.yml
@@ -0,0 +1,30 @@
+################################################################################
+#   [clang++-X] : X = {4.0, 5.0, 6.0, 7, 8, 9, 10, 11}
+# cuda9.2Clang is not supporting clang-7
+
+.base_cuda_clang:
+  variables:
+    GIT_SUBMODULE_STRATEGY: normal
+    PIC_CMAKE_ARGS: "-DALPAKA_CUDA_COMPILER=clang"  # use clang as CUDA compiler
+  script:
+    - apt update
+    - apt install -y curl libjpeg-dev
+    - $CI_PROJECT_DIR/share/ci/git_merge.sh
+    - $CI_PROJECT_DIR/share/ci/bash.profile
+    - $CI_PROJECT_DIR/share/ci/run_pmacc_tests.sh
+    - $CI_PROJECT_DIR/share/ci/run_picongpu_tests.sh
+  tags:
+    - cuda
+    - x86_64
+
+.base_clangCuda_cuda_9.2:
+  image: registry.gitlab.com/hzdr/crp/alpaka-group-container/alpaka-ci-cuda92-clangpic:1.2
+  extends: .base_cuda_clang
+
+.base_clangCuda_cuda_10.0:
+  image: registry.gitlab.com/hzdr/crp/alpaka-group-container/alpaka-ci-cuda100-clangpic:1.2
+  extends: .base_cuda_clang
+
+.base_clangCuda_cuda_10.1:
+  image: registry.gitlab.com/hzdr/crp/alpaka-group-container/alpaka-ci-cuda101-clangpic:1.2
+  extends: .base_cuda_clang
diff --git a/share/ci/compiler_gcc.yml b/share/ci/compiler_gcc.yml
new file mode 100644
index 0000000000..ad24aa7b8e
--- /dev/null
+++ b/share/ci/compiler_gcc.yml
@@ -0,0 +1,17 @@
+################################################################################
+#   [g++-X] : X = {5, 6, 7, 8, 9 ,10}
+
+.base_gcc:
+  image: registry.gitlab.com/hzdr/crp/alpaka-group-container/alpaka-ci-gcc-pic:1.2
+  variables:
+    GIT_SUBMODULE_STRATEGY: normal
+  script:
+    - apt update
+    - apt install -y curl libjpeg-dev
+    - $CI_PROJECT_DIR/share/ci/git_merge.sh
+    - $CI_PROJECT_DIR/share/ci/bash.profile
+    - $CI_PROJECT_DIR/share/ci/run_pmacc_tests.sh
+    - $CI_PROJECT_DIR/share/ci/run_picongpu_tests.sh
+  # x86_64 tag is used to get a multi-core CPU for the tests
+  tags:
+    - x86_64
diff --git a/share/ci/compiler_hipcc.yml b/share/ci/compiler_hipcc.yml
new file mode 100644
index 0000000000..ba720aa4d3
--- /dev/null
+++ b/share/ci/compiler_hipcc.yml
@@ -0,0 +1,28 @@
+################################################################################
+#   [clang-X] : X = {12}
+# clang compiler is located under /opt/rocm/llvm/bin
+
+.base_hipcc:
+  image: registry.gitlab.com/hzdr/crp/alpaka-group-container/alpaka-ci-rocm4.0-pic:1.2
+  variables:
+    GIT_SUBMODULE_STRATEGY: normal
+    PIC_CMAKE_ARGS: "-DALPAKA_HIP_ARCH=900 -DCMAKE_MODULE_PATH=/opt/rocm/hip/cmake"
+    # use VEGA64 GPU
+    HIP_VISIBLE_DEVICES: "2"
+    # ISAAC is not working with HIP
+    DISABLE_ISAAC: "yes"
+  script:
+    - export PATH="$PATH:/opt/rocm/llvm/bin/"
+    # rocm 4.0 container is missing a binary/symlink named `clang++-12`
+    - ln -s /opt/rocm/llvm/bin/clang++ /opt/rocm/llvm/bin/clang++-12
+    - rocm-smi
+    - hipcc --version
+    - apt update
+    - apt install -y curl libjpeg-dev
+    - $CI_PROJECT_DIR/share/ci/git_merge.sh
+    - $CI_PROJECT_DIR/share/ci/bash.profile
+    - $CI_PROJECT_DIR/share/ci/run_pmacc_tests.sh
+    - $CI_PROJECT_DIR/share/ci/run_picongpu_tests.sh
+  tags:
+    - amd
+    - rocm
diff --git a/share/ci/compiler_nvcc_cuda.yml b/share/ci/compiler_nvcc_cuda.yml
new file mode 100644
index 0000000000..9af605df63
--- /dev/null
+++ b/share/ci/compiler_nvcc_cuda.yml
@@ -0,0 +1,47 @@
+################################################################################
+#   [g++-X] : X = {5, 6, 7, 8, 9, 10}
+
+.base_nvcc:
+  variables:
+    GIT_SUBMODULE_STRATEGY: normal
+  # NOTE(review): jobs printed by n_wise_generator.py set their own before_script; confirm this list is not overridden
+  before_script:
+    - nvidia-smi
+    - nvcc --version
+  script:
+    - apt update
+    - apt install -y curl libjpeg-dev
+    - $CI_PROJECT_DIR/share/ci/git_merge.sh
+    - $CI_PROJECT_DIR/share/ci/bash.profile
+    - $CI_PROJECT_DIR/share/ci/run_pmacc_tests.sh
+    - $CI_PROJECT_DIR/share/ci/run_picongpu_tests.sh
+  tags:
+    - cuda
+    - x86_64
+.base_nvcc_cuda_9.2:
+  image: registry.gitlab.com/hzdr/crp/alpaka-group-container/alpaka-ci-cuda92-gccpic:1.2
+  extends: .base_nvcc
+
+.base_nvcc_cuda_10.0:
+  image: registry.gitlab.com/hzdr/crp/alpaka-group-container/alpaka-ci-cuda100-gccpic:1.2
+  extends: .base_nvcc
+
+.base_nvcc_cuda_10.1:
+  image: registry.gitlab.com/hzdr/crp/alpaka-group-container/alpaka-ci-cuda101-gccpic:1.2
+  extends: .base_nvcc
+
+.base_nvcc_cuda_10.2:
+  image: registry.gitlab.com/hzdr/crp/alpaka-group-container/alpaka-ci-cuda102-gccpic:1.2
+  extends: .base_nvcc
+
+.base_nvcc_cuda_11.0:
+  image: registry.gitlab.com/hzdr/crp/alpaka-group-container/alpaka-ci-cuda110-gccpic:1.2
+  extends: .base_nvcc
+
+.base_nvcc_cuda_11.1:
+  image: registry.gitlab.com/hzdr/crp/alpaka-group-container/alpaka-ci-cuda111-gccpic:1.2
+  extends: .base_nvcc
+
+.base_nvcc_cuda_11.2:
+  image: registry.gitlab.com/hzdr/crp/alpaka-group-container/alpaka-ci-cuda112-gccpic:1.2
+  extends: .base_nvcc
diff --git a/share/ci/generate_reduced_matrix.sh b/share/ci/generate_reduced_matrix.sh
new file mode 100755
index 0000000000..b43a7b8e4e
--- /dev/null
+++ b/share/ci/generate_reduced_matrix.sh
@@ -0,0 +1,42 @@
+#!/bin/bash
+
+set -e
+set -o pipefail
+
+# generate a reduced matrix with ci jobs based on the list (space separated) provided by the environment variable PIC_INPUTS
+
+export PATH=$CI_PROJECT_DIR/share/ci:$PATH
+export picongpu_DIR=$CI_PROJECT_DIR
+
+cd $picongpu_DIR/share/picongpu/
+
+echo "include:"
+echo "  - local: '/share/ci/compiler_clang.yml'"
+echo "  - local: '/share/ci/compiler_gcc.yml'"
+echo "  - local: '/share/ci/compiler_nvcc_cuda.yml'"
+echo "  - local: '/share/ci/compiler_clang_cuda.yml'"
+echo "  - local: '/share/ci/compiler_hipcc.yml'"
+echo ""
+
+# handle CI actions
+has_label=$($CI_PROJECT_DIR/share/ci/pr_has_label.sh "CI:no-compile" && echo "0" || echo "1")
+if [ "$has_label" == "0" ] ; then
+  echo "skip-compile:"
+  echo "  script:"
+  echo "    - echo \"CI action - 'CI:no-compile' -> skip compile/runtime tests\""
+  exit 0
+fi
+
+folders=()
+for CASE in ${PIC_INPUTS}; do  # unquoted on purpose: space separated list
+  if [ "$CASE" == "examples" ] || [ "$CASE" == "tests" ] || [ "$CASE" == "benchmarks" ] ; then
+      all_cases=$(find "${CASE}"/* -maxdepth 0 -type d)
+  else
+      all_cases=$(find "$CASE" -maxdepth 0 -type d)
+  fi
+  for test_case_folder in $all_cases ; do
+      folders+=("$test_case_folder")
+  done
+done
+# one folder per line on stdin; forward all command line arguments unchanged
+printf '%s\n' "${folders[@]}" | n_wise_generator.py "$@"
diff --git a/share/ci/git_merge.sh b/share/ci/git_merge.sh
new file mode 100755
index 0000000000..09fc014725
--- /dev/null
+++ b/share/ci/git_merge.sh
@@ -0,0 +1,47 @@
+#!/bin/bash
+
+set -e
+set -o pipefail
+
+# merge the PR to the latest version of the destination branch
+
+cd "$CI_PROJECT_DIR"
+
+is_pr=$(echo "$CI_BUILD_REF_NAME" | grep -q "^pr-" && echo 0 || echo 1)
+# merge only pull requests
+if [ "$is_pr" -eq 0 ] ; then
+  github_group_repo="ComputationalRadiationPhysics/picongpu"
+
+  pr_id=$(echo "$CI_BUILD_REF_NAME" | cut -d"/" -f1 | cut -d"-" -f2)
+  # used a token without any rights from psychocoderHPC to avoid API query limitations
+  curl_data=$(curl -u "psychocoderHPC:$GITHUB_TOKEN" -X GET "https://api.github.com/repos/${github_group_repo}/pulls/${pr_id}" 2>/dev/null)
+  echo "--- curl data ---"
+  echo "$curl_data"
+  echo "-----------------"
+  # get the destination branch
+  destination_branch=$(echo "$curl_data" | python3 -c 'import json,sys;obj=json.loads(sys.stdin.read());print(obj["base"]["ref"])')
+  destination_sha=$(echo "$curl_data" | python3 -c 'import json,sys;obj=json.loads(sys.stdin.read());print(obj["base"]["sha"])')
+  echo "destination_branch=${destination_branch}"
+  echo "destination_sha=${destination_sha}"
+
+  mainline_exists=$(git remote -v | cut -f1 | grep mainline -q && echo 0 || echo 1)
+  # avoid adding the remote repository twice if gitlab already cached this operation
+  if [ "$mainline_exists" -ne 0 ] ; then
+    git remote add mainline "https://github.com/${github_group_repo}.git"
+  else
+    # if the PR was set to a different branch before
+    git remote set-url mainline "https://github.com/${github_group_repo}.git"
+  fi
+  git fetch mainline
+
+  # git needs a committer identity to be able to create the merge commit
+  git config --global user.email "CI-BOT@hzdr.de"
+  git config --global user.name "CI-BOT"
+
+  # make a copy of the pull request branch (-B: reuse the name if a cached checkout still has it)
+  git checkout -B pr_to_merge
+  # switch to the destination hash
+  git checkout -B destination_branch "${destination_sha}"
+  # merge pull request to the destination
+  git merge --no-edit pr_to_merge
+fi
diff --git a/share/ci/n_wise_generator.py b/share/ci/n_wise_generator.py
new file mode 100755
index 0000000000..07a7be119f
--- /dev/null
+++ b/share/ci/n_wise_generator.py
@@ -0,0 +1,240 @@
+#!/usr/bin/env python3
+
+# generate a reduced test matrix based on the N-wise testing model
+# https://en.wikipedia.org/wiki/All-pairs_testing
+
+from allpairspy import AllPairs
+import argparse
+import sys
+
+
+parser = argparse.ArgumentParser(description='Generate testing pairs')
+parser.add_argument('-n', dest='n_pairs', default=1, type=int,
+                    help='number of tuple elements')
+parser.add_argument('--compact', dest='compact', action="store_true",
+                    help='print compact form of the test matrix')
+args = parser.parse_args()
+n_pairs = args.n_pairs  # already an int: argparse converts via type=int
+
+examples = []
+for i in sys.stdin:
+    examples.append(i.rstrip())
+
+
+def get_version(tuple):
+    if len(tuple) >= 2:
+        return float(tuple[1])
+    return 0
+
+
+# lookup table with compiler name and the required base container suffix
+image_dict = {
+    "g++": ".base_gcc",
+    "g++_nvcc": ".base_nvcc",
+    "clang++_nvcc": ".base_clangCuda",
+    "clang++": ".base_clang",
+    "clang++_clangCuda": ".base_clangCuda",
+    "clang++_hipcc": ".base_hipcc"
+}
+
+
+def get_base_image(compiler, backend):
+    lookup_name = compiler[0]
+    if len(compiler) == 3:
+        lookup_name += "_" + compiler[2]
+    img_name = image_dict[lookup_name]
+    if backend[0] == "cuda":
+        img_name += "_" + backend[0] + "_" + str(backend[1])
+
+    return img_name
+
+
+# filter invalid combinations
+#
+# filter based on the compatibility overview
+# https://gist.github.com/ax3l/9489132
+def is_valid_combination(row):
+    n = len(row)
+
+    if n >= 2:
+        v_compiler = get_version(row[0])
+
+        is_clang_cuda = True if len(row[0]) == 3 and \
+            row[0][2] == "clangCuda" else False
+        is_clang = True if row[0][0] == "clang++" or is_clang_cuda else False
+
+        is_gnu = True if row[0][0] == "g++" else False
+
+        is_nvcc = True if len(row[0]) == 3 and row[0][2] == "nvcc" else False
+        is_cuda = True if row[1][0] == "cuda" else False
+        v_cuda = get_version(row[1])
+
+        # hipcc
+        is_hipcc = True if len(row[0]) == 3 and row[0][2] == "hipcc" else False
+        is_hip = True if row[1][0] == "hip" else False
+
+        # CI nvcc image is not shipped with clang++
+        # clang_cuda images can currently not be used because
+        # the base image is setting -DALPAKA_CUDA_COMPILER=clang
+        if is_nvcc and is_clang:
+            return False
+
+        # hipcc is only valid in one combination
+        if is_hip and is_hipcc and is_clang and v_compiler == 12:
+            return True
+        elif is_hip or is_hipcc:
+            return False
+
+        # clang 12 is currently only shipped with the HIP container
+        if is_clang and v_compiler == 12:
+            return False
+
+        # docker images for clang cuda do not support clang++-7
+        # together with cuda-9.2
+        if is_clang_cuda and v_compiler == 7 and v_cuda == 9.2:
+            return False
+
+        # CUDA compiler requires backend `cuda`
+        if (is_nvcc or is_clang_cuda) and not is_cuda:
+            return False
+
+        # cpu only compiler can not handle the backend `cuda`
+        if (not is_nvcc and not is_clang_cuda) and is_cuda:
+            return False
+
+        # clang cuda compatibility
+        if is_clang_cuda:
+            if not is_cuda:
+                return False
+            if v_cuda == 9.2 and v_compiler >= 7:
+                return True
+            if v_cuda == 10.0 and v_compiler >= 8:
+                return True
+            if v_cuda == 10.1 and v_compiler >= 9:
+                return True
+
+            return False
+
+        # nvcc compatibility
+        if is_cuda and is_nvcc:
+            if is_gnu:
+                # g++-5.5 is not compatible with CUDA
+                # https://github.com/tensorflow/tensorflow/issues/10220
+                if v_compiler == 5:
+                    return False
+                if v_cuda <= 10.1 and v_compiler <= 7:
+                    return True
+                if v_cuda == 10.2 and v_compiler <= 8:
+                    return True
+                if v_cuda == 11.0 and v_compiler <= 9:
+                    return True
+                if v_cuda >= 11.1 and v_compiler <= 10:
+                    return True
+
+            if is_clang:
+                if v_cuda == 9.2 and v_compiler <= 5:
+                    return True
+                if 10.0 <= v_cuda and v_cuda <= 10.2 and v_compiler <= 8:
+                    return True
+                if v_cuda == 11.0 and v_compiler <= 9:
+                    return True
+                if v_cuda >= 11.1 and v_compiler <= 10:
+                    return True
+
+            return False
+
+    return True
+
+
+# compiler list
+# tuple with two components (compiler name, version)
+clang_compiers = [("clang++", 5.0), ("clang++", 6.0), ("clang++", 7),
+                  ("clang++", 8), ("clang++", 9), ("clang++", 10),
+                  ("clang++", 11), ("clang++", 12)]
+gnu_compilers = [("g++", 5), ("g++", 6), ("g++", 7), ("g++", 8),
+                 ("g++", 9), ("g++", 10)]
+compilers = [
+    clang_compiers,
+    gnu_compilers
+]
+
+# generate clang cuda compiler list
+# add third component with the device compiler name
+cuda_clang_compilers = []
+for i in clang_compiers:
+    cuda_clang_compilers.append(i + ("clangCuda", ))
+compilers.append(cuda_clang_compilers)
+
+# nvcc compiler
+cuda_nvcc_compilers = []
+for i in clang_compiers:
+    cuda_nvcc_compilers.append(i + ("nvcc", ))
+for i in gnu_compilers:
+    cuda_nvcc_compilers.append(i + ("nvcc", ))
+compilers.append(cuda_nvcc_compilers)
+
+# hipcc compiler
+hip_clang_compilers = []
+for i in clang_compiers:
+    hip_clang_compilers.append(i + ("hipcc", ))
+compilers.append(hip_clang_compilers)
+
+# PIConGPU backend list
+# tuple with two components (backend name, version)
+# version is only required for the cuda backend
+backends = [("cuda", 9.2),
+            ("cuda", 10.0), ("cuda", 10.1), ("cuda", 10.2),
+            ("cuda", 11.0), ("cuda", 11.1), ("cuda", 11.2),
+            ("omp2b", ), ("serial", ),
+            ("hip", )]
+boost_libs = ["1.65.1", "1.66.0", "1.67.0", "1.68.0", "1.69.0",
+              "1.70.0", "1.71.0", "1.72.0", "1.73.0", "1.74.0"]
+
+rounds = 1
+# activate looping over the compiler categories to minimize the test matrix
+# a small test matrix is created for each compiler category, i.e. clang,
+# g++, clangCuda, nvcc and hipcc
+if n_pairs == 1:
+    rounds = len(compilers)
+
+for i in range(rounds):
+    used_compilers = []
+    if n_pairs == 1:
+        used_compilers = compilers[i]
+    else:
+        for c in compilers:
+            used_compilers += c
+
+    parameters = [
+        used_compilers,
+        backends,
+        boost_libs,
+        examples
+    ]
+
+    for i, pairs in enumerate(
+            AllPairs(parameters,
+                     filter_func=is_valid_combination, n=n_pairs)):
+        if args.compact:
+            print("{:2d}: {}".format(i, pairs))
+        else:
+            compiler = pairs[0][0] + "-" + str(pairs[0][1])
+            backend = pairs[1][0]
+            boost_version = pairs[2]
+            folder = pairs[3]
+            v_cuda = get_version(pairs[1])
+            v_cuda_str = "" if v_cuda == 0 else str(v_cuda)
+            job_name = compiler + "_" + backend + v_cuda_str + "_boost" + \
+                boost_version + "_" + folder.replace("/", ".")
+            print(job_name + ":")
+            print("  variables:")
+            print("    PIC_TEST_CASE_FOLDER: '" + folder + "'")
+            print("    PIC_BACKEND: '" + backend + "'")
+            print("    BOOST_VERSION: '" + boost_version + "'")
+            print("    CXX_VERSION: '" + compiler + "'")
+            print("  before_script:")
+            print("    - apt-get update -qq")
+            print("    - apt-get install -y -qq libopenmpi-dev "
+                  "openmpi-bin openssh-server")
+            print("  extends: " + get_base_image(pairs[0], pairs[1]))
+            print("")
diff --git a/share/ci/pr_has_label.sh b/share/ci/pr_has_label.sh
new file mode 100755
index 0000000000..fb63e07200
--- /dev/null
+++ b/share/ci/pr_has_label.sh
@@ -0,0 +1,17 @@
+#!/bin/bash
+# exit with 0 if the pull request derived from CI_BUILD_REF_NAME carries the label given as first argument, else with 1
+set -e
+set -o pipefail
+
+github_group_repo="ComputationalRadiationPhysics/picongpu"
+
+pr_id=$(echo "$CI_BUILD_REF_NAME" | cut -d"/" -f1 | cut -d"-" -f2)
+# used a token without any rights from psychocoderHPC to avoid API query limitations
+curl_data=$(curl -u "psychocoderHPC:${GITHUB_TOKEN}" -X GET "https://api.github.com/repos/${github_group_repo}/pulls/${pr_id}" 2>/dev/null)
+# extract the label names of the pull request, one per line, so that 'grep -x -F' below matches a label exactly and literally
+all_labels=$(echo "$curl_data" | python3 -c 'import json,sys;obj=json.loads(sys.stdin.read());x = obj["labels"];labels = list(i["name"] for i in x); print("\n".join(labels))')
+echo "search for label: '$1'" >&2
+echo "labels: '${all_labels}'" >&2
+label_found=$(echo "$all_labels" | grep -qxF -- "$1" && echo 0 || echo 1)
+
+exit "$label_found"
diff --git a/share/ci/run_picongpu_tests.sh b/share/ci/run_picongpu_tests.sh
new file mode 100755
index 0000000000..49634567fb
--- /dev/null
+++ b/share/ci/run_picongpu_tests.sh
@@ -0,0 +1,98 @@
+#!/bin/bash
+
+set -e
+set -o pipefail
+
+# the default build type is Release
+# if necessary, you can rerun the pipeline with another build type -> https://docs.gitlab.com/ee/ci/pipelines.html#manually-executing-pipelines
+# to change the build type, you must set the environment variable PIC_BUILD_TYPE
+if [[ ! -v PIC_BUILD_TYPE ]] ; then
+    PIC_BUILD_TYPE=Release ;
+fi
+
+###################################################
+# cmake config builder
+###################################################
+
+PIC_CONST_ARGS=""
+# to save compile time reduce the isaac functor chain length to one
+PIC_CONST_ARGS="${PIC_CONST_ARGS} -DISAAC_MAX_FUNCTORS=1 -DCMAKE_BUILD_TYPE=${PIC_BUILD_TYPE}"
+CMAKE_ARGS="${PIC_CONST_ARGS} ${PIC_CMAKE_ARGS} -DCMAKE_CXX_COMPILER=${CXX_VERSION} -DBOOST_ROOT=/opt/boost/${BOOST_VERSION}"
+
+# workaround for clang cuda
+# HDF5 from the apt sources is pulling -D_FORTIFY_SOURCE=2 into the compile flags
+# this workaround is creating a warning about the double definition of _FORTIFY_SOURCE
+#
+# Workaround will be removed after the test container are shipped with a self compiled HDF5
+if [[ $CXX_VERSION =~ ^clang && $PIC_BACKEND =~ ^cuda ]] ; then
+    CMAKE_ARGS="$CMAKE_ARGS -DCMAKE_CXX_FLAGS=-D_FORTIFY_SOURCE=0"
+fi
+
+###################################################
+# build and run tests
+###################################################
+
+# use one build directory for all build configurations
+cd $HOME
+mkdir buildCI
+cd buildCI
+
+export picongpu_DIR=$CI_PROJECT_DIR
+export PATH=$picongpu_DIR/bin:$PATH
+
+# adjust number of parallel builds to avoid out of memory errors
+# PIC_BUILD_REQUIRED_MEM_BYTES is a configured variable in the CI web interface
+PIC_PARALLEL_BUILDS=$(($CI_RAM_BYTES_TOTAL/$PIC_BUILD_REQUIRED_MEM_BYTES))
+
+# limit to number of available cores
+if [ $PIC_PARALLEL_BUILDS -gt $CI_CPUS ] ; then
+    PIC_PARALLEL_BUILDS=$CI_CPUS
+fi
+
+# CI_MAX_PARALLELISM is a configured variable in the CI web interface
+if [ $PIC_PARALLEL_BUILDS -gt $CI_MAX_PARALLELISM ] ; then
+    PIC_PARALLEL_BUILDS=$CI_MAX_PARALLELISM
+fi
+echo -e "\033[0;32m///////////////////////////////////////////////////"
+echo "PIC_BUILD_REQUIRED_MEM_BYTES-> ${PIC_BUILD_REQUIRED_MEM_BYTES}"
+echo "CI_RAM_BYTES_TOTAL          -> ${CI_RAM_BYTES_TOTAL}"
+echo "CI_CPUS                     -> ${CI_CPUS}"
+echo "CI_MAX_PARALLELISM          -> ${CI_MAX_PARALLELISM}"
+echo "number of processor threads -> $(nproc)"
+echo "number of parallel builds   -> $PIC_PARALLEL_BUILDS"
+echo "cmake version               -> $(cmake --version | head -n 1)"
+echo "build directory             -> $(pwd)"
+echo "CMAKE_ARGS                  -> ${CMAKE_ARGS}"
+echo "accelerator                 -> ${PIC_BACKEND}"
+echo "input set                   -> ${PIC_TEST_CASE_FOLDER}"
+echo -e "/////////////////////////////////////////////////// \033[0m \n\n"
+
+if [ "$PIC_TEST_CASE_FOLDER" == "examples/" ] || [ "$PIC_TEST_CASE_FOLDER" == "tests/" ] ||  [ "$PIC_TEST_CASE_FOLDER" == "benchmarks/" ] ; then
+    extended_compile_options="-l"
+fi
+
+# test compiling
+error_code=$(pic-compile -q -c"$CMAKE_ARGS" $extended_compile_options -j $PIC_PARALLEL_BUILDS ${picongpu_DIR}/share/picongpu/$PIC_TEST_CASE_FOLDER  . 2>&1 > pic_compile.log && echo "0" || echo "1")
+cat pic_compile.log
+for test_case in $(ls -w1 ./build) ; do
+    if [ -f  "build/$test_case/returnCode" ] ; then
+        returnCode=$(cat "build/$test_case/returnCode")
+        if [ "$returnCode" != "0" ] ; then
+            echo -e "\033[0;31m compile FAILED - $test_case \033[0m"
+            cat "build/$test_case/compile.log"
+        else
+            echo -e "\033[0;32m compile PASSED - $test_case \033[0m"
+        fi
+    else
+        echo -e "\033[0;33m compile NOT tested - $test_case \033[0m"
+    fi
+done
+if [ "$error_code" != "0" ] ; then
+    exit 1
+fi
+# runtime test (call --help)
+for test_case_folder in $(ls params/*/* -d -w1) ; do
+    export LD_LIBRARY_PATH=/opt/boost/${BOOST_VERSION}/lib:$LD_LIBRARY_PATH
+    echo -e "\033[0;33m runtime test- $(basename $test_case_folder) \033[0m"
+    ${test_case_folder}/bin/picongpu --help
+done
diff --git a/share/ci/run_pmacc_tests.sh b/share/ci/run_pmacc_tests.sh
new file mode 100755
index 0000000000..a08b1acddd
--- /dev/null
+++ b/share/ci/run_pmacc_tests.sh
@@ -0,0 +1,111 @@
+#!/bin/bash
+
+set -e
+set -o pipefail
+
+# the default build type is Release
+# if necessary, you can rerun the pipeline with another build type -> https://docs.gitlab.com/ee/ci/pipelines.html#manually-executing-pipelines
+# to change the build type, you must set the environment variable PMACC_BUILD_TYPE
+if [[ ! -v PMACC_BUILD_TYPE ]] ; then
+    PMACC_BUILD_TYPE=Release;
+fi
+
+###################################################
+# cmake config builder
+###################################################
+
+PMACC_CONST_ARGS=""
+# forward the selected build type to CMake
+PMACC_CONST_ARGS="${PMACC_CONST_ARGS} -DCMAKE_BUILD_TYPE=${PMACC_BUILD_TYPE}"
+CMAKE_ARGS="${PMACC_CONST_ARGS} ${PIC_CMAKE_ARGS} -DCMAKE_CXX_COMPILER=${CXX_VERSION} -DBOOST_ROOT=/opt/boost/${BOOST_VERSION}"
+# allow root user to execute MPI
+CMAKE_ARGS="$CMAKE_ARGS -DUSE_MPI_AS_ROOT_USER=ON"
+
+###################################################
+# translate PIConGPU backend names into CMake Flags
+###################################################
+
+get_backend_flags()
+{
+    backend_cfg=(${1//:/ })
+    num_options="${#backend_cfg[@]}"
+    if [ $num_options -gt 2 ] ; then
+        echo "-b|--backend must be contain 'backend:arch' or 'backend'" >&2
+        exit 1
+    fi
+    if [ "${backend_cfg[0]}" == "cuda" ] ; then
+        result+=" -DALPAKA_ACC_GPU_CUDA_ENABLE=ON -DALPAKA_ACC_GPU_CUDA_ONLY_MODE=ON"
+        if [ $num_options -eq 2 ] ; then
+            result+=" -DALPAKA_CUDA_ARCH=\"${backend_cfg[1]}\""
+        fi
+    elif [ "${backend_cfg[0]}" == "omp2b" ] ; then
+        result+=" -DALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE=ON"
+        if [ $num_options -eq 2 ] ; then
+            result+=" -DPMACC_CPU_ARCH=\"${backend_cfg[1]}\""
+        fi
+    elif [ "${backend_cfg[0]}" == "serial" ] ; then
+        result+=" -DALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLE=ON"
+        if [ $num_options -eq 2 ] ; then
+            result+=" -DPMACC_CPU_ARCH=\"${backend_cfg[1]}\""
+        fi
+    elif [ "${backend_cfg[0]}" == "tbb" ] ; then
+        result+=" -DALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLE=ON"
+        if [ $num_options -eq 2 ] ; then
+            result+=" -DPMACC_CPU_ARCH=\"${backend_cfg[1]}\""
+        fi
+    elif [ "${backend_cfg[0]}" == "threads" ] ; then
+        result+=" -DALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLE=ON"
+        if [ $num_options -eq 2 ] ; then
+            result+=" -DPMACC_CPU_ARCH=\"${backend_cfg[1]}\""
+        fi
+    elif [ "${backend_cfg[0]}" == "hip" ] ; then
+        result+=" -DALPAKA_ACC_GPU_HIP_ENABLE=ON -DALPAKA_ACC_GPU_HIP_ONLY_MODE=ON"
+        if [ $num_options -eq 2 ] ; then
+            result+=" -DPMACC_CPU_ARCH=\"${backend_cfg[1]}\""
+        fi
+    else
+        echo "unsupported backend given '$1'" >&2
+        exit 1
+    fi
+
+    echo "$result"
+    exit 0
+}
+
+###################################################
+# build and run tests
+###################################################
+
+# use one build directory for all build configurations
+cd $HOME
+mkdir buildPMaccCI
+cd buildPMaccCI
+
+export code_DIR=$CI_PROJECT_DIR
+
+PMACC_PARALLEL_BUILDS=$(nproc)
+# limit to $CI_MAX_PARALLELISM parallel builds to avoid out of memory errors
+# CI_MAX_PARALLELISM is a configured variable in the CI web interface
+if [ $PMACC_PARALLEL_BUILDS -gt $CI_MAX_PARALLELISM ] ; then
+    PMACC_PARALLEL_BUILDS=$CI_MAX_PARALLELISM
+fi
+alpaka_backend=$(get_backend_flags ${PIC_BACKEND})
+CMAKE_ARGS="$CMAKE_ARGS $alpaka_backend"
+
+echo -e "\033[0;32m///////////////////////////////////////////////////"
+echo "number of processor threads -> $(nproc)"
+echo "number of parallel builds -> $PMACC_PARALLEL_BUILDS"
+echo "cmake version   -> $(cmake --version | head -n 1)"
+echo "build directory -> $(pwd)"
+echo "CMAKE_ARGS      -> ${CMAKE_ARGS}"
+echo "accelerator     -> ${PIC_BACKEND}"
+echo -e "/////////////////////////////////////////////////// \033[0m \n\n"
+
+# disable warning if infiniband is not used
+export OMPI_MCA_btl_base_warn_component_unused=0
+export LD_LIBRARY_PATH=/opt/boost/${BOOST_VERSION}/lib:$LD_LIBRARY_PATH
+
+cmake $CMAKE_ARGS $code_DIR/include/pmacc
+make
+
+ctest -V
diff --git a/share/paraview/hypnos.pvsc b/share/paraview/hypnos.pvsc
deleted file mode 100644
index b1352fa9e0..0000000000
--- a/share/paraview/hypnos.pvsc
+++ /dev/null
@@ -1,98 +0,0 @@
-<Servers>
-  <Server name="hypnos2 laser" configuration="" resource="csrc://localhost:11111">
-    <CommandStartup>
-      <Options>
-        <Option name="SSH_USER" label="SSH Username" save="true">
-          <String default="huebl" />
-        </Option>
-        <Option name="LOGIN_SERVER" label="Login Server" save="true">
-          <String default="uts.fz-rossendorf.de" />
-        </Option>
-        <Option name="HEAD_NODE" label="Head Node" save="true">
-          <String default="hypnos2" />
-        </Option>
-        <Option name="NUM_NODES" label="# of laser nodes" save="true">
-          <Range type="int" min="1" max="88" step="1" default="8" />
-        </Option>
-        <Option name="NUM_PPN" label="# of procs per node" save="true">
-          <Range type="int" min="1" max="64" step="1" default="8" />
-        </Option>
-        <Option name="NUM_PORT" label="port on head node" save="true">
-          <Range type="int" min="10000" max="20000" step="1" default="11111" />
-        </Option>
-        <Option name="T_WALLTIME" label="Wall time" save="true">
-          <String default="01:00:00" />
-        </Option>
-      </Options>
-      <Command exec="bash" timeout="0" delay="0">
-        <Arguments>
-          <Argument value="-c"/>
-          <Argument value="eval echo -e '\#PBS -q laser\\n
-\#PBS -l walltime=T_WALLTIME\\n
-\#PBS -N pvserver\\n
-\#PBS -l nodes=NUM_NODES:ppn=NUM_PPN\\n
-\#PBS -W x=NACCESSPOLICY:SINGLETASK\\n
-\#PBS -d .\\n
-\#PBS -o stdout\\n
-\#PBS -e stderr\\n
-cd .\\n
-. /etc/profile.modules\\n
-module load devel/python/2.7.5\\n
-module load numlib/icet/2.1.1\\n
-module load compiler/gnu/64/opt/4.8.2\\n
-module load mpi/openmpi/1.7.4\\n
-module load tools/infiniband/1.0.0\\n
-module load tools/mesa/7.8\\n
-module load analysis/paraview/3.98.laser\\n
-which pvserver\\n
-echo starting\\n
-mpiexec -npernode NUM_PPN -n \`expr NUM_NODES \\* NUM_PPN\`
-    \`which pvserver\` --use-offscreen-rendering -sp=NUM_PORT -rc -ch=HEAD_NODE\\n
-\# some interesting flags one can use:\\n
-\#   --mca mpi_yield_when_idle 1\\n
-\#       reduces load while idle - no busy loop\\n
-\#       http://www.open-mpi.org/faq/?category=running#force-aggressive-degraded\\n
-\#   -am \~/openib.conf\\n
-\#       in case you send HUGE data chunks over infiniband
-' > ~/.startParaView;
-echo 'Replacing space holders in job script';
-sed -i 's/NUM_NODES/$NUM_NODES$/g' ~/.startParaView;
-sed -i 's/NUM_PPN/$NUM_PPN$/g' ~/.startParaView;
-sed -i 's/T_WALLTIME/$T_WALLTIME$/g' ~/.startParaView;
-sed -i 's/HEAD_NODE/$HEAD_NODE$/g' ~/.startParaView;
-sed -i 's/NUM_PORT/$NUM_PORT$/g' ~/.startParaView;
-echo 'Checking tunnel to SSH port of $HEAD_NODE$';
-ssh -p 44334 $SSH_USER$@localhost 'exit 0' 2>/dev/null 1>/dev/null;
-if [ $? -ne 0 ] ; then
-  echo 'opening ssh tunnel to $HEAD_NODE$';
-  ssh -f -L 44334:hypnos2:22 $SSH_USER$@uts.fz-rossendorf.de -N;
-  sleep 1;
-  echo 'Permanent proxy to ssh port of $HEAD_NODE$ established';
-fi;
-echo 'Copy job script to $HEAD_NODE$';
-cat ~/.startParaView | ssh -p 44334 $SSH_USER$@localhost 'cat > ~/startParaView; /opt/torque/bin/qsub ~/startParaView';
-echo 'Check if old reverse connection tunnels still persist';
-if [ -s ~/.paraviewLastTunnel ] ; then
-  COMMENT='Check if the last recorded tunnel corresponds to a still active ssh process';
-  pidof ssh | grep -oE `cat ~/.paraviewLastTunnel | sed s/\ /\|/g` > ~/.paraviewLastTunnel;
-  if [ -s ~/.paraviewLastTunnel ] ; then
-    COMMENT='If the last recorded tunnel is still active kill it';
-    kill `cat ~/.paraviewLastTunnel`;
-  fi;
-fi;
-echo 'Open new reverse connection tunnel for the current session';
-COMMENT='Already running SSH connections are CLEAN and should not be killed';
-COMMENT='We store them in a temp file as an exclude list';
-pidof ssh | sed 's/ /,/g' > ~/.paraviewOtherSSH;
-ssh -f -p 44334 -R 10.0.2.253:$NUM_PORT$:localhost:11111 $SSH_USER$@localhost -N;
-sleep 1;
-echo -n 'Started session tunnel with process id ';
-COMMENT='Record new ssh process PID and remove temp file with PID excludes';
-pidof -o `cat ~/.paraviewOtherSSH` ssh | tee ~/.paraviewLastTunnel;
-rm ~/.paraviewOtherSSH;
-"/>
-        </Arguments>
-      </Command>
-    </CommandStartup>
-  </Server>
-</Servers>
diff --git a/share/picongpu/benchmarks/SPEC/etc/picongpu/1.cfg b/share/picongpu/benchmarks/SPEC/etc/picongpu/1.cfg
new file mode 100644
index 0000000000..96d50f11f9
--- /dev/null
+++ b/share/picongpu/benchmarks/SPEC/etc/picongpu/1.cfg
@@ -0,0 +1,72 @@
+# Copyright 2013-2021 Rene Widera, Axel Huebl
+#
+# This file is part of PIConGPU.
+#
+# PIConGPU is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# PIConGPU is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with PIConGPU.
+# If not, see <http://www.gnu.org/licenses/>.
+#
+
+##
+## This configuration file is used by PIConGPU's TBG tool to create a
+## batch script for PIConGPU runs. For a detailed description of PIConGPU
+## configuration files including all available variables, see
+##
+##                      docs/TBG_macros.cfg
+##
+
+
+#################################
+## Section: Required Variables ##
+#################################
+
+TBG_wallTime="02:00:00"
+
+TBG_devices_x=1
+TBG_devices_y=1
+TBG_devices_z=1
+
+TBG_gridSize="128 128 128"
+TBG_steps="1000"
+
+TBG_periodic="--periodic 1 1 1"
+
+
+#################################
+## Section: Optional Variables ##
+#################################
+
+TBG_plugins=" --p_macroParticlesCount.period 100          \
+              --e_macroParticlesCount.period 100          \
+              --fields_energy.period 100                  \
+              --e_energy.period 100 --e_energy.filter all \
+              --p_energy.period 100 --p_energy.filter all"
+
+
+#################################
+## Section: Program Parameters ##
+#################################
+
+TBG_deviceDist="!TBG_devices_x !TBG_devices_y !TBG_devices_z"
+
+TBG_programParams="-d !TBG_deviceDist \
+                   -g !TBG_gridSize   \
+                   -s !TBG_steps      \
+                   !TBG_periodic      \
+                   !TBG_plugins       \
+                   --versionOnce"
+
+# TOTAL number of devices
+TBG_tasks="$(( TBG_devices_x * TBG_devices_y * TBG_devices_z ))"
+
+"$TBG_cfgPath"/submitAction.sh
diff --git a/share/picongpu/benchmarks/SPEC/include/picongpu/param/density.param b/share/picongpu/benchmarks/SPEC/include/picongpu/param/density.param
new file mode 100644
index 0000000000..5ab7ed52c3
--- /dev/null
+++ b/share/picongpu/benchmarks/SPEC/include/picongpu/param/density.param
@@ -0,0 +1,46 @@
+/* Copyright 2013-2021 Axel Huebl, Heiko Burau, Rene Widera, Felix Schmitt,
+ *                     Richard Pausch
+ *
+ * This file is part of PIConGPU.
+ *
+ * PIConGPU is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * PIConGPU is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with PIConGPU.
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "picongpu/particles/densityProfiles/profiles.def"
+
+
+namespace picongpu
+{
+    namespace SI
+    {
+        /** Base density in particles per m^3 in the density profiles.
+         *
+         * This is often taken as reference maximum density in normalized profiles.
+         * Individual particle species can define a `densityRatio` flag relative
+         * to this value.
+         *
+         * unit: ELEMENTS/m^3
+         */
+        constexpr float_64 BASE_DENSITY_SI = 1.e25;
+    } // namespace SI
+
+    namespace densityProfiles
+    {
+        /* definition of homogenous density profile */
+        using Homogenous = HomogenousImpl;
+    } // namespace densityProfiles
+} // namespace picongpu
diff --git a/share/picongpu/benchmarks/SPEC/include/picongpu/param/fileOutput.param b/share/picongpu/benchmarks/SPEC/include/picongpu/param/fileOutput.param
new file mode 100644
index 0000000000..3b81ce69e2
--- /dev/null
+++ b/share/picongpu/benchmarks/SPEC/include/picongpu/param/fileOutput.param
@@ -0,0 +1,50 @@
+/* Copyright 2013-2021 Axel Huebl, Rene Widera, Felix Schmitt,
+ *                     Benjamin Worpitz, Richard Pausch
+ *
+ * This file is part of PIConGPU.
+ *
+ * PIConGPU is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * PIConGPU is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with PIConGPU.
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include <pmacc/meta/conversion/MakeSeq.hpp>
+
+/* some forward declarations we need */
+#include "picongpu/fields/Fields.def"
+#include "picongpu/particles/particleToGrid/ComputeGridValuePerFrame.def"
+
+#include <boost/mpl/vector.hpp>
+
+
+namespace picongpu
+{
+    /** FieldTmpSolvers groups all solvers that create data for FieldTmp ******
+     *
+     * FieldTmpSolvers is used in @see FieldTmp to calculate the exchange size
+     */
+    using FieldTmpSolvers = MakeSeq_t<>;
+
+    /** FileOutputFields: Groups all Fields that shall be dumped *************/
+    using FileOutputFields = MakeSeq_t<>;
+
+    /** FileOutputParticles: Groups all Species that shall be dumped **********
+     *
+     * hint: to disable particle output set to
+     *   using FileOutputParticles = MakeSeq_t< >;
+     */
+    using FileOutputParticles = MakeSeq_t<>;
+
+} // namespace picongpu
diff --git a/share/picongpu/benchmarks/SPEC/include/picongpu/param/grid.param b/share/picongpu/benchmarks/SPEC/include/picongpu/param/grid.param
new file mode 100644
index 0000000000..df502296f5
--- /dev/null
+++ b/share/picongpu/benchmarks/SPEC/include/picongpu/param/grid.param
@@ -0,0 +1,81 @@
+/* Copyright 2013-2021 Axel Huebl, Rene Widera, Benjamin Worpitz
+ *
+ * This file is part of PIConGPU.
+ *
+ * PIConGPU is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * PIConGPU is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with PIConGPU.
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+
+#pragma once
+
+namespace picongpu
+{
+    namespace SI
+    {
+        /** Duration of one timestep
+         *  unit: seconds */
+        constexpr float_64 DELTA_T_SI = 3.0e-17;
+
+        /** equals X
+         *  unit: meter */
+        constexpr float_64 CELL_WIDTH_SI = 1.8e-8;
+        /** equals Y
+         *  unit: meter */
+        constexpr float_64 CELL_HEIGHT_SI = 1.8e-8;
+        /** equals Z
+         *  unit: meter */
+        constexpr float_64 CELL_DEPTH_SI = 1.8e-8;
+
+        /** Note on units in reduced dimensions
+         *
+         * In 2D3V simulations, the CELL_DEPTH_SI (Z) cell length
+         * is still used for normalization of densities, etc.
+         *
+         * A 2D3V simulation in a cartesian PIC simulation such as
+         * ours only changes the degrees of freedom in motion for
+         * (macro) particles and all (field) information in z
+         * travels instantaneous, making the 2D3V simulation
+         * behave like the interaction of infinite "wire particles"
+         * in fields with perfect symmetry in Z.
+         */
+    } // namespace SI
+
+    //! Defines the size of the absorbing zone (in cells)
+    constexpr uint32_t ABSORBER_CELLS[3][2] = {
+        {0, 0}, /*x direction [negative,positive]*/
+        {0, 0}, /*y direction [negative,positive]*/
+        {0, 0} /*z direction [negative,positive]*/
+    }; // unit: number of cells
+
+    //! Define the strength of the absorber for any direction
+    constexpr float_X ABSORBER_STRENGTH[3][2] = {
+        {1.0e-3, 1.0e-3}, /*x direction [negative,positive]*/
+        {1.0e-3, 1.0e-3}, /*y direction [negative,positive]*/
+        {1.0e-3, 1.0e-3} /*z direction [negative,positive]*/
+    }; // unit: none
+
+    /** When to move the co-moving window.
+     *  An initial pseudo particle, flying with the speed of light,
+     *  is fired at the beginning of the simulation.
+     *  When it reaches movePoint % of the absolute(*) simulation area,
+     *  the co-moving window starts to move with the speed of light.
+     *
+     *  (*) Note: beware, that there is one "hidden" row of gpus at the y-front,
+     *            when you use the co-moving window
+     *  0.75 means only 75% of simulation area is used for real simulation
+     */
+    constexpr float_64 movePoint = 0.90;
+
+} // namespace picongpu
diff --git a/share/picongpu/benchmarks/SPEC/include/picongpu/param/isaac.param b/share/picongpu/benchmarks/SPEC/include/picongpu/param/isaac.param
new file mode 100644
index 0000000000..135d13420b
--- /dev/null
+++ b/share/picongpu/benchmarks/SPEC/include/picongpu/param/isaac.param
@@ -0,0 +1,64 @@
+/* Copyright 2016-2021 Alexander Matthes
+ *
+ * This file is part of PIConGPU.
+ *
+ * PIConGPU is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * PIConGPU is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with PIConGPU.
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/** @file
+ *
+ * Definition which native fields and density fields of particles will be
+ * visualizable with ISAAC. ISAAC is an in-situ visualization library with which
+ * the PIC simulation can be observed while it is running avoiding the time
+ * consuming writing and reading of simulation data for the classical post
+ * processing of data.
+ *
+ * ISAAC can directly visualize native fields like the E or B field, but
+ * density fields of particles need to be calculated from PIConGPU on the fly
+ * which slightly increases the runtime and the memory consumption. Every
+ * particle density field will reduce the amount of memory left for PIConGPUs
+ * particles and fields.
+ *
+ * To get best performance, ISAAC defines an exponential amount of different
+ * visualization kernels for every combination of (at runtime) activated
+ * fields. So furthermore a lot of fields will increase the compilation time.
+ *
+ */
+
+#pragma once
+
+namespace picongpu
+{
+    namespace isaacP
+    {
+        /** Intermediate list of native particle species of PIConGPU which shall be
+         *  visualized. */
+        using Particle_Seq = MakeSeq_t<>;
+
+        /** Intermediate list of native fields of PIConGPU which shall be
+         *  visualized. */
+        using Native_Seq = MakeSeq_t<>;
+
+        /** Intermediate list of particle species, from which density fields
+         *  shall be created at runtime to visualize them. */
+        using Density_Seq = MakeSeq_t<>;
+
+        /** Compile time sequence of all fields which shall be visualized. Basically
+         *  the join of Native_Seq and Density_Seq. */
+        using Fields_Seq = MakeSeq_t<>;
+
+
+    } // namespace isaacP
+} // namespace picongpu
diff --git a/share/picongpu/benchmarks/SPEC/include/picongpu/param/memory.param b/share/picongpu/benchmarks/SPEC/include/picongpu/param/memory.param
new file mode 100644
index 0000000000..17fdfd2aa9
--- /dev/null
+++ b/share/picongpu/benchmarks/SPEC/include/picongpu/param/memory.param
@@ -0,0 +1,115 @@
+/* Copyright 2013-2021 Axel Huebl, Rene Widera, Benjamin Worpitz
+ *
+ * This file is part of PIConGPU.
+ *
+ * PIConGPU is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * PIConGPU is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with PIConGPU.
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/** @file
+ *
+ * Define low-level memory settings for compute devices.
+ *
+ * Settings for memory layout for supercells and particle frame-lists,
+ * data exchanges in multi-device domain-decomposition and reserved
+ * fields for temporarily derived quantities are defined here.
+ */
+
+#pragma once
+
+#include <pmacc/math/Vector.hpp>
+#include <pmacc/mappings/kernel/MappingDescription.hpp>
+
+namespace picongpu
+{
+    /* We have to hold back 400MiB for gpu-internal operations:
+     *   - random number generator
+     *   - reduces
+     *   - ...
+     */
+    constexpr size_t reservedGpuMemorySize = 400 * 1024 * 1024;
+
+    /* short namespace*/
+    namespace mCT = pmacc::math::CT;
+    /** size of a superCell
+     *
+     * volume of a superCell must be <= 1024
+     */
+    using SuperCellSize = typename mCT::shrinkTo<mCT::Int<8, 8, 4>, simDim>::type;
+
+    /** define the object for mapping superCells to cells*/
+    using MappingDesc = MappingDescription<simDim, SuperCellSize>;
+
+    /** define the size of the core, border and guard area
+     *
+     * PIConGPU uses spatial domain-decomposition for parallelization
+     * over multiple devices with non-shared memory architecture.
+     * The global spatial domain is organized per device in three
+     * sections: the GUARD area contains copies of neighboring
+     * devices (also known as "halo"/"ghost").
+     * The BORDER area is the outermost layer of cells of a device,
+     * equally to what neighboring devices see as GUARD area.
+     * The CORE area is the innermost area of a device. In union with
+     * the BORDER area it defines the "active" spatial domain on a device.
+     *
+     * GuardSize is defined in units of SuperCellSize per dimension.
+     */
+    using GuardSize = typename mCT::shrinkTo<mCT::Int<1, 1, 1>, simDim>::type;
+
+    /** bytes reserved for species exchange buffer
+     *
+     * This is the default configuration for species exchanges buffer sizes.
+     * The default exchange buffer sizes can be changed per species by adding
+     * the alias exchangeMemCfg with similar members like in DefaultExchangeMemCfg
+     * to its flag list.
+     */
+    struct DefaultExchangeMemCfg
+    {
+        // memory used for a direction
+        static constexpr uint32_t BYTES_EXCHANGE_X = 1 * 1024 * 1024; // 1 MiB
+        static constexpr uint32_t BYTES_EXCHANGE_Y = 1 * 1024 * 1024; // 1 MiB
+        static constexpr uint32_t BYTES_EXCHANGE_Z = 6 * 1024 * 1024; // 6 MiB
+        static constexpr uint32_t BYTES_EDGES = 512 * 1024; // 512 kiB
+        static constexpr uint32_t BYTES_CORNER = 256 * 1024; // 256 kiB
+
+        /** Reference local domain size
+         *
+         * The size of the local domain for which the exchange sizes `BYTES_*` are configured for.
+         * The required size of each exchange will be calculated at runtime based on the local domain size and the
+         * reference size. The exchange size will be scaled only up and not down. Zero means that there is no reference
+         * domain size, exchanges will not be scaled.
+         */
+        using REF_LOCAL_DOM_SIZE = mCT::Int<128, 128, 128>;
+        /** Scaling rate per direction.
+         *
+         * 1.0 means it scales linear with the ratio between the local domain size at runtime and the reference local
+         * domain size.
+         */
+        const std::array<float_X, DIM3> DIR_SCALING_FACTOR = {0.5, 0.5, 1.0};
+    };
+
+    /** number of scalar fields that are reserved as temporary fields */
+    constexpr uint32_t fieldTmpNumSlots = 1;
+
+    /** can `FieldTmp` gather neighbor information
+     *
+     * If `true` it is possible to call the method `asyncCommunicationGather()`
+     * to copy data from the border of neighboring GPU into the local guard.
+     * This is also known as building up a "ghost" or "halo" region in domain
+     * decomposition and only necessary for specific algorithms that extend
+     * the basic PIC cycle, e.g. with dependence on derived density or energy fields.
+     */
+    constexpr bool fieldTmpSupportGatherCommunication = false;
+
+} // namespace picongpu
diff --git a/share/picongpu/benchmarks/SPEC/include/picongpu/param/particle.param b/share/picongpu/benchmarks/SPEC/include/picongpu/param/particle.param
new file mode 100644
index 0000000000..d778bd4dc7
--- /dev/null
+++ b/share/picongpu/benchmarks/SPEC/include/picongpu/param/particle.param
@@ -0,0 +1,101 @@
+/* Copyright 2013-2021 Axel Huebl, Rene Widera, Benjamin Worpitz,
+ *                     Richard Pausch
+ *
+ * This file is part of PIConGPU.
+ *
+ * PIConGPU is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * PIConGPU is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with PIConGPU.
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "picongpu/particles/startPosition/functors.def"
+#include "picongpu/particles/manipulators/manipulators.def"
+
+#include <pmacc/nvidia/functors/Add.hpp>
+#include <pmacc/nvidia/functors/Assign.hpp>
+
+
+namespace picongpu
+{
+    namespace particles
+    {
+        /** a particle with a weighting below MIN_WEIGHTING will not
+         *      be created / will be deleted
+         *  unit: none
+         */
+        constexpr float_X MIN_WEIGHTING = 1.0;
+
+        namespace manipulators
+        {
+            CONST_VECTOR(float_X, 3, DriftParamElectrons_direction, 0.0, 0.0, 1.0);
+            struct DriftParamElectrons
+            {
+                /** Initial particle drift velocity
+                 *  unit: none
+                 */
+                static constexpr float_64 gamma = 5.0;
+                const DriftParamElectrons_direction_t direction;
+            };
+            using AssignZDriftElectrons = unary::Drift<DriftParamElectrons, nvidia::functors::Assign>;
+
+            CONST_VECTOR(float_X, 3, DriftParamPositrons_direction, 0.0, 0.0, -1.0);
+            struct DriftParamPositrons
+            {
+                /** Initial particle drift velocity
+                 *  unit: none
+                 */
+                static constexpr float_64 gamma = 5.0;
+                const DriftParamPositrons_direction_t direction;
+            };
+            // drift manipulator: assigns the drift described by DriftParamPositrons
+            using AssignZDriftPositrons = unary::Drift<DriftParamPositrons, nvidia::functors::Assign>;
+
+        } // namespace manipulators
+
+        namespace startPosition
+        {
+            struct QuietParamElectrons
+            {
+                /** Count of particles per cell per direction at initial state
+                 *  unit: none
+                 */
+                using numParticlesPerDimension = mCT::shrinkTo<mCT::Int<1, 2, 4>, simDim>::type;
+            };
+
+            // definition of quiet particle start
+            using QuietElectrons = QuietImpl<QuietParamElectrons>;
+
+            struct QuietParamPositrons
+            {
+                /** Count of particles per cell per direction at initial state
+                 *  unit: none
+                 */
+                using numParticlesPerDimension = mCT::shrinkTo<mCT::Int<4, 1, 2>, simDim>::type;
+            };
+
+            // definition of quiet particle start
+            using QuietPositrons = QuietImpl<QuietParamPositrons>;
+
+        } // namespace startPosition
+
+        /** During unit normalization, we assume this is a typical
+         *  number of particles per cell for normalization of weighted
+         *  particle attributes.
+         */
+        constexpr uint32_t TYPICAL_PARTICLES_PER_CELL
+            = mCT::volume<startPosition::QuietParamElectrons::numParticlesPerDimension>::type::value;
+
+    } // namespace particles
+} // namespace picongpu
diff --git a/share/picongpu/benchmarks/SPEC/include/picongpu/param/species.param b/share/picongpu/benchmarks/SPEC/include/picongpu/param/species.param
new file mode 100644
index 0000000000..f6b68bad57
--- /dev/null
+++ b/share/picongpu/benchmarks/SPEC/include/picongpu/param/species.param
@@ -0,0 +1,106 @@
+/* Copyright 2014-2021 Rene Widera, Richard Pausch, Annegret Roeszler, Klaus Steiniger
+ *
+ * This file is part of PIConGPU.
+ *
+ * PIConGPU is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * PIConGPU is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with PIConGPU.
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/** @file
+ *
+ * Particle shape, field to particle interpolation, current solver, and particle pusher
+ * can be declared here for usage in `speciesDefinition.param`.
+ *
+ * @see
+ *   **MODELS / Hierarchy of Charge Assignment Schemes**
+ *   in the online documentation for information on particle shapes.
+ *
+ *
+ * \attention
+ * The higher order shape names are redefined with release 0.6.0 in order to provide a consistent naming:
+ *     * PQS is the name of the 3rd order assignment function (instead of PCS)
+ *     * PCS is the name of the 4th order assignment function (instead of P4S)
+ *     * P4S does not exist anymore
+ */
+
+#pragma once
+
+#include "picongpu/particles/shapes.hpp"
+#include "picongpu/algorithms/FieldToParticleInterpolationNative.hpp"
+#include "picongpu/algorithms/FieldToParticleInterpolation.hpp"
+#include "picongpu/algorithms/AssignedTrilinearInterpolation.hpp"
+#include "picongpu/particles/flylite/NonLTE.def"
+#include "picongpu/fields/currentDeposition/Solver.def"
+
+
+namespace picongpu
+{
+    /** select macroparticle shape
+     *
+     * **WARNING** the shape names are redefined and diverge from PIConGPU versions before 0.6.0.
+     *
+     *  - particles::shapes::CIC : Assignment function is a piecewise linear spline
+     *  - particles::shapes::TSC : Assignment function is a piecewise quadratic spline
+     *  - particles::shapes::PQS : Assignment function is a piecewise cubic spline
+     *  - particles::shapes::PCS : Assignment function is a piecewise quartic spline
+     */
+    using UsedParticleShape = particles::shapes::PQS;
+
+    /** select interpolation method to be used for interpolation of grid-based field values to particle positions
+     */
+    using UsedField2Particle = FieldToParticleInterpolation<UsedParticleShape, AssignedTrilinearInterpolation>;
+
+    /*! select current solver method
+     * - currentSolver::Esirkepov< SHAPE, STRATEGY > : particle shapes - CIC, TSC, PQS, PCS (1st to 4th order)
+     * - currentSolver::VillaBune< SHAPE, STRATEGY > : particle shapes - CIC (1st order) only
+     * - currentSolver::EmZ< SHAPE, STRATEGY >       : particle shapes - CIC, TSC, PQS, PCS (1st to 4th order)
+     *
+     * For development purposes:
+     * - currentSolver::EsirkepovNative< SHAPE, STRATEGY > : generic version of currentSolverEsirkepov
+     *   without optimization (~4x slower and needs more shared memory)
+     *
+     * STRATEGY (optional):
+     * - currentSolver::strategy::StridedCachedSupercells
+     * - currentSolver::strategy::StridedCachedSupercellsScaled<N> with N >= 1
+     * - currentSolver::strategy::CachedSupercells
+     * - currentSolver::strategy::CachedSupercellsScaled<N> with N >= 1
+     * - currentSolver::strategy::NonCachedSupercells
+     * - currentSolver::strategy::NonCachedSupercellsScaled<N> with N >= 1
+     */
+    using UsedParticleCurrentSolver = currentSolver::EmZ<UsedParticleShape>;
+
+    /** particle pusher configuration
+     *
+     * Defining a pusher is optional for particles
+     *
+     * - particles::pusher::HigueraCary : Higuera & Cary's relativistic pusher preserving both volume and ExB velocity
+     * - particles::pusher::Vay : Vay's relativistic pusher preserving ExB velocity
+     * - particles::pusher::Boris : Boris' relativistic pusher preserving volume
+     * - particles::pusher::ReducedLandauLifshitz : 4th order RungeKutta pusher
+     *                                              with classical radiation reaction
+     * - particles::pusher::Composite : composite of two given pushers,
+     *                                  switches between using one (or none) of those
+     *
+     * For diagnostics & modeling: ------------------------------------------------
+     * - particles::pusher::Acceleration : Accelerate particles by applying a constant electric field
+     * - particles::pusher::Free : free propagation, ignore fields
+     *                             (= free stream model)
+     * - particles::pusher::Photon : propagate with c in direction of normalized mom.
+     * - particles::pusher::Probe : Probe particles that interpolate E & B
+     * For development purposes: --------------------------------------------------
+     * - particles::pusher::Axel : a pusher developed at HZDR during 2011 (testing)
+     */
+    using UsedParticlePusher = particles::pusher::Boris;
+
+} // namespace picongpu
diff --git a/share/picongpu/benchmarks/SPEC/include/picongpu/param/speciesDefinition.param b/share/picongpu/benchmarks/SPEC/include/picongpu/param/speciesDefinition.param
new file mode 100644
index 0000000000..ff84524b3d
--- /dev/null
+++ b/share/picongpu/benchmarks/SPEC/include/picongpu/param/speciesDefinition.param
@@ -0,0 +1,86 @@
+/* Copyright 2013-2021 Rene Widera, Benjamin Worpitz
+ *
+ * This file is part of PIConGPU.
+ *
+ * PIConGPU is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * PIConGPU is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with PIConGPU.
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "picongpu/simulation_defines.hpp"
+#include "picongpu/particles/Particles.hpp"
+
+#include <pmacc/particles/Identifier.hpp>
+#include <pmacc/meta/conversion/MakeSeq.hpp>
+#include <pmacc/identifier/value_identifier.hpp>
+#include <pmacc/particles/traits/FilterByFlag.hpp>
+#include <pmacc/meta/String.hpp>
+
+
+namespace picongpu
+{
+    /*########################### define particle attributes #####################*/
+
+    /** describe attributes of a particle */
+    using DefaultParticleAttributes = MakeSeq_t<position<position_pic>, momentum, weighting>;
+
+    /*########################### end particle attributes ########################*/
+
+    /*########################### define species #################################*/
+
+
+    /*--------------------------- electrons --------------------------------------*/
+
+    /* ratio relative to BASE_CHARGE and BASE_MASS */
+    value_identifier(float_X, MassRatioElectrons, 1.0);
+    value_identifier(float_X, ChargeRatioElectrons, 1.0);
+
+    using ParticleFlagsElectrons = MakeSeq_t<
+        particlePusher<UsedParticlePusher>,
+        shape<UsedParticleShape>,
+        interpolation<UsedField2Particle>,
+        current<UsedParticleCurrentSolver>,
+        massRatio<MassRatioElectrons>,
+        chargeRatio<ChargeRatioElectrons>>;
+
+    /* define species electrons */
+    using PIC_Electrons = Particles<PMACC_CSTRING("e"), ParticleFlagsElectrons, DefaultParticleAttributes>;
+
+    /*--------------------------- positrons -------------------------------------------*/
+
+    /* ratio relative to BASE_CHARGE and BASE_MASS */
+    value_identifier(float_X, MassRatioPositrons, 1.0);
+    value_identifier(float_X, ChargeRatioPositrons, -1.0);
+
+    /* ratio relative to BASE_DENSITY */
+    value_identifier(float_X, DensityRatioPositrons, 1.0);
+
+    using ParticleFlagsPositrons = MakeSeq_t<
+        particlePusher<UsedParticlePusher>,
+        shape<UsedParticleShape>,
+        interpolation<UsedField2Particle>,
+        current<UsedParticleCurrentSolver>,
+        massRatio<MassRatioPositrons>,
+        chargeRatio<ChargeRatioPositrons>,
+        densityRatio<DensityRatioPositrons>>;
+
+    /* define species positrons */
+    using PIC_Positrons = Particles<PMACC_CSTRING("p"), ParticleFlagsPositrons, DefaultParticleAttributes>;
+
+    /*########################### end species ####################################*/
+
+    using VectorAllSpecies = MakeSeq_t<PIC_Electrons, PIC_Positrons>;
+
+} // namespace picongpu
diff --git a/share/picongpu/benchmarks/SPEC/include/picongpu/param/speciesInitialization.param b/share/picongpu/benchmarks/SPEC/include/picongpu/param/speciesInitialization.param
new file mode 100644
index 0000000000..7763ca1214
--- /dev/null
+++ b/share/picongpu/benchmarks/SPEC/include/picongpu/param/speciesInitialization.param
@@ -0,0 +1,49 @@
+/* Copyright 2015-2021 Rene Widera, Axel Huebl
+ *
+ * This file is part of PIConGPU.
+ *
+ * PIConGPU is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * PIConGPU is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with PIConGPU.
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/** @file
+ *
+ * Initialize particles inside particle species. This is the final step in
+ * setting up particles (defined in `speciesDefinition.param`) via density
+ * profiles (defined in `density.param`). One can then further derive particles
+ * from one species to another and manipulate attributes with "manipulators"
+ * and "filters" (defined in `particle.param` and `particleFilters.param`).
+ */
+
+#pragma once
+
+#include "picongpu/particles/InitFunctors.hpp"
+
+
+namespace picongpu
+{
+    namespace particles
+    {
+        /** InitPipeline defines the order in which species are initialized
+         *
+         * the functors are called in order (from first to last functor)
+         */
+        using InitPipeline = bmpl::vector<
+            CreateDensity<densityProfiles::Homogenous, startPosition::QuietElectrons, PIC_Electrons>,
+            CreateDensity<densityProfiles::Homogenous, startPosition::QuietPositrons, PIC_Positrons>,
+            Manipulate<manipulators::AssignZDriftPositrons, PIC_Positrons>,
+            Manipulate<manipulators::AssignZDriftElectrons, PIC_Electrons>>;
+
+    } // namespace particles
+} // namespace picongpu
diff --git a/share/picongpu/dockerfiles/README.rst b/share/picongpu/dockerfiles/README.rst
index 7d2a93047d..128c294ebc 100644
--- a/share/picongpu/dockerfiles/README.rst
+++ b/share/picongpu/dockerfiles/README.rst
@@ -25,7 +25,7 @@ This exposes the ISAAC port to connect via the webclient to.
 .. code:: bash
 
     docker pull ax3l/picongpu
-    docker run --runtime=nvidia -p 2459:2459 -t ax3l/picongpu:0.5.0 lwfa_live
+    docker run --runtime=nvidia -p 2459:2459 -t ax3l/picongpu:0.6.0-dev lwfa_live
     # open firefox and isaac client
 
 or
@@ -56,12 +56,12 @@ You can also push the result to dockerhub and singularity-hub (you need an accou
     cd ubuntu-1604
 
     # docker image
-    docker build -t ax3l/picongpu:0.5.0
+    docker build -t ax3l/picongpu:0.6.0-dev .
     # optional: push to dockerhub (needed for singularity bootstrap)
     docker login
-    docker push ax3l/picongpu:0.5.0
+    docker push ax3l/picongpu:0.6.0-dev
     # optional: mark as latest release
-    docker tag ax3l/picongpu:0.5.0 ax3l/picongpu:latest
+    docker tag ax3l/picongpu:0.6.0-dev ax3l/picongpu:latest
     docker push ax3l/picongpu:latest
 
     # singularity image
@@ -69,7 +69,7 @@ You can also push the result to dockerhub and singularity-hub (you need an accou
     sudo singularity bootstrap picongpu.img Singularity
     # optional: push to a singularity registry
     # setup your $HOME/.sregistry first
-    sregistry push picongpu.img --name ax3l/picongpu --tag 0.5.0
+    sregistry push picongpu.img --name ax3l/picongpu --tag 0.6.0-dev
 
 Recipes
 -------
diff --git a/share/picongpu/dockerfiles/ubuntu-1604/Singularity b/share/picongpu/dockerfiles/ubuntu-1604/Singularity
deleted file mode 100644
index 3578c8a996..0000000000
--- a/share/picongpu/dockerfiles/ubuntu-1604/Singularity
+++ /dev/null
@@ -1,11 +0,0 @@
-Bootstrap: docker
-From: ax3l/picongpu:0.5.0
-
-
-%labels
-Maintainer "Axel Huebl <a.huebl@hzdr.de>, Rene Widera <r.widera@hzdr.de>"
-Version 0.5.0
-
-
-%runscript
-exec /bin/bash -l
diff --git a/share/picongpu/dockerfiles/ubuntu-1604/compilers.yaml b/share/picongpu/dockerfiles/ubuntu-1604/compilers.yaml
deleted file mode 100644
index 696f5b9a99..0000000000
--- a/share/picongpu/dockerfiles/ubuntu-1604/compilers.yaml
+++ /dev/null
@@ -1,14 +0,0 @@
-compilers:
-- compiler:
-    environment: {}
-    extra_rpaths: []
-    flags: {}
-    modules: []
-    operating_system: ubuntu16.04
-    paths:
-      cc: /usr/bin/gcc-5
-      cxx: /usr/bin/g++-5
-      f77: /usr/bin/gfortran-5
-      fc: /usr/bin/gfortran-5
-    spec: gcc@5.4.0
-    target: x86_64
diff --git a/share/picongpu/dockerfiles/ubuntu-1604/packages.yaml b/share/picongpu/dockerfiles/ubuntu-1604/packages.yaml
deleted file mode 100644
index c7e994ba12..0000000000
--- a/share/picongpu/dockerfiles/ubuntu-1604/packages.yaml
+++ /dev/null
@@ -1,25 +0,0 @@
-packages:
-  cuda:
-    paths:
-      cuda@9.2.148%gcc@5.4.0 arch=linux-ubuntu16-x86_64: /usr/local/cuda
-    buildable: False
-  pkg-config:
-    paths:
-      pkg-config@0.29.1%gcc@5.4.0 arch=linux-ubuntu16-x86_64: /usr
-    buildable: False
-  python:
-    paths:
-      python@2.7.12%gcc@5.4.0 arch=linux-ubuntu16-x86_64: /usr
-    buildable: False
-  openmpi:
-    version: [3.1.3]
-    variants: +cuda fabrics=libfabric
-  hwloc:
-    variants: +cuda
-  # install issue with gettext
-  # https://github.com/spack/spack/issues/11551
-  flex:
-    version: [2.6.3]
-  all:
-    providers:
-      mpi: [openmpi]
diff --git a/share/picongpu/dockerfiles/ubuntu-1604/Dockerfile b/share/picongpu/dockerfiles/ubuntu-2004/Dockerfile
similarity index 89%
rename from share/picongpu/dockerfiles/ubuntu-1604/Dockerfile
rename to share/picongpu/dockerfiles/ubuntu-2004/Dockerfile
index f772a07452..47d5c56927 100644
--- a/share/picongpu/dockerfiles/ubuntu-1604/Dockerfile
+++ b/share/picongpu/dockerfiles/ubuntu-2004/Dockerfile
@@ -1,12 +1,14 @@
-FROM       nvidia/cuda:9.2-base
-LABEL      maintainer="Axel Huebl <a.huebl@hzdr.de>, Rene Widera <r.widera@hzdr.de>"
+FROM       nvidia/cuda:11.2.0-base-ubuntu20.04
+MAINTAINER Axel Huebl <a.huebl@hzdr.de>
+LABEL      authors="Axel Huebl, René Widera"
 
 # docker and image environment
 ENV        DEBIAN_FRONTEND=noninteractive \
            FORCE_UNSAFE_CONFIGURE=1 \
            SPACK_ROOT=/usr/local \
            SPACK_EXTRA_REPO=/usr/local/share/spack-repo \
-           PIC_PACKAGE='picongpu@0.5.0+isaac backend=cuda'
+           PIC_PACKAGE='picongpu@develop+isaac backend=cuda target=x86_64' \
+           CUDA_PKG_VERSION="11-2"
 
 # install minimal spack dependencies
 #   - adds gfortran for spack's openmpi package
@@ -24,11 +26,9 @@ RUN        apt-get update && \
               coreutils \
               cuda-cupti-$CUDA_PKG_VERSION \
               cuda-command-line-tools-$CUDA_PKG_VERSION \
-              cuda-core-$CUDA_PKG_VERSION \
               cuda-cudart-dev-$CUDA_PKG_VERSION \
-              cuda-curand-dev-$CUDA_PKG_VERSION \
+              libcurand-dev-$CUDA_PKG_VERSION \
               cuda-minimal-build-$CUDA_PKG_VERSION \
-              cuda-misc-headers-$CUDA_PKG_VERSION \
               cuda-nvml-dev-$CUDA_PKG_VERSION \
               curl \
               environment-modules \
@@ -55,6 +55,7 @@ RUN        curl -s -L https://github.com/spack/spack/archive/develop.tar.gz \
            curl -s -L https://api.github.com/repos/ComputationalRadiationPhysics/spack-repo/tarball \
                 | tar xzC $SPACK_EXTRA_REPO --strip 1 && \
            spack repo add --scope=system $SPACK_EXTRA_REPO
+RUN        spack install --only dependencies $PIC_PACKAGE
 RUN        spack install $PIC_PACKAGE && \
            spack clean -a
 
@@ -81,7 +82,7 @@ RUN        /bin/echo -e '#!/bin/bash -l\n' \
 RUN        /bin/bash -l -c ' \
                pic-create $PICSRC/share/picongpu/examples/LaserWakefield /opt/picInputs/lwfa && \
                cd /opt/picInputs/lwfa && \
-               pic-build -b "cuda:30;35;37;50;60;70" -c"-DCUDAMEMTEST_ENABLE=OFF" && \
+               pic-build -b "cuda:35;37;50;60;70;80" -c"-DCUDAMEMTEST_ENABLE=OFF" && \
                rm -rf .build && \
                chmod a+x /opt/picInputs/*/bin/* && \
                chmod a+r -R /opt/picInputs/* && \
@@ -90,7 +91,7 @@ RUN        /bin/bash -l -c ' \
 RUN        /bin/bash -l -c ' \
                pic-create $PICSRC/share/picongpu/examples/KelvinHelmholtz /opt/picInputs/khi && \
                cd /opt/picInputs/khi && \
-               pic-build -b "cuda:30;35;37;50;60;70" -c"-DCUDAMEMTEST_ENABLE=OFF" && \
+               pic-build -b "cuda:35;37;50;60;70;80" -c"-DCUDAMEMTEST_ENABLE=OFF" && \
                rm -rf .build && \
                chmod a+x /opt/picInputs/*/bin/* && \
                chmod a+r -R /opt/picInputs/* && \
@@ -99,7 +100,7 @@ RUN        /bin/bash -l -c ' \
 RUN        /bin/bash -l -c ' \
                pic-create $PICSRC/share/picongpu/examples/FoilLCT /opt/picInputs/foil && \
                cd /opt/picInputs/foil && \
-               pic-build -b "cuda:30;35;37;50;60;70" -c"-DCUDAMEMTEST_ENABLE=OFF" && \
+               pic-build -b "cuda:35;37;50;60;70;80" -c"-DCUDAMEMTEST_ENABLE=OFF" && \
                rm -rf .build && \
                chmod a+x /opt/picInputs/*/bin/* && \
                chmod a+r -R /opt/picInputs/* && \
diff --git a/share/picongpu/dockerfiles/ubuntu-2004/Singularity b/share/picongpu/dockerfiles/ubuntu-2004/Singularity
new file mode 100644
index 0000000000..7080868e12
--- /dev/null
+++ b/share/picongpu/dockerfiles/ubuntu-2004/Singularity
@@ -0,0 +1,11 @@
+Bootstrap: docker
+From: ax3l/picongpu:0.6.0-dev
+
+
+%labels
+Maintainer "Axel Huebl <a.huebl@hzdr.de>, Rene Widera <r.widera@hzdr.de>"
+Version 0.6.0-dev
+
+
+%runscript
+exec /bin/bash -l
diff --git a/share/picongpu/dockerfiles/ubuntu-2004/compilers.yaml b/share/picongpu/dockerfiles/ubuntu-2004/compilers.yaml
new file mode 100644
index 0000000000..e28ec37f97
--- /dev/null
+++ b/share/picongpu/dockerfiles/ubuntu-2004/compilers.yaml
@@ -0,0 +1,14 @@
+compilers:
+- compiler:
+    environment: {}
+    extra_rpaths: []
+    flags: {}
+    modules: []
+    operating_system: ubuntu20.04
+    paths:
+      cc: /usr/bin/gcc-9
+      cxx: /usr/bin/g++-9
+      f77: /usr/bin/gfortran-9
+      fc: /usr/bin/gfortran-9
+    spec: gcc@9.3.0
+    target: x86_64
diff --git a/share/picongpu/dockerfiles/ubuntu-1604/modules.yaml b/share/picongpu/dockerfiles/ubuntu-2004/modules.yaml
similarity index 100%
rename from share/picongpu/dockerfiles/ubuntu-1604/modules.yaml
rename to share/picongpu/dockerfiles/ubuntu-2004/modules.yaml
diff --git a/share/picongpu/dockerfiles/ubuntu-2004/packages.yaml b/share/picongpu/dockerfiles/ubuntu-2004/packages.yaml
new file mode 100644
index 0000000000..ab4e9dbae5
--- /dev/null
+++ b/share/picongpu/dockerfiles/ubuntu-2004/packages.yaml
@@ -0,0 +1,28 @@
+packages:
+  cuda:
+    buildable: false
+    externals:
+      - prefix: /usr/local/cuda
+        spec: cuda@11.2%gcc@9.3.0 arch=linux-ubuntu20.04-x86_64
+  pkg-config:
+    buildable: false
+    externals:
+      - prefix: /usr
+        spec: pkg-config@0.29.1%gcc@9.3.0 arch=linux-ubuntu20.04-x86_64
+  python:
+    buildable: false
+    externals:
+      - prefix: /usr
+        spec: python@2.7.18%gcc@9.3.0 arch=linux-ubuntu20.04-x86_64
+  openmpi:
+    version: [4.1.0]
+    variants: +cuda fabrics=auto
+  hwloc:
+    variants: +cuda
+  # install issue with gettext
+  # https://github.com/spack/spack/issues/11551
+  flex:
+    version: [2.6.3]
+  all:
+    providers:
+      mpi: [openmpi]
diff --git a/share/picongpu/dockerfiles/ubuntu-1604/start_foil_4.sh b/share/picongpu/dockerfiles/ubuntu-2004/start_foil_4.sh
similarity index 100%
rename from share/picongpu/dockerfiles/ubuntu-1604/start_foil_4.sh
rename to share/picongpu/dockerfiles/ubuntu-2004/start_foil_4.sh
diff --git a/share/picongpu/dockerfiles/ubuntu-1604/start_foil_8.sh b/share/picongpu/dockerfiles/ubuntu-2004/start_foil_8.sh
similarity index 100%
rename from share/picongpu/dockerfiles/ubuntu-1604/start_foil_8.sh
rename to share/picongpu/dockerfiles/ubuntu-2004/start_foil_8.sh
diff --git a/share/picongpu/dockerfiles/ubuntu-1604/start_khi_1.sh b/share/picongpu/dockerfiles/ubuntu-2004/start_khi_1.sh
similarity index 100%
rename from share/picongpu/dockerfiles/ubuntu-1604/start_khi_1.sh
rename to share/picongpu/dockerfiles/ubuntu-2004/start_khi_1.sh
diff --git a/share/picongpu/dockerfiles/ubuntu-1604/start_khi_4.sh b/share/picongpu/dockerfiles/ubuntu-2004/start_khi_4.sh
similarity index 100%
rename from share/picongpu/dockerfiles/ubuntu-1604/start_khi_4.sh
rename to share/picongpu/dockerfiles/ubuntu-2004/start_khi_4.sh
diff --git a/share/picongpu/dockerfiles/ubuntu-1604/start_khi_8.sh b/share/picongpu/dockerfiles/ubuntu-2004/start_khi_8.sh
similarity index 100%
rename from share/picongpu/dockerfiles/ubuntu-1604/start_khi_8.sh
rename to share/picongpu/dockerfiles/ubuntu-2004/start_khi_8.sh
diff --git a/share/picongpu/dockerfiles/ubuntu-1604/start_lwfa.sh b/share/picongpu/dockerfiles/ubuntu-2004/start_lwfa.sh
similarity index 100%
rename from share/picongpu/dockerfiles/ubuntu-1604/start_lwfa.sh
rename to share/picongpu/dockerfiles/ubuntu-2004/start_lwfa.sh
diff --git a/share/picongpu/dockerfiles/ubuntu-1604/start_lwfa_4.sh b/share/picongpu/dockerfiles/ubuntu-2004/start_lwfa_4.sh
similarity index 100%
rename from share/picongpu/dockerfiles/ubuntu-1604/start_lwfa_4.sh
rename to share/picongpu/dockerfiles/ubuntu-2004/start_lwfa_4.sh
diff --git a/share/picongpu/dockerfiles/ubuntu-1604/start_lwfa_8.sh b/share/picongpu/dockerfiles/ubuntu-2004/start_lwfa_8.sh
similarity index 100%
rename from share/picongpu/dockerfiles/ubuntu-1604/start_lwfa_8.sh
rename to share/picongpu/dockerfiles/ubuntu-2004/start_lwfa_8.sh
diff --git a/share/picongpu/dockerfiles/ubuntu-1604/start_lwfa_live.sh b/share/picongpu/dockerfiles/ubuntu-2004/start_lwfa_live.sh
similarity index 100%
rename from share/picongpu/dockerfiles/ubuntu-1604/start_lwfa_live.sh
rename to share/picongpu/dockerfiles/ubuntu-2004/start_lwfa_live.sh
diff --git a/share/picongpu/dockerfiles/ubuntu-1604/start_lwfa_live_4.sh b/share/picongpu/dockerfiles/ubuntu-2004/start_lwfa_live_4.sh
similarity index 100%
rename from share/picongpu/dockerfiles/ubuntu-1604/start_lwfa_live_4.sh
rename to share/picongpu/dockerfiles/ubuntu-2004/start_lwfa_live_4.sh
diff --git a/share/picongpu/dockerfiles/ubuntu-1604/start_lwfa_live_8.sh b/share/picongpu/dockerfiles/ubuntu-2004/start_lwfa_live_8.sh
similarity index 100%
rename from share/picongpu/dockerfiles/ubuntu-1604/start_lwfa_live_8.sh
rename to share/picongpu/dockerfiles/ubuntu-2004/start_lwfa_live_8.sh
diff --git a/share/picongpu/examples/Bremsstrahlung/bin/plot_energy_histogram.py b/share/picongpu/examples/Bremsstrahlung/bin/plot_energy_histogram.py
index 8adfa7d46a..2faf025d9c 100644
--- a/share/picongpu/examples/Bremsstrahlung/bin/plot_energy_histogram.py
+++ b/share/picongpu/examples/Bremsstrahlung/bin/plot_energy_histogram.py
@@ -21,7 +21,7 @@
 There will be 5 datasets for the 5 different output iterations. The plots
 will also not contain the outliers.
 
-Copyright 2017-2020 Marco Garten, Axel Huebl
+Copyright 2017-2021 Marco Garten, Axel Huebl
 Authors: Axel Huebl
 License: GPLv3+
 """
diff --git a/share/picongpu/examples/Bremsstrahlung/bin/plot_particle_calorimeter.py b/share/picongpu/examples/Bremsstrahlung/bin/plot_particle_calorimeter.py
index 69bc1b1ff1..fbe928f2b5 100644
--- a/share/picongpu/examples/Bremsstrahlung/bin/plot_particle_calorimeter.py
+++ b/share/picongpu/examples/Bremsstrahlung/bin/plot_particle_calorimeter.py
@@ -21,7 +21,7 @@
 There will be 5 datasets for the 5 different output iterations. The plots
 will also not contain the outliers.
 
-Copyright 2017-2020 Marco Garten, Axel Huebl
+Copyright 2017-2021 Marco Garten, Axel Huebl
 Authors: Axel Huebl
 License: GPLv3+
 """
diff --git a/share/picongpu/examples/Bremsstrahlung/etc/picongpu/1.cfg b/share/picongpu/examples/Bremsstrahlung/etc/picongpu/1.cfg
index 6adb667207..aef299a5bb 100644
--- a/share/picongpu/examples/Bremsstrahlung/etc/picongpu/1.cfg
+++ b/share/picongpu/examples/Bremsstrahlung/etc/picongpu/1.cfg
@@ -1,4 +1,4 @@
-# Copyright 2013-2020 Heiko Burau, Richard Pausch, Felix Schmitt, Axel Huebl
+# Copyright 2013-2021 Heiko Burau, Richard Pausch, Felix Schmitt, Axel Huebl
 #
 # This file is part of PIConGPU.
 #
@@ -52,11 +52,16 @@ TBG_ph_calorimeter="--ph_calorimeter.period 1000 --ph_calorimeter.openingYaw 360
 
 TBG_ph_energyHistogram="--ph_energyHistogram.period 1000 --ph_energyHistogram.filter all --ph_energyHistogram.minEnergy 10 --ph_energyHistogram.maxEnergy 10000"
 
-TBG_plugins="--hdf5.period 1000 --hdf5.file simData \
-             --e_macroParticlesCount.period 1000 \
-             --i_macroParticlesCount.period 1000 \
+# file I/O with openPMD-HDF5
+TBG_openPMD="--openPMD.period 1000  \
+             --openPMD.file simData \
+             --openPMD.ext h5"
+
+TBG_plugins="!TBG_openPMD                         \
+             --e_macroParticlesCount.period 1000  \
+             --i_macroParticlesCount.period 1000  \
              --ph_macroParticlesCount.period 1000 \
-             !TBG_ph_calorimeter \
+             !TBG_ph_calorimeter                  \
              !TBG_ph_energyHistogram"
 
 
diff --git a/share/picongpu/examples/Bremsstrahlung/etc/picongpu/8.cfg b/share/picongpu/examples/Bremsstrahlung/etc/picongpu/8.cfg
index 98b73db410..8536a07732 100644
--- a/share/picongpu/examples/Bremsstrahlung/etc/picongpu/8.cfg
+++ b/share/picongpu/examples/Bremsstrahlung/etc/picongpu/8.cfg
@@ -1,4 +1,4 @@
-# Copyright 2013-2020 Heiko Burau, Richard Pausch, Felix Schmitt, Axel Huebl
+# Copyright 2013-2021 Heiko Burau, Richard Pausch, Felix Schmitt, Axel Huebl
 #
 # This file is part of PIConGPU.
 #
@@ -52,7 +52,12 @@ TBG_ph_calorimeter="--ph_calorimeter.period 1000 --ph_calorimeter.openingYaw 360
 
 TBG_ph_energyHistogram="--ph_energyHistogram.period 1000 --ph_energyHistogram.filter all --ph_energyHistogram.minEnergy 10 --ph_energyHistogram.maxEnergy 10000"
 
-TBG_plugins="--hdf5.period 1000 --hdf5.file simData \
+# file I/O with openPMD-HDF5
+TBG_openPMD="--openPMD.period 1000  \
+             --openPMD.file simData \
+             --openPMD.ext h5"
+
+TBG_plugins="!TBG_openPMD                        \
              --e_macroParticlesCount.period 1000 \
              --i_macroParticlesCount.period 1000 \
              --ph_macroParticlesCount.period 1000 \
diff --git a/share/picongpu/examples/Bremsstrahlung/include/picongpu/param/bremsstrahlung.param b/share/picongpu/examples/Bremsstrahlung/include/picongpu/param/bremsstrahlung.param
index d9c182465e..8ab631aded 100644
--- a/share/picongpu/examples/Bremsstrahlung/include/picongpu/param/bremsstrahlung.param
+++ b/share/picongpu/examples/Bremsstrahlung/include/picongpu/param/bremsstrahlung.param
@@ -1,4 +1,4 @@
-/* Copyright 2016-2020 Heiko Burau
+/* Copyright 2016-2021 Heiko Burau
  *
  * This file is part of PIConGPU.
  *
@@ -21,112 +21,110 @@
 
 namespace picongpu
 {
-namespace particles
-{
-namespace bremsstrahlung
-{
-
-
-/** params related to the energy loss and deflection of the incident electron */
-namespace electron
-{
-    /** Minimal kinetic electron energy in MeV for the lookup table.
-     *
-     * For electrons below this value Bremsstrahlung is not taken into account.
-     */
-    constexpr float_64 MIN_ENERGY_MeV = 0.5;
-
-    /** Maximal kinetic electron energy in MeV for the lookup table.
-     *
-     * Electrons above this value cause a out-of-bounds access at the
-     * lookup table. Bounds checking is enabled for "CRITICAL" log level.
-     */
-    constexpr float_64 MAX_ENERGY_MeV = 100.0;
-
-    /** Minimal polar deflection angle due to screening. */
-    constexpr float_64 MIN_THETA = 0.001;
-
-    /** number of lookup table divisions for the kappa axis.
-     *
-     * Kappa is the energy loss normalized to the initial kinetic energy.
-     * The axis is scaled linearly.
-     */
-    constexpr uint32_t NUM_SAMPLES_KAPPA = 64;
-
-    /** number of lookup table divisions for the initial kinetic energy axis.
-     *
-     * The axis is scaled logarithmically.
-     */
-    constexpr uint32_t NUM_SAMPLES_EKIN = 128;
-
-    /** Kappa is the energy loss normalized to the initial kinetic energy.
-     *
-     * This minimal value is needed by the numerics to avoid a division by zero.
-     */
-    constexpr float_64 MIN_KAPPA = 1.0e-10;
-
-} // namespace electron
-
-/** params related to the creation and the emission angle of the photon */
-namespace photon
-{
-    /** Low-energy threshold in keV of the incident electron for the creation of photons.
-     *
-     * Below this value photon emission is neglected.
-     */
-    constexpr float_64 SOFT_PHOTONS_CUTOFF_keV = 5.0;
-
-    /** number of lookup table divisions for the delta axis.
-     *
-     * Delta is the angular emission probability (normalized to one) integrated from zero to theta,
-     * where theta is the angle between the photon momentum and the final electron momentum.
-     *
-     * The axis is scaled linearly.
-     */
-    constexpr uint32_t NUM_SAMPLES_DELTA = 256;
-
-    /** number of lookup table divisions for the gamma axis.
-     *
-     * Gamma is the relativistic factor of the incident electron.
-     *
-     * The axis is scaled logarithmically.
-     */
-    constexpr uint32_t NUM_SAMPLES_GAMMA = 64;
-
-    /** Maximal value of delta for the lookup table.
-     *
-     * Delta is the angular emission probability (normalized to one) integrated from zero to theta,
-     * where theta is the angle between the photon momentum and the final electron momentum.
-     *
-     * A value close to one is reasonable. Though exactly one was actually correct,
-     * because it would map to theta = pi (maximum polar angle), the sampling then would be bad
-     * in the ultrarelativistic case. In this regime the emission primarily takes place at small thetas.
-     * So a maximum delta close to one maps to a reasonable maximum theta.
-     */
-    constexpr float_64 MAX_DELTA = 0.95;
-
-    /** minimal gamma for the lookup table. */
-    constexpr float_64 MIN_GAMMA = 1.0;
-
-    /** maximal gamma for the lookup table.
-     *
-     * Bounds checking is enabled for "CRITICAL" log level.
-     */
-    constexpr float_64 MAX_GAMMA = 200;
-
-    /** if the emission probability per timestep is higher than this value and the log level is set to
-     *  "CRITICAL" a warning will be raised.
-     */
-    constexpr float_64 SINGLE_EMISSION_PROB_LIMIT = 0.4;
-
-    /** ratio between macro electron weighting (numerator) and macro photon weighting (denominator)
-     *  at the time of creation.
-     *
-     * The emission probability is proportional to this parameter.
-     */
-    constexpr float_64 WEIGHTING_RATIO = 5;
-} // namespace photon
-
-} // namespace bremsstrahlung
-} // namespace particles
+    namespace particles
+    {
+        namespace bremsstrahlung
+        {
+            /** params related to the energy loss and deflection of the incident electron */
+            namespace electron
+            {
+                /** Minimal kinetic electron energy in MeV for the lookup table.
+                 *
+                 * For electrons below this value Bremsstrahlung is not taken into account.
+                 */
+                constexpr float_64 MIN_ENERGY_MeV = 0.5;
+
+                /** Maximal kinetic electron energy in MeV for the lookup table.
+                 *
+                 * Electrons above this value cause an out-of-bounds access at the
+                 * lookup table. Bounds checking is enabled for "CRITICAL" log level.
+                 */
+                constexpr float_64 MAX_ENERGY_MeV = 100.0;
+
+                /** Minimal polar deflection angle due to screening. */
+                constexpr float_64 MIN_THETA = 0.001;
+
+                /** number of lookup table divisions for the kappa axis.
+                 *
+                 * Kappa is the energy loss normalized to the initial kinetic energy.
+                 * The axis is scaled linearly.
+                 */
+                constexpr uint32_t NUM_SAMPLES_KAPPA = 64;
+
+                /** number of lookup table divisions for the initial kinetic energy axis.
+                 *
+                 * The axis is scaled logarithmically.
+                 */
+                constexpr uint32_t NUM_SAMPLES_EKIN = 128;
+
+                /** Kappa is the energy loss normalized to the initial kinetic energy.
+                 *
+                 * This minimal value is needed by the numerics to avoid a division by zero.
+                 */
+                constexpr float_64 MIN_KAPPA = 1.0e-10;
+
+            } // namespace electron
+
+            /** params related to the creation and the emission angle of the photon */
+            namespace photon
+            {
+                /** Low-energy threshold in keV of the incident electron for the creation of photons.
+                 *
+                 * Below this value photon emission is neglected.
+                 */
+                constexpr float_64 SOFT_PHOTONS_CUTOFF_keV = 5.0;
+
+                /** number of lookup table divisions for the delta axis.
+                 *
+                 * Delta is the angular emission probability (normalized to one) integrated from zero to theta,
+                 * where theta is the angle between the photon momentum and the final electron momentum.
+                 *
+                 * The axis is scaled linearly.
+                 */
+                constexpr uint32_t NUM_SAMPLES_DELTA = 256;
+
+                /** number of lookup table divisions for the gamma axis.
+                 *
+                 * Gamma is the relativistic factor of the incident electron.
+                 *
+                 * The axis is scaled logarithmically.
+                 */
+                constexpr uint32_t NUM_SAMPLES_GAMMA = 64;
+
+                /** Maximal value of delta for the lookup table.
+                 *
+                 * Delta is the angular emission probability (normalized to one) integrated from zero to theta,
+                 * where theta is the angle between the photon momentum and the final electron momentum.
+                 *
+                 * A value close to one is reasonable. Though exactly one would actually be correct,
+                 * because it would map to theta = pi (maximum polar angle), the sampling then would be bad
+                 * in the ultrarelativistic case. In this regime the emission primarily takes place at small thetas.
+                 * So a maximum delta close to one maps to a reasonable maximum theta.
+                 */
+                constexpr float_64 MAX_DELTA = 0.95;
+
+                /** minimal gamma for the lookup table. */
+                constexpr float_64 MIN_GAMMA = 1.0;
+
+                /** maximal gamma for the lookup table.
+                 *
+                 * Bounds checking is enabled for "CRITICAL" log level.
+                 */
+                constexpr float_64 MAX_GAMMA = 200;
+
+                /** if the emission probability per timestep is higher than this value and the log level is set to
+                 *  "CRITICAL" a warning will be raised.
+                 */
+                constexpr float_64 SINGLE_EMISSION_PROB_LIMIT = 0.4;
+
+                /** ratio between macro electron weighting (numerator) and macro photon weighting (denominator)
+                 *  at the time of creation.
+                 *
+                 * The emission probability is proportional to this parameter.
+                 */
+                constexpr float_64 WEIGHTING_RATIO = 5;
+            } // namespace photon
+
+        } // namespace bremsstrahlung
+    } // namespace particles
 } // namespace picongpu
diff --git a/share/picongpu/examples/Bremsstrahlung/include/picongpu/param/density.param b/share/picongpu/examples/Bremsstrahlung/include/picongpu/param/density.param
index 47e2a6b906..e6caad1521 100644
--- a/share/picongpu/examples/Bremsstrahlung/include/picongpu/param/density.param
+++ b/share/picongpu/examples/Bremsstrahlung/include/picongpu/param/density.param
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Heiko Burau, Rene Widera, Felix Schmitt,
+/* Copyright 2013-2021 Axel Huebl, Heiko Burau, Rene Widera, Felix Schmitt,
  *                     Richard Pausch
  *
  * This file is part of PIConGPU.
@@ -26,61 +26,55 @@
 
 namespace picongpu
 {
-namespace SI
-{
-    /** The maximum density in particles per m^3 in the gas distribution
-     *  unit: ELEMENTS/m^3
-     *
-     * He (2e- / Atom ) with 1.e15 He / m^3
-     *                      = 2.e15 e- / m^3 */
-
-    constexpr float_64 BASE_DENSITY_SI = 5.9e28; // solid gold
-
-}
+    namespace SI
+    {
+        /** The maximum density in particles per m^3 in the gas distribution
+         *  unit: ELEMENTS/m^3
+         *
+         * He (2e- / Atom ) with 1.e15 He / m^3
+         *                      = 2.e15 e- / m^3 */
 
-namespace densityProfiles
-{
+        constexpr float_64 BASE_DENSITY_SI = 5.9e28; // solid gold
 
-struct FoilFunctor
-{
+    } // namespace SI
 
-    /**
-     * This formula uses SI quantities only
-     * The profile will be multiplied by BASE_DENSITY_SI.
-     *
-     * @param position_SI total offset including all slides [in meter]
-     * @param cellSize_SI cell sizes [in meter]
-     *
-     * @return float_X density [normalized to 1.0]
-     */
-    HDINLINE float_X operator()(
-        float2_64 pos,
-        const float3_64& cellSize_SI
-    )
+    namespace densityProfiles
     {
-        /* center point of foil */
-        constexpr float_64 plateauPos = 4e-6;
-        /* thickness of foil */
-        constexpr float_64 plateauLength = 2e-6;
-        /* gaussian ramp length of density above the surface */
-        constexpr float_64 rampLength = 0.1e-6;
+        struct FoilFunctor
+        {
+            /**
+             * This formula uses SI quantities only
+             * The profile will be multiplied by BASE_DENSITY_SI.
+             *
+             * @param pos total offset including all slides [in meter]
+             * @param cellSize_SI cell sizes [in meter]
+             *
+             * @return float_X density [normalized to 1.0]
+             */
+            HDINLINE float_X operator()(float2_64 pos, const float3_64& cellSize_SI)
+            {
+                /* center point of foil */
+                constexpr float_64 plateauPos = 4e-6;
+                /* thickness of foil */
+                constexpr float_64 plateauLength = 2e-6;
+                /* gaussian ramp length of density above the surface */
+                constexpr float_64 rampLength = 0.1e-6;
 
-        using namespace pmacc::algorithms::math;
+                using namespace pmacc::math;
 
-        if( abs( pos.y() - plateauPos) < plateauLength / 2.0 )
-        {
-            return 1.0_X;
-        }
-        const float_64 d = math::min(
-            abs( pos.y() - plateauPos + plateauLength / 2.0 ),
-            abs( pos.y() - plateauPos - plateauLength / 2.0 )
-        );
-        return float_X( exp( -d * d / ( 2.0_X * rampLength * rampLength ) ) );
-    }
-};
+                if(abs(pos.y() - plateauPos) < plateauLength / 2.0)
+                {
+                    return 1.0_X;
+                }
+                const float_64 d = math::min(
+                    abs(pos.y() - plateauPos + plateauLength / 2.0),
+                    abs(pos.y() - plateauPos - plateauLength / 2.0));
+                return float_X(exp(-d * d / (2.0_X * rampLength * rampLength)));
+            }
+        };
 
-//! definition of free formula profile
-using Foil = FreeFormulaImpl< FoilFunctor >;
+        //! definition of free formula profile
+        using Foil = FreeFormulaImpl<FoilFunctor>;
 
-} // namespace densityProfiles
-} // namepsace picongpu
+    } // namespace densityProfiles
+} // namespace picongpu
diff --git a/share/picongpu/examples/Bremsstrahlung/include/picongpu/param/dimension.param b/share/picongpu/examples/Bremsstrahlung/include/picongpu/param/dimension.param
index 0d727bc754..7d41fd9e9f 100644
--- a/share/picongpu/examples/Bremsstrahlung/include/picongpu/param/dimension.param
+++ b/share/picongpu/examples/Bremsstrahlung/include/picongpu/param/dimension.param
@@ -1,4 +1,4 @@
-/* Copyright 2014-2020 Axel Huebl
+/* Copyright 2014-2021 Axel Huebl
  *
  * This file is part of PIConGPU.
  *
diff --git a/share/picongpu/examples/Bremsstrahlung/include/picongpu/param/grid.param b/share/picongpu/examples/Bremsstrahlung/include/picongpu/param/grid.param
index e23d45b366..65ddb03586 100644
--- a/share/picongpu/examples/Bremsstrahlung/include/picongpu/param/grid.param
+++ b/share/picongpu/examples/Bremsstrahlung/include/picongpu/param/grid.param
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Rene Widera, Richard Pausch, Benjamin Worpitz
+/* Copyright 2013-2021 Rene Widera, Richard Pausch, Benjamin Worpitz
  *
  * This file is part of PIConGPU.
  *
@@ -21,7 +21,6 @@
 
 namespace picongpu
 {
-
     namespace SI
     {
         /** extent of one cell in x-direction
@@ -43,21 +42,21 @@ namespace picongpu
         /** Duration of one timestep
          *  unit: seconds */
         constexpr float_64 DELTA_T_SI = CELL_WIDTH_SI / SPEED_OF_LIGHT_SI / SQRT_OF_2 / EPS_CFL;
-    } //namespace SI
+    } // namespace SI
 
     //! Defines the size of the absorbing zone (in cells)
     constexpr uint32_t ABSORBER_CELLS[3][2] = {
-        {32, 32},  /*x direction [negative,positive]*/
-        {32, 32},  /*y direction [negative,positive]*/
-        {32, 32}   /*z direction [negative,positive]*/
-    }; //unit: number of cells
+        {32, 32}, /*x direction [negative,positive]*/
+        {32, 32}, /*y direction [negative,positive]*/
+        {32, 32} /*z direction [negative,positive]*/
+    }; // unit: number of cells
 
     //! Define the strength of the absorber for any direction
     constexpr float_X ABSORBER_STRENGTH[3][2] = {
         {1.0e-3, 1.0e-3}, /*x direction [negative,positive]*/
         {1.0e-3, 1.0e-3}, /*y direction [negative,positive]*/
-        {1.0e-3, 1.0e-3}  /*z direction [negative,positive]*/
-    }; //unit: none
+        {1.0e-3, 1.0e-3} /*z direction [negative,positive]*/
+    }; // unit: none
 
     /** When to move the co-moving window.
      *  An initial pseudo particle, flying with the speed of light,
@@ -76,5 +75,4 @@ namespace picongpu
      */
     constexpr float_64 movePoint = 0.90;
 
-}
-
+} // namespace picongpu
diff --git a/share/picongpu/examples/Bremsstrahlung/include/picongpu/param/laser.param b/share/picongpu/examples/Bremsstrahlung/include/picongpu/param/laser.param
index 361d562cbd..f164b3a779 100644
--- a/share/picongpu/examples/Bremsstrahlung/include/picongpu/param/laser.param
+++ b/share/picongpu/examples/Bremsstrahlung/include/picongpu/param/laser.param
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Anton Helm, Rene Widera, Richard Pausch, Alexander Debus
+/* Copyright 2013-2021 Axel Huebl, Anton Helm, Rene Widera, Richard Pausch, Alexander Debus
  *
  * This file is part of PIConGPU.
  *
@@ -48,106 +48,110 @@
 
 namespace picongpu
 {
-namespace fields
-{
-namespace laserProfiles
-{
-namespace gaussianBeam
-{
-    //! Use only the 0th Laguerremode for a standard Gaussian
-    static constexpr uint32_t MODENUMBER = 0;
-    PMACC_CONST_VECTOR(float_X, MODENUMBER + 1, LAGUERREMODES, 1.0);
-    // This is just an example for a more complicated set of Laguerre modes
-    //constexpr uint32_t MODENUMBER = 12;
-    //PMACC_CONST_VECTOR(float_X, MODENUMBER + 1, LAGUERREMODES, -1.0, 0.0300519, 0.319461, -0.23783, 0.0954839, 0.0318653, -0.144547, 0.0249208, -0.111989, 0.0434385, -0.030038, -0.00896321, -0.0160788);
-
-} // namespace gaussianBeam
-
-    struct GaussianBeamParam
+    namespace fields
     {
-        /** unit: meter */
-        static constexpr float_64 WAVE_LENGTH_SI = 0.8e-6;
-
-        /** Convert the normalized laser strength parameter a0 to Volt per meter */
-        static constexpr float_64 UNITCONV_A0_to_Amplitude_SI = -2.0 * PI / WAVE_LENGTH_SI * ::picongpu::SI::ELECTRON_MASS_SI * ::picongpu::SI::SPEED_OF_LIGHT_SI * ::picongpu::SI::SPEED_OF_LIGHT_SI / ::picongpu::SI::ELECTRON_CHARGE_SI;
-
-        /** unit: W / m^2 */
-        // calculate: _A0 = 8.549297e-6 * sqrt( Intensity[W/m^2] ) * wavelength[m] (linearly polarized)
-
-        /** unit: none */
-        static constexpr float_64 _A0  = 40.0;
-
-        /** unit: Volt / meter */
-        static constexpr float_64 AMPLITUDE_SI = _A0 * UNITCONV_A0_to_Amplitude_SI;
-
-        /** unit: Volt / meter */
-        //static constexpr float_64 AMPLITUDE_SI = 1.738e13;
-
-        /** Pulse length: sigma of std. gauss for intensity (E^2)
-         *  PULSE_LENGTH_SI = FWHM_of_Intensity   / [ 2*sqrt{ 2* ln(2) } ]
-         *                                          [    2.354820045     ]
-         *  Info:             FWHM_of_Intensity = FWHM_Illumination
-         *                      = what a experimentalist calls "pulse duration"
-         *
-         *  unit: seconds (1 sigma) */
-        static constexpr float_64 PULSE_LENGTH_SI = 8.0e-15;
-
-        /** beam waist: distance from the axis where the pulse intensity (E^2)
-         *              decreases to its 1/e^2-th part,
-         *              at the focus position of the laser
-         * W0_SI = FWHM_of_Intensity / sqrt{ 2* ln(2) }
-         *                             [   1.17741    ]
-         *
-         *  unit: meter */
-        static constexpr float_64 W0_SI = 1.5e-6;
-        /** the distance to the laser focus in y-direction
-         *  unit: meter */
-        static constexpr float_64 FOCUS_POS_SI = 4.e-6;
-
-        /** The laser pulse will be initialized PULSE_INIT times of the PULSE_LENGTH
-         *
-         *  unit: none */
-        static constexpr float_64 PULSE_INIT = 6.0;
-
-        /** cell from top where the laser is initialized
-         *
-         * if `initPlaneY == 0` than the absorber are disabled.
-         * if `initPlaneY > absorbercells negative Y` the negative absorber in y
-         * direction is enabled
-         *
-         * valid ranges:
-         *   - initPlaneY == 0
-         *   - absorber cells negative Y < initPlaneY < cells in y direction of the top gpu
-         */
-        static constexpr uint32_t initPlaneY = 0;
-
-        /** laser phase shift (no shift: 0.0)
-         *
-         * sin(omega*time + laser_phase): starts with phase=0 at center --> E-field=0 at center
-         *
-         * unit: rad, periodic in 2*pi
-         */
-        static constexpr float_X LASER_PHASE = 0.0;
-
-        using LAGUERREMODES_t = gaussianBeam::LAGUERREMODES_t;
-        static constexpr uint32_t MODENUMBER = gaussianBeam::MODENUMBER;
-
-        /** Available polarisation types
-         */
-        enum PolarisationType
+        namespace laserProfiles
         {
-            LINEAR_X = 1u,
-            LINEAR_Z = 2u,
-            CIRCULAR = 4u,
-        };
-        /** Polarization selection
-         */
-        static constexpr PolarisationType Polarisation = LINEAR_X;
-    };
-
-    //! currently selected laser profile
-    using Selected = GaussianBeam< GaussianBeamParam >;
-
-} // namespace laserProfiles
-} // namespace fields
+            namespace gaussianBeam
+            {
+                //! Use only the 0th Laguerremode for a standard Gaussian
+                static constexpr uint32_t MODENUMBER = 0;
+                PMACC_CONST_VECTOR(float_X, MODENUMBER + 1, LAGUERREMODES, 1.0);
+                // This is just an example for a more complicated set of Laguerre modes
+                // constexpr uint32_t MODENUMBER = 12;
+                // PMACC_CONST_VECTOR(float_X, MODENUMBER + 1, LAGUERREMODES, -1.0, 0.0300519, 0.319461, -0.23783,
+                // 0.0954839, 0.0318653, -0.144547, 0.0249208, -0.111989, 0.0434385, -0.030038, -0.00896321,
+                // -0.0160788);
+
+            } // namespace gaussianBeam
+
+            struct GaussianBeamParam
+            {
+                /** unit: meter */
+                static constexpr float_64 WAVE_LENGTH_SI = 0.8e-6;
+
+                /** Convert the normalized laser strength parameter a0 to Volt per meter */
+                static constexpr float_64 UNITCONV_A0_to_Amplitude_SI = -2.0 * PI / WAVE_LENGTH_SI
+                    * ::picongpu::SI::ELECTRON_MASS_SI * ::picongpu::SI::SPEED_OF_LIGHT_SI
+                    * ::picongpu::SI::SPEED_OF_LIGHT_SI / ::picongpu::SI::ELECTRON_CHARGE_SI;
+
+                /** unit: W / m^2 */
+                // calculate: _A0 = 8.549297e-6 * sqrt( Intensity[W/m^2] ) * wavelength[m] (linearly polarized)
+
+                /** unit: none */
+                static constexpr float_64 _A0 = 40.0;
+
+                /** unit: Volt / meter */
+                static constexpr float_64 AMPLITUDE_SI = _A0 * UNITCONV_A0_to_Amplitude_SI;
+
+                /** unit: Volt / meter */
+                // static constexpr float_64 AMPLITUDE_SI = 1.738e13;
+
+                /** Pulse length: sigma of std. gauss for intensity (E^2)
+                 *  PULSE_LENGTH_SI = FWHM_of_Intensity   / [ 2*sqrt{ 2* ln(2) } ]
+                 *                                          [    2.354820045     ]
+                 *  Info:             FWHM_of_Intensity = FWHM_Illumination
+                 *                      = what an experimentalist calls "pulse duration"
+                 *
+                 *  unit: seconds (1 sigma) */
+                static constexpr float_64 PULSE_LENGTH_SI = 8.0e-15;
+
+                /** beam waist: distance from the axis where the pulse intensity (E^2)
+                 *              decreases to its 1/e^2-th part,
+                 *              at the focus position of the laser
+                 * W0_SI = FWHM_of_Intensity / sqrt{ 2* ln(2) }
+                 *                             [   1.17741    ]
+                 *
+                 *  unit: meter */
+                static constexpr float_64 W0_SI = 1.5e-6;
+                /** the distance to the laser focus in y-direction
+                 *  unit: meter */
+                static constexpr float_64 FOCUS_POS_SI = 4.e-6;
+
+                /** The laser pulse will be initialized PULSE_INIT times of the PULSE_LENGTH
+                 *
+                 *  unit: none */
+                static constexpr float_64 PULSE_INIT = 6.0;
+
+                /** cell from top where the laser is initialized
+                 *
+                 * if `initPlaneY == 0` then the absorbers are disabled.
+                 * if `initPlaneY > absorbercells negative Y` the negative absorber in y
+                 * direction is enabled
+                 *
+                 * valid ranges:
+                 *   - initPlaneY == 0
+                 *   - absorber cells negative Y < initPlaneY < cells in y direction of the top gpu
+                 */
+                static constexpr uint32_t initPlaneY = 0;
+
+                /** laser phase shift (no shift: 0.0)
+                 *
+                 * sin(omega*time + laser_phase): starts with phase=0 at center --> E-field=0 at center
+                 *
+                 * unit: rad, periodic in 2*pi
+                 */
+                static constexpr float_X LASER_PHASE = 0.0;
+
+                using LAGUERREMODES_t = gaussianBeam::LAGUERREMODES_t;
+                static constexpr uint32_t MODENUMBER = gaussianBeam::MODENUMBER;
+
+                /** Available polarisation types
+                 */
+                enum PolarisationType
+                {
+                    LINEAR_X = 1u,
+                    LINEAR_Z = 2u,
+                    CIRCULAR = 4u,
+                };
+                /** Polarization selection
+                 */
+                static constexpr PolarisationType Polarisation = LINEAR_X;
+            };
+
+            //! currently selected laser profile
+            using Selected = GaussianBeam<GaussianBeamParam>;
+
+        } // namespace laserProfiles
+    } // namespace fields
 } // namespace picongpu
diff --git a/share/picongpu/examples/Bremsstrahlung/include/picongpu/param/particle.param b/share/picongpu/examples/Bremsstrahlung/include/picongpu/param/particle.param
index c8a3b2901b..ad34931a73 100644
--- a/share/picongpu/examples/Bremsstrahlung/include/picongpu/param/particle.param
+++ b/share/picongpu/examples/Bremsstrahlung/include/picongpu/param/particle.param
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Rene Widera, Richard Pausch
+/* Copyright 2013-2021 Rene Widera, Richard Pausch
  *
  * This file is part of PIConGPU.
  *
@@ -28,48 +28,41 @@
 
 namespace picongpu
 {
+    namespace particles
+    {
+        /** a particle with a weighting below MIN_WEIGHTING will not
+         *      be created / will be deleted
+         *  unit: none */
+        constexpr float_X MIN_WEIGHTING = 1.0;
 
-namespace particles
-{
+        namespace startPosition
+        {
+            struct RandomParameter100ppc
+            {
+                /** Count of particles per cell at initial state
+                 *  unit: none
+                 */
+                static constexpr uint32_t numParticlesPerCell = 100u;
+            };
+            using Random100ppc = RandomImpl<RandomParameter100ppc>;
 
-    /** a particle with a weighting below MIN_WEIGHTING will not
-     *      be created / will be deleted
-     *  unit: none */
-    constexpr float_X MIN_WEIGHTING = 1.0;
 
-namespace startPosition
-{
+            struct QuietParameter1ppc
+            {
+                /** Count of particles per cell per direction at initial state
+                 *  unit: none
+                 */
+                using numParticlesPerDimension = typename mCT::shrinkTo<mCT::Int<1, 1, 1>, simDim>::type;
+            };
+            using Quiet1ppc = QuietImpl<QuietParameter1ppc>;
 
-    struct RandomParameter100ppc
-    {
-        /** Count of particles per cell at initial state
-         *  unit: none
-         */
-        static constexpr uint32_t numParticlesPerCell = 100u;
-    };
-    using Random100ppc = RandomImpl< RandomParameter100ppc >;
+        } // namespace startPosition
 
-
-    struct QuietParameter1ppc
-    {
-        /** Count of particles per cell per direction at initial state
-         *  unit: none
+        /** During unit normalization, we assume this is a typical
+         *  number of particles per cell for normalization of weighted
+         *  particle attributes.
          */
-        using numParticlesPerDimension = typename mCT::shrinkTo<
-            mCT::Int< 1, 1, 1 >,
-            simDim
-        >::type;
-    };
-    using Quiet1ppc = QuietImpl< QuietParameter1ppc >;
-
-} // namespace startPosition
-
-    /** During unit normalization, we assume this is a typical
-     *  number of particles per cell for normalization of weighted
-     *  particle attributes.
-     */
-    constexpr uint32_t TYPICAL_PARTICLES_PER_CELL =
-        startPosition::RandomParameter100ppc::numParticlesPerCell;
+        constexpr uint32_t TYPICAL_PARTICLES_PER_CELL = startPosition::RandomParameter100ppc::numParticlesPerCell;
 
-} // namespace particles
+    } // namespace particles
 } // namespace picongpu
diff --git a/share/picongpu/examples/Bremsstrahlung/include/picongpu/param/speciesDefinition.param b/share/picongpu/examples/Bremsstrahlung/include/picongpu/param/speciesDefinition.param
index c2bd2922f2..121b78fe9c 100644
--- a/share/picongpu/examples/Bremsstrahlung/include/picongpu/param/speciesDefinition.param
+++ b/share/picongpu/examples/Bremsstrahlung/include/picongpu/param/speciesDefinition.param
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Rene Widera, Benjamin Worpitz, Heiko Burau
+/* Copyright 2013-2021 Rene Widera, Benjamin Worpitz, Heiko Burau
  *
  * This file is part of PIConGPU.
  *
@@ -31,106 +31,87 @@
 
 namespace picongpu
 {
-
-/*########################### define particle attributes #####################*/
-
-/** describe attributes of a particle*/
-using DefaultParticleAttributes = MakeSeq_t<
-    position< position_pic >,
-    momentum,
-    weighting,
-    particleId,
-    momentumPrev1
-#if( RAD_MARK_PARTICLE > 1 ) || ( RAD_ACTIVATE_GAMMA_FILTER != 0 )
-    , radiationFlag
+    /*########################### define particle attributes #####################*/
+
+    /** Describes the attributes of a particle. */
+    using DefaultParticleAttributes = MakeSeq_t<
+        position<position_pic>,
+        momentum,
+        weighting,
+        particleId,
+        momentumPrev1
+#if(RAD_MARK_PARTICLE > 1) || (RAD_ACTIVATE_GAMMA_FILTER != 0)
+        ,
+        radiationFlag
 #endif
->;
-
-/*########################### end particle attributes ########################*/
-
-/*########################### define species #################################*/
-
-/*--------------------------- photons -------------------------------------------*/
-
-value_identifier( float_X, MassRatioPhotons, 0.0 );
-value_identifier( float_X, ChargeRatioPhotons, 0.0 );
-
-using ParticleFlagsPhotons = MakeSeq_t<
-    particlePusher< particles::pusher::Photon >,
-    shape< UsedParticleShape >,
-    interpolation< UsedField2Particle >,
-    massRatio< MassRatioPhotons >,
-    chargeRatio< ChargeRatioPhotons >
->;
-
-/* define species photons */
-using PIC_Photons = Particles<
-    PMACC_CSTRING( "ph" ),
-    ParticleFlagsPhotons,
-    DefaultParticleAttributes
->;
-
-
-/*--------------------------- ions -------------------------------------------*/
-
-/* ratio relative to BASE_CHARGE and BASE_MASS */
-value_identifier(float_X, MassRatioIons, 359100);
-value_identifier(float_X, ChargeRatioIons, -79.0);
-value_identifier(float_X, DensityRatioIons, 1.0);
-
-using ParticleFlagsIons = MakeSeq_t<
-    particlePusher< UsedParticlePusher >,
-    shape< UsedParticleShape >,
-    interpolation< UsedField2Particle >,
-    current< UsedParticleCurrentSolver >,
-    massRatio< MassRatioIons >,
-    chargeRatio< ChargeRatioIons >,
-    densityRatio< DensityRatioIons >,
-    atomicNumbers< ionization::atomicNumbers::Gold_t >
->;
-
-/* define species ions */
-using PIC_Ions = Particles<
-    PMACC_CSTRING( "i" ),
-    ParticleFlagsIons,
-    DefaultParticleAttributes
->;
-
-
-/*--------------------------- electrons --------------------------------------*/
-
-/* ratio relative to BASE_CHARGE and BASE_MASS */
-value_identifier( float_X, MassRatioElectrons, 1.0 );
-value_identifier( float_X, ChargeRatioElectrons, 1.0 );
-value_identifier( float_X, DensityRatioElectrons, 79.0 );
-
-using ParticleFlagsElectrons = MakeSeq_t<
-    particlePusher< UsedParticlePusher >,
-    shape< UsedParticleShape >,
-    interpolation< UsedField2Particle >,
-    current< UsedParticleCurrentSolver >,
-    massRatio< MassRatioElectrons >,
-    chargeRatio< ChargeRatioElectrons >,
-    densityRatio< DensityRatioElectrons >,
-    bremsstrahlungIons< PIC_Ions >,
-    bremsstrahlungPhotons< PIC_Photons >
->;
-
-/* define species electrons */
-using PIC_Electrons = Particles<
-    PMACC_CSTRING( "e" ),
-    ParticleFlagsElectrons,
-    DefaultParticleAttributes
->;
-
-
-/*########################### end species ####################################*/
-
-
-using VectorAllSpecies = MakeSeq_t<
-    PIC_Electrons,
-    PIC_Ions,
-    PIC_Photons
->;
-
-} //namespace picongpu
+        >;
+
+    /*########################### end particle attributes ########################*/
+
+    /*########################### define species #################################*/
+
+    /*--------------------------- photons -------------------------------------------*/
+
+    value_identifier(float_X, MassRatioPhotons, 0.0);
+    value_identifier(float_X, ChargeRatioPhotons, 0.0);
+
+    using ParticleFlagsPhotons = MakeSeq_t<
+        particlePusher<particles::pusher::Photon>,
+        shape<UsedParticleShape>,
+        interpolation<UsedField2Particle>,
+        massRatio<MassRatioPhotons>,
+        chargeRatio<ChargeRatioPhotons>>;
+
+    /* define species photons */
+    using PIC_Photons = Particles<PMACC_CSTRING("ph"), ParticleFlagsPhotons, DefaultParticleAttributes>;
+
+
+    /*--------------------------- ions -------------------------------------------*/
+
+    /* ratio relative to BASE_CHARGE and BASE_MASS */
+    value_identifier(float_X, MassRatioIons, 359100);
+    value_identifier(float_X, ChargeRatioIons, -79.0);
+    value_identifier(float_X, DensityRatioIons, 1.0);
+
+    using ParticleFlagsIons = MakeSeq_t<
+        particlePusher<UsedParticlePusher>,
+        shape<UsedParticleShape>,
+        interpolation<UsedField2Particle>,
+        current<UsedParticleCurrentSolver>,
+        massRatio<MassRatioIons>,
+        chargeRatio<ChargeRatioIons>,
+        densityRatio<DensityRatioIons>,
+        atomicNumbers<ionization::atomicNumbers::Gold_t>>;
+
+    /* define species ions */
+    using PIC_Ions = Particles<PMACC_CSTRING("i"), ParticleFlagsIons, DefaultParticleAttributes>;
+
+
+    /*--------------------------- electrons --------------------------------------*/
+
+    /* ratio relative to BASE_CHARGE and BASE_MASS */
+    value_identifier(float_X, MassRatioElectrons, 1.0);
+    value_identifier(float_X, ChargeRatioElectrons, 1.0);
+    value_identifier(float_X, DensityRatioElectrons, 79.0);
+
+    using ParticleFlagsElectrons = MakeSeq_t<
+        particlePusher<UsedParticlePusher>,
+        shape<UsedParticleShape>,
+        interpolation<UsedField2Particle>,
+        current<UsedParticleCurrentSolver>,
+        massRatio<MassRatioElectrons>,
+        chargeRatio<ChargeRatioElectrons>,
+        densityRatio<DensityRatioElectrons>,
+        bremsstrahlungIons<PIC_Ions>,
+        bremsstrahlungPhotons<PIC_Photons>>;
+
+    /* define species electrons */
+    using PIC_Electrons = Particles<PMACC_CSTRING("e"), ParticleFlagsElectrons, DefaultParticleAttributes>;
+
+
+    /*########################### end species ####################################*/
+
+
+    using VectorAllSpecies = MakeSeq_t<PIC_Electrons, PIC_Ions, PIC_Photons>;
+
+} // namespace picongpu
diff --git a/share/picongpu/examples/Bremsstrahlung/include/picongpu/param/speciesInitialization.param b/share/picongpu/examples/Bremsstrahlung/include/picongpu/param/speciesInitialization.param
index 28664393fc..dc2b167e24 100644
--- a/share/picongpu/examples/Bremsstrahlung/include/picongpu/param/speciesInitialization.param
+++ b/share/picongpu/examples/Bremsstrahlung/include/picongpu/param/speciesInitialization.param
@@ -1,4 +1,4 @@
-/* Copyright 2015-2020 Rene Widera, Axel Huebl
+/* Copyright 2015-2021 Rene Widera, Axel Huebl
  *
  * This file is part of PIConGPU.
  *
@@ -33,24 +33,15 @@
 
 namespace picongpu
 {
-namespace particles
-{
-    /** InitPipeline define in which order species are initialized
-     *
-     * the functors are called in order (from first to last functor)
-     */
-    using InitPipeline = bmpl::vector<
-        CreateDensity<
-            densityProfiles::Foil,
-            startPosition::Quiet1ppc,
-            PIC_Ions
-        >,
-        CreateDensity<
-            densityProfiles::Foil,
-            startPosition::Random100ppc,
-            PIC_Electrons
-        >
-    >;
+    namespace particles
+    {
+        /** InitPipeline defines the order in which species are initialized
+         *
+         * the functors are called in order (from first to last functor)
+         */
+        using InitPipeline = bmpl::vector<
+            CreateDensity<densityProfiles::Foil, startPosition::Quiet1ppc, PIC_Ions>,
+            CreateDensity<densityProfiles::Foil, startPosition::Random100ppc, PIC_Electrons>>;
 
-} // namespace particles
+    } // namespace particles
 } // namespace picongpu
diff --git a/share/picongpu/examples/Bunch/cmakeFlags b/share/picongpu/examples/Bunch/cmakeFlags
index 31fc52ed6b..e57cd552b9 100755
--- a/share/picongpu/examples/Bunch/cmakeFlags
+++ b/share/picongpu/examples/Bunch/cmakeFlags
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 #
-# Copyright 2013-2020 Axel Huebl, Rene Widera, Richard Pausch
+# Copyright 2013-2021 Axel Huebl, Rene Widera, Richard Pausch
 #
 # This file is part of PIConGPU.
 #
@@ -30,15 +30,17 @@
 #   - increase by 1, no gaps
 
 flags[0]=""
-flags[1]="-DPARAM_OVERWRITES:LIST='-DPARAM_INCLUDE_FIELDBACKGROUND=true'"
-flags[2]="-DPARAM_OVERWRITES:LIST='-DPARAM_INCLUDE_FIELDBACKGROUND=true;-DPARAM_DIMENSION=DIM2'"
-flags[3]="-DPARAM_OVERWRITES:LIST='-DPARAM_SINGLE_PARTICLE=true;-DPARAM_RADFORMFACTOR=radFormFactor_coherent'"
-flags[4]="-DPARAM_OVERWRITES:LIST='-DENABLE_SYNCHROTRON_PHOTONS=1;-DPARAM_RADFORMFACTOR=radFormFactor_Gauss_cell'"
-flags[5]="-DPARAM_OVERWRITES:LIST='-DPARAM_FILTER_GAMMA=1;-DPARAM_RADFORMFACTOR=radFormFactor_incoherent'"
-flags[6]="-DPARAM_OVERWRITES:LIST='-DPARAM_RADWINDOW=radWindowFunctionTriangle;-DPARAM_RADFORMFACTOR=radFormFactor_CIC_3D'"
-flags[7]="-DPARAM_OVERWRITES:LIST='-DPARAM_RADWINDOW=radWindowFunctionHamming;-DPARAM_RADFORMFACTOR=radFormFactor_TSC_3D'"
-flags[8]="-DPARAM_OVERWRITES:LIST='-DPARAM_RADWINDOW=radWindowFunctionTriplett;-DPARAM_RADFORMFACTOR=radFormFactor_PCS_3D'"
-flags[9]="-DPARAM_OVERWRITES:LIST='-DPARAM_RADWINDOW=radWindowFunctionGauss;-DPARAM_RADFORMFACTOR=radFormFactor_CIC_1Dy'"
+flags[1]="-DPARAM_OVERWRITES:LIST='-DPARAM_INCLUDE_FIELDBACKGROUND=true;-DPARAM_TWTSFAST=1'"
+flags[2]="-DPARAM_OVERWRITES:LIST='-DPARAM_INCLUDE_FIELDBACKGROUND=true;-DPARAM_TWTSFAST=1;-DPARAM_DIMENSION=DIM2'"
+flags[3]="-DPARAM_OVERWRITES:LIST='-DPARAM_INCLUDE_FIELDBACKGROUND=true'"
+flags[4]="-DPARAM_OVERWRITES:LIST='-DPARAM_INCLUDE_FIELDBACKGROUND=true;-DPARAM_DIMENSION=DIM2'"
+flags[5]="-DPARAM_OVERWRITES:LIST='-DPARAM_SINGLE_PARTICLE=true;-DPARAM_RADFORMFACTOR=radFormFactor_coherent'"
+flags[6]="-DPARAM_OVERWRITES:LIST='-DENABLE_SYNCHROTRON_PHOTONS=1;-DPARAM_RADFORMFACTOR=radFormFactor_Gauss_cell'"
+flags[7]="-DPARAM_OVERWRITES:LIST='-DPARAM_FILTER_GAMMA=1;-DPARAM_RADFORMFACTOR=radFormFactor_incoherent'"
+flags[8]="-DPARAM_OVERWRITES:LIST='-DPARAM_RADWINDOW=radWindowFunctionTriangle;-DPARAM_RADFORMFACTOR=radFormFactor_CIC_3D'"
+flags[9]="-DPARAM_OVERWRITES:LIST='-DPARAM_RADWINDOW=radWindowFunctionHamming;-DPARAM_RADFORMFACTOR=radFormFactor_TSC_3D'"
+flags[10]="-DPARAM_OVERWRITES:LIST='-DPARAM_RADWINDOW=radWindowFunctionTriplett;-DPARAM_RADFORMFACTOR=radFormFactor_PCS_3D'"
+flags[11]="-DPARAM_OVERWRITES:LIST='-DPARAM_RADWINDOW=radWindowFunctionGauss;-DPARAM_RADFORMFACTOR=radFormFactor_CIC_1Dy'"
 
 
 ################################################################################
diff --git a/share/picongpu/examples/Bunch/etc/picongpu/32.cfg b/share/picongpu/examples/Bunch/etc/picongpu/32.cfg
index 26f8031b0d..432e7b6507 100644
--- a/share/picongpu/examples/Bunch/etc/picongpu/32.cfg
+++ b/share/picongpu/examples/Bunch/etc/picongpu/32.cfg
@@ -1,4 +1,4 @@
-# Copyright 2013-2020 Richard Pausch, Felix Schmitt, Axel Huebl
+# Copyright 2013-2021 Richard Pausch, Felix Schmitt, Axel Huebl
 #
 # This file is part of PIConGPU.
 #
diff --git a/share/picongpu/examples/Bunch/include/picongpu/param/components.param b/share/picongpu/examples/Bunch/include/picongpu/param/components.param
index f983693a0f..276f6a8998 100644
--- a/share/picongpu/examples/Bunch/include/picongpu/param/components.param
+++ b/share/picongpu/examples/Bunch/include/picongpu/param/components.param
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Anton Helm, Richard Pausch
+/* Copyright 2013-2021 Axel Huebl, Anton Helm, Richard Pausch
  *
  * This file is part of PIConGPU.
  *
@@ -29,9 +29,9 @@
 
 namespace picongpu
 {
-/*! Simulation Starter ---------------------------------------------------
- *  - defaultPIConGPU         : default PIConGPU configuration
- */
-namespace simulation_starter = defaultPIConGPU;
+    /*! Simulation Starter ---------------------------------------------------
+     *  - defaultPIConGPU         : default PIConGPU configuration
+     */
+    namespace simulation_starter = defaultPIConGPU;
 
-}
+} // namespace picongpu
diff --git a/share/picongpu/examples/Bunch/include/picongpu/param/density.param b/share/picongpu/examples/Bunch/include/picongpu/param/density.param
index 4c991b0438..266276a085 100644
--- a/share/picongpu/examples/Bunch/include/picongpu/param/density.param
+++ b/share/picongpu/examples/Bunch/include/picongpu/param/density.param
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Heiko Burau, Rene Widera, Felix Schmitt,
+/* Copyright 2013-2021 Axel Huebl, Heiko Burau, Rene Widera, Felix Schmitt,
  *                     Richard Pausch
  *
  * This file is part of PIConGPU.
@@ -27,108 +27,95 @@
 
 namespace picongpu
 {
-namespace SI
-{
-    /** Base density in particles per m^3 in the density profiles.
-     *
-     * This is often taken as reference maximum density in normalized profiles.
-     * Individual particle species can define a `densityRatio` flag relative
-     * to this value.
-     *
-     * unit: ELEMENTS/m^3
-     *
-     */
+    namespace SI
+    {
+        /** Base density in particles per m^3 in the density profiles.
+         *
+         * This is often taken as reference maximum density in normalized profiles.
+         * Individual particle species can define a `densityRatio` flag relative
+         * to this value.
+         *
+         * unit: ELEMENTS/m^3
+         *
+         */
 #ifdef PARAM_SINGLE_PARTICLE
-    /* one particle per cell with weighting 1.0 */
-    constexpr float_64 BASE_DENSITY_SI =
-        1.0 /
-        ( CELL_WIDTH_SI * CELL_HEIGHT_SI * CELL_DEPTH_SI );
+        /* one particle per cell with weighting 1.0 */
+        constexpr float_64 BASE_DENSITY_SI = 1.0 / (CELL_WIDTH_SI * CELL_HEIGHT_SI * CELL_DEPTH_SI);
 #else
-    constexpr float_64 BASE_DENSITY_SI = 1.e25;
+        constexpr float_64 BASE_DENSITY_SI = 1.e25;
 #endif
 
-}
-
-namespace densityProfiles
-{
-
-    PMACC_STRUCT(GaussianCloudParam,
-        /** Profile Formula:
-         *     exponent = |globalCellPos - center| / sigma
-         *     density = e^[ gasFactor * exponent^gasPower ]
-         */
-        (PMACC_C_VALUE(float_X, gasFactor, -0.5))
-        (PMACC_C_VALUE(float_X, gasPower, 2.0))
-
-        /** height of vacuum area on top border
-         *
-         * this vacuum is important because of the laser initialization,
-         * which is done in the first cell of the simulation
-         * unit: cells
-         */
-        (PMACC_C_VALUE(uint32_t, vacuumCellsY, 50))
-
-        /** The central position of the density distribution
-         *  unit: meter
-         */
-        (PMACC_C_VECTOR_DIM(float_64, simDim, center_SI, 1.024e-5, 9.072e-5, 1.024e-5))
-
-        /** the distance from gasCenter_SI until the density decreases to its 1/e-th part
-          *  unit: meter */
-        (PMACC_C_VECTOR_DIM(float_64, simDim, sigma_SI, 6.0e-6, 6.0e-6, 6.0e-6))
-    ); /* struct GaussianCloudParam */
-
-    /* definition of cloud profile */
-    using GaussianCloud = GaussianCloudImpl< GaussianCloudParam >;
+    } // namespace SI
 
-
-    struct FreeFormulaFunctor
+    namespace densityProfiles
     {
-        /** This formula uses SI quantities only
-         *
-         * The profile will be multiplied by BASE_DENSITY.
-         *
-         * @param position_SI total offset including all slides [in meter]
-         * @param cellSize_SI cell sizes [in meter]
-         *
-         * @return float_X density [normalized to 1.0]
-         */
-        HDINLINE float_X operator()(
-            const floatD_64& position_SI,
-            const float3_64& cellSize_SI
-        )
+        PMACC_STRUCT(
+            GaussianCloudParam,
+            /** Profile Formula:
+             *     exponent = |globalCellPos - center| / sigma
+             *     density = e^[ gasFactor * exponent^gasPower ]
+             */
+            (PMACC_C_VALUE(float_X, gasFactor, -0.5))(PMACC_C_VALUE(float_X, gasPower, 2.0))
+
+            /** height of vacuum area on top border
+             *
+             * this vacuum is important because of the laser initialization,
+             * which is done in the first cell of the simulation
+             * unit: cells
+             */
+            (PMACC_C_VALUE(uint32_t, vacuumCellsY, 50))
+
+            /** The central position of the density distribution
+             *  unit: meter
+             */
+            (PMACC_C_VECTOR_DIM(float_64, simDim, center_SI, 1.024e-5, 9.072e-5, 1.024e-5))
+
+            /** the distance from center_SI until the density decreases to its 1/e-th part
+             *  unit: meter */
+            (PMACC_C_VECTOR_DIM(float_64, simDim, sigma_SI, 6.0e-6, 6.0e-6, 6.0e-6))); /* struct GaussianCloudParam */
+
+        /* definition of cloud profile */
+        using GaussianCloud = GaussianCloudImpl<GaussianCloudParam>;
+
+
+        struct FreeFormulaFunctor
         {
-            /* add particle in cell at center of Gaussian Cloud profile */
-            const float3_64 position_start_SI( 1.024e-5, 9.072e-5, 1.024e-5 );
-
-            /* from all cells ... */
-            const pmacc::math::UInt64< simDim > cell_id( position_SI / cellSize_SI.shrink< simDim >() );
-
-            /* ... we calculate the corresponding "center" cell to init the particle in ... */
-            const pmacc::math::UInt64< simDim > cell_start(
-                precisionCast< uint64_t >(
-                    math::floor(
-                        position_start_SI.shrink< simDim >() /
-                        cellSize_SI.shrink< simDim >()
-                    )
-                )
-            );
-
-            /* ... and only in that center cell the density is 1.0, outside zero */
-            bool isStartCell = true;
-            for( uint64_t d = 0; d < simDim; ++d )
-                if( cell_id[d] != cell_start[d] )
-                    isStartCell = false;
-
-            if( isStartCell )
-                return 1.0;
-
-            return 0.0;
-        }
-    };
-
-    /* definition of free formula profile */
-    using FreeFormula = FreeFormulaImpl< FreeFormulaFunctor >;
-
-}
-}
+            /** This formula uses SI quantities only
+             *
+             * The profile will be multiplied by BASE_DENSITY.
+             *
+             * @param position_SI total offset including all slides [in meter]
+             * @param cellSize_SI cell sizes [in meter]
+             *
+             * @return float_X density [normalized to 1.0]
+             */
+            HDINLINE float_X operator()(const floatD_64& position_SI, const float3_64& cellSize_SI)
+            {
+                /* add particle in cell at center of Gaussian Cloud profile */
+                const float3_64 position_start_SI(1.024e-5, 9.072e-5, 1.024e-5);
+
+                /* from all cells ... */
+                const pmacc::math::UInt64<simDim> cell_id(position_SI / cellSize_SI.shrink<simDim>());
+
+                /* ... we calculate the corresponding "center" cell to init the particle in ... */
+                const pmacc::math::UInt64<simDim> cell_start(precisionCast<uint64_t>(
+                    math::floor(position_start_SI.shrink<simDim>() / cellSize_SI.shrink<simDim>())));
+
+                /* ... and only in that center cell the density is 1.0, outside zero */
+                bool isStartCell = true;
+                for(uint64_t d = 0; d < simDim; ++d)
+                    if(cell_id[d] != cell_start[d])
+                        isStartCell = false;
+
+                if(isStartCell)
+                    return 1.0;
+
+                return 0.0;
+            }
+        };
+
+        /* definition of free formula profile */
+        using FreeFormula = FreeFormulaImpl<FreeFormulaFunctor>;
+
+    } // namespace densityProfiles
+} // namespace picongpu
diff --git a/share/picongpu/examples/Bunch/include/picongpu/param/fieldBackground.param b/share/picongpu/examples/Bunch/include/picongpu/param/fieldBackground.param
index 9dcba0c80e..2bae837d4b 100644
--- a/share/picongpu/examples/Bunch/include/picongpu/param/fieldBackground.param
+++ b/share/picongpu/examples/Bunch/include/picongpu/param/fieldBackground.param
@@ -1,4 +1,4 @@
-/* Copyright 2014-2020 Axel Huebl, Alexander Debus, Richard Pausch
+/* Copyright 2014-2021 Axel Huebl, Alexander Debus, Richard Pausch
  *
  * This file is part of PIConGPU.
  *
@@ -20,200 +20,204 @@
 #pragma once
 
 /** Load pre-defined templates */
-#include "picongpu/fields/background/templates/TWTS/TWTS.hpp"
+#if PARAM_TWTSFAST == 1
+#    include "picongpu/fields/background/templates/twtsfast/twtsfast.hpp"
+#else
+#    include "picongpu/fields/background/templates/TWTS/TWTS.hpp"
+#endif
 
 #ifndef PARAM_INCLUDE_FIELDBACKGROUND
-#define PARAM_INCLUDE_FIELDBACKGROUND false
+#    define PARAM_INCLUDE_FIELDBACKGROUND false
 #endif
 
-/** Load external background fields
- *
- */
+/* Load external background fields */
 namespace picongpu
 {
     class FieldBackgroundE
     {
     public:
-
-        /* Add this additional field for pushing particles */
+        /** Add this additional field for pushing particles */
         static constexpr bool InfluenceParticlePusher = PARAM_INCLUDE_FIELDBACKGROUND;
 
-        /* We use this to calculate your SI input back to our unit system */
+        /** We use this to calculate your SI input back to our unit system */
         PMACC_ALIGN(m_unitField, const float3_64);
 
-        /* TWTS E-fields need to be initialized on host,
-         * so they can look up global grid dimensions.
+        /** TWTS E-fields need to be initialized on host,
+         *  so they can look up global grid dimensions.
          *
-         * Note: No PMACC_ALIGN(...) used, since this *additional* memory alignment would require
-         *       roughly float_64 the number of registers in the corresponding kernel on the device.
+         *  Note: No PMACC_ALIGN(...) used, since this *additional* memory alignment would require
+         *        roughly double the number of registers in the corresponding kernel on the device.
          */
+#if PARAM_TWTSFAST == 1
+        const templates::twtsfast::EField twtsFieldE;
+#else
         const templates::twts::EField twtsFieldE;
+#endif
 
-        /* Constructor is host-only, because of subGrid and halfSimSize initialization */
-        HINLINE FieldBackgroundE( const float3_64 unitField ) :
-            m_unitField(unitField),
-            twtsFieldE(
-                /* focus_y [m], the distance to the laser focus in y-direction */
-                30.0e-6,
-                /* wavelength [m] */
-                0.8e-6,
-                /* pulselength [s], sigma of std. gauss for intensity (E^2) */
-                10.0e-15 / 2.3548200450309493820231386529194,
-                /* w_x [m], cylindrically focused spot size */
-                5.0e-6,
-                /* w_y [m] */
-                0.01,
-                /* interaction angle between TWTS laser propagation vector and the y-axis [rad] */
-                60. * (PI/180.),
-                /* propagation speed of overlap [speed of light]. */
-                1.0,
-                /* manual time delay [s] if auto_tdelay is false */
-                39.3e-6 / SI::SPEED_OF_LIGHT_SI,
-                /* Should PIConGPU automatically choose a suitable time delay? [true/false] */
-                false )
-        {}
+        /** Constructor is host-only, because of subGrid and halfSimSize initialization */
+        HINLINE FieldBackgroundE(const float3_64 unitField)
+            : m_unitField(unitField)
+            , twtsFieldE(
+                  /* focus_y [m], the distance to the laser focus in y-direction */
+                  30.0e-6,
+                  /* wavelength [m] */
+                  0.8e-6,
+                  /* pulselength [s], sigma of std. gauss for intensity (E^2) */
+                  10.0e-15 / 2.3548200450309493820231386529194,
+#if PARAM_TWTSFAST == 0
+                  /* w_x [m], cylindrically focused spot size */
+                  5.0e-6,
+#endif
+                  /* w_y [m] */
+                  0.01,
+                  /* interaction angle between TWTS laser propagation vector and the y-axis [rad] */
+                  60. * (PI / 180.),
+                  /* propagation speed of overlap [speed of light]. */
+                  1.0,
+                  /* manual time delay [s] if auto_tdelay is false */
+                  39.3e-6 / SI::SPEED_OF_LIGHT_SI,
+                  /* Should PIConGPU automatically choose a suitable time delay? [true/false] */
+                  false)
+        {
+        }
 
         /** Specify your background field E(r,t) here
          *
-         * \param cellIdx The total cell id counted from the start at t=0
-         * \param currentStep The current time step */
-        HDINLINE float3_X
-        operator()( const DataSpace<simDim>& cellIdx,
-                    const uint32_t currentStep ) const
+         * @param cellIdx The total cell id counted from the start at t=0
+         * @param currentStep The current time step */
+        HDINLINE float3_X operator()(const DataSpace<simDim>& cellIdx, const uint32_t currentStep) const
         {
             /* unit: meter */
             constexpr float_64 WAVE_LENGTH_SI = 0.8e-6;
 
-            /** UNITCONV */
-            constexpr float_64 UNITCONV_A0_to_Amplitude_SI = -2.0 * PI / WAVE_LENGTH_SI
-                * SI::ELECTRON_MASS_SI * SI::SPEED_OF_LIGHT_SI
-                * SI::SPEED_OF_LIGHT_SI / SI::ELECTRON_CHARGE_SI;
+            /* UNITCONV */
+            constexpr float_64 UNITCONV_A0_to_Amplitude_SI = -2.0 * PI / WAVE_LENGTH_SI * SI::ELECTRON_MASS_SI
+                * SI::SPEED_OF_LIGHT_SI * SI::SPEED_OF_LIGHT_SI / SI::ELECTRON_CHARGE_SI;
 
-            /** unit: W / m^2 */
+            /* unit: W / m^2 */
             // calculate: _A0 = 8.549297e-6 * sqrt( Intensity[W/m^2] ) * wavelength[m] (linearly polarized)
 
             /* unit: none */
-            constexpr float_64 _A0  = 1.0;
+            constexpr float_64 _A0 = 1.0;
 
             /* unit: Volt /meter
              *\todo #738 implement math::vector, native type operations
              */
-            const float3_64 invUnitField = float3_64( 1.0 / m_unitField[0],
-                                                      1.0 / m_unitField[1],
-                                                      1.0 / m_unitField[2] );
+            const float3_64 invUnitField = float3_64(1.0 / m_unitField[0], 1.0 / m_unitField[1], 1.0 / m_unitField[2]);
 
             /* laser amplitude in picongpu units [ unit: (Volt /meter) / unitField-factor ]
              * Note: the laser amplitude is included in all field components
              * polarization and other properties are established by the peak amplitude
              * normalized twtsFieldE(...)
              */
-            const float3_X amplitude = precisionCast<float_X>(
-                                float_64(_A0 * UNITCONV_A0_to_Amplitude_SI) * invUnitField );
+            const float3_X amplitude
+                = precisionCast<float_X>(float_64(_A0 * UNITCONV_A0_to_Amplitude_SI) * invUnitField);
 
             /* Note: twtsFieldE(...) is normalized, such that peak amplitude equals unity. */
-            return amplitude * twtsFieldE( cellIdx, currentStep );
+            return amplitude * twtsFieldE(cellIdx, currentStep);
         }
     };
 
     class FieldBackgroundB
     {
     public:
-        /* Add this additional field for pushing particles */
+        /** Add this additional field for pushing particles */
         static constexpr bool InfluenceParticlePusher = PARAM_INCLUDE_FIELDBACKGROUND;
 
-        /* TWTS B-fields need to be initialized on host,
-         * so they can look up global grid dimensions.
+        /** TWTS B-fields need to be initialized on host,
+         *  so they can look up global grid dimensions.
          *
-         * Note: No PMACC_ALIGN(...) used, since this *additional* memory alignment would require
-         *       roughly float_64 the number of registers in the corresponding kernel on the device.
+         *  Note: No PMACC_ALIGN(...) used, since this *additional* memory alignment would require
+         *        roughly double the number of registers in the corresponding kernel on the device.
          */
+#if PARAM_TWTSFAST == 1
+        const templates::twtsfast::BField twtsFieldB;
+#else
         const templates::twts::BField twtsFieldB;
+#endif
 
-        /* We use this to calculate your SI input back to our unit system */
+        /** We use this to calculate your SI input back to our unit system */
         PMACC_ALIGN(m_unitField, const float3_64);
 
-        HINLINE FieldBackgroundB( const float3_64 unitField ) :
-            m_unitField(unitField),
-            twtsFieldB(
-                /* focus_y [m], the distance to the laser focus in y-direction */
-                30.0e-6,
-                /* wavelength [m] */
-                0.8e-6,
-                /* pulselength [s], sigma of std. gauss for intensity (E^2) */
-                10.0e-15 / 2.3548200450309493820231386529194,
-                /* w_x [m], cylindrically focused spot size */
-                5.0e-6,
-                /* w_y [m] */
-                0.01,
-                /* interaction angle between TWTS laser propagation vector and the y-axis [rad] */
-                60. * (PI / 180.),
-                /* propagation speed of overlap [speed of light]. */
-                1.0,
-                /* manual time delay [s] if auto_tdelay is false */
-                39.3e-6 / SI::SPEED_OF_LIGHT_SI,
-                /* Should PIConGPU automatically choose a suitable time delay? [true / false] */
-                false )
-        {}
+        HINLINE FieldBackgroundB(const float3_64 unitField)
+            : m_unitField(unitField)
+            , twtsFieldB(
+                  /* focus_y [m], the distance to the laser focus in y-direction */
+                  30.0e-6,
+                  /* wavelength [m] */
+                  0.8e-6,
+                  /* pulselength [s], sigma of std. gauss for intensity (E^2) */
+                  10.0e-15 / 2.3548200450309493820231386529194,
+#if PARAM_TWTSFAST == 0
+                  /* w_x [m], cylindrically focused spot size */
+                  5.0e-6,
+#endif
+                  /* w_y [m] */
+                  0.01,
+                  /* interaction angle between TWTS laser propagation vector and the y-axis [rad] */
+                  60. * (PI / 180.),
+                  /* propagation speed of overlap [speed of light]. */
+                  1.0,
+                  /* manual time delay [s] if auto_tdelay is false */
+                  39.3e-6 / SI::SPEED_OF_LIGHT_SI,
+                  /* Should PIConGPU automatically choose a suitable time delay? [true / false] */
+                  false)
+        {
+        }
 
         /** Specify your background field B(r,t) here
          *
-         * \param cellIdx The total cell id counted from the start at t=0
-         * \param currentStep The current time step */
-        HDINLINE float3_X
-        operator()( const DataSpace<simDim>& cellIdx,
-                    const uint32_t currentStep ) const
+         * @param cellIdx The total cell id counted from the start at t=0
+         * @param currentStep The current time step */
+        HDINLINE float3_X operator()(const DataSpace<simDim>& cellIdx, const uint32_t currentStep) const
         {
             /* unit: meter */
             constexpr float_64 WAVE_LENGTH_SI = 0.8e-6;
 
-            /** UNITCONV */
-            constexpr float_64 UNITCONV_A0_to_Amplitude_SI = -2.0 * PI / WAVE_LENGTH_SI
-                * SI::ELECTRON_MASS_SI * SI::SPEED_OF_LIGHT_SI
-                * SI::SPEED_OF_LIGHT_SI / SI::ELECTRON_CHARGE_SI;
+            /* UNITCONV */
+            constexpr float_64 UNITCONV_A0_to_Amplitude_SI = -2.0 * PI / WAVE_LENGTH_SI * SI::ELECTRON_MASS_SI
+                * SI::SPEED_OF_LIGHT_SI * SI::SPEED_OF_LIGHT_SI / SI::ELECTRON_CHARGE_SI;
 
-            /** unit: W / m^2 */
+            /* unit: W / m^2 */
             // calculate: _A0 = 8.549297e-6 * sqrt( Intensity[W/m^2] ) * wavelength[m] (linearly polarized)
 
             /** unit: none */
-            constexpr float_64 _A0  = 1.0;
+            constexpr float_64 _A0 = 1.0;
 
             /** unit: Volt /meter */
-            const float3_64 invUnitField = float3_64( 1.0 / m_unitField[0],
-                                                      1.0 / m_unitField[1],
-                                                      1.0 / m_unitField[2] );
+            const float3_64 invUnitField = float3_64(1.0 / m_unitField[0], 1.0 / m_unitField[1], 1.0 / m_unitField[2]);
 
             /* laser amplitude in picongpu units [ unit: (Volt/meter) / unitField-factor ]
              * Note: the laser amplitude is included in all field components
              * polarization and other properties are established by the peak amplitude
              * normalized twtsFieldB(...)
              */
-            const float3_X amplitude = precisionCast<float_X>(
-                    float_64(_A0 * UNITCONV_A0_to_Amplitude_SI) * invUnitField );
+            const float3_X amplitude
+                = precisionCast<float_X>(float_64(_A0 * UNITCONV_A0_to_Amplitude_SI) * invUnitField);
 
             /* Note: twtsFieldB(...) is normalized, such that peak amplitude equals unity. */
-            return amplitude * twtsFieldB( cellIdx, currentStep );
+            return amplitude * twtsFieldB(cellIdx, currentStep);
         }
     };
 
     class FieldBackgroundJ
     {
     public:
-        /* Add this additional field? */
+        /** Add this additional field? */
         static constexpr bool activated = false;
 
-        /* We use this to calculate your SI input back to our unit system */
+        /** We use this to calculate your SI input back to our unit system */
         PMACC_ALIGN(m_unitField, const float3_64);
 
-        HDINLINE FieldBackgroundJ( const float3_64 unitField ) : m_unitField(unitField)
-        {}
+        HDINLINE FieldBackgroundJ(const float3_64 unitField) : m_unitField(unitField)
+        {
+        }
 
         /** Specify your background field J(r,t) here
          *
-         * \param cellIdx The total cell id counted from the start at t=0
-         * \param currentStep The current time step */
-        HDINLINE float3_X
-        operator()( const DataSpace<simDim>& cellIdx,
-                    const uint32_t currentStep ) const
+         * @param cellIdx The total cell id counted from the start at t=0
+         * @param currentStep The current time step */
+        HDINLINE float3_X operator()(const DataSpace<simDim>& cellIdx, const uint32_t currentStep) const
         {
             return float3_X(0.0, 0.0, 0.0);
         }
diff --git a/share/picongpu/examples/Bunch/include/picongpu/param/grid.param b/share/picongpu/examples/Bunch/include/picongpu/param/grid.param
index 27c1c90065..7cba20d5a5 100644
--- a/share/picongpu/examples/Bunch/include/picongpu/param/grid.param
+++ b/share/picongpu/examples/Bunch/include/picongpu/param/grid.param
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Rene Widera, Richard Pausch, Benjamin Worpitz
+/* Copyright 2013-2021 Rene Widera, Richard Pausch, Benjamin Worpitz
  *
  * This file is part of PIConGPU.
  *
@@ -18,12 +18,10 @@
  */
 
 
-
 #pragma once
 
 namespace picongpu
 {
-
     namespace SI
     {
         /** Duration of one timestep
@@ -52,21 +50,21 @@ namespace picongpu
          * behave like the interaction of infinite "wire particles"
          * in fields with perfect symmetry in Z.
          */
-    } //namespace SI
+    } // namespace SI
 
     //! Defines the size of the absorbing zone (in cells)
     constexpr uint32_t ABSORBER_CELLS[3][2] = {
-        {32, 32},  /*x direction [negative,positive]*/
-        {32, 32},  /*y direction [negative,positive]*/
-        {32, 32}   /*z direction [negative,positive]*/
-    }; //unit: number of cells
+        {32, 32}, /*x direction [negative,positive]*/
+        {32, 32}, /*y direction [negative,positive]*/
+        {32, 32} /*z direction [negative,positive]*/
+    }; // unit: number of cells
 
     //! Define the strength of the absorber for any direction
     constexpr float_X ABSORBER_STRENGTH[3][2] = {
         {1.0e-3, 1.0e-3}, /*x direction [negative,positive]*/
         {1.0e-3, 1.0e-3}, /*y direction [negative,positive]*/
-        {1.0e-3, 1.0e-3}  /*z direction [negative,positive]*/
-    }; //unit: none
+        {1.0e-3, 1.0e-3} /*z direction [negative,positive]*/
+    }; // unit: none
 
     /** When to move the co-moving window.
      *  An initial pseudo particle, flying with the speed of light,
@@ -85,7 +83,4 @@ namespace picongpu
      */
     constexpr float_64 movePoint = 0.90;
 
-}
-
-
-
+} // namespace picongpu
diff --git a/share/picongpu/examples/Bunch/include/picongpu/param/laser.param b/share/picongpu/examples/Bunch/include/picongpu/param/laser.param
index 4f2bf9f0a9..8127081277 100644
--- a/share/picongpu/examples/Bunch/include/picongpu/param/laser.param
+++ b/share/picongpu/examples/Bunch/include/picongpu/param/laser.param
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Anton Helm, Richard Pausch, Axel Huebl, Alexander Debus
+/* Copyright 2013-2021 Anton Helm, Richard Pausch, Axel Huebl, Alexander Debus
  *
  * This file is part of PIConGPU.
  *
@@ -48,83 +48,86 @@
 
 namespace picongpu
 {
-namespace fields
-{
-namespace laserProfiles
-{
-    struct PlaneWaveParam
+    namespace fields
     {
-        /** unit: meter */
-        static constexpr float_64 WAVE_LENGTH_SI = 0.8e-6;
-
-        /** Convert the normalized laser strength parameter a0 to Volt per meter */
-        static constexpr float_64 UNITCONV_A0_to_Amplitude_SI = -2.0 * PI / WAVE_LENGTH_SI * ::picongpu::SI::ELECTRON_MASS_SI * ::picongpu::SI::SPEED_OF_LIGHT_SI * ::picongpu::SI::SPEED_OF_LIGHT_SI / ::picongpu::SI::ELECTRON_CHARGE_SI;
-
-        /** unit: W / m^2 */
-        // calculate: _A0 = 8.549297e-6 * sqrt( Intensity[W/m^2] ) * wavelength[m] (linearly polarized)
-
-        /** unit: none */
-        static constexpr float_64 _A0  = 1.0;
-
-        /** unit: Volt / meter */
-        static constexpr float_64 AMPLITUDE_SI = _A0 * UNITCONV_A0_to_Amplitude_SI;
-
-        /** unit: Volt / meter */
-        //static constexpr float_64 AMPLITUDE_SI = 1.738e13;
-
-        /** The profile of the test Lasers 0 and 2 can be stretched by a
-         *      constant area between the up and downramp
-         *  unit: seconds */
-        static constexpr float_64 LASER_NOFOCUS_CONSTANT_SI = 50.0 * WAVE_LENGTH_SI / ::picongpu::SI::SPEED_OF_LIGHT_SI;
-
-        /** Pulse length: sigma of std. gauss for intensity (E^2)
-         *  PULSE_LENGTH_SI = FWHM_of_Intensity   / [ 2*sqrt{ 2* ln(2) } ]
-         *                                          [    2.354820045     ]
-         *  Info:             FWHM_of_Intensity = FWHM_Illumination
-         *                      = what a experimentalist calls "pulse duration"
-         *  unit: seconds (1 sigma) */
-        static constexpr float_64 PULSE_LENGTH_SI = 2.65e-15;
-
-        /** cell from top where the laser is initialized
-         *
-         * if `initPlaneY == 0` than the absorber are disabled.
-         * if `initPlaneY > absorbercells negative Y` the negative absorber in y
-         * direction is enabled
-         *
-         * valid ranges:
-         *   - initPlaneY == 0
-         *   - absorber cells negative Y < initPlaneY < cells in y direction of the top gpu
-         */
-        static constexpr uint32_t initPlaneY = 0;
-
-        /** The laser pulse will be initialized half of PULSE_INIT times of the PULSE_LENGTH before and after the plateau
-         *  unit: none */
-        static constexpr float_64 RAMP_INIT = 20.6146;
-
-        /** laser phase shift (no shift: 0.0)
-         *
-         * sin(omega*time + laser_phase): starts with phase=0 at center --> E-field=0 at center
-         *
-         * unit: rad, periodic in 2*pi
-         */
-        static constexpr float_X LASER_PHASE = 0.0;
-
-        /** Available polarization types
-         */
-        enum PolarisationType
+        namespace laserProfiles
         {
-            LINEAR_X = 1u,
-            LINEAR_Z = 2u,
-            CIRCULAR = 4u,
-        };
-        /** Polarization selection
-         */
-        static constexpr PolarisationType Polarisation = LINEAR_X;
-    };
-
-    //! currently selected laser profile
-    using Selected = PlaneWave< PlaneWaveParam >;
-
-} // namespace laserProfiles
-} // namespace fields
+            struct PlaneWaveParam
+            {
+                /** unit: meter */
+                static constexpr float_64 WAVE_LENGTH_SI = 0.8e-6;
+
+                /** Convert the normalized laser strength parameter a0 to Volt per meter */
+                static constexpr float_64 UNITCONV_A0_to_Amplitude_SI = -2.0 * PI / WAVE_LENGTH_SI
+                    * ::picongpu::SI::ELECTRON_MASS_SI * ::picongpu::SI::SPEED_OF_LIGHT_SI
+                    * ::picongpu::SI::SPEED_OF_LIGHT_SI / ::picongpu::SI::ELECTRON_CHARGE_SI;
+
+                /** unit: W / m^2 */
+                // calculate: _A0 = 8.549297e-6 * sqrt( Intensity[W/m^2] ) * wavelength[m] (linearly polarized)
+
+                /** unit: none */
+                static constexpr float_64 _A0 = 1.0;
+
+                /** unit: Volt / meter */
+                static constexpr float_64 AMPLITUDE_SI = _A0 * UNITCONV_A0_to_Amplitude_SI;
+
+                /** unit: Volt / meter */
+                // static constexpr float_64 AMPLITUDE_SI = 1.738e13;
+
+                /** The profile of the test Lasers 0 and 2 can be stretched by a
+                 *      constant area between the up and downramp
+                 *  unit: seconds */
+                static constexpr float_64 LASER_NOFOCUS_CONSTANT_SI
+                    = 50.0 * WAVE_LENGTH_SI / ::picongpu::SI::SPEED_OF_LIGHT_SI;
+
+                /** Pulse length: sigma of std. gauss for intensity (E^2)
+                 *  PULSE_LENGTH_SI = FWHM_of_Intensity   / [ 2*sqrt{ 2* ln(2) } ]
+                 *                                          [    2.354820045     ]
+                 *  Info:             FWHM_of_Intensity = FWHM_Illumination
+                 *                      = what an experimentalist calls "pulse duration"
+                 *  unit: seconds (1 sigma) */
+                static constexpr float_64 PULSE_LENGTH_SI = 2.65e-15;
+
+                /** cell from top where the laser is initialized
+                 *
+                 * if `initPlaneY == 0` then the absorbers are disabled.
+                 * if `initPlaneY > absorbercells negative Y` the negative absorber in y
+                 * direction is enabled
+                 *
+                 * valid ranges:
+                 *   - initPlaneY == 0
+                 *   - absorber cells negative Y < initPlaneY < cells in y direction of the top gpu
+                 */
+                static constexpr uint32_t initPlaneY = 0;
+
+                /** The laser pulse will be initialized half of RAMP_INIT times of the PULSE_LENGTH before and after
+                 * the plateau; unit: none */
+                static constexpr float_64 RAMP_INIT = 20.6146;
+
+                /** laser phase shift (no shift: 0.0)
+                 *
+                 * sin(omega*time + laser_phase): starts with phase=0 at center --> E-field=0 at center
+                 *
+                 * unit: rad, periodic in 2*pi
+                 */
+                static constexpr float_X LASER_PHASE = 0.0;
+
+                /** Available polarization types
+                 */
+                enum PolarisationType
+                {
+                    LINEAR_X = 1u,
+                    LINEAR_Z = 2u,
+                    CIRCULAR = 4u,
+                };
+                /** Polarization selection
+                 */
+                static constexpr PolarisationType Polarisation = LINEAR_X;
+            };
+
+            //! currently selected laser profile
+            using Selected = PlaneWave<PlaneWaveParam>;
+
+        } // namespace laserProfiles
+    } // namespace fields
 } // namespace picongpu
diff --git a/share/picongpu/examples/Bunch/include/picongpu/param/particle.param b/share/picongpu/examples/Bunch/include/picongpu/param/particle.param
index dc272be978..c76480f74c 100644
--- a/share/picongpu/examples/Bunch/include/picongpu/param/particle.param
+++ b/share/picongpu/examples/Bunch/include/picongpu/param/particle.param
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Rene Widera, Richard Pausch, Axel Huebl
+/* Copyright 2013-2021 Rene Widera, Richard Pausch, Axel Huebl
  *
  * This file is part of PIConGPU.
  *
@@ -30,85 +30,77 @@
 
 namespace picongpu
 {
-
-namespace particles
-{
-
-    /* a particle with a weighting below MIN_WEIGHTING will not
-     *      be created / will be deleted
-     *  unit: none
-     */
-#ifdef PARAM_SINGLE_PARTICLE
-    // note: this specific setting allows all kinds of weightings > 0.0
-    constexpr float_X MIN_WEIGHTING = std::numeric_limits< float_X >::min();
-
-    constexpr uint32_t TYPICAL_PARTICLES_PER_CELL = 1;
-#else
-    constexpr float_X MIN_WEIGHTING = 10.0;
-
-    constexpr uint32_t TYPICAL_PARTICLES_PER_CELL = 6;
-#endif
-
-namespace manipulators
-{
-
-    CONST_VECTOR( float_X, 3, DriftParamNegative_direction, 0.0, -1.0, 0.0 );
-    struct DriftParamNegative
+    namespace particles
     {
-        /** Initial particle drift velocity for electrons and ions
-         *  Examples:
-         *    - No drift is equal to 1.0
+        /* a particle with a weighting below MIN_WEIGHTING will not
+         *      be created / will be deleted
          *  unit: none
          */
-        static constexpr float_64 gamma = 5.0;
-        const DriftParamNegative_direction_t direction;
-    };
-    // definition of SetDrift start
-    using AssignYDriftNegative = unary::Drift<
-        DriftParamNegative,
-        nvidia::functors::Assign
-    >;
-
-} // namespace manipulators
-
-
-namespace startPosition
-{
-
-    struct RandomParameter
-    {
-        /** Count of particles per cell at initial state
-         *  unit: none
-         */
-        static constexpr uint32_t numParticlesPerCell = TYPICAL_PARTICLES_PER_CELL;
-    };
-    using Random = RandomImpl< RandomParameter >;
-
-
-    // sit directly in lower corner of the cell
-    CONST_VECTOR(
-        float_X,
-        3,
-        InCellOffset,
-        /* each x, y, z in-cell position component in range [0.0, 1.0) */
-        0.0,
-        0.0,
-        0.0
-    );
-
-    struct OnePositionParameter
-    {
-        /** Count of particles per cell at initial state
-         *  unit: none
-         */
-        static constexpr uint32_t numParticlesPerCell = TYPICAL_PARTICLES_PER_CELL;
+#ifdef PARAM_SINGLE_PARTICLE
+        // note: this specific setting allows all kinds of weightings > 0.0
+        constexpr float_X MIN_WEIGHTING = std::numeric_limits<float_X>::min();
 
-        const InCellOffset_t inCellOffset;
-    };
+        constexpr uint32_t TYPICAL_PARTICLES_PER_CELL = 1;
+#else
+        constexpr float_X MIN_WEIGHTING = 10.0;
 
-    // definition of one specific position for particle start
-    using OnePosition = OnePositionImpl< OnePositionParameter >;
+        constexpr uint32_t TYPICAL_PARTICLES_PER_CELL = 6;
+#endif
 
-} // namespace startPosition
-} // namespace particles
+        namespace manipulators
+        {
+            CONST_VECTOR(float_X, 3, DriftParamNegative_direction, 0.0, -1.0, 0.0);
+            struct DriftParamNegative
+            {
+                /** Initial particle drift velocity for electrons and ions
+                 *  Examples:
+                 *    - No drift is equal to 1.0
+                 *  unit: none
+                 */
+                static constexpr float_64 gamma = 5.0;
+                const DriftParamNegative_direction_t direction;
+            };
+            // definition of SetDrift start
+            using AssignYDriftNegative = unary::Drift<DriftParamNegative, nvidia::functors::Assign>;
+
+        } // namespace manipulators
+
+
+        namespace startPosition
+        {
+            struct RandomParameter
+            {
+                /** Count of particles per cell at initial state
+                 *  unit: none
+                 */
+                static constexpr uint32_t numParticlesPerCell = TYPICAL_PARTICLES_PER_CELL;
+            };
+            using Random = RandomImpl<RandomParameter>;
+
+
+            // sit directly in lower corner of the cell
+            CONST_VECTOR(
+                float_X,
+                3,
+                InCellOffset,
+                /* each x, y, z in-cell position component in range [0.0, 1.0) */
+                0.0,
+                0.0,
+                0.0);
+
+            struct OnePositionParameter
+            {
+                /** Count of particles per cell at initial state
+                 *  unit: none
+                 */
+                static constexpr uint32_t numParticlesPerCell = TYPICAL_PARTICLES_PER_CELL;
+
+                const InCellOffset_t inCellOffset;
+            };
+
+            // definition of one specific position for particle start
+            using OnePosition = OnePositionImpl<OnePositionParameter>;
+
+        } // namespace startPosition
+    } // namespace particles
 } // namespace picongpu
diff --git a/share/picongpu/examples/Bunch/include/picongpu/param/png.param b/share/picongpu/examples/Bunch/include/picongpu/param/png.param
index e05a31a24c..bdc71b3e02 100644
--- a/share/picongpu/examples/Bunch/include/picongpu/param/png.param
+++ b/share/picongpu/examples/Bunch/include/picongpu/param/png.param
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Richard Pausch
+/* Copyright 2013-2021 Heiko Burau, Richard Pausch
  *
  * This file is part of PIConGPU.
  *
@@ -24,17 +24,17 @@
 
 namespace picongpu
 {
-/*scale image before write to file, only scale if value is not 1.0
- */
-constexpr float_64 scale_image = 1.0;
+    /*scale image before write to file, only scale if value is not 1.0
+     */
+    constexpr float_64 scale_image = 1.0;
 
-/*if true image is scaled if cellsize is not quadratic, else no scale*/
-constexpr bool scale_to_cellsize = true;
+    /*if true image is scaled if cellsize is not quadratic, else no scale*/
+    constexpr bool scale_to_cellsize = true;
 
-constexpr bool white_box_per_GPU = true;
+    constexpr bool white_box_per_GPU = true;
 
-namespace visPreview
-{
+    namespace visPreview
+    {
 // normalize EM fields to typical laser or plasma quantities
 //-1: Auto:    enable adaptive scaling for each output
 // 1: Laser:   typical fields calculated out of the laser amplitude
@@ -49,33 +49,32 @@ namespace visPreview
 #define EM_FIELD_SCALE_CHANNEL2 1
 #define EM_FIELD_SCALE_CHANNEL3 1
 
-// multiply highest undisturbed particle density with factor
-constexpr float_X preParticleDens_opacity = 0.25;
-constexpr float_X preChannel1_opacity = 1.0;
-constexpr float_X preChannel2_opacity = 1.0;
-constexpr float_X preChannel3_opacity = 1.0;
-
-// specify color scales for each channel
-namespace preParticleDensCol = colorScales::red;
-namespace preChannel1Col = colorScales::blue;
-namespace preChannel2Col = colorScales::green;
-namespace preChannel3Col = colorScales::none;
+        // multiply highest undisturbed particle density with factor
+        constexpr float_X preParticleDens_opacity = 0.25;
+        constexpr float_X preChannel1_opacity = 1.0;
+        constexpr float_X preChannel2_opacity = 1.0;
+        constexpr float_X preChannel3_opacity = 1.0;
 
-/* png preview settings for each channel */
-DINLINE float_X preChannel1(const float3_X& field_B, const float3_X& field_E, const float3_X& field_J)
-{
-    return math::abs2(field_J);
-}
+        // specify color scales for each channel
+        namespace preParticleDensCol = colorScales::red;
+        namespace preChannel1Col = colorScales::blue;
+        namespace preChannel2Col = colorScales::green;
+        namespace preChannel3Col = colorScales::none;
 
-DINLINE float_X preChannel2(const float3_X& field_B, const float3_X& field_E, const float3_X& field_J)
-{
-    return field_E.x() * field_E.x();
-}
+        /* png preview settings for each channel */
+        DINLINE float_X preChannel1(const float3_X& field_B, const float3_X& field_E, const float3_X& field_J)
+        {
+            return pmacc::math::abs2(field_J);
+        }
 
-DINLINE float_X preChannel3(const float3_X& field_B, const float3_X& field_E, const float3_X& field_J)
-{
-    return -1.0_X * field_E.y();
-}
-}
-}
+        DINLINE float_X preChannel2(const float3_X& field_B, const float3_X& field_E, const float3_X& field_J)
+        {
+            return field_E.x() * field_E.x();
+        }
 
+        DINLINE float_X preChannel3(const float3_X& field_B, const float3_X& field_E, const float3_X& field_J)
+        {
+            return -1.0_X * field_E.y();
+        }
+    } // namespace visPreview
+} // namespace picongpu
diff --git a/share/picongpu/examples/Bunch/include/picongpu/param/radiation.param b/share/picongpu/examples/Bunch/include/picongpu/param/radiation.param
index 09833200e3..a7b2f545df 100644
--- a/share/picongpu/examples/Bunch/include/picongpu/param/radiation.param
+++ b/share/picongpu/examples/Bunch/include/picongpu/param/radiation.param
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Rene Widera, Richard Pausch
+/* Copyright 2013-2021 Rene Widera, Richard Pausch
  *
  * This file is part of PIConGPU.
  *
@@ -18,13 +18,12 @@
  */
 
 
-
 #pragma once
 
-  /*
-    radiation verbose level:
-    0=nothing, 1=physics, 2=simulation_state, 4=memory, 8=critical
-  */
+/*
+  radiation verbose level:
+  0=nothing, 1=physics, 2=simulation_state, 4=memory, 8=critical
+*/
 
 #define PIC_VERBOSE_RADIATION 3
 
@@ -36,146 +35,162 @@
 
 namespace picongpu
 {
-namespace plugins
-{
-namespace radiation
-{
-namespace linear_frequencies
-{
-namespace SI
-{
-constexpr float_64 omega_min = 0.0;
-constexpr float_64 omega_max = 5.8869e17;
-}
-
-constexpr unsigned int N_omega = 1024; // number of frequencies
-}
-
-namespace log_frequencies
-{
-namespace SI
-{
-constexpr float_64 omega_min = 1.0e14;
-constexpr float_64 omega_max = 1.0e17;
-}
-
-constexpr unsigned int N_omega = 2048; // number of frequencies
-}
+    namespace plugins
+    {
+        namespace radiation
+        {
+            namespace linear_frequencies
+            {
+                namespace SI
+                {
+                    constexpr float_64 omega_min = 0.0;
+                    constexpr float_64 omega_max = 5.8869e17;
+                } // namespace SI
+
+                constexpr unsigned int N_omega = 1024; // number of frequencies
+            } // namespace linear_frequencies
+
+            namespace log_frequencies
+            {
+                namespace SI
+                {
+                    constexpr float_64 omega_min = 1.0e14;
+                    constexpr float_64 omega_max = 1.0e17;
+                } // namespace SI
+
+                constexpr unsigned int N_omega = 2048; // number of frequencies
+            } // namespace log_frequencies
+
+
+            namespace frequencies_from_list
+            {
+                /** path to text file with frequencies */
+                constexpr const char* listLocation = "/path/to/frequency.list";
+                constexpr unsigned int N_omega = 2048; // number of frequencies
+            } // namespace frequencies_from_list
+
+
+            namespace radiation_frequencies = linear_frequencies;
+
+
+            namespace radiationNyquist
+            {
+                constexpr float_32 NyquistFactor = 0.5;
+            }
+
+            ///////////////////////////////////////////////////
+
+
+            // correct treatment of coherent and incoherent radiation from macroparticles
+            /* Choose different form factors in order to consider different particle shapes for radiation
+             *  - radFormFactor_CIC_3D ... CIC charge distribution
+             *  - radFormFactor_TSC_3D ... TSC charge distribution
+             *  - radFormFactor_PCS_3D ... PCS charge distribution
+             *  - radFormFactor_CIC_1Dy ... only CIC charge distribution in y
+             *  - radFormFactor_Gauss_spherical ... symmetric Gauss charge distribution
+             *  - radFormFactor_Gauss_cell ... Gauss charge distribution according to cell size
+             *  - radFormFactor_incoherent ... only incoherent radiation
+             *  - radFormFactor_coherent ... only coherent radiation
+             */
+            namespace radFormFactor_CIC_3D
+            {
+            }
+            namespace radFormFactor_TSC_3D
+            {
+            }
+            namespace radFormFactor_PCS_3D
+            {
+            }
+            namespace radFormFactor_CIC_1Dy
+            {
+            }
+            namespace radFormFactor_Gauss_spherical
+            {
+            }
+            namespace radFormFactor_Gauss_cell
+            {
+            }
+            namespace radFormFactor_incoherent
+            {
+            }
+            namespace radFormFactor_coherent
+            {
+            }
 
+#ifndef PARAM_RADFORMFACTOR
+#    define PARAM_RADFORMFACTOR radFormFactor_Gauss_spherical
+#endif
+            namespace radFormFactor = PARAM_RADFORMFACTOR;
 
-namespace frequencies_from_list
-{
-/** path to text file with frequencies */
-constexpr const char * listLocation = "/path/to/frequency.list";
-constexpr unsigned int N_omega = 2048; // number of frequencies
-}
 
+            ///////////////////////////////////////////////////////////
 
-namespace radiation_frequencies = linear_frequencies;
 
+            namespace parameters
+            {
+                constexpr unsigned int N_observer = 128; // number of looking directions
 
-namespace radiationNyquist
-{
-  constexpr float_32 NyquistFactor = 0.5;
-}
-
-///////////////////////////////////////////////////
-
-
-  // correct treatment of coherent and incoherent radiation from macroparticles
-  /* Choose different form factors in order to consider different  particle shapes for radiation
-   *  - radFormFactor_CIC_3D ... CIC charge distribution
-   *  - radFormFactor_TSC_3D ... TSC charge distribution
-   *  - radFormFactor_PCS_3D ... PCS charge distribution
-   *  - radFormFactor_CIC_1Dy ... only CIC charge distribution in y
-   *  - radFormFactor_Gauss_spherical ... symmetric Gauss charge distribution
-   *  - radFormFactor_Gauss_cell ... Gauss charge distribution according to cell size
-   *  - radFormFactor_incoherent ... only incoherent radiation
-   *  - radFormFactor_coherent ... only coherent radiation
-   */
-  namespace radFormFactor_CIC_3D { }
-  namespace radFormFactor_TSC_3D { }
-  namespace radFormFactor_PCS_3D { }
-  namespace radFormFactor_CIC_1Dy { }
-  namespace radFormFactor_Gauss_spherical { }
-  namespace radFormFactor_Gauss_cell { }
-  namespace radFormFactor_incoherent { }
-  namespace radFormFactor_coherent { }
+            } /* end namespace parameters */
 
-#ifndef PARAM_RADFORMFACTOR
-#   define PARAM_RADFORMFACTOR radFormFactor_Gauss_spherical
-#endif
-namespace radFormFactor = PARAM_RADFORMFACTOR;
+            /** activate particles for radiation */
+            struct GammaFilterFunctor
+            {
+                static constexpr float_X radiationGamma = 3.0;
 
+                template<typename T_Particle>
+                HDINLINE void operator()(T_Particle& particle)
+                {
+                    if(picongpu::gamma<float_X>(
+                           particle[picongpu::momentum_],
+                           picongpu::traits::attribute::getMass(particle[picongpu::weighting_], particle))
+                       >= radiationGamma)
+                        particle[picongpu::radiationMask_] = true;
+                }
+            };
 
-///////////////////////////////////////////////////////////
 
+            /* filter to enable radiation for electrons
+             *
+             * to enable the filter:
+             *   - go to file `speciesDefinition.param`
+             *   - add the attribute `radiationMask` to the electron species
+             */
+            using RadiationParticleFilter = picongpu::particles::manipulators::generic::Free<GammaFilterFunctor>;
 
-namespace parameters
-{
 
+            // add a window function weighting to the radiation in order
+            // to avoid ringing effects from sharp boundaries
+            // default: no window function via `radWindowFunctionNone`
 
-constexpr unsigned int N_observer = 128; // number of looking directions
-
-} /* end namespace parameters */
-
-  /** activate particles for radiation */
-  struct GammaFilterFunctor
-  {
-      static constexpr float_X radiationGamma = 3.0;
-
-      template< typename T_Particle >
-      HDINLINE void operator()( T_Particle& particle )
-      {
-          if(
-             picongpu::gamma<float_X>(
-                                      particle[ picongpu::momentum_ ],
-                                      picongpu::traits::attribute::getMass(
-                                                                           particle[ picongpu::weighting_ ],
-                                                                           particle
-                                                                           )
-                                      ) >= radiationGamma
-             )
-            particle[ picongpu::radiationMask_ ] = true;
-      }
-  };
-
-
-  /* filter to enable radiation for electrons
-   *
-   * to enable the filter:
-   *   - goto file `speciesDefinition.param`
-   *   - add the attribute `radiationMask` to the electron species
-   */
-  using RadiationParticleFilter = picongpu::particles::manipulators::generic::Free<
-    GammaFilterFunctor
-    >;
-
-
-
-// add a window function weighting to the radiation in order
-// to avoid ringing effects from sharpe boundaries
-// default: no window function via `radWindowFunctionNone`
-
-/* Choose different window function in order to get better ringing reduction
- * radWindowFunctionTriangle
- * radWindowFunctionHamming
- * radWindowFunctionTriplett
- * radWindowFunctionGauss
- * radWindowFunctionNone
- */
+            /* Choose a different window function in order to get better ringing reduction
+             * radWindowFunctionTriangle
+             * radWindowFunctionHamming
+             * radWindowFunctionTriplett
+             * radWindowFunctionGauss
+             * radWindowFunctionNone
+             */
 
 #ifndef PARAM_RADWINDOW
-#   define PARAM_RADWINDOW radWindowFunctionNone
+#    define PARAM_RADWINDOW radWindowFunctionNone
 #endif
-  namespace radWindowFunctionTriangle { }
-  namespace radWindowFunctionHamming { }
-  namespace radWindowFunctionTriplett { }
-  namespace radWindowFunctionGauss { }
-  namespace radWindowFunctionNone { }
-
-  namespace radWindowFunction = PARAM_RADWINDOW;
-
-} // namespace radiation
-} // namespace plugins
+            namespace radWindowFunctionTriangle
+            {
+            }
+            namespace radWindowFunctionHamming
+            {
+            }
+            namespace radWindowFunctionTriplett
+            {
+            }
+            namespace radWindowFunctionGauss
+            {
+            }
+            namespace radWindowFunctionNone
+            {
+            }
+
+            namespace radWindowFunction = PARAM_RADWINDOW;
+
+        } // namespace radiation
+    } // namespace plugins
 } // namespace picongpu
diff --git a/share/picongpu/examples/Bunch/include/picongpu/param/radiationObserver.param b/share/picongpu/examples/Bunch/include/picongpu/param/radiationObserver.param
index 392bc7d24b..0ffaa98dbb 100644
--- a/share/picongpu/examples/Bunch/include/picongpu/param/radiationObserver.param
+++ b/share/picongpu/examples/Bunch/include/picongpu/param/radiationObserver.param
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Richard Pausch
+/* Copyright 2013-2021 Heiko Burau, Richard Pausch
  *
  * This file is part of PIConGPU.
  *
@@ -22,60 +22,59 @@
 
 namespace picongpu
 {
-namespace plugins
-{
-namespace radiation
-{
-namespace radiation_observer
-{
-    /** Compute observation angles
-     *
-     * This function is used in the Radiation plug-in kernel to compute
-     * the observation directions given as a unit vector pointing
-     * towards a 'virtual' detector
-     *
-     * @param    observation_id_extern
-     *           int index that identifies each block on the GPU
-     *           to compute the observation direction
-     *
-     * @return   unit vector pointing in observation direction
-     *           type: vector_64
-     *
-     */
-    HDINLINE vector_64 observation_direction(const int observation_id_extern)
+    namespace plugins
     {
-      /** Computes observation angles along the x-y plane.
-       *  Assuming electron(s) fly in -y direction and the laser
-       *  propages in +y direction, the observation angles are centered
-       *  around the -y-axis (0,-1,0) .
-       *  By setting gamma, the angle range can be adjusted to the
-       *  energy of the electrons.
-       */
-
-      /* in this case only one id is needed: an index for theta */
-      const int my_theta_id = observation_id_extern;
+        namespace radiation
+        {
+            namespace radiation_observer
+            {
+                /** Compute observation angles
+                 *
+                 * This function is used in the Radiation plug-in kernel to compute
+                 * the observation directions given as a unit vector pointing
+                 * towards a 'virtual' detector
+                 *
+                 * @param    observation_id_extern
+                 *           int index that identifies each block on the GPU
+                 *           to compute the observation direction
+                 *
+                 * @return   unit vector pointing in observation direction
+                 *           type: vector_64
+                 *
+                 */
+                HDINLINE vector_64 observation_direction(const int observation_id_extern)
+                {
+                    /** Computes observation angles along the x-y plane.
+                     *  Assuming electron(s) fly in -y direction and the laser
+                     *  propagates in +y direction, the observation angles are centered
+                     *  around the -y-axis (0,-1,0) .
+                     *  By setting gamma, the angle range can be adjusted to the
+                     *  energy of the electrons.
+                     */
 
-      /* set up: */
-      constexpr picongpu::float_64 gamma_times_thetaMax = 1.5; /* max normalized angle */
-      constexpr picongpu::float_64 gamma = 5.0;                /* relativistic gamma */
-      constexpr picongpu::float_64 thetaMax = gamma_times_thetaMax / gamma; /* max angle */
+                    /* in this case only one id is needed: an index for theta */
+                    const int my_theta_id = observation_id_extern;
 
-      /* stepwith of theta for from [-thetaMax : +thetaMax] */
-      constexpr picongpu::float_64 delta_theta =  2.0 * thetaMax / (parameters::N_observer);
+                    /* set up: */
+                    constexpr picongpu::float_64 gamma_times_thetaMax = 1.5; /* max normalized angle */
+                    constexpr picongpu::float_64 gamma = 5.0; /* relativistic gamma */
+                    constexpr picongpu::float_64 thetaMax = gamma_times_thetaMax / gamma; /* max angle */
 
-      /* compute angle theta for index */
-      const picongpu::float_64 theta(my_theta_id * delta_theta - thetaMax + picongpu::PI);
-      /* + picongpu::PI -> turn observation direction 180 degrees towards -y */
+                    /* step width of theta for the range [-thetaMax : +thetaMax] */
+                    constexpr picongpu::float_64 delta_theta = 2.0 * thetaMax / (parameters::N_observer);
 
-      /* compute observation unit vector */
-      picongpu::float_32 sinTheta;
-      picongpu::float_32 cosTheta;
-      math::sincos(precisionCast<picongpu::float_32>(theta), sinTheta, cosTheta);
-      return vector_64(sinTheta, cosTheta, 0.0);
+                    /* compute angle theta for index */
+                    const picongpu::float_64 theta(my_theta_id * delta_theta - thetaMax + picongpu::PI);
+                    /* + picongpu::PI -> turn observation direction 180 degrees towards -y */
 
-    }
+                    /* compute observation unit vector */
+                    picongpu::float_32 sinTheta;
+                    picongpu::float_32 cosTheta;
+                    pmacc::math::sincos(precisionCast<picongpu::float_32>(theta), sinTheta, cosTheta);
+                    return vector_64(sinTheta, cosTheta, 0.0);
+                }
 
-} // namespace radiation_observer
-} // namespace radiation
-} // namespace plugins
+            } // namespace radiation_observer
+        } // namespace radiation
+    } // namespace plugins
 } // namespace picongpu
diff --git a/share/picongpu/examples/Bunch/include/picongpu/param/species.param b/share/picongpu/examples/Bunch/include/picongpu/param/species.param
deleted file mode 100644
index df5d1a5664..0000000000
--- a/share/picongpu/examples/Bunch/include/picongpu/param/species.param
+++ /dev/null
@@ -1,78 +0,0 @@
-/* Copyright 2014-2020 Rene Widera, Richard Pausch
- *
- * This file is part of PIConGPU.
- *
- * PIConGPU is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * PIConGPU is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with PIConGPU.
- * If not, see <http://www.gnu.org/licenses/>.
- */
-
-#pragma once
-
-#include "picongpu/particles/shapes.hpp"
-#include "picongpu/algorithms/FieldToParticleInterpolationNative.hpp"
-#include "picongpu/algorithms/FieldToParticleInterpolation.hpp"
-#include "picongpu/algorithms/AssignedTrilinearInterpolation.hpp"
-
-#include "picongpu/particles/flylite/NonLTE.def"
-#include "picongpu/fields/currentDeposition/Solver.def"
-
-
-namespace picongpu
-{
-/*---------------------------- generic solver---------------------------------*/
-
-/*! Particle Shape definitions -------------------------------------------------
- *  - particles::shapes::CIC : 1st order
- *  - particles::shapes::TSC : 2nd order
- *  - particles::shapes::PCS : 3rd order
- *  - particles::shapes::P4S : 4th order
- *
- *  example: using UsedParticleShape = particles::shapes::CIC;
- */
-using UsedParticleShape = particles::shapes::CIC;
-
-/* define which interpolation method is used to interpolate fields to particle*/
-using UsedField2Particle = FieldToParticleInterpolation< UsedParticleShape, AssignedTrilinearInterpolation >;
-
-/*! select current solver method -----------------------------------------------
- * - currentSolver::Esirkepov<SHAPE>  : particle shapes - CIC, TSC, PCS, P4S (1st to 4th order)
- * - currentSolver::VillaBune<>       : particle shapes - CIC (1st order) only
- * - currentSolver::EmZ<SHAPE>        : particle shapes - CIC, TSC, PCS, P4S (1st to 4th order)
- *
- * For development purposes: ---------------------------------------------------
- * - currentSolver::EsirkepovNative<SHAPE> : generic version of currentSolverEsirkepov
- *   without optimization (~4x slower and needs more shared memory)
- */
-using UsedParticleCurrentSolver = currentSolver::Esirkepov< UsedParticleShape >;
-
-/*! particle pusher configuration ----------------------------------------------
- *
- * Defining a pusher is optional for particles
- *
- * - particles::pusher::Vay : better suited relativistic boris pusher
- * - particles::pusher::Boris : standard boris pusher
- * - particles::pusher::ReducedLandauLifshitz : 4th order RungeKutta pusher
- *                                              with classical radiation reaction
- *
- * For diagnostics & modeling: ------------------------------------------------
- * - particles::pusher::Free : free propagation, ignore fields
- *                             (= free stream model)
- * - particles::pusher::Photon : propagate with c in direction of normalized mom.
- * - particles::pusher::Probe : Probe particles that interpolate E & B
- * For development purposes: --------------------------------------------------
- * - particles::pusher::Axel : a pusher developed at HZDR during 2011 (testing)
- */
-using UsedParticlePusher = particles::pusher::Boris;
-
-} // namespace picongpu
diff --git a/share/picongpu/examples/Bunch/include/picongpu/param/speciesDefinition.param b/share/picongpu/examples/Bunch/include/picongpu/param/speciesDefinition.param
index 383754520c..19b005bfb9 100644
--- a/share/picongpu/examples/Bunch/include/picongpu/param/speciesDefinition.param
+++ b/share/picongpu/examples/Bunch/include/picongpu/param/speciesDefinition.param
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Rene Widera, Benjamin Worpitz, Heiko Burau
+/* Copyright 2013-2021 Rene Widera, Benjamin Worpitz, Heiko Burau
  *
  * This file is part of PIConGPU.
  *
@@ -31,82 +31,75 @@
 
 namespace picongpu
 {
-
 /*########################### define particle attributes #####################*/
 
-//disable or enable functor RadiationParticleFilter
-//diable (0) / enable (1)
+// disable or enable functor RadiationParticleFilter
+// disable (0) / enable (1)
 #ifndef PARAM_FILTER_GAMMA
 #    define PARAM_FILTER_GAMMA 0
 #endif
 
-/** describe attributes of a particle*/
-using DefaultParticleAttributes = MakeSeq_t<
-    position< position_pic >,
-    momentum,
-    weighting,
-    particleId,
-    momentumPrev1
-#   if( PARAM_FILTER_GAMMA == 1 )
-    , radiationMask
-#   endif
->;
-
-/*########################### end particle attributes ########################*/
-
-/*########################### define species #################################*/
-
-/*--------------------------- photons -------------------------------------------*/
-
-value_identifier( float_X, MassRatioPhotons, 0.0 );
-value_identifier( float_X, ChargeRatioPhotons, 0.0 );
-
-using ParticleFlagsPhotons = MakeSeq_t<
-    particlePusher< particles::pusher::Photon >,
-    shape< UsedParticleShape >,
-    interpolation< UsedField2Particle >,
-    massRatio< MassRatioPhotons >,
-    chargeRatio< ChargeRatioPhotons >
->;
-
-/* define species photons */
-using PIC_Photons = Particles<
-    PMACC_CSTRING( "ph" ),
-    ParticleFlagsPhotons,
-    DefaultParticleAttributes
->;
-
-/*--------------------------- electrons --------------------------------------*/
-
-/* ratio relative to BASE_CHARGE and BASE_MASS */
-value_identifier( float_X, MassRatioElectrons, 1.0 );
-value_identifier( float_X, ChargeRatioElectrons, 1.0 );
-
-using ParticleFlagsElectrons = MakeSeq_t<
-    particlePusher< UsedParticlePusher >,
-    shape< UsedParticleShape >,
-    interpolation< UsedField2Particle >,
-    massRatio< MassRatioElectrons >,
-    chargeRatio< ChargeRatioElectrons >
-#if( ENABLE_SYNCHROTRON_PHOTONS == 1 )
-    , synchrotronPhotons< PIC_Photons >
+    /** describe attributes of a particle*/
+    using DefaultParticleAttributes = MakeSeq_t<
+        position<position_pic>,
+        momentum,
+        weighting,
+        particleId,
+        momentumPrev1
+#if(PARAM_FILTER_GAMMA == 1)
+        ,
+        radiationMask
+#endif
+        >;
+
+    /*########################### end particle attributes ########################*/
+
+    /*########################### define species #################################*/
+
+    /*--------------------------- photons -------------------------------------------*/
+
+    value_identifier(float_X, MassRatioPhotons, 0.0);
+    value_identifier(float_X, ChargeRatioPhotons, 0.0);
+
+    using ParticleFlagsPhotons = MakeSeq_t<
+        particlePusher<particles::pusher::Photon>,
+        shape<UsedParticleShape>,
+        interpolation<UsedField2Particle>,
+        massRatio<MassRatioPhotons>,
+        chargeRatio<ChargeRatioPhotons>>;
+
+    /* define species photons */
+    using PIC_Photons = Particles<PMACC_CSTRING("ph"), ParticleFlagsPhotons, DefaultParticleAttributes>;
+
+    /*--------------------------- electrons --------------------------------------*/
+
+    /* ratio relative to BASE_CHARGE and BASE_MASS */
+    value_identifier(float_X, MassRatioElectrons, 1.0);
+    value_identifier(float_X, ChargeRatioElectrons, 1.0);
+
+    using ParticleFlagsElectrons = MakeSeq_t<
+        particlePusher<UsedParticlePusher>,
+        shape<UsedParticleShape>,
+        interpolation<UsedField2Particle>,
+        massRatio<MassRatioElectrons>,
+        chargeRatio<ChargeRatioElectrons>
+#if(ENABLE_SYNCHROTRON_PHOTONS == 1)
+        ,
+        synchrotronPhotons<PIC_Photons>
 #endif
->;
+        >;
 
-/* define species electrons */
-using PIC_Electrons = Particles<
-    PMACC_CSTRING( "e" ),
-    ParticleFlagsElectrons,
-    DefaultParticleAttributes
->;
+    /* define species electrons */
+    using PIC_Electrons = Particles<PMACC_CSTRING("e"), ParticleFlagsElectrons, DefaultParticleAttributes>;
 
-/*########################### end species ####################################*/
+    /*########################### end species ####################################*/
 
-using VectorAllSpecies = MakeSeq_t<
-    PIC_Electrons
-#if( ENABLE_SYNCHROTRON_PHOTONS == 1 )
-    , PIC_Photons
+    using VectorAllSpecies = MakeSeq_t<
+        PIC_Electrons
+#if(ENABLE_SYNCHROTRON_PHOTONS == 1)
+        ,
+        PIC_Photons
 #endif
->;
+        >;
 
-} //namespace picongpu
+} // namespace picongpu
diff --git a/share/picongpu/examples/Bunch/include/picongpu/param/speciesInitialization.param b/share/picongpu/examples/Bunch/include/picongpu/param/speciesInitialization.param
index e340ab48da..3693d1c211 100644
--- a/share/picongpu/examples/Bunch/include/picongpu/param/speciesInitialization.param
+++ b/share/picongpu/examples/Bunch/include/picongpu/param/speciesInitialization.param
@@ -1,4 +1,4 @@
-/* Copyright 2015-2020 Rene Widera, Axel Huebl
+/* Copyright 2015-2021 Rene Widera, Axel Huebl
  *
  * This file is part of PIConGPU.
  *
@@ -33,31 +33,19 @@
 
 namespace picongpu
 {
-namespace particles
-{
-    /** InitPipeline define in which order species are initialized
-     *
-     * the functors are called in order (from first to last functor)
-     */
-    using InitPipeline = bmpl::vector<
+    namespace particles
+    {
+        /** InitPipeline defines in which order species are initialized
+         *
+         * the functors are called in order (from first to last functor)
+         */
+        using InitPipeline = bmpl::vector<
 #ifdef PARAM_SINGLE_PARTICLE
-        CreateDensity<
-            densityProfiles::FreeFormula,
-            startPosition::OnePosition,
-            PIC_Electrons
-        >,
+            CreateDensity<densityProfiles::FreeFormula, startPosition::OnePosition, PIC_Electrons>,
 #else
-        CreateDensity<
-            densityProfiles::GaussianCloud,
-            startPosition::Random,
-            PIC_Electrons
-        >,
+            CreateDensity<densityProfiles::GaussianCloud, startPosition::Random, PIC_Electrons>,
 #endif
-        Manipulate<
-            manipulators::AssignYDriftNegative,
-            PIC_Electrons
-        >
-    >;
+            Manipulate<manipulators::AssignYDriftNegative, PIC_Electrons>>;
 
-} // namespace particles
+    } // namespace particles
 } // namespace picongpu
diff --git a/share/picongpu/examples/Bunch/include/picongpu/param/starter.param b/share/picongpu/examples/Bunch/include/picongpu/param/starter.param
index fb40c2ab70..e05a8715b6 100644
--- a/share/picongpu/examples/Bunch/include/picongpu/param/starter.param
+++ b/share/picongpu/examples/Bunch/include/picongpu/param/starter.param
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Richard Pausch
+/* Copyright 2013-2021 Richard Pausch
  *
  * This file is part of PIConGPU.
  *
@@ -18,7 +18,6 @@
  */
 
 
-
 #pragma once
 
 
@@ -26,9 +25,5 @@ namespace picongpu
 {
     namespace defaultPIConGPU
     {
-
     }
-}
-
-
-
+} // namespace picongpu
diff --git a/share/picongpu/examples/Empty/etc/picongpu/1.cfg b/share/picongpu/examples/Empty/etc/picongpu/1.cfg
index 4b01125bae..1d1945c8e9 100644
--- a/share/picongpu/examples/Empty/etc/picongpu/1.cfg
+++ b/share/picongpu/examples/Empty/etc/picongpu/1.cfg
@@ -1,4 +1,4 @@
-# Copyright 2013-2020 Axel Huebl, Rene Widera, Felix Schmitt
+# Copyright 2013-2021 Axel Huebl, Rene Widera, Felix Schmitt
 #
 # This file is part of PIConGPU.
 #
diff --git a/share/picongpu/examples/FieldAbsorberTest/cmakeFlags b/share/picongpu/examples/FieldAbsorberTest/cmakeFlags
index bc530b7f09..89a833122d 100755
--- a/share/picongpu/examples/FieldAbsorberTest/cmakeFlags
+++ b/share/picongpu/examples/FieldAbsorberTest/cmakeFlags
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 #
-# Copyright 2013-2020 Axel Huebl, Rene Widera, Richard Pausch, Sergei Bastrakov
+# Copyright 2013-2021 Axel Huebl, Rene Widera, Richard Pausch, Sergei Bastrakov
 #
 # This file is part of PIConGPU.
 #
@@ -30,62 +30,68 @@
 #   - increase by 1, no gaps
 
 # Test that exponential damping compiles in 3D and 2D
-flags[0]="-DPARAM_OVERWRITES:LIST='-DPARAM_FIELDSOLVER=Yee;-DPARAM_ABSORBER_SIZE=8'"
-flags[1]="-DPARAM_OVERWRITES:LIST='-DPARAM_FIELDSOLVER=Yee;-DPARAM_ABSORBER_SIZE=8;-DPARAM_DIMENSION=DIM2'"
+flags[0]="-DPARAM_OVERWRITES:LIST='-DPARAM_FIELDSOLVER=Yee<CurrentInterpolation>;-DPARAM_ABSORBER_SIZE=10'"
+flags[1]="-DPARAM_OVERWRITES:LIST='-DPARAM_FIELDSOLVER=Yee<CurrentInterpolation>;-DPARAM_ABSORBER_SIZE=10;-DPARAM_DIMENSION=DIM2'"
+# Test that arbitrary-order solver compiles in 3D and 2D
+flags[2]="-DPARAM_OVERWRITES:LIST='-DPARAM_FIELDSOLVER=ArbitraryOrderFDTD<FOURTH_ORDER>;-DPARAM_ABSORBER_SIZE=10'"
+flags[3]="-DPARAM_OVERWRITES:LIST='-DPARAM_FIELDSOLVER=ArbitraryOrderFDTD<FOURTH_ORDER>;-DPARAM_ABSORBER_SIZE=10;-DPARAM_DIMENSION=DIM2'"
 # Test that PML compiles in 3D and 2D
-flags[2]="-DPARAM_OVERWRITES:LIST='-DPARAM_FIELDSOLVER=YeePML;-DPARAM_PML_SIZE=8;-DPARAM_PML_KAPPA_MAX=1.0;-DPARAM_PML_ALPHA_MAX=0.2'"
-flags[3]="-DPARAM_OVERWRITES:LIST='-DPARAM_FIELDSOLVER=YeePML;-DPARAM_PML_SIZE=8;-DPARAM_PML_KAPPA_MAX=1.0;-DPARAM_PML_ALPHA_MAX=0.2;-DPARAM_DIMENSION=DIM2'"
+flags[4]="-DPARAM_OVERWRITES:LIST='-DPARAM_FIELDSOLVER=YeePML<CurrentInterpolation>;-DPARAM_PML_SIZE=10;-DPARAM_PML_KAPPA_MAX=1.0;-DPARAM_PML_ALPHA_MAX=0.2'"
+flags[5]="-DPARAM_OVERWRITES:LIST='-DPARAM_FIELDSOLVER=YeePML<CurrentInterpolation>;-DPARAM_PML_SIZE=10;-DPARAM_PML_KAPPA_MAX=1.0;-DPARAM_PML_ALPHA_MAX=0.2;-DPARAM_DIMENSION=DIM2'"
+# Test that arbitrary-order solver with PML absorbing boundary conditions compiles in 3D and 2D
+flags[6]="-DPARAM_OVERWRITES:LIST='-DPARAM_FIELDSOLVER=ArbitraryOrderFDTDPML<FOURTH_ORDER>;-DPARAM_PML_SIZE=10;-DPARAM_PML_KAPPA_MAX=1.0;-DPARAM_PML_ALPHA_MAX=0.2'"
+flags[7]="-DPARAM_OVERWRITES:LIST='-DPARAM_FIELDSOLVER=ArbitraryOrderFDTDPML<FOURTH_ORDER>;-DPARAM_PML_SIZE=10;-DPARAM_PML_KAPPA_MAX=1.0;-DPARAM_PML_ALPHA_MAX=0.2;-DPARAM_DIMENSION=DIM2'"
 
 # The following tests are for absorber performance and demonstration of
 # reasonable parameters, commented out to make compile-time tests faster
 
 # Exponential damping in 3D, default strength
-#flags[4]="-DPARAM_OVERWRITES:LIST='-DPARAM_FIELDSOLVER=Yee;-DPARAM_ABSORBER_SIZE=12'"
-#flags[5]="-DPARAM_OVERWRITES:LIST='-DPARAM_FIELDSOLVER=Yee;-DPARAM_ABSORBER_SIZE=16'"
-#flags[6]="-DPARAM_OVERWRITES:LIST='-DPARAM_FIELDSOLVER=Yee;-DPARAM_ABSORBER_SIZE=24'"
-#flags[7]="-DPARAM_OVERWRITES:LIST='-DPARAM_FIELDSOLVER=Yee;-DPARAM_ABSORBER_SIZE=32'"
+#flags[8]="-DPARAM_OVERWRITES:LIST='-DPARAM_FIELDSOLVER=Yee<CurrentInterpolation>;-DPARAM_ABSORBER_SIZE=12'"
+#flags[9]="-DPARAM_OVERWRITES:LIST='-DPARAM_FIELDSOLVER=Yee<CurrentInterpolation>;-DPARAM_ABSORBER_SIZE=16'"
+#flags[10]="-DPARAM_OVERWRITES:LIST='-DPARAM_FIELDSOLVER=Yee<CurrentInterpolation>;-DPARAM_ABSORBER_SIZE=24'"
+#flags[11]="-DPARAM_OVERWRITES:LIST='-DPARAM_FIELDSOLVER=Yee<CurrentInterpolation>;-DPARAM_ABSORBER_SIZE=32'"
 
 # Exponential damping in 2D, default strength
-#flags[8]="-DPARAM_OVERWRITES:LIST='-DPARAM_FIELDSOLVER=Yee;-DPARAM_ABSORBER_SIZE=12;-DPARAM_DIMENSION=DIM2'"
-#flags[9]="-DPARAM_OVERWRITES:LIST='-DPARAM_FIELDSOLVER=Yee;-DPARAM_ABSORBER_SIZE=16;-DPARAM_DIMENSION=DIM2'"
-#flags[10]="-DPARAM_OVERWRITES:LIST='-DPARAM_FIELDSOLVER=Yee;-DPARAM_ABSORBER_SIZE=24;-DPARAM_DIMENSION=DIM2'"
-#flags[11]="-DPARAM_OVERWRITES:LIST='-DPARAM_FIELDSOLVER=Yee;-DPARAM_ABSORBER_SIZE=32;-DPARAM_DIMENSION=DIM2'"
+#flags[12]="-DPARAM_OVERWRITES:LIST='-DPARAM_FIELDSOLVER=Yee<CurrentInterpolation>;-DPARAM_ABSORBER_SIZE=12;-DPARAM_DIMENSION=DIM2'"
+#flags[13]="-DPARAM_OVERWRITES:LIST='-DPARAM_FIELDSOLVER=Yee<CurrentInterpolation>;-DPARAM_ABSORBER_SIZE=16;-DPARAM_DIMENSION=DIM2'"
+#flags[14]="-DPARAM_OVERWRITES:LIST='-DPARAM_FIELDSOLVER=Yee<CurrentInterpolation>;-DPARAM_ABSORBER_SIZE=24;-DPARAM_DIMENSION=DIM2'"
+#flags[15]="-DPARAM_OVERWRITES:LIST='-DPARAM_FIELDSOLVER=Yee<CurrentInterpolation>;-DPARAM_ABSORBER_SIZE=32;-DPARAM_DIMENSION=DIM2'"
 
 # Convolutional PML in 3D, default strength, no stretching
-#flags[12]="-DPARAM_OVERWRITES:LIST='-DPARAM_FIELDSOLVER=YeePML;-DPARAM_PML_SIZE=6;-DPARAM_PML_KAPPA_MAX=1.0;-DPARAM_PML_ALPHA_MAX=0.2'"
-#flags[13]="-DPARAM_OVERWRITES:LIST='-DPARAM_FIELDSOLVER=YeePML;-DPARAM_PML_SIZE=10;-DPARAM_PML_KAPPA_MAX=1.0;-DPARAM_PML_ALPHA_MAX=0.2'"
-#flags[14]="-DPARAM_OVERWRITES:LIST='-DPARAM_FIELDSOLVER=YeePML;-DPARAM_PML_SIZE=12;-DPARAM_PML_KAPPA_MAX=1.0;-DPARAM_PML_ALPHA_MAX=0.2'"
-#flags[15]="-DPARAM_OVERWRITES:LIST='-DPARAM_FIELDSOLVER=YeePML;-DPARAM_PML_SIZE=16;-DPARAM_PML_KAPPA_MAX=1.0;-DPARAM_PML_ALPHA_MAX=0.2'"
+#flags[16]="-DPARAM_OVERWRITES:LIST='-DPARAM_FIELDSOLVER=YeePML<CurrentInterpolation>;-DPARAM_PML_SIZE=6;-DPARAM_PML_KAPPA_MAX=1.0;-DPARAM_PML_ALPHA_MAX=0.2'"
+#flags[17]="-DPARAM_OVERWRITES:LIST='-DPARAM_FIELDSOLVER=YeePML<CurrentInterpolation>;-DPARAM_PML_SIZE=10;-DPARAM_PML_KAPPA_MAX=1.0;-DPARAM_PML_ALPHA_MAX=0.2'"
+#flags[18]="-DPARAM_OVERWRITES:LIST='-DPARAM_FIELDSOLVER=YeePML<CurrentInterpolation>;-DPARAM_PML_SIZE=12;-DPARAM_PML_KAPPA_MAX=1.0;-DPARAM_PML_ALPHA_MAX=0.2'"
+#flags[19]="-DPARAM_OVERWRITES:LIST='-DPARAM_FIELDSOLVER=YeePML<CurrentInterpolation>;-DPARAM_PML_SIZE=16;-DPARAM_PML_KAPPA_MAX=1.0;-DPARAM_PML_ALPHA_MAX=0.2'"
 
 # Convolutional PML in 3D, default strength, stretching
-#flags[16]="-DPARAM_OVERWRITES:LIST='-DPARAM_FIELDSOLVER=YeePML;-DPARAM_PML_SIZE=8;-DPARAM_PML_KAPPA_MAX=10.0;-DPARAM_PML_ALPHA_MAX=0.2'"
-#flags[17]="-DPARAM_OVERWRITES:LIST='-DPARAM_FIELDSOLVER=YeePML;-DPARAM_PML_SIZE=16;-DPARAM_PML_KAPPA_MAX=10.0;-DPARAM_PML_ALPHA_MAX=0.2'"
+#flags[20]="-DPARAM_OVERWRITES:LIST='-DPARAM_FIELDSOLVER=YeePML<CurrentInterpolation>;-DPARAM_PML_SIZE=10;-DPARAM_PML_KAPPA_MAX=10.0;-DPARAM_PML_ALPHA_MAX=0.2'"
+#flags[21]="-DPARAM_OVERWRITES:LIST='-DPARAM_FIELDSOLVER=YeePML<CurrentInterpolation>;-DPARAM_PML_SIZE=16;-DPARAM_PML_KAPPA_MAX=10.0;-DPARAM_PML_ALPHA_MAX=0.2'"
 
 # Convolutional PML in 2D, default strength, no stretching
-#flags[18]="-DPARAM_OVERWRITES:LIST='-DPARAM_FIELDSOLVER=YeePML;-DPARAM_PML_SIZE=6;-DPARAM_PML_KAPPA_MAX=1.0;-DPARAM_PML_ALPHA_MAX=0.2;-DPARAM_DIMENSION=DIM2'"
-#flags[19]="-DPARAM_OVERWRITES:LIST='-DPARAM_FIELDSOLVER=YeePML;-DPARAM_PML_SIZE=10;-DPARAM_PML_KAPPA_MAX=1.0;-DPARAM_PML_ALPHA_MAX=0.2;-DPARAM_DIMENSION=DIM2'"
-#flags[20]="-DPARAM_OVERWRITES:LIST='-DPARAM_FIELDSOLVER=YeePML;-DPARAM_PML_SIZE=12;-DPARAM_PML_KAPPA_MAX=1.0;-DPARAM_PML_ALPHA_MAX=0.2;-DPARAM_DIMENSION=DIM2'"
-#flags[21]="-DPARAM_OVERWRITES:LIST='-DPARAM_FIELDSOLVER=YeePML;-DPARAM_PML_SIZE=16;-DPARAM_PML_KAPPA_MAX=1.0;-DPARAM_PML_ALPHA_MAX=0.2;-DPARAM_DIMENSION=DIM2'"
+#flags[22]="-DPARAM_OVERWRITES:LIST='-DPARAM_FIELDSOLVER=YeePML<CurrentInterpolation>;-DPARAM_PML_SIZE=6;-DPARAM_PML_KAPPA_MAX=1.0;-DPARAM_PML_ALPHA_MAX=0.2;-DPARAM_DIMENSION=DIM2'"
+#flags[23]="-DPARAM_OVERWRITES:LIST='-DPARAM_FIELDSOLVER=YeePML<CurrentInterpolation>;-DPARAM_PML_SIZE=10;-DPARAM_PML_KAPPA_MAX=1.0;-DPARAM_PML_ALPHA_MAX=0.2;-DPARAM_DIMENSION=DIM2'"
+#flags[24]="-DPARAM_OVERWRITES:LIST='-DPARAM_FIELDSOLVER=YeePML<CurrentInterpolation>;-DPARAM_PML_SIZE=12;-DPARAM_PML_KAPPA_MAX=1.0;-DPARAM_PML_ALPHA_MAX=0.2;-DPARAM_DIMENSION=DIM2'"
+#flags[25]="-DPARAM_OVERWRITES:LIST='-DPARAM_FIELDSOLVER=YeePML<CurrentInterpolation>;-DPARAM_PML_SIZE=16;-DPARAM_PML_KAPPA_MAX=1.0;-DPARAM_PML_ALPHA_MAX=0.2;-DPARAM_DIMENSION=DIM2'"
 
 # Convolutional PML in 2D, default strength, stretching
-#flags[22]="-DPARAM_OVERWRITES:LIST='-DPARAM_FIELDSOLVER=YeePML;-DPARAM_PML_SIZE=8;-DPARAM_PML_KAPPA_MAX=10.0;-DPARAM_PML_ALPHA_MAX=0.2;-DPARAM_DIMENSION=DIM2'"
-#flags[23]="-DPARAM_OVERWRITES:LIST='-DPARAM_FIELDSOLVER=YeePML;-DPARAM_PML_SIZE=16;-DPARAM_PML_KAPPA_MAX=10.0;-DPARAM_PML_ALPHA_MAX=0.2;-DPARAM_DIMENSION=DIM2'"
+#flags[26]="-DPARAM_OVERWRITES:LIST='-DPARAM_FIELDSOLVER=YeePML<CurrentInterpolation>;-DPARAM_PML_SIZE=10;-DPARAM_PML_KAPPA_MAX=10.0;-DPARAM_PML_ALPHA_MAX=0.2;-DPARAM_DIMENSION=DIM2'"
+#flags[27]="-DPARAM_OVERWRITES:LIST='-DPARAM_FIELDSOLVER=YeePML<CurrentInterpolation>;-DPARAM_PML_SIZE=16;-DPARAM_PML_KAPPA_MAX=10.0;-DPARAM_PML_ALPHA_MAX=0.2;-DPARAM_DIMENSION=DIM2'"
 
-# Unixaxial PML in 3D, default strength, no stretching
-#flags[24]="-DPARAM_OVERWRITES:LIST='-DPARAM_FIELDSOLVER=YeePML;-DPARAM_PML_SIZE=8;-DPARAM_PML_KAPPA_MAX=1.0;-DPARAM_PML_ALPHA_MAX=0.0'"
-#flags[25]="-DPARAM_OVERWRITES:LIST='-DPARAM_FIELDSOLVER=YeePML;-DPARAM_PML_SIZE=16;-DPARAM_PML_KAPPA_MAX=1.0;-DPARAM_PML_ALPHA_MAX=0.0'"
+# Uniaxial PML in 3D, default strength, no stretching
+#flags[28]="-DPARAM_OVERWRITES:LIST='-DPARAM_FIELDSOLVER=YeePML<CurrentInterpolation>;-DPARAM_PML_SIZE=10;-DPARAM_PML_KAPPA_MAX=1.0;-DPARAM_PML_ALPHA_MAX=0.0'"
+#flags[29]="-DPARAM_OVERWRITES:LIST='-DPARAM_FIELDSOLVER=YeePML<CurrentInterpolation>;-DPARAM_PML_SIZE=16;-DPARAM_PML_KAPPA_MAX=1.0;-DPARAM_PML_ALPHA_MAX=0.0'"
 
-# Unixaxial PML in 3D, default strength, stretching
-#flags[26]="-DPARAM_OVERWRITES:LIST='-DPARAM_FIELDSOLVER=YeePML;-DPARAM_PML_SIZE=8;-DPARAM_PML_KAPPA_MAX=10.0;-DPARAM_PML_ALPHA_MAX=0.0'"
-#flags[27]="-DPARAM_OVERWRITES:LIST='-DPARAM_FIELDSOLVER=YeePML;-DPARAM_PML_SIZE=16;-DPARAM_PML_KAPPA_MAX=10.0;-DPARAM_PML_ALPHA_MAX=0.0'"
+# Uniaxial PML in 3D, default strength, stretching
+#flags[30]="-DPARAM_OVERWRITES:LIST='-DPARAM_FIELDSOLVER=YeePML<CurrentInterpolation>;-DPARAM_PML_SIZE=10;-DPARAM_PML_KAPPA_MAX=10.0;-DPARAM_PML_ALPHA_MAX=0.0'"
+#flags[31]="-DPARAM_OVERWRITES:LIST='-DPARAM_FIELDSOLVER=YeePML<CurrentInterpolation>;-DPARAM_PML_SIZE=16;-DPARAM_PML_KAPPA_MAX=10.0;-DPARAM_PML_ALPHA_MAX=0.0'"
 
-# Unixaxial PML in 2D, default strength, no stretching
-#flags[28]="-DPARAM_OVERWRITES:LIST='-DPARAM_FIELDSOLVER=YeePML;-DPARAM_PML_SIZE=8;-DPARAM_PML_KAPPA_MAX=1.0;-DPARAM_PML_ALPHA_MAX=0.0;-DPARAM_DIMENSION=DIM2'"
-#flags[29]="-DPARAM_OVERWRITES:LIST='-DPARAM_FIELDSOLVER=YeePML;-DPARAM_PML_SIZE=16;-DPARAM_PML_KAPPA_MAX=1.0;-DPARAM_PML_ALPHA_MAX=0.0;-DPARAM_DIMENSION=DIM2'"
+# Uniaxial PML in 2D, default strength, no stretching
+#flags[32]="-DPARAM_OVERWRITES:LIST='-DPARAM_FIELDSOLVER=YeePML<CurrentInterpolation>;-DPARAM_PML_SIZE=10;-DPARAM_PML_KAPPA_MAX=1.0;-DPARAM_PML_ALPHA_MAX=0.0;-DPARAM_DIMENSION=DIM2'"
+#flags[33]="-DPARAM_OVERWRITES:LIST='-DPARAM_FIELDSOLVER=YeePML<CurrentInterpolation>;-DPARAM_PML_SIZE=16;-DPARAM_PML_KAPPA_MAX=1.0;-DPARAM_PML_ALPHA_MAX=0.0;-DPARAM_DIMENSION=DIM2'"
 
-# Unixaxial PML in 2D, default strength, stretching
-#flags[30]="-DPARAM_OVERWRITES:LIST='-DPARAM_FIELDSOLVER=YeePML;-DPARAM_PML_SIZE=8;-DPARAM_PML_KAPPA_MAX=10.0;-DPARAM_PML_ALPHA_MAX=0.0;-DPARAM_DIMENSION=DIM2'"
-#flags[31]="-DPARAM_OVERWRITES:LIST='-DPARAM_FIELDSOLVER=YeePML;-DPARAM_PML_SIZE=16;-DPARAM_PML_KAPPA_MAX=10.0;-DPARAM_PML_ALPHA_MAX=0.0;-DPARAM_DIMENSION=DIM2'"
+# Uniaxial PML in 2D, default strength, stretching
+#flags[34]="-DPARAM_OVERWRITES:LIST='-DPARAM_FIELDSOLVER=YeePML<CurrentInterpolation>;-DPARAM_PML_SIZE=10;-DPARAM_PML_KAPPA_MAX=10.0;-DPARAM_PML_ALPHA_MAX=0.0;-DPARAM_DIMENSION=DIM2'"
+#flags[35]="-DPARAM_OVERWRITES:LIST='-DPARAM_FIELDSOLVER=YeePML<CurrentInterpolation>;-DPARAM_PML_SIZE=16;-DPARAM_PML_KAPPA_MAX=10.0;-DPARAM_PML_ALPHA_MAX=0.0;-DPARAM_DIMENSION=DIM2'"
 
 
 ################################################################################
diff --git a/share/picongpu/examples/FieldAbsorberTest/etc/picongpu/1.cfg b/share/picongpu/examples/FieldAbsorberTest/etc/picongpu/1.cfg
index b7d014e716..dc0dabefbb 100644
--- a/share/picongpu/examples/FieldAbsorberTest/etc/picongpu/1.cfg
+++ b/share/picongpu/examples/FieldAbsorberTest/etc/picongpu/1.cfg
@@ -1,5 +1,4 @@
-# Copyright 2013-2020 Heiko Burau, Rene Widera, Felix Schmitt, Axel Huebl,
-#                     Sergei Bastrakov
+# Copyright 2013-2021 Heiko Burau, Rene Widera, Felix Schmitt, Axel Huebl, Sergei Bastrakov
 #
 # This file is part of PIConGPU.
 #
@@ -37,9 +36,9 @@ TBG_devices_x=1
 TBG_devices_y=1
 TBG_devices_z=1
 
-# When changing the number of cells consider changing sourceIdx
-# in FieldBackgroundJ::operator()
-TBG_numCells=128
+# When changing the number of cells consider changing positionX, positionY in FieldBackgroundJ::operator().
+# To match the setup from the Taflove book, the size should be 40 + PML size min border + PML size max border.
+TBG_numCells=60
 TBG_gridSize="!TBG_numCells !TBG_numCells !TBG_numCells"
 TBG_steps="1000"
 
@@ -48,7 +47,12 @@ TBG_steps="1000"
 ## Section: Optional Variables ##
 #################################
 
-TBG_plugins="--fields_energy.period 10 --hdf5.period 10 --hdf5.file simData"
+# file I/O with openPMD-HDF5
+TBG_openPMD="--openPMD.period 100   \
+             --openPMD.file simData \
+             --openPMD.ext h5"
+
+TBG_plugins="--fields_energy.period 10 !TBG_openPMD"
 
 
 #################################
diff --git a/share/picongpu/examples/FieldAbsorberTest/etc/picongpu/4.cfg b/share/picongpu/examples/FieldAbsorberTest/etc/picongpu/4.cfg
new file mode 100644
index 0000000000..e3ffe8b2cf
--- /dev/null
+++ b/share/picongpu/examples/FieldAbsorberTest/etc/picongpu/4.cfg
@@ -0,0 +1,74 @@
+# Copyright 2013-2021 Heiko Burau, Rene Widera, Felix Schmitt, Axel Huebl, Sergei Bastrakov
+#
+# This file is part of PIConGPU.
+#
+# PIConGPU is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# PIConGPU is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with PIConGPU.
+# If not, see <http://www.gnu.org/licenses/>.
+#
+
+##
+## This configuration file is used by PIConGPU's TBG tool to create a
+## batch script for PIConGPU runs. For a detailed description of PIConGPU
+## configuration files including all available variables, see
+##
+##                      docs/TBG_macros.cfg
+##
+
+
+#################################
+## Section: Required Variables ##
+#################################
+
+TBG_wallTime="0:10:00"
+
+# This setup does not need multiple MPI ranks for performance, merely to test that it works
+TBG_devices_x=2
+TBG_devices_y=2
+TBG_devices_z=1
+
+# When changing the number of cells consider changing positionX, positionY in FieldBackgroundJ::operator().
+# To match the setup from the Taflove book, the size should be 40 + PML size min border + PML size max border.
+TBG_numCells=60
+TBG_gridSize="!TBG_numCells !TBG_numCells !TBG_numCells"
+TBG_steps="1000"
+
+
+#################################
+## Section: Optional Variables ##
+#################################
+
+# file I/O with openPMD-HDF5
+TBG_openPMD="--openPMD.period 100   \
+             --openPMD.file simData \
+             --openPMD.ext h5"
+
+TBG_plugins="--fields_energy.period 10 !TBG_openPMD"
+
+
+#################################
+## Section: Program Parameters ##
+#################################
+
+TBG_deviceDist="!TBG_devices_x !TBG_devices_y !TBG_devices_z"
+
+TBG_programParams="-d !TBG_deviceDist \
+                   -g !TBG_gridSize   \
+                   -s !TBG_steps      \
+                   !TBG_plugins       \
+                   --versionOnce"
+
+# TOTAL number of devices
+TBG_tasks="$(( TBG_devices_x * TBG_devices_y * TBG_devices_z ))"
+
+"$TBG_cfgPath"/submitAction.sh
diff --git a/share/picongpu/examples/FieldAbsorberTest/etc/picongpu/8.cfg b/share/picongpu/examples/FieldAbsorberTest/etc/picongpu/8.cfg
deleted file mode 100644
index 51481df696..0000000000
--- a/share/picongpu/examples/FieldAbsorberTest/etc/picongpu/8.cfg
+++ /dev/null
@@ -1,69 +0,0 @@
-# Copyright 2013-2020 Heiko Burau, Rene Widera, Felix Schmitt, Axel Huebl,
-#                     Sergei Bastrakov
-#
-# This file is part of PIConGPU.
-#
-# PIConGPU is free software: you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation, either version 3 of the License, or
-# (at your option) any later version.
-#
-# PIConGPU is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with PIConGPU.
-# If not, see <http://www.gnu.org/licenses/>.
-#
-
-##
-## This configuration file is used by PIConGPU's TBG tool to create a
-## batch script for PIConGPU runs. For a detailed description of PIConGPU
-## configuration files including all available variables, see
-##
-##                      docs/TBG_macros.cfg
-##
-
-
-#################################
-## Section: Required Variables ##
-#################################
-
-TBG_wallTime="0:10:00"
-
-TBG_devices_x=2
-TBG_devices_y=2
-TBG_devices_z=2
-
-# When changing the number of cells consider changing sourceIdx
-# in FieldBackgroundJ::operator()
-TBG_numCells=128
-TBG_gridSize="!TBG_numCells !TBG_numCells !TBG_numCells"
-TBG_steps="1000"
-
-
-#################################
-## Section: Optional Variables ##
-#################################
-
-TBG_plugins="--fields_energy.period 10 --hdf5.period 10 --hdf5.file simData"
-
-
-#################################
-## Section: Program Parameters ##
-#################################
-
-TBG_deviceDist="!TBG_devices_x !TBG_devices_y !TBG_devices_z"
-
-TBG_programParams="-d !TBG_deviceDist \
-                   -g !TBG_gridSize   \
-                   -s !TBG_steps      \
-                   !TBG_plugins       \
-                   --versionOnce"
-
-# TOTAL number of devices
-TBG_tasks="$(( TBG_devices_x * TBG_devices_y * TBG_devices_z ))"
-
-"$TBG_cfgPath"/submitAction.sh
diff --git a/share/picongpu/examples/FieldAbsorberTest/include/picongpu/param/dimension.param b/share/picongpu/examples/FieldAbsorberTest/include/picongpu/param/dimension.param
index 0881e9884b..efb7c42757 100644
--- a/share/picongpu/examples/FieldAbsorberTest/include/picongpu/param/dimension.param
+++ b/share/picongpu/examples/FieldAbsorberTest/include/picongpu/param/dimension.param
@@ -1,4 +1,4 @@
-/* Copyright 2014-2020 Axel Huebl, Rene Widera, Richard Pausch
+/* Copyright 2014-2021 Axel Huebl, Rene Widera, Richard Pausch
  *
  * This file is part of PIConGPU.
  *
@@ -20,7 +20,7 @@
 #pragma once
 
 #ifndef PARAM_DIMENSION
-#define PARAM_DIMENSION DIM3
+#    define PARAM_DIMENSION DIM3
 #endif
 
 #define SIMDIM PARAM_DIMENSION
diff --git a/share/picongpu/examples/FieldAbsorberTest/include/picongpu/param/fieldBackground.param b/share/picongpu/examples/FieldAbsorberTest/include/picongpu/param/fieldBackground.param
index 7575bbf10a..8465084d0f 100644
--- a/share/picongpu/examples/FieldAbsorberTest/include/picongpu/param/fieldBackground.param
+++ b/share/picongpu/examples/FieldAbsorberTest/include/picongpu/param/fieldBackground.param
@@ -1,4 +1,4 @@
-/* Copyright 2014-2020 Axel Huebl, Alexander Debus
+/* Copyright 2014-2021 Axel Huebl, Alexander Debus, Klaus Steiniger, Sergei Bastrakov
  *
  * This file is part of PIConGPU.
  *
@@ -17,11 +17,13 @@
  * If not, see <http://www.gnu.org/licenses/>.
  */
 
-#pragma once
-
-/** Load external background fields
+/** @file fieldBackground.param
  *
+ * Load external background fields
  */
+
+#pragma once
+
 namespace picongpu
 {
     class FieldBackgroundE
@@ -31,31 +33,23 @@ namespace picongpu
         static constexpr bool InfluenceParticlePusher = true;
 
         /* We use this to calculate your SI input back to our unit system */
-        PMACC_ALIGN(
-            m_unitField,
-            const float3_64
-        );
+        PMACC_ALIGN(m_unitField, const float3_64);
 
-        HDINLINE FieldBackgroundE( const float3_64 unitField ) :
-            m_unitField( unitField )
-        {}
+        HDINLINE FieldBackgroundE(const float3_64 unitField) : m_unitField(unitField)
+        {
+        }
 
         /** Specify your background field E(r,t) here
          *
          * \param cellIdx The total cell id counted from the start at t = 0
          * \param currentStep The current time step */
-        HDINLINE float3_X
-        operator()(
+        HDINLINE float3_X operator()(
             const DataSpace<simDim>& /*cellIdx*/,
             const uint32_t /*currentStep*/
         ) const
         {
             /* specify your E-Field in V/m and convert to PIConGPU units */
-            return float3_X(
-                0.0,
-                0.0,
-                0.0
-            );
+            return float3_X(0.0, 0.0, 0.0);
         }
     };
 
@@ -66,31 +60,23 @@ namespace picongpu
         static constexpr bool InfluenceParticlePusher = true;
 
         /* We use this to calculate your SI input back to our unit system */
-        PMACC_ALIGN(
-            m_unitField,
-            const float3_64
-        );
+        PMACC_ALIGN(m_unitField, const float3_64);
 
-        HDINLINE FieldBackgroundB( const float3_64 unitField ) :
-            m_unitField( unitField )
-        {}
+        HDINLINE FieldBackgroundB(const float3_64 unitField) : m_unitField(unitField)
+        {
+        }
 
         /** Specify your background field B(r,t) here
          *
          * \param cellIdx The total cell id counted from the start at t=0
          * \param currentStep The current time step */
-        HDINLINE float3_X
-        operator()(
+        HDINLINE float3_X operator()(
             const DataSpace<simDim>& /*cellIdx*/,
             const uint32_t /*currentStep*/
         ) const
         {
             /* specify your B-Field in T and convert to PIConGPU units */
-            return float3_X(
-                0.0,
-                0.0,
-                0.0
-            );
+            return float3_X(0.0, 0.0, 0.0);
         }
     };
 
@@ -100,55 +86,59 @@ namespace picongpu
         /* Add this additional field? */
         static constexpr bool activated = true;
 
+        /* This setup is based on [Taflove, Hagness], section 7.11.1.
+         * The difference is we consider both 2D and 3D cases, and grid size may be increased due to our absorber being
+         * part of the simulation area, not located outside of it as in the book.
+         *
+         * Example of a rectangular conductor with a steady current.
+         *
+         * The conductor is oriented along the y-axis.
+         * Its edge length can be adjusted by the variable halfWidth in order to apply the test with meaningful results
+         * to higher-order solvers, too.
+         * The current in the wire ramps up over time according to a differentiated Gaussian.
+         * This defines the current density amplitude, too.
+         * Therefore, the total current through the wire scales with the wire's halfWidth.
+         */
+
+        //! Conductor is oriented along y-axis with the following coordinates and size, values for 60 cells in the grid
+        static constexpr int32_t positionX = 30; // unit: cells
+        static constexpr int32_t positionY = 30; // unit: cells
+        // We support non-unit source for high order field solver
+        static constexpr int32_t halfWidth = 1; // unit: cells
+
+        //! Amplitude in terms of current density in SI
+        float_X amplitudeSI = -2._X; // unit: A / m^2
+
         /* We use this to calculate your SI input back to our unit system */
-        PMACC_ALIGN(
-            m_unitField,
-            const float3_64
-        );
+        PMACC_ALIGN(m_unitField, const float3_64);
 
-        HDINLINE FieldBackgroundJ( const float3_64 unitField ) :
-            m_unitField(unitField)
-        {}
+        HDINLINE FieldBackgroundJ(const float3_64 unitField) : m_unitField(unitField)
+        {
+        }
 
         /** Specify your background field J(r,t) here
          *
          * \param cellIdx The total cell id counted from the start at t=0
-         * \param currentStep The current time step */
-        HDINLINE float3_X
-        operator()(
-            const DataSpace<simDim>& cellIdx,
-            const uint32_t currentStep
-        ) const
+         * \param currentStep The current time step
+         */
+        HDINLINE float3_X operator()(const DataSpace<simDim>& cellIdx, const uint32_t currentStep) const
         {
-            /* Source index is hard-coded, should be in the center of the global
-             * domain, so has to be changed together with the grid size.
-             */
-            DataSpace< simDim > const sourceIdx =
-                DataSpace< simDim >::create( 64u );
-            if( cellIdx != sourceIdx )
-                return float3_X(
-                    0.0,
-                    0.0,
-                    0.0
-                );
-
-            /* This setup is based on [Taflove, Hagness], section 7.11.1
-             * The difference is we consider both 2D and 3D cases,
-             * and grid size may be increased due to our absorber being part of
-             * the simulation area, not located outside of it as in the book.
-             */
-            constexpr float_X duration_SI = 26.53e-12; // 26.53 ps
-            constexpr float_X delay_SI = 4.0_X * duration_SI;
-            float_X const time_SI = currentStep * SI::DELTA_T_SI;
-            float_X const normalizedTime = ( time_SI - delay_SI ) / duration_SI;
-            float_X const value = -2.0 * normalizedTime *
-                math::exp( -normalizedTime * normalizedTime );
-            /* specify your J-Field in A/m^2 and convert to PIConGPU units */
+            /* specify J-Field */
+            float_X currentDensity = 0.0_X;
+
+            if(math::abs(float_X(static_cast<int32_t>(cellIdx.x()) - positionX) + .5_X) < halfWidth
+               && math::abs(float_X(static_cast<int32_t>(cellIdx.y()) - positionY) + .5_X) < halfWidth)
+            {
+                float_X const duration = 26.53e-12 / SI::DELTA_T_SI; // 26.53 ps in PIC units
+                float_X const delay = 4._X * duration;
+                float_X const relativeTime = (static_cast<float_X>(currentStep) - delay) / duration;
+                currentDensity = amplitudeSI * relativeTime * math::exp(-relativeTime * relativeTime);
+            }
+
             return float3_X(
-                0.0,
-                0.0,
-                value / m_unitField[1]
-            );
+                0.0_X,
+                currentDensity / m_unitField[1], // unit: none
+                0.0_X);
         }
     };
 
diff --git a/share/picongpu/examples/FieldAbsorberTest/include/picongpu/param/fieldSolver.param b/share/picongpu/examples/FieldAbsorberTest/include/picongpu/param/fieldSolver.param
index 5448e5bdaf..7973dab9cc 100644
--- a/share/picongpu/examples/FieldAbsorberTest/include/picongpu/param/fieldSolver.param
+++ b/share/picongpu/examples/FieldAbsorberTest/include/picongpu/param/fieldSolver.param
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Heiko Burau, Rene Widera, Sergei Bastrakov
+/* Copyright 2013-2021 Axel Huebl, Heiko Burau, Rene Widera, Sergei Bastrakov, Klaus Steiniger
  *
  * This file is part of PIConGPU.
  *
@@ -25,6 +25,11 @@
  *
  * Also allows to configure ad hoc mitigations for high frequency
  * noise in some setups via current smoothing.
+ *
+ * \attention
+ * Currently, the laser initialization in PIConGPU is implemented to work with the standard Yee solver.
+ * Using a solver of higher order will result in a slightly higher laser amplitude and energy than expected.
+ *
  */
 
 #pragma once
@@ -32,48 +37,58 @@
 #include "picongpu/fields/MaxwellSolver/Solvers.def"
 #include "picongpu/fields/currentInterpolation/CurrentInterpolation.def"
 
+#include <boost/preprocessor/punctuation/comma.hpp>
+
 
 namespace picongpu
 {
-namespace fields
-{
-
-    /** Current Interpolation
-     *
-     * CurrentInterpolation is used to set a method performing the
-     * interpolate/assign operation from the generated currents of particle
-     * species to the electro-magnetic fields.
-     *
-     * Allowed values are:
-     *   - None:
-     *     - default for staggered grids/Yee-scheme
-     *     - updates E
-     *   - Binomial: 2nd order Binomial filter
-     *     - smooths the current before assignment in staggered grid
-     *     - updates E & breaks local charge conservation slightly
-     *   - NoneDS:
-     *     - experimental assignment for all-centered/directional splitting
-     *     - updates E & B at the same time
-     */
-    using CurrentInterpolation = currentInterpolation::None;
+    namespace fields
+    {
+        /** Current Interpolation
+         *
+         * CurrentInterpolation is used to set a method performing the
+         * interpolate/assign operation from the generated currents of particle
+         * species to the electro-magnetic fields.
+         *
+         * Allowed values are:
+         *   - None:
+         *     - default for staggered grids/Yee-scheme
+         *     - updates E
+         *   - Binomial: 2nd order Binomial filter
+         *     - smooths the current before assignment in staggered grid
+         *     - updates E & breaks local charge conservation slightly
+         */
+        using CurrentInterpolation = currentInterpolation::None;
 
-    /** FieldSolver
-     *
-     * Field Solver Selection:
-     *  - Yee< CurrentInterpolation >: standard Yee solver
-     *  - YeePML< CurrentInterpolation >: standard Yee solver with PML absorber
-     *  - Lehe< CurrentInterpolation >: Num. Cherenkov free field solver in a chosen direction
-     *  - DirSplitting< CurrentInterpolation >: Sentoku's Directional Splitting Method
-     *  - None< CurrentInterpolation >: disable the vacuum update of E and B
-     */
+        /** FieldSolver
+         *
+         * Field Solver Selection:
+         *  - Yee< CurrentInterpolation > : Standard Yee solver approximating derivatives with respect to time and
+         * space by second order finite differences.
+         *  - YeePML< CurrentInterpolation >: Standard Yee solver using Perfectly Matched Layer Absorbing Boundary
+         * Conditions (PML)
+         *  - Lehe< CurrentInterpolation >: Num. Cherenkov free field solver in a chosen direction
+         *  - LehePML< CurrentInterpolation >: Num. Cherenkov free field solver in a chosen direction
+         *                                     using Perfectly Matched Layer Absorbing Boundary Conditions (PML)
+         *  - ArbitraryOrderFDTD< 4, CurrentInterpolation >: Solver using 4 neighbors to each direction to approximate
+         * *spatial* derivatives by finite differences. The number of neighbors can be changed from 4 to any positive,
+         * integer number. The order of the solver will be twice the number of neighbors in each direction. Yee's
+         * method is a special case of this using one neighbor to each direction.
+         *  - ArbitraryOrderFDTDPML< 4, CurrentInterpolation >: ArbitraryOrderFDTD solver using Perfectly Matched Layer
+         *                                                      Absorbing Boundary Conditions (PML)
+         *  - None< CurrentInterpolation >: disable the vacuum update of E and B
+         */
 
 #ifndef PARAM_FIELDSOLVER
-    /* WARNING: if you change field solver by hand please update your CELL_WIDTH_SI
-     * in `grid.param` to fulfill the convergence condition (CFL)
-     */
-#   define PARAM_FIELDSOLVER Yee
+        /* WARNING: if you change field solver by hand please update your CELL_WIDTH_SI
+         * in `grid.param` to fulfill the convergence condition (CFL)
+         */
+#    define SELECTED_FIELD_SOLVER Yee<CurrentInterpolation>
+#else
+#    define FOURTH_ORDER 4 BOOST_PP_COMMA() CurrentInterpolation
+#    define SELECTED_FIELD_SOLVER PARAM_FIELDSOLVER
 #endif
-    using Solver = maxwellSolver::PARAM_FIELDSOLVER< CurrentInterpolation >;
+        using Solver = maxwellSolver::SELECTED_FIELD_SOLVER;
 
-} // namespace fields
+    } // namespace fields
 } // namespace picongpu
diff --git a/share/picongpu/examples/FieldAbsorberTest/include/picongpu/param/fileOutput.param b/share/picongpu/examples/FieldAbsorberTest/include/picongpu/param/fileOutput.param
index a486a87cfe..e703d1ccad 100644
--- a/share/picongpu/examples/FieldAbsorberTest/include/picongpu/param/fileOutput.param
+++ b/share/picongpu/examples/FieldAbsorberTest/include/picongpu/param/fileOutput.param
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Rene Widera, Felix Schmitt,
+/* Copyright 2013-2021 Axel Huebl, Rene Widera, Felix Schmitt,
  *                     Benjamin Worpitz, Richard Pausch
  *
  * This file is part of PIConGPU.
@@ -63,33 +63,23 @@ namespace picongpu
     namespace deriveField = particles::particleToGrid;
 
     /* ChargeDensity section */
-    using ChargeDensity_Seq = deriveField::CreateEligible_t<
-        VectorAllSpecies,
-        deriveField::derivedAttributes::ChargeDensity
-    >;
+    using ChargeDensity_Seq
+        = deriveField::CreateEligible_t<VectorAllSpecies, deriveField::derivedAttributes::ChargeDensity>;
 
     /** FieldTmpSolvers groups all solvers that create data for FieldTmp ******
      *
      * FieldTmpSolvers is used in @see FieldTmp to calculate the exchange size
      */
-    using FieldTmpSolvers = MakeSeq_t<
-        ChargeDensity_Seq
-    >;
+    using FieldTmpSolvers = MakeSeq_t<ChargeDensity_Seq>;
 
 
     /** FileOutputFields: Groups all Fields that shall be dumped *************/
 
     /** Possible native fields: FieldE, FieldB, FieldJ
      */
-    using NativeFileOutputFields = MakeSeq_t<
-        FieldE,
-        FieldB,
-        FieldJ
-    >;
+    using NativeFileOutputFields = MakeSeq_t<FieldE, FieldB, FieldJ>;
 
-    using FileOutputFields = MakeSeq_t<
-        NativeFileOutputFields
-    >;
+    using FileOutputFields = MakeSeq_t<NativeFileOutputFields>;
 
 
     /** FileOutputParticles: Groups all Species that shall be dumped **********
@@ -97,6 +87,6 @@ namespace picongpu
      * hint: to disable particle output set to
      *   using FileOutputParticles = MakeSeq_t< >;
      */
-    using FileOutputParticles = MakeSeq_t< >;
+    using FileOutputParticles = MakeSeq_t<>;
 
-}
+} // namespace picongpu
diff --git a/share/picongpu/examples/FieldAbsorberTest/include/picongpu/param/grid.param b/share/picongpu/examples/FieldAbsorberTest/include/picongpu/param/grid.param
index 335fd8c1d4..2de7f59db7 100644
--- a/share/picongpu/examples/FieldAbsorberTest/include/picongpu/param/grid.param
+++ b/share/picongpu/examples/FieldAbsorberTest/include/picongpu/param/grid.param
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Rene Widera, Benjamin Worpitz, Sergei Bastrakov
+/* Copyright 2013-2021 Axel Huebl, Rene Widera, Benjamin Worpitz, Sergei Bastrakov
  *
  * This file is part of PIConGPU.
  *
@@ -18,15 +18,12 @@
  */
 
 
-
 #pragma once
 
 namespace picongpu
 {
-
     namespace SI
     {
-
         /** This setup is based on section 7.11.1 of
          *  A. Taflove, S.C. Hagness. Computational Electrodynamics
          *  The Finite-Difference Time-Domain Method. 3rd Edition.
@@ -66,27 +63,27 @@ namespace picongpu
          * in fields with perfect symmetry in Z.
          */
 
-    } //namespace SI
+    } // namespace SI
 
     //! Define the size of the absorbing zone (in cells) for both exponential
     //! absorber and PML, a compile-time parameter
 #ifndef PARAM_ABSORBER_SIZE
-#   define PARAM_ABSORBER_SIZE 8
+#    define PARAM_ABSORBER_SIZE 8
 #endif
     constexpr uint32_t ABSORBER_SIZE = PARAM_ABSORBER_SIZE;
     constexpr uint32_t ABSORBER_CELLS[3][2] = {
-        {ABSORBER_SIZE, ABSORBER_SIZE},  /*x direction [negative,positive]*/
-        {ABSORBER_SIZE, ABSORBER_SIZE},  /*y direction [negative,positive]*/
-        {ABSORBER_SIZE, ABSORBER_SIZE}   /*z direction [negative,positive]*/
-    }; //unit: number of cells
+        {ABSORBER_SIZE, ABSORBER_SIZE}, /*x direction [negative,positive]*/
+        {ABSORBER_SIZE, ABSORBER_SIZE}, /*y direction [negative,positive]*/
+        {ABSORBER_SIZE, ABSORBER_SIZE} /*z direction [negative,positive]*/
+    }; // unit: number of cells
 
     //! Define the strength of the exponential absorber only
     constexpr float_X ABSORBER_STRENGTH_VALUE = 1.0e-3;
     constexpr float_X ABSORBER_STRENGTH[3][2] = {
         {ABSORBER_STRENGTH_VALUE, ABSORBER_STRENGTH_VALUE}, /*x direction [negative,positive]*/
         {ABSORBER_STRENGTH_VALUE, ABSORBER_STRENGTH_VALUE}, /*y direction [negative,positive]*/
-        {ABSORBER_STRENGTH_VALUE, ABSORBER_STRENGTH_VALUE}  /*z direction [negative,positive]*/
-    }; //unit: none
+        {ABSORBER_STRENGTH_VALUE, ABSORBER_STRENGTH_VALUE} /*z direction [negative,positive]*/
+    }; // unit: none
 
     /** When to move the co-moving window.
      *  An initial pseudo particle, flying with the speed of light,
@@ -105,4 +102,4 @@ namespace picongpu
      */
     constexpr float_64 movePoint = 0.90;
 
-}
+} // namespace picongpu
diff --git a/share/picongpu/examples/FieldAbsorberTest/include/picongpu/param/memory.param b/share/picongpu/examples/FieldAbsorberTest/include/picongpu/param/memory.param
new file mode 100644
index 0000000000..c128523630
--- /dev/null
+++ b/share/picongpu/examples/FieldAbsorberTest/include/picongpu/param/memory.param
@@ -0,0 +1,103 @@
+/* Copyright 2013-2021 Axel Huebl, Rene Widera, Benjamin Worpitz, Sergei Bastrakov
+ *
+ * This file is part of PIConGPU.
+ *
+ * PIConGPU is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * PIConGPU is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with PIConGPU.
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/** @file
+ *
+ * Define low-level memory settings for compute devices.
+ *
+ * Settings for memory layout for supercells and particle frame-lists,
+ * data exchanges in multi-device domain-decomposition and reserved
+ * fields for temporarily derived quantities are defined here.
+ */
+
+#pragma once
+#include <pmacc/math/Vector.hpp>
+#include <pmacc/mappings/kernel/MappingDescription.hpp>
+
+
+namespace picongpu
+{
+    /* We have to hold back 350MiB for gpu-internal operations:
+     *   - random number generator
+     *   - reduces
+     *   - ...
+     */
+    constexpr size_t reservedGpuMemorySize = 350 * 1024 * 1024;
+
+    /* short namespace*/
+    namespace mCT = pmacc::math::CT;
+    /** size of a superCell
+     *
+     * volume of a superCell must be <= 1024.
+     * This setup may use local grid size that is a multiple of 2 along x, y.
+     */
+    using SuperCellSize = typename mCT::shrinkTo<mCT::Int<2, 2, 4>, simDim>::type;
+
+    /** define mapper which is used for kernel call mappings */
+    using MappingDesc = MappingDescription<simDim, SuperCellSize>;
+
+    /** define the size of the core, border and guard area
+     *
+     * PIConGPU uses spatial domain-decomposition for parallelization
+     * over multiple devices with non-shared memory architecture.
+     * The global spatial domain is organized per device in three
+     * sections: the GUARD area contains copies of neighboring
+     * devices (also known as "halo"/"ghost").
+     * The BORDER area is the outermost layer of cells of a device,
+     * equal to what neighboring devices see as GUARD area.
+     * The CORE area is the innermost area of a device. In union with
+     * the BORDER area it defines the "active" spatial domain on a device.
+     *
+     * GuardSize is defined in units of SuperCellSize per dimension.
+     * This setup may need several guard supercells along x, y for the arbitrary order field solver.
+     * Also, Esirkepov current deposition requires at least 2 supercells when supercell size is 2.
+     */
+    using GuardSize = typename mCT::shrinkTo<mCT::Int<4, 4, 1>, simDim>::type;
+
+    /** bytes reserved for species exchange buffer
+     *
+     * This is the default configuration for species exchanges buffer sizes.
+     * The default exchange buffer sizes can be changed per species by adding
+     * the alias exchangeMemCfg with similar members like in DefaultExchangeMemCfg
+     * to its flag list.
+     */
+    struct DefaultExchangeMemCfg
+    {
+        // memory used for a direction
+        static constexpr uint32_t BYTES_EXCHANGE_X = 1 * 1024 * 1024; // 1 MiB
+        static constexpr uint32_t BYTES_EXCHANGE_Y = 3 * 1024 * 1024; // 3 MiB
+        static constexpr uint32_t BYTES_EXCHANGE_Z = 1 * 1024 * 1024; // 1 MiB
+        static constexpr uint32_t BYTES_EDGES = 32 * 1024; // 32 kiB
+        static constexpr uint32_t BYTES_CORNER = 8 * 1024; // 8 kiB
+    };
+
+    /** number of scalar fields that are reserved as temporary fields */
+    constexpr uint32_t fieldTmpNumSlots = 1;
+
+    /** can `FieldTmp` gather neighbor information
+     *
+     * If `true` it is possible to call the method `asyncCommunicationGather()`
+     * to copy data from the border of neighboring GPU into the local guard.
+     * This is also known as building up a "ghost" or "halo" region in domain
+     * decomposition and only necessary for specific algorithms that extend
+     * the basic PIC cycle, e.g. with dependence on derived density or energy fields.
+     */
+    constexpr bool fieldTmpSupportGatherCommunication = true;
+
+} // namespace picongpu
diff --git a/share/picongpu/examples/FieldAbsorberTest/include/picongpu/param/pml.param b/share/picongpu/examples/FieldAbsorberTest/include/picongpu/param/pml.param
index 687848479c..68e2d8d6bd 100644
--- a/share/picongpu/examples/FieldAbsorberTest/include/picongpu/param/pml.param
+++ b/share/picongpu/examples/FieldAbsorberTest/include/picongpu/param/pml.param
@@ -1,4 +1,4 @@
-/* Copyright 2019-2020 Sergei Bastrakov
+/* Copyright 2019-2021 Sergei Bastrakov
  *
  * This file is part of PIConGPU.
  *
@@ -21,7 +21,7 @@
  *
  * Configure the perfectly matched layer (PML).
  *
- * To enable PML use YeePML field solver.
+ * To enable PML use YeePML, LehePML or ArbitraryOrderFDTDPML field solver.
  */
 
 #pragma once
@@ -29,140 +29,129 @@
 
 namespace picongpu
 {
-namespace fields
-{
-namespace maxwellSolver
-{
-namespace yeePML
-{
-
-    /* The parameters in this file are only used if field solver is YeePML.
-     * The original paper on this approach is J.A. Roden, S.D. Gedney.
-     * Convolution PML (CPML): An efficient FDTD implementation of the CFS - PML
-     * for arbitrary media. Microwave and optical technology letters. 27 (5),
-     * 334-339 (2000).
-     * https://doi.org/10.1002/1098-2760(20001205)27:5%3C334::AID-MOP14%3E3.0.CO;2-A
-     * Our implementation based on a more detailed description in section 7.9 of
-     * the book A. Taflove, S.C. Hagness. Computational Electrodynamics.
-     * The Finite-Difference Time-Domain Method. Third Edition. Artech house,
-     * Boston (2005), referred to as [Taflove, Hagness].
-     */
-
-#    ifndef PARAM_PML_SIZE
-#        define PARAM_PML_SIZE 8
-#    endif
-
-    constexpr uint32_t THICKNESS = PARAM_PML_SIZE;
-
-    /** Thickness of the absorbing layer, in number of cells
-     *
-     * PML is located inside the global simulation area, near the outer borders.
-     * Setting size to 0 results in disabling absorption at the corresponding
-     * boundary. Normally thickness is between 6 and 16 cells, with larger
-     * values providing less reflections.
-     * 8 cells should be good enough for most simulations. There are no
-     * requirements on thickness being a multiple of the supercell size.
-     * It is only required that PML is small enough to fit near-boundary local
-     * domains at all time steps.
-     * Unit: number of cells.
-     */
-    constexpr uint32_t NUM_CELLS[ 3 ][ 2 ] = {
-        { THICKNESS, THICKNESS },  // x direction [negative, positive]
-        { THICKNESS, THICKNESS },  // y direction [negative, positive]
-        { THICKNESS, THICKNESS }   // z direction [negative, positive]
-    };
-
-    /** Order of polynomial grading for artificial electric conductivity and
-     *  stretching coefficient
-     *
-     * The conductivity (sigma) is polynomially scaling from 0 at the internal
-     * border of PML to the maximum value (defined below) at the external
-     * border. The stretching coefficient (kappa) scales from 1 to the
-     * corresponding maximum value (defined below) with the same polynomial.
-     * The grading is given in [Taflove, Hagness], eq. (7.60a, b), with
-     * the order denoted 'm'.
-     * Must be >= 0. Normally between 3 and 4, not required to be integer.
-     * Unitless.
-     */
-    constexpr float_64 SIGMA_KAPPA_GRADING_ORDER = 4.0;
-
-    // [Taflove, Hagness], eq. (7.66)
-    constexpr float_64 SIGMA_OPT_SI[ 3 ] = {
-        0.8 * ( SIGMA_KAPPA_GRADING_ORDER + 1.0 ) / ( SI::Z0_SI * SI::CELL_WIDTH_SI ),
-        0.8 * ( SIGMA_KAPPA_GRADING_ORDER + 1.0 ) / ( SI::Z0_SI * SI::CELL_HEIGHT_SI ),
-        0.8 * ( SIGMA_KAPPA_GRADING_ORDER + 1.0 ) / ( SI::Z0_SI * SI::CELL_DEPTH_SI )
-    };
-
-    // Muptiplier to express SIGMA_MAX_SI with SIGMA_OPT_SI
-    constexpr float_64 SIGMA_OPT_MULTIPLIER = 1.0;
-
-    /** Max value of artificial electric conductivity in PML
-     *
-     * Components correspond to directions: element 0 corresponds to absorption
-     * along x direction, 1 = y, 2 = z. Grading is described in comments for
-     * SIGMA_KAPPA_GRADING_ORDER.
-     * Too small values lead to significant reflections from the external
-     * border, too large - to reflections due to discretization errors.
-     * Artificial magnetic permeability will be chosen to perfectly match this.
-     * Must be >= 0. Normally between 0.7 * SIGMA_OPT_SI and 1.1 * SIGMA_OPT_SI.
-     * Unit: siemens / m.
-     */
-    constexpr float_64 SIGMA_MAX_SI[ 3 ] = {
-        SIGMA_OPT_SI[ 0 ] * SIGMA_OPT_MULTIPLIER,
-        SIGMA_OPT_SI[ 1 ] * SIGMA_OPT_MULTIPLIER,
-        SIGMA_OPT_SI[ 2 ] * SIGMA_OPT_MULTIPLIER
-    };
-
-    /** Max value of coordinate stretching coefficient in PML
-     *
-     * Components correspond to directions: element 0 corresponds to absorption
-     * along x direction, 1 = y, 2 = z. Grading is described in comments for
-     * SIGMA_KAPPA_GRADING_ORDER.
-     * Must be >= 1. For relatively homogeneous domains 1.0 is a reasonable value.
-     * Highly elongated domains can have better absorption with values between
-     * 7.0 and 20.0, for example, see section 7.11.2 in [Taflove, Hagness].
-     * Unitless.
-     */
-#    ifndef PARAM_PML_KAPPA_MAX
-#        define PARAM_PML_KAPPA_MAX 1.0
-#    endif
-    constexpr float_64 KAPPA_MAX[ 3 ] = {
-        PARAM_PML_KAPPA_MAX,
-        PARAM_PML_KAPPA_MAX,
-        PARAM_PML_KAPPA_MAX
-    };
-
-    /** Order of polynomial grading for complex frequency shift
-     *
-     * The complex frequency shift (alpha) is polynomially downscaling from the
-     * maximum value (defined below) at the internal border of PML to 0 at the
-     * external border. The grading is given in [Taflove, Hagness], eq. (7.79),
-     * with the order denoted 'm_a'.
-     * Must be >= 0. Normally values are around 1.0.
-     * Unitless.
-     */
-    constexpr float_64 ALPHA_GRADING_ORDER = 1.0;
-
-    /** Complex frequency shift in PML
-     *
-     * Components correspond to directions: element 0 corresponds to absorption
-     * along x direction, 1 = y, 2 = z. Setting it to 0 will make PML behave
-     * as uniaxial PML. Setting it to a positive value helps to attenuate
-     * evanescent modes, but can degrade absorption of propagating modes, as
-     * described in section 7.7 and 7.11.3 in [Taflove, Hagness].
-     * Must be >= 0. Normally values are 0 or between 0.15 and 0.3.
-     * Unit: siemens / m.
-     */
-#    ifndef PARAM_PML_ALPHA_MAX
-#        define PARAM_PML_ALPHA_MAX 0.2
-#    endif
-    constexpr float_64 ALPHA_MAX_SI[ 3 ] = {
-        PARAM_PML_ALPHA_MAX,
-        PARAM_PML_ALPHA_MAX,
-        PARAM_PML_ALPHA_MAX
-    };
-
-} // namespace yeePML
-} // namespace maxwellSolver
-} // namespace fields
+    namespace fields
+    {
+        namespace maxwellSolver
+        {
+            namespace Pml
+            {
+                /* These parameters are only used by the YeePML, LehePML and ArbitraryOrderFDTDPML solvers.
+                 * The original paper on this approach is J.A. Roden, S.D. Gedney.
+                 * Convolution PML (CPML): An efficient FDTD implementation of the CFS - PML
+                 * for arbitrary media. Microwave and optical technology letters. 27 (5),
+                 * 334-339 (2000).
+                 * https://doi.org/10.1002/1098-2760(20001205)27:5%3C334::AID-MOP14%3E3.0.CO;2-A
+                 * Our implementation is based on a more detailed description in section 7.9 of
+                 * the book A. Taflove, S.C. Hagness. Computational Electrodynamics.
+                 * The Finite-Difference Time-Domain Method. Third Edition. Artech house,
+                 * Boston (2005), referred to as [Taflove, Hagness].
+                 */
+
+#ifndef PARAM_PML_SIZE
+#    define PARAM_PML_SIZE 10
+#endif
+
+                constexpr uint32_t THICKNESS = PARAM_PML_SIZE;
+
+                /** Thickness of the absorbing layer, in number of cells
+                 *
+                 * PML is located inside the global simulation area, near the outer borders.
+                 * Setting size to 0 results in disabling absorption at the corresponding
+                 * boundary. Normally thickness is between 6 and 16 cells, with larger
+                 * values providing fewer reflections.
+                 * 8 cells should be good enough for most simulations. There are no
+                 * requirements on thickness being a multiple of the supercell size.
+                 * It is only required that PML is small enough to fit near-boundary local
+                 * domains at all time steps.
+                 * Unit: number of cells.
+                 */
+                constexpr uint32_t NUM_CELLS[3][2] = {
+                    {THICKNESS, THICKNESS}, // x direction [negative, positive]
+                    {THICKNESS, THICKNESS}, // y direction [negative, positive]
+                    {THICKNESS, THICKNESS} // z direction [negative, positive]
+                };
+
+                /** Order of polynomial grading for artificial electric conductivity and
+                 *  stretching coefficient
+                 *
+                 * The conductivity (sigma) is polynomially scaling from 0 at the internal
+                 * border of PML to the maximum value (defined below) at the external
+                 * border. The stretching coefficient (kappa) scales from 1 to the
+                 * corresponding maximum value (defined below) with the same polynomial.
+                 * The grading is given in [Taflove, Hagness], eq. (7.60a, b), with
+                 * the order denoted 'm'.
+                 * Must be >= 0. Normally between 3 and 4, not required to be integer.
+                 * Unitless.
+                 */
+                constexpr float_64 SIGMA_KAPPA_GRADING_ORDER = 4.0;
+
+                // [Taflove, Hagness], eq. (7.66)
+                constexpr float_64 SIGMA_OPT_SI[3]
+                    = {0.8 * (SIGMA_KAPPA_GRADING_ORDER + 1.0) / (SI::Z0_SI * SI::CELL_WIDTH_SI),
+                       0.8 * (SIGMA_KAPPA_GRADING_ORDER + 1.0) / (SI::Z0_SI * SI::CELL_HEIGHT_SI),
+                       0.8 * (SIGMA_KAPPA_GRADING_ORDER + 1.0) / (SI::Z0_SI * SI::CELL_DEPTH_SI)};
+
+                // Multiplier to express SIGMA_MAX_SI in terms of SIGMA_OPT_SI
+                constexpr float_64 SIGMA_OPT_MULTIPLIER = 1.0;
+
+                /** Max value of artificial electric conductivity in PML
+                 *
+                 * Components correspond to directions: element 0 corresponds to absorption
+                 * along x direction, 1 = y, 2 = z. Grading is described in comments for
+                 * SIGMA_KAPPA_GRADING_ORDER.
+                 * Too small values lead to significant reflections from the external
+                 * border, too large - to reflections due to discretization errors.
+                 * Artificial magnetic permeability will be chosen to perfectly match this.
+                 * Must be >= 0. Normally between 0.7 * SIGMA_OPT_SI and 1.1 * SIGMA_OPT_SI.
+                 * Unit: siemens / m.
+                 */
+                constexpr float_64 SIGMA_MAX_SI[3]
+                    = {SIGMA_OPT_SI[0] * SIGMA_OPT_MULTIPLIER,
+                       SIGMA_OPT_SI[1] * SIGMA_OPT_MULTIPLIER,
+                       SIGMA_OPT_SI[2] * SIGMA_OPT_MULTIPLIER};
+
+                /** Max value of coordinate stretching coefficient in PML
+                 *
+                 * Components correspond to directions: element 0 corresponds to absorption
+                 * along x direction, 1 = y, 2 = z. Grading is described in comments for
+                 * SIGMA_KAPPA_GRADING_ORDER.
+                 * Must be >= 1. For relatively homogeneous domains 1.0 is a reasonable value.
+                 * Highly elongated domains can have better absorption with values between
+                 * 7.0 and 20.0, for example, see section 7.11.2 in [Taflove, Hagness].
+                 * Unitless.
+                 */
+#ifndef PARAM_PML_KAPPA_MAX
+#    define PARAM_PML_KAPPA_MAX 1.0
+#endif
+                constexpr float_64 KAPPA_MAX[3] = {PARAM_PML_KAPPA_MAX, PARAM_PML_KAPPA_MAX, PARAM_PML_KAPPA_MAX};
+
+                /** Order of polynomial grading for complex frequency shift
+                 *
+                 * The complex frequency shift (alpha) is polynomially downscaling from the
+                 * maximum value (defined below) at the internal border of PML to 0 at the
+                 * external border. The grading is given in [Taflove, Hagness], eq. (7.79),
+                 * with the order denoted 'm_a'.
+                 * Must be >= 0. Normally values are around 1.0.
+                 * Unitless.
+                 */
+                constexpr float_64 ALPHA_GRADING_ORDER = 1.0;
+
+                /** Complex frequency shift in PML
+                 *
+                 * Components correspond to directions: element 0 corresponds to absorption
+                 * along x direction, 1 = y, 2 = z. Setting it to 0 will make PML behave
+                 * as uniaxial PML. Setting it to a positive value helps to attenuate
+                 * evanescent modes, but can degrade absorption of propagating modes, as
+                 * described in section 7.7 and 7.11.3 in [Taflove, Hagness].
+                 * Must be >= 0. Normally values are 0 or between 0.15 and 0.3.
+                 * Unit: siemens / m.
+                 */
+#ifndef PARAM_PML_ALPHA_MAX
+#    define PARAM_PML_ALPHA_MAX 0.2
+#endif
+                constexpr float_64 ALPHA_MAX_SI[3] = {PARAM_PML_ALPHA_MAX, PARAM_PML_ALPHA_MAX, PARAM_PML_ALPHA_MAX};
+
+            } // namespace Pml
+        } // namespace maxwellSolver
+    } // namespace fields
 } // namespace picongpu
diff --git a/share/picongpu/examples/FoilLCT/README.rst b/share/picongpu/examples/FoilLCT/README.rst
index 9426007de4..a4938e2e6e 100644
--- a/share/picongpu/examples/FoilLCT/README.rst
+++ b/share/picongpu/examples/FoilLCT/README.rst
@@ -7,7 +7,7 @@ FoilLCT: Ion Acceleration from a Liquid-Crystal Target
 .. moduleauthor:: Axel Huebl, T. Kluge
 
 The following example models a laser-ion accelerator in the [TNSA]_ regime.
-An optically over-dense target (:math:`n_\text{max} = 192 n_\text{c}`) consisting of a liquid-crystal material *8CB* (4-octyl-4'-cyanobiphenyl) :math:`C_{21}H_{25}N` is used.
+An optically over-dense target (:math:`n_\text{max} = 192 n_\text{c}`) consisting of a liquid-crystal material *8CB* (4-octyl-4'-cyanobiphenyl) :math:`C_{21}H_{25}N` is used [LCT]_.
 
 Irradiated with a high-power laser pulse with :math:`a_0 = 5` the target is assumed to be partly pre-ionized due to realistic laser contrast and pre-pulses to :math:`C^{2+}`, :math:`H^+` and :math:`N^{2+}` while being slightly expanded on its surfaces (modeled as exponential density slope).
 The overall target is assumed to be initially quasi-neutral and the *8CB* ion components are are not demixed in the surface regions.
@@ -32,3 +32,9 @@ References
        *Energetic proton generation in ultra-intense laser-solid interactions*,
        Physics of Plasmas **8**, 542 (2001),
        https://dx.doi.org/10.1063/1.1333697
+
+.. [LCT]
+       P.L. Poole, L. Obst, G.E. Cochran, J. Metzkes, H.-P. Schlenvoigt, I. Prencipe, T. Kluge, T.E. Cowan, U. Schramm, and D.W. Schumacher.
+       *Laser-driven ion acceleration via target normal sheath acceleration in the relativistic transparency regime*,
+       New Journal of Physics **20**, 013019 (2018),
+       https://dx.doi.org/10.1088/1367-2630/aa9d47
diff --git a/share/picongpu/examples/FoilLCT/bin/plot_charge_density.py b/share/picongpu/examples/FoilLCT/bin/plot_charge_density.py
index acf019aa57..a53df67332 100755
--- a/share/picongpu/examples/FoilLCT/bin/plot_charge_density.py
+++ b/share/picongpu/examples/FoilLCT/bin/plot_charge_density.py
@@ -3,7 +3,7 @@
 """
 This file is part of the PIConGPU.
 
-Copyright 2017-2020 PIConGPU contributors
+Copyright 2017-2021 PIConGPU contributors
 Authors: Axel Huebl
 License: GPLv3+
 """
diff --git a/share/picongpu/examples/FoilLCT/cmakeFlags b/share/picongpu/examples/FoilLCT/cmakeFlags
index f7039acc31..1dd3a5c723 100755
--- a/share/picongpu/examples/FoilLCT/cmakeFlags
+++ b/share/picongpu/examples/FoilLCT/cmakeFlags
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 #
-# Copyright 2013-2020 Axel Huebl, Rene Widera, Richard Pausch
+# Copyright 2013-2021 Axel Huebl, Rene Widera, Richard Pausch, Jakob Trojok
 #
 # This file is part of PIConGPU.
 #
@@ -30,7 +30,7 @@
 #   - increase by 1, no gaps
 
 flags[0]=""
-flags[1]="-DPARAM_OVERWRITES:LIST='-DPARAM_LASERPROFILE=ExpRampWithPrepulse'"
+flags[1]="-DPARAM_OVERWRITES:LIST='-DPARAM_LASERPROFILE=ExpRampWithPrepulse;-DPARAM_IONIZATIONCURRENT=EnergyConservation'"
 
 ################################################################################
 # execution
diff --git a/share/picongpu/examples/FoilLCT/etc/picongpu/4.cfg b/share/picongpu/examples/FoilLCT/etc/picongpu/4.cfg
index 9181056a81..8f8d3da9f3 100644
--- a/share/picongpu/examples/FoilLCT/etc/picongpu/4.cfg
+++ b/share/picongpu/examples/FoilLCT/etc/picongpu/4.cfg
@@ -1,4 +1,4 @@
-# Copyright 2017-2020 Axel Huebl
+# Copyright 2017-2021 Axel Huebl, Franz Poeschel
 #
 # This file is part of PIConGPU.
 #
@@ -71,12 +71,12 @@ TBG_sumEnergy="--fields_energy.period 100 \
 TBG_chargeConservation="--chargeConservation.period 100"
 
 # regular output
-TBG_hdf5="--hdf5.period 250 --hdf5.file simData"
+TBG_openPMD="--openPMD.period 250 --openPMD.file simData --openPMD.ext bp"
 
 TBG_plugins="!TBG_e_histogram !TBG_H_histogram !TBG_C_histogram !TBG_N_histogram \
              !TBG_e_PSypy !TBG_H_PSypy !TBG_C_PSypy !TBG_N_PSypy                 \
              !TBG_sumEnergy !TBG_chargeConservation                              \
-             !TBG_hdf5"
+             !TBG_openPMD"
 
 
 #################################
diff --git a/share/picongpu/examples/FoilLCT/etc/picongpu/4_isaac.cfg b/share/picongpu/examples/FoilLCT/etc/picongpu/4_isaac.cfg
index 11a568a3ac..4fb7c54cc5 100644
--- a/share/picongpu/examples/FoilLCT/etc/picongpu/4_isaac.cfg
+++ b/share/picongpu/examples/FoilLCT/etc/picongpu/4_isaac.cfg
@@ -1,4 +1,4 @@
-# Copyright 2017-2020 Axel Huebl
+# Copyright 2017-2021 Axel Huebl
 #
 # This file is part of PIConGPU.
 #
@@ -46,7 +46,7 @@ TBG_periodic="--periodic 0 0 0"
 ## Section: Optional Variables ##
 #################################
 
-TBG_isaac="--isaac.period 1 --isaac.name foil --isaac.url hypnos5 --isaac.quality 99"
+TBG_isaac="--isaac.period 1 --isaac.name foil --isaac.url hemera4 --isaac.quality 99"
 # futher options:
 #   URL of the server
 #     --isaac.url URL
diff --git a/share/picongpu/examples/FoilLCT/etc/picongpu/8.cfg b/share/picongpu/examples/FoilLCT/etc/picongpu/8.cfg
index 539cf9437b..2567201d1c 100644
--- a/share/picongpu/examples/FoilLCT/etc/picongpu/8.cfg
+++ b/share/picongpu/examples/FoilLCT/etc/picongpu/8.cfg
@@ -1,4 +1,4 @@
-# Copyright 2017-2020 Axel Huebl
+# Copyright 2017-2021 Axel Huebl, Franz Poeschel
 #
 # This file is part of PIConGPU.
 #
@@ -71,12 +71,12 @@ TBG_sumEnergy="--fields_energy.period 100 \
 TBG_chargeConservation="--chargeConservation.period 100"
 
 # regular output
-TBG_hdf5="--hdf5.period 250 --hdf5.file simData"
+TBG_openPMD="--openPMD.period 250 --openPMD.file simData --openPMD.ext bp"
 
 TBG_plugins="!TBG_e_histogram !TBG_H_histogram !TBG_C_histogram !TBG_N_histogram \
              !TBG_e_PSypy !TBG_H_PSypy !TBG_C_PSypy !TBG_N_PSypy                 \
              !TBG_sumEnergy !TBG_chargeConservation                              \
-             !TBG_hdf5"
+             !TBG_openPMD"
 
 
 #################################
diff --git a/share/picongpu/examples/FoilLCT/etc/picongpu/8_isaac.cfg b/share/picongpu/examples/FoilLCT/etc/picongpu/8_isaac.cfg
index c72bccdec9..b5e54b5825 100644
--- a/share/picongpu/examples/FoilLCT/etc/picongpu/8_isaac.cfg
+++ b/share/picongpu/examples/FoilLCT/etc/picongpu/8_isaac.cfg
@@ -1,4 +1,4 @@
-# Copyright 2017-2020 Axel Huebl
+# Copyright 2017-2021 Axel Huebl
 #
 # This file is part of PIConGPU.
 #
@@ -46,7 +46,7 @@ TBG_periodic="--periodic 0 0 0"
 ## Section: Optional Variables ##
 #################################
 
-TBG_isaac="--isaac.period 1 --isaac.name foil --isaac.url hypnos5 --isaac.quality 99"
+TBG_isaac="--isaac.period 1 --isaac.name foil --isaac.url hemera4 --isaac.quality 99"
 # futher options:
 #   URL of the server
 #     --isaac.url URL
diff --git a/share/picongpu/examples/FoilLCT/include/picongpu/param/density.param b/share/picongpu/examples/FoilLCT/include/picongpu/param/density.param
index 806db8eeb9..a86f0b1b75 100644
--- a/share/picongpu/examples/FoilLCT/include/picongpu/param/density.param
+++ b/share/picongpu/examples/FoilLCT/include/picongpu/param/density.param
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Heiko Burau, Rene Widera, Felix Schmitt,
+/* Copyright 2013-2021 Axel Huebl, Heiko Burau, Rene Widera, Felix Schmitt,
  *                     Richard Pausch
  *
  * This file is part of PIConGPU.
@@ -34,81 +34,71 @@
 
 namespace picongpu
 {
-namespace SI
-{
-    /** Base density in particles per m^3 in the density profiles.
-     *
-     * This is often taken as reference maximum density in normalized profiles.
-     * Individual particle species can define a `densityRatio` flag relative
-     * to this value.
-     *
-     * unit: ELEMENTS/m^3
-     *
-     * We take n_e ("fully ionized") as reference density.
-     * Our target material (see speciesDefinition) is a liquid crystal called
-     * 8CB (4'-octyl-4-cyanobiphenyl).
-     */
-     constexpr float_64 nc = 1.11485e21 * 1.e6 / 0.8 / 0.8;
-     constexpr float_64 BASE_DENSITY_SI = 192. * nc;
-
-} // namespace SI
-
-namespace densityProfiles
-{
-    struct FlatFoilWithRampFunctor
+    namespace SI
     {
-        /** This formula uses SI quantities only.
-         *  The profile will be multiplied by BASE_DENSITY_SI.
+        /** Base density in particles per m^3 in the density profiles.
          *
-         * @param position_SI total offset including all slides [meter]
-         * @param cellSize_SI cell sizes [meter]
+         * This is often taken as reference maximum density in normalized profiles.
+         * Individual particle species can define a `densityRatio` flag relative
+         * to this value.
          *
-         * @return float_X density [normalized to 1.0]
+         * unit: ELEMENTS/m^3
+         *
+         * We take n_e ("fully ionized") as reference density.
+         * Our target material (see speciesDefinition) is a liquid crystal called
+         * 8CB (4'-octyl-4-cyanobiphenyl).
          */
-        HDINLINE float_X
-        operator()(
-            const floatD_64& position_SI,
-            const float3_64& cellSize_SI
-        )
+        constexpr float_64 nc = 1.11485e21 * 1.e6 / 0.8 / 0.8;
+        constexpr float_64 BASE_DENSITY_SI = 192. * nc;
+
+    } // namespace SI
+
+    namespace densityProfiles
+    {
+        struct FlatFoilWithRampFunctor
         {
-            // m -> mu
-            const float_64 y( position_SI.y() * 1.e6 );
+            /** This formula uses SI quantities only.
+             *  The profile will be multiplied by BASE_DENSITY_SI.
+             *
+             * @param position_SI total offset including all slides [meter]
+             * @param cellSize_SI cell sizes [meter]
+             *
+             * @return float_X density [normalized to 1.0]
+             */
+            HDINLINE float_X operator()(const floatD_64& position_SI, const float3_64& cellSize_SI)
+            {
+                // m -> mu
+                const float_64 y(position_SI.y() * 1.e6);
 
-            // target begin & end (plateau)
-            constexpr float_64 y0( 0.5 );
-            constexpr float_64 y1( y0 + 1.0 );
-            // exponential pre-expanded density
-            constexpr float_64 L( 10.e-3 );
-            constexpr float_64 L_cutoff( 4. * L );
+                // target begin & end (plateau)
+                constexpr float_64 y0(0.5);
+                constexpr float_64 y1(y0 + 1.0);
+                // exponential pre-expanded density
+                constexpr float_64 L(10.e-3);
+                constexpr float_64 L_cutoff(4. * L);
 
-            float_64 dens = 0.0;
+                float_64 dens = 0.0;
 
-            // upramp
-            if( y < y0 && (y0 - y) < L_cutoff )
-                dens = math::exp( ( y - y0 ) / L );
-            // downramp
-            if( y > y1 && (y - y1) < L_cutoff )
-                dens = math::exp( ( y1 - y ) / L );
-            // plateau
-            if( y >= y0 && y <= y1 )
-                dens = 1.0;
+                // upramp
+                if(y < y0 && (y0 - y) < L_cutoff)
+                    dens = math::exp((y - y0) / L);
+                // downramp
+                if(y > y1 && (y - y1) < L_cutoff)
+                    dens = math::exp((y1 - y) / L);
+                // plateau
+                if(y >= y0 && y <= y1)
+                    dens = 1.0;
 
-            // safety check: all parts of the function MUST be > 0
-            dens *= float_64( dens >= 0.0 );
-            return dens;
-        }
-    };
+                // safety check: all parts of the function MUST be >= 0
+                dens *= float_64(dens >= 0.0);
+                return dens;
+            }
+        };
 
-    // definition of free formula profile
-    using FlatFoilWithRamp = FreeFormulaImpl< FlatFoilWithRampFunctor >;
+        // definition of free formula profile
+        using FlatFoilWithRamp = FreeFormulaImpl<FlatFoilWithRampFunctor>;
 
-    // put probe particles every 4th cell in X, Y(, Z)
-    using ProbeEveryFourthCell = EveryNthCellImpl<
-        mCT::UInt32<
-            4,
-            4,
-            4
-        >
-    >;
-} // namespace densityProfiles
+        // put probe particles every 4th cell in X, Y(, Z)
+        using ProbeEveryFourthCell = EveryNthCellImpl<mCT::UInt32<4, 4, 4>>;
+    } // namespace densityProfiles
 } // namespace picongpu
diff --git a/share/picongpu/examples/FoilLCT/include/picongpu/param/dimension.param b/share/picongpu/examples/FoilLCT/include/picongpu/param/dimension.param
index a69f7998f9..8cb96ebe58 100644
--- a/share/picongpu/examples/FoilLCT/include/picongpu/param/dimension.param
+++ b/share/picongpu/examples/FoilLCT/include/picongpu/param/dimension.param
@@ -1,4 +1,4 @@
-/* Copyright 2014-2020 Axel Huebl
+/* Copyright 2014-2021 Axel Huebl
  *
  * This file is part of PIConGPU.
  *
diff --git a/share/picongpu/examples/FoilLCT/include/picongpu/param/fileOutput.param b/share/picongpu/examples/FoilLCT/include/picongpu/param/fileOutput.param
index 798173d705..c611c5c0f8 100644
--- a/share/picongpu/examples/FoilLCT/include/picongpu/param/fileOutput.param
+++ b/share/picongpu/examples/FoilLCT/include/picongpu/param/fileOutput.param
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Rene Widera, Felix Schmitt,
+/* Copyright 2013-2021 Axel Huebl, Rene Widera, Felix Schmitt,
  *                     Benjamin Worpitz, Richard Pausch
  *
  * This file is part of PIConGPU.
@@ -63,55 +63,34 @@ namespace picongpu
     namespace deriveField = particles::particleToGrid;
 
     /* Density section */
-    using Density_Seq = deriveField::CreateEligible_t<
-        VectorAllSpecies,
-        deriveField::derivedAttributes::Density
-    >;
+    using Density_Seq = deriveField::CreateEligible_t<VectorAllSpecies, deriveField::derivedAttributes::Density>;
 
     /* BoundElectronDensity section */
-    using BoundElectronDensity_Seq = deriveField::CreateEligible_t<
-        VectorAllSpecies,
-        deriveField::derivedAttributes::BoundElectronDensity
-    >;
+    using BoundElectronDensity_Seq
+        = deriveField::CreateEligible_t<VectorAllSpecies, deriveField::derivedAttributes::BoundElectronDensity>;
 
     /* ChargeDensity section */
-    using ChargeDensity_Seq = deriveField::CreateEligible_t<
-        VectorAllSpecies,
-        deriveField::derivedAttributes::ChargeDensity
-    >;
+    using ChargeDensity_Seq
+        = deriveField::CreateEligible_t<VectorAllSpecies, deriveField::derivedAttributes::ChargeDensity>;
 
     /* EnergyDensity section */
-    using EnergyDensity_Seq = deriveField::CreateEligible_t<
-        VectorAllSpecies,
-        deriveField::derivedAttributes::EnergyDensity
-    >;
+    using EnergyDensity_Seq
+        = deriveField::CreateEligible_t<VectorAllSpecies, deriveField::derivedAttributes::EnergyDensity>;
 
     /** FieldTmpSolvers groups all solvers that create data for FieldTmp ******
      *
      * FieldTmpSolvers is used in @see FieldTmp to calculate the exchange size
      */
-    using FieldTmpSolvers = MakeSeq_t<
-        Density_Seq,
-        BoundElectronDensity_Seq,
-        ChargeDensity_Seq,
-        EnergyDensity_Seq
-    >;
+    using FieldTmpSolvers = MakeSeq_t<Density_Seq, BoundElectronDensity_Seq, ChargeDensity_Seq, EnergyDensity_Seq>;
 
 
     /** FileOutputFields: Groups all Fields that shall be dumped *************/
 
     /** Possible native fields: FieldE, FieldB, FieldJ
      */
-    using NativeFileOutputFields = MakeSeq_t<
-        FieldE,
-        FieldB,
-        FieldJ
-    >;
+    using NativeFileOutputFields = MakeSeq_t<FieldE, FieldB, FieldJ>;
 
-    using FileOutputFields = MakeSeq_t<
-        NativeFileOutputFields,
-        FieldTmpSolvers
-    >;
+    using FileOutputFields = MakeSeq_t<NativeFileOutputFields, FieldTmpSolvers>;
 
 
     /** FileOutputParticles: Groups all Species that shall be dumped **********
@@ -119,6 +98,6 @@ namespace picongpu
      * hint: to enable particle output set to
      *   using FileOutputParticles = VectorAllSpecies;
      */
-    using FileOutputParticles = MakeSeq_t< Probes >;
+    using FileOutputParticles = MakeSeq_t<Probes>;
 
-}
+} // namespace picongpu
diff --git a/share/picongpu/examples/FoilLCT/include/picongpu/param/grid.param b/share/picongpu/examples/FoilLCT/include/picongpu/param/grid.param
index 1db5e05440..f50381ff3e 100644
--- a/share/picongpu/examples/FoilLCT/include/picongpu/param/grid.param
+++ b/share/picongpu/examples/FoilLCT/include/picongpu/param/grid.param
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Rene Widera, Benjamin Worpitz
+/* Copyright 2013-2021 Axel Huebl, Rene Widera, Benjamin Worpitz
  *
  * This file is part of PIConGPU.
  *
@@ -61,7 +61,7 @@ namespace picongpu
          *  unit: seconds                       CFL criteria for Yee MW Solver
          *                                             2D: sqrt(2)
          *                                             3D: sqrt(3)   */
-        constexpr float_64 DELTA_T_SI = CELL_WIDTH_SI / ( 1.415 * SPEED_OF_LIGHT_SI );
+        constexpr float_64 DELTA_T_SI = CELL_WIDTH_SI / (1.415 * SPEED_OF_LIGHT_SI);
 
     } // namespace SI
 
@@ -70,9 +70,9 @@ namespace picongpu
      *  unit: none
      */
     constexpr uint32_t ABSORBER_CELLS[3][2] = {
-        {64, 64},  /*x direction [negative,positive]*/
-        {64, 64},  /*y direction [negative,positive]*/
-        {64, 64}   /*z direction [negative,positive]*/
+        {64, 64}, /*x direction [negative,positive]*/
+        {64, 64}, /*y direction [negative,positive]*/
+        {64, 64} /*z direction [negative,positive]*/
     };
 
     /** Define the strength of the absorber for any direction
@@ -82,7 +82,7 @@ namespace picongpu
     constexpr float_X ABSORBER_STRENGTH[3][2] = {
         {1.0e-3, 1.0e-3}, /*x direction [negative,positive]*/
         {1.0e-3, 1.0e-3}, /*y direction [negative,positive]*/
-        {1.0e-3, 1.0e-3}  /*z direction [negative,positive]*/
+        {1.0e-3, 1.0e-3} /*z direction [negative,positive]*/
     };
 
     /** When to move the co-moving window.
diff --git a/share/picongpu/examples/FoilLCT/include/picongpu/param/laser.param b/share/picongpu/examples/FoilLCT/include/picongpu/param/laser.param
index fe85a021e6..6fa4e7b348 100644
--- a/share/picongpu/examples/FoilLCT/include/picongpu/param/laser.param
+++ b/share/picongpu/examples/FoilLCT/include/picongpu/param/laser.param
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Anton Helm, Rene Widera, Richard Pausch,
+/* Copyright 2013-2021 Axel Huebl, Anton Helm, Rene Widera, Richard Pausch,
  *                     Alexander Debus
  *
  * This file is part of PIConGPU.
@@ -48,149 +48,153 @@
 #include <pmacc/ppFunctions.hpp>
 
 #ifndef PARAM_LASERPROFILE
-#define PARAM_LASERPROFILE PlaneWave
+#    define PARAM_LASERPROFILE PlaneWave
 #endif
 
 
 namespace picongpu
 {
-namespace fields
-{
-namespace laserProfiles
-{
-    struct PlaneWaveParam
+    namespace fields
     {
-        /** unit: meter */
-        static constexpr float_64 WAVE_LENGTH_SI = 0.8e-6;
-
-        /** UNITCONV */
-        static constexpr float_64 UNITCONV_A0_to_Amplitude_SI = -2.0 * PI / WAVE_LENGTH_SI * ::picongpu::SI::ELECTRON_MASS_SI * ::picongpu::SI::SPEED_OF_LIGHT_SI * ::picongpu::SI::SPEED_OF_LIGHT_SI / ::picongpu::SI::ELECTRON_CHARGE_SI;
-
-        /** unit: W / m^2 */
-        // calculate: _A0 = 8.549297e-6 * sqrt( Intensity[W/m^2] ) * wavelength[m] (linearly polarized)
-
-        /** unit: none */
-        static constexpr float_64 _A0  = 5.0;
-
-        /** unit: Volt / meter */
-        static constexpr float_64 AMPLITUDE_SI = _A0 * UNITCONV_A0_to_Amplitude_SI;
-
-        /** unit: Volt / meter */
-        //static constexpr float_64 AMPLITUDE_SI = 1.738e13;
-
-        /** The profile of the test Lasers 0 and 2 can be stretched by a
-         *      constexprant area between the up and downramp
-         *  unit: seconds */
-        static constexpr float_64 LASER_NOFOCUS_CONSTANT_SI = 0.0;
-
-        /** Pulse length: sigma of std. gauss for intensity (E^2)
-         *  PULSE_LENGTH_SI = FWHM_of_Intensity   / [ 2*sqrt{ 2* ln(2) } ]
-         *                                          [    2.354820045     ]
-         *  Info:             FWHM_of_Intensity = FWHM_Illumination
-         *                      = what a experimentalist calls "pulse duration"
-         *  unit: seconds (1 sigma) */
-        static constexpr float_64 PULSE_LENGTH_SI = 25.0e-15 / 2.354820045;
-
-        /** The laser pulse will be initialized half of PULSE_INIT times of the PULSE_LENGTH before and after the plateau
-         *  unit: none */
-        static constexpr float_64 RAMP_INIT = 3. * 2.354820045;
-
-        /** cell from top where the laser is initialized
-         *
-         * if `initPlaneY == 0` than the absorber are disabled.
-         * if `initPlaneY > absorbercells negative Y` the negative absorber in y
-         * direction is enabled
-         *
-         * valid ranges:
-         *   - initPlaneY == 0
-         *   - absorber cells negative Y < initPlaneY < cells in y direction of the top gpu
-         */
-        static constexpr uint32_t initPlaneY = 0u;
-
-        /** laser phase shift (no shift: 0.0)
-         *
-         * sin(omega*time + laser_phase): starts with phase=0 at center --> E-field=0 at center
-         *
-         * unit: rad, periodic in 2*pi
-         */
-        static constexpr float_X LASER_PHASE = 0.0;
-
-        /** Available polarisation types
-         */
-        enum PolarisationType
+        namespace laserProfiles
         {
-            LINEAR_X = 1u,
-            LINEAR_Z = 2u,
-            CIRCULAR = 4u,
-        };
-        /** Polarization selection
-         */
-        static constexpr PolarisationType Polarisation = LINEAR_X;
-    };
-
-    struct ExpRampWithPrepulseParam : PlaneWaveParam
-    {
-        /* Laser profile with Gaussian spatial envelope and the following
-         * temporal shape:
-         * A Gaussian peak (optionally lengthened by a plateau) is preceded by
-         * two pieces of exponential preramps, defined by 3 (time, intensity)-
-         * -points.
-         * The first two points get connected by an exponential, the 2nd and
-         * 3rd point are connected by another exponential, which is then
-         * extrapolated to the peak. The Gaussian is added everywhere, but
-         * typically contributes significantly only near the peak.
-         * It is advisable to set the third point far enough from the plateau
-         * (approx 3*FWHM), then the contribution from the Gaussian is
-         * negligible there, and the intensity can be set as measured from the
-         * laser profile.
-         * Optionally a Gaussian prepulse can be added, given by the parameters
-         * of the relative intersity and time point.
-         * The time of the prepulse and the three preramp points are given in
-         * SI, the intensities are given as multiples of the peak intensity.
-         */
-
-        // Intensities of prepulse and exponential preramp
-        static constexpr float_X INT_RATIO_PREPULSE = 0.;
-        static constexpr float_X INT_RATIO_POINT_1 = 1.e-8;
-        static constexpr float_X INT_RATIO_POINT_2 = 1.e-4;
-        static constexpr float_X INT_RATIO_POINT_3 = 1.e-4;
-
-        // time-positions of prepulse and preramps points
-        static constexpr float_64 TIME_PREPULSE_SI = -950.0e-15;
-        static constexpr float_64 TIME_PEAKPULSE_SI = 0.0e-15;
-        static constexpr float_64 TIME_POINT_1_SI = -1000.0e-15;
-        static constexpr float_64 TIME_POINT_2_SI = -300.0e-15;
-        static constexpr float_64 TIME_POINT_3_SI = -100.0e-15;
-
-        /** Pulse length: sigma of std. gauss for intensity (E^2)
-         *  PULSE_LENGTH_SI = FWHM_of_Intensity   / [ 2*sqrt{ 2* ln(2) } ]
-         *                                          [    2.354820045     ]
-         *  Info:             FWHM_of_Intensity = FWHM_Illumination
-         *                      = what a experimentalist calls "pulse duration"
-         *  unit: seconds (1 sigma) */
-        static constexpr float_64 PULSE_LENGTH_SI = 3.0e-14 / 2.35482; // half of the time in which E falls to half its initial value (then I falls to half its value in 15fs, approx 6 wavelengths). Those are 4.8 wavelenghts.
-
-        /** beam waist: distance from the axis where the pulse intensity (E^2)
-         *              decreases to its 1/e^2-th part,
-         *              WO_X_SI is this distance in x-direction
-         *              W0_Z_SI is this distance in z-direction
-         *              if both values are equal, the laser has a circular shape in x-z
-         * W0_SI = FWHM_of_Intensity / sqrt{ 2* ln(2) }
-         *                             [   1.17741    ]
-         *  unit: meter */
-        static constexpr float_64 W0_X_SI = 2.5 * WAVE_LENGTH_SI;
-        static constexpr float_64 W0_Z_SI = W0_X_SI;
-
-        /** The laser pulse will be initialized half of PULSE_INIT times of the PULSE_LENGTH before plateau
-        and half at the end of the plateau
-         *  unit: none */
-        static constexpr float_64 RAMP_INIT = 16.0;
-    };
-
-    //! currently selected laser profile
-    // using Selected = PlaneWave< PlaneWaveParam >;
-    using Selected = PARAM_LASERPROFILE< PMACC_JOIN( PARAM_LASERPROFILE, Param )>;
-
-} // namespace laserProfiles
-} // namespace fields
+            struct PlaneWaveParam
+            {
+                /** laser wavelength, unit: meter */
+                static constexpr float_64 WAVE_LENGTH_SI = 0.8e-6;
+
+                /** conversion factor from the unitless _A0 to AMPLITUDE_SI (Volt / meter) */
+                static constexpr float_64 UNITCONV_A0_to_Amplitude_SI = -2.0 * PI / WAVE_LENGTH_SI
+                    * ::picongpu::SI::ELECTRON_MASS_SI * ::picongpu::SI::SPEED_OF_LIGHT_SI
+                    * ::picongpu::SI::SPEED_OF_LIGHT_SI / ::picongpu::SI::ELECTRON_CHARGE_SI;
+
+                /** unit: W / m^2 */
+                // calculate: _A0 = 8.549297e-6 * sqrt( Intensity[W/m^2] ) * wavelength[m] (linearly polarized)
+
+                /** unit: none */
+                static constexpr float_64 _A0 = 5.0;
+
+                /** unit: Volt / meter */
+                static constexpr float_64 AMPLITUDE_SI = _A0 * UNITCONV_A0_to_Amplitude_SI;
+
+                /** alternative: set the amplitude directly, unit: Volt / meter */
+                // static constexpr float_64 AMPLITUDE_SI = 1.738e13;
+
+                /** The profile of the test Lasers 0 and 2 can be stretched by a
+                 *      constant area between the up and downramp
+                 *  unit: seconds */
+                static constexpr float_64 LASER_NOFOCUS_CONSTANT_SI = 0.0;
+
+                /** Pulse length: sigma of std. gauss for intensity (E^2)
+                 *  PULSE_LENGTH_SI = FWHM_of_Intensity   / [ 2*sqrt{ 2* ln(2) } ]
+                 *                                          [    2.354820045     ]
+                 *  Info:             FWHM_of_Intensity = FWHM_Illumination
+                 *                      = what an experimentalist calls "pulse duration"
+                 *  unit: seconds (1 sigma) */
+                static constexpr float_64 PULSE_LENGTH_SI = 25.0e-15 / 2.354820045;
+
+                /** The laser pulse will be initialized half of PULSE_INIT times of the PULSE_LENGTH before and after
+                 * the plateau; unit: none */
+                static constexpr float_64 RAMP_INIT = 3. * 2.354820045;
+
+                /** cell from top where the laser is initialized
+                 *
+                 * if `initPlaneY == 0` then the absorbers are disabled.
+                 * if `initPlaneY > absorbercells negative Y` the negative absorber in y
+                 * direction is enabled
+                 *
+                 * valid ranges:
+                 *   - initPlaneY == 0
+                 *   - absorber cells negative Y < initPlaneY < cells in y direction of the top gpu
+                 */
+                static constexpr uint32_t initPlaneY = 0u;
+
+                /** laser phase shift (no shift: 0.0)
+                 *
+                 * sin(omega*time + laser_phase): starts with phase=0 at center --> E-field=0 at center
+                 *
+                 * unit: rad, periodic in 2*pi
+                 */
+                static constexpr float_X LASER_PHASE = 0.0;
+
+                /** Available polarisation types
+                 */
+                enum PolarisationType
+                {
+                    LINEAR_X = 1u,
+                    LINEAR_Z = 2u,
+                    CIRCULAR = 4u,
+                };
+                /** Polarisation selection
+                 */
+                static constexpr PolarisationType Polarisation = LINEAR_X;
+            };
+
+            struct ExpRampWithPrepulseParam : PlaneWaveParam
+            {
+                /* Laser profile with Gaussian spatial envelope and the following
+                 * temporal shape:
+                 * A Gaussian peak (optionally lengthened by a plateau) is preceded by
+                 * two pieces of exponential preramps, defined by 3 (time, intensity)
+                 * points.
+                 * The first two points get connected by an exponential, the 2nd and
+                 * 3rd point are connected by another exponential, which is then
+                 * extrapolated to the peak. The Gaussian is added everywhere, but
+                 * typically contributes significantly only near the peak.
+                 * It is advisable to set the third point far enough from the plateau
+                 * (approx 3*FWHM), then the contribution from the Gaussian is
+                 * negligible there, and the intensity can be set as measured from the
+                 * laser profile.
+                 * Optionally a Gaussian prepulse can be added, given by the parameters
+                 * of the relative intensity and time point.
+                 * The time of the prepulse and the three preramp points are given in
+                 * SI, the intensities are given as multiples of the peak intensity.
+                 */
+
+                // Intensities of prepulse and exponential preramp points (multiples of the peak intensity)
+                static constexpr float_X INT_RATIO_PREPULSE = 0.;
+                static constexpr float_X INT_RATIO_POINT_1 = 1.e-8;
+                static constexpr float_X INT_RATIO_POINT_2 = 1.e-4;
+                static constexpr float_X INT_RATIO_POINT_3 = 1.e-4;
+
+                // Time positions of prepulse, peak pulse and preramp points (given in SI)
+                static constexpr float_64 TIME_PREPULSE_SI = -950.0e-15;
+                static constexpr float_64 TIME_PEAKPULSE_SI = 0.0e-15;
+                static constexpr float_64 TIME_POINT_1_SI = -1000.0e-15;
+                static constexpr float_64 TIME_POINT_2_SI = -300.0e-15;
+                static constexpr float_64 TIME_POINT_3_SI = -100.0e-15;
+
+                /** Pulse length: sigma of std. gauss for intensity (E^2)
+                 *  PULSE_LENGTH_SI = FWHM_of_Intensity   / [ 2*sqrt{ 2* ln(2) } ]
+                 *                                          [    2.354820045     ]
+                 *  Info:             FWHM_of_Intensity = FWHM_Illumination
+                 *                      = what an experimentalist calls "pulse duration"
+                 *  unit: seconds (1 sigma) */
+                static constexpr float_64 PULSE_LENGTH_SI = 3.0e-14
+                    / 2.35482; // half of the time in which E falls to half its initial value (then I falls to half its
+                               // value in 15fs, approx 6 wavelengths). Those are 4.8 wavelengths.
+
+                /** beam waist: distance from the axis where the pulse intensity (E^2)
+                 *              decreases to its 1/e^2-th part,
+                 *              W0_X_SI is this distance in x-direction
+                 *              W0_Z_SI is this distance in z-direction
+                 *              if both values are equal, the laser has a circular shape in x-z
+                 * W0_SI = FWHM_of_Intensity / sqrt{ 2* ln(2) }
+                 *                             [   1.17741    ]
+                 *  unit: meter */
+                static constexpr float_64 W0_X_SI = 2.5 * WAVE_LENGTH_SI;
+                static constexpr float_64 W0_Z_SI = W0_X_SI;
+
+                /** The laser pulse will be initialized half of RAMP_INIT times of the PULSE_LENGTH before plateau
+                 *  and half at the end of the plateau
+                 *  unit: none */
+                static constexpr float_64 RAMP_INIT = 16.0;
+            };
+
+            //! currently selected laser profile
+            // using Selected = PlaneWave< PlaneWaveParam >;
+            using Selected = PARAM_LASERPROFILE<PMACC_JOIN(PARAM_LASERPROFILE, Param)>;
+
+        } // namespace laserProfiles
+    } // namespace fields
 } // namespace picongpu
diff --git a/share/picongpu/examples/FoilLCT/include/picongpu/param/memory.param b/share/picongpu/examples/FoilLCT/include/picongpu/param/memory.param
index 6765369121..c403413b56 100644
--- a/share/picongpu/examples/FoilLCT/include/picongpu/param/memory.param
+++ b/share/picongpu/examples/FoilLCT/include/picongpu/param/memory.param
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Rene Widera, Benjamin Worpitz
+/* Copyright 2013-2021 Axel Huebl, Rene Widera, Benjamin Worpitz
  *
  * This file is part of PIConGPU.
  *
@@ -31,10 +31,10 @@
 #include <pmacc/math/Vector.hpp>
 #include <pmacc/mappings/kernel/MappingDescription.hpp>
 
+#include <array>
 
 namespace picongpu
 {
-
     /* We have to hold back 350MiB for gpu-internal operations:
      *   - random number generator
      *   - reduces
@@ -48,10 +48,10 @@ namespace picongpu
      *
      * volume of a superCell must be <= 1024
      */
-    using SuperCellSize = mCT::Int< 16, 16 >;
+    using SuperCellSize = mCT::Int<16, 16>;
 
     /** define mapper which is used for kernel call mappings */
-    using MappingDesc = MappingDescription< simDim, SuperCellSize >;
+    using MappingDesc = MappingDescription<simDim, SuperCellSize>;
 
     /** define the size of the core, border and guard area
      *
@@ -67,10 +67,7 @@ namespace picongpu
      *
      * GuardSize is defined in units of SuperCellSize per dimension.
      */
-    using GuardSize = typename mCT::shrinkTo<
-        mCT::Int< 1, 1, 1 >,
-        simDim
-    >::type;
+    using GuardSize = typename mCT::shrinkTo<mCT::Int<1, 1, 1>, simDim>::type;
 
     /** bytes reserved for species exchange buffer
      *
@@ -87,6 +84,21 @@ namespace picongpu
         static constexpr uint32_t BYTES_EXCHANGE_Z = 3 * 1024 * 1024; // 3 MiB
         static constexpr uint32_t BYTES_EDGES = 128 * 1024; // 128 kiB
         static constexpr uint32_t BYTES_CORNER = 32 * 1024; // 32 kiB
+
+        /** Reference local domain size
+         *
+         * The size of the local domain for which the exchange sizes `BYTES_*` are configured.
+         * The required size of each exchange will be calculated at runtime based on the local domain size and the
+         * reference size. The exchange size will be scaled only up and not down. Zero means that there is no reference
+         * domain size, exchanges will not be scaled.
+         */
+        using REF_LOCAL_DOM_SIZE = mCT::Int<0, 0, 0>;
+        /** Scaling rate per direction.
+         *
+         * 1.0 means it scales linearly with the ratio between the local domain size at runtime and the reference local
+         * domain size.
+         */
+        const std::array<float_X, 3> DIR_SCALING_FACTOR = {{0.0, 0.0, 0.0}};
     };
 
     /** number of scalar fields that are reserved as temporary fields */
diff --git a/share/picongpu/examples/FoilLCT/include/picongpu/param/particle.param b/share/picongpu/examples/FoilLCT/include/picongpu/param/particle.param
index 8ac1a6bc87..06c0005e97 100644
--- a/share/picongpu/examples/FoilLCT/include/picongpu/param/particle.param
+++ b/share/picongpu/examples/FoilLCT/include/picongpu/param/particle.param
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Rene Widera, Benjamin Worpitz,
+/* Copyright 2013-2021 Axel Huebl, Rene Widera, Benjamin Worpitz,
  *                     Richard Pausch
  *
  * This file is part of PIConGPU.
@@ -34,103 +34,95 @@
 
 #include <pmacc/nvidia/functors/Add.hpp>
 #include <pmacc/nvidia/functors/Assign.hpp>
-#include <pmacc/nvidia/rng/distributions/Uniform_float.hpp>
 
 
 namespace picongpu
 {
-namespace particles
-{
-
-    /** a particle with a weighting below MIN_WEIGHTING will not
-     *      be created / will be deleted
-     *
-     *  unit: none
-     *
-     * here we essentially allow any weighting since it has no real meaning in 2D3V
-     */
-    constexpr float_X MIN_WEIGHTING = 0.0000001;
-
-namespace manipulators
-{
-    // ionize ions once by removing one bound electron
-    struct OnceIonizedImpl
-    {
-        template< typename T_Particle >
-        DINLINE void operator()(
-            T_Particle& particle
-        )
-        {
-            constexpr float_X protonNumber = GetAtomicNumbers< T_Particle >::type::numberOfProtons;
-            particle[ boundElectrons_ ] = protonNumber - 1.0_X;
-        }
-    };
-    using OnceIonized = generic::Free< OnceIonizedImpl >;
-
-    //! ionize ions twice
-    struct TwiceIonizedImpl
-    {
-        template< typename T_Particle >
-        DINLINE void operator()(
-            T_Particle& particle
-        )
-        {
-            constexpr float_X protonNumber = GetAtomicNumbers< T_Particle >::type::numberOfProtons;
-            particle[ boundElectrons_ ] = protonNumber - 2._X;
-        }
-    };
-
-    //! definition of TwiceIonizedImpl manipulator
-    using TwiceIonized = generic::Free< TwiceIonizedImpl >;
-
-    //! changes the in-cell position of each particle of a species
-    using RandomPosition = unary::RandomPosition;
-
-} // namespace manipulators
-
-
-namespace startPosition
-{
-    struct RandomParameter6ppc
+    namespace particles
     {
-        /** Count of particles per cell at initial state
+        /** a particle with a weighting below MIN_WEIGHTING will not
+         *      be created / will be deleted
          *
          *  unit: none
-         */
-        static constexpr uint32_t numParticlesPerCell = 6u;
-    };
-    using Random6ppc = RandomImpl< RandomParameter6ppc >;
-
-    // probe particles sit directly in lower corner of the cell
-    CONST_VECTOR(
-        float_X,
-        3,
-        InCellOffset,
-        // each x, y, z in-cell position component in range [0.0, 1.0)
-        0.0,
-        0.0,
-        0.0
-    );
-    struct OnePositionParameter
-    {
-        /** Count of particles per cell at initial state
          *
-         *  unit: none
+         * here we essentially allow any weighting since it has no real meaning in 2D3V
          */
-        static constexpr uint32_t numParticlesPerCell = 1u;
+        constexpr float_X MIN_WEIGHTING = 0.0000001;
 
-        const InCellOffset_t inCellOffset;
-    };
-    using OnePosition = OnePositionImpl< OnePositionParameter >;
-
-} // namespace startPosition
-
-    /** During unit normalization, we assume this is a typical
-     *  number of particles per cell for normalization of weighted
-     *  particle attributes.
-     */
-    constexpr uint32_t TYPICAL_PARTICLES_PER_CELL =
-        startPosition::RandomParameter6ppc::numParticlesPerCell;
+        namespace manipulators
+        {
+            //! set ions to the once-ionized state: boundElectrons = protonNumber - 1
+            struct OnceIonizedImpl
+            {
+                template<typename T_Particle>
+                DINLINE void operator()(T_Particle& particle)
+                {
+                    constexpr float_X protonNumber = GetAtomicNumbers<T_Particle>::type::numberOfProtons;
+                    particle[boundElectrons_] = protonNumber - 1.0_X;
+                }
+            };
+            using OnceIonized = generic::Free<OnceIonizedImpl>;
+
+            //! set ions to the twice-ionized state: boundElectrons = protonNumber - 2
+            struct TwiceIonizedImpl
+            {
+                template<typename T_Particle>
+                DINLINE void operator()(T_Particle& particle)
+                {
+                    constexpr float_X protonNumber = GetAtomicNumbers<T_Particle>::type::numberOfProtons;
+                    particle[boundElectrons_] = protonNumber - 2._X;
+                }
+            };
+
+            //! manipulator type wrapping TwiceIonizedImpl in generic::Free
+            using TwiceIonized = generic::Free<TwiceIonizedImpl>;
+
+            //! changes the in-cell position of each particle of a species
+            using RandomPosition = unary::RandomPosition;
+
+        } // namespace manipulators
+
+
+        namespace startPosition
+        {
+            struct RandomParameter6ppc
+            {
+                /** Count of particles per cell at initial state
+                 *
+                 *  unit: none
+                 */
+                static constexpr uint32_t numParticlesPerCell = 6u;
+            };
+            using Random6ppc = RandomImpl<RandomParameter6ppc>;
+
+            // probe particles sit directly in lower corner of the cell
+            CONST_VECTOR(
+                float_X,
+                3,
+                InCellOffset,
+                // each x, y, z in-cell position component in range [0.0, 1.0)
+                0.0,
+                0.0,
+                0.0);
+            struct OnePositionParameter
+            {
+                /** Count of particles per cell at initial state
+                 *
+                 *  unit: none
+                 */
+                static constexpr uint32_t numParticlesPerCell = 1u;
+
+                const InCellOffset_t inCellOffset;
+            };
+            using OnePosition = OnePositionImpl<OnePositionParameter>;
+
+        } // namespace startPosition
+
+        /** During unit normalization, we assume this is a typical
+         *  number of particles per cell for normalization of weighted
+         *  particle attributes.
+         */
+        constexpr uint32_t TYPICAL_PARTICLES_PER_CELL = startPosition::RandomParameter6ppc::numParticlesPerCell;
 
-} // namespace particles
+    } // namespace particles
 } // namespace picongpu
diff --git a/share/picongpu/examples/FoilLCT/include/picongpu/param/speciesDefinition.param b/share/picongpu/examples/FoilLCT/include/picongpu/param/speciesDefinition.param
index 1e19dab033..9ba99dce09 100644
--- a/share/picongpu/examples/FoilLCT/include/picongpu/param/speciesDefinition.param
+++ b/share/picongpu/examples/FoilLCT/include/picongpu/param/speciesDefinition.param
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Rene Widera, Benjamin Worpitz, Heiko Burau, Axel Huebl
+/* Copyright 2013-2021 Rene Widera, Benjamin Worpitz, Heiko Burau, Axel Huebl
  *
  * This file is part of PIConGPU.
  *
@@ -44,188 +44,141 @@
 
 namespace picongpu
 {
-
-/*########################### define particle attributes #####################*/
-
-//! describe attributes of a particle
-using DefaultParticleAttributes = MakeSeq_t<
-    position< position_pic >,
-    momentum,
-    weighting
->;
-
-//! ions also need to have a boundElectrons attribute for ionization
-using IonParticleAttributes = MakeSeq_t<
-    DefaultParticleAttributes,
-    boundElectrons
->;
-
-/*########################### end particle attributes ########################*/
-
-/*########################### define species #################################*/
-
-/*--------------------------- electrons --------------------------------------*/
-
-/* ratio relative to BASE_CHARGE and BASE_MASS */
-value_identifier( float_X, MassRatioElectrons, 1.0 );
-value_identifier( float_X, ChargeRatioElectrons, 1.0 );
-
-using ParticleFlagsElectrons = MakeSeq_t<
-    particlePusher< UsedParticlePusher >,
-    shape< UsedParticleShape >,
-    interpolation< UsedField2Particle >,
-    current< UsedParticleCurrentSolver >,
-    massRatio< MassRatioElectrons >,
-    chargeRatio< ChargeRatioElectrons >
->;
-
-/* define species electrons */
-using Electrons = Particles<
-    PMACC_CSTRING( "e" ),
-    ParticleFlagsElectrons,
-    DefaultParticleAttributes
->;
-
-/*--------------------------- H+ --------------------------------------------*/
-
-/* ratio relative to BASE_CHARGE and BASE_MASS */
-value_identifier( float_X, MassRatioHydrogen, 1836.152672 );
-value_identifier( float_X, ChargeRatioHydrogen, -1.0 );
-
-/* ratio relative to BASE_DENSITY (n_e) */
-value_identifier( float_X, DensityRatioHydrogen, 25. / 158. );
-
-using ParticleFlagsHydrogen = MakeSeq_t<
-    particlePusher< UsedParticlePusher >,
-    shape< UsedParticleShape >,
-    interpolation< UsedField2Particle >,
-    current< UsedParticleCurrentSolver >,
-    massRatio< MassRatioHydrogen >,
-    chargeRatio< ChargeRatioHydrogen >,
-    densityRatio< DensityRatioHydrogen >,
-    atomicNumbers< ionization::atomicNumbers::Hydrogen_t >,
-    ionizationEnergies< ionization::energies::AU::Hydrogen_t >,
-    effectiveNuclearCharge< ionization::effectiveNuclearCharge::Hydrogen_t >,
-    ionizers<
-        MakeSeq_t<
-            particles::ionization::BSIEffectiveZ< Electrons >,
-            particles::ionization::ADKLinPol< Electrons >,
-            particles::ionization::ThomasFermi< Electrons >
-        >
-    >
->;
-
-/* define species Hydrogen */
-using Hydrogen = Particles<
-    PMACC_CSTRING( "H" ),
-    ParticleFlagsHydrogen,
-    IonParticleAttributes
->;
-
-/*--------------------------- C ---------------------------------------------*/
-
-/* ratio relative to BASE_CHARGE and BASE_MASS */
-value_identifier( float_X, MassRatioCarbon, 22032.0 );
-value_identifier( float_X, ChargeRatioCarbon, -6.0 );
-
-/* ratio relative to BASE_DENSITY (n_e) */
-value_identifier( float_X, DensityRatioCarbon, 21. / 158. );
-
-using ParticleFlagsCarbon = MakeSeq_t<
-    particlePusher< UsedParticlePusher >,
-    shape< UsedParticleShape >,
-    interpolation< UsedField2Particle >,
-    current< UsedParticleCurrentSolver >,
-    massRatio< MassRatioCarbon >,
-    chargeRatio< ChargeRatioCarbon >,
-    densityRatio< DensityRatioCarbon >,
-    atomicNumbers< ionization::atomicNumbers::Carbon_t >,
-    ionizationEnergies< ionization::energies::AU::Carbon_t >,
-    effectiveNuclearCharge< ionization::effectiveNuclearCharge::Carbon_t >,
-    ionizers<
-        MakeSeq_t<
-            particles::ionization::BSIEffectiveZ< Electrons >,
-            particles::ionization::ADKLinPol< Electrons >,
-            particles::ionization::ThomasFermi< Electrons >
-        >
-    >
->;
-
-/* define species Carbon */
-using Carbon = Particles<
-    PMACC_CSTRING( "C" ),
-    ParticleFlagsCarbon,
-    IonParticleAttributes
->;
-
-/*--------------------------- N ---------------------------------------------*/
-
-/* ratio relative to BASE_CHARGE and BASE_MASS */
-value_identifier( float_X, MassRatioNitrogen, 25716.852 );
-value_identifier( float_X, ChargeRatioNitrogen, -7.0 );
-
-/* ratio relative to BASE_DENSITY (n_e) */
-value_identifier( float_X, DensityRatioNitrogen, 1. / 158. );
-
-using ParticleFlagsNitrogen = MakeSeq_t<
-    particlePusher< UsedParticlePusher >,
-    shape< UsedParticleShape >,
-    interpolation< UsedField2Particle >,
-    current< UsedParticleCurrentSolver >,
-    massRatio< MassRatioNitrogen >,
-    chargeRatio< ChargeRatioNitrogen >,
-    densityRatio< DensityRatioNitrogen >,
-    atomicNumbers< ionization::atomicNumbers::Nitrogen_t >,
-    ionizationEnergies< ionization::energies::AU::Nitrogen_t >,
-    effectiveNuclearCharge< ionization::effectiveNuclearCharge::Nitrogen_t >,
-    ionizers<
-        MakeSeq_t<
-            particles::ionization::BSIEffectiveZ< Electrons >,
-            particles::ionization::ADKLinPol< Electrons >,
-            particles::ionization::ThomasFermi< Electrons >
-        >
-    >
->;
-
-/* define species Nitrogen */
-using Nitrogen = Particles<
-    PMACC_CSTRING( "N" ),
-    ParticleFlagsNitrogen,
-    IonParticleAttributes
->;
-
-/*--------------------------- Probe Particles -------------------------------*/
-
-using ParticleFlagsProbes = MakeSeq_t<
-    particlePusher< particles::pusher::Probe >,
-    shape< UsedParticleShape >,
-    interpolation< UsedField2Particle >
->;
-
-/* define species Probe */
-using Probes = Particles<
-    PMACC_CSTRING( "probe" ),
-    ParticleFlagsProbes,
-    MakeSeq_t<
-        position< position_pic >,
-        probeB,
-        probeE
-    >
->;
-
-/*########################### end species ####################################*/
-
-/** All known particle species of the simulation
- *
- * List all defined particle species from above in this list
- * to make them available to the PIC algorithm.
- */
-using VectorAllSpecies = MakeSeq_t<
-    Electrons,
-    Hydrogen,
-    Carbon,
-    Nitrogen,
-    Probes
->;
+    /*########################### define particle attributes #####################*/
+
+    //! describe attributes of a particle
+    using DefaultParticleAttributes = MakeSeq_t<position<position_pic>, momentum, weighting>;
+
+    //! ions also need to have a boundElectrons attribute for ionization
+    using IonParticleAttributes = MakeSeq_t<DefaultParticleAttributes, boundElectrons>;
+
+    /*########################### end particle attributes ########################*/
+
+    /*########################### define species #################################*/
+
+    /*--------------------------- electrons --------------------------------------*/
+
+    /* ratio relative to BASE_CHARGE and BASE_MASS */
+    value_identifier(float_X, MassRatioElectrons, 1.0);
+    value_identifier(float_X, ChargeRatioElectrons, 1.0);
+
+    using ParticleFlagsElectrons = MakeSeq_t<
+        particlePusher<UsedParticlePusher>,
+        shape<UsedParticleShape>,
+        interpolation<UsedField2Particle>,
+        current<UsedParticleCurrentSolver>,
+        massRatio<MassRatioElectrons>,
+        chargeRatio<ChargeRatioElectrons>>;
+
+    /* define species electrons */
+    using Electrons = Particles<PMACC_CSTRING("e"), ParticleFlagsElectrons, DefaultParticleAttributes>;
+
+    /*--------------------------- H+ --------------------------------------------*/
+
+    /* ratio relative to BASE_CHARGE and BASE_MASS */
+    value_identifier(float_X, MassRatioHydrogen, 1836.152672);
+    value_identifier(float_X, ChargeRatioHydrogen, -1.0);
+
+    /* ratio relative to BASE_DENSITY (n_e) */
+    value_identifier(float_X, DensityRatioHydrogen, 25. / 158.);
+
+
+#ifndef PARAM_IONIZATIONCURRENT
+#    define PARAM_IONIZATIONCURRENT None
+#endif
+    using ParticleFlagsHydrogen = MakeSeq_t<
+        particlePusher<UsedParticlePusher>,
+        shape<UsedParticleShape>,
+        interpolation<UsedField2Particle>,
+        current<UsedParticleCurrentSolver>,
+        massRatio<MassRatioHydrogen>,
+        chargeRatio<ChargeRatioHydrogen>,
+        densityRatio<DensityRatioHydrogen>,
+        atomicNumbers<ionization::atomicNumbers::Hydrogen_t>,
+        ionizationEnergies<ionization::energies::AU::Hydrogen_t>,
+        effectiveNuclearCharge<ionization::effectiveNuclearCharge::Hydrogen_t>,
+        ionizers<MakeSeq_t<
+            particles::ionization::BSIEffectiveZ<Electrons, particles::ionization::current::PARAM_IONIZATIONCURRENT>,
+            particles::ionization::ADKLinPol<Electrons, particles::ionization::current::PARAM_IONIZATIONCURRENT>,
+            particles::ionization::ThomasFermi<Electrons>>>>;
+
+    /* define species Hydrogen */
+    using Hydrogen = Particles<PMACC_CSTRING("H"), ParticleFlagsHydrogen, IonParticleAttributes>;
+
+    /*--------------------------- C ---------------------------------------------*/
+
+    /* ratio relative to BASE_CHARGE and BASE_MASS */
+    value_identifier(float_X, MassRatioCarbon, 22032.0);
+    value_identifier(float_X, ChargeRatioCarbon, -6.0);
+
+    /* ratio relative to BASE_DENSITY (n_e) */
+    value_identifier(float_X, DensityRatioCarbon, 21. / 158.);
+
+    using ParticleFlagsCarbon = MakeSeq_t<
+        particlePusher<UsedParticlePusher>,
+        shape<UsedParticleShape>,
+        interpolation<UsedField2Particle>,
+        current<UsedParticleCurrentSolver>,
+        massRatio<MassRatioCarbon>,
+        chargeRatio<ChargeRatioCarbon>,
+        densityRatio<DensityRatioCarbon>,
+        atomicNumbers<ionization::atomicNumbers::Carbon_t>,
+        ionizationEnergies<ionization::energies::AU::Carbon_t>,
+        effectiveNuclearCharge<ionization::effectiveNuclearCharge::Carbon_t>,
+        ionizers<MakeSeq_t<
+            particles::ionization::BSIEffectiveZ<Electrons, particles::ionization::current::PARAM_IONIZATIONCURRENT>,
+            particles::ionization::ADKLinPol<Electrons, particles::ionization::current::PARAM_IONIZATIONCURRENT>,
+            particles::ionization::ThomasFermi<Electrons>>>>;
+
+    /* define species Carbon */
+    using Carbon = Particles<PMACC_CSTRING("C"), ParticleFlagsCarbon, IonParticleAttributes>;
+
+    /*--------------------------- N ---------------------------------------------*/
+
+    /* ratio relative to BASE_CHARGE and BASE_MASS */
+    value_identifier(float_X, MassRatioNitrogen, 25716.852);
+    value_identifier(float_X, ChargeRatioNitrogen, -7.0);
+
+    /* ratio relative to BASE_DENSITY (n_e) */
+    value_identifier(float_X, DensityRatioNitrogen, 1. / 158.);
+
+    using ParticleFlagsNitrogen = MakeSeq_t<
+        particlePusher<UsedParticlePusher>,
+        shape<UsedParticleShape>,
+        interpolation<UsedField2Particle>,
+        current<UsedParticleCurrentSolver>,
+        massRatio<MassRatioNitrogen>,
+        chargeRatio<ChargeRatioNitrogen>,
+        densityRatio<DensityRatioNitrogen>,
+        atomicNumbers<ionization::atomicNumbers::Nitrogen_t>,
+        ionizationEnergies<ionization::energies::AU::Nitrogen_t>,
+        effectiveNuclearCharge<ionization::effectiveNuclearCharge::Nitrogen_t>,
+        ionizers<MakeSeq_t<
+            particles::ionization::BSIEffectiveZ<Electrons, particles::ionization::current::PARAM_IONIZATIONCURRENT>,
+            particles::ionization::ADKLinPol<Electrons, particles::ionization::current::PARAM_IONIZATIONCURRENT>,
+            particles::ionization::ThomasFermi<Electrons>>>>;
+
+    /* define species Nitrogen */
+    using Nitrogen = Particles<PMACC_CSTRING("N"), ParticleFlagsNitrogen, IonParticleAttributes>;
+
+    /*--------------------------- Probe Particles -------------------------------*/
+
+    using ParticleFlagsProbes = MakeSeq_t<
+        particlePusher<particles::pusher::Probe>,
+        shape<UsedParticleShape>,
+        interpolation<UsedField2Particle>>;
+
+    /* define species Probe */
+    using Probes
+        = Particles<PMACC_CSTRING("probe"), ParticleFlagsProbes, MakeSeq_t<position<position_pic>, probeB, probeE>>;
+
+    /*########################### end species ####################################*/
+
+    /** All known particle species of the simulation
+     *
+     * List all defined particle species from above in this list
+     * to make them available to the PIC algorithm.
+     */
+    using VectorAllSpecies = MakeSeq_t<Electrons, Hydrogen, Carbon, Nitrogen, Probes>;
 
 } // namespace picongpu
diff --git a/share/picongpu/examples/FoilLCT/include/picongpu/param/speciesInitialization.param b/share/picongpu/examples/FoilLCT/include/picongpu/param/speciesInitialization.param
index 878da563f8..0f234db171 100644
--- a/share/picongpu/examples/FoilLCT/include/picongpu/param/speciesInitialization.param
+++ b/share/picongpu/examples/FoilLCT/include/picongpu/param/speciesInitialization.param
@@ -1,4 +1,4 @@
-/* Copyright 2015-2020 Rene Widera, Axel Huebl
+/* Copyright 2015-2021 Rene Widera, Axel Huebl
  *
  * This file is part of PIConGPU.
  *
@@ -33,73 +33,32 @@
 
 namespace picongpu
 {
-namespace particles
-{
-
-    /** InitPipeline defines in which order species are initialized
-     *
-     * the functors are called in order (from first to last functor)
-     */
-    using InitPipeline = bmpl::vector<
-        CreateDensity<
-            densityProfiles::FlatFoilWithRamp,
-            startPosition::Random6ppc,
-            Hydrogen
-        >,
-        /* derive the other two ion species and adjust their weighting to have always all
-         * three of macro ions present in a cell, even in cut-off regions of the density profile */
-        ManipulateDerive<
-            manipulators::binary::DensityWeighting,
-            Hydrogen,
-            Carbon
-        >,
-        ManipulateDerive<
-            manipulators::binary::DensityWeighting,
-            Hydrogen,
-            Nitrogen
-        >,
-        // randomize C & N in-cell
-        Manipulate<
-            manipulators::unary::RandomPosition,
-            Carbon
-        >,
-        Manipulate<
-            manipulators::unary::RandomPosition,
-            Nitrogen
-        >,
-        // partial pre-ionization: set bound electrons for C2+ & N2+
-        Manipulate<
-            manipulators::TwiceIonized,
-            Carbon
-        >,
-        // note: boundElectrons default is 0, so Hydrogen's default is H+
-        Manipulate<
-            manipulators::TwiceIonized,
-            Nitrogen
-        >,
-        // partial pre-ionization: create free electrons
-        Derive<
-            Hydrogen,
-            Electrons
-        >,
-        ManipulateDerive<
-            manipulators::binary::UnboundElectronsTimesWeighting,
-            Carbon,
-            Electrons
-        >,
-        ManipulateDerive<
-            manipulators::binary::UnboundElectronsTimesWeighting,
-            Nitrogen,
-            Electrons
-        >,
-        /* create non-physical "probe" particles that sit in every 4x4x4th cell
-         * and monitor the electro-magnetic fields */
-        CreateDensity<
-            densityProfiles::ProbeEveryFourthCell,
-            startPosition::OnePosition,
-            Probes
-        >
-    >;
+    namespace particles
+    {
+        /** InitPipeline defines in which order species are initialized
+         *
+         * the functors are called in order (from first to last functor)
+         */
+        using InitPipeline = bmpl::vector<
+            CreateDensity<densityProfiles::FlatFoilWithRamp, startPosition::Random6ppc, Hydrogen>,
+            /* derive the other two ion species and adjust their weighting to have always all
+             * three of macro ions present in a cell, even in cut-off regions of the density profile */
+            ManipulateDerive<manipulators::binary::DensityWeighting, Hydrogen, Carbon>,
+            ManipulateDerive<manipulators::binary::DensityWeighting, Hydrogen, Nitrogen>,
+            // randomize C & N in-cell
+            Manipulate<manipulators::unary::RandomPosition, Carbon>,
+            Manipulate<manipulators::unary::RandomPosition, Nitrogen>,
+            // partial pre-ionization: set bound electrons for C2+ & N2+
+            Manipulate<manipulators::TwiceIonized, Carbon>,
+            // note: boundElectrons default is 0, so Hydrogen's default is H+
+            Manipulate<manipulators::TwiceIonized, Nitrogen>,
+            // partial pre-ionization: create free electrons
+            Derive<Hydrogen, Electrons>,
+            ManipulateDerive<manipulators::binary::UnboundElectronsTimesWeighting, Carbon, Electrons>,
+            ManipulateDerive<manipulators::binary::UnboundElectronsTimesWeighting, Nitrogen, Electrons>,
+            /* create non-physical "probe" particles that sit in every 4x4x4th cell
+             * and monitor the electro-magnetic fields */
+            CreateDensity<densityProfiles::ProbeEveryFourthCell, startPosition::OnePosition, Probes>>;
 
-} // namespace particles
+    } // namespace particles
 } // namespace picongpu
diff --git a/share/picongpu/examples/KelvinHelmholtz/cmakeFlags b/share/picongpu/examples/KelvinHelmholtz/cmakeFlags
index ec0ffe7464..35c8e64f65 100755
--- a/share/picongpu/examples/KelvinHelmholtz/cmakeFlags
+++ b/share/picongpu/examples/KelvinHelmholtz/cmakeFlags
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 #
-# Copyright 2013-2020 Axel Huebl, Rene Widera
+# Copyright 2013-2021 Axel Huebl, Rene Widera
 #
 # This file is part of PIConGPU.
 #
@@ -32,11 +32,6 @@
 flags[0]=""
 flags[1]="-DPARAM_OVERWRITES:LIST='-DPARAM_DIMENSION=DIM2'"
 flags[2]="-DPARAM_OVERWRITES:LIST='-DPARAM_RADIATION=1'"
-flags[3]="-DPARAM_OVERWRITES:LIST='-DPARAM_CURRENTSOLVER=EmZ<UsedParticleShape>;-DPARAM_PARTICLESHAPE=CIC'"
-flags[4]="-DPARAM_OVERWRITES:LIST='-DPARAM_CURRENTSOLVER=EmZ<UsedParticleShape>;-DPARAM_PARTICLESHAPE=PCS'"
-flags[5]="-DPARAM_OVERWRITES:LIST='-DPARAM_CURRENTSOLVER=EmZ<UsedParticleShape>;-DPARAM_PARTICLESHAPE=TSC'"
-flags[6]="-DPARAM_OVERWRITES:LIST='-DPARAM_CURRENTSOLVER=EmZ<UsedParticleShape>;-DPARAM_PARTICLESHAPE=TSC;-DPARAM_DIMENSION=DIM2'"
-flags[7]="-DPARAM_OVERWRITES:LIST='-DPARAM_FIELDSOLVER=DirSplitting;-DPARAM_CURRENTINTERPOLATION=NoneDS'"
 
 
 ################################################################################
diff --git a/share/picongpu/examples/KelvinHelmholtz/etc/picongpu/1.cfg b/share/picongpu/examples/KelvinHelmholtz/etc/picongpu/1.cfg
index 4cc3fb3302..0d5bfd71f6 100644
--- a/share/picongpu/examples/KelvinHelmholtz/etc/picongpu/1.cfg
+++ b/share/picongpu/examples/KelvinHelmholtz/etc/picongpu/1.cfg
@@ -1,4 +1,4 @@
-# Copyright 2013-2020 Rene Widera, Felix Schmitt, Axel Huebl
+# Copyright 2013-2021 Rene Widera, Felix Schmitt, Axel Huebl
 #
 # This file is part of PIConGPU.
 #
diff --git a/share/picongpu/examples/KelvinHelmholtz/etc/picongpu/16.cfg b/share/picongpu/examples/KelvinHelmholtz/etc/picongpu/16.cfg
index abbe7abd37..89a94554b4 100644
--- a/share/picongpu/examples/KelvinHelmholtz/etc/picongpu/16.cfg
+++ b/share/picongpu/examples/KelvinHelmholtz/etc/picongpu/16.cfg
@@ -1,4 +1,4 @@
-# Copyright 2013-2020 Axel Huebl, Rene Widera, Felix Schmitt
+# Copyright 2013-2021 Axel Huebl, Rene Widera, Felix Schmitt
 #
 # This file is part of PIConGPU.
 #
@@ -46,7 +46,10 @@ TBG_periodic="--periodic 1 1 1"
 ## Section: Optional Variables ##
 #################################
 
-TBG_hdf5="--hdf5.period 250 --hdf5.file simData"
+# file I/O with openPMD-HDF5
+TBG_openPMD="--openPMD.period 250   \
+             --openPMD.file simData \
+             --openPMD.ext h5"
 
 TBG_pngYZ="--e_png.period 10 --e_png.axis yz --e_png.slicePoint 0.5 --e_png.folder pngElectronsYZ"
 TBG_pngYX="--e_png.period 10 --e_png.axis yx --e_png.slicePoint 0.5 --e_png.folder pngElectronsYX"
@@ -58,16 +61,16 @@ TBG_ipngYX="--i_png.period 10 --i_png.axis yx --i_png.slicePoint 0.5 --i_png.fol
 TBG_eBin="--e_energyHistogram.period 100 --e_energyHistogram.filter all --e_energyHistogram.binCount 1024 --e_energyHistogram.minEnergy 0 --e_energyHistogram.maxEnergy 5000"
 TBG_iBin="--i_energyHistogram.period 100 --i_energyHistogram.filter all --i_energyHistogram.binCount 1024 --i_energyHistogram.minEnergy 0 --i_energyHistogram.maxEnergy 2000000"
 
-TBG_plugins="!TBG_ipngYZ                   \
+TBG_plugins="!TBG_ipngYZ                    \
               !TBG_ipngYX                   \
               !TBG_eBin                     \
               !TBG_iBin                     \
               !TBG_pngYX                    \
               !TBG_pngYZ                    \
-              !TBG_hdf5                     \
+              !TBG_openPMD                  \
               --i_macroParticlesCount.period 100         \
               --e_macroParticlesCount.period 100         \
-              --fields_energy.period 10     \
+              --fields_energy.period 10                  \
               --e_energy.period 10 --e_energy.filter all \
               --i_energy.period 10 --i_energy.filter all"
 
diff --git a/share/picongpu/examples/KelvinHelmholtz/etc/picongpu/1_bench.cfg b/share/picongpu/examples/KelvinHelmholtz/etc/picongpu/1_bench.cfg
index b44e720184..76b8fa274f 100644
--- a/share/picongpu/examples/KelvinHelmholtz/etc/picongpu/1_bench.cfg
+++ b/share/picongpu/examples/KelvinHelmholtz/etc/picongpu/1_bench.cfg
@@ -1,4 +1,4 @@
-# Copyright 2013-2020 Rene Widera, Felix Schmitt, Axel Huebl
+# Copyright 2013-2021 Rene Widera, Felix Schmitt, Axel Huebl
 #
 # This file is part of PIConGPU.
 #
diff --git a/share/picongpu/examples/KelvinHelmholtz/etc/picongpu/4.cfg b/share/picongpu/examples/KelvinHelmholtz/etc/picongpu/4.cfg
index 993db01f7b..531dfb53f2 100644
--- a/share/picongpu/examples/KelvinHelmholtz/etc/picongpu/4.cfg
+++ b/share/picongpu/examples/KelvinHelmholtz/etc/picongpu/4.cfg
@@ -1,4 +1,4 @@
-# Copyright 2013-2020 Rene Widera, Felix Schmitt, Axel Huebl
+# Copyright 2013-2021 Rene Widera, Felix Schmitt, Axel Huebl
 #
 # This file is part of PIConGPU.
 #
diff --git a/share/picongpu/examples/KelvinHelmholtz/etc/picongpu/4_bench.cfg b/share/picongpu/examples/KelvinHelmholtz/etc/picongpu/4_bench.cfg
index 257defa5a6..945f39b161 100644
--- a/share/picongpu/examples/KelvinHelmholtz/etc/picongpu/4_bench.cfg
+++ b/share/picongpu/examples/KelvinHelmholtz/etc/picongpu/4_bench.cfg
@@ -1,4 +1,4 @@
-# Copyright 2013-2020 Rene Widera, Felix Schmitt, Axel Huebl
+# Copyright 2013-2021 Rene Widera, Felix Schmitt, Axel Huebl
 #
 # This file is part of PIConGPU.
 #
diff --git a/share/picongpu/examples/KelvinHelmholtz/etc/picongpu/8_bench.cfg b/share/picongpu/examples/KelvinHelmholtz/etc/picongpu/8_bench.cfg
index 11c88c5396..8c8bb09973 100644
--- a/share/picongpu/examples/KelvinHelmholtz/etc/picongpu/8_bench.cfg
+++ b/share/picongpu/examples/KelvinHelmholtz/etc/picongpu/8_bench.cfg
@@ -1,4 +1,4 @@
-# Copyright 2013-2020 Rene Widera, Felix Schmitt, Axel Huebl
+# Copyright 2013-2021 Rene Widera, Felix Schmitt, Axel Huebl
 #
 # This file is part of PIConGPU.
 #
diff --git a/share/picongpu/examples/KelvinHelmholtz/include/picongpu/param/density.param b/share/picongpu/examples/KelvinHelmholtz/include/picongpu/param/density.param
index 9edc35a15e..bed7ea6308 100644
--- a/share/picongpu/examples/KelvinHelmholtz/include/picongpu/param/density.param
+++ b/share/picongpu/examples/KelvinHelmholtz/include/picongpu/param/density.param
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Heiko Burau, Rene Widera, Felix Schmitt,
+/* Copyright 2013-2021 Axel Huebl, Heiko Burau, Rene Widera, Felix Schmitt,
  *                     Richard Pausch
  *
  * This file is part of PIConGPU.
@@ -25,22 +25,22 @@
 
 namespace picongpu
 {
-namespace SI
-{
-    /** Base density in particles per m^3 in the density profiles.
-     *
-     * This is often taken as reference maximum density in normalized profiles.
-     * Individual particle species can define a `densityRatio` flag relative
-     * to this value.
-     *
-     * unit: ELEMENTS/m^3
-     */
-    constexpr float_64 BASE_DENSITY_SI = 1.e25;
-}
+    namespace SI
+    {
+        /** Base density in particles per m^3 in the density profiles.
+         *
+         * This is often taken as reference maximum density in normalized profiles.
+         * Individual particle species can define a `densityRatio` flag relative
+         * to this value.
+         *
+         * unit: ELEMENTS/m^3
+         */
+        constexpr float_64 BASE_DENSITY_SI = 1.e25;
+    } // namespace SI
 
-namespace densityProfiles
-{
-    /* definition of homogenous profile */
-    using Homogenous = HomogenousImpl;
-}
-}
+    namespace densityProfiles
+    {
+        /* definition of homogenous profile */
+        using Homogenous = HomogenousImpl;
+    } // namespace densityProfiles
+} // namespace picongpu
diff --git a/share/picongpu/examples/KelvinHelmholtz/include/picongpu/param/dimension.param b/share/picongpu/examples/KelvinHelmholtz/include/picongpu/param/dimension.param
index 9f14baaec3..9cda9d9a01 100644
--- a/share/picongpu/examples/KelvinHelmholtz/include/picongpu/param/dimension.param
+++ b/share/picongpu/examples/KelvinHelmholtz/include/picongpu/param/dimension.param
@@ -1,4 +1,4 @@
-/* Copyright 2014-2020 Axel Huebl, Rene Widera
+/* Copyright 2014-2021 Axel Huebl, Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -20,7 +20,7 @@
 #pragma once
 
 #ifndef PARAM_DIMENSION
-#define PARAM_DIMENSION DIM3
+#    define PARAM_DIMENSION DIM3
 #endif
 
 #define SIMDIM PARAM_DIMENSION
diff --git a/share/picongpu/examples/KelvinHelmholtz/include/picongpu/param/fieldSolver.param b/share/picongpu/examples/KelvinHelmholtz/include/picongpu/param/fieldSolver.param
deleted file mode 100644
index c52f26f4f8..0000000000
--- a/share/picongpu/examples/KelvinHelmholtz/include/picongpu/param/fieldSolver.param
+++ /dev/null
@@ -1,82 +0,0 @@
-/* Copyright 2013-2020 Axel Huebl, Heiko Burau, Rene Widera
- *
- * This file is part of PIConGPU.
- *
- * PIConGPU is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * PIConGPU is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with PIConGPU.
- * If not, see <http://www.gnu.org/licenses/>.
- */
-
-/** @file
- *
- * Configure the field solver.
- *
- * Select the numerical Maxwell solver (e.g. Yee's method).
- *
- * Also allows to configure ad hoc mitigations for high frequency
- * noise in some setups via current smoothing.
- */
-
-#pragma once
-
-#include "picongpu/fields/MaxwellSolver/Solvers.def"
-#include "picongpu/fields/currentInterpolation/CurrentInterpolation.def"
-
-
-namespace picongpu
-{
-namespace fields
-{
-
-    /** Current Interpolation
-     *
-     * CurrentInterpolation is used to set a method performing the
-     * interpolate/assign operation from the generated currents of particle
-     * species to the electro-magnetic fields.
-     *
-     * Allowed values are:
-     *   - None:
-     *     - default for staggered grids/Yee-scheme
-     *     - updates E
-     *   - Binomial: 2nd order Binomial filter
-     *     - smooths the current before assignment in staggered grid
-     *     - updates E & breaks local charge conservation slightly
-     *   - NoneDS:
-     *     - experimental assignment for all-centered/directional splitting
-     *     - updates E & B at the same time
-     */
-#ifndef PARAM_CURRENTINTERPOLATION
-#   define PARAM_CURRENTINTERPOLATION None
-#endif
-    using CurrentInterpolation = currentInterpolation::PARAM_CURRENTINTERPOLATION;
-
-    /** FieldSolver
-     *
-     * Field Solver Selection:
-     *  - Yee< CurrentInterpolation >: standard Yee solver
-     *  - Lehe< CurrentInterpolation >: Num. Cherenkov free field solver in a chosen direction
-     *  - YeePML< CurrentInterpolation >: standard Yee solver with PML absorber
-     *  - DirSplitting< CurrentInterpolation >: Sentoku's Directional Splitting Method
-     *  - None< CurrentInterpolation >: disable the vacuum update of E and B
-     */
-
-#ifndef PARAM_FIELDSOLVER
-    /* WARNING: if you change field solver by hand please update your CELL_WIDTH_SI
-     * in `grid.param` to fulfill the convergence condition (CFL)
-     */
-#   define PARAM_FIELDSOLVER Yee
-#endif
-    using Solver = maxwellSolver::PARAM_FIELDSOLVER< CurrentInterpolation >;
-
-} // namespace fields
-} // namespace picongpu
diff --git a/share/picongpu/examples/KelvinHelmholtz/include/picongpu/param/grid.param b/share/picongpu/examples/KelvinHelmholtz/include/picongpu/param/grid.param
index 26b100eb4c..ceffd4d2eb 100644
--- a/share/picongpu/examples/KelvinHelmholtz/include/picongpu/param/grid.param
+++ b/share/picongpu/examples/KelvinHelmholtz/include/picongpu/param/grid.param
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Rene Widera, Richard Pausch,
+/* Copyright 2013-2021 Axel Huebl, Rene Widera, Richard Pausch,
  *                     Benjamin Worpitz
  *
  * This file is part of PIConGPU.
@@ -19,12 +19,10 @@
  */
 
 
-
 #pragma once
 
 namespace picongpu
 {
-
     namespace SI
     {
         /** Duration of one timestep
@@ -33,27 +31,10 @@ namespace picongpu
 
         /** equals X
          *  unit: meter */
-#define DirSplitting 1
-#if (PARAM_FIELDSOLVER == 1)
-        /* THIS CODE PATH IS ONLY USED IF `PARAM_FIELDSOLVER` IS CHANGED IN
-         * `cmakeFlags` and the field solver there is set to fieldSolverDirSplitting
-         *
-         * Directional Splitting requires a fixed ratio between dt and dx
-         * and in addition cubic cells.
-         * conditions: dX == dt * c
-         *             dX == dY
-         *             dX == dZ
-         */
-        constexpr float_64 CELL_WIDTH_SI = DELTA_T_SI*SPEED_OF_LIGHT_SI;
-#else
-        /* cell size for Yee solver (must fulfill CFL)
-         * WARNING: if you change the field solver in `componentsConfig` you
-         * have to change the CELL_SIZE in this code path
-         */
-        constexpr float_64 CELL_WIDTH_SI = 9.34635e-8;
-#endif
-#undef DirSplitting
 
+        /** cell width, equals X (must fulfill the CFL condition of the field solver)
+         *  unit: meter */
+        constexpr float_64 CELL_WIDTH_SI = 9.34635e-8;
         /** equals Y
          *  unit: meter */
         constexpr float_64 CELL_HEIGHT_SI = CELL_WIDTH_SI;
@@ -73,21 +54,21 @@ namespace picongpu
          * behave like the interaction of infinite "wire particles"
          * in fields with perfect symmetry in Z.
          */
-    } //namespace SI
+    } // namespace SI
 
     //! Defines the size of the absorbing zone (in cells)
     constexpr uint32_t ABSORBER_CELLS[3][2] = {
-        {32, 32},  /*x direction [negative,positive]*/
-        {32, 32},  /*y direction [negative,positive]*/
-        {32, 32}   /*z direction [negative,positive]*/
-    }; //unit: number of cells
+        {32, 32}, /*x direction [negative,positive]*/
+        {32, 32}, /*y direction [negative,positive]*/
+        {32, 32} /*z direction [negative,positive]*/
+    }; // unit: number of cells
 
     //! Define the strength of the absorber for any direction
     constexpr float_X ABSORBER_STRENGTH[3][2] = {
         {1.0e-3, 1.0e-3}, /*x direction [negative,positive]*/
         {1.0e-3, 1.0e-3}, /*y direction [negative,positive]*/
-        {1.0e-3, 1.0e-3}  /*z direction [negative,positive]*/
-    }; //unit: none
+        {1.0e-3, 1.0e-3} /*z direction [negative,positive]*/
+    }; // unit: none
 
     /** When to move the co-moving window.
      *  An initial pseudo particle, flying with the speed of light,
@@ -106,4 +87,4 @@ namespace picongpu
      */
     constexpr float_64 movePoint = 0.90;
 
-}
+} // namespace picongpu
diff --git a/share/picongpu/examples/KelvinHelmholtz/include/picongpu/param/memory.param b/share/picongpu/examples/KelvinHelmholtz/include/picongpu/param/memory.param
index 1a3ca4511a..c52ed07b1c 100644
--- a/share/picongpu/examples/KelvinHelmholtz/include/picongpu/param/memory.param
+++ b/share/picongpu/examples/KelvinHelmholtz/include/picongpu/param/memory.param
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Rene Widera, Benjamin Worpitz
+/* Copyright 2013-2021 Axel Huebl, Rene Widera, Benjamin Worpitz
  *
  * This file is part of PIConGPU.
  *
@@ -31,78 +31,87 @@
 #include <pmacc/math/Vector.hpp>
 #include <pmacc/mappings/kernel/MappingDescription.hpp>
 
+#include <array>
 
 namespace picongpu
 {
+    /* We have to hold back 400MiB for gpu-internal operations:
+     *   - random number generator
+     *   - reduces
+     *   - ...
+     */
+    constexpr size_t reservedGpuMemorySize = 400 * 1024 * 1024;
 
-/* We have to hold back 350MiB for gpu-internal operations:
- *   - random number generator
- *   - reduces
- *   - ...
- */
-constexpr size_t reservedGpuMemorySize = 400 *1024*1024;
+    /* short namespace alias */
+    namespace mCT = pmacc::math::CT;
+    /** size of a superCell
+     *
+     * volume of a superCell must be <= 1024
+     */
+    using SuperCellSize = typename mCT::shrinkTo<mCT::Int<8, 8, 4>, simDim>::type;
 
-/* short namespace*/
-namespace mCT = pmacc::math::CT;
-/** size of a superCell
- *
- * volume of a superCell must be <= 1024
- */
-using SuperCellSize = typename mCT::shrinkTo<
-    mCT::Int< 8, 8, 4 >,
-    simDim
->::type;
+    /** define the object for mapping superCells to cells*/
+    using MappingDesc = MappingDescription<simDim, SuperCellSize>;
 
-/** define the object for mapping superCells to cells*/
-using MappingDesc = MappingDescription< simDim, SuperCellSize >;
+    /** define the size of the core, border and guard area
+     *
+     * PIConGPU uses spatial domain-decomposition for parallelization
+     * over multiple devices with non-shared memory architecture.
+     * The global spatial domain is organized per device in three
+     * sections: the GUARD area contains copies of neighboring
+     * devices (also known as "halo"/"ghost").
+     * The BORDER area is the outermost layer of cells of a device,
+     * equal to what neighboring devices see as their GUARD area.
+     * The CORE area is the innermost area of a device. In union with
+     * the BORDER area it defines the "active" spatial domain on a device.
+     *
+     * GuardSize is defined in units of SuperCellSize per dimension.
+     */
+    using GuardSize = typename mCT::shrinkTo<mCT::Int<1, 1, 1>, simDim>::type;
 
-/** define the size of the core, border and guard area
- *
- * PIConGPU uses spatial domain-decomposition for parallelization
- * over multiple devices with non-shared memory architecture.
- * The global spatial domain is organized per device in three
- * sections: the GUARD area contains copies of neighboring
- * devices (also known as "halo"/"ghost").
- * The BORDER area is the outermost layer of cells of a device,
- * equally to what neighboring devices see as GUARD area.
- * The CORE area is the innermost area of a device. In union with
- * the BORDER area it defines the "active" spatial domain on a device.
- *
- * GuardSize is defined in units of SuperCellSize per dimension.
- */
-using GuardSize = typename mCT::shrinkTo<
-    mCT::Int< 1, 1, 1 >,
-    simDim
->::type;
+    /** bytes reserved for species exchange buffer
+     *
+     * This is the default configuration for species exchange buffer sizes.
+     * The default exchange buffer sizes can be changed per species by adding
+     * the alias exchangeMemCfg, with members similar to those in DefaultExchangeMemCfg,
+     * to its flag list.
+     */
+    struct DefaultExchangeMemCfg
+    {
+        // memory used for a direction
+        static constexpr uint32_t BYTES_EXCHANGE_X = 2 * 1024 * 1024; // 2 MiB
+        static constexpr uint32_t BYTES_EXCHANGE_Y = 6 * 1024 * 1024; // 6 MiB
+        static constexpr uint32_t BYTES_EXCHANGE_Z = 2 * 1024 * 1024; // 2 MiB
+        static constexpr uint32_t BYTES_EDGES = 64 * 1024; // 64 kiB
+        static constexpr uint32_t BYTES_CORNER = 16 * 1024; // 16 kiB
 
-/** bytes reserved for species exchange buffer
- *
- * This is the default configuration for species exchanges buffer sizes.
- * The default exchange buffer sizes can be changed per species by adding
- * the alias exchangeMemCfg with similar members like in DefaultExchangeMemCfg
- * to its flag list.
- */
-struct DefaultExchangeMemCfg
-{
-    // memory used for a direction
-    static constexpr uint32_t BYTES_EXCHANGE_X = 2 * 1024 * 1024; // 2 MiB
-    static constexpr uint32_t BYTES_EXCHANGE_Y = 6 * 1024 * 1024; // 6 MiB
-    static constexpr uint32_t BYTES_EXCHANGE_Z = 2 * 1024 * 1024; // 2 MiB
-    static constexpr uint32_t BYTES_EDGES = 64 * 1024; // 64 kiB
-    static constexpr uint32_t BYTES_CORNER = 16 * 1024; // 16 kiB
-};
+        /** Reference local domain size
+         *
+         * The size of the local domain for which the exchange sizes `BYTES_*` are configured.
+         * The required size of each exchange will be calculated at runtime based on the local domain size and the
+         * reference size. The exchange size will only be scaled up, never down. Zero means that there is no reference
+         * domain size, i.e. exchanges will not be scaled.
+         */
+        using REF_LOCAL_DOM_SIZE = mCT::Int<0, 0, 0>;
+        /** Scaling rate per direction.
+         *
+         * 1.0 means it scales linearly with the ratio between the local domain size at runtime and the reference
+         * local domain size.
+         */
+        const std::array<float_X, 3> DIR_SCALING_FACTOR = {{0.0, 0.0, 0.0}};
+    };
 
-/** number of scalar fields that are reserved as temporary fields */
-constexpr uint32_t fieldTmpNumSlots = 1;
+    /** number of scalar fields that are reserved as temporary fields */
+    constexpr uint32_t fieldTmpNumSlots = 1;
 
-/** can `FieldTmp` gather neighbor information
- *
- * If `true` it is possible to call the method `asyncCommunicationGather()`
- * to copy data from the border of neighboring GPU into the local guard.
- * This is also known as building up a "ghost" or "halo" region in domain
- * decomposition and only necessary for specific algorithms that extend
- * the basic PIC cycle, e.g. with dependence on derived density or energy fields.
- */
-constexpr bool fieldTmpSupportGatherCommunication = true;
+    /** can `FieldTmp` gather neighbor information
+     *
+     * If `true` it is possible to call the method `asyncCommunicationGather()`
+     * to copy data from the border of neighboring GPU into the local guard.
+     * This is also known as building up a "ghost" or "halo" region in domain
+     * decomposition and only necessary for specific algorithms that extend
+     * the basic PIC cycle, e.g. with dependence on derived density or energy fields.
+     */
+    constexpr bool fieldTmpSupportGatherCommunication = true;
 
 } // namespace picongpu
diff --git a/share/picongpu/examples/KelvinHelmholtz/include/picongpu/param/particle.param b/share/picongpu/examples/KelvinHelmholtz/include/picongpu/param/particle.param
index 32f98afb3a..8eddfb9d0c 100644
--- a/share/picongpu/examples/KelvinHelmholtz/include/picongpu/param/particle.param
+++ b/share/picongpu/examples/KelvinHelmholtz/include/picongpu/param/particle.param
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Rene Widera, Benjamin Worpitz,
+/* Copyright 2013-2021 Axel Huebl, Rene Widera, Benjamin Worpitz,
  *                     Richard Pausch
  *
  * This file is part of PIConGPU.
@@ -28,86 +28,71 @@
 
 namespace picongpu
 {
-namespace particles
-{
-    namespace startPosition
+    namespace particles
     {
-        struct QuietParam25ppc
+        namespace startPosition
         {
-            /** Count of particles per cell per direction at initial state
-             *  unit: none
-             */
-            using numParticlesPerDimension = typename mCT::shrinkTo<
-                mCT::Int<
-                    5,
-                    5,
-                    1
-                >,
-                simDim
-            >::type;
-        };
-        using Quiet25ppc = QuietImpl< QuietParam25ppc >;
-
-    } // namespace startPosition
+            struct QuietParam25ppc
+            {
+                /** Count of particles per cell per direction at initial state
+                 *  unit: none
+                 */
+                using numParticlesPerDimension = typename mCT::shrinkTo<mCT::Int<5, 5, 1>, simDim>::type;
+            };
+            using Quiet25ppc = QuietImpl<QuietParam25ppc>;
 
-    /** a particle with a weighting below MIN_WEIGHTING will not
-     *      be created / will be deleted
-     *  unit: none
-     */
-    constexpr float_X MIN_WEIGHTING = 10.0;
+        } // namespace startPosition
 
-    /** During unit normalization, we assume this is a typical
-     *  number of particles per cell for normalization of weighted
-     *  particle attributes.
-     */
-    constexpr uint32_t TYPICAL_PARTICLES_PER_CELL = mCT::volume<
-        startPosition::QuietParam25ppc::numParticlesPerDimension
-    >::type::value;
-
-namespace manipulators
-{
-
-    CONST_VECTOR(float_X,3,DriftParamPositive_direction,1.0,0.0,0.0);
-    struct DriftParamPositive
-    {
-        /** Initial particle drift velocity for electrons and ions
-         *  Examples:
-         *    - No drift is equal to 1.0
+        /** a particle with a weighting below MIN_WEIGHTING will not
+         *      be created / will be deleted
          *  unit: none
          */
-        static constexpr float_64 gamma = 1.021;
-        const DriftParamPositive_direction_t direction;
-    };
-    using AssignXDriftPositive = unary::Drift<
-        DriftParamPositive,
-        nvidia::functors::Assign
-    >;
+        constexpr float_X MIN_WEIGHTING = 10.0;
 
-    CONST_VECTOR(float_X,3,DriftParamNegative_direction,-1.0,0.0,0.0);
-    struct DriftParamNegative
-    {
-        /** Initial particle drift velocity for electrons and ions
-         *  Examples:
-         *    - No drift is equal to 1.0
-         *  unit: none
+        /** During unit normalization, we assume this is a typical
+         *  number of particles per cell for normalization of weighted
+         *  particle attributes.
          */
-        static constexpr float_64 gamma = 1.021;
-        const DriftParamNegative_direction_t direction;
-    };
-    using AssignXDriftNegative = unary::Drift<
-        DriftParamNegative,
-        nvidia::functors::Assign
-    >;
+        constexpr uint32_t TYPICAL_PARTICLES_PER_CELL
+            = mCT::volume<startPosition::QuietParam25ppc::numParticlesPerDimension>::type::value;
 
-    struct TemperatureParam
-    {
-        /* Initial temperature
-         *  unit: keV
-         */
-        static constexpr float_64 temperature = 0.0005;
-    };
-    using AddTemperature = unary::Temperature< TemperatureParam >;
+        namespace manipulators
+        {
+            CONST_VECTOR(float_X, 3, DriftParamPositive_direction, 1.0, 0.0, 0.0);
+            struct DriftParamPositive
+            {
+                /** Initial particle drift velocity for electrons and ions
+                 *  Examples:
+                 *    - No drift is equal to 1.0
+                 *  unit: none
+                 */
+                static constexpr float_64 gamma = 1.021;
+                const DriftParamPositive_direction_t direction;
+            };
+            using AssignXDriftPositive = unary::Drift<DriftParamPositive, nvidia::functors::Assign>;
+
+            CONST_VECTOR(float_X, 3, DriftParamNegative_direction, -1.0, 0.0, 0.0);
+            struct DriftParamNegative
+            {
+                /** Initial particle drift velocity for electrons and ions
+                 *  Examples:
+                 *    - No drift is equal to 1.0
+                 *  unit: none
+                 */
+                static constexpr float_64 gamma = 1.021;
+                const DriftParamNegative_direction_t direction;
+            };
+            using AssignXDriftNegative = unary::Drift<DriftParamNegative, nvidia::functors::Assign>;
+
+            struct TemperatureParam
+            {
+                /* Initial temperature
+                 *  unit: keV
+                 */
+                static constexpr float_64 temperature = 0.0005;
+            };
+            using AddTemperature = unary::Temperature<TemperatureParam>;
 
-} // namespace manipulators
-} // namespace particles
+        } // namespace manipulators
+    } // namespace particles
 } // namespace picongpu
diff --git a/share/picongpu/examples/KelvinHelmholtz/include/picongpu/param/particleFilters.param b/share/picongpu/examples/KelvinHelmholtz/include/picongpu/param/particleFilters.param
index 9186d273c3..03c6e5d08c 100644
--- a/share/picongpu/examples/KelvinHelmholtz/include/picongpu/param/particleFilters.param
+++ b/share/picongpu/examples/KelvinHelmholtz/include/picongpu/param/particleFilters.param
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Rene Widera
+/* Copyright 2013-2021 Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -41,87 +41,79 @@
 
 namespace picongpu
 {
-namespace particles
-{
-namespace filter
-{
-    struct IfRelativeGlobalPositionParamLowQuarterPosition
-    {
-        /* lowerBound is included in the range */
-        static constexpr float_X lowerBound = 0.0;
-        /* upperBound is excluded in the range */
-        static constexpr float_X upperBound = 0.25;
-        /* dimension for the filter
-         * x = 0; y= 1; z = 2
-         */
-        static constexpr uint32_t dimension = 1u;
-
-        // filter name
-        static constexpr char const * name = "lowerQuarterYPosition";
-    };
-
-    using LowerQuarterYPosition = filter::RelativeGlobalDomainPosition<
-        IfRelativeGlobalPositionParamLowQuarterPosition
-    >;
-
-    struct IfRelativeGlobalPositionParamMiddleHalf
+    namespace particles
     {
-        /* lowerBound is included in the range */
-        static constexpr float_X lowerBound = 0.25;
-        /* upperBound is excluded in the range */
-        static constexpr float_X upperBound = 0.75;
-        /* dimension for the filter
-         * x = 0; y= 1; z = 2
-         */
-        static constexpr uint32_t dimension = 1u;
-
-        // filter name
-        static constexpr char const * name = "middleHalfYPosition";
-    };
-
-    using MiddleHalfYPosition = filter::RelativeGlobalDomainPosition<
-        IfRelativeGlobalPositionParamMiddleHalf
-    >;
-
-    struct IfRelativeGlobalPositionParamUpperQuarter
-    {
-        /* lowerBound is included in the range */
-        static constexpr float_X lowerBound = 0.75;
-        /* upperBound is excluded in the range */
-        static constexpr float_X upperBound = 1.0;
-        /* dimension for the filter
-         * x = 0; y= 1; z = 2
-         */
-        static constexpr uint32_t dimension = 1u;
-
-        // filter name
-        static constexpr char const * name = "upperQuarterYPosition";
-    };
-
-    using UpperQuarterYPosition = filter::RelativeGlobalDomainPosition<
-        IfRelativeGlobalPositionParamUpperQuarter
-    >;
-
-    /** Plugins: collection of all available particle filters
-     *
-     * Create a list of all filters here that you want to use in plugins.
-     *
-     * Note: filter All is defined in picongpu/particles/filter/filter.def
-     */
-    using AllParticleFilters = MakeSeq_t<
-        All,
-        LowerQuarterYPosition,
-        MiddleHalfYPosition,
-        UpperQuarterYPosition
-    >;
-
-} // namespace filter
-
-namespace traits
-{
-    /* if needed for generic "free" filters,
-     * place `SpeciesEligibleForSolver` traits for filters here
-     */
-} // namespace traits
-} // namespace particles
+        namespace filter
+        {
+            struct IfRelativeGlobalPositionParamLowQuarterPosition
+            {
+                /* lowerBound is included in the range */
+                static constexpr float_X lowerBound = 0.0;
+                /* upperBound is excluded from the range */
+                static constexpr float_X upperBound = 0.25;
+                /* dimension for the filter
+                 * x = 0; y= 1; z = 2
+                 */
+                static constexpr uint32_t dimension = 1u;
+
+                // filter name
+                static constexpr char const* name = "lowerQuarterYPosition";
+            };
+
+            using LowerQuarterYPosition
+                = filter::RelativeGlobalDomainPosition<IfRelativeGlobalPositionParamLowQuarterPosition>;
+
+            struct IfRelativeGlobalPositionParamMiddleHalf
+            {
+                /* lowerBound is included in the range */
+                static constexpr float_X lowerBound = 0.25;
+                /* upperBound is excluded from the range */
+                static constexpr float_X upperBound = 0.75;
+                /* dimension for the filter
+                 * x = 0; y= 1; z = 2
+                 */
+                static constexpr uint32_t dimension = 1u;
+
+                // filter name
+                static constexpr char const* name = "middleHalfYPosition";
+            };
+
+            using MiddleHalfYPosition = filter::RelativeGlobalDomainPosition<IfRelativeGlobalPositionParamMiddleHalf>;
+
+            struct IfRelativeGlobalPositionParamUpperQuarter
+            {
+                /* lowerBound is included in the range */
+                static constexpr float_X lowerBound = 0.75;
+                /* upperBound is excluded from the range */
+                static constexpr float_X upperBound = 1.0;
+                /* dimension for the filter
+                 * x = 0; y= 1; z = 2
+                 */
+                static constexpr uint32_t dimension = 1u;
+
+                // filter name
+                static constexpr char const* name = "upperQuarterYPosition";
+            };
+
+            using UpperQuarterYPosition
+                = filter::RelativeGlobalDomainPosition<IfRelativeGlobalPositionParamUpperQuarter>;
+
+            /** Plugins: collection of all available particle filters
+             *
+             * Create a list of all filters here that you want to use in plugins.
+             *
+             * Note: filter All is defined in picongpu/particles/filter/filter.def
+             */
+            using AllParticleFilters
+                = MakeSeq_t<All, LowerQuarterYPosition, MiddleHalfYPosition, UpperQuarterYPosition>;
+
+        } // namespace filter
+
+        namespace traits
+        {
+            /* if needed for generic "free" filters,
+             * place `SpeciesEligibleForSolver` traits for filters here
+             */
+        } // namespace traits
+    } // namespace particles
 } // namespace picongpu
diff --git a/share/picongpu/examples/KelvinHelmholtz/include/picongpu/param/png.param b/share/picongpu/examples/KelvinHelmholtz/include/picongpu/param/png.param
index 39108d20f8..b6720b7330 100644
--- a/share/picongpu/examples/KelvinHelmholtz/include/picongpu/param/png.param
+++ b/share/picongpu/examples/KelvinHelmholtz/include/picongpu/param/png.param
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Heiko Burau, Rene Widera, Richard Pausch
+/* Copyright 2013-2021 Axel Huebl, Heiko Burau, Rene Widera, Richard Pausch
  *
  * This file is part of PIConGPU.
  *
@@ -51,32 +51,31 @@ namespace picongpu
 
         // multiply highest undisturbed particle density with factor
         constexpr float_X preParticleDens_opacity = 0.25;
-        constexpr float_X preChannel1_opacity     = 1.0;
-        constexpr float_X preChannel2_opacity     = 1.0;
-        constexpr float_X preChannel3_opacity     = 1.0;
+        constexpr float_X preChannel1_opacity = 1.0;
+        constexpr float_X preChannel2_opacity = 1.0;
+        constexpr float_X preChannel3_opacity = 1.0;
 
         // specify color scales for each channel
         namespace preParticleDensCol = colorScales::red;
-        namespace preChannel1Col     = colorScales::blue;
-        namespace preChannel2Col     = colorScales::green;
-        namespace preChannel3Col     = colorScales::none;
+        namespace preChannel1Col = colorScales::blue;
+        namespace preChannel2Col = colorScales::green;
+        namespace preChannel3Col = colorScales::none;
 
         /* png preview settings for each channel */
-        DINLINE float_X preChannel1 ( const float3_X& field_B, const float3_X& field_E, const float3_X& field_J )
+        DINLINE float_X preChannel1(const float3_X& field_B, const float3_X& field_E, const float3_X& field_J)
         {
             return field_B.z();
         }
 
-        DINLINE float_X preChannel2 ( const float3_X& field_B, const float3_X& field_E, const float3_X& field_J )
+        DINLINE float_X preChannel2(const float3_X& field_B, const float3_X& field_E, const float3_X& field_J)
         {
             return -1.0_X * field_B.z();
         }
 
-        DINLINE float_X preChannel3 ( const float3_X& field_B, const float3_X& field_E, const float3_X& field_J )
+        DINLINE float_X preChannel3(const float3_X& field_B, const float3_X& field_E, const float3_X& field_J)
         {
             return 1.0_X;
         }
-    }
-
-}
+    } // namespace visPreview
 
+} // namespace picongpu
diff --git a/share/picongpu/examples/KelvinHelmholtz/include/picongpu/param/radiation.param b/share/picongpu/examples/KelvinHelmholtz/include/picongpu/param/radiation.param
index 13b21aece9..226d40ea40 100644
--- a/share/picongpu/examples/KelvinHelmholtz/include/picongpu/param/radiation.param
+++ b/share/picongpu/examples/KelvinHelmholtz/include/picongpu/param/radiation.param
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Rene Widera, Richard Pausch
+/* Copyright 2013-2021 Rene Widera, Richard Pausch
  *
  * This file is part of PIConGPU.
  *
@@ -18,13 +18,12 @@
  */
 
 
-
 #pragma once
 
-  /*
-    radiation verbose level:
-    0=nothing, 1=physics, 2=simulation_state, 4=memory, 8=critical
-  */
+/*
+  radiation verbose level:
+  0=nothing, 1=physics, 2=simulation_state, 4=memory, 8=critical
+*/
 
 #define PIC_VERBOSE_RADIATION 3
 
@@ -36,145 +35,161 @@
 
 namespace picongpu
 {
-namespace plugins
-{
-namespace radiation
-{
-namespace linear_frequencies
-{
-namespace SI
-{
-constexpr float_64 omega_min = 0.0;
-constexpr float_64 omega_max = 1.06e16;
-}
-
-constexpr unsigned int N_omega = 1024; // number of frequencies
-}
-
-namespace log_frequencies
-{
-namespace SI
-{
-// plasma omega = sqrt( (electron density * (1.6e-19)^2) / (8.854e-12 * 9.11e-31) )
-//              = 1.78e14 1/s
-constexpr float_64 omega_pe = 1.78e14;
-constexpr float_64 omega_min = 0.1 * omega_pe;
-constexpr float_64 omega_max = 200 * omega_pe;
-}
-
-constexpr unsigned int N_omega = 1024; // number of frequencies
-}
-
-
-namespace frequencies_from_list
-{
-/** path to text file with frequencies */
-constexpr const char * listLocation = "/path/to/frequency.list";
-constexpr unsigned int N_omega = 2048; // number of frequencies
-}
-
-
-namespace radiation_frequencies = log_frequencies;
-
-
-namespace radiationNyquist
-{
-  constexpr float_32 NyquistFactor = 0.5;
-}
-
-///////////////////////////////////////////////////
-
-
-  // correct treatment of coherent and incoherent  radiation from macroparticles
-  /* Choose different form factors in order to consider different  particle shapes for radiation
-   *  - radFormFactor_CIC_3D ... CIC charge distribution
-   *  - radFormFactor_TSC_3D ... TSC charge distribution
-   *  - radFormFactor_PCS_3D ... PCS charge distribution
-   *  - radFormFactor_CIC_1Dy ... only CIC charge distribution in y
-   *  - radFormFactor_Gauss_spherical ... symmetric Gauss charge distribution
-   *  - radFormFactor_Gauss_cell ... Gauss charge distribution according to cell size
-   *  - radFormFactor_incoherent ... only incoherent radiation
-   *  - radFormFactor_coherent ... only coherent radiation
-   */
-  namespace radFormFactor_CIC_3D { }
-  namespace radFormFactor_TSC_3D { }
-  namespace radFormFactor_PCS_3D { }
-  namespace radFormFactor_CIC_1Dy { }
-  namespace radFormFactor_Gauss_spherical { }
-  namespace radFormFactor_Gauss_cell { }
-  namespace radFormFactor_incoherent { }
-  namespace radFormFactor_coherent { }
-
-  namespace radFormFactor = radFormFactor_Gauss_spherical;
-
-
-///////////////////////////////////////////////////////////
-
-
-namespace parameters
-{
-
-
-constexpr unsigned int N_observer = 256; // number of looking directions
-
-} /* end namespace parameters */
-
-  /** activate particles for radiation */
-  struct GammaFilterFunctor
-  {
-      static constexpr float_X radiationGamma = 5.0;
-
-      template< typename T_Particle >
-      HDINLINE void operator()( T_Particle& particle )
-      {
-          if(
-             picongpu::gamma<float_X>(
-                                      particle[ picongpu::momentum_ ],
-                                      picongpu::traits::attribute::getMass(
-                                                                           particle[ picongpu::weighting_ ],
-                                                                           particle
-                                                                           )
-                                      ) >= radiationGamma
-             )
-            particle[ picongpu::radiationMask_ ] = true;
-      }
-  };
-
-
-  /* filter to enable radiation for electrons
-   *
-   * to enable the filter:
-   *   - goto file `speciesDefinition.param`
-   *   - add the attribute `radiationMask` to the electron species
-   */
-  using RadiationParticleFilter = picongpu::particles::manipulators::generic::Free<
-      GammaFilterFunctor
-      >;
-
-
-
-//////////////////////////////////////////////////
-
-
-// add a window function weighting to the radiation in order
-// to avoid ringing effects from sharpe boundaries
-// default: no window function via `radWindowFunctionNone`
-
-/* Choose different window function in order to get better ringing reduction
- * radWindowFunctionTriangle
- * radWindowFunctionHamming
- * radWindowFunctionTriplett
- * radWindowFunctionGauss
- * radWindowFunctionNone
- */
-namespace radWindowFunctionTriangle { }
-namespace radWindowFunctionHamming { }
-namespace radWindowFunctionTriplett { }
-namespace radWindowFunctionGauss { }
-namespace radWindowFunctionNone { }
-
-namespace radWindowFunction = radWindowFunctionTriangle;
-
-} // namespace radiation
-} // namespace plugins
+    namespace plugins
+    {
+        namespace radiation
+        {
+            namespace linear_frequencies
+            {
+                namespace SI
+                {
+                    constexpr float_64 omega_min = 0.0;
+                    constexpr float_64 omega_max = 1.06e16;
+                } // namespace SI
+
+                constexpr unsigned int N_omega = 1024; // number of frequencies
+            } // namespace linear_frequencies
+
+            namespace log_frequencies
+            {
+                namespace SI
+                {
+                    // plasma omega = sqrt( (electron density * (1.6e-19)^2) / (8.854e-12 * 9.11e-31) )
+                    //              = 1.78e14 1/s
+                    constexpr float_64 omega_pe = 1.78e14;
+                    constexpr float_64 omega_min = 0.1 * omega_pe;
+                    constexpr float_64 omega_max = 200 * omega_pe;
+                } // namespace SI
+
+                constexpr unsigned int N_omega = 1024; // number of frequencies
+            } // namespace log_frequencies
+
+
+            namespace frequencies_from_list
+            {
+                /** path to text file with frequencies */
+                constexpr const char* listLocation = "/path/to/frequency.list";
+                constexpr unsigned int N_omega = 2048; // number of frequencies
+            } // namespace frequencies_from_list
+
+
+            namespace radiation_frequencies = log_frequencies;
+
+
+            namespace radiationNyquist
+            {
+                constexpr float_32 NyquistFactor = 0.5;
+            }
+
+            ///////////////////////////////////////////////////
+
+
+            // correct treatment of coherent and incoherent radiation from macroparticles
+            /* Choose different form factors in order to consider different particle shapes for radiation
+             *  - radFormFactor_CIC_3D ... CIC charge distribution
+             *  - radFormFactor_TSC_3D ... TSC charge distribution
+             *  - radFormFactor_PCS_3D ... PCS charge distribution
+             *  - radFormFactor_CIC_1Dy ... only CIC charge distribution in y
+             *  - radFormFactor_Gauss_spherical ... symmetric Gauss charge distribution
+             *  - radFormFactor_Gauss_cell ... Gauss charge distribution according to cell size
+             *  - radFormFactor_incoherent ... only incoherent radiation
+             *  - radFormFactor_coherent ... only coherent radiation
+             */
+            namespace radFormFactor_CIC_3D
+            {
+            }
+            namespace radFormFactor_TSC_3D
+            {
+            }
+            namespace radFormFactor_PCS_3D
+            {
+            }
+            namespace radFormFactor_CIC_1Dy
+            {
+            }
+            namespace radFormFactor_Gauss_spherical
+            {
+            }
+            namespace radFormFactor_Gauss_cell
+            {
+            }
+            namespace radFormFactor_incoherent
+            {
+            }
+            namespace radFormFactor_coherent
+            {
+            }
+
+            namespace radFormFactor = radFormFactor_Gauss_spherical;
+
+
+            ///////////////////////////////////////////////////////////
+
+
+            namespace parameters
+            {
+                constexpr unsigned int N_observer = 256; // number of looking directions
+
+            } /* end namespace parameters */
+
+            /** activate particles for radiation: sets radiationMask if gamma >= radiationGamma */
+            struct GammaFilterFunctor
+            {
+                static constexpr float_X radiationGamma = 5.0;
+
+                template<typename T_Particle>
+                HDINLINE void operator()(T_Particle& particle)
+                {
+                    if(picongpu::gamma<float_X>(
+                           particle[picongpu::momentum_],
+                           picongpu::traits::attribute::getMass(particle[picongpu::weighting_], particle))
+                       >= radiationGamma)
+                        particle[picongpu::radiationMask_] = true;
+                }
+            };
+
+
+            /* filter to enable radiation for electrons
+             *
+             * to enable the filter:
+             *   - go to file `speciesDefinition.param`
+             *   - add the attribute `radiationMask` to the electron species
+             */
+            using RadiationParticleFilter = picongpu::particles::manipulators::generic::Free<GammaFilterFunctor>;
+
+
+            //////////////////////////////////////////////////
+
+
+            // add a window function weighting to the radiation in order
+            // to avoid ringing effects from sharp boundaries
+            // default: no window function via `radWindowFunctionNone`
+
+            /* Choose different window function in order to get better ringing reduction
+             * radWindowFunctionTriangle
+             * radWindowFunctionHamming
+             * radWindowFunctionTriplett
+             * radWindowFunctionGauss
+             * radWindowFunctionNone
+             */
+            namespace radWindowFunctionTriangle
+            {
+            }
+            namespace radWindowFunctionHamming
+            {
+            }
+            namespace radWindowFunctionTriplett
+            {
+            }
+            namespace radWindowFunctionGauss
+            {
+            }
+            namespace radWindowFunctionNone
+            {
+            }
+
+            namespace radWindowFunction = radWindowFunctionTriangle;
+
+        } // namespace radiation
+    } // namespace plugins
 } // namespace picongpu
diff --git a/share/picongpu/examples/KelvinHelmholtz/include/picongpu/param/radiationObserver.param b/share/picongpu/examples/KelvinHelmholtz/include/picongpu/param/radiationObserver.param
index 89be653fc6..c66d133b22 100644
--- a/share/picongpu/examples/KelvinHelmholtz/include/picongpu/param/radiationObserver.param
+++ b/share/picongpu/examples/KelvinHelmholtz/include/picongpu/param/radiationObserver.param
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera, Richard Pausch
+/* Copyright 2013-2021 Heiko Burau, Rene Widera, Richard Pausch
  *
  * This file is part of PIConGPU.
  *
@@ -22,64 +22,62 @@
 
 namespace picongpu
 {
-namespace plugins
-{
-namespace radiation
-{
-namespace radiation_observer
-{
-    /** Compute observation angles
-     *
-     * This function is used in the Radiation plug-in kernel to compute
-     * the observation directions given as a unit vector pointing
-     * towards a 'virtual' detector
-     *
-     * @param    observation_id_extern
-     *           int index that identifies each block on the GPU
-     *           to compute the observation direction
-     *
-     * @return   unit vector pointing in observation direction
-     *           type: vector_64
-     *
-     */
-    HDINLINE vector_64 observation_direction(const int observation_id_extern)
+    namespace plugins
     {
-      /** This computes observation directions for one octant
-       *  of a sphere around the simulation area.
-       *  The axises of the octant point towards:
-       *  (+1,0,0) ; (0,+1,0) ; (0,0,-1)
-       */
-
-      /* generate two indices from single block index */
-      constexpr int N_angle_split = 16; /* index split distance */
-      /* get column index for computing angle theta: */
-      const int my_index_theta = observation_id_extern / N_angle_split;
-      /* get row index for computing angle phi: */
-      const int my_index_phi = observation_id_extern % N_angle_split;
-
-      /*  range for BOTH angles */
-      constexpr picongpu::float_64 angle_range= picongpu::PI/2.0;
+        namespace radiation
+        {
+            namespace radiation_observer
+            {
+                /** Compute observation angles
+                 *
+                 * This function is used in the Radiation plug-in kernel to compute
+                 * the observation directions given as a unit vector pointing
+                 * towards a 'virtual' detector
+                 *
+                 * @param    observation_id_extern
+                 *           int index that identifies each block on the GPU
+                 *           to compute the observation direction
+                 *
+                 * @return   unit vector pointing in observation direction
+                 *           type: vector_64
+                 *
+                 */
+                HDINLINE vector_64 observation_direction(const int observation_id_extern)
+                {
+                    /** This computes observation directions for one octant
+                     *  of a sphere around the simulation area.
+                     *  The axes of the octant point towards:
+                     *  (+1,0,0) ; (0,+1,0) ; (0,0,-1)
+                     */
 
-      /* angle stepwidth for BOTH angles */
-      constexpr picongpu::float_64 delta_angle =  1.0 * angle_range / (N_angle_split-1);
+                    /* generate two indices from single block index */
+                    constexpr int N_angle_split = 16; /* index split distance */
+                    /* get column index for computing angle theta: */
+                    const int my_index_theta = observation_id_extern / N_angle_split;
+                    /* get row index for computing angle phi: */
+                    const int my_index_phi = observation_id_extern % N_angle_split;
 
-      /* compute both angles */
-      const picongpu::float_64 theta(  my_index_theta * delta_angle  + 0.5*picongpu::PI );
-      const picongpu::float_64 phi(    my_index_phi   * delta_angle  );
+                    /*  range for BOTH angles */
+                    constexpr picongpu::float_64 angle_range = picongpu::PI / 2.0;
 
-      /* compute unit vector */
-            picongpu::float_32 sinPhi;
-      picongpu::float_32 cosPhi;
-      picongpu::float_32 sinTheta;
-      picongpu::float_32 cosTheta;
-      math::sincos(precisionCast<picongpu::float_32>(phi), sinPhi, cosPhi);
-      math::sincos(precisionCast<picongpu::float_32>(theta), sinTheta, cosTheta);
-      return vector_64( sinTheta*cosPhi , sinTheta*sinPhi , cosTheta ) ;
+                    /* angle stepwidth for BOTH angles */
+                    constexpr picongpu::float_64 delta_angle = 1.0 * angle_range / (N_angle_split - 1);
 
+                    /* compute both angles */
+                    const picongpu::float_64 theta(my_index_theta * delta_angle + 0.5 * picongpu::PI);
+                    const picongpu::float_64 phi(my_index_phi * delta_angle);
 
-    }
+                    /* compute unit vector */
+                    picongpu::float_32 sinPhi;
+                    picongpu::float_32 cosPhi;
+                    picongpu::float_32 sinTheta;
+                    picongpu::float_32 cosTheta;
+                    pmacc::math::sincos(precisionCast<picongpu::float_32>(phi), sinPhi, cosPhi);
+                    pmacc::math::sincos(precisionCast<picongpu::float_32>(theta), sinTheta, cosTheta);
+                    return vector_64(sinTheta * cosPhi, sinTheta * sinPhi, cosTheta);
+                }
 
-} // namespace radiation_observer
-} // namespace radiation
-} // namespace plugins
+            } // namespace radiation_observer
+        } // namespace radiation
+    } // namespace plugins
 } // namespace picongpu
diff --git a/share/picongpu/examples/KelvinHelmholtz/include/picongpu/param/species.param b/share/picongpu/examples/KelvinHelmholtz/include/picongpu/param/species.param
deleted file mode 100644
index c7e25d4840..0000000000
--- a/share/picongpu/examples/KelvinHelmholtz/include/picongpu/param/species.param
+++ /dev/null
@@ -1,84 +0,0 @@
-/* Copyright 2014-2020 Rene Widera, Richard Pausch
- *
- * This file is part of PIConGPU.
- *
- * PIConGPU is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * PIConGPU is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with PIConGPU.
- * If not, see <http://www.gnu.org/licenses/>.
- */
-
-#pragma once
-
-#include "picongpu/particles/shapes.hpp"
-#include "picongpu/algorithms/FieldToParticleInterpolationNative.hpp"
-#include "picongpu/algorithms/FieldToParticleInterpolation.hpp"
-#include "picongpu/algorithms/AssignedTrilinearInterpolation.hpp"
-
-#include "picongpu/particles/flylite/NonLTE.def"
-#include "picongpu/fields/currentDeposition/Solver.def"
-
-
-namespace picongpu
-{
-/*---------------------------- generic solver---------------------------------*/
-
-/*! Particle Shape definitions -------------------------------------------------
- *  - particles::shapes::CIC : 1st order
- *  - particles::shapes::TSC : 2nd order
- *  - particles::shapes::PCS : 3rd order
- *  - particles::shapes::P4S : 4th order
- *
- *  example: using UsedParticleShape = particles::shapes::CIC;
- */
-#ifndef PARAM_PARTICLESHAPE
-#define PARAM_PARTICLESHAPE TSC
-#endif
-using UsedParticleShape = particles::shapes::PARAM_PARTICLESHAPE;
-
-/* define which interpolation method is used to interpolate fields to particle*/
-using UsedField2Particle = FieldToParticleInterpolation< UsedParticleShape, AssignedTrilinearInterpolation >;
-
-/*! select current solver method -----------------------------------------------
- * - currentSolver::Esirkepov<SHAPE>  : particle shapes - CIC, TSC, PCS, P4S (1st to 4th order)
- * - currentSolver::VillaBune<>       : particle shapes - CIC (1st order) only
- * - currentSolver::EmZ<SHAPE>        : particle shapes - CIC, TSC, PCS, P4S (1st to 4th order)
- *
- * For development purposes: ---------------------------------------------------
- * - currentSolver::EsirkepovNative<SHAPE> : generic version of currentSolverEsirkepov
- *   without optimization (~4x slower and needs more shared memory)
- */
-#ifndef PARAM_CURRENTSOLVER
-#define PARAM_CURRENTSOLVER Esirkepov<UsedParticleShape>
-#endif
-using UsedParticleCurrentSolver = currentSolver::PARAM_CURRENTSOLVER;
-
-/*! particle pusher configuration ----------------------------------------------
- *
- * Defining a pusher is optional for particles
- *
- * - particles::pusher::Vay : better suited relativistic boris pusher
- * - particles::pusher::Boris : standard boris pusher
- * - particles::pusher::ReducedLandauLifshitz : 4th order RungeKutta pusher
- *                                              with classical radiation reaction
- *
- * For diagnostics & modeling: ------------------------------------------------
- * - particles::pusher::Free : free propagation, ignore fields
- *                             (= free stream model)
- * - particles::pusher::Photon : propagate with c in direction of normalized mom.
- * - particles::pusher::Probe : Probe particles that interpolate E & B
- * For development purposes: --------------------------------------------------
- * - particles::pusher::Axel : a pusher developed at HZDR during 2011 (testing)
- */
-using UsedParticlePusher = particles::pusher::Boris;
-
-} // namespace picongpu
diff --git a/share/picongpu/examples/KelvinHelmholtz/include/picongpu/param/speciesDefinition.param b/share/picongpu/examples/KelvinHelmholtz/include/picongpu/param/speciesDefinition.param
index 10fd332f24..d46409ee79 100644
--- a/share/picongpu/examples/KelvinHelmholtz/include/picongpu/param/speciesDefinition.param
+++ b/share/picongpu/examples/KelvinHelmholtz/include/picongpu/param/speciesDefinition.param
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Rene Widera, Benjamin Worpitz, Heiko Burau
+/* Copyright 2013-2021 Rene Widera, Benjamin Worpitz, Heiko Burau
  *
  * This file is part of PIConGPU.
  *
@@ -30,89 +30,76 @@
 
 
 #ifndef PARAM_RADIATION
-    /* disable radiation calculation */
-#   define PARAM_RADIATION 0
+/* disable radiation calculation */
+#    define PARAM_RADIATION 0
 #endif
 
 
 namespace picongpu
 {
+    /*########################### define particle attributes #####################*/
+
+    /** describe attributes of a particle*/
+    using DefaultParticleAttributes = MakeSeq_t<
+        position<position_pic>,
+        momentum,
+        weighting
+#if(PARAM_RADIATION == 1)
+        ,
+        momentumPrev1
+#endif
+        >;
 
-/*########################### define particle attributes #####################*/
+    /*########################### end particle attributes ########################*/
 
-/** describe attributes of a particle*/
-using DefaultParticleAttributes = MakeSeq_t<
-    position< position_pic >,
-    momentum,
-    weighting
-#if( PARAM_RADIATION == 1 )
-    , momentumPrev1
-#endif
->;
-
-/*########################### end particle attributes ########################*/
-
-/*########################### define species #################################*/
-
-/*--------------------------- electrons --------------------------------------*/
-
-/* ratio relative to BASE_CHARGE and BASE_MASS */
-value_identifier( float_X, MassRatioElectrons, 1.0 );
-value_identifier( float_X, ChargeRatioElectrons, 1.0 );
-
-using ParticleFlagsElectrons = MakeSeq_t<
-    particlePusher< UsedParticlePusher >,
-    shape< UsedParticleShape >,
-    interpolation< UsedField2Particle >,
-    current< UsedParticleCurrentSolver >,
-    massRatio< MassRatioElectrons >,
-    chargeRatio< ChargeRatioElectrons >
->;
-
-/* define species electrons */
-using PIC_Electrons = Particles<
-    PMACC_CSTRING( "e" ),
-    ParticleFlagsElectrons,
-    DefaultParticleAttributes
->;
-
-/*--------------------------- ions -------------------------------------------*/
-
-/* ratio relative to BASE_CHARGE and BASE_MASS */
-value_identifier( float_X, MassRatioIons, 1836.152672 );
-value_identifier( float_X, ChargeRatioIons, -1.0 );
-
-/* ratio relative to BASE_DENSITY */
-value_identifier( float_X, DensityRatioIons, 1.0 );
-
-using ParticleFlagsIons = MakeSeq_t<
-    particlePusher< UsedParticlePusher >,
-    shape< UsedParticleShape >,
-    interpolation< UsedField2Particle >,
-    current< UsedParticleCurrentSolver >,
-    massRatio< MassRatioIons >,
-    chargeRatio< ChargeRatioIons >,
-    densityRatio< DensityRatioIons >,
-    atomicNumbers< ionization::atomicNumbers::Hydrogen_t >
->;
-
-/* define species ions */
-using PIC_Ions = Particles<
-    PMACC_CSTRING( "i" ),
-    ParticleFlagsIons,
-    DefaultParticleAttributes
->;
-
-/*########################### end species ####################################*/
-
-/** All known particle species of the simulation
- *
- * List all defined particle species from above in this list
- * to make them available to the PIC algorithm.
- */
-using VectorAllSpecies = MakeSeq_t<
-    PIC_Electrons,
-    PIC_Ions
->;
+    /*########################### define species #################################*/
+
+    /*--------------------------- electrons --------------------------------------*/
+
+    /* ratio relative to BASE_CHARGE and BASE_MASS */
+    value_identifier(float_X, MassRatioElectrons, 1.0);
+    value_identifier(float_X, ChargeRatioElectrons, 1.0);
+
+    using ParticleFlagsElectrons = MakeSeq_t<
+        particlePusher<UsedParticlePusher>,
+        shape<UsedParticleShape>,
+        interpolation<UsedField2Particle>,
+        current<UsedParticleCurrentSolver>,
+        massRatio<MassRatioElectrons>,
+        chargeRatio<ChargeRatioElectrons>>;
+
+    /* define species electrons */
+    using PIC_Electrons = Particles<PMACC_CSTRING("e"), ParticleFlagsElectrons, DefaultParticleAttributes>;
+
+    /*--------------------------- ions -------------------------------------------*/
+
+    /* ratio relative to BASE_CHARGE and BASE_MASS */
+    value_identifier(float_X, MassRatioIons, 1836.152672);
+    value_identifier(float_X, ChargeRatioIons, -1.0);
+
+    /* ratio relative to BASE_DENSITY */
+    value_identifier(float_X, DensityRatioIons, 1.0);
+
+    using ParticleFlagsIons = MakeSeq_t<
+        particlePusher<UsedParticlePusher>,
+        shape<UsedParticleShape>,
+        interpolation<UsedField2Particle>,
+        current<UsedParticleCurrentSolver>,
+        massRatio<MassRatioIons>,
+        chargeRatio<ChargeRatioIons>,
+        densityRatio<DensityRatioIons>,
+        atomicNumbers<ionization::atomicNumbers::Hydrogen_t>>;
+
+    /* define species ions */
+    using PIC_Ions = Particles<PMACC_CSTRING("i"), ParticleFlagsIons, DefaultParticleAttributes>;
+
+    /*########################### end species ####################################*/
+
+    /** All known particle species of the simulation
+     *
+     * List all defined particle species from above in this list
+     * to make them available to the PIC algorithm.
+     */
+    using VectorAllSpecies = MakeSeq_t<PIC_Electrons, PIC_Ions>;
 
 } // namespace picongpu
diff --git a/share/picongpu/examples/KelvinHelmholtz/include/picongpu/param/speciesInitialization.param b/share/picongpu/examples/KelvinHelmholtz/include/picongpu/param/speciesInitialization.param
index f8e5bc5a9c..304410c368 100644
--- a/share/picongpu/examples/KelvinHelmholtz/include/picongpu/param/speciesInitialization.param
+++ b/share/picongpu/examples/KelvinHelmholtz/include/picongpu/param/speciesInitialization.param
@@ -1,4 +1,4 @@
-/* Copyright 2015-2020 Rene Widera, Axel Huebl
+/* Copyright 2015-2021 Rene Widera, Axel Huebl
  *
  * This file is part of PIConGPU.
  *
@@ -33,57 +33,22 @@
 
 namespace picongpu
 {
-namespace particles
-{
-    /** InitPipeline define in which order species are initialized
-     *
-     * the functors are called in order (from first to last functor)
-     */
-    using InitPipeline = bmpl::vector<
-        CreateDensity<
-            densityProfiles::Homogenous,
-            startPosition::Quiet25ppc,
-            PIC_Electrons
-        >,
-        Derive<
-            PIC_Electrons,
-            PIC_Ions
-        >,
-        Manipulate<
-            manipulators::AssignXDriftPositive,
-            PIC_Ions,
-            filter::LowerQuarterYPosition
-        >,
-        Manipulate<
-            manipulators::AssignXDriftNegative,
-            PIC_Ions,
-            filter::MiddleHalfYPosition
-        >,
-        Manipulate<
-            manipulators::AssignXDriftPositive,
-            PIC_Ions,
-            filter::UpperQuarterYPosition
-        >,
-        Manipulate<
-            manipulators::AssignXDriftPositive,
-            PIC_Electrons,
-            filter::LowerQuarterYPosition
-        >,
-        Manipulate<
-            manipulators::AssignXDriftNegative,
-            PIC_Electrons,
-            filter::MiddleHalfYPosition
-        >,
-        Manipulate<
-            manipulators::AssignXDriftPositive,
-            PIC_Electrons,
-            filter::UpperQuarterYPosition
-        >,
-        Manipulate<
-            manipulators::AddTemperature,
-            PIC_Electrons
-        >
-    >;
+    namespace particles
+    {
+        /** InitPipeline defines the order in which species are initialized
+         *
+         * the functors are called in order (from first to last functor)
+         */
+        using InitPipeline = bmpl::vector<
+            CreateDensity<densityProfiles::Homogenous, startPosition::Quiet25ppc, PIC_Electrons>,
+            Derive<PIC_Electrons, PIC_Ions>,
+            Manipulate<manipulators::AssignXDriftPositive, PIC_Ions, filter::LowerQuarterYPosition>,
+            Manipulate<manipulators::AssignXDriftNegative, PIC_Ions, filter::MiddleHalfYPosition>,
+            Manipulate<manipulators::AssignXDriftPositive, PIC_Ions, filter::UpperQuarterYPosition>,
+            Manipulate<manipulators::AssignXDriftPositive, PIC_Electrons, filter::LowerQuarterYPosition>,
+            Manipulate<manipulators::AssignXDriftNegative, PIC_Electrons, filter::MiddleHalfYPosition>,
+            Manipulate<manipulators::AssignXDriftPositive, PIC_Electrons, filter::UpperQuarterYPosition>,
+            Manipulate<manipulators::AddTemperature, PIC_Electrons>>;
 
-} // namespace particles
+    } // namespace particles
 } // namespace picongpu
diff --git a/share/picongpu/examples/LaserWakefield/cmakeFlags b/share/picongpu/examples/LaserWakefield/cmakeFlags
index d25c78758a..250ade041c 100755
--- a/share/picongpu/examples/LaserWakefield/cmakeFlags
+++ b/share/picongpu/examples/LaserWakefield/cmakeFlags
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 #
-# Copyright 2013-2020 Axel Huebl, Rene Widera
+# Copyright 2013-2021 Axel Huebl, Rene Widera
 #
 # This file is part of PIConGPU.
 #
@@ -30,15 +30,8 @@
 #   - increase by 1, no gaps
 
 flags[0]=""
-flags[1]="-DPARAM_OVERWRITES:LIST='-DPARAM_FIELDSOLVER=Lehe;-DPARAM_PARTICLEPUSHER=Vay'"
-flags[2]="-DPARAM_OVERWRITES:LIST='-DPARAM_CURRENTSOLVER=Esirkepov;-DPARAM_PARTICLESHAPE=CIC'"
-flags[3]="-DPARAM_OVERWRITES:LIST='-DPARAM_CURRENTSOLVER=VillaBune;-DPARAM_PARTICLESHAPE=CIC'"
-flags[4]="-DPARAM_OVERWRITES:LIST='-DPARAM_PRECISION=precision64Bit'"
-flags[5]="-DPARAM_OVERWRITES:LIST='-DPARAM_DIMENSION=DIM2'"
-flags[6]="-DPARAM_OVERWRITES:LIST='-DPARAM_CURRENTSOLVER=Esirkepov;-DPARAM_PARTICLESHAPE=CIC;-DPARAM_DIMENSION=DIM2'"
-flags[7]="-DPARAM_OVERWRITES:LIST='-DPARAM_PRECISION=precision64Bit;-DPARAM_DIMENSION=DIM2'"
-flags[8]="-DPARAM_OVERWRITES:LIST='-DPARAM_DIMENSION=DIM2'"
-flags[9]="-DPARAM_OVERWRITES:LIST='-DPARAM_IONS=1;-DPARAM_IONIZATION=1'"
+flags[1]="-DPARAM_OVERWRITES:LIST='-DPARAM_DIMENSION=DIM2'"
+flags[2]="-DPARAM_OVERWRITES:LIST='-DPARAM_IONS=1;-DPARAM_IONIZATION=1'"
 
 ################################################################################
 # execution
diff --git a/share/picongpu/examples/LaserWakefield/etc/picongpu/1.cfg b/share/picongpu/examples/LaserWakefield/etc/picongpu/1.cfg
index 2b0968084d..1a2163a8c3 100644
--- a/share/picongpu/examples/LaserWakefield/etc/picongpu/1.cfg
+++ b/share/picongpu/examples/LaserWakefield/etc/picongpu/1.cfg
@@ -1,4 +1,4 @@
-# Copyright 2013-2020 Axel Huebl, Rene Widera, Felix Schmitt
+# Copyright 2013-2021 Axel Huebl, Rene Widera, Felix Schmitt, Franz Poeschel
 #
 # This file is part of PIConGPU.
 #
@@ -63,10 +63,12 @@ TBG_e_PSypy="--e_phaseSpace.period 100                         \
              --e_phaseSpace.min -1.0 --e_phaseSpace.max 1.0    \
              --e_phaseSpace.filter all"
 
-# HDF5 raw data output (DISABLED, add to TBG_plugins below to ENABLE!)
-TBG_hdf5="--hdf5.period 100   \
-          --hdf5.file simData \
-          --hdf5.source 'species_all,fields_all'"
+TBG_openPMD="--openPMD.period 100   \
+             --openPMD.file simData \
+             --openPMD.ext bp \
+             --checkpoint.backend openPMD \
+             --checkpoint.period 100 \
+             --checkpoint.restart.backend openPMD"
 
 # macro particle counter (electrons, debug information for memory)
 TBG_e_macroCount="--e_macroParticlesCount.period 100"
@@ -74,7 +76,8 @@ TBG_e_macroCount="--e_macroParticlesCount.period 100"
 TBG_plugins="!TBG_pngYX                    \
              !TBG_e_histogram              \
              !TBG_e_PSypy                  \
-             !TBG_e_macroCount"
+             !TBG_e_macroCount             \
+             !TBG_openPMD"
 
 #################################
 ## Section: Program Parameters ##
diff --git a/share/picongpu/examples/LaserWakefield/etc/picongpu/16.cfg b/share/picongpu/examples/LaserWakefield/etc/picongpu/16.cfg
index 3c05294d63..304bc558f7 100644
--- a/share/picongpu/examples/LaserWakefield/etc/picongpu/16.cfg
+++ b/share/picongpu/examples/LaserWakefield/etc/picongpu/16.cfg
@@ -1,4 +1,4 @@
-# Copyright 2013-2020 Axel Huebl, Rene Widera, Felix Schmitt
+# Copyright 2013-2021 Axel Huebl, Rene Widera, Felix Schmitt
 #
 # This file is part of PIConGPU.
 #
diff --git a/share/picongpu/examples/LaserWakefield/etc/picongpu/1_isaac.cfg b/share/picongpu/examples/LaserWakefield/etc/picongpu/1_isaac.cfg
index 4f3e2bb7e2..f530b53993 100644
--- a/share/picongpu/examples/LaserWakefield/etc/picongpu/1_isaac.cfg
+++ b/share/picongpu/examples/LaserWakefield/etc/picongpu/1_isaac.cfg
@@ -1,4 +1,4 @@
-# Copyright 2013-2020 Axel Huebl, Rene Widera, Felix Schmitt
+# Copyright 2013-2021 Axel Huebl, Rene Widera, Felix Schmitt
 #
 # This file is part of PIConGPU.
 #
diff --git a/share/picongpu/examples/LaserWakefield/etc/picongpu/32.cfg b/share/picongpu/examples/LaserWakefield/etc/picongpu/32.cfg
index c34e36cf02..af81abff70 100644
--- a/share/picongpu/examples/LaserWakefield/etc/picongpu/32.cfg
+++ b/share/picongpu/examples/LaserWakefield/etc/picongpu/32.cfg
@@ -1,4 +1,4 @@
-# Copyright 2013-2020 Axel Huebl, Felix Schmitt
+# Copyright 2013-2021 Axel Huebl, Felix Schmitt
 #
 # This file is part of PIConGPU.
 #
diff --git a/share/picongpu/examples/LaserWakefield/etc/picongpu/4.cfg b/share/picongpu/examples/LaserWakefield/etc/picongpu/4.cfg
index dac3e42c1f..7595db871b 100644
--- a/share/picongpu/examples/LaserWakefield/etc/picongpu/4.cfg
+++ b/share/picongpu/examples/LaserWakefield/etc/picongpu/4.cfg
@@ -1,4 +1,4 @@
-# Copyright 2013-2020 Axel Huebl, Rene Widera, Felix Schmitt
+# Copyright 2013-2021 Axel Huebl, Rene Widera, Felix Schmitt, Franz Poeschel
 #
 # This file is part of PIConGPU.
 #
@@ -64,10 +64,12 @@ TBG_e_PSypy="--e_phaseSpace.period 100                         \
              --e_phaseSpace.min -1.0 --e_phaseSpace.max 1.0    \
              --e_phaseSpace.filter all"
 
-# HDF5 raw data output (DISABLED, add to TBG_plugins below to ENABLE!)
-TBG_hdf5="--hdf5.period 100   \
-          --hdf5.file simData \
-          --hdf5.source 'species_all,fields_all'"
+TBG_openPMD="--openPMD.period 100   \
+            --openPMD.file simData \
+            --openPMD.ext bp \
+            --checkpoint.backend openPMD \
+            --checkpoint.period 100 \
+            --checkpoint.restart.backend openPMD"
 
 # macro particle counter (electrons, debug information for memory)
 TBG_e_macroCount="--e_macroParticlesCount.period 100"
@@ -75,7 +77,8 @@ TBG_e_macroCount="--e_macroParticlesCount.period 100"
 TBG_plugins="!TBG_pngYX                    \
              !TBG_e_histogram              \
              !TBG_e_PSypy                  \
-             !TBG_e_macroCount"
+             !TBG_e_macroCount             \
+             !TBG_openPMD"
 
 
 #################################
diff --git a/share/picongpu/examples/LaserWakefield/etc/picongpu/4_gui.cfg b/share/picongpu/examples/LaserWakefield/etc/picongpu/4_gui.cfg
index 115b3eb369..15453cda90 100644
--- a/share/picongpu/examples/LaserWakefield/etc/picongpu/4_gui.cfg
+++ b/share/picongpu/examples/LaserWakefield/etc/picongpu/4_gui.cfg
@@ -1,4 +1,4 @@
-# Copyright 2013-2020 Axel Huebl, Rene Widera, Felix Schmitt
+# Copyright 2013-2021 Axel Huebl, Rene Widera, Felix Schmitt
 #
 # This file is part of PIConGPU.
 #
diff --git a/share/picongpu/examples/LaserWakefield/etc/picongpu/4_isaac.cfg b/share/picongpu/examples/LaserWakefield/etc/picongpu/4_isaac.cfg
index fdcabb2781..6e76ba627a 100644
--- a/share/picongpu/examples/LaserWakefield/etc/picongpu/4_isaac.cfg
+++ b/share/picongpu/examples/LaserWakefield/etc/picongpu/4_isaac.cfg
@@ -1,4 +1,4 @@
-# Copyright 2013-2020 Axel Huebl, Rene Widera, Felix Schmitt
+# Copyright 2013-2021 Axel Huebl, Rene Widera, Felix Schmitt
 #
 # This file is part of PIConGPU.
 #
diff --git a/share/picongpu/examples/LaserWakefield/etc/picongpu/8.cfg b/share/picongpu/examples/LaserWakefield/etc/picongpu/8.cfg
index 105b4f0b9e..0039e71d07 100644
--- a/share/picongpu/examples/LaserWakefield/etc/picongpu/8.cfg
+++ b/share/picongpu/examples/LaserWakefield/etc/picongpu/8.cfg
@@ -1,4 +1,4 @@
-# Copyright 2013-2020 Axel Huebl, Rene Widera, Felix Schmitt
+# Copyright 2013-2021 Axel Huebl, Rene Widera, Felix Schmitt, Franz Poeschel
 #
 # This file is part of PIConGPU.
 #
@@ -64,10 +64,12 @@ TBG_e_PSypy="--e_phaseSpace.period 100                         \
              --e_phaseSpace.min -1.0 --e_phaseSpace.max 1.0    \
              --e_phaseSpace.filter all"
 
-# HDF5 raw data output (DISABLED, add to TBG_plugins below to ENABLE!)
-TBG_hdf5="--hdf5.period 100   \
-          --hdf5.file simData \
-          --hdf5.source 'species_all,fields_all'"
+TBG_openPMD="--openPMD.period 100   \
+            --openPMD.file simData \
+            --openPMD.ext bp \
+            --checkpoint.backend openPMD \
+            --checkpoint.period 100 \
+            --checkpoint.restart.backend openPMD"
 
 # macro particle counter (electrons, debug information for memory)
 TBG_e_macroCount="--e_macroParticlesCount.period 100"
@@ -75,7 +77,8 @@ TBG_e_macroCount="--e_macroParticlesCount.period 100"
 TBG_plugins="!TBG_pngYX                    \
              !TBG_e_histogram              \
              !TBG_e_PSypy                  \
-             !TBG_e_macroCount"
+             !TBG_e_macroCount             \
+             !TBG_openPMD"
 
 #################################
 ## Section: Program Parameters ##
diff --git a/share/picongpu/examples/LaserWakefield/etc/picongpu/8_isaac.cfg b/share/picongpu/examples/LaserWakefield/etc/picongpu/8_isaac.cfg
index 2215846be1..0bfacd9b13 100644
--- a/share/picongpu/examples/LaserWakefield/etc/picongpu/8_isaac.cfg
+++ b/share/picongpu/examples/LaserWakefield/etc/picongpu/8_isaac.cfg
@@ -1,4 +1,4 @@
-# Copyright 2013-2020 Axel Huebl, Rene Widera, Felix Schmitt
+# Copyright 2013-2021 Axel Huebl, Rene Widera, Felix Schmitt
 #
 # This file is part of PIConGPU.
 #
diff --git a/share/picongpu/examples/LaserWakefield/include/picongpu/param/density.param b/share/picongpu/examples/LaserWakefield/include/picongpu/param/density.param
index 8122542894..4729dcb4ef 100644
--- a/share/picongpu/examples/LaserWakefield/include/picongpu/param/density.param
+++ b/share/picongpu/examples/LaserWakefield/include/picongpu/param/density.param
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Rene Widera, Felix Schmitt,
+/* Copyright 2013-2021 Axel Huebl, Rene Widera, Felix Schmitt,
  *                     Richard Pausch, Marco Garten
  *
  * This file is part of PIConGPU.
@@ -26,59 +26,57 @@
 
 namespace picongpu
 {
-namespace SI
-{
-    /** Base density in particles per m^3 in the density profiles.
-     *
-     * This is often taken as reference maximum density in normalized profiles.
-     * Individual particle species can define a `densityRatio` flag relative
-     * to this value.
-     *
-     * unit: ELEMENTS/m^3
-     */
+    namespace SI
+    {
+        /** Base density in particles per m^3 in the density profiles.
+         *
+         * This is often taken as reference maximum density in normalized profiles.
+         * Individual particle species can define a `densityRatio` flag relative
+         * to this value.
+         *
+         * unit: ELEMENTS/m^3
+         */
 #ifndef PARAM_BASE_DENSITY_SI
-#   define PARAM_BASE_DENSITY_SI 1.e25
+#    define PARAM_BASE_DENSITY_SI 1.e25
 #endif
-     constexpr float_64 BASE_DENSITY_SI = PARAM_BASE_DENSITY_SI;
-}
+        constexpr float_64 BASE_DENSITY_SI = PARAM_BASE_DENSITY_SI;
+    } // namespace SI
 
-namespace densityProfiles
-{
-    PMACC_STRUCT(GaussianParameter,
-        /** Profile Formula:
-         *   constexpr float_X exponent = abs((y - gasCenter_SI) / gasSigma_SI);
-         *   constexpr float_X density = exp(gasFactor * pow(exponent, gasPower));
-         *
-         *   takes `gasCenterLeft_SI      for y < gasCenterLeft_SI`,
-         *         `gasCenterRight_SI     for y > gasCenterRight_SI`,
-         *   and exponent = 0.0  for gasCenterLeft_SI < y < gasCenterRight_SI
-         */
-        (PMACC_C_VALUE(float_X, gasFactor, -1.0))
-        (PMACC_C_VALUE(float_X, gasPower, 4.0))
+    namespace densityProfiles
+    {
+        PMACC_STRUCT(
+            GaussianParameter,
+            /** Profile Formula:
+             *   constexpr float_X exponent = abs((y - gasCenter_SI) / gasSigma_SI);
+             *   constexpr float_X density = exp(gasFactor * pow(exponent, gasPower));
+             *
+             *   takes `gasCenterLeft_SI      for y < gasCenterLeft_SI`,
+             *         `gasCenterRight_SI     for y > gasCenterRight_SI`,
+             *   and exponent = 0.0  for gasCenterLeft_SI < y < gasCenterRight_SI
+             */
+            (PMACC_C_VALUE(float_X, gasFactor, -1.0))(PMACC_C_VALUE(float_X, gasPower, 4.0))
 
-        /** height of vacuum area on top border
-         *
-         *  this vacuum is important because of the laser initialization,
-         *  which is done in the first cells of the simulation and
-         *  assumes a charge-free volume
-         *  unit: cells
-         */
-        (PMACC_C_VALUE(uint32_t, vacuumCellsY, 50))
+            /** height of vacuum area on top border
+             *
+             *  this vacuum is important because of the laser initialization,
+             *  which is done in the first cells of the simulation and
+             *  assumes a charge-free volume
+             *  unit: cells
+             */
+            (PMACC_C_VALUE(uint32_t, vacuumCellsY, 50))
 
-        /** The central position of the gas distribution
-          *  unit: meter
-          */
-        (PMACC_C_VALUE(float_64, gasCenterLeft_SI, 8.0e-5))
-        (PMACC_C_VALUE(float_64, gasCenterRight_SI, 10.0e-5))
+            /** The central position of the gas distribution
+             *  unit: meter
+             */
+            (PMACC_C_VALUE(float_64, gasCenterLeft_SI, 8.0e-5))(PMACC_C_VALUE(float_64, gasCenterRight_SI, 10.0e-5))
 
-        /** the distance from gasCenter_SI until the gas density decreases to its 1/e-th part
-          *  unit: meter
-          */
-        (PMACC_C_VALUE(float_64, gasSigmaLeft_SI, 8.0e-5))
-        (PMACC_C_VALUE(float_64, gasSigmaRight_SI, 8.0e-5))
-    ); /* struct GaussianParam */
+            /** the distance from gasCenter_SI until the gas density decreases to its 1/e-th part
+             *  unit: meter
+             */
+            (PMACC_C_VALUE(float_64, gasSigmaLeft_SI, 8.0e-5))(
+                PMACC_C_VALUE(float_64, gasSigmaRight_SI, 8.0e-5))); /* struct GaussianParam */
 
-    /* definition of density with Gaussian profile */
-    using Gaussian = GaussianImpl< GaussianParameter >;
-}
-}
+        /* definition of density with Gaussian profile */
+        using Gaussian = GaussianImpl<GaussianParameter>;
+    } // namespace densityProfiles
+} // namespace picongpu
diff --git a/share/picongpu/examples/LaserWakefield/include/picongpu/param/dimension.param b/share/picongpu/examples/LaserWakefield/include/picongpu/param/dimension.param
index 9f14baaec3..9cda9d9a01 100644
--- a/share/picongpu/examples/LaserWakefield/include/picongpu/param/dimension.param
+++ b/share/picongpu/examples/LaserWakefield/include/picongpu/param/dimension.param
@@ -1,4 +1,4 @@
-/* Copyright 2014-2020 Axel Huebl, Rene Widera
+/* Copyright 2014-2021 Axel Huebl, Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -20,7 +20,7 @@
 #pragma once
 
 #ifndef PARAM_DIMENSION
-#define PARAM_DIMENSION DIM3
+#    define PARAM_DIMENSION DIM3
 #endif
 
 #define SIMDIM PARAM_DIMENSION
diff --git a/share/picongpu/examples/LaserWakefield/include/picongpu/param/fieldSolver.param b/share/picongpu/examples/LaserWakefield/include/picongpu/param/fieldSolver.param
deleted file mode 100644
index 5ae21b62e2..0000000000
--- a/share/picongpu/examples/LaserWakefield/include/picongpu/param/fieldSolver.param
+++ /dev/null
@@ -1,82 +0,0 @@
-/* Copyright 2013-2020 Axel Huebl, Heiko Burau, Rene Widera
- *
- * This file is part of PIConGPU.
- *
- * PIConGPU is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * PIConGPU is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with PIConGPU.
- * If not, see <http://www.gnu.org/licenses/>.
- */
-
-/** @file
- *
- * Configure the field solver.
- *
- * Select the numerical Maxwell solver (e.g. Yee's method).
- *
- * Also allows to configure ad hoc mitigations for high frequency
- * noise in some setups via current smoothing.
- */
-
-#pragma once
-
-#include "picongpu/fields/MaxwellSolver/Solvers.def"
-#include "picongpu/fields/currentInterpolation/CurrentInterpolation.def"
-
-
-namespace picongpu
-{
-namespace fields
-{
-
-    /** Current Interpolation
-     *
-     * CurrentInterpolation is used to set a method performing the
-     * interpolate/assign operation from the generated currents of particle
-     * species to the electro-magnetic fields.
-     *
-     * Allowed values are:
-     *   - None:
-     *     - default for staggered grids/Yee-scheme
-     *     - updates E
-     *   - Binomial: 2nd order Binomial filter
-     *     - smooths the current before assignment in staggered grid
-     *     - updates E & breaks local charge conservation slightly
-     *   - NoneDS:
-     *     - experimental assignment for all-centered/directional splitting
-     *     - updates E & B at the same time
-     */
-#ifndef PARAM_CURRENTINTERPOLATION
-#   define PARAM_CURRENTINTERPOLATION None
-#endif
-    using CurrentInterpolation = currentInterpolation::PARAM_CURRENTINTERPOLATION;
-
-    /** FieldSolver
-     *
-     * Field Solver Selection:
-     *  - Yee< CurrentInterpolation >: standard Yee solver
-     *  - YeePML< CurrentInterpolation >: standard Yee solver with PML absorber
-     *  - Lehe< CurrentInterpolation >: Num. Cherenkov free field solver in a chosen direction
-     *  - DirSplitting< CurrentInterpolation >: Sentoku's Directional Splitting Method
-     *  - None< CurrentInterpolation >: disable the vacuum update of E and B
-     */
-
-#ifndef PARAM_FIELDSOLVER
-    /* WARNING: if you change field solver by hand please update your CELL_WIDTH_SI
-     * in `grid.param` to fulfill the convergence condition (CFL)
-     */
-#   define PARAM_FIELDSOLVER Yee
-#endif
-    using Solver = maxwellSolver::PARAM_FIELDSOLVER< CurrentInterpolation >;
-
-} // namespace fields
-} // namespace picongpu
diff --git a/share/picongpu/examples/LaserWakefield/include/picongpu/param/grid.param b/share/picongpu/examples/LaserWakefield/include/picongpu/param/grid.param
index a44d66b4e1..b5d81edf76 100644
--- a/share/picongpu/examples/LaserWakefield/include/picongpu/param/grid.param
+++ b/share/picongpu/examples/LaserWakefield/include/picongpu/param/grid.param
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Rene Widera, Benjamin Worpitz
+/* Copyright 2013-2021 Axel Huebl, Rene Widera, Benjamin Worpitz
  *
  * This file is part of PIConGPU.
  *
@@ -18,12 +18,10 @@
  */
 
 
-
 #pragma once
 
 namespace picongpu
 {
-
     namespace SI
     {
         /** Duration of one timestep
@@ -52,21 +50,21 @@ namespace picongpu
          * behave like the interaction of infinite "wire particles"
          * in fields with perfect symmetry in Z.
          */
-    } //namespace SI
+    } // namespace SI
 
     //! Defines the size of the absorbing zone (in cells)
     constexpr uint32_t ABSORBER_CELLS[3][2] = {
-        {32, 32},  /*x direction [negative,positive]*/
-        {32, 32},  /*y direction [negative,positive]*/
-        {32, 32}   /*z direction [negative,positive]*/
-    }; //unit: number of cells
+        {32, 32}, /*x direction [negative,positive]*/
+        {32, 32}, /*y direction [negative,positive]*/
+        {32, 32} /*z direction [negative,positive]*/
+    }; // unit: number of cells
 
     //! Define the strength of the absorber for any direction
     constexpr float_X ABSORBER_STRENGTH[3][2] = {
         {1.0e-3, 1.0e-3}, /*x direction [negative,positive]*/
         {1.0e-3, 1.0e-3}, /*y direction [negative,positive]*/
-        {1.0e-3, 1.0e-3}  /*z direction [negative,positive]*/
-    }; //unit: none
+        {1.0e-3, 1.0e-3} /*z direction [negative,positive]*/
+    }; // unit: none
 
     /** When to move the co-moving window.
      *  An initial pseudo particle, flying with the speed of light,
@@ -85,7 +83,4 @@ namespace picongpu
      */
     constexpr float_64 movePoint = 0.90;
 
-}
-
-
-
+} // namespace picongpu
diff --git a/share/picongpu/examples/LaserWakefield/include/picongpu/param/laser.param b/share/picongpu/examples/LaserWakefield/include/picongpu/param/laser.param
index ce55cf26d6..264f694d29 100644
--- a/share/picongpu/examples/LaserWakefield/include/picongpu/param/laser.param
+++ b/share/picongpu/examples/LaserWakefield/include/picongpu/param/laser.param
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Anton Helm, Rene Widera, Richard Pausch, Alexander Debus
+/* Copyright 2013-2021 Axel Huebl, Anton Helm, Rene Widera, Richard Pausch, Alexander Debus
  *
  * This file is part of PIConGPU.
  *
@@ -46,119 +46,123 @@
 #include "picongpu/fields/laserProfiles/profiles.def"
 
 #ifndef PARAM_A0
-#   define PARAM_A0 8.0
+#    define PARAM_A0 8.0
 #endif
 
 #ifndef PARAM_WAVE_LENGTH_SI
-#   define PARAM_WAVE_LENGTH_SI 0.8e-6
+#    define PARAM_WAVE_LENGTH_SI 0.8e-6
 #endif
 
 #ifndef PARAM_PULSE_LENGTH_SI
-#   define PARAM_PULSE_LENGTH_SI 5.e-15
+#    define PARAM_PULSE_LENGTH_SI 5.e-15
 #endif
 
 namespace picongpu
 {
-namespace fields
-{
-namespace laserProfiles
-{
-namespace gaussianBeam
-{
-    //! Use only the 0th Laguerremode for a standard Gaussian
-    static constexpr uint32_t MODENUMBER = 0;
-    PMACC_CONST_VECTOR(float_X, MODENUMBER + 1, LAGUERREMODES, 1.0);
-    // This is just an example for a more complicated set of Laguerre modes
-    //constexpr uint32_t MODENUMBER = 12;
-    //PMACC_CONST_VECTOR(float_X, MODENUMBER + 1, LAGUERREMODES, -1.0, 0.0300519, 0.319461, -0.23783, 0.0954839, 0.0318653, -0.144547, 0.0249208, -0.111989, 0.0434385, -0.030038, -0.00896321, -0.0160788);
-
-} // namespace gaussianBeam
-
-    struct GaussianBeamParam
+    namespace fields
     {
-        /** unit: meter */
-        static constexpr float_64 WAVE_LENGTH_SI = PARAM_WAVE_LENGTH_SI;
-
-        /** Convert the normalized laser strength parameter a0 to Volt per meter */
-        static constexpr float_64 UNITCONV_A0_to_Amplitude_SI = -2.0 * PI / WAVE_LENGTH_SI * ::picongpu::SI::ELECTRON_MASS_SI * ::picongpu::SI::SPEED_OF_LIGHT_SI * ::picongpu::SI::SPEED_OF_LIGHT_SI / ::picongpu::SI::ELECTRON_CHARGE_SI;
-
-        /** unit: W / m^2 */
-        // calculate: _A0 = 8.549297e-6 * sqrt( Intensity[W/m^2] ) * wavelength[m] (linearly polarized)
-
-        /** unit: none */
-        static constexpr float_64 _A0  = PARAM_A0;
-
-        /** unit: Volt / meter */
-        static constexpr float_64 AMPLITUDE_SI = _A0 * UNITCONV_A0_to_Amplitude_SI;
-
-        /** unit: Volt / meter */
-        //static constexpr float_64 AMPLITUDE_SI = 1.738e13;
-
-        /** Pulse length: sigma of std. gauss for intensity (E^2)
-         *  PULSE_LENGTH_SI = FWHM_of_Intensity   / [ 2*sqrt{ 2* ln(2) } ]
-         *                                          [    2.354820045     ]
-         *  Info:             FWHM_of_Intensity = FWHM_Illumination
-         *                      = what a experimentalist calls "pulse duration"
-         *
-         *  unit: seconds (1 sigma) */
-        static constexpr float_64 PULSE_LENGTH_SI = PARAM_PULSE_LENGTH_SI;
-
-        /** beam waist: distance from the axis where the pulse intensity (E^2)
-         *              decreases to its 1/e^2-th part,
-         *              at the focus position of the laser
-         * W0_SI = FWHM_of_Intensity / sqrt{ 2* ln(2) }
-         *                             [   1.17741    ]
-         *
-         *  unit: meter */
-        static constexpr float_64 W0_SI = 5.0e-6 / 1.17741;
-        /** the distance to the laser focus in y-direction
-         *  unit: meter */
-        static constexpr float_64 FOCUS_POS_SI = 4.62e-5;
-
-        /** The laser pulse will be initialized PULSE_INIT times of the PULSE_LENGTH
-         *
-         *  unit: none */
-        static constexpr float_64 PULSE_INIT = 15.0;
-
-        /** cell from top where the laser is initialized
-         *
-         * if `initPlaneY == 0` than the absorber are disabled.
-         * if `initPlaneY > absorbercells negative Y` the negative absorber in y
-         * direction is enabled
-         *
-         * valid ranges:
-         *   - initPlaneY == 0
-         *   - absorber cells negative Y < initPlaneY < cells in y direction of the top gpu
-         */
-        static constexpr uint32_t initPlaneY = 0;
-
-        /** laser phase shift (no shift: 0.0)
-         *
-         * sin(omega*time + laser_phase): starts with phase=0 at center --> E-field=0 at center
-         *
-         * unit: rad, periodic in 2*pi
-         */
-        static constexpr float_X LASER_PHASE = 0.0;
-
-        using LAGUERREMODES_t = gaussianBeam::LAGUERREMODES_t;
-        static constexpr uint32_t MODENUMBER = gaussianBeam::MODENUMBER;
-
-        /** Available polarisation types
-         */
-        enum PolarisationType
+        namespace laserProfiles
         {
-            LINEAR_X = 1u,
-            LINEAR_Z = 2u,
-            CIRCULAR = 4u,
-        };
-        /** Polarization selection
-         */
-        static constexpr PolarisationType Polarisation = CIRCULAR;
-    };
-
-    //! currently selected laser profile
-    using Selected = GaussianBeam< GaussianBeamParam >;
-
-} // namespace laserProfiles
-} // namespace fields
+            namespace gaussianBeam
+            {
+                //! Use only the 0th Laguerremode for a standard Gaussian
+                static constexpr uint32_t MODENUMBER = 0;
+                PMACC_CONST_VECTOR(float_X, MODENUMBER + 1, LAGUERREMODES, 1.0);
+                // This is just an example for a more complicated set of Laguerre modes
+                // constexpr uint32_t MODENUMBER = 12;
+                // PMACC_CONST_VECTOR(float_X, MODENUMBER + 1, LAGUERREMODES, -1.0, 0.0300519, 0.319461, -0.23783,
+                // 0.0954839, 0.0318653, -0.144547, 0.0249208, -0.111989, 0.0434385, -0.030038, -0.00896321,
+                // -0.0160788);
+
+            } // namespace gaussianBeam
+
+            struct GaussianBeamParam
+            {
+                /** unit: meter */
+                static constexpr float_64 WAVE_LENGTH_SI = PARAM_WAVE_LENGTH_SI;
+
+                /** Convert the normalized laser strength parameter a0 to Volt per meter */
+                static constexpr float_64 UNITCONV_A0_to_Amplitude_SI = -2.0 * PI / WAVE_LENGTH_SI
+                    * ::picongpu::SI::ELECTRON_MASS_SI * ::picongpu::SI::SPEED_OF_LIGHT_SI
+                    * ::picongpu::SI::SPEED_OF_LIGHT_SI / ::picongpu::SI::ELECTRON_CHARGE_SI;
+
+                /** unit: W / m^2 */
+                // calculate: _A0 = 8.549297e-6 * sqrt( Intensity[W/m^2] ) * wavelength[m] (linearly polarized)
+
+                /** unit: none */
+                static constexpr float_64 _A0 = PARAM_A0;
+
+                /** unit: Volt / meter */
+                static constexpr float_64 AMPLITUDE_SI = _A0 * UNITCONV_A0_to_Amplitude_SI;
+
+                /** unit: Volt / meter */
+                // static constexpr float_64 AMPLITUDE_SI = 1.738e13;
+
+                /** Pulse length: sigma of std. gauss for intensity (E^2)
+                 *  PULSE_LENGTH_SI = FWHM_of_Intensity   / [ 2*sqrt{ 2* ln(2) } ]
+                 *                                          [    2.354820045     ]
+                 *  Info:             FWHM_of_Intensity = FWHM_Illumination
+                 *                      = what an experimentalist calls "pulse duration"
+                 *
+                 *  unit: seconds (1 sigma) */
+                static constexpr float_64 PULSE_LENGTH_SI = PARAM_PULSE_LENGTH_SI;
+
+                /** beam waist: distance from the axis where the pulse intensity (E^2)
+                 *              decreases to its 1/e^2-th part,
+                 *              at the focus position of the laser
+                 * W0_SI = FWHM_of_Intensity / sqrt{ 2* ln(2) }
+                 *                             [   1.17741    ]
+                 *
+                 *  unit: meter */
+                static constexpr float_64 W0_SI = 5.0e-6 / 1.17741;
+                /** the distance to the laser focus in y-direction
+                 *  unit: meter */
+                static constexpr float_64 FOCUS_POS_SI = 4.62e-5;
+
+                /** The laser pulse will be initialized PULSE_INIT times of the PULSE_LENGTH
+                 *
+                 *  unit: none */
+                static constexpr float_64 PULSE_INIT = 15.0;
+
+                /** cell from top where the laser is initialized
+                 *
+                 * if `initPlaneY == 0` then the absorbers are disabled.
+                 * if `initPlaneY > absorbercells negative Y` the negative absorber in y
+                 * direction is enabled
+                 *
+                 * valid ranges:
+                 *   - initPlaneY == 0
+                 *   - absorber cells negative Y < initPlaneY < cells in y direction of the top gpu
+                 */
+                static constexpr uint32_t initPlaneY = 0;
+
+                /** laser phase shift (no shift: 0.0)
+                 *
+                 * sin(omega*time + laser_phase): starts with phase=0 at center --> E-field=0 at center
+                 *
+                 * unit: rad, periodic in 2*pi
+                 */
+                static constexpr float_X LASER_PHASE = 0.0;
+
+                using LAGUERREMODES_t = gaussianBeam::LAGUERREMODES_t;
+                static constexpr uint32_t MODENUMBER = gaussianBeam::MODENUMBER;
+
+                /** Available polarisation types
+                 */
+                enum PolarisationType
+                {
+                    LINEAR_X = 1u,
+                    LINEAR_Z = 2u,
+                    CIRCULAR = 4u,
+                };
+                /** Polarization selection
+                 */
+                static constexpr PolarisationType Polarisation = CIRCULAR;
+            };
+
+            //! currently selected laser profile
+            using Selected = GaussianBeam<GaussianBeamParam>;
+
+        } // namespace laserProfiles
+    } // namespace fields
 } // namespace picongpu
diff --git a/share/picongpu/examples/LaserWakefield/include/picongpu/param/particle.param b/share/picongpu/examples/LaserWakefield/include/picongpu/param/particle.param
index 5048da8db7..2c92549011 100644
--- a/share/picongpu/examples/LaserWakefield/include/picongpu/param/particle.param
+++ b/share/picongpu/examples/LaserWakefield/include/picongpu/param/particle.param
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Rene Widera, Marco Garten, Benjamin Worpitz,
+/* Copyright 2013-2021 Axel Huebl, Rene Widera, Marco Garten, Benjamin Worpitz,
  *                     Richard Pausch
  *
  * This file is part of PIConGPU.
@@ -28,53 +28,49 @@
 
 namespace picongpu
 {
-namespace particles
-{
-
-    /** a particle with a weighting below MIN_WEIGHTING will not
-     *      be created / will be deleted
-     *  unit: none
-     */
-    constexpr float_X MIN_WEIGHTING = 10.0;
-
-namespace startPosition
-{
-
-    struct RandomParameter2ppc
+    namespace particles
     {
-        /** Count of particles per cell at initial state
+        /** a particle with a weighting below MIN_WEIGHTING will not
+         *      be created / will be deleted
          *  unit: none
          */
-        static constexpr uint32_t numParticlesPerCell = 2u;
-    };
-    using Random2ppc = RandomImpl< RandomParameter2ppc >;
+        constexpr float_X MIN_WEIGHTING = 10.0;
 
-} // namespace startPosition
+        namespace startPosition
+        {
+            struct RandomParameter2ppc
+            {
+                /** Count of particles per cell at initial state
+                 *  unit: none
+                 */
+                static constexpr uint32_t numParticlesPerCell = 2u;
+            };
+            using Random2ppc = RandomImpl<RandomParameter2ppc>;
 
-    /** During unit normalization, we assume this is a typical
-     *  number of particles per cell for normalization of weighted
-     *  particle attributes.
-     */
-    constexpr uint32_t TYPICAL_PARTICLES_PER_CELL =
-        startPosition::RandomParameter2ppc::numParticlesPerCell;
+        } // namespace startPosition
 
-namespace manipulators
-{
+        /** During unit normalization, we assume this is a typical
+         *  number of particles per cell for normalization of weighted
+         *  particle attributes.
+         */
+        constexpr uint32_t TYPICAL_PARTICLES_PER_CELL = startPosition::RandomParameter2ppc::numParticlesPerCell;
 
-    struct SetIonToNeutral
-    {
-        template< typename T_Particle >
-        DINLINE void operator()( T_Particle & particle )
+        namespace manipulators
         {
-            using Particle = T_Particle;
+            struct SetIonToNeutral
+            {
+                template<typename T_Particle>
+                DINLINE void operator()(T_Particle& particle)
+                {
+                    using Particle = T_Particle;
 
-            // number of bound electrons at initialization state of the neutral atom
-            float_X const protonNumber = traits::GetAtomicNumbers< T_Particle >::type::numberOfProtons;
+                    // number of bound electrons at initialization state of the neutral atom
+                    float_X const protonNumber = traits::GetAtomicNumbers<T_Particle>::type::numberOfProtons;
 
-            particle[ boundElectrons_ ] = protonNumber;
-        }
-    };
-    using SetBoundElectrons = generic::Free< SetIonToNeutral >;
-} // namespace manipulators
-} // namespace particles
+                    particle[boundElectrons_] = protonNumber;
+                }
+            };
+            using SetBoundElectrons = generic::Free<SetIonToNeutral>;
+        } // namespace manipulators
+    } // namespace particles
 } // namespace picongpu
diff --git a/share/picongpu/examples/LaserWakefield/include/picongpu/param/png.param b/share/picongpu/examples/LaserWakefield/include/picongpu/param/png.param
index 445b8bd10f..65620389a6 100644
--- a/share/picongpu/examples/LaserWakefield/include/picongpu/param/png.param
+++ b/share/picongpu/examples/LaserWakefield/include/picongpu/param/png.param
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Heiko Burau, Rene Widera, Richard Pausch
+/* Copyright 2013-2021 Axel Huebl, Heiko Burau, Rene Widera, Richard Pausch
  *
  * This file is part of PIConGPU.
  *
@@ -24,17 +24,17 @@
 
 namespace picongpu
 {
-/*scale image before write to file, only scale if value is not 1.0
- */
-constexpr float_64 scale_image = 1.0;
+    /*scale image before write to file, only scale if value is not 1.0
+     */
+    constexpr float_64 scale_image = 1.0;
 
-/*if true image is scaled if cellsize is not quadratic, else no scale*/
-constexpr bool scale_to_cellsize = true;
+    /*if true image is scaled if cellsize is not quadratic, else no scale*/
+    constexpr bool scale_to_cellsize = true;
 
-constexpr bool white_box_per_GPU = false;
+    constexpr bool white_box_per_GPU = false;
 
-namespace visPreview
-{
+    namespace visPreview
+    {
 // normalize EM fields to typical laser or plasma quantities
 //-1: Auto:    enable adaptive scaling for each output
 // 1: Laser:   typical fields calculated out of the laser amplitude
@@ -49,33 +49,32 @@ namespace visPreview
 #define EM_FIELD_SCALE_CHANNEL2 -1
 #define EM_FIELD_SCALE_CHANNEL3 -1
 
-// multiply highest undisturbed particle density with factor
-constexpr float_X preParticleDens_opacity = 0.25;
-constexpr float_X preChannel1_opacity = 1.0;
-constexpr float_X preChannel2_opacity = 1.0;
-constexpr float_X preChannel3_opacity = 1.0;
-
-// specify color scales for each channel
-namespace preParticleDensCol = colorScales::grayInv;
-namespace preChannel1Col = colorScales::green;
-namespace preChannel2Col = colorScales::none;
-namespace preChannel3Col = colorScales::none;
+        // multiply highest undisturbed particle density with factor
+        constexpr float_X preParticleDens_opacity = 0.25;
+        constexpr float_X preChannel1_opacity = 1.0;
+        constexpr float_X preChannel2_opacity = 1.0;
+        constexpr float_X preChannel3_opacity = 1.0;
 
-/* png preview settings for each channel */
-DINLINE float_X preChannel1(const float3_X& field_B, const float3_X& field_E, const float3_X& field_J)
-{
-    return field_E.x() * field_E.x();
-}
+        // specify color scales for each channel
+        namespace preParticleDensCol = colorScales::grayInv;
+        namespace preChannel1Col = colorScales::green;
+        namespace preChannel2Col = colorScales::none;
+        namespace preChannel3Col = colorScales::none;
 
-DINLINE float_X preChannel2(const float3_X& field_B, const float3_X& field_E, const float3_X& field_J)
-{
-    return field_E.y();
-}
+        /* png preview settings for each channel */
+        DINLINE float_X preChannel1(const float3_X& field_B, const float3_X& field_E, const float3_X& field_J)
+        {
+            return field_E.x() * field_E.x();
+        }
 
-DINLINE float_X preChannel3(const float3_X& field_B, const float3_X& field_E, const float3_X& field_J)
-{
-    return -1.0_X * field_E.y();
-}
-}
-}
+        DINLINE float_X preChannel2(const float3_X& field_B, const float3_X& field_E, const float3_X& field_J)
+        {
+            return field_E.y();
+        }
 
+        DINLINE float_X preChannel3(const float3_X& field_B, const float3_X& field_E, const float3_X& field_J)
+        {
+            return -1.0_X * field_E.y();
+        }
+    } // namespace visPreview
+} // namespace picongpu
diff --git a/share/picongpu/examples/LaserWakefield/include/picongpu/param/precision.param b/share/picongpu/examples/LaserWakefield/include/picongpu/param/precision.param
deleted file mode 100644
index 2ab132f083..0000000000
--- a/share/picongpu/examples/LaserWakefield/include/picongpu/param/precision.param
+++ /dev/null
@@ -1,60 +0,0 @@
-/* Copyright 2013-2020 Rene Widera
- *
- * This file is part of PIConGPU.
- *
- * PIConGPU is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * PIConGPU is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with PIConGPU.
- * If not, see <http://www.gnu.org/licenses/>.
- */
-
-/** @file
- *
- * Define the precision of typically used floating point types in the
- * simulation.
- *
- * PIConGPU normalizes input automatically, allowing to use single-precision by
- * default for the core algorithms. Note that implementations of various
- * algorithms (usually plugins or non-core components) might still decide to
- * hard-code a different (mixed) precision for some critical operations.
- */
-
-#pragma once
-
-
-namespace picongpu
-{
-
-/*! Select a precision for the simulation data
- *  - precision32Bit : use 32Bit floating point numbers
- *                     [significant digits 7 to 8]
- *  - precision64Bit : use 64Bit floating point numbers
- *                     [significant digits 15 to 16]
- */
-#ifndef PARAM_PRECISION
-#   define PARAM_PRECISION precision32Bit
-#endif
-namespace precisionPIConGPU      = PARAM_PRECISION;
-
-/*! Select a precision special operations (can be different from simulation precision)
- *  - precisionPIConGPU : use precision which is selected on top (precisionPIConGPU)
- *  - precision32Bit    : use 32Bit floating point numbers
- *  - precision64Bit    : use 64Bit floating point numbers
- */
-namespace precisionSqrt          = precisionPIConGPU;
-namespace precisionExp           = precisionPIConGPU;
-namespace precisionTrigonometric = precisionPIConGPU;
-
-
-} // namespace picongpu
-
-#include "picongpu/unitless/precision.unitless"
diff --git a/share/picongpu/examples/LaserWakefield/include/picongpu/param/species.param b/share/picongpu/examples/LaserWakefield/include/picongpu/param/species.param
deleted file mode 100644
index fa31b0192a..0000000000
--- a/share/picongpu/examples/LaserWakefield/include/picongpu/param/species.param
+++ /dev/null
@@ -1,87 +0,0 @@
-/* Copyright 2014-2020 Rene Widera, Richard Pausch
- *
- * This file is part of PIConGPU.
- *
- * PIConGPU is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * PIConGPU is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with PIConGPU.
- * If not, see <http://www.gnu.org/licenses/>.
- */
-
-#pragma once
-
-#include "picongpu/particles/shapes.hpp"
-#include "picongpu/algorithms/FieldToParticleInterpolationNative.hpp"
-#include "picongpu/algorithms/FieldToParticleInterpolation.hpp"
-#include "picongpu/algorithms/AssignedTrilinearInterpolation.hpp"
-
-#include "picongpu/particles/flylite/NonLTE.def"
-#include "picongpu/fields/currentDeposition/Solver.def"
-
-
-namespace picongpu
-{
-/*---------------------------- generic solver---------------------------------*/
-
-/*! Particle Shape definitions -------------------------------------------------
- *  - particles::shapes::CIC : 1st order
- *  - particles::shapes::TSC : 2nd order
- *  - particles::shapes::PCS : 3rd order
- *  - particles::shapes::P4S : 4th order
- *
- *  example: using UsedParticleShape = particles::shapes::CIC;
- */
-#ifndef PARAM_PARTICLESHAPE
-#define PARAM_PARTICLESHAPE TSC
-#endif
-using UsedParticleShape = particles::shapes::PARAM_PARTICLESHAPE;
-
-/* define which interpolation method is used to interpolate fields to particle*/
-using UsedField2Particle = FieldToParticleInterpolation< UsedParticleShape, AssignedTrilinearInterpolation >;
-
-/*! select current solver method -----------------------------------------------
- * - currentSolver::Esirkepov<SHAPE>  : particle shapes - CIC, TSC, PCS, P4S (1st to 4th order)
- * - currentSolver::VillaBune<>       : particle shapes - CIC (1st order) only
- * - currentSolver::EmZ<SHAPE>        : particle shapes - CIC, TSC, PCS, P4S (1st to 4th order)
- *
- * For development purposes: ---------------------------------------------------
- * - currentSolver::EsirkepovNative<SHAPE> : generic version of currentSolverEsirkepov
- *   without optimization (~4x slower and needs more shared memory)
- */
-#ifndef PARAM_CURRENTSOLVER
-#define PARAM_CURRENTSOLVER Esirkepov
-#endif
-using UsedParticleCurrentSolver = currentSolver::PARAM_CURRENTSOLVER< UsedParticleShape >;
-
-/*! particle pusher configuration ----------------------------------------------
- *
- * Defining a pusher is optional for particles
- *
- * - particles::pusher::Vay : better suited relativistic boris pusher
- * - particles::pusher::Boris : standard boris pusher
- * - particles::pusher::ReducedLandauLifshitz : 4th order RungeKutta pusher
- *                                              with classical radiation reaction
- *
- * For diagnostics & modeling: ------------------------------------------------
- * - particles::pusher::Free : free propagation, ignore fields
- *                             (= free stream model)
- * - particles::pusher::Photon : propagate with c in direction of normalized mom.
- * - particles::pusher::Probe : Probe particles that interpolate E & B
- * For development purposes: --------------------------------------------------
- * - particles::pusher::Axel : a pusher developed at HZDR during 2011 (testing)
- */
-#ifndef PARAM_PARTICLEPUSHER
-#define PARAM_PARTICLEPUSHER Boris
-#endif
-using UsedParticlePusher = particles::pusher::PARAM_PARTICLEPUSHER;
-
-}//namespace picongpu
diff --git a/share/picongpu/examples/LaserWakefield/include/picongpu/param/speciesDefinition.param b/share/picongpu/examples/LaserWakefield/include/picongpu/param/speciesDefinition.param
index 5a1a26407d..3966dfb21e 100644
--- a/share/picongpu/examples/LaserWakefield/include/picongpu/param/speciesDefinition.param
+++ b/share/picongpu/examples/LaserWakefield/include/picongpu/param/speciesDefinition.param
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Rene Widera, Marco Garten, Richard Pausch,
+/* Copyright 2013-2021 Rene Widera, Marco Garten, Richard Pausch,
  *                     Benjamin Worpitz, Axel Huebl
  *
  * This file is part of PIConGPU.
@@ -32,96 +32,80 @@
 
 namespace picongpu
 {
+    /*########################### define particle attributes #####################*/
 
-/*########################### define particle attributes #####################*/
+    /** describe attributes of a particle*/
+    using DefaultParticleAttributes = MakeSeq_t<position<position_pic>, momentum, weighting>;
 
-/** describe attributes of a particle*/
-using DefaultParticleAttributes = MakeSeq_t<
-    position<position_pic>,
-    momentum,
-    weighting
->;
-
-/* attribute sequence for species: ions */
-using AttributeSeqIons = MakeSeq_t<
-    DefaultParticleAttributes
-#if( PARAM_IONIZATION == 1 )
-    , boundElectrons
+    /* attribute sequence for species: ions */
+    using AttributeSeqIons = MakeSeq_t<
+        DefaultParticleAttributes
+#if(PARAM_IONIZATION == 1)
+        ,
+        boundElectrons
 #endif
->;
-
-/*########################### end particle attributes ########################*/
-
-/*########################### define species #################################*/
-
-
-/*--------------------------- electrons --------------------------------------*/
-
-/* ratio relative to BASE_CHARGE and BASE_MASS */
-value_identifier( float_X, MassRatioElectrons, 1.0 );
-value_identifier( float_X, ChargeRatioElectrons, 1.0 );
-
-using ParticleFlagsElectrons = MakeSeq_t<
-    particlePusher< UsedParticlePusher >,
-    shape< UsedParticleShape >,
-    interpolation< UsedField2Particle >,
-    current< UsedParticleCurrentSolver >,
-    massRatio< MassRatioElectrons >,
-    chargeRatio< ChargeRatioElectrons >
->;
-
-/* define species: electrons */
-using PIC_Electrons = Particles<
-    PMACC_CSTRING( "e" ),
-    ParticleFlagsElectrons,
-    DefaultParticleAttributes
->;
-
-/*--------------------------- ions -------------------------------------------*/
-
-/* ratio relative to BASE_CHARGE and BASE_MASS */
-value_identifier( float_X, MassRatioIons, 1836.152672 );
-value_identifier( float_X, ChargeRatioIons, -1.0 );
-
-using ParticleFlagsIons = MakeSeq_t<
-    particlePusher< UsedParticlePusher >,
-    shape< UsedParticleShape >,
-    interpolation< UsedField2Particle >,
-    current< UsedParticleCurrentSolver >,
-    massRatio< MassRatioIons >,
-    chargeRatio< ChargeRatioIons >,
-#if( PARAM_IONIZATION == 1 )
-    ionizers<
-        MakeSeq_t<
-            particles::ionization::BSIEffectiveZ< PIC_Electrons >,
-            particles::ionization::ADKCircPol< PIC_Electrons >
-        >
-    >,
-    ionizationEnergies< ionization::energies::AU::Hydrogen_t >,
-    effectiveNuclearCharge< ionization::effectiveNuclearCharge::Hydrogen_t >,
+        >;
+
+    /*########################### end particle attributes ########################*/
+
+    /*########################### define species #################################*/
+
+
+    /*--------------------------- electrons --------------------------------------*/
+
+    /* ratio relative to BASE_CHARGE and BASE_MASS */
+    value_identifier(float_X, MassRatioElectrons, 1.0);
+    value_identifier(float_X, ChargeRatioElectrons, 1.0);
+
+    using ParticleFlagsElectrons = MakeSeq_t<
+        particlePusher<UsedParticlePusher>,
+        shape<UsedParticleShape>,
+        interpolation<UsedField2Particle>,
+        current<UsedParticleCurrentSolver>,
+        massRatio<MassRatioElectrons>,
+        chargeRatio<ChargeRatioElectrons>>;
+
+    /* define species: electrons */
+    using PIC_Electrons = Particles<PMACC_CSTRING("e"), ParticleFlagsElectrons, DefaultParticleAttributes>;
+
+    /*--------------------------- ions -------------------------------------------*/
+
+    /* ratio relative to BASE_CHARGE and BASE_MASS */
+    value_identifier(float_X, MassRatioIons, 1836.152672);
+    value_identifier(float_X, ChargeRatioIons, -1.0);
+
+    using ParticleFlagsIons = MakeSeq_t<
+        particlePusher<UsedParticlePusher>,
+        shape<UsedParticleShape>,
+        interpolation<UsedField2Particle>,
+        current<UsedParticleCurrentSolver>,
+        massRatio<MassRatioIons>,
+        chargeRatio<ChargeRatioIons>,
+#if(PARAM_IONIZATION == 1)
+        ionizers<MakeSeq_t<
+            particles::ionization::BSIEffectiveZ<PIC_Electrons, particles::ionization::current::None>,
+            particles::ionization::ADKCircPol<PIC_Electrons, particles::ionization::current::None>>>,
+        ionizationEnergies<ionization::energies::AU::Hydrogen_t>,
+        effectiveNuclearCharge<ionization::effectiveNuclearCharge::Hydrogen_t>,
 #endif
-    atomicNumbers< ionization::atomicNumbers::Hydrogen_t >
->;
+        atomicNumbers<ionization::atomicNumbers::Hydrogen_t>>;
 
-/* define species: ions */
-using PIC_Ions = Particles<
-    PMACC_CSTRING( "i" ),
-    ParticleFlagsIons,
-    AttributeSeqIons
->;
+    /* define species: ions */
+    using PIC_Ions = Particles<PMACC_CSTRING("i"), ParticleFlagsIons, AttributeSeqIons>;
 
 /*########################### end species ####################################*/
 
 /*enable (1) or disable (0) ions*/
 #ifndef PARAM_IONS
-#   define PARAM_IONS 0
+#    define PARAM_IONS 0
 #endif
 
-using VectorAllSpecies = MakeSeq_t<
-    PIC_Electrons
-#if( PARAM_IONS == 1)
-    ,PIC_Ions
+    using VectorAllSpecies = MakeSeq_t<
+        PIC_Electrons
+#if(PARAM_IONS == 1)
+        ,
+        PIC_Ions
 #endif
->;
+        >;
 
-} //namespace picongpu
+} // namespace picongpu
diff --git a/share/picongpu/examples/LaserWakefield/include/picongpu/param/speciesInitialization.param b/share/picongpu/examples/LaserWakefield/include/picongpu/param/speciesInitialization.param
index 59f9c9cead..705193e900 100644
--- a/share/picongpu/examples/LaserWakefield/include/picongpu/param/speciesInitialization.param
+++ b/share/picongpu/examples/LaserWakefield/include/picongpu/param/speciesInitialization.param
@@ -1,4 +1,4 @@
-/* Copyright 2015-2020 Rene Widera, Axel Huebl
+/* Copyright 2015-2021 Rene Widera, Axel Huebl
  *
  * This file is part of PIConGPU.
  *
@@ -33,39 +33,25 @@
 
 namespace picongpu
 {
-namespace particles
-{
-    /** InitPipeline define in which order species are initialized
-     *
-     * the functors are called in order (from first to last functor)
-     */
-    using InitPipeline = bmpl::vector<
-#if( PARAM_IONIZATION == 0 )
-        CreateDensity<
-            densityProfiles::Gaussian,
-            startPosition::Random2ppc,
-            PIC_Electrons
-        >
-#   if( PARAM_IONS == 1 )
-        ,
-        Derive<
-            PIC_Electrons,
-            PIC_Ions
-        >
-#   endif
+    namespace particles
+    {
+        /** InitPipeline defines in which order species are initialized
+         *
+         * the functors are called in order (from first to last functor)
+         */
+        using InitPipeline = bmpl::vector<
+#if(PARAM_IONIZATION == 0)
+            CreateDensity<densityProfiles::Gaussian, startPosition::Random2ppc, PIC_Electrons>
+#    if(PARAM_IONS == 1)
+            ,
+            Derive<PIC_Electrons, PIC_Ions>
+#    endif
 #else
 
-        CreateDensity<
-            densityProfiles::Gaussian,
-            startPosition::Random2ppc,
-            PIC_Ions
-        >,
-        Manipulate<
-            manipulators::SetBoundElectrons,
-            PIC_Ions
-        >
+            CreateDensity<densityProfiles::Gaussian, startPosition::Random2ppc, PIC_Ions>,
+            Manipulate<manipulators::SetBoundElectrons, PIC_Ions>
 #endif
-    >;
+            >;
 
-} // namespace particles
+    } // namespace particles
 } // namespace picongpu
diff --git a/share/picongpu/examples/LaserWakefield/include/picongpu/param/starter.param b/share/picongpu/examples/LaserWakefield/include/picongpu/param/starter.param
index 5e6c700755..a7ca54ee55 100644
--- a/share/picongpu/examples/LaserWakefield/include/picongpu/param/starter.param
+++ b/share/picongpu/examples/LaserWakefield/include/picongpu/param/starter.param
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Rene Widera
+/* Copyright 2013-2021 Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -18,7 +18,6 @@
  */
 
 
-
 #pragma once
 
 
@@ -26,9 +25,5 @@ namespace picongpu
 {
     namespace defaultPIConGPU
     {
-
     }
-}
-
-
-
+} // namespace picongpu
diff --git a/share/picongpu/examples/LaserWakefield/lib/python/picongpu/params.py b/share/picongpu/examples/LaserWakefield/lib/python/picongpu/params.py
index ab7360a3b6..5aa6d890da 100644
--- a/share/picongpu/examples/LaserWakefield/lib/python/picongpu/params.py
+++ b/share/picongpu/examples/LaserWakefield/lib/python/picongpu/params.py
@@ -1,7 +1,7 @@
 """
 This file is part of PIConGPU.
 
-Copyright 2017-2020 PIConGPU contributors
+Copyright 2017-2021 PIConGPU contributors
 Authors: Sebastian Starke, Jeffrey Kelling
 License: GPLv3+
 
diff --git a/share/picongpu/examples/SingleParticleTest/cmakeFlags b/share/picongpu/examples/SingleParticleTest/cmakeFlags
index c9a87d031d..935db2cb84 100755
--- a/share/picongpu/examples/SingleParticleTest/cmakeFlags
+++ b/share/picongpu/examples/SingleParticleTest/cmakeFlags
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 #
-# Copyright 2013-2020 Axel Huebl, Rene Widera, Richard Pausch
+# Copyright 2013-2021 Axel Huebl, Rene Widera, Richard Pausch
 #
 # This file is part of PIConGPU.
 #
diff --git a/share/picongpu/examples/SingleParticleTest/etc/picongpu/1.cfg b/share/picongpu/examples/SingleParticleTest/etc/picongpu/1.cfg
index d32aba5e9f..ebb7806628 100644
--- a/share/picongpu/examples/SingleParticleTest/etc/picongpu/1.cfg
+++ b/share/picongpu/examples/SingleParticleTest/etc/picongpu/1.cfg
@@ -1,4 +1,5 @@
-# Copyright 2013-2020 Heiko Burau, Rene Widera, Felix Schmitt, Axel Huebl
+# Copyright 2013-2021 Heiko Burau, Rene Widera, Felix Schmitt, Axel Huebl,
+#                     Franz Poeschel
 #
 # This file is part of PIConGPU.
 #
@@ -49,7 +50,8 @@ TBG_periodic="--periodic 1 1 1"
 # write position to stdout (messy):
 # --e_position.period 1
 
-TBG_plugins="--hdf5.period 1 --hdf5.file simData \
+TBG_openPMD="--openPMD.period 1 --openPMD.file simData --openPMD.ext bp"
+TBG_plugins="!TBG_openPMD \
              --e_macroParticlesCount.period 100"
 
 
@@ -59,7 +61,7 @@ TBG_plugins="--hdf5.period 1 --hdf5.file simData \
 
 TBG_deviceDist="!TBG_devices_x !TBG_devices_y !TBG_devices_z"
 
-TBG_programParams="-d !TBG_deviceDist \
+TBG_programParams="-d !TBG_deviceDist \
                    -g !TBG_gridSize   \
                    -s !TBG_steps      \
                    !TBG_periodic      \
diff --git a/share/picongpu/examples/SingleParticleTest/include/picongpu/param/density.param b/share/picongpu/examples/SingleParticleTest/include/picongpu/param/density.param
index 80733e2766..d957544767 100644
--- a/share/picongpu/examples/SingleParticleTest/include/picongpu/param/density.param
+++ b/share/picongpu/examples/SingleParticleTest/include/picongpu/param/density.param
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Heiko Burau, Rene Widera, Felix Schmitt,
+/* Copyright 2013-2021 Axel Huebl, Heiko Burau, Rene Widera, Felix Schmitt,
  *                     Richard Pausch
  *
  * This file is part of PIConGPU.
@@ -27,65 +27,58 @@
 
 namespace picongpu
 {
-namespace SI
-{
-    /** Base density in particles per m^3 in the density profiles.
-     *
-     * This is often taken as reference maximum density in normalized profiles.
-     * Individual particle species can define a `densityRatio` flag relative
-     * to this value.
-     *
-     * unit: ELEMENTS/m^3
-     *
-     * One particle per cell with weighting 1.0:
-     */
-    constexpr float_64 BASE_DENSITY_SI =
-        1.0 /
-        ( CELL_WIDTH_SI * CELL_HEIGHT_SI * CELL_DEPTH_SI );
-
-}
-
-namespace densityProfiles
-{
-
-    struct FreeFormulaFunctor
+    namespace SI
     {
-
-        /**
-         * This formula uses SI quantities only
-         * The profile will be multiplied by BASE_DENSITY_SI.
+        /** Base density in particles per m^3 in the density profiles.
+         *
+         * This is often taken as reference maximum density in normalized profiles.
+         * Individual particle species can define a `densityRatio` flag relative
+         * to this value.
          *
-         * @param position_SI total offset including all slides [in meter]
-         * @param cellSize_SI cell sizes [in meter]
+         * unit: ELEMENTS/m^3
          *
-         * @return float_X density [normalized to 1.0]
+         * One particle per cell with weighting 1.0:
          */
-        HDINLINE float_X operator()(
-            const floatD_64& position_SI,
-            const float3_64& cellSize_SI
-        )
+        constexpr float_64 BASE_DENSITY_SI = 1.0 / (CELL_WIDTH_SI * CELL_HEIGHT_SI * CELL_DEPTH_SI);
+
+    } // namespace SI
+
+    namespace densityProfiles
+    {
+        struct FreeFormulaFunctor
         {
-            const pmacc::math::UInt64< simDim > cell_id( position_SI / cellSize_SI.shrink< simDim >() );
+            /**
+             * This formula uses SI quantities only
+             * The profile will be multiplied by BASE_DENSITY_SI.
+             *
+             * @param position_SI total offset including all slides [in meter]
+             * @param cellSize_SI cell sizes [in meter]
+             *
+             * @return float_X density [normalized to 1.0]
+             */
+            HDINLINE float_X operator()(const floatD_64& position_SI, const float3_64& cellSize_SI)
+            {
+                const pmacc::math::UInt64<simDim> cell_id(position_SI / cellSize_SI.shrink<simDim>());
 
-            // add particle in cell in at [ 32 5 16 ]
-            // X=32: middle of X plane (gyro-motion in X-Y)
-            // Y=5:  do not start fully at border, e.g., if someone wants to increase E, and so mass over time
-            // Z=16: middle of box in Z, move slowly in positive Z as E-field drift
-            const pmacc::math::UInt64< DIM3 > cell_start( 32u, 5u, 16u );
+                // add particle in the cell at [ 32 5 16 ]
+                // X=32: middle of X plane (gyro-motion in X-Y)
+                // Y=5:  do not start fully at border, e.g., if someone wants to increase E, and so mass over time
+                // Z=16: middle of box in Z, move slowly in positive Z as E-field drift
+                const pmacc::math::UInt64<DIM3> cell_start(32u, 5u, 16u);
 
-            bool isStartCell = true;
-            for( uint64_t d = 0; d < simDim; ++d )
-                if( cell_id[d] != cell_start[d] )
-                    isStartCell = false;
+                bool isStartCell = true;
+                for(uint64_t d = 0; d < simDim; ++d)
+                    if(cell_id[d] != cell_start[d])
+                        isStartCell = false;
 
-            if( isStartCell )
-                return 1.0;
+                if(isStartCell)
+                    return 1.0;
 
-            return 0.0;
-        }
-    };
+                return 0.0;
+            }
+        };
 
-    /* definition of free formula profile */
-    using FreeFormula = FreeFormulaImpl< FreeFormulaFunctor >;
-}
-}
+        /* definition of free formula profile */
+        using FreeFormula = FreeFormulaImpl<FreeFormulaFunctor>;
+    } // namespace densityProfiles
+} // namespace picongpu
diff --git a/share/picongpu/examples/SingleParticleTest/include/picongpu/param/dimension.param b/share/picongpu/examples/SingleParticleTest/include/picongpu/param/dimension.param
index 0881e9884b..efb7c42757 100644
--- a/share/picongpu/examples/SingleParticleTest/include/picongpu/param/dimension.param
+++ b/share/picongpu/examples/SingleParticleTest/include/picongpu/param/dimension.param
@@ -1,4 +1,4 @@
-/* Copyright 2014-2020 Axel Huebl, Rene Widera, Richard Pausch
+/* Copyright 2014-2021 Axel Huebl, Rene Widera, Richard Pausch
  *
  * This file is part of PIConGPU.
  *
@@ -20,7 +20,7 @@
 #pragma once
 
 #ifndef PARAM_DIMENSION
-#define PARAM_DIMENSION DIM3
+#    define PARAM_DIMENSION DIM3
 #endif
 
 #define SIMDIM PARAM_DIMENSION
diff --git a/share/picongpu/examples/SingleParticleTest/include/picongpu/param/fieldBackground.param b/share/picongpu/examples/SingleParticleTest/include/picongpu/param/fieldBackground.param
index 984b495661..61acac9eb4 100644
--- a/share/picongpu/examples/SingleParticleTest/include/picongpu/param/fieldBackground.param
+++ b/share/picongpu/examples/SingleParticleTest/include/picongpu/param/fieldBackground.param
@@ -1,4 +1,4 @@
-/* Copyright 2014-2020 Axel Huebl, Alexander Debus
+/* Copyright 2014-2021 Axel Huebl, Alexander Debus
  *
  * This file is part of PIConGPU.
  *
@@ -31,31 +31,23 @@ namespace picongpu
         static constexpr bool InfluenceParticlePusher = true;
 
         /* We use this to calculate your SI input back to our unit system */
-        PMACC_ALIGN(
-            m_unitField,
-            const float3_64
-        );
+        PMACC_ALIGN(m_unitField, const float3_64);
 
-        HDINLINE FieldBackgroundE( const float3_64 unitField ) :
-            m_unitField( unitField )
-        {}
+        HDINLINE FieldBackgroundE(const float3_64 unitField) : m_unitField(unitField)
+        {
+        }
 
         /** Specify your background field E(r,t) here
          *
          * \param cellIdx The total cell id counted from the start at t = 0
          * \param currentStep The current time step */
-        HDINLINE float3_X
-        operator()(
+        HDINLINE float3_X operator()(
             const DataSpace<simDim>& /*cellIdx*/,
             const uint32_t /*currentStep*/
         ) const
         {
             /* specify your E-Field in V/m and convert to PIConGPU units */
-            return float3_X(
-                0.0,
-                0.0,
-                -10.0e6 / m_unitField[1]
-            );
+            return float3_X(0.0, 0.0, -10.0e6 / m_unitField[1]);
         }
     };
 
@@ -66,31 +58,23 @@ namespace picongpu
         static constexpr bool InfluenceParticlePusher = true;
 
         /* We use this to calculate your SI input back to our unit system */
-        PMACC_ALIGN(
-            m_unitField,
-            const float3_64
-        );
+        PMACC_ALIGN(m_unitField, const float3_64);
 
-        HDINLINE FieldBackgroundB( const float3_64 unitField ) :
-            m_unitField( unitField )
-        {}
+        HDINLINE FieldBackgroundB(const float3_64 unitField) : m_unitField(unitField)
+        {
+        }
 
         /** Specify your background field B(r,t) here
          *
          * \param cellIdx The total cell id counted from the start at t=0
          * \param currentStep The current time step */
-        HDINLINE float3_X
-        operator()(
+        HDINLINE float3_X operator()(
             const DataSpace<simDim>& /*cellIdx*/,
             const uint32_t /*currentStep*/
         ) const
         {
             /* specify your B-Field in T and convert to PIConGPU units */
-            return float3_X(
-                0.0,
-                0.0,
-                50.0 / m_unitField[1]
-            );
+            return float3_X(0.0, 0.0, 50.0 / m_unitField[1]);
         }
     };
 
@@ -101,31 +85,23 @@ namespace picongpu
         static constexpr bool activated = false;
 
         /* We use this to calculate your SI input back to our unit system */
-        PMACC_ALIGN(
-            m_unitField,
-            const float3_64
-        );
+        PMACC_ALIGN(m_unitField, const float3_64);
 
-        HDINLINE FieldBackgroundJ( const float3_64 unitField ) :
-            m_unitField(unitField)
-        {}
+        HDINLINE FieldBackgroundJ(const float3_64 unitField) : m_unitField(unitField)
+        {
+        }
 
         /** Specify your background field J(r,t) here
          *
          * \param cellIdx The total cell id counted from the start at t=0
          * \param currentStep The current time step */
-        HDINLINE float3_X
-        operator()(
+        HDINLINE float3_X operator()(
             const DataSpace<simDim>& /*cellIdx*/,
             const uint32_t /*currentStep*/
         ) const
         {
             /* specify your J-Field in A/m^2 and convert to PIConGPU units */
-            return float3_X(
-                0.0,
-                0.0,
-                0.0
-            );
+            return float3_X(0.0, 0.0, 0.0);
         }
     };
 
diff --git a/share/picongpu/examples/SingleParticleTest/include/picongpu/param/fileOutput.param b/share/picongpu/examples/SingleParticleTest/include/picongpu/param/fileOutput.param
index 1763bc18e6..64da1739de 100644
--- a/share/picongpu/examples/SingleParticleTest/include/picongpu/param/fileOutput.param
+++ b/share/picongpu/examples/SingleParticleTest/include/picongpu/param/fileOutput.param
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Rene Widera, Felix Schmitt,
+/* Copyright 2013-2021 Axel Huebl, Rene Widera, Felix Schmitt,
  *                     Benjamin Worpitz, Richard Pausch
  *
  * This file is part of PIConGPU.
@@ -63,32 +63,23 @@ namespace picongpu
     namespace deriveField = particles::particleToGrid;
 
     /* ChargeDensity section */
-    using ChargeDensity_Seq = deriveField::CreateEligible_t<
-        VectorAllSpecies,
-        deriveField::derivedAttributes::ChargeDensity
-    >;
+    using ChargeDensity_Seq
+        = deriveField::CreateEligible_t<VectorAllSpecies, deriveField::derivedAttributes::ChargeDensity>;
 
     /** FieldTmpSolvers groups all solvers that create data for FieldTmp ******
      *
      * FieldTmpSolvers is used in @see FieldTmp to calculate the exchange size
      */
-    using FieldTmpSolvers = MakeSeq_t<
-        ChargeDensity_Seq
-    >;
+    using FieldTmpSolvers = MakeSeq_t<ChargeDensity_Seq>;
 
 
     /** FileOutputFields: Groups all Fields that shall be dumped *************/
 
     /** Possible native fields: FieldE, FieldB, FieldJ
      */
-    using NativeFileOutputFields = MakeSeq_t<
-        FieldJ
-    >;
+    using NativeFileOutputFields = MakeSeq_t<FieldJ>;
 
-    using FileOutputFields = MakeSeq_t<
-        NativeFileOutputFields,
-        FieldTmpSolvers
-    >;
+    using FileOutputFields = MakeSeq_t<NativeFileOutputFields, FieldTmpSolvers>;
 
 
     /** FileOutputParticles: Groups all Species that shall be dumped **********
@@ -98,4 +89,4 @@ namespace picongpu
      */
     using FileOutputParticles = VectorAllSpecies;
 
-}
+} // namespace picongpu
diff --git a/share/picongpu/examples/SingleParticleTest/include/picongpu/param/grid.param b/share/picongpu/examples/SingleParticleTest/include/picongpu/param/grid.param
index 8b66ec083d..162db76a96 100644
--- a/share/picongpu/examples/SingleParticleTest/include/picongpu/param/grid.param
+++ b/share/picongpu/examples/SingleParticleTest/include/picongpu/param/grid.param
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Rene Widera, Benjamin Worpitz
+/* Copyright 2013-2021 Axel Huebl, Rene Widera, Benjamin Worpitz
  *
  * This file is part of PIConGPU.
  *
@@ -18,12 +18,10 @@
  */
 
 
-
 #pragma once
 
 namespace picongpu
 {
-
     namespace SI
     {
         /** Period of a gyro-motion in s for an electron with beta=0.5 in B=50T
@@ -57,21 +55,21 @@ namespace picongpu
          * in fields with perfect symmetry in Z.
          */
 
-    } //namespace SI
+    } // namespace SI
 
     //! Defines the size of the absorbing zone (in cells)
     constexpr uint32_t ABSORBER_CELLS[3][2] = {
-        {32, 32},  /*x direction [negative,positive]*/
-        {32, 32},  /*y direction [negative,positive]*/
-        {32, 32}   /*z direction [negative,positive]*/
-    }; //unit: number of cells
+        {32, 32}, /*x direction [negative,positive]*/
+        {32, 32}, /*y direction [negative,positive]*/
+        {32, 32} /*z direction [negative,positive]*/
+    }; // unit: number of cells
 
     //! Define the strength of the absorber for any direction
     constexpr float_X ABSORBER_STRENGTH[3][2] = {
         {1.0e-3, 1.0e-3}, /*x direction [negative,positive]*/
         {1.0e-3, 1.0e-3}, /*y direction [negative,positive]*/
-        {1.0e-3, 1.0e-3}  /*z direction [negative,positive]*/
-    }; //unit: none
+        {1.0e-3, 1.0e-3} /*z direction [negative,positive]*/
+    }; // unit: none
 
     /** When to move the co-moving window.
      *  An initial pseudo particle, flying with the speed of light,
@@ -90,7 +88,4 @@ namespace picongpu
      */
     constexpr float_64 movePoint = 0.90;
 
-}
-
-
-
+} // namespace picongpu
diff --git a/share/picongpu/examples/SingleParticleTest/include/picongpu/param/particle.param b/share/picongpu/examples/SingleParticleTest/include/picongpu/param/particle.param
index 7cc28fbf6d..bfa944c9f3 100644
--- a/share/picongpu/examples/SingleParticleTest/include/picongpu/param/particle.param
+++ b/share/picongpu/examples/SingleParticleTest/include/picongpu/param/particle.param
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Rene Widera, Benjamin Worpitz,
+/* Copyright 2013-2021 Axel Huebl, Rene Widera, Benjamin Worpitz,
  *                     Richard Pausch
  *
  * This file is part of PIConGPU.
@@ -31,73 +31,65 @@
 
 namespace picongpu
 {
-
-namespace particles
-{
-
-    /** a particle with a weighting below MIN_WEIGHTING will not
-     *      be created / will be deleted
-     *  note: this specific setting allows all kinds of weightings > 0.0
-     *  unit: none
-     */
-    constexpr float_X MIN_WEIGHTING = std::numeric_limits< float_X >::min();
-
-namespace manipulators
-{
-
-    // Parameters for a particle drift in X
-    CONST_VECTOR(
-        float_X,
-        3,
-        DriftParam_direction,
-        // unit vector for direction of drift: x, y, z
-        1.0,
-        0.0,
-        0.0
-    );
-    struct DriftParam
+    namespace particles
     {
-        static constexpr float_64 gamma = 1.1547; // beta: 0.5
-        const DriftParam_direction_t direction;
-    };
-    using AssignYDrift = unary::Drift<
-        DriftParam,
-        nvidia::functors::Assign
-    >;
-
-} // namespace manipulators
-
-
-namespace startPosition
-{
-    // sit directly in lower corner of the cell
-    CONST_VECTOR(
-        float_X,
-        3,
-        InCellOffset,
-        // each x, y, z in-cell position component in range [0.0, 1.0)
-        0.0,
-        0.0,
-        0.0
-    );
-    struct OnePositionParameter
-    {
-        /** Count of particles per cell at initial state
+        /** a particle with a weighting below MIN_WEIGHTING will not
+         *      be created / will be deleted
+         *  note: this specific setting allows all kinds of weightings > 0.0
          *  unit: none
          */
-        static constexpr uint32_t numParticlesPerCell = 1u;
-
-        const InCellOffset_t inCellOffset;
-    };
-    using OnePosition = OnePositionImpl< OnePositionParameter >;
-
-} // namespace startPosition
-
-    /** During unit normalization, we assume this is a typical
-     *  number of particles per cell for normalization of weighted
-     *  particle attributes.
-     */
-    constexpr uint32_t TYPICAL_PARTICLES_PER_CELL = 1u;
+        constexpr float_X MIN_WEIGHTING = std::numeric_limits<float_X>::min();
+
+        namespace manipulators
+        {
+            // Parameters for a particle drift in X
+            CONST_VECTOR(
+                float_X,
+                3,
+                DriftParam_direction,
+                // unit vector for direction of drift: x, y, z
+                1.0,
+                0.0,
+                0.0);
+            struct DriftParam
+            {
+                static constexpr float_64 gamma = 1.1547; // beta: 0.5
+                const DriftParam_direction_t direction;
+            };
+            using AssignYDrift = unary::Drift<DriftParam, nvidia::functors::Assign>;
+
+        } // namespace manipulators
+
+
+        namespace startPosition
+        {
+            // sit directly in lower corner of the cell
+            CONST_VECTOR(
+                float_X,
+                3,
+                InCellOffset,
+                // each x, y, z in-cell position component in range [0.0, 1.0)
+                0.0,
+                0.0,
+                0.0);
+            struct OnePositionParameter
+            {
+                /** Count of particles per cell at initial state
+                 *  unit: none
+                 */
+                static constexpr uint32_t numParticlesPerCell = 1u;
+
+                const InCellOffset_t inCellOffset;
+            };
+            using OnePosition = OnePositionImpl<OnePositionParameter>;
+
+        } // namespace startPosition
+
+        /** During unit normalization, we assume this is a typical
+         *  number of particles per cell for normalization of weighted
+         *  particle attributes.
+         */
+        constexpr uint32_t TYPICAL_PARTICLES_PER_CELL = 1u;
 
-} // namespace particles
+    } // namespace particles
 } // namespace picongpu
diff --git a/share/picongpu/examples/SingleParticleTest/include/picongpu/param/species.param b/share/picongpu/examples/SingleParticleTest/include/picongpu/param/species.param
index 48de646b1a..81ef7e445a 100644
--- a/share/picongpu/examples/SingleParticleTest/include/picongpu/param/species.param
+++ b/share/picongpu/examples/SingleParticleTest/include/picongpu/param/species.param
@@ -1,4 +1,4 @@
-/* Copyright 2014-2020 Rene Widera, Richard Pausch
+/* Copyright 2014-2021 Rene Widera, Richard Pausch, Annegret Roeszler, Klaus Steiniger
  *
  * This file is part of PIConGPU.
  *
@@ -17,55 +17,83 @@
  * If not, see <http://www.gnu.org/licenses/>.
  */
 
+/** @file
+ *
+ * Particle shape, field to particle interpolation, current solver, and particle pusher
+ * can be declared here for usage in `speciesDefinition.param`.
+ *
+ * @see
+ *   **MODELS / Hierarchy of Charge Assignment Schemes**
+ *   in the online documentation for information on particle shapes.
+ *
+ *
+ * \attention
+ * The higher-order shape names were redefined in release 0.6.0 in order to provide consistent naming:
+ *     * PQS is the name of the 3rd order assignment function (instead of PCS)
+ *     * PCS is the name of the 4th order assignment function (instead of P4S)
+ *     * P4S does not exist anymore
+ */
+
 #pragma once
 
 #include "picongpu/particles/shapes.hpp"
 #include "picongpu/algorithms/FieldToParticleInterpolationNative.hpp"
 #include "picongpu/algorithms/FieldToParticleInterpolation.hpp"
 #include "picongpu/algorithms/AssignedTrilinearInterpolation.hpp"
-
 #include "picongpu/particles/flylite/NonLTE.def"
 #include "picongpu/fields/currentDeposition/Solver.def"
 
 
 namespace picongpu
 {
-/*---------------------------- generic solver---------------------------------*/
-
-/*! Particle Shape definitions -------------------------------------------------
- *  - particles::shapes::CIC : 1st order
- *  - particles::shapes::TSC : 2nd order
- *  - particles::shapes::PCS : 3rd order
- *  - particles::shapes::P4S : 4th order
- *
- *  example: using UsedParticleShape = particles::shapes::CIC;
- */
-using UsedParticleShape = particles::shapes::CIC;
+    /** select macroparticle shape
+     *
+     * **WARNING** the shape names are redefined and diverge from PIConGPU versions before 0.6.0.
+     *
+     *  - particles::shapes::CIC : Assignment function is a piecewise linear spline
+     *  - particles::shapes::TSC : Assignment function is a piecewise quadratic spline
+     *  - particles::shapes::PQS : Assignment function is a piecewise cubic spline
+     *  - particles::shapes::PCS : Assignment function is a piecewise quartic spline
+     */
+    using UsedParticleShape = particles::shapes::CIC;
 
-/* define which interpolation method is used to interpolate fields to particle*/
-using UsedField2Particle = FieldToParticleInterpolation<UsedParticleShape, AssignedTrilinearInterpolation>;
+    /** select interpolation method to be used for interpolation of grid-based field values to particle positions
+     */
+    using UsedField2Particle = FieldToParticleInterpolation<UsedParticleShape, AssignedTrilinearInterpolation>;
 
-/*! select current solver method -----------------------------------------------
- * - currentSolver::Esirkepov<SHAPE>  : particle shapes - CIC, TSC, PCS, P4S (1st to 4th order)
- * - currentSolver::VillaBune<>       : particle shapes - CIC (1st order) only
- * - currentSolver::EmZ<SHAPE>        : particle shapes - CIC, TSC, PCS, P4S (1st to 4th order)
- *
- * For development purposes: ---------------------------------------------------
- * - currentSolver::EsirkepovNative<SHAPE> : generic version of currentSolverEsirkepov
- *   without optimization (~4x slower and needs more shared memory)
- */
-using UsedParticleCurrentSolver = currentSolver::Esirkepov<UsedParticleShape>;
+    /*! select current solver method
+     * - currentSolver::Esirkepov< SHAPE, STRATEGY > : particle shapes - CIC, TSC, PQS, PCS (1st to 4th order)
+     * - currentSolver::VillaBune< SHAPE, STRATEGY > : particle shapes - CIC (1st order) only
+     * - currentSolver::EmZ< SHAPE, STRATEGY >       : particle shapes - CIC, TSC, PQS, PCS (1st to 4th order)
+     *
+     * For development purposes:
+     * - currentSolver::EsirkepovNative< SHAPE, STRATEGY > : generic version of currentSolver::Esirkepov
+     *   without optimization (~4x slower and needs more shared memory)
+     *
+     * STRATEGY (optional):
+     * - currentSolver::strategy::StridedCachedSupercells
+     * - currentSolver::strategy::StridedCachedSupercellsScaled<N> with N >= 1
+     * - currentSolver::strategy::CachedSupercells
+     * - currentSolver::strategy::CachedSupercellsScaled<N> with N >= 1
+     * - currentSolver::strategy::NonCachedSupercells
+     * - currentSolver::strategy::NonCachedSupercellsScaled<N> with N >= 1
+     */
+    using UsedParticleCurrentSolver = currentSolver::Esirkepov<UsedParticleShape>;
 
-/*! particle pusher configuration ----------------------------------------------
+/** particle pusher configuration
  *
  * Defining a pusher is optional for particles
  *
- * - particles::pusher::Vay : better suited relativistic boris pusher
- * - particles::pusher::Boris : standard boris pusher
+ * - particles::pusher::HigueraCary : Higuera & Cary's relativistic pusher preserving both volume and ExB velocity
+ * - particles::pusher::Vay : Vay's relativistic pusher preserving ExB velocity
+ * - particles::pusher::Boris : Boris' relativistic pusher preserving volume
  * - particles::pusher::ReducedLandauLifshitz : 4th order RungeKutta pusher
  *                                              with classical radiation reaction
+ * - particles::pusher::Composite : composite of two given pushers,
+ *                                  switches between using one (or none) of those
  *
  * For diagnostics & modeling: ------------------------------------------------
+ * - particles::pusher::Acceleration : Accelerate particles by applying a constant electric field
  * - particles::pusher::Free : free propagation, ignore fields
  *                             (= free stream model)
  * - particles::pusher::Photon : propagate with c in direction of normalized mom.
@@ -74,8 +102,8 @@ using UsedParticleCurrentSolver = currentSolver::Esirkepov<UsedParticleShape>;
  * - particles::pusher::Axel : a pusher developed at HZDR during 2011 (testing)
  */
 #ifndef PARAM_PARTICLEPUSHER
-#define PARAM_PARTICLEPUSHER Boris
+#    define PARAM_PARTICLEPUSHER Boris
 #endif
-using UsedParticlePusher = particles::pusher::PARAM_PARTICLEPUSHER;
+    using UsedParticlePusher = particles::pusher::PARAM_PARTICLEPUSHER;
 
 } // namespace picongpu
diff --git a/share/picongpu/examples/SingleParticleTest/include/picongpu/param/speciesDefinition.param b/share/picongpu/examples/SingleParticleTest/include/picongpu/param/speciesDefinition.param
index 5eb194b45b..1dcb7a048c 100644
--- a/share/picongpu/examples/SingleParticleTest/include/picongpu/param/speciesDefinition.param
+++ b/share/picongpu/examples/SingleParticleTest/include/picongpu/param/speciesDefinition.param
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Rene Widera, Benjamin Worpitz
+/* Copyright 2013-2021 Rene Widera, Benjamin Worpitz
  *
  * This file is part of PIConGPU.
  *
@@ -18,7 +18,6 @@
  */
 
 
-
 #pragma once
 
 #include "picongpu/simulation_defines.hpp"
@@ -33,15 +32,10 @@
 
 namespace picongpu
 {
+    /*########################### define particle attributes #####################*/
 
-/*########################### define particle attributes #####################*/
-
-/** describe attributes of a particle*/
-using DefaultParticleAttributes = MakeSeq_t<
-    position<position_pic>,
-    momentum,
-    weighting
->;
+    /** describe attributes of a particle */
+    using DefaultParticleAttributes = MakeSeq_t<position<position_pic>, momentum, weighting>;
 
 /*########################### end particle attributes ########################*/
 
@@ -49,39 +43,32 @@ using DefaultParticleAttributes = MakeSeq_t<
 
 /* enable pusher by default if `PARAM_ENABLEPUSHER` is not defined in `cmakeFlags` */
 #ifndef PARAM_ENABLEPUSHER
-#   define PARAM_ENABLEPUSHER 1
+#    define PARAM_ENABLEPUSHER 1
 #endif
 
-/*--------------------------- electrons --------------------------------------*/
+    /*--------------------------- electrons --------------------------------------*/
 
-/* ratio relative to BASE_CHARGE and BASE_MASS */
-value_identifier(float_X, MassRatioElectrons, 1.0);
-value_identifier(float_X, ChargeRatioElectrons, 1.0);
+    /* ratio relative to BASE_CHARGE and BASE_MASS */
+    value_identifier(float_X, MassRatioElectrons, 1.0);
+    value_identifier(float_X, ChargeRatioElectrons, 1.0);
 
-using ParticleFlagsElectrons = MakeSeq_t<
+    using ParticleFlagsElectrons = MakeSeq_t<
 /* enable the pusher only if PARAM_ENABLEPUSHER is defined as one `1` */
-#if( PARAM_ENABLEPUSHER == 1 )
-    particlePusher<UsedParticlePusher>,
+#if(PARAM_ENABLEPUSHER == 1)
+        particlePusher<UsedParticlePusher>,
 #endif
-    shape<UsedParticleShape>,
-    interpolation<UsedField2Particle>,
-    current<UsedParticleCurrentSolver>,
-    massRatio<MassRatioElectrons>,
-    chargeRatio<ChargeRatioElectrons>
->;
+        shape<UsedParticleShape>,
+        interpolation<UsedField2Particle>,
+        current<UsedParticleCurrentSolver>,
+        massRatio<MassRatioElectrons>,
+        chargeRatio<ChargeRatioElectrons>>;
 
-/* define species electrons */
-using PIC_Electrons = Particles<
-    PMACC_CSTRING( "e" ),
-    ParticleFlagsElectrons,
-    DefaultParticleAttributes
->;
+    /* define species electrons */
+    using PIC_Electrons = Particles<PMACC_CSTRING("e"), ParticleFlagsElectrons, DefaultParticleAttributes>;
 
-/*########################### end species ####################################*/
+    /*########################### end species ####################################*/
 
-using VectorAllSpecies = MakeSeq_t<
-    PIC_Electrons
->;
+    using VectorAllSpecies = MakeSeq_t<PIC_Electrons>;
 
 
-} //namespace picongpu
+} // namespace picongpu
diff --git a/share/picongpu/examples/SingleParticleTest/include/picongpu/param/speciesInitialization.param b/share/picongpu/examples/SingleParticleTest/include/picongpu/param/speciesInitialization.param
index b198424a0b..2e56c9aa9f 100644
--- a/share/picongpu/examples/SingleParticleTest/include/picongpu/param/speciesInitialization.param
+++ b/share/picongpu/examples/SingleParticleTest/include/picongpu/param/speciesInitialization.param
@@ -1,4 +1,4 @@
-/* Copyright 2015-2020 Rene Widera, Axel Huebl
+/* Copyright 2015-2021 Rene Widera, Axel Huebl
  *
  * This file is part of PIConGPU.
  *
@@ -33,23 +33,15 @@
 
 namespace picongpu
 {
-namespace particles
-{
-    /** InitPipeline define in which order species are initialized
-     *
-     * the functors are called in order (from first to last functor)
-     */
-    using InitPipeline = bmpl::vector<
-        CreateDensity<
-            densityProfiles::FreeFormula,
-            startPosition::OnePosition,
-            PIC_Electrons
-        >,
-        Manipulate<
-            manipulators::AssignYDrift,
-            PIC_Electrons
-        >
-    >;
+    namespace particles
+    {
+        /** InitPipeline defines the order in which species are initialized
+         *
+         * the functors are called in order (from first to last functor)
+         */
+        using InitPipeline = bmpl::vector<
+            CreateDensity<densityProfiles::FreeFormula, startPosition::OnePosition, PIC_Electrons>,
+            Manipulate<manipulators::AssignYDrift, PIC_Electrons>>;
 
-} // namespace particles
+    } // namespace particles
 } // namespace picongpu
diff --git a/share/picongpu/examples/ThermalTest/etc/picongpu/1.cfg b/share/picongpu/examples/ThermalTest/etc/picongpu/1.cfg
index 3ba5f63a26..430c41402a 100644
--- a/share/picongpu/examples/ThermalTest/etc/picongpu/1.cfg
+++ b/share/picongpu/examples/ThermalTest/etc/picongpu/1.cfg
@@ -1,4 +1,4 @@
-# Copyright 2013-2020 Axel Huebl, Heiko Burau, Rene Widera, Felix Schmitt
+# Copyright 2013-2021 Axel Huebl, Heiko Burau, Rene Widera, Felix Schmitt
 #
 # This file is part of PIConGPU.
 #
diff --git a/share/picongpu/examples/ThermalTest/etc/picongpu/32.cfg b/share/picongpu/examples/ThermalTest/etc/picongpu/32.cfg
index 3f2122b339..1bb2b889cc 100644
--- a/share/picongpu/examples/ThermalTest/etc/picongpu/32.cfg
+++ b/share/picongpu/examples/ThermalTest/etc/picongpu/32.cfg
@@ -1,4 +1,4 @@
-# Copyright 2013-2020 Heiko Burau, Felix Schmitt, Axel Huebl
+# Copyright 2013-2021 Heiko Burau, Felix Schmitt, Axel Huebl
 #
 # This file is part of PIConGPU.
 #
diff --git a/share/picongpu/examples/ThermalTest/etc/picongpu/4.cfg b/share/picongpu/examples/ThermalTest/etc/picongpu/4.cfg
index 3d7fe0524c..a03b65ccec 100644
--- a/share/picongpu/examples/ThermalTest/etc/picongpu/4.cfg
+++ b/share/picongpu/examples/ThermalTest/etc/picongpu/4.cfg
@@ -1,4 +1,4 @@
-# Copyright 2013-2020 Axel Huebl, Heiko Burau, Rene Widera, Felix Schmitt
+# Copyright 2013-2021 Axel Huebl, Heiko Burau, Rene Widera, Felix Schmitt
 #
 # This file is part of PIConGPU.
 #
diff --git a/share/picongpu/examples/ThermalTest/etc/picongpu/64.cfg b/share/picongpu/examples/ThermalTest/etc/picongpu/64.cfg
index dc6cc96410..fc3220f2cc 100644
--- a/share/picongpu/examples/ThermalTest/etc/picongpu/64.cfg
+++ b/share/picongpu/examples/ThermalTest/etc/picongpu/64.cfg
@@ -1,4 +1,4 @@
-# Copyright 2013-2020 Axel Huebl, Heiko Burau, Rene Widera, Felix Schmitt
+# Copyright 2013-2021 Axel Huebl, Heiko Burau, Rene Widera, Felix Schmitt
 #
 # This file is part of PIConGPU.
 #
diff --git a/share/picongpu/examples/ThermalTest/etc/picongpu/8.cfg b/share/picongpu/examples/ThermalTest/etc/picongpu/8.cfg
index fcf46b2f5f..d567d35426 100644
--- a/share/picongpu/examples/ThermalTest/etc/picongpu/8.cfg
+++ b/share/picongpu/examples/ThermalTest/etc/picongpu/8.cfg
@@ -1,4 +1,4 @@
-# Copyright 2013-2020 Axel Huebl, Heiko Burau, Rene Widera, Felix Schmitt
+# Copyright 2013-2021 Axel Huebl, Heiko Burau, Rene Widera, Felix Schmitt
 #
 # This file is part of PIConGPU.
 #
diff --git a/share/picongpu/examples/ThermalTest/executeOnClone b/share/picongpu/examples/ThermalTest/executeOnClone
index d379900356..52c66ce269 100755
--- a/share/picongpu/examples/ThermalTest/executeOnClone
+++ b/share/picongpu/examples/ThermalTest/executeOnClone
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 #
-# Copyright 2013-2020 Axel Huebl, Rene Widera, Heiko Burau
+# Copyright 2013-2021 Axel Huebl, Rene Widera, Heiko Burau
 #
 # This file is part of PIConGPU. 
 # 
diff --git a/share/picongpu/examples/ThermalTest/include/picongpu/ThermalTestSimulation.hpp b/share/picongpu/examples/ThermalTest/include/picongpu/ThermalTestSimulation.hpp
index 5e26ebbec8..93027cc778 100644
--- a/share/picongpu/examples/ThermalTest/include/picongpu/ThermalTestSimulation.hpp
+++ b/share/picongpu/examples/ThermalTest/include/picongpu/ThermalTestSimulation.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Axel Huebl
+/* Copyright 2013-2021 Heiko Burau, Axel Huebl
  *
  * This file is part of PIConGPU.
  *
@@ -22,7 +22,7 @@
 #include "picongpu/simulation_defines.hpp"
 #include <pmacc/Environment.hpp>
 
-#include "picongpu/simulation/control/MySimulation.hpp"
+#include "picongpu/simulation/control/Simulation.hpp"
 
 #include <pmacc/simulationControl/SimulationHelper.hpp>
 
@@ -34,9 +34,6 @@
 #include <pmacc/nvidia/memory/MemoryInfo.hpp>
 #include <pmacc/mappings/kernel/MappingDescription.hpp>
 #include "picongpu/ArgsParser.hpp"
-
-#include <cassert>
-
 #include "picongpu/plugins/PluginController.hpp"
 
 #include <pmacc/cuSTL/container/DeviceBuffer.hpp>
@@ -51,166 +48,155 @@
 
 #include <pmacc/cuSTL/container/allocator/DeviceMemEvenPitchAllocator.hpp>
 #include <pmacc/cuSTL/algorithm/host/Foreach.hpp>
-#include <pmacc/math/vector/math_functor/min.hpp>
-#include <pmacc/math/vector/math_functor/max.hpp>
-#include <pmacc/math/vector/math_functor/sqrtf.hpp>
-#include <pmacc/math/vector/math_functor/cosf.hpp>
 #include <pmacc/nvidia/functors/Add.hpp>
 #include <pmacc/cuSTL/algorithm/functor/GetComponent.hpp>
 #include <pmacc/cuSTL/algorithm/functor/Add.hpp>
 
-namespace picongpu
-{
-
-using namespace pmacc;
+#include <cassert>
+#include <memory>
 
-class ThermalTestSimulation : public MySimulation
+namespace picongpu
 {
-public:
+    using namespace pmacc;
 
-    ThermalTestSimulation()
-    : MySimulation()
+    class ThermalTestSimulation : public Simulation
     {
-    }
-
-    void init()
-    {
-        MySimulation::init();
-
-        using namespace ::pmacc::math;
-
-        DataConnector &dc = Environment<>::get().DataConnector();
-        auto fieldE = dc.get< FieldE >( FieldE::getName(), true );
+    public:
+        ThermalTestSimulation() : Simulation()
+        {
+        }
 
-        auto fieldE_coreBorder =
-             fieldE->getGridBuffer().getDeviceBuffer().cartBuffer().view(
-                   precisionCast<int>(GuardDim().toRT()), -precisionCast<int>(GuardDim().toRT()));
+        void init()
+        {
+            Simulation::init();
 
-        this->eField_zt[0] = new container::HostBuffer<float, 2 > (Size_t < 2 > (fieldE_coreBorder.size().z(), this->collectTimesteps));
-        this->eField_zt[1] = new container::HostBuffer<float, 2 >(this->eField_zt[0]->size());
+            using namespace ::pmacc::math;
 
-        dc.releaseData( FieldE::getName() );
-    }
+            DataConnector& dc = Environment<>::get().DataConnector();
+            auto fieldE = dc.get<FieldE>(FieldE::getName(), true);
 
-    void pluginRegisterHelp(po::options_description& desc)
-    {
-        MySimulation::pluginRegisterHelp(desc);
-    }
+            auto fieldE_coreBorder = fieldE->getGridBuffer().getDeviceBuffer().cartBuffer().view(
+                precisionCast<int>(GuardDim().toRT()),
+                -precisionCast<int>(GuardDim().toRT()));
 
-    void pluginLoad()
-    {
-        MySimulation::pluginLoad();
-    }
+            this->eField_zt[0] = std::make_unique<container::HostBuffer<float, 2>>(
+                Size_t<2>(fieldE_coreBorder.size().z(), this->collectTimesteps));
+            this->eField_zt[1] = std::make_unique<container::HostBuffer<float, 2>>(this->eField_zt[0]->size());
 
-    virtual ~ThermalTestSimulation()
-    {
-        __delete(eField_zt[0]);
-        __delete(eField_zt[1]);
-    }
+            dc.releaseData(FieldE::getName());
+        }
 
-    void writeOutput()
-    {
-        using namespace ::pmacc::math;
+        void pluginRegisterHelp(po::options_description& desc)
+        {
+            Simulation::pluginRegisterHelp(desc);
+        }
 
-        auto& con = Environment<simDim>::get().GridController();
-        Size_t<SIMDIM> gpuDim = (Size_t<SIMDIM>)con.getGpuNodes();
-        Int<3> gpuPos = (Int<3>)con.getPosition();
-        zone::SphericZone<SIMDIM> gpuGatheringZone(Size_t<SIMDIM > (1, 1, gpuDim.z()));
-        algorithm::mpi::Gather<SIMDIM> gather(gpuGatheringZone);
+        void pluginLoad()
+        {
+            Simulation::pluginLoad();
+        }
 
-        container::HostBuffer<float, 2 > eField_zt_reduced(eField_zt[0]->size());
+        virtual ~ThermalTestSimulation() = default;
 
-        for (int i = 0; i < 2; i++)
+        void writeOutput()
         {
-            bool reduceRoot = (gpuPos.x() == 0) && (gpuPos.y() == 0);
-            for(int gpuPos_z = 0; gpuPos_z < (int)gpuDim.z(); gpuPos_z++)
-            {
-                zone::SphericZone<3> gpuReducingZone(
-                    Size_t<3>(gpuDim.x(), gpuDim.y(), 1),
-                    Int<3>(0, 0, gpuPos_z));
+            using namespace ::pmacc::math;
 
-                algorithm::mpi::Reduce<3> reduce(gpuReducingZone, reduceRoot);
+            auto& con = Environment<simDim>::get().GridController();
+            Size_t<SIMDIM> gpuDim = (Size_t<SIMDIM>) con.getGpuNodes();
+            Int<3> gpuPos = (Int<3>) con.getPosition();
+            zone::SphericZone<SIMDIM> gpuGatheringZone(Size_t<SIMDIM>(1, 1, gpuDim.z()));
+            algorithm::mpi::Gather<SIMDIM> gather(gpuGatheringZone);
 
-                reduce(eField_zt_reduced, *(eField_zt[i]), pmacc::algorithm::functor::Add());
-            }
-            if(!reduceRoot) continue;
+            container::HostBuffer<float, 2> eField_zt_reduced(eField_zt[0]->size());
 
-            container::HostBuffer<float, 2 > global_eField_zt(
-                gpuDim.z() * eField_zt_reduced.size().x(), eField_zt_reduced.size().y());
-
-            gather(global_eField_zt, eField_zt_reduced, 1);
-            if (gather.root())
+            for(int i = 0; i < 2; i++)
             {
-                std::string filename;
-                if (i == 0)
-                    filename = "eField_zt_trans.dat";
-                else
-                    filename = "eField_zt_long.dat";
-                std::ofstream eField_zt_dat(filename.data());
-                eField_zt_dat << global_eField_zt;
-                eField_zt_dat.close();
+                bool reduceRoot = (gpuPos.x() == 0) && (gpuPos.y() == 0);
+                for(int gpuPos_z = 0; gpuPos_z < (int) gpuDim.z(); gpuPos_z++)
+                {
+                    zone::SphericZone<3> gpuReducingZone(Size_t<3>(gpuDim.x(), gpuDim.y(), 1), Int<3>(0, 0, gpuPos_z));
+
+                    algorithm::mpi::Reduce<3> reduce(gpuReducingZone, reduceRoot);
+
+                    reduce(eField_zt_reduced, *(eField_zt[i]), pmacc::algorithm::functor::Add());
+                }
+                if(!reduceRoot)
+                    continue;
+
+                container::HostBuffer<float, 2> global_eField_zt(
+                    gpuDim.z() * eField_zt_reduced.size().x(),
+                    eField_zt_reduced.size().y());
+
+                gather(global_eField_zt, eField_zt_reduced, 1);
+                if(gather.root())
+                {
+                    std::string filename;
+                    if(i == 0)
+                        filename = "eField_zt_trans.dat";
+                    else
+                        filename = "eField_zt_long.dat";
+                    std::ofstream eField_zt_dat(filename.data());
+                    eField_zt_dat << global_eField_zt;
+                    eField_zt_dat.close();
+                }
             }
         }
 
-    }
-
-    /**
-     * Run one simulation step.
-     *
-     * @param currentStep iteration number of the current step
-     */
-    void runOneStep(uint32_t currentStep)
-    {
-        MySimulation::runOneStep(currentStep);
+        /**
+         * Run one simulation step.
+         *
+         * @param currentStep iteration number of the current step
+         */
+        void runOneStep(uint32_t currentStep)
+        {
+            Simulation::runOneStep(currentStep);
 
-        if (currentStep > this->collectTimesteps + firstTimestep)
-            return;
-        if (currentStep < firstTimestep)
-            return;
+            if(currentStep > this->collectTimesteps + firstTimestep)
+                return;
+            if(currentStep < firstTimestep)
+                return;
 
-        using namespace math;
+            using namespace math;
 
-        DataConnector &dc = Environment<>::get().DataConnector();
-        auto fieldE = dc.get< FieldE >( FieldE::getName(), true );
+            DataConnector& dc = Environment<>::get().DataConnector();
+            auto fieldE = dc.get<FieldE>(FieldE::getName(), true);
 
-        auto fieldE_coreBorder =
-           fieldE->getGridBuffer().getDeviceBuffer().cartBuffer().view(
-                precisionCast<int>(GuardDim().toRT()), -precisionCast<int>(GuardDim().toRT()));
+            auto fieldE_coreBorder = fieldE->getGridBuffer().getDeviceBuffer().cartBuffer().view(
+                precisionCast<int>(GuardDim().toRT()),
+                -precisionCast<int>(GuardDim().toRT()));
 
-        for (size_t z = 0; z < eField_zt[0]->size().x(); z++)
-        {
-            zone::SphericZone < 2 > reduceZone(fieldE_coreBorder.size().shrink<2>());
-            for (int i = 0; i < 2; i++)
+            for(size_t z = 0; z < eField_zt[0]->size().x(); z++)
             {
-                *(eField_zt[i]->origin()(z, currentStep - firstTimestep)) =
-                    algorithm::kernel::Reduce()
-                        (cursor::make_FunctorCursor(
+                zone::SphericZone<2> reduceZone(fieldE_coreBorder.size().shrink<2>());
+                for(int i = 0; i < 2; i++)
+                {
+                    *(eField_zt[i]->origin()(z, currentStep - firstTimestep)) = algorithm::kernel::Reduce()(
+                        cursor::make_FunctorCursor(
                             cursor::tools::slice(fieldE_coreBorder.origin()(0, 0, z)),
-                            pmacc::algorithm::functor::GetComponent<typename FieldE::ValueType::type>(i == 0 ? 0 : 2)
-                        ),
+                            pmacc::algorithm::functor::GetComponent<typename FieldE::ValueType::type>(i == 0 ? 0 : 2)),
                         reduceZone,
                         nvidia::functors::Add());
+                }
             }
-        }
 
-        dc.releaseData( FieldE::getName() );
+            dc.releaseData(FieldE::getName());
 
-        if (currentStep == this->collectTimesteps + firstTimestep)
-            writeOutput();
-    }
+            if(currentStep == this->collectTimesteps + firstTimestep)
+                writeOutput();
+        }
 
-private:
-    // number of timesteps which collect the data
-    static constexpr uint32_t collectTimesteps = 512;
-    // first timestep which collects data
-    //   you may like to let the plasma develope/thermalize a little bit
-    static constexpr uint32_t firstTimestep = 1024;
+    private:
+        // number of timesteps which collect the data
+        static constexpr uint32_t collectTimesteps = 512;
+        // first timestep which collects data
+        //   you may like to let the plasma develop/thermalize a little bit
+        static constexpr uint32_t firstTimestep = 1024;
 
-    container::HostBuffer<float, 2 >* eField_zt[2];
+        std::array<std::unique_ptr<container::HostBuffer<float, 2>>, 2> eField_zt;
 
-    using BlockDim = pmacc::math::CT::Size_t < 16, 16, 1 >;
-    using GuardDim = SuperCellSize;
-};
+        using BlockDim = pmacc::math::CT::Size_t<16, 16, 1>;
+        using GuardDim = SuperCellSize;
+    };
 
 } // namespace picongpu
-
diff --git a/share/picongpu/examples/ThermalTest/include/picongpu/param/components.param b/share/picongpu/examples/ThermalTest/include/picongpu/param/components.param
index 12f2026dc3..807544ea5b 100644
--- a/share/picongpu/examples/ThermalTest/include/picongpu/param/components.param
+++ b/share/picongpu/examples/ThermalTest/include/picongpu/param/components.param
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Heiko Burau, Anton Helm, Rene Widera,
+/* Copyright 2013-2021 Axel Huebl, Heiko Burau, Anton Helm, Rene Widera,
  *                     Richard Pausch
  *
  * This file is part of PIConGPU.
@@ -30,9 +30,9 @@
 
 namespace picongpu
 {
-/*! Simulation Starter ---------------------------------------------------
- *  - thermalTestStarter         : starter for thermal test
- */
-namespace simulation_starter = thermalTestStarter;
+    /*! Simulation Starter ---------------------------------------------------
+     *  - thermalTestStarter         : starter for thermal test
+     */
+    namespace simulation_starter = thermalTestStarter;
 
 } // namespace picongpu
diff --git a/share/picongpu/examples/ThermalTest/include/picongpu/param/density.param b/share/picongpu/examples/ThermalTest/include/picongpu/param/density.param
index bbd1034ff2..eb6aa2b5a7 100644
--- a/share/picongpu/examples/ThermalTest/include/picongpu/param/density.param
+++ b/share/picongpu/examples/ThermalTest/include/picongpu/param/density.param
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Heiko Burau, Rene Widera, Felix Schmitt,
+/* Copyright 2013-2021 Axel Huebl, Heiko Burau, Rene Widera, Felix Schmitt,
  *                     Richard Pausch
  *
  * This file is part of PIConGPU.
@@ -25,22 +25,22 @@
 
 namespace picongpu
 {
-namespace SI
-{
-    /** Base density in particles per m^3 in the density profiles.
-     *
-     * This is often taken as reference maximum density in normalized profiles.
-     * Individual particle species can define a `densityRatio` flag relative
-     * to this value.
-     *
-     * unit: ELEMENTS/m^3
-     */
-    constexpr float_64 BASE_DENSITY_SI = 1.571e24;
-}
+    namespace SI
+    {
+        /** Base density in particles per m^3 in the density profiles.
+         *
+         * This is often taken as reference maximum density in normalized profiles.
+         * Individual particle species can define a `densityRatio` flag relative
+         * to this value.
+         *
+         * unit: ELEMENTS/m^3
+         */
+        constexpr float_64 BASE_DENSITY_SI = 1.571e24;
+    } // namespace SI
 
-namespace densityProfiles
-{
-    /* definition of homogenous density profile */
-    using Homogenous = HomogenousImpl;
-}
-}
+    namespace densityProfiles
+    {
+        /* definition of homogenous density profile */
+        using Homogenous = HomogenousImpl;
+    } // namespace densityProfiles
+} // namespace picongpu
diff --git a/share/picongpu/examples/ThermalTest/include/picongpu/param/grid.param b/share/picongpu/examples/ThermalTest/include/picongpu/param/grid.param
index af220aab28..f244b1e011 100644
--- a/share/picongpu/examples/ThermalTest/include/picongpu/param/grid.param
+++ b/share/picongpu/examples/ThermalTest/include/picongpu/param/grid.param
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera
+/* Copyright 2013-2021 Heiko Burau, Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -18,12 +18,10 @@
  */
 
 
-
 #pragma once
 
 namespace picongpu
 {
-
     namespace SI
     {
         /** Duration of one timestep
@@ -52,21 +50,21 @@ namespace picongpu
          * behave like the interaction of infinite "wire particles"
          * in fields with perfect symmetry in Z.
          */
-    } //namespace SI
+    } // namespace SI
 
-        //! Defines the size of the absorbing zone (in cells)
+    //! Defines the size of the absorbing zone (in cells)
     constexpr uint32_t ABSORBER_CELLS[3][2] = {
-        {0, 0},  /*x direction [negative,positive]*/
-        {0, 0},  /*y direction [negative,positive]*/
-        {0, 0}   /*z direction [negative,positive]*/
-    }; //unit: number of cells
+        {0, 0}, /*x direction [negative,positive]*/
+        {0, 0}, /*y direction [negative,positive]*/
+        {0, 0} /*z direction [negative,positive]*/
+    }; // unit: number of cells
 
     //! Define the strength of the absorber for any direction
     constexpr float_X ABSORBER_STRENGTH[3][2] = {
         {1.0e-3, 1.0e-3}, /*x direction [negative,positive]*/
         {1.0e-3, 1.0e-3}, /*y direction [negative,positive]*/
-        {1.0e-3, 1.0e-3}  /*z direction [negative,positive]*/
-    }; //unit: none
+        {1.0e-3, 1.0e-3} /*z direction [negative,positive]*/
+    }; // unit: none
 
     /** When to move the co-moving window.
      *  An initial pseudo particle, flying with the speed of light,
@@ -85,7 +83,4 @@ namespace picongpu
      */
     constexpr float_64 movePoint = 0.90;
 
-}
-
-
-
+} // namespace picongpu
diff --git a/share/picongpu/examples/ThermalTest/include/picongpu/param/memory.param b/share/picongpu/examples/ThermalTest/include/picongpu/param/memory.param
index 2fee1b1993..26ac7159d0 100644
--- a/share/picongpu/examples/ThermalTest/include/picongpu/param/memory.param
+++ b/share/picongpu/examples/ThermalTest/include/picongpu/param/memory.param
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Heiko Burau, Rene Widera, Benjamin Worpitz
+/* Copyright 2013-2021 Axel Huebl, Heiko Burau, Rene Widera, Benjamin Worpitz
  *
  * This file is part of PIConGPU.
  *
@@ -31,78 +31,87 @@
 #include <pmacc/math/Vector.hpp>
 #include <pmacc/mappings/kernel/MappingDescription.hpp>
 
+#include <array>
 
 namespace picongpu
 {
+    /* We have to hold back 350MiB for gpu-internal operations:
+     *   - random number generator
+     *   - reduces
+     *   - ...
+     */
+    constexpr size_t reservedGpuMemorySize = 350 * 1024 * 1024;
 
-/* We have to hold back 350MiB for gpu-internal operations:
- *   - random number generator
- *   - reduces
- *   - ...
- */
-constexpr size_t reservedGpuMemorySize = 350 *1024*1024;
+    /* short namespace*/
+    namespace mCT = pmacc::math::CT;
+    /** size of a superCell
+     *
+     * volume of a superCell must be <= 1024
+     */
+    using SuperCellSize = typename mCT::shrinkTo<mCT::Int<8, 8, 4>, simDim>::type;
 
-/* short namespace*/
-namespace mCT = pmacc::math::CT;
-/** size of a superCell
- *
- * volume of a superCell must be <= 1024
- */
-using SuperCellSize = typename mCT::shrinkTo<
-    mCT::Int< 8, 8, 4 >,
-    simDim
->::type;
+    /** define the object for mapping superCells to cells*/
+    using MappingDesc = MappingDescription<simDim, SuperCellSize>;
 
-/** define the object for mapping superCells to cells*/
-using MappingDesc = MappingDescription<simDim, SuperCellSize>;
+    /** define the size of the core, border and guard area
+     *
+     * PIConGPU uses spatial domain-decomposition for parallelization
+     * over multiple devices with non-shared memory architecture.
+     * The global spatial domain is organized per device in three
+     * sections: the GUARD area contains copies of neighboring
+     * devices (also known as "halo"/"ghost").
+     * The BORDER area is the outermost layer of cells of a device,
+     * equal to what neighboring devices see as GUARD area.
+     * The CORE area is the innermost area of a device. In union with
+     * the BORDER area it defines the "active" spatial domain on a device.
+     *
+     * GuardSize is defined in units of SuperCellSize per dimension.
+     */
+    using GuardSize = typename mCT::shrinkTo<mCT::Int<1, 1, 1>, simDim>::type;
 
-/** define the size of the core, border and guard area
- *
- * PIConGPU uses spatial domain-decomposition for parallelization
- * over multiple devices with non-shared memory architecture.
- * The global spatial domain is organized per device in three
- * sections: the GUARD area contains copies of neighboring
- * devices (also known as "halo"/"ghost").
- * The BORDER area is the outermost layer of cells of a device,
- * equally to what neighboring devices see as GUARD area.
- * The CORE area is the innermost area of a device. In union with
- * the BORDER area it defines the "active" spatial domain on a device.
- *
- * GuardSize is defined in units of SuperCellSize per dimension.
- */
-using GuardSize = typename mCT::shrinkTo<
-    mCT::Int< 1, 1, 1 >,
-    simDim
->::type;
+    /** bytes reserved for species exchange buffer
+     *
+     * This is the default configuration for species exchange buffer sizes.
+     * The default exchange buffer sizes can be changed per species by adding
+     * the alias exchangeMemCfg with similar members like in DefaultExchangeMemCfg
+     * to its flag list.
+     */
+    struct DefaultExchangeMemCfg
+    {
+        // memory used for a direction
+        static constexpr uint32_t BYTES_EXCHANGE_X = 40 * 1024 * 1024; // 40 MiB
+        static constexpr uint32_t BYTES_EXCHANGE_Y = 40 * 1024 * 1024; // 40 MiB
+        static constexpr uint32_t BYTES_EXCHANGE_Z = 40 * 1024 * 1024; // 40 MiB
+        static constexpr uint32_t BYTES_EDGES = 3 * 1024 * 1024; // 3 MiB
+        static constexpr uint32_t BYTES_CORNER = 800 * 1024; // 800 kiB
 
-/** bytes reserved for species exchange buffer
- *
- * This is the default configuration for species exchanges buffer sizes.
- * The default exchange buffer sizes can be changed per species by adding
- * the alias exchangeMemCfg with similar members like in DefaultExchangeMemCfg
- * to its flag list.
- */
-struct DefaultExchangeMemCfg
-{
-    // memory used for a direction
-    static constexpr uint32_t BYTES_EXCHANGE_X = 40 * 1024 * 1024; // 40 MiB
-    static constexpr uint32_t BYTES_EXCHANGE_Y = 40 * 1024 * 1024; // 40 MiB
-    static constexpr uint32_t BYTES_EXCHANGE_Z = 40 * 1024 * 1024; // 40 MiB
-    static constexpr uint32_t BYTES_EDGES = 3 * 1024 * 1024; // 3 MiB
-    static constexpr uint32_t BYTES_CORNER = 800 * 1024; // 800 kiB
-};
+        /** Reference local domain size
+         *
+         * The size of the local domain for which the exchange sizes `BYTES_*` are configured.
+         * The required size of each exchange will be calculated at runtime based on the local domain size and the
+         * reference size. The exchange size will be scaled only up and not down. Zero means that there is no reference
+         * domain size, exchanges will not be scaled.
+         */
+        using REF_LOCAL_DOM_SIZE = mCT::Int<0, 0, 0>;
+        /** Scaling rate per direction.
+         *
+         * 1.0 means it scales linearly with the ratio between the local domain size at runtime and the reference local
+         * domain size.
+         */
+        const std::array<float_X, 3> DIR_SCALING_FACTOR = {{0.0, 0.0, 0.0}};
+    };
 
-/** number of scalar fields that are reserved as temporary fields */
-constexpr uint32_t fieldTmpNumSlots = 1;
+    /** number of scalar fields that are reserved as temporary fields */
+    constexpr uint32_t fieldTmpNumSlots = 1;
 
-/** can `FieldTmp` gather neighbor information
- *
- * If `true` it is possible to call the method `asyncCommunicationGather()`
- * to copy data from the border of neighboring GPU into the local guard.
- * This is also known as building up a "ghost" or "halo" region in domain
- * decomposition and only necessary for specific algorithms that extend
- * the basic PIC cycle, e.g. with dependence on derived density or energy fields.
- */
-constexpr bool fieldTmpSupportGatherCommunication = true;
+    /** can `FieldTmp` gather neighbor information
+     *
+     * If `true` it is possible to call the method `asyncCommunicationGather()`
+     * to copy data from the border of neighboring GPU into the local guard.
+     * This is also known as building up a "ghost" or "halo" region in domain
+     * decomposition and only necessary for specific algorithms that extend
+     * the basic PIC cycle, e.g. with dependence on derived density or energy fields.
+     */
+    constexpr bool fieldTmpSupportGatherCommunication = true;
 
 } // namespace picongpu
diff --git a/share/picongpu/examples/ThermalTest/include/picongpu/param/particle.param b/share/picongpu/examples/ThermalTest/include/picongpu/param/particle.param
index 636607e9af..0d2e2bc9c4 100644
--- a/share/picongpu/examples/ThermalTest/include/picongpu/param/particle.param
+++ b/share/picongpu/examples/ThermalTest/include/picongpu/param/particle.param
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Heiko Burau, Rene Widera
+/* Copyright 2013-2021 Axel Huebl, Heiko Burau, Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -27,49 +27,45 @@
 
 namespace picongpu
 {
-
-namespace particles
-{
-
-    /** a particle with a weighting below MIN_WEIGHTING will not
-     *      be created / will be deleted
-     *  unit: none
-     */
-    constexpr float_X MIN_WEIGHTING = 10.0;
-
-namespace manipulators
-{
-    struct TemperatureParam
+    namespace particles
     {
-        /** Initial temperature
-         *  unit: keV
+        /** a particle with a weighting below MIN_WEIGHTING will not
+         *      be created / will be deleted
+         *  unit: none
          */
-        static constexpr float_64 temperature = 51.16;
-    };
-    using AddTemperature = unary::Temperature< TemperatureParam > ;
-} // namespace manipulators
+        constexpr float_X MIN_WEIGHTING = 10.0;
 
-namespace startPosition
-{
+        namespace manipulators
+        {
+            struct TemperatureParam
+            {
+                /** Initial temperature
+                 *  unit: keV
+                 */
+                static constexpr float_64 temperature = 51.16;
+            };
+            using AddTemperature = unary::Temperature<TemperatureParam>;
+        } // namespace manipulators
 
-    struct RandomParameter16ppc
-    {
-        /** Count of particles per cell at initial state
-         *  unit: none
-         */
-        static constexpr uint32_t numParticlesPerCell = 16u;
-    };
-    // definition of random particle start
-    using Random16ppc = RandomImpl< RandomParameter16ppc >;
+        namespace startPosition
+        {
+            struct RandomParameter16ppc
+            {
+                /** Count of particles per cell at initial state
+                 *  unit: none
+                 */
+                static constexpr uint32_t numParticlesPerCell = 16u;
+            };
+            // definition of random particle start
+            using Random16ppc = RandomImpl<RandomParameter16ppc>;
 
-} // namespace startPosition
+        } // namespace startPosition
 
-    /** During unit normalization, we assume this is a typical
-     *  number of particles per cell for normalization of weighted
-     *  particle attributes.
-     */
-    constexpr uint32_t TYPICAL_PARTICLES_PER_CELL =
-        startPosition::RandomParameter16ppc::numParticlesPerCell;
+        /** During unit normalization, we assume this is a typical
+         *  number of particles per cell for normalization of weighted
+         *  particle attributes.
+         */
+        constexpr uint32_t TYPICAL_PARTICLES_PER_CELL = startPosition::RandomParameter16ppc::numParticlesPerCell;
 
-} // namespace particles
+    } // namespace particles
 } // namespace picongpu
diff --git a/share/picongpu/examples/ThermalTest/include/picongpu/param/speciesInitialization.param b/share/picongpu/examples/ThermalTest/include/picongpu/param/speciesInitialization.param
index f7cee2f02f..cccfc7f4d2 100644
--- a/share/picongpu/examples/ThermalTest/include/picongpu/param/speciesInitialization.param
+++ b/share/picongpu/examples/ThermalTest/include/picongpu/param/speciesInitialization.param
@@ -1,4 +1,4 @@
-/* Copyright 2015-2020 Rene Widera, Axel Huebl
+/* Copyright 2015-2021 Rene Widera, Axel Huebl
  *
  * This file is part of PIConGPU.
  *
@@ -33,32 +33,17 @@
 
 namespace picongpu
 {
-namespace particles
-{
-    /** InitPipeline define in which order species are initialized
-     *
-     * the functors are called in order (from first to last functor)
-     */
-    using InitPipeline = bmpl::vector<
-        CreateDensity<
-            densityProfiles::Homogenous,
-            startPosition::Random16ppc,
-            PIC_Ions
-        >,
-        ManipulateDerive<
-            manipulators::binary::ProtonTimesWeighting,
-            PIC_Ions,
-            PIC_Electrons
-        >,
-        Manipulate<
-            manipulators::AddTemperature,
-            PIC_Electrons
-        >,
-        Manipulate<
-            manipulators::AddTemperature,
-            PIC_Ions
-        >
-    >;
+    namespace particles
+    {
+        /** InitPipeline defines in which order species are initialized
+         *
+         * the functors are called in order (from first to last functor)
+         */
+        using InitPipeline = bmpl::vector<
+            CreateDensity<densityProfiles::Homogenous, startPosition::Random16ppc, PIC_Ions>,
+            ManipulateDerive<manipulators::binary::ProtonTimesWeighting, PIC_Ions, PIC_Electrons>,
+            Manipulate<manipulators::AddTemperature, PIC_Electrons>,
+            Manipulate<manipulators::AddTemperature, PIC_Ions>>;
 
-} // namespace particles
+    } // namespace particles
 } // namespace picongpu
diff --git a/share/picongpu/examples/ThermalTest/include/picongpu/param/starter.param b/share/picongpu/examples/ThermalTest/include/picongpu/param/starter.param
index 7630489bfa..f876fcf30c 100644
--- a/share/picongpu/examples/ThermalTest/include/picongpu/param/starter.param
+++ b/share/picongpu/examples/ThermalTest/include/picongpu/param/starter.param
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera
+/* Copyright 2013-2021 Heiko Burau, Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -18,18 +18,12 @@
  */
 
 
-
 #pragma once
 
 
 namespace picongpu
 {
-
     namespace thermalTestStarter
     {
-
     }
-}
-
-
-
+} // namespace picongpu
diff --git a/share/picongpu/examples/ThermalTest/include/picongpu/unitless/starter.unitless b/share/picongpu/examples/ThermalTest/include/picongpu/unitless/starter.unitless
index e3966aa9d7..55d6d9ee53 100644
--- a/share/picongpu/examples/ThermalTest/include/picongpu/unitless/starter.unitless
+++ b/share/picongpu/examples/ThermalTest/include/picongpu/unitless/starter.unitless
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera
+/* Copyright 2013-2021 Heiko Burau, Rene Widera
  *
  * This file is part of PIConGPU.
  *
@@ -34,7 +34,6 @@ namespace picongpu
         using SimStarter = ::picongpu::SimulationStarter<
             ::picongpu::InitialiserController,
             ::picongpu::PluginController,
-            ::picongpu::ThermalTestSimulation
-        >;
+            ::picongpu::ThermalTestSimulation>;
     }
-}
+} // namespace picongpu
diff --git a/share/picongpu/examples/ThermalTest/tools/dispersion.py b/share/picongpu/examples/ThermalTest/tools/dispersion.py
index e1561264b9..6634b991e5 100644
--- a/share/picongpu/examples/ThermalTest/tools/dispersion.py
+++ b/share/picongpu/examples/ThermalTest/tools/dispersion.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 #
-# Copyright 2013-2020 Heiko Burau, Axel Huebl
+# Copyright 2013-2021 Heiko Burau, Axel Huebl
 #
 # This file is part of PIConGPU.
 #
diff --git a/share/picongpu/examples/TransitionRadiation/etc/picongpu/1.cfg b/share/picongpu/examples/TransitionRadiation/etc/picongpu/1.cfg
index 146623a785..761526431d 100644
--- a/share/picongpu/examples/TransitionRadiation/etc/picongpu/1.cfg
+++ b/share/picongpu/examples/TransitionRadiation/etc/picongpu/1.cfg
@@ -1,4 +1,4 @@
-# Copyright 2013-2020 Richard Pausch, Felix Schmitt, Axel Huebl, Finn-Ole Carstens
+# Copyright 2013-2021 Richard Pausch, Felix Schmitt, Axel Huebl, Finn-Ole Carstens
 #
 # This file is part of PIConGPU.
 #
@@ -61,10 +61,6 @@ TBG_e_histogram="--e_energyHistogram.period 10 \
                  --e_energyHistogram.maxEnergy 500000 \
                  --e_energyHistogram.filter all"
 
-# optional hdf5 output
-TBG_hdf5="--hdf5.period 10\
-          --hdf5.file pos"
-
 # macroparticle count to see time consumption of transition radiation plugin
 TBG_e_macroParticleCount="--e_macroParticlesCount.period 10"
 
diff --git a/share/picongpu/examples/TransitionRadiation/etc/picongpu/16.cfg b/share/picongpu/examples/TransitionRadiation/etc/picongpu/16.cfg
index 1303fd0223..ddc561a99a 100644
--- a/share/picongpu/examples/TransitionRadiation/etc/picongpu/16.cfg
+++ b/share/picongpu/examples/TransitionRadiation/etc/picongpu/16.cfg
@@ -1,4 +1,4 @@
-# Copyright 2013-2020 Richard Pausch, Felix Schmitt, Axel Huebl, Finn-Ole Carstens
+# Copyright 2013-2021 Richard Pausch, Felix Schmitt, Axel Huebl, Finn-Ole Carstens
 #
 # This file is part of PIConGPU.
 #
@@ -61,10 +61,6 @@ TBG_e_histogram="--e_energyHistogram.period 10 \
                  --e_energyHistogram.maxEnergy 500000 \
                  --e_energyHistogram.filter all"
 
-# optional hdf5 output
-TBG_hdf5="--hdf5.period 10\
-          --hdf5.file pos"
-
 # macroparticle count to see time consumption of transition radiation plugin
 TBG_e_macroParticleCount="--e_macroParticlesCount.period 10"
 
diff --git a/share/picongpu/examples/TransitionRadiation/include/picongpu/param/density.param b/share/picongpu/examples/TransitionRadiation/include/picongpu/param/density.param
index be19511511..f1a5bda1f4 100644
--- a/share/picongpu/examples/TransitionRadiation/include/picongpu/param/density.param
+++ b/share/picongpu/examples/TransitionRadiation/include/picongpu/param/density.param
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Heiko Burau, Rene Widera, Felix Schmitt,
+/* Copyright 2013-2021 Axel Huebl, Heiko Burau, Rene Widera, Felix Schmitt,
  *                     Richard Pausch, Finn-Ole Carstens
  *
  * This file is part of PIConGPU.
@@ -40,18 +40,17 @@ namespace picongpu
          */
         constexpr float_64 BASE_DENSITY_SI = 1.0e22;
 
-    }
+    } // namespace SI
 
     namespace densityProfiles
     {
-
-        PMACC_STRUCT(GaussianCloudParam,
+        PMACC_STRUCT(
+            GaussianCloudParam,
             /** Profile Formula:
              *     exponent = |globalCellPos - center| / sigma
              *     density = e^[ gasFactor * exponent^gasPower ]
              */
-            (PMACC_C_VALUE(float_X, gasFactor, -0.5))
-            (PMACC_C_VALUE(float_X, gasPower, 2.0))
+            (PMACC_C_VALUE(float_X, gasFactor, -0.5))(PMACC_C_VALUE(float_X, gasPower, 2.0))
 
             /** height of vacuum area on top border
              *
@@ -64,14 +63,13 @@ namespace picongpu
             /** The central position of the density distribution
              *  unit: meter
              */
-            (PMACC_C_VECTOR_DIM(float_64, simDim, center_SI, 128 * 0.16e-6 / 2.0, 0.912e-5 , 128 * 0.16e-6 / 2.0))
+            (PMACC_C_VECTOR_DIM(float_64, simDim, center_SI, 128 * 0.16e-6 / 2.0, 0.912e-5, 128 * 0.16e-6 / 2.0))
 
             /** the distance from gasCenter_SI until the density decreases to its 1/e-th part
              *  unit: meter */
-            (PMACC_C_VECTOR_DIM(float_64, simDim, sigma_SI, 3.5e-6, 3.0e-6, 3.5e-6))
-        ); /* struct GaussianCloudParam */
+            (PMACC_C_VECTOR_DIM(float_64, simDim, sigma_SI, 3.5e-6, 3.0e-6, 3.5e-6))); /* struct GaussianCloudParam */
 
         /* definition of cloud profile */
-        using GaussianCloud = GaussianCloudImpl< GaussianCloudParam >;
-    }
-}
+        using GaussianCloud = GaussianCloudImpl<GaussianCloudParam>;
+    } // namespace densityProfiles
+} // namespace picongpu
diff --git a/share/picongpu/examples/TransitionRadiation/include/picongpu/param/fieldSolver.param b/share/picongpu/examples/TransitionRadiation/include/picongpu/param/fieldSolver.param
index a30867816b..41c9586801 100644
--- a/share/picongpu/examples/TransitionRadiation/include/picongpu/param/fieldSolver.param
+++ b/share/picongpu/examples/TransitionRadiation/include/picongpu/param/fieldSolver.param
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Heiko Burau, Rene Widera
+/* Copyright 2013-2021 Axel Huebl, Heiko Burau, Rene Widera, Sergei Bastrakov, Klaus Steiniger
  *
  * This file is part of PIConGPU.
  *
@@ -25,6 +25,11 @@
  *
  * Also allows to configure ad hoc mitigations for high frequency
  * noise in some setups via current smoothing.
+ *
+ * \attention
+ * Currently, the laser initialization in PIConGPU is implemented to work with the standard Yee solver.
+ * Using a solver of higher order will result in a slightly increased laser amplitude and energy than expected.
+ *
  */
 
 #pragma once
@@ -35,37 +40,43 @@
 
 namespace picongpu
 {
-namespace fields
-{
-
-    /** Current Interpolation
-     *
-     * CurrentInterpolation is used to set a method performing the
-     * interpolate/assign operation from the generated currents of particle
-     * species to the electro-magnetic fields.
-     *
-     * Allowed values are:
-     *   - None:
-     *     - default for staggered grids/Yee-scheme
-     *     - updates E
-     *   - Binomial: 2nd order Binomial filter
-     *     - smooths the current before assignment in staggered grid
-     *     - updates E & breaks local charge conservation slightly
-     *   - NoneDS:
-     *     - experimental assignment for all-centered/directional splitting
-     *     - updates E & B at the same time
-     */
-    using CurrentInterpolation = currentInterpolation::None;
+    namespace fields
+    {
+        /** Current Interpolation
+         *
+         * CurrentInterpolation is used to set a method performing the
+         * interpolate/assign operation from the generated currents of particle
+         * species to the electro-magnetic fields.
+         *
+         * Allowed values are:
+         *   - None:
+         *     - default for staggered grids/Yee-scheme
+         *     - updates E
+         *   - Binomial: 2nd order Binomial filter
+         *     - smooths the current before assignment in staggered grid
+         *     - updates E & breaks local charge conservation slightly
+         */
+        using CurrentInterpolation = currentInterpolation::None;
 
-    /** FieldSolver
-     *
-     * Field Solver Selection:
-     *  - Yee< CurrentInterpolation > : standard Yee solver
-     *  - Lehe< CurrentInterpolation >: Num. Cherenkov free field solver in a chosen direction
-     *  - DirSplitting< CurrentInterpolation >: Sentoku's Directional Splitting Method
-     *  - None< CurrentInterpolation >: disable the vacuum update of E and B
-     */
-    using Solver = maxwellSolver::None< CurrentInterpolation >;
+        /** FieldSolver
+         *
+         * Field Solver Selection:
+         *  - Yee< CurrentInterpolation > : Standard Yee solver approximating derivatives with respect to time and
+         * space by second order finite differences.
+         *  - YeePML< CurrentInterpolation >: Standard Yee solver using Perfectly Matched Layer Absorbing Boundary
+         * Conditions (PML)
+         *  - Lehe< CurrentInterpolation >: Num. Cherenkov free field solver in a chosen direction
+         *  - LehePML< CurrentInterpolation >: Num. Cherenkov free field solver in a chosen direction
+         *                                     using Perfectly Matched Layer Absorbing Boundary Conditions (PML)
+         *  - ArbitraryOrderFDTD< 4, CurrentInterpolation >: Solver using 4 neighbors to each direction to approximate
+         * *spatial* derivatives by finite differences. The number of neighbors can be changed from 4 to any positive,
+         * integer number. The order of the solver will be twice the number of neighbors in each direction. Yee's
+         * method is a special case of this using one neighbor to each direction.
+         *  - ArbitraryOrderFDTDPML< 4, CurrentInterpolation >: ArbitraryOrderFDTD solver using Perfectly Matched Layer
+         *                                                      Absorbing Boundary Conditions (PML)
+         *  - None< CurrentInterpolation >: disable the vacuum update of E and B
+         */
+        using Solver = maxwellSolver::None<CurrentInterpolation>;
 
-} // namespace fields
+    } // namespace fields
 } // namespace picongpu
diff --git a/share/picongpu/examples/TransitionRadiation/include/picongpu/param/grid.param b/share/picongpu/examples/TransitionRadiation/include/picongpu/param/grid.param
index effd12601c..5ddfa2cb36 100644
--- a/share/picongpu/examples/TransitionRadiation/include/picongpu/param/grid.param
+++ b/share/picongpu/examples/TransitionRadiation/include/picongpu/param/grid.param
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Rene Widera, Richard Pausch, Benjamin Worpitz
+/* Copyright 2013-2021 Rene Widera, Richard Pausch, Benjamin Worpitz
  *
  * This file is part of PIConGPU.
  *
@@ -18,12 +18,10 @@
  */
 
 
-
 #pragma once
 
 namespace picongpu
 {
-
     namespace SI
     {
         /** Duration of one timestep
@@ -52,21 +50,21 @@ namespace picongpu
          * behave like the interaction of infinite "wire particles"
          * in fields with perfect symmetry in Z.
          */
-    } //namespace SI
+    } // namespace SI
 
     //! Defines the size of the absorbing zone (in cells)
     constexpr uint32_t ABSORBER_CELLS[3][2] = {
-        {32, 32},  /*x direction [negative,positive]*/
-        {32, 32},  /*y direction [negative,positive]*/
-        {32, 32}   /*z direction [negative,positive]*/
-    }; //unit: number of cells
+        {32, 32}, /*x direction [negative,positive]*/
+        {32, 32}, /*y direction [negative,positive]*/
+        {32, 32} /*z direction [negative,positive]*/
+    }; // unit: number of cells
 
     //! Define the strength of the absorber for any direction
     constexpr float_X ABSORBER_STRENGTH[3][2] = {
         {1.0e-3, 1.0e-3}, /*x direction [negative,positive]*/
         {1.0e-3, 1.0e-3}, /*y direction [negative,positive]*/
-        {1.0e-3, 1.0e-3}  /*z direction [negative,positive]*/
-    }; //unit: none
+        {1.0e-3, 1.0e-3} /*z direction [negative,positive]*/
+    }; // unit: none
 
     constexpr uint32_t ABSORBER_FADE_IN_STEPS = 16;
 
@@ -82,7 +80,4 @@ namespace picongpu
      */
     constexpr float_64 movePoint = 0.90;
 
-}
-
-
-
+} // namespace picongpu
diff --git a/share/picongpu/examples/TransitionRadiation/include/picongpu/param/particle.param b/share/picongpu/examples/TransitionRadiation/include/picongpu/param/particle.param
index f9eec216ba..eea756f23c 100644
--- a/share/picongpu/examples/TransitionRadiation/include/picongpu/param/particle.param
+++ b/share/picongpu/examples/TransitionRadiation/include/picongpu/param/particle.param
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Rene Widera, Richard Pausch, Axel Huebl
+/* Copyright 2013-2021 Rene Widera, Richard Pausch, Axel Huebl
  *
  * This file is part of PIConGPU.
  *
@@ -32,19 +32,17 @@ namespace picongpu
 {
     namespace particles
     {
-
         /* a particle with a weighting below MIN_WEIGHTING will not
-        *      be created / will be deleted
-        *  unit: none
-        */
+         *      be created / will be deleted
+         *  unit: none
+         */
         constexpr float_X MIN_WEIGHTING = 1.0;
 
         constexpr uint32_t TYPICAL_PARTICLES_PER_CELL = 50;
 
         namespace manipulators
         {
-
-            CONST_VECTOR( float_X, 3, DriftParamNegative_direction, 1.0, 1.0, 0.0 );
+            CONST_VECTOR(float_X, 3, DriftParamNegative_direction, 1.0, 1.0, 0.0);
             struct DriftParamNegative
             {
                 /** Initial particle drift velocity for electrons and ions
@@ -56,10 +54,7 @@ namespace picongpu
                 const DriftParamNegative_direction_t direction;
             };
             // definition of SetDrift start
-            using AssignYDriftNegative = unary::Drift<
-                DriftParamNegative,
-                nvidia::functors::Assign
-            >;
+            using AssignYDriftNegative = unary::Drift<DriftParamNegative, nvidia::functors::Assign>;
 
         } // namespace manipulators
 
@@ -73,7 +68,7 @@ namespace picongpu
                  */
                 static constexpr uint32_t numParticlesPerCell = TYPICAL_PARTICLES_PER_CELL;
             };
-            using Random = RandomImpl< RandomParameter >;
+            using Random = RandomImpl<RandomParameter>;
         } // namespace startPosition
     } // namespace particles
 } // namespace picongpu
diff --git a/share/picongpu/examples/TransitionRadiation/include/picongpu/param/png.param b/share/picongpu/examples/TransitionRadiation/include/picongpu/param/png.param
index 788e0121ae..4ed07c907e 100644
--- a/share/picongpu/examples/TransitionRadiation/include/picongpu/param/png.param
+++ b/share/picongpu/examples/TransitionRadiation/include/picongpu/param/png.param
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Richard Pausch
+/* Copyright 2013-2021 Heiko Burau, Richard Pausch
  *
  * This file is part of PIConGPU.
  *
@@ -24,17 +24,17 @@
 
 namespace picongpu
 {
-/*scale image before write to file, only scale if value is not 1.0
- */
-constexpr float_64 scale_image = 1.0;
+    /*scale image before write to file, only scale if value is not 1.0
+     */
+    constexpr float_64 scale_image = 1.0;
 
-/*if true image is scaled if cellsize is not quadratic, else no scale*/
-constexpr bool scale_to_cellsize = true;
+    /*if true image is scaled if cellsize is not quadratic, else no scale*/
+    constexpr bool scale_to_cellsize = true;
 
-constexpr bool white_box_per_GPU = true;
+    constexpr bool white_box_per_GPU = true;
 
-namespace visPreview
-{
+    namespace visPreview
+    {
 // normalize EM fields to typical laser or plasma quantities
 //-1: Auto:    enable adaptive scaling for each output
 // 1: Laser:   typical fields calculated out of the laser amplitude
@@ -49,33 +49,32 @@ namespace visPreview
 #define EM_FIELD_SCALE_CHANNEL2 -1
 #define EM_FIELD_SCALE_CHANNEL3 -1
 
-// multiply highest undisturbed particle density with factor
-constexpr float_X preParticleDens_opacity = 0.25;
-constexpr float_X preChannel1_opacity = 1.0;
-constexpr float_X preChannel2_opacity = 1.0;
-constexpr float_X preChannel3_opacity = 1.0;
-
-// specify color scales for each channel
-namespace preParticleDensCol = colorScales::red;
-namespace preChannel1Col = colorScales::blue;
-namespace preChannel2Col = colorScales::green;
-namespace preChannel3Col = colorScales::none;
+        // multiply highest undisturbed particle density with factor
+        constexpr float_X preParticleDens_opacity = 0.25;
+        constexpr float_X preChannel1_opacity = 1.0;
+        constexpr float_X preChannel2_opacity = 1.0;
+        constexpr float_X preChannel3_opacity = 1.0;
 
-/* png preview settings for each channel */
-DINLINE float_X preChannel1(const float3_X& field_B, const float3_X& field_E, const float3_X& field_J)
-{
-    return math::abs2(field_J);
-}
+        // specify color scales for each channel
+        namespace preParticleDensCol = colorScales::red;
+        namespace preChannel1Col = colorScales::blue;
+        namespace preChannel2Col = colorScales::green;
+        namespace preChannel3Col = colorScales::none;
 
-DINLINE float_X preChannel2(const float3_X& field_B, const float3_X& field_E, const float3_X& field_J)
-{
-    return field_E.x() * field_E.x();
-}
+        /* png preview settings for each channel */
+        DINLINE float_X preChannel1(const float3_X& field_B, const float3_X& field_E, const float3_X& field_J)
+        {
+            return pmacc::math::abs2(field_J);
+        }
 
-DINLINE float_X preChannel3(const float3_X& field_B, const float3_X& field_E, const float3_X& field_J)
-{
-    return -1.0_X * field_E.y();
-}
-}
-}
+        DINLINE float_X preChannel2(const float3_X& field_B, const float3_X& field_E, const float3_X& field_J)
+        {
+            return field_E.x() * field_E.x();
+        }
 
+        DINLINE float_X preChannel3(const float3_X& field_B, const float3_X& field_E, const float3_X& field_J)
+        {
+            return -1.0_X * field_E.y();
+        }
+    } // namespace visPreview
+} // namespace picongpu
diff --git a/share/picongpu/examples/TransitionRadiation/include/picongpu/param/species.param b/share/picongpu/examples/TransitionRadiation/include/picongpu/param/species.param
deleted file mode 100644
index df5d1a5664..0000000000
--- a/share/picongpu/examples/TransitionRadiation/include/picongpu/param/species.param
+++ /dev/null
@@ -1,78 +0,0 @@
-/* Copyright 2014-2020 Rene Widera, Richard Pausch
- *
- * This file is part of PIConGPU.
- *
- * PIConGPU is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * PIConGPU is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with PIConGPU.
- * If not, see <http://www.gnu.org/licenses/>.
- */
-
-#pragma once
-
-#include "picongpu/particles/shapes.hpp"
-#include "picongpu/algorithms/FieldToParticleInterpolationNative.hpp"
-#include "picongpu/algorithms/FieldToParticleInterpolation.hpp"
-#include "picongpu/algorithms/AssignedTrilinearInterpolation.hpp"
-
-#include "picongpu/particles/flylite/NonLTE.def"
-#include "picongpu/fields/currentDeposition/Solver.def"
-
-
-namespace picongpu
-{
-/*---------------------------- generic solver---------------------------------*/
-
-/*! Particle Shape definitions -------------------------------------------------
- *  - particles::shapes::CIC : 1st order
- *  - particles::shapes::TSC : 2nd order
- *  - particles::shapes::PCS : 3rd order
- *  - particles::shapes::P4S : 4th order
- *
- *  example: using UsedParticleShape = particles::shapes::CIC;
- */
-using UsedParticleShape = particles::shapes::CIC;
-
-/* define which interpolation method is used to interpolate fields to particle*/
-using UsedField2Particle = FieldToParticleInterpolation< UsedParticleShape, AssignedTrilinearInterpolation >;
-
-/*! select current solver method -----------------------------------------------
- * - currentSolver::Esirkepov<SHAPE>  : particle shapes - CIC, TSC, PCS, P4S (1st to 4th order)
- * - currentSolver::VillaBune<>       : particle shapes - CIC (1st order) only
- * - currentSolver::EmZ<SHAPE>        : particle shapes - CIC, TSC, PCS, P4S (1st to 4th order)
- *
- * For development purposes: ---------------------------------------------------
- * - currentSolver::EsirkepovNative<SHAPE> : generic version of currentSolverEsirkepov
- *   without optimization (~4x slower and needs more shared memory)
- */
-using UsedParticleCurrentSolver = currentSolver::Esirkepov< UsedParticleShape >;
-
-/*! particle pusher configuration ----------------------------------------------
- *
- * Defining a pusher is optional for particles
- *
- * - particles::pusher::Vay : better suited relativistic boris pusher
- * - particles::pusher::Boris : standard boris pusher
- * - particles::pusher::ReducedLandauLifshitz : 4th order RungeKutta pusher
- *                                              with classical radiation reaction
- *
- * For diagnostics & modeling: ------------------------------------------------
- * - particles::pusher::Free : free propagation, ignore fields
- *                             (= free stream model)
- * - particles::pusher::Photon : propagate with c in direction of normalized mom.
- * - particles::pusher::Probe : Probe particles that interpolate E & B
- * For development purposes: --------------------------------------------------
- * - particles::pusher::Axel : a pusher developed at HZDR during 2011 (testing)
- */
-using UsedParticlePusher = particles::pusher::Boris;
-
-} // namespace picongpu
diff --git a/share/picongpu/examples/TransitionRadiation/include/picongpu/param/speciesDefinition.param b/share/picongpu/examples/TransitionRadiation/include/picongpu/param/speciesDefinition.param
index 5fb514f525..c723423895 100644
--- a/share/picongpu/examples/TransitionRadiation/include/picongpu/param/speciesDefinition.param
+++ b/share/picongpu/examples/TransitionRadiation/include/picongpu/param/speciesDefinition.param
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Rene Widera, Benjamin Worpitz, Heiko Burau,
+/* Copyright 2013-2021 Rene Widera, Benjamin Worpitz, Heiko Burau,
  *                     Richard Pausch
  *
  * This file is part of PIConGPU.
@@ -32,16 +32,10 @@
 
 namespace picongpu
 {
-
     /*########################### define particle attributes #####################*/
 
     /** describe attributes of a particle*/
-    using DefaultParticleAttributes = MakeSeq_t<
-        position< position_pic >,
-        momentum,
-        weighting,
-        transitionRadiationMask
-    >;
+    using DefaultParticleAttributes = MakeSeq_t<position<position_pic>, momentum, weighting, transitionRadiationMask>;
 
     /*########################### end particle attributes ########################*/
 
@@ -50,28 +44,21 @@ namespace picongpu
     /*--------------------------- electrons --------------------------------------*/
 
     /* ratio relative to BASE_CHARGE and BASE_MASS */
-    value_identifier( float_X, MassRatioElectrons, 1.0 );
-    value_identifier( float_X, ChargeRatioElectrons, 1.0 );
+    value_identifier(float_X, MassRatioElectrons, 1.0);
+    value_identifier(float_X, ChargeRatioElectrons, 1.0);
 
     using ParticleFlagsElectrons = MakeSeq_t<
-        particlePusher< UsedParticlePusher >,
-        shape< UsedParticleShape >,
-        interpolation< UsedField2Particle >,
-        massRatio< MassRatioElectrons >,
-        chargeRatio< ChargeRatioElectrons >
-    >;
+        particlePusher<UsedParticlePusher>,
+        shape<UsedParticleShape>,
+        interpolation<UsedField2Particle>,
+        massRatio<MassRatioElectrons>,
+        chargeRatio<ChargeRatioElectrons>>;
 
     /* define species electrons */
-    using PIC_Electrons = Particles<
-        PMACC_CSTRING( "e" ),
-        ParticleFlagsElectrons,
-        DefaultParticleAttributes
-    >;
+    using PIC_Electrons = Particles<PMACC_CSTRING("e"), ParticleFlagsElectrons, DefaultParticleAttributes>;
 
     /*########################### end species ####################################*/
 
-    using VectorAllSpecies = MakeSeq_t<
-        PIC_Electrons
-    >;
+    using VectorAllSpecies = MakeSeq_t<PIC_Electrons>;
 
-} //namespace picongpu
+} // namespace picongpu
diff --git a/share/picongpu/examples/TransitionRadiation/include/picongpu/param/speciesInitialization.param b/share/picongpu/examples/TransitionRadiation/include/picongpu/param/speciesInitialization.param
index 1fee353a50..bdb6445015 100644
--- a/share/picongpu/examples/TransitionRadiation/include/picongpu/param/speciesInitialization.param
+++ b/share/picongpu/examples/TransitionRadiation/include/picongpu/param/speciesInitialization.param
@@ -1,4 +1,4 @@
-/* Copyright 2015-2020 Rene Widera, Axel Huebl
+/* Copyright 2015-2021 Rene Widera, Axel Huebl
  *
  * This file is part of PIConGPU.
  *
@@ -33,23 +33,15 @@
 
 namespace picongpu
 {
-namespace particles
-{
-    /** InitPipeline define in which order species are initialized
-     *
-     * the functors are called in order (from first to last functor)
-     */
-    using InitPipeline = bmpl::vector<
-        CreateDensity<
-            densityProfiles::GaussianCloud,
-            startPosition::Random,
-            PIC_Electrons
-        >,
-        Manipulate<
-            manipulators::AssignYDriftNegative,
-            PIC_Electrons
-        >
-    >;
+    namespace particles
+    {
+        /** InitPipeline defines in which order species are initialized
+         *
+         * the functors are called in order (from first to last functor)
+         */
+        using InitPipeline = bmpl::vector<
+            CreateDensity<densityProfiles::GaussianCloud, startPosition::Random, PIC_Electrons>,
+            Manipulate<manipulators::AssignYDriftNegative, PIC_Electrons>>;
 
-} // namespace particles
+    } // namespace particles
 } // namespace picongpu
diff --git a/share/picongpu/examples/TransitionRadiation/include/picongpu/param/transitionRadiation.param b/share/picongpu/examples/TransitionRadiation/include/picongpu/param/transitionRadiation.param
index 7294e90e14..c9954201ed 100644
--- a/share/picongpu/examples/TransitionRadiation/include/picongpu/param/transitionRadiation.param
+++ b/share/picongpu/examples/TransitionRadiation/include/picongpu/param/transitionRadiation.param
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Rene Widera, Richard Pausch, Finn-Ole Carstens
+/* Copyright 2013-2021 Rene Widera, Richard Pausch, Finn-Ole Carstens
  *
  * This file is part of PIConGPU.
  *
@@ -43,225 +43,237 @@
 
 namespace picongpu
 {
-namespace plugins
-{
-// initiate the formfactor namespaces from the radiation plugin
-namespace radiation
-{
-    namespace radFormFactor_CIC_3D { }
-    namespace radFormFactor_TSC_3D { }
-    namespace radFormFactor_PCS_3D { }
-    namespace radFormFactor_CIC_1Dy { }
-    namespace radFormFactor_Gauss_spherical { }
-    namespace radFormFactor_Gauss_cell { }
-    namespace radFormFactor_incoherent { }
-    namespace radFormFactor_coherent { }
-} // namespace radiation
-
-namespace transitionRadiation
-{
-namespace linearFrequencies
-{
-    namespace SI
-    {
-        //! mimimum frequency of the linear frequency scale in units of [1/s]
-        constexpr float_64 omegaMin = 0.0;
-        //! maximum frequency of the linear frequency scale in units of [1/s]
-        constexpr float_64 omegaMax = 1.06e16;
-    }
-
-    //! number of frequency values to compute in the linear frequency [unitless]
-    constexpr unsigned int nOmega = 512;
-
-} // namespace linearFrequencies
-
-namespace logFrequencies
-{
-    namespace SI
-    {
-        //! mimimum frequency of the logarithmic frequency scale in units of [1/s]
-        constexpr float_64 omegaMin = 1.0e13;
-        //! maximum frequency of the logarithmic frequency scale in units of [1/s]
-        constexpr float_64 omegaMax = 1.0e17;
-    }
-
-    //! number of frequency values to compute in the logarithmic frequency [unitless]
-    constexpr unsigned int nOmega = 256;
-
-} // namespace logFrequencies
-
-
-namespace listFrequencies
-{
-    //! path to text file with frequencies
-    constexpr char listLocation[] = "/path/to/frequency_list";
-    //! number of frequency values to compute if frequencies are given in a file [unitless]
-    constexpr unsigned int nOmega = 512;
-
-} // namespace listFrequencies
-
-
-    /** selected mode of frequency scaling:
-     *
-     * options:
-     * - linearFrequencies
-     * - logFrequencies
-     * - listFrequencies
-     */
-    namespace frequencies = logFrequencies;
-
-    ///////////////////////////////////////////////////
-
-
-    /** correct treatment of coherent radiation from macro particles
-     *
-     * These formfactors are the same as in the radiation plugin!
-     * Choose different form factors in order to consider different particle shapes for radiation
-     *  - ::picongpu::plugins::radiation::radFormFactor_CIC_3D ... CIC charge distribution
-     *  - ::picongpu::plugins::radiation::radFormFactor_TSC_3D ... TSC charge distribution
-     *  - ::picongpu::plugins::radiation::radFormFactor_PCS_3D ... PCS charge distribution
-     *  - ::picongpu::plugins::radiation::radFormFactor_CIC_1Dy ... only CIC charge distribution in y
-     *  - ::picongpu::plugins::radiation::radFormFactor_Gauss_spherical ... symmetric Gauss charge distribution
-     *  - ::picongpu::plugins::radiation::radFormFactor_Gauss_cell ... Gauss charge distribution according to cell size
-     *  - ::picongpu::plugins::radiation::radFormFactor_incoherent ... only incoherent radiation
-     *  - ::picongpu::plugins::radiation::radFormFactor_coherent ... only coherent radiation
-     */
-    namespace macroParticleFormFactor = ::picongpu::plugins::radiation::radFormFactor_Gauss_spherical;
-
-    ///////////////////////////////////////////////////////////
-
-    namespace parameters
+    namespace plugins
     {
-        // number of observation directions
-        constexpr unsigned int nPhi = 128;
-        constexpr unsigned int nTheta = 128;
-        constexpr unsigned int nObserver = nPhi * nTheta;
-
-        // theta goes from 0 to pi
-        constexpr float_64 thetaMin = 0.0;
-        constexpr float_64 thetaMax = picongpu::PI;
-
-        // phi goes from 0 to 2*pi
-        constexpr float_64 phiMin = 0.0;
-        constexpr float_64 phiMax = 2 * picongpu::PI;
-
-        namespace SI
+        // initiate the formfactor namespaces from the radiation plugin
+        namespace radiation
         {
-            // z position of the foil to calculate transition radiation at
-            // leave at 0 for no virtual particle propagation
-            constexpr float_64 foilPosition = 0.0;
-        }
-
-    } // end namespace parameters
-
-
-    //! example of a filter for the relativistic Lorentz factor gamma
-    struct GammaFilterFunctor
-    {
-        //! Gamma value above which the radiation is calculated
-        static constexpr float_X filterGamma = 5.0;
-
-        template< typename T_Particle >
-        HDINLINE void operator()( T_Particle& particle )
+            namespace radFormFactor_CIC_3D
+            {
+            }
+            namespace radFormFactor_TSC_3D
+            {
+            }
+            namespace radFormFactor_PCS_3D
+            {
+            }
+            namespace radFormFactor_CIC_1Dy
+            {
+            }
+            namespace radFormFactor_Gauss_spherical
+            {
+            }
+            namespace radFormFactor_Gauss_cell
+            {
+            }
+            namespace radFormFactor_incoherent
+            {
+            }
+            namespace radFormFactor_coherent
+            {
+            }
+        } // namespace radiation
+
+        namespace transitionRadiation
         {
-            if(
-                picongpu::gamma<float_X>(
-                    particle[ picongpu::momentum_ ],
-                    picongpu::traits::attribute::getMass(
-                        particle[ picongpu::weighting_ ],
-                        particle
-                    )
-                ) >= filterGamma
-            )
-                particle[ picongpu::transitionRadiationMask_ ] = true;
-        }
-    };
-
-    /** filter to (de)select particles for the radiation calculation
-     *
-     * to activate the filter:
-     *   - goto file `speciesDefinition.param`
-     *   - add the attribute `transitionRadiationMask` to the particle species
-     */
-    using GammaFilter = picongpu::particles::manipulators::generic::Free<
-        GammaFilterFunctor
-    >;
-
-    /** Compute observation angles
-     *
-     * This function is used in the transition radiation plugin kernel to compute
-     * the observation directions given as a unit vector pointing
-     * towards a 'virtual' detector
-     *
-     * This default setup is an example of a 2D detector array. It computes
-     * observation directions for 2D virtual detector field
-     * with its center pointing toward the +y direction (for theta=0, phi=0)
-     * with observation angles ranging from
-     * theta = [angle_theta_start : angle_theta_end]
-     * phi   = [angle_phi_start   : angle_phi_end  ]
-     * Every observation_id_extern index moves the phi angle from its
-     * start value toward its end value until the observation_id_extern
-     * reaches N_split. After that the theta angle moves further from its
-     * start value towards its end value while phi is reset to its start
-     * value.
-     *
-     * The unit vector pointing towards the observing virtual detector
-     * can be described using theta and phi by:
-     * x_value = sin(theta) * cos(phi)
-     * y_value = cos(theta)
-     * z_value = sin(theta) * sin(phi)
-     * These are the standard spherical coordinates.
-     *
-     * The example setup describes an detector array of
-     * 128X128 detectors ranging from 0 to pi for the azimuth angle
-     * theta and from 0 to 2 pi for the polar angle phi.
-     *
-     * @param    observation_id_extern
-     *           int index that identifies each block on the GPU
-     *           to compute the observation direction
-     *
-     * @return   unit vector pointing in observation direction
-     *           type: float3_X
-     */
-    HDINLINE float3_X observationDirection(const int observation_id_extern)
-    {
-        /* generate two indices from single block index */
-        /** split distance of given index
-         * pseudo-code:
-         * index_a = index / split_distance
-         * index_b = index % split_distance
-         */
-        /** get index for computing angle theta: */
-        const int indexTheta = observation_id_extern / parameters::nPhi;
-
-        /** step width angle theta, set it to 0 if nTheta = 1 */
-        const picongpu::float_64 deltaTheta = ( parameters::nTheta > 1 ) ?
-                ( parameters::thetaMax - parameters::thetaMin ) / ( parameters::nTheta - 1.0 ) : 0.0;
-
-        /** compute observation angles theta */
-        const picongpu::float_64 theta = indexTheta * deltaTheta + parameters::thetaMin;
-
-        /** get index for computing angle phi: */
-        const int indexPhi = observation_id_extern % parameters::nPhi;
-
-        /** step width angle phi, set it to 0 if nPhi = 1 */
-        const picongpu::float_64 deltaPhi = ( parameters::nPhi > 1 ) ?
-                ( parameters::phiMax - parameters::phiMin ) / ( parameters::nPhi - 1.0 ) : 0.0;
-
-        /** compute observation angles phi */
-        const picongpu::float_64 phi = indexPhi * deltaPhi - parameters::phiMin;
-
-        /* helper functions for efficient trigonometric calculations */
-        picongpu::float_32 sinPhi;
-        picongpu::float_32 cosPhi;
-        picongpu::float_32 sinTheta;
-        picongpu::float_32 cosTheta;
-        math::sincos( precisionCast< picongpu::float_32 >( phi ), sinPhi, cosPhi );
-        math::sincos( precisionCast< picongpu::float_32 >( theta ), sinTheta, cosTheta );
-        /** compute observation unit vector */
-        return float3_X( sinTheta * cosPhi , cosTheta, sinTheta * sinPhi );
-    }
-
-} // namespace transitionRadiation
-} // namespace plugins
+            namespace linearFrequencies
+            {
+                namespace SI
+                {
+                    //! minimum frequency of the linear frequency scale in units of [1/s]
+                    constexpr float_64 omegaMin = 0.0;
+                    //! maximum frequency of the linear frequency scale in units of [1/s]
+                    constexpr float_64 omegaMax = 1.06e16;
+                } // namespace SI
+
+                //! number of frequency values to compute in the linear frequency [unitless]
+                constexpr unsigned int nOmega = 512;
+
+            } // namespace linearFrequencies
+
+            namespace logFrequencies
+            {
+                namespace SI
+                {
+                    //! minimum frequency of the logarithmic frequency scale in units of [1/s]
+                    constexpr float_64 omegaMin = 1.0e13;
+                    //! maximum frequency of the logarithmic frequency scale in units of [1/s]
+                    constexpr float_64 omegaMax = 1.0e17;
+                } // namespace SI
+
+                //! number of frequency values to compute in the logarithmic frequency [unitless]
+                constexpr unsigned int nOmega = 256;
+
+            } // namespace logFrequencies
+
+
+            namespace listFrequencies
+            {
+                //! path to text file with frequencies
+                constexpr char listLocation[] = "/path/to/frequency_list";
+                //! number of frequency values to compute if frequencies are given in a file [unitless]
+                constexpr unsigned int nOmega = 512;
+
+            } // namespace listFrequencies
+
+
+            /** selected mode of frequency scaling:
+             *
+             * options:
+             * - linearFrequencies
+             * - logFrequencies
+             * - listFrequencies
+             */
+            namespace frequencies = logFrequencies;
+
+            ///////////////////////////////////////////////////
+
+
+            /** correct treatment of coherent radiation from macro particles
+             *
+             * These formfactors are the same as in the radiation plugin!
+             * Choose different form factors in order to consider different particle shapes for radiation
+             *  - ::picongpu::plugins::radiation::radFormFactor_CIC_3D ... CIC charge distribution
+             *  - ::picongpu::plugins::radiation::radFormFactor_TSC_3D ... TSC charge distribution
+             *  - ::picongpu::plugins::radiation::radFormFactor_PCS_3D ... PCS charge distribution
+             *  - ::picongpu::plugins::radiation::radFormFactor_CIC_1Dy ... only CIC charge distribution in y
+             *  - ::picongpu::plugins::radiation::radFormFactor_Gauss_spherical ... symmetric Gauss charge distribution
+             *  - ::picongpu::plugins::radiation::radFormFactor_Gauss_cell ... Gauss charge distribution according to
+             * cell size
+             *  - ::picongpu::plugins::radiation::radFormFactor_incoherent ... only incoherent radiation
+             *  - ::picongpu::plugins::radiation::radFormFactor_coherent ... only coherent radiation
+             */
+            namespace macroParticleFormFactor = ::picongpu::plugins::radiation::radFormFactor_Gauss_spherical;
+
+            ///////////////////////////////////////////////////////////
+
+            namespace parameters
+            {
+                // number of observation directions
+                constexpr unsigned int nPhi = 128;
+                constexpr unsigned int nTheta = 128;
+                constexpr unsigned int nObserver = nPhi * nTheta;
+
+                // theta goes from 0 to pi
+                constexpr float_64 thetaMin = 0.0;
+                constexpr float_64 thetaMax = picongpu::PI;
+
+                // phi goes from 0 to 2*pi
+                constexpr float_64 phiMin = 0.0;
+                constexpr float_64 phiMax = 2 * picongpu::PI;
+
+                namespace SI
+                {
+                    // z position of the foil to calculate transition radiation at
+                    // leave at 0 for no virtual particle propagation
+                    constexpr float_64 foilPosition = 0.0;
+                } // namespace SI
+
+            } // end namespace parameters
+
+
+            //! example of a filter for the relativistic Lorentz factor gamma
+            struct GammaFilterFunctor
+            {
+                //! Gamma value above which the radiation is calculated
+                static constexpr float_X filterGamma = 5.0;
+
+                template<typename T_Particle>
+                HDINLINE void operator()(T_Particle& particle)
+                {
+                    if(picongpu::gamma<float_X>(
+                           particle[picongpu::momentum_],
+                           picongpu::traits::attribute::getMass(particle[picongpu::weighting_], particle))
+                       >= filterGamma)
+                        particle[picongpu::transitionRadiationMask_] = true;
+                }
+            };
+
+            /** filter to (de)select particles for the radiation calculation
+             *
+             * to activate the filter:
+             *   - goto file `speciesDefinition.param`
+             *   - add the attribute `transitionRadiationMask` to the particle species
+             */
+            using GammaFilter = picongpu::particles::manipulators::generic::Free<GammaFilterFunctor>;
+
+            /** Compute observation angles
+             *
+             * This function is used in the transition radiation plugin kernel to compute
+             * the observation directions given as a unit vector pointing
+             * towards a 'virtual' detector
+             *
+             * This default setup is an example of a 2D detector array. It computes
+             * observation directions for a 2D virtual detector field
+             * with its center pointing toward the +y direction (for theta=0, phi=0)
+             * with observation angles ranging from
+             * theta = [parameters::thetaMin : parameters::thetaMax]
+             * phi   = [parameters::phiMin   : parameters::phiMax  ]
+             * Every observation_id_extern index moves the phi angle from its
+             * start value toward its end value until the phi index
+             * reaches parameters::nPhi. After that the theta angle moves further from its
+             * start value towards its end value while phi is reset to its start
+             * value.
+             *
+             * The unit vector pointing towards the observing virtual detector
+             * can be described using theta and phi by:
+             * x_value = sin(theta) * cos(phi)
+             * y_value = cos(theta)
+             * z_value = sin(theta) * sin(phi)
+             * These are the standard spherical coordinates with +y as the polar axis.
+             *
+             * The example setup describes a detector array of
+             * 128X128 detectors ranging from 0 to pi for the polar angle
+             * theta (measured from +y) and from 0 to 2 pi for the azimuthal angle phi.
+             *
+             * @param    observation_id_extern
+             *           int index that identifies each block on the GPU
+             *           to compute the observation direction
+             *
+             * @return   unit vector pointing in observation direction
+             *           type: float3_X
+             */
+            HDINLINE float3_X observationDirection(const int observation_id_extern)
+            {
+                /* generate two indices from single block index */
+                /** split distance of given index (split_distance is parameters::nPhi)
+                 * pseudo-code:
+                 * index_a = index / split_distance
+                 * index_b = index % split_distance
+                 */
+                /** get index for computing angle theta: */
+                const int indexTheta = observation_id_extern / parameters::nPhi;
+
+                /** step width angle theta, set it to 0 if nTheta = 1 */
+                const picongpu::float_64 deltaTheta = (parameters::nTheta > 1)
+                    ? (parameters::thetaMax - parameters::thetaMin) / (parameters::nTheta - 1.0)
+                    : 0.0;
+
+                /** compute observation angle theta, starting at thetaMin */
+                const picongpu::float_64 theta = indexTheta * deltaTheta + parameters::thetaMin;
+
+                /** get index for computing angle phi: */
+                const int indexPhi = observation_id_extern % parameters::nPhi;
+
+                /** step width angle phi, set it to 0 if nPhi = 1 */
+                const picongpu::float_64 deltaPhi = (parameters::nPhi > 1)
+                    ? (parameters::phiMax - parameters::phiMin) / (parameters::nPhi - 1.0)
+                    : 0.0;
+
+                /** compute observation angle phi, starting at phiMin (offset is added, same as for theta) */
+                const picongpu::float_64 phi = indexPhi * deltaPhi + parameters::phiMin;
+
+                /* sine and cosine of both angles, evaluated in single precision */
+                picongpu::float_32 sinPhi;
+                picongpu::float_32 cosPhi;
+                picongpu::float_32 sinTheta;
+                picongpu::float_32 cosTheta;
+                pmacc::math::sincos(precisionCast<picongpu::float_32>(phi), sinPhi, cosPhi);
+                pmacc::math::sincos(precisionCast<picongpu::float_32>(theta), sinTheta, cosTheta);
+                /** compute observation unit vector */
+                return float3_X(sinTheta * cosPhi, cosTheta, sinTheta * sinPhi);
+            }
+
+        } // namespace transitionRadiation
+    } // namespace plugins
 } // namespace picongpu
diff --git a/share/picongpu/examples/WarmCopper/cmakeFlags b/share/picongpu/examples/WarmCopper/cmakeFlags
index c8f3e2f1d0..e9d333d096 100755
--- a/share/picongpu/examples/WarmCopper/cmakeFlags
+++ b/share/picongpu/examples/WarmCopper/cmakeFlags
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 #
-# Copyright 2013-2020 Axel Huebl, Rene Widera
+# Copyright 2013-2021 Axel Huebl, Rene Widera
 #
 # This file is part of PIConGPU.
 #
diff --git a/share/picongpu/examples/WarmCopper/etc/picongpu/1.cfg b/share/picongpu/examples/WarmCopper/etc/picongpu/1.cfg
index 0432cf0a3f..901e2042f6 100644
--- a/share/picongpu/examples/WarmCopper/etc/picongpu/1.cfg
+++ b/share/picongpu/examples/WarmCopper/etc/picongpu/1.cfg
@@ -1,4 +1,4 @@
-# Copyright 2013-2020 Axel Huebl
+# Copyright 2013-2021 Axel Huebl, Franz Poeschel
 #
 # This file is part of PIConGPU.
 #
@@ -52,10 +52,15 @@ TBG_ehot_histogram="--ehot_energyHistogram.period 100 --ehot_energyHistogram.fil
                     --ehot_energyHistogram.minEnergy 0 --ehot_energyHistogram.maxEnergy 250"
 
 # file I/O
-TBG_hdf5="--hdf5.period 100 --hdf5.file simData"
-
-TBG_plugins="!TBG_eth_histogram !TBG_ehot_histogram \
-             !TBG_hdf5"
+TBG_openPMD="--openPMD.period 100 \
+             --openPMD.file simData \
+             --openPMD.ext bp \
+             --checkpoint.period 100 \
+             --checkpoint.backend openPMD"
+
+TBG_plugins="!TBG_eth_histogram  \
+             !TBG_ehot_histogram \
+             !TBG_openPMD"
 
 
 #################################
diff --git a/share/picongpu/examples/WarmCopper/include/picongpu/param/density.param b/share/picongpu/examples/WarmCopper/include/picongpu/param/density.param
index 77e5e6b040..190b88ce56 100644
--- a/share/picongpu/examples/WarmCopper/include/picongpu/param/density.param
+++ b/share/picongpu/examples/WarmCopper/include/picongpu/param/density.param
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Heiko Burau, Rene Widera, Felix Schmitt,
+/* Copyright 2013-2021 Axel Huebl, Heiko Burau, Rene Widera, Felix Schmitt,
  *                     Richard Pausch
  *
  * This file is part of PIConGPU.
@@ -27,22 +27,22 @@
 
 namespace picongpu
 {
-namespace SI
-{
-    /** Base density in particles per m^3 in the density profiles.
-     *
-     * This is often taken as reference maximum density in normalized profiles.
-     * Individual particle species can define a `densityRatio` flag relative
-     * to this value.
-     *
-     * unit: ELEMENTS/m^3
-     */
-    constexpr float_64 BASE_DENSITY_SI = 8.49e28; // copper ion density
-}
+    namespace SI
+    {
+        /** Base density in particles per m^3 in the density profiles.
+         *
+         * This is often taken as reference maximum density in normalized profiles.
+         * Individual particle species can define a `densityRatio` flag relative
+         * to this value.
+         *
+         * unit: ELEMENTS/m^3
+         */
+        constexpr float_64 BASE_DENSITY_SI = 8.49e28; // copper ion density
+    } // namespace SI
 
-namespace densityProfiles
-{
-    /* definition of homogenous profile */
-    using Homogenous = HomogenousImpl;
-}
-}
+    namespace densityProfiles
+    {
+        /* definition of homogenous profile */
+        using Homogenous = HomogenousImpl;
+    } // namespace densityProfiles
+} // namespace picongpu
diff --git a/share/picongpu/examples/WarmCopper/include/picongpu/param/grid.param b/share/picongpu/examples/WarmCopper/include/picongpu/param/grid.param
index 279dcb4912..b02e025651 100644
--- a/share/picongpu/examples/WarmCopper/include/picongpu/param/grid.param
+++ b/share/picongpu/examples/WarmCopper/include/picongpu/param/grid.param
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Rene Widera, Benjamin Worpitz
+/* Copyright 2013-2021 Axel Huebl, Rene Widera, Benjamin Worpitz
  *
  * This file is part of PIConGPU.
  *
@@ -22,7 +22,6 @@
 
 namespace picongpu
 {
-
     namespace SI
     {
         /** Duration of one timestep
@@ -63,21 +62,21 @@ namespace picongpu
          * in fields with perfect symmetry in Z.
          */
 
-    } //namespace SI
+    } // namespace SI
 
     //! Defines the size of the absorbing zone (in cells)
     constexpr uint32_t ABSORBER_CELLS[3][2] = {
-        {32, 32},  /*x direction [negative,positive]*/
-        {32, 32},  /*y direction [negative,positive]*/
-        {32, 32}   /*z direction [negative,positive]*/
-    }; //unit: number of cells
+        {32, 32}, /*x direction [negative,positive]*/
+        {32, 32}, /*y direction [negative,positive]*/
+        {32, 32} /*z direction [negative,positive]*/
+    }; // unit: number of cells
 
     //! Define the strength of the absorber for any direction
     constexpr float_X ABSORBER_STRENGTH[3][2] = {
         {1.0e-3, 1.0e-3}, /*x direction [negative,positive]*/
         {1.0e-3, 1.0e-3}, /*y direction [negative,positive]*/
-        {1.0e-3, 1.0e-3}  /*z direction [negative,positive]*/
-    }; //unit: none
+        {1.0e-3, 1.0e-3} /*z direction [negative,positive]*/
+    }; // unit: none
 
     /** When to move the co-moving window.
      *  An initial pseudo particle, flying with the speed of light,
@@ -96,4 +95,4 @@ namespace picongpu
      */
     constexpr float_64 movePoint = 0.90;
 
-}
+} // namespace picongpu
diff --git a/share/picongpu/examples/WarmCopper/include/picongpu/param/particle.param b/share/picongpu/examples/WarmCopper/include/picongpu/param/particle.param
index f6495f508a..a3659cb23c 100644
--- a/share/picongpu/examples/WarmCopper/include/picongpu/param/particle.param
+++ b/share/picongpu/examples/WarmCopper/include/picongpu/param/particle.param
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Rene Widera, Benjamin Worpitz,
+/* Copyright 2013-2021 Axel Huebl, Rene Widera, Benjamin Worpitz,
  *                     Richard Pausch
  *
  * This file is part of PIConGPU.
@@ -29,81 +29,72 @@
 
 namespace picongpu
 {
-
-namespace particles
-{
-
-    /** a particle with a weighting below MIN_WEIGHTING will not
-     *      be created / will be deleted
-     *  unit: none
-     */
-    constexpr float_X MIN_WEIGHTING = 10.0;
-
-namespace manipulators
-{
-    // define a drift in X equal to 200 keV for electrons
-    CONST_VECTOR(float_X, 3, DriftParam_direction, 1.0, 0.0, 0.0);
-    struct Drift200keVParam
+    namespace particles
     {
-        static constexpr float_64 gamma = 1.39139;
-        const DriftParam_direction_t direction;
-    };
-    using Assign200keVDrift = unary::Drift< Drift200keVParam, nvidia::functors::Assign >;
-
-    struct TemperatureParam
-    {
-        /** Initial temperature
-         *  unit: keV
+        /** a particle with a weighting below MIN_WEIGHTING will not
+         *      be created / will be deleted
+         *  unit: none
          */
-        static constexpr float_64 temperature = 0.1;
-    };
-    using AddTemperature = unary::Temperature< TemperatureParam >;
+        constexpr float_X MIN_WEIGHTING = 10.0;
 
-    struct OnceIonizedImpl
-    {
-        template< typename T_Particle >
-        DINLINE void operator()( T_Particle& particle )
+        namespace manipulators
         {
-            constexpr float_X ion1plus =
-                GetAtomicNumbers< T_Particle >::type::numberOfProtons -
-                1._X;
-
-            // set (Z - 1) bound electrons
-            particle[boundElectrons_] = ion1plus;
-        }
-    };
-    // definition of SetDrift start
-    using OnceIonized = generic::Free< OnceIonizedImpl >;
-
-} // namespace manipulators
-
-
-namespace startPosition
-{
-
-    struct QuietParam2ppc
-    {
-        /** Count of particles per cell per direction at initial state
-         *  unit: none
+            // define a drift in X equal to 200 keV for electrons
+            CONST_VECTOR(float_X, 3, DriftParam_direction, 1.0, 0.0, 0.0);
+            struct Drift200keVParam
+            {
+                static constexpr float_64 gamma = 1.39139;
+                const DriftParam_direction_t direction;
+            };
+            using Assign200keVDrift = unary::Drift<Drift200keVParam, nvidia::functors::Assign>;
+
+            struct TemperatureParam
+            {
+                /** Initial temperature
+                 *  unit: keV
+                 */
+                static constexpr float_64 temperature = 0.1;
+            };
+            using AddTemperature = unary::Temperature<TemperatureParam>;
+
+            struct OnceIonizedImpl
+            {
+                template<typename T_Particle>
+                DINLINE void operator()(T_Particle& particle)
+                {
+                    constexpr float_X ion1plus = GetAtomicNumbers<T_Particle>::type::numberOfProtons - 1._X;
+
+                    // set (Z - 1) bound electrons
+                    particle[boundElectrons_] = ion1plus;
+                }
+            };
+            // manipulator that sets each ion to charge state 1+ (Z - 1 bound electrons)
+            using OnceIonized = generic::Free<OnceIonizedImpl>;
+
+        } // namespace manipulators
+
+
+        namespace startPosition
+        {
+            struct QuietParam2ppc
+            {
+                /** Count of particles per cell per direction at initial state
+                 *  unit: none
+                 */
+                using numParticlesPerDimension = typename mCT::shrinkTo<mCT::Int<1, 2, 1>, simDim>::type;
+            };
+
+            // definition of quiet particle start
+            using Quiet2ppc = QuietImpl<QuietParam2ppc>;
+
+        } // namespace startPosition
+
+        /** During unit normalization, we assume this is a typical
+         *  number of particles per cell for normalization of weighted
+         *  particle attributes.
          */
-        using numParticlesPerDimension = typename mCT::shrinkTo<
-            mCT::Int< 1, 2, 1 >,
-            simDim
-        >::type;
-    };
-
-    // definition of quiet particle start
-    using Quiet2ppc = QuietImpl< QuietParam2ppc >;
-
-} // namespace startPosition
-
-    /** During unit normalization, we assume this is a typical
-     *  number of particles per cell for normalization of weighted
-     *  particle attributes.
-     */
-    constexpr uint32_t TYPICAL_PARTICLES_PER_CELL = mCT::volume<
-        startPosition::QuietParam2ppc::numParticlesPerDimension
-    >::type::value;
+        constexpr uint32_t TYPICAL_PARTICLES_PER_CELL
+            = mCT::volume<startPosition::QuietParam2ppc::numParticlesPerDimension>::type::value;
 
-} // namespace particles
+    } // namespace particles
 } // namespace picongpu
diff --git a/share/picongpu/examples/WarmCopper/include/picongpu/param/speciesDefinition.param b/share/picongpu/examples/WarmCopper/include/picongpu/param/speciesDefinition.param
index f15be239c2..e57e0791bb 100644
--- a/share/picongpu/examples/WarmCopper/include/picongpu/param/speciesDefinition.param
+++ b/share/picongpu/examples/WarmCopper/include/picongpu/param/speciesDefinition.param
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Rene Widera, Benjamin Worpitz, Heiko Burau
+/* Copyright 2013-2021 Rene Widera, Benjamin Worpitz, Heiko Burau
  *
  * This file is part of PIConGPU.
  *
@@ -31,16 +31,10 @@
 
 namespace picongpu
 {
+    /*########################### define particle attributes #####################*/
 
-/*########################### define particle attributes #####################*/
-
-/** describe attributes of a particle*/
-using DefaultParticleAttributes = MakeSeq_t<
-    position< position_pic >,
-    momentum,
-    weighting,
-    particleId
->;
+    /** describe attributes of a particle*/
+    using DefaultParticleAttributes = MakeSeq_t<position<position_pic>, momentum, weighting, particleId>;
 
 /** The default example keeps particles in place and does not create a current
  */
@@ -51,134 +45,101 @@ using DefaultParticleAttributes = MakeSeq_t<
 #    define PARAM_ENABLE_CURRENT 0
 #endif
 
-/*########################### end particle attributes ########################*/
+    /*########################### end particle attributes ########################*/
 
-/*########################### define species #################################*/
+    /*########################### define species #################################*/
 
-/*--------------------------- photons -------------------------------------------*/
+    /*--------------------------- photons -------------------------------------------*/
 
-value_identifier( float_X, MassRatioPhotons, 0.0 );
-value_identifier( float_X, ChargeRatioPhotons, 0.0 );
+    value_identifier(float_X, MassRatioPhotons, 0.0);
+    value_identifier(float_X, ChargeRatioPhotons, 0.0);
 
-using ParticleFlagsPhotons = MakeSeq_t<
-#if( PARAM_ENABLE_PUSHER == 1 )
-    particlePusher< particles::pusher::Photon >,
+    using ParticleFlagsPhotons = MakeSeq_t<
+#if(PARAM_ENABLE_PUSHER == 1)
+        particlePusher<particles::pusher::Photon>,
 #endif
-    shape< UsedParticleShape >,
-    interpolation< UsedField2Particle >,
-    massRatio< MassRatioPhotons >,
-    chargeRatio< ChargeRatioPhotons >
->;
-
-/* define species photons */
-using Photons = Particles<
-    PMACC_CSTRING( "ph" ),
-    ParticleFlagsPhotons,
-    DefaultParticleAttributes
->;
-
-/*--------------------------- electrons --------------------------------------*/
-/* thermal bulk electrons: 10, 100, 1000 eV
- *   and
- * non-thermal "hot"/prompt electrons: 200 keV
- */
+        shape<UsedParticleShape>,
+        interpolation<UsedField2Particle>,
+        massRatio<MassRatioPhotons>,
+        chargeRatio<ChargeRatioPhotons>>;
+
+    /* define species photons */
+    using Photons = Particles<PMACC_CSTRING("ph"), ParticleFlagsPhotons, DefaultParticleAttributes>;
+
+    /*--------------------------- electrons --------------------------------------*/
+    /* thermal bulk electrons: 10, 100, 1000 eV
+     *   and
+     * non-thermal "hot"/prompt electrons: 200 keV
+     */
+
+    /* ratio relative to BASE_CHARGE and BASE_MASS */
+    value_identifier(float_X, MassRatioElectrons, 1.0);
+    value_identifier(float_X, ChargeRatioElectrons, 1.0);
+
+    /* ratio relative to BASE_DENSITY
+     * thermal "bulk": 1x ionized n_Cu
+     * non-thermal "hot"/prompt: 0.1% ne_bulk = 0.001 * n_Cu ~ 1e20 / cm3
+     */
+    value_identifier(float_X, DensityRatioBulkElectrons, 0.999);
+    value_identifier(float_X, DensityRatioPromptElectrons, 0.001);
+
+    using ParticleFlagsElectrons = MakeSeq_t<
+#if(PARAM_ENABLE_PUSHER == 1)
+        particlePusher<UsedParticlePusher>,
+#endif
+        shape<UsedParticleShape>,
+        interpolation<UsedField2Particle>,
+#if(PARAM_ENABLE_CURRENT == 1)
+        current<UsedParticleCurrentSolver>,
+#endif
+        massRatio<MassRatioElectrons>,
+        chargeRatio<ChargeRatioElectrons>>;
 
-/* ratio relative to BASE_CHARGE and BASE_MASS */
-value_identifier( float_X, MassRatioElectrons, 1.0 );
-value_identifier( float_X, ChargeRatioElectrons, 1.0 );
+    /* thermal bulk electrons */
+    using BulkElectrons = Particles<
+        PMACC_CSTRING("eth"),
+        MakeSeq_t<ParticleFlagsElectrons, densityRatio<DensityRatioBulkElectrons>>,
+        DefaultParticleAttributes>;
 
-/* ratio relative to BASE_DENSITY
- * thermal "bulk": 1x ionized n_Cu
- * non-thermal "hot"/prompt: 0.1% ne_bulk = 0.001 * n_Cu ~ 1e20 / cm3
- */
-value_identifier( float_X, DensityRatioBulkElectrons, 0.999 );
-value_identifier( float_X, DensityRatioPromptElectrons, 0.001 );
+    /* non-thermal "hot"/prompt electrons */
+    using PromptElectrons = Particles<
+        PMACC_CSTRING("ehot"),
+        MakeSeq_t<ParticleFlagsElectrons, densityRatio<DensityRatioPromptElectrons>>,
+        DefaultParticleAttributes>;
 
-using ParticleFlagsElectrons = MakeSeq_t<
-#if( PARAM_ENABLE_PUSHER == 1 )
-    particlePusher< UsedParticlePusher >,
-#endif
-    shape< UsedParticleShape >,
-    interpolation< UsedField2Particle >,
-#if( PARAM_ENABLE_CURRENT == 1 )
-    current< UsedParticleCurrentSolver >,
-#endif
-    massRatio< MassRatioElectrons >,
-    chargeRatio< ChargeRatioElectrons >
->;
-
-/* thermal bulk electrons */
-using BulkElectrons = Particles<
-    PMACC_CSTRING( "eth" ),
-    MakeSeq_t<
-        ParticleFlagsElectrons,
-        densityRatio< DensityRatioBulkElectrons >
-    >,
-    DefaultParticleAttributes
->;
-
-/* non-thermal "hot"/prompt electrons */
-using PromptElectrons = Particles<
-    PMACC_CSTRING( "ehot" ),
-    MakeSeq_t<
-        ParticleFlagsElectrons,
-        densityRatio< DensityRatioPromptElectrons >
-    >,
-    DefaultParticleAttributes
->;
-
-/*--------------------------- ions -------------------------------------------*/
-
-/* ratio relative to BASE_CHARGE and BASE_MASS */
-value_identifier( float_X, MassRatioCopper, 115840. );
-value_identifier( float_X, ChargeRatioCopper, -29.0 );
-
-/* ratio relative to BASE_DENSITY */
-value_identifier( float_X, DensityRatioCopper, 1.0 );
-
-using ParticleFlagsCopper = MakeSeq_t<
-#if( PARAM_ENABLE_PUSHER == 1 )
-    particlePusher< UsedParticlePusher >,
+    /*--------------------------- ions -------------------------------------------*/
+
+    /* ratio relative to BASE_CHARGE and BASE_MASS */
+    value_identifier(float_X, MassRatioCopper, 115840.);
+    value_identifier(float_X, ChargeRatioCopper, -29.0);
+
+    /* ratio relative to BASE_DENSITY */
+    value_identifier(float_X, DensityRatioCopper, 1.0);
+
+    using ParticleFlagsCopper = MakeSeq_t<
+#if(PARAM_ENABLE_PUSHER == 1)
+        particlePusher<UsedParticlePusher>,
 #endif
-    shape< UsedParticleShape >,
-    interpolation< UsedField2Particle >,
-#if( PARAM_ENABLE_CURRENT == 1 )
-    current< UsedParticleCurrentSolver >,
+        shape<UsedParticleShape>,
+        interpolation<UsedField2Particle>,
+#if(PARAM_ENABLE_CURRENT == 1)
+        current<UsedParticleCurrentSolver>,
 #endif
-    massRatio< MassRatioCopper >,
-    chargeRatio< ChargeRatioCopper >,
-    densityRatio< DensityRatioCopper >,
-    atomicNumbers< ionization::atomicNumbers::Copper_t >,
-    // note: this method is not yet fully implemented
-    populationKinetics<
-        particles::flylite::NonLTE<
-            MakeSeq_t<
-                BulkElectrons,
-                PromptElectrons
-            >,
-            MakeSeq_t< Photons >
-        >
-    >
->;
-
-/* define species ions */
-using CopperIons = Particles<
-    PMACC_CSTRING( "Cu" ),
-    ParticleFlagsCopper,
-    MakeSeq_t<
-        DefaultParticleAttributes,
-        boundElectrons,
-        superconfig
-    >
->;
-
-/*########################### end species ####################################*/
-
-using VectorAllSpecies = MakeSeq_t<
-    Photons,
-    BulkElectrons,
-    PromptElectrons,
-    CopperIons
->;
-
-}
+        massRatio<MassRatioCopper>,
+        chargeRatio<ChargeRatioCopper>,
+        densityRatio<DensityRatioCopper>,
+        atomicNumbers<ionization::atomicNumbers::Copper_t>,
+        // note: this method is not yet fully implemented
+        populationKinetics<particles::flylite::NonLTE<MakeSeq_t<BulkElectrons, PromptElectrons>, MakeSeq_t<Photons>>>>;
+
+    /* define species ions */
+    using CopperIons = Particles<
+        PMACC_CSTRING("Cu"),
+        ParticleFlagsCopper,
+        MakeSeq_t<DefaultParticleAttributes, boundElectrons, superconfig>>;
+
+    /*########################### end species ####################################*/
+
+    using VectorAllSpecies = MakeSeq_t<Photons, BulkElectrons, PromptElectrons, CopperIons>;
+
+} // namespace picongpu
diff --git a/share/picongpu/examples/WarmCopper/include/picongpu/param/speciesInitialization.param b/share/picongpu/examples/WarmCopper/include/picongpu/param/speciesInitialization.param
index 02312f6426..ea6109fc67 100644
--- a/share/picongpu/examples/WarmCopper/include/picongpu/param/speciesInitialization.param
+++ b/share/picongpu/examples/WarmCopper/include/picongpu/param/speciesInitialization.param
@@ -1,4 +1,4 @@
-/* Copyright 2015-2020 Rene Widera, Axel Huebl
+/* Copyright 2015-2021 Rene Widera, Axel Huebl
  *
  * This file is part of PIConGPU.
  *
@@ -33,48 +33,25 @@
 
 namespace picongpu
 {
-namespace particles
-{
-
-    /** InitPipeline defines in which order species are initialized
-     *
-     * the functors are called in order (from first to last functor)
-     */
-    using InitPipeline = bmpl::vector<
-        // Generate Densities
-        CreateDensity<
-            densityProfiles::Homogenous,
-            startPosition::Quiet2ppc,
-            CopperIons
-        >,
-        ManipulateDerive<
-            manipulators::binary::DensityWeighting,
-            CopperIons,
-            BulkElectrons
-        >,
-        ManipulateDerive<
-            manipulators::binary::DensityWeighting,
-            CopperIons,
-            PromptElectrons
-        >,
-        // Set the Cu ions to Cu_1+
-        Manipulate<
-            manipulators::OnceIonized,
-            CopperIons
-        >,
-        // Set initial temperature of bulk electrons
-        Manipulate<
-            manipulators::AddTemperature,
-            BulkElectrons
-        >,
-        /* Set initial drift (directed in this case) of delta-distributed 200 keV
-         * prompt electrons
+    namespace particles
+    {
+        /** InitPipeline defines in which order species are initialized
+         *
+         * the functors are called in order (from first to last functor)
          */
-        Manipulate<
-            manipulators::Assign200keVDrift,
-            PromptElectrons
-        >
-    >;
+        using InitPipeline = bmpl::vector<
+            // Generate Densities
+            CreateDensity<densityProfiles::Homogenous, startPosition::Quiet2ppc, CopperIons>,
+            ManipulateDerive<manipulators::binary::DensityWeighting, CopperIons, BulkElectrons>,
+            ManipulateDerive<manipulators::binary::DensityWeighting, CopperIons, PromptElectrons>,
+            // Set the Cu ions to Cu_1+
+            Manipulate<manipulators::OnceIonized, CopperIons>,
+            // Set initial temperature of bulk electrons
+            Manipulate<manipulators::AddTemperature, BulkElectrons>,
+            /* Set initial drift (directed in this case) of delta-distributed 200 keV
+             * prompt electrons
+             */
+            Manipulate<manipulators::Assign200keVDrift, PromptElectrons>>;
 
-} // namespace particles
+    } // namespace particles
 } // namespace picongpu
diff --git a/share/picongpu/examples/WeibelTransverse/etc/picongpu/4.cfg b/share/picongpu/examples/WeibelTransverse/etc/picongpu/4.cfg
index 2cafc7a7f9..3b48e09712 100644
--- a/share/picongpu/examples/WeibelTransverse/etc/picongpu/4.cfg
+++ b/share/picongpu/examples/WeibelTransverse/etc/picongpu/4.cfg
@@ -1,4 +1,4 @@
-# Copyright 2013-2020 Rene Widera, Axel Huebl
+# Copyright 2013-2021 Rene Widera, Axel Huebl
 #
 # This file is part of PIConGPU.
 #
diff --git a/share/picongpu/examples/WeibelTransverse/include/picongpu/param/density.param b/share/picongpu/examples/WeibelTransverse/include/picongpu/param/density.param
index 9c45927d94..5ab7ed52c3 100644
--- a/share/picongpu/examples/WeibelTransverse/include/picongpu/param/density.param
+++ b/share/picongpu/examples/WeibelTransverse/include/picongpu/param/density.param
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Heiko Burau, Rene Widera, Felix Schmitt,
+/* Copyright 2013-2021 Axel Huebl, Heiko Burau, Rene Widera, Felix Schmitt,
  *                     Richard Pausch
  *
  * This file is part of PIConGPU.
@@ -25,22 +25,22 @@
 
 namespace picongpu
 {
-namespace SI
-{
-    /** Base density in particles per m^3 in the density profiles.
-     *
-     * This is often taken as reference maximum density in normalized profiles.
-     * Individual particle species can define a `densityRatio` flag relative
-     * to this value.
-     *
-     * unit: ELEMENTS/m^3
-     */
-    constexpr float_64 BASE_DENSITY_SI = 1.e25;
-}
+    namespace SI
+    {
+        /** Base density in particles per m^3 in the density profiles.
+         *
+         * This is often taken as reference maximum density in normalized profiles.
+         * Individual particle species can define a `densityRatio` flag relative
+         * to this value.
+         *
+         * unit: ELEMENTS/m^3
+         */
+        constexpr float_64 BASE_DENSITY_SI = 1.e25;
+    } // namespace SI
 
-namespace densityProfiles
-{
-    /* definition of homogenous density profile */
-    using Homogenous = HomogenousImpl;
-}
-}
+    namespace densityProfiles
+    {
+        /* definition of homogenous density profile */
+        using Homogenous = HomogenousImpl;
+    } // namespace densityProfiles
+} // namespace picongpu
diff --git a/share/picongpu/examples/WeibelTransverse/include/picongpu/param/grid.param b/share/picongpu/examples/WeibelTransverse/include/picongpu/param/grid.param
index 7cbc0a468d..dbd829621c 100644
--- a/share/picongpu/examples/WeibelTransverse/include/picongpu/param/grid.param
+++ b/share/picongpu/examples/WeibelTransverse/include/picongpu/param/grid.param
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Rene Widera, Benjamin Worpitz
+/* Copyright 2013-2021 Axel Huebl, Rene Widera, Benjamin Worpitz
  *
  * This file is part of PIConGPU.
  *
@@ -18,12 +18,10 @@
  */
 
 
-
 #pragma once
 
 namespace picongpu
 {
-
     namespace SI
     {
         /** Duration of one timestep
@@ -52,21 +50,21 @@ namespace picongpu
          * behave like the interaction of infinite "wire particles"
          * in fields with perfect symmetry in Z.
          */
-    } //namespace SI
+    } // namespace SI
 
-        //! Defines the size of the absorbing zone (in cells)
+    //! Defines the size of the absorbing zone (in cells)
     constexpr uint32_t ABSORBER_CELLS[3][2] = {
-        {0, 0},  /*x direction [negative,positive]*/
-        {0, 0},  /*y direction [negative,positive]*/
-        {0, 0}   /*z direction [negative,positive]*/
-    }; //unit: number of cells
+        {0, 0}, /*x direction [negative,positive]*/
+        {0, 0}, /*y direction [negative,positive]*/
+        {0, 0} /*z direction [negative,positive]*/
+    }; // unit: number of cells
 
     //! Define the strength of the absorber for any direction
     constexpr float_X ABSORBER_STRENGTH[3][2] = {
         {1.0e-3, 1.0e-3}, /*x direction [negative,positive]*/
         {1.0e-3, 1.0e-3}, /*y direction [negative,positive]*/
-        {1.0e-3, 1.0e-3}  /*z direction [negative,positive]*/
-    }; //unit: none
+        {1.0e-3, 1.0e-3} /*z direction [negative,positive]*/
+    }; // unit: none
 
     /** When to move the co-moving window.
      *  An initial pseudo particle, flying with the speed of light,
@@ -85,7 +83,4 @@ namespace picongpu
      */
     constexpr float_64 movePoint = 0.90;
 
-}
-
-
-
+} // namespace picongpu
diff --git a/share/picongpu/examples/WeibelTransverse/include/picongpu/param/memory.param b/share/picongpu/examples/WeibelTransverse/include/picongpu/param/memory.param
index ec6c2d79ab..82b4a6cea8 100644
--- a/share/picongpu/examples/WeibelTransverse/include/picongpu/param/memory.param
+++ b/share/picongpu/examples/WeibelTransverse/include/picongpu/param/memory.param
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Rene Widera, Benjamin Worpitz
+/* Copyright 2013-2021 Axel Huebl, Rene Widera, Benjamin Worpitz
  *
  * This file is part of PIConGPU.
  *
@@ -31,77 +31,87 @@
 #include <pmacc/math/Vector.hpp>
 #include <pmacc/mappings/kernel/MappingDescription.hpp>
 
+#include <array>
+
 namespace picongpu
 {
+    /* We have to hold back 400MiB for gpu-internal operations:
+     *   - random number generator
+     *   - reduces
+     *   - ...
+     */
+    constexpr size_t reservedGpuMemorySize = 400 * 1024 * 1024;
 
-/* We have to hold back 350MiB for gpu-internal operations:
- *   - random number generator
- *   - reduces
- *   - ...
- */
-constexpr size_t reservedGpuMemorySize = 400 *1024*1024;
+    /* short namespace alias */
+    namespace mCT = pmacc::math::CT;
+    /** size of a superCell
+     *
+     * volume of a superCell must be <= 1024
+     */
+    using SuperCellSize = typename mCT::shrinkTo<mCT::Int<8, 8, 4>, simDim>::type;
 
-/* short namespace*/
-namespace mCT = pmacc::math::CT;
-/** size of a superCell
- *
- * volume of a superCell must be <= 1024
- */
-using SuperCellSize = typename mCT::shrinkTo<
-    mCT::Int< 8, 8, 4 >,
-    simDim
->::type;
+    /** define the object for mapping superCells to cells*/
+    using MappingDesc = MappingDescription<simDim, SuperCellSize>;
 
-/** define the object for mapping superCells to cells*/
-using MappingDesc = MappingDescription< simDim, SuperCellSize >;
+    /** define the size of the core, border and guard area
+     *
+     * PIConGPU uses spatial domain-decomposition for parallelization
+     * over multiple devices with non-shared memory architecture.
+     * The global spatial domain is organized per device in three
+     * sections: the GUARD area contains copies of neighboring
+     * devices (also known as "halo"/"ghost").
+     * The BORDER area is the outermost layer of cells of a device,
+     * equal to what neighboring devices see as GUARD area.
+     * The CORE area is the innermost area of a device. In union with
+     * the BORDER area it defines the "active" spatial domain on a device.
+     *
+     * GuardSize is defined in units of SuperCellSize per dimension.
+     */
+    using GuardSize = typename mCT::shrinkTo<mCT::Int<1, 1, 1>, simDim>::type;
 
-/** define the size of the core, border and guard area
- *
- * PIConGPU uses spatial domain-decomposition for parallelization
- * over multiple devices with non-shared memory architecture.
- * The global spatial domain is organized per device in three
- * sections: the GUARD area contains copies of neighboring
- * devices (also known as "halo"/"ghost").
- * The BORDER area is the outermost layer of cells of a device,
- * equally to what neighboring devices see as GUARD area.
- * The CORE area is the innermost area of a device. In union with
- * the BORDER area it defines the "active" spatial domain on a device.
- *
- * GuardSize is defined in units of SuperCellSize per dimension.
- */
-using GuardSize = typename mCT::shrinkTo<
-    mCT::Int< 1, 1, 1 >,
-    simDim
->::type;
+    /** bytes reserved for species exchange buffer
+     *
+     * This is the default configuration for species exchange buffer sizes.
+     * The default exchange buffer sizes can be changed per species by adding
+     * the alias exchangeMemCfg with similar members like in DefaultExchangeMemCfg
+     * to its flag list.
+     */
+    struct DefaultExchangeMemCfg
+    {
+        // memory used for a direction
+        static constexpr uint32_t BYTES_EXCHANGE_X = 4 * 1024 * 1024; // 4 MiB
+        static constexpr uint32_t BYTES_EXCHANGE_Y = 6 * 1024 * 1024; // 6 MiB
+        static constexpr uint32_t BYTES_EXCHANGE_Z = 64 * 1024 * 1024; // 64 MiB
+        static constexpr uint32_t BYTES_EDGES = 2 * 1024 * 1024; // 2 MiB
+        static constexpr uint32_t BYTES_CORNER = 512 * 1024; // 512 kiB
 
-/** bytes reserved for species exchange buffer
- *
- * This is the default configuration for species exchanges buffer sizes.
- * The default exchange buffer sizes can be changed per species by adding
- * the alias exchangeMemCfg with similar members like in DefaultExchangeMemCfg
- * to its flag list.
- */
-struct DefaultExchangeMemCfg
-{
-    // memory used for a direction
-    static constexpr uint32_t BYTES_EXCHANGE_X = 4 * 1024 * 1024; // 4 MiB
-    static constexpr uint32_t BYTES_EXCHANGE_Y = 6 * 1024 * 1024; // 6 MiB
-    static constexpr uint32_t BYTES_EXCHANGE_Z = 64 * 1024 * 1024; // 64 MiB
-    static constexpr uint32_t BYTES_EDGES = 2 * 1024 * 1024; // 2 MiB
-    static constexpr uint32_t BYTES_CORNER = 512 * 1024; // 512 kiB
-};
+        /** Reference local domain size
+         *
+         * The size of the local domain for which the exchange sizes `BYTES_*` are configured.
+         * The required size of each exchange will be calculated at runtime based on the local domain size and the
+         * reference size. The exchange size will be scaled only up and not down. Zero means that there is no reference
+         * domain size, exchanges will not be scaled.
+         */
+        using REF_LOCAL_DOM_SIZE = mCT::Int<0, 0, 0>;
+        /** Scaling rate per direction.
+         *
+         * 1.0 means it scales linearly with the ratio between the local domain size at runtime and the reference local
+         * domain size.
+         */
+        const std::array<float_X, 3> DIR_SCALING_FACTOR = {{0.0, 0.0, 0.0}};
+    };
 
-/** number of scalar fields that are reserved as temporary fields */
-constexpr uint32_t fieldTmpNumSlots = 1;
+    /** number of scalar fields that are reserved as temporary fields */
+    constexpr uint32_t fieldTmpNumSlots = 1;
 
-/** can `FieldTmp` gather neighbor information
- *
- * If `true` it is possible to call the method `asyncCommunicationGather()`
- * to copy data from the border of neighboring GPU into the local guard.
- * This is also known as building up a "ghost" or "halo" region in domain
- * decomposition and only necessary for specific algorithms that extend
- * the basic PIC cycle, e.g. with dependence on derived density or energy fields.
- */
-constexpr bool fieldTmpSupportGatherCommunication = true;
+    /** can `FieldTmp` gather neighbor information
+     *
+     * If `true` it is possible to call the method `asyncCommunicationGather()`
+     * to copy data from the border of neighboring GPU into the local guard.
+     * This is also known as building up a "ghost" or "halo" region in domain
+     * decomposition and only necessary for specific algorithms that extend
+     * the basic PIC cycle, e.g. with dependence on derived density or energy fields.
+     */
+    constexpr bool fieldTmpSupportGatherCommunication = true;
 
 } // namespace picongpu
diff --git a/share/picongpu/examples/WeibelTransverse/include/picongpu/param/particle.param b/share/picongpu/examples/WeibelTransverse/include/picongpu/param/particle.param
index 18eb6cd0b3..1675796115 100644
--- a/share/picongpu/examples/WeibelTransverse/include/picongpu/param/particle.param
+++ b/share/picongpu/examples/WeibelTransverse/include/picongpu/param/particle.param
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Rene Widera, Benjamin Worpitz,
+/* Copyright 2013-2021 Axel Huebl, Rene Widera, Benjamin Worpitz,
  *                     Richard Pausch
  *
  * This file is part of PIConGPU.
@@ -29,83 +29,75 @@
 
 namespace picongpu
 {
-
-namespace particles
-{
-
-    /** a particle with a weighting below MIN_WEIGHTING will not
-     *      be created / will be deleted
-     *  unit: none
-     */
-    constexpr float_X MIN_WEIGHTING = 10.0;
-
-namespace manipulators
-{
-
-    CONST_VECTOR( float_X, 3, DriftParamElectrons_direction, 0.0, 0.0, 1.0 );
-    struct DriftParamElectrons
+    namespace particles
     {
-        /** Initial particle drift velocity for electrons and ions
-         *  Examples:
-         *    - No drift is equal to 1.0
+        /** a particle with a weighting below MIN_WEIGHTING will not
+         *      be created / will be deleted
          *  unit: none
          */
-        static constexpr float_64 gamma = 1.021;
-        const DriftParamElectrons_direction_t direction;
-    };
-    using AssignZDriftElectrons = unary::Drift< DriftParamElectrons, nvidia::functors::Assign >;
-
-    CONST_VECTOR( float_X, 3, DriftParamIons_direction, 0.0, 0.0, -1.0 );
-    struct DriftParamIons
-    {
-        /** Initial particle drift velocity for electrons and ions
-         *  Examples:
-         *    - No drift is equal to 1.0
-         *  unit: none
-         */
-        static constexpr float_64 gamma = 1.021;
-        const DriftParamIons_direction_t direction;
-    };
-    // definition of SetDrift start
-    using AssignZDriftIons = unary::Drift< DriftParamIons, nvidia::functors::Assign >;
-
-    struct TemperatureParam
-    {
-        /** Initial temperature
-         *  unit: keV
-         */
-        static constexpr float_64 temperature = 0.005;
-    };
-    using AddTemperature = unary::Temperature< TemperatureParam >;
-
-} // namespace manipulators
-
-namespace startPosition
-{
-
-    struct QuietParam4ppc
-    {
-        /** Count of particles per cell per direction at initial state
-         *  unit: none
+        constexpr float_X MIN_WEIGHTING = 10.0;
+
+        namespace manipulators
+        {
+            CONST_VECTOR(float_X, 3, DriftParamElectrons_direction, 0.0, 0.0, 1.0);
+            struct DriftParamElectrons
+            {
+                /** Initial particle drift velocity for electrons and ions
+                 *  Examples:
+                 *    - No drift is equal to 1.0
+                 *  unit: none
+                 */
+                static constexpr float_64 gamma = 1.021;
+                const DriftParamElectrons_direction_t direction;
+            };
+            using AssignZDriftElectrons = unary::Drift<DriftParamElectrons, nvidia::functors::Assign>;
+
+            CONST_VECTOR(float_X, 3, DriftParamIons_direction, 0.0, 0.0, -1.0);
+            struct DriftParamIons
+            {
+                /** Initial particle drift velocity for electrons and ions
+                 *  Examples:
+                 *    - No drift is equal to 1.0
+                 *  unit: none
+                 */
+                static constexpr float_64 gamma = 1.021;
+                const DriftParamIons_direction_t direction;
+            };
+            // definition of SetDrift start
+            using AssignZDriftIons = unary::Drift<DriftParamIons, nvidia::functors::Assign>;
+
+            struct TemperatureParam
+            {
+                /** Initial temperature
+                 *  unit: keV
+                 */
+                static constexpr float_64 temperature = 0.005;
+            };
+            using AddTemperature = unary::Temperature<TemperatureParam>;
+
+        } // namespace manipulators
+
+        namespace startPosition
+        {
+            struct QuietParam4ppc
+            {
+                /** Count of particles per cell per direction at initial state
+                 *  unit: none
+                 */
+                using numParticlesPerDimension = mCT::shrinkTo<mCT::Int<2, 2, 1>, simDim>::type;
+            };
+
+            // definition of quiet particle start
+            using Quiet4ppc = QuietImpl<QuietParam4ppc>;
+
+        } // namespace startPosition
+
+        /** During unit normalization, we assume this is a typical
+         *  number of particles per cell for normalization of weighted
+         *  particle attributes.
          */
-        using numParticlesPerDimension = mCT::shrinkTo<
-            mCT::Int< 2, 2, 1 >,
-            simDim
-        >::type;
-    };
-
-    // definition of quiet particle start
-    using Quiet4ppc = QuietImpl< QuietParam4ppc >;
-
-} // namespace startPosition
-
-    /** During unit normalization, we assume this is a typical
-     *  number of particles per cell for normalization of weighted
-     *  particle attributes.
-     */
-    constexpr uint32_t TYPICAL_PARTICLES_PER_CELL = mCT::volume<
-        startPosition::QuietParam4ppc::numParticlesPerDimension
-    >::type::value;
+        constexpr uint32_t TYPICAL_PARTICLES_PER_CELL
+            = mCT::volume<startPosition::QuietParam4ppc::numParticlesPerDimension>::type::value;
 
-} // namespace particles
+    } // namespace particles
 } // namespace picongpu
diff --git a/share/picongpu/examples/WeibelTransverse/include/picongpu/param/png.param b/share/picongpu/examples/WeibelTransverse/include/picongpu/param/png.param
index 3817d0df9e..a749261f55 100644
--- a/share/picongpu/examples/WeibelTransverse/include/picongpu/param/png.param
+++ b/share/picongpu/examples/WeibelTransverse/include/picongpu/param/png.param
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Heiko Burau, Rene Widera, Richard Pausch
+/* Copyright 2013-2021 Axel Huebl, Heiko Burau, Rene Widera, Richard Pausch
  *
  * This file is part of PIConGPU.
  *
@@ -57,25 +57,24 @@ namespace picongpu
 
         // specify color scales for each channel
         namespace preParticleDensCol = colorScales::red;
-        namespace preChannel1Col     = colorScales::blue;
-        namespace preChannel2Col     = colorScales::green;
-        namespace preChannel3Col     = colorScales::none;
+        namespace preChannel1Col = colorScales::blue;
+        namespace preChannel2Col = colorScales::green;
+        namespace preChannel3Col = colorScales::none;
 
         /* png preview settings for each channel */
-        DINLINE float_X preChannel1 ( const float3_X& field_B, const float3_X& field_E, const float3_X& field_J )
+        DINLINE float_X preChannel1(const float3_X& field_B, const float3_X& field_E, const float3_X& field_J)
         {
-            return field_B.x()*field_B.x() + field_B.y()*field_B.y();
+            return field_B.x() * field_B.x() + field_B.y() * field_B.y();
         }
 
-        DINLINE float_X preChannel2 ( const float3_X& field_B, const float3_X& field_E, const float3_X& field_J )
+        DINLINE float_X preChannel2(const float3_X& field_B, const float3_X& field_E, const float3_X& field_J)
         {
-            return field_E.x()*field_E.x() + field_E.y()*field_E.y();
+            return field_E.x() * field_E.x() + field_E.y() * field_E.y();
         }
 
-        DINLINE float_X preChannel3 ( const float3_X& field_B, const float3_X& field_E, const float3_X& field_J )
+        DINLINE float_X preChannel3(const float3_X& field_B, const float3_X& field_E, const float3_X& field_J)
         {
             return 1.0_X;
         }
-    }
-}
-
+    } // namespace visPreview
+} // namespace picongpu
diff --git a/share/picongpu/examples/WeibelTransverse/include/picongpu/param/speciesDefinition.param b/share/picongpu/examples/WeibelTransverse/include/picongpu/param/speciesDefinition.param
index 00758a9b9f..25290c0e7e 100644
--- a/share/picongpu/examples/WeibelTransverse/include/picongpu/param/speciesDefinition.param
+++ b/share/picongpu/examples/WeibelTransverse/include/picongpu/param/speciesDefinition.param
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Rene Widera, Benjamin Worpitz
+/* Copyright 2013-2021 Rene Widera, Benjamin Worpitz
  *
  * This file is part of PIConGPU.
  *
@@ -31,75 +31,57 @@
 
 namespace picongpu
 {
+    /*########################### define particle attributes #####################*/
 
-/*########################### define particle attributes #####################*/
-
-/** describe attributes of a particle */
-using DefaultParticleAttributes = MakeSeq_t<
-    position<position_pic>,
-    momentum,
-    weighting
->;
-
-/*########################### end particle attributes ########################*/
-
-/*########################### define species #################################*/
-
-
-/*--------------------------- electrons --------------------------------------*/
-
-/* ratio relative to BASE_CHARGE and BASE_MASS */
-value_identifier( float_X, MassRatioElectrons, 1.0 );
-value_identifier( float_X, ChargeRatioElectrons, 1.0 );
-
-using ParticleFlagsElectrons = MakeSeq_t<
-    particlePusher<UsedParticlePusher>,
-    shape<UsedParticleShape>,
-    interpolation<UsedField2Particle>,
-    current<UsedParticleCurrentSolver>,
-    massRatio<MassRatioElectrons>,
-    chargeRatio<ChargeRatioElectrons>
->;
-
-/* define species electrons */
-using PIC_Electrons = Particles<
-    PMACC_CSTRING( "e" ),
-    ParticleFlagsElectrons,
-    DefaultParticleAttributes
->;
-
-/*--------------------------- ions -------------------------------------------*/
-
-/* ratio relative to BASE_CHARGE and BASE_MASS */
-value_identifier( float_X, MassRatioIons, 1.0 );
-value_identifier( float_X, ChargeRatioIons, -1.0 );
-
-/* ratio relative to BASE_DENSITY */
-value_identifier( float_X, DensityRatioIons, 1.0 );
-
-using ParticleFlagsIons = MakeSeq_t<
-    particlePusher<UsedParticlePusher>,
-    shape<UsedParticleShape>,
-    interpolation<UsedField2Particle>,
-    current<UsedParticleCurrentSolver>,
-    massRatio<MassRatioIons>,
-    chargeRatio<ChargeRatioIons>,
-    densityRatio<DensityRatioIons>,
-    atomicNumbers<ionization::atomicNumbers::Hydrogen_t>
->;
-
-/*define specie ions*/
-using PIC_Ions = Particles<
-    PMACC_CSTRING( "i" ),
-    ParticleFlagsIons,
-    DefaultParticleAttributes
->;
-
-/*########################### end species ####################################*/
-
-using VectorAllSpecies = MakeSeq_t<
-    PIC_Electrons,
-    PIC_Ions
->;
-
-} //namespace picongpu
+    /** describe attributes of a particle */
+    using DefaultParticleAttributes = MakeSeq_t<position<position_pic>, momentum, weighting>;
+
+    /*########################### end particle attributes ########################*/
+
+    /*########################### define species #################################*/
+
+
+    /*--------------------------- electrons --------------------------------------*/
+
+    /* ratio relative to BASE_CHARGE and BASE_MASS */
+    value_identifier(float_X, MassRatioElectrons, 1.0);
+    value_identifier(float_X, ChargeRatioElectrons, 1.0);
+
+    using ParticleFlagsElectrons = MakeSeq_t<
+        particlePusher<UsedParticlePusher>,
+        shape<UsedParticleShape>,
+        interpolation<UsedField2Particle>,
+        current<UsedParticleCurrentSolver>,
+        massRatio<MassRatioElectrons>,
+        chargeRatio<ChargeRatioElectrons>>;
+
+    /* define species electrons */
+    using PIC_Electrons = Particles<PMACC_CSTRING("e"), ParticleFlagsElectrons, DefaultParticleAttributes>;
+
+    /*--------------------------- ions -------------------------------------------*/
+
+    /* ratio relative to BASE_CHARGE and BASE_MASS */
+    value_identifier(float_X, MassRatioIons, 1.0);
+    value_identifier(float_X, ChargeRatioIons, -1.0);
+
+    /* ratio relative to BASE_DENSITY */
+    value_identifier(float_X, DensityRatioIons, 1.0);
+
+    using ParticleFlagsIons = MakeSeq_t<
+        particlePusher<UsedParticlePusher>,
+        shape<UsedParticleShape>,
+        interpolation<UsedField2Particle>,
+        current<UsedParticleCurrentSolver>,
+        massRatio<MassRatioIons>,
+        chargeRatio<ChargeRatioIons>,
+        densityRatio<DensityRatioIons>,
+        atomicNumbers<ionization::atomicNumbers::Hydrogen_t>>;
+
+    /* define species ions */
+    using PIC_Ions = Particles<PMACC_CSTRING("i"), ParticleFlagsIons, DefaultParticleAttributes>;
+
+    /*########################### end species ####################################*/
+
+    using VectorAllSpecies = MakeSeq_t<PIC_Electrons, PIC_Ions>;
+
+} // namespace picongpu
diff --git a/share/picongpu/examples/WeibelTransverse/include/picongpu/param/speciesInitialization.param b/share/picongpu/examples/WeibelTransverse/include/picongpu/param/speciesInitialization.param
index ea52788480..ca5ac89f0d 100644
--- a/share/picongpu/examples/WeibelTransverse/include/picongpu/param/speciesInitialization.param
+++ b/share/picongpu/examples/WeibelTransverse/include/picongpu/param/speciesInitialization.param
@@ -1,4 +1,4 @@
-/* Copyright 2015-2020 Rene Widera, Axel Huebl
+/* Copyright 2015-2021 Rene Widera, Axel Huebl
  *
  * This file is part of PIConGPU.
  *
@@ -33,41 +33,26 @@
 
 namespace picongpu
 {
-namespace particles
-{
-    /** InitPipeline define in which order species are initialized
-     *
-     * the functors are called in order (from first to last functor)
-     */
-    using InitPipeline = bmpl::vector<
-        CreateDensity<
-            densityProfiles::Homogenous,
-            startPosition::Quiet4ppc,
-            PIC_Ions
-        >,
-        ManipulateDerive<
-            /* make sure in speciesDefinition.param that
-             *   densityRatio * chargeRatio
-             * of electrons and ions is quasi neutral!
-             * alternatively, use manipulators::ProtonTimesWeighting
-             */
-            manipulators::binary::DensityWeighting,
-            PIC_Ions,
-            PIC_Electrons
-        >,
-        Manipulate<
-            manipulators::AssignZDriftIons,
-            PIC_Ions
-        >,
-        Manipulate<
-            manipulators::AssignZDriftElectrons,
-            PIC_Electrons
-        >,
-        Manipulate<
-            manipulators::AddTemperature,
-            PIC_Electrons
-        >
-    >;
+    namespace particles
+    {
+        /** InitPipeline defines the order in which species are initialized
+         *
+         * the functors are called in order (from first to last functor)
+         */
+        using InitPipeline = bmpl::vector<
+            CreateDensity<densityProfiles::Homogenous, startPosition::Quiet4ppc, PIC_Ions>,
+            ManipulateDerive<
+                /* make sure in speciesDefinition.param that
+                 *   densityRatio * chargeRatio
+                 * of electrons and ions is quasi neutral!
+                 * alternatively, use manipulators::ProtonTimesWeighting
+                 */
+                manipulators::binary::DensityWeighting,
+                PIC_Ions,
+                PIC_Electrons>,
+            Manipulate<manipulators::AssignZDriftIons, PIC_Ions>,
+            Manipulate<manipulators::AssignZDriftElectrons, PIC_Electrons>,
+            Manipulate<manipulators::AddTemperature, PIC_Electrons>>;
 
-} // namespace particles
+    } // namespace particles
 } // namespace picongpu
diff --git a/share/picongpu/tests/XrayScattering/README.rst b/share/picongpu/tests/XrayScattering/README.rst
new file mode 100644
index 0000000000..442a505f82
--- /dev/null
+++ b/share/picongpu/tests/XrayScattering/README.rst
@@ -0,0 +1,7 @@
+XrayScattering:
+===============
+This is a simulation with some simple density profiles (double slit, periodic grid, periodic stripes), no laser, no random species initialization.
+It is meant as a functional test for the xrayScattering plugin.
+Plugin output is validated by comparison with an FFT result.
+
+.. sectionauthor:: Pawel Ordyna <p.ordyna (at) hzdr.de>
diff --git a/share/picongpu/tests/XrayScattering/cmakeFlags b/share/picongpu/tests/XrayScattering/cmakeFlags
new file mode 100755
index 0000000000..3a1e4aa1e2
--- /dev/null
+++ b/share/picongpu/tests/XrayScattering/cmakeFlags
@@ -0,0 +1,49 @@
+#!/usr/bin/env bash
+#
+# Copyright 2013-2021 Axel Huebl, Rene Widera, Pawel Ordyna
+#
+# This file is part of PIConGPU.
+#
+# PIConGPU is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# PIConGPU is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with PIConGPU.
+# If not, see <http://www.gnu.org/licenses/>.
+#
+
+#
+# generic compile options
+#
+
+################################################################################
+# add presets here
+#   - default: index 0
+#   - start with zero index
+#   - increase by 1, no gaps
+
+flags[0]=""
+flags[1]="-DPARAM_OVERWRITES:LIST='-DPARAM_DIMENSION=DIM3'"
+flags[2]="-DPARAM_OVERWRITES:LIST='-DPARAM_PRECISION=precision64Bit'"
+flags[3]="-DPARAM_OVERWRITES:LIST='-DPARAM_PRECISION=precision64Bit;-DPARAM_DIMENSION=DIM3'"
+flags[4]="-DPARAM_OVERWRITES:LIST='-DPARAM_IONS=1'"
+flags[5]="-DPARAM_OVERWRITES:LIST='-DPARAM_ANGLES=1;-DPARAM_PRECISION=precision64Bit;-DPARAM_DIMENSION=DIM3'"
+
+################################################################################
+# execution
+
+case "$1" in
+    -l)  echo "${#flags[@]}"                         # print the number of presets
+         ;;
+    -ll) for f in "${flags[@]}"; do echo "$f"; done  # print every preset, one per line
+         ;;
+    *)   echo -n "${flags[${1:-0}]}"                 # print preset $1 (index 0 if no argument)
+         ;;
+esac
diff --git a/share/picongpu/tests/XrayScattering/etc/picongpu/1.cfg b/share/picongpu/tests/XrayScattering/etc/picongpu/1.cfg
new file mode 100644
index 0000000000..57ee9bbd42
--- /dev/null
+++ b/share/picongpu/tests/XrayScattering/etc/picongpu/1.cfg
@@ -0,0 +1,86 @@
+# Copyright 2013-2021 Axel Huebl, Rene Widera, Felix Schmitt,
+#              Pawel Ordyna
+#
+# This file is part of PIConGPU.
+#
+# PIConGPU is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# PIConGPU is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with PIConGPU.
+# If not, see <http://www.gnu.org/licenses/>.
+#
+
+##
+## This configuration file is used by PIConGPU's TBG tool to create a
+## batch script for PIConGPU runs. For a detailed description of PIConGPU
+## configuration files including all available variables, see
+##
+##                      docs/TBG_macros.cfg
+##
+
+
+#################################
+## Section: Required Variables ##
+#################################
+
+
+TBG_wallTime="0:30:00"
+
+TBG_devices_x=1
+TBG_devices_y=1
+TBG_devices_z=1
+
+TBG_gridSize="128 128 32"
+TBG_steps="1"
+
+# leave TBG_movingWindow empty to disable moving window
+TBG_movingWindow=""
+
+
+
+#################################
+## Section: Optional Variables ##
+#################################
+
+# file I/O with openPMD-HDF5
+TBG_openPMD="--openPMD.period 1           \
+             --openPMD.file simData       \
+             --openPMD.source 'e_density' \
+             --openPMD.ext h5"
+
+TBG_e_xrayScattering="--e_xrayScattering.period 1  \
+            --e_xrayScattering.outputPeriod 1      \
+            --e_xrayScattering.n_qx 128 --e_xrayScattering.n_qy 128            \
+            --e_xrayScattering.qx_min -0.001 --e_xrayScattering.qx_max +0.001  \
+            --e_xrayScattering.qy_min -0.001 --e_xrayScattering.qy_max +0.001  \
+            --e_xrayScattering.memoryLayout distribute \
+            --e_xrayScattering.ext h5"
+
+TBG_plugins="!TBG_e_xrayScattering !TBG_openPMD"
+
+
+#################################
+## Section: Program Parameters ##
+#################################
+
+TBG_deviceDist="!TBG_devices_x !TBG_devices_y !TBG_devices_z"
+
+TBG_programParams="-d !TBG_deviceDist \
+                   -g !TBG_gridSize   \
+                   -s !TBG_steps      \
+                   !TBG_movingWindow  \
+                   !TBG_plugins       \
+                   --versionOnce"
+
+# TOTAL number of devices
+TBG_tasks="$(( TBG_devices_x * TBG_devices_y * TBG_devices_z ))"
+
+"$TBG_cfgPath"/submitAction.sh
diff --git a/share/picongpu/tests/XrayScattering/etc/picongpu/1_ions.cfg b/share/picongpu/tests/XrayScattering/etc/picongpu/1_ions.cfg
new file mode 100644
index 0000000000..423dcd834e
--- /dev/null
+++ b/share/picongpu/tests/XrayScattering/etc/picongpu/1_ions.cfg
@@ -0,0 +1,93 @@
+# Copyright 2013-2021 Axel Huebl, Rene Widera, Felix Schmitt,
+#              Pawel Ordyna
+#
+# This file is part of PIConGPU.
+#
+# PIConGPU is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# PIConGPU is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with PIConGPU.
+# If not, see <http://www.gnu.org/licenses/>.
+#
+
+##
+## This configuration file is used by PIConGPU's TBG tool to create a
+## batch script for PIConGPU runs. For a detailed description of PIConGPU
+## configuration files including all available variables, see
+##
+##                      docs/TBG_macros.cfg
+##
+
+
+#################################
+## Section: Required Variables ##
+#################################
+
+TBG_wallTime="0:30:00"
+
+TBG_devices_x=1
+TBG_devices_y=1
+TBG_devices_z=1
+
+TBG_gridSize="128 128 32"
+TBG_steps="2"
+
+# leave TBG_movingWindow empty to disable moving window
+TBG_movingWindow=""
+
+
+
+#################################
+## Section: Optional Variables ##
+#################################
+
+# file I/O with openPMD-HDF5
+TBG_openPMD="--openPMD.period 1           \
+             --openPMD.file simData       \
+             --openPMD.source 'e_density' \
+             --openPMD.ext h5"
+
+TBG_e_xrayScattering="--e_xrayScattering.period 1 \
+            --e_xrayScattering.outputPeriod 1      \
+            --e_xrayScattering.n_qx 128 --e_xrayScattering.n_qy 128            \
+            --e_xrayScattering.qx_min -0.001 --e_xrayScattering.qx_max +0.001  \
+            --e_xrayScattering.qy_min -0.001 --e_xrayScattering.qy_max +0.001  \
+            --e_xrayScattering.memoryLayout distribute \
+            --e_xrayScattering.ext h5"
+
+TBG_i_xrayScattering="--i_xrayScattering.period 1  \
+            --i_xrayScattering.outputPeriod 1      \
+            --i_xrayScattering.n_qx 120 --i_xrayScattering.n_qy 120            \
+            --i_xrayScattering.qx_min -0.001 --i_xrayScattering.qx_max +0.001  \
+            --i_xrayScattering.qy_min -0.001 --i_xrayScattering.qy_max +0.001  \
+            --i_xrayScattering.memoryLayout distribute \
+            --i_xrayScattering.ext h5"
+
+TBG_plugins="!TBG_e_xrayScattering !TBG_i_xrayScattering !TBG_openPMD"
+
+
+#################################
+## Section: Program Parameters ##
+#################################
+
+TBG_deviceDist="!TBG_devices_x !TBG_devices_y !TBG_devices_z"
+
+TBG_programParams="-d !TBG_deviceDist \
+                   -g !TBG_gridSize   \
+                   -s !TBG_steps      \
+                   !TBG_movingWindow  \
+                   !TBG_plugins       \
+                   --versionOnce"
+
+# TOTAL number of devices
+TBG_tasks="$(( TBG_devices_x * TBG_devices_y * TBG_devices_z ))"
+
+"$TBG_cfgPath"/submitAction.sh
diff --git a/share/picongpu/tests/XrayScattering/etc/picongpu/1_mirror.cfg b/share/picongpu/tests/XrayScattering/etc/picongpu/1_mirror.cfg
new file mode 100644
index 0000000000..8cafb10812
--- /dev/null
+++ b/share/picongpu/tests/XrayScattering/etc/picongpu/1_mirror.cfg
@@ -0,0 +1,84 @@
+# Copyright 2013-2021 Axel Huebl, Rene Widera, Felix Schmitt,
+#              Pawel Ordyna
+#
+# This file is part of PIConGPU.
+#
+# PIConGPU is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# PIConGPU is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with PIConGPU.
+# If not, see <http://www.gnu.org/licenses/>.
+#
+
+##
+## This configuration file is used by PIConGPU's TBG tool to create a
+## batch script for PIConGPU runs. For a detailed description of PIConGPU
+## configuration files including all available variables, see
+##
+##                      docs/TBG_macros.cfg
+##
+
+
+#################################
+## Section: Required Variables ##
+#################################
+
+TBG_wallTime="0:30:00"
+
+TBG_devices_x=1
+TBG_devices_y=1
+TBG_devices_z=1
+
+TBG_gridSize="128 128 32"
+TBG_steps="2"
+
+# leave TBG_movingWindow empty to disable moving window
+TBG_movingWindow=""
+
+
+
+#################################
+## Section: Optional Variables ##
+#################################
+
+# file I/O with openPMD-HDF5
+TBG_openPMD="--openPMD.period 1           \
+             --openPMD.file simData       \
+             --openPMD.source 'e_density' \
+             --openPMD.ext h5"
+
+TBG_e_xrayScattering="--e_xrayScattering.period 1  \
+            --e_xrayScattering.outputPeriod 1      \
+            --e_xrayScattering.n_qx 128 --e_xrayScattering.n_qy 128           \
+            --e_xrayScattering.qx_min -0.001 --e_xrayScattering.qx_max +0.001  \
+            --e_xrayScattering.qy_min -0.001 --e_xrayScattering.qy_max +0.001  \
+            --e_xrayScattering.memoryLayout mirror --e_xrayScattering.ext h5"
+
+TBG_plugins="!TBG_e_xrayScattering !TBG_openPMD"
+
+
+#################################
+## Section: Program Parameters ##
+#################################
+
+TBG_deviceDist="!TBG_devices_x !TBG_devices_y !TBG_devices_z"
+
+TBG_programParams="-d !TBG_deviceDist \
+                   -g !TBG_gridSize   \
+                   -s !TBG_steps      \
+                   !TBG_movingWindow  \
+                   !TBG_plugins       \
+                   --versionOnce"
+
+# TOTAL number of devices
+TBG_tasks="$(( TBG_devices_x * TBG_devices_y * TBG_devices_z ))"
+
+"$TBG_cfgPath"/submitAction.sh
diff --git a/share/picongpu/tests/XrayScattering/etc/picongpu/2.cfg b/share/picongpu/tests/XrayScattering/etc/picongpu/2.cfg
new file mode 100644
index 0000000000..9dc22878a1
--- /dev/null
+++ b/share/picongpu/tests/XrayScattering/etc/picongpu/2.cfg
@@ -0,0 +1,85 @@
+# Copyright 2013-2021 Axel Huebl, Rene Widera, Felix Schmitt,
+#              Pawel Ordyna
+#
+# This file is part of PIConGPU.
+#
+# PIConGPU is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# PIConGPU is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with PIConGPU.
+# If not, see <http://www.gnu.org/licenses/>.
+#
+
+##
+## This configuration file is used by PIConGPU's TBG tool to create a
+## batch script for PIConGPU runs. For a detailed description of PIConGPU
+## configuration files including all available variables, see
+##
+##                      docs/TBG_macros.cfg
+##
+
+
+#################################
+## Section: Required Variables ##
+#################################
+
+TBG_wallTime="0:30:00"
+
+TBG_devices_x=1
+TBG_devices_y=1
+TBG_devices_z=1
+
+TBG_gridSize="128 128 32"
+TBG_steps="1"
+
+# leave TBG_movingWindow empty to disable moving window
+TBG_movingWindow=""
+
+
+
+#################################
+## Section: Optional Variables ##
+#################################
+
+# file I/O with openPMD-HDF5
+TBG_openPMD="--openPMD.period 1           \
+             --openPMD.file simData       \
+             --openPMD.source 'e_density' \
+             --openPMD.ext h5"
+
+TBG_e_xrayScattering="--e_xrayScattering.period 1  \
+            --e_xrayScattering.outputPeriod 1      \
+            --e_xrayScattering.n_qx 128 --e_xrayScattering.n_qy 128            \
+            --e_xrayScattering.qx_min -0.001 --e_xrayScattering.qx_max +0.001  \
+            --e_xrayScattering.qy_min -0.001 --e_xrayScattering.qy_max +0.001  \
+            --e_xrayScattering.memoryLayout distribute \
+            --e_xrayScattering.ext h5"
+
+TBG_plugins="!TBG_e_xrayScattering !TBG_openPMD"
+
+
+#################################
+## Section: Program Parameters ##
+#################################
+
+TBG_deviceDist="!TBG_devices_x !TBG_devices_y !TBG_devices_z"
+
+TBG_programParams="-d !TBG_deviceDist \
+                   -g !TBG_gridSize   \
+                   -s !TBG_steps      \
+                   !TBG_movingWindow  \
+                   !TBG_plugins       \
+                   --versionOnce"
+
+# TOTAL number of devices
+TBG_tasks="$(( TBG_devices_x * TBG_devices_y * TBG_devices_z ))"
+
+"$TBG_cfgPath"/submitAction.sh
diff --git a/share/picongpu/tests/XrayScattering/include/picongpu/param/density.param b/share/picongpu/tests/XrayScattering/include/picongpu/param/density.param
new file mode 100644
index 0000000000..eb7ed10e6f
--- /dev/null
+++ b/share/picongpu/tests/XrayScattering/include/picongpu/param/density.param
@@ -0,0 +1,164 @@
+/* Copyright 2013-2021 Axel Huebl, Heiko Burau, Rene Widera, Felix Schmitt,
+ *                     Richard Pausch, Pawel Ordyna
+ *
+ * This file is part of PIConGPU.
+ *
+ * PIConGPU is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * PIConGPU is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with PIConGPU.
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/** @file
+ *
+ * Configure existing or define new normalized density profiles here.
+ * During particle species creation in speciesInitialization.param,
+ * those profiles can be translated to spatial particle distributions.
+ */
+
+#pragma once
+
+#include "picongpu/particles/densityProfiles/profiles.def"
+/* preprocessor struct generator */
+#include <pmacc/preprocessor/struct.hpp>
+
+
+namespace picongpu
+{
+    namespace SI
+    {
+        /** Reference number density of the density profiles.
+         *
+         * Normalized profiles are usually expressed relative to this value, which
+         * then acts as their maximum density. A species can scale it for itself
+         * through a `densityRatio` flag.
+         *
+         * unit: ELEMENTS/m^3
+         */
+        constexpr float_64 BASE_DENSITY_SI = 1.0e25;
+    } // namespace SI
+
+    namespace densityProfiles
+    {
+        struct DoubleSlitFunctor
+        {
+            /** Double-slit mask: density 0 inside two rectangular slits, 1 everywhere else.
+             *
+             *  This formula uses SI quantities only.
+             *  The profile will be multiplied by BASE_DENSITY_SI.
+             *
+             *  The layout is hard-coded in cells for a 128 x 128 (x, y) grid: both slits are
+             *  8 cells wide, 30 cells apart and centered in x (x cells 41-48 and 79-86);
+             *  they cover the y cells 20-109. The profile is translation invariant in z.
+             *
+             * @param position_SI total offset including all slides [meter]
+             * @param cellSize_SI cell sizes [meter]
+             *
+             * @return float_X density [normalized to 1.0]
+             */
+            HDINLINE float_X operator()(const floatD_64& position_SI, const float3_64& cellSize_SI)
+            {
+                // cell indices of the evaluated position
+                const uint64_t xCellId = static_cast<uint64_t>(position_SI.x() / cellSize_SI.x());
+                const uint64_t yCellId = static_cast<uint64_t>(position_SI.y() / cellSize_SI.y());
+                constexpr uint32_t cellsY = 128;
+                constexpr uint32_t cellsX = 128;
+                constexpr uint32_t w = 8; // slit width
+                constexpr uint32_t d = 30; // gap between the slits
+                constexpr uint32_t total = 2 * w + d;
+                constexpr uint32_t start = (cellsX - total) / 2; // first x cell of the first slit
+                constexpr uint32_t slitHalfHeight = 45;
+                const bool inRows = yCellId > cellsY / 2 - slitHalfHeight && yCellId <= cellsY / 2 + slitHalfHeight;
+                // `>= start` instead of `> start - 1`: the unsigned subtraction would wrap for start == 0
+                const bool inFirstSlit = xCellId >= start && xCellId < start + w;
+                const bool inSecondSlit = xCellId >= start + w + d && xCellId < start + total;
+                if(inRows && (inFirstSlit || inSecondSlit))
+                {
+                    return 0.0_X;
+                }
+                return 1.0_X;
+            }
+        };
+
+        template<unsigned T>
+        struct PeriodicGrid2DFunctor
+        {
+            static_assert(T >= 2u, "period T must be at least 2 cells, otherwise T / 2 is zero");
+
+            /** Periodic 2D grid of square holes. This formula uses SI quantities only.
+             *  The profile will be multiplied by BASE_DENSITY_SI.
+             *
+             *  stripe(c) = ((c + T / 4) / (T / 2)) % 2, evaluated with integer arithmetic on the cell index c,
+             *  approximates 1 - max{sgn(sin(2 * pi / T * (c + T / 4))), 0} on the cell grid.
+             *  The density is 0 where stripe(x) and stripe(y) are both 1 and 1 everywhere else.
+             *
+             * @param position_SI total offset including all slides [meter]
+             * @param cellSize_SI cell sizes [meter]
+             *
+             * @tparam T period in cells (at least 2)
+             * @return float_X density [normalized to 1.0]
+             */
+            HDINLINE float_X operator()(const floatD_64& position_SI, const float3_64& cellSize_SI)
+            {
+                // get cell number
+                const uint64_t xCellId = static_cast<uint64_t>(position_SI.x() / cellSize_SI.x());
+                const uint64_t yCellId = static_cast<uint64_t>(position_SI.y() / cellSize_SI.y());
+
+                // the density is removed only where an odd x stripe crosses an odd y stripe
+                const bool xOdd = (((xCellId + T / 4) / (T / 2)) % 2) != 0;
+                const bool yOdd = (((yCellId + T / 4) / (T / 2)) % 2) != 0;
+
+                // the result is exactly 0 or 1, so no clamping of negative values is needed
+                return (xOdd && yOdd) ? 0.0_X : 1.0_X;
+            }
+        };
+
+        template<unsigned T>
+        struct PeriodicStripesFunctor
+        {
+            static_assert(T >= 2u, "period T must be at least 2 cells, otherwise T / 2 is zero");
+
+            /** Periodic stripes alternating along y. This formula uses SI quantities only.
+             * The profile will be multiplied by BASE_DENSITY_SI.
+             *
+             * With integer arithmetic on the y cell index, the density is 0 where
+             * ((y + T / 4) / (T / 2)) % 2 == 1 and 1 everywhere else; it does not depend on x or z.
+             * For T = 12 the y cells 3-8, 15-20, ... are empty.
+             * This approximates density(y) = max{sgn(sin(2 * pi / T * (y + T / 4))), 0} on the cell grid.
+             *
+             * @param position_SI total offset including all slides [meter]
+             * @param cellSize_SI cell sizes [meter]
+             *
+             * @tparam T period in cells (at least 2)
+             * @return float_X density [normalized to 1.0]
+             */
+            HDINLINE float_X operator()(const floatD_64& position_SI, const float3_64& cellSize_SI)
+            {
+                // get cell number
+                const uint64_t yCellId = static_cast<uint64_t>(position_SI.y() / cellSize_SI.y());
+
+                // cells in odd stripes are empty, cells in even stripes are filled
+                const bool yOdd = (((yCellId + T / 4) / (T / 2)) % 2) != 0;
+
+                // the result is exactly 0 or 1, so no clamping of negative values is needed
+                return yOdd ? 0.0_X : 1.0_X;
+            }
+        };
+        // free formula profiles built from the functors above; the periodic ones use a period of 12 cells
+        using PeriodicGrid2D = FreeFormulaImpl<PeriodicGrid2DFunctor<12>>;
+        using PeriodicStripes = FreeFormulaImpl<PeriodicStripesFunctor<12>>;
+        using DoubleSlit = FreeFormulaImpl<DoubleSlitFunctor>;
+        // homogeneous profile
+        using Homogenous = HomogenousImpl;
+        using UsedDensity = DoubleSlit; // profile selected for this setup
+    } // namespace densityProfiles
+} // namespace picongpu
diff --git a/share/picongpu/tests/XrayScattering/include/picongpu/param/dimension.param b/share/picongpu/tests/XrayScattering/include/picongpu/param/dimension.param
new file mode 100644
index 0000000000..eb9dcd9c52
--- /dev/null
+++ b/share/picongpu/tests/XrayScattering/include/picongpu/param/dimension.param
@@ -0,0 +1,31 @@
+/* Copyright 2014-2021 Axel Huebl, Rene Widera
+ *
+ * This file is part of PIConGPU.
+ *
+ * PIConGPU is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * PIConGPU is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with PIConGPU.
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#ifndef PARAM_DIMENSION // may be predefined at build time to override the default
+#    define PARAM_DIMENSION DIM2 // default: DIM2
+#endif
+
+#define SIMDIM PARAM_DIMENSION
+
+namespace picongpu
+{
+    constexpr uint32_t simDim = SIMDIM; // same value as the SIMDIM macro, available as a typed constant
+} // namespace picongpu
diff --git a/share/picongpu/tests/XrayScattering/include/picongpu/param/fileOutput.param b/share/picongpu/tests/XrayScattering/include/picongpu/param/fileOutput.param
new file mode 100644
index 0000000000..3f21f6faf0
--- /dev/null
+++ b/share/picongpu/tests/XrayScattering/include/picongpu/param/fileOutput.param
@@ -0,0 +1,96 @@
+/* Copyright 2013-2021 Axel Huebl, Rene Widera, Felix Schmitt,
+ *                     Benjamin Worpitz, Richard Pausch, Pawel Ordyna
+ *
+ * This file is part of PIConGPU.
+ *
+ * PIConGPU is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * PIConGPU is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with PIConGPU.
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include <pmacc/meta/conversion/MakeSeq.hpp>
+
+/* some forward declarations we need */
+#include "picongpu/fields/Fields.def"
+#include "picongpu/particles/particleToGrid/ComputeGridValuePerFrame.def"
+
+#include <boost/mpl/vector.hpp>
+
+
+namespace picongpu
+{
+    /** FieldTmp output (calculated at runtime) *******************************
+     *
+     * Those operations derive scalar field quantities from particle species
+     * at runtime. Each value is mapped per cell. Some operations are identical
+     * up to a constant, so avoid writing those twice to save storage.
+     *
+     * you can choose any of these particle to grid projections:
+     *   - Density: particle position + shape on the grid
+     *   - BoundElectronDensity: density of bound electrons
+     *       note: only makes sense for partially ionized ions
+     *   - ChargeDensity: density * charge
+     *       note: for species that do not change their charge state, this is
+     *             the same as the density times a constant for the charge
+     *   - Energy: sum of kinetic particle energy per cell with respect to shape
+     *   - EnergyDensity: average kinetic particle energy per cell times the
+     *                    particle density
+     *       note: this is the same as the sum of kinetic particle energy
+     *             divided by a constant for the cell volume
+     *   - MomentumComponent: ratio between a selected momentum component and
+     *                        the absolute momentum with respect to shape
+     *   - LarmorPower: radiated Larmor power
+     *                  (species must contain the attribute `momentumPrev1`)
+     *
+     * for debugging:
+     *   - MidCurrentDensityComponent:
+     *       density * charge * velocity_component
+     *   - Counter: counts point like particles per cell
+     *   - MacroCounter: counts point like macro particles per cell
+     */
+    namespace deriveField = particles::particleToGrid;
+
+    /* ChargeDensity section: presumably one solver per eligible species of VectorAllSpecies */
+    using ChargeDensity_Seq
+        = deriveField::CreateEligible_t<VectorAllSpecies, deriveField::derivedAttributes::ChargeDensity>;
+
+    /* Density section: presumably one solver per eligible species of VectorAllSpecies */
+    using Density_Seq = deriveField::CreateEligible_t<VectorAllSpecies, deriveField::derivedAttributes::Density>;
+
+
+    /** FieldTmpSolvers groups all solvers that create data for FieldTmp ******
+     *
+     * @see FieldTmp, which uses FieldTmpSolvers to calculate the exchange size
+     */
+    using FieldTmpSolvers = MakeSeq_t<ChargeDensity_Seq, Density_Seq>;
+
+
+    /** FileOutputFields: Groups all Fields that shall be dumped *************/
+
+    /** Possible native fields: FieldE, FieldB, FieldJ (FieldJ is not dumped in this setup)
+     */
+    using NativeFileOutputFields = MakeSeq_t<FieldE, FieldB>;
+
+    using FileOutputFields = MakeSeq_t<NativeFileOutputFields, FieldTmpSolvers>; // native + derived fields
+
+
+    /** FileOutputParticles: Groups all Species that shall be dumped **********
+     *
+     * hint: to disable particle output set to
+     *   using FileOutputParticles = MakeSeq_t< >;
+     */
+    using FileOutputParticles = VectorAllSpecies; // dump every species
+
+} // namespace picongpu
diff --git a/share/picongpu/tests/XrayScattering/include/picongpu/param/grid.param b/share/picongpu/tests/XrayScattering/include/picongpu/param/grid.param
new file mode 100644
index 0000000000..2cbf2018de
--- /dev/null
+++ b/share/picongpu/tests/XrayScattering/include/picongpu/param/grid.param
@@ -0,0 +1,105 @@
+/* Copyright 2013-2021 Axel Huebl, Rene Widera, Benjamin Worpitz, Pawel Ordyna
+ *
+ * This file is part of PIConGPU.
+ *
+ * PIConGPU is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * PIConGPU is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with PIConGPU.
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/** @file
+ *
+ * Definition of cell sizes and time step. Our cells are defining a regular,
+ * cartesian grid. Our explicit FDTD field solvers define an upper bound for
+ * the time step value in relation to the cell size for convergence. Make
+ * sure to resolve important wavelengths of your simulation, e.g. shortest
+ * plasma wavelength and central laser wavelength both spatially and
+ * temporally.
+ *
+ * **Units in reduced dimensions**
+ *
+ * In 2D3V simulations, the CELL_DEPTH_SI (Z) cell length
+ * is still used for normalization of densities, etc.
+ *
+ * A 2D3V simulation in a cartesian PIC simulation such as
+ * ours only changes the degrees of freedom in motion for
+ * (macro) particles and all (field) information in z
+ * travels instantaneously, making the 2D3V simulation
+ * behave like the interaction of infinite "wire particles"
+ * in fields with perfect symmetry in Z.
+ *
+ */
+
+#pragma once
+
+
+namespace picongpu
+{
+    namespace SI
+    {
+        /** cell size in X
+         *  unit: meter */
+        // multiple of PI ensures nice q-space limits in the FFT
+        constexpr float_64 CELL_WIDTH_SI = PI * 0.1e-6;
+        /** cell size in Y (same as in X)
+         *  unit: meter */
+        constexpr float_64 CELL_HEIGHT_SI = CELL_WIDTH_SI;
+        /** cell size in Z (same as in X)
+         *  unit: meter */
+        constexpr float_64 CELL_DEPTH_SI = CELL_WIDTH_SI;
+
+        /** Duration of one timestep
+         *  unit: seconds */
+        // Works for both 2D and 3D: 1.734 is just above sqrt(3), presumably to respect the 3D CFL limit.
+        constexpr float_64 DELTA_T_SI = CELL_WIDTH_SI / (1.734 * SPEED_OF_LIGHT_SI);
+
+    } // namespace SI
+
+    /** Defines the size of the absorbing zone (in cells) for both sides of each direction
+     *
+     *  unit: none
+     */
+    constexpr uint32_t ABSORBER_CELLS[3][2] = {
+        {32, 32}, /* x direction [negative, positive] */
+        {32, 32}, /* y direction [negative, positive] */
+        {32, 32} /* z direction [negative, positive] */
+    };
+
+    /** Defines the strength of the absorber for both sides of each direction
+     *
+     *  unit: none
+     */
+    constexpr float_X ABSORBER_STRENGTH[3][2] = {
+        {1.0e-3, 1.0e-3}, /* x direction [negative, positive] */
+        {1.0e-3, 1.0e-3}, /* y direction [negative, positive] */
+        {1.0e-3, 1.0e-3} /* z direction [negative, positive] */
+    };
+
+    /** When to move the co-moving window.
+     *  An initial pseudo particle, flying with the speed of light,
+     *  is fired at the beginning of the simulation.
+     *  When it has crossed the fraction movePoint of the absolute(*) simulation area,
+     *  the co-moving window starts to move with the speed of light.
+     *
+     *  (*) Note: beware that there is one "hidden" row of GPUs at the y-front
+     *            when you use the co-moving window
+     *  0.75 means only 75% of the simulation area is used for the real simulation
+     *
+     * Warning: this variable is deprecated, but currently still required for
+     * building purposes. Please keep the variable here. In case a moving window
+     * is enabled in your .cfg file, please set the move point using the
+     * 'windowMovePoint' parameter in that file; its default value is movePoint.
+     */
+    constexpr float_64 movePoint = 0.9;
+
+} // namespace picongpu
diff --git a/share/picongpu/tests/XrayScattering/include/picongpu/param/particle.param b/share/picongpu/tests/XrayScattering/include/picongpu/param/particle.param
new file mode 100644
index 0000000000..692418a213
--- /dev/null
+++ b/share/picongpu/tests/XrayScattering/include/picongpu/param/particle.param
@@ -0,0 +1,94 @@
+/* Copyright 2013-2021 Axel Huebl, Rene Widera, Benjamin Worpitz,
+ *                     Richard Pausch, Pawel Ordyna
+ *
+ * This file is part of PIConGPU.
+ *
+ * PIConGPU is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * PIConGPU is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with PIConGPU.
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/** @file
+ *
+ * Configurations for particle manipulators. Set up and declare functors that
+ * can be used in speciesInitialization.param for particle species
+ * initialization and manipulation, such as temperature distributions, drifts,
+ * pre-ionization and in-cell position.
+ */
+
+#pragma once
+
+#include "picongpu/particles/startPosition/functors.def"
+#include "picongpu/particles/manipulators/manipulators.def"
+
+namespace picongpu
+{
+    namespace particles
+    {
+        /** Minimum weighting: a particle with a weighting below MIN_WEIGHTING
+         *      will not be created / will be deleted
+         *
+         *  unit: none */
+        constexpr float_X MIN_WEIGHTING = 10.0;
+
+        /** Maximum number of particles per cell during density profile evaluation.
+         *
+         * Determines the weighting of a macro particle and, with it, the number of
+         * particles "sampling" dynamics in phase space.
+         */
+        constexpr uint32_t TYPICAL_PARTICLES_PER_CELL = 1u;
+
+        namespace manipulators
+        {
+            /** Manipulator functor: leave each ion with all but one of its bound electrons (singly ionized). */
+            struct OnceIonizedImpl
+            {
+                template<typename T_Particle>
+                DINLINE void operator()(T_Particle& particle)
+                {
+                    constexpr float_X numProtons = GetAtomicNumbers<T_Particle>::type::numberOfProtons;
+                    particle[boundElectrons_] = numProtons - 1.0_X;
+                }
+            };
+            using OnceIonized = generic::Free<OnceIonizedImpl>;
+
+
+        } // namespace manipulators
+
+        namespace startPosition
+        {
+            /** in-cell start position: directly in the middle of the cell */
+            CONST_VECTOR(
+                float_X,
+                3,
+                InCellOffset,
+                /* each x, y, z in-cell position component in range [0.0, 1.0) */
+                0.5,
+                0.5,
+                0.5);
+            struct OnePositionParameter
+            {
+                /** Count of particles per cell at initial state
+                 *
+                 *  unit: none */
+                static constexpr uint32_t numParticlesPerCell = TYPICAL_PARTICLES_PER_CELL;
+
+                const InCellOffset_t inCellOffset; // presumably the InCellOffset vector defined above - TODO confirm
+            };
+
+            /** definition of one specific position for particle start, configured by OnePositionParameter */
+            using OnePosition = OnePositionImpl<OnePositionParameter>;
+
+        } // namespace startPosition
+    } // namespace particles
+} // namespace picongpu
diff --git a/share/picongpu/tests/XrayScattering/include/picongpu/param/precision.param b/share/picongpu/tests/XrayScattering/include/picongpu/param/precision.param
new file mode 100644
index 0000000000..162c25da0d
--- /dev/null
+++ b/share/picongpu/tests/XrayScattering/include/picongpu/param/precision.param
@@ -0,0 +1,59 @@
+/* Copyright 2013-2021 Rene Widera
+ *
+ * This file is part of PIConGPU.
+ *
+ * PIConGPU is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * PIConGPU is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with PIConGPU.
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/** @file
+ *
+ * Define the precision of typically used floating point types in the
+ * simulation.
+ *
+ * PIConGPU normalizes input automatically, allowing to use single-precision by
+ * default for the core algorithms. Note that implementations of various
+ * algorithms (usually plugins or non-core components) might still decide to
+ * hard-code a different (mixed) precision for some critical operations.
+ */
+
+#pragma once
+
+
+namespace picongpu
+{
+/*! Select a precision for the simulation data (precision32Bit unless PARAM_PRECISION is predefined)
+ *  - precision32Bit : use 32Bit floating point numbers
+ *                     [significant digits 7 to 8]
+ *  - precision64Bit : use 64Bit floating point numbers
+ *                     [significant digits 15 to 16]
+ */
+#ifndef PARAM_PRECISION
+#    define PARAM_PRECISION precision32Bit
+#endif
+    namespace precisionPIConGPU = PARAM_PRECISION;
+
+    /*! Select a precision for special operations (can be different from the simulation precision)
+     *  - precisionPIConGPU : use the precision which is selected above (precisionPIConGPU)
+     *  - precision32Bit    : use 32Bit floating point numbers
+     *  - precision64Bit    : use 64Bit floating point numbers
+     */
+    namespace precisionSqrt = precisionPIConGPU;
+    namespace precisionExp = precisionPIConGPU;
+    namespace precisionTrigonometric = precisionPIConGPU;
+
+
+} // namespace picongpu
+
+#include "picongpu/unitless/precision.unitless"
diff --git a/share/picongpu/tests/XrayScattering/include/picongpu/param/species.param b/share/picongpu/tests/XrayScattering/include/picongpu/param/species.param
new file mode 100644
index 0000000000..75f7785cd9
--- /dev/null
+++ b/share/picongpu/tests/XrayScattering/include/picongpu/param/species.param
@@ -0,0 +1,106 @@
+/* Copyright 2014-2021 Rene Widera, Richard Pausch, Annegret Roeszler, Klaus Steiniger, Pawel Ordyna
+ *
+ * This file is part of PIConGPU.
+ *
+ * PIConGPU is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * PIConGPU is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with PIConGPU.
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/** @file
+ *
+ * Particle shape, field to particle interpolation, current solver, and particle pusher
+ * can be declared here for usage in `speciesDefinition.param`.
+ *
+ * @see
+ *   **MODELS / Hierarchy of Charge Assignment Schemes**
+ *   in the online documentation for information on particle shapes.
+ *
+ *
+ * \attention
+ * The higher order shape names are redefined with release 0.6.0 in order to provide a consistent naming:
+ *     * PQS is the name of the 3rd order assignment function (instead of PCS)
+ *     * PCS is the name of the 4th order assignment function (instead of P4S)
+ *     * P4S does not exist anymore
+ */
+
+#pragma once
+
+#include "picongpu/particles/shapes.hpp"
+#include "picongpu/algorithms/FieldToParticleInterpolationNative.hpp"
+#include "picongpu/algorithms/FieldToParticleInterpolation.hpp"
+#include "picongpu/algorithms/AssignedTrilinearInterpolation.hpp"
+#include "picongpu/particles/flylite/NonLTE.def"
+#include "picongpu/fields/currentDeposition/Solver.def"
+
+
+namespace picongpu
+{
+    /** select macroparticle shape
+     *
+     * **WARNING** the shape names are redefined and diverge from PIConGPU versions before 0.6.0.
+     *
+     *  - particles::shapes::CIC : Assignment function is a piecewise linear spline
+     *  - particles::shapes::TSC : Assignment function is a piecewise quadratic spline
+     *  - particles::shapes::PQS : Assignment function is a piecewise cubic spline
+     *  - particles::shapes::PCS : Assignment function is a piecewise quartic spline
+     */
+    using UsedParticleShape = particles::shapes::Counter; // NOTE(review): not one of the shapes listed above
+
+    /** select interpolation method to be used for interpolation of grid-based field values to particle positions
+     */
+    using UsedField2Particle = FieldToParticleInterpolation<UsedParticleShape, AssignedTrilinearInterpolation>;
+
+    /*! select current solver method
+     * - currentSolver::Esirkepov< SHAPE, STRATEGY > : particle shapes - CIC, TSC, PQS, PCS (1st to 4th order)
+     * - currentSolver::VillaBune< SHAPE, STRATEGY > : particle shapes - CIC (1st order) only
+     * - currentSolver::EmZ< SHAPE, STRATEGY >       : particle shapes - CIC, TSC, PQS, PCS (1st to 4th order)
+     *
+     * For development purposes:
+     * - currentSolver::EsirkepovNative< SHAPE, STRATEGY > : generic version of currentSolverEsirkepov
+     *   without optimization (~4x slower and needs more shared memory)
+     *
+     * STRATEGY (optional):
+     * - currentSolver::strategy::StridedCachedSupercells
+     * - currentSolver::strategy::StridedCachedSupercellsScaled<N> with N >= 1
+     * - currentSolver::strategy::CachedSupercells
+     * - currentSolver::strategy::CachedSupercellsScaled<N> with N >= 1
+     * - currentSolver::strategy::NonCachedSupercells
+     * - currentSolver::strategy::NonCachedSupercellsScaled<N> with N >= 1
+     */
+    using UsedParticleCurrentSolver = currentSolver::Esirkepov<UsedParticleShape>;
+
+    /** particle pusher configuration
+     *
+     * Defining a pusher is optional for particles
+     *
+     * - particles::pusher::HigueraCary : Higuera & Cary's relativistic pusher preserving both volume and ExB velocity
+     * - particles::pusher::Vay : Vay's relativistic pusher preserving ExB velocity
+     * - particles::pusher::Boris : Boris' relativistic pusher preserving volume
+     * - particles::pusher::ReducedLandauLifshitz : 4th order Runge-Kutta pusher
+     *                                              with classical radiation reaction
+     * - particles::pusher::Composite : composite of two given pushers,
+     *                                  switches between using one (or none) of those
+     *
+     * For diagnostics & modeling: ------------------------------------------------
+     * - particles::pusher::Acceleration : Accelerate particles by applying a constant electric field
+     * - particles::pusher::Free : free propagation, ignore fields
+     *                             (= free stream model)
+     * - particles::pusher::Photon : propagate with c in direction of normalized mom.
+     * - particles::pusher::Probe : Probe particles that interpolate E & B
+     * For development purposes: --------------------------------------------------
+     * - particles::pusher::Axel : a pusher developed at HZDR during 2011 (testing)
+     */
+    using UsedParticlePusher = particles::pusher::Boris;
+
+} // namespace picongpu
diff --git a/share/picongpu/tests/XrayScattering/include/picongpu/param/speciesDefinition.param b/share/picongpu/tests/XrayScattering/include/picongpu/param/speciesDefinition.param
new file mode 100644
index 0000000000..ffd9cf762b
--- /dev/null
+++ b/share/picongpu/tests/XrayScattering/include/picongpu/param/speciesDefinition.param
@@ -0,0 +1,120 @@
+/* Copyright 2013-2021 Rene Widera, Benjamin Worpitz, Heiko Burau, Pawel Ordyna
+ *
+ * This file is part of PIConGPU.
+ *
+ * PIConGPU is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * PIConGPU is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with PIConGPU.
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/** @file
+ *
+ * Define particle species.
+ *
+ * This file collects all previous declarations of base (reference) quantities
+ * and configured solvers for species and defines particle species. This
+ * includes "attributes" (lvalues to store with each species) and "flags"
+ * (rvalues & aliases for solvers to perform with the species for each timestep
+ * and ratios to base quantities). With those information, a `Particles` class
+ * is defined for each species and then collected in the list
+ * `VectorAllSpecies`.
+ */
+
+#pragma once
+
+#include "picongpu/simulation_defines.hpp"
+#include "picongpu/particles/Particles.hpp"
+
+#include <pmacc/particles/Identifier.hpp>
+#include <pmacc/meta/conversion/MakeSeq.hpp>
+#include <pmacc/identifier/value_identifier.hpp>
+#include <pmacc/particles/traits/FilterByFlag.hpp>
+#include <pmacc/meta/String.hpp>
+
+namespace picongpu
+{
+    /*########################### define particle attributes #####################*/
+
+    /** attributes stored with each particle: position, momentum and weighting */
+    using DefaultParticleAttributes = MakeSeq_t<position<position_pic>, momentum, weighting>;
+
+    /*########################### end particle attributes ########################*/
+
+    /*########################### define species #################################*/
+
+    /*--------------------------- electrons --------------------------------------*/
+
+    /* mass and charge as ratios relative to BASE_MASS and BASE_CHARGE */
+    value_identifier(float_X, MassRatioElectrons, 1.0);
+    value_identifier(float_X, ChargeRatioElectrons, 1.0);
+
+    using ParticleFlagsElectrons = MakeSeq_t<
+        particlePusher<UsedParticlePusher>,
+        shape<UsedParticleShape>,
+        interpolation<UsedField2Particle>,
+        current<UsedParticleCurrentSolver>,
+        massRatio<MassRatioElectrons>,
+        chargeRatio<ChargeRatioElectrons>
+#if(ENABLE_SYNCHROTRON_PHOTONS == 1) // NOTE(review): PIC_Photons is not declared in this file
+        ,
+        synchrotronPhotons<PIC_Photons>
+#endif
+        >;
+
+    /* define species electrons, named "e", with the default attributes */
+    using PIC_Electrons = Particles<PMACC_CSTRING("e"), ParticleFlagsElectrons, DefaultParticleAttributes>;
+
+    /*--------------------------- ions -------------------------------------------*/
+
+    /* ratio relative to BASE_CHARGE and BASE_MASS; values presumably for copper (Z = 29) - TODO confirm */
+    value_identifier(float_X, MassRatioIons, 115837);
+    value_identifier(float_X, ChargeRatioIons, -29.0);
+
+    /* ratio relative to BASE_DENSITY */
+    value_identifier(float_X, DensityRatioIons, 1.0);
+
+    using ParticleFlagsIons = MakeSeq_t<
+        particlePusher<UsedParticlePusher>,
+        shape<UsedParticleShape>,
+        interpolation<UsedField2Particle>,
+        current<UsedParticleCurrentSolver>,
+        massRatio<MassRatioIons>,
+        chargeRatio<ChargeRatioIons>,
+        densityRatio<DensityRatioIons>,
+        atomicNumbers<ionization::atomicNumbers::Copper_t>>;
+
+    /* define species ions, named "i"; adds the boundElectrons attribute to the default attributes */
+    using PIC_Ions
+        = Particles<PMACC_CSTRING("i"), ParticleFlagsIons, MakeSeq_t<DefaultParticleAttributes, boundElectrons>>;
+
+/*########################### end species ####################################*/
+
+/* enable (1) or disable (0) ions; disabled unless PARAM_IONS is predefined */
+#ifndef PARAM_IONS
+#    define PARAM_IONS 0
+#endif
+
+    /** All known particle species of the simulation
+     *
+     * List all defined particle species from above in this list to make them available
+     * to the PIC algorithm. PIC_Ions is only included when PARAM_IONS == 1.
+     */
+    using VectorAllSpecies = MakeSeq_t<
+        PIC_Electrons
+#if(PARAM_IONS == 1)
+        ,
+        PIC_Ions
+#endif
+        >;
+
+} // namespace picongpu
diff --git a/share/picongpu/tests/XrayScattering/include/picongpu/param/speciesInitialization.param b/share/picongpu/tests/XrayScattering/include/picongpu/param/speciesInitialization.param
new file mode 100644
index 0000000000..560b5f9cd6
--- /dev/null
+++ b/share/picongpu/tests/XrayScattering/include/picongpu/param/speciesInitialization.param
@@ -0,0 +1,52 @@
+/* Copyright 2015-2021 Rene Widera, Axel Huebl
+ *
+ * This file is part of PIConGPU.
+ *
+ * PIConGPU is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * PIConGPU is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with PIConGPU.
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/** @file
+ *
+ * Initialize particles inside particle species. This is the final step in
+ * setting up particles (defined in `speciesDefinition.param`) via density
+ * profiles (defined in `density.param`). One can then further derive particles
+ * from one species to another and manipulate attributes with "manipulators"
+ * and "filters" (defined in `particle.param` and `particleFilters.param`).
+ */
+
+#pragma once
+
+#include "picongpu/particles/InitFunctors.hpp"
+
+
+namespace picongpu
+{
+    namespace particles
+    {
+        /** InitPipeline defines in which order species are initialized
+         *
+         * the functors are called in order (from first to last functor)
+         */
+        using InitPipeline = bmpl::vector<
+            CreateDensity<densityProfiles::UsedDensity, startPosition::OnePosition, PIC_Electrons>
+#if(PARAM_IONS == 1)
+            ,
+            Derive<PIC_Electrons, PIC_Ions>,
+            Manipulate<manipulators::OnceIonized, PIC_Ions>
+#endif
+            >;
+
+    } // namespace particles
+} // namespace picongpu
diff --git a/share/picongpu/tests/XrayScattering/include/picongpu/param/xrayScattering.param b/share/picongpu/tests/XrayScattering/include/picongpu/param/xrayScattering.param
new file mode 100644
index 0000000000..27e35241b7
--- /dev/null
+++ b/share/picongpu/tests/XrayScattering/include/picongpu/param/xrayScattering.param
@@ -0,0 +1,74 @@
+/* Copyright 2020-2021 Pawel Ordyna
+ *
+ * This file is part of PIConGPU.
+ *
+ * PIConGPU is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * PIConGPU is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with PIConGPU.
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "picongpu/simulation_defines.hpp"
+#include "picongpu/plugins/xrayScattering/beam/Side.hpp"
+
+/* preprocessor struct generator */
+#include <pmacc/preprocessor/struct.hpp>
+
+#ifndef PARAM_ANGLES
+#    define PARAM_ANGLES 0
+#endif
+
+namespace picongpu
+{
+    namespace plugins
+    {
+        namespace xrayScattering
+        {
+            namespace beam
+            {
+                using namespace picongpu::plugins::xrayScattering::beam;
+                /* Choose from:
+                 *  - ZSide
+                 *  - YSide
+                 *  - XSide
+                 *  - ZRSide
+                 *  - YRSide
+                 *  - XRSide
+                 */
+                using ProbingSide = ZSide;
+
+                PMACC_STRUCT(
+                    RotationParam,
+                    (PMACC_C_VALUE(
+                        float_X,
+                        yawAngle,
+#if PARAM_ANGLES == 1
+                        20.0_X / 180.0_X * pmacc::math::Pi<float_X>::value
+#else
+                        0.0_X
+#endif
+                        ))(
+                        PMACC_C_VALUE(
+                            float_X,
+                            pitchAngle,
+#if PARAM_ANGLES == 1
+                            42.0_X / 180.0_X * pmacc::math::Pi<float_X>::value
+#else
+                            0.0_X
+#endif
+                            )));
+            } // namespace beam
+        } // namespace xrayScattering
+    } // namespace plugins
+} // namespace picongpu
diff --git a/share/picongpu/tests/XrayScattering/lib/python/picongpu/checks.py b/share/picongpu/tests/XrayScattering/lib/python/picongpu/checks.py
new file mode 100644
index 0000000000..5a84fd9bd1
--- /dev/null
+++ b/share/picongpu/tests/XrayScattering/lib/python/picongpu/checks.py
@@ -0,0 +1,81 @@
+from os.path import join
+import numpy as np
+import openpmd_api as api
+from picongpu.plugins.data import XrayScatteringData
+from is_close import is_close
+
+
+def compare_with_fft(species, bound_electrons, rotation=None):
+    simulation_path = '../../../../'
+
+    # Load pluginOutput
+    xray_scattering_data = XrayScatteringData(simulation_path, species, 'h5')
+    amplitude = xray_scattering_data.get(iteration=0)
+    del xray_scattering_data
+
+    # Load density
+    internal_path = 'simOutput/h5'
+    file_name = 'simData_%T.h5'
+    path_output = join(simulation_path, internal_path, file_name)
+    series_output = api.Series(path_output, api.Access_Type.read_only)
+    i = series_output.iterations[0]
+    e_mesh = i.meshes['e_density']
+    ed = e_mesh[api.Mesh_Record_Component.SCALAR]
+    electron_density = ed.load_chunk()
+    # ions have the same density in this setup
+    electron_density *= bound_electrons
+    series_output.flush()
+
+    # Transform data
+    # (SideZ)
+    if electron_density.ndim == 3:
+        # zyx(openPMD) -> xyz(PIC) -> yxz(beam Side z)
+        electron_density = np.moveaxis(electron_density, (0, 1, 2), (2, 0, 1))
+    # for dim == 2 nothing changes xy are swiped twice.
+    if rotation is not None:
+        electron_density = rotation(electron_density)
+    fft = np.fft.fftn(electron_density)
+    if electron_density.ndim == 3:
+        fft = fft[:, :, 0]  # Take the z=0 slice.
+    fft = np.fft.fftshift(fft)
+    # Now some magic. Since x_beam = -1 * y_PIC (side z) we need to do the
+    # equivalent transformation q_x -> -q_x. The [1:,:] is necessary since the
+    # fft output has one extra, mismatching after reflection, frequency. It is
+    # left out of the comparision.
+    fft, amplitude = fft[1:, 1:], amplitude[1:, 1:]
+    fft = fft[::-1, :]
+
+    fft = fft.astype(amplitude.dtype.type)
+    if amplitude.real.dtype.type is np.float32:
+        params = {"abs_tolerance": 1e-1,
+                  "threshold": 1e-1, "rel_tolerance": 1e-1}
+    elif amplitude.real.dtype.type is np.float64:
+        params = {"abs_tolerance": 1e-8,
+                  "threshold": 1e-8, "rel_tolerance": 1e-8}
+    else:
+        raise TypeError
+
+    check_real = is_close(amplitude.real, fft.real, **params)
+    check_imag = is_close(amplitude.imag, fft.imag, **params)
+    return check_real and check_imag
+
+
+def check_summation():
+    simulation_path = '../../../../'
+    # Load pluginOutput
+    xray_scattering_data = XrayScatteringData(simulation_path, 'e', 'h5')
+    amplitude0 = xray_scattering_data.get(iteration=0)
+    amplitude1 = xray_scattering_data.get(iteration=1)
+    del xray_scattering_data
+    difference = amplitude1 - amplitude0
+    if amplitude0.real.dtype.type is np.float32:
+        params = {"abs_tolerance": 1e-4,
+                  "threshold": 1e-2, "rel_tolerance": 1e-3}
+    elif amplitude0.real.dtype.type is np.float64:
+        params = {"abs_tolerance": 1e-12,
+                  "threshold": 1e-11, "rel_tolerance": 1e-11}
+    else:
+        raise TypeError
+    real_check = is_close(difference.real, amplitude0.real, **params)
+    imag_check = is_close(difference.imag, amplitude0.imag, **params)
+    return real_check and imag_check
diff --git a/share/picongpu/tests/XrayScattering/lib/python/picongpu/is_close.py b/share/picongpu/tests/XrayScattering/lib/python/picongpu/is_close.py
new file mode 100644
index 0000000000..001f5dc6b9
--- /dev/null
+++ b/share/picongpu/tests/XrayScattering/lib/python/picongpu/is_close.py
@@ -0,0 +1,10 @@
+import numpy as np
+
+
+def is_close(input1, input2, abs_tolerance, threshold, rel_tolerance):
+    assert input1.dtype.type is input2.dtype.type
+    diff = np.abs(input1 - input2)
+    check0 = np.minimum(np.abs(input1), np.abs(input2)) < threshold
+    check1 = diff < abs_tolerance
+    check2 = diff < rel_tolerance * np.maximum(np.abs(input1), np.abs(input2))
+    return np.all(np.logical_or(np.logical_and(check0, check1), check2))
diff --git a/share/picongpu/tests/XrayScattering/lib/python/picongpu/test_1.py b/share/picongpu/tests/XrayScattering/lib/python/picongpu/test_1.py
new file mode 100644
index 0000000000..52c92acc7a
--- /dev/null
+++ b/share/picongpu/tests/XrayScattering/lib/python/picongpu/test_1.py
@@ -0,0 +1,23 @@
+from checks import compare_with_fft
+from checks import check_summation
+
+
+def main():
+
+    electrons_check = compare_with_fft('e', 1)
+    ions_check = compare_with_fft('i', 28)
+    summation_check = check_summation()
+    if summation_check and electrons_check and ions_check:
+        print("All tests passed.")
+    else:
+        print("Some tests didn't pass.")
+        print("electrons test {})"
+              "".format(electrons_check))
+        print("ion test {}"
+              "".format(ions_check))
+        print("check summation test {}"
+              "".format(summation_check))
+
+
+if __name__ == '__main__':
+    main()
diff --git a/share/picongpu/tests/XrayScattering/lib/python/picongpu/test_2.py b/share/picongpu/tests/XrayScattering/lib/python/picongpu/test_2.py
new file mode 100644
index 0000000000..62ebc1531c
--- /dev/null
+++ b/share/picongpu/tests/XrayScattering/lib/python/picongpu/test_2.py
@@ -0,0 +1,16 @@
+from checks import compare_with_fft
+
+
+def main():
+
+    electrons_check = compare_with_fft('e', 1)
+    if electrons_check:
+        print("All tests passed.")
+    else:
+        print("Some tests didn't pass.")
+        print("electrons test {}"
+              "".format(electrons_check))
+
+
+if __name__ == '__main__':
+    main()
diff --git a/share/picongpu/tests/compileCurrentSolver/README.rst b/share/picongpu/tests/compileCurrentSolver/README.rst
new file mode 100644
index 0000000000..d1ffee0bd2
--- /dev/null
+++ b/share/picongpu/tests/compileCurrentSolver/README.rst
@@ -0,0 +1,5 @@
+Compile Test for Selected Species Solver
+========================================
+
+This test compiles the current solvers for different particle shapes.
+Particle pushers are checked in the example SingleParticleTest.
diff --git a/share/picongpu/tests/compileCurrentSolver/cmakeFlags b/share/picongpu/tests/compileCurrentSolver/cmakeFlags
new file mode 100755
index 0000000000..c0bb0d6f6e
--- /dev/null
+++ b/share/picongpu/tests/compileCurrentSolver/cmakeFlags
@@ -0,0 +1,51 @@
+#!/usr/bin/env bash
+#
+# Copyright 2013-2021 Axel Huebl, Rene Widera
+#
+# This file is part of PIConGPU.
+#
+# PIConGPU is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# PIConGPU is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with PIConGPU.
+# If not, see <http://www.gnu.org/licenses/>.
+#
+
+#
+# generic compile options
+#
+
+################################################################################
+# add presets here
+#   - default: index 0
+#   - start with zero index
+#   - increase by 1, no gaps
+
+flags[0]="-DPARAM_OVERWRITES:LIST='-DPARAM_CURRENTSOLVER=EmZ<UsedParticleShape>;-DPARAM_PARTICLESHAPE=PQS'"
+flags[1]="-DPARAM_OVERWRITES:LIST='-DPARAM_CURRENTSOLVER=EmZ<UsedParticleShape>;-DPARAM_PARTICLESHAPE=PCS;-DPARAM_DIMENSION=DIM2'"
+# Esirkepov and TSC is tested in most examples
+flags[2]="-DPARAM_OVERWRITES:LIST='-DPARAM_CURRENTSOLVER=Esirkepov<UsedParticleShape>;-DPARAM_PARTICLESHAPE=CIC'"
+flags[3]="-DPARAM_OVERWRITES:LIST='-DPARAM_CURRENTSOLVER=Esirkepov<UsedParticleShape>;-DPARAM_PARTICLESHAPE=TSC;-DPARAM_CURRENTINTERPOLATION=Binomial'"
+flags[4]="-DPARAM_OVERWRITES:LIST='-DPARAM_CURRENTSOLVER=Esirkepov<UsedParticleShape>;-DPARAM_PARTICLESHAPE=PQS;-DPARAM_DIMENSION=DIM2'"
+flags[5]="-DPARAM_OVERWRITES:LIST='-DPARAM_CURRENTSOLVER=VillaBune<>;-DPARAM_PARTICLESHAPE=CIC'"
+
+
+################################################################################
+# execution
+
+case "$1" in
+    -l)  echo ${#flags[@]}  # print the number of presets
+         ;;
+    -ll) for f in "${flags[@]}"; do echo $f; done  # print all presets, one per line
+         ;;
+    *)   echo -n ${flags[$1]}  # print the preset with index $1, without newline
+         ;;
+esac
diff --git a/share/picongpu/tests/compileCurrentSolver/include/picongpu/param/density.param b/share/picongpu/tests/compileCurrentSolver/include/picongpu/param/density.param
new file mode 100644
index 0000000000..bed7ea6308
--- /dev/null
+++ b/share/picongpu/tests/compileCurrentSolver/include/picongpu/param/density.param
@@ -0,0 +1,46 @@
+/* Copyright 2013-2021 Axel Huebl, Heiko Burau, Rene Widera, Felix Schmitt,
+ *                     Richard Pausch
+ *
+ * This file is part of PIConGPU.
+ *
+ * PIConGPU is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * PIConGPU is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with PIConGPU.
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "picongpu/particles/densityProfiles/profiles.def"
+
+
+namespace picongpu
+{
+    namespace SI
+    {
+        /** Base density in particles per m^3 in the density profiles.
+         *
+         * This is often taken as reference maximum density in normalized profiles.
+         * Individual particle species can define a `densityRatio` flag relative
+         * to this value.
+         *
+         * unit: ELEMENTS/m^3
+         */
+        constexpr float_64 BASE_DENSITY_SI = 1.e25;
+    } // namespace SI
+
+    namespace densityProfiles
+    {
+        /* definition of homogenous profile */
+        using Homogenous = HomogenousImpl;
+    } // namespace densityProfiles
+} // namespace picongpu
diff --git a/share/picongpu/tests/compileCurrentSolver/include/picongpu/param/dimension.param b/share/picongpu/tests/compileCurrentSolver/include/picongpu/param/dimension.param
new file mode 100644
index 0000000000..9cda9d9a01
--- /dev/null
+++ b/share/picongpu/tests/compileCurrentSolver/include/picongpu/param/dimension.param
@@ -0,0 +1,31 @@
+/* Copyright 2014-2021 Axel Huebl, Rene Widera
+ *
+ * This file is part of PIConGPU.
+ *
+ * PIConGPU is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * PIConGPU is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with PIConGPU.
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#ifndef PARAM_DIMENSION
+#    define PARAM_DIMENSION DIM3
+#endif
+
+#define SIMDIM PARAM_DIMENSION
+
+namespace picongpu
+{
+    constexpr uint32_t simDim = SIMDIM;
+} // namespace picongpu
diff --git a/share/picongpu/tests/compileCurrentSolver/include/picongpu/param/fieldSolver.param b/share/picongpu/tests/compileCurrentSolver/include/picongpu/param/fieldSolver.param
new file mode 100644
index 0000000000..c082c8add3
--- /dev/null
+++ b/share/picongpu/tests/compileCurrentSolver/include/picongpu/param/fieldSolver.param
@@ -0,0 +1,85 @@
+/* Copyright 2013-2021 Axel Huebl, Heiko Burau, Rene Widera, Sergei Bastrakov, Klaus Steiniger
+ *
+ * This file is part of PIConGPU.
+ *
+ * PIConGPU is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * PIConGPU is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with PIConGPU.
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/** @file
+ *
+ * Configure the field solver.
+ *
+ * Select the numerical Maxwell solver (e.g. Yee's method).
+ *
+ * Also allows to configure ad hoc mitigations for high frequency
+ * noise in some setups via current smoothing.
+ *
+ * \attention
+ * Currently, the laser initialization in PIConGPU is implemented to work with the standard Yee solver.
+ * Using a solver of higher order will result in a slightly increased laser amplitude and energy than expected.
+ *
+ */
+
+#pragma once
+
+#include "picongpu/fields/MaxwellSolver/Solvers.def"
+#include "picongpu/fields/currentInterpolation/CurrentInterpolation.def"
+
+
+namespace picongpu
+{
+    namespace fields
+    {
+        /** Current Interpolation
+         *
+         * CurrentInterpolation is used to set a method performing the
+         * interpolate/assign operation from the generated currents of particle
+         * species to the electro-magnetic fields.
+         *
+         * Allowed values are:
+         *   - None:
+         *     - default for staggered grids/Yee-scheme
+         *     - updates E
+         *   - Binomial: 2nd order Binomial filter
+         *     - smooths the current before assignment in staggered grid
+         *     - updates E & breaks local charge conservation slightly
+         */
+#ifndef PARAM_CURRENTINTERPOLATION
+#    define PARAM_CURRENTINTERPOLATION None
+#endif
+        using CurrentInterpolation = currentInterpolation::PARAM_CURRENTINTERPOLATION;
+
+        /** FieldSolver
+         *
+         * Field Solver Selection:
+         *  - Yee< CurrentInterpolation > : Standard Yee solver approximating derivatives with respect to time and
+         * space by second order finite differences.
+         *  - YeePML< CurrentInterpolation >: Standard Yee solver using Perfectly Matched Layer Absorbing Boundary
+         * Conditions (PML)
+         *  - Lehe< CurrentInterpolation >: Num. Cherenkov free field solver in a chosen direction
+         *  - LehePML< CurrentInterpolation >: Num. Cherenkov free field solver in a chosen direction
+         *                                     using Perfectly Matched Layer Absorbing Boundary Conditions (PML)
+         *  - ArbitraryOrderFDTD< 4, CurrentInterpolation >: Solver using 4 neighbors to each direction to approximate
+         * *spatial* derivatives by finite differences. The number of neighbors can be changed from 4 to any positive,
+         * integer number. The order of the solver will be twice the number of neighbors in each direction. Yee's
+         * method is a special case of this using one neighbor to each direction.
+         *  - ArbitraryOrderFDTDPML< 4, CurrentInterpolation >: ArbitraryOrderFDTD solver using Perfectly Matched Layer
+         *                                                      Absorbing Boundary Conditions (PML)
+         *  - None< CurrentInterpolation >: disable the vacuum update of E and B
+         */
+        using Solver = maxwellSolver::Yee<CurrentInterpolation>;
+
+    } // namespace fields
+} // namespace picongpu
diff --git a/share/picongpu/tests/compileCurrentSolver/include/picongpu/param/fileOutput.param b/share/picongpu/tests/compileCurrentSolver/include/picongpu/param/fileOutput.param
new file mode 100644
index 0000000000..18a50ebfa6
--- /dev/null
+++ b/share/picongpu/tests/compileCurrentSolver/include/picongpu/param/fileOutput.param
@@ -0,0 +1,56 @@
+/* Copyright 2013-2021 Axel Huebl, Rene Widera, Felix Schmitt,
+ *                     Benjamin Worpitz, Richard Pausch
+ *
+ * This file is part of PIConGPU.
+ *
+ * PIConGPU is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * PIConGPU is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with PIConGPU.
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include <pmacc/meta/conversion/MakeSeq.hpp>
+
+/* some forward declarations we need */
+#include "picongpu/fields/Fields.def"
+#include "picongpu/particles/particleToGrid/ComputeGridValuePerFrame.def"
+
+#include <boost/mpl/vector.hpp>
+
+
+namespace picongpu
+{
+    /** FieldTmpSolvers groups all solvers that create data for FieldTmp ******
+     *
+     * FieldTmpSolvers is used in @see FieldTmp to calculate the exchange size
+     */
+    using FieldTmpSolvers = MakeSeq_t<>;
+
+    /** FileOutputFields: Groups all Fields that shall be dumped *************/
+
+    /** Possible native fields: FieldE, FieldB, FieldJ
+     */
+    using NativeFileOutputFields = MakeSeq_t<>;
+
+    using FileOutputFields = MakeSeq_t<>;
+
+
+    /** FileOutputParticles: Groups all Species that shall be dumped **********
+     *
+     * hint: to disable particle output set to
+     *   using FileOutputParticles = MakeSeq_t< >;
+     */
+    using FileOutputParticles = MakeSeq_t<>;
+
+} // namespace picongpu
diff --git a/share/picongpu/tests/compileCurrentSolver/include/picongpu/param/isaac.param b/share/picongpu/tests/compileCurrentSolver/include/picongpu/param/isaac.param
new file mode 100644
index 0000000000..af98f960f6
--- /dev/null
+++ b/share/picongpu/tests/compileCurrentSolver/include/picongpu/param/isaac.param
@@ -0,0 +1,57 @@
+/* Copyright 2016-2021 Alexander Matthes
+ *
+ * This file is part of PIConGPU.
+ *
+ * PIConGPU is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * PIConGPU is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with PIConGPU.
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/** @file
+ *
+ * Definition which native fields and density fields of particles will be
+ * visualizable with ISAAC. ISAAC is an in-situ visualization library with which
+ * the PIC simulation can be observed while it is running avoiding the time
+ * consuming writing and reading of simulation data for the classical post
+ * processing of data.
+ *
+ * ISAAC can directly visualize native fields like the E or B field, but
+ * density fields of particles need to be calculated by PIConGPU on the fly
+ * which slightly increases the runtime and the memory consumption. Every
+ * particle density field will reduce the amount of memory left for PIConGPU's
+ * particles and fields.
+ *
+ * To get best performance, ISAAC defines an exponential amount of different
+ * visualization kernels for every combination of (at runtime) activated
+ * fields. So furthermore a lot of fields will increase the compilation time.
+ *
+ */
+
+#pragma once
+
+namespace picongpu
+{
+    namespace isaacP
+    {
+        /** Intermediate list of native particle species of PIConGPU which shall be
+         *  visualized. */
+        using Particle_Seq = MakeSeq_t<>;
+
+
+        /** Compile time sequence of all fields which shall be visualized. Basically
+         *  the join of Native_Seq and Density_Seq. */
+        using Fields_Seq = MakeSeq_t<>;
+
+
+    } // namespace isaacP
+} // namespace picongpu
diff --git a/share/picongpu/tests/compileCurrentSolver/include/picongpu/param/particle.param b/share/picongpu/tests/compileCurrentSolver/include/picongpu/param/particle.param
new file mode 100644
index 0000000000..792e7acad0
--- /dev/null
+++ b/share/picongpu/tests/compileCurrentSolver/include/picongpu/param/particle.param
@@ -0,0 +1,85 @@
+/* Copyright 2013-2021 Axel Huebl, Rene Widera, Benjamin Worpitz,
+ *                     Richard Pausch
+ *
+ * This file is part of PIConGPU.
+ *
+ * PIConGPU is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * PIConGPU is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with PIConGPU.
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "picongpu/particles/startPosition/functors.def"
+#include "picongpu/particles/manipulators/manipulators.def"
+#include "picongpu/particles/filter/filter.def"
+
+#include <pmacc/nvidia/functors/Assign.hpp>
+
+namespace picongpu
+{
+    namespace particles
+    {
+        namespace startPosition
+        {
+            struct QuietParam25ppc
+            {
+                /** Count of particles per cell per direction at initial state
+                 *  unit: none
+                 */
+                using numParticlesPerDimension = typename mCT::shrinkTo<mCT::Int<5, 5, 1>, simDim>::type;
+            };
+            using Quiet25ppc = QuietImpl<QuietParam25ppc>;
+
+        } // namespace startPosition
+
+        /** a particle with a weighting below MIN_WEIGHTING will not
+         *      be created / will be deleted
+         *  unit: none
+         */
+        constexpr float_X MIN_WEIGHTING = 10.0;
+
+        /** During unit normalization, we assume this is a typical
+         *  number of particles per cell for normalization of weighted
+         *  particle attributes.
+         */
+        constexpr uint32_t TYPICAL_PARTICLES_PER_CELL
+            = mCT::volume<startPosition::QuietParam25ppc::numParticlesPerDimension>::type::value;
+
+        namespace manipulators
+        {
+            CONST_VECTOR(float_X, 3, DriftParamPositive_direction, 1.0, 0.0, 0.0);
+            struct DriftParamPositive
+            {
+                /** Initial particle drift velocity for electrons and ions
+                 *  Examples:
+                 *    - No drift is equal to 1.0
+                 *  unit: none
+                 */
+                static constexpr float_64 gamma = 1.021;
+                const DriftParamPositive_direction_t direction;
+            };
+            using AssignXDriftPositive = unary::Drift<DriftParamPositive, nvidia::functors::Assign>;
+
+            struct TemperatureParam
+            {
+                /* Initial temperature
+                 *  unit: keV
+                 */
+                static constexpr float_64 temperature = 0.0005;
+            };
+            using AddTemperature = unary::Temperature<TemperatureParam>;
+
+        } // namespace manipulators
+    } // namespace particles
+} // namespace picongpu
diff --git a/share/picongpu/tests/compileCurrentSolver/include/picongpu/param/particleFilters.param b/share/picongpu/tests/compileCurrentSolver/include/picongpu/param/particleFilters.param
new file mode 100644
index 0000000000..5d6ad5c91b
--- /dev/null
+++ b/share/picongpu/tests/compileCurrentSolver/include/picongpu/param/particleFilters.param
@@ -0,0 +1,83 @@
+/* Copyright 2013-2021 Rene Widera
+ *
+ * This file is part of PIConGPU.
+ *
+ * PIConGPU is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * PIConGPU is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with PIConGPU.
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/** @file
+ *
+ * A common task in both modeling and in situ processing (output) is the
+ * selection of particles of a particle species by attributes. Users can
+ * define such selections as particle filters in this file.
+ *
+ * Particle filters are simple mappings assigning each particle of a species
+ * either `true` or `false` (ignore / filter out).
+ *
+ * All active filters need to be listed in `AllParticleFilters`. They are then
+ * combined with `VectorAllSpecies` at compile-time, e.g. for plugins.
+ */
+
+#pragma once
+
+#include "picongpu/particles/filter/filter.def"
+#include "picongpu/particles/traits/SpeciesEligibleForSolver.hpp"
+
+#include <pmacc/traits/HasIdentifiers.hpp>
+#include <pmacc/traits/HasFlag.hpp>
+
+
+namespace picongpu
+{
+    namespace particles
+    {
+        namespace filter
+        {
+            struct IfRelativeGlobalPositionParamLowQuarterPosition
+            {
+                /* lowerBound is included in the range */
+                static constexpr float_X lowerBound = 0.0;
+                /* upperBound is excluded from the range */
+                static constexpr float_X upperBound = 0.25;
+                /* dimension for the filter
+                 * x = 0; y= 1; z = 2
+                 */
+                static constexpr uint32_t dimension = 1u;
+
+                // filter name
+                static constexpr char const* name = "lowerQuarterYPosition";
+            };
+
+            using LowerQuarterYPosition
+                = filter::RelativeGlobalDomainPosition<IfRelativeGlobalPositionParamLowQuarterPosition>;
+
+            /** Plugins: collection of all available particle filters
+             *
+             * Create a list of all filters here that you want to use in plugins.
+             *
+             * Note: filter All is defined in picongpu/particles/filter/filter.def
+             */
+            using AllParticleFilters = MakeSeq_t<All, LowerQuarterYPosition>;
+
+        } // namespace filter
+
+        namespace traits
+        {
+            /* if needed for generic "free" filters,
+             * place `SpeciesEligibleForSolver` traits for filters here
+             */
+        } // namespace traits
+    } // namespace particles
+} // namespace picongpu
diff --git a/share/picongpu/tests/compileCurrentSolver/include/picongpu/param/precision.param b/share/picongpu/tests/compileCurrentSolver/include/picongpu/param/precision.param
new file mode 100644
index 0000000000..162c25da0d
--- /dev/null
+++ b/share/picongpu/tests/compileCurrentSolver/include/picongpu/param/precision.param
@@ -0,0 +1,59 @@
+/* Copyright 2013-2021 Rene Widera
+ *
+ * This file is part of PIConGPU.
+ *
+ * PIConGPU is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * PIConGPU is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with PIConGPU.
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/** @file
+ *
+ * Define the precision of typically used floating point types in the
+ * simulation.
+ *
+ * PIConGPU normalizes input automatically, allowing to use single-precision by
+ * default for the core algorithms. Note that implementations of various
+ * algorithms (usually plugins or non-core components) might still decide to
+ * hard-code a different (mixed) precision for some critical operations.
+ */
+
+#pragma once
+
+
+namespace picongpu
+{
+/*! Select a precision for the simulation data (override the default by defining PARAM_PRECISION)
+ *  - precision32Bit : use 32Bit floating point numbers
+ *                     [significant digits 7 to 8]
+ *  - precision64Bit : use 64Bit floating point numbers
+ *                     [significant digits 15 to 16]
+ */
+#ifndef PARAM_PRECISION
+#    define PARAM_PRECISION precision32Bit
+#endif
+    namespace precisionPIConGPU = PARAM_PRECISION;
+
+    /*! Select a precision for special operations (can be different from simulation precision)
+     *  - precisionPIConGPU : use the simulation precision selected above
+     *  - precision32Bit    : use 32Bit floating point numbers
+     *  - precision64Bit    : use 64Bit floating point numbers
+     */
+    namespace precisionSqrt = precisionPIConGPU;
+    namespace precisionExp = precisionPIConGPU;
+    namespace precisionTrigonometric = precisionPIConGPU;
+
+
+} // namespace picongpu
+
+#include "picongpu/unitless/precision.unitless"
diff --git a/share/picongpu/tests/compileCurrentSolver/include/picongpu/param/species.param b/share/picongpu/tests/compileCurrentSolver/include/picongpu/param/species.param
new file mode 100644
index 0000000000..c25d0d838a
--- /dev/null
+++ b/share/picongpu/tests/compileCurrentSolver/include/picongpu/param/species.param
@@ -0,0 +1,112 @@
+/* Copyright 2014-2021 Rene Widera, Richard Pausch, Annegret Roeszler, Klaus Steiniger
+ *
+ * This file is part of PIConGPU.
+ *
+ * PIConGPU is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * PIConGPU is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with PIConGPU.
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/** @file
+ *
+ * Particle shape, field to particle interpolation, current solver, and particle pusher
+ * can be declared here for usage in `speciesDefinition.param`.
+ *
+ * @see
+ *   **MODELS / Hierarchy of Charge Assignment Schemes**
+ *   in the online documentation for information on particle shapes.
+ *
+ *
+ * \attention
+ * The higher order shape names are redefined with release 0.6.0 in order to provide a consistent naming:
+ *     * PQS is the name of the 3rd order assignment function (instead of PCS)
+ *     * PCS is the name of the 4th order assignment function (instead of P4S)
+ *     * P4S does not exist anymore
+ */
+
+#pragma once
+
+#include "picongpu/particles/shapes.hpp"
+#include "picongpu/algorithms/FieldToParticleInterpolationNative.hpp"
+#include "picongpu/algorithms/FieldToParticleInterpolation.hpp"
+#include "picongpu/algorithms/AssignedTrilinearInterpolation.hpp"
+#include "picongpu/particles/flylite/NonLTE.def"
+#include "picongpu/fields/currentDeposition/Solver.def"
+
+
+namespace picongpu
+{
+/** select macroparticle shape (default TSC, override by defining PARAM_PARTICLESHAPE)
+ *
+ * **WARNING** the shape names are redefined and diverge from PIConGPU versions before 0.6.0.
+ *
+ *  - particles::shapes::CIC : Assignment function is a piecewise linear spline
+ *  - particles::shapes::TSC : Assignment function is a piecewise quadratic spline
+ *  - particles::shapes::PQS : Assignment function is a piecewise cubic spline
+ *  - particles::shapes::PCS : Assignment function is a piecewise quartic spline
+ */
+#ifndef PARAM_PARTICLESHAPE
+#    define PARAM_PARTICLESHAPE TSC
+#endif
+    using UsedParticleShape = particles::shapes::PARAM_PARTICLESHAPE;
+
+    /** select the method used to interpolate grid-based field values to particle positions
+     */
+    using UsedField2Particle = FieldToParticleInterpolation<UsedParticleShape, AssignedTrilinearInterpolation>;
+
+    /*! select current solver method (default Esirkepov, override by defining PARAM_CURRENTSOLVER)
+     * - currentSolver::Esirkepov< SHAPE, STRATEGY > : particle shapes - CIC, TSC, PQS, PCS (1st to 4th order)
+     * - currentSolver::VillaBune< SHAPE, STRATEGY > : particle shapes - CIC (1st order) only
+     * - currentSolver::EmZ< SHAPE, STRATEGY >       : particle shapes - CIC, TSC, PQS, PCS (1st to 4th order)
+     *
+     * For development purposes:
+     * - currentSolver::EsirkepovNative< SHAPE, STRATEGY > : generic version of currentSolver::Esirkepov
+     *   without optimization (~4x slower and needs more shared memory)
+     *
+     * STRATEGY (optional):
+     * - currentSolver::strategy::StridedCachedSupercells
+     * - currentSolver::strategy::StridedCachedSupercellsScaled<N> with N >= 1
+     * - currentSolver::strategy::CachedSupercells
+     * - currentSolver::strategy::CachedSupercellsScaled<N> with N >= 1
+     * - currentSolver::strategy::NonCachedSupercells
+     * - currentSolver::strategy::NonCachedSupercellsScaled<N> with N >= 1
+     */
+#ifndef PARAM_CURRENTSOLVER
+#    define PARAM_CURRENTSOLVER Esirkepov<UsedParticleShape>
+#endif
+    using UsedParticleCurrentSolver = currentSolver::PARAM_CURRENTSOLVER;
+
+    /** particle pusher configuration
+     *
+     * Defining a pusher is optional for particles
+     *
+     * - particles::pusher::HigueraCary : Higuera & Cary's relativistic pusher preserving both volume and ExB velocity
+     * - particles::pusher::Vay : Vay's relativistic pusher preserving ExB velocity
+     * - particles::pusher::Boris : Boris' relativistic pusher preserving volume
+     * - particles::pusher::ReducedLandauLifshitz : 4th order RungeKutta pusher
+     *                                              with classical radiation reaction
+     * - particles::pusher::Composite : composite of two given pushers,
+     *                                  switches between using one (or none) of those
+     *
+     * For diagnostics & modeling: ------------------------------------------------
+     * - particles::pusher::Acceleration : Accelerate particles by applying a constant electric field
+     * - particles::pusher::Free : free propagation, ignore fields
+     *                             (= free stream model)
+     * - particles::pusher::Photon : propagate with c in direction of normalized momentum
+     * - particles::pusher::Probe : Probe particles that interpolate E & B
+     * For development purposes: --------------------------------------------------
+     * - particles::pusher::Axel : a pusher developed at HZDR during 2011 (testing)
+     */
+    using UsedParticlePusher = particles::pusher::Boris;
+
+} // namespace picongpu
diff --git a/share/picongpu/tests/compileCurrentSolver/include/picongpu/param/speciesDefinition.param b/share/picongpu/tests/compileCurrentSolver/include/picongpu/param/speciesDefinition.param
new file mode 100644
index 0000000000..81954926d5
--- /dev/null
+++ b/share/picongpu/tests/compileCurrentSolver/include/picongpu/param/speciesDefinition.param
@@ -0,0 +1,83 @@
+/* Copyright 2013-2021 Rene Widera, Benjamin Worpitz, Heiko Burau
+ *
+ * This file is part of PIConGPU.
+ *
+ * PIConGPU is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * PIConGPU is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with PIConGPU.
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "picongpu/simulation_defines.hpp"
+#include "picongpu/particles/Particles.hpp"
+
+#include <pmacc/particles/Identifier.hpp>
+#include <pmacc/meta/conversion/MakeSeq.hpp>
+#include <pmacc/identifier/value_identifier.hpp>
+#include <pmacc/particles/traits/FilterByFlag.hpp>
+#include <pmacc/meta/String.hpp>
+
+
+#ifndef PARAM_RADIATION
+/* disable radiation calculation */
+#    define PARAM_RADIATION 0
+#endif
+
+
+namespace picongpu
+{
+    /*########################### define particle attributes #####################*/
+
+    /** attributes of a particle (momentumPrev1 is only added if PARAM_RADIATION == 1) */
+    using DefaultParticleAttributes = MakeSeq_t<
+        position<position_pic>,
+        momentum,
+        weighting
+#if(PARAM_RADIATION == 1)
+        ,
+        momentumPrev1
+#endif
+        >;
+
+    /*########################### end particle attributes ########################*/
+
+    /*########################### define species #################################*/
+
+    /*--------------------------- electrons --------------------------------------*/
+
+    /* ratio relative to BASE_CHARGE and BASE_MASS */
+    value_identifier(float_X, MassRatioElectrons, 1.0);
+    value_identifier(float_X, ChargeRatioElectrons, 1.0);
+
+    using ParticleFlagsElectrons = MakeSeq_t<
+        particlePusher<UsedParticlePusher>,
+        shape<UsedParticleShape>,
+        interpolation<UsedField2Particle>,
+        current<UsedParticleCurrentSolver>,
+        massRatio<MassRatioElectrons>,
+        chargeRatio<ChargeRatioElectrons>>;
+
+    /* define species electrons (named "e") */
+    using PIC_Electrons = Particles<PMACC_CSTRING("e"), ParticleFlagsElectrons, DefaultParticleAttributes>;
+
+    /*########################### end species ####################################*/
+
+    /** All known particle species of the simulation
+     *
+     * List all particle species defined above in this list
+     * to make them available to the PIC algorithm.
+     */
+    using VectorAllSpecies = MakeSeq_t<PIC_Electrons>;
+
+} // namespace picongpu
diff --git a/share/picongpu/tests/compileCurrentSolver/include/picongpu/param/speciesInitialization.param b/share/picongpu/tests/compileCurrentSolver/include/picongpu/param/speciesInitialization.param
new file mode 100644
index 0000000000..065fe94be8
--- /dev/null
+++ b/share/picongpu/tests/compileCurrentSolver/include/picongpu/param/speciesInitialization.param
@@ -0,0 +1,48 @@
+/* Copyright 2015-2021 Rene Widera, Axel Huebl
+ *
+ * This file is part of PIConGPU.
+ *
+ * PIConGPU is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * PIConGPU is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with PIConGPU.
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/** @file
+ *
+ * Initialize particles inside particle species. This is the final step in
+ * setting up particles (defined in `speciesDefinition.param`) via density
+ * profiles (defined in `density.param`). One can then further derive particles
+ * from one species to another and manipulate attributes with "manipulators"
+ * and "filters" (defined in `particle.param` and `particleFilters.param`).
+ */
+
+#pragma once
+
+#include "picongpu/particles/InitFunctors.hpp"
+
+
+namespace picongpu
+{
+    namespace particles
+    {
+        /** InitPipeline defines the order in which species are initialized
+         *
+         * the functors are called in order (from first to last functor)
+         */
+        using InitPipeline = bmpl::vector<
+            CreateDensity<densityProfiles::Homogenous, startPosition::Quiet25ppc, PIC_Electrons>,
+            Manipulate<manipulators::AssignXDriftPositive, PIC_Electrons, filter::LowerQuarterYPosition>,
+            Manipulate<manipulators::AddTemperature, PIC_Electrons>>;
+
+    } // namespace particles
+} // namespace picongpu
diff --git a/share/picongpu/tests/compileFieldSolver/README.rst b/share/picongpu/tests/compileFieldSolver/README.rst
new file mode 100644
index 0000000000..2d44f77f07
--- /dev/null
+++ b/share/picongpu/tests/compileFieldSolver/README.rst
@@ -0,0 +1,5 @@
+Compile Test for Field Solver
+=============================
+
+This test compiles the field solvers for two and three dimensions.
+One electron species is required to test the current interpolation algorithms.
diff --git a/share/picongpu/tests/compileFieldSolver/cmakeFlags b/share/picongpu/tests/compileFieldSolver/cmakeFlags
new file mode 100755
index 0000000000..014ad8597d
--- /dev/null
+++ b/share/picongpu/tests/compileFieldSolver/cmakeFlags
@@ -0,0 +1,48 @@
+#!/usr/bin/env bash
+#
+# Copyright 2013-2021 Axel Huebl, Rene Widera
+#
+# This file is part of PIConGPU.
+#
+# PIConGPU is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# PIConGPU is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with PIConGPU.
+# If not, see <http://www.gnu.org/licenses/>.
+#
+
+#
+# generic compile options
+#
+
+################################################################################
+# add presets here
+#   - default: index 0
+#   - start with zero index
+#   - increase by 1, no gaps
+
+# the Yee solver is not listed here, it is tested in the example FieldAbsorberTest
+flags[0]="-DPARAM_OVERWRITES:LIST='-DPARAM_FIELDSOLVER=LehePML;-DPARAM_PRECISION=precision64Bit'"
+flags[1]="-DPARAM_OVERWRITES:LIST='-DPARAM_FIELDSOLVER=LehePML;-DPARAM_DIMENSION=DIM2'"
+flags[2]="-DPARAM_OVERWRITES:LIST='-DPARAM_FIELDSOLVER=Lehe;-DPARAM_DIMENSION=DIM2'"
+
+
+################################################################################
+# execution: -l prints the preset count, -ll lists all presets, any other argument is used as preset index
+
+case "$1" in
+    -l)  echo ${#flags[@]}
+         ;;
+    -ll) for f in "${flags[@]}"; do echo $f; done
+         ;;
+    *)   echo -n ${flags[$1]}
+         ;;
+esac
diff --git a/share/picongpu/tests/compileFieldSolver/include/picongpu/param/dimension.param b/share/picongpu/tests/compileFieldSolver/include/picongpu/param/dimension.param
new file mode 100644
index 0000000000..9cda9d9a01
--- /dev/null
+++ b/share/picongpu/tests/compileFieldSolver/include/picongpu/param/dimension.param
@@ -0,0 +1,31 @@
+/* Copyright 2014-2021 Axel Huebl, Rene Widera
+ *
+ * This file is part of PIConGPU.
+ *
+ * PIConGPU is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * PIConGPU is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with PIConGPU.
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#ifndef PARAM_DIMENSION
+#    define PARAM_DIMENSION DIM3 // default if PARAM_DIMENSION is not already defined
+#endif
+
+#define SIMDIM PARAM_DIMENSION
+
+namespace picongpu
+{
+    constexpr uint32_t simDim = SIMDIM; //!< SIMDIM as a typed compile-time constant
+} // namespace picongpu
diff --git a/share/picongpu/tests/compileFieldSolver/include/picongpu/param/fieldSolver.param b/share/picongpu/tests/compileFieldSolver/include/picongpu/param/fieldSolver.param
new file mode 100644
index 0000000000..74e9a79ce2
--- /dev/null
+++ b/share/picongpu/tests/compileFieldSolver/include/picongpu/param/fieldSolver.param
@@ -0,0 +1,82 @@
+/* Copyright 2013-2021 Axel Huebl, Heiko Burau, Rene Widera, Sergei Bastrakov, Klaus Steiniger
+ *
+ * This file is part of PIConGPU.
+ *
+ * PIConGPU is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * PIConGPU is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with PIConGPU.
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/** @file
+ *
+ * Configure the field solver.
+ *
+ * Select the numerical Maxwell solver (e.g. Yee's method).
+ *
+ * Also allows to configure ad hoc mitigations for high frequency
+ * noise in some setups via current smoothing.
+ *
+ * \attention
+ * Currently, the laser initialization in PIConGPU is implemented to work with the standard Yee solver.
+ * Using a solver of higher order will result in a slightly increased laser amplitude and energy than expected.
+ *
+ */
+
+#pragma once
+
+#include "picongpu/fields/MaxwellSolver/Solvers.def"
+#include "picongpu/fields/currentInterpolation/CurrentInterpolation.def"
+
+
+namespace picongpu
+{
+    namespace fields
+    {
+        /** Current Interpolation
+         *
+         * CurrentInterpolation is used to set a method performing the
+         * interpolate/assign operation from the generated currents of particle
+         * species to the electro-magnetic fields.
+         *
+         * Allowed values are:
+         *   - None:
+         *     - default for staggered grids/Yee-scheme
+         *     - updates E
+         *   - Binomial: 2nd order Binomial filter
+         *     - smooths the current before assignment in staggered grid
+         *     - updates E & breaks local charge conservation slightly
+         */
+        using CurrentInterpolation = currentInterpolation::None;
+
+        /** FieldSolver, chosen via PARAM_FIELDSOLVER (no default in this file, see cmakeFlags)
+         *
+         * Field Solver Selection:
+         *  - Yee< CurrentInterpolation > : Standard Yee solver approximating derivatives with respect to time and
+         * space by second order finite differences.
+         *  - YeePML< CurrentInterpolation >: Standard Yee solver using Perfectly Matched Layer Absorbing Boundary
+         * Conditions (PML)
+         *  - Lehe< CurrentInterpolation >: Num. Cherenkov free field solver in a chosen direction
+         *  - LehePML< CurrentInterpolation >: Num. Cherenkov free field solver in a chosen direction
+         *                                     using Perfectly Matched Layer Absorbing Boundary Conditions (PML)
+         *  - ArbitraryOrderFDTD< 4, CurrentInterpolation >: Solver using 4 neighbors to each direction to approximate
+         * *spatial* derivatives by finite differences. The number of neighbors can be changed from 4 to any positive,
+         * integer number. The order of the solver will be twice the number of neighbors in each direction. Yee's
+         * method is a special case of this using one neighbor to each direction.
+         *  - ArbitraryOrderFDTDPML< 4, CurrentInterpolation >: ArbitraryOrderFDTD solver using Perfectly Matched Layer
+         *                                                      Absorbing Boundary Conditions (PML)
+         *  - None< CurrentInterpolation >: disable the vacuum update of E and B
+         */
+        using Solver = maxwellSolver::PARAM_FIELDSOLVER<CurrentInterpolation>;
+
+    } // namespace fields
+} // namespace picongpu
diff --git a/share/picongpu/tests/compileFieldSolver/include/picongpu/param/fileOutput.param b/share/picongpu/tests/compileFieldSolver/include/picongpu/param/fileOutput.param
new file mode 100644
index 0000000000..18a50ebfa6
--- /dev/null
+++ b/share/picongpu/tests/compileFieldSolver/include/picongpu/param/fileOutput.param
@@ -0,0 +1,56 @@
+/* Copyright 2013-2021 Axel Huebl, Rene Widera, Felix Schmitt,
+ *                     Benjamin Worpitz, Richard Pausch
+ *
+ * This file is part of PIConGPU.
+ *
+ * PIConGPU is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * PIConGPU is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with PIConGPU.
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include <pmacc/meta/conversion/MakeSeq.hpp>
+
+/* some forward declarations we need */
+#include "picongpu/fields/Fields.def"
+#include "picongpu/particles/particleToGrid/ComputeGridValuePerFrame.def"
+
+#include <boost/mpl/vector.hpp>
+
+
+namespace picongpu
+{
+    /** FieldTmpSolvers groups all solvers that create data for FieldTmp ******
+     *
+     * FieldTmpSolvers is used in @see FieldTmp to calculate the exchange size (the list is empty here)
+     */
+    using FieldTmpSolvers = MakeSeq_t<>;
+
+    /** FileOutputFields: Groups all Fields that shall be dumped *************/
+
+    /** Possible native fields: FieldE, FieldB, FieldJ (none selected here)
+     */
+    using NativeFileOutputFields = MakeSeq_t<>;
+
+    using FileOutputFields = MakeSeq_t<>;
+
+
+    /** FileOutputParticles: Groups all Species that shall be dumped **********
+     *
+     * hint: particle output is disabled by the empty list
+     *   using FileOutputParticles = MakeSeq_t< >;
+     */
+    using FileOutputParticles = MakeSeq_t<>;
+
+} // namespace picongpu
diff --git a/share/picongpu/tests/compileFieldSolver/include/picongpu/param/grid.param b/share/picongpu/tests/compileFieldSolver/include/picongpu/param/grid.param
new file mode 100644
index 0000000000..ffefe1be6c
--- /dev/null
+++ b/share/picongpu/tests/compileFieldSolver/include/picongpu/param/grid.param
@@ -0,0 +1,87 @@
+/* Copyright 2013-2021 Axel Huebl, Rene Widera, Richard Pausch,
+ *                     Benjamin Worpitz
+ *
+ * This file is part of PIConGPU.
+ *
+ * PIConGPU is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * PIConGPU is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with PIConGPU.
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+
+#pragma once
+
+namespace picongpu
+{
+    namespace SI
+    {
+        /** Duration of one timestep
+         *  unit: seconds */
+        constexpr float_64 DELTA_T_SI = 1.79e-16;
+
+        /** equals X
+         *  unit: meter */
+        constexpr float_64 CELL_WIDTH_SI = 9.34635e-8;
+        /** equals Y
+         *  unit: meter */
+        constexpr float_64 CELL_HEIGHT_SI = CELL_WIDTH_SI;
+        /** equals Z
+         *  unit: meter */
+        constexpr float_64 CELL_DEPTH_SI = CELL_WIDTH_SI;
+
+        /** Note on units in reduced dimensions
+         *
+         * In 2D3V simulations, the CELL_DEPTH_SI (Z) cell length
+         * is still used for normalization of densities, etc.
+         *
+         * A 2D3V simulation in a cartesian PIC simulation such as
+         * ours only changes the degrees of freedom in motion for
+         * (macro) particles and all (field) information in z
+         * travels instantaneously, making the 2D3V simulation
+         * behave like the interaction of infinite "wire particles"
+         * in fields with perfect symmetry in Z.
+         */
+    } // namespace SI
+
+    //! Defines the size of the absorbing zone (in cells)
+    constexpr uint32_t ABSORBER_CELLS[3][2] = {
+        {32, 32}, /*x direction [negative,positive]*/
+        {32, 32}, /*y direction [negative,positive]*/
+        {32, 32} /*z direction [negative,positive]*/
+    }; // unit: number of cells
+
+    //! Define the strength of the absorber for any direction
+    constexpr float_X ABSORBER_STRENGTH[3][2] = {
+        {1.0e-3, 1.0e-3}, /*x direction [negative,positive]*/
+        {1.0e-3, 1.0e-3}, /*y direction [negative,positive]*/
+        {1.0e-3, 1.0e-3} /*z direction [negative,positive]*/
+    }; // unit: none
+
+    /** When to move the co-moving window.
+     *  An initial pseudo particle, flying with the speed of light,
+     *  is fired at the beginning of the simulation.
+     *  When it reaches movePoint % of the absolute(*) simulation area,
+     *  the co-moving window starts to move with the speed of light.
+     *
+     *  (*) Note: beware, that there is one "hidden" row of gpus at the y-front,
+     *            when you use the co-moving window
+     *  0.75 means only 75% of simulation area is used for real simulation
+     *
+     * Warning: this variable is deprecated, but currently still required for
+     * building purposes. Please keep the variable here. In case a moving window
+     * is enabled in your .cfg file, please set the move point using the
+     * 'windowMovePoint' parameter in that file, its default value is movePoint.
+     */
+    constexpr float_64 movePoint = 0.90;
+
+} // namespace picongpu
diff --git a/share/picongpu/tests/compileFieldSolver/include/picongpu/param/isaac.param b/share/picongpu/tests/compileFieldSolver/include/picongpu/param/isaac.param
new file mode 100644
index 0000000000..af98f960f6
--- /dev/null
+++ b/share/picongpu/tests/compileFieldSolver/include/picongpu/param/isaac.param
@@ -0,0 +1,57 @@
+/* Copyright 2016-2021 Alexander Matthes
+ *
+ * This file is part of PIConGPU.
+ *
+ * PIConGPU is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * PIConGPU is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with PIConGPU.
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/** @file
+ *
+ * Definition of which native fields and density fields of particles will be
+ * visualizable with ISAAC. ISAAC is an in-situ visualization library with which
+ * the PIC simulation can be observed while it is running, avoiding the time
+ * consuming writing and reading of simulation data for the classical post
+ * processing of data.
+ *
+ * ISAAC can directly visualize native fields like the E or B field, but
+ * density fields of particles need to be calculated by PIConGPU on the fly
+ * which slightly increases the runtime and the memory consumption. Every
+ * particle density field will reduce the amount of memory left for PIConGPU's
+ * particles and fields.
+ *
+ * To get best performance, ISAAC defines an exponential amount of different
+ * visualization kernels for every combination of (at runtime) activated
+ * fields. So furthermore a lot of fields will increase the compilation time.
+ *
+ */
+
+#pragma once
+
+namespace picongpu
+{
+    namespace isaacP
+    {
+        /** Intermediate list of native particle species of PIConGPU which shall be
+         *  visualized (empty here). */
+        using Particle_Seq = MakeSeq_t<>;
+
+
+        /** Compile time sequence of all fields which shall be visualized (empty here). Basically
+         *  the join of Native_Seq and Density_Seq, neither of which is defined in this file. */
+        using Fields_Seq = MakeSeq_t<>;
+
+
+    } // namespace isaacP
+} // namespace picongpu
diff --git a/share/picongpu/tests/compileFieldSolver/include/picongpu/param/precision.param b/share/picongpu/tests/compileFieldSolver/include/picongpu/param/precision.param
new file mode 100644
index 0000000000..162c25da0d
--- /dev/null
+++ b/share/picongpu/tests/compileFieldSolver/include/picongpu/param/precision.param
@@ -0,0 +1,59 @@
+/* Copyright 2013-2021 Rene Widera
+ *
+ * This file is part of PIConGPU.
+ *
+ * PIConGPU is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * PIConGPU is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with PIConGPU.
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/** @file
+ *
+ * Define the precision of typically used floating point types in the
+ * simulation.
+ *
+ * PIConGPU normalizes input automatically, allowing to use single-precision by
+ * default for the core algorithms. Note that implementations of various
+ * algorithms (usually plugins or non-core components) might still decide to
+ * hard-code a different (mixed) precision for some critical operations.
+ */
+
+#pragma once
+
+
+namespace picongpu
+{
+/*! Select a precision for the simulation data (override the default by defining PARAM_PRECISION)
+ *  - precision32Bit : use 32Bit floating point numbers
+ *                     [significant digits 7 to 8]
+ *  - precision64Bit : use 64Bit floating point numbers
+ *                     [significant digits 15 to 16]
+ */
+#ifndef PARAM_PRECISION
+#    define PARAM_PRECISION precision32Bit
+#endif
+    namespace precisionPIConGPU = PARAM_PRECISION;
+
+    /*! Select a precision for special operations (can be different from simulation precision)
+     *  - precisionPIConGPU : use the simulation precision selected above
+     *  - precision32Bit    : use 32Bit floating point numbers
+     *  - precision64Bit    : use 64Bit floating point numbers
+     */
+    namespace precisionSqrt = precisionPIConGPU;
+    namespace precisionExp = precisionPIConGPU;
+    namespace precisionTrigonometric = precisionPIConGPU;
+
+
+} // namespace picongpu
+
+#include "picongpu/unitless/precision.unitless"
diff --git a/share/picongpu/tests/compileParticlePusher/README.rst b/share/picongpu/tests/compileParticlePusher/README.rst
new file mode 100644
index 0000000000..70fbfc1770
--- /dev/null
+++ b/share/picongpu/tests/compileParticlePusher/README.rst
@@ -0,0 +1,4 @@
+Compile Test for Particle Pushers
+=================================
+
+This test compiles all particle pushers, each for one particle shape.
diff --git a/share/picongpu/tests/compileParticlePusher/cmakeFlags b/share/picongpu/tests/compileParticlePusher/cmakeFlags
new file mode 100755
index 0000000000..431e8b47a1
--- /dev/null
+++ b/share/picongpu/tests/compileParticlePusher/cmakeFlags
@@ -0,0 +1,53 @@
+#!/usr/bin/env bash
+#
+# Copyright 2013-2021 Axel Huebl, Rene Widera, Sergei Bastrakov
+#
+# This file is part of PIConGPU.
+#
+# PIConGPU is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# PIConGPU is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with PIConGPU.
+# If not, see <http://www.gnu.org/licenses/>.
+#
+
+#
+# generic compile options
+#
+
+################################################################################
+# add presets here
+#   - default: index 0
+#   - start with zero index
+#   - increase by 1, no gaps
+
+# Pushers are generally independent from particle shapes, so do not attempt to
+# test all combinations; standalone Boris is skipped (tested in examples)
+flags[0]="-DPARAM_OVERWRITES:LIST='-DPARAM_PARTICLEPUSHER=HigueraCary;-DPARAM_PARTICLESHAPE=NGP'"
+flags[1]="-DPARAM_OVERWRITES:LIST='-DPARAM_PARTICLEPUSHER=Vay;-DPARAM_PARTICLESHAPE=CIC'"
+flags[2]="-DPARAM_OVERWRITES:LIST='-DPARAM_PARTICLEPUSHER=ReducedLandauLifshitz;-DPARAM_PARTICLESHAPE=TSC'"
+flags[3]="-DPARAM_OVERWRITES:LIST='-DPARAM_PARTICLEPUSHER=Boris;-DPARAM_COMPOSITEPUSHER=1;-DPARAM_PARTICLESHAPE=PQS'"
+flags[4]="-DPARAM_OVERWRITES:LIST='-DPARAM_PARTICLEPUSHER=HigueraCary;-DPARAM_COMPOSITEPUSHER=1;-DPARAM_PARTICLESHAPE=PCS'"
+flags[5]="-DPARAM_OVERWRITES:LIST='-DPARAM_PARTICLEPUSHER=Free;-DPARAM_PARTICLESHAPE=CIC'"
+flags[6]="-DPARAM_OVERWRITES:LIST='-DPARAM_PARTICLEPUSHER=Photon;-DPARAM_PARTICLESHAPE=TSC'"
+flags[7]="-DPARAM_OVERWRITES:LIST='-DPARAM_PARTICLEPUSHER=Probe;-DPARAM_PARTICLESHAPE=PQS'"
+
+################################################################################
+# execution: `-l` prints the number of presets, `-ll` lists all presets, an index prints that preset
+
+case "$1" in
+    -l)  echo "${#flags[@]}"
+         ;;
+    -ll) for f in "${flags[@]}"; do echo "$f"; done
+         ;;
+    *)   echo -n "${flags[$1]}"
+         ;;
+esac
diff --git a/share/picongpu/tests/compileParticlePusher/include/picongpu/param/density.param b/share/picongpu/tests/compileParticlePusher/include/picongpu/param/density.param
new file mode 100644
index 0000000000..bed7ea6308
--- /dev/null
+++ b/share/picongpu/tests/compileParticlePusher/include/picongpu/param/density.param
@@ -0,0 +1,46 @@
+/* Copyright 2013-2021 Axel Huebl, Heiko Burau, Rene Widera, Felix Schmitt,
+ *                     Richard Pausch
+ *
+ * This file is part of PIConGPU.
+ *
+ * PIConGPU is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * PIConGPU is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with PIConGPU.
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "picongpu/particles/densityProfiles/profiles.def"
+
+
+namespace picongpu
+{
+    namespace SI
+    {
+        /** Base density in particles per m^3 in the density profiles.
+         *
+         * This is often taken as the reference maximum density in normalized profiles.
+         * Individual particle species can define a `densityRatio` flag relative
+         * to this value.
+         *
+         * unit: ELEMENTS/m^3
+         */
+        constexpr float_64 BASE_DENSITY_SI = 1.e25;
+    } // namespace SI
+
+    namespace densityProfiles
+    {
+        /* homogeneous profile, exposed under the name `Homogenous` that speciesInitialization.param refers to */
+        using Homogenous = HomogenousImpl;
+    } // namespace densityProfiles
+} // namespace picongpu
diff --git a/share/picongpu/tests/compileParticlePusher/include/picongpu/param/dimension.param b/share/picongpu/tests/compileParticlePusher/include/picongpu/param/dimension.param
new file mode 100644
index 0000000000..9cda9d9a01
--- /dev/null
+++ b/share/picongpu/tests/compileParticlePusher/include/picongpu/param/dimension.param
@@ -0,0 +1,31 @@
+/* Copyright 2014-2021 Axel Huebl, Rene Widera
+ *
+ * This file is part of PIConGPU.
+ *
+ * PIConGPU is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * PIConGPU is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with PIConGPU.
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#ifndef PARAM_DIMENSION
+#    define PARAM_DIMENSION DIM3
+#endif
+
+#define SIMDIM PARAM_DIMENSION
+
+namespace picongpu
+{
+    constexpr uint32_t simDim = SIMDIM; // compile-time copy of SIMDIM; DIM3 unless PARAM_DIMENSION is predefined
+} // namespace picongpu
diff --git a/share/picongpu/tests/compileParticlePusher/include/picongpu/param/fileOutput.param b/share/picongpu/tests/compileParticlePusher/include/picongpu/param/fileOutput.param
new file mode 100644
index 0000000000..18a50ebfa6
--- /dev/null
+++ b/share/picongpu/tests/compileParticlePusher/include/picongpu/param/fileOutput.param
@@ -0,0 +1,56 @@
+/* Copyright 2013-2021 Axel Huebl, Rene Widera, Felix Schmitt,
+ *                     Benjamin Worpitz, Richard Pausch
+ *
+ * This file is part of PIConGPU.
+ *
+ * PIConGPU is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * PIConGPU is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with PIConGPU.
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include <pmacc/meta/conversion/MakeSeq.hpp>
+
+/* some forward declarations we need */
+#include "picongpu/fields/Fields.def"
+#include "picongpu/particles/particleToGrid/ComputeGridValuePerFrame.def"
+
+#include <boost/mpl/vector.hpp>
+
+
+namespace picongpu
+{
+    /** FieldTmpSolvers groups all solvers that create data for FieldTmp ******
+     *
+     * Used in @see FieldTmp to calculate the exchange size; the list is empty in this setup
+     */
+    using FieldTmpSolvers = MakeSeq_t<>;
+
+    /** FileOutputFields: Groups all Fields that shall be dumped *************/
+
+    /** Possible native fields: FieldE, FieldB, FieldJ
+     *  (none of them is selected in this setup) */
+    using NativeFileOutputFields = MakeSeq_t<>;
+
+    using FileOutputFields = MakeSeq_t<>; // empty list as well: no field is dumped
+
+
+    /** FileOutputParticles: Groups all Species that shall be dumped **********
+     *
+     * hint: particle output is disabled by an empty list, as done here:
+     *   using FileOutputParticles = MakeSeq_t< >;
+     */
+    using FileOutputParticles = MakeSeq_t<>;
+
+} // namespace picongpu
diff --git a/share/picongpu/tests/compileParticlePusher/include/picongpu/param/isaac.param b/share/picongpu/tests/compileParticlePusher/include/picongpu/param/isaac.param
new file mode 100644
index 0000000000..af98f960f6
--- /dev/null
+++ b/share/picongpu/tests/compileParticlePusher/include/picongpu/param/isaac.param
@@ -0,0 +1,57 @@
+/* Copyright 2016-2021 Alexander Matthes
+ *
+ * This file is part of PIConGPU.
+ *
+ * PIConGPU is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * PIConGPU is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with PIConGPU.
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/** @file
+ *
+ * Definition of which native fields and density fields of particles will be
+ * visualizable with ISAAC. ISAAC is an in-situ visualization library with which
+ * the PIC simulation can be observed while it is running, avoiding the
+ * time-consuming writing and reading of simulation data for classical
+ * post-processing.
+ *
+ * ISAAC can directly visualize native fields like the E or B field, but
+ * density fields of particles need to be calculated by PIConGPU on the fly,
+ * which slightly increases the runtime and the memory consumption. Every
+ * particle density field will reduce the amount of memory left for PIConGPU's
+ * particles and fields.
+ *
+ * To get best performance, ISAAC defines an exponential number of different
+ * visualization kernels for every combination of (at runtime) activated
+ * fields. Consequently, a large number of fields also increases the compilation time.
+ *
+ */
+
+#pragma once
+
+namespace picongpu
+{
+    namespace isaacP
+    {
+        /** Intermediate list of native particle species of PIConGPU which shall be
+         *  visualized. Empty here: no species is selected. */
+        using Particle_Seq = MakeSeq_t<>;
+
+
+        /** Compile time sequence of all fields which shall be visualized. Basically
+         *  the join of Native_Seq and Density_Seq (not defined in this file); empty here. */
+        using Fields_Seq = MakeSeq_t<>;
+
+
+    } // namespace isaacP
+} // namespace picongpu
diff --git a/share/picongpu/tests/compileParticlePusher/include/picongpu/param/particle.param b/share/picongpu/tests/compileParticlePusher/include/picongpu/param/particle.param
new file mode 100644
index 0000000000..792e7acad0
--- /dev/null
+++ b/share/picongpu/tests/compileParticlePusher/include/picongpu/param/particle.param
@@ -0,0 +1,85 @@
+/* Copyright 2013-2021 Axel Huebl, Rene Widera, Benjamin Worpitz,
+ *                     Richard Pausch
+ *
+ * This file is part of PIConGPU.
+ *
+ * PIConGPU is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * PIConGPU is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with PIConGPU.
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "picongpu/particles/startPosition/functors.def"
+#include "picongpu/particles/manipulators/manipulators.def"
+#include "picongpu/particles/filter/filter.def"
+
+#include <pmacc/nvidia/functors/Assign.hpp>
+
+namespace picongpu
+{
+    namespace particles
+    {
+        namespace startPosition
+        {
+            struct QuietParam25ppc
+            {
+                /** Number of particles per cell in each direction at initial state: (5, 5, 1) reduced to simDim
+                 *  unit: none
+                 */
+                using numParticlesPerDimension = typename mCT::shrinkTo<mCT::Int<5, 5, 1>, simDim>::type;
+            };
+            using Quiet25ppc = QuietImpl<QuietParam25ppc>;
+
+        } // namespace startPosition
+
+        /** A particle with a weighting below MIN_WEIGHTING will not
+         *      be created / will be deleted
+         *  unit: none
+         */
+        constexpr float_X MIN_WEIGHTING = 10.0;
+
+        /** During unit normalization, this value is assumed to be the typical
+         *  number of particles per cell for normalization of weighted particle
+         *  attributes. It is the volume of QuietParam25ppc::numParticlesPerDimension.
+         */
+        constexpr uint32_t TYPICAL_PARTICLES_PER_CELL
+            = mCT::volume<startPosition::QuietParam25ppc::numParticlesPerDimension>::type::value;
+
+        namespace manipulators
+        {
+            CONST_VECTOR(float_X, 3, DriftParamPositive_direction, 1.0, 0.0, 0.0); // direction used by the drift
+            struct DriftParamPositive
+            {
+                /** Initial particle drift, given as `gamma` (presumably the Lorentz factor - TODO confirm)
+                 *  Examples:
+                 *    - No drift is equal to 1.0
+                 *  unit: none
+                 */
+                static constexpr float_64 gamma = 1.021;
+                const DriftParamPositive_direction_t direction;
+            };
+            using AssignXDriftPositive = unary::Drift<DriftParamPositive, nvidia::functors::Assign>;
+
+            struct TemperatureParam
+            {
+                /** Initial temperature
+                 *  unit: keV
+                 */
+                static constexpr float_64 temperature = 0.0005;
+            };
+            using AddTemperature = unary::Temperature<TemperatureParam>;
+
+        } // namespace manipulators
+    } // namespace particles
+} // namespace picongpu
diff --git a/share/picongpu/tests/compileParticlePusher/include/picongpu/param/precision.param b/share/picongpu/tests/compileParticlePusher/include/picongpu/param/precision.param
new file mode 100644
index 0000000000..162c25da0d
--- /dev/null
+++ b/share/picongpu/tests/compileParticlePusher/include/picongpu/param/precision.param
@@ -0,0 +1,59 @@
+/* Copyright 2013-2021 Rene Widera
+ *
+ * This file is part of PIConGPU.
+ *
+ * PIConGPU is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * PIConGPU is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with PIConGPU.
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/** @file
+ *
+ * Define the precision of typically used floating point types in the
+ * simulation.
+ *
+ * PIConGPU normalizes input automatically, so single precision can be used by
+ * default for the core algorithms. Note that implementations of various
+ * algorithms (usually plugins or non-core components) might still decide to
+ * hard-code a different (mixed) precision for some critical operations.
+ */
+
+#pragma once
+
+
+namespace picongpu
+{
+/*! Select a precision for the simulation data (precision32Bit unless PARAM_PRECISION is predefined)
+ *  - precision32Bit : use 32Bit floating point numbers
+ *                     [significant digits 7 to 8]
+ *  - precision64Bit : use 64Bit floating point numbers
+ *                     [significant digits 15 to 16]
+ */
+#ifndef PARAM_PRECISION
+#    define PARAM_PRECISION precision32Bit
+#endif
+    namespace precisionPIConGPU = PARAM_PRECISION;
+
+    /*! Select a precision for special operations (may differ from the simulation precision)
+     *  - precisionPIConGPU : use the simulation precision selected above
+     *  - precision32Bit    : use 32Bit floating point numbers
+     *  - precision64Bit    : use 64Bit floating point numbers
+     */
+    namespace precisionSqrt = precisionPIConGPU;
+    namespace precisionExp = precisionPIConGPU;
+    namespace precisionTrigonometric = precisionPIConGPU;
+
+
+} // namespace picongpu
+
+#include "picongpu/unitless/precision.unitless"
diff --git a/share/picongpu/tests/compileParticlePusher/include/picongpu/param/species.param b/share/picongpu/tests/compileParticlePusher/include/picongpu/param/species.param
new file mode 100644
index 0000000000..3064c2d9e1
--- /dev/null
+++ b/share/picongpu/tests/compileParticlePusher/include/picongpu/param/species.param
@@ -0,0 +1,130 @@
+/* Copyright 2014-2021 Rene Widera, Richard Pausch, Annegret Roeszler, Klaus Steiniger, Sergei Bastrakov
+ *
+ * This file is part of PIConGPU.
+ *
+ * PIConGPU is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * PIConGPU is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with PIConGPU.
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/** @file
+ *
+ * Particle shape, field to particle interpolation, current solver, and particle pusher
+ * can be declared here for usage in `speciesDefinition.param`.
+ *
+ * @see
+ *   **MODELS / Hierarchy of Charge Assignment Schemes**
+ *   in the online documentation for information on particle shapes.
+ *
+ *
+ * \attention
+ * The higher order shape names are redefined with release 0.6.0 in order to provide a consistent naming:
+ *     * PQS is the name of the 3rd order assignment function (instead of PCS)
+ *     * PCS is the name of the 4th order assignment function (instead of P4S)
+ *     * P4S does not exist anymore
+ */
+
+#pragma once
+
+#include "picongpu/particles/shapes.hpp"
+#include "picongpu/algorithms/FieldToParticleInterpolationNative.hpp"
+#include "picongpu/algorithms/FieldToParticleInterpolation.hpp"
+#include "picongpu/algorithms/AssignedTrilinearInterpolation.hpp"
+#include "picongpu/particles/flylite/NonLTE.def"
+#include "picongpu/fields/currentDeposition/Solver.def"
+
+
+namespace picongpu
+{
+/** select macroparticle shape (TSC unless PARAM_PARTICLESHAPE is predefined)
+ *
+ * **WARNING** the shape names are redefined and diverge from PIConGPU versions before 0.6.0.
+ *
+ *  - particles::shapes::CIC : Assignment function is a piecewise linear spline
+ *  - particles::shapes::TSC : Assignment function is a piecewise quadratic spline
+ *  - particles::shapes::PQS : Assignment function is a piecewise cubic spline
+ *  - particles::shapes::PCS : Assignment function is a piecewise quartic spline
+ */
+#ifndef PARAM_PARTICLESHAPE
+#    define PARAM_PARTICLESHAPE TSC
+#endif
+    using UsedParticleShape = particles::shapes::PARAM_PARTICLESHAPE;
+
+    /** select the method used to interpolate grid-based field values to particle positions
+     */
+    using UsedField2Particle = FieldToParticleInterpolation<UsedParticleShape, AssignedTrilinearInterpolation>;
+
+    /*! select current solver method (EmZ with the selected shape is used below)
+     * - currentSolver::Esirkepov< SHAPE, STRATEGY > : particle shapes - CIC, TSC, PQS, PCS (1st to 4th order)
+     * - currentSolver::VillaBune< SHAPE, STRATEGY > : particle shapes - CIC (1st order) only
+     * - currentSolver::EmZ< SHAPE, STRATEGY >       : particle shapes - CIC, TSC, PQS, PCS (1st to 4th order)
+     *
+     * For development purposes:
+     * - currentSolver::EsirkepovNative< SHAPE, STRATEGY > : generic version of currentSolverEsirkepov
+     *   without optimization (~4x slower and needs more shared memory)
+     *
+     * STRATEGY (optional):
+     * - currentSolver::strategy::StridedCachedSupercells
+     * - currentSolver::strategy::StridedCachedSupercellsScaled<N> with N >= 1
+     * - currentSolver::strategy::CachedSupercells
+     * - currentSolver::strategy::CachedSupercellsScaled<N> with N >= 1
+     * - currentSolver::strategy::NonCachedSupercells
+     * - currentSolver::strategy::NonCachedSupercellsScaled<N> with N >= 1
+     */
+    using UsedParticleCurrentSolver = currentSolver::EmZ<UsedParticleShape>;
+
+/** particle pusher configuration (Boris unless PARAM_PARTICLEPUSHER is predefined)
+ *
+ * Defining a pusher is optional for particles
+ *
+ * - particles::pusher::HigueraCary : Higuera & Cary's relativistic pusher preserving both volume and ExB velocity
+ * - particles::pusher::Vay : Vay's relativistic pusher preserving ExB velocity
+ * - particles::pusher::Boris : Boris' relativistic pusher preserving volume
+ * - particles::pusher::ReducedLandauLifshitz : 4th order RungeKutta pusher
+ *                                              with classical radiation reaction
+ * - particles::pusher::Composite : composite of two given pushers,
+ *                                  switches between using one (or none) of those
+ *
+ * For diagnostics & modeling: ------------------------------------------------
+ * - particles::pusher::Acceleration : Accelerate particles by applying a constant electric field
+ * - particles::pusher::Free : free propagation, ignore fields
+ *                             (= free stream model)
+ * - particles::pusher::Photon : propagate with c in direction of normalized mom.
+ * - particles::pusher::Probe : Probe particles that interpolate E & B
+ * For development purposes: --------------------------------------------------
+ * - particles::pusher::Axel : a pusher developed at HZDR during 2011 (testing)
+ */
+#ifndef PARAM_PARTICLEPUSHER
+#    define PARAM_PARTICLEPUSHER Boris
+#endif
+
+/* Commas in macro definitions cause issues, so a composite pusher is requested via a separate flag:
+ * a non-zero PARAM_COMPOSITEPUSHER wraps Vay and PARAM_PARTICLEPUSHER into particles::pusher::Composite
+ */
+#ifndef PARAM_COMPOSITEPUSHER
+#    define PARAM_COMPOSITEPUSHER 0
+#endif
+
+#if PARAM_COMPOSITEPUSHER
+#    define PUSHER                                                                                                    \
+        particles::pusher::Composite<                                                                                 \
+            particles::pusher::Vay,                                                                                   \
+            particles::pusher::PARAM_PARTICLEPUSHER,                                                                  \
+            particles::pusher::CompositeBinarySwitchActivationFunctor<10>>
+#else
+#    define PUSHER particles::pusher::PARAM_PARTICLEPUSHER
+#endif
+
+    using UsedParticlePusher = PUSHER;
+
+} // namespace picongpu
diff --git a/share/picongpu/tests/compileParticlePusher/include/picongpu/param/speciesDefinition.param b/share/picongpu/tests/compileParticlePusher/include/picongpu/param/speciesDefinition.param
new file mode 100644
index 0000000000..97a58949a7
--- /dev/null
+++ b/share/picongpu/tests/compileParticlePusher/include/picongpu/param/speciesDefinition.param
@@ -0,0 +1,69 @@
+/* Copyright 2013-2021 Rene Widera, Benjamin Worpitz, Heiko Burau
+ *
+ * This file is part of PIConGPU.
+ *
+ * PIConGPU is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * PIConGPU is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with PIConGPU.
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "picongpu/simulation_defines.hpp"
+#include "picongpu/particles/Particles.hpp"
+
+#include <pmacc/particles/Identifier.hpp>
+#include <pmacc/meta/conversion/MakeSeq.hpp>
+#include <pmacc/identifier/value_identifier.hpp>
+#include <pmacc/particles/traits/FilterByFlag.hpp>
+#include <pmacc/meta/String.hpp>
+
+
+namespace picongpu
+{
+    /*########################### define particle attributes #####################*/
+
+    /** attributes stored per particle (probeE/probeB presumably serve the Probe pusher - TODO confirm) */
+    using DefaultParticleAttributes = MakeSeq_t<position<position_pic>, momentum, weighting, probeE, probeB>;
+
+    /*########################### end particle attributes ########################*/
+
+    /*########################### define species #################################*/
+
+    /*--------------------------- electrons --------------------------------------*/
+
+    /* mass and charge of the species, given as ratios relative to BASE_MASS and BASE_CHARGE */
+    value_identifier(float_X, MassRatioElectrons, 1.0);
+    value_identifier(float_X, ChargeRatioElectrons, 1.0);
+
+    using ParticleFlagsElectrons = MakeSeq_t<
+        particlePusher<UsedParticlePusher>,
+        shape<UsedParticleShape>,
+        interpolation<UsedField2Particle>,
+        current<UsedParticleCurrentSolver>,
+        massRatio<MassRatioElectrons>,
+        chargeRatio<ChargeRatioElectrons>>;
+
+    /* define the electron species, named "e", from the flags and attributes above */
+    using PIC_Electrons = Particles<PMACC_CSTRING("e"), ParticleFlagsElectrons, DefaultParticleAttributes>;
+
+    /*########################### end species ####################################*/
+
+    /** All known particle species of the simulation
+     *
+     * List all defined particle species from above in this list
+     * to make them available to the PIC algorithm.
+     */
+    using VectorAllSpecies = MakeSeq_t<PIC_Electrons>;
+
+} // namespace picongpu
diff --git a/share/picongpu/tests/compileParticlePusher/include/picongpu/param/speciesInitialization.param b/share/picongpu/tests/compileParticlePusher/include/picongpu/param/speciesInitialization.param
new file mode 100644
index 0000000000..4d7745ef40
--- /dev/null
+++ b/share/picongpu/tests/compileParticlePusher/include/picongpu/param/speciesInitialization.param
@@ -0,0 +1,47 @@
+/* Copyright 2015-2021 Rene Widera, Axel Huebl
+ *
+ * This file is part of PIConGPU.
+ *
+ * PIConGPU is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * PIConGPU is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with PIConGPU.
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/** @file
+ *
+ * Initialize particles inside particle species. This is the final step in
+ * setting up particles (defined in `speciesDefinition.param`) via density
+ * profiles (defined in `density.param`). One can then further derive particles
+ * from one species to another and manipulate attributes with "manipulators"
+ * and "filters" (defined in `particle.param` and `particleFilters.param`).
+ */
+
+#pragma once
+
+#include "picongpu/particles/InitFunctors.hpp"
+
+
+namespace picongpu
+{
+    namespace particles
+    {
+        /** InitPipeline defines the order in which species are initialized
+         *
+         * The functors are called in order (from first to last functor): here CreateDensity
+         * for PIC_Electrons runs before the AddTemperature manipulator. */
+        using InitPipeline = bmpl::vector<
+            CreateDensity<densityProfiles::Homogenous, startPosition::Quiet25ppc, PIC_Electrons>,
+            Manipulate<manipulators::AddTemperature, PIC_Electrons>>;
+
+    } // namespace particles
+} // namespace picongpu
diff --git a/share/pmacc/examples/gameOfLife2D/CMakeLists.txt b/share/pmacc/examples/gameOfLife2D/CMakeLists.txt
index b2baeefd9a..798d2646ab 100644
--- a/share/pmacc/examples/gameOfLife2D/CMakeLists.txt
+++ b/share/pmacc/examples/gameOfLife2D/CMakeLists.txt
@@ -1,4 +1,4 @@
-# Copyright 2013-2020 Rene Widera, Axel Huebl
+# Copyright 2013-2021 Rene Widera, Axel Huebl
 #
 # This file is part of PMacc.
 #
@@ -23,7 +23,7 @@
 # Required cmake version
 ################################################################################
 
-cmake_minimum_required(VERSION 3.11.4)
+cmake_minimum_required(VERSION 3.15.0)
 
 
 ################################################################################
@@ -60,10 +60,10 @@ endif()
 # Language Flags
 ###############################################################################
 
-# enforce C++11
+# enforce C++14
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
 set(CMAKE_CXX_EXTENSIONS OFF)
-set(CMAKE_CXX_STANDARD 11)
+set(CMAKE_CXX_STANDARD 14)
 
 
 ################################################################################
diff --git a/share/pmacc/examples/gameOfLife2D/include/Evolution.hpp b/share/pmacc/examples/gameOfLife2D/include/Evolution.hpp
index ef6178feec..96cf25dceb 100644
--- a/share/pmacc/examples/gameOfLife2D/include/Evolution.hpp
+++ b/share/pmacc/examples/gameOfLife2D/include/Evolution.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Rene Widera, Marco Garten
+/* Copyright 2013-2021 Rene Widera, Marco Garten
  *
  * This file is part of PMacc.
  *
@@ -26,11 +26,10 @@
 #include <pmacc/nvidia/functors/Assign.hpp>
 #include <pmacc/memory/boxes/CachedBox.hpp>
 #include <pmacc/memory/dataTypes/Mask.hpp>
-#include <pmacc/memory/MakeUnique.hpp>
 #include <pmacc/dimensions/DataSpaceOperations.hpp>
-#include <pmacc/nvidia/rng/RNG.hpp>
-#include <pmacc/nvidia/rng/methods/Xor.hpp>
-#include <pmacc/nvidia/rng/distributions/Uniform_float.hpp>
+#include <pmacc/random/distributions/distributions.hpp>
+#include <pmacc/random/methods/methods.hpp>
+#include <pmacc/random/Random.hpp>
 #include <pmacc/traits/GetNumWorkers.hpp>
 #include <pmacc/mappings/threads/ForEachIdx.hpp>
 #include <pmacc/mappings/threads/IdxConfig.hpp>
@@ -39,269 +38,188 @@
 
 namespace gol
 {
-namespace kernel
-{
-    using namespace pmacc;
-
-    /** run game of life stencil
-     *
-     * evaluate each cell in the supercell
-     *
-     * @tparam T_numWorkers number of workers
-     */
-    template< uint32_t T_numWorkers >
-    struct Evolution
+    namespace kernel
     {
-        /** run stencil for a supercell
+        using namespace pmacc;
+
+        /** run game of life stencil
          *
-         * @tparam T_BoxReadOnly PMacc::DataBox, box type of the old grid data
-         * @tparam T_BoxWriteOnly PMacc::DataBox, box type of the new grid data
-         * @tparam T_Mapping mapping functor type
+         * evaluate each cell in the supercell
          *
-         * @param buffRead buffer with cell data of the current step
-         * @param buffWrite buffer for the updated cell data
-         * @param rule description of the rule as bitmap mask
-         * @param mapper functor to map a block to a supercell
+         * @tparam T_numWorkers number of workers
          */
-        template<
-            typename T_BoxReadOnly,
-            typename T_BoxWriteOnly,
-            typename T_Mapping,
-            typename T_Acc
-        >
-        DINLINE void operator()(
-            T_Acc const & acc,
-            T_BoxReadOnly const & buffRead,
-            T_BoxWriteOnly & buffWrite,
-            uint32_t const rule,
-            T_Mapping const & mapper
-        ) const
+        template<uint32_t T_numWorkers>
+        struct Evolution
         {
-            using namespace mappings::threads;
-
-            using Type = typename T_BoxReadOnly::ValueType;
-            using SuperCellSize = typename T_Mapping::SuperCellSize;
-            using BlockArea = SuperCellDescription<
-                SuperCellSize,
-                math::CT::Int< 1, 1 >,
-                math::CT::Int< 1, 1 >
-            >;
-            auto cache = CachedBox::create<
-                0,
-                Type
-            >( acc, BlockArea( ) );
-
-            Space const block( mapper.getSuperCellIndex( Space( blockIdx ) ) );
-            Space const blockCell = block * T_Mapping::SuperCellSize::toRT( );
-
-            constexpr uint32_t cellsPerSuperCell = pmacc::math::CT::volume< SuperCellSize >::type::value;
-            constexpr uint32_t numWorkers = T_numWorkers;
-            uint32_t const workerIdx = threadIdx.x;
-
-            auto buffRead_shifted = buffRead.shift( blockCell );
-
-            ThreadCollective<
-                BlockArea,
-                numWorkers
-            > collective( workerIdx );
-
-            nvidia::functors::Assign assign;
-            collective(
-                acc,
-                assign,
-                cache,
-                buffRead_shifted
-            );
-
-            __syncthreads();
-
-            ForEachIdx<
-                IdxConfig<
-                    cellsPerSuperCell,
-                    numWorkers
-                >
-            >{ workerIdx }(
-                [&](
-                    uint32_t const linearIdx,
-                    uint32_t const
-                )
-                {
+            /** run stencil for a supercell
+             *
+             * @tparam T_BoxReadOnly PMacc::DataBox, box type of the old grid data
+             * @tparam T_BoxWriteOnly PMacc::DataBox, box type of the new grid data
+             * @tparam T_Mapping mapping functor type
+             *
+             * @param buffRead buffer with cell data of the current step
+             * @param buffWrite buffer for the updated cell data
+             * @param rule description of the rule as bitmap mask
+             * @param mapper functor to map a block to a supercell
+             */
+            template<typename T_BoxReadOnly, typename T_BoxWriteOnly, typename T_Mapping, typename T_Acc>
+            DINLINE void operator()(
+                T_Acc const& acc,
+                T_BoxReadOnly const& buffRead,
+                T_BoxWriteOnly& buffWrite,
+                uint32_t const rule,
+                T_Mapping const& mapper) const
+            {
+                using namespace mappings::threads;
+
+                using Type = typename T_BoxReadOnly::ValueType;
+                using SuperCellSize = typename T_Mapping::SuperCellSize;
+                using BlockArea = SuperCellDescription<SuperCellSize, math::CT::Int<1, 1>, math::CT::Int<1, 1>>;
+                auto cache = CachedBox::create<0, Type>(acc, BlockArea());
+
+                Space const block(mapper.getSuperCellIndex(Space(cupla::blockIdx(acc))));
+                Space const blockCell = block * T_Mapping::SuperCellSize::toRT();
+
+                constexpr uint32_t cellsPerSuperCell = pmacc::math::CT::volume<SuperCellSize>::type::value;
+                constexpr uint32_t numWorkers = T_numWorkers;
+                uint32_t const workerIdx = cupla::threadIdx(acc).x;
+
+                auto buffRead_shifted = buffRead.shift(blockCell);
+
+                ThreadCollective<BlockArea, numWorkers> collective(workerIdx);
+
+                nvidia::functors::Assign assign;
+                collective(acc, assign, cache, buffRead_shifted);
+
+                cupla::__syncthreads(acc);
+
+                ForEachIdx<IdxConfig<cellsPerSuperCell, numWorkers>>{
+                    workerIdx}([&](uint32_t const linearIdx, uint32_t const) {
                     // cell index within the superCell
-                    DataSpace< DIM2 > const cellIdx = DataSpaceOperations< DIM2 >::template map< SuperCellSize >( linearIdx );
+                    DataSpace<DIM2> const cellIdx = DataSpaceOperations<DIM2>::template map<SuperCellSize>(linearIdx);
 
                     Type neighbors = 0;
-                    for (uint32_t i = 1; i < 9; ++i)
+                    for(uint32_t i = 1; i < 9; ++i)
                     {
-                        Space const offset( Mask::getRelativeDirections< DIM2 > ( i ) );
-                        neighbors += cache( cellIdx + offset );
+                        Space const offset(Mask::getRelativeDirections<DIM2>(i));
+                        neighbors += cache(cellIdx + offset);
                     }
 
-                    Type isLife = cache( cellIdx );
-                    isLife = static_cast< bool >( ( (!isLife)*( 1 << (neighbors + 9) ) ) & rule ) +
-                        static_cast< bool >( ( isLife*( 1 << ( neighbors ) ) ) & rule );
+                    Type isLife = cache(cellIdx);
+                    isLife = static_cast<bool>(((!isLife) * (1 << (neighbors + 9))) & rule)
+                        + static_cast<bool>((isLife * (1 << (neighbors))) & rule);
 
-                    buffWrite( blockCell + cellIdx ) = isLife;
-                }
-            );
-        }
-    };
+                    buffWrite(blockCell + cellIdx) = isLife;
+                });
+            }
+        };
 
-    /** initialize each cell
-     *
-     * randomly activate each cell within a supercell
-     *
-     * @tparam T_numWorkers number of workers
-     */
-    template< uint32_t T_numWorkers >
-    struct RandomInit
-    {
         /** initialize each cell
          *
-         * @tparam T_BoxWriteOnly PMacc::DataBox, box type of the new grid data
-         * @tparam T_Mapping mapping functor type
+         * randomly activate each cell within a supercell
          *
-         * @param buffRead buffer with cell data of the current step
-         * @param seed random number generator seed
-         * @param threshold threshold to activate a cell, range [0.0;1.0]
-         *                  if random number is <= threshold than the cell will
-         *                  be activated
-         * @param mapper functor to map a block to a supercell
+         * @tparam T_numWorkers number of workers
          */
-        template<
-            typename T_BoxWriteOnly,
-            typename T_Mapping,
-            typename T_Acc
-        >
-        DINLINE void operator()(
-            T_Acc const & acc,
-            T_BoxWriteOnly & buffWrite,
-            uint32_t const seed,
-            float const threshold,
-            T_Mapping const & mapper
-        ) const
+        template<uint32_t T_numWorkers>
+        struct RandomInit
         {
-            using namespace mappings::threads;
-
-            using SuperCellSize = typename T_Mapping::SuperCellSize;
-            constexpr uint32_t cellsPerSuperCell = pmacc::math::CT::volume< SuperCellSize >::type::value;
-            constexpr uint32_t numWorkers = T_numWorkers;
-            uint32_t const workerIdx = threadIdx.x;
-
-            // get position in grid in units of SuperCells from blockID
-            Space const block( mapper.getSuperCellIndex( Space( blockIdx ) ) );
-            // convert position in unit of cells
-            Space const blockCell = block * T_Mapping::SuperCellSize::toRT( );
-            // convert CUDA dim3 to DataSpace<DIM3>
-            Space const threadIndex(threadIdx);
-
-            uint32_t const globalUniqueId = DataSpaceOperations< DIM2 >::map(
-                mapper.getGridSuperCells() * T_Mapping::SuperCellSize::toRT(),
-                blockCell + DataSpaceOperations< DIM2 >::template map< SuperCellSize >( workerIdx )
-            );
-
-            // get uniform random number from seed
-            auto rng = nvidia::rng::create(
-                nvidia::rng::methods::Xor< T_Acc >( acc, seed, globalUniqueId ),
-                nvidia::rng::distributions::Uniform_float::get( acc )
-            );
-
-            ForEachIdx<
-                IdxConfig<
-                    cellsPerSuperCell,
-                    numWorkers
-                >
-            >{ workerIdx }(
-                [&](
-                    uint32_t const linearIdx,
-                    uint32_t const
-                )
-                {
+            /** initialize each cell
+             *
+             * @tparam T_BoxWriteOnly PMacc::DataBox, box type of the new grid data
+             * @tparam T_Mapping mapping functor type
+             *
+             * @param buffWrite buffer the initial cell data is written to
+             * @param seed random number generator seed
+             * @param threshold threshold to activate a cell, range [0.0;1.0]
+             *                  if random number is <= threshold then the cell will
+             *                  be activated
+             * @param mapper functor to map a block to a supercell
+             */
+            template<typename T_BoxWriteOnly, typename T_Mapping, typename T_Acc>
+            DINLINE void operator()(
+                T_Acc const& acc,
+                T_BoxWriteOnly& buffWrite,
+                uint32_t const seed,
+                float const threshold,
+                T_Mapping const& mapper) const
+            {
+                using namespace mappings::threads;
+
+                using SuperCellSize = typename T_Mapping::SuperCellSize;
+                constexpr uint32_t cellsPerSuperCell = pmacc::math::CT::volume<SuperCellSize>::type::value;
+                constexpr uint32_t numWorkers = T_numWorkers;
+                uint32_t const workerIdx = cupla::threadIdx(acc).x;
+
+                // get position in grid in units of SuperCells from blockID
+                Space const block(mapper.getSuperCellIndex(Space(cupla::blockIdx(acc))));
+                // convert position in unit of cells
+                Space const blockCell = block * T_Mapping::SuperCellSize::toRT();
+                // convert CUDA dim3 to DataSpace<DIM3>
+                Space const threadIndex(cupla::threadIdx(acc));
+
+                uint32_t const globalUniqueId = DataSpaceOperations<DIM2>::map(
+                    mapper.getGridSuperCells() * T_Mapping::SuperCellSize::toRT(),
+                    blockCell + DataSpaceOperations<DIM2>::template map<SuperCellSize>(workerIdx));
+
+                // create a random number state and generator
+                using RngMethod = random::methods::XorMin<T_Acc>;
+                using State = typename RngMethod::StateType;
+                State state;
+                RngMethod method;
+                method.init(acc, state, seed, globalUniqueId);
+                using Distribution = random::distributions::Uniform<float, RngMethod>;
+                using Random = random::Random<Distribution, RngMethod, State*>;
+                Random rng(&state);
+
+                ForEachIdx<IdxConfig<cellsPerSuperCell, numWorkers>>{
+                    workerIdx}([&](uint32_t const linearIdx, uint32_t const) {
                     // cell index within the superCell
-                    DataSpace< DIM2 > const cellIdx = DataSpaceOperations< DIM2 >::template map< SuperCellSize >( linearIdx );
+                    DataSpace<DIM2> const cellIdx = DataSpaceOperations<DIM2>::template map<SuperCellSize>(linearIdx);
                     // write 1(white) if uniform random number 0<rng<1 is smaller than 'threshold'
-                    buffWrite( blockCell + cellIdx ) = static_cast< bool >( rng() <= threshold );
-                }
-            );
-        }
-    };
-} // namespace kernel
+                    buffWrite(blockCell + cellIdx) = static_cast<bool>(rng(acc) <= threshold);
+                });
+            }
+        };
+    } // namespace kernel
 
-    template< typename T_MappingDesc >
+    template<typename T_MappingDesc>
     struct Evolution
     {
-        std::unique_ptr< T_MappingDesc > mapping;
+        std::unique_ptr<T_MappingDesc> mapping;
         uint32_t rule;
 
-        Evolution( uint32_t rule ) : rule( rule )
+        Evolution(uint32_t rule) : rule(rule)
         {
-
         }
 
-        void init(
-            Space const & layout,
-            Space const & guardSize
-        )
+        void init(Space const& layout, Space const& guardSize)
         {
-            mapping = memory::makeUnique< T_MappingDesc >(
-                layout,
-                guardSize
-            );
+            mapping = std::make_unique<T_MappingDesc>(layout, guardSize);
         }
 
-        template< typename DBox >
-        void initEvolution(
-            DBox const & writeBox,
-            float const fraction
-        )
+        template<typename DBox>
+        void initEvolution(DBox const& writeBox, float const fraction)
         {
-            AreaMapping <
-                CORE + BORDER,
-                T_MappingDesc
-            > mapper( *mapping );
-            constexpr uint32_t numWorkers = traits::GetNumWorkers<
-                math::CT::volume< typename T_MappingDesc::SuperCellSize >::type::value
-            >::value;
+            AreaMapping<CORE + BORDER, T_MappingDesc> mapper(*mapping);
+            constexpr uint32_t numWorkers
+                = traits::GetNumWorkers<math::CT::volume<typename T_MappingDesc::SuperCellSize>::type::value>::value;
 
-            GridController< DIM2 >& gc = Environment< DIM2 >::get( ).GridController( );
-            uint32_t seed = gc.getGlobalSize( ) + gc.getGlobalRank( );
+            GridController<DIM2>& gc = Environment<DIM2>::get().GridController();
+            uint32_t seed = gc.getGlobalSize() + gc.getGlobalRank();
 
-            PMACC_KERNEL( kernel::RandomInit< numWorkers >{ } )(
-                mapper.getGridDim( ),
-                numWorkers
-            )(
-                writeBox,
-                seed,
-                fraction,
-                mapper
-            );
+            PMACC_KERNEL(kernel::RandomInit<numWorkers>{})
+            (mapper.getGridDim(), numWorkers)(writeBox, seed, fraction, mapper);
         }
 
-        template<
-            uint32_t Area,
-            typename DBox
-        >
-        void run(
-            DBox const & readBox,
-            DBox const & writeBox
-        )
+        template<uint32_t Area, typename DBox>
+        void run(DBox const& readBox, DBox const& writeBox)
         {
-            AreaMapping <
-                Area,
-                T_MappingDesc
-            > mapper( *mapping );
-            constexpr uint32_t numWorkers = traits::GetNumWorkers<
-                math::CT::volume< typename T_MappingDesc::SuperCellSize >::type::value
-            >::value;
+            AreaMapping<Area, T_MappingDesc> mapper(*mapping);
+            constexpr uint32_t numWorkers
+                = traits::GetNumWorkers<math::CT::volume<typename T_MappingDesc::SuperCellSize>::type::value>::value;
 
-            PMACC_KERNEL( kernel::Evolution< numWorkers >{ } )(
-                mapper.getGridDim( ),
-                numWorkers
-            )(
-                readBox,
-                writeBox,
-                rule,
-                mapper
-            );
+            PMACC_KERNEL(kernel::Evolution<numWorkers>{})
+            (mapper.getGridDim(), numWorkers)(readBox, writeBox, rule, mapper);
         }
     };
 
diff --git a/share/pmacc/examples/gameOfLife2D/include/GatherSlice.hpp b/share/pmacc/examples/gameOfLife2D/include/GatherSlice.hpp
index d33ed5c18d..6ee1644067 100644
--- a/share/pmacc/examples/gameOfLife2D/include/GatherSlice.hpp
+++ b/share/pmacc/examples/gameOfLife2D/include/GatherSlice.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Axel Huebl, Heiko Burau, Rene Widera,
+/* Copyright 2013-2021 Axel Huebl, Heiko Burau, Rene Widera,
  *                     Maximilian Knespel, Benjamin Worpitz
  *
  * This file is part of PMacc.
@@ -24,210 +24,210 @@
 #include <pmacc/mappings/simulation/GridController.hpp>
 #include <pmacc/memory/boxes/PitchedBox.hpp>
 #include <pmacc/dimensions/DataSpace.hpp>
-#include <pmacc/types.hpp>                                  // DIM*
+#include <pmacc/types.hpp> // DIM*
 
 #include <mpi.h>
 
 namespace gol
 {
-using namespace pmacc;
+    using namespace pmacc;
 
-struct MessageHeader
-{
-
-    MessageHeader()
-    {
-    }
-
-    MessageHeader(Space simSize, GridLayout<DIM2> layout, Space nodeOffset) :
-    simSize(simSize),
-    nodeOffset(nodeOffset)
+    struct MessageHeader
     {
-        nodeSize = layout.getDataSpace();
-        nodePictureSize = layout.getDataSpaceWithoutGuarding();
-        nodeGuardCells = layout.getGuard();
-    }
-
-    Space simSize;
-    Space nodeSize;
-    Space nodePictureSize;
-    Space nodeGuardCells;
-    Space nodeOffset;
-
-};
-
-struct GatherSlice
-{
-
-    GatherSlice() : mpiRank(-1), numRanks(0), filteredData(nullptr), fullData(nullptr), isMPICommInitialized(false)
-    {
-    }
+        MessageHeader()
+        {
+        }
 
-    ~GatherSlice()
-    {
+        MessageHeader(Space simSize, GridLayout<DIM2> layout, Space nodeOffset)
+            : simSize(simSize)
+            , nodeOffset(nodeOffset)
+        {
+            nodeSize = layout.getDataSpace();
+            nodePictureSize = layout.getDataSpaceWithoutGuarding();
+            nodeGuardCells = layout.getGuard();
+        }
 
-    }
+        Space simSize;
+        Space nodeSize;
+        Space nodePictureSize;
+        Space nodeGuardCells;
+        Space nodeOffset;
+    };
 
-    void finalize()
+    struct GatherSlice
     {
-        if (filteredData != nullptr)
-        {
-            delete[] filteredData;
-            filteredData=nullptr;
-        }
-        if (fullData != nullptr)
+        GatherSlice() : mpiRank(-1), numRanks(0), filteredData(nullptr), fullData(nullptr), isMPICommInitialized(false)
         {
-            delete[] fullData;
-            fullData=nullptr;
         }
-        if (isMPICommInitialized)
+
+        ~GatherSlice()
         {
-            MPI_Comm_free(&comm);
-            isMPICommInitialized=false;
         }
-        mpiRank=-1;
-    }
-
-    /*
-     * Saves the message header and creates a new MPI group with all ranks
-     * that called this with isActive = true
-     * @return true if the current rank is the master of the new MPI group
-     */
-    bool init(const MessageHeader mHeader, bool isActive)
-    {
-        header = mHeader;
 
-        int countRanks = Environment<DIM2>::get().GridController().getGpuNodes().productOfComponents();
-        std::vector<int> gatherRanks(countRanks);
-        std::vector<int> groupRanks(countRanks);
-        mpiRank = Environment<DIM2>::get().GridController().getGlobalRank();
-        if (!isActive)
-            mpiRank = -1;
-
-        // avoid deadlock between not finished pmacc tasks and mpi blocking collectives
-        __getTransactionEvent().waitForFinished();
-        MPI_CHECK(MPI_Allgather(&mpiRank, 1, MPI_INT, &gatherRanks[0], 1, MPI_INT, MPI_COMM_WORLD));
-
-        for (int i = 0; i < countRanks; ++i)
+        void finalize()
         {
-            if (gatherRanks[i] != -1)
+            if(filteredData != nullptr)
+            {
+                delete[] filteredData;
+                filteredData = nullptr;
+            }
+            if(fullData != nullptr)
+            {
+                delete[] fullData;
+                fullData = nullptr;
+            }
+            if(isMPICommInitialized)
             {
-                groupRanks[numRanks] = gatherRanks[i];
-                numRanks++;
+                MPI_Comm_free(&comm);
+                isMPICommInitialized = false;
             }
+            mpiRank = -1;
         }
 
-        // avoid deadlock between not finished pmacc tasks and mpi blocking collectives
-        __getTransactionEvent().waitForFinished();
-        MPI_Group group;
-        MPI_Group newgroup;
-        MPI_CHECK(MPI_Comm_group(MPI_COMM_WORLD, &group));
-        MPI_CHECK(MPI_Group_incl(group, numRanks, &groupRanks[0], &newgroup));
-
-        MPI_CHECK(MPI_Comm_create(MPI_COMM_WORLD, newgroup, &comm));
-
-        if (mpiRank != -1)
+        /*
+         * Saves the message header and creates a new MPI group with all ranks
+         * that called this with isActive = true
+         * @return true if the current rank is the master of the new MPI group
+         */
+        bool init(const MessageHeader mHeader, bool isActive)
         {
-            MPI_Comm_rank(comm, &mpiRank);
-            isMPICommInitialized = true;
-        }
-
-        return mpiRank == 0;
-    }
+            header = mHeader;
 
-    template<class Box >
-    Box operator()(Box data)
-    {
-        typedef typename Box::ValueType ValueType;
+            int countRanks = Environment<DIM2>::get().GridController().getGpuNodes().productOfComponents();
+            std::vector<int> gatherRanks(countRanks);
+            std::vector<int> groupRanks(countRanks);
+            mpiRank = Environment<DIM2>::get().GridController().getGlobalRank();
+            if(!isActive)
+                mpiRank = -1;
 
-        Box dstBox = Box(PitchedBox<ValueType, DIM2 > (
-                                                       (ValueType*) filteredData,
-                                                       Space(),
-                                                       header.simSize,
-                                                       header.simSize.x() * sizeof (ValueType)
-                                                       ));
-        MessageHeader mHeader;
-        MessageHeader* fakeHeader = &mHeader;
-        memcpy(fakeHeader, &header, sizeof(MessageHeader));
+            // avoid deadlock between not finished pmacc tasks and mpi blocking collectives
+            __getTransactionEvent().waitForFinished();
+            MPI_CHECK(MPI_Allgather(&mpiRank, 1, MPI_INT, &gatherRanks[0], 1, MPI_INT, MPI_COMM_WORLD));
 
-        char* recvHeader = new char[ sizeof(MessageHeader)* numRanks];
-
-        if (fullData == nullptr && mpiRank == 0)
-            fullData = (char*) new ValueType[header.nodeSize.productOfComponents() * numRanks];
-
-        // avoid deadlock between not finished pmacc tasks and mpi blocking collectives
-        __getTransactionEvent().waitForFinished();
-        MPI_CHECK(MPI_Gather(fakeHeader, sizeof(MessageHeader), MPI_CHAR, recvHeader, sizeof(MessageHeader),
-                             MPI_CHAR, 0, comm));
+            for(int i = 0; i < countRanks; ++i)
+            {
+                if(gatherRanks[i] != -1)
+                {
+                    groupRanks[numRanks] = gatherRanks[i];
+                    numRanks++;
+                }
+            }
 
-        const size_t elementsCount = header.nodeSize.productOfComponents() * sizeof (ValueType);
+            // avoid deadlock between not finished pmacc tasks and mpi blocking collectives
+            __getTransactionEvent().waitForFinished();
+            MPI_Group group;
+            MPI_Group newgroup;
+            MPI_CHECK(MPI_Comm_group(MPI_COMM_WORLD, &group));
+            MPI_CHECK(MPI_Group_incl(group, numRanks, &groupRanks[0], &newgroup));
 
-        MPI_CHECK(MPI_Gather(
-                             (char*) (data.getPointer()), elementsCount, MPI_CHAR,
-                             fullData, elementsCount, MPI_CHAR,
-                             0, comm));
+            MPI_CHECK(MPI_Comm_create(MPI_COMM_WORLD, newgroup, &comm));
 
+            if(mpiRank != -1)
+            {
+                MPI_Comm_rank(comm, &mpiRank);
+                isMPICommInitialized = true;
+            }
 
+            return mpiRank == 0;
+        }
 
-        if (mpiRank == 0)
+        template<class Box>
+        Box operator()(Box data)
         {
-            if (filteredData == nullptr)
-                filteredData = (char*) new ValueType[header.simSize.productOfComponents()];
-
-            /*create box with valid memory*/
-            dstBox = Box(PitchedBox<ValueType, DIM2 > (
-                                                       (ValueType*) filteredData,
-                                                       Space(),
-                                                       header.simSize,
-                                                       header.simSize.x() * sizeof (ValueType)
-                                                       ));
-
-
-            for (int i = 0; i < numRanks; ++i)
+            typedef typename Box::ValueType ValueType;
+
+            Box dstBox = Box(PitchedBox<ValueType, DIM2>(
+                (ValueType*) filteredData,
+                Space(),
+                header.simSize,
+                header.simSize.x() * sizeof(ValueType)));
+            MessageHeader mHeader;
+            MessageHeader* fakeHeader = &mHeader;
+            memcpy(fakeHeader, &header, sizeof(MessageHeader));
+
+            char* recvHeader = new char[sizeof(MessageHeader) * numRanks];
+
+            if(fullData == nullptr && mpiRank == 0)
+                fullData = (char*) new ValueType[header.nodeSize.productOfComponents() * numRanks];
+
+            // avoid deadlock between not finished pmacc tasks and mpi blocking collectives
+            __getTransactionEvent().waitForFinished();
+            MPI_CHECK(MPI_Gather(
+                fakeHeader,
+                sizeof(MessageHeader),
+                MPI_CHAR,
+                recvHeader,
+                sizeof(MessageHeader),
+                MPI_CHAR,
+                0,
+                comm));
+
+            const size_t elementsCount = header.nodeSize.productOfComponents() * sizeof(ValueType);
+
+            MPI_CHECK(MPI_Gather(
+                (char*) (data.getPointer()),
+                elementsCount,
+                MPI_CHAR,
+                fullData,
+                elementsCount,
+                MPI_CHAR,
+                0,
+                comm));
+
+
+            if(mpiRank == 0)
             {
-                MessageHeader* head = (MessageHeader*) (recvHeader + sizeof(MessageHeader)* i);
-                size_t offset = header.nodeSize.productOfComponents() * static_cast<size_t>(i);
-                Box srcBox = Box(PitchedBox<ValueType, DIM2 > (
-                                                               reinterpret_cast<ValueType*>(fullData) + offset,
-                                                               Space(),
-                                                               head->nodeSize,
-                                                               head->nodeSize.x() * sizeof (ValueType)
-                                                               ));
-
-                insertData(dstBox, srcBox, head->nodeOffset, head->nodePictureSize, head->nodeGuardCells);
+                if(filteredData == nullptr)
+                    filteredData = (char*) new ValueType[header.simSize.productOfComponents()];
+
+                /*create box with valid memory*/
+                dstBox = Box(PitchedBox<ValueType, DIM2>(
+                    (ValueType*) filteredData,
+                    Space(),
+                    header.simSize,
+                    header.simSize.x() * sizeof(ValueType)));
+
+
+                for(int i = 0; i < numRanks; ++i)
+                {
+                    MessageHeader* head = (MessageHeader*) (recvHeader + sizeof(MessageHeader) * i);
+                    size_t offset = header.nodeSize.productOfComponents() * static_cast<size_t>(i);
+                    Box srcBox = Box(PitchedBox<ValueType, DIM2>(
+                        reinterpret_cast<ValueType*>(fullData) + offset,
+                        Space(),
+                        head->nodeSize,
+                        head->nodeSize.x() * sizeof(ValueType)));
+
+                    insertData(dstBox, srcBox, head->nodeOffset, head->nodePictureSize, head->nodeGuardCells);
+                }
             }
 
-        }
-
-        delete[] recvHeader;
+            delete[] recvHeader;
 
-        return dstBox;
-    }
+            return dstBox;
+        }
 
-    template<class DstBox, class SrcBox>
-    void insertData(DstBox& dst, const SrcBox& src, Space offsetToSimNull, Space srcSize, Space nodeGuardCells)
-    {
-        for (int y = 0; y < srcSize.y(); ++y)
+        template<class DstBox, class SrcBox>
+        void insertData(DstBox& dst, const SrcBox& src, Space offsetToSimNull, Space srcSize, Space nodeGuardCells)
         {
-            for (int x = 0; x < srcSize.x(); ++x)
+            for(int y = 0; y < srcSize.y(); ++y)
             {
-                dst[y + offsetToSimNull.y()][x + offsetToSimNull.x()] =
-                    src[nodeGuardCells.y() + y][nodeGuardCells.x() + x];
+                for(int x = 0; x < srcSize.x(); ++x)
+                {
+                    dst[y + offsetToSimNull.y()][x + offsetToSimNull.x()]
+                        = src[nodeGuardCells.y() + y][nodeGuardCells.x() + x];
+                }
             }
         }
-    }
-
-private:
-
-    char* filteredData;
-    char* fullData;
-    MPI_Comm comm;
-    int mpiRank;
-    int numRanks;
-    bool isMPICommInitialized;
-    MessageHeader header;
-};
-
-}//namespace
-
 
+    private:
+        char* filteredData;
+        char* fullData;
+        MPI_Comm comm;
+        int mpiRank;
+        int numRanks;
+        bool isMPICommInitialized;
+        MessageHeader header;
+    };
+
+} // namespace gol
diff --git a/share/pmacc/examples/gameOfLife2D/include/PngCreator.hpp b/share/pmacc/examples/gameOfLife2D/include/PngCreator.hpp
index e16207faf5..061add1bf0 100644
--- a/share/pmacc/examples/gameOfLife2D/include/PngCreator.hpp
+++ b/share/pmacc/examples/gameOfLife2D/include/PngCreator.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Heiko Burau, Rene Widera
+/* Copyright 2013-2021 Heiko Burau, Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -25,12 +25,10 @@
 
 namespace gol
 {
-
     struct PngCreator
     {
-
         template<class DBox>
-        void operator() (uint32_t currentStep, DBox data, Space dataSize)
+        void operator()(uint32_t currentStep, DBox data, Space dataSize)
         {
             std::stringstream step;
             step << std::setw(6) << std::setfill('0') << currentStep;
@@ -38,11 +36,11 @@ namespace gol
             pngwriter png(dataSize.x(), dataSize.y(), 0, filename.c_str());
             png.setcompressionlevel(9);
 
-            for (int y = 0; y < dataSize.y(); ++y)
+            for(int y = 0; y < dataSize.y(); ++y)
             {
-                for (int x = 0; x < dataSize.x(); ++x)
+                for(int x = 0; x < dataSize.x(); ++x)
                 {
-                    float p = data[y ][x ];
+                    float p = data[y][x];
                     png.plot(x + 1, dataSize.y() - y, p, p, p);
                 }
             }
@@ -50,5 +48,4 @@ namespace gol
         }
     };
 
-}
-
+} // namespace gol
diff --git a/share/pmacc/examples/gameOfLife2D/include/Simulation.hpp b/share/pmacc/examples/gameOfLife2D/include/Simulation.hpp
index 7ce13e9557..66f777a92c 100644
--- a/share/pmacc/examples/gameOfLife2D/include/Simulation.hpp
+++ b/share/pmacc/examples/gameOfLife2D/include/Simulation.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Rene Widera, Maximilian Knespel, Alexander Grund
+/* Copyright 2013-2021 Rene Widera, Maximilian Knespel, Alexander Grund
  *
  * This file is part of PMacc.
  *
@@ -41,198 +41,195 @@
 
 namespace gol
 {
-
-class Simulation
-{
-private:
-    /* math::CT::Int<16,16> is arbitrarily chosen SuperCellSize! */
-    typedef MappingDescription<DIM2, math::CT::Int< 16, 16 > > MappingDesc;
-    typedef Evolution<MappingDesc> Evolutiontype;
-
-    Space gridSize;
-    /* holds rule mask derived from 23/3 input, \see Evolution.hpp */
-    Evolutiontype evo;
-    GatherSlice gather;
-
-    /* for storing black (dead) and white (alive) data for gol */
-    Buffer* buff1; /* Buffer(\see types.h) for swapping between old and new world */
-    Buffer* buff2; /* like evolve(buff2 &, const buff1) would work internally */
-    uint32_t steps;
-
-    bool isMaster;
-
-public:
-
-    Simulation(uint32_t rule, int32_t steps, Space gridSize, Space devices, Space periodic) :
-    evo(rule), steps(steps), gridSize(gridSize), isMaster(false), buff1(nullptr), buff2(nullptr)
-    {
-        /* -First this initializes the GridController with number of 'devices'*
-         *  and 'periodic'ity. The init-routine will then create and manage   *
-         *  the MPI processes and communication group and topology.           *
-         * -Second the cudaDevices will be allocated to the corresponding     *
-         *  Host MPI processes where hostRank == deviceNumber, if the device  *
-         *  is not marked to be used exclusively by another process. This     *
-         *  affects: cudaMalloc,cudaKernelLaunch,                             *
-         * -Then the CUDA Stream Controller is activated and one stream is    *
-         *  added. It's basically a List of cudaStreams. Used to parallelize  *
-         *  Memory transfers and calculations.                                *
-         * -Initialize TransactionManager                                     */
-        Environment<DIM2>::get().initDevices(devices, periodic);
-
-        /* Now we have allocated every node to a grid position in the GC. We  *
-         * use that grid position to allocate every node to a position in the *
-         * physic grid. Using the localGridSize = the number of cells per     *
-         * node = number of cells / nodes, we can get the position of the     *
-         * current node as an offset in numbers of cells                      */
-        GridController<DIM2> & gc = Environment<DIM2>::get().GridController();
-        Space localGridSize(gridSize / devices);
-
-        /* - This forwards arguments to SubGrid.init()                        *
-         * - Create Singletons: EnvironmentController, DataConnector,         *
-         *                      PluginConnector, nvidia::memory::MemoryInfo   */
-        Environment<DIM2>::get().initGrids( gridSize, localGridSize,
-                                            gc.getPosition() * localGridSize);
-    }
-
-    virtual ~Simulation()
-    {
-    }
-
-    void finalize()
-    {
-        gather.finalize();
-        __delete(buff1);
-        __delete(buff2);
-    }
-
-    void init()
+    class Simulation
     {
-        /* subGrid holds global and
-         * local SimulationSize and where the local SimArea is in the greater
-         * scheme using Offsets from global LEFT, TOP, FRONT
-         */
-        const SubGrid<DIM2>& subGrid = Environment<DIM2>::get().SubGrid();
-
-        /* The following sets up the local layout which consists of the actual
-         * grid cells and some surrounding cells, called guards.
-         *
-         * ASCII Visualization: example taken for 1D,
-         * distributed over 2 GPUs, only 1 border shown between those two GPUs
-         * assuming non-periodic boundary conditions.
-         * In a N-GPU or periodic example, border cells guard cells exist in each direction.
-         * _______GPU 0________       _______GPU 1________
-         * | 0 | 1 | 2 | 3 | 4 |      | 3 | 4 | 5 | 6 | 7 |  <-- Global (super)cell idx
-         * |___|___|___|___|___|      |___|___|___|___|___|
-         * |___Core____|Bor|Gua|      |Gua|Bor|___Core____|
-         * |___________|der|rd_|      |rd_|der|___________|
-         * |__"real" cells_|***|      |***|__"real" cells_|
-         *
-         * |***| Clones cells which correspond to the border cells of the neighbor GPU
-         *       (sometimes also called "ghost" or "halo" cells/region)
-         *
-         * Recall that the following is defined:
-         *     typedef MappingDescription<DIM2, math::CT::Int<16,16> > MappingDesc;
-         * where math::CT::Int<16,16> is arbitrarily(!) chosen SuperCellSize
-         * and DIM2 is the dimension of the grid.
-         * Expression of 2nd argument translates to DataSpace<DIM3>(16,16,0).
-         * This is the guard size (here set to be one Supercell wide in all
-         * directions). Meaning we have 16*16*(2*grid.x+2*grid.y+4) more
-         * cells in GridLayout than in the SubGrid.
-         * The formula above is SuperCellSize * TotalNumGuardCells with (in this case)
-         * SuperCellSize = 16*16 (16 cells in 2 dimensions)
-         * TotalNumGuardCells =   2 * grid.x (top and bottom)
-         *                      + 2 * grid.y (left and right)
-         *                      + 4          (the corners)
-         */
-        GridLayout<DIM2> layout( subGrid.getLocalDomain().size,
-                                 MappingDesc::SuperCellSize::toRT());
-
-        /* getDataSpace will return DataSpace( grid.x +16+16, grid.y +16+16)  *
-         * MappingDesc stores the layout regarding Core, Border and Guard     *
-         * in units of SuperCells.                                            *
-         * This is saved by init to be used by the kernel to identify itself. */
-        evo.init(layout.getDataSpace(), Space::create(1));
-
-        buff1 = new Buffer(layout, false);
-        buff2 = new Buffer(layout, false);
-
-        /* Set up the future data exchange. In this case we need to copy the
-         * border cells of our neighbors to our guard cells, since we only read
-         * from the guard cells but never write to it.
-         * guardingCells holds the number of guard(super)cells in each dimension
-         */
-        Space guardingCells(1, 1);
-        for (uint32_t i = 1; i < traits::NumberOfExchanges<DIM2>::value; ++i)
+    private:
+        /* math::CT::Int<16,16> is arbitrarily chosen SuperCellSize! */
+        typedef MappingDescription<DIM2, math::CT::Int<16, 16>> MappingDesc;
+        typedef Evolution<MappingDesc> Evolutiontype;
+
+        Space gridSize;
+        /* holds rule mask derived from 23/3 input, \see Evolution.hpp */
+        Evolutiontype evo;
+        GatherSlice gather;
+
+        /* for storing black (dead) and white (alive) data for gol */
+        Buffer* buff1; /* Buffer(\see types.h) for swapping between old and new world */
+        Buffer* buff2; /* like evolve(buff2 &, const buff1) would work internally */
+        uint32_t steps;
+
+        bool isMaster;
+
+    public:
+        Simulation(uint32_t rule, int32_t steps, Space gridSize, Space devices, Space periodic)
+            : gridSize(gridSize) /* initializers are listed in member declaration order */
+            , evo(rule)
+            , buff1(nullptr)
+            , buff2(nullptr)
+            , steps(steps)
+            , isMaster(false)
         {
-            /* to check which number corresponds to which direction, you can  *
-             * use the following member of class Mask like done in the two    *
-             * lines below:                                                   *
-             * DataSpace<DIM2>relVec = Mask::getRelativeDirections<DIM2>(i);  *
-             * std::cout << "Direction:" << i << " => Vec: (" << relVec[0]    *
-             *           << "," << relVec[1] << ")\n";                        *
-             * The result is: 1:right(1,0), 2:left(-1,0), 3:up(0,1),          *
-             *    4:up right(1,1), 5:(-1,1), 6:(0,-1), 7:(1,-1), 8:(-1,-1)    */
-
-            /* types.hpp: enum CommunicationTags{ BUFF1 = 0u, BUFF2 = 1u };   */
-            buff1->addExchange(GUARD, Mask(i), guardingCells, BUFF1);
-            buff2->addExchange(GUARD, Mask(i), guardingCells, BUFF2);
+            /* -First this initializes the GridController with number of 'devices'*
+             *  and 'periodic'ity. The init-routine will then create and manage   *
+             *  the MPI processes and communication group and topology.           *
+             * -Second the cudaDevices will be allocated to the corresponding     *
+             *  Host MPI processes where hostRank == deviceNumber, if the device  *
+             *  is not marked to be used exclusively by another process. This     *
+             *  affects: cudaMalloc,cudaKernelLaunch,                             *
+             * -Then the CUDA Stream Controller is activated and one stream is    *
+             *  added. It's basically a List of cudaStreams. Used to parallelize  *
+             *  Memory transfers and calculations.                                *
+             * -Initialize TransactionManager                                     */
+            Environment<DIM2>::get().initDevices(devices, periodic);
+
+            /* Now we have allocated every node to a grid position in the GC. We  *
+             * use that grid position to allocate every node to a position in the *
+             * physical grid. Using the localGridSize = the number of cells per   *
+             * node = number of cells / nodes, we can get the position of the     *
+             * current node as an offset in numbers of cells                      */
+            GridController<DIM2>& gc = Environment<DIM2>::get().GridController();
+            Space localGridSize(gridSize / devices);
+
+            /* - This forwards arguments to SubGrid.init()                        *
+             * - Create Singletons: EnvironmentController, DataConnector,         *
+             *                      PluginConnector, nvidia::memory::MemoryInfo   */
+            Environment<DIM2>::get().initGrids(gridSize, localGridSize, gc.getPosition() * localGridSize);
         }
 
-         /* Both next lines are defined in GatherSlice.hpp:                   *
-          *  -gather saves the MessageHeader object                           *
-          *  -Then do an Allgather for the gloabalRanks from GC, sort out     *
-          *  -inactive processes (second/boolean ,argument in gather.init) and*
-          *   save new MPI_COMMUNICATOR created from these into private var.  *
-          *  -return if rank == 0                                             */
-        MessageHeader header(gridSize, layout, subGrid.getLocalDomain().offset);
-        isMaster = gather.init(header, true);
+        virtual ~Simulation()
+        {
+        }
 
-        /* Calls kernel to initialize random generator. Game of Life is then  *
-         * initialized using uniform random numbers. With 10% (second arg)    *
-         * white points. World will be written to buffer in first argument    */
-        evo.initEvolution(buff1->getDeviceBuffer().getDataBox(), 0.1);
+        void finalize()
+        {
+            gather.finalize();
+            __delete(buff1);
+            __delete(buff2);
+        }
 
-    }
+        void init()
+        {
+            /* subGrid holds global and
+             * local SimulationSize and where the local SimArea is in the greater
+             * scheme using Offsets from global LEFT, TOP, FRONT
+             */
+            const SubGrid<DIM2>& subGrid = Environment<DIM2>::get().SubGrid();
+
+            /* The following sets up the local layout which consists of the actual
+             * grid cells and some surrounding cells, called guards.
+             *
+             * ASCII Visualization: example taken for 1D,
+             * distributed over 2 GPUs, only 1 border shown between those two GPUs
+             * assuming non-periodic boundary conditions.
+             * In an N-GPU or periodic example, border and guard cells exist in each direction.
+             * _______GPU 0________       _______GPU 1________
+             * | 0 | 1 | 2 | 3 | 4 |      | 3 | 4 | 5 | 6 | 7 |  <-- Global (super)cell idx
+             * |___|___|___|___|___|      |___|___|___|___|___|
+             * |___Core____|Bor|Gua|      |Gua|Bor|___Core____|
+             * |___________|der|rd_|      |rd_|der|___________|
+             * |__"real" cells_|***|      |***|__"real" cells_|
+             *
+             * |***| Clones cells which correspond to the border cells of the neighbor GPU
+             *       (sometimes also called "ghost" or "halo" cells/region)
+             *
+             * Recall that the following is defined:
+             *     typedef MappingDescription<DIM2, math::CT::Int<16,16> > MappingDesc;
+             * where math::CT::Int<16,16> is arbitrarily(!) chosen SuperCellSize
+             * and DIM2 is the dimension of the grid.
+             * Expression of 2nd argument translates to DataSpace<DIM3>(16,16,0).
+             * This is the guard size (here set to be one Supercell wide in all
+             * directions). Meaning we have 16*16*(2*grid.x+2*grid.y+4) more
+             * cells in GridLayout than in the SubGrid.
+             * The formula above is SuperCellSize * TotalNumGuardCells with (in this case)
+             * SuperCellSize = 16*16 (16 cells in 2 dimensions)
+             * TotalNumGuardCells =   2 * grid.x (top and bottom)
+             *                      + 2 * grid.y (left and right)
+             *                      + 4          (the corners)
+             */
+            GridLayout<DIM2> layout(subGrid.getLocalDomain().size, MappingDesc::SuperCellSize::toRT());
+
+            /* getDataSpace will return DataSpace( grid.x +16+16, grid.y +16+16)  *
+             * MappingDesc stores the layout regarding Core, Border and Guard     *
+             * in units of SuperCells.                                            *
+             * This is saved by init to be used by the kernel to identify itself. */
+            evo.init(layout.getDataSpace(), Space::create(1));
+
+            buff1 = new Buffer(layout, false);
+            buff2 = new Buffer(layout, false);
+
+            /* Set up the future data exchange. In this case we need to copy the
+             * border cells of our neighbors to our guard cells, since we only read
+             * from the guard cells but never write to them.
+             * guardingCells holds the number of guard(super)cells in each dimension
+             */
+            Space guardingCells(1, 1);
+            for(uint32_t i = 1; i < traits::NumberOfExchanges<DIM2>::value; ++i)
+            {
+                /* to check which number corresponds to which direction, you can  *
+                 * use the following member of class Mask like done in the two    *
+                 * lines below:                                                   *
+                 * DataSpace<DIM2>relVec = Mask::getRelativeDirections<DIM2>(i);  *
+                 * std::cout << "Direction:" << i << " => Vec: (" << relVec[0]    *
+                 *           << "," << relVec[1] << ")\n";                        *
+                 * The result is: 1:right(1,0), 2:left(-1,0), 3:up(0,1),          *
+                 *    4:up right(1,1), 5:(-1,1), 6:(0,-1), 7:(1,-1), 8:(-1,-1)    */
+
+                /* types.hpp: enum CommunicationTags{ BUFF1 = 0u, BUFF2 = 1u };   */
+                buff1->addExchange(GUARD, Mask(i), guardingCells, BUFF1);
+                buff2->addExchange(GUARD, Mask(i), guardingCells, BUFF2);
+            }
+
+            /* Both next lines are defined in GatherSlice.hpp:                   *
+             *  -gather saves the MessageHeader object                           *
+             *  -Then do an Allgather for the globalRanks from GC, sort out      *
+             *   inactive processes (second/boolean argument in gather.init) and *
+             *   save new MPI_COMMUNICATOR created from these into private var.  *
+             *  -return if rank == 0                                             */
+            MessageHeader header(gridSize, layout, subGrid.getLocalDomain().offset);
+            isMaster = gather.init(header, true);
+
+            /* Calls kernel to initialize random generator. Game of Life is then  *
+             * initialized using uniform random numbers. With 10% (second arg)    *
+             * white points. World will be written to buffer in first argument    */
+            evo.initEvolution(buff1->getDeviceBuffer().getDataBox(), 0.1);
+        }
 
-    void start()
-    {
-        Buffer* read = buff1;
-        Buffer* write = buff2;
-        for (uint32_t i = 0; i < steps; ++i)
+        void start()
         {
-            oneStep(i, read, write);
-            std::swap(read, write);
+            Buffer* read = buff1;
+            Buffer* write = buff2;
+            for(uint32_t i = 0; i < steps; ++i)
+            {
+                oneStep(i, read, write);
+                std::swap(read, write);
+            }
         }
-    }
-private:
 
-    void oneStep(uint32_t currentStep, Buffer* read, Buffer* write)
-    {
-        auto splitEvent = __getTransactionEvent();
-        /* GridBuffer 'read' will use 'splitEvent' to schedule transaction    *
-         * tasks from the Borders of the neighboring areas to the Guards of   *
-         * this local Area added by 'addExchange'. All transactions in        *
-         * Transaction Manager will then be done in parallel to the           *
-         * calculations in the core. In order to synchronize the data         *
-         * transfer for the case the core calculation is finished earlier,    *
-         * GridBuffer.asyncComm returns a transaction handle we can check     */
-        auto send = read->asyncCommunication(splitEvent);
-        evo.run<CORE>( read->getDeviceBuffer().getDataBox(),
-                       write->getDeviceBuffer().getDataBox() );
-        /* Join communication with worker tasks, Now all next tasks run sequential */
-        __setTransactionEvent(send);
-        /* Calculate Borders */
-        evo.run<BORDER>( read->getDeviceBuffer().getDataBox(),
-                         write->getDeviceBuffer().getDataBox() );
-        write->deviceToHost();
-
-        /* gather::operator() gathers all the buffers and assembles those to  *
-         * a complete picture discarding the guards.                          */
-        auto picture = gather(write->getHostBuffer().getDataBox());
-        PngCreator png;
-        if (isMaster) png(currentStep, picture, gridSize);
-
-    }
-
-};
-}
+    private:
+        void oneStep(uint32_t currentStep, Buffer* read, Buffer* write)
+        {
+            auto splitEvent = __getTransactionEvent();
+            /* GridBuffer 'read' will use 'splitEvent' to schedule transaction    *
+             * tasks from the Borders of the neighboring areas to the Guards of   *
+             * this local Area added by 'addExchange'. All transactions in        *
+             * Transaction Manager will then be done in parallel to the           *
+             * calculations in the core. In order to synchronize the data         *
+             * transfer for the case the core calculation is finished earlier,    *
+             * GridBuffer.asyncComm returns a transaction handle we can check     */
+            auto send = read->asyncCommunication(splitEvent);
+            evo.run<CORE>(read->getDeviceBuffer().getDataBox(), write->getDeviceBuffer().getDataBox());
+            /* Join communication with worker tasks; all subsequent tasks now run sequentially */
+            __setTransactionEvent(send);
+            /* Calculate Borders */
+            evo.run<BORDER>(read->getDeviceBuffer().getDataBox(), write->getDeviceBuffer().getDataBox());
+            write->deviceToHost();
+
+            /* gather::operator() gathers all the buffers and assembles those to  *
+             * a complete picture discarding the guards.                          */
+            auto picture = gather(write->getHostBuffer().getDataBox());
+            PngCreator png;
+            if(isMaster)
+                png(currentStep, picture, gridSize);
+        }
+    };
+} // namespace gol
diff --git a/share/pmacc/examples/gameOfLife2D/include/types.hpp b/share/pmacc/examples/gameOfLife2D/include/types.hpp
index 7da90c2324..865d4d186e 100644
--- a/share/pmacc/examples/gameOfLife2D/include/types.hpp
+++ b/share/pmacc/examples/gameOfLife2D/include/types.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Rene Widera
+/* Copyright 2013-2021 Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -30,12 +30,11 @@ namespace gol
 
     typedef DataSpace<DIM2> Space;
     typedef GridController<DIM2> GC;
-    typedef GridBuffer<uint8_t, DIM2 > Buffer;
+    typedef GridBuffer<uint8_t, DIM2> Buffer;
 
     enum CommunicationTags
     {
-        BUFF1 = 0u, BUFF2 = 1u
+        BUFF1 = 0u,
+        BUFF2 = 1u
     };
-}
-
-
+} // namespace gol
diff --git a/share/pmacc/examples/gameOfLife2D/main.cpp b/share/pmacc/examples/gameOfLife2D/main.cpp
index d33dc5abaa..4702f06636 100644
--- a/share/pmacc/examples/gameOfLife2D/main.cpp
+++ b/share/pmacc/examples/gameOfLife2D/main.cpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Rene Widera
+/* Copyright 2013-2021 Rene Widera
  *
  * This file is part of PMacc.
  *
@@ -37,38 +37,43 @@ namespace po = boost::program_options;
  * @param argc count of arguments in argv
  * @param argv arguments of program start
  */
-int main( int argc, char **argv )
+int main(int argc, char** argv)
 {
     typedef ::gol::Space Space;
 
-    std::vector<uint32_t> devices;  /* will be set by boost program argument option "-d 3 3 3" */
+    std::vector<uint32_t> devices; /* will be set by boost program argument option "-d 3 3" */
     std::vector<uint32_t> gridSize; /* same but with -g */
     std::vector<uint32_t> periodic;
     uint32_t steps;
     std::string rule; /* Game of Life Simulation Rules like 23/3 */
 
-    po::options_description desc( "Allowed options" );
-    desc.add_options( )
-            ( "help,h", "produce help message" )
-            ( "steps,s", po::value<uint32_t > ( &steps ), "simulation steps" )
-            ( "rule,r", po::value<std::string > ( &rule ), "simulation rule etc. 23/3" )
-            ( "devices,d", po::value<std::vector<uint32_t> > ( &devices )->multitoken( ),
-              "number of devices in each dimension (only 1D or 2D). If you use more than "
-              "one device in total, you will need to run mpirun with \"mpirun -n "
-              "<DeviceCount.x*DeviceCount.y> ./gameOfLife\"" )
-            ( "grid,g", po::value<std::vector<uint32_t> > ( &gridSize )->multitoken( ),
-              "size of the simulation grid (must be 2D, e.g.: -g 128 128). Because of the border, which is one supercell = 16 cells wide, "
-              "the size in each direction should be greater or equal than 3*16=48 per device, so that the core will be non-empty" )
-            ( "periodic,p", po::value<std::vector<uint32_t> > ( &periodic )->multitoken( ),
-              "specifying whether the grid is periodic (1) or not (0) in each dimension, default: no periodic dimensions" );
+    po::options_description desc("Allowed options");
+    desc.add_options()("help,h", "produce help message")(
+        "steps,s",
+        po::value<uint32_t>(&steps)->default_value(100),
+        "simulation steps")("rule,r", po::value<std::string>(&rule)->default_value("23/3"), "simulation rule")(
+        "devices,d",
+        po::value<std::vector<uint32_t>>(&devices)->multitoken(),
+        "number of devices in each dimension (only 1D or 2D). If you use more than "
+        "one device in total, you will need to run mpirun with \"mpirun -n "
+        "<DeviceCount.x*DeviceCount.y> ./gameOfLife\"")(
+        "grid,g",
+        po::value<std::vector<uint32_t>>(&gridSize)->multitoken(),
+        "size of the simulation grid (must be 2D, e.g.: -g 128 128). Because of the border, which is one supercell = "
+        "16 cells wide, "
+        "the size in each direction should be greater or equal than 3*16=48 per device, so that the core will be "
+        "non-empty")(
+        "periodic,p",
+        po::value<std::vector<uint32_t>>(&periodic)->multitoken(),
+        "specifying whether the grid is periodic (1) or not (0) in each dimension, default: no periodic dimensions");
 
     /* parse command line options and config file and store values in vm */
     po::variables_map vm;
-    po::store( boost::program_options::parse_command_line( argc, argv, desc ), vm );
-    po::notify( vm );
+    po::store(boost::program_options::parse_command_line(argc, argv, desc), vm);
+    po::notify(vm);
 
     /* print help message and quit simulation */
-    if ( vm.count( "help" ) )
+    if(vm.count("help"))
     {
         std::cerr << desc << "\n";
         return false;
@@ -76,62 +81,62 @@ int main( int argc, char **argv )
 
 
     /* fill periodic with 0 */
-    while ( periodic.size( ) < DIM2 )
-        periodic.push_back( 0 );
+    while(periodic.size() < DIM2)
+        periodic.push_back(0);
 
     /* check on correct number of devices. fill with default value 1 for missing dimensions */
-    if ( devices.size( ) > DIM2 )
+    if(devices.size() > DIM2)
     {
         std::cerr << "Invalid number of devices.\nuse [-d dx=1 dy=1 dz=1]" << std::endl;
     }
     else
-        while ( devices.size( ) < DIM2 )
-            devices.push_back( 1 );
+        while(devices.size() < DIM2)
+            devices.push_back(1);
 
     /* check on correct grid size. fill with default grid size value 1 for missing 3. dimension */
-    if ( gridSize.size( ) != DIM2 )
+    if(gridSize.size() != DIM2)
     {
         std::cerr << "Invalid or missing grid size.\nuse -g width height [depth=1]" << std::endl;
-        MPI_CHECK( MPI_Finalize( ) );
+        MPI_CHECK(MPI_Finalize());
         return 0;
     }
 
 
     /* after checking all input values, copy into DataSpace Datatype */
-    Space gpus( devices[0], devices[1] );
-    Space grid( gridSize[0], gridSize[1] );
-    Space endless( periodic[0], periodic[1] );
+    Space gpus(devices[0], devices[1]);
+    Space grid(gridSize[0], gridSize[1]);
+    Space endless(periodic[0], periodic[1]);
 
     uint32_t ruleMask = 0;
-    size_t strLen = rule.length( );
-    size_t gPoint = rule.find( '/' );
-    std::string stayAliveIf = rule.substr( 0, gPoint );
-    std::string newBornIf = rule.substr( gPoint + 1, strLen - gPoint - 1 );
+    size_t strLen = rule.length();
+    size_t gPoint = rule.find('/');
+    std::string stayAliveIf = rule.substr(0, gPoint);
+    std::string newBornIf = rule.substr(gPoint + 1, strLen - gPoint - 1);
 
 
-    for ( unsigned int i = 0; i < newBornIf.length( ); ++i )
+    for(unsigned int i = 0; i < newBornIf.length(); ++i)
     {
-        std::stringstream ss;   /* used for converting const char* "123" to int 123 */
+        std::stringstream ss; /* used for converting const char* "123" to int 123 */
         ss << newBornIf[i];
         int shift;
         ss >> shift;
-        ruleMask = ruleMask | 1 << ( shift + 9 );
+        ruleMask = ruleMask | 1 << (shift + 9);
     }
-    for ( unsigned int i = 0; i < stayAliveIf.length( ); ++i )
+    for(unsigned int i = 0; i < stayAliveIf.length(); ++i)
     {
         std::stringstream ss;
         ss << stayAliveIf[i];
         int shift;
         ss >> shift;
-        ruleMask = ruleMask | 1 << ( shift );
+        ruleMask = ruleMask | 1 << (shift);
     }
     std::cout << "newborn if=" << newBornIf << " stay alive if=" << stayAliveIf << " mask=" << ruleMask << std::endl;
 
     /* start game of life simulation */
-    gol::Simulation sim( ruleMask, steps, grid, gpus, endless );
-    sim.init( );
-    sim.start( );
-    sim.finalize( );
+    gol::Simulation sim(ruleMask, steps, grid, gpus, endless);
+    sim.init();
+    sim.start();
+    sim.finalize();
 
     /* finalize the pmacc context */
     pmacc::Environment<>::get().finalize();
diff --git a/share/pmacc/examples/gameOfLife2D/submit/1.cfg b/share/pmacc/examples/gameOfLife2D/submit/1.cfg
index 6a545b0238..6500c6065c 100644
--- a/share/pmacc/examples/gameOfLife2D/submit/1.cfg
+++ b/share/pmacc/examples/gameOfLife2D/submit/1.cfg
@@ -1,5 +1,5 @@
 #
-# Copyright 2013-2020 Rene Widera
+# Copyright 2013-2021 Rene Widera
 #
 # This file is part of PMacc.
 #
diff --git a/share/pmacc/examples/gameOfLife2D/submit/2.cfg b/share/pmacc/examples/gameOfLife2D/submit/2.cfg
index 0fc18f13af..f957c75d46 100644
--- a/share/pmacc/examples/gameOfLife2D/submit/2.cfg
+++ b/share/pmacc/examples/gameOfLife2D/submit/2.cfg
@@ -1,5 +1,5 @@
 #
-# Copyright 2013-2020 Rene Widera
+# Copyright 2013-2021 Rene Widera
 #
 # This file is part of PMacc.
 #
diff --git a/share/pmacc/examples/gameOfLife2D/submit/4.cfg b/share/pmacc/examples/gameOfLife2D/submit/4.cfg
index 0cdca74ee2..d4832ad50d 100644
--- a/share/pmacc/examples/gameOfLife2D/submit/4.cfg
+++ b/share/pmacc/examples/gameOfLife2D/submit/4.cfg
@@ -1,5 +1,5 @@
 #
-# Copyright 2013-2020 Rene Widera
+# Copyright 2013-2021 Rene Widera
 #
 # This file is part of PMacc.
 #
diff --git a/share/pmacc/examples/gameOfLife2D/submit/bash/bash_mpiexec.tpl b/share/pmacc/examples/gameOfLife2D/submit/bash/bash_mpiexec.tpl
index 7611ba9f2e..c3f3011a3d 100644
--- a/share/pmacc/examples/gameOfLife2D/submit/bash/bash_mpiexec.tpl
+++ b/share/pmacc/examples/gameOfLife2D/submit/bash/bash_mpiexec.tpl
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 #
-# Copyright 2013-2020 Rene Widera, Axel Huebl
+# Copyright 2013-2021 Rene Widera, Axel Huebl
 #
 # This file is part of PMacc.
 #
diff --git a/share/pmacc/examples/gameOfLife2D/submit/bash/bash_mpirun.tpl b/share/pmacc/examples/gameOfLife2D/submit/bash/bash_mpirun.tpl
index 224cf6ad93..6e62bf7c08 100644
--- a/share/pmacc/examples/gameOfLife2D/submit/bash/bash_mpirun.tpl
+++ b/share/pmacc/examples/gameOfLife2D/submit/bash/bash_mpirun.tpl
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 #
-# Copyright 2013-2020 Rene Widera, Axel Huebl
+# Copyright 2013-2021 Rene Widera, Axel Huebl
 #
 # This file is part of PMacc.
 #
diff --git a/src/tools/bin/BinEnergyPlot.sh b/src/tools/bin/BinEnergyPlot.sh
index 3c9333ddba..ed355740d6 100755
--- a/src/tools/bin/BinEnergyPlot.sh
+++ b/src/tools/bin/BinEnergyPlot.sh
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 #
-# Copyright 2013-2020 Axel Huebl, Rene Widera, Richard Pausch
+# Copyright 2013-2021 Axel Huebl, Rene Widera, Richard Pausch
 #
 # This file is part of PIConGPU.
 #
diff --git a/src/tools/bin/addLicense b/src/tools/bin/addLicense
index 7b7e1faec2..48cc9ce611 100755
--- a/src/tools/bin/addLicense
+++ b/src/tools/bin/addLicense
@@ -1,4 +1,4 @@
-# Copyright 2013-2020 Axel Huebl, Rene Widera
+# Copyright 2013-2021 Axel Huebl, Rene Widera
 #
 # This file is part of PIConGPU.
 #
diff --git a/src/tools/bin/create.sh b/src/tools/bin/create.sh
index 0902891dc7..c0fadea8cb 100755
--- a/src/tools/bin/create.sh
+++ b/src/tools/bin/create.sh
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 #
-# Copyright 2013-2020 Axel Huebl, Rene Widera
+# Copyright 2013-2021 Axel Huebl, Rene Widera
 #
 # This file is part of PIConGPU.
 #
diff --git a/src/tools/bin/findAndDo b/src/tools/bin/findAndDo
index dfb71e54a4..3e9c26ffbc 100755
--- a/src/tools/bin/findAndDo
+++ b/src/tools/bin/findAndDo
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 #
-# Copyright 2013-2020 Axel Huebl, Rene Widera
+# Copyright 2013-2021 Axel Huebl, Rene Widera
 #
 # This file is part of PIConGPU.
 #
@@ -23,7 +23,7 @@
 # $2 = filename pattern
 # $3 = programm to call: programmName filename
 
-#example call: for i in `echo "*.def *.h *.cpp *.cu *.hpp *.tpp *.kernel *.loader *.param *.unitless"` ; do findAndDo include/pmacc/ "$i" deleteHeadComment ; done
+#example call: for i in "*.def" "*.h" "*.cpp" "*.cu" "*.hpp" "*.tpp" "*.kernel" "*.loader" "*.param" "*.unitless" ; do findAndDo include/pmacc/ "$i" deleteHeadComment ; done
 
 find $1 -name "$2" -type f | grep -v "\.svn" | grep -v "\.git" | \
 xargs -n1 -P8 -I{} $3 {}
diff --git a/src/tools/bin/newVersion.sh b/src/tools/bin/newVersion.sh
index b09481494d..5f543c62c0 100755
--- a/src/tools/bin/newVersion.sh
+++ b/src/tools/bin/newVersion.sh
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 #
-# Copyright 2017-2020 Axel Huebl
+# Copyright 2017-2021 Axel Huebl
 #
 # This file is part of PIConGPU.
 #
@@ -131,16 +131,16 @@ sed -i 's/'\
 sed -i 's/'\
 'picongpu@[0-9]\+\.[0-9]\+\.[0-9]\+\(-.\+\)*/'\
 'picongpu@'$VERSION_STR'/g' \
-    $REPO_DIR/share/picongpu/dockerfiles/ubuntu-1604/Dockerfile
+    $REPO_DIR/share/picongpu/dockerfiles/ubuntu-2004/Dockerfile
 
 sed -i 's/'\
 '\/picongpu:[0-9]\+\.[0-9]\+\.[0-9]\+\(-.\+\)*/'\
 '\/picongpu:'$VERSION_STR'/g' \
-    $REPO_DIR/share/picongpu/dockerfiles/ubuntu-1604/Singularity
+    $REPO_DIR/share/picongpu/dockerfiles/ubuntu-2004/Singularity
 sed -i 's/'\
 'Version [0-9]\+\.[0-9]\+\.[0-9]\+\(-.\+\)*/'\
 'Version '$VERSION_STR'/g' \
-    $REPO_DIR/share/picongpu/dockerfiles/ubuntu-1604/Singularity
+    $REPO_DIR/share/picongpu/dockerfiles/ubuntu-2004/Singularity
 
 # @todo `project(...)` version in CMakeLists.txt (future)
 
diff --git a/src/tools/bin/nextstep_from_period.sh b/src/tools/bin/nextstep_from_period.sh
index 6da5cd0a74..52c4bffc76 100755
--- a/src/tools/bin/nextstep_from_period.sh
+++ b/src/tools/bin/nextstep_from_period.sh
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 #
-# Copyright 2017-2020 Axel Huebl, Ilja Goethel
+# Copyright 2017-2021 Axel Huebl, Ilja Goethel
 #
 # This file is part of PIConGPU.
 #
diff --git a/src/tools/bin/pic2xdmf.py b/src/tools/bin/pic2xdmf.py
index 24eca75f55..398e9a9fb2 100755
--- a/src/tools/bin/pic2xdmf.py
+++ b/src/tools/bin/pic2xdmf.py
@@ -1,7 +1,7 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
 #
-# Copyright 2014-2020 Felix Schmitt, Conrad Schumann
+# Copyright 2014-2021 Felix Schmitt, Conrad Schumann
 #
 # This file is part of PIConGPU.
 #
diff --git a/src/tools/bin/plotIntensity b/src/tools/bin/plotIntensity
index c8b5c53edf..b9231409a0 100755
--- a/src/tools/bin/plotIntensity
+++ b/src/tools/bin/plotIntensity
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 #
-# Copyright 2013-2020 Axel Huebl, Rene Widera
+# Copyright 2013-2021 Axel Huebl, Rene Widera
 #
 # This file is part of PIConGPU.
 #
diff --git a/src/tools/bin/plotNumericalHeating b/src/tools/bin/plotNumericalHeating
index 5064aef21c..eef43b3208 100755
--- a/src/tools/bin/plotNumericalHeating
+++ b/src/tools/bin/plotNumericalHeating
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 #
-# Copyright 2015-2020 Richard Pausch
+# Copyright 2015-2021 Richard Pausch
 #
 # This file is part of PIConGPU.
 #
@@ -119,7 +119,7 @@ for sim in directories:
     mydir = sim+simDir
     # get relevant files with energy
     files = [f for f in os.listdir(mydir)
-             if os.path.isfile(os.path.join(mydir, f)) and (re.search('^.*_energy_all.dat', f) or re.search('^.fields_energy.dat', f))]
+             if os.path.isfile(os.path.join(mydir, f)) and (re.search('^.*_energy_all.dat', f) or re.search('^fields_energy.dat', f))]
     # check if file list is empty
     if len(files) == 0:
         sys.exit("There were no energy files in \"{}\".".format(mydir))
diff --git a/src/tools/bin/plotRadiation b/src/tools/bin/plotRadiation
index 2a45df727e..f940a6ee01 100755
--- a/src/tools/bin/plotRadiation
+++ b/src/tools/bin/plotRadiation
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 #
-# Copyright 2013-2020 Richard Pausch
+# Copyright 2013-2021 Richard Pausch
 #
 # This file is part of PIConGPU.
 #
diff --git a/src/tools/bin/plotSumEnergyRange b/src/tools/bin/plotSumEnergyRange
index baea76f133..89c5c46cf1 100755
--- a/src/tools/bin/plotSumEnergyRange
+++ b/src/tools/bin/plotSumEnergyRange
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 #
-# Copyright 2013-2020 Axel Huebl, Rene Widera
+# Copyright 2013-2021 Axel Huebl, Rene Widera
 #
 # This file is part of PIConGPU.
 #
diff --git a/src/tools/bin/plot_chargeConservation.py b/src/tools/bin/plot_chargeConservation.py
index 1ceed9965a..db21828e84 100755
--- a/src/tools/bin/plot_chargeConservation.py
+++ b/src/tools/bin/plot_chargeConservation.py
@@ -1,6 +1,6 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
 #
-# Copyright 2015-2020 Richard Pausch
+# Copyright 2015-2021 Richard Pausch
 #
 # This file is part of PIConGPU.
 #
@@ -20,14 +20,13 @@
 #
 
 import argparse
-import os
 import numpy as np
-import h5py
 import matplotlib.pyplot as plt
+import openpmd_api as io
 
 __doc__ = '''
 This program reads electric field and charge density data
-from hdf5 files created by PIConGPU and checks charge conservation
+from openPMD files created by PIConGPU and checks charge conservation
 for the Yee scheme.
 
 Three slice plots show the error in $div(E) - rho/epsilon_0$
@@ -52,55 +51,64 @@ def set_colorbar(cb):
         t.set_fontsize(16)
 
 
-def plotError(h5file, slice_pos=[0.5, 0.5, 0.5]):
+def plotError(file_pattern, slice_pos=[0.5, 0.5, 0.5], timestep=-1):
     """
-    read field data from hdf5 files
+    read field data from an openPMD file
     compute div(E) - rho/epsilon_0
     plot slices through simulation volume
 
     Parameters:
-    h5file: file name
-        file name to hdf5 data set from PIConGPU
+    file_pattern: string
+        openPMD file name or file series pattern, e.g. simData_%T.bp
 
     slice_pos: list of floats
         list of 3 floats to define slice position [0, 1]
         Default=[0.5, 0.5, 0.5]
+
+    timestep: int
+        simulation step (iteration) to read from the series;
+        -1 (default) selects the last iteration listed by the series
     """
-    # load hdf5 file
-    f = h5py.File(h5file, "r")
+    # load file
+    series = io.Series(file_pattern, io.Access.read_only)
+
+    # read time step
+    if timestep == -1:
+        *_, timestep = series.iterations
 
-    # read time step (python 2 and 3 save)
-    timestep = -1
-    for i in f['/data'].keys():
-        timestep = i
+    f = series.iterations[timestep]
 
     # load physics constants and simulation parameters
-    EPS0 = f["/data/{}".format(timestep)].attrs["eps0"]
-    CELL_WIDTH = f["/data/{}".format(timestep)].attrs["cell_width"]
-    CELL_HEIGHT = f["/data/{}".format(timestep)].attrs["cell_height"]
-    CELL_DEPTH = f["/data/{}".format(timestep)].attrs["cell_depth"]
+    EPS0 = f.get_attribute("eps0")
+    CELL_WIDTH = f.get_attribute("cell_width")
+    CELL_HEIGHT = f.get_attribute("cell_height")
+    CELL_DEPTH = f.get_attribute("cell_depth")
 
     # load electric field
-    Ex = np.array(f["/data/{}/fields/E/x".format(timestep)])
-    Ey = np.array(f["/data/{}/fields/E/y".format(timestep)])
-    Ez = np.array(f["/data/{}/fields/E/z".format(timestep)])
+    Ex = f.meshes["E"]["x"][:]
+    Ey = f.meshes["E"]["y"][:]
+    Ez = f.meshes["E"]["z"][:]
+
+    series.flush()
 
     # load and add charge density
     charge = np.zeros_like(Ex)
     norm = 0.0
-    for field_name in f["/data/{}/fields/".format(timestep)].keys():
-        if field_name[-14:] == "_chargeDensity":
+
+    for fieldName in f.meshes:
+        search_pattern = "_chargeDensity"
+        if fieldName[-len(search_pattern):] == search_pattern:
             # load species density
-            species_Density = np.array(
-                f["/data/{}/fields/".format(timestep) + field_name]
-            )
+            species_Density = \
+                f.meshes[fieldName][io.Mesh_Record_Component.SCALAR][:]
+            series.flush()
             # choose norm to be the maximal charge density of all species
             norm = np.max([norm, np.amax(np.abs(species_Density))])
             # add charge density to total charge density
             charge += species_Density
 
-    # close hdf5 file
-    f.close()
+    # close file
+    del series
 
     # compute divergence of electric field according to Yee scheme
     div = ((Ex[1:, 1:, 1:] - Ex[1:, 1:, :-1]) / CELL_WIDTH +
@@ -116,7 +124,7 @@ def plotError(h5file, slice_pos=[0.5, 0.5, 0.5]):
     plt.figure(figsize=(14, 5))
 
     plt.subplot(131)
-    slice_cell_z = np.int(np.floor((diff.shape[0]-1) * slice_pos[0]))
+    slice_cell_z = np.int(np.floor((diff.shape[0] - 1) * slice_pos[0]))
     plt.title("slice in z at {}".format(slice_cell_z), fontsize=20)
     plt.imshow(diff[slice_cell_z, :, :],
                vmin=-limit, vmax=+limit,
@@ -132,7 +140,7 @@ def plotError(h5file, slice_pos=[0.5, 0.5, 0.5]):
                  )
 
     plt.subplot(132)
-    slice_cell_y = np.int(np.floor((diff.shape[1]-1) * slice_pos[1]))
+    slice_cell_y = np.int(np.floor((diff.shape[1] - 1) * slice_pos[1]))
     plt.title("slice in y at {}".format(slice_cell_y), fontsize=20)
     plt.imshow(diff[:, slice_cell_y, :],
                vmin=-limit, vmax=+limit,
@@ -148,7 +156,7 @@ def plotError(h5file, slice_pos=[0.5, 0.5, 0.5]):
                  )
 
     plt.subplot(133)
-    slice_cell_x = np.int(np.floor((diff.shape[2]-1) * slice_pos[2]))
+    slice_cell_x = np.int(np.floor((diff.shape[2] - 1) * slice_pos[2]))
     plt.title("slice in x at {}".format(slice_cell_x), fontsize=20)
     plt.imshow(diff[:, :, slice_cell_x],
                vmin=-limit, vmax=+limit,
@@ -176,15 +184,24 @@ def plotError(h5file, slice_pos=[0.5, 0.5, 0.5]):
     parser = argparse.ArgumentParser(
         description=__doc__,
         epilog='For further questions please contact Richard Pausch.'
-        )
+    )
 
-    parser.add_argument(metavar="hdf5 file",
-                        dest="h5file_name",
-                        help='hdf5 file with PIConGPU data',
+    parser.add_argument(metavar="openPMD file name",
+                        dest="filename",
+                        help='openPMD file or series pattern '
+                             'with PIConGPU data',
                         action='store',
                         type=str)
 
-    parser.add_argument("--x",
+    parser.add_argument("-t",
+                        dest="selected_timestep",
+                        help='simulation step used if file is an '
+                             'openPMD file series pattern e.g. simData_%%T.bp',
+                        action='store',
+                        default=-1,
+                        type=int)
+
+    parser.add_argument("-x",
                         dest="x_split",
                         action='store',
                         default=0.5,
@@ -192,7 +209,7 @@ def plotError(h5file, slice_pos=[0.5, 0.5, 0.5]):
                         help='float value between [0,1] to set slice ' +
                              'position in x (default = 0.5)')
 
-    parser.add_argument("--y",
+    parser.add_argument("-y",
                         dest="y_split",
                         action='store',
                         default=0.5,
@@ -200,7 +217,7 @@ def plotError(h5file, slice_pos=[0.5, 0.5, 0.5]):
                         help='float value between [0,1] to set slice ' +
                              'position in y (default = 0.5)')
 
-    parser.add_argument("--z",
+    parser.add_argument("-z",
                         dest="z_split",
                         action='store',
                         default=0.5,
@@ -222,7 +239,5 @@ def plotError(h5file, slice_pos=[0.5, 0.5, 0.5]):
                          args.x_split],
                         0, 1)
 
-    if os.path.isfile(args.h5file_name):
-        plotError(args.h5file_name, slice_pos=slice_pos)
-    else:
-        print("ERROR: {} is not a file".format(args.h5file_name))
+    plotError(args.filename, slice_pos=slice_pos,
+              timestep=args.selected_timestep)
diff --git a/src/tools/bin/plot_chargeConservation_overTime.py b/src/tools/bin/plot_chargeConservation_overTime.py
index 03471e9d8c..6f9225809a 100755
--- a/src/tools/bin/plot_chargeConservation_overTime.py
+++ b/src/tools/bin/plot_chargeConservation_overTime.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 #
-# Copyright 2015-2020 Richard Pausch, Axel Huebl
+# Copyright 2015-2021 Richard Pausch, Axel Huebl, Rene Widera
 #
 # This file is part of PIConGPU.
 #
@@ -20,20 +20,17 @@
 #
 
 # import system interface modules
-import os
-import re
-import sys
 import argparse
 
 # import data analysis and plotting modules
 import numpy as np
-import h5py
 import matplotlib.pyplot as plt
 from matplotlib.ticker import LinearLocator, FormatStrFormatter
+import openpmd_api as io
 
 __doc__ = """
 This program reads electric field and charge density data
-from all hdf5 files created by a PIConGPU simulation and
+from all openPMD files created by a PIConGPU simulation and
 plots a variety of values to check charge conservation
 over time.
 
@@ -41,74 +38,47 @@
 normalized to the maximum [per-species] charge in the first
 simulation time step.
 
-Developer: Richard Pausch
+Developer: Richard Pausch, Rene Widera
 """
 
 
-def get_list_of_hdf5_files(base_directory):
+def deviation_charge_conservation(series, iteration):
     """
-    Returns a list of hdf5 files (`*_<step>.h5`)
-    listed in sub-directory `simOutput/h5/`
-
-    Parameters:
-    base_directory: string
-        directory path where to find simOutput/h5/
-
-    Return:
-    list of strings with hdf5 file names found
-    """
-    h5_list = []  # empty list for hdf5 files
-    h5_dir = base_directory + "/simOutput/h5/"
-    if not os.path.isdir(h5_dir):
-        raise Exception(("Error: {} does not contain" +
-                         " a simOutput/h5/ directory").format(directory))
-
-    for filename in os.listdir(h5_dir):
-        if os.path.isfile(h5_dir+filename):
-            if re.search(r".+_[0-9]+\.h5", filename):
-                h5_list.append(h5_dir + filename)
-    return h5_list
-
-
-def deviation_charge_conservation(h5file):
-    """
-    read field data from hdf5 files
+    read field data from openPMD file
     compute d = div(E)*epsilon_0 - rho
 
     Parameters:
-    h5file: file name
-        file name and path to hdf5 data from PIConGPU
+    series: openPMD series object
+        opened series `iteration` belongs to; flushed to load the data
+
+    iteration:
+        openPMD iteration object
 
     Return:
-    list of floats: [timestep, max(abs(d)),
-        mean(abs(d)), std(d), norm]
+    list of floats: [max(abs(d)), mean(abs(d)), std(d), norm]
     """
-    # load hdf5 file
-    f = h5py.File(h5file, "r")
-
-    # read time step (python 2 and 3 save)
-    timestep = -1
-    for i in f["/data"].keys():
-        timestep = i
 
     # load physics constants and simulation parameters
-    EPS0 = f["/data/{}".format(timestep)].attrs["eps0"]
+    EPS0 = iteration.get_attribute("eps0")
     is2D = False
 
     # load electric field
-    Ex = np.array(f["/data/{}/fields/E/x".format(timestep)])
-    Ey = np.array(f["/data/{}/fields/E/y".format(timestep)])
-    Ez = np.array(f["/data/{}/fields/E/z".format(timestep)])
+    Ex = iteration.meshes["E"]["x"][:]
+    Ey = iteration.meshes["E"]["y"][:]
+    Ez = iteration.meshes["E"]["z"][:]
+
+    series.flush()
 
     # load and add charge density
     charge = np.zeros_like(Ex)
     norm = 0.0
-    for field_name in f["/data/{}/fields/".format(timestep)].keys():
-        if field_name[-14:] == "_chargeDensity":
+    for fieldName in iteration.meshes:
+        if fieldName[-14:] == "_chargeDensity":
+            # charge density is stored as a scalar mesh record
             # load species density
-            species_Density_pointer = f["/data/{}/fields/".format(timestep) +
-                                        field_name]
-            species_Density = np.array(species_Density_pointer)
+            species_Density = \
+                iteration.meshes[fieldName][io.Mesh_Record_Component.SCALAR][:]
+            series.flush()
             # choose norm to be the maximal charge density of all species
             norm = np.max([norm, np.amax(np.abs(species_Density))])
             # add charge density to total charge density
@@ -119,16 +89,16 @@ def deviation_charge_conservation(h5file):
             # a 2D simulation, the size of the z or [2]-component is 1, which
             # is <2. The code changes the 2D3D flag if one Density data set is
             # 2D.
-            if species_Density_pointer.attrs['_size'][2] < 2:
+            if species_Density.ndim == 2:
                 is2D = True
 
     # load cell size and compute cell volume
-    CELL_WIDTH = f["/data/{}".format(timestep)].attrs["cell_width"]
-    CELL_HEIGHT = f["/data/{}".format(timestep)].attrs["cell_height"]
-    CELL_DEPTH = f["/data/{}".format(timestep)].attrs["cell_depth"]
+    CELL_WIDTH = iteration.get_attribute("cell_width")
+    CELL_HEIGHT = iteration.get_attribute("cell_height")
+    CELL_DEPTH = iteration.get_attribute("cell_depth")
 
-    # close hdf5 file
-    f.close()
+    # close iteration
+    iteration.close()
 
     if is2D:
         # compute divergence of electric field according to Yee scheme
@@ -149,7 +119,7 @@ def deviation_charge_conservation(h5file):
         # density
         diff = (div * EPS0 - charge[1:, 1:, 1:])
 
-    return float(timestep), np.amax(np.abs(diff)), np.mean(np.abs(diff)), \
+    return np.amax(np.abs(diff)), np.mean(np.abs(diff)), \
         np.std(diff), norm
 
 
@@ -160,13 +130,29 @@ def deviation_charge_conservation(h5file):
     parser = argparse.ArgumentParser(
         description=__doc__,
         epilog="For further questions please contact Richard Pausch."
-        )
+    )
+
+    parser.add_argument("--start",
+                        dest="start_timestep",
+                        help='first timstep',
+                        action='store',
+                        default=0,
+                        type=int)
+
+    parser.add_argument("--last",
+                        dest="last_timestep",
+                        help='last timstep',
+                        action='store',
+                        default=-1,
+                        type=int)
 
     parser.add_argument(metavar="simulation directories",
-                        dest="directories",
-                        help="simulation base directories",
+                        dest="file_pattern",
+                        help="openPMD series pattern with PIConGPU "
+                             "data e.g. simData_%%T.bp",
                         action="store",
-                        nargs="+")
+                        nargs="+"
+                        )
 
     parser.add_argument("--export",
                         metavar="file name",
@@ -176,7 +162,7 @@ def deviation_charge_conservation(h5file):
                              "(disable interactive window)")
 
     args = parser.parse_args()
-    directories = args.directories
+    file_patterns = args.file_pattern
 
     # prepare plot of data
     plt.figure(figsize=(10, 5))
@@ -214,35 +200,26 @@ def deviation_charge_conservation(h5file):
     # underscore labels)
     sim_dir_counter = 1
 
-    for directory in directories:
-        # do the data reading and catch errors
-        try:
-            # test if directory is a directory
-            if not os.path.isdir(directory):
-                raise Exception("Error: {} is not a directory".format(
-                                directory))
-
-            # check if any hdf5 files were found
-            h5_file_list = get_list_of_hdf5_files(directory)
-            if len(h5_file_list) == 0:
-                raise Exception("No hdf5 files found in {}".format(
-                                directory + "simOutput/h5/"))
-
-        except Exception as error_msg:
-            print("{}".format(error_msg))
-            sys.exit(1)
-
-        # collect data from all found hdf5 files
+    for pattern in file_patterns:
+        series = io.Series(pattern, io.Access.read_only)
+
+        first_step = args.start_timestep
+        last_step = args.last_timestep
+
         collect_results = None
-        print("Read files:")
-        for f in h5_file_list:
-            print(f)
-            t, cc_max, mean_abs, std, norm = deviation_charge_conservation(f)
-            data_tmp = np.array([[t, cc_max, mean_abs, std, norm]])
-            if collect_results is None:
-                collect_results = data_tmp
-            else:
-                collect_results = np.append(collect_results, data_tmp, axis=0)
+
+        for iteration in series.iterations:
+            if (iteration >= first_step and
+                    (iteration <= last_step or last_step == -1)):
+                print("load iteration {:d}".format(iteration))
+                cc_max, mean_abs, std, norm = deviation_charge_conservation(
+                    series, series.iterations[iteration])
+                data_tmp = np.array([[iteration, cc_max, mean_abs, std, norm]])
+                if collect_results is None:
+                    collect_results = data_tmp
+                else:
+                    collect_results = np.append(
+                        collect_results, data_tmp, axis=0)
 
         # sort data temporally
         collect_results = np.sort(collect_results, axis=0)
@@ -255,19 +232,20 @@ def deviation_charge_conservation(h5file):
         norm = collect_results[0, 4]  # first (t=0) norm
 
         # generate plot label based on directory and avoid underscore bug
-        plot_label = ("{:d}. ".format(sim_dir_counter) +
-                      os.path.normpath(directory).split("/")[-1])
+        plot_label = ("{:s}".format(pattern))
         sim_dir_counter += 1
 
         # add plot for maximum difference
-        ax1.plot(t, max_diff/norm,
+        ax1.plot(t, max_diff / norm,
                  linestyle="-", lw=3,
                  marker="+", ms=15, markeredgewidth=3,
                  label=plot_label)
 
         # add plot for mean difference and std
-        ax2.errorbar(t, mean_abs/norm, yerr=std/norm, lw=3, markeredgewidth=3,
-                     label=plot_label)
+        ax2.errorbar(t, mean_abs / norm, yerr=std / norm, lw=3,
+                     markeredgewidth=3, label=plot_label)
+
+        del series
 
     # finish plots
     ax1.legend(loc=0)
diff --git a/src/tools/bin/png2video.sh b/src/tools/bin/png2video.sh
index 18be27f3c2..f970fd8427 100755
--- a/src/tools/bin/png2video.sh
+++ b/src/tools/bin/png2video.sh
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 #
-# Copyright 2013-2020 Rene Widera
+# Copyright 2013-2021 Rene Widera
 #
 # This file is part of PIConGPU.
 #
diff --git a/src/tools/bin/position2Trace.sh b/src/tools/bin/position2Trace.sh
index e50e2957f9..63d7398f79 100755
--- a/src/tools/bin/position2Trace.sh
+++ b/src/tools/bin/position2Trace.sh
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 #
-# Copyright 2013-2020 Rene Widera, Richard Pausch
+# Copyright 2013-2021 Rene Widera, Richard Pausch
 #
 # This file is part of PIConGPU.
 #
diff --git a/src/tools/bin/printField.py b/src/tools/bin/printField.py
index bcab59a55e..75ee79f5b2 100755
--- a/src/tools/bin/printField.py
+++ b/src/tools/bin/printField.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 #
-# Copyright 2013-2020 Richard Pausch
+# Copyright 2013-2021 Richard Pausch
 #
 # This file is part of PIConGPU.
 #
diff --git a/src/tools/bin/radiationSyntheticDetector b/src/tools/bin/radiationSyntheticDetector
index cdf60d8d29..8a2221e520 100755
--- a/src/tools/bin/radiationSyntheticDetector
+++ b/src/tools/bin/radiationSyntheticDetector
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 #
-# Copyright 2013-2020 Richard Pausch
+# Copyright 2013-2021 Richard Pausch
 #
 # This file is part of PIConGPU.
 #
diff --git a/src/tools/bin/smooth.py b/src/tools/bin/smooth.py
index 84737b1bc3..d579417286 100644
--- a/src/tools/bin/smooth.py
+++ b/src/tools/bin/smooth.py
@@ -1,5 +1,5 @@
 #
-# Copyright 2013-2020 Richard Pausch
+# Copyright 2013-2021 Richard Pausch
 #
 # This file is part of PIConGPU.
 #
diff --git a/src/tools/bin/splash2vtk.sh b/src/tools/bin/splash2vtk.sh
index 41c68b612d..499c31e2be 100755
--- a/src/tools/bin/splash2vtk.sh
+++ b/src/tools/bin/splash2vtk.sh
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 #
-# Copyright 2013-2020 Axel Huebl
+# Copyright 2013-2021 Axel Huebl
 #
 # This file is part of PIConGPU.
 #
diff --git a/src/tools/bin/transpose b/src/tools/bin/transpose
index 48e11696bf..aba8741c61 100755
--- a/src/tools/bin/transpose
+++ b/src/tools/bin/transpose
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 #
-# Copyright 2013-2020 Rene Widera
+# Copyright 2013-2021 Rene Widera
 #
 # This file is part of PIConGPU.
 #
diff --git a/src/tools/bin/uncrustifyMyCode b/src/tools/bin/uncrustifyMyCode
index 957045f920..5dc1b664dc 100755
--- a/src/tools/bin/uncrustifyMyCode
+++ b/src/tools/bin/uncrustifyMyCode
@@ -1,5 +1,5 @@
 #!/usr/bin/env bash
-# Copyright 2015-2020 Rene Widera
+# Copyright 2015-2021 Rene Widera
 #
 # This file is part of PIConGPU.
 #
diff --git a/src/tools/png2gas/CMakeLists.txt b/src/tools/png2gas/CMakeLists.txt
index 180255cf49..40a4dbe129 100644
--- a/src/tools/png2gas/CMakeLists.txt
+++ b/src/tools/png2gas/CMakeLists.txt
@@ -1,5 +1,5 @@
 #
-# Copyright 2014-2020 Axel Huebl, Benjamin Schneider, Felix Schmitt, Heiko Burau, Rene Widera
+# Copyright 2014-2021 Axel Huebl, Benjamin Schneider, Felix Schmitt, Heiko Burau, Rene Widera
 #
 # This file is part of PIConGPU.
 #
@@ -22,7 +22,7 @@
 # Required cmake version
 ################################################################################
 
-cmake_minimum_required(VERSION 3.11.4)
+cmake_minimum_required(VERSION 3.15.0)
 
 
 ################################################################################
@@ -68,10 +68,10 @@ endif()
 # Language Flags
 ###############################################################################
 
-# enforce C++11
+# enforce C++14
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
 set(CMAKE_CXX_EXTENSIONS OFF)
-set(CMAKE_CXX_STANDARD 11)
+set(CMAKE_CXX_STANDARD 14)
 
 
 ################################################################################
diff --git a/src/tools/png2gas/png2gas.cpp b/src/tools/png2gas/png2gas.cpp
index 7faaedb792..003264b80d 100644
--- a/src/tools/png2gas/png2gas.cpp
+++ b/src/tools/png2gas/png2gas.cpp
@@ -1,4 +1,4 @@
-/* Copyright 2014-2020 Felix Schmitt, Axel Huebl, Richard Pausch
+/* Copyright 2014-2021 Felix Schmitt, Axel Huebl, Richard Pausch
  *
  * This file is part of PIConGPU.
  *
diff --git a/src/tools/share/awk/BinEnergyPlot.awk b/src/tools/share/awk/BinEnergyPlot.awk
index 3b7c9f6b4c..85af25d60f 100644
--- a/src/tools/share/awk/BinEnergyPlot.awk
+++ b/src/tools/share/awk/BinEnergyPlot.awk
@@ -1,5 +1,5 @@
 #
-# Copyright 2013-2020 Rene Widera
+# Copyright 2013-2021 Rene Widera
 #
 # This file is part of PIConGPU.
 #
diff --git a/src/tools/share/awk/SumEnergyRange.awk b/src/tools/share/awk/SumEnergyRange.awk
index f80fe8a473..3ae077ff7d 100644
--- a/src/tools/share/awk/SumEnergyRange.awk
+++ b/src/tools/share/awk/SumEnergyRange.awk
@@ -1,5 +1,5 @@
 #
-# Copyright 2013-2020 Rene Widera
+# Copyright 2013-2021 Rene Widera
 #
 # This file is part of PIConGPU.
 #
diff --git a/src/tools/share/gnuplot/BinEnergyPlot.gnuplot b/src/tools/share/gnuplot/BinEnergyPlot.gnuplot
index 0a3718c78b..38861cf073 100644
--- a/src/tools/share/gnuplot/BinEnergyPlot.gnuplot
+++ b/src/tools/share/gnuplot/BinEnergyPlot.gnuplot
@@ -1,4 +1,4 @@
-# Copyright 2013-2020 Axel Huebl, Richard Pausch
+# Copyright 2013-2021 Axel Huebl, Richard Pausch
 #
 # This file is part of PIConGPU.
 #
diff --git a/src/tools/splash2txt/CMakeLists.txt b/src/tools/splash2txt/CMakeLists.txt
index 64fdffd4f8..1a2d4ad715 100644
--- a/src/tools/splash2txt/CMakeLists.txt
+++ b/src/tools/splash2txt/CMakeLists.txt
@@ -1,4 +1,4 @@
-# Copyright 2013-2020 Felix Schmitt, Axel Huebl, Rene Widera
+# Copyright 2013-2021 Felix Schmitt, Axel Huebl, Rene Widera
 #
 # This file is part of splash2txt.
 #
@@ -22,7 +22,7 @@
 # Required cmake version
 ################################################################################
 
-cmake_minimum_required(VERSION 3.11.4)
+cmake_minimum_required(VERSION 3.15.0)
 
 
 ################################################################################
@@ -62,10 +62,10 @@ endif()
 # Language Flags
 ###############################################################################
 
-# enforce C++11
+# enforce C++14
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
 set(CMAKE_CXX_EXTENSIONS OFF)
-set(CMAKE_CXX_STANDARD 11)
+set(CMAKE_CXX_STANDARD 14)
 
 
 ################################################################################
diff --git a/src/tools/splash2txt/include/ITools.hpp b/src/tools/splash2txt/include/ITools.hpp
index e469919c37..1f9a671575 100644
--- a/src/tools/splash2txt/include/ITools.hpp
+++ b/src/tools/splash2txt/include/ITools.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Felix Schmitt
+/* Copyright 2013-2021 Felix Schmitt
  *
  * This file is part of splash2txt.
  *
diff --git a/src/tools/splash2txt/include/splash2txt.hpp b/src/tools/splash2txt/include/splash2txt.hpp
index 1f184af073..8c91084377 100644
--- a/src/tools/splash2txt/include/splash2txt.hpp
+++ b/src/tools/splash2txt/include/splash2txt.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Felix Schmitt, Axel Huebl, Rene Widera
+/* Copyright 2013-2021 Felix Schmitt, Axel Huebl, Rene Widera
  *
  * This file is part of splash2txt.
  *
diff --git a/src/tools/splash2txt/include/tools_adios_parallel.hpp b/src/tools/splash2txt/include/tools_adios_parallel.hpp
index b7d3b96365..f55e9b67af 100644
--- a/src/tools/splash2txt/include/tools_adios_parallel.hpp
+++ b/src/tools/splash2txt/include/tools_adios_parallel.hpp
@@ -1,5 +1,5 @@
 /*
- *Copyright 2014-2020 Felix Schmitt, Conrad Schumann
+ *Copyright 2014-2021 Felix Schmitt, Conrad Schumann
  *
  * This file is part of splash2txt.
  *
diff --git a/src/tools/splash2txt/include/tools_splash_parallel.hpp b/src/tools/splash2txt/include/tools_splash_parallel.hpp
index 5bbeab3139..b665b039da 100644
--- a/src/tools/splash2txt/include/tools_splash_parallel.hpp
+++ b/src/tools/splash2txt/include/tools_splash_parallel.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Felix Schmitt
+/* Copyright 2013-2021 Felix Schmitt
  *
  * This file is part of splash2txt.
  *
diff --git a/src/tools/splash2txt/splash2txt.cpp b/src/tools/splash2txt/splash2txt.cpp
index 2c5cbbd7ec..f5a883077f 100644
--- a/src/tools/splash2txt/splash2txt.cpp
+++ b/src/tools/splash2txt/splash2txt.cpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Felix Schmitt, Axel Huebl, Rene Widera,
+/* Copyright 2013-2021 Felix Schmitt, Axel Huebl, Rene Widera,
  *                     Alexander Grund
  *
  * This file is part of splash2txt.
diff --git a/src/tools/splash2txt/tools_adios_parallel.cpp b/src/tools/splash2txt/tools_adios_parallel.cpp
index c18e2da1d5..1fb08fecf1 100644
--- a/src/tools/splash2txt/tools_adios_parallel.cpp
+++ b/src/tools/splash2txt/tools_adios_parallel.cpp
@@ -1,4 +1,4 @@
-/* Copyright 2014-2020 Felix Schmitt, Conrad Schumann, Axel Huebl
+/* Copyright 2014-2021 Felix Schmitt, Conrad Schumann, Axel Huebl
  *
  * This file is part of splash2txt.
  *
diff --git a/src/tools/splash2txt/tools_splash_parallel.cpp b/src/tools/splash2txt/tools_splash_parallel.cpp
index d3e7fe02fc..21a3bef50e 100644
--- a/src/tools/splash2txt/tools_splash_parallel.cpp
+++ b/src/tools/splash2txt/tools_splash_parallel.cpp
@@ -1,4 +1,4 @@
-/* Copyright 2013-2020 Felix Schmitt, Axel Huebl, Rene Widera
+/* Copyright 2013-2021 Felix Schmitt, Axel Huebl, Rene Widera
  *
  * This file is part of splash2txt.
  *
diff --git a/test/correctBranchPR b/test/correctBranchPR
index 29345834b9..38ead49cfc 100755
--- a/test/correctBranchPR
+++ b/test/correctBranchPR
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 #
-# Copyright 2017-2020 Axel Huebl
+# Copyright 2017-2021 Axel Huebl
 #
 # This file is part of PIConGPU.
 #
@@ -22,55 +22,71 @@
 # Disallow PRs to `ComputationalRadiationPhysics/picongpu` branch `master`
 # if not an other mainline branch such as `dev` or `release-...`
 #
-# See: https://docs.travis-ci.com/user/environment-variables/
-#      https://developer.github.com/v3/pulls/#get-a-single-pull-request
-#
 # -> only enforced for `master` branch
 #    -> only enforced for mainline repo (not for forks)
 #
-# This file needs to be sourced in .travis.yml to work.
+# dependencies: curl, python3
 #
-# @result 0 if correct target (or not in travis CI for mainline), else 1
+# @result 0 if correct target, else 1
 #
 
-# Are we even in travis? Otherwise pass this test.
-if [ "$TRAVIS" != "true" ]
+#!/bin/bash
+
+set -e
+set -o pipefail
+
+cd $CI_PROJECT_DIR
+
+is_pr=$(echo "$CI_BUILD_REF_NAME" | grep -q "^pr-" && echo 0 || echo 1)
+# is_pr is 0 if the branch name starts with "pr-" (a pull request), otherwise 1
+
+mainline_slug="ComputationalRadiationPhysics/picongpu"
+
+# only enforced for PRs
+if [ $is_pr -eq 0 ]
 then
-    echo "Not in travis, so I have nothing to do :)"
-else
+    github_group_repo="ComputationalRadiationPhysics/picongpu"
 
-    mainline_slug="ComputationalRadiationPhysics/picongpu"
+    pr_id=$(echo "$CI_BUILD_REF_NAME" | cut -d"/" -f1 | cut -d"-" -f2)
+    # use a token without any permissions from psychocoderHPC to avoid the GitHub API rate limit
+    curl_data=$(curl -u psychocoderHPC:$GITHUB_TOKEN -X GET https://api.github.com/repos/${github_group_repo}/pulls/${pr_id} 2>/dev/null)
+    echo "--- curl data ---"
+    echo "$curl_data"
+    echo "-----------------"
+    # get the destination branch
+    destination_branch=$(echo "$curl_data" | python3 -c 'import json,sys;obj=json.loads(sys.stdin.read());print(obj["base"]["ref"])')
+    echo "destination_branch=${destination_branch}"
 
-    # only enforced for PRs
-    if [ "$TRAVIS_EVENT_TYPE" == "pull_request" ]
+    # only enforced for `master` branch
+    if [ "$destination_branch" == "master" ]
     then
-        # only enforced for `master` branch
-        if [ "$TRAVIS_BRANCH" == "master" ]
+        repo_slug=$(echo "$curl_data" | python3 -c 'import json,sys;obj=json.loads(sys.stdin.read());print(obj["base"]["repo"]["full_name"])')
+        echo "repo_slug=${repo_slug}"
+        # only enforced for mainline repo (not for forks)
+        if [ "$repo_slug" == "$mainline_slug" ]
         then
-            # only enforced for mainline repo (not for forks)
-            if [ "$TRAVIS_REPO_SLUG" == "$mainline_slug" ]
+            pull_request_slug=$(echo "$curl_data" | python3 -c 'import json,sys;obj=json.loads(sys.stdin.read());print(obj["head"]["repo"]["full_name"])')
+            echo "pull_request_slug=${pull_request_slug}"
+            # origin repo is not our mainline? so it's a PR from a fork!
+            if [ "$pull_request_slug" != "$mainline_slug" ]
             then
-                # origin repo is not our mainline? so it's a PR from a fork!
-                if [ "$TRAVIS_PULL_REQUEST_SLUG" != "$mainline_slug" ]
-                then
-                    # the PR came from a fork owned by the first part of the slug
-                    pr_author=$(echo "$TRAVIS_PULL_REQUEST_SLUG" | awk -F "/" '{print $1}')
-                    pr_branch=$TRAVIS_PULL_REQUEST_BRANCH
-                    echo ""
-                    echo "Pull request opened to wrong branch!"
-                    echo ""
-                    echo "New features need to go to our 'dev' branch but your"
-                    echo "pull-request from '"$TRAVIS_PULL_REQUEST_SLUG"' was"
-                    echo "sent to 'master' which is only updated by our"
-                    echo "maintainers for new stable releases."
-                    echo ""
-                    echo "Please re-open your pull-request against our 'dev' branch:"
-                    echo "  https://github.com/ComputationalRadiationPhysics/picongpu/compare/dev...$pr_author:$pr_branch?expand=1"
-                    echo ""
-                    echo "For further information, please see:"
-                    echo "  https://github.com/ComputationalRadiationPhysics/picongpu/blob/dev/CONTRIBUTING.md"
-                    exit 1
-                fi
+                # the PR came from a fork
+                pr_label=$(echo "$curl_data" | python3 -c 'import json,sys;obj=json.loads(sys.stdin.read());print(obj["head"]["label"])')
+                echo "pr_label=${pr_label}"
+                echo ""
+                echo "Pull request opened to wrong branch!"
+                echo ""
+                echo "New features need to go to our 'dev' branch but your"
+                echo "pull-request from '"$pull_request_slug"' was"
+                echo "sent to 'master' which is only updated by our"
+                echo "maintainers for new stable releases."
+                echo ""
+                echo "Please re-open your pull-request against our 'dev' branch:"
+                echo "  https://github.com/ComputationalRadiationPhysics/picongpu/compare/dev...$pr_label?expand=1"
+                echo ""
+                echo "For further information, please see:"
+                echo "  https://github.com/ComputationalRadiationPhysics/picongpu/blob/dev/CONTRIBUTING.md"
+                exit 1
             fi
         fi
     fi
diff --git a/test/hasCudaGlobalKeyword b/test/hasCudaGlobalKeyword
index cd1fe46a7c..3aa8e7a7ee 100755
--- a/test/hasCudaGlobalKeyword
+++ b/test/hasCudaGlobalKeyword
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 #
-# Copyright 2016-2020 Rene Widera
+# Copyright 2016-2021 Rene Widera
 #
 # This file is part of PIConGPU.
 #
diff --git a/test/hasEOLwhiteSpace b/test/hasEOLwhiteSpace
index e6483e1861..8c70c790ab 100755
--- a/test/hasEOLwhiteSpace
+++ b/test/hasEOLwhiteSpace
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 #
-# Copyright 2016-2020 Axel Huebl, Rene Widera
+# Copyright 2016-2021 Axel Huebl, Rene Widera
 #
 # This file is part of PIConGPU.
 #
@@ -33,7 +33,7 @@ files=()
 pattern="\.def$|\.h$|\.cpp$|\.cu$|\.hpp$|\.tpp$|\.kernel$|\.loader$|"\
 "\.param$|\.unitless$|\.sh$|\.bash$|\.cfg$|\.tpl$|\.conf$|"\
 "\.awk$|\.gnuplot$|\.cmake$|\.profile$|\.example$|\.py$|"\
-"cmakeFlags|CMakeLists\.txt|src/tools/bin"
+"cmakeFlags$|CMakeLists\.txt|src/tools/bin"
 
 for i in $(find . \
                 -not -path "./.git/*" \
diff --git a/test/hasExtLibIncludeBrackets b/test/hasExtLibIncludeBrackets
index a7df3bd55b..a9a566af95 100755
--- a/test/hasExtLibIncludeBrackets
+++ b/test/hasExtLibIncludeBrackets
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 #
-# Copyright 2016-2020 Axel Huebl, Rene Widera
+# Copyright 2016-2021 Axel Huebl, Rene Widera
 #
 # This file is part of PIConGPU.
 #
diff --git a/test/hasNonASCII b/test/hasNonASCII
index 6d1fea619d..0117ed3220 100755
--- a/test/hasNonASCII
+++ b/test/hasNonASCII
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 #
-# Copyright 2016-2020 Axel Huebl, Rene Widera
+# Copyright 2016-2021 Axel Huebl, Rene Widera
 #
 # This file is part of PIConGPU.
 #
diff --git a/test/hasSpaceBeforePrecompiler b/test/hasSpaceBeforePrecompiler
index 1a86c66d66..080f6c5cb8 100755
--- a/test/hasSpaceBeforePrecompiler
+++ b/test/hasSpaceBeforePrecompiler
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 #
-# Copyright 2016-2020 Axel Huebl
+# Copyright 2016-2021 Axel Huebl
 #
 # This file is part of PIConGPU.
 #
diff --git a/test/hasTabs b/test/hasTabs
index 9f3a4b5185..9f11234f2e 100755
--- a/test/hasTabs
+++ b/test/hasTabs
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 #
-# Copyright 2016-2020 Axel Huebl
+# Copyright 2016-2021 Axel Huebl
 #
 # This file is part of PIConGPU.
 #
diff --git a/thirdParty/alpaka/.dockerignore b/thirdParty/alpaka/.dockerignore
deleted file mode 100644
index 6b8710a711..0000000000
--- a/thirdParty/alpaka/.dockerignore
+++ /dev/null
@@ -1 +0,0 @@
-.git
diff --git a/thirdParty/alpaka/.gitignore b/thirdParty/alpaka/.gitignore
deleted file mode 100644
index 7e8b50b81b..0000000000
--- a/thirdParty/alpaka/.gitignore
+++ /dev/null
@@ -1,21 +0,0 @@
-/doc/doxygen/*
-!/doc/doxygen/Doxyfile
-!/doc/doxygen/alpaka_doxygen.png
-/doc/latex/*
-**/build
-
-# tmp files
-*~
-
-# netbeans project files
-/nbproject/
-
-# Code::Blocks project files
-/*.cbp
-/*.layout
-
-# original backup files
-*.orig
-
-# VIM project files
-.vimrc
diff --git a/thirdParty/alpaka/.travis.yml b/thirdParty/alpaka/.travis.yml
deleted file mode 100644
index a11d5a1ac3..0000000000
--- a/thirdParty/alpaka/.travis.yml
+++ /dev/null
@@ -1,382 +0,0 @@
-#
-# Copyright 2015-2019 Benjamin Worpitz, Erik Zenker
-#
-# This file is part of Alpaka.
-#
-# This Source Code Form is subject to the terms of the Mozilla Public
-# License, v. 2.0. If a copy of the MPL was not distributed with this
-# file, You can obtain one at http://mozilla.org/MPL/2.0/.
-#
-
-os: linux
-dist: xenial
-language: generic
-services:
-  - docker
-
-################################################################################
-# NOTE: Testing the full matrix is not practical.
-# Therefore we aim to have each value been set in at lest one job.
-# CXX                                           : {g++, clang++, cl.exe}
-#   [g++] ALPAKA_CI_GCC_VER                     : {4.9, 5, 6, 7, 8, 9}
-#   [clang++] ALPAKA_CI_CLANG_VER               : {4.0.0, 5.0.2, 6.0.1, 7.0.1, 8.0.0, 9.0.0}
-#   ALPAKA_CI_STDLIB                            : {libstdc++, [CXX==clang++]:libc++}
-#   [clang++] ALPAKA_CI_CLANG_LIBSTDCPP_VERSION : {5, 7}
-# CMAKE_BUILD_TYPE                              : {Debug, Release}
-# ALPAKA_CI                                     : {TRAVIS}
-# ALPAKA_CI_DOCKER_BASE_IMAGE_NAME              : {ubuntu:14.04, ubuntu:16.04, ubuntu:18.04}
-# ALPAKA_CI_BOOST_BRANCH                        : {[CXX!=cl.exe&&OS!=osx]:boost-1.62.0, [CXX!=cl.exe&&OS!=osx]:boost-1.63.0, [OS!=osx]boost-1.64.0, boost-1.65.1, boost-1.66.0, boost-1.67.0, boost-1.68.0, boost-1.69.0, boost-1.70.0, boost-1.71.0}
-# ALPAKA_CI_CMAKE_VER                           : {3.11.4, 3.12.4, 3.13.5, 3.14.7, 3.15.5, 3.16.0}
-# ALPAKA_CI_SANITIZERS                          : {ASan, UBsan, TSan}
-#    TSan is not currently used because it produces many unexpected errors
-# ALPAKA_CI_ANALYSIS                            : {ON, OFF}
-# ALPAKA_DEBUG                                  : {0, 1, 2}
-# ALPAKA_ACC_GPU_CUDA_ONLY_MODE                 : {ON, OFF}
-# ALPAKA_ACC_GPU_HIP_ONLY_MODE                  : {ON, OFF}
-# ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLE             : {ON, OFF}
-# ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLE         : {ON, OFF}
-# ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE          : {ON, OFF}
-# ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE            : {ON, OFF}
-#   [ON] OMP_NUM_THREADS                        : {1, 2, 3, 4}
-# ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE            : {ON, OFF}
-#   [ON] OMP_NUM_THREADS                        : {1, 2, 3, 4}
-# ALPAKA_ACC_CPU_BT_OMP4_ENABLE                 : {ON, OFF}
-#   [ON] OMP_NUM_THREADS                        : {1, 2, 3, 4}
-# ALPAKA_ACC_GPU_CUDA_ENABLE                    : {ON, OFF}
-#   [ON] ALPAKA_CUDA_VERSION                    : {8.0, 9.0, 9.1, 9.2, 10.0, 10.1, 10.2}
-#   [ON] ALPAKA_CUDA_COMPILER                   : {nvcc, [CXX==clang++]:clang}
-# ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLE             : {ON, OFF}
-# ALPAKA_ACC_GPU_HIP_ENABLE                     : {ON, OFF}
-#   [ON] ALPAKA_CI_HIP_BRANCH                   : {master}
-#   [ON] ALPAKA_HIP_PLATFORM                    : {nvcc}
-env:
-    global:
-        - ALPAKA_CI=TRAVIS
-        - ALPAKA_CI_DOCKER_IMAGE_NAME=alpaka_ubuntu
-        - ALPAKA_CI_DOCKER_CACHE_DIR=${HOME}/cache/docker
-        - ALPAKA_CI_DOCKER_CACHE_IMAGE_FILE_PATH=${ALPAKA_CI_DOCKER_CACHE_DIR}/${ALPAKA_CI_DOCKER_IMAGE_NAME}.tar.gz
-        - BOOST_ROOT=${HOME}/boost
-        - ALPAKA_CI_BOOST_LIB_DIR=${HOME}/boost_libs/
-        - ALPAKA_CI_CLANG_DIR=${HOME}/llvm
-        - ALPAKA_CI_CMAKE_DIR=${HOME}/CMake
-        - ALPAKA_CI_CUDA_DIR=${HOME}/CUDA
-        - ALPAKA_CI_HIP_ROOT_DIR=${HOME}/hip
-        - TBB_ROOT_DIR=${HOME}/tbb
-        - ALPAKA_CI_SANITIZERS=
-        - ALPAKA_CI_ANALYSIS=OFF
-        - ALPAKA_CI_CLANG_LIBSTDCPP_VERSION=5
-        - ALPAKA_ACC_GPU_CUDA_ENABLE=OFF
-        - ALPAKA_ACC_GPU_HIP_ENABLE=OFF
-
-matrix:
-    include:
-    ### Analysis builds
-    - name: nvcc-9.1 + gcc-4.9 Debug Analysis
-      env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:14.04 CXX=g++     CC=gcc   ALPAKA_CI_GCC_VER=4.9     CMAKE_BUILD_TYPE=Debug   ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.65.1 ALPAKA_CI_CMAKE_VER=3.11.4 ALPAKA_CI_ANALYSIS=ON  ALPAKA_DEBUG=2 ALPAKA_ACC_GPU_CUDA_ENABLE=ON  ALPAKA_CUDA_VERSION=9.1 ALPAKA_CUDA_COMPILER=nvcc
-    - name: gcc-8 Debug Analysis
-      env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:16.04 CXX=g++     CC=gcc   ALPAKA_CI_GCC_VER=8       CMAKE_BUILD_TYPE=Debug   ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.66.0 ALPAKA_CI_CMAKE_VER=3.14.7 ALPAKA_CI_ANALYSIS=ON  ALPAKA_DEBUG=2
-    - name: clang-4 + CUDA-8.0 Debug Analysis
-      env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:16.04 CXX=clang++ CC=clang ALPAKA_CI_CLANG_VER=4.0.0 CMAKE_BUILD_TYPE=Debug   ALPAKA_CI_STDLIB=libc++    ALPAKA_CI_BOOST_BRANCH=boost-1.71.0 ALPAKA_CI_CMAKE_VER=3.15.5 ALPAKA_CI_ANALYSIS=ON  ALPAKA_DEBUG=1 ALPAKA_ACC_GPU_CUDA_ENABLE=ON  ALPAKA_CUDA_VERSION=8.0 ALPAKA_CUDA_COMPILER=clang
-    - name: clang-6 Debug Analysis
-      env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:18.04 CXX=clang++ CC=clang ALPAKA_CI_CLANG_VER=6.0.1 CMAKE_BUILD_TYPE=Debug   ALPAKA_CI_STDLIB=libc++    ALPAKA_CI_BOOST_BRANCH=boost-1.68.0 ALPAKA_CI_CMAKE_VER=3.13.5 ALPAKA_CI_ANALYSIS=ON  ALPAKA_DEBUG=2
-    - name: macOS 10.14 Xcode 11.2 Debug Analysis
-      os: osx
-      osx_image: xcode11.2
-      env:                                               CXX=g++     CC=gcc                             CMAKE_BUILD_TYPE=Debug                              ALPAKA_CI_BOOST_BRANCH=boost-1.65.1                            ALPAKA_CI_ANALYSIS=ON  ALPAKA_DEBUG=2 ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE=OFF
-    - name: MSVC-2017 Debug Analysis
-      os: windows
-      dist: 1803-containers
-      language: cpp
-      env:                                               CXX=cl.exe  CC=cl.exe                          CMAKE_BUILD_TYPE=Debug                              ALPAKA_CI_BOOST_BRANCH=boost-1.69.0 ALPAKA_CI_CMAKE_VER=3.13.5 ALPAKA_CI_ANALYSIS=ON  ALPAKA_DEBUG=2
-
-    ### macOS
-    - name: macOS 10.14 Xcode 10.2.1 Debug
-      os: osx
-      osx_image: xcode10.2
-      env:                                               CXX=g++     CC=gcc                             CMAKE_BUILD_TYPE=Debug                              ALPAKA_CI_BOOST_BRANCH=boost-1.67.0                            ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE=OFF ALPAKA_CXX_STANDARD=14
-    - name: macOS 10.14 Xcode 10.2.1 Release
-      os: osx
-      osx_image: xcode10.2
-      env:                                               CXX=g++     CC=gcc                             CMAKE_BUILD_TYPE=Release                            ALPAKA_CI_BOOST_BRANCH=boost-1.71.0                            ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE=OFF
-
-    - name: macOS 10.14.4 Xcode 10.3 Debug
-      os: osx
-      osx_image: xcode10.3
-      env:                                               CXX=g++     CC=gcc                             CMAKE_BUILD_TYPE=Debug                              ALPAKA_CI_BOOST_BRANCH=boost-1.67.0                            ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE=OFF ALPAKA_CXX_STANDARD=14
-    - name: macOS 10.14.4 Xcode 10.3 Release
-      os: osx
-      osx_image: xcode10.3
-      env:                                               CXX=g++     CC=gcc                             CMAKE_BUILD_TYPE=Release                            ALPAKA_CI_BOOST_BRANCH=boost-1.71.0                            ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE=OFF
-
-    - name: macOS 10.14 Xcode 11.0 Debug
-      os: osx
-      osx_image: xcode11
-      env:                                               CXX=g++     CC=gcc                             CMAKE_BUILD_TYPE=Debug                              ALPAKA_CI_BOOST_BRANCH=boost-1.67.0                            ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE=OFF ALPAKA_CXX_STANDARD=14
-    - name: macOS 10.14 Xcode 11.0 Release
-      os: osx
-      osx_image: xcode11
-      env:                                               CXX=g++     CC=gcc                             CMAKE_BUILD_TYPE=Release                            ALPAKA_CI_BOOST_BRANCH=boost-1.71.0                            ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE=OFF
-
-    - name: macOS 10.14 Xcode 11.1 Debug
-      os: osx
-      osx_image: xcode11.1
-      env:                                               CXX=g++     CC=gcc                             CMAKE_BUILD_TYPE=Debug                              ALPAKA_CI_BOOST_BRANCH=boost-1.67.0                            ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE=OFF ALPAKA_CXX_STANDARD=14
-    - name: macOS 10.14 Xcode 11.1 Release
-      os: osx
-      osx_image: xcode11.1
-      env:                                               CXX=g++     CC=gcc                             CMAKE_BUILD_TYPE=Release                            ALPAKA_CI_BOOST_BRANCH=boost-1.71.0                            ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE=OFF
-
-    - name: macOS 10.14 Xcode 11.2 Debug
-      os: osx
-      osx_image: xcode11.2
-      env:                                               CXX=g++     CC=gcc                             CMAKE_BUILD_TYPE=Debug                              ALPAKA_CI_BOOST_BRANCH=boost-1.67.0                            ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE=OFF ALPAKA_CXX_STANDARD=14
-    - name: macOS 10.14 Xcode 11.2 Release
-      os: osx
-      osx_image: xcode11.2
-      env:                                               CXX=g++     CC=gcc                             CMAKE_BUILD_TYPE=Release                            ALPAKA_CI_BOOST_BRANCH=boost-1.71.0                            ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE=OFF
-
-    ### Windows
-    - name: MSVC-2017 Release
-      os: windows
-      dist: 1803-containers
-      language: cpp
-      env:                                               CXX=cl.exe  CC=cl.exe                          CMAKE_BUILD_TYPE=Release                            ALPAKA_CI_BOOST_BRANCH=boost-1.71.0 ALPAKA_CI_CMAKE_VER=3.14.7 OMP_NUM_THREADS=4 ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE=OFF ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE=OFF
-    - name: MSVC-2017 Debug
-      os: windows
-      dist: 1803-containers
-      language: cpp
-      env:                                               CXX=cl.exe  CC=cl.exe                          CMAKE_BUILD_TYPE=Debug                              ALPAKA_CI_BOOST_BRANCH=boost-1.64.0 ALPAKA_CI_CMAKE_VER=3.11.4 OMP_NUM_THREADS=4 ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE=OFF ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE=OFF ALPAKA_CXX_STANDARD=14
-
-    ### Ubuntu
-    ## native
-    # g++
-    # We can not enable UBSan when using gcc because it does not have a -fsanitize-blacklist option to suppress errors in boost etc.
-    # gcc 6 ASan is triggered within libtbb.so
-    # gcc 7 ASan introduced 'stack-use-after-scope' which is triggered by GOMP_parallel
-    - name: gcc-4.9 Debug
-      env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:16.04 CXX=g++     CC=gcc   ALPAKA_CI_GCC_VER=4.9     CMAKE_BUILD_TYPE=Debug   ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.62.0 ALPAKA_CI_CMAKE_VER=3.11.4 OMP_NUM_THREADS=4 ALPAKA_CXX_STANDARD=11
-    - name: gcc-5 Release
-      env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:18.04 CXX=g++     CC=gcc   ALPAKA_CI_GCC_VER=5       CMAKE_BUILD_TYPE=Release ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.66.0 ALPAKA_CI_CMAKE_VER=3.16.0 OMP_NUM_THREADS=3
-    - name: gcc-6 Debug c++14
-      env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:18.04 CXX=g++     CC=gcc   ALPAKA_CI_GCC_VER=6       CMAKE_BUILD_TYPE=Debug   ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.70.0 ALPAKA_CI_CMAKE_VER=3.14.7 OMP_NUM_THREADS=2 ALPAKA_CXX_STANDARD=14
-    - name: gcc-7 Release
-      env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:14.04 CXX=g++     CC=gcc   ALPAKA_CI_GCC_VER=7       CMAKE_BUILD_TYPE=Release ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.63.0 ALPAKA_CI_CMAKE_VER=3.13.5 OMP_NUM_THREADS=3
-    - name: gcc-8 Debug
-      env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:16.04 CXX=g++     CC=gcc   ALPAKA_CI_GCC_VER=8       CMAKE_BUILD_TYPE=Debug   ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.71.0 ALPAKA_CI_CMAKE_VER=3.12.4 OMP_NUM_THREADS=4
-    - name: gcc-9 Debug c++17
-      env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:16.04 CXX=g++     CC=gcc   ALPAKA_CI_GCC_VER=9       CMAKE_BUILD_TYPE=Debug   ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.68.0 ALPAKA_CI_CMAKE_VER=3.15.5 OMP_NUM_THREADS=3 ALPAKA_CXX_STANDARD=17 ALPAKA_ACC_CPU_BT_OMP4_ENABLE=OFF
-
-    # clang++
-    - name: clang-4 Debug UBSan
-      env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:16.04 CXX=clang++ CC=clang ALPAKA_CI_CLANG_VER=4.0.0 CMAKE_BUILD_TYPE=Debug   ALPAKA_CI_STDLIB=libc++    ALPAKA_CI_BOOST_BRANCH=boost-1.69.0 ALPAKA_CI_CMAKE_VER=3.11.4 OMP_NUM_THREADS=4 ALPAKA_CI_SANITIZERS=UBSan
-    - name: clang-5 Debug c++14
-      env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:16.04 CXX=clang++ CC=clang ALPAKA_CI_CLANG_VER=5.0.2 CMAKE_BUILD_TYPE=Debug   ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.63.0 ALPAKA_CI_CMAKE_VER=3.14.7 OMP_NUM_THREADS=3 ALPAKA_CXX_STANDARD=14
-    - name: clang-6 Release ASan C++17
-      env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:18.04 CXX=clang++ CC=clang ALPAKA_CI_CLANG_VER=6.0.1 CMAKE_BUILD_TYPE=Release ALPAKA_CI_STDLIB=libc++    ALPAKA_CI_BOOST_BRANCH=boost-1.65.1 ALPAKA_CI_CMAKE_VER=3.15.5 OMP_NUM_THREADS=2 ALPAKA_CI_SANITIZERS=ASan ALPAKA_CXX_STANDARD=17
-    - name: clang-7 Release c++17
-      env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:18.04 CXX=clang++ CC=clang ALPAKA_CI_CLANG_VER=7.0.1 CMAKE_BUILD_TYPE=Release ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.67.0 ALPAKA_CI_CMAKE_VER=3.13.5 OMP_NUM_THREADS=2 ALPAKA_CXX_STANDARD=17 ALPAKA_CI_CLANG_LIBSTDCPP_VERSION=7
-    - name: clang-8 Release
-      env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:18.04 CXX=clang++ CC=clang ALPAKA_CI_CLANG_VER=8.0.0 CMAKE_BUILD_TYPE=Release ALPAKA_CI_STDLIB=libc++    ALPAKA_CI_BOOST_BRANCH=boost-1.70.0 ALPAKA_CI_CMAKE_VER=3.12.4 OMP_NUM_THREADS=4
-    - name: clang-9 Debug
-      env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:18.04 CXX=clang++ CC=clang ALPAKA_CI_CLANG_VER=9.0.0 CMAKE_BUILD_TYPE=Debug   ALPAKA_CI_STDLIB=libc++    ALPAKA_CI_BOOST_BRANCH=boost-1.71.0 ALPAKA_CI_CMAKE_VER=3.16.0 OMP_NUM_THREADS=3
-
-    ## CUDA 8.0
-    # nvcc + g++
-    - name: nvcc-8.0 + gcc-4.9 Debug
-      env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:14.04 CXX=g++     CC=gcc   ALPAKA_CI_GCC_VER=4.9     CMAKE_BUILD_TYPE=Debug   ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.62.0 ALPAKA_CI_CMAKE_VER=3.11.4 ALPAKA_ACC_GPU_CUDA_ENABLE=ON  ALPAKA_CUDA_VERSION=8.0 ALPAKA_CUDA_COMPILER=nvcc ALPAKA_CUDA_ARCH="20;60" ALPAKA_CXX_STANDARD=11
-    # clang++
-    - name: clang-4 + CUDA-8.0 Release
-      env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:16.04 CXX=clang++ CC=clang ALPAKA_CI_CLANG_VER=4.0.0 CMAKE_BUILD_TYPE=Release ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.68.0 ALPAKA_CI_CMAKE_VER=3.12.4 ALPAKA_ACC_GPU_CUDA_ENABLE=ON  ALPAKA_CUDA_VERSION=8.0 ALPAKA_CUDA_COMPILER=clang
-    - name: clang-5 + CUDA-8.0 Release ALPAKA_ACC_GPU_CUDA_ONLY_MODE
-      env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:18.04 CXX=clang++ CC=clang ALPAKA_CI_CLANG_VER=5.0.2 CMAKE_BUILD_TYPE=Debug   ALPAKA_CI_STDLIB=libc++    ALPAKA_CI_BOOST_BRANCH=boost-1.71.0 ALPAKA_CI_CMAKE_VER=3.13.5 ALPAKA_ACC_GPU_CUDA_ENABLE=ON  ALPAKA_CUDA_VERSION=8.0 ALPAKA_CUDA_COMPILER=clang ALPAKA_CUDA_ARCH="20;35" ALPAKA_ACC_GPU_CUDA_ONLY_MODE=ON
-    - name: clang-6 + CUDA-8.0 Release
-      env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:16.04 CXX=clang++ CC=clang ALPAKA_CI_CLANG_VER=6.0.1 CMAKE_BUILD_TYPE=Release ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.64.0 ALPAKA_CI_CMAKE_VER=3.16.0 ALPAKA_ACC_GPU_CUDA_ENABLE=ON  ALPAKA_CUDA_VERSION=8.0 ALPAKA_CUDA_COMPILER=clang
-    - name: clang-7 + CUDA-8.0 Debug
-      env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:16.04 CXX=clang++ CC=clang ALPAKA_CI_CLANG_VER=7.0.1 CMAKE_BUILD_TYPE=Debug   ALPAKA_CI_STDLIB=libc++    ALPAKA_CI_BOOST_BRANCH=boost-1.70.0 ALPAKA_CI_CMAKE_VER=3.14.7 ALPAKA_ACC_GPU_CUDA_ENABLE=ON  ALPAKA_CUDA_VERSION=8.0 ALPAKA_CUDA_COMPILER=clang
-    - name: clang-8 + CUDA-8.0 Release
-      env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:16.04 CXX=clang++ CC=clang ALPAKA_CI_CLANG_VER=8.0.0 CMAKE_BUILD_TYPE=Release ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.67.0 ALPAKA_CI_CMAKE_VER=3.15.5 ALPAKA_ACC_GPU_CUDA_ENABLE=ON  ALPAKA_CUDA_VERSION=8.0 ALPAKA_CUDA_COMPILER=clang
-
-    ## CUDA 9.0
-    # nvcc + g++
-    - name: nvcc-9.0 + gcc-4.9 Release
-      env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:16.04 CXX=g++     CC=gcc   ALPAKA_CI_GCC_VER=4.9     CMAKE_BUILD_TYPE=Release ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.70.0 ALPAKA_CI_CMAKE_VER=3.14.7 ALPAKA_ACC_GPU_CUDA_ENABLE=ON  ALPAKA_CUDA_VERSION=9.0 ALPAKA_CUDA_COMPILER=nvcc
-    - name: nvcc-9.0 + gcc-5 Debug
-      env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:18.04 CXX=g++     CC=gcc   ALPAKA_CI_GCC_VER=5       CMAKE_BUILD_TYPE=Debug   ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.65.1 ALPAKA_CI_CMAKE_VER=3.11.4 ALPAKA_ACC_GPU_CUDA_ENABLE=ON  ALPAKA_CUDA_VERSION=9.0 ALPAKA_CUDA_COMPILER=nvcc ALPAKA_CUDA_ARCH="70"
-    # clang++
-    - name: clang-6 + CUDA-9.0 Debug
-      env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:16.04 CXX=clang++ CC=clang ALPAKA_CI_CLANG_VER=6.0.1 CMAKE_BUILD_TYPE=Debug   ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.67.0 ALPAKA_CI_CMAKE_VER=3.13.5 ALPAKA_ACC_GPU_CUDA_ENABLE=ON  ALPAKA_CUDA_VERSION=9.0 ALPAKA_CUDA_COMPILER=clang ALPAKA_CUDA_ARCH="35"
-    - name: clang-7 + CUDA-9.0 Release
-      env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:16.04 CXX=clang++ CC=clang ALPAKA_CI_CLANG_VER=7.0.1 CMAKE_BUILD_TYPE=Release ALPAKA_CI_STDLIB=libc++    ALPAKA_CI_BOOST_BRANCH=boost-1.71.0 ALPAKA_CI_CMAKE_VER=3.15.5 ALPAKA_ACC_GPU_CUDA_ENABLE=ON  ALPAKA_CUDA_VERSION=9.0 ALPAKA_CUDA_COMPILER=clang ALPAKA_CUDA_ARCH="35;70"
-    - name: clang-8 + CUDA-9.0 Debug
-      env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:16.04 CXX=clang++ CC=clang ALPAKA_CI_CLANG_VER=8.0.0 CMAKE_BUILD_TYPE=Debug   ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.68.0 ALPAKA_CI_CMAKE_VER=3.14.7 ALPAKA_ACC_GPU_CUDA_ENABLE=ON  ALPAKA_CUDA_VERSION=9.0 ALPAKA_CUDA_COMPILER=clang
-
-    ## CUDA 9.1
-    # nvcc + g++
-    - name: nvcc-9.1 + gcc-4.9 Debug ALPAKA_ACC_GPU_CUDA_ONLY_MODE
-      env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:14.04 CXX=g++     CC=gcc   ALPAKA_CI_GCC_VER=4.9     CMAKE_BUILD_TYPE=Debug   ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.71.0 ALPAKA_CI_CMAKE_VER=3.13.5 ALPAKA_ACC_GPU_CUDA_ENABLE=ON  ALPAKA_CUDA_VERSION=9.1 ALPAKA_CUDA_COMPILER=nvcc ALPAKA_CUDA_ARCH="30;72" ALPAKA_ACC_GPU_CUDA_ONLY_MODE=ON
-    - name: nvcc-9.1 + gcc-5 Release
-      env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:18.04 CXX=g++     CC=gcc   ALPAKA_CI_GCC_VER=5       CMAKE_BUILD_TYPE=Release ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.65.1 ALPAKA_CI_CMAKE_VER=3.11.4 ALPAKA_ACC_GPU_CUDA_ENABLE=ON  ALPAKA_CUDA_VERSION=9.1 ALPAKA_CUDA_COMPILER=nvcc
-    # nvcc + clang++
-    - name: nvcc-9.1 + clang-4 Debug
-      env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:16.04 CXX=clang++ CC=clang ALPAKA_CI_CLANG_VER=4.0.0 CMAKE_BUILD_TYPE=Debug   ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.70.0 ALPAKA_CI_CMAKE_VER=3.11.4 ALPAKA_ACC_GPU_CUDA_ENABLE=ON  ALPAKA_CUDA_VERSION=9.1 ALPAKA_CUDA_COMPILER=nvcc ALPAKA_CUDA_ARCH="30;70"
-    # clang++
-    - name: clang-7 + CUDA-9.1 Release
-      env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:16.04 CXX=clang++ CC=clang ALPAKA_CI_CLANG_VER=7.0.1 CMAKE_BUILD_TYPE=Release ALPAKA_CI_STDLIB=libc++    ALPAKA_CI_BOOST_BRANCH=boost-1.68.0 ALPAKA_CI_CMAKE_VER=3.15.5 ALPAKA_ACC_GPU_CUDA_ENABLE=ON  ALPAKA_CUDA_VERSION=9.1 ALPAKA_CUDA_COMPILER=clang ALPAKA_CUDA_ARCH="35;72"
-    - name: clang-8 + CUDA-9.1 Release
-      env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:16.04 CXX=clang++ CC=clang ALPAKA_CI_CLANG_VER=8.0.0 CMAKE_BUILD_TYPE=Release ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.67.0 ALPAKA_CI_CMAKE_VER=3.14.7 ALPAKA_ACC_GPU_CUDA_ENABLE=ON  ALPAKA_CUDA_VERSION=9.1 ALPAKA_CUDA_COMPILER=clang
-
-    ## CUDA 9.2
-    # nvcc + g++
-    - name: nvcc-9.2 + gcc-4.9 Debug
-      env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:16.04 CXX=g++     CC=gcc   ALPAKA_CI_GCC_VER=4.9     CMAKE_BUILD_TYPE=Debug   ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.67.0 ALPAKA_CI_CMAKE_VER=3.11.4 ALPAKA_ACC_GPU_CUDA_ENABLE=ON  ALPAKA_CUDA_VERSION=9.2 ALPAKA_CUDA_COMPILER=nvcc ALPAKA_CUDA_ARCH="30;72"
-    - name: nvcc-9.2 + gcc-5 Release
-      env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:14.04 CXX=g++     CC=gcc   ALPAKA_CI_GCC_VER=5       CMAKE_BUILD_TYPE=Release ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.68.0 ALPAKA_CI_CMAKE_VER=3.12.4 ALPAKA_ACC_GPU_CUDA_ENABLE=ON  ALPAKA_CUDA_VERSION=9.2 ALPAKA_CUDA_COMPILER=nvcc
-    - name: nvcc-9.2 + gcc-6 Debug separable compilation
-      env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:16.04 CXX=g++     CC=gcc   ALPAKA_CI_GCC_VER=6       CMAKE_BUILD_TYPE=Debug   ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.65.1 ALPAKA_CI_CMAKE_VER=3.15.5 ALPAKA_ACC_GPU_CUDA_ENABLE=ON  ALPAKA_CUDA_VERSION=9.2 ALPAKA_CUDA_COMPILER=nvcc ALPAKA_CUDA_NVCC_SEPARABLE_COMPILATION=ON
-    - name: nvcc-9.2 + gcc-7 Release + relaxed constexpr off + extended lambda off
-      env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:18.04 CXX=g++     CC=gcc   ALPAKA_CI_GCC_VER=7       CMAKE_BUILD_TYPE=Release ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.69.0 ALPAKA_CI_CMAKE_VER=3.13.5 ALPAKA_ACC_GPU_CUDA_ENABLE=ON  ALPAKA_CUDA_VERSION=9.2 ALPAKA_CUDA_COMPILER=nvcc ALPAKA_CUDA_ARCH="30;35" ALPAKA_CUDA_NVCC_EXPT_RELAXED_CONSTEXPR=OFF ALPAKA_CUDA_NVCC_EXPT_EXTENDED_LAMBDA=OFF
-    # nvcc + clang++
-    - name: nvcc-9.2 + clang-4 Debug
-      env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:16.04 CXX=clang++ CC=clang ALPAKA_CI_CLANG_VER=4.0.0 CMAKE_BUILD_TYPE=Debug   ALPAKA_CI_STDLIB=libc++    ALPAKA_CI_BOOST_BRANCH=boost-1.65.1 ALPAKA_CI_CMAKE_VER=3.14.7 ALPAKA_ACC_GPU_CUDA_ENABLE=ON  ALPAKA_CUDA_VERSION=9.2 ALPAKA_CUDA_COMPILER=nvcc ALPAKA_CUDA_ARCH="30;70"
-    # clang++
-    - name: clang-7 + CUDA-9.2 Release c++17
-      env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:16.04 CXX=clang++ CC=clang ALPAKA_CI_CLANG_VER=7.0.1 CMAKE_BUILD_TYPE=Release ALPAKA_CI_STDLIB=libc++    ALPAKA_CI_BOOST_BRANCH=boost-1.67.0 ALPAKA_CI_CMAKE_VER=3.11.4 ALPAKA_ACC_GPU_CUDA_ENABLE=ON  ALPAKA_CUDA_VERSION=9.2 ALPAKA_CUDA_COMPILER=clang ALPAKA_CUDA_ARCH="35;72" ALPAKA_CXX_STANDARD=17
-    - name: clang-8 + CUDA-9.2 Debug
-      env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:16.04 CXX=clang++ CC=clang ALPAKA_CI_CLANG_VER=8.0.0 CMAKE_BUILD_TYPE=Debug   ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.70.0 ALPAKA_CI_CMAKE_VER=3.14.7 ALPAKA_ACC_GPU_CUDA_ENABLE=ON  ALPAKA_CUDA_VERSION=9.2 ALPAKA_CUDA_COMPILER=clang
-    - name: clang-9 + CUDA-9.2 Release
-      env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:16.04 CXX=clang++ CC=clang ALPAKA_CI_CLANG_VER=9.0.0 CMAKE_BUILD_TYPE=Release ALPAKA_CI_STDLIB=libc++    ALPAKA_CI_BOOST_BRANCH=boost-1.69.0 ALPAKA_CI_CMAKE_VER=3.16.0 ALPAKA_ACC_GPU_CUDA_ENABLE=ON  ALPAKA_CUDA_VERSION=9.2 ALPAKA_CUDA_COMPILER=clang
-
-    ## CUDA 10.0
-    # nvcc + g++
-    - name: nvcc-10.0 + gcc-4.9 Debug
-      env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:14.04 CXX=g++     CC=gcc   ALPAKA_CI_GCC_VER=4.9     CMAKE_BUILD_TYPE=Debug   ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.69.0 ALPAKA_CI_CMAKE_VER=3.11.4 ALPAKA_ACC_GPU_CUDA_ENABLE=ON  ALPAKA_CUDA_VERSION=10.0 ALPAKA_CUDA_COMPILER=nvcc ALPAKA_CUDA_ARCH="30;75"
-    - name: nvcc-10.0 + gcc-5 Release c++14
-      env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:16.04 CXX=g++     CC=gcc   ALPAKA_CI_GCC_VER=5       CMAKE_BUILD_TYPE=Release ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.68.0 ALPAKA_CI_CMAKE_VER=3.12.4 ALPAKA_ACC_GPU_CUDA_ENABLE=ON  ALPAKA_CUDA_VERSION=10.0 ALPAKA_CUDA_COMPILER=nvcc ALPAKA_CXX_STANDARD=14
-    - name: nvcc-10.0 + gcc-6 Debug
-      env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:14.04 CXX=g++     CC=gcc   ALPAKA_CI_GCC_VER=6       CMAKE_BUILD_TYPE=Debug   ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.65.1 ALPAKA_CI_CMAKE_VER=3.15.5 ALPAKA_ACC_GPU_CUDA_ENABLE=ON  ALPAKA_CUDA_VERSION=10.0 ALPAKA_CUDA_COMPILER=nvcc
-    - name: nvcc-10.0 + gcc-7 Release c++14
-      env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:18.04 CXX=g++     CC=gcc   ALPAKA_CI_GCC_VER=7       CMAKE_BUILD_TYPE=Release ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.67.0 ALPAKA_CI_CMAKE_VER=3.14.7 ALPAKA_ACC_GPU_CUDA_ENABLE=ON  ALPAKA_CUDA_VERSION=10.0 ALPAKA_CUDA_COMPILER=nvcc ALPAKA_CUDA_ARCH="30;35" ALPAKA_CXX_STANDARD=14
-    # nvcc + clang++
-    - name: nvcc-10.0 + clang-4 Debug
-      env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:16.04 CXX=clang++ CC=clang ALPAKA_CI_CLANG_VER=4.0.0 CMAKE_BUILD_TYPE=Debug   ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.65.1 ALPAKA_CI_CMAKE_VER=3.16.0 ALPAKA_ACC_GPU_CUDA_ENABLE=ON  ALPAKA_CUDA_VERSION=10.0 ALPAKA_CUDA_COMPILER=nvcc ALPAKA_CUDA_ARCH="30;60"
-    - name: nvcc-10.0 + clang-5 Debug separable compilation
-      env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:16.04 CXX=clang++ CC=clang ALPAKA_CI_CLANG_VER=5.0.2 CMAKE_BUILD_TYPE=Release ALPAKA_CI_STDLIB=libc++    ALPAKA_CI_BOOST_BRANCH=boost-1.71.0 ALPAKA_CI_CMAKE_VER=3.13.5 ALPAKA_ACC_GPU_CUDA_ENABLE=ON  ALPAKA_CUDA_VERSION=10.0 ALPAKA_CUDA_COMPILER=nvcc ALPAKA_CUDA_ARCH="75" ALPAKA_CUDA_NVCC_SEPARABLE_COMPILATION=ON
-    - name: nvcc-10.0 + clang-6 Debug c++14
-      env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:16.04 CXX=clang++ CC=clang ALPAKA_CI_CLANG_VER=6.0.1 CMAKE_BUILD_TYPE=Debug   ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.65.1 ALPAKA_CI_CMAKE_VER=3.11.4 ALPAKA_ACC_GPU_CUDA_ENABLE=ON  ALPAKA_CUDA_VERSION=10.0 ALPAKA_CUDA_COMPILER=nvcc ALPAKA_CUDA_ARCH="70" ALPAKA_CXX_STANDARD=14
-    # nvcc + MSVC
-    - name: nvcc-10.0 + MSVC-2017 Release ALPAKA_ACC_GPU_CUDA_ONLY_MODE separable compilation
-      os: windows
-      dist: 1803-containers
-      language: cpp
-      env:                                               CXX=cl.exe  CC=cl.exe                          CMAKE_BUILD_TYPE=Release                            ALPAKA_CI_BOOST_BRANCH=boost-1.67.0 ALPAKA_CI_CMAKE_VER=3.11.4 ALPAKA_ACC_GPU_CUDA_ENABLE=ON  ALPAKA_CUDA_VERSION=10.0 ALPAKA_CUDA_ARCH="30;75" ALPAKA_ACC_GPU_CUDA_ONLY_MODE=ON ALPAKA_CUDA_NVCC_SEPARABLE_COMPILATION=ON
-    - name: nvcc-10.0 + MSVC-2017 Debug (Only one CPU backend enabled due to compile time)
-      os: windows
-      dist: 1803-containers
-      language: cpp
-      env:                                               CXX=cl.exe  CC=cl.exe                          CMAKE_BUILD_TYPE=Debug                              ALPAKA_CI_BOOST_BRANCH=boost-1.66.0 ALPAKA_CI_CMAKE_VER=3.15.5 ALPAKA_ACC_GPU_CUDA_ENABLE=ON  ALPAKA_CUDA_VERSION=10.0 ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLE=OFF ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLE=OFF ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE=OFF ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE=OFF ALPAKA_ACC_CPU_BT_OMP4_ENABLE=OFF ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLE=OFF
-    # clang++
-    - name: clang-8 + CUDA-10.0 Release
-      env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:16.04 CXX=clang++ CC=clang ALPAKA_CI_CLANG_VER=8.0.0 CMAKE_BUILD_TYPE=Release ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.67.0 ALPAKA_CI_CMAKE_VER=3.14.7 ALPAKA_ACC_GPU_CUDA_ENABLE=ON  ALPAKA_CUDA_VERSION=10.0 ALPAKA_CUDA_COMPILER=clang
-    - name: clang-9 + CUDA-10.0 Debug
-      env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:16.04 CXX=clang++ CC=clang ALPAKA_CI_CLANG_VER=9.0.0 CMAKE_BUILD_TYPE=Debug   ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.67.0 ALPAKA_CI_CMAKE_VER=3.14.7 ALPAKA_ACC_GPU_CUDA_ENABLE=ON  ALPAKA_CUDA_VERSION=10.0 ALPAKA_CUDA_COMPILER=clang
-
-    ## CUDA 10.1
-    # nvcc + g++
-    - name: nvcc-10.1 + gcc-4.9 Debug
-      env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:14.04 CXX=g++     CC=gcc   ALPAKA_CI_GCC_VER=4.9     CMAKE_BUILD_TYPE=Debug   ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.71.0 ALPAKA_CI_CMAKE_VER=3.11.4 ALPAKA_ACC_GPU_CUDA_ENABLE=ON  ALPAKA_CUDA_VERSION=10.1 ALPAKA_CUDA_COMPILER=nvcc ALPAKA_CUDA_ARCH="30;75"
-    - name: nvcc-10.1 + gcc-5 Release
-      env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:16.04 CXX=g++     CC=gcc   ALPAKA_CI_GCC_VER=5       CMAKE_BUILD_TYPE=Release ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.68.0 ALPAKA_CI_CMAKE_VER=3.12.4 ALPAKA_ACC_GPU_CUDA_ENABLE=ON  ALPAKA_CUDA_VERSION=10.1 ALPAKA_CUDA_COMPILER=nvcc
-    - name: nvcc-10.1 + gcc-6 Debug
-      env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:14.04 CXX=g++     CC=gcc   ALPAKA_CI_GCC_VER=6       CMAKE_BUILD_TYPE=Debug   ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.65.1 ALPAKA_CI_CMAKE_VER=3.15.5 ALPAKA_ACC_GPU_CUDA_ENABLE=ON  ALPAKA_CUDA_VERSION=10.1 ALPAKA_CUDA_COMPILER=nvcc
-    - name: nvcc-10.1 + gcc-7 Release
-      env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:18.04 CXX=g++     CC=gcc   ALPAKA_CI_GCC_VER=7       CMAKE_BUILD_TYPE=Release ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.67.0 ALPAKA_CI_CMAKE_VER=3.11.4 ALPAKA_ACC_GPU_CUDA_ENABLE=ON  ALPAKA_CUDA_VERSION=10.1 ALPAKA_CUDA_COMPILER=nvcc ALPAKA_CUDA_ARCH="30;35"
-    - name: nvcc-10.1 + gcc-8 Debug
-      env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:18.04 CXX=g++     CC=gcc   ALPAKA_CI_GCC_VER=8       CMAKE_BUILD_TYPE=Debug   ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.69.0 ALPAKA_CI_CMAKE_VER=3.13.5 ALPAKA_ACC_GPU_CUDA_ENABLE=ON  ALPAKA_CUDA_VERSION=10.1 ALPAKA_CUDA_COMPILER=nvcc ALPAKA_CUDA_ARCH="30;35"
-    # nvcc + clang++
-    - name: nvcc-10.1 + clang-4 Debug
-      env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:16.04 CXX=clang++ CC=clang ALPAKA_CI_CLANG_VER=4.0.0 CMAKE_BUILD_TYPE=Debug   ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.65.1 ALPAKA_CI_CMAKE_VER=3.15.5 ALPAKA_ACC_GPU_CUDA_ENABLE=ON  ALPAKA_CUDA_VERSION=10.1 ALPAKA_CUDA_COMPILER=nvcc ALPAKA_CUDA_ARCH="30;60"
-    - name: nvcc-10.1 + clang-5 Release ALPAKA_ACC_GPU_CUDA_ONLY_MODE
-      env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:16.04 CXX=clang++ CC=clang ALPAKA_CI_CLANG_VER=5.0.2 CMAKE_BUILD_TYPE=Release ALPAKA_CI_STDLIB=libc++    ALPAKA_CI_BOOST_BRANCH=boost-1.69.0 ALPAKA_CI_CMAKE_VER=3.13.5 ALPAKA_ACC_GPU_CUDA_ENABLE=ON  ALPAKA_CUDA_VERSION=10.1 ALPAKA_CUDA_COMPILER=nvcc ALPAKA_CUDA_ARCH="75" ALPAKA_ACC_GPU_CUDA_ONLY_MODE=ON
-    - name: nvcc-10.1 + clang-6 Debug
-      env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:16.04 CXX=clang++ CC=clang ALPAKA_CI_CLANG_VER=6.0.1 CMAKE_BUILD_TYPE=Debug   ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.71.0 ALPAKA_CI_CMAKE_VER=3.11.4 ALPAKA_ACC_GPU_CUDA_ENABLE=ON  ALPAKA_CUDA_VERSION=10.1 ALPAKA_CUDA_COMPILER=nvcc ALPAKA_CUDA_ARCH="70"
-    - name: nvcc-10.1 + clang-7 Release
-      env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:16.04 CXX=clang++ CC=clang ALPAKA_CI_CLANG_VER=7.0.1 CMAKE_BUILD_TYPE=Release ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.69.0 ALPAKA_CI_CMAKE_VER=3.12.4 ALPAKA_ACC_GPU_CUDA_ENABLE=ON  ALPAKA_CUDA_VERSION=10.1 ALPAKA_CUDA_COMPILER=nvcc ALPAKA_CUDA_ARCH="70"
-    - name: nvcc-10.1 + clang-8 Debug
-      env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:16.04 CXX=clang++ CC=clang ALPAKA_CI_CLANG_VER=8.0.0 CMAKE_BUILD_TYPE=Debug   ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.70.0 ALPAKA_CI_CMAKE_VER=3.14.7 ALPAKA_ACC_GPU_CUDA_ENABLE=ON  ALPAKA_CUDA_VERSION=10.1 ALPAKA_CUDA_COMPILER=nvcc ALPAKA_CUDA_ARCH="75"
-    # nvcc + MSVC
-    - name: nvcc-10.1 + MSVC-2017 Debug ALPAKA_ACC_GPU_CUDA_ONLY_MODE
-      os: windows
-      dist: 1803-containers
-      language: cpp
-      env:                                               CXX=cl.exe  CC=cl.exe                          CMAKE_BUILD_TYPE=Debug                              ALPAKA_CI_BOOST_BRANCH=boost-1.67.0 ALPAKA_CI_CMAKE_VER=3.11.4 ALPAKA_ACC_GPU_CUDA_ENABLE=ON  ALPAKA_CUDA_VERSION=10.1 ALPAKA_CUDA_ARCH="30;75" ALPAKA_ACC_GPU_CUDA_ONLY_MODE=ON
-    - name: nvcc-10.1 + MSVC-2017 Release (Only one CPU backend enabled due to compile time)
-      os: windows
-      dist: 1803-containers
-      language: cpp
-      env:                                               CXX=cl.exe  CC=cl.exe                          CMAKE_BUILD_TYPE=Release                            ALPAKA_CI_BOOST_BRANCH=boost-1.65.1 ALPAKA_CI_CMAKE_VER=3.14.7 ALPAKA_ACC_GPU_CUDA_ENABLE=ON  ALPAKA_CUDA_VERSION=10.1 ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLE=OFF ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLE=OFF ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE=OFF ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE=OFF ALPAKA_ACC_CPU_BT_OMP4_ENABLE=OFF ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLE=OFF
-    # clang++
-    - name: clang-9 + CUDA-10.1 Debug
-      env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:16.04 CXX=clang++ CC=clang ALPAKA_CI_CLANG_VER=9.0.0 CMAKE_BUILD_TYPE=Debug   ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.67.0 ALPAKA_CI_CMAKE_VER=3.14.7 ALPAKA_ACC_GPU_CUDA_ENABLE=ON  ALPAKA_CUDA_VERSION=10.1 ALPAKA_CUDA_COMPILER=clang
-
-    ## CUDA 10.2
-    # nvcc + g++
-    - name: nvcc-10.2 + gcc-4.9 Release
-      env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:14.04 CXX=g++     CC=gcc   ALPAKA_CI_GCC_VER=4.9     CMAKE_BUILD_TYPE=Release ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.71.0 ALPAKA_CI_CMAKE_VER=3.11.4 ALPAKA_ACC_GPU_CUDA_ENABLE=ON  ALPAKA_CUDA_VERSION=10.2 ALPAKA_CUDA_COMPILER=nvcc ALPAKA_CUDA_ARCH="30;75"
-    - name: nvcc-10.2 + gcc-5 Debug
-      env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:16.04 CXX=g++     CC=gcc   ALPAKA_CI_GCC_VER=5       CMAKE_BUILD_TYPE=Debug   ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.68.0 ALPAKA_CI_CMAKE_VER=3.12.4 ALPAKA_ACC_GPU_CUDA_ENABLE=ON  ALPAKA_CUDA_VERSION=10.2 ALPAKA_CUDA_COMPILER=nvcc ALPAKA_CUDA_ARCH="30;35"
-    - name: nvcc-10.2 + gcc-6 Release
-      env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:14.04 CXX=g++     CC=gcc   ALPAKA_CI_GCC_VER=6       CMAKE_BUILD_TYPE=Release ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.65.1 ALPAKA_CI_CMAKE_VER=3.16.0 ALPAKA_ACC_GPU_CUDA_ENABLE=ON  ALPAKA_CUDA_VERSION=10.2 ALPAKA_CUDA_COMPILER=nvcc
-    - name: nvcc-10.2 + gcc-7 Debug
-      env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:18.04 CXX=g++     CC=gcc   ALPAKA_CI_GCC_VER=7       CMAKE_BUILD_TYPE=Debug   ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.67.0 ALPAKA_CI_CMAKE_VER=3.11.4 ALPAKA_ACC_GPU_CUDA_ENABLE=ON  ALPAKA_CUDA_VERSION=10.2 ALPAKA_CUDA_COMPILER=nvcc
-    - name: nvcc-10.2 + gcc-8 Release
-      env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:18.04 CXX=g++     CC=gcc   ALPAKA_CI_GCC_VER=8       CMAKE_BUILD_TYPE=Release ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.69.0 ALPAKA_CI_CMAKE_VER=3.13.5 ALPAKA_ACC_GPU_CUDA_ENABLE=ON  ALPAKA_CUDA_VERSION=10.2 ALPAKA_CUDA_COMPILER=nvcc ALPAKA_CUDA_ARCH="30;35"
-    # nvcc + clang++
-    - name: nvcc-10.2 + clang-4 Release
-      env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:16.04 CXX=clang++ CC=clang ALPAKA_CI_CLANG_VER=4.0.0 CMAKE_BUILD_TYPE=Release ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.65.1 ALPAKA_CI_CMAKE_VER=3.15.5 ALPAKA_ACC_GPU_CUDA_ENABLE=ON  ALPAKA_CUDA_VERSION=10.2 ALPAKA_CUDA_COMPILER=nvcc ALPAKA_CUDA_ARCH="30;60"
-    - name: nvcc-10.2 + clang-5 Debug
-      env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:16.04 CXX=clang++ CC=clang ALPAKA_CI_CLANG_VER=5.0.2 CMAKE_BUILD_TYPE=Debug   ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.67.0 ALPAKA_CI_CMAKE_VER=3.13.5 ALPAKA_ACC_GPU_CUDA_ENABLE=ON  ALPAKA_CUDA_VERSION=10.2 ALPAKA_CUDA_COMPILER=nvcc ALPAKA_CUDA_ARCH="75"
-    - name: nvcc-10.2 + clang-6 Release ALPAKA_ACC_GPU_CUDA_ONLY_MODE
-      env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:16.04 CXX=clang++ CC=clang ALPAKA_CI_CLANG_VER=6.0.1 CMAKE_BUILD_TYPE=Release ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.69.0 ALPAKA_CI_CMAKE_VER=3.11.4 ALPAKA_ACC_GPU_CUDA_ENABLE=ON  ALPAKA_CUDA_VERSION=10.2 ALPAKA_CUDA_COMPILER=nvcc ALPAKA_CUDA_ARCH="70" ALPAKA_ACC_GPU_CUDA_ONLY_MODE=ON
-    - name: nvcc-10.2 + clang-7 Debug
-      env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:16.04 CXX=clang++ CC=clang ALPAKA_CI_CLANG_VER=7.0.1 CMAKE_BUILD_TYPE=Debug   ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.70.0 ALPAKA_CI_CMAKE_VER=3.12.4 ALPAKA_ACC_GPU_CUDA_ENABLE=ON  ALPAKA_CUDA_VERSION=10.2 ALPAKA_CUDA_COMPILER=nvcc ALPAKA_CUDA_ARCH="75"
-    - name: nvcc-10.2 + clang-8 Release
-      env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:16.04 CXX=clang++ CC=clang ALPAKA_CI_CLANG_VER=8.0.0 CMAKE_BUILD_TYPE=Release ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.70.0 ALPAKA_CI_CMAKE_VER=3.12.4 ALPAKA_ACC_GPU_CUDA_ENABLE=ON  ALPAKA_CUDA_VERSION=10.2 ALPAKA_CUDA_COMPILER=nvcc ALPAKA_CUDA_ARCH="75"
-    # nvcc + MSVC
-    - name: nvcc-10.2 + MSVC-2017 Debug ALPAKA_ACC_GPU_CUDA_ONLY_MODE
-      os: windows
-      dist: 1803-containers
-      language: cpp
-      env:                                               CXX=cl.exe  CC=cl.exe                          CMAKE_BUILD_TYPE=Debug                              ALPAKA_CI_BOOST_BRANCH=boost-1.67.0 ALPAKA_CI_CMAKE_VER=3.11.4 ALPAKA_ACC_GPU_CUDA_ENABLE=ON  ALPAKA_CUDA_VERSION=10.2 ALPAKA_CUDA_ARCH="30;75" ALPAKA_ACC_GPU_CUDA_ONLY_MODE=ON
-    - name: nvcc-10.2 + MSVC-2017 Release (Only one CPU backend enabled due to compile time)
-      os: windows
-      dist: 1803-containers
-      language: cpp
-      env:                                               CXX=cl.exe  CC=cl.exe                          CMAKE_BUILD_TYPE=Release                            ALPAKA_CI_BOOST_BRANCH=boost-1.65.1 ALPAKA_CI_CMAKE_VER=3.14.7 ALPAKA_ACC_GPU_CUDA_ENABLE=ON  ALPAKA_CUDA_VERSION=10.2 ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLE=OFF ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLE=OFF ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE=OFF ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE=OFF ALPAKA_ACC_CPU_BT_OMP4_ENABLE=OFF ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLE=OFF
-
-    ## HIP
-    - name: HIP(nvcc9.2) + gcc-5 Debug ALPAKA_ACC_GPU_HIP_ONLY_MODE
-      env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:16.04 CXX=g++     CC=gcc   ALPAKA_CI_GCC_VER=5       CMAKE_BUILD_TYPE=Debug   ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.67.0 ALPAKA_CI_CMAKE_VER=3.11.4 ALPAKA_ACC_GPU_HIP_ENABLE=ON ALPAKA_ACC_GPU_HIP_ONLY_MODE=ON ALPAKA_CI_HIP_BRANCH="roc-2.8.0" ALPAKA_HIP_PLATFORM=nvcc ALPAKA_CUDA_ARCH="30;35" ALPAKA_CUDA_VERSION=9.2 ALPAKA_CUDA_COMPILER=nvcc ALPAKA_CUDA_NVCC_EXPT_RELAXED_CONSTEXPR=OFF ALPAKA_CUDA_NVCC_EXPT_EXTENDED_LAMBDA=OFF
-    - name: HIP(nvcc9.2) + gcc-5 Release ALPAKA_ACC_GPU_HIP_ONLY_MODE
-      env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:16.04 CXX=g++     CC=gcc   ALPAKA_CI_GCC_VER=5       CMAKE_BUILD_TYPE=Release ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.71.0 ALPAKA_CI_CMAKE_VER=3.15.5 ALPAKA_ACC_GPU_HIP_ENABLE=ON ALPAKA_ACC_GPU_HIP_ONLY_MODE=ON ALPAKA_CI_HIP_BRANCH="roc-2.8.0" ALPAKA_HIP_PLATFORM=nvcc ALPAKA_CUDA_ARCH="30;35" ALPAKA_CUDA_VERSION=9.2 ALPAKA_CUDA_COMPILER=nvcc
-
-branches:
-    except:
-        - gh-pages
-
-cache:
-    directories:
-        - $ALPAKA_CI_DOCKER_CACHE_DIR
-
-script:
-    - set -eovx pipefail
-    - if [ "$TRAVIS_OS_NAME" = "linux" ] ;then sudo apt-get -y --quiet --allow-unauthenticated --no-install-recommends install smem ;fi
-    - if [ "$TRAVIS_OS_NAME" = "linux" ] ;then sudo apt-get -y --quiet --allow-unauthenticated --no-install-recommends install moreutils ;fi
-    - if [ "$TRAVIS_OS_NAME" = "osx" ] ;then brew install moreutils ;fi
-    - if [ "$TRAVIS_OS_NAME" = "linux" ] || [ "$TRAVIS_OS_NAME" = "osx" ] ;then ./script/travis/script.sh | ts ;fi
-    - if [ "$TRAVIS_OS_NAME" = "windows" ] ;then ./script/travis/script.sh ;fi
-
-after_failure:
-    - ./script/travis/after_failure.sh
-
-notifications:
-    email: false
diff --git a/thirdParty/alpaka/.zenodo.json b/thirdParty/alpaka/.zenodo.json
deleted file mode 100644
index 80d29a6ce4..0000000000
--- a/thirdParty/alpaka/.zenodo.json
+++ /dev/null
@@ -1,66 +0,0 @@
-{
-  "title": "Alpaka: Abstraction Library for Parallel Kernel Acceleration",
-  "description": "The alpaka library is a header-only C++11 abstraction library for accelerator development. Its aim is to provide performance portability across accelerators through the abstraction (not hiding!) of the underlying levels of parallelism.",
-  "creators": [
-    {
-      "affiliation": "LogMeIn, Inc.",
-      "name": "Worpitz, Benjamin"
-    },
-    {
-      "affiliation": "Helmholtz-Zentrum Dresden-Rossendorf, TU Dresden",
-      "name": "Matthes, Alexander",
-      "orcid": "0000-0002-6702-2015"
-    },
-    {
-      "affiliation": "LogMeIn, Inc.",
-      "name": "Zenker, Erik",
-      "orcid": "0000-0001-9417-8712"
-    },
-    {
-      "affiliation": "Helmholtz-Zentrum Dresden-Rossendorf, TU Dresden",
-      "name": "Huebl, Axel",
-      "orcid": "0000-0003-1943-7141"
-    },
-    {
-      "affiliation": "Helmholtz-Zentrum Dresden-Rossendorf",
-      "name": "Widera, René",
-      "orcid": "0000-0003-1642-0459"
-    }
-  ],
-  "access_right": "open",
-  "keywords": [
-    "HPC",
-    "CUDA",
-    "OpenMP",
-    "C++",
-    "GPU",
-    "HIP",
-    "heterogeneous computing",
-    "performance portability"
-  ],
-  "license": "MPL-2.0",
-  "upload_type": "software",
-  "grants": [
-    {
-      "id": "654220"
-    }
-  ],
-  "related_identifiers": [
-    {
-      "identifier": "DOI:10.5281/zenodo.49768",
-      "relation": "isCitedBy"
-    },
-    {
-      "identifier": "DOI:10.1007/978-3-319-46079-6_21",
-      "relation": "cites"
-    },
-    {
-      "identifier": "DOI:10.1109/IPDPSW.2016.50",
-      "relation": "isCitedBy"
-    },
-    {
-      "identifier": "DOI:10.1007/978-3-319-67630-2_36",
-      "relation": "isCitedBy"
-    }
-  ]
-}
diff --git a/thirdParty/alpaka/CMakeLists.txt b/thirdParty/alpaka/CMakeLists.txt
deleted file mode 100644
index 0f7a6c59b5..0000000000
--- a/thirdParty/alpaka/CMakeLists.txt
+++ /dev/null
@@ -1,46 +0,0 @@
-#
-# Copyright 2015-2019 Benjamin Worpitz
-#
-# This file is part of alpaka.
-#
-# This Source Code Form is subject to the terms of the Mozilla Public
-# License, v. 2.0. If a copy of the MPL was not distributed with this
-# file, You can obtain one at http://mozilla.org/MPL/2.0/.
-#
-
-################################################################################
-# Required CMake version
-
-cmake_minimum_required(VERSION 3.11.4)
-
-project("alpakaAll")
-
-SET_PROPERTY(GLOBAL PROPERTY USE_FOLDERS ON)
-
-################################################################################
-# CMake policies
-#
-# Search in <PackageName>_ROOT:
-#   https://cmake.org/cmake/help/v3.12/policy/CMP0074.html
-
-if(POLICY CMP0074)
-    cmake_policy(SET CMP0074 NEW)
-endif()
-
-################################################################################
-# Options and Variants
-
-option(alpaka_BUILD_EXAMPLES "Build the examples" ON)
-
-include(CTest)
-# automatically defines: BUILD_TESTING, default is ON
-
-################################################################################
-# Add subdirectories
-
-if(alpaka_BUILD_EXAMPLES)
-    add_subdirectory("example/")
-endif()
-if(BUILD_TESTING)
-    add_subdirectory("test/")
-endif()
diff --git a/thirdParty/alpaka/Findalpaka.cmake b/thirdParty/alpaka/Findalpaka.cmake
deleted file mode 100644
index 94d0187356..0000000000
--- a/thirdParty/alpaka/Findalpaka.cmake
+++ /dev/null
@@ -1,109 +0,0 @@
-#.rst:
-# Findalpaka
-# ----------
-#
-# Abstraction library for parallel kernel acceleration
-# https://github.com/ComputationalRadiationPhysics/alpaka
-#
-# Finding and Using alpaka
-# ^^^^^^^^^^^^^^^^^^^^^
-#
-# .. code-block:: cmake
-#
-#   FIND_PACKAGE(alpaka
-#     [version] [EXACT]     # Minimum or EXACT version, e.g. 1.0.0
-#     [REQUIRED]            # Fail with an error if alpaka or a required
-#                           # component is not found
-#     [QUIET]               # Do not warn if this module was not found
-#     [COMPONENTS <...>]    # Compiled in components: ignored
-#   )
-#   TARGET_LINK_LIBRARIES(<target> PUBLIC alpaka)
-#
-# To provide a hint to this module where to find the alpaka installation,
-# set the ALPAKA_ROOT variable.
-#
-# This module requires Boost. Make sure to provide a valid install of it
-# under the environment variable BOOST_ROOT.
-#
-# ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE will require Boost.Fiber to be built.
-# ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE and ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE will require a OpenMP 2.0+ capable compiler.
-# ALPAKA_ACC_CPU_BT_OMP4_ENABLE will require a OpenMP 4.0+ capable compiler.
-# ALPAKA_ACC_GPU_CUDA_ENABLE will require CUDA 8.0+ to be installed.
-# ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLE will require TBB 2.2+ to be installed
-#
-# Set the following CMake variables BEFORE calling find_packages to
-# change the behaviour of this module:
-# - ``ALPAKA_ACC_GPU_CUDA_ONLY_MODE`` {ON, OFF}
-# - ``ALPAKA_ACC_GPU_HIP_ONLY_MODE`` {ON, OFF}
-# - ``ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLE`` {ON, OFF}
-# - ``ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLE`` {ON, OFF}
-# - ``ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE`` {ON, OFF}
-# - ``ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLE`` {ON, OFF}
-# - ``ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE`` {ON, OFF}
-# - ``ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE`` {ON, OFF}
-# - ``ALPAKA_ACC_CPU_BT_OMP4_ENABLE`` {ON, OFF}
-# - ``ALPAKA_ACC_GPU_CUDA_ENABLE`` {ON, OFF}
-# - ``ALPAKA_ACC_GPU_HIP_ENABLE`` {ON, OFF}
-# - ``ALPAKA_CUDA_VERSION`` {8.0, ...}
-# - ``ALPAKA_CUDA_ARCH`` {sm_20, sm...}
-# - ``ALPAKA_CUDA_FAST_MATH`` {ON, OFF}
-# - ``ALPAKA_CUDA_FTZ`` {ON, OFF}
-# - ``ALPAKA_CUDA_SHOW_REGISTER`` {ON, OFF}
-# - ``ALPAKA_CUDA_KEEP_FILES`` {ON, OFF}
-# - ``ALPAKA_CUDA_SHOW_CODELINES`` {ON, OFF}
-# - ``ALPAKA_DEBUG`` {0, 1, 2}
-# - ``ALPAKA_CXX_STANDARD`` {11, 14, 17}
-#
-# Result Variables
-# ^^^^^^^^^^^^^^^^
-#
-# - ``alpaka_FOUND``
-#   TRUE if alpaka found a working install.
-# - ``alpaka_VERSION``
-#   Version in format Major.Minor.Patch
-# - ``alpaka_COMPILE_OPTIONS``
-#   Compiler options.
-# - ``alpaka_COMPILE_DEFINITIONS``
-#   Compiler definitions (without "-D" prefix!).
-# - ``alpaka_DEFINITIONS``
-#   Deprecated old compiler definitions. Combination of alpaka_COMPILE_OPTIONS and alpaka_COMPILE_DEFINITIONS prefixed with "-D".
-# - ``alpaka_INCLUDE_DIRS``
-#   Include directories required by the alpaka headers.
-# - ``alpaka_LIBRARIES``
-#   Libraries required to link against to use alpaka.
-#
-#
-# IMPORTED Targets
-# ^^^^^^^^^^^^^^^^
-#
-# This module defines the :prop_tgt:`IMPORTED` target ``alpaka``, if alpaka has
-# been found.
-#
-
-
-################################################################################
-# Copyright 2015-2019 Benjamin Worpitz
-#
-# Permission to use, copy, modify, and/or distribute this software for any
-# purpose with or without fee is hereby granted, provided that the above
-# copyright notice and this permission notice appear in all copies.
-#
-# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
-# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
-# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
-# SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER
-# RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT,
-# NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE
-# USE OR PERFORMANCE OF THIS SOFTWARE.
-
-FIND_PATH(
-    _ALPAKA_ROOT_DIR
-    NAMES "include/alpaka/alpaka.hpp"
-    HINTS "${ALPAKA_ROOT}" ENV ALPAKA_ROOT
-    DOC "alpaka ROOT location")
-
-IF(_ALPAKA_ROOT_DIR)
-    INCLUDE("${_ALPAKA_ROOT_DIR}/alpakaConfig.cmake")
-ELSE()
-    MESSAGE(FATAL_ERROR "alpaka could not be found!")
-ENDIF()
diff --git a/thirdParty/alpaka/LICENSE b/thirdParty/alpaka/LICENSE
deleted file mode 100644
index a612ad9813..0000000000
--- a/thirdParty/alpaka/LICENSE
+++ /dev/null
@@ -1,373 +0,0 @@
-Mozilla Public License Version 2.0
-==================================
-
-1. Definitions
---------------
-
-1.1. "Contributor"
-    means each individual or legal entity that creates, contributes to
-    the creation of, or owns Covered Software.
-
-1.2. "Contributor Version"
-    means the combination of the Contributions of others (if any) used
-    by a Contributor and that particular Contributor's Contribution.
-
-1.3. "Contribution"
-    means Covered Software of a particular Contributor.
-
-1.4. "Covered Software"
-    means Source Code Form to which the initial Contributor has attached
-    the notice in Exhibit A, the Executable Form of such Source Code
-    Form, and Modifications of such Source Code Form, in each case
-    including portions thereof.
-
-1.5. "Incompatible With Secondary Licenses"
-    means
-
-    (a) that the initial Contributor has attached the notice described
-        in Exhibit B to the Covered Software; or
-
-    (b) that the Covered Software was made available under the terms of
-        version 1.1 or earlier of the License, but not also under the
-        terms of a Secondary License.
-
-1.6. "Executable Form"
-    means any form of the work other than Source Code Form.
-
-1.7. "Larger Work"
-    means a work that combines Covered Software with other material, in
-    a separate file or files, that is not Covered Software.
-
-1.8. "License"
-    means this document.
-
-1.9. "Licensable"
-    means having the right to grant, to the maximum extent possible,
-    whether at the time of the initial grant or subsequently, any and
-    all of the rights conveyed by this License.
-
-1.10. "Modifications"
-    means any of the following:
-
-    (a) any file in Source Code Form that results from an addition to,
-        deletion from, or modification of the contents of Covered
-        Software; or
-
-    (b) any new file in Source Code Form that contains any Covered
-        Software.
-
-1.11. "Patent Claims" of a Contributor
-    means any patent claim(s), including without limitation, method,
-    process, and apparatus claims, in any patent Licensable by such
-    Contributor that would be infringed, but for the grant of the
-    License, by the making, using, selling, offering for sale, having
-    made, import, or transfer of either its Contributions or its
-    Contributor Version.
-
-1.12. "Secondary License"
-    means either the GNU General Public License, Version 2.0, the GNU
-    Lesser General Public License, Version 2.1, the GNU Affero General
-    Public License, Version 3.0, or any later versions of those
-    licenses.
-
-1.13. "Source Code Form"
-    means the form of the work preferred for making modifications.
-
-1.14. "You" (or "Your")
-    means an individual or a legal entity exercising rights under this
-    License. For legal entities, "You" includes any entity that
-    controls, is controlled by, or is under common control with You. For
-    purposes of this definition, "control" means (a) the power, direct
-    or indirect, to cause the direction or management of such entity,
-    whether by contract or otherwise, or (b) ownership of more than
-    fifty percent (50%) of the outstanding shares or beneficial
-    ownership of such entity.
-
-2. License Grants and Conditions
---------------------------------
-
-2.1. Grants
-
-Each Contributor hereby grants You a world-wide, royalty-free,
-non-exclusive license:
-
-(a) under intellectual property rights (other than patent or trademark)
-    Licensable by such Contributor to use, reproduce, make available,
-    modify, display, perform, distribute, and otherwise exploit its
-    Contributions, either on an unmodified basis, with Modifications, or
-    as part of a Larger Work; and
-
-(b) under Patent Claims of such Contributor to make, use, sell, offer
-    for sale, have made, import, and otherwise transfer either its
-    Contributions or its Contributor Version.
-
-2.2. Effective Date
-
-The licenses granted in Section 2.1 with respect to any Contribution
-become effective for each Contribution on the date the Contributor first
-distributes such Contribution.
-
-2.3. Limitations on Grant Scope
-
-The licenses granted in this Section 2 are the only rights granted under
-this License. No additional rights or licenses will be implied from the
-distribution or licensing of Covered Software under this License.
-Notwithstanding Section 2.1(b) above, no patent license is granted by a
-Contributor:
-
-(a) for any code that a Contributor has removed from Covered Software;
-    or
-
-(b) for infringements caused by: (i) Your and any other third party's
-    modifications of Covered Software, or (ii) the combination of its
-    Contributions with other software (except as part of its Contributor
-    Version); or
-
-(c) under Patent Claims infringed by Covered Software in the absence of
-    its Contributions.
-
-This License does not grant any rights in the trademarks, service marks,
-or logos of any Contributor (except as may be necessary to comply with
-the notice requirements in Section 3.4).
-
-2.4. Subsequent Licenses
-
-No Contributor makes additional grants as a result of Your choice to
-distribute the Covered Software under a subsequent version of this
-License (see Section 10.2) or under the terms of a Secondary License (if
-permitted under the terms of Section 3.3).
-
-2.5. Representation
-
-Each Contributor represents that the Contributor believes its
-Contributions are its original creation(s) or it has sufficient rights
-to grant the rights to its Contributions conveyed by this License.
-
-2.6. Fair Use
-
-This License is not intended to limit any rights You have under
-applicable copyright doctrines of fair use, fair dealing, or other
-equivalents.
-
-2.7. Conditions
-
-Sections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted
-in Section 2.1.
-
-3. Responsibilities
--------------------
-
-3.1. Distribution of Source Form
-
-All distribution of Covered Software in Source Code Form, including any
-Modifications that You create or to which You contribute, must be under
-the terms of this License. You must inform recipients that the Source
-Code Form of the Covered Software is governed by the terms of this
-License, and how they can obtain a copy of this License. You may not
-attempt to alter or restrict the recipients' rights in the Source Code
-Form.
-
-3.2. Distribution of Executable Form
-
-If You distribute Covered Software in Executable Form then:
-
-(a) such Covered Software must also be made available in Source Code
-    Form, as described in Section 3.1, and You must inform recipients of
-    the Executable Form how they can obtain a copy of such Source Code
-    Form by reasonable means in a timely manner, at a charge no more
-    than the cost of distribution to the recipient; and
-
-(b) You may distribute such Executable Form under the terms of this
-    License, or sublicense it under different terms, provided that the
-    license for the Executable Form does not attempt to limit or alter
-    the recipients' rights in the Source Code Form under this License.
-
-3.3. Distribution of a Larger Work
-
-You may create and distribute a Larger Work under terms of Your choice,
-provided that You also comply with the requirements of this License for
-the Covered Software. If the Larger Work is a combination of Covered
-Software with a work governed by one or more Secondary Licenses, and the
-Covered Software is not Incompatible With Secondary Licenses, this
-License permits You to additionally distribute such Covered Software
-under the terms of such Secondary License(s), so that the recipient of
-the Larger Work may, at their option, further distribute the Covered
-Software under the terms of either this License or such Secondary
-License(s).
-
-3.4. Notices
-
-You may not remove or alter the substance of any license notices
-(including copyright notices, patent notices, disclaimers of warranty,
-or limitations of liability) contained within the Source Code Form of
-the Covered Software, except that You may alter any license notices to
-the extent required to remedy known factual inaccuracies.
-
-3.5. Application of Additional Terms
-
-You may choose to offer, and to charge a fee for, warranty, support,
-indemnity or liability obligations to one or more recipients of Covered
-Software. However, You may do so only on Your own behalf, and not on
-behalf of any Contributor. You must make it absolutely clear that any
-such warranty, support, indemnity, or liability obligation is offered by
-You alone, and You hereby agree to indemnify every Contributor for any
-liability incurred by such Contributor as a result of warranty, support,
-indemnity or liability terms You offer. You may include additional
-disclaimers of warranty and limitations of liability specific to any
-jurisdiction.
-
-4. Inability to Comply Due to Statute or Regulation
----------------------------------------------------
-
-If it is impossible for You to comply with any of the terms of this
-License with respect to some or all of the Covered Software due to
-statute, judicial order, or regulation then You must: (a) comply with
-the terms of this License to the maximum extent possible; and (b)
-describe the limitations and the code they affect. Such description must
-be placed in a text file included with all distributions of the Covered
-Software under this License. Except to the extent prohibited by statute
-or regulation, such description must be sufficiently detailed for a
-recipient of ordinary skill to be able to understand it.
-
-5. Termination
---------------
-
-5.1. The rights granted under this License will terminate automatically
-if You fail to comply with any of its terms. However, if You become
-compliant, then the rights granted under this License from a particular
-Contributor are reinstated (a) provisionally, unless and until such
-Contributor explicitly and finally terminates Your grants, and (b) on an
-ongoing basis, if such Contributor fails to notify You of the
-non-compliance by some reasonable means prior to 60 days after You have
-come back into compliance. Moreover, Your grants from a particular
-Contributor are reinstated on an ongoing basis if such Contributor
-notifies You of the non-compliance by some reasonable means, this is the
-first time You have received notice of non-compliance with this License
-from such Contributor, and You become compliant prior to 30 days after
-Your receipt of the notice.
-
-5.2. If You initiate litigation against any entity by asserting a patent
-infringement claim (excluding declaratory judgment actions,
-counter-claims, and cross-claims) alleging that a Contributor Version
-directly or indirectly infringes any patent, then the rights granted to
-You by any and all Contributors for the Covered Software under Section
-2.1 of this License shall terminate.
-
-5.3. In the event of termination under Sections 5.1 or 5.2 above, all
-end user license agreements (excluding distributors and resellers) which
-have been validly granted by You or Your distributors under this License
-prior to termination shall survive termination.
-
-************************************************************************
-*                                                                      *
-*  6. Disclaimer of Warranty                                           *
-*  -------------------------                                           *
-*                                                                      *
-*  Covered Software is provided under this License on an "as is"       *
-*  basis, without warranty of any kind, either expressed, implied, or  *
-*  statutory, including, without limitation, warranties that the       *
-*  Covered Software is free of defects, merchantable, fit for a        *
-*  particular purpose or non-infringing. The entire risk as to the     *
-*  quality and performance of the Covered Software is with You.        *
-*  Should any Covered Software prove defective in any respect, You     *
-*  (not any Contributor) assume the cost of any necessary servicing,   *
-*  repair, or correction. This disclaimer of warranty constitutes an   *
-*  essential part of this License. No use of any Covered Software is   *
-*  authorized under this License except under this disclaimer.         *
-*                                                                      *
-************************************************************************
-
-************************************************************************
-*                                                                      *
-*  7. Limitation of Liability                                          *
-*  --------------------------                                          *
-*                                                                      *
-*  Under no circumstances and under no legal theory, whether tort      *
-*  (including negligence), contract, or otherwise, shall any           *
-*  Contributor, or anyone who distributes Covered Software as          *
-*  permitted above, be liable to You for any direct, indirect,         *
-*  special, incidental, or consequential damages of any character      *
-*  including, without limitation, damages for lost profits, loss of    *
-*  goodwill, work stoppage, computer failure or malfunction, or any    *
-*  and all other commercial damages or losses, even if such party      *
-*  shall have been informed of the possibility of such damages. This   *
-*  limitation of liability shall not apply to liability for death or   *
-*  personal injury resulting from such party's negligence to the       *
-*  extent applicable law prohibits such limitation. Some               *
-*  jurisdictions do not allow the exclusion or limitation of           *
-*  incidental or consequential damages, so this exclusion and          *
-*  limitation may not apply to You.                                    *
-*                                                                      *
-************************************************************************
-
-8. Litigation
--------------
-
-Any litigation relating to this License may be brought only in the
-courts of a jurisdiction where the defendant maintains its principal
-place of business and such litigation shall be governed by laws of that
-jurisdiction, without reference to its conflict-of-law provisions.
-Nothing in this Section shall prevent a party's ability to bring
-cross-claims or counter-claims.
-
-9. Miscellaneous
-----------------
-
-This License represents the complete agreement concerning the subject
-matter hereof. If any provision of this License is held to be
-unenforceable, such provision shall be reformed only to the extent
-necessary to make it enforceable. Any law or regulation which provides
-that the language of a contract shall be construed against the drafter
-shall not be used to construe this License against a Contributor.
-
-10. Versions of the License
----------------------------
-
-10.1. New Versions
-
-Mozilla Foundation is the license steward. Except as provided in Section
-10.3, no one other than the license steward has the right to modify or
-publish new versions of this License. Each version will be given a
-distinguishing version number.
-
-10.2. Effect of New Versions
-
-You may distribute the Covered Software under the terms of the version
-of the License under which You originally received the Covered Software,
-or under the terms of any subsequent version published by the license
-steward.
-
-10.3. Modified Versions
-
-If you create software not governed by this License, and you want to
-create a new license for such software, you may create and use a
-modified version of this License if you rename the license and remove
-any references to the name of the license steward (except to note that
-such modified license differs from this License).
-
-10.4. Distributing Source Code Form that is Incompatible With Secondary
-Licenses
-
-If You choose to distribute Source Code Form that is Incompatible With
-Secondary Licenses under the terms of this version of the License, the
-notice described in Exhibit B of this License must be attached.
-
-Exhibit A - Source Code Form License Notice
--------------------------------------------
-
-  This Source Code Form is subject to the terms of the Mozilla Public
-  License, v. 2.0. If a copy of the MPL was not distributed with this
-  file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-If it is not possible or desirable to put the notice in a particular
-file, then You may include the notice in a location (such as a LICENSE
-file in a relevant directory) where a recipient would be likely to look
-for such a notice.
-
-You may add additional accurate notices of copyright ownership.
-
-Exhibit B - "Incompatible With Secondary Licenses" Notice
----------------------------------------------------------
-
-  This Source Code Form is "Incompatible With Secondary Licenses", as
-  defined by the Mozilla Public License, v. 2.0.
diff --git a/thirdParty/alpaka/README.md b/thirdParty/alpaka/README.md
deleted file mode 100644
index 8370dac9c4..0000000000
--- a/thirdParty/alpaka/README.md
+++ /dev/null
@@ -1,210 +0,0 @@
-**alpaka** - Abstraction Library for Parallel Kernel Acceleration
-=================================================================
-
-[![Travis CI Build Status](https://travis-ci.org/ComputationalRadiationPhysics/alpaka.svg?branch=develop)](https://travis-ci.org/ComputationalRadiationPhysics/alpaka)
-[![Language](https://img.shields.io/badge/language-C%2B%2B11-orange.svg)](https://isocpp.org/)
-[![Platforms](https://img.shields.io/badge/platform-linux%20%7C%20windows%20%7C%20mac-lightgrey.svg)](https://github.com/ComputationalRadiationPhysics/alpaka)
-[![License](https://img.shields.io/badge/license-MPL--2.0-blue.svg)](https://www.mozilla.org/en-US/MPL/2.0/)
-
-![Alpaka](doc/images/alpaka_401x135.png)
-
-The **alpaka** library is a header-only C++11 abstraction library for accelerator development.
-
-Its aim is to provide performance portability across accelerators through the abstraction (not hiding!) of the underlying levels of parallelism.
-
-It is platform independent and supports the concurrent and cooperative use of multiple devices such as the host's CPU as well as attached accelerators, for instance CUDA GPUs and Xeon Phis (currently native execution only).
-A multitude of accelerator back-end variants using CUDA, OpenMP (2.0/4.0), Boost.Fiber, std::thread and also serial execution is provided and can be selected depending on the device.
-Only one implementation of the user kernel is required by representing them as function objects with a special interface.
-There is no need to write special CUDA, OpenMP or custom threading code.
-Accelerator back-ends can be mixed within a device queue.
-The decision which accelerator back-end executes which kernel can be made at runtime.
-
-The abstraction used is very similar to the CUDA grid-blocks-threads division strategy.
-Algorithms that should be parallelized have to be divided into a multi-dimensional grid consisting of small uniform work items.
-These functions are called kernels and are executed in parallel threads.
-The threads in the grid are organized in blocks.
-All threads in a block are executed in parallel and can interact via fast shared memory.
-Blocks are executed independently and can not interact in any way.
-The block execution order is unspecified and depends on the accelerator in use.
-By using this abstraction the execution can be optimally adapted to the available hardware.
-
-
-Software License
-----------------
-
-**alpaka** is licensed under **MPL-2.0**.
-
-
-Documentation
--------------
-
-The [general documentation](doc/markdown/Index.md) is located within the `doc/markdown` subfolder of the repository.
-The [source code documentation](http://computationalradiationphysics.github.io/alpaka/) is generated with [doxygen](http://www.doxygen.org).
-
-
-Accelerator Back-ends
----------------------
-
-|Accelerator Back-end|Lib/API|Devices|Execution strategy grid-blocks|Execution strategy block-threads|
-|---|---|---|---|---|
-|Serial|n/a|Host CPU (single core)|sequential|sequential (only 1 thread per block)|
-|OpenMP 2.0+ blocks|OpenMP 2.0+|Host CPU (multi core)|parallel (preemptive multitasking)|sequential (only 1 thread per block)|
-|OpenMP 2.0+ threads|OpenMP 2.0+|Host CPU (multi core)|sequential|parallel (preemptive multitasking)|
-|OpenMP 4.0+ (CPU)|OpenMP 4.0+|Host CPU (multi core)|parallel (undefined)|parallel (preemptive multitasking)|
-| std::thread | std::thread |Host CPU (multi core)|sequential|parallel (preemptive multitasking)|
-| Boost.Fiber | boost::fibers::fiber |Host CPU (single core)|sequential|parallel (cooperative multitasking)|
-|TBB|TBB 2.2+|Host CPU (multi core)|parallel (preemptive multitasking)|sequential (only 1 thread per block)|
-|CUDA|CUDA 8.0-10.2|NVIDIA GPUs|parallel (undefined)|parallel (lock-step within warps)|
-|HIP(nvcc)|[HIP 1.5+](https://github.com/ROCm-Developer-Tools/HIP)|NVIDIA GPUs SM 2.0+|parallel (undefined)|parallel (lock-step within warps)|
-
-
-Supported Compilers
--------------------
-
-This library uses C++11 (or newer when available).
-
-|Accelerator Back-end|gcc 4.9.4 <br/> (Linux)|gcc 5.5 <br/> (Linux)|gcc 6.4/7.3 <br/> (Linux)|gcc 8.1/9.1 <br/> (Linux)|clang 4 <br/> (Linux)|clang 5 <br/> (Linux)|clang 6 <br/> (Linux)|clang 7 <br/> (Linux)|clang 8 <br/> (Linux)|clang 9 <br/> (Linux)|Apple LLVM 10.2-11.2 <br/> (macOS)|MSVC 2017.9 <br/> (Windows)|
-|---|---|---|---|---|---|---|---|---|---|---|---|---|
-|Serial|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|
-|OpenMP 2.0+ blocks|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:x:|:white_check_mark:|
-|OpenMP 2.0+ threads|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:x:|:white_check_mark:|
-|OpenMP 4.0+ (CPU)|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:x:|:x:|
-| std::thread |:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|
-| Boost.Fiber |:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:x:|:white_check_mark:|
-|TBB|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|
-|CUDA (nvcc)|:white_check_mark: <br/> (CUDA 8.0-10.2)|:white_check_mark: <br/> (CUDA 9.0-10.2)|:white_check_mark: <br/> (CUDA 9.2-10.2) |:x:|:white_check_mark: <br/> (CUDA 9.1-10.2)|:white_check_mark: <br/> (CUDA 10.1-10.2)|:white_check_mark: <br/> (CUDA 10.1-10.2)|:white_check_mark: <br/> (CUDA 10.1-10.2)|:white_check_mark: <br/> (CUDA 10.1-10.2)|:x:|:x:|:white_check_mark: <br/> (CUDA 10.0-10.2)|
-|CUDA (clang) | - | - | - | - | :white_check_mark: <br/> (CUDA 8.0)| :white_check_mark: <br/> (CUDA 8.0)| :white_check_mark: <br/> (CUDA 8.0-9.0) | :white_check_mark: <br/> (CUDA 8.0-9.2) | :white_check_mark: <br/> (CUDA 8.0-10.0) | :white_check_mark: <br/> (CUDA 9.2-10.1) | - | - |
-|[HIP](doc/markdown/user/implementation/mapping/HIP.md) (nvcc)|:white_check_mark: <br/> (nvcc 9.0+)|:x:|:x:|:x:|:x:|:x:|:x:|:x:|:x:|:x:|:x:|:x:|
-
-
-Other compilers or combinations marked with :x: in the table above may work but are not tested in CI and are therefore not explicitly supported.
-
-Dependencies
-------------
-
-[Boost](https://boost.org/) 1.62+ is the only mandatory external dependency (for CUDA 9+ Boost >=1.65.1 is required).
-The **alpaka** library itself just requires header-only libraries.
-However some of the accelerator back-end implementations require different boost libraries to be built.
-
-When an accelerator back-end using *Boost.Fiber* is enabled, `boost-fiber` and all of its dependencies are required to be built in C++11 mode `./b2 cxxflags="-std=c++11"`.
-When *Boost.Fiber* is enabled and alpaka is built in C++17 mode with clang and libstdc++, Boost >= 1.67.0 is required.
-
-When an accelerator back-end using *CUDA* is enabled, version *8.0* of the *CUDA SDK* is the minimum requirement.
-*NOTE*: When using nvcc as *CUDA* compiler, the *CUDA accelerator back-end* can not be enabled together with the *Boost.Fiber accelerator back-end* due to bugs in the nvcc compiler.
-*NOTE*: When using clang as a native *CUDA* compiler, the *CUDA accelerator back-end* can not be enabled together with any *OpenMP accelerator back-end* because this combination is currently unsupported.
-*NOTE*: Separable compilation is only supported when using nvcc, not with clang as native *CUDA* compiler. It is disabled by default and can be enabled via the CMake flag `ALPAKA_CUDA_NVCC_SEPARABLE_COMPILATION`.
-
-When an accelerator back-end using *OpenMP* is enabled, the compiler and the platform have to support the corresponding minimum *OpenMP* version.
-
-When an accelerator back-end using *TBB* is enabled, the compiler and the platform have to support the corresponding minimum *TBB* version.
-
-
-Usage
------
-
-The library is header only so nothing has to be built.
-CMake 3.11.4+ is required to provide the correct defines and include paths.
-Just call `ALPAKA_ADD_EXECUTABLE` instead of `CUDA_ADD_EXECUTABLE` or `ADD_EXECUTABLE` and the difficulties of the CUDA nvcc compiler in handling `.cu` and `.cpp` files are automatically taken care of.
-Source files do not need any special file ending.
-Examples of how to utilize alpaka within CMake can be found in the `example` folder.
-
-The whole alpaka library can be included with: `#include <alpaka/alpaka.hpp>`
-Code that is not intended to be utilized by the user is hidden in the `detail` namespace.
-
-
-Introduction
-------------
-
-For a quick introduction, feel free to play back the recording of our presentation at
-[GTC 2016](http://mygtc.gputechconf.com/quicklink/858sI36):
-
- - E. Zenker, R. Widera, G. Juckeland et al.,
-   *Porting the Plasma Simulation PIConGPU to Heterogeneous Architectures with Alpaka*,
-   [video link (39 min)](http://on-demand.gputechconf.com/gtc/2016/video/S6298.html)
-
-
-Citing alpaka
--------------
-
-Currently all authors of **alpaka** are scientists or connected with
-research. For us to justify the importance and impact of our work, please
-consider citing us accordingly in your derived work and publications:
-
-```latex
-% Peer-Reviewed Publication %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-%
-% Peer reviewed and accepted publication in
-%   "2nd International Workshop on Performance Portable
-%    Programming Models for Accelerators (P^3MA)"
-% colocated with the
-%   "2017 ISC High Performance Conference"
-%   in Frankfurt, Germany
-@inproceedings{MathesP3MA2017,
-  author    = {{Matthes}, A. and {Widera}, R. and {Zenker}, E. and {Worpitz}, B. and
-               {Huebl}, A. and {Bussmann}, M.},
-  title     = {Tuning and optimization for a variety of many-core architectures without changing a single line of implementation code
-               using the Alpaka library},
-  archivePrefix = "arXiv",
-  eprint    = {1706.10086},
-  keywords  = {Computer Science - Distributed, Parallel, and Cluster Computing},
-  day       = {30},
-  month     = {Jun},
-  year      = {2017},
-  url       = {https://arxiv.org/abs/1706.10086},
-}
-
-% Peer-Reviewed Publication %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-%
-% Peer reviewed and accepted publication in
-%   "The Sixth International Workshop on
-%    Accelerators and Hybrid Exascale Systems (AsHES)"
-% at the
-%   "30th IEEE International Parallel and Distributed
-%    Processing Symposium" in Chicago, IL, USA
-@inproceedings{ZenkerAsHES2016,
-  author    = {Erik Zenker and Benjamin Worpitz and Ren{\'{e}} Widera
-               and Axel Huebl and Guido Juckeland and
-               Andreas Kn{\"{u}}pfer and Wolfgang E. Nagel and Michael Bussmann},
-  title     = {Alpaka - An Abstraction Library for Parallel Kernel Acceleration},
-  archivePrefix = "arXiv",
-  eprint    = {1602.08477},
-  keywords  = {Computer science;CUDA;Mathematical Software;nVidia;OpenMP;Package;
-               performance portability;Portability;Tesla K20;Tesla K80},
-  day       = {23},
-  month     = {May},
-  year      = {2016},
-  publisher = {IEEE Computer Society},
-  url       = {http://arxiv.org/abs/1602.08477},
-}
-
-
-% Original Work: Benjamin Worpitz' Master Thesis %%%%%%%%%%
-%
-@MasterThesis{Worpitz2015,
-  author = {Benjamin Worpitz},
-  title  = {Investigating performance portability of a highly scalable
-            particle-in-cell simulation code on various multi-core
-            architectures},
-  school = {{Technische Universit{\"{a}}t Dresden}},
-  month  = {Sep},
-  year   = {2015},
-  type   = {Master Thesis},
-  doi    = {10.5281/zenodo.49768},
-  url    = {http://dx.doi.org/10.5281/zenodo.49768}
-}
-```
-
-
-Authors
--------
-
-### Maintainers and Core Developers
-
-- Benjamin Worpitz (original author)
-- Rene Widera
-
-### Former Members, Contributions and Thanks
-
-- Dr. Michael Bussmann
-- Axel Huebl
-- Erik Zenker
diff --git a/thirdParty/alpaka/alpakaConfig.cmake b/thirdParty/alpaka/alpakaConfig.cmake
deleted file mode 100644
index ad66efd118..0000000000
--- a/thirdParty/alpaka/alpakaConfig.cmake
+++ /dev/null
@@ -1,1220 +0,0 @@
-#
-# Copyright 2014-2019 Benjamin Worpitz, Erik Zenker, Axel Huebl
-#
-# This file is part of Alpaka.
-#
-# This Source Code Form is subject to the terms of the Mozilla Public
-# License, v. 2.0. If a copy of the MPL was not distributed with this
-# file, You can obtain one at http://mozilla.org/MPL/2.0/.
-#
-
-################################################################################
-# Minimum CMake version needed to process this file.
-
-cmake_minimum_required(VERSION 3.11.4)
-
-################################################################################
-# CMake policies
-#
-# CMP0074 (opt in to NEW only where the running CMake knows the policy):
-#   https://cmake.org/cmake/help/v3.12/policy/CMP0074.html
-
-if(POLICY CMP0074)
-    cmake_policy(SET CMP0074 NEW)
-endif()
-
-################################################################################
-# alpaka.
-
-# Return values. Any existing variable of the same name is removed first.
-UNSET(alpaka_FOUND)
-UNSET(alpaka_VERSION)
-UNSET(alpaka_COMPILE_OPTIONS)
-UNSET(alpaka_COMPILE_DEFINITIONS)
-UNSET(alpaka_DEFINITIONS)
-UNSET(alpaka_INCLUDE_DIR)
-UNSET(alpaka_INCLUDE_DIRS)
-UNSET(alpaka_LIBRARY)
-UNSET(alpaka_LIBRARIES)
-
-# Internal usage. The names must match the variables assigned further down in this file.
-UNSET(_ALPAKA_FOUND)
-UNSET(_ALPAKA_COMPILE_OPTIONS_PUBLIC)
-UNSET(_ALPAKA_COMPILE_DEFINITIONS_PUBLIC)
-UNSET(_ALPAKA_INCLUDE_DIRECTORY)
-UNSET(_ALPAKA_INCLUDE_DIRECTORIES_PUBLIC)
-UNSET(_ALPAKA_LINK_LIBRARIES_PUBLIC)
-UNSET(_ALPAKA_LINK_FLAGS_PUBLIC)
-UNSET(_ALPAKA_COMMON_FILE)
-UNSET(_ALPAKA_ADD_EXECUTABLE_FILE)
-UNSET(_ALPAKA_ADD_LIBRARY_FILE)
-UNSET(_ALPAKA_FILES_HEADER)
-UNSET(_ALPAKA_FILES_OTHER)
-
-#-------------------------------------------------------------------------------
-# Common.
-
-# Start from the directory that contains this file ...
-set(_ALPAKA_ROOT_DIR ${CMAKE_CURRENT_LIST_DIR})
-# ... and turn it into an absolute path (e.g. without ../ components).
-get_filename_component(_ALPAKA_ROOT_DIR "${_ALPAKA_ROOT_DIR}" ABSOLUTE)
-
-# Include the shared helper script.
-set(_ALPAKA_COMMON_FILE "${_ALPAKA_ROOT_DIR}/cmake/common.cmake")
-include("${_ALPAKA_COMMON_FILE}")
-
-# Include the script for ALPAKA_ADD_EXECUTABLE.
-set(_ALPAKA_ADD_EXECUTABLE_FILE "${_ALPAKA_ROOT_DIR}/cmake/addExecutable.cmake")
-include("${_ALPAKA_ADD_EXECUTABLE_FILE}")
-
-# Include the script for ALPAKA_ADD_LIBRARY.
-set(_ALPAKA_ADD_LIBRARY_FILE "${_ALPAKA_ROOT_DIR}/cmake/addLibrary.cmake")
-include("${_ALPAKA_ADD_LIBRARY_FILE}")
-
-# Assume success; later checks flip this to FALSE when a requirement is not met.
-set(_ALPAKA_FOUND TRUE)
-
-# Extend the module search path with the bundled modules directory.
-set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${_ALPAKA_ROOT_DIR}/cmake/modules/")
-
-#-------------------------------------------------------------------------------
-# Options: every CPU back-end default starts out as ON.
-set(ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLE_DEFAULT ON)
-set(ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLE_DEFAULT ON)
-set(ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE_DEFAULT ON)
-set(ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLE_DEFAULT ON)
-set(ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE_DEFAULT ON)
-set(ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE_DEFAULT ON)
-set(ALPAKA_ACC_CPU_BT_OMP4_ENABLE_DEFAULT ON)
-
-# HIP switches, HIP platform choice, and warnings for unsupported setups.
-option(ALPAKA_ACC_GPU_HIP_ENABLE "Enable the HIP back-end (all other back-ends must be disabled)" OFF)
-option(ALPAKA_ACC_GPU_HIP_ONLY_MODE "Only back-ends using HIP can be enabled in this mode." OFF) # HIP only runs without other back-ends
-
-# HIP platform selector; the STRINGS property feeds the cmake-gui drop-down.
-set(ALPAKA_HIP_PLATFORM "nvcc" CACHE STRING "Specify HIP platform")
-set_property(CACHE ALPAKA_HIP_PLATFORM PROPERTY STRINGS "nvcc;hcc;clang")
-
-if(ALPAKA_ACC_GPU_HIP_ENABLE AND NOT ALPAKA_ACC_GPU_HIP_ONLY_MODE)
-    message(WARNING "HIP back-end must be used together with ALPAKA_ACC_GPU_HIP_ONLY_MODE")
-    set(ALPAKA_ACC_GPU_HIP_ENABLE OFF CACHE BOOL "" FORCE)
-endif()
-
-if(ALPAKA_ACC_GPU_HIP_ENABLE AND (ALPAKA_HIP_PLATFORM MATCHES "hcc" OR ALPAKA_HIP_PLATFORM MATCHES "clang"))
-    message(WARNING
-        "The HIP back-end is currently experimental, especially for HCC. "
-        "In alpaka HIP(HCC) has a few workarounds and does not support 3D memory and constant memory. "
-        )
-endif()
-
-option(ALPAKA_ACC_GPU_CUDA_ONLY_MODE "Only back-ends using CUDA can be enabled in this mode (This allows to mix alpaka code with native CUDA code)." OFF)
-# In CUDA-only or HIP-only mode the defaults of all CPU back-ends become OFF.
-# A CPU back-end that is nevertheless switched on is rejected by a later check.
-if(ALPAKA_ACC_GPU_CUDA_ONLY_MODE OR ALPAKA_ACC_GPU_HIP_ONLY_MODE) # CUDA-only or HIP-only
-    set(ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLE_DEFAULT OFF)
-    set(ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLE_DEFAULT OFF)
-    set(ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE_DEFAULT OFF)
-    set(ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLE_DEFAULT OFF)
-    set(ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE_DEFAULT OFF)
-    set(ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE_DEFAULT OFF)
-    set(ALPAKA_ACC_CPU_BT_OMP4_ENABLE_DEFAULT OFF)
-endif()
-
-option(ALPAKA_ACC_GPU_CUDA_ENABLE "Enable the CUDA GPU back-end" ON)
-
-# With CUDA enabled, the defaults of back-ends that cannot be combined with it become OFF.
-# Switching such a back-end on explicitly is rejected by a later check.
-if(ALPAKA_ACC_GPU_CUDA_ENABLE)
-    set(ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE_DEFAULT OFF)
-    if(ALPAKA_CUDA_COMPILER MATCHES "clang")
-        set(ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE_DEFAULT OFF)
-        set(ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE_DEFAULT OFF)
-        set(ALPAKA_ACC_CPU_BT_OMP4_ENABLE_DEFAULT OFF)
-    endif()
-endif()
-
-if(ALPAKA_ACC_GPU_CUDA_ONLY_MODE AND NOT ALPAKA_ACC_GPU_CUDA_ENABLE)
-    message(WARNING "If ALPAKA_ACC_GPU_CUDA_ONLY_MODE is enabled, ALPAKA_ACC_GPU_CUDA_ENABLE has to be enabled as well.")
-    set(_ALPAKA_FOUND FALSE)
-endif()
-if(ALPAKA_ACC_GPU_HIP_ONLY_MODE AND NOT ALPAKA_ACC_GPU_HIP_ENABLE)
-    message(WARNING "If ALPAKA_ACC_GPU_HIP_ONLY_MODE is enabled, ALPAKA_ACC_GPU_HIP_ENABLE has to be enabled as well.")
-    set(_ALPAKA_FOUND FALSE)
-endif()
-
-
-option(ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLE "Enable the serial CPU back-end" ${ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLE_DEFAULT})
-option(ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLE "Enable the threads CPU block thread back-end" ${ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLE_DEFAULT})
-option(ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE "Enable the fibers CPU block thread back-end" ${ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE_DEFAULT})
-option(ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLE "Enable the TBB CPU grid block back-end" ${ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLE_DEFAULT})
-option(ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE "Enable the OpenMP 2.0 CPU grid block back-end" ${ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE_DEFAULT})
-option(ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE "Enable the OpenMP 2.0 CPU block thread back-end" ${ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE_DEFAULT})
-option(ALPAKA_ACC_CPU_BT_OMP4_ENABLE "Enable the OpenMP 4.0 CPU block and block thread back-end" ${ALPAKA_ACC_CPU_BT_OMP4_ENABLE_DEFAULT})
-
-if((ALPAKA_ACC_GPU_CUDA_ONLY_MODE OR ALPAKA_ACC_GPU_HIP_ONLY_MODE)
-   AND
-    (ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLE OR
-    ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLE OR
-    ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE OR
-    ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLE OR
-    ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE OR
-    ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE OR
-    ALPAKA_ACC_CPU_BT_OMP4_ENABLE))
-    if(ALPAKA_ACC_GPU_CUDA_ONLY_MODE)
-        message(WARNING "If ALPAKA_ACC_GPU_CUDA_ONLY_MODE is enabled, only back-ends using CUDA can be enabled! This allows to mix alpaka code with native CUDA code. However, this prevents any non-CUDA back-ends from being enabled.")
-    endif()
-    if(ALPAKA_ACC_GPU_HIP_ONLY_MODE)
-        message(WARNING "If ALPAKA_ACC_GPU_HIP_ONLY_MODE is enabled, only back-ends using HIP can be enabled!")
-    endif()
-    set(_ALPAKA_FOUND FALSE)
-endif()
-
-# CUDA and HIP are mutually exclusive.
-if(ALPAKA_ACC_GPU_HIP_ENABLE AND ALPAKA_ACC_GPU_CUDA_ENABLE)
-    message(FATAL_ERROR "CUDA and HIP can not be enabled both at the same time.")
-endif()
-
-# The HIP back-end is switched off on Windows (MSVC or WIN32).
-if(ALPAKA_ACC_GPU_HIP_ENABLE AND (MSVC OR WIN32))
-    message(WARNING "Optional alpaka dependency HIP can not be built on Windows! HIP back-end disabled!")
-    set(ALPAKA_ACC_GPU_HIP_ENABLE OFF CACHE BOOL "Enable the HIP GPU back-end" FORCE)
-endif()
-
-# Cache entries whose STRINGS property feeds the cmake-gui drop-down.
-set(ALPAKA_DEBUG "0" CACHE STRING "Debug level")
-set_property(CACHE ALPAKA_DEBUG PROPERTY STRINGS "0;1;2")
-
-set(ALPAKA_CXX_STANDARD "11" CACHE STRING "C++ standard version")
-set_property(CACHE ALPAKA_CXX_STANDARD PROPERTY STRINGS "11;14;17")
-
-#-------------------------------------------------------------------------------
-# Print the common variables when ALPAKA_DEBUG is greater than 1.
-if(${ALPAKA_DEBUG} GREATER 1)
-    message(STATUS "_ALPAKA_ROOT_DIR : ${_ALPAKA_ROOT_DIR}")
-    message(STATUS "_ALPAKA_COMMON_FILE : ${_ALPAKA_COMMON_FILE}")
-    message(STATUS "_ALPAKA_ADD_EXECUTABLE_FILE : ${_ALPAKA_ADD_EXECUTABLE_FILE}")
-    message(STATUS "_ALPAKA_ADD_LIBRARY_FILE : ${_ALPAKA_ADD_LIBRARY_FILE}")
-    message(STATUS "CMAKE_BUILD_TYPE : ${CMAKE_BUILD_TYPE}")
-endif()
-
-#-------------------------------------------------------------------------------
-# Reject unsupported compilers and back-end combinations.
-if(CMAKE_CXX_COMPILER_ID MATCHES "Clang" AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.0)
-    message(FATAL_ERROR "Clang versions < 4.0 are not supported!")
-    set(_ALPAKA_FOUND FALSE) # NOTE(review): looks unreachable, FATAL_ERROR stops processing
-endif()
-
-if(ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE AND (ALPAKA_ACC_GPU_CUDA_ENABLE OR ALPAKA_ACC_GPU_HIP_ENABLE))
-    message(FATAL_ERROR "Fibers and CUDA or HIP back-end can not be enabled both at the same time.")
-    set(_ALPAKA_FOUND FALSE) # NOTE(review): looks unreachable, FATAL_ERROR stops processing
-endif()
-
-#-------------------------------------------------------------------------------
-# Compiler settings.
-IF(MSVC)
-    # Empty append to define it if it does not already exist.
-    LIST(APPEND _ALPAKA_COMPILE_OPTIONS_PUBLIC)
-
-    IF(ALPAKA_ACC_GPU_CUDA_ONLY_MODE)
-        LIST(APPEND _ALPAKA_COMPILE_OPTIONS_PUBLIC "/wd4505")   # CUDA\v9.2\include\crt/host_runtime.h(265): warning C4505: '__cudaUnregisterBinaryUtil': unreferenced local function has been removed
-    ENDIF()
-ELSE()
-    # Add linker options.
-    # lipthread:
-    LIST(APPEND _ALPAKA_LINK_LIBRARIES_PUBLIC "general;pthread")
-    IF(NOT APPLE)
-        # librt: undefined reference to `clock_gettime'
-        LIST(APPEND _ALPAKA_LINK_LIBRARIES_PUBLIC "general;rt")
-    ENDIF()
-
-    # Clang<4.0 or AppleClang<9.0
-    #   https://bugs.llvm.org/show_bug.cgi?id=18417
-    #   https://github.com/llvm/llvm-project/commit/e55b4737c026ea2e0b44829e4115d208577a67b2
-    IF(("${CMAKE_CXX_COMPILER_ID}" STREQUAL "AppleClang" AND
-        CMAKE_CXX_COMPILER_VERSION VERSION_LESS 9.1) OR
-       ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang" AND
-        CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.0))
-        LIST(APPEND _ALPAKA_COMPILE_OPTIONS_PUBLIC "-ftemplate-depth=1024")
-    ENDIF()
-ENDIF()
-
-#-------------------------------------------------------------------------------
-# Find Boost.
-SET(_ALPAKA_BOOST_MIN_VER "1.62.0")
-IF(${ALPAKA_DEBUG} GREATER 1)
-    SET(Boost_DEBUG ON)
-    SET(Boost_DETAILED_FAILURE_MSG ON)
-ENDIF()
-IF(ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE)
-    FIND_PACKAGE(Boost ${_ALPAKA_BOOST_MIN_VER} QUIET COMPONENTS fiber context system thread atomic chrono date_time)
-    IF(NOT Boost_FIBER_FOUND)
-        MESSAGE(STATUS "Optional alpaka dependency Boost fiber could not be found! Fibers back-end disabled!")
-        SET(ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE OFF CACHE BOOL "Enable the Fibers CPU back-end" FORCE)
-        FIND_PACKAGE(Boost ${_ALPAKA_BOOST_MIN_VER} QUIET)
-    ELSE()
-        # On Win32 boost context triggers:
-        # libboost_context-vc141-mt-gd-1_64.lib(jump_i386_ms_pe_masm.obj) : error LNK2026: module unsafe for SAFESEH image.
-        IF(MSVC)
-            IF(CMAKE_SIZEOF_VOID_P EQUAL 4)
-                SET(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} /SAFESEH:NO")
-                SET(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} /SAFESEH:NO")
-                SET(CMAKE_MODULE_LINKER_FLAGS "${CMAKE_MODULE_LINKER_FLAGS} /SAFESEH:NO")
-            ENDIF()
-        ENDIF()
-    ENDIF()
-
-ELSE()
-    FIND_PACKAGE(Boost ${_ALPAKA_BOOST_MIN_VER} QUIET)
-ENDIF()
-
-IF(${ALPAKA_DEBUG} GREATER 1)
-    MESSAGE(STATUS "Boost in:")
-    MESSAGE(STATUS "BOOST_ROOT : ${BOOST_ROOT}")
-    MESSAGE(STATUS "BOOSTROOT : ${BOOSTROOT}")
-    MESSAGE(STATUS "BOOST_INCLUDEDIR: ${BOOST_INCLUDEDIR}")
-    MESSAGE(STATUS "BOOST_LIBRARYDIR: ${BOOST_LIBRARYDIR}")
-    MESSAGE(STATUS "Boost_NO_SYSTEM_PATHS: ${Boost_NO_SYSTEM_PATHS}")
-    MESSAGE(STATUS "Boost_ADDITIONAL_VERSIONS: ${Boost_ADDITIONAL_VERSIONS}")
-    MESSAGE(STATUS "Boost_USE_MULTITHREADED: ${Boost_USE_MULTITHREADED}")
-    MESSAGE(STATUS "Boost_USE_STATIC_LIBS: ${Boost_USE_STATIC_LIBS}")
-    MESSAGE(STATUS "Boost_USE_STATIC_RUNTIME: ${Boost_USE_STATIC_RUNTIME}")
-    MESSAGE(STATUS "Boost_USE_DEBUG_RUNTIME: ${Boost_USE_DEBUG_RUNTIME}")
-    MESSAGE(STATUS "Boost_USE_DEBUG_PYTHON: ${Boost_USE_DEBUG_PYTHON}")
-    MESSAGE(STATUS "Boost_USE_STLPORT: ${Boost_USE_STLPORT}")
-    MESSAGE(STATUS "Boost_USE_STLPORT_DEPRECATED_NATIVE_IOSTREAMS: ${Boost_USE_STLPORT_DEPRECATED_NATIVE_IOSTREAMS}")
-    MESSAGE(STATUS "Boost_COMPILER: ${Boost_COMPILER}")
-    MESSAGE(STATUS "Boost_THREADAPI: ${Boost_THREADAPI}")
-    MESSAGE(STATUS "Boost_NAMESPACE: ${Boost_NAMESPACE}")
-    MESSAGE(STATUS "Boost_DEBUG: ${Boost_DEBUG}")
-    MESSAGE(STATUS "Boost_DETAILED_FAILURE_MSG: ${Boost_DETAILED_FAILURE_MSG}")
-    MESSAGE(STATUS "Boost_REALPATH: ${Boost_REALPATH}")
-    MESSAGE(STATUS "Boost_NO_BOOST_CMAKE: ${Boost_NO_BOOST_CMAKE}")
-    MESSAGE(STATUS "Boost out:")
-    MESSAGE(STATUS "Boost_FOUND: ${Boost_FOUND}")
-    MESSAGE(STATUS "Boost_INCLUDE_DIRS: ${Boost_INCLUDE_DIRS}")
-    MESSAGE(STATUS "Boost_LIBRARY_DIRS: ${Boost_LIBRARY_DIRS}")
-    MESSAGE(STATUS "Boost_LIBRARIES: ${Boost_LIBRARIES}")
-    MESSAGE(STATUS "Boost_FIBER_FOUND: ${Boost_FIBER_FOUND}")
-    MESSAGE(STATUS "Boost_FIBER_LIBRARY: ${Boost_FIBER_LIBRARY}")
-    MESSAGE(STATUS "Boost_CONTEXT_FOUND: ${Boost_CONTEXT_FOUND}")
-    MESSAGE(STATUS "Boost_CONTEXT_LIBRARY: ${Boost_CONTEXT_LIBRARY}")
-    MESSAGE(STATUS "Boost_SYSTEM_FOUND: ${Boost_SYSTEM_FOUND}")
-    MESSAGE(STATUS "Boost_SYSTEM_LIBRARY: ${Boost_SYSTEM_LIBRARY}")
-    MESSAGE(STATUS "Boost_THREAD_FOUND: ${Boost_THREAD_FOUND}")
-    MESSAGE(STATUS "Boost_THREAD_LIBRARY: ${Boost_THREAD_LIBRARY}")
-    MESSAGE(STATUS "Boost_ATOMIC_FOUND: ${Boost_ATOMIC_FOUND}")
-    MESSAGE(STATUS "Boost_ATOMIC_LIBRARY: ${Boost_ATOMIC_LIBRARY}")
-    MESSAGE(STATUS "Boost_CHRONO_FOUND: ${Boost_CHRONO_FOUND}")
-    MESSAGE(STATUS "Boost_CHRONO_LIBRARY: ${Boost_CHRONO_LIBRARY}")
-    MESSAGE(STATUS "Boost_DATE_TIME_FOUND: ${Boost_DATE_TIME_FOUND}")
-    MESSAGE(STATUS "Boost_DATE_TIME_LIBRARY: ${Boost_DATE_TIME_LIBRARY}")
-    MESSAGE(STATUS "Boost_VERSION: ${Boost_VERSION}")
-    MESSAGE(STATUS "Boost_LIB_VERSION: ${Boost_LIB_VERSION}")
-    MESSAGE(STATUS "Boost_MAJOR_VERSION: ${Boost_MAJOR_VERSION}")
-    MESSAGE(STATUS "Boost_MINOR_VERSION: ${Boost_MINOR_VERSION}")
-    MESSAGE(STATUS "Boost_SUBMINOR_VERSION: ${Boost_SUBMINOR_VERSION}")
-    MESSAGE(STATUS "Boost_LIB_DIAGNOSTIC_DEFINITIONS: ${Boost_LIB_DIAGNOSTIC_DEFINITIONS}")
-    MESSAGE(STATUS "Boost cached:")
-    MESSAGE(STATUS "Boost_INCLUDE_DIR: ${Boost_INCLUDE_DIR}")
-    MESSAGE(STATUS "Boost_LIBRARY_DIR: ${Boost_LIBRARY_DIR}")
-ENDIF()
-
-IF(NOT Boost_FOUND)
-    MESSAGE(WARNING "Required alpaka dependency Boost (>=${_ALPAKA_BOOST_MIN_VER}) could not be found!")
-    SET(_ALPAKA_FOUND FALSE)
-
-ELSE()
-    IF(Boost_FIBER_FOUND)
-        # Boost fiber and default header-only libraries
-        IF(TARGET Boost::fiber)
-            LIST(APPEND _ALPAKA_LINK_LIBRARIES_PUBLIC
-                 Boost::boost
-                 Boost::fiber Boost::context Boost::system Boost::thread
-                 Boost::chrono Boost::date_time Boost::atomic
-            )
-        ELSE()
-            # fallback: Boost version is too new for CMake
-            LIST(APPEND _ALPAKA_INCLUDE_DIRECTORIES_PUBLIC ${Boost_INCLUDE_DIRS})
-            LIST(APPEND _ALPAKA_LINK_LIBRARIES_PUBLIC ${Boost_LIBRARIES})
-        ENDIF()
-    ELSE()
-        # header-only libraries
-        IF(TARGET Boost::boost)
-            LIST(APPEND _ALPAKA_LINK_LIBRARIES_PUBLIC Boost::boost)
-        ELSE()
-            # fallback: Boost version is too new for CMake
-            LIST(APPEND _ALPAKA_INCLUDE_DIRECTORIES_PUBLIC ${Boost_INCLUDE_DIRS})
-            LIST(APPEND _ALPAKA_LINK_LIBRARIES_PUBLIC ${Boost_LIBRARIES})
-        ENDIF()
-    ENDIF()
-ENDIF()
-
-#-------------------------------------------------------------------------------
-# Find TBB.
-IF(ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLE)
-    FIND_PACKAGE(TBB 2.2)
-    IF(NOT TBB_FOUND)
-        MESSAGE(STATUS "Optional alpaka dependency TBB could not be found! TBB grid block back-end disabled!")
-        SET(ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLE OFF CACHE BOOL "Enable the TBB grid block back-end" FORCE)
-    ELSE()
-        LIST(APPEND _ALPAKA_LINK_LIBRARIES_PUBLIC ${TBB_LIBRARIES})
-        LIST(APPEND _ALPAKA_INCLUDE_DIRECTORIES_PUBLIC ${TBB_INCLUDE_DIRS})
-        LIST(APPEND _ALPAKA_COMPILE_OPTIONS_PUBLIC ${TBB_DEFINITIONS})
-    ENDIF()
-ENDIF()
-
-#-------------------------------------------------------------------------------
-# Find OpenMP.
-IF(ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE OR ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE OR ALPAKA_ACC_CPU_BT_OMP4_ENABLE)
-    FIND_PACKAGE(OpenMP)
-
-    # Manually find OpenMP for the clang compiler if it was not already found.
-    # Even CMake 3.5 is unable to find libiomp and provide the correct OpenMP flags.
-    IF(NOT OPENMP_FOUND)
-        IF(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
-            FIND_PATH(_ALPAKA_LIBIOMP_INCLUDE_DIR NAMES "omp.h" PATH_SUFFIXES "include" "libiomp" "include/libiomp")
-            IF(_ALPAKA_LIBIOMP_INCLUDE_DIR)
-                SET(OPENMP_FOUND TRUE)
-                SET(OpenMP_CXX_FLAGS "-fopenmp=libiomp5")
-                SET(OpenMP_C_FLAGS "-fopenmp=libiomp5")
-                LIST(APPEND _ALPAKA_INCLUDE_DIRECTORIES_PUBLIC "${_ALPAKA_LIBIOMP_INCLUDE_DIR}")
-            ENDIF()
-        ENDIF()
-    ENDIF()
-
-    IF(NOT OPENMP_FOUND)
-        MESSAGE(STATUS "Optional alpaka dependency OpenMP could not be found! OpenMP back-ends disabled!")
-        SET(ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE OFF CACHE BOOL "Enable the OpenMP 2.0 CPU grid block back-end" FORCE)
-        SET(ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE OFF CACHE BOOL "Enable the OpenMP 2.0 CPU block thread back-end" FORCE)
-        SET(ALPAKA_ACC_CPU_BT_OMP4_ENABLE OFF CACHE BOOL "Enable the OpenMP 4.0 CPU block and thread back-end" FORCE)
-
-    ELSE()
-
-        # Check whether OpenMP 4 is supported
-        IF(OpenMP_CXX_VERSION VERSION_LESS 4.0)
-            SET(ALPAKA_ACC_CPU_BT_OMP4_ENABLE OFF CACHE BOOL "Enable the OpenMP 4.0 CPU block and thread back-end" FORCE)
-        ENDIF()
-
-        LIST(APPEND _ALPAKA_COMPILE_OPTIONS_PUBLIC ${OpenMP_CXX_FLAGS})
-        IF(NOT MSVC)
-            LIST(APPEND _ALPAKA_LINK_FLAGS_PUBLIC ${OpenMP_CXX_FLAGS})
-        ENDIF()
-
-        # clang versions beginning with 3.9 support OpenMP 4.0 but only when given the corresponding flag
-        IF(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
-            IF(ALPAKA_ACC_CPU_BT_OMP4_ENABLE)
-                LIST(APPEND _ALPAKA_COMPILE_OPTIONS_PUBLIC "-fopenmp-version=40")
-            ENDIF()
-        ENDIF()
-    ENDIF()
-ENDIF()
-
-#-------------------------------------------------------------------------------
-# Find CUDA.
-IF(ALPAKA_ACC_GPU_CUDA_ENABLE)
-
-    IF(NOT DEFINED ALPAKA_CUDA_VERSION)
-        SET(ALPAKA_CUDA_VERSION 8.0)
-    ENDIF()
-
-    IF(ALPAKA_CUDA_VERSION VERSION_LESS 8.0)
-        MESSAGE(WARNING "CUDA Toolkit < 8.0 is not supported!")
-        SET(_ALPAKA_FOUND FALSE)
-
-    ELSE()
-        FIND_PACKAGE(CUDA "${ALPAKA_CUDA_VERSION}")
-        IF(NOT CUDA_FOUND)
-            MESSAGE(STATUS "Optional alpaka dependency CUDA could not be found! CUDA back-end disabled!")
-            SET(ALPAKA_ACC_GPU_CUDA_ENABLE OFF CACHE BOOL "Enable the CUDA GPU back-end" FORCE)
-
-        ELSE()
-            SET(ALPAKA_CUDA_VERSION "${CUDA_VERSION}")
-            IF(CUDA_VERSION VERSION_LESS 9.0)
-                SET(ALPAKA_CUDA_ARCH "20" CACHE STRING "GPU architecture")
-            ELSEIF(CUDA_VERSION VERSION_LESS 10.3)
-                SET(ALPAKA_CUDA_ARCH "30" CACHE STRING "GPU architecture")
-            ELSE()
-                SET(ALPAKA_CUDA_ARCH "35" CACHE STRING "GPU architecture")
-            ENDIF()
-            SET(ALPAKA_CUDA_COMPILER "nvcc" CACHE STRING "CUDA compiler")
-            SET_PROPERTY(CACHE ALPAKA_CUDA_COMPILER PROPERTY STRINGS "nvcc;clang")
-
-            OPTION(ALPAKA_CUDA_FAST_MATH "Enable fast-math" ON)
-            OPTION(ALPAKA_CUDA_FTZ "Set flush to zero for GPU" OFF)
-            OPTION(ALPAKA_CUDA_SHOW_REGISTER "Show kernel registers and create PTX" OFF)
-            OPTION(ALPAKA_CUDA_KEEP_FILES "Keep all intermediate files that are generated during internal compilation steps (folder: nvcc_tmp)" OFF)
-            OPTION(ALPAKA_CUDA_NVCC_EXPT_EXTENDED_LAMBDA "Enable experimental, extended host-device lambdas in NVCC" ON)
-            OPTION(ALPAKA_CUDA_NVCC_EXPT_RELAXED_CONSTEXPR "Enable experimental, relaxed constexpr in NVCC" ON)
-            OPTION(ALPAKA_CUDA_NVCC_SEPARABLE_COMPILATION "Enable separable compilation in NVCC" OFF)
-
-            IF(ALPAKA_CUDA_COMPILER MATCHES "clang")
-                IF(NOT "${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
-                    MESSAGE(FATAL_ERROR "Using clang as CUDA compiler is only possible if clang is the host compiler!")
-                ENDIF()
-
-                IF(CMAKE_CXX_COMPILER_VERSION LESS 6.0)
-                    IF(CUDA_VERSION GREATER_EQUAL 9.0)
-                        MESSAGE(FATAL_ERROR "Clang versions lower than 6 do not support CUDA 9 or greater!")
-                    ENDIF()
-                ELSEIF(CMAKE_CXX_COMPILER_VERSION LESS 7.0)
-                    IF(CUDA_VERSION GREATER_EQUAL 9.1)
-                        MESSAGE(FATAL_ERROR "Clang versions lower than 7 do not support CUDA 9.1 or greater!")
-                    ENDIF()
-                ELSEIF(CMAKE_CXX_COMPILER_VERSION LESS 8.0)
-                    IF(CUDA_VERSION GREATER_EQUAL 10.0)
-                        MESSAGE(FATAL_ERROR "Clang versions lower than 8 do not support CUDA 10.0 or greater!")
-                    ENDIF()
-                ELSEIF(CMAKE_CXX_COMPILER_VERSION LESS 9.0)
-                    IF(CUDA_VERSION GREATER_EQUAL 10.1)
-                        MESSAGE(FATAL_ERROR "Clang versions lower than 9 do not support CUDA 10.1 or greater!")
-                    ENDIF()
-                ELSEIF(CMAKE_CXX_COMPILER_VERSION LESS 10.0)
-                    IF(CUDA_VERSION GREATER_EQUAL 10.2)
-                        MESSAGE(FATAL_ERROR "Clang versions lower than 10 do not support CUDA 10.2 or greater!")
-                    ENDIF()
-                ENDIF()
-
-                IF(ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE)
-                    MESSAGE(FATAL_ERROR "Clang as a CUDA compiler does not support boost.fiber!")
-                ENDIF()
-                IF(ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE OR ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE)
-                    MESSAGE(FATAL_ERROR "Clang as a CUDA compiler does not support OpenMP 2!")
-                ENDIF()
-                IF(ALPAKA_ACC_CPU_BT_OMP4_ENABLE)
-                    MESSAGE(FATAL_ERROR "Clang as a CUDA compiler does not support OpenMP 4!")
-                ENDIF()
-
-                FOREACH(_CUDA_ARCH_ELEM ${ALPAKA_CUDA_ARCH})
-                    LIST(APPEND _ALPAKA_COMPILE_OPTIONS_PUBLIC "--cuda-gpu-arch=sm_${_CUDA_ARCH_ELEM}")
-                ENDFOREACH()
-
-                LIST(APPEND _ALPAKA_COMPILE_OPTIONS_PUBLIC "--cuda-path=${CUDA_TOOLKIT_ROOT_DIR}")
-
-                # This flag silences the warning produced by the Dummy.cpp files:
-                # clang: warning: argument unused during compilation: '--cuda-gpu-arch=sm_XX'
-                # This seems to be a false positive as all flags are 'unused' for an empty file.
-                LIST(APPEND _ALPAKA_COMPILE_OPTIONS_PUBLIC "-Qunused-arguments")
-
-                # Silences warnings that are produced by boost because clang is not correctly identified.
-                LIST(APPEND _ALPAKA_COMPILE_OPTIONS_PUBLIC "-Wno-unused-local-typedef")
-
-                IF(ALPAKA_CUDA_FAST_MATH)
-                    # -ffp-contract=fast enables the usage of FMA
-                    LIST(APPEND _ALPAKA_COMPILE_OPTIONS_PUBLIC "-ffast-math" "-ffp-contract=fast")
-                ENDIF()
-
-                IF(ALPAKA_CUDA_FTZ)
-                    LIST(APPEND _ALPAKA_COMPILE_OPTIONS_PUBLIC "-fcuda-flush-denormals-to-zero")
-                ENDIF()
-
-                IF(ALPAKA_CUDA_SHOW_REGISTER)
-                    LIST(APPEND _ALPAKA_COMPILE_OPTIONS_PUBLIC "-Xcuda-ptxas=-v")
-                ENDIF()
-
-                IF(ALPAKA_CUDA_KEEP_FILES)
-                    LIST(APPEND _ALPAKA_COMPILE_OPTIONS_PUBLIC "-save-temps")
-                ENDIF()
-
-                # When libstdc++ is used and -std=gnu++XX is set, we get the following compile error:
-                # /usr/lib/gcc/x86_64-linux-gnu/5.5.0/../../../../include/c++/5.5.0/type_traits:311:39: error: __float128 is not supported on this target struct __is_floating_point_helper<__float128>
-                # Clang doesn't support the __float128 type (at least when building CUDA device code)
-                # * Due to the minimum requirement to compile with C++11 and because extensions are enabled by default by CMake, it adds -std=gnu++11 instead of -std=c++11 to the command line.
-                #   Due to alpaka being an INTERFACE library (header-only) we are not allowed to set CXX_EXTENSIONS to OFF and transitively disable extensions for inherited targets.
-                # * Defining __float128 on the command line is the least invasive workaround found here: https://bugs.llvm.org/show_bug.cgi?id=13530#c6
-                LIST(APPEND _ALPAKA_COMPILE_DEFINITIONS_PUBLIC "__float128=void")
-
-                # CMake 3.15 does not provide the `--std=c++11` argument to clang anymore.
-                # It is not necessary for basic c++ compilation because clangs default is already higher, but CUDA code compiled with -x cuda still defaults to c++98.
-                IF(${CMAKE_VERSION} VERSION_GREATER_EQUAL "3.15.0")
-                    LIST(APPEND _ALPAKA_COMPILE_OPTIONS_PUBLIC "-std=c++${ALPAKA_CXX_STANDARD}")
-                ENDIF()
-
-            ELSE()
-                IF("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
-                    IF(CUDA_VERSION VERSION_EQUAL 8.0)
-                        IF(CMAKE_CXX_COMPILER_VERSION GREATER_EQUAL 5.4)
-                            MESSAGE(FATAL_ERROR "NVCC 8.0 does not support GCC 5.4+. Please use GCC 4.9 - 5.3!")
-                        ENDIF()
-                    ELSEIF((CUDA_VERSION VERSION_EQUAL 9.0) OR (CUDA_VERSION VERSION_EQUAL 9.1))
-                        IF(CMAKE_CXX_COMPILER_VERSION GREATER_EQUAL 6.0)
-                            MESSAGE(FATAL_ERROR "NVCC 9.0 - 9.1 do not support GCC 7+ and fail compiling the std::tuple implementation in GCC 6+. Please use GCC 4.9 - 5.5!")
-                        ENDIF()
-                    ELSEIF(CUDA_VERSION VERSION_EQUAL 9.2)
-                        IF(CMAKE_CXX_COMPILER_VERSION GREATER_EQUAL 8.0)
-                            MESSAGE(FATAL_ERROR "NVCC 9.2 does not support GCC 8+. Please use GCC 4.9, 5, 6 or 7!")
-                        ENDIF()
-                    ELSEIF(CUDA_VERSION VERSION_EQUAL 10.0)
-                        IF(CMAKE_CXX_COMPILER_VERSION GREATER_EQUAL 8.0)
-                            MESSAGE(FATAL_ERROR "NVCC 10.0 does not support GCC 8+. Please use GCC 4.9, 5, 6 or 7!")
-                        ENDIF()
-                    ELSEIF(CUDA_VERSION VERSION_EQUAL 10.1)
-                        IF(CMAKE_CXX_COMPILER_VERSION GREATER_EQUAL 9.0)
-                            MESSAGE(FATAL_ERROR "NVCC 10.1 does not support GCC 9+. Please use GCC 4.9, 5, 6, 7 or 8!")
-                        ENDIF()
-                    ELSEIF(CUDA_VERSION VERSION_EQUAL 10.2)
-                        IF(CMAKE_CXX_COMPILER_VERSION GREATER_EQUAL 9.0)
-                            MESSAGE(FATAL_ERROR "NVCC 10.2 does not support GCC 9+. Please use GCC 4.9, 5, 6, 7 or 8!")
-                        ENDIF()
-                    ENDIF()
-                ELSEIF("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
-                    IF(CUDA_VERSION VERSION_EQUAL 8.0)
-                        IF(CMAKE_CXX_COMPILER_VERSION GREATER_EQUAL 4.0)
-                            MESSAGE(FATAL_ERROR "NVCC 8.0 does not support clang 4+. Please use NVCC 9.1!")
-                        ENDIF()
-                    ELSEIF(CUDA_VERSION VERSION_EQUAL 9.0)
-                        IF(CMAKE_CXX_COMPILER_VERSION GREATER_EQUAL 4.0)
-                            MESSAGE(FATAL_ERROR "NVCC 9.0 does not support clang 4+. Please use NVCC 9.1!")
-                        ENDIF()
-                    ELSEIF(CUDA_VERSION VERSION_EQUAL 9.1)
-                        IF(CMAKE_CXX_COMPILER_VERSION GREATER_EQUAL 5.0)
-                            MESSAGE(FATAL_ERROR "NVCC 9.1 does not support clang 5+. Please use clang 4!")
-                        ENDIF()
-                    ELSEIF(CUDA_VERSION VERSION_EQUAL 9.2)
-                        IF(CMAKE_CXX_COMPILER_VERSION GREATER_EQUAL 5.0)
-                            MESSAGE(FATAL_ERROR "NVCC 9.2 does not support clang 6+ and fails compiling with clang 5. Please use clang 4!")
-                        ENDIF()
-                    ELSEIF(CUDA_VERSION VERSION_EQUAL 10.0)
-                        IF(CMAKE_CXX_COMPILER_VERSION GREATER_EQUAL 7.0)
-                            MESSAGE(FATAL_ERROR "NVCC 10.0 does not support clang 7+. Please use clang 4, 5 or 6!")
-                        ENDIF()
-                    ELSEIF(CUDA_VERSION VERSION_EQUAL 10.1)
-                        IF(CMAKE_CXX_COMPILER_VERSION GREATER_EQUAL 9.0)
-                            MESSAGE(FATAL_ERROR "NVCC 10.1 does not support clang 9+. Please use clang 4, 5, 6, 7 or 8!")
-                        ENDIF()
-                    ELSEIF(CUDA_VERSION VERSION_EQUAL 10.2)
-                        IF(CMAKE_CXX_COMPILER_VERSION GREATER_EQUAL 9.0)
-                            MESSAGE(FATAL_ERROR "NVCC 10.2 does not support clang 9+. Please use clang 4, 5, 6, 7 or 8!")
-                        ENDIF()
-                    ENDIF()
-                ENDIF()
-
-                # CUDA 9.0 removed the __CUDACC_VER__ macro. Boost versions lower than 1.65.1 still use this macro.
-                IF(CUDA_VERSION VERSION_GREATER_EQUAL 9.0 AND Boost_VERSION VERSION_LESS 1.65.1)
-                    MESSAGE(WARNING "CUDA 9.0 or newer requires boost-1.65.1 or newer!")
-                    SET(_ALPAKA_FOUND FALSE)
-                ENDIF()
-
-                # CUDA 9.0 is the first to support c++14.
-                IF((CUDA_VERSION VERSION_LESS 9.0) AND (ALPAKA_CXX_STANDARD GREATER 11))
-                    MESSAGE(WARNING "CUDA 9.0 or newer is required for c++14 or higher!")
-                    SET(_ALPAKA_FOUND FALSE)
-                ENDIF()
-
-                IF(ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE)
-                    MESSAGE(FATAL_ERROR "NVCC does not support boost.fiber!")
-                ENDIF()
-
-                # Clean up the flags. Else, multiple find calls would result in duplicate flags. Furthermore, other modules may have set different settings.
-                SET(CUDA_NVCC_FLAGS)
-
-                IF(${ALPAKA_DEBUG} GREATER 1)
-                    SET(CUDA_VERBOSE_BUILD ON)
-                ENDIF()
-
-                SET(CUDA_PROPAGATE_HOST_FLAGS ON)
-
-                IF(ALPAKA_CUDA_NVCC_SEPARABLE_COMPILATION)
-                    SET(CUDA_SEPARABLE_COMPILATION ON)
-                ENDIF()
-
-                # nvcc sets no linux/__linux macros on OpenPOWER linux
-                # nvidia bug id: 2448610
-                IF(CMAKE_SYSTEM_NAME STREQUAL "Linux")
-                    IF(CMAKE_SYSTEM_PROCESSOR STREQUAL "ppc64le")
-                        LIST(APPEND CUDA_NVCC_FLAGS "-Dlinux")
-                    ENDIF()
-                ENDIF()
-
-                IF(CUDA_VERSION VERSION_EQUAL 8.0)
-                    LIST(APPEND CUDA_NVCC_FLAGS "-Wno-deprecated-gpu-targets")
-                ENDIF()
-
-
-                IF(ALPAKA_CUDA_NVCC_EXPT_EXTENDED_LAMBDA)
-                    LIST(APPEND CUDA_NVCC_FLAGS "--expt-extended-lambda")
-                ENDIF()
-                IF(ALPAKA_CUDA_NVCC_EXPT_RELAXED_CONSTEXPR)
-                    LIST(APPEND CUDA_NVCC_FLAGS "--expt-relaxed-constexpr")
-                ENDIF()
-
-                FOREACH(_CUDA_ARCH_ELEM ${ALPAKA_CUDA_ARCH})
-                    # set flags to create device code for the given architecture
-                    LIST(APPEND CUDA_NVCC_FLAGS
-                        --generate-code arch=compute_${_CUDA_ARCH_ELEM},code=sm_${_CUDA_ARCH_ELEM}
-                        --generate-code arch=compute_${_CUDA_ARCH_ELEM},code=compute_${_CUDA_ARCH_ELEM}
-                    )
-                ENDFOREACH()
-
-                IF(NOT MSVC)
-                    LIST(APPEND CUDA_NVCC_FLAGS "-std=c++${ALPAKA_CXX_STANDARD}")
-                ENDIF()
-
-                SET(CUDA_HOST_COMPILER "${CMAKE_CXX_COMPILER}")
-
-                IF(CMAKE_BUILD_TYPE STREQUAL "Debug" OR CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo")
-                    LIST(APPEND CUDA_NVCC_FLAGS "-g")
-                    # https://github.com/ComputationalRadiationPhysics/alpaka/issues/428
-                    IF(((CMAKE_COMPILER_IS_GNUCXX AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS 5.0) OR
-                        (CMAKE_CXX_COMPILER_ID MATCHES "Clang" AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS 3.8)) AND
-                        CUDA_VERSION VERSION_LESS 9.0)
-                        MESSAGE(WARNING "${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION} does not support -G with CUDA <= 8! "
-                                        "Device debug symbols NOT added.")
-                    ELSEIF(MSVC)
-                        MESSAGE(WARNING "${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION} does not support -G with CUDA! "
-                                        "Device debug symbols NOT added.")
-                    ELSE()
-                        LIST(APPEND CUDA_NVCC_FLAGS "-G")
-                    ENDIF()
-                ENDIF()
-
-                IF(ALPAKA_CUDA_FAST_MATH)
-                    LIST(APPEND CUDA_NVCC_FLAGS "--use_fast_math")
-                ENDIF()
-
-                IF(ALPAKA_CUDA_FTZ)
-                    LIST(APPEND CUDA_NVCC_FLAGS "--ftz=true")
-                ELSE()
-                    LIST(APPEND CUDA_NVCC_FLAGS "--ftz=false")
-                ENDIF()
-
-                IF(ALPAKA_CUDA_SHOW_REGISTER)
-                    LIST(APPEND CUDA_NVCC_FLAGS "-Xptxas=-v")
-                ENDIF()
-
-                # Always add warning/error numbers which can be used for suppressions
-                LIST(APPEND CUDA_NVCC_FLAGS -Xcudafe --display_error_number)
-
-                # avoids warnings on host-device signatured, default constructors/destructors
-                IF(CUDA_VERSION GREATER_EQUAL 9.0)
-                    LIST(APPEND CUDA_NVCC_FLAGS -Xcudafe --diag_suppress=esa_on_defaulted_function_ignored)
-                ENDIF()
-
-                # avoids warnings on host-device signature of 'std::__shared_count<>'
-                IF(CUDA_VERSION EQUAL 10.0)
-                    LIST(APPEND CUDA_NVCC_FLAGS -Xcudafe --diag_suppress=2905)
-                ELSEIF(CUDA_VERSION EQUAL 10.1)
-                    LIST(APPEND CUDA_NVCC_FLAGS -Xcudafe --diag_suppress=2912)
-                ELSEIF(CUDA_VERSION EQUAL 10.2)
-                    LIST(APPEND CUDA_NVCC_FLAGS -Xcudafe --diag_suppress=2976)
-                ENDIF()
-
-                IF(ALPAKA_CUDA_KEEP_FILES)
-                    MAKE_DIRECTORY("${PROJECT_BINARY_DIR}/nvcc_tmp")
-                    LIST(APPEND CUDA_NVCC_FLAGS "--keep" "--keep-dir" "${PROJECT_BINARY_DIR}/nvcc_tmp")
-                ENDIF()
-
-                OPTION(ALPAKA_CUDA_SHOW_CODELINES "Show kernel lines in cuda-gdb and cuda-memcheck" OFF)
-                IF(ALPAKA_CUDA_SHOW_CODELINES)
-                    LIST(APPEND CUDA_NVCC_FLAGS "--source-in-ptx" "-lineinfo")
-                    IF(NOT MSVC)
-                        LIST(APPEND CUDA_NVCC_FLAGS "-Xcompiler" "-rdynamic")
-                    ENDIF()
-                    SET(ALPAKA_CUDA_KEEP_FILES ON CACHE BOOL "activate keep files" FORCE)
-                ENDIF()
-            ENDIF()
-
-            LIST(APPEND _ALPAKA_LINK_LIBRARIES_PUBLIC "general;${CUDA_CUDART_LIBRARY}")
-            LIST(APPEND _ALPAKA_INCLUDE_DIRECTORIES_PUBLIC ${CUDA_INCLUDE_DIRS})
-        ENDIF()
-    ENDIF()
-ENDIF()
-
-#-------------------------------------------------------------------------------
-# Find HIP.
-IF(ALPAKA_ACC_GPU_HIP_ENABLE)
-
-    IF(NOT DEFINED ALPAKA_HIP_VERSION)
-        SET(ALPAKA_HIP_VERSION 1.5)
-    ENDIF()
-
-    IF(ALPAKA_HIP_VERSION VERSION_LESS 1.5)
-        MESSAGE(WARNING "HIP < 1.5 is not supported!")
-        SET(_ALPAKA_FOUND FALSE)
-
-    ELSE()
-        # must set this for HIP package (note that you also need certain env vars)
-        SET(HIP_PLATFORM "${ALPAKA_HIP_PLATFORM}" CACHE STRING "")
-        SET(HIP_RUNTIME "${ALPAKA_HIP_PLATFORM}" CACHE STRING "")
-
-        FIND_PACKAGE(HIP "${ALPAKA_HIP_VERSION}")
-        IF(NOT HIP_FOUND)
-            MESSAGE(WARNING "Optional alpaka dependency HIP could not be found! HIP back-end disabled!")
-            SET(ALPAKA_ACC_GPU_HIP_ENABLE OFF CACHE BOOL "Enable the HIP GPU back-end" FORCE)
-
-        ELSE()
-            SET(ALPAKA_HIP_VERSION "${HIP_VERSION}")
-            IF(ALPAKA_HIP_VERSION VERSION_LESS 1.5.19211)
-                MESSAGE(STATUS "HIP < 1.5.19211 untested!")
-            ENDIF()
-            SET(ALPAKA_HIP_COMPILER "hipcc" CACHE STRING "HIP compiler")
-            SET_PROPERTY(CACHE ALPAKA_HIP_COMPILER PROPERTY STRINGS "hipcc")
-
-            OPTION(ALPAKA_HIP_FAST_MATH "Enable fast-math" ON)
-            OPTION(ALPAKA_HIP_FTZ "Set flush to zero for GPU" OFF)
-            OPTION(ALPAKA_HIP_SHOW_REGISTER "Show kernel registers and create PTX" OFF)
-            OPTION(ALPAKA_HIP_KEEP_FILES "Keep all intermediate files that are generated during internal compilation steps (folder: nvcc_tmp)" OFF)
-
-            SET(HIP_HIPCC_FLAGS)
-
-            IF(ALPAKA_HIP_PLATFORM MATCHES "nvcc")
-                FIND_PACKAGE(CUDA)
-                IF(NOT CUDA_FOUND)
-                    MESSAGE(WARNING "Could not found CUDA while HIP platform is set to nvcc. Compiling might fail.")
-                ENDIF()
-
-                IF(CUDA_VERSION VERSION_LESS 9.0)
-                    SET(ALPAKA_CUDA_ARCH "20" CACHE STRING "GPU architecture")
-                ELSE()
-                    SET(ALPAKA_CUDA_ARCH "30" CACHE STRING "GPU architecture")
-                ENDIF()
-
-                # CUDA 9.0 removed the __CUDACC_VER__ macro. Boost versions lower than 1.65.1 still use this macro.
-                IF(CUDA_VERSION VERSION_GREATER_EQUAL 9.0 AND Boost_VERSION VERSION_LESS 1.65.1)
-                    MESSAGE(WARNING "CUDA 9.0 or newer requires boost-1.65.1 or newer!")
-                    SET(_ALPAKA_FOUND FALSE)
-                ENDIF()
-
-                IF(CUDA_VERSION VERSION_EQUAL 8.0)
-                    LIST(APPEND HIP_HIPCC_FLAGS "-Wno-deprecated-gpu-targets")
-                ENDIF()
-
-                IF(CUDA_VERSION VERSION_LESS 8.0)
-                    MESSAGE(WARNING "CUDA Toolkit < 8.0 is not supported!")
-                    SET(_ALPAKA_FOUND FALSE)
-                ENDIF()
-
-                IF(${ALPAKA_DEBUG} GREATER 1)
-                    SET(HIP_VERBOSE_BUILD ON)
-                ENDIF()
-
-                LIST(APPEND HIP_NVCC_FLAGS "--expt-extended-lambda")
-                LIST(APPEND HIP_NVCC_FLAGS "--expt-relaxed-constexpr")
-                LIST(APPEND _ALPAKA_HIP_LIBRARIES "cudart")
-
-                FOREACH(_HIP_ARCH_ELEM ${ALPAKA_CUDA_ARCH})
-                    # set flags to create device code for the given architecture
-                    LIST(APPEND CUDA_NVCC_FLAGS
-                        --generate-code arch=compute_${_HIP_ARCH_ELEM},code=sm_${_HIP_ARCH_ELEM}
-                        --generate-code arch=compute_${_HIP_ARCH_ELEM},code=compute_${_HIP_ARCH_ELEM}
-                    )
-                ENDFOREACH()
-                # for CUDA cmake adds automatically compiler flags as nvcc does not do this,
-                # but for HIP we have to do this here
-                LIST(APPEND HIP_NVCC_FLAGS "-D__CUDACC__")
-                LIST(APPEND HIP_NVCC_FLAGS "-ccbin ${CMAKE_CXX_COMPILER}")
-                LIST(APPEND HIP_NVCC_FLAGS "-Xcompiler" "-g")
-
-                IF(CMAKE_BUILD_TYPE STREQUAL "Debug" OR CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo")
-                    LIST(APPEND HIP_HIPCC_FLAGS "-G")
-                ENDIF()
-                # propage host flags
-                # SET(CUDA_PROPAGATE_HOST_FLAGS ON) # does not exist in HIP, so do it manually
-                string(TOUPPER "${CMAKE_BUILD_TYPE}" build_config)
-                FOREACH( _flag ${CMAKE_CXX_FLAGS} ${CMAKE_CXX_FLAGS_${build_config}})
-                    LIST(APPEND HIP_NVCC_FLAGS "-Xcompiler ${_flag}")
-                ENDFOREACH()
-
-                IF(ALPAKA_HIP_FAST_MATH)
-                    LIST(APPEND HIP_HIPCC_FLAGS "--use_fast_math")
-                ENDIF()
-
-                IF(ALPAKA_HIP_FTZ)
-                    LIST(APPEND HIP_HIPCC_FLAGS "--ftz=true")
-                ELSE()
-                    LIST(APPEND HIP_HIPCC_FLAGS "--ftz=false")
-                ENDIF()
-
-                IF(ALPAKA_HIP_SHOW_REGISTER)
-                    LIST(APPEND HIP_HIPCC_FLAGS "-Xptxas=-v")
-                ENDIF()
-                IF(CUDA_VERSION GREATER_EQUAL 9.0)
-                    # avoids warnings on host-device signatured, default constructors/destructors
-                    LIST(APPEND HIP_HIPCC_FLAGS "-Xcudafe --diag_suppress=esa_on_defaulted_function_ignored")
-                ENDIF()
-
-                # random numbers library ( HIP(NVCC) ) /hiprand
-                # HIP_ROOT_DIR is set by FindHIP.cmake
-                FIND_PATH(HIP_RAND_INC
-                    NAMES "hiprand_kernel.h"
-                    PATHS "${HIP_ROOT_DIR}/hiprand" "${HIP_ROOT_DIR}/include" "hiprand"
-                    PATHS "/opt/rocm/rocrand/hiprand"
-                    PATH_SUFFIXES "include" "hiprand")
-                FIND_LIBRARY(HIP_RAND_LIBRARY
-                    NAMES "hiprand-d" "hiprand"
-                    PATHS "${HIP_ROOT_DIR}/hiprand" "${HIP_ROOT_DIR}" "hiprand"
-                    PATHS "/opt/rocm/rocrand/hiprand"
-                    ENV HIP_PATH
-                    PATH_SUFFIXES "lib" "lib64")
-                IF(NOT HIP_RAND_INC)
-                    MESSAGE(FATAL_ERROR "Could not find hipRAND include (also searched in: HIP_ROOT_DIR=${HIP_ROOT_DIR}).")
-                ENDIF()
-                IF(NOT HIP_RAND_LIBRARY)
-                    MESSAGE(FATAL_ERROR "Could not find hipRAND library (also searched in: HIP_ROOT_DIR=${HIP_ROOT_DIR}).")
-                ENDIF()
-                LIST(APPEND _ALPAKA_INCLUDE_DIRECTORIES_PUBLIC "${HIP_RAND_INC}")
-                LIST(APPEND _ALPAKA_LINK_LIBRARIES_PUBLIC "${HIP_RAND_LIBRARY}")
-            ENDIF() # nvcc
-
-            IF(ALPAKA_HIP_PLATFORM MATCHES "hcc")
-
-                # random numbers library ( HIP(HCC) ) /rocrand
-                FIND_PATH(ROC_RAND_INC
-                    rocrand_kernel.h
-                    PATHS "${HIP_ROOT_DIR}/rocrand" "${HIP_ROOT_DIR}" "rocrand"
-                    PATHS "/opt/rocm/rocrand"
-                    ENV HIP_PATH
-                    PATH_SUFFIXES "include")
-                FIND_LIBRARY(ROC_RAND_LIBRARY
-                    rocrand-d
-                    rocrand
-                    PATHS "${HIP_ROOT_DIR}/rocrand" "${HIP_ROOT_DIR}" "rocrand"
-                    PATHS "/opt/rocm/rocrand"
-                    ENV HIP_PATH
-                    PATH_SUFFIXES "lib" "lib64")
-
-                # random numbers library ( HIP(HCC) ) rocrand/hiprand
-                FIND_PATH(HIP_RAND_INC
-                    hiprand_kernel.h
-                    PATHS "${HIP_ROOT_DIR}/hiprand" "${HIP_ROOT_DIR}" "hiprand"
-                    PATHS "/opt/rocm/hiprand"
-                    ENV HIP_PATH
-                    PATH_SUFFIXES "include")
-                FIND_LIBRARY(HIP_RAND_LIBRARY
-                    hiprand-d
-                    hiprand
-                    PATHS "${HIP_ROOT_DIR}/hiprand" "${HIP_ROOT_DIR}" "hiprand"
-                    PATHS "/opt/rocm/hiprand"
-                    ENV HIP_PATH
-                    PATH_SUFFIXES "lib" "lib64")
-                IF(NOT HIP_RAND_INC OR NOT HIP_RAND_LIBRARY)
-                    MESSAGE(FATAL_ERROR "Could not find hipRAND library")
-                ENDIF()
-                LIST(APPEND _ALPAKA_INCLUDE_DIRECTORIES_PUBLIC "${HIP_RAND_INC}")
-                LIST(APPEND _ALPAKA_LINK_LIBRARIES_PUBLIC "${HIP_RAND_LIBRARY}")
-
-                IF(NOT ROC_RAND_INC OR NOT ROC_RAND_LIBRARY)
-                    MESSAGE(FATAL_ERROR "Could not find rocRAND library")
-                ENDIF()
-
-                LIST(APPEND _ALPAKA_INCLUDE_DIRECTORIES_PUBLIC "${ROC_RAND_INC}")
-                LIST(APPEND _ALPAKA_LINK_LIBRARIES_PUBLIC "${ROC_RAND_LIBRARY}")
-
-            ENDIF()
-
-
-            LIST(APPEND HIP_HIPCC_FLAGS "-D__HIPCC__")
-            LIST(APPEND HIP_HIPCC_FLAGS "-std=c++${ALPAKA_CXX_STANDARD}")
-
-            IF(CMAKE_BUILD_TYPE STREQUAL "Debug" OR CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo")
-                LIST(APPEND HIP_HIPCC_FLAGS "-g")
-            ENDIF()
-
-
-            IF(ALPAKA_HIP_KEEP_FILES)
-                MAKE_DIRECTORY("${PROJECT_BINARY_DIR}/hip_tmp")
-                LIST(APPEND HIP_HIPCC_FLAGS "--keep" "--keep-dir" "${PROJECT_BINARY_DIR}/hip_tmp")
-            ENDIF()
-
-            OPTION(ALPAKA_HIP_SHOW_CODELINES "Show kernel lines in cuda-gdb and cuda-memcheck" OFF)
-            IF(ALPAKA_HIP_SHOW_CODELINES)
-                LIST(APPEND HIP_HIPCC_FLAGS "--source-in-ptx" "-lineinfo")
-                LIST(APPEND HIP_HIPCC_FLAGS "-Xcompiler" "-rdynamic")
-                SET(ALPAKA_HIP_KEEP_FILES ON CACHE BOOL "activate keep files" FORCE)
-            ENDIF()
-            IF(_ALPAKA_HIP_LIBRARIES)
-                LIST(APPEND _ALPAKA_LINK_LIBRARIES_PUBLIC "general;${_ALPAKA_HIP_LIBRARIES}")
-            ENDIF()
-        ENDIF()
-    ENDIF()
-ENDIF() # HIP
-
-#-------------------------------------------------------------------------------
-# alpaka.
-IF(ALPAKA_ACC_GPU_CUDA_ONLY_MODE)
-    LIST(APPEND _ALPAKA_COMPILE_DEFINITIONS_PUBLIC "ALPAKA_ACC_GPU_CUDA_ONLY_MODE")
-    MESSAGE(STATUS ALPAKA_ACC_GPU_CUDA_ONLY_MODE)
-ENDIF()
-
-IF(ALPAKA_ACC_GPU_HIP_ONLY_MODE)
-    LIST(APPEND _ALPAKA_COMPILE_DEFINITIONS_PUBLIC "ALPAKA_ACC_GPU_HIP_ONLY_MODE")
-    MESSAGE(STATUS ALPAKA_ACC_GPU_HIP_ONLY_MODE)
-ENDIF()
-
-IF(ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLE)
-    LIST(APPEND _ALPAKA_COMPILE_DEFINITIONS_PUBLIC "ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED")
-    MESSAGE(STATUS ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED)
-ENDIF()
-IF(ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLE)
-    LIST(APPEND _ALPAKA_COMPILE_DEFINITIONS_PUBLIC "ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED")
-    MESSAGE(STATUS ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED)
-ENDIF()
-IF(ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE)
-    LIST(APPEND _ALPAKA_COMPILE_DEFINITIONS_PUBLIC "ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLED")
-    MESSAGE(STATUS ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLED)
-ENDIF()
-IF(ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLE)
-    LIST(APPEND _ALPAKA_COMPILE_DEFINITIONS_PUBLIC "ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED")
-    MESSAGE(STATUS ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED)
-ENDIF()
-IF(ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE)
-    LIST(APPEND _ALPAKA_COMPILE_DEFINITIONS_PUBLIC "ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLED")
-    MESSAGE(STATUS ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLED)
-ENDIF()
-IF(ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE)
-    LIST(APPEND _ALPAKA_COMPILE_DEFINITIONS_PUBLIC "ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLED")
-    MESSAGE(STATUS ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLED)
-ENDIF()
-IF(ALPAKA_ACC_CPU_BT_OMP4_ENABLE)
-    LIST(APPEND _ALPAKA_COMPILE_DEFINITIONS_PUBLIC "ALPAKA_ACC_CPU_BT_OMP4_ENABLED")
-    MESSAGE(STATUS ALPAKA_ACC_CPU_BT_OMP4_ENABLED)
-ENDIF()
-IF(ALPAKA_ACC_GPU_CUDA_ENABLE)
-    LIST(APPEND _ALPAKA_COMPILE_DEFINITIONS_PUBLIC "ALPAKA_ACC_GPU_CUDA_ENABLED")
-    MESSAGE(STATUS ALPAKA_ACC_GPU_CUDA_ENABLED)
-ENDIF()
-IF(ALPAKA_ACC_GPU_HIP_ENABLE)
-    LIST(APPEND _ALPAKA_COMPILE_DEFINITIONS_PUBLIC "ALPAKA_ACC_GPU_HIP_ENABLED")
-    MESSAGE(STATUS ALPAKA_ACC_GPU_HIP_ENABLED)
-ENDIF()
-
-LIST(APPEND _ALPAKA_COMPILE_DEFINITIONS_PUBLIC "ALPAKA_DEBUG=${ALPAKA_DEBUG}")
-
-IF(ALPAKA_CI)
-    LIST(APPEND _ALPAKA_COMPILE_DEFINITIONS_PUBLIC "ALPAKA_CI")
-ENDIF()
-
-SET(_ALPAKA_INCLUDE_DIRECTORY "${_ALPAKA_ROOT_DIR}/include")
-SET(_ALPAKA_SUFFIXED_INCLUDE_DIR "${_ALPAKA_INCLUDE_DIRECTORY}/alpaka")
-
-SET(_ALPAKA_LINK_LIBRARY)
-
-# # cxx flags will not be forwarded to hip wrapped compiler, so it has to be provided manually
-IF(ALPAKA_ACC_GPU_HIP_ENABLE)
-    SET(_ALPAKA_COMPILE_DEFINITIONS_HIP ${_ALPAKA_COMPILE_DEFINITIONS_PUBLIC})
-    LIST_ADD_PREFIX("-D" _ALPAKA_COMPILE_DEFINITIONS_HIP)
-    LIST(APPEND HIP_HIPCC_FLAGS
-        ${_ALPAKA_COMPILE_DEFINITIONS_HIP}
-        )
-    HIP_INCLUDE_DIRECTORIES(
-        # ${_ALPAKA_INCLUDE_DIRECTORY}
-        # ${_ALPAKA_INCLUDE_DIRECTORIES_PUBLIC}
-        ${HIP_INCLUDE_DIRS}
-        ${Boost_INCLUDE_DIRS}
-        ${_ALPAKA_ROOT_DIR}/test/common/include
-        )
-
-    IF(OPENMP_FOUND) # remove fopenmp link from nvcc, otherwise linker error will occur
-        LIST(REMOVE_ITEM _ALPAKA_LINK_FLAGS_PUBLIC "${OpenMP_CXX_FLAGS}")
-        LIST(APPEND _ALPAKA_LINK_FLAGS_PUBLIC "-Xcompiler ${OpenMP_CXX_FLAGS}")
-    ENDIF()
-    IF(ALPAKA_HIP_PLATFORM MATCHES "hcc")
-        # GFX600, GFX601, GFX700, GFX701, GFX702, GFX703, GFX704, GFX801, GFX802, GFX803, GFX810, GFX900, GFX902
-        SET(_ALPAKA_LINK_LIBRARIES_PUBLIC "${_ALPAKA_LINK_LIBRARIES_PUBLIC}" "--amdgpu-target=gfx803 --amdgpu-target=gfx900 --amdgpu-target=gfx906")
-    ENDIF()
-ENDIF()
-
-# Add all the source and include files in all recursive subdirectories and group them accordingly.
-append_recursive_files_add_to_src_group("${_ALPAKA_SUFFIXED_INCLUDE_DIR}" "${_ALPAKA_SUFFIXED_INCLUDE_DIR}" "hpp" _ALPAKA_FILES_HEADER)
-append_recursive_files_add_to_src_group("${_ALPAKA_SUFFIXED_INCLUDE_DIR}" "${_ALPAKA_SUFFIXED_INCLUDE_DIR}" "h" _ALPAKA_FILES_HEADER)
-
-append_recursive_files_add_to_src_group("${_ALPAKA_ROOT_DIR}/script" "${_ALPAKA_ROOT_DIR}" "sh" _ALPAKA_FILES_SCRIPT)
-SET_SOURCE_FILES_PROPERTIES(${_ALPAKA_FILES_SCRIPT} PROPERTIES HEADER_FILE_ONLY TRUE)
-
-append_recursive_files_add_to_src_group("${_ALPAKA_ROOT_DIR}/cmake" "${_ALPAKA_ROOT_DIR}" "cmake" _ALPAKA_FILES_CMAKE)
-LIST(APPEND _ALPAKA_FILES_CMAKE "${_ALPAKA_ROOT_DIR}/alpakaConfig.cmake" "${_ALPAKA_ROOT_DIR}/Findalpaka.cmake" "${_ALPAKA_ROOT_DIR}/CMakeLists.txt" "${_ALPAKA_ROOT_DIR}/cmake/dev.cmake" "${_ALPAKA_ROOT_DIR}/cmake/common.cmake" "${_ALPAKA_ROOT_DIR}/cmake/addExecutable.cmake" "${_ALPAKA_ADD_LIBRRAY_FILE}")
-SET_SOURCE_FILES_PROPERTIES(${_ALPAKA_FILES_CMAKE} PROPERTIES HEADER_FILE_ONLY TRUE)
-
-append_recursive_files_add_to_src_group("${_ALPAKA_ROOT_DIR}/doc/markdown" "${_ALPAKA_ROOT_DIR}" "md" _ALPAKA_FILES_DOC)
-SET_SOURCE_FILES_PROPERTIES(${_ALPAKA_FILES_DOC} PROPERTIES HEADER_FILE_ONLY TRUE)
-
-SET(_ALPAKA_FILES_OTHER "${_ALPAKA_ROOT_DIR}/.gitignore" "${_ALPAKA_ROOT_DIR}/.travis.yml" "${_ALPAKA_ROOT_DIR}/.zenodo.json" "${_ALPAKA_ROOT_DIR}/LICENSE" "${_ALPAKA_ROOT_DIR}/README.md")
-SET_SOURCE_FILES_PROPERTIES(${_ALPAKA_FILES_OTHER} PROPERTIES HEADER_FILE_ONLY TRUE)
-
-#-------------------------------------------------------------------------------
-# Target.
-IF(NOT TARGET "alpaka")
-    ADD_LIBRARY("alpaka" INTERFACE)
-
-    # HACK: Workaround for the limitation that files added to INTERFACE targets (target_sources) can not be marked as PUBLIC or PRIVATE but only as INTERFACE.
-    # Therefore those files will be added to projects "linking" to the INTERFACE library, but are not added to the project itself within an IDE.
-    add_custom_target("alpakaIde"
-        SOURCES ${_ALPAKA_FILES_HEADER} ${_ALPAKA_FILES_SCRIPT} ${_ALPAKA_FILES_CMAKE} ${_ALPAKA_FILES_DOC} ${_ALPAKA_FILES_OTHER}
-    )
-
-    target_compile_features("alpaka"
-        INTERFACE cxx_std_${ALPAKA_CXX_STANDARD}
-    )
-
-    # Compile options.
-    IF(${ALPAKA_DEBUG} GREATER 1)
-        MESSAGE(STATUS "_ALPAKA_COMPILE_OPTIONS_PUBLIC: ${_ALPAKA_COMPILE_OPTIONS_PUBLIC}")
-    ENDIF()
-    LIST(
-        LENGTH
-        _ALPAKA_COMPILE_OPTIONS_PUBLIC
-        _ALPAKA_COMPILE_OPTIONS_PUBLIC_LENGTH)
-    IF(${_ALPAKA_COMPILE_OPTIONS_PUBLIC_LENGTH} GREATER 0)
-        TARGET_COMPILE_OPTIONS(
-            "alpaka"
-            INTERFACE ${_ALPAKA_COMPILE_OPTIONS_PUBLIC})
-    ENDIF()
-
-    # Compile definitions.
-    IF(${ALPAKA_DEBUG} GREATER 1)
-        MESSAGE(STATUS "_ALPAKA_COMPILE_DEFINITIONS_PUBLIC: ${_ALPAKA_COMPILE_DEFINITIONS_PUBLIC}")
-    ENDIF()
-    LIST(
-        LENGTH
-        _ALPAKA_COMPILE_DEFINITIONS_PUBLIC
-        _ALPAKA_COMPILE_DEFINITIONS_PUBLIC_LENGTH)
-    IF(${_ALPAKA_COMPILE_DEFINITIONS_PUBLIC_LENGTH} GREATER 0)
-        TARGET_COMPILE_DEFINITIONS(
-            "alpaka"
-            INTERFACE ${_ALPAKA_COMPILE_DEFINITIONS_PUBLIC})
-    ENDIF()
-
-    # Include directories.
-    IF(${ALPAKA_DEBUG} GREATER 1)
-        MESSAGE(STATUS "_ALPAKA_INCLUDE_DIRECTORIES_PUBLIC: ${_ALPAKA_INCLUDE_DIRECTORIES_PUBLIC}")
-    ENDIF()
-    LIST(
-        LENGTH
-        _ALPAKA_INCLUDE_DIRECTORIES_PUBLIC
-        _ALPAKA_INCLUDE_DIRECTORIES_PUBLIC_LENGTH)
-    IF(${_ALPAKA_INCLUDE_DIRECTORIES_PUBLIC_LENGTH} GREATER 0)
-        TARGET_INCLUDE_DIRECTORIES(
-            "alpaka"
-            SYSTEM
-            INTERFACE ${_ALPAKA_INCLUDE_DIRECTORIES_PUBLIC})
-    ENDIF()
-    # the alpaka library itself
-    TARGET_INCLUDE_DIRECTORIES(
-        "alpaka"
-        INTERFACE ${_ALPAKA_INCLUDE_DIRECTORY}
-    )
-
-    # Link libraries.
-    # There are no PUBLIC_LINK_FLAGS in CMAKE:
-    # http://stackoverflow.com/questions/26850889/cmake-keeping-link-flags-of-internal-libs
-    IF(${ALPAKA_DEBUG} GREATER 1)
-        MESSAGE(STATUS "_ALPAKA_LINK_LIBRARIES_PUBLIC: ${_ALPAKA_LINK_LIBRARIES_PUBLIC}")
-    ENDIF()
-    LIST(
-        LENGTH
-        _ALPAKA_LINK_LIBRARIES_PUBLIC
-        _ALPAKA_LINK_LIBRARIES_PUBLIC_LENGTH)
-    IF(${_ALPAKA_LINK_LIBRARIES_PUBLIC_LENGTH} GREATER 0)
-        TARGET_LINK_LIBRARIES(
-            "alpaka"
-            INTERFACE ${_ALPAKA_LINK_LIBRARIES_PUBLIC} ${_ALPAKA_LINK_FLAGS_PUBLIC})
-    ENDIF()
-ENDIF()
-
-# NVCC does not incorporate the COMPILE_OPTIONS of a target but only the CMAKE_CXX_FLAGS
-IF((ALPAKA_ACC_GPU_CUDA_ENABLE OR ALPAKA_ACC_GPU_HIP_ENABLE) AND ALPAKA_CUDA_COMPILER MATCHES "nvcc")
-    STRING(REPLACE ";" " " _ALPAKA_COMPILE_OPTIONS_STRING "${_ALPAKA_COMPILE_OPTIONS_PUBLIC}")
-    SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${_ALPAKA_COMPILE_OPTIONS_STRING}")
-ENDIF()
-
-#-------------------------------------------------------------------------------
-# Find alpaka version.
-file(STRINGS "${CMAKE_CURRENT_LIST_DIR}/include/alpaka/version.hpp" ALPAKA_VERSION_MAJOR_HPP REGEX "#define ALPAKA_VERSION_MAJOR ")
-file(STRINGS "${CMAKE_CURRENT_LIST_DIR}/include/alpaka/version.hpp" ALPAKA_VERSION_MINOR_HPP REGEX "#define ALPAKA_VERSION_MINOR ")
-file(STRINGS "${CMAKE_CURRENT_LIST_DIR}/include/alpaka/version.hpp" ALPAKA_VERSION_PATCH_HPP REGEX "#define ALPAKA_VERSION_PATCH ")
-
-string(REGEX MATCH "([0-9]+)" ALPAKA_VERSION_MAJOR  ${ALPAKA_VERSION_MAJOR_HPP})
-string(REGEX MATCH "([0-9]+)" ALPAKA_VERSION_MINOR  ${ALPAKA_VERSION_MINOR_HPP})
-string(REGEX MATCH "([0-9]+)" ALPAKA_VERSION_PATCH  ${ALPAKA_VERSION_PATCH_HPP})
-
-SET(PACKAGE_VERSION "${ALPAKA_VERSION_MAJOR}.${ALPAKA_VERSION_MINOR}.${ALPAKA_VERSION_PATCH}")
-
-#-------------------------------------------------------------------------------
-# Set return values.
-SET(alpaka_VERSION "${ALPAKA_VERSION_MAJOR}.${ALPAKA_VERSION_MINOR}.${ALPAKA_VERSION_PATCH}")
-SET(alpaka_COMPILE_OPTIONS ${_ALPAKA_COMPILE_OPTIONS_PUBLIC})
-SET(alpaka_COMPILE_DEFINITIONS ${_ALPAKA_COMPILE_DEFINITIONS_PUBLIC})
-# Add '-D' to the definitions
-SET(alpaka_DEFINITIONS ${_ALPAKA_COMPILE_DEFINITIONS_PUBLIC})
-list_add_prefix("-D" alpaka_DEFINITIONS)
-# Add the compile options to the definitions.
-LIST(APPEND alpaka_DEFINITIONS ${_ALPAKA_COMPILE_OPTIONS_PUBLIC})
-SET(alpaka_INCLUDE_DIR ${_ALPAKA_INCLUDE_DIRECTORY})
-SET(alpaka_INCLUDE_DIRS ${_ALPAKA_INCLUDE_DIRECTORIES_PUBLIC})
-LIST(APPEND alpaka_INCLUDE_DIRS ${_ALPAKA_INCLUDE_DIRECTORY})
-SET(alpaka_LIBRARY ${_ALPAKA_LINK_LIBRARY})
-SET(alpaka_LIBRARIES ${_ALPAKA_LINK_FLAGS_PUBLIC})
-LIST(APPEND alpaka_LIBRARIES ${_ALPAKA_LINK_LIBRARIES_PUBLIC})
-
-#-------------------------------------------------------------------------------
-# Print the return values.
-IF(${ALPAKA_DEBUG} GREATER 0)
-    MESSAGE(STATUS "alpaka_FOUND: ${alpaka_FOUND}")
-    MESSAGE(STATUS "alpaka_VERSION: ${alpaka_VERSION}")
-    MESSAGE(STATUS "alpaka_COMPILE_OPTIONS: ${alpaka_COMPILE_OPTIONS}")
-    MESSAGE(STATUS "alpaka_COMPILE_DEFINITIONS: ${alpaka_COMPILE_DEFINITIONS}")
-    MESSAGE(STATUS "alpaka_DEFINITIONS: ${alpaka_DEFINITIONS}")
-    MESSAGE(STATUS "alpaka_INCLUDE_DIR: ${alpaka_INCLUDE_DIR}")
-    MESSAGE(STATUS "alpaka_INCLUDE_DIRS: ${alpaka_INCLUDE_DIRS}")
-    MESSAGE(STATUS "alpaka_LIBRARY: ${alpaka_LIBRARY}")
-    MESSAGE(STATUS "alpaka_LIBRARIES: ${alpaka_LIBRARIES}")
-ENDIF()
-
-# Unset already set variables if not found.
-IF(NOT _ALPAKA_FOUND)
-    UNSET(alpaka_FOUND)
-    UNSET(alpaka_VERSION)
-    UNSET(alpaka_COMPILE_OPTIONS)
-    UNSET(alpaka_COMPILE_DEFINITIONS)
-    UNSET(alpaka_DEFINITIONS)
-    UNSET(alpaka_INCLUDE_DIR)
-    UNSET(alpaka_INCLUDE_DIRS)
-    UNSET(alpaka_LIBRARY)
-    UNSET(alpaka_LIBRARIES)
-
-    UNSET(_ALPAKA_FOUND)
-    UNSET(_ALPAKA_COMPILE_OPTIONS_PUBLIC)
-    UNSET(_ALPAKA_COMPILE_DEFINITIONS_PUBLIC)
-    UNSET(_ALPAKA_COMPILE_DEFINITIONS_HIP)
-    UNSET(_ALPAKA_HIP_LIBRARIES)
-    UNSET(_ALPAKA_INCLUDE_DIRECTORY)
-    UNSET(_ALPAKA_INCLUDE_DIRECTORIES_PUBLIC)
-    UNSET(_ALPAKA_LINK_LIBRARY)
-    UNSET(_ALPAKA_LINK_LIBRARIES_PUBLIC)
-    UNSET(_ALPAKA_LINK_FLAGS_PUBLIC)
-    UNSET(_ALPAKA_COMMON_FILE)
-    UNSET(_ALPAKA_ADD_EXECUTABLE_FILE)
-    UNSET(_ALPAKA_ADD_LIBRARY_FILE)
-    UNSET(_ALPAKA_FILES_HEADER)
-    UNSET(_ALPAKA_FILES_OTHER)
-    UNSET(_ALPAKA_BOOST_MIN_VER)
-ELSE()
-    # Make internal variables advanced options in the GUI.
-    MARK_AS_ADVANCED(
-        alpaka_INCLUDE_DIR
-        alpaka_LIBRARY
-        _ALPAKA_COMPILE_OPTIONS_PUBLIC
-        _ALPAKA_COMPILE_DEFINITIONS_PUBLIC
-        _ALPAKA_INCLUDE_DIRECTORY
-        _ALPAKA_INCLUDE_DIRECTORIES_PUBLIC
-        _ALPAKA_LINK_LIBRARY
-        _ALPAKA_LINK_LIBRARIES_PUBLIC
-        _ALPAKA_LINK_FLAGS_PUBLIC
-        _ALPAKA_COMMON_FILE
-        _ALPAKA_ADD_EXECUTABLE_FILE
-        _ALPAKA_ADD_LIBRARY_FILE
-        _ALPAKA_FILES_HEADER
-        _ALPAKA_FILES_OTHER
-        _ALPAKA_BOOST_MIN_VER)
-ENDIF()
-
-###############################################################################
-# FindPackage options
-
-# Handles the REQUIRED, QUIET and version-related arguments for FIND_PACKAGE.
-# NOTE: We do not check for alpaka_LIBRARIES and alpaka_DEFINITIONS because they can be empty.
-INCLUDE(FindPackageHandleStandardArgs)
-FIND_PACKAGE_HANDLE_STANDARD_ARGS(
-    "alpaka"
-    FOUND_VAR alpaka_FOUND
-    REQUIRED_VARS alpaka_INCLUDE_DIR
-    VERSION_VAR alpaka_VERSION)
diff --git a/thirdParty/alpaka/cmake/addExecutable.cmake b/thirdParty/alpaka/cmake/addExecutable.cmake
deleted file mode 100644
index b602374544..0000000000
--- a/thirdParty/alpaka/cmake/addExecutable.cmake
+++ /dev/null
@@ -1,64 +0,0 @@
-#
-# Copyright 2014-2019 Benjamin Worpitz
-#
-# This file is part of Alpaka.
-#
-# This Source Code Form is subject to the terms of the Mozilla Public
-# License, v. 2.0. If a copy of the MPL was not distributed with this
-# file, You can obtain one at http://mozilla.org/MPL/2.0/.
-#
-
-CMAKE_MINIMUM_REQUIRED(VERSION 3.11.4)
-
-#------------------------------------------------------------------------------
-# Calls CUDA_ADD_EXECUTABLE or ADD_EXECUTABLE depending on the enabled alpaka accelerators.
-# Using a macro to stay in the scope (fixes lost assignment of linker command in FindHIP.cmake)
-# https://github.com/ROCm-Developer-Tools/HIP/issues/631
-MACRO(ALPAKA_ADD_EXECUTABLE In_Name)
-    IF(ALPAKA_ACC_GPU_CUDA_ENABLE)
-        IF(ALPAKA_CUDA_COMPILER MATCHES "clang")
-            FOREACH(_file ${ARGN})
-                IF((${_file} MATCHES "\\.cpp$") OR (${_file} MATCHES "\\.cxx$") OR (${_file} MATCHES "\\.cu$"))
-                    SET_SOURCE_FILES_PROPERTIES(${_file} PROPERTIES COMPILE_FLAGS "-x cuda")
-                ENDIF()
-            ENDFOREACH()
-            ADD_EXECUTABLE(
-                ${In_Name}
-                ${ARGN})
-        ELSE()
-            FOREACH(_file ${ARGN})
-                IF((${_file} MATCHES "\\.cpp$") OR (${_file} MATCHES "\\.cxx$"))
-                    SET_SOURCE_FILES_PROPERTIES(${_file} PROPERTIES CUDA_SOURCE_PROPERTY_FORMAT OBJ)
-                ENDIF()
-            ENDFOREACH()
-            IF (CMAKE_VERSION VERSION_LESS 3.9.0)
-                CMAKE_POLICY(SET CMP0023 OLD)   # CUDA_ADD_EXECUTABLE calls TARGET_LINK_LIBRARIES without keywords.
-            ELSE()
-                SET(CUDA_LINK_LIBRARIES_KEYWORD "PUBLIC")
-            ENDIF()
-            CUDA_ADD_EXECUTABLE(
-                ${In_Name}
-                ${ARGN})
-        ENDIF()
-    ELSEIF(ALPAKA_ACC_GPU_HIP_ENABLE)
-	      FOREACH(_file ${ARGN})
-		        IF((${_file} MATCHES "\\.cpp$") OR (${_file} MATCHES "\\.cxx$"))
-		            SET_SOURCE_FILES_PROPERTIES(${_file} PROPERTIES HIP_SOURCE_PROPERTY_FORMAT OBJ)
-		        ENDIF()
-	      ENDFOREACH()
-        IF (CMAKE_VERSION VERSION_LESS 3.9.0)
-            CMAKE_POLICY(SET CMP0023 OLD)   # CUDA_ADD_EXECUTABLE calls TARGET_LINK_LIBRARIES without keywords.
-        ELSE()
-            SET(HIP_LINK_LIBRARIES_KEYWORD "PUBLIC")
-        ENDIF()
-
-	      HIP_ADD_EXECUTABLE(
-		        ${In_Name}
-		        ${ARGN})
-
-    ELSE()
-        ADD_EXECUTABLE(
-            ${In_Name}
-            ${ARGN})
-    ENDIF()
-ENDMACRO()
diff --git a/thirdParty/alpaka/cmake/addLibrary.cmake b/thirdParty/alpaka/cmake/addLibrary.cmake
deleted file mode 100644
index 2d1c497341..0000000000
--- a/thirdParty/alpaka/cmake/addLibrary.cmake
+++ /dev/null
@@ -1,155 +0,0 @@
-#
-# Copyright 2015-2019 Benjamin Worpitz, Maximilian Knespel
-#
-# This file is part of Alpaka.
-#
-# This Source Code Form is subject to the terms of the Mozilla Public
-# License, v. 2.0. If a copy of the MPL was not distributed with this
-# file, You can obtain one at http://mozilla.org/MPL/2.0/.
-#
-
-CMAKE_MINIMUM_REQUIRED(VERSION 3.11.4)
-
-#------------------------------------------------------------------------------
-# Calls CUDA_ADD_LIBRARY or ADD_LIBRARY depending on the enabled alpaka
-# accelerators.
-#
-# ALPAKA_ADD_LIBRARY( cuda_target file0 file1 ... [STATIC | SHARED | MODULE]
-#   [EXCLUDE_FROM_ALL] [OPTIONS <nvcc-flags> ... ] )
-#
-# In order to be compliant with both ADD_LIBRARY and CUDA_ADD_LIBRARY
-# the position of STATIC, SHARED, MODULE, EXCLUDE_FROM_ALL options don't matter.
-# This also means you won't be able to include files with those exact same
-# case-sensitive names.
-# After OPTIONS only nvcc compiler flags are allowed though. And for readiblity
-# and portability you shouldn't completely mix STATIC, ... with the source
-# code filenames!
-# OPTIONS and the arguments thereafter are ignored if not using CUDA, they
-# won't throw an error in that case.
-MACRO(ALPAKA_ADD_LIBRARY libraryName)
-    # CUDA_ADD_LIBRARY( cuda_target file0 file1 ...
-    #                   [STATIC | SHARED | MODULE]
-    #                   [EXCLUDE_FROM_ALL] [OPTIONS <nvcc-flags> ... ] )
-    # add_library( <name> [STATIC | SHARED | MODULE]
-    #              [EXCLUDE_FROM_ALL]
-    #              source1 [source2 ...] )
-
-    # traverse arguments and sort them by option and source files
-    SET( arguments ${ARGN} )
-    SET( optionsEncountered OFF )
-    UNSET( libraryType )
-    UNSET( excludeFromAll )
-    UNSET( optionArguments )
-    FOREACH( argument IN LISTS arguments )
-        # 1.) check for OPTIONS
-        IF( argument STREQUAL "OPTIONS" )
-            IF ( optionsEncountered )
-                MESSAGE( FATAL_ERROR "[ALPAKA_ADD_LIBRARY] OPTIONS subcommand specified more than one time. This is not allowed!" )
-            ELSE()
-                SET( optionsEncountered ON )
-            ENDIF()
-        ENDIF()
-
-        # 2.) check if inside OPTIONS, because then all other checks are
-        # unnecessary although they could give hints about wrong locations
-        # of those subcommands
-        IF( optionsEncountered )
-            LIST( APPEND optionArguments "${argument}" )
-            CONTINUE()
-        ENDIF()
-
-        # 3.) check for libraryType and EXCLUDE_FROM_ALL
-        IF( ( argument STREQUAL "STATIC" ) OR
-            ( argument STREQUAL "SHARED" ) OR
-            ( argument STREQUAL "MODULE" )
-        )
-            IF( DEFINED libraryType )
-                message( FATAL_ERROR "Setting more than one library type option ( STATIC SHARED MODULE ) not allowed!" )
-            ENDIF()
-            set( libraryType ${argument} )
-            CONTINUE()
-        ENDIF()
-        IF( argument STREQUAL "EXCLUDE_FROM_ALL" )
-            SET( excludeFromAll ${argument} )
-            CONTINUE()
-        ENDIF()
-
-        # 4.) ELSE the argument is a file name
-        list( APPEND sourceFileNames "${argument}" )
-    ENDFOREACH()
-    UNSET( optionsEncountered )
-    #message( "libraryType = ${libraryType}" )
-    #message( "sourceFileNames = ${sourceFileNames}" )
-
-    # call add_library or cuda_add_library now
-    IF( ALPAKA_ACC_GPU_CUDA_ENABLE )
-        IF(ALPAKA_CUDA_COMPILER MATCHES "clang")
-            FOREACH( _file ${ARGN} )
-                IF( ( ${_file} MATCHES "\\.cpp$" ) OR
-                    ( ${_file} MATCHES "\\.cxx$" ) OR
-                    ( ${_file} MATCHES "\\.cu$" )
-                )
-                    SET_SOURCE_FILES_PROPERTIES( ${_file} PROPERTIES COMPILE_FLAGS "-x cuda" )
-                ENDIF()
-            ENDFOREACH()
-            ADD_LIBRARY(
-                ${libraryName}
-                ${sourceFileNames}
-                ${libraryType}
-                ${excludeFromAll}
-                ${optionArguments}
-            )
-        ELSE()
-            FOREACH( _file ${ARGN} )
-                IF( ( ${_file} MATCHES "\\.cpp$" ) OR
-                    ( ${_file} MATCHES "\\.cxx$" )
-                )
-                    SET_SOURCE_FILES_PROPERTIES( ${_file} PROPERTIES CUDA_SOURCE_PROPERTY_FORMAT OBJ )
-                ENDIF()
-            ENDFOREACH()
-            IF (CMAKE_VERSION VERSION_LESS 3.9.0)
-                CMAKE_POLICY(SET CMP0023 OLD)   # CUDA_ADD_EXECUTABLE calls TARGET_LINK_LIBRARIES without keywords.
-            ELSE()
-                SET(CUDA_LINK_LIBRARIES_KEYWORD "PUBLIC")
-            ENDIF()
-            CUDA_ADD_LIBRARY(
-                ${libraryName}
-                ${sourceFileNames}
-                ${libraryType}
-                ${excludeFromAll}
-                ${optionArguments}
-            )
-        ENDIF()
-    ELSEIF( ALPAKA_ACC_GPU_HIP_ENABLE )
-            FOREACH( _file ${ARGN} )
-                IF( ( ${_file} MATCHES "\\.cpp$" ) OR
-                    ( ${_file} MATCHES "\\.cxx$" )
-                )
-                    SET_SOURCE_FILES_PROPERTIES( ${_file} PROPERTIES HIP_SOURCE_PROPERTY_FORMAT OBJ )
-                ENDIF()
-            ENDFOREACH()
-            CMAKE_POLICY(SET CMP0023 OLD)   # CUDA_ADD_LIBRARY calls TARGET_LINK_LIBRARIES without keywords.
-            HIP_ADD_LIBRARY(
-                ${libraryName}
-                ${sourceFileNames}
-                ${libraryType}
-                ${excludeFromAll}
-                ${optionArguments}
-            )
-
-    ELSE()
-        #message( "add_library( ${libraryName} ${libraryType} ${excludeFromAll} ${sourceFileNames} )" )
-        ADD_LIBRARY(
-            ${libraryName}
-            ${libraryType}
-            ${excludeFromAll}
-            ${sourceFileNames}
-        )
-    ENDIF()
-
-    # UNSET variables (not sure if necessary)
-    UNSET( libraryType )
-    UNSET( sourceFileNames )
-    UNSET( excludeFromAll )
-    UNSET( optionArguments )
-ENDMACRO()
diff --git a/thirdParty/alpaka/cmake/common.cmake b/thirdParty/alpaka/cmake/common.cmake
deleted file mode 100644
index af212c1c2e..0000000000
--- a/thirdParty/alpaka/cmake/common.cmake
+++ /dev/null
@@ -1,212 +0,0 @@
-#
-# Copyright 2014-2019 Benjamin Worpitz
-#
-# This file is part of Alpaka.
-#
-# This Source Code Form is subject to the terms of the Mozilla Public
-# License, v. 2.0. If a copy of the MPL was not distributed with this
-# file, You can obtain one at http://mozilla.org/MPL/2.0/.
-#
-
-#------------------------------------------------------------------------------
-# Appends every file found below In_RootDir (searched recursively) whose name matches
-# "*.<In_FileExtension>" to the list variable named by Out_FilePathsListVariableName.
-# The caller's variable is set to: <its previous value>;<matched files>.
-# Globbing avoids maintaining an explicit file list, but it also prevents CMake from
-# detecting that it has to be rerun when files are added or removed!
-function(append_recursive_files In_RootDir In_FileExtension Out_FilePathsListVariableName)
-    #message("In_RootDir: ${In_RootDir}")
-    #message("In_FileExtension: ${In_FileExtension}")
-    #message("Out_FilePathsListVariableName: ${Out_FilePathsListVariableName}")
-
-    # Collect the matching files below the root directory.
-    set(globPattern "${In_RootDir}/*.${In_FileExtension}")
-    file(GLOB_RECURSE matchedFilePaths "${globPattern}")
-    #message("matchedFilePaths: ${matchedFilePaths}")
-
-    # Return value: the previous content of the output variable followed by the matches.
-    set(previousFilePaths "${${Out_FilePathsListVariableName}}")
-    set(${Out_FilePathsListVariableName} "${previousFilePaths}" "${matchedFilePaths}" PARENT_SCOPE)
-endfunction()
-
-#------------------------------------------------------------------------------
-# Gets all recursive relative subdirectories.
-#
-# Collects, for every file found recursively below In_RootDir, the directory part of its
-# path relative to In_RootDir, removes duplicates and sets the list variable named by
-# Out_RecursiveRelativeSubDirsVariableName in the caller's scope to
-# <its previous value>;<collected directories>.
-# The output variable is left untouched when the collected list has length zero.
-FUNCTION(append_recursive_relative_subdirs In_RootDir Out_RecursiveRelativeSubDirsVariableName)
-    #MESSAGE("In_RootDir: ${In_RootDir}")
-    # Snapshot the caller's current value before any local variable is written.
-    # A function starts with a copy of the caller's variables, so a local that happens to
-    # share its name with the output variable would otherwise be read back here and the
-    # collected directories would be returned twice. The locals carry a prefix for the same reason.
-    SET(
-        _arrs_previousValue
-        "${${Out_RecursiveRelativeSubDirsVariableName}}")
-
-    # Get all the recursive files with their relative paths.
-    FILE(
-        GLOB_RECURSE
-        _arrs_relativeFiles
-        RELATIVE "${In_RootDir}/" "${In_RootDir}/*")
-    #MESSAGE("_arrs_relativeFiles: ${_arrs_relativeFiles}")
-
-    # Get the paths to all the recursive files.
-    # Create empty list for the case of no subdirectories being present.
-    SET(_arrs_relativeSubDirs)
-    FOREACH(
-        _arrs_relativeFile
-        IN LISTS _arrs_relativeFiles)
-        GET_FILENAME_COMPONENT(
-            _arrs_relativeSubDir
-            "${_arrs_relativeFile}"
-            PATH)
-        LIST(
-            APPEND
-            _arrs_relativeSubDirs
-            "${_arrs_relativeSubDir}")
-    ENDFOREACH()
-    #MESSAGE("_arrs_relativeSubDirs: ${_arrs_relativeSubDirs}")
-
-    # If the list is not empty.
-    LIST(
-        LENGTH
-        _arrs_relativeSubDirs
-        _arrs_relativeSubDirsLength)
-    IF("${_arrs_relativeSubDirsLength}")
-        # Remove duplicates from the list.
-        LIST(
-            REMOVE_DUPLICATES
-            _arrs_relativeSubDirs)
-        #MESSAGE("_arrs_relativeSubDirs: ${_arrs_relativeSubDirs}")
-
-        # Set the return value (append it to the value in the parent scope).
-        #MESSAGE("Out_RecursiveRelativeSubDirsVariableName: ${Out_RecursiveRelativeSubDirsVariableName}")
-        SET(
-            ${Out_RecursiveRelativeSubDirsVariableName}
-            "${_arrs_previousValue}" "${_arrs_relativeSubDirs}"
-            PARENT_SCOPE)
-    ENDIF()
-ENDFUNCTION()
-
-#------------------------------------------------------------------------------
-# Groups the files in the same way the directories are structured.
-#
-# For In_RootDir itself and for every relative sub-directory reported by
-# append_recursive_relative_subdirs, the files matching "*.<In_FileExtension>" directly
-# inside that directory are added to a SOURCE_GROUP. The group name is the directory path
-# with a leading In_SrcGroupIgnorePrefix and a leading "/" removed and every remaining
-# "/" replaced by a backslash.
-FUNCTION(add_recursive_files_to_src_group In_RootDir In_SrcGroupIgnorePrefix In_FileExtension)
-    #MESSAGE("In_RootDir: ${In_RootDir}")
-    #MESSAGE("In_SrcGroupIgnorePrefix: ${In_SrcGroupIgnorePrefix}")
-    #MESSAGE("In_FileExtension: ${In_FileExtension}")
-    SET(recursiveRelativeSubDirs)
-    # Get all recursive subdirectories.
-    append_recursive_relative_subdirs(
-        "${In_RootDir}"
-        recursiveRelativeSubDirs)
-    #MESSAGE("recursiveRelativeSubDirs: ${recursiveRelativeSubDirs}")
-
-    # For each sub-folder and, through the trailing empty item, the folder itself...
-    FOREACH(
-        currentRelativeSubDir
-        IN
-        LISTS recursiveRelativeSubDirs
-        ITEMS "")
-        # Append the current subdirectory to the root (the empty item denotes the root).
-        IF(currentRelativeSubDir STREQUAL "")
-            SET(
-                currentSubDir
-                "${In_RootDir}")
-        ELSE()
-            SET(
-                currentSubDir
-                "${In_RootDir}/${currentRelativeSubDir}")
-        ENDIF()
-        #MESSAGE("currentSubDir: ${currentSubDir}")
-        # Get all the files in this sub-folder.
-        SET(
-            wildcardFilePath
-            "${currentSubDir}/*.${In_FileExtension}")
-        #MESSAGE("wildcardFilePath: ${wildcardFilePath}")
-        FILE(
-            GLOB
-            filesInSubDirList
-            "${wildcardFilePath}")
-        #MESSAGE("filesInSubDirList: ${filesInSubDirList}")
-
-        LIST(
-            LENGTH
-            filesInSubDirList
-            filesInSubDirListLength)
-        IF("${filesInSubDirListLength}")
-            # Group the include files into a project sub-folder analogously to the filesystem hierarchy.
-            SET(
-                groupExpression
-                "${currentSubDir}")
-            #MESSAGE("groupExpression: ${groupExpression}")
-            # Remove the parent directory from the path, but only where it starts the path:
-            # a plain STRING(REPLACE) would also strip occurrences further inside the path.
-            STRING(
-                FIND "${groupExpression}" "${In_SrcGroupIgnorePrefix}"
-                ignorePrefixPosition)
-            IF(ignorePrefixPosition EQUAL 0)
-                STRING(
-                    LENGTH "${In_SrcGroupIgnorePrefix}"
-                    ignorePrefixLength)
-                # A length of -1 takes everything up to the end of the string.
-                STRING(
-                    SUBSTRING "${groupExpression}" "${ignorePrefixLength}" -1
-                    groupExpression)
-            ENDIF()
-            # Remove leading slash.
-            STRING(
-                REGEX REPLACE "^/" ""
-                groupExpression
-                "${groupExpression}")
-            #MESSAGE("groupExpression: ${groupExpression}")
-            # Replace the directory separators in the path to build valid grouping expressions.
-            STRING(
-                REPLACE "/" "\\"
-                groupExpression
-                "${groupExpression}")
-            #MESSAGE("groupExpression: ${groupExpression}")
-            SOURCE_GROUP(
-                "${groupExpression}"
-                FILES ${filesInSubDirList})
-        ENDIF()
-    ENDFOREACH()
-ENDFUNCTION()
-
-#------------------------------------------------------------------------------
-# Gets all files with the given ending in the given directory and recursively below,
-# appends them to the list variable named by Out_FilePathsListVariableName and
-# groups the files in the same way the directories are structured.
-# This makes adding files easier because we do not have to update a list each time a file is added but this prevents CMake from detecting if it should be rerun!
-FUNCTION(append_recursive_files_add_to_src_group In_RootDir In_SrcGroupIgnorePrefix In_FileExtension Out_FilePathsListVariableName)
-    #MESSAGE("In_RootDir: ${In_RootDir}")
-    #MESSAGE("In_SrcGroupIgnorePrefix: ${In_SrcGroupIgnorePrefix}")
-    #MESSAGE("In_FileExtension: ${In_FileExtension}")
-    #MESSAGE("Out_FilePathsListVariableName: ${Out_FilePathsListVariableName}")
-    # We have to use a local variable and give it to the parent because append_recursive_files only gives it to our scope but not the one calling this function.
-    # The local list is seeded with the caller's current value ...
-    SET(
-        allFilePathsList
-        "${${Out_FilePathsListVariableName}}")
-    # ... and append_recursive_files appends the newly found files to it.
-    append_recursive_files(
-        "${In_RootDir}"
-        "${In_FileExtension}"
-        allFilePathsList)
-    #MESSAGE( "allFilePathsList: ${allFilePathsList}" )
-    # Set the return value. allFilePathsList already starts with the caller's previous
-    # value, so it is handed back as is; prepending the previous value once more would
-    # duplicate every entry the caller already had.
-    SET(
-        ${Out_FilePathsListVariableName}
-        "${allFilePathsList}"
-        PARENT_SCOPE)
-
-    add_recursive_files_to_src_group(
-        "${In_RootDir}"
-        "${In_SrcGroupIgnorePrefix}"
-        "${In_FileExtension}")
-ENDFUNCTION()
-
-#------------------------------------------------------------------------------
-# list_add_prefix(<In_Prefix> <In_ListVariableName>)
-# - Replaces the list named by In_ListVariableName (in the caller's scope) with a list
-#   in which In_Prefix is prepended to every item.
-# - Empty items are dropped from the result.
-function(list_add_prefix In_Prefix In_ListVariableName)
-    set(prefixedItems)
-    foreach(entry IN LISTS ${In_ListVariableName})
-        # With CMP0054 set to NEW the quoted operands of the comparison below are
-        # only interpreted as strings, never as variable names or keywords.
-        if(POLICY CMP0054)
-            cmake_policy(SET CMP0054 NEW)
-        endif()
-        if(NOT "${entry}" STREQUAL "")
-            list(APPEND prefixedItems "${In_Prefix}${entry}")
-        endif()
-    endforeach()
-
-    # Overwrite the caller's list with the prefixed one.
-    set(${In_ListVariableName} "${prefixedItems}" PARENT_SCOPE)
-endfunction()
diff --git a/thirdParty/alpaka/cmake/dev.cmake b/thirdParty/alpaka/cmake/dev.cmake
deleted file mode 100644
index e1ca9d99ca..0000000000
--- a/thirdParty/alpaka/cmake/dev.cmake
+++ /dev/null
@@ -1,146 +0,0 @@
-#
-# Copyright 2014-2019 Benjamin Worpitz
-#
-# This file is part of Alpaka.
-#
-# This Source Code Form is subject to the terms of the Mozilla Public
-# License, v. 2.0. If a copy of the MPL was not distributed with this
-# file, You can obtain one at http://mozilla.org/MPL/2.0/.
-#
-
-#-------------------------------------------------------------------------------
-# Compiler settings.
-#-------------------------------------------------------------------------------
-# By marking the boost headers as system headers, warnings produced within them are ignored.
-# Marking the boost headers as system headers does not work for nvcc (FindCUDA always uses -I)
-TARGET_INCLUDE_DIRECTORIES(
-    "alpaka"
-    SYSTEM
-    INTERFACE ${Boost_INCLUDE_DIRS})
-
-# The remainder of this file appends strict warning options for the detected host
-# compiler to the list ALPAKA_DEV_COMPILE_OPTIONS.
-
-# MSVC
-IF(MSVC)
-    # Force to always compile with W4 and WX
-    LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "/W4")
-    LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "/WX")
-    # Improve debugging.
-    IF(CMAKE_BUILD_TYPE MATCHES "Debug")
-        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-d2Zi+")
-    ENDIF()
-    IF(MSVC_VERSION GREATER 1900)
-        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "/permissive-")
-        IF(MSVC_VERSION GREATER 1910)
-            LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "/Zc:twoPhase-")
-        ENDIF()
-    ENDIF()
-    IF(MSVC_VERSION GREATER 1800)
-        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "/Zc:throwingNew" "/Zc:strictStrings")
-    ENDIF()
-ELSE()
-  # The host compiler warning flags below are only added when the CUDA back-end is disabled,
-  # when the CUDA compiler is clang, or when HIP is enabled on the nvcc platform.
-  IF(NOT(ALPAKA_ACC_GPU_CUDA_ENABLE) OR ALPAKA_CUDA_COMPILER MATCHES "clang"
-      OR(ALPAKA_ACC_GPU_HIP_ENABLE AND HIP_PLATFORM MATCHES "nvcc"))
-    # GNU
-    IF(CMAKE_COMPILER_IS_GNUCXX)
-        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wall")
-        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wextra")
-        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-pedantic")
-        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Werror")
-        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wdouble-promotion")
-        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wmissing-include-dirs")
-        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wunknown-pragmas")
-        # Higher levels (max is 5) produce some strange warnings
-        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wstrict-overflow=2")
-        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wtrampolines")
-        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wfloat-equal")
-        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wundef")
-        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wshadow")
-        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wcast-qual")
-        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wcast-align")
-        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wwrite-strings")
-        # Too noisy as it warns for every operation using numeric types smaller than int.
-        # Such values are converted to int implicitly before the calculation is done.
-        # E.g.: uint16_t = uint16_t * uint16_t will trigger the following warning:
-        # conversion to ‘short unsigned int’ from ‘int’ may alter its value
-        #LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wconversion")
-        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wsign-conversion")
-        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wvector-operation-performance")
-        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wzero-as-null-pointer-constant")
-        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wdate-time")
-        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wuseless-cast")
-        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wlogical-op")
-        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wno-aggressive-loop-optimizations")
-        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wmissing-declarations")
-        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wno-multichar")
-        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wopenmp-simd")
-        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wpacked")
-        # Too much noise
-        #LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wpadded")
-        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wredundant-decls")
-        # Too much noise
-        #LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Winline")
-        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wdisabled-optimization")
-        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wformat-nonliteral")
-        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wformat-security")
-        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wformat-y2k")
-        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wctor-dtor-privacy")
-        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wdelete-non-virtual-dtor")
-        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wliteral-suffix")
-        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wnon-virtual-dtor")
-        # This warns about members that have not explicitly been listed in the constructor initializer list.
-        # This could be useful even for members that have a default constructor.
-        # However, it also issues this warning for defaulted constructors.
-        #LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Weffc++")
-        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Woverloaded-virtual")
-        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wsign-promo")
-        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wconditionally-supported")
-        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wnoexcept")
-        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wold-style-cast")
-        IF(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 5.0)
-            LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wsuggest-final-types")
-            LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wsuggest-final-methods")
-            # This does not work correctly as it suggests override to methods that are already marked with final.
-            # Because final implies override, this is not useful.
-            #LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wsuggest-override")
-            LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wnormalized")
-            LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wformat-signedness")
-        ENDIF()
-        IF(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 6.0)
-            LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wnull-dereference")
-            LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wduplicated-cond")
-            LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wsubobject-linkage")
-        ENDIF()
-        IF(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 7.0)
-            # This warning might be useful but it is triggered by compile-time code where it does not make any sense:
-            # E.g. "vec::Vec<dim::DimInt<(TidxDimOut < TidxDimIn) ? TidxDimIn : TidxDimOut>, TElem>" when both values are equal
-            #LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wduplicated-branches")
-            LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Walloc-zero")
-            LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Walloca")
-        ENDIF()
-        IF(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 8.0)
-            LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wcast-align=strict")
-        ENDIF()
-
-    # Clang or AppleClang
-    ELSEIF(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
-        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Werror")
-        # Weverything really means everything (including Wall, Wextra, pedantic, ...)
-        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Weverything")
-        # We are not C++98 compatible (we use C++11 features)
-        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wno-c++98-compat")
-        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wno-c++98-compat-pedantic")
-        # The following warnings are triggered by all instantiations of BOOST_AUTO_TEST_SUITE
-        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wno-disabled-macro-expansion")
-        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wno-global-constructors")
-        # This padding warning is generated by the execution tasks depending on the argument types
-        # as they are stored as members. Therefore, the padding warning is triggered by the calling code
-        # and does not indicate a failure within alpaka.
-        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wno-padded")
-    # ICC
-    # The variable is named without ${} (as in the Clang branch above): an explicit expansion
-    # leaves IF() without a left operand when the compiler id is empty.
-    ELSEIF(CMAKE_CXX_COMPILER_ID STREQUAL "Intel")
-        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wall")
-    # PGI
-    ELSEIF(CMAKE_CXX_COMPILER_ID STREQUAL "PGI")
-        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Minform=inform")
-    ENDIF()
-  ENDIF()
-ENDIF()
diff --git a/thirdParty/alpaka/cmake/modules/FindHIP.cmake b/thirdParty/alpaka/cmake/modules/FindHIP.cmake
deleted file mode 100644
index dd55e18228..0000000000
--- a/thirdParty/alpaka/cmake/modules/FindHIP.cmake
+++ /dev/null
@@ -1,601 +0,0 @@
-# /*
-# Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
-
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
-
-# The above copyright notice and this permission notice shall be included in
-# all copies or substantial portions of the Software.
-
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-# THE SOFTWARE.
-# */
-
-###############################################################################
-# FindHIP.cmake
-###############################################################################
-
-###############################################################################
-# SET: Variable defaults
-###############################################################################
-# User defined flags
-# Each flag set is an (initially empty) advanced cache entry: one global variant per
-# compiler driver (HIPCC, HCC, NVCC) plus one per build configuration, e.g. HIP_HIPCC_FLAGS_DEBUG.
-set(HIP_HIPCC_FLAGS "" CACHE STRING "Semicolon delimited flags for HIPCC")
-set(HIP_HCC_FLAGS "" CACHE STRING "Semicolon delimited flags for HCC")
-set(HIP_NVCC_FLAGS "" CACHE STRING "Semicolon delimited flags for NVCC")
-mark_as_advanced(HIP_HIPCC_FLAGS HIP_HCC_FLAGS HIP_NVCC_FLAGS)
-# Configurations to create per-configuration flags for: the generator's configurations,
-# the current build type and the four standard CMake configurations, without duplicates.
-set(_hip_configuration_types ${CMAKE_CONFIGURATION_TYPES} ${CMAKE_BUILD_TYPE} Debug MinSizeRel Release RelWithDebInfo)
-list(REMOVE_DUPLICATES _hip_configuration_types)
-foreach(config ${_hip_configuration_types})
-    string(TOUPPER ${config} config_upper)
-    set(HIP_HIPCC_FLAGS_${config_upper} "" CACHE STRING "Semicolon delimited flags for HIPCC")
-    set(HIP_HCC_FLAGS_${config_upper} "" CACHE STRING "Semicolon delimited flags for HCC")
-    set(HIP_NVCC_FLAGS_${config_upper} "" CACHE STRING "Semicolon delimited flags for NVCC")
-    mark_as_advanced(HIP_HIPCC_FLAGS_${config_upper} HIP_HCC_FLAGS_${config_upper} HIP_NVCC_FLAGS_${config_upper})
-endforeach()
-option(HIP_HOST_COMPILATION_CPP "Host code compilation mode" ON)
-option(HIP_VERBOSE_BUILD "Print out the commands run while compiling the HIP source file.  With the Makefile generator this defaults to VERBOSE variable specified on the command line, but can be forced on with this option." OFF)
-mark_as_advanced(HIP_HOST_COMPILATION_CPP)
-
-###############################################################################
-# Set HIP CMAKE Flags
-###############################################################################
-# Copy the invocation styles from CXX to HIP
-# Every CMAKE_*_HIP_* / CMAKE_HIP_* variable below takes the current value of its CXX counterpart.
-set(CMAKE_HIP_ARCHIVE_CREATE ${CMAKE_CXX_ARCHIVE_CREATE})
-set(CMAKE_HIP_ARCHIVE_APPEND ${CMAKE_CXX_ARCHIVE_APPEND})
-set(CMAKE_HIP_ARCHIVE_FINISH ${CMAKE_CXX_ARCHIVE_FINISH})
-set(CMAKE_SHARED_LIBRARY_SONAME_HIP_FLAG ${CMAKE_SHARED_LIBRARY_SONAME_CXX_FLAG})
-set(CMAKE_SHARED_LIBRARY_CREATE_HIP_FLAGS ${CMAKE_SHARED_LIBRARY_CREATE_CXX_FLAGS})
-set(CMAKE_SHARED_LIBRARY_HIP_FLAGS ${CMAKE_SHARED_LIBRARY_CXX_FLAGS})
-#set(CMAKE_SHARED_LIBRARY_LINK_HIP_FLAGS ${CMAKE_SHARED_LIBRARY_LINK_CXX_FLAGS})
-set(CMAKE_SHARED_LIBRARY_RUNTIME_HIP_FLAG ${CMAKE_SHARED_LIBRARY_RUNTIME_CXX_FLAG})
-set(CMAKE_SHARED_LIBRARY_RUNTIME_HIP_FLAG_SEP ${CMAKE_SHARED_LIBRARY_RUNTIME_CXX_FLAG_SEP})
-set(CMAKE_SHARED_LIBRARY_LINK_STATIC_HIP_FLAGS ${CMAKE_SHARED_LIBRARY_LINK_STATIC_CXX_FLAGS})
-set(CMAKE_SHARED_LIBRARY_LINK_DYNAMIC_HIP_FLAGS ${CMAKE_SHARED_LIBRARY_LINK_DYNAMIC_CXX_FLAGS})
-
-# Set the CMake flags to use the HCC compiler.
-# The three rule templates prepend the linker helper and ${HCC_PATH} to the link line;
-# the <...> placeholders are left in the string as written.
-# NOTE(review): ${HIP_HIPCC_CMAKE_LINKER_HELPER} and ${HCC_PATH} are expanded right here, but
-# the find_program() for the linker helper only appears further down in this file and HCC_PATH
-# is not set in the visible part of it. Presumably the values come from the cache of an earlier
-# run or from the including project - confirm.
-set(CMAKE_HIP_CREATE_SHARED_LIBRARY "${HIP_HIPCC_CMAKE_LINKER_HELPER} ${HCC_PATH} <CMAKE_SHARED_LIBRARY_CXX_FLAGS> <LANGUAGE_COMPILE_FLAGS> <LINK_FLAGS> <CMAKE_SHARED_LIBRARY_CREATE_CXX_FLAGS> <SONAME_FLAG><TARGET_SONAME> -o <TARGET> <OBJECTS> <LINK_LIBRARIES>")
-set(CMAKE_HIP_CREATE_SHARED_MODULE "${HIP_HIPCC_CMAKE_LINKER_HELPER} ${HCC_PATH} <CMAKE_CXX_LINK_FLAGS> <LINK_FLAGS> <OBJECTS> <SONAME_FLAG><TARGET_SONAME> -o <TARGET> <LINK_LIBRARIES> -shared" )
-set(CMAKE_HIP_LINK_EXECUTABLE "${HIP_HIPCC_CMAKE_LINKER_HELPER} ${HCC_PATH} <FLAGS> <CMAKE_CXX_LINK_FLAGS> <LINK_FLAGS> <OBJECTS> -o <TARGET> <LINK_LIBRARIES>")
-
-###############################################################################
-# FIND: HIP and associated helper binaries
-###############################################################################
-# HIP is supported on Linux only
-# On any other platform nothing in this section runs, so none of the HIP_* result
-# variables below is set by it.
-if(UNIX AND NOT APPLE AND NOT CYGWIN)
-    # Search for HIP installation
-    # Skipped entirely when HIP_ROOT_DIR was already provided (e.g. by the user or the cache).
-    if(NOT HIP_ROOT_DIR)
-        # Search in user specified path first
-        # (only the ROCM_PATH and HIP_PATH environment variables are consulted here)
-        find_path(
-            HIP_ROOT_DIR
-            NAMES hipconfig
-            PATHS
-            ENV ROCM_PATH
-            ENV HIP_PATH
-            PATH_SUFFIXES bin
-            DOC "HIP installed location"
-            NO_DEFAULT_PATH
-            )
-        # Now search in default path
-        # (find_path does not search again once the result variable holds a found path)
-        find_path(
-            HIP_ROOT_DIR
-            NAMES hipconfig
-            PATHS
-            /opt/rocm
-            /opt/rocm/hip
-            PATH_SUFFIXES bin
-            DOC "HIP installed location"
-            )
-
-        # Check if we found HIP installation
-        if(HIP_ROOT_DIR)
-            # If so, fix the path
-            # find_path returns the directory containing hipconfig; strip a trailing
-            # "bin" component (optionally followed by '6'/'4' characters) to get the install root.
-            string(REGEX REPLACE "[/\\\\]?bin[64]*[/\\\\]?$" "" HIP_ROOT_DIR ${HIP_ROOT_DIR})
-            # And push it back to the cache
-            set(HIP_ROOT_DIR ${HIP_ROOT_DIR} CACHE PATH "HIP installed location" FORCE)
-        endif()
-        # Fatal for find_package(HIP REQUIRED), a plain message unless QUIET was requested.
-        if(NOT EXISTS ${HIP_ROOT_DIR})
-            if(HIP_FIND_REQUIRED)
-                message(FATAL_ERROR "Specify HIP_ROOT_DIR")
-            elseif(NOT HIP_FIND_QUIETLY)
-                message("HIP_ROOT_DIR not found or specified")
-            endif()
-        endif()
-    endif()
-
-    # Find HIPCC executable
-    # First only in the explicitly listed locations ...
-    find_program(
-        HIP_HIPCC_EXECUTABLE
-        NAMES hipcc
-        PATHS
-        "${HIP_ROOT_DIR}"
-        ENV ROCM_PATH
-        ENV HIP_PATH
-        /opt/rocm
-        /opt/rocm/hip
-        PATH_SUFFIXES bin
-        NO_DEFAULT_PATH
-        )
-    if(NOT HIP_HIPCC_EXECUTABLE)
-        # Now search in default paths
-        find_program(HIP_HIPCC_EXECUTABLE hipcc)
-    endif()
-    mark_as_advanced(HIP_HIPCC_EXECUTABLE)
-
-    # Find HIPCONFIG executable
-    # Same two-step search as for hipcc.
-    find_program(
-        HIP_HIPCONFIG_EXECUTABLE
-        NAMES hipconfig
-        PATHS
-        "${HIP_ROOT_DIR}"
-        ENV ROCM_PATH
-        ENV HIP_PATH
-        /opt/rocm
-        /opt/rocm/hip
-        PATH_SUFFIXES bin
-        NO_DEFAULT_PATH
-        )
-    if(NOT HIP_HIPCONFIG_EXECUTABLE)
-        # Now search in default paths
-        find_program(HIP_HIPCONFIG_EXECUTABLE hipconfig)
-    endif()
-    mark_as_advanced(HIP_HIPCONFIG_EXECUTABLE)
-
-    # Find HIPCC_CMAKE_LINKER_HELPER executable
-    # Same two-step search as for hipcc.
-    find_program(
-        HIP_HIPCC_CMAKE_LINKER_HELPER
-        NAMES hipcc_cmake_linker_helper
-        PATHS
-        "${HIP_ROOT_DIR}"
-        ENV ROCM_PATH
-        ENV HIP_PATH
-        /opt/rocm
-        /opt/rocm/hip
-        PATH_SUFFIXES bin
-        NO_DEFAULT_PATH
-        )
-    if(NOT HIP_HIPCC_CMAKE_LINKER_HELPER)
-        # Now search in default paths
-        find_program(HIP_HIPCC_CMAKE_LINKER_HELPER hipcc_cmake_linker_helper)
-    endif()
-    mark_as_advanced(HIP_HIPCC_CMAKE_LINKER_HELPER)
-
-    # Query the version only once; a HIP_VERSION that is already set is kept.
-    if(HIP_HIPCONFIG_EXECUTABLE AND NOT HIP_VERSION)
-        # Compute the version
-        execute_process(
-            COMMAND ${HIP_HIPCONFIG_EXECUTABLE} --version
-            OUTPUT_VARIABLE _hip_version
-            ERROR_VARIABLE _hip_error
-            OUTPUT_STRIP_TRAILING_WHITESPACE
-            ERROR_STRIP_TRAILING_WHITESPACE
-            )
-        # Any output on stderr is treated as failure and yields the placeholder "0.0.0".
-        # NOTE(review): the first doc string says "hipcc" although the value comes from
-        # `hipconfig --version` - confirm the intended wording.
-        if(NOT _hip_error)
-            set(HIP_VERSION ${_hip_version} CACHE STRING "Version of HIP as computed from hipcc")
-        else()
-            set(HIP_VERSION "0.0.0" CACHE STRING "Version of HIP as computed by FindHIP()")
-        endif()
-        mark_as_advanced(HIP_VERSION)
-    endif()
-    if(HIP_VERSION)
-        # Split "major.minor.patch" at the dots.
-        # NOTE(review): list(GET ... 1) and list(GET ... 2) assume at least three
-        # dot-separated components in HIP_VERSION - confirm that hipconfig always prints them.
-        string(REPLACE "." ";" _hip_version_list "${HIP_VERSION}")
-        list(GET _hip_version_list 0 HIP_VERSION_MAJOR)
-        list(GET _hip_version_list 1 HIP_VERSION_MINOR)
-        list(GET _hip_version_list 2 HIP_VERSION_PATCH)
-        set(HIP_VERSION_STRING "${HIP_VERSION}")
-    endif()
-
-    # Query the platform only once; a HIP_PLATFORM that is already set is kept.
-    if(HIP_HIPCONFIG_EXECUTABLE AND NOT HIP_PLATFORM)
-        # Compute the platform
-        execute_process(
-            COMMAND ${HIP_HIPCONFIG_EXECUTABLE} --platform
-            OUTPUT_VARIABLE _hip_platform
-            OUTPUT_STRIP_TRAILING_WHITESPACE
-            )
-        set(HIP_PLATFORM ${_hip_platform} CACHE STRING "HIP platform as computed by hipconfig")
-        mark_as_advanced(HIP_PLATFORM)
-    endif()
-endif()
-
-# Report the result of the search in the standard find-module way: the package counts as
-# found only if all REQUIRED_VARS are valid, and HIP_VERSION is used for version checks.
-include(FindPackageHandleStandardArgs)
-find_package_handle_standard_args(HIP
-    REQUIRED_VARS HIP_ROOT_DIR HIP_HIPCC_EXECUTABLE HIP_HIPCONFIG_EXECUTABLE HIP_PLATFORM
-    VERSION_VAR HIP_VERSION)
-
-###############################################################################
-# MACRO: Locate helper files
-###############################################################################
-# HIP_FIND_HELPER_FILE(<_name> <_extension>)
-# Expects the helper file "<_name>.<_extension>" in the directory "FindHIP" next to the
-# file currently being processed and stores its full path in the internal cache entry
-# HIP_<_name>. A missing file is a fatal error for find_package(HIP REQUIRED) and a status
-# message otherwise (unless QUIET was requested); the cache entry is written in either case.
-macro(HIP_FIND_HELPER_FILE _name _extension)
-    set(_hip_full_name "${_name}.${_extension}")
-    # Recomputed from CMAKE_CURRENT_LIST_FILE; being inside a macro, this assignment is
-    # visible to the caller as well.
-    get_filename_component(CMAKE_CURRENT_LIST_DIR "${CMAKE_CURRENT_LIST_FILE}" PATH)
-    set(HIP_${_name} "${CMAKE_CURRENT_LIST_DIR}/FindHIP/${_hip_full_name}")
-    if(NOT EXISTS "${HIP_${_name}}")
-        set(error_message "${_hip_full_name} not found in ${CMAKE_CURRENT_LIST_DIR}/FindHIP")
-        if(HIP_FIND_REQUIRED)
-            message(FATAL_ERROR "${error_message}")
-        else()
-            if(NOT HIP_FIND_QUIETLY)
-                message(STATUS "${error_message}")
-            endif()
-        endif()
-    endif()
-    # Set this variable as internal, so the user isn't bugged with it.
-    # The doc string names the helper via _hip_full_name, the variable set at the top of this macro.
-    set(HIP_${_name} ${HIP_${_name}} CACHE INTERNAL "Location of ${_hip_full_name}" FORCE)
-endmacro()
-
-###############################################################################
-# Locate FindHIP/run_make2cmake.cmake and FindHIP/run_hipcc.cmake next to this file;
-# their paths are stored in HIP_run_make2cmake and HIP_run_hipcc respectively.
-hip_find_helper_file(run_make2cmake cmake)
-hip_find_helper_file(run_hipcc cmake)
-###############################################################################
-
-###############################################################################
-# MACRO: Reset compiler flags
-###############################################################################
-# HIP_RESET_FLAGS()
-# Unsets the HIPCC/HCC/NVCC flag variables, both the per-configuration variants (one per
-# entry of _hip_configuration_types) and the configuration-independent ones.
-# NOTE(review): unset() is called without CACHE, so cache entries of the same name are
-# presumably left in place - confirm.
-macro(HIP_RESET_FLAGS)
-    # Per-configuration variants, e.g. HIP_HIPCC_FLAGS_DEBUG.
-    foreach(config ${_hip_configuration_types})
-        string(TOUPPER ${config} config_upper)
-        unset(HIP_NVCC_FLAGS_${config_upper})
-        unset(HIP_HCC_FLAGS_${config_upper})
-        unset(HIP_HIPCC_FLAGS_${config_upper})
-    endforeach()
-
-    # Configuration-independent variants.
-    unset(HIP_NVCC_FLAGS)
-    unset(HIP_HCC_FLAGS)
-    unset(HIP_HIPCC_FLAGS)
-endmacro()
-
-###############################################################################
-# MACRO: Separate the options from the sources
-###############################################################################
-# HIP_GET_SOURCES_AND_OPTIONS(<_sources> <_cmake_options> <_hipcc_options> <_hcc_options> <_nvcc_options> args...)
-# Sorts the remaining arguments (ARGN) into the five list variables whose names are passed in:
-#   - EXCLUDE_FROM_ALL, STATIC, SHARED and MODULE go to <_cmake_options>;
-#   - everything after HIPCC_OPTIONS / HCC_OPTIONS / NVCC_OPTIONS goes to the matching
-#     options list until another of these three keywords is seen;
-#   - everything else seen before the first such keyword is treated as a source file.
-# Being a macro, the _*_found_options helper variables remain set in the caller's scope.
-macro(HIP_GET_SOURCES_AND_OPTIONS _sources _cmake_options _hipcc_options _hcc_options _nvcc_options)
-    # Start from empty result lists.
-    set(${_sources})
-    set(${_cmake_options})
-    set(${_hipcc_options})
-    set(${_hcc_options})
-    set(${_nvcc_options})
-    # At most one of these is TRUE at a time: it records which options keyword was seen last.
-    set(_hipcc_found_options FALSE)
-    set(_hcc_found_options FALSE)
-    set(_nvcc_found_options FALSE)
-    foreach(arg ${ARGN})
-        # The "x" prefix on both sides keeps if() from interpreting the argument text
-        # as a variable name or keyword.
-        if("x${arg}" STREQUAL "xHIPCC_OPTIONS")
-            set(_hipcc_found_options TRUE)
-            set(_hcc_found_options FALSE)
-            set(_nvcc_found_options FALSE)
-        elseif("x${arg}" STREQUAL "xHCC_OPTIONS")
-            set(_hipcc_found_options FALSE)
-            set(_hcc_found_options TRUE)
-            set(_nvcc_found_options FALSE)
-        elseif("x${arg}" STREQUAL "xNVCC_OPTIONS")
-            set(_hipcc_found_options FALSE)
-            set(_hcc_found_options FALSE)
-            set(_nvcc_found_options TRUE)
-        elseif(
-                "x${arg}" STREQUAL "xEXCLUDE_FROM_ALL" OR
-                "x${arg}" STREQUAL "xSTATIC" OR
-                "x${arg}" STREQUAL "xSHARED" OR
-                "x${arg}" STREQUAL "xMODULE"
-                )
-            # These keywords are collected wherever they appear, even inside an options section.
-            list(APPEND ${_cmake_options} ${arg})
-        else()
-            if(_hipcc_found_options)
-                list(APPEND ${_hipcc_options} ${arg})
-            elseif(_hcc_found_options)
-                list(APPEND ${_hcc_options} ${arg})
-            elseif(_nvcc_found_options)
-                list(APPEND ${_nvcc_options} ${arg})
-            else()
-                # Assume this is a file
-                list(APPEND ${_sources} ${arg})
-            endif()
-        endif()
-    endforeach()
-endmacro()
-
-###############################################################################
-# MACRO: Add include directories to pass to the hipcc command
-###############################################################################
-# List of user supplied include arguments; starts out empty each time this file is processed.
-set(HIP_HIPCC_INCLUDE_ARGS_USER "")
-# HIP_INCLUDE_DIRECTORIES(dirs...)
-# Appends one entry per directory to HIP_HIPCC_INCLUDE_ARGS_USER in the caller's scope.
-macro(HIP_INCLUDE_DIRECTORIES)
-    foreach(dir ${ARGN})
-        # Generator expression: yields "-I<dir>" when <dir> evaluates to true and nothing otherwise.
-        list(APPEND HIP_HIPCC_INCLUDE_ARGS_USER $<$<BOOL:${dir}>:-I${dir}>)
-    endforeach()
-endmacro()
-
-###############################################################################
-# FUNCTION: Helper to avoid clashes of files with the same basename but different paths
-###############################################################################
-# HIP_COMPUTE_BUILD_PATH(<path> <build_path>)
-# Turns the file path <path> into a relative directory name and stores it in the variable
-# named by <build_path> in the caller's scope. Absolute paths are made relative to the
-# current binary directory when they start with it, otherwise to the current source
-# directory; afterwards ':' and ' ' become '_', "../" becomes "__/" and the file name is
-# stripped.
-function(HIP_COMPUTE_BUILD_PATH path build_path)
-    # Work on a CMake style (forward slash) copy of the input.
-    file(TO_CMAKE_PATH "${path}" _hip_rel_path)
-
-    if(IS_ABSOLUTE "${_hip_rel_path}")
-        # Pick the base directory: the binary dir if the path starts with it, else the source dir.
-        string(FIND "${_hip_rel_path}" "${CMAKE_CURRENT_BINARY_DIR}" _hip_prefix_at)
-        if(_hip_prefix_at EQUAL 0)
-            set(_hip_base_dir "${CMAKE_CURRENT_BINARY_DIR}")
-        else()
-            set(_hip_base_dir "${CMAKE_CURRENT_SOURCE_DIR}")
-        endif()
-        file(RELATIVE_PATH _hip_rel_path "${_hip_base_dir}" "${_hip_rel_path}")
-    endif()
-
-    # Sanitize: no leading '/', no ':', no upward "../" steps, no spaces.
-    string(REGEX REPLACE "^[/]+" "" _hip_rel_path "${_hip_rel_path}")
-    string(REPLACE ":" "_" _hip_rel_path "${_hip_rel_path}")
-    string(REPLACE "../" "__/" _hip_rel_path "${_hip_rel_path}")
-    string(REPLACE " " "_" _hip_rel_path "${_hip_rel_path}")
-
-    # Keep only the directory part and hand it to the caller.
-    get_filename_component(_hip_rel_path "${_hip_rel_path}" PATH)
-    set(${build_path} "${_hip_rel_path}" PARENT_SCOPE)
-endfunction()
-
-###############################################################################
-# MACRO: Parse OPTIONS from ARGN & set variables prefixed by _option_prefix
-###############################################################################
-macro(HIP_PARSE_HIPCC_OPTIONS _option_prefix)
-    set(_hip_found_config)
-    foreach(arg ${ARGN})
-        # Determine if we are dealing with a per-configuration flag
-        foreach(config ${_hip_configuration_types})
-            string(TOUPPER ${config} config_upper)
-            if(arg STREQUAL "${config_upper}")
-                set(_hip_found_config _${arg})
-                # Clear arg to prevent it from being processed anymore
-                set(arg)
-            endif()
-        endforeach()
-        if(arg)
-            list(APPEND ${_option_prefix}${_hip_found_config} "${arg}")
-        endif()
-    endforeach()
-endmacro()
-
-###############################################################################
-# MACRO: Try and include dependency file if it exists
-###############################################################################
-macro(HIP_INCLUDE_HIPCC_DEPENDENCIES dependency_file)
-    set(HIP_HIPCC_DEPEND)
-    set(HIP_HIPCC_DEPEND_REGENERATE FALSE)
-
-    # Create the dependency file if it doesn't exist
-    if(NOT EXISTS ${dependency_file})
-        file(WRITE ${dependency_file} "# Generated by: FindHIP.cmake. Do not edit.\n")
-    endif()
-    # Include the dependency file
-    include(${dependency_file})
-
-    # Verify the existence of all the included files
-    if(HIP_HIPCC_DEPEND)
-        foreach(f ${HIP_HIPCC_DEPEND})
-            if(NOT EXISTS ${f})
-                # If they aren't there, regenerate the file again
-                set(HIP_HIPCC_DEPEND_REGENERATE TRUE)
-            endif()
-        endforeach()
-    else()
-        # No dependencies, so regenerate the file
-        set(HIP_HIPCC_DEPEND_REGENERATE TRUE)
-    endif()
-
-    # Regenerate the dependency file if needed
-    if(HIP_HIPCC_DEPEND_REGENERATE)
-        set(HIP_HIPCC_DEPEND ${dependency_file})
-        file(WRITE ${dependency_file} "# Generated by: FindHIP.cmake. Do not edit.\n")
-    endif()
-endmacro()
-
-###############################################################################
-# MACRO: Prepare cmake commands for the target
-###############################################################################
-macro(HIP_PREPARE_TARGET_COMMANDS _target _format _generated_files _source_files)
-    set(_hip_flags "")
-    string(TOUPPER "${CMAKE_BUILD_TYPE}" _hip_build_configuration)
-    if(HIP_HOST_COMPILATION_CPP)
-        set(HIP_C_OR_CXX CXX)
-    else()
-        set(HIP_C_OR_CXX C)
-    endif()
-    set(generated_extension ${CMAKE_${HIP_C_OR_CXX}_OUTPUT_EXTENSION})
-
-    # Initialize list of includes with those specified by the user. Append with
-    # ones specified to cmake directly.
-    set(HIP_HIPCC_INCLUDE_ARGS ${HIP_HIPCC_INCLUDE_ARGS_USER})
-
-    # Add the include directories
-    set(include_directories_generator "$<TARGET_PROPERTY:${_target},INCLUDE_DIRECTORIES>")
-    list(APPEND HIP_HIPCC_INCLUDE_ARGS "$<$<BOOL:${include_directories_generator}>:-I$<JOIN:${include_directories_generator}, -I>>")
-
-    get_directory_property(_hip_include_directories INCLUDE_DIRECTORIES)
-    list(REMOVE_DUPLICATES _hip_include_directories)
-    if(_hip_include_directories)
-        foreach(dir ${_hip_include_directories})
-            list(APPEND HIP_HIPCC_INCLUDE_ARGS $<$<BOOL:${dir}>:-I${dir}>)
-        endforeach()
-    endif()
-
-    HIP_GET_SOURCES_AND_OPTIONS(_hip_sources _hip_cmake_options _hipcc_options _hcc_options _nvcc_options ${ARGN})
-    HIP_PARSE_HIPCC_OPTIONS(HIP_HIPCC_FLAGS ${_hipcc_options})
-    HIP_PARSE_HIPCC_OPTIONS(HIP_HCC_FLAGS ${_hcc_options})
-    HIP_PARSE_HIPCC_OPTIONS(HIP_NVCC_FLAGS ${_nvcc_options})
-
-    # Add the compile definitions
-    set(compile_definition_generator "$<TARGET_PROPERTY:${_target},COMPILE_DEFINITIONS>")
-    list(APPEND HIP_HIPCC_FLAGS "$<$<BOOL:${compile_definition_generator}>:-D$<JOIN:${compile_definition_generator}, -D>>")
-
-    # Check if we are building shared library.
-    set(_hip_build_shared_libs FALSE)
-    list(FIND _hip_cmake_options SHARED _hip_found_SHARED)
-    list(FIND _hip_cmake_options MODULE _hip_found_MODULE)
-    if(_hip_found_SHARED GREATER -1 OR _hip_found_MODULE GREATER -1)
-        set(_hip_build_shared_libs TRUE)
-    endif()
-    list(FIND _hip_cmake_options STATIC _hip_found_STATIC)
-    if(_hip_found_STATIC GREATER -1)
-        set(_hip_build_shared_libs FALSE)
-    endif()
-
-    # If we are building a shared library, add extra flags to HIP_HIPCC_FLAGS
-    if(_hip_build_shared_libs)
-        list(APPEND HIP_HCC_FLAGS "-fPIC")
-        list(APPEND HIP_NVCC_FLAGS "--shared -Xcompiler '-fPIC'")
-    endif()
-
-    # Set host compiler
-    set(HIP_HOST_COMPILER "${CMAKE_${HIP_C_OR_CXX}_COMPILER}")
-
-    # Set compiler flags
-    set(_HIP_HOST_FLAGS "set(CMAKE_HOST_FLAGS ${CMAKE_${HIP_C_OR_CXX}_FLAGS})")
-    set(_HIP_HIPCC_FLAGS "set(HIP_HIPCC_FLAGS ${HIP_HIPCC_FLAGS})")
-    set(_HIP_HCC_FLAGS "set(HIP_HCC_FLAGS ${HIP_HCC_FLAGS})")
-    set(_HIP_NVCC_FLAGS "set(HIP_NVCC_FLAGS ${HIP_NVCC_FLAGS})")
-    foreach(config ${_hip_configuration_types})
-        string(TOUPPER ${config} config_upper)
-        set(_HIP_HOST_FLAGS "${_HIP_HOST_FLAGS}\nset(CMAKE_HOST_FLAGS_${config_upper} ${CMAKE_${HIP_C_OR_CXX}_FLAGS_${config_upper}})")
-        set(_HIP_HIPCC_FLAGS "${_HIP_HIPCC_FLAGS}\nset(HIP_HIPCC_FLAGS_${config_upper} ${HIP_HIPCC_FLAGS_${config_upper}})")
-        set(_HIP_HCC_FLAGS "${_HIP_HCC_FLAGS}\nset(HIP_HCC_FLAGS_${config_upper} ${HIP_HCC_FLAGS_${config_upper}})")
-        set(_HIP_NVCC_FLAGS "${_HIP_NVCC_FLAGS}\nset(HIP_NVCC_FLAGS_${config_upper} ${HIP_NVCC_FLAGS_${config_upper}})")
-    endforeach()
-
-    # Reset the output variable
-    set(_hip_generated_files "")
-    set(_hip_source_files "")
-
-    # Iterate over all arguments and create custom commands for all source files
-    foreach(file ${ARGN})
-        # Ignore any file marked as a HEADER_FILE_ONLY
-        get_source_file_property(_is_header ${file} HEADER_FILE_ONLY)
-        # Allow per source file overrides of the format. Also allows compiling non .cu files.
-        get_source_file_property(_hip_source_format ${file} HIP_SOURCE_PROPERTY_FORMAT)
-        if((${file} MATCHES "\\.cu$" OR _hip_source_format) AND NOT _is_header)
-            set(host_flag FALSE)
-        else()
-            set(host_flag TRUE)
-        endif()
-
-        if(NOT host_flag)
-            # Determine output directory
-            HIP_COMPUTE_BUILD_PATH("${file}" hip_build_path)
-            set(hip_compile_output_dir "${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/${_target}.dir/${hip_build_path}")
-
-            get_filename_component(basename ${file} NAME)
-            set(generated_file_path "${hip_compile_output_dir}/${CMAKE_CFG_INTDIR}")
-            set(generated_file_basename "${_target}_generated_${basename}${generated_extension}")
-
-            # Set file names
-            set(generated_file "${generated_file_path}/${generated_file_basename}")
-            set(cmake_dependency_file "${hip_compile_output_dir}/${generated_file_basename}.depend")
-            set(custom_target_script_pregen "${hip_compile_output_dir}/${generated_file_basename}.cmake.pre-gen")
-            set(custom_target_script "${hip_compile_output_dir}/${generated_file_basename}.cmake")
-
-            # Set properties for object files
-            set_source_files_properties("${generated_file}"
-                PROPERTIES
-                EXTERNAL_OBJECT true # This is an object file not to be compiled, but only be linked
-                )
-
-            # Don't add CMAKE_CURRENT_SOURCE_DIR if the path is already an absolute path
-            get_filename_component(file_path "${file}" PATH)
-            if(IS_ABSOLUTE "${file_path}")
-                set(source_file "${file}")
-            else()
-                set(source_file "${CMAKE_CURRENT_SOURCE_DIR}/${file}")
-            endif()
-
-            # Bring in the dependencies
-            HIP_INCLUDE_HIPCC_DEPENDENCIES(${cmake_dependency_file})
-
-            # Configure the build script
-            configure_file("${HIP_run_hipcc}" "${custom_target_script_pregen}" @ONLY)
-            file(GENERATE
-                OUTPUT "${custom_target_script}"
-                INPUT "${custom_target_script_pregen}"
-                )
-            set(main_dep DEPENDS ${source_file})
-            if(CMAKE_GENERATOR MATCHES "Makefiles")
-                set(verbose_output "$(VERBOSE)")
-            elseif(HIP_VERBOSE_BUILD)
-                set(verbose_output ON)
-            else()
-                set(verbose_output OFF)
-            endif()
-
-            # Create up the comment string
-            file(RELATIVE_PATH generated_file_relative_path "${CMAKE_BINARY_DIR}" "${generated_file}")
-            set(hip_build_comment_string "Building HIPCC object ${generated_file_relative_path}")
-
-            # Build the generated file and dependency file
-            add_custom_command(
-                OUTPUT ${generated_file}
-                # These output files depend on the source_file and the contents of cmake_dependency_file
-                ${main_dep}
-                DEPENDS ${HIP_HIPCC_DEPEND}
-                DEPENDS ${custom_target_script}
-                # Make sure the output directory exists before trying to write to it.
-                COMMAND ${CMAKE_COMMAND} -E make_directory "${generated_file_path}"
-                COMMAND ${CMAKE_COMMAND} ARGS
-                -D verbose:BOOL=${verbose_output}
-                -D build_configuration:STRING=${_hip_build_configuration}
-                -D "generated_file:STRING=${generated_file}"
-                -P "${custom_target_script}"
-                WORKING_DIRECTORY "${hip_compile_output_dir}"
-                COMMENT "${hip_build_comment_string}"
-                )
-
-            # Make sure the build system knows the file is generated
-            set_source_files_properties(${generated_file} PROPERTIES GENERATED TRUE)
-            list(APPEND _hip_generated_files ${generated_file})
-            list(APPEND _hip_source_files ${file})
-        endif()
-    endforeach()
-
-    # Set the return parameter
-    set(${_generated_files} ${_hip_generated_files})
-    set(${_source_files} ${_hip_source_files})
-endmacro()
-
-###############################################################################
-# HIP_ADD_EXECUTABLE
-###############################################################################
-macro(HIP_ADD_EXECUTABLE hip_target)
-    # Separate the sources from the options
-    HIP_GET_SOURCES_AND_OPTIONS(_sources _cmake_options _hipcc_options _hcc_options _nvcc_options ${ARGN})
-    HIP_PREPARE_TARGET_COMMANDS(${hip_target} OBJ _generated_files _source_files ${_sources} HIPCC_OPTIONS ${_hipcc_options} HCC_OPTIONS ${_hcc_options} NVCC_OPTIONS ${_nvcc_options})
-    if(_source_files)
-        list(REMOVE_ITEM _sources ${_source_files})
-    endif()
-    if("x${HCC_HOME}" STREQUAL "x")
-        set(HCC_HOME "/opt/rocm/hcc")
-    endif()
-    set(CMAKE_HIP_LINK_EXECUTABLE "${HIP_HIPCC_CMAKE_LINKER_HELPER} ${HCC_HOME} <FLAGS> <CMAKE_CXX_LINK_FLAGS> <LINK_FLAGS> <OBJECTS> -o <TARGET> <LINK_LIBRARIES>")
-    add_executable(${hip_target} ${_cmake_options} ${_generated_files} ${_sources})
-    set_target_properties(${hip_target} PROPERTIES LINKER_LANGUAGE HIP)
-endmacro()
-
-###############################################################################
-# HIP_ADD_LIBRARY
-###############################################################################
-macro(HIP_ADD_LIBRARY hip_target)
-    # Separate the sources from the options
-    HIP_GET_SOURCES_AND_OPTIONS(_sources _cmake_options _hipcc_options _hcc_options _nvcc_options ${ARGN})
-    HIP_PREPARE_TARGET_COMMANDS(${hip_target} OBJ _generated_files _source_files ${_sources} ${_cmake_options} HIPCC_OPTIONS ${_hipcc_options} HCC_OPTIONS ${_hcc_options} NVCC_OPTIONS ${_nvcc_options})
-    if(_source_files)
-        list(REMOVE_ITEM _sources ${_source_files})
-    endif()
-    add_library(${hip_target} ${_cmake_options} ${_generated_files} ${_sources})
-    set_target_properties(${hip_target} PROPERTIES LINKER_LANGUAGE ${HIP_C_OR_CXX})
-endmacro()
-
-# vim: ts=4:sw=4:expandtab:smartindent
diff --git a/thirdParty/alpaka/cmake/modules/FindHIP/run_hipcc.cmake b/thirdParty/alpaka/cmake/modules/FindHIP/run_hipcc.cmake
deleted file mode 100644
index c9582bdbd4..0000000000
--- a/thirdParty/alpaka/cmake/modules/FindHIP/run_hipcc.cmake
+++ /dev/null
@@ -1,190 +0,0 @@
-# /*
-# Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
-
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
-
-# The above copyright notice and this permission notice shall be included in
-# all copies or substantial portions of the Software.
-
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-# THE SOFTWARE.
-# */
-
-###############################################################################
-# Runs commands using HIPCC
-###############################################################################
-
-###############################################################################
-# This file runs the hipcc commands to produce the desired output file
-# along with the dependency file needed by CMake to compute dependencies.
-#
-# Input variables:
-#
-# verbose:BOOL=<>               OFF: Be as quiet as possible (default)
-#                               ON : Describe each step
-# build_configuration:STRING=<> Build configuration. Defaults to Debug.
-# generated_file:STRING=<>      File to generate. Mandatory argument.
-
-if(NOT build_configuration)
-    set(build_configuration Debug)
-endif()
-if(NOT generated_file)
-    message(FATAL_ERROR "You must specify generated_file on the command line")
-endif()
-
-# Set these up as variables to make reading the generated file easier
-set(HIP_HIPCC_EXECUTABLE "@HIP_HIPCC_EXECUTABLE@") # path
-set(HIP_HIPCONFIG_EXECUTABLE "@HIP_HIPCONFIG_EXECUTABLE@") #path
-set(HIP_HOST_COMPILER "@HIP_HOST_COMPILER@") # path
-set(CMAKE_COMMAND "@CMAKE_COMMAND@") # path
-set(HIP_run_make2cmake "@HIP_run_make2cmake@") # path
-set(HCC_HOME "@HCC_HOME@") #path
-
-@HIP_HOST_FLAGS@
-@_HIP_HIPCC_FLAGS@
-@_HIP_HCC_FLAGS@
-@_HIP_NVCC_FLAGS@
-set(HIP_HIPCC_INCLUDE_ARGS "@HIP_HIPCC_INCLUDE_ARGS@") # list (needs to be in quotes to handle spaces properly)
-
-set(cmake_dependency_file "@cmake_dependency_file@") # path
-set(source_file "@source_file@") # path
-set(host_flag "@host_flag@") # bool
-
-# Determine compiler and compiler flags
-execute_process(COMMAND ${HIP_HIPCONFIG_EXECUTABLE} --platform OUTPUT_VARIABLE HIP_PLATFORM OUTPUT_STRIP_TRAILING_WHITESPACE)
-if(NOT host_flag)
-    set(__CC ${HIP_HIPCC_EXECUTABLE})
-    if(HIP_PLATFORM STREQUAL "hcc")
-        if(NOT "x${HCC_HOME}" STREQUAL "x")
-            set(ENV{HCC_HOME} ${HCC_HOME})
-        endif()
-        set(__CC_FLAGS ${HIP_HIPCC_FLAGS} ${HIP_HCC_FLAGS} ${HIP_HIPCC_FLAGS_${build_configuration}} ${HIP_HCC_FLAGS_${build_configuration}})
-    else()
-        set(__CC_FLAGS ${HIP_HIPCC_FLAGS} ${HIP_NVCC_FLAGS} ${HIP_HIPCC_FLAGS_${build_configuration}} ${HIP_NVCC_FLAGS_${build_configuration}})
-    endif()
-else()
-    set(__CC ${HIP_HOST_COMPILER})
-    set(__CC_FLAGS ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}})
-endif()
-set(__CC_INCLUDES ${HIP_HIPCC_INCLUDE_ARGS})
-
-# hip_execute_process - Executes a command with optional command echo and status message.
-#   status     - Status message to print if verbose is true
-#   command    - COMMAND argument from the usual execute_process argument structure
-#   ARGN       - Remaining arguments are the command with arguments
-#   HIP_result - Return value from running the command
-macro(hip_execute_process status command)
-    set(_command ${command})
-    if(NOT "x${_command}" STREQUAL "xCOMMAND")
-        message(FATAL_ERROR "Malformed call to hip_execute_process.  Missing COMMAND as second argument. (command = ${command})")
-    endif()
-    if(verbose)
-        execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status})
-        # Build command string to print
-        set(hip_execute_process_string)
-        foreach(arg ${ARGN})
-            # Escape quotes if any
-            string(REPLACE "\"" "\\\"" arg ${arg})
-            # Surround args with spaces with quotes
-            if(arg MATCHES " ")
-                list(APPEND hip_execute_process_string "\"${arg}\"")
-            else()
-                list(APPEND hip_execute_process_string ${arg})
-            endif()
-        endforeach()
-        # Echo the command
-        execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${hip_execute_process_string})
-    endif()
-    # Run the command
-    execute_process(COMMAND ${ARGN} RESULT_VARIABLE HIP_result)
-endmacro()
-
-# Delete the target file
-hip_execute_process(
-    "Removing ${generated_file}"
-    COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
-    )
-
-# Generate the dependency file
-hip_execute_process(
-    "Generating dependency file: ${cmake_dependency_file}.pre"
-    COMMAND "${__CC}"
-    -M
-    "${source_file}"
-    -o "${cmake_dependency_file}.pre"
-    ${__CC_FLAGS}
-    ${__CC_INCLUDES}
-    )
-
-if(HIP_result)
-    message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Generate the cmake readable dependency file to a temp file
-hip_execute_process(
-    "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp"
-    COMMAND "${CMAKE_COMMAND}"
-    -D "input_file:FILEPATH=${cmake_dependency_file}.pre"
-    -D "output_file:FILEPATH=${cmake_dependency_file}.tmp"
-    -D "verbose=${verbose}"
-    -P "${HIP_run_make2cmake}"
-    )
-
-if(HIP_result)
-    message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Copy the file if it is different
-hip_execute_process(
-    "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}"
-    COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}"
-    )
-
-if(HIP_result)
-    message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Delete the temporary file
-hip_execute_process(
-    "Removing ${cmake_dependency_file}.tmp and ${cmake_dependency_file}.pre"
-    COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${cmake_dependency_file}.pre"
-    )
-
-if(HIP_result)
-    message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Generate the output file
-hip_execute_process(
-    "Generating ${generated_file}"
-    COMMAND "${__CC}"
-    -c
-    "${source_file}"
-    -o "${generated_file}"
-    ${__CC_FLAGS}
-    ${__CC_INCLUDES}
-    )
-
-if(HIP_result)
-    # Make sure that we delete the output file
-    hip_execute_process(
-        "Removing ${generated_file}"
-        COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
-        )
-    message(FATAL_ERROR "Error generating file ${generated_file}")
-else()
-    if(verbose)
-        message("Generated ${generated_file} successfully.")
-    endif()
-endif()
-# vim: ts=4:sw=4:expandtab:smartindent
diff --git a/thirdParty/alpaka/cmake/modules/FindHIP/run_make2cmake.cmake b/thirdParty/alpaka/cmake/modules/FindHIP/run_make2cmake.cmake
deleted file mode 100644
index 48a51fa039..0000000000
--- a/thirdParty/alpaka/cmake/modules/FindHIP/run_make2cmake.cmake
+++ /dev/null
@@ -1,72 +0,0 @@
-# /*
-# Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
-
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
-
-# The above copyright notice and this permission notice shall be included in
-# all copies or substantial portions of the Software.
-
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-# THE SOFTWARE.
-# */
-
-###############################################################################
-# Computes dependencies using HIPCC
-###############################################################################
-
-###############################################################################
-# This file converts dependency files generated using hipcc to a format that
-# cmake can understand.
-
-# Input variables:
-#
-# input_file:STRING=<> Dependency file to parse. Required argument
-# output_file:STRING=<> Output file to generate. Required argument
-
-if(NOT input_file OR NOT output_file)
-    message(FATAL_ERROR "You must specify input_file and output_file on the command line")
-endif()
-
-file(READ ${input_file} depend_text)
-
-if (NOT "${depend_text}" STREQUAL "")
-    string(REPLACE " /" "\n/" depend_text ${depend_text})
-    string(REGEX REPLACE "^.*:" "" depend_text ${depend_text})
-    string(REGEX REPLACE "[ \\\\]*\n" ";" depend_text ${depend_text})
-
-    set(dependency_list "")
-
-    foreach(file ${depend_text})
-        string(REGEX REPLACE "^ +" "" file ${file})
-        if(NOT EXISTS "${file}")
-            message(WARNING " Removing non-existent dependency file: ${file}")
-            set(file "")
-        endif()
-
-        if(NOT IS_DIRECTORY "${file}")
-            get_filename_component(file_absolute "${file}" ABSOLUTE)
-            list(APPEND dependency_list "${file_absolute}")
-        endif()
-    endforeach()
-endif()
-
-# Remove the duplicate entries and sort them.
-list(REMOVE_DUPLICATES dependency_list)
-list(SORT dependency_list)
-
-foreach(file ${dependency_list})
-    set(hip_hipcc_depend "${hip_hipcc_depend} \"${file}\"\n")
-endforeach()
-
-file(WRITE ${output_file} "# Generated by: FindHIP.cmake. Do not edit.\nSET(HIP_HIPCC_DEPEND\n ${hip_hipcc_depend})\n\n")
-# vim: ts=4:sw=4:expandtab:smartindent
diff --git a/thirdParty/alpaka/cmake/modules/FindTBB.cmake b/thirdParty/alpaka/cmake/modules/FindTBB.cmake
deleted file mode 100644
index 4cfabee852..0000000000
--- a/thirdParty/alpaka/cmake/modules/FindTBB.cmake
+++ /dev/null
@@ -1,246 +0,0 @@
-# The MIT License (MIT)
-#
-# Copyright (c) 2015 Justus Calvin
-# 
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
-# 
-# The above copyright notice and this permission notice shall be included in all
-# copies or substantial portions of the Software.
-# 
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-# SOFTWARE.
-
-#
-# FindTBB
-# -------
-#
-# Find TBB include directories and libraries.
-#
-# Usage:
-#
-#  find_package(TBB [major[.minor]] [EXACT]
-#               [QUIET] [REQUIRED]
-#               [[COMPONENTS] [components...]]
-#               [OPTIONAL_COMPONENTS components...]) 
-#
-# where the allowed components are tbbmalloc and tbb_preview. Users may modify 
-# the behavior of this module with the following variables:
-#
-# * TBB_ROOT_DIR          - The base directory the of TBB installation.
-# * TBB_INCLUDE_DIR       - The directory that contains the TBB headers files.
-# * TBB_LIBRARY           - The directory that contains the TBB library files.
-# * TBB_<library>_LIBRARY - The path of the TBB the corresponding TBB library. 
-#                           These libraries, if specified, override the 
-#                           corresponding library search results, where <library>
-#                           may be tbb, tbb_debug, tbbmalloc, tbbmalloc_debug,
-#                           tbb_preview, or tbb_preview_debug.
-# * TBB_USE_DEBUG_BUILD   - The debug version of tbb libraries, if present, will
-#                           be used instead of the release version.
-#
-# Users may modify the behavior of this module with the following environment
-# variables:
-#
-# * TBB_INSTALL_DIR 
-# * TBBROOT
-# * LIBRARY_PATH
-#
-# This module will set the following variables:
-#
-# * TBB_FOUND             - Set to false, or undefined, if we haven’t found, or
-#                           don’t want to use TBB.
-# * TBB_<component>_FOUND - If False, optional <component> part of TBB sytem is
-#                           not available.
-# * TBB_VERSION           - The full version string
-# * TBB_VERSION_MAJOR     - The major version
-# * TBB_VERSION_MINOR     - The minor version
-# * TBB_INTERFACE_VERSION - The interface version number defined in 
-#                           tbb/tbb_stddef.h.
-# * TBB_<library>_LIBRARY_RELEASE - The path of the TBB release version of 
-#                           <library>, where <library> may be tbb, tbb_debug,
-#                           tbbmalloc, tbbmalloc_debug, tbb_preview, or 
-#                           tbb_preview_debug.
-# * TBB_<library>_LIBRARY_DEGUG - The path of the TBB release version of 
-#                           <library>, where <library> may be tbb, tbb_debug,
-#                           tbbmalloc, tbbmalloc_debug, tbb_preview, or 
-#                           tbb_preview_debug.
-#
-# The following varibles should be used to build and link with TBB:
-#
-# * TBB_INCLUDE_DIRS - The include directory for TBB.
-# * TBB_LIBRARIES    - The libraries to link against to use TBB.
-# * TBB_DEFINITIONS  - Definitions to use when compiling code that uses TBB.
-
-include(FindPackageHandleStandardArgs)
-
-if(NOT TBB_FOUND)
-
-  ##################################
-  # Check the build type
-  ##################################
-  
-  if(NOT DEFINED TBB_USE_DEBUG_BUILD)
-    if(CMAKE_BUILD_TYPE STREQUAL "Debug" OR CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo")
-      message(STATUS "Set TBB_USE_DEBUG_BUILD to TRUE because CMAKE_BUILD_TYPE is one of the debug configurations.")
-      set(TBB_USE_DEBUG_BUILD TRUE)
-    else()
-      set(TBB_USE_DEBUG_BUILD FALSE)
-    endif()
-  endif()
-  
-  ##################################
-  # Set the TBB search directories
-  ##################################
-  
-  # Define search paths based on user input and environment variables
-  set(TBB_SEARCH_DIR ${TBB_ROOT_DIR} $ENV{TBB_INSTALL_DIR} $ENV{TBBROOT})
-  
-  # Define the search directories based on the current platform
-  if(CMAKE_SYSTEM_NAME STREQUAL "Windows")
-    set(TBB_DEFAULT_SEARCH_DIR "C:/Program Files/Intel/TBB"
-                               "C:/Program Files (x86)/Intel/TBB")
-
-    # Set the target architecture
-    if(CMAKE_SIZEOF_VOID_P EQUAL 8)
-      set(TBB_ARCHITECTURE "intel64")
-    else()
-      set(TBB_ARCHITECTURE "ia32")
-    endif()
-
-    # Set the TBB search library path search suffix based on the version of VC
-    if(WINDOWS_STORE)
-      set(TBB_LIB_PATH_SUFFIX "lib/${TBB_ARCHITECTURE}/vc11_ui")
-    elseif(MSVC14)
-      set(TBB_LIB_PATH_SUFFIX "lib/${TBB_ARCHITECTURE}/vc14")
-    elseif(MSVC12)
-      set(TBB_LIB_PATH_SUFFIX "lib/${TBB_ARCHITECTURE}/vc12")
-    elseif(MSVC11)
-      set(TBB_LIB_PATH_SUFFIX "lib/${TBB_ARCHITECTURE}/vc11")
-    elseif(MSVC10)
-      set(TBB_LIB_PATH_SUFFIX "lib/${TBB_ARCHITECTURE}/vc10")
-    endif()
-
-    # Add the library path search suffix for the VC independent version of TBB
-    list(APPEND TBB_LIB_PATH_SUFFIX "lib/${TBB_ARCHITECTURE}/vc_mt")
-
-  elseif(CMAKE_SYSTEM_NAME STREQUAL "Darwin")
-    # OS X
-    set(TBB_DEFAULT_SEARCH_DIR "/opt/intel/tbb")
-    
-    # TODO: Check to see which C++ library is being used by the compiler.
-    if(NOT ${CMAKE_SYSTEM_VERSION} VERSION_LESS 13.0)
-      # The default C++ library on OS X 10.9 and later is libc++
-      set(TBB_LIB_PATH_SUFFIX "lib/libc++")
-    else()
-      set(TBB_LIB_PATH_SUFFIX "lib")
-    endif()
-  elseif(CMAKE_SYSTEM_NAME STREQUAL "Linux")
-    # Linux
-    set(TBB_DEFAULT_SEARCH_DIR "/opt/intel/tbb")
-    
-    # TODO: Check compiler version to see the suffix should be <arch>/gcc4.1 or
-    #       <arch>/gcc4.1. For now, assume that the compiler is more recent than
-    #       gcc 4.4.x or later.
-    if(CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64")
-      set(TBB_LIB_PATH_SUFFIX "lib/intel64/gcc4.4")
-    elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^i.86$")
-      set(TBB_LIB_PATH_SUFFIX "lib/ia32/gcc4.4")
-    endif()
-  endif()
-  
-  ##################################
-  # Find the TBB include dir
-  ##################################
-  
-  find_path(TBB_INCLUDE_DIRS tbb/tbb.h
-      HINTS ${TBB_INCLUDE_DIR} ${TBB_SEARCH_DIR}
-      PATHS ${TBB_DEFAULT_SEARCH_DIR}
-      PATH_SUFFIXES include)
-  
-  ##################################
-  # Find TBB components
-  ##################################
-
-  # Find each component
-  foreach(_comp tbb_preview tbbmalloc tbb)
-    # Search for the libraries
-    find_library(TBB_${_comp}_LIBRARY_RELEASE ${_comp}
-        HINTS ${TBB_LIBRARY} ${TBB_SEARCH_DIR}
-        PATHS ${TBB_DEFAULT_SEARCH_DIR}
-        PATH_SUFFIXES ${TBB_LIB_PATH_SUFFIX})
-
-    find_library(TBB_${_comp}_LIBRARY_DEBUG ${_comp}_debug
-        HINTS ${TBB_LIBRARY} ${TBB_SEARCH_DIR}
-        PATHS ${TBB_DEFAULT_SEARCH_DIR} ENV LIBRARY_PATH
-        PATH_SUFFIXES ${TBB_LIB_PATH_SUFFIX})
-    
-    # Set the library to be used for the component
-    if(NOT TBB_${_comp}_LIBRARY)
-      if(TBB_USE_DEBUG_BUILD AND TBB_${_comp}_LIBRARY_DEBUG)
-        set(TBB_${_comp}_LIBRARY "${TBB_${_comp}_LIBRARY_DEBUG}")
-      elseif(TBB_${_comp}_LIBRARY_RELEASE)
-        set(TBB_${_comp}_LIBRARY "${TBB_${_comp}_LIBRARY_RELEASE}")
-      elseif(TBB_${_comp}_LIBRARY_DEBUG)
-        set(TBB_${_comp}_LIBRARY "${TBB_${_comp}_LIBRARY_DEBUG}")
-        message(STATUS "Using the debug library of '${_comp}' because the release library could not be found!")
-      endif()
-    endif()
-    
-    # Set the TBB library list and component found variables
-    if(TBB_${_comp}_LIBRARY)
-      list(APPEND TBB_LIBRARIES "${TBB_${_comp}_LIBRARY}")
-      set(TBB_${_comp}_FOUND TRUE)
-    else()
-      set(TBB_${_comp}_FOUND FALSE)
-    endif()
-    
-    mark_as_advanced(TBB_${_comp}_LIBRARY_RELEASE)
-    mark_as_advanced(TBB_${_comp}_LIBRARY_DEBUG)
-    mark_as_advanced(TBB_${_comp}_LIBRARY)
-    
-  endforeach()
-  
-  ##################################
-  # Set compile flags
-  ##################################
-  
-  if(TBB_tbb_LIBRARY MATCHES "debug")
-    set(TBB_DEFINITIONS "-DTBB_USE_DEBUG=1")
-  endif()
-  
-  ##################################
-  # Set version strings
-  ##################################
-  
-  if(TBB_INCLUDE_DIRS)
-    file(READ "${TBB_INCLUDE_DIRS}/tbb/tbb_stddef.h" _tbb_version_file)
-    string(REGEX REPLACE ".*#define TBB_VERSION_MAJOR ([0-9]+).*" "\\1"
-            TBB_VERSION_MAJOR "${_tbb_version_file}")
-    string(REGEX REPLACE ".*#define TBB_VERSION_MINOR ([0-9]+).*" "\\1"
-            TBB_VERSION_MINOR "${_tbb_version_file}")
-    string(REGEX REPLACE ".*#define TBB_INTERFACE_VERSION ([0-9]+).*" "\\1"
-            TBB_INTERFACE_VERSION "${_tbb_version_file}")
-    set(TBB_VERSION "${TBB_VERSION_MAJOR}.${TBB_VERSION_MINOR}")
-  endif()
-  
-  find_package_handle_standard_args(TBB 
-      REQUIRED_VARS TBB_INCLUDE_DIRS TBB_LIBRARIES
-      HANDLE_COMPONENTS
-      VERSION_VAR TBB_VERSION)
-  
-  mark_as_advanced(TBB_INCLUDE_DIRS TBB_LIBRARIES)
-
-  unset(TBB_ARCHITECTURE)
-  unset(TBB_LIB_PATH_SUFFIX)
-  unset(TBB_DEFAULT_SEARCH_DIR)
-
-endif()
diff --git a/thirdParty/alpaka/doc/doxygen/Doxyfile b/thirdParty/alpaka/doc/doxygen/Doxyfile
deleted file mode 100644
index 1f7127b6a2..0000000000
--- a/thirdParty/alpaka/doc/doxygen/Doxyfile
+++ /dev/null
@@ -1,2501 +0,0 @@
-# Doxyfile 1.8.13
-
-# This file describes the settings to be used by the documentation system
-# doxygen (www.doxygen.org) for a project.
-#
-# All text after a double hash (##) is considered a comment and is placed in
-# front of the TAG it is preceding.
-#
-# All text after a single hash (#) is considered a comment and will be ignored.
-# The format is:
-# TAG = value [value, ...]
-# For lists, items can also be appended using:
-# TAG += value [value, ...]
-# Values that contain spaces should be placed between quotes (\" \").
-
-#---------------------------------------------------------------------------
-# Project related configuration options
-#---------------------------------------------------------------------------
-
-# This tag specifies the encoding used for all characters in the config file
-# that follow. The default is UTF-8 which is also the encoding used for all text
-# before the first occurrence of this tag. Doxygen uses libiconv (or the iconv
-# built into libc) for the transcoding. See http://www.gnu.org/software/libiconv
-# for the list of possible encodings.
-# The default value is: UTF-8.
-
-DOXYFILE_ENCODING      = UTF-8
-
-# The PROJECT_NAME tag is a single word (or a sequence of words surrounded by
-# double-quotes, unless you are using Doxywizard) that should identify the
-# project for which the documentation is generated. This name is used in the
-# title of most generated pages and in a few other places.
-# The default value is: My Project.
-
-PROJECT_NAME           = alpaka
-
-# The PROJECT_NUMBER tag can be used to enter a project or revision number. This
-# could be handy for archiving the generated documentation or if some version
-# control system is used.
-
-PROJECT_NUMBER         =
-
-# Using the PROJECT_BRIEF tag one can provide an optional one line description
-# for a project that appears at the top of each page and should give the viewer a
-# quick idea about the purpose of the project. Keep the description short.
-
-PROJECT_BRIEF          = "Abstraction Library for Parallel Kernel Acceleration"
-
-# With the PROJECT_LOGO tag one can specify a logo or an icon that is included
-# in the documentation. The maximum height of the logo should not exceed 55
-# pixels and the maximum width should not exceed 200 pixels. Doxygen will copy
-# the logo to the output directory.
-
-PROJECT_LOGO           = alpaka_doxygen.png
-
-# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) path
-# into which the generated documentation will be written. If a relative path is
-# entered, it will be relative to the location where doxygen was started. If
-# left blank the current directory will be used.
-
-OUTPUT_DIRECTORY       =
-
-# If the CREATE_SUBDIRS tag is set to YES then doxygen will create 4096 sub-
-# directories (in 2 levels) under the output directory of each output format and
-# will distribute the generated files over these directories. Enabling this
-# option can be useful when feeding doxygen a huge amount of source files, where
-# putting all generated files in the same directory would otherwise cause
-# performance problems for the file system.
-# The default value is: NO.
-
-CREATE_SUBDIRS         = NO
-
-# If the ALLOW_UNICODE_NAMES tag is set to YES, doxygen will allow non-ASCII
-# characters to appear in the names of generated files. If set to NO, non-ASCII
-# characters will be escaped, for example _xE3_x81_x84 will be used for Unicode
-# U+3044.
-# The default value is: NO.
-
-ALLOW_UNICODE_NAMES    = YES
-
-# The OUTPUT_LANGUAGE tag is used to specify the language in which all
-# documentation generated by doxygen is written. Doxygen will use this
-# information to generate all constant output in the proper language.
-# Possible values are: Afrikaans, Arabic, Armenian, Brazilian, Catalan, Chinese,
-# Chinese-Traditional, Croatian, Czech, Danish, Dutch, English (United States),
-# Esperanto, Farsi (Persian), Finnish, French, German, Greek, Hungarian,
-# Indonesian, Italian, Japanese, Japanese-en (Japanese with English messages),
-# Korean, Korean-en (Korean with English messages), Latvian, Lithuanian,
-# Macedonian, Norwegian, Persian (Farsi), Polish, Portuguese, Romanian, Russian,
-# Serbian, Serbian-Cyrillic, Slovak, Slovene, Spanish, Swedish, Turkish,
-# Ukrainian and Vietnamese.
-# The default value is: English.
-
-OUTPUT_LANGUAGE        = English
-
-# If the BRIEF_MEMBER_DESC tag is set to YES, doxygen will include brief member
-# descriptions after the members that are listed in the file and class
-# documentation (similar to Javadoc). Set to NO to disable this.
-# The default value is: YES.
-
-BRIEF_MEMBER_DESC      = YES
-
-# If the REPEAT_BRIEF tag is set to YES, doxygen will prepend the brief
-# description of a member or function before the detailed description
-#
-# Note: If both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the
-# brief descriptions will be completely suppressed.
-# The default value is: YES.
-
-REPEAT_BRIEF           = YES
-
-# This tag implements a quasi-intelligent brief description abbreviator that is
-# used to form the text in various listings. Each string in this list, if found
-# as the leading text of the brief description, will be stripped from the text
-# and the result, after processing the whole list, is used as the annotated
-# text. Otherwise, the brief description is used as-is. If left blank, the
-# following values are used ($name is automatically replaced with the name of
-# the entity): The $name class, The $name widget, The $name file, is, provides,
-# specifies, contains, represents, a, an and the.
-
-ABBREVIATE_BRIEF       = "The $name class" \
-                         "The $name widget" \
-                         "The $name file" \
-                         is \
-                         provides \
-                         specifies \
-                         contains \
-                         represents \
-                         a \
-                         an \
-                         the
-
-# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then
-# doxygen will generate a detailed section even if there is only a brief
-# description.
-# The default value is: NO.
-
-ALWAYS_DETAILED_SEC    = NO
-
-# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all
-# inherited members of a class in the documentation of that class as if those
-# members were ordinary class members. Constructors, destructors and assignment
-# operators of the base classes will not be shown.
-# The default value is: NO.
-
-INLINE_INHERITED_MEMB  = NO
-
-# If the FULL_PATH_NAMES tag is set to YES, doxygen will prepend the full path
-# before file names in the file list and in the header files. If set to NO the
-# shortest path that makes the file name unique will be used.
-# The default value is: YES.
-
-FULL_PATH_NAMES        = YES
-
-# The STRIP_FROM_PATH tag can be used to strip a user-defined part of the path.
-# Stripping is only done if one of the specified strings matches the left-hand
-# part of the path. The tag can be used to show relative paths in the file list.
-# If left blank the directory from which doxygen is run is used as the path to
-# strip.
-#
-# Note that you can specify absolute paths here, but also relative paths, which
-# will be relative from the directory where doxygen is started.
-# This tag requires that the tag FULL_PATH_NAMES is set to YES.
-
-STRIP_FROM_PATH        =
-
-# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of the
-# path mentioned in the documentation of a class, which tells the reader which
-# header file to include in order to use a class. If left blank only the name of
-# the header file containing the class definition is used. Otherwise one should
-# specify the list of include paths that are normally passed to the compiler
-# using the -I flag.
-
-STRIP_FROM_INC_PATH    =
-
-# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter (but
-# less readable) file names. This can be useful if your file system doesn't
-# support long names like on DOS, Mac, or CD-ROM.
-# The default value is: NO.
-
-SHORT_NAMES            = NO
-
-# If the JAVADOC_AUTOBRIEF tag is set to YES then doxygen will interpret the
-# first line (until the first dot) of a Javadoc-style comment as the brief
-# description. If set to NO, the Javadoc-style will behave just like regular Qt-
-# style comments (thus requiring an explicit @brief command for a brief
-# description.)
-# The default value is: NO.
-
-JAVADOC_AUTOBRIEF      = NO
-
-# If the QT_AUTOBRIEF tag is set to YES then doxygen will interpret the first
-# line (until the first dot) of a Qt-style comment as the brief description. If
-# set to NO, the Qt-style will behave just like regular Qt-style comments (thus
-# requiring an explicit \brief command for a brief description.)
-# The default value is: NO.
-
-QT_AUTOBRIEF           = NO
-
-# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make doxygen treat a
-# multi-line C++ special comment block (i.e. a block of //! or /// comments) as
-# a brief description. This used to be the default behavior. The new default is
-# to treat a multi-line C++ comment block as a detailed description. Set this
-# tag to YES if you prefer the old behavior instead.
-#
-# Note that setting this tag to YES also means that Rational Rose comments are
-# not recognized any more.
-# The default value is: NO.
-
-MULTILINE_CPP_IS_BRIEF = YES
-
-# If the INHERIT_DOCS tag is set to YES then an undocumented member inherits the
-# documentation from any documented member that it re-implements.
-# The default value is: YES.
-
-INHERIT_DOCS           = YES
-
-# If the SEPARATE_MEMBER_PAGES tag is set to YES then doxygen will produce a new
-# page for each member. If set to NO, the documentation of a member will be part
-# of the file/class/namespace that contains it.
-# The default value is: NO.
-
-SEPARATE_MEMBER_PAGES  = NO
-
-# The TAB_SIZE tag can be used to set the number of spaces in a tab. Doxygen
-# uses this value to replace tabs by spaces in code fragments.
-# Minimum value: 1, maximum value: 16, default value: 4.
-
-TAB_SIZE               = 4
-
-# This tag can be used to specify a number of aliases that act as commands in
-# the documentation. An alias has the form:
-# name=value
-# For example adding
-# "sideeffect=@par Side Effects:\n"
-# will allow you to put the command \sideeffect (or @sideeffect) in the
-# documentation, which will result in a user-defined paragraph with heading
-# "Side Effects:". You can put \n's in the value part of an alias to insert
-# newlines.
-
-ALIASES                =
-
-# This tag can be used to specify a number of word-keyword mappings (TCL only).
-# A mapping has the form "name=value". For example adding "class=itcl::class"
-# will allow you to use the command class in the itcl::class meaning.
-
-TCL_SUBST              =
-
-# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C sources
-# only. Doxygen will then generate output that is more tailored for C. For
-# instance, some of the names that are used will be different. The list of all
-# members will be omitted, etc.
-# The default value is: NO.
-
-OPTIMIZE_OUTPUT_FOR_C  = NO
-
-# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java or
-# Python sources only. Doxygen will then generate output that is more tailored
-# for that language. For instance, namespaces will be presented as packages,
-# qualified scopes will look different, etc.
-# The default value is: NO.
-
-OPTIMIZE_OUTPUT_JAVA   = NO
-
-# Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran
-# sources. Doxygen will then generate output that is tailored for Fortran.
-# The default value is: NO.
-
-OPTIMIZE_FOR_FORTRAN   = NO
-
-# Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL
-# sources. Doxygen will then generate output that is tailored for VHDL.
-# The default value is: NO.
-
-OPTIMIZE_OUTPUT_VHDL   = NO
-
-# Doxygen selects the parser to use depending on the extension of the files it
-# parses. With this tag you can assign which parser to use for a given
-# extension. Doxygen has a built-in mapping, but you can override or extend it
-# using this tag. The format is ext=language, where ext is a file extension, and
-# language is one of the parsers supported by doxygen: IDL, Java, Javascript,
-# C#, C, C++, D, PHP, Objective-C, Python, Fortran (fixed format Fortran:
-# FortranFixed, free formatted Fortran: FortranFree, unknown formatted Fortran:
-# Fortran. In the latter case the parser tries to guess whether the code is fixed
-# or free formatted code, this is the default for Fortran type files), VHDL. For
-# instance to make doxygen treat .inc files as Fortran files (default is PHP),
-# and .f files as C (default is Fortran), use: inc=Fortran f=C.
-#
-# Note: For files without extension you can use no_extension as a placeholder.
-#
-# Note that for custom extensions you also need to set FILE_PATTERNS otherwise
-# the files are not read by doxygen.
-
-EXTENSION_MAPPING      =
-
-# If the MARKDOWN_SUPPORT tag is enabled then doxygen pre-processes all comments
-# according to the Markdown format, which allows for more readable
-# documentation. See http://daringfireball.net/projects/markdown/ for details.
-# The output of markdown processing is further processed by doxygen, so you can
-# mix doxygen, HTML, and XML commands with Markdown formatting. Disable only in
-# case of backward compatibility issues.
-# The default value is: YES.
-
-MARKDOWN_SUPPORT       = YES
-
-# When the TOC_INCLUDE_HEADINGS tag is set to a non-zero value, all headings up
-# to that level are automatically included in the table of contents, even if
-# they do not have an id attribute.
-# Note: This feature currently applies only to Markdown headings.
-# Minimum value: 0, maximum value: 99, default value: 0.
-# This tag requires that the tag MARKDOWN_SUPPORT is set to YES.
-
-TOC_INCLUDE_HEADINGS   = 0
-
-# When enabled doxygen tries to link words that correspond to documented
-# classes, or namespaces to their corresponding documentation. Such a link can
-# be prevented in individual cases by putting a % sign in front of the word or
-# globally by setting AUTOLINK_SUPPORT to NO.
-# The default value is: YES.
-
-AUTOLINK_SUPPORT       = YES
-
-# If you use STL classes (i.e. std::string, std::vector, etc.) but do not want
-# to include (a tag file for) the STL sources as input, then you should set this
-# tag to YES in order to let doxygen match functions declarations and
-# definitions whose arguments contain STL classes (e.g. func(std::string);
-# versus func(std::string) {}). This also makes the inheritance and collaboration
-# diagrams that involve STL classes more complete and accurate.
-# The default value is: NO.
-
-BUILTIN_STL_SUPPORT    = YES
-
-# If you use Microsoft's C++/CLI language, you should set this option to YES to
-# enable parsing support.
-# The default value is: NO.
-
-CPP_CLI_SUPPORT        = NO
-
-# Set the SIP_SUPPORT tag to YES if your project consists of sip (see:
-# http://www.riverbankcomputing.co.uk/software/sip/intro) sources only. Doxygen
-# will parse them like normal C++ but will assume all classes use public instead
-# of private inheritance when no explicit protection keyword is present.
-# The default value is: NO.
-
-SIP_SUPPORT            = NO
-
-# For Microsoft's IDL there are propget and propput attributes to indicate
-# getter and setter methods for a property. Setting this option to YES will make
-# doxygen replace the get and set methods by a property in the documentation.
-# This will only work if the methods are indeed getting or setting a simple
-# type. If this is not the case, or you want to show the methods anyway, you
-# should set this option to NO.
-# The default value is: YES.
-
-IDL_PROPERTY_SUPPORT   = YES
-
-# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC
-# tag is set to YES then doxygen will reuse the documentation of the first
-# member in the group (if any) for the other members of the group. By default
-# all members of a group must be documented explicitly.
-# The default value is: NO.
-
-DISTRIBUTE_GROUP_DOC   = NO
-
-# If one adds a struct or class to a group and this option is enabled, then also
-# any nested class or struct is added to the same group. By default this option
-# is disabled and one has to add nested compounds explicitly via \ingroup.
-# The default value is: NO.
-
-GROUP_NESTED_COMPOUNDS = NO
-
-# Set the SUBGROUPING tag to YES to allow class member groups of the same type
-# (for instance a group of public functions) to be put as a subgroup of that
-# type (e.g. under the Public Functions section). Set it to NO to prevent
-# subgrouping. Alternatively, this can be done per class using the
-# \nosubgrouping command.
-# The default value is: YES.
-
-SUBGROUPING            = YES
-
-# When the INLINE_GROUPED_CLASSES tag is set to YES, classes, structs and unions
-# are shown inside the group in which they are included (e.g. using \ingroup)
-# instead of on a separate page (for HTML and Man pages) or section (for LaTeX
-# and RTF).
-#
-# Note that this feature does not work in combination with
-# SEPARATE_MEMBER_PAGES.
-# The default value is: NO.
-
-INLINE_GROUPED_CLASSES = NO
-
-# When the INLINE_SIMPLE_STRUCTS tag is set to YES, structs, classes, and unions
-# with only public data fields or simple typedef fields will be shown inline in
-# the documentation of the scope in which they are defined (i.e. file,
-# namespace, or group documentation), provided this scope is documented. If set
-# to NO, structs, classes, and unions are shown on a separate page (for HTML and
-# Man pages) or section (for LaTeX and RTF).
-# The default value is: NO.
-
-INLINE_SIMPLE_STRUCTS  = NO
-
-# When TYPEDEF_HIDES_STRUCT tag is enabled, a typedef of a struct, union, or
-# enum is documented as struct, union, or enum with the name of the typedef. So
-# typedef struct TypeS {} TypeT, will appear in the documentation as a struct
-# with name TypeT. When disabled the typedef will appear as a member of a file,
-# namespace, or class. And the struct will be named TypeS. This can typically be
-# useful for C code in case the coding convention dictates that all compound
-# types are typedef'ed and only the typedef is referenced, never the tag name.
-# The default value is: NO.
-
-TYPEDEF_HIDES_STRUCT   = NO
-
-# The size of the symbol lookup cache can be set using LOOKUP_CACHE_SIZE. This
-# cache is used to resolve symbols given their name and scope. Since this can be
-# an expensive process and often the same symbol appears multiple times in the
-# code, doxygen keeps a cache of pre-resolved symbols. If the cache is too small
-# doxygen will become slower. If the cache is too large, memory is wasted. The
-# cache size is given by this formula: 2^(16+LOOKUP_CACHE_SIZE). The valid range
-# is 0..9, the default is 0, corresponding to a cache size of 2^16=65536
-# symbols. At the end of a run doxygen will report the cache usage and suggest
-# the optimal cache size from a speed point of view.
-# Minimum value: 0, maximum value: 9, default value: 0.
-
-LOOKUP_CACHE_SIZE      = 0
-
-#---------------------------------------------------------------------------
-# Build related configuration options
-#---------------------------------------------------------------------------
-
-# If the EXTRACT_ALL tag is set to YES, doxygen will assume all entities in
-# documentation are documented, even if no documentation was available. Private
-# class members and static file members will be hidden unless the
-# EXTRACT_PRIVATE respectively EXTRACT_STATIC tags are set to YES.
-# Note: This will also disable the warnings about undocumented members that are
-# normally produced when WARNINGS is set to YES.
-# The default value is: NO.
-
-EXTRACT_ALL            = YES
-
-# If the EXTRACT_PRIVATE tag is set to YES, all private members of a class will
-# be included in the documentation.
-# The default value is: NO.
-
-EXTRACT_PRIVATE        = NO
-
-# If the EXTRACT_PACKAGE tag is set to YES, all members with package or internal
-# scope will be included in the documentation.
-# The default value is: NO.
-
-EXTRACT_PACKAGE        = NO
-
-# If the EXTRACT_STATIC tag is set to YES, all static members of a file will be
-# included in the documentation.
-# The default value is: NO.
-
-EXTRACT_STATIC         = YES
-
-# If the EXTRACT_LOCAL_CLASSES tag is set to YES, classes (and structs) defined
-# locally in source files will be included in the documentation. If set to NO,
-# only classes defined in header files are included. Does not have any effect
-# for Java sources.
-# The default value is: YES.
-
-EXTRACT_LOCAL_CLASSES  = YES
-
-# This flag is only useful for Objective-C code. If set to YES, local methods,
-# which are defined in the implementation section but not in the interface are
-# included in the documentation. If set to NO, only methods in the interface are
-# included.
-# The default value is: NO.
-
-EXTRACT_LOCAL_METHODS  = YES
-
-# If this flag is set to YES, the members of anonymous namespaces will be
-# extracted and appear in the documentation as a namespace called
-# 'anonymous_namespace{file}', where file will be replaced with the base name of
-# the file that contains the anonymous namespace. By default anonymous namespaces
-# are hidden.
-# The default value is: NO.
-
-EXTRACT_ANON_NSPACES   = YES
-
-# If the HIDE_UNDOC_MEMBERS tag is set to YES, doxygen will hide all
-# undocumented members inside documented classes or files. If set to NO these
-# members will be included in the various overviews, but no documentation
-# section is generated. This option has no effect if EXTRACT_ALL is enabled.
-# The default value is: NO.
-
-HIDE_UNDOC_MEMBERS     = NO
-
-# If the HIDE_UNDOC_CLASSES tag is set to YES, doxygen will hide all
-# undocumented classes that are normally visible in the class hierarchy. If set
-# to NO, these classes will be included in the various overviews. This option
-# has no effect if EXTRACT_ALL is enabled.
-# The default value is: NO.
-
-HIDE_UNDOC_CLASSES     = NO
-
-# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, doxygen will hide all friend
-# (class|struct|union) declarations. If set to NO, these declarations will be
-# included in the documentation.
-# The default value is: NO.
-
-HIDE_FRIEND_COMPOUNDS  = NO
-
-# If the HIDE_IN_BODY_DOCS tag is set to YES, doxygen will hide any
-# documentation blocks found inside the body of a function. If set to NO, these
-# blocks will be appended to the function's detailed documentation block.
-# The default value is: NO.
-
-HIDE_IN_BODY_DOCS      = NO
-
-# The INTERNAL_DOCS tag determines if documentation that is typed after a
-# \internal command is included. If the tag is set to NO then the documentation
-# will be excluded. Set it to YES to include the internal documentation.
-# The default value is: NO.
-
-INTERNAL_DOCS          = NO
-
-# If the CASE_SENSE_NAMES tag is set to NO then doxygen will only generate file
-# names in lower-case letters. If set to YES, upper-case letters are also
-# allowed. This is useful if you have classes or files whose names only differ
-# in case and if your file system supports case sensitive file names. Windows
-# and Mac users are advised to set this option to NO.
-# The default value is: system dependent.
-
-CASE_SENSE_NAMES       = YES
-
-# If the HIDE_SCOPE_NAMES tag is set to NO then doxygen will show members with
-# their full class and namespace scopes in the documentation. If set to YES, the
-# scope will be hidden.
-# The default value is: NO.
-
-HIDE_SCOPE_NAMES       = NO
-
-# If the HIDE_COMPOUND_REFERENCE tag is set to NO (default) then doxygen will
-# append additional text to a page's title, such as Class Reference. If set to
-# YES the compound reference will be hidden.
-# The default value is: NO.
-
-HIDE_COMPOUND_REFERENCE= NO
-
-# If the SHOW_INCLUDE_FILES tag is set to YES then doxygen will put a list of
-# the files that are included by a file in the documentation of that file.
-# The default value is: YES.
-
-SHOW_INCLUDE_FILES     = YES
-
-# If the SHOW_GROUPED_MEMB_INC tag is set to YES then Doxygen will add for each
-# grouped member an include statement to the documentation, telling the reader
-# which file to include in order to use the member.
-# The default value is: NO.
-
-SHOW_GROUPED_MEMB_INC  = NO
-
-# If the FORCE_LOCAL_INCLUDES tag is set to YES then doxygen will list include
-# files with double quotes in the documentation rather than with sharp brackets.
-# The default value is: NO.
-
-FORCE_LOCAL_INCLUDES   = NO
-
-# If the INLINE_INFO tag is set to YES then a tag [inline] is inserted in the
-# documentation for inline members.
-# The default value is: YES.
-
-INLINE_INFO            = YES
-
-# If the SORT_MEMBER_DOCS tag is set to YES then doxygen will sort the
-# (detailed) documentation of file and class members alphabetically by member
-# name. If set to NO, the members will appear in declaration order.
-# The default value is: YES.
-
-SORT_MEMBER_DOCS       = YES
-
-# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the brief
-# descriptions of file, namespace and class members alphabetically by member
-# name. If set to NO, the members will appear in declaration order. Note that
-# this will also influence the order of the classes in the class list.
-# The default value is: NO.
-
-SORT_BRIEF_DOCS        = YES
-
-# If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen will sort the
-# (brief and detailed) documentation of class members so that constructors and
-# destructors are listed first. If set to NO the constructors will appear in the
-# respective orders defined by SORT_BRIEF_DOCS and SORT_MEMBER_DOCS.
-# Note: If SORT_BRIEF_DOCS is set to NO this option is ignored for sorting brief
-# member documentation.
-# Note: If SORT_MEMBER_DOCS is set to NO this option is ignored for sorting
-# detailed member documentation.
-# The default value is: NO.
-
-SORT_MEMBERS_CTORS_1ST = YES
-
-# If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the hierarchy
-# of group names into alphabetical order. If set to NO the group names will
-# appear in their defined order.
-# The default value is: NO.
-
-SORT_GROUP_NAMES       = NO
-
-# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be sorted by
-# fully-qualified names, including namespaces. If set to NO, the class list will
-# be sorted only by class name, not including the namespace part.
-# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES.
-# Note: This option applies only to the class list, not to the alphabetical
-# list.
-# The default value is: NO.
-
-SORT_BY_SCOPE_NAME     = YES
-
-# If the STRICT_PROTO_MATCHING option is enabled and doxygen fails to do proper
-# type resolution of all parameters of a function it will reject a match between
-# the prototype and the implementation of a member function even if there is
-# only one candidate or it is obvious which candidate to choose by doing a
-# simple string match. By disabling STRICT_PROTO_MATCHING doxygen will still
-# accept a match between prototype and implementation in such cases.
-# The default value is: NO.
-
-STRICT_PROTO_MATCHING  = NO
-
-# The GENERATE_TODOLIST tag can be used to enable (YES) or disable (NO) the todo
-# list. This list is created by putting \todo commands in the documentation.
-# The default value is: YES.
-
-GENERATE_TODOLIST      = YES
-
-# The GENERATE_TESTLIST tag can be used to enable (YES) or disable (NO) the test
-# list. This list is created by putting \test commands in the documentation.
-# The default value is: YES.
-
-GENERATE_TESTLIST      = YES
-
-# The GENERATE_BUGLIST tag can be used to enable (YES) or disable (NO) the bug
-# list. This list is created by putting \bug commands in the documentation.
-# The default value is: YES.
-
-GENERATE_BUGLIST       = YES
-
-# The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or disable (NO)
-# the deprecated list. This list is created by putting \deprecated commands in
-# the documentation.
-# The default value is: YES.
-
-GENERATE_DEPRECATEDLIST= YES
-
-# The ENABLED_SECTIONS tag can be used to enable conditional documentation
-# sections, marked by \if <section_label> ... \endif and \cond <section_label>
-# ... \endcond blocks.
-
-ENABLED_SECTIONS       =
-
-# The MAX_INITIALIZER_LINES tag determines the maximum number of lines that the
-# initial value of a variable or macro / define can have for it to appear in the
-# documentation. If the initializer consists of more lines than specified here
-# it will be hidden. Use a value of 0 to hide initializers completely. The
-# appearance of the value of individual variables and macros / defines can be
-# controlled using \showinitializer or \hideinitializer command in the
-# documentation regardless of this setting.
-# Minimum value: 0, maximum value: 10000, default value: 30.
-
-MAX_INITIALIZER_LINES  = 30
-
-# Set the SHOW_USED_FILES tag to NO to disable the list of files generated at
-# the bottom of the documentation of classes and structs. If set to YES, the
-# list will mention the files that were used to generate the documentation.
-# The default value is: YES.
-
-SHOW_USED_FILES        = YES
-
-# Set the SHOW_FILES tag to NO to disable the generation of the Files page. This
-# will remove the Files entry from the Quick Index and from the Folder Tree View
-# (if specified).
-# The default value is: YES.
-
-SHOW_FILES             = YES
-
-# Set the SHOW_NAMESPACES tag to NO to disable the generation of the Namespaces
-# page. This will remove the Namespaces entry from the Quick Index and from the
-# Folder Tree View (if specified).
-# The default value is: YES.
-
-SHOW_NAMESPACES        = YES
-
-# The FILE_VERSION_FILTER tag can be used to specify a program or script that
-# doxygen should invoke to get the current version for each file (typically from
-# the version control system). Doxygen will invoke the program by executing (via
-# popen()) the command command input-file, where command is the value of the
-# FILE_VERSION_FILTER tag, and input-file is the name of an input file provided
-# by doxygen. Whatever the program writes to standard output is used as the file
-# version. For an example see the documentation.
-
-FILE_VERSION_FILTER    =
-
-# The LAYOUT_FILE tag can be used to specify a layout file which will be parsed
-# by doxygen. The layout file controls the global structure of the generated
-# output files in an output format independent way. To create the layout file
-# that represents doxygen's defaults, run doxygen with the -l option. You can
-# optionally specify a file name after the option, if omitted DoxygenLayout.xml
-# will be used as the name of the layout file.
-#
-# Note that if you run doxygen from a directory containing a file called
-# DoxygenLayout.xml, doxygen will parse it automatically even if the LAYOUT_FILE
-# tag is left empty.
-
-LAYOUT_FILE            =
-
-# The CITE_BIB_FILES tag can be used to specify one or more bib files containing
-# the reference definitions. This must be a list of .bib files. The .bib
-# extension is automatically appended if omitted. This requires the bibtex tool
-# to be installed. See also http://en.wikipedia.org/wiki/BibTeX for more info.
-# For LaTeX the style of the bibliography can be controlled using
-# LATEX_BIB_STYLE. To use this feature you need bibtex and perl available in the
-# search path. See also \cite for info how to create references.
-
-CITE_BIB_FILES         =
-
-#---------------------------------------------------------------------------
-# Configuration options related to warning and progress messages
-#---------------------------------------------------------------------------
-
-# The QUIET tag can be used to turn on/off the messages that are generated to
-# standard output by doxygen. If QUIET is set to YES this implies that the
-# messages are off.
-# The default value is: NO.
-
-QUIET                  = NO
-
-# The WARNINGS tag can be used to turn on/off the warning messages that are
-# generated to standard error (stderr) by doxygen. If WARNINGS is set to YES
-# this implies that the warnings are on.
-#
-# Tip: Turn warnings on while writing the documentation.
-# The default value is: YES.
-
-WARNINGS               = YES
-
-# If the WARN_IF_UNDOCUMENTED tag is set to YES then doxygen will generate
-# warnings for undocumented members. If EXTRACT_ALL is set to YES then this flag
-# will automatically be disabled.
-# The default value is: YES.
-
-WARN_IF_UNDOCUMENTED   = YES
-
-# If the WARN_IF_DOC_ERROR tag is set to YES, doxygen will generate warnings for
-# potential errors in the documentation, such as not documenting some parameters
-# in a documented function, or documenting parameters that don't exist or using
-# markup commands wrongly.
-# The default value is: YES.
-
-WARN_IF_DOC_ERROR      = YES
-
-# The WARN_NO_PARAMDOC option can be enabled to get warnings for functions that
-# are documented, but have no documentation for their parameters or return
-# value. If set to NO, doxygen will only warn about wrong or incomplete
-# parameter documentation, but not about the absence of documentation.
-# The default value is: NO.
-
-WARN_NO_PARAMDOC       = YES
-
-# If the WARN_AS_ERROR tag is set to YES then doxygen will immediately stop when
-# a warning is encountered.
-# The default value is: NO.
-
-WARN_AS_ERROR          = NO
-
-# The WARN_FORMAT tag determines the format of the warning messages that doxygen
-# can produce. The string should contain the $file, $line, and $text tags, which
-# will be replaced by the file and line number from which the warning originated
-# and the warning text. Optionally the format may contain $version, which will
-# be replaced by the version of the file (if it could be obtained via
-# FILE_VERSION_FILTER)
-# The default value is: $file:$line: $text.
-
-WARN_FORMAT            = "$file:$line: $text"
-
-# The WARN_LOGFILE tag can be used to specify a file to which warning and error
-# messages should be written. If left blank the output is written to standard
-# error (stderr).
-
-WARN_LOGFILE           =
-
-#---------------------------------------------------------------------------
-# Configuration options related to the input files
-#---------------------------------------------------------------------------
-
-# The INPUT tag is used to specify the files and/or directories that contain
-# documented source files. You may enter file names like myfile.cpp or
-# directories like /usr/src/myproject. Separate the files or directories with
-# spaces. See also FILE_PATTERNS and EXTENSION_MAPPING
-# Note: If this tag is empty the current directory is searched.
-
-INPUT                  = ../../include/ \
-                         ../../README.md
-
-# This tag can be used to specify the character encoding of the source files
-# that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses
-# libiconv (or the iconv built into libc) for the transcoding. See the libiconv
-# documentation (see: http://www.gnu.org/software/libiconv) for the list of
-# possible encodings.
-# The default value is: UTF-8.
-
-INPUT_ENCODING         = UTF-8
-
-# If the value of the INPUT tag contains directories, you can use the
-# FILE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp and
-# *.h) to filter out the source-files in the directories.
-#
-# Note that for custom extensions or not directly supported extensions you also
-# need to set EXTENSION_MAPPING for the extension otherwise the files are not
-# read by doxygen.
-#
-# If left blank the following patterns are tested: *.c, *.cc, *.cxx, *.cpp,
-# *.c++, *.java, *.ii, *.ixx, *.ipp, *.i++, *.inl, *.idl, *.ddl, *.odl, *.h,
-# *.hh, *.hxx, *.hpp, *.h++, *.cs, *.d, *.php, *.php4, *.php5, *.phtml, *.inc,
-# *.m, *.markdown, *.md, *.mm, *.dox, *.py, *.pyw, *.f90, *.f95, *.f03, *.f08,
-# *.f, *.for, *.tcl, *.vhd, *.vhdl, *.ucf and *.qsf.
-
-FILE_PATTERNS          = *.c \
-                         *.cc \
-                         *.cxx \
-                         *.cpp \
-                         *.c++ \
-                         *.java \
-                         *.ii \
-                         *.ixx \
-                         *.ipp \
-                         *.i++ \
-                         *.inl \
-                         *.idl \
-                         *.ddl \
-                         *.odl \
-                         *.h \
-                         *.hh \
-                         *.hxx \
-                         *.hpp \
-                         *.h++ \
-                         *.cs \
-                         *.d \
-                         *.php \
-                         *.php4 \
-                         *.php5 \
-                         *.phtml \
-                         *.inc \
-                         *.m \
-                         *.markdown \
-                         *.md \
-                         *.mm \
-                         *.dox \
-                         *.py \
-                         *.f90 \
-                         *.f \
-                         *.for \
-                         *.tcl \
-                         *.vhd \
-                         *.vhdl \
-                         *.ucf \
-                         *.qsf \
-                         *.as \
-                         *.js \
-                         *.c \
-                         *.cu
-
-# The RECURSIVE tag can be used to specify whether or not subdirectories should
-# be searched for input files as well.
-# The default value is: NO.
-
-RECURSIVE              = YES
-
-# The EXCLUDE tag can be used to specify files and/or directories that should be
-# excluded from the INPUT source files. This way you can easily exclude a
-# subdirectory from a directory tree whose root is specified with the INPUT tag.
-#
-# Note that relative paths are relative to the directory from which doxygen is
-# run.
-
-EXCLUDE                =
-
-# The EXCLUDE_SYMLINKS tag can be used to select whether or not files or
-# directories that are symbolic links (a Unix file system feature) are excluded
-# from the input.
-# The default value is: NO.
-
-EXCLUDE_SYMLINKS       = NO
-
-# If the value of the INPUT tag contains directories, you can use the
-# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude
-# certain files from those directories.
-#
-# Note that the wildcards are matched against the file with absolute path, so to
-# exclude all test directories for example use the pattern */test/*
-
-EXCLUDE_PATTERNS       =
-
-# The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names
-# (namespaces, classes, functions, etc.) that should be excluded from the
-# output. The symbol name can be a fully qualified name, a word, or if the
-# wildcard * is used, a substring. Examples: ANamespace, AClass,
-# ANamespace::AClass, ANamespace::*Test
-#
-# Note that the wildcards are matched against the file with absolute path, so to
-# exclude all test directories use the pattern */test/*
-
-EXCLUDE_SYMBOLS        =
-
-# The EXAMPLE_PATH tag can be used to specify one or more files or directories
-# that contain example code fragments that are included (see the \include
-# command).
-
-EXAMPLE_PATH           =
-
-# If the value of the EXAMPLE_PATH tag contains directories, you can use the
-# EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp and
-# *.h) to filter out the source-files in the directories. If left blank all
-# files are included.
-
-EXAMPLE_PATTERNS       = *
-
-# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be
-# searched for input files to be used with the \include or \dontinclude commands
-# irrespective of the value of the RECURSIVE tag.
-# The default value is: NO.
-
-EXAMPLE_RECURSIVE      = NO
-
-# The IMAGE_PATH tag can be used to specify one or more files or directories
-# that contain images that are to be included in the documentation (see the
-# \image command).
-
-IMAGE_PATH             =
-
-# The INPUT_FILTER tag can be used to specify a program that doxygen should
-# invoke to filter for each input file. Doxygen will invoke the filter program
-# by executing (via popen()) the command:
-#
-# <filter> <input-file>
-#
-# where <filter> is the value of the INPUT_FILTER tag, and <input-file> is the
-# name of an input file. Doxygen will then use the output that the filter
-# program writes to standard output. If FILTER_PATTERNS is specified, this tag
-# will be ignored.
-#
-# Note that the filter must not add or remove lines; it is applied before the
-# code is scanned, but not when the output code is generated. If lines are added
-# or removed, the anchors will not be placed correctly.
-#
-# Note that for custom extensions or not directly supported extensions you also
-# need to set EXTENSION_MAPPING for the extension otherwise the files are not
-# properly processed by doxygen.
-
-INPUT_FILTER           =
-
-# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern
-# basis. Doxygen will compare the file name with each pattern and apply the
-# filter if there is a match. The filters are a list of the form: pattern=filter
-# (like *.cpp=my_cpp_filter). See INPUT_FILTER for further information on how
-# filters are used. If the FILTER_PATTERNS tag is empty or if none of the
-# patterns match the file name, INPUT_FILTER is applied.
-#
-# Note that for custom extensions or not directly supported extensions you also
-# need to set EXTENSION_MAPPING for the extension otherwise the files are not
-# properly processed by doxygen.
-
-FILTER_PATTERNS        =
-
-# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using
-# INPUT_FILTER) will also be used to filter the input files that are used for
-# producing the source files to browse (i.e. when SOURCE_BROWSER is set to YES).
-# The default value is: NO.
-
-FILTER_SOURCE_FILES    = NO
-
-# The FILTER_SOURCE_PATTERNS tag can be used to specify source filters per file
-# pattern. A pattern will override the setting for FILTER_PATTERNS (if any) and
-# it is also possible to disable source filtering for a specific pattern using
-# *.ext= (so without naming a filter).
-# This tag requires that the tag FILTER_SOURCE_FILES is set to YES.
-
-FILTER_SOURCE_PATTERNS =
-
-# If the USE_MDFILE_AS_MAINPAGE tag refers to the name of a markdown file that
-# is part of the input, its contents will be placed on the main page
-# (index.html). This can be useful if you have a project on for instance GitHub
-# and want to reuse the introduction page also for the doxygen output.
-
-USE_MDFILE_AS_MAINPAGE = ../../README.md
-
-#---------------------------------------------------------------------------
-# Configuration options related to source browsing
-#---------------------------------------------------------------------------
-
-# If the SOURCE_BROWSER tag is set to YES then a list of source files will be
-# generated. Documented entities will be cross-referenced with these sources.
-#
-# Note: To get rid of all source code in the generated output, make sure that
-# also VERBATIM_HEADERS is set to NO.
-# The default value is: NO.
-
-SOURCE_BROWSER         = YES
-
-# Setting the INLINE_SOURCES tag to YES will include the body of functions,
-# classes and enums directly into the documentation.
-# The default value is: NO.
-
-INLINE_SOURCES         = NO
-
-# Setting the STRIP_CODE_COMMENTS tag to YES will instruct doxygen to hide any
-# special comment blocks from generated source code fragments. Normal C, C++ and
-# Fortran comments will always remain visible.
-# The default value is: YES.
-
-STRIP_CODE_COMMENTS    = NO
-
-# If the REFERENCED_BY_RELATION tag is set to YES then for each documented
-# function all documented functions referencing it will be listed.
-# The default value is: NO.
-
-REFERENCED_BY_RELATION = NO
-
-# If the REFERENCES_RELATION tag is set to YES then for each documented function
-# all documented entities called/used by that function will be listed.
-# The default value is: NO.
-
-REFERENCES_RELATION    = NO
-
-# If the REFERENCES_LINK_SOURCE tag is set to YES and SOURCE_BROWSER tag is set
-# to YES then the hyperlinks from functions in REFERENCES_RELATION and
-# REFERENCED_BY_RELATION lists will link to the source code. Otherwise they will
-# link to the documentation.
-# The default value is: YES.
-
-REFERENCES_LINK_SOURCE = YES
-
-# If SOURCE_TOOLTIPS is enabled (the default) then hovering a hyperlink in the
-# source code will show a tooltip with additional information such as prototype,
-# brief description and links to the definition and documentation. Since this
-# will make the HTML file larger and loading of large files a bit slower, you
-# can opt to disable this feature.
-# The default value is: YES.
-# This tag requires that the tag SOURCE_BROWSER is set to YES.
-
-SOURCE_TOOLTIPS        = YES
-
-# If the USE_HTAGS tag is set to YES then the references to source code will
-# point to the HTML generated by the htags(1) tool instead of doxygen built-in
-# source browser. The htags tool is part of GNU's global source tagging system
-# (see http://www.gnu.org/software/global/global.html). You will need version
-# 4.8.6 or higher.
-#
-# To use it do the following:
-# - Install the latest version of global
-# - Enable SOURCE_BROWSER and USE_HTAGS in the config file
-# - Make sure the INPUT points to the root of the source tree
-# - Run doxygen as normal
-#
-# Doxygen will invoke htags (and that will in turn invoke gtags), so these
-# tools must be available from the command line (i.e. in the search path).
-#
-# The result: instead of the source browser generated by doxygen, the links to
-# source code will now point to the output of htags.
-# The default value is: NO.
-# This tag requires that the tag SOURCE_BROWSER is set to YES.
-
-USE_HTAGS              = NO
-
-# If the VERBATIM_HEADERS tag is set to YES then doxygen will generate a
-# verbatim copy of the header file for each class for which an include is
-# specified. Set to NO to disable this.
-# See also: Section \class.
-# The default value is: YES.
-
-VERBATIM_HEADERS       = YES
-
-# If the CLANG_ASSISTED_PARSING tag is set to YES then doxygen will use the
-# clang parser (see: http://clang.llvm.org/) for more accurate parsing at the
-# cost of reduced performance. This can be particularly helpful with template
-# rich C++ code for which doxygen's built-in parser lacks the necessary type
-# information.
-# Note: The availability of this option depends on whether or not doxygen was
-# generated with the -Duse-libclang=ON option for CMake.
-# The default value is: NO.
-
-CLANG_ASSISTED_PARSING = NO
-
-# If clang assisted parsing is enabled you can provide the compiler with command
-# line options that you would normally use when invoking the compiler. Note that
-# the include paths will already be set by doxygen for the files and directories
-# specified with INPUT and INCLUDE_PATH.
-# This tag requires that the tag CLANG_ASSISTED_PARSING is set to YES.
-
-CLANG_OPTIONS          =
-
-#---------------------------------------------------------------------------
-# Configuration options related to the alphabetical class index
-#---------------------------------------------------------------------------
-
-# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index of all
-# compounds will be generated. Enable this if the project contains a lot of
-# classes, structs, unions or interfaces.
-# The default value is: YES.
-
-ALPHABETICAL_INDEX     = NO
-
-# The COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns in
-# which the alphabetical index list will be split.
-# Minimum value: 1, maximum value: 20, default value: 5.
-# This tag requires that the tag ALPHABETICAL_INDEX is set to YES.
-
-COLS_IN_ALPHA_INDEX    = 5
-
-# In case all classes in a project start with a common prefix, all classes will
-# be put under the same header in the alphabetical index. The IGNORE_PREFIX tag
-# can be used to specify a prefix (or a list of prefixes) that should be ignored
-# while generating the index headers.
-# This tag requires that the tag ALPHABETICAL_INDEX is set to YES.
-
-IGNORE_PREFIX          =
-
-#---------------------------------------------------------------------------
-# Configuration options related to the HTML output
-#---------------------------------------------------------------------------
-
-# If the GENERATE_HTML tag is set to YES, doxygen will generate HTML output
-# The default value is: YES.
-
-GENERATE_HTML          = YES
-
-# The HTML_OUTPUT tag is used to specify where the HTML docs will be put. If a
-# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
-# it.
-# The default directory is: html.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-HTML_OUTPUT            = html
-
-# The HTML_FILE_EXTENSION tag can be used to specify the file extension for each
-# generated HTML page (for example: .htm, .php, .asp).
-# The default value is: .html.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-HTML_FILE_EXTENSION    = .html
-
-# The HTML_HEADER tag can be used to specify a user-defined HTML header file for
-# each generated HTML page. If the tag is left blank doxygen will generate a
-# standard header.
-#
-# To get valid HTML, the header file must include any scripts and style sheets
-# that doxygen needs, which depends on the configuration options used (e.g.
-# the setting GENERATE_TREEVIEW). It is highly recommended to start with a
-# default header using
-# doxygen -w html new_header.html new_footer.html new_stylesheet.css
-# YourConfigFile
-# and then modify the file new_header.html. See also section "Doxygen usage"
-# for information on how to generate the default header that doxygen normally
-# uses.
-# Note: The header is subject to change so you typically have to regenerate the
-# default header when upgrading to a newer version of doxygen. For a description
-# of the possible markers and block names see the documentation.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-HTML_HEADER            =
-
-# The HTML_FOOTER tag can be used to specify a user-defined HTML footer for each
-# generated HTML page. If the tag is left blank doxygen will generate a standard
-# footer. See HTML_HEADER for more information on how to generate a default
-# footer and what special commands can be used inside the footer. See also
-# section "Doxygen usage" for information on how to generate the default footer
-# that doxygen normally uses.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-HTML_FOOTER            =
-
-# The HTML_STYLESHEET tag can be used to specify a user-defined cascading style
-# sheet that is used by each HTML page. It can be used to fine-tune the look of
-# the HTML output. If left blank doxygen will generate a default style sheet.
-# See also section "Doxygen usage" for information on how to generate the style
-# sheet that doxygen normally uses.
-# Note: It is recommended to use HTML_EXTRA_STYLESHEET instead of this tag, as
-# it is more robust and this tag (HTML_STYLESHEET) will in the future become
-# obsolete.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-HTML_STYLESHEET        =
-
-# The HTML_EXTRA_STYLESHEET tag can be used to specify additional user-defined
-# cascading style sheets that are included after the standard style sheets
-# created by doxygen. Using this option one can overrule certain style aspects.
-# This is preferred over using HTML_STYLESHEET since it does not replace the
-# standard style sheet and is therefore more robust against future updates.
-# Doxygen will copy the style sheet files to the output directory.
-# Note: The order of the extra style sheet files is of importance (e.g. the last
-# style sheet in the list overrules the setting of the previous ones in the
-# list). For an example see the documentation.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-HTML_EXTRA_STYLESHEET  =
-
-# The HTML_EXTRA_FILES tag can be used to specify one or more extra images or
-# other source files which should be copied to the HTML output directory. Note
-# that these files will be copied to the base HTML output directory. Use the
-# $relpath^ marker in the HTML_HEADER and/or HTML_FOOTER files to load these
-# files. In the HTML_STYLESHEET file, use the file name only. Also note that the
-# files will be copied as-is; there are no commands or markers available.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-HTML_EXTRA_FILES       =
-
-# The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. Doxygen
-# will adjust the colors in the style sheet and background images according to
-# this color. Hue is specified as an angle on a colorwheel, see
-# http://en.wikipedia.org/wiki/Hue for more information. For instance the value
-# 0 represents red, 60 is yellow, 120 is green, 180 is cyan, 240 is blue, 300
-# purple, and 360 is red again.
-# Minimum value: 0, maximum value: 359, default value: 220.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-HTML_COLORSTYLE_HUE    = 220
-
-# The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of the colors
-# in the HTML output. For a value of 0 the output will use grayscales only. A
-# value of 255 will produce the most vivid colors.
-# Minimum value: 0, maximum value: 255, default value: 100.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-HTML_COLORSTYLE_SAT    = 100
-
-# The HTML_COLORSTYLE_GAMMA tag controls the gamma correction applied to the
-# luminance component of the colors in the HTML output. Values below 100
-# gradually make the output lighter, whereas values above 100 make the output
-# darker. The value divided by 100 is the actual gamma applied, so 80 represents
-# a gamma of 0.8, the value 220 represents a gamma of 2.2, and 100 does not
-# change the gamma.
-# Minimum value: 40, maximum value: 240, default value: 80.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-HTML_COLORSTYLE_GAMMA  = 80
-
-# If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML
-# page will contain the date and time when the page was generated. Setting this
-# to YES can help to show when doxygen was last run and thus if the
-# documentation is up to date.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-HTML_TIMESTAMP         = YES
-
-# If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML
-# documentation will contain sections that can be hidden and shown after the
-# page has loaded.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-HTML_DYNAMIC_SECTIONS  = YES
-
-# With HTML_INDEX_NUM_ENTRIES one can control the preferred number of entries
-# shown in the various tree structured indices initially; the user can expand
-# and collapse entries dynamically later on. Doxygen will expand the tree to
-# such a level that at most the specified number of entries are visible (unless
-# a fully collapsed tree already exceeds this amount). So setting the number of
-# entries to 1 will produce a fully collapsed tree by default. 0 is a special
-# value representing an infinite number of entries and will result in a fully
-# expanded tree by default.
-# Minimum value: 0, maximum value: 9999, default value: 100.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-HTML_INDEX_NUM_ENTRIES = 100
-
-# If the GENERATE_DOCSET tag is set to YES, additional index files will be
-# generated that can be used as input for Apple's Xcode 3 integrated development
-# environment (see: http://developer.apple.com/tools/xcode/), introduced with
-# OSX 10.5 (Leopard). To create a documentation set, doxygen will generate a
-# Makefile in the HTML output directory. Running make will produce the docset in
-# that directory and running make install will install the docset in
-# ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find it at
-# startup. See http://developer.apple.com/tools/creatingdocsetswithdoxygen.html
-# for more information.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-GENERATE_DOCSET        = NO
-
-# This tag determines the name of the docset feed. A documentation feed provides
-# an umbrella under which multiple documentation sets from a single provider
-# (such as a company or product suite) can be grouped.
-# The default value is: Doxygen generated docs.
-# This tag requires that the tag GENERATE_DOCSET is set to YES.
-
-DOCSET_FEEDNAME        = "Doxygen generated docs"
-
-# This tag specifies a string that should uniquely identify the documentation
-# set bundle. This should be a reverse domain-name style string, e.g.
-# com.mycompany.MyDocSet. Doxygen will append .docset to the name.
-# The default value is: org.doxygen.Project.
-# This tag requires that the tag GENERATE_DOCSET is set to YES.
-
-DOCSET_BUNDLE_ID       = org.doxygen.Project
-
-# The DOCSET_PUBLISHER_ID tag specifies a string that should uniquely identify
-# the documentation publisher. This should be a reverse domain-name style
-# string, e.g. com.mycompany.MyDocSet.documentation.
-# The default value is: org.doxygen.Publisher.
-# This tag requires that the tag GENERATE_DOCSET is set to YES.
-
-DOCSET_PUBLISHER_ID    = org.doxygen.Publisher
-
-# The DOCSET_PUBLISHER_NAME tag identifies the documentation publisher.
-# The default value is: Publisher.
-# This tag requires that the tag GENERATE_DOCSET is set to YES.
-
-DOCSET_PUBLISHER_NAME  = Publisher
-
-# If the GENERATE_HTMLHELP tag is set to YES then doxygen generates three
-# additional HTML index files: index.hhp, index.hhc, and index.hhk. The
-# index.hhp is a project file that can be read by Microsoft's HTML Help Workshop
-# (see: http://www.microsoft.com/en-us/download/details.aspx?id=21138) on
-# Windows.
-#
-# The HTML Help Workshop contains a compiler that can convert all HTML output
-# generated by doxygen into a single compiled HTML file (.chm). Compiled HTML
-# files are now used as the Windows 98 help format, and will replace the old
-# Windows help format (.hlp) on all Windows platforms in the future. Compressed
-# HTML files also contain an index, a table of contents, and you can search for
-# words in the documentation. The HTML workshop also contains a viewer for
-# compressed HTML files.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-GENERATE_HTMLHELP      = NO
-
-# The CHM_FILE tag can be used to specify the file name of the resulting .chm
-# file. You can add a path in front of the file if the result should not be
-# written to the html output directory.
-# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
-
-CHM_FILE               =
-
-# The HHC_LOCATION tag can be used to specify the location (absolute path
-# including file name) of the HTML help compiler (hhc.exe). If non-empty,
-# doxygen will try to run the HTML help compiler on the generated index.hhp.
-# The file has to be specified with full path.
-# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
-
-HHC_LOCATION           =
-
-# The GENERATE_CHI flag controls if a separate .chi index file is generated
-# (YES) or that it should be included in the master .chm file (NO).
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
-
-GENERATE_CHI           = NO
-
-# The CHM_INDEX_ENCODING is used to encode HtmlHelp index (hhk), content (hhc)
-# and project file content.
-# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
-
-CHM_INDEX_ENCODING     =
-
-# The BINARY_TOC flag controls whether a binary table of contents is generated
-# (YES) or a normal table of contents (NO) in the .chm file. Furthermore it
-# enables the Previous and Next buttons.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
-
-BINARY_TOC             = NO
-
-# The TOC_EXPAND flag can be set to YES to add extra items for group members to
-# the table of contents of the HTML help documentation and to the tree view.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
-
-TOC_EXPAND             = NO
-
-# If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and
-# QHP_VIRTUAL_FOLDER are set, an additional index file will be generated that
-# can be used as input for Qt's qhelpgenerator to generate a Qt Compressed Help
-# (.qch) of the generated HTML documentation.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-GENERATE_QHP           = NO
-
-# If the QHG_LOCATION tag is specified, the QCH_FILE tag can be used to specify
-# the file name of the resulting .qch file. The path specified is relative to
-# the HTML output folder.
-# This tag requires that the tag GENERATE_QHP is set to YES.
-
-QCH_FILE               =
-
-# The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help
-# Project output. For more information please see Qt Help Project / Namespace
-# (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#namespace).
-# The default value is: org.doxygen.Project.
-# This tag requires that the tag GENERATE_QHP is set to YES.
-
-QHP_NAMESPACE          = org.doxygen.Project
-
-# The QHP_VIRTUAL_FOLDER tag specifies the virtual folder to use when generating
-# Qt Help Project output. For more information please see Qt Help Project / Virtual
-# Folders (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#virtual-
-# folders).
-# The default value is: doc.
-# This tag requires that the tag GENERATE_QHP is set to YES.
-
-QHP_VIRTUAL_FOLDER     = doc
-
-# If the QHP_CUST_FILTER_NAME tag is set, it specifies the name of a custom
-# filter to add. For more information please see Qt Help Project / Custom
-# Filters (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom-
-# filters).
-# This tag requires that the tag GENERATE_QHP is set to YES.
-
-QHP_CUST_FILTER_NAME   =
-
-# The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the
-# custom filter to add. For more information please see Qt Help Project / Custom
-# Filters (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom-
-# filters).
-# This tag requires that the tag GENERATE_QHP is set to YES.
-
-QHP_CUST_FILTER_ATTRS  =
-
-# The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this
-# project's filter section matches. Qt Help Project / Filter Attributes (see:
-# http://qt-project.org/doc/qt-4.8/qthelpproject.html#filter-attributes).
-# This tag requires that the tag GENERATE_QHP is set to YES.
-
-QHP_SECT_FILTER_ATTRS  =
-
-# The QHG_LOCATION tag can be used to specify the location of Qt's
-# qhelpgenerator. If non-empty doxygen will try to run qhelpgenerator on the
-# generated .qhp file.
-# This tag requires that the tag GENERATE_QHP is set to YES.
-
-QHG_LOCATION           =
-
-# If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files will be
-# generated, together with the HTML files, they form an Eclipse help plugin. To
-# install this plugin and make it available under the help contents menu in
-# Eclipse, the contents of the directory containing the HTML and XML files needs
-# to be copied into the plugins directory of eclipse. The name of the directory
-# within the plugins directory should be the same as the ECLIPSE_DOC_ID value.
-# After copying Eclipse needs to be restarted before the help appears.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-GENERATE_ECLIPSEHELP   = NO
-
-# A unique identifier for the Eclipse help plugin. When installing the plugin
-# the directory name containing the HTML and XML files should also have this
-# name. Each documentation set should have its own identifier.
-# The default value is: org.doxygen.Project.
-# This tag requires that the tag GENERATE_ECLIPSEHELP is set to YES.
-
-ECLIPSE_DOC_ID         = org.doxygen.Project
-
-# If you want full control over the layout of the generated HTML pages it might
-# be necessary to disable the index and replace it with your own. The
-# DISABLE_INDEX tag can be used to turn on/off the condensed index (tabs) at top
-# of each HTML page. A value of NO enables the index and the value YES disables
-# it. Since the tabs in the index contain the same information as the navigation
-# tree, you can set this option to YES if you also set GENERATE_TREEVIEW to YES.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-DISABLE_INDEX          = NO
-
-# The GENERATE_TREEVIEW tag is used to specify whether a tree-like index
-# structure should be generated to display hierarchical information. If the tag
-# value is set to YES, a side panel will be generated containing a tree-like
-# index structure (just like the one that is generated for HTML Help). For this
-# to work a browser that supports JavaScript, DHTML, CSS and frames is required
-# (i.e. any modern browser). Windows users are probably better off using the
-# HTML help feature. Via custom style sheets (see HTML_EXTRA_STYLESHEET) one can
-# further fine-tune the look of the index. As an example, the default style
-# sheet generated by doxygen has an example that shows how to put an image at
-# the root of the tree instead of the PROJECT_NAME. Since the tree basically has
-# the same information as the tab index, you could consider setting
-# DISABLE_INDEX to YES when enabling this option.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-GENERATE_TREEVIEW      = YES
-
-# The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values that
-# doxygen will group on one line in the generated HTML documentation.
-#
-# Note that a value of 0 will completely suppress the enum values from appearing
-# in the overview section.
-# Minimum value: 0, maximum value: 20, default value: 4.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-ENUM_VALUES_PER_LINE   = 1
-
-# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be used
-# to set the initial width (in pixels) of the frame in which the tree is shown.
-# Minimum value: 0, maximum value: 1500, default value: 250.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-TREEVIEW_WIDTH         = 250
-
-# If the EXT_LINKS_IN_WINDOW option is set to YES, doxygen will open links to
-# external symbols imported via tag files in a separate window.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-EXT_LINKS_IN_WINDOW    = NO
-
-# Use this tag to change the font size of LaTeX formulas included as images in
-# the HTML documentation. When you change the font size after a successful
-# doxygen run you need to manually remove any form_*.png images from the HTML
-# output directory to force them to be regenerated.
-# Minimum value: 8, maximum value: 50, default value: 10.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-FORMULA_FONTSIZE       = 10
-
-# Use the FORMULA_TRANSPARENT tag to determine whether or not the images
-# generated for formulas are transparent PNGs. Transparent PNGs are not
-# supported properly for IE 6.0, but are supported on all modern browsers.
-#
-# Note that when changing this option you need to delete any form_*.png files in
-# the HTML output directory before the changes have effect.
-# The default value is: YES.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-FORMULA_TRANSPARENT    = YES
-
-# Enable the USE_MATHJAX option to render LaTeX formulas using MathJax (see
-# http://www.mathjax.org) which uses client side Javascript for the rendering
-# instead of using pre-rendered bitmaps. Use this if you do not have LaTeX
-# installed or if you want formulas to look prettier in the HTML output. When
-# enabled you may also need to install MathJax separately and configure the path
-# to it using the MATHJAX_RELPATH option.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-USE_MATHJAX            = NO
-
-# When MathJax is enabled you can set the default output format to be used for
-# the MathJax output. See the MathJax site (see:
-# http://docs.mathjax.org/en/latest/output.html) for more details.
-# Possible values are: HTML-CSS (which is slower, but has the best
-# compatibility), NativeMML (i.e. MathML) and SVG.
-# The default value is: HTML-CSS.
-# This tag requires that the tag USE_MATHJAX is set to YES.
-
-MATHJAX_FORMAT         = HTML-CSS
-
-# When MathJax is enabled you need to specify the location relative to the HTML
-# output directory using the MATHJAX_RELPATH option. The destination directory
-# should contain the MathJax.js script. For instance, if the mathjax directory
-# is located at the same level as the HTML output directory, then
-# MATHJAX_RELPATH should be ../mathjax. The default value points to the MathJax
-# Content Delivery Network so you can quickly see the result without installing
-# MathJax. However, it is strongly recommended to install a local copy of
-# MathJax from http://www.mathjax.org before deployment.
-# The default value is: http://cdn.mathjax.org/mathjax/latest.
-# This tag requires that the tag USE_MATHJAX is set to YES.
-
-MATHJAX_RELPATH        = http://cdn.mathjax.org/mathjax/latest
-
-# The MATHJAX_EXTENSIONS tag can be used to specify one or more MathJax
-# extension names that should be enabled during MathJax rendering. For example
-# MATHJAX_EXTENSIONS = TeX/AMSmath TeX/AMSsymbols
-# This tag requires that the tag USE_MATHJAX is set to YES.
-
-MATHJAX_EXTENSIONS     =
-
-# The MATHJAX_CODEFILE tag can be used to specify a file with javascript pieces
-# of code that will be used on startup of the MathJax code. See the MathJax site
-# (see: http://docs.mathjax.org/en/latest/output.html) for more details. For an
-# example see the documentation.
-# This tag requires that the tag USE_MATHJAX is set to YES.
-
-MATHJAX_CODEFILE       =
-
-# When the SEARCHENGINE tag is enabled doxygen will generate a search box for
-# the HTML output. The underlying search engine uses javascript and DHTML and
-# should work on any modern browser. Note that when using HTML help
-# (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets (GENERATE_DOCSET)
-# there is already a search function so this one should typically be disabled.
-# For large projects the javascript based search engine can be slow; then
-# enabling SERVER_BASED_SEARCH may provide a better solution. It is possible to
-# search using the keyboard; to jump to the search box use <access key> + S
-# (what the <access key> is depends on the OS and browser, but it is typically
-# <CTRL>, <ALT>/<option>, or both). Inside the search box use the <cursor down
-# key> to jump into the search results window, the results can be navigated
-# using the <cursor keys>. Press <Enter> to select an item or <escape> to cancel
-# the search. The filter options can be selected when the cursor is inside the
-# search box by pressing <Shift>+<cursor down>. Also here use the <cursor keys>
-# to select a filter and <Enter> or <escape> to activate or cancel the filter
-# option.
-# The default value is: YES.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-SEARCHENGINE           = YES
-
-# When the SERVER_BASED_SEARCH tag is enabled the search engine will be
-# implemented using a web server instead of a web client using Javascript. There
-# are two flavors of web server based searching depending on the EXTERNAL_SEARCH
-# setting. When disabled, doxygen will generate a PHP script for searching and
-# an index file used by the script. When EXTERNAL_SEARCH is enabled the indexing
-# and searching needs to be provided by external tools. See the section
-# "External Indexing and Searching" for details.
-# The default value is: NO.
-# This tag requires that the tag SEARCHENGINE is set to YES.
-
-SERVER_BASED_SEARCH    = NO
-
-# When EXTERNAL_SEARCH tag is enabled doxygen will no longer generate the PHP
-# script for searching. Instead the search results are written to an XML file
-# which needs to be processed by an external indexer. Doxygen will invoke an
-# external search engine pointed to by the SEARCHENGINE_URL option to obtain the
-# search results.
-#
-# Doxygen ships with an example indexer (doxyindexer) and search engine
-# (doxysearch.cgi) which are based on the open source search engine library
-# Xapian (see: http://xapian.org/).
-#
-# See the section "External Indexing and Searching" for details.
-# The default value is: NO.
-# This tag requires that the tag SEARCHENGINE is set to YES.
-
-EXTERNAL_SEARCH        = NO
-
-# The SEARCHENGINE_URL should point to a search engine hosted by a web server
-# which will return the search results when EXTERNAL_SEARCH is enabled.
-#
-# Doxygen ships with an example indexer (doxyindexer) and search engine
-# (doxysearch.cgi) which are based on the open source search engine library
-# Xapian (see: http://xapian.org/). See the section "External Indexing and
-# Searching" for details.
-# This tag requires that the tag SEARCHENGINE is set to YES.
-
-SEARCHENGINE_URL       =
-
-# When SERVER_BASED_SEARCH and EXTERNAL_SEARCH are both enabled the unindexed
-# search data is written to a file for indexing by an external tool. With the
-# SEARCHDATA_FILE tag the name of this file can be specified.
-# The default file is: searchdata.xml.
-# This tag requires that the tag SEARCHENGINE is set to YES.
-
-SEARCHDATA_FILE        = searchdata.xml
-
-# When SERVER_BASED_SEARCH and EXTERNAL_SEARCH are both enabled the
-# EXTERNAL_SEARCH_ID tag can be used as an identifier for the project. This is
-# useful in combination with EXTRA_SEARCH_MAPPINGS to search through multiple
-# projects and redirect the results back to the right project.
-# This tag requires that the tag SEARCHENGINE is set to YES.
-
-EXTERNAL_SEARCH_ID     =
-
-# The EXTRA_SEARCH_MAPPINGS tag can be used to enable searching through doxygen
-# projects other than the one defined by this configuration file, but that are
-# all added to the same external search index. Each project needs to have a
-# unique id set via EXTERNAL_SEARCH_ID. The search mapping then maps each id
-# to a relative location where the documentation can be found. The format is:
-# EXTRA_SEARCH_MAPPINGS = tagname1=loc1 tagname2=loc2 ...
-# This tag requires that the tag SEARCHENGINE is set to YES.
-
-EXTRA_SEARCH_MAPPINGS  =
-
-#---------------------------------------------------------------------------
-# Configuration options related to the LaTeX output
-#---------------------------------------------------------------------------
-
-# If the GENERATE_LATEX tag is set to YES, doxygen will generate LaTeX output.
-# The default value is: YES.
-
-GENERATE_LATEX         = NO
-
-# The LATEX_OUTPUT tag is used to specify where the LaTeX docs will be put. If a
-# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
-# it.
-# The default directory is: latex.
-# This tag requires that the tag GENERATE_LATEX is set to YES.
-
-LATEX_OUTPUT           = latex
-
-# The LATEX_CMD_NAME tag can be used to specify the LaTeX command name to be
-# invoked.
-#
-# Note that when enabling USE_PDFLATEX this option is only used for generating
-# bitmaps for formulas in the HTML output, but not in the Makefile that is
-# written to the output directory.
-# The default file is: latex.
-# This tag requires that the tag GENERATE_LATEX is set to YES.
-
-LATEX_CMD_NAME         = latex
-
-# The MAKEINDEX_CMD_NAME tag can be used to specify the command name to generate
-# index for LaTeX.
-# The default file is: makeindex.
-# This tag requires that the tag GENERATE_LATEX is set to YES.
-
-MAKEINDEX_CMD_NAME     = makeindex
-
-# If the COMPACT_LATEX tag is set to YES, doxygen generates more compact LaTeX
-# documents. This may be useful for small projects and may help to save some
-# trees in general.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_LATEX is set to YES.
-
-COMPACT_LATEX          = NO
-
-# The PAPER_TYPE tag can be used to set the paper type that is used by the
-# printer.
-# Possible values are: a4 (210 x 297 mm), letter (8.5 x 11 inches), legal (8.5 x
-# 14 inches) and executive (7.25 x 10.5 inches).
-# The default value is: a4.
-# This tag requires that the tag GENERATE_LATEX is set to YES.
-
-PAPER_TYPE             = a4
-
-# The EXTRA_PACKAGES tag can be used to specify one or more LaTeX package names
-# that should be included in the LaTeX output. The package can be specified just
-# by its name or with the correct syntax as to be used with the LaTeX
-# \usepackage command. To get the times font for instance you can specify:
-# EXTRA_PACKAGES=times or EXTRA_PACKAGES={times}
-# To use the option intlimits with the amsmath package you can specify:
-# EXTRA_PACKAGES=[intlimits]{amsmath}
-# If left blank no extra packages will be included.
-# This tag requires that the tag GENERATE_LATEX is set to YES.
-
-EXTRA_PACKAGES         =
-
-# The LATEX_HEADER tag can be used to specify a personal LaTeX header for the
-# generated LaTeX document. The header should contain everything until the first
-# chapter. If it is left blank doxygen will generate a standard header. See
-# section "Doxygen usage" for information on how to let doxygen write the
-# default header to a separate file.
-#
-# Note: Only use a user-defined header if you know what you are doing! The
-# following commands have a special meaning inside the header: $title,
-# $datetime, $date, $doxygenversion, $projectname, $projectnumber,
-# $projectbrief, $projectlogo. Doxygen will replace $title with the empty
-# string, for the replacement values of the other commands the user is referred
-# to HTML_HEADER.
-# This tag requires that the tag GENERATE_LATEX is set to YES.
-
-LATEX_HEADER           =
-
-# The LATEX_FOOTER tag can be used to specify a personal LaTeX footer for the
-# generated LaTeX document. The footer should contain everything after the last
-# chapter. If it is left blank doxygen will generate a standard footer. See
-# LATEX_HEADER for more information on how to generate a default footer and what
-# special commands can be used inside the footer.
-#
-# Note: Only use a user-defined footer if you know what you are doing!
-# This tag requires that the tag GENERATE_LATEX is set to YES.
-
-LATEX_FOOTER           =
-
-# The LATEX_EXTRA_STYLESHEET tag can be used to specify additional user-defined
-# LaTeX style sheets that are included after the standard style sheets created
-# by doxygen. Using this option one can overrule certain style aspects. Doxygen
-# will copy the style sheet files to the output directory.
-# Note: The order of the extra style sheet files is of importance (e.g. the last
-# style sheet in the list overrules the setting of the previous ones in the
-# list).
-# This tag requires that the tag GENERATE_LATEX is set to YES.
-
-LATEX_EXTRA_STYLESHEET =
-
-# The LATEX_EXTRA_FILES tag can be used to specify one or more extra images or
-# other source files which should be copied to the LATEX_OUTPUT output
-# directory. Note that the files will be copied as-is; there are no commands or
-# markers available.
-# This tag requires that the tag GENERATE_LATEX is set to YES.
-
-LATEX_EXTRA_FILES      =
-
-# If the PDF_HYPERLINKS tag is set to YES, the LaTeX that is generated is
-# prepared for conversion to PDF (using ps2pdf or pdflatex). The PDF file will
-# contain links (just like the HTML output) instead of page references. This
-# makes the output suitable for online browsing using a PDF viewer.
-# The default value is: YES.
-# This tag requires that the tag GENERATE_LATEX is set to YES.
-
-PDF_HYPERLINKS         = YES
-
-# If the USE_PDFLATEX tag is set to YES, doxygen will use pdflatex to generate
-# the PDF file directly from the LaTeX files. Set this option to YES, to get a
-# higher quality PDF documentation.
-# The default value is: YES.
-# This tag requires that the tag GENERATE_LATEX is set to YES.
-
-USE_PDFLATEX           = YES
-
-# If the LATEX_BATCHMODE tag is set to YES, doxygen will add the \batchmode
-# command to the generated LaTeX files. This will instruct LaTeX to keep running
-# if errors occur, instead of asking the user for help. This option is also used
-# when generating formulas in HTML.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_LATEX is set to YES.
-
-LATEX_BATCHMODE        = NO
-
-# If the LATEX_HIDE_INDICES tag is set to YES then doxygen will not include the
-# index chapters (such as File Index, Compound Index, etc.) in the output.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_LATEX is set to YES.
-
-LATEX_HIDE_INDICES     = NO
-
-# If the LATEX_SOURCE_CODE tag is set to YES then doxygen will include source
-# code with syntax highlighting in the LaTeX output.
-#
-# Note that which sources are shown also depends on other settings such as
-# SOURCE_BROWSER.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_LATEX is set to YES.
-
-LATEX_SOURCE_CODE      = NO
-
-# The LATEX_BIB_STYLE tag can be used to specify the style to use for the
-# bibliography, e.g. plainnat, or ieeetr. See
-# http://en.wikipedia.org/wiki/BibTeX and \cite for more info.
-# The default value is: plain.
-# This tag requires that the tag GENERATE_LATEX is set to YES.
-
-LATEX_BIB_STYLE        = plain
-
-# If the LATEX_TIMESTAMP tag is set to YES then the footer of each generated
-# page will contain the date and time when the page was generated. Setting this
-# to NO can help when comparing the output of multiple runs.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_LATEX is set to YES.
-
-LATEX_TIMESTAMP        = NO
-
-#---------------------------------------------------------------------------
-# Configuration options related to the RTF output
-#---------------------------------------------------------------------------
-
-# If the GENERATE_RTF tag is set to YES, doxygen will generate RTF output. The
-# RTF output is optimized for Word 97 and may not look too pretty with other RTF
-# readers/editors.
-# The default value is: NO.
-
-GENERATE_RTF           = NO
-
-# The RTF_OUTPUT tag is used to specify where the RTF docs will be put. If a
-# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
-# it.
-# The default directory is: rtf.
-# This tag requires that the tag GENERATE_RTF is set to YES.
-
-RTF_OUTPUT             = rtf
-
-# If the COMPACT_RTF tag is set to YES, doxygen generates more compact RTF
-# documents. This may be useful for small projects and may help to save some
-# trees in general.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_RTF is set to YES.
-
-COMPACT_RTF            = NO
-
-# If the RTF_HYPERLINKS tag is set to YES, the RTF that is generated will
-# contain hyperlink fields. The RTF file will contain links (just like the HTML
-# output) instead of page references. This makes the output suitable for online
-# browsing using Word or some other Word compatible readers that support those
-# fields.
-#
-# Note: WordPad (write) and others do not support links.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_RTF is set to YES.
-
-RTF_HYPERLINKS         = NO
-
-# Load stylesheet definitions from file. Syntax is similar to doxygen's config
-# file, i.e. a series of assignments. You only have to provide replacements,
-# missing definitions are set to their default value.
-#
-# See also section "Doxygen usage" for information on how to generate the
-# default style sheet that doxygen normally uses.
-# This tag requires that the tag GENERATE_RTF is set to YES.
-
-RTF_STYLESHEET_FILE    =
-
-# Set optional variables used in the generation of an RTF document. Syntax is
-# similar to doxygen's config file. A template extensions file can be generated
-# using doxygen -e rtf extensionFile.
-# This tag requires that the tag GENERATE_RTF is set to YES.
-
-RTF_EXTENSIONS_FILE    =
-
-# If the RTF_SOURCE_CODE tag is set to YES then doxygen will include source code
-# with syntax highlighting in the RTF output.
-#
-# Note that which sources are shown also depends on other settings such as
-# SOURCE_BROWSER.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_RTF is set to YES.
-
-RTF_SOURCE_CODE        = NO
-
-#---------------------------------------------------------------------------
-# Configuration options related to the man page output
-#---------------------------------------------------------------------------
-
-# If the GENERATE_MAN tag is set to YES, doxygen will generate man pages for
-# classes and files.
-# The default value is: NO.
-
-GENERATE_MAN           = NO
-
-# The MAN_OUTPUT tag is used to specify where the man pages will be put. If a
-# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
-# it. A directory man3 will be created inside the directory specified by
-# MAN_OUTPUT.
-# The default directory is: man.
-# This tag requires that the tag GENERATE_MAN is set to YES.
-
-MAN_OUTPUT             = man
-
-# The MAN_EXTENSION tag determines the extension that is added to the generated
-# man pages. In case the manual section does not start with a number, the number
-# 3 is prepended. The dot (.) at the beginning of the MAN_EXTENSION tag is
-# optional.
-# The default value is: .3.
-# This tag requires that the tag GENERATE_MAN is set to YES.
-
-MAN_EXTENSION          = .3
-
-# The MAN_SUBDIR tag determines the name of the directory created within
-# MAN_OUTPUT in which the man pages are placed. It defaults to man followed by
-# MAN_EXTENSION with the initial . removed.
-# This tag requires that the tag GENERATE_MAN is set to YES.
-
-MAN_SUBDIR             =
-
-# If the MAN_LINKS tag is set to YES and doxygen generates man output, then it
-# will generate one additional man file for each entity documented in the real
-# man page(s). These additional files only source the real man page, but without
-# them the man command would be unable to find the correct page.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_MAN is set to YES.
-
-MAN_LINKS              = NO
-
-#---------------------------------------------------------------------------
-# Configuration options related to the XML output
-#---------------------------------------------------------------------------
-
-# If the GENERATE_XML tag is set to YES, doxygen will generate an XML file that
-# captures the structure of the code including all documentation.
-# The default value is: NO.
-
-GENERATE_XML           = NO
-
-# The XML_OUTPUT tag is used to specify where the XML pages will be put. If a
-# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
-# it.
-# The default directory is: xml.
-# This tag requires that the tag GENERATE_XML is set to YES.
-
-XML_OUTPUT             = xml
-
-# If the XML_PROGRAMLISTING tag is set to YES, doxygen will dump the program
-# listings (including syntax highlighting and cross-referencing information) to
-# the XML output. Note that enabling this will significantly increase the size
-# of the XML output.
-# The default value is: YES.
-# This tag requires that the tag GENERATE_XML is set to YES.
-
-XML_PROGRAMLISTING     = YES
-
-#---------------------------------------------------------------------------
-# Configuration options related to the DOCBOOK output
-#---------------------------------------------------------------------------
-
-# If the GENERATE_DOCBOOK tag is set to YES, doxygen will generate Docbook files
-# that can be used to generate PDF.
-# The default value is: NO.
-
-GENERATE_DOCBOOK       = NO
-
-# The DOCBOOK_OUTPUT tag is used to specify where the Docbook pages will be put.
-# If a relative path is entered the value of OUTPUT_DIRECTORY will be put in
-# front of it.
-# The default directory is: docbook.
-# This tag requires that the tag GENERATE_DOCBOOK is set to YES.
-
-DOCBOOK_OUTPUT         = docbook
-
-# If the DOCBOOK_PROGRAMLISTING tag is set to YES, doxygen will include the
-# program listings (including syntax highlighting and cross-referencing
-# information) to the DOCBOOK output. Note that enabling this will significantly
-# increase the size of the DOCBOOK output.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_DOCBOOK is set to YES.
-
-DOCBOOK_PROGRAMLISTING = NO
-
-#---------------------------------------------------------------------------
-# Configuration options for the AutoGen Definitions output
-#---------------------------------------------------------------------------
-
-# If the GENERATE_AUTOGEN_DEF tag is set to YES, doxygen will generate an
-# AutoGen Definitions (see http://autogen.sf.net) file that captures the
-# structure of the code including all documentation. Note that this feature is
-# still experimental and incomplete at the moment.
-# The default value is: NO.
-
-GENERATE_AUTOGEN_DEF   = NO
-
-#---------------------------------------------------------------------------
-# Configuration options related to the Perl module output
-#---------------------------------------------------------------------------
-
-# If the GENERATE_PERLMOD tag is set to YES, doxygen will generate a Perl module
-# file that captures the structure of the code including all documentation.
-#
-# Note that this feature is still experimental and incomplete at the moment.
-# The default value is: NO.
-
-GENERATE_PERLMOD       = NO
-
-# If the PERLMOD_LATEX tag is set to YES, doxygen will generate the necessary
-# Makefile rules, Perl scripts and LaTeX code to be able to generate PDF and DVI
-# output from the Perl module output.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_PERLMOD is set to YES.
-
-PERLMOD_LATEX          = NO
-
-# If the PERLMOD_PRETTY tag is set to YES, the Perl module output will be nicely
-# formatted so it can be parsed by a human reader. This is useful if you want to
-# understand what is going on. On the other hand, if this tag is set to NO, the
-# size of the Perl module output will be much smaller and Perl will parse it
-# just the same.
-# The default value is: YES.
-# This tag requires that the tag GENERATE_PERLMOD is set to YES.
-
-PERLMOD_PRETTY         = YES
-
-# The names of the make variables in the generated doxyrules.make file are
-# prefixed with the string contained in PERLMOD_MAKEVAR_PREFIX. This is useful
-# so different doxyrules.make files included by the same Makefile don't
-# overwrite each other's variables.
-# This tag requires that the tag GENERATE_PERLMOD is set to YES.
-
-PERLMOD_MAKEVAR_PREFIX =
-
-#---------------------------------------------------------------------------
-# Configuration options related to the preprocessor
-#---------------------------------------------------------------------------
-
-# If the ENABLE_PREPROCESSING tag is set to YES, doxygen will evaluate all
-# C-preprocessor directives found in the sources and include files.
-# The default value is: YES.
-
-ENABLE_PREPROCESSING   = YES
-
-# If the MACRO_EXPANSION tag is set to YES, doxygen will expand all macro names
-# in the source code. If set to NO, only conditional compilation will be
-# performed. Macro expansion can be done in a controlled way by setting
-# EXPAND_ONLY_PREDEF to YES.
-# The default value is: NO.
-# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
-
-MACRO_EXPANSION        = YES
-
-# If the EXPAND_ONLY_PREDEF and MACRO_EXPANSION tags are both set to YES then
-# the macro expansion is limited to the macros specified with the PREDEFINED and
-# EXPAND_AS_DEFINED tags.
-# The default value is: NO.
-# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
-
-EXPAND_ONLY_PREDEF     = NO
-
-# If the SEARCH_INCLUDES tag is set to YES, the include files in the
-# INCLUDE_PATH will be searched if a #include is found.
-# The default value is: YES.
-# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
-
-SEARCH_INCLUDES        = YES
-
-# The INCLUDE_PATH tag can be used to specify one or more directories that
-# contain include files that are not input files but should be processed by the
-# preprocessor.
-# This tag requires that the tag SEARCH_INCLUDES is set to YES.
-
-INCLUDE_PATH           =
-
-# You can use the INCLUDE_FILE_PATTERNS tag to specify one or more wildcard
-# patterns (like *.h and *.hpp) to filter out the header-files in the
-# directories. If left blank, the patterns specified with FILE_PATTERNS will be
-# used.
-# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
-
-INCLUDE_FILE_PATTERNS  =
-
-# The PREDEFINED tag can be used to specify one or more macro names that are
-# defined before the preprocessor is started (similar to the -D option of e.g.
-# gcc). The argument of the tag is a list of macros of the form: name or
-# name=definition (no spaces). If the definition and the "=" are omitted, "=1"
-# is assumed. To prevent a macro definition from being undefined via #undef or
-# recursively expanded use the := operator instead of the = operator.
-# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
-
-PREDEFINED             = ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED \
-                         ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED \
-                         ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLED \
-                         ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLED \
-                         ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLED \
-                         ALPAKA_ACC_CPU_BT_OMP4_ENABLED \
-                         ALPAKA_ACC_GPU_CUDA_ENABLED \
-                         __CUDACC__ \
-                         _OPENMP=201307
-
-# If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then this
-# tag can be used to specify a list of macro names that should be expanded. The
-# macro definition that is found in the sources will be used. Use the PREDEFINED
-# tag if you want to use a different macro definition that overrules the
-# definition found in the source code.
-# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
-
-EXPAND_AS_DEFINED      =
-
-# If the SKIP_FUNCTION_MACROS tag is set to YES then doxygen's preprocessor will
-# remove all references to function-like macros that are alone on a line, have
-# an all uppercase name, and do not end with a semicolon. Such function macros
-# are typically used for boiler-plate code, and will confuse the parser if not
-# removed.
-# The default value is: YES.
-# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
-
-SKIP_FUNCTION_MACROS   = YES
-
-#---------------------------------------------------------------------------
-# Configuration options related to external references
-#---------------------------------------------------------------------------
-
-# The TAGFILES tag can be used to specify one or more tag files. For each tag
-# file the location of the external documentation should be added. The format of
-# a tag file without this location is as follows:
-# TAGFILES = file1 file2 ...
-# Adding location for the tag files is done as follows:
-# TAGFILES = file1=loc1 "file2 = loc2" ...
-# where loc1 and loc2 can be relative or absolute paths or URLs. See the
-# section "Linking to external documentation" for more information about the use
-# of tag files.
-# Note: Each tag file must have a unique name (where the name does NOT include
-# the path). If a tag file is not located in the directory in which doxygen is
-# run, you must also specify the path to the tagfile here.
-
-TAGFILES               =
-
-# When a file name is specified after GENERATE_TAGFILE, doxygen will create a
-# tag file that is based on the input files it reads. See section "Linking to
-# external documentation" for more information about the usage of tag files.
-
-GENERATE_TAGFILE       =
-
-# If the ALLEXTERNALS tag is set to YES, all external classes will be listed in
-# the class index. If set to NO, only the inherited external classes will be
-# listed.
-# The default value is: NO.
-
-ALLEXTERNALS           = NO
-
-# If the EXTERNAL_GROUPS tag is set to YES, all external groups will be listed
-# in the modules index. If set to NO, only the current project's groups will be
-# listed.
-# The default value is: YES.
-
-EXTERNAL_GROUPS        = YES
-
-# If the EXTERNAL_PAGES tag is set to YES, all external pages will be listed in
-# the related pages index. If set to NO, only the current project's pages will
-# be listed.
-# The default value is: YES.
-
-EXTERNAL_PAGES         = YES
-
-# The PERL_PATH should be the absolute path and name of the perl script
-# interpreter (i.e. the result of 'which perl').
-# The default file (with absolute path) is: /usr/bin/perl.
-
-PERL_PATH              = /usr/bin/perl
-
-#---------------------------------------------------------------------------
-# Configuration options related to the dot tool
-#---------------------------------------------------------------------------
-
-# If the CLASS_DIAGRAMS tag is set to YES, doxygen will generate a class diagram
-# (in HTML and LaTeX) for classes with base or super classes. Setting the tag to
-# NO turns the diagrams off. Note that this option also works with HAVE_DOT
-# disabled, but it is recommended to install and use dot, since it yields more
-# powerful graphs.
-# The default value is: YES.
-
-CLASS_DIAGRAMS         = YES
-
-# You can define message sequence charts within doxygen comments using the \msc
-# command. Doxygen will then run the mscgen tool (see:
-# http://www.mcternan.me.uk/mscgen/) to produce the chart and insert it in the
-# documentation. The MSCGEN_PATH tag allows you to specify the directory where
-# the mscgen tool resides. If left empty the tool is assumed to be found in the
-# default search path.
-
-MSCGEN_PATH            =
-
-# You can include diagrams made with dia in doxygen documentation. Doxygen will
-# then run dia to produce the diagram and insert it in the documentation. The
-# DIA_PATH tag allows you to specify the directory where the dia binary resides.
-# If left empty dia is assumed to be found in the default search path.
-
-DIA_PATH               =
-
-# If set to YES the inheritance and collaboration graphs will hide inheritance
-# and usage relations if the target is undocumented or is not a class.
-# The default value is: YES.
-
-HIDE_UNDOC_RELATIONS   = YES
-
-# If you set the HAVE_DOT tag to YES then doxygen will assume the dot tool is
-# available from the path. This tool is part of Graphviz (see:
-# http://www.graphviz.org/), a graph visualization toolkit from AT&T and Lucent
-# Bell Labs. The other options in this section have no effect if this option is
-# set to NO
-# The default value is: NO.
-
-HAVE_DOT               = NO
-
-# The DOT_NUM_THREADS specifies the number of dot invocations doxygen is allowed
-# to run in parallel. When set to 0 doxygen will base this on the number of
-# processors available in the system. You can set it explicitly to a value
-# larger than 0 to get control over the balance between CPU load and processing
-# speed.
-# Minimum value: 0, maximum value: 32, default value: 0.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-DOT_NUM_THREADS        = 0
-
-# When you want a differently looking font in the dot files that doxygen
-# generates you can specify the font name using DOT_FONTNAME. You need to make
-# sure dot is able to find the font, which can be done by putting it in a
-# standard location or by setting the DOTFONTPATH environment variable or by
-# setting DOT_FONTPATH to the directory containing the font.
-# The default value is: Helvetica.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-DOT_FONTNAME           = Helvetica
-
-# The DOT_FONTSIZE tag can be used to set the size (in points) of the font of
-# dot graphs.
-# Minimum value: 4, maximum value: 24, default value: 10.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-DOT_FONTSIZE           = 10
-
-# By default doxygen will tell dot to use the default font as specified with
-# DOT_FONTNAME. If you specify a different font using DOT_FONTNAME you can set
-# the path where dot can find it using this tag.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-DOT_FONTPATH           =
-
-# If the CLASS_GRAPH tag is set to YES then doxygen will generate a graph for
-# each documented class showing the direct and indirect inheritance relations.
-# Setting this tag to YES will force the CLASS_DIAGRAMS tag to NO.
-# The default value is: YES.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-CLASS_GRAPH            = YES
-
-# If the COLLABORATION_GRAPH tag is set to YES then doxygen will generate a
-# graph for each documented class showing the direct and indirect implementation
-# dependencies (inheritance, containment, and class references variables) of the
-# class with other documented classes.
-# The default value is: YES.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-COLLABORATION_GRAPH    = YES
-
-# If the GROUP_GRAPHS tag is set to YES then doxygen will generate a graph for
-# groups, showing the direct groups dependencies.
-# The default value is: YES.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-GROUP_GRAPHS           = YES
-
-# If the UML_LOOK tag is set to YES, doxygen will generate inheritance and
-# collaboration diagrams in a style similar to the OMG's Unified Modeling
-# Language.
-# The default value is: NO.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-UML_LOOK               = NO
-
-# If the UML_LOOK tag is enabled, the fields and methods are shown inside the
-# class node. If there are many fields or methods and many nodes the graph may
-# become too big to be useful. The UML_LIMIT_NUM_FIELDS threshold limits the
-# number of items for each type to make the size more manageable. Set this to 0
-# for no limit. Note that the threshold may be exceeded by 50% before the limit
-# is enforced. So when you set the threshold to 10, up to 15 fields may appear,
-# but if the number exceeds 15, the total amount of fields shown is limited to
-# 10.
-# Minimum value: 0, maximum value: 100, default value: 10.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-UML_LIMIT_NUM_FIELDS   = 10
-
-# If the TEMPLATE_RELATIONS tag is set to YES then the inheritance and
-# collaboration graphs will show the relations between templates and their
-# instances.
-# The default value is: NO.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-TEMPLATE_RELATIONS     = NO
-
-# If the INCLUDE_GRAPH, ENABLE_PREPROCESSING and SEARCH_INCLUDES tags are set to
-# YES then doxygen will generate a graph for each documented file showing the
-# direct and indirect include dependencies of the file with other documented
-# files.
-# The default value is: YES.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-INCLUDE_GRAPH          = YES
-
-# If the INCLUDED_BY_GRAPH, ENABLE_PREPROCESSING and SEARCH_INCLUDES tags are
-# set to YES then doxygen will generate a graph for each documented file showing
-# the direct and indirect include dependencies of the file with other documented
-# files.
-# The default value is: YES.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-INCLUDED_BY_GRAPH      = YES
-
-# If the CALL_GRAPH tag is set to YES then doxygen will generate a call
-# dependency graph for every global function or class method.
-#
-# Note that enabling this option will significantly increase the time of a run.
-# So in most cases it will be better to enable call graphs for selected
-# functions only using the \callgraph command. Disabling a call graph can be
-# accomplished by means of the command \hidecallgraph.
-# The default value is: NO.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-CALL_GRAPH             = NO
-
-# If the CALLER_GRAPH tag is set to YES then doxygen will generate a caller
-# dependency graph for every global function or class method.
-#
-# Note that enabling this option will significantly increase the time of a run.
-# So in most cases it will be better to enable caller graphs for selected
-# functions only using the \callergraph command. Disabling a caller graph can be
-# accomplished by means of the command \hidecallergraph.
-# The default value is: NO.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-CALLER_GRAPH           = NO
-
-# If the GRAPHICAL_HIERARCHY tag is set to YES then doxygen will generate a
-# graphical hierarchy of all classes instead of a textual one.
-# The default value is: YES.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-GRAPHICAL_HIERARCHY    = YES
-
-# If the DIRECTORY_GRAPH tag is set to YES then doxygen will show the
-# dependencies a directory has on other directories in a graphical way. The
-# dependency relations are determined by the #include relations between the
-# files in the directories.
-# The default value is: YES.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-DIRECTORY_GRAPH        = YES
-
-# The DOT_IMAGE_FORMAT tag can be used to set the image format of the images
-# generated by dot. For an explanation of the image formats see the section
-# output formats in the documentation of the dot tool (Graphviz (see:
-# http://www.graphviz.org/)).
-# Note: If you choose svg you need to set HTML_FILE_EXTENSION to xhtml in order
-# to make the SVG files visible in IE 9+ (other browsers do not have this
-# requirement).
-# Possible values are: png, jpg, gif, svg, png:gd, png:gd:gd, png:cairo,
-# png:cairo:gd, png:cairo:cairo, png:cairo:gdiplus, png:gdiplus and
-# png:gdiplus:gdiplus.
-# The default value is: png.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-DOT_IMAGE_FORMAT       = png
-
-# If DOT_IMAGE_FORMAT is set to svg, then this option can be set to YES to
-# enable generation of interactive SVG images that allow zooming and panning.
-#
-# Note that this requires a modern browser other than Internet Explorer. Tested
-# and working are Firefox, Chrome, Safari, and Opera.
-# Note: For IE 9+ you need to set HTML_FILE_EXTENSION to xhtml in order to make
-# the SVG files visible. Older versions of IE do not have SVG support.
-# The default value is: NO.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-INTERACTIVE_SVG        = NO
-
-# The DOT_PATH tag can be used to specify the path where the dot tool can be
-# found. If left blank, it is assumed the dot tool can be found in the path.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-DOT_PATH               =
-
-# The DOTFILE_DIRS tag can be used to specify one or more directories that
-# contain dot files that are included in the documentation (see the \dotfile
-# command).
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-DOTFILE_DIRS           =
-
-# The MSCFILE_DIRS tag can be used to specify one or more directories that
-# contain msc files that are included in the documentation (see the \mscfile
-# command).
-
-MSCFILE_DIRS           =
-
-# The DIAFILE_DIRS tag can be used to specify one or more directories that
-# contain dia files that are included in the documentation (see the \diafile
-# command).
-
-DIAFILE_DIRS           =
-
-# When using plantuml, the PLANTUML_JAR_PATH tag should be used to specify the
-# path where java can find the plantuml.jar file. If left blank, it is assumed
-# PlantUML is not used or called during a preprocessing step. Doxygen will
-# generate a warning when it encounters a \startuml command in this case and
-# will not generate output for the diagram.
-
-PLANTUML_JAR_PATH      =
-
-# When using plantuml, the PLANTUML_CFG_FILE tag can be used to specify a
-# configuration file for plantuml.
-
-PLANTUML_CFG_FILE      =
-
-# When using plantuml, the specified paths are searched for files specified by
-# the !include statement in a plantuml block.
-
-PLANTUML_INCLUDE_PATH  =
-
-# The DOT_GRAPH_MAX_NODES tag can be used to set the maximum number of nodes
-# that will be shown in the graph. If the number of nodes in a graph becomes
-# larger than this value, doxygen will truncate the graph, which is visualized
-# by representing a node as a red box. Note that if the number of direct
-# children of the root node in a graph is already larger than
-# DOT_GRAPH_MAX_NODES then the graph will not be shown at all. Also note that
-# the size of a graph can be further restricted by MAX_DOT_GRAPH_DEPTH.
-# Minimum value: 0, maximum value: 10000, default value: 50.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-DOT_GRAPH_MAX_NODES    = 50
-
-# The MAX_DOT_GRAPH_DEPTH tag can be used to set the maximum depth of the graphs
-# generated by dot. A depth value of 3 means that only nodes reachable from the
-# root by following a path via at most 3 edges will be shown. Nodes that lay
-# further from the root node will be omitted. Note that setting this option to 1
-# or 2 may greatly reduce the computation time needed for large code bases. Also
-# note that the size of a graph can be further restricted by
-# DOT_GRAPH_MAX_NODES. Using a depth of 0 means no depth restriction.
-# Minimum value: 0, maximum value: 1000, default value: 0.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-MAX_DOT_GRAPH_DEPTH    = 0
-
-# Set the DOT_TRANSPARENT tag to YES to generate images with a transparent
-# background. This is disabled by default, because dot on Windows does not seem
-# to support this out of the box.
-#
-# Warning: Depending on the platform used, enabling this option may lead to
-# badly anti-aliased labels on the edges of a graph (i.e. they become hard to
-# read).
-# The default value is: NO.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-DOT_TRANSPARENT        = NO
-
-# Set the DOT_MULTI_TARGETS tag to YES to allow dot to generate multiple output
-# files in one run (i.e. multiple -o and -T options on the command line). This
-# makes dot run faster, but since only newer versions of dot (>1.8.10) support
-# this, this feature is disabled by default.
-# The default value is: NO.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-DOT_MULTI_TARGETS      = NO
-
-# If the GENERATE_LEGEND tag is set to YES doxygen will generate a legend page
-# explaining the meaning of the various boxes and arrows in the dot generated
-# graphs.
-# The default value is: YES.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-GENERATE_LEGEND        = YES
-
-# If the DOT_CLEANUP tag is set to YES, doxygen will remove the intermediate dot
-# files that are used to generate the various graphs.
-# The default value is: YES.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-DOT_CLEANUP            = YES
diff --git a/thirdParty/alpaka/doc/markdown/Index.md b/thirdParty/alpaka/doc/markdown/Index.md
deleted file mode 100644
index 45aecef238..0000000000
--- a/thirdParty/alpaka/doc/markdown/Index.md
+++ /dev/null
@@ -1,20 +0,0 @@
-* User Documentation
-  * 1. [Introduction](user/Introduction.md)
-  * 2. [Abstraction](user/Abstraction.md)
-    * 1. [Thread](user/abstraction/Thread.md)
-    * 2. [Block](user/abstraction/Block.md)
-    * 3. [Warp](user/abstraction/Warp.md)
-    * 4. [Element](user/abstraction/Element.md)
-  * 3. [Implementation](user/Implementation.md)
-    * 1. [Library Interface](user/implementation/Library.md)
-      * 1. [Structure](user/implementation/library/Structure.md)
-      * 2. [Usage](user/implementation/library/Usage.md)
-      * 3. [Rationale](user/implementation/library/Rationale.md)
-      * 4. [Details](user/implementation/library/Details.md)
-    * 2. [Mapping onto Specific Hardware Architectures](user/implementation/Mapping.md)
-      * 1. [CUDA GPUs](user/implementation/mapping/CUDA.md)
-      * 2. [x86 CPUs](user/implementation/mapping/x86.md)
-      * 3. [Accelerators](user/implementation/mapping/Accelerators.md)
-* Developer Documentation
-  * 1. [Code Formatting](dev/style.md)
-  * 2. [Publishing Doxygen Documentation on gh-pages](dev/gh-pages.md)
diff --git a/thirdParty/alpaka/doc/markdown/dev/gh-pages.md b/thirdParty/alpaka/doc/markdown/dev/gh-pages.md
deleted file mode 100644
index e3a8159546..0000000000
--- a/thirdParty/alpaka/doc/markdown/dev/gh-pages.md
+++ /dev/null
@@ -1,38 +0,0 @@
-[:arrow_up: Up](../Index.md)
-
-Publishing doxygen documentation on gh-pages
-============================================
-
-To deploy the doxygen documentation a copy of the repository is created inside the deployed folder.
-This copy is always in the gh-pages branch consisting only of the containing files.
-This folder is ignored in all other branches.
-
-Creation of gh-pages
---------------------
-
-*NOTE:* This has already been done once and does not have to be repeated!
-
-On working branch:
-- Add deploy directory to `.gitignore` (if not already done)
-- Create the `gh-pages` branch: `git checkout --orphan gh-pages`
-- Clean the branch: `git rm -rf .`
-- Commit and push the branch: `git add --all`, `git commit -m"add gh-pages branch"`, `git push`
-
-Setup
------
-
-*NOTE:* This has to be done once per cloned alpaka repository that is used to deploy the doxygen documentation!
-
-On working branch:
-- Clone the repo on the gh-pages branch inside the deploy folder: `git clone -b gh-pages git@github.com:ComputationalRadiationPhysics/alpaka.git doc/doxygen/html`
-
-Update
-------
-
-From within `develop`/`master`: 
-- Execute doxygen
-- `cd doc/doxygen/html`
-- `git add .`
-- `git commit -m "updated doxygen documentation"`
-- `git push`
-- `cd ../../../`
diff --git a/thirdParty/alpaka/doc/markdown/dev/style.md b/thirdParty/alpaka/doc/markdown/dev/style.md
deleted file mode 100644
index 70841cf6d1..0000000000
--- a/thirdParty/alpaka/doc/markdown/dev/style.md
+++ /dev/null
@@ -1,146 +0,0 @@
-[:arrow_up: Up](../Index.md)
-
-Style
-=====
-
-Naming
-------
-
-* Types are always in PascalCase (KernlExecCuda, BufT, ...) and singular.
-* Variables are always in camelCase (memBufHost, ...) and plural for collections and singular else.
-* Namespaces are always in lowercase and singular is preferred.
-* There are no two consecutive upper case letters (AccOpenMp, HtmlRenderer, IoHandler, ...). This makes names more easily readable.
-
-
-Types
------
-
-* Always use integral types with known width (`int32_t`, `uint64_t`, ...).
-Never use `int`, `unsigned long`, etc.
-
-
-Type Qualifiers
----------------------
-
-The order of  type qualifiers should be:
-```Type const * const``` for a const pointer to a const Type.
-```Type const &``` for a reference to a const Type.
-
-The reason is that types can be read from right to left correctly without jumping back and forth.
-```const Type * const``` and ```const Type &``` would require jumping in either way to read them correctly.
-
-
-Variables
----------
-
-* Variables should always be initialized on construction because uninitialized variables can produce hard-to-debug errors.
-This can (nearly) always be done even in performance critical code without sacrificing speed by using a functional programming style.
-* Variables should (nearly) always be `const` to make the code easier to understand.
-This is equivalent to functional programming and the SSA (static single assignment) style used by LLVM.
-This should have no speed implication as every half baked compiler analyses the usage of variables and reuses registers.  
-* Variable definitions should be differentiated from assignments by using either `(...)` or `{...}` but never `=` for definitions. 
-Use `uint32_t const iUsageOfThisVariable(42);` instead of `uint32_t const iUsageOfThisVariable = 42;`
-
-
-Comments
---------
-
-* Always use C++-Style comments `//`
-* For types use `//#############################################################################` to start the comment block.
-* For functions use `//-----------------------------------------------------------------------------` to start the comment block.
-* Never write comments for closing braces (namespaces, classes, etc ...)
-
-
-Braces
-------
-
-* Braces (opening and closing) for classes, structs, functions, namespaces, etc. appear on a new line. Exception: If the function or class body is empty, the opening and closing braces are on the same (next) line.
-* Only braces for variable initialization can appear in-line.
-
-
-Indentation
------------
-
-* Always indent everything by *one level* (namespace body, class members, function body, ...)
-* Do not use more indentation e.g. to align function parameters.
-
-
-Spaces
-------
-
-* Trailing white-spaces are forbidden.
-* There is no space between keywords (if, for, ...) and the opening parenthesis.
-* There is no space after the opening `(` or `<` and before the closing `)` `>`.
-* There is a space before and after binary operators (=, *, +, ...)
-* There is no space after the unary operators !, ~, ...
-
-
-Functions
----------
-
-* Always use the trailing return type syntax with the return type on a new line even if the return type is void: 
-```C++
-auto func() 
--> bool
-```
-  * This makes it easier to see the return type because it is on its own line.
-  * This leads to a consistent style for constructs where there is no alternative style (lambdas, functions templates with dependent return types) and standard functions.
-
-* Each function parameter is on a new indented line:
-```C++
-auto func(
-    float f1,
-    float f2) 
--> bool
-{
-    return true;
-}
-```
-```C++
-func(
-    1.0f,
-    2.0f);
-```
-  * Makes it easier to see how many parameters there are and which position they have. 
-
-
-Templates
----------  
-
-* Template parameters are prefixed with `T` to differentiate them from class or function local typedefs.
-
-* Each template parameter is on a new indented line:
-```C++
-template<
-    typename TParam,
-    typename... TArgs>
-auto func() 
--> bool
-```
-  * Makes it easier to see how many template parameters there are and which position they have. 
-
-* Always use ```typename``` for template parameters. There is NO difference to class and typename matches the intent better.
-
-
-Traits
-------
-
-* Trait classes always have one more template parameter (with default parameter) than is required for enabling SFINAE in the specialization:
-```C++
-template<
-    typename T, 
-    typename TSfinae = void>
-struct GetOffsets;
-```
-
-* Template trait aliases always end with a `T` e.g. `BufT` while the corresponding trait ends with `Type` e.g. `BufType`
-
-* Traits for implementations always have the same name as the accessor function but in PascalCase while the member function is camelCase again: `sin(){...}` and `Sin{sin(){...}};`
-
-Includes
---------
-
-* The order of includes is from the most specialized header to the most general one.
-This order helps to find missing includes in more specialized headers because the general ones are always included afterwards.
-
-* A comment with the types or functions included by an include file makes it easier to find out why a special header is included.
diff --git a/thirdParty/alpaka/doc/markdown/user/Abstraction.md b/thirdParty/alpaka/doc/markdown/user/Abstraction.md
deleted file mode 100644
index 3ba3199449..0000000000
--- a/thirdParty/alpaka/doc/markdown/user/Abstraction.md
+++ /dev/null
@@ -1,131 +0,0 @@
-[:arrow_up: Up](../Index.md)
-
-Abstraction
-===========
-
-<!---
-Objective of the abstraction is to separate the parallelization strategy from the algorithm itself.
-Algorithm code written by users should not depend on any parallelization library or specific strategy.
-This would allow to exchange the parallelization back-end without any changes to the algorithm itself.
-Besides allowing to test different parallelization strategies this also makes it possible to port algorithms to new, yet unsupported, platforms.
--->
-
-Parallelism and memory hierarchies at all levels need to be exploited in order to achieve performance portability across various types of accelerators.
-Within this chapter an abstraction will be derived that tries to provide a maximum of parallelism while simultaneously considering implementability and applicability in hardware.
-
-Looking at the current HPC hardware landscape, we often see nodes with multiple sockets/processors extended by accelerators like GPUs or Intel Xeon Phi, each with their own processing units.
-Within a CPU or an Intel Xeon Phi there are cores with hyper-threads, vector units and a large caching infrastructure.
-Within a GPU there are many small cores and only few caches.
-Each entity in the hierarchy has access to different memories.
-For example, each socket / processor manages its RAM, while the cores additionally have non-explicit access to L3, L2 and L1 caches.
-On a GPU there are global, constant, shared and other memory types which all can be accessed explicitly.
-The interface has to abstract from these differences without sacrificing speed on any platform.
-
-A process running on a multi-socket node is the largest entity within *alpaka*.
-The abstraction is only about the task and data parallel execution on the process/node level and down.
-It does not provide any primitives for inter-node communication.
-However, such libraries can be combined with *alpaka*.
-
-An application process always has a main thread and is by definition running on the host.
-It can access the host memory and various accelerator devices.
-Such accelerators can be GPUs, Intel Xeon Phis, the host itself or other devices.
-Thus, the host does not necessarily have to be different from the accelerator device used for the computations.
-For instance, an Intel Xeon Phi simultaneously can be the host and the accelerator device.
-
-The *alpaka* library can be used to offload the parallel execution of task and data parallel work simultaneously onto different accelerator devices.
-
-Task Parallelism
-----------------
-
-One of the basic building blocks of modern applications is task parallelism.
-For example, the operating system scheduler, deciding which thread of which process gets how much processing time on which CPU core, enables task parallelism of applications.
-It controls the execution of different tasks on different processing units.
-Such task parallelism can be, for instance, the output of the progress in parallel to a download.
-This can be implemented via two threads executing two different tasks.
-
-The valid dependencies between tasks within an application can be defined as a DAG (directed acyclic graph) in all cases.
-The tasks are represented by nodes and the dependencies by edges.
-In this model, a task is ready to be executed if the number of incoming edges is zero.
-After a task has finished its work, it is removed from the graph together with all of its outgoing edges.
-This reduces the number of incoming edges of subsequent tasks.
-
-The problem with this model is the inherent overhead and the missing hardware and API support.
-When it is directly implemented as a graph, at least all depending tasks have to be updated and checked if they are ready to be executed after a task finished.
-Depending on the size of the graph and the number of edges this can be a huge overhead.
-
-*OpenCL* allows to define a task graph in a somewhat different way.
-Tasks can be enqueued into an out-of-order command queue combined with events that have to be finished before the newly enqueued task can be started.
-Tasks in the command queue with unmet dependencies are skipped and subsequent ones are executed.
-The `CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE` property of a command queue is an optional feature only supported by few vendors.
-Therefore, it can not be assumed to be available on all systems.
-
-*CUDA* on the other hand does currently (version 7.5) not support such out-of-order queues in any way.
-The user has to define dependencies explicitly through the order the tasks are enqueued into the queues (called streams in *CUDA*).
-Within a queue, tasks are always executed in sequential order, while multiple queues are executed in parallel.
-Queues can wait for events enqueued into other queues.
-
-In both APIs, *OpenCL* and *CUDA*, a task graph can be emulated by creating one queue per task and enqueuing a unique event after each task, which can be used to wait for the preceding task.
-However, this is not feasible due to the large queue and event creation costs as well as other overheads within this process.
-
-Therefore, to be compatible with a wide range of APIs, the interface for task parallelism has to be constrained.
-Instead of a general DAG, multiple queues of sequentially executed tasks will be used to describe task parallelism.
-Events that can be enqueued into the queues enhance the basic task parallelism by enabling synchronization between different queues, devices or the host threads.
-
-Data Parallelism
-----------------
-
-In contrast to task parallelism, data parallelism describes the execution of one and the same task on multiple, often related data elements.
-For example, an image color space conversion is a textbook example of a data parallel task.
-The same operation is executed independently on each pixel.
-Other data parallel algorithms additionally introduce dependencies between threads in the input-, intermediate-, or output-data.
-For example, the calculation of a brightness histogram has no input-data dependencies.
-However, all pixel brightness values finally have to be merged into a single result.
-Even these two simple examples show that it is necessary to think about the interaction of parallel entities to minimize the influence of data dependencies.
-
-Furthermore, it is necessary to respect the principles of spatial and temporal locality.
-Current hardware is built around these locality principles to reduce latency by using hierarchical memory as a trade-off between speed and hardware size.
-Multiple levels of caches, from small and very fast ones to very large and slower ones exploit temporal locality by keeping recently referenced data as close to the actual processing units as possible.
-Spatial locality in the main memory is also important for caches because they are usually divided into multiple lines that can only be exchanged one cache line at a time.
-If one data element is loaded and cached, it is highly likely that nearby elements are also cached.
-If the pixels of an image are stored row wise but are read out column wise, the spatial locality assumption of many CPUs is violated and the performance suffers.
-GPUs on the other hand do not have a large caching hierarchy but allow explicit access to a fast memory shared across multiple cores.
-Therefore, the best way to process individual data elements of a data parallel task is dependent on the data structure as well as the underlying hardware.
-
-The main part of the *alpaka* abstraction is the way it abstracts data parallelism and allows the algorithm writer to take into account the hierarchy of processing units, their data parallel features and corresponding memory regions.
-The abstraction developed is influenced by and based on the groundbreaking *CUDA* and *OpenCL* abstractions of a multidimensional grid of threads with additional hierarchy levels in between.
-Another level of parallelism is added to those abstractions to unify the data parallel capabilities of modern hardware architectures.
-The explicit access to all hierarchy levels enables the user to write code that runs performant on all current platforms.
-However, the abstraction does not try to automatically optimize memory accesses or data structures but gives the user full freedom to use data structures matching the underlying hardware preferences.
-
-The individual levels are explained on the following pages:
-
-1. [Thread](abstraction/Thread.md)
-2. [Block](abstraction/Block.md)
-3. [Warp](abstraction/Warp.md)
-4. [Element](abstraction/Element.md)
-
-Summary
--------
-
-This abstraction is called *Redundant Hierarchical Parallelism*.
-This term is inspired by the paper *The Future of Accelerator Programming: Abstraction, Performance or Can We Have Both?* [PDF](http://olab.is.s.u-tokyo.ac.jp/~kamil.rocki/rocki_burtscher_sac14.pdf) [DOI](http://dx.doi.org/10.1109/ICPADS.2013.76).
-It investigates a similar *concept of copious parallel programming* reaching 80%-90% of the native performance while comparing CPU and GPU centric versions of an *OpenCL* n-body simulation with a general version utilizing parallelism on multiple hierarchy levels.
-
-The *CUDA* and *OpenCL* abstractions themselves are very similar to the one designed in the previous sections and consist of all but the Element level.
-However, as has been shown, all five abstraction hierarchy levels are necessary to fully utilize current architectures.
-By emulating unsupported or ignoring redundant levels of parallelism, algorithms written with this abstraction can always be mapped optimally to all supported accelerators. The following table summarizes the characteristics of the proposed hierarchy levels.
-
-| Hierarchy Level | Parallelism | Synchronizable |
-| --- | --- | --- |
-| grid | sequential / parallel | :x: / :white_check_mark: |
-| block | parallel | :x: |
-| warp | parallel | :white_check_mark: |
-| thread | parallel / lock-step| :white_check_mark: |
-| element | sequential | :x: |
-
-Depending on the queue a task is enqueued into, grids will either run in sequential order within the same queue or in parallel in different queues.
-They can be synchronized by using events.
-Blocks can not be synchronized and therefore can use the whole spectrum of parallelism ranging from fully parallel up to fully sequential execution depending on the device.
-Warps combine the execution of multiple threads in lock-step and can be synchronized implicitly by synchronizing the threads they contain.
-Threads within a block are executed in parallel warps and each thread computes a number of data elements sequentially.
-
diff --git a/thirdParty/alpaka/doc/markdown/user/Implementation.md b/thirdParty/alpaka/doc/markdown/user/Implementation.md
deleted file mode 100644
index 6dd7824082..0000000000
--- a/thirdParty/alpaka/doc/markdown/user/Implementation.md
+++ /dev/null
@@ -1,10 +0,0 @@
-[:arrow_up: Up](../Index.md)
-
-Implementation
-==============
-
-The implementation of the library in C++, especially the way C++11 allows defining the abstract concepts and taking advantage of zero-overhead compile-time polymorphism, is explained in this section.
-Furthermore, it is described how the abstraction can be mapped to real devices.
-
-1. [Library Interface](implementation/Library.md)
-2. [Mapping onto Specific Hardware Architectures](implementation/Mapping.md)
diff --git a/thirdParty/alpaka/doc/markdown/user/Introduction.md b/thirdParty/alpaka/doc/markdown/user/Introduction.md
deleted file mode 100644
index 87078731ad..0000000000
--- a/thirdParty/alpaka/doc/markdown/user/Introduction.md
+++ /dev/null
@@ -1,326 +0,0 @@
-[:arrow_up: Up](../Index.md)
-
-Introduction
-============
-
-The *alpaka* library defines and implements an abstract interface for the *hierarchical redundant parallelism* model.
-This model exploits task- and data-parallelism as well as memory hierarchies at all levels of current multi-core architectures.
-This allows to achieve portability of performant codes across various types of accelerators by ignoring specific unsupported levels and utilizing only the ones supported on a specific accelerator.
-All hardware types (multi- and many-core CPUs, GPUs and other accelerators) are treated and can be programmed in the same way.
-The *alpaka* library provides back-ends for *CUDA*, *OpenMP*, *Boost.Fiber* and other methods.
-The policy-based C++ template interface provided allows for straightforward user-defined extension of the library to support other accelerators.
-
-The library name *alpaka* is an acronym standing for **A**bstraction **L**ibrary for **Pa**rallel **K**ernel **A**cceleration.
-
-
-Motivation
-----------
-
-What scales well on current hardware does not necessarily scale well on future architectures.
-The hardware landscape is always changing.
-In the past the big clusters have been CPU only.
-Today we see a change to accelerator supported computing.
-For example, GPUs, Intel Xeon Phis or other special purpose extension cards are extensively used.
-It is unpredictable what the next big step will be and what the Exaflop hardware will look like.
-It is not clear that GPUs will always be the best platform.
-Nevertheless, the underlying physical algorithms as well as the need for heterogeneous architectures will not change.
-
-Current highly parallel GPUs are optimized for throughput and hide latency and data dependencies by always keeping a ready pool of work.
-This allows to sustain the performance at a high percent of peak.
-CPUs in turn are designed to optimize the execution time of a single thread.
-Features like branch prediction, speculative execution, register renaming and many more *[...] would cost far too much energy to be replicated for thousands of parallel GPU threads but [...] are entirely appropriate for CPUs.* ([State-of-the-art in Heterogeneous Computing](http://dx.doi.org/10.1155/2010/540159))
-Even more specialized architectures will appear and find their way into HPC.
-
-*The essence of the heterogeneous computing model is that one size does not fit all. Parallel and serial segments of the workload execute on the best-suited processor delivering faster overall performance, greater efficiency, and lower energy and cost per unit of computation.* ([State-of-the-art in Heterogeneous Computing](http://dx.doi.org/10.1155/2010/540159))
-
-New hardware will not only allow to execute faster or calculate more but will furthermore enable the usage of new algorithms for more precise simulations.
-For example, some tasks may require random searches for only a few values in a lookup table of up to hundreds of gigabytes.
-This would perfectly fit a CPU, while the rest of the simulation would still be running on the GPUs.
-With new hardware bringing those two worlds closer together, exploiting the heterogeneous hardware with heterogeneous algorithms will likely be the way to go in the future.
-Being able to express both of those parallel tasks in the same way would greatly enhance the productivity of the programmer and the clarity of the code.
-
-Porting a complicated simulation code from *CUDA* to x86 and possibly to other hardware architectures is a non-trivial task.
-A lot of developer time could be saved if this task would not have to be done repeatedly for every new hardware, but rather only once.
-Therefore, *alpaka* tries to solve the problems in porting highly scalable simulation codes on various multi-core architectures.
-
-
-Problems in Porting Performant HPC Codes
-----------
-
-Porting a highly performant code to a new architecture is a non-trivial task that poses many problems.
-Often it is a requirement to keep the simulation operative on the previous platform as well.
-This means that multiple hardware platforms have to be supported simultaneously.
-A great number of projects take the route that seems easiest at first and simply duplicate all the parallel algorithms and port them to the new back-end.
-All the specific API functions that have been used have to be supplemented by their new counterparts, possibly guarded by preprocessor macros to switch between the old and the new version.
-A switch of the back-end used in a simulation, for example, from *OpenMP* to *CUDA* often requires a near rewrite.
-Each newly supported platform would have to duplicate the API specific kernel and invocation code lines.
-
-The following paragraphs will summarize problems that arise when performant HPC codes have to be ported:
-
-### Sustainability
-Because the underlying HPC hardware is constantly changing, every new generation will require an adaptation of the simulation.
-Even to deliver the performance reached on previous architectures is a tough task for programmers.
-Furthermore, nobody can guarantee the lifespan of the parallelization technique used.
-*OpenMP*, *CUDA*, *OpenACC* and all the other possibilities could be discontinued or get deprecated for any reason at any time.
-Therefore, an abstract interface is required that hides the particular back-end and allows to port the interface implementation and not the application using the interface itself.
-
-### Heterogeneity
-Some parts of a simulation perfectly map to current GPUs while other parts are better computed on CPUs or other accelerators.
-Furthermore, by letting one part of the heterogeneous cluster hardware idle, a lot of computing power is wasted.
-It is essential, especially for future architectures, that those resources are utilized to reach the peak performance of the systems.
-This heterogeneous work division not only depends on the architecture but also on the number of available hardware resources, the workload and many other factors.
-Therefore, to reach good scaling across a multitude of systems, it is necessary to be able to dynamically decide where to execute which part of the simulation either at make-time, compile-time or at run-time.
-Currently this requires to duplicate the kernels and write specific implementations per back-end.
-Many projects only allow to switch the back-end of the whole simulation at once or possibly even per kernel at make-time.
-This will not be enough on future architectures where the ability to mix the back-ends is required to optimally utilize different cluster architectures or to dynamically load balance tasks across a diverse set of (possibly failing) accelerator devices.
-Therefore, an abstract interface unifying the abilities of all the back-ends is required to let the application express parallelism of the different back-ends in a unified algorithm that can then be mapped to the device currently in use.
-
-### Maintainability
-Looking at the software engineering aspects, duplication is a bad solution because this leads to maintainability issues.
-In many projects such copies result in a large growth in the number of lines of code while only minimal new functionality is implemented.
-Most of the new code only executes things that have already been implemented for the initial platform.
-Developers having to change one of the algorithms additionally have to change all duplicates for all other back-ends.
-Depending on the similarity of the implementations, this can result in a doubling / multiplication of developer efforts in the worst-case scenario.
-Especially for open-source projects that rely on contributions from the community this raises the hurdle for new developers because they have to know not only one, but multiple different parallelization libraries.
-In the end good maintainability is what keeps a software project alive and what ensures a steady development progress.
-Therefore, an interface hiding the differences between all the back-ends is required to let the application express parallelism in a unified algorithm.
-
-### Testability
-Code duplication, being the easiest way to port a simulation, complicates testing.
-Each new kernel has to be tested separately because different bugs could have been introduced into the distinct implementations.
-If the versions can be mixed, it is even harder because all combinations have to be tested.
-Often the tests (continuous integration tests, unit tests, etc.) have to run on a special testing hardware or on the production systems due to the reliance on the availability of special accelerators.
-For example, *CUDA* compile tests are possible without appropriate hardware but it is not feasible to execute even simple runtime tests due to the missing CPU emulation support.
-An interface allowing to switch between acceleration back-ends, which are tested for compatibility among each other, enables easy testing on development and test systems.
-
-### Optimizability
-Even if the simulation code has encapsulated the APIs used, the optimal way to write performant algorithms often differs between distinct parallelization frameworks.
-It is necessary to allow the user to fine-tune the algorithm to run optimally on each different accelerator device by compile time specialization or policy based abstractions without the need to duplicate the kernel.
-Within the kernel there has to be knowledge about the underlying platform to adaptively use data structures that map optimally onto the current architecture.
-To ease this optimization work, libraries with data structures, communication patterns and other things hiding the differences between back-ends have to be implemented.
-This would allow to optimize the interface implementation and not the simulation itself.
-
-In summary, it can be stated that all the portability problems of current HPC codes could be solved by introducing an abstract interface that hides the particular back-end implementations and unifies the way to access the parallelism available on modern many-core architectures.
-
-
-Similar Projects
-----------------
-
-There are multiple other libraries targeting the (portable) parallel task execution within nodes.
-Some of them require language extensions, others claim to achieve full performance portability across a multitude of devices.
-But none of these libraries can provide full control over the (possibly diverse) underlying hardware while being only minimally invasive.
-There is always a productivity-performance trade-off.
-
-Furthermore, many of the libraries do not satisfy the requirement for full single-source C++ support.
-This is essential because many simulation codes heavily rely on template meta-programming for method specialization and compile time optimizations.
-
-
-### CUDA - Compute Unified Device Architecture
-
-*CUDA* is a parallel computing platform and programming model developed by *NVIDIA*.
-It is used in science and research as well as in consumer software to compute highly parallel workloads on GPUs starting from image and video editing up to simulations on high-performance computers.
-Such usage of graphics processing units not only for computer graphics, but also for tasks that have traditionally been handled by the CPU is called GPGPU (general-purpose computing on graphics processing units).
-A disadvantage of *CUDA* is that its application is bound to the usage of *NVIDIA* GPUs.
-Currently no other vendors provide accelerators that support *CUDA*.
-Additionally there is no supported free emulator allowing to execute *CUDA* code on CPUs.
-
-The *CUDA* API is a higher level part of the programming model which allows to access and execute code on GPUs from multiple host languages including C++.
-The *CUDA* C/C++ language on the other hand is a mid level construct based on standard C++ with some extensions for accelerator programming and limitations in the supported constructs.
-For example, throwing and catching exceptions as well as run-time type information (RTTI) are not supported.
-*CUDA* C/C++ is compiled to a low level virtual instruction set called PTX (Parallel Thread Execution).
-The PTX code is later compiled to assembler code by the GPU driver.
-
-*NVIDIA* provides an extended C++ compiler based on the LLVM clang compiler called nvcc that allows to mix host C++ code using the *CUDA* API with *CUDA* C/C++.
-The host part of the C++ code is compiled by the respective host system compiler (gcc, icc, clang, MSVC) while the GPU device code is separately compiled to PTX.
-After the compilation steps both binaries are linked together to form the final assembly.
-
-*CUDA* defines a heterogeneous programming model where tasks are offloaded from the host CPU to the device GPU.
-Functions that should be offloaded to the GPU are called kernels.
-As can be seen in the figure below a grid of such kernels is executed in parallel by multiple threads organized in blocks.
-Threads within a block can synchronize, while blocks are executed independently and possibly in sequential order depending on the underlying hardware.
-![grid-of-thread-blocks](https://docs.nvidia.com/cuda/cuda-c-programming-guide/graphics/grid-of-thread-blocks.png)
-
-The global device memory is the slowest but largest memory accessible by all threads.
-It can be accessed from host code via methods provided by the *CUDA* API.
-Global memory is persistent across kernel invocations.
-Threads within a block can communicate through a fast but small shared memory.
-Each thread has a set of very low latency registers similar to CPU threads.
-Additionally there are special purpose memory sections for constant and texture data.
-
-The *CUDA* C/C++ language gives full control over memory, caches and the execution of kernels.
-
-
-### [PGI CUDA-X86](https://www.pgroup.com/resources/cuda-x86.htm)
-is a compiler technology that allows to generate x86-64 binary code from *CUDA* C/C++ applications using the *CUDA Runtime API* but does not support the *CUDA Driver API*.
-At run-time *CUDA* C programs compiled for x86 execute each *CUDA* thread block using a single host core, eliminating synchronization where possible.
-Multiple kernel threads are combined to be executed together via the CPU's SIMD (Single Instruction Multiple Data) capabilities for vectorized execution.
-The *PGI Unified Binary technology* allows to create a single binary that uses *NVIDIA* GPUs when available, or runs on multi-core CPUs otherwise.
-The compiler is not always up-to-date with the latest *CUDA* versions and is not available for free.
-Furthermore, the compiler seems not to be developed actively since *NVIDIA* acquired *PGI* in 2013.
-Since 2012 no news has been published and nothing could be found in the yearly release notes of the *PGI* compiler suite.
-
-
-### [GPU Ocelot](http://gpuocelot.gatech.edu/)
-<!--- https://github.com/gtcasl/gpuocelot --->
-is an open-source dynamic JIT compilation framework.
-It allows to execute native *CUDA* binaries by dynamically translating the *NVIDIA PTX* virtual instruction set architecture to other instruction sets.
-It supports *NVIDIA* and *AMD* GPUs as well as multicore CPUs via a PTX to LLVM (Low Level Virtual Machine) translator.
-The project is not in active development anymore.
-It only supports PTX up to version 3.1 (current version is 5.0).
-
-
-### [OpenMP](http://openmp.org//)
-is an open specification for vendor agnostic shared memory parallelization.
-By adding annotations (pragmas in C/C++) to loops or regions, it allows to easily parallelize existing sequential C/C++/Fortran code in an incremental manner.
-Due to the nature of pragmas, these hints are ignored if the compiler does not support them or thinks they are inappropriate.
-This allows those programs to be compiled as sequential or parallel versions by only changing a compiler flag.
-In C/C++ the syntax for *OpenMP* directives is `#pragma omp` followed by multiple clauses.
-For example, with the directive `#pragma omp parallel for`, the compiler will automatically distribute the iterations of the directly following loop across the available cores.
-*OpenMP* 4.0 introduced support for offloading computations to accelerator devices, substantially improved the task support and extended the SIMD capabilities.
-By embedding code within a `#pragma omp target` block, the contained code will be executed on the selected device.
-*OpenMP* 4.0 is missing the ability for unstructured data movement and only implements structured data movement from and to devices.
-The compiler directive `#pragma omp target data map(...) ...` at the beginning of a code block will define which data is copied to, copied back from and is created on the device.
-At the end of the code block the memory is copied back or gets deleted.
-There is no way to allocate device memory that is persistent between kernel calls in different methods because it is not possible to create a device data region spanning both functions in the general case.
-*OpenMP* 4.1, expected for the end of 2015, is likely to introduce `#pragma omp target enter data`, `#pragma omp target exit data` and other unstructured data movement directives that allow to pass and obtain pointers of already resident memory to and from offloaded kernels.
-Currently *OpenMP* does not provide a way to control the hierarchical memory because its main assumption is a shared memory for all threads.
-Therefore, the block shared memory on *CUDA* devices can not be explicitly utilized.
-
-
-### [OpenACC](http://www.openacc-standard.org/)
-is a pragma based programming standard for heterogeneous computing.
-It is very similar to *OpenMP* and provides annotations for parallel execution and data movement as well as run-time functions for accelerator and device management.
-In contrast to *OpenMP* it allows limited access to *CUDA* block shared memory.
-Current compiler implementations support *NVIDIA*, *AMD* and *Intel* accelerators.
-Only as of *OpenACC* 2.0 explicit memory management and tiling is supported.
-*OpenACC* does not support dynamic allocation of memory (`new`, `delete`) in kernel code.
-It is aimed to be fully merged with *OpenMP* at some point, but for now *OpenMP* 4.0 only introduced some parts of it.
-
-
-### [OpenCL](https://www.khronos.org/opencl/)
-is a programming framework for heterogeneous platforms.
-It is fully hardware independent and can utilize CPUs and GPUs of nearly all vendors.
-This is achieved by compiling the *OpenCL* kernel code (or the standardized *SPIR* intermediate representation) at run-time by the platform driver into the native instruction set.
-Versions prior to 2.1 (released in March 2015) did only support a C-like kernel language.
-Version 2.1 introduced a subset of C++14.
-*OpenCL* does not support single-source programming (combining C++ host code and accelerator code in a single file).
-This is a precondition for templated kernels which are required for policy based generic programming.
-It is necessary to note that *NVIDIA* seems to neglect their *OpenCL* implementation.
-Support for version 1.2 was only added in April 2015, three and a half years after the publication of the standard.
-*OpenCL* does not support dynamic allocation of memory (`new`, `delete`) in kernel code.
-
-
-### [SYCL](https://www.khronos.org/sycl/)
-is a cross-platform abstraction layer based on *OpenCL*.
-The main advantage over *OpenCL* itself is that it allows to write single-source heterogeneous programs.
-It enables the usage of a single C++ template function for host and device code.
-As of now there is no usable free compiler implementation available that has good support for multiple accelerator devices.
-
-
-### [C++ AMP (Accelerated Massive Parallelism)](https://msdn.microsoft.com/en-us/library/hh265136.aspx)
-is an open specification from *Microsoft* currently implemented on top of *DirectX 11*.
-It is a language extension requiring compiler support that allows to annotate C++ code that can then be run on multiple accelerators.
-*C++ AMP* requires the usage of the `array` data structure or the `array_view` wrapper responsible for copying data to and from the accelerator devices.
-The `parallel_for_each` function is responsible for offloading the provided function object whose `operator()` has to be annotated with `restrict(amp)`.
-The threads can access shared memory and synchronize.
-The range of supported accelerator devices, platforms and compilers is currently very limited.
-
-
-### [KOKKOS](https://github.com/kokkos)
-<!---
-https://www.xsede.org/documents/271087/586927/Edwards-2013-XSCALE13-Kokkos.pdf
-http://trilinos.org/oldsite/events/trilinos_user_group_2013/presentations/2013-11-TUG-Kokkos-Tutorial.pdf
-http://on-demand.gputechconf.com/supercomputing/2013/presentation/SC3103\_Towards-Performance-Portable-Applications-Kokkos.pdf
-http://dx.doi.org/10.3233/SPR-2012-0343
---->
-provides an abstract interface for portable, performant shared memory-programming.
-It is a C++ library that offers `parallel_for`, `parallel_reduce` and similar functions for describing the pattern of the parallel tasks.
-The execution policy determines how the threads are executed.
-For example, this influences the sizes of blocks of threads or if static or dynamic scheduling should be used.
-The library abstracts the kernel as a function object that can not have any user defined parameters for its `operator()`.
-Inconveniently, arguments have to be stored in members of the function object coupling algorithm and data together.
-*KOKKOS* provides both, abstractions for parallel execution of code and data management.
-Multidimensional arrays with a neutral indexing and an architecture dependent layout are available, which can be used, for example, to abstract the underlying hardware's preferred memory access scheme that could be row-major, column-major or even blocked.
-
-
-### [Thrust](https://thrust.github.io/)
-is a parallel algorithms library resembling the C++ Standard Template Library (STL).
-It allows to select either the *CUDA*, *TBB* or *OpenMP* back-end at make-time.
-Because it is based on generic `host_vector` and `device_vector` container objects, it is tightly coupling the data structure and the parallelization strategy.
-There exist many similar libraries such as [ArrayFire](http://www.arrayfire.com/) (*CUDA*, *OpenCL*, native C++), [VexCL](https://github.com/ddemidov/vexcl/) (*OpenCL*, *CUDA*), [ViennaCL](http://viennacl.sourceforge.net/) (*OpenCL*, *CUDA*, *OpenMP*) and [hemi](https://github.com/harrism/hemi/) (*CUDA*, native C++).
-
-<!---
-Phalanx
-See [here](http://www.mgarland.org/files/papers/phalanx-sc12-preprint.pdf).
-It is very similar to *alpaka* in the way it abstracts the accelerators.
-C++ Interface provides CUDA, OpenMP, and GASNet back-ends
-
-Aura
-
-Intel TBB
-
-U\PC++
---->
-
-Distinction of the *alpaka* Library
-------------------------------------------
-
-In the section about the problems we saw that all portability problems of current HPC codes could be solved with an abstract interface unifying the underlying accelerator back-ends.
-The previous section showed that there is currently no project available that could solve all of the problems highlighted.
-The C++ interface library proposed to solve all those problems is called *alpaka*.
-The subsequent enumeration will summarize the purpose of the library:
-
-### *alpaka* is ...
-* an **abstract interface** describing parallel execution on multiple hierarchy levels. It allows to implement a mapping to various hardware architectures but **is no optimal mapping itself**.
-
-* sustainably solving portability (50% on the way to reach full performance portability)
-
-* solving the **heterogeneity** problem. An identical algorithm / kernel can be executed on heterogeneous parallel systems by selecting the target device.
-
-* reducing the **maintainability** burden by not requiring to duplicate all the parts of the simulation that are directly facing the parallelization framework. Instead, it allows to provide a single version of the algorithm / kernel that can be used by all back-ends. All the accelerator dependent implementation details are hidden within the *alpaka* library.
-
-* simplifying the **testability** by enabling **easy back-end switching**. No special hardware is required for testing the kernels. Even if the simulation itself will always use the *CUDA* back-end, the tests can completely run on a CPU. As long as the *alpaka* library is thoroughly tested for compatibility between the acceleration back-ends, the user simulation code is guaranteed to generate identical results (ignoring rounding errors / non-determinism) and is portable without any changes.
-
-* **optimizable**. Everything in *alpaka* can be replaced by user code to optimize for special use-cases.
-
-* **extensible**. Every concept described by the *alpaka* abstraction can be implemented by users. Therefore it is possible to non-intrusively define new devices, queues, buffer types or even whole accelerator back-ends.
-
-* **data structure agnostic**. The user can use and define arbitrary data structures.
-
-### *alpaka* is not ...
-
-* an automatically **optimal mapping** of algorithms / kernels to various acceleration platforms. Except in trivial examples an optimal execution always depends on suitably selected data structures. An adaptive selection of data structures is a separate topic that has to be implemented in a distinct library.
-
-* automatically **optimizing concurrent data accesses**.
-
-* **handling** or hiding differences in arithmetic operations. For example, due to **different rounding** or different implementations of floating point operations, results can differ slightly between accelerators.
-
-* **guaranteeing any determinism** of results. Due to the freedom of the library to reorder or repartition the threads within the tasks it is not possible or even desired to preserve deterministic results. For example, the non-associativity of floating point operations gives non-deterministic results within and across accelerators.
-
-The *alpaka* library is aimed at parallelization within nodes of a cluster.
-It does not compete with libraries for distribution of processes across nodes and communication among those.
-For these purposes libraries like MPI (Message Passing Interface) or others should be used.
-MPI is situated one layer higher and can be combined with *alpaka* to utilize the hardware of a whole heterogeneous cluster.
-The *alpaka* library can be used for parallelization within nodes, MPI for parallelization across nodes.
-
-
-Comparison
-----------
-
-The following table summarizes which of the problems mentioned in the section about the problems can be solved by current intra-node parallelization frameworks and the proof-of-concept *alpaka* abstraction library.
-
-| Framework / API | Open-Source | Free | Single-Source C++ | Portability | Heterogeneity | Maintainability | Testability | Optimizability | Data structure agnostic |
-| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
-| CUDA			| :x:              	| :white_check_mark: | :white_check_mark: | :x:               | :x:               | :x:               | :x:               | :white_check_mark: | :white_check_mark: |
-| PGI CUDA-x86	| :x:             	| :x:               | :white_check_mark: | :white_check_mark: | :large_orange_diamond: | :white_check_mark: | :white_check_mark: | :x:               | :white_check_mark: |
-| GPU Ocelot		| :white_check_mark:	| :white_check_mark: | :white_check_mark: | :white_check_mark: | :large_orange_diamond: | :white_check_mark: | :white_check_mark: | :x:               | :white_check_mark: |
-| OpenMP			| :white_check_mark:	| :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :x:               | :white_check_mark: |
-| OpenACC			| :white_check_mark:	| :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :x:               | :white_check_mark: |
-| OpenCL			| :white_check_mark:	| :white_check_mark: | :x:               | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :x:               | :white_check_mark: |
-| SYCL			| :white_check_mark:	| (:ballot_box_with_check:) | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | (:ballot_box_with_check:) | :white_check_mark: |
-| C++AMP			| :white_check_mark:	| :white_check_mark: | :white_check_mark: | (:ballot_box_with_check:) | :white_check_mark: | :white_check_mark: | :white_check_mark: | :x:               | :white_check_mark: |
-| KOKKOS			| :white_check_mark:	| :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :x:               | :large_orange_diamond: |
-| Thrust			| :white_check_mark:	| :white_check_mark: | :white_check_mark: | :white_check_mark: | :large_orange_diamond: | :white_check_mark: | :white_check_mark: | :x:               | :x:               |
-| **alpaka**			| :white_check_mark:	| :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: |
-
-Properties of intra-node parallelization frameworks and their ability to solve the problems in porting performant HPC codes. :white_check_mark: : yes / fully solved, :large_orange_diamond: : partially solved, :x: : no / not solved
diff --git a/thirdParty/alpaka/doc/markdown/user/abstraction/Block.md b/thirdParty/alpaka/doc/markdown/user/abstraction/Block.md
deleted file mode 100644
index db4db63188..0000000000
--- a/thirdParty/alpaka/doc/markdown/user/abstraction/Block.md
+++ /dev/null
@@ -1,34 +0,0 @@
-[:arrow_up: Up](../Abstraction.md)
-
-Block
-=====
-
-Building a processor with possibly thousands of cores where all cores have an equal length connection for fast communication and synchronization is not viable.
-Either the processor size would have to grow exponentially with the number of cores or the all-to-all communication speed would decrease so much that computations on the processor would be impractical.
-Therefore, the communication and synchronization of threads has to be limited to sizes manageable by real hardware.
-
-The figure below depicts the solution of introducing a new hierarchy level in the abstraction.
-A hypothetical processor is allowed to provide synchronization and fast communication within blocks of threads but is not required to provide synchronization across blocks.
-The whole grid is subdivided into equal sized blocks with a fast but small shared memory.
-Current accelerator abstractions (*CUDA* and *OpenCL*) only support equal sized blocks.
-This restriction could possibly be lifted to support future accelerators with heterogeneous block sizes.
-![block](block/block.png)
-
-There is another reason why independent blocks are necessary.
-Threads that can communicate and synchronize require either a one-to-one mapping of threads to cores, which is impossible because the number of data elements is theoretically unlimited, or at least a space to store the state of each thread.
-Even old single core CPUs were able to execute many communicating and synchronizing threads by using cooperative or preemptive multitasking.
-Therefore, one might think that a single core would be enough to execute all the data parallel threads.
-But the problem is that even storing the set of registers and local data of all the possible millions of threads of a task grid is not always viable.
-The blocking scheme solves this by enabling fast interaction of threads on a local scale but additionally removes the necessity to store the state of all threads in the grid at once because only threads within a block must be executed in parallel.
-Within a block of cores there still has to be enough memory to store all registers of all contained threads.
-The independence of blocks allows applications to scale well across diverse devices.
-As can be seen in the following figure, the accelerator can assign blocks of the task grid to blocks of cores in arbitrary order depending on availability and workload.
-![block_scale](block/block_scale.png)
-
-Shared Memory
--------------
-
-Each block has its own shared memory.
-This memory can only be accessed explicitly by threads within the same block and gets discarded after the complete block finished its calculation.
-This memory is typically very fast but also very small.
-No variables are shared between kernels by default.
diff --git a/thirdParty/alpaka/doc/markdown/user/abstraction/Element.md b/thirdParty/alpaka/doc/markdown/user/abstraction/Element.md
deleted file mode 100644
index d89c0b7e19..0000000000
--- a/thirdParty/alpaka/doc/markdown/user/abstraction/Element.md
+++ /dev/null
@@ -1,42 +0,0 @@
-[:arrow_up: Up](../Abstraction.md)
-
-Element
-=======
-
-To use the maximum available computing power of, for example, a modern x86 processor, the computation has to utilize the SIMD vector registers.
-Many current architectures support issuing a single instruction that can be applied to multiple data elements in parallel.
-
-The original x86 instruction set architecture did not support SIMD instructions but has been enhanced with MMX (64 bit width registers), SSE (128 bit width registers), AVX (256 bit width registers) and AVX-512 (512 bit width registers) extensions.
-To varying degrees, they allow processing multiple 32 bit and 64 bit floating point numbers as well as 8, 16, 32 and 64 bit signed and unsigned integers.
-
-*CUDA* capable GPUs do not have vector registers where multiple values of type `float` or `double` can be manipulated by one instruction.
-Nevertheless, newer *CUDA* capable devices implement basic SIMD instructions on pairs of 16 bit values and quads of 8-bit values. 
-They are described in the documentation of the [PTX instruction set architecture](http://docs.nvidia.com/cuda/parallel-thread-execution/index.html#axzz4OTzGGwcJ) chapter 8.7.13 but are only of any use in very special problem domains, for example for deep learning.
-
-It would be optimal if the compiler could automatically vectorize our kernels when they are called in a loop and vectorization is supported by the underlying accelerator.
-However, besides full blown vector processors, mainstream CPUs do not support predicated execution or similar complex things within vector registers.
-At most, there is support for masking operations which allow to emulate at least some conditional branching.
-Therefore, this missing hardware capability has to be circumvented by the compiler.
-There are scientific research projects such as the work done by Ralf Karrenberg et al [1](http://www.cdl.uni-saarland.de/publications/theses/karrenberg_msc.pdf) [2](http://www.cdl.uni-saarland.de/projects/wfv/wfv_cgo11_slides.pdf) [3](http://www.cdl.uni-saarland.de/papers/karrenberg_opencl.pdf) building on the *LLVM* compiler infrastructure supporting such whole-function vectorization.
-However, current mainstream compilers do not support automatic vectorization of basic, non trivial loops containing control flow statements (`if`, `else`, `for`, etc.) or other non-trivial memory operations.
-Therefore, it has to be made easier for the compiler to recognize the vectorization possibilities by making it more explicit.
-
-The opposite of automatic whole function vectorization is the fully explicit vectorization of expressions via compiler intrinsics directly resulting in the desired assembly instruction.
-A big problem when trying to utilize fully explicit vectorization is, that there is no common foundation supported by all explicit vectorization methods.
-A wrapper unifying the x86 SIMD intrinsics found in the `intrin.h` or `x86intrin.h` headers with those supported on other platforms, for example ARM NEON (`arm_neon.h`), PowerPC Altivec (`altivec.h`) or *CUDA* is not available and to write one is a huge task in itself.
-However, if this would become available in the future, it could easily be integrated into *alpaka* kernels.
-
-Due to current compilers being unable to vectorize whole functions and the explicit vectorization intrinsics not being portable, one has to rely on the vectorization capabilities of current compilers for primitive loops only consisting of a few computations.
-By creating a grid of data elements, where multiple elements are processed per thread and threads are pooled in independent blocks, as it is shown in the figure below, the user is free to loop sequentially over the elements or to use vectorization for selected expressions within the kernel.
-Even the sequential processing of multiple elements per thread can be useful depending on the architecture.
-For example, the *NVIDIA cuBLAS* general matrix-matrix multiplication (GEMM) internally executes only one thread for each second matrix data element to better utilize the registers available per thread.
-![element](element/element.png)
-
-<!---
-The best solution to vectorization would be one, where the user does not have to do anything.
-This is not possible because the smallest unit supplied by the user is a kernel which is executed in threads which can synchronize.
-
-It is not possible to execute multiple kernels sequentially to hide the vectorization by starting a kernel-thread for e.g. each 4th thread in a block and then looping over the 4 entries.
-This would prohibit the synchronization between these threads.
-By executing 4 fibers inside such a vectorization kernel-thread we would allow synchronization again but prevent the loop vectorizer from working.
---->
\ No newline at end of file
diff --git a/thirdParty/alpaka/doc/markdown/user/abstraction/Thread.md b/thirdParty/alpaka/doc/markdown/user/abstraction/Thread.md
deleted file mode 100644
index a684001f10..0000000000
--- a/thirdParty/alpaka/doc/markdown/user/abstraction/Thread.md
+++ /dev/null
@@ -1,38 +0,0 @@
-[:arrow_up: Up](../Abstraction.md)
-
-Thread
-======
-
-Theoretically, a basic data parallel task can be executed optimally by executing one thread per independent data element.
-In this context, the term thread does not correspond to a native kernel-thread, an *OpenMP* thread, a *CUDA* thread, a user-level thread or any other such threading variant.
-It only represents the execution of a sequence of commands forming the desired algorithm on a per data element level.
-This ideal one-to-one mapping of data elements to threads leads to the execution of a multidimensional grid of threads corresponding to the data structure of the underlying problem.
-The uniform function executed by each of the threads is called a kernel.
-Some algorithms such as reductions require the possibility to synchronize or communicate between threads to calculate a correct result in a time optimal manner.
-Therefore our basic abstraction requires an n-dimensional grid of synchronizable threads each executing the same kernel.
-The following figure shows a hypothetical processing unit that could optimally execute this data parallel task.
-The threads are mapped one-to-one to the cores of the processor.
-For a time optimal execution, the cores have to have an all-to-all equal length connection for communication and synchronization.
-![thread](thread/thread.png)
-
-The only difference between the threads is their positional index into the grid which allows each thread to compute a different part of the solution.
-Threads can always access their private registers and the global memory.
-
-Registers
----------
-
-All variables with default scope within a kernel are automatically saved in registers and are not shared automatically.
-This memory is local to each thread and can not be accessed by other threads.
-
-Global Memory
--------------
-
-The global memory can be accessed from every thread in the grid as well as from the host thread.
-This is typically the largest but also the slowest memory available.
-
-Individual threads within the grid are allowed to statically or dynamically allocate buffers in the global memory.
-
-Prior to the execution of a task, the host thread copies the input buffers and allocates the output buffers onto the accelerator device.
-Pointers to these buffers then can be given as arguments to the task invocation.
-By using the index of each thread within the grid, the offset into the global input and output buffers can be calculated.
-After the computation has finished, the output buffer can be used either as input to a subsequent task or can be copied back to the host.
diff --git a/thirdParty/alpaka/doc/markdown/user/abstraction/Warp.md b/thirdParty/alpaka/doc/markdown/user/abstraction/Warp.md
deleted file mode 100644
index c5b05df768..0000000000
--- a/thirdParty/alpaka/doc/markdown/user/abstraction/Warp.md
+++ /dev/null
@@ -1,29 +0,0 @@
-[:arrow_up: Up](../Abstraction.md)
-
-Warp
-====
-
-With the current abstraction only independent parallelism via blocks and synchronizable parallelism via threads can be expressed.
-However, there are more variants of parallelism in real hardware.
-Because all threads in the grid are executing the same kernel and even the same instruction at the same time when ignoring divergent control flows, a lot of chip space can be saved.
-Multiple threads can be executed in perfect synchronicity, which is also called lock-step.
-A group of such threads executing the same instruction at the same time is called a warp.
-All threads within a warp share a single instruction pointer (IP), and all cores executing the threads share one instruction fetch (IF) and instruction decode (ID) unit.
-![warp](warp/warp.png)
-
-Even threads with divergent control flows can be executed within one warp.
-*CUDA*, for example, solves this by supporting predicated execution and warp voting.
-For long conditional branches the compiler inserts code which checks if all threads in the warp take the same branch.
-For small branches, where this is too expensive, all threads always execute all branches.
-Control flow statements result in a predicate and only in those threads where it is true, the predicated instructions will have an effect.
-
-Not only *CUDA* GPUs support the execution of multiple threads in a warp.
-Full blown vector processors with good compilers are capable of combining multiple loop iterations containing complex control flow statements in a similar manner as *CUDA*.
-
-Due to the synchronicity of threads within a warp, memory operations will always occur at the same time in all threads.
-This allows memory accesses to be coalesced.
-Different *CUDA* devices support different levels of memory coalescing.
-Older ones only supported combining multiple memory accesses if they were aligned and sequential in the order of thread indices.
-Newer ones support unaligned scattered accesses as long as they target the same 128 byte segment.
-
-The ability of very fast context switches between warps and a queue of ready warps allows *CUDA* capable GPUs to hide the latency of global memory operations.
diff --git a/thirdParty/alpaka/doc/markdown/user/implementation/Library.md b/thirdParty/alpaka/doc/markdown/user/implementation/Library.md
deleted file mode 100644
index 3a0c042164..0000000000
--- a/thirdParty/alpaka/doc/markdown/user/implementation/Library.md
+++ /dev/null
@@ -1,16 +0,0 @@
-[:arrow_up: Up](../Implementation.md)
-
-Library Interface
-=================
-
-As described in the chapter about the Abstraction, the general design of the library is very similar to *CUDA* and *OpenCL* but extends both by some points, while not requiring any language extensions.
-General interface design as well as interface implementation decisions differentiating *alpaka* from those libraries are described in the Rationale section.
-It uses C++ because it is one of the most performant languages available on nearly all systems.
-Furthermore, C++11 allows to describe the concepts in a very abstract way that is not possible with many other languages.
-The *alpaka* library extensively makes use of advanced functional C++ template meta-programming techniques.
-The Implementation Details section discusses the C++ library and the way it provides extensibility and optimizability.
-
-1. [Structure](library/Structure.md)
-2. [Usage](library/Usage.md)
-3. [Rationale](library/Rationale.md)
-4. [Details](library/Details.md)
diff --git a/thirdParty/alpaka/doc/markdown/user/implementation/Mapping.md b/thirdParty/alpaka/doc/markdown/user/implementation/Mapping.md
deleted file mode 100644
index 70a28fb6b5..0000000000
--- a/thirdParty/alpaka/doc/markdown/user/implementation/Mapping.md
+++ /dev/null
@@ -1,24 +0,0 @@
-[:arrow_up: Up](../Implementation.md)
-
-Mapping onto Specific Hardware Architectures
-============================================
-
-By providing an accelerator independent interface for kernels, their execution and memory accesses at different hierarchy levels, *alpaka* allows the user to write accelerator independent code that does not neglect performance.
-
-The mapping of the decomposition to the execution environment is handled by the back-ends provided by the *alpaka* library as well as user defined back-ends.
-A computation that is described with a maximum of the parallelism available in the *redundant hierarchical parallelism* abstraction can not be mapped one to one to any existing hardware.
-GPUs do not have vector registers for `float` or `double` types.
-Therefore, the element level is often omitted on *CUDA* accelerators.
-CPUs in turn are not (currently) capable of running thousands of threads concurrently and do not have equivalently fast inter-thread synchronization and shared memory access as GPUs do.
-
-A major point of the *redundant hierarchical parallelism* abstraction is to ignore specific unsupported levels and utilize only the ones supported on a specific accelerator.
-This allows a mapping to various current and future accelerators in a variety of ways enabling optimal usage of the underlying compute and memory capabilities.
-
-The grid level is always mapped to the whole device being in consideration.
-The scheduler can always execute multiple kernel grids from multiple queues in parallel by statically or dynamically subdividing the available resources.
-However, this will only ever simplify the mapping due to fewer available processing units.
-Furthermore, being restricted to fewer resources automatically improves the locality of data due to spatial and temporal locality properties of the caching hierarchy.
-
-1. [CUDA GPUs](mapping/CUDA.md)
-2. [x86 CPUs](mapping/x86.md)
-3. [Accelerators](mapping/Accelerators.md)
diff --git a/thirdParty/alpaka/doc/markdown/user/implementation/library/Details.md b/thirdParty/alpaka/doc/markdown/user/implementation/library/Details.md
deleted file mode 100644
index 0f76843013..0000000000
--- a/thirdParty/alpaka/doc/markdown/user/implementation/library/Details.md
+++ /dev/null
@@ -1,242 +0,0 @@
-[:arrow_up: Up](../Library.md)
-
-Details
-=======
-
-![Overview of the structure of the *alpaka* library with concepts and implementations.](structure.png)
-
-The full stack of concepts defined by the *alpaka* library and their inheritance hierarchy is shown in the third column of the preceding figure.
-Default implementations for those concepts can be seen in the blueish columns.
-The various accelerator implementations, shown in the lower half of the figure, only differ in some of their underlying concepts but can share most of the base implementations.
-The default implementations can, but do not have to be used at all.
-They can be replaced by user code in arbitrary granularity.
-By substituting, for instance, the atomic operation implementation of an accelerator, the execution can be fine-tuned, to better utilize the hardware instruction set of a specific processor.
-However, also complete accelerators, devices and all of the other concepts can be implemented by the user without the need to change any part of the *alpaka* library itself.
-The way this and other things are implemented is explained in the following paragraphs.
-
-Concept Implementations
------------------------
-
-The *alpaka* library has been implemented with extensibility in mind.
-This means that there are no predefined classes, modeling the concepts, the *alpaka* functions require as input parameters.
-They allow arbitrary types as parameters, as long as they model the required concept.
-
-C++ provides a language inherent object oriented abstraction allowing to check that parameters to a function comply with the concept they are required to model.
-By defining interface classes, which model the *alpaka* concepts, the user would be able to inherit his extension classes from the interfaces he wants to model and implement the abstract virtual methods the interfaces define.
-The *alpaka* functions in turn would use the corresponding interface types as their parameter types.
-For example, the `Buffer` concept requires methods for getting the pitch or changing the memory pinning state.
-With this intrusive object oriented design pattern the `BufCpu` or `BufCudaRt` classes would have to inherit from an `IBuffer` interface and implement the abstract methods it declares.
-An example of this basic pattern is shown in the following source snippet:
-
-```C++
-struct IBuffer
-{
-	virtual std::size_t getPitch() const = 0;
-	virtual void pin() = 0;
-	virtual void unpin() = 0;
-	...
-};
-
-struct BufCpu : public IBuffer
-{
-	virtual std::size_t getPitch() const override { ... }
-	virtual void pin() override { ... }
-	virtual void unpin() override { ... }
-	...
-};
-	
-ALPAKA_FN_HOST auto copy(
-	IBuffer & dst,
-	IBuffer const & src)
--> void
-{
-	...
-}
-```
-
-The compiler can then check at compile time that the objects the user wants to use as function parameters can be implicitly cast to the interface type, which is the case for inherited base classes.
-The compiler returns an error message on a type mismatch.
-However, if the *alpaka* library were using those language inherent object oriented abstractions, the extensibility and optimizability it promises would not be possible.
-Classes and run-time polymorphism require the implementer of extensions to intrusively inherit from predefined interfaces and override special virtual functions.
-
-This is feasible for user defined classes or types where the source code is available and where it can be changed.
-The `std::vector` class template on the other hand would not be able to model the `Buffer` concept because we can not change its definition to inherit from the `IBuffer` interface class since it is part of the standard library.
-The standard inheritance based object orientation of C++ only works well when all the code it is to interoperate with can be changed to implement the interfaces.
-It does not enable interaction with unalterable or existing code that is too complex to change, which is the reality in the majority of software projects.
-
-Another option to implement an extensible library is to follow the way the C++ standard library uses.
-It allows to specialize function templates for user types to model concepts without altering the types themselves.
-For example, the `std::begin` and `std::end` free function templates can be specialized for user defined types.
-With those functions specialized, the C++11 range-based for loops (`for(auto & i : userContainer){...}`, see *C++ Standard 6.5.4/1*) can be used with user defined types.
-Equally specializations of `std::swap` and other standard library function templates can be defined to extend those with support for user types.
-One problem with function specialization is that only full specializations are allowed.
-A partial function template specialization is not allowed by the standard.
-Another problem can emerge due to users carelessly overloading the template functions instead of specializing them.
-Mixing function overloading and function template specialization on the same base template function can result in unexpected results.
-The reasons and effects of this are described more closely in an article from H. Sutter (currently convener of the ISO C++ committee) called *Sutter's Mill: Why Not Specialize Function Templates?* in the *C/C++ Users Journal* in July 2001.
-<!--- NOTE: different way: http://ericniebler.com/2014/10/21/customization-point-design-in-c11-and-beyond/ -->
-
-The solution given in the article is to provide *"a single function template that should never be specialized or overloaded"*.
-This function simply forwards its arguments *"to a class template containing a static function with the same signature"*.
-This template class can fully or partially be specialized without affecting overload resolution.
-
-The way the *alpaka* library implements this is by not using the C++ inherent object orientation but lifting those abstractions to a higher level.
-Instead of using a non-extensible `class`/`struct` for defining the interface, a namespace is utilized.
-In place of abstract virtual member functions of the interface, *alpaka* defines free functions within those namespaces.
-All those functions are templates allowing the user to call them with arbitrary self defined types and not only those inheriting from a special interface type.
-Unlike member functions, they have no implicit `this` pointer, so the object instance has to be explicitly given as a parameter.
-Overriding the abstract virtual interface methods is replaced by the specialization of a template type that is defined for each such namespace function.
-
-A concept is completely implemented by specializing the predefined template types.
-This allows to extend and fine-tune the implementation non-intrusively.
-For example, the corresponding pitch and memory pinning template types can be specialized for `std::vector`.
-After doing this, the `std::vector` can be used everywhere a buffer is accepted as argument throughout the whole *alpaka* library without ever touching its definition.
-
-A simple function allowing arbitrary tasks to be enqueued into a queue can be implemented in the way shown in the following code.
-The `TSfinae` template parameter will be explained in a [following section](#Template-Specialization-Selection-on-Arbitrary-Conditions).
-
-```C++
-namespace queue
-{
-	template<
-		typename TQueue,
-		typename TTask,
-		typename TSfinae = void>
-	struct Enqueue;
-		
-	template<
-		typename TQueue,
-		typename TTask>
-	ALPAKA_FN_HOST auto enqueue(
-		TQueue & queue,
-		TTask & task)
-	-> void
-	{
-		Enqueue<
-			TQueue,
-			TTask>
-		::enqueue(
-			queue,
-			task);
-	}
-}
-```
-
-A user who wants his queue type to be used with this `enqueue` function has to specialize the `Enqueue` template struct.
-This can be either done partially by only replacing the `TQueue` template parameter and accepting arbitrary tasks or by fully specializing and replacing both `TQueue` and `TTask`. This gives the user complete freedom of choice.
-The example given in the following code shows this by specializing the `Enqueue` type for a user queue type `UserQueue` and arbitrary tasks.
-
-```C++
-struct UserQueue{};
-
-namespace queue
-{
-	// partial specialization
-	template<
-		typename TTask>
-	struct Enqueue<
-		UserQueue,
-		TTask>
-	{
-		ALPAKA_FN_HOST static auto enqueue(
-			UserQueue & queue,
-			TTask & task)
-		-> void
-		{
-			//...
-		}
-	};
-}
-```
-
-In addition the subsequent code shows a full specialization of the `Enqueue` type for a given `UserQueue` and a `UserTask`.
-
-```C++
-struct UserQueue{};
-struct UserTask{};
-
-namespace queue
-{
-	// full specialization
-	template<>
-	struct Enqueue<
-		UserQueue,
-		UserTask>
-	{
-		ALPAKA_FN_HOST static auto enqueue(
-			UserQueue & queue,
-			UserTask & task)
-		-> void
-		{
-			//...
-		}
-	};
-}
-```
-
-When the `enqueue` function template is called with an instance of `UserQueue`, the most specialized version of the `Enqueue` template is selected depending on the type of the task `TTask` it is called with.
-
-A type can model the queue concept completely by defining specializations for `alpaka::queue::Enqueue` and `alpaka::queue::Empty`.
-This functionality can be accessed by the corresponding `alpaka::queue::enqueue` and `alpaka::queue::empty` template functions.
-
-Currently there is no native language support for describing and checking concepts in C++ at compile time.
-A study group (SG8) is working on the ISO [specification for concepts](http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2015/n4377.pdf) and compiler forks implementing them do exist.
-For usage in current C++ there are libraries like [*Boost.ConceptCheck*](http://www.boost.org/doc/libs/1_58_0/libs/concept_check/concept_check.htm) which try to emulate requirement checking of concept types.
-Those libraries often exploit the preprocessor and require non-trivial changes to the function declaration syntax.
-Therefore the *alpaka* library does not currently make use of *Boost.ConceptCheck*.
-Neither does it make use of the proposed concept specification due to its dependency on non-standard compilers.
-
-The usage of concepts as described in the working draft would often dramatically enhance the compiler error messages in case of violation of concept requirements.
-Currently the error messages are pointing deeply inside the stack of library template invocations where the missing method or the like is called.
-Instead of this, with concept checking it would directly fail at the point of invocation of the outermost template function with an expressive error message about the parameter and its violation of the concept requirements.
-This would simplify especially the work with extendable template libraries like *Boost* or *alpaka*.
-However, in the way concept checking would be used in the *alpaka* library, omitting it does not change the semantic of the program, only the compile time error diagnostics.
-In the future when the standard incorporates concept checking and the major compilers support it, it will be added to the *alpaka* library.
-
-
-Template Specialization Selection on Arbitrary Conditions
----------------------------------------------------------
-
-Basic template specialization only allows for a selection of the most specialized version where all explicitly stated types have to be matched identically.
-It is not possible to enable or disable a specialization based on arbitrary compile time expressions depending on the parameter types.
-To allow such conditions, *alpaka* adds a defaulted and unused `TSfinae` template parameter to all declarations of the implementation template structs.
-This was shown using the example of the `Enqueue` template type.
-The C++ technique called SFINAE, an acronym for *Substitution failure is not an error*, allows disabling arbitrary specializations depending on compile time conditions.
-Specializations where the substitution of the parameter types by the deduced types would result in invalid code will not result in a compile error, but will simply be omitted.
-An example in the context of the `Enqueue` template type is shown in the following code.
-
-```C++
-struct UserQueue{};
-
-namespace queue
-{
-	template<
-		typename TQueue,
-		typename TTask>
-	struct Enqueue<
-		TQueue,
-		TTask,
-		typename std::enable_if<
-			std::is_base_of<UserQueue, TQueue>::value
-			&& (TTask::TaskId == 1u)
-		>::type>
-	{
-		ALPAKA_FN_HOST static auto enqueue(
-			TQueue & queue,
-			TTask & task)
-		-> void
-		{
-			//...
-		}
-	};
-}
-```
-
-The `Enqueue` specialization shown here does not require any direct type match for the `TQueue` or the `TTask` template parameter.
-It will be used in all contexts where `TQueue` has inherited from `UserQueue` and where the `TTask` has a static const integral member value `TaskId` that equals one.
-If the `TTask` type does not have a `TaskId` member, this code would be invalid and the substitution would fail.
-However, due to SFINAE, this would not result in a compiler error but rather only in omitting this specialization.
-The `std::enable_if` template results in a valid expression, if the condition it contains evaluates to true, and an invalid expression if it is false.
-Therefore it can be used to disable specializations depending on arbitrary boolean conditions.
-It is utilized in the case where the `TaskId` member is not equal to one or the `TQueue` does not inherit from `UserQueue`.
-In these circumstances, the condition itself results in valid code but because it evaluates to false, the `std::enable_if` specialization results in invalid code and the whole `Enqueue` template specialization gets omitted.
diff --git a/thirdParty/alpaka/doc/markdown/user/implementation/library/Rationale.md b/thirdParty/alpaka/doc/markdown/user/implementation/library/Rationale.md
deleted file mode 100644
index c816a5c0a2..0000000000
--- a/thirdParty/alpaka/doc/markdown/user/implementation/library/Rationale.md
+++ /dev/null
@@ -1,278 +0,0 @@
-[:arrow_up: Up](../Library.md)
-
-Rationale
-=========
-
-Interface Distinction
---------------------
-
-The *alpaka* library is different from other similar libraries (especially *CUDA*) in that it refrains from using implicit or hidden state.
-This and other interface design decisions will be explained in the following paragraphs.
-
-### No Current Device:
-The *CUDA* runtime API for example supplies a current device for each user code kernel-thread.
-Working with multiple devices requires calling `cudaSetDevice` to change the current device whenever an operation should be executed on a non-current device.
-Even the functions for creating a queue (`cudaStreamCreate`) or an event (`cudaEventCreate`) use the current device without any way to create them on a non current device.
-In the case of an event this dependency is not obvious, since at the same time queues can wait for events from multiple devices allowing cross-device synchronization without any additional work.
-So conceptually an event could also have been implemented device independently.
-This can lead to hard to track down bugs due to the non-explicit dependencies, especially in multi-threaded code using multiple devices.
-
-### No Default Device:
-In contrast to the *CUDA* runtime API *alpaka* does not provide a device by default per kernel-thread.
-Especially in combination with *OpenMP* parallelized host code this keeps users from surprises.
-The following code snippet shows that it does not necessarily do what one would expect.
-
-```C++
-cudaSetDevice(1);
-
-#pragma omp parallel for
-for(int i = 0; i<10; ++i)
-{
-    kernel<<<blocks,threads>>>(i);
-}
-```
-
-Depending on what the *CUDA* runtime API selects as default device for each of the *OpenMP* threads (due to each of them having its own current device), not all of the kernels will necessarily run on device one.
-
-In the *alpaka* library all such dependencies are made explicit.
-All functions depending on a device require it to be given as a parameter.
-The *alpaka* *CUDA* back-end checks before forwarding the calls to the *CUDA* runtime API whether the current device matches the given one and changes it if required.
-The *alpaka* *CUDA* back-end does not reset the current device to the one prior to the method invocation out of performance considerations.
-This has to be considered when native *CUDA* code is combined with *alpaka* code.
-
-### No Default Queue:
-*CUDA* allows to execute commands without specifying a queue.
-The default queue that is used synchronizes implicitly with all other queues on the device.
-If a command is issued to the default queue, all other asynchronous queues have to wait before executing any new commands, even when they have been enqueued much earlier.
-This can introduce hard to track down performance issues.
-As of *CUDA* 7.0 the default queue can be converted to a non synchronizing queue with a compiler option.
-Because concurrency is crucial for performance and users should think about the dependencies between their commands from the beginning, *alpaka* does not provide such a default queue.
-All asynchronous operations (kernel launches, memory copies and memory sets) require a queue to be executed in.
-
-### No Implicit Built-in Variables and Functions:
-Within *CUDA* device functions (functions annotated with `__global__` or `__device__`) built-in functions (`__syncthreads`, `__threadfence`, `atomicAdd`, ... ) and variables (`gridDim`, `blockIdx`, `blockDim`, `threadIdx`, `warpSize`, ...) are provided.
-
-It would have been possible to emulate those implicit definitions by forcing the kernel function object to inherit from a class providing these functions and members.
-However functions outside the kernel function object would then pose a problem.
-They do not have access to those functions and members the function object has inherited.
-To circumvent this, the functions and members would have to be public, the inheritance would have to be public and a reference to the currently executing function object would have to be passed as parameter to external functions.
-This would have been too cumbersome and inconsistent.
-Therefore access to the accelerator is given to the user kernel function object via one special input parameter representing the accelerator.
-After that this accelerator object can simply be passed to other functions.
-The built-in variables can be accessed by the user via query functions on this accelerator.
-
-*Abandoning all the implicit and default state makes it much easier for users of the library to reason about their code.*
-
-### No Language Extensions:
-Unlike *CUDA*, the *alpaka* library does not extend the C++ language with any additional variable qualifiers (`__shared__`, `__constant__`, `__device__`) defining the memory space.
-Instead of those qualifiers *alpaka* provides accelerator functions to allocate memory in the different memory spaces.
-
-### No Dimensionality Restriction:
-*CUDA* always uses three-dimensional indices and extents, even though the task may only be one or two dimensional.
-*OpenCL* on the other hand allows grid and block dimensions in the range [1,3] but does not provide corresponding n-dimensional indices, but rather provides functions like `get_global_id` or `get_local_id`, which require the dimension in which the one-dimensional ID is to be queried as a parameter.
-By itself this is no problem, but how can it be assured that a two-dimensional kernel is called with grid and block extents of the correct dimensionality at compile time?
-How can it be assured that a kernel which only uses `threadIdx.x` or equivalently calls `get_global_id(0)` will not get called with two dimensional grid and block extents?
-Because the result in such a case is undefined, and most of the time not wanted by the kernel author, this should be easy to check and reject at compile-time.
-In *alpaka* all accelerators are templatized on the dimensionality.
-This allows a two-dimensional image filter to assert that it is only called with a two dimensional accelerator.
-Thereby the algorithms can check for supported dimensionality of the accelerator at compile time instead of runtime.
-Furthermore with the dimension being a template parameter, the CPU back-end implementations are able to use only the number of nested loops really necessary instead of the 6 loops (2 x 3 loops for grid blocks and block threads), which are mandatory to emulate the *CUDA* threaded blocking scheme.
-
-*By hiding all the accelerator functionality inside of the accelerator object that is passed to the user kernel, the user of the *alpaka* library is not faced with any non-standard C++ extensions.
-Nevertheless the *CUDA* back-end internally uses those language extensions.*
-
-### Integral Sizes of Arbitrary Type:
-The type of sizes such as extents, indices and related variables depends on a template parameter of the accelerator and connected classes.
-This allows the kernel to be executed with sizes of arbitrary ranges.
-Thereby it is possible to force the accelerator back-ends to perform all internal index, extent and other integral size depending computations with a given precision.
-This is especially useful on current *NVIDIA* GPUs.
-Even though they support 64-bit integral operations, they are emulated with multiple 32-bit operations.
-This can be a huge performance penalty when the sizes of buffers, offsets, indices and other integral variables holding sizes are known to be limited.
-
-### No synchronous (blocking) and asynchronous (non-blocking) function versions:
-*CUDA* provides two versions of many of the runtime functions, for example, `cudaMemcpyAsync` and `cudaMemcpy`.
-The asynchronous version requires a queue while the synchronous version does not need a queue parameter.
-The asynchronous version immediately returns control back to the caller while the task is enqueued into the given queue and executed later in parallel to the host code.
-The synchronous version waits for the task to finish before the function call returns control to the caller.
-Inconsistently, all kernels in a *CUDA* program can only be started either asynchronously by default or synchronously if `CUDA_LAUNCH_BLOCKING` is defined.
-There is no way to specify this on a per kernel basis.
-To switch a whole application from asynchronous to synchronous calls, for example for debugging reasons, it is necessary to change the names of all the runtime functions being called as well as their parameters.
-In *alpaka* this is solved by always enqueuing all tasks into a queue and not defining a default queue.
-Non-blocking queues as well as blocking queues are provided for all devices.
-Changes to the synchronicity of multiple tasks can be made on a per queue basis by changing the queue type at the place of creation.
-There is no need to change any line of calling code.
-
-### Memory Management
-Memory buffers cannot be identified solely by the pointer to their first byte.
-The C++ `new` and `malloc`, the *CUDA* `cudaMalloc` as well as the *OpenCL* `clCreateBuffer` functions all return a plain pointer.
-This is not enough when working with multiple accelerators and multiple devices.
-To know where a specific pointer was allocated, additional information has to be stored to uniquely identify a memory buffer on a specific device.
-Memory copies between multiple buffers additionally require the buffer extents and pitches to be known.
-Many APIs, for example *CUDA*, require the user to store this information externally.
-To unify the usage, *alpaka* stores all the necessary information in a memory buffer object.
-
-Acceleratable Functions
------------------------
-
-Many parallelization libraries / frameworks do not fully support the separation of the parallelization strategy from the algorithm itself.
-*OpenMP*, for example, fully mixes the per thread algorithm and the parallelization strategy.
-This can be seen in the source listing showing a simple AXPY computation with OpenMP.
-
-```C++
-template<
-    typename TIdx,
-    typename TElem>
-void axpyOpenMP(
-    TIdx const n,
-    TElem const alpha,
-    TElem const * const X,
-    TElem * const Y)
-{
-    #pragma omp parallel for
-    for (TIdx i=0; i<n; i++)
-    {
-        Y[i] = alpha * X[i] + Y[i];
-    }
-}
-```
-
-Only one line of the function body, line 13, is the algorithm itself, while all surrounding lines represent the parallelization strategy.
-In *OpenACC* the parallelization and the algorithm are similarly combined.
-
-*CUDA*, *OpenCL* and other libraries allow, at least to some degree, to separate the algorithm from the parallelization strategy.
-They define the concept of a kernel representing the algorithm itself which is then parallelized depending on the underlying hardware.
-The AXPY *CUDA* kernel source code shown in the following listing consists only of the code of one single iteration.
-
-```C++
-template<
-    typename TIdx,
-    typename TElem>
-__global__ void axpyCUDA(
-    TIdx const n,
-    TElem const alpha,
-    TElem const * const X,
-    TElem * const Y)
-{
-    TIdx const i(blockIdx.x*blockDim.x + threadIdx.x);
-    if(i < n)
-    {
-        Y[i] = alpha * X[i] + Y[i];
-    }
-}
-```
-
-On the other hand the *CUDA* implementation is bloated with code handling the inherent blocking scheme.
-Even if the algorithm does not utilize blocking, as it is the case here, the algorithm writer has to calculate the global index of the current thread by hand (line 10).
-Furthermore, to support vectors larger than the predefined maximum number of threads per block (1024 for current *CUDA* devices), multiple blocks have to be used.
-When the number of blocks does not divide the number of vector elements, it has to be assured that the threads responsible for the vector elements behind the given length, do not access the memory to prevent a possible memory access error.
-
-By using the kernel concept, the parallelization strategy, whether all elements are executed in sequential order, in parallel or blocked is not hard coded into the algorithm itself.
-The possibly multidimensional nested loops do not have to be written by the user.
-For example, six loops would be required to emulate the *CUDA* execution pattern with a grid of blocks consisting of threads.
-
-Furthermore the kernel concept breaks the algorithm down to the per element level.
-Recombining multiple kernel iterations to loop over lines, columns, blocks or any other structure is always possible by changing the calling code and does not require a change of the kernel.
-In contrast, by using *OpenMP* this would not be possible.
-Therefore the *alpaka* interface builds on the kernel concept, being the body of the corresponding standard for loop executed in each thread.
-
-### Execution Domain Specifications
-
-*CUDA* requires the user to annotate its functions with execution domain specifications.
-Functions that can only be executed on the GPU have to be annotated with `__device__`, functions that can be executed on the host and on the GPU have to be annotated with `__host__ __device__` and host only functions can optionally be annotated with `__host__`.
-The nvcc *CUDA* compiler uses these annotations to decide with which back-ends a function has to be compiled.
-Depending on the compiler in use, *alpaka* defines the macros  `ALPAKA_FN_HOST`, `ALPAKA_FN_ACC` and `ALPAKA_FN_HOST_ACC` with the identical meaning which can be used in the same positions.
-When the *CUDA* compiler is used, they are defined to their *CUDA* equivalents, else they are empty.
-
-### Kernel Function
-
-#### Requirements
-
-- User kernels should be implemented independent of the accelerator.
-- A user kernel has to have access to accelerator methods (synchronization within blocks, index retrieval, ...).
-- For usage with CUDA, the kernel methods have to be attributed with \__device\__ \__host\__.
-- The user kernel has to fulfill std::is_trivially_copyable because only such objects can be copied into CUDA device memory.
-  A trivially copyable class is a class that
-   1. Has no non-trivial copy constructors (this also requires no virtual functions or virtual bases)
-   2. Has no non-trivial move constructors
-   3. Has no non-trivial copy assignment operators
-   4. Has no non-trivial move assignment operators
-   5. Has a trivial destructor
-
-#### Implementation Variants
-
-There are two possible ways to tell the kernel about the accelerator type:
- 1. The kernel is templated on the accelerator type ...
-  * + This allows users to specialize them for different accelerators. (Is this really necessary or desired?)
-  * - The kernel has to be a class template. This does not allow C++ lambdas to be used as kernels because they are not templates themselves (but only their `operator()` can be templated in C++14).
-  * - This prevents the user from instantiating an accelerator independent kernel before executing it.
-  Because the memory layout in inheritance hierarchies is undefined a simple copy of the user kernel or its members to its specialized type is not possible platform independently.
-  This would require a copy from UserKernel<TDummyAcc> to UserKernel<TAcc> to be possible.
-  The only way to allow this would be to require the user to implement a templated copy constructor for every kernel.
-  This is not allowed for kernels that should be copyable to a CUDA device because std::is_trivially_copyable requires the kernel to have no non-trivial copy constructors.
-  * a) ... and inherits from the accelerator. 
-    * - The kernel itself has to inherit at least protected from the accelerator to allow the KernelExecutor to access the Accelerator.
-    * - How do accelerator functions called from the kernel (and not within the kernel class itself) access the accelerator methods?
-    Casting this to the accelerator type and giving it as parameter is too much to require from the user.
-  * b) ... and the `operator()` has a reference to the accelerator as parameter.
-    * + This allows to use the accelerator in functions called from the kernel (and not within the kernel class itself) to access the accelerator methods in the same way the kernel entry point function can.
-    * - This would require an additional object (the accelerator) in device memory taking up valuable CUDA registers (opposed to the inheritance solution). At least on CUDA all the accelerator functions could be inlined nevertheless.
- 2. The `operator()` is templated on the accelerator type and has a reference to the accelerator as parameter.
-  * + The kernel can be an arbitrary function object with ALPAKA_FN_HOST_ACC attributes.
-  * + This would allow to instantiate the accelerator independent kernel and set its members before execution.
-  * +/- C++14 provides polymorphic lambdas. All compilers (even MSVC) support this.
-  * - The `operator()` could be overloaded on the accelerator type but there is no way to specialize the whole kernel class itself, so it always has the same members.
-  * - This would require an additional object (the accelerator) in device memory taking up valuable CUDA registers (opposed to the inheritance solution). At least on CUDA all the accelerator functions could be inlined nevertheless.
-
-Currently we implement version 2.
-
-
-#### Implementation Notes
-
-Unlike *CUDA*, the *alpaka* library does not differentiate between the kernel function that represents the entry point and other functions that can be executed on the accelerator.
-The entry point function that has to be annotated with `__global__` in *CUDA* is internal to the *alpaka* *CUDA* back-end and is not exposed to the user.
-It directly calls into the user supplied kernel function object whose invocation operator is declared with `ALPAKA_FN_ACC`, which equals `__device__` in *CUDA*.
-In this respect there is no difference between the kernel entry point function and any other accelerator function in *alpaka*.
-
-The `operator()` of the kernel function object has to be `const`.
-This is especially important for the *CUDA* back-end, as it could possibly use the constant memory of the GPU to store the function object.
-The constant memory is a fast, cached, read-only memory that is beneficial when all threads uniformly read from the same address at the same time.
-In this case it is as fast as a read from a register.
-
-
-### Access to accelerator dependent functionality
-
-There are two possible ways to implement access to accelerator dependent functionality inside a kernel:
-* Making the functions/templates members of the accelerator (maybe by inheritance) and calling them like `acc.syncThreads()` or `acc.template getIdx<Grid, Thread, Dim1>()`.
-This would require the user to know and understand when to use the template keyword inside dependent type  object function calls.
-* The functions are only light wrappers around traits that can be specialized taking the accelerator as first value (it can not be the last value because of the potential use of variadic arguments). 
-The resulting code would look like `sync(acc)` or `getIdx<Grid, Thread, Dim1>(acc)`.
-Internally these wrappers would call trait templates that are specialized for the specific accelerator e.g. `template<typename TAcc> Sync{...};`
-
-The second version is easier to understand and usually shorter to use in user code.
-
-
-Index and Work Division
------------------------
-
-*CUDA* requires the user to calculate the global index of the current thread within the grid by hand (already shown as `axpyCUDA`).
-On the contrary, *OpenCL* provides the methods `get_global_size`, `get_global_id`, `get_local_size` and `get_local_id`.
-Called with the required dimension, they return the corresponding local or global index or extent (size).
-In *alpaka* this idea is extended to all dimensions.
-To unify the method interface and to avoid confusion between the differing terms and meanings of the functions in *OpenCL* and *CUDA*, in *alpaka* these methods are template functions.
-
-
-Block Shared Memory
--------------------
- 
-### Static Block Shared Memory
-
-The size of block shared memory that is allocated inside the kernel is required to be given as compile time constant.
-This is due to CUDA not allowing to allocate block shared memory inside a kernel at runtime.
- 
-### Dynamic Block Shared Memory
-
-The size of the external block shared memory is obtained from a trait that can be specialized for each kernel.
-The trait is called with the current kernel invocation parameters and the block-element extent prior to each kernel execution.
-Because the block shared memory size is only ever constant or dependent on the block-element extent or the parameters of the invocation this has multiple advantages:
-* It forces the separation of the kernel invocation from the calculation of the required block shared memory size.
-* It lets the user write this calculation once instead of multiple times spread across the code.
diff --git a/thirdParty/alpaka/doc/markdown/user/implementation/library/Structure.md b/thirdParty/alpaka/doc/markdown/user/implementation/library/Structure.md
deleted file mode 100644
index e0a2c88b2d..0000000000
--- a/thirdParty/alpaka/doc/markdown/user/implementation/library/Structure.md
+++ /dev/null
@@ -1,35 +0,0 @@
-[:arrow_up: Up](../Library.md)
-
-Structure
-=========
-
-The *alpaka* library allows offloading of computations from the host execution domain to the accelerator execution domain, whereby they are allowed to be identical.
-
-In the abstraction hierarchy the library code is interleaved with user supplied code as is depicted in the following figure.
-![Execution Domains](execution_domain.png)
-User code invokes library functions, which in turn execute the user provided thread function (kernel) in parallel on the accelerator.
-The kernel in turn calls library functions when accessing accelerator properties and methods.
-Additionally, the user can enhance or optimize the library implementations by extending or replacing specific parts.
-
-The *alpaka* abstraction itself only defines requirements a type has to fulfill to be usable with the template functions the library provides.
-These type constraints are called concepts in C++.
-
-*A concept is a set of requirements consisting of valid expressions, associated types, invariants, and complexity guarantees.
-A type that satisfies the requirements is said to model the concept.
-A concept can extend the requirements of another concept, which is called refinement.* [BoostConcepts](http://www.boost.org/community/generic_programming.html)
-
-Concepts allow to safely define polymorphic algorithms that work with objects of many different types.
-
-The *alpaka* library implements a stack of concepts and their interactions modeling the abstraction defined in the previous chapter.
-Furthermore, default implementations for various devices and accelerators modeling those are included in the library.
-The interaction of the main user facing concepts can be seen in the following figure.
-![user / alpaka code interaction](structure_assoc.png)
-
-For each type of `Device` there is a `Platform` for enumerating the available `Device`s.
-A `Device` is the requirement for creating `Queues` and `Events` as it is for allocating `Buffers` on the respective `Device`. `Buffers` can be copied, their memory be set and they can be pinned or mapped.
-Copying and setting a buffer requires the corresponding `Copy` and `Set` tasks to be enqueued into the `Queue`.
-An `Event` can be enqueued into a `Queue` and its completion state can be queried by the user.
-It is possible to wait for (synchronize with) a single `Event`, a `Queue` or a whole `Device`.
-An `Executor` can be enqueued into a `Queue` and will execute the `Kernel` (after all previous tasks in the queue have been completed).
-The `Kernel` in turn has access to the `Accelerator` it is running on.
-The `Accelerator` provides the `Kernel` with its current index in the block or grid, their extents or other data; it also allows the `Kernel` to allocate shared memory, execute atomic operations and much more.
diff --git a/thirdParty/alpaka/doc/markdown/user/implementation/library/Usage.md b/thirdParty/alpaka/doc/markdown/user/implementation/library/Usage.md
deleted file mode 100644
index 52382c47e5..0000000000
--- a/thirdParty/alpaka/doc/markdown/user/implementation/library/Usage.md
+++ /dev/null
@@ -1,112 +0,0 @@
-[:arrow_up: Up](../Library.md)
-
-Interface Usage
-===============
-
-Accelerator Executable Functions
---------------------------------
-
-Functions that should be executable on an accelerator have to be annotated with the execution domain (one of `ALPAKA_FN_HOST`, `ALPAKA_FN_ACC` and `ALPAKA_FN_HOST_ACC`).
-They most probably also require access to the accelerator data and methods, such as indices and extents as well as functions to allocate shared memory and to synchronize all threads within a block. 
-Therefore the accelerator has to be passed in as a templated constant reference parameter as can be seen in the following code snippet.
-
-```C++
-template<
-    typename TAcc>
-ALPAKA_FN_ACC auto doSomethingOnAccelerator(
-    TAcc const & acc/*,
-    ...*/)                  // Arbitrary number of parameters
--> int                      // Arbitrary return type
-{
-    //...
-}
-```
-
-
-Kernel Definition
------------------
-
-A kernel is a special function object which has to conform to the following requirements:
-* it has to fulfill the `std::is_trivially_copyable` trait (has to be copyable via memcpy)
-* the `operator()` is the kernel entry point
-  * it has to be an accelerator executable function
-  * it has to return `void`.
-  * its first argument has to be the accelerator (templated for arbitrary accelerator backends).
-
-The following code snippet shows a basic example of a kernel function object.
-
-```C++
-struct MyKernel
-{
-    template<
-        typename TAcc>       // Templated on the accelerator type.
-    ALPAKA_FN_ACC            // Macro marking the function to be executable on all accelerators.
-    auto operator()(         // The function / kernel to execute.
-        TAcc const & acc/*,  // The specific accelerator implementation.
-        ...*/) const         // Must be 'const'.
-    -> void
-    {
-        //...
-    }
-                      // Class can have members but has to be std::is_trivially_copyable.
-                      // Classes must not have pointers or references to host memory!
-};
-```
-
-The kernel function object is shared across all threads in all blocks.
-Due to the block execution order being undefined, there is no safe and consistent way of altering state that is stored inside of the function object.
-Therefore, the `operator()` of the kernel function object has to be `const` and is not allowed to modify any of the object members.
-
-
-Index and Work Division
------------------------
-
-The `alpaka::workdiv::getWorkDiv` and the `alpaka::idx::getIdx` functions both return a vector of the dimensionality the accelerator has been defined with.
-They are parametrized by the origin of the calculation as well as the unit in which the values are calculated.
-For example, `alpaka::workdiv::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc)` returns a vector with the extents of the grid in units of threads.
-
-
-Memory Management
------------------
-
-The memory allocation function of the *alpaka* library (`alpaka::mem::buf::alloc<TElem>(device, extents)`) is uniform for all devices, even for the host device.
-It does not return raw pointers but reference counted memory buffer objects that remove the necessity for manual freeing and the possibility of memory leaks.
-Additionally the memory buffer objects know their extents, their pitches as well as the device they reside on.
-This allows buffers that possibly reside on different devices with different pitches to be copied only by providing the buffer objects as well as the extents of the region to copy (`alpaka::mem::view::copy(bufDevA, bufDevB, copyExtents)`).
-
-Kernel Execution
-----------------
-
-The following source code listing shows the execution of a kernel by enqueuing the execution task into a queue.
-
-```C++
-// Define the dimensionality of the task.
-using Dim = alpaka::dim::DimInt<1u>;
-// Define the type of the indexes.
-using Idx = std::size_t;
-// Define the accelerator to use.
-using Acc = alpaka::acc::AccCpuSerial<Dim, Idx>;
-// Select the queue type.
-using Queue = alpaka::queue::QueueCpuNonBlocking;
-
-// Select a device to execute on.
-auto devAcc(alpaka::pltf::getDevByIdx<alpaka::pltf::PltfCpu>(0));
-// Create a queue to enqueue the execution into.
-Queue queue(devAcc);
-
-// Create a 1-dimensional work division with 256 blocks of 16 threads each.
-auto const workDiv(alpaka::workdiv::WorkDivMembers<Dim, Idx>(256u, 16u));
-// Create an instance of the kernel function object.
-MyKernel kernel;
-// Enqueue the execution task into the queue.
-alpaka::kernel::exec<Acc>(queue, workDiv, kernel/*, arguments ...*/);
-```
-
-The dimensionality of the task as well as the type for index and extent have to be defined explicitly.
-Following this, the type of accelerator to execute on, as well as the type of the queue have to be defined.
-For both of these types instances have to be created.
-For the accelerator this has to be done indirectly by enumerating the required device via the device manager, whereas the queue can be created directly.
-
-To execute the kernel, an instance of the kernel function object has to be constructed.
-Following this, an execution task combining the work division (grid and block sizes) with the kernel function object and the bound invocation arguments has to be created.
-After that this task can be enqueued into a queue for immediate or later execution (depending on the queue used).
diff --git a/thirdParty/alpaka/doc/markdown/user/implementation/mapping/Accelerators.md b/thirdParty/alpaka/doc/markdown/user/implementation/mapping/Accelerators.md
deleted file mode 100644
index 452bc6ac15..0000000000
--- a/thirdParty/alpaka/doc/markdown/user/implementation/mapping/Accelerators.md
+++ /dev/null
@@ -1,76 +0,0 @@
-[:arrow_up: Up](../Mapping.md)
-
-Accelerator Implementations
-===========================
-
-|alpaka|Serial|std::thread|Boost.Fiber|OpenMP 2.0|OpenMP 4.0|CUDA 8.0|
-|---|---|---|---|---|---|---|
-|Devices|Host Core|Host Cores|Host Core|Host Cores|Host Cores|NVIDIA GPUs|
-|Lib/API|n/a|std::thread|boost::fibers::fiber|OpenMP 2.0|OpenMP 4.0|CUDA 8.0|
-|Kernel execution|n/a|std::thread(kernel)|boost::fibers::fiber(kernel)|omp_set_dynamic(0), #pragma omp parallel num_threads(iNumKernelsInBlock)|#pragma omp target, #pragma omp teams num_teams(...) thread_limit(...), #pragma omp distribute, #pragma omp parallel num_threads(...)|cudaConfigureCall, cudaSetupArgument, cudaLaunch|
-|Execution strategy grid-blocks|sequential|sequential|sequential|sequential|undefined|undefined|
-|Execution strategy block-kernels|sequential|preemptive multitasking|cooperative multithreading|preemptive multitasking|preemptive multitasking|lock-step within warps|
-|getIdx|n/a|*block-kernel*: mapping of std::this_thread::get_id() *grid-block*: member variable|*block-kernel*: mapping of std::this_fiber::get_id() *grid-block*: member variable|*block-kernel*: omp_get_thread_num() to 3D index mapping *grid-block*: member variable|*block-kernel*: omp_get_thread_num() to 3D index mapping *grid-block*: member variable|threadIdx, blockIdx|
-|getExtent|member variables|member variables|member variables|member variables|member variables|gridDim, blockDim|
-|getBlockSharedExternMem|allocated in memory prior to kernel execution|allocated in memory prior to kernel execution|allocated in memory prior to kernel execution|allocated in memory prior to kernel execution|allocated in memory prior to kernel execution|\__shared__|
-|allocBlockSharedMem|master thread allocates|syncBlockKernels -> master thread allocates -> syncBlockKernels|syncBlockKernels -> master thread allocates -> syncBlockKernels|syncBlockKernels -> master thread allocates -> syncBlockKernels|syncBlockKernels -> master thread allocates -> syncBlockKernels|\__shared__|
-|syncBlockKernels|n/a|barrier|barrier|#pragma omp barrier|#pragma omp barrier|__syncthreads|
-|atomicOp|n/a|std::lock_guard< std::mutex >|n/a|#pragma omp critical|#pragma omp critical|atomicXXX|
-|ALPAKA_FN_HOST_ACC, ALPAKA_FN_ACC, ALPAKA_FN_HOST|inline|inline|inline|inline|inline|\__device__, \__host__, \__forceinline__|
-
-
-### Serial
-
-The serial accelerator only allows blocks with exactly one thread.
-Therefore it does not implement real synchronization or atomic primitives.
-
-### Threads
-
-#### Execution
-
-To prevent recreation of the threads between execution of different blocks in the grid, the threads are stored inside a thread pool.
-This thread pool is local to the invocation because making it local to the KernelExecutor could mean a heavy memory usage and lots of idling kernel-threads when there are multiple KernelExecutors around.
-Because the default policy of the threads in the pool is to yield instead of waiting, this would also slow down the system immensely.
-
-### Fibers
-
-#### Execution
-
-To prevent recreation of the fibers between execution of different blocks in the grid, the fibers are stored inside a fiber pool.
-This fiber pool is local to the invocation because making it local to the KernelExecutor could mean a heavy memory usage when there are multiple KernelExecutors around.
-
-### OpenMP
-
-#### Execution
-
-Parallel execution of the kernels in a block is required because when syncBlockThreads is called all of them have to be done with their work up to this line.
-So we have to spawn one real thread per kernel in a block.
-`omp for` is not useful because it is meant for cases where multiple iterations are executed by one thread but in our case a 1:1 mapping is required.
-Therefore we use `omp parallel` with the specified number of threads in a block.
-Another reason for not using `omp for` like `#pragma omp parallel for collapse(3) num_threads(blockDim.x*blockDim.y*blockDim.z)` is that `#pragma omp barrier` used for intra block synchronization is not allowed inside `omp for` blocks.
-
-Because OpenMP is designed for a 1:1 abstraction of hardware to software threads, the block size is restricted by the number of OpenMP threads allowed by the runtime. 
-This could be as little as 2 or 4 kernels but on a system with 4 cores and hyper-threading OpenMP can also allow 64 threads.
-
-#### Index
-
-OpenMP only provides a linear thread index. This index is converted to a 3 dimensional index at runtime.
-
-#### Atomic
-
-We cannot use `#pragma omp atomic` because braces or calling other functions directly after `#pragma omp atomic` are not allowed.
-Because we are implementing the CUDA atomic operations which return the old value, this requires `#pragma omp critical` to be used.
-`omp_set_lock` is an alternative but is usually slower.
-
-### CUDA
-
-Nearly all CUDA functionality can be directly mapped to alpaka function calls.
-A major difference is that CUDA requires the block and grid sizes to be given in (x, y, z) order.
-Alpaka uses the mathematical C/C++ array indexing scheme [z][y][x].
-Dimension 0 in this case is z, dimension 2 is x.
-
-Furthermore alpaka does not require the indices and extents to be 3-dimensional.
-The accelerators are templatized on and support arbitrary dimensionality.
-NOTE: Currently the CUDA implementation is restricted to a maximum of 3 dimensions!
-
-NOTE: The CUDA-accelerator back-end can change the current CUDA device and will NOT set the device back to the one prior to the invocation of the alpaka function!
diff --git a/thirdParty/alpaka/doc/markdown/user/implementation/mapping/CUDA.md b/thirdParty/alpaka/doc/markdown/user/implementation/mapping/CUDA.md
deleted file mode 100644
index 964bc1d8db..0000000000
--- a/thirdParty/alpaka/doc/markdown/user/implementation/mapping/CUDA.md
+++ /dev/null
@@ -1,243 +0,0 @@
-[:arrow_up: Up](../Mapping.md)
-
-CUDA GPUs
-=========
-
-Mapping the abstraction to GPUs supporting *CUDA* is straightforward because the hierarchy levels are identical up to the element level.
-So blocks of warps of threads will be mapped directly to their *CUDA* equivalent.
-
-The element level is supported through an additional run-time variable containing the extent of elements per thread.
-This variable can be accessed by all threads and should optimally be placed in constant device memory for fast access.
-
-Porting CUDA to *alpaka*
-------------------------
-
-Nearly all CUDA functionality can be directly mapped to alpaka function calls.
-A major difference is that CUDA requires the block and grid sizes to be given in (x, y, z) order. Alpaka uses the mathematical C/C++ array indexing scheme [z][y][x]. In both cases x is the innermost / fast running index.
-
-Furthermore alpaka does not require the indices and extents to be 3-dimensional.
-The accelerators are templatized on and support arbitrary dimensionality.
-NOTE: Currently the CUDA implementation is restricted to a maximum of 3 dimensions!
-
-NOTE: You have to be careful when mixing alpaka and non alpaka CUDA code. The CUDA-accelerator back-end can change the current CUDA device and will NOT set the device back to the one prior to the invocation of the alpaka function.
-
-
-### Programming Interface
-
-*Function Attributes*
-
-|CUDA|alpaka|
-|---|---|
-|\_\_host\_\_|ALPAKA_FN_HOST|
-|\_\_device\_\_|ALPAKA_FN_ACC*|
-|\_\_global\_\_|ALPAKA_FN_ACC*|
-|\_\_host\_\_ \_\_device\_\_|ALPAKA_FN_HOST_ACC|
-
-\* You cannot call CUDA-only methods except when ALPAKA_ACC_GPU_CUDA_ONLY_MODE is enabled.
-
-*Memory*
-
-|CUDA|alpaka|
-|---|---|
-|\_\_shared\_\_|[alpaka::block::shared::st::allocVar<std::uint32_t, \_\_COUNTER\_\_>(acc)](../../../../../test/unit/block/shared/src/BlockSharedMemSt.cpp#L69)|
-|\_\_constant\_\_|[ALPAKA_STATIC_ACC_MEM_CONSTANT](../../../../../test/unit/mem/view/src/ViewStaticAccMem.cpp#L58-L63)|
-|\_\_device\_\_|[ALPAKA_STATIC_ACC_MEM_GLOBAL](../../../../../test/unit/mem/view/src/ViewStaticAccMem.cpp#L164-L169)|
-
-*Index / Work Division*
-
-|CUDA|alpaka|
-|---|---|
-|threadIdx|alpaka::idx::getIdx<alpaka::Block, alpaka::Threads>(acc)|
-|blockIdx|alpaka::idx::getIdx<alpaka::Grid, alpaka::Blocks>(acc)|
-|blockDim|alpaka::workdiv::getWorkDiv<alpaka::Block, alpaka::Threads>(acc)|
-|gridDim|alpaka::workdiv::getWorkDiv<alpaka::Grid, alpaka::Blocks>(acc)|
-
-*Types*
-
-|CUDA|alpaka|
-|---|---|
-|dim3|[alpaka::vec::Vec< TDim, TVal >](../../../../../test/unit/vec/src/VecTest.cpp#L43-L45)|
-
-
-### CUDA Runtime API
-
-The following tables list the functions available in the [CUDA Runtime API](http://docs.nvidia.com/cuda/cuda-runtime-api/modules.html#modules) and their equivalent alpaka functions:
-
-*Device Management*
-
-|CUDA|alpaka|
-|---|---|
-|cudaChooseDevice|-|
-|cudaDeviceGetAttribute|-|
-|cudaDeviceGetByPCIBusId|-|
-|cudaDeviceGetCacheConfig|-|
-|cudaDeviceGetLimit|-|
-|cudaDeviceGetP2PAttribute|-|
-|cudaDeviceGetPCIBusId|-|
-|cudaDeviceGetSharedMemConfig|-|
-|cudaDeviceGetQueuePriorityRange|-|
-|cudaDeviceReset|alpaka::dev::reset(device)|
-|cudaDeviceSetCacheConfig|-|
-|cudaDeviceSetLimit|-|
-|cudaDeviceSetSharedMemConfig|-|
-|cudaDeviceSynchronize|void alpaka::wait::wait(device)|
-|cudaGetDevice|n/a (no current device)|
-|cudaGetDeviceCount|std::size_t alpaka::pltf::getDevCount< TPltf >()|
-|cudaGetDeviceFlags|-|
-|cudaGetDeviceProperties|alpaka::acc::getAccDevProps(dev) *NOTE: Only some properties available*|
-|cudaIpcCloseMemHandle|-|
-|cudaIpcGetEventHandle|-|
-|cudaIpcGetMemHandle|-|
-|cudaIpcOpenEventHandle|-|
-|cudaIpcOpenMemHandle|-|
-|cudaSetDevice|n/a (no current device)|
-|cudaSetDeviceFlags|-|
-|cudaSetValidDevices|-|
-
-*Error Handling*
-
-|CUDA|alpaka|
-|---|---|
-|cudaGetErrorName|n/a (handled internally, available in exception message)|
-|cudaGetErrorString|n/a (handled internally, available in exception message)|
-|cudaGetLastError|n/a (handled internally)|
-|cudaPeekAtLastError|n/a (handled internally)|
-
-*Queue Management*
-
-|CUDA|alpaka|
-|---|---|
-|cudaStreamAddCallback|alpaka::queue::enqueue(queue, \[\](){do_something();})|
-|cudaStreamAttachMemAsync|-|
-|cudaStreamCreate|<ul><li>queue = alpaka::queue::QueueCudaRtNonBlocking(device);</li><li>queue = alpaka::queue::QueueCudaRtBlocking(device);</li></ul>|
-|cudaStreamCreateWithFlags|see cudaStreamCreate (cudaStreamNonBlocking hard coded)|
-|cudaStreamCreateWithPriority|-|
-|cudaStreamDestroy|n/a (Destructor)|
-|cudaStreamGetFlags|-|
-|cudaStreamGetPriority|-|
-|cudaStreamQuery|bool alpaka::queue::empty(queue)|
-|cudaStreamSynchronize|void alpaka::wait::wait(queue)|
-|cudaStreamWaitEvent|void alpaka::wait::wait(queue, event)|
-
-*Event Management*
-
-|CUDA|alpaka|
-|---|---|
-|cudaEventCreate|alpaka::event::Event< TQueue > event(dev);|
-|cudaEventCreateWithFlags|-|
-|cudaEventDestroy|n/a (Destructor)|
-|cudaEventElapsedTime|-|
-|cudaEventQuery|bool alpaka::event::test(event)|
-|cudaEventRecord|void alpaka::queue::enqueue(queue, event)|
-|cudaEventSynchronize|void alpaka::wait::wait(event)|
-
-*Memory Management*
-
-|CUDA|alpaka|
-|---|---|
-|cudaArrayGetInfo|-|
-|cudaFree|n/a (automatic memory management with reference counted memory handles)|
-|cudaFreeArray|-|
-|cudaFreeHost|n/a|
-|cudaFreeMipmappedArray|-|
-|cudaGetMipmappedArrayLevel|-|
-|cudaGetSymbolAddress|-|
-|cudaGetSymbolSize|-|
-|cudaHostAlloc|n/a|
-|cudaHostGetDevicePointer|-|
-|cudaHostGetFlags|-|
-|cudaHostRegister|-|
-|cudaHostUnregister|-|
-|cudaMalloc|alpaka::mem::buf::alloc<TElement>(device, extents1D)|
-|cudaMalloc3D|alpaka::mem::buf::alloc<TElement>(device, extents3D)|
-|cudaMalloc3DArray|-|
-|cudaMallocArray|-|
-|cudaMallocHost|alpaka::mem::buf::alloc<TElement>(device, extents) *1D, 2D, 3D supported!*|
-|cudaMallocManaged|-|
-|cudaMallocMipmappedArray|-|
-|cudaMallocPitch|alpaka::mem::buf::alloc<TElement>(device, extents2D)|
-|cudaMemAdvise|-|
-|cudaMemGetInfo|<ul><li>alpaka::dev::getMemBytes</li><li>alpaka::dev::getFreeMemBytes</li></ul>|
-|cudaMemPrefetchAsync|-|
-|cudaMemRangeGetAttribute|-|
-|cudaMemRangeGetAttributes|-|
-|cudaMemcpy|alpaka::mem::view::copy(memBufDst, memBufSrc, extents1D)|
-|cudaMemcpy2D|alpaka::mem::view::copy(memBufDst, memBufSrc, extents2D)|
-|cudaMemcpy2DArrayToArray|-|
-|cudaMemcpy2DAsync|alpaka::mem::view::copy(memBufDst, memBufSrc, extents2D, queue)|
-|cudaMemcpy2DFromArray|-|
-|cudaMemcpy2DFromArrayAsync|-|
-|cudaMemcpy2DToArray|-|
-|cudaMemcpy2DToArrayAsync|-|
-|cudaMemcpy3D|alpaka::mem::view::copy(memBufDst, memBufSrc, extents3D)|
-|cudaMemcpy3DAsync|alpaka::mem::view::copy(memBufDst, memBufSrc, extents3D, queue)|
-|cudaMemcpy3DPeer|alpaka::mem::view::copy(memBufDst, memBufSrc, extents3D)|
-|cudaMemcpy3DPeerAsync|alpaka::mem::view::copy(memBufDst, memBufSrc, extents3D, queue)|
-|cudaMemcpyArrayToArray|-|
-|cudaMemcpyAsync|alpaka::mem::view::copy(memBufDst, memBufSrc, extents1D, queue)|
-|cudaMemcpyFromArray|-|
-|cudaMemcpyFromArrayAsync|-|
-|cudaMemcpyFromSymbol|-|
-|cudaMemcpyFromSymbolAsync|-|
-|cudaMemcpyPeer|alpaka::mem::view::copy(memBufDst, memBufSrc, extents1D)|
-|cudaMemcpyPeerAsync|alpaka::mem::view::copy(memBufDst, memBufSrc, extents1D, queue)|
-|cudaMemcpyToArray|-|
-|cudaMemcpyToArrayAsync|-|
-|cudaMemcpyToSymbol|-|
-|cudaMemcpyToSymbolAsync|-|
-|cudaMemset|alpaka::mem::view::set(memBufDst, byte, extents1D)|
-|cudaMemset2D|alpaka::mem::view::set(memBufDst, byte, extents2D)|
-|cudaMemset2DAsync|alpaka::mem::view::set(memBufDst, byte, extents2D, queue)|
-|cudaMemset3D|alpaka::mem::view::set(memBufDst, byte, extents3D)|
-|cudaMemset3DAsync|alpaka::mem::view::set(memBufDst, byte, extents3D, queue)|
-|cudaMemsetAsync|alpaka::mem::view::set(memBufDst, byte, extents1D, queue)|
-|make_cudaExtent|-|
-|make_cudaPitchedPtr|-|
-|make_cudaPos|-|
-|cudaMemcpyHostToDevice|n/a (direction of copy is determined automatically)|
-|cudaMemcpyDeviceToHost|n/a (direction of copy is determined automatically)|
-
-*Execution Control*
-
-|CUDA|alpaka|
-|---|---|
-|cudaFuncGetAttributes|-|
-|cudaFuncSetCacheConfig|-|
-|cudaFuncSetSharedMemConfig|-|
-|cudaLaunchKernel|<ul><li>alpaka::kernel::exec< TAcc >(queue, workDiv, kernel, params...)</li><li>alpaka::kernel::BlockSharedExternMemSizeBytes< TKernel< TAcc > >::getBlockSharedExternMemSizeBytes<...>(...)</li></ul>|
-|cudaSetDoubleForDevice|n/a (alpaka assumes double support)|
-|cudaSetDoubleForHost|n/a (alpaka assumes double support)|
-
-*Occupancy*
-
-|CUDA|alpaka|
-|---|---|
-|cudaOccupancyMaxActiveBlocksPerMultiprocessor|-|
-|cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags|-|
-
-
-*Unified Addressing*
-
-|CUDA|alpaka|
-|---|---|
-|cudaPointerGetAttributes|-|
-
-*Peer Device Memory Access*
-
-|CUDA|alpaka|
-|---|---|
-|cudaDeviceCanAccessPeer|-|
-|cudaDeviceDisablePeerAccess|-|
-|cudaDeviceEnablePeerAccess|automatically done when required|
-
-**OpenGL, Direct3D, VDPAU, EGL, Graphics Interoperability**
-
-*not available*
-
-**Texture/Surface Reference/Object Management**
-
-*not available*
-
-**Version Management**
-
-*not available*
diff --git a/thirdParty/alpaka/doc/markdown/user/implementation/mapping/HIP.md b/thirdParty/alpaka/doc/markdown/user/implementation/mapping/HIP.md
deleted file mode 100644
index 34cf4bb275..0000000000
--- a/thirdParty/alpaka/doc/markdown/user/implementation/mapping/HIP.md
+++ /dev/null
@@ -1,97 +0,0 @@
-## Current restrictions on HCC platform
-
-- Workaround for unsupported `syncthreads_{count|and|or}`.
-  - uses temporary shared value and atomics
-- Workaround for buggy `hipStreamQuery`, `hipStreamSynchronize`.
-  - introduces own queue management
-  - `hipStreamQuery` and `hipStreamSynchronize` did not work in multithreaded environment
-- Workaround for missing `cuStreamWaitValue32`.
-  - polls value each 10ms
-- device constant memory not supported yet
-- note, that `printf` in kernels is still not supported in HIP
-- 3D memory is currently disabled
-  - missing `hipMemcpy3DAsync` is replaced with `hipMemcpy3D` though
-  - exclude `hipMalloc3D` and `hipMallocPitch` when size is zero
-    - otherwise they throw an Unknown Error
-  - `TestAccs` excludes 3D specialization of Hip back-end for now
-  - ... because `verifyBytesSet` fails in `memView` for 3D specialization
-- `dim3` structure is not available on device (use `alpaka::vec::Vec` instead)
-- Constructors' attributes unified with destructors'.
-  - host/device signature must match in HIP(HCC)
-- a chain of functions must also provide correct host-device signatures
-  - e.g. a host function cannot be called from a host-device function
-- recompile your target when HCC linker returned the error:
-"File format not recognized
-clang-7: error: linker command failed with exit code 1"
-- if compile-error occurred, the linker still may link, but without the device code
-- AMD device architecture currently hardcoded in `alpakaConfig.cmake`
-
-## Compiling HIP from source
-
-Follow [this](https://github.com/ROCm-Developer-Tools/HIP/blob/master/INSTALL.md "HIP installation") guide for installing HIP.
-HIP requires either `nvcc` or `hcc` to be installed on your system (see guide for further details).
-
-- If you want the hip binaries to be located in a directory that does not require superuser access, be sure to change the install directory of HIP by modifying the `CMAKE_INSTALL_PREFIX` cmake variable.
-- Also, after the installation is complete, add the following line to the `.profile` file in your home directory, in order to add the path to the HIP binaries to PATH:
-`PATH=$PATH:<path_to_binaries>`
-
-```bash
-git clone --recursive https://github.com/ROCm-Developer-Tools/HIP.git
-cd "HIP"
-mkdir -p build
-cd build
-cmake -DCMAKE_BUILD_TYPE="${CMAKE_BUILD_TYPE}" -DCMAKE_INSTALL_PREFIX=${YOUR_HIP_INSTALL_DIR} -DBUILD_TESTING=OFF ..
-make
-make install
-```
-Set the appropriate paths (edit `${YOUR_**}` variables).
-```bash
-# HIP_PATH required by HIP tools
-export HIP_PATH=${YOUR_HIP_INSTALL_DIR}
-# Paths required by HIP tools
-export CUDA_PATH=${YOUR_CUDA_ROOT}
-# - if required, path to HCC compiler. Default /opt/rocm/hcc.
-export HCC_HOME=${YOUR_HCC_ROOT}
-# - if required, path to HSA include, lib. Default /opt/rocm/hsa.
-export HSA_PATH=${YOUR_HSA_PATH}
-# HIP binaries and libraries
-export PATH=${HIP_PATH}/bin:$PATH
-export LD_LIBRARY_PATH=${HIP_PATH}/lib64:${LD_LIBRARY_PATH}
-```
-Test the HIP binaries.
-```bash
-# calls nvcc or hcc
-which hipcc
-hipcc -V
-which hipconfig
-hipconfig -v
-```
-
-
-## Verifying HIP installation
-- If PATH points to the location of the HIP binaries, the following command should list several relevant environment variables, and also the selected compiler on your system: `hipconfig -f`
-- Compile and run the [square sample](https://github.com/ROCm-Developer-Tools/HIP/tree/master/samples/0_Intro/square), as pointed out in the [original](https://github.com/ROCm-Developer-Tools/HIP/blob/master/INSTALL.md#verify-your-installation) HIP install guide.
-
-## Compiling examples with HIP back-end
-As of now, the back-end has only been tested on the NVIDIA platform.
-### NVIDIA Platform
-* One issue in this branch of alpaka is that the host compiler flags don't propagate to the device compiler, as they do in CUDA. This is because a counterpart to the CUDA_PROPAGATE_HOST_FLAGS cmake variable has not been defined in the FindHIP.cmake file.
-Alpaka forwards the host compiler flags in cmake to the `HIP_NVCC_FLAGS` cmake variable, which also takes user-given flags. To add flags to this variable, toggle the advanced mode in `ccmake`.
-
-
-## Random Number Generator Library rocRAND for HIP back-end
-
-rocRAND provides an interface for HIP, where the cuRAND or rocRAND API is called depending on the chosen HIP platform (can be configured with cmake in alpaka).
-
-Clone the rocRAND repository, then build and install it:
-```bash
-git clone https://github.com/ROCmSoftwarePlatform/rocRAND
-cd rocRAND
-mkdir -p build
-cd build
-cmake -DCMAKE_INSTALL_PREFIX=${HIP_PATH} -DBUILD_BENCHMARK=OFF -DBUILD_TEST=OFF -DCMAKE_MODULE_PATH=${HIP_PATH}/cmake ..
-make
-```
-
-The `CMAKE_MODULE_PATH` is a cmake variable for locating module finding scripts like *FindHIP.cmake*.
-The paths to the `rocRAND` library and include directories should be appended to the `CMAKE_PREFIX_PATH` variable.
diff --git a/thirdParty/alpaka/doc/markdown/user/implementation/mapping/x86.md b/thirdParty/alpaka/doc/markdown/user/implementation/mapping/x86.md
deleted file mode 100644
index 0fca42c3f6..0000000000
--- a/thirdParty/alpaka/doc/markdown/user/implementation/mapping/x86.md
+++ /dev/null
@@ -1,97 +0,0 @@
-[:arrow_up: Up](../Mapping.md)
-
-x86 CPUs
-========
-
-There are multiple possible ways to map the *alpaka* abstraction to x86 CPUs.
-The following figure shows the compute and memory hierarchy of a dual-socket (package) node with dual-core CPUs and symmetric multithreading (Hyper-Threading).
-Through symmetric multithreading (Hyper-Threading) each core represents two processing units.
-![x86_cpu](x86/x86_cpu.png)
-
-Thread
-------
-
-Mapping the thread level directly to the processing units is the most trivial part of the assignment of hierarchy levels to hardware units.
-However, the block and warp levels could be mapped to hardware components in different ways with varying advantages and disadvantages.
-
-Warp
-----
-
-Even though a warp seems to be identical to a vector register, because both execute a single uniform instruction on multiple data elements, they are not the same.
-[Warps](../../Abstraction.md) can handle branches with divergent control flows of multiple threads.
-There is no equivalent hardware unit in a CPU supporting this.
-Therefore, the warp level can not be utilized on CPUs leading to a one-to-one mapping of threads to warps which does not violate the rules of the abstraction.
-
-Block
------
-
-### One Block Per Node
-
-By combining all processing units (possibly Hyper-Threads) of all processors on a node into one block, the number of synchronizing and communicating threads can be enlarged.
-This high possible thread count would simplify the implementation of some types of algorithms but introduces performance issues on multi-core nodes.
-The shared memory between all cores on a node is the RAM.
-However, the RAM and the communication between the sockets is far too slow for fine-grained communication in the style of *CUDA* threads.
-
-### One Block Per Socket
-
-If each processor on each socket would concurrently execute one block, the L3 cache would be used as the fast shared memory.
-Although this is much better than using the RAM, there is still a problem.
-Regions of the global memory and especially from the shared memory that are accessed are automatically cached in the L1 and / or L2 caches of each core.
-Not only the elements which are directly accessed will be cached but always the whole cache line they lie in.
-Cache lines typically have a size of 64 Bytes on modern x86 architectures.
-This leads to, for example, eight double precision floating point numbers being cached at once even though only one value really is required.
-As long as these values are only read there is no problem.
-However, if one thread writes to a value that is also cached on other cores, all such cache copies have to be invalidated.
-This results in a lot of cache and bus traffic.
-Due to the hierarchical decomposition of the grid of threads reflecting the data elements, neighboring threads are always combined into a common block.
-By mapping a block to a socket, threads that are executed concurrently always have very close indices into the grid.
-Therefore, the elements that are read and written by the threads are always very close together within the memory and will most probably share a cache line.
-This property is exploited on *CUDA* GPUs, where memory accesses within a warp are combined into one large transaction.
-However, when multiple threads from multiple CPU cores write to different elements within a cache line, this advantage is reversed into its opposite.
-This pattern non-intuitively leads to heavy performance degradation and is called false-sharing.
-
-### One Block Per Core
-
-The best compromise between a high number of threads per block and a fast communication between the threads is to map a block directly to a CPU core.
-Each processing unit (possibly a Hyper-Thread) executes one or more threads of our hierarchical abstraction while executing multiple elements locally either by processing them sequentially or in a vectorized fashion.
-This possible mapping of blocks, threads and elements to the compute and memory hierarchy of a dual-socket node with dual-core CPUs and symmetric multithreading is illustrated in the following figure.
-![x86_cpu](x86/x86_cpu_mapping.png)
-
-### One Block Per Thread
-
-If there is no symmetric multithreading or if it is desired, it is also possible to implement a mapping of one block with exactly one thread for each processing unit.
-This allows to completely remove the synchronization overhead for tasks where this is not required at all.
-
-Threading Mechanisms
---------------------
-
-The mapping of threads to processing units is independent of the threading mechanism that is used.
-As long as the thread affinity to cores can be set correctly, *OpenMP*, *pthread*, *std::thread* or other libraries and APIs can be used interchangeably to implement various *alpaka* back-ends.
-They all have different advantages and disadvantages.
-Real operating system threads like *pthread*, *std::thread* and others have a high cost of thread creation and thread change because their default stack size amounts to multiple megabytes.
-*OpenMP* threads on the other hand are by default much more lightweight.
-However, they are arbitrarily limited by the runtime implementation in the maximum number of concurrent threads a machine supports.
-All of the previous methods have non-deterministic thread changes in common.
-Therefore it is not possible to decide the order in which threads within a block are processed, which could be a good optimization opportunity.
-
-To allow blocks to contain more threads than the number of processing units each core provides, it is possible to simply start more threads than there are processing units available.
-This is called oversubscription.
-Those threads can be bound to the correct cores and by relying on the operating system thread scheduler, they are preemptively multitasked while sharing a single cache and thereby avoiding false-sharing.
-However, this is not always beneficial because the cost of thread changes by the kernel-mode scheduler should not be underestimated.
-
-### Fibers
-
-To remove the overhead of the kernel mode scheduler as well as to enable the usage of deterministic thread context-switches, fibers can be used.
-A fiber is a user-space thread with cooperative context-switches and extends the concept of coroutines.
-A coroutine is basically a function that can be suspended and resumed but does not necessarily have a stack.
-In contrast, functions within most programming languages represent subroutines and not coroutines because they can neither be suspended in the middle of execution nor resumed exactly at the place they were suspended without losing values on the function's local stack.
-
-Multiple fibers can be executed within one operating system thread, which allows to simulate multiple threads per block without kernel-mode multithreading.
-This was not possible without fibers because only coroutines allow the kernel functions to be suspended at synchronization points and resumed when all fibers reached it.
-Each time an operating system thread executing a function would wait for another thread or a resource, an equivalent fiber just switches to the next fiber within the executing host thread.
-Due to the context changes happening at user-level, the cost is much lower.
-Additionally, fiber context changes are deterministic and it is even possible to implement a user-level scheduler.
-An advantage of a user level scheduler over the operating system thread scheduler is the possibility to optimally utilize the caches by taking into account the memory access pattern of the algorithm.
-Furthermore, fibers reduce the number of locks and busy waits within a block because only one fiber is active per operating system thread at a time.
-
-There are multiple C++ Standards Committee Papers (N3858, N3985, N4134) discussing the inclusion of fibers, awaitable functions and similar concepts into C++.
diff --git a/thirdParty/alpaka/example/CMakeLists.txt b/thirdParty/alpaka/example/CMakeLists.txt
deleted file mode 100644
index c9d1e3a68a..0000000000
--- a/thirdParty/alpaka/example/CMakeLists.txt
+++ /dev/null
@@ -1,35 +0,0 @@
-#
-# Copyright 2015-2019 Benjamin Worpitz
-#
-# This file exemplifies usage of Alpaka.
-#
-# Permission to use, copy, modify, and/or distribute this software for any
-# purpose with or without fee is hereby granted, provided that the above
-# copyright notice and this permission notice appear in all copies.
-#
-# THE SOFTWARE IS PROVIDED “AS IS” AND ISC DISCLAIMS ALL WARRANTIES WITH
-# REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
-# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL ISC BE LIABLE FOR ANY
-# SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
-# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
-# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR
-# IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
-#
-
-################################################################################
-# Required CMake version.
-################################################################################
-
-cmake_minimum_required(VERSION 3.11.0)
-
-project("alpakaExamples")
-
-################################################################################
-# Pull in every example; each one lives in its own subdirectory.
-################################################################################
-
-add_subdirectory("bufferCopy/")
-add_subdirectory("helloWorld/")
-add_subdirectory("helloWorldLambda/")
-add_subdirectory("reduce/")
-add_subdirectory("vectorAdd/")
diff --git a/thirdParty/alpaka/example/bufferCopy/CMakeLists.txt b/thirdParty/alpaka/example/bufferCopy/CMakeLists.txt
deleted file mode 100644
index cbeebcbde5..0000000000
--- a/thirdParty/alpaka/example/bufferCopy/CMakeLists.txt
+++ /dev/null
@@ -1,62 +0,0 @@
-#
-# Copyright 2014-2019 Erik Zenker, Benjamin Worpitz
-#
-# This file exemplifies usage of Alpaka.
-#
-# Permission to use, copy, modify, and/or distribute this software for any
-# purpose with or without fee is hereby granted, provided that the above
-# copyright notice and this permission notice appear in all copies.
-#
-# THE SOFTWARE IS PROVIDED “AS IS” AND ISC DISCLAIMS ALL WARRANTIES WITH
-# REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
-# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL ISC BE LIABLE FOR ANY
-# SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
-# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
-# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR
-# IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
-#
-
-################################################################################
-# Required CMake version.
-
-cmake_minimum_required(VERSION 3.11.4)
-
-set_property(GLOBAL PROPERTY USE_FOLDERS ON)
-
-################################################################################
-# Project: the target name doubles as the project name.
-
-set(_TARGET_NAME bufferCopy)
-
-project(${_TARGET_NAME})
-
-################################################################################
-# CMake policies
-#
-# Opt in to <PackageName>_ROOT lookup where the policy is known:
-#   https://cmake.org/cmake/help/v3.12/policy/CMP0074.html
-
-if(POLICY CMP0074)
-    cmake_policy(SET CMP0074 NEW)
-endif()
-
-#-------------------------------------------------------------------------------
-# Locate alpaka; ALPAKA_ROOT is a cache entry so it can be overridden.
-
-set(ALPAKA_ROOT "${CMAKE_CURRENT_LIST_DIR}/../../" CACHE STRING "The location of the alpaka library")
-list(APPEND CMAKE_MODULE_PATH "${ALPAKA_ROOT}")
-find_package(alpaka REQUIRED)
-
-#-------------------------------------------------------------------------------
-# Build the example executable, link it against alpaka and register it as a test.
-
-alpaka_add_executable(
-    ${_TARGET_NAME}
-    src/bufferCopy.cpp)
-target_link_libraries(
-    ${_TARGET_NAME}
-    PUBLIC alpaka)
-
-set_target_properties(${_TARGET_NAME} PROPERTIES FOLDER example)
-
-add_test(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME})
diff --git a/thirdParty/alpaka/example/bufferCopy/src/bufferCopy.cpp b/thirdParty/alpaka/example/bufferCopy/src/bufferCopy.cpp
deleted file mode 100644
index 585b46ae28..0000000000
--- a/thirdParty/alpaka/example/bufferCopy/src/bufferCopy.cpp
+++ /dev/null
@@ -1,369 +0,0 @@
-/* Copyright 2019 Alexander Matthes, Benjamin Worpitz, Erik Zenker, Matthias Werner
- *
- * This file exemplifies usage of Alpaka.
- *
- * Permission to use, copy, modify, and/or distribute this software for any
- * purpose with or without fee is hereby granted, provided that the above
- * copyright notice and this permission notice appear in all copies.
- *
- * THE SOFTWARE IS PROVIDED “AS IS” AND ISC DISCLAIMS ALL WARRANTIES WITH
- * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL ISC BE LIABLE FOR ANY
- * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
- * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR
- * IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
- */
-
-#include <alpaka/alpaka.hpp>
-
-#include <iostream>
-#include <cstdint>
-
-//-----------------------------------------------------------------------------
-template <size_t width>
-ALPAKA_FN_ACC size_t linIdxToPitchedIdx(size_t const globalIdx, size_t const pitch)
-{
-    const size_t idx_x = globalIdx % width;
-    const size_t idx_y = globalIdx / width;
-    return idx_x + idx_y * pitch;
-}
-
-//#############################################################################
-//! Prints all elements of the buffer.
-struct PrintBufferKernel
-{
-    //-----------------------------------------------------------------------------
-    template<
-        typename TAcc,
-        typename TData,
-        typename TExtent>
-    ALPAKA_FN_ACC auto operator()(
-        TAcc const & acc,
-        TData const * const buffer,
-        TExtent const & extents,
-        size_t const pitch) const
-    -> void
-    {
-        auto const globalThreadIdx = alpaka::idx::getIdx<alpaka::Grid, alpaka::Threads>(acc);
-        auto const globalThreadExtent = alpaka::workdiv::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc);
-
-        auto const linearizedGlobalThreadIdx = alpaka::idx::mapIdx<1u>(
-            globalThreadIdx,
-            globalThreadExtent);
-
-        for(size_t i(linearizedGlobalThreadIdx[0]); i < extents.prod(); i += globalThreadExtent.prod())
-        {
-            // NOTE: hard-coded for unsigned int
-            printf("%u:%u ", static_cast<uint32_t>(i), static_cast<uint32_t>(buffer[linIdxToPitchedIdx<2>(i,pitch)]));
-        }
-    }
-};
-
-
-//#############################################################################
-//! Tests if the value of the buffer on index i is equal to i.
-struct TestBufferKernel
-{
-    //-----------------------------------------------------------------------------
-    template<
-        typename TAcc,
-        typename TData,
-        typename TExtent>
-    ALPAKA_FN_ACC auto operator()(
-        TAcc const & acc,
-        TData const * const
-#ifndef NDEBUG
-        data
-#endif
-        ,
-        TExtent const & extents,
-        size_t const
-#ifndef NDEBUG
-        pitch
-#endif
-        ) const
-    -> void
-    {
-        auto const globalThreadIdx = alpaka::idx::getIdx<alpaka::Grid, alpaka::Threads>(acc);
-        auto const globalThreadExtent = alpaka::workdiv::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc);
-
-        auto const linearizedGlobalThreadIdx = alpaka::idx::mapIdx<1u>(
-            globalThreadIdx,
-            globalThreadExtent);
-
-        for(size_t i(linearizedGlobalThreadIdx[0]); i < extents.prod(); i += globalThreadExtent.prod())
-        {
-            ALPAKA_ASSERT(data[linIdxToPitchedIdx<2>(i,pitch)] == i);
-        }
-    }
-};
-
-//#############################################################################
-//! Fills values of buffer with increasing elements starting from 0
-struct FillBufferKernel
-{
-    template<
-        typename TAcc,
-        typename TData,
-        typename TExtent>
-    ALPAKA_FN_ACC auto operator()(
-        TAcc const & acc,
-        TData * const data,
-        TExtent const & extents) const
-    -> void
-    {
-        auto const globalThreadIdx = alpaka::idx::getIdx<alpaka::Grid, alpaka::Threads>(acc);
-        auto const globalThreadExtent = alpaka::workdiv::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc);
-
-        auto const linearizedGlobalThreadIdx = alpaka::idx::mapIdx<1u>(
-            globalThreadIdx,
-            globalThreadExtent);
-
-        for(size_t i(linearizedGlobalThreadIdx[0]); i < extents.prod(); i += globalThreadExtent.prod())
-        {
-            data[i] = static_cast<TData>(i);
-        }
-    }
-};
-
-auto main()
--> int
-{
-// Fallback for the CI with disabled sequential backend
-#if defined(ALPAKA_CI) && !defined(ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED)
-    return EXIT_SUCCESS;
-#else
-    // Define the index domain
-    using Dim = alpaka::dim::DimInt<3u>;
-    using Idx = std::size_t;
-
-    // Define the device accelerator
-    //
-    // It is possible to choose from a set of accelerators
-    // that are defined in the alpaka::acc namespace e.g.:
-    // - AccGpuCudaRt
-    // - AccCpuThreads
-    // - AccCpuFibers
-    // - AccCpuOmp2Threads
-    // - AccCpuOmp2Blocks
-    // - AccCpuOmp4
-    // - AccCpuSerial
-    using Acc = alpaka::acc::AccCpuSerial<Dim, Idx>;
-    // Defines the synchronization behavior of a queue
-    //
-    // choose between Blocking and NonBlocking
-    using AccQueueProperty = alpaka::queue::Blocking;
-    using DevQueue = alpaka::queue::Queue<Acc, AccQueueProperty>;
-    using DevAcc = alpaka::dev::Dev<Acc>;
-    using PltfAcc = alpaka::pltf::Pltf<DevAcc>;
-
-    // Define the device accelerator
-    //
-    // It is possible to choose from a set of accelerators
-    // that are defined in the alpaka::acc namespace e.g.:
-    // - AccCpuThreads
-    // - AccCpuFibers
-    // - AccCpuOmp2Threads
-    // - AccCpuOmp2Blocks
-    // - AccCpuOmp4
-    // - AccCpuSerial
-    using Host = alpaka::acc::AccCpuSerial<Dim, Idx>;
-    // Defines the synchronization behavior of a queue
-    //
-    // choose between Blocking and NonBlocking
-    using HostQueueProperty = alpaka::queue::Blocking;
-    using HostQueue = alpaka::queue::Queue<Host, HostQueueProperty>;
-    using DevHost = alpaka::dev::Dev<Host>;
-    using PltfHost = alpaka::pltf::Pltf<DevHost>;
-
-    // Select devices
-    DevAcc const devAcc(alpaka::pltf::getDevByIdx<PltfAcc>(0u));
-    DevHost const devHost(alpaka::pltf::getDevByIdx<PltfHost>(0u));
-
-    // Create queues
-    DevQueue devQueue(devAcc);
-    HostQueue hostQueue(devHost);
-
-    // Define the work division
-    using Vec = alpaka::vec::Vec<Dim, Idx>;
-    Vec const elementsPerThread(Vec::all(static_cast<Idx>(1)));
-    Vec const threadsPerBlock(Vec::all(static_cast<Idx>(1)));
-
-    Vec const blocksPerGrid(
-        static_cast<Idx>(4),
-        static_cast<Idx>(8),
-        static_cast<Idx>(16));
-
-    using WorkDiv = alpaka::workdiv::WorkDivMembers<Dim, Idx>;
-    WorkDiv const workdiv(
-        blocksPerGrid,
-        threadsPerBlock,
-        elementsPerThread);
-
-
-    // Create host and device buffers
-    //
-    // A buffer is an n-dimensional structure with a
-    // particular data type and size which corresponds
-    // to memory on the desired device. Buffers can be
-    // allocated on the device or can be obtained from
-    // already existing allocations e.g. std::array,
-    // std::vector or a simple call to new.
-    using Data = std::uint32_t;
-    constexpr Idx nElementsPerDim = 2;
-
-    const Vec extents(Vec::all(static_cast<Idx>(nElementsPerDim)));
-
-    // Allocate host memory buffers
-    //
-    // The `alloc` method returns a reference counted buffer handle.
-    // When the last such handle is destroyed, the memory is freed automatically.
-    using BufHost = alpaka::mem::buf::Buf<DevHost, Data, Dim, Idx>;
-    BufHost hostBuffer(alpaka::mem::buf::alloc<Data, Idx>(devHost, extents));
-    // You can also use already allocated memory and wrap it within a view (irrespective of the device type).
-    // The view does not own the underlying memory. So you have to make sure that
-    // the view does not outlive its underlying memory.
-    std::array<Data, nElementsPerDim * nElementsPerDim * nElementsPerDim> plainBuffer;
-    using ViewHost = alpaka::mem::view::ViewPlainPtr<DevHost, Data, Dim, Idx>;
-    ViewHost hostViewPlainPtr(plainBuffer.data(), devHost, extents);
-
-    // Allocate accelerator memory buffers
-    //
-    // The interface to allocate a buffer is the same on the host and on the device.
-    using BufAcc = alpaka::mem::buf::Buf<DevAcc, Data, Dim, Idx>;
-    BufAcc deviceBuffer1(alpaka::mem::buf::alloc<Data, Idx>(devAcc, extents));
-    BufAcc deviceBuffer2(alpaka::mem::buf::alloc<Data, Idx>(devAcc, extents));
-
-
-    // Init host buffer
-    //
-    // You can not access the inner
-    // elements of a buffer directly, but
-    // you can get the pointer to the memory
-    // (getPtrNative).
-    Data * const pHostBuffer = alpaka::mem::view::getPtrNative(hostBuffer);
-
-    // This pointer can be used to directly write
-    // some values into the buffer memory.
-    // Mind, that only a host can write on host memory.
-    // The same holds true for device memory.
-    for(Idx i(0); i < extents.prod(); ++i)
-    {
-        pHostBuffer[i] = static_cast<Data>(i);
-    }
-
-    // Memory views and buffers can also be initialized by executing a kernel.
-    // To pass a buffer into a kernel, you can pass the
-    // native pointer into the kernel invocation.
-    Data * const pHostViewPlainPtr = alpaka::mem::view::getPtrNative(hostViewPlainPtr);
-
-    FillBufferKernel fillBufferKernel;
-
-    alpaka::kernel::exec<Host>(
-        hostQueue,
-        workdiv,
-        fillBufferKernel,
-        pHostViewPlainPtr, // 1st kernel argument
-        extents);          // 2nd kernel argument
-
-
-    // Copy host to device Buffer
-    //
-    // A copy operation of one buffer into
-    // another buffer is enqueued into a queue
-    // like it is done for kernel execution.
-    // As always within alpaka, you will get a compile
-    // time error if the desired copy coperation
-    // (e.g. between various accelerator devices) is
-    // not currently supported.
-    // In this example both host buffers are copied
-    // into device buffers.
-    alpaka::mem::view::copy(devQueue, deviceBuffer1, hostViewPlainPtr, extents);
-    alpaka::mem::view::copy(devQueue, deviceBuffer2, hostBuffer, extents);
-
-    Idx const deviceBuffer1Pitch(alpaka::mem::view::getPitchBytes<2u>(deviceBuffer1) / sizeof(Data));
-    Idx const deviceBuffer2Pitch(alpaka::mem::view::getPitchBytes<2u>(deviceBuffer2) / sizeof(Data));
-    Idx const hostBuffer1Pitch(alpaka::mem::view::getPitchBytes<2u>(hostBuffer) / sizeof(Data));
-    Idx const hostViewPlainPtrPitch(alpaka::mem::view::getPitchBytes<2u>(hostViewPlainPtr) / sizeof(Data));
-
-    // Test device Buffer
-    //
-    // This kernel tests if the copy operations
-    // were successful. In the case something
-    // went wrong an assert will fail.
-    Data const * const pDeviceBuffer1 = alpaka::mem::view::getPtrNative(deviceBuffer1);
-    Data const * const pDeviceBuffer2 = alpaka::mem::view::getPtrNative(deviceBuffer2);
-
-    TestBufferKernel testBufferKernel;
-    alpaka::kernel::exec<Acc>(
-        devQueue,
-        workdiv,
-        testBufferKernel,
-        pDeviceBuffer1,                                 // 1st kernel argument
-        extents,                                        // 2nd kernel argument
-        deviceBuffer1Pitch);                            // 3rd kernel argument
-
-    alpaka::kernel::exec<Acc>(
-        devQueue,
-        workdiv,
-        testBufferKernel,
-        pDeviceBuffer2,                                 // 1st kernel argument
-        extents,                                        // 2nd kernel argument
-        deviceBuffer2Pitch);                            // 3rd kernel argument
-
-
-    // Print device Buffer
-    //
-    // Because we really like to flood our
-    // terminal with numbers, the following
-    // kernel prints all numbers of the
-    // device buffer to stdout on the terminal.
-    // Since this possibly is a parallel operation,
-    // the output can appear in any order or even
-    // completely distorted.
-
-    PrintBufferKernel printBufferKernel;
-    alpaka::kernel::exec<Acc>(
-        devQueue,
-        workdiv,
-        printBufferKernel,
-        pDeviceBuffer1,                                 // 1st kernel argument
-        extents,                                        // 2nd kernel argument
-        deviceBuffer1Pitch);                            // 3rd kernel argument
-    alpaka::wait::wait(devQueue);
-    std::cout << std::endl;
-
-    alpaka::kernel::exec<Acc>(
-        devQueue,
-        workdiv,
-        printBufferKernel,
-        pDeviceBuffer2,                                 // 1st kernel argument
-        extents,                                        // 2nd kernel argument
-        deviceBuffer2Pitch);                            // 3rd kernel argument
-    alpaka::wait::wait(devQueue);
-    std::cout << std::endl;
-
-    alpaka::kernel::exec<Host>(
-        hostQueue,
-        workdiv,
-        printBufferKernel,
-        pHostBuffer,                                    // 1st kernel argument
-        extents,                                        // 2nd kernel argument
-        hostBuffer1Pitch);                              // 3rd kernel argument
-    alpaka::wait::wait(hostQueue);
-    std::cout << std::endl;
-
-    alpaka::kernel::exec<Host>(
-        hostQueue,
-        workdiv,
-        printBufferKernel,
-        pHostViewPlainPtr,                              // 1st kernel argument
-        extents,                                        // 2nd kernel argument
-        hostViewPlainPtrPitch);                         // 3rd kernel argument
-    alpaka::wait::wait(hostQueue);
-    std::cout << std::endl;
-
-    return EXIT_SUCCESS;
-#endif
-}
diff --git a/thirdParty/alpaka/example/helloWorld/CMakeLists.txt b/thirdParty/alpaka/example/helloWorld/CMakeLists.txt
deleted file mode 100644
index b18da1282f..0000000000
--- a/thirdParty/alpaka/example/helloWorld/CMakeLists.txt
+++ /dev/null
@@ -1,62 +0,0 @@
-#
-# Copyright 2014-2019 Erik Zenker, Benjamin Worpitz
-#
-# This file exemplifies usage of Alpaka.
-#
-# Permission to use, copy, modify, and/or distribute this software for any
-# purpose with or without fee is hereby granted, provided that the above
-# copyright notice and this permission notice appear in all copies.
-#
-# THE SOFTWARE IS PROVIDED “AS IS” AND ISC DISCLAIMS ALL WARRANTIES WITH
-# REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
-# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL ISC BE LIABLE FOR ANY
-# SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
-# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
-# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR
-# IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
-#
-
-################################################################################
-# Required CMake version.
-
-CMAKE_MINIMUM_REQUIRED(VERSION 3.11.4)
-
-SET_PROPERTY(GLOBAL PROPERTY USE_FOLDERS ON)
-
-################################################################################
-# Project.
-
-SET(_TARGET_NAME helloWorld)
-
-PROJECT(${_TARGET_NAME})
-
-################################################################################
-# CMake policies
-#
-# Search in <PackageName>_ROOT:
-#   https://cmake.org/cmake/help/v3.12/policy/CMP0074.html
-
-if(POLICY CMP0074)
-    cmake_policy(SET CMP0074 NEW)
-endif()
-
-#-------------------------------------------------------------------------------
-# Find alpaka.
-
-SET(ALPAKA_ROOT "${CMAKE_CURRENT_LIST_DIR}/../../" CACHE STRING "The location of the alpaka library")
-LIST(APPEND CMAKE_MODULE_PATH "${ALPAKA_ROOT}")
-FIND_PACKAGE(alpaka REQUIRED)
-
-#-------------------------------------------------------------------------------
-# Add executable.
-
-ALPAKA_ADD_EXECUTABLE(
-    ${_TARGET_NAME}
-    src/helloWorld.cpp)
-TARGET_LINK_LIBRARIES(
-    ${_TARGET_NAME}
-    PUBLIC alpaka)
-
-SET_TARGET_PROPERTIES(${_TARGET_NAME} PROPERTIES FOLDER example)
-
-ADD_TEST(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME})
diff --git a/thirdParty/alpaka/example/helloWorld/src/helloWorld.cpp b/thirdParty/alpaka/example/helloWorld/src/helloWorld.cpp
deleted file mode 100644
index 5d92a8f890..0000000000
--- a/thirdParty/alpaka/example/helloWorld/src/helloWorld.cpp
+++ /dev/null
@@ -1,205 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz, Erik Zenker
- *
- * This file exemplifies usage of Alpaka.
- *
- * Permission to use, copy, modify, and/or distribute this software for any
- * purpose with or without fee is hereby granted, provided that the above
- * copyright notice and this permission notice appear in all copies.
- *
- * THE SOFTWARE IS PROVIDED “AS IS” AND ISC DISCLAIMS ALL WARRANTIES WITH
- * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL ISC BE LIABLE FOR ANY
- * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
- * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR
- * IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
- */
-
-#include <alpaka/alpaka.hpp>
-
-#include <iostream>
-
-//#############################################################################
-//! Hello World Kernel
-//!
-//! Prints "[x, y, z][gtid] Hello World" where tid is the global thread number.
-struct HelloWorldKernel
-{
-    //-----------------------------------------------------------------------------
-    template<
-        typename TAcc>
-    ALPAKA_FN_ACC auto operator()(
-        TAcc const & acc) const
-    -> void
-    {
-        using Dim = alpaka::dim::Dim<TAcc>;
-        using Idx = alpaka::idx::Idx<TAcc>;
-        using Vec = alpaka::vec::Vec<Dim, Idx>;
-        using Vec1 = alpaka::vec::Vec<alpaka::dim::DimInt<1u>, Idx>;
-
-        // In the most cases the parallel work distibution depends
-        // on the current index of a thread and how many threads
-        // exist overall. These information can be obtained by
-        // getIdx() and getWorkDiv(). In this example these
-        // values are obtained for a global scope.
-        Vec const globalThreadIdx = alpaka::idx::getIdx<alpaka::Grid, alpaka::Threads>(acc);
-        Vec const globalThreadExtent = alpaka::workdiv::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc);
-
-        // Map the three dimensional thread index into a
-        // one dimensional thread index space. We call it
-        // linearize the thread index.
-        Vec1 const linearizedGlobalThreadIdx = alpaka::idx::mapIdx<1u>(
-            globalThreadIdx,
-            globalThreadExtent);
-
-        // Each thread prints a hello world to the terminal
-        // together with the global index of the thread in
-        // each dimension and the linearized global index.
-        // Mind, that alpaka uses the mathematical index
-        // order [z][y][x] where the last index is the fast one.
-        printf(
-            "[z:%u, y:%u, x:%u][linear:%u] Hello World\n",
-            static_cast<unsigned>(globalThreadIdx[0u]),
-            static_cast<unsigned>(globalThreadIdx[1u]),
-            static_cast<unsigned>(globalThreadIdx[2u]),
-            static_cast<unsigned>(linearizedGlobalThreadIdx[0u]));
-    }
-};
-
-auto main()
--> int
-{
-// Fallback for the CI with disabled sequential backend
-#if defined(ALPAKA_CI) && !defined(ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED)
-    return EXIT_SUCCESS;
-#else
-    // Define the index domain
-    //
-    // Depending on your type of problem, you have to define
-    // the dimensionality as well as the type used for indices.
-    // For small index domains 16 or 32 bit indices may be enough
-    // and may be faster to calculate depending on the accelerator.
-    using Dim = alpaka::dim::DimInt<3>;
-    using Idx = std::size_t;
-
-    // Define the accelerator
-    //
-    // It is possible to choose from a set of accelerators
-    // that are defined in the alpaka::acc namespace e.g.:
-    // - AccGpuCudaRt
-    // - AccCpuThreads
-    // - AccCpuFibers
-    // - AccCpuOmp2Threads
-    // - AccCpuOmp2Blocks
-    // - AccCpuOmp4
-    // - AccCpuSerial
-    //
-    // Each accelerator has strengths and weaknesses. Therefore,
-    // they need to be choosen carefully depending on the actual
-    // use case. Furthermore, some accelerators only support a
-    // particular workdiv, but workdiv can also be generated
-    // automatically.
-
-    // By exchanging the Acc and Queue types you can select where to execute the kernel.
-    using Acc = alpaka::acc::AccCpuSerial<Dim, Idx>;
-
-    // Defines the synchronization behavior of a queue
-    //
-    // choose between Blocking and NonBlocking
-    using QueueProperty = alpaka::queue::Blocking;
-    using Queue = alpaka::queue::Queue<Acc, QueueProperty>;
-    using Dev = alpaka::dev::Dev<Acc>;
-    using Pltf = alpaka::pltf::Pltf<Dev>;
-
-
-    // Select a device
-    //
-    // The accelerator only defines how something should be
-    // parallized, but a device is the real entity which will
-    // run the parallel programm. The device can be choosen
-    // by id (0 to the number of devices minus 1) or you
-    // can also retrieve all devices in a vector (getDevs()).
-    // In this example the first devices is choosen.
-    Dev const devAcc(alpaka::pltf::getDevByIdx<Pltf>(0u));
-
-    // Create a queue on the device
-    //
-    // A queue can be interpreted as the work queue
-    // of a particular device. Queues are filled with
-    // tasks and alpaka takes care that these
-    // tasks will be executed. Queues are provided in
-    // non-blocking and blocking variants.
-    // The example queue is a blocking queue to a cpu device,
-    // but it also exists an non-blocking queue for this
-    // device (QueueCpuNonBlocking).
-    Queue queue(devAcc);
-
-    // Define the work division
-    //
-    // A kernel is executed for each element of a
-    // n-dimensional grid distinguished by the element indices.
-    // The work division defines the number of kernel instantiations as
-    // well as the type of parallelism used by the kernel execution task.
-    // Different accelerators have different requirements on the work
-    // division. For example, the sequential accelerator can not
-    // provide any thread level parallelism (synchronizable as well as non synchronizable),
-    // whereas the CUDA accelerator can spawn hundreds of synchronizing
-    // and non synchronizing threads at the same time.
-    //
-    // The workdiv is divided in three levels of parallelization:
-    // - grid-blocks:      The number of blocks in the grid (parallel, not synchronizable)
-    // - block-threads:    The number of threads per block (parallel, synchronizable).
-    //                     Each thread executes one kernel invocation.
-    // - thread-elements:  The number of elements per thread (sequential, not synchronizable).
-    //                     Each kernel has to execute its elements sequentially.
-    //
-    // - Grid     : consists of blocks
-    // - Block    : consists of threads
-    // - Elements : consists of elements
-    //
-    // Threads in the same grid can access the same global memory,
-    // while threads in the same block can access the same shared
-    // memory. Elements are supposed to be used for vectorization.
-    // Thus, a thread can process data element size wise with its
-    // vector processing unit.
-    using Vec = alpaka::vec::Vec<Dim, Idx>;
-    Vec const elementsPerThread(Vec::all(static_cast<Idx>(1)));
-    Vec const threadsPerBlock(Vec::all(static_cast<Idx>(1)));
-    Vec const blocksPerGrid(
-        static_cast<Idx>(4),
-        static_cast<Idx>(8),
-        static_cast<Idx>(16));
-
-    using WorkDiv = alpaka::workdiv::WorkDivMembers<Dim, Idx>;
-    WorkDiv const workDiv(
-        blocksPerGrid,
-        threadsPerBlock,
-        elementsPerThread);
-
-
-    // Instantiate the kernel function object
-    //
-    // Kernels can be everything that has a callable operator()
-    // and which takes the accelerator as first argument.
-    // So a kernel can be a class or struct, a lambda, a std::function, etc.
-    HelloWorldKernel helloWorldKernel;
-
-    // Run the kernel
-    //
-    // To execute the kernel, you have to provide the
-    // work division as well as the additional kernel function
-    // parameters.
-    // The kernel execution task is enqueued into an accelerator queue.
-    // The queue can be blocking or non-blocking
-    // depending on the choosen queue type (see type definitions above).
-    // Here it is synchronous which means that the kernel is directly executed.
-    alpaka::kernel::exec<Acc>(
-        queue,
-        workDiv,
-        helloWorldKernel
-        /* put kernel arguments here */);
-    alpaka::wait::wait(queue);
-
-    return EXIT_SUCCESS;
-#endif
-}
diff --git a/thirdParty/alpaka/example/helloWorldLambda/CMakeLists.txt b/thirdParty/alpaka/example/helloWorldLambda/CMakeLists.txt
deleted file mode 100644
index 9c5727f17f..0000000000
--- a/thirdParty/alpaka/example/helloWorldLambda/CMakeLists.txt
+++ /dev/null
@@ -1,62 +0,0 @@
-#
-# Copyright 2014-2019 Erik Zenker, Benjamin Worpitz
-#
-# This file exemplifies usage of Alpaka.
-#
-# Permission to use, copy, modify, and/or distribute this software for any
-# purpose with or without fee is hereby granted, provided that the above
-# copyright notice and this permission notice appear in all copies.
-#
-# THE SOFTWARE IS PROVIDED “AS IS” AND ISC DISCLAIMS ALL WARRANTIES WITH
-# REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
-# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL ISC BE LIABLE FOR ANY
-# SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
-# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
-# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR
-# IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
-#
-
-################################################################################
-# Required CMake version.
-
-CMAKE_MINIMUM_REQUIRED(VERSION 3.11.4)
-
-SET_PROPERTY(GLOBAL PROPERTY USE_FOLDERS ON)
-
-################################################################################
-# Project.
-
-SET(_TARGET_NAME helloWorldLambda)
-
-PROJECT(${_TARGET_NAME})
-
-################################################################################
-# CMake policies
-#
-# Search in <PackageName>_ROOT:
-#   https://cmake.org/cmake/help/v3.12/policy/CMP0074.html
-
-if(POLICY CMP0074)
-    cmake_policy(SET CMP0074 NEW)
-endif()
-
-#-------------------------------------------------------------------------------
-# Find alpaka.
-
-SET(ALPAKA_ROOT "${CMAKE_CURRENT_LIST_DIR}/../../" CACHE STRING "The location of the alpaka library")
-LIST(APPEND CMAKE_MODULE_PATH "${ALPAKA_ROOT}")
-FIND_PACKAGE(alpaka REQUIRED)
-
-#-------------------------------------------------------------------------------
-# Add executable.
-
-ALPAKA_ADD_EXECUTABLE(
-    ${_TARGET_NAME}
-    src/helloWorldLambda.cpp)
-TARGET_LINK_LIBRARIES(
-    ${_TARGET_NAME}
-    PUBLIC alpaka)
-
-SET_TARGET_PROPERTIES(${_TARGET_NAME} PROPERTIES FOLDER example)
-
-ADD_TEST(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME})
diff --git a/thirdParty/alpaka/example/helloWorldLambda/src/helloWorldLambda.cpp b/thirdParty/alpaka/example/helloWorldLambda/src/helloWorldLambda.cpp
deleted file mode 100644
index 69a4d980fc..0000000000
--- a/thirdParty/alpaka/example/helloWorldLambda/src/helloWorldLambda.cpp
+++ /dev/null
@@ -1,157 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz, Erik Zenker
- *
- * This file exemplifies usage of Alpaka.
- *
- * Permission to use, copy, modify, and/or distribute this software for any
- * purpose with or without fee is hereby granted, provided that the above
- * copyright notice and this permission notice appear in all copies.
- *
- * THE SOFTWARE IS PROVIDED “AS IS” AND ISC DISCLAIMS ALL WARRANTIES WITH
- * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL ISC BE LIABLE FOR ANY
- * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
- * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR
- * IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
- */
-
-#include <alpaka/alpaka.hpp>
-
-#include <functional>
-
-//-----------------------------------------------------------------------------
-//! This functions says hi to the world and
-//! can be encapsulated into a std::function
-//! and used as a kernel function. It is
-//! just another way to define alpaka kernels
-//! and might be useful when it is necessary
-//! to lift an existing function into a kernel
-//! function.
-template<
-    typename TAcc>
-void ALPAKA_FN_ACC hiWorldFunction(
-    TAcc const & acc,
-    size_t const nExclamationMarks)
-{
-    using Dim = alpaka::dim::Dim<TAcc>;
-    using Idx = alpaka::idx::Idx<TAcc>;
-    using Vec = alpaka::vec::Vec<Dim, Idx>;
-    using Vec1 = alpaka::vec::Vec<alpaka::dim::DimInt<1u>, Idx>;
-
-    Vec const globalThreadIdx    = alpaka::idx::getIdx<alpaka::Grid, alpaka::Threads>(acc);
-    Vec const globalThreadExtent = alpaka::workdiv::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc);
-    Vec1 const linearizedGlobalThreadIdx = alpaka::idx::mapIdx<1u>(globalThreadIdx,
-                                                              globalThreadExtent);
-
-    printf("[z:%u, y:%u, x:%u][linear:%u] Hi world from a function",
-           static_cast<unsigned>(globalThreadIdx[0]),
-           static_cast<unsigned>(globalThreadIdx[1]),
-           static_cast<unsigned>(globalThreadIdx[2]),
-           static_cast<unsigned>(linearizedGlobalThreadIdx[0]));
-
-    for(size_t i = 0; i < nExclamationMarks; ++i){
-        printf("!");
-    }
-
-    printf("\n");
-}
-
-auto main()
--> int
-{
-// It requires support for extended lambdas when using nvcc as CUDA compiler.
-// Requires sequential backend if CI is used
-#if (!defined(__NVCC__) || (defined(__NVCC__) && defined(__CUDACC_EXTENDED_LAMBDA__) )) && \
-    (!defined(ALPAKA_CI) || defined(ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED))
-
-    // Define the index domain
-    using Dim = alpaka::dim::DimInt<3>;
-    using Idx = std::size_t;
-
-    // Define the accelerator
-    //
-    // It is possible to choose from a set of accelerators
-    // that are defined in the alpaka::acc namespace e.g.:
-    // - AccGpuCudaRt
-    // - AccCpuThreads
-    // - AccCpuFibers
-    // - AccCpuOmp2Threads
-    // - AccCpuOmp2Blocks
-    // - AccCpuOmp4
-    // - AccCpuSerial
-    using Acc = alpaka::acc::AccCpuSerial<Dim, Idx>;
-
-    // Defines the synchronization behavior of a queue
-    //
-    // choose between Blocking and NonBlocking
-    using QueueProperty = alpaka::queue::Blocking;
-    using Queue = alpaka::queue::Queue<Acc, QueueProperty>;
-    using Dev = alpaka::dev::Dev<Acc>;
-    using Pltf = alpaka::pltf::Pltf<Dev>;
-
-    // Select a device
-    Dev const devAcc(alpaka::pltf::getDevByIdx<Pltf>(0u));
-
-    // Create a queue on the device
-    Queue queue(devAcc);
-
-    // Define the work division
-    using Vec = alpaka::vec::Vec<Dim, Idx>;
-    Vec const elementsPerThread(Vec::all(static_cast<Idx>(1)));
-    Vec const threadsPerBlock(Vec::all(static_cast<Idx>(1)));
-    Vec const blocksPerGrid(
-        static_cast<Idx>(1),
-        static_cast<Idx>(2),
-        static_cast<Idx>(4));
-
-    using WorkDiv = alpaka::workdiv::WorkDivMembers<Dim, Idx>;
-    WorkDiv const workDiv(
-        blocksPerGrid,
-        threadsPerBlock,
-        elementsPerThread);
-
-    const size_t nExclamationMarks = 10;
-
-    // Run "Hello World" kernel with a lambda function
-    //
-    // Alpaka is able to execute lambda functions (anonymous functions) which
-    // are available since the C++11 standard.
-    // Alpaka forces the lambda function to accept
-    // the utilized accelerator as first argument.
-    // All following arguments can be provided after
-    // the lambda function declaration or be captured.
-    //
-    // This example passes the number exclamation marks, that should
-    // be written after we greet the world, to the
-    // lambda function.
-    alpaka::kernel::exec<Acc>(
-        queue,
-        workDiv,
-        [] ALPAKA_FN_ACC (Acc const & acc, size_t const nExclamationMarksAsArg) -> void {
-            auto globalThreadIdx    = alpaka::idx::getIdx<alpaka::Grid, alpaka::Threads>(acc);
-            auto globalThreadExtent = alpaka::workdiv::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc);
-            auto linearizedGlobalThreadIdx = alpaka::idx::mapIdx<1u>(globalThreadIdx, globalThreadExtent);
-
-            printf("[z:%u, y:%u, x:%u][linear:%u] Hello world from a lambda",
-               static_cast<unsigned>(globalThreadIdx[0]),
-               static_cast<unsigned>(globalThreadIdx[1]),
-               static_cast<unsigned>(globalThreadIdx[2]),
-               static_cast<unsigned>(linearizedGlobalThreadIdx[0]));
-
-            for(size_t i = 0; i < nExclamationMarksAsArg; ++i){
-                printf("!");
-            }
-
-            printf("\n");
-
-        },
-        nExclamationMarks
-    );
-    alpaka::wait::wait(queue);
-
-    return EXIT_SUCCESS;
-
-#else
-    return EXIT_SUCCESS;
-#endif
-}
diff --git a/thirdParty/alpaka/example/reduce/CMakeLists.txt b/thirdParty/alpaka/example/reduce/CMakeLists.txt
deleted file mode 100644
index beda0ef2f5..0000000000
--- a/thirdParty/alpaka/example/reduce/CMakeLists.txt
+++ /dev/null
@@ -1,65 +0,0 @@
-#
-# Copyright 2014-2019 Erik Zenker, Benjamin Worpitz
-#
-# This file exemplifies usage of Alpaka.
-#
-# Permission to use, copy, modify, and/or distribute this software for any
-# purpose with or without fee is hereby granted, provided that the above
-# copyright notice and this permission notice appear in all copies.
-#
-# THE SOFTWARE IS PROVIDED “AS IS” AND ISC DISCLAIMS ALL WARRANTIES WITH
-# REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
-# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL ISC BE LIABLE FOR ANY
-# SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
-# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
-# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR
-# IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
-#
-
-################################################################################
-# Required CMake version.
-
-CMAKE_MINIMUM_REQUIRED(VERSION 3.11.4)
-
-SET_PROPERTY(GLOBAL PROPERTY USE_FOLDERS ON)
-
-################################################################################
-# Project.
-
-SET(_TARGET_NAME reduce)
-
-PROJECT(${_TARGET_NAME})
-
-################################################################################
-# CMake policies
-#
-# Search in <PackageName>_ROOT:
-#   https://cmake.org/cmake/help/v3.12/policy/CMP0074.html
-
-if(POLICY CMP0074)
-    cmake_policy(SET CMP0074 NEW)
-endif()
-
-#-------------------------------------------------------------------------------
-# Find alpaka.
-
-SET(ALPAKA_ROOT "${CMAKE_CURRENT_LIST_DIR}/../../" CACHE STRING "The location of the alpaka library")
-LIST(APPEND CMAKE_MODULE_PATH "${ALPAKA_ROOT}")
-FIND_PACKAGE(alpaka REQUIRED)
-
-#-------------------------------------------------------------------------------
-# Add executable.
-
-ALPAKA_ADD_EXECUTABLE(
-    ${_TARGET_NAME}
-    src/alpakaConfig.hpp
-    src/iterator.hpp
-    src/kernel.hpp
-    src/reduce.cpp)
-TARGET_LINK_LIBRARIES(
-    ${_TARGET_NAME}
-    PUBLIC alpaka)
-
-SET_TARGET_PROPERTIES(${_TARGET_NAME} PROPERTIES FOLDER example)
-
-ADD_TEST(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME})
diff --git a/thirdParty/alpaka/example/reduce/README.md b/thirdParty/alpaka/example/reduce/README.md
deleted file mode 100644
index 2c7f30b00b..0000000000
--- a/thirdParty/alpaka/example/reduce/README.md
+++ /dev/null
@@ -1,12 +0,0 @@
-# Reduction
-
-This is a reduction which works with CPU and GPU accelerators. 
-
-A benchmark of this reduction can be found at [alpaka_reduction_benchmark](https://github.com/kloppstock/alpaka_reduction_benchmark).
-
-## File Descriptions
-
-* [alpakaConfig.hpp](./src/alpakaConfig.hpp): configurations and settings specific to the individual accelerators.
-* [iterator.hpp](./src/iterator.hpp): contains a CPU and a GPU iterator.
-* [kernel.hpp](./src/kernel.hpp): contains the optimized alpaka reduction kernel.
-* [reduce.cpp](./src/reduce.cpp): the main file.
diff --git a/thirdParty/alpaka/example/reduce/src/alpakaConfig.hpp b/thirdParty/alpaka/example/reduce/src/alpakaConfig.hpp
deleted file mode 100644
index d3bd018270..0000000000
--- a/thirdParty/alpaka/example/reduce/src/alpakaConfig.hpp
+++ /dev/null
@@ -1,187 +0,0 @@
-/* Copyright 2019 Jonas Schenke
- *
- * This file exemplifies usage of Alpaka.
- *
- * Permission to use, copy, modify, and/or distribute this software for any
- * purpose with or without fee is hereby granted, provided that the above
- * copyright notice and this permission notice appear in all copies.
- *
- * THE SOFTWARE IS PROVIDED “AS IS” AND ISC DISCLAIMS ALL WARRANTIES WITH
- * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL ISC BE LIABLE FOR ANY
- * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
- * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR
- * IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
- */
-
-#pragma once
-
-#include "iterator.hpp"
-#include <alpaka/alpaka.hpp>
-
-// Defines for dimensions and types.
-using Dim = alpaka::dim::DimInt<1u>;
-using Idx = uint64_t;
-using Extent = uint64_t;
-using WorkDiv = alpaka::workdiv::WorkDivMembers<Dim, Extent>;
-
-//-----------------------------------------------------------------------------
-//! Returns the supplied number or the maxumim number of threads per block for a
-//! specific accelerator.
-//!
-//! \tparam TAcc The accelerator object.
-//! \tparam TSize The desired size.
-template <typename TAcc, uint64_t TSize>
-static constexpr uint64_t getMaxBlockSize()
-{
-    return (TAcc::MaxBlockSize::value > TSize) ? TSize
-                                               : TAcc::MaxBlockSize::value;
-}
-
-//#############################################################################
-//! Get Trait via struct.
-//!
-//! \tparam T The data type.
-//! \tparam TBuf The buffer type.
-//! \tparam TAcc The accelerator type.
-//!
-//! Defines the appropriate iterator for an accelerator.
-template <typename T, typename TBuf, typename TAcc>
-struct GetIterator
-{
-    using Iterator = IteratorCpu<TAcc, T, TBuf>;
-};
-
-// Note: Boost Fibers, OpenMP 2 Threads and TBB Blocks accelerators aren't implented
-
-#ifdef ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLED
-//#############################################################################
-//! OpenMP 2 Blocks defines
-//!
-//! Defines Host, Device, etc. for the OpenMP 2 Blocks accelerator.
-struct CpuOmp2Blocks
-{
-    using Host = alpaka::acc::AccCpuOmp2Blocks<Dim, Extent>;
-    using Acc = alpaka::acc::AccCpuOmp2Blocks<Dim, Extent>;
-    using DevHost = alpaka::dev::Dev<Host>;
-    using DevAcc = alpaka::dev::Dev<Acc>;
-    using PltfHost = alpaka::pltf::Pltf<DevHost>;
-    using PltfAcc = alpaka::pltf::Pltf<DevAcc>;
-    using Stream = alpaka::queue::QueueCpuBlocking;
-    using Event = alpaka::event::Event<Stream>;
-    using SmCount = alpaka::dim::DimInt<1u>;
-    using MaxBlockSize = alpaka::dim::DimInt<1u>;
-};
-
-template <typename T, typename TBuf, typename... TArgs>
-struct GetIterator<T, TBuf, alpaka::acc::AccCpuOmp2Blocks<TArgs...>>
-{
-    using Iterator =
-        IteratorCpu<alpaka::acc::AccCpuOmp2Blocks<TArgs...>, T, TBuf>;
-};
-#endif
-
-#ifdef ALPAKA_ACC_CPU_BT_OMP4_ENABLED
-#ifdef ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED
-//#############################################################################
-//! OpenMP 4 defines
-//!
-//! Defines Host, Device, etc. for the OpenMP 4 accelerator.
-struct CpuOmp4
-{
-    using Host = alpaka::acc::AccCpuSerial<Dim, Extent>;
-    using Acc = alpaka::acc::AccCpuOmp4<Dim, Extent>;
-    using DevHost = alpaka::dev::Dev<Host>;
-    using DevAcc = alpaka::dev::Dev<Acc>;
-    using PltfHost = alpaka::pltf::Pltf<DevHost>;
-    using PltfAcc = alpaka::pltf::Pltf<DevAcc>;
-    using Stream = alpaka::queue::QueueCpuBlocking;
-    using Event = alpaka::event::Event<Stream>;
-    using MaxBlockSize = alpaka::dim::DimInt<1u>;
-};
-
-template <typename T, typename TBuf, typename... TArgs>
-struct GetIterator<T, TBuf, alpaka::acc::AccCpuOmp4<TArgs...>>
-{
-    using Iterator = IteratorCpu<alpaka::acc::AccCpuOmp4<TArgs...>, T, TBuf>;
-};
-#endif
-#endif
-
-#ifdef ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED
-//#############################################################################
-//! Serial CPU defines
-//!
-//! Defines Host, Device, etc. for the serial CPU accelerator.
-struct CpuSerial
-{
-    using Host = alpaka::acc::AccCpuSerial<Dim, Extent>;
-    using Acc = alpaka::acc::AccCpuSerial<Dim, Extent>;
-    using DevHost = alpaka::dev::Dev<Host>;
-    using DevAcc = alpaka::dev::Dev<Acc>;
-    using PltfHost = alpaka::pltf::Pltf<DevHost>;
-    using PltfAcc = alpaka::pltf::Pltf<DevAcc>;
-    using Stream = alpaka::queue::QueueCpuBlocking;
-    using Event = alpaka::event::Event<Stream>;
-    using MaxBlockSize = alpaka::dim::DimInt<1u>;
-};
-
-template <typename T, typename TBuf, typename... TArgs>
-struct GetIterator<T, TBuf, alpaka::acc::AccCpuSerial<TArgs...>>
-{
-    using Iterator = IteratorCpu<alpaka::acc::AccCpuSerial<TArgs...>, T, TBuf>;
-};
-#endif
-
-#ifdef ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED
-//#############################################################################
-//! CPU Threads defines
-//!
-//! Defines Host, Device, etc. for the CPU Threads accelerator.
-struct CpuThreads
-{
-    using Host = alpaka::acc::AccCpuThreads<Dim, Extent>;
-    using Acc = alpaka::acc::AccCpuThreads<Dim, Extent>;
-    using DevHost = alpaka::dev::Dev<Host>;
-    using DevAcc = alpaka::dev::Dev<Acc>;
-    using PltfHost = alpaka::pltf::Pltf<DevHost>;
-    using PltfAcc = alpaka::pltf::Pltf<DevAcc>;
-    using Stream = alpaka::queue::QueueCpuBlocking;
-    using Event = alpaka::event::Event<Stream>;
-    using MaxBlockSize = alpaka::dim::DimInt<1u>;
-};
-
-template <typename T, typename TBuf, typename... TArgs>
-struct GetIterator<T, TBuf, alpaka::acc::AccCpuThreads<TArgs...>>
-{
-    using Iterator = IteratorCpu<alpaka::acc::AccCpuThreads<TArgs...>, T, TBuf>;
-};
-#endif
-
-#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
-#ifdef ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED
-//#############################################################################
-//! CUDA defines
-//!
-//! Defines Host, Device, etc. for the CUDA accelerator.
-struct GpuCudaRt
-{
-    using Host = alpaka::acc::AccCpuSerial<Dim, Extent>;
-    using Acc = alpaka::acc::AccGpuCudaRt<Dim, Extent>;
-    using DevHost = alpaka::dev::Dev<Host>;
-    using DevAcc = alpaka::dev::Dev<Acc>;
-    using PltfHost = alpaka::pltf::Pltf<DevHost>;
-    using PltfAcc = alpaka::pltf::Pltf<DevAcc>;
-    using Stream = alpaka::queue::QueueCudaRtNonBlocking;
-    using Event = alpaka::event::Event<Stream>;
-    using MaxBlockSize = alpaka::dim::DimInt<1024u>;
-};
-
-template <typename T, typename TBuf, typename... TArgs>
-struct GetIterator<T, TBuf, alpaka::acc::AccGpuCudaRt<TArgs...>>
-{
-    using Iterator = IteratorGpu<alpaka::acc::AccGpuCudaRt<TArgs...>, T, TBuf>;
-};
-#endif
-#endif
diff --git a/thirdParty/alpaka/example/reduce/src/iterator.hpp b/thirdParty/alpaka/example/reduce/src/iterator.hpp
deleted file mode 100644
index 15c3da3097..0000000000
--- a/thirdParty/alpaka/example/reduce/src/iterator.hpp
+++ /dev/null
@@ -1,411 +0,0 @@
-/* Copyright 2019 Jonas Schenke
- *
- * This file exemplifies usage of Alpaka.
- *
- * Permission to use, copy, modify, and/or distribute this software for any
- * purpose with or without fee is hereby granted, provided that the above
- * copyright notice and this permission notice appear in all copies.
- *
- * THE SOFTWARE IS PROVIDED “AS IS” AND ISC DISCLAIMS ALL WARRANTIES WITH
- * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL ISC BE LIABLE FOR ANY
- * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
- * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR
- * IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
- */
-
-#pragma once
-
-#include <alpaka/alpaka.hpp>
-
-//#############################################################################
-//! An iterator base class.
-//!
-//! \tparam T The type.
-//! \tparam TBuf The buffer type (standard is T).
-template <typename T, typename TBuf = T>
-class Iterator
-{
-protected:
-    const TBuf *mData;
-    uint64_t mIndex;
-    const uint64_t mMaximum;
-
-public:
-    //-----------------------------------------------------------------------------
-    //! Constructor.
-    //!
-    //! \param data A pointer to the data.
-    //! \param index The index.
-    //! \param maximum The first index outside of the iterator memory.
-    ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE Iterator(const TBuf *data,
-                                                 uint32_t index,
-                                                 uint64_t maximum)
-        : mData(data), mIndex(index), mMaximum(maximum)
-    {
-    }
-
-    //-----------------------------------------------------------------------------
-    //! Constructor.
-    //!
-    //! \param other The other iterator object.
-    ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE Iterator(const Iterator &other) = default;
-
-    //-----------------------------------------------------------------------------
-    //! Compare operator.
-    //!
-    //! \param other The other object.
-    //!
-    //! Returns true if objects are equal and false otherwise.
-    ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE auto
-    operator==(const Iterator &other) const -> bool
-    {
-        return (this->mData == other.mData) && (this->mIndex == other.mIndex) &&
-               (this->mMaximum == other.mMaximum);
-    }
-
-    //-----------------------------------------------------------------------------
-    //! Compare operator.
-    //!
-    //! \param other The other object.
-    //!
-    //! Returns false if objects are equal and true otherwise.
-    ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE auto
-    operator!=(const Iterator &other) const -> bool
-    {
-        return !operator==(other);
-    }
-
-    //-----------------------------------------------------------------------------
-    //! Compare operator.
-    //!
-    //! \param other The other object.
-    //!
-    //! Returns false if the other object is equal or smaller and true
-    //! otherwise.
-    ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE auto
-    operator<(const Iterator &other) const -> bool
-    {
-        return mIndex < other.mIndex;
-    }
-
-    //-----------------------------------------------------------------------------
-    //! Compare operator.
-    //!
-    //! \param other The other object.
-    //!
-    //! Returns false if the other object is equal or bigger and true otherwise.
-    ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE auto
-    operator>(const Iterator &other) const -> bool
-    {
-        return mIndex > other.mIndex;
-    }
-
-    //-----------------------------------------------------------------------------
-    //! Compare operator.
-    //!
-    //! \param other The other object.
-    //!
-    //! Returns true if the other object is equal or bigger and false otherwise.
-    ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE auto
-    operator<=(const Iterator &other) const -> bool
-    {
-        return mIndex <= other.mIndex;
-    }
-
-    //-----------------------------------------------------------------------------
-    //! Compare operator.
-    //!
-    //! \param other The other object.
-    //!
-    //! Returns true if the other object is equal or smaller and false
-    //! otherwise.
-    ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE auto
-    operator>=(const Iterator &other) const -> bool
-    {
-        return mIndex >= other.mIndex;
-    }
-
-    //-----------------------------------------------------------------------------
-    //! Returns the current element.
-    //!
-    //! Returns a reference to the current index.
-    ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE auto operator*() -> const T &
-    {
-        return mData[mIndex];
-    }
-};
-
-//#############################################################################
-//! A CPU memory iterator.
-//!
-//! \tparam TAcc The accelerator type.
-//! \tparam T The type.
-//! \tparam TBuf The buffer type (standard is T).
-template <typename TAcc, typename T, typename TBuf = T>
-class IteratorCpu : public Iterator<T, TBuf>
-{
-public:
-    //-----------------------------------------------------------------------------
-    //! Constructor.
-    //!
-    //! \param acc The accelerator object.
-    //! \param data A pointer to the data.
-    //! \param linearizedIndex The linearized index.
-    //! \param gridSize The grid size.
-    //! \param n The problem size.
-    ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE IteratorCpu(const TAcc &acc,
-                                                    const TBuf *data,
-                                                    uint32_t linearizedIndex,
-                                                    uint32_t gridSize,
-                                                    uint64_t n)
-        : Iterator<T, TBuf>(
-              data,
-              static_cast<uint32_t>((n * linearizedIndex) / 
-                                    alpaka::math::min(acc, static_cast<uint64_t>(gridSize), n)),
-              static_cast<uint32_t>((n * (linearizedIndex + 1)) / 
-                  alpaka::math::min(acc, static_cast<uint64_t>(gridSize), n)))
-    {
-    }
-
-    //-----------------------------------------------------------------------------
-    //! Returns the iterator for the last item.
-    ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE auto end() const -> IteratorCpu
-    {
-        IteratorCpu ret(*this);
-        ret.mIndex = this->mMaximum;
-        return ret;
-    }
-
-    //-----------------------------------------------------------------------------
-    //! Increments the internal pointer to the next one and returns this
-    //! element.
-    //!
-    //! Returns a reference to the next index.
-    ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE auto operator++() -> IteratorCpu &
-    {
-        ++(this->mIndex);
-        return *this;
-    }
-
-    //-----------------------------------------------------------------------------
-    //! Returns the current element and increments the internal pointer to the
-    //! next one.
-    //!
-    //! Returns a reference to the current index.
-    ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE auto operator++(int) -> IteratorCpu
-    {
-        auto ret(*this);
-        ++(this->mIndex);
-        return ret;
-    }
-
-    //-----------------------------------------------------------------------------
-    //! Decrements the internal pointer to the previous one and returns the this
-    //! element.
-    //!
-    //! Returns a reference to the previous index.
-    ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE auto operator--() -> IteratorCpu &
-    {
-        --(this->mIndex);
-        return *this;
-    }
-
-    //-----------------------------------------------------------------------------
-    //! Returns the current element and decrements the internal pointer to the
-    //! previous one.
-    //!
-    //! Returns a reference to the current index.
-    ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE auto operator--(int) -> IteratorCpu
-    {
-        auto ret(*this);
-        --(this->mIndex);
-        return ret;
-    }
-
-    //-----------------------------------------------------------------------------
-    //! Returns the index + a supplied offset.
-    //!
-    //! \param n The offset.
-    ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE auto operator+(uint64_t n) const
-        -> IteratorCpu
-    {
-        IteratorCpu ret(*this);
-        ret.mIndex += n;
-        return ret;
-    }
-
-    //-----------------------------------------------------------------------------
-    //! Returns the index - a supplied offset.
-    //!
-    //! \param n The offset.
-    ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE auto operator-(uint64_t n) const
-        -> IteratorCpu
-    {
-        IteratorCpu ret(*this);
-        ret.mIndex -= n;
-        return ret;
-    }
-
-    //-----------------------------------------------------------------------------
-    //! Addition assignment.
-    //!
-    //! \param offset The offset.
-    //!
-    //! Returns the current object offset by the offset.
-    ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE auto operator+=(uint64_t offset)
-        -> IteratorCpu &
-    {
-        this->mIndex += offset;
-        return *this;
-    }
-
-    //-----------------------------------------------------------------------------
-    //! Substraction assignment.
-    //!
-    //! \param offset The offset.
-    //!
-    //! Returns the current object offset by the offset.
-    ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE auto operator-=(uint64_t offset)
-        -> IteratorCpu &
-    {
-        this->mIndex -= offset;
-        return *this;
-    }
-};
-
-//#############################################################################
-//! A GPU memory iterator.
-//!
-//! \tparam TAcc The accelerator type.
-//! \tparam T The type.
-//! \tparam TBuf The buffer type (standard is T).
-template <typename TAcc, typename T, typename TBuf = T>
-class IteratorGpu : public Iterator<T, TBuf>
-{
-private:
-    const uint32_t mGridSize;
-
-public:
-    //-----------------------------------------------------------------------------
-    //! Constructor.
-    //!
-    //! \param data A pointer to the data.
-    //! \param linearizedIndex The linearized index.
-    //! \param gridSize The grid size.
-    //! \param n The problem size.
-    ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE IteratorGpu(const TAcc &,
-                                                    const TBuf *data,
-                                                    uint32_t linearizedIndex,
-                                                    uint32_t gridSize,
-                                                    uint64_t n)
-        : Iterator<T, TBuf>(data, linearizedIndex, n), mGridSize(gridSize)
-    {
-    }
-
-    //-----------------------------------------------------------------------------
-    //! Returns the iterator for the last item.
-    ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE auto end() const -> IteratorGpu
-    {
-        IteratorGpu ret(*this);
-        ret.mIndex = this->mMaximum;
-        return ret;
-    }
-
-    //-----------------------------------------------------------------------------
-    //! Increments the internal pointer to the next one and returns this
-    //! element.
-    //!
-    //! Returns a reference to the next index.
-    ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE auto operator++() -> IteratorGpu &
-    {
-        this->mIndex += this->mGridSize;
-        return *this;
-    }
-
-    //-----------------------------------------------------------------------------
-    //! Returns the current element and increments the internal pointer to the
-    //! next one.
-    //!
-    //! Returns a reference to the current index.
-    ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE auto operator++(int) -> IteratorGpu
-    {
-        auto ret(*this);
-        this->mIndex += this->mGridSize;
-        return ret;
-    }
-
-    //-----------------------------------------------------------------------------
-    //! Decrements the internal pointer to the previous one and returns the this
-    //! element.
-    //!
-    //! Returns a reference to the previous index.
-    ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE auto operator--() -> IteratorGpu &
-    {
-        this->mIndex -= this->mGridSize;
-        return *this;
-    }
-
-    //-----------------------------------------------------------------------------
-    //! Returns the current element and decrements the internal pointer to the
-    //! previous one.
-    //!
-    //! Returns a reference to the current index.
-    ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE auto operator--(int) -> IteratorGpu
-    {
-        auto ret(*this);
-        this->mIndex -= this->mGridSize;
-        return ret;
-    }
-
-    //-----------------------------------------------------------------------------
-    //! Returns the index + a supplied offset.
-    //!
-    //! \param n The offset.
-    ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE auto operator+(uint64_t n) const
-        -> IteratorGpu
-    {
-        auto ret(*this);
-        ret.mIndex += n * mGridSize;
-        return ret;
-    }
-
-    //-----------------------------------------------------------------------------
-    //! Returns the index - a supplied offset.
-    //!
-    //! \param n The offset.
-    ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE auto operator-(uint64_t n) const
-        -> IteratorGpu
-    {
-        auto ret(*this);
-        ret.mIndex -= n * mGridSize;
-        return ret;
-    }
-
-    //-----------------------------------------------------------------------------
-    //! Addition assignment.
-    //!
-    //! \param offset The offset.
-    //!
-    //! Returns the current object offset by the offset.
-    ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE auto operator+=(uint64_t offset)
-        -> IteratorGpu &
-    {
-        this->mIndex += offset * this->mGridSize;
-        return *this;
-    }
-
-    //-----------------------------------------------------------------------------
-    //! Substraction assignment.
-    //!
-    //! \param offset The offset.
-    //!
-    //! Returns the current object offset by the offset.
-    ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE auto operator-=(uint64_t offset)
-        -> IteratorGpu &
-    {
-        this->mIndex -= offset * this->mGridSize;
-        return *this;
-    }
-};
diff --git a/thirdParty/alpaka/example/reduce/src/kernel.hpp b/thirdParty/alpaka/example/reduce/src/kernel.hpp
deleted file mode 100644
index 63193a9976..0000000000
--- a/thirdParty/alpaka/example/reduce/src/kernel.hpp
+++ /dev/null
@@ -1,165 +0,0 @@
-/* Copyright 2019 Jonas Schenke
- *
- * This file exemplifies usage of Alpaka.
- *
- * Permission to use, copy, modify, and/or distribute this software for any
- * purpose with or without fee is hereby granted, provided that the above
- * copyright notice and this permission notice appear in all copies.
- *
- * THE SOFTWARE IS PROVIDED “AS IS” AND ISC DISCLAIMS ALL WARRANTIES WITH
- * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL ISC BE LIABLE FOR ANY
- * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
- * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR
- * IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
- */
-
-#pragma once
-
-#include <alpaka/alpaka.hpp>
-
-//#############################################################################
-//! A cheap wrapper around a C-style array in heap memory.
-template <typename T, uint64_t size>
-struct cheapArray
-{
-    T data[size];
-
-    //-----------------------------------------------------------------------------
-    //! Access operator.
-    //!
-    //! \param index The index of the element to be accessed.
-    //!
-    //! Returns the requested element per reference.
-    ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE T &operator[](uint64_t index)
-    {
-        return data[index];
-    }
-
-    //-----------------------------------------------------------------------------
-    //! Access operator.
-    //!
-    //! \param index The index of the element to be accessed.
-    //!
-    //! Returns the requested element per constant reference.
-    ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE const T &operator[](uint64_t index) const
-    {
-        return data[index];
-    }
-};
-
-//#############################################################################
-//! A reduction kernel.
-//!
-//! \tparam TBlockSize The block size.
-//! \tparam T The data type.
-//! \tparam TFunc The Functor type for the reduction function.
-template <uint32_t TBlockSize, typename T, typename TFunc>
-struct ReduceKernel
-{
-    ALPAKA_NO_HOST_ACC_WARNING
-
-    //-----------------------------------------------------------------------------
-    //! The kernel entry point.
-    //!
-    //! \tparam TAcc The accelerator environment.
-    //! \tparam TElem The element type.
-    //! \tparam TIdx The index type.
-    //!
-    //! \param acc The accelerator object.
-    //! \param source The source memory.
-    //! \param destination The destination memory.
-    //! \param n The problem size.
-    //! \param func The reduction function.
-    template <typename TAcc, typename TElem, typename TIdx>
-    ALPAKA_FN_ACC auto operator()(TAcc const &acc,
-                                  TElem const *const source,
-                                  TElem *destination,
-                                  TIdx const &n,
-                                  TFunc func) const -> void
-    {
-        auto &sdata(
-            alpaka::block::shared::st::allocVar<cheapArray<T, TBlockSize>,
-                                                __COUNTER__>(acc));
-
-        const uint32_t blockIndex(static_cast<uint32_t>(
-            alpaka::idx::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[0]));
-        const uint32_t threadIndex(static_cast<uint32_t>(
-            alpaka::idx::getIdx<alpaka::Block, alpaka::Threads>(acc)[0]));
-        const uint32_t gridDimension(static_cast<uint32_t>(
-            alpaka::workdiv::getWorkDiv<alpaka::Grid, alpaka::Blocks>(acc)[0]));
-
-        // equivalent to blockIndex * TBlockSize + threadIndex
-        const uint32_t linearizedIndex(static_cast<uint32_t>(
-            alpaka::idx::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0]));
-
-        typename GetIterator<T, TElem, TAcc>::Iterator it(
-            acc, source, linearizedIndex, gridDimension * TBlockSize, n);
-
-        T result = 0; // suppresses compiler warnings
-
-        if (threadIndex < n)
-            result = *(it++); // avoids using the
-                              // neutral element of specific
-
-        // --------
-        // Level 1: grid reduce, reading from global memory
-        // --------
-
-        // reduce per thread with increased ILP by 4x unrolling sum.
-        // the thread of our block reduces its 4 grid-neighbored threads and
-        // advances by grid-striding loop (maybe 128bit load improve perf)
-
-        while (it + 3 < it.end())
-        {
-            result = func(
-                func(func(result, func(*it, *(it + 1))), *(it + 2)), *(it + 3));
-            it += 4;
-        }
-
-        // doing the remaining blocks
-        while (it < it.end())
-            result = func(result, *(it++));
-
-        if (threadIndex < n)
-            sdata[threadIndex] = result;
-
-        alpaka::block::sync::syncBlockThreads(acc);
-
-        // --------
-        // Level 2: block + warp reduce, reading from shared memory
-        // --------
-
-        ALPAKA_UNROLL()
-        for (uint32_t currentBlockSize = TBlockSize,
-                      currentBlockSizeUp =
-                          (TBlockSize + 1) / 2; // ceil(TBlockSize/2.0)
-             currentBlockSize > 1;
-             currentBlockSize = currentBlockSize / 2,
-                      currentBlockSizeUp = (currentBlockSize + 1) /
-                                           2) // ceil(currentBlockSize/2.0)
-        {
-            bool cond =
-                threadIndex < currentBlockSizeUp // only first half of block
-                                                 // is working
-                && (threadIndex + currentBlockSizeUp) <
-                       TBlockSize // index for second half must be in bounds
-                && (blockIndex * TBlockSize + threadIndex +
-                    currentBlockSizeUp) < n &&
-                threadIndex <
-                    n; // if elem in second half has been initialized before
-
-            if (cond)
-                sdata[threadIndex] =
-                    func(sdata[threadIndex],
-                         sdata[threadIndex + currentBlockSizeUp]);
-
-            alpaka::block::sync::syncBlockThreads(acc);
-        }
-
-        // store block result to gmem
-        if (threadIndex == 0 && threadIndex < n)
-            destination[blockIndex] = sdata[0];
-    }
-};
diff --git a/thirdParty/alpaka/example/reduce/src/reduce.cpp b/thirdParty/alpaka/example/reduce/src/reduce.cpp
deleted file mode 100644
index 724979790a..0000000000
--- a/thirdParty/alpaka/example/reduce/src/reduce.cpp
+++ /dev/null
@@ -1,189 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz, Jonas Schenke, Matthias Werner
- *
- * This file exemplifies usage of Alpaka.
- *
- * Permission to use, copy, modify, and/or distribute this software for any
- * purpose with or without fee is hereby granted, provided that the above
- * copyright notice and this permission notice appear in all copies.
- *
- * THE SOFTWARE IS PROVIDED “AS IS” AND ISC DISCLAIMS ALL WARRANTIES WITH
- * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL ISC BE LIABLE FOR ANY
- * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
- * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR
- * IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
- */
-
-#include "alpakaConfig.hpp"
-#include "kernel.hpp"
-#include <alpaka/alpaka.hpp>
-#include <cstdlib>
-#include <iostream>
-
-// It requires support for extended lambdas when using nvcc as CUDA compiler.
-// Requires sequential backend if CI is used
-#if (!defined(__NVCC__) || (defined(__NVCC__) && defined(__CUDACC_EXTENDED_LAMBDA__) )) && \
-    (!defined(ALPAKA_CI) || defined(ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED))
-
-// use defines of a specific accelerator from alpakaConfig.hpp
-// that are defined in alpakaConfig.hpp
-// - GpuCudaRt
-// - CpuThreads
-// - CpuOmp2Blocks
-// - CpuOmp4
-// - CpuSerial
-//
-using Accelerator = CpuSerial;
-
-using DevAcc = Accelerator::DevAcc;
-using DevHost = Accelerator::DevHost;
-using QueueAcc = Accelerator::Stream;
-using Acc = Accelerator::Acc;
-using PltfAcc = Accelerator::PltfAcc;
-using PltfHost = Accelerator::PltfHost;
-using MaxBlockSize = Accelerator::MaxBlockSize;
-
-//-----------------------------------------------------------------------------
-//! Reduces the numbers 1 to n.
-//!
-//! \tparam T The data type.
-//! \tparam TFunc The data type of the reduction functor.
-//!
-//! \param devHost The host device.
-//! \param devAcc The accelerator object.
-//! \param queue The device queue.
-//! \param n The problem size.
-//! \param hostMemory The buffer containing the input data.
-//! \param func The reduction function.
-//!
-//! Returns true if the reduction was correct and false otherwise.
-template<typename T, typename TFunc>
-T reduce(DevHost devHost, DevAcc devAcc, QueueAcc queue, uint64_t n, alpaka::mem::buf::Buf<DevHost, T, Dim, Idx> hostMemory, TFunc func)
-{
-    static constexpr uint64_t blockSize = getMaxBlockSize<Accelerator, 256>();
-
-    // calculate optimal block size (8 times the MP count proved to be
-    // relatively near to peak performance in benchmarks)
-    uint32_t blockCount = static_cast<uint32_t>(
-        alpaka::acc::getAccDevProps<Acc, DevAcc>(devAcc).m_multiProcessorCount *
-        8);
-    uint32_t maxBlockCount = static_cast<uint32_t>(
-        (((n + 1) / 2) - 1) / blockSize + 1); // ceil(ceil(n/2.0)/blockSize)
-
-    if (blockCount > maxBlockCount)
-        blockCount = maxBlockCount;
-
-    alpaka::mem::buf::Buf<DevAcc, T, Dim, Extent> sourceDeviceMemory =
-        alpaka::mem::buf::alloc<T, Idx>(devAcc, n);
-
-    alpaka::mem::buf::Buf<DevAcc, T, Dim, Extent> destinationDeviceMemory =
-        alpaka::mem::buf::alloc<T, Idx>(
-            devAcc, static_cast<Extent>(blockCount));
-
-    // copy the data to the GPU
-    alpaka::mem::view::copy(queue, sourceDeviceMemory, hostMemory, n);
-
-    // create kernels with their workdivs
-    ReduceKernel<blockSize, T, TFunc> kernel1, kernel2;
-    WorkDiv workDiv1{ static_cast<Extent>(blockCount),
-                      static_cast<Extent>(blockSize),
-                      static_cast<Extent>(1) };
-    WorkDiv workDiv2{ static_cast<Extent>(1),
-                      static_cast<Extent>(blockSize),
-                      static_cast<Extent>(1) };
-
-    // create main reduction kernel execution task
-    auto const taskKernelReduceMain(alpaka::kernel::createTaskKernel<Acc>(
-        workDiv1,
-        kernel1,
-        alpaka::mem::view::getPtrNative(sourceDeviceMemory),
-        alpaka::mem::view::getPtrNative(destinationDeviceMemory),
-        n,
-        func));
-
-    // create last block reduction kernel execution task
-    auto const taskKernelReduceLastBlock(alpaka::kernel::createTaskKernel<Acc>(
-        workDiv2,
-        kernel2,
-        alpaka::mem::view::getPtrNative(destinationDeviceMemory),
-        alpaka::mem::view::getPtrNative(destinationDeviceMemory),
-        blockCount,
-        func));
-
-    // enqueue both kernel execution tasks
-    alpaka::queue::enqueue(queue, taskKernelReduceMain);
-    alpaka::queue::enqueue(queue, taskKernelReduceLastBlock);
-
-    //  download result from GPU
-    T resultGpuHost;
-    auto resultGpuDevice =
-        alpaka::mem::view::ViewPlainPtr<DevHost, T, Dim, Idx>(
-            &resultGpuHost, devHost, static_cast<Extent>(blockSize));
-
-    alpaka::mem::view::copy(queue, resultGpuDevice, destinationDeviceMemory, 1);
-
-    return resultGpuHost;
-}
-
-int main()
-{
-    // select device and problem size
-    const int dev = 0;
-    uint64_t n = 1 << 28;
-
-    using T = uint32_t;
-    static constexpr uint64_t blockSize = getMaxBlockSize<Accelerator, 256>();
-
-    DevAcc devAcc(alpaka::pltf::getDevByIdx<PltfAcc>(dev));
-    DevHost devHost(alpaka::pltf::getDevByIdx<PltfHost>(0u));
-    QueueAcc queue(devAcc);
-
-    // calculate optimal block size (8 times the MP count proved to be
-    // relatively near to peak performance in benchmarks)
-    uint32_t blockCount = static_cast<uint32_t>(
-        alpaka::acc::getAccDevProps<Acc, DevAcc>(devAcc).m_multiProcessorCount *
-        8);
-    uint32_t maxBlockCount = static_cast<uint32_t>(
-        (((n + 1) / 2) - 1) / blockSize + 1); // ceil(ceil(n/2.0)/blockSize)
-
-    if (blockCount > maxBlockCount)
-        blockCount = maxBlockCount;
-
-    // allocate memory
-    auto hostMemory = alpaka::mem::buf::alloc<T, Idx>(devHost, n);
-
-    T *nativeHostMemory = alpaka::mem::view::getPtrNative(hostMemory);
-
-    // fill array with data
-    for (uint64_t i = 0; i < n; i++)
-        nativeHostMemory[i] = static_cast<T>(i + 1);
-
-    // define the reduction function
-    auto addFn = [] ALPAKA_FN_ACC(T a, T b) -> T { return a + b; };
-
-    // reduce
-    T result = reduce<T>(devHost, devAcc, queue, n, hostMemory, addFn);
-    alpaka::wait::wait(queue);
-
-    // check result
-    T expectedResult = static_cast<T>(n / 2 * (n + 1));
-    if (result != expectedResult)
-    {
-        std::cerr << "Results don't match: " << result << " != " << expectedResult
-                  << "\n";
-        return EXIT_FAILURE;
-    }
-
-    std::cout << "Results match.\n";
-
-    return EXIT_SUCCESS;
-}
-
-#else
-
-int main() {
-    return EXIT_SUCCESS;
-}
-
-#endif
diff --git a/thirdParty/alpaka/example/vectorAdd/CMakeLists.txt b/thirdParty/alpaka/example/vectorAdd/CMakeLists.txt
deleted file mode 100644
index e6d5ab8ad4..0000000000
--- a/thirdParty/alpaka/example/vectorAdd/CMakeLists.txt
+++ /dev/null
@@ -1,62 +0,0 @@
-#
-# Copyright 2014-2019 Benjamin Worpitz
-#
-# This file exemplifies usage of Alpaka.
-#
-# Permission to use, copy, modify, and/or distribute this software for any
-# purpose with or without fee is hereby granted, provided that the above
-# copyright notice and this permission notice appear in all copies.
-#
-# THE SOFTWARE IS PROVIDED “AS IS” AND ISC DISCLAIMS ALL WARRANTIES WITH
-# REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
-# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL ISC BE LIABLE FOR ANY
-# SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
-# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
-# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR
-# IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
-#
-
-################################################################################
-# Required CMake version.
-
-CMAKE_MINIMUM_REQUIRED(VERSION 3.11.4)
-
-SET_PROPERTY(GLOBAL PROPERTY USE_FOLDERS ON)
-
-################################################################################
-# Project.
-
-SET(_TARGET_NAME vectorAdd)
-
-PROJECT(${_TARGET_NAME})
-
-################################################################################
-# CMake policies
-#
-# Search in <PackageName>_ROOT:
-#   https://cmake.org/cmake/help/v3.12/policy/CMP0074.html
-
-if(POLICY CMP0074)
-    cmake_policy(SET CMP0074 NEW)
-endif()
-
-#-------------------------------------------------------------------------------
-# Find alpaka.
-
-SET(ALPAKA_ROOT "${CMAKE_CURRENT_LIST_DIR}/../../" CACHE STRING "The location of the alpaka library")
-LIST(APPEND CMAKE_MODULE_PATH "${ALPAKA_ROOT}")
-FIND_PACKAGE(alpaka REQUIRED)
-
-#-------------------------------------------------------------------------------
-# Add executable.
-
-ALPAKA_ADD_EXECUTABLE(
-    ${_TARGET_NAME}
-    src/vectorAdd.cpp)
-TARGET_LINK_LIBRARIES(
-    ${_TARGET_NAME}
-    PUBLIC alpaka)
-
-SET_TARGET_PROPERTIES(${_TARGET_NAME} PROPERTIES FOLDER example)
-
-ADD_TEST(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME})
diff --git a/thirdParty/alpaka/example/vectorAdd/src/vectorAdd.cpp b/thirdParty/alpaka/example/vectorAdd/src/vectorAdd.cpp
deleted file mode 100644
index a7126c43e4..0000000000
--- a/thirdParty/alpaka/example/vectorAdd/src/vectorAdd.cpp
+++ /dev/null
@@ -1,213 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz, Matthias Werner
- *
- * This file exemplifies usage of Alpaka.
- *
- * Permission to use, copy, modify, and/or distribute this software for any
- * purpose with or without fee is hereby granted, provided that the above
- * copyright notice and this permission notice appear in all copies.
- *
- * THE SOFTWARE IS PROVIDED “AS IS” AND ISC DISCLAIMS ALL WARRANTIES WITH
- * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL ISC BE LIABLE FOR ANY
- * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
- * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR
- * IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
- */
-
-#include <alpaka/alpaka.hpp>
-
-#include <random>
-#include <iostream>
-#include <typeinfo>
-
-//#############################################################################
-//! A vector addition kernel.
-class VectorAddKernel
-{
-public:
-    //-----------------------------------------------------------------------------
-    //! The kernel entry point.
-    //!
-    //! \tparam TAcc The accelerator environment to be executed on.
-    //! \tparam TElem The matrix element type.
-    //! \param acc The accelerator to be executed on.
-    //! \param A The first source vector.
-    //! \param B The second source vector.
-    //! \param C The destination vector.
-    //! \param numElements The number of elements.
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<
-        typename TAcc,
-        typename TElem,
-        typename TIdx>
-    ALPAKA_FN_ACC auto operator()(
-        TAcc const & acc,
-        TElem const * const A,
-        TElem const * const B,
-        TElem * const C,
-        TIdx const & numElements) const
-    -> void
-    {
-        static_assert(
-            alpaka::dim::Dim<TAcc>::value == 1,
-            "The VectorAddKernel expects 1-dimensional indices!");
-
-        TIdx const gridThreadIdx(alpaka::idx::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0u]);
-        TIdx const threadElemExtent(alpaka::workdiv::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[0u]);
-        TIdx const threadFirstElemIdx(gridThreadIdx * threadElemExtent);
-
-        if(threadFirstElemIdx < numElements)
-        {
-            // Calculate the number of elements to compute in this thread.
-            // The result is uniform for all but the last thread.
-            TIdx const threadLastElemIdx(threadFirstElemIdx+threadElemExtent);
-            TIdx const threadLastElemIdxClipped((numElements > threadLastElemIdx) ? threadLastElemIdx : numElements);
-
-            for(TIdx i(threadFirstElemIdx); i<threadLastElemIdxClipped; ++i)
-            {
-                C[i] = A[i] + B[i];
-            }
-        }
-    }
-};
-
-auto main()
--> int
-{
-// Fallback for the CI with disabled sequential backend
-#if defined(ALPAKA_CI) && !defined(ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED)
-    return EXIT_SUCCESS;
-#else
-    // Define the index domain
-    using Dim = alpaka::dim::DimInt<1u>;
-    using Idx = std::size_t;
-
-    // Define the accelerator
-    //
-    // It is possible to choose from a set of accelerators
-    // that are defined in the alpaka::acc namespace e.g.:
-    // - AccGpuCudaRt
-    // - AccCpuThreads
-    // - AccCpuFibers
-    // - AccCpuOmp2Threads
-    // - AccCpuOmp2Blocks
-    // - AccCpuOmp4
-    // - AccCpuSerial
-    using Acc = alpaka::acc::AccCpuSerial<Dim, Idx>;
-    using DevAcc = alpaka::dev::Dev<Acc>;
-    using PltfAcc = alpaka::pltf::Pltf<DevAcc>;
-
-    // Defines the synchronization behavior of a queue
-    //
-    // choose between Blocking and NonBlocking
-    using QueueProperty = alpaka::queue::Blocking;
-    using QueueAcc = alpaka::queue::Queue<Acc, QueueProperty>;
-
-    // Select a device
-    DevAcc const devAcc(alpaka::pltf::getDevByIdx<PltfAcc>(0u));
-
-    // Create a queue on the device
-    QueueAcc queue(devAcc);
-
-    // Define the work division
-    Idx const numElements(123456);
-    Idx const elementsPerThread(3u);
-    alpaka::vec::Vec<Dim, Idx> const extent(numElements);
-
-    // Let alpaka calculate good block and grid sizes given our full problem extent
-    alpaka::workdiv::WorkDivMembers<Dim, Idx> const workDiv(
-        alpaka::workdiv::getValidWorkDiv<Acc>(
-            devAcc,
-            extent,
-            elementsPerThread,
-            false,
-            alpaka::workdiv::GridBlockExtentSubDivRestrictions::Unrestricted));
-
-    // Define the buffer element type
-    using Data = std::uint32_t;
-
-    // Get the host device for allocating memory on the host.
-    using DevHost = alpaka::dev::DevCpu;
-    using PltfHost = alpaka::pltf::Pltf<DevHost>;
-    DevHost const devHost(alpaka::pltf::getDevByIdx<PltfHost>(0u));
-
-    // Allocate 3 host memory buffers
-    using BufHost = alpaka::mem::buf::Buf<DevHost, Data, Dim, Idx>;
-    BufHost bufHostA(alpaka::mem::buf::alloc<Data, Idx>(devHost, extent));
-    BufHost bufHostB(alpaka::mem::buf::alloc<Data, Idx>(devHost, extent));
-    BufHost bufHostC(alpaka::mem::buf::alloc<Data, Idx>(devHost, extent));
-
-    // Initialize the host input vectors A and B
-    Data * const pBufHostA(alpaka::mem::view::getPtrNative(bufHostA));
-    Data * const pBufHostB(alpaka::mem::view::getPtrNative(bufHostB));
-    Data * const pBufHostC(alpaka::mem::view::getPtrNative(bufHostC));
-
-    // C++11 random generator for uniformly distributed numbers in {1,..,42}
-    std::random_device rd{};
-    std::default_random_engine eng{ rd() };
-    std::uniform_int_distribution<Data> dist(1, 42);
-
-    for (Idx i(0); i < numElements; ++i)
-    {
-        pBufHostA[i] = dist(eng);
-        pBufHostB[i] = dist(eng);
-        pBufHostC[i] = 0;
-    }
-
-    // Allocate 3 buffers on the accelerator
-    using BufAcc = alpaka::mem::buf::Buf<DevAcc, Data, Dim, Idx>;
-    BufAcc bufAccA(alpaka::mem::buf::alloc<Data, Idx>(devAcc, extent));
-    BufAcc bufAccB(alpaka::mem::buf::alloc<Data, Idx>(devAcc, extent));
-    BufAcc bufAccC(alpaka::mem::buf::alloc<Data, Idx>(devAcc, extent));
-
-    // Copy Host -> Acc
-    alpaka::mem::view::copy(queue, bufAccA, bufHostA, extent);
-    alpaka::mem::view::copy(queue, bufAccB, bufHostB, extent);
-    alpaka::mem::view::copy(queue, bufAccC, bufHostC, extent);
-
-    // Instantiate the kernel function object
-    VectorAddKernel kernel;
-
-    // Create the kernel execution task.
-    auto const taskKernel(alpaka::kernel::createTaskKernel<Acc>(
-        workDiv,
-        kernel,
-        alpaka::mem::view::getPtrNative(bufAccA),
-        alpaka::mem::view::getPtrNative(bufAccB),
-        alpaka::mem::view::getPtrNative(bufAccC),
-        numElements));
-
-    // Enqueue the kernel execution task
-    alpaka::queue::enqueue(queue, taskKernel);
-
-    // Copy back the result
-    alpaka::mem::view::copy(queue, bufHostC, bufAccC, extent);
-    alpaka::wait::wait(queue);
-
-    bool resultCorrect(true);
-    for(Idx i(0u);
-        i < numElements;
-        ++i)
-    {
-        Data const & val(pBufHostC[i]);
-        Data const correctResult(pBufHostA[i] + pBufHostB[i]);
-        if(val != correctResult)
-        {
-            std::cerr << "C[" << i << "] == " << val << " != " << correctResult << std::endl;
-            resultCorrect = false;
-        }
-    }
-
-    if(resultCorrect)
-    {
-        std::cout << "Execution results correct!" << std::endl;
-        return EXIT_SUCCESS;
-    }
-    else
-    {
-        std::cout << "Execution results incorrect!" << std::endl;
-        return EXIT_FAILURE;
-    }
-#endif
-}
diff --git a/thirdParty/alpaka/include/alpaka/acc/AccCpuFibers.hpp b/thirdParty/alpaka/include/alpaka/acc/AccCpuFibers.hpp
deleted file mode 100644
index 3e2c2af0ff..0000000000
--- a/thirdParty/alpaka/include/alpaka/acc/AccCpuFibers.hpp
+++ /dev/null
@@ -1,322 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, René Widera
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLED
-
-// Base classes.
-#include <alpaka/workdiv/WorkDivMembers.hpp>
-#include <alpaka/idx/gb/IdxGbRef.hpp>
-#include <alpaka/idx/bt/IdxBtRefFiberIdMap.hpp>
-#include <alpaka/atomic/AtomicNoOp.hpp>
-#include <alpaka/atomic/AtomicStdLibLock.hpp>
-#include <alpaka/atomic/AtomicHierarchy.hpp>
-#include <alpaka/math/MathStdLib.hpp>
-#include <alpaka/block/shared/dyn/BlockSharedMemDynBoostAlignedAlloc.hpp>
-#include <alpaka/block/shared/st/BlockSharedMemStMasterSync.hpp>
-#include <alpaka/block/sync/BlockSyncBarrierFiber.hpp>
-#include <alpaka/rand/RandStdLib.hpp>
-#include <alpaka/time/TimeStdLib.hpp>
-
-// Specialized traits.
-#include <alpaka/acc/Traits.hpp>
-#include <alpaka/dev/Traits.hpp>
-#include <alpaka/kernel/Traits.hpp>
-#include <alpaka/pltf/Traits.hpp>
-#include <alpaka/idx/Traits.hpp>
-
-// Implementation details.
-#include <alpaka/core/ClipCast.hpp>
-#include <alpaka/core/Concepts.hpp>
-#include <alpaka/core/Fibers.hpp>
-#include <alpaka/core/Unused.hpp>
-#include <alpaka/dev/DevCpu.hpp>
-
-#include <memory>
-#include <thread>
-#include <typeinfo>
-
-namespace alpaka
-{
-    namespace kernel
-    {
-        template<
-            typename TDim,
-            typename TIdx,
-            typename TKernelFnObj,
-            typename... TArgs>
-        class TaskKernelCpuFibers;
-    }
-    namespace acc
-    {
-        //#############################################################################
-        //! The CPU fibers accelerator.
-        //!
-        //! This accelerator allows parallel kernel execution on a CPU device.
-        //! It uses boost::fibers to implement the cooperative parallelism.
-        //! By using fibers the shared memory can reside in the closest memory/cache available.
-        //! Furthermore there is no false sharing between neighboring threads as it is the case in real multi-threading.
-        template<
-            typename TDim,
-            typename TIdx>
-        class AccCpuFibers final :
-            public workdiv::WorkDivMembers<TDim, TIdx>,
-            public idx::gb::IdxGbRef<TDim, TIdx>,
-            public idx::bt::IdxBtRefFiberIdMap<TDim, TIdx>,
-            public atomic::AtomicHierarchy<
-                atomic::AtomicStdLibLock<16>, // grid atomics
-                atomic::AtomicStdLibLock<16>, // block atomics
-                atomic::AtomicNoOp         // thread atomics
-            >,
-            public math::MathStdLib,
-            public block::shared::dyn::BlockSharedMemDynBoostAlignedAlloc,
-            public block::shared::st::BlockSharedMemStMasterSync,
-            public block::sync::BlockSyncBarrierFiber<TIdx>,
-            public rand::RandStdLib,
-            public time::TimeStdLib,
-            public concepts::Implements<ConceptAcc, AccCpuFibers<TDim, TIdx>>
-        {
-        public:
-            // Partial specialization with the correct TDim and TIdx is not allowed.
-            template<
-                typename TDim2,
-                typename TIdx2,
-                typename TKernelFnObj,
-                typename... TArgs>
-            friend class ::alpaka::kernel::TaskKernelCpuFibers;
-
-        private:
-            //-----------------------------------------------------------------------------
-            template<
-                typename TWorkDiv>
-            ALPAKA_FN_HOST AccCpuFibers(
-                TWorkDiv const & workDiv,
-                TIdx const & blockSharedMemDynSizeBytes) :
-                    workdiv::WorkDivMembers<TDim, TIdx>(workDiv),
-                    idx::gb::IdxGbRef<TDim, TIdx>(m_gridBlockIdx),
-                    idx::bt::IdxBtRefFiberIdMap<TDim, TIdx>(m_fibersToIndices),
-                    atomic::AtomicHierarchy<
-                        atomic::AtomicStdLibLock<16>, // atomics between grids
-                        atomic::AtomicStdLibLock<16>, // atomics between blocks
-                        atomic::AtomicNoOp         // atomics between threads
-                    >(),
-                    math::MathStdLib(),
-                    block::shared::dyn::BlockSharedMemDynBoostAlignedAlloc(static_cast<std::size_t>(blockSharedMemDynSizeBytes)),
-                    block::shared::st::BlockSharedMemStMasterSync(
-                        [this](){block::sync::syncBlockThreads(*this);},
-                        [this](){return (m_masterFiberId == boost::this_fiber::get_id());}),
-                    block::sync::BlockSyncBarrierFiber<TIdx>(
-                        workdiv::getWorkDiv<Block, Threads>(workDiv).prod()),
-                    rand::RandStdLib(),
-                    time::TimeStdLib(),
-                    m_gridBlockIdx(vec::Vec<TDim, TIdx>::zeros())
-            {}
-
-        public:
-            //-----------------------------------------------------------------------------
-            ALPAKA_FN_HOST AccCpuFibers(AccCpuFibers const &) = delete;
-            //-----------------------------------------------------------------------------
-            ALPAKA_FN_HOST AccCpuFibers(AccCpuFibers &&) = delete;
-            //-----------------------------------------------------------------------------
-            ALPAKA_FN_HOST auto operator=(AccCpuFibers const &) -> AccCpuFibers & = delete;
-            //-----------------------------------------------------------------------------
-            ALPAKA_FN_HOST auto operator=(AccCpuFibers &&) -> AccCpuFibers & = delete;
-            //-----------------------------------------------------------------------------
-            /*virtual*/ ~AccCpuFibers() = default;
-
-        private:
-            // getIdx
-            typename idx::bt::IdxBtRefFiberIdMap<TDim, TIdx>::FiberIdToIdxMap mutable m_fibersToIndices;  //!< The mapping of fibers id's to indices.
-            vec::Vec<TDim, TIdx> mutable m_gridBlockIdx;                    //!< The index of the currently executed block.
-
-            // allocBlockSharedArr
-            boost::fibers::fiber::id mutable m_masterFiberId;           //!< The id of the master fiber.
-        };
-    }
-
-    namespace acc
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CPU fibers accelerator accelerator type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct AccType<
-                acc::AccCpuFibers<TDim, TIdx>>
-            {
-                using type = acc::AccCpuFibers<TDim, TIdx>;
-            };
-            //#############################################################################
-            //! The CPU fibers accelerator device properties get trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct GetAccDevProps<
-                acc::AccCpuFibers<TDim, TIdx>>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto getAccDevProps(
-                    dev::DevCpu const & dev)
-                -> alpaka::acc::AccDevProps<TDim, TIdx>
-                {
-                    alpaka::ignore_unused(dev);
-
-#ifdef ALPAKA_CI
-                    auto const blockThreadCountMax(static_cast<TIdx>(3));
-#else
-                    auto const blockThreadCountMax(static_cast<TIdx>(4));  // \TODO: What is the maximum? Just set a reasonable value?
-#endif
-                    return {
-                        // m_multiProcessorCount
-                        std::max(static_cast<TIdx>(1), alpaka::core::clipCast<TIdx>(std::thread::hardware_concurrency())),   // \TODO: This may be inaccurate.
-                        // m_gridBlockExtentMax
-                        vec::Vec<TDim, TIdx>::all(std::numeric_limits<TIdx>::max()),
-                        // m_gridBlockCountMax
-                        std::numeric_limits<TIdx>::max(),
-                        // m_blockThreadExtentMax
-                        vec::Vec<TDim, TIdx>::all(blockThreadCountMax),
-                        // m_blockThreadCountMax
-                        blockThreadCountMax,
-                        // m_threadElemExtentMax
-                        vec::Vec<TDim, TIdx>::all(std::numeric_limits<TIdx>::max()),
-                        // m_threadElemCountMax
-                        std::numeric_limits<TIdx>::max()};
-                }
-            };
-            //#############################################################################
-            //! The CPU fibers accelerator name trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct GetAccName<
-                acc::AccCpuFibers<TDim, TIdx>>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto getAccName()
-                -> std::string
-                {
-                    return "AccCpuFibers<" + std::to_string(TDim::value) + "," + typeid(TIdx).name() + ">";
-                }
-            };
-        }
-    }
-    namespace dev
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CPU fibers accelerator device type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct DevType<
-                acc::AccCpuFibers<TDim, TIdx>>
-            {
-                using type = dev::DevCpu;
-            };
-        }
-    }
-    namespace dim
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CPU fibers accelerator dimension getter trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct DimType<
-                acc::AccCpuFibers<TDim, TIdx>>
-            {
-                using type = TDim;
-            };
-        }
-    }
-    namespace kernel
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CPU fibers accelerator execution task type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx,
-                typename TWorkDiv,
-                typename TKernelFnObj,
-                typename... TArgs>
-            struct CreateTaskKernel<
-                acc::AccCpuFibers<TDim, TIdx>,
-                TWorkDiv,
-                TKernelFnObj,
-                TArgs...>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto createTaskKernel(
-                    TWorkDiv const & workDiv,
-                    TKernelFnObj const & kernelFnObj,
-                    TArgs && ... args)
-#ifdef BOOST_NO_CXX14_RETURN_TYPE_DEDUCTION
-                -> kernel::TaskKernelCpuFibers<
-                    TDim,
-                    TIdx,
-                    TKernelFnObj,
-                    TArgs...>
-#endif
-                {
-                    return
-                        kernel::TaskKernelCpuFibers<
-                            TDim,
-                            TIdx,
-                            TKernelFnObj,
-                            TArgs...>(
-                                workDiv,
-                                kernelFnObj,
-                                std::forward<TArgs>(args)...);
-                }
-            };
-        }
-    }
-    namespace pltf
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CPU fibers execution task platform type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct PltfType<
-                acc::AccCpuFibers<TDim, TIdx>>
-            {
-                using type = pltf::PltfCpu;
-            };
-        }
-    }
-    namespace idx
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CPU fibers accelerator idx type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct IdxType<
-                acc::AccCpuFibers<TDim, TIdx>>
-            {
-                using type = TIdx;
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/alpaka/include/alpaka/acc/AccCpuOmp2Blocks.hpp b/thirdParty/alpaka/include/alpaka/acc/AccCpuOmp2Blocks.hpp
deleted file mode 100644
index 55465346ae..0000000000
--- a/thirdParty/alpaka/include/alpaka/acc/AccCpuOmp2Blocks.hpp
+++ /dev/null
@@ -1,311 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, René Widera
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLED
-
-#if _OPENMP < 200203
-    #error If ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLED is set, the compiler has to support OpenMP 2.0 or higher!
-#endif
-
-// Base classes.
-#include <alpaka/workdiv/WorkDivMembers.hpp>
-#include <alpaka/idx/gb/IdxGbRef.hpp>
-#include <alpaka/idx/bt/IdxBtZero.hpp>
-#include <alpaka/atomic/AtomicNoOp.hpp>
-#include <alpaka/atomic/AtomicStdLibLock.hpp>
-#include <alpaka/atomic/AtomicOmpBuiltIn.hpp>
-#include <alpaka/atomic/AtomicHierarchy.hpp>
-#include <alpaka/math/MathStdLib.hpp>
-#include <alpaka/block/shared/dyn/BlockSharedMemDynBoostAlignedAlloc.hpp>
-#include <alpaka/block/shared/st/BlockSharedMemStNoSync.hpp>
-#include <alpaka/block/sync/BlockSyncNoOp.hpp>
-#include <alpaka/rand/RandStdLib.hpp>
-#include <alpaka/time/TimeOmp.hpp>
-
-// Specialized traits.
-#include <alpaka/acc/Traits.hpp>
-#include <alpaka/dev/Traits.hpp>
-#include <alpaka/kernel/Traits.hpp>
-#include <alpaka/pltf/Traits.hpp>
-#include <alpaka/idx/Traits.hpp>
-
-// Implementation details.
-#include <alpaka/core/Concepts.hpp>
-#include <alpaka/core/Unused.hpp>
-#include <alpaka/dev/DevCpu.hpp>
-
-#include <limits>
-#include <typeinfo>
-
-namespace alpaka
-{
-    namespace kernel
-    {
-        template<
-            typename TDim,
-            typename TIdx,
-            typename TKernelFnObj,
-            typename... TArgs>
-        class TaskKernelCpuOmp2Blocks;
-    }
-    namespace acc
-    {
-        //#############################################################################
-        //! The CPU OpenMP 2.0 block accelerator.
-        //!
-        //! This accelerator allows parallel kernel execution on a CPU device.
-        //! It uses OpenMP 2.0 to implement the grid block parallelism.
-        //! The block idx is restricted to 1x1x1.
-        template<
-            typename TDim,
-            typename TIdx>
-        class AccCpuOmp2Blocks final :
-            public workdiv::WorkDivMembers<TDim, TIdx>,
-            public idx::gb::IdxGbRef<TDim, TIdx>,
-            public idx::bt::IdxBtZero<TDim, TIdx>,
-            public atomic::AtomicHierarchy<
-                atomic::AtomicStdLibLock<16>,   // grid atomics
-                atomic::AtomicOmpBuiltIn,    // block atomics
-                atomic::AtomicNoOp           // thread atomics
-            >,
-            public math::MathStdLib,
-            public block::shared::dyn::BlockSharedMemDynBoostAlignedAlloc,
-            public block::shared::st::BlockSharedMemStNoSync,
-            public block::sync::BlockSyncNoOp,
-            public rand::RandStdLib,
-            public time::TimeOmp,
-            public concepts::Implements<ConceptAcc, AccCpuOmp2Blocks<TDim, TIdx>>
-        {
-        public:
-            // Partial specialization with the correct TDim and TIdx is not allowed.
-            template<
-                typename TDim2,
-                typename TIdx2,
-                typename TKernelFnObj,
-                typename... TArgs>
-            friend class ::alpaka::kernel::TaskKernelCpuOmp2Blocks;
-
-        private:
-            //-----------------------------------------------------------------------------
-            template<
-                typename TWorkDiv>
-            ALPAKA_FN_HOST AccCpuOmp2Blocks(
-                TWorkDiv const & workDiv,
-                TIdx const & blockSharedMemDynSizeBytes) :
-                    workdiv::WorkDivMembers<TDim, TIdx>(workDiv),
-                    idx::gb::IdxGbRef<TDim, TIdx>(m_gridBlockIdx),
-                    idx::bt::IdxBtZero<TDim, TIdx>(),
-                    atomic::AtomicHierarchy<
-                        atomic::AtomicStdLibLock<16>,// atomics between grids
-                        atomic::AtomicOmpBuiltIn, // atomics between blocks
-                        atomic::AtomicNoOp        // atomics between threads
-                    >(),
-                    math::MathStdLib(),
-                    block::shared::dyn::BlockSharedMemDynBoostAlignedAlloc(static_cast<std::size_t>(blockSharedMemDynSizeBytes)),
-                    block::shared::st::BlockSharedMemStNoSync(),
-                    block::sync::BlockSyncNoOp(),
-                    rand::RandStdLib(),
-                    time::TimeOmp(),
-                    m_gridBlockIdx(vec::Vec<TDim, TIdx>::zeros())
-            {}
-
-        public:
-            //-----------------------------------------------------------------------------
-            ALPAKA_FN_HOST AccCpuOmp2Blocks(AccCpuOmp2Blocks const &) = delete;
-            //-----------------------------------------------------------------------------
-            ALPAKA_FN_HOST AccCpuOmp2Blocks(AccCpuOmp2Blocks &&) = delete;
-            //-----------------------------------------------------------------------------
-            ALPAKA_FN_HOST auto operator=(AccCpuOmp2Blocks const &) -> AccCpuOmp2Blocks & = delete;
-            //-----------------------------------------------------------------------------
-            ALPAKA_FN_HOST auto operator=(AccCpuOmp2Blocks &&) -> AccCpuOmp2Blocks & = delete;
-            //-----------------------------------------------------------------------------
-            /*virtual*/ ~AccCpuOmp2Blocks() = default;
-
-        private:
-            // getIdx
-            vec::Vec<TDim, TIdx> mutable m_gridBlockIdx;   //!< The index of the currently executed block.
-        };
-    }
-
-    namespace acc
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CPU OpenMP 2.0 block accelerator accelerator type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct AccType<
-                acc::AccCpuOmp2Blocks<TDim, TIdx>>
-            {
-                using type = acc::AccCpuOmp2Blocks<TDim, TIdx>;
-            };
-            //#############################################################################
-            //! The CPU OpenMP 2.0 block accelerator device properties get trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct GetAccDevProps<
-                acc::AccCpuOmp2Blocks<TDim, TIdx>>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto getAccDevProps(
-                    dev::DevCpu const & dev)
-                -> alpaka::acc::AccDevProps<TDim, TIdx>
-                {
-                    alpaka::ignore_unused(dev);
-
-                    return {
-                        // m_multiProcessorCount
-                        static_cast<TIdx>(1),
-                        // m_gridBlockExtentMax
-                        vec::Vec<TDim, TIdx>::all(std::numeric_limits<TIdx>::max()),
-                        // m_gridBlockCountMax
-                        std::numeric_limits<TIdx>::max(),
-                        // m_blockThreadExtentMax
-                        vec::Vec<TDim, TIdx>::ones(),
-                        // m_blockThreadCountMax
-                        static_cast<TIdx>(1),
-                        // m_threadElemExtentMax
-                        vec::Vec<TDim, TIdx>::all(std::numeric_limits<TIdx>::max()),
-                        // m_threadElemCountMax
-                        std::numeric_limits<TIdx>::max()};
-                }
-            };
-            //#############################################################################
-            //! The CPU OpenMP 2.0 block accelerator name trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct GetAccName<
-                acc::AccCpuOmp2Blocks<TDim, TIdx>>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto getAccName()
-                -> std::string
-                {
-                    return "AccCpuOmp2Blocks<" + std::to_string(TDim::value) + "," + typeid(TIdx).name() + ">";
-                }
-            };
-        }
-    }
-    namespace dev
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CPU OpenMP 2.0 block accelerator device type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct DevType<
-                acc::AccCpuOmp2Blocks<TDim, TIdx>>
-            {
-                using type = dev::DevCpu;
-            };
-        }
-    }
-    namespace dim
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CPU OpenMP 2.0 block accelerator dimension getter trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct DimType<
-                acc::AccCpuOmp2Blocks<TDim, TIdx>>
-            {
-                using type = TDim;
-            };
-        }
-    }
-    namespace kernel
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CPU OpenMP 2.0 block accelerator execution task type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx,
-                typename TWorkDiv,
-                typename TKernelFnObj,
-                typename... TArgs>
-            struct CreateTaskKernel<
-                acc::AccCpuOmp2Blocks<TDim, TIdx>,
-                TWorkDiv,
-                TKernelFnObj,
-                TArgs...>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto createTaskKernel(
-                    TWorkDiv const & workDiv,
-                    TKernelFnObj const & kernelFnObj,
-                    TArgs && ... args)
-#ifdef BOOST_NO_CXX14_RETURN_TYPE_DEDUCTION
-                -> kernel::TaskKernelCpuOmp2Blocks<
-                    TDim,
-                    TIdx,
-                    TKernelFnObj,
-                    TArgs...>
-#endif
-                {
-                    return
-                        kernel::TaskKernelCpuOmp2Blocks<
-                            TDim,
-                            TIdx,
-                            TKernelFnObj,
-                            TArgs...>(
-                                workDiv,
-                                kernelFnObj,
-                                std::forward<TArgs>(args)...);
-                }
-            };
-        }
-    }
-    namespace pltf
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CPU OpenMP 2.0 block execution task platform type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct PltfType<
-                acc::AccCpuOmp2Blocks<TDim, TIdx>>
-            {
-                using type = pltf::PltfCpu;
-            };
-        }
-    }
-    namespace idx
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CPU OpenMP 2.0 block accelerator idx type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct IdxType<
-                acc::AccCpuOmp2Blocks<TDim, TIdx>>
-            {
-                using type = TIdx;
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/alpaka/include/alpaka/acc/AccCpuOmp2Threads.hpp b/thirdParty/alpaka/include/alpaka/acc/AccCpuOmp2Threads.hpp
deleted file mode 100644
index 3ef209d119..0000000000
--- a/thirdParty/alpaka/include/alpaka/acc/AccCpuOmp2Threads.hpp
+++ /dev/null
@@ -1,319 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, René Widera
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLED
-
-#if _OPENMP < 200203
-    #error If ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLED is set, the compiler has to support OpenMP 2.0 or higher!
-#endif
-
-// Base classes.
-#include <alpaka/workdiv/WorkDivMembers.hpp>
-#include <alpaka/idx/gb/IdxGbRef.hpp>
-#include <alpaka/idx/bt/IdxBtOmp.hpp>
-#include <alpaka/atomic/AtomicStdLibLock.hpp>
-#include <alpaka/atomic/AtomicOmpBuiltIn.hpp>
-#include <alpaka/atomic/AtomicHierarchy.hpp>
-#include <alpaka/math/MathStdLib.hpp>
-#include <alpaka/block/shared/dyn/BlockSharedMemDynBoostAlignedAlloc.hpp>
-#include <alpaka/block/shared/st/BlockSharedMemStMasterSync.hpp>
-#include <alpaka/block/sync/BlockSyncBarrierOmp.hpp>
-#include <alpaka/rand/RandStdLib.hpp>
-#include <alpaka/time/TimeOmp.hpp>
-
-// Specialized traits.
-#include <alpaka/acc/Traits.hpp>
-#include <alpaka/kernel/Traits.hpp>
-#include <alpaka/dev/Traits.hpp>
-#include <alpaka/pltf/Traits.hpp>
-#include <alpaka/idx/Traits.hpp>
-
-// Implementation details.
-#include <alpaka/core/ClipCast.hpp>
-#include <alpaka/core/Concepts.hpp>
-#include <alpaka/core/Unused.hpp>
-#include <alpaka/dev/DevCpu.hpp>
-
-#include <omp.h>
-
-#include <limits>
-#include <typeinfo>
-
-namespace alpaka
-{
-    namespace kernel
-    {
-        template<
-            typename TDim,
-            typename TIdx,
-            typename TKernelFnObj,
-            typename... TArgs>
-        class TaskKernelCpuOmp2Threads;
-    }
-    namespace acc
-    {
-        //#############################################################################
-        //! The CPU OpenMP 2.0 thread accelerator.
-        //!
-        //! This accelerator allows parallel kernel execution on a CPU device.
-        //! It uses OpenMP 2.0 to implement the block thread parallelism.
-        template<
-            typename TDim,
-            typename TIdx>
-        class AccCpuOmp2Threads final :
-            public workdiv::WorkDivMembers<TDim, TIdx>,
-            public idx::gb::IdxGbRef<TDim, TIdx>,
-            public idx::bt::IdxBtOmp<TDim, TIdx>,
-            public atomic::AtomicHierarchy<
-                atomic::AtomicStdLibLock<16>,   // grid atomics
-                atomic::AtomicOmpBuiltIn,    // block atomics
-                atomic::AtomicOmpBuiltIn     // thread atomics
-            >,
-            public math::MathStdLib,
-            public block::shared::dyn::BlockSharedMemDynBoostAlignedAlloc,
-            public block::shared::st::BlockSharedMemStMasterSync,
-            public block::sync::BlockSyncBarrierOmp,
-            public rand::RandStdLib,
-            public time::TimeOmp,
-            public concepts::Implements<ConceptAcc, AccCpuOmp2Threads<TDim, TIdx>>
-        {
-        public:
-            // Partial specialization with the correct TDim and TIdx is not allowed.
-            template<
-                typename TDim2,
-                typename TIdx2,
-                typename TKernelFnObj,
-                typename... TArgs>
-            friend class ::alpaka::kernel::TaskKernelCpuOmp2Threads;
-
-        private:
-            //-----------------------------------------------------------------------------
-            template<
-                typename TWorkDiv>
-            ALPAKA_FN_HOST AccCpuOmp2Threads(
-                TWorkDiv const & workDiv,
-                TIdx const & blockSharedMemDynSizeBytes) :
-                    workdiv::WorkDivMembers<TDim, TIdx>(workDiv),
-                    idx::gb::IdxGbRef<TDim, TIdx>(m_gridBlockIdx),
-                    idx::bt::IdxBtOmp<TDim, TIdx>(),
-                    atomic::AtomicHierarchy<
-                        atomic::AtomicStdLibLock<16>,// atomics between grids
-                        atomic::AtomicOmpBuiltIn, // atomics between blocks
-                        atomic::AtomicOmpBuiltIn  // atomics between threads
-                    >(),
-                    math::MathStdLib(),
-                    block::shared::dyn::BlockSharedMemDynBoostAlignedAlloc(static_cast<std::size_t>(blockSharedMemDynSizeBytes)),
-                    block::shared::st::BlockSharedMemStMasterSync(
-                        [this](){block::sync::syncBlockThreads(*this);},
-                        [](){return (::omp_get_thread_num() == 0);}),
-                    block::sync::BlockSyncBarrierOmp(),
-                    rand::RandStdLib(),
-                    time::TimeOmp(),
-                    m_gridBlockIdx(vec::Vec<TDim, TIdx>::zeros())
-            {}
-
-        public:
-            //-----------------------------------------------------------------------------
-            ALPAKA_FN_HOST AccCpuOmp2Threads(AccCpuOmp2Threads const &) = delete;
-            //-----------------------------------------------------------------------------
-            ALPAKA_FN_HOST AccCpuOmp2Threads(AccCpuOmp2Threads &&) = delete;
-            //-----------------------------------------------------------------------------
-            ALPAKA_FN_HOST auto operator=(AccCpuOmp2Threads const &) -> AccCpuOmp2Threads & = delete;
-            //-----------------------------------------------------------------------------
-            ALPAKA_FN_HOST auto operator=(AccCpuOmp2Threads &&) -> AccCpuOmp2Threads & = delete;
-            //-----------------------------------------------------------------------------
-            /*virtual*/ ~AccCpuOmp2Threads() = default;
-
-        private:
-            // getIdx
-            vec::Vec<TDim, TIdx> mutable m_gridBlockIdx;  //!< The index of the currently executed block.
-        };
-    }
-
-    namespace acc
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CPU OpenMP 2.0 thread accelerator accelerator type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct AccType<
-                acc::AccCpuOmp2Threads<TDim, TIdx>>
-            {
-                using type = acc::AccCpuOmp2Threads<TDim, TIdx>;
-            };
-            //#############################################################################
-            //! The CPU OpenMP 2.0 thread accelerator device properties get trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct GetAccDevProps<
-                acc::AccCpuOmp2Threads<TDim, TIdx>>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto getAccDevProps(
-                    dev::DevCpu const & dev)
-                -> alpaka::acc::AccDevProps<TDim, TIdx>
-                {
-                    alpaka::ignore_unused(dev);
-
-#ifdef ALPAKA_CI
-                    auto const blockThreadCountMax(alpaka::core::clipCast<TIdx>(std::min(4, ::omp_get_max_threads())));
-#else
-                    auto const blockThreadCountMax(alpaka::core::clipCast<TIdx>(::omp_get_max_threads()));
-#endif
-                    return {
-                        // m_multiProcessorCount
-                        static_cast<TIdx>(1),
-                        // m_gridBlockExtentMax
-                        vec::Vec<TDim, TIdx>::all(std::numeric_limits<TIdx>::max()),
-                        // m_gridBlockCountMax
-                        std::numeric_limits<TIdx>::max(),
-                        // m_blockThreadExtentMax
-                        vec::Vec<TDim, TIdx>::all(blockThreadCountMax),
-                        // m_blockThreadCountMax
-                        blockThreadCountMax,
-                        // m_threadElemExtentMax
-                        vec::Vec<TDim, TIdx>::all(std::numeric_limits<TIdx>::max()),
-                        // m_threadElemCountMax
-                        std::numeric_limits<TIdx>::max()};
-                }
-            };
-            //#############################################################################
-            //! The CPU OpenMP 2.0 thread accelerator name trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct GetAccName<
-                acc::AccCpuOmp2Threads<TDim, TIdx>>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto getAccName()
-                -> std::string
-                {
-                    return "AccCpuOmp2Threads<" + std::to_string(TDim::value) + "," + typeid(TIdx).name() + ">";
-                }
-            };
-        }
-    }
-    namespace dev
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CPU OpenMP 2.0 thread accelerator device type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct DevType<
-                acc::AccCpuOmp2Threads<TDim, TIdx>>
-            {
-                using type = dev::DevCpu;
-            };
-        }
-    }
-    namespace dim
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CPU OpenMP 2.0 thread accelerator dimension getter trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct DimType<
-                acc::AccCpuOmp2Threads<TDim, TIdx>>
-            {
-                using type = TDim;
-            };
-        }
-    }
-    namespace kernel
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CPU OpenMP 2.0 thread accelerator execution task type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx,
-                typename TWorkDiv,
-                typename TKernelFnObj,
-                typename... TArgs>
-            struct CreateTaskKernel<
-                acc::AccCpuOmp2Threads<TDim, TIdx>,
-                TWorkDiv,
-                TKernelFnObj,
-                TArgs...>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto createTaskKernel(
-                    TWorkDiv const & workDiv,
-                    TKernelFnObj const & kernelFnObj,
-                    TArgs && ... args)
-#ifdef BOOST_NO_CXX14_RETURN_TYPE_DEDUCTION
-                -> kernel::TaskKernelCpuOmp2Threads<
-                    TDim,
-                    TIdx,
-                    TKernelFnObj,
-                    TArgs...>
-#endif
-                {
-                    return
-                        kernel::TaskKernelCpuOmp2Threads<
-                            TDim,
-                            TIdx,
-                            TKernelFnObj,
-                            TArgs...>(
-                                workDiv,
-                                kernelFnObj,
-                                std::forward<TArgs>(args)...);
-                }
-            };
-        }
-    }
-    namespace pltf
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CPU OpenMP 2.0 thread execution task platform type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct PltfType<
-                acc::AccCpuOmp2Threads<TDim, TIdx>>
-            {
-                using type = pltf::PltfCpu;
-            };
-        }
-    }
-    namespace idx
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CPU OpenMP 2.0 thread accelerator idx type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct IdxType<
-                acc::AccCpuOmp2Threads<TDim, TIdx>>
-            {
-                using type = TIdx;
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/alpaka/include/alpaka/acc/AccCpuOmp4.hpp b/thirdParty/alpaka/include/alpaka/acc/AccCpuOmp4.hpp
deleted file mode 100644
index 2910fc032b..0000000000
--- a/thirdParty/alpaka/include/alpaka/acc/AccCpuOmp4.hpp
+++ /dev/null
@@ -1,319 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, René Widera
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_CPU_BT_OMP4_ENABLED
-
-#if _OPENMP < 201307
-    #error If ALPAKA_ACC_CPU_BT_OMP4_ENABLED is set, the compiler has to support OpenMP 4.0 or higher!
-#endif
-
-// Base classes.
-#include <alpaka/workdiv/WorkDivMembers.hpp>
-#include <alpaka/idx/gb/IdxGbRef.hpp>
-#include <alpaka/idx/bt/IdxBtOmp.hpp>
-#include <alpaka/atomic/AtomicStdLibLock.hpp>
-#include <alpaka/atomic/AtomicOmpBuiltIn.hpp>
-#include <alpaka/atomic/AtomicHierarchy.hpp>
-#include <alpaka/math/MathStdLib.hpp>
-#include <alpaka/block/shared/dyn/BlockSharedMemDynBoostAlignedAlloc.hpp>
-#include <alpaka/block/shared/st/BlockSharedMemStMasterSync.hpp>
-#include <alpaka/block/sync/BlockSyncBarrierOmp.hpp>
-#include <alpaka/rand/RandStdLib.hpp>
-#include <alpaka/time/TimeOmp.hpp>
-
-// Specialized traits.
-#include <alpaka/acc/Traits.hpp>
-#include <alpaka/kernel/Traits.hpp>
-#include <alpaka/dev/Traits.hpp>
-#include <alpaka/pltf/Traits.hpp>
-#include <alpaka/idx/Traits.hpp>
-
-// Implementation details.
-#include <alpaka/core/ClipCast.hpp>
-#include <alpaka/core/Concepts.hpp>
-#include <alpaka/core/Unused.hpp>
-#include <alpaka/dev/DevCpu.hpp>
-
-#include <omp.h>
-
-#include <limits>
-#include <typeinfo>
-
-namespace alpaka
-{
-    namespace kernel
-    {
-        template<
-            typename TDim,
-            typename TIdx,
-            typename TKernelFnObj,
-            typename... TArgs>
-        class TaskKernelCpuOmp4;
-    }
-    namespace acc
-    {
-        //#############################################################################
-        //! The CPU OpenMP 4.0 accelerator.
-        //!
-        //! This accelerator allows parallel kernel execution on a CPU device.
-        //! It uses CPU OpenMP4 to implement the parallelism.
-        template<
-            typename TDim,
-            typename TIdx>
-        class AccCpuOmp4 final :
-            public workdiv::WorkDivMembers<TDim, TIdx>,
-            public idx::gb::IdxGbRef<TDim, TIdx>,
-            public idx::bt::IdxBtOmp<TDim, TIdx>,
-            public atomic::AtomicHierarchy<
-                atomic::AtomicStdLibLock<16>,   // grid atomics
-                atomic::AtomicOmpBuiltIn,    // block atomics
-                atomic::AtomicOmpBuiltIn     // thread atomics
-            >,
-            public math::MathStdLib,
-            public block::shared::dyn::BlockSharedMemDynBoostAlignedAlloc,
-            public block::shared::st::BlockSharedMemStMasterSync,
-            public block::sync::BlockSyncBarrierOmp,
-            public rand::RandStdLib,
-            public time::TimeOmp,
-            public concepts::Implements<ConceptAcc, AccCpuOmp4<TDim, TIdx>>
-        {
-        public:
-            // Partial specialization with the correct TDim and TIdx is not allowed.
-            template<
-                typename TDim2,
-                typename TIdx2,
-                typename TKernelFnObj,
-                typename... TArgs>
-            friend class ::alpaka::kernel::TaskKernelCpuOmp4;
-
-        private:
-            //-----------------------------------------------------------------------------
-            template<
-                typename TWorkDiv>
-            ALPAKA_FN_HOST AccCpuOmp4(
-                TWorkDiv const & workDiv,
-                TIdx const & blockSharedMemDynSizeBytes) :
-                    workdiv::WorkDivMembers<TDim, TIdx>(workDiv),
-                    idx::gb::IdxGbRef<TDim, TIdx>(m_gridBlockIdx),
-                    idx::bt::IdxBtOmp<TDim, TIdx>(),
-                    atomic::AtomicHierarchy<
-                        atomic::AtomicStdLibLock<16>,// atomics between grids
-                        atomic::AtomicOmpBuiltIn, // atomics between blocks
-                        atomic::AtomicOmpBuiltIn  // atomics between threads
-                    >(),
-                    math::MathStdLib(),
-                    block::shared::dyn::BlockSharedMemDynBoostAlignedAlloc(static_cast<std::size_t>(blockSharedMemDynSizeBytes)),
-                    block::shared::st::BlockSharedMemStMasterSync(
-                        [this](){block::sync::syncBlockThreads(*this);},
-                        [](){return (::omp_get_thread_num() == 0);}),
-                    block::sync::BlockSyncBarrierOmp(),
-                    rand::RandStdLib(),
-                    time::TimeOmp(),
-                    m_gridBlockIdx(vec::Vec<TDim, TIdx>::zeros())
-            {}
-
-        public:
-            //-----------------------------------------------------------------------------
-            ALPAKA_FN_HOST AccCpuOmp4(AccCpuOmp4 const &) = delete;
-            //-----------------------------------------------------------------------------
-            ALPAKA_FN_HOST AccCpuOmp4(AccCpuOmp4 &&) = delete;
-            //-----------------------------------------------------------------------------
-            ALPAKA_FN_HOST auto operator=(AccCpuOmp4 const &) -> AccCpuOmp4 & = delete;
-            //-----------------------------------------------------------------------------
-            ALPAKA_FN_HOST auto operator=(AccCpuOmp4 &&) -> AccCpuOmp4 & = delete;
-            //-----------------------------------------------------------------------------
-            /*virtual*/ ~AccCpuOmp4() = default;
-
-        private:
-            // getIdx
-            vec::Vec<TDim, TIdx> mutable m_gridBlockIdx;    //!< The index of the currently executed block.
-        };
-    }
-
-    namespace acc
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CPU OpenMP 4.0 accelerator accelerator type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct AccType<
-                acc::AccCpuOmp4<TDim, TIdx>>
-            {
-                using type = acc::AccCpuOmp4<TDim, TIdx>;
-            };
-            //#############################################################################
-            //! The CPU OpenMP 4.0 accelerator device properties get trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct GetAccDevProps<
-                acc::AccCpuOmp4<TDim, TIdx>>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto getAccDevProps(
-                    dev::DevCpu const & dev)
-                -> acc::AccDevProps<TDim, TIdx>
-                {
-                    alpaka::ignore_unused(dev);
-
-#ifdef ALPAKA_CI
-                    auto const blockThreadCountMax(alpaka::core::clipCast<TIdx>(std::min(4, ::omp_get_max_threads())));
-#else
-                    auto const blockThreadCountMax(alpaka::core::clipCast<TIdx>(::omp_get_max_threads()));
-#endif
-                    return {
-                        // m_multiProcessorCount
-                        static_cast<TIdx>(1),
-                        // m_gridBlockExtentMax
-                        vec::Vec<TDim, TIdx>::all(std::numeric_limits<TIdx>::max()),
-                        // m_gridBlockCountMax
-                        std::numeric_limits<TIdx>::max(),
-                        // m_blockThreadExtentMax
-                        vec::Vec<TDim, TIdx>::all(blockThreadCountMax),
-                        // m_blockThreadCountMax
-                        blockThreadCountMax,
-                        // m_threadElemExtentMax
-                        vec::Vec<TDim, TIdx>::all(std::numeric_limits<TIdx>::max()),
-                        // m_threadElemCountMax
-                        std::numeric_limits<TIdx>::max()};
-                }
-            };
-            //#############################################################################
-            //! The CPU OpenMP 4.0 accelerator name trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct GetAccName<
-                acc::AccCpuOmp4<TDim, TIdx>>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto getAccName()
-                -> std::string
-                {
-                    return "AccCpuOmp4<" + std::to_string(TDim::value) + "," + typeid(TIdx).name() + ">";
-                }
-            };
-        }
-    }
-    namespace dev
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CPU OpenMP 4.0 accelerator device type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct DevType<
-                acc::AccCpuOmp4<TDim, TIdx>>
-            {
-                using type = dev::DevCpu;
-            };
-        }
-    }
-    namespace dim
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CPU OpenMP 4.0 accelerator dimension getter trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct DimType<
-                acc::AccCpuOmp4<TDim, TIdx>>
-            {
-                using type = TDim;
-            };
-        }
-    }
-    namespace kernel
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CPU OpenMP 4.0 accelerator execution task type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx,
-                typename TWorkDiv,
-                typename TKernelFnObj,
-                typename... TArgs>
-            struct CreateTaskKernel<
-                acc::AccCpuOmp4<TDim, TIdx>,
-                TWorkDiv,
-                TKernelFnObj,
-                TArgs...>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto createTaskKernel(
-                    TWorkDiv const & workDiv,
-                    TKernelFnObj const & kernelFnObj,
-                    TArgs && ... args)
-#ifdef BOOST_NO_CXX14_RETURN_TYPE_DEDUCTION
-                -> kernel::TaskKernelCpuOmp4<
-                    TDim,
-                    TIdx,
-                    TKernelFnObj,
-                    TArgs...>
-#endif
-                {
-                    return
-                        kernel::TaskKernelCpuOmp4<
-                            TDim,
-                            TIdx,
-                            TKernelFnObj,
-                            TArgs...>(
-                                workDiv,
-                                kernelFnObj,
-                                std::forward<TArgs>(args)...);
-                }
-            };
-        }
-    }
-    namespace pltf
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CPU OpenMP 4.0 execution task platform type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct PltfType<
-                acc::AccCpuOmp4<TDim, TIdx>>
-            {
-                using type = pltf::PltfCpu;
-            };
-        }
-    }
-    namespace idx
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CPU OpenMP 4.0 accelerator idx type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct IdxType<
-                acc::AccCpuOmp4<TDim, TIdx>>
-            {
-                using type = TIdx;
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/alpaka/include/alpaka/acc/AccCpuSerial.hpp b/thirdParty/alpaka/include/alpaka/acc/AccCpuSerial.hpp
deleted file mode 100644
index 43ff644c0e..0000000000
--- a/thirdParty/alpaka/include/alpaka/acc/AccCpuSerial.hpp
+++ /dev/null
@@ -1,305 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, René Widera
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED
-
-// Base classes.
-#include <alpaka/workdiv/WorkDivMembers.hpp>
-#include <alpaka/idx/gb/IdxGbRef.hpp>
-#include <alpaka/idx/bt/IdxBtZero.hpp>
-#include <alpaka/atomic/AtomicNoOp.hpp>
-#include <alpaka/atomic/AtomicStdLibLock.hpp>
-#include <alpaka/atomic/AtomicHierarchy.hpp>
-#include <alpaka/math/MathStdLib.hpp>
-#include <alpaka/block/shared/dyn/BlockSharedMemDynBoostAlignedAlloc.hpp>
-#include <alpaka/block/shared/st/BlockSharedMemStNoSync.hpp>
-#include <alpaka/block/sync/BlockSyncNoOp.hpp>
-#include <alpaka/rand/RandStdLib.hpp>
-#include <alpaka/time/TimeStdLib.hpp>
-
-// Specialized traits.
-#include <alpaka/acc/Traits.hpp>
-#include <alpaka/dev/Traits.hpp>
-#include <alpaka/kernel/Traits.hpp>
-#include <alpaka/pltf/Traits.hpp>
-#include <alpaka/idx/Traits.hpp>
-
-// Implementation details.
-#include <alpaka/core/Concepts.hpp>
-#include <alpaka/core/Unused.hpp>
-#include <alpaka/dev/DevCpu.hpp>
-
-#include <memory>
-#include <typeinfo>
-
-namespace alpaka
-{
-    namespace kernel
-    {
-        template<
-            typename TDim,
-            typename TIdx,
-            typename TKernelFnObj,
-            typename... TArgs>
-        class TaskKernelCpuSerial;
-    }
-    namespace acc
-    {
-        //#############################################################################
-        //! The CPU serial accelerator.
-        //!
-        //! This accelerator allows serial kernel execution on a CPU device.
-        //! The block idx is restricted to 1x1x1 and all blocks are executed serially so there is no parallelism at all.
-        template<
-            typename TDim,
-            typename TIdx>
-        class AccCpuSerial final :
-            public workdiv::WorkDivMembers<TDim, TIdx>,
-            public idx::gb::IdxGbRef<TDim, TIdx>,
-            public idx::bt::IdxBtZero<TDim, TIdx>,
-            public atomic::AtomicHierarchy<
-                atomic::AtomicStdLibLock<16>, // grid atomics
-                atomic::AtomicNoOp,        // block atomics
-                atomic::AtomicNoOp         // thread atomics
-            >,
-            public math::MathStdLib,
-            public block::shared::dyn::BlockSharedMemDynBoostAlignedAlloc,
-            public block::shared::st::BlockSharedMemStNoSync,
-            public block::sync::BlockSyncNoOp,
-            public rand::RandStdLib,
-            public time::TimeStdLib,
-            public concepts::Implements<ConceptAcc, AccCpuSerial<TDim, TIdx>>
-        {
-        public:
-            // Partial specialization with the correct TDim and TIdx is not allowed.
-            template<
-                typename TDim2,
-                typename TIdx2,
-                typename TKernelFnObj,
-                typename... TArgs>
-            friend class ::alpaka::kernel::TaskKernelCpuSerial;
-
-        private:
-            //-----------------------------------------------------------------------------
-            template<
-                typename TWorkDiv>
-            ALPAKA_FN_HOST AccCpuSerial(
-                TWorkDiv const & workDiv,
-                TIdx const & blockSharedMemDynSizeBytes) :
-                    workdiv::WorkDivMembers<TDim, TIdx>(workDiv),
-                    idx::gb::IdxGbRef<TDim, TIdx>(m_gridBlockIdx),
-                    idx::bt::IdxBtZero<TDim, TIdx>(),
-                    atomic::AtomicHierarchy<
-                        atomic::AtomicStdLibLock<16>, // atomics between grids
-                        atomic::AtomicNoOp,        // atomics between blocks
-                        atomic::AtomicNoOp         // atomics between threads
-                    >(),
-                    math::MathStdLib(),
-                    block::shared::dyn::BlockSharedMemDynBoostAlignedAlloc(static_cast<std::size_t>(blockSharedMemDynSizeBytes)),
-                    block::shared::st::BlockSharedMemStNoSync(),
-                    block::sync::BlockSyncNoOp(),
-                    rand::RandStdLib(),
-                    time::TimeStdLib(),
-                    m_gridBlockIdx(vec::Vec<TDim, TIdx>::zeros())
-            {}
-
-        public:
-            //-----------------------------------------------------------------------------
-            ALPAKA_FN_HOST AccCpuSerial(AccCpuSerial const &) = delete;
-            //-----------------------------------------------------------------------------
-            ALPAKA_FN_HOST AccCpuSerial(AccCpuSerial &&) = delete;
-            //-----------------------------------------------------------------------------
-            ALPAKA_FN_HOST auto operator=(AccCpuSerial const &) -> AccCpuSerial & = delete;
-            //-----------------------------------------------------------------------------
-            ALPAKA_FN_HOST auto operator=(AccCpuSerial &&) -> AccCpuSerial & = delete;
-            //-----------------------------------------------------------------------------
-            /*virtual*/ ~AccCpuSerial() = default;
-
-        private:
-            // getIdx
-            vec::Vec<TDim, TIdx> mutable m_gridBlockIdx;    //!< The index of the currently executed block.
-        };
-    }
-
-    namespace acc
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CPU serial accelerator accelerator type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct AccType<
-                acc::AccCpuSerial<TDim, TIdx>>
-            {
-                using type = acc::AccCpuSerial<TDim, TIdx>;
-            };
-            //#############################################################################
-            //! The CPU serial accelerator device properties get trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct GetAccDevProps<
-                acc::AccCpuSerial<TDim, TIdx>>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto getAccDevProps(
-                    dev::DevCpu const & dev)
-                -> acc::AccDevProps<TDim, TIdx>
-                {
-                    alpaka::ignore_unused(dev);
-
-                    return {
-                        // m_multiProcessorCount
-                        static_cast<TIdx>(1),
-                        // m_gridBlockExtentMax
-                        vec::Vec<TDim, TIdx>::all(std::numeric_limits<TIdx>::max()),
-                        // m_gridBlockCountMax
-                        std::numeric_limits<TIdx>::max(),
-                        // m_blockThreadExtentMax
-                        vec::Vec<TDim, TIdx>::ones(),
-                        // m_blockThreadCountMax
-                        static_cast<TIdx>(1),
-                        // m_threadElemExtentMax
-                        vec::Vec<TDim, TIdx>::all(std::numeric_limits<TIdx>::max()),
-                        // m_threadElemCountMax
-                        std::numeric_limits<TIdx>::max()};
-                }
-            };
-            //#############################################################################
-            //! The CPU serial accelerator name trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct GetAccName<
-                acc::AccCpuSerial<TDim, TIdx>>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto getAccName()
-                -> std::string
-                {
-                    return "AccCpuSerial<" + std::to_string(TDim::value) + "," + typeid(TIdx).name() + ">";
-                }
-            };
-        }
-    }
-    namespace dev
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CPU serial accelerator device type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct DevType<
-                acc::AccCpuSerial<TDim, TIdx>>
-            {
-                using type = dev::DevCpu;
-            };
-        }
-    }
-    namespace dim
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CPU serial accelerator dimension getter trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct DimType<
-                acc::AccCpuSerial<TDim, TIdx>>
-            {
-                using type = TDim;
-            };
-        }
-    }
-    namespace kernel
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CPU serial accelerator execution task type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx,
-                typename TWorkDiv,
-                typename TKernelFnObj,
-                typename... TArgs>
-            struct CreateTaskKernel<
-                acc::AccCpuSerial<TDim, TIdx>,
-                TWorkDiv,
-                TKernelFnObj,
-                TArgs...>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto createTaskKernel(
-                    TWorkDiv const & workDiv,
-                    TKernelFnObj const & kernelFnObj,
-                    TArgs && ... args)
-#ifdef BOOST_NO_CXX14_RETURN_TYPE_DEDUCTION
-                -> kernel::TaskKernelCpuSerial<
-                    TDim,
-                    TIdx,
-                    TKernelFnObj,
-                    TArgs...>
-#endif
-                {
-                    return
-                        kernel::TaskKernelCpuSerial<
-                            TDim,
-                            TIdx,
-                            TKernelFnObj,
-                            TArgs...>(
-                                workDiv,
-                                kernelFnObj,
-                                std::forward<TArgs>(args)...);
-                }
-            };
-        }
-    }
-    namespace pltf
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CPU serial execution task platform type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct PltfType<
-                acc::AccCpuSerial<TDim, TIdx>>
-            {
-                using type = pltf::PltfCpu;
-            };
-        }
-    }
-    namespace idx
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CPU serial accelerator idx type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct IdxType<
-                acc::AccCpuSerial<TDim, TIdx>>
-            {
-                using type = TIdx;
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/alpaka/include/alpaka/acc/AccCpuTbbBlocks.hpp b/thirdParty/alpaka/include/alpaka/acc/AccCpuTbbBlocks.hpp
deleted file mode 100644
index ea2bc156ee..0000000000
--- a/thirdParty/alpaka/include/alpaka/acc/AccCpuTbbBlocks.hpp
+++ /dev/null
@@ -1,304 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Erik Zenker, René Widera
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED
-
-// Base classes.
-#include <alpaka/workdiv/WorkDivMembers.hpp>
-#include <alpaka/idx/gb/IdxGbRef.hpp>
-#include <alpaka/idx/bt/IdxBtZero.hpp>
-#include <alpaka/atomic/AtomicNoOp.hpp>
-#include <alpaka/atomic/AtomicStdLibLock.hpp>
-#include <alpaka/atomic/AtomicHierarchy.hpp>
-#include <alpaka/math/MathStdLib.hpp>
-#include <alpaka/block/shared/dyn/BlockSharedMemDynBoostAlignedAlloc.hpp>
-#include <alpaka/block/shared/st/BlockSharedMemStNoSync.hpp>
-#include <alpaka/block/sync/BlockSyncNoOp.hpp>
-#include <alpaka/rand/RandStdLib.hpp>
-#include <alpaka/time/TimeStdLib.hpp>
-
-// Specialized traits.
-#include <alpaka/acc/Traits.hpp>
-#include <alpaka/kernel/Traits.hpp>
-#include <alpaka/dev/Traits.hpp>
-#include <alpaka/pltf/Traits.hpp>
-#include <alpaka/idx/Traits.hpp>
-
-// Implementation details.
-#include <alpaka/core/Concepts.hpp>
-#include <alpaka/core/Unused.hpp>
-#include <alpaka/dev/DevCpu.hpp>
-
-#include <memory>
-#include <typeinfo>
-
-namespace alpaka
-{
-    namespace kernel
-    {
-        template<
-            typename TDim,
-            typename TIdx,
-            typename TKernelFnObj,
-            typename... TArgs>
-        class TaskKernelCpuTbbBlocks;
-    }
-    namespace acc
-    {
-
-        //#############################################################################
-        //! The CPU TBB block accelerator.
-        template<
-            typename TDim,
-            typename TIdx>
-        class AccCpuTbbBlocks final :
-            public workdiv::WorkDivMembers<TDim, TIdx>,
-            public idx::gb::IdxGbRef<TDim, TIdx>,
-            public idx::bt::IdxBtZero<TDim, TIdx>,
-            public atomic::AtomicHierarchy<
-                atomic::AtomicStdLibLock<16>, // grid atomics
-                atomic::AtomicStdLibLock<16>, // block atomics
-                atomic::AtomicNoOp         // thread atomics
-            >,
-            public math::MathStdLib,
-            public block::shared::dyn::BlockSharedMemDynBoostAlignedAlloc,
-            public block::shared::st::BlockSharedMemStNoSync,
-            public block::sync::BlockSyncNoOp,
-            public rand::RandStdLib,
-            public time::TimeStdLib,
-            public concepts::Implements<ConceptAcc, AccCpuTbbBlocks<TDim, TIdx>>
-        {
-        public:
-            // Partial specialization with the correct TDim and TIdx is not allowed.
-            template<
-                typename TDim2,
-                typename TIdx2,
-                typename TKernelFnObj,
-                typename... TArgs>
-            friend class ::alpaka::kernel::TaskKernelCpuTbbBlocks;
-
-        private:
-            //-----------------------------------------------------------------------------
-            template<
-                typename TWorkDiv>
-            ALPAKA_FN_HOST AccCpuTbbBlocks(
-                TWorkDiv const & workDiv,
-                TIdx const & blockSharedMemDynSizeBytes) :
-                    workdiv::WorkDivMembers<TDim, TIdx>(workDiv),
-                    idx::gb::IdxGbRef<TDim, TIdx>(m_gridBlockIdx),
-                    idx::bt::IdxBtZero<TDim, TIdx>(),
-                    atomic::AtomicHierarchy<
-                        atomic::AtomicStdLibLock<16>, // atomics between grids
-                        atomic::AtomicStdLibLock<16>, // atomics between blocks
-                        atomic::AtomicNoOp         // atomics between threads
-                    >(),
-                    math::MathStdLib(),
-                    block::shared::dyn::BlockSharedMemDynBoostAlignedAlloc(static_cast<std::size_t>(blockSharedMemDynSizeBytes)),
-                    block::shared::st::BlockSharedMemStNoSync(),
-                    block::sync::BlockSyncNoOp(),
-                    rand::RandStdLib(),
-                    time::TimeStdLib(),
-                    m_gridBlockIdx(vec::Vec<TDim, TIdx>::zeros())
-            {}
-
-        public:
-            //-----------------------------------------------------------------------------
-            ALPAKA_FN_HOST AccCpuTbbBlocks(AccCpuTbbBlocks const &) = delete;
-            //-----------------------------------------------------------------------------
-            ALPAKA_FN_HOST AccCpuTbbBlocks(AccCpuTbbBlocks &&) = delete;
-            //-----------------------------------------------------------------------------
-            ALPAKA_FN_HOST auto operator=(AccCpuTbbBlocks const &) -> AccCpuTbbBlocks & = delete;
-            //-----------------------------------------------------------------------------
-            ALPAKA_FN_HOST auto operator=(AccCpuTbbBlocks &&) -> AccCpuTbbBlocks & = delete;
-            //-----------------------------------------------------------------------------
-            /*virtual*/ ~AccCpuTbbBlocks() = default;
-
-        private:
-            // getIdx
-            vec::Vec<TDim, TIdx> mutable m_gridBlockIdx;  //!< The index of the currently executed block.
-        };
-    }
-
-    namespace acc
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CPU TBB block accelerator type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct AccType<
-                acc::AccCpuTbbBlocks<TDim, TIdx>>
-            {
-                using type = acc::AccCpuTbbBlocks<TDim, TIdx>;
-            };
-            //#############################################################################
-            //! The CPU TBB block accelerator device properties get trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct GetAccDevProps<
-                acc::AccCpuTbbBlocks<TDim, TIdx>>
-            {
-                //-----------------------------------------------------------------------------
-                  ALPAKA_FN_HOST static auto getAccDevProps(
-                    dev::DevCpu const & dev)
-                -> acc::AccDevProps<TDim, TIdx>
-                {
-                    alpaka::ignore_unused(dev);
-
-                    return {
-                        // m_multiProcessorCount
-                        static_cast<TIdx>(1),
-                        // m_gridBlockExtentMax
-                        vec::Vec<TDim, TIdx>::all(std::numeric_limits<TIdx>::max()),
-                        // m_gridBlockCountMax
-                        std::numeric_limits<TIdx>::max(),
-                        // m_blockThreadExtentMax
-                        vec::Vec<TDim, TIdx>::ones(),
-                        // m_blockThreadCountMax
-                        static_cast<TIdx>(1),
-                        // m_threadElemExtentMax
-                        vec::Vec<TDim, TIdx>::all(std::numeric_limits<TIdx>::max()),
-                        // m_threadElemCountMax
-                        std::numeric_limits<TIdx>::max()};
-                }
-
-            };
-            //#############################################################################
-            //! The CPU TBB block accelerator name trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct GetAccName<
-                acc::AccCpuTbbBlocks<TDim, TIdx>>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto getAccName()
-                -> std::string
-                {
-                    return "AccCpuTbbBlocks<" + std::to_string(TDim::value) + "," + typeid(TIdx).name() + ">";
-                }
-            };
-        }
-    }
-    namespace dev
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CPU TBB block accelerator device type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct DevType<
-                acc::AccCpuTbbBlocks<TDim, TIdx>>
-            {
-                using type = dev::DevCpu;
-            };
-        }
-    }
-    namespace dim
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CPU TBB block accelerator dimension getter trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct DimType<
-                acc::AccCpuTbbBlocks<TDim, TIdx>>
-            {
-                using type = TDim;
-            };
-        }
-    }
-    namespace kernel
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CPU TBB block accelerator execution task type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx,
-                typename TWorkDiv,
-                typename TKernelFnObj,
-                typename... TArgs>
-            struct CreateTaskKernel<
-                acc::AccCpuTbbBlocks<TDim, TIdx>,
-                TWorkDiv,
-                TKernelFnObj,
-                TArgs...>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto createTaskKernel(
-                    TWorkDiv const & workDiv,
-                    TKernelFnObj const & kernelFnObj,
-                    TArgs && ... args)
-#ifdef BOOST_NO_CXX14_RETURN_TYPE_DEDUCTION
-                -> kernel::TaskKernelCpuTbbBlocks<
-                    TDim,
-                    TIdx,
-                    TKernelFnObj,
-                    TArgs...>
-#endif
-                {
-                    return
-                        kernel::TaskKernelCpuTbbBlocks<
-                            TDim,
-                            TIdx,
-                            TKernelFnObj,
-                            TArgs...>(
-                                workDiv,
-                                kernelFnObj,
-                                std::forward<TArgs>(args)...);
-                }
-            };
-        }
-    }
-    namespace pltf
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CPU TBB block execution task platform type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct PltfType<
-                acc::AccCpuTbbBlocks<TDim, TIdx>>
-            {
-                using type = pltf::PltfCpu;
-            };
-        }
-    }
-    namespace idx
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CPU TBB block accelerator idx type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct IdxType<
-                acc::AccCpuTbbBlocks<TDim, TIdx>>
-            {
-                using type = TIdx;
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/alpaka/include/alpaka/acc/AccCpuThreads.hpp b/thirdParty/alpaka/include/alpaka/acc/AccCpuThreads.hpp
deleted file mode 100644
index 5ce32b990a..0000000000
--- a/thirdParty/alpaka/include/alpaka/acc/AccCpuThreads.hpp
+++ /dev/null
@@ -1,322 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, René Widera
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED
-
-// Base classes.
-#include <alpaka/workdiv/WorkDivMembers.hpp>
-#include <alpaka/idx/gb/IdxGbRef.hpp>
-#include <alpaka/idx/bt/IdxBtRefThreadIdMap.hpp>
-#include <alpaka/atomic/AtomicStdLibLock.hpp>
-#include <alpaka/atomic/AtomicHierarchy.hpp>
-#include <alpaka/math/MathStdLib.hpp>
-#include <alpaka/block/shared/dyn/BlockSharedMemDynBoostAlignedAlloc.hpp>
-#include <alpaka/block/shared/st/BlockSharedMemStMasterSync.hpp>
-#include <alpaka/block/sync/BlockSyncBarrierThread.hpp>
-#include <alpaka/rand/RandStdLib.hpp>
-#include <alpaka/time/TimeStdLib.hpp>
-
-// Specialized traits.
-#include <alpaka/acc/Traits.hpp>
-#include <alpaka/dev/Traits.hpp>
-#include <alpaka/kernel/Traits.hpp>
-#include <alpaka/pltf/Traits.hpp>
-#include <alpaka/idx/Traits.hpp>
-
-// Implementation details.
-#include <alpaka/core/BoostPredef.hpp>
-#include <alpaka/core/ClipCast.hpp>
-#include <alpaka/core/Concepts.hpp>
-#include <alpaka/core/Unused.hpp>
-#include <alpaka/dev/DevCpu.hpp>
-
-#include <memory>
-#include <thread>
-#include <typeinfo>
-
-namespace alpaka
-{
-    namespace kernel
-    {
-        template<
-            typename TDim,
-            typename TIdx,
-            typename TKernelFnObj,
-            typename... TArgs>
-        class TaskKernelCpuThreads;
-    }
-    namespace acc
-    {
-        //#############################################################################
-        //! The CPU threads accelerator.
-        //!
-        //! This accelerator allows parallel kernel execution on a CPU device.
-        //! It uses C++11 std::thread to implement the parallelism.
-        template<
-            typename TDim,
-            typename TIdx>
-        class AccCpuThreads final :
-            public workdiv::WorkDivMembers<TDim, TIdx>,
-            public idx::gb::IdxGbRef<TDim, TIdx>,
-            public idx::bt::IdxBtRefThreadIdMap<TDim, TIdx>,
-            public atomic::AtomicHierarchy<
-                atomic::AtomicStdLibLock<16>, // grid atomics
-                atomic::AtomicStdLibLock<16>, // block atomics
-                atomic::AtomicStdLibLock<16>  // thread atomics
-            >,
-            public math::MathStdLib,
-            public block::shared::dyn::BlockSharedMemDynBoostAlignedAlloc,
-            public block::shared::st::BlockSharedMemStMasterSync,
-            public block::sync::BlockSyncBarrierThread<TIdx>,
-            public rand::RandStdLib,
-            public time::TimeStdLib,
-            public concepts::Implements<ConceptAcc, AccCpuThreads<TDim, TIdx>>
-        {
-        public:
-            // Partial specialization with the correct TDim and TIdx is not allowed.
-            template<
-                typename TDim2,
-                typename TIdx2,
-                typename TKernelFnObj,
-                typename... TArgs>
-            friend class ::alpaka::kernel::TaskKernelCpuThreads;
-
-        private:
-            //-----------------------------------------------------------------------------
-            template<
-                typename TWorkDiv>
-            ALPAKA_FN_HOST AccCpuThreads(
-                TWorkDiv const & workDiv,
-                TIdx const & blockSharedMemDynSizeBytes) :
-                    workdiv::WorkDivMembers<TDim, TIdx>(workDiv),
-                    idx::gb::IdxGbRef<TDim, TIdx>(m_gridBlockIdx),
-                    idx::bt::IdxBtRefThreadIdMap<TDim, TIdx>(m_threadToIndexMap),
-                    atomic::AtomicHierarchy<
-                        atomic::AtomicStdLibLock<16>, // atomics between grids
-                        atomic::AtomicStdLibLock<16>, // atomics between blocks
-                        atomic::AtomicStdLibLock<16>  // atomics between threads
-                    >(),
-                    math::MathStdLib(),
-                    block::shared::dyn::BlockSharedMemDynBoostAlignedAlloc(static_cast<std::size_t>(blockSharedMemDynSizeBytes)),
-                    block::shared::st::BlockSharedMemStMasterSync(
-                        [this](){block::sync::syncBlockThreads(*this);},
-                        [this](){return (m_idMasterThread == std::this_thread::get_id());}),
-                    block::sync::BlockSyncBarrierThread<TIdx>(
-                        workdiv::getWorkDiv<Block, Threads>(workDiv).prod()),
-                    rand::RandStdLib(),
-                    time::TimeStdLib(),
-                    m_gridBlockIdx(vec::Vec<TDim, TIdx>::zeros())
-            {}
-
-        public:
-            //-----------------------------------------------------------------------------
-            ALPAKA_FN_HOST AccCpuThreads(AccCpuThreads const &) = delete;
-            //-----------------------------------------------------------------------------
-            ALPAKA_FN_HOST AccCpuThreads(AccCpuThreads &&) = delete;
-            //-----------------------------------------------------------------------------
-            ALPAKA_FN_HOST auto operator=(AccCpuThreads const &) -> AccCpuThreads & = delete;
-            //-----------------------------------------------------------------------------
-            ALPAKA_FN_HOST auto operator=(AccCpuThreads &&) -> AccCpuThreads & = delete;
-            //-----------------------------------------------------------------------------
-            /*virtual*/ ~AccCpuThreads() = default;
-
-        private:
-            // getIdx
-            std::mutex mutable m_mtxMapInsert;                              //!< The mutex used to secure insertion into the ThreadIdToIdxMap.
-            typename idx::bt::IdxBtRefThreadIdMap<TDim, TIdx>::ThreadIdToIdxMap mutable m_threadToIndexMap;    //!< The mapping of thread id's to indices.
-            vec::Vec<TDim, TIdx> mutable m_gridBlockIdx;                   //!< The index of the currently executed block.
-
-            // allocBlockSharedArr
-            std::thread::id mutable m_idMasterThread;                       //!< The id of the master thread.
-        };
-    }
-
-    namespace acc
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CPU threads accelerator accelerator type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct AccType<
-                acc::AccCpuThreads<TDim, TIdx>>
-            {
-                using type = acc::AccCpuThreads<TDim, TIdx>;
-            };
-            //#############################################################################
-            //! The CPU threads accelerator device properties get trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct GetAccDevProps<
-                acc::AccCpuThreads<TDim, TIdx>>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto getAccDevProps(
-                    dev::DevCpu const & dev)
-                -> acc::AccDevProps<TDim, TIdx>
-                {
-                    alpaka::ignore_unused(dev);
-
-#ifdef ALPAKA_CI
-                    auto const blockThreadCountMax(static_cast<TIdx>(8));
-#else
-                    // \TODO: Magic number. What is the maximum? Just set a reasonable value? There is a implementation defined maximum where the creation of a new thread crashes.
-                    // std::thread::hardware_concurrency can return 0, so 1 is the default case?
-                    auto const blockThreadCountMax(std::max(static_cast<TIdx>(1), alpaka::core::clipCast<TIdx>(std::thread::hardware_concurrency() * 8)));
-#endif
-                    return {
-                        // m_multiProcessorCount
-                        static_cast<TIdx>(1),
-                        // m_gridBlockExtentMax
-                        vec::Vec<TDim, TIdx>::all(std::numeric_limits<TIdx>::max()),
-                        // m_gridBlockCountMax
-                        std::numeric_limits<TIdx>::max(),
-                        // m_blockThreadExtentMax
-                        vec::Vec<TDim, TIdx>::all(blockThreadCountMax),
-                        // m_blockThreadCountMax
-                        blockThreadCountMax,
-                        // m_threadElemExtentMax
-                        vec::Vec<TDim, TIdx>::all(std::numeric_limits<TIdx>::max()),
-                        // m_threadElemCountMax
-                        std::numeric_limits<TIdx>::max()};
-                }
-            };
-            //#############################################################################
-            //! The CPU threads accelerator name trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct GetAccName<
-                acc::AccCpuThreads<TDim, TIdx>>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto getAccName()
-                -> std::string
-                {
-                    return "AccCpuThreads<" + std::to_string(TDim::value) + "," + typeid(TIdx).name() + ">";
-                }
-            };
-        }
-    }
-    namespace dev
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CPU threads accelerator device type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct DevType<
-                acc::AccCpuThreads<TDim, TIdx>>
-            {
-                using type = dev::DevCpu;
-            };
-        }
-    }
-    namespace dim
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CPU threads accelerator dimension getter trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct DimType<
-                acc::AccCpuThreads<TDim, TIdx>>
-            {
-                using type = TDim;
-            };
-        }
-    }
-    namespace kernel
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CPU threads accelerator execution task type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx,
-                typename TWorkDiv,
-                typename TKernelFnObj,
-                typename... TArgs>
-            struct CreateTaskKernel<
-                acc::AccCpuThreads<TDim, TIdx>,
-                TWorkDiv,
-                TKernelFnObj,
-                TArgs...>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto createTaskKernel(
-                    TWorkDiv const & workDiv,
-                    TKernelFnObj const & kernelFnObj,
-                    TArgs && ... args)
-#ifdef BOOST_NO_CXX14_RETURN_TYPE_DEDUCTION
-                -> kernel::TaskKernelCpuThreads<
-                    TDim,
-                    TIdx,
-                    TKernelFnObj,
-                    TArgs...>
-#endif
-                {
-                    return
-                        kernel::TaskKernelCpuThreads<
-                            TDim,
-                            TIdx,
-                            TKernelFnObj,
-                            TArgs...>(
-                                workDiv,
-                                kernelFnObj,
-                                std::forward<TArgs>(args)...);
-                }
-            };
-        }
-    }
-    namespace pltf
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CPU threads execution task platform type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct PltfType<
-                acc::AccCpuThreads<TDim, TIdx>>
-            {
-                using type = pltf::PltfCpu;
-            };
-        }
-    }
-    namespace idx
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CPU threads accelerator idx type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct IdxType<
-                acc::AccCpuThreads<TDim, TIdx>>
-            {
-                using type = TIdx;
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/alpaka/include/alpaka/acc/AccDevProps.hpp b/thirdParty/alpaka/include/alpaka/acc/AccDevProps.hpp
deleted file mode 100644
index e561453dd8..0000000000
--- a/thirdParty/alpaka/include/alpaka/acc/AccDevProps.hpp
+++ /dev/null
@@ -1,63 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#include <alpaka/vec/Vec.hpp>
-#include <alpaka/core/Common.hpp>
-
-#include <vector>
-#include <string>
-
-namespace alpaka
-{
-    namespace acc
-    {
-        //#############################################################################
-        //! The acceleration properties on a device.
-        //
-        // \TODO:
-        //  TIdx m_maxClockFrequencyHz;            //!< Maximum clock frequency of the device in Hz.
-        //  TIdx m_sharedMemSizeBytes;             //!< Idx of the available block shared memory in bytes.
-        template<
-            typename TDim,
-            typename TIdx>
-        struct AccDevProps
-        {
-            //-----------------------------------------------------------------------------
-            ALPAKA_FN_HOST AccDevProps(
-                TIdx const & multiProcessorCount,
-                vec::Vec<TDim, TIdx> const & gridBlockExtentMax,
-                TIdx const & gridBlockCountMax,
-                vec::Vec<TDim, TIdx> const & blockThreadExtentMax,
-                TIdx const & blockThreadCountMax,
-                vec::Vec<TDim, TIdx> const & threadElemExtentMax,
-                TIdx const & threadElemCountMax) :
-                    m_gridBlockExtentMax(gridBlockExtentMax),
-                    m_blockThreadExtentMax(blockThreadExtentMax),
-                    m_threadElemExtentMax(threadElemExtentMax),
-                    m_gridBlockCountMax(gridBlockCountMax),
-                    m_blockThreadCountMax(blockThreadCountMax),
-                    m_threadElemCountMax(threadElemCountMax),
-                    m_multiProcessorCount(multiProcessorCount)
-            {}
-
-            // NOTE: The members have been reordered from the order in the constructor because gcc is buggy for some TDim and TIdx and generates invalid assembly.
-            vec::Vec<TDim, TIdx> m_gridBlockExtentMax;      //!< The maximum number of blocks in each dimension of the grid.
-            vec::Vec<TDim, TIdx> m_blockThreadExtentMax;    //!< The maximum number of threads in each dimension of a block.
-            vec::Vec<TDim, TIdx> m_threadElemExtentMax;     //!< The maximum number of elements in each dimension of a thread.
-
-            TIdx m_gridBlockCountMax;                  //!< The maximum number of blocks in a grid.
-            TIdx m_blockThreadCountMax;                //!< The maximum number of threads in a block.
-            TIdx m_threadElemCountMax;                 //!< The maximum number of elements in a threads.
-
-            TIdx m_multiProcessorCount;                //!< The number of multiprocessors.
-        };
-    }
-}
diff --git a/thirdParty/alpaka/include/alpaka/acc/AccGpuCudaRt.hpp b/thirdParty/alpaka/include/alpaka/acc/AccGpuCudaRt.hpp
deleted file mode 100644
index 93ea45a7b6..0000000000
--- a/thirdParty/alpaka/include/alpaka/acc/AccGpuCudaRt.hpp
+++ /dev/null
@@ -1,368 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz, René Widera
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_CUDA
-    #error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
-#endif
-
-// Base classes.
-#include <alpaka/workdiv/WorkDivCudaBuiltIn.hpp>
-#include <alpaka/idx/gb/IdxGbCudaBuiltIn.hpp>
-#include <alpaka/idx/bt/IdxBtCudaBuiltIn.hpp>
-#include <alpaka/atomic/AtomicCudaBuiltIn.hpp>
-#include <alpaka/atomic/AtomicHierarchy.hpp>
-#include <alpaka/math/MathCudaBuiltIn.hpp>
-#include <alpaka/block/shared/dyn/BlockSharedMemDynCudaBuiltIn.hpp>
-#include <alpaka/block/shared/st/BlockSharedMemStCudaBuiltIn.hpp>
-#include <alpaka/block/sync/BlockSyncCudaBuiltIn.hpp>
-#include <alpaka/rand/RandCuRand.hpp>
-#include <alpaka/time/TimeCudaBuiltIn.hpp>
-
-// Specialized traits.
-#include <alpaka/acc/Traits.hpp>
-#include <alpaka/dev/Traits.hpp>
-#include <alpaka/kernel/Traits.hpp>
-#include <alpaka/pltf/Traits.hpp>
-#include <alpaka/idx/Traits.hpp>
-
-// Implementation details.
-#include <alpaka/core/ClipCast.hpp>
-#include <alpaka/core/Concepts.hpp>
-#include <alpaka/core/Cuda.hpp>
-#include <alpaka/dev/DevCudaRt.hpp>
-
-#include <typeinfo>
-
-namespace alpaka
-{
-    namespace kernel
-    {
-        template<
-            typename TDim,
-            typename TIdx,
-            typename TKernelFnObj,
-            typename... TArgs>
-        class TaskKernelGpuCudaRt;
-    }
-    namespace acc
-    {
-        //#############################################################################
-        //! The GPU CUDA accelerator.
-        //!
-        //! This accelerator allows parallel kernel execution on devices supporting CUDA.
-        template<
-            typename TDim,
-            typename TIdx>
-        class AccGpuCudaRt final :
-            public workdiv::WorkDivCudaBuiltIn<TDim, TIdx>,
-            public idx::gb::IdxGbCudaBuiltIn<TDim, TIdx>,
-            public idx::bt::IdxBtCudaBuiltIn<TDim, TIdx>,
-            public atomic::AtomicHierarchy<
-                atomic::AtomicCudaBuiltIn, // grid atomics
-                atomic::AtomicCudaBuiltIn, // block atomics
-                atomic::AtomicCudaBuiltIn  // thread atomics
-            >,
-            public math::MathCudaBuiltIn,
-            public block::shared::dyn::BlockSharedMemDynCudaBuiltIn,
-            public block::shared::st::BlockSharedMemStCudaBuiltIn,
-            public block::sync::BlockSyncCudaBuiltIn,
-            public rand::RandCuRand,
-            public time::TimeCudaBuiltIn,
-            public concepts::Implements<ConceptAcc, AccGpuCudaRt<TDim, TIdx>>
-        {
-        public:
-            //-----------------------------------------------------------------------------
-            __device__ AccGpuCudaRt(
-                vec::Vec<TDim, TIdx> const & threadElemExtent) :
-                    workdiv::WorkDivCudaBuiltIn<TDim, TIdx>(threadElemExtent),
-                    idx::gb::IdxGbCudaBuiltIn<TDim, TIdx>(),
-                    idx::bt::IdxBtCudaBuiltIn<TDim, TIdx>(),
-                    atomic::AtomicHierarchy<
-                        atomic::AtomicCudaBuiltIn, // atomics between grids
-                        atomic::AtomicCudaBuiltIn, // atomics between blocks
-                        atomic::AtomicCudaBuiltIn  // atomics between threads
-                    >(),
-                    math::MathCudaBuiltIn(),
-                    block::shared::dyn::BlockSharedMemDynCudaBuiltIn(),
-                    block::shared::st::BlockSharedMemStCudaBuiltIn(),
-                    block::sync::BlockSyncCudaBuiltIn(),
-                    rand::RandCuRand(),
-                    time::TimeCudaBuiltIn()
-            {}
-
-        public:
-            //-----------------------------------------------------------------------------
-            __device__ AccGpuCudaRt(AccGpuCudaRt const &) = delete;
-            //-----------------------------------------------------------------------------
-            __device__ AccGpuCudaRt(AccGpuCudaRt &&) = delete;
-            //-----------------------------------------------------------------------------
-            __device__ auto operator=(AccGpuCudaRt const &) -> AccGpuCudaRt & = delete;
-            //-----------------------------------------------------------------------------
-            __device__ auto operator=(AccGpuCudaRt &&) -> AccGpuCudaRt & = delete;
-            //-----------------------------------------------------------------------------
-            ~AccGpuCudaRt() = default;
-        };
-    }
-
-    namespace acc
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The GPU CUDA accelerator accelerator type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct AccType<
-                acc::AccGpuCudaRt<TDim, TIdx>>
-            {
-                using type = acc::AccGpuCudaRt<TDim, TIdx>;
-            };
-            //#############################################################################
-            //! The GPU CUDA accelerator device properties get trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct GetAccDevProps<
-                acc::AccGpuCudaRt<TDim, TIdx>>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto getAccDevProps(
-                    dev::DevCudaRt const & dev)
-                -> acc::AccDevProps<TDim, TIdx>
-                {
-                    // Reading only the necessary attributes with cudaDeviceGetAttribute is faster than reading all with cudaGetDeviceProperties
-                    // https://devblogs.nvidia.com/cuda-pro-tip-the-fast-way-to-query-device-properties/
-                    int multiProcessorCount = {};
-                    ALPAKA_CUDA_RT_CHECK(cudaDeviceGetAttribute(
-                        &multiProcessorCount,
-                        cudaDevAttrMultiProcessorCount,
-                        dev.m_iDevice));
-
-                    int maxGridSize[3] = {};
-                    ALPAKA_CUDA_RT_CHECK(cudaDeviceGetAttribute(
-                        &maxGridSize[0],
-                        cudaDevAttrMaxGridDimX,
-                        dev.m_iDevice));
-                    ALPAKA_CUDA_RT_CHECK(cudaDeviceGetAttribute(
-                        &maxGridSize[1],
-                        cudaDevAttrMaxGridDimY,
-                        dev.m_iDevice));
-                    ALPAKA_CUDA_RT_CHECK(cudaDeviceGetAttribute(
-                        &maxGridSize[2],
-                        cudaDevAttrMaxGridDimZ,
-                        dev.m_iDevice));
-
-                    int maxBlockDim[3] = {};
-                    ALPAKA_CUDA_RT_CHECK(cudaDeviceGetAttribute(
-                        &maxBlockDim[0],
-                        cudaDevAttrMaxBlockDimX,
-                        dev.m_iDevice));
-                    ALPAKA_CUDA_RT_CHECK(cudaDeviceGetAttribute(
-                        &maxBlockDim[1],
-                        cudaDevAttrMaxBlockDimY,
-                        dev.m_iDevice));
-                    ALPAKA_CUDA_RT_CHECK(cudaDeviceGetAttribute(
-                        &maxBlockDim[2],
-                        cudaDevAttrMaxBlockDimZ,
-                        dev.m_iDevice));
-
-                    int maxThreadsPerBlock = {};
-                    ALPAKA_CUDA_RT_CHECK(cudaDeviceGetAttribute(
-                        &maxThreadsPerBlock,
-                        cudaDevAttrMaxThreadsPerBlock,
-                        dev.m_iDevice));
-
-                    return {
-                        // m_multiProcessorCount
-                        alpaka::core::clipCast<TIdx>(multiProcessorCount),
-                        // m_gridBlockExtentMax
-                        extent::getExtentVecEnd<TDim>(
-                            vec::Vec<dim::DimInt<3u>, TIdx>(
-                                alpaka::core::clipCast<TIdx>(maxGridSize[2u]),
-                                alpaka::core::clipCast<TIdx>(maxGridSize[1u]),
-                                alpaka::core::clipCast<TIdx>(maxGridSize[0u]))),
-                        // m_gridBlockCountMax
-                        std::numeric_limits<TIdx>::max(),
-                        // m_blockThreadExtentMax
-                        extent::getExtentVecEnd<TDim>(
-                            vec::Vec<dim::DimInt<3u>, TIdx>(
-                                alpaka::core::clipCast<TIdx>(maxBlockDim[2u]),
-                                alpaka::core::clipCast<TIdx>(maxBlockDim[1u]),
-                                alpaka::core::clipCast<TIdx>(maxBlockDim[0u]))),
-                        // m_blockThreadCountMax
-                        alpaka::core::clipCast<TIdx>(maxThreadsPerBlock),
-                        // m_threadElemExtentMax
-                        vec::Vec<TDim, TIdx>::all(std::numeric_limits<TIdx>::max()),
-                        // m_threadElemCountMax
-                        std::numeric_limits<TIdx>::max()};
-                }
-            };
-            //#############################################################################
-            //! The GPU CUDA accelerator name trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct GetAccName<
-                acc::AccGpuCudaRt<TDim, TIdx>>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto getAccName()
-                -> std::string
-                {
-                    return "AccGpuCudaRt<" + std::to_string(TDim::value) + "," + typeid(TIdx).name() + ">";
-                }
-            };
-        }
-    }
-    namespace dev
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The GPU CUDA accelerator device type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct DevType<
-                acc::AccGpuCudaRt<TDim, TIdx>>
-            {
-                using type = dev::DevCudaRt;
-            };
-        }
-    }
-    namespace dim
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The GPU CUDA accelerator dimension getter trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct DimType<
-                acc::AccGpuCudaRt<TDim, TIdx>>
-            {
-                using type = TDim;
-            };
-        }
-    }
-    namespace kernel
-    {
-        namespace detail
-        {
-            //#############################################################################
-            //! specialization of the TKernelFnObj return type evaluation
-            //
-            // It is not possible to determine the result type of a __device__ lambda for CUDA on the host side.
-            // https://github.com/ComputationalRadiationPhysics/alpaka/pull/695#issuecomment-446103194
-            // The execution task TaskKernelGpuCudaRt is therefore performing this check on device side.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct CheckFnReturnType<
-                acc::AccGpuCudaRt<
-                    TDim,
-                    TIdx>>
-            {
-                template<
-                    typename TKernelFnObj,
-                    typename... TArgs>
-                void operator()(
-                    TKernelFnObj const &,
-                    TArgs const & ...)
-                {
-
-                }
-            };
-        }
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The GPU CUDA accelerator execution task type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx,
-                typename TWorkDiv,
-                typename TKernelFnObj,
-                typename... TArgs>
-            struct CreateTaskKernel<
-                acc::AccGpuCudaRt<TDim, TIdx>,
-                TWorkDiv,
-                TKernelFnObj,
-                TArgs...>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto createTaskKernel(
-                    TWorkDiv const & workDiv,
-                    TKernelFnObj const & kernelFnObj,
-                    TArgs && ... args)
-#ifdef BOOST_NO_CXX14_RETURN_TYPE_DEDUCTION
-                -> kernel::TaskKernelGpuCudaRt<
-                    TDim,
-                    TIdx,
-                    TKernelFnObj,
-                    TArgs...>
-#endif
-                {
-                    return
-                        kernel::TaskKernelGpuCudaRt<
-                            TDim,
-                            TIdx,
-                            TKernelFnObj,
-                            TArgs...>(
-                                workDiv,
-                                kernelFnObj,
-                                std::forward<TArgs>(args)...);
-                }
-            };
-        }
-    }
-    namespace pltf
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CPU CUDA execution task platform type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct PltfType<
-                acc::AccGpuCudaRt<TDim, TIdx>>
-            {
-                using type = pltf::PltfCudaRt;
-            };
-        }
-    }
-    namespace idx
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The GPU CUDA accelerator idx type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct IdxType<
-                acc::AccGpuCudaRt<TDim, TIdx>>
-            {
-                using type = TIdx;
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/alpaka/include/alpaka/acc/AccGpuHipRt.hpp b/thirdParty/alpaka/include/alpaka/acc/AccGpuHipRt.hpp
deleted file mode 100644
index 101212e439..0000000000
--- a/thirdParty/alpaka/include/alpaka/acc/AccGpuHipRt.hpp
+++ /dev/null
@@ -1,330 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz, Matthias Werner, René Widera
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_HIP
-    #error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
-#endif
-
-// Base classes.
-#include <alpaka/workdiv/WorkDivHipBuiltIn.hpp>
-#include <alpaka/idx/gb/IdxGbHipBuiltIn.hpp>
-#include <alpaka/idx/bt/IdxBtHipBuiltIn.hpp>
-#include <alpaka/atomic/AtomicHipBuiltIn.hpp>
-#include <alpaka/atomic/AtomicHierarchy.hpp>
-#include <alpaka/math/MathHipBuiltIn.hpp>
-#include <alpaka/block/shared/dyn/BlockSharedMemDynHipBuiltIn.hpp>
-#include <alpaka/block/shared/st/BlockSharedMemStHipBuiltIn.hpp>
-#include <alpaka/block/sync/BlockSyncHipBuiltIn.hpp>
-#include <alpaka/rand/RandHipRand.hpp>
-#include <alpaka/time/TimeHipBuiltIn.hpp>
-
-// Specialized traits.
-#include <alpaka/acc/Traits.hpp>
-#include <alpaka/dev/Traits.hpp>
-#include <alpaka/kernel/Traits.hpp>
-#include <alpaka/pltf/Traits.hpp>
-#include <alpaka/idx/Traits.hpp>
-
-// Implementation details.
-#include <alpaka/core/ClipCast.hpp>
-#include <alpaka/core/Concepts.hpp>
-#include <alpaka/dev/DevHipRt.hpp>
-#include <alpaka/core/Hip.hpp>
-
-#include <typeinfo>
-
-namespace alpaka
-{
-    namespace kernel
-    {
-        template<
-            typename TDim,
-            typename TIdx,
-            typename TKernelFnObj,
-            typename... TArgs>
-        class TaskKernelGpuHipRt;
-    }
-    namespace acc
-    {
-        //#############################################################################
-        //! The GPU HIP accelerator.
-        //!
-        //! This accelerator allows parallel kernel execution on devices supporting HIP or HCC
-        template<
-            typename TDim,
-            typename TIdx>
-        class AccGpuHipRt final :
-            public workdiv::WorkDivHipBuiltIn<TDim, TIdx>,
-            public idx::gb::IdxGbHipBuiltIn<TDim, TIdx>,
-            public idx::bt::IdxBtHipBuiltIn<TDim, TIdx>,
-            public atomic::AtomicHierarchy<
-                atomic::AtomicHipBuiltIn, // grid atomics
-                atomic::AtomicHipBuiltIn, // block atomics
-                atomic::AtomicHipBuiltIn  // thread atomics
-            >,
-            public math::MathHipBuiltIn,
-            public block::shared::dyn::BlockSharedMemDynHipBuiltIn,
-            public block::shared::st::BlockSharedMemStHipBuiltIn,
-            public block::sync::BlockSyncHipBuiltIn,
-            public rand::RandHipRand,
-            public time::TimeHipBuiltIn,
-            public concepts::Implements<ConceptAcc, AccGpuHipRt<TDim, TIdx>>
-        {
-        public:
-            //-----------------------------------------------------------------------------
-            __device__ AccGpuHipRt(
-                vec::Vec<TDim, TIdx> const & threadElemExtent) :
-                    workdiv::WorkDivHipBuiltIn<TDim, TIdx>(threadElemExtent),
-                    idx::gb::IdxGbHipBuiltIn<TDim, TIdx>(),
-                    idx::bt::IdxBtHipBuiltIn<TDim, TIdx>(),
-                    atomic::AtomicHierarchy<
-                        atomic::AtomicHipBuiltIn, // atomics between grids
-                        atomic::AtomicHipBuiltIn, // atomics between blocks
-                        atomic::AtomicHipBuiltIn  // atomics between threads
-                    >(),
-                    math::MathHipBuiltIn(),
-                    block::shared::dyn::BlockSharedMemDynHipBuiltIn(),
-                    block::shared::st::BlockSharedMemStHipBuiltIn(),
-                    block::sync::BlockSyncHipBuiltIn(),
-                    rand::RandHipRand(),
-                    time::TimeHipBuiltIn()
-            {}
-
-        public:
-            //-----------------------------------------------------------------------------
-            __device__ AccGpuHipRt(AccGpuHipRt const &) = delete;
-            //-----------------------------------------------------------------------------
-            __device__ AccGpuHipRt(AccGpuHipRt &&) = delete;
-            //-----------------------------------------------------------------------------
-            __device__ auto operator=(AccGpuHipRt const &) -> AccGpuHipRt & = delete;
-            //-----------------------------------------------------------------------------
-            __device__ auto operator=(AccGpuHipRt &&) -> AccGpuHipRt & = delete;
-            //-----------------------------------------------------------------------------
-            ALPAKA_FN_HOST_ACC ~AccGpuHipRt() = default;
-        };
-    }
-
-    namespace acc
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The GPU HIP accelerator accelerator type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct AccType<
-                acc::AccGpuHipRt<TDim, TIdx>>
-            {
-                using type = acc::AccGpuHipRt<TDim, TIdx>;
-            };
-            //#############################################################################
-            //! The GPU HIP accelerator device properties get trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct GetAccDevProps<
-                acc::AccGpuHipRt<TDim, TIdx>>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto getAccDevProps(
-                    dev::DevHipRt const & dev)
-                -> acc::AccDevProps<TDim, TIdx>
-                {
-                    hipDeviceProp_t hipDevProp;
-                    ALPAKA_HIP_RT_CHECK(hipGetDeviceProperties(
-                        &hipDevProp,
-                        dev.m_iDevice));
-
-                    return {
-                        // m_multiProcessorCount
-                        alpaka::core::clipCast<TIdx>(hipDevProp.multiProcessorCount),
-                        // m_gridBlockExtentMax
-                        extent::getExtentVecEnd<TDim>(
-                            vec::Vec<dim::DimInt<3u>, TIdx>(
-                                alpaka::core::clipCast<TIdx>(hipDevProp.maxGridSize[2u]),
-                                alpaka::core::clipCast<TIdx>(hipDevProp.maxGridSize[1u]),
-                                alpaka::core::clipCast<TIdx>(hipDevProp.maxGridSize[0u]))),
-                        // m_gridBlockCountMax
-                        std::numeric_limits<TIdx>::max(),
-                        // m_blockThreadExtentMax
-                        extent::getExtentVecEnd<TDim>(
-                            vec::Vec<dim::DimInt<3u>, TIdx>(
-                                alpaka::core::clipCast<TIdx>(hipDevProp.maxThreadsDim[2u]),
-                                alpaka::core::clipCast<TIdx>(hipDevProp.maxThreadsDim[1u]),
-                                alpaka::core::clipCast<TIdx>(hipDevProp.maxThreadsDim[0u]))),
-                        // m_blockThreadCountMax
-                        alpaka::core::clipCast<TIdx>(hipDevProp.maxThreadsPerBlock),
-                        // m_threadElemExtentMax
-                        vec::Vec<TDim, TIdx>::all(std::numeric_limits<TIdx>::max()),
-                        // m_threadElemCountMax
-                        std::numeric_limits<TIdx>::max()};
-                }
-            };
-            //#############################################################################
-            //! The GPU Hip accelerator name trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct GetAccName<
-                acc::AccGpuHipRt<TDim, TIdx>>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto getAccName()
-                -> std::string
-                {
-                    return "AccGpuHipRt<" + std::to_string(TDim::value) + "," + typeid(TIdx).name() + ">";
-                }
-            };
-        }
-    }
-    namespace dev
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The GPU HIP accelerator device type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct DevType<
-                acc::AccGpuHipRt<TDim, TIdx>>
-            {
-                using type = dev::DevHipRt;
-            };
-        }
-    }
-    namespace dim
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The GPU HIP accelerator dimension getter trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct DimType<
-                acc::AccGpuHipRt<TDim, TIdx>>
-            {
-                using type = TDim;
-            };
-        }
-    }
-    namespace kernel
-    {
-        namespace detail
-        {
-            //#############################################################################
-            //! specialization of the TKernelFnObj return type evaluation
-            //
-            // It is not possible to determine the result type of a __device__ lambda for CUDA on the host side.
-            // https://github.com/ComputationalRadiationPhysics/alpaka/pull/695#issuecomment-446103194
-            // The execution task TaskKernelGpuHipRt is therefore performing this check on device side.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct CheckFnReturnType<
-                acc::AccGpuHipRt<
-                    TDim,
-                    TIdx>>
-            {
-                template<
-                    typename TKernelFnObj,
-                    typename... TArgs>
-                void operator()(
-                    TKernelFnObj const &,
-                    TArgs const & ...)
-                {
-
-                }
-            };
-        }
-        namespace traits
-        {
-            //#############################################################################
-            //! The GPU HIP accelerator execution task type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx,
-                typename TWorkDiv,
-                typename TKernelFnObj,
-                typename... TArgs>
-            struct CreateTaskKernel<
-                acc::AccGpuHipRt<TDim, TIdx>,
-                TWorkDiv,
-                TKernelFnObj,
-                TArgs...>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto createTaskKernel(
-                    TWorkDiv const & workDiv,
-                    TKernelFnObj const & kernelFnObj,
-                    TArgs && ... args)
-#ifdef BOOST_NO_CXX14_RETURN_TYPE_DEDUCTION
-                -> kernel::TaskKernelGpuHipRt<
-                    TDim,
-                    TIdx,
-                    TKernelFnObj,
-                    TArgs...>
-#endif
-                {
-                    return
-                        kernel::TaskKernelGpuHipRt<
-                            TDim,
-                            TIdx,
-                            TKernelFnObj,
-                            TArgs...>(
-                                workDiv,
-                                kernelFnObj,
-                                std::forward<TArgs>(args)...);
-                }
-            };
-        }
-    }
-    namespace pltf
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CPU HIP execution task platform type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct PltfType<
-                acc::AccGpuHipRt<TDim, TIdx>>
-            {
-                using type = pltf::PltfHipRt;
-            };
-        }
-    }
-    namespace idx
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The GPU HIP accelerator idx type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct IdxType<
-                acc::AccGpuHipRt<TDim, TIdx>>
-            {
-                using type = TIdx;
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/alpaka/include/alpaka/acc/Traits.hpp b/thirdParty/alpaka/include/alpaka/acc/Traits.hpp
deleted file mode 100644
index 10d0f1570e..0000000000
--- a/thirdParty/alpaka/include/alpaka/acc/Traits.hpp
+++ /dev/null
@@ -1,127 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#include <alpaka/acc/AccDevProps.hpp>
-#include <alpaka/core/Common.hpp>
-
-#include <alpaka/core/Concepts.hpp>
-#include <alpaka/queue/Traits.hpp>
-#include <alpaka/pltf/Traits.hpp>
-
-#include <string>
-#include <typeinfo>
-#include <type_traits>
-
-namespace alpaka
-{
-    //-----------------------------------------------------------------------------
-    //! The accelerator specifics.
-    namespace acc
-    {
-        struct ConceptAcc;
-
-        //-----------------------------------------------------------------------------
-        //! The accelerator traits.
-        namespace traits
-        {
-            //#############################################################################
-            //! The accelerator type trait.
-            template<
-                typename T,
-                typename TSfinae = void>
-            struct AccType;
-
-            //#############################################################################
-            //! The device properties get trait.
-            template<
-                typename TAcc,
-                typename TSfinae = void>
-            struct GetAccDevProps;
-
-            //#############################################################################
-            //! The accelerator name trait.
-            //!
-            //! The default implementation returns the mangled class name.
-            template<
-                typename TAcc,
-                typename TSfinae = void>
-            struct GetAccName
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto getAccName()
-                -> std::string
-                {
-                    return typeid(TAcc).name();
-                }
-            };
-        }
-
-        //#############################################################################
-        //! The accelerator type trait alias template to remove the ::type.
-        template<
-            typename T>
-        using Acc = typename traits::AccType<T>::type;
-
-        //-----------------------------------------------------------------------------
-        //! \return The acceleration properties on the given device.
-        template<
-            typename TAcc,
-            typename TDev>
-        ALPAKA_FN_HOST auto getAccDevProps(
-            TDev const & dev)
-        -> AccDevProps<dim::Dim<TAcc>, idx::Idx<TAcc>>
-        {
-            return
-                traits::GetAccDevProps<
-                    TAcc>
-                ::getAccDevProps(
-                    dev);
-        }
-
-        //-----------------------------------------------------------------------------
-        //! \return The accelerator name
-        //!
-        //! \tparam TAcc The accelerator type.
-        template<
-            typename TAcc>
-        ALPAKA_FN_HOST auto getAccName()
-        -> std::string
-        {
-            return
-                traits::GetAccName<
-                    TAcc>
-                ::getAccName();
-        }
-    }
-
-    namespace queue
-    {
-        namespace traits
-        {
-            template<
-                typename TAcc,
-                typename TProperty>
-            struct QueueType<
-                TAcc,
-                TProperty,
-                typename std::enable_if<
-                    concepts::ImplementsConcept<acc::ConceptAcc, TAcc>::value
-                >::type
-            >
-            {
-                using type = typename QueueType<
-                    typename pltf::traits::PltfType<TAcc>::type,
-                    TProperty
-                >::type;
-            };
-        }
-    }
-}
diff --git a/thirdParty/alpaka/include/alpaka/alpaka.hpp b/thirdParty/alpaka/include/alpaka/alpaka.hpp
deleted file mode 100644
index a268b6e0f4..0000000000
--- a/thirdParty/alpaka/include/alpaka/alpaka.hpp
+++ /dev/null
@@ -1,211 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Erik Zenker, Matthias Werner, René Widera
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-//#############################################################################
-// Include the whole library.
-//#############################################################################
-
-//-----------------------------------------------------------------------------
-// version number
-#include <alpaka/version.hpp>
-//-----------------------------------------------------------------------------
-// acc
-#include <alpaka/acc/AccCpuSerial.hpp>
-#include <alpaka/acc/AccCpuThreads.hpp>
-#include <alpaka/acc/AccCpuFibers.hpp>
-#include <alpaka/acc/AccCpuTbbBlocks.hpp>
-#include <alpaka/acc/AccCpuOmp2Blocks.hpp>
-#include <alpaka/acc/AccCpuOmp2Threads.hpp>
-#include <alpaka/acc/AccCpuOmp4.hpp>
-#include <alpaka/acc/AccGpuCudaRt.hpp>
-#include <alpaka/acc/AccGpuHipRt.hpp>
-#include <alpaka/acc/AccDevProps.hpp>
-#include <alpaka/acc/Traits.hpp>
-//-----------------------------------------------------------------------------
-// atomic
-#include <alpaka/atomic/AtomicCudaBuiltIn.hpp>
-#include <alpaka/atomic/AtomicHipBuiltIn.hpp>
-#include <alpaka/atomic/AtomicNoOp.hpp>
-#include <alpaka/atomic/AtomicOmpBuiltIn.hpp>
-#include <alpaka/atomic/AtomicStdLibLock.hpp>
-#include <alpaka/atomic/Op.hpp>
-#include <alpaka/atomic/Traits.hpp>
-//-----------------------------------------------------------------------------
-// block
-    //-----------------------------------------------------------------------------
-    // shared
-        //-----------------------------------------------------------------------------
-        // dynamic
-        #include <alpaka/block/shared/dyn/BlockSharedMemDynBoostAlignedAlloc.hpp>
-        #include <alpaka/block/shared/dyn/BlockSharedMemDynCudaBuiltIn.hpp>
-        #include <alpaka/block/shared/dyn/BlockSharedMemDynHipBuiltIn.hpp>
-        #include <alpaka/block/shared/dyn/Traits.hpp>
-        //-----------------------------------------------------------------------------
-        // static
-        #include <alpaka/block/shared/st/BlockSharedMemStCudaBuiltIn.hpp>
-        #include <alpaka/block/shared/st/BlockSharedMemStHipBuiltIn.hpp>
-        #include <alpaka/block/shared/st/BlockSharedMemStMasterSync.hpp>
-        #include <alpaka/block/shared/st/BlockSharedMemStNoSync.hpp>
-        #include <alpaka/block/shared/st/Traits.hpp>
-    //-----------------------------------------------------------------------------
-    // sync
-    #include <alpaka/block/sync/BlockSyncBarrierFiber.hpp>
-    #include <alpaka/block/sync/BlockSyncBarrierOmp.hpp>
-    #include <alpaka/block/sync/BlockSyncBarrierThread.hpp>
-    #include <alpaka/block/sync/BlockSyncCudaBuiltIn.hpp>
-    #include <alpaka/block/sync/BlockSyncHipBuiltIn.hpp>
-    #include <alpaka/block/sync/BlockSyncNoOp.hpp>
-    #include <alpaka/block/sync/Traits.hpp>
-//-----------------------------------------------------------------------------
-// core
-#include <alpaka/core/Assert.hpp>
-#include <alpaka/core/Align.hpp>
-#include <alpaka/core/BarrierThread.hpp>
-#include <alpaka/core/BoostPredef.hpp>
-#include <alpaka/core/ClipCast.hpp>
-#include <alpaka/core/Common.hpp>
-#include <alpaka/core/Concepts.hpp>
-#include <alpaka/core/ConcurrentExecPool.hpp>
-#include <alpaka/core/Cuda.hpp>
-#include <alpaka/core/Debug.hpp>
-#include <alpaka/core/Fibers.hpp>
-#include <alpaka/core/Hip.hpp>
-#include <alpaka/core/Positioning.hpp>
-#include <alpaka/core/Unroll.hpp>
-#include <alpaka/core/Unused.hpp>
-#include <alpaka/core/Utility.hpp>
-#include <alpaka/core/Vectorize.hpp>
-//-----------------------------------------------------------------------------
-// dev
-#include <alpaka/dev/DevCudaRt.hpp>
-#include <alpaka/dev/DevCpu.hpp>
-#include <alpaka/dev/DevHipRt.hpp>
-#include <alpaka/dev/cpu/Wait.hpp>
-#include <alpaka/dev/Traits.hpp>
-//-----------------------------------------------------------------------------
-// dim
-#include <alpaka/dim/DimArithmetic.hpp>
-#include <alpaka/dim/DimIntegralConst.hpp>
-#include <alpaka/dim/Traits.hpp>
-//-----------------------------------------------------------------------------
-// event
-#include <alpaka/event/EventCudaRt.hpp>
-#include <alpaka/event/EventHipRt.hpp>
-#include <alpaka/event/EventCpu.hpp>
-#include <alpaka/event/Traits.hpp>
-//-----------------------------------------------------------------------------
-// extent
-#include <alpaka/extent/Traits.hpp>
-//-----------------------------------------------------------------------------
-// idx
-#include <alpaka/idx/bt/IdxBtCudaBuiltIn.hpp>
-#include <alpaka/idx/bt/IdxBtHipBuiltIn.hpp>
-#include <alpaka/idx/bt/IdxBtOmp.hpp>
-#include <alpaka/idx/bt/IdxBtRefFiberIdMap.hpp>
-#include <alpaka/idx/bt/IdxBtRefThreadIdMap.hpp>
-#include <alpaka/idx/bt/IdxBtZero.hpp>
-#include <alpaka/idx/gb/IdxGbCudaBuiltIn.hpp>
-#include <alpaka/idx/gb/IdxGbRef.hpp>
-#include <alpaka/idx/Accessors.hpp>
-#include <alpaka/idx/Traits.hpp>
-#include <alpaka/idx/MapIdx.hpp>
-//-----------------------------------------------------------------------------
-// kernel
-#include <alpaka/kernel/TaskKernelCpuSerial.hpp>
-#include <alpaka/kernel/TaskKernelCpuThreads.hpp>
-#include <alpaka/kernel/TaskKernelCpuFibers.hpp>
-#include <alpaka/kernel/TaskKernelCpuTbbBlocks.hpp>
-#include <alpaka/kernel/TaskKernelCpuOmp2Blocks.hpp>
-#include <alpaka/kernel/TaskKernelCpuOmp2Threads.hpp>
-#include <alpaka/kernel/TaskKernelCpuOmp4.hpp>
-#include <alpaka/kernel/TaskKernelGpuCudaRt.hpp>
-#include <alpaka/kernel/TaskKernelGpuHipRt.hpp>
-#include <alpaka/kernel/Traits.hpp>
-//-----------------------------------------------------------------------------
-// math
-#include <alpaka/math/MathCudaBuiltIn.hpp>
-#include <alpaka/math/MathHipBuiltIn.hpp>
-#include <alpaka/math/MathStdLib.hpp>
-//-----------------------------------------------------------------------------
-// mem
-#include <alpaka/mem/alloc/AllocCpuBoostAligned.hpp>
-#include <alpaka/mem/alloc/AllocCpuNew.hpp>
-#include <alpaka/mem/alloc/Traits.hpp>
-
-#include <alpaka/mem/buf/BufCpu.hpp>
-#include <alpaka/mem/buf/BufCudaRt.hpp>
-#include <alpaka/mem/buf/BufHipRt.hpp>
-#include <alpaka/mem/buf/Traits.hpp>
-
-#include <alpaka/mem/view/ViewCompileTimeArray.hpp>
-#include <alpaka/mem/view/ViewPlainPtr.hpp>
-#include <alpaka/mem/view/ViewStdArray.hpp>
-#include <alpaka/mem/view/ViewStdVector.hpp>
-#include <alpaka/mem/view/ViewSubView.hpp>
-#include <alpaka/mem/view/Traits.hpp>
-//-----------------------------------------------------------------------------
-// meta
-#include <alpaka/meta/Apply.hpp>
-#include <alpaka/meta/ApplyTuple.hpp>
-#include <alpaka/meta/CartesianProduct.hpp>
-#include <alpaka/meta/Concatenate.hpp>
-#include <alpaka/meta/DependentFalseType.hpp>
-#include <alpaka/meta/Filter.hpp>
-#include <alpaka/meta/Fold.hpp>
-#include <alpaka/meta/ForEachType.hpp>
-#include <alpaka/meta/IntegerSequence.hpp>
-#include <alpaka/meta/Integral.hpp>
-#include <alpaka/meta/IsStrictBase.hpp>
-#include <alpaka/meta/Metafunctions.hpp>
-#include <alpaka/meta/NdLoop.hpp>
-#include <alpaka/meta/Set.hpp>
-#include <alpaka/meta/Transform.hpp>
-//-----------------------------------------------------------------------------
-// offset
-#include <alpaka/offset/Traits.hpp>
-//-----------------------------------------------------------------------------
-// platform
-#include <alpaka/pltf/PltfCpu.hpp>
-#include <alpaka/pltf/PltfCudaRt.hpp>
-#include <alpaka/pltf/PltfHipRt.hpp>
-#include <alpaka/pltf/Traits.hpp>
-//-----------------------------------------------------------------------------
-// rand
-#include <alpaka/rand/RandCuRand.hpp>
-#include <alpaka/rand/RandHipRand.hpp>
-#include <alpaka/rand/RandStdLib.hpp>
-#include <alpaka/rand/Traits.hpp>
-//-----------------------------------------------------------------------------
-// idx
-#include <alpaka/idx/Traits.hpp>
-//-----------------------------------------------------------------------------
-// queue
-#include <alpaka/queue/QueueCudaRtNonBlocking.hpp>
-#include <alpaka/queue/QueueCudaRtBlocking.hpp>
-#include <alpaka/queue/QueueCpuNonBlocking.hpp>
-#include <alpaka/queue/QueueCpuBlocking.hpp>
-#include <alpaka/queue/Traits.hpp>
-#include <alpaka/queue/Properties.hpp>
-//-----------------------------------------------------------------------------
-// time
-#include <alpaka/time/Traits.hpp>
-//-----------------------------------------------------------------------------
-// wait
-#include <alpaka/wait/Traits.hpp>
-//-----------------------------------------------------------------------------
-// workdiv
-#include <alpaka/workdiv/WorkDivMembers.hpp>
-#include <alpaka/workdiv/Traits.hpp>
-#include <alpaka/workdiv/WorkDivHelpers.hpp>
-//-----------------------------------------------------------------------------
-// vec
-#include <alpaka/vec/Vec.hpp>
-#include <alpaka/vec/Traits.hpp>
diff --git a/thirdParty/alpaka/include/alpaka/atomic/AtomicCudaBuiltIn.hpp b/thirdParty/alpaka/include/alpaka/atomic/AtomicCudaBuiltIn.hpp
deleted file mode 100644
index 410c8bae73..0000000000
--- a/thirdParty/alpaka/include/alpaka/atomic/AtomicCudaBuiltIn.hpp
+++ /dev/null
@@ -1,1205 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz, René Widera
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_CUDA
-    #error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
-#endif
-
-#include <alpaka/core/Unused.hpp>
-#include <alpaka/atomic/Op.hpp>
-#include <alpaka/atomic/Traits.hpp>
-#include <alpaka/meta/DependentFalseType.hpp>
-
-#include <climits>
-
-namespace alpaka
-{
-    namespace atomic
-    {
-        //#############################################################################
-        //! The GPU CUDA accelerator atomic ops.
-        //
-        //  Atomics can used in the hierarchy level grids, blocks and threads.
-        //  Atomics are not guaranteed to be save between devices
-        class AtomicCudaBuiltIn
-        {
-        public:
-            //-----------------------------------------------------------------------------
-            AtomicCudaBuiltIn() = default;
-            //-----------------------------------------------------------------------------
-            __device__ AtomicCudaBuiltIn(AtomicCudaBuiltIn const &) = delete;
-            //-----------------------------------------------------------------------------
-            __device__ AtomicCudaBuiltIn(AtomicCudaBuiltIn &&) = delete;
-            //-----------------------------------------------------------------------------
-            __device__ auto operator=(AtomicCudaBuiltIn const &) -> AtomicCudaBuiltIn & = delete;
-            //-----------------------------------------------------------------------------
-            __device__ auto operator=(AtomicCudaBuiltIn &&) -> AtomicCudaBuiltIn & = delete;
-            //-----------------------------------------------------------------------------
-            /*virtual*/ ~AtomicCudaBuiltIn() = default;
-        };
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The specializations to execute the requested atomic ops of the CUDA accelerator.
-            // See: http://docs.nvidia.com/cuda/cuda-c-programming-guide/#atomic-functions how to implement everything with CAS
-
-            //-----------------------------------------------------------------------------
-            // Add.
-
-            //-----------------------------------------------------------------------------
-            //! The GPU CUDA accelerator atomic operation.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::Add,
-                atomic::AtomicCudaBuiltIn,
-                int,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                __device__ static auto atomicOp(
-                    atomic::AtomicCudaBuiltIn const &,
-                    int * const addr,
-                    int const & value)
-                -> int
-                {
-                    return atomicAdd(addr, value);
-                }
-            };
-            //-----------------------------------------------------------------------------
-            //! The GPU CUDA accelerator atomic operation.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::Add,
-                atomic::AtomicCudaBuiltIn,
-                unsigned int,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                __device__ static auto atomicOp(
-                    atomic::AtomicCudaBuiltIn const &,
-                    unsigned int * const addr,
-                    unsigned int const & value)
-                -> unsigned int
-                {
-                    return atomicAdd(addr, value);
-                }
-            };
-            //-----------------------------------------------------------------------------
-            //! The GPU CUDA accelerator atomic operation.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::Add,
-                atomic::AtomicCudaBuiltIn,
-                unsigned long int,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                //
-                __device__ static auto atomicOp(
-                    atomic::AtomicCudaBuiltIn const &,
-                    unsigned long int * const addr,
-                    unsigned long int const & value)
-                -> unsigned long int
-                {
-#if UINT_MAX == ULONG_MAX // LLP64
-                    return atomicAdd(
-                        reinterpret_cast<unsigned int *>(addr),
-                        static_cast<unsigned int>(value));
-#else // ULONG_MAX == ULLONG_MAX LP64
-                    return atomicAdd(
-                        reinterpret_cast<unsigned long long int *>(addr),
-                        static_cast<unsigned long long int>(value));
-#endif
-                }
-            };
-            //-----------------------------------------------------------------------------
-            //! The GPU CUDA accelerator atomic operation.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::Add,
-                atomic::AtomicCudaBuiltIn,
-                unsigned long long int,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                //
-                __device__ static auto atomicOp(
-                    atomic::AtomicCudaBuiltIn const &,
-                    unsigned long long int * const addr,
-                    unsigned long long int const & value)
-                -> unsigned long long int
-                {
-                    return atomicAdd(addr, value);
-                }
-            };
-            //-----------------------------------------------------------------------------
-            //! The GPU CUDA accelerator atomic operation.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::Add,
-                atomic::AtomicCudaBuiltIn,
-                float,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                //
-                __device__ static auto atomicOp(
-                    atomic::AtomicCudaBuiltIn const &,
-                    float * const addr,
-                    float const & value)
-                -> float
-                {
-                    return atomicAdd(addr, value);
-                }
-            };
-            //-----------------------------------------------------------------------------
-            //! The GPU CUDA accelerator atomic operation.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::Add,
-                atomic::AtomicCudaBuiltIn,
-                double,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                __device__ static auto atomicOp(
-                    atomic::AtomicCudaBuiltIn const &,
-                    double * const addr,
-                    double const & value)
-                -> double
-                {
-#if BOOST_ARCH_PTX >= BOOST_VERSION_NUMBER(6, 0, 0)
-                    return atomicAdd(addr, value);
-#else
-                    // Code from: http://docs.nvidia.com/cuda/cuda-c-programming-guide/#atomic-functions
-
-                    unsigned long long int * address_as_ull(reinterpret_cast<unsigned long long int *>(addr));
-                    unsigned long long int old(*address_as_ull);
-                    unsigned long long int assumed;
-                    do
-                    {
-                        assumed = old;
-                        old = atomicCAS(
-                            address_as_ull,
-                            assumed,
-                            static_cast<unsigned long long>(__double_as_longlong(value + __longlong_as_double(static_cast<long long>(assumed)))));
-                        // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
-                    }
-                    while(assumed != old);
-                    return __longlong_as_double(static_cast<long long>(old));
-#endif
-                }
-            };
-
-            //-----------------------------------------------------------------------------
-            // Sub.
-
-            //-----------------------------------------------------------------------------
-            //! The GPU CUDA accelerator atomic operation.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::Sub,
-                atomic::AtomicCudaBuiltIn,
-                int,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                __device__ static auto atomicOp(
-                    atomic::AtomicCudaBuiltIn const &,
-                    int * const addr,
-                    int const & value)
-                -> int
-                {
-                    return atomicSub(addr, value);
-                }
-            };
-            //-----------------------------------------------------------------------------
-            //! The GPU CUDA accelerator atomic operation.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::Sub,
-                atomic::AtomicCudaBuiltIn,
-                unsigned int,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                __device__ static auto atomicOp(
-                    atomic::AtomicCudaBuiltIn const &,
-                    unsigned int * const addr,
-                    unsigned int const & value)
-                -> unsigned int
-                {
-                    return atomicSub(addr, value);
-                }
-            };
-            //-----------------------------------------------------------------------------
-            //! The GPU CUDA accelerator atomic operation.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::Sub,
-                atomic::AtomicCudaBuiltIn,
-                unsigned long int,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                //
-                __device__ static auto atomicOp(
-                    atomic::AtomicCudaBuiltIn const &,
-                    unsigned long int * const addr,
-                    unsigned long int const & value)
-                -> unsigned long int
-                {
-#if UINT_MAX == ULONG_MAX // LLP64
-                    return atomicSub(
-                        reinterpret_cast<unsigned int *>(addr),
-                        static_cast<unsigned int>(value));
-#else // ULONG_MAX == ULLONG_MAX LP64
-                    alpaka::ignore_unused(addr);
-                    alpaka::ignore_unused(value);
-                    static_assert(
-                        meta::DependentFalseType<THierarchy>::value,
-                        "atomicOp<op::Sub, atomic::AtomicCudaBuiltIn, unsigned long int> is only supported when sizeof(unsigned long int) == 4");
-#endif
-                }
-            };
-
-            //-----------------------------------------------------------------------------
-            // Min.
-
-            //-----------------------------------------------------------------------------
-            //! The GPU CUDA accelerator atomic operation.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::Min,
-                atomic::AtomicCudaBuiltIn,
-                int,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                __device__ static auto atomicOp(
-                    atomic::AtomicCudaBuiltIn const &,
-                    int * const addr,
-                    int const & value)
-                -> int
-                {
-                    return atomicMin(addr, value);
-                }
-            };
-            //-----------------------------------------------------------------------------
-            //! The GPU CUDA accelerator atomic operation.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::Min,
-                atomic::AtomicCudaBuiltIn,
-                unsigned int,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                __device__ static auto atomicOp(
-                    atomic::AtomicCudaBuiltIn const &,
-                    unsigned int * const addr,
-                    unsigned int const & value)
-                -> unsigned int
-                {
-                    return atomicMin(addr, value);
-                }
-            };
-            //-----------------------------------------------------------------------------
-            //! The GPU CUDA accelerator atomic operation.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::Min,
-                atomic::AtomicCudaBuiltIn,
-                unsigned long int,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                //
-                __device__ static auto atomicOp(
-                    atomic::AtomicCudaBuiltIn const &,
-                    unsigned long int * const addr,
-                    unsigned long int const & value)
-                -> unsigned long int
-                {
-#if UINT_MAX == ULONG_MAX // LLP64
-                    return atomicMin(
-                        reinterpret_cast<unsigned int *>(addr),
-                        static_cast<unsigned int>(value));
-#else // ULONG_MAX == ULLONG_MAX LP64
-#if BOOST_ARCH_PTX >= BOOST_VERSION_NUMBER(3, 5, 0)
-                    return atomicMin(
-                        reinterpret_cast<unsigned long long int *>(addr),
-                        static_cast<unsigned long long int>(value));
-#else
-                    alpaka::ignore_unused(addr);
-                    alpaka::ignore_unused(value);
-                    static_assert(
-                        meta::DependentFalseType<THierarchy>::value,
-                        "atomicOp<op::Min, atomic::AtomicCudaBuiltIn, unsigned long int> is only supported on sm >= 3.5");
-#endif
-#endif
-                }
-            };
-            //-----------------------------------------------------------------------------
-            //! The GPU CUDA accelerator atomic operation.
-            template<
-               typename THierarchy>
-            struct AtomicOp<
-                op::Min,
-                atomic::AtomicCudaBuiltIn,
-                unsigned long long int,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                __device__ static auto atomicOp(
-                    atomic::AtomicCudaBuiltIn const &,
-                    unsigned long long int * const addr,
-                    unsigned long long int const & value)
-                -> unsigned long long int
-                {
-#if BOOST_ARCH_PTX >= BOOST_VERSION_NUMBER(3, 5, 0)
-                    return atomicMin(addr, value);
-#else
-                    alpaka::ignore_unused(addr);
-                    alpaka::ignore_unused(value);
-                    static_assert(
-                        meta::DependentFalseType<THierarchy>::value,
-                        "atomicOp<op::Min, atomic::AtomicCudaBuiltIn, unsigned long long int> is only supported on sm >= 3.5");
-#endif
-                }
-            };
-
-            //-----------------------------------------------------------------------------
-            // Max.
-
-            //-----------------------------------------------------------------------------
-            //! The GPU CUDA accelerator atomic operation.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::Max,
-                atomic::AtomicCudaBuiltIn,
-                int,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                __device__ static auto atomicOp(
-                    atomic::AtomicCudaBuiltIn const &,
-                    int * const addr,
-                    int const & value)
-                -> int
-                {
-                    return atomicMax(addr, value);
-                }
-            };
-            //-----------------------------------------------------------------------------
-            //! The GPU CUDA accelerator atomic operation.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::Max,
-                atomic::AtomicCudaBuiltIn,
-                unsigned int,
-                THierarchy>
-            {
-                __device__ static auto atomicOp(
-                    atomic::AtomicCudaBuiltIn const &,
-                    unsigned int * const addr,
-                    unsigned int const & value)
-                -> unsigned int
-                {
-                    return atomicMax(addr, value);
-                }
-            };
-            //-----------------------------------------------------------------------------
-            //! The GPU CUDA accelerator atomic operation.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::Max,
-                atomic::AtomicCudaBuiltIn,
-                unsigned long int,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                //
-                __device__ static auto atomicOp(
-                    atomic::AtomicCudaBuiltIn const &,
-                    unsigned long int * const addr,
-                    unsigned long int const & value)
-                -> unsigned long int
-                {
-#if UINT_MAX == ULONG_MAX // LLP64
-                    return atomicMax(
-                        reinterpret_cast<unsigned int *>(addr),
-                        static_cast<unsigned int>(value));
-#else // ULONG_MAX == ULLONG_MAX LP64
-#if BOOST_ARCH_PTX >= BOOST_VERSION_NUMBER(3, 5, 0)
-                    return atomicMax(
-                        reinterpret_cast<unsigned long long int *>(addr),
-                        static_cast<unsigned long long int>(value));
-#else
-                    alpaka::ignore_unused(addr);
-                    alpaka::ignore_unused(value);
-                    static_assert(
-                        meta::DependentFalseType<THierarchy>::value,
-                        "atomicOp<op::Max, atomic::AtomicCudaBuiltIn, unsigned long int> is only supported on sm >= 3.5");
-#endif
-#endif
-                }
-            };
-            //-----------------------------------------------------------------------------
-            //! The GPU CUDA accelerator atomic operation.
-            template<
-               typename THierarchy>
-            struct AtomicOp<
-                op::Max,
-                atomic::AtomicCudaBuiltIn,
-                unsigned long long int,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                __device__ static auto atomicOp(
-                    atomic::AtomicCudaBuiltIn const &,
-                    unsigned long long int * const addr,
-                    unsigned long long int const & value)
-                -> unsigned long long int
-                {
-#if BOOST_ARCH_PTX >= BOOST_VERSION_NUMBER(3, 5, 0)
-                    return atomicMax(addr, value);
-#else
-                    alpaka::ignore_unused(addr);
-                    alpaka::ignore_unused(value);
-                    static_assert(
-                        meta::DependentFalseType<THierarchy>::value,
-                        "atomicOp<op::Max, atomic::AtomicCudaBuiltIn, unsigned long long int> is only supported on sm >= 3.5");
-#endif
-                }
-            };
-
-            //-----------------------------------------------------------------------------
-            // Exch.
-
-            //-----------------------------------------------------------------------------
-            //! The GPU CUDA accelerator atomic operation.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::Exch,
-                atomic::AtomicCudaBuiltIn,
-                int,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                __device__ static auto atomicOp(
-                    atomic::AtomicCudaBuiltIn const &,
-                    int * const addr,
-                    int const & value)
-                -> int
-                {
-                    return atomicExch(addr, value);
-                }
-            };
-            //-----------------------------------------------------------------------------
-            //! The GPU CUDA accelerator atomic operation.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::Exch,
-                atomic::AtomicCudaBuiltIn,
-                unsigned int,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                __device__ static auto atomicOp(
-                    atomic::AtomicCudaBuiltIn const &,
-                    unsigned int * const addr,
-                    unsigned int const & value)
-                -> unsigned int
-                {
-                    return atomicExch(addr, value);
-                }
-            };
-            //-----------------------------------------------------------------------------
-            //! The GPU CUDA accelerator atomic operation.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::Exch,
-                atomic::AtomicCudaBuiltIn,
-                unsigned long int,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                //
-                __device__ static auto atomicOp(
-                    atomic::AtomicCudaBuiltIn const &,
-                    unsigned long int * const addr,
-                    unsigned long int const & value)
-                -> unsigned long int
-                {
-#if UINT_MAX == ULONG_MAX // LLP64
-                    return atomicExch(
-                        reinterpret_cast<unsigned int *>(addr),
-                        static_cast<unsigned int>(value));
-#else // ULONG_MAX == ULLONG_MAX LP64
-                    return atomicExch(
-                        reinterpret_cast<unsigned long long int *>(addr),
-                        static_cast<unsigned long long int>(value));
-#endif
-                }
-            };
-            //-----------------------------------------------------------------------------
-            //! The GPU CUDA accelerator atomic operation.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::Exch,
-                atomic::AtomicCudaBuiltIn,
-                unsigned long long int,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                __device__ static auto atomicOp(
-                    atomic::AtomicCudaBuiltIn const &,
-                    unsigned long long int * const addr,
-                    unsigned long long int const & value)
-                -> unsigned long long int
-                {
-                    return atomicExch(addr, value);
-                }
-            };
-            //-----------------------------------------------------------------------------
-            //! The GPU CUDA accelerator atomic operation.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::Exch,
-                atomic::AtomicCudaBuiltIn,
-                float,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                __device__ static auto atomicOp(
-                    atomic::AtomicCudaBuiltIn const &,
-                    float * const addr,
-                    float const & value)
-                -> float
-                {
-                    return atomicExch(addr, value);
-                }
-            };
-
-            //-----------------------------------------------------------------------------
-            // Inc.
-
-            //-----------------------------------------------------------------------------
-            //! The GPU CUDA accelerator atomic operation.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::Inc,
-                atomic::AtomicCudaBuiltIn,
-                unsigned int,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                __device__ static auto atomicOp(
-                    atomic::AtomicCudaBuiltIn const &,
-                    unsigned int * const addr,
-                    unsigned int const & value)
-                -> unsigned int
-                {
-                    return atomicInc(addr, value);
-                }
-            };
-            //-----------------------------------------------------------------------------
-            //! The GPU CUDA accelerator atomic operation.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::Inc,
-                atomic::AtomicCudaBuiltIn,
-                unsigned long int,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                //
-                __device__ static auto atomicOp(
-                    atomic::AtomicCudaBuiltIn const &,
-                    unsigned long int * const addr,
-                    unsigned long int const & value)
-                -> unsigned long int
-                {
-#if UINT_MAX == ULONG_MAX // LLP64
-                    return atomicInc(
-                        reinterpret_cast<unsigned int *>(addr),
-                        static_cast<unsigned int>(value));
-#else // ULONG_MAX == ULLONG_MAX LP64
-                    alpaka::ignore_unused(addr);
-                    alpaka::ignore_unused(value);
-                    static_assert(
-                        meta::DependentFalseType<THierarchy>::value,
-                        "atomicOp<op::Inc, atomic::AtomicCudaBuiltIn, unsigned long int> is only supported when sizeof(unsigned long int) == 4");
-#endif
-                }
-            };
-
-            //-----------------------------------------------------------------------------
-            // Dec.
-
-            //-----------------------------------------------------------------------------
-            //! The GPU CUDA accelerator atomic operation.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::Dec,
-                atomic::AtomicCudaBuiltIn,
-                unsigned int,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                __device__ static auto atomicOp(
-                    atomic::AtomicCudaBuiltIn const &,
-                    unsigned int * const addr,
-                    unsigned int const & value)
-                -> unsigned int
-                {
-                    return atomicDec(addr, value);
-                }
-            };
-            //-----------------------------------------------------------------------------
-            //! The GPU CUDA accelerator atomic operation.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::Dec,
-                atomic::AtomicCudaBuiltIn,
-                unsigned long int,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                //
-                __device__ static auto atomicOp(
-                    atomic::AtomicCudaBuiltIn const &,
-                    unsigned long int * const addr,
-                    unsigned long int const & value)
-                -> unsigned long int
-                {
-#if UINT_MAX == ULONG_MAX // LLP64
-                    return atomicDec(
-                        reinterpret_cast<unsigned int *>(addr),
-                        static_cast<unsigned int>(value));
-#else // ULONG_MAX == ULLONG_MAX LP64
-                    alpaka::ignore_unused(addr);
-                    alpaka::ignore_unused(value);
-                    static_assert(
-                        meta::DependentFalseType<THierarchy>::value,
-                        "atomicOp<op::Dec, atomic::AtomicCudaBuiltIn, unsigned long int> is only supported when sizeof(unsigned long int) == 4");
-#endif
-                }
-            };
-
-            //-----------------------------------------------------------------------------
-            // And.
-
-            //-----------------------------------------------------------------------------
-            //! The GPU CUDA accelerator atomic operation.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::And,
-                atomic::AtomicCudaBuiltIn,
-                int,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                __device__ static auto atomicOp(
-                    atomic::AtomicCudaBuiltIn const &,
-                    int * const addr,
-                    int const & value)
-                -> int
-                {
-                    return atomicAnd(addr, value);
-                }
-            };
-            //-----------------------------------------------------------------------------
-            //! The GPU CUDA accelerator atomic operation.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::And,
-                atomic::AtomicCudaBuiltIn,
-                unsigned int,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                __device__ static auto atomicOp(
-                    atomic::AtomicCudaBuiltIn const &,
-                    unsigned int * const addr,
-                    unsigned int const & value)
-                -> unsigned int
-                {
-                    return atomicAnd(addr, value);
-                }
-            };
-            //-----------------------------------------------------------------------------
-            //! The GPU CUDA accelerator atomic operation.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::And,
-                atomic::AtomicCudaBuiltIn,
-                unsigned long int,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                //
-                __device__ static auto atomicOp(
-                    atomic::AtomicCudaBuiltIn const &,
-                    unsigned long int * const addr,
-                    unsigned long int const & value)
-                -> unsigned long int
-                {
-#if UINT_MAX == ULONG_MAX // LLP64
-                    return atomicAnd(
-                        reinterpret_cast<unsigned int *>(addr),
-                        static_cast<unsigned int>(value));
-#else // ULONG_MAX == ULLONG_MAX LP64
-#if BOOST_ARCH_PTX >= BOOST_VERSION_NUMBER(3, 5, 0)
-                    return atomicAnd(
-                        reinterpret_cast<unsigned long long int *>(addr),
-                        static_cast<unsigned long long int>(value));
-#else
-                    alpaka::ignore_unused(addr);
-                    alpaka::ignore_unused(value);
-                    static_assert(
-                        meta::DependentFalseType<THierarchy>::value,
-                        "atomicOp<op::And, atomic::AtomicCudaBuiltIn, unsigned long int> is only supported on sm >= 3.5");
-#endif
-#endif
-                }
-            };
-            //-----------------------------------------------------------------------------
-            //! The GPU CUDA accelerator atomic operation.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::And,
-                atomic::AtomicCudaBuiltIn,
-                unsigned long long int,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                __device__ static auto atomicOp(
-                    atomic::AtomicCudaBuiltIn const &,
-                    unsigned long long int * const addr,
-                    unsigned long long int const & value)
-                -> unsigned long long int
-                {
-#if BOOST_ARCH_PTX >= BOOST_VERSION_NUMBER(3, 5, 0)
-                    return atomicAnd(addr, value);
-#else
-                    alpaka::ignore_unused(addr);
-                    alpaka::ignore_unused(value);
-                    static_assert(
-                        meta::DependentFalseType<THierarchy>::value,
-                        "atomicOp<op::And, atomic::AtomicCudaBuiltIn, unsigned long long int> is only supported on sm >= 3.5");
-#endif
-                }
-            };
-
-            //-----------------------------------------------------------------------------
-            // Or.
-
-            //-----------------------------------------------------------------------------
-            //! The GPU CUDA accelerator atomic operation.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::Or,
-                atomic::AtomicCudaBuiltIn,
-                int,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                __device__ static auto atomicOp(
-                    atomic::AtomicCudaBuiltIn const &,
-                    int * const addr,
-                    int const & value)
-                -> int
-                {
-                    return atomicOr(addr, value);
-                }
-            };
-            //-----------------------------------------------------------------------------
-            //! The GPU CUDA accelerator atomic operation.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::Or,
-                atomic::AtomicCudaBuiltIn,
-                unsigned int,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                __device__ static auto atomicOp(
-                    atomic::AtomicCudaBuiltIn const &,
-                    unsigned int * const addr,
-                    unsigned int const & value)
-                -> unsigned int
-                {
-                    return atomicOr(addr, value);
-                }
-            };
-            //-----------------------------------------------------------------------------
-            //! The GPU CUDA accelerator atomic operation.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::Or,
-                atomic::AtomicCudaBuiltIn,
-                unsigned long int,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                //
-                __device__ static auto atomicOp(
-                    atomic::AtomicCudaBuiltIn const &,
-                    unsigned long int * const addr,
-                    unsigned long int const & value)
-                -> unsigned long int
-                {
-#if UINT_MAX == ULONG_MAX // LLP64
-                    return atomicOr(
-                        reinterpret_cast<unsigned int *>(addr),
-                        static_cast<unsigned int>(value));
-#else // ULONG_MAX == ULLONG_MAX LP64
-#if BOOST_ARCH_PTX >= BOOST_VERSION_NUMBER(3, 5, 0)
-                    return atomicOr(
-                        reinterpret_cast<unsigned long long int *>(addr),
-                        static_cast<unsigned long long int>(value));
-#else
-                    alpaka::ignore_unused(addr);
-                    alpaka::ignore_unused(value);
-                    static_assert(
-                        meta::DependentFalseType<THierarchy>::value,
-                        "atomicOp<op::Or, atomic::AtomicCudaBuiltIn, unsigned long int> is only supported on sm >= 3.5");
-#endif
-#endif
-                }
-            };
-            //-----------------------------------------------------------------------------
-            //! The GPU CUDA accelerator atomic operation.
-            template<
-               typename THierarchy>
-            struct AtomicOp<
-                op::Or,
-                atomic::AtomicCudaBuiltIn,
-                unsigned long long int,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                __device__ static auto atomicOp(
-                    atomic::AtomicCudaBuiltIn const &,
-                    unsigned long long int * const addr,
-                    unsigned long long int const & value)
-                -> unsigned long long int
-                {
-#if BOOST_ARCH_PTX >= BOOST_VERSION_NUMBER(3, 5, 0)
-                    return atomicOr(addr, value);
-#else
-                    alpaka::ignore_unused(addr);
-                    alpaka::ignore_unused(value);
-                    static_assert(
-                        meta::DependentFalseType<THierarchy>::value,
-                        "atomicOp<op::Or, atomic::AtomicCudaBuiltIn, unsigned long long int> is only supported on sm >= 3.5");
-#endif
-                }
-            };
-
-            //-----------------------------------------------------------------------------
-            // Xor.
-
-            //-----------------------------------------------------------------------------
-            //! The GPU CUDA accelerator atomic operation.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::Xor,
-                atomic::AtomicCudaBuiltIn,
-                int,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                __device__ static auto atomicOp(
-                    atomic::AtomicCudaBuiltIn const &,
-                    int * const addr,
-                    int const & value)
-                -> int
-                {
-                    return atomicXor(addr, value);
-                }
-            };
-            //-----------------------------------------------------------------------------
-            //! The GPU CUDA accelerator atomic operation.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::Xor,
-                atomic::AtomicCudaBuiltIn,
-                unsigned int,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                __device__ static auto atomicOp(
-                    atomic::AtomicCudaBuiltIn const &,
-                    unsigned int * const addr,
-                    unsigned int const & value)
-                -> unsigned int
-                {
-                    return atomicXor(addr, value);
-                }
-            };
-            //-----------------------------------------------------------------------------
-            //! The GPU CUDA accelerator atomic operation.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::Xor,
-                atomic::AtomicCudaBuiltIn,
-                unsigned long int,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                //
-                __device__ static auto atomicOp(
-                    atomic::AtomicCudaBuiltIn const &,
-                    unsigned long int * const addr,
-                    unsigned long int const & value)
-                -> unsigned long int
-                {
-#if UINT_MAX == ULONG_MAX // LLP64
-                    return atomicXor(
-                        reinterpret_cast<unsigned int *>(addr),
-                        static_cast<unsigned int>(value));
-#else // ULONG_MAX == ULLONG_MAX LP64
-#if BOOST_ARCH_PTX >= BOOST_VERSION_NUMBER(3, 5, 0)
-                    return atomicXor(
-                        reinterpret_cast<unsigned long long int *>(addr),
-                        static_cast<unsigned long long int>(value));
-#else
-                    alpaka::ignore_unused(addr);
-                    alpaka::ignore_unused(value);
-                    static_assert(
-                        meta::DependentFalseType<THierarchy>::value,
-                        "atomicOp<op::Xor, atomic::AtomicCudaBuiltIn, unsigned long int> is only supported on sm >= 3.5");
-#endif
-#endif
-                }
-            };
-            //-----------------------------------------------------------------------------
-            //! The GPU CUDA accelerator atomic operation.
-            template<
-               typename THierarchy>
-            struct AtomicOp<
-                op::Xor,
-                atomic::AtomicCudaBuiltIn,
-                unsigned long long int,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                __device__ static auto atomicOp(
-                    atomic::AtomicCudaBuiltIn const &,
-                    unsigned long long int * const addr,
-                    unsigned long long int const & value)
-                -> unsigned long long int
-                {
-#if BOOST_ARCH_PTX >= BOOST_VERSION_NUMBER(3, 5, 0)
-                    return atomicXor(addr, value);
-#else
-                    alpaka::ignore_unused(addr);
-                    alpaka::ignore_unused(value);
-                    static_assert(
-                        meta::DependentFalseType<THierarchy>::value,
-                        "atomicOp<op::Xor, atomic::AtomicCudaBuiltIn, unsigned long long int> is only supported on sm >= 3.5");
-#endif
-                }
-            };
-
-            //-----------------------------------------------------------------------------
-            // Cas.
-
-            //-----------------------------------------------------------------------------
-            //! The GPU CUDA accelerator atomic operation.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::Cas,
-                atomic::AtomicCudaBuiltIn,
-                int,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                __device__ static auto atomicOp(
-                    atomic::AtomicCudaBuiltIn const &,
-                    int * const addr,
-                    int const & compare,
-                    int const & value)
-                -> int
-                {
-                    return atomicCAS(addr, compare, value);
-                }
-            };
-            //-----------------------------------------------------------------------------
-            //! The GPU CUDA accelerator atomic operation.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::Cas,
-                atomic::AtomicCudaBuiltIn,
-                unsigned int,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                __device__ static auto atomicOp(
-                    atomic::AtomicCudaBuiltIn const &,
-                    unsigned int * const addr,
-                    unsigned int const & compare,
-                    unsigned int const & value)
-                -> unsigned int
-                {
-                    return atomicCAS(addr, compare, value);
-                }
-            };
-            //-----------------------------------------------------------------------------
-            //! The GPU CUDA accelerator atomic operation.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::Cas,
-                atomic::AtomicCudaBuiltIn,
-                unsigned long int,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                //
-                __device__ static auto atomicOp(
-                    atomic::AtomicCudaBuiltIn const &,
-                    unsigned long int * const addr,
-                    unsigned long int const & compare,
-                    unsigned long int const & value)
-                -> unsigned long int
-                {
-#if UINT_MAX == ULONG_MAX // LLP64
-                    return atomicCAS(
-                        reinterpret_cast<unsigned int *>(addr),
-                        static_cast<unsigned int>(compare),
-                        static_cast<unsigned int>(value));
-#else // ULONG_MAX == ULLONG_MAX LP64
-                    return atomicCAS(
-                        reinterpret_cast<unsigned long long int *>(addr),
-                        static_cast<unsigned long long int>(compare),
-                        static_cast<unsigned long long int>(value));
-#endif
-                }
-            };
-            //-----------------------------------------------------------------------------
-            //! The GPU CUDA accelerator atomic operation.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::Cas,
-                atomic::AtomicCudaBuiltIn,
-                unsigned long long int,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                __device__ static auto atomicOp(
-                    atomic::AtomicCudaBuiltIn const &,
-                    unsigned long long int * const addr,
-                    unsigned long long int const & compare,
-                    unsigned long long int const & value)
-                -> unsigned long long int
-                {
-                    return atomicCAS(addr, compare, value);
-                }
-            };
-
-            //#############################################################################
-            //! The GPU CUDA accelerator atomic operation.
-            template<
-                typename TOp,
-                typename T,
-                typename THierarchy>
-            struct AtomicOp<
-                TOp,
-                atomic::AtomicCudaBuiltIn,
-                T,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                __device__ static auto atomicOp(
-                    atomic::AtomicCudaBuiltIn const & atomic,
-                    T * const addr,
-                    T const & value)
-                -> T
-                {
-                    alpaka::ignore_unused(atomic);
-                    alpaka::ignore_unused(addr);
-                    alpaka::ignore_unused(value);
-                    static_assert(
-                        meta::DependentFalseType<THierarchy>::value,
-                        "atomicOp<TOp, atomic::AtomicCudaBuiltIn, T>(atomic, addr, value) is not supported!");
-
-                    return T();
-                }
-                //-----------------------------------------------------------------------------
-                __device__ static auto atomicOp(
-                    atomic::AtomicCudaBuiltIn const & atomic,
-                    T * const addr,
-                    T const & compare,
-                    T const & value)
-                -> T
-                {
-                    alpaka::ignore_unused(atomic);
-                    alpaka::ignore_unused(addr);
-                    alpaka::ignore_unused(compare);
-                    alpaka::ignore_unused(value);
-                    static_assert(
-                        meta::DependentFalseType<THierarchy>::value,
-                        "atomicOp<TOp, atomic::AtomicCudaBuiltIn, T>(atomic, addr, compare, value) is not supported!");
-
-                    return T();
-                }
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/alpaka/include/alpaka/atomic/AtomicHierarchy.hpp b/thirdParty/alpaka/include/alpaka/atomic/AtomicHierarchy.hpp
deleted file mode 100644
index 8f9141fb88..0000000000
--- a/thirdParty/alpaka/include/alpaka/atomic/AtomicHierarchy.hpp
+++ /dev/null
@@ -1,55 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz, René Widera
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#include <alpaka/atomic/Traits.hpp>
-
-#include <alpaka/meta/InheritFromList.hpp>
-#include <alpaka/meta/Unique.hpp>
-
-#include <tuple>
-
-namespace alpaka
-{
-    namespace atomic
-    {
-
-        //#############################################################################
-        //! build a single class to inherit from different atomic implementations
-        //
-        //  This implementation inherit from all three hierarchies.
-        //  The multiple usage of the same type for different levels is allowed.
-        //  The class provide the feature that each atomic operation can be focused
-        //  to a hierarchy level in Alpaka. A operation to a hierarchy is independent
-        //  to the memory hierarchy.
-        //
-        //  \tparam TGridAtomic atomic implementation for atomic operations between grids within a device
-        //  \tparam TBlockAtomic atomic implementation for atomic operations between blocks within a grid
-        //  \tparam TThreadAtomic atomic implementation for atomic operations between threads within a block
-        template<
-            typename TGridAtomic,
-            typename TBlockAtomic,
-            typename TThreadAtomic
-        >
-        using AtomicHierarchy
-            = alpaka::meta::InheritFromList<
-                alpaka::meta::Unique<
-                    std::tuple<
-                        TGridAtomic,
-                        TBlockAtomic,
-                        TThreadAtomic,
-                        concepts::Implements<ConceptAtomicGrids, TGridAtomic>,
-                        concepts::Implements<ConceptAtomicBlocks, TBlockAtomic>,
-                        concepts::Implements<ConceptAtomicThreads, TThreadAtomic>
-                    >
-                >
-            >;
-    }
-}
diff --git a/thirdParty/alpaka/include/alpaka/atomic/AtomicHipBuiltIn.hpp b/thirdParty/alpaka/include/alpaka/atomic/AtomicHipBuiltIn.hpp
deleted file mode 100644
index f2fc337b60..0000000000
--- a/thirdParty/alpaka/include/alpaka/atomic/AtomicHipBuiltIn.hpp
+++ /dev/null
@@ -1,1201 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz, Matthias Werner, René Widera
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_HIP
-    #error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
-#endif
-
-#include <alpaka/core/Unused.hpp>
-#include <alpaka/atomic/Op.hpp>
-#include <alpaka/atomic/Traits.hpp>
-#include <alpaka/meta/DependentFalseType.hpp>
-
-#include <climits>
-
-namespace alpaka
-{
-    namespace atomic
-    {
-        //#############################################################################
-        //! The GPU HIP accelerator atomic ops.
-        //
-        //  Atomics can used in the hierarchy level grids, blocks and threads.
-        //  Atomics are not guaranteed to be save between devices
-        class AtomicHipBuiltIn
-        {
-        public:
-            //-----------------------------------------------------------------------------
-            AtomicHipBuiltIn() = default;
-            //-----------------------------------------------------------------------------
-            __device__ AtomicHipBuiltIn(AtomicHipBuiltIn const &) = delete;
-            //-----------------------------------------------------------------------------
-            __device__ AtomicHipBuiltIn(AtomicHipBuiltIn &&) = delete;
-            //-----------------------------------------------------------------------------
-            __device__ auto operator=(AtomicHipBuiltIn const &) -> AtomicHipBuiltIn & = delete;
-            //-----------------------------------------------------------------------------
-            __device__ auto operator=(AtomicHipBuiltIn &&) -> AtomicHipBuiltIn & = delete;
-            //-----------------------------------------------------------------------------
-            /*virtual*/ ALPAKA_FN_HOST_ACC ~AtomicHipBuiltIn() = default;
-        };
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The specializations to execute the requested atomic ops of the HIP accelerator.
-            // See: http://docs.nvidia.com/cuda/cuda-c-programming-guide/#atomic-functions how to implement everything with CAS
-
-            //-----------------------------------------------------------------------------
-            // Add.
-
-            //-----------------------------------------------------------------------------
-            //! The GPU HIP accelerator atomic operation.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::Add,
-                atomic::AtomicHipBuiltIn,
-                int,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                __device__ static auto atomicOp(
-                    atomic::AtomicHipBuiltIn const &,
-                    int * const addr,
-                    int const & value)
-                -> int
-                {
-                    return atomicAdd(addr, value);
-                }
-            };
-            //-----------------------------------------------------------------------------
-            //! The GPU HIP accelerator atomic operation.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::Add,
-                atomic::AtomicHipBuiltIn,
-                unsigned int,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                __device__ static auto atomicOp(
-                    atomic::AtomicHipBuiltIn const &,
-                    unsigned int * const addr,
-                    unsigned int const & value)
-                -> unsigned int
-                {
-                    return atomicAdd(addr, value);
-                }
-            };
-            //-----------------------------------------------------------------------------
-            //! The GPU HIP accelerator atomic operation.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::Add,
-                atomic::AtomicHipBuiltIn,
-                unsigned long int,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                //
-                __device__ static auto atomicOp(
-                    atomic::AtomicHipBuiltIn const &,
-                    unsigned long int * const addr,
-                    unsigned long int const & value)
-                -> unsigned long int {
-#if UINT_MAX == ULONG_MAX // LLP64
-                    return atomicAdd(
-                        reinterpret_cast<unsigned int *>(addr),
-                        static_cast<unsigned int>(value));
-#else // ULONG_MAX == ULLONG_MAX LP64
-                    return atomicAdd(
-                        reinterpret_cast<unsigned long long int *>(addr),
-                        static_cast<unsigned long long int>(value));
-#endif
-                }
-            };
-            //-----------------------------------------------------------------------------
-            //! The GPU HIP accelerator atomic operation.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::Add,
-                atomic::AtomicHipBuiltIn,
-                unsigned long long int,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                __device__ static auto atomicOp(
-                    atomic::AtomicHipBuiltIn const &,
-                    unsigned long long int * const addr,
-                    unsigned long long int const & value)
-                -> unsigned long long int
-                {
-                    return atomicAdd(addr, value);
-                }
-            };
-            //-----------------------------------------------------------------------------
-            //! The GPU HIP accelerator atomic operation.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::Add,
-                atomic::AtomicHipBuiltIn,
-                float,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                __device__ static auto atomicOp(
-                    atomic::AtomicHipBuiltIn const &,
-                    float * const addr,
-                    float const & value)
-                -> float
-                {
-                    return atomicAdd(addr, value);
-                }
-            };
-            //-----------------------------------------------------------------------------
-            //! The GPU HIP accelerator atomic operation.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::Add,
-                atomic::AtomicHipBuiltIn,
-                double,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                __device__ static auto atomicOp(
-                    atomic::AtomicHipBuiltIn const &,
-                    double * const addr,
-                    double const & value)
-                -> double
-                {
-#if BOOST_ARCH_PTX >= BOOST_VERSION_NUMBER(6,0,0)
-                    return atomicAdd(addr, value);
-#else
-                    // Code from: http://docs.nvidia.com/cuda/cuda-c-programming-guide/#atomic-functions
-
-                    unsigned long long int * address_as_ull(reinterpret_cast<unsigned long long int *>(addr));
-                    unsigned long long int old(*address_as_ull);
-                    unsigned long long int assumed;
-                    do
-                    {
-                        assumed = old;
-                        old = atomicCAS(
-                            address_as_ull,
-                            assumed,
-                            static_cast<unsigned long long>(__double_as_longlong(value + __longlong_as_double(static_cast<long long>(assumed)))));
-                        // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
-                    }
-                    while(assumed != old);
-                    return __longlong_as_double(static_cast<long long>(old));
-#endif
-
-                }
-            };
-            //-----------------------------------------------------------------------------
-            // Sub.
-
-            //-----------------------------------------------------------------------------
-            //! The GPU HIP accelerator atomic operation.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::Sub,
-                atomic::AtomicHipBuiltIn,
-                int,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-
-                __device__ static auto atomicOp(
-                    atomic::AtomicHipBuiltIn const &,
-                    int * const addr,
-                    int const & value)
-                -> int
-                {
-                    return atomicSub(addr, value);
-                }
-            };
-            //-----------------------------------------------------------------------------
-            //! AtomicOp specialization: op::Sub on unsigned int for AtomicHipBuiltIn; forwards to atomicSub.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::Sub,
-                atomic::AtomicHipBuiltIn,
-                unsigned int,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                __device__ static auto atomicOp(
-                    atomic::AtomicHipBuiltIn const &, // unnamed, hence unused in the body
-                    unsigned int * const addr,
-                    unsigned int const & value)
-                -> unsigned int
-                {
-                    return atomicSub(addr, value); // the built-in's result is returned unchanged
-                }
-            };
-
-            //-----------------------------------------------------------------------------
-            //! AtomicOp specialization: op::Sub on unsigned long int for AtomicHipBuiltIn.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::Sub,
-                atomic::AtomicHipBuiltIn,
-                unsigned long int,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                //! Uses the unsigned int atomicSub when UINT_MAX == ULONG_MAX; any other configuration hits the static_assert.
-                __device__ static auto atomicOp(
-                    atomic::AtomicHipBuiltIn const &,
-                    unsigned long int * const addr,
-                    unsigned long int const & value)
-                -> unsigned long int {
-#if UINT_MAX == ULONG_MAX // LLP64
-                    return atomicSub(
-                        reinterpret_cast<unsigned int *>(addr),
-                        static_cast<unsigned int>(value));
-#else // ULONG_MAX == ULLONG_MAX LP64
-                    alpaka::ignore_unused(addr);
-                    alpaka::ignore_unused(value);
-                    static_assert(
-                        meta::DependentFalseType<THierarchy>::value, // NOTE(review): presumably always false but THierarchy-dependent, so it fires only on instantiation - confirm
-                        "atomicOp<op::Sub, atomic::AtomicHipBuiltIn, unsigned long int> is only supported when sizeof(unsigned long int) == 4");
-#endif
-                }
-            };
-
-            //-----------------------------------------------------------------------------
-            // Min.
-
-            //-----------------------------------------------------------------------------
-            //! AtomicOp specialization: op::Min on int for AtomicHipBuiltIn; forwards to atomicMin.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::Min,
-                atomic::AtomicHipBuiltIn,
-                int,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                __device__ static auto atomicOp(
-                    atomic::AtomicHipBuiltIn const &, // unnamed, hence unused in the body
-                    int * const addr,
-                    int const & value)
-                -> int
-                {
-                    return atomicMin(addr, value); // the built-in's result is returned unchanged
-                }
-            };
-            //-----------------------------------------------------------------------------
-            //! AtomicOp specialization: op::Min on unsigned int for AtomicHipBuiltIn.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::Min,
-                atomic::AtomicHipBuiltIn,
-                unsigned int,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                //! Calls atomicMin(addr, value) and returns its result; the unnamed accelerator argument is not used.
-                __device__ static auto atomicOp(
-                    atomic::AtomicHipBuiltIn const &,
-                    unsigned int * const addr,
-                    unsigned int const & value)
-                -> unsigned int
-                {
-                    return atomicMin(addr, value);
-                }
-            };
-            //-----------------------------------------------------------------------------
-            //! AtomicOp specialization: op::Min on unsigned long int for AtomicHipBuiltIn.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::Min,
-                atomic::AtomicHipBuiltIn,
-                unsigned long int,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                //! Calls atomicMin through an unsigned int cast when UINT_MAX == ULONG_MAX, else through an unsigned long long int cast if BOOST_ARCH_PTX >= 3.5.0; otherwise the static_assert is compiled.
-                __device__ static auto atomicOp(
-                    atomic::AtomicHipBuiltIn const &,
-                    unsigned long int * const addr,
-                    unsigned long int const & value)
-                -> unsigned long int {
-#if UINT_MAX == ULONG_MAX // LLP64
-                    return atomicMin(
-                        reinterpret_cast<unsigned int *>(addr),
-                        static_cast<unsigned int>(value));
-#else // ULONG_MAX == ULLONG_MAX LP64
-# if BOOST_ARCH_PTX >= BOOST_VERSION_NUMBER(3, 5, 0) // NOTE(review): PTX version gate inside HIP code - confirm intended
-                    return atomicMin(
-                        reinterpret_cast<unsigned long long int *>(addr),
-                        static_cast<unsigned long long int>(value));
-# else
-                    alpaka::ignore_unused(addr);
-                    alpaka::ignore_unused(value);
-                    static_assert(
-                        meta::DependentFalseType<THierarchy>::value, // NOTE(review): presumably always false but THierarchy-dependent, so it fires only on instantiation - confirm
-                        "atomicOp<op::Min, atomic::AtomicHipBuiltIn, unsigned long int> is only supported on sm >= 3.5");
-# endif
-#endif
-                }
-            };
-            //-----------------------------------------------------------------------------
-            //! AtomicOp specialization: op::Min on unsigned long long int for AtomicHipBuiltIn; forwards to atomicMin when BOOST_ARCH_PTX >= 3.5.0.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::Min,
-                atomic::AtomicHipBuiltIn,
-                unsigned long long int,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                __device__ static auto atomicOp(
-                    atomic::AtomicHipBuiltIn const &,
-                    unsigned long long int * const addr,
-                    unsigned long long int const & value)
-                -> unsigned long long int
-                {
-#if BOOST_ARCH_PTX >= BOOST_VERSION_NUMBER(3, 5, 0) // NOTE(review): PTX version gate inside HIP code - confirm intended
-                    return atomicMin(addr, value);
-#else // condition above not met: no built-in call, the static_assert below is compiled instead
-                    alpaka::ignore_unused(addr);
-                    alpaka::ignore_unused(value);
-                    static_assert(
-                        meta::DependentFalseType<THierarchy>::value, // NOTE(review): presumably always false but THierarchy-dependent, so it fires only on instantiation - confirm
-                        "atomicOp<op::Min, atomic::AtomicHipBuiltIn, unsigned long long int> is only supported on sm >= 3.5");
-#endif
-                }
-            };
-            //-----------------------------------------------------------------------------
-            // Max.
-
-            //-----------------------------------------------------------------------------
-            //! AtomicOp specialization: op::Max on int for AtomicHipBuiltIn; forwards to atomicMax.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::Max,
-                atomic::AtomicHipBuiltIn,
-                int,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                __device__ static auto atomicOp(
-                    atomic::AtomicHipBuiltIn const &, // unnamed, hence unused in the body
-                    int * const addr,
-                    int const & value)
-                -> int
-                {
-                    return atomicMax(addr, value); // the built-in's result is returned unchanged
-                }
-            };
-            //-----------------------------------------------------------------------------
-            //! AtomicOp specialization: op::Max on unsigned int for AtomicHipBuiltIn; forwards to atomicMax.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::Max,
-                atomic::AtomicHipBuiltIn,
-                unsigned int,
-                THierarchy>
-            {
-                __device__ static auto atomicOp(
-                    atomic::AtomicHipBuiltIn const &, // unnamed, hence unused in the body
-                    unsigned int * const addr,
-                    unsigned int const & value)
-                -> unsigned int
-                {
-                    return atomicMax(addr, value); // the built-in's result is returned unchanged
-                }
-            };
-            //-----------------------------------------------------------------------------
-            //! AtomicOp specialization: op::Max on unsigned long int for AtomicHipBuiltIn.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::Max,
-                atomic::AtomicHipBuiltIn,
-                unsigned long int,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                //! Calls atomicMax through an unsigned int cast when UINT_MAX == ULONG_MAX, else through an unsigned long long int cast if BOOST_ARCH_PTX >= 3.5.0; otherwise the static_assert is compiled.
-                __device__ static auto atomicOp(
-                    atomic::AtomicHipBuiltIn const &,
-                    unsigned long int * const addr,
-                    unsigned long int const & value)
-                -> unsigned long int
-                {
-#if UINT_MAX == ULONG_MAX // LLP64
-                    return atomicMax(
-                        reinterpret_cast<unsigned int *>(addr),
-                        static_cast<unsigned int>(value));
-#else // ULONG_MAX == ULLONG_MAX LP64
-# if BOOST_ARCH_PTX >= BOOST_VERSION_NUMBER(3, 5, 0) // NOTE(review): PTX version gate inside HIP code - confirm intended
-                    return atomicMax(
-                        reinterpret_cast<unsigned long long int *>(addr),
-                        static_cast<unsigned long long int>(value));
-# else
-                    alpaka::ignore_unused(addr);
-                    alpaka::ignore_unused(value);
-                    static_assert(
-                        meta::DependentFalseType<THierarchy>::value, // NOTE(review): presumably always false but THierarchy-dependent, so it fires only on instantiation - confirm
-                        "atomicOp<op::Max, atomic::AtomicHipBuiltIn, unsigned long int> is only supported on sm >= 3.5");
-# endif
-#endif
-                }
-            };
-          //-----------------------------------------------------------------------------
-          //! AtomicOp specialization: op::Max on unsigned long long int for AtomicHipBuiltIn; forwards to atomicMax when BOOST_ARCH_PTX >= 3.5.0.
-          template<
-            typename THierarchy>
-          struct AtomicOp<
-            op::Max,
-            atomic::AtomicHipBuiltIn,
-            unsigned long long int,
-            THierarchy>
-          {
-            //-----------------------------------------------------------------------------
-            __device__ static auto atomicOp(
-              atomic::AtomicHipBuiltIn const &,
-              unsigned long long int * const addr,
-              unsigned long long int const & value)
-              -> unsigned long long int
-              {
-#if BOOST_ARCH_PTX >= BOOST_VERSION_NUMBER(3, 5, 0) // NOTE(review): PTX version gate inside HIP code - confirm intended
-                return atomicMax(addr, value);
-#else // condition above not met: no built-in call, the static_assert below is compiled instead
-                alpaka::ignore_unused(addr);
-                alpaka::ignore_unused(value);
-                static_assert(
-                  meta::DependentFalseType<THierarchy>::value, // NOTE(review): presumably always false but THierarchy-dependent, so it fires only on instantiation - confirm
-                  "atomicOp<op::Max, atomic::AtomicHipBuiltIn, unsigned long long int> is only supported on sm >= 3.5");
-#endif
-                    }
-            };
-
-            //-----------------------------------------------------------------------------
-            // Exch.
-
-            //-----------------------------------------------------------------------------
-            //! AtomicOp specialization: op::Exch on int for AtomicHipBuiltIn; forwards to atomicExch.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::Exch,
-                atomic::AtomicHipBuiltIn,
-                int,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                __device__ static auto atomicOp(
-                    atomic::AtomicHipBuiltIn const &, // unnamed, hence unused in the body
-                    int * const addr,
-                    int const & value)
-                -> int
-                {
-                    return atomicExch(addr, value); // the built-in's result is returned unchanged
-                }
-            };
-            //-----------------------------------------------------------------------------
-            //! AtomicOp specialization: op::Exch on unsigned int for AtomicHipBuiltIn; forwards to atomicExch.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::Exch,
-                atomic::AtomicHipBuiltIn,
-                unsigned int,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                __device__ static auto atomicOp(
-                    atomic::AtomicHipBuiltIn const &, // unnamed, hence unused in the body
-                    unsigned int * const addr,
-                    unsigned int const & value)
-                -> unsigned int
-                {
-                    return atomicExch(addr, value); // the built-in's result is returned unchanged
-                }
-            };
-            //-----------------------------------------------------------------------------
-            //! AtomicOp specialization: op::Exch on unsigned long int for AtomicHipBuiltIn.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::Exch,
-                atomic::AtomicHipBuiltIn,
-                unsigned long int,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                //! Calls atomicExch through an unsigned int cast when UINT_MAX == ULONG_MAX, otherwise through an unsigned long long int cast.
-                __device__ static auto atomicOp(
-                    atomic::AtomicHipBuiltIn const &,
-                    unsigned long int * const addr,
-                    unsigned long int const & value)
-                -> unsigned long int
-                {
-#if UINT_MAX == ULONG_MAX // LLP64
-                    return atomicExch(
-                        reinterpret_cast<unsigned int *>(addr),
-                        static_cast<unsigned int>(value));
-#else // ULONG_MAX == ULLONG_MAX LP64
-                    return atomicExch(
-                        reinterpret_cast<unsigned long long int *>(addr),
-                        static_cast<unsigned long long int>(value));
-#endif
-                }
-            };
-            //-----------------------------------------------------------------------------
-            //! AtomicOp specialization: op::Exch on unsigned long long int for AtomicHipBuiltIn; forwards to atomicExch.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::Exch,
-                atomic::AtomicHipBuiltIn,
-                unsigned long long int,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                __device__ static auto atomicOp(
-                    atomic::AtomicHipBuiltIn const &, // unnamed, hence unused in the body
-                    unsigned long long int * const addr,
-                    unsigned long long int const & value)
-                -> unsigned long long int
-                {
-                    return atomicExch(addr, value); // the built-in's result is returned unchanged
-                }
-            };
-            //-----------------------------------------------------------------------------
-            //! AtomicOp specialization: op::Exch on float for AtomicHipBuiltIn; forwards to atomicExch.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::Exch,
-                atomic::AtomicHipBuiltIn,
-                float,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                __device__ static auto atomicOp(
-                    atomic::AtomicHipBuiltIn const &, // unnamed, hence unused in the body
-                    float * const addr,
-                    float const & value)
-                -> float
-                {
-                    return atomicExch(addr, value); // the built-in's result is returned unchanged
-                }
-            };
-
-            //-----------------------------------------------------------------------------
-            // Inc.
-
-            //-----------------------------------------------------------------------------
-            //! AtomicOp specialization: op::Inc on unsigned int for AtomicHipBuiltIn; forwards to atomicInc.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::Inc,
-                atomic::AtomicHipBuiltIn,
-                unsigned int,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                __device__ static auto atomicOp(
-                    atomic::AtomicHipBuiltIn const &, // unnamed, hence unused in the body
-                    unsigned int * const addr,
-                    unsigned int const & value)
-                -> unsigned int
-                {
-                    return atomicInc(addr, value); // the built-in's result is returned unchanged
-                }
-            };
-            //-----------------------------------------------------------------------------
-            //! AtomicOp specialization: op::Inc on unsigned long int for AtomicHipBuiltIn.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::Inc,
-                atomic::AtomicHipBuiltIn,
-                unsigned long int,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                //! Uses the unsigned int atomicInc when UINT_MAX == ULONG_MAX; any other configuration hits the static_assert.
-                __device__ static auto atomicOp(
-                    atomic::AtomicHipBuiltIn const &,
-                    unsigned long int * const addr,
-                    unsigned long int const & value)
-                -> unsigned long int
-                {
-#if UINT_MAX == ULONG_MAX // LLP64
-                    return atomicInc(
-                        reinterpret_cast<unsigned int *>(addr),
-                        static_cast<unsigned int>(value));
-#else // ULONG_MAX == ULLONG_MAX LP64
-                    alpaka::ignore_unused(addr);
-                    alpaka::ignore_unused(value);
-                    static_assert(
-                        meta::DependentFalseType<THierarchy>::value, // NOTE(review): presumably always false but THierarchy-dependent, so it fires only on instantiation - confirm
-                        "atomicOp<op::Inc, atomic::AtomicHipBuiltIn, unsigned long int> is only supported when sizeof(unsigned long int) == 4");
-#endif
-                }
-            };
-
-            //-----------------------------------------------------------------------------
-            // Dec.
-
-            //-----------------------------------------------------------------------------
-            //! AtomicOp specialization: op::Dec on unsigned int for AtomicHipBuiltIn; forwards to atomicDec.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::Dec,
-                atomic::AtomicHipBuiltIn,
-                unsigned int,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                __device__ static auto atomicOp(
-                    atomic::AtomicHipBuiltIn const &, // unnamed, hence unused in the body
-                    unsigned int * const addr,
-                    unsigned int const & value)
-                -> unsigned int
-                {
-                    return atomicDec(addr, value); // the built-in's result is returned unchanged
-                }
-            };
-            //-----------------------------------------------------------------------------
-            //! AtomicOp specialization: op::Dec on unsigned long int for AtomicHipBuiltIn.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::Dec,
-                atomic::AtomicHipBuiltIn,
-                unsigned long int,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                //! Uses the unsigned int atomicDec when UINT_MAX == ULONG_MAX; any other configuration hits the static_assert.
-                __device__ static auto atomicOp(
-                    atomic::AtomicHipBuiltIn const &,
-                    unsigned long int * const addr,
-                    unsigned long int const & value)
-                -> unsigned long int
-                {
-#if UINT_MAX == ULONG_MAX // LLP64
-                    return atomicDec(
-                        reinterpret_cast<unsigned int *>(addr),
-                        static_cast<unsigned int>(value));
-#else // ULONG_MAX == ULLONG_MAX LP64
-                    alpaka::ignore_unused(addr);
-                    alpaka::ignore_unused(value);
-                    static_assert(
-                        meta::DependentFalseType<THierarchy>::value, // NOTE(review): presumably always false but THierarchy-dependent, so it fires only on instantiation - confirm
-                        "atomicOp<op::Dec, atomic::AtomicHipBuiltIn, unsigned long int> is only supported when sizeof(unsigned long int) == 4");
-#endif
-                }
-            };
-
-            //-----------------------------------------------------------------------------
-            // And.
-
-            //-----------------------------------------------------------------------------
-            //! AtomicOp specialization: op::And on int for AtomicHipBuiltIn; forwards to atomicAnd.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::And,
-                atomic::AtomicHipBuiltIn,
-                int,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                __device__ static auto atomicOp(
-                    atomic::AtomicHipBuiltIn const &, // unnamed, hence unused in the body
-                    int * const addr,
-                    int const & value)
-                -> int
-                {
-                    return atomicAnd(addr, value); // the built-in's result is returned unchanged
-                }
-            };
-            //-----------------------------------------------------------------------------
-            //! AtomicOp specialization: op::And on unsigned int for AtomicHipBuiltIn; forwards to atomicAnd.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::And,
-                atomic::AtomicHipBuiltIn,
-                unsigned int,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                __device__ static auto atomicOp(
-                    atomic::AtomicHipBuiltIn const &, // unnamed, hence unused in the body
-                    unsigned int * const addr,
-                    unsigned int const & value)
-                -> unsigned int
-                {
-                    return atomicAnd(addr, value); // the built-in's result is returned unchanged
-                }
-            };
-            //-----------------------------------------------------------------------------
-            //! AtomicOp specialization: op::And on unsigned long int for AtomicHipBuiltIn.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::And,
-                atomic::AtomicHipBuiltIn,
-                unsigned long int,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                //! Calls atomicAnd through an unsigned int cast when UINT_MAX == ULONG_MAX, else through an unsigned long long int cast if BOOST_ARCH_PTX >= 3.5.0; otherwise the static_assert is compiled.
-                __device__ static auto atomicOp(
-                    atomic::AtomicHipBuiltIn const &,
-                    unsigned long int * const addr,
-                    unsigned long int const & value)
-                -> unsigned long int
-                {
-#if UINT_MAX == ULONG_MAX // LLP64
-                    return atomicAnd(
-                        reinterpret_cast<unsigned int *>(addr),
-                        static_cast<unsigned int>(value));
-#else // ULONG_MAX == ULLONG_MAX LP64
-# if BOOST_ARCH_PTX >= BOOST_VERSION_NUMBER(3, 5, 0) // NOTE(review): PTX version gate inside HIP code - confirm intended
-                    return atomicAnd(
-                        reinterpret_cast<unsigned long long int *>(addr),
-                        static_cast<unsigned long long int>(value));
-# else
-                    alpaka::ignore_unused(addr);
-                    alpaka::ignore_unused(value);
-                    static_assert(
-                        meta::DependentFalseType<THierarchy>::value, // NOTE(review): presumably always false but THierarchy-dependent, so it fires only on instantiation - confirm
-                        "atomicOp<op::And, atomic::AtomicHipBuiltIn, unsigned long int> is only supported on sm >= 3.5");
-# endif
-#endif
-                }
-            };
-            //-----------------------------------------------------------------------------
-            //! AtomicOp specialization: op::And on unsigned long long int for AtomicHipBuiltIn; forwards to atomicAnd when BOOST_ARCH_PTX >= 3.5.0.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::And,
-                atomic::AtomicHipBuiltIn,
-                unsigned long long int,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                __device__ static auto atomicOp(
-                    atomic::AtomicHipBuiltIn const &,
-                    unsigned long long int * const addr,
-                    unsigned long long int const & value)
-                -> unsigned long long int
-                {
-#if BOOST_ARCH_PTX >= BOOST_VERSION_NUMBER(3, 5, 0) // NOTE(review): PTX version gate inside HIP code - confirm intended
-                    return atomicAnd(addr, value);
-#else // condition above not met: no built-in call, the static_assert below is compiled instead
-                    alpaka::ignore_unused(addr);
-                    alpaka::ignore_unused(value);
-                    static_assert(
-                        meta::DependentFalseType<THierarchy>::value, // NOTE(review): presumably always false but THierarchy-dependent, so it fires only on instantiation - confirm
-                        "atomicOp<op::And, atomic::AtomicHipBuiltIn, unsigned long long int> is only supported on sm >= 3.5");
-#endif
-                }
-            };
-            //-----------------------------------------------------------------------------
-            // Or.
-
-            //-----------------------------------------------------------------------------
-            //! AtomicOp specialization: op::Or on int for AtomicHipBuiltIn; forwards to atomicOr.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::Or,
-                atomic::AtomicHipBuiltIn,
-                int,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                __device__ static auto atomicOp(
-                    atomic::AtomicHipBuiltIn const &, // unnamed, hence unused in the body
-                    int * const addr,
-                    int const & value)
-                -> int
-                {
-                    return atomicOr(addr, value); // the built-in's result is returned unchanged
-                }
-            };
-            //-----------------------------------------------------------------------------
-            //! AtomicOp specialization: op::Or on unsigned int for AtomicHipBuiltIn; forwards to atomicOr.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::Or,
-                atomic::AtomicHipBuiltIn,
-                unsigned int,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                __device__ static auto atomicOp(
-                    atomic::AtomicHipBuiltIn const &, // unnamed, hence unused in the body
-                    unsigned int * const addr,
-                    unsigned int const & value)
-                -> unsigned int
-                {
-                    return atomicOr(addr, value); // the built-in's result is returned unchanged
-                }
-            };
-            //-----------------------------------------------------------------------------
-            //! AtomicOp specialization: op::Or on unsigned long int for AtomicHipBuiltIn.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::Or,
-                atomic::AtomicHipBuiltIn,
-                unsigned long int,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                //! Calls atomicOr through an unsigned int cast when UINT_MAX == ULONG_MAX, else through an unsigned long long int cast if BOOST_ARCH_PTX >= 3.5.0; otherwise the static_assert is compiled.
-                __device__ static auto atomicOp(
-                    atomic::AtomicHipBuiltIn const &,
-                    unsigned long int * const addr,
-                    unsigned long int const & value)
-                -> unsigned long int
-                {
-#if UINT_MAX == ULONG_MAX // LLP64
-                    return atomicOr(
-                        reinterpret_cast<unsigned int *>(addr),
-                        static_cast<unsigned int>(value));
-#else // ULONG_MAX == ULLONG_MAX LP64
-# if BOOST_ARCH_PTX >= BOOST_VERSION_NUMBER(3, 5, 0) // NOTE(review): PTX version gate inside HIP code - confirm intended
-                    return atomicOr(
-                        reinterpret_cast<unsigned long long int *>(addr),
-                        static_cast<unsigned long long int>(value));
-# else
-                    alpaka::ignore_unused(addr);
-                    alpaka::ignore_unused(value);
-                    static_assert(
-                        meta::DependentFalseType<THierarchy>::value, // NOTE(review): presumably always false but THierarchy-dependent, so it fires only on instantiation - confirm
-                        "atomicOp<op::Or, atomic::AtomicHipBuiltIn, unsigned long int> is only supported on sm >= 3.5");
-# endif
-#endif
-                }
-            };
-            //-----------------------------------------------------------------------------
-            //! AtomicOp specialization: op::Or on unsigned long long int for AtomicHipBuiltIn; forwards to atomicOr when BOOST_ARCH_PTX >= 3.5.0.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::Or,
-                atomic::AtomicHipBuiltIn,
-                unsigned long long int,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                __device__ static auto atomicOp(
-                    atomic::AtomicHipBuiltIn const &,
-                    unsigned long long int * const addr,
-                    unsigned long long int const & value)
-                -> unsigned long long int
-                {
-#if BOOST_ARCH_PTX >= BOOST_VERSION_NUMBER(3, 5, 0) // NOTE(review): PTX version gate inside HIP code - confirm intended
-                    return atomicOr(addr, value);
-#else // condition above not met: no built-in call, the static_assert below is compiled instead
-                    alpaka::ignore_unused(addr);
-                    alpaka::ignore_unused(value);
-                    static_assert(
-                        meta::DependentFalseType<THierarchy>::value, // NOTE(review): presumably always false but THierarchy-dependent, so it fires only on instantiation - confirm
-                        "atomicOp<op::Or, atomic::AtomicHipBuiltIn, unsigned long long int> is only supported on sm >= 3.5");
-#endif
-                }
-            };
-            //-----------------------------------------------------------------------------
-            // Xor.
-
-            //-----------------------------------------------------------------------------
-            //! AtomicOp specialization: op::Xor on int for AtomicHipBuiltIn; forwards to atomicXor.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::Xor,
-                atomic::AtomicHipBuiltIn,
-                int,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                __device__ static auto atomicOp(
-                    atomic::AtomicHipBuiltIn const &, // unnamed, hence unused in the body
-                    int * const addr,
-                    int const & value)
-                -> int
-                {
-                    return atomicXor(addr, value); // the built-in's result is returned unchanged
-                }
-            };
-            //-----------------------------------------------------------------------------
-            //! AtomicOp specialization: op::Xor on unsigned int for AtomicHipBuiltIn; forwards to atomicXor.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::Xor,
-                atomic::AtomicHipBuiltIn,
-                unsigned int,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                __device__ static auto atomicOp(
-                    atomic::AtomicHipBuiltIn const &, // unnamed, hence unused in the body
-                    unsigned int * const addr,
-                    unsigned int const & value)
-                -> unsigned int
-                {
-                    return atomicXor(addr, value); // the built-in's result is returned unchanged
-                }
-            };
-            //-----------------------------------------------------------------------------
-            //! AtomicOp specialization: op::Xor on unsigned long int for AtomicHipBuiltIn.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::Xor,
-                atomic::AtomicHipBuiltIn,
-                unsigned long int,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                //! Calls atomicXor through an unsigned int cast when UINT_MAX == ULONG_MAX, else through an unsigned long long int cast if BOOST_ARCH_PTX >= 3.5.0; otherwise the static_assert is compiled.
-                __device__ static auto atomicOp(
-                    atomic::AtomicHipBuiltIn const &,
-                    unsigned long int * const addr,
-                    unsigned long int const & value)
-                -> unsigned long int
-                {
-#if UINT_MAX == ULONG_MAX // LLP64
-                    return atomicXor(
-                        reinterpret_cast<unsigned int *>(addr),
-                        static_cast<unsigned int>(value));
-#else // ULONG_MAX == ULLONG_MAX LP64
-# if BOOST_ARCH_PTX >= BOOST_VERSION_NUMBER(3, 5, 0) // NOTE(review): PTX version gate inside HIP code - confirm intended
-                    return atomicXor(
-                        reinterpret_cast<unsigned long long int *>(addr),
-                        static_cast<unsigned long long int>(value));
-# else
-                    alpaka::ignore_unused(addr);
-                    alpaka::ignore_unused(value);
-                    static_assert(
-                        meta::DependentFalseType<THierarchy>::value, // NOTE(review): presumably always false but THierarchy-dependent, so it fires only on instantiation - confirm
-                        "atomicOp<op::Xor, atomic::AtomicHipBuiltIn, unsigned long int> is only supported on sm >= 3.5");
-# endif
-#endif
-                }
-            };
-            //-----------------------------------------------------------------------------
-            //! The GPU HIP accelerator atomic operation.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::Xor,
-                atomic::AtomicHipBuiltIn,
-                unsigned long long int,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                __device__ static auto atomicOp(
-                    atomic::AtomicHipBuiltIn const &,
-                    unsigned long long int * const addr,
-                    unsigned long long int const & value)
-                -> unsigned long long int
-                {
-#if BOOST_ARCH_PTX >= BOOST_VERSION_NUMBER(3, 5, 0)
-                    return atomicXor(addr, value);
-#else
-                    alpaka::ignore_unused(addr);
-                    alpaka::ignore_unused(value);
-                    static_assert(
-                        meta::DependentFalseType<THierarchy>::value,
-                        "atomicOp<op::Xor, atomic::AtomicHipBuiltIn, unsigned long long int> is only supported on sm >= 3.5");
-#endif
-                }
-            };
-
-            //-----------------------------------------------------------------------------
-            // Cas.
-
-            //-----------------------------------------------------------------------------
-            //! The GPU HIP accelerator atomic operation.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::Cas,
-                atomic::AtomicHipBuiltIn,
-                int,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                __device__ static auto atomicOp(
-                    atomic::AtomicHipBuiltIn const &,
-                    int * const addr,
-                    int const & compare,
-                    int const & value)
-                -> int
-                {
-                    return atomicCAS(addr, compare, value);
-                }
-            };
-            //-----------------------------------------------------------------------------
-            //! The GPU HIP accelerator atomic operation.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::Cas,
-                atomic::AtomicHipBuiltIn,
-                unsigned int,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                __device__ static auto atomicOp(
-                    atomic::AtomicHipBuiltIn const &,
-                    unsigned int * const addr,
-                    unsigned int const & compare,
-                    unsigned int const & value)
-                -> unsigned int
-                {
-                    return atomicCAS(addr, compare, value);
-                }
-            };
-            //-----------------------------------------------------------------------------
-            //! The GPU HIP accelerator atomic operation.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::Cas,
-                atomic::AtomicHipBuiltIn,
-                unsigned long int,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                //
-                __device__ static auto atomicOp(
-                    atomic::AtomicHipBuiltIn const &,
-                    unsigned long int * const addr,
-                    unsigned long int const & compare,
-                    unsigned long int const & value)
-                -> unsigned long int
-                {
-#if UINT_MAX == ULONG_MAX // LLP64
-                    return atomicCAS(
-                        reinterpret_cast<unsigned int *>(addr),
-                        static_cast<unsigned int>(compare),
-                        static_cast<unsigned int>(value));
-#else // ULONG_MAX == ULLONG_MAX LP64
-                    return atomicCAS(
-                        reinterpret_cast<unsigned long long int *>(addr),
-                        static_cast<unsigned long long int>(compare),
-                        static_cast<unsigned long long int>(value));
-#endif
-                }
-            };
-
-            //-----------------------------------------------------------------------------
-            //! The GPU HIP accelerator atomic operation.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::Cas,
-                atomic::AtomicHipBuiltIn,
-                unsigned long long int,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                __device__ static auto atomicOp(
-                    atomic::AtomicHipBuiltIn const &,
-                    unsigned long long int * const addr,
-                    unsigned long long int const & compare,
-                    unsigned long long int const & value)
-                -> unsigned long long int
-                {
-                    return atomicCAS(addr, compare, value);
-                }
-            };
-
-            //#############################################################################
-            //! The GPU HIP accelerator atomic operation.
-            template<
-                typename TOp,
-                typename T,
-                typename THierarchy>
-            struct AtomicOp<
-                TOp,
-                atomic::AtomicHipBuiltIn,
-                T,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                __device__ static auto atomicOp(
-                    atomic::AtomicHipBuiltIn const & atomic,
-                    T * const addr,
-                    T const & value)
-                -> T
-                {
-                    alpaka::ignore_unused(atomic);
-                    alpaka::ignore_unused(addr);
-                    alpaka::ignore_unused(value);
-                    static_assert(
-                        meta::DependentFalseType<THierarchy>::value,
-                        "atomicOp<TOp, atomic::AtomicHipBuiltIn, T>(atomic, addr, value) is not supported!");
-
-                    return T();
-                }
-                //-----------------------------------------------------------------------------
-                __device__ static auto atomicOp(
-                    atomic::AtomicHipBuiltIn const & atomic,
-                    T * const addr,
-                    T const & compare,
-                    T const & value)
-                -> T
-                {
-                    alpaka::ignore_unused(atomic);
-                    alpaka::ignore_unused(addr);
-                    alpaka::ignore_unused(compare);
-                    alpaka::ignore_unused(value);
-                    static_assert(
-                        meta::DependentFalseType<THierarchy>::value,
-                        "atomicOp<TOp, atomic::AtomicHipBuiltIn, T>(atomic, addr, compare, value) is not supported!");
-
-                    return T();
-                }
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/alpaka/include/alpaka/atomic/AtomicNoOp.hpp b/thirdParty/alpaka/include/alpaka/atomic/AtomicNoOp.hpp
deleted file mode 100644
index 997b98229b..0000000000
--- a/thirdParty/alpaka/include/alpaka/atomic/AtomicNoOp.hpp
+++ /dev/null
@@ -1,77 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Matthias Werner, René Widera
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#include <alpaka/atomic/Traits.hpp>
-
-#include <alpaka/core/Unused.hpp>
-
-namespace alpaka
-{
-    namespace atomic
-    {
-        //#############################################################################
-        //! The CPU fibers accelerator atomic ops.
-        class AtomicNoOp
-        {
-        public:
-            //-----------------------------------------------------------------------------
-            AtomicNoOp() = default;
-            //-----------------------------------------------------------------------------
-            AtomicNoOp(AtomicNoOp const &) = delete;
-            //-----------------------------------------------------------------------------
-            AtomicNoOp(AtomicNoOp &&) = delete;
-            //-----------------------------------------------------------------------------
-            auto operator=(AtomicNoOp const &) -> AtomicNoOp & = delete;
-            //-----------------------------------------------------------------------------
-            auto operator=(AtomicNoOp &&) -> AtomicNoOp & = delete;
-            //-----------------------------------------------------------------------------
-            /*virtual*/ ~AtomicNoOp() = default;
-        };
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The CPU fibers accelerator atomic operation.
-            template<
-                typename TOp,
-                typename T,
-                typename THierarchy>
-            struct AtomicOp<
-                TOp,
-                atomic::AtomicNoOp,
-                T,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto atomicOp(
-                    atomic::AtomicNoOp const & atomic,
-                    T * const addr,
-                    T const & value)
-                -> T
-                {
-                    alpaka::ignore_unused(atomic);
-                    return TOp()(addr, value);
-                }
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto atomicOp(
-                    atomic::AtomicNoOp const & atomic,
-                    T * const addr,
-                    T const & compare,
-                    T const & value)
-                -> T
-                {
-                    alpaka::ignore_unused(atomic);
-                    return TOp()(addr, compare, value);
-                }
-            };
-        }
-    }
-}
diff --git a/thirdParty/alpaka/include/alpaka/atomic/AtomicOmpBuiltIn.hpp b/thirdParty/alpaka/include/alpaka/atomic/AtomicOmpBuiltIn.hpp
deleted file mode 100644
index 02cba6acfd..0000000000
--- a/thirdParty/alpaka/include/alpaka/atomic/AtomicOmpBuiltIn.hpp
+++ /dev/null
@@ -1,282 +0,0 @@
-/* Copyright 2019 René Widera
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef _OPENMP
-
-#include <alpaka/atomic/Traits.hpp>
-#include <alpaka/atomic/Op.hpp>
-
-namespace alpaka
-{
-    namespace atomic
-    {
-        //#############################################################################
-        //! The OpenMP accelerators atomic ops.
-        //
-        //  Atomics can be used in the blocks and threads hierarchy levels.
-        //  Atomics are not guaranteed to be safe between devices or grids.
-        class AtomicOmpBuiltIn
-        {
-        public:
-            //-----------------------------------------------------------------------------
-            AtomicOmpBuiltIn() = default;
-            //-----------------------------------------------------------------------------
-            ALPAKA_FN_HOST AtomicOmpBuiltIn(AtomicOmpBuiltIn const &) = delete;
-            //-----------------------------------------------------------------------------
-            ALPAKA_FN_HOST AtomicOmpBuiltIn(AtomicOmpBuiltIn &&) = delete;
-            //-----------------------------------------------------------------------------
-            ALPAKA_FN_HOST auto operator=(AtomicOmpBuiltIn const &) -> AtomicOmpBuiltIn & = delete;
-            //-----------------------------------------------------------------------------
-            ALPAKA_FN_HOST auto operator=(AtomicOmpBuiltIn &&) -> AtomicOmpBuiltIn & = delete;
-            //-----------------------------------------------------------------------------
-            /*virtual*/ ~AtomicOmpBuiltIn() = default;
-        };
-
-        namespace traits
-        {
-
-// check for OpenMP 3.1+
-// "omp atomic capture" is not supported before OpenMP 3.1
-#if _OPENMP >= 201107
-
-            //#############################################################################
-            //! The OpenMP accelerators atomic operation: ADD
-            template<
-                typename T,
-                typename THierarchy>
-            struct AtomicOp<
-                op::Add,
-                atomic::AtomicOmpBuiltIn,
-                T,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto atomicOp(
-                    atomic::AtomicOmpBuiltIn const &,
-                    T * const addr,
-                    T const & value)
-                -> T
-                {
-                    T old;
-                    auto & ref(*addr);
-                    // atomically update ref, but capture the original value in old
-                    #pragma omp atomic capture
-                    {
-                        old = ref;
-                        ref += value;
-                    }
-                    return old;
-                }
-            };
-
-            //#############################################################################
-            //! The OpenMP accelerators atomic operation: SUB
-            template<
-                typename T,
-                typename THierarchy>
-            struct AtomicOp<
-                op::Sub,
-                atomic::AtomicOmpBuiltIn,
-                T,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto atomicOp(
-                    atomic::AtomicOmpBuiltIn const &,
-                    T * const addr,
-                    T const & value)
-                -> T
-                {
-                    T old;
-                    auto & ref(*addr);
-                    // atomically update ref, but capture the original value in old
-                    #pragma omp atomic capture
-                    {
-                        old = ref;
-                        ref -= value;
-                    }
-                    return old;
-                }
-            };
-
-            //#############################################################################
-            //! The OpenMP accelerators atomic operation: EXCH
-            template<
-                typename T,
-                typename THierarchy>
-            struct AtomicOp<
-                op::Exch,
-                atomic::AtomicOmpBuiltIn,
-                T,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto atomicOp(
-                    atomic::AtomicOmpBuiltIn const &,
-                    T * const addr,
-                    T const & value)
-                -> T
-                {
-                    T old;
-                    auto & ref(*addr);
-                    // atomically update ref, but capture the original value in old
-                    #pragma omp atomic capture
-                    {
-                        old = ref;
-                        ref = value;
-                    }
-                    return old;
-                }
-            };
-
-            //#############################################################################
-            //! The OpenMP accelerators atomic operation: AND
-            template<
-                typename T,
-                typename THierarchy>
-            struct AtomicOp<
-                op::And,
-                atomic::AtomicOmpBuiltIn,
-                T,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto atomicOp(
-                    atomic::AtomicOmpBuiltIn const &,
-                    T * const addr,
-                    T const & value)
-                -> T
-                {
-                    T old;
-                    auto & ref(*addr);
-                    // atomically update ref, but capture the original value in old
-                    #pragma omp atomic capture
-                    {
-                        old = ref;
-                        ref &= value;
-                    }
-                    return old;
-                }
-            };
-
-            //#############################################################################
-            //! The OpenMP accelerators atomic operation: OR
-            template<
-                typename T,
-                typename THierarchy>
-            struct AtomicOp<
-                op::Or,
-                atomic::AtomicOmpBuiltIn,
-                T,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto atomicOp(
-                    atomic::AtomicOmpBuiltIn const &,
-                    T * const addr,
-                    T const & value)
-                -> T
-                {
-                    T old;
-                    auto & ref(*addr);
-                    // atomically update ref, but capture the original value in old
-                    #pragma omp atomic capture
-                    {
-                        old = ref;
-                        ref |= value;
-                    }
-                    return old;
-                }
-            };
-
-            //#############################################################################
-            //! The OpenMP accelerators atomic operation: XOR
-            template<
-                typename T,
-                typename THierarchy>
-            struct AtomicOp<
-                op::Xor,
-                atomic::AtomicOmpBuiltIn,
-                T,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto atomicOp(
-                    atomic::AtomicOmpBuiltIn const &,
-                    T * const addr,
-                    T const & value)
-                -> T
-                {
-                    T old;
-                    auto & ref(*addr);
-                    // atomically update ref, but capture the original value in old
-                    #pragma omp atomic capture
-                    {
-                        old = ref;
-                        ref ^= value;
-                    }
-                    return old;
-                }
-            };
-
-#endif // _OPENMP >= 201107
-
-            //#############################################################################
-            //! The OpenMP accelerators atomic operation
-            //
-            // generic implementations for operations where native atomics are not available
-            template<
-                typename TOp,
-                typename T,
-                typename THierarchy>
-            struct AtomicOp<
-                TOp,
-                atomic::AtomicOmpBuiltIn,
-                T,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto atomicOp(
-                    atomic::AtomicOmpBuiltIn const &,
-                    T * const addr,
-                    T const & value)
-                -> T
-                {
-                    T old;
-                    // \TODO: Currently not only the access to the same memory location is protected by a mutex but all atomic ops on all threads.
-                    #pragma omp critical (AlpakaOmpAtomicOp)
-                    {
-                        old = TOp()(addr, value);
-                    }
-                    return old;
-                }
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto atomicOp(
-                    atomic::AtomicOmpBuiltIn const &,
-                    T * const addr,
-                    T const & compare,
-                    T const & value)
-                -> T
-                {
-                    T old;
-                    // \TODO: Currently not only the access to the same memory location is protected by a mutex but all atomic ops on all threads.
-                    #pragma omp critical (AlpakaOmpAtomicOp2)
-                    {
-                        old = TOp()(addr, compare, value);
-                    }
-                    return old;
-                }
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/alpaka/include/alpaka/atomic/AtomicStdLibLock.hpp b/thirdParty/alpaka/include/alpaka/atomic/AtomicStdLibLock.hpp
deleted file mode 100644
index 816149f1b0..0000000000
--- a/thirdParty/alpaka/include/alpaka/atomic/AtomicStdLibLock.hpp
+++ /dev/null
@@ -1,140 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz, Matthias Werner, René Widera
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#include <alpaka/atomic/Traits.hpp>
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#include <mutex>
-#include <array>
-
-namespace alpaka
-{
-    namespace atomic
-    {
-        //#############################################################################
-        //! The CPU threads accelerator atomic ops.
-        //
-        //  Atomics can be used in the grids, blocks and threads hierarchy levels.
-        //  Atomics are not guaranteed to be save between devices.
-        //
-        // \tparam THashTableSize size of the hash table to allow concurrency between
-        //                        atomics to different addresses
-        template<size_t THashTableSize>
-        class AtomicStdLibLock
-        {
-        public:
-            template<
-                typename TAtomic,
-                typename TOp,
-                typename T,
-                typename THierarchy,
-                typename TSfinae>
-            friend struct atomic::traits::AtomicOp;
-
-            static constexpr size_t nextPowerOf2(size_t const value, size_t const bit = 0u)
-            {
-                return value <= (static_cast<size_t>(1u) << bit) ?
-                    (static_cast<size_t>(1u) << bit) : nextPowerOf2(value, bit + 1u);
-            }
-
-            //-----------------------------------------------------------------------------
-            //! get a hash value of the pointer
-            //
-            // This is no perfect hash, there will be collisions if the size of pointer type
-            // is not a power of two.
-            template<typename TPtr>
-            static size_t hash(TPtr const * const ptr)
-            {
-                size_t const ptrAddr = reinterpret_cast< size_t >( ptr );
-                // using power of two for the next division will increase the performance
-                constexpr size_t typeSizePowerOf2 = nextPowerOf2(sizeof(TPtr));
-                // division removes the stride between indices
-                return (ptrAddr / typeSizePowerOf2);
-            }
-
-            //-----------------------------------------------------------------------------
-            AtomicStdLibLock() = default;
-            //-----------------------------------------------------------------------------
-            AtomicStdLibLock(AtomicStdLibLock const &) = delete;
-            //-----------------------------------------------------------------------------
-            AtomicStdLibLock(AtomicStdLibLock &&) = delete;
-            //-----------------------------------------------------------------------------
-            auto operator=(AtomicStdLibLock const &) -> AtomicStdLibLock & = delete;
-            //-----------------------------------------------------------------------------
-            auto operator=(AtomicStdLibLock &&) -> AtomicStdLibLock & = delete;
-            //-----------------------------------------------------------------------------
-            /*virtual*/ ~AtomicStdLibLock() = default;
-
-            template<typename TPtr>
-            std::mutex & getMutex(TPtr const * const ptr) const
-            {
-                //-----------------------------------------------------------------------------
-                //! get the size of the hash table
-                //
-                // The size is at least 1 or THashTableSize rounded up to the next power of 2
-                constexpr size_t hashTableSize = THashTableSize == 0u ? 1u : nextPowerOf2(THashTableSize);
-
-                size_t const hashedAddr = hash(ptr) & (hashTableSize - 1u);
-#if BOOST_COMP_CLANG
-    #pragma clang diagnostic push
-    #pragma clang diagnostic ignored "-Wexit-time-destructors"
-#endif
-                static std::array<
-                    std::mutex,
-                    hashTableSize> m_mtxAtomic; //!< The mutex protecting access for an atomic operation.
-#if BOOST_COMP_CLANG
-    #pragma clang diagnostic pop
-#endif
-                return m_mtxAtomic[hashedAddr];
-            }
-        };
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The CPU threads accelerator atomic operation.
-            template<
-                typename TOp,
-                typename T,
-                typename THierarchy,
-                size_t THashTableSize>
-            struct AtomicOp<
-                TOp,
-                atomic::AtomicStdLibLock<THashTableSize>,
-                T,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto atomicOp(
-                    atomic::AtomicStdLibLock<THashTableSize> const & atomic,
-                    T * const addr,
-                    T const & value)
-                -> T
-                {
-                    std::lock_guard<std::mutex> lock(atomic.getMutex(addr));
-                    return TOp()(addr, value);
-                }
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto atomicOp(
-                    atomic::AtomicStdLibLock<THashTableSize> const & atomic,
-                    T * const addr,
-                    T const & compare,
-                    T const & value)
-                -> T
-                {
-                    std::lock_guard<std::mutex> lock(atomic.getMutex(addr));
-                    return TOp()(addr, compare, value);
-                }
-            };
-        }
-    }
-}
diff --git a/thirdParty/alpaka/include/alpaka/atomic/Op.hpp b/thirdParty/alpaka/include/alpaka/atomic/Op.hpp
deleted file mode 100644
index b377cc55b8..0000000000
--- a/thirdParty/alpaka/include/alpaka/atomic/Op.hpp
+++ /dev/null
@@ -1,261 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#include <alpaka/vec/Vec.hpp>
-
-#include <algorithm>
-
-namespace alpaka
-{
-    namespace atomic
-    {
-        //-----------------------------------------------------------------------------
-        //! Defines operation functors.
-        namespace op
-        {
-            //#############################################################################
-            //! The addition function object.
-            struct Add
-            {
-                //-----------------------------------------------------------------------------
-                //! \return The old value of addr.
-                ALPAKA_NO_HOST_ACC_WARNING
-                template<
-                    typename T>
-                ALPAKA_FN_HOST_ACC auto operator()(
-                    T * const addr,
-                    T const & value) const
-                -> T
-                {
-                    auto const old(*addr);
-                    auto & ref(*addr);
-                    ref += value;
-                    return old;
-                }
-            };
-            //#############################################################################
-            //! The subtraction function object.
-            struct Sub
-            {
-                //-----------------------------------------------------------------------------
-                //! \return The old value of addr.
-                ALPAKA_NO_HOST_ACC_WARNING
-                template<
-                    typename T>
-                ALPAKA_FN_HOST_ACC auto operator()(
-                    T * const addr,
-                    T const & value) const
-                -> T
-                {
-                    auto const old(*addr);
-                    auto & ref(*addr);
-                    ref -= value;
-                    return old;
-                }
-            };
-            //#############################################################################
-            //! The minimum function object.
-            struct Min
-            {
-                //-----------------------------------------------------------------------------
-                //! \return The old value of addr.
-                ALPAKA_NO_HOST_ACC_WARNING
-                template<
-                    typename T>
-                ALPAKA_FN_HOST_ACC auto operator()(
-                    T * const addr,
-                    T const & value) const
-                -> T
-                {
-                    auto const old(*addr);
-                    auto & ref(*addr);
-                    ref = std::min(ref, value);
-                    return old;
-                }
-            };
-            //#############################################################################
-            //! The maximum function object.
-            struct Max
-            {
-                //-----------------------------------------------------------------------------
-                //! \return The old value of addr.
-                ALPAKA_NO_HOST_ACC_WARNING
-                template<
-                    typename T>
-                ALPAKA_FN_HOST_ACC auto operator()(
-                    T * const addr,
-                    T const & value) const
-                -> T
-                {
-                    auto const old(*addr);
-                    auto & ref(*addr);
-                    ref = std::max(ref, value);
-                    return old;
-                }
-            };
-            //#############################################################################
-            //! The exchange function object.
-            struct Exch
-            {
-                //-----------------------------------------------------------------------------
-                //! \return The old value of addr.
-                ALPAKA_NO_HOST_ACC_WARNING
-                template<
-                    typename T>
-                ALPAKA_FN_HOST_ACC auto operator()(
-                    T * const addr,
-                    T const & value) const
-                -> T
-                {
-                    auto const old(*addr);
-                    auto & ref(*addr);
-                    ref = value;
-                    return old;
-                }
-            };
-            //#############################################################################
-            //! The increment function object.
-            struct Inc
-            {
-                //-----------------------------------------------------------------------------
-                //! Increments up to value, then reset to 0.
-                //!
-                //! \return The old value of addr.
-                ALPAKA_NO_HOST_ACC_WARNING
-                template<
-                    typename T>
-                ALPAKA_FN_HOST_ACC auto operator()(
-                    T * const addr,
-                    T const & value) const
-                -> T
-                {
-                    auto const old(*addr);
-                    auto & ref(*addr);
-                    ref = ((old >= value) ? 0 : old + 1);
-                    return old;
-                }
-            };
-            //#############################################################################
-            //! The decrement function object.
-            struct Dec
-            {
-                //-----------------------------------------------------------------------------
-                //! Decrement down to 0, then reset to value.
-                //!
-                //! \return The old value of addr.
-                ALPAKA_NO_HOST_ACC_WARNING
-                template<
-                    typename T>
-                ALPAKA_FN_HOST_ACC auto operator()(
-                    T * const addr,
-                    T const & value) const
-                -> T
-                {
-                    auto const old(*addr);
-                    auto & ref(*addr);
-                    ref = (((old == 0) || (old > value)) ? value : (old - 1));
-                    return old;
-                }
-            };
-            //#############################################################################
-            //! The and function object.
-            struct And
-            {
-                //-----------------------------------------------------------------------------
-                //! \return The old value of addr.
-                ALPAKA_NO_HOST_ACC_WARNING
-                template<
-                    typename T>
-                ALPAKA_FN_HOST_ACC auto operator()(
-                    T * const addr,
-                    T const & value) const
-                -> T
-                {
-                    auto const old(*addr);
-                    auto & ref(*addr);
-                    ref &= value;
-                    return old;
-                }
-            };
-            //#############################################################################
-            //! The or function object.
-            struct Or
-            {
-                //-----------------------------------------------------------------------------
-                //! \return The old value of addr.
-                ALPAKA_NO_HOST_ACC_WARNING
-                template<
-                    typename T>
-                ALPAKA_FN_HOST_ACC auto operator()(
-                    T * const addr,
-                    T const & value) const
-                -> T
-                {
-                    auto const old(*addr);
-                    auto & ref(*addr);
-                    ref |= value;
-                    return old;
-                }
-            };
-            //#############################################################################
-            //! The exclusive or function object.
-            struct Xor
-            {
-                //-----------------------------------------------------------------------------
-                //! \return The old value of addr.
-                ALPAKA_NO_HOST_ACC_WARNING
-                template<
-                    typename T>
-                ALPAKA_FN_HOST_ACC auto operator()(
-                    T * const addr,
-                    T const & value) const
-                -> T
-                {
-                    auto const old(*addr);
-                    auto & ref(*addr);
-                    ref ^= value;
-                    return old;
-                }
-            };
-            //#############################################################################
-            //! The compare and swap function object.
-            struct Cas
-            {
-                //-----------------------------------------------------------------------------
-                //! \return The old value of addr.
-                ALPAKA_NO_HOST_ACC_WARNING
-                template<
-                    typename T>
-                ALPAKA_FN_HOST_ACC auto operator()(
-                    T * addr,
-                    T const & compare,
-                    T const & value) const
-                -> T
-                {
-                    auto const old(*addr);
-                    auto & ref(*addr);
-
-// gcc-7.4.0 assumes for an optimization that a signed overflow does not occur here.
-// That's fine, so ignore that warning.
-#if BOOST_COMP_GNUC && (BOOST_COMP_GNUC == BOOST_VERSION_NUMBER(7, 4, 0))
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wstrict-overflow"
-#endif
-                    ref = ((old == compare) ? value : old);
-#if BOOST_COMP_GNUC && (BOOST_COMP_GNUC == BOOST_VERSION_NUMBER(7, 4, 0))
-#pragma GCC diagnostic pop
-#endif
-                    return old;
-                }
-            };
-        }
-    }
-}
diff --git a/thirdParty/alpaka/include/alpaka/atomic/Traits.hpp b/thirdParty/alpaka/include/alpaka/atomic/Traits.hpp
deleted file mode 100644
index ef5a198eab..0000000000
--- a/thirdParty/alpaka/include/alpaka/atomic/Traits.hpp
+++ /dev/null
@@ -1,150 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz, René Widera
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#include <alpaka/core/Positioning.hpp>
-#include <alpaka/core/Common.hpp>
-#include <alpaka/core/Concepts.hpp>
-
-#include <type_traits>
-
-namespace alpaka
-{
-    //-----------------------------------------------------------------------------
-    //! The atomic operation traits specifics.
-    namespace atomic
-    {
-        struct ConceptAtomicGrids;
-        struct ConceptAtomicBlocks;
-        struct ConceptAtomicThreads;
-
-        namespace detail
-        {
-            template<
-                typename THierarchy
-            >
-            struct AtomicHierarchyConceptType;
-
-            template<>
-            struct AtomicHierarchyConceptType<
-                hierarchy::Threads>
-            {
-                using type = ConceptAtomicThreads;
-            };
-
-            template<>
-            struct AtomicHierarchyConceptType<
-                hierarchy::Blocks>
-            {
-                using type = ConceptAtomicBlocks;
-            };
-
-            template<>
-            struct AtomicHierarchyConceptType<
-                hierarchy::Grids>
-            {
-                using type = ConceptAtomicGrids;
-            };
-        }
-
-        template<
-            typename THierarchy
-        >
-        using AtomicHierarchyConcept = typename detail::AtomicHierarchyConceptType<THierarchy>::type;
-
-        //-----------------------------------------------------------------------------
-        //! The atomic operation traits.
-        namespace traits
-        {
-            //#############################################################################
-            //! The atomic operation trait.
-            template<
-                typename TOp,
-                typename TAtomic,
-                typename T,
-                typename THierarchy,
-                typename TSfinae = void>
-            struct AtomicOp;
-        }
-
-        //-----------------------------------------------------------------------------
-        //! Executes the given operation atomically.
-        //!
-        //! \tparam TOp The operation type.
-        //! \tparam T The value type.
-        //! \tparam TAtomic The atomic implementation type.
-        //! \param addr The value to change atomically.
-        //! \param value The value used in the atomic operation.
-        //! \param atomic The atomic implementation.
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            typename TOp,
-            typename TAtomic,
-            typename T,
-            typename THierarchy = hierarchy::Grids>
-        ALPAKA_FN_HOST_ACC auto atomicOp(
-            TAtomic const & atomic,
-            T * const addr,
-            T const & value,
-            THierarchy const & = THierarchy())
-        -> T
-        {
-            using ImplementationBase = typename concepts::ImplementationBase<AtomicHierarchyConcept<THierarchy>, TAtomic>;
-            return
-                traits::AtomicOp<
-                    TOp,
-                    ImplementationBase,
-                    T,
-                    THierarchy>
-                ::atomicOp(
-                    atomic,
-                    addr,
-                    value);
-        }
-
-        //-----------------------------------------------------------------------------
-        //! Executes the given operation atomically.
-        //!
-        //! \tparam TOp The operation type.
-        //! \tparam TAtomic The atomic implementation type.
-        //! \tparam T The value type.
-        //! \param atomic The atomic implementation.
-        //! \param addr The value to change atomically.
-        //! \param compare The comparison value used in the atomic operation.
-        //! \param value The value used in the atomic operation.
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            typename TOp,
-            typename TAtomic,
-            typename T,
-            typename THierarchy = hierarchy::Grids>
-        ALPAKA_FN_HOST_ACC auto atomicOp(
-            TAtomic const & atomic,
-            T * const addr,
-            T const & compare,
-            T const & value,
-            THierarchy const & = THierarchy())
-        -> T
-        {
-            using ImplementationBase = typename concepts::ImplementationBase<AtomicHierarchyConcept<THierarchy>, TAtomic>;
-            return
-                traits::AtomicOp<
-                    TOp,
-                    ImplementationBase,
-                    T,
-                    THierarchy>
-                ::atomicOp(
-                    atomic,
-                    addr,
-                    compare,
-                    value);
-        }
-    }
-}
diff --git a/thirdParty/alpaka/include/alpaka/block/shared/dyn/BlockSharedMemDynBoostAlignedAlloc.hpp b/thirdParty/alpaka/include/alpaka/block/shared/dyn/BlockSharedMemDynBoostAlignedAlloc.hpp
deleted file mode 100644
index 9206f1752c..0000000000
--- a/thirdParty/alpaka/include/alpaka/block/shared/dyn/BlockSharedMemDynBoostAlignedAlloc.hpp
+++ /dev/null
@@ -1,96 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz, Matthias Werner, René Widera
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#include <alpaka/core/Vectorize.hpp>
-#include <alpaka/block/shared/dyn/Traits.hpp>
-
-#include <alpaka/core/Common.hpp>
-
-#include <boost/align.hpp>
-
-#include <vector>
-#include <memory>
-
-namespace alpaka
-{
-    namespace block
-    {
-        namespace shared
-        {
-            namespace dyn
-            {
-                //#############################################################################
-                //! The block shared dynamic memory allocator without synchronization.
-                class BlockSharedMemDynBoostAlignedAlloc : public concepts::Implements<ConceptBlockSharedDyn, BlockSharedMemDynBoostAlignedAlloc>
-                {
-                public:
-                    //-----------------------------------------------------------------------------
-                    BlockSharedMemDynBoostAlignedAlloc(
-                        std::size_t const & blockSharedMemDynSizeBytes)
-                    {
-                        if(blockSharedMemDynSizeBytes > 0u)
-                        {
-                            m_blockSharedMemDyn.reset(
-                                reinterpret_cast<uint8_t *>(
-                                    boost::alignment::aligned_alloc(core::vectorization::defaultAlignment, blockSharedMemDynSizeBytes)));
-                        }
-                    }
-                    //-----------------------------------------------------------------------------
-                    BlockSharedMemDynBoostAlignedAlloc(BlockSharedMemDynBoostAlignedAlloc const &) = delete;
-                    //-----------------------------------------------------------------------------
-                    BlockSharedMemDynBoostAlignedAlloc(BlockSharedMemDynBoostAlignedAlloc &&) = delete;
-                    //-----------------------------------------------------------------------------
-                    auto operator=(BlockSharedMemDynBoostAlignedAlloc const &) -> BlockSharedMemDynBoostAlignedAlloc & = delete;
-                    //-----------------------------------------------------------------------------
-                    auto operator=(BlockSharedMemDynBoostAlignedAlloc &&) -> BlockSharedMemDynBoostAlignedAlloc & = delete;
-                    //-----------------------------------------------------------------------------
-                    /*virtual*/ ~BlockSharedMemDynBoostAlignedAlloc() = default;
-
-                public:
-                    std::unique_ptr<
-                        uint8_t,
-                        boost::alignment::aligned_delete> mutable
-                            m_blockSharedMemDyn;  //!< Block shared dynamic memory.
-                };
-
-                namespace traits
-                {
-#if BOOST_COMP_GNUC
-    #pragma GCC diagnostic push
-    #pragma GCC diagnostic ignored "-Wcast-align" // "cast from 'unsigned char*' to 'unsigned int*' increases required alignment of target type"
-#endif
-                    //#############################################################################
-                    template<
-                        typename T>
-                    struct GetMem<
-                        T,
-                        BlockSharedMemDynBoostAlignedAlloc>
-                    {
-                        //-----------------------------------------------------------------------------
-                        ALPAKA_FN_HOST static auto getMem(
-                            block::shared::dyn::BlockSharedMemDynBoostAlignedAlloc const & blockSharedMemDyn)
-                        -> T *
-                        {
-                            static_assert(
-                                core::vectorization::defaultAlignment >= alignof(T),
-                                "Unable to get block shared dynamic memory for types with alignment higher than defaultAlignment!");
-
-                            return reinterpret_cast<T*>(blockSharedMemDyn.m_blockSharedMemDyn.get());
-                        }
-                    };
-#if BOOST_COMP_GNUC
-    #pragma GCC diagnostic pop
-#endif
-                }
-            }
-        }
-    }
-}
diff --git a/thirdParty/alpaka/include/alpaka/block/shared/dyn/BlockSharedMemDynCudaBuiltIn.hpp b/thirdParty/alpaka/include/alpaka/block/shared/dyn/BlockSharedMemDynCudaBuiltIn.hpp
deleted file mode 100644
index 2900fc27dd..0000000000
--- a/thirdParty/alpaka/include/alpaka/block/shared/dyn/BlockSharedMemDynCudaBuiltIn.hpp
+++ /dev/null
@@ -1,79 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz, René Widera
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_CUDA
-    #error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
-#endif
-
-#include <alpaka/block/shared/dyn/Traits.hpp>
-
-#include <type_traits>
-
-namespace alpaka
-{
-    namespace block
-    {
-        namespace shared
-        {
-            namespace dyn
-            {
-                //#############################################################################
-                //! The GPU CUDA block shared memory allocator.
-                class BlockSharedMemDynCudaBuiltIn : public concepts::Implements<ConceptBlockSharedDyn, BlockSharedMemDynCudaBuiltIn>
-                {
-                public:
-                    //-----------------------------------------------------------------------------
-                    BlockSharedMemDynCudaBuiltIn() = default;
-                    //-----------------------------------------------------------------------------
-                    __device__ BlockSharedMemDynCudaBuiltIn(BlockSharedMemDynCudaBuiltIn const &) = delete;
-                    //-----------------------------------------------------------------------------
-                    __device__ BlockSharedMemDynCudaBuiltIn(BlockSharedMemDynCudaBuiltIn &&) = delete;
-                    //-----------------------------------------------------------------------------
-                    __device__ auto operator=(BlockSharedMemDynCudaBuiltIn const &) -> BlockSharedMemDynCudaBuiltIn & = delete;
-                    //-----------------------------------------------------------------------------
-                    __device__ auto operator=(BlockSharedMemDynCudaBuiltIn &&) -> BlockSharedMemDynCudaBuiltIn & = delete;
-                    //-----------------------------------------------------------------------------
-                    /*virtual*/ ~BlockSharedMemDynCudaBuiltIn() = default;
-                };
-
-                namespace traits
-                {
-                    //#############################################################################
-                    template<
-                        typename T>
-                    struct GetMem<
-                        T,
-                        BlockSharedMemDynCudaBuiltIn>
-                    {
-                        //-----------------------------------------------------------------------------
-                        __device__ static auto getMem(
-                            block::shared::dyn::BlockSharedMemDynCudaBuiltIn const &)
-                        -> T *
-                        {
-                            // Because unaligned access to variables is not allowed in device code,
-                            // we have to use the widest possible type to have all types aligned correctly.
-                            // See: http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared
-                            // http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#vector-types
-                            extern __shared__ float4 shMem[];
-                            return reinterpret_cast<T *>(shMem);
-                        }
-                    };
-                }
-            }
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/alpaka/include/alpaka/block/shared/dyn/BlockSharedMemDynHipBuiltIn.hpp b/thirdParty/alpaka/include/alpaka/block/shared/dyn/BlockSharedMemDynHipBuiltIn.hpp
deleted file mode 100644
index 5d63bf7bf6..0000000000
--- a/thirdParty/alpaka/include/alpaka/block/shared/dyn/BlockSharedMemDynHipBuiltIn.hpp
+++ /dev/null
@@ -1,85 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz, Matthias Werner, René Widera
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_HIP
-    #error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
-#endif
-
-#include <alpaka/block/shared/dyn/Traits.hpp>
-
-#include <type_traits>
-
-namespace alpaka
-{
-    namespace block
-    {
-        namespace shared
-        {
-            namespace dyn
-            {
-                //#############################################################################
-                //! The GPU HIP block shared memory allocator.
-                class BlockSharedMemDynHipBuiltIn : public concepts::Implements<ConceptBlockSharedDyn, BlockSharedMemDynHipBuiltIn>
-                {
-                public:
-                    //-----------------------------------------------------------------------------
-                    //! Default constructor.
-                    ALPAKA_FN_HOST_ACC BlockSharedMemDynHipBuiltIn() = default;
-                    //-----------------------------------------------------------------------------
-                    //! Copy constructor.
-                    __device__ BlockSharedMemDynHipBuiltIn(BlockSharedMemDynHipBuiltIn const &) = delete;
-                    //-----------------------------------------------------------------------------
-                    //! Move constructor.
-                    __device__ BlockSharedMemDynHipBuiltIn(BlockSharedMemDynHipBuiltIn &&) = delete;
-                    //-----------------------------------------------------------------------------
-                    //! Copy assignment operator.
-                    __device__ auto operator=(BlockSharedMemDynHipBuiltIn const &) -> BlockSharedMemDynHipBuiltIn & = delete;
-                    //-----------------------------------------------------------------------------
-                    //! Move assignment operator.
-                    __device__ auto operator=(BlockSharedMemDynHipBuiltIn &&) -> BlockSharedMemDynHipBuiltIn & = delete;
-                    //-----------------------------------------------------------------------------
-                    //! Destructor.
-                    /*virtual*/ ALPAKA_FN_HOST_ACC ~BlockSharedMemDynHipBuiltIn() = default;
-                };
-
-                namespace traits
-                {
-                    //#############################################################################
-                    //!
-                    template<
-                        typename T>
-                    struct GetMem<
-                        T,
-                        BlockSharedMemDynHipBuiltIn>
-                    {
-                        //-----------------------------------------------------------------------------
-
-                        __device__ static auto getMem(
-                            block::shared::dyn::BlockSharedMemDynHipBuiltIn const &)
-                        -> T *
-                        {
-                            // Because unaligned access to variables is not allowed in device code,
-                            // we have to use the widest possible type to have all types aligned correctly.
-                            extern __shared__ float4 shMem[];
-                            return reinterpret_cast<T *>(shMem);
-                        }
-                    };
-                }
-            }
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/alpaka/include/alpaka/block/shared/dyn/Traits.hpp b/thirdParty/alpaka/include/alpaka/block/shared/dyn/Traits.hpp
deleted file mode 100644
index 9dff622d08..0000000000
--- a/thirdParty/alpaka/include/alpaka/block/shared/dyn/Traits.hpp
+++ /dev/null
@@ -1,71 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#include <alpaka/core/Common.hpp>
-#include <alpaka/core/Concepts.hpp>
-
-#include <type_traits>
-
-namespace alpaka
-{
-    //-----------------------------------------------------------------------------
-    //! The grid block specifics
-    namespace block
-    {
-        //-----------------------------------------------------------------------------
-        //! The block shared memory operation specifics.
-        namespace shared
-        {
-            //-----------------------------------------------------------------------------
-            //! The block shared dynamic memory operation specifics.
-            namespace dyn
-            {
-                struct ConceptBlockSharedDyn;
-
-                //-----------------------------------------------------------------------------
-                //! The block shared dynamic memory operation traits.
-                namespace traits
-                {
-                    //#############################################################################
-                    //! The block shared dynamic memory get trait.
-                    template<
-                        typename T,
-                        typename TBlockSharedMemDyn,
-                        typename TSfinae = void>
-                    struct GetMem;
-                }
-
-                //-----------------------------------------------------------------------------
-                //! Returns the pointr to the block shared dynamic memory.
-                //!
-                //! \tparam T The element type.
-                //! \tparam TBlockSharedMemDyn The block shared dynamic memory implementation type.
-                //! \param blockSharedMemDyn The block shared dynamic memory implementation.
-                ALPAKA_NO_HOST_ACC_WARNING
-                template<
-                    typename T,
-                    typename TBlockSharedMemDyn>
-                ALPAKA_FN_ACC auto getMem(
-                    TBlockSharedMemDyn const & blockSharedMemDyn)
-                -> T *
-                {
-                    using ImplementationBase = concepts::ImplementationBase<ConceptBlockSharedDyn, TBlockSharedMemDyn>;
-                    return
-                        traits::GetMem<
-                            T,
-                            ImplementationBase>
-                        ::getMem(
-                            blockSharedMemDyn);
-                }
-            }
-        }
-    }
-}
diff --git a/thirdParty/alpaka/include/alpaka/block/shared/st/BlockSharedMemStCudaBuiltIn.hpp b/thirdParty/alpaka/include/alpaka/block/shared/st/BlockSharedMemStCudaBuiltIn.hpp
deleted file mode 100644
index 9bfc0852ee..0000000000
--- a/thirdParty/alpaka/include/alpaka/block/shared/st/BlockSharedMemStCudaBuiltIn.hpp
+++ /dev/null
@@ -1,92 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz, Erik Zenker, René Widera
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_CUDA
-    #error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
-#endif
-
-#include <alpaka/block/shared/st/Traits.hpp>
-
-#include <type_traits>
-#include <cstdint>
-
-namespace alpaka
-{
-    namespace block
-    {
-        namespace shared
-        {
-            namespace st
-            {
-                //#############################################################################
-                //! The GPU CUDA block shared memory allocator.
-                class BlockSharedMemStCudaBuiltIn : public concepts::Implements<ConceptBlockSharedSt, BlockSharedMemStCudaBuiltIn>
-                {
-                public:
-                    //-----------------------------------------------------------------------------
-                    BlockSharedMemStCudaBuiltIn() = default;
-                    //-----------------------------------------------------------------------------
-                    __device__ BlockSharedMemStCudaBuiltIn(BlockSharedMemStCudaBuiltIn const &) = delete;
-                    //-----------------------------------------------------------------------------
-                    __device__ BlockSharedMemStCudaBuiltIn(BlockSharedMemStCudaBuiltIn &&) = delete;
-                    //-----------------------------------------------------------------------------
-                    __device__ auto operator=(BlockSharedMemStCudaBuiltIn const &) -> BlockSharedMemStCudaBuiltIn & = delete;
-                    //-----------------------------------------------------------------------------
-                    __device__ auto operator=(BlockSharedMemStCudaBuiltIn &&) -> BlockSharedMemStCudaBuiltIn & = delete;
-                    //-----------------------------------------------------------------------------
-                    /*virtual*/ ~BlockSharedMemStCudaBuiltIn() = default;
-                };
-
-                namespace traits
-                {
-                    //#############################################################################
-                    template<
-                        typename T,
-                        std::size_t TuniqueId>
-                    struct AllocVar<
-                        T,
-                        TuniqueId,
-                        BlockSharedMemStCudaBuiltIn>
-                    {
-                        //-----------------------------------------------------------------------------
-                        __device__ static auto allocVar(
-                            block::shared::st::BlockSharedMemStCudaBuiltIn const &)
-                        -> T &
-                        {
-                            __shared__ uint8_t shMem alignas(alignof(T)) [sizeof(T)];
-                            return *(
-                                reinterpret_cast<T*>( shMem ));
-                        }
-                    };
-                    //#############################################################################
-                    template<>
-                    struct FreeMem<
-                        BlockSharedMemStCudaBuiltIn>
-                    {
-                        //-----------------------------------------------------------------------------
-                        __device__ static auto freeMem(
-                            block::shared::st::BlockSharedMemStCudaBuiltIn const &)
-                        -> void
-                        {
-                            // Nothing to do. CUDA block shared memory is automatically freed when all threads left the block.
-                        }
-                    };
-                }
-            }
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/alpaka/include/alpaka/block/shared/st/BlockSharedMemStHipBuiltIn.hpp b/thirdParty/alpaka/include/alpaka/block/shared/st/BlockSharedMemStHipBuiltIn.hpp
deleted file mode 100644
index cf05f8c28d..0000000000
--- a/thirdParty/alpaka/include/alpaka/block/shared/st/BlockSharedMemStHipBuiltIn.hpp
+++ /dev/null
@@ -1,102 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz, Erik Zenker, Matthias Werner, René Widera
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_HIP
-    #error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
-#endif
-
-#include <alpaka/block/shared/st/Traits.hpp>
-
-#include <type_traits>
-#include <cstdint>
-
-namespace alpaka
-{
-    namespace block
-    {
-        namespace shared
-        {
-            namespace st
-            {
-                //#############################################################################
-                //! The GPU HIP block shared memory allocator.
-                class BlockSharedMemStHipBuiltIn : public concepts::Implements<ConceptBlockSharedSt, BlockSharedMemStHipBuiltIn>
-                {
-                public:
-                    //-----------------------------------------------------------------------------
-                    //! Default constructor.
-                    ALPAKA_FN_HOST_ACC BlockSharedMemStHipBuiltIn() = default;
-                    //-----------------------------------------------------------------------------
-                    //! Copy constructor.
-                    __device__ BlockSharedMemStHipBuiltIn(BlockSharedMemStHipBuiltIn const &) = delete;
-                    //-----------------------------------------------------------------------------
-                    //! Move constructor.
-                    __device__ BlockSharedMemStHipBuiltIn(BlockSharedMemStHipBuiltIn &&) = delete;
-                    //-----------------------------------------------------------------------------
-                    //! Copy assignment operator.
-                    __device__ auto operator=(BlockSharedMemStHipBuiltIn const &) -> BlockSharedMemStHipBuiltIn & = delete;
-                    //-----------------------------------------------------------------------------
-                    //! Move assignment operator.
-                    __device__ auto operator=(BlockSharedMemStHipBuiltIn &&) -> BlockSharedMemStHipBuiltIn & = delete;
-                    //-----------------------------------------------------------------------------
-                    //! Destructor.
-                    /*virtual*/ ALPAKA_FN_HOST_ACC ~BlockSharedMemStHipBuiltIn() = default;
-                };
-
-                namespace traits
-                {
-                    //#############################################################################
-                    //!
-                    template<
-                        typename T,
-                        std::size_t TuniqueId>
-                    struct AllocVar<
-                        T,
-                        TuniqueId,
-                        BlockSharedMemStHipBuiltIn>
-                    {
-                        //-----------------------------------------------------------------------------
-
-                        __device__ static auto allocVar(
-                            block::shared::st::BlockSharedMemStHipBuiltIn const &)
-                        -> T &
-                        {
-                            __shared__ uint8_t shMem alignas(alignof(T)) [sizeof(T)];
-                            return *(
-                                reinterpret_cast<T*>( shMem ));
-                        }
-                    };
-                    //#############################################################################
-                    //!
-                    template<>
-                    struct FreeMem<
-                        BlockSharedMemStHipBuiltIn>
-                    {
-                        //-----------------------------------------------------------------------------
-
-                        __device__ static auto freeMem(
-                            block::shared::st::BlockSharedMemStHipBuiltIn const &)
-                        -> void
-                        {
-                            // Nothing to do. HIP block shared memory is automatically freed when all threads left the block.
-                        }
-                    };
-                }
-            }
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/alpaka/include/alpaka/block/shared/st/BlockSharedMemStMasterSync.hpp b/thirdParty/alpaka/include/alpaka/block/shared/st/BlockSharedMemStMasterSync.hpp
deleted file mode 100644
index ea52c7ea8c..0000000000
--- a/thirdParty/alpaka/include/alpaka/block/shared/st/BlockSharedMemStMasterSync.hpp
+++ /dev/null
@@ -1,127 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz, Erik Zenker, Matthias Werner, René Widera
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#include <alpaka/core/Vectorize.hpp>
-#include <alpaka/block/shared/st/Traits.hpp>
-
-#include <alpaka/core/Common.hpp>
-
-#include <boost/align.hpp>
-
-#include <vector>
-#include <memory>
-#include <functional>
-
-namespace alpaka
-{
-    namespace block
-    {
-        namespace shared
-        {
-            namespace st
-            {
-                //#############################################################################
-                //! The block shared memory allocator allocating memory with synchronization on the master thread.
-                class BlockSharedMemStMasterSync : public concepts::Implements<ConceptBlockSharedSt, BlockSharedMemStMasterSync>
-                {
-                public:
-                    //-----------------------------------------------------------------------------
-                    BlockSharedMemStMasterSync(
-                        std::function<void()> fnSync,
-                        std::function<bool()> fnIsMasterThread) :
-                            m_syncFn(fnSync),
-                            m_isMasterThreadFn(fnIsMasterThread)
-                    {}
-                    //-----------------------------------------------------------------------------
-                    BlockSharedMemStMasterSync(BlockSharedMemStMasterSync const &) = delete;
-                    //-----------------------------------------------------------------------------
-                    BlockSharedMemStMasterSync(BlockSharedMemStMasterSync &&) = delete;
-                    //-----------------------------------------------------------------------------
-                    auto operator=(BlockSharedMemStMasterSync const &) -> BlockSharedMemStMasterSync & = delete;
-                    //-----------------------------------------------------------------------------
-                    auto operator=(BlockSharedMemStMasterSync &&) -> BlockSharedMemStMasterSync & = delete;
-                    //-----------------------------------------------------------------------------
-                    /*virtual*/ ~BlockSharedMemStMasterSync() = default;
-
-                public:
-                    // TODO: We should add the size of the (current) allocation.
-                    // This would allow to assert that all parallel function calls request to allocate the same size.
-                    std::vector<
-                        std::unique_ptr<
-                            uint8_t,
-                            boost::alignment::aligned_delete>> mutable
-                        m_sharedAllocs;
-
-                    std::function<void()> m_syncFn;
-                    std::function<bool()> m_isMasterThreadFn;
-                };
-
-                namespace traits
-                {
-#if BOOST_COMP_GNUC
-    #pragma GCC diagnostic push
-    #pragma GCC diagnostic ignored "-Wcast-align" // "cast from 'unsigned char*' to 'unsigned int*' increases required alignment of target type"
-#endif
-                    //#############################################################################
-                    template<
-                        typename T,
-                        std::size_t TuniqueId>
-                    struct AllocVar<
-                        T,
-                        TuniqueId,
-                        BlockSharedMemStMasterSync>
-                    {
-                        //-----------------------------------------------------------------------------
-                        ALPAKA_FN_HOST static auto allocVar(
-                            block::shared::st::BlockSharedMemStMasterSync const & blockSharedMemSt)
-                        -> T &
-                        {
-                            // TODO: replace with constexpr std::max in C++14
-                            constexpr std::size_t alignmentInBytes = (core::vectorization::defaultAlignment < alignof(T)) ? alignof(T) : core::vectorization::defaultAlignment;
-
-                            // Assure that all threads have executed the return of the last allocBlockSharedArr function (if there was one before).
-                            blockSharedMemSt.m_syncFn();
-
-                            if(blockSharedMemSt.m_isMasterThreadFn())
-                            {
-                                blockSharedMemSt.m_sharedAllocs.emplace_back(
-                                    reinterpret_cast<uint8_t *>(
-                                        boost::alignment::aligned_alloc(alignmentInBytes, sizeof(T))));
-                            }
-                            blockSharedMemSt.m_syncFn();
-
-                            return
-                                std::ref(
-                                    *reinterpret_cast<T*>(
-                                        blockSharedMemSt.m_sharedAllocs.back().get()));
-                        }
-                    };
-#if BOOST_COMP_GNUC
-    #pragma GCC diagnostic pop
-#endif
-                    //#############################################################################
-                    template<>
-                    struct FreeMem<
-                        BlockSharedMemStMasterSync>
-                    {
-                        //-----------------------------------------------------------------------------
-                        ALPAKA_FN_HOST static auto freeMem(
-                            block::shared::st::BlockSharedMemStMasterSync const & blockSharedMemSt)
-                        -> void
-                        {
-                            blockSharedMemSt.m_sharedAllocs.clear();
-                        }
-                    };
-                }
-            }
-        }
-    }
-}
diff --git a/thirdParty/alpaka/include/alpaka/block/shared/st/BlockSharedMemStNoSync.hpp b/thirdParty/alpaka/include/alpaka/block/shared/st/BlockSharedMemStNoSync.hpp
deleted file mode 100644
index ae414d16f5..0000000000
--- a/thirdParty/alpaka/include/alpaka/block/shared/st/BlockSharedMemStNoSync.hpp
+++ /dev/null
@@ -1,110 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz, Erik Zenker, Matthias Werner, René Widera
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#include <alpaka/core/Vectorize.hpp>
-#include <alpaka/block/shared/st/Traits.hpp>
-
-#include <alpaka/core/Common.hpp>
-
-#include <boost/align.hpp>
-
-#include <vector>
-#include <memory>
-
-namespace alpaka
-{
-    namespace block
-    {
-        namespace shared
-        {
-            namespace st
-            {
-                //#############################################################################
-                //! The block shared memory allocator without synchronization.
-                class BlockSharedMemStNoSync : public concepts::Implements<ConceptBlockSharedSt, BlockSharedMemStNoSync>
-                {
-                public:
-                    //-----------------------------------------------------------------------------
-                    BlockSharedMemStNoSync() = default;
-                    //-----------------------------------------------------------------------------
-                    BlockSharedMemStNoSync(BlockSharedMemStNoSync const &) = delete;
-                    //-----------------------------------------------------------------------------
-                    BlockSharedMemStNoSync(BlockSharedMemStNoSync &&) = delete;
-                    //-----------------------------------------------------------------------------
-                    auto operator=(BlockSharedMemStNoSync const &) -> BlockSharedMemStNoSync & = delete;
-                    //-----------------------------------------------------------------------------
-                    auto operator=(BlockSharedMemStNoSync &&) -> BlockSharedMemStNoSync & = delete;
-                    //-----------------------------------------------------------------------------
-                    /*virtual*/ ~BlockSharedMemStNoSync() = default;
-
-                public:
-                    // TODO: We should add the size of the (current) allocation.
-                    // This would allow to assert that all parallel function calls request to allocate the same size.
-                    std::vector<
-                        std::unique_ptr<
-                            uint8_t,
-                            boost::alignment::aligned_delete>> mutable
-                        m_sharedAllocs;
-                };
-
-                namespace traits
-                {
-#if BOOST_COMP_GNUC
-    #pragma GCC diagnostic push
-    #pragma GCC diagnostic ignored "-Wcast-align" // "cast from 'unsigned char*' to 'unsigned int*' increases required alignment of target type"
-#endif
-                    //#############################################################################
-                    template<
-                        typename T,
-                        std::size_t TuniqueId>
-                    struct AllocVar<
-                        T,
-                        TuniqueId,
-                        BlockSharedMemStNoSync>
-                    {
-                        //-----------------------------------------------------------------------------
-                        ALPAKA_FN_HOST static auto allocVar(
-                            block::shared::st::BlockSharedMemStNoSync const & blockSharedMemSt)
-                        -> T &
-                        {
-                            // TODO: replace with constexpr std::max in C++14
-                            constexpr std::size_t alignmentInBytes = (core::vectorization::defaultAlignment < alignof(T)) ? alignof(T) : core::vectorization::defaultAlignment;
-
-                            blockSharedMemSt.m_sharedAllocs.emplace_back(
-                                reinterpret_cast<uint8_t *>(
-                                    boost::alignment::aligned_alloc(alignmentInBytes, sizeof(T))));
-                            return
-                                std::ref(
-                                    *reinterpret_cast<T*>(
-                                        blockSharedMemSt.m_sharedAllocs.back().get()));
-                        }
-                    };
-#if BOOST_COMP_GNUC
-    #pragma GCC diagnostic pop
-#endif
-                    //#############################################################################
-                    template<>
-                    struct FreeMem<
-                        BlockSharedMemStNoSync>
-                    {
-                        //-----------------------------------------------------------------------------
-                        ALPAKA_FN_HOST static auto freeMem(
-                            block::shared::st::BlockSharedMemStNoSync const & blockSharedMemSt)
-                        -> void
-                        {
-                            blockSharedMemSt.m_sharedAllocs.clear();
-                        }
-                    };
-                }
-            }
-        }
-    }
-}
diff --git a/thirdParty/alpaka/include/alpaka/block/shared/st/Traits.hpp b/thirdParty/alpaka/include/alpaka/block/shared/st/Traits.hpp
deleted file mode 100644
index 9118860a7e..0000000000
--- a/thirdParty/alpaka/include/alpaka/block/shared/st/Traits.hpp
+++ /dev/null
@@ -1,102 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz, Erik Zenker, Matthias Werner, René Widera
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#include <alpaka/core/Common.hpp>
-#include <alpaka/core/Concepts.hpp>
-
-#include <type_traits>
-
-namespace alpaka
-{
-    //-----------------------------------------------------------------------------
-    //! The grid block specifics
-    namespace block
-    {
-        //-----------------------------------------------------------------------------
-        //! The block shared memory operation specifics.
-        namespace shared
-        {
-            //-----------------------------------------------------------------------------
-            //! The block shared static memory operation specifics.
-            namespace st
-            {
-                struct ConceptBlockSharedSt;
-
-                //-----------------------------------------------------------------------------
-                //! The block shared static memory operation traits.
-                namespace traits
-                {
-                    //#############################################################################
-                    //! The block shared static memory variable allocation operation trait.
-                    template<
-                        typename T,
-                        std::size_t TuniqueId,
-                        typename TBlockSharedMemSt,
-                        typename TSfinae = void>
-                    struct AllocVar;
-                    //#############################################################################
-                    //! The block shared static memory free operation trait.
-                    template<
-                        typename TBlockSharedMemSt,
-                        typename TSfinae = void>
-                    struct FreeMem;
-                }
-
-                //-----------------------------------------------------------------------------
-                //! Allocates a variable in block shared static memory.
-                //!
-                //! The allocated variable is uninitialized and not default constructed!
-                //!
-                //! \tparam T The element type.
-                //! \tparam TuniqueId id those is unique inside a kernel
-                //! \tparam TBlockSharedMemSt The block shared allocator implementation type.
-                //! \param blockSharedMemSt The block shared allocator implementation.
-                ALPAKA_NO_HOST_ACC_WARNING
-                template<
-                    typename T,
-                    std::size_t TuniqueId,
-                    typename TBlockSharedMemSt>
-                ALPAKA_FN_ACC auto allocVar(
-                    TBlockSharedMemSt const & blockSharedMemSt)
-                -> T &
-                {
-                    using ImplementationBase = concepts::ImplementationBase<ConceptBlockSharedSt, TBlockSharedMemSt>;
-                    return
-                        traits::AllocVar<
-                            T,
-                            TuniqueId,
-                            ImplementationBase>
-                        ::allocVar(
-                            blockSharedMemSt);
-                }
-
-                //-----------------------------------------------------------------------------
-                //! Frees all block shared static memory.
-                //!
-                //! \tparam TBlockSharedMemSt The block shared allocator implementation type.
-                //! \param blockSharedMemSt The block shared allocator implementation.
-                ALPAKA_NO_HOST_ACC_WARNING
-                template<
-                    typename TBlockSharedMemSt>
-                ALPAKA_FN_ACC auto freeMem(
-                    TBlockSharedMemSt & blockSharedMemSt)
-                -> void
-                {
-                    using ImplementationBase = concepts::ImplementationBase<ConceptBlockSharedSt, TBlockSharedMemSt>;
-                    traits::FreeMem<
-                        ImplementationBase>
-                    ::freeMem(
-                        blockSharedMemSt);
-                }
-            }
-        }
-    }
-}
diff --git a/thirdParty/alpaka/include/alpaka/block/sync/BlockSyncBarrierFiber.hpp b/thirdParty/alpaka/include/alpaka/block/sync/BlockSyncBarrierFiber.hpp
deleted file mode 100644
index 0d8e2d6b2f..0000000000
--- a/thirdParty/alpaka/include/alpaka/block/sync/BlockSyncBarrierFiber.hpp
+++ /dev/null
@@ -1,126 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLED
-
-#include <alpaka/block/sync/Traits.hpp>
-
-#include <alpaka/core/Fibers.hpp>
-
-#include <alpaka/core/Common.hpp>
-
-#include <mutex>
-#include <map>
-
-namespace alpaka
-{
-    namespace block
-    {
-        namespace sync
-        {
-            //#############################################################################
-            //! The thread id map barrier block synchronization.
-            template<
-                typename TIdx>
-            class BlockSyncBarrierFiber : public concepts::Implements<ConceptBlockSync, BlockSyncBarrierFiber<TIdx>>
-            {
-            public:
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST BlockSyncBarrierFiber(
-                    TIdx const & blockThreadCount) :
-                        m_barrier(static_cast<std::size_t>(blockThreadCount)),
-                        m_threadCount(blockThreadCount),
-                        m_curThreadCount(static_cast<TIdx>(0u)),
-                        m_generation(static_cast<TIdx>(0u))
-                {}
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST BlockSyncBarrierFiber(BlockSyncBarrierFiber const &) = delete;
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST BlockSyncBarrierFiber(BlockSyncBarrierFiber &&) = delete;
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST auto operator=(BlockSyncBarrierFiber const &) -> BlockSyncBarrierFiber & = delete;
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST auto operator=(BlockSyncBarrierFiber &&) -> BlockSyncBarrierFiber & = delete;
-                //-----------------------------------------------------------------------------
-                /*virtual*/ ~BlockSyncBarrierFiber() = default;
-
-                boost::fibers::barrier mutable m_barrier;
-
-                TIdx mutable m_threadCount;
-                TIdx mutable m_curThreadCount;
-                TIdx mutable m_generation;
-                int mutable m_result[2u];
-            };
-
-            namespace traits
-            {
-                //#############################################################################
-                template<
-                    typename TIdx>
-                struct SyncBlockThreads<
-                    BlockSyncBarrierFiber<TIdx>>
-                {
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST static auto syncBlockThreads(
-                        block::sync::BlockSyncBarrierFiber<TIdx> const & blockSync)
-                    -> void
-                    {
-                        blockSync.m_barrier.wait();
-                    }
-                };
-
-                //#############################################################################
-                template<
-                    typename TOp,
-                    typename TIdx>
-                struct SyncBlockThreadsPredicate<
-                    TOp,
-                    BlockSyncBarrierFiber<TIdx>>
-                {
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_NO_HOST_ACC_WARNING
-                    ALPAKA_FN_ACC static auto syncBlockThreadsPredicate(
-                        block::sync::BlockSyncBarrierFiber<TIdx> const & blockSync,
-                        int predicate)
-                    -> int
-                    {
-                        if(blockSync.m_curThreadCount == blockSync.m_threadCount)
-                        {
-                            blockSync.m_curThreadCount = static_cast<TIdx>(0u);
-                            ++blockSync.m_generation;
-                        }
-
-                        auto const generationMod2(blockSync.m_generation % static_cast<TIdx>(2u));
-
-                        // The first fiber will reset the value to the initial value.
-                        if(blockSync.m_curThreadCount == static_cast<TIdx>(0u))
-                        {
-                            blockSync.m_result[generationMod2] = TOp::InitialValue;
-                        }
-
-                        ++blockSync.m_curThreadCount;
-
-                        // We do not have to lock because there is only ever one fiber active per block.
-                        blockSync.m_result[generationMod2] = TOp()(blockSync.m_result[generationMod2], predicate);
-
-                        // After all block threads have combined their values ...
-                        blockSync.m_barrier.wait();
-
-                        // ... the result can be returned.
-                        return blockSync.m_result[generationMod2];
-                    }
-                };
-            }
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/alpaka/include/alpaka/block/sync/BlockSyncBarrierOmp.hpp b/thirdParty/alpaka/include/alpaka/block/sync/BlockSyncBarrierOmp.hpp
deleted file mode 100644
index 1676b4b51d..0000000000
--- a/thirdParty/alpaka/include/alpaka/block/sync/BlockSyncBarrierOmp.hpp
+++ /dev/null
@@ -1,154 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef _OPENMP
-
-#include <alpaka/block/sync/Traits.hpp>
-
-#include <alpaka/core/Common.hpp>
-#include <alpaka/core/Unused.hpp>
-
-namespace alpaka
-{
-    namespace block
-    {
-        namespace sync
-        {
-            //#############################################################################
-            //! The OpenMP barrier block synchronization.
-            class BlockSyncBarrierOmp : public concepts::Implements<ConceptBlockSync, BlockSyncBarrierOmp>
-            {
-            public:
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST BlockSyncBarrierOmp() :
-                    m_generation(0u)
-                {}
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST BlockSyncBarrierOmp(BlockSyncBarrierOmp const &) = delete;
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST BlockSyncBarrierOmp(BlockSyncBarrierOmp &&) = delete;
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST auto operator=(BlockSyncBarrierOmp const &) -> BlockSyncBarrierOmp & = delete;
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST auto operator=(BlockSyncBarrierOmp &&) -> BlockSyncBarrierOmp & = delete;
-                //-----------------------------------------------------------------------------
-                /*virtual*/ ~BlockSyncBarrierOmp() = default;
-
-                std::uint8_t mutable m_generation;
-                int mutable m_result[2];
-            };
-
-            namespace traits
-            {
-                //#############################################################################
-                template<>
-                struct SyncBlockThreads<
-                    BlockSyncBarrierOmp>
-                {
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST static auto syncBlockThreads(
-                        block::sync::BlockSyncBarrierOmp const & blockSync)
-                    -> void
-                    {
-                        alpaka::ignore_unused(blockSync);
-
-                        // NOTE: This waits for all threads in all blocks.
-                        // If multiple blocks are executed in parallel this is not optimal.
-                        #pragma omp barrier
-                    }
-                };
-
-                namespace detail
-                {
-                    //#############################################################################
-                    template<
-                        typename TOp>
-                    struct AtomicOp;
-                    //#############################################################################
-                    template<>
-                    struct AtomicOp<
-                        block::sync::op::Count>
-                    {
-                        void operator()(int& result, bool value)
-                        {
-                            #pragma omp atomic
-                            result += static_cast<int>(value);
-                        }
-                    };
-                    //#############################################################################
-                    template<>
-                    struct AtomicOp<
-                        block::sync::op::LogicalAnd>
-                    {
-                        void operator()(int& result, bool value)
-                        {
-                            #pragma omp atomic
-                            result &= static_cast<int>(value);
-                        }
-                    };
-                    //#############################################################################
-                    template<>
-                    struct AtomicOp<
-                        block::sync::op::LogicalOr>
-                    {
-                        void operator()(int& result, bool value)
-                        {
-                            #pragma omp atomic
-                            result |= static_cast<int>(value);
-                        }
-                    };
-                }
-
-                //#############################################################################
-                template<
-                    typename TOp>
-                struct SyncBlockThreadsPredicate<
-                    TOp,
-                    BlockSyncBarrierOmp>
-                {
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_NO_HOST_ACC_WARNING
-                    ALPAKA_FN_ACC static auto syncBlockThreadsPredicate(
-                        block::sync::BlockSyncBarrierOmp const & blockSync,
-                        int predicate)
-                    -> int
-                    {
-                        // The first thread initializes the value.
-                        // There is an implicit barrier at the end of omp single.
-                        // NOTE: This code is executed only once for all OpenMP threads.
-                        // If multiple blocks with multiple threads are executed in parallel
-                        // this reduction is executed only for one block!
-                        #pragma omp single
-                        {
-                            ++blockSync.m_generation;
-                            blockSync.m_result[blockSync.m_generation % 2u] = TOp::InitialValue;
-                        }
-
-                        auto const generationMod2(blockSync.m_generation % 2u);
-                        int& result(blockSync.m_result[generationMod2]);
-                        bool const predicateBool(predicate != 0);
-
-                        detail::AtomicOp<TOp>()(result, predicateBool);
-
-                        // Wait for all threads to write their predicate into the vector.
-                        // NOTE: This waits for all threads in all blocks.
-                        // If multiple blocks are executed in parallel this is not optimal.
-                        #pragma omp barrier
-
-                        return blockSync.m_result[generationMod2];
-                    }
-                };
-            }
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/alpaka/include/alpaka/block/sync/BlockSyncBarrierThread.hpp b/thirdParty/alpaka/include/alpaka/block/sync/BlockSyncBarrierThread.hpp
deleted file mode 100644
index 8f8755a1d9..0000000000
--- a/thirdParty/alpaka/include/alpaka/block/sync/BlockSyncBarrierThread.hpp
+++ /dev/null
@@ -1,101 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED
-
-#include <alpaka/block/sync/Traits.hpp>
-
-#include <alpaka/core/BarrierThread.hpp>
-
-#include <alpaka/core/Common.hpp>
-
-#include <thread>
-#include <mutex>
-#include <map>
-
-namespace alpaka
-{
-    namespace block
-    {
-        namespace sync
-        {
-            //#############################################################################
-            //! The thread id map barrier block synchronization.
-            template<
-                typename TIdx>
-            class BlockSyncBarrierThread : public concepts::Implements<ConceptBlockSync, BlockSyncBarrierThread<TIdx>>
-            {
-            public:
-                using Barrier = core::threads::BarrierThread<TIdx>;
-                using BarrierWithPredicate = core::threads::BarrierThreadWithPredicate<TIdx>;
-
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST BlockSyncBarrierThread(
-                    TIdx const & blockThreadCount) :
-                        m_barrier(blockThreadCount),
-                        m_barrierWithPredicate(blockThreadCount)
-                {}
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST BlockSyncBarrierThread(BlockSyncBarrierThread const &) = delete;
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST BlockSyncBarrierThread(BlockSyncBarrierThread &&) = delete;
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST auto operator=(BlockSyncBarrierThread const &) -> BlockSyncBarrierThread & = delete;
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST auto operator=(BlockSyncBarrierThread &&) -> BlockSyncBarrierThread & = delete;
-                //-----------------------------------------------------------------------------
-                /*virtual*/ ~BlockSyncBarrierThread() = default;
-
-                Barrier mutable m_barrier;
-                BarrierWithPredicate mutable m_barrierWithPredicate;
-            };
-
-            namespace traits
-            {
-                //#############################################################################
-                template<
-                    typename TIdx>
-                struct SyncBlockThreads<
-                    BlockSyncBarrierThread<TIdx>>
-                {
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST static auto syncBlockThreads(
-                        block::sync::BlockSyncBarrierThread<TIdx> const & blockSync)
-                    -> void
-                    {
-                        blockSync.m_barrier.wait();
-                    }
-                };
-
-                //#############################################################################
-                template<
-                    typename TOp,
-                    typename TIdx>
-                struct SyncBlockThreadsPredicate<
-                    TOp,
-                    BlockSyncBarrierThread<TIdx>>
-                {
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_NO_HOST_ACC_WARNING
-                    ALPAKA_FN_ACC static auto syncBlockThreadsPredicate(
-                        block::sync::BlockSyncBarrierThread<TIdx> const & blockSync,
-                        int predicate)
-                    -> int
-                    {
-                        return blockSync.m_barrierWithPredicate.template wait<TOp>(predicate);
-                    }
-                };
-            }
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/alpaka/include/alpaka/block/sync/BlockSyncCudaBuiltIn.hpp b/thirdParty/alpaka/include/alpaka/block/sync/BlockSyncCudaBuiltIn.hpp
deleted file mode 100644
index cf2532e37f..0000000000
--- a/thirdParty/alpaka/include/alpaka/block/sync/BlockSyncCudaBuiltIn.hpp
+++ /dev/null
@@ -1,115 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_CUDA
-    #error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
-#endif
-
-#include <alpaka/block/sync/Traits.hpp>
-
-namespace alpaka
-{
-    namespace block
-    {
-        namespace sync
-        {
-            //#############################################################################
-            //! The GPU CUDA block synchronization.
-            class BlockSyncCudaBuiltIn : public concepts::Implements<ConceptBlockSync, BlockSyncCudaBuiltIn>
-            {
-            public:
-                //-----------------------------------------------------------------------------
-                BlockSyncCudaBuiltIn() = default;
-                //-----------------------------------------------------------------------------
-                __device__ BlockSyncCudaBuiltIn(BlockSyncCudaBuiltIn const &) = delete;
-                //-----------------------------------------------------------------------------
-                __device__ BlockSyncCudaBuiltIn(BlockSyncCudaBuiltIn &&) = delete;
-                //-----------------------------------------------------------------------------
-                __device__ auto operator=(BlockSyncCudaBuiltIn const &) -> BlockSyncCudaBuiltIn & = delete;
-                //-----------------------------------------------------------------------------
-                __device__ auto operator=(BlockSyncCudaBuiltIn &&) -> BlockSyncCudaBuiltIn & = delete;
-                //-----------------------------------------------------------------------------
-                /*virtual*/ ~BlockSyncCudaBuiltIn() = default;
-            };
-
-            namespace traits
-            {
-                //#############################################################################
-                template<>
-                struct SyncBlockThreads<
-                    BlockSyncCudaBuiltIn>
-                {
-                    //-----------------------------------------------------------------------------
-                    __device__ static auto syncBlockThreads(
-                        block::sync::BlockSyncCudaBuiltIn const & /*blockSync*/)
-                    -> void
-                    {
-                        __syncthreads();
-                    }
-                };
-
-                //#############################################################################
-                template<>
-                struct SyncBlockThreadsPredicate<
-                    block::sync::op::Count,
-                    BlockSyncCudaBuiltIn>
-                {
-                    //-----------------------------------------------------------------------------
-                    __device__ static auto syncBlockThreadsPredicate(
-                        block::sync::BlockSyncCudaBuiltIn const & /*blockSync*/,
-                        int predicate)
-                    -> int
-                    {
-                        return __syncthreads_count(predicate);
-                    }
-                };
-
-                //#############################################################################
-                template<>
-                struct SyncBlockThreadsPredicate<
-                    block::sync::op::LogicalAnd,
-                    BlockSyncCudaBuiltIn>
-                {
-                    //-----------------------------------------------------------------------------
-                    __device__ static auto syncBlockThreadsPredicate(
-                        block::sync::BlockSyncCudaBuiltIn const & /*blockSync*/,
-                        int predicate)
-                    -> int
-                    {
-                        return __syncthreads_and(predicate);
-                    }
-                };
-
-                //#############################################################################
-                template<>
-                struct SyncBlockThreadsPredicate<
-                    block::sync::op::LogicalOr,
-                    BlockSyncCudaBuiltIn>
-                {
-                    //-----------------------------------------------------------------------------
-                    __device__ static auto syncBlockThreadsPredicate(
-                        block::sync::BlockSyncCudaBuiltIn const & /*blockSync*/,
-                        int predicate)
-                    -> int
-                    {
-                        return __syncthreads_or(predicate);
-                    }
-                };
-            }
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/alpaka/include/alpaka/block/sync/BlockSyncHipBuiltIn.hpp b/thirdParty/alpaka/include/alpaka/block/sync/BlockSyncHipBuiltIn.hpp
deleted file mode 100644
index 6d1736d38a..0000000000
--- a/thirdParty/alpaka/include/alpaka/block/sync/BlockSyncHipBuiltIn.hpp
+++ /dev/null
@@ -1,171 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz, Matthias Werner
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_HIP
-    #error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
-#endif
-
-#include <alpaka/block/sync/Traits.hpp>
-
-namespace alpaka
-{
-    namespace block
-    {
-        namespace sync
-        {
-            //#############################################################################
-            //! The GPU HIP block synchronization.
-            class BlockSyncHipBuiltIn : public concepts::Implements<ConceptBlockSync, BlockSyncHipBuiltIn>
-            {
-            public:
-                //-----------------------------------------------------------------------------
-                //! Default constructor.
-                ALPAKA_FN_HOST_ACC BlockSyncHipBuiltIn() = default;
-                //-----------------------------------------------------------------------------
-                //! Copy constructor.
-                __device__ BlockSyncHipBuiltIn(BlockSyncHipBuiltIn const &) = delete;
-                //-----------------------------------------------------------------------------
-                //! Move constructor.
-                __device__ BlockSyncHipBuiltIn(BlockSyncHipBuiltIn &&) = delete;
-                //-----------------------------------------------------------------------------
-                //! Copy assignment operator.
-                __device__ auto operator=(BlockSyncHipBuiltIn const &) -> BlockSyncHipBuiltIn & = delete;
-                //-----------------------------------------------------------------------------
-                //! Move assignment operator.
-                __device__ auto operator=(BlockSyncHipBuiltIn &&) -> BlockSyncHipBuiltIn & = delete;
-                //-----------------------------------------------------------------------------
-                //! Destructor.
-                /*virtual*/ ALPAKA_FN_HOST_ACC ~BlockSyncHipBuiltIn() = default;
-            };
-
-            namespace traits
-            {
-                //#############################################################################
-                //!
-                template<>
-                struct SyncBlockThreads<
-                    BlockSyncHipBuiltIn>
-                {
-                    //-----------------------------------------------------------------------------
-
-                    __device__ static auto syncBlockThreads(
-                        block::sync::BlockSyncHipBuiltIn const & /*blockSync*/)
-                    -> void
-                    {
-                        __syncthreads();
-                    }
-                };
-
-                //#############################################################################
-                //!
-                template<>
-                struct SyncBlockThreadsPredicate<
-                    block::sync::op::Count,
-                    BlockSyncHipBuiltIn>
-                {
-                    //-----------------------------------------------------------------------------
-
-                    __device__ static auto syncBlockThreadsPredicate(
-                        block::sync::BlockSyncHipBuiltIn const & /*blockSync*/,
-                        int predicate)
-                    -> int
-                    {
-#if defined(__HIP_ARCH_HAS_SYNC_THREAD_EXT__) && __HIP_ARCH_HAS_SYNC_THREAD_EXT__==0 && (BOOST_COMP_HCC || BOOST_COMP_HIP)
-                        // workaround for unsupported syncthreads_* operation on HIP(HCC)
-                        __shared__ int tmp;
-                        __syncthreads();
-                        if(threadIdx.x==0)
-                            tmp=0;
-                        __syncthreads();
-                        if(predicate)
-                            atomicAdd(&tmp, 1);
-                        __syncthreads();
-
-                        return tmp;
-#else
-                        return __syncthreads_count(predicate);
-#endif
-                    }
-                };
-
-                //#############################################################################
-                //!
-                template<>
-                struct SyncBlockThreadsPredicate<
-                    block::sync::op::LogicalAnd,
-                    BlockSyncHipBuiltIn>
-                {
-                    //-----------------------------------------------------------------------------
-
-                    __device__ static auto syncBlockThreadsPredicate(
-                        block::sync::BlockSyncHipBuiltIn const & /*blockSync*/,
-                        int predicate)
-                    -> int
-                    {
-#if defined(__HIP_ARCH_HAS_SYNC_THREAD_EXT__) && __HIP_ARCH_HAS_SYNC_THREAD_EXT__==0 && (BOOST_COMP_HCC || BOOST_COMP_HIP)
-                        // workaround for unsupported syncthreads_* operation on HIP(HCC)
-                        __shared__ int tmp;
-                        __syncthreads();
-                        if(threadIdx.x==0)
-                            tmp=1;
-                        __syncthreads();
-                        if(!predicate)
-                            atomicAnd(&tmp, 0);
-                        __syncthreads();
-
-                        return tmp;
-#else
-                        return __syncthreads_and(predicate);
-#endif
-                    }
-                };
-
-                //#############################################################################
-                //!
-                template<>
-                struct SyncBlockThreadsPredicate<
-                    block::sync::op::LogicalOr,
-                    BlockSyncHipBuiltIn>
-                {
-                    //-----------------------------------------------------------------------------
-
-                    __device__ static auto syncBlockThreadsPredicate(
-                        block::sync::BlockSyncHipBuiltIn const & /*blockSync*/,
-                        int predicate)
-                    -> int
-                    {
-#if defined(__HIP_ARCH_HAS_SYNC_THREAD_EXT__) && __HIP_ARCH_HAS_SYNC_THREAD_EXT__==0 && (BOOST_COMP_HCC || BOOST_COMP_HIP)
-                        // workaround for unsupported syncthreads_* operation on HIP(HCC)
-                        __shared__ int tmp;
-                        __syncthreads();
-                        if(threadIdx.x==0)
-                            tmp=0;
-                        __syncthreads();
-                        if(predicate)
-                            atomicOr(&tmp, 1);
-                        __syncthreads();
-
-                        return tmp;
-#else
-                        return __syncthreads_or(predicate);
-#endif
-                    }
-                };
-            }
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/alpaka/include/alpaka/block/sync/BlockSyncNoOp.hpp b/thirdParty/alpaka/include/alpaka/block/sync/BlockSyncNoOp.hpp
deleted file mode 100644
index d93c9acf90..0000000000
--- a/thirdParty/alpaka/include/alpaka/block/sync/BlockSyncNoOp.hpp
+++ /dev/null
@@ -1,81 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Matthias Werner, René Widera
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#include <alpaka/block/sync/Traits.hpp>
-
-#include <alpaka/core/Common.hpp>
-#include <alpaka/core/Unused.hpp>
-
-namespace alpaka
-{
-    namespace block
-    {
-        namespace sync
-        {
-            //#############################################################################
-            //! The no op block synchronization.
-            class BlockSyncNoOp : public concepts::Implements<ConceptBlockSync, BlockSyncNoOp>
-            {
-            public:
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_ACC BlockSyncNoOp() = default;
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_ACC BlockSyncNoOp(BlockSyncNoOp const &) = delete;
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_ACC BlockSyncNoOp(BlockSyncNoOp &&) = delete;
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_ACC auto operator=(BlockSyncNoOp const &) -> BlockSyncNoOp & = delete;
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_ACC auto operator=(BlockSyncNoOp &&) -> BlockSyncNoOp & = delete;
-                //-----------------------------------------------------------------------------
-                /*virtual*/ ALPAKA_FN_ACC ~BlockSyncNoOp() = default;
-            };
-
-            namespace traits
-            {
-                //#############################################################################
-                template<>
-                struct SyncBlockThreads<
-                    BlockSyncNoOp>
-                {
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_NO_HOST_ACC_WARNING
-                    ALPAKA_FN_ACC static auto syncBlockThreads(
-                        block::sync::BlockSyncNoOp const & blockSync)
-                    -> void
-                    {
-                        alpaka::ignore_unused(blockSync);
-                        // Nothing to do.
-                    }
-                };
-
-                //#############################################################################
-                template<
-                    typename TOp>
-                struct SyncBlockThreadsPredicate<
-                    TOp,
-                    BlockSyncNoOp>
-                {
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_NO_HOST_ACC_WARNING
-                    ALPAKA_FN_ACC static auto syncBlockThreadsPredicate(
-                        block::sync::BlockSyncNoOp const & blockSync,
-                        int predicate)
-                    -> int
-                    {
-                        alpaka::ignore_unused(blockSync);
-                        return predicate;
-                    }
-                };
-            }
-        }
-    }
-}
diff --git a/thirdParty/alpaka/include/alpaka/block/sync/Traits.hpp b/thirdParty/alpaka/include/alpaka/block/sync/Traits.hpp
deleted file mode 100644
index 57e769a24a..0000000000
--- a/thirdParty/alpaka/include/alpaka/block/sync/Traits.hpp
+++ /dev/null
@@ -1,154 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#include <alpaka/core/Common.hpp>
-#include <alpaka/core/Concepts.hpp>
-
-#include <type_traits>
-
-namespace alpaka
-{
-    //-----------------------------------------------------------------------------
-    //! The grid block specifics
-    namespace block
-    {
-        //-----------------------------------------------------------------------------
-        //! The block synchronization specifics.
-        namespace sync
-        {
-            struct ConceptBlockSync;
-
-            //-----------------------------------------------------------------------------
-            //! The block synchronization traits.
-            namespace traits
-            {
-                //#############################################################################
-                //! The block synchronization operation trait.
-                template<
-                    typename TBlockSync,
-                    typename TSfinae = void>
-                struct SyncBlockThreads;
-
-                //#############################################################################
-                //! The block synchronization and predicate operation trait.
-                template<
-                    typename TOp,
-                    typename TBlockSync,
-                    typename TSfinae = void>
-                struct SyncBlockThreadsPredicate;
-            }
-
-            //-----------------------------------------------------------------------------
-            //! Synchronizes all threads within the current block (independently for all blocks).
-            //!
-            //! \tparam TBlockSync The block synchronization implementation type.
-            //! \param blockSync The block synchronization implementation.
-            ALPAKA_NO_HOST_ACC_WARNING
-            template<
-                typename TBlockSync>
-            ALPAKA_FN_ACC auto syncBlockThreads(
-                TBlockSync const & blockSync)
-            -> void
-            {
-                using ImplementationBase = concepts::ImplementationBase<ConceptBlockSync, TBlockSync>;
-                traits::SyncBlockThreads<
-                    ImplementationBase>
-                ::syncBlockThreads(
-                    blockSync);
-            }
-
-            //-----------------------------------------------------------------------------
-            //! Defines operation functors.
-            namespace op
-            {
-                //#############################################################################
-                //! The addition function object.
-                struct Count
-                {
-                    enum { InitialValue = 0u};
-
-                    ALPAKA_NO_HOST_ACC_WARNING
-                    template<
-                        typename T>
-                    ALPAKA_FN_HOST_ACC auto operator()(
-                        T const & currentResult,
-                        T const & value) const
-                    -> T
-                    {
-                        return currentResult + static_cast<T>(value != static_cast<T>(0));
-                    }
-                };
-                //#############################################################################
-                //! The logical and function object.
-                struct LogicalAnd
-                {
-                    enum { InitialValue = 1u};
-
-                    ALPAKA_NO_HOST_ACC_WARNING
-                    template<
-                        typename T>
-                    ALPAKA_FN_HOST_ACC auto operator()(
-                        T const & currentResult,
-                        T const & value) const
-                    -> T
-                    {
-                        return static_cast<T>(currentResult && (value != static_cast<T>(0)));
-                    }
-                };
-                //#############################################################################
-                //! The logical or function object.
-                struct LogicalOr
-                {
-                    enum { InitialValue = 0u};
-
-                    ALPAKA_NO_HOST_ACC_WARNING
-                    template<
-                        typename T>
-                    ALPAKA_FN_HOST_ACC auto operator()(
-                        T const & currentResult,
-                        T const & value) const
-                    -> T
-                    {
-                        return static_cast<T>(currentResult || (value != static_cast<T>(0)));
-                    }
-                };
-            }
-
-            //-----------------------------------------------------------------------------
-            //! Synchronizes all threads within the current block (independently for all blocks),
-            //! evaluates the predicate for all threads and returns the combination of all the results
-            //! computed via TOp.
-            //!
-            //! \tparam TOp The operation used to combine the predicate values of all threads.
-            //! \tparam TBlockSync The block synchronization implementation type.
-            //! \param blockSync The block synchronization implementation.
-            //! \param predicate The predicate value of the current thread.
-            ALPAKA_NO_HOST_ACC_WARNING
-            template<
-                typename TOp,
-                typename TBlockSync>
-            ALPAKA_FN_ACC auto syncBlockThreadsPredicate(
-                TBlockSync const & blockSync,
-                int predicate)
-            -> int
-            {
-                using ImplementationBase = concepts::ImplementationBase<ConceptBlockSync, TBlockSync>;
-                return
-                    traits::SyncBlockThreadsPredicate<
-                        TOp,
-                        ImplementationBase>
-                    ::syncBlockThreadsPredicate(
-                        blockSync,
-                        predicate);
-            }
-        }
-    }
-}
diff --git a/thirdParty/alpaka/include/alpaka/core/Align.hpp b/thirdParty/alpaka/include/alpaka/core/Align.hpp
deleted file mode 100644
index c0b05a31c6..0000000000
--- a/thirdParty/alpaka/include/alpaka/core/Align.hpp
+++ /dev/null
@@ -1,107 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz, René Widera
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#include <cstddef>
-#include <type_traits>
-
-namespace alpaka
-{
-    namespace core
-    {
-        //-----------------------------------------------------------------------------
-        //! Rounds to the next higher power of two (if not already power of two).
-        // Adapted from llvm/ADT/SmallPtrSet.h
-        template<
-            std::size_t N>
-        struct RoundUpToPowerOfTwo;
-
-        //-----------------------------------------------------------------------------
-        //! Defines implementation details that should not be used directly by the user.
-        namespace detail
-        {
-            //-----------------------------------------------------------------------------
-            //! Base case for N being a power of two.
-            template<
-                std::size_t N,
-                bool TisPowerTwo>
-            struct RoundUpToPowerOfTwoHelper :
-                std::integral_constant<
-                    std::size_t,
-                    N>
-            {};
-            //-----------------------------------------------------------------------------
-            //! Case for N not being a power of two.
-            // We could just use NextVal = N+1, but this converges faster.  N|(N-1) sets
-            // the right-most zero bits to one all at once, e.g. 0b0011000 -> 0b0011111.
-            template<
-                std::size_t N>
-            struct RoundUpToPowerOfTwoHelper<
-                N,
-                false> :
-                    std::integral_constant<
-                        std::size_t,
-                        RoundUpToPowerOfTwo<(N | (N - 1)) + 1>::value>
-            {};
-        }
-        //-----------------------------------------------------------------------------
-        template<
-            std::size_t N>
-        struct RoundUpToPowerOfTwo :
-            std::integral_constant<
-                std::size_t,
-                detail::RoundUpToPowerOfTwoHelper<
-                    N,
-                    (N&(N - 1)) == 0>::value>
-        {};
-
-        //-----------------------------------------------------------------------------
-        //! The alignment specifics.
-        namespace align
-        {
-            //-----------------------------------------------------------------------------
-            //! Calculates the optimal alignment for data of the given size.
-            template<
-                std::size_t TsizeBytes>
-            struct OptimalAlignment :
-                std::integral_constant<
-                    std::size_t,
-#if BOOST_COMP_GNUC
-                    // GCC does not support alignments larger then 128: "warning: requested alignment 256 is larger than 128[-Wattributes]".
-                    (TsizeBytes > 64)
-                        ? 128
-                        :
-#endif
-                            (RoundUpToPowerOfTwo<TsizeBytes>::value)>
-            {};
-        }
-    }
-}
-
-// ICC does not support constant expressions as parameters to alignas
-// The optimal alignment for a type is the next higher or equal power of two.
-#if BOOST_COMP_INTEL
-    #define ALPAKA_OPTIMAL_ALIGNMENT_SIZE(...)\
-            ((__VA_ARGS__)==1?1:\
-            ((__VA_ARGS__)<=2?2:\
-            ((__VA_ARGS__)<=4?4:\
-            ((__VA_ARGS__)<=8?8:\
-            ((__VA_ARGS__)<=16?16:\
-            ((__VA_ARGS__)<=32?32:\
-            ((__VA_ARGS__)<=64?64:128\
-            )))))))
-    #define ALPAKA_OPTIMAL_ALIGNMENT(...)\
-            ALPAKA_OPTIMAL_ALIGNMENT_SIZE(sizeof(typename std::remove_cv<__VA_ARGS__>::type))
-#else
-    #define ALPAKA_OPTIMAL_ALIGNMENT(...)\
-            ::alpaka::core::align::OptimalAlignment<sizeof(typename std::remove_cv<__VA_ARGS__>::type)>::value
-#endif
diff --git a/thirdParty/alpaka/include/alpaka/core/Assert.hpp b/thirdParty/alpaka/include/alpaka/core/Assert.hpp
deleted file mode 100644
index b10f9f9156..0000000000
--- a/thirdParty/alpaka/include/alpaka/core/Assert.hpp
+++ /dev/null
@@ -1,181 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Matthias Werner
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#include <alpaka/core/Common.hpp>
-#include <alpaka/core/Unused.hpp>
-
-#include <cassert>
-#include <type_traits>
-
-
-#if !(defined(BOOST_LANG_HIP) && BOOST_LANG_HIP && BOOST_COMP_HCC)
-  #define ALPAKA_ASSERT(EXPRESSION) assert(EXPRESSION)
-#else
-
-  // Including assert.h would interfere with HIP's host-device implementation
-  // see: https://github.com/ROCm-Developer-Tools/HIP/issues/599
-  // However, cassert is still in some header, so we have to do a workaround for HIP.
-  #ifdef NDEBUG
-    #define ALPAKA_ASSERT(EXPRESSION) static_cast<void>(0)
-  #else
-    #define ALPAKA_ASSERT(EXPRESSION) assert_workaround(EXPRESSION)
-
-    #pragma push_macro("__DEVICE__")
-    #define __DEVICE__ extern "C" __device__ __attribute__((always_inline)) \
-            __attribute__((weak))
-
-     __DEVICE__ void __device_trap() __asm("llvm.trap");
-
-     __host__ __device__
-     __attribute__((always_inline))             \
-     __attribute__((weak))
-     void assert_workaround(bool expr) {
-       if(!expr) {
-         printf("assert failed.\n");
-         #if __HIP_DEVICE_COMPILE__==1
-           __device_trap();
-         #else
-           exit(1);
-         #endif
-       }
-     }
-  #endif //NDEBUG
-#endif
-
-namespace alpaka
-{
-    namespace core
-    {
-        namespace detail
-        {
-            //#############################################################################
-            template<
-                typename TArg,
-                typename TSfinae = void>
-            struct AssertValueUnsigned;
-            //#############################################################################
-            template<
-                typename TArg>
-            struct AssertValueUnsigned<
-                TArg,
-                typename std::enable_if<!std::is_unsigned<TArg>::value>::type>
-            {
-                ALPAKA_NO_HOST_ACC_WARNING
-                ALPAKA_FN_HOST_ACC static auto assertValueUnsigned(
-                    TArg const & arg)
-                -> void
-                {
-#ifdef NDEBUG
-                    alpaka::ignore_unused(arg);
-#else
-                    ALPAKA_ASSERT(arg >= 0);
-#endif
-                }
-            };
-            //#############################################################################
-            template<
-                typename TArg>
-            struct AssertValueUnsigned<
-                TArg,
-                typename std::enable_if<std::is_unsigned<TArg>::value>::type>
-            {
-                ALPAKA_NO_HOST_ACC_WARNING
-                ALPAKA_FN_HOST_ACC static auto assertValueUnsigned(
-                    TArg const & arg)
-                -> void
-                {
-                    alpaka::ignore_unused(arg);
-                    // Nothing to do for unsigned types.
-                }
-            };
-        }
-        //-----------------------------------------------------------------------------
-        //! This method checks integral values if they are greater or equal zero.
-        //! The implementation prevents warnings for checking this for unsigned types.
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            typename TArg>
-        ALPAKA_FN_HOST_ACC auto assertValueUnsigned(
-            TArg const & arg)
-        -> void
-        {
-            detail::AssertValueUnsigned<
-                TArg>
-            ::assertValueUnsigned(
-                arg);
-        }
-
-        namespace detail
-        {
-            //#############################################################################
-            template<
-                typename TLhs,
-                typename TRhs,
-                typename TSfinae = void>
-            struct AssertGreaterThan;
-            //#############################################################################
-            template<
-                typename TLhs,
-                typename TRhs>
-            struct AssertGreaterThan<
-                TLhs,
-                TRhs,
-                typename std::enable_if<!std::is_unsigned<TRhs>::value || (TLhs::value != 0u)>::type>
-            {
-                ALPAKA_NO_HOST_ACC_WARNING
-                ALPAKA_FN_HOST_ACC static auto assertGreaterThan(
-                    TRhs const & lhs)
-                -> void
-                {
-#ifdef NDEBUG
-                    alpaka::ignore_unused(lhs);
-#else
-                    ALPAKA_ASSERT(TLhs::value > lhs);
-#endif
-                }
-            };
-            //#############################################################################
-            template<
-                typename TLhs,
-                typename TRhs>
-            struct AssertGreaterThan<
-                TLhs,
-                TRhs,
-                typename std::enable_if<std::is_unsigned<TRhs>::value && (TLhs::value == 0u)>::type>
-            {
-                ALPAKA_NO_HOST_ACC_WARNING
-                ALPAKA_FN_HOST_ACC static auto assertGreaterThan(
-                    TRhs const & lhs)
-                -> void
-                {
-                    alpaka::ignore_unused(lhs);
-                    // Nothing to do for unsigned types camparing to zero.
-                }
-            };
-        }
-        //-----------------------------------------------------------------------------
-        //! This method asserts that the integral value TArg is less than Tidx.
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            typename TLhs,
-            typename TRhs>
-        ALPAKA_FN_HOST_ACC auto assertGreaterThan(
-            TRhs const & lhs)
-        -> void
-        {
-            detail::AssertGreaterThan<
-                TLhs,
-                TRhs>
-            ::assertGreaterThan(
-                lhs);
-        }
-    }
-}
diff --git a/thirdParty/alpaka/include/alpaka/core/BarrierThread.hpp b/thirdParty/alpaka/include/alpaka/core/BarrierThread.hpp
deleted file mode 100644
index 8853b0970f..0000000000
--- a/thirdParty/alpaka/include/alpaka/core/BarrierThread.hpp
+++ /dev/null
@@ -1,208 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz, Matthias Werner
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-// Uncomment this to disable the standard spinlock behaviour of the threads
-//#define ALPAKA_THREAD_BARRIER_DISABLE_SPINLOCK
-
-#include <alpaka/core/Common.hpp>
-#include <alpaka/block/sync/Traits.hpp>
-
-#include <mutex>
-#include <condition_variable>
-#ifndef ALPAKA_THREAD_BARRIER_DISABLE_SPINLOCK
-    #include <atomic>
-    #include <thread>
-#endif
-
-namespace alpaka
-{
-    namespace core
-    {
-        namespace threads
-        {
-            //#############################################################################
-            //! A self-resetting barrier.
-            template<
-                typename TIdx>
-            class BarrierThread final
-            {
-            public:
-                //-----------------------------------------------------------------------------
-                explicit BarrierThread(
-                    TIdx const & threadCount) :
-                    m_threadCount(threadCount),
-                    m_curThreadCount(threadCount),
-                    m_generation(0)
-                {}
-                //-----------------------------------------------------------------------------
-                BarrierThread(BarrierThread const &) = delete;
-                //-----------------------------------------------------------------------------
-                BarrierThread(BarrierThread &&) = delete;
-                //-----------------------------------------------------------------------------
-                auto operator=(BarrierThread const &) -> BarrierThread & = delete;
-                //-----------------------------------------------------------------------------
-                auto operator=(BarrierThread &&) -> BarrierThread & = delete;
-                //-----------------------------------------------------------------------------
-                ~BarrierThread() = default;
-
-                //-----------------------------------------------------------------------------
-                //! Waits for all the other threads to reach the barrier.
-                auto wait()
-                -> void
-                {
-                    TIdx const generationWhenEnteredTheWait = m_generation;
-#ifdef ALPAKA_THREAD_BARRIER_DISABLE_SPINLOCK
-                    std::unique_lock<std::mutex> lock(m_mtxBarrier);
-#endif
-                    if(--m_curThreadCount == 0)
-                    {
-                        m_curThreadCount = m_threadCount;
-                        ++m_generation;
-#ifdef ALPAKA_THREAD_BARRIER_DISABLE_SPINLOCK
-                        m_cvAllThreadsReachedBarrier.notify_all();
-#endif
-                    }
-                    else
-                    {
-#ifdef ALPAKA_THREAD_BARRIER_DISABLE_SPINLOCK
-                        m_cvAllThreadsReachedBarrier.wait(lock, [this, generationWhenEnteredTheWait] { return generationWhenEnteredTheWait != m_generation; });
-#else
-                        while(generationWhenEnteredTheWait == m_generation)
-                        {
-                            std::this_thread::yield();
-                        }
-#endif
-                    }
-                }
-
-            private:
-#ifdef ALPAKA_THREAD_BARRIER_DISABLE_SPINLOCK
-                std::mutex m_mtxBarrier;
-                std::condition_variable m_cvAllThreadsReachedBarrier;
-#endif
-                const TIdx m_threadCount;
-#ifdef ALPAKA_THREAD_BARRIER_DISABLE_SPINLOCK
-                TIdx m_curThreadCount;
-                TIdx m_generation;
-#else
-                std::atomic<TIdx> m_curThreadCount;
-                std::atomic<TIdx> m_generation;
-#endif
-            };
-
-            namespace detail
-            {
-                //#############################################################################
-                template<
-                    typename TOp>
-                struct AtomicOp;
-                //#############################################################################
-                template<>
-                struct AtomicOp<
-                    block::sync::op::Count>
-                {
-                    void operator()(std::atomic<int>& result, bool value)
-                    {
-                        result += static_cast<int>(value);
-                    }
-                };
-                //#############################################################################
-                template<>
-                struct AtomicOp<
-                    block::sync::op::LogicalAnd>
-                {
-                    void operator()(std::atomic<int>& result, bool value)
-                    {
-                        result &= static_cast<int>(value);
-                    }
-                };
-                //#############################################################################
-                template<>
-                struct AtomicOp<
-                    block::sync::op::LogicalOr>
-                {
-                    void operator()(std::atomic<int>& result, bool value)
-                    {
-                        result |= static_cast<int>(value);
-                    }
-                };
-            }
-
-            //#############################################################################
-            //! A self-resetting barrier with barrier.
-            template<
-                typename TIdx>
-            class BarrierThreadWithPredicate final
-            {
-            public:
-                //-----------------------------------------------------------------------------
-                explicit BarrierThreadWithPredicate(
-                    TIdx const & threadCount) :
-                    m_threadCount(threadCount),
-                    m_curThreadCount(threadCount),
-                    m_generation(0)
-                {}
-                //-----------------------------------------------------------------------------
-                BarrierThreadWithPredicate(BarrierThreadWithPredicate const & other) = delete;
-                //-----------------------------------------------------------------------------
-                BarrierThreadWithPredicate(BarrierThreadWithPredicate &&) = delete;
-                //-----------------------------------------------------------------------------
-                auto operator=(BarrierThreadWithPredicate const &) -> BarrierThreadWithPredicate & = delete;
-                //-----------------------------------------------------------------------------
-                auto operator=(BarrierThreadWithPredicate &&) -> BarrierThreadWithPredicate & = delete;
-                //-----------------------------------------------------------------------------
-                ~BarrierThreadWithPredicate() = default;
-
-                //-----------------------------------------------------------------------------
-                //! Waits for all the other threads to reach the barrier.
-                template<
-                    typename TOp>
-                ALPAKA_FN_HOST auto wait(int predicate)
-                -> int
-                {
-                    TIdx const generationWhenEnteredTheWait = m_generation;
-                    std::unique_lock<std::mutex> lock(m_mtxBarrier);
-
-                    auto const generationMod2(m_generation % static_cast<TIdx>(2u));
-                    if(m_curThreadCount == m_threadCount)
-                    {
-                        m_result[generationMod2] = TOp::InitialValue;
-                    }
-
-                    std::atomic<int>& result(m_result[generationMod2]);
-                    bool const predicateBool(predicate != 0);
-
-                    detail::AtomicOp<TOp>()(result, predicateBool);
-
-                    if(--m_curThreadCount == 0)
-                    {
-                        m_curThreadCount = m_threadCount;
-                        ++m_generation;
-                        m_cvAllThreadsReachedBarrier.notify_all();
-                    }
-                    else
-                    {
-                        m_cvAllThreadsReachedBarrier.wait(lock, [this, generationWhenEnteredTheWait] { return generationWhenEnteredTheWait != m_generation; });
-                    }
-                    return m_result[generationMod2];
-                }
-
-            private:
-                std::mutex m_mtxBarrier;
-                std::condition_variable m_cvAllThreadsReachedBarrier;
-                const TIdx m_threadCount;
-                TIdx m_curThreadCount;
-                TIdx m_generation;
-                std::atomic<int> m_result[2];
-            };
-        }
-    }
-}
diff --git a/thirdParty/alpaka/include/alpaka/core/BoostPredef.hpp b/thirdParty/alpaka/include/alpaka/core/BoostPredef.hpp
deleted file mode 100644
index 14790d6804..0000000000
--- a/thirdParty/alpaka/include/alpaka/core/BoostPredef.hpp
+++ /dev/null
@@ -1,147 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz, Matthias Werner
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#include <boost/predef.h>
-
-//-----------------------------------------------------------------------------
-// In boost since 1.68.0
-// BOOST_PREDEF_MAKE_10_VVRRP(V)
-#if !defined(BOOST_PREDEF_MAKE_10_VVRRP)
-    #define BOOST_PREDEF_MAKE_10_VVRRP(V) BOOST_VERSION_NUMBER(((V)/1000)%100,((V)/10)%100,(V)%10)
-#endif
-
-//---------------------------------------HIP-----------------------------------
-// __HIPCC__ is defined by hipcc (if either __HCC__ or __CUDACC__ is defined)
-#if !defined(BOOST_LANG_HIP)
-  #if defined(__HIPCC__) && ( defined(__CUDACC__) || defined(__HCC__) || defined(__HIP__))
-    #include <hip/hip_runtime.h>
-    //HIP defines "abort()" as "{asm("trap;");}", which breaks some kernels
-    #undef abort
-    // there is no HIP_VERSION macro
-    #define BOOST_LANG_HIP BOOST_VERSION_NUMBER_AVAILABLE
-    #if defined(BOOST_LANG_CUDA) && BOOST_LANG_CUDA
-        #undef BOOST_LANG_CUDA
-        #define BOOST_LANG_CUDA BOOST_VERSION_NUMBER_NOT_AVAILABLE
-    #endif
-  #else
-    #define BOOST_LANG_HIP BOOST_VERSION_NUMBER_NOT_AVAILABLE
-  #endif
-#endif
-
-//-----------------------------------------------------------------------------
-// HSA device architecture detection (HSA generated via HIP(HCC) or HCC directly)
-#if !defined(BOOST_ARCH_HSA)
-    #if defined(__HIP_DEVICE_COMPILE__) && __HIP_DEVICE_COMPILE__==1 && defined(__HCC__) \
-        || (defined(__HCC_ACCELERATOR__) && __HCC_ACCELERATOR__!=0)
-        // __HIP_DEVICE_COMPILE__ does not represent feature capability of target device like CUDA_ARCH.
-        // For feature detection there are special macros, see ROCm's HIP porting guide.
-        #define BOOST_ARCH_HSA BOOST_VERSION_NUMBER_AVAILABLE
-    #else
-        #define BOOST_ARCH_HSA BOOST_VERSION_NUMBER_NOT_AVAILABLE
-    #endif
-#endif
-
-//-----------------------------------------------------------------------------
-// hcc HSA compiler detection
-#if !defined(BOOST_COMP_HCC)
-    #if defined(__HCC__)
-        #define BOOST_COMP_HCC BOOST_VERSION_NUMBER_AVAILABLE
-    #else
-        #define BOOST_COMP_HCC BOOST_VERSION_NUMBER_NOT_AVAILABLE
-    #endif
-#endif
-
-//-----------------------------------------------------------------------------
-// hip compiler detection
-#if !defined(BOOST_COMP_HIP)
-    #if defined(__HIP__)
-        #define BOOST_COMP_HIP BOOST_VERSION_NUMBER_AVAILABLE
-    #else
-        #define BOOST_COMP_HIP BOOST_VERSION_NUMBER_NOT_AVAILABLE
-    #endif
-#endif
-
-//-----------------------------------------------------------------------------
-// In boost since 1.68.0
-// CUDA language detection
-// - clang defines __CUDA__ and __CUDACC__ when compiling CUDA code ('-x cuda')
-// - nvcc defines __CUDACC__ when compiling CUDA code
-#if !defined(BOOST_LANG_CUDA)
-    #if defined(__CUDA__) || defined(__CUDACC__)
-        #include <cuda.h>
-        #define BOOST_LANG_CUDA BOOST_PREDEF_MAKE_10_VVRRP(CUDA_VERSION)
-    #else
-        #define BOOST_LANG_CUDA BOOST_VERSION_NUMBER_NOT_AVAILABLE
-    #endif
-#endif
-
-//-----------------------------------------------------------------------------
-// In boost since 1.68.0
-// CUDA device architecture detection
-#if !defined(BOOST_ARCH_PTX)
-    #if defined(__CUDA_ARCH__)
-        #define BOOST_ARCH_PTX BOOST_PREDEF_MAKE_10_VRP(__CUDA_ARCH__)
-    #else
-        #define BOOST_ARCH_PTX BOOST_VERSION_NUMBER_NOT_AVAILABLE
-    #endif
-#endif
-
-//-----------------------------------------------------------------------------
-// In boost since 1.68.0
-// nvcc CUDA compiler detection
-
-#include <boost/version.hpp>
-#if BOOST_VERSION >= 106800
-    // BOOST_COMP_NVCC_EMULATED is defined by boost instead of BOOST_COMP_NVCC
-    #if defined(BOOST_COMP_NVCC) && defined(BOOST_COMP_NVCC_EMULATED)
-        #undef BOOST_COMP_NVCC
-        #define BOOST_COMP_NVCC BOOST_COMP_NVCC_EMULATED
-    #endif
-#endif
-
-#if !defined(BOOST_COMP_NVCC)
-    #if defined(__NVCC__)
-        // The __CUDACC_VER_MAJOR__, __CUDACC_VER_MINOR__ and __CUDACC_VER_BUILD__
-        // have been added with nvcc 7.5 and have not been available before.
-        #if !defined(__CUDACC_VER_MAJOR__) || !defined(__CUDACC_VER_MINOR__) || !defined(__CUDACC_VER_BUILD__)
-            #define BOOST_COMP_NVCC BOOST_VERSION_NUMBER_AVAILABLE
-        #else
-            #define BOOST_COMP_NVCC BOOST_VERSION_NUMBER(__CUDACC_VER_MAJOR__, __CUDACC_VER_MINOR__, __CUDACC_VER_BUILD__)
-        #endif
-    #else
-        #define BOOST_COMP_NVCC BOOST_VERSION_NUMBER_NOT_AVAILABLE
-    #endif
-#endif
-
-//-----------------------------------------------------------------------------
-// In boost since 1.64.0
-// Work around for broken intel detection
-#if BOOST_COMP_INTEL == 0
-    #if defined(__INTEL_COMPILER)
-        #ifdef BOOST_COMP_INTEL_DETECTION
-            #undef BOOST_COMP_INTEL_DETECTION
-        #endif
-        #define BOOST_COMP_INTEL_DETECTION BOOST_PREDEF_MAKE_10_VVRR(__INTEL_COMPILER)
-        #if defined(BOOST_COMP_INTEL)
-            #undef BOOST_COMP_INTEL
-        #endif
-        #define BOOST_COMP_INTEL BOOST_COMP_INTEL_DETECTION
-    #endif
-#endif
-
-//-----------------------------------------------------------------------------
-// clang CUDA compiler detection
-// Currently __CUDA__ is only defined by clang when compiling CUDA code.
-#if defined(__clang__) && defined(__CUDA__)
-    #define BOOST_COMP_CLANG_CUDA BOOST_COMP_CLANG
-#else
-    #define BOOST_COMP_CLANG_CUDA BOOST_VERSION_NUMBER_NOT_AVAILABLE
-#endif
diff --git a/thirdParty/alpaka/include/alpaka/core/ClipCast.hpp b/thirdParty/alpaka/include/alpaka/core/ClipCast.hpp
deleted file mode 100644
index 1202c5c397..0000000000
--- a/thirdParty/alpaka/include/alpaka/core/ClipCast.hpp
+++ /dev/null
@@ -1,38 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#include <alpaka/meta/Integral.hpp>
-
-#include <algorithm>
-#include <limits>
-
-namespace alpaka
-{
-    namespace core
-    {
-        //-----------------------------------------------------------------------------
-        //! \return The input casted and clipped to T.
-        template<
-            typename T,
-            typename V>
-        auto clipCast(
-            V const & val)
-        -> T
-        {
-            static_assert(std::is_integral<T>::value && std::is_integral<V>::value, "clipCast can not be called with non-integral types!");
-
-            auto constexpr max = static_cast<V>(std::numeric_limits<alpaka::meta::LowerMax<T, V>>::max());
-            auto constexpr min = static_cast<V>(std::numeric_limits<alpaka::meta::HigherMin<T, V>>::min());
-
-            return static_cast<T>(std::max(min, std::min(max, val)));
-        }
-    }
-}
diff --git a/thirdParty/alpaka/include/alpaka/core/Common.hpp b/thirdParty/alpaka/include/alpaka/core/Common.hpp
deleted file mode 100644
index e71d5296f5..0000000000
--- a/thirdParty/alpaka/include/alpaka/core/Common.hpp
+++ /dev/null
@@ -1,128 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Matthias Werner
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#include <alpaka/core/BoostPredef.hpp>
-#include <alpaka/core/Debug.hpp>
-
-// Boost.Uuid errors with VS2017 when intrin.h is not included
-#if defined(_MSC_VER) && _MSC_VER >= 1910
-    #include <intrin.h>
-#endif
-
-//-----------------------------------------------------------------------------
-// Boost does not yet correctly identify clang when compiling CUDA code.
-// After explicitly including <boost/config.hpp> we can safely undefine some of the wrong settings.
-#if BOOST_COMP_CLANG_CUDA
-    #include <boost/config.hpp>
-    #undef BOOST_NO_CXX11_VARIADIC_TEMPLATES
-#endif
-
-//-----------------------------------------------------------------------------
-// Boost disables variadic templates for nvcc (in some cases because it was buggy).
-// However, we rely on it being enabled.
-// After explicitly including <boost/config.hpp> we can safely undefine the wrong setting.
-#if BOOST_COMP_NVCC
-    #include <boost/config.hpp>
-    #undef BOOST_NO_CXX11_VARIADIC_TEMPLATES
-#endif
-
-//-----------------------------------------------------------------------------
-//! All functions that can be used on an accelerator have to be attributed with ALPAKA_FN_ACC or ALPAKA_FN_HOST_ACC.
-//!
-//! Usage:
-//! ALPAKA_FN_ACC
-//! auto add(std::int32_t a, std::int32_t b)
-//! -> std::int32_t;
-#if BOOST_LANG_CUDA || BOOST_LANG_HIP
-    #if defined(ALPAKA_ACC_GPU_CUDA_ONLY_MODE) || defined(ALPAKA_ACC_GPU_HIP_ONLY_MODE)
-        #define ALPAKA_FN_ACC __device__
-    #else
-        #define ALPAKA_FN_ACC __device__ __host__
-    #endif
-    #define ALPAKA_FN_HOST_ACC __device__ __host__
-    #define ALPAKA_FN_HOST __host__
-#else
-    #define ALPAKA_FN_ACC
-    #define ALPAKA_FN_HOST_ACC
-    #define ALPAKA_FN_HOST
-#endif
-
-//-----------------------------------------------------------------------------
-//! Disable nvcc warning:
-//! 'calling a __host__ function from __host__ __device__ function.'
-//!
-//! Usage:
-//! ALPAKA_NO_HOST_ACC_WARNING
-//! ALPAKA_FN_HOST_ACC function_declaration()
-//!
-//! WARNING: Only use this method if there is no other way.
-//! Most cases can be solved by #if BOOST_ARCH_PTX or #if BOOST_LANG_CUDA.
-#if (BOOST_LANG_CUDA && !BOOST_COMP_CLANG_CUDA) || BOOST_LANG_HIP
-    #if BOOST_COMP_MSVC
-        #define ALPAKA_NO_HOST_ACC_WARNING __pragma(hd_warning_disable)
-    #else
-        #define ALPAKA_NO_HOST_ACC_WARNING _Pragma("hd_warning_disable")
-    #endif
-#else
-    #define ALPAKA_NO_HOST_ACC_WARNING
-#endif
-
-//-----------------------------------------------------------------------------
-//! Macro defining the inline function attribute.
-#if BOOST_LANG_CUDA || BOOST_LANG_HIP
-    #define ALPAKA_FN_INLINE __forceinline__
-#else
-    #define ALPAKA_FN_INLINE inline
-#endif
-
-//-----------------------------------------------------------------------------
-//! This macro defines a variable lying in global accelerator device memory.
-//!
-//! Example:
-//!   ALPAKA_STATIC_ACC_MEM_GLOBAL int i;
-//!
-//! Those variables behave like ordinary variables when used in file-scope.
-//! They have external linkage (are accessible from other compilation units).
-//! If you want to access it from a different compilation unit, you have to declare it as extern:
-//!   extern ALPAKA_STATIC_ACC_MEM_GLOBAL int i;
-//! Like ordinary variables, only one definition is allowed (ODR)
-//! Failure to do so might lead to linker errors.
-//!
-//! In contrast to ordinary variables, you can not define such variables
-//! as static compilation unit local variables with internal linkage
-//! because this is forbidden by CUDA.
-#if (BOOST_LANG_CUDA && BOOST_ARCH_PTX) || (BOOST_LANG_HIP && (BOOST_ARCH_HSA || BOOST_ARCH_PTX))
-    #define ALPAKA_STATIC_ACC_MEM_GLOBAL __device__
-#else
-    #define ALPAKA_STATIC_ACC_MEM_GLOBAL
-#endif
-
-//-----------------------------------------------------------------------------
-//! This macro defines a variable lying in constant accelerator device memory.
-//!
-//! Example:
-//!   ALPAKA_STATIC_ACC_MEM_CONSTANT int i;
-//!
-//! Those variables behave like ordinary variables when used in file-scope.
-//! They have external linkage (are accessible from other compilation units).
-//! If you want to access it from a different compilation unit, you have to declare it as extern:
-//!   extern ALPAKA_STATIC_ACC_MEM_CONSTANT int i;
-//! Like ordinary variables, only one definition is allowed (ODR)
-//! Failure to do so might lead to linker errors.
-//!
-//! In contrast to ordinary variables, you can not define such variables
-//! as static compilation unit local variables with internal linkage
-//! because this is forbidden by CUDA.
-#if (BOOST_LANG_CUDA && BOOST_ARCH_PTX) || (BOOST_LANG_HIP && (BOOST_ARCH_HSA || BOOST_ARCH_PTX))
-    #define ALPAKA_STATIC_ACC_MEM_CONSTANT __constant__
-#else
-    #define ALPAKA_STATIC_ACC_MEM_CONSTANT
-#endif
diff --git a/thirdParty/alpaka/include/alpaka/core/Concepts.hpp b/thirdParty/alpaka/include/alpaka/core/Concepts.hpp
deleted file mode 100644
index af3bc4251a..0000000000
--- a/thirdParty/alpaka/include/alpaka/core/Concepts.hpp
+++ /dev/null
@@ -1,92 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#include <type_traits>
-
-namespace alpaka
-{
-    namespace concepts
-    {
-        //#############################################################################
-        //! Tag used in class inheritance hierarchies that describes that a specific concept (TConcept)
-        //! is implemented by the given base class (TBase).
-        template<
-            typename TConcept,
-            typename TBase>
-        struct Implements
-        {
-        };
-
-        //#############################################################################
-        //! Checks whether the concept is implemented by the given class
-        template<
-            typename TConcept,
-            typename TDerived>
-        struct ImplementsConcept {
-            template<
-                typename TBase>
-            static auto implements(Implements<TConcept, TBase>&) -> std::true_type;
-            static auto implements(...) -> std::false_type;
-
-            static constexpr auto value = decltype(implements(std::declval<TDerived&>()))::value;
-        };
-
-        namespace detail
-        {
-            //#############################################################################
-            //! Returns the type that implements the given concept in the inheritance hierarchy.
-            template<
-                typename TConcept,
-                typename TDerived,
-                typename Sfinae = void>
-            struct ImplementationBaseType;
-
-            //#############################################################################
-            //! Base case for types that do not inherit from "Implements<TConcept, ...>" is the type itself.
-            template<
-                typename TConcept,
-                typename TDerived>
-            struct ImplementationBaseType<
-                TConcept,
-                TDerived,
-                typename std::enable_if<!ImplementsConcept<TConcept, TDerived>::value>::type>
-            {
-                using type = TDerived;
-            };
-
-            //#############################################################################
-            //! For types that inherit from "Implements<TConcept, ...>" it finds the base class (TBase) which implements the concept.
-            template<
-                typename TConcept,
-                typename TDerived>
-            struct ImplementationBaseType<
-                TConcept,
-                TDerived,
-                typename std::enable_if<ImplementsConcept<TConcept, TDerived>::value>::type>
-            {
-                template<
-                    typename TBase>
-                static auto implementer(Implements<TConcept, TBase>&) -> TBase;
-
-                using type = decltype(implementer(std::declval<TDerived&>()));
-
-                static_assert(std::is_base_of<type, TDerived>::value, "The type implementing the concept has to be a publicly accessible base class!");
-            };
-        }
-
-        //#############################################################################
-        //! Returns the type that implements the given concept in the inheritance hierarchy.
-        template<
-            typename TConcept,
-            typename TDerived>
-        using ImplementationBase = typename detail::ImplementationBaseType<TConcept, TDerived>::type;
-    }
-}
diff --git a/thirdParty/alpaka/include/alpaka/core/ConcurrentExecPool.hpp b/thirdParty/alpaka/include/alpaka/core/ConcurrentExecPool.hpp
deleted file mode 100644
index 303a512e8e..0000000000
--- a/thirdParty/alpaka/include/alpaka/core/ConcurrentExecPool.hpp
+++ /dev/null
@@ -1,697 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz, René Widera
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-//-----------------------------------------------------------------------------
-// Clang does not support exceptions when natively compiling device code.
-// This is no problem at some places but others explicitly rely on std::exception_ptr,
-// std::current_exception, std::make_exception_ptr, etc. which are not declared in device code.
-// Therefore, we can not even parse those parts when compiling device code.
-//-----------------------------------------------------------------------------
-#include <alpaka/core/Common.hpp>
-
-#include <boost/config.hpp>
-
-#include <queue>
-#include <mutex>
-#include <stdexcept>
-#include <vector>
-#include <exception>
-#include <utility>
-#include <atomic>
-#include <functional>
-#include <memory>
-
-namespace alpaka
-{
-    namespace core
-    {
-        namespace detail
-        {
-            //#############################################################################
-            template<
-                typename T>
-            class ThreadSafeQueue :
-                private std::queue<T>
-            {
-            public:
-                //-----------------------------------------------------------------------------
-                ThreadSafeQueue()
-                {}
-                //-----------------------------------------------------------------------------
-                //! \return If the queue is empty.
-                auto empty() const
-                -> bool
-                {
-                    return std::queue<T>::empty();
-                }
-                //-----------------------------------------------------------------------------
-                //! Pushes the given value onto the back of the queue.
-                auto push(
-                    T && t)
-                -> void
-                {
-                    std::lock_guard<std::mutex> lk(m_Mutex);
-
-                    std::queue<T>::push(std::forward<T>(t));
-                }
-                //-----------------------------------------------------------------------------
-                //! Pops the given value from the front of the queue.
-                auto pop(
-                    T & t)
-                -> bool
-                {
-                    std::lock_guard<std::mutex> lk(m_Mutex);
-
-                    if(std::queue<T>::empty())
-                    {
-                        return false;
-                    }
-                    else
-                    {
-                        t = std::queue<T>::front();
-                        std::queue<T>::pop();
-                        return true;
-                    }
-                }
-
-            private:
-                std::mutex m_Mutex;
-            };
-
-            //#############################################################################
-            //! ITaskPkg.
-            // \NOTE: We can not use C++11 std::packaged_task as it forces the use of std::future
-            // but we additionally support boost::fibers::promise.
-#if BOOST_COMP_CLANG
-    #pragma clang diagnostic push
-    #pragma clang diagnostic ignored "-Wweak-vtables"
-#endif
-            class ITaskPkg
-            {
-            public:
-                //-----------------------------------------------------------------------------
-                virtual ~ITaskPkg() = default;
-
-                //-----------------------------------------------------------------------------
-                //! Runs this task.
-                auto runTask() noexcept
-                -> void
-                {
-                    try
-                    {
-                        run();
-                    }
-                    catch(...)
-                    {
-// Workaround: Clang can not support this when natively compiling device code. See ConcurrentExecPool.hpp.
-#if !(BOOST_COMP_CLANG_CUDA && BOOST_ARCH_PTX)
-                        setException(std::current_exception());
-#endif
-                    }
-                }
-
-            private:
-                //-----------------------------------------------------------------------------
-                //! The execution function.
-                virtual auto run() -> void = 0;
-
-            public:
-// Workaround: Clang can not support this when natively compiling device code. See ConcurrentExecPool.hpp.
-#if !(BOOST_COMP_CLANG_CUDA && BOOST_ARCH_PTX)
-                //-----------------------------------------------------------------------------
-                //! Sets an exception.
-                virtual auto setException(
-                    std::exception_ptr const & exceptPtr)
-                -> void = 0;
-#endif
-            };
-#if BOOST_COMP_CLANG
-    #pragma clang diagnostic pop
-#endif
-
-            //#############################################################################
-            template<
-                template<typename TFnObjReturn> class TPromise,
-                typename TFnObj,
-                typename TFnObjReturn = decltype(std::declval<TFnObj>()())>
-            class TaskPkg;
-
-            //#############################################################################
-            //! TaskPkg with return type.
-            //!
-            //! \tparam TPromise The promise type returned by the task.
-            //! \tparam TFnObj The type of the function to execute.
-            //! \tparam TFnObjReturn The return type of the TFnObj. Used for class specialization.
-            template<
-                template<typename TFnObjReturn> class TPromise,
-                typename TFnObj,
-                typename TFnObjReturn>
-            class TaskPkg final :
-                public ITaskPkg
-            {
-            public:
-                //-----------------------------------------------------------------------------
-                TaskPkg(
-                    TFnObj && func) :
-                        m_Promise(),
-                        m_FnObj(std::move(func))
-                {}
-
-            private:
-                //-----------------------------------------------------------------------------
-                //! The execution function.
-                virtual auto run()
-                -> void final
-                {
-                    m_Promise.set_value(this->m_FnObj());
-                }
-            public:
-// Workaround: Clang can not support this when natively compiling device code. See ConcurrentExecPool.hpp.
-#if !(BOOST_COMP_CLANG_CUDA && BOOST_ARCH_PTX)
-                //-----------------------------------------------------------------------------
-                //! Sets an exception.
-                virtual auto setException(
-                    std::exception_ptr const & exceptPtr)
-                -> void final
-                {
-                    m_Promise.set_exception(exceptPtr);
-                }
-#endif
-                TPromise<TFnObjReturn> m_Promise;
-            private:
-                // NOTE: To avoid invalid memory accesses to memory of a different thread
-                // `std::remove_reference` enforces the function object to be copied.
-                typename std::remove_reference<TFnObj>::type m_FnObj;
-            };
-
-            //#############################################################################
-            //! TaskPkg without return type.
-            //!
-            //! \tparam TPromise The promise type returned by the task.
-            //! \tparam TFnObj The type of the function to execute.
-            template<
-                template<typename TFnObjReturn> class TPromise,
-                typename TFnObj>
-            class TaskPkg<
-                TPromise,
-                TFnObj,
-                void> final :
-                public ITaskPkg
-            {
-            public:
-                //-----------------------------------------------------------------------------
-                TaskPkg(
-                    TFnObj && func) :
-                        m_Promise(),
-                        m_FnObj(std::move(func))
-                {}
-
-            private:
-                //-----------------------------------------------------------------------------
-                //! The execution function.
-                virtual auto run()
-                -> void final
-                {
-                    this->m_FnObj();
-                    m_Promise.set_value();
-                }
-            public:
-// Workaround: Clang can not support this when natively compiling device code. See ConcurrentExecPool.hpp.
-#if !(BOOST_COMP_CLANG_CUDA && BOOST_ARCH_PTX)
-                //-----------------------------------------------------------------------------
-                //! Sets an exception.
-                virtual auto setException(
-                    std::exception_ptr const & exceptPtr)
-                -> void final
-                {
-                    m_Promise.set_exception(exceptPtr);
-                }
-#endif
-                TPromise<void> m_Promise;
-            private:
-                // NOTE: To avoid invalid memory accesses to memory of a different thread
-                // `std::remove_reference` enforces the function object to be copied.
-                typename std::remove_reference<TFnObj>::type m_FnObj;
-            };
-
-            //-----------------------------------------------------------------------------
-            template<
-                typename TFnObj0,
-                typename TFnObj1,
-                typename = typename std::enable_if<!std::is_same<void, decltype(std::declval<TFnObj0>()())>::value>::type>
-            auto invokeBothReturnFirst(
-                    TFnObj0 && fn0,
-                    TFnObj1 && fn1)
-#ifdef BOOST_NO_CXX14_RETURN_TYPE_DEDUCTION
-             -> decltype(std::declval<TFnObj0>()())
-#endif
-            {
-                auto ret = fn0();
-                fn1();
-                return std::move(ret);
-            }
-
-            //-----------------------------------------------------------------------------
-            template<
-                typename TFnObj0,
-                typename TFnObj1,
-                typename = typename std::enable_if<std::is_same<void, decltype(std::declval<TFnObj0>()())>::value>::type>
-            auto invokeBothReturnFirst(
-                    TFnObj0 && fn0,
-                    TFnObj1 && fn1)
-            -> void
-            {
-                fn0();
-                fn1();
-            }
-
-            //#############################################################################
-            //! ConcurrentExecPool using yield.
-            //!
-            //! \tparam TConcurrentExec The type of concurrent executor (for example std::thread).
-            //! \tparam TPromise The promise type returned by the task.
-            //! \tparam TYield The type is required to have a static method "void yield()" to yield the current thread if there is no work.
-            //! \tparam TMutex Unused. The mutex type used for locking threads.
-            //! \tparam TCondVar Unused. The condition variable type used to make the threads wait if there is no work.
-            //! \tparam TisYielding Boolean value if the threads should yield instead of wait for a condition variable.
-            template<
-                typename TIdx,
-                typename TConcurrentExec,
-                template<typename TFnObjReturn> class TPromise,
-                typename TYield,
-                typename TMutex = void,
-                typename TCondVar = void,
-                bool TisYielding = true>
-            class ConcurrentExecPool final
-            {
-            public:
-                //-----------------------------------------------------------------------------
-                //! Creates a concurrent executor pool with a specific number of concurrent executors and a maximum number of queued tasks.
-                //!
-                //! \param concurrentExecutionCount
-                //!    The guaranteed number of concurrent executors used in the pool.
-                //!    This is also the maximum number of tasks worked on concurrently.
-                ConcurrentExecPool(
-                    TIdx concurrentExecutionCount) :
-                    m_vConcurrentExecs(),
-                    m_qTasks(),
-                    m_numActiveTasks(0u),
-                    m_bShutdownFlag(false)
-                {
-                    if(concurrentExecutionCount < 1)
-                    {
-                        throw std::invalid_argument("The argument 'concurrentExecutionCount' has to be greate or equal to one!");
-                    }
-
-                    m_vConcurrentExecs.reserve(static_cast<std::size_t>(concurrentExecutionCount));
-
-                    // Create all concurrent executors.
-                    for(TIdx concurrentExec(0u); concurrentExec < concurrentExecutionCount; ++concurrentExec)
-                    {
-                        m_vConcurrentExecs.emplace_back([this](){concurrentExecFn();});
-                    }
-                }
-                //-----------------------------------------------------------------------------
-                ConcurrentExecPool(ConcurrentExecPool const &) = delete;
-                //-----------------------------------------------------------------------------
-                ConcurrentExecPool(ConcurrentExecPool &&) = delete;
-                //-----------------------------------------------------------------------------
-                auto operator=(ConcurrentExecPool const &) -> ConcurrentExecPool & = delete;
-                //-----------------------------------------------------------------------------
-                auto operator=(ConcurrentExecPool &&) -> ConcurrentExecPool & = delete;
-
-                //-----------------------------------------------------------------------------
-                //! Completes any currently running task normally.
-                //! Signals a std::runtime_error exception to any other tasks that was not able to run.
-                ~ConcurrentExecPool()
-                {
-                    // Signal that concurrent executors should not perform any new work
-                    m_bShutdownFlag.store(true);
-
-                    joinAllConcurrentExecs();
-
-                    auto currentTaskPackage = std::shared_ptr<ITaskPkg>{nullptr};
-
-                    // Signal to each incomplete task that it will not complete due to pool destruction.
-                    while(popTask(currentTaskPackage))
-                    {
-                        auto const except(std::runtime_error("Could not perform task before ConcurrentExecPool destruction"));
-// Workaround: Clang can not support this when natively compiling device code. See ConcurrentExecPool.hpp.
-#if !(BOOST_COMP_CLANG_CUDA && BOOST_ARCH_PTX)
-                        currentTaskPackage->setException(std::make_exception_ptr(except));
-#endif
-                    }
-                }
-
-                //-----------------------------------------------------------------------------
-                //! Runs the given function on one of the pool in First In First Out (FIFO) order.
-                //!
-                //! \tparam TFnObj  The function type.
-                //! \param task     Function object to be called on the pool.
-                //!                 Takes an arbitrary number of arguments and arbitrary return type.
-                //! \tparam TArgs   The argument types pack.
-                //! \param args     Arguments for task, cannot be moved.
-                //!                 If such parameters must be used, use a lambda and capture via move then move the lambda.
-                //!
-                //! \return Signals when the task has completed with either success or an exception.
-                //!         Also results in an exception if the pool is destroyed before execution has begun.
-                template<
-                    typename TFnObj,
-                    typename ... TArgs>
-                auto enqueueTask(
-                    TFnObj && task,
-                    TArgs && ... args)
-#ifdef BOOST_NO_CXX14_RETURN_TYPE_DEDUCTION
-#if BOOST_COMP_GNUC && (BOOST_COMP_GNUC < BOOST_VERSION_NUMBER(5, 0, 0))
-                // FIXME: gcc 4.9 does not support the syntax below. Restricting the return type to void works because we never use something else within alpaka.
-                -> decltype(std::declval<TPromise<void>>().get_future())
-#else
-                -> decltype(std::declval<TPromise<decltype(task(args...))>>().get_future())
-#endif
-#endif
-                {
-                    auto boundTask([=](){return task(args...);});
-                    auto decrementNumActiveTasks([this](){--m_numActiveTasks;});
-
-                    auto extendedTask(
-                        [boundTask, decrementNumActiveTasks]()
-                        {
-                            return
-                                invokeBothReturnFirst(
-                                    std::move(boundTask),
-                                    std::move(decrementNumActiveTasks)
-                                );
-                        });
-
-                    using TaskPackage = TaskPkg<TPromise, decltype(extendedTask)>;
-                    auto pTaskPackage(new TaskPackage(std::move(extendedTask)));
-                    std::shared_ptr<ITaskPkg> upTaskPackage(pTaskPackage);
-
-                    auto future(pTaskPackage->m_Promise.get_future());
-
-                    ++m_numActiveTasks;
-                    m_qTasks.push(std::move(upTaskPackage));
-
-                    return future;
-                }
-                //-----------------------------------------------------------------------------
-                //! \return The number of concurrent executors available.
-                auto getConcurrentExecutionCount() const
-                -> TIdx
-                {
-                    return m_vConcurrentExecs.size();
-                }
-                //-----------------------------------------------------------------------------
-                //! \return If the thread pool is idle.
-                auto isIdle() const
-                -> bool
-                {
-                    return m_numActiveTasks == 0u;
-                }
-
-            private:
-                //-----------------------------------------------------------------------------
-                //! The function the concurrent executors are executing.
-                void concurrentExecFn()
-                {
-                    // Checks whether pool is being destroyed, if so, stop running.
-                    while(!m_bShutdownFlag.load(std::memory_order_relaxed))
-                    {
-                        auto currentTaskPackage = std::shared_ptr<ITaskPkg>{nullptr};
-
-                        // Use popTask so we only ever have one reference to the ITaskPkg
-                        if(popTask(currentTaskPackage))
-                        {
-                            currentTaskPackage->runTask();
-                        }
-                        else
-                        {
-                            TYield::yield();
-                        }
-                    }
-                }
-
-                //-----------------------------------------------------------------------------
-                //! Joins all concurrent executors.
-                void joinAllConcurrentExecs()
-                {
-                    for(auto && concurrentExec : m_vConcurrentExecs)
-                    {
-                        concurrentExec.join();
-                    }
-                }
-                //-----------------------------------------------------------------------------
-                //! Pops a task from the queue.
-                auto popTask(
-                    std::shared_ptr<ITaskPkg> & out)
-                -> bool
-                {
-                    if(m_qTasks.pop(out))
-                    {
-                        return true;
-                    }
-                    return false;
-                }
-
-            private:
-                std::vector<TConcurrentExec> m_vConcurrentExecs;
-                ThreadSafeQueue<std::shared_ptr<ITaskPkg>> m_qTasks;
-                std::atomic<std::uint32_t> m_numActiveTasks;
-                std::atomic<bool> m_bShutdownFlag;
-            };
-
-            //#############################################################################
-            //! ConcurrentExecPool using a condition variable to wait for new work.
-            //!
-            //! \tparam TConcurrentExec The type of concurrent executor (for example std::thread).
-            //! \tparam TPromise The promise type returned by the task.
-            //! \tparam TYield Unused. The type is required to have a static method "void yield()" to yield the current thread if there is no work.
-            //! \tparam TMutex The mutex type used for locking threads.
-            //! \tparam TCondVar The condition variable type used to make the threads wait if there is no work.
-            template<
-                typename TIdx,
-                typename TConcurrentExec,
-                template<typename TFnObjReturn> class TPromise,
-                typename TYield,
-                typename TMutex,
-                typename TCondVar>
-            class ConcurrentExecPool<
-                TIdx,
-                TConcurrentExec,
-                TPromise,
-                TYield,
-                TMutex,
-                TCondVar,
-                false> final
-            {
-            public:
-                //-----------------------------------------------------------------------------
-                //! Creates a concurrent executors pool with a specific number of concurrent executors and a maximum number of queued tasks.
-                //!
-                //! \param concurrentExecutionCount
-                //!    The guaranteed number of concurrent executors used in the pool.
-                //!    This is also the maximum number of tasks worked on concurrently.
-                ConcurrentExecPool(
-                    TIdx concurrentExecutionCount) :
-                    m_vConcurrentExecs(),
-                    m_qTasks(),
-                    m_numActiveTasks(0u),
-                    m_mtxWakeup(),
-                    m_cvWakeup(),
-                    m_bShutdownFlag(false)
-                {
-                    if(concurrentExecutionCount < 1)
-                    {
-                        throw std::invalid_argument("The argument 'concurrentExecutionCount' has to be greate or equal to one!");
-                    }
-
-                    m_vConcurrentExecs.reserve(static_cast<std::size_t>(concurrentExecutionCount));
-
-                    // Create all concurrent executors.
-                    for(TIdx concurrentExec(0u); concurrentExec < concurrentExecutionCount; ++concurrentExec)
-                    {
-                        m_vConcurrentExecs.emplace_back([this](){concurrentExecFn();});
-                    }
-                }
-                //-----------------------------------------------------------------------------
-                ConcurrentExecPool(ConcurrentExecPool const &) = delete;
-                //-----------------------------------------------------------------------------
-                ConcurrentExecPool(ConcurrentExecPool &&) = delete;
-                //-----------------------------------------------------------------------------
-                auto operator=(ConcurrentExecPool const &) -> ConcurrentExecPool & = delete;
-                //-----------------------------------------------------------------------------
-                auto operator=(ConcurrentExecPool &&) -> ConcurrentExecPool & = delete;
-
-                //-----------------------------------------------------------------------------
-                //! Completes any currently running task normally.
-                //! Signals a std::runtime_error exception to any other tasks that was not able to run.
-                ~ConcurrentExecPool()
-                {
-                    {
-                        std::unique_lock<TMutex> lock(m_mtxWakeup);
-
-                        // Signal that concurrent executors should not perform any new work
-                        m_bShutdownFlag = true;
-                    }
-
-                    m_cvWakeup.notify_all();
-
-                    joinAllConcurrentExecs();
-
-                    auto currentTaskPackage = std::shared_ptr<ITaskPkg>{nullptr};
-
-                    // Signal to each incomplete task that it will not complete due to pool destruction.
-                    while(popTask(currentTaskPackage))
-                    {
-                        auto const except(std::runtime_error("Could not perform task before ConcurrentExecPool destruction"));
-// Workaround: Clang can not support this when natively compiling device code. See ConcurrentExecPool.hpp.
-#if !(BOOST_COMP_CLANG_CUDA && BOOST_ARCH_PTX)
-                        currentTaskPackage->setException(std::make_exception_ptr(except));
-#endif
-                    }
-                }
-
-                //-----------------------------------------------------------------------------
-                //! Runs the given function on one of the pool in First In First Out (FIFO) order.
-                //!
-                //! \tparam TFnObj  The function type.
-                //! \param task     Function object to be called on the pool.
-                //!                 Takes an arbitrary number of arguments and arbitrary return type.
-                //! \tparam TArgs   The argument types pack.
-                //! \param args     Arguments for task, cannot be moved.
-                //!                 If such parameters must be used, use a lambda and capture via move then move the lambda.
-                //!
-                //! \return Signals when the task has completed with either success or an exception.
-                //!         Also results in an exception if the pool is destroyed before execution has begun.
-                template<
-                    typename TFnObj,
-                    typename ... TArgs>
-                auto enqueueTask(
-                    TFnObj && task,
-                    TArgs && ... args)
-#ifdef BOOST_NO_CXX14_RETURN_TYPE_DEDUCTION
-#if BOOST_COMP_GNUC && (BOOST_COMP_GNUC < BOOST_VERSION_NUMBER(5, 0, 0))
-                // FIXME: gcc 4.9 does not support the syntax below. Restricting the return type to void works because we never use something else within alpaka.
-                -> decltype(std::declval<TPromise<void>>().get_future())
-#else
-                -> decltype(std::declval<TPromise<decltype(task(args...))>>().get_future())
-#endif
-#endif
-                {
-                    auto boundTask([=](){return task(args...);});
-                    auto decrementNumActiveTasks([this](){--m_numActiveTasks;});
-
-                    auto extendedTask(
-                        [boundTask, decrementNumActiveTasks]()
-                        {
-                            return
-                                invokeBothReturnFirst(
-                                    std::move(boundTask),
-                                    std::move(decrementNumActiveTasks)
-                                );
-                        });
-
-                    using TaskPackage = TaskPkg<TPromise, decltype(extendedTask)>;
-                    auto pTaskPackage(new TaskPackage(std::move(extendedTask)));
-                    std::shared_ptr<ITaskPkg> upTaskPackage(pTaskPackage);
-
-                    auto future(pTaskPackage->m_Promise.get_future());
-
-                    ++m_numActiveTasks;
-                    {
-                        std::lock_guard<TMutex> lock(m_mtxWakeup);
-                        m_qTasks.push(std::move(upTaskPackage));
-
-                        m_cvWakeup.notify_one();
-                    }
-
-                    return future;
-                }
-                //-----------------------------------------------------------------------------
-                //! \return The number of concurrent executors available.
-                auto getConcurrentExecutionCount() const
-                -> TIdx
-                {
-                    return m_vConcurrentExecs.size();
-                }
-                //-----------------------------------------------------------------------------
-                //! \return If the thread pool is idle.
-                auto isIdle() const
-                -> bool
-                {
-                    return m_numActiveTasks == 0u;
-                }
-
-            private:
-                //-----------------------------------------------------------------------------
-                //! The function the concurrent executors are executing.
-                void concurrentExecFn()
-                {
-                    // Checks whether pool is being destroyed, if so, stop running (lazy check without mutex).
-                    while(!m_bShutdownFlag)
-                    {
-                        auto currentTaskPackage = std::shared_ptr<ITaskPkg>{nullptr};
-
-                        // Use popTask so we only ever have one reference to the ITaskPkg
-                        if(popTask(currentTaskPackage))
-                        {
-                            currentTaskPackage->runTask();
-                        }
-                        {
-                            std::unique_lock<TMutex> lock(m_mtxWakeup);
-                            if(m_qTasks.empty())
-                            {
-                                // If the shutdown flag has been set since the last check, return now.
-                                if(m_bShutdownFlag)
-                                {
-                                    return;
-                                }
-
-                                m_cvWakeup.wait(lock, [this]() { return ((!m_qTasks.empty()) || m_bShutdownFlag); });
-                            }
-                        }
-                    }
-                }
-
-                //-----------------------------------------------------------------------------
-                //! Joins all concurrent executors.
-                void joinAllConcurrentExecs()
-                {
-                    for(auto && concurrentExec : m_vConcurrentExecs)
-                    {
-                        concurrentExec.join();
-                    }
-                }
-                //-----------------------------------------------------------------------------
-                //! Pops a task from the queue.
-                auto popTask(
-                    std::shared_ptr<ITaskPkg> & out)
-                -> bool
-                {
-                    if(m_qTasks.pop(out))
-                    {
-                        return true;
-                    }
-                    return false;
-                }
-
-            private:
-                std::vector<TConcurrentExec> m_vConcurrentExecs;
-                ThreadSafeQueue<std::shared_ptr<ITaskPkg>> m_qTasks;
-                std::atomic<std::uint32_t> m_numActiveTasks;
-
-                TMutex m_mtxWakeup;
-                TCondVar m_cvWakeup;
-                std::atomic<bool> m_bShutdownFlag;
-            };
-        }
-    }
-}
diff --git a/thirdParty/alpaka/include/alpaka/core/Cuda.hpp b/thirdParty/alpaka/include/alpaka/core/Cuda.hpp
deleted file mode 100644
index 4ecb81d5e1..0000000000
--- a/thirdParty/alpaka/include/alpaka/core/Cuda.hpp
+++ /dev/null
@@ -1,768 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Matthias Werner, René Widera
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_CUDA
-    #error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
-#endif
-
-#include <alpaka/elem/Traits.hpp>
-#include <alpaka/offset/Traits.hpp>
-#include <alpaka/extent/Traits.hpp>
-#include <alpaka/idx/Traits.hpp>
-#include <alpaka/vec/Vec.hpp>
-#include <alpaka/meta/IntegerSequence.hpp>
-#include <alpaka/meta/Metafunctions.hpp>
-
-// cuda_runtime_api.h: CUDA Runtime API C-style interface that does not require compiling with nvcc.
-// cuda_runtime.h: CUDA Runtime API  C++-style interface built on top of the C API.
-//  It wraps some of the C API routines, using overloading, references and default arguments.
-//  These wrappers can be used from C++ code and can be compiled with any C++ compiler.
-//  The C++ API also has some CUDA-specific wrappers that wrap C API routines that deal with symbols, textures, and device functions.
-//  These wrappers require the use of \p nvcc because they depend on code being generated by the compiler.
-//  For example, the execution configuration syntax to invoke kernels is only available in source code compiled with nvcc.
-#include <cuda_runtime.h>
-#include <cuda.h>
-
-#include <array>
-#include <type_traits>
-#include <utility>
-#include <iostream>
-#include <string>
-#include <stdexcept>
-#include <cstddef>
-
-#if (!defined(CUDART_VERSION) || (CUDART_VERSION < 8000))
-    #error "CUDA version 8.0 or greater required!"
-#endif
-
-#if (!defined(CUDA_VERSION) || (CUDA_VERSION < 8000))
-    #error "CUDA version 8.0 or greater required!"
-#endif
-
-namespace alpaka
-{
-    namespace cuda
-    {
-        namespace detail
-        {
-            //-----------------------------------------------------------------------------
-            //! CUDA runtime API error checking with log and exception, ignoring specific error values
-            ALPAKA_FN_HOST inline auto cudaRtCheck(
-                cudaError_t const & error,
-                char const * desc,
-                char const * file,
-                int const & line)
-            -> void
-            {
-                if(error != cudaSuccess)
-                {
-                    std::string const sError(std::string(file) + "(" + std::to_string(line) + ") " + std::string(desc) + " : '" + cudaGetErrorName(error) +  "': '" + std::string(cudaGetErrorString(error)) + "'!");
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
-                    std::cerr << sError << std::endl;
-#endif
-                    ALPAKA_DEBUG_BREAK;
-                    // reset the last error to allow user side error handling
-                    cudaGetLastError();
-                    throw std::runtime_error(sError);
-                }
-            }
-            //-----------------------------------------------------------------------------
-            //! CUDA runtime API error checking with log and exception, ignoring specific error values
-            // NOTE: All ignored errors have to be convertible to cudaError_t.
-            template<
-                typename... TErrors/*,
-                typename = typename std::enable_if<
-                    meta::Conjunction<
-                        std::true_type,
-                        std::is_convertible<
-                            TErrors,
-                            cudaError_t
-                        >...
-                    >::value>::type*/>
-            ALPAKA_FN_HOST auto cudaRtCheckIgnore(
-                cudaError_t const & error,
-                char const * cmd,
-                char const * file,
-                int const & line,
-                TErrors && ... ignoredErrorCodes)
-            -> void
-            {
-                if(error != cudaSuccess)
-                {
-                    // https://stackoverflow.com/questions/18792731/can-we-omit-the-double-braces-for-stdarray-in-c14/18792782#18792782
-                    std::array<cudaError_t, sizeof...(ignoredErrorCodes)> const aIgnoredErrorCodes{{ignoredErrorCodes...}};
-
-                    // If the error code is not one of the ignored ones.
-                    if(std::find(aIgnoredErrorCodes.cbegin(), aIgnoredErrorCodes.cend(), error) == aIgnoredErrorCodes.cend())
-                    {
-                        cudaRtCheck(error, ("'" + std::string(cmd) + "' returned error ").c_str(), file, line);
-                    }
-                }
-            }
-            //-----------------------------------------------------------------------------
-            //! CUDA runtime API last error checking with log and exception.
-            ALPAKA_FN_HOST inline auto cudaRtCheckLastError(
-                char const * desc,
-                char const * file,
-                int const & line)
-            -> void
-            {
-                cudaError_t const error(cudaGetLastError());
-                cudaRtCheck(error, desc, file, line);
-            }
-        }
-    }
-}
-
-#if BOOST_COMP_MSVC
-    //-----------------------------------------------------------------------------
-    //! CUDA runtime error checking with log and exception, ignoring specific error values
-    #define ALPAKA_CUDA_RT_CHECK_IGNORE(cmd, ...)\
-        ::alpaka::cuda::detail::cudaRtCheckLastError("'" #cmd "' A previous CUDA call (not this one) set the error ", __FILE__, __LINE__);\
-        ::alpaka::cuda::detail::cudaRtCheckIgnore(cmd, #cmd, __FILE__, __LINE__, __VA_ARGS__)
-#else
-    #if BOOST_COMP_CLANG
-        #pragma clang diagnostic push
-        #pragma clang diagnostic ignored "-Wgnu-zero-variadic-macro-arguments"
-    #endif
-    //-----------------------------------------------------------------------------
-    //! CUDA runtime error checking with log and exception, ignoring specific error values
-    #define ALPAKA_CUDA_RT_CHECK_IGNORE(cmd, ...)\
-        ::alpaka::cuda::detail::cudaRtCheckLastError("'" #cmd "' A previous CUDA call (not this one) set the error ", __FILE__, __LINE__);\
-        ::alpaka::cuda::detail::cudaRtCheckIgnore(cmd, #cmd, __FILE__, __LINE__, ##__VA_ARGS__)
-    #if BOOST_COMP_CLANG
-        #pragma clang diagnostic pop
-    #endif
-#endif
-
-//-----------------------------------------------------------------------------
-//! CUDA runtime error checking with log and exception.
-#define ALPAKA_CUDA_RT_CHECK(cmd)\
-    ALPAKA_CUDA_RT_CHECK_IGNORE(cmd)
-
-namespace alpaka
-{
-    namespace cuda
-    {
-        namespace detail
-        {
-            //-----------------------------------------------------------------------------
-            //! CUDA driver API error checking with log and exception, ignoring specific error values
-            ALPAKA_FN_HOST inline auto cudaDrvCheck(
-                CUresult const & error,
-                char const * desc,
-                char const * file,
-                int const & line)
-            -> void
-            {
-                if(error == CUDA_SUCCESS)
-                    return;
-
-                char const * cu_err_name = nullptr;
-                char const * cu_err_string = nullptr;
-                CUresult cu_result_name = cuGetErrorName(error, &cu_err_name);
-                CUresult cu_result_string = cuGetErrorString(error, &cu_err_string);
-                std::string sError = std::string(file)
-                                   + "(" + std::to_string(line) + ") "
-                                   + std::string(desc) + " : '";
-                if( cu_result_name == CUDA_SUCCESS && cu_result_string == CUDA_SUCCESS )
-                {
-                    sError += std::string(cu_err_name) +  "': '"
-                            + std::string(cu_err_string) + "'!";
-                } else {
-                    // cuGetError*() failed, so append corresponding error message
-                    if( cu_result_name == CUDA_ERROR_INVALID_VALUE ) {
-                        sError += " cuGetErrorName: 'Invalid Value'!";
-                    }
-                    if( cu_result_string == CUDA_ERROR_INVALID_VALUE ) {
-                        sError += " cuGetErrorString: 'Invalid Value'!";
-                    }
-                }
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
-                std::cerr << sError << std::endl;
-#endif
-                ALPAKA_DEBUG_BREAK;
-                throw std::runtime_error(sError);
-            }
-        }
-    }
-}
-
-//-----------------------------------------------------------------------------
-//! CUDA driver error checking with log and exception.
-#define ALPAKA_CUDA_DRV_CHECK(cmd)\
-    ::alpaka::cuda::detail::cudaDrvCheck(cmd, #cmd, __FILE__, __LINE__)
-
-
-//-----------------------------------------------------------------------------
-// CUDA vector_types.h trait specializations.
-namespace alpaka
-{
-    //-----------------------------------------------------------------------------
-    //! The CUDA specifics.
-    namespace cuda
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CUDA vectors 1D dimension get trait specialization.
-            template<
-                typename T>
-            struct IsCudaBuiltInType :
-                std::integral_constant<
-                    bool,
-                    std::is_same<T, char1>::value
-                    || std::is_same<T, double1>::value
-                    || std::is_same<T, float1>::value
-                    || std::is_same<T, int1>::value
-                    || std::is_same<T, long1>::value
-                    || std::is_same<T, longlong1>::value
-                    || std::is_same<T, short1>::value
-                    || std::is_same<T, uchar1>::value
-                    || std::is_same<T, uint1>::value
-                    || std::is_same<T, ulong1>::value
-                    || std::is_same<T, ulonglong1>::value
-                    || std::is_same<T, ushort1>::value
-                    || std::is_same<T, char2>::value
-                    || std::is_same<T, double2>::value
-                    || std::is_same<T, float2>::value
-                    || std::is_same<T, int2>::value
-                    || std::is_same<T, long2>::value
-                    || std::is_same<T, longlong2>::value
-                    || std::is_same<T, short2>::value
-                    || std::is_same<T, uchar2>::value
-                    || std::is_same<T, uint2>::value
-                    || std::is_same<T, ulong2>::value
-                    || std::is_same<T, ulonglong2>::value
-                    || std::is_same<T, ushort2>::value
-                    || std::is_same<T, char3>::value
-                    || std::is_same<T, dim3>::value
-                    || std::is_same<T, double3>::value
-                    || std::is_same<T, float3>::value
-                    || std::is_same<T, int3>::value
-                    || std::is_same<T, long3>::value
-                    || std::is_same<T, longlong3>::value
-                    || std::is_same<T, short3>::value
-                    || std::is_same<T, uchar3>::value
-                    || std::is_same<T, uint3>::value
-                    || std::is_same<T, ulong3>::value
-                    || std::is_same<T, ulonglong3>::value
-                    || std::is_same<T, ushort3>::value
-                    || std::is_same<T, char4>::value
-                    || std::is_same<T, double4>::value
-                    || std::is_same<T, float4>::value
-                    || std::is_same<T, int4>::value
-                    || std::is_same<T, long4>::value
-                    || std::is_same<T, longlong4>::value
-                    || std::is_same<T, short4>::value
-                    || std::is_same<T, uchar4>::value
-                    || std::is_same<T, uint4>::value
-                    || std::is_same<T, ulong4>::value
-                    || std::is_same<T, ulonglong4>::value
-                    || std::is_same<T, ushort4>::value
-// CUDA built-in variables have special types in clang native CUDA compilation
-// defined in cuda_builtin_vars.h
-#if BOOST_COMP_CLANG_CUDA
-                    || std::is_same<T, __cuda_builtin_threadIdx_t>::value
-                    || std::is_same<T, __cuda_builtin_blockIdx_t>::value
-                    || std::is_same<T, __cuda_builtin_blockDim_t>::value
-                    || std::is_same<T, __cuda_builtin_gridDim_t>::value
-#endif
-                >
-            {};
-        }
-    }
-    namespace dim
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CUDA vectors 1D dimension get trait specialization.
-            template<
-                typename T>
-            struct DimType<
-                T,
-                typename std::enable_if<
-                    std::is_same<T, char1>::value
-                    || std::is_same<T, double1>::value
-                    || std::is_same<T, float1>::value
-                    || std::is_same<T, int1>::value
-                    || std::is_same<T, long1>::value
-                    || std::is_same<T, longlong1>::value
-                    || std::is_same<T, short1>::value
-                    || std::is_same<T, uchar1>::value
-                    || std::is_same<T, uint1>::value
-                    || std::is_same<T, ulong1>::value
-                    || std::is_same<T, ulonglong1>::value
-                    || std::is_same<T, ushort1>::value
-                >::type>
-            {
-                using type = dim::DimInt<1u>;
-            };
-            //#############################################################################
-            //! The CUDA vectors 2D dimension get trait specialization.
-            template<
-                typename T>
-            struct DimType<
-                T,
-                typename std::enable_if<
-                    std::is_same<T, char2>::value
-                    || std::is_same<T, double2>::value
-                    || std::is_same<T, float2>::value
-                    || std::is_same<T, int2>::value
-                    || std::is_same<T, long2>::value
-                    || std::is_same<T, longlong2>::value
-                    || std::is_same<T, short2>::value
-                    || std::is_same<T, uchar2>::value
-                    || std::is_same<T, uint2>::value
-                    || std::is_same<T, ulong2>::value
-                    || std::is_same<T, ulonglong2>::value
-                    || std::is_same<T, ushort2>::value
-                >::type>
-            {
-                using type = dim::DimInt<2u>;
-            };
-            //#############################################################################
-            //! The CUDA vectors 3D dimension get trait specialization.
-            template<
-                typename T>
-            struct DimType<
-                T,
-                typename std::enable_if<
-                    std::is_same<T, char3>::value
-                    || std::is_same<T, dim3>::value
-                    || std::is_same<T, double3>::value
-                    || std::is_same<T, float3>::value
-                    || std::is_same<T, int3>::value
-                    || std::is_same<T, long3>::value
-                    || std::is_same<T, longlong3>::value
-                    || std::is_same<T, short3>::value
-                    || std::is_same<T, uchar3>::value
-                    || std::is_same<T, uint3>::value
-                    || std::is_same<T, ulong3>::value
-                    || std::is_same<T, ulonglong3>::value
-                    || std::is_same<T, ushort3>::value
-#if BOOST_COMP_CLANG_CUDA
-                    || std::is_same<T, __cuda_builtin_threadIdx_t>::value
-                    || std::is_same<T, __cuda_builtin_blockIdx_t>::value
-                    || std::is_same<T, __cuda_builtin_blockDim_t>::value
-                    || std::is_same<T, __cuda_builtin_gridDim_t>::value
-#endif
-                >::type>
-            {
-                using type = dim::DimInt<3u>;
-            };
-            //#############################################################################
-            //! The CUDA vectors 4D dimension get trait specialization.
-            template<
-                typename T>
-            struct DimType<
-                T,
-                typename std::enable_if<
-                    std::is_same<T, char4>::value
-                    || std::is_same<T, double4>::value
-                    || std::is_same<T, float4>::value
-                    || std::is_same<T, int4>::value
-                    || std::is_same<T, long4>::value
-                    || std::is_same<T, longlong4>::value
-                    || std::is_same<T, short4>::value
-                    || std::is_same<T, uchar4>::value
-                    || std::is_same<T, uint4>::value
-                    || std::is_same<T, ulong4>::value
-                    || std::is_same<T, ulonglong4>::value
-                    || std::is_same<T, ushort4>::value
-                >::type>
-            {
-                using type = dim::DimInt<4u>;
-            };
-        }
-    }
-    namespace elem
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CUDA vectors elem type trait specialization.
-            template<
-                typename T>
-            struct ElemType<
-                T,
-                typename std::enable_if<
-                    cuda::traits::IsCudaBuiltInType<T>::value>::type>
-            {
-                using type = decltype(std::declval<T>().x);
-            };
-        }
-    }
-    namespace extent
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CUDA vectors extent get trait specialization.
-            template<
-                typename TExtent>
-            struct GetExtent<
-                dim::DimInt<dim::Dim<TExtent>::value - 1u>,
-                TExtent,
-                typename std::enable_if<
-                    cuda::traits::IsCudaBuiltInType<TExtent>::value
-                    && (dim::Dim<TExtent>::value >= 1)>::type>
-            {
-                ALPAKA_NO_HOST_ACC_WARNING
-                ALPAKA_FN_HOST_ACC static auto getExtent(
-                    TExtent const & extent)
-                -> decltype(extent.x)
-                {
-                    return extent.x;
-                }
-            };
-            //#############################################################################
-            //! The CUDA vectors extent get trait specialization.
-            template<
-                typename TExtent>
-            struct GetExtent<
-                dim::DimInt<dim::Dim<TExtent>::value - 2u>,
-                TExtent,
-                typename std::enable_if<
-                    cuda::traits::IsCudaBuiltInType<TExtent>::value
-                    && (dim::Dim<TExtent>::value >= 2)>::type>
-            {
-                ALPAKA_NO_HOST_ACC_WARNING
-                ALPAKA_FN_HOST_ACC static auto getExtent(
-                    TExtent const & extent)
-                -> decltype(extent.y)
-                {
-                    return extent.y;
-                }
-            };
-            //#############################################################################
-            //! The CUDA vectors extent get trait specialization.
-            template<
-                typename TExtent>
-            struct GetExtent<
-                dim::DimInt<dim::Dim<TExtent>::value - 3u>,
-                TExtent,
-                typename std::enable_if<
-                    cuda::traits::IsCudaBuiltInType<TExtent>::value
-                    && (dim::Dim<TExtent>::value >= 3)>::type>
-            {
-                ALPAKA_NO_HOST_ACC_WARNING
-                ALPAKA_FN_HOST_ACC static auto getExtent(
-                    TExtent const & extent)
-                -> decltype(extent.z)
-                {
-                    return extent.z;
-                }
-            };
-            //#############################################################################
-            //! The CUDA vectors extent get trait specialization.
-            template<
-                typename TExtent>
-            struct GetExtent<
-                dim::DimInt<dim::Dim<TExtent>::value - 4u>,
-                TExtent,
-                typename std::enable_if<
-                    cuda::traits::IsCudaBuiltInType<TExtent>::value
-                    && (dim::Dim<TExtent>::value >= 4)>::type>
-            {
-                ALPAKA_NO_HOST_ACC_WARNING
-                ALPAKA_FN_HOST_ACC static auto getExtent(
-                    TExtent const & extent)
-                -> decltype(extent.w)
-                {
-                    return extent.w;
-                }
-            };
-            //#############################################################################
-            //! The CUDA vectors extent set trait specialization.
-            template<
-                typename TExtent,
-                typename TExtentVal>
-            struct SetExtent<
-                dim::DimInt<dim::Dim<TExtent>::value - 1u>,
-                TExtent,
-                TExtentVal,
-                typename std::enable_if<
-                    cuda::traits::IsCudaBuiltInType<TExtent>::value
-                    && (dim::Dim<TExtent>::value >= 1)>::type>
-            {
-                ALPAKA_NO_HOST_ACC_WARNING
-                ALPAKA_FN_HOST_ACC static auto setExtent(
-                    TExtent const & extent,
-                    TExtentVal const & extentVal)
-                -> void
-                {
-                    extent.x = extentVal;
-                }
-            };
-            //#############################################################################
-            //! The CUDA vectors extent set trait specialization.
-            template<
-                typename TExtent,
-                typename TExtentVal>
-            struct SetExtent<
-                dim::DimInt<dim::Dim<TExtent>::value - 2u>,
-                TExtent,
-                TExtentVal,
-                typename std::enable_if<
-                    cuda::traits::IsCudaBuiltInType<TExtent>::value
-                    && (dim::Dim<TExtent>::value >= 2)>::type>
-            {
-                ALPAKA_NO_HOST_ACC_WARNING
-                ALPAKA_FN_HOST_ACC static auto setExtent(
-                    TExtent const & extent,
-                    TExtentVal const & extentVal)
-                -> void
-                {
-                    extent.y = extentVal;
-                }
-            };
-            //#############################################################################
-            //! The CUDA vectors extent set trait specialization.
-            template<
-                typename TExtent,
-                typename TExtentVal>
-            struct SetExtent<
-                dim::DimInt<dim::Dim<TExtent>::value - 3u>,
-                TExtent,
-                TExtentVal,
-                typename std::enable_if<
-                    cuda::traits::IsCudaBuiltInType<TExtent>::value
-                    && (dim::Dim<TExtent>::value >= 3)>::type>
-            {
-                ALPAKA_NO_HOST_ACC_WARNING
-                ALPAKA_FN_HOST_ACC static auto setExtent(
-                    TExtent const & extent,
-                    TExtentVal const & extentVal)
-                -> void
-                {
-                    extent.z = extentVal;
-                }
-            };
-            //#############################################################################
-            //! The CUDA vectors extent set trait specialization.
-            template<
-                typename TExtent,
-                typename TExtentVal>
-            struct SetExtent<
-                dim::DimInt<dim::Dim<TExtent>::value - 4u>,
-                TExtent,
-                TExtentVal,
-                typename std::enable_if<
-                    cuda::traits::IsCudaBuiltInType<TExtent>::value
-                    && (dim::Dim<TExtent>::value >= 4)>::type>
-            {
-                ALPAKA_NO_HOST_ACC_WARNING
-                ALPAKA_FN_HOST_ACC static auto setExtent(
-                    TExtent const & extent,
-                    TExtentVal const & extentVal)
-                -> void
-                {
-                    extent.w = extentVal;
-                }
-            };
-        }
-    }
-    namespace offset
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CUDA vectors offset get trait specialization.
-            template<
-                typename TOffsets>
-            struct GetOffset<
-                dim::DimInt<dim::Dim<TOffsets>::value - 1u>,
-                TOffsets,
-                typename std::enable_if<
-                    cuda::traits::IsCudaBuiltInType<TOffsets>::value
-                    && (dim::Dim<TOffsets>::value >= 1)>::type>
-            {
-                ALPAKA_NO_HOST_ACC_WARNING
-                ALPAKA_FN_HOST_ACC static auto getOffset(
-                    TOffsets const & offsets)
-                -> decltype(offsets.x)
-                {
-                    return offsets.x;
-                }
-            };
-            //#############################################################################
-            //! The CUDA vectors offset get trait specialization.
-            template<
-                typename TOffsets>
-            struct GetOffset<
-                dim::DimInt<dim::Dim<TOffsets>::value - 2u>,
-                TOffsets,
-                typename std::enable_if<
-                    cuda::traits::IsCudaBuiltInType<TOffsets>::value
-                    && (dim::Dim<TOffsets>::value >= 2)>::type>
-            {
-                ALPAKA_NO_HOST_ACC_WARNING
-                ALPAKA_FN_HOST_ACC static auto getOffset(
-                    TOffsets const & offsets)
-                -> decltype(offsets.y)
-                {
-                    return offsets.y;
-                }
-            };
-            //#############################################################################
-            //! The CUDA vectors offset get trait specialization.
-            template<
-                typename TOffsets>
-            struct GetOffset<
-                dim::DimInt<dim::Dim<TOffsets>::value - 3u>,
-                TOffsets,
-                typename std::enable_if<
-                    cuda::traits::IsCudaBuiltInType<TOffsets>::value
-                    && (dim::Dim<TOffsets>::value >= 3)>::type>
-            {
-                ALPAKA_NO_HOST_ACC_WARNING
-                ALPAKA_FN_HOST_ACC static auto getOffset(
-                    TOffsets const & offsets)
-                -> decltype(offsets.z)
-                {
-                    return offsets.z;
-                }
-            };
-            //#############################################################################
-            //! The CUDA vectors offset get trait specialization.
-            template<
-                typename TOffsets>
-            struct GetOffset<
-                dim::DimInt<dim::Dim<TOffsets>::value - 4u>,
-                TOffsets,
-                typename std::enable_if<
-                    cuda::traits::IsCudaBuiltInType<TOffsets>::value
-                    && (dim::Dim<TOffsets>::value >= 4)>::type>
-            {
-                ALPAKA_NO_HOST_ACC_WARNING
-                ALPAKA_FN_HOST_ACC static auto getOffset(
-                    TOffsets const & offsets)
-                -> decltype(offsets.w)
-                {
-                    return offsets.w;
-                }
-            };
-            //#############################################################################
-            //! The CUDA vectors offset set trait specialization.
-            template<
-                typename TOffsets,
-                typename TOffset>
-            struct SetOffset<
-                dim::DimInt<dim::Dim<TOffsets>::value - 1u>,
-                TOffsets,
-                TOffset,
-                typename std::enable_if<
-                    cuda::traits::IsCudaBuiltInType<TOffsets>::value
-                    && (dim::Dim<TOffsets>::value >= 1)>::type>
-            {
-                ALPAKA_NO_HOST_ACC_WARNING
-                ALPAKA_FN_HOST_ACC static auto setOffset(
-                    TOffsets const & offsets,
-                    TOffset const & offset)
-                -> void
-                {
-                    offsets.x = offset;
-                }
-            };
-            //#############################################################################
-            //! The CUDA vectors offset set trait specialization.
-            template<
-                typename TOffsets,
-                typename TOffset>
-            struct SetOffset<
-                dim::DimInt<dim::Dim<TOffsets>::value - 2u>,
-                TOffsets,
-                TOffset,
-                typename std::enable_if<
-                    cuda::traits::IsCudaBuiltInType<TOffsets>::value
-                    && (dim::Dim<TOffsets>::value >= 2)>::type>
-            {
-                ALPAKA_NO_HOST_ACC_WARNING
-                ALPAKA_FN_HOST_ACC static auto setOffset(
-                    TOffsets const & offsets,
-                    TOffset const & offset)
-                -> void
-                {
-                    offsets.y = offset;
-                }
-            };
-            //#############################################################################
-            //! The CUDA vectors offset set trait specialization.
-            template<
-                typename TOffsets,
-                typename TOffset>
-            struct SetOffset<
-                dim::DimInt<dim::Dim<TOffsets>::value - 3u>,
-                TOffsets,
-                TOffset,
-                typename std::enable_if<
-                    cuda::traits::IsCudaBuiltInType<TOffsets>::value
-                    && (dim::Dim<TOffsets>::value >= 3)>::type>
-            {
-                ALPAKA_NO_HOST_ACC_WARNING
-                ALPAKA_FN_HOST_ACC static auto setOffset(
-                    TOffsets const & offsets,
-                    TOffset const & offset)
-                -> void
-                {
-                    offsets.z = offset;
-                }
-            };
-            //#############################################################################
-            //! The CUDA vectors offset set trait specialization.
-            template<
-                typename TOffsets,
-                typename TOffset>
-            struct SetOffset<
-                dim::DimInt<dim::Dim<TOffsets>::value - 4u>,
-                TOffsets,
-                TOffset,
-                typename std::enable_if<
-                    cuda::traits::IsCudaBuiltInType<TOffsets>::value
-                    && (dim::Dim<TOffsets>::value >= 4)>::type>
-            {
-                ALPAKA_NO_HOST_ACC_WARNING
-                ALPAKA_FN_HOST_ACC static auto setOffset(
-                    TOffsets const & offsets,
-                    TOffset const & offset)
-                -> void
-                {
-                    offsets.w = offset;
-                }
-            };
-        }
-    }
-    namespace idx
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CUDA vectors idx type trait specialization.
-            template<
-                typename TIdx>
-            struct IdxType<
-                TIdx,
-                typename std::enable_if<
-                    cuda::traits::IsCudaBuiltInType<TIdx>::value>::type>
-            {
-                using type = std::size_t;
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/alpaka/include/alpaka/core/Debug.hpp b/thirdParty/alpaka/include/alpaka/core/Debug.hpp
deleted file mode 100644
index def86d5e56..0000000000
--- a/thirdParty/alpaka/include/alpaka/core/Debug.hpp
+++ /dev/null
@@ -1,105 +0,0 @@
-/* Copyright 2019 Alexander Matthes, Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#include <string>
-#include <iostream>
-
-//-----------------------------------------------------------------------------
-//! The no debug level.
-#define ALPAKA_DEBUG_DISABLED 0
-//-----------------------------------------------------------------------------
-//! The minimal debug level.
-#define ALPAKA_DEBUG_MINIMAL 1
-//-----------------------------------------------------------------------------
-//! The full debug level.
-#define ALPAKA_DEBUG_FULL 2
-
-#ifndef ALPAKA_DEBUG
-    //-----------------------------------------------------------------------------
-    //! Set the minimum log level if it is not defined.
-    #define ALPAKA_DEBUG ALPAKA_DEBUG_DISABLED
-#endif
-
-namespace alpaka
-{
-    namespace core
-    {
-        namespace detail
-        {
-            //#############################################################################
-            //! Scope logger.
-            class ScopeLogStdOut final
-            {
-            public:
-                //-----------------------------------------------------------------------------
-                ScopeLogStdOut(
-                    std::string const & sScope) :
-                        m_sScope(sScope)
-                {
-                    std::cout << "[+] " << m_sScope << std::endl;
-                }
-                //-----------------------------------------------------------------------------
-                ScopeLogStdOut(ScopeLogStdOut const &) = delete;
-                //-----------------------------------------------------------------------------
-                ScopeLogStdOut(ScopeLogStdOut &&) = delete;
-                //-----------------------------------------------------------------------------
-                auto operator=(ScopeLogStdOut const &) -> ScopeLogStdOut & = delete;
-                //-----------------------------------------------------------------------------
-                auto operator=(ScopeLogStdOut &&) -> ScopeLogStdOut & = delete;
-                //-----------------------------------------------------------------------------
-                ~ScopeLogStdOut()
-                {
-                    std::cout << "[-] " << m_sScope << std::endl;
-                }
-
-            private:
-                std::string const m_sScope;
-            };
-        }
-    }
-}
-
-//-----------------------------------------------------------------------------
-// Define ALPAKA_DEBUG_MINIMAL_LOG_SCOPE.
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
-    #define ALPAKA_DEBUG_MINIMAL_LOG_SCOPE\
-        ::alpaka::core::detail::ScopeLogStdOut const scopeLogStdOut(__func__)
-#else
-    #define ALPAKA_DEBUG_MINIMAL_LOG_SCOPE
-#endif
-
-//-----------------------------------------------------------------------------
-// Define ALPAKA_DEBUG_FULL_LOG_SCOPE.
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-    #define ALPAKA_DEBUG_FULL_LOG_SCOPE\
-        ::alpaka::core::detail::ScopeLogStdOut const scopeLogStdOut(__func__)
-#else
-    #define ALPAKA_DEBUG_FULL_LOG_SCOPE
-#endif
-
-//-----------------------------------------------------------------------------
-// Define ALPAKA_DEBUG_BREAK.
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
-    #if BOOST_COMP_GNUC
-        #define ALPAKA_DEBUG_BREAK ::__builtin_trap()
-    #elif BOOST_COMP_INTEL
-        #define ALPAKA_DEBUG_BREAK ::__debugbreak()
-    #elif BOOST_COMP_MSVC
-        #define ALPAKA_DEBUG_BREAK ::__debugbreak()
-    #else
-        #define ALPAKA_DEBUG_BREAK
-        //#error debug-break for current compiler not implemented!
-    #endif
-#else
-    #define ALPAKA_DEBUG_BREAK
-#endif
diff --git a/thirdParty/alpaka/include/alpaka/core/Fibers.hpp b/thirdParty/alpaka/include/alpaka/core/Fibers.hpp
deleted file mode 100644
index a9a156b19c..0000000000
--- a/thirdParty/alpaka/include/alpaka/core/Fibers.hpp
+++ /dev/null
@@ -1,43 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if BOOST_COMP_MSVC
-    #pragma warning(push)
-
-    #pragma warning(disable: 4100)  // boost/context/detail/apply.hpp(31): warning C4100: "tpl": unreferenced formal parameter
-    #pragma warning(disable: 4245)  // boost/fiber/detail/futex.hpp(52): warning C4245: 'argument': conversion from 'int' to 'DWORD', signed/unsigned mismatch
-    #pragma warning(disable: 4324)  // boost/fiber/detail/context_mpsc_queue.hpp(41): warning C4324: 'boost::fibers::detail::context_mpsc_queue': structure was padded due to alignment specifier
-    #pragma warning(disable: 4456)  // boost/context/execution_context_v2.hpp(301): warning C4456: declaration of 'p' hides previous local declaration
-    #pragma warning(disable: 4702)  // boost/context/execution_context_v2.hpp(49): warning C4702: unreachable code
-    // Boost.Fiber indirectly includes windows.h for which we need to define some things.
-    #define NOMINMAX
-#endif
-
-// Boost fiber:
-// http://www.boost.org/doc/libs/develop/libs/fiber/doc/html/index.html
-// https://github.com/boostorg/fiber
-#include <boost/fiber/fiber.hpp>
-#include <boost/fiber/operations.hpp>
-#include <boost/fiber/condition_variable.hpp>
-#include <boost/fiber/mutex.hpp>
-#include <boost/fiber/future.hpp>
-#include <boost/fiber/barrier.hpp>
-
-#if BOOST_COMP_MSVC
-    #undef NOMINMAX
-    #pragma warning(pop)
-#endif
-
-#endif
diff --git a/thirdParty/alpaka/include/alpaka/core/Hip.hpp b/thirdParty/alpaka/include/alpaka/core/Hip.hpp
deleted file mode 100644
index 056f802c7d..0000000000
--- a/thirdParty/alpaka/include/alpaka/core/Hip.hpp
+++ /dev/null
@@ -1,725 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Matthias Werner, René Widera
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_HIP
-    #error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
-#endif
-
-#include <alpaka/elem/Traits.hpp>
-#include <alpaka/offset/Traits.hpp>
-#include <alpaka/extent/Traits.hpp>
-#include <alpaka/idx/Traits.hpp>
-#include <alpaka/vec/Vec.hpp>
-#include <alpaka/meta/IntegerSequence.hpp>
-#include <alpaka/meta/Metafunctions.hpp>
-
-#include <hip/hip_runtime.h>
-
-#include <array>
-#include <type_traits>
-#include <utility>
-#include <iostream>
-#include <string>
-#include <stdexcept>
-#include <cstddef>
-
-#ifdef __HIP_PLATFORM_HCC__
-  #define HIPRT_CB
-#endif
-
-
-namespace alpaka
-{
-    namespace hip
-    {
-        namespace detail
-        {
-            //-----------------------------------------------------------------------------
-            //! HIP runtime API error checking with log and exception, ignoring specific error values
-            ALPAKA_FN_HOST inline auto hipRtCheck(
-                hipError_t const & error,
-                char const * desc,
-                char const * file,
-                int const & line)
-            -> void
-            {
-                if(error != hipSuccess)
-                {
-                    std::string const sError(std::string(file) + "(" + std::to_string(line) + ") " + std::string(desc) + " : '" + hipGetErrorName(error) +  "': '" + std::string(hipGetErrorString(error)) + "'!");
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
-                    std::cerr << sError << std::endl;
-#endif
-                    ALPAKA_DEBUG_BREAK;
-                    throw std::runtime_error(sError);
-                }
-            }
-            //-----------------------------------------------------------------------------
-            //! HIP runtime API error checking with log and exception, ignoring specific error values
-            // NOTE: All ignored errors have to be convertible to hipError_t.
-            template<
-                typename... TErrors/*,
-                typename = typename std::enable_if<
-                    meta::Conjunction<
-                        std::true_type,
-                        std::is_convertible<
-                            TErrors,
-                            hipError_t
-                        >...
-                    >::value>::type*/>
-            ALPAKA_FN_HOST auto hipRtCheckIgnore(
-                hipError_t const & error,
-                char const * cmd,
-                char const * file,
-                int const & line,
-                TErrors && ... ignoredErrorCodes)
-            -> void
-            {
-                if(error != hipSuccess)
-                {
-                    // https://stackoverflow.com/questions/18792731/can-we-omit-the-double-braces-for-stdarray-in-c14/18792782#18792782
-                    std::array<hipError_t, sizeof...(ignoredErrorCodes)> const aIgnoredErrorCodes{{ignoredErrorCodes...}};
-                    // If the error code is not one of the ignored ones.
-                    if(std::find(aIgnoredErrorCodes.cbegin(), aIgnoredErrorCodes.cend(), error) == aIgnoredErrorCodes.cend())
-                    {
-                        hipRtCheck(error, ("'" + std::string(cmd) + "' returned error ").c_str(), file, line);
-                    }
-                }
-            }
-            //-----------------------------------------------------------------------------
-            //! HIP runtime API last error checking with log and exception.
-            ALPAKA_FN_HOST inline auto hipRtCheckLastError(
-                char const * desc,
-                char const * file,
-                int const & line)
-            -> void
-            {
-                hipError_t const error(hipGetLastError());
-                hipRtCheck(error, desc, file, line);
-            }
-        }
-    }
-}
-
-#if BOOST_COMP_MSVC
-    //-----------------------------------------------------------------------------
-    //! HIP runtime error checking with log and exception, ignoring specific error values
-    #define ALPAKA_HIP_RT_CHECK_IGNORE(cmd, ...)\
-        ::alpaka::hip::detail::hipRtCheckLastError("'" #cmd "' A previous HIP call (not this one) set the error ", __FILE__, __LINE__);\
-        ::alpaka::hip::detail::hipRtCheckIgnore(cmd, #cmd, __FILE__, __LINE__, __VA_ARGS__)
-#else
-    #if BOOST_COMP_CLANG
-        #pragma clang diagnostic push
-        #pragma clang diagnostic ignored "-Wgnu-zero-variadic-macro-arguments"
-    #endif
-    //-----------------------------------------------------------------------------
-    //! HIP runtime error checking with log and exception, ignoring specific error values
-    #define ALPAKA_HIP_RT_CHECK_IGNORE(cmd, ...)\
-        ::alpaka::hip::detail::hipRtCheckLastError("'" #cmd "' A previous HIP call (not this one) set the error ", __FILE__, __LINE__);\
-        ::alpaka::hip::detail::hipRtCheckIgnore(cmd, #cmd, __FILE__, __LINE__, ##__VA_ARGS__)
-    #if BOOST_COMP_CLANG
-        #pragma clang diagnostic pop
-    #endif
-#endif
-
-//-----------------------------------------------------------------------------
-//! HIP runtime error checking with log and exception.
-#define ALPAKA_HIP_RT_CHECK(cmd)\
-    ALPAKA_HIP_RT_CHECK_IGNORE(cmd)
-
-//-----------------------------------------------------------------------------
-// HIP vector_types.h trait specializations.
-namespace alpaka
-{
-    //-----------------------------------------------------------------------------
-    //! The HIP specifics.
-    namespace hip
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The HIP vectors 1D dimension get trait specialization.
-            template<
-                typename T>
-            struct IsHipBuiltInType :
-                std::integral_constant<
-                    bool,
-                    std::is_same<T, char1>::value
-                    || std::is_same<T, double1>::value
-                    || std::is_same<T, float1>::value
-                    || std::is_same<T, int1>::value
-                    || std::is_same<T, long1>::value
-                    || std::is_same<T, longlong1>::value
-                    || std::is_same<T, short1>::value
-                    || std::is_same<T, uchar1>::value
-                    || std::is_same<T, uint1>::value
-                    || std::is_same<T, ulong1>::value
-                    || std::is_same<T, ulonglong1>::value
-                    || std::is_same<T, ushort1>::value
-                    || std::is_same<T, char2>::value
-                    || std::is_same<T, double2>::value
-                    || std::is_same<T, float2>::value
-                    || std::is_same<T, int2>::value
-                    || std::is_same<T, long2>::value
-                    || std::is_same<T, longlong2>::value
-                    || std::is_same<T, short2>::value
-                    || std::is_same<T, uchar2>::value
-                    || std::is_same<T, uint2>::value
-                    || std::is_same<T, ulong2>::value
-                    || std::is_same<T, ulonglong2>::value
-                    || std::is_same<T, ushort2>::value
-                    || std::is_same<T, char3>::value
-                    || std::is_same<T, dim3>::value
-                    || std::is_same<T, double3>::value
-                    || std::is_same<T, float3>::value
-                    || std::is_same<T, int3>::value
-                    || std::is_same<T, long3>::value
-                    || std::is_same<T, longlong3>::value
-                    || std::is_same<T, short3>::value
-                    || std::is_same<T, uchar3>::value
-                    || std::is_same<T, uint3>::value
-                    || std::is_same<T, ulong3>::value
-                    || std::is_same<T, ulonglong3>::value
-                    || std::is_same<T, ushort3>::value
-                    || std::is_same<T, char4>::value
-                    || std::is_same<T, double4>::value
-                    || std::is_same<T, float4>::value
-                    || std::is_same<T, int4>::value
-                    || std::is_same<T, long4>::value
-                    || std::is_same<T, longlong4>::value
-                    || std::is_same<T, short4>::value
-                    || std::is_same<T, uchar4>::value
-                    || std::is_same<T, uint4>::value
-                    || std::is_same<T, ulong4>::value
-                    || std::is_same<T, ulonglong4>::value
-                    || std::is_same<T, ushort4>::value
-                >
-            {};
-        }
-    }
-    namespace dim
-    {
-        namespace traits
-        {
-          // If you receive '"alpaka::dim::traits::DimType" has already been defined'
-          // then too many operators in the enable_if are used. Split them in two or more structs.
-          // (compiler: gcc 5.3.0)
-            //#############################################################################
-            //! The HIP vectors 1D dimension get trait specialization.
-            template<
-                typename T>
-            struct DimType<
-                T,
-                typename std::enable_if<
-                    std::is_same<T, char1>::value
-                    || std::is_same<T, double1>::value
-                    || std::is_same<T, float1>::value
-                    || std::is_same<T, int1>::value
-                    || std::is_same<T, long1>::value
-                    || std::is_same<T, longlong1>::value
-                    || std::is_same<T, short1>::value
-                >::type>
-            {
-                using type = dim::DimInt<1u>;
-            };
-            template<
-                typename T>
-            struct DimType<
-                T,
-                typename std::enable_if<
-                    std::is_same<T, uchar1>::value
-                    || std::is_same<T, uint1>::value
-                    || std::is_same<T, ulong1>::value
-                    || std::is_same<T, ulonglong1>::value
-                    || std::is_same<T, ushort1>::value
-                >::type>
-            {
-                using type = dim::DimInt<1u>;
-            };
-            //#############################################################################
-            //! The HIP vectors 2D dimension get trait specialization.
-            template<
-                typename T>
-            struct DimType<
-                T,
-                typename std::enable_if<
-                    std::is_same<T, char2>::value
-                    || std::is_same<T, double2>::value
-                    || std::is_same<T, float2>::value
-                    || std::is_same<T, int2>::value
-                    || std::is_same<T, long2>::value
-                    || std::is_same<T, longlong2>::value
-                    || std::is_same<T, short2>::value
-                >::type>
-            {
-                using type = dim::DimInt<2u>;
-            };
-            template<
-                typename T>
-            struct DimType<
-                T,
-                typename std::enable_if<
-                    std::is_same<T, uchar2>::value
-                    || std::is_same<T, uint2>::value
-                    || std::is_same<T, ulong2>::value
-                    || std::is_same<T, ulonglong2>::value
-                    || std::is_same<T, ushort2>::value
-                >::type>
-            {
-                using type = dim::DimInt<2u>;
-            };
-            //#############################################################################
-            //! The HIP vectors 3D dimension get trait specialization.
-            template<
-                typename T>
-            struct DimType<
-                T,
-                typename std::enable_if<
-                    std::is_same<T, char3>::value
-                    || std::is_same<T, dim3>::value
-                    || std::is_same<T, double3>::value
-                    || std::is_same<T, float3>::value
-                    || std::is_same<T, int3>::value
-                    || std::is_same<T, long3>::value
-                    || std::is_same<T, longlong3>::value
-                    || std::is_same<T, short3>::value
-                >::type>
-            {
-                using type = dim::DimInt<3u>;
-            };
-            template<
-                typename T>
-            struct DimType<
-                T,
-                typename std::enable_if<
-                    std::is_same<T, uchar3>::value
-                    || std::is_same<T, uint3>::value
-                    || std::is_same<T, ulong3>::value
-                    || std::is_same<T, ulonglong3>::value
-                    || std::is_same<T, ushort3>::value
-                >::type>
-            {
-                using type = dim::DimInt<3u>;
-            };
-            //#############################################################################
-            //! The HIP vectors 4D dimension get trait specialization.
-            template<
-                typename T>
-            struct DimType<
-                T,
-                typename std::enable_if<
-                    std::is_same<T, char4>::value
-                    || std::is_same<T, double4>::value
-                    || std::is_same<T, float4>::value
-                    || std::is_same<T, int4>::value
-                    || std::is_same<T, long4>::value
-                    || std::is_same<T, longlong4>::value
-                    || std::is_same<T, short4>::value
-                >::type>
-            {
-                using type = dim::DimInt<4u>;
-            };
-            template<
-                typename T>
-            struct DimType<
-                T,
-                typename std::enable_if<
-                    std::is_same<T, uchar4>::value
-                    || std::is_same<T, uint4>::value
-                    || std::is_same<T, ulong4>::value
-                    || std::is_same<T, ulonglong4>::value
-                    || std::is_same<T, ushort4>::value
-                >::type>
-            {
-                using type = dim::DimInt<4u>;
-            };
-        }
-    }
-    namespace elem
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The HIP vectors elem type trait specialization.
-            template<
-                typename T>
-            struct ElemType<
-                T,
-                typename std::enable_if<
-                    hip::traits::IsHipBuiltInType<T>::value>::type>
-            {
-                using type = decltype(std::declval<T>().x);
-            };
-        }
-    }
-    namespace extent
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The HIP vectors extent get trait specialization.
-            template<
-                typename TExtent>
-            struct GetExtent<
-                dim::DimInt<dim::Dim<TExtent>::value - 1u>,
-                TExtent,
-                typename std::enable_if<
-                    hip::traits::IsHipBuiltInType<TExtent>::value
-                    && (dim::Dim<TExtent>::value >= 1)>::type>
-            {
-                ALPAKA_NO_HOST_ACC_WARNING
-                ALPAKA_FN_HOST_ACC static auto getExtent(
-                    TExtent const & extent)
-                -> decltype(extent.x)
-                {
-                    return extent.x;
-                }
-            };
-            //#############################################################################
-            //! The HIP vectors extent get trait specialization.
-            template<
-                typename TExtent>
-            struct GetExtent<
-                dim::DimInt<dim::Dim<TExtent>::value - 2u>,
-                TExtent,
-                typename std::enable_if<
-                    hip::traits::IsHipBuiltInType<TExtent>::value
-                    && (dim::Dim<TExtent>::value >= 2)>::type>
-            {
-                ALPAKA_NO_HOST_ACC_WARNING
-                ALPAKA_FN_HOST_ACC static auto getExtent(
-                    TExtent const & extent)
-                -> decltype(extent.y)
-                {
-                    return extent.y;
-                }
-            };
-            //#############################################################################
-            //! The HIP vectors extent get trait specialization.
-            template<
-                typename TExtent>
-            struct GetExtent<
-                dim::DimInt<dim::Dim<TExtent>::value - 3u>,
-                TExtent,
-                typename std::enable_if<
-                    hip::traits::IsHipBuiltInType<TExtent>::value
-                    && (dim::Dim<TExtent>::value >= 3)>::type>
-            {
-                ALPAKA_NO_HOST_ACC_WARNING
-                ALPAKA_FN_HOST_ACC static auto getExtent(
-                    TExtent const & extent)
-                -> decltype(extent.z)
-                {
-                    return extent.z;
-                }
-            };
-            //#############################################################################
-            //! The HIP vectors extent get trait specialization.
-            template<
-                typename TExtent>
-            struct GetExtent<
-                dim::DimInt<dim::Dim<TExtent>::value - 4u>,
-                TExtent,
-                typename std::enable_if<
-                    hip::traits::IsHipBuiltInType<TExtent>::value
-                    && (dim::Dim<TExtent>::value >= 4)>::type>
-            {
-                ALPAKA_NO_HOST_ACC_WARNING
-                ALPAKA_FN_HOST_ACC static auto getExtent(
-                    TExtent const & extent)
-                -> decltype(extent.w)
-                {
-                    return extent.w;
-                }
-            };
-            //#############################################################################
-            //! The HIP vectors extent set trait specialization.
-            template<
-                typename TExtent,
-                typename TExtentVal>
-            struct SetExtent<
-                dim::DimInt<dim::Dim<TExtent>::value - 1u>,
-                TExtent,
-                TExtentVal,
-                typename std::enable_if<
-                    hip::traits::IsHipBuiltInType<TExtent>::value
-                    && (dim::Dim<TExtent>::value >= 1)>::type>
-            {
-                ALPAKA_NO_HOST_ACC_WARNING
-                ALPAKA_FN_HOST_ACC static auto setExtent(
-                    TExtent const & extent,
-                    TExtentVal const & extentVal)
-                -> void
-                {
-                    extent.x = extentVal;
-                }
-            };
-            //#############################################################################
-            //! The HIP vectors extent set trait specialization.
-            template<
-                typename TExtent,
-                typename TExtentVal>
-            struct SetExtent<
-                dim::DimInt<dim::Dim<TExtent>::value - 2u>,
-                TExtent,
-                TExtentVal,
-                typename std::enable_if<
-                    hip::traits::IsHipBuiltInType<TExtent>::value
-                    && (dim::Dim<TExtent>::value >= 2)>::type>
-            {
-                ALPAKA_NO_HOST_ACC_WARNING
-                ALPAKA_FN_HOST_ACC static auto setExtent(
-                    TExtent const & extent,
-                    TExtentVal const & extentVal)
-                -> void
-                {
-                    extent.y = extentVal;
-                }
-            };
-            //#############################################################################
-            //! The HIP vectors extent set trait specialization.
-            template<
-                typename TExtent,
-                typename TExtentVal>
-            struct SetExtent<
-                dim::DimInt<dim::Dim<TExtent>::value - 3u>,
-                TExtent,
-                TExtentVal,
-                typename std::enable_if<
-                    hip::traits::IsHipBuiltInType<TExtent>::value
-                    && (dim::Dim<TExtent>::value >= 3)>::type>
-            {
-                ALPAKA_NO_HOST_ACC_WARNING
-                ALPAKA_FN_HOST_ACC static auto setExtent(
-                    TExtent const & extent,
-                    TExtentVal const & extentVal)
-                -> void
-                {
-                    extent.z = extentVal;
-                }
-            };
-            //#############################################################################
-            //! The HIP vectors extent set trait specialization.
-            template<
-                typename TExtent,
-                typename TExtentVal>
-            struct SetExtent<
-                dim::DimInt<dim::Dim<TExtent>::value - 4u>,
-                TExtent,
-                TExtentVal,
-                typename std::enable_if<
-                    hip::traits::IsHipBuiltInType<TExtent>::value
-                    && (dim::Dim<TExtent>::value >= 4)>::type>
-            {
-                ALPAKA_NO_HOST_ACC_WARNING
-                ALPAKA_FN_HOST_ACC static auto setExtent(
-                    TExtent const & extent,
-                    TExtentVal const & extentVal)
-                -> void
-                {
-                    extent.w = extentVal;
-                }
-            };
-        }
-    }
-    namespace offset
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The HIP vectors offset get trait specialization.
-            template<
-                typename TOffsets>
-            struct GetOffset<
-                dim::DimInt<dim::Dim<TOffsets>::value - 1u>,
-                TOffsets,
-                typename std::enable_if<
-                    hip::traits::IsHipBuiltInType<TOffsets>::value
-                    && (dim::Dim<TOffsets>::value >= 1)>::type>
-            {
-                ALPAKA_NO_HOST_ACC_WARNING
-                ALPAKA_FN_HOST_ACC static auto getOffset(
-                    TOffsets const & offsets)
-                -> decltype(offsets.x)
-                {
-                    return offsets.x;
-                }
-            };
-            //#############################################################################
-            //! The HIP vectors offset get trait specialization.
-            template<
-                typename TOffsets>
-            struct GetOffset<
-                dim::DimInt<dim::Dim<TOffsets>::value - 2u>,
-                TOffsets,
-                typename std::enable_if<
-                    hip::traits::IsHipBuiltInType<TOffsets>::value
-                    && (dim::Dim<TOffsets>::value >= 2)>::type>
-            {
-                ALPAKA_NO_HOST_ACC_WARNING
-                ALPAKA_FN_HOST_ACC static auto getOffset(
-                    TOffsets const & offsets)
-                -> decltype(offsets.y)
-                {
-                    return offsets.y;
-                }
-            };
-            //#############################################################################
-            //! The HIP vectors offset get trait specialization.
-            template<
-                typename TOffsets>
-            struct GetOffset<
-                dim::DimInt<dim::Dim<TOffsets>::value - 3u>,
-                TOffsets,
-                typename std::enable_if<
-                    hip::traits::IsHipBuiltInType<TOffsets>::value
-                    && (dim::Dim<TOffsets>::value >= 3)>::type>
-            {
-                ALPAKA_NO_HOST_ACC_WARNING
-                ALPAKA_FN_HOST_ACC static auto getOffset(
-                    TOffsets const & offsets)
-                -> decltype(offsets.z)
-                {
-                    return offsets.z;
-                }
-            };
-            //#############################################################################
-            //! The HIP vectors offset get trait specialization.
-            template<
-                typename TOffsets>
-            struct GetOffset<
-                dim::DimInt<dim::Dim<TOffsets>::value - 4u>,
-                TOffsets,
-                typename std::enable_if<
-                    hip::traits::IsHipBuiltInType<TOffsets>::value
-                    && (dim::Dim<TOffsets>::value >= 4)>::type>
-            {
-                ALPAKA_NO_HOST_ACC_WARNING
-                ALPAKA_FN_HOST_ACC static auto getOffset(
-                    TOffsets const & offsets)
-                -> decltype(offsets.w)
-                {
-                    return offsets.w;
-                }
-            };
-            //#############################################################################
-            //! The HIP vectors offset set trait specialization.
-            template<
-                typename TOffsets,
-                typename TOffset>
-            struct SetOffset<
-                dim::DimInt<dim::Dim<TOffsets>::value - 1u>,
-                TOffsets,
-                TOffset,
-                typename std::enable_if<
-                    hip::traits::IsHipBuiltInType<TOffsets>::value
-                    && (dim::Dim<TOffsets>::value >= 1)>::type>
-            {
-                ALPAKA_NO_HOST_ACC_WARNING
-                ALPAKA_FN_HOST_ACC static auto setOffset(
-                    TOffsets const & offsets,
-                    TOffset const & offset)
-                -> void
-                {
-                    offsets.x = offset;
-                }
-            };
-            //#############################################################################
-            //! The HIP vectors offset set trait specialization.
-            template<
-                typename TOffsets,
-                typename TOffset>
-            struct SetOffset<
-                dim::DimInt<dim::Dim<TOffsets>::value - 2u>,
-                TOffsets,
-                TOffset,
-                typename std::enable_if<
-                    hip::traits::IsHipBuiltInType<TOffsets>::value
-                    && (dim::Dim<TOffsets>::value >= 2)>::type>
-            {
-                ALPAKA_NO_HOST_ACC_WARNING
-                ALPAKA_FN_HOST_ACC static auto setOffset(
-                    TOffsets const & offsets,
-                    TOffset const & offset)
-                -> void
-                {
-                    offsets.y = offset;
-                }
-            };
-            //#############################################################################
-            //! The HIP vectors offset set trait specialization.
-            template<
-                typename TOffsets,
-                typename TOffset>
-            struct SetOffset<
-                dim::DimInt<dim::Dim<TOffsets>::value - 3u>,
-                TOffsets,
-                TOffset,
-                typename std::enable_if<
-                    hip::traits::IsHipBuiltInType<TOffsets>::value
-                    && (dim::Dim<TOffsets>::value >= 3)>::type>
-            {
-                ALPAKA_NO_HOST_ACC_WARNING
-                ALPAKA_FN_HOST_ACC static auto setOffset(
-                    TOffsets const & offsets,
-                    TOffset const & offset)
-                -> void
-                {
-                    offsets.z = offset;
-                }
-            };
-            //#############################################################################
-            //! The HIP vectors offset set trait specialization.
-            template<
-                typename TOffsets,
-                typename TOffset>
-            struct SetOffset<
-                dim::DimInt<dim::Dim<TOffsets>::value - 4u>,
-                TOffsets,
-                TOffset,
-                typename std::enable_if<
-                    hip::traits::IsHipBuiltInType<TOffsets>::value
-                    && (dim::Dim<TOffsets>::value >= 4)>::type>
-            {
-                ALPAKA_NO_HOST_ACC_WARNING
-                ALPAKA_FN_HOST_ACC static auto setOffset(
-                    TOffsets const & offsets,
-                    TOffset const & offset)
-                -> void
-                {
-                    offsets.w = offset;
-                }
-            };
-        }
-    }
-    namespace idx
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The HIP vectors idx type trait specialization.
-            template<
-                typename TIdx>
-            struct IdxType<
-                TIdx,
-                typename std::enable_if<
-                    hip::traits::IsHipBuiltInType<TIdx>::value>::type>
-            {
-                using type = std::size_t;
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/alpaka/include/alpaka/core/Positioning.hpp b/thirdParty/alpaka/include/alpaka/core/Positioning.hpp
deleted file mode 100644
index 4147f8ccb1..0000000000
--- a/thirdParty/alpaka/include/alpaka/core/Positioning.hpp
+++ /dev/null
@@ -1,55 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz, René Widera
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-namespace alpaka
-{
-    //#############################################################################
-    //! Defines the parallelism hierarchy levels of Alpaka
-    namespace hierarchy
-    {
-        struct Grids{};
-
-        struct Blocks{};
-
-        struct Threads{};
-    }
-    //-----------------------------------------------------------------------------
-    //! Defines the origins available for getting extent and indices of kernel executions.
-    namespace origin
-    {
-        //#############################################################################
-        //! This type is used to get the extents/indices relative to the grid.
-        struct Grid;
-        //#############################################################################
-        //! This type is used to get the extent/indices relative to a/the current block.
-        struct Block;
-        //#############################################################################
-        //! This type is used to get the extents relative to the thread.
-        struct Thread;
-    }
-    //-----------------------------------------------------------------------------
-    //! Defines the units available for getting extent and indices of kernel executions.
-    namespace unit
-    {
-        //#############################################################################
-        //! This type is used to get the extent/indices in units of blocks.
-        struct Blocks;
-        //#############################################################################
-        //! This type is used to get the extent/indices in units of threads.
-        struct Threads;
-        //#############################################################################
-        //! This type is used to get the extents/indices in units of elements.
-        struct Elems;
-    }
-
-    using namespace origin;
-    using namespace unit;
-}
diff --git a/thirdParty/alpaka/include/alpaka/core/Unroll.hpp b/thirdParty/alpaka/include/alpaka/core/Unroll.hpp
deleted file mode 100644
index 829be5ca7f..0000000000
--- a/thirdParty/alpaka/include/alpaka/core/Unroll.hpp
+++ /dev/null
@@ -1,37 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#include <alpaka/core/BoostPredef.hpp>
-
-//-----------------------------------------------------------------------------
-//! Suggests unrolling of the directly following loop to the compiler.
-//!
-//! Usage:
-//!  `ALPAKA_UNROLL
-//!  for(...){...}`
-// \TODO: Implement for other compilers.
-#if BOOST_ARCH_PTX
-    #if BOOST_COMP_MSVC
-        #define ALPAKA_UNROLL(...) __pragma(unroll __VA_ARGS__)
-    #else
-        #define ALPAKA_UNROLL_STRINGIFY(x) #x
-        #define ALPAKA_UNROLL(...)  _Pragma(ALPAKA_UNROLL_STRINGIFY(unroll __VA_ARGS__))
-    #endif
-#else
-    #if BOOST_COMP_INTEL || BOOST_COMP_IBM || BOOST_COMP_SUNPRO || BOOST_COMP_HPACC
-        #define ALPAKA_UNROLL_STRINGIFY(x) #x
-        #define ALPAKA_UNROLL(...)  _Pragma(ALPAKA_UNROLL_STRINGIFY(unroll(__VA_ARGS__)))
-    #elif BOOST_COMP_PGI
-        #define ALPAKA_UNROLL(...)  _Pragma("unroll")
-    #else
-        #define ALPAKA_UNROLL(...)
-    #endif
-#endif
diff --git a/thirdParty/alpaka/include/alpaka/core/Unused.hpp b/thirdParty/alpaka/include/alpaka/core/Unused.hpp
deleted file mode 100644
index 22fb5495d6..0000000000
--- a/thirdParty/alpaka/include/alpaka/core/Unused.hpp
+++ /dev/null
@@ -1,37 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#include <alpaka/core/Common.hpp>
-
-#include <boost/config.hpp>
-
-namespace alpaka
-{
-    ALPAKA_NO_HOST_ACC_WARNING
-    template< typename... Ts >
-    BOOST_FORCEINLINE
-    BOOST_CXX14_CONSTEXPR
-    ALPAKA_FN_HOST_ACC
-    void
-    ignore_unused( Ts const& ... )
-    {}
-
-    ALPAKA_NO_HOST_ACC_WARNING
-    template< typename... Ts >
-    BOOST_FORCEINLINE
-    BOOST_CXX14_CONSTEXPR
-    ALPAKA_FN_HOST_ACC
-    void
-    ignore_unused()
-    {}
-
-} // namespace alpaka
-
diff --git a/thirdParty/alpaka/include/alpaka/core/Utility.hpp b/thirdParty/alpaka/include/alpaka/core/Utility.hpp
deleted file mode 100644
index e312fd9163..0000000000
--- a/thirdParty/alpaka/include/alpaka/core/Utility.hpp
+++ /dev/null
@@ -1,39 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz, René Widera
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-#pragma once
-
-#include <alpaka/core/Common.hpp>
-
-#if BOOST_LANG_CUDA && BOOST_COMP_CLANG_CUDA || BOOST_COMP_HIP
-#   include <type_traits>
-#else
-#   include <utility>
-#endif
-
-namespace alpaka
-{
-    namespace core
-    {
-        //-----------------------------------------------------------------------------
-        //! convert any type to a reverence type
-        //
-        // This function is equivalent to std::declval() but can be used
-        // within an alpaka accelerator kernel too.
-        // This function can be used only within std::decltype().
-        //-----------------------------------------------------------------------------
-#if BOOST_LANG_CUDA && BOOST_COMP_CLANG_CUDA || BOOST_COMP_HIP
-        template< class T >
-        ALPAKA_FN_HOST_ACC
-        typename std::add_rvalue_reference<T>::type
-        declval();
-#else
-        using std::declval;
-#endif
-    }
-}
diff --git a/thirdParty/alpaka/include/alpaka/core/Vectorize.hpp b/thirdParty/alpaka/include/alpaka/core/Vectorize.hpp
deleted file mode 100644
index a90ef0a4f5..0000000000
--- a/thirdParty/alpaka/include/alpaka/core/Vectorize.hpp
+++ /dev/null
@@ -1,384 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#include <alpaka/core/Common.hpp>
-
-#include <cstddef>
-#include <cstdint>
-
-//-----------------------------------------------------------------------------
-//! Suggests vectorization of the directly following loop to the compiler.
-//!
-//! Usage:
-//!  `ALPAKA_VECTORIZE_HINT
-//!  for(...){...}`
-// \TODO: Implement for other compilers.
-// See: http://stackoverflow.com/questions/2706286/pragmas-swp-ivdep-prefetch-support-in-various-compilers
-/*#if BOOST_COMP_INTEL || BOOST_COMP_HPACC
-    #define ALPAKA_VECTORIZE_HINT(...)  _Pragma("ivdep")
-#elif BOOST_COMP_PGI
-    #define ALPAKA_VECTORIZE_HINT(...)  _Pragma("vector")
-#elif BOOST_COMP_MSVC
-    #define ALPAKA_VECTORIZE_HINT(...)  __pragma(loop(ivdep))
-#elif BOOST_COMP_GNUC
-    #define ALPAKA_VECTORIZE_HINT(...)  _Pragma("GCC ivdep")
-#else
-    #define ALPAKA_VECTORIZE_HINT(...)
-#endif*/
-
-namespace alpaka
-{
-    namespace core
-    {
-        namespace vectorization
-        {
-            //-----------------------------------------------------------------------------
-            // The alignment required to enable optimal performance dependant on the target architecture.
-            constexpr std::size_t defaultAlignment =
-#if defined(__AVX512BW__) || defined(__AVX512F__) || defined(__MIC__)
-                64u
-#elif defined(__AVX__) || defined(__AVX2__)
-                32u
-#else
-                16u
-#endif
-            ;
-
-            //-----------------------------------------------------------------------------
-            // Number of elements of the given type that can be processed in parallel in a vector register.
-            // By default there is no vectorization.
-            template<
-                typename TElem>
-            struct GetVectorizationSizeElems
-            {
-                static constexpr std::size_t value = 1u;
-            };
-
-            //-----------------------------------------------------------------------------
-            // Number of elements of the given type that can be processed in parallel in a vector register.
-            template<>
-            struct GetVectorizationSizeElems<
-                double>
-            {
-                static constexpr std::size_t value =
-#if defined(__AVX512F__) || defined(__MIC__)
-                // addition (AVX512F,KNC): vaddpd / _mm512_add_pd
-                // subtraction (AVX512F,KNC): vsubpd / _mm512_sub_pd
-                // multiplication (AVX512F,KNC): vmulpd / _mm512_mul_pd
-                8u;
-#elif defined(__AVX__)
-                // addition (AVX): vaddpd / _mm256_add_pd
-                // subtraction (AVX): vsubpd / _mm256_sub_pd
-                // multiplication (AVX): vmulpd / _mm256_mul_pd
-                4u;
-#elif defined(__SSE2__)
-                // addition (SSE2): addpd / _mm_add_pd
-                // subtraction (SSE2): subpd / _mm_sub_pd
-                // multiplication (SSE2): mulpd / _mm_mul_pd
-                2u;
-#elif defined(__ARM_NEON__)
-                // No support for double precision vectorization!
-                1u;
-#elif defined(__ALTIVEC__)
-                2u;
-#else
-                1u;
-#endif
-            };
-            //-----------------------------------------------------------------------------
-            // Number of elements of the given type that can be processed in parallel in a vector register.
-            template<>
-            struct GetVectorizationSizeElems<
-                float>
-            {
-                static constexpr std::size_t value =
-#if defined(__AVX512F__) || defined(__MIC__)
-                // addition (AVX512F,KNC): vaddps / _mm512_add_ps
-                // subtraction (AVX512F,KNC): vsubps / _mm512_sub_ps
-                // multiplication (AVX512F,KNC): vmulps / _mm512_mul_ps
-                16u;
-#elif defined(__AVX__)
-                // addition (AVX): vaddps / _mm256_add_ps
-                // subtraction (AVX): vsubps / _mm256_sub_ps
-                // multiplication (AVX): vmulps / _mm256_mul_ps
-                8u;
-#elif defined(__SSE__)
-                // addition (SSE): addps / _mm_add_ps
-                // subtraction (SSE): subps / _mm_sub_ps
-                // multiplication (SSE): mulps / _mm_mul_ps
-                4u;
-#elif defined(__ARM_NEON__)
-                4u;
-#elif defined(__ALTIVEC__)
-                4u;
-#else
-                1u;
-#endif
-            };
-            //-----------------------------------------------------------------------------
-            // Number of elements of the given type that can be processed in parallel in a vector register.
-            template<>
-            struct GetVectorizationSizeElems<
-                std::int8_t>
-            {
-                static constexpr std::size_t value =
-#if defined(__AVX512BW__)
-                // addition (AVX512BW): vpaddb / _mm512_mask_add_epi8
-                // subtraction (AVX512BW): vpsubb / _mm512_sub_epi8
-                // multiplication: -
-                64u;
-#elif defined(__AVX2__)
-                // addition (AVX2): vpaddb / _mm256_add_epi8
-                // subtraction (AVX2): vpsubb / _mm256_sub_epi8
-                // multiplication: -
-                32u;
-#elif defined(__SSE2__)
-                // addition (SSE2): paddb / _mm_add_epi8
-                // subtraction (SSE2): psubb / _mm_sub_epi8
-                // multiplication: -
-                16u;
-#elif defined(__ARM_NEON__)
-                16u;
-#elif defined(__ALTIVEC__)
-                16u;
-#elif defined(__CUDA_ARCH__)
-                // addition: __vadd4
-                // subtraction: __vsub4
-                // multiplication: -
-                4u;
-#else
-                1u;
-#endif
-            };
-            //-----------------------------------------------------------------------------
-            // Number of elements of the given type that can be processed in parallel in a vector register.
-            template<>
-            struct GetVectorizationSizeElems<
-                std::uint8_t>
-            {
-                static constexpr std::size_t value =
-#if defined(__AVX512BW__)
-                // addition (AVX512BW): vpaddb / _mm512_mask_add_epi8
-                // subtraction (AVX512BW): vpsubb / _mm512_sub_epi8
-                // multiplication: -
-                64u;
-#elif defined(__AVX2__)
-                // addition (AVX2): vpaddb / _mm256_add_epi8
-                // subtraction (AVX2): vpsubb / _mm256_sub_epi8
-                // multiplication: -
-                32u;
-#elif defined(__SSE2__)
-                // addition (SSE2): paddb / _mm_add_epi8
-                // subtraction (SSE2): psubb / _mm_sub_epi8
-                // multiplication: -
-                16u;
-#elif defined(__ARM_NEON__)
-                16u;
-#elif defined(__ALTIVEC__)
-                16u;
-#elif defined(__CUDA_ARCH__)
-                // addition: __vadd4
-                // subtraction: __vsub4
-                // multiplication: -
-                4u;
-#else
-                1u;
-#endif
-            };
-            //-----------------------------------------------------------------------------
-            // Number of elements of the given type that can be processed in parallel in a vector register.
-            template<>
-            struct GetVectorizationSizeElems<
-                std::int16_t>
-            {
-                static constexpr std::size_t value =
-#if defined(__AVX512BW__)
-                // addition (AVX512BW): vpaddw / _mm512_mask_add_epi16
-                // subtraction (AVX512BW): vpsubw / _mm512_mask_sub_epi16
-                // multiplication (AVX512BW): vpmullw / _mm512_mask_mullo_epi16
-                32u;
-#elif defined(__AVX2__)
-                // addition (AVX2): vpaddw / _mm256_add_epi16
-                // subtraction (AVX2): vpsubw / _mm256_sub_epi16
-                // multiplication (AVX2): vpmullw / _mm256_mullo_epi16
-                16u;
-#elif defined(__SSE2__)
-                // addition (SSE2): paddw / _mm_add_epi16
-                // subtraction (SSE2): psubw / _mm_sub_epi16
-                // multiplication (SSE2): pmullw / _mm_mullo_epi16
-                8u;
-#elif defined(__ARM_NEON__)
-                8u;
-#elif defined(__ALTIVEC__)
-                8u;
-#elif defined(__CUDA_ARCH__)
-                // addition: __vadd2
-                // subtraction: __vsub2
-                // multiplication: -
-                2u;
-#else
-                1u;
-#endif
-            };
-            //-----------------------------------------------------------------------------
-            // Number of elements of the given type that can be processed in parallel in a vector register.
-            template<>
-            struct GetVectorizationSizeElems<
-                std::uint16_t>
-            {
-                static constexpr std::size_t value =
-#if defined(__AVX512BW__)
-                // addition (AVX512BW): vpaddusw / _mm512_mask_adds_epu16
-                // subtraction (AVX512BW): vpsubw / _mm512_subs_epu16
-                // multiplication: ?
-                32u;
-#elif defined(__AVX2__)
-                // addition (AVX2): vpaddusw / _mm256_adds_epu16
-                // subtraction (AVX2): vpsubusw / _mm256_subs_epu16
-                // multiplication: ?
-                16u;
-#elif defined(__SSE2__)
-                // addition (SSE2): paddusw / _mm_adds_epu16
-                // subtraction (SSE2): psubusw / _mm_subs_epu16
-                // multiplication: ?
-                8u;
-#elif defined(__ARM_NEON__)
-                8u;
-#elif defined(__ALTIVEC__)
-                8u;
-#elif defined(__CUDA_ARCH__)
-                // addition: __vadd2
-                // subtraction: __vsub2
-                // multiplication: -
-                2u;
-#else
-                1u;
-#endif
-            };
-            //-----------------------------------------------------------------------------
-            // Number of elements of the given type that can be processed in parallel in a vector register.
-            template<>
-            struct GetVectorizationSizeElems<
-                std::int32_t>
-            {
-                static constexpr std::size_t value =
-#if defined(__AVX512F__) || defined(__MIC__)
-                // addition (AVX512F,KNC): vpaddd / _mm512_mask_add_epi32
-                // subtraction (AVX512F,KNC): vpsubd / _mm512_mask_sub_epi32
-                // multiplication (AVX512F,KNC): vpmulld / _mm512_mask_mullo_epi32
-                16u;
-#elif defined(__AVX2__)
-                // addition (AVX2): vpaddd / _mm256_add_epi32
-                // subtraction (AVX2): vpsubd / _mm256_sub_epi32
-                // multiplication (AVX2): vpmulld / _mm256_mullo_epi32
-                8u;
-#elif defined(__SSE2__)
-                // addition (SSE2): paddd / _mm_add_epi32
-                // subtraction (SSE2): psubd / _mm_sub_epi32
-                // multiplication (SSE4.1): pmulld / _mm_mullo_epi32
-                4u;
-#elif defined(__ARM_NEON__)
-                4u;
-#elif defined(__ALTIVEC__)
-                4u;
-#else
-                1u;
-#endif
-            };
-            //-----------------------------------------------------------------------------
-            // Number of elements of the given type that can be processed in parallel in a vector register.
-            template<>
-            struct GetVectorizationSizeElems<
-                std::uint32_t>
-            {
-                static constexpr std::size_t value =
-#if defined(__AVX512F__) || defined(__MIC__)
-                // addition (AVX512F,KNC): vpaddd / _mm512_mask_add_epi32
-                // subtraction (AVX512F,KNC): vpsubd / _mm512_mask_sub_epi32
-                // multiplication: ?
-                16u;
-#elif defined(__AVX2__)
-                // addition (AVX2): vpaddd / _mm256_add_epi32
-                // subtraction (AVX2): vpsubd / _mm256_sub_epi32
-                // multiplication: ?
-                8u;
-#elif defined(__SSE2__)
-                // addition (SSE2): paddd / _mm_add_epi32
-                // subtraction (SSE2): psubd / _mm_sub_epi32
-                // multiplication: ?
-                4u;
-#elif defined(__ARM_NEON__)
-                4u;
-#elif defined(__ALTIVEC__)
-                4u;
-#else
-                1u;
-#endif
-            };
-            //-----------------------------------------------------------------------------
-            // Number of elements of the given type that can be processed in parallel in a vector register.
-            template<>
-            struct GetVectorizationSizeElems<
-                std::int64_t>
-            {
-                static constexpr std::size_t value =
-#if defined(__AVX512F__)
-                // addition (AVX512F): vpaddq / _mm512_mask_add_epi64
-                // subtraction (AVX512F): vpsubq / _mm512_mask_sub_epi64
-                // multiplication (AVX512DQ): vpmullq / _mm512_mask_mullo_epi64
-                8u;
-#elif defined(__AVX2__)
-                // addition (AVX2): vpaddq / _mm256_add_epi64
-                // subtraction (AVX2): vpsubq / _mm256_sub_epi64
-                // multiplication: -
-                4u;
-#elif defined(__SSE2__)
-                // addition (SSE2): paddq / _mm_add_epi64
-                // subtraction (SSE2): psubq / _mm_sub_epi64
-                // multiplication: -
-                2u;
-#elif defined(__ARM_NEON__)
-                2u;
-#else
-                1u;
-#endif
-            };
-            //-----------------------------------------------------------------------------
-            // Number of elements of the given type that can be processed in parallel in a vector register.
-            template<>
-            struct GetVectorizationSizeElems<
-                std::uint64_t>
-            {
-                static constexpr std::size_t value =
-#if defined(__AVX512F__)
-                // addition (AVX512F): vpaddq / _mm512_mask_add_epi64
-                // subtraction (AVX512F): vpsubq / _mm512_mask_sub_epi64
-                // multiplication: ?
-                8u;
-#elif defined(__AVX2__)
-                // addition (AVX2): vpaddq / _mm256_add_epi64
-                // subtraction (AVX2): vpsubq / _mm256_sub_epi64
-                // multiplication: ?
-                4u;
-#elif defined(__SSE2__)
-                // addition (SSE2): paddq / _mm_add_epi64
-                // subtraction (SSE2): psubq / _mm_sub_epi64
-                // multiplication: ?
-                2u;
-#elif defined(__ARM_NEON__)
-                2u;
-#else
-                1u;
-#endif
-            };
-        }
-    }
-}
diff --git a/thirdParty/alpaka/include/alpaka/dev/DevCpu.hpp b/thirdParty/alpaka/include/alpaka/dev/DevCpu.hpp
deleted file mode 100644
index 9fa0b6650e..0000000000
--- a/thirdParty/alpaka/include/alpaka/dev/DevCpu.hpp
+++ /dev/null
@@ -1,315 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Matthias Werner
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#include <alpaka/dev/Traits.hpp>
-#include <alpaka/mem/buf/Traits.hpp>
-#include <alpaka/pltf/Traits.hpp>
-#include <alpaka/wait/Traits.hpp>
-
-#include <alpaka/queue/cpu/ICpuQueue.hpp>
-#include <alpaka/core/Unused.hpp>
-#include <alpaka/dev/cpu/SysInfo.hpp>
-
-#include <alpaka/queue/Traits.hpp>
-#include <alpaka/queue/Properties.hpp>
-
-#include <map>
-#include <mutex>
-#include <memory>
-#include <vector>
-#include <algorithm>
-
-namespace alpaka
-{
-    namespace queue
-    {
-        class QueueCpuNonBlocking;
-        class QueueCpuBlocking;
-
-        namespace cpu
-        {
-            namespace detail
-            {
-                class QueueCpuNonBlockingImpl;
-                class QueueCpuBlockingImpl;
-            }
-        }
-    }
-    namespace pltf
-    {
-        namespace traits
-        {
-            template<
-                typename TPltf,
-                typename TSfinae>
-            struct GetDevByIdx;
-        }
-        class PltfCpu;
-    }
-    namespace dev
-    {
-        //-----------------------------------------------------------------------------
-        //! The CPU device.
-        namespace cpu
-        {
-            namespace detail
-            {
-                //#############################################################################
-                //! The CPU device implementation.
-                class DevCpuImpl
-                {
-                private:
-
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST auto GetAllQueueImpls(
-                        std::vector<std::weak_ptr<queue::cpu::ICpuQueue>> & queues) const
-                    -> std::vector<std::shared_ptr<queue::cpu::ICpuQueue>>
-                    {
-                        std::vector<std::shared_ptr<queue::cpu::ICpuQueue>> vspQueues;
-
-                        std::lock_guard<std::mutex> lk(m_Mutex);
-
-                        for(auto it = queues.begin(); it != queues.end();)
-                        {
-                            auto spQueue(it->lock());
-                            if(spQueue)
-                            {
-                                vspQueues.emplace_back(std::move(spQueue));
-                                ++it;
-                            }
-                            else
-                            {
-                                it = queues.erase(it);
-                            }
-                        }
-                        return vspQueues;
-                    }
-
-                public:
-                    //-----------------------------------------------------------------------------
-                    DevCpuImpl() = default;
-                    //-----------------------------------------------------------------------------
-                    DevCpuImpl(DevCpuImpl const &) = delete;
-                    //-----------------------------------------------------------------------------
-                    DevCpuImpl(DevCpuImpl &&) = delete;
-                    //-----------------------------------------------------------------------------
-                    auto operator=(DevCpuImpl const &) -> DevCpuImpl & = delete;
-                    //-----------------------------------------------------------------------------
-                    auto operator=(DevCpuImpl &&) -> DevCpuImpl & = delete;
-                    //-----------------------------------------------------------------------------
-                    ~DevCpuImpl() = default;
-
-                    ALPAKA_FN_HOST auto GetAllQueues() const
-                    -> std::vector<std::shared_ptr<queue::cpu::ICpuQueue>>
-                    {
-                        return GetAllQueueImpls(m_queues);
-                    }
-
-                    //-----------------------------------------------------------------------------
-                    //! Registers the given queue on this device.
-                    //! NOTE: Every queue has to be registered for correct functionality of device wait operations!
-                    ALPAKA_FN_HOST auto RegisterQueue(std::shared_ptr<queue::cpu::ICpuQueue> spQueue)
-                    -> void
-                    {
-                        std::lock_guard<std::mutex> lk(m_Mutex);
-
-                        // Register this queue on the device.
-                        m_queues.push_back(spQueue);
-                    }
-
-                private:
-                    std::mutex mutable m_Mutex;
-                    std::vector<std::weak_ptr<queue::cpu::ICpuQueue>> mutable m_queues;
-                };
-            }
-        }
-
-        //#############################################################################
-        //! The CPU device handle.
-        class DevCpu : public concepts::Implements<wait::ConceptCurrentThreadWaitFor, DevCpu>
-        {
-            friend struct pltf::traits::GetDevByIdx<pltf::PltfCpu>;
-        protected:
-            //-----------------------------------------------------------------------------
-            DevCpu() :
-                m_spDevCpuImpl(std::make_shared<cpu::detail::DevCpuImpl>())
-            {}
-        public:
-            //-----------------------------------------------------------------------------
-            DevCpu(DevCpu const &) = default;
-            //-----------------------------------------------------------------------------
-            DevCpu(DevCpu &&) = default;
-            //-----------------------------------------------------------------------------
-            auto operator=(DevCpu const &) -> DevCpu & = default;
-            //-----------------------------------------------------------------------------
-            auto operator=(DevCpu &&) -> DevCpu & = default;
-            //-----------------------------------------------------------------------------
-            auto operator==(DevCpu const &) const
-            -> bool
-            {
-                return true;
-            }
-            //-----------------------------------------------------------------------------
-            auto operator!=(DevCpu const & rhs) const
-            -> bool
-            {
-                return !((*this) == rhs);
-            }
-            //-----------------------------------------------------------------------------
-            ~DevCpu() = default;
-
-        public:
-            std::shared_ptr<cpu::detail::DevCpuImpl> m_spDevCpuImpl;
-        };
-    }
-
-    namespace dev
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CPU device name get trait specialization.
-            template<>
-            struct GetName<
-                dev::DevCpu>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto getName(
-                    dev::DevCpu const & dev)
-                -> std::string
-                {
-                    alpaka::ignore_unused(dev);
-
-                    return dev::cpu::detail::getCpuName();
-                }
-            };
-
-            //#############################################################################
-            //! The CPU device available memory get trait specialization.
-            template<>
-            struct GetMemBytes<
-                dev::DevCpu>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto getMemBytes(
-                    dev::DevCpu const & dev)
-                -> std::size_t
-                {
-                    alpaka::ignore_unused(dev);
-
-                    return dev::cpu::detail::getTotalGlobalMemSizeBytes();
-                }
-            };
-
-            //#############################################################################
-            //! The CPU device free memory get trait specialization.
-            template<>
-            struct GetFreeMemBytes<
-                dev::DevCpu>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto getFreeMemBytes(
-                    dev::DevCpu const & dev)
-                -> std::size_t
-                {
-                    alpaka::ignore_unused(dev);
-
-                    return dev::cpu::detail::getFreeGlobalMemSizeBytes();
-                }
-            };
-
-            //#############################################################################
-            //! The CPU device reset trait specialization.
-            template<>
-            struct Reset<
-                dev::DevCpu>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto reset(
-                    dev::DevCpu const & dev)
-                -> void
-                {
-                    ALPAKA_DEBUG_FULL_LOG_SCOPE;
-
-                    alpaka::ignore_unused(dev);
-
-                    // The CPU does nothing on reset.
-                }
-            };
-        }
-    }
-    namespace mem
-    {
-        namespace buf
-        {
-            template<
-                typename TElem,
-                typename TDim,
-                typename TIdx>
-            class BufCpu;
-
-            namespace traits
-            {
-                //#############################################################################
-                //! The CPU device memory buffer type trait specialization.
-                template<
-                    typename TElem,
-                    typename TDim,
-                    typename TIdx>
-                struct BufType<
-                    dev::DevCpu,
-                    TElem,
-                    TDim,
-                    TIdx>
-                {
-                    using type = mem::buf::BufCpu<TElem, TDim, TIdx>;
-                };
-            }
-        }
-    }
-    namespace pltf
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CPU device platform type trait specialization.
-            template<>
-            struct PltfType<
-                dev::DevCpu>
-            {
-                using type = pltf::PltfCpu;
-            };
-        }
-    }
-    namespace queue
-    {
-        namespace traits
-        {
-            template<>
-            struct QueueType<
-                dev::DevCpu,
-                queue::Blocking
-            >
-            {
-                using type = queue::QueueCpuBlocking;
-            };
-
-            template<>
-            struct QueueType<
-                dev::DevCpu,
-                queue::NonBlocking
-            >
-            {
-                using type = queue::QueueCpuNonBlocking;
-            };
-        }
-    }
-}
diff --git a/thirdParty/alpaka/include/alpaka/dev/DevCudaRt.hpp b/thirdParty/alpaka/include/alpaka/dev/DevCudaRt.hpp
deleted file mode 100644
index 8fd7322845..0000000000
--- a/thirdParty/alpaka/include/alpaka/dev/DevCudaRt.hpp
+++ /dev/null
@@ -1,291 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_CUDA
-    #error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
-#endif
-
-#include <alpaka/dev/Traits.hpp>
-#include <alpaka/mem/buf/Traits.hpp>
-#include <alpaka/pltf/Traits.hpp>
-#include <alpaka/wait/Traits.hpp>
-
-#include <alpaka/queue/Traits.hpp>
-#include <alpaka/queue/Properties.hpp>
-
-#include <alpaka/core/Cuda.hpp>
-
-namespace alpaka
-{
-    namespace pltf
-    {
-        namespace traits
-        {
-            template<
-                typename TPltf,
-                typename TSfinae>
-            struct GetDevByIdx;
-        }
-        class PltfCudaRt;
-    }
-
-    namespace queue
-    {
-        class QueueCudaRtBlocking;
-        class QueueCudaRtNonBlocking;
-    }
-
-    namespace dev
-    {
-        //#############################################################################
-        //! The CUDA RT device handle.
-        class DevCudaRt : public concepts::Implements<wait::ConceptCurrentThreadWaitFor, DevCudaRt>
-        {
-            friend struct pltf::traits::GetDevByIdx<pltf::PltfCudaRt>;
-
-        protected:
-            //-----------------------------------------------------------------------------
-            DevCudaRt() = default;
-        public:
-            //-----------------------------------------------------------------------------
-            DevCudaRt(DevCudaRt const &) = default;
-            //-----------------------------------------------------------------------------
-            DevCudaRt(DevCudaRt &&) = default;
-            //-----------------------------------------------------------------------------
-            auto operator=(DevCudaRt const &) -> DevCudaRt & = default;
-            //-----------------------------------------------------------------------------
-            auto operator=(DevCudaRt &&) -> DevCudaRt & = default;
-            //-----------------------------------------------------------------------------
-            ALPAKA_FN_HOST auto operator==(DevCudaRt const & rhs) const
-            -> bool
-            {
-                return m_iDevice == rhs.m_iDevice;
-            }
-            //-----------------------------------------------------------------------------
-            ALPAKA_FN_HOST auto operator!=(DevCudaRt const & rhs) const
-            -> bool
-            {
-                return !((*this) == rhs);
-            }
-            //-----------------------------------------------------------------------------
-            ~DevCudaRt() = default;
-
-        public:
-            int m_iDevice;
-        };
-    }
-
-    namespace dev
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CUDA RT device name get trait specialization.
-            template<>
-            struct GetName<
-                dev::DevCudaRt>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto getName(
-                    dev::DevCudaRt const & dev)
-                -> std::string
-                {
-                    // There is cudaDeviceGetAttribute as faster alternative to cudaGetDeviceProperties to get a single device property but it has no option to get the name
-                    cudaDeviceProp cudaDevProp;
-                    ALPAKA_CUDA_RT_CHECK(
-                        cudaGetDeviceProperties(
-                            &cudaDevProp,
-                            dev.m_iDevice));
-
-                    return std::string(cudaDevProp.name);
-                }
-            };
-
-            //#############################################################################
-            //! The CUDA RT device available memory get trait specialization.
-            template<>
-            struct GetMemBytes<
-                dev::DevCudaRt>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto getMemBytes(
-                    dev::DevCudaRt const & dev)
-                -> std::size_t
-                {
-                    // Set the current device to wait for.
-                    ALPAKA_CUDA_RT_CHECK(
-                        cudaSetDevice(
-                            dev.m_iDevice));
-
-                    std::size_t freeInternal(0u);
-                    std::size_t totalInternal(0u);
-
-                    ALPAKA_CUDA_RT_CHECK(
-                        cudaMemGetInfo(
-                            &freeInternal,
-                            &totalInternal));
-
-                    return totalInternal;
-                }
-            };
-
-            //#############################################################################
-            //! The CUDA RT device free memory get trait specialization.
-            template<>
-            struct GetFreeMemBytes<
-                dev::DevCudaRt>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto getFreeMemBytes(
-                    dev::DevCudaRt const & dev)
-                -> std::size_t
-                {
-                    // Set the current device to wait for.
-                    ALPAKA_CUDA_RT_CHECK(
-                        cudaSetDevice(
-                            dev.m_iDevice));
-
-                    std::size_t freeInternal(0u);
-                    std::size_t totalInternal(0u);
-
-                    ALPAKA_CUDA_RT_CHECK(
-                        cudaMemGetInfo(
-                            &freeInternal,
-                            &totalInternal));
-
-                    return freeInternal;
-                }
-            };
-
-            //#############################################################################
-            //! The CUDA RT device reset trait specialization.
-            template<>
-            struct Reset<
-                dev::DevCudaRt>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto reset(
-                    dev::DevCudaRt const & dev)
-                -> void
-                {
-                    ALPAKA_DEBUG_FULL_LOG_SCOPE;
-
-                    // Set the current device to wait for.
-                    ALPAKA_CUDA_RT_CHECK(
-                        cudaSetDevice(
-                            dev.m_iDevice));
-                    ALPAKA_CUDA_RT_CHECK(
-                        cudaDeviceReset());
-                }
-            };
-        }
-    }
-    namespace mem
-    {
-        namespace buf
-        {
-            template<
-                typename TElem,
-                typename TDim,
-                typename TIdx>
-            class BufCudaRt;
-
-            namespace traits
-            {
-                //#############################################################################
-                //! The CUDA RT device memory buffer type trait specialization.
-                template<
-                    typename TElem,
-                    typename TDim,
-                    typename TIdx>
-                struct BufType<
-                    dev::DevCudaRt,
-                    TElem,
-                    TDim,
-                    TIdx>
-                {
-                    using type = mem::buf::BufCudaRt<TElem, TDim, TIdx>;
-                };
-            }
-        }
-    }
-    namespace pltf
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CUDA RT device platform type trait specialization.
-            template<>
-            struct PltfType<
-                dev::DevCudaRt>
-            {
-                using type = pltf::PltfCudaRt;
-            };
-        }
-    }
-    namespace wait
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The thread CUDA device wait specialization.
-            //!
-            //! Blocks until the device has completed all preceding requested tasks.
-            //! Tasks that are enqueued or queues that are created after this call is made are not waited for.
-            template<>
-            struct CurrentThreadWaitFor<
-                dev::DevCudaRt>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto currentThreadWaitFor(
-                    dev::DevCudaRt const & dev)
-                -> void
-                {
-                    ALPAKA_DEBUG_FULL_LOG_SCOPE;
-
-                    // Set the current device to wait for.
-                    ALPAKA_CUDA_RT_CHECK(cudaSetDevice(
-                        dev.m_iDevice));
-                    ALPAKA_CUDA_RT_CHECK(cudaDeviceSynchronize());
-                }
-            };
-        }
-    }
-    namespace queue
-    {
-        namespace traits
-        {
-            template<>
-            struct QueueType<
-                dev::DevCudaRt,
-                queue::Blocking
-            >
-            {
-                using type = queue::QueueCudaRtBlocking;
-            };
-
-            template<>
-            struct QueueType<
-                dev::DevCudaRt,
-                queue::NonBlocking
-            >
-            {
-                using type = queue::QueueCudaRtNonBlocking;
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/alpaka/include/alpaka/dev/DevHipRt.hpp b/thirdParty/alpaka/include/alpaka/dev/DevHipRt.hpp
deleted file mode 100644
index d922626654..0000000000
--- a/thirdParty/alpaka/include/alpaka/dev/DevHipRt.hpp
+++ /dev/null
@@ -1,291 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz, Matthias Werner
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_HIP
-    #error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
-#endif
-
-#include <alpaka/dev/Traits.hpp>
-#include <alpaka/mem/buf/Traits.hpp>
-#include <alpaka/pltf/Traits.hpp>
-#include <alpaka/wait/Traits.hpp>
-
-#include <alpaka/queue/Traits.hpp>
-#include <alpaka/queue/Properties.hpp>
-
-#include <alpaka/core/Hip.hpp>
-
-namespace alpaka
-{
-    namespace pltf
-    {
-        namespace traits
-        {
-            template<
-                typename TPltf,
-                typename TSfinae>
-            struct GetDevByIdx;
-        }
-        class PltfHipRt;
-    }
-
-    namespace queue
-    {
-        class QueueHipRtBlocking;
-        class QueueHipRtNonBlocking;
-    }
-
-    namespace dev
-    {
-        //#############################################################################
-        //! The HIP RT device handle.
-        class DevHipRt : public concepts::Implements<wait::ConceptCurrentThreadWaitFor, DevHipRt>
-        {
-            friend struct pltf::traits::GetDevByIdx<pltf::PltfHipRt>;
-
-        protected:
-            //-----------------------------------------------------------------------------
-            DevHipRt() = default;
-        public:
-            //-----------------------------------------------------------------------------
-            DevHipRt(DevHipRt const &) = default;
-            //-----------------------------------------------------------------------------
-            DevHipRt(DevHipRt &&) = default;
-            //-----------------------------------------------------------------------------
-            auto operator=(DevHipRt const &) -> DevHipRt & = default;
-            //-----------------------------------------------------------------------------
-            auto operator=(DevHipRt &&) -> DevHipRt & = default;
-            //-----------------------------------------------------------------------------
-            ALPAKA_FN_HOST auto operator==(DevHipRt const & rhs) const
-            -> bool
-            {
-                return m_iDevice == rhs.m_iDevice;
-            }
-            //-----------------------------------------------------------------------------
-            ALPAKA_FN_HOST auto operator!=(DevHipRt const & rhs) const
-            -> bool
-            {
-                return !((*this) == rhs);
-            }
-            //-----------------------------------------------------------------------------
-            ALPAKA_FN_HOST_ACC ~DevHipRt() = default;
-
-        public:
-            int m_iDevice;
-        };
-    }
-
-    namespace dev
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The HIP RT device name get trait specialization.
-            template<>
-            struct GetName<
-                dev::DevHipRt>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto getName(
-                    dev::DevHipRt const & dev)
-                -> std::string
-                {
-                    hipDeviceProp_t hipDevProp;
-                    ALPAKA_HIP_RT_CHECK(
-                        hipGetDeviceProperties(
-                            &hipDevProp,
-                            dev.m_iDevice));
-
-                    return std::string(hipDevProp.name);
-                }
-            };
-
-            //#############################################################################
-            //! The HIP RT device available memory get trait specialization.
-            template<>
-            struct GetMemBytes<
-                dev::DevHipRt>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto getMemBytes(
-                    dev::DevHipRt const & dev)
-                -> std::size_t
-                {
-                    // Set the current device to wait for.
-                    ALPAKA_HIP_RT_CHECK(
-                        hipSetDevice(
-                            dev.m_iDevice));
-
-                    std::size_t freeInternal(0u);
-                    std::size_t totalInternal(0u);
-
-                    // \TODO: Check which is faster: hipMemGetInfo().totalInternal vs hipGetDeviceProperties().totalGlobalMem
-                    ALPAKA_HIP_RT_CHECK(
-                        hipMemGetInfo(
-                            &freeInternal,
-                            &totalInternal));
-
-                    return totalInternal;
-                }
-            };
-
-            //#############################################################################
-            //! The HIP RT device free memory get trait specialization.
-            template<>
-            struct GetFreeMemBytes<
-                dev::DevHipRt>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto getFreeMemBytes(
-                    dev::DevHipRt const & dev)
-                -> std::size_t
-                {
-                    // Set the current device to wait for.
-                    ALPAKA_HIP_RT_CHECK(
-                        hipSetDevice(
-                            dev.m_iDevice));
-
-                    std::size_t freeInternal(0u);
-                    std::size_t totalInternal(0u);
-
-                    ALPAKA_HIP_RT_CHECK(
-                        hipMemGetInfo(
-                            &freeInternal,
-                            &totalInternal));
-
-                    return freeInternal;
-                }
-            };
-
-            //#############################################################################
-            //! The HIP RT device reset trait specialization.
-            template<>
-            struct Reset<
-                dev::DevHipRt>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto reset(
-                    dev::DevHipRt const & dev)
-                -> void
-                {
-                    ALPAKA_DEBUG_FULL_LOG_SCOPE;
-
-                    // Set the current device to wait for.
-                    ALPAKA_HIP_RT_CHECK(
-                        hipSetDevice(
-                            dev.m_iDevice));
-                    ALPAKA_HIP_RT_CHECK(
-                        hipDeviceReset());
-                }
-            };
-        }
-    }
-    namespace mem
-    {
-        namespace buf
-        {
-            template<
-                typename TElem,
-                typename TDim,
-                typename TIdx>
-            class BufHipRt;
-
-            namespace traits
-            {
-                //#############################################################################
-                //! The HIP RT device memory buffer type trait specialization.
-                template<
-                    typename TElem,
-                    typename TDim,
-                    typename TIdx>
-                struct BufType<
-                    dev::DevHipRt,
-                    TElem,
-                    TDim,
-                    TIdx>
-                {
-                    using type = mem::buf::BufHipRt<TElem, TDim, TIdx>;
-                };
-            }
-        }
-    }
-    namespace pltf
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The HIP RT device platform type trait specialization.
-            template<>
-            struct PltfType<
-                dev::DevHipRt>
-            {
-                using type = pltf::PltfHipRt;
-            };
-        }
-    }
-    namespace wait
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The thread HIP device wait specialization.
-            //!
-            //! Blocks until the device has completed all preceding requested tasks.
-            //! Tasks that are enqueued or queues that are created after this call is made are not waited for.
-            template<>
-            struct CurrentThreadWaitFor<
-                dev::DevHipRt>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto currentThreadWaitFor(
-                    dev::DevHipRt const & dev)
-                -> void
-                {
-                    ALPAKA_DEBUG_FULL_LOG_SCOPE;
-
-                    // Set the current device to wait for.
-                    ALPAKA_HIP_RT_CHECK(hipSetDevice(
-                        dev.m_iDevice));
-                    ALPAKA_HIP_RT_CHECK(hipDeviceSynchronize());
-                }
-            };
-        }
-    }
-    namespace queue
-    {
-        namespace traits
-        {
-            template<>
-            struct QueueType<
-                dev::DevHipRt,
-                queue::Blocking
-            >
-            {
-                using type = queue::QueueHipRtBlocking;
-            };
-
-            template<>
-            struct QueueType<
-                dev::DevHipRt,
-                queue::NonBlocking
-            >
-            {
-                using type = queue::QueueHipRtNonBlocking;
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/alpaka/include/alpaka/dev/Traits.hpp b/thirdParty/alpaka/include/alpaka/dev/Traits.hpp
deleted file mode 100644
index 53930a909a..0000000000
--- a/thirdParty/alpaka/include/alpaka/dev/Traits.hpp
+++ /dev/null
@@ -1,152 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#include <alpaka/core/Common.hpp>
-
-#include <boost/config.hpp>
-
-namespace alpaka
-{
-    //-----------------------------------------------------------------------------
-    //! The device specifics.
-    namespace dev
-    {
-        //-----------------------------------------------------------------------------
-        //! The device traits.
-        namespace traits
-        {
-            //#############################################################################
-            //! The device type trait.
-            template<
-                typename T,
-                typename TSfinae = void>
-            struct DevType;
-
-            //#############################################################################
-            //! The device get trait.
-            template<
-                typename T,
-                typename TSfinae = void>
-            struct GetDev;
-
-            //#############################################################################
-            //! The device name get trait.
-            template<
-                typename TDev,
-                typename TSfinae = void>
-            struct GetName;
-
-            //#############################################################################
-            //! The device memory size get trait.
-            template<
-                typename TDev,
-                typename TSfinae = void>
-            struct GetMemBytes;
-
-            //#############################################################################
-            //! The device free memory size get trait.
-            template<
-                typename T,
-                typename TSfinae = void>
-            struct GetFreeMemBytes;
-
-            //#############################################################################
-            //! The device reset trait.
-            template<
-                typename T,
-                typename TSfinae = void>
-            struct Reset;
-        }
-
-        //#############################################################################
-        //! The device type trait alias template to remove the ::type.
-        template<
-            typename T>
-        using Dev = typename traits::DevType<T>::type;
-
-        //-----------------------------------------------------------------------------
-        //! \return The device this object is bound to.
-        template<
-            typename T>
-        ALPAKA_FN_HOST auto getDev(
-            T const & t)
-#ifdef BOOST_NO_CXX14_RETURN_TYPE_DEDUCTION
-        -> decltype(traits::GetDev<T>::getDev(t))
-#endif
-        {
-            return
-                traits::GetDev<
-                    T>
-                ::getDev(
-                    t);
-        }
-
-        //-----------------------------------------------------------------------------
-        //! \return The device name.
-        template<
-            typename TDev>
-        ALPAKA_FN_HOST auto getName(
-            TDev const & dev)
-        -> std::string
-        {
-            return
-                traits::GetName<
-                    TDev>
-                ::getName(
-                    dev);
-        }
-
-        //-----------------------------------------------------------------------------
-        //! \return The memory on the device in Bytes.
-        template<
-            typename TDev>
-        ALPAKA_FN_HOST auto getMemBytes(
-            TDev const & dev)
-        -> std::size_t
-        {
-            return
-                traits::GetMemBytes<
-                    TDev>
-                ::getMemBytes(
-                    dev);
-        }
-
-        //-----------------------------------------------------------------------------
-        //! \return The free memory on the device in Bytes.
-        template<
-            typename TDev>
-        ALPAKA_FN_HOST auto getFreeMemBytes(
-            TDev const & dev)
-        -> std::size_t
-        {
-            return
-                traits::GetFreeMemBytes<
-                    TDev>
-                ::getFreeMemBytes(
-                    dev);
-        }
-
-        //-----------------------------------------------------------------------------
-        //! Resets the device.
-        //! What this method does is dependent on the accelerator.
-        template<
-            typename TDev>
-        ALPAKA_FN_HOST auto reset(
-            TDev const & dev)
-        -> void
-        {
-            traits::Reset<
-                TDev>
-            ::reset(
-                dev);
-        }
-    }
-}
diff --git a/thirdParty/alpaka/include/alpaka/dev/cpu/SysInfo.hpp b/thirdParty/alpaka/include/alpaka/dev/cpu/SysInfo.hpp
deleted file mode 100644
index 879edaa0c8..0000000000
--- a/thirdParty/alpaka/include/alpaka/dev/cpu/SysInfo.hpp
+++ /dev/null
@@ -1,239 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz, Daniel Vollmer, Erik Zenker, René Widera
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if BOOST_OS_WINDOWS || BOOST_OS_CYGWIN
-    #ifndef NOMINMAX
-        #define NOMINMAX
-    #endif
-    #ifndef WIN32_LEAN_AND_MEAN
-        #define WIN32_LEAN_AND_MEAN
-    #endif
-    // We could use some more macros to reduce the number of sub-headers included, but this would restrict user code.
-    #include <windows.h>
-#elif BOOST_OS_UNIX || BOOST_OS_MACOS
-    #include <cstdint>
-    #include <unistd.h>
-    #include <sys/types.h>
-    #include <sys/param.h>
-    #if BOOST_OS_BSD || BOOST_OS_MACOS
-        #include <sys/sysctl.h>
-    #endif
-#endif
-
-#if BOOST_OS_LINUX
-    #include <fstream>
-#endif
-
-#include <stdexcept>
-#include <cstring>
-#include <string>
-
-namespace alpaka
-{
-    namespace dev
-    {
-        namespace cpu
-        {
-            namespace detail
-            {
-#if BOOST_ARCH_X86
-    #if BOOST_COMP_GNUC || BOOST_COMP_CLANG || (!BOOST_COMP_MSVC_EMULATED && defined(__INTEL_COMPILER))
-        #include <cpuid.h>
-                //-----------------------------------------------------------------------------
-                inline auto cpuid(std::uint32_t const level, std::uint32_t const subfunction, std::uint32_t ex[4])
-                -> void
-                {
-                    __cpuid_count(level, subfunction, ex[0], ex[1], ex[2], ex[3]);
-                }
-
-    #elif BOOST_COMP_MSVC || defined(__INTEL_COMPILER)
-        #include <intrin.h>
-                //-----------------------------------------------------------------------------
-                inline auto cpuid(std::uint32_t const level, std::uint32_t const subfunction, std::uint32_t ex[4])
-                -> void
-                {
-                    __cpuidex(reinterpret_cast<int*>(ex), level, subfunction);
-                }
-    #endif
-#endif
-                //-----------------------------------------------------------------------------
-                //! \return The name of the CPU the code is running on.
-                inline auto getCpuName()
-                -> std::string
-                {
-#if BOOST_ARCH_X86
-                    // Get extended ids.
-                    std::uint32_t ex[4] = {0};
-                    cpuid(0x80000000, 0, ex);
-                    std::uint32_t const nExIds(ex[0]);
-
-                    // Get the information associated with each extended ID.
-                    char cpuBrandString[0x40] = {0};
-                    for(std::uint32_t i(0x80000000); i<=nExIds; ++i)
-                    {
-                        cpuid(i, 0, ex);
-
-                        // Interpret CPU brand string and cache information.
-                        if(i == 0x80000002)
-                        {
-                            std::memcpy(cpuBrandString, ex, sizeof(ex));
-                        }
-                        else if(i == 0x80000003)
-                        {
-                            std::memcpy(cpuBrandString + 16, ex, sizeof(ex));
-                        }
-                        else if(i == 0x80000004)
-                        {
-                            std::memcpy(cpuBrandString + 32, ex, sizeof(ex));
-                        }
-                    }
-                    return std::string(cpuBrandString);
-#else
-                    return "<unknown>";
-#endif
-                }
-                //-----------------------------------------------------------------------------
-                //! \return The frequency of the CPU the code is running on.
-                // TODO: implement!
-                /*inline auto getCpuFrequency()
-                -> std::size_t
-                {
-                    return 0;
-                }*/
-                //-----------------------------------------------------------------------------
-                //! \return The total number of bytes of global memory.
-                //! Adapted from David Robert Nadeau: http://nadeausoftware.com/articles/2012/09/c_c_tip_how_get_physical_memory_size_system
-                inline auto getTotalGlobalMemSizeBytes()
-                -> std::size_t
-                {
-#if BOOST_OS_WINDOWS
-                    MEMORYSTATUSEX status;
-                    status.dwLength = sizeof(status);
-                    GlobalMemoryStatusEx(&status);
-                    return static_cast<std::size_t>(status.ullTotalPhys);
-
-#elif BOOST_OS_CYGWIN
-                    // New 64-bit MEMORYSTATUSEX isn't available.
-                    MEMORYSTATUS status;
-                    status.dwLength = sizeof(status);
-                    GlobalMemoryStatus(&status);
-                    return static_cast<std::size_t>(status.dwTotalPhys);
-
-#elif BOOST_OS_UNIX || BOOST_OS_MACOS
-                    // Unix : Prefer sysctl() over sysconf() except sysctl() with HW_REALMEM and HW_PHYSMEM which are not always reliable
-    #if defined(CTL_HW) && (defined(HW_MEMSIZE) || defined(HW_PHYSMEM64))
-                    int mib[2] = {CTL_HW,
-        #if defined(HW_MEMSIZE)                                                 // OSX
-                        HW_MEMSIZE
-        #elif defined(HW_PHYSMEM64)                                             // NetBSD, OpenBSD.
-                        HW_PHYSMEM64
-        #endif
-                    };
-                    std::uint64_t size(0);
-                    std::size_t sizeLen{sizeof(size)};
-                    if(sysctl(mib, 2, &size, &sizeLen, nullptr, 0) < 0)
-                    {
-                        throw std::logic_error("getTotalGlobalMemSizeBytes failed calling sysctl!");
-                    }
-                    return static_cast<std::size_t>(size);
-
-    #elif defined(_SC_AIX_REALMEM)                                          // AIX.
-                    return static_cast<std::size_t>(sysconf(_SC_AIX_REALMEM)) * static_cast<std::size_t>(1024);
-
-    #elif defined(_SC_PHYS_PAGES) && defined(_SC_PAGESIZE)                  // Linux, FreeBSD, OpenBSD, Solaris.
-                    return static_cast<std::size_t>(sysconf(_SC_PHYS_PAGES)) * static_cast<std::size_t>(sysconf(_SC_PAGESIZE));
-
-    #elif defined(_SC_PHYS_PAGES) && defined(_SC_PAGE_SIZE)                 // Legacy.
-                    return static_cast<std::size_t>(sysconf(_SC_PHYS_PAGES)) * static_cast<std::size_t>(sysconf(_SC_PAGE_SIZE));
-
-    #elif defined(CTL_HW) && (defined(HW_PHYSMEM) || defined(HW_REALMEM))   // FreeBSD, DragonFly BSD, NetBSD, OpenBSD, and OSX.
-                    int mib[2] = {CTL_HW,
-        #if defined(HW_REALMEM)                                                 // FreeBSD.
-                        HW_REALMEM;
-        #elif defined(HW_PYSMEM)                                                // Others.
-                        HW_PHYSMEM;
-        #endif
-                    };
-                    std::uint32_t size(0);
-                    std::size_t const sizeLen{sizeof(size)};
-                    if(sysctl(mib, 2, &size, &sizeLen, nullptr, 0) < 0)
-                    {
-                        throw std::logic_error("getTotalGlobalMemSizeBytes failed calling sysctl!");
-                    }
-                    return static_cast<std::size_t>(size);
-    #endif
-
-#else
-    #error "getTotalGlobalMemSizeBytes not implemented for this system!"
-#endif
-                }
-                //-----------------------------------------------------------------------------
-                //! \return The free number of bytes of global memory.
-                //! \throws std::logic_error if not implemented on the system and std::runtime_error on other errors.
-                inline auto getFreeGlobalMemSizeBytes()
-                -> std::size_t
-                {
-#if BOOST_OS_WINDOWS
-                    MEMORYSTATUSEX status;
-                    status.dwLength = sizeof(status);
-                    GlobalMemoryStatusEx(&status);
-                    return static_cast<std::size_t>(status.ullAvailPhys);
-
-#elif BOOST_OS_LINUX
-                    std::string token;
-                    std::ifstream file("/proc/meminfo");
-                    if(file)
-                    {
-                        while(file >> token)
-                        {
-                            if(token == "MemFree:")
-                            {
-                                std::size_t freeGlobalMemSizeBytes(0);
-                                if(file >> freeGlobalMemSizeBytes)
-                                {
-                                    return freeGlobalMemSizeBytes * size_t(1024);
-                                }
-                                else
-                                {
-                                    throw std::runtime_error("Unable to read MemFree value!");
-                                }
-                            }
-                        }
-                        throw std::runtime_error("Unable to find MemFree in '/proc/meminfo'!");
-                    }
-                    else
-                    {
-                        throw std::runtime_error("Unable to open '/proc/meminfo'!");
-                    }
-#elif BOOST_OS_MACOS
-                    int free_pages = 0;
-                    std::size_t len = sizeof(free_pages);
-                    if(sysctlbyname("vm.page_free_count", &free_pages, &len, nullptr, 0) < 0)
-                    {
-                        throw std::logic_error("getFreeGlobalMemSizeBytes failed calling sysctl(vm.page_free_count)!");
-                    }
-                    int page_size = 0;
-                    len = sizeof(page_size);
-                    if(sysctlbyname("vm.pagesize", &page_size, &len, nullptr, 0) < 0)
-                    {
-                        throw std::logic_error("getFreeGlobalMemSizeBytes failed calling sysctl(vm.pagesize)!");
-                    }
-                    return static_cast<std::size_t>(free_pages) * static_cast<std::size_t>(page_size);
-#else
-    #error "getFreeGlobalMemSizeBytes not implemented for this system!"
-#endif
-                }
-            }
-        }
-    }
-}
diff --git a/thirdParty/alpaka/include/alpaka/dev/cpu/Wait.hpp b/thirdParty/alpaka/include/alpaka/dev/cpu/Wait.hpp
deleted file mode 100644
index 4d95fc30c1..0000000000
--- a/thirdParty/alpaka/include/alpaka/dev/cpu/Wait.hpp
+++ /dev/null
@@ -1,72 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz, Rene Widera
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#include <alpaka/dev/DevCpu.hpp>
-#include <alpaka/event/EventCpu.hpp>
-
-#include <alpaka/wait/Traits.hpp>
-
-namespace alpaka
-{
-    namespace wait
-    {
-        namespace traits
-        {
-            namespace detail
-            {
-                template<typename TDevice, typename TQueueVector>
-                ALPAKA_FN_HOST auto currentThreadWaitForDevice(
-                    TDevice const & dev, TQueueVector & vQueues
-                )
-                ->void
-                {
-                    // Furthermore there should not even be a chance to enqueue something between getting the queues and adding our wait events!
-                    std::vector<event::EventCpu> vEvents;
-                    for(auto && spQueue : vQueues)
-                    {
-                        vEvents.emplace_back(dev);
-                        spQueue->enqueue(vEvents.back());
-                    }
-
-                    // Now wait for all the events.
-                    for(auto && event : vEvents)
-                    {
-                        wait::wait(event);
-                    }
-                }
-            }
-            //#############################################################################
-            //! The CPU device thread wait specialization.
-            //!
-            //! Blocks until the device has completed all preceding requested tasks.
-            //! Tasks that are enqueued or queues that are created after this call is made are not waited for.
-            template<>
-            struct CurrentThreadWaitFor<
-                dev::DevCpu>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto currentThreadWaitFor(
-                    dev::DevCpu const & dev)
-                -> void
-                {
-                    ALPAKA_DEBUG_FULL_LOG_SCOPE;
-
-                    // Get all the queues on the device at the time of invocation.
-                    // All queues added afterwards are ignored.
-                    auto vspQueues(
-                        dev.m_spDevCpuImpl->GetAllQueues());
-
-                    detail::currentThreadWaitForDevice(dev, vspQueues);
-                }
-            };
-        }
-    }
-}
diff --git a/thirdParty/alpaka/include/alpaka/dim/DimArithmetic.hpp b/thirdParty/alpaka/include/alpaka/dim/DimArithmetic.hpp
deleted file mode 100644
index e5ccd4a795..0000000000
--- a/thirdParty/alpaka/include/alpaka/dim/DimArithmetic.hpp
+++ /dev/null
@@ -1,36 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#include <alpaka/dim/DimIntegralConst.hpp>
-
-#include <type_traits>
-
-namespace alpaka
-{
-    namespace dim
-    {
-        //-----------------------------------------------------------------------------
-        // Trait specializations for unsigned integral types.
-        namespace traits
-        {
-            //#############################################################################
-            //! The arithmetic type dimension getter trait specialization.
-            template<
-                typename T>
-            struct DimType<
-                T,
-                typename std::enable_if<std::is_arithmetic<T>::value>::type>
-            {
-                using type = dim::DimInt<1u>;
-            };
-        }
-    }
-}
diff --git a/thirdParty/alpaka/include/alpaka/dim/DimIntegralConst.hpp b/thirdParty/alpaka/include/alpaka/dim/DimIntegralConst.hpp
deleted file mode 100644
index 9f05d07255..0000000000
--- a/thirdParty/alpaka/include/alpaka/dim/DimIntegralConst.hpp
+++ /dev/null
@@ -1,41 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#include <alpaka/dim/Traits.hpp>
-
-#include <type_traits>
-
-namespace alpaka
-{
-    namespace dim
-    {
-        //-----------------------------------------------------------------------------
-        // N(th) dimension(s).
-        template<
-            std::size_t N>
-        using DimInt = std::integral_constant<std::size_t, N>;
-
-        //-----------------------------------------------------------------------------
-        // Trait specializations for integral_constant types.
-        /*namespace traits
-        {
-            //#############################################################################
-            //! The arithmetic type dimension getter trait specialization.
-            template<
-                std::size_t N>
-            struct DimType<
-                std::integral_constant<std::size_t, N>
-            {
-                using type = DimInt<N>;
-            };
-        }*/
-    }
-}
diff --git a/thirdParty/alpaka/include/alpaka/dim/Traits.hpp b/thirdParty/alpaka/include/alpaka/dim/Traits.hpp
deleted file mode 100644
index 39736a61ec..0000000000
--- a/thirdParty/alpaka/include/alpaka/dim/Traits.hpp
+++ /dev/null
@@ -1,36 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-namespace alpaka
-{
-    //-----------------------------------------------------------------------------
-    //! The dimension specifics.
-    namespace dim
-    {
-        //-----------------------------------------------------------------------------
-        //! The dimension traits.
-        namespace traits
-        {
-            //#############################################################################
-            //! The dimension getter type trait.
-            template<
-                typename T,
-                typename TSfinae = void>
-            struct DimType;
-        }
-
-        //#############################################################################
-        //! The dimension type trait alias template to remove the ::type.
-        template<
-            typename T>
-        using Dim = typename traits::DimType<T>::type;
-    }
-}
diff --git a/thirdParty/alpaka/include/alpaka/elem/Traits.hpp b/thirdParty/alpaka/include/alpaka/elem/Traits.hpp
deleted file mode 100644
index 26aa24e16f..0000000000
--- a/thirdParty/alpaka/include/alpaka/elem/Traits.hpp
+++ /dev/null
@@ -1,54 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#include <type_traits>
-
-namespace alpaka
-{
-    //-----------------------------------------------------------------------------
-    //! The element specifics.
-    namespace elem
-    {
-        //-----------------------------------------------------------------------------
-        //! The element traits.
-        namespace traits
-        {
-            //#############################################################################
-            //! The element type trait.
-            template<
-                typename TView,
-                typename TSfinae = void>
-            struct ElemType;
-        }
-
-        //#############################################################################
-        //! The element type trait alias template to remove the ::type.
-        template<
-            typename TView>
-        using Elem = typename std::remove_volatile<typename traits::ElemType<TView>::type>::type;
-
-        //-----------------------------------------------------------------------------
-        // Trait specializations for unsigned integral types.
-        namespace traits
-        {
-            //#############################################################################
-            //! The fundamental type elem type trait specialization.
-            template<
-                typename T>
-            struct ElemType<
-                T,
-                typename std::enable_if<std::is_fundamental<T>::value>::type>
-            {
-                using type = T;
-            };
-        }
-    }
-}
diff --git a/thirdParty/alpaka/include/alpaka/event/EventCpu.hpp b/thirdParty/alpaka/include/alpaka/event/EventCpu.hpp
deleted file mode 100644
index 9df9c1400a..0000000000
--- a/thirdParty/alpaka/include/alpaka/event/EventCpu.hpp
+++ /dev/null
@@ -1,495 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Matthias Werner, René Widera
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#include <alpaka/core/Assert.hpp>
-#include <alpaka/core/Unused.hpp>
-#include <alpaka/dev/DevCpu.hpp>
-#include <alpaka/queue/QueueCpuNonBlocking.hpp>
-#include <alpaka/queue/QueueCpuBlocking.hpp>
-
-#include <alpaka/dev/Traits.hpp>
-#include <alpaka/event/Traits.hpp>
-#include <alpaka/wait/Traits.hpp>
-#include <alpaka/dev/Traits.hpp>
-
-#include <mutex>
-#include <condition_variable>
-#include <future>
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
-    #include <iostream>
-#endif
-
-namespace alpaka
-{
-    namespace event
-    {
-        namespace cpu
-        {
-            namespace detail
-            {
-                //#############################################################################
-                //! The CPU device event implementation.
-                class EventCpuImpl final : public concepts::Implements<wait::ConceptCurrentThreadWaitFor, EventCpuImpl>
-                {
-                public:
-                    //-----------------------------------------------------------------------------
-                    EventCpuImpl(
-                        dev::DevCpu const & dev) noexcept :
-                            m_dev(dev),
-                            m_mutex(),
-                            m_enqueueCount(0u),
-                            m_LastReadyEnqueueCount(0u)
-                    {}
-                    //-----------------------------------------------------------------------------
-                    EventCpuImpl(EventCpuImpl const &) = delete;
-                    //-----------------------------------------------------------------------------
-                    EventCpuImpl(EventCpuImpl &&) = delete;
-                    //-----------------------------------------------------------------------------
-                    auto operator=(EventCpuImpl const &) -> EventCpuImpl & = delete;
-                    //-----------------------------------------------------------------------------
-                    auto operator=(EventCpuImpl &&) -> EventCpuImpl & = delete;
-                    //-----------------------------------------------------------------------------
-                    ~EventCpuImpl() noexcept = default;
-
-                    //-----------------------------------------------------------------------------
-                    auto isReady() noexcept -> bool
-                    {
-                        return (m_LastReadyEnqueueCount == m_enqueueCount);
-                    }
-
-                    //-----------------------------------------------------------------------------
-                    auto wait(std::size_t const & enqueueCount, std::unique_lock<std::mutex>& lk) const noexcept -> void
-                    {
-                        ALPAKA_ASSERT(enqueueCount <= m_enqueueCount);
-
-                        while(enqueueCount > m_LastReadyEnqueueCount)
-                        {
-                            auto future = m_future;
-                            lk.unlock();
-                            future.get();
-                            lk.lock();
-                        }
-                    }
-
-                public:
-                    dev::DevCpu const m_dev;                                //!< The device this event is bound to.
-
-                    std::mutex mutable m_mutex;                             //!< The mutex used to synchronize access to the event.
-                    std::shared_future<void> m_future;                      //!< The future signaling the event completion.
-                    std::size_t m_enqueueCount;                             //!< The number of times this event has been enqueued.
-                    std::size_t m_LastReadyEnqueueCount;                    //!< The time this event has been ready the last time.
-                                                                            //!< Ready means that the event was not waiting within a queue (not enqueued or already completed).
-                                                                            //!< If m_enqueueCount == m_LastReadyEnqueueCount, the event is currently not enqueued
-                };
-            }
-        }
-
-        //#############################################################################
-        //! The CPU device event.
-        class EventCpu final : public concepts::Implements<wait::ConceptCurrentThreadWaitFor, EventCpu>
-        {
-        public:
-            //-----------------------------------------------------------------------------
-            //! \param bBusyWaiting Unused. EventCpu never does busy waiting.
-            EventCpu(
-                dev::DevCpu const & dev,
-                bool bBusyWaiting = true) :
-                    m_spEventImpl(std::make_shared<cpu::detail::EventCpuImpl>(dev))
-            { 
-                alpaka::ignore_unused(bBusyWaiting);
-            }
-            //-----------------------------------------------------------------------------
-            EventCpu(EventCpu const &) = default;
-            //-----------------------------------------------------------------------------
-            EventCpu(EventCpu &&) = default;
-            //-----------------------------------------------------------------------------
-            auto operator=(EventCpu const &) -> EventCpu & = default;
-            //-----------------------------------------------------------------------------
-            auto operator=(EventCpu &&) -> EventCpu & = default;
-            //-----------------------------------------------------------------------------
-            auto operator==(EventCpu const & rhs) const
-            -> bool
-            {
-                return (m_spEventImpl == rhs.m_spEventImpl);
-            }
-            //-----------------------------------------------------------------------------
-            auto operator!=(EventCpu const & rhs) const
-            -> bool
-            {
-                return !((*this) == rhs);
-            }
-            //-----------------------------------------------------------------------------
-            ~EventCpu() = default;
-
-        public:
-            std::shared_ptr<cpu::detail::EventCpuImpl> m_spEventImpl;
-        };
-    }
-
-    namespace dev
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CPU device event device get trait specialization.
-            template<>
-            struct GetDev<
-                event::EventCpu>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto getDev(
-                    event::EventCpu const & event)
-                -> dev::DevCpu
-                {
-                    return event.m_spEventImpl->m_dev;
-                }
-            };
-        }
-    }
-    namespace event
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CPU device event test trait specialization.
-            template<>
-            struct Test<
-                event::EventCpu>
-            {
-                //-----------------------------------------------------------------------------
-                //! \return If the event is not waiting within a queue (not enqueued or already handled).
-                ALPAKA_FN_HOST static auto test(
-                    event::EventCpu const & event)
-                -> bool
-                {
-                    std::lock_guard<std::mutex> lk(event.m_spEventImpl->m_mutex);
-
-                    return event.m_spEventImpl->isReady();
-                }
-            };
-        }
-    }
-    namespace queue
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CPU non-blocking device queue enqueue trait specialization.
-            template<>
-            struct Enqueue<
-                queue::cpu::detail::QueueCpuNonBlockingImpl,
-                event::EventCpu>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto enqueue(
-#if !(BOOST_COMP_CLANG_CUDA && BOOST_ARCH_PTX)
-                    queue::cpu::detail::QueueCpuNonBlockingImpl & queueImpl,
-#else
-                    queue::cpu::detail::QueueCpuNonBlockingImpl &,
-#endif
-                    event::EventCpu & event)
-                -> void
-                {
-                    ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                    // Copy the shared pointer of the event implementation.
-                    // This is forwarded to the lambda that is enqueued into the queue to ensure that the event implementation is alive as long as it is enqueued.
-                    auto spEventImpl(event.m_spEventImpl);
-
-                    // Setting the event state and enqueuing it has to be atomic.
-                    std::lock_guard<std::mutex> lk(spEventImpl->m_mutex);
-
-                    ++spEventImpl->m_enqueueCount;
-
-// Workaround: Clang can not support this when natively compiling device code. See ConcurrentExecPool.hpp.
-#if !(BOOST_COMP_CLANG_CUDA && BOOST_ARCH_PTX)
-                    auto const enqueueCount = spEventImpl->m_enqueueCount;
-
-                    // Enqueue a task that only resets the events flag if it is completed.
-                    spEventImpl->m_future = queueImpl.m_workerThread.enqueueTask(
-                        [spEventImpl, enqueueCount]()
-                        {
-                            std::unique_lock<std::mutex> lk2(spEventImpl->m_mutex);
-
-                            // Nothing to do if it has been re-enqueued to a later position in the queue.
-                            if(enqueueCount == spEventImpl->m_enqueueCount)
-                            {
-                                spEventImpl->m_LastReadyEnqueueCount = spEventImpl->m_enqueueCount;
-                            }
-                        });
-#endif
-                }
-            };
-            //#############################################################################
-            //! The CPU non-blocking device queue enqueue trait specialization.
-            template<>
-            struct Enqueue<
-                queue::QueueCpuNonBlocking,
-                event::EventCpu>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto enqueue(
-                    queue::QueueCpuNonBlocking & queue,
-                    event::EventCpu & event)
-                -> void
-                {
-                    ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                    queue::enqueue(*queue.m_spQueueImpl, event);
-                }
-            };
-            //#############################################################################
-            //! The CPU blocking device queue enqueue trait specialization.
-            template<>
-            struct Enqueue<
-                queue::cpu::detail::QueueCpuBlockingImpl,
-                event::EventCpu>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto enqueue(
-                    queue::cpu::detail::QueueCpuBlockingImpl & queueImpl,
-                    event::EventCpu & event)
-                -> void
-                {
-                    ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                    std::promise<void> promise;
-                    {
-                        std::lock_guard<std::mutex> lk(queueImpl.m_mutex);
-
-                        queueImpl.m_bCurrentlyExecutingTask = true;
-
-                        auto & eventImpl(*event.m_spEventImpl);
-
-                        {
-                            // Setting the event state and enqueuing it has to be atomic.
-                            std::lock_guard<std::mutex> evLk(eventImpl.m_mutex);
-
-                            ++eventImpl.m_enqueueCount;
-                            // NOTE: Difference to non-blocking version: directly set the event state instead of enqueuing.
-                            eventImpl.m_LastReadyEnqueueCount = eventImpl.m_enqueueCount;
-
-                            eventImpl.m_future = promise.get_future();
-                        }
-
-                        queueImpl.m_bCurrentlyExecutingTask = false;
-                    }
-                    promise.set_value();
-                }
-            };
-            //#############################################################################
-            //! The CPU blocking device queue enqueue trait specialization.
-            template<>
-            struct Enqueue<
-                queue::QueueCpuBlocking,
-                event::EventCpu>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto enqueue(
-                    queue::QueueCpuBlocking & queue,
-                    event::EventCpu & event)
-                -> void
-                {
-                    ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                    queue::enqueue(*queue.m_spQueueImpl, event);
-                }
-            };
-        }
-    }
-    namespace wait
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CPU device event thread wait trait specialization.
-            //!
-            //! Waits until the event itself and therefore all tasks preceding it in the queue it is enqueued to have been completed.
-            //! If the event is not enqueued to a queue the method returns immediately.
-            template<>
-            struct CurrentThreadWaitFor<
-                event::EventCpu>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto currentThreadWaitFor(
-                    event::EventCpu const & event)
-                -> void
-                {
-                    wait::wait(*event.m_spEventImpl);
-                }
-            };
-            //#############################################################################
-            //! The CPU device event implementation thread wait trait specialization.
-            //!
-            //! Waits until the event itself and therefore all tasks preceding it in the queue it is enqueued to have been completed.
-            //! If the event is not enqueued to a queue the method returns immediately.
-            //!
-            //! NOTE: This method is for internal usage only.
-            template<>
-            struct CurrentThreadWaitFor<
-                event::cpu::detail::EventCpuImpl>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto currentThreadWaitFor(
-                    event::cpu::detail::EventCpuImpl const & eventImpl)
-                -> void
-                {
-                    std::unique_lock<std::mutex> lk(eventImpl.m_mutex);
-
-                    auto const enqueueCount = eventImpl.m_enqueueCount;
-                    eventImpl.wait(enqueueCount, lk);
-                }
-            };
-            //#############################################################################
-            //! The CPU non-blocking device queue event wait trait specialization.
-            template<>
-            struct WaiterWaitFor<
-                queue::cpu::detail::QueueCpuNonBlockingImpl,
-                event::EventCpu>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto waiterWaitFor(
-#if !(BOOST_COMP_CLANG_CUDA && BOOST_ARCH_PTX)
-                    queue::cpu::detail::QueueCpuNonBlockingImpl & queueImpl,
-#else
-                    queue::cpu::detail::QueueCpuNonBlockingImpl &,
-#endif
-                    event::EventCpu const & event)
-                -> void
-                {
-                    // Copy the shared pointer of the event implementation.
-                    // This is forwarded to the lambda that is enqueued into the queue to ensure that the event implementation is alive as long as it is enqueued.
-                    auto spEventImpl(event.m_spEventImpl);
-
-                    std::lock_guard<std::mutex> lk(spEventImpl->m_mutex);
-
-                    if(!spEventImpl->isReady())
-                    {
-// Workaround: Clang can not support this when natively compiling device code. See ConcurrentExecPool.hpp.
-#if !(BOOST_COMP_CLANG_CUDA && BOOST_ARCH_PTX)
-                        auto const enqueueCount = spEventImpl->m_enqueueCount;
-
-                        // Enqueue a task that waits for the given event.
-                        queueImpl.m_workerThread.enqueueTask(
-                            [spEventImpl, enqueueCount]()
-                            {
-                                std::unique_lock<std::mutex> lk2(spEventImpl->m_mutex);
-                                spEventImpl->wait(enqueueCount, lk2);
-                            });
-#endif
-                    }
-                }
-            };
-            //#############################################################################
-            //! The CPU non-blocking device queue event wait trait specialization.
-            template<>
-            struct WaiterWaitFor<
-                queue::QueueCpuNonBlocking,
-                event::EventCpu>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto waiterWaitFor(
-                    queue::QueueCpuNonBlocking & queue,
-                    event::EventCpu const & event)
-                -> void
-                {
-                    wait::wait(*queue.m_spQueueImpl, event);
-                }
-            };
-            //#############################################################################
-            //! The CPU blocking device queue event wait trait specialization.
-            template<>
-            struct WaiterWaitFor<
-                queue::cpu::detail::QueueCpuBlockingImpl,
-                event::EventCpu>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto waiterWaitFor(
-                    queue::cpu::detail::QueueCpuBlockingImpl & queueImpl,
-                    event::EventCpu const & event)
-                -> void
-                {
-                    alpaka::ignore_unused(queueImpl);
-
-                    // NOTE: Difference to non-blocking version: directly wait for event.
-                    wait::wait(*event.m_spEventImpl);
-                }
-            };
-            //#############################################################################
-            //! The CPU blocking device queue event wait trait specialization.
-            template<>
-            struct WaiterWaitFor<
-                queue::QueueCpuBlocking,
-                event::EventCpu>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto waiterWaitFor(
-                    queue::QueueCpuBlocking & queue,
-                    event::EventCpu const & event)
-                -> void
-                {
-                    wait::wait(*queue.m_spQueueImpl, event);
-                }
-            };
-            //#############################################################################
-            //! The CPU non-blocking device event wait trait specialization.
-            //!
-            //! Any future work submitted in any queue of this device will wait for event to complete before beginning execution.
-            template<>
-            struct WaiterWaitFor<
-                dev::DevCpu,
-                event::EventCpu>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto waiterWaitFor(
-                    dev::DevCpu & dev,
-                    event::EventCpu const & event)
-                -> void
-                {
-                    // Get all the queues on the device at the time of invocation.
-                    // All queues added afterwards are ignored.
-                    auto vspQueues(
-                        dev.m_spDevCpuImpl->GetAllQueues());
-
-                    // Let all the queues wait for this event.
-                    // Furthermore there should not even be a chance to enqueue something between getting the queues and adding our wait events!
-                    for(auto && spQueue : vspQueues)
-                    {
-                        spQueue->wait(event);
-                    }
-                }
-            };
-
-            //#############################################################################
-            //! The CPU non-blocking device queue thread wait trait specialization.
-            //!
-            //! Blocks execution of the calling thread until the queue has finished processing all previously requested tasks (kernels, data copies, ...)
-            template<>
-            struct CurrentThreadWaitFor<
-                queue::QueueCpuNonBlocking>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto currentThreadWaitFor(
-                    queue::QueueCpuNonBlocking const & queue)
-                -> void
-                {
-                    event::EventCpu event(
-                        dev::getDev(queue));
-                    queue::enqueue(
-                        const_cast<queue::QueueCpuNonBlocking &>(queue),
-                        event);
-                    wait::wait(
-                        event);
-                }
-            };
-        }
-    }
-}
diff --git a/thirdParty/alpaka/include/alpaka/event/EventCudaRt.hpp b/thirdParty/alpaka/include/alpaka/event/EventCudaRt.hpp
deleted file mode 100644
index abcd94a725..0000000000
--- a/thirdParty/alpaka/include/alpaka/event/EventCudaRt.hpp
+++ /dev/null
@@ -1,334 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_CUDA
-    #error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
-#endif
-
-#include <alpaka/dev/DevCudaRt.hpp>
-#include <alpaka/dev/Traits.hpp>
-#include <alpaka/event/Traits.hpp>
-#include <alpaka/wait/Traits.hpp>
-
-#include <alpaka/queue/QueueCudaRtNonBlocking.hpp>
-#include <alpaka/queue/QueueCudaRtBlocking.hpp>
-#include <alpaka/core/Cuda.hpp>
-
-#include <stdexcept>
-#include <memory>
-#include <functional>
-
-namespace alpaka
-{
-    namespace event
-    {
-        namespace cuda
-        {
-            namespace detail
-            {
-                //#############################################################################
-                //! The CUDA RT device event implementation.
-                class EventCudaImpl final
-                {
-                public:
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST EventCudaImpl(
-                        dev::DevCudaRt const & dev,
-                        bool bBusyWait) :
-                            m_dev(dev),
-                            m_CudaEvent()
-                    {
-                        ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                        // Set the current device.
-                        ALPAKA_CUDA_RT_CHECK(
-                            cudaSetDevice(
-                                m_dev.m_iDevice));
-                        // Create the event on the current device with the specified flags. Valid flags include:
-                        // - cudaEventDefault: Default event creation flag.
-                        // - cudaEventBlockingSync : Specifies that event should use blocking synchronization.
-                        //   A host thread that uses cudaEventSynchronize() to wait on an event created with this flag will block until the event actually completes.
-                        // - cudaEventDisableTiming : Specifies that the created event does not need to record timing data.
-                        //   Events created with this flag specified and the cudaEventBlockingSync flag not specified will provide the best performance when used with cudaStreamWaitEvent() and cudaEventQuery().
-                        ALPAKA_CUDA_RT_CHECK(
-                            cudaEventCreateWithFlags(
-                                &m_CudaEvent,
-                                (bBusyWait ? cudaEventDefault : cudaEventBlockingSync) | cudaEventDisableTiming));
-                    }
-                    //-----------------------------------------------------------------------------
-                    EventCudaImpl(EventCudaImpl const &) = delete;
-                    //-----------------------------------------------------------------------------
-                    EventCudaImpl(EventCudaImpl &&) = default;
-                    //-----------------------------------------------------------------------------
-                    auto operator=(EventCudaImpl const &) -> EventCudaImpl & = delete;
-                    //-----------------------------------------------------------------------------
-                    auto operator=(EventCudaImpl &&) -> EventCudaImpl & = delete;
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST ~EventCudaImpl()
-                    {
-                        ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                        // Set the current device. \TODO: Is setting the current device before cudaEventDestroy required?
-                        ALPAKA_CUDA_RT_CHECK(cudaSetDevice(
-                            m_dev.m_iDevice));
-                        // In case event has been recorded but has not yet been completed when cudaEventDestroy() is called, the function will return immediately
-                        // and the resources associated with event will be released automatically once the device has completed event.
-                        // -> No need to synchronize here.
-                        ALPAKA_CUDA_RT_CHECK(cudaEventDestroy(
-                            m_CudaEvent));
-                    }
-
-                public:
-                    dev::DevCudaRt const m_dev;   //!< The device this event is bound to.
-                    cudaEvent_t m_CudaEvent;
-                };
-            }
-        }
-
-        //#############################################################################
-        //! The CUDA RT device event.
-        class EventCudaRt final : public concepts::Implements<wait::ConceptCurrentThreadWaitFor, EventCudaRt>
-        {
-        public:
-            //-----------------------------------------------------------------------------
-            ALPAKA_FN_HOST EventCudaRt(
-                dev::DevCudaRt const & dev,
-                bool bBusyWait = true) :
-                    m_spEventImpl(std::make_shared<cuda::detail::EventCudaImpl>(dev, bBusyWait))
-            {
-                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-            }
-            //-----------------------------------------------------------------------------
-            EventCudaRt(EventCudaRt const &) = default;
-            //-----------------------------------------------------------------------------
-            EventCudaRt(EventCudaRt &&) = default;
-            //-----------------------------------------------------------------------------
-            auto operator=(EventCudaRt const &) -> EventCudaRt & = default;
-            //-----------------------------------------------------------------------------
-            auto operator=(EventCudaRt &&) -> EventCudaRt & = default;
-            //-----------------------------------------------------------------------------
-            ALPAKA_FN_HOST auto operator==(EventCudaRt const & rhs) const
-            -> bool
-            {
-                return (m_spEventImpl == rhs.m_spEventImpl);
-            }
-            //-----------------------------------------------------------------------------
-            ALPAKA_FN_HOST auto operator!=(EventCudaRt const & rhs) const
-            -> bool
-            {
-                return !((*this) == rhs);
-            }
-            //-----------------------------------------------------------------------------
-            ~EventCudaRt() = default;
-
-        public:
-            std::shared_ptr<cuda::detail::EventCudaImpl> m_spEventImpl;
-        };
-    }
-
-    namespace dev
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CUDA RT device event device get trait specialization.
-            template<>
-            struct GetDev<
-                event::EventCudaRt>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto getDev(
-                    event::EventCudaRt const & event)
-                -> dev::DevCudaRt
-                {
-                    return event.m_spEventImpl->m_dev;
-                }
-            };
-        }
-    }
-    namespace event
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CUDA RT device event test trait specialization.
-            template<>
-            struct Test<
-                event::EventCudaRt>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto test(
-                    event::EventCudaRt const & event)
-                -> bool
-                {
-                    ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                    // Query is allowed even for events on non current device.
-                    cudaError_t ret = cudaSuccess;
-                    ALPAKA_CUDA_RT_CHECK_IGNORE(
-                        ret = cudaEventQuery(
-                            event.m_spEventImpl->m_CudaEvent),
-                        cudaErrorNotReady);
-                    return (ret == cudaSuccess);
-                }
-            };
-        }
-    }
-    namespace queue
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CUDA RT queue enqueue trait specialization.
-            template<>
-            struct Enqueue<
-                queue::QueueCudaRtNonBlocking,
-                event::EventCudaRt>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto enqueue(
-                    queue::QueueCudaRtNonBlocking & queue,
-                    event::EventCudaRt & event)
-                -> void
-                {
-                    ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                    ALPAKA_CUDA_RT_CHECK(cudaEventRecord(
-                        event.m_spEventImpl->m_CudaEvent,
-                        queue.m_spQueueImpl->m_CudaQueue));
-                }
-            };
-            //#############################################################################
-            //! The CUDA RT queue enqueue trait specialization.
-            template<>
-            struct Enqueue<
-                queue::QueueCudaRtBlocking,
-                event::EventCudaRt>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto enqueue(
-                    queue::QueueCudaRtBlocking & queue,
-                    event::EventCudaRt & event)
-                -> void
-                {
-                    ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                    ALPAKA_CUDA_RT_CHECK(cudaEventRecord(
-                        event.m_spEventImpl->m_CudaEvent,
-                        queue.m_spQueueImpl->m_CudaQueue));
-                }
-            };
-        }
-    }
-    namespace wait
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CUDA RT device event thread wait trait specialization.
-            //!
-            //! Waits until the event itself and therefore all tasks preceding it in the queue it is enqueued to have been completed.
-            //! If the event is not enqueued to a queue the method returns immediately.
-            template<>
-            struct CurrentThreadWaitFor<
-                event::EventCudaRt>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto currentThreadWaitFor(
-                    event::EventCudaRt const & event)
-                -> void
-                {
-                    ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                    // Sync is allowed even for events on non current device.
-                    ALPAKA_CUDA_RT_CHECK(cudaEventSynchronize(
-                        event.m_spEventImpl->m_CudaEvent));
-                }
-            };
-            //#############################################################################
-            //! The CUDA RT queue event wait trait specialization.
-            template<>
-            struct WaiterWaitFor<
-                queue::QueueCudaRtNonBlocking,
-                event::EventCudaRt>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto waiterWaitFor(
-                    queue::QueueCudaRtNonBlocking & queue,
-                    event::EventCudaRt const & event)
-                -> void
-                {
-                    ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                    ALPAKA_CUDA_RT_CHECK(cudaStreamWaitEvent(
-                        queue.m_spQueueImpl->m_CudaQueue,
-                        event.m_spEventImpl->m_CudaEvent,
-                        0));
-                }
-            };
-            //#############################################################################
-            //! The CUDA RT queue event wait trait specialization.
-            template<>
-            struct WaiterWaitFor<
-                queue::QueueCudaRtBlocking,
-                event::EventCudaRt>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto waiterWaitFor(
-                    queue::QueueCudaRtBlocking & queue,
-                    event::EventCudaRt const & event)
-                -> void
-                {
-                    ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                    ALPAKA_CUDA_RT_CHECK(cudaStreamWaitEvent(
-                        queue.m_spQueueImpl->m_CudaQueue,
-                        event.m_spEventImpl->m_CudaEvent,
-                        0));
-                }
-            };
-            //#############################################################################
-            //! The CUDA RT device event wait trait specialization.
-            //!
-            //! Any future work submitted in any queue of this device will wait for event to complete before beginning execution.
-            template<>
-            struct WaiterWaitFor<
-                dev::DevCudaRt,
-                event::EventCudaRt>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto waiterWaitFor(
-                    dev::DevCudaRt & dev,
-                    event::EventCudaRt const & event)
-                -> void
-                {
-                    ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                    // Set the current device.
-                    ALPAKA_CUDA_RT_CHECK(
-                        cudaSetDevice(
-                            dev.m_iDevice));
-
-                    ALPAKA_CUDA_RT_CHECK(cudaStreamWaitEvent(
-                        nullptr,
-                        event.m_spEventImpl->m_CudaEvent,
-                        0));
-                }
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/alpaka/include/alpaka/event/EventHipRt.hpp b/thirdParty/alpaka/include/alpaka/event/EventHipRt.hpp
deleted file mode 100644
index 295e780c16..0000000000
--- a/thirdParty/alpaka/include/alpaka/event/EventHipRt.hpp
+++ /dev/null
@@ -1,352 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz, Matthias Werner
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_HIP
-    #error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
-#endif
-
-#include <alpaka/dev/DevHipRt.hpp>
-#include <alpaka/dev/Traits.hpp>
-#include <alpaka/event/Traits.hpp>
-#include <alpaka/wait/Traits.hpp>
-
-#include <alpaka/queue/QueueHipRtNonBlocking.hpp>
-#include <alpaka/queue/QueueHipRtBlocking.hpp>
-#include <alpaka/core/Hip.hpp>
-
-#include <stdexcept>
-#include <memory>
-#include <functional>
-
-namespace alpaka
-{
-    namespace event
-    {
-        namespace hip
-        {
-            namespace detail
-            {
-                //#############################################################################
-                //! The HIP RT device event implementation.
-                class EventHipImpl final
-                {
-                public:
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST EventHipImpl(
-                        dev::DevHipRt const & dev,
-                        bool bBusyWait) :
-                            m_dev(dev),
-                            m_HipEvent()
-                    {
-                        ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                        // Set the current device.
-                        ALPAKA_HIP_RT_CHECK(
-                            hipSetDevice(
-                                m_dev.m_iDevice));
-                        // Create the event on the current device with the specified flags. Valid flags include:
-                        // - hipEventDefault: Default event creation flag.
-                        // - hipEventBlockingSync : Specifies that event should use blocking synchronization.
-                        //   A host thread that uses hipEventSynchronize() to wait on an event created with this flag will block until the event actually completes.
-                        // - hipEventDisableTiming : Specifies that the created event does not need to record timing data.
-                        //   Events created with this flag specified and the hipEventBlockingSync flag not specified will provide the best performance when used with hipQueueWaitEvent() and hipEventQuery().
-                        ALPAKA_HIP_RT_CHECK(
-                            hipEventCreateWithFlags(
-                                &m_HipEvent,
-                                (bBusyWait ? hipEventDefault : hipEventBlockingSync) | hipEventDisableTiming));
-                    }
-                    //-----------------------------------------------------------------------------
-                    EventHipImpl(EventHipImpl const &) = delete;
-                    //-----------------------------------------------------------------------------
-                    EventHipImpl(EventHipImpl &&) = default;
-                    //-----------------------------------------------------------------------------
-                    auto operator=(EventHipImpl const &) -> EventHipImpl & = delete;
-                    //-----------------------------------------------------------------------------
-                    auto operator=(EventHipImpl &&) -> EventHipImpl & = delete;
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST ~EventHipImpl()
-                    {
-                        ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                        // Set the current device. \TODO: Is setting the current device before hipEventDestroy required?
-                        ALPAKA_HIP_RT_CHECK(hipSetDevice(
-                            m_dev.m_iDevice));
-                        // In case event has been recorded but has not yet been completed when hipEventDestroy() is called, the function will return immediately
-                        // and the resources associated with event will be released automatically once the device has completed event.
-                        // -> No need to synchronize here.
-                        ALPAKA_HIP_RT_CHECK(hipEventDestroy(
-                            m_HipEvent));
-                    }
-
-                public:
-                    dev::DevHipRt const m_dev;   //!< The device this event is bound to.
-                    hipEvent_t m_HipEvent;
-                };
-            }
-        }
-
-        //#############################################################################
-        //! The HIP RT device event.
-        class EventHipRt final : public concepts::Implements<wait::ConceptCurrentThreadWaitFor, EventHipRt>
-        {
-        public:
-            //-----------------------------------------------------------------------------
-            //! Constructor.
-            ALPAKA_FN_HOST EventHipRt(
-                dev::DevHipRt const & dev,
-                bool bBusyWait = true) :
-                    m_spEventImpl(std::make_shared<hip::detail::EventHipImpl>(dev, bBusyWait))
-            {
-                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-            }
-            //-----------------------------------------------------------------------------
-            //! Copy constructor.
-            EventHipRt(EventHipRt const &) = default;
-            //-----------------------------------------------------------------------------
-            //! Move constructor.
-            EventHipRt(EventHipRt &&) = default;
-            //-----------------------------------------------------------------------------
-            //! Copy assignment operator.
-            auto operator=(EventHipRt const &) -> EventHipRt & = default;
-            //-----------------------------------------------------------------------------
-            //! Move assignment operator.
-            auto operator=(EventHipRt &&) -> EventHipRt & = default;
-            //-----------------------------------------------------------------------------
-            //! Equality comparison operator.
-            auto operator==(EventHipRt const & rhs) const
-            -> bool
-            {
-                return (m_spEventImpl->m_HipEvent == rhs.m_spEventImpl->m_HipEvent);
-            }
-            //-----------------------------------------------------------------------------
-            //! Equality comparison operator.
-            auto operator!=(EventHipRt const & rhs) const
-            -> bool
-            {
-                return !((*this) == rhs);
-            }
-            //-----------------------------------------------------------------------------
-            //! Destructor.
-            ALPAKA_FN_HOST_ACC ~EventHipRt() = default;
-
-        public:
-            std::shared_ptr<hip::detail::EventHipImpl> m_spEventImpl;
-        };
-    }
-
-    namespace dev
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The HIP RT device event device get trait specialization.
-            template<>
-            struct GetDev<
-                event::EventHipRt>
-            {
-                //-----------------------------------------------------------------------------
-
-                ALPAKA_FN_HOST static auto getDev(
-                    event::EventHipRt const & event)
-                -> dev::DevHipRt
-                {
-                    return event.m_spEventImpl->m_dev;
-                }
-            };
-        }
-    }
-    namespace event
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The HIP RT device event test trait specialization.
-            template<>
-            struct Test<
-                event::EventHipRt>
-            {
-                //-----------------------------------------------------------------------------
-
-                ALPAKA_FN_HOST static auto test(
-                    event::EventHipRt const & event)
-                -> bool
-                {
-                    ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                    // Query is allowed even for events on non current device.
-                    hipError_t ret = hipSuccess;
-                    ALPAKA_HIP_RT_CHECK_IGNORE(
-                        ret = hipEventQuery(
-                            event.m_spEventImpl->m_HipEvent),
-                        hipErrorNotReady);
-                    return (ret == hipSuccess);
-                }
-            };
-        }
-    }
-    namespace queue
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The HIP RT queue enqueue trait specialization.
-            template<>
-            struct Enqueue<
-                queue::QueueHipRtNonBlocking,
-                event::EventHipRt>
-            {
-                //-----------------------------------------------------------------------------
-
-                ALPAKA_FN_HOST static auto enqueue(
-                    queue::QueueHipRtNonBlocking & queue,
-                    event::EventHipRt & event)
-                -> void
-                {
-                    ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                    ALPAKA_HIP_RT_CHECK(hipEventRecord(
-                        event.m_spEventImpl->m_HipEvent,
-                        queue.m_spQueueImpl->m_HipQueue));
-                }
-            };
-            //#############################################################################
-            //! The HIP RT queue enqueue trait specialization.
-            template<>
-            struct Enqueue<
-                queue::QueueHipRtBlocking,
-                event::EventHipRt>
-            {
-                //-----------------------------------------------------------------------------
-
-                ALPAKA_FN_HOST static auto enqueue(
-                    queue::QueueHipRtBlocking & queue,
-                    event::EventHipRt & event)
-                -> void
-                {
-                    ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                    ALPAKA_HIP_RT_CHECK(hipEventRecord(
-                        event.m_spEventImpl->m_HipEvent,
-                        queue.m_spQueueImpl->m_HipQueue));
-                }
-            };
-        }
-    }
-    namespace wait
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The HIP RT device event thread wait trait specialization.
-            //!
-            //! Waits until the event itself and therefore all tasks preceding it in the queue it is enqueued to have been completed.
-            //! If the event is not enqueued to a queue the method returns immediately.
-            //#############################################################################
-            template<>
-            struct CurrentThreadWaitFor<
-                event::EventHipRt>
-            {
-                //-----------------------------------------------------------------------------
-
-                ALPAKA_FN_HOST static auto currentThreadWaitFor(
-                    event::EventHipRt const & event)
-                -> void
-                {
-                    ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                    // Sync is allowed even for events on non current device.
-                    ALPAKA_HIP_RT_CHECK(hipEventSynchronize(
-                        event.m_spEventImpl->m_HipEvent));
-                }
-            };
-            //#############################################################################
-            //! The HIP RT queue event wait trait specialization.
-            template<>
-            struct WaiterWaitFor<
-                queue::QueueHipRtNonBlocking,
-                event::EventHipRt>
-            {
-                //-----------------------------------------------------------------------------
-
-                ALPAKA_FN_HOST static auto waiterWaitFor(
-                    queue::QueueHipRtNonBlocking & queue,
-                    event::EventHipRt const & event)
-                -> void
-                {
-                    ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                    ALPAKA_HIP_RT_CHECK(hipStreamWaitEvent(
-                        queue.m_spQueueImpl->m_HipQueue,
-                        event.m_spEventImpl->m_HipEvent,
-                        0));
-                }
-            };
-            //#############################################################################
-            //! The HIP RT queue event wait trait specialization.
-            template<>
-            struct WaiterWaitFor<
-                queue::QueueHipRtBlocking,
-                event::EventHipRt>
-            {
-                //-----------------------------------------------------------------------------
-
-                ALPAKA_FN_HOST static auto waiterWaitFor(
-                    queue::QueueHipRtBlocking & queue,
-                    event::EventHipRt const & event)
-                -> void
-                {
-                    ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                    ALPAKA_HIP_RT_CHECK(hipStreamWaitEvent(
-                        queue.m_spQueueImpl->m_HipQueue,
-                        event.m_spEventImpl->m_HipEvent,
-                        0));
-                }
-            };
-            //#############################################################################
-            //! The HIP RT device event wait trait specialization.
-            //!
-            //! Any future work submitted in any queue of this device will wait for event to complete before beginning execution.
-            //#############################################################################
-            template<>
-            struct WaiterWaitFor<
-                dev::DevHipRt,
-                event::EventHipRt>
-            {
-                //-----------------------------------------------------------------------------
-
-                ALPAKA_FN_HOST static auto waiterWaitFor(
-                    dev::DevHipRt & dev,
-                    event::EventHipRt const & event)
-                -> void
-                {
-                    ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                    // Set the current device.
-                    ALPAKA_HIP_RT_CHECK(
-                        hipSetDevice(
-                            dev.m_iDevice));
-
-                    ALPAKA_HIP_RT_CHECK(hipStreamWaitEvent(
-                        nullptr,
-                        event.m_spEventImpl->m_HipEvent,
-                        0));
-                }
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/alpaka/include/alpaka/event/Traits.hpp b/thirdParty/alpaka/include/alpaka/event/Traits.hpp
deleted file mode 100644
index a828b4094c..0000000000
--- a/thirdParty/alpaka/include/alpaka/event/Traits.hpp
+++ /dev/null
@@ -1,62 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#include <alpaka/core/Common.hpp>
-
-#include <alpaka/dev/Traits.hpp>
-
-namespace alpaka
-{
-    //-----------------------------------------------------------------------------
-    //! The event management specifics.
-    namespace event
-    {
-        //-----------------------------------------------------------------------------
-        //! The event management traits.
-        namespace traits
-        {
-            //#############################################################################
-            //! The event type trait.
-            template<
-                typename T,
-                typename TSfinae = void>
-            struct EventType;
-
-            //#############################################################################
-            //! The event tester trait.
-            template<
-                typename TEvent,
-                typename TSfinae = void>
-            struct Test;
-        }
-
-        //#############################################################################
-        //! The event type trait alias template to remove the ::type.
-        template<
-            typename T>
-        using Event = typename traits::EventType<T>::type;
-
-        //-----------------------------------------------------------------------------
-        //! Tests if the given event has already been completed.
-        template<
-            typename TEvent>
-        ALPAKA_FN_HOST auto test(
-            TEvent const & event)
-        -> bool
-        {
-            return
-                traits::Test<
-                    TEvent>
-                ::test(
-                    event);
-        }
-    }
-}
diff --git a/thirdParty/alpaka/include/alpaka/extent/Traits.hpp b/thirdParty/alpaka/include/alpaka/extent/Traits.hpp
deleted file mode 100644
index 48b05ff3ed..0000000000
--- a/thirdParty/alpaka/include/alpaka/extent/Traits.hpp
+++ /dev/null
@@ -1,254 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#include <alpaka/core/Common.hpp>
-#include <alpaka/core/Unused.hpp>
-#include <alpaka/dim/DimIntegralConst.hpp>
-#include <alpaka/meta/Fold.hpp>
-#include <alpaka/idx/Traits.hpp>
-#include <alpaka/meta/IntegerSequence.hpp>
-
-#include <type_traits>
-#include <functional>
-
-namespace alpaka
-{
-    //-----------------------------------------------------------------------------
-    //! The extent specifics.
-    namespace extent
-    {
-        //-----------------------------------------------------------------------------
-        //! The extent traits.
-        namespace traits
-        {
-            //#############################################################################
-            //! The extent get trait.
-            //!
-            //! If not specialized explicitly it returns 1.
-            template<
-                typename TIdxIntegralConst,
-                typename TExtent,
-                typename TSfinae = void>
-            struct GetExtent
-            {
-                ALPAKA_NO_HOST_ACC_WARNING
-                ALPAKA_FN_HOST_ACC static auto getExtent(
-                    TExtent const &)
-                -> idx::Idx<TExtent>
-                {
-                    return static_cast<idx::Idx<TExtent>>(1);
-                }
-            };
-
-            //#############################################################################
-            //! The extent set trait.
-            template<
-                typename TIdxIntegralConst,
-                typename TExtent,
-                typename TExtentVal,
-                typename TSfinae = void>
-            struct SetExtent;
-        }
-
-        //-----------------------------------------------------------------------------
-        //! \return The extent in the given dimension.
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            std::size_t Tidx,
-            typename TExtent>
-        ALPAKA_FN_HOST_ACC auto getExtent(
-            TExtent const & extent = TExtent())
-        -> idx::Idx<TExtent>
-        {
-            return
-                traits::GetExtent<
-                    dim::DimInt<Tidx>,
-                    TExtent>
-                ::getExtent(
-                    extent);
-        }
-        //-----------------------------------------------------------------------------
-        //! \return The width.
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            typename TExtent>
-        ALPAKA_FN_HOST_ACC auto getWidth(
-            TExtent const & extent = TExtent())
-        -> idx::Idx<TExtent>
-        {
-            return getExtent<dim::Dim<TExtent>::value - 1u>(extent);
-        }
-        //-----------------------------------------------------------------------------
-        //! \return The height.
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            typename TExtent>
-        ALPAKA_FN_HOST_ACC auto getHeight(
-            TExtent const & extent = TExtent())
-        -> idx::Idx<TExtent>
-        {
-            return getExtent<dim::Dim<TExtent>::value - 2u>(extent);
-        }
-        //-----------------------------------------------------------------------------
-        //! \return The depth.
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            typename TExtent>
-        ALPAKA_FN_HOST_ACC auto getDepth(
-            TExtent const & extent = TExtent())
-        -> idx::Idx<TExtent>
-        {
-            return getExtent<dim::Dim<TExtent>::value - 3u>(extent);
-        }
-
-        namespace detail
-        {
-            //-----------------------------------------------------------------------------
-            ALPAKA_NO_HOST_ACC_WARNING
-            template<
-                typename TExtent,
-                size_t... TIndices>
-            ALPAKA_FN_HOST_ACC auto getExtentProductInternal(
-                TExtent const & extent,
-                alpaka::meta::IndexSequence<TIndices...> const & indices)
-            -> idx::Idx<TExtent>
-            {
-                alpaka::ignore_unused(indices);
-
-                return
-                    meta::foldr(
-                        std::multiplies<idx::Idx<TExtent>>(),
-                        getExtent<TIndices>(extent)...);
-            }
-        }
-
-        //-----------------------------------------------------------------------------
-        //! \return The product of the extent.
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            typename TExtent>
-        ALPAKA_FN_HOST_ACC auto getExtentProduct(
-            TExtent const & extent = TExtent())
-        -> idx::Idx<TExtent>
-        {
-            using IdxSequence = alpaka::meta::MakeIndexSequence<dim::Dim<TExtent>::value>;
-            return
-                detail::getExtentProductInternal(
-                    extent,
-                    IdxSequence());
-        }
-
-        //-----------------------------------------------------------------------------
-        //! Sets the extent in the given dimension.
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            std::size_t Tidx,
-            typename TExtent,
-            typename TExtentVal>
-        ALPAKA_FN_HOST_ACC auto setExtent(
-            TExtent & extent,
-            TExtentVal const & extentVal)
-        -> void
-        {
-            traits::SetExtent<
-                dim::DimInt<Tidx>,
-                TExtent,
-                TExtentVal>
-            ::setExtent(
-                extent,
-                extentVal);
-        }
-        //-----------------------------------------------------------------------------
-        //! Sets the width.
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            typename TExtent,
-            typename TWidth>
-        ALPAKA_FN_HOST_ACC auto setWidth(
-            TExtent & extent,
-            TWidth const & width)
-        -> void
-        {
-            setExtent<dim::Dim<TExtent>::value - 1u>(extent, width);
-        }
-        //-----------------------------------------------------------------------------
-        //! Sets the height.
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            typename TExtent,
-            typename THeight>
-        ALPAKA_FN_HOST_ACC auto setHeight(
-            TExtent & extent,
-            THeight const & height)
-        -> void
-        {
-            setExtent<dim::Dim<TExtent>::value - 2u>(extent, height);
-        }
-        //-----------------------------------------------------------------------------
-        //! Sets the depth.
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            typename TExtent,
-            typename TDepth>
-        ALPAKA_FN_HOST_ACC auto setDepth(
-            TExtent & extent,
-            TDepth const & depth)
-        -> void
-        {
-            setExtent<dim::Dim<TExtent>::value - 3u>(extent, depth);
-        }
-
-        //-----------------------------------------------------------------------------
-        // Trait specializations for unsigned integral types.
-        namespace traits
-        {
-            //#############################################################################
-            //! The unsigned integral width get trait specialization.
-            template<
-                typename TExtent>
-            struct GetExtent<
-                dim::DimInt<0u>,
-                TExtent,
-                typename std::enable_if<
-                    std::is_integral<TExtent>::value>::type>
-            {
-                ALPAKA_NO_HOST_ACC_WARNING
-                ALPAKA_FN_HOST_ACC static auto getExtent(
-                    TExtent const & extent)
-                -> idx::Idx<TExtent>
-                {
-                    return extent;
-                }
-            };
-            //#############################################################################
-            //! The unsigned integral width set trait specialization.
-            template<
-                typename TExtent,
-                typename TExtentVal>
-            struct SetExtent<
-                dim::DimInt<0u>,
-                TExtent,
-                TExtentVal,
-                typename std::enable_if<
-                    std::is_integral<TExtent>::value>::type>
-            {
-                ALPAKA_NO_HOST_ACC_WARNING
-                ALPAKA_FN_HOST_ACC static auto setExtent(
-                    TExtent const & extent,
-                    TExtentVal const & extentVal)
-                -> void
-                {
-                    extent = extentVal;
-                }
-            };
-        }
-    }
-}
diff --git a/thirdParty/alpaka/include/alpaka/idx/Accessors.hpp b/thirdParty/alpaka/include/alpaka/idx/Accessors.hpp
deleted file mode 100644
index 86bef954d2..0000000000
--- a/thirdParty/alpaka/include/alpaka/idx/Accessors.hpp
+++ /dev/null
@@ -1,215 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#include <alpaka/core/Positioning.hpp>
-#include <alpaka/core/Common.hpp>
-#include <alpaka/core/Concepts.hpp>
-
-#include <alpaka/vec/Vec.hpp>
-
-#include <alpaka/idx/Traits.hpp>
-#include <alpaka/dim/Traits.hpp>
-#include <alpaka/dim/DimIntegralConst.hpp>
-#include <alpaka/workdiv/Traits.hpp>
-
-#include <alpaka/core/Unused.hpp>
-
-#include <boost/config.hpp>
-
-#include <utility>
-
-namespace alpaka
-{
-    namespace idx
-    {
-        //-----------------------------------------------------------------------------
-        //! Get the indices requested.
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            typename TOrigin,
-            typename TUnit,
-            typename TIdx,
-            typename TWorkDiv>
-        ALPAKA_FN_HOST_ACC auto getIdx(
-            TIdx const & idx,
-            TWorkDiv const & workDiv)
-        -> vec::Vec<dim::Dim<TWorkDiv>, idx::Idx<TIdx>>
-        {
-            return
-                traits::GetIdx<
-                    TIdx,
-                    TOrigin,
-                    TUnit>
-                ::getIdx(
-                    idx,
-                    workDiv);
-        }
-        //-----------------------------------------------------------------------------
-        //! Get the indices requested.
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            typename TOrigin,
-            typename TUnit,
-            typename TIdxWorkDiv>
-        ALPAKA_FN_HOST_ACC auto getIdx(
-            TIdxWorkDiv const & idxWorkDiv)
-        -> vec::Vec<dim::Dim<TIdxWorkDiv>, idx::Idx<TIdxWorkDiv>>
-        {
-            return
-                traits::GetIdx<
-                    TIdxWorkDiv,
-                    TOrigin,
-                    TUnit>
-                ::getIdx(
-                    idxWorkDiv,
-                    idxWorkDiv);
-        }
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The grid block index get trait specialization for classes with IdxGbBase member type.
-            template<
-                typename TIdxGb>
-            struct GetIdx<
-                TIdxGb,
-                origin::Grid,
-                unit::Blocks>
-            {
-                using ImplementationBase = concepts::ImplementationBase<ConceptIdxGb, TIdxGb>;
-                //-----------------------------------------------------------------------------
-                //! \return The index of the current thread in the grid.
-                ALPAKA_NO_HOST_ACC_WARNING
-                template<
-                    typename TWorkDiv>
-                ALPAKA_FN_HOST_ACC static auto getIdx(
-                    TIdxGb const & idx,
-                    TWorkDiv const & workDiv)
-                -> vec::Vec<dim::Dim<ImplementationBase>, idx::Idx<ImplementationBase>>
-                {
-                    return
-                        traits::GetIdx<
-                            ImplementationBase,
-                            origin::Grid,
-                            unit::Blocks>
-                        ::getIdx(
-                            idx,
-                            workDiv);
-                }
-            };
-
-            //#############################################################################
-            //! The block thread index get trait specialization for classes with IdxBtBase member type.
-            template<
-                typename TIdxBt>
-            struct GetIdx<
-                TIdxBt,
-                origin::Block,
-                unit::Threads>
-            {
-                using ImplementationBase = concepts::ImplementationBase<ConceptIdxBt, TIdxBt>;
-                //-----------------------------------------------------------------------------
-                //! \return The index of the current thread in the grid.
-                ALPAKA_NO_HOST_ACC_WARNING
-                template<
-                    typename TWorkDiv>
-                ALPAKA_FN_HOST_ACC static auto getIdx(
-                    TIdxBt const & idx,
-                    TWorkDiv const & workDiv)
-                -> vec::Vec<dim::Dim<ImplementationBase>, idx::Idx<ImplementationBase>>
-                {
-                    return
-                        traits::GetIdx<
-                            ImplementationBase,
-                            origin::Block,
-                            unit::Threads>
-                        ::getIdx(
-                            idx,
-                            workDiv);
-                }
-            };
-
-            //#############################################################################
-            //! The grid thread index get trait specialization.
-            template<
-                typename TIdx>
-            struct GetIdx<
-                TIdx,
-                origin::Grid,
-                unit::Threads>
-            {
-                //-----------------------------------------------------------------------------
-                //! \return The index of the current thread in the grid.
-                ALPAKA_NO_HOST_ACC_WARNING
-                template<
-                    typename TWorkDiv>
-                ALPAKA_FN_HOST_ACC static auto getIdx(
-                    TIdx const & idx,
-                    TWorkDiv const & workDiv)
-#ifdef BOOST_NO_CXX14_RETURN_TYPE_DEDUCTION
-                -> decltype(
-                    idx::getIdx<origin::Grid, unit::Blocks>(idx, workDiv)
-                    * workdiv::getWorkDiv<origin::Block, unit::Threads>(workDiv)
-                    + idx::getIdx<origin::Block, unit::Threads>(idx, workDiv))
-#endif
-                {
-                    return
-                        idx::getIdx<origin::Grid, unit::Blocks>(idx, workDiv)
-                        * workdiv::getWorkDiv<origin::Block, unit::Threads>(workDiv)
-                        + idx::getIdx<origin::Block, unit::Threads>(idx, workDiv);
-                }
-            };
-        }
-        //-----------------------------------------------------------------------------
-        //! Get the index of the first element this thread computes.
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            typename TIdxWorkDiv,
-            typename TGridThreadIdx,
-            typename TThreadElemExtent>
-        ALPAKA_FN_HOST_ACC auto getIdxThreadFirstElem(
-            TIdxWorkDiv const & idxWorkDiv,
-            TGridThreadIdx const & gridThreadIdx,
-            TThreadElemExtent const & threadElemExtent)
-        -> vec::Vec<dim::Dim<TIdxWorkDiv>, idx::Idx<TIdxWorkDiv>>
-        {
-            alpaka::ignore_unused(idxWorkDiv);
-
-            return gridThreadIdx * threadElemExtent;
-        }
-        //-----------------------------------------------------------------------------
-        //! Get the index of the first element this thread computes.
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            typename TIdxWorkDiv,
-            typename TGridThreadIdx>
-        ALPAKA_FN_HOST_ACC auto getIdxThreadFirstElem(
-            TIdxWorkDiv const & idxWorkDiv,
-            TGridThreadIdx const & gridThreadIdx)
-        -> vec::Vec<dim::Dim<TIdxWorkDiv>, idx::Idx<TIdxWorkDiv>>
-        {
-            auto const threadElemExtent(alpaka::workdiv::getWorkDiv<alpaka::Thread, alpaka::Elems>(idxWorkDiv));
-            return getIdxThreadFirstElem(idxWorkDiv, gridThreadIdx, threadElemExtent);
-        }
-        //-----------------------------------------------------------------------------
-        //! Get the index of the first element this thread computes.
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            typename TIdxWorkDiv>
-        ALPAKA_FN_HOST_ACC auto getIdxThreadFirstElem(
-            TIdxWorkDiv const & idxWorkDiv)
-        -> vec::Vec<dim::Dim<TIdxWorkDiv>, idx::Idx<TIdxWorkDiv>>
-        {
-            auto const gridThreadIdx(alpaka::idx::getIdx<alpaka::Grid, alpaka::Threads>(idxWorkDiv));
-            return getIdxThreadFirstElem(idxWorkDiv, gridThreadIdx);
-        }
-    }
-}
diff --git a/thirdParty/alpaka/include/alpaka/idx/MapIdx.hpp b/thirdParty/alpaka/include/alpaka/idx/MapIdx.hpp
deleted file mode 100644
index 362480afa4..0000000000
--- a/thirdParty/alpaka/include/alpaka/idx/MapIdx.hpp
+++ /dev/null
@@ -1,159 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Erik Zenker
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#include <alpaka/vec/Vec.hpp>
-#include <alpaka/core/Common.hpp>
-#include <alpaka/core/Unused.hpp>
-
-namespace alpaka
-{
-    namespace idx
-    {
-        namespace detail
-        {
-            //#############################################################################
-            //! Maps a linear index to a N dimensional index.
-            template<
-                std::size_t TidxDimOut,
-                std::size_t TidxDimIn,
-                typename TSfinae = void>
-            struct MapIdx;
-            //#############################################################################
-            //! Maps a N dimensional index to the same N dimensional index.
-            template<
-                std::size_t TidxDim>
-            struct MapIdx<
-                TidxDim,
-                TidxDim>
-            {
-                //-----------------------------------------------------------------------------
-                // \tparam TElem Type of the index values.
-                // \param idx Idx to be mapped.
-                // \param extent Spatial size to map the index to.
-                // \return A N dimensional vector.
-                ALPAKA_NO_HOST_ACC_WARNING
-                template<
-                    typename TElem>
-                ALPAKA_FN_HOST_ACC static auto mapIdx(
-                    vec::Vec<dim::DimInt<TidxDim>, TElem> const & idx,
-                    vec::Vec<dim::DimInt<TidxDim>, TElem> const & extent)
-                -> vec::Vec<dim::DimInt<TidxDim>, TElem>
-                {
-                    alpaka::ignore_unused(extent);
-
-                    return idx;
-                }
-            };
-            //#############################################################################
-            //! Maps a 1 dimensional index to a N dimensional index.
-            template<
-                std::size_t TidxDimOut>
-            struct MapIdx<
-                TidxDimOut,
-                1u,
-                typename std::enable_if<TidxDimOut != 1u>::type>
-            {
-                //-----------------------------------------------------------------------------
-                // \tparam TElem Type of the index values.
-                // \param idx Idx to be mapped.
-                // \param extent Spatial size to map the index to
-                // \return A N dimensional vector.
-                ALPAKA_NO_HOST_ACC_WARNING
-                template<
-                    typename TElem>
-                ALPAKA_FN_HOST_ACC static auto mapIdx(
-                    vec::Vec<dim::DimInt<1u>, TElem> const & idx,
-                    vec::Vec<dim::DimInt<TidxDimOut>, TElem> const & extent)
-                -> vec::Vec<dim::DimInt<TidxDimOut>, TElem>
-                {
-                    auto idxNd(vec::Vec<dim::DimInt<TidxDimOut>, TElem>::all(0u));
-
-                    constexpr std::size_t lastIdx(TidxDimOut - 1u);
-
-                    // fast-dim
-                    idxNd[lastIdx] = static_cast<TElem>(idx[0u] % extent[lastIdx]);
-
-                    // in-between
-                    TElem hyperPlanesBefore = extent[lastIdx];
-                    for(std::size_t r(1u); r < lastIdx; ++r)
-                    {
-                        std::size_t const d = lastIdx - r;
-                        idxNd[d] = static_cast<TElem>(idx[0u] / hyperPlanesBefore % extent[d]);
-                        hyperPlanesBefore *= extent[d];
-                    }
-
-                    // slow-dim
-                    idxNd[0u] = static_cast<TElem>(idx[0u] / hyperPlanesBefore);
-
-                    return idxNd;
-                }
-            };
-            //#############################################################################
-            //! Maps a N dimensional index to a 1 dimensional index.
-            template<
-                std::size_t TidxDimIn>
-            struct MapIdx<
-                1u,
-                TidxDimIn,
-                typename std::enable_if<TidxDimIn != 1u>::type>
-            {
-                //-----------------------------------------------------------------------------
-                // \tparam TElem Type of the index values.
-                // \param idx Idx to be mapped.
-                // \param extent Spatial size to map the index to.
-                // \return A 1 dimensional vector.
-                ALPAKA_NO_HOST_ACC_WARNING
-                template<
-                    typename TElem>
-                ALPAKA_FN_HOST_ACC static auto mapIdx(
-                    vec::Vec<dim::DimInt<TidxDimIn>, TElem> const & idx,
-                    vec::Vec<dim::DimInt<TidxDimIn>, TElem> const & extent)
-                -> vec::Vec<dim::DimInt<1u>, TElem>
-                {
-                    TElem idx1d(idx[0u]);
-                    for(std::size_t d(1u); d < TidxDimIn; ++d)
-                    {
-                        idx1d = static_cast<TElem>(idx1d * extent[d] + idx[d]);
-                    }
-                    return {idx1d};
-                }
-            };
-        }
-
-        //#############################################################################
-        //! Maps a N dimensional index to a N dimensional position.
-        //!
-        //! \tparam TidxDimOut Dimension of the index vector to map to.
-        //! \tparam TidxDimIn Dimension of the index vector to map from.
-        //! \tparam TElem Type of the elements of the index vector to map from.
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            std::size_t TidxDimOut,
-            std::size_t TidxDimIn,
-            typename TElem>
-        ALPAKA_FN_HOST_ACC auto mapIdx(
-            vec::Vec<dim::DimInt<TidxDimIn>, TElem> const & idx,
-            vec::Vec<dim::DimInt<(TidxDimOut < TidxDimIn) ? TidxDimIn : TidxDimOut>, TElem> const & extent)
-        -> vec::Vec<dim::DimInt<TidxDimOut>, TElem>
-        {
-            static_assert(TidxDimOut > 0u, "The dimension of the output vector has to be greater than zero!");
-            static_assert(TidxDimIn > 0u, "The dimension of the input vector has to be greater than zero!");
-
-            return
-                detail::MapIdx<
-                    TidxDimOut,
-                    TidxDimIn>
-                ::mapIdx(
-                    idx,
-                    extent);
-        }
-    }
-}
diff --git a/thirdParty/alpaka/include/alpaka/idx/Traits.hpp b/thirdParty/alpaka/include/alpaka/idx/Traits.hpp
deleted file mode 100644
index 3af4fdc78b..0000000000
--- a/thirdParty/alpaka/include/alpaka/idx/Traits.hpp
+++ /dev/null
@@ -1,69 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#include <utility>
-
-namespace alpaka
-{
-    //-----------------------------------------------------------------------------
-    //! The index specifics.
-    namespace idx
-    {
-        struct ConceptIdxBt;
-        struct ConceptIdxGb;
-
-        //-----------------------------------------------------------------------------
-        //! The idx traits.
-        namespace traits
-        {
-            //#############################################################################
-            //! The idx type trait.
-            template<
-                typename T,
-                typename TSfinae = void>
-            struct IdxType;
-        }
-
-        template<
-            typename T>
-        using Idx = typename traits::IdxType<T>::type;
-
-        //-----------------------------------------------------------------------------
-        // Trait specializations for unsigned integral types.
-        namespace traits
-        {
-            //#############################################################################
-            //! The arithmetic idx type trait specialization.
-            template<
-                typename T>
-            struct IdxType<
-                T,
-                typename std::enable_if<std::is_arithmetic<T>::value>::type>
-            {
-                using type = typename std::decay<T>::type;
-            };
-        }
-
-        //-----------------------------------------------------------------------------
-        //! The index traits.
-        namespace traits
-        {
-            //#############################################################################
-            //! The index get trait.
-            template<
-                typename TIdx,
-                typename TOrigin,
-                typename TUnit,
-                typename TSfinae = void>
-            struct GetIdx;
-        }
-    }
-}
diff --git a/thirdParty/alpaka/include/alpaka/idx/bt/IdxBtCudaBuiltIn.hpp b/thirdParty/alpaka/include/alpaka/idx/bt/IdxBtCudaBuiltIn.hpp
deleted file mode 100644
index 8876308996..0000000000
--- a/thirdParty/alpaka/include/alpaka/idx/bt/IdxBtCudaBuiltIn.hpp
+++ /dev/null
@@ -1,121 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, René Widera
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_CUDA
-    #error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
-#endif
-
-#include <alpaka/idx/Traits.hpp>
-
-#include <alpaka/vec/Vec.hpp>
-#include <alpaka/core/Concepts.hpp>
-#include <alpaka/core/Cuda.hpp>
-#include <alpaka/core/Positioning.hpp>
-#include <alpaka/core/Unused.hpp>
-
-namespace alpaka
-{
-    namespace idx
-    {
-        namespace bt
-        {
-            //#############################################################################
-            //! The CUDA accelerator ND index provider.
-            template<
-                typename TDim,
-                typename TIdx>
-            class IdxBtCudaBuiltIn : public concepts::Implements<ConceptIdxBt, IdxBtCudaBuiltIn<TDim, TIdx>>
-            {
-            public:
-                //-----------------------------------------------------------------------------
-                IdxBtCudaBuiltIn() = default;
-                //-----------------------------------------------------------------------------
-                __device__ IdxBtCudaBuiltIn(IdxBtCudaBuiltIn const &) = delete;
-                //-----------------------------------------------------------------------------
-                __device__ IdxBtCudaBuiltIn(IdxBtCudaBuiltIn &&) = delete;
-                //-----------------------------------------------------------------------------
-                __device__ auto operator=(IdxBtCudaBuiltIn const & ) -> IdxBtCudaBuiltIn & = delete;
-                //-----------------------------------------------------------------------------
-                __device__ auto operator=(IdxBtCudaBuiltIn &&) -> IdxBtCudaBuiltIn & = delete;
-                //-----------------------------------------------------------------------------
-                /*virtual*/ ~IdxBtCudaBuiltIn() = default;
-            };
-        }
-    }
-
-    namespace dim
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The GPU CUDA accelerator index dimension get trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct DimType<
-                idx::bt::IdxBtCudaBuiltIn<TDim, TIdx>>
-            {
-                using type = TDim;
-            };
-        }
-    }
-    namespace idx
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The GPU CUDA accelerator block thread index get trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct GetIdx<
-                idx::bt::IdxBtCudaBuiltIn<TDim, TIdx>,
-                origin::Block,
-                unit::Threads>
-            {
-                //-----------------------------------------------------------------------------
-                //! \return The index of the current thread in the block.
-                template<
-                    typename TWorkDiv>
-                __device__ static auto getIdx(
-                    idx::bt::IdxBtCudaBuiltIn<TDim, TIdx> const & idx,
-                    TWorkDiv const &)
-                -> vec::Vec<TDim, TIdx>
-                {
-                    alpaka::ignore_unused(idx);
-                    return vec::cast<TIdx>(offset::getOffsetVecEnd<TDim>(threadIdx));
-                }
-            };
-        }
-    }
-    namespace idx
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The GPU CUDA accelerator block thread index idx type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct IdxType<
-                idx::bt::IdxBtCudaBuiltIn<TDim, TIdx>>
-            {
-                using type = TIdx;
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/alpaka/include/alpaka/idx/bt/IdxBtHipBuiltIn.hpp b/thirdParty/alpaka/include/alpaka/idx/bt/IdxBtHipBuiltIn.hpp
deleted file mode 100644
index e5dd535c8c..0000000000
--- a/thirdParty/alpaka/include/alpaka/idx/bt/IdxBtHipBuiltIn.hpp
+++ /dev/null
@@ -1,125 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Matthias Werner, René Widera
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_HIP
-    #error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
-#endif
-
-#include <alpaka/idx/Traits.hpp>
-
-#include <alpaka/vec/Vec.hpp>
-#include <alpaka/core/Concepts.hpp>
-#include <alpaka/core/Hip.hpp>
-#include <alpaka/core/Positioning.hpp>
-#include <alpaka/core/Unused.hpp>
-
-namespace alpaka
-{
-    namespace idx
-    {
-        namespace bt
-        {
-            //#############################################################################
-            //! The HIP accelerator ND index provider.
-            template<
-                typename TDim,
-                typename TIdx>
-            class IdxBtHipBuiltIn : public concepts::Implements<ConceptIdxBt, IdxBtHipBuiltIn<TDim, TIdx>>
-            {
-            public:
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST_ACC IdxBtHipBuiltIn() = default;
-                //-----------------------------------------------------------------------------
-                IdxBtHipBuiltIn(IdxBtHipBuiltIn const &) = delete;
-                //-----------------------------------------------------------------------------
-                IdxBtHipBuiltIn(IdxBtHipBuiltIn &&) = delete;
-                //-----------------------------------------------------------------------------
-                auto operator=(IdxBtHipBuiltIn const & ) -> IdxBtHipBuiltIn & = delete;
-                //-----------------------------------------------------------------------------
-                auto operator=(IdxBtHipBuiltIn &&) -> IdxBtHipBuiltIn & = delete;
-                //-----------------------------------------------------------------------------
-                /*virtual*/ ALPAKA_FN_HOST_ACC ~IdxBtHipBuiltIn() = default;
-            };
-        }
-    }
-
-    namespace dim
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The GPU HIP accelerator index dimension get trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct DimType<
-                idx::bt::IdxBtHipBuiltIn<TDim, TIdx>>
-            {
-                using type = TDim;
-            };
-        }
-    }
-    namespace idx
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The GPU HIP accelerator block thread index get trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct GetIdx<
-                idx::bt::IdxBtHipBuiltIn<TDim, TIdx>,
-                origin::Block,
-                unit::Threads>
-            {
-                //-----------------------------------------------------------------------------
-                //! \return The index of the current thread in the block.
-                template<
-                    typename TWorkDiv>
-                ALPAKA_FN_HOST_ACC static auto getIdx(
-                    idx::bt::IdxBtHipBuiltIn<TDim, TIdx> const & idx,
-                    TWorkDiv const &)
-                -> vec::Vec<TDim, TIdx>
-                {
-                    alpaka::ignore_unused(idx);
-                    return offset::getOffsetVecEnd<TDim>(
-                        vec::Vec<std::integral_constant<typename TDim::value_type, 3>, TIdx>(
-                            static_cast<TIdx>(hipThreadIdx_z),
-                            static_cast<TIdx>(hipThreadIdx_y),
-                            static_cast<TIdx>(hipThreadIdx_x)));
-                }
-            };
-        }
-    }
-    namespace idx
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The GPU HIP accelerator block thread index idx type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct IdxType<
-                idx::bt::IdxBtHipBuiltIn<TDim, TIdx>>
-            {
-                using type = TIdx;
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/alpaka/include/alpaka/idx/bt/IdxBtOmp.hpp b/thirdParty/alpaka/include/alpaka/idx/bt/IdxBtOmp.hpp
deleted file mode 100644
index ba14dd111e..0000000000
--- a/thirdParty/alpaka/include/alpaka/idx/bt/IdxBtOmp.hpp
+++ /dev/null
@@ -1,124 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Matthias Werner
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef _OPENMP
-
-#include <alpaka/idx/Traits.hpp>
-#include <alpaka/workdiv/Traits.hpp>
-
-#include <alpaka/core/Assert.hpp>
-#include <alpaka/core/Concepts.hpp>
-#include <alpaka/core/Positioning.hpp>
-#include <alpaka/core/Unused.hpp>
-#include <alpaka/idx/MapIdx.hpp>
-
-#include <omp.h>
-
-
-namespace alpaka
-{
-    namespace idx
-    {
-        namespace bt
-        {
-            //#############################################################################
-            //! The OpenMP accelerator index provider.
-            template<
-                typename TDim,
-                typename TIdx>
-            class IdxBtOmp : public concepts::Implements<ConceptIdxBt, IdxBtOmp<TDim, TIdx>>
-            {
-            public:
-                //-----------------------------------------------------------------------------
-                IdxBtOmp() = default;
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST IdxBtOmp(IdxBtOmp const &) = delete;
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST IdxBtOmp(IdxBtOmp &&) = delete;
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST auto operator=(IdxBtOmp const &) -> IdxBtOmp & = delete;
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST auto operator=(IdxBtOmp &&) -> IdxBtOmp & = delete;
-                //-----------------------------------------------------------------------------
-                /*virtual*/ ~IdxBtOmp() = default;
-            };
-        }
-    }
-
-    namespace dim
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The OpenMP accelerator index dimension get trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct DimType<
-                idx::bt::IdxBtOmp<TDim, TIdx>>
-            {
-                using type = TDim;
-            };
-        }
-    }
-    namespace idx
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The OpenMP accelerator block thread index get trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct GetIdx<
-                idx::bt::IdxBtOmp<TDim, TIdx>,
-                origin::Block,
-                unit::Threads>
-            {
-                //-----------------------------------------------------------------------------
-                //! \return The index of the current thread in the block.
-                template<
-                    typename TWorkDiv>
-                ALPAKA_FN_HOST static auto getIdx(
-                    idx::bt::IdxBtOmp<TDim, TIdx> const & idx,
-                    TWorkDiv const & workDiv)
-                -> vec::Vec<TDim, TIdx>
-                {
-                    alpaka::ignore_unused(idx);
-                    // We assume that the thread id is positive.
-                    ALPAKA_ASSERT(::omp_get_thread_num()>=0);
-                    // \TODO: Would it be faster to precompute the index and cache it inside an array?
-                    return idx::mapIdx<TDim::value>(
-                        vec::Vec<dim::DimInt<1u>, TIdx>(static_cast<TIdx>(::omp_get_thread_num())),
-                        workdiv::getWorkDiv<Block, Threads>(workDiv));
-                }
-            };
-        }
-    }
-    namespace idx
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The OpenMP accelerator block thread index idx type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct IdxType<
-                idx::bt::IdxBtOmp<TDim, TIdx>>
-            {
-                using type = TIdx;
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/alpaka/include/alpaka/idx/bt/IdxBtRefFiberIdMap.hpp b/thirdParty/alpaka/include/alpaka/idx/bt/IdxBtRefFiberIdMap.hpp
deleted file mode 100644
index c61fbcc3c0..0000000000
--- a/thirdParty/alpaka/include/alpaka/idx/bt/IdxBtRefFiberIdMap.hpp
+++ /dev/null
@@ -1,129 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Matthias Werner
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLED
-
-#include <alpaka/idx/Traits.hpp>
-
-#include <alpaka/core/Assert.hpp>
-#include <alpaka/core/Concepts.hpp>
-#include <alpaka/core/Fibers.hpp>
-#include <alpaka/core/Positioning.hpp>
-#include <alpaka/core/Unused.hpp>
-#include <alpaka/vec/Vec.hpp>
-
-#include <map>
-
-namespace alpaka
-{
-    namespace idx
-    {
-        namespace bt
-        {
-            //#############################################################################
-            //! The fibers accelerator index provider.
-            template<
-                typename TDim,
-                typename TIdx>
-            class IdxBtRefFiberIdMap : public concepts::Implements<ConceptIdxBt, IdxBtRefFiberIdMap<TDim, TIdx>>
-            {
-            public:
-                using FiberIdToIdxMap = std::map<boost::fibers::fiber::id, vec::Vec<TDim, TIdx>>;
-
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST IdxBtRefFiberIdMap(
-                    FiberIdToIdxMap const & mFibersToIndices) :
-                    m_fibersToIndices(mFibersToIndices)
-                {}
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST IdxBtRefFiberIdMap(IdxBtRefFiberIdMap const &) = delete;
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST IdxBtRefFiberIdMap(IdxBtRefFiberIdMap &&) = delete;
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST auto operator=(IdxBtRefFiberIdMap const &) -> IdxBtRefFiberIdMap & = delete;
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST auto operator=(IdxBtRefFiberIdMap &&) -> IdxBtRefFiberIdMap & = delete;
-                //-----------------------------------------------------------------------------
-                /*virtual*/ ~IdxBtRefFiberIdMap() = default;
-
-            public:
-                FiberIdToIdxMap const & m_fibersToIndices; //!< The mapping of fiber id's to fiber indices.
-            };
-        }
-    }
-
-    namespace dim
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CPU fibers accelerator index dimension get trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct DimType<
-                idx::bt::IdxBtRefFiberIdMap<TDim, TIdx>>
-            {
-                using type = TDim;
-            };
-        }
-    }
-    namespace idx
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CPU fibers accelerator block thread index get trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct GetIdx<
-                idx::bt::IdxBtRefFiberIdMap<TDim, TIdx>,
-                origin::Block,
-                unit::Threads>
-            {
-                //-----------------------------------------------------------------------------
-                //! \return The index of the current thread in the block.
-                template<
-                    typename TWorkDiv>
-                ALPAKA_FN_HOST static auto getIdx(
-                    idx::bt::IdxBtRefFiberIdMap<TDim, TIdx> const & idx,
-                    TWorkDiv const & workDiv)
-                -> vec::Vec<TDim, TIdx>
-                {
-                    alpaka::ignore_unused(workDiv);
-                    auto const fiberId(boost::this_fiber::get_id());
-                    auto const fiberEntry(idx.m_fibersToIndices.find(fiberId));
-                    ALPAKA_ASSERT(fiberEntry != idx.m_fibersToIndices.end());
-                    return fiberEntry->second;
-                }
-            };
-        }
-    }
-    namespace idx
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CPU fibers accelerator block thread index idx type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct IdxType<
-                idx::bt::IdxBtRefFiberIdMap<TDim, TIdx>>
-            {
-                using type = TIdx;
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/alpaka/include/alpaka/idx/bt/IdxBtRefThreadIdMap.hpp b/thirdParty/alpaka/include/alpaka/idx/bt/IdxBtRefThreadIdMap.hpp
deleted file mode 100644
index e43d83a672..0000000000
--- a/thirdParty/alpaka/include/alpaka/idx/bt/IdxBtRefThreadIdMap.hpp
+++ /dev/null
@@ -1,129 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Matthias Werner
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED
-
-#include <alpaka/idx/Traits.hpp>
-
-#include <alpaka/core/Assert.hpp>
-#include <alpaka/core/Concepts.hpp>
-#include <alpaka/core/Positioning.hpp>
-#include <alpaka/core/Unused.hpp>
-#include <alpaka/vec/Vec.hpp>
-
-#include <thread>
-#include <map>
-
-namespace alpaka
-{
-    namespace idx
-    {
-        namespace bt
-        {
-            //#############################################################################
-            //! The threads accelerator index provider.
-            template<
-                typename TDim,
-                typename TIdx>
-            class IdxBtRefThreadIdMap : public concepts::Implements<ConceptIdxBt, IdxBtRefThreadIdMap<TDim, TIdx>>
-            {
-            public:
-                using ThreadIdToIdxMap = std::map<std::thread::id, vec::Vec<TDim, TIdx>>;
-
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST IdxBtRefThreadIdMap(
-                    ThreadIdToIdxMap const & mThreadToIndices) :
-                    m_threadToIndexMap(mThreadToIndices)
-                {}
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST IdxBtRefThreadIdMap(IdxBtRefThreadIdMap const &) = delete;
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST IdxBtRefThreadIdMap(IdxBtRefThreadIdMap &&) = delete;
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST auto operator=(IdxBtRefThreadIdMap const &) -> IdxBtRefThreadIdMap & = delete;
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST auto operator=(IdxBtRefThreadIdMap &&) -> IdxBtRefThreadIdMap & = delete;
-                //-----------------------------------------------------------------------------
-                /*virtual*/ ~IdxBtRefThreadIdMap() = default;
-
-            public:
-                ThreadIdToIdxMap const & m_threadToIndexMap;   //!< The mapping of thread id's to thread indices.
-            };
-        }
-    }
-
-    namespace dim
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CPU threads accelerator index dimension get trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct DimType<
-                idx::bt::IdxBtRefThreadIdMap<TDim, TIdx>>
-            {
-                using type = TDim;
-            };
-        }
-    }
-    namespace idx
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CPU threads accelerator block thread index get trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct GetIdx<
-                idx::bt::IdxBtRefThreadIdMap<TDim, TIdx>,
-                origin::Block,
-                unit::Threads>
-            {
-                //-----------------------------------------------------------------------------
-                //! \return The index of the current thread in the block.
-                template<
-                    typename TWorkDiv>
-                ALPAKA_FN_HOST static auto getIdx(
-                    idx::bt::IdxBtRefThreadIdMap<TDim, TIdx> const & idx,
-                    TWorkDiv const & workDiv)
-                -> vec::Vec<TDim, TIdx>
-                {
-                    alpaka::ignore_unused(workDiv);
-                    auto const threadId(std::this_thread::get_id());
-                    auto const threadEntry(idx.m_threadToIndexMap.find(threadId));
-                    ALPAKA_ASSERT(threadEntry != idx.m_threadToIndexMap.end());
-                    return threadEntry->second;
-                }
-            };
-        }
-    }
-    namespace idx
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CPU threads accelerator block thread index idx type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct IdxType<
-                idx::bt::IdxBtRefThreadIdMap<TDim, TIdx>>
-            {
-                using type = TIdx;
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/alpaka/include/alpaka/idx/bt/IdxBtZero.hpp b/thirdParty/alpaka/include/alpaka/idx/bt/IdxBtZero.hpp
deleted file mode 100644
index 7ae7c4fee5..0000000000
--- a/thirdParty/alpaka/include/alpaka/idx/bt/IdxBtZero.hpp
+++ /dev/null
@@ -1,111 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Matthias Werner
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#include <alpaka/idx/Traits.hpp>
-
-#include <alpaka/core/Concepts.hpp>
-#include <alpaka/core/Positioning.hpp>
-#include <alpaka/core/Unused.hpp>
-#include <alpaka/vec/Vec.hpp>
-
-namespace alpaka
-{
-    namespace idx
-    {
-        namespace bt
-        {
-            //#############################################################################
-            //! A zero block thread index provider.
-            template<
-                typename TDim,
-                typename TIdx>
-            class IdxBtZero : public concepts::Implements<ConceptIdxBt, IdxBtZero<TDim, TIdx>>
-            {
-            public:
-                //-----------------------------------------------------------------------------
-                IdxBtZero() = default;
-                //-----------------------------------------------------------------------------
-                IdxBtZero(IdxBtZero const &) = delete;
-                //-----------------------------------------------------------------------------
-                IdxBtZero(IdxBtZero &&) = delete;
-                //-----------------------------------------------------------------------------
-                auto operator=(IdxBtZero const &) -> IdxBtZero & = delete;
-                //-----------------------------------------------------------------------------
-                auto operator=(IdxBtZero &&) -> IdxBtZero & = delete;
-                //-----------------------------------------------------------------------------
-                /*virtual*/ ~IdxBtZero() = default;
-            };
-        }
-    }
-
-    namespace dim
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The zero block thread index provider dimension get trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct DimType<
-                idx::bt::IdxBtZero<TDim, TIdx>>
-            {
-                using type = TDim;
-            };
-        }
-    }
-    namespace idx
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The zero block thread index provider block thread index get trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct GetIdx<
-                idx::bt::IdxBtZero<TDim, TIdx>,
-                origin::Block,
-                unit::Threads>
-            {
-                //-----------------------------------------------------------------------------
-                //! \return The index of the current thread in the block.
-                template<
-                    typename TWorkDiv>
-                ALPAKA_FN_HOST static auto getIdx(
-                    idx::bt::IdxBtZero<TDim, TIdx> const & idx,
-                    TWorkDiv const & workDiv)
-                -> vec::Vec<TDim, TIdx>
-                {
-                    alpaka::ignore_unused(idx);
-                    alpaka::ignore_unused(workDiv);
-                    return vec::Vec<TDim, TIdx>::zeros();
-                }
-            };
-        }
-    }
-    namespace idx
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The zero block thread index idx type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct IdxType<
-                idx::bt::IdxBtZero<TDim, TIdx>>
-            {
-                using type = TIdx;
-            };
-        }
-    }
-}
diff --git a/thirdParty/alpaka/include/alpaka/idx/gb/IdxGbCudaBuiltIn.hpp b/thirdParty/alpaka/include/alpaka/idx/gb/IdxGbCudaBuiltIn.hpp
deleted file mode 100644
index 47b57e1c4d..0000000000
--- a/thirdParty/alpaka/include/alpaka/idx/gb/IdxGbCudaBuiltIn.hpp
+++ /dev/null
@@ -1,121 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, René Widera
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_CUDA
-    #error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
-#endif
-
-#include <alpaka/idx/Traits.hpp>
-
-#include <alpaka/vec/Vec.hpp>
-#include <alpaka/core/Concepts.hpp>
-#include <alpaka/core/Cuda.hpp>
-#include <alpaka/core/Positioning.hpp>
-#include <alpaka/core/Unused.hpp>
-
-namespace alpaka
-{
-    namespace idx
-    {
-        namespace gb
-        {
-            //#############################################################################
-            //! The CUDA accelerator ND index provider.
-            template<
-                typename TDim,
-                typename TIdx>
-            class IdxGbCudaBuiltIn : public concepts::Implements<ConceptIdxGb, IdxGbCudaBuiltIn<TDim, TIdx>>
-            {
-            public:
-                //-----------------------------------------------------------------------------
-                IdxGbCudaBuiltIn() = default;
-                //-----------------------------------------------------------------------------
-                __device__ IdxGbCudaBuiltIn(IdxGbCudaBuiltIn const &) = delete;
-                //-----------------------------------------------------------------------------
-                __device__ IdxGbCudaBuiltIn(IdxGbCudaBuiltIn &&) = delete;
-                //-----------------------------------------------------------------------------
-                __device__ auto operator=(IdxGbCudaBuiltIn const & ) -> IdxGbCudaBuiltIn & = delete;
-                //-----------------------------------------------------------------------------
-                __device__ auto operator=(IdxGbCudaBuiltIn &&) -> IdxGbCudaBuiltIn & = delete;
-                //-----------------------------------------------------------------------------
-                /*virtual*/ ~IdxGbCudaBuiltIn() = default;
-            };
-        }
-    }
-
-    namespace dim
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The GPU CUDA accelerator index dimension get trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct DimType<
-                idx::gb::IdxGbCudaBuiltIn<TDim, TIdx>>
-            {
-                using type = TDim;
-            };
-        }
-    }
-    namespace idx
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The GPU CUDA accelerator grid block index get trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct GetIdx<
-                idx::gb::IdxGbCudaBuiltIn<TDim, TIdx>,
-                origin::Grid,
-                unit::Blocks>
-            {
-                //-----------------------------------------------------------------------------
-                //! \return The index of the current block in the grid.
-                template<
-                    typename TWorkDiv>
-                __device__ static auto getIdx(
-                    idx::gb::IdxGbCudaBuiltIn<TDim, TIdx> const & idx,
-                    TWorkDiv const &)
-                -> vec::Vec<TDim, TIdx>
-                {
-                    alpaka::ignore_unused(idx);
-                    return vec::cast<TIdx>(offset::getOffsetVecEnd<TDim>(blockIdx));
-                }
-            };
-        }
-    }
-    namespace idx
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The GPU CUDA accelerator grid block index idx type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct IdxType<
-                idx::gb::IdxGbCudaBuiltIn<TDim, TIdx>>
-            {
-                using type = TIdx;
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/alpaka/include/alpaka/idx/gb/IdxGbHipBuiltIn.hpp b/thirdParty/alpaka/include/alpaka/idx/gb/IdxGbHipBuiltIn.hpp
deleted file mode 100644
index 1dab7ffc2a..0000000000
--- a/thirdParty/alpaka/include/alpaka/idx/gb/IdxGbHipBuiltIn.hpp
+++ /dev/null
@@ -1,122 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Matthias Werner, René Widera
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_HIP
-    #error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
-#endif
-
-#include <alpaka/idx/Traits.hpp>
-
-#include <alpaka/vec/Vec.hpp>
-#include <alpaka/core/Concepts.hpp>
-#include <alpaka/core/Hip.hpp>
-#include <alpaka/core/Positioning.hpp>
-#include <alpaka/core/Unused.hpp>
-
-#include <type_traits>
-
-namespace alpaka
-{
-    namespace idx
-    {
-        namespace gb
-        {
-            //#############################################################################
-            //! The HIP accelerator ND index provider.
-            template<
-                typename TDim,
-                typename TIdx>
-            class IdxGbHipBuiltIn : public concepts::Implements<ConceptIdxGb, IdxGbHipBuiltIn<TDim, TIdx>>
-            {
-            public:
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST_ACC IdxGbHipBuiltIn() = default;
-                //-----------------------------------------------------------------------------
-                IdxGbHipBuiltIn(IdxGbHipBuiltIn const &) = delete;
-                //-----------------------------------------------------------------------------
-                IdxGbHipBuiltIn(IdxGbHipBuiltIn &&) = delete;
-                //-----------------------------------------------------------------------------
-                auto operator=(IdxGbHipBuiltIn const & ) -> IdxGbHipBuiltIn & = delete;
-                //-----------------------------------------------------------------------------
-                auto operator=(IdxGbHipBuiltIn &&) -> IdxGbHipBuiltIn & = delete;
-                //-----------------------------------------------------------------------------
-                /*virtual*/ ALPAKA_FN_HOST_ACC ~IdxGbHipBuiltIn() = default;
-            };
-        }
-    }
-
-    namespace dim
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The GPU HIP accelerator index dimension get trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct DimType<
-                idx::gb::IdxGbHipBuiltIn<TDim, TIdx>>
-            {
-                using type = TDim;
-            };
-        }
-    }
-    namespace idx
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The GPU HIP accelerator grid block index get trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct GetIdx<
-                idx::gb::IdxGbHipBuiltIn<TDim, TIdx>,
-                origin::Grid,
-                unit::Blocks>
-            {
-                //-----------------------------------------------------------------------------
-                //! \return The index of the current block in the grid.
-                template<
-                    typename TWorkDiv>
-                ALPAKA_FN_HOST_ACC static auto getIdx(
-                    idx::gb::IdxGbHipBuiltIn<TDim, TIdx> const & idx,
-                    TWorkDiv const &)
-                -> vec::Vec<TDim, TIdx>
-                {
-                    alpaka::ignore_unused(idx);
-                    return offset::getOffsetVecEnd<TDim>(
-                        vec::Vec<std::integral_constant<typename TDim::value_type, 3>, TIdx>(
-                            static_cast<TIdx>(hipBlockIdx_z),
-                            static_cast<TIdx>(hipBlockIdx_y),
-                            static_cast<TIdx>(hipBlockIdx_x)));
-                }
-            };
-
-            //#############################################################################
-            //! The GPU HIP accelerator grid block index idx type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct IdxType<
-                idx::gb::IdxGbHipBuiltIn<TDim, TIdx>>
-            {
-                using type = TIdx;
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/alpaka/include/alpaka/idx/gb/IdxGbRef.hpp b/thirdParty/alpaka/include/alpaka/idx/gb/IdxGbRef.hpp
deleted file mode 100644
index 1bf7d9426f..0000000000
--- a/thirdParty/alpaka/include/alpaka/idx/gb/IdxGbRef.hpp
+++ /dev/null
@@ -1,117 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Matthias Werner
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#include <alpaka/idx/Traits.hpp>
-#include <alpaka/dim/Traits.hpp>
-
-#include <alpaka/core/Concepts.hpp>
-#include <alpaka/core/Positioning.hpp>
-#include <alpaka/core/Unused.hpp>
-#include <alpaka/vec/Vec.hpp>
-
-namespace alpaka
-{
-    namespace idx
-    {
-        namespace gb
-        {
-            //#############################################################################
-            //! A IdxGbRef grid block index.
-            template<
-                typename TDim,
-                typename TIdx>
-            class IdxGbRef : public concepts::Implements<ConceptIdxGb, IdxGbRef<TDim, TIdx>>
-            {
-            public:
-                //-----------------------------------------------------------------------------
-                IdxGbRef(
-                    vec::Vec<TDim, TIdx> const & gridBlockIdx) :
-                        m_gridBlockIdx(gridBlockIdx)
-                {}
-                //-----------------------------------------------------------------------------
-                IdxGbRef(IdxGbRef const &) = delete;
-                //-----------------------------------------------------------------------------
-                IdxGbRef(IdxGbRef &&) = delete;
-                //-----------------------------------------------------------------------------
-                auto operator=(IdxGbRef const &) -> IdxGbRef & = delete;
-                //-----------------------------------------------------------------------------
-                auto operator=(IdxGbRef &&) -> IdxGbRef & = delete;
-                //-----------------------------------------------------------------------------
-                /*virtual*/ ~IdxGbRef() = default;
-
-            public:
-                vec::Vec<TDim, TIdx> const & m_gridBlockIdx;
-            };
-        }
-    }
-
-    namespace dim
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The IdxGbRef grid block index dimension get trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct DimType<
-                idx::gb::IdxGbRef<TDim, TIdx>>
-            {
-                using type = TDim;
-            };
-        }
-    }
-    namespace idx
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The IdxGbRef grid block index grid block index get trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct GetIdx<
-                idx::gb::IdxGbRef<TDim, TIdx>,
-                origin::Grid,
-                unit::Blocks>
-            {
-                //-----------------------------------------------------------------------------
-                //! \return The index of the current block in the grid.
-                template<
-                    typename TWorkDiv>
-                ALPAKA_FN_HOST static auto getIdx(
-                    idx::gb::IdxGbRef<TDim, TIdx> const & idx,
-                    TWorkDiv const & workDiv)
-                -> vec::Vec<TDim, TIdx>
-                {
-                    alpaka::ignore_unused(workDiv);
-                    return idx.m_gridBlockIdx;
-                }
-            };
-        }
-    }
-    namespace idx
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The IdxGbRef grid block index idx type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct IdxType<
-                idx::gb::IdxGbRef<TDim, TIdx>>
-            {
-                using type = TIdx;
-            };
-        }
-    }
-}
diff --git a/thirdParty/alpaka/include/alpaka/kernel/TaskKernelCpuFibers.hpp b/thirdParty/alpaka/include/alpaka/kernel/TaskKernelCpuFibers.hpp
deleted file mode 100644
index 3989b5a63d..0000000000
--- a/thirdParty/alpaka/include/alpaka/kernel/TaskKernelCpuFibers.hpp
+++ /dev/null
@@ -1,392 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz, René Widera
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLED
-
-// Specialized traits.
-#include <alpaka/acc/Traits.hpp>
-#include <alpaka/dev/Traits.hpp>
-#include <alpaka/dim/Traits.hpp>
-#include <alpaka/pltf/Traits.hpp>
-#include <alpaka/idx/Traits.hpp>
-
-// Implementation details.
-#include <alpaka/acc/AccCpuFibers.hpp>
-#include <alpaka/dev/DevCpu.hpp>
-#include <alpaka/kernel/Traits.hpp>
-#include <alpaka/workdiv/WorkDivMembers.hpp>
-
-#include <alpaka/core/BoostPredef.hpp>
-#include <alpaka/core/Fibers.hpp>
-#include <alpaka/core/ConcurrentExecPool.hpp>
-#include <alpaka/meta/NdLoop.hpp>
-#include <alpaka/meta/ApplyTuple.hpp>
-
-#include <algorithm>
-#include <vector>
-#include <tuple>
-#include <type_traits>
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
-    #include <iostream>
-#endif
-
-namespace alpaka
-{
-    namespace kernel
-    {
-        //#############################################################################
-        //! The CPU fibers accelerator execution task.
-        template<
-            typename TDim,
-            typename TIdx,
-            typename TKernelFnObj,
-            typename... TArgs>
-        class TaskKernelCpuFibers final :
-            public workdiv::WorkDivMembers<TDim, TIdx>
-        {
-        private:
-            //#############################################################################
-            //! The type given to the ConcurrentExecPool for yielding the current fiber.
-            struct FiberPoolYield
-            {
-                //-----------------------------------------------------------------------------
-                //! Yields the current fiber.
-                ALPAKA_FN_HOST static auto yield()
-                -> void
-                {
-                    boost::this_fiber::yield();
-                }
-            };
-            //#############################################################################
-            // Yielding is not faster for fibers. Therefore we use condition variables.
-            // It is better to wake them up when the conditions are fulfilled because this does not cost as much as for real threads.
-            using FiberPool = alpaka::core::detail::ConcurrentExecPool<
-                TIdx,
-                boost::fibers::fiber,               // The concurrent execution type.
-                boost::fibers::promise,             // The promise type.
-                FiberPoolYield,                     // The type yielding the current concurrent execution.
-                boost::fibers::mutex,               // The mutex type to use. Only required if TisYielding is true.
-                boost::fibers::condition_variable,  // The condition variable type to use. Only required if TisYielding is true.
-                false>;                             // If the threads should yield.
-
-        public:
-            //-----------------------------------------------------------------------------
-            template<
-                typename TWorkDiv>
-            ALPAKA_FN_HOST TaskKernelCpuFibers(
-                TWorkDiv && workDiv,
-                TKernelFnObj const & kernelFnObj,
-                TArgs && ... args) :
-                    workdiv::WorkDivMembers<TDim, TIdx>(std::forward<TWorkDiv>(workDiv)),
-                    m_kernelFnObj(kernelFnObj),
-                    m_args(std::forward<TArgs>(args)...)
-            {
-                static_assert(
-                    dim::Dim<typename std::decay<TWorkDiv>::type>::value == TDim::value,
-                    "The work division and the execution task have to be of the same dimensionality!");
-            }
-            //-----------------------------------------------------------------------------
-            TaskKernelCpuFibers(TaskKernelCpuFibers const &) = default;
-            //-----------------------------------------------------------------------------
-            TaskKernelCpuFibers(TaskKernelCpuFibers &&) = default;
-            //-----------------------------------------------------------------------------
-            auto operator=(TaskKernelCpuFibers const &) -> TaskKernelCpuFibers & = default;
-            //-----------------------------------------------------------------------------
-            auto operator=(TaskKernelCpuFibers &&) -> TaskKernelCpuFibers & = default;
-            //-----------------------------------------------------------------------------
-            ~TaskKernelCpuFibers() = default;
-
-            //-----------------------------------------------------------------------------
-            //! Executes the kernel function object.
-            ALPAKA_FN_HOST auto operator()() const
-            -> void
-            {
-                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                auto const gridBlockExtent(
-                    workdiv::getWorkDiv<Grid, Blocks>(*this));
-                auto const blockThreadExtent(
-                    workdiv::getWorkDiv<Block, Threads>(*this));
-                auto const threadElemExtent(
-                    workdiv::getWorkDiv<Thread, Elems>(*this));
-
-                // Get the size of the block shared dynamic memory.
-                auto const blockSharedMemDynSizeBytes(
-                    meta::apply(
-                        [&](typename std::decay<TArgs>::type const & ... args)
-                        {
-                            return
-                                kernel::getBlockSharedMemDynSizeBytes<
-                                    acc::AccCpuFibers<TDim, TIdx>>(
-                                        m_kernelFnObj,
-                                        blockThreadExtent,
-                                        threadElemExtent,
-                                        args...);
-                        },
-                        m_args));
-
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                std::cout << __func__
-                    << " blockSharedMemDynSizeBytes: " << blockSharedMemDynSizeBytes << " B" << std::endl;
-#endif
-                acc::AccCpuFibers<TDim, TIdx> acc(
-                    *static_cast<workdiv::WorkDivMembers<TDim, TIdx> const *>(this),
-                    blockSharedMemDynSizeBytes);
-
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                std::cout << __func__
-                    << " Fiber stack idx: " << boost::fibers::fixedsize_stack::traits_type::default_size() << " B" << std::endl;
-#endif
-
-                auto const blockThreadCount(blockThreadExtent.prod());
-                FiberPool fiberPool(blockThreadCount);
-
-                auto const boundGridBlockExecHost(
-                    meta::apply(
-                        [this, &acc, &blockThreadExtent, &fiberPool](typename std::decay<TArgs>::type const & ... args)
-                        {
-                            // Bind the kernel and its arguments to the grid block function.
-                            return
-                                std::bind(
-                                    &TaskKernelCpuFibers::gridBlockExecHost,
-                                    std::ref(acc),
-                                    std::placeholders::_1,
-                                    std::ref(blockThreadExtent),
-                                    std::ref(fiberPool),
-                                    std::ref(m_kernelFnObj),
-                                    std::ref(args)...);
-                        },
-                        m_args));
-
-                // Execute the blocks serially.
-                meta::ndLoopIncIdx(
-                    gridBlockExtent,
-                    boundGridBlockExecHost);
-            }
-
-        private:
-            //-----------------------------------------------------------------------------
-            //! The function executed for each grid block.
-            ALPAKA_FN_HOST static auto gridBlockExecHost(
-                acc::AccCpuFibers<TDim, TIdx> & acc,
-                vec::Vec<TDim, TIdx> const & gridBlockIdx,
-                vec::Vec<TDim, TIdx> const & blockThreadExtent,
-                FiberPool & fiberPool,
-                TKernelFnObj const & kernelFnObj,
-                typename std::decay<TArgs>::type const & ... args)
-            -> void
-            {
-                    // The futures of the threads in the current block.
-                std::vector<boost::fibers::future<void>> futuresInBlock;
-
-                // Set the index of the current block
-                acc.m_gridBlockIdx = gridBlockIdx;
-
-                // Bind the kernel and its arguments to the host block thread execution function.
-                auto boundBlockThreadExecHost(std::bind(
-                    &TaskKernelCpuFibers::blockThreadExecHost,
-                    std::ref(acc),
-                    std::ref(futuresInBlock),
-                    std::placeholders::_1,
-                    std::ref(fiberPool),
-                    std::ref(kernelFnObj),
-                    std::ref(args)...));
-                // Execute the block threads in parallel.
-                meta::ndLoopIncIdx(
-                    blockThreadExtent,
-                    boundBlockThreadExecHost);
-
-                // Wait for the completion of the block thread kernels.
-                std::for_each(
-                    futuresInBlock.begin(),
-                    futuresInBlock.end(),
-                    [](boost::fibers::future<void> & t)
-                    {
-                        t.wait();
-                    }
-                );
-                // Clean up.
-                futuresInBlock.clear();
-
-                acc.m_fibersToIndices.clear();
-
-                // After a block has been processed, the shared memory has to be deleted.
-                block::shared::st::freeMem(acc);
-            }
-            //-----------------------------------------------------------------------------
-            //! The function executed for each block thread.
-            ALPAKA_FN_HOST static auto blockThreadExecHost(
-                acc::AccCpuFibers<TDim, TIdx> & acc,
-#if !(BOOST_COMP_CLANG_CUDA && BOOST_ARCH_PTX)
-                std::vector<boost::fibers::future<void>> & futuresInBlock,
-                vec::Vec<TDim, TIdx> const & blockThreadIdx,
-                FiberPool & fiberPool,
-#else
-                std::vector<boost::fibers::future<void>> &,
-                vec::Vec<TDim, TIdx> const & blockThreadIdx,
-                FiberPool &,
-#endif
-                TKernelFnObj const & kernelFnObj,
-                typename std::decay<TArgs>::type const & ... args)
-            -> void
-            {
-                // Bind the arguments to the accelerator block thread execution function.
-                // The blockThreadIdx is required to be copied in because the variable will get changed for the next iteration/thread.
-                auto boundBlockThreadExecAcc(
-                    [&, blockThreadIdx]()
-                    {
-                        blockThreadFiberFn(
-                            acc,
-                            blockThreadIdx,
-                            kernelFnObj,
-                            args...);
-                    });
-                // Add the bound function to the block thread pool.
-// Workaround: Clang can not support this when natively compiling device code. See ConcurrentExecPool.hpp.
-#if !(BOOST_COMP_CLANG_CUDA && BOOST_ARCH_PTX)
-                futuresInBlock.emplace_back(
-                    fiberPool.enqueueTask(
-                        boundBlockThreadExecAcc));
-#else
-                (void)boundBlockThreadExecAcc;
-#endif
-            }
-            //-----------------------------------------------------------------------------
-            //! The fiber entry point.
-            ALPAKA_FN_HOST static auto blockThreadFiberFn(
-                acc::AccCpuFibers<TDim, TIdx> & acc,
-                vec::Vec<TDim, TIdx> const & blockThreadIdx,
-                TKernelFnObj const & kernelFnObj,
-                typename std::decay<TArgs>::type const & ... args)
-            -> void
-            {
-                // We have to store the fiber data before the kernel is calling any of the methods of this class depending on them.
-                auto const fiberId(boost::this_fiber::get_id());
-
-                // Set the master thread id.
-                if(blockThreadIdx.sum() == 0)
-                {
-                    acc.m_masterFiberId = fiberId;
-                }
-
-                // Save the fiber id, and index.
-                acc.m_fibersToIndices.emplace(fiberId, blockThreadIdx);
-
-                // Sync all threads so that the maps with thread id's are complete and not changed after here.
-                syncBlockThreads(acc);
-
-                // Execute the kernel itself.
-                kernelFnObj(
-                    const_cast<acc::AccCpuFibers<TDim, TIdx> const &>(acc),
-                    args...);
-
-                // We have to sync all fibers here because if a fiber would finish before all fibers have been started, the new fiber could get a recycled (then duplicate) fiber id!
-                syncBlockThreads(acc);
-            }
-
-            TKernelFnObj m_kernelFnObj;
-            std::tuple<typename std::decay<TArgs>::type...> m_args;
-        };
-    }
-
-    namespace acc
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CPU fibers execution task accelerator type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx,
-                typename TKernelFnObj,
-                typename... TArgs>
-            struct AccType<
-                kernel::TaskKernelCpuFibers<TDim, TIdx, TKernelFnObj, TArgs...>>
-            {
-                using type = acc::AccCpuFibers<TDim, TIdx>;
-            };
-        }
-    }
-    namespace dev
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CPU fibers execution task device type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx,
-                typename TKernelFnObj,
-                typename... TArgs>
-            struct DevType<
-                kernel::TaskKernelCpuFibers<TDim, TIdx, TKernelFnObj, TArgs...>>
-            {
-                using type = dev::DevCpu;
-            };
-        }
-    }
-    namespace dim
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CPU fibers execution task dimension getter trait specialization.
-            template<
-                typename TDim,
-                typename TIdx,
-                typename TKernelFnObj,
-                typename... TArgs>
-            struct DimType<
-                kernel::TaskKernelCpuFibers<TDim, TIdx, TKernelFnObj, TArgs...>>
-            {
-                using type = TDim;
-            };
-        }
-    }
-    namespace pltf
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CPU fibers execution task platform type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx,
-                typename TKernelFnObj,
-                typename... TArgs>
-            struct PltfType<
-                kernel::TaskKernelCpuFibers<TDim, TIdx, TKernelFnObj, TArgs...>>
-            {
-                using type = pltf::PltfCpu;
-            };
-        }
-    }
-    namespace idx
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CPU fibers execution task idx type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx,
-                typename TKernelFnObj,
-                typename... TArgs>
-            struct IdxType<
-                kernel::TaskKernelCpuFibers<TDim, TIdx, TKernelFnObj, TArgs...>>
-            {
-                using type = TIdx;
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/alpaka/include/alpaka/kernel/TaskKernelCpuOmp2Blocks.hpp b/thirdParty/alpaka/include/alpaka/kernel/TaskKernelCpuOmp2Blocks.hpp
deleted file mode 100644
index 39b6d7d29b..0000000000
--- a/thirdParty/alpaka/include/alpaka/kernel/TaskKernelCpuOmp2Blocks.hpp
+++ /dev/null
@@ -1,317 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz, Bert Wesarg, René Widera
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLED
-
-#if _OPENMP < 200203
-    #error If ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLED is set, the compiler has to support OpenMP 2.0 or higher!
-#endif
-
-// Specialized traits.
-#include <alpaka/acc/Traits.hpp>
-#include <alpaka/dev/Traits.hpp>
-#include <alpaka/dim/Traits.hpp>
-#include <alpaka/pltf/Traits.hpp>
-#include <alpaka/idx/Traits.hpp>
-
-// Implementation details.
-#include <alpaka/acc/AccCpuOmp2Blocks.hpp>
-#include <alpaka/dev/DevCpu.hpp>
-#include <alpaka/idx/MapIdx.hpp>
-#include <alpaka/kernel/Traits.hpp>
-#include <alpaka/workdiv/WorkDivMembers.hpp>
-
-#include <alpaka/meta/ApplyTuple.hpp>
-
-#include <omp.h>
-
-#include <stdexcept>
-#include <tuple>
-#include <type_traits>
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
-    #include <iostream>
-#endif
-
-namespace alpaka
-{
-    namespace kernel
-    {
-        //#############################################################################
-        //! The CPU OpenMP 2.0 block accelerator execution task.
-        template<
-            typename TDim,
-            typename TIdx,
-            typename TKernelFnObj,
-            typename... TArgs>
-        class TaskKernelCpuOmp2Blocks final :
-            public workdiv::WorkDivMembers<TDim, TIdx>
-        {
-        public:
-            //-----------------------------------------------------------------------------
-            template<
-                typename TWorkDiv>
-            ALPAKA_FN_HOST TaskKernelCpuOmp2Blocks(
-                TWorkDiv && workDiv,
-                TKernelFnObj const & kernelFnObj,
-                TArgs && ... args) :
-                    workdiv::WorkDivMembers<TDim, TIdx>(std::forward<TWorkDiv>(workDiv)),
-                    m_kernelFnObj(kernelFnObj),
-                    m_args(std::forward<TArgs>(args)...)
-            {
-
-                static_assert(
-                    dim::Dim<typename std::decay<TWorkDiv>::type>::value == TDim::value,
-                    "The work division and the execution task have to be of the same dimensionality!");
-            }
-            //-----------------------------------------------------------------------------
-            TaskKernelCpuOmp2Blocks(TaskKernelCpuOmp2Blocks const &) = default;
-            //-----------------------------------------------------------------------------
-            TaskKernelCpuOmp2Blocks(TaskKernelCpuOmp2Blocks &&) = default;
-            //-----------------------------------------------------------------------------
-            auto operator=(TaskKernelCpuOmp2Blocks const &) -> TaskKernelCpuOmp2Blocks & = default;
-            //-----------------------------------------------------------------------------
-            auto operator=(TaskKernelCpuOmp2Blocks &&) -> TaskKernelCpuOmp2Blocks & = default;
-            //-----------------------------------------------------------------------------
-            ~TaskKernelCpuOmp2Blocks() = default;
-
-            //-----------------------------------------------------------------------------
-            //! Executes the kernel function object.
-            ALPAKA_FN_HOST auto operator()() const
-            -> void
-            {
-                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                auto const gridBlockExtent(
-                    workdiv::getWorkDiv<Grid, Blocks>(*this));
-                auto const blockThreadExtent(
-                    workdiv::getWorkDiv<Block, Threads>(*this));
-                auto const threadElemExtent(
-                    workdiv::getWorkDiv<Thread, Elems>(*this));
-
-                // Get the size of the block shared dynamic memory.
-                auto const blockSharedMemDynSizeBytes(
-                    meta::apply(
-                        [&](typename std::decay<TArgs>::type const & ... args)
-                        {
-                            return
-                                kernel::getBlockSharedMemDynSizeBytes<
-                                    acc::AccCpuOmp2Blocks<TDim, TIdx>>(
-                                        m_kernelFnObj,
-                                        blockThreadExtent,
-                                        threadElemExtent,
-                                        args...);
-                        },
-                        m_args));
-
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                std::cout << __func__
-                    << " blockSharedMemDynSizeBytes: " << blockSharedMemDynSizeBytes << " B" << std::endl;
-#endif
-                // Bind all arguments except the accelerator.
-                // TODO: With C++14 we could create a perfectly argument forwarding function object within the constructor.
-                auto const boundKernelFnObj(
-                    meta::apply(
-                        [this](typename std::decay<TArgs>::type const & ... args)
-                        {
-                            return
-                                std::bind(
-                                    std::ref(m_kernelFnObj),
-                                    std::placeholders::_1,
-                                    std::ref(args)...);
-                        },
-                        m_args));
-
-                // The number of blocks in the grid.
-                TIdx const numBlocksInGrid(gridBlockExtent.prod());
-                if(blockThreadExtent.prod() != static_cast<TIdx>(1u))
-                {
-                    throw std::runtime_error("Only one thread per block allowed in the OpenMP 2.0 block accelerator!");
-                }
-
-                if(::omp_in_parallel() != 0)
-                {
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                    std::cout << __func__ << " already within a parallel region." << std::endl;
-#endif
-                    parallelFn(
-                        boundKernelFnObj,
-                        blockSharedMemDynSizeBytes,
-                        numBlocksInGrid,
-                        gridBlockExtent);
-                }
-                else
-                {
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                    std::cout << __func__ << " opening new parallel region." << std::endl;
-#endif
-                    #pragma omp parallel
-                    parallelFn(
-                        boundKernelFnObj,
-                        blockSharedMemDynSizeBytes,
-                        numBlocksInGrid,
-                        gridBlockExtent);
-                }
-            }
-
-        private:
-            template<
-                typename FnObj>
-            ALPAKA_FN_HOST auto parallelFn(
-                FnObj const & boundKernelFnObj,
-                TIdx const & blockSharedMemDynSizeBytes,
-                TIdx const & numBlocksInGrid,
-                vec::Vec<TDim, TIdx> const & gridBlockExtent) const
-            -> void
-            {
-                #pragma omp single nowait
-                {
-                    // The OpenMP runtime does not create a parallel region when only one thread is required in the num_threads clause.
-                    // In all other cases we expect to be in a parallel region now.
-                    if((numBlocksInGrid > 1) && (::omp_in_parallel() == 0))
-                    {
-                        throw std::runtime_error("The OpenMP 2.0 runtime did not create a parallel region!");
-                    }
-
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
-                    std::cout << __func__ << " omp_get_num_threads: " << ::omp_get_num_threads() << std::endl;
-#endif
-                }
-
-                acc::AccCpuOmp2Blocks<TDim, TIdx> acc(
-                    *static_cast<workdiv::WorkDivMembers<TDim, TIdx> const *>(this),
-                    blockSharedMemDynSizeBytes);
-
-                // NOTE: schedule(static) does not improve performance.
-#if _OPENMP < 200805    // For OpenMP < 3.0 you have to declare the loop index (a signed integer) outside of the loop header.
-                std::intmax_t iNumBlocksInGrid(static_cast<std::intmax_t>(numBlocksInGrid));
-                std::intmax_t i;
-                #pragma omp for nowait schedule(guided)
-                for(i = 0; i < iNumBlocksInGrid; ++i)
-#else
-                #pragma omp for nowait schedule(guided)
-                for(TIdx i = 0; i < numBlocksInGrid; ++i)
-#endif
-                {
-#if _OPENMP < 200805
-                    auto const i_tidx  = static_cast<TIdx>(i); // for issue #840
-                    auto const index   = vec::Vec<dim::DimInt<1u>, TIdx>( i_tidx ); // for issue #840
-#else
-                    auto const index   = vec::Vec<dim::DimInt<1u>, TIdx>( i ); // for issue #840
-#endif
-                    acc.m_gridBlockIdx = idx::mapIdx<TDim::value>(index,
-                                                                  gridBlockExtent);
-
-                    boundKernelFnObj(
-                        acc);
-
-                    // After a block has been processed, the shared memory has to be deleted.
-                    block::shared::st::freeMem(acc);
-                }
-            }
-
-            TKernelFnObj m_kernelFnObj;
-            std::tuple<typename std::decay<TArgs>::type...> m_args;
-        };
-    }
-
-    namespace acc
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CPU OpenMP 2.0 grid block execution task accelerator type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx,
-                typename TKernelFnObj,
-                typename... TArgs>
-            struct AccType<
-                kernel::TaskKernelCpuOmp2Blocks<TDim, TIdx, TKernelFnObj, TArgs...>>
-            {
-                using type = acc::AccCpuOmp2Blocks<TDim, TIdx>;
-            };
-        }
-    }
-    namespace dev
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CPU OpenMP 2.0 grid block execution task device type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx,
-                typename TKernelFnObj,
-                typename... TArgs>
-            struct DevType<
-                kernel::TaskKernelCpuOmp2Blocks<TDim, TIdx, TKernelFnObj, TArgs...>>
-            {
-                using type = dev::DevCpu;
-            };
-        }
-    }
-    namespace dim
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CPU OpenMP 2.0 grid block execution task dimension getter trait specialization.
-            template<
-                typename TDim,
-                typename TIdx,
-                typename TKernelFnObj,
-                typename... TArgs>
-            struct DimType<
-                kernel::TaskKernelCpuOmp2Blocks<TDim, TIdx, TKernelFnObj, TArgs...>>
-            {
-                using type = TDim;
-            };
-        }
-    }
-    namespace pltf
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CPU OpenMP 2.0 grid block execution task platform type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx,
-                typename TKernelFnObj,
-                typename... TArgs>
-            struct PltfType<
-                kernel::TaskKernelCpuOmp2Blocks<TDim, TIdx, TKernelFnObj, TArgs...>>
-            {
-                using type = pltf::PltfCpu;
-            };
-        }
-    }
-    namespace idx
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CPU OpenMP 2.0 block execution task idx type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx,
-                typename TKernelFnObj,
-                typename... TArgs>
-            struct IdxType<
-                kernel::TaskKernelCpuOmp2Blocks<TDim, TIdx, TKernelFnObj, TArgs...>>
-            {
-                using type = TIdx;
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/alpaka/include/alpaka/kernel/TaskKernelCpuOmp2Threads.hpp b/thirdParty/alpaka/include/alpaka/kernel/TaskKernelCpuOmp2Threads.hpp
deleted file mode 100644
index 7911a50ed9..0000000000
--- a/thirdParty/alpaka/include/alpaka/kernel/TaskKernelCpuOmp2Threads.hpp
+++ /dev/null
@@ -1,302 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Bert Wesarg, René Widera
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLED
-
-#if _OPENMP < 200203
-    #error If ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLED is set, the compiler has to support OpenMP 2.0 or higher!
-#endif
-
-// Specialized traits.
-#include <alpaka/acc/Traits.hpp>
-#include <alpaka/dev/Traits.hpp>
-#include <alpaka/dim/Traits.hpp>
-#include <alpaka/pltf/Traits.hpp>
-#include <alpaka/idx/Traits.hpp>
-
-// Implementation details.
-#include <alpaka/acc/AccCpuOmp2Threads.hpp>
-#include <alpaka/core/Unused.hpp>
-#include <alpaka/dev/DevCpu.hpp>
-#include <alpaka/kernel/Traits.hpp>
-#include <alpaka/meta/NdLoop.hpp>
-#include <alpaka/meta/ApplyTuple.hpp>
-#include <alpaka/workdiv/WorkDivMembers.hpp>
-
-#include <omp.h>
-
-#include <stdexcept>
-#include <tuple>
-#include <type_traits>
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
-    #include <iostream>
-#endif
-
-namespace alpaka
-{
-    namespace kernel
-    {
-        //#############################################################################
-        //! The CPU OpenMP 2.0 thread accelerator execution task.
-        template<
-            typename TDim,
-            typename TIdx,
-            typename TKernelFnObj,
-            typename... TArgs>
-        class TaskKernelCpuOmp2Threads final :
-            public workdiv::WorkDivMembers<TDim, TIdx>
-        {
-        public:
-            //-----------------------------------------------------------------------------
-            template<
-                typename TWorkDiv>
-            ALPAKA_FN_HOST TaskKernelCpuOmp2Threads(
-                TWorkDiv && workDiv,
-                TKernelFnObj const & kernelFnObj,
-                TArgs && ... args) :
-                    workdiv::WorkDivMembers<TDim, TIdx>(std::forward<TWorkDiv>(workDiv)),
-                    m_kernelFnObj(kernelFnObj),
-                    m_args(std::forward<TArgs>(args)...)
-            {
-                static_assert(
-                    dim::Dim<typename std::decay<TWorkDiv>::type>::value == TDim::value,
-                    "The work division and the execution task have to be of the same dimensionality!");
-            }
-            //-----------------------------------------------------------------------------
-            TaskKernelCpuOmp2Threads(TaskKernelCpuOmp2Threads const &) = default;
-            //-----------------------------------------------------------------------------
-            TaskKernelCpuOmp2Threads(TaskKernelCpuOmp2Threads &&) = default;
-            //-----------------------------------------------------------------------------
-            auto operator=(TaskKernelCpuOmp2Threads const &) -> TaskKernelCpuOmp2Threads & = default;
-            //-----------------------------------------------------------------------------
-            auto operator=(TaskKernelCpuOmp2Threads &&) -> TaskKernelCpuOmp2Threads & = default;
-            //-----------------------------------------------------------------------------
-            ~TaskKernelCpuOmp2Threads() = default;
-
-            //-----------------------------------------------------------------------------
-            //! Executes the kernel function object.
-            ALPAKA_FN_HOST auto operator()() const
-            -> void
-            {
-                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                auto const gridBlockExtent(
-                    workdiv::getWorkDiv<Grid, Blocks>(*this));
-                auto const blockThreadExtent(
-                    workdiv::getWorkDiv<Block, Threads>(*this));
-                auto const threadElemExtent(
-                    workdiv::getWorkDiv<Thread, Elems>(*this));
-
-                // Get the size of the block shared dynamic memory.
-                auto const blockSharedMemDynSizeBytes(
-                    meta::apply(
-                        [&](typename std::decay<TArgs>::type const & ... args)
-                        {
-                            return
-                                kernel::getBlockSharedMemDynSizeBytes<
-                                    acc::AccCpuOmp2Threads<TDim, TIdx>>(
-                                        m_kernelFnObj,
-                                        blockThreadExtent,
-                                        threadElemExtent,
-                                        args...);
-                        },
-                        m_args));
-
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                std::cout << __func__
-                    << " blockSharedMemDynSizeBytes: " << blockSharedMemDynSizeBytes << " B" << std::endl;
-#endif
-                // Bind all arguments except the accelerator.
-                // TODO: With C++14 we could create a perfectly argument forwarding function object within the constructor.
-                auto const boundKernelFnObj(
-                    meta::apply(
-                        [this](typename std::decay<TArgs>::type const & ... args)
-                        {
-                            return
-                                std::bind(
-                                    std::ref(m_kernelFnObj),
-                                    std::placeholders::_1,
-                                    std::ref(args)...);
-                        },
-                        m_args));
-
-                acc::AccCpuOmp2Threads<TDim, TIdx> acc(
-                    *static_cast<workdiv::WorkDivMembers<TDim, TIdx> const *>(this),
-                    blockSharedMemDynSizeBytes);
-
-                // The number of threads in this block.
-                TIdx const blockThreadCount(blockThreadExtent.prod());
-                int const iBlockThreadCount(static_cast<int>(blockThreadCount));
-                alpaka::ignore_unused(iBlockThreadCount);
-
-                if(::omp_in_parallel() != 0)
-                {
-                    throw std::runtime_error("The OpenMP 2.0 thread backend can not be used within an existing parallel region!");
-                }
-
-                // Force the environment to use the given number of threads.
-                int const ompIsDynamic(::omp_get_dynamic());
-                ::omp_set_dynamic(0);
-
-                // Execute the blocks serially.
-                meta::ndLoopIncIdx(
-                    gridBlockExtent,
-                    [&](vec::Vec<TDim, TIdx> const & gridBlockIdx)
-                    {
-                        acc.m_gridBlockIdx = gridBlockIdx;
-
-                        // Execute the threads in parallel.
-
-                        // Parallel execution of the threads in a block is required because when syncBlockThreads is called all of them have to be done with their work up to this line.
-                        // So we have to spawn one OS thread per thread in a block.
-                        // 'omp for' is not useful because it is meant for cases where multiple iterations are executed by one thread but in our case a 1:1 mapping is required.
-                        // Therefore we use 'omp parallel' with the specified number of threads in a block.
-                        #pragma omp parallel num_threads(iBlockThreadCount)
-                        {
-                            #pragma omp single nowait
-                            {
-                                // The OpenMP runtime does not create a parallel region when only one thread is required in the num_threads clause.
-                                // In all other cases we expect to be in a parallel region now.
-                                if((iBlockThreadCount > 1) && (::omp_in_parallel() == 0))
-                                {
-                                    throw std::runtime_error("The OpenMP 2.0 runtime did not create a parallel region!");
-                                }
-
-                                // GCC 5.1 fails with:
-                                // error: redeclaration of const int& iBlockThreadCount
-                                // if(numThreads != iBlockThreadCount)
-                                //                  ^
-                                // note: const int& iBlockThreadCount previously declared here
-                                // #pragma omp parallel num_threads(iBlockThreadCount)
-                                //         ^
-#if (!BOOST_COMP_GNUC) || (BOOST_COMP_GNUC < BOOST_VERSION_NUMBER(5, 0, 0)) || (BOOST_COMP_GNUC >= BOOST_VERSION_NUMBER(6, 0, 0))
-                                int const numThreads(::omp_get_num_threads());
-                                if(numThreads != iBlockThreadCount)
-                                {
-                                    throw std::runtime_error("The OpenMP 2.0 runtime did not use the number of threads that had been required!");
-                                }
-#endif
-                            }
-                            boundKernelFnObj(
-                                acc);
-
-                            // Wait for all threads to finish before deleting the shared memory.
-                            // This is done by default if the omp 'nowait' clause is missing on the omp parallel directive
-                            //block::sync::syncBlockThreads(acc);
-                        }
-
-                        // After a block has been processed, the shared memory has to be deleted.
-                        block::shared::st::freeMem(acc);
-                    });
-
-                // Reset the dynamic thread number setting.
-                ::omp_set_dynamic(ompIsDynamic);
-            }
-
-        private:
-            TKernelFnObj m_kernelFnObj;
-            std::tuple<typename std::decay<TArgs>::type...> m_args;
-        };
-    }
-
-    namespace acc
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CPU OpenMP 2.0 block thread execution task accelerator type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx,
-                typename TKernelFnObj,
-                typename... TArgs>
-            struct AccType<
-                kernel::TaskKernelCpuOmp2Threads<TDim, TIdx, TKernelFnObj, TArgs...>>
-            {
-                using type = acc::AccCpuOmp2Threads<TDim, TIdx>;
-            };
-        }
-    }
-    namespace dev
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CPU OpenMP 2.0 block thread execution task device type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx,
-                typename TKernelFnObj,
-                typename... TArgs>
-            struct DevType<
-                kernel::TaskKernelCpuOmp2Threads<TDim, TIdx, TKernelFnObj, TArgs...>>
-            {
-                using type = dev::DevCpu;
-            };
-        }
-    }
-    namespace dim
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CPU OpenMP 2.0 block thread execution task dimension getter trait specialization.
-            template<
-                typename TDim,
-                typename TIdx,
-                typename TKernelFnObj,
-                typename... TArgs>
-            struct DimType<
-                kernel::TaskKernelCpuOmp2Threads<TDim, TIdx, TKernelFnObj, TArgs...>>
-            {
-                using type = TDim;
-            };
-        }
-    }
-    namespace pltf
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CPU OpenMP 2.0 block thread execution task platform type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx,
-                typename TKernelFnObj,
-                typename... TArgs>
-            struct PltfType<
-                kernel::TaskKernelCpuOmp2Threads<TDim, TIdx, TKernelFnObj, TArgs...>>
-            {
-                using type = pltf::PltfCpu;
-            };
-        }
-    }
-    namespace idx
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CPU OpenMP 2.0 block thread execution task idx type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx,
-                typename TKernelFnObj,
-                typename... TArgs>
-            struct IdxType<
-                kernel::TaskKernelCpuOmp2Threads<TDim, TIdx, TKernelFnObj, TArgs...>>
-            {
-                using type = TIdx;
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/alpaka/include/alpaka/kernel/TaskKernelCpuOmp4.hpp b/thirdParty/alpaka/include/alpaka/kernel/TaskKernelCpuOmp4.hpp
deleted file mode 100644
index 83081d99df..0000000000
--- a/thirdParty/alpaka/include/alpaka/kernel/TaskKernelCpuOmp4.hpp
+++ /dev/null
@@ -1,315 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz, René Widera
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_CPU_BT_OMP4_ENABLED
-
-#if _OPENMP < 201307
-    #error If ALPAKA_ACC_CPU_BT_OMP4_ENABLED is set, the compiler has to support OpenMP 4.0 or higher!
-#endif
-
-// Specialized traits.
-#include <alpaka/acc/Traits.hpp>
-#include <alpaka/dev/Traits.hpp>
-#include <alpaka/dim/Traits.hpp>
-#include <alpaka/pltf/Traits.hpp>
-#include <alpaka/idx/Traits.hpp>
-
-// Implementation details.
-#include <alpaka/acc/AccCpuOmp4.hpp>
-#include <alpaka/dev/DevCpu.hpp>
-#include <alpaka/idx/MapIdx.hpp>
-#include <alpaka/kernel/Traits.hpp>
-#include <alpaka/workdiv/WorkDivMembers.hpp>
-
-#include <alpaka/meta/ApplyTuple.hpp>
-
-#include <omp.h>
-
-#include <stdexcept>
-#include <tuple>
-#include <type_traits>
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
-    #include <iostream>
-#endif
-
-namespace alpaka
-{
-    namespace kernel
-    {
-        //#############################################################################
-        //! The CPU OpenMP 4.0 accelerator execution task.
-        template<
-            typename TDim,
-            typename TIdx,
-            typename TKernelFnObj,
-            typename... TArgs>
-        class TaskKernelCpuOmp4 final :
-            public workdiv::WorkDivMembers<TDim, TIdx>
-        {
-        public:
-            //-----------------------------------------------------------------------------
-            template<
-                typename TWorkDiv>
-            ALPAKA_FN_HOST TaskKernelCpuOmp4(
-                TWorkDiv && workDiv,
-                TKernelFnObj const & kernelFnObj,
-                TArgs && ... args) :
-                    workdiv::WorkDivMembers<TDim, TIdx>(std::forward<TWorkDiv>(workDiv)),
-                    m_kernelFnObj(kernelFnObj),
-                    m_args(std::forward<TArgs>(args)...)
-            {
-                static_assert(
-                    dim::Dim<typename std::decay<TWorkDiv>::type>::value == TDim::value,
-                    "The work division and the execution task have to be of the same dimensionality!");
-            }
-            //-----------------------------------------------------------------------------
-            TaskKernelCpuOmp4(TaskKernelCpuOmp4 const & other) = default;
-            //-----------------------------------------------------------------------------
-            TaskKernelCpuOmp4(TaskKernelCpuOmp4 && other) = default;
-            //-----------------------------------------------------------------------------
-            auto operator=(TaskKernelCpuOmp4 const &) -> TaskKernelCpuOmp4 & = default;
-            //-----------------------------------------------------------------------------
-            auto operator=(TaskKernelCpuOmp4 &&) -> TaskKernelCpuOmp4 & = default;
-            //-----------------------------------------------------------------------------
-            ~TaskKernelCpuOmp4() = default;
-
-            //-----------------------------------------------------------------------------
-            //! Executes the kernel function object.
-            ALPAKA_FN_HOST auto operator()() const
-            -> void
-            {
-                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                auto const gridBlockExtent(
-                    workdiv::getWorkDiv<Grid, Blocks>(*this));
-                auto const blockThreadExtent(
-                    workdiv::getWorkDiv<Block, Threads>(*this));
-                auto const threadElemExtent(
-                    workdiv::getWorkDiv<Thread, Elems>(*this));
-
-                // Get the size of the block shared dynamic memory.
-                auto const blockSharedMemDynSizeBytes(
-                    meta::apply(
-                        [&](typename std::decay<TArgs>::type const & ... args)
-                        {
-                            return
-                                kernel::getBlockSharedMemDynSizeBytes<
-                                    acc::AccCpuOmp4<TDim, TIdx>>(
-                                        m_kernelFnObj,
-                                        blockThreadExtent,
-                                        threadElemExtent,
-                                        args...);
-                        },
-                        m_args));
-
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                std::cout << __func__
-                    << " blockSharedMemDynSizeBytes: " << blockSharedMemDynSizeBytes << " B" << std::endl;
-#endif
-                // Bind all arguments except the accelerator.
-                // TODO: With C++14 we could create a perfectly argument forwarding function object within the constructor.
-                auto const boundKernelFnObj(
-                    meta::apply(
-                        [this](typename std::decay<TArgs>::type const & ... args)
-                        {
-                            return
-                                std::bind(
-                                    std::ref(m_kernelFnObj),
-                                    std::placeholders::_1,
-                                    std::ref(args)...);
-                        },
-                        m_args));
-
-                // The number of blocks in the grid.
-                TIdx const gridBlockCount(gridBlockExtent.prod());
-                // The number of threads in a block.
-                TIdx const blockThreadCount(blockThreadExtent.prod());
-
-                // We have to make sure, that the OpenMP runtime keeps enough threads for executing a block in parallel.
-                auto const maxOmpThreadCount(::omp_get_max_threads());
-                auto const maxTeamCount(maxOmpThreadCount/static_cast<int>(blockThreadCount));
-                auto const teamCount(std::min(maxTeamCount, static_cast<int>(gridBlockCount)));
-
-                if(::omp_in_parallel() != 0)
-                {
-                    throw std::runtime_error("The OpenMP 4.0 backend can not be used within an existing parallel region!");
-                }
-
-                // Force the environment to use the given number of threads.
-                int const ompIsDynamic(::omp_get_dynamic());
-                ::omp_set_dynamic(0);
-
-                // `When an if(scalar-expression) evaluates to false, the structured block is executed on the host.`
-                #pragma omp target if(0)
-                {
-                    #pragma omp teams num_teams(teamCount) thread_limit(blockThreadCount)
-                    {
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
-                        // The first team does some checks ...
-                        if((::omp_get_team_num() == 0))
-                        {
-                            int const iNumTeams(::omp_get_num_teams());
-                            printf("%s omp_get_num_teams: %d\n", __func__, iNumTeams);
-                        }
-#endif
-                        acc::AccCpuOmp4<TDim, TIdx> acc(
-                            *static_cast<workdiv::WorkDivMembers<TDim, TIdx> const *>(this),
-                            blockSharedMemDynSizeBytes);
-
-                        #pragma omp distribute
-                        for(TIdx b = 0u; b<gridBlockCount; ++b)
-                        {
-                            vec::Vec<dim::DimInt<1u>, TIdx> const gridBlockIdx(b);
-                            // When this is not repeated here:
-                            // error: gridBlockExtent referenced in target region does not have a mappable type
-                            auto const gridBlockExtent2(
-                                workdiv::getWorkDiv<Grid, Blocks>(*static_cast<workdiv::WorkDivMembers<TDim, TIdx> const *>(this)));
-                            acc.m_gridBlockIdx = idx::mapIdx<TDim::value>(
-                                gridBlockIdx,
-                                gridBlockExtent2);
-
-                            // Execute the threads in parallel.
-
-                            // Parallel execution of the threads in a block is required because when syncBlockThreads is called all of them have to be done with their work up to this line.
-                            // So we have to spawn one OS thread per thread in a block.
-                            // 'omp for' is not useful because it is meant for cases where multiple iterations are executed by one thread but in our case a 1:1 mapping is required.
-                            // Therefore we use 'omp parallel' with the specified number of threads in a block.
-                            #pragma omp parallel num_threads(blockThreadCount)
-                            {
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
-                                // The first thread does some checks in the first block executed.
-                                if((::omp_get_thread_num() == 0) && (b == 0))
-                                {
-                                    int const numThreads(::omp_get_num_threads());
-                                    printf("%s omp_get_num_threads: %d\n", __func__, numThreads);
-                                    if(numThreads != static_cast<int>(blockThreadCount))
-                                    {
-                                        throw std::runtime_error("ERROR: The OpenMP runtime did not use the number of threads that had been required!");
-                                    }
-                                }
-#endif
-                                boundKernelFnObj(
-                                    acc);
-
-                                // Wait for all threads to finish before deleting the shared memory.
-                                // This is done by default if the omp 'nowait' clause is missing
-                                //block::sync::syncBlockThreads(acc);
-                            }
-
-                            // After a block has been processed, the shared memory has to be deleted.
-                            block::shared::st::freeMem(acc);
-                        }
-                    }
-                }
-
-                // Reset the dynamic thread number setting.
-                ::omp_set_dynamic(ompIsDynamic);
-            }
-
-        private:
-            TKernelFnObj m_kernelFnObj;
-            std::tuple<typename std::decay<TArgs>::type...> m_args;
-        };
-    }
-
-    namespace acc
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CPU OpenMP 4.0 execution task accelerator type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx,
-                typename TKernelFnObj,
-                typename... TArgs>
-            struct AccType<
-                kernel::TaskKernelCpuOmp4<TDim, TIdx, TKernelFnObj, TArgs...>>
-            {
-                using type = acc::AccCpuOmp4<TDim, TIdx>;
-            };
-        }
-    }
-    namespace dev
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CPU OpenMP 4.0 execution task device type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx,
-                typename TKernelFnObj,
-                typename... TArgs>
-            struct DevType<
-                kernel::TaskKernelCpuOmp4<TDim, TIdx, TKernelFnObj, TArgs...>>
-            {
-                using type = dev::DevCpu;
-            };
-        }
-    }
-    namespace dim
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CPU OpenMP 4.0 execution task dimension getter trait specialization.
-            template<
-                typename TDim,
-                typename TIdx,
-                typename TKernelFnObj,
-                typename... TArgs>
-            struct DimType<
-                kernel::TaskKernelCpuOmp4<TDim, TIdx, TKernelFnObj, TArgs...>>
-            {
-                using type = TDim;
-            };
-        }
-    }
-    namespace pltf
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CPU OpenMP 4.0 execution task platform type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx,
-                typename TKernelFnObj,
-                typename... TArgs>
-            struct PltfType<
-                kernel::TaskKernelCpuOmp4<TDim, TIdx, TKernelFnObj, TArgs...>>
-            {
-                using type = pltf::PltfCpu;
-            };
-        }
-    }
-    namespace idx
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CPU OpenMP 4.0 execution task idx type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx,
-                typename TKernelFnObj,
-                typename... TArgs>
-            struct IdxType<
-                kernel::TaskKernelCpuOmp4<TDim, TIdx, TKernelFnObj, TArgs...>>
-            {
-                using type = TIdx;
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/alpaka/include/alpaka/kernel/TaskKernelCpuSerial.hpp b/thirdParty/alpaka/include/alpaka/kernel/TaskKernelCpuSerial.hpp
deleted file mode 100644
index 62dc8ab731..0000000000
--- a/thirdParty/alpaka/include/alpaka/kernel/TaskKernelCpuSerial.hpp
+++ /dev/null
@@ -1,246 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, René Widera
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED
-
-// Specialized traits.
-#include <alpaka/acc/Traits.hpp>
-#include <alpaka/dev/Traits.hpp>
-#include <alpaka/dim/Traits.hpp>
-#include <alpaka/pltf/Traits.hpp>
-#include <alpaka/idx/Traits.hpp>
-
-// Implementation details.
-#include <alpaka/acc/AccCpuSerial.hpp>
-#include <alpaka/core/Unused.hpp>
-#include <alpaka/dev/DevCpu.hpp>
-#include <alpaka/kernel/Traits.hpp>
-#include <alpaka/meta/NdLoop.hpp>
-#include <alpaka/meta/ApplyTuple.hpp>
-#include <alpaka/workdiv/WorkDivMembers.hpp>
-
-#include <tuple>
-#include <type_traits>
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
-    #include <iostream>
-#endif
-
-namespace alpaka
-{
-    namespace kernel
-    {
-        //#############################################################################
-        //! The CPU serial execution task implementation.
-        template<
-            typename TDim,
-            typename TIdx,
-            typename TKernelFnObj,
-            typename... TArgs>
-        class TaskKernelCpuSerial final :
-            public workdiv::WorkDivMembers<TDim, TIdx>
-        {
-        public:
-            //-----------------------------------------------------------------------------
-            template<
-                typename TWorkDiv>
-            ALPAKA_FN_HOST TaskKernelCpuSerial(
-                TWorkDiv && workDiv,
-                TKernelFnObj const & kernelFnObj,
-                TArgs && ... args) :
-                    workdiv::WorkDivMembers<TDim, TIdx>(std::forward<TWorkDiv>(workDiv)),
-                    m_kernelFnObj(kernelFnObj),
-                    m_args(std::forward<TArgs>(args)...)
-            {
-                static_assert(
-                    dim::Dim<typename std::decay<TWorkDiv>::type>::value == TDim::value,
-                    "The work division and the execution task have to be of the same dimensionality!");
-            }
-            //-----------------------------------------------------------------------------
-            TaskKernelCpuSerial(TaskKernelCpuSerial const &) = default;
-            //-----------------------------------------------------------------------------
-            TaskKernelCpuSerial(TaskKernelCpuSerial &&) = default;
-            //-----------------------------------------------------------------------------
-            auto operator=(TaskKernelCpuSerial const &) -> TaskKernelCpuSerial & = default;
-            //-----------------------------------------------------------------------------
-            auto operator=(TaskKernelCpuSerial &&) -> TaskKernelCpuSerial & = default;
-            //-----------------------------------------------------------------------------
-            ~TaskKernelCpuSerial() = default;
-
-            //-----------------------------------------------------------------------------
-            //! Executes the kernel function object.
-            ALPAKA_FN_HOST auto operator()() const
-            -> void
-            {
-                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                auto const gridBlockExtent(
-                    workdiv::getWorkDiv<Grid, Blocks>(*this));
-                auto const blockThreadExtent(
-                    workdiv::getWorkDiv<Block, Threads>(*this));
-                auto const threadElemExtent(
-                    workdiv::getWorkDiv<Thread, Elems>(*this));
-
-                // Get the size of the block shared dynamic memory.
-                auto const blockSharedMemDynSizeBytes(
-                    meta::apply(
-                        [&](typename std::decay<TArgs>::type const & ... args)
-                        {
-                            return
-                                kernel::getBlockSharedMemDynSizeBytes<
-                                    acc::AccCpuSerial<TDim, TIdx>>(
-                                        m_kernelFnObj,
-                                        blockThreadExtent,
-                                        threadElemExtent,
-                                        args...);
-                        },
-                        m_args));
-
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                std::cout << __func__
-                    << " blockSharedMemDynSizeBytes: " << blockSharedMemDynSizeBytes << " B" << std::endl;
-#endif
-                // Bind all arguments except the accelerator.
-                // TODO: With C++14 we could create a perfectly argument forwarding function object within the constructor.
-                auto const boundKernelFnObj(
-                    meta::apply(
-                        [this](typename std::decay<TArgs>::type const & ... args)
-                        {
-                            return
-                                std::bind(
-                                    std::ref(m_kernelFnObj),
-                                    std::placeholders::_1,
-                                    std::ref(args)...);
-                        },
-                        m_args));
-
-                acc::AccCpuSerial<TDim, TIdx> acc(
-                    *static_cast<workdiv::WorkDivMembers<TDim, TIdx> const *>(this),
-                    blockSharedMemDynSizeBytes);
-
-                if(blockThreadExtent.prod() != static_cast<TIdx>(1u))
-                {
-                    throw std::runtime_error("A block for the serial accelerator can only ever have one single thread!");
-                }
-
-                // Execute the blocks serially.
-                meta::ndLoopIncIdx(
-                    gridBlockExtent,
-                    [&](vec::Vec<TDim, TIdx> const & blockThreadIdx)
-                    {
-                        acc.m_gridBlockIdx = blockThreadIdx;
-
-                        boundKernelFnObj(
-                            acc);
-
-                        // After a block has been processed, the shared memory has to be deleted.
-                        block::shared::st::freeMem(acc);
-                    });
-            }
-
-        private:
-            TKernelFnObj m_kernelFnObj;
-            std::tuple<typename std::decay<TArgs>::type...> m_args;
-        };
-    }
-
-    namespace acc
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CPU serial execution task accelerator type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx,
-                typename TKernelFnObj,
-                typename... TArgs>
-            struct AccType<
-                kernel::TaskKernelCpuSerial<TDim, TIdx, TKernelFnObj, TArgs...>>
-            {
-                using type = acc::AccCpuSerial<TDim, TIdx>;
-            };
-        }
-    }
-    namespace dev
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CPU serial execution task device type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx,
-                typename TKernelFnObj,
-                typename... TArgs>
-            struct DevType<
-                kernel::TaskKernelCpuSerial<TDim, TIdx, TKernelFnObj, TArgs...>>
-            {
-                using type = dev::DevCpu;
-            };
-        }
-    }
-    namespace dim
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CPU serial execution task dimension getter trait specialization.
-            template<
-                typename TDim,
-                typename TIdx,
-                typename TKernelFnObj,
-                typename... TArgs>
-            struct DimType<
-                kernel::TaskKernelCpuSerial<TDim, TIdx, TKernelFnObj, TArgs...>>
-            {
-                using type = TDim;
-            };
-        }
-    }
-    namespace pltf
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CPU serial execution task platform type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx,
-                typename TKernelFnObj,
-                typename... TArgs>
-            struct PltfType<
-                kernel::TaskKernelCpuSerial<TDim, TIdx, TKernelFnObj, TArgs...>>
-            {
-                using type = pltf::PltfCpu;
-            };
-        }
-    }
-    namespace idx
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CPU serial execution task idx type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx,
-                typename TKernelFnObj,
-                typename... TArgs>
-            struct IdxType<
-                kernel::TaskKernelCpuSerial<TDim, TIdx, TKernelFnObj, TArgs...>>
-            {
-                using type = TIdx;
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/alpaka/include/alpaka/kernel/TaskKernelCpuTbbBlocks.hpp b/thirdParty/alpaka/include/alpaka/kernel/TaskKernelCpuTbbBlocks.hpp
deleted file mode 100644
index 2efbd29af9..0000000000
--- a/thirdParty/alpaka/include/alpaka/kernel/TaskKernelCpuTbbBlocks.hpp
+++ /dev/null
@@ -1,259 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz, Erik Zenker, René Widera
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED
-
-// Specialized traits.
-#include <alpaka/acc/Traits.hpp>
-#include <alpaka/dev/Traits.hpp>
-#include <alpaka/dim/Traits.hpp>
-#include <alpaka/pltf/Traits.hpp>
-#include <alpaka/idx/Traits.hpp>
-
-// Implementation details.
-#include <alpaka/acc/AccCpuTbbBlocks.hpp>
-#include <alpaka/dev/DevCpu.hpp>
-#include <alpaka/idx/MapIdx.hpp>
-#include <alpaka/kernel/Traits.hpp>
-#include <alpaka/workdiv/WorkDivMembers.hpp>
-
-#include <alpaka/meta/NdLoop.hpp>
-#include <alpaka/meta/ApplyTuple.hpp>
-
-#include <stdexcept>
-#include <tuple>
-#include <type_traits>
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
-    #include <iostream>
-#endif
-
-#include <tbb/parallel_for.h>
-#include <tbb/blocked_range.h>
-#include <tbb/task_group.h>
-
-namespace alpaka
-{
-    namespace kernel
-    {
-        //#############################################################################
-        //! The CPU TBB block accelerator execution task.
-        template<
-            typename TDim,
-            typename TIdx,
-            typename TKernelFnObj,
-            typename... TArgs>
-        class TaskKernelCpuTbbBlocks final :
-            public workdiv::WorkDivMembers<TDim, TIdx>
-        {
-        public:
-            //-----------------------------------------------------------------------------
-            template<
-                typename TWorkDiv>
-            ALPAKA_FN_HOST TaskKernelCpuTbbBlocks(
-                TWorkDiv && workDiv,
-                TKernelFnObj const & kernelFnObj,
-                TArgs && ... args) :
-                    workdiv::WorkDivMembers<TDim, TIdx>(std::forward<TWorkDiv>(workDiv)),
-                    m_kernelFnObj(kernelFnObj),
-                    m_args(std::forward<TArgs>(args)...)
-            {
-                static_assert(
-                    dim::Dim<typename std::decay<TWorkDiv>::type>::value == TDim::value,
-                    "The work division and the execution task have to be of the same dimensionality!");
-            }
-            //-----------------------------------------------------------------------------
-            TaskKernelCpuTbbBlocks(TaskKernelCpuTbbBlocks const &) = default;
-            //-----------------------------------------------------------------------------
-            TaskKernelCpuTbbBlocks(TaskKernelCpuTbbBlocks &&) = default;
-            //-----------------------------------------------------------------------------
-            auto operator=(TaskKernelCpuTbbBlocks const &) -> TaskKernelCpuTbbBlocks & = default;
-            //-----------------------------------------------------------------------------
-            auto operator=(TaskKernelCpuTbbBlocks &&) -> TaskKernelCpuTbbBlocks & = default;
-            //-----------------------------------------------------------------------------
-            ~TaskKernelCpuTbbBlocks() = default;
-
-            //-----------------------------------------------------------------------------
-            //! Executes the kernel function object.
-            ALPAKA_FN_HOST auto operator()() const
-            -> void
-            {
-                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                auto const gridBlockExtent(
-                    workdiv::getWorkDiv<Grid, Blocks>(*this));
-                auto const blockThreadExtent(
-                    workdiv::getWorkDiv<Block, Threads>(*this));
-                auto const threadElemExtent(
-                    workdiv::getWorkDiv<Thread, Elems>(*this));
-
-                // Get the size of the block shared dynamic memory.
-                auto const blockSharedMemDynSizeBytes(
-                    meta::apply(
-                        [&](typename std::decay<TArgs>::type const & ... args)
-                        {
-                            return
-                                kernel::getBlockSharedMemDynSizeBytes<
-                                    acc::AccCpuTbbBlocks<TDim, TIdx>>(
-                                        m_kernelFnObj,
-                                        blockThreadExtent,
-                                        threadElemExtent,
-                                        args...);
-                        },
-                        m_args));
-
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                std::cout << __func__
-                    << " blockSharedMemDynSizeBytes: " << blockSharedMemDynSizeBytes << " B" << std::endl;
-#endif
-                // Bind all arguments except the accelerator.
-                // TODO: With C++14 we could create a perfectly argument forwarding function object within the constructor.
-                auto const boundKernelFnObj(
-                    meta::apply(
-                        [this](typename std::decay<TArgs>::type const & ... args)
-                        {
-                            return
-                                std::bind(
-                                    std::ref(m_kernelFnObj),
-                                    std::placeholders::_1,
-                                    std::ref(args)...);
-                        },
-                        m_args));
-
-                // The number of blocks in the grid.
-                TIdx const numBlocksInGrid(gridBlockExtent.prod());
-
-                if(blockThreadExtent.prod() != static_cast<TIdx>(1u))
-                {
-                    throw std::runtime_error("A block for the TBB accelerator can only ever have one single thread!");
-                }
-
-                tbb::parallel_for(
-                    static_cast<TIdx>(0),
-                    static_cast<TIdx>(numBlocksInGrid),
-                    [&](TIdx i){
-                         acc::AccCpuTbbBlocks<TDim, TIdx> acc(
-                             *static_cast<workdiv::WorkDivMembers<TDim, TIdx> const *>(this),
-                             blockSharedMemDynSizeBytes);
-
-                         acc.m_gridBlockIdx =
-                             idx::mapIdx<TDim::value>(
-                                 vec::Vec<dim::DimInt<1u>, TIdx>(
-                                     static_cast<TIdx>(i)
-                                  ),
-                                  gridBlockExtent
-                             );
-
-                         boundKernelFnObj(acc);
-
-                         block::shared::st::freeMem(acc);
-                });
-
-            }
-
-        private:
-            TKernelFnObj m_kernelFnObj;
-            std::tuple<typename std::decay<TArgs>::type...> m_args;
-        };
-    }
-
-    namespace acc
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CPU TBB block execution task accelerator type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx,
-                typename TKernelFnObj,
-                typename... TArgs>
-            struct AccType<
-                kernel::TaskKernelCpuTbbBlocks<TDim, TIdx, TKernelFnObj, TArgs...>>
-            {
-                using type = acc::AccCpuTbbBlocks<TDim, TIdx>;
-            };
-        }
-    }
-    namespace dev
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CPU TBB block execution task device type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx,
-                typename TKernelFnObj,
-                typename... TArgs>
-            struct DevType<
-                kernel::TaskKernelCpuTbbBlocks<TDim, TIdx, TKernelFnObj, TArgs...>>
-            {
-                using type = dev::DevCpu;
-            };
-        }
-    }
-    namespace dim
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CPU TBB block execution task dimension getter trait specialization.
-            template<
-                typename TDim,
-                typename TIdx,
-                typename TKernelFnObj,
-                typename... TArgs>
-            struct DimType<
-                kernel::TaskKernelCpuTbbBlocks<TDim, TIdx, TKernelFnObj, TArgs...>>
-            {
-                using type = TDim;
-            };
-        }
-    }
-    namespace pltf
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CPU TBB block execution task platform type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx,
-                typename TKernelFnObj,
-                typename... TArgs>
-            struct PltfType<
-                kernel::TaskKernelCpuTbbBlocks<TDim, TIdx, TKernelFnObj, TArgs...>>
-            {
-                using type = pltf::PltfCpu;
-            };
-        }
-    }
-    namespace idx
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CPU TBB block execution task idx type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx,
-                typename TKernelFnObj,
-                typename... TArgs>
-            struct IdxType<
-                kernel::TaskKernelCpuTbbBlocks<TDim, TIdx, TKernelFnObj, TArgs...>>
-            {
-                using type = TIdx;
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/alpaka/include/alpaka/kernel/TaskKernelCpuThreads.hpp b/thirdParty/alpaka/include/alpaka/kernel/TaskKernelCpuThreads.hpp
deleted file mode 100644
index 0b9cb85aec..0000000000
--- a/thirdParty/alpaka/include/alpaka/kernel/TaskKernelCpuThreads.hpp
+++ /dev/null
@@ -1,394 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz, René Widera
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED
-
-// Specialized traits.
-#include <alpaka/acc/Traits.hpp>
-#include <alpaka/dev/Traits.hpp>
-#include <alpaka/dim/Traits.hpp>
-#include <alpaka/pltf/Traits.hpp>
-#include <alpaka/idx/Traits.hpp>
-
-// Implementation details.
-#include <alpaka/acc/AccCpuThreads.hpp>
-#include <alpaka/dev/DevCpu.hpp>
-#include <alpaka/kernel/Traits.hpp>
-#include <alpaka/workdiv/WorkDivMembers.hpp>
-
-#include <alpaka/core/BoostPredef.hpp>
-#include <alpaka/core/ConcurrentExecPool.hpp>
-#include <alpaka/meta/NdLoop.hpp>
-#include <alpaka/meta/ApplyTuple.hpp>
-
-#include <algorithm>
-#include <thread>
-#include <vector>
-#include <tuple>
-#include <type_traits>
-#include <future>
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
-    #include <iostream>
-#endif
-
-namespace alpaka
-{
-    namespace kernel
-    {
-        //#############################################################################
-        //! The CPU threads execution task.
-        template<
-            typename TDim,
-            typename TIdx,
-            typename TKernelFnObj,
-            typename... TArgs>
-        class TaskKernelCpuThreads final :
-            public workdiv::WorkDivMembers<TDim, TIdx>
-        {
-        private:
-            //#############################################################################
-            //! The type given to the ConcurrentExecPool for yielding the current thread.
-            struct ThreadPoolYield
-            {
-                //-----------------------------------------------------------------------------
-                //! Yields the current thread.
-                ALPAKA_FN_HOST static auto yield()
-                -> void
-                {
-                    std::this_thread::yield();
-                }
-            };
-            //#############################################################################
-            // When using the thread pool the threads are yielding because this is faster.
-            // Using condition variables and going to sleep is very costly for real threads.
-            // Especially when the time to wait is really short (syncBlockThreads) yielding is much faster.
-            using ThreadPool = alpaka::core::detail::ConcurrentExecPool<
-                TIdx,
-                std::thread,        // The concurrent execution type.
-                std::promise,       // The promise type.
-                ThreadPoolYield>;   // The type yielding the current concurrent execution.
-
-        public:
-            //-----------------------------------------------------------------------------
-            template<
-                typename TWorkDiv>
-            ALPAKA_FN_HOST TaskKernelCpuThreads(
-                TWorkDiv && workDiv,
-                TKernelFnObj const & kernelFnObj,
-                TArgs && ... args) :
-                    workdiv::WorkDivMembers<TDim, TIdx>(std::forward<TWorkDiv>(workDiv)),
-                    m_kernelFnObj(kernelFnObj),
-                    m_args(std::forward<TArgs>(args)...)
-            {
-                static_assert(
-                    dim::Dim<typename std::decay<TWorkDiv>::type>::value == TDim::value,
-                    "The work division and the execution task have to be of the same dimensionality!");
-            }
-            //-----------------------------------------------------------------------------
-            TaskKernelCpuThreads(TaskKernelCpuThreads const &) = default;
-            //-----------------------------------------------------------------------------
-            TaskKernelCpuThreads(TaskKernelCpuThreads &&) = default;
-            //-----------------------------------------------------------------------------
-            auto operator=(TaskKernelCpuThreads const &) -> TaskKernelCpuThreads & = default;
-            //-----------------------------------------------------------------------------
-            auto operator=(TaskKernelCpuThreads &&) -> TaskKernelCpuThreads & = default;
-            //-----------------------------------------------------------------------------
-            ~TaskKernelCpuThreads() = default;
-
-            //-----------------------------------------------------------------------------
-            //! Executes the kernel function object.
-            ALPAKA_FN_HOST auto operator()() const
-            -> void
-            {
-                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                auto const gridBlockExtent(
-                    workdiv::getWorkDiv<Grid, Blocks>(*this));
-                auto const blockThreadExtent(
-                    workdiv::getWorkDiv<Block, Threads>(*this));
-                auto const threadElemExtent(
-                    workdiv::getWorkDiv<Thread, Elems>(*this));
-
-                // Get the size of the block shared dynamic memory.
-                auto const blockSharedMemDynSizeBytes(
-                    meta::apply(
-                        [&](typename std::decay<TArgs>::type const & ... args)
-                        {
-                            return
-                                kernel::getBlockSharedMemDynSizeBytes<
-                                    acc::AccCpuThreads<TDim, TIdx>>(
-                                        m_kernelFnObj,
-                                        blockThreadExtent,
-                                        threadElemExtent,
-                                        args...);
-                        },
-                        m_args));
-
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                std::cout << __func__
-                    << " blockSharedMemDynSizeBytes: " << blockSharedMemDynSizeBytes << " B" << std::endl;
-#endif
-                acc::AccCpuThreads<TDim, TIdx> acc(
-                    *static_cast<workdiv::WorkDivMembers<TDim, TIdx> const *>(this),
-                    blockSharedMemDynSizeBytes);
-
-                auto const blockThreadCount(blockThreadExtent.prod());
-                ThreadPool threadPool(blockThreadCount);
-
-                // Bind the kernel and its arguments to the grid block function.
-                auto const boundGridBlockExecHost(
-                    meta::apply(
-                        [this, &acc, &blockThreadExtent, &threadPool](typename std::decay<TArgs>::type const & ... args)
-                        {
-                            return
-                                std::bind(
-                                    &TaskKernelCpuThreads::gridBlockExecHost,
-                                    std::ref(acc),
-                                    std::placeholders::_1,
-                                    std::ref(blockThreadExtent),
-                                    std::ref(threadPool),
-                                    std::ref(m_kernelFnObj),
-                                    std::ref(args)...);
-                        },
-                        m_args));
-
-                // Execute the blocks serially.
-                meta::ndLoopIncIdx(
-                    gridBlockExtent,
-                    boundGridBlockExecHost);
-            }
-
-        private:
-            //-----------------------------------------------------------------------------
-            //! The function executed for each grid block.
-            ALPAKA_FN_HOST static auto gridBlockExecHost(
-                acc::AccCpuThreads<TDim, TIdx> & acc,
-                vec::Vec<TDim, TIdx> const & gridBlockIdx,
-                vec::Vec<TDim, TIdx> const & blockThreadExtent,
-                ThreadPool & threadPool,
-                TKernelFnObj const & kernelFnObj,
-                typename std::decay<TArgs>::type const & ... args)
-            -> void
-            {
-                    // The futures of the threads in the current block.
-                std::vector<std::future<void>> futuresInBlock;
-
-                // Set the index of the current block
-                acc.m_gridBlockIdx = gridBlockIdx;
-
-                // Bind the kernel and its arguments to the host block thread execution function.
-                auto boundBlockThreadExecHost(std::bind(
-                    &TaskKernelCpuThreads::blockThreadExecHost,
-                    std::ref(acc),
-                    std::ref(futuresInBlock),
-                    std::placeholders::_1,
-                    std::ref(threadPool),
-                    std::ref(kernelFnObj),
-                    std::ref(args)...));
-                // Execute the block threads in parallel.
-                meta::ndLoopIncIdx(
-                    blockThreadExtent,
-                    boundBlockThreadExecHost);
-// Workaround: Clang can not support this when natively compiling device code. See ConcurrentExecPool.hpp.
-#if !(BOOST_COMP_CLANG_CUDA && BOOST_ARCH_PTX)
-                // Wait for the completion of the block thread kernels.
-                std::for_each(
-                    futuresInBlock.begin(),
-                    futuresInBlock.end(),
-                    [](std::future<void> & t)
-                    {
-                        t.wait();
-                    }
-                );
-#endif
-                // Clean up.
-                futuresInBlock.clear();
-
-                acc.m_threadToIndexMap.clear();
-
-                // After a block has been processed, the shared memory has to be deleted.
-                block::shared::st::freeMem(acc);
-            }
-            //-----------------------------------------------------------------------------
-            //! The function executed for each block thread on the host.
-            ALPAKA_FN_HOST static auto blockThreadExecHost(
-                acc::AccCpuThreads<TDim, TIdx> & acc,
-#if !(BOOST_COMP_CLANG_CUDA && BOOST_ARCH_PTX)
-                std::vector<std::future<void>> & futuresInBlock,
-                vec::Vec<TDim, TIdx> const & blockThreadIdx,
-                ThreadPool & threadPool,
-#else
-                std::vector<std::future<void>> &,
-                vec::Vec<TDim, TIdx> const & blockThreadIdx,
-                ThreadPool &,
-#endif
-                TKernelFnObj const & kernelFnObj,
-                typename std::decay<TArgs>::type const & ... args)
-            -> void
-            {
-                // Bind the arguments to the accelerator block thread execution function.
-                // The blockThreadIdx is required to be copied in because the variable will get changed for the next iteration/thread.
-                auto boundBlockThreadExecAcc(
-                    [&, blockThreadIdx]()
-                    {
-                        blockThreadExecAcc(
-                            acc,
-                            blockThreadIdx,
-                            kernelFnObj,
-                            args...);
-                    });
-                // Add the bound function to the block thread pool.
-// Workaround: Clang can not support this when natively compiling device code. See ConcurrentExecPool.hpp.
-#if !(BOOST_COMP_CLANG_CUDA && BOOST_ARCH_PTX)
-                futuresInBlock.emplace_back(
-                    threadPool.enqueueTask(
-                        boundBlockThreadExecAcc));
-#else
-                (void)boundBlockThreadExecAcc;
-#endif
-            }
-            //-----------------------------------------------------------------------------
-            //! The thread entry point on the accelerator.
-            ALPAKA_FN_HOST static auto blockThreadExecAcc(
-                acc::AccCpuThreads<TDim, TIdx> & acc,
-                vec::Vec<TDim, TIdx> const & blockThreadIdx,
-                TKernelFnObj const & kernelFnObj,
-                typename std::decay<TArgs>::type const & ... args)
-            -> void
-            {
-                // We have to store the thread data before the kernel is calling any of the methods of this class depending on them.
-                auto const threadId(std::this_thread::get_id());
-
-                // Set the master thread id.
-                if(blockThreadIdx.sum() == 0)
-                {
-                    acc.m_idMasterThread = threadId;
-                }
-
-                {
-                    // The insertion of elements has to be done one thread at a time.
-                    std::lock_guard<std::mutex> lock(acc.m_mtxMapInsert);
-
-                    // Save the thread id, and index.
-                    acc.m_threadToIndexMap.emplace(threadId, blockThreadIdx);
-                }
-
-                // Sync all threads so that the maps with thread id's are complete and not changed after here.
-                syncBlockThreads(acc);
-
-                // Execute the kernel itself.
-                kernelFnObj(
-                    const_cast<acc::AccCpuThreads<TDim, TIdx> const &>(acc),
-                    args...);
-
-                // We have to sync all threads here because if a thread would finish before all threads have been started,
-                // a new thread could get the recycled (then duplicate) thread id!
-                syncBlockThreads(acc);
-            }
-
-            TKernelFnObj m_kernelFnObj;
-            std::tuple<typename std::decay<TArgs>::type...> m_args;
-        };
-    }
-
-    namespace acc
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CPU threads execution task accelerator type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx,
-                typename TKernelFnObj,
-                typename... TArgs>
-            struct AccType<
-                kernel::TaskKernelCpuThreads<TDim, TIdx, TKernelFnObj, TArgs...>>
-            {
-                using type = acc::AccCpuThreads<TDim, TIdx>;
-            };
-        }
-    }
-    namespace dev
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CPU threads execution task device type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx,
-                typename TKernelFnObj,
-                typename... TArgs>
-            struct DevType<
-                kernel::TaskKernelCpuThreads<TDim, TIdx, TKernelFnObj, TArgs...>>
-            {
-                using type = dev::DevCpu;
-            };
-        }
-    }
-    namespace dim
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CPU threads execution task dimension getter trait specialization.
-            template<
-                typename TDim,
-                typename TIdx,
-                typename TKernelFnObj,
-                typename... TArgs>
-            struct DimType<
-                kernel::TaskKernelCpuThreads<TDim, TIdx, TKernelFnObj, TArgs...>>
-            {
-                using type = TDim;
-            };
-        }
-    }
-    namespace pltf
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CPU threads execution task platform type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx,
-                typename TKernelFnObj,
-                typename... TArgs>
-            struct PltfType<
-                kernel::TaskKernelCpuThreads<TDim, TIdx, TKernelFnObj, TArgs...>>
-            {
-                using type = pltf::PltfCpu;
-            };
-        }
-    }
-    namespace idx
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CPU threads execution task idx type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx,
-                typename TKernelFnObj,
-                typename... TArgs>
-            struct IdxType<
-                kernel::TaskKernelCpuThreads<TDim, TIdx, TKernelFnObj, TArgs...>>
-            {
-                using type = TIdx;
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/alpaka/include/alpaka/kernel/TaskKernelGpuCudaRt.hpp b/thirdParty/alpaka/include/alpaka/kernel/TaskKernelGpuCudaRt.hpp
deleted file mode 100644
index a50b4d00bd..0000000000
--- a/thirdParty/alpaka/include/alpaka/kernel/TaskKernelGpuCudaRt.hpp
+++ /dev/null
@@ -1,524 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz, Erik Zenker, Matthias Werner, René Widera
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_CUDA
-    #error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
-#endif
-
-// Specialized traits.
-#include <alpaka/acc/Traits.hpp>
-#include <alpaka/dev/Traits.hpp>
-#include <alpaka/dim/Traits.hpp>
-#include <alpaka/pltf/Traits.hpp>
-#include <alpaka/idx/Traits.hpp>
-#include <alpaka/queue/Traits.hpp>
-
-// Implementation details.
-#include <alpaka/acc/AccGpuCudaRt.hpp>
-#include <alpaka/dev/DevCudaRt.hpp>
-#include <alpaka/kernel/Traits.hpp>
-#include <alpaka/queue/QueueCudaRtNonBlocking.hpp>
-#include <alpaka/queue/QueueCudaRtBlocking.hpp>
-#include <alpaka/workdiv/WorkDivMembers.hpp>
-
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
-    #include <alpaka/acc/Traits.hpp>
-    #include <alpaka/dev/Traits.hpp>
-    #include <alpaka/workdiv/WorkDivHelpers.hpp>
-#endif
-
-#include <alpaka/core/BoostPredef.hpp>
-#include <alpaka/core/Cuda.hpp>
-#include <alpaka/meta/ApplyTuple.hpp>
-#include <alpaka/meta/Metafunctions.hpp>
-
-#include <stdexcept>
-#include <tuple>
-#include <type_traits>
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
-    #include <iostream>
-#endif
-
-namespace alpaka
-{
-    namespace kernel
-    {
-        namespace cuda
-        {
-            namespace detail
-            {
-                //-----------------------------------------------------------------------------
-                //! The GPU CUDA kernel entry point.
-                // \NOTE: 'A __global__ function or function template cannot have a trailing return type.'
-                template<
-                    typename TDim,
-                    typename TIdx,
-                    typename TKernelFnObj,
-                    typename... TArgs>
-                __global__ void cudaKernel(
-                    vec::Vec<TDim, TIdx> const threadElemExtent,
-                    TKernelFnObj const kernelFnObj,
-                    TArgs ... args)
-                {
-#if BOOST_ARCH_PTX && (BOOST_ARCH_PTX < BOOST_VERSION_NUMBER(2, 0, 0))
-    #error "Cuda device capability >= 2.0 is required!"
-#endif
-
-// with clang it is not possible to query std::result_of for a pure device lambda created on the host side
-#if !(BOOST_COMP_CLANG_CUDA && BOOST_COMP_CLANG)
-                    static_assert(
-                        std::is_same<typename std::result_of<
-                            TKernelFnObj(acc::AccGpuCudaRt<TDim, TIdx> const &, TArgs const & ...)>::type, void>::value,
-                        "The TKernelFnObj is required to return void!");
-#endif
-                    acc::AccGpuCudaRt<TDim, TIdx> acc(threadElemExtent);
-
-                    kernelFnObj(
-                        const_cast<acc::AccGpuCudaRt<TDim, TIdx> const &>(acc),
-                        args...);
-                }
-
-                //-----------------------------------------------------------------------------
-                template<
-                    typename TDim,
-                    typename TIdx
-                >
-                ALPAKA_FN_HOST auto checkVecOnly3Dim(
-                    vec::Vec<TDim, TIdx> const & vec)
-                -> void
-                {
-                    for(auto i(std::min(static_cast<typename TDim::value_type>(3), TDim::value)); i<TDim::value; ++i)
-                    {
-                        if(vec[TDim::value-1u-i] != 1)
-                        {
-                            throw std::runtime_error("The CUDA accelerator supports a maximum of 3 dimensions. All work division extents of the dimensions higher 3 have to be 1!");
-                        }
-                    }
-                }
-
-                //-----------------------------------------------------------------------------
-                template<
-                    typename TDim,
-                    typename TIdx
-                >
-                ALPAKA_FN_HOST auto convertVecToCudaDim(
-                    vec::Vec<TDim, TIdx> const & vec)
-                -> dim3
-                {
-                    dim3 dim(1, 1, 1);
-                    for(auto i(static_cast<typename TDim::value_type>(0)); i<std::min(static_cast<typename TDim::value_type>(3), TDim::value); ++i)
-                    {
-                        reinterpret_cast<unsigned int *>(&dim)[i] = static_cast<unsigned int>(vec[TDim::value-1u-i]);
-                    }
-                    checkVecOnly3Dim(vec);
-                    return dim;
-                }
-            }
-        }
-
-        //#############################################################################
-        //! The GPU CUDA accelerator execution task.
-        template<
-            typename TDim,
-            typename TIdx,
-            typename TKernelFnObj,
-            typename... TArgs>
-        class TaskKernelGpuCudaRt final :
-            public workdiv::WorkDivMembers<TDim, TIdx>
-        {
-        public:
-// gcc-4.9 libstdc++ does not support std::is_trivially_copyable.
-// MSVC std::is_trivially_copyable seems to be buggy (last tested at 15.7).
-// libc++ in combination with CUDA does not seem to work.
-#if (!BOOST_COMP_MSVC) && !(defined(__GLIBCXX__) && (__GLIBCXX__)) && !(defined(_LIBCPP_VERSION) && BOOST_LANG_CUDA)
-            static_assert(
-                meta::Conjunction<
-                    std::is_trivially_copyable<
-                        TKernelFnObj>,
-                    std::is_trivially_copyable<
-                        TArgs>...
-                    >::value,
-                "The given kernel function object and its arguments have to fulfill is_trivially_copyable!");
-#endif
-
-            //-----------------------------------------------------------------------------
-            template<
-                typename TWorkDiv>
-            ALPAKA_FN_HOST TaskKernelGpuCudaRt(
-                TWorkDiv && workDiv,
-                TKernelFnObj const & kernelFnObj,
-                TArgs && ... args) :
-                    workdiv::WorkDivMembers<TDim, TIdx>(std::forward<TWorkDiv>(workDiv)),
-                    m_kernelFnObj(kernelFnObj),
-                    m_args(std::forward<TArgs>(args)...)
-            {
-                static_assert(
-                    dim::Dim<typename std::decay<TWorkDiv>::type>::value == TDim::value,
-                    "The work division and the execution task have to be of the same dimensionality!");
-            }
-            //-----------------------------------------------------------------------------
-            TaskKernelGpuCudaRt(TaskKernelGpuCudaRt const &) = default;
-            //-----------------------------------------------------------------------------
-            TaskKernelGpuCudaRt(TaskKernelGpuCudaRt &&) = default;
-            //-----------------------------------------------------------------------------
-            auto operator=(TaskKernelGpuCudaRt const &) -> TaskKernelGpuCudaRt & = default;
-            //-----------------------------------------------------------------------------
-            auto operator=(TaskKernelGpuCudaRt &&) -> TaskKernelGpuCudaRt & = default;
-            //-----------------------------------------------------------------------------
-            ~TaskKernelGpuCudaRt() = default;
-
-            TKernelFnObj m_kernelFnObj;
-            std::tuple<typename std::decay<TArgs>::type...> m_args;
-        };
-    }
-
-    namespace acc
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The GPU CUDA execution task accelerator type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx,
-                typename TKernelFnObj,
-                typename... TArgs>
-            struct AccType<
-                kernel::TaskKernelGpuCudaRt<TDim, TIdx, TKernelFnObj, TArgs...>>
-            {
-                using type = acc::AccGpuCudaRt<TDim, TIdx>;
-            };
-        }
-    }
-    namespace dev
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The GPU CUDA execution task device type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx,
-                typename TKernelFnObj,
-                typename... TArgs>
-            struct DevType<
-                kernel::TaskKernelGpuCudaRt<TDim, TIdx, TKernelFnObj, TArgs...>>
-            {
-                using type = dev::DevCudaRt;
-            };
-        }
-    }
-    namespace dim
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The GPU CUDA execution task dimension getter trait specialization.
-            template<
-                typename TDim,
-                typename TIdx,
-                typename TKernelFnObj,
-                typename... TArgs>
-            struct DimType<
-                kernel::TaskKernelGpuCudaRt<TDim, TIdx, TKernelFnObj, TArgs...>>
-            {
-                using type = TDim;
-            };
-        }
-    }
-    namespace pltf
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CPU CUDA execution task platform type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx,
-                typename TKernelFnObj,
-                typename... TArgs>
-            struct PltfType<
-                kernel::TaskKernelGpuCudaRt<TDim, TIdx, TKernelFnObj, TArgs...>>
-            {
-                using type = pltf::PltfCudaRt;
-            };
-        }
-    }
-    namespace idx
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The GPU CUDA execution task idx type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx,
-                typename TKernelFnObj,
-                typename... TArgs>
-            struct IdxType<
-                kernel::TaskKernelGpuCudaRt<TDim, TIdx, TKernelFnObj, TArgs...>>
-            {
-                using type = TIdx;
-            };
-        }
-    }
-    namespace queue
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CUDA non-blocking kernel enqueue trait specialization.
-            template<
-                typename TDim,
-                typename TIdx,
-                typename TKernelFnObj,
-                typename... TArgs>
-            struct Enqueue<
-                queue::QueueCudaRtNonBlocking,
-                kernel::TaskKernelGpuCudaRt<TDim, TIdx, TKernelFnObj, TArgs...>>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto enqueue(
-                    queue::QueueCudaRtNonBlocking & queue,
-                    kernel::TaskKernelGpuCudaRt<TDim, TIdx, TKernelFnObj, TArgs...> const & task)
-                -> void
-                {
-                    ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-                    // TODO: Check that (sizeof(TKernelFnObj) * m_3uiBlockThreadExtent.prod()) < available memory idx
-
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                    //std::size_t printfFifoSize;
-                    //cudaDeviceGetLimit(&printfFifoSize, cudaLimitPrintfFifoSize);
-                    //std::cout << __func__ << "INFO: printfFifoSize: " << printfFifoSize << std::endl;
-                    //cudaDeviceSetLimit(cudaLimitPrintfFifoSize, printfFifoSize*10);
-                    //cudaDeviceGetLimit(&printfFifoSize, cudaLimitPrintfFifoSize);
-                    //std::cout << __func__ << "INFO: printfFifoSize: " <<  printfFifoSize << std::endl;
-#endif
-                    auto const gridBlockExtent(
-                        workdiv::getWorkDiv<Grid, Blocks>(task));
-                    auto const blockThreadExtent(
-                        workdiv::getWorkDiv<Block, Threads>(task));
-                    auto const threadElemExtent(
-                        workdiv::getWorkDiv<Thread, Elems>(task));
-
-                    dim3 const gridDim(kernel::cuda::detail::convertVecToCudaDim(gridBlockExtent));
-                    dim3 const blockDim(kernel::cuda::detail::convertVecToCudaDim(blockThreadExtent));
-                    kernel::cuda::detail::checkVecOnly3Dim(threadElemExtent);
-
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                    std::cout << __func__
-                        << " gridDim: " <<  gridDim.z << " " <<  gridDim.y << " " <<  gridDim.x
-                        << " blockDim: " <<  blockDim.z << " " <<  blockDim.y << " " <<  blockDim.x
-                        << std::endl;
-#endif
-
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
-                    // This checks for a valid work division that is also compliant with the maxima of the accelerator.
-                    if(!workdiv::isValidWorkDiv<acc::AccGpuCudaRt<TDim, TIdx>>(dev::getDev(queue), task))
-                    {
-                        throw std::runtime_error("The given work division is not valid or not supported by the device of type " + acc::getAccName<acc::AccGpuCudaRt<TDim, TIdx>>() + "!");
-                    }
-#endif
-
-                    // Get the size of the block shared dynamic memory.
-                    auto const blockSharedMemDynSizeBytes(
-                        meta::apply(
-                            [&](typename std::decay<TArgs>::type const & ... args)
-                            {
-                                return
-                                    kernel::getBlockSharedMemDynSizeBytes<
-                                        acc::AccGpuCudaRt<TDim, TIdx>>(
-                                            task.m_kernelFnObj,
-                                            blockThreadExtent,
-                                            threadElemExtent,
-                                            args...);
-                            },
-                            task.m_args));
-
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                    // Log the block shared memory idx.
-                    std::cout << __func__
-                        << " BlockSharedMemDynSizeBytes: " << blockSharedMemDynSizeBytes << " B" << std::endl;
-#endif
-
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                    // Log the function attributes.
-                    cudaFuncAttributes funcAttrs;
-                    cudaFuncGetAttributes(&funcAttrs, kernel::cuda::detail::cudaKernel<TDim, TIdx, TKernelFnObj, TArgs...>);
-                    std::cout << __func__
-                        << " binaryVersion: " << funcAttrs.binaryVersion
-                        << " constSizeBytes: " << funcAttrs.constSizeBytes << " B"
-                        << " localSizeBytes: " << funcAttrs.localSizeBytes << " B"
-                        << " maxThreadsPerBlock: " << funcAttrs.maxThreadsPerBlock
-                        << " numRegs: " << funcAttrs.numRegs
-                        << " ptxVersion: " << funcAttrs.ptxVersion
-                        << " sharedSizeBytes: " << funcAttrs.sharedSizeBytes << " B"
-                        << std::endl;
-#endif
-
-                    // Set the current device.
-                    ALPAKA_CUDA_RT_CHECK(
-                        cudaSetDevice(
-                            queue.m_spQueueImpl->m_dev.m_iDevice));
-                    // Enqueue the kernel execution.
-                    // \NOTE: No const reference (const &) is allowed as the parameter type because the kernel launch language extension expects the arguments by value.
-                    // This forces the type of a float argument given with std::forward to this function to be of type float instead of e.g. "float const & __ptr64" (MSVC).
-                    // If not given by value, the kernel launch code does not copy the value but the pointer to the value location.
-                    meta::apply(
-                        [&](typename std::decay<TArgs>::type const & ... args)
-                        {
-                            kernel::cuda::detail::cudaKernel<TDim, TIdx, TKernelFnObj, typename std::decay<TArgs>::type...><<<
-                                gridDim,
-                                blockDim,
-                                static_cast<std::size_t>(blockSharedMemDynSizeBytes),
-                                queue.m_spQueueImpl->m_CudaQueue>>>(
-                                    threadElemExtent,
-                                    task.m_kernelFnObj,
-                                    args...);
-                        },
-                        task.m_args);
-
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
-                    // Wait for the kernel execution to finish but do not check error return of this call.
-                    // Do not use the alpaka::wait method because it checks the error itself but we want to give a custom error message.
-                    cudaStreamSynchronize(
-                        queue.m_spQueueImpl->m_CudaQueue);
-                    std::string const kernelName("'execution of kernel: '" + std::string(typeid(TKernelFnObj).name()) + "' failed with");
-                    ::alpaka::cuda::detail::cudaRtCheckLastError(kernelName.c_str(), __FILE__, __LINE__);
-#endif
-                }
-            };
-            //#############################################################################
-            //! The CUDA synchronous kernel enqueue trait specialization.
-            template<
-                typename TDim,
-                typename TIdx,
-                typename TKernelFnObj,
-                typename... TArgs>
-            struct Enqueue<
-                queue::QueueCudaRtBlocking,
-                kernel::TaskKernelGpuCudaRt<TDim, TIdx, TKernelFnObj, TArgs...>>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto enqueue(
-                    queue::QueueCudaRtBlocking & queue,
-                    kernel::TaskKernelGpuCudaRt<TDim, TIdx, TKernelFnObj, TArgs...> const & task)
-                -> void
-                {
-                    ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-                    // TODO: Check that (sizeof(TKernelFnObj) * m_3uiBlockThreadExtent.prod()) < available memory idx
-
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                    //std::size_t printfFifoSize;
-                    //cudaDeviceGetLimit(&printfFifoSize, cudaLimitPrintfFifoSize);
-                    //std::cout << __func__ << "INFO: printfFifoSize: " << printfFifoSize << std::endl;
-                    //cudaDeviceSetLimit(cudaLimitPrintfFifoSize, printfFifoSize*10);
-                    //cudaDeviceGetLimit(&printfFifoSize, cudaLimitPrintfFifoSize);
-                    //std::cout << __func__ << "INFO: printfFifoSize: " <<  printfFifoSize << std::endl;
-#endif
-                    auto const gridBlockExtent(
-                        workdiv::getWorkDiv<Grid, Blocks>(task));
-                    auto const blockThreadExtent(
-                        workdiv::getWorkDiv<Block, Threads>(task));
-                    auto const threadElemExtent(
-                        workdiv::getWorkDiv<Thread, Elems>(task));
-
-                    dim3 const gridDim(kernel::cuda::detail::convertVecToCudaDim(gridBlockExtent));
-                    dim3 const blockDim(kernel::cuda::detail::convertVecToCudaDim(blockThreadExtent));
-                    kernel::cuda::detail::checkVecOnly3Dim(threadElemExtent);
-
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                    std::cout << __func__ << "gridDim: " <<  gridDim.z << " " <<  gridDim.y << " " <<  gridDim.x << std::endl;
-                    std::cout << __func__ << "blockDim: " <<  blockDim.z << " " <<  blockDim.y << " " <<  blockDim.x << std::endl;
-#endif
-
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
-                    // This checks for a valid work division that is also compliant with the maxima of the accelerator.
-                    if(!workdiv::isValidWorkDiv<acc::AccGpuCudaRt<TDim, TIdx>>(dev::getDev(queue), task))
-                    {
-                        throw std::runtime_error("The given work division is not valid or not supported by the device of type " + acc::getAccName<acc::AccGpuCudaRt<TDim, TIdx>>() + "!");
-                    }
-#endif
-
-                    // Get the size of the block shared dynamic memory.
-                    auto const blockSharedMemDynSizeBytes(
-                        meta::apply(
-                            [&](typename std::decay<TArgs>::type const & ... args)
-                            {
-                                return
-                                    kernel::getBlockSharedMemDynSizeBytes<
-                                        acc::AccGpuCudaRt<TDim, TIdx>>(
-                                            task.m_kernelFnObj,
-                                            blockThreadExtent,
-                                            threadElemExtent,
-                                            args...);
-                            },
-                            task.m_args));
-
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                    // Log the block shared memory idx.
-                    std::cout << __func__
-                        << " BlockSharedMemDynSizeBytes: " << blockSharedMemDynSizeBytes << " B" << std::endl;
-#endif
-
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                    // Log the function attributes.
-                    cudaFuncAttributes funcAttrs;
-                    cudaFuncGetAttributes(&funcAttrs, kernel::cuda::detail::cudaKernel<TDim, TIdx, TKernelFnObj, typename std::decay<TArgs>::type...>);
-                    std::cout << __func__
-                        << " binaryVersion: " << funcAttrs.binaryVersion
-                        << " constSizeBytes: " << funcAttrs.constSizeBytes << " B"
-                        << " localSizeBytes: " << funcAttrs.localSizeBytes << " B"
-                        << " maxThreadsPerBlock: " << funcAttrs.maxThreadsPerBlock
-                        << " numRegs: " << funcAttrs.numRegs
-                        << " ptxVersion: " << funcAttrs.ptxVersion
-                        << " sharedSizeBytes: " << funcAttrs.sharedSizeBytes << " B"
-                        << std::endl;
-#endif
-
-                    // Set the current device.
-                    ALPAKA_CUDA_RT_CHECK(
-                        cudaSetDevice(
-                            queue.m_spQueueImpl->m_dev.m_iDevice));
-                    // Enqueue the kernel execution.
-                    meta::apply(
-                        [&](typename std::decay<TArgs>::type const & ... args)
-                        {
-                            kernel::cuda::detail::cudaKernel<TDim, TIdx, TKernelFnObj, typename std::decay<TArgs>::type...><<<
-                                gridDim,
-                                blockDim,
-                                static_cast<std::size_t>(blockSharedMemDynSizeBytes),
-                                queue.m_spQueueImpl->m_CudaQueue>>>(
-                                    threadElemExtent,
-                                    task.m_kernelFnObj,
-                                    args...);
-                        },
-                        task.m_args);
-
-                    // Wait for the kernel execution to finish but do not check error return of this call.
-                    // Do not use the alpaka::wait method because it checks the error itself but we want to give a custom error message.
-                    cudaStreamSynchronize(
-                        queue.m_spQueueImpl->m_CudaQueue);
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
-                    std::string const kernelName("'execution of kernel: '" + std::string(typeid(TKernelFnObj).name()) + "' failed with");
-                    ::alpaka::cuda::detail::cudaRtCheckLastError(kernelName.c_str(), __FILE__, __LINE__);
-#endif
-                }
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/alpaka/include/alpaka/kernel/TaskKernelGpuHipRt.hpp b/thirdParty/alpaka/include/alpaka/kernel/TaskKernelGpuHipRt.hpp
deleted file mode 100644
index bf8cb5de77..0000000000
--- a/thirdParty/alpaka/include/alpaka/kernel/TaskKernelGpuHipRt.hpp
+++ /dev/null
@@ -1,548 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz, Erik Zenker, Matthias Werner, René Widera
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_HIP
-    #error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
-#endif
-
-// Specialized traits.
-#include <alpaka/acc/Traits.hpp>
-#include <alpaka/dev/Traits.hpp>
-#include <alpaka/dim/Traits.hpp>
-#include <alpaka/pltf/Traits.hpp>
-#include <alpaka/idx/Traits.hpp>
-#include <alpaka/queue/Traits.hpp>
-
-// Implementation details.
-#include <alpaka/acc/AccGpuHipRt.hpp>
-#include <alpaka/dev/DevHipRt.hpp>
-#include <alpaka/kernel/Traits.hpp>
-#include <alpaka/queue/QueueHipRtBlocking.hpp>
-#include <alpaka/queue/QueueHipRtNonBlocking.hpp>
-#include <alpaka/workdiv/WorkDivMembers.hpp>
-
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
-    #include <alpaka/acc/Traits.hpp>
-    #include <alpaka/dev/Traits.hpp>
-    #include <alpaka/workdiv/WorkDivHelpers.hpp>
-#endif
-
-#include <alpaka/core/BoostPredef.hpp>
-#include <alpaka/core/Hip.hpp>
-#include <alpaka/core/Utility.hpp>
-#include <alpaka/meta/ApplyTuple.hpp>
-#include <alpaka/meta/Metafunctions.hpp>
-
-#include <stdexcept>
-#include <tuple>
-#include <type_traits>
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
-    #include <iostream>
-#endif
-
-namespace alpaka
-{
-    namespace kernel
-    {
-        namespace hip
-        {
-            namespace detail
-            {
-                //-----------------------------------------------------------------------------
-                //! The GPU HIP kernel entry point.
-                // \NOTE: 'A __global__ function or function template cannot have a trailing return type.'
-                template<
-                    typename TDim,
-                    typename TIdx,
-                    typename TKernelFnObj,
-                    typename... TArgs>
-                __global__ void hipKernel(
-                    hipLaunchParm lp,
-                    vec::Vec<TDim, TIdx> const threadElemExtent,
-                    TKernelFnObj const kernelFnObj,
-                    TArgs ... args)
-                {
-#if BOOST_ARCH_PTX && (BOOST_ARCH_PTX < BOOST_VERSION_NUMBER(2, 0, 0))
-    #error "Cuda device capability >= 2.0 is required!"
-#endif
-
-#pragma clang diagnostic push
-#pragma clang diagnostic ignored "-Wignored-attributes"
-                    static_assert(
-                        std::is_same<
-                            decltype(kernelFnObj(
-                                alpaka::core::declval<acc::AccGpuHipRt<TDim, TIdx> const>(),
-                                args...)),
-                        void>::value,
-                        "The TKernelFnObj is required to return void!");
-#pragma clang diagnostic pop
-
-                    acc::AccGpuHipRt<TDim, TIdx> acc(threadElemExtent);
-
-                    kernelFnObj(
-                        const_cast<acc::AccGpuHipRt<TDim, TIdx> const &>(acc),
-                        args...);
-                }
-
-                //-----------------------------------------------------------------------------
-                template<
-                    typename TDim,
-                    typename TIdx
-                    >
-                ALPAKA_FN_HOST auto checkVecOnly3Dim(
-                    vec::Vec<TDim, TIdx> const & vec)
-                    -> void
-                {
-                    for(auto i(std::min(static_cast<typename TDim::value_type>(3), TDim::value)); i<TDim::value; ++i)
-                    {
-                        if(vec[TDim::value-1u-i] != 1)
-                        {
-                            throw std::runtime_error("The CUDA accelerator supports a maximum of 3 dimensions. All work division extents of the dimensions higher 3 have to be 1!");
-                        }
-                    }
-                }
-
-                //-----------------------------------------------------------------------------
-                template<
-                    typename TDim,
-                    typename TIdx
-                    >
-                ALPAKA_FN_HOST auto convertVecToHipDim(
-                    vec::Vec<TDim, TIdx> const & vec)
-                    -> dim3
-                {
-                    dim3 dim(1, 1, 1);
-                    for(auto i(static_cast<typename TDim::value_type>(0)); i<std::min(static_cast<typename TDim::value_type>(3), TDim::value); ++i)
-                    {
-                        reinterpret_cast<unsigned int *>(&dim)[i] = static_cast<unsigned int>(vec[TDim::value-1u-i]);
-                    }
-                    checkVecOnly3Dim(vec);
-                    return dim;
-                }
-
-            }
-        }
-        //#############################################################################
-        //! The GPU HIP accelerator execution task.
-        template<
-            typename TDim,
-            typename TIdx,
-            typename TKernelFnObj,
-            typename... TArgs>
-        class TaskKernelGpuHipRt final :
-            public workdiv::WorkDivMembers<TDim, TIdx>
-        {
-        public:
-// gcc-4.9 libstdc++ does not support std::is_trivially_copyable.
-// MSVC std::is_trivially_copyable seems to be buggy (last tested at 15.7).
-#if (!__GLIBCXX__) && (!BOOST_COMP_MSVC)
-            static_assert(
-                meta::Conjunction<
-                    std::is_trivially_copyable<
-                        TKernelFnObj>,
-                    std::is_trivially_copyable<
-                        TArgs>...
-                    >::value,
-                "The given kernel function object and its arguments have to fulfill is_trivially_copyable!");
-#endif
-
-            //-----------------------------------------------------------------------------
-            //! Constructor.
-            template<
-                typename TWorkDiv>
-            ALPAKA_FN_HOST TaskKernelGpuHipRt(
-                TWorkDiv && workDiv,
-                TKernelFnObj const & kernelFnObj,
-                TArgs && ... args) :
-                    workdiv::WorkDivMembers<TDim, TIdx>(std::forward<TWorkDiv>(workDiv)),
-                    m_kernelFnObj(kernelFnObj),
-                    m_args(std::forward<TArgs>(args)...)
-            {
-                static_assert(
-                    dim::Dim<typename std::decay<TWorkDiv>::type>::value == TDim::value,
-                    "The work division and the execution task have to be of the same dimensionality!");
-            }
-            //-----------------------------------------------------------------------------
-            //! Copy constructor.
-            TaskKernelGpuHipRt(TaskKernelGpuHipRt const &) = default;
-            //-----------------------------------------------------------------------------
-            //! Move constructor.
-            TaskKernelGpuHipRt(TaskKernelGpuHipRt &&) = default;
-            //-----------------------------------------------------------------------------
-            //! Copy assignment operator.
-            auto operator=(TaskKernelGpuHipRt const &) -> TaskKernelGpuHipRt & = default;
-            //-----------------------------------------------------------------------------
-            //! Move assignment operator.
-            auto operator=(TaskKernelGpuHipRt &&) -> TaskKernelGpuHipRt & = default;
-            //-----------------------------------------------------------------------------
-            //! Destructor.
-            ALPAKA_FN_HOST_ACC ~TaskKernelGpuHipRt() = default;
-
-            TKernelFnObj m_kernelFnObj;
-            std::tuple<typename std::decay<TArgs>::type...> m_args;
-        };
-    }
-
-    namespace acc
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The GPU HIP execution task accelerator type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx,
-                typename TKernelFnObj,
-                typename... TArgs>
-            struct AccType<
-                kernel::TaskKernelGpuHipRt<TDim, TIdx, TKernelFnObj, TArgs...>>
-            {
-                using type = acc::AccGpuHipRt<TDim, TIdx>;
-            };
-        }
-    }
-    namespace dev
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The GPU HIP execution task device type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx,
-                typename TKernelFnObj,
-                typename... TArgs>
-            struct DevType<
-                kernel::TaskKernelGpuHipRt<TDim, TIdx, TKernelFnObj, TArgs...>>
-            {
-                using type = dev::DevHipRt;
-            };
-        }
-    }
-    namespace dim
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The GPU HIP execution task dimension getter trait specialization.
-            template<
-                typename TDim,
-                typename TIdx,
-                typename TKernelFnObj,
-                typename... TArgs>
-            struct DimType<
-                kernel::TaskKernelGpuHipRt<TDim, TIdx, TKernelFnObj, TArgs...>>
-            {
-                using type = TDim;
-            };
-        }
-    }
-    namespace pltf
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CPU HIP execution task platform type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx,
-                typename TKernelFnObj,
-                typename... TArgs>
-            struct PltfType<
-                kernel::TaskKernelGpuHipRt<TDim, TIdx, TKernelFnObj, TArgs...>>
-            {
-                using type = pltf::PltfHipRt;
-            };
-        }
-    }
-    namespace idx
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The GPU HIP execution task idx type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx,
-                typename TKernelFnObj,
-                typename... TArgs>
-            struct IdxType<
-                kernel::TaskKernelGpuHipRt<TDim, TIdx, TKernelFnObj, TArgs...>>
-            {
-                using type = TIdx;
-            };
-        }
-    }
-    namespace queue
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The HIP non-blocking kernel enqueue trait specialization.
-            template<
-                typename TDim,
-                typename TIdx,
-                typename TKernelFnObj,
-                typename... TArgs>
-            struct Enqueue<
-                queue::QueueHipRtNonBlocking,
-                kernel::TaskKernelGpuHipRt<TDim, TIdx, TKernelFnObj, TArgs...>>
-            {
-                //-----------------------------------------------------------------------------
-
-                ALPAKA_FN_HOST static auto enqueue(
-                    queue::QueueHipRtNonBlocking & queue,
-                    kernel::TaskKernelGpuHipRt<TDim, TIdx, TKernelFnObj, TArgs...> const & task)
-                -> void
-                {
-                    ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-                    // TODO: Check that (sizeof(TKernelFnObj) * m_3uiBlockThreadExtent.prod()) < available memory size
-
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                    //std::size_t printfFifoSize;
-                    //hipDeviceGetLimit(&printfFifoSize, hipLimitPrintfFifoSize);
-                    //std::cout << __func__ << "INFO: printfFifoSize: " << printfFifoSize << std::endl;
-                    //hipDeviceSetLimit(hipLimitPrintfFifoSize, printfFifoSize*10);
-                    //hipDeviceGetLimit(&printfFifoSize, hipLimitPrintfFifoSize);
-                    //std::cout << __func__ << "INFO: printfFifoSize: " <<  printfFifoSize << std::endl;
-#endif
-                    auto const gridBlockExtent(
-                        workdiv::getWorkDiv<Grid, Blocks>(task));
-                    auto const blockThreadExtent(
-                        workdiv::getWorkDiv<Block, Threads>(task));
-                    auto const threadElemExtent(
-                        workdiv::getWorkDiv<Thread, Elems>(task));
-
-                    dim3 const gridDim(kernel::hip::detail::convertVecToHipDim(gridBlockExtent));
-                    dim3 const blockDim(kernel::hip::detail::convertVecToHipDim(blockThreadExtent));
-                    kernel::hip::detail::checkVecOnly3Dim(threadElemExtent);
-
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                    std::cout << __func__
-                        << " gridDim: " <<  gridDim.z << " " <<  gridDim.y << " " <<  gridDim.x
-                        << " blockDim: " <<  blockDim.z << " " <<  blockDim.y << " " <<  blockDim.x
-                        << std::endl;
-#endif
-
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
-                    // This checks for a valid work division that is also compliant with the maxima of the accelerator.
-                    if(!workdiv::isValidWorkDiv<acc::AccGpuHipRt<TDim, TIdx>>(dev::getDev(queue), task))
-                    {
-                        throw std::runtime_error("The given work division is not valid or not supported by the device of type " + acc::getAccName<acc::AccGpuHipRt<TDim, TIdx>>() + "!");
-                    }
-#endif
-
-                    // Get the size of the block shared dynamic memory.
-                    auto const blockSharedMemDynSizeBytes(
-                        meta::apply(
-                            // workaround for HIP(HCC) to
-                            // avoid forbidden host-call
-                            // within host-device functions
-                            #if defined(BOOST_COMP_HCC) && BOOST_COMP_HCC
-                            ALPAKA_FN_HOST_ACC
-                            #endif
-                            [&](typename std::decay<TArgs>::type const & ... args)
-                            {
-                                return
-                                    kernel::getBlockSharedMemDynSizeBytes<
-                                        acc::AccGpuHipRt<TDim, TIdx>>(
-                                            task.m_kernelFnObj,
-                                            blockThreadExtent,
-                                            threadElemExtent,
-                                            args...);
-                            },
-                            task.m_args));
-
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                    // Log the block shared memory size.
-                    std::cout << __func__
-                        << " BlockSharedMemDynSizeBytes: " << blockSharedMemDynSizeBytes << " B" << std::endl;
-#endif
-
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                    // Log the function attributes.
-                    /*hipFuncAttributes funcAttrs;
-                    hipFuncGetAttributes(&funcAttrs, kernel::hip::detail::hipKernel<TDim, TIdx, TKernelFnObj, typename std::decay<TArgs>::type...>);
-                    std::cout << __func__
-                        << " binaryVersion: " << funcAttrs.binaryVersion
-                        << " constSizeBytes: " << funcAttrs.constSizeBytes << " B"
-                        << " localSizeBytes: " << funcAttrs.localSizeBytes << " B"
-                        << " maxThreadsPerBlock: " << funcAttrs.maxThreadsPerBlock
-                        << " numRegs: " << funcAttrs.numRegs
-                        << " ptxVersion: " << funcAttrs.ptxVersion
-                        << " sharedSizeBytes: " << funcAttrs.sharedSizeBytes << " B"
-                        << std::endl; */
-#endif
-
-                    // Set the current device.
-                    ALPAKA_HIP_RT_CHECK(
-                        hipSetDevice(
-                            queue.m_spQueueImpl->m_dev.m_iDevice));
-
-                    meta::apply(
-                        [&](typename std::decay<TArgs>::type const & ... args)
-                        {
-                            hipLaunchKernelGGL(
-                                HIP_KERNEL_NAME(kernel::hip::detail::hipKernel< TDim, TIdx, TKernelFnObj, typename std::decay<TArgs>::type... >),
-                                gridDim,
-                                blockDim,
-                                static_cast<std::uint32_t>(blockSharedMemDynSizeBytes),
-                                queue.m_spQueueImpl->m_HipQueue,
-                                hipLaunchParm{},
-                                threadElemExtent,
-                                task.m_kernelFnObj,
-                                args...
-                            );
-
-                        },
-                        task.m_args);
-
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
-                    // Wait for the kernel execution to finish but do not check error return of this call.
-                    // Do not use the alpaka::wait method because it checks the error itself but we want to give a custom error message.
-                    hipStreamSynchronize(
-                        queue.m_spQueueImpl->m_HipQueue);
-                    std::string const kernelName("'execution of kernel: '" + std::string(typeid(TKernelFnObj).name()) + "' failed with");
-                    ::alpaka::hip::detail::hipRtCheckLastError(kernelName.c_str(), __FILE__, __LINE__);
-#endif
-                }
-            };
-            //#############################################################################
-            //! The HIP synchronous kernel enqueue trait specialization.
-            template<
-                typename TDim,
-                typename TIdx,
-                typename TKernelFnObj,
-                typename... TArgs>
-            struct Enqueue<
-                queue::QueueHipRtBlocking,
-                kernel::TaskKernelGpuHipRt<TDim, TIdx, TKernelFnObj, TArgs...>>
-            {
-                //-----------------------------------------------------------------------------
-
-                ALPAKA_FN_HOST static auto enqueue(
-                    queue::QueueHipRtBlocking & queue,
-                    kernel::TaskKernelGpuHipRt<TDim, TIdx, TKernelFnObj, TArgs...> const & task)
-                -> void
-                {
-                    ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-                    // TODO: Check that (sizeof(TKernelFnObj) * m_3uiBlockThreadExtent.prod()) < available memory size
-
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                    //std::size_t printfFifoSize;
-                    //hipDeviceGetLimit(&printfFifoSize, hipLimitPrintfFifoSize);
-                    //std::cout << __func__ << "INFO: printfFifoSize: " << printfFifoSize << std::endl;
-                    //hipDeviceSetLimit(hipLimitPrintfFifoSize, printfFifoSize*10);
-                    //hipDeviceGetLimit(&printfFifoSize, hipLimitPrintfFifoSize);
-                    //std::cout << __func__ << "INFO: printfFifoSize: " <<  printfFifoSize << std::endl;
-#endif
-                    auto const gridBlockExtent(
-                        workdiv::getWorkDiv<Grid, Blocks>(task));
-                    auto const blockThreadExtent(
-                        workdiv::getWorkDiv<Block, Threads>(task));
-                    auto const threadElemExtent(
-                        workdiv::getWorkDiv<Thread, Elems>(task));
-
-                    dim3 gridDim(kernel::hip::detail::convertVecToHipDim(gridBlockExtent));
-                    dim3 blockDim(kernel::hip::detail::convertVecToHipDim(blockThreadExtent));
-                    kernel::hip::detail::checkVecOnly3Dim(threadElemExtent);
-
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                    std::cout << __func__ << "gridDim: " <<  gridDim.z << " " <<  gridDim.y << " " <<  gridDim.x << std::endl;
-                    std::cout << __func__ << "blockDim: " <<  blockDim.z << " " <<  blockDim.y << " " <<  blockDim.x << std::endl;
-#endif
-
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
-                    // This checks for a valid work division that is also compliant with the maxima of the accelerator.
-                    if(!workdiv::isValidWorkDiv<acc::AccGpuHipRt<TDim, TIdx>>(dev::getDev(queue), task))
-                    {
-                        throw std::runtime_error("The given work division is not valid or not supported by the device of type " + acc::getAccName<acc::AccGpuHipRt<TDim, TIdx>>() + "!");
-                    }
-#endif
-
-                    // Get the size of the block shared dynamic memory.
-                    auto const blockSharedMemDynSizeBytes(
-                        meta::apply(
-                            [&](typename std::decay<TArgs>::type const & ... args)
-                            {
-                                return
-                                    kernel::getBlockSharedMemDynSizeBytes<
-                                        acc::AccGpuHipRt<TDim, TIdx>>(
-                                            task.m_kernelFnObj,
-                                            blockThreadExtent,
-                                            threadElemExtent,
-                                            args...);
-                            },
-                            task.m_args));
-
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                    // Log the block shared memory size.
-                    std::cout << __func__
-                        << " BlockSharedMemDynSizeBytes: " << blockSharedMemDynSizeBytes << " B" << std::endl;
-#endif
-
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                    // hipFuncAttributes not ported from HIP to HIP.
-                    // Log the function attributes.
-                    /*hipFuncAttributes funcAttrs;
-                    hipFuncGetAttributes(&funcAttrs, kernel::hip::detail::hipKernel<TDim, TIdx, TKernelFnObj, typename std::decay<TArgs>::type....>);
-                    std::cout << __func__
-                        << " binaryVersion: " << funcAttrs.binaryVersion
-                        << " constSizeBytes: " << funcAttrs.constSizeBytes << " B"
-                        << " localSizeBytes: " << funcAttrs.localSizeBytes << " B"
-                        << " maxThreadsPerBlock: " << funcAttrs.maxThreadsPerBlock
-                        << " numRegs: " << funcAttrs.numRegs
-                        << " ptxVersion: " << funcAttrs.ptxVersion
-                        << " sharedSizeBytes: " << funcAttrs.sharedSizeBytes << " B"
-                        << std::endl;*/
-#endif
-
-                    // Set the current device.
-                    ALPAKA_HIP_RT_CHECK(
-                        hipSetDevice(
-                            queue.m_spQueueImpl->m_dev.m_iDevice));
-
-                    meta::apply(
-                        [&](typename std::decay<TArgs>::type const & ... args)
-                        {
-                            hipLaunchKernelGGL(
-                                HIP_KERNEL_NAME(kernel::hip::detail::hipKernel< TDim, TIdx, TKernelFnObj, typename std::decay<TArgs>::type... >),
-                                gridDim,
-                                blockDim,
-                                static_cast<std::uint32_t>(blockSharedMemDynSizeBytes),
-                                queue.m_spQueueImpl->m_HipQueue,
-                                hipLaunchParm{},
-                                threadElemExtent,
-                                task.m_kernelFnObj,
-                                args...
-                            );
-                        },
-                        task.m_args);
-
-                    // Wait for the kernel execution to finish but do not check error return of this call.
-                    // Do not use the alpaka::wait method because it checks the error itself but we want to give a custom error message.
-                    hipStreamSynchronize(
-                        queue.m_spQueueImpl->m_HipQueue);
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
-                    std::string const kernelName("'execution of kernel: '" + std::string(typeid(TKernelFnObj).name()) + "' failed with");
-                    ::alpaka::hip::detail::hipRtCheckLastError(kernelName.c_str(), __FILE__, __LINE__);
-#endif
-                }
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/alpaka/include/alpaka/kernel/Traits.hpp b/thirdParty/alpaka/include/alpaka/kernel/Traits.hpp
deleted file mode 100644
index d851fd00c3..0000000000
--- a/thirdParty/alpaka/include/alpaka/kernel/Traits.hpp
+++ /dev/null
@@ -1,260 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, René Widera
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#include <alpaka/vec/Vec.hpp>
-#include <alpaka/core/Common.hpp>
-#include <alpaka/core/Unused.hpp>
-
-#include <alpaka/dim/Traits.hpp>
-#include <alpaka/idx/Traits.hpp>
-#include <alpaka/queue/Traits.hpp>
-
-#include <alpaka/core/BoostPredef.hpp>
-#include <alpaka/core/Debug.hpp>
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-    #include <alpaka/workdiv/Traits.hpp>
-#endif
-
-#include <type_traits>
-
-//-----------------------------------------------------------------------------
-//! The alpaka accelerator library.
-namespace alpaka
-{
-    //-----------------------------------------------------------------------------
-    //! The kernel specifics.
-    namespace kernel
-    {
-        //-----------------------------------------------------------------------------
-        //! The kernel traits.
-        namespace traits
-        {
-            //#############################################################################
-            //! The kernel execution task creation trait.
-            template<
-                typename TAcc,
-                typename TWorkDiv,
-                typename TKernelFnObj,
-                typename... TArgs/*,
-                typename TSfinae = void*/>
-            struct CreateTaskKernel;
-
-            //#############################################################################
-            //! The trait for getting the size of the block shared dynamic memory of a kernel.
-            //!
-            //! \tparam TKernelFnObj The kernel function object.
-            //! \tparam TAcc The accelerator.
-            //!
-            //! The default implementation returns 0.
-            template<
-                typename TKernelFnObj,
-                typename TAcc,
-                typename TSfinae = void>
-            struct BlockSharedMemDynSizeBytes
-            {
-#if BOOST_COMP_CLANG
-    #pragma clang diagnostic push
-    #pragma clang diagnostic ignored "-Wdocumentation"  // clang does not support the syntax for variadic template arguments "args,..."
-#endif
-                //-----------------------------------------------------------------------------
-                //! \param kernelFnObj The kernel object for which the block shared memory size should be calculated.
-                //! \param blockThreadExtent The block thread extent.
-                //! \param threadElemExtent The thread element extent.
-                //! \tparam TArgs The kernel invocation argument types pack.
-                //! \param args,... The kernel invocation arguments.
-                //! \return The size of the shared memory allocated for a block in bytes.
-                //! The default version always returns zero.
-#if BOOST_COMP_CLANG
-    #pragma clang diagnostic pop
-#endif
-                ALPAKA_NO_HOST_ACC_WARNING
-                template<
-                    typename TDim,
-                    typename... TArgs>
-                ALPAKA_FN_HOST_ACC static auto getBlockSharedMemDynSizeBytes(
-                    TKernelFnObj const & kernelFnObj,
-                    vec::Vec<TDim, idx::Idx<TAcc>> const & blockThreadExtent,
-                    vec::Vec<TDim, idx::Idx<TAcc>> const & threadElemExtent,
-                    TArgs const & ... args)
-                -> idx::Idx<TAcc>
-                {
-                    alpaka::ignore_unused(kernelFnObj);
-                    alpaka::ignore_unused(blockThreadExtent);
-                    alpaka::ignore_unused(threadElemExtent);
-                    alpaka::ignore_unused(args...);
-
-                    return 0;
-                }
-            };
-        }
-
-#if BOOST_COMP_CLANG
-    #pragma clang diagnostic push
-    #pragma clang diagnostic ignored "-Wdocumentation"  // clang does not support the syntax for variadic template arguments "args,..."
-#endif
-        //-----------------------------------------------------------------------------
-        //! \tparam TAcc The accelerator type.
-        //! \param kernelFnObj The kernel object for which the block shared memory size should be calculated.
-        //! \param blockThreadExtent The block thread extent.
-        //! \param threadElemExtent The thread element extent.
-        //! \param args,... The kernel invocation arguments.
-        //! \return The size of the shared memory allocated for a block in bytes.
-        //! The default implementation always returns zero.
-#if BOOST_COMP_CLANG
-    #pragma clang diagnostic pop
-#endif
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            typename TAcc,
-            typename TKernelFnObj,
-            typename TDim,
-            typename... TArgs>
-        ALPAKA_FN_HOST_ACC auto getBlockSharedMemDynSizeBytes(
-            TKernelFnObj const & kernelFnObj,
-            vec::Vec<TDim, idx::Idx<TAcc>> const & blockThreadExtent,
-            vec::Vec<TDim, idx::Idx<TAcc>> const & threadElemExtent,
-            TArgs const & ... args)
-        -> idx::Idx<TAcc>
-        {
-            return
-                traits::BlockSharedMemDynSizeBytes<
-                    TKernelFnObj,
-                    TAcc>
-                ::getBlockSharedMemDynSizeBytes(
-                    kernelFnObj,
-                    blockThreadExtent,
-                    threadElemExtent,
-                    args...);
-        }
-
-#if BOOST_COMP_CLANG
-    #pragma clang diagnostic push
-    #pragma clang diagnostic ignored "-Wdocumentation"  // clang does not support the syntax for variadic template arguments "args,..."
-#endif
-
-        namespace detail
-        {
-            //#############################################################################
-            //! Check that the return of TKernelFnObj is void
-            template<typename TAcc>
-            struct CheckFnReturnType
-            {
-                template<
-                    typename TKernelFnObj,
-                    typename... TArgs>
-                void operator()(
-                    TKernelFnObj const &,
-                    TArgs const & ...)
-                {
-                    static_assert(
-                        std::is_same<typename std::result_of<TKernelFnObj(TAcc const &, TArgs const & ...)>::type, void>::value,
-                        "The TKernelFnObj is required to return void!");
-                }
-            };
-        }
-        //-----------------------------------------------------------------------------
-        //! Creates a kernel execution task.
-        //!
-        //! \tparam TAcc The accelerator type.
-        //! \param workDiv The index domain work division.
-        //! \param kernelFnObj The kernel function object which should be executed.
-        //! \param args,... The kernel invocation arguments.
-        //! \return The kernel execution task.
-#if BOOST_COMP_CLANG
-    #pragma clang diagnostic pop
-#endif
-        template<
-            typename TAcc,
-            typename TWorkDiv,
-            typename TKernelFnObj,
-            typename... TArgs>
-        ALPAKA_FN_HOST auto createTaskKernel(
-            TWorkDiv const & workDiv,
-            TKernelFnObj const & kernelFnObj,
-            TArgs && ... args)
-#ifdef BOOST_NO_CXX14_RETURN_TYPE_DEDUCTION
-        -> decltype(
-            traits::CreateTaskKernel<
-                TAcc,
-                TWorkDiv,
-                TKernelFnObj,
-                TArgs...>
-            ::createTaskKernel(
-                workDiv,
-                kernelFnObj,
-                std::forward<TArgs>(args)...))
-#endif
-        {
-            // check for void return type
-            detail::CheckFnReturnType<TAcc>{}(kernelFnObj, args...);
-
-            static_assert(
-                dim::Dim<typename std::decay<TWorkDiv>::type>::value == dim::Dim<TAcc>::value,
-                "The dimensions of TAcc and TWorkDiv have to be identical!");
-            static_assert(
-                std::is_same<idx::Idx<typename std::decay<TWorkDiv>::type>, idx::Idx<TAcc>>::value,
-                "The idx type of TAcc and the idx type of TWorkDiv have to be identical!");
-
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-            std::cout << __func__
-                << " workDiv: " << workDiv
-                << ", kernelFnObj: " << typeid(kernelFnObj).name()
-                << std::endl;
-#endif
-            return
-                traits::CreateTaskKernel<
-                    TAcc,
-                    TWorkDiv,
-                    TKernelFnObj,
-                    TArgs...>::createTaskKernel(
-                        workDiv,
-                        kernelFnObj,
-                        std::forward<TArgs>(args)...);
-        }
-
-#if BOOST_COMP_CLANG
-    #pragma clang diagnostic push
-    #pragma clang diagnostic ignored "-Wdocumentation"  // clang does not support the syntax for variadic template arguments "args,..."
-#endif
-        //-----------------------------------------------------------------------------
-        //! Executes the given kernel in the given queue.
-        //!
-        //! \tparam TAcc The accelerator type.
-        //! \param queue The queue to enqueue the view copy task into.
-        //! \param workDiv The index domain work division.
-        //! \param kernelFnObj The kernel function object which should be executed.
-        //! \param args,... The kernel invocation arguments.
-#if BOOST_COMP_CLANG
-    #pragma clang diagnostic pop
-#endif
-        template<
-            typename TAcc,
-            typename TQueue,
-            typename TWorkDiv,
-            typename TKernelFnObj,
-            typename... TArgs>
-        ALPAKA_FN_HOST auto exec(
-            TQueue & queue,
-            TWorkDiv const & workDiv,
-            TKernelFnObj const & kernelFnObj,
-            TArgs && ... args)
-        -> void
-        {
-            queue::enqueue(
-                queue,
-                kernel::createTaskKernel<
-                    TAcc>(
-                    workDiv,
-                    kernelFnObj,
-                    std::forward<TArgs>(args)...));
-        }
-    }
-}
diff --git a/thirdParty/alpaka/include/alpaka/math/MathCudaBuiltIn.hpp b/thirdParty/alpaka/include/alpaka/math/MathCudaBuiltIn.hpp
deleted file mode 100644
index 34b8af0058..0000000000
--- a/thirdParty/alpaka/include/alpaka/math/MathCudaBuiltIn.hpp
+++ /dev/null
@@ -1,82 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_CUDA
-    #error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
-#endif
-
-#include <alpaka/math/abs/AbsCudaBuiltIn.hpp>
-#include <alpaka/math/acos/AcosCudaBuiltIn.hpp>
-#include <alpaka/math/asin/AsinCudaBuiltIn.hpp>
-#include <alpaka/math/atan/AtanCudaBuiltIn.hpp>
-#include <alpaka/math/atan2/Atan2CudaBuiltIn.hpp>
-#include <alpaka/math/cbrt/CbrtCudaBuiltIn.hpp>
-#include <alpaka/math/ceil/CeilCudaBuiltIn.hpp>
-#include <alpaka/math/cos/CosCudaBuiltIn.hpp>
-#include <alpaka/math/erf/ErfCudaBuiltIn.hpp>
-#include <alpaka/math/exp/ExpCudaBuiltIn.hpp>
-#include <alpaka/math/floor/FloorCudaBuiltIn.hpp>
-#include <alpaka/math/fmod/FmodCudaBuiltIn.hpp>
-#include <alpaka/math/log/LogCudaBuiltIn.hpp>
-#include <alpaka/math/max/MaxCudaBuiltIn.hpp>
-#include <alpaka/math/min/MinCudaBuiltIn.hpp>
-#include <alpaka/math/pow/PowCudaBuiltIn.hpp>
-#include <alpaka/math/remainder/RemainderCudaBuiltIn.hpp>
-#include <alpaka/math/round/RoundCudaBuiltIn.hpp>
-#include <alpaka/math/rsqrt/RsqrtCudaBuiltIn.hpp>
-#include <alpaka/math/sin/SinCudaBuiltIn.hpp>
-#include <alpaka/math/sincos/SinCosCudaBuiltIn.hpp>
-#include <alpaka/math/sqrt/SqrtCudaBuiltIn.hpp>
-#include <alpaka/math/tan/TanCudaBuiltIn.hpp>
-#include <alpaka/math/trunc/TruncCudaBuiltIn.hpp>
-
-namespace alpaka
-{
-    //-----------------------------------------------------------------------------
-    //! The mathematical operation specifics.
-    namespace math
-    {
-        //#############################################################################
-        //! The standard library math trait specializations.
-        class MathCudaBuiltIn :
-            public AbsCudaBuiltIn,
-            public AcosCudaBuiltIn,
-            public AsinCudaBuiltIn,
-            public AtanCudaBuiltIn,
-            public Atan2CudaBuiltIn,
-            public CbrtCudaBuiltIn,
-            public CeilCudaBuiltIn,
-            public CosCudaBuiltIn,
-            public ErfCudaBuiltIn,
-            public ExpCudaBuiltIn,
-            public FloorCudaBuiltIn,
-            public FmodCudaBuiltIn,
-            public LogCudaBuiltIn,
-            public MaxCudaBuiltIn,
-            public MinCudaBuiltIn,
-            public PowCudaBuiltIn,
-            public RemainderCudaBuiltIn,
-            public RoundCudaBuiltIn,
-            public RsqrtCudaBuiltIn,
-            public SinCudaBuiltIn,
-            public SinCosCudaBuiltIn,
-            public SqrtCudaBuiltIn,
-            public TanCudaBuiltIn,
-            public TruncCudaBuiltIn
-        {};
-    }
-}
-
-#endif
diff --git a/thirdParty/alpaka/include/alpaka/math/MathHipBuiltIn.hpp b/thirdParty/alpaka/include/alpaka/math/MathHipBuiltIn.hpp
deleted file mode 100644
index e362dfd35f..0000000000
--- a/thirdParty/alpaka/include/alpaka/math/MathHipBuiltIn.hpp
+++ /dev/null
@@ -1,82 +0,0 @@
-/* Copyright 2019 Matthias Werner
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_HIP
-    #error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
-#endif
-
-#include <alpaka/math/abs/AbsHipBuiltIn.hpp>
-#include <alpaka/math/acos/AcosHipBuiltIn.hpp>
-#include <alpaka/math/asin/AsinHipBuiltIn.hpp>
-#include <alpaka/math/atan/AtanHipBuiltIn.hpp>
-#include <alpaka/math/atan2/Atan2HipBuiltIn.hpp>
-#include <alpaka/math/cbrt/CbrtHipBuiltIn.hpp>
-#include <alpaka/math/ceil/CeilHipBuiltIn.hpp>
-#include <alpaka/math/cos/CosHipBuiltIn.hpp>
-#include <alpaka/math/erf/ErfHipBuiltIn.hpp>
-#include <alpaka/math/exp/ExpHipBuiltIn.hpp>
-#include <alpaka/math/floor/FloorHipBuiltIn.hpp>
-#include <alpaka/math/fmod/FmodHipBuiltIn.hpp>
-#include <alpaka/math/log/LogHipBuiltIn.hpp>
-#include <alpaka/math/max/MaxHipBuiltIn.hpp>
-#include <alpaka/math/min/MinHipBuiltIn.hpp>
-#include <alpaka/math/pow/PowHipBuiltIn.hpp>
-#include <alpaka/math/remainder/RemainderHipBuiltIn.hpp>
-#include <alpaka/math/round/RoundHipBuiltIn.hpp>
-#include <alpaka/math/rsqrt/RsqrtHipBuiltIn.hpp>
-#include <alpaka/math/sin/SinHipBuiltIn.hpp>
-#include <alpaka/math/sincos/SinCosHipBuiltIn.hpp>
-#include <alpaka/math/sqrt/SqrtHipBuiltIn.hpp>
-#include <alpaka/math/tan/TanHipBuiltIn.hpp>
-#include <alpaka/math/trunc/TruncHipBuiltIn.hpp>
-
-namespace alpaka
-{
-    //-----------------------------------------------------------------------------
-    //! The mathematical operation specifics.
-    namespace math
-    {
-        //#############################################################################
-        //! The standard library math trait specializations.
-        class MathHipBuiltIn :
-            public AbsHipBuiltIn,
-            public AcosHipBuiltIn,
-            public AsinHipBuiltIn,
-            public AtanHipBuiltIn,
-            public Atan2HipBuiltIn,
-            public CbrtHipBuiltIn,
-            public CeilHipBuiltIn,
-            public CosHipBuiltIn,
-            public ErfHipBuiltIn,
-            public ExpHipBuiltIn,
-            public FloorHipBuiltIn,
-            public FmodHipBuiltIn,
-            public LogHipBuiltIn,
-            public MaxHipBuiltIn,
-            public MinHipBuiltIn,
-            public PowHipBuiltIn,
-            public RemainderHipBuiltIn,
-            public RoundHipBuiltIn,
-            public RsqrtHipBuiltIn,
-            public SinCosHipBuiltIn,
-            public SinHipBuiltIn,
-            public SqrtHipBuiltIn,
-            public TanHipBuiltIn,
-            public TruncHipBuiltIn
-        {};
-    }
-}
-
-#endif
diff --git a/thirdParty/alpaka/include/alpaka/math/MathStdLib.hpp b/thirdParty/alpaka/include/alpaka/math/MathStdLib.hpp
deleted file mode 100644
index eea098bcda..0000000000
--- a/thirdParty/alpaka/include/alpaka/math/MathStdLib.hpp
+++ /dev/null
@@ -1,72 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#include <alpaka/math/abs/AbsStdLib.hpp>
-#include <alpaka/math/acos/AcosStdLib.hpp>
-#include <alpaka/math/asin/AsinStdLib.hpp>
-#include <alpaka/math/atan/AtanStdLib.hpp>
-#include <alpaka/math/atan2/Atan2StdLib.hpp>
-#include <alpaka/math/cbrt/CbrtStdLib.hpp>
-#include <alpaka/math/ceil/CeilStdLib.hpp>
-#include <alpaka/math/cos/CosStdLib.hpp>
-#include <alpaka/math/erf/ErfStdLib.hpp>
-#include <alpaka/math/exp/ExpStdLib.hpp>
-#include <alpaka/math/floor/FloorStdLib.hpp>
-#include <alpaka/math/fmod/FmodStdLib.hpp>
-#include <alpaka/math/log/LogStdLib.hpp>
-#include <alpaka/math/max/MaxStdLib.hpp>
-#include <alpaka/math/min/MinStdLib.hpp>
-#include <alpaka/math/pow/PowStdLib.hpp>
-#include <alpaka/math/remainder/RemainderStdLib.hpp>
-#include <alpaka/math/round/RoundStdLib.hpp>
-#include <alpaka/math/rsqrt/RsqrtStdLib.hpp>
-#include <alpaka/math/sin/SinStdLib.hpp>
-#include <alpaka/math/sincos/SinCosStdLib.hpp>
-#include <alpaka/math/sqrt/SqrtStdLib.hpp>
-#include <alpaka/math/tan/TanStdLib.hpp>
-#include <alpaka/math/trunc/TruncStdLib.hpp>
-
-namespace alpaka
-{
-    //-----------------------------------------------------------------------------
-    //! The mathematical operation specifics.
-    namespace math
-    {
-        //#############################################################################
-        //! The standard library math trait specializations.
-        class MathStdLib :
-            public AbsStdLib,
-            public AcosStdLib,
-            public AsinStdLib,
-            public AtanStdLib,
-            public Atan2StdLib,
-            public CbrtStdLib,
-            public CeilStdLib,
-            public CosStdLib,
-            public ErfStdLib,
-            public ExpStdLib,
-            public FloorStdLib,
-            public FmodStdLib,
-            public LogStdLib,
-            public MaxStdLib,
-            public MinStdLib,
-            public PowStdLib,
-            public RemainderStdLib,
-            public RoundStdLib,
-            public RsqrtStdLib,
-            public SinStdLib,
-            public SinCosStdLib,
-            public SqrtStdLib,
-            public TanStdLib,
-            public TruncStdLib
-        {};
-    }
-}
diff --git a/thirdParty/alpaka/include/alpaka/math/abs/AbsCudaBuiltIn.hpp b/thirdParty/alpaka/include/alpaka/math/abs/AbsCudaBuiltIn.hpp
deleted file mode 100644
index f75de34d8e..0000000000
--- a/thirdParty/alpaka/include/alpaka/math/abs/AbsCudaBuiltIn.hpp
+++ /dev/null
@@ -1,93 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Bert Wesarg
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_CUDA
-    #error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
-#endif
-
-#include <alpaka/math/abs/Traits.hpp>
-
-#include <alpaka/core/Unused.hpp>
-
-#include <cuda_runtime.h>
-
-#include <type_traits>
-
-namespace alpaka
-{
-    namespace math
-    {
-        //#############################################################################
-        //! The CUDA built in abs.
-        class AbsCudaBuiltIn : public concepts::Implements<ConceptMathAbs, AbsCudaBuiltIn>
-        {
-        };
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The CUDA built in abs trait specialization.
-            template<
-                typename TArg>
-            struct Abs<
-                AbsCudaBuiltIn,
-                TArg,
-                typename std::enable_if<
-                    std::is_floating_point<TArg>::value>::type>
-            {
-                __device__ static auto abs(
-                    AbsCudaBuiltIn const & abs_ctx,
-                    TArg const & arg)
-                -> decltype(::abs(arg))
-                {
-                    alpaka::ignore_unused(abs_ctx);
-                    return ::abs(arg);
-                }
-            };
-            //! The CUDA built in abs double specialization.
-            template<>
-            struct Abs<
-                AbsCudaBuiltIn,
-                double>
-            {
-                __device__ static auto abs(
-                    AbsCudaBuiltIn const & abs_ctx,
-                    double const & arg)
-                -> decltype(::fabs(arg))
-                {
-                    alpaka::ignore_unused(abs_ctx);
-                    return ::fabs(arg);
-                }
-            };
-            //! The CUDA built in abs float specialization.
-            template<>
-            struct Abs<
-                AbsCudaBuiltIn,
-                float>
-            {
-                __device__ static auto abs(
-                    AbsCudaBuiltIn const & abs_ctx,
-                    float const & arg)
-                -> float
-                {
-                    alpaka::ignore_unused(abs_ctx);
-                    return ::fabsf(arg);
-                }
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/alpaka/include/alpaka/math/abs/AbsHipBuiltIn.hpp b/thirdParty/alpaka/include/alpaka/math/abs/AbsHipBuiltIn.hpp
deleted file mode 100644
index e1dba07f61..0000000000
--- a/thirdParty/alpaka/include/alpaka/math/abs/AbsHipBuiltIn.hpp
+++ /dev/null
@@ -1,101 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Bert Wesarg, Matthias Werner
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_HIP
-    #error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
-#endif
-
-#include <alpaka/math/abs/Traits.hpp>
-
-#include <alpaka/core/Unused.hpp>
-
-#if BOOST_COMP_NVCC >= BOOST_VERSION_NUMBER(9, 0, 0)
-    #include <cuda_runtime_api.h>
-#else
-    #if BOOST_COMP_HCC || BOOST_COMP_HIP
-        #include <math_functions.h>
-    #else
-        #include <math_functions.hpp>
-    #endif
-#endif
-
-#include <type_traits>
-
-namespace alpaka
-{
-    namespace math
-    {
-        //#############################################################################
-        //! The HIP built in abs.
-        class AbsHipBuiltIn : public concepts::Implements<ConceptMathAbs, AbsHipBuiltIn>
-        {
-        };
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The HIP built in abs trait specialization.
-            template<
-                typename TArg>
-            struct Abs<
-                AbsHipBuiltIn,
-                TArg,
-                typename std::enable_if<
-                    std::is_floating_point<TArg>::value>::type>
-            {
-                __device__ static auto abs(
-                    AbsHipBuiltIn const & abs_ctx,
-                    TArg const & arg)
-                -> decltype(::abs(arg))
-                {
-                    alpaka::ignore_unused(abs_ctx);
-                    return ::abs(arg);
-                }
-            };
-            //! The HIP built in abs double specialization.
-            template<>
-            struct Abs<
-                AbsHipBuiltIn,
-                double>
-            {
-                __device__ static auto abs(
-                    AbsHipBuiltIn const & abs_ctx,
-                    double const & arg)
-                -> decltype(::fabs(arg))
-                {
-                    alpaka::ignore_unused(abs_ctx);
-                    return ::fabs(arg);
-                }
-            };
-            //! The HIP built in abs float specialization.
-            template<>
-            struct Abs<
-                AbsHipBuiltIn,
-                float>
-            {
-                __device__ static auto abs(
-                    AbsHipBuiltIn const & abs_ctx,
-                    float const & arg)
-                -> float
-                {
-                    alpaka::ignore_unused(abs_ctx);
-                    return ::fabsf(arg);
-                }
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/alpaka/include/alpaka/math/abs/AbsStdLib.hpp b/thirdParty/alpaka/include/alpaka/math/abs/AbsStdLib.hpp
deleted file mode 100644
index 2a32da3f16..0000000000
--- a/thirdParty/alpaka/include/alpaka/math/abs/AbsStdLib.hpp
+++ /dev/null
@@ -1,54 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#include <alpaka/math/abs/Traits.hpp>
-
-#include <alpaka/core/Unused.hpp>
-
-#include <type_traits>
-#include <cstdlib>
-#include <cmath>
-
-namespace alpaka
-{
-    namespace math
-    {
-        //#############################################################################
-        //! The standard library abs.
-        class AbsStdLib : public concepts::Implements<ConceptMathAbs, AbsStdLib>
-        {
-        };
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The standard library abs trait specialization.
-            template<
-                typename TArg>
-            struct Abs<
-                AbsStdLib,
-                TArg,
-                typename std::enable_if<
-                    std::is_arithmetic<TArg>::value
-                    && std::is_signed<TArg>::value>::type>
-            {
-                ALPAKA_FN_HOST static auto abs(
-                    AbsStdLib const & abs_ctx,
-                    TArg const & arg)
-                -> decltype(std::abs(arg))
-                {
-                    alpaka::ignore_unused(abs_ctx);
-                    return std::abs(arg);
-                }
-            };
-        }
-    }
-}
diff --git a/thirdParty/alpaka/include/alpaka/math/abs/Traits.hpp b/thirdParty/alpaka/include/alpaka/math/abs/Traits.hpp
deleted file mode 100644
index 460ca80817..0000000000
--- a/thirdParty/alpaka/include/alpaka/math/abs/Traits.hpp
+++ /dev/null
@@ -1,72 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#include <alpaka/core/Common.hpp>
-#include <alpaka/core/Concepts.hpp>
-
-#include <boost/config.hpp>
-
-#include <type_traits>
-
-namespace alpaka
-{
-    namespace math
-    {
-        struct ConceptMathAbs;
-
-        //-----------------------------------------------------------------------------
-        //! The math traits.
-        namespace traits
-        {
-            //#############################################################################
-            //! The abs trait.
-            template<
-                typename T,
-                typename TArg,
-                typename TSfinae = void>
-            struct Abs;
-        }
-
-        //-----------------------------------------------------------------------------
-        //! Computes the absolute value.
-        //!
-        //! \tparam T The type of the object specializing Abs.
-        //! \tparam TArg The arg type.
-        //! \param abs_ctx The object specializing Abs.
-        //! \param arg The arg.
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            typename T,
-            typename TArg>
-        ALPAKA_FN_HOST_ACC auto abs(
-            T const & abs_ctx,
-            TArg const & arg)
-#ifdef BOOST_NO_CXX14_RETURN_TYPE_DEDUCTION
-        -> decltype(
-            traits::Abs<
-                concepts::ImplementationBase<ConceptMathAbs, T>,
-                TArg>
-            ::abs(
-                abs_ctx,
-                arg))
-#endif
-        {
-            using ImplementationBase = concepts::ImplementationBase<ConceptMathAbs, T>;
-            return
-                traits::Abs<
-                    ImplementationBase,
-                    TArg>
-                ::abs(
-                    abs_ctx,
-                    arg);
-        }
-    }
-}
diff --git a/thirdParty/alpaka/include/alpaka/math/acos/AcosCudaBuiltIn.hpp b/thirdParty/alpaka/include/alpaka/math/acos/AcosCudaBuiltIn.hpp
deleted file mode 100644
index 67140c7ee0..0000000000
--- a/thirdParty/alpaka/include/alpaka/math/acos/AcosCudaBuiltIn.hpp
+++ /dev/null
@@ -1,78 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Bert Wesarg
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_CUDA
-    #error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
-#endif
-
-#include <alpaka/math/acos/Traits.hpp>
-
-#include <alpaka/core/Unused.hpp>
-
-#include <cuda_runtime.h>
-
-#include <type_traits>
-
-namespace alpaka
-{
-    namespace math
-    {
-        //#############################################################################
-        //! The CUDA built in acos.
-        class AcosCudaBuiltIn : public concepts::Implements<ConceptMathAcos, AcosCudaBuiltIn>
-        {
-        };
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The CUDA acos trait specialization.
-            template<
-                typename TArg>
-            struct Acos<
-                AcosCudaBuiltIn,
-                TArg,
-                typename std::enable_if<
-                    std::is_floating_point<TArg>::value>::type>
-            {
-                __device__ static auto acos(
-                    AcosCudaBuiltIn const & acos_ctx,
-                    TArg const & arg)
-                -> decltype(::acos(arg))
-                {
-                    alpaka::ignore_unused(acos_ctx);
-                    return ::acos(arg);
-                }
-            };
-
-            template<>
-            struct Acos<
-                AcosCudaBuiltIn,
-                float>
-            {
-                __device__ static auto acos(
-                    AcosCudaBuiltIn const & acos_ctx,
-                    float const & arg)
-                -> float
-                {
-                    alpaka::ignore_unused(acos_ctx);
-                    return ::acosf(arg);
-                }
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/alpaka/include/alpaka/math/acos/AcosHipBuiltIn.hpp b/thirdParty/alpaka/include/alpaka/math/acos/AcosHipBuiltIn.hpp
deleted file mode 100644
index 442b9ce865..0000000000
--- a/thirdParty/alpaka/include/alpaka/math/acos/AcosHipBuiltIn.hpp
+++ /dev/null
@@ -1,88 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Bert Wesarg, Matthias Werner
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_HIP
-    #error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
-#endif
-
-#include <alpaka/math/acos/Traits.hpp>
-
-#include <alpaka/core/Unused.hpp>
-
-#if BOOST_COMP_NVCC >= BOOST_VERSION_NUMBER(9, 0, 0)
-    #include <cuda_runtime_api.h>
-#else
-    #if BOOST_COMP_HCC || BOOST_COMP_HIP
-        #include <math_functions.h>
-    #else
-        #include <math_functions.hpp>
-    #endif
-#endif
-
-#include <type_traits>
-
-namespace alpaka
-{
-    namespace math
-    {
-        //#############################################################################
-        //! The HIP acos.
-        class AcosHipBuiltIn : public concepts::Implements<ConceptMathAcos, AcosHipBuiltIn>
-        {
-        };
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The HIP acos trait specialization.
-            template<
-                typename TArg>
-            struct Acos<
-                AcosHipBuiltIn,
-                TArg,
-                typename std::enable_if<
-                    std::is_floating_point<TArg>::value>::type>
-            {
-                ALPAKA_NO_HOST_ACC_WARNING
-                __device__ static auto acos(
-                    AcosHipBuiltIn const & acos_ctx,
-                    TArg const & arg)
-                -> decltype(::acos(arg))
-                {
-                    alpaka::ignore_unused(acos_ctx);
-                    return ::acos(arg);
-                }
-            };
-            //! The HIP acos float specialization.
-            template<>
-            struct Acos<
-                AcosHipBuiltIn,
-                float>
-            {
-                ALPAKA_NO_HOST_ACC_WARNING
-                __device__ static auto acos(
-                    AcosHipBuiltIn const & acos_ctx,
-                    float const & arg)
-                -> float
-                {
-                    alpaka::ignore_unused(acos_ctx);
-                    return ::acosf(arg);
-                }
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/alpaka/include/alpaka/math/acos/AcosStdLib.hpp b/thirdParty/alpaka/include/alpaka/math/acos/AcosStdLib.hpp
deleted file mode 100644
index aacf9d1bbe..0000000000
--- a/thirdParty/alpaka/include/alpaka/math/acos/AcosStdLib.hpp
+++ /dev/null
@@ -1,52 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#include <alpaka/math/acos/Traits.hpp>
-
-#include <alpaka/core/Unused.hpp>
-
-#include <type_traits>
-#include <cmath>
-
-namespace alpaka
-{
-    namespace math
-    {
-        //#############################################################################
-        //! The standard library acos.
-        class AcosStdLib : public concepts::Implements<ConceptMathAcos, AcosStdLib>
-        {
-        };
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The standard library acos trait specialization.
-            template<
-                typename TArg>
-            struct Acos<
-                AcosStdLib,
-                TArg,
-                typename std::enable_if<
-                    std::is_arithmetic<TArg>::value>::type>
-            {
-                ALPAKA_FN_HOST static auto acos(
-                    AcosStdLib const & acos_ctx,
-                    TArg const & arg)
-                -> decltype(std::acos(arg))
-                {
-                    alpaka::ignore_unused(acos_ctx);
-                    return std::acos(arg);
-                }
-            };
-        }
-    }
-}
diff --git a/thirdParty/alpaka/include/alpaka/math/acos/Traits.hpp b/thirdParty/alpaka/include/alpaka/math/acos/Traits.hpp
deleted file mode 100644
index 7d5b853554..0000000000
--- a/thirdParty/alpaka/include/alpaka/math/acos/Traits.hpp
+++ /dev/null
@@ -1,69 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#include <alpaka/core/Common.hpp>
-#include <alpaka/core/Concepts.hpp>
-
-#include <boost/config.hpp>
-
-#include <type_traits>
-
-namespace alpaka
-{
-    namespace math
-    {
-        struct ConceptMathAcos;
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The acos trait.
-            template<
-                typename T,
-                typename TArg,
-                typename TSfinae = void>
-            struct Acos;
-        }
-
-        //-----------------------------------------------------------------------------
-        //! Computes the principal value of the arc cosine.
-        //!
-        //! \tparam TArg The arg type.
-        //! \param acos_ctx The object specializing Acos.
-        //! \param arg The arg.
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            typename T,
-            typename TArg>
-        ALPAKA_FN_HOST_ACC auto acos(
-            T const & acos_ctx,
-            TArg const & arg)
-#ifdef BOOST_NO_CXX14_RETURN_TYPE_DEDUCTION
-        -> decltype(
-            traits::Acos<
-                concepts::ImplementationBase<ConceptMathAcos, T>,
-                TArg>
-            ::acos(
-                acos_ctx,
-                arg))
-#endif
-        {
-            using ImplementationBase = concepts::ImplementationBase<ConceptMathAcos, T>;
-            return
-                traits::Acos<
-                    ImplementationBase,
-                    TArg>
-                ::acos(
-                    acos_ctx,
-                    arg);
-        }
-    }
-}
diff --git a/thirdParty/alpaka/include/alpaka/math/asin/AsinCudaBuiltIn.hpp b/thirdParty/alpaka/include/alpaka/math/asin/AsinCudaBuiltIn.hpp
deleted file mode 100644
index 751bdc2eeb..0000000000
--- a/thirdParty/alpaka/include/alpaka/math/asin/AsinCudaBuiltIn.hpp
+++ /dev/null
@@ -1,78 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Bert Wesarg
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_CUDA
-    #error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
-#endif
-
-#include <alpaka/math/asin/Traits.hpp>
-
-#include <alpaka/core/Unused.hpp>
-
-#include <cuda_runtime.h>
-
-#include <type_traits>
-
-namespace alpaka
-{
-    namespace math
-    {
-        //#############################################################################
-        //! The CUDA built in asin.
-        class AsinCudaBuiltIn : public concepts::Implements<ConceptMathAsin, AsinCudaBuiltIn>
-        {
-        };
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The CUDA asin trait specialization.
-            template<
-                typename TArg>
-            struct Asin<
-                AsinCudaBuiltIn,
-                TArg,
-                typename std::enable_if<
-                    std::is_floating_point<TArg>::value>::type>
-            {
-                __device__ static auto asin(
-                    AsinCudaBuiltIn const & asin_ctx,
-                    TArg const & arg)
-                -> decltype(::asin(arg))
-                {
-                    alpaka::ignore_unused(asin_ctx);
-                    return ::asin(arg);
-                }
-            };
-
-            template<>
-            struct Asin<
-                AsinCudaBuiltIn,
-                float>
-            {
-                __device__ static auto asin(
-                    AsinCudaBuiltIn const & asin_ctx,
-                    float const & arg)
-                -> float
-                {
-                    alpaka::ignore_unused(asin_ctx);
-                    return ::asinf(arg);
-                }
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/alpaka/include/alpaka/math/asin/AsinHipBuiltIn.hpp b/thirdParty/alpaka/include/alpaka/math/asin/AsinHipBuiltIn.hpp
deleted file mode 100644
index 9111b4fd08..0000000000
--- a/thirdParty/alpaka/include/alpaka/math/asin/AsinHipBuiltIn.hpp
+++ /dev/null
@@ -1,86 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Bert Wesarg, Matthias Werner
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_HIP
-    #error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
-#endif
-
-#include <alpaka/math/asin/Traits.hpp>
-
-#include <alpaka/core/Unused.hpp>
-
-#if BOOST_COMP_NVCC >= BOOST_VERSION_NUMBER(9, 0, 0)
-    #include <cuda_runtime_api.h>
-#else
-    #if BOOST_COMP_HCC || BOOST_COMP_HIP
-        #include <math_functions.h>
-    #else
-        #include <math_functions.hpp>
-    #endif
-#endif
-
-#include <type_traits>
-
-namespace alpaka
-{
-    namespace math
-    {
-        //#############################################################################
-        //! The HIP asin.
-        class AsinHipBuiltIn : public concepts::Implements<ConceptMathAsin, AsinHipBuiltIn>
-        {
-        };
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The HIP asin trait specialization.
-            template<
-                typename TArg>
-            struct Asin<
-                AsinHipBuiltIn,
-                TArg,
-                typename std::enable_if<
-                    std::is_floating_point<TArg>::value>::type>
-            {
-                __device__ static auto asin(
-                    AsinHipBuiltIn const & asin_ctx,
-                    TArg const & arg)
-                -> decltype(::asin(arg))
-                {
-                    alpaka::ignore_unused(asin_ctx);
-                    return ::asin(arg);
-                }
-            };
-            //! The HIP asin float specialization.
-            template<>
-            struct Asin<
-                AsinHipBuiltIn,
-                float>
-            {
-                __device__ static auto asin(
-                    AsinHipBuiltIn const & asin_ctx,
-                    float const & arg)
-                -> float
-                {
-                    alpaka::ignore_unused(asin_ctx);
-                    return ::asinf(arg);
-                }
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/alpaka/include/alpaka/math/asin/AsinStdLib.hpp b/thirdParty/alpaka/include/alpaka/math/asin/AsinStdLib.hpp
deleted file mode 100644
index 248f0d2bfa..0000000000
--- a/thirdParty/alpaka/include/alpaka/math/asin/AsinStdLib.hpp
+++ /dev/null
@@ -1,52 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#include <alpaka/math/asin/Traits.hpp>
-
-#include <alpaka/core/Unused.hpp>
-
-#include <type_traits>
-#include <cmath>
-
-namespace alpaka
-{
-    namespace math
-    {
-        //#############################################################################
-        //! The standard library asin.
-        class AsinStdLib : public concepts::Implements<ConceptMathAsin, AsinStdLib>
-        {
-        };
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The standard library asin trait specialization.
-            template<
-                typename TArg>
-            struct Asin<
-                AsinStdLib,
-                TArg,
-                typename std::enable_if<
-                    std::is_arithmetic<TArg>::value>::type>
-            {
-                ALPAKA_FN_HOST static auto asin(
-                    AsinStdLib const & asin_ctx,
-                    TArg const & arg)
-                -> decltype(std::asin(arg))
-                {
-                    alpaka::ignore_unused(asin_ctx);
-                    return std::asin(arg);
-                }
-            };
-        }
-    }
-}
diff --git a/thirdParty/alpaka/include/alpaka/math/asin/Traits.hpp b/thirdParty/alpaka/include/alpaka/math/asin/Traits.hpp
deleted file mode 100644
index 903b5da7fe..0000000000
--- a/thirdParty/alpaka/include/alpaka/math/asin/Traits.hpp
+++ /dev/null
@@ -1,69 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#include <alpaka/core/Common.hpp>
-#include <alpaka/core/Concepts.hpp>
-
-#include <boost/config.hpp>
-
-#include <type_traits>
-
-namespace alpaka
-{
-    namespace math
-    {
-        struct ConceptMathAsin;
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The asin trait.
-            template<
-                typename T,
-                typename TArg,
-                typename TSfinae = void>
-            struct Asin;
-        }
-
-        //-----------------------------------------------------------------------------
-        //! Computes the principal value of the arc sine.
-        //!
-        //! \tparam TArg The arg type.
-        //! \param asin_ctx The object specializing Asin.
-        //! \param arg The arg.
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            typename T,
-            typename TArg>
-        ALPAKA_FN_HOST_ACC auto asin(
-            T const & asin_ctx,
-            TArg const & arg)
-#ifdef BOOST_NO_CXX14_RETURN_TYPE_DEDUCTION
-        -> decltype(
-            traits::Asin<
-                concepts::ImplementationBase<ConceptMathAsin, T>,
-                TArg>
-            ::asin(
-                asin_ctx,
-                arg))
-#endif
-        {
-            using ImplementationBase = concepts::ImplementationBase<ConceptMathAsin, T>;
-            return
-                traits::Asin<
-                    ImplementationBase,
-                    TArg>
-                ::asin(
-                    asin_ctx,
-                    arg);
-        }
-    }
-}
diff --git a/thirdParty/alpaka/include/alpaka/math/atan/AtanCudaBuiltIn.hpp b/thirdParty/alpaka/include/alpaka/math/atan/AtanCudaBuiltIn.hpp
deleted file mode 100644
index 3d96c3711a..0000000000
--- a/thirdParty/alpaka/include/alpaka/math/atan/AtanCudaBuiltIn.hpp
+++ /dev/null
@@ -1,78 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Bert Wesarg
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_CUDA
-    #error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
-#endif
-
-#include <alpaka/math/atan/Traits.hpp>
-
-#include <alpaka/core/Unused.hpp>
-
-#include <cuda_runtime.h>
-
-#include <type_traits>
-
-namespace alpaka
-{
-    namespace math
-    {
-        //#############################################################################
-        //! The CUDA built in atan.
-        class AtanCudaBuiltIn : public concepts::Implements<ConceptMathAtan, AtanCudaBuiltIn>
-        {
-        };
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The CUDA atan trait specialization.
-            template<
-                typename TArg>
-            struct Atan<
-                AtanCudaBuiltIn,
-                TArg,
-                typename std::enable_if<
-                    std::is_floating_point<TArg>::value>::type>
-            {
-                __device__ static auto atan(
-                    AtanCudaBuiltIn const & atan_ctx,
-                    TArg const & arg)
-                -> decltype(::atan(arg))
-                {
-                    alpaka::ignore_unused(atan_ctx);
-                    return ::atan(arg);
-                }
-            };
-
-            template<>
-            struct Atan<
-                AtanCudaBuiltIn,
-                float>
-            {
-                __device__ static auto atan(
-                    AtanCudaBuiltIn const & atan_ctx,
-                    float const & arg)
-                -> float
-                {
-                    alpaka::ignore_unused(atan_ctx);
-                    return ::atanf(arg);
-                }
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/alpaka/include/alpaka/math/atan/AtanHipBuiltIn.hpp b/thirdParty/alpaka/include/alpaka/math/atan/AtanHipBuiltIn.hpp
deleted file mode 100644
index bc792c38a0..0000000000
--- a/thirdParty/alpaka/include/alpaka/math/atan/AtanHipBuiltIn.hpp
+++ /dev/null
@@ -1,86 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Bert Wesarg, Matthias Werner
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_HIP
-    #error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
-#endif
-
-#include <alpaka/math/atan/Traits.hpp>
-
-#include <alpaka/core/Unused.hpp>
-
-#if BOOST_COMP_NVCC >= BOOST_VERSION_NUMBER(9, 0, 0)
-    #include <cuda_runtime_api.h>
-#else
-    #if BOOST_COMP_HCC || BOOST_COMP_HIP
-        #include <math_functions.h>
-    #else
-        #include <math_functions.hpp>
-    #endif
-#endif
-
-#include <type_traits>
-
-namespace alpaka
-{
-    namespace math
-    {
-        //#############################################################################
-        //! The HIP atan.
-        class AtanHipBuiltIn : public concepts::Implements<ConceptMathAtan, AtanHipBuiltIn>
-        {
-        };
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The HIP atan trait specialization.
-            template<
-                typename TArg>
-            struct Atan<
-                AtanHipBuiltIn,
-                TArg,
-                typename std::enable_if<
-                    std::is_floating_point<TArg>::value>::type>
-            {
-                __device__ static auto atan(
-                    AtanHipBuiltIn const & atan_ctx,
-                    TArg const & arg)
-                -> decltype(::atan(arg))
-                {
-                    alpaka::ignore_unused(atan_ctx);
-                    return ::atan(arg);
-                }
-            };
-            //! The HIP atan float specialization.
-            template<>
-            struct Atan<
-                AtanHipBuiltIn,
-                float>
-            {
-                __device__ static auto atan(
-                    AtanHipBuiltIn const & atan_ctx,
-                    float const & arg)
-                -> float
-                {
-                    alpaka::ignore_unused(atan_ctx);
-                    return ::atanf(arg);
-                }
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/alpaka/include/alpaka/math/atan/AtanStdLib.hpp b/thirdParty/alpaka/include/alpaka/math/atan/AtanStdLib.hpp
deleted file mode 100644
index 91b95358f7..0000000000
--- a/thirdParty/alpaka/include/alpaka/math/atan/AtanStdLib.hpp
+++ /dev/null
@@ -1,52 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#include <alpaka/math/atan/Traits.hpp>
-
-#include <alpaka/core/Unused.hpp>
-
-#include <type_traits>
-#include <cmath>
-
-namespace alpaka
-{
-    namespace math
-    {
-        //#############################################################################
-        //! The standard library atan.
-        class AtanStdLib : public concepts::Implements<ConceptMathAtan, AtanStdLib>
-        {
-        };
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The standard library atan trait specialization.
-            template<
-                typename TArg>
-            struct Atan<
-                AtanStdLib,
-                TArg,
-                typename std::enable_if<
-                    std::is_arithmetic<TArg>::value>::type>
-            {
-                ALPAKA_FN_HOST static auto atan(
-                    AtanStdLib const & atan_ctx,
-                    TArg const & arg)
-                -> decltype(std::atan(arg))
-                {
-                    alpaka::ignore_unused(atan_ctx);
-                    return std::atan(arg);
-                }
-            };
-        }
-    }
-}
diff --git a/thirdParty/alpaka/include/alpaka/math/atan/Traits.hpp b/thirdParty/alpaka/include/alpaka/math/atan/Traits.hpp
deleted file mode 100644
index 2303c82dd1..0000000000
--- a/thirdParty/alpaka/include/alpaka/math/atan/Traits.hpp
+++ /dev/null
@@ -1,69 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#include <alpaka/core/Common.hpp>
-#include <alpaka/core/Concepts.hpp>
-
-#include <boost/config.hpp>
-
-#include <type_traits>
-
-namespace alpaka
-{
-    namespace math
-    {
-        struct ConceptMathAtan;
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The atan trait.
-            template<
-                typename T,
-                typename TArg,
-                typename TSfinae = void>
-            struct Atan;
-        }
-
-        //-----------------------------------------------------------------------------
-        //! Computes the principal value of the arc tangent.
-        //!
-        //! \tparam TArg The arg type.
-        //! \param atan_ctx The object specializing Atan.
-        //! \param arg The arg.
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            typename T,
-            typename TArg>
-        ALPAKA_FN_HOST_ACC auto atan(
-            T const & atan_ctx,
-            TArg const & arg)
-#ifdef BOOST_NO_CXX14_RETURN_TYPE_DEDUCTION
-        -> decltype(
-            traits::Atan<
-                concepts::ImplementationBase<ConceptMathAtan, T>,
-                TArg>
-            ::atan(
-                atan_ctx,
-                arg))
-#endif
-        {
-            using ImplementationBase = concepts::ImplementationBase<ConceptMathAtan, T>;
-            return
-                traits::Atan<
-                    ImplementationBase,
-                    TArg>
-                ::atan(
-                    atan_ctx,
-                    arg);
-        }
-    }
-}
diff --git a/thirdParty/alpaka/include/alpaka/math/atan2/Atan2CudaBuiltIn.hpp b/thirdParty/alpaka/include/alpaka/math/atan2/Atan2CudaBuiltIn.hpp
deleted file mode 100644
index fb7cd0cc36..0000000000
--- a/thirdParty/alpaka/include/alpaka/math/atan2/Atan2CudaBuiltIn.hpp
+++ /dev/null
@@ -1,84 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Bert Wesarg
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_CUDA
-    #error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
-#endif
-
-#include <alpaka/math/atan2/Traits.hpp>
-
-#include <alpaka/core/Unused.hpp>
-
-#include <cuda_runtime.h>
-
-#include <type_traits>
-
-namespace alpaka
-{
-    namespace math
-    {
-        //#############################################################################
-        //! The CUDA built in atan2.
-        class Atan2CudaBuiltIn : public concepts::Implements<ConceptMathAtan2, Atan2CudaBuiltIn>
-        {
-        };
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The CUDA atan2 trait specialization.
-            template<
-                typename Ty,
-                typename Tx>
-            struct Atan2<
-                Atan2CudaBuiltIn,
-                Ty,
-                Tx,
-                typename std::enable_if<
-                    std::is_floating_point<Ty>::value
-                    && std::is_floating_point<Tx>::value>::type>
-            {
-                __device__ static auto atan2(
-                    Atan2CudaBuiltIn const & atan2_ctx,
-                    Ty const & y,
-                    Tx const & x)
-                -> decltype(::atan2(y, x))
-                {
-                    alpaka::ignore_unused(atan2_ctx);
-                    return ::atan2(y, x);
-                }
-            };
-
-            template<>
-            struct Atan2<
-                Atan2CudaBuiltIn,
-                float,
-                float>
-            {
-                __device__ static auto atan2(
-                    Atan2CudaBuiltIn const & atan2_ctx,
-                    float const & y,
-                    float const & x)
-                -> float
-                {
-                    alpaka::ignore_unused(atan2_ctx);
-                    return ::atan2f(y, x);
-                }
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/alpaka/include/alpaka/math/atan2/Atan2HipBuiltIn.hpp b/thirdParty/alpaka/include/alpaka/math/atan2/Atan2HipBuiltIn.hpp
deleted file mode 100644
index 51f13b52e6..0000000000
--- a/thirdParty/alpaka/include/alpaka/math/atan2/Atan2HipBuiltIn.hpp
+++ /dev/null
@@ -1,92 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Bert Wesarg, Matthias Werner
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_HIP
-    #error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
-#endif
-
-#include <alpaka/math/atan2/Traits.hpp>
-
-#include <alpaka/core/Unused.hpp>
-
-#if BOOST_COMP_NVCC >= BOOST_VERSION_NUMBER(9, 0, 0)
-    #include <cuda_runtime_api.h>
-#else
-    #if BOOST_COMP_HCC || BOOST_COMP_HIP
-        #include <math_functions.h>
-    #else
-        #include <math_functions.hpp>
-    #endif
-#endif
-
-#include <type_traits>
-
-namespace alpaka
-{
-    namespace math
-    {
-        //#############################################################################
-        //! The HIP atan2.
-        class Atan2HipBuiltIn : public concepts::Implements<ConceptMathAtan2, Atan2HipBuiltIn>
-        {
-        };
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The HIP atan2 trait specialization.
-            template<
-                typename Ty,
-                typename Tx>
-            struct Atan2<
-                Atan2HipBuiltIn,
-                Ty,
-                Tx,
-                typename std::enable_if<
-                    std::is_floating_point<Ty>::value
-                    && std::is_floating_point<Tx>::value>::type>
-            {
-                __device__ static auto atan2(
-                    Atan2HipBuiltIn const & atan2_ctx,
-                    Ty const & y,
-                    Tx const & x)
-                -> decltype(::atan2(y, x))
-                {
-                    alpaka::ignore_unused(atan2_ctx);
-                    return ::atan2(y, x);
-                }
-            };
-            //! The HIP sin float specialization.
-            template<>
-            struct Atan2<
-                Atan2HipBuiltIn,
-                float,
-                float>
-            {
-                __device__ static auto atan2(
-                    Atan2HipBuiltIn const & atan2_ctx,
-                    float const & y,
-                    float const & x)
-                -> float
-                {
-                    alpaka::ignore_unused(atan2_ctx);
-                    return ::atan2f(y, x);
-                }
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/alpaka/include/alpaka/math/atan2/Atan2StdLib.hpp b/thirdParty/alpaka/include/alpaka/math/atan2/Atan2StdLib.hpp
deleted file mode 100644
index f5810060b7..0000000000
--- a/thirdParty/alpaka/include/alpaka/math/atan2/Atan2StdLib.hpp
+++ /dev/null
@@ -1,56 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#include <alpaka/math/atan2/Traits.hpp>
-
-#include <alpaka/core/Unused.hpp>
-
-#include <type_traits>
-#include <cmath>
-
-namespace alpaka
-{
-    namespace math
-    {
-        //#############################################################################
-        //! The standard library atan2.
-        class Atan2StdLib : public concepts::Implements<ConceptMathAtan2, Atan2StdLib>
-        {
-        };
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The standard library atan2 trait specialization.
-            template<
-                typename Ty,
-                typename Tx>
-            struct Atan2<
-                Atan2StdLib,
-                Ty,
-                Tx,
-                typename std::enable_if<
-                    std::is_arithmetic<Ty>::value
-                    && std::is_arithmetic<Tx>::value>::type>
-            {
-                ALPAKA_FN_HOST static auto atan2(
-                    Atan2StdLib const & abs,
-                    Ty const & y,
-                    Tx const & x)
-                -> decltype(std::atan2(y, x))
-                {
-                    alpaka::ignore_unused(abs);
-                    return std::atan2(y, x);
-                }
-            };
-        }
-    }
-}
diff --git a/thirdParty/alpaka/include/alpaka/math/atan2/Traits.hpp b/thirdParty/alpaka/include/alpaka/math/atan2/Traits.hpp
deleted file mode 100644
index 32d4bda83d..0000000000
--- a/thirdParty/alpaka/include/alpaka/math/atan2/Traits.hpp
+++ /dev/null
@@ -1,79 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#include <alpaka/core/Common.hpp>
-#include <alpaka/core/Concepts.hpp>
-
-#include <boost/config.hpp>
-
-#include <type_traits>
-
-namespace alpaka
-{
-    namespace math
-    {
-        struct ConceptMathAtan2;
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The atan2 trait.
-            template<
-                typename T,
-                typename Ty,
-                typename Tx,
-                typename TSfinae = void>
-            struct Atan2;
-        }
-
-        //-----------------------------------------------------------------------------
-        //! Computes the arc tangent of y/x using the signs of arguments to determine the correct quadrant.
-        //!
-        //! \tparam T The type of the object specializing Atan2.
-        //! \tparam Ty The y arg type.
-        //! \tparam Tx The x arg type.
-        //! \param atan2_ctx The object specializing Atan2.
-        //! \param y The y arg.
-        //! \param x The x arg.
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            typename T,
-            typename Ty,
-            typename Tx>
-        ALPAKA_FN_HOST_ACC auto atan2(
-            T const & atan2_ctx,
-            Ty const & y,
-            Tx const & x)
-#ifdef BOOST_NO_CXX14_RETURN_TYPE_DEDUCTION
-        -> decltype(
-            traits::Atan2<
-                concepts::ImplementationBase<ConceptMathAtan2, T>,
-                Ty,
-                Tx>
-            ::atan2(
-                atan2_ctx,
-                y,
-                x))
-#endif
-        {
-            using ImplementationBase = concepts::ImplementationBase<ConceptMathAtan2, T>;
-            return
-                traits::Atan2<
-                    ImplementationBase,
-                    Ty,
-                    Tx>
-                ::atan2(
-                    atan2_ctx,
-                    y,
-                    x);
-        }
-    }
-}
diff --git a/thirdParty/alpaka/include/alpaka/math/cbrt/CbrtCudaBuiltIn.hpp b/thirdParty/alpaka/include/alpaka/math/cbrt/CbrtCudaBuiltIn.hpp
deleted file mode 100644
index 48dd9525be..0000000000
--- a/thirdParty/alpaka/include/alpaka/math/cbrt/CbrtCudaBuiltIn.hpp
+++ /dev/null
@@ -1,78 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_CUDA
-    #error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
-#endif
-
-#include <alpaka/math/cbrt/Traits.hpp>
-
-#include <alpaka/core/Unused.hpp>
-
-#include <cuda_runtime.h>
-
-#include <type_traits>
-
-namespace alpaka
-{
-    namespace math
-    {
-        //#############################################################################
-        //! The CUDA built in cbrt.
-        class CbrtCudaBuiltIn : public concepts::Implements<ConceptMathCbrt, CbrtCudaBuiltIn>
-        {
-        };
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The CUDA cbrt trait specialization.
-            template<
-                typename TArg>
-            struct Cbrt<
-                CbrtCudaBuiltIn,
-                TArg,
-                typename std::enable_if<
-                    std::is_arithmetic<TArg>::value>::type>
-            {
-                __device__ static auto cbrt(
-                    CbrtCudaBuiltIn const & cbrt_ctx,
-                    TArg const & arg)
-                -> decltype(::cbrt(arg))
-                {
-                    alpaka::ignore_unused(cbrt_ctx);
-                    return ::cbrt(arg);
-                }
-            };
-
-            template<>
-            struct Cbrt<
-                CbrtCudaBuiltIn,
-                float>
-            {
-                __device__ static auto cbrt(
-                    CbrtCudaBuiltIn const & cbrt_ctx,
-                    float const & arg)
-                -> float
-                {
-                    alpaka::ignore_unused(cbrt_ctx);
-                    return ::cbrtf(arg);
-                }
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/alpaka/include/alpaka/math/cbrt/CbrtHipBuiltIn.hpp b/thirdParty/alpaka/include/alpaka/math/cbrt/CbrtHipBuiltIn.hpp
deleted file mode 100644
index 2f86fe390b..0000000000
--- a/thirdParty/alpaka/include/alpaka/math/cbrt/CbrtHipBuiltIn.hpp
+++ /dev/null
@@ -1,86 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Matthias Werner
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_HIP
-    #error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
-#endif
-
-#include <alpaka/math/cbrt/Traits.hpp>
-
-#include <alpaka/core/Unused.hpp>
-
-#if BOOST_COMP_NVCC >= BOOST_VERSION_NUMBER(9, 0, 0)
-    #include <cuda_runtime_api.h>
-#else
-    #if BOOST_COMP_HCC || BOOST_COMP_HIP
-        #include <math_functions.h>
-    #else
-        #include <math_functions.hpp>
-    #endif
-#endif
-
-#include <type_traits>
-
-namespace alpaka
-{
-    namespace math
-    {
-        //#############################################################################
-        //! The HIP cbrt.
-        class CbrtHipBuiltIn : public concepts::Implements<ConceptMathCbrt, CbrtHipBuiltIn>
-        {
-        };
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The HIP cbrt trait specialization.
-            template<
-                typename TArg>
-            struct Cbrt<
-                CbrtHipBuiltIn,
-                TArg,
-                typename std::enable_if<
-                    std::is_arithmetic<TArg>::value>::type>
-            {
-                __device__ static auto cbrt(
-                    CbrtHipBuiltIn const & cbrt_ctx,
-                    TArg const & arg)
-                -> decltype(::cbrt(arg))
-                {
-                    alpaka::ignore_unused(cbrt_ctx);
-                    return ::cbrt(arg);
-                }
-            };
-            //! The HIP cbrt float specialization.
-            template<>
-            struct Cbrt<
-                CbrtHipBuiltIn,
-                float>
-            {
-                __device__ static auto cbrt(
-                    CbrtHipBuiltIn const & cbrt_ctx,
-                    float const & arg)
-                -> float
-                {
-                    alpaka::ignore_unused(cbrt_ctx);
-                    return ::cbrtf(arg);
-                }
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/alpaka/include/alpaka/math/cbrt/CbrtStdLib.hpp b/thirdParty/alpaka/include/alpaka/math/cbrt/CbrtStdLib.hpp
deleted file mode 100644
index a2da289bea..0000000000
--- a/thirdParty/alpaka/include/alpaka/math/cbrt/CbrtStdLib.hpp
+++ /dev/null
@@ -1,52 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#include <alpaka/math/cbrt/Traits.hpp>
-
-#include <alpaka/core/Unused.hpp>
-
-#include <type_traits>
-#include <cmath>
-
-namespace alpaka
-{
-    namespace math
-    {
-        //#############################################################################
-        //! The standard library cbrt.
-        class CbrtStdLib : public concepts::Implements<ConceptMathCbrt, CbrtStdLib>
-        {
-        };
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The standard library cbrt trait specialization.
-            template<
-                typename TArg>
-            struct Cbrt<
-                CbrtStdLib,
-                TArg,
-                typename std::enable_if<
-                    std::is_arithmetic<TArg>::value>::type>
-            {
-                ALPAKA_FN_HOST static auto cbrt(
-                    CbrtStdLib const & cbrt_ctx,
-                    TArg const & arg)
-                -> decltype(std::cbrt(arg))
-                {
-                    alpaka::ignore_unused(cbrt_ctx);
-                    return std::cbrt(arg);
-                }
-            };
-        }
-    }
-}
diff --git a/thirdParty/alpaka/include/alpaka/math/cbrt/Traits.hpp b/thirdParty/alpaka/include/alpaka/math/cbrt/Traits.hpp
deleted file mode 100644
index 92a6650ede..0000000000
--- a/thirdParty/alpaka/include/alpaka/math/cbrt/Traits.hpp
+++ /dev/null
@@ -1,70 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#include <alpaka/core/Common.hpp>
-#include <alpaka/core/Concepts.hpp>
-
-#include <boost/config.hpp>
-
-#include <type_traits>
-
-namespace alpaka
-{
-    namespace math
-    {
-        struct ConceptMathCbrt;
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The cbrt trait.
-            template<
-                typename T,
-                typename TArg,
-                typename TSfinae = void>
-            struct Cbrt;
-        }
-
-        //-----------------------------------------------------------------------------
-        //! Computes the cbrt.
-        //!
-        //! \tparam T The type of the object specializing Cbrt.
-        //! \tparam TArg The arg type.
-        //! \param cbrt_ctx The object specializing Cbrt.
-        //! \param arg The arg.
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            typename T,
-            typename TArg>
-        ALPAKA_FN_HOST_ACC auto cbrt(
-            T const & cbrt_ctx,
-            TArg const & arg)
-#ifdef BOOST_NO_CXX14_RETURN_TYPE_DEDUCTION
-        -> decltype(
-            traits::Cbrt<
-                concepts::ImplementationBase<ConceptMathCbrt, T>,
-                TArg>
-            ::cbrt(
-                cbrt_ctx,
-                arg))
-#endif
-        {
-            using ImplementationBase = concepts::ImplementationBase<ConceptMathCbrt, T>;
-            return
-                traits::Cbrt<
-                    ImplementationBase,
-                    TArg>
-                ::cbrt(
-                    cbrt_ctx,
-                    arg);
-        }
-    }
-}
diff --git a/thirdParty/alpaka/include/alpaka/math/ceil/CeilCudaBuiltIn.hpp b/thirdParty/alpaka/include/alpaka/math/ceil/CeilCudaBuiltIn.hpp
deleted file mode 100644
index e21b5ff0b5..0000000000
--- a/thirdParty/alpaka/include/alpaka/math/ceil/CeilCudaBuiltIn.hpp
+++ /dev/null
@@ -1,78 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Bert Wesarg
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_CUDA
-    #error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
-#endif
-
-#include <alpaka/math/ceil/Traits.hpp>
-
-#include <alpaka/core/Unused.hpp>
-
-#include <cuda_runtime.h>
-
-#include <type_traits>
-
-namespace alpaka
-{
-    namespace math
-    {
-        //#############################################################################
-        //! The CUDA built in ceil.
-        class CeilCudaBuiltIn : public concepts::Implements<ConceptMathCeil, CeilCudaBuiltIn>
-        {
-        };
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The CUDA ceil trait specialization.
-            template<
-                typename TArg>
-            struct Ceil<
-                CeilCudaBuiltIn,
-                TArg,
-                typename std::enable_if<
-                    std::is_floating_point<TArg>::value>::type>
-            {
-                __device__ static auto ceil(
-                    CeilCudaBuiltIn const & ceil_ctx,
-                    TArg const & arg)
-                -> decltype(::ceil(arg))
-                {
-                    alpaka::ignore_unused(ceil_ctx);
-                    return ::ceil(arg);
-                }
-            };
-            //
-            template<>
-            struct Ceil<
-                CeilCudaBuiltIn,
-                float>
-            {
-                __device__ static auto ceil(
-                    CeilCudaBuiltIn const & ceil_ctx,
-                    float const & arg)
-                ->float
-                {
-                    alpaka::ignore_unused(ceil_ctx);
-                    return ::ceilf(arg);
-                }
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/alpaka/include/alpaka/math/ceil/CeilHipBuiltIn.hpp b/thirdParty/alpaka/include/alpaka/math/ceil/CeilHipBuiltIn.hpp
deleted file mode 100644
index cd65ab6db5..0000000000
--- a/thirdParty/alpaka/include/alpaka/math/ceil/CeilHipBuiltIn.hpp
+++ /dev/null
@@ -1,86 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Bert Wesarg, Matthias Werner
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_HIP
-    #error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
-#endif
-
-#include <alpaka/math/ceil/Traits.hpp>
-
-#include <alpaka/core/Unused.hpp>
-
-#if BOOST_COMP_NVCC >= BOOST_VERSION_NUMBER(9, 0, 0)
-    #include <cuda_runtime_api.h>
-#else
-    #if BOOST_COMP_HCC || BOOST_COMP_HIP
-        #include <math_functions.h>
-    #else
-        #include <math_functions.hpp>
-    #endif
-#endif
-
-#include <type_traits>
-
-namespace alpaka
-{
-    namespace math
-    {
-        //#############################################################################
-        //! The HIP ceil.
-        class CeilHipBuiltIn : public concepts::Implements<ConceptMathCeil, CeilHipBuiltIn>
-        {
-        };
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The HIP ceil trait specialization.
-            template<
-                typename TArg>
-            struct Ceil<
-                CeilHipBuiltIn,
-                TArg,
-                typename std::enable_if<
-                    std::is_floating_point<TArg>::value>::type>
-            {
-                __device__ static auto ceil(
-                    CeilHipBuiltIn const & ceil_ctx,
-                    TArg const & arg)
-                -> decltype(::ceil(arg))
-                {
-                    alpaka::ignore_unused(ceil_ctx);
-                    return ::ceil(arg);
-                }
-            };
-            //! The HIP cos float specialization.
-            template<>
-            struct Ceil<
-                CeilHipBuiltIn,
-                float>
-            {
-                __device__ static auto ceil(
-                    CeilHipBuiltIn const & ceil_ctx,
-                    float const & arg)
-                -> float
-                {
-                    alpaka::ignore_unused(ceil_ctx);
-                    return ::ceilf(arg);
-                }
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/alpaka/include/alpaka/math/ceil/CeilStdLib.hpp b/thirdParty/alpaka/include/alpaka/math/ceil/CeilStdLib.hpp
deleted file mode 100644
index f1895c40b0..0000000000
--- a/thirdParty/alpaka/include/alpaka/math/ceil/CeilStdLib.hpp
+++ /dev/null
@@ -1,52 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#include <alpaka/math/ceil/Traits.hpp>
-
-#include <alpaka/core/Unused.hpp>
-
-#include <type_traits>
-#include <cmath>
-
-namespace alpaka
-{
-    namespace math
-    {
-        //#############################################################################
-        //! The standard library ceil.
-        class CeilStdLib : public concepts::Implements<ConceptMathCeil, CeilStdLib>
-        {
-        };
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The standard library ceil trait specialization.
-            template<
-                typename TArg>
-            struct Ceil<
-                CeilStdLib,
-                TArg,
-                typename std::enable_if<
-                    std::is_arithmetic<TArg>::value>::type>
-            {
-                ALPAKA_FN_HOST static auto ceil(
-                    CeilStdLib const & ceil_ctx,
-                    TArg const & arg)
-                -> decltype(std::ceil(arg))
-                {
-                    alpaka::ignore_unused(ceil_ctx);
-                    return std::ceil(arg);
-                }
-            };
-        }
-    }
-}
diff --git a/thirdParty/alpaka/include/alpaka/math/ceil/Traits.hpp b/thirdParty/alpaka/include/alpaka/math/ceil/Traits.hpp
deleted file mode 100644
index c9de6a6512..0000000000
--- a/thirdParty/alpaka/include/alpaka/math/ceil/Traits.hpp
+++ /dev/null
@@ -1,70 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#include <alpaka/core/Common.hpp>
-#include <alpaka/core/Concepts.hpp>
-
-#include <boost/config.hpp>
-
-#include <type_traits>
-
-namespace alpaka
-{
-    namespace math
-    {
-        struct ConceptMathCeil;
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The ceil trait.
-            template<
-                typename T,
-                typename TArg,
-                typename TSfinae = void>
-            struct Ceil;
-        }
-
-        //-----------------------------------------------------------------------------
-        //! Computes the smallest integer value not less than arg.
-        //!
-        //! \tparam T The type of the object specializing Ceil.
-        //! \tparam TArg The arg type.
-        //! \param ceil_ctx The object specializing Ceil.
-        //! \param arg The arg.
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            typename T,
-            typename TArg>
-        ALPAKA_FN_HOST_ACC auto ceil(
-            T const & ceil_ctx,
-            TArg const & arg)
-#ifdef BOOST_NO_CXX14_RETURN_TYPE_DEDUCTION
-        -> decltype(
-            traits::Ceil<
-                concepts::ImplementationBase<ConceptMathCeil, T>,
-                TArg>
-            ::ceil(
-                ceil_ctx,
-                arg))
-#endif
-        {
-            using ImplementationBase = concepts::ImplementationBase<ConceptMathCeil, T>;
-            return
-                traits::Ceil<
-                    ImplementationBase,
-                    TArg>
-                ::ceil(
-                    ceil_ctx,
-                    arg);
-        }
-    }
-}
diff --git a/thirdParty/alpaka/include/alpaka/math/cos/CosCudaBuiltIn.hpp b/thirdParty/alpaka/include/alpaka/math/cos/CosCudaBuiltIn.hpp
deleted file mode 100644
index 701f795aba..0000000000
--- a/thirdParty/alpaka/include/alpaka/math/cos/CosCudaBuiltIn.hpp
+++ /dev/null
@@ -1,78 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Bert Wesarg
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_CUDA
-    #error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
-#endif
-
-#include <alpaka/math/cos/Traits.hpp>
-
-#include <alpaka/core/Unused.hpp>
-
-#include <cuda_runtime.h>
-
-#include <type_traits>
-
-namespace alpaka
-{
-    namespace math
-    {
-        //#############################################################################
-        //! The CUDA built in cos.
-        class CosCudaBuiltIn : public concepts::Implements<ConceptMathCos, CosCudaBuiltIn>
-        {
-        };
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The CUDA cos trait specialization.
-            template<
-                typename TArg>
-            struct Cos<
-                CosCudaBuiltIn,
-                TArg,
-                typename std::enable_if<
-                    std::is_floating_point<TArg>::value>::type>
-            {
-                __device__ static auto cos(
-                    CosCudaBuiltIn const & cos_ctx,
-                    TArg const & arg)
-                -> decltype(::cos(arg))
-                {
-                    alpaka::ignore_unused(cos_ctx);
-                    return ::cos(arg);
-                }
-            };
-
-            template<>
-            struct Cos<
-                CosCudaBuiltIn,
-                float>
-            {
-                __device__ static auto cos(
-                    CosCudaBuiltIn const & cos_ctx,
-                    float const & arg)
-                -> float
-                {
-                    alpaka::ignore_unused(cos_ctx);
-                    return ::cosf(arg);
-                }
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/alpaka/include/alpaka/math/cos/CosHipBuiltIn.hpp b/thirdParty/alpaka/include/alpaka/math/cos/CosHipBuiltIn.hpp
deleted file mode 100644
index 4e8ab79100..0000000000
--- a/thirdParty/alpaka/include/alpaka/math/cos/CosHipBuiltIn.hpp
+++ /dev/null
@@ -1,86 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Bert Wesarg, Matthias Werner
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_HIP
-    #error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
-#endif
-
-#include <alpaka/math/cos/Traits.hpp>
-
-#include <alpaka/core/Unused.hpp>
-
-#if BOOST_COMP_NVCC >= BOOST_VERSION_NUMBER(9, 0, 0)
-    #include <cuda_runtime_api.h>
-#else
-    #if BOOST_COMP_HCC || BOOST_COMP_HIP
-        #include <math_functions.h>
-    #else
-        #include <math_functions.hpp>
-    #endif
-#endif
-
-#include <type_traits>
-
-namespace alpaka
-{
-    namespace math
-    {
-        //#############################################################################
-        //! The HIP cos.
-        class CosHipBuiltIn : public concepts::Implements<ConceptMathCos, CosHipBuiltIn>
-        {
-        };
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The HIP cos trait specialization.
-            template<
-                typename TArg>
-            struct Cos<
-                CosHipBuiltIn,
-                TArg,
-                typename std::enable_if<
-                    std::is_floating_point<TArg>::value>::type>
-            {
-                __device__ static auto cos(
-                    CosHipBuiltIn const & cos_ctx,
-                    TArg const & arg)
-                -> decltype(::cos(arg))
-                {
-                    alpaka::ignore_unused(cos_ctx);
-                    return ::cos(arg);
-                }
-            };
-            //! The HIP cos float specialization.
-            template<>
-            struct Cos<
-                CosHipBuiltIn,
-                float>
-            {
-                __device__ static auto cos(
-                    CosHipBuiltIn const & cos_ctx,
-                    float const & arg)
-                -> float
-                {
-                    alpaka::ignore_unused(cos_ctx);
-                    return ::cosf(arg);
-                }
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/alpaka/include/alpaka/math/cos/CosStdLib.hpp b/thirdParty/alpaka/include/alpaka/math/cos/CosStdLib.hpp
deleted file mode 100644
index 6acff5d064..0000000000
--- a/thirdParty/alpaka/include/alpaka/math/cos/CosStdLib.hpp
+++ /dev/null
@@ -1,52 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#include <alpaka/math/cos/Traits.hpp>
-
-#include <alpaka/core/Unused.hpp>
-
-#include <type_traits>
-#include <cmath>
-
-namespace alpaka
-{
-    namespace math
-    {
-        //#############################################################################
-        //! The standard library cos.
-        class CosStdLib : public concepts::Implements<ConceptMathCos, CosStdLib>
-        {
-        };
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The standard library cos trait specialization.
-            template<
-                typename TArg>
-            struct Cos<
-                CosStdLib,
-                TArg,
-                typename std::enable_if<
-                    std::is_arithmetic<TArg>::value>::type>
-            {
-                ALPAKA_FN_HOST static auto cos(
-                    CosStdLib const & cos_ctx,
-                    TArg const & arg)
-                -> decltype(std::cos(arg))
-                {
-                    alpaka::ignore_unused(cos_ctx);
-                    return std::cos(arg);
-                }
-            };
-        }
-    }
-}
diff --git a/thirdParty/alpaka/include/alpaka/math/cos/Traits.hpp b/thirdParty/alpaka/include/alpaka/math/cos/Traits.hpp
deleted file mode 100644
index 59302bf6c0..0000000000
--- a/thirdParty/alpaka/include/alpaka/math/cos/Traits.hpp
+++ /dev/null
@@ -1,70 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#include <alpaka/core/Common.hpp>
-#include <alpaka/core/Concepts.hpp>
-
-#include <boost/config.hpp>
-
-#include <type_traits>
-
-namespace alpaka
-{
-    namespace math
-    {
-        struct ConceptMathCos;
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The cos trait.
-            template<
-                typename T,
-                typename TArg,
-                typename TSfinae = void>
-            struct Cos;
-        }
-
-        //-----------------------------------------------------------------------------
-        //! Computes the cosine (measured in radians).
-        //!
-        //! \tparam T The type of the object specializing Cos.
-        //! \tparam TArg The arg type.
-        //! \param cos_ctx The object specializing Cos.
-        //! \param arg The arg.
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            typename T,
-            typename TArg>
-        ALPAKA_FN_HOST_ACC auto cos(
-            T const & cos_ctx,
-            TArg const & arg)
-#ifdef BOOST_NO_CXX14_RETURN_TYPE_DEDUCTION
-        -> decltype(
-            traits::Cos<
-                concepts::ImplementationBase<ConceptMathCos, T>,
-                TArg>
-            ::cos(
-                cos_ctx,
-                arg))
-#endif
-        {
-            using ImplementationBase = concepts::ImplementationBase<ConceptMathCos, T>;
-            return
-                traits::Cos<
-                    ImplementationBase,
-                    TArg>
-                ::cos(
-                    cos_ctx,
-                    arg);
-        }
-    }
-}
diff --git a/thirdParty/alpaka/include/alpaka/math/erf/ErfCudaBuiltIn.hpp b/thirdParty/alpaka/include/alpaka/math/erf/ErfCudaBuiltIn.hpp
deleted file mode 100644
index 28dab47b3b..0000000000
--- a/thirdParty/alpaka/include/alpaka/math/erf/ErfCudaBuiltIn.hpp
+++ /dev/null
@@ -1,78 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Bert Wesarg
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_CUDA
-    #error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
-#endif
-
-#include <alpaka/math/erf/Traits.hpp>
-
-#include <alpaka/core/Unused.hpp>
-
-#include <cuda_runtime.h>
-
-#include <type_traits>
-
-namespace alpaka
-{
-    namespace math
-    {
-        //#############################################################################
-        //! The CUDA built in erf.
-        class ErfCudaBuiltIn : public concepts::Implements<ConceptMathErf, ErfCudaBuiltIn>
-        {
-        };
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The CUDA erf trait specialization.
-            template<
-                typename TArg>
-            struct Erf<
-                ErfCudaBuiltIn,
-                TArg,
-                typename std::enable_if<
-                    std::is_floating_point<TArg>::value>::type>
-            {
-                __device__ static auto erf(
-                    ErfCudaBuiltIn const & erf_ctx,
-                    TArg const & arg)
-                -> decltype(::erf(arg))
-                {
-                    alpaka::ignore_unused(erf_ctx);
-                    return ::erf(arg);
-                }
-            };
-
-            template<>
-            struct Erf<
-                ErfCudaBuiltIn,
-                float>
-            {
-                __device__ static auto erf(
-                    ErfCudaBuiltIn const & erf_ctx,
-                    float const & arg)
-                -> float
-                {
-                    alpaka::ignore_unused(erf_ctx);
-                    return ::erff(arg);
-                }
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/alpaka/include/alpaka/math/erf/ErfHipBuiltIn.hpp b/thirdParty/alpaka/include/alpaka/math/erf/ErfHipBuiltIn.hpp
deleted file mode 100644
index e6fa0d529b..0000000000
--- a/thirdParty/alpaka/include/alpaka/math/erf/ErfHipBuiltIn.hpp
+++ /dev/null
@@ -1,86 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Bert Wesarg, Matthias Werner
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_HIP
-    #error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
-#endif
-
-#include <alpaka/math/erf/Traits.hpp>
-
-#include <alpaka/core/Unused.hpp>
-
-#if BOOST_COMP_NVCC >= BOOST_VERSION_NUMBER(9, 0, 0)
-    #include <cuda_runtime_api.h>
-#else
-    #if BOOST_COMP_HCC || BOOST_COMP_HIP
-        #include <math_functions.h>
-    #else
-        #include <math_functions.hpp>
-    #endif
-#endif
-
-#include <type_traits>
-
-namespace alpaka
-{
-    namespace math
-    {
-        //#############################################################################
-        //! The HIP erf.
-        class ErfHipBuiltIn : public concepts::Implements<ConceptMathErf, ErfHipBuiltIn>
-        {
-        };
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The HIP erf trait specialization.
-            template<
-                typename TArg>
-            struct Erf<
-                ErfHipBuiltIn,
-                TArg,
-                typename std::enable_if<
-                    std::is_floating_point<TArg>::value>::type>
-            {
-                __device__ static auto erf(
-                    ErfHipBuiltIn const & erf_ctx,
-                    TArg const & arg)
-                -> decltype(::erf(arg))
-                {
-                    alpaka::ignore_unused(erf_ctx);
-                    return ::erf(arg);
-                }
-            };
-            //! The HIP erf float specialization.
-            template<>
-            struct Erf<
-                ErfHipBuiltIn,
-                float>
-            {
-                __device__ static auto erf(
-                    ErfHipBuiltIn const & erf_ctx,
-                    float const & arg)
-                -> float
-                {
-                    alpaka::ignore_unused(erf_ctx);
-                    return ::erff(arg);
-                }
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/alpaka/include/alpaka/math/erf/ErfStdLib.hpp b/thirdParty/alpaka/include/alpaka/math/erf/ErfStdLib.hpp
deleted file mode 100644
index 6028cceb9a..0000000000
--- a/thirdParty/alpaka/include/alpaka/math/erf/ErfStdLib.hpp
+++ /dev/null
@@ -1,52 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#include <alpaka/math/erf/Traits.hpp>
-
-#include <alpaka/core/Unused.hpp>
-
-#include <type_traits>
-#include <cmath>
-
-namespace alpaka
-{
-    namespace math
-    {
-        //#############################################################################
-        //! The standard library erf.
-        class ErfStdLib : public concepts::Implements<ConceptMathErf, ErfStdLib>
-        {
-        };
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The standard library erf trait specialization.
-            template<
-                typename TArg>
-            struct Erf<
-                ErfStdLib,
-                TArg,
-                typename std::enable_if<
-                    std::is_arithmetic<TArg>::value>::type>
-            {
-                ALPAKA_FN_HOST static auto erf(
-                    ErfStdLib const & erf_ctx,
-                    TArg const & arg)
-                -> decltype(std::erf(arg))
-                {
-                    alpaka::ignore_unused(erf_ctx);
-                    return std::erf(arg);
-                }
-            };
-        }
-    }
-}
diff --git a/thirdParty/alpaka/include/alpaka/math/erf/Traits.hpp b/thirdParty/alpaka/include/alpaka/math/erf/Traits.hpp
deleted file mode 100644
index 188d907688..0000000000
--- a/thirdParty/alpaka/include/alpaka/math/erf/Traits.hpp
+++ /dev/null
@@ -1,70 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#include <alpaka/core/Common.hpp>
-#include <alpaka/core/Concepts.hpp>
-
-#include <boost/config.hpp>
-
-#include <type_traits>
-
-namespace alpaka
-{
-    namespace math
-    {
-        struct ConceptMathErf;
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The erf trait.
-            template<
-                typename T,
-                typename TArg,
-                typename TSfinae = void>
-            struct Erf;
-        }
-
-        //-----------------------------------------------------------------------------
-        //! Computes the error function of arg.
-        //!
-        //! \tparam T The type of the object specializing Erf.
-        //! \tparam TArg The arg type.
-        //! \param erf_ctx The object specializing Erf.
-        //! \param arg The arg.
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            typename T,
-            typename TArg>
-        ALPAKA_FN_HOST_ACC auto erf(
-            T const & erf_ctx,
-            TArg const & arg)
-#ifdef BOOST_NO_CXX14_RETURN_TYPE_DEDUCTION
-        -> decltype(
-            traits::Erf<
-                concepts::ImplementationBase<ConceptMathErf, T>,
-                TArg>
-            ::erf(
-                erf_ctx,
-                arg))
-#endif
-        {
-            using ImplementationBase = concepts::ImplementationBase<ConceptMathErf, T>;
-            return
-                traits::Erf<
-                    ImplementationBase,
-                    TArg>
-                ::erf(
-                    erf_ctx,
-                    arg);
-        }
-    }
-}
diff --git a/thirdParty/alpaka/include/alpaka/math/exp/ExpCudaBuiltIn.hpp b/thirdParty/alpaka/include/alpaka/math/exp/ExpCudaBuiltIn.hpp
deleted file mode 100644
index c52d51696b..0000000000
--- a/thirdParty/alpaka/include/alpaka/math/exp/ExpCudaBuiltIn.hpp
+++ /dev/null
@@ -1,78 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Bert Wesarg
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_CUDA
-    #error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
-#endif
-
-#include <alpaka/math/exp/Traits.hpp>
-
-#include <alpaka/core/Unused.hpp>
-
-#include <cuda_runtime.h>
-
-#include <type_traits>
-
-namespace alpaka
-{
-    namespace math
-    {
-        //#############################################################################
-        //! The CUDA built in exp.
-        class ExpCudaBuiltIn : public concepts::Implements<ConceptMathExp, ExpCudaBuiltIn>
-        {
-        };
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The CUDA exp trait specialization.
-            template<
-                typename TArg>
-            struct Exp<
-                ExpCudaBuiltIn,
-                TArg,
-                typename std::enable_if<
-                    std::is_floating_point<TArg>::value>::type>
-            {
-                __device__ static auto exp(
-                    ExpCudaBuiltIn const & exp_ctx,
-                    TArg const & arg)
-                -> decltype(::exp(arg))
-                {
-                    alpaka::ignore_unused(exp_ctx);
-                    return ::exp(arg);
-                }
-            };
-            //! The CUDA exp float specialization.
-            template<>
-            struct Exp<
-                ExpCudaBuiltIn,
-                float>
-            {
-                __device__ static auto exp(
-                    ExpCudaBuiltIn const & exp_ctx,
-                    float const & arg)
-                -> float
-                {
-                    alpaka::ignore_unused(exp_ctx);
-                    return ::expf(arg);
-                }
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/alpaka/include/alpaka/math/exp/ExpHipBuiltIn.hpp b/thirdParty/alpaka/include/alpaka/math/exp/ExpHipBuiltIn.hpp
deleted file mode 100644
index 325582b330..0000000000
--- a/thirdParty/alpaka/include/alpaka/math/exp/ExpHipBuiltIn.hpp
+++ /dev/null
@@ -1,86 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Bert Wesarg, Matthias Werner
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_HIP
-    #error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
-#endif
-
-#include <alpaka/math/exp/Traits.hpp>
-
-#include <alpaka/core/Unused.hpp>
-
-#if BOOST_COMP_NVCC >= BOOST_VERSION_NUMBER(9, 0, 0)
-    #include <cuda_runtime_api.h>
-#else
-    #if BOOST_COMP_HCC || BOOST_COMP_HIP
-        #include <math_functions.h>
-    #else
-        #include <math_functions.hpp>
-    #endif
-#endif
-
-#include <type_traits>
-
-namespace alpaka
-{
-    namespace math
-    {
-        //#############################################################################
-        //! The HIP exp.
-        class ExpHipBuiltIn : public concepts::Implements<ConceptMathExp, ExpHipBuiltIn>
-        {
-        };
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The HIP exp trait specialization.
-            template<
-                typename TArg>
-            struct Exp<
-                ExpHipBuiltIn,
-                TArg,
-                typename std::enable_if<
-                    std::is_floating_point<TArg>::value>::type>
-            {
-                __device__ static auto exp(
-                    ExpHipBuiltIn const & exp_ctx,
-                    TArg const & arg)
-                -> decltype(::exp(arg))
-                {
-                    alpaka::ignore_unused(exp_ctx);
-                    return ::exp(arg);
-                }
-            };
-            //! The HIP exp float specialization.
-            template<>
-            struct Exp<
-                ExpHipBuiltIn,
-                float>
-            {
-                __device__ static auto exp(
-                    ExpHipBuiltIn const & exp_ctx,
-                    float const & arg)
-                -> float
-                {
-                    alpaka::ignore_unused(exp_ctx);
-                    return ::expf(arg);
-                }
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/alpaka/include/alpaka/math/exp/ExpStdLib.hpp b/thirdParty/alpaka/include/alpaka/math/exp/ExpStdLib.hpp
deleted file mode 100644
index 70cbc53b79..0000000000
--- a/thirdParty/alpaka/include/alpaka/math/exp/ExpStdLib.hpp
+++ /dev/null
@@ -1,52 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#include <alpaka/math/exp/Traits.hpp>
-
-#include <alpaka/core/Unused.hpp>
-
-#include <type_traits>
-#include <cmath>
-
-namespace alpaka
-{
-    namespace math
-    {
-        //#############################################################################
-        //! The standard library exp.
-        class ExpStdLib : public concepts::Implements<ConceptMathExp, ExpStdLib>
-        {
-        };
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The standard library exp trait specialization.
-            template<
-                typename TArg>
-            struct Exp<
-                ExpStdLib,
-                TArg,
-                typename std::enable_if<
-                    std::is_arithmetic<TArg>::value>::type>
-            {
-                ALPAKA_FN_HOST static auto exp(
-                    ExpStdLib const & exp_ctx,
-                    TArg const & arg)
-                -> decltype(std::exp(arg))
-                {
-                    alpaka::ignore_unused(exp_ctx);
-                    return std::exp(arg);
-                }
-            };
-        }
-    }
-}
diff --git a/thirdParty/alpaka/include/alpaka/math/exp/Traits.hpp b/thirdParty/alpaka/include/alpaka/math/exp/Traits.hpp
deleted file mode 100644
index 0d077c7928..0000000000
--- a/thirdParty/alpaka/include/alpaka/math/exp/Traits.hpp
+++ /dev/null
@@ -1,70 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#include <alpaka/core/Common.hpp>
-#include <alpaka/core/Concepts.hpp>
-
-#include <boost/config.hpp>
-
-#include <type_traits>
-
-namespace alpaka
-{
-    namespace math
-    {
-        struct ConceptMathExp;
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The exp trait.
-            template<
-                typename T,
-                typename TArg,
-                typename TSfinae = void>
-            struct Exp;
-        }
-
-        //-----------------------------------------------------------------------------
-        //! Computes the e (Euler's number, 2.7182818) raised to the given power arg.
-        //!
-        //! \tparam T The type of the object specializing Exp.
-        //! \tparam TArg The arg type.
-        //! \param exp_ctx The object specializing Exp.
-        //! \param arg The arg.
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            typename T,
-            typename TArg>
-        ALPAKA_FN_HOST_ACC auto exp(
-            T const & exp_ctx,
-            TArg const & arg)
-#ifdef BOOST_NO_CXX14_RETURN_TYPE_DEDUCTION
-        -> decltype(
-            traits::Exp<
-                concepts::ImplementationBase<ConceptMathExp, T>,
-                TArg>
-            ::exp(
-                exp_ctx,
-                arg))
-#endif
-        {
-            using ImplementationBase = concepts::ImplementationBase<ConceptMathExp, T>;
-            return
-                traits::Exp<
-                    ImplementationBase,
-                    TArg>
-                ::exp(
-                    exp_ctx,
-                    arg);
-        }
-    }
-}
diff --git a/thirdParty/alpaka/include/alpaka/math/floor/FloorCudaBuiltIn.hpp b/thirdParty/alpaka/include/alpaka/math/floor/FloorCudaBuiltIn.hpp
deleted file mode 100644
index 1ae4713d8c..0000000000
--- a/thirdParty/alpaka/include/alpaka/math/floor/FloorCudaBuiltIn.hpp
+++ /dev/null
@@ -1,78 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Bert Wesarg
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_CUDA
-    #error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
-#endif
-
-#include <alpaka/math/floor/Traits.hpp>
-
-#include <alpaka/core/Unused.hpp>
-
-#include <cuda_runtime.h>
-
-#include <type_traits>
-
-namespace alpaka
-{
-    namespace math
-    {
-        //#############################################################################
-        //! The CUDA built in floor.
-        class FloorCudaBuiltIn : public concepts::Implements<ConceptMathFloor, FloorCudaBuiltIn>
-        {
-        };
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The CUDA floor trait specialization.
-            template<
-                typename TArg>
-            struct Floor<
-                FloorCudaBuiltIn,
-                TArg,
-                typename std::enable_if<
-                    std::is_floating_point<TArg>::value>::type>
-            {
-                __device__ static auto floor(
-                    FloorCudaBuiltIn const & floor_ctx,
-                    TArg const & arg)
-                -> decltype(::floor(arg))
-                {
-                    alpaka::ignore_unused(floor_ctx);
-                    return ::floor(arg);
-                }
-            };
-            //! The CUDA floor float specialization.
-            template<>
-            struct Floor<
-                FloorCudaBuiltIn,
-                float>
-            {
-                __device__ static auto floor(
-                    FloorCudaBuiltIn const & floor_ctx,
-                    float const & arg)
-                -> float
-                {
-                    alpaka::ignore_unused(floor_ctx);
-                    return ::floorf(arg);
-                }
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/alpaka/include/alpaka/math/floor/FloorHipBuiltIn.hpp b/thirdParty/alpaka/include/alpaka/math/floor/FloorHipBuiltIn.hpp
deleted file mode 100644
index b045746988..0000000000
--- a/thirdParty/alpaka/include/alpaka/math/floor/FloorHipBuiltIn.hpp
+++ /dev/null
@@ -1,86 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Bert Wesarg, Matthias Werner
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_HIP
-    #error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
-#endif
-
-#include <alpaka/math/floor/Traits.hpp>
-
-#include <alpaka/core/Unused.hpp>
-
-#if BOOST_COMP_NVCC >= BOOST_VERSION_NUMBER(9, 0, 0)
-    #include <cuda_runtime_api.h>
-#else
-    #if BOOST_COMP_HCC || BOOST_COMP_HIP
-        #include <math_functions.h>
-    #else
-        #include <math_functions.hpp>
-    #endif
-#endif
-
-#include <type_traits>
-
-namespace alpaka
-{
-    namespace math
-    {
-        //#############################################################################
-        //! The HIP floor.
-        class FloorHipBuiltIn : public concepts::Implements<ConceptMathFloor, FloorHipBuiltIn>
-        {
-        };
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The HIP floor trait specialization.
-            template<
-                typename TArg>
-            struct Floor<
-                FloorHipBuiltIn,
-                TArg,
-                typename std::enable_if<
-                    std::is_floating_point<TArg>::value>::type>
-            {
-                __device__ static auto floor(
-                    FloorHipBuiltIn const & floor_ctx,
-                    TArg const & arg)
-                -> decltype(::floor(arg))
-                {
-                    alpaka::ignore_unused(floor_ctx);
-                    return ::floor(arg);
-                }
-            };
-            //! The HIP floor float specialization.
-            template<>
-            struct Floor<
-                FloorHipBuiltIn,
-                float>
-            {
-                __device__ static auto floor(
-                    FloorHipBuiltIn const & floor_ctx,
-                    float const & arg)
-                -> float
-                {
-                    alpaka::ignore_unused(floor_ctx);
-                    return ::floorf(arg);
-                }
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/alpaka/include/alpaka/math/floor/FloorStdLib.hpp b/thirdParty/alpaka/include/alpaka/math/floor/FloorStdLib.hpp
deleted file mode 100644
index 3f6f670aa6..0000000000
--- a/thirdParty/alpaka/include/alpaka/math/floor/FloorStdLib.hpp
+++ /dev/null
@@ -1,52 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#include <alpaka/math/floor/Traits.hpp>
-
-#include <alpaka/core/Unused.hpp>
-
-#include <type_traits>
-#include <cmath>
-
-namespace alpaka
-{
-    namespace math
-    {
-        //#############################################################################
-        //! The standard library floor.
-        class FloorStdLib : public concepts::Implements<ConceptMathFloor, FloorStdLib>
-        {
-        };
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The standard library floor trait specialization.
-            template<
-                typename TArg>
-            struct Floor<
-                FloorStdLib,
-                TArg,
-                typename std::enable_if<
-                    std::is_arithmetic<TArg>::value>::type>
-            {
-                ALPAKA_FN_HOST static auto floor(
-                    FloorStdLib const & floor_ctx,
-                    TArg const & arg)
-                -> decltype(std::floor(arg))
-                {
-                    alpaka::ignore_unused(floor_ctx);
-                    return std::floor(arg);
-                }
-            };
-        }
-    }
-}
diff --git a/thirdParty/alpaka/include/alpaka/math/floor/Traits.hpp b/thirdParty/alpaka/include/alpaka/math/floor/Traits.hpp
deleted file mode 100644
index a4f2e87c05..0000000000
--- a/thirdParty/alpaka/include/alpaka/math/floor/Traits.hpp
+++ /dev/null
@@ -1,70 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#include <alpaka/core/Common.hpp>
-#include <alpaka/core/Concepts.hpp>
-
-#include <boost/config.hpp>
-
-#include <type_traits>
-
-namespace alpaka
-{
-    namespace math
-    {
-        struct ConceptMathFloor;
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The floor trait.
-            template<
-                typename T,
-                typename TArg,
-                typename TSfinae = void>
-            struct Floor;
-        }
-
-        //-----------------------------------------------------------------------------
-        //! Computes the largest integer value not greater than arg.
-        //!
-        //! \tparam T The type of the object specializing Floor.
-        //! \tparam TArg The arg type.
-        //! \param floor_ctx The object specializing Floor.
-        //! \param arg The arg.
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            typename T,
-            typename TArg>
-        ALPAKA_FN_HOST_ACC auto floor(
-            T const & floor_ctx,
-            TArg const & arg)
-#ifdef BOOST_NO_CXX14_RETURN_TYPE_DEDUCTION
-        -> decltype(
-            traits::Floor<
-                concepts::ImplementationBase<ConceptMathFloor, T>,
-                TArg>
-            ::floor(
-                floor_ctx,
-                arg))
-#endif
-        {
-            using ImplementationBase = concepts::ImplementationBase<ConceptMathFloor, T>;
-            return
-                traits::Floor<
-                    ImplementationBase,
-                    TArg>
-                ::floor(
-                    floor_ctx,
-                    arg);
-        }
-    }
-}
diff --git a/thirdParty/alpaka/include/alpaka/math/fmod/FmodCudaBuiltIn.hpp b/thirdParty/alpaka/include/alpaka/math/fmod/FmodCudaBuiltIn.hpp
deleted file mode 100644
index f927e7322b..0000000000
--- a/thirdParty/alpaka/include/alpaka/math/fmod/FmodCudaBuiltIn.hpp
+++ /dev/null
@@ -1,88 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Bert Wesarg
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_CUDA
-    #error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
-#endif
-
-#include <alpaka/math/fmod/Traits.hpp>
-
-#include <alpaka/core/Unused.hpp>
-
-#include <cuda_runtime.h>
-
-#include <type_traits>
-
-namespace alpaka
-{
-    namespace math
-    {
-        //#############################################################################
-        //! The CUDA built in fmod.
-        class FmodCudaBuiltIn : public concepts::Implements<ConceptMathFmod, FmodCudaBuiltIn>
-        {
-        };
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The CUDA fmod trait specialization.
-            template<
-                typename Tx,
-                typename Ty>
-            struct Fmod<
-                FmodCudaBuiltIn,
-                Tx,
-                Ty,
-                typename std::enable_if<
-                    std::is_floating_point<Tx>::value
-                    && std::is_floating_point<Ty>::value>::type>
-            {
-                __device__ static auto fmod(
-                    FmodCudaBuiltIn const & fmod_ctx,
-                    Tx const & x,
-                    Ty const & y)
-                -> decltype(::fmod(x, y))
-                {
-                    alpaka::ignore_unused(fmod_ctx);
-                    return ::fmod(
-                        x,
-                        y);
-                }
-            };
-            //! The CUDA fmod float specialization.
-            template<>
-            struct Fmod<
-                FmodCudaBuiltIn,
-                float,
-                float>
-            {
-                __device__ static auto fmod(
-                    FmodCudaBuiltIn const & fmod_ctx,
-                    float const & x,
-                    float const & y)
-                -> float
-                {
-                    alpaka::ignore_unused(fmod_ctx);
-                    return ::fmodf(
-                        x,
-                        y);
-                }
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/alpaka/include/alpaka/math/fmod/FmodHipBuiltIn.hpp b/thirdParty/alpaka/include/alpaka/math/fmod/FmodHipBuiltIn.hpp
deleted file mode 100644
index 83e230cb9c..0000000000
--- a/thirdParty/alpaka/include/alpaka/math/fmod/FmodHipBuiltIn.hpp
+++ /dev/null
@@ -1,92 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Bert Wesarg, Matthias Werner
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_HIP
-    #error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
-#endif
-
-#include <alpaka/math/fmod/Traits.hpp>
-
-#include <alpaka/core/Unused.hpp>
-
-#if BOOST_COMP_NVCC >= BOOST_VERSION_NUMBER(9, 0, 0)
-    #include <cuda_runtime_api.h>
-#else
-    #if BOOST_COMP_HCC || BOOST_COMP_HIP
-        #include <math_functions.h>
-    #else
-        #include <math_functions.hpp>
-    #endif
-#endif
-
-#include <type_traits>
-
-namespace alpaka
-{
-    namespace math
-    {
-        //#############################################################################
-        //! The HIP fmod.
-        class FmodHipBuiltIn : public concepts::Implements<ConceptMathFmod, FmodHipBuiltIn>
-        {
-        };
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The HIP fmod trait specialization.
-            template<
-                typename Tx,
-                typename Ty>
-            struct Fmod<
-                FmodHipBuiltIn,
-                Tx,
-                Ty,
-                typename std::enable_if<
-                    std::is_floating_point<Tx>::value
-                    && std::is_floating_point<Ty>::value>::type>
-            {
-                __device__ static auto fmod(
-                    FmodHipBuiltIn const & fmod_ctx,
-                    Tx const & x,
-                    Ty const & y)
-                -> decltype(::fmod(x, y))
-                {
-                    alpaka::ignore_unused(fmod_ctx);
-                    return ::fmod(x, y);
-                }
-            };
-            //! The HIP fmod float specialization.
-            template<>
-            struct Fmod<
-                FmodHipBuiltIn,
-                float,
-                float>
-            {
-                __device__ static auto fmod(
-                    FmodHipBuiltIn const & fmod_ctx,
-                    float const & x,
-                    float const & y)
-                -> float
-                {
-                    alpaka::ignore_unused(fmod_ctx);
-                    return ::fmodf(x, y);
-                }
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/alpaka/include/alpaka/math/fmod/FmodStdLib.hpp b/thirdParty/alpaka/include/alpaka/math/fmod/FmodStdLib.hpp
deleted file mode 100644
index c25ebf4adc..0000000000
--- a/thirdParty/alpaka/include/alpaka/math/fmod/FmodStdLib.hpp
+++ /dev/null
@@ -1,56 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#include <alpaka/math/fmod/Traits.hpp>
-
-#include <alpaka/core/Unused.hpp>
-
-#include <type_traits>
-#include <cmath>
-
-namespace alpaka
-{
-    namespace math
-    {
-        //#############################################################################
-        //! The standard library fmod.
-        class FmodStdLib : public concepts::Implements<ConceptMathFmod, FmodStdLib>
-        {
-        };
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The standard library fmod trait specialization.
-            template<
-                typename Tx,
-                typename Ty>
-            struct Fmod<
-                FmodStdLib,
-                Tx,
-                Ty,
-                typename std::enable_if<
-                    std::is_arithmetic<Tx>::value
-                    && std::is_arithmetic<Ty>::value>::type>
-            {
-                ALPAKA_FN_HOST static auto fmod(
-                    FmodStdLib const & fmod_ctx,
-                    Tx const & x,
-                    Ty const & y)
-                -> decltype(std::fmod(x, y))
-                {
-                    alpaka::ignore_unused(fmod_ctx);
-                    return std::fmod(x, y);
-                }
-            };
-        }
-    }
-}
diff --git a/thirdParty/alpaka/include/alpaka/math/fmod/Traits.hpp b/thirdParty/alpaka/include/alpaka/math/fmod/Traits.hpp
deleted file mode 100644
index faa821505f..0000000000
--- a/thirdParty/alpaka/include/alpaka/math/fmod/Traits.hpp
+++ /dev/null
@@ -1,79 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#include <alpaka/core/Common.hpp>
-#include <alpaka/core/Concepts.hpp>
-
-#include <boost/config.hpp>
-
-#include <type_traits>
-
-namespace alpaka
-{
-    namespace math
-    {
-        struct ConceptMathFmod;
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The fmod trait.
-            template<
-                typename T,
-                typename Tx,
-                typename Ty,
-                typename TSfinae = void>
-            struct Fmod;
-        }
-
-        //-----------------------------------------------------------------------------
-        //! Computes the floating-point remainder of the division operation x/y.
-        //!
-        //! \tparam T The type of the object specializing Fmod.
-        //! \tparam Tx The type of the first argument.
-        //! \tparam Ty The type of the second argument.
-        //! \param fmod_ctx The object specializing Fmod.
-        //! \param x The first argument.
-        //! \param y The second argument.
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            typename T,
-            typename Tx,
-            typename Ty>
-        ALPAKA_FN_HOST_ACC auto fmod(
-            T const & fmod_ctx,
-            Tx const & x,
-            Ty const & y)
-#ifdef BOOST_NO_CXX14_RETURN_TYPE_DEDUCTION
-        -> decltype(
-            traits::Fmod<
-                concepts::ImplementationBase<ConceptMathFmod, T>,
-                Tx,
-                Ty>
-            ::fmod(
-                fmod_ctx,
-                x,
-                y))
-#endif
-        {
-            using ImplementationBase = concepts::ImplementationBase<ConceptMathFmod, T>;
-            return
-                traits::Fmod<
-                    ImplementationBase,
-                    Tx,
-                    Ty>
-                ::fmod(
-                    fmod_ctx,
-                    x,
-                    y);
-        }
-    }
-}
diff --git a/thirdParty/alpaka/include/alpaka/math/log/LogCudaBuiltIn.hpp b/thirdParty/alpaka/include/alpaka/math/log/LogCudaBuiltIn.hpp
deleted file mode 100644
index 3f28b81ac6..0000000000
--- a/thirdParty/alpaka/include/alpaka/math/log/LogCudaBuiltIn.hpp
+++ /dev/null
@@ -1,79 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Bert Wesarg
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_CUDA
-    #error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
-#endif
-
-#include <alpaka/math/log/Traits.hpp>
-
-#include <alpaka/core/Unused.hpp>
-
-#include <cuda_runtime.h>
-
-#include <type_traits>
-
-namespace alpaka
-{
-    namespace math
-    {
-        //#############################################################################
-        // ! The CUDA built in log.
-        class LogCudaBuiltIn : public concepts::Implements<ConceptMathLog, LogCudaBuiltIn>
-        {
-        };
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The CUDA log trait specialization.
-            template<
-                typename TArg>
-            struct Log<
-                LogCudaBuiltIn,
-                TArg,
-                typename std::enable_if<
-                    std::is_floating_point<TArg>::value>::type>
-            {
-                __device__ static auto log(
-                    LogCudaBuiltIn const & log_ctx,
-                    TArg const & arg)
-                -> decltype(::log(arg))
-                {
-                    alpaka::ignore_unused(log_ctx);
-                    return ::log(arg);
-                }
-            };
-            //! The CUDA log float specialization.
-            template<>
-            struct Log<
-                LogCudaBuiltIn,
-                float>
-            {
-                __device__ static auto log(
-                    LogCudaBuiltIn const & log_ctx,
-                    float const & arg)
-                -> float
-                {
-                    alpaka::ignore_unused(log_ctx);
-                    return ::logf(arg);
-                }
-            };
-
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/alpaka/include/alpaka/math/log/LogHipBuiltIn.hpp b/thirdParty/alpaka/include/alpaka/math/log/LogHipBuiltIn.hpp
deleted file mode 100644
index 80ee8193fb..0000000000
--- a/thirdParty/alpaka/include/alpaka/math/log/LogHipBuiltIn.hpp
+++ /dev/null
@@ -1,86 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Bert Wesarg, Matthias Werner
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_HIP
-    #error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
-#endif
-
-#include <alpaka/math/log/Traits.hpp>
-
-#include <alpaka/core/Unused.hpp>
-
-#if BOOST_COMP_NVCC >= BOOST_VERSION_NUMBER(9, 0, 0)
-    #include <cuda_runtime_api.h>
-#else
-    #if BOOST_COMP_HCC || BOOST_COMP_HIP
-        #include <math_functions.h>
-    #else
-        #include <math_functions.hpp>
-    #endif
-#endif
-
-#include <type_traits>
-
-namespace alpaka
-{
-    namespace math
-    {
-        //#############################################################################
-        //! The HIP log.
-        class LogHipBuiltIn : public concepts::Implements<ConceptMathLog, LogHipBuiltIn>
-        {
-        };
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The HIP log trait specialization.
-            template<
-                typename TArg>
-            struct Log<
-                LogHipBuiltIn,
-                TArg,
-                typename std::enable_if<
-                    std::is_floating_point<TArg>::value>::type>
-            {
-                __device__ static auto log(
-                    LogHipBuiltIn const & log_ctx,
-                    TArg const & arg)
-                -> decltype(::log(arg))
-                {
-                    alpaka::ignore_unused(log_ctx);
-                    return ::log(arg);
-                }
-            };
-            //! The HIP log float specialization.
-            template<>
-            struct Log<
-                LogHipBuiltIn,
-                float>
-            {
-                __device__ static auto log(
-                    LogHipBuiltIn const & log_ctx,
-                    float const & arg)
-                -> float
-                {
-                    alpaka::ignore_unused(log_ctx);
-                    return ::logf(arg);
-                }
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/alpaka/include/alpaka/math/log/LogStdLib.hpp b/thirdParty/alpaka/include/alpaka/math/log/LogStdLib.hpp
deleted file mode 100644
index 116007feed..0000000000
--- a/thirdParty/alpaka/include/alpaka/math/log/LogStdLib.hpp
+++ /dev/null
@@ -1,52 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#include <alpaka/math/log/Traits.hpp>
-
-#include <alpaka/core/Unused.hpp>
-
-#include <type_traits>
-#include <cmath>
-
-namespace alpaka
-{
-    namespace math
-    {
-        //#############################################################################
-        //! The standard library log.
-        class LogStdLib : public concepts::Implements<ConceptMathLog, LogStdLib>
-        {
-        };
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The standard library log trait specialization.
-            template<
-                typename TArg>
-            struct Log<
-                LogStdLib,
-                TArg,
-                typename std::enable_if<
-                    std::is_arithmetic<TArg>::value>::type>
-            {
-                ALPAKA_FN_HOST static auto log(
-                    LogStdLib const & log_ctx,
-                    TArg const & arg)
-                -> decltype(std::log(arg))
-                {
-                    alpaka::ignore_unused(log_ctx);
-                    return std::log(arg);
-                }
-            };
-        }
-    }
-}
diff --git a/thirdParty/alpaka/include/alpaka/math/log/Traits.hpp b/thirdParty/alpaka/include/alpaka/math/log/Traits.hpp
deleted file mode 100644
index 0ba09e8249..0000000000
--- a/thirdParty/alpaka/include/alpaka/math/log/Traits.hpp
+++ /dev/null
@@ -1,70 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#include <alpaka/core/Common.hpp>
-#include <alpaka/core/Concepts.hpp>
-
-#include <boost/config.hpp>
-
-#include <type_traits>
-
-namespace alpaka
-{
-    namespace math
-    {
-        struct ConceptMathLog;
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The log trait.
-            template<
-                typename T,
-                typename TArg,
-                typename TSfinae = void>
-            struct Log;
-        }
-
-        //-----------------------------------------------------------------------------
-        //! Computes the the natural (base e) logarithm of arg.
-        //!
-        //! \tparam T The type of the object specializing Log.
-        //! \tparam TArg The arg type.
-        //! \param log_ctx The object specializing Log.
-        //! \param arg The arg.
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            typename T,
-            typename TArg>
-        ALPAKA_FN_HOST_ACC auto log(
-            T const & log_ctx,
-            TArg const & arg)
-#ifdef BOOST_NO_CXX14_RETURN_TYPE_DEDUCTION
-        -> decltype(
-            traits::Log<
-                concepts::ImplementationBase<ConceptMathLog, T>,
-                TArg>
-            ::log(
-                log_ctx,
-                arg))
-#endif
-        {
-            using ImplementationBase = concepts::ImplementationBase<ConceptMathLog, T>;
-            return
-                traits::Log<
-                    ImplementationBase,
-                    TArg>
-                ::log(
-                    log_ctx,
-                    arg);
-        }
-    }
-}
diff --git a/thirdParty/alpaka/include/alpaka/math/max/MaxCudaBuiltIn.hpp b/thirdParty/alpaka/include/alpaka/math/max/MaxCudaBuiltIn.hpp
deleted file mode 100644
index 1c9a9806b3..0000000000
--- a/thirdParty/alpaka/include/alpaka/math/max/MaxCudaBuiltIn.hpp
+++ /dev/null
@@ -1,91 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Bert Wesarg
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_CUDA
-    #error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
-#endif
-
-#include <alpaka/math/max/Traits.hpp>
-
-#include <alpaka/core/Unused.hpp>
-
-#include <cuda_runtime.h>
-
-#include <type_traits>
-
-namespace alpaka
-{
-    namespace math
-    {
-        //#############################################################################
-        //! The CUDA built in max.
-        class MaxCudaBuiltIn : public concepts::Implements<ConceptMathMax, MaxCudaBuiltIn>
-        {
-        };
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The standard library integral max trait specialization.
-            template<
-                typename Tx,
-                typename Ty>
-            struct Max<
-                MaxCudaBuiltIn,
-                Tx,
-                Ty,
-                typename std::enable_if<
-                    std::is_integral<Tx>::value
-                    && std::is_integral<Ty>::value>::type>
-            {
-                __device__ static auto max(
-                    MaxCudaBuiltIn const & max_ctx,
-                    Tx const & x,
-                    Ty const & y)
-                -> decltype(::max(x, y))
-                {
-                    alpaka::ignore_unused(max_ctx);
-                    return ::max(x, y);
-                }
-            };
-            //#############################################################################
-            //! The CUDA mixed integral floating point max trait specialization.
-            template<
-                typename Tx,
-                typename Ty>
-            struct Max<
-                MaxCudaBuiltIn,
-                Tx,
-                Ty,
-                typename std::enable_if<
-                    std::is_arithmetic<Tx>::value
-                    && std::is_arithmetic<Ty>::value
-                    && !(std::is_integral<Tx>::value
-                        && std::is_integral<Ty>::value)>::type>
-            {
-                __device__ static auto max(
-                    MaxCudaBuiltIn const & max_ctx,
-                    Tx const & x,
-                    Ty const & y)
-                -> decltype(::fmax(x, y))
-                {
-                    alpaka::ignore_unused(max_ctx);
-                    return ::fmax(x, y);
-                }
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/alpaka/include/alpaka/math/max/MaxHipBuiltIn.hpp b/thirdParty/alpaka/include/alpaka/math/max/MaxHipBuiltIn.hpp
deleted file mode 100644
index f16cc8e0e2..0000000000
--- a/thirdParty/alpaka/include/alpaka/math/max/MaxHipBuiltIn.hpp
+++ /dev/null
@@ -1,100 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Bert Wesarg, Matthias Werner
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_HIP
-    #error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
-#endif
-
-#include <alpaka/math/max/Traits.hpp>
-
-#include <alpaka/core/Unused.hpp>
-
-#if BOOST_COMP_NVCC >= BOOST_VERSION_NUMBER(9, 0, 0)
-    #include <cuda_runtime_api.h>
-#else
-    #if BOOST_COMP_HCC || BOOST_COMP_HIP
-        #include <math_functions.h>
-    #else
-        #include <math_functions.hpp>
-    #endif
-#endif
-
-#include <type_traits>
-
-namespace alpaka
-{
-    namespace math
-    {
-        //#############################################################################
-        //! The HIP max.
-        class MaxHipBuiltIn : public concepts::Implements<ConceptMathMax, MaxHipBuiltIn>
-        {
-        };
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The HIP integral max trait specialization.
-            template<
-                typename Tx,
-                typename Ty>
-            struct Max<
-                MaxHipBuiltIn,
-                Tx,
-                Ty,
-                typename std::enable_if<
-                    std::is_integral<Tx>::value
-                    && std::is_integral<Ty>::value>::type>
-            {
-                __device__ static auto max(
-                    MaxHipBuiltIn const & max_ctx,
-                    Tx const & x,
-                    Ty const & y)
-                -> decltype(::max(x, y))
-                {
-                    alpaka::ignore_unused(max_ctx);
-                    return ::max(x, y);
-                }
-            };
-            //#############################################################################
-            //! The HIP mixed integral floating point max trait specialization.
-            template<
-                typename Tx,
-                typename Ty>
-            struct Max<
-                MaxHipBuiltIn,
-                Tx,
-                Ty,
-                typename std::enable_if<
-                    std::is_arithmetic<Tx>::value
-                    && std::is_arithmetic<Ty>::value
-                    && !(std::is_integral<Tx>::value
-                        && std::is_integral<Ty>::value)>::type>
-            {
-                __device__ static auto max(
-                    MaxHipBuiltIn const & max_ctx,
-                    Tx const & x,
-                    Ty const & y)
-                -> decltype(::fmax(x, y))
-                {
-                    alpaka::ignore_unused(max_ctx);
-                    return ::fmax(x, y);
-                }
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/alpaka/include/alpaka/math/max/MaxStdLib.hpp b/thirdParty/alpaka/include/alpaka/math/max/MaxStdLib.hpp
deleted file mode 100644
index 4a433c2483..0000000000
--- a/thirdParty/alpaka/include/alpaka/math/max/MaxStdLib.hpp
+++ /dev/null
@@ -1,82 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#include <alpaka/math/max/Traits.hpp>
-
-#include <alpaka/core/Unused.hpp>
-
-#include <type_traits>
-#include <cmath>
-#include <algorithm>
-
-namespace alpaka
-{
-    namespace math
-    {
-        //#############################################################################
-        //! The standard library max.
-        class MaxStdLib : public concepts::Implements<ConceptMathMax, MaxStdLib>
-        {
-        };
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The standard library integral max trait specialization.
-            template<
-                typename Tx,
-                typename Ty>
-            struct Max<
-                MaxStdLib,
-                Tx,
-                Ty,
-                typename std::enable_if<
-                    std::is_integral<Tx>::value
-                    && std::is_integral<Ty>::value>::type>
-            {
-                ALPAKA_FN_HOST static auto max(
-                    MaxStdLib const & max_ctx,
-                    Tx const & x,
-                    Ty const & y)
-                -> decltype(std::max(x, y))
-                {
-                    alpaka::ignore_unused(max_ctx);
-                    return std::max(x, y);
-                }
-            };
-            //#############################################################################
-            //! The standard library mixed integral floating point max trait specialization.
-            template<
-                typename Tx,
-                typename Ty>
-            struct Max<
-                MaxStdLib,
-                Tx,
-                Ty,
-                typename std::enable_if<
-                    std::is_arithmetic<Tx>::value
-                    && std::is_arithmetic<Ty>::value
-                    && !(std::is_integral<Tx>::value
-                        && std::is_integral<Ty>::value)>::type>
-            {
-                ALPAKA_FN_HOST static auto max(
-                    MaxStdLib const & max_ctx,
-                    Tx const & x,
-                    Ty const & y)
-                -> decltype(std::fmax(x, y))
-                {
-                    alpaka::ignore_unused(max_ctx);
-                    return std::fmax(x, y);
-                }
-            };
-        }
-    }
-}
diff --git a/thirdParty/alpaka/include/alpaka/math/max/Traits.hpp b/thirdParty/alpaka/include/alpaka/math/max/Traits.hpp
deleted file mode 100644
index 083c5e8e13..0000000000
--- a/thirdParty/alpaka/include/alpaka/math/max/Traits.hpp
+++ /dev/null
@@ -1,80 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#include <alpaka/core/Common.hpp>
-#include <alpaka/core/Concepts.hpp>
-
-#include <boost/config.hpp>
-
-#include <type_traits>
-
-namespace alpaka
-{
-    namespace math
-    {
-        struct ConceptMathMax;
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The max trait.
-            template<
-                typename T,
-                typename Tx,
-                typename Ty,
-                typename TSfinae = void>
-            struct Max;
-        }
-
-        //-----------------------------------------------------------------------------
-        //! Returns the larger of two arguments.
-        //! NaNs are treated as missing data (between a NaN and a numeric value, the numeric value is chosen).
-        //!
-        //! \tparam T The type of the object specializing Max.
-        //! \tparam Tx The type of the first argument.
-        //! \tparam Ty The type of the second argument.
-        //! \param max_ctx The object specializing Max.
-        //! \param x The first argument.
-        //! \param y The second argument.
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            typename T,
-            typename Tx,
-            typename Ty>
-        ALPAKA_FN_HOST_ACC auto max(
-            T const & max_ctx,
-            Tx const & x,
-            Ty const & y)
-#ifdef BOOST_NO_CXX14_RETURN_TYPE_DEDUCTION
-        -> decltype(
-            traits::Max<
-                concepts::ImplementationBase<ConceptMathMax, T>,
-                Tx,
-                Ty>
-            ::max(
-                max_ctx,
-                x,
-                y))
-#endif
-        {
-            using ImplementationBase = concepts::ImplementationBase<ConceptMathMax, T>;
-            return
-                traits::Max<
-                    ImplementationBase,
-                    Tx,
-                    Ty>
-                ::max(
-                    max_ctx,
-                    x,
-                    y);
-        }
-    }
-}
diff --git a/thirdParty/alpaka/include/alpaka/math/min/MinCudaBuiltIn.hpp b/thirdParty/alpaka/include/alpaka/math/min/MinCudaBuiltIn.hpp
deleted file mode 100644
index e922f173e5..0000000000
--- a/thirdParty/alpaka/include/alpaka/math/min/MinCudaBuiltIn.hpp
+++ /dev/null
@@ -1,92 +0,0 @@
-/* Copyright 2019 Alexander Matthes, Axel Huebl, Benjamin Worpitz, Bert Wesarg
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_CUDA
-    #error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
-#endif
-
-#include <alpaka/math/min/Traits.hpp>
-
-#include <alpaka/core/Unused.hpp>
-
-#include <cuda_runtime.h>
-
-#include <type_traits>
-
-namespace alpaka
-{
-    namespace math
-    {
-        //#############################################################################
-        //! The CUDA built in min.
-        class MinCudaBuiltIn : public concepts::Implements<ConceptMathMin, MinCudaBuiltIn>
-        {
-        };
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The CUDA integral min trait specialization.
-            template<
-                typename Tx,
-                typename Ty>
-            struct Min<
-                MinCudaBuiltIn,
-                Tx,
-                Ty,
-                typename std::enable_if<
-                    std::is_integral<Tx>::value
-                    && std::is_integral<Ty>::value>::type>
-            {
-                __device__ static auto min(
-                    MinCudaBuiltIn const & min_ctx,
-                    Tx const & x,
-                    Ty const & y)
-                -> decltype(::min(x, y))
-                {
-                    alpaka::ignore_unused(min_ctx);
-                    return ::min(x, y);
-                }
-            };
-            //#############################################################################
-            //! The standard library mixed integral floating point min trait specialization.
-            template<
-                typename Tx,
-                typename Ty>
-            struct Min<
-                MinCudaBuiltIn,
-                Tx,
-                Ty,
-                typename std::enable_if<
-                    std::is_arithmetic<Tx>::value
-                    && std::is_arithmetic<Ty>::value
-                    && !(std::is_integral<Tx>::value
-                        && std::is_integral<Ty>::value)>::type>
-            {
-                __device__ static auto min(
-                    MinCudaBuiltIn const & min_ctx,
-                    Tx const & x,
-                    Ty const & y)
-                -> decltype(::fmin(x, y))
-                {
-                    alpaka::ignore_unused(min_ctx);
-                    return ::fmin(x, y);
-                }
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/alpaka/include/alpaka/math/min/MinHipBuiltIn.hpp b/thirdParty/alpaka/include/alpaka/math/min/MinHipBuiltIn.hpp
deleted file mode 100644
index 36ca06feee..0000000000
--- a/thirdParty/alpaka/include/alpaka/math/min/MinHipBuiltIn.hpp
+++ /dev/null
@@ -1,102 +0,0 @@
-/* Copyright 2019 Alexander Matthes, Axel Huebl, Benjamin Worpitz, Bert Wesarg, Matthias Werner
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_HIP
-    #error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
-#endif
-
-#include <alpaka/math/min/Traits.hpp>
-
-#include <alpaka/core/Unused.hpp>
-
-#if BOOST_COMP_NVCC >= BOOST_VERSION_NUMBER(9, 0, 0)
-    #include <cuda_runtime_api.h>
-#else
-    #if BOOST_COMP_HCC || BOOST_COMP_HIP
-        #include <math_functions.h>
-    #else
-        #include <math_functions.hpp>
-    #endif
-#endif
-
-#include <type_traits>
-
-#include <algorithm>
-
-namespace alpaka
-{
-    namespace math
-    {
-        //#############################################################################
-        //! The HIP min.
-        class MinHipBuiltIn : public concepts::Implements<ConceptMathMin, MinHipBuiltIn>
-        {
-        };
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The HIP integral min trait specialization.
-            template<
-                typename Tx,
-                typename Ty>
-            struct Min<
-                MinHipBuiltIn,
-                Tx,
-                Ty,
-                typename std::enable_if<
-                    std::is_integral<Tx>::value
-                    && std::is_integral<Ty>::value>::type>
-            {
-                __device__ static auto min(
-                    MinHipBuiltIn const & min_ctx,
-                    Tx const & x,
-                    Ty const & y)
-                -> decltype(::min(x, y))
-                {
-                    alpaka::ignore_unused(min_ctx);
-                    return ::min(x, y);
-                }
-            };
-            //#############################################################################
-            //! The HIP mixed integral floating point min trait specialization.
-            template<
-                typename Tx,
-                typename Ty>
-            struct Min<
-                MinHipBuiltIn,
-                Tx,
-                Ty,
-                typename std::enable_if<
-                    std::is_arithmetic<Tx>::value
-                    && std::is_arithmetic<Ty>::value
-                    && !(std::is_integral<Tx>::value
-                        && std::is_integral<Ty>::value)>::type>
-            {
-                __device__ static auto min(
-                    MinHipBuiltIn const & min_ctx,
-                    Tx const & x,
-                    Ty const & y)
-                -> decltype(::fmin(x, y))
-                {
-                    alpaka::ignore_unused(min_ctx);
-                    return ::fmin(x, y);
-                }
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/alpaka/include/alpaka/math/min/MinStdLib.hpp b/thirdParty/alpaka/include/alpaka/math/min/MinStdLib.hpp
deleted file mode 100644
index ec0c40fc74..0000000000
--- a/thirdParty/alpaka/include/alpaka/math/min/MinStdLib.hpp
+++ /dev/null
@@ -1,82 +0,0 @@
-/* Copyright 2019 Alexander Matthes, Axel Huebl, Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#include <alpaka/math/min/Traits.hpp>
-
-#include <alpaka/core/Unused.hpp>
-
-#include <type_traits>
-#include <cmath>
-#include <algorithm>
-
-namespace alpaka
-{
-    namespace math
-    {
-        //#############################################################################
-        //! The standard library min.
-        class MinStdLib : public concepts::Implements<ConceptMathMin, MinStdLib>
-        {
-        };
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The standard library integral min trait specialization.
-            template<
-                typename Tx,
-                typename Ty>
-            struct Min<
-                MinStdLib,
-                Tx,
-                Ty,
-                typename std::enable_if<
-                    std::is_integral<Tx>::value
-                    && std::is_integral<Ty>::value>::type>
-            {
-                ALPAKA_FN_HOST static auto min(
-                    MinStdLib const & min_ctx,
-                    Tx const & x,
-                    Ty const & y)
-                -> decltype(std::min(x, y))
-                {
-                    alpaka::ignore_unused(min_ctx);
-                    return std::min(x, y);
-                }
-            };
-            //#############################################################################
-            //! The standard library mixed integral floating point min trait specialization.
-            template<
-                typename Tx,
-                typename Ty>
-            struct Min<
-                MinStdLib,
-                Tx,
-                Ty,
-                typename std::enable_if<
-                    std::is_arithmetic<Tx>::value
-                    && std::is_arithmetic<Ty>::value
-                    && !(std::is_integral<Tx>::value
-                        && std::is_integral<Ty>::value)>::type>
-            {
-                ALPAKA_FN_HOST static auto min(
-                    MinStdLib const & min_ctx,
-                    Tx const & x,
-                    Ty const & y)
-                -> decltype(std::fmin(x, y))
-                {
-                    alpaka::ignore_unused(min_ctx);
-                    return std::fmin(x, y);
-                }
-            };
-        }
-    }
-}
diff --git a/thirdParty/alpaka/include/alpaka/math/min/Traits.hpp b/thirdParty/alpaka/include/alpaka/math/min/Traits.hpp
deleted file mode 100644
index 913e0d93c7..0000000000
--- a/thirdParty/alpaka/include/alpaka/math/min/Traits.hpp
+++ /dev/null
@@ -1,80 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#include <alpaka/core/Common.hpp>
-#include <alpaka/core/Concepts.hpp>
-
-#include <boost/config.hpp>
-
-#include <type_traits>
-
-namespace alpaka
-{
-    namespace math
-    {
-        struct ConceptMathMin;
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The min trait.
-            template<
-                typename T,
-                typename Tx,
-                typename Ty,
-                typename TSfinae = void>
-            struct Min;
-        }
-
-        //-----------------------------------------------------------------------------
-        //! Returns the smaller of two arguments.
-        //! NaNs are treated as missing data (between a NaN and a numeric value, the numeric value is chosen).
-        //!
-        //! \tparam T The type of the object specializing Min.
-        //! \tparam Tx The type of the first argument.
-        //! \tparam Ty The type of the second argument.
-        //! \param min_ctx The object specializing Min.
-        //! \param x The first argument.
-        //! \param y The second argument.
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            typename T,
-            typename Tx,
-            typename Ty>
-        ALPAKA_FN_HOST_ACC auto min(
-            T const & min_ctx,
-            Tx const & x,
-            Ty const & y)
-#ifdef BOOST_NO_CXX14_RETURN_TYPE_DEDUCTION
-        -> decltype(
-            traits::Min<
-                concepts::ImplementationBase<ConceptMathMin, T>,
-                Tx,
-                Ty>
-            ::min(
-                min_ctx,
-                x,
-                y))
-#endif
-        {
-            using ImplementationBase = concepts::ImplementationBase<ConceptMathMin, T>;
-            return
-                traits::Min<
-                    ImplementationBase,
-                    Tx,
-                    Ty>
-                ::min(
-                    min_ctx,
-                    x,
-                    y);
-        }
-    }
-}
diff --git a/thirdParty/alpaka/include/alpaka/math/pow/PowCudaBuiltIn.hpp b/thirdParty/alpaka/include/alpaka/math/pow/PowCudaBuiltIn.hpp
deleted file mode 100644
index bb1e41a48b..0000000000
--- a/thirdParty/alpaka/include/alpaka/math/pow/PowCudaBuiltIn.hpp
+++ /dev/null
@@ -1,84 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Bert Wesarg
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_CUDA
-    #error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
-#endif
-
-#include <alpaka/math/pow/Traits.hpp>
-
-#include <alpaka/core/Unused.hpp>
-
-#include <cuda_runtime.h>
-
-#include <type_traits>
-
-namespace alpaka
-{
-    namespace math
-    {
-        //#############################################################################
-        //! The CUDA built in pow.
-        class PowCudaBuiltIn : public concepts::Implements<ConceptMathPow, PowCudaBuiltIn>
-        {
-        };
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The CUDA pow trait specialization.
-            template<
-                typename TBase,
-                typename TExp>
-            struct Pow<
-                PowCudaBuiltIn,
-                TBase,
-                TExp,
-                typename std::enable_if<
-                    std::is_floating_point<TBase>::value
-                    && std::is_floating_point<TExp>::value>::type>
-            {
-                __device__ static auto pow(
-                    PowCudaBuiltIn const & pow_ctx,
-                    TBase const & base,
-                    TExp const & exp)
-                -> decltype(::pow(base, exp))
-                {
-                    alpaka::ignore_unused(pow_ctx);
-                    return ::pow(base, exp);
-                }
-            };
-            //! The CUDA pow float specialization.
-            template<>
-            struct Pow<
-                PowCudaBuiltIn,
-                float,
-                float>
-            {
-                __device__ static auto pow(
-                    PowCudaBuiltIn const & pow_ctx,
-                    float const & base,
-                    float const & exp)
-                -> float
-                {
-                    alpaka::ignore_unused(pow_ctx);
-                    return ::powf(base, exp);
-                }
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/alpaka/include/alpaka/math/pow/PowHipBuiltIn.hpp b/thirdParty/alpaka/include/alpaka/math/pow/PowHipBuiltIn.hpp
deleted file mode 100644
index de6642f5be..0000000000
--- a/thirdParty/alpaka/include/alpaka/math/pow/PowHipBuiltIn.hpp
+++ /dev/null
@@ -1,92 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Bert Wesarg, Matthias Werner
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_HIP
-    #error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
-#endif
-
-#include <alpaka/math/pow/Traits.hpp>
-
-#include <alpaka/core/Unused.hpp>
-
-#if BOOST_COMP_NVCC >= BOOST_VERSION_NUMBER(9, 0, 0)
-    #include <cuda_runtime_api.h>
-#else
-    #if BOOST_COMP_HCC || BOOST_COMP_HIP
-        #include <math_functions.h>
-    #else
-        #include <math_functions.hpp>
-    #endif
-#endif
-
-#include <type_traits>
-
-namespace alpaka
-{
-    namespace math
-    {
-        //#############################################################################
-        //! The HIP pow.
-        class PowHipBuiltIn : public concepts::Implements<ConceptMathPow, PowHipBuiltIn>
-        {
-        };
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The HIP pow trait specialization.
-            template<
-                typename TBase,
-                typename TExp>
-            struct Pow<
-                PowHipBuiltIn,
-                TBase,
-                TExp,
-                typename std::enable_if<
-                    std::is_floating_point<TBase>::value
-                    && std::is_floating_point<TExp>::value>::type>
-            {
-                __device__ static auto pow(
-                    PowHipBuiltIn const & pow_ctx,
-                    TBase const & base,
-                    TExp const & exp)
-                -> decltype(::pow(base, exp))
-                {
-                    alpaka::ignore_unused(pow_ctx);
-                    return ::pow(base, exp);
-                }
-            };
-            //! The HIP pow float specialization.
-            template<>
-            struct Pow<
-                PowHipBuiltIn,
-                float,
-                float>
-            {
-                __device__ static auto pow(
-                    PowHipBuiltIn const & pow_ctx,
-                    float const & base,
-                    float const & exp)
-                -> float
-                {
-                    alpaka::ignore_unused(pow_ctx);
-                    return ::powf(base, exp);
-                }
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/alpaka/include/alpaka/math/pow/PowStdLib.hpp b/thirdParty/alpaka/include/alpaka/math/pow/PowStdLib.hpp
deleted file mode 100644
index 219432ab8b..0000000000
--- a/thirdParty/alpaka/include/alpaka/math/pow/PowStdLib.hpp
+++ /dev/null
@@ -1,56 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#include <alpaka/math/pow/Traits.hpp>
-
-#include <alpaka/core/Unused.hpp>
-
-#include <type_traits>
-#include <cmath>
-
-namespace alpaka
-{
-    namespace math
-    {
-        //#############################################################################
-        //! The standard library pow.
-        class PowStdLib : public concepts::Implements<ConceptMathPow, PowStdLib>
-        {
-        };
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The standard library pow trait specialization.
-            template<
-                typename TBase,
-                typename TExp>
-            struct Pow<
-                PowStdLib,
-                TBase,
-                TExp,
-                typename std::enable_if<
-                    std::is_arithmetic<TBase>::value
-                    && std::is_arithmetic<TExp>::value>::type>
-            {
-                ALPAKA_FN_HOST static auto pow(
-                    PowStdLib const & pow_ctx,
-                    TBase const & base,
-                    TExp const & exp)
-                -> decltype(std::pow(base, exp))
-                {
-                    alpaka::ignore_unused(pow_ctx);
-                    return std::pow(base, exp);
-                }
-            };
-        }
-    }
-}
diff --git a/thirdParty/alpaka/include/alpaka/math/pow/Traits.hpp b/thirdParty/alpaka/include/alpaka/math/pow/Traits.hpp
deleted file mode 100644
index f45629cf03..0000000000
--- a/thirdParty/alpaka/include/alpaka/math/pow/Traits.hpp
+++ /dev/null
@@ -1,79 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#include <alpaka/core/Common.hpp>
-#include <alpaka/core/Concepts.hpp>
-
-#include <boost/config.hpp>
-
-#include <type_traits>
-
-namespace alpaka
-{
-    namespace math
-    {
-        struct ConceptMathPow;
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The pow trait.
-            template<
-                typename T,
-                typename TBase,
-                typename TExp,
-                typename TSfinae = void>
-            struct Pow;
-        }
-
-        //-----------------------------------------------------------------------------
-        //! Computes the value of base raised to the power exp.
-        //!
-        //! \tparam T The type of the object specializing Pow.
-        //! \tparam TBase The base type.
-        //! \tparam TExp The exponent type.
-        //! \param pow_ctx The object specializing Pow.
-        //! \param base The base.
-        //! \param exp The exponent.
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            typename T,
-            typename TBase,
-            typename TExp>
-        ALPAKA_FN_HOST_ACC auto pow(
-            T const & pow_ctx,
-            TBase const & base,
-            TExp const & exp)
-#ifdef BOOST_NO_CXX14_RETURN_TYPE_DEDUCTION
-        -> decltype(
-            traits::Pow<
-                concepts::ImplementationBase<ConceptMathPow, T>,
-                TBase,
-                TExp>
-            ::pow(
-                pow_ctx,
-                base,
-                exp))
-#endif
-        {
-            using ImplementationBase = concepts::ImplementationBase<ConceptMathPow, T>;
-            return
-                traits::Pow<
-                    ImplementationBase,
-                    TBase,
-                    TExp>
-                ::pow(
-                    pow_ctx,
-                    base,
-                    exp);
-        }
-    }
-}
diff --git a/thirdParty/alpaka/include/alpaka/math/remainder/RemainderCudaBuiltIn.hpp b/thirdParty/alpaka/include/alpaka/math/remainder/RemainderCudaBuiltIn.hpp
deleted file mode 100644
index ae7db890ae..0000000000
--- a/thirdParty/alpaka/include/alpaka/math/remainder/RemainderCudaBuiltIn.hpp
+++ /dev/null
@@ -1,90 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Bert Wesarg
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_CUDA
-    #error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
-#endif
-
-#include <alpaka/math/remainder/Traits.hpp>
-
-#include <alpaka/core/Unused.hpp>
-
-#include <cuda_runtime.h>
-
-#include <type_traits>
-
-namespace alpaka
-{
-    namespace math
-    {
-        //#############################################################################
-        //! The CUDA built in remainder.
-        class RemainderCudaBuiltIn : public concepts::Implements<ConceptMathRemainder, RemainderCudaBuiltIn>
-        {
-        };
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The CUDA remainder trait specialization.
-            template<
-                typename Tx,
-                typename Ty>
-            struct Remainder<
-                RemainderCudaBuiltIn,
-                Tx,
-                Ty,
-                typename std::enable_if<
-                    std::is_floating_point<Tx>::value
-                    && std::is_floating_point<Ty>::value>::type>
-            {
-                __device__ static auto remainder(
-                    RemainderCudaBuiltIn const & remainder_ctx,
-                    Tx const & x,
-                    Ty const & y)
-                -> decltype(::remainder(
-                    x,
-                    y))
-                {
-                    alpaka::ignore_unused(remainder_ctx);
-                    return ::remainder(
-                        x,
-                        y);
-                }
-            };
-            //! The CUDA remainder float specialization.
-            template<>
-            struct Remainder<
-                RemainderCudaBuiltIn,
-                float,
-                float>
-            {
-                __device__ static auto remainder(
-                    RemainderCudaBuiltIn const & remainder_ctx,
-                    float const & x,
-                    float const & y)
-                -> float
-                {
-                    alpaka::ignore_unused(remainder_ctx);
-                    return ::remainderf(
-                        x,
-                        y);
-                }
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/alpaka/include/alpaka/math/remainder/RemainderHipBuiltIn.hpp b/thirdParty/alpaka/include/alpaka/math/remainder/RemainderHipBuiltIn.hpp
deleted file mode 100644
index 2e3fe8ac8b..0000000000
--- a/thirdParty/alpaka/include/alpaka/math/remainder/RemainderHipBuiltIn.hpp
+++ /dev/null
@@ -1,94 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Bert Wesarg, Matthias Werner
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_HIP
-    #error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
-#endif
-
-#include <alpaka/math/remainder/Traits.hpp>
-
-#include <alpaka/core/Unused.hpp>
-
-#if BOOST_COMP_NVCC >= BOOST_VERSION_NUMBER(9, 0, 0)
-    #include <cuda_runtime_api.h>
-#else
-    #if BOOST_COMP_HCC || BOOST_COMP_HIP
-        #include <math_functions.h>
-    #else
-        #include <math_functions.hpp>
-    #endif
-#endif
-
-#include <type_traits>
-
-namespace alpaka
-{
-    namespace math
-    {
-        //#############################################################################
-        //! The HIP remainder.
-        class RemainderHipBuiltIn : public concepts::Implements<ConceptMathRemainder, RemainderHipBuiltIn>
-        {
-        };
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The HIP remainder trait specialization.
-            template<
-                typename Tx,
-                typename Ty>
-            struct Remainder<
-                RemainderHipBuiltIn,
-                Tx,
-                Ty,
-                typename std::enable_if<
-                    std::is_floating_point<Tx>::value
-                    && std::is_floating_point<Ty>::value>::type>
-            {
-                __device__ static auto remainder(
-                    RemainderHipBuiltIn const & remainder_ctx,
-                    Tx const & x,
-                    Ty const & y)
-                -> decltype(::remainder(x, y))
-                {
-                    alpaka::ignore_unused(remainder_ctx);
-                    return ::remainder(x, y);
-                }
-            };
-            //! The HIP remainder float specialization.
-            template<>
-            struct Remainder<
-                RemainderHipBuiltIn,
-                float,
-                float>
-            {
-                __device__ static auto remainder(
-                    RemainderHipBuiltIn const & remainder_ctx,
-                    float const & x,
-                    float const & y)
-                -> float
-                {
-                    alpaka::ignore_unused(remainder_ctx);
-                    return ::remainderf(
-                        x,
-                        y);
-                }
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/alpaka/include/alpaka/math/remainder/RemainderStdLib.hpp b/thirdParty/alpaka/include/alpaka/math/remainder/RemainderStdLib.hpp
deleted file mode 100644
index f7e21cbbc5..0000000000
--- a/thirdParty/alpaka/include/alpaka/math/remainder/RemainderStdLib.hpp
+++ /dev/null
@@ -1,56 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#include <alpaka/math/remainder/Traits.hpp>
-
-#include <alpaka/core/Unused.hpp>
-
-#include <type_traits>
-#include <cmath>
-
-namespace alpaka
-{
-    namespace math
-    {
-        //#############################################################################
-        //! The standard library remainder.
-        class RemainderStdLib : public concepts::Implements<ConceptMathRemainder, RemainderStdLib>
-        {
-        };
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The standard library remainder trait specialization.
-            template<
-                typename Tx,
-                typename Ty>
-            struct Remainder<
-                RemainderStdLib,
-                Tx,
-                Ty,
-                typename std::enable_if<
-                    std::is_floating_point<Tx>::value
-                    && std::is_floating_point<Ty>::value>::type>
-            {
-                ALPAKA_FN_HOST static auto remainder(
-                    RemainderStdLib const & remainder_ctx,
-                    Tx const & x,
-                    Ty const & y)
-                -> decltype(std::remainder(x, y))
-                {
-                    alpaka::ignore_unused(remainder_ctx);
-                    return std::remainder(x, y);
-                }
-            };
-        }
-    }
-}
diff --git a/thirdParty/alpaka/include/alpaka/math/remainder/Traits.hpp b/thirdParty/alpaka/include/alpaka/math/remainder/Traits.hpp
deleted file mode 100644
index 9300bf6fdc..0000000000
--- a/thirdParty/alpaka/include/alpaka/math/remainder/Traits.hpp
+++ /dev/null
@@ -1,79 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#include <alpaka/core/Common.hpp>
-#include <alpaka/core/Concepts.hpp>
-
-#include <boost/config.hpp>
-
-#include <type_traits>
-
-namespace alpaka
-{
-    namespace math
-    {
-        struct ConceptMathRemainder;
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The remainder trait.
-            template<
-                typename T,
-                typename Tx,
-                typename Ty,
-                typename TSfinae = void>
-            struct Remainder;
-        }
-
-        //-----------------------------------------------------------------------------
-        //! Computes the IEEE remainder of the floating point division operation x/y.
-        //!
-        //! \tparam T The type of the object specializing Remainder.
-        //! \tparam Tx The type of the first argument.
-        //! \tparam Ty The type of the second argument.
-        //! \param remainder_ctx The object specializing Max.
-        //! \param x The first argument.
-        //! \param y The second argument.
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            typename T,
-            typename Tx,
-            typename Ty>
-        ALPAKA_FN_HOST_ACC auto remainder(
-            T const & remainder_ctx,
-            Tx const & x,
-            Ty const & y)
-#ifdef BOOST_NO_CXX14_RETURN_TYPE_DEDUCTION
-        -> decltype(
-            traits::Remainder<
-                concepts::ImplementationBase<ConceptMathRemainder, T>,
-                Tx,
-                Ty>
-            ::remainder(
-                remainder_ctx,
-                x,
-                y))
-#endif
-        {
-            using ImplementationBase = concepts::ImplementationBase<ConceptMathRemainder, T>;
-            return
-                traits::Remainder<
-                    ImplementationBase,
-                    Tx,
-                    Ty>
-                ::remainder(
-                    remainder_ctx,
-                    x,
-                    y);
-        }
-    }
-}
diff --git a/thirdParty/alpaka/include/alpaka/math/round/RoundCudaBuiltIn.hpp b/thirdParty/alpaka/include/alpaka/math/round/RoundCudaBuiltIn.hpp
deleted file mode 100644
index 883f39584c..0000000000
--- a/thirdParty/alpaka/include/alpaka/math/round/RoundCudaBuiltIn.hpp
+++ /dev/null
@@ -1,116 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Bert Wesarg
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_CUDA
-    #error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
-#endif
-
-#include <alpaka/math/round/Traits.hpp>
-
-#include <alpaka/core/Unused.hpp>
-
-#include <cuda_runtime.h>
-
-#include <type_traits>
-
-namespace alpaka
-{
-    namespace math
-    {
-        //#############################################################################
-        //! The CUDA round.
-        class RoundCudaBuiltIn : public concepts::Implements<ConceptMathRound, RoundCudaBuiltIn>
-        {
-        };
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The CUDA round trait specialization.
-            template<
-                typename TArg>
-            struct Round<
-                RoundCudaBuiltIn,
-                TArg,
-                typename std::enable_if<
-                    std::is_floating_point<TArg>::value>::type>
-            {
-                __device__ static auto round(
-                    RoundCudaBuiltIn const & round_ctx,
-                    TArg const & arg)
-                -> decltype(::round(arg))
-                {
-                    alpaka::ignore_unused(round_ctx);
-                    return ::round(arg);
-                }
-            };
-            //#############################################################################
-            //! The CUDA lround trait specialization.
-            template<
-                typename TArg>
-            struct Lround<
-                RoundCudaBuiltIn,
-                TArg,
-                typename std::enable_if<
-                    std::is_floating_point<TArg>::value>::type>
-            {
-                __device__ static auto lround(
-                    RoundCudaBuiltIn const & lround_ctx,
-                    TArg const & arg)
-                -> long int
-                {
-                    alpaka::ignore_unused(lround_ctx);
-                    return ::lround(arg);
-                }
-            };
-            //#############################################################################
-            //! The CUDA llround trait specialization.
-            template<
-                typename TArg>
-            struct Llround<
-                RoundCudaBuiltIn,
-                TArg,
-                typename std::enable_if<
-                    std::is_floating_point<TArg>::value>::type>
-            {
-                __device__ static auto llround(
-                    RoundCudaBuiltIn const & llround_ctx,
-                    TArg const & arg)
-                -> long int
-                {
-                    alpaka::ignore_unused(llround_ctx);
-                    return ::llround(arg);
-                }
-            };
-            //! The CUDA round float specialization.
-            template<>
-            struct Round<
-                RoundCudaBuiltIn,
-                float>
-            {
-                __device__ static auto round(
-                    RoundCudaBuiltIn const & round_ctx,
-                    float const & arg)
-                -> float
-                {
-                    alpaka::ignore_unused(round_ctx);
-                    return ::roundf(arg);
-                }
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/alpaka/include/alpaka/math/round/RoundHipBuiltIn.hpp b/thirdParty/alpaka/include/alpaka/math/round/RoundHipBuiltIn.hpp
deleted file mode 100644
index 7e1aeb798b..0000000000
--- a/thirdParty/alpaka/include/alpaka/math/round/RoundHipBuiltIn.hpp
+++ /dev/null
@@ -1,124 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Bert Wesarg, Matthias Werner
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_HIP
-    #error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
-#endif
-
-#include <alpaka/math/round/Traits.hpp>
-
-#include <alpaka/core/Unused.hpp>
-
-#if BOOST_COMP_NVCC >= BOOST_VERSION_NUMBER(9, 0, 0)
-    #include <cuda_runtime_api.h>
-#else
-    #if BOOST_COMP_HCC || BOOST_COMP_HIP
-        #include <math_functions.h>
-    #else
-        #include <math_functions.hpp>
-    #endif
-#endif
-
-#include <type_traits>
-
-namespace alpaka
-{
-    namespace math
-    {
-        //#############################################################################
-        //! The HIP round.
-        class RoundHipBuiltIn : public concepts::Implements<ConceptMathRound, RoundHipBuiltIn>
-        {
-        };
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The HIP round trait specialization.
-            template<
-                typename TArg>
-            struct Round<
-                RoundHipBuiltIn,
-                TArg,
-                typename std::enable_if<
-                    std::is_floating_point<TArg>::value>::type>
-            {
-                __device__ static auto round(
-                    RoundHipBuiltIn const & round_ctx,
-                    TArg const & arg)
-                -> decltype(::round(arg))
-                {
-                    alpaka::ignore_unused(round_ctx);
-                    return ::round(arg);
-                }
-            };
-            //#############################################################################
-            //! The HIP round trait specialization.
-            template<
-                typename TArg>
-            struct Lround<
-                RoundHipBuiltIn,
-                TArg,
-                typename std::enable_if<
-                    std::is_floating_point<TArg>::value>::type>
-            {
-                __device__ static auto lround(
-                    RoundHipBuiltIn const & lround_ctx,
-                    TArg const & arg)
-                -> long int
-                {
-                    alpaka::ignore_unused(lround_ctx);
-                    return ::lround(arg);
-                }
-            };
-            //#############################################################################
-            //! The standard library round trait specialization.
-            template<
-                typename TArg>
-            struct Llround<
-                RoundHipBuiltIn,
-                TArg,
-                typename std::enable_if<
-                    std::is_floating_point<TArg>::value>::type>
-            {
-                __device__ static auto llround(
-                    RoundHipBuiltIn const & llround_ctx,
-                    TArg const & arg)
-                -> long int
-                {
-                    alpaka::ignore_unused(llround_ctx);
-                    return ::llround(arg);
-                }
-            };
-
-            template<>
-            struct Round<
-                RoundHipBuiltIn,
-                float>
-            {
-                __device__ static auto round(
-                    RoundHipBuiltIn const & round_ctx,
-                    float const & arg)
-                -> float
-                {
-                    alpaka::ignore_unused(round_ctx);
-                    return ::roundf(arg);
-                }
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/alpaka/include/alpaka/math/round/RoundStdLib.hpp b/thirdParty/alpaka/include/alpaka/math/round/RoundStdLib.hpp
deleted file mode 100644
index b16ae63e03..0000000000
--- a/thirdParty/alpaka/include/alpaka/math/round/RoundStdLib.hpp
+++ /dev/null
@@ -1,90 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#include <alpaka/math/round/Traits.hpp>
-
-#include <alpaka/core/Unused.hpp>
-
-#include <type_traits>
-#include <cmath>
-
-namespace alpaka
-{
-    namespace math
-    {
-        //#############################################################################
-        //! The standard library round.
-        class RoundStdLib : public concepts::Implements<ConceptMathRound, RoundStdLib>
-        {
-        };
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The standard library round trait specialization.
-            template<
-                typename TArg>
-            struct Round<
-                RoundStdLib,
-                TArg,
-                typename std::enable_if<
-                    std::is_arithmetic<TArg>::value>::type>
-            {
-                ALPAKA_FN_HOST static auto round(
-                    RoundStdLib const & round_ctx,
-                    TArg const & arg)
-                -> decltype(std::round(arg))
-                {
-                    alpaka::ignore_unused(round_ctx);
-                    return std::round(arg);
-                }
-            };
-            //#############################################################################
-            //! The standard library round trait specialization.
-            template<
-                typename TArg>
-            struct Lround<
-                RoundStdLib,
-                TArg,
-                typename std::enable_if<
-                    std::is_arithmetic<TArg>::value>::type>
-            {
-                ALPAKA_FN_HOST static auto lround(
-                    RoundStdLib const & lround_ctx,
-                    TArg const & arg)
-                -> long int
-                {
-                    alpaka::ignore_unused(lround_ctx);
-                    return std::lround(arg);
-                }
-            };
-            //#############################################################################
-            //! The standard library round trait specialization.
-            template<
-                typename TArg>
-            struct Llround<
-                RoundStdLib,
-                TArg,
-                typename std::enable_if<
-                    std::is_arithmetic<TArg>::value>::type>
-            {
-                ALPAKA_FN_HOST static auto llround(
-                    RoundStdLib const & llround_ctx,
-                    TArg const & arg)
-                -> long int
-                {
-                    alpaka::ignore_unused(llround_ctx);
-                    return std::llround(arg);
-                }
-            };
-        }
-    }
-}
diff --git a/thirdParty/alpaka/include/alpaka/math/round/Traits.hpp b/thirdParty/alpaka/include/alpaka/math/round/Traits.hpp
deleted file mode 100644
index 1029281383..0000000000
--- a/thirdParty/alpaka/include/alpaka/math/round/Traits.hpp
+++ /dev/null
@@ -1,137 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#include <alpaka/core/Common.hpp>
-#include <alpaka/core/Concepts.hpp>
-#include <alpaka/core/Unused.hpp>
-
-#include <boost/config.hpp>
-
-#include <type_traits>
-
-namespace alpaka
-{
-    namespace math
-    {
-        struct ConceptMathRound;
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The round trait.
-            template<
-                typename T,
-                typename TArg,
-                typename TSfinae = void>
-            struct Round;
-
-            //#############################################################################
-            //! The round trait.
-            template<
-                typename T,
-                typename TArg,
-                typename TSfinae = void>
-            struct Lround;
-
-            //#############################################################################
-            //! The round trait.
-            template<
-                typename T,
-                typename TArg,
-                typename TSfinae = void>
-            struct Llround;
-        }
-
-        //-----------------------------------------------------------------------------
-        //! Computes the nearest integer value to arg (in floating-point format), rounding halfway cases away from zero, regardless of the current rounding mode.
-        //!
-        //! \tparam T The type of the object specializing Round.
-        //! \tparam TArg The arg type.
-        //! \param round_ctx The object specializing Round.
-        //! \param arg The arg.
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            typename T,
-            typename TArg>
-        ALPAKA_FN_HOST_ACC auto round(
-            T const & round_ctx,
-            TArg const & arg)
-#ifdef BOOST_NO_CXX14_RETURN_TYPE_DEDUCTION
-        -> decltype(
-            traits::Round<
-                concepts::ImplementationBase<ConceptMathRound, T>,
-                TArg>
-            ::round(
-                round_ctx,
-                arg))
-#endif
-        {
-            using ImplementationBase = concepts::ImplementationBase<ConceptMathRound, T>;
-            return
-                traits::Round<
-                    ImplementationBase,
-                    TArg>
-                ::round(
-                    round_ctx,
-                    arg);
-        }
-        //-----------------------------------------------------------------------------
-        //! Computes the nearest integer value to arg (in integer format), rounding halfway cases away from zero, regardless of the current rounding mode.
-        //!
-        //! \tparam T The type of the object specializing Round.
-        //! \tparam TArg The arg type.
-        //! \param lround_ctx The object specializing Round.
-        //! \param arg The arg.
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            typename T,
-            typename TArg>
-        ALPAKA_FN_HOST_ACC auto lround(
-            T const & lround_ctx,
-            TArg const & arg)
-        -> long int
-        {
-            using ImplementationBase = concepts::ImplementationBase<ConceptMathRound, T>;
-            return
-                traits::Lround<
-                    ImplementationBase,
-                    TArg>
-                ::lround(
-                    lround_ctx,
-                    arg);
-        }
-        //-----------------------------------------------------------------------------
-        //! Computes the nearest integer value to arg (in integer format), rounding halfway cases away from zero, regardless of the current rounding mode.
-        //!
-        //! \tparam T The type of the object specializing Round.
-        //! \tparam TArg The arg type.
-        //! \param llround_ctx The object specializing Round.
-        //! \param arg The arg.
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            typename T,
-            typename TArg>
-        ALPAKA_FN_HOST_ACC auto llround(
-            T const & llround_ctx,
-            TArg const & arg)
-        -> long long int
-        {
-            using ImplementationBase = concepts::ImplementationBase<ConceptMathRound, T>;
-            return
-                traits::Llround<
-                    ImplementationBase,
-                    TArg>
-                ::llround(
-                    llround_ctx,
-                    arg);
-        }
-    }
-}
diff --git a/thirdParty/alpaka/include/alpaka/math/rsqrt/RsqrtCudaBuiltIn.hpp b/thirdParty/alpaka/include/alpaka/math/rsqrt/RsqrtCudaBuiltIn.hpp
deleted file mode 100644
index eb80f4b2fc..0000000000
--- a/thirdParty/alpaka/include/alpaka/math/rsqrt/RsqrtCudaBuiltIn.hpp
+++ /dev/null
@@ -1,78 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Bert Wesarg, Valentin Gehrke
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_CUDA
-    #error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
-#endif
-
-#include <alpaka/math/rsqrt/Traits.hpp>
-
-#include <alpaka/core/Unused.hpp>
-
-#include <cuda_runtime.h>
-
-#include <type_traits>
-
-namespace alpaka
-{
-    namespace math
-    {
-        //#############################################################################
-        //! The CUDA rsqrt.
-        class RsqrtCudaBuiltIn : public concepts::Implements<ConceptMathRsqrt, RsqrtCudaBuiltIn>
-        {
-        };
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The CUDA rsqrt trait specialization.
-            template<
-                typename TArg>
-            struct Rsqrt<
-                RsqrtCudaBuiltIn,
-                TArg,
-                typename std::enable_if<
-                    std::is_arithmetic<TArg>::value>::type>
-            {
-                __device__ static auto rsqrt(
-                    RsqrtCudaBuiltIn const & rsqrt_ctx,
-                    TArg const & arg)
-                -> decltype(::rsqrt(arg))
-                {
-                    alpaka::ignore_unused(rsqrt_ctx);
-                    return ::rsqrt(arg);
-                }
-            };
-            //! The CUDA rsqrt float specialization.
-            template<>
-            struct Rsqrt<
-                RsqrtCudaBuiltIn,
-                float>
-            {
-                __device__ static auto rsqrt(
-                    RsqrtCudaBuiltIn const & rsqrt_ctx,
-                    float const & arg)
-                -> float
-                {
-                    alpaka::ignore_unused(rsqrt_ctx);
-                    return ::rsqrtf(arg);
-                }
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/alpaka/include/alpaka/math/rsqrt/RsqrtHipBuiltIn.hpp b/thirdParty/alpaka/include/alpaka/math/rsqrt/RsqrtHipBuiltIn.hpp
deleted file mode 100644
index a6f989136c..0000000000
--- a/thirdParty/alpaka/include/alpaka/math/rsqrt/RsqrtHipBuiltIn.hpp
+++ /dev/null
@@ -1,86 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Bert Wesarg, Matthias Werner, Valentin Gehrke
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_HIP
-    #error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
-#endif
-
-#include <alpaka/math/rsqrt/Traits.hpp>
-
-#include <alpaka/core/Unused.hpp>
-
-#if BOOST_COMP_NVCC >= BOOST_VERSION_NUMBER(9, 0, 0)
-    #include <cuda_runtime_api.h>
-#else
-    #if BOOST_COMP_HCC || BOOST_COMP_HIP
-        #include <math_functions.h>
-    #else
-        #include <math_functions.hpp>
-    #endif
-#endif
-
-#include <type_traits>
-
-namespace alpaka
-{
-    namespace math
-    {
-        //#############################################################################
-        //! The HIP rsqrt.
-        class RsqrtHipBuiltIn : public concepts::Implements<ConceptMathRsqrt, RsqrtHipBuiltIn>
-        {
-        };
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The HIP rsqrt trait specialization.
-            template<
-                typename TArg>
-            struct Rsqrt<
-                RsqrtHipBuiltIn,
-                TArg,
-                typename std::enable_if<
-                    std::is_arithmetic<TArg>::value>::type>
-            {
-                __device__ static auto rsqrt(
-                    RsqrtHipBuiltIn const & rsqrt_ctx,
-                    TArg const & arg)
-                -> decltype(::rsqrt(arg))
-                {
-                    alpaka::ignore_unused(rsqrt_ctx);
-                    return ::rsqrt(arg);
-                }
-            };
-            //! The HIP rsqrt float specialization.
-            template<>
-            struct Rsqrt<
-                RsqrtHipBuiltIn,
-                float>
-            {
-                __device__ static auto rsqrt(
-                    RsqrtHipBuiltIn const & rsqrt_ctx,
-                    float const & arg)
-                -> float
-                {
-                    alpaka::ignore_unused(rsqrt_ctx);
-                    return ::rsqrtf(arg);
-                }
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/alpaka/include/alpaka/math/rsqrt/RsqrtStdLib.hpp b/thirdParty/alpaka/include/alpaka/math/rsqrt/RsqrtStdLib.hpp
deleted file mode 100644
index 3d5bd68048..0000000000
--- a/thirdParty/alpaka/include/alpaka/math/rsqrt/RsqrtStdLib.hpp
+++ /dev/null
@@ -1,52 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#include <alpaka/math/rsqrt/Traits.hpp>
-
-#include <alpaka/core/Unused.hpp>
-
-#include <type_traits>
-#include <cmath>
-
-namespace alpaka
-{
-    namespace math
-    {
-        //#############################################################################
-        //! The standard library rsqrt.
-        class RsqrtStdLib : public concepts::Implements<ConceptMathRsqrt, RsqrtStdLib>
-        {
-        };
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The standard library rsqrt trait specialization.
-            template<
-                typename TArg>
-            struct Rsqrt<
-                RsqrtStdLib,
-                TArg,
-                typename std::enable_if<
-                    std::is_arithmetic<TArg>::value>::type>
-            {
-                ALPAKA_FN_HOST static auto rsqrt(
-                    RsqrtStdLib const & rsqrt_ctx,
-                    TArg const & arg)
-                -> decltype(std::sqrt(arg))
-                {
-                    alpaka::ignore_unused(rsqrt_ctx);
-                    return static_cast<TArg>(1)/std::sqrt(arg);
-                }
-            };
-        }
-    }
-}
diff --git a/thirdParty/alpaka/include/alpaka/math/rsqrt/Traits.hpp b/thirdParty/alpaka/include/alpaka/math/rsqrt/Traits.hpp
deleted file mode 100644
index 8e33c29f19..0000000000
--- a/thirdParty/alpaka/include/alpaka/math/rsqrt/Traits.hpp
+++ /dev/null
@@ -1,70 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#include <alpaka/core/Common.hpp>
-#include <alpaka/core/Concepts.hpp>
-
-#include <boost/config.hpp>
-
-#include <type_traits>
-
-namespace alpaka
-{
-    namespace math
-    {
-        struct ConceptMathRsqrt;
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The rsqrt trait.
-            template<
-                typename T,
-                typename TArg,
-                typename TSfinae = void>
-            struct Rsqrt;
-        }
-
-        //-----------------------------------------------------------------------------
-        //! Computes the rsqrt.
-        //!
-        //! \tparam T The type of the object specializing Rsqrt.
-        //! \tparam TArg The arg type.
-        //! \param rsqrt_ctx The object specializing Rsqrt.
-        //! \param arg The arg.
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            typename T,
-            typename TArg>
-        ALPAKA_FN_HOST_ACC auto rsqrt(
-            T const & rsqrt_ctx,
-            TArg const & arg)
-#ifdef BOOST_NO_CXX14_RETURN_TYPE_DEDUCTION
-        -> decltype(
-            traits::Rsqrt<
-                concepts::ImplementationBase<ConceptMathRsqrt, T>,
-                TArg>
-            ::rsqrt(
-                rsqrt_ctx,
-                arg))
-#endif
-        {
-            using ImplementationBase = concepts::ImplementationBase<ConceptMathRsqrt, T>;
-            return
-                traits::Rsqrt<
-                    ImplementationBase,
-                    TArg>
-                ::rsqrt(
-                    rsqrt_ctx,
-                    arg);
-        }
-    }
-}
diff --git a/thirdParty/alpaka/include/alpaka/math/sin/SinCudaBuiltIn.hpp b/thirdParty/alpaka/include/alpaka/math/sin/SinCudaBuiltIn.hpp
deleted file mode 100644
index e52ec751af..0000000000
--- a/thirdParty/alpaka/include/alpaka/math/sin/SinCudaBuiltIn.hpp
+++ /dev/null
@@ -1,78 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Bert Wesarg, Valentin Gehrke
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_CUDA
-    #error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
-#endif
-
-#include <alpaka/math/sin/Traits.hpp>
-
-#include <alpaka/core/Unused.hpp>
-
-#include <cuda_runtime.h>
-
-#include <type_traits>
-
-namespace alpaka
-{
-    namespace math
-    {
-        //#############################################################################
-        //! The CUDA sin.
-        class SinCudaBuiltIn : public concepts::Implements<ConceptMathSin, SinCudaBuiltIn>
-        {
-        };
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The CUDA sin trait specialization.
-            template<
-                typename TArg>
-            struct Sin<
-                SinCudaBuiltIn,
-                TArg,
-                typename std::enable_if<
-                    std::is_floating_point<TArg>::value>::type>
-            {
-                __device__ static auto sin(
-                    SinCudaBuiltIn const & sin_ctx,
-                    TArg const & arg)
-                -> decltype(::sin(arg))
-                {
-                    alpaka::ignore_unused(sin_ctx);
-                    return ::sin(arg);
-                }
-            };
-            //! The CUDA sin float specialization.
-            template<>
-            struct Sin<
-                SinCudaBuiltIn,
-                float>
-            {
-                __device__ static auto sin(
-                    SinCudaBuiltIn const & sin_ctx,
-                    float const & arg)
-                -> float
-                {
-                    alpaka::ignore_unused(sin_ctx);
-                    return ::sinf(arg);
-                }
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/alpaka/include/alpaka/math/sin/SinHipBuiltIn.hpp b/thirdParty/alpaka/include/alpaka/math/sin/SinHipBuiltIn.hpp
deleted file mode 100644
index 86faad5c55..0000000000
--- a/thirdParty/alpaka/include/alpaka/math/sin/SinHipBuiltIn.hpp
+++ /dev/null
@@ -1,86 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Bert Wesarg, Matthias Werner, Valentin Gehrke
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_HIP
-    #error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
-#endif
-
-#include <alpaka/math/sin/Traits.hpp>
-
-#include <alpaka/core/Unused.hpp>
-
-#if BOOST_COMP_NVCC >= BOOST_VERSION_NUMBER(9, 0, 0)
-    #include <cuda_runtime_api.h>
-#else
-    #if BOOST_COMP_HCC || BOOST_COMP_HIP
-        #include <math_functions.h>
-    #else
-        #include <math_functions.hpp>
-    #endif
-#endif
-
-#include <type_traits>
-
-namespace alpaka
-{
-    namespace math
-    {
-        //#############################################################################
-        //! The HIP sin.
-        class SinHipBuiltIn : public concepts::Implements<ConceptMathSin, SinHipBuiltIn>
-        {
-        };
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The HIP sin trait specialization.
-            template<
-                typename TArg>
-            struct Sin<
-                SinHipBuiltIn,
-                TArg,
-                typename std::enable_if<
-                    std::is_floating_point<TArg>::value>::type>
-            {
-                __device__ static auto sin(
-                    SinHipBuiltIn const & sin_ctx,
-                    TArg const & arg)
-                -> decltype(::sin(arg))
-                {
-                    alpaka::ignore_unused(sin_ctx);
-                    return ::sin(arg);
-                }
-            };
-            //! The HIP sin float specialization.
-            template<>
-            struct Sin<
-                SinHipBuiltIn,
-                float>
-            {
-                __device__ static auto sin(
-                    SinHipBuiltIn const & sin_ctx,
-                    float const & arg)
-                -> float
-                {
-                    alpaka::ignore_unused(sin_ctx);
-                    return ::sinf(arg);
-                }
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/alpaka/include/alpaka/math/sin/SinStdLib.hpp b/thirdParty/alpaka/include/alpaka/math/sin/SinStdLib.hpp
deleted file mode 100644
index a9b56c2ed7..0000000000
--- a/thirdParty/alpaka/include/alpaka/math/sin/SinStdLib.hpp
+++ /dev/null
@@ -1,52 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#include <alpaka/math/sin/Traits.hpp>
-
-#include <alpaka/core/Unused.hpp>
-
-#include <type_traits>
-#include <cmath>
-
-namespace alpaka
-{
-    namespace math
-    {
-        //#############################################################################
-        //! The standard library sin.
-        class SinStdLib : public concepts::Implements<ConceptMathSin, SinStdLib>
-        {
-        };
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The standard library sin trait specialization.
-            template<
-                typename TArg>
-            struct Sin<
-                SinStdLib,
-                TArg,
-                typename std::enable_if<
-                    std::is_arithmetic<TArg>::value>::type>
-            {
-                ALPAKA_FN_HOST static auto sin(
-                    SinStdLib const & sin_ctx,
-                    TArg const & arg)
-                -> decltype(std::sin(arg))
-                {
-                    alpaka::ignore_unused(sin_ctx);
-                    return std::sin(arg);
-                }
-            };
-        }
-    }
-}
diff --git a/thirdParty/alpaka/include/alpaka/math/sin/Traits.hpp b/thirdParty/alpaka/include/alpaka/math/sin/Traits.hpp
deleted file mode 100644
index 8c93297905..0000000000
--- a/thirdParty/alpaka/include/alpaka/math/sin/Traits.hpp
+++ /dev/null
@@ -1,70 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#include <alpaka/core/Common.hpp>
-#include <alpaka/core/Concepts.hpp>
-
-#include <boost/config.hpp>
-
-#include <type_traits>
-
-namespace alpaka
-{
-    namespace math
-    {
-        struct ConceptMathSin;
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The sin trait.
-            template<
-                typename T,
-                typename TArg,
-                typename TSfinae = void>
-            struct Sin;
-        }
-
-        //-----------------------------------------------------------------------------
-        //! Computes the sine (measured in radians).
-        //!
-        //! \tparam T The type of the object specializing Sin.
-        //! \tparam TArg The arg type.
-        //! \param sin_ctx The object specializing Sin.
-        //! \param arg The arg.
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            typename T,
-            typename TArg>
-        ALPAKA_FN_HOST_ACC auto sin(
-            T const & sin_ctx,
-            TArg const & arg)
-#ifdef BOOST_NO_CXX14_RETURN_TYPE_DEDUCTION
-        -> decltype(
-            traits::Sin<
-                concepts::ImplementationBase<ConceptMathSin, T>,
-                TArg>
-            ::sin(
-                sin_ctx,
-                arg))
-#endif
-        {
-            using ImplementationBase = concepts::ImplementationBase<ConceptMathSin, T>;
-            return
-                traits::Sin<
-                    ImplementationBase,
-                    TArg>
-                ::sin(
-                    sin_ctx,
-                    arg);
-        }
-    }
-}
diff --git a/thirdParty/alpaka/include/alpaka/math/sincos/SinCosCudaBuiltIn.hpp b/thirdParty/alpaka/include/alpaka/math/sincos/SinCosCudaBuiltIn.hpp
deleted file mode 100644
index ad54c19e99..0000000000
--- a/thirdParty/alpaka/include/alpaka/math/sincos/SinCosCudaBuiltIn.hpp
+++ /dev/null
@@ -1,82 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz, Matthias Werner
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_CUDA
-    #error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
-#endif
-
-#include <alpaka/math/sincos/Traits.hpp>
-
-#include <alpaka/core/Unused.hpp>
-
-#include <cuda_runtime.h>
-
-#include <type_traits>
-
-namespace alpaka
-{
-    namespace math
-    {
-        //#############################################################################
-        //! The CUDA sincos.
-        class SinCosCudaBuiltIn : public concepts::Implements<ConceptMathSinCos, SinCosCudaBuiltIn>
-        {
-        };
-
-        namespace traits
-        {
-            //#############################################################################
-
-            //! sincos trait specialization.
-            template<>
-            struct SinCos<
-                SinCosCudaBuiltIn,
-                double>
-            {
-                __device__ static auto sincos(
-                    SinCosCudaBuiltIn const & sincos_ctx,
-                    double const & arg,
-                    double & result_sin,
-                    double & result_cos)
-                -> void
-                {
-                    alpaka::ignore_unused(sincos_ctx);
-                    ::sincos(arg, &result_sin, &result_cos);
-                }
-            };
-
-            //! The CUDA sin float specialization.
-            template<>
-            struct SinCos<
-                SinCosCudaBuiltIn,
-                float>
-            {
-                __device__ static auto sincos(
-                    SinCosCudaBuiltIn const & sincos_ctx,
-                    float const & arg,
-                    float & result_sin,
-                    float & result_cos)
-                -> void
-                {
-                    alpaka::ignore_unused(sincos_ctx);
-                    ::sincosf(arg, &result_sin, &result_cos);
-                }
-            };
-
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/alpaka/include/alpaka/math/sincos/SinCosHipBuiltIn.hpp b/thirdParty/alpaka/include/alpaka/math/sincos/SinCosHipBuiltIn.hpp
deleted file mode 100644
index 3033cc0fd0..0000000000
--- a/thirdParty/alpaka/include/alpaka/math/sincos/SinCosHipBuiltIn.hpp
+++ /dev/null
@@ -1,84 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz, Matthias Werner
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_HIP
-    #error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
-#endif
-
-#include <alpaka/math/sincos/Traits.hpp>
-
-#include <alpaka/core/Unused.hpp>
-
-#if BOOST_COMP_NVCC >= BOOST_VERSION_NUMBER(9, 0, 0)
-    #include <cuda_runtime_api.h>
-#else
-    #if BOOST_COMP_HCC || BOOST_COMP_HIP
-        #include <math_functions.h>
-    #else
-        #include <math_functions.hpp>
-    #endif
-#endif
-
-#include <type_traits>
-
-namespace alpaka
-{
-    namespace math
-    {
-        //#############################################################################
-        //! sincos.
-        class SinCosHipBuiltIn : public concepts::Implements<ConceptMathSinCos, SinCosHipBuiltIn>
-        {
-        };
-
-        namespace traits
-        {
-            //#############################################################################
-            //! sincos trait specialization.
-            template<>
-            struct SinCos<SinCosHipBuiltIn, double>
-            {
-                __device__ static auto sincos(
-                    SinCosHipBuiltIn const & sincos_ctx,
-                    double const & arg,
-                    double & result_sin,
-                    double & result_cos)
-                -> void
-                {
-                    alpaka::ignore_unused(sincos_ctx);
-                    ::sincos(arg, &result_sin, &result_cos);
-                }
-            };
-
-            //! The sincos float specialization.
-            template<>
-            struct SinCos<SinCosHipBuiltIn, float>
-            {
-                __device__ static auto sincos(
-                    SinCosHipBuiltIn const & sincos_ctx,
-                    float const & arg,
-                    float & result_sin,
-                    float & result_cos)
-                -> void
-                {
-                    alpaka::ignore_unused(sincos_ctx);
-                    ::sincosf(arg, &result_sin, &result_cos);
-                }
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/alpaka/include/alpaka/math/sincos/SinCosStdLib.hpp b/thirdParty/alpaka/include/alpaka/math/sincos/SinCosStdLib.hpp
deleted file mode 100644
index e39c4d2f65..0000000000
--- a/thirdParty/alpaka/include/alpaka/math/sincos/SinCosStdLib.hpp
+++ /dev/null
@@ -1,55 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz, Matthias Werner
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#include <alpaka/math/sincos/Traits.hpp>
-
-#include <alpaka/core/Unused.hpp>
-
-#include <type_traits>
-#include <cmath>
-
-namespace alpaka
-{
-    namespace math
-    {
-        //#############################################################################
-        //! The standard library sincos.
-        class SinCosStdLib : public concepts::Implements<ConceptMathSinCos, SinCosStdLib>
-        {
-        };
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The standard library sincos trait specialization.
-            template<
-                typename TArg>
-            struct SinCos<
-                SinCosStdLib,
-                TArg,
-                typename std::enable_if<
-                    std::is_floating_point<TArg>::value>::type>
-            {
-                ALPAKA_FN_HOST static auto sincos(
-                    SinCosStdLib const & sincos_ctx,
-                    TArg const & arg,
-                    TArg & result_sin,
-                    TArg & result_cos )
-                -> void
-                {
-                    alpaka::ignore_unused(sincos_ctx);
-                    result_sin = std::sin(arg);
-                    result_cos = std::cos(arg);
-                }
-            };
-        }
-    }
-}
diff --git a/thirdParty/alpaka/include/alpaka/math/sincos/Traits.hpp b/thirdParty/alpaka/include/alpaka/math/sincos/Traits.hpp
deleted file mode 100644
index 355134cf86..0000000000
--- a/thirdParty/alpaka/include/alpaka/math/sincos/Traits.hpp
+++ /dev/null
@@ -1,68 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz, Matthias Werner
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#include <alpaka/core/Common.hpp>
-#include <alpaka/core/Concepts.hpp>
-
-#include <boost/config.hpp>
-
-#include <type_traits>
-
-namespace alpaka
-{
-    namespace math
-    {
-        struct ConceptMathSinCos;
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The sincos trait.
-            template<
-                typename T,
-                typename TArg,
-                typename TSfinae = void>
-            struct SinCos;
-        }
-
-        //-----------------------------------------------------------------------------
-        //! Computes the sine and cosine (measured in radians).
-        //!
-        //! \tparam T The type of the object specializing SinCos.
-        //! \tparam TArg The arg type.
-        //! \param sincos_ctx The object specializing SinCos.
-        //! \param arg The arg.
-        //! \param result_sin result of sine
-        //! \param result_cos result of cosine
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            typename T,
-            typename TArg>
-        ALPAKA_FN_HOST_ACC auto sincos(
-            T const & sincos_ctx,
-            TArg const & arg,
-            TArg & result_sin,
-            TArg & result_cos)
-        -> void
-        {
-            using ImplementationBase = concepts::ImplementationBase<ConceptMathSinCos, T>;
-            traits::SinCos<
-                ImplementationBase,
-                TArg>
-                ::sincos(
-                    sincos_ctx,
-                    arg,
-                    result_sin,
-                    result_cos
-                    );
-        }
-    }
-}
diff --git a/thirdParty/alpaka/include/alpaka/math/sqrt/SqrtCudaBuiltIn.hpp b/thirdParty/alpaka/include/alpaka/math/sqrt/SqrtCudaBuiltIn.hpp
deleted file mode 100644
index 2e597e893d..0000000000
--- a/thirdParty/alpaka/include/alpaka/math/sqrt/SqrtCudaBuiltIn.hpp
+++ /dev/null
@@ -1,79 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Bert Wesarg, Valentin Gehrke
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_CUDA
-    #error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
-#endif
-
-#include <alpaka/math/sqrt/Traits.hpp>
-
-#include <alpaka/core/Unused.hpp>
-
-#include <cuda_runtime.h>
-
-#include <type_traits>
-
-namespace alpaka
-{
-    namespace math
-    {
-        //#############################################################################
-        //! The CUDA sqrt.
-        class SqrtCudaBuiltIn : public concepts::Implements<ConceptMathSqrt, SqrtCudaBuiltIn>
-        {
-        };
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The CUDA sqrt trait specialization.
-            template<
-                typename TArg>
-            struct Sqrt<
-                SqrtCudaBuiltIn,
-                TArg,
-                typename std::enable_if<
-                    std::is_floating_point<TArg>::value>::type>
-            {
-                __device__ static auto sqrt(
-                    SqrtCudaBuiltIn const & sqrt_ctx,
-                    TArg const & arg)
-                -> decltype(::sqrt(arg))
-                {
-                    alpaka::ignore_unused(sqrt_ctx);
-                    return ::sqrt(arg);
-                }
-            };
-            //! The CUDA sqrt float specialization.
-            template<>
-            struct Sqrt<
-                SqrtCudaBuiltIn,
-                float>
-            {
-                __device__ static auto sqrt(
-                    SqrtCudaBuiltIn const & sqrt_ctx,
-                    float const & arg)
-                -> float
-                {
-                    alpaka::ignore_unused(sqrt_ctx);
-                    return ::sqrtf(arg);
-                }
-            };
-
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/alpaka/include/alpaka/math/sqrt/SqrtHipBuiltIn.hpp b/thirdParty/alpaka/include/alpaka/math/sqrt/SqrtHipBuiltIn.hpp
deleted file mode 100644
index fdf9b8ec01..0000000000
--- a/thirdParty/alpaka/include/alpaka/math/sqrt/SqrtHipBuiltIn.hpp
+++ /dev/null
@@ -1,86 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Bert Wesarg, Matthias Werner, Valentin Gehrke
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_HIP
-    #error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
-#endif
-
-#include <alpaka/math/sqrt/Traits.hpp>
-
-#include <alpaka/core/Unused.hpp>
-
-#if BOOST_COMP_NVCC >= BOOST_VERSION_NUMBER(9, 0, 0)
-    #include <cuda_runtime_api.h>
-#else
-    #if BOOST_COMP_HCC || BOOST_COMP_HIP
-        #include <math_functions.h>
-    #else
-        #include <math_functions.hpp>
-    #endif
-#endif
-
-#include <type_traits>
-
-namespace alpaka
-{
-    namespace math
-    {
-        //#############################################################################
-        //! The HIP sqrt.
-        class SqrtHipBuiltIn : public concepts::Implements<ConceptMathSqrt, SqrtHipBuiltIn>
-        {
-        };
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The HIP sqrt trait specialization.
-            template<
-                typename TArg>
-            struct Sqrt<
-                SqrtHipBuiltIn,
-                TArg,
-                typename std::enable_if<
-                    std::is_floating_point<TArg>::value>::type>
-            {
-                __device__ static auto sqrt(
-                    SqrtHipBuiltIn const & sqrt_ctx,
-                    TArg const & arg)
-                -> decltype(::sqrt(arg))
-                {
-                    alpaka::ignore_unused(sqrt_ctx);
-                    return ::sqrt(arg);
-                }
-            };
-            //! The HIP sqrt float specialization.
-            template<>
-            struct Sqrt<
-                SqrtHipBuiltIn,
-                float>
-            {
-                __device__ static auto sqrt(
-                    SqrtHipBuiltIn const & sqrt_ctx,
-                    float const & arg)
-                -> float
-                {
-                    alpaka::ignore_unused(sqrt_ctx);
-                    return ::sqrtf(arg);
-                }
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/alpaka/include/alpaka/math/sqrt/SqrtStdLib.hpp b/thirdParty/alpaka/include/alpaka/math/sqrt/SqrtStdLib.hpp
deleted file mode 100644
index 4fb1f3eae6..0000000000
--- a/thirdParty/alpaka/include/alpaka/math/sqrt/SqrtStdLib.hpp
+++ /dev/null
@@ -1,52 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#include <alpaka/math/sqrt/Traits.hpp>
-
-#include <alpaka/core/Unused.hpp>
-
-#include <type_traits>
-#include <cmath>
-
-namespace alpaka
-{
-    namespace math
-    {
-        //#############################################################################
-        //! The standard library sqrt.
-        class SqrtStdLib : public concepts::Implements<ConceptMathSqrt, SqrtStdLib>
-        {
-        };
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The standard library sqrt trait specialization.
-            template<
-                typename TArg>
-            struct Sqrt<
-                SqrtStdLib,
-                TArg,
-                typename std::enable_if<
-                    std::is_arithmetic<TArg>::value>::type>
-            {
-                ALPAKA_FN_HOST static auto sqrt(
-                    SqrtStdLib const & sqrt_ctx,
-                    TArg const & arg)
-                -> decltype(std::sqrt(arg))
-                {
-                    alpaka::ignore_unused(sqrt_ctx);
-                    return std::sqrt(arg);
-                }
-            };
-        }
-    }
-}
diff --git a/thirdParty/alpaka/include/alpaka/math/sqrt/Traits.hpp b/thirdParty/alpaka/include/alpaka/math/sqrt/Traits.hpp
deleted file mode 100644
index e83124fbd0..0000000000
--- a/thirdParty/alpaka/include/alpaka/math/sqrt/Traits.hpp
+++ /dev/null
@@ -1,70 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#include <alpaka/core/Common.hpp>
-#include <alpaka/core/Concepts.hpp>
-
-#include <boost/config.hpp>
-
-#include <type_traits>
-
-namespace alpaka
-{
-    namespace math
-    {
-        struct ConceptMathSqrt;
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The sqrt trait.
-            template<
-                typename T,
-                typename TArg,
-                typename TSfinae = void>
-            struct Sqrt;
-        }
-
-        //-----------------------------------------------------------------------------
-        //! Computes the square root of arg.
-        //!
-        //! \tparam T The type of the object specializing Sqrt.
-        //! \tparam TArg The arg type.
-        //! \param sqrt_ctx The object specializing Sqrt.
-        //! \param arg The arg.
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            typename T,
-            typename TArg>
-        ALPAKA_FN_HOST_ACC auto sqrt(
-            T const & sqrt_ctx,
-            TArg const & arg)
-#ifdef BOOST_NO_CXX14_RETURN_TYPE_DEDUCTION
-        -> decltype(
-            traits::Sqrt<
-                concepts::ImplementationBase<ConceptMathSqrt, T>,
-                TArg>
-            ::sqrt(
-                sqrt_ctx,
-                arg))
-#endif
-        {
-            using ImplementationBase = concepts::ImplementationBase<ConceptMathSqrt, T>;
-            return
-                traits::Sqrt<
-                    ImplementationBase,
-                    TArg>
-                ::sqrt(
-                    sqrt_ctx,
-                    arg);
-        }
-    }
-}
diff --git a/thirdParty/alpaka/include/alpaka/math/tan/TanCudaBuiltIn.hpp b/thirdParty/alpaka/include/alpaka/math/tan/TanCudaBuiltIn.hpp
deleted file mode 100644
index 96691b2410..0000000000
--- a/thirdParty/alpaka/include/alpaka/math/tan/TanCudaBuiltIn.hpp
+++ /dev/null
@@ -1,78 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Bert Wesarg
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_CUDA
-    #error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
-#endif
-
-#include <alpaka/math/tan/Traits.hpp>
-
-#include <alpaka/core/Unused.hpp>
-
-#include <cuda_runtime.h>
-
-#include <type_traits>
-
-namespace alpaka
-{
-    namespace math
-    {
-        //#############################################################################
-        //! The CUDA tan.
-        class TanCudaBuiltIn : public concepts::Implements<ConceptMathTan, TanCudaBuiltIn>
-        {
-        };
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The CUDA tan trait specialization.
-            template<
-                typename TArg>
-            struct Tan<
-                TanCudaBuiltIn,
-                TArg,
-                typename std::enable_if<
-                    std::is_floating_point<TArg>::value>::type>
-            {
-                __device__ static auto tan(
-                    TanCudaBuiltIn const & tan_ctx,
-                    TArg const & arg)
-                -> decltype(::tan(arg))
-                {
-                    alpaka::ignore_unused(tan_ctx);
-                    return ::tan(arg);
-                }
-            };
-            //! The CUDA tan float specialization.
-            template<>
-            struct Tan<
-                TanCudaBuiltIn,
-                float>
-            {
-                __device__ static auto tan(
-                    TanCudaBuiltIn const & tan_ctx,
-                    float const & arg)
-                -> float
-                {
-                    alpaka::ignore_unused(tan_ctx);
-                    return ::tanf(arg);
-                }
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/alpaka/include/alpaka/math/tan/TanHipBuiltIn.hpp b/thirdParty/alpaka/include/alpaka/math/tan/TanHipBuiltIn.hpp
deleted file mode 100644
index d6e6d1deb6..0000000000
--- a/thirdParty/alpaka/include/alpaka/math/tan/TanHipBuiltIn.hpp
+++ /dev/null
@@ -1,86 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Bert Wesarg, Matthias Werner
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_HIP
-    #error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
-#endif
-
-#include <alpaka/math/tan/Traits.hpp>
-
-#include <alpaka/core/Unused.hpp>
-
-#if BOOST_COMP_NVCC >= BOOST_VERSION_NUMBER(9, 0, 0)
-    #include <cuda_runtime_api.h>
-#else
-    #if BOOST_COMP_HCC || BOOST_COMP_HIP
-        #include <math_functions.h>
-    #else
-        #include <math_functions.hpp>
-    #endif
-#endif
-
-#include <type_traits>
-
-namespace alpaka
-{
-    namespace math
-    {
-        //#############################################################################
-        //! The HIP tan.
-        class TanHipBuiltIn : public concepts::Implements<ConceptMathTan, TanHipBuiltIn>
-        {
-        };
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The HIP tan trait specialization.
-            template<
-                typename TArg>
-            struct Tan<
-                TanHipBuiltIn,
-                TArg,
-                typename std::enable_if<
-                    std::is_floating_point<TArg>::value>::type>
-            {
-                __device__ static auto tan(
-                    TanHipBuiltIn const & tan_ctx,
-                    TArg const & arg)
-                -> decltype(::tan(arg))
-                {
-                    alpaka::ignore_unused(tan_ctx);
-                    return ::tan(arg);
-                }
-            };
-            //! The HIP tan float specialization.
-            template<>
-            struct Tan<
-                TanHipBuiltIn,
-                float>
-            {
-                __device__ static auto tan(
-                    TanHipBuiltIn const & tan_ctx,
-                    float const & arg)
-                -> float
-                {
-                    alpaka::ignore_unused(tan_ctx);
-                    return ::tanf(arg);
-                }
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/alpaka/include/alpaka/math/tan/TanStdLib.hpp b/thirdParty/alpaka/include/alpaka/math/tan/TanStdLib.hpp
deleted file mode 100644
index f7453360bb..0000000000
--- a/thirdParty/alpaka/include/alpaka/math/tan/TanStdLib.hpp
+++ /dev/null
@@ -1,52 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#include <alpaka/math/tan/Traits.hpp>
-
-#include <alpaka/core/Unused.hpp>
-
-#include <type_traits>
-#include <cmath>
-
-namespace alpaka
-{
-    namespace math
-    {
-        //#############################################################################
-        //! The standard library tan.
-        class TanStdLib : public concepts::Implements<ConceptMathTan, TanStdLib>
-        {
-        };
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The standard library tan trait specialization.
-            template<
-                typename TArg>
-            struct Tan<
-                TanStdLib,
-                TArg,
-                typename std::enable_if<
-                    std::is_arithmetic<TArg>::value>::type>
-            {
-                ALPAKA_FN_HOST static auto tan(
-                    TanStdLib const & tan_ctx,
-                    TArg const & arg)
-                -> decltype(std::tan(arg))
-                {
-                    alpaka::ignore_unused(tan_ctx);
-                    return std::tan(arg);
-                }
-            };
-        }
-    }
-}
diff --git a/thirdParty/alpaka/include/alpaka/math/tan/Traits.hpp b/thirdParty/alpaka/include/alpaka/math/tan/Traits.hpp
deleted file mode 100644
index d366d5f336..0000000000
--- a/thirdParty/alpaka/include/alpaka/math/tan/Traits.hpp
+++ /dev/null
@@ -1,70 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#include <alpaka/core/Common.hpp>
-#include <alpaka/core/Concepts.hpp>
-
-#include <boost/config.hpp>
-
-#include <type_traits>
-
-namespace alpaka
-{
-    namespace math
-    {
-        struct ConceptMathTan;
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The tan trait.
-            template<
-                typename T,
-                typename TArg,
-                typename TSfinae = void>
-            struct Tan;
-        }
-
-        //-----------------------------------------------------------------------------
-        //! Computes the tangent (measured in radians).
-        //!
-        //! \tparam T The type of the object specializing Tan.
-        //! \tparam TArg The arg type.
-        //! \param tan_ctx The object specializing Tan.
-        //! \param arg The arg.
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            typename T,
-            typename TArg>
-        ALPAKA_FN_HOST_ACC auto tan(
-            T const & tan_ctx,
-            TArg const & arg)
-#ifdef BOOST_NO_CXX14_RETURN_TYPE_DEDUCTION
-        -> decltype(
-            traits::Tan<
-                concepts::ImplementationBase<ConceptMathTan, T>,
-                TArg>
-            ::tan(
-                tan_ctx,
-                arg))
-#endif
-        {
-            using ImplementationBase = concepts::ImplementationBase<ConceptMathTan, T>;
-            return
-                traits::Tan<
-                    ImplementationBase,
-                    TArg>
-                ::tan(
-                    tan_ctx,
-                    arg);
-        }
-    }
-}
diff --git a/thirdParty/alpaka/include/alpaka/math/trunc/Traits.hpp b/thirdParty/alpaka/include/alpaka/math/trunc/Traits.hpp
deleted file mode 100644
index 2444acb810..0000000000
--- a/thirdParty/alpaka/include/alpaka/math/trunc/Traits.hpp
+++ /dev/null
@@ -1,70 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#include <alpaka/core/Common.hpp>
-#include <alpaka/core/Concepts.hpp>
-
-#include <boost/config.hpp>
-
-#include <type_traits>
-
-namespace alpaka
-{
-    namespace math
-    {
-        struct ConceptMathTrunc;
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The trunc trait.
-            template<
-                typename T,
-                typename TArg,
-                typename TSfinae = void>
-            struct Trunc;
-        }
-
-        //-----------------------------------------------------------------------------
-        //! Computes the nearest integer not greater in magnitude than arg.
-        //!
-        //! \tparam T The type of the object specializing Trunc.
-        //! \tparam TArg The arg type.
-        //! \param trunc_ctx The object specializing Trunc.
-        //! \param arg The arg.
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            typename T,
-            typename TArg>
-        ALPAKA_FN_HOST_ACC auto trunc(
-            T const & trunc_ctx,
-            TArg const & arg)
-#ifdef BOOST_NO_CXX14_RETURN_TYPE_DEDUCTION
-        -> decltype(
-            traits::Trunc<
-                concepts::ImplementationBase<ConceptMathTrunc, T>,
-                TArg>
-            ::trunc(
-                trunc_ctx,
-                arg))
-#endif
-        {
-            using ImplementationBase = concepts::ImplementationBase<ConceptMathTrunc, T>;
-            return
-                traits::Trunc<
-                    ImplementationBase,
-                    TArg>
-                ::trunc(
-                    trunc_ctx,
-                    arg);
-        }
-    }
-}
diff --git a/thirdParty/alpaka/include/alpaka/math/trunc/TruncCudaBuiltIn.hpp b/thirdParty/alpaka/include/alpaka/math/trunc/TruncCudaBuiltIn.hpp
deleted file mode 100644
index b0febf6e16..0000000000
--- a/thirdParty/alpaka/include/alpaka/math/trunc/TruncCudaBuiltIn.hpp
+++ /dev/null
@@ -1,78 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Bert Wesarg, René Widera
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_CUDA
-    #error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
-#endif
-
-#include <alpaka/math/trunc/Traits.hpp>
-
-#include <alpaka/core/Unused.hpp>
-
-#include <cuda_runtime.h>
-
-#include <type_traits>
-
-namespace alpaka
-{
-    namespace math
-    {
-        //#############################################################################
-        //! The CUDA trunc.
-        class TruncCudaBuiltIn : public concepts::Implements<ConceptMathTrunc, TruncCudaBuiltIn>
-        {
-        };
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The CUDA trunc trait specialization.
-            template<
-                typename TArg>
-            struct Trunc<
-                TruncCudaBuiltIn,
-                TArg,
-                typename std::enable_if<
-                    std::is_floating_point<TArg>::value>::type>
-            {
-                __device__ static auto trunc(
-                    TruncCudaBuiltIn const & trunc_ctx,
-                    TArg const & arg)
-                -> decltype(::trunc(arg))
-                {
-                    alpaka::ignore_unused(trunc_ctx);
-                    return ::trunc(arg);
-                }
-            };
-            //! The CUDA trunc float specialization.
-            template<>
-            struct Trunc<
-                TruncCudaBuiltIn,
-                float>
-            {
-                __device__ static auto trunc(
-                    TruncCudaBuiltIn const & trunc_ctx,
-                    float const & arg)
-                -> float
-                {
-                    alpaka::ignore_unused(trunc_ctx);
-                    return ::truncf(arg);
-                }
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/alpaka/include/alpaka/math/trunc/TruncHipBuiltIn.hpp b/thirdParty/alpaka/include/alpaka/math/trunc/TruncHipBuiltIn.hpp
deleted file mode 100644
index 2618199613..0000000000
--- a/thirdParty/alpaka/include/alpaka/math/trunc/TruncHipBuiltIn.hpp
+++ /dev/null
@@ -1,86 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Bert Wesarg, Matthias Werner, René Widera
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_HIP
-    #error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
-#endif
-
-#include <alpaka/math/trunc/Traits.hpp>
-
-#include <alpaka/core/Unused.hpp>
-
-#if BOOST_COMP_NVCC >= BOOST_VERSION_NUMBER(9, 0, 0)
-    #include <cuda_runtime_api.h>
-#else
-    #if BOOST_COMP_HCC || BOOST_COMP_HIP
-        #include <math_functions.h>
-    #else
-        #include <math_functions.hpp>
-    #endif
-#endif
-
-#include <type_traits>
-
-namespace alpaka
-{
-    namespace math
-    {
-        //#############################################################################
-        //! The HIP trunc.
-        class TruncHipBuiltIn : public concepts::Implements<ConceptMathTrunc, TruncHipBuiltIn>
-        {
-        };
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The HIP trunc trait specialization.
-            template<
-                typename TArg>
-            struct Trunc<
-                TruncHipBuiltIn,
-                TArg,
-                typename std::enable_if<
-                    std::is_floating_point<TArg>::value>::type>
-            {
-                __device__ static auto trunc(
-                    TruncHipBuiltIn const & trunc_ctx,
-                    TArg const & arg)
-                -> decltype(::trunc(arg))
-                {
-                    alpaka::ignore_unused(trunc_ctx);
-                    return ::trunc(arg);
-                }
-            };
-            //! The HIP trunc float specialization.
-            template<>
-            struct Trunc<
-                TruncHipBuiltIn,
-                float>
-            {
-                __device__ static auto trunc(
-                    TruncHipBuiltIn const & trunc_ctx,
-                    float const & arg)
-                -> float
-                {
-                    alpaka::ignore_unused(trunc_ctx);
-                    return ::truncf(arg);
-                }
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/alpaka/include/alpaka/math/trunc/TruncStdLib.hpp b/thirdParty/alpaka/include/alpaka/math/trunc/TruncStdLib.hpp
deleted file mode 100644
index 81059a0b47..0000000000
--- a/thirdParty/alpaka/include/alpaka/math/trunc/TruncStdLib.hpp
+++ /dev/null
@@ -1,52 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#include <alpaka/math/trunc/Traits.hpp>
-
-#include <alpaka/core/Unused.hpp>
-
-#include <type_traits>
-#include <cmath>
-
-namespace alpaka
-{
-    namespace math
-    {
-        //#############################################################################
-        //! The standard library trunc.
-        class TruncStdLib : public concepts::Implements<ConceptMathTrunc, TruncStdLib>
-        {
-        };
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The standard library trunc trait specialization.
-            template<
-                typename TArg>
-            struct Trunc<
-                TruncStdLib,
-                TArg,
-                typename std::enable_if<
-                    std::is_arithmetic<TArg>::value>::type>
-            {
-                ALPAKA_FN_HOST static auto trunc(
-                    TruncStdLib const & trunc_ctx,
-                    TArg const & arg)
-                -> decltype(std::trunc(arg))
-                {
-                    alpaka::ignore_unused(trunc_ctx);
-                    return std::trunc(arg);
-                }
-            };
-        }
-    }
-}
diff --git a/thirdParty/alpaka/include/alpaka/mem/alloc/AllocCpuBoostAligned.hpp b/thirdParty/alpaka/include/alpaka/mem/alloc/AllocCpuBoostAligned.hpp
deleted file mode 100644
index a89ef98eba..0000000000
--- a/thirdParty/alpaka/include/alpaka/mem/alloc/AllocCpuBoostAligned.hpp
+++ /dev/null
@@ -1,101 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#include <alpaka/mem/alloc/Traits.hpp>
-
-#include <alpaka/core/Common.hpp>
-#include <alpaka/core/Unused.hpp>
-
-#include <boost/align.hpp>
-
-#include <algorithm>
-
-namespace alpaka
-{
-    namespace mem
-    {
-        //-----------------------------------------------------------------------------
-        //! The allocator specifics.
-        namespace alloc
-        {
-            //#############################################################################
-            //! The CPU boost aligned allocator.
-            //!
-            //! \tparam TAlignment An integral constant containing the alignment.
-            template<
-                typename TAlignment>
-            class AllocCpuBoostAligned : public concepts::Implements<ConceptMemAlloc, AllocCpuBoostAligned<TAlignment>>
-            {
-            };
-
-            namespace traits
-            {
-                //#############################################################################
-                //! The CPU boost aligned allocator memory allocation trait specialization.
-                template<
-                    typename T,
-                    typename TAlignment>
-                struct Alloc<
-                    T,
-                    AllocCpuBoostAligned<TAlignment>>
-                {
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST static auto alloc(
-                        AllocCpuBoostAligned<TAlignment> const & alloc,
-                        std::size_t const & sizeElems)
-                    -> T *
-                    {
-#if (defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && BOOST_LANG_CUDA) || (defined(ALPAKA_ACC_GPU_HIP_ENABLED) && BOOST_LANG_HIP)
-                        // For CUDA host memory must be aligned to 4 kib to pin it with `cudaHostRegister`,
-                        // this was described in older programming guides but was removed later.
-                        // From testing with PIConGPU and cuda-memcheck we found out that the alignment is still required.
-                        //
-                        // For HIP the required alignment is the size of a cache line.
-                        // https://rocm-developer-tools.github.io/HIP/group__Memory.html#gab8258f051e1a1f7385f794a15300e674
-                        // To avoid issues with HIP(cuda) the alignment will be set also for HIP(clang/hcc)
-                        // to 4kib.
-                        // @todo evaluate requirements when the HIP ecosystem is more stable
-                        constexpr size_t minAlignement = 4096;
-#else
-                        constexpr size_t minAlignement = TAlignment::value;
-#endif
-                        alpaka::ignore_unused(alloc);
-                        return
-                            reinterpret_cast<T *>(
-                                boost::alignment::aligned_alloc(std::max(TAlignment::value, minAlignement), sizeElems * sizeof(T)));
-                    }
-                };
-
-                //#############################################################################
-                //! The CPU boost aligned allocator memory free trait specialization.
-                template<
-                    typename T,
-                    typename TAlignment>
-                struct Free<
-                    T,
-                    AllocCpuBoostAligned<TAlignment>>
-                {
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST static auto free(
-                        AllocCpuBoostAligned<TAlignment> const & alloc,
-                        T const * const ptr)
-                    -> void
-                    {
-                        alpaka::ignore_unused(alloc);
-                            boost::alignment::aligned_free(
-                                const_cast<void *>(
-                                    reinterpret_cast<void const *>(ptr)));
-                    }
-                };
-            }
-        }
-    }
-}
diff --git a/thirdParty/alpaka/include/alpaka/mem/alloc/AllocCpuNew.hpp b/thirdParty/alpaka/include/alpaka/mem/alloc/AllocCpuNew.hpp
deleted file mode 100644
index badd8f5608..0000000000
--- a/thirdParty/alpaka/include/alpaka/mem/alloc/AllocCpuNew.hpp
+++ /dev/null
@@ -1,73 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#include <alpaka/mem/alloc/Traits.hpp>
-
-#include <alpaka/core/Common.hpp>
-#include <alpaka/core/Unused.hpp>
-
-namespace alpaka
-{
-    namespace mem
-    {
-        //-----------------------------------------------------------------------------
-        //! The allocator specifics.
-        namespace alloc
-        {
-            //#############################################################################
-            //! The CPU new allocator.
-            class AllocCpuNew : public concepts::Implements<ConceptMemAlloc, AllocCpuNew>
-            {
-            };
-
-            namespace traits
-            {
-                //#############################################################################
-                //! The CPU new allocator memory allocation trait specialization.
-                template<
-                    typename T>
-                struct Alloc<
-                    T,
-                    AllocCpuNew>
-                {
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST static auto alloc(
-                        AllocCpuNew const & alloc,
-                        std::size_t const & sizeElems)
-                    -> T *
-                    {
-                        alpaka::ignore_unused(alloc);
-                        return new T[sizeElems];
-                    }
-                };
-
-                //#############################################################################
-                //! The CPU new allocator memory free trait specialization.
-                template<
-                    typename T>
-                struct Free<
-                    T,
-                    AllocCpuNew>
-                {
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST static auto free(
-                        AllocCpuNew const & alloc,
-                        T const * const ptr)
-                    -> void
-                    {
-                        alpaka::ignore_unused(alloc);
-                        return delete[] ptr;
-                    }
-                };
-            }
-        }
-    }
-}
diff --git a/thirdParty/alpaka/include/alpaka/mem/alloc/Traits.hpp b/thirdParty/alpaka/include/alpaka/mem/alloc/Traits.hpp
deleted file mode 100644
index 90ab18dcaf..0000000000
--- a/thirdParty/alpaka/include/alpaka/mem/alloc/Traits.hpp
+++ /dev/null
@@ -1,90 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz, Matthias Werner
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#include <alpaka/dev/Traits.hpp>
-#include <alpaka/dim/Traits.hpp>
-#include <alpaka/extent/Traits.hpp>
-
-#include <alpaka/core/Common.hpp>
-#include <alpaka/core/Concepts.hpp>
-
-namespace alpaka
-{
-    namespace mem
-    {
-        //-----------------------------------------------------------------------------
-        //! The allocator specifics.
-        namespace alloc
-        {
-            struct ConceptMemAlloc;
-
-            //-----------------------------------------------------------------------------
-            //! The allocator traits.
-            namespace traits
-            {
-                //#############################################################################
-                //! The memory allocation trait.
-                template<
-                    typename T,
-                    typename TAlloc,
-                    typename TSfinae = void>
-                struct Alloc;
-
-                //#############################################################################
-                //! The memory free trait.
-                template<
-                    typename T,
-                    typename TAlloc,
-                    typename TSfinae = void>
-                struct Free;
-            }
-
-            //-----------------------------------------------------------------------------
-            //! \return The pointer to the allocated memory.
-            template<
-                typename T,
-                typename TAlloc>
-            ALPAKA_FN_HOST auto alloc(
-                TAlloc const & alloc,
-                std::size_t const & sizeElems)
-            -> T *
-            {
-                using ImplementationBase = concepts::ImplementationBase<ConceptMemAlloc, TAlloc>;
-                return
-                    traits::Alloc<
-                        T,
-                        ImplementationBase>
-                    ::alloc(
-                        alloc,
-                        sizeElems);
-            }
-
-            //-----------------------------------------------------------------------------
-            //! Frees the memory identified by the given pointer.
-            template<
-                typename TAlloc,
-                typename T>
-            ALPAKA_FN_HOST auto free(
-                TAlloc const & alloc,
-                T const * const ptr)
-            -> void
-            {
-                using ImplementationBase = concepts::ImplementationBase<ConceptMemAlloc, TAlloc>;
-                traits::Free<
-                    T,
-                    ImplementationBase>
-                ::free(
-                    alloc,
-                    ptr);
-            }
-        }
-    }
-}
diff --git a/thirdParty/alpaka/include/alpaka/mem/buf/BufCpu.hpp b/thirdParty/alpaka/include/alpaka/mem/buf/BufCpu.hpp
deleted file mode 100644
index d373f77736..0000000000
--- a/thirdParty/alpaka/include/alpaka/mem/buf/BufCpu.hpp
+++ /dev/null
@@ -1,662 +0,0 @@
-/* Copyright 2019 Alexander Matthes, Axel Huebl, Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#include <alpaka/core/Vectorize.hpp>
-#include <alpaka/core/Unused.hpp>
-#include <alpaka/dev/DevCpu.hpp>
-
-#include <alpaka/dev/Traits.hpp>
-#include <alpaka/mem/buf/Traits.hpp>
-
-#include <alpaka/vec/Vec.hpp>
-
-// \TODO: Remove CUDA inclusion for BufCpu by replacing pinning with non CUDA code!
-#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && BOOST_LANG_CUDA
-    #include <alpaka/core/Cuda.hpp>
-#endif
-
-#include <alpaka/mem/alloc/AllocCpuBoostAligned.hpp>
-
-#include <alpaka/meta/DependentFalseType.hpp>
-
-#include <memory>
-
-namespace alpaka
-{
-    namespace mem
-    {
-        namespace buf
-        {
-            namespace cpu
-            {
-                namespace detail
-                {
-                    //#############################################################################
-                    //! The CPU memory buffer.
-                    template<
-                        typename TElem,
-                        typename TDim,
-                        typename TIdx>
-                    class BufCpuImpl final :
-                        public mem::alloc::AllocCpuBoostAligned<std::integral_constant<std::size_t, core::vectorization::defaultAlignment>>
-                    {
-                        static_assert(
-                            !std::is_const<TElem>::value,
-                            "The elem type of the buffer can not be const because the C++ Standard forbids containers of const elements!");
-                        static_assert(
-                            !std::is_const<TIdx>::value,
-                            "The idx type of the buffer can not be const!");
-                    public:
-                        //-----------------------------------------------------------------------------
-                        template<
-                            typename TExtent>
-                        ALPAKA_FN_HOST BufCpuImpl(
-                            dev::DevCpu const & dev,
-                            TExtent const & extent) :
-                                mem::alloc::AllocCpuBoostAligned<std::integral_constant<std::size_t, core::vectorization::defaultAlignment>>(),
-                                m_dev(dev),
-                                m_extentElements(extent::getExtentVecEnd<TDim>(extent)),
-                                m_pMem(mem::alloc::alloc<TElem>(*this, static_cast<std::size_t>(computeElementCount(extent)))),
-                                m_pitchBytes(static_cast<TIdx>(extent::getWidth(extent) * static_cast<TIdx>(sizeof(TElem))))
-#if (defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && BOOST_LANG_CUDA) || (defined(ALPAKA_ACC_GPU_HIP_ENABLED) && BOOST_LANG_HIP)
-                                ,m_bPinned(false)
-#endif
-                        {
-                            ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                            static_assert(
-                                TDim::value == dim::Dim<TExtent>::value,
-                                "The dimensionality of TExtent and the dimensionality of the TDim template parameter have to be identical!");
-                            static_assert(
-                                std::is_same<TIdx, idx::Idx<TExtent>>::value,
-                                "The idx type of TExtent and the TIdx template parameter have to be identical!");
-
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                            std::cout << __func__
-                                << " e: " << m_extentElements
-                                << " ptr: " << static_cast<void *>(m_pMem)
-                                << " pitch: " << m_pitchBytes
-                                << std::endl;
-#endif
-                        }
-                        //-----------------------------------------------------------------------------
-                        BufCpuImpl(BufCpuImpl const &) = delete;
-                        //-----------------------------------------------------------------------------
-                        BufCpuImpl(BufCpuImpl &&) = default;
-                        //-----------------------------------------------------------------------------
-                        auto operator=(BufCpuImpl const &) -> BufCpuImpl & = delete;
-                        //-----------------------------------------------------------------------------
-                        auto operator=(BufCpuImpl &&) -> BufCpuImpl & = default;
-                        //-----------------------------------------------------------------------------
-                        ALPAKA_FN_HOST ~BufCpuImpl()
-                        {
-                            ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-#if (defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && BOOST_LANG_CUDA) || (defined(ALPAKA_ACC_GPU_HIP_ENABLED) && BOOST_LANG_HIP)
-                            // Unpin this memory if it is currently pinned.
-                            mem::buf::unpin(*this);
-#endif
-                            // NOTE: m_pMem is allowed to be a nullptr here.
-                            mem::alloc::free(*this, m_pMem);
-                        }
-
-                    private:
-                        //-----------------------------------------------------------------------------
-                        //! \return The number of elements to allocate.
-                        template<
-                            typename TExtent>
-                        ALPAKA_FN_HOST static auto computeElementCount(
-                            TExtent const & extent)
-                        -> TIdx
-                        {
-                            auto const extentElementCount(extent::getExtentProduct(extent));
-
-                            return extentElementCount;
-                        }
-
-                    public:
-                        dev::DevCpu const m_dev;
-                        vec::Vec<TDim, TIdx> const m_extentElements;
-                        TElem * const m_pMem;
-                        TIdx const m_pitchBytes;
-#if (defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && BOOST_LANG_CUDA) || (defined(ALPAKA_ACC_GPU_HIP_ENABLED) && BOOST_LANG_HIP)
-                        bool m_bPinned;
-#endif
-                    };
-                }
-            }
-            //#############################################################################
-            //! The CPU memory buffer.
-            template<
-                typename TElem,
-                typename TDim,
-                typename TIdx>
-            class BufCpu
-            {
-            public:
-                //-----------------------------------------------------------------------------
-                template<
-                    typename TExtent>
-                ALPAKA_FN_HOST BufCpu(
-                    dev::DevCpu const & dev,
-                    TExtent const & extent) :
-                        m_spBufCpuImpl(std::make_shared<cpu::detail::BufCpuImpl<TElem, TDim, TIdx>>(dev, extent))
-                {}
-                //-----------------------------------------------------------------------------
-                BufCpu(BufCpu const &) = default;
-                //-----------------------------------------------------------------------------
-                BufCpu(BufCpu &&) = default;
-                //-----------------------------------------------------------------------------
-                auto operator=(BufCpu const &) -> BufCpu & = default;
-                //-----------------------------------------------------------------------------
-                auto operator=(BufCpu &&) -> BufCpu & = default;
-                //-----------------------------------------------------------------------------
-                ~BufCpu() = default;
-
-            public:
-                std::shared_ptr<cpu::detail::BufCpuImpl<TElem, TDim, TIdx>> m_spBufCpuImpl;
-            };
-        }
-    }
-
-    namespace dev
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The BufCpu device type trait specialization.
-            template<
-                typename TElem,
-                typename TDim,
-                typename TIdx>
-            struct DevType<
-                mem::buf::BufCpu<TElem, TDim, TIdx>>
-            {
-                using type = dev::DevCpu;
-            };
-            //#############################################################################
-            //! The BufCpu device get trait specialization.
-            template<
-                typename TElem,
-                typename TDim,
-                typename TIdx>
-            struct GetDev<
-                mem::buf::BufCpu<TElem, TDim, TIdx>>
-            {
-                ALPAKA_FN_HOST static auto getDev(
-                    mem::buf::BufCpu<TElem, TDim, TIdx> const & buf)
-                -> dev::DevCpu
-                {
-                    return buf.m_spBufCpuImpl->m_dev;
-                }
-            };
-        }
-    }
-    namespace dim
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The BufCpu dimension getter trait.
-            template<
-                typename TElem,
-                typename TDim,
-                typename TIdx>
-            struct DimType<
-                mem::buf::BufCpu<TElem, TDim, TIdx>>
-            {
-                using type = TDim;
-            };
-        }
-    }
-    namespace elem
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The BufCpu memory element type get trait specialization.
-            template<
-                typename TElem,
-                typename TDim,
-                typename TIdx>
-            struct ElemType<
-                mem::buf::BufCpu<TElem, TDim, TIdx>>
-            {
-                using type = TElem;
-            };
-        }
-    }
-    namespace extent
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The BufCpu width get trait specialization.
-            template<
-                typename TIdxIntegralConst,
-                typename TElem,
-                typename TDim,
-                typename TIdx>
-            struct GetExtent<
-                TIdxIntegralConst,
-                mem::buf::BufCpu<TElem, TDim, TIdx>,
-                typename std::enable_if<(TDim::value > TIdxIntegralConst::value)>::type>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto getExtent(
-                    mem::buf::BufCpu<TElem, TDim, TIdx> const & extent)
-                -> TIdx
-                {
-                    return extent.m_spBufCpuImpl->m_extentElements[TIdxIntegralConst::value];
-                }
-            };
-        }
-    }
-    namespace mem
-    {
-        namespace view
-        {
-            namespace traits
-            {
-                //#############################################################################
-                //! The BufCpu native pointer get trait specialization.
-                template<
-                    typename TElem,
-                    typename TDim,
-                    typename TIdx>
-                struct GetPtrNative<
-                    mem::buf::BufCpu<TElem, TDim, TIdx>>
-                {
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST static auto getPtrNative(
-                        mem::buf::BufCpu<TElem, TDim, TIdx> const & buf)
-                    -> TElem const *
-                    {
-                        return buf.m_spBufCpuImpl->m_pMem;
-                    }
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST static auto getPtrNative(
-                        mem::buf::BufCpu<TElem, TDim, TIdx> & buf)
-                    -> TElem *
-                    {
-                        return buf.m_spBufCpuImpl->m_pMem;
-                    }
-                };
-                //#############################################################################
-                //! The BufCpu pointer on device get trait specialization.
-                template<
-                    typename TElem,
-                    typename TDim,
-                    typename TIdx>
-                struct GetPtrDev<
-                    mem::buf::BufCpu<TElem, TDim, TIdx>,
-                    dev::DevCpu>
-                {
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST static auto getPtrDev(
-                        mem::buf::BufCpu<TElem, TDim, TIdx> const & buf,
-                        dev::DevCpu const & dev)
-                    -> TElem const *
-                    {
-                        if(dev == dev::getDev(buf))
-                        {
-                            return buf.m_spBufCpuImpl->m_pMem;
-                        }
-                        else
-                        {
-                            throw std::runtime_error("The buffer is not accessible from the given device!");
-                        }
-                    }
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST static auto getPtrDev(
-                        mem::buf::BufCpu<TElem, TDim, TIdx> & buf,
-                        dev::DevCpu const & dev)
-                    -> TElem *
-                    {
-                        if(dev == dev::getDev(buf))
-                        {
-                            return buf.m_spBufCpuImpl->m_pMem;
-                        }
-                        else
-                        {
-                            throw std::runtime_error("The buffer is not accessible from the given device!");
-                        }
-                    }
-                };
-                //#############################################################################
-                //! The BufCpu pitch get trait specialization.
-                template<
-                    typename TElem,
-                    typename TDim,
-                    typename TIdx>
-                struct GetPitchBytes<
-                    dim::DimInt<TDim::value - 1u>,
-                    mem::buf::BufCpu<TElem, TDim, TIdx>>
-                {
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST static auto getPitchBytes(
-                        mem::buf::BufCpu<TElem, TDim, TIdx> const & pitch)
-                    -> TIdx
-                    {
-                        return pitch.m_spBufCpuImpl->m_pitchBytes;
-                    }
-                };
-            }
-        }
-        namespace buf
-        {
-            namespace traits
-            {
-                //#############################################################################
-                //! The BufCpu memory allocation trait specialization.
-                template<
-                    typename TElem,
-                    typename TDim,
-                    typename TIdx>
-                struct Alloc<
-                    TElem,
-                    TDim,
-                    TIdx,
-                    dev::DevCpu>
-                {
-                    //-----------------------------------------------------------------------------
-                    template<
-                        typename TExtent>
-                    ALPAKA_FN_HOST static auto alloc(
-                        dev::DevCpu const & dev,
-                        TExtent const & extent)
-                    -> mem::buf::BufCpu<TElem, TDim, TIdx>
-                    {
-                        ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                        return mem::buf::BufCpu<
-                            TElem,
-                            TDim,
-                            TIdx>(
-                                dev,
-                                extent);
-                    }
-                };
-                //#############################################################################
-                //! The BufCpu memory mapping trait specialization.
-                template<
-                    typename TElem,
-                    typename TDim,
-                    typename TIdx>
-                struct Map<
-                    mem::buf::BufCpu<TElem, TDim, TIdx>,
-                    dev::DevCpu>
-                {
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST static auto map(
-                        mem::buf::BufCpu<TElem, TDim, TIdx> & buf,
-                        dev::DevCpu const & dev)
-                    -> void
-                    {
-                        ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                        if(dev::getDev(buf) != dev)
-                        {
-                            throw std::runtime_error("Memory mapping of BufCpu between two devices is not implemented!");
-                        }
-                        // If it is the same device, nothing has to be mapped.
-                    }
-                };
-                //#############################################################################
-                //! The BufCpu memory unmapping trait specialization.
-                template<
-                    typename TElem,
-                    typename TDim,
-                    typename TIdx>
-                struct Unmap<
-                    mem::buf::BufCpu<TElem, TDim, TIdx>,
-                    dev::DevCpu>
-                {
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST static auto unmap(
-                        mem::buf::BufCpu<TElem, TDim, TIdx> & buf,
-                        dev::DevCpu const & dev)
-                    -> void
-                    {
-                        ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                        if(dev::getDev(buf) != dev)
-                        {
-                            throw std::runtime_error("Memory unmapping of BufCpu between two devices is not implemented!");
-                        }
-                        // If it is the same device, nothing has to be mapped.
-                    }
-                };
-                //#############################################################################
-                //! The BufCpu memory pinning trait specialization.
-                template<
-                    typename TElem,
-                    typename TDim,
-                    typename TIdx>
-                struct Pin<
-                    mem::buf::BufCpu<TElem, TDim, TIdx>>
-                {
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST static auto pin(
-                        mem::buf::BufCpu<TElem, TDim, TIdx> & buf)
-                    -> void
-                    {
-                        ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                        if(!mem::buf::isPinned(buf))
-                        {
-#if (defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && BOOST_LANG_CUDA)
-                            if(buf.m_spBufCpuImpl->m_extentElements.prod() != 0)
-                            {
-                                // - cudaHostRegisterDefault:
-                                //   See http://cgi.cs.indiana.edu/~nhusted/dokuwiki/doku.php?id=programming:cudaperformance1
-                                // - cudaHostRegisterPortable:
-                                //   The memory returned by this call will be considered as pinned memory by all CUDA contexts, not just the one that performed the allocation.
-                                ALPAKA_CUDA_RT_CHECK_IGNORE(
-                                    cudaHostRegister(
-                                        const_cast<void *>(reinterpret_cast<void const *>(mem::view::getPtrNative(buf))),
-                                        extent::getExtentProduct(buf) * sizeof(elem::Elem<buf::BufCpu<TElem, TDim, TIdx>>),
-                                        cudaHostRegisterDefault),
-                                    cudaErrorHostMemoryAlreadyRegistered);
-
-                                buf.m_spBufCpuImpl->m_bPinned = true;
-                            }
-#elif (defined(ALPAKA_ACC_GPU_HIP_ENABLED) && BOOST_LANG_HIP)
-                            if(buf.m_spBufCpuImpl->m_extentElements.prod() != 0)
-                            {
-                                ALPAKA_HIP_RT_CHECK_IGNORE(
-                                    hipHostRegister(
-                                        const_cast<void *>(reinterpret_cast<void const *>(mem::view::getPtrNative(buf))),
-                                        extent::getExtentProduct(buf) * sizeof(elem::Elem<buf::BufCpu<TElem, TDim, TIdx>>),
-                                        hipHostRegisterDefault),
-                                    hipErrorHostMemoryAlreadyRegistered);
-
-                                buf.m_spBufCpuImpl->m_bPinned = true;
-                            }
-#else
-                            static_assert(
-                                meta::DependentFalseType<TElem>::value,
-                                "Memory pinning of BufCpu is not implemented when CUDA or HIP is not enabled!");
-#endif
-                        }
-                    }
-                };
-                //#############################################################################
-                //! The BufCpu memory unpinning trait specialization.
-                template<
-                    typename TElem,
-                    typename TDim,
-                    typename TIdx>
-                struct Unpin<
-                    mem::buf::BufCpu<TElem, TDim, TIdx>>
-                {
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST static auto unpin(
-                        mem::buf::BufCpu<TElem, TDim, TIdx> & buf)
-                    -> void
-                    {
-                        mem::buf::unpin(*buf.m_spBufCpuImpl.get());
-                    }
-                };
-                //#############################################################################
-                //! The BufCpuImpl memory unpinning trait specialization.
-                template<
-                    typename TElem,
-                    typename TDim,
-                    typename TIdx>
-                struct Unpin<
-                    mem::buf::cpu::detail::BufCpuImpl<TElem, TDim, TIdx>>
-                {
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST static auto unpin(
-                        mem::buf::cpu::detail::BufCpuImpl<TElem, TDim, TIdx> & bufImpl)
-                    -> void
-                    {
-                        ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                        if(mem::buf::isPinned(bufImpl))
-                        {
-#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && BOOST_LANG_CUDA
-                            ALPAKA_CUDA_RT_CHECK_IGNORE(
-                                cudaHostUnregister(
-                                    const_cast<void *>(reinterpret_cast<void const *>(bufImpl.m_pMem))),
-                                cudaErrorHostMemoryNotRegistered);
-
-                            bufImpl.m_bPinned = false;
-#elif (defined(ALPAKA_ACC_GPU_HIP_ENABLED) && BOOST_LANG_HIP)
-                            ALPAKA_HIP_RT_CHECK_IGNORE(
-                                hipHostUnregister(
-                                    const_cast<void *>(reinterpret_cast<void const *>(bufImpl.m_pMem))),
-                                hipErrorHostMemoryNotRegistered);
-
-                            bufImpl.m_bPinned = false;
-#else
-                            static_assert(
-                                meta::DependentFalseType<TElem>::value,
-                                "Memory unpinning of BufCpu is not implemented when CUDA or HIP is not enabled!");
-#endif
-                        }
-                    }
-                };
-                //#############################################################################
-                //! The BufCpu memory pin state trait specialization.
-                template<
-                    typename TElem,
-                    typename TDim,
-                    typename TIdx>
-                struct IsPinned<
-                    mem::buf::BufCpu<TElem, TDim, TIdx>>
-                {
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST static auto isPinned(
-                        mem::buf::BufCpu<TElem, TDim, TIdx> const & buf)
-                    -> bool
-                    {
-                        return mem::buf::isPinned(*buf.m_spBufCpuImpl.get());
-                    }
-                };
-                //#############################################################################
-                //! The BufCpuImpl memory pin state trait specialization.
-                template<
-                    typename TElem,
-                    typename TDim,
-                    typename TIdx>
-                struct IsPinned<
-                    mem::buf::cpu::detail::BufCpuImpl<TElem, TDim, TIdx>>
-                {
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST static auto isPinned(
-                        mem::buf::cpu::detail::BufCpuImpl<TElem, TDim, TIdx> const & bufImpl)
-                    -> bool
-                    {
-                        ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-#if (defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && BOOST_LANG_CUDA) || (defined(ALPAKA_ACC_GPU_HIP_ENABLED) && BOOST_LANG_HIP)
-                        return bufImpl.m_bPinned;
-#else
-                        alpaka::ignore_unused(bufImpl);
-                        return false;
-#endif
-                    }
-                };
-                //#############################################################################
-                //! The BufCpu memory prepareForAsyncCopy trait specialization.
-                template<
-                    typename TElem,
-                    typename TDim,
-                    typename TIdx>
-                struct PrepareForAsyncCopy<
-                    mem::buf::BufCpu<TElem, TDim, TIdx>>
-                {
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST static auto prepareForAsyncCopy(
-                        mem::buf::BufCpu<TElem, TDim, TIdx> & buf)
-                    -> void
-                    {
-                        ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-                        // to optimize the data transfer performance between a cuda/hip device the cpu buffer has to be pinned,
-                        // for exclusive cpu use, no preparing is needed
-#if (defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && BOOST_LANG_CUDA) || (defined(ALPAKA_ACC_GPU_HIP_ENABLED) && BOOST_LANG_HIP)
-                        pin( buf );
-#else
-                        alpaka::ignore_unused( buf );
-#endif
-                    }
-                };
-            }
-        }
-    }
-    namespace offset
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The BufCpu offset get trait specialization.
-            template<
-                typename TIdxIntegralConst,
-                typename TElem,
-                typename TDim,
-                typename TIdx>
-            struct GetOffset<
-                TIdxIntegralConst,
-                mem::buf::BufCpu<TElem, TDim, TIdx>>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto getOffset(
-                    mem::buf::BufCpu<TElem, TDim, TIdx> const &)
-                -> TIdx
-                {
-                    return 0u;
-                }
-            };
-        }
-    }
-    namespace idx
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The BufCpu idx type trait specialization.
-            template<
-                typename TElem,
-                typename TDim,
-                typename TIdx>
-            struct IdxType<
-                mem::buf::BufCpu<TElem, TDim, TIdx>>
-            {
-                using type = TIdx;
-            };
-        }
-    }
-}
-
-#include <alpaka/mem/buf/cpu/Copy.hpp>
-#include <alpaka/mem/buf/cpu/Set.hpp>
diff --git a/thirdParty/alpaka/include/alpaka/mem/buf/BufCudaRt.hpp b/thirdParty/alpaka/include/alpaka/mem/buf/BufCudaRt.hpp
deleted file mode 100644
index 8f14dc6e5c..0000000000
--- a/thirdParty/alpaka/include/alpaka/mem/buf/BufCudaRt.hpp
+++ /dev/null
@@ -1,778 +0,0 @@
-/* Copyright 2019 Alexander Matthes, Benjamin Worpitz, Matthias Werner, René Widera
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_CUDA
-    #error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
-#endif
-
-#include <alpaka/core/Assert.hpp>
-#include <alpaka/core/Cuda.hpp>
-#include <alpaka/dev/DevCudaRt.hpp>
-#include <alpaka/vec/Vec.hpp>
-
-#include <alpaka/dev/Traits.hpp>
-#include <alpaka/dim/DimIntegralConst.hpp>
-#include <alpaka/mem/buf/Traits.hpp>
-
-#include <memory>
-
-namespace alpaka
-{
-    namespace dev
-    {
-        class DevCudaRt;
-    }
-    namespace mem
-    {
-        namespace buf
-        {
-            template<
-                typename TElem,
-                typename TDim,
-                typename TIdx>
-            class BufCpu;
-        }
-    }
-    namespace mem
-    {
-        namespace buf
-        {
-            //#############################################################################
-            //! The CUDA memory buffer.
-            template<
-                typename TElem,
-                typename TDim,
-                typename TIdx>
-            class BufCudaRt
-            {
-                static_assert(
-                    !std::is_const<TElem>::value,
-                    "The elem type of the buffer can not be const because the C++ Standard forbids containers of const elements!");
-                static_assert(
-                    !std::is_const<TIdx>::value,
-                    "The idx type of the buffer can not be const!");
-            private:
-                using Elem = TElem;
-                using Dim = TDim;
-
-            public:
-                //-----------------------------------------------------------------------------
-                //! Constructor
-                template<
-                    typename TExtent>
-                ALPAKA_FN_HOST BufCudaRt(
-                    dev::DevCudaRt const & dev,
-                    TElem * const pMem,
-                    TIdx const & pitchBytes,
-                    TExtent const & extent) :
-                        m_dev(dev),
-                        m_extentElements(extent::getExtentVecEnd<TDim>(extent)),
-                        m_spMem(
-                            pMem,
-                            // NOTE: Because the BufCudaRt object can be copied and the original object could have been destroyed,
-                            // a std::ref(m_dev) or a this pointer can not be bound to the callback because they are not always valid at time of destruction.
-                            std::bind(&BufCudaRt::freeBuffer, std::placeholders::_1, m_dev)),
-                        m_pitchBytes(pitchBytes)
-                {
-                    ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                    static_assert(
-                        TDim::value == dim::Dim<TExtent>::value,
-                        "The dimensionality of TExtent and the dimensionality of the TDim template parameter have to be identical!");
-                    static_assert(
-                        std::is_same<TIdx, idx::Idx<TExtent>>::value,
-                        "The idx type of TExtent and the TIdx template parameter have to be identical!");
-                }
-
-            private:
-                //-----------------------------------------------------------------------------
-                //! Frees the shared buffer.
-                ALPAKA_FN_HOST static auto freeBuffer(
-                    TElem * const memPtr,
-                    dev::DevCudaRt const & dev)
-                -> void
-                {
-                    ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                    // Set the current device.
-                    ALPAKA_CUDA_RT_CHECK(
-                        cudaSetDevice(
-                            dev.m_iDevice));
-                    // Free the buffer.
-                    ALPAKA_CUDA_RT_CHECK(
-                      cudaFree(reinterpret_cast<void *>(memPtr)));
-                }
-
-            public:
-                dev::DevCudaRt m_dev;               // NOTE: The device has to be destructed after the memory pointer because it is required for destruction.
-                vec::Vec<TDim, TIdx> m_extentElements;
-                std::shared_ptr<TElem> m_spMem;
-                TIdx m_pitchBytes;
-            };
-        }
-    }
-
-    namespace dev
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The BufCudaRt device type trait specialization.
-            template<
-                typename TElem,
-                typename TDim,
-                typename TIdx>
-            struct DevType<
-                mem::buf::BufCudaRt<TElem, TDim, TIdx>>
-            {
-                using type = dev::DevCudaRt;
-            };
-            //#############################################################################
-            //! The BufCudaRt device get trait specialization.
-            template<
-                typename TElem,
-                typename TDim,
-                typename TIdx>
-            struct GetDev<
-                mem::buf::BufCudaRt<TElem, TDim, TIdx>>
-            {
-                ALPAKA_FN_HOST static auto getDev(
-                    mem::buf::BufCudaRt<TElem, TDim, TIdx> const & buf)
-                -> dev::DevCudaRt
-                {
-                    return buf.m_dev;
-                }
-            };
-        }
-    }
-    namespace dim
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The BufCudaRt dimension getter trait specialization.
-            template<
-                typename TElem,
-                typename TDim,
-                typename TIdx>
-            struct DimType<
-                mem::buf::BufCudaRt<TElem, TDim, TIdx>>
-            {
-                using type = TDim;
-            };
-        }
-    }
-    namespace elem
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The BufCudaRt memory element type get trait specialization.
-            template<
-                typename TElem,
-                typename TDim,
-                typename TIdx>
-            struct ElemType<
-                mem::buf::BufCudaRt<TElem, TDim, TIdx>>
-            {
-                using type = TElem;
-            };
-        }
-    }
-    namespace extent
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The BufCudaRt extent get trait specialization.
-            template<
-                typename TIdxIntegralConst,
-                typename TElem,
-                typename TDim,
-                typename TIdx>
-            struct GetExtent<
-                TIdxIntegralConst,
-                mem::buf::BufCudaRt<TElem, TDim, TIdx>,
-                typename std::enable_if<(TDim::value > TIdxIntegralConst::value)>::type>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto getExtent(
-                    mem::buf::BufCudaRt<TElem, TDim, TIdx> const & extent)
-                -> TIdx
-                {
-                    return extent.m_extentElements[TIdxIntegralConst::value];
-                }
-            };
-        }
-    }
-    namespace mem
-    {
-        namespace view
-        {
-            namespace traits
-            {
-                //#############################################################################
-                //! The BufCudaRt native pointer get trait specialization.
-                template<
-                    typename TElem,
-                    typename TDim,
-                    typename TIdx>
-                struct GetPtrNative<
-                    mem::buf::BufCudaRt<TElem, TDim, TIdx>>
-                {
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST static auto getPtrNative(
-                        mem::buf::BufCudaRt<TElem, TDim, TIdx> const & buf)
-                    -> TElem const *
-                    {
-                        return buf.m_spMem.get();
-                    }
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST static auto getPtrNative(
-                        mem::buf::BufCudaRt<TElem, TDim, TIdx> & buf)
-                    -> TElem *
-                    {
-                        return buf.m_spMem.get();
-                    }
-                };
-                //#############################################################################
-                //! The BufCudaRt pointer on device get trait specialization.
-                template<
-                    typename TElem,
-                    typename TDim,
-                    typename TIdx>
-                struct GetPtrDev<
-                    mem::buf::BufCudaRt<TElem, TDim, TIdx>,
-                    dev::DevCudaRt>
-                {
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST static auto getPtrDev(
-                        mem::buf::BufCudaRt<TElem, TDim, TIdx> const & buf,
-                        dev::DevCudaRt const & dev)
-                    -> TElem const *
-                    {
-                        if(dev == dev::getDev(buf))
-                        {
-                            return buf.m_spMem.get();
-                        }
-                        else
-                        {
-                            throw std::runtime_error("The buffer is not accessible from the given device!");
-                        }
-                    }
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST static auto getPtrDev(
-                        mem::buf::BufCudaRt<TElem, TDim, TIdx> & buf,
-                        dev::DevCudaRt const & dev)
-                    -> TElem *
-                    {
-                        if(dev == dev::getDev(buf))
-                        {
-                            return buf.m_spMem.get();
-                        }
-                        else
-                        {
-                            throw std::runtime_error("The buffer is not accessible from the given device!");
-                        }
-                    }
-                };
-                //#############################################################################
-                //! The BufCudaRt pitch get trait specialization.
-                template<
-                    typename TElem,
-                    typename TDim,
-                    typename TIdx>
-                struct GetPitchBytes<
-                    dim::DimInt<TDim::value - 1u>,
-                    mem::buf::BufCudaRt<TElem, TDim, TIdx>>
-                {
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST static auto getPitchBytes(
-                        mem::buf::BufCudaRt<TElem, TDim, TIdx> const & buf)
-                    -> TIdx
-                    {
-                        return buf.m_pitchBytes;
-                    }
-                };
-            }
-        }
-        namespace buf
-        {
-            namespace traits
-            {
-                //#############################################################################
-                //! The CUDA 1D memory allocation trait specialization.
-                template<
-                    typename TElem,
-                    typename TIdx>
-                struct Alloc<
-                    TElem,
-                    dim::DimInt<1u>,
-                    TIdx,
-                    dev::DevCudaRt>
-                {
-                    //-----------------------------------------------------------------------------
-                    template<
-                        typename TExtent>
-                    ALPAKA_FN_HOST static auto alloc(
-                        dev::DevCudaRt const & dev,
-                        TExtent const & extent)
-                    -> mem::buf::BufCudaRt<TElem, dim::DimInt<1u>, TIdx>
-                    {
-                        ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                        auto const width(extent::getWidth(extent));
-                        auto const widthBytes(width * static_cast<TIdx>(sizeof(TElem)));
-
-                        // Set the current device.
-                        ALPAKA_CUDA_RT_CHECK(
-                            cudaSetDevice(
-                                dev.m_iDevice));
-                        // Allocate the buffer on this device.
-                        void * memPtr;
-                        ALPAKA_CUDA_RT_CHECK(
-                            cudaMalloc(
-                                &memPtr,
-                                static_cast<std::size_t>(widthBytes)));
-
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                        std::cout << __func__
-                            << " ew: " << width
-                            << " ewb: " << widthBytes
-                            << " ptr: " << memPtr
-                            << std::endl;
-#endif
-                        return
-                            mem::buf::BufCudaRt<TElem, dim::DimInt<1u>, TIdx>(
-                                dev,
-                                reinterpret_cast<TElem *>(memPtr),
-                                static_cast<TIdx>(widthBytes),
-                                extent);
-                    }
-                };
-                //#############################################################################
-                //! The CUDA 2D memory allocation trait specialization.
-                template<
-                    typename TElem,
-                    typename TIdx>
-                struct Alloc<
-                    TElem,
-                    dim::DimInt<2u>,
-                    TIdx,
-                    dev::DevCudaRt>
-                {
-                    //-----------------------------------------------------------------------------
-                    template<
-                        typename TExtent>
-                    ALPAKA_FN_HOST static auto alloc(
-                        dev::DevCudaRt const & dev,
-                        TExtent const & extent)
-                    -> mem::buf::BufCudaRt<TElem, dim::DimInt<2u>, TIdx>
-                    {
-                        ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                        auto const width(extent::getWidth(extent));
-                        auto const widthBytes(width * static_cast<TIdx>(sizeof(TElem)));
-                        auto const height(extent::getHeight(extent));
-
-                        // Set the current device.
-                        ALPAKA_CUDA_RT_CHECK(
-                            cudaSetDevice(
-                                dev.m_iDevice));
-                        // Allocate the buffer on this device.
-                        void * memPtr;
-                        std::size_t pitchBytes;
-                        ALPAKA_CUDA_RT_CHECK(
-                            cudaMallocPitch(
-                                &memPtr,
-                                &pitchBytes,
-                                static_cast<std::size_t>(widthBytes),
-                                static_cast<std::size_t>(height)));
-                        ALPAKA_ASSERT(pitchBytes >= static_cast<std::size_t>(widthBytes) || (width * height) == 0);
-
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                        std::cout << __func__
-                            << " ew: " << width
-                            << " eh: " << height
-                            << " ewb: " << widthBytes
-                            << " ptr: " << memPtr
-                            << " pitch: " << pitchBytes
-                            << std::endl;
-#endif
-                        return
-                            mem::buf::BufCudaRt<TElem, dim::DimInt<2u>, TIdx>(
-                                dev,
-                                reinterpret_cast<TElem *>(memPtr),
-                                static_cast<TIdx>(pitchBytes),
-                                extent);
-                    }
-                };
-                //#############################################################################
-                //! The CUDA 3D memory allocation trait specialization.
-                template<
-                    typename TElem,
-                    typename TIdx>
-                struct Alloc<
-                    TElem,
-                    dim::DimInt<3u>,
-                    TIdx,
-                    dev::DevCudaRt>
-                {
-                    //-----------------------------------------------------------------------------
-                    template<
-                        typename TExtent>
-                    ALPAKA_FN_HOST static auto alloc(
-                        dev::DevCudaRt const & dev,
-                        TExtent const & extent)
-                    -> mem::buf::BufCudaRt<TElem, dim::DimInt<3u>, TIdx>
-                    {
-                        ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                        cudaExtent const cudaExtentVal(
-                            make_cudaExtent(
-                                static_cast<std::size_t>(extent::getWidth(extent) * static_cast<TIdx>(sizeof(TElem))),
-                                static_cast<std::size_t>(extent::getHeight(extent)),
-                                static_cast<std::size_t>(extent::getDepth(extent))));
-
-                        // Set the current device.
-                        ALPAKA_CUDA_RT_CHECK(
-                            cudaSetDevice(
-                                dev.m_iDevice));
-                        // Allocate the buffer on this device.
-                        cudaPitchedPtr cudaPitchedPtrVal;
-                        ALPAKA_CUDA_RT_CHECK(
-                            cudaMalloc3D(
-                                &cudaPitchedPtrVal,
-                                cudaExtentVal));
-
-
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                        std::cout << __func__
-                            << " ew: " << extent::getWidth(extent)
-                            << " eh: " << cudaExtentVal.height
-                            << " ed: " << cudaExtentVal.depth
-                            << " ewb: " << cudaExtentVal.width
-                            << " ptr: " << cudaPitchedPtrVal.ptr
-                            << " pitch: " << cudaPitchedPtrVal.pitch
-                            << " wb: " << cudaPitchedPtrVal.xsize
-                            << " h: " << cudaPitchedPtrVal.ysize
-                            << std::endl;
-#endif
-                        return
-                            mem::buf::BufCudaRt<TElem, dim::DimInt<3u>, TIdx>(
-                                dev,
-                                reinterpret_cast<TElem *>(cudaPitchedPtrVal.ptr),
-                                static_cast<TIdx>(cudaPitchedPtrVal.pitch),
-                                extent);
-                    }
-                };
-                //#############################################################################
-                //! The BufCudaRt CUDA device memory mapping trait specialization.
-                template<
-                    typename TElem,
-                    typename TDim,
-                    typename TIdx>
-                struct Map<
-                    mem::buf::BufCudaRt<TElem, TDim, TIdx>,
-                    dev::DevCudaRt>
-                {
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST static auto map(
-                        mem::buf::BufCudaRt<TElem, TDim, TIdx> const & buf,
-                        dev::DevCudaRt const & dev)
-                    -> void
-                    {
-                        ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                        if(dev::getDev(buf) != dev)
-                        {
-                            throw std::runtime_error("Mapping memory from one CUDA device into an other CUDA device not implemented!");
-                        }
-                        // If it is already the same device, nothing has to be mapped.
-                    }
-                };
-                //#############################################################################
-                //! The BufCudaRt CUDA device memory unmapping trait specialization.
-                template<
-                    typename TElem,
-                    typename TDim,
-                    typename TIdx>
-                struct Unmap<
-                    mem::buf::BufCudaRt<TElem, TDim, TIdx>,
-                    dev::DevCudaRt>
-                {
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST static auto unmap(
-                        mem::buf::BufCudaRt<TElem, TDim, TIdx> const & buf,
-                        dev::DevCudaRt const & dev)
-                    -> void
-                    {
-                        ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                        if(dev::getDev(buf) != dev)
-                        {
-                            throw std::runtime_error("Unmapping memory mapped from one CUDA device into an other CUDA device not implemented!");
-                        }
-                        // If it is already the same device, nothing has to be unmapped.
-                    }
-                };
-                //#############################################################################
-                //! The BufCudaRt memory pinning trait specialization.
-                template<
-                    typename TElem,
-                    typename TDim,
-                    typename TIdx>
-                struct Pin<
-                    mem::buf::BufCudaRt<TElem, TDim, TIdx>>
-                {
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST static auto pin(
-                        mem::buf::BufCudaRt<TElem, TDim, TIdx> &)
-                    -> void
-                    {
-                        ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                        // CUDA device memory is always pinned, it can not be swapped out.
-                    }
-                };
-                //#############################################################################
-                //! The BufCudaRt memory unpinning trait specialization.
-                template<
-                    typename TElem,
-                    typename TDim,
-                    typename TIdx>
-                struct Unpin<
-                    mem::buf::BufCudaRt<TElem, TDim, TIdx>>
-                {
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST static auto unpin(
-                        mem::buf::BufCudaRt<TElem, TDim, TIdx> &)
-                    -> void
-                    {
-                        ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                        // CUDA device memory is always pinned, it can not be swapped out.
-                    }
-                };
-                //#############################################################################
-                //! The BufCudaRt memory pin state trait specialization.
-                template<
-                    typename TElem,
-                    typename TDim,
-                    typename TIdx>
-                struct IsPinned<
-                    mem::buf::BufCudaRt<TElem, TDim, TIdx>>
-                {
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST static auto isPinned(
-                        mem::buf::BufCudaRt<TElem, TDim, TIdx> const &)
-                    -> bool
-                    {
-                        ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                        // CUDA device memory is always pinned, it can not be swapped out.
-                        return true;
-                    }
-                };
-                //#############################################################################
-                //! The BufCudaRt memory prepareForAsyncCopy trait specialization.
-                template<
-                    typename TElem,
-                    typename TDim,
-                    typename TIdx>
-                struct PrepareForAsyncCopy<
-                    mem::buf::BufCudaRt<TElem, TDim, TIdx>>
-                {
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST static auto prepareForAsyncCopy(
-                        mem::buf::BufCudaRt<TElem, TDim, TIdx> &)
-                    -> void
-                    {
-                        ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                        // CUDA device memory is always ready for async copy
-                    }
-                };
-            }
-        }
-    }
-    namespace offset
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The BufCudaRt offset get trait specialization.
-            template<
-                typename TIdxIntegralConst,
-                typename TElem,
-                typename TDim,
-                typename TIdx>
-            struct GetOffset<
-                TIdxIntegralConst,
-                mem::buf::BufCudaRt<TElem, TDim, TIdx>>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto getOffset(
-                   mem::buf::BufCudaRt<TElem, TDim, TIdx> const &)
-                -> TIdx
-                {
-                    return 0u;
-                }
-            };
-        }
-    }
-    namespace idx
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The BufCudaRt idx type trait specialization.
-            template<
-                typename TElem,
-                typename TDim,
-                typename TIdx>
-            struct IdxType<
-                mem::buf::BufCudaRt<TElem, TDim, TIdx>>
-            {
-                using type = TIdx;
-            };
-        }
-    }
-
-    //-----------------------------------------------------------------------------
-    // Trait specializations for BufCpu.
-    namespace mem
-    {
-        namespace buf
-        {
-            namespace traits
-            {
-                //#############################################################################
-                //! The BufCpu CUDA device memory mapping trait specialization.
-                template<
-                    typename TElem,
-                    typename TDim,
-                    typename TIdx>
-                struct Map<
-                    mem::buf::BufCpu<TElem, TDim, TIdx>,
-                    dev::DevCudaRt>
-                {
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST static auto map(
-                        mem::buf::BufCpu<TElem, TDim, TIdx> & buf,
-                        dev::DevCudaRt const & dev)
-                    -> void
-                    {
-                        ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                        if(dev::getDev(buf) != dev)
-                        {
-                            // cudaHostRegisterMapped:
-                            //   Maps the allocation into the CUDA address space.The device pointer to the memory may be obtained by calling cudaHostGetDevicePointer().
-                            //   This feature is available only on GPUs with compute capability greater than or equal to 1.1.
-                            ALPAKA_CUDA_RT_CHECK(
-                                cudaHostRegister(
-                                    const_cast<void *>(reinterpret_cast<void const *>(mem::view::getPtrNative(buf))),
-                                    extent::getExtentProduct(buf) * sizeof(elem::Elem<BufCpu<TElem, TDim, TIdx>>),
-                                    cudaHostRegisterMapped));
-                        }
-                        // If it is already the same device, nothing has to be mapped.
-                    }
-                };
-                //#############################################################################
-                //! The BufCpu CUDA device memory unmapping trait specialization.
-                template<
-                    typename TElem,
-                    typename TDim,
-                    typename TIdx>
-                struct Unmap<
-                    mem::buf::BufCpu<TElem, TDim, TIdx>,
-                    dev::DevCudaRt>
-                {
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST static auto unmap(
-                        mem::buf::BufCpu<TElem, TDim, TIdx> & buf,
-                        dev::DevCudaRt const & dev)
-                    -> void
-                    {
-                        ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                        if(dev::getDev(buf) != dev)
-                        {
-                            // Unmaps the memory range whose base address is specified by ptr, and makes it pageable again.
-                            // \FIXME: If the memory has separately been pinned before we destroy the pinning state.
-                            ALPAKA_CUDA_RT_CHECK(
-                                cudaHostUnregister(
-                                    const_cast<void *>(reinterpret_cast<void const *>(mem::view::getPtrNative(buf)))));
-                        }
-                        // If it is already the same device, nothing has to be unmapped.
-                    }
-                };
-            }
-        }
-        namespace view
-        {
-            namespace traits
-            {
-                //#############################################################################
-                //! The BufCpu pointer on CUDA device get trait specialization.
-                template<
-                    typename TElem,
-                    typename TDim,
-                    typename TIdx>
-                struct GetPtrDev<
-                    mem::buf::BufCpu<TElem, TDim, TIdx>,
-                    dev::DevCudaRt>
-                {
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST static auto getPtrDev(
-                        mem::buf::BufCpu<TElem, TDim, TIdx> const & buf,
-                        dev::DevCudaRt const &)
-                    -> TElem const *
-                    {
-                        // TODO: Check if the memory is mapped at all!
-                        TElem * pDev(nullptr);
-                        ALPAKA_CUDA_RT_CHECK(
-                            cudaHostGetDevicePointer(
-                                &pDev,
-                                const_cast<void *>(reinterpret_cast<void const *>(mem::view::getPtrNative(buf))),
-                                0));
-                        return pDev;
-                    }
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST static auto getPtrDev(
-                        mem::buf::BufCpu<TElem, TDim, TIdx> & buf,
-                        dev::DevCudaRt const &)
-                    -> TElem *
-                    {
-                        // TODO: Check if the memory is mapped at all!
-                        TElem * pDev(nullptr);
-                        ALPAKA_CUDA_RT_CHECK(
-                            cudaHostGetDevicePointer(
-                                &pDev,
-                                mem::view::getPtrNative(buf),
-                                0));
-                        return pDev;
-                    }
-                };
-            }
-        }
-    }
-}
-
-#include <alpaka/mem/buf/cuda/Copy.hpp>
-#include <alpaka/mem/buf/cuda/Set.hpp>
-
-#endif
diff --git a/thirdParty/alpaka/include/alpaka/mem/buf/BufHipRt.hpp b/thirdParty/alpaka/include/alpaka/mem/buf/BufHipRt.hpp
deleted file mode 100644
index 0b043117c7..0000000000
--- a/thirdParty/alpaka/include/alpaka/mem/buf/BufHipRt.hpp
+++ /dev/null
@@ -1,791 +0,0 @@
-/* Copyright 2019 Alexander Matthes, Benjamin Worpitz, Matthias Werner, René Widera
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_HIP
-    #error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
-#endif
-
-#include <alpaka/core/Assert.hpp>
-#include <alpaka/core/Hip.hpp>
-#include <alpaka/dev/DevHipRt.hpp>
-#include <alpaka/vec/Vec.hpp>
-
-#include <alpaka/dev/Traits.hpp>
-#include <alpaka/dim/DimIntegralConst.hpp>
-#include <alpaka/mem/buf/Traits.hpp>
-
-#include <memory>
-
-namespace alpaka
-{
-    namespace dev
-    {
-        class DevHipRt;
-    }
-    namespace mem
-    {
-        namespace buf
-        {
-            template<
-                typename TElem,
-                typename TDim,
-                typename TIdx>
-            class BufCpu;
-        }
-    }
-    namespace mem
-    {
-        namespace buf
-        {
-            //#############################################################################
-            //! The HIP memory buffer.
-            template<
-                typename TElem,
-                typename TDim,
-                typename TIdx>
-            class BufHipRt
-            {
-                static_assert(
-                    !std::is_const<TElem>::value,
-                    "The elem type of the buffer can not be const because the C++ Standard forbids containers of const elements!");
-                static_assert(
-                    !std::is_const<TIdx>::value,
-                    "The idx type of the buffer can not be const!");
-            private:
-                using Elem = TElem;
-                using Dim = TDim;
-
-            public:
-                //-----------------------------------------------------------------------------
-                //! Constructor
-                template<
-                    typename TExtent>
-                ALPAKA_FN_HOST BufHipRt(
-                    dev::DevHipRt const & dev,
-                    TElem * const pMem,
-                    TIdx const & pitchBytes,
-                    TExtent const & extent) :
-                        m_dev(dev),
-                        m_extentElements(extent::getExtentVecEnd<TDim>(extent)),
-                        m_spMem(
-                            pMem,
-                            // NOTE: Because the BufHipRt object can be copied and the original object could have been destroyed,
-                            // a std::ref(m_dev) or a this pointer can not be bound to the callback because they are not always valid at time of destruction.
-                            std::bind(&BufHipRt::freeBuffer, std::placeholders::_1, m_dev)),
-                        m_pitchBytes(pitchBytes)
-                {
-                    ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                    static_assert(
-                        TDim::value == dim::Dim<TExtent>::value,
-                        "The dimensionality of TExtent and the dimensionality of the TDim template parameter have to be identical!");
-                    static_assert(
-                        std::is_same<TIdx, idx::Idx<TExtent>>::value,
-                        "The idx type of TExtent and the TIdx template parameter have to be identical!");
-                }
-
-            private:
-                //-----------------------------------------------------------------------------
-                //! Frees the shared buffer.
-                ALPAKA_FN_HOST static auto freeBuffer(
-                    TElem * const memPtr,
-                    dev::DevHipRt const & dev)
-                -> void
-                {
-                    ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                    // Set the current device.
-                    ALPAKA_HIP_RT_CHECK(
-                        hipSetDevice(
-                            dev.m_iDevice));
-                    // Free the buffer.
-                    ALPAKA_HIP_RT_CHECK(
-                        hipFree(reinterpret_cast<void *>(memPtr)));
-                }
-
-            public:
-                dev::DevHipRt m_dev;               // NOTE: The device has to be destructed after the memory pointer because it is required for destruction.
-                vec::Vec<TDim, TIdx> m_extentElements;
-                std::shared_ptr<TElem> m_spMem;
-                TIdx m_pitchBytes;
-            };
-        }
-    }
-
-    namespace dev
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The BufHipRt device type trait specialization.
-            template<
-                typename TElem,
-                typename TDim,
-                typename TIdx>
-            struct DevType<
-                mem::buf::BufHipRt<TElem, TDim, TIdx>>
-            {
-                using type = dev::DevHipRt;
-            };
-            //#############################################################################
-            //! The BufHipRt device get trait specialization.
-            template<
-                typename TElem,
-                typename TDim,
-                typename TIdx>
-            struct GetDev<
-                mem::buf::BufHipRt<TElem, TDim, TIdx>>
-            {
-                ALPAKA_FN_HOST static auto getDev(
-                    mem::buf::BufHipRt<TElem, TDim, TIdx> const & buf)
-                -> dev::DevHipRt
-                {
-                    return buf.m_dev;
-                }
-            };
-        }
-    }
-    namespace dim
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The BufHipRt dimension getter trait specialization.
-            template<
-                typename TElem,
-                typename TDim,
-                typename TIdx>
-            struct DimType<
-                mem::buf::BufHipRt<TElem, TDim, TIdx>>
-            {
-                using type = TDim;
-            };
-        }
-    }
-    namespace elem
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The BufHipRt memory element type get trait specialization.
-            template<
-                typename TElem,
-                typename TDim,
-                typename TIdx>
-            struct ElemType<
-                mem::buf::BufHipRt<TElem, TDim, TIdx>>
-            {
-                using type = TElem;
-            };
-        }
-    }
-    namespace extent
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The BufHipRt extent get trait specialization.
-            template<
-                typename TIdxIntegralConst,
-                typename TElem,
-                typename TDim,
-                typename TIdx>
-            struct GetExtent<
-                TIdxIntegralConst,
-                mem::buf::BufHipRt<TElem, TDim, TIdx>,
-                typename std::enable_if<(TDim::value > TIdxIntegralConst::value)>::type>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto getExtent(
-                    mem::buf::BufHipRt<TElem, TDim, TIdx> const & extent)
-                -> TIdx
-                {
-                    return extent.m_extentElements[TIdxIntegralConst::value];
-                }
-            };
-        }
-    }
-    namespace mem
-    {
-        namespace view
-        {
-            namespace traits
-            {
-                //#############################################################################
-                //! The BufHipRt native pointer get trait specialization.
-                template<
-                    typename TElem,
-                    typename TDim,
-                    typename TIdx>
-                struct GetPtrNative<
-                    mem::buf::BufHipRt<TElem, TDim, TIdx>>
-                {
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST static auto getPtrNative(
-                        mem::buf::BufHipRt<TElem, TDim, TIdx> const & buf)
-                    -> TElem const *
-                    {
-                        return buf.m_spMem.get();
-                    }
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST static auto getPtrNative(
-                        mem::buf::BufHipRt<TElem, TDim, TIdx> & buf)
-                    -> TElem *
-                    {
-                        return buf.m_spMem.get();
-                    }
-                };
-                //#############################################################################
-                //! The BufHipRt pointer on device get trait specialization.
-                template<
-                    typename TElem,
-                    typename TDim,
-                    typename TIdx>
-                struct GetPtrDev<
-                    mem::buf::BufHipRt<TElem, TDim, TIdx>,
-                    dev::DevHipRt>
-                {
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST static auto getPtrDev(
-                        mem::buf::BufHipRt<TElem, TDim, TIdx> const & buf,
-                        dev::DevHipRt const & dev)
-                    -> TElem const *
-                    {
-                        if(dev == dev::getDev(buf))
-                        {
-                            return buf.m_spMem.get();
-                        }
-                        else
-                        {
-                            throw std::runtime_error("The buffer is not accessible from the given device!");
-                        }
-                    }
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST static auto getPtrDev(
-                        mem::buf::BufHipRt<TElem, TDim, TIdx> & buf,
-                        dev::DevHipRt const & dev)
-                    -> TElem *
-                    {
-                        if(dev == dev::getDev(buf))
-                        {
-                            return buf.m_spMem.get();
-                        }
-                        else
-                        {
-                            throw std::runtime_error("The buffer is not accessible from the given device!");
-                        }
-                    }
-                };
-                //#############################################################################
-                //! The BufHipRt pitch get trait specialization.
-                template<
-                    typename TElem,
-                    typename TDim,
-                    typename TIdx>
-                struct GetPitchBytes<
-                    dim::DimInt<TDim::value - 1u>,
-                    mem::buf::BufHipRt<TElem, TDim, TIdx>>
-                {
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST static auto getPitchBytes(
-                        mem::buf::BufHipRt<TElem, TDim, TIdx> const & buf)
-                    -> TIdx
-                    {
-                        return buf.m_pitchBytes;
-                    }
-                };
-            }
-        }
-        namespace buf
-        {
-            namespace traits
-            {
-                //#############################################################################
-                //! The HIP 1D memory allocation trait specialization.
-                template<
-                    typename TElem,
-                    typename TIdx>
-                struct Alloc<
-                    TElem,
-                    dim::DimInt<1u>,
-                    TIdx,
-                    dev::DevHipRt>
-                {
-                    //-----------------------------------------------------------------------------
-                    template<
-                        typename TExtent>
-                    ALPAKA_FN_HOST static auto alloc(
-                        dev::DevHipRt const & dev,
-                        TExtent const & extent)
-                    -> mem::buf::BufHipRt<TElem, dim::DimInt<1u>, TIdx>
-                    {
-                        ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                        auto const width(extent::getWidth(extent));
-                        auto const widthBytes(width * static_cast<TIdx>(sizeof(TElem)));
-
-                        // Set the current device.
-                        ALPAKA_HIP_RT_CHECK(
-                            hipSetDevice(
-                                dev.m_iDevice));
-                        // Allocate the buffer on this device.
-                        void * memPtr;
-                        ALPAKA_HIP_RT_CHECK(
-                            hipMalloc(
-                                &memPtr,
-                                static_cast<std::size_t>(widthBytes)));
-
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                        std::cout << __func__
-                            << " ew: " << width
-                            << " ewb: " << widthBytes
-                            << " ptr: " << memPtr
-                            << std::endl;
-#endif
-                        return
-                            mem::buf::BufHipRt<TElem, dim::DimInt<1u>, TIdx>(
-                                dev,
-                                reinterpret_cast<TElem *>(memPtr),
-                                static_cast<TIdx>(widthBytes),
-                                extent);
-                    }
-                };
-                //#############################################################################
-                //! The HIP 2D memory allocation trait specialization.
-                template<
-                    typename TElem,
-                    typename TIdx>
-                struct Alloc<
-                    TElem,
-                    dim::DimInt<2u>,
-                    TIdx,
-                    dev::DevHipRt>
-                {
-                    //-----------------------------------------------------------------------------
-                    template<
-                        typename TExtent>
-                    ALPAKA_FN_HOST static auto alloc(
-                        dev::DevHipRt const & dev,
-                        TExtent const & extent)
-                    -> mem::buf::BufHipRt<TElem, dim::DimInt<2u>, TIdx>
-                    {
-                        ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                        auto const width(extent::getWidth(extent));
-                        auto const widthBytes(width * static_cast<TIdx>(sizeof(TElem)));
-                        auto const height(extent::getHeight(extent));
-
-                        void * memPtr = nullptr;
-                        std::size_t pitchBytes = widthBytes;
-
-                        //FIXME: hcc cannot handle zero-size input (throws Unknown Error)
-                        if(width!=0 && height!=0) {
-
-                            // Set the current device.
-                            ALPAKA_HIP_RT_CHECK(
-                                hipSetDevice(
-                                    dev.m_iDevice));
-
-
-                            // Allocate the buffer on this device.
-                            ALPAKA_HIP_RT_CHECK(
-                                hipMallocPitch(
-                                    &memPtr,
-                                    &pitchBytes,
-                                    static_cast<std::size_t>(widthBytes),
-                                    static_cast<std::size_t>(height)));
-                            ALPAKA_ASSERT(pitchBytes >= static_cast<std::size_t>(widthBytes) || (width * height) == 0);
-                        }
-
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                        std::cout << __func__
-                            << " ew: " << width
-                            << " eh: " << height
-                            << " ewb: " << widthBytes
-                            << " ptr: " << memPtr
-                            << " pitch: " << pitchBytes
-                            << std::endl;
-#endif
-                        return
-                            mem::buf::BufHipRt<TElem, dim::DimInt<2u>, TIdx>(
-                                dev,
-                                reinterpret_cast<TElem *>(memPtr),
-                                static_cast<TIdx>(pitchBytes),
-                                extent);
-                    }
-                };
-                //#############################################################################
-                //! The HIP 3D memory allocation trait specialization.
-                template<
-                    typename TElem,
-                    typename TIdx>
-                struct Alloc<
-                    TElem,
-                    dim::DimInt<3u>,
-                    TIdx,
-                    dev::DevHipRt>
-                {
-                    //-----------------------------------------------------------------------------
-                    template<
-                        typename TExtent>
-                    ALPAKA_FN_HOST static auto alloc(
-                        dev::DevHipRt const & dev,
-                        TExtent const & extent)
-                    -> mem::buf::BufHipRt<TElem, dim::DimInt<3u>, TIdx>
-                    {
-                        ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                        hipExtent const hipExtentVal(
-                            make_hipExtent(
-                                static_cast<std::size_t>(extent::getWidth(extent) * static_cast<TIdx>(sizeof(TElem))),
-                                static_cast<std::size_t>(extent::getHeight(extent)),
-                                static_cast<std::size_t>(extent::getDepth(extent))));
-
-                        hipPitchedPtr hipPitchedPtrVal = {0};
-
-                        //FIXME: hcc cannot handle zero-size input
-                        if(hipExtentVal.width!=0
-                           && hipExtentVal.height!=0
-                           && hipExtentVal.depth!=0) {
-
-                            // Set the current device.
-                            ALPAKA_HIP_RT_CHECK(
-                                hipSetDevice(
-                                    dev.m_iDevice));
-                            // Allocate the buffer on this device.
-                            ALPAKA_HIP_RT_CHECK(
-                                hipMalloc3D(
-                                    &hipPitchedPtrVal,
-                                    hipExtentVal));
-                        }
-
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                        std::cout << __func__
-                            << " ew: " << extent::getWidth(extent)
-                            << " eh: " << hipExtentVal.height
-                            << " ed: " << hipExtentVal.depth
-                            << " ewb: " << hipExtentVal.width
-                            << " ptr: " << hipPitchedPtrVal.ptr
-                            << " pitch: " << hipPitchedPtrVal.pitch
-                            << " wb: " << hipPitchedPtrVal.xsize
-                            << " h: " << hipPitchedPtrVal.ysize
-                            << std::endl;
-#endif
-                        return
-                            mem::buf::BufHipRt<TElem, dim::DimInt<3u>, TIdx>(
-                                dev,
-                                reinterpret_cast<TElem *>(hipPitchedPtrVal.ptr),
-                                static_cast<TIdx>(hipPitchedPtrVal.pitch),
-                                extent);
-                    }
-                };
-                //#############################################################################
-                //! The BufHipRt HIP device memory mapping trait specialization.
-                template<
-                    typename TElem,
-                    typename TDim,
-                    typename TIdx>
-                struct Map<
-                    mem::buf::BufHipRt<TElem, TDim, TIdx>,
-                    dev::DevHipRt>
-                {
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST static auto map(
-                        mem::buf::BufHipRt<TElem, TDim, TIdx> const & buf,
-                        dev::DevHipRt const & dev)
-                    -> void
-                    {
-                        ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                        if(dev::getDev(buf) != dev)
-                        {
-                            throw std::runtime_error("Mapping memory from one HIP device into an other HIP device not implemented!");
-                        }
-                        // If it is already the same device, nothing has to be mapped.
-                    }
-                };
-                //#############################################################################
-                //! The BufHipRt HIP device memory unmapping trait specialization.
-                template<
-                    typename TElem,
-                    typename TDim,
-                    typename TIdx>
-                struct Unmap<
-                    mem::buf::BufHipRt<TElem, TDim, TIdx>,
-                    dev::DevHipRt>
-                {
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST static auto unmap(
-                        mem::buf::BufHipRt<TElem, TDim, TIdx> const & buf,
-                        dev::DevHipRt const & dev)
-                    -> void
-                    {
-                        ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                        if(dev::getDev(buf) != dev)
-                        {
-                            throw std::runtime_error("Unmapping memory mapped from one HIP device into an other HIP device not implemented!");
-                        }
-                        // If it is already the same device, nothing has to be unmapped.
-                    }
-                };
-                //#############################################################################
-                //! The BufHipRt memory pinning trait specialization.
-                template<
-                    typename TElem,
-                    typename TDim,
-                    typename TIdx>
-                struct Pin<
-                    mem::buf::BufHipRt<TElem, TDim, TIdx>>
-                {
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST static auto pin(
-                        mem::buf::BufHipRt<TElem, TDim, TIdx> &)
-                    -> void
-                    {
-                        ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                        // HIP device memory is always pinned, it can not be swapped out.
-                    }
-                };
-                //#############################################################################
-                //! The BufHipRt memory unpinning trait specialization.
-                template<
-                    typename TElem,
-                    typename TDim,
-                    typename TIdx>
-                struct Unpin<
-                    mem::buf::BufHipRt<TElem, TDim, TIdx>>
-                {
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST static auto unpin(
-                        mem::buf::BufHipRt<TElem, TDim, TIdx> &)
-                    -> void
-                    {
-                        ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                        // HIP device memory is always pinned, it can not be swapped out.
-                    }
-                };
-                //#############################################################################
-                //! The BufHipRt memory pin state trait specialization.
-                template<
-                    typename TElem,
-                    typename TDim,
-                    typename TIdx>
-                struct IsPinned<
-                    mem::buf::BufHipRt<TElem, TDim, TIdx>>
-                {
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST static auto isPinned(
-                        mem::buf::BufHipRt<TElem, TDim, TIdx> const &)
-                    -> bool
-                    {
-                        ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                        // HIP device memory is always pinned, it can not be swapped out.
-                        return true;
-                    }
-                };
-                //#############################################################################
-                //! The BufHipRt memory prepareForAsyncCopy trait specialization.
-                template<
-                    typename TElem,
-                    typename TDim,
-                    typename TIdx>
-                struct PrepareForAsyncCopy<
-                    mem::buf::BufHipRt<TElem, TDim, TIdx>>
-                {
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST static auto prepareForAsyncCopy(
-                        mem::buf::BufHipRt<TElem, TDim, TIdx> &)
-                    -> void
-                    {
-                        ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                        // HIP device memory is always ready for async copy
-                    }
-                };
-            }
-        }
-    }
-    namespace offset
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The BufHipRt offset get trait specialization.
-            template<
-                typename TIdxIntegralConst,
-                typename TElem,
-                typename TDim,
-                typename TIdx>
-            struct GetOffset<
-                TIdxIntegralConst,
-                mem::buf::BufHipRt<TElem, TDim, TIdx>>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto getOffset(
-                   mem::buf::BufHipRt<TElem, TDim, TIdx> const &)
-                -> TIdx
-                {
-                    return 0u;
-                }
-            };
-        }
-    }
-    namespace idx
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The BufHipRt idx type trait specialization.
-            template<
-                typename TElem,
-                typename TDim,
-                typename TIdx>
-            struct IdxType<
-                mem::buf::BufHipRt<TElem, TDim, TIdx>>
-            {
-                using type = TIdx;
-            };
-        }
-    }
-
-    //-----------------------------------------------------------------------------
-    // Trait specializations for BufCpu.
-    namespace mem
-    {
-        namespace buf
-        {
-            namespace traits
-            {
-                //#############################################################################
-                //! The BufCpu HIP device memory mapping trait specialization.
-                template<
-                    typename TElem,
-                    typename TDim,
-                    typename TIdx>
-                struct Map<
-                    mem::buf::BufCpu<TElem, TDim, TIdx>,
-                    dev::DevHipRt>
-                {
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST static auto map(
-                        mem::buf::BufCpu<TElem, TDim, TIdx> & buf,
-                        dev::DevHipRt const & dev)
-                    -> void
-                    {
-                        ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                        if(dev::getDev(buf) != dev)
-                        {
-                            // hipHostRegisterMapped:
-                            //   Maps the allocation into the HIP address space.The device pointer to the memory may be obtained by calling hipHostGetDevicePointer().
-                            //   This feature is available only on GPUs with compute capability greater than or equal to 1.1.
-                            ALPAKA_HIP_RT_CHECK(
-                                hipHostRegister(
-                                    const_cast<void *>(reinterpret_cast<void const *>(mem::view::getPtrNative(buf))),
-                                    extent::getExtentProduct(buf) * sizeof(elem::Elem<BufCpu<TElem, TDim, TIdx>>),
-                                    hipHostRegisterMapped));
-                        }
-                        // If it is already the same device, nothing has to be mapped.
-                    }
-                };
-                //#############################################################################
-                //! The BufCpu HIP device memory unmapping trait specialization.
-                template<
-                    typename TElem,
-                    typename TDim,
-                    typename TIdx>
-                struct Unmap<
-                    mem::buf::BufCpu<TElem, TDim, TIdx>,
-                    dev::DevHipRt>
-                {
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST static auto unmap(
-                        mem::buf::BufCpu<TElem, TDim, TIdx> & buf,
-                        dev::DevHipRt const & dev)
-                    -> void
-                    {
-                        ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                        if(dev::getDev(buf) != dev)
-                        {
-                            // Unmaps the memory range whose base address is specified by ptr, and makes it pageable again.
-                            // \FIXME: If the memory has separately been pinned before we destroy the pinning state.
-                            ALPAKA_HIP_RT_CHECK(
-                                hipHostUnregister(
-                                    const_cast<void *>(reinterpret_cast<void const *>(mem::view::getPtrNative(buf)))));
-                        }
-                        // If it is already the same device, nothing has to be unmapped.
-                    }
-                };
-            }
-        }
-        namespace view
-        {
-            namespace traits
-            {
-                //#############################################################################
-                //! The BufCpu pointer on HIP device get trait specialization.
-                template<
-                    typename TElem,
-                    typename TDim,
-                    typename TIdx>
-                struct GetPtrDev<
-                    mem::buf::BufCpu<TElem, TDim, TIdx>,
-                    dev::DevHipRt>
-                {
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST static auto getPtrDev(
-                        mem::buf::BufCpu<TElem, TDim, TIdx> const & buf,
-                        dev::DevHipRt const &)
-                    -> TElem const *
-                    {
-                        // TODO: Check if the memory is mapped at all!
-                        TElem * pDev(nullptr);
-                        ALPAKA_HIP_RT_CHECK(
-                            hipHostGetDevicePointer(
-                                &pDev,
-                                const_cast<void *>(reinterpret_cast<void const *>(mem::view::getPtrNative(buf))),
-                                0));
-                        return pDev;
-                    }
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST static auto getPtrDev(
-                        mem::buf::BufCpu<TElem, TDim, TIdx> & buf,
-                        dev::DevHipRt const &)
-                    -> TElem *
-                    {
-                        // TODO: Check if the memory is mapped at all!
-                        TElem * pDev(nullptr);
-                        ALPAKA_HIP_RT_CHECK(
-                            hipHostGetDevicePointer(
-                                &pDev,
-                                mem::view::getPtrNative(buf),
-                                0));
-                        return pDev;
-                    }
-                };
-            }
-        }
-    }
-}
-
-#include <alpaka/mem/buf/hip/Copy.hpp>
-#include <alpaka/mem/buf/hip/Set.hpp>
-
-#endif
diff --git a/thirdParty/alpaka/include/alpaka/mem/buf/Traits.hpp b/thirdParty/alpaka/include/alpaka/mem/buf/Traits.hpp
deleted file mode 100644
index d4baea9ceb..0000000000
--- a/thirdParty/alpaka/include/alpaka/mem/buf/Traits.hpp
+++ /dev/null
@@ -1,262 +0,0 @@
-/* Copyright 2019 Alexander Matthes, Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#include <alpaka/mem/view/Traits.hpp>
-
-#include <alpaka/core/Common.hpp>
-
-#include <boost/config.hpp>
-
-namespace alpaka
-{
-    //-----------------------------------------------------------------------------
-    //! The memory specifics.
-    namespace mem
-    {
-        //-----------------------------------------------------------------------------
-        //! The buffer specifics.
-        namespace buf
-        {
-            //-----------------------------------------------------------------------------
-            //! The buffer traits.
-            namespace traits
-            {
-                //#############################################################################
-                //! The memory buffer type trait.
-                template<
-                    typename TDev,
-                    typename TElem,
-                    typename TDim,
-                    typename TIdx,
-                    typename TSfinae = void>
-                struct BufType;
-
-                //#############################################################################
-                //! The memory allocator trait.
-                template<
-                    typename TElem,
-                    typename TDim,
-                    typename TIdx,
-                    typename TDev,
-                    typename TSfinae = void>
-                struct Alloc;
-
-                //#############################################################################
-                //! The memory mapping trait.
-                template<
-                    typename TBuf,
-                    typename TDev,
-                    typename TSfinae = void>
-                struct Map;
-
-                //#############################################################################
-                //! The memory unmapping trait.
-                template<
-                    typename TBuf,
-                    typename TDev,
-                    typename TSfinae = void>
-                struct Unmap;
-
-                //#############################################################################
-                //! The memory pinning trait.
-                template<
-                    typename TBuf,
-                    typename TSfinae = void>
-                struct Pin;
-
-                //#############################################################################
-                //! The memory unpinning trait.
-                template<
-                    typename TBuf,
-                    typename TSfinae = void>
-                struct Unpin;
-
-                //#############################################################################
-                //! The memory pin state trait.
-                template<
-                    typename TBuf,
-                    typename TSfinae = void>
-                struct IsPinned;
-
-                //#############################################################################
-                //! The memory prepareForAsyncCopy trait.
-                template<
-                    typename TBuf,
-                    typename TSfinae = void>
-                struct PrepareForAsyncCopy;
-            }
-
-            //#############################################################################
-            //! The memory buffer type trait alias template to remove the ::type.
-            template<
-                typename TDev,
-                typename TElem,
-                typename TDim,
-                typename TIdx>
-            using Buf = typename traits::BufType<TDev, TElem, TDim, TIdx>::type;
-
-            //-----------------------------------------------------------------------------
-            //! Allocates memory on the given device.
-            //!
-            //! \tparam TElem The element type of the returned buffer.
-            //! \tparam TExtent The extent of the buffer.
-            //! \tparam TDev The type of device the buffer is allocated on.
-            //! \param dev The device to allocate the buffer on.
-            //! \param extent The extent of the buffer.
-            //! \return The newly allocated buffer.
-            template<
-                typename TElem,
-                typename TIdx,
-                typename TExtent,
-                typename TDev>
-            ALPAKA_FN_HOST auto alloc(
-                TDev const & dev,
-                TExtent const & extent = TExtent())
-#ifdef BOOST_NO_CXX14_RETURN_TYPE_DEDUCTION
-            -> decltype(
-                traits::Alloc<
-                    TElem,
-                    dim::Dim<TExtent>,
-                    TIdx,
-                    TDev>
-                ::alloc(
-                    dev,
-                    extent))
-#endif
-            {
-                return
-                    traits::Alloc<
-                        TElem,
-                        dim::Dim<TExtent>,
-                        TIdx,
-                        TDev>
-                    ::alloc(
-                        dev,
-                        extent);
-            }
-            //-----------------------------------------------------------------------------
-            //! Maps the buffer into the memory of the given device.
-            //!
-            //! \tparam TBuf The buffer type.
-            //! \tparam TDev The device type.
-            //! \param buf The buffer to map into the device memory.
-            //! \param dev The device to map the buffer into.
-            template<
-                typename TBuf,
-                typename TDev>
-            ALPAKA_FN_HOST auto map(
-                TBuf & buf,
-                TDev const & dev)
-            -> void
-            {
-                return
-                    traits::Map<
-                        TBuf,
-                        TDev>
-                    ::map(
-                        buf,
-                        dev);
-            }
-            //-----------------------------------------------------------------------------
-            //! Unmaps the buffer from the memory of the given device.
-            //!
-            //! \tparam TBuf The buffer type.
-            //! \tparam TDev The device type.
-            //! \param buf The buffer to unmap from the device memory.
-            //! \param dev The device to unmap the buffer from.
-            template<
-                typename TBuf,
-                typename TDev>
-            ALPAKA_FN_HOST auto unmap(
-                TBuf & buf,
-                TDev const & dev)
-            -> void
-            {
-                return
-                    traits::Unmap<
-                        TBuf,
-                        TDev>
-                    ::unmap(
-                        buf,
-                        dev);
-            }
-            //-----------------------------------------------------------------------------
-            //! Pins the buffer.
-            //!
-            //! \tparam TBuf The buffer type.
-            //! \param buf The buffer to pin in the device memory.
-            template<
-                typename TBuf>
-            ALPAKA_FN_HOST auto pin(
-                TBuf & buf)
-            -> void
-            {
-                return
-                    traits::Pin<
-                        TBuf>
-                    ::pin(
-                        buf);
-            }
-            //-----------------------------------------------------------------------------
-            //! Unpins the buffer.
-            //!
-            //! \tparam TBuf The buffer type.
-            //! \param buf The buffer to unpin from the device memory.
-            template<
-                typename TBuf>
-            ALPAKA_FN_HOST auto unpin(
-                TBuf & buf)
-            -> void
-            {
-                return
-                    traits::Unpin<
-                        TBuf>
-                    ::unpin(
-                        buf);
-            }
-            //-----------------------------------------------------------------------------
-            //! The pin state of the buffer.
-            //!
-            //! \tparam TBuf The buffer type.
-            //! \param buf The buffer to get the pin state of.
-            template<
-                typename TBuf>
-            ALPAKA_FN_HOST auto isPinned(
-                TBuf const & buf)
-            -> bool
-            {
-                return
-                    traits::IsPinned<
-                        TBuf>
-                    ::isPinned(
-                        buf);
-            }
-            //-----------------------------------------------------------------------------
-            //! Prepares the buffer for non-blocking copy operations, e.g. pinning if
-            //! non-blocking copy between a cpu and a cuda device is wanted
-            //!
-            //! \tparam TBuf The buffer type.
-            //! \param buf The buffer to prepare in the device memory.
-            template<
-                typename TBuf>
-            ALPAKA_FN_HOST auto prepareForAsyncCopy(
-                TBuf & buf)
-            -> void
-            {
-                return
-                    traits::PrepareForAsyncCopy<
-                        TBuf>
-                    ::prepareForAsyncCopy(
-                        buf);
-            }
-        }
-    }
-}
diff --git a/thirdParty/alpaka/include/alpaka/mem/buf/cpu/Copy.hpp b/thirdParty/alpaka/include/alpaka/mem/buf/cpu/Copy.hpp
deleted file mode 100644
index 4ce4d28b26..0000000000
--- a/thirdParty/alpaka/include/alpaka/mem/buf/cpu/Copy.hpp
+++ /dev/null
@@ -1,263 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz, Erik Zenker, Matthias Werner, Rene Widera
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#include <alpaka/core/Assert.hpp>
-#include <alpaka/dim/DimIntegralConst.hpp>
-#include <alpaka/extent/Traits.hpp>
-#include <alpaka/mem/view/Traits.hpp>
-#include <alpaka/meta/NdLoop.hpp>
-#include <alpaka/meta/Integral.hpp>
-
-#include <cstring>
-
-namespace alpaka
-{
-    namespace dev
-    {
-        class DevCpu;
-    }
-}
-
-namespace alpaka
-{
-    namespace mem
-    {
-        namespace view
-        {
-            namespace cpu
-            {
-                namespace detail
-                {
-                    //#############################################################################
-                    //! The CPU device memory copy task base.
-                    //!
-                    //! Copies from CPU memory into CPU memory.
-                    template<
-                        typename TDim,
-                        typename TViewDst,
-                        typename TViewSrc,
-                        typename TExtent>
-                    struct TaskCopyCpuBase
-                    {
-                        using ExtentSize = idx::Idx<TExtent>;
-                        using DstSize = idx::Idx<TViewDst>;
-                        using SrcSize = idx::Idx<TViewSrc>;
-                        using Elem = elem::Elem<TViewSrc>;
-
-                        static_assert(
-                            !std::is_const<TViewDst>::value,
-                            "The destination view can not be const!");
-
-                        static_assert(
-                            dim::Dim<TViewDst>::value == dim::Dim<TViewSrc>::value,
-                            "The source and the destination view are required to have the same dimensionality!");
-                        static_assert(
-                            dim::Dim<TViewDst>::value == dim::Dim<TExtent>::value,
-                            "The views and the extent are required to have the same dimensionality!");
-                        static_assert(
-                            dim::Dim<TViewDst>::value == TDim::value,
-                            "The destination view and the input TDim are required to have the same dimensionality!");
-
-                        static_assert(
-                            meta::IsIntegralSuperset<DstSize, ExtentSize>::value,
-                            "The destination view and the extent are required to have compatible idx type!");
-                        static_assert(
-                            meta::IsIntegralSuperset<SrcSize, ExtentSize>::value,
-                            "The source view and the extent are required to have compatible idx type!");
-
-                        static_assert(
-                            std::is_same<elem::Elem<TViewDst>, typename std::remove_const<elem::Elem<TViewSrc>>::type>::value,
-                            "The source and the destination view are required to have the same element type!");
-
-                        //-----------------------------------------------------------------------------
-                        TaskCopyCpuBase(
-                            TViewDst & viewDst,
-                            TViewSrc const & viewSrc,
-                            TExtent const & extent) :
-                                m_extent(extent::getExtentVec(extent)),
-                                m_extentWidthBytes(m_extent[TDim::value - 1u] * static_cast<ExtentSize>(sizeof(Elem))),
-#if (!defined(NDEBUG)) || (ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL)
-                                m_dstExtent(extent::getExtentVec(viewDst)),
-                                m_srcExtent(extent::getExtentVec(viewSrc)),
-#endif
-                                m_dstPitchBytes(mem::view::getPitchBytesVec(viewDst)),
-                                m_srcPitchBytes(mem::view::getPitchBytesVec(viewSrc)),
-
-                                m_dstMemNative(reinterpret_cast<std::uint8_t *>(mem::view::getPtrNative(viewDst))),
-                                m_srcMemNative(reinterpret_cast<std::uint8_t const *>(mem::view::getPtrNative(viewSrc)))
-                        {
-                            ALPAKA_ASSERT((vec::cast<DstSize>(m_extent) <= m_dstExtent).foldrAll(std::logical_or<bool>()));
-                            ALPAKA_ASSERT((vec::cast<SrcSize>(m_extent) <= m_srcExtent).foldrAll(std::logical_or<bool>()));
-                            ALPAKA_ASSERT(static_cast<DstSize>(m_extentWidthBytes) <= m_dstPitchBytes[TDim::value - 1u]);
-                            ALPAKA_ASSERT(static_cast<SrcSize>(m_extentWidthBytes) <= m_srcPitchBytes[TDim::value - 1u]);
-                        }
-
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                        //-----------------------------------------------------------------------------
-                        ALPAKA_FN_HOST auto printDebug() const
-                        -> void
-                        {
-                            std::cout << __func__
-                                << " e: " << m_extent
-                                << " ewb: " << this->m_extentWidthBytes
-                                << " de: " << m_dstExtent
-                                << " dptr: " << reinterpret_cast<void *>(m_dstMemNative)
-                                << " dpitchb: " << m_dstPitchBytes
-                                << " se: " << m_srcExtent
-                                << " sptr: " << reinterpret_cast<void const *>(m_srcMemNative)
-                                << " spitchb: " << m_srcPitchBytes
-                                << std::endl;
-                        }
-#endif
-
-                        vec::Vec<TDim, ExtentSize> const m_extent;
-                        ExtentSize const m_extentWidthBytes;
-#if (!defined(NDEBUG)) || (ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL)
-                        vec::Vec<TDim, DstSize> const m_dstExtent;
-                        vec::Vec<TDim, SrcSize> const m_srcExtent;
-#endif
-                        vec::Vec<TDim, DstSize> const m_dstPitchBytes;
-                        vec::Vec<TDim, SrcSize> const m_srcPitchBytes;
-
-                        std::uint8_t * const m_dstMemNative;
-                        std::uint8_t const * const m_srcMemNative;
-                    };
-
-
-
-                    //#############################################################################
-                    //! The CPU device ND memory copy task.
-                    template<
-                        typename TDim,
-                        typename TViewDst,
-                        typename TViewSrc,
-                        typename TExtent>
-                    struct TaskCopyCpu : public TaskCopyCpuBase<TDim, TViewDst, TViewSrc, TExtent>
-                    {
-                        using DimMin1 = dim::DimInt<TDim::value - 1u>;
-                        using typename TaskCopyCpuBase<TDim, TViewDst, TViewSrc, TExtent>::ExtentSize;
-                        using typename TaskCopyCpuBase<TDim, TViewDst, TViewSrc, TExtent>::DstSize;
-                        using typename TaskCopyCpuBase<TDim, TViewDst, TViewSrc, TExtent>::SrcSize;
-
-                        //-----------------------------------------------------------------------------
-                        using TaskCopyCpuBase<TDim, TViewDst, TViewSrc, TExtent>::TaskCopyCpuBase;
-
-                        //-----------------------------------------------------------------------------
-                        ALPAKA_FN_HOST auto operator()() const
-                        -> void
-                        {
-#if defined(BOOST_COMP_HCC) && !defined(__HIP_DEVICE_COMPILE__)
-                            ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                            this->printDebug();
-#endif
-                            // [z, y, x] -> [z, y] because all elements with the innermost x dimension are handled within one iteration.
-                            vec::Vec<DimMin1, ExtentSize> const extentWithoutInnermost(vec::subVecBegin<DimMin1>(this->m_extent));
-                            // [z, y, x] -> [y, x] because the z pitch (the full size of the buffer) is not required.
-                            vec::Vec<DimMin1, DstSize> const dstPitchBytesWithoutOutmost(vec::subVecEnd<DimMin1>(this->m_dstPitchBytes));
-                            vec::Vec<DimMin1, SrcSize> const srcPitchBytesWithoutOutmost(vec::subVecEnd<DimMin1>(this->m_srcPitchBytes));
-
-                            if(static_cast<std::size_t>(this->m_extent.prod()) != 0u)
-                            {
-                                meta::ndLoopIncIdx(
-                                    extentWithoutInnermost,
-                                    [&](vec::Vec<DimMin1, ExtentSize> const & idx)
-                                    {
-                                        std::memcpy(
-                                            reinterpret_cast<void *>(this->m_dstMemNative + (vec::cast<DstSize>(idx) * dstPitchBytesWithoutOutmost).foldrAll(std::plus<DstSize>())),
-                                            reinterpret_cast<void const *>(this->m_srcMemNative + (vec::cast<SrcSize>(idx) * srcPitchBytesWithoutOutmost).foldrAll(std::plus<SrcSize>())),
-                                            static_cast<std::size_t>(this->m_extentWidthBytes));
-                                    });
-                            }
-#endif
-                        }
-                    };
-
-                    //#############################################################################
-                    //! The CPU device 1D memory copy task.
-                    template<
-                        typename TViewDst,
-                        typename TViewSrc,
-                        typename TExtent>
-                    struct TaskCopyCpu<
-                        dim::DimInt<1u>,
-                        TViewDst,
-                        TViewSrc,
-                        TExtent> : public TaskCopyCpuBase<dim::DimInt<1u>, TViewDst, TViewSrc, TExtent>
-                    {
-                        //-----------------------------------------------------------------------------
-                        using TaskCopyCpuBase<dim::DimInt<1u>, TViewDst, TViewSrc, TExtent>::TaskCopyCpuBase;
-
-                        //-----------------------------------------------------------------------------
-                        ALPAKA_FN_HOST auto operator()() const
-                        -> void
-                        {
-                            ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                            this->printDebug();
-#endif
-                            if(static_cast<std::size_t>(this->m_extent.prod()) != 0u)
-                            {
-                                std::memcpy(
-                                    reinterpret_cast<void *>(this->m_dstMemNative),
-                                    reinterpret_cast<void const *>(this->m_srcMemNative),
-                                    static_cast<std::size_t>(this->m_extentWidthBytes));
-                            }
-                        }
-                    };
-                }
-            }
-
-            namespace traits
-            {
-                //#############################################################################
-                //! The CPU device memory copy trait specialization.
-                //!
-                //! Copies from CPU memory into CPU memory.
-                template<
-                    typename TDim>
-                struct CreateTaskCopy<
-                    TDim,
-                    dev::DevCpu,
-                    dev::DevCpu>
-                {
-                    //-----------------------------------------------------------------------------
-                    template<
-                        typename TExtent,
-                        typename TViewSrc,
-                        typename TViewDst>
-                    ALPAKA_FN_HOST static auto createTaskCopy(
-                        TViewDst & viewDst,
-                        TViewSrc const & viewSrc,
-                        TExtent const & extent)
-                    -> cpu::detail::TaskCopyCpu<
-                        TDim,
-                        TViewDst,
-                        TViewSrc,
-                        TExtent>
-                    {
-                        return
-                            cpu::detail::TaskCopyCpu<
-                                TDim,
-                                TViewDst,
-                                TViewSrc,
-                                TExtent>(
-                                    viewDst,
-                                    viewSrc,
-                                    extent);
-                    }
-                };
-            }
-        }
-    }
-}
diff --git a/thirdParty/alpaka/include/alpaka/mem/buf/cpu/Set.hpp b/thirdParty/alpaka/include/alpaka/mem/buf/cpu/Set.hpp
deleted file mode 100644
index 6a06c6e61f..0000000000
--- a/thirdParty/alpaka/include/alpaka/mem/buf/cpu/Set.hpp
+++ /dev/null
@@ -1,231 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz, Erik Zenker, Matthias Werner
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#include <alpaka/core/Assert.hpp>
-#include <alpaka/dim/DimIntegralConst.hpp>
-#include <alpaka/extent/Traits.hpp>
-#include <alpaka/mem/view/Traits.hpp>
-#include <alpaka/meta/NdLoop.hpp>
-#include <alpaka/meta/Integral.hpp>
-
-#include <cstring>
-
-namespace alpaka
-{
-    namespace dev
-    {
-        class DevCpu;
-    }
-}
-
-namespace alpaka
-{
-    namespace mem
-    {
-        namespace view
-        {
-            namespace cpu
-            {
-                namespace detail
-                {
-                    //#############################################################################
-                    //! The CPU device ND memory set task base.
-                    template<
-                        typename TDim,
-                        typename TView,
-                        typename TExtent>
-                    struct TaskSetCpuBase
-                    {
-                        using ExtentSize = idx::Idx<TExtent>;
-                        using DstSize = idx::Idx<TView>;
-                        using Elem = elem::Elem<TView>;
-
-                        static_assert(
-                            !std::is_const<TView>::value,
-                            "The destination view can not be const!");
-
-                        static_assert(
-                            dim::Dim<TView>::value == dim::Dim<TExtent>::value,
-                            "The destination view and the extent are required to have the same dimensionality!");
-                        static_assert(
-                            dim::Dim<TView>::value == TDim::value,
-                            "The destination view and the input TDim are required to have the same dimensionality!");
-
-                        static_assert(
-                            meta::IsIntegralSuperset<DstSize, ExtentSize>::value,
-                            "The view and the extent are required to have compatible idx type!");
-
-                        //-----------------------------------------------------------------------------
-                        TaskSetCpuBase(
-                            TView & view,
-                            std::uint8_t const & byte,
-                            TExtent const & extent) :
-                                m_byte(byte),
-                                m_extent(extent::getExtentVec(extent)),
-                                m_extentWidthBytes(m_extent[TDim::value - 1u] * static_cast<ExtentSize>(sizeof(Elem))),
-#if (!defined(NDEBUG)) || (ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL)
-                                m_dstExtent(extent::getExtentVec(view)),
-#endif
-                                m_dstPitchBytes(mem::view::getPitchBytesVec(view)),
-                                m_dstMemNative(reinterpret_cast<std::uint8_t *>(mem::view::getPtrNative(view)))
-                        {
-                            ALPAKA_ASSERT((vec::cast<DstSize>(m_extent) <= m_dstExtent).foldrAll(std::logical_or<bool>()));
-                            ALPAKA_ASSERT(m_extentWidthBytes <= m_dstPitchBytes[TDim::value - 1u]);
-                        }
-
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                        //-----------------------------------------------------------------------------
-                        ALPAKA_FN_HOST auto printDebug() const
-                        -> void
-                        {
-                            std::cout << __func__
-                                << " e: " << this->m_extent
-                                << " ewb: " << this->m_extentWidthBytes
-                                << " de: " << this->m_dstExtent
-                                << " dptr: " << reinterpret_cast<void *>(this->m_dstMemNative)
-                                << " dpitchb: " << this->m_dstPitchBytes
-                                << std::endl;
-                        }
-#endif
-
-                        std::uint8_t const m_byte;
-                        vec::Vec<TDim, ExtentSize> const m_extent;
-                        ExtentSize const m_extentWidthBytes;
-#if (!defined(NDEBUG)) || (ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL)
-                        vec::Vec<TDim, DstSize> const m_dstExtent;
-#endif
-                        vec::Vec<TDim, DstSize> const m_dstPitchBytes;
-                        std::uint8_t * const m_dstMemNative;
-                    };
-
-                    //#############################################################################
-                    //! The CPU device ND memory set task.
-                    template<
-                        typename TDim,
-                        typename TView,
-                        typename TExtent>
-                    struct TaskSetCpu : public TaskSetCpuBase<TDim, TView, TExtent>
-                    {
-                        using DimMin1 = dim::DimInt<TDim::value - 1u>;
-                        using typename TaskSetCpuBase<TDim, TView, TExtent>::ExtentSize;
-                        using typename TaskSetCpuBase<TDim, TView, TExtent>::DstSize;
-
-                        //-----------------------------------------------------------------------------
-                        using TaskSetCpuBase<TDim, TView, TExtent>::TaskSetCpuBase;
-
-                        //-----------------------------------------------------------------------------
-                        ALPAKA_FN_HOST auto operator()() const
-                        -> void
-                        {
-                            ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                            this->printDebug();
-#endif
-                            // [z, y, x] -> [z, y] because all elements with the innermost x dimension are handled within one iteration.
-                            vec::Vec<DimMin1, ExtentSize> const extentWithoutInnermost(vec::subVecBegin<DimMin1>(this->m_extent));
-                            // [z, y, x] -> [y, x] because the z pitch (the full idx of the buffer) is not required.
-                            vec::Vec<DimMin1, DstSize> const dstPitchBytesWithoutOutmost(vec::subVecEnd<DimMin1>(this->m_dstPitchBytes));
-
-                            if(static_cast<std::size_t>(this->m_extent.prod()) != 0u)
-                            {
-                                meta::ndLoopIncIdx(
-                                    extentWithoutInnermost,
-
-                                    // workaround for HIP(HCC) to
-                                    // avoid forbidden host-call
-                                    // within host-device functions
-                                    #if defined(BOOST_COMP_HCC) && BOOST_COMP_HCC
-                                    ALPAKA_FN_HOST_ACC
-                                    #endif
-                                    [&](vec::Vec<DimMin1, ExtentSize> const & idx)
-                                    {
-
-                                        memset(
-                                            reinterpret_cast<void *>(this->m_dstMemNative + (vec::cast<DstSize>(idx) * dstPitchBytesWithoutOutmost).foldrAll(std::plus<DstSize>())),
-                                            this->m_byte,
-                                            static_cast<std::size_t>(this->m_extentWidthBytes));
-                                    });
-                            }
-                        }
-                    };
-
-                    //#############################################################################
-                    //! The CPU device 1D memory set task.
-                    template<
-                        typename TView,
-                        typename TExtent>
-                    struct TaskSetCpu<
-                        dim::DimInt<1u>,
-                        TView,
-                        TExtent> : public TaskSetCpuBase<dim::DimInt<1u>, TView, TExtent>
-                    {
-                        //-----------------------------------------------------------------------------
-                        using TaskSetCpuBase<dim::DimInt<1u>, TView, TExtent>::TaskSetCpuBase;
-
-                        //-----------------------------------------------------------------------------
-                        ALPAKA_FN_HOST auto operator()() const
-                        -> void
-                        {
-                            ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                            this->printDebug();
-#endif
-                            if(static_cast<std::size_t>(this->m_extent.prod()) != 0u)
-                            {
-                                std::memset(
-                                    reinterpret_cast<void *>(this->m_dstMemNative),
-                                    this->m_byte,
-                                    static_cast<std::size_t>(this->m_extentWidthBytes));
-                            }
-                        }
-                    };
-                }
-            }
-
-            namespace traits
-            {
-                //#############################################################################
-                //! The CPU device memory set trait specialization.
-                template<
-                    typename TDim>
-                struct CreateTaskSet<
-                    TDim,
-                    dev::DevCpu>
-                {
-                    //-----------------------------------------------------------------------------
-                    template<
-                        typename TExtent,
-                        typename TView>
-                    ALPAKA_FN_HOST static auto createTaskSet(
-                        TView & view,
-                        std::uint8_t const & byte,
-                        TExtent const & extent)
-                    -> cpu::detail::TaskSetCpu<
-                        TDim,
-                        TView,
-                        TExtent>
-                    {
-                        return
-                            cpu::detail::TaskSetCpu<
-                                TDim,
-                                TView,
-                                TExtent>(
-                                    view,
-                                    byte,
-                                    extent);
-                    }
-                };
-            }
-        }
-    }
-}
diff --git a/thirdParty/alpaka/include/alpaka/mem/buf/cuda/Copy.hpp b/thirdParty/alpaka/include/alpaka/mem/buf/cuda/Copy.hpp
deleted file mode 100644
index 5e62ff9a0d..0000000000
--- a/thirdParty/alpaka/include/alpaka/mem/buf/cuda/Copy.hpp
+++ /dev/null
@@ -1,1173 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Erik Zenker, Matthias Werner
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_CUDA
-    #error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
-#endif
-
-#include <alpaka/queue/QueueCudaRtBlocking.hpp>
-#include <alpaka/queue/QueueCudaRtNonBlocking.hpp>
-
-#include <alpaka/dev/DevCpu.hpp>
-#include <alpaka/dev/DevCudaRt.hpp>
-#include <alpaka/dim/DimIntegralConst.hpp>
-#include <alpaka/extent/Traits.hpp>
-#include <alpaka/mem/view/Traits.hpp>
-
-#include <alpaka/core/Assert.hpp>
-#include <alpaka/core/Cuda.hpp>
-
-#include <set>
-#include <tuple>
-
-
-namespace alpaka
-{
-    namespace mem
-    {
-        namespace view
-        {
-            namespace cuda
-            {
-                namespace detail
-                {
-                    //#############################################################################
-                    //! The CUDA memory copy trait.
-                    template<
-                        typename TDim,
-                        typename TViewDst,
-                        typename TViewSrc,
-                        typename TExtent>
-                    struct TaskCopyCuda;
-
-                    //#############################################################################
-                    //! The 1D CUDA memory copy trait.
-                    template<
-                        typename TViewDst,
-                        typename TViewSrc,
-                        typename TExtent>
-                    struct TaskCopyCuda<
-                        dim::DimInt<1>,
-                        TViewDst,
-                        TViewSrc,
-                        TExtent>
-                    {
-                        static_assert(
-                            !std::is_const<TViewDst>::value,
-                            "The destination view can not be const!");
-
-                        static_assert(
-                            dim::Dim<TViewDst>::value == dim::Dim<TViewSrc>::value,
-                            "The source and the destination view are required to have the same dimensionality!");
-                        static_assert(
-                            dim::Dim<TViewDst>::value == dim::Dim<TExtent>::value,
-                            "The views and the extent are required to have the same dimensionality!");
-                        // TODO: Maybe check for Idx of TViewDst and TViewSrc to have greater or equal range than TExtent.
-                        static_assert(
-                            std::is_same<elem::Elem<TViewDst>, typename std::remove_const<elem::Elem<TViewSrc>>::type>::value,
-                            "The source and the destination view are required to have the same element type!");
-
-                        using Idx = idx::Idx<TExtent>;
-
-                        //-----------------------------------------------------------------------------
-                        ALPAKA_FN_HOST TaskCopyCuda(
-                            TViewDst & viewDst,
-                            TViewSrc const & viewSrc,
-                            TExtent const & extent,
-                            cudaMemcpyKind const & cudaMemCpyKind,
-                            int const & iDstDevice,
-                            int const & iSrcDevice) :
-                                m_cudaMemCpyKind(cudaMemCpyKind),
-                                m_iDstDevice(iDstDevice),
-                                m_iSrcDevice(iSrcDevice),
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                                m_extentWidth(extent::getWidth(extent)),
-                                m_dstWidth(static_cast<Idx>(extent::getWidth(viewDst))),
-                                m_srcWidth(static_cast<Idx>(extent::getWidth(viewSrc))),
-#endif
-                                m_extentWidthBytes(extent::getWidth(extent) * static_cast<Idx>(sizeof(elem::Elem<TViewDst>))),
-                                m_dstMemNative(reinterpret_cast<void *>(mem::view::getPtrNative(viewDst))),
-                                m_srcMemNative(reinterpret_cast<void const *>(mem::view::getPtrNative(viewSrc)))
-                        {
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                            ALPAKA_ASSERT(m_extentWidth <= m_dstWidth);
-                            ALPAKA_ASSERT(m_extentWidth <= m_srcWidth);
-#endif
-                        }
-
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                        //-----------------------------------------------------------------------------
-                        ALPAKA_FN_HOST auto printDebug() const
-                        -> void
-                        {
-                            std::cout << __func__
-                                << " ddev: " << m_iDstDevice
-                                << " ew: " << m_extentWidth
-                                << " ewb: " << m_extentWidthBytes
-                                << " dw: " << m_dstWidth
-                                << " dptr: " << m_dstMemNative
-                                << " sdev: " << m_iSrcDevice
-                                << " sw: " << m_srcWidth
-                                << " sptr: " << m_srcMemNative
-                                << std::endl;
-                        }
-#endif
-                        cudaMemcpyKind m_cudaMemCpyKind;
-                        int m_iDstDevice;
-                        int m_iSrcDevice;
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                        Idx m_extentWidth;
-                        Idx m_dstWidth;
-                        Idx m_srcWidth;
-#endif
-                        Idx m_extentWidthBytes;
-                        void * m_dstMemNative;
-                        void const * m_srcMemNative;
-                    };
-                    //#############################################################################
-                    //! The 2D CUDA memory copy trait.
-                    template<
-                        typename TViewDst,
-                        typename TViewSrc,
-                        typename TExtent>
-                    struct TaskCopyCuda<
-                        dim::DimInt<2>,
-                        TViewDst,
-                        TViewSrc,
-                        TExtent>
-                    {
-                        static_assert(
-                            !std::is_const<TViewDst>::value,
-                            "The destination view can not be const!");
-
-                        static_assert(
-                            dim::Dim<TViewDst>::value == dim::Dim<TViewSrc>::value,
-                            "The source and the destination view are required to have the same dimensionality!");
-                        static_assert(
-                            dim::Dim<TViewDst>::value == dim::Dim<TExtent>::value,
-                            "The views and the extent are required to have the same dimensionality!");
-                        // TODO: Maybe check for Idx of TViewDst and TViewSrc to have greater or equal range than TExtent.
-                        static_assert(
-                            std::is_same<elem::Elem<TViewDst>, typename std::remove_const<elem::Elem<TViewSrc>>::type>::value,
-                            "The source and the destination view are required to have the same element type!");
-
-                        using Idx = idx::Idx<TExtent>;
-
-                        //-----------------------------------------------------------------------------
-                        ALPAKA_FN_HOST TaskCopyCuda(
-                            TViewDst & viewDst,
-                            TViewSrc const & viewSrc,
-                            TExtent const & extent,
-                            cudaMemcpyKind const & cudaMemCpyKind,
-                            int const & iDstDevice,
-                            int const & iSrcDevice) :
-                                m_cudaMemCpyKind(cudaMemCpyKind),
-                                m_iDstDevice(iDstDevice),
-                                m_iSrcDevice(iSrcDevice),
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                                m_extentWidth(extent::getWidth(extent)),
-#endif
-                                m_extentWidthBytes(extent::getWidth(extent) * static_cast<Idx>(sizeof(elem::Elem<TViewDst>))),
-                                m_dstWidth(static_cast<Idx>(extent::getWidth(viewDst))),      // required for 3D peer copy
-                                m_srcWidth(static_cast<Idx>(extent::getWidth(viewSrc))),      // required for 3D peer copy
-
-                                m_extentHeight(extent::getHeight(extent)),
-                                m_dstHeight(static_cast<Idx>(extent::getHeight(viewDst))),    // required for 3D peer copy
-                                m_srcHeight(static_cast<Idx>(extent::getHeight(viewSrc))),    // required for 3D peer copy
-
-                                m_dstpitchBytesX(static_cast<Idx>(mem::view::getPitchBytes<dim::Dim<TViewDst>::value - 1u>(viewDst))),
-                                m_srcpitchBytesX(static_cast<Idx>(mem::view::getPitchBytes<dim::Dim<TViewSrc>::value - 1u>(viewSrc))),
-                                m_dstPitchBytesY(static_cast<Idx>(mem::view::getPitchBytes<dim::Dim<TViewDst>::value - (2u % dim::Dim<TViewDst>::value)>(viewDst))),
-                                m_srcPitchBytesY(static_cast<Idx>(mem::view::getPitchBytes<dim::Dim<TViewSrc>::value - (2u % dim::Dim<TViewDst>::value)>(viewSrc))),
-
-                                m_dstMemNative(reinterpret_cast<void *>(mem::view::getPtrNative(viewDst))),
-                                m_srcMemNative(reinterpret_cast<void const *>(mem::view::getPtrNative(viewSrc)))
-                        {
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                            ALPAKA_ASSERT(m_extentWidth <= m_dstWidth);
-                            ALPAKA_ASSERT(m_extentHeight <= m_dstHeight);
-                            ALPAKA_ASSERT(m_extentWidth <= m_srcWidth);
-                            ALPAKA_ASSERT(m_extentHeight <= m_srcHeight);
-                            ALPAKA_ASSERT(m_extentWidthBytes <= m_dstpitchBytesX);
-                            ALPAKA_ASSERT(m_extentWidthBytes <= m_srcpitchBytesX);
-#endif
-                        }
-
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                        //-----------------------------------------------------------------------------
-                        ALPAKA_FN_HOST auto printDebug() const
-                        -> void
-                        {
-                            std::cout << __func__
-                                << " ew: " << m_extentWidth
-                                << " eh: " << m_extentHeight
-                                << " ewb: " << m_extentWidthBytes
-                                << " ddev: " << m_iDstDevice
-                                << " dw: " << m_dstWidth
-                                << " dh: " << m_dstHeight
-                                << " dptr: " << m_dstMemNative
-                                << " dpitchb: " << m_dstpitchBytesX
-                                << " sdev: " << m_iSrcDevice
-                                << " sw: " << m_srcWidth
-                                << " sh: " << m_srcHeight
-                                << " sptr: " << m_srcMemNative
-                                << " spitchb: " << m_srcpitchBytesX
-                                << std::endl;
-                        }
-#endif
-                        cudaMemcpyKind m_cudaMemCpyKind;
-                        int m_iDstDevice;
-                        int m_iSrcDevice;
-
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                        Idx m_extentWidth;
-#endif
-                        Idx m_extentWidthBytes;
-                        Idx m_dstWidth;          // required for 3D peer copy
-                        Idx m_srcWidth;          // required for 3D peer copy
-
-                        Idx m_extentHeight;
-                        Idx m_dstHeight;         // required for 3D peer copy
-                        Idx m_srcHeight;         // required for 3D peer copy
-
-                        Idx m_dstpitchBytesX;
-                        Idx m_srcpitchBytesX;
-                        Idx m_dstPitchBytesY;
-                        Idx m_srcPitchBytesY;
-
-
-                        void * m_dstMemNative;
-                        void const * m_srcMemNative;
-                    };
-                    //#############################################################################
-                    //! The 3D CUDA memory copy trait.
-                    template<
-                        typename TViewDst,
-                        typename TViewSrc,
-                        typename TExtent>
-                    struct TaskCopyCuda<
-                        dim::DimInt<3>,
-                        TViewDst,
-                        TViewSrc,
-                        TExtent>
-                    {
-                        static_assert(
-                            !std::is_const<TViewDst>::value,
-                            "The destination view can not be const!");
-
-                        static_assert(
-                            dim::Dim<TViewDst>::value == dim::Dim<TViewSrc>::value,
-                            "The source and the destination view are required to have the same dimensionality!");
-                        static_assert(
-                            dim::Dim<TViewDst>::value == dim::Dim<TExtent>::value,
-                            "The views and the extent are required to have the same dimensionality!");
-                        // TODO: Maybe check for Idx of TViewDst and TViewSrc to have greater or equal range than TExtent.
-                        static_assert(
-                            std::is_same<elem::Elem<TViewDst>, typename std::remove_const<elem::Elem<TViewSrc>>::type>::value,
-                            "The source and the destination view are required to have the same element type!");
-
-                        using Idx = idx::Idx<TExtent>;
-
-                        //-----------------------------------------------------------------------------
-                        ALPAKA_FN_HOST TaskCopyCuda(
-                            TViewDst & viewDst,
-                            TViewSrc const & viewSrc,
-                            TExtent const & extent,
-                            cudaMemcpyKind const & cudaMemCpyKind,
-                            int const & iDstDevice,
-                            int const & iSrcDevice) :
-                                m_cudaMemCpyKind(cudaMemCpyKind),
-
-                                m_iDstDevice(iDstDevice),
-                                m_iSrcDevice(iSrcDevice),
-
-                                m_extentWidth(extent::getWidth(extent)),
-                                m_extentWidthBytes(m_extentWidth * static_cast<Idx>(sizeof(elem::Elem<TViewDst>))),
-                                m_dstWidth(static_cast<Idx>(extent::getWidth(viewDst))),
-                                m_srcWidth(static_cast<Idx>(extent::getWidth(viewSrc))),
-
-                                m_extentHeight(extent::getHeight(extent)),
-                                m_dstHeight(static_cast<Idx>(extent::getHeight(viewDst))),
-                                m_srcHeight(static_cast<Idx>(extent::getHeight(viewSrc))),
-
-                                m_extentDepth(extent::getDepth(extent)),
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                                m_dstDepth(static_cast<Idx>(extent::getDepth(viewDst))),
-                                m_srcDepth(static_cast<Idx>(extent::getDepth(viewSrc))),
-#endif
-                                m_dstpitchBytesX(static_cast<Idx>(mem::view::getPitchBytes<dim::Dim<TViewDst>::value - 1u>(viewDst))),
-                                m_srcpitchBytesX(static_cast<Idx>(mem::view::getPitchBytes<dim::Dim<TViewSrc>::value - 1u>(viewSrc))),
-                                m_dstPitchBytesY(static_cast<Idx>(mem::view::getPitchBytes<dim::Dim<TViewDst>::value - (2u % dim::Dim<TViewDst>::value)>(viewDst))),
-                                m_srcPitchBytesY(static_cast<Idx>(mem::view::getPitchBytes<dim::Dim<TViewSrc>::value - (2u % dim::Dim<TViewDst>::value)>(viewSrc))),
-
-
-                                m_dstMemNative(reinterpret_cast<void *>(mem::view::getPtrNative(viewDst))),
-                                m_srcMemNative(reinterpret_cast<void const *>(mem::view::getPtrNative(viewSrc)))
-                        {
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                            ALPAKA_ASSERT(m_extentWidth <= m_dstWidth);
-                            ALPAKA_ASSERT(m_extentHeight <= m_dstHeight);
-                            ALPAKA_ASSERT(m_extentDepth <= m_dstDepth);
-                            ALPAKA_ASSERT(m_extentWidth <= m_srcWidth);
-                            ALPAKA_ASSERT(m_extentHeight <= m_srcHeight);
-                            ALPAKA_ASSERT(m_extentDepth <= m_srcDepth);
-                            ALPAKA_ASSERT(m_extentWidthBytes <= m_dstpitchBytesX);
-                            ALPAKA_ASSERT(m_extentWidthBytes <= m_srcpitchBytesX);
-#endif
-                        }
-
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                        //-----------------------------------------------------------------------------
-                        ALPAKA_FN_HOST auto printDebug() const
-                        -> void
-                        {
-                            std::cout << __func__
-                                << " ew: " << m_extentWidth
-                                << " eh: " << m_extentHeight
-                                << " ed: " << m_extentDepth
-                                << " ewb: " << m_extentWidthBytes
-                                << " ddev: " << m_iDstDevice
-                                << " dw: " << m_dstWidth
-                                << " dh: " << m_dstHeight
-                                << " dd: " << m_dstDepth
-                                << " dptr: " << m_dstMemNative
-                                << " dpitchb: " << m_dstpitchBytesX
-                                << " sdev: " << m_iSrcDevice
-                                << " sw: " << m_srcWidth
-                                << " sh: " << m_srcHeight
-                                << " sd: " << m_srcDepth
-                                << " sptr: " << m_srcMemNative
-                                << " spitchb: " << m_srcpitchBytesX
-                                << std::endl;
-                        }
-#endif
-                        cudaMemcpyKind m_cudaMemCpyKind;
-
-                        int m_iDstDevice;
-                        int m_iSrcDevice;
-
-                        Idx m_extentWidth;
-                        Idx m_extentWidthBytes;
-                        Idx m_dstWidth;
-                        Idx m_srcWidth;
-
-                        Idx m_extentHeight;
-                        Idx m_dstHeight;
-                        Idx m_srcHeight;
-
-                        Idx m_extentDepth;
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                        Idx m_dstDepth;
-                        Idx m_srcDepth;
-#endif
-                        Idx m_dstpitchBytesX;
-                        Idx m_srcpitchBytesX;
-                        Idx m_dstPitchBytesY;
-                        Idx m_srcPitchBytesY;
-
-                        void * m_dstMemNative;
-                        void const * m_srcMemNative;
-                    };
-
-                    //-----------------------------------------------------------------------------
-                    //! Not being able to enable peer access does not prevent such device to device memory copies.
-                    //! However, those copies may be slower because the memory is copied via the CPU.
-                    inline auto enablePeerAccessIfPossible(
-                        const int & devSrc,
-                        const int & devDst)
-                    -> void
-                    {
-                        ALPAKA_ASSERT(devSrc != devDst);
-
-#if BOOST_COMP_CLANG
-    #pragma clang diagnostic push
-    #pragma clang diagnostic ignored "-Wexit-time-destructors"
-#endif
-                        static std::set<std::pair<int, int>> alreadyCheckedPeerAccessDevices;
-#if BOOST_COMP_CLANG
-    #pragma clang diagnostic pop
-#endif
-                        auto const devicePair = std::make_pair(devSrc, devDst);
-
-                        if(alreadyCheckedPeerAccessDevices.find(devicePair) == alreadyCheckedPeerAccessDevices.end())
-                        {
-                            alreadyCheckedPeerAccessDevices.insert(devicePair);
-
-                            int canAccessPeer = 0;
-                            ALPAKA_CUDA_RT_CHECK(cudaDeviceCanAccessPeer(&canAccessPeer, devSrc, devDst));
-                            if(!canAccessPeer) {
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                            std::cout << __func__
-                                << " Direct peer access between given GPUs is not possible!"
-                                << " src=" << devSrc
-                                << " dst=" << devDst
-                                << std::endl;
-#endif
-                                return;
-                            }
-
-                            ALPAKA_CUDA_RT_CHECK(cudaSetDevice(devSrc));
-                            // NOTE: "until access is explicitly disabled using cudaDeviceDisablePeerAccess() or either device is reset using cudaDeviceReset()."
-                            // We do not remove a device from the enabled device pairs on cudaDeviceReset.
-                            // Note that access granted by this call is unidirectional and that in order to access memory on the current device from peerDevice, a separate symmetric call to cudaDeviceEnablePeerAccess() is required.
-                            ALPAKA_CUDA_RT_CHECK(cudaDeviceEnablePeerAccess(devDst, 0));
-                        }
-                    }
-                }
-            }
-
-            //-----------------------------------------------------------------------------
-            // Trait specializations for CreateTaskCopy.
-            namespace traits
-            {
-                //#############################################################################
-                //! The CUDA to CPU memory copy trait specialization.
-                template<
-                    typename TDim>
-                struct CreateTaskCopy<
-                    TDim,
-                    dev::DevCpu,
-                    dev::DevCudaRt>
-                {
-                    //-----------------------------------------------------------------------------
-                    template<
-                        typename TExtent,
-                        typename TViewSrc,
-                        typename TViewDst>
-                    ALPAKA_FN_HOST static auto createTaskCopy(
-                        TViewDst & viewDst,
-                        TViewSrc const & viewSrc,
-                        TExtent const & extent)
-                    -> mem::view::cuda::detail::TaskCopyCuda<
-                        TDim,
-                        TViewDst,
-                        TViewSrc,
-                        TExtent>
-                    {
-                        ALPAKA_DEBUG_FULL_LOG_SCOPE;
-
-                        auto const iDevice(
-                            dev::getDev(viewSrc).m_iDevice);
-
-                        return
-                            mem::view::cuda::detail::TaskCopyCuda<
-                                TDim,
-                                TViewDst,
-                                TViewSrc,
-                                TExtent>(
-                                    viewDst,
-                                    viewSrc,
-                                    extent,
-                                    cudaMemcpyDeviceToHost,
-                                    iDevice,
-                                    iDevice);
-                    }
-                };
-                //#############################################################################
-                //! The CPU to CUDA memory copy trait specialization.
-                template<
-                    typename TDim>
-                struct CreateTaskCopy<
-                    TDim,
-                    dev::DevCudaRt,
-                    dev::DevCpu>
-                {
-                    //-----------------------------------------------------------------------------
-                    template<
-                        typename TExtent,
-                        typename TViewSrc,
-                        typename TViewDst>
-                    ALPAKA_FN_HOST static auto createTaskCopy(
-                        TViewDst & viewDst,
-                        TViewSrc const & viewSrc,
-                        TExtent const & extent)
-                    -> mem::view::cuda::detail::TaskCopyCuda<
-                        TDim,
-                        TViewDst,
-                        TViewSrc,
-                        TExtent>
-                    {
-                        ALPAKA_DEBUG_FULL_LOG_SCOPE;
-
-                        auto const iDevice(
-                            dev::getDev(viewDst).m_iDevice);
-
-                        return
-                            mem::view::cuda::detail::TaskCopyCuda<
-                                TDim,
-                                TViewDst,
-                                TViewSrc,
-                                TExtent>(
-                                    viewDst,
-                                    viewSrc,
-                                    extent,
-                                    cudaMemcpyHostToDevice,
-                                    iDevice,
-                                    iDevice);
-                    }
-                };
-                //#############################################################################
-                //! The CUDA to CUDA memory copy trait specialization.
-                template<
-                    typename TDim>
-                struct CreateTaskCopy<
-                    TDim,
-                    dev::DevCudaRt,
-                    dev::DevCudaRt>
-                {
-                    //-----------------------------------------------------------------------------
-                    template<
-                        typename TExtent,
-                        typename TViewSrc,
-                        typename TViewDst>
-                    ALPAKA_FN_HOST static auto createTaskCopy(
-                        TViewDst & viewDst,
-                        TViewSrc const & viewSrc,
-                        TExtent const & extent)
-                    -> mem::view::cuda::detail::TaskCopyCuda<
-                        TDim,
-                        TViewDst,
-                        TViewSrc,
-                        TExtent>
-                    {
-                        ALPAKA_DEBUG_FULL_LOG_SCOPE;
-
-                        return
-                            mem::view::cuda::detail::TaskCopyCuda<
-                                TDim,
-                                TViewDst,
-                                TViewSrc,
-                                TExtent>(
-                                    viewDst,
-                                    viewSrc,
-                                    extent,
-                                    cudaMemcpyDeviceToDevice,
-                                    dev::getDev(viewDst).m_iDevice,
-                                    dev::getDev(viewSrc).m_iDevice);
-                    }
-                };
-            }
-            namespace cuda
-            {
-                namespace detail
-                {
-                    //-----------------------------------------------------------------------------
-                    template<
-                        typename TExtent,
-                        typename TViewSrc,
-                        typename TViewDst>
-                    ALPAKA_FN_HOST auto buildCudaMemcpy3DParms(
-                        mem::view::cuda::detail::TaskCopyCuda<dim::DimInt<3>, TViewDst, TViewSrc, TExtent> const & task)
-                    -> cudaMemcpy3DParms
-                    {
-                        ALPAKA_DEBUG_FULL_LOG_SCOPE;
-
-                        auto const & extentWidthBytes(task.m_extentWidthBytes);
-                        auto const & dstWidth(task.m_dstWidth);
-                        auto const & srcWidth(task.m_srcWidth);
-
-                        auto const & extentHeight(task.m_extentHeight);
-                        //auto const & dstHeight(task.m_dstHeight);
-                        //auto const & srcHeight(task.m_srcHeight);
-
-                        auto const & extentDepth(task.m_extentDepth);
-
-                        auto const & dstPitchBytesX(task.m_dstpitchBytesX);
-                        auto const & srcPitchBytesX(task.m_srcpitchBytesX);
-                        auto const & dstPitchBytesY(task.m_dstPitchBytesY);
-                        auto const & srcPitchBytesY(task.m_srcPitchBytesY);
-
-                        auto const & dstNativePtr(task.m_dstMemNative);
-                        auto const & srcNativePtr(task.m_srcMemNative);
-
-                        // Fill CUDA parameter structure.
-                        cudaMemcpy3DParms cudaMemCpy3DParms;
-                        cudaMemCpy3DParms.srcArray = nullptr;  // Either srcArray or srcPtr.
-                        cudaMemCpy3DParms.srcPos = make_cudaPos(0, 0, 0);  // Optional. Offset in bytes.
-                        cudaMemCpy3DParms.srcPtr =
-                            make_cudaPitchedPtr(
-                                const_cast<void *>(srcNativePtr),
-                                static_cast<std::size_t>(srcPitchBytesX),
-                                static_cast<std::size_t>(srcWidth),
-                                static_cast<std::size_t>(srcPitchBytesY/srcPitchBytesX));
-                        cudaMemCpy3DParms.dstArray = nullptr;  // Either dstArray or dstPtr.
-                        cudaMemCpy3DParms.dstPos = make_cudaPos(0, 0, 0);  // Optional. Offset in bytes.
-                        cudaMemCpy3DParms.dstPtr =
-                            make_cudaPitchedPtr(
-                                dstNativePtr,
-                                static_cast<std::size_t>(dstPitchBytesX),
-                                static_cast<std::size_t>(dstWidth),
-                                static_cast<std::size_t>(dstPitchBytesY / dstPitchBytesX));
-                        cudaMemCpy3DParms.extent =
-                            make_cudaExtent(
-                                static_cast<std::size_t>(extentWidthBytes),
-                                static_cast<std::size_t>(extentHeight),
-                                static_cast<std::size_t>(extentDepth));
-                        cudaMemCpy3DParms.kind = task.m_cudaMemCpyKind;
-
-                        return cudaMemCpy3DParms;
-                    }
-                    //-----------------------------------------------------------------------------
-                    template<
-                        typename TViewDst,
-                        typename TViewSrc,
-                        typename TExtent>
-                    ALPAKA_FN_HOST auto buildCudaMemcpy3DPeerParms(
-                        mem::view::cuda::detail::TaskCopyCuda<dim::DimInt<2>, TViewDst, TViewSrc, TExtent> const & task)
-                    -> cudaMemcpy3DPeerParms
-                    {
-                        ALPAKA_DEBUG_FULL_LOG_SCOPE;
-
-                        auto const & iDstDev(task.m_iDstDevice);
-                        auto const & iSrcDev(task.m_iSrcDevice);
-
-                        auto const & extentWidthBytes(task.m_extentWidthBytes);
-                        auto const & dstWidth(task.m_dstWidth);
-                        auto const & srcWidth(task.m_srcWidth);
-
-                        auto const & extentHeight(task.m_extentHeight);
-                        //auto const & dstHeight(task.m_dstHeight);
-                        //auto const & srcHeight(task.m_srcHeight);
-
-                        auto const extentDepth(1u);
-
-                        auto const & dstPitchBytesX(task.m_dstpitchBytesX);
-                        auto const & srcPitchBytesX(task.m_srcpitchBytesX);
-                        auto const & dstPitchBytesY(task.m_dstPitchBytesY);
-                        auto const & srcPitchBytesY(task.m_srcPitchBytesY);
-
-                        auto const & dstNativePtr(task.m_dstMemNative);
-                        auto const & srcNativePtr(task.m_srcMemNative);
-
-                        // Fill CUDA parameter structure.
-                        cudaMemcpy3DPeerParms cudaMemCpy3DPeerParms;
-                        cudaMemCpy3DPeerParms.dstArray = nullptr;  // Either dstArray or dstPtr.
-                        cudaMemCpy3DPeerParms.dstDevice = iDstDev;
-                        cudaMemCpy3DPeerParms.dstPos = make_cudaPos(0, 0, 0);  // Optional. Offset in bytes.
-                        cudaMemCpy3DPeerParms.dstPtr =
-                            make_cudaPitchedPtr(
-                                dstNativePtr,
-                                static_cast<std::size_t>(dstPitchBytesX),
-                                static_cast<std::size_t>(dstWidth),
-                                static_cast<std::size_t>(dstPitchBytesY / dstPitchBytesX));
-                        cudaMemCpy3DPeerParms.extent =
-                            make_cudaExtent(
-                                static_cast<std::size_t>(extentWidthBytes),
-                                static_cast<std::size_t>(extentHeight),
-                                static_cast<std::size_t>(extentDepth));
-                        cudaMemCpy3DPeerParms.srcArray = nullptr;  // Either srcArray or srcPtr.
-                        cudaMemCpy3DPeerParms.srcDevice = iSrcDev;
-                        cudaMemCpy3DPeerParms.srcPos = make_cudaPos(0, 0, 0);  // Optional. Offset in bytes.
-                        cudaMemCpy3DPeerParms.srcPtr =
-                            make_cudaPitchedPtr(
-                                const_cast<void *>(srcNativePtr),
-                                static_cast<std::size_t>(srcPitchBytesX),
-                                static_cast<std::size_t>(srcWidth),
-                                static_cast<std::size_t>(srcPitchBytesY / srcPitchBytesX));
-
-                        return cudaMemCpy3DPeerParms;
-                    }
-                    //-----------------------------------------------------------------------------
-                    template<
-                        typename TViewDst,
-                        typename TViewSrc,
-                        typename TExtent>
-                    ALPAKA_FN_HOST auto buildCudaMemcpy3DPeerParms(
-                        mem::view::cuda::detail::TaskCopyCuda<dim::DimInt<3>, TViewDst, TViewSrc, TExtent> const & task)
-                    -> cudaMemcpy3DPeerParms
-                    {
-                        ALPAKA_DEBUG_FULL_LOG_SCOPE;
-
-                        auto const & iDstDev(task.m_iDstDevice);
-                        auto const & iSrcDev(task.m_iSrcDevice);
-
-                        auto const & extentWidthBytes(task.m_extentWidthBytes);
-                        auto const & dstWidth(task.m_dstWidth);
-                        auto const & srcWidth(task.m_srcWidth);
-
-                        auto const & extentHeight(task.m_extentHeight);
-                        //auto const & dstHeight(task.m_dstHeight);
-                        //auto const & srcHeight(task.m_srcHeight);
-
-                        auto const & extentDepth(task.m_extentDepth);
-
-                        auto const & dstPitchBytesX(task.m_dstpitchBytesX);
-                        auto const & srcPitchBytesX(task.m_srcpitchBytesX);
-                        auto const & dstPitchBytesY(task.m_dstPitchBytesY);
-                        auto const & srcPitchBytesY(task.m_srcPitchBytesY);
-
-                        auto const & dstNativePtr(task.m_dstMemNative);
-                        auto const & srcNativePtr(task.m_srcMemNative);
-
-                        // Fill CUDA parameter structure.
-                        cudaMemcpy3DPeerParms cudaMemCpy3DPeerParms;
-                        cudaMemCpy3DPeerParms.dstArray = nullptr;  // Either dstArray or dstPtr.
-                        cudaMemCpy3DPeerParms.dstDevice = iDstDev;
-                        cudaMemCpy3DPeerParms.dstPos = make_cudaPos(0, 0, 0);  // Optional. Offset in bytes.
-                        cudaMemCpy3DPeerParms.dstPtr =
-                            make_cudaPitchedPtr(
-                                dstNativePtr,
-                                static_cast<std::size_t>(dstPitchBytesX),
-                                static_cast<std::size_t>(dstWidth),
-                                static_cast<std::size_t>(dstPitchBytesY/dstPitchBytesX));
-                        cudaMemCpy3DPeerParms.extent =
-                            make_cudaExtent(
-                                static_cast<std::size_t>(extentWidthBytes),
-                                static_cast<std::size_t>(extentHeight),
-                                static_cast<std::size_t>(extentDepth));
-                        cudaMemCpy3DPeerParms.srcArray = nullptr;  // Either srcArray or srcPtr.
-                        cudaMemCpy3DPeerParms.srcDevice = iSrcDev;
-                        cudaMemCpy3DPeerParms.srcPos = make_cudaPos(0, 0, 0);  // Optional. Offset in bytes.
-                        cudaMemCpy3DPeerParms.srcPtr =
-                            make_cudaPitchedPtr(
-                                const_cast<void *>(srcNativePtr),
-                                static_cast<std::size_t>(srcPitchBytesX),
-                                static_cast<std::size_t>(srcWidth),
-                                static_cast<std::size_t>(srcPitchBytesY / srcPitchBytesX));
-
-                        return cudaMemCpy3DPeerParms;
-                    }
-                }
-            }
-        }
-    }
-    namespace queue
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CUDA non-blocking device queue 1D copy enqueue trait specialization.
-            template<
-                typename TExtent,
-                typename TViewSrc,
-                typename TViewDst>
-            struct Enqueue<
-                queue::QueueCudaRtNonBlocking,
-                mem::view::cuda::detail::TaskCopyCuda<dim::DimInt<1u>, TViewDst, TViewSrc, TExtent>>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto enqueue(
-                    queue::QueueCudaRtNonBlocking & queue,
-                    mem::view::cuda::detail::TaskCopyCuda<dim::DimInt<1u>, TViewDst, TViewSrc, TExtent> const & task)
-                -> void
-                {
-                    ALPAKA_DEBUG_FULL_LOG_SCOPE;
-
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                    task.printDebug();
-#endif
-                    if(task.m_extentWidthBytes == 0)
-                    {
-                        return;
-                    }
-
-                    auto const & iDstDev(task.m_iDstDevice);
-                    auto const & iSrcDev(task.m_iSrcDevice);
-
-                    auto const & extentWidthBytes(task.m_extentWidthBytes);
-
-                    auto const & dstNativePtr(task.m_dstMemNative);
-                    auto const & srcNativePtr(task.m_srcMemNative);
-
-                    if(iDstDev == iSrcDev)
-                    {
-                        auto const & cudaMemCpyKind(task.m_cudaMemCpyKind);
-
-                        // Set the current device.
-                        ALPAKA_CUDA_RT_CHECK(
-                            cudaSetDevice(
-                                iDstDev));
-                        // Initiate the memory copy.
-                        ALPAKA_CUDA_RT_CHECK(
-                            cudaMemcpyAsync(
-                                dstNativePtr,
-                                srcNativePtr,
-                                static_cast<std::size_t>(extentWidthBytes),
-                                cudaMemCpyKind,
-                                queue.m_spQueueImpl->m_CudaQueue));
-                    }
-                    else
-                    {
-                        alpaka::mem::view::cuda::detail::enablePeerAccessIfPossible(iSrcDev, iDstDev);
-
-                        // Initiate the memory copy.
-                        ALPAKA_CUDA_RT_CHECK(
-                            cudaMemcpyPeerAsync(
-                                dstNativePtr,
-                                iDstDev,
-                                srcNativePtr,
-                                iSrcDev,
-                                static_cast<std::size_t>(extentWidthBytes),
-                                queue.m_spQueueImpl->m_CudaQueue));
-                    }
-                }
-            };
-            //#############################################################################
-            //! The CUDA blocking device queue 1D copy enqueue trait specialization.
-            template<
-                typename TExtent,
-                typename TViewSrc,
-                typename TViewDst>
-            struct Enqueue<
-                queue::QueueCudaRtBlocking,
-                mem::view::cuda::detail::TaskCopyCuda<dim::DimInt<1u>, TViewDst, TViewSrc, TExtent>>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto enqueue(
-                    queue::QueueCudaRtBlocking & queue,
-                    mem::view::cuda::detail::TaskCopyCuda<dim::DimInt<1u>, TViewDst, TViewSrc, TExtent> const & task)
-                -> void
-                {
-                    ALPAKA_DEBUG_FULL_LOG_SCOPE;
-
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                    task.printDebug();
-#endif
-                    if(task.m_extentWidthBytes == 0)
-                    {
-                        return;
-                    }
-
-                    auto const & iDstDev(task.m_iDstDevice);
-                    auto const & iSrcDev(task.m_iSrcDevice);
-
-                    auto const & extentWidthBytes(task.m_extentWidthBytes);
-
-                    auto const & dstNativePtr(task.m_dstMemNative);
-                    auto const & srcNativePtr(task.m_srcMemNative);
-
-                    if(iDstDev == iSrcDev)
-                    {
-                        auto const & cudaMemCpyKind(task.m_cudaMemCpyKind);
-
-                        // Set the current device.
-                        ALPAKA_CUDA_RT_CHECK(
-                            cudaSetDevice(
-                                iDstDev));
-                        // Initiate the memory copy.
-                        ALPAKA_CUDA_RT_CHECK(
-                            cudaMemcpyAsync(
-                                dstNativePtr,
-                                srcNativePtr,
-                                static_cast<std::size_t>(extentWidthBytes),
-                                cudaMemCpyKind,
-                                queue.m_spQueueImpl->m_CudaQueue));
-                    }
-                    else
-                    {
-                        alpaka::mem::view::cuda::detail::enablePeerAccessIfPossible(iSrcDev, iDstDev);
-
-                        // Initiate the memory copy.
-                        ALPAKA_CUDA_RT_CHECK(
-                            cudaMemcpyPeerAsync(
-                                dstNativePtr,
-                                iDstDev,
-                                srcNativePtr,
-                                iSrcDev,
-                                static_cast<std::size_t>(extentWidthBytes),
-                                queue.m_spQueueImpl->m_CudaQueue));
-                    }
-                    ALPAKA_CUDA_RT_CHECK(
-                        cudaStreamSynchronize(
-                            queue.m_spQueueImpl->m_CudaQueue));
-                }
-            };
-            //#############################################################################
-            //! The CUDA non-blocking device queue 2D copy enqueue trait specialization.
-            template<
-                typename TExtent,
-                typename TViewSrc,
-                typename TViewDst>
-            struct Enqueue<
-                queue::QueueCudaRtNonBlocking,
-                mem::view::cuda::detail::TaskCopyCuda<dim::DimInt<2u>, TViewDst, TViewSrc, TExtent>>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto enqueue(
-                    queue::QueueCudaRtNonBlocking & queue,
-                    mem::view::cuda::detail::TaskCopyCuda<dim::DimInt<2u>, TViewDst, TViewSrc, TExtent> const & task)
-                -> void
-                {
-                    ALPAKA_DEBUG_FULL_LOG_SCOPE;
-
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                    task.printDebug();
-#endif
-                    // This is not only an optimization but also prevents a division by zero.
-                    if(task.m_extentWidthBytes == 0 || task.m_extentHeight == 0)
-                    {
-                        return;
-                    }
-
-                    auto const & iDstDev(task.m_iDstDevice);
-                    auto const & iSrcDev(task.m_iSrcDevice);
-
-                    if(iDstDev == iSrcDev)
-                    {
-                        auto const & extentWidthBytes(task.m_extentWidthBytes);
-                        auto const & extentHeight(task.m_extentHeight);
-
-                        auto const & dstPitchBytesX(task.m_dstpitchBytesX);
-                        auto const & srcPitchBytesX(task.m_srcpitchBytesX);
-
-                        auto const & dstNativePtr(task.m_dstMemNative);
-                        auto const & srcNativePtr(task.m_srcMemNative);
-
-                        auto const & cudaMemCpyKind(task.m_cudaMemCpyKind);
-
-                        // Set the current device.
-                        ALPAKA_CUDA_RT_CHECK(
-                            cudaSetDevice(
-                                iDstDev));
-                        // Initiate the memory copy.
-                        ALPAKA_CUDA_RT_CHECK(
-                            cudaMemcpy2DAsync(
-                                dstNativePtr,
-                                static_cast<std::size_t>(dstPitchBytesX),
-                                srcNativePtr,
-                                static_cast<std::size_t>(srcPitchBytesX),
-                                static_cast<std::size_t>(extentWidthBytes),
-                                static_cast<std::size_t>(extentHeight),
-                                cudaMemCpyKind,
-                                queue.m_spQueueImpl->m_CudaQueue));
-                    }
-                    else
-                    {
-                        alpaka::mem::view::cuda::detail::enablePeerAccessIfPossible(iSrcDev, iDstDev);
-
-                        // There is no cudaMemcpy2DPeerAsync, therefore we use cudaMemcpy3DPeerAsync.
-                        // Create the struct describing the copy.
-                        cudaMemcpy3DPeerParms const cudaMemCpy3DPeerParms(
-                            mem::view::cuda::detail::buildCudaMemcpy3DPeerParms(
-                                task));
-                        // Initiate the memory copy.
-                        ALPAKA_CUDA_RT_CHECK(
-                            cudaMemcpy3DPeerAsync(
-                                &cudaMemCpy3DPeerParms,
-                                queue.m_spQueueImpl->m_CudaQueue));
-                    }
-                }
-            };
-            //#############################################################################
-            //! The CUDA blocking device queue 2D copy enqueue trait specialization.
-            template<
-                typename TExtent,
-                typename TViewSrc,
-                typename TViewDst>
-            struct Enqueue<
-                queue::QueueCudaRtBlocking,
-                mem::view::cuda::detail::TaskCopyCuda<dim::DimInt<2u>, TViewDst, TViewSrc, TExtent>>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto enqueue(
-                    queue::QueueCudaRtBlocking & queue,
-                    mem::view::cuda::detail::TaskCopyCuda<dim::DimInt<2u>, TViewDst, TViewSrc, TExtent> const & task)
-                -> void
-                {
-                    ALPAKA_DEBUG_FULL_LOG_SCOPE;
-
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                    task.printDebug();
-#endif
-                    // This is not only an optimization but also prevents a division by zero.
-                    if(task.m_extentWidthBytes == 0 || task.m_extentHeight == 0)
-                    {
-                        return;
-                    }
-
-                    auto const & iDstDev(task.m_iDstDevice);
-                    auto const & iSrcDev(task.m_iSrcDevice);
-
-                    if(iDstDev == iSrcDev)
-                    {
-                        auto const & extentWidthBytes(task.m_extentWidthBytes);
-                        auto const & extentHeight(task.m_extentHeight);
-
-                        auto const & dstPitchBytesX(task.m_dstpitchBytesX);
-                        auto const & srcPitchBytesX(task.m_srcpitchBytesX);
-
-                        auto const & dstNativePtr(task.m_dstMemNative);
-                        auto const & srcNativePtr(task.m_srcMemNative);
-
-                        auto const & cudaMemCpyKind(task.m_cudaMemCpyKind);
-
-                        // Set the current device.
-                        ALPAKA_CUDA_RT_CHECK(
-                            cudaSetDevice(
-                                iDstDev));
-                        // Initiate the memory copy.
-                        ALPAKA_CUDA_RT_CHECK(
-                            cudaMemcpy2DAsync(
-                                dstNativePtr,
-                                static_cast<std::size_t>(dstPitchBytesX),
-                                srcNativePtr,
-                                static_cast<std::size_t>(srcPitchBytesX),
-                                static_cast<std::size_t>(extentWidthBytes),
-                                static_cast<std::size_t>(extentHeight),
-                                cudaMemCpyKind,
-                                queue.m_spQueueImpl->m_CudaQueue));
-                    }
-                    else
-                    {
-                        alpaka::mem::view::cuda::detail::enablePeerAccessIfPossible(iSrcDev, iDstDev);
-
-                        // There is no cudaMemcpy2DPeerAsync, therefore we use cudaMemcpy3DPeerAsync.
-                        // Create the struct describing the copy.
-                        cudaMemcpy3DPeerParms const cudaMemCpy3DPeerParms(
-                            mem::view::cuda::detail::buildCudaMemcpy3DPeerParms(
-                                task));
-                        // Initiate the memory copy.
-                        ALPAKA_CUDA_RT_CHECK(
-                            cudaMemcpy3DPeerAsync(
-                                &cudaMemCpy3DPeerParms,
-                                queue.m_spQueueImpl->m_CudaQueue));
-                    }
-                    ALPAKA_CUDA_RT_CHECK(
-                        cudaStreamSynchronize(
-                            queue.m_spQueueImpl->m_CudaQueue));
-                }
-            };
-            //#############################################################################
-            //! The CUDA non-blocking device queue 3D copy enqueue trait specialization.
-            template<
-                typename TExtent,
-                typename TViewSrc,
-                typename TViewDst>
-            struct Enqueue<
-                queue::QueueCudaRtNonBlocking,
-                mem::view::cuda::detail::TaskCopyCuda<dim::DimInt<3u>, TViewDst, TViewSrc, TExtent>>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto enqueue(
-                    queue::QueueCudaRtNonBlocking & queue,
-                    mem::view::cuda::detail::TaskCopyCuda<dim::DimInt<3u>, TViewDst, TViewSrc, TExtent> const & task)
-                -> void
-                {
-                    ALPAKA_DEBUG_FULL_LOG_SCOPE;
-
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                    task.printDebug();
-#endif
-                    // This is not only an optimization but also prevents a division by zero.
-                    if(task.m_extentWidthBytes == 0 || task.m_extentHeight == 0 || task.m_extentDepth == 0)
-                    {
-                        return;
-                    }
-
-                    auto const & iDstDev(task.m_iDstDevice);
-                    auto const & iSrcDev(task.m_iSrcDevice);
-
-                    if(iDstDev == iSrcDev)
-                    {
-                        // Create the struct describing the copy.
-                        cudaMemcpy3DParms const cudaMemCpy3DParms(
-                            mem::view::cuda::detail::buildCudaMemcpy3DParms(
-                                task));
-                        // Set the current device.
-                        ALPAKA_CUDA_RT_CHECK(
-                            cudaSetDevice(
-                                iDstDev));
-                        // Initiate the memory copy.
-                        ALPAKA_CUDA_RT_CHECK(
-                            cudaMemcpy3DAsync(
-                                &cudaMemCpy3DParms,
-                                queue.m_spQueueImpl->m_CudaQueue));
-                    }
-                    else
-                    {
-                        alpaka::mem::view::cuda::detail::enablePeerAccessIfPossible(iSrcDev, iDstDev);
-
-                        // Create the struct describing the copy.
-                        cudaMemcpy3DPeerParms const cudaMemCpy3DPeerParms(
-                            mem::view::cuda::detail::buildCudaMemcpy3DPeerParms(
-                                task));
-                        // Initiate the memory copy.
-                        ALPAKA_CUDA_RT_CHECK(
-                            cudaMemcpy3DPeerAsync(
-                                &cudaMemCpy3DPeerParms,
-                                queue.m_spQueueImpl->m_CudaQueue));
-                    }
-                }
-            };
-            //#############################################################################
-            //! The CUDA blocking device queue 3D copy enqueue trait specialization.
-            template<
-                typename TExtent,
-                typename TViewSrc,
-                typename TViewDst>
-            struct Enqueue<
-                queue::QueueCudaRtBlocking,
-                mem::view::cuda::detail::TaskCopyCuda<dim::DimInt<3u>, TViewDst, TViewSrc, TExtent>>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto enqueue(
-                    queue::QueueCudaRtBlocking & queue,
-                    mem::view::cuda::detail::TaskCopyCuda<dim::DimInt<3u>, TViewDst, TViewSrc, TExtent> const & task)
-                -> void
-                {
-                    ALPAKA_DEBUG_FULL_LOG_SCOPE;
-
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                    task.printDebug();
-#endif
-                    // This is not only an optimization but also prevents a division by zero.
-                    if(task.m_extentWidthBytes == 0 || task.m_extentHeight == 0 || task.m_extentDepth == 0)
-                    {
-                        return;
-                    }
-
-                    auto const & iDstDev(task.m_iDstDevice);
-                    auto const & iSrcDev(task.m_iSrcDevice);
-
-                    if(iDstDev == iSrcDev)
-                    {
-                        // Create the struct describing the copy.
-                        cudaMemcpy3DParms const cudaMemCpy3DParms(
-                            mem::view::cuda::detail::buildCudaMemcpy3DParms(
-                                task));
-                        // Set the current device.
-                        ALPAKA_CUDA_RT_CHECK(
-                            cudaSetDevice(
-                                iDstDev));
-                        // Initiate the memory copy.
-                        ALPAKA_CUDA_RT_CHECK(
-                            cudaMemcpy3DAsync(
-                                &cudaMemCpy3DParms,
-                                queue.m_spQueueImpl->m_CudaQueue));
-                    }
-                    else
-                    {
-                        alpaka::mem::view::cuda::detail::enablePeerAccessIfPossible(iSrcDev, iDstDev);
-
-                        // Create the struct describing the copy.
-                        cudaMemcpy3DPeerParms const cudaMemCpy3DPeerParms(
-                            mem::view::cuda::detail::buildCudaMemcpy3DPeerParms(
-                                task));
-                        // Initiate the memory copy.
-                        ALPAKA_CUDA_RT_CHECK(
-                            cudaMemcpy3DPeerAsync(
-                                &cudaMemCpy3DPeerParms,
-                                queue.m_spQueueImpl->m_CudaQueue));
-                    }
-                    ALPAKA_CUDA_RT_CHECK(
-                        cudaStreamSynchronize(
-                            queue.m_spQueueImpl->m_CudaQueue));
-                }
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/alpaka/include/alpaka/mem/buf/cuda/Set.hpp b/thirdParty/alpaka/include/alpaka/mem/buf/cuda/Set.hpp
deleted file mode 100644
index b1a80bb5c1..0000000000
--- a/thirdParty/alpaka/include/alpaka/mem/buf/cuda/Set.hpp
+++ /dev/null
@@ -1,550 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz, Erik Zenker, Matthias Werner, René Widera
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_CUDA
-    #error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
-#endif
-
-#include <alpaka/queue/QueueCudaRtBlocking.hpp>
-#include <alpaka/queue/QueueCudaRtNonBlocking.hpp>
-
-#include <alpaka/dev/Traits.hpp>
-#include <alpaka/dim/DimIntegralConst.hpp>
-#include <alpaka/extent/Traits.hpp>
-#include <alpaka/mem/view/Traits.hpp>
-#include <alpaka/queue/Traits.hpp>
-
-#include <alpaka/core/Assert.hpp>
-#include <alpaka/core/Cuda.hpp>
-
-
-namespace alpaka
-{
-    namespace dev
-    {
-        class DevCudaRt;
-    }
-}
-
-namespace alpaka
-{
-    namespace mem
-    {
-        namespace view
-        {
-            namespace cuda
-            {
-                namespace detail
-                {
-                    //#############################################################################
-                    //! The CUDA memory set trait.
-                    template<
-                        typename TDim,
-                        typename TView,
-                        typename TExtent>
-                    struct TaskSetCuda
-                    {
-                        //-----------------------------------------------------------------------------
-                        TaskSetCuda(
-                            TView & view,
-                            std::uint8_t const & byte,
-                            TExtent const & extent) :
-                                m_view(view),
-                                m_byte(byte),
-                                m_extent(extent),
-                                m_iDevice(dev::getDev(view).m_iDevice)
-                        {
-                            static_assert(
-                                !std::is_const<TView>::value,
-                                "The destination view can not be const!");
-
-                            static_assert(
-                                dim::Dim<TView>::value == dim::Dim<TExtent>::value,
-                                "The destination view and the extent are required to have the same dimensionality!");
-                        }
-
-                        TView & m_view;
-                        std::uint8_t const m_byte;
-                        TExtent const m_extent;
-                        std::int32_t const m_iDevice;
-                    };
-                }
-            }
-            namespace traits
-            {
-                //#############################################################################
-                //! The CUDA device memory set trait specialization.
-                template<
-                    typename TDim>
-                struct CreateTaskSet<
-                    TDim,
-                    dev::DevCudaRt>
-                {
-                    //-----------------------------------------------------------------------------
-                    template<
-                        typename TExtent,
-                        typename TView>
-                    ALPAKA_FN_HOST static auto createTaskSet(
-                        TView & view,
-                        std::uint8_t const & byte,
-                        TExtent const & extent)
-                    -> mem::view::cuda::detail::TaskSetCuda<
-                        TDim,
-                        TView,
-                        TExtent>
-                    {
-                        return
-                            mem::view::cuda::detail::TaskSetCuda<
-                                TDim,
-                                TView,
-                                TExtent>(
-                                    view,
-                                    byte,
-                                    extent);
-                    }
-                };
-            }
-        }
-    }
-    namespace queue
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CUDA non-blocking device queue 1D set enqueue trait specialization.
-            template<
-                typename TView,
-                typename TExtent>
-            struct Enqueue<
-                queue::QueueCudaRtNonBlocking,
-                mem::view::cuda::detail::TaskSetCuda<dim::DimInt<1u>, TView, TExtent>>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto enqueue(
-                    queue::QueueCudaRtNonBlocking & queue,
-                    mem::view::cuda::detail::TaskSetCuda<dim::DimInt<1u>, TView, TExtent> const & task)
-                -> void
-                {
-                    ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                    static_assert(
-                        dim::Dim<TView>::value == 1u,
-                        "The destination buffer is required to be 1-dimensional for this specialization!");
-                    static_assert(
-                        dim::Dim<TView>::value == dim::Dim<TExtent>::value,
-                        "The destination buffer and the extent are required to have the same dimensionality!");
-
-                    using Idx = idx::Idx<TExtent>;
-
-                    auto & view(task.m_view);
-                    auto const & byte(task.m_byte);
-                    auto const & extent(task.m_extent);
-                    auto const & iDevice(task.m_iDevice);
-
-                    auto const extentWidth(extent::getWidth(extent));
-
-                    if(extentWidth == 0)
-                    {
-                        return;
-                    }
-
-                    auto const extentWidthBytes(extentWidth * static_cast<Idx>(sizeof(elem::Elem<TView>)));
-#if !defined(NDEBUG)
-                    auto const dstWidth(extent::getWidth(view));
-#endif
-                    auto const dstNativePtr(reinterpret_cast<void *>(mem::view::getPtrNative(view)));
-                    ALPAKA_ASSERT(extentWidth <= dstWidth);
-
-                    // Set the current device.
-                    ALPAKA_CUDA_RT_CHECK(
-                        cudaSetDevice(
-                            iDevice));
-                    // Initiate the memory set.
-                    ALPAKA_CUDA_RT_CHECK(
-                        cudaMemsetAsync(
-                            dstNativePtr,
-                            static_cast<int>(byte),
-                            static_cast<size_t>(extentWidthBytes),
-                            queue.m_spQueueImpl->m_CudaQueue));
-                }
-            };
-            //#############################################################################
-            //! The CUDA blocking device queue 1D set enqueue trait specialization.
-            template<
-                typename TView,
-                typename TExtent>
-            struct Enqueue<
-                queue::QueueCudaRtBlocking,
-                mem::view::cuda::detail::TaskSetCuda<dim::DimInt<1u>, TView, TExtent>>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto enqueue(
-                    queue::QueueCudaRtBlocking & queue,
-                    mem::view::cuda::detail::TaskSetCuda<dim::DimInt<1u>, TView, TExtent> const & task)
-                -> void
-                {
-                    ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                    static_assert(
-                        dim::Dim<TView>::value == 1u,
-                        "The destination buffer is required to be 1-dimensional for this specialization!");
-                    static_assert(
-                        dim::Dim<TView>::value == dim::Dim<TExtent>::value,
-                        "The destination buffer and the extent are required to have the same dimensionality!");
-
-                    using Idx = idx::Idx<TExtent>;
-
-                    auto & view(task.m_view);
-                    auto const & byte(task.m_byte);
-                    auto const & extent(task.m_extent);
-                    auto const & iDevice(task.m_iDevice);
-
-                    auto const extentWidth(extent::getWidth(extent));
-
-                    if(extentWidth == 0)
-                    {
-                        return;
-                    }
-
-                    auto const extentWidthBytes(extentWidth * static_cast<Idx>(sizeof(elem::Elem<TView>)));
-#if !defined(NDEBUG)
-                    auto const dstWidth(extent::getWidth(view));
-#endif
-                    auto const dstNativePtr(reinterpret_cast<void *>(mem::view::getPtrNative(view)));
-                    ALPAKA_ASSERT(extentWidth <= dstWidth);
-
-                    // Set the current device.
-                    ALPAKA_CUDA_RT_CHECK(
-                        cudaSetDevice(
-                            iDevice));
-                    // Initiate the memory set.
-                    ALPAKA_CUDA_RT_CHECK(
-                        cudaMemsetAsync(
-                            dstNativePtr,
-                            static_cast<int>(byte),
-                            static_cast<size_t>(extentWidthBytes),
-                            queue.m_spQueueImpl->m_CudaQueue));
-
-                    ALPAKA_CUDA_RT_CHECK(
-                        cudaStreamSynchronize(
-                            queue.m_spQueueImpl->m_CudaQueue));
-                }
-            };
-            //#############################################################################
-            //! The CUDA non-blocking device queue 2D set enqueue trait specialization.
-            template<
-                typename TView,
-                typename TExtent>
-            struct Enqueue<
-                queue::QueueCudaRtNonBlocking,
-                mem::view::cuda::detail::TaskSetCuda<dim::DimInt<2u>, TView, TExtent>>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto enqueue(
-                    queue::QueueCudaRtNonBlocking & queue,
-                    mem::view::cuda::detail::TaskSetCuda<dim::DimInt<2u>, TView, TExtent> const & task)
-                -> void
-                {
-                    ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                    static_assert(
-                        dim::Dim<TView>::value == 2u,
-                        "The destination buffer is required to be 2-dimensional for this specialization!");
-                    static_assert(
-                        dim::Dim<TView>::value == dim::Dim<TExtent>::value,
-                        "The destination buffer and the extent are required to have the same dimensionality!");
-
-                    using Idx = idx::Idx<TExtent>;
-
-                    auto & view(task.m_view);
-                    auto const & byte(task.m_byte);
-                    auto const & extent(task.m_extent);
-                    auto const & iDevice(task.m_iDevice);
-
-                    auto const extentWidth(extent::getWidth(extent));
-                    auto const extentHeight(extent::getHeight(extent));
-
-                    if(extentWidth == 0 || extentHeight == 0)
-                    {
-                        return;
-                    }
-
-                    auto const extentWidthBytes(extentWidth * static_cast<Idx>(sizeof(elem::Elem<TView>)));
-
-#if !defined(NDEBUG)
-                    auto const dstWidth(extent::getWidth(view));
-                    auto const dstHeight(extent::getHeight(view));
-#endif
-                    auto const dstPitchBytesX(mem::view::getPitchBytes<dim::Dim<TView>::value - 1u>(view));
-                    auto const dstNativePtr(reinterpret_cast<void *>(mem::view::getPtrNative(view)));
-                    ALPAKA_ASSERT(extentWidth <= dstWidth);
-                    ALPAKA_ASSERT(extentHeight <= dstHeight);
-
-                    // Set the current device.
-                    ALPAKA_CUDA_RT_CHECK(
-                        cudaSetDevice(
-                            iDevice));
-                    // Initiate the memory set.
-                    ALPAKA_CUDA_RT_CHECK(
-                        cudaMemset2DAsync(
-                            dstNativePtr,
-                            static_cast<size_t>(dstPitchBytesX),
-                            static_cast<int>(byte),
-                            static_cast<size_t>(extentWidthBytes),
-                            static_cast<size_t>(extentHeight),
-                            queue.m_spQueueImpl->m_CudaQueue));
-                }
-            };
-            //#############################################################################
-            //! The CUDA blocking device queue 2D set enqueue trait specialization.
-            template<
-                typename TView,
-                typename TExtent>
-            struct Enqueue<
-                queue::QueueCudaRtBlocking,
-                mem::view::cuda::detail::TaskSetCuda<dim::DimInt<2u>, TView, TExtent>>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto enqueue(
-                    queue::QueueCudaRtBlocking & queue,
-                    mem::view::cuda::detail::TaskSetCuda<dim::DimInt<2u>, TView, TExtent> const & task)
-                -> void
-                {
-                    ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                    static_assert(
-                        dim::Dim<TView>::value == 2u,
-                        "The destination buffer is required to be 2-dimensional for this specialization!");
-                    static_assert(
-                        dim::Dim<TView>::value == dim::Dim<TExtent>::value,
-                        "The destination buffer and the extent are required to have the same dimensionality!");
-
-                    using Idx = idx::Idx<TExtent>;
-
-                    auto & view(task.m_view);
-                    auto const & byte(task.m_byte);
-                    auto const & extent(task.m_extent);
-                    auto const & iDevice(task.m_iDevice);
-
-                    auto const extentWidth(extent::getWidth(extent));
-                    auto const extentHeight(extent::getHeight(extent));
-
-                    if(extentWidth == 0 || extentHeight == 0)
-                    {
-                        return;
-                    }
-
-                    auto const extentWidthBytes(extentWidth * static_cast<Idx>(sizeof(elem::Elem<TView>)));
-
-#if !defined(NDEBUG)
-                    auto const dstWidth(extent::getWidth(view));
-                    auto const dstHeight(extent::getHeight(view));
-#endif
-                    auto const dstPitchBytesX(mem::view::getPitchBytes<dim::Dim<TView>::value - 1u>(view));
-                    auto const dstNativePtr(reinterpret_cast<void *>(mem::view::getPtrNative(view)));
-                    ALPAKA_ASSERT(extentWidth <= dstWidth);
-                    ALPAKA_ASSERT(extentHeight <= dstHeight);
-
-                    // Set the current device.
-                    ALPAKA_CUDA_RT_CHECK(
-                        cudaSetDevice(
-                            iDevice));
-
-                    // Initiate the memory set.
-                    ALPAKA_CUDA_RT_CHECK(
-                        cudaMemset2DAsync(
-                            dstNativePtr,
-                            static_cast<size_t>(dstPitchBytesX),
-                            static_cast<int>(byte),
-                            static_cast<size_t>(extentWidthBytes),
-                            static_cast<size_t>(extentHeight),
-                            queue.m_spQueueImpl->m_CudaQueue));
-
-                    ALPAKA_CUDA_RT_CHECK(
-                        cudaStreamSynchronize(
-                            queue.m_spQueueImpl->m_CudaQueue));
-                }
-            };
-            //#############################################################################
-            //! The CUDA non-blocking device queue 3D set enqueue trait specialization.
-            template<
-                typename TView,
-                typename TExtent>
-            struct Enqueue<
-                queue::QueueCudaRtNonBlocking,
-                mem::view::cuda::detail::TaskSetCuda<dim::DimInt<3u>, TView, TExtent>>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto enqueue(
-                    queue::QueueCudaRtNonBlocking & queue,
-                    mem::view::cuda::detail::TaskSetCuda<dim::DimInt<3u>, TView, TExtent> const & task)
-                -> void
-                {
-                    ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                    static_assert(
-                        dim::Dim<TView>::value == 3u,
-                        "The destination buffer is required to be 3-dimensional for this specialization!");
-                    static_assert(
-                        dim::Dim<TView>::value == dim::Dim<TExtent>::value,
-                        "The destination buffer and the extent are required to have the same dimensionality!");
-
-                    using Elem = alpaka::elem::Elem<TView>;
-                    using Idx = idx::Idx<TExtent>;
-
-                    auto & view(task.m_view);
-                    auto const & byte(task.m_byte);
-                    auto const & extent(task.m_extent);
-                    auto const & iDevice(task.m_iDevice);
-
-                    auto const extentWidth(extent::getWidth(extent));
-                    auto const extentHeight(extent::getHeight(extent));
-                    auto const extentDepth(extent::getDepth(extent));
-
-                    // This is not only an optimization but also prevents a division by zero.
-                    if(extentWidth == 0 || extentHeight == 0 || extentDepth == 0)
-                    {
-                        return;
-                    }
-
-                    auto const dstWidth(extent::getWidth(view));
-#if !defined(NDEBUG)
-                    auto const dstHeight(extent::getHeight(view));
-                    auto const dstDepth(extent::getDepth(view));
-#endif
-                    auto const dstPitchBytesX(mem::view::getPitchBytes<dim::Dim<TView>::value - 1u>(view));
-                    auto const dstPitchBytesY(mem::view::getPitchBytes<dim::Dim<TView>::value - (2u % dim::Dim<TView>::value)>(view));
-                    auto const dstNativePtr(reinterpret_cast<void *>(mem::view::getPtrNative(view)));
-                    ALPAKA_ASSERT(extentWidth <= dstWidth);
-                    ALPAKA_ASSERT(extentHeight <= dstHeight);
-                    ALPAKA_ASSERT(extentDepth <= dstDepth);
-
-                    // Fill CUDA parameter structures.
-                    cudaPitchedPtr const cudaPitchedPtrVal(
-                        make_cudaPitchedPtr(
-                            dstNativePtr,
-                            static_cast<size_t>(dstPitchBytesX),
-                            static_cast<size_t>(dstWidth * static_cast<Idx>(sizeof(Elem))),
-                            static_cast<size_t>(dstPitchBytesY / dstPitchBytesX)));
-
-                    cudaExtent const cudaExtentVal(
-                        make_cudaExtent(
-                            static_cast<size_t>(extentWidth * static_cast<Idx>(sizeof(Elem))),
-                            static_cast<size_t>(extentHeight),
-                            static_cast<size_t>(extentDepth)));
-
-                    // Set the current device.
-                    ALPAKA_CUDA_RT_CHECK(
-                        cudaSetDevice(
-                            iDevice));
-                    // Initiate the memory set.
-                    ALPAKA_CUDA_RT_CHECK(
-                        cudaMemset3DAsync(
-                            cudaPitchedPtrVal,
-                            static_cast<int>(byte),
-                            cudaExtentVal,
-                            queue.m_spQueueImpl->m_CudaQueue));
-                }
-            };
-            //#############################################################################
-            //! The CUDA blocking device queue 3D set enqueue trait specialization.
-            template<
-                typename TView,
-                typename TExtent>
-            struct Enqueue<
-                queue::QueueCudaRtBlocking,
-                mem::view::cuda::detail::TaskSetCuda<dim::DimInt<3u>, TView, TExtent>>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto enqueue(
-                    queue::QueueCudaRtBlocking & queue,
-                    mem::view::cuda::detail::TaskSetCuda<dim::DimInt<3u>, TView, TExtent> const & task)
-                -> void
-                {
-                    ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                    static_assert(
-                        dim::Dim<TView>::value == 3u,
-                        "The destination buffer is required to be 3-dimensional for this specialization!");
-                    static_assert(
-                        dim::Dim<TView>::value == dim::Dim<TExtent>::value,
-                        "The destination buffer and the extent are required to have the same dimensionality!");
-
-                    using Elem = alpaka::elem::Elem<TView>;
-                    using Idx = idx::Idx<TExtent>;
-
-                    auto & view(task.m_view);
-                    auto const & byte(task.m_byte);
-                    auto const & extent(task.m_extent);
-                    auto const & iDevice(task.m_iDevice);
-
-                    auto const extentWidth(extent::getWidth(extent));
-                    auto const extentHeight(extent::getHeight(extent));
-                    auto const extentDepth(extent::getDepth(extent));
-
-                    // This is not only an optimization but also prevents a division by zero.
-                    if(extentWidth == 0 || extentHeight == 0 || extentDepth == 0)
-                    {
-                        return;
-                    }
-
-                    auto const dstWidth(extent::getWidth(view));
-#if !defined(NDEBUG)
-                    auto const dstHeight(extent::getHeight(view));
-                    auto const dstDepth(extent::getDepth(view));
-#endif
-                    auto const dstPitchBytesX(mem::view::getPitchBytes<dim::Dim<TView>::value - 1u>(view));
-                    auto const dstPitchBytesY(mem::view::getPitchBytes<dim::Dim<TView>::value - (2u % dim::Dim<TView>::value)>(view));
-                    auto const dstNativePtr(reinterpret_cast<void *>(mem::view::getPtrNative(view)));
-                    ALPAKA_ASSERT(extentWidth <= dstWidth);
-                    ALPAKA_ASSERT(extentHeight <= dstHeight);
-                    ALPAKA_ASSERT(extentDepth <= dstDepth);
-
-                    // Fill CUDA parameter structures.
-                    cudaPitchedPtr const cudaPitchedPtrVal(
-                        make_cudaPitchedPtr(
-                            dstNativePtr,
-                            static_cast<size_t>(dstPitchBytesX),
-                            static_cast<size_t>(dstWidth * static_cast<Idx>(sizeof(Elem))),
-                            static_cast<size_t>(dstPitchBytesY / dstPitchBytesX)));
-
-                    cudaExtent const cudaExtentVal(
-                        make_cudaExtent(
-                            static_cast<size_t>(extentWidth * static_cast<Idx>(sizeof(Elem))),
-                            static_cast<size_t>(extentHeight),
-                            static_cast<size_t>(extentDepth)));
-
-                    // Set the current device.
-                    ALPAKA_CUDA_RT_CHECK(
-                        cudaSetDevice(
-                            iDevice));
-                    // Initiate the memory set.
-                    ALPAKA_CUDA_RT_CHECK(
-                        cudaMemset3DAsync(
-                            cudaPitchedPtrVal,
-                            static_cast<int>(byte),
-                            cudaExtentVal,
-                            queue.m_spQueueImpl->m_CudaQueue));
-
-                    ALPAKA_CUDA_RT_CHECK(
-                        cudaStreamSynchronize(
-                            queue.m_spQueueImpl->m_CudaQueue));
-                }
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/alpaka/include/alpaka/mem/buf/hip/Copy.hpp b/thirdParty/alpaka/include/alpaka/mem/buf/hip/Copy.hpp
deleted file mode 100644
index a77369b404..0000000000
--- a/thirdParty/alpaka/include/alpaka/mem/buf/hip/Copy.hpp
+++ /dev/null
@@ -1,1015 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Erik Zenker, Matthias Werner
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_HIP
-    #error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
-#endif
-
-#include <alpaka/queue/QueueHipRtBlocking.hpp>
-#include <alpaka/queue/QueueHipRtNonBlocking.hpp>
-
-#include <alpaka/dev/DevCpu.hpp>
-#include <alpaka/dev/DevHipRt.hpp>
-#include <alpaka/dim/DimIntegralConst.hpp>
-#include <alpaka/extent/Traits.hpp>
-#include <alpaka/mem/view/Traits.hpp>
-
-#include <alpaka/core/Assert.hpp>
-#include <alpaka/core/Hip.hpp>
-
-#include <set>
-#include <tuple>
-
-
-namespace alpaka
-{
-    namespace mem
-    {
-        namespace view
-        {
-            namespace hip
-            {
-                namespace detail
-                {
-                    //#############################################################################
-                    //! The HIP memory copy trait.
-                    template<
-                        typename TDim,
-                        typename TViewDst,
-                        typename TViewSrc,
-                        typename TExtent>
-                    struct TaskCopyHip;
-
-                    //#############################################################################
-                    //! The 1D HIP memory copy trait.
-                    template<
-                        typename TViewDst,
-                        typename TViewSrc,
-                        typename TExtent>
-                    struct TaskCopyHip<
-                        dim::DimInt<1>,
-                        TViewDst,
-                        TViewSrc,
-                        TExtent>
-                    {
-                        static_assert(
-                            !std::is_const<TViewDst>::value,
-                            "The destination view can not be const!");
-                        static_assert(
-                            dim::Dim<TViewDst>::value == dim::Dim<TViewSrc>::value,
-                            "The source and the destination view are required to have the same dimensionality!");
-                        static_assert(
-                            dim::Dim<TViewDst>::value == dim::Dim<TExtent>::value,
-                            "The views and the extent are required to have the same dimensionality!");
-                        // TODO: Maybe check for Size of TViewDst and TViewSrc to have greater or equal range than TExtent.
-                        static_assert(
-                            std::is_same<elem::Elem<TViewDst>, typename std::remove_const<elem::Elem<TViewSrc>>::type>::value,
-                            "The source and the destination view are required to have the same element type!");
-
-                        using Idx = idx::Idx<TExtent>;
-
-                        //-----------------------------------------------------------------------------
-                        ALPAKA_FN_HOST TaskCopyHip(
-                            TViewDst & viewDst,
-                            TViewSrc const & viewSrc,
-                            TExtent const & extent,
-                            hipMemcpyKind const & hipMemCpyKind,
-                            int const & iDstDevice,
-                            int const & iSrcDevice) :
-                                m_hipMemCpyKind(hipMemCpyKind),
-                                m_iDstDevice(iDstDevice),
-                                m_iSrcDevice(iSrcDevice),
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                                m_extentWidth(extent::getWidth(extent)),
-                                m_dstWidth(static_cast<Idx>(extent::getWidth(viewDst))),
-                                m_srcWidth(static_cast<Idx>(extent::getWidth(viewSrc))),
-#endif
-                                m_extentWidthBytes(extent::getWidth(extent) * static_cast<Idx>(sizeof(elem::Elem<TViewDst>))),
-                                m_dstMemNative(reinterpret_cast<void *>(mem::view::getPtrNative(viewDst))),
-                                m_srcMemNative(reinterpret_cast<void const *>(mem::view::getPtrNative(viewSrc)))
-                        {
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                            ALPAKA_ASSERT(m_extentWidth <= m_dstWidth);
-                            ALPAKA_ASSERT(m_extentWidth <= m_srcWidth);
-#endif
-                        }
-
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                        //-----------------------------------------------------------------------------
-                        ALPAKA_FN_HOST auto printDebug() const
-                        -> void
-                        {
-                            std::cout << __func__
-                                << " ddev: " << m_iDstDevice
-                                << " ew: " << m_extentWidth
-                                << " ewb: " << m_extentWidthBytes
-                                << " dw: " << m_dstWidth
-                                << " dptr: " << m_dstMemNative
-                                << " sdev: " << m_iSrcDevice
-                                << " sw: " << m_srcWidth
-                                << " sptr: " << m_srcMemNative
-                                << std::endl;
-                        }
-#endif
-                        hipMemcpyKind m_hipMemCpyKind;
-                        int m_iDstDevice;
-                        int m_iSrcDevice;
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                        Idx m_extentWidth;
-                        Idx m_dstWidth;
-                        Idx m_srcWidth;
-#endif
-                        Idx m_extentWidthBytes;
-                        void * m_dstMemNative;
-                        void const * m_srcMemNative;
-                    };
-                    //#############################################################################
-                    //! The 2D HIP memory copy trait.
-                    template<
-                        typename TViewDst,
-                        typename TViewSrc,
-                        typename TExtent>
-                    struct TaskCopyHip<
-                        dim::DimInt<2>,
-                        TViewDst,
-                        TViewSrc,
-                        TExtent>
-                    {
-                        static_assert(
-                            !std::is_const<TViewDst>::value,
-                            "The destination view can not be const!");
-                        static_assert(
-                            dim::Dim<TViewDst>::value == dim::Dim<TViewSrc>::value,
-                            "The source and the destination view are required to have the same dimensionality!");
-                        static_assert(
-                            dim::Dim<TViewDst>::value == dim::Dim<TExtent>::value,
-                            "The views and the extent are required to have the same dimensionality!");
-                        // TODO: Maybe check for Size of TViewDst and TViewSrc to have greater or equal range than TExtent.
-                        static_assert(
-                            std::is_same<elem::Elem<TViewDst>, typename std::remove_const<elem::Elem<TViewSrc>>::type>::value,
-                            "The source and the destination view are required to have the same element type!");
-
-                        using Idx = idx::Idx<TExtent>;
-
-                        //-----------------------------------------------------------------------------
-                        ALPAKA_FN_HOST TaskCopyHip(
-                            TViewDst & viewDst,
-                            TViewSrc const & viewSrc,
-                            TExtent const & extent,
-                            hipMemcpyKind const & hipMemCpyKind,
-                            int const & iDstDevice,
-                            int const & iSrcDevice) :
-                                m_hipMemCpyKind(hipMemCpyKind),
-                                m_iDstDevice(iDstDevice),
-                                m_iSrcDevice(iSrcDevice),
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                                m_extentWidth(extent::getWidth(extent)),
-#endif
-                                m_extentWidthBytes(extent::getWidth(extent) * static_cast<Idx>(sizeof(elem::Elem<TViewDst>))),
-                                m_dstWidth(static_cast<Idx>(extent::getWidth(viewDst))),      // required for 3D peer copy
-                                m_srcWidth(static_cast<Idx>(extent::getWidth(viewSrc))),      // required for 3D peer copy
-
-                                m_extentHeight(extent::getHeight(extent)),
-                                m_dstHeight(static_cast<Idx>(extent::getHeight(viewDst))),    // required for 3D peer copy
-                                m_srcHeight(static_cast<Idx>(extent::getHeight(viewSrc))),    // required for 3D peer copy
-
-                                m_dstpitchBytesX(static_cast<Idx>(mem::view::getPitchBytes<dim::Dim<TViewDst>::value - 1u>(viewDst))),
-                                m_srcpitchBytesX(static_cast<Idx>(mem::view::getPitchBytes<dim::Dim<TViewSrc>::value - 1u>(viewSrc))),
-                                m_dstPitchBytesY(static_cast<Idx>(mem::view::getPitchBytes<dim::Dim<TViewDst>::value - (2u % dim::Dim<TViewDst>::value)>(viewDst))),
-                                m_srcPitchBytesY(static_cast<Idx>(mem::view::getPitchBytes<dim::Dim<TViewSrc>::value - (2u % dim::Dim<TViewDst>::value)>(viewSrc))),
-
-                                m_dstMemNative(reinterpret_cast<void *>(mem::view::getPtrNative(viewDst))),
-                                m_srcMemNative(reinterpret_cast<void const *>(mem::view::getPtrNative(viewSrc)))
-                        {
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                            ALPAKA_ASSERT(m_extentWidth <= m_dstWidth);
-                            ALPAKA_ASSERT(m_extentHeight <= m_dstHeight);
-                            ALPAKA_ASSERT(m_extentWidth <= m_srcWidth);
-                            ALPAKA_ASSERT(m_extentHeight <= m_srcHeight);
-                            ALPAKA_ASSERT(m_extentWidthBytes <= m_dstpitchBytesX);
-                            ALPAKA_ASSERT(m_extentWidthBytes <= m_srcpitchBytesX);
-#endif
-                        }
-
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                        //-----------------------------------------------------------------------------
-                        //!
-                        //-----------------------------------------------------------------------------
-                        ALPAKA_FN_HOST auto printDebug() const
-                        -> void
-                        {
-                            std::cout << __func__
-                                << " ew: " << m_extentWidth
-                                << " eh: " << m_extentHeight
-                                << " ewb: " << m_extentWidthBytes
-                                << " ddev: " << m_iDstDevice
-                                << " dw: " << m_dstWidth
-                                << " dh: " << m_dstHeight
-                                << " dptr: " << m_dstMemNative
-                                << " dpitchb: " << m_dstpitchBytesX
-                                << " sdev: " << m_iSrcDevice
-                                << " sw: " << m_srcWidth
-                                << " sh: " << m_srcHeight
-                                << " sptr: " << m_srcMemNative
-                                << " spitchb: " << m_srcpitchBytesX
-                                << std::endl;
-                        }
-#endif
-                        hipMemcpyKind m_hipMemCpyKind;
-                        int m_iDstDevice;
-                        int m_iSrcDevice;
-
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                        Idx m_extentWidth;
-#endif
-                        Idx m_extentWidthBytes;
-                        Idx m_dstWidth;          // required for 3D peer copy
-                        Idx m_srcWidth;          // required for 3D peer copy
-
-                        Idx m_extentHeight;
-                        Idx m_dstHeight;         // required for 3D peer copy
-                        Idx m_srcHeight;         // required for 3D peer copy
-
-                        Idx m_dstpitchBytesX;
-                        Idx m_srcpitchBytesX;
-                        Idx m_dstPitchBytesY;
-                        Idx m_srcPitchBytesY;
-
-
-                        void * m_dstMemNative;
-                        void const * m_srcMemNative;
-                    };
-                    //#############################################################################
-                    //! The 3D HIP memory copy trait.
-                    template<
-                        typename TViewDst,
-                        typename TViewSrc,
-                        typename TExtent>
-                    struct TaskCopyHip<
-                        dim::DimInt<3>,
-                        TViewDst,
-                        TViewSrc,
-                        TExtent>
-                    {
-                        static_assert(
-                            !std::is_const<TViewDst>::value,
-                            "The destination view can not be const!");
-                        static_assert(
-                            dim::Dim<TViewDst>::value == dim::Dim<TViewSrc>::value,
-                            "The source and the destination view are required to have the same dimensionality!");
-                        static_assert(
-                            dim::Dim<TViewDst>::value == dim::Dim<TExtent>::value,
-                            "The views and the extent are required to have the same dimensionality!");
-                        // TODO: Maybe check for Size of TViewDst and TViewSrc to have greater or equal range than TExtent.
-                        static_assert(
-                            std::is_same<elem::Elem<TViewDst>, typename std::remove_const<elem::Elem<TViewSrc>>::type>::value,
-                            "The source and the destination view are required to have the same element type!");
-
-                        using Idx = idx::Idx<TExtent>;
-
-                        //-----------------------------------------------------------------------------
-                        ALPAKA_FN_HOST TaskCopyHip(
-                            TViewDst & viewDst,
-                            TViewSrc const & viewSrc,
-                            TExtent const & extent,
-                            hipMemcpyKind const & hipMemCpyKind,
-                            int const & iDstDevice,
-                            int const & iSrcDevice) :
-                                m_hipMemCpyKind(hipMemCpyKind),
-
-                                m_iDstDevice(iDstDevice),
-                                m_iSrcDevice(iSrcDevice),
-
-                                m_extentWidth(extent::getWidth(extent)),
-                                m_extentWidthBytes(m_extentWidth * static_cast<Idx>(sizeof(elem::Elem<TViewDst>))),
-                                m_dstWidth(static_cast<Idx>(extent::getWidth(viewDst))),
-                                m_srcWidth(static_cast<Idx>(extent::getWidth(viewSrc))),
-
-                                m_extentHeight(extent::getHeight(extent)),
-                                m_dstHeight(static_cast<Idx>(extent::getHeight(viewDst))),
-                                m_srcHeight(static_cast<Idx>(extent::getHeight(viewSrc))),
-
-                                m_extentDepth(extent::getDepth(extent)),
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                                m_dstDepth(static_cast<Idx>(extent::getDepth(viewDst))),
-                                m_srcDepth(static_cast<Idx>(extent::getDepth(viewSrc))),
-#endif
-                                m_dstpitchBytesX(static_cast<Idx>(mem::view::getPitchBytes<dim::Dim<TViewDst>::value - 1u>(viewDst))),
-                                m_srcpitchBytesX(static_cast<Idx>(mem::view::getPitchBytes<dim::Dim<TViewSrc>::value - 1u>(viewSrc))),
-                                m_dstPitchBytesY(static_cast<Idx>(mem::view::getPitchBytes<dim::Dim<TViewDst>::value - (2u % dim::Dim<TViewDst>::value)>(viewDst))),
-                                m_srcPitchBytesY(static_cast<Idx>(mem::view::getPitchBytes<dim::Dim<TViewSrc>::value - (2u % dim::Dim<TViewDst>::value)>(viewSrc))),
-
-
-                                m_dstMemNative(reinterpret_cast<void *>(mem::view::getPtrNative(viewDst))),
-                                m_srcMemNative(reinterpret_cast<void const *>(mem::view::getPtrNative(viewSrc)))
-                        {
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                            ALPAKA_ASSERT(m_extentWidth <= m_dstWidth);
-                            ALPAKA_ASSERT(m_extentHeight <= m_dstHeight);
-                            ALPAKA_ASSERT(m_extentDepth <= m_dstDepth);
-                            ALPAKA_ASSERT(m_extentWidth <= m_srcWidth);
-                            ALPAKA_ASSERT(m_extentHeight <= m_srcHeight);
-                            ALPAKA_ASSERT(m_extentDepth <= m_srcDepth);
-                            ALPAKA_ASSERT(m_extentWidthBytes <= m_dstpitchBytesX);
-                            ALPAKA_ASSERT(m_extentWidthBytes <= m_srcpitchBytesX);
-#endif
-                        }
-
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                        //-----------------------------------------------------------------------------
-                        ALPAKA_FN_HOST auto printDebug() const
-                        -> void
-                        {
-                            std::cout << __func__
-                                << " ew: " << m_extentWidth
-                                << " eh: " << m_extentHeight
-                                << " ed: " << m_extentDepth
-                                << " ewb: " << m_extentWidthBytes
-                                << " ddev: " << m_iDstDevice
-                                << " dw: " << m_dstWidth
-                                << " dh: " << m_dstHeight
-                                << " dd: " << m_dstDepth
-                                << " dptr: " << m_dstMemNative
-                                << " dpitchb: " << m_dstpitchBytesX
-                                << " sdev: " << m_iSrcDevice
-                                << " sw: " << m_srcWidth
-                                << " sh: " << m_srcHeight
-                                << " sd: " << m_srcDepth
-                                << " sptr: " << m_srcMemNative
-                                << " spitchb: " << m_srcpitchBytesX
-                                << std::endl;
-                        }
-#endif
-                        hipMemcpyKind m_hipMemCpyKind;
-
-                        int m_iDstDevice;
-                        int m_iSrcDevice;
-
-                        Idx m_extentWidth;
-                        Idx m_extentWidthBytes;
-                        Idx m_dstWidth;
-                        Idx m_srcWidth;
-
-                        Idx m_extentHeight;
-                        Idx m_dstHeight;
-                        Idx m_srcHeight;
-
-                        Idx m_extentDepth;
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                        Idx m_dstDepth;
-                        Idx m_srcDepth;
-#endif
-                        Idx m_dstpitchBytesX;
-                        Idx m_srcpitchBytesX;
-                        Idx m_dstPitchBytesY;
-                        Idx m_srcPitchBytesY;
-
-                        void * m_dstMemNative;
-                        void const * m_srcMemNative;
-                    };
-
-                    //-----------------------------------------------------------------------------
-                    //! Not being able to enable peer access does not prevent such device to device memory copies.
-                    //! However, those copies may be slower because the memory is copied via the CPU.
-                    inline auto enablePeerAccessIfPossible(
-                        const int & devSrc,
-                        const int & devDst)
-                    -> void
-                    {
-                        ALPAKA_ASSERT(devSrc != devDst);
-
-#if BOOST_COMP_CLANG
-    #pragma clang diagnostic push
-    #pragma clang diagnostic ignored "-Wexit-time-destructors"
-#endif
-                        static std::set<std::pair<int, int>> alreadyCheckedPeerAccessDevices;
-#if BOOST_COMP_CLANG
-    #pragma clang diagnostic pop
-#endif
-                        auto const devicePair = std::make_pair(devSrc, devDst);
-
-                        if(alreadyCheckedPeerAccessDevices.find(devicePair) == alreadyCheckedPeerAccessDevices.end())
-                        {
-                            alreadyCheckedPeerAccessDevices.insert(devicePair);
-
-                            int canAccessPeer = 0;
-                            ALPAKA_HIP_RT_CHECK(hipDeviceCanAccessPeer(&canAccessPeer, devSrc, devDst));
-                            if(!canAccessPeer) {
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                            std::cout << __func__
-                                << " Direct peer access between given GPUs is not possible!"
-                                << " src=" << devSrc
-                                << " dst=" << devDst
-                                << std::endl;
-#endif
-                                return;
-                            }
-
-                            ALPAKA_HIP_RT_CHECK(hipSetDevice(devSrc));
-                            // NOTE: "until access is explicitly disabled using hipDeviceDisablePeerAccess() or either device is reset using hipDeviceReset()."
-                            // We do not remove a device from the enabled device pairs on hipDeviceReset.
-                            // Note that access granted by this call is unidirectional and that in order to access memory on the current device from peerDevice, a separate symmetric call to hipDeviceEnablePeerAccess() is required.
-                            ALPAKA_HIP_RT_CHECK(hipDeviceEnablePeerAccess(devDst, 0));
-                        }
-                    }
-                }
-            }
-
-            //-----------------------------------------------------------------------------
-            // Trait specializations for CreateTaskCopy.
-            namespace traits
-            {
-                //#############################################################################
-                //! The HIP to CPU memory copy trait specialization.
-                template<
-                    typename TDim>
-                struct CreateTaskCopy<
-                    TDim,
-                    dev::DevCpu,
-                    dev::DevHipRt>
-                {
-                    //-----------------------------------------------------------------------------
-                    template<
-                        typename TExtent,
-                        typename TViewSrc,
-                        typename TViewDst>
-                    ALPAKA_FN_HOST static auto createTaskCopy(
-                        TViewDst & viewDst,
-                        TViewSrc const & viewSrc,
-                        TExtent const & extent)
-                    -> mem::view::hip::detail::TaskCopyHip<
-                        TDim,
-                        TViewDst,
-                        TViewSrc,
-                        TExtent>
-                    {
-                        ALPAKA_DEBUG_FULL_LOG_SCOPE;
-
-                        auto const iDevice(
-                            dev::getDev(viewSrc).m_iDevice);
-
-                        return
-                            mem::view::hip::detail::TaskCopyHip<
-                                TDim,
-                                TViewDst,
-                                TViewSrc,
-                                TExtent>(
-                                    viewDst,
-                                    viewSrc,
-                                    extent,
-                                    hipMemcpyDeviceToHost,
-                                    iDevice,
-                                    iDevice);
-                    }
-                };
-                //#############################################################################
-                //! The CPU to HIP memory copy trait specialization.
-                //#############################################################################
-                template<
-                    typename TDim>
-                struct CreateTaskCopy<
-                    TDim,
-                    dev::DevHipRt,
-                    dev::DevCpu>
-                {
-                    //-----------------------------------------------------------------------------
-                    template<
-                        typename TExtent,
-                        typename TViewSrc,
-                        typename TViewDst>
-                    ALPAKA_FN_HOST static auto createTaskCopy(
-                        TViewDst & viewDst,
-                        TViewSrc const & viewSrc,
-                        TExtent const & extent)
-                    -> mem::view::hip::detail::TaskCopyHip<
-                        TDim,
-                        TViewDst,
-                        TViewSrc,
-                        TExtent>
-                    {
-                        ALPAKA_DEBUG_FULL_LOG_SCOPE;
-
-                        auto const iDevice(
-                            dev::getDev(viewDst).m_iDevice);
-
-                        return
-                            mem::view::hip::detail::TaskCopyHip<
-                                TDim,
-                                TViewDst,
-                                TViewSrc,
-                                TExtent>(
-                                    viewDst,
-                                    viewSrc,
-                                    extent,
-                                    hipMemcpyHostToDevice,
-                                    iDevice,
-                                    iDevice);
-                    }
-                };
-                //#############################################################################
-                //! The HIP to HIP memory copy trait specialization.
-                //#############################################################################
-                template<
-                    typename TDim>
-                struct CreateTaskCopy<
-                    TDim,
-                    dev::DevHipRt,
-                    dev::DevHipRt>
-                {
-                    //-----------------------------------------------------------------------------
-                    template<
-                        typename TExtent,
-                        typename TViewSrc,
-                        typename TViewDst>
-                    ALPAKA_FN_HOST static auto createTaskCopy(
-                        TViewDst & viewDst,
-                        TViewSrc const & viewSrc,
-                        TExtent const & extent)
-                    -> mem::view::hip::detail::TaskCopyHip<
-                        TDim,
-                        TViewDst,
-                        TViewSrc,
-                        TExtent>
-                    {
-                        ALPAKA_DEBUG_FULL_LOG_SCOPE;
-
-                        return
-                            mem::view::hip::detail::TaskCopyHip<
-                                TDim,
-                                TViewDst,
-                                TViewSrc,
-                                TExtent>(
-                                    viewDst,
-                                    viewSrc,
-                                    extent,
-                                    hipMemcpyDeviceToDevice,
-                                    dev::getDev(viewDst).m_iDevice,
-                                    dev::getDev(viewSrc).m_iDevice);
-                    }
-                };
-            }
-            namespace hip
-            {
-                namespace detail
-                {
-                    //-----------------------------------------------------------------------------
-                    template<
-                        typename TExtent,
-                        typename TViewSrc,
-                        typename TViewDst>
-                    ALPAKA_FN_HOST auto buildHipMemcpy3DParms(
-                        mem::view::hip::detail::TaskCopyHip<dim::DimInt<3>, TViewDst, TViewSrc, TExtent> const & task)
-                    -> hipMemcpy3DParms
-                    {
-                        ALPAKA_DEBUG_FULL_LOG_SCOPE;
-
-                        auto const & extentWidthBytes(task.m_extentWidthBytes);
-                        auto const & dstWidth(task.m_dstWidth);
-                        auto const & srcWidth(task.m_srcWidth);
-
-                        auto const & extentHeight(task.m_extentHeight);
-                        //auto const & dstHeight(task.m_dstHeight);
-                        //auto const & srcHeight(task.m_srcHeight);
-
-                        auto const & extentDepth(task.m_extentDepth);
-
-                        auto const & dstPitchBytesX(task.m_dstpitchBytesX);
-                        auto const & srcPitchBytesX(task.m_srcpitchBytesX);
-                        auto const & dstPitchBytesY(task.m_dstPitchBytesY);
-                        auto const & srcPitchBytesY(task.m_srcPitchBytesY);
-
-                        auto const & dstNativePtr(task.m_dstMemNative);
-                        auto const & srcNativePtr(task.m_srcMemNative);
-
-                        // Fill HIP parameter structure.
-                        hipMemcpy3DParms hipMemCpy3DParms;
-                        hipMemCpy3DParms.srcArray = nullptr;  // Either srcArray or srcPtr.
-                        hipMemCpy3DParms.srcPos = make_hipPos(0, 0, 0);  // Optional. Offset in bytes.
-                        hipMemCpy3DParms.srcPtr =
-                            make_hipPitchedPtr(
-                                const_cast<void *>(srcNativePtr),
-                                static_cast<std::size_t>(srcPitchBytesX),
-                                static_cast<std::size_t>(srcWidth),
-                                static_cast<std::size_t>(srcPitchBytesY/srcPitchBytesX));
-                        hipMemCpy3DParms.dstArray = nullptr;  // Either dstArray or dstPtr.
-                        hipMemCpy3DParms.dstPos = make_hipPos(0, 0, 0);  // Optional. Offset in bytes.
-                        hipMemCpy3DParms.dstPtr =
-                            make_hipPitchedPtr(
-                                dstNativePtr,
-                                static_cast<std::size_t>(dstPitchBytesX),
-                                static_cast<std::size_t>(dstWidth),
-                                static_cast<std::size_t>(dstPitchBytesY/dstPitchBytesX));
-                        hipMemCpy3DParms.extent =
-                            make_hipExtent(
-                                static_cast<std::size_t>(extentWidthBytes),
-                                static_cast<std::size_t>(extentHeight),
-                                static_cast<std::size_t>(extentDepth));
-#ifdef __HIP_PLATFORM_NVCC__
-                        hipMemCpy3DParms.kind = hipMemcpyKindToCudaMemcpyKind(task.m_hipMemCpyKind);
-#else
-                        hipMemCpy3DParms.kind = task.m_hipMemCpyKind;
-#endif
-
-                        return hipMemCpy3DParms;
-                    }
-                }
-            }
-        }
-    }
-
-    namespace queue
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The HIP non-blocking device queue 1D copy enqueue trait specialization.
-            template<
-                typename TExtent,
-                typename TViewSrc,
-                typename TViewDst>
-            struct Enqueue<
-                queue::QueueHipRtNonBlocking,
-                mem::view::hip::detail::TaskCopyHip<dim::DimInt<1u>, TViewDst, TViewSrc, TExtent>>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto enqueue(
-                    queue::QueueHipRtNonBlocking & queue,
-                    mem::view::hip::detail::TaskCopyHip<dim::DimInt<1u>, TViewDst, TViewSrc, TExtent> const & task)
-                -> void
-                {
-                    ALPAKA_DEBUG_FULL_LOG_SCOPE;
-
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                    task.printDebug();
-#endif
-                    if(task.m_extentWidthBytes == 0)
-                    {
-                        return;
-                    }
-
-                    auto const & iDstDev(task.m_iDstDevice);
-                    auto const & iSrcDev(task.m_iSrcDevice);
-
-                    auto const & extentWidthBytes(task.m_extentWidthBytes);
-
-                    auto const & dstNativePtr(task.m_dstMemNative);
-                    auto const & srcNativePtr(task.m_srcMemNative);
-
-                    if(iDstDev == iSrcDev)
-                    {
-                        auto const & hipMemCpyKind(task.m_hipMemCpyKind);
-
-                        // Set the current device.
-                        ALPAKA_HIP_RT_CHECK(
-                            hipSetDevice(
-                                iDstDev));
-                        // Initiate the memory copy.
-                        ALPAKA_HIP_RT_CHECK(
-                            hipMemcpyAsync(
-                                dstNativePtr,
-                                srcNativePtr,
-                                static_cast<std::size_t>(extentWidthBytes),
-                                hipMemCpyKind,
-                                queue.m_spQueueImpl->m_HipQueue));
-                    }
-                    else
-                    {
-                        // Initiate the memory copy.
-                        ALPAKA_HIP_RT_CHECK(
-                            hipMemcpyPeerAsync(
-                                dstNativePtr,
-                                iDstDev,
-                                srcNativePtr,
-                                iSrcDev,
-                                static_cast<std::size_t>(extentWidthBytes),
-                                queue.m_spQueueImpl->m_HipQueue));
-                    }
-                }
-            };
-            //#############################################################################
-            //! The HIP blocking device queue 1D copy enqueue trait specialization.
-            template<
-                typename TExtent,
-                typename TViewSrc,
-                typename TViewDst>
-            struct Enqueue<
-                queue::QueueHipRtBlocking,
-                mem::view::hip::detail::TaskCopyHip<dim::DimInt<1u>, TViewDst, TViewSrc, TExtent>>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto enqueue(
-                    queue::QueueHipRtBlocking & queue,
-                    mem::view::hip::detail::TaskCopyHip<dim::DimInt<1u>, TViewDst, TViewSrc, TExtent> const & task)
-                -> void
-                {
-                    ALPAKA_DEBUG_FULL_LOG_SCOPE;
-
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                    task.printDebug();
-#endif
-                    if(task.m_extentWidthBytes == 0)
-                    {
-                        return;
-                    }
-                    auto const & iDstDev(task.m_iDstDevice);
-                    auto const & iSrcDev(task.m_iSrcDevice);
-
-                    auto const & extentWidthBytes(task.m_extentWidthBytes);
-
-                    auto const & dstNativePtr(task.m_dstMemNative);
-                    auto const & srcNativePtr(task.m_srcMemNative);
-
-                    if(iDstDev == iSrcDev)
-                    {
-                        auto const & hipMemCpyKind(task.m_hipMemCpyKind);
-
-                        // Set the current device.
-                        ALPAKA_HIP_RT_CHECK(
-                            hipSetDevice(
-                                iDstDev));
-                        // Initiate the memory copy.
-                        ALPAKA_HIP_RT_CHECK(
-                            hipMemcpyAsync(
-                                dstNativePtr,
-                                srcNativePtr,
-                                static_cast<std::size_t>(extentWidthBytes),
-                                hipMemCpyKind,
-                                queue.m_spQueueImpl->m_HipQueue));
-                    }
-                    else
-                    {
-                        // Initiate the memory copy.
-                        ALPAKA_HIP_RT_CHECK(
-                            hipMemcpyPeerAsync(
-                                dstNativePtr,
-                                iDstDev,
-                                srcNativePtr,
-                                iSrcDev,
-                                static_cast<std::size_t>(extentWidthBytes),
-                                queue.m_spQueueImpl->m_HipQueue));
-                    }
-
-                    ALPAKA_HIP_RT_CHECK( hipStreamSynchronize(
-                        queue.m_spQueueImpl->m_HipQueue));
-                }
-            };
-            //#############################################################################
-            //! The HIP non-blocking device queue 2D copy enqueue trait specialization.
-            template<
-                typename TExtent,
-                typename TViewSrc,
-                typename TViewDst>
-            struct Enqueue<
-                queue::QueueHipRtNonBlocking,
-                mem::view::hip::detail::TaskCopyHip<dim::DimInt<2u>, TViewDst, TViewSrc, TExtent>>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto enqueue(
-                    queue::QueueHipRtNonBlocking & queue,
-                    mem::view::hip::detail::TaskCopyHip<dim::DimInt<2u>, TViewDst, TViewSrc, TExtent> const & task)
-                -> void
-                {
-                    ALPAKA_DEBUG_FULL_LOG_SCOPE;
-
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                    task.printDebug();
-#endif
-                    // This is not only an optimization but also prevents a division by zero.
-                    if(task.m_extentWidthBytes == 0 || task.m_extentHeight == 0)
-                    {
-                        return;
-                    }
-                    auto const & iDstDev(task.m_iDstDevice);
-                    auto const & iSrcDev(task.m_iSrcDevice);
-
-                    auto const & extentWidthBytes(task.m_extentWidthBytes);
-                    auto const & extentHeight(task.m_extentHeight);
-
-                    auto const & dstPitchBytesX(task.m_dstpitchBytesX);
-                    auto const & srcPitchBytesX(task.m_srcpitchBytesX);
-
-                    auto const & dstNativePtr(task.m_dstMemNative);
-                    auto const & srcNativePtr(task.m_srcMemNative);
-
-                    auto const & hipMemCpyKind(task.m_hipMemCpyKind);
-
-                    // Set the current device.
-                    ALPAKA_HIP_RT_CHECK(
-                        hipSetDevice(
-                            iDstDev));
-
-                    if(iDstDev != iSrcDev)
-                    {
-                        // HIP relies on unified memory, so memcpy commands automatically do device-to-device transfers.
-                        // P2P access has to be enabled to avoid host transfer.
-                        // Checks if devices are connected via PCIe switch and enable P2P access then.
-                        alpaka::mem::view::hip::detail::enablePeerAccessIfPossible(iSrcDev, iDstDev);
-                    }
-
-                    // Initiate the memory copy.
-                    ALPAKA_HIP_RT_CHECK(
-                        hipMemcpy2DAsync(
-                            dstNativePtr,
-                            static_cast<std::size_t>(dstPitchBytesX),
-                            srcNativePtr,
-                            static_cast<std::size_t>(srcPitchBytesX),
-                            static_cast<std::size_t>(extentWidthBytes),
-                            static_cast<std::size_t>(extentHeight),
-                            hipMemCpyKind,
-                            queue.m_spQueueImpl->m_HipQueue));
-                }
-            };
-            //#############################################################################
-            //! The HIP blocking device queue 2D copy enqueue trait specialization.
-            template<
-                typename TExtent,
-                typename TViewSrc,
-                typename TViewDst>
-            struct Enqueue<
-                queue::QueueHipRtBlocking,
-                mem::view::hip::detail::TaskCopyHip<dim::DimInt<2u>, TViewDst, TViewSrc, TExtent>>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto enqueue(
-                    queue::QueueHipRtBlocking & queue,
-                    mem::view::hip::detail::TaskCopyHip<dim::DimInt<2u>, TViewDst, TViewSrc, TExtent> const & task)
-                -> void
-                {
-                    ALPAKA_DEBUG_FULL_LOG_SCOPE;
-
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                    task.printDebug();
-#endif
-                    // This is not only an optimization but also prevents a division by zero.
-                    if(task.m_extentWidthBytes == 0 || task.m_extentHeight == 0)
-                    {
-                        return;
-                    }
-                    auto const & iDstDev(task.m_iDstDevice);
-                    auto const & iSrcDev(task.m_iSrcDevice);
-
-                    auto const & extentWidthBytes(task.m_extentWidthBytes);
-                    auto const & extentHeight(task.m_extentHeight);
-
-                    auto const & dstPitchBytesX(task.m_dstpitchBytesX);
-                    auto const & srcPitchBytesX(task.m_srcpitchBytesX);
-
-                    auto const & dstNativePtr(task.m_dstMemNative);
-                    auto const & srcNativePtr(task.m_srcMemNative);
-
-                    auto const & hipMemCpyKind(task.m_hipMemCpyKind);
-
-                    // Set the current device.
-                    ALPAKA_HIP_RT_CHECK(
-                        hipSetDevice(
-                            iDstDev));
-
-                    if(iDstDev != iSrcDev)
-                    {
-                        // HIP relies on unified memory, so memcpy commands automatically do device-to-device transfers.
-                        // P2P access has to be enabled to avoid host transfer.
-                        // Checks if devices are connected via PCIe switch and enable P2P access then.
-                        alpaka::mem::view::hip::detail::enablePeerAccessIfPossible(iSrcDev, iDstDev);
-                    }
-
-                    ALPAKA_HIP_RT_CHECK(
-                        hipMemcpy2DAsync(
-                            dstNativePtr,
-                            static_cast<std::size_t>(dstPitchBytesX),
-                            srcNativePtr,
-                            static_cast<std::size_t>(srcPitchBytesX),
-                            static_cast<std::size_t>(extentWidthBytes),
-                            static_cast<std::size_t>(extentHeight),
-                            hipMemCpyKind,
-                            queue.m_spQueueImpl->m_HipQueue));
-
-                    ALPAKA_HIP_RT_CHECK( hipStreamSynchronize(
-                        queue.m_spQueueImpl->m_HipQueue));
-
-                }
-            };
-            //#############################################################################
-            //! The HIP non-blocking device queue 3D copy enqueue trait specialization.
-            template<
-                typename TExtent,
-                typename TViewSrc,
-                typename TViewDst>
-            struct Enqueue<
-                queue::QueueHipRtNonBlocking,
-                mem::view::hip::detail::TaskCopyHip<dim::DimInt<3u>, TViewDst, TViewSrc, TExtent>>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto enqueue(
-                    queue::QueueHipRtNonBlocking & queue,
-                    mem::view::hip::detail::TaskCopyHip<dim::DimInt<3u>, TViewDst, TViewSrc, TExtent> const & task)
-                -> void
-                {
-                    ALPAKA_DEBUG_FULL_LOG_SCOPE;
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                    task.printDebug();
-#endif
-                    // This is not only an optimization but also prevents a division by zero.
-                    if(task.m_extentWidthBytes == 0 || task.m_extentHeight == 0 || task.m_extentDepth == 0)
-                    {
-                        return;
-                    }
-                    auto const & iDstDev(task.m_iDstDevice);
-                    auto const & iSrcDev(task.m_iSrcDevice);
-
-                    // Create the struct describing the copy.
-                    hipMemcpy3DParms const hipMemCpy3DParms(
-                        mem::view::hip::detail::buildHipMemcpy3DParms(
-                            task));
-                    // Set the current device.
-                    ALPAKA_HIP_RT_CHECK(
-                        hipSetDevice(
-                            iDstDev));
-
-                    if(iDstDev != iSrcDev)
-                    {
-                        // HIP relies on unified memory, so memcpy commands automatically do device-to-device transfers.
-                        // P2P access has to be enabled to avoid host transfer.
-                        // Checks if devices are connected via PCIe switch and enable P2P access then.
-                        alpaka::mem::view::hip::detail::enablePeerAccessIfPossible(iSrcDev, iDstDev);
-                    }
-
-                    // Initiate the memory copy.
-                    ALPAKA_HIP_RT_CHECK(
-                        hipMemcpy3DAsync(
-                            &hipMemCpy3DParms,
-                            queue.m_spQueueImpl->m_HipQueue));
-                }
-            };
-            //#############################################################################
-            //! The HIP blocking device queue 3D copy enqueue trait specialization.
-            template<
-                typename TExtent,
-                typename TViewSrc,
-                typename TViewDst>
-            struct Enqueue<
-                queue::QueueHipRtBlocking,
-                mem::view::hip::detail::TaskCopyHip<dim::DimInt<3u>, TViewDst, TViewSrc, TExtent>>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto enqueue(
-                    queue::QueueHipRtBlocking & queue,
-                    mem::view::hip::detail::TaskCopyHip<dim::DimInt<3u>, TViewDst, TViewSrc, TExtent> const & task)
-                -> void
-                {
-                    ALPAKA_DEBUG_FULL_LOG_SCOPE;
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                    task.printDebug();
-#endif
-                    // This is not only an optimization but also prevents a division by zero.
-                    if(task.m_extentWidthBytes == 0 || task.m_extentHeight == 0 || task.m_extentDepth == 0)
-                    {
-                        return;
-                    }
-                    auto const & iDstDev(task.m_iDstDevice);
-                    auto const & iSrcDev(task.m_iSrcDevice);
-
-                    // Create the struct describing the copy.
-                    hipMemcpy3DParms const hipMemCpy3DParms(
-                        mem::view::hip::detail::buildHipMemcpy3DParms(
-                            task));
-                    // Set the current device.
-                    ALPAKA_HIP_RT_CHECK(
-                        hipSetDevice(
-                            iDstDev));
-
-                    if(iDstDev != iSrcDev)
-                    {
-                        // HIP relies on unified memory, so memcpy commands automatically do device-to-device transfers.
-                        // P2P access has to be enabled to avoid host transfer.
-                        // Checks if devices are connected via PCIe switch and enable P2P access then.
-                        alpaka::mem::view::hip::detail::enablePeerAccessIfPossible(iSrcDev, iDstDev);
-                    }
-
-                    // Initiate the memory copy.
-                    ALPAKA_HIP_RT_CHECK(
-                        hipMemcpy3DAsync(
-                            &hipMemCpy3DParms,
-                            queue.m_spQueueImpl->m_HipQueue));
-
-                    ALPAKA_HIP_RT_CHECK( hipStreamSynchronize(
-                        queue.m_spQueueImpl->m_HipQueue));
-                }
-            };
-        }
-    }
-}
-
-#endif
-
diff --git a/thirdParty/alpaka/include/alpaka/mem/buf/hip/Set.hpp b/thirdParty/alpaka/include/alpaka/mem/buf/hip/Set.hpp
deleted file mode 100644
index e28f17c4ec..0000000000
--- a/thirdParty/alpaka/include/alpaka/mem/buf/hip/Set.hpp
+++ /dev/null
@@ -1,541 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz, Erik Zenker, Matthias Werner, René Widera
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_HIP
-    #error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
-#endif
-
-#include <alpaka/queue/QueueHipRtBlocking.hpp>
-#include <alpaka/queue/QueueHipRtNonBlocking.hpp>
-
-#include <alpaka/dev/Traits.hpp>
-#include <alpaka/dim/DimIntegralConst.hpp>
-#include <alpaka/extent/Traits.hpp>
-#include <alpaka/mem/view/Traits.hpp>
-#include <alpaka/queue/Traits.hpp>
-
-#include <alpaka/core/Assert.hpp>
-#include <alpaka/core/Hip.hpp>
-
-
-namespace alpaka
-{
-    namespace dev
-    {
-        class DevHipRt;
-    }
-}
-
-namespace alpaka
-{
-    //-----------------------------------------------------------------------------
-    // Trait specializations for Set.
-    //-----------------------------------------------------------------------------
-    namespace mem
-    {
-        namespace view
-        {
-            namespace hip
-            {
-                namespace detail
-                {
-                    //#############################################################################
-                    //! The HIP memory set trait.
-                    template<
-                        typename TDim,
-                        typename TView,
-                        typename TExtent>
-                    struct TaskSetHip
-                    {
-                        //-----------------------------------------------------------------------------
-                        TaskSetHip(
-                            TView & view,
-                            std::uint8_t const & byte,
-                            TExtent const & extent) :
-                                m_view(view),
-                                m_byte(byte),
-                                m_extent(extent),
-                                m_iDevice(dev::getDev(view).m_iDevice)
-                        {
-                            static_assert(
-                                !std::is_const<TView>::value,
-                                "The destination view can not be const!");
-                            static_assert(
-                                dim::Dim<TView>::value == dim::Dim<TExtent>::value,
-                                "The destination view and the extent are required to have the same dimensionality!");
-                        }
-
-                        TView & m_view;
-                        std::uint8_t const m_byte;
-                        TExtent const m_extent;
-                        std::int32_t const m_iDevice;
-                    };
-                }
-            }
-            namespace traits
-            {
-                //#############################################################################
-                //! The HIP device memory set trait specialization.
-                template<
-                    typename TDim>
-                struct CreateTaskSet<
-                    TDim,
-                    dev::DevHipRt>
-                {
-                    //-----------------------------------------------------------------------------
-                    //!
-                    //-----------------------------------------------------------------------------
-                    template<
-                        typename TExtent,
-                        typename TView>
-                    ALPAKA_FN_HOST static auto createTaskSet(
-                        TView & view,
-                        std::uint8_t const & byte,
-                        TExtent const & extent)
-                    -> mem::view::hip::detail::TaskSetHip<
-                        TDim,
-                        TView,
-                        TExtent>
-                    {
-                        return
-                            mem::view::hip::detail::TaskSetHip<
-                                TDim,
-                                TView,
-                                TExtent>(
-                                    view,
-                                    byte,
-                                    extent);
-                    }
-                };
-            }
-        }
-    }
-    namespace queue
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The HIP non-blocking device queue 1D set enqueue trait specialization.
-            template<
-                typename TView,
-                typename TExtent>
-            struct Enqueue<
-                queue::QueueHipRtNonBlocking,
-                mem::view::hip::detail::TaskSetHip<dim::DimInt<1u>, TView, TExtent>>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto enqueue(
-                    queue::QueueHipRtNonBlocking & queue,
-                    mem::view::hip::detail::TaskSetHip<dim::DimInt<1u>, TView, TExtent> const & task)
-                -> void
-                {
-                    ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                    static_assert(
-                        dim::Dim<TView>::value == 1u,
-                        "The destination buffer is required to be 1-dimensional for this specialization!");
-                    static_assert(
-                        dim::Dim<TView>::value == dim::Dim<TExtent>::value,
-                        "The destination buffer and the extent are required to have the same dimensionality!");
-
-                    using Idx = idx::Idx<TExtent>;
-
-                    auto & view(task.m_view);
-                    auto const & byte(task.m_byte);
-                    auto const & extent(task.m_extent);
-                    auto const & iDevice(task.m_iDevice);
-
-                    auto const extentWidth(extent::getWidth(extent));
-                    if(extentWidth == 0)
-                    {
-                        return;
-                    }
-                    auto const extentWidthBytes(extentWidth * static_cast<Idx>(sizeof(elem::Elem<TView>)));
-#if !defined(NDEBUG)
-                    auto const dstWidth(extent::getWidth(view));
-#endif
-                    auto const dstNativePtr(reinterpret_cast<void *>(mem::view::getPtrNative(view)));
-                    ALPAKA_ASSERT(extentWidth <= dstWidth);
-
-                    // Set the current device.
-                    ALPAKA_HIP_RT_CHECK(
-                        hipSetDevice(
-                            iDevice));
-                    // Initiate the memory set.
-                    ALPAKA_HIP_RT_CHECK(
-                        hipMemsetAsync(
-                            dstNativePtr,
-                            static_cast<int>(byte),
-                            static_cast<size_t>(extentWidthBytes),
-                            queue.m_spQueueImpl->m_HipQueue));
-                }
-            };
-            //#############################################################################
-            //! The HIP blocking device queue 1D set enqueue trait specialization.
-            template<
-                typename TView,
-                typename TExtent>
-            struct Enqueue<
-                queue::QueueHipRtBlocking,
-                mem::view::hip::detail::TaskSetHip<dim::DimInt<1u>, TView, TExtent>>
-            {
-                //-----------------------------------------------------------------------------
-                //
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto enqueue(
-                    queue::QueueHipRtBlocking & queue,
-                    mem::view::hip::detail::TaskSetHip<dim::DimInt<1u>, TView, TExtent> const & task)
-                -> void
-                {
-                    ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                    static_assert(
-                        dim::Dim<TView>::value == 1u,
-                        "The destination buffer is required to be 1-dimensional for this specialization!");
-                    static_assert(
-                        dim::Dim<TView>::value == dim::Dim<TExtent>::value,
-                        "The destination buffer and the extent are required to have the same dimensionality!");
-
-                    using Idx = idx::Idx<TExtent>;
-
-                    auto & view(task.m_view);
-                    auto const & byte(task.m_byte);
-                    auto const & extent(task.m_extent);
-                    auto const & iDevice(task.m_iDevice);
-
-                    auto const extentWidth(extent::getWidth(extent));
-                    if(extentWidth == 0)
-                    {
-                        return;
-                    }
-                    auto const extentWidthBytes(extentWidth * static_cast<Idx>(sizeof(elem::Elem<TView>)));
-#if !defined(NDEBUG)
-                    auto const dstWidth(extent::getWidth(view));
-#endif
-                    auto const dstNativePtr(reinterpret_cast<void *>(mem::view::getPtrNative(view)));
-                    ALPAKA_ASSERT(extentWidth <= dstWidth);
-
-                    // Set the current device.
-                    ALPAKA_HIP_RT_CHECK(
-                        hipSetDevice(
-                            iDevice));
-                    // Initiate the memory set.
-                    ALPAKA_HIP_RT_CHECK(
-                        hipMemsetAsync(
-                            dstNativePtr,
-                            static_cast<int>(byte),
-                            static_cast<size_t>(extentWidthBytes),
-                            queue.m_spQueueImpl->m_HipQueue));
-
-                    ALPAKA_HIP_RT_CHECK( hipStreamSynchronize(
-                        queue.m_spQueueImpl->m_HipQueue));
-                }
-            };
-            //#############################################################################
-            //! The HIP non-blocking device queue 2D set enqueue trait specialization.
-            template<
-                typename TView,
-                typename TExtent>
-            struct Enqueue<
-                queue::QueueHipRtNonBlocking,
-                mem::view::hip::detail::TaskSetHip<dim::DimInt<2u>, TView, TExtent>>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto enqueue(
-                    queue::QueueHipRtNonBlocking & queue,
-                    mem::view::hip::detail::TaskSetHip<dim::DimInt<2u>, TView, TExtent> const & task)
-                -> void
-                {
-                    ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                    static_assert(
-                        dim::Dim<TView>::value == 2u,
-                        "The destination buffer is required to be 2-dimensional for this specialization!");
-                    static_assert(
-                        dim::Dim<TView>::value == dim::Dim<TExtent>::value,
-                        "The destination buffer and the extent are required to have the same dimensionality!");
-
-                    using Idx = idx::Idx<TExtent>;
-
-                    auto & view(task.m_view);
-                    auto const & byte(task.m_byte);
-                    auto const & extent(task.m_extent);
-                    auto const & iDevice(task.m_iDevice);
-
-                    auto const extentWidth(extent::getWidth(extent));
-                    auto const extentHeight(extent::getHeight(extent));
-                    if(extentWidth == 0 || extentHeight == 0)
-                    {
-                        return;
-                    }
-                    auto const extentWidthBytes(extentWidth * static_cast<Idx>(sizeof(elem::Elem<TView>)));
-#if !defined(NDEBUG)
-                    auto const dstWidth(extent::getWidth(view));
-                    auto const dstHeight(extent::getHeight(view));
-#endif
-                    auto const dstPitchBytesX(mem::view::getPitchBytes<dim::Dim<TView>::value - 1u>(view));
-                    auto const dstNativePtr(reinterpret_cast<void *>(mem::view::getPtrNative(view)));
-                    ALPAKA_ASSERT(extentWidth <= dstWidth);
-                    ALPAKA_ASSERT(extentHeight <= dstHeight);
-
-                    // Set the current device.
-                    ALPAKA_HIP_RT_CHECK(
-                        hipSetDevice(
-                            iDevice));
-                    // Initiate the memory set.
-                    ALPAKA_HIP_RT_CHECK(
-                        hipMemset2DAsync(
-                            dstNativePtr,
-                            static_cast<size_t>(dstPitchBytesX),
-                            static_cast<int>(byte),
-                            static_cast<size_t>(extentWidthBytes),
-                            static_cast<size_t>(extentHeight),
-                            queue.m_spQueueImpl->m_HipQueue));
-                }
-            };
-            //#############################################################################
-            //! The HIP blocking device queue 2D set enqueue trait specialization.
-            template<
-                typename TView,
-                typename TExtent>
-            struct Enqueue<
-                queue::QueueHipRtBlocking,
-                mem::view::hip::detail::TaskSetHip<dim::DimInt<2u>, TView, TExtent>>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto enqueue(
-                    queue::QueueHipRtBlocking & queue,
-                    mem::view::hip::detail::TaskSetHip<dim::DimInt<2u>, TView, TExtent> const & task)
-                -> void
-                {
-                    ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                    static_assert(
-                        dim::Dim<TView>::value == 2u,
-                        "The destination buffer is required to be 2-dimensional for this specialization!");
-                    static_assert(
-                        dim::Dim<TView>::value == dim::Dim<TExtent>::value,
-                        "The destination buffer and the extent are required to have the same dimensionality!");
-
-                    using Idx = idx::Idx<TExtent>;
-
-                    auto & view(task.m_view);
-                    auto const & byte(task.m_byte);
-                    auto const & extent(task.m_extent);
-                    auto const & iDevice(task.m_iDevice);
-
-                    auto const extentWidth(extent::getWidth(extent));
-                    auto const extentHeight(extent::getHeight(extent));
-                    if(extentWidth == 0 || extentHeight == 0)
-                    {
-                        return;
-                    }
-                    auto const extentWidthBytes(extentWidth * static_cast<Idx>(sizeof(elem::Elem<TView>)));
-#if !defined(NDEBUG)
-                    auto const dstWidth(extent::getWidth(view));
-                    auto const dstHeight(extent::getHeight(view));
-#endif
-                    auto const dstPitchBytesX(mem::view::getPitchBytes<dim::Dim<TView>::value - 1u>(view));
-                    auto const dstNativePtr(reinterpret_cast<void *>(mem::view::getPtrNative(view)));
-                    ALPAKA_ASSERT(extentWidth <= dstWidth);
-                    ALPAKA_ASSERT(extentHeight <= dstHeight);
-
-                    // Set the current device.
-                    ALPAKA_HIP_RT_CHECK(
-                        hipSetDevice(
-                            iDevice));
-                    // Initiate the memory set.
-                    ALPAKA_HIP_RT_CHECK(
-                        hipMemset2DAsync(
-                            dstNativePtr,
-                            static_cast<size_t>(dstPitchBytesX),
-                            static_cast<int>(byte),
-                            static_cast<size_t>(extentWidthBytes),
-                            static_cast<size_t>(extentHeight),
-                            queue.m_spQueueImpl->m_HipQueue));
-
-                    ALPAKA_HIP_RT_CHECK( hipStreamSynchronize(
-                        queue.m_spQueueImpl->m_HipQueue));
-                }
-            };
-            //#############################################################################
-            //! The HIP non-blocking device queue 3D set enqueue trait specialization.
-            template<
-                typename TView,
-                typename TExtent>
-            struct Enqueue<
-                queue::QueueHipRtNonBlocking,
-                mem::view::hip::detail::TaskSetHip<dim::DimInt<3u>, TView, TExtent>>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto enqueue(
-                    queue::QueueHipRtNonBlocking & queue,
-                    mem::view::hip::detail::TaskSetHip<dim::DimInt<3u>, TView, TExtent> const & task)
-                -> void
-                {
-                    ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-                    static_assert(
-                        dim::Dim<TView>::value == 3u,
-                        "The destination buffer is required to be 3-dimensional for this specialization!");
-                    static_assert(
-                        dim::Dim<TView>::value == dim::Dim<TExtent>::value,
-                        "The destination buffer and the extent are required to have the same dimensionality!");
-
-                    using Elem = alpaka::elem::Elem<TView>;
-                    using Idx = idx::Idx<TExtent>;
-
-                    auto & view(task.m_view);
-                    auto const & byte(task.m_byte);
-                    auto const & extent(task.m_extent);
-                    auto const & iDevice(task.m_iDevice);
-
-                    auto const extentWidth(extent::getWidth(extent));
-                    auto const extentHeight(extent::getHeight(extent));
-                    auto const extentDepth(extent::getDepth(extent));
-
-                    // This is not only an optimization but also prevents a division by zero.
-                    if(extentWidth == 0 || extentHeight == 0 || extentDepth == 0)
-                    {
-                        return;
-                    }
-
-                    auto const dstWidth(extent::getWidth(view));
-#if !defined(NDEBUG)
-                    auto const dstHeight(extent::getHeight(view));
-                    auto const dstDepth(extent::getDepth(view));
-#endif
-                    auto const dstPitchBytesX(mem::view::getPitchBytes<dim::Dim<TView>::value - 1u>(view));
-                    auto const dstPitchBytesY(mem::view::getPitchBytes<dim::Dim<TView>::value - (2u % dim::Dim<TView>::value)>(view));
-                    auto const dstNativePtr(reinterpret_cast<void *>(mem::view::getPtrNative(view)));
-                    ALPAKA_ASSERT(extentWidth <= dstWidth);
-                    ALPAKA_ASSERT(extentHeight <= dstHeight);
-                    ALPAKA_ASSERT(extentDepth <= dstDepth);
-
-                    // Fill HIP parameter structures.
-                    hipPitchedPtr const hipPitchedPtrVal(
-                        make_hipPitchedPtr(
-                            dstNativePtr,
-                            static_cast<size_t>(dstPitchBytesX),
-                            static_cast<size_t>(dstWidth * static_cast<Idx>(sizeof(Elem))),
-                            static_cast<size_t>(dstPitchBytesY/dstPitchBytesX)));
-
-                    hipExtent const hipExtentVal(
-                        make_hipExtent(
-                            static_cast<size_t>(extentWidth * static_cast<Idx>(sizeof(Elem))),
-                            static_cast<size_t>(extentHeight),
-                            static_cast<size_t>(extentDepth)));
-
-                    // Set the current device.
-                    ALPAKA_HIP_RT_CHECK(
-                        hipSetDevice(
-                            iDevice));
-                    // Initiate the memory set.
-                    ALPAKA_HIP_RT_CHECK(
-                        hipMemset3DAsync(
-                            hipPitchedPtrVal,
-                            static_cast<int>(byte),
-                            hipExtentVal,
-                            queue.m_spQueueImpl->m_HipQueue));
-                }
-            };
-            //#############################################################################
-            //! The HIP blocking device queue 3D set enqueue trait specialization.
-            template<
-                typename TView,
-                typename TExtent>
-            struct Enqueue<
-                queue::QueueHipRtBlocking,
-                mem::view::hip::detail::TaskSetHip<dim::DimInt<3u>, TView, TExtent>>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto enqueue(
-                    queue::QueueHipRtBlocking & queue,
-                    mem::view::hip::detail::TaskSetHip<dim::DimInt<3u>, TView, TExtent> const & task)
-                -> void
-                {
-                    ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-                    static_assert(
-                        dim::Dim<TView>::value == 3u,
-                        "The destination buffer is required to be 3-dimensional for this specialization!");
-                    static_assert(
-                        dim::Dim<TView>::value == dim::Dim<TExtent>::value,
-                        "The destination buffer and the extent are required to have the same dimensionality!");
-
-                    using Elem = alpaka::elem::Elem<TView>;
-                    using Idx = idx::Idx<TExtent>;
-
-                    auto & view(task.m_view);
-                    auto const & byte(task.m_byte);
-                    auto const & extent(task.m_extent);
-                    auto const & iDevice(task.m_iDevice);
-
-                    auto const extentWidth(extent::getWidth(extent));
-                    auto const extentHeight(extent::getHeight(extent));
-                    auto const extentDepth(extent::getDepth(extent));
-
-                    // This is not only an optimization but also prevents a division by zero.
-                    if(extentWidth == 0 || extentHeight == 0 || extentDepth == 0)
-                    {
-                        return;
-                    }
-
-                    auto const dstWidth(extent::getWidth(view));
-#if !defined(NDEBUG)
-                    auto const dstHeight(extent::getHeight(view));
-                    auto const dstDepth(extent::getDepth(view));
-#endif
-                    auto const dstPitchBytesX(mem::view::getPitchBytes<dim::Dim<TView>::value - 1u>(view));
-                    auto const dstPitchBytesY(mem::view::getPitchBytes<dim::Dim<TView>::value - (2u % dim::Dim<TView>::value)>(view));
-                    auto const dstNativePtr(reinterpret_cast<void *>(mem::view::getPtrNative(view)));
-                    ALPAKA_ASSERT(extentWidth <= dstWidth);
-                    ALPAKA_ASSERT(extentHeight <= dstHeight);
-                    ALPAKA_ASSERT(extentDepth <= dstDepth);
-
-                    // Fill HIP parameter structures.
-                    hipPitchedPtr const hipPitchedPtrVal(
-                        make_hipPitchedPtr(
-                            dstNativePtr,
-                            static_cast<size_t>(dstPitchBytesX),
-                            static_cast<size_t>(dstWidth * static_cast<Idx>(sizeof(Elem))),
-                            static_cast<size_t>(dstPitchBytesY/dstPitchBytesX)));
-
-                    hipExtent const hipExtentVal(
-                        make_hipExtent(
-                            static_cast<size_t>(extentWidth * static_cast<Idx>(sizeof(Elem))),
-                            static_cast<size_t>(extentHeight),
-                            static_cast<size_t>(extentDepth)));
-
-                    // Set the current device.
-                    ALPAKA_HIP_RT_CHECK(
-                        hipSetDevice(
-                            iDevice));
-                    // Initiate the memory set.
-                    ALPAKA_HIP_RT_CHECK(
-                        hipMemset3DAsync(
-                            hipPitchedPtrVal,
-                            static_cast<int>(byte),
-                            hipExtentVal,
-                            queue.m_spQueueImpl->m_HipQueue));
-
-                    ALPAKA_HIP_RT_CHECK( hipStreamSynchronize(
-                        queue.m_spQueueImpl->m_HipQueue));
-                }
-            };
-
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/alpaka/include/alpaka/mem/view/Traits.hpp b/thirdParty/alpaka/include/alpaka/mem/view/Traits.hpp
deleted file mode 100644
index d8a3753d00..0000000000
--- a/thirdParty/alpaka/include/alpaka/mem/view/Traits.hpp
+++ /dev/null
@@ -1,609 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Matthias Werner
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#include <alpaka/dev/Traits.hpp>
-#include <alpaka/dim/Traits.hpp>
-#include <alpaka/elem/Traits.hpp>
-#include <alpaka/extent/Traits.hpp>
-#include <alpaka/offset/Traits.hpp>
-#include <alpaka/queue/Traits.hpp>
-
-#include <alpaka/core/Common.hpp>
-#include <alpaka/core/Unused.hpp>
-#include <alpaka/meta/Fold.hpp>
-#include <alpaka/vec/Vec.hpp>
-
-#include <boost/config.hpp>
-
-#include <iosfwd>
-
-namespace alpaka
-{
-    namespace mem
-    {
-        //-----------------------------------------------------------------------------
-        //! The view specifics.
-        namespace view
-        {
-            //-----------------------------------------------------------------------------
-            //! The view traits.
-            namespace traits
-            {
-                //#############################################################################
-                //! The native pointer get trait.
-                template<
-                    typename TView,
-                    typename TSfinae = void>
-                struct GetPtrNative;
-
-                //#############################################################################
-                //! The pointer on device get trait.
-                template<
-                    typename TView,
-                    typename TDev,
-                    typename TSfinae = void>
-                struct GetPtrDev;
-
-                namespace detail
-                {
-                    //#############################################################################
-                    template<
-                        typename TIdx,
-                        typename TView,
-                        typename TSfinae = void>
-                    struct GetPitchBytesDefault;
-                }
-
-                //#############################################################################
-                //! The pitch in bytes.
-                //! This is the distance in bytes in the linear memory between two consecutive elements in the next higher dimension (TIdx-1).
-                //!
-                //! The default implementation uses the extent to calculate the pitch.
-                template<
-                    typename TIdx,
-                    typename TView,
-                    typename TSfinae = void>
-                struct GetPitchBytes
-                {
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST static auto getPitchBytes(
-                        TView const & view)
-                    -> idx::Idx<TView>
-                    {
-                        return detail::GetPitchBytesDefault<TIdx, TView>::getPitchBytesDefault(view);
-                    }
-                };
-
-                namespace detail
-                {
-                    //#############################################################################
-                    template<
-                        typename TIdx,
-                        typename TView>
-                    struct GetPitchBytesDefault<
-                        TIdx,
-                        TView,
-                        typename std::enable_if<TIdx::value < (dim::Dim<TView>::value - 1)>::type>
-                    {
-                        //-----------------------------------------------------------------------------
-                        ALPAKA_FN_HOST static auto getPitchBytesDefault(
-                            TView const & view)
-                        -> idx::Idx<TView>
-                        {
-                            return
-                                extent::getExtent<TIdx::value>(view)
-                                * GetPitchBytes<dim::DimInt<TIdx::value+1>, TView>::getPitchBytes(view);
-                        }
-                    };
-                    //#############################################################################
-                    template<
-                        typename TView>
-                    struct GetPitchBytesDefault<
-                        dim::DimInt<dim::Dim<TView>::value - 1u>,
-                        TView>
-                    {
-                        //-----------------------------------------------------------------------------
-                        ALPAKA_FN_HOST static auto getPitchBytesDefault(
-                            TView const & view)
-                        -> idx::Idx<TView>
-                        {
-                            return
-                                extent::getExtent<dim::Dim<TView>::value - 1u>(view)
-                                * sizeof(elem::Elem<TView>);
-                        }
-                    };
-                    //#############################################################################
-                    template<
-                        typename TView>
-                    struct GetPitchBytesDefault<
-                        dim::DimInt<dim::Dim<TView>::value>,
-                        TView>
-                    {
-                        //-----------------------------------------------------------------------------
-                        ALPAKA_FN_HOST static auto getPitchBytesDefault(
-                            TView const &)
-                        -> idx::Idx<TView>
-                        {
-                            return
-                                sizeof(elem::Elem<TView>);
-                        }
-                    };
-                }
-
-                //#############################################################################
-                //! The memory set task trait.
-                //!
-                //! Fills the view with data.
-                template<
-                    typename TDim,
-                    typename TDev,
-                    typename TSfinae = void>
-                struct CreateTaskSet;
-
-                //#############################################################################
-                //! The memory copy task trait.
-                //!
-                //! Copies memory from one view into another view possibly on a different device.
-                template<
-                    typename TDim,
-                    typename TDevDst,
-                    typename TDevSrc,
-                    typename TSfinae = void>
-                struct CreateTaskCopy;
-
-                //#############################################################################
-                //! The static device memory view creation trait.
-                template<
-                    typename TDev,
-                    typename TSfinae = void>
-                struct CreateStaticDevMemView;
-            }
-
-            //-----------------------------------------------------------------------------
-            //! Gets the native pointer of the memory view.
-            //!
-            //! \param view The memory view.
-            //! \return The native pointer.
-            template<
-                typename TView>
-            ALPAKA_FN_HOST auto getPtrNative(
-                TView const & view)
-            -> elem::Elem<TView> const *
-            {
-                return
-                    traits::GetPtrNative<
-                        TView>
-                    ::getPtrNative(
-                        view);
-            }
-            //-----------------------------------------------------------------------------
-            //! Gets the native pointer of the memory view.
-            //!
-            //! \param view The memory view.
-            //! \return The native pointer.
-            template<
-                typename TView>
-            ALPAKA_FN_HOST auto getPtrNative(
-                TView & view)
-            -> elem::Elem<TView> *
-            {
-                return
-                    traits::GetPtrNative<
-                        TView>
-                    ::getPtrNative(
-                        view);
-            }
-
-            //-----------------------------------------------------------------------------
-            //! Gets the pointer to the view on the given device.
-            //!
-            //! \param view The memory view.
-            //! \param dev The device.
-            //! \return The pointer on the device.
-            template<
-                typename TView,
-                typename TDev>
-            ALPAKA_FN_HOST auto getPtrDev(
-                TView const & view,
-                TDev const & dev)
-            -> elem::Elem<TView> const *
-            {
-                return
-                    traits::GetPtrDev<
-                        TView,
-                        TDev>
-                    ::getPtrDev(
-                        view,
-                        dev);
-            }
-            //-----------------------------------------------------------------------------
-            //! Gets the pointer to the view on the given device.
-            //!
-            //! \param view The memory view.
-            //! \param dev The device.
-            //! \return The pointer on the device.
-            template<
-                typename TView,
-                typename TDev>
-            ALPAKA_FN_HOST auto getPtrDev(
-                TView & view,
-                TDev const & dev)
-            -> elem::Elem<TView> *
-            {
-                return
-                    traits::GetPtrDev<
-                        TView,
-                        TDev>
-                    ::getPtrDev(
-                        view,
-                        dev);
-            }
-
-            //-----------------------------------------------------------------------------
-            //! \return The pitch in bytes. This is the distance in bytes between two consecutive elements in the given dimension.
-            ALPAKA_NO_HOST_ACC_WARNING
-            template<
-                std::size_t Tidx,
-                typename TView>
-            ALPAKA_FN_HOST_ACC
-            auto getPitchBytes(
-                TView const & view)
-            -> idx::Idx<TView>
-            {
-                return
-                    traits::GetPitchBytes<
-                        dim::DimInt<Tidx>,
-                        TView>
-                    ::getPitchBytes(
-                        view);
-            }
-
-            //-----------------------------------------------------------------------------
-            //! Create a memory set task.
-            //!
-            //! \param view The memory view to fill.
-            //! \param byte Value to set for each element of the specified view.
-            //! \param extent The extent of the view to fill.
-            template<
-                typename TExtent,
-                typename TView>
-            ALPAKA_FN_HOST auto createTaskSet(
-                TView & view,
-                std::uint8_t const & byte,
-                TExtent const & extent)
-#ifdef BOOST_NO_CXX14_RETURN_TYPE_DEDUCTION
-            -> decltype(
-                traits::CreateTaskSet<
-                    dim::Dim<TView>,
-                    dev::Dev<TView>>
-                ::createTaskSet(
-                    view,
-                    byte,
-                    extent))
-#endif
-            {
-                static_assert(
-                    dim::Dim<TView>::value == dim::Dim<TExtent>::value,
-                    "The view and the extent are required to have the same dimensionality!");
-
-                return
-                    traits::CreateTaskSet<
-                        dim::Dim<TView>,
-                        dev::Dev<TView>>
-                    ::createTaskSet(
-                        view,
-                        byte,
-                        extent);
-            }
-
-            //-----------------------------------------------------------------------------
-            //! Sets the memory to the given value.
-            //!
-            //! \param queue The queue to enqueue the view fill task into.
-            //! \param view The memory view to fill.
-            //! \param byte Value to set for each element of the specified view.
-            //! \param extent The extent of the view to fill.
-            template<
-                typename TExtent,
-                typename TView,
-                typename TQueue>
-            ALPAKA_FN_HOST auto set(
-                TQueue & queue,
-                TView & view,
-                std::uint8_t const & byte,
-                TExtent const & extent)
-            -> void
-            {
-                queue::enqueue(
-                    queue,
-                    mem::view::createTaskSet(
-                        view,
-                        byte,
-                        extent));
-            }
-
-            //-----------------------------------------------------------------------------
-            //! Creates a memory copy task.
-            //!
-            //! \param viewDst The destination memory view.
-            //! \param viewSrc The source memory view.
-            //! \param extent The extent of the view to copy.
-            template<
-                typename TExtent,
-                typename TViewSrc,
-                typename TViewDst>
-            ALPAKA_FN_HOST auto createTaskCopy(
-                TViewDst & viewDst,
-                TViewSrc const & viewSrc,
-                TExtent const & extent)
-#ifdef BOOST_NO_CXX14_RETURN_TYPE_DEDUCTION
-            -> decltype(
-                traits::CreateTaskCopy<
-                    dim::Dim<TViewDst>,
-                    dev::Dev<TViewDst>,
-                    dev::Dev<TViewSrc>>
-                ::createTaskCopy(
-                    viewDst,
-                    viewSrc,
-                    extent))
-#endif
-            {
-                static_assert(
-                    dim::Dim<TViewDst>::value == dim::Dim<TViewSrc>::value,
-                    "The source and the destination view are required to have the same dimensionality!");
-                static_assert(
-                    dim::Dim<TViewDst>::value == dim::Dim<TExtent>::value,
-                    "The destination view and the extent are required to have the same dimensionality!");
-                static_assert(
-                    std::is_same<elem::Elem<TViewDst>, typename std::remove_const<elem::Elem<TViewSrc>>::type>::value,
-                    "The source and the destination view are required to have the same element type!");
-
-                return
-                    traits::CreateTaskCopy<
-                        dim::Dim<TViewDst>,
-                        dev::Dev<TViewDst>,
-                        dev::Dev<TViewSrc>>
-                    ::createTaskCopy(
-                        viewDst,
-                        viewSrc,
-                        extent);
-            }
-
-            //-----------------------------------------------------------------------------
-            //! Copies memory possibly between different memory spaces.
-            //!
-            //! \param queue The queue to enqueue the view copy task into.
-            //! \param viewDst The destination memory view.
-            //! \param viewSrc The source memory view.
-            //! \param extent The extent of the view to copy.
-            template<
-                typename TExtent,
-                typename TViewSrc,
-                typename TViewDst,
-                typename TQueue>
-            ALPAKA_FN_HOST auto copy(
-                TQueue & queue,
-                TViewDst & viewDst,
-                TViewSrc const & viewSrc,
-                TExtent const & extent)
-            -> void
-            {
-                queue::enqueue(
-                    queue,
-                    mem::view::createTaskCopy(
-                        viewDst,
-                        viewSrc,
-                        extent));
-            }
-
-            namespace detail
-            {
-                //-----------------------------------------------------------------------------
-                template<
-                    typename TDim,
-                    typename TView>
-                struct Print
-                {
-                    ALPAKA_FN_HOST static auto print(
-                        TView const & view,
-                        elem::Elem<TView> const * const ptr,
-                        vec::Vec<dim::Dim<TView>, idx::Idx<TView>> const & extent,
-                        std::ostream & os,
-                        std::string const & elementSeparator,
-                        std::string const & rowSeparator,
-                        std::string const & rowPrefix,
-                        std::string const & rowSuffix)
-                    -> void
-                    {
-                        os << rowPrefix;
-
-                        auto const pitch(view::getPitchBytes<TDim::value+1u>(view));
-                        auto const lastIdx(extent[TDim::value]-1u);
-                        for(auto i(decltype(lastIdx)(0)); i<=lastIdx ;++i)
-                        {
-                            Print<
-                                dim::DimInt<TDim::value+1u>,
-                                TView>
-                            ::print(
-                                view,
-                                reinterpret_cast<elem::Elem<TView> const *>(reinterpret_cast<std::uint8_t const *>(ptr)+i*pitch),
-                                extent,
-                                os,
-                                elementSeparator,
-                                rowSeparator,
-                                rowPrefix,
-                                rowSuffix);
-
-                            // While we are not at the end of a row, add the row separator.
-                            if(i != lastIdx)
-                            {
-                                os << rowSeparator;
-                            }
-                        }
-
-                        os << rowSuffix;
-                    }
-                };
-                //-----------------------------------------------------------------------------
-                template<
-                    typename TView>
-                struct Print<
-                    dim::DimInt<dim::Dim<TView>::value-1u>,
-                    TView>
-                {
-                    ALPAKA_FN_HOST static auto print(
-                        TView const & view,
-                        elem::Elem<TView> const * const ptr,
-                        vec::Vec<dim::Dim<TView>, idx::Idx<TView>> const & extent,
-                        std::ostream & os,
-                        std::string const & elementSeparator,
-                        std::string const & rowSeparator,
-                        std::string const & rowPrefix,
-                        std::string const & rowSuffix)
-                    -> void
-                    {
-                        alpaka::ignore_unused(view);
-                        alpaka::ignore_unused(rowSeparator);
-
-                        os << rowPrefix;
-
-                        auto const lastIdx(extent[dim::Dim<TView>::value-1u]-1u);
-                        for(auto i(decltype(lastIdx)(0)); i<=lastIdx ;++i)
-                        {
-                            // Add the current element.
-                            os << *(ptr+i);
-
-                            // While we are not at the end of a line, add the element separator.
-                            if(i != lastIdx)
-                            {
-                                os << elementSeparator;
-                            }
-                        }
-
-                        os << rowSuffix;
-                    }
-                };
-            }
-            //-----------------------------------------------------------------------------
-            //! Prints the content of the view to the given queue.
-            // \TODO: Add precision flag.
-            // \TODO: Add column alignment flag.
-            template<
-                typename TView>
-            ALPAKA_FN_HOST auto print(
-                TView const & view,
-                std::ostream & os,
-                std::string const & elementSeparator = ", ",
-                std::string const & rowSeparator = "\n",
-                std::string const & rowPrefix = "[",
-                std::string const & rowSuffix = "]")
-            -> void
-            {
-                detail::Print<
-                    dim::DimInt<0u>,
-                    TView>
-                ::print(
-                    view,
-                    mem::view::getPtrNative(view),
-                    extent::getExtentVec(view),
-                    os,
-                    elementSeparator,
-                    rowSeparator,
-                    rowPrefix,
-                    rowSuffix);
-            }
-
-            namespace detail
-            {
-                //#############################################################################
-                //! A class with a create method that returns the pitch for each index.
-                template<
-                    std::size_t Tidx>
-                struct CreatePitchBytes
-                {
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_NO_HOST_ACC_WARNING
-                    template<
-                        typename TPitch>
-                    ALPAKA_FN_HOST_ACC
-                    static auto create(
-                        TPitch const & pitch)
-                    -> idx::Idx<TPitch>
-                    {
-                        return mem::view::getPitchBytes<Tidx>(pitch);
-                    }
-                };
-            }
-            //-----------------------------------------------------------------------------
-            //! \return The pitch vector.
-            template<
-                typename TPitch>
-            auto getPitchBytesVec(
-                TPitch const & pitch = TPitch())
-            -> vec::Vec<dim::Dim<TPitch>, idx::Idx<TPitch>>
-            {
-                return
-                    vec::createVecFromIndexedFnWorkaround<
-                        dim::Dim<TPitch>,
-                        idx::Idx<TPitch>,
-                        detail::CreatePitchBytes>(
-                            pitch);
-            }
-            //-----------------------------------------------------------------------------
-            //! \return The pitch but only the last N elements.
-            template<
-                typename TDim,
-                typename TPitch>
-            ALPAKA_FN_HOST auto getPitchBytesVecEnd(
-                TPitch const & pitch = TPitch())
-            -> vec::Vec<TDim, idx::Idx<TPitch>>
-            {
-                using IdxOffset = std::integral_constant<std::intmax_t, static_cast<std::intmax_t>(dim::Dim<TPitch>::value) - static_cast<std::intmax_t>(TDim::value)>;
-                return
-                    vec::createVecFromIndexedFnOffsetWorkaround<
-                        TDim,
-                        idx::Idx<TPitch>,
-                        detail::CreatePitchBytes,
-                        IdxOffset>(
-                            pitch);
-            }
-
-            //-----------------------------------------------------------------------------
-            //! \return A view to static device memory.
-            template<
-                typename TElem,
-                typename TDev,
-                typename TExtent>
-            auto createStaticDevMemView(
-                TElem * pMem,
-                TDev const & dev,
-                TExtent const & extent)
-#ifdef BOOST_NO_CXX14_RETURN_TYPE_DEDUCTION
-            -> decltype(
-                traits::CreateStaticDevMemView<
-                        TDev>
-                    ::createStaticDevMemView(
-                        pMem,
-                        dev,
-                        extent))
-#endif
-            {
-                return
-                    traits::CreateStaticDevMemView<
-                        TDev>
-                    ::createStaticDevMemView(
-                        pMem,
-                        dev,
-                        extent);
-            }
-        }
-    }
-}
diff --git a/thirdParty/alpaka/include/alpaka/mem/view/ViewCompileTimeArray.hpp b/thirdParty/alpaka/include/alpaka/mem/view/ViewCompileTimeArray.hpp
deleted file mode 100644
index 3cf6615797..0000000000
--- a/thirdParty/alpaka/include/alpaka/mem/view/ViewCompileTimeArray.hpp
+++ /dev/null
@@ -1,220 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#include <alpaka/core/Common.hpp>
-#include <alpaka/core/Unused.hpp>
-#include <alpaka/dev/DevCpu.hpp>
-#include <alpaka/mem/buf/Traits.hpp>
-#include <alpaka/pltf/PltfCpu.hpp>
-
-#include <type_traits>
-
-namespace alpaka
-{
-    //-----------------------------------------------------------------------------
-    // Trait specializations for fixed idx arrays.
-    //
-    // This allows the usage of multidimensional compile time arrays e.g. int[4][3] as argument to memory ops.
-    /*namespace dev
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The fixed idx array device type trait specialization.
-            template<
-                typename TFixedSizeArray>
-            struct DevType<
-                TFixedSizeArray,
-                typename std::enable_if<std::is_array<TFixedSizeArray>::value>::type>
-            {
-                using type = dev::DevCpu;
-            };
-
-            //#############################################################################
-            //! The fixed idx array device get trait specialization.
-            template<
-                typename TFixedSizeArray>
-            struct GetDev<
-                TFixedSizeArray,
-                typename std::enable_if<std::is_array<TFixedSizeArray>::value>::type>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto getDev(
-                    TFixedSizeArray const & view)
-                -> dev::DevCpu
-                {
-                    // \FIXME: CUDA device?
-                    return pltf::getDevByIdx<pltf::PltfCpu>(0u);
-                }
-            };
-        }
-    }
-    namespace dim
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The fixed idx array dimension getter trait specialization.
-            template<
-                typename TFixedSizeArray>
-            struct DimType<
-                TFixedSizeArray,
-                typename std::enable_if<std::is_array<TFixedSizeArray>::value>::type>
-            {
-                using type = dim::DimInt<std::rank<TFixedSizeArray>::value>;
-            };
-        }
-    }
-    namespace elem
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The fixed idx array memory element type get trait specialization.
-            template<
-                typename TFixedSizeArray>
-            struct ElemType<
-                TFixedSizeArray,
-                typename std::enable_if<
-                    std::is_array<TFixedSizeArray>::value>::type>
-            {
-                using type = typename std::remove_all_extent<TFixedSizeArray>::type;
-            };
-        }
-    }
-    namespace extent
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The fixed idx array width get trait specialization.
-            template<
-                typename TIdxIntegralConst,
-                typename TFixedSizeArray>
-            struct GetExtent<
-                TIdxIntegralConst,
-                TFixedSizeArray,
-                typename std::enable_if<
-                    std::is_array<TFixedSizeArray>::value
-                    && (std::rank<TFixedSizeArray>::value > TIdxIntegralConst::value)
-                    && (std::extent<TFixedSizeArray, TIdxIntegralConst::value>::value > 0u)>::type>
-            {
-                //-----------------------------------------------------------------------------
-                static constexpr auto getExtent(
-                    TFixedSizeArray const & //extent
-                )
-                -> idx::Idx<TFixedSizeArray>
-                {
-                    // C++14 constexpr with void return
-                    //alpaka::ignore_unused(extent);
-                    return std::extent<TFixedSizeArray, TIdxIntegralConst::value>::value;
-                }
-            };
-        }
-    }
-    namespace mem
-    {
-        namespace view
-        {
-            namespace traits
-            {
-                //#############################################################################
-                //! The fixed idx array native pointer get trait specialization.
-                template<
-                    typename TFixedSizeArray>
-                struct GetPtrNative<
-                    TFixedSizeArray,
-                    typename std::enable_if<
-                        std::is_array<TFixedSizeArray>::value>::type>
-                {
-                    using TElem = typename std::remove_all_extent<TFixedSizeArray>::type;
-
-                    //-----------------------------------------------------------------------------
-                    static auto getPtrNative(
-                        TFixedSizeArray const & view)
-                    -> TElem const *
-                    {
-                        return view;
-                    }
-                    //-----------------------------------------------------------------------------
-                    static auto getPtrNative(
-                        TFixedSizeArray & view)
-                    -> TElem *
-                    {
-                        return view;
-                    }
-                };
-
-                //#############################################################################
-                //! The fixed idx array pitch get trait specialization.
-                template<
-                    typename TFixedSizeArray>
-                struct GetPitchBytes<
-                    dim::DimInt<std::rank<TFixedSizeArray>::value - 1u>,
-                    TFixedSizeArray,
-                    typename std::enable_if<
-                        std::is_array<TFixedSizeArray>::value
-                        && (std::extent<TFixedSizeArray, std::rank<TFixedSizeArray>::value - 1u>::value > 0u)>::type>
-                {
-                    using TElem = typename std::remove_all_extent<TFixedSizeArray>::type;
-
-                    //-----------------------------------------------------------------------------
-                    static constexpr auto getPitchBytes(
-                        TFixedSizeArray const &)
-                    -> idx::Idx<TFixedSizeArray>
-                    {
-                        return sizeof(TElem) * std::extent<TFixedSizeArray, std::rank<TFixedSizeArray>::value - 1u>::value;
-                    }
-                };
-            }
-        }
-    }
-    namespace offset
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The fixed idx array offset get trait specialization.
-            template<
-                typename TIdx,
-                typename TFixedSizeArray>
-            struct GetOffset<
-                TIdx,
-                TFixedSizeArray,
-                typename std::enable_if<std::is_array<TFixedSizeArray>::value>::type>
-            {
-                //-----------------------------------------------------------------------------
-                static auto getOffset(
-                    TFixedSizeArray const &)
-                -> idx::Idx<TFixedSizeArray>
-                {
-                    return 0u;
-                }
-            };
-        }
-    }
-    namespace idx
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The std::vector idx type trait specialization.
-            template<
-                typename TFixedSizeArray>
-            struct IdxType<
-                TFixedSizeArray,
-                typename std::enable_if<std::is_array<TFixedSizeArray>::value>::type>
-            {
-                using type = std::size_t;
-            };
-        }
-    }*/
-}
diff --git a/thirdParty/alpaka/include/alpaka/mem/view/ViewPlainPtr.hpp b/thirdParty/alpaka/include/alpaka/mem/view/ViewPlainPtr.hpp
deleted file mode 100644
index 19ddb1cab3..0000000000
--- a/thirdParty/alpaka/include/alpaka/mem/view/ViewPlainPtr.hpp
+++ /dev/null
@@ -1,435 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz, Matthias Werner, René Widera
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#include <alpaka/mem/view/Traits.hpp>
-
-#include <alpaka/vec/Vec.hpp>
-#include <alpaka/dev/DevCpu.hpp>
-#include <alpaka/dev/DevCudaRt.hpp>
-#include <alpaka/dev/DevHipRt.hpp>
-
-namespace alpaka
-{
-    namespace mem
-    {
-        namespace view
-        {
-            //#############################################################################
-            //! The memory view to wrap plain pointers.
-            template<
-                typename TDev,
-                typename TElem,
-                typename TDim,
-                typename TIdx>
-            class ViewPlainPtr final
-            {
-                static_assert(
-                    !std::is_const<TIdx>::value,
-                    "The idx type of the view can not be const!");
-            public:
-                //-----------------------------------------------------------------------------
-                template<
-                    typename TExtent>
-                ALPAKA_FN_HOST ViewPlainPtr(
-                    TElem * pMem,
-                    TDev const & dev,
-                    TExtent const & extent = TExtent()) :
-                        m_pMem(pMem),
-                        m_dev(dev),
-                        m_extentElements(extent::getExtentVecEnd<TDim>(extent)),
-                        m_pitchBytes(calculatePitchesFromExtents(m_extentElements))
-                {}
-
-                //-----------------------------------------------------------------------------
-                template<
-                    typename TExtent,
-                    typename TPitch>
-                ALPAKA_FN_HOST ViewPlainPtr(
-                    TElem * pMem,
-                    TDev const dev,
-                    TExtent const & extent,
-                    TPitch const & pitchBytes) :
-                        m_pMem(pMem),
-                        m_dev(dev),
-                        m_extentElements(extent::getExtentVecEnd<TDim>(extent)),
-                        m_pitchBytes(
-                            vec::subVecEnd<TDim>(
-                               static_cast<
-                                    vec::Vec<TDim, TIdx> >(pitchBytes)
-                            )
-                        )
-                {}
-
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST
-                ViewPlainPtr(ViewPlainPtr const &) = default;
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST
-                ViewPlainPtr(ViewPlainPtr && other) :
-                        m_pMem(other.m_pMem),
-                        m_dev(other.m_dev),
-                        m_extentElements(other.m_extentElements),
-                        m_pitchBytes(other.m_pitchBytes)
-                {
-                }
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST
-                auto operator=(ViewPlainPtr const &) -> ViewPlainPtr & = delete;
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST
-                auto operator=(ViewPlainPtr &&) -> ViewPlainPtr & = delete;
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST ~ViewPlainPtr() = default;
-
-            private:
-                //-----------------------------------------------------------------------------
-                //! Calculate the pitches purely from the extents.
-                template<
-                    typename TExtent>
-                ALPAKA_FN_HOST static auto calculatePitchesFromExtents(
-                    TExtent const & extent)
-                -> vec::Vec<TDim, TIdx>
-                {
-                    vec::Vec<TDim, TIdx> pitchBytes(vec::Vec<TDim, TIdx>::all(0));
-                    pitchBytes[TDim::value - 1u] = extent[TDim::value - 1u] * static_cast<TIdx>(sizeof(TElem));
-                    for(TIdx i = TDim::value - 1u; i > static_cast<TIdx>(0u); --i)
-                    {
-                        pitchBytes[i-1] = extent[i-1] * pitchBytes[i];
-                    }
-                    return pitchBytes;
-                }
-
-            public:
-                TElem * const m_pMem;
-                TDev const m_dev;
-                vec::Vec<TDim, TIdx> const m_extentElements;
-                vec::Vec<TDim, TIdx> const m_pitchBytes;
-            };
-        }
-    }
-
-    //-----------------------------------------------------------------------------
-    // Trait specializations for ViewPlainPtr.
-    namespace dev
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The ViewPlainPtr device type trait specialization.
-            template<
-                typename TDev,
-                typename TElem,
-                typename TDim,
-                typename TIdx>
-            struct DevType<
-                mem::view::ViewPlainPtr<TDev, TElem, TDim, TIdx>>
-            {
-                using type = TDev;
-            };
-
-            //#############################################################################
-            //! The ViewPlainPtr device get trait specialization.
-            template<
-                typename TDev,
-                typename TElem,
-                typename TDim,
-                typename TIdx>
-            struct GetDev<
-                mem::view::ViewPlainPtr<TDev, TElem, TDim, TIdx>>
-            {
-                static auto getDev(
-                    mem::view::ViewPlainPtr<TDev, TElem, TDim, TIdx> const & view)
-                    -> TDev
-                {
-                    return view.m_dev;
-                }
-            };
-        }
-    }
-    namespace dim
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The ViewPlainPtr dimension getter trait.
-            template<
-                typename TDev,
-                typename TElem,
-                typename TDim,
-                typename TIdx>
-            struct DimType<
-                mem::view::ViewPlainPtr<TDev, TElem, TDim, TIdx>>
-            {
-                using type = TDim;
-            };
-        }
-    }
-    namespace elem
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The ViewPlainPtr memory element type get trait specialization.
-            template<
-                typename TDev,
-                typename TElem,
-                typename TDim,
-                typename TIdx>
-            struct ElemType<
-                mem::view::ViewPlainPtr<TDev, TElem, TDim, TIdx>>
-            {
-                using type = TElem;
-            };
-        }
-    }
-    namespace extent
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The ViewPlainPtr width get trait specialization.
-            template<
-                typename TIdxIntegralConst,
-                typename TDev,
-                typename TElem,
-                typename TDim,
-                typename TIdx>
-            struct GetExtent<
-                TIdxIntegralConst,
-                mem::view::ViewPlainPtr<TDev, TElem, TDim, TIdx>,
-                typename std::enable_if<(TDim::value > TIdxIntegralConst::value)>::type>
-            {
-                ALPAKA_NO_HOST_ACC_WARNING
-                ALPAKA_FN_HOST_ACC
-                static auto getExtent(
-                    mem::view::ViewPlainPtr<TDev, TElem, TDim, TIdx> const & extent)
-                -> TIdx
-                {
-                    return extent.m_extentElements[TIdxIntegralConst::value];
-                }
-            };
-        }
-    }
-    namespace mem
-    {
-        namespace view
-        {
-            namespace traits
-            {
-                //#############################################################################
-                //! The ViewPlainPtr native pointer get trait specialization.
-                template<
-                    typename TDev,
-                    typename TElem,
-                    typename TDim,
-                    typename TIdx>
-                struct GetPtrNative<
-                    mem::view::ViewPlainPtr<TDev, TElem, TDim, TIdx>>
-                {
-                    static auto getPtrNative(
-                        mem::view::ViewPlainPtr<TDev, TElem, TDim, TIdx> const & view)
-                    -> TElem const *
-                    {
-                        return view.m_pMem;
-                    }
-                    static auto getPtrNative(
-                        mem::view::ViewPlainPtr<TDev, TElem, TDim, TIdx> & view)
-                    -> TElem *
-                    {
-                        return view.m_pMem;
-                    }
-                };
-
-                //#############################################################################
-                //! The ViewPlainPtr memory pitch get trait specialization.
-                template<
-                    typename TIdxIntegralConst,
-                    typename TDev,
-                    typename TElem,
-                    typename TDim,
-                    typename TIdx>
-                struct GetPitchBytes<
-                    TIdxIntegralConst,
-                    mem::view::ViewPlainPtr<TDev, TElem, TDim, TIdx>,
-                    typename std::enable_if<TIdxIntegralConst::value < TDim::value>::type>
-                {
-                    ALPAKA_FN_HOST static auto getPitchBytes(
-                        mem::view::ViewPlainPtr<TDev, TElem, TDim, TIdx> const & view)
-                    -> TIdx
-                    {
-                        return view.m_pitchBytes[TIdxIntegralConst::value];
-                    }
-                };
-
-                //#############################################################################
-                //! The CPU device CreateStaticDevMemView trait specialization.
-                template<>
-                struct CreateStaticDevMemView<
-                    dev::DevCpu>
-                {
-                    //-----------------------------------------------------------------------------
-                    template<
-                        typename TElem,
-                        typename TExtent>
-                    static auto createStaticDevMemView(
-                        TElem * pMem,
-                        dev::DevCpu const & dev,
-                        TExtent const & extent)
-#ifdef BOOST_NO_CXX14_RETURN_TYPE_DEDUCTION
-                    -> alpaka::mem::view::ViewPlainPtr<dev::DevCpu, TElem, alpaka::dim::Dim<TExtent>, alpaka::idx::Idx<TExtent>>
-#endif
-                    {
-                        return
-                            alpaka::mem::view::ViewPlainPtr<
-                                dev::DevCpu,
-                                TElem,
-                                alpaka::dim::Dim<TExtent>,
-                                alpaka::idx::Idx<TExtent>>(
-                                    pMem,
-                                    dev,
-                                    extent);
-                    }
-                };
-
-#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
-                //#############################################################################
-                //! The CUDA RT device CreateStaticDevMemView trait specialization.
-                template<>
-                struct CreateStaticDevMemView<
-                    dev::DevCudaRt>
-                {
-                    //-----------------------------------------------------------------------------
-                    template<
-                        typename TElem,
-                        typename TExtent>
-                    static auto createStaticDevMemView(
-                        TElem * pMem,
-                        dev::DevCudaRt const & dev,
-                        TExtent const & extent)
-#ifdef BOOST_NO_CXX14_RETURN_TYPE_DEDUCTION
-                    -> alpaka::mem::view::ViewPlainPtr<dev::DevCudaRt, TElem, alpaka::dim::Dim<TExtent>, alpaka::idx::Idx<TExtent>>
-#endif
-                    {
-                        TElem* pMemAcc(nullptr);
-                        ALPAKA_CUDA_RT_CHECK(
-                            cudaGetSymbolAddress(
-                                reinterpret_cast<void **>(&pMemAcc),
-                                *pMem));
-                        return
-                            alpaka::mem::view::ViewPlainPtr<
-                                dev::DevCudaRt,
-                                TElem,
-                                alpaka::dim::Dim<TExtent>,
-                                alpaka::idx::Idx<TExtent>>(
-                                    pMemAcc,
-                                    dev,
-                                    extent);
-                    }
-                };
-#endif
-
-#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
-                //#############################################################################
-                //! The HIP RT device CreateStaticDevMemView trait specialization.
-                template<>
-                struct CreateStaticDevMemView<
-                    dev::DevHipRt>
-                {
-                    //-----------------------------------------------------------------------------
-                    template<
-                        typename TElem,
-                        typename TExtent>
-                    static auto createStaticDevMemView(
-                        TElem * pMem,
-                        dev::DevHipRt const & dev,
-                        TExtent const & extent)
-#ifdef BOOST_NO_CXX14_RETURN_TYPE_DEDUCTION
-                    -> alpaka::mem::view::ViewPlainPtr<dev::DevHipRt, TElem, alpaka::dim::Dim<TExtent>, alpaka::idx::Idx<TExtent>>
-#endif
-                    {
-                        TElem* pMemAcc(nullptr);
-#ifdef __HIP_PLATFORM_NVCC__
-                        ALPAKA_HIP_RT_CHECK(hipCUDAErrorTohipError(
-                            cudaGetSymbolAddress(
-                                reinterpret_cast<void **>(&pMemAcc),
-                                *pMem)));
-#else
-                        // FIXME: still does not work in HIP(HCC) (results in hipErrorNotFound)
-                        // HIP_SYMBOL(X) not useful because it only does #X on HIP(HCC), while &X on HIP(NVCC)
-                        ALPAKA_HIP_RT_CHECK(
-                            hipGetSymbolAddress(
-                                reinterpret_cast<void **>(&pMemAcc),
-                                pMem));
-#endif
-
-                        return
-                            alpaka::mem::view::ViewPlainPtr<
-                                dev::DevHipRt,
-                                TElem,
-                                alpaka::dim::Dim<TExtent>,
-                                alpaka::idx::Idx<TExtent>>(
-                                    pMemAcc,
-                                    dev,
-                                    extent);
-                    }
-                };
-#endif
-
-            }
-        }
-    }
-    namespace offset
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The ViewPlainPtr offset get trait specialization.
-            template<
-                typename TIdxIntegralConst,
-                typename TDev,
-                typename TElem,
-                typename TDim,
-                typename TIdx>
-            struct GetOffset<
-                TIdxIntegralConst,
-                mem::view::ViewPlainPtr<TDev, TElem, TDim, TIdx>>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_NO_HOST_ACC_WARNING
-                ALPAKA_FN_HOST_ACC
-                static auto getOffset(
-                    mem::view::ViewPlainPtr<TDev, TElem, TDim, TIdx> const &)
-                -> TIdx
-                {
-                    return 0u;
-                }
-            };
-        }
-    }
-    namespace idx
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The ViewPlainPtr idx type trait specialization.
-            template<
-                typename TDev,
-                typename TElem,
-                typename TDim,
-                typename TIdx>
-            struct IdxType<
-                mem::view::ViewPlainPtr<TDev, TElem, TDim, TIdx>>
-            {
-                using type = TIdx;
-            };
-        }
-    }
-}
diff --git a/thirdParty/alpaka/include/alpaka/mem/view/ViewStdArray.hpp b/thirdParty/alpaka/include/alpaka/mem/view/ViewStdArray.hpp
deleted file mode 100644
index 561f479b86..0000000000
--- a/thirdParty/alpaka/include/alpaka/mem/view/ViewStdArray.hpp
+++ /dev/null
@@ -1,203 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#include <alpaka/core/Common.hpp>
-#include <alpaka/core/Unused.hpp>
-#include <alpaka/dev/DevCpu.hpp>
-#include <alpaka/mem/view/Traits.hpp>
-#include <alpaka/pltf/PltfCpu.hpp>
-
-#include <array>
-
-namespace alpaka
-{
-    namespace dev
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The std::array device type trait specialization.
-            template<
-                typename TElem,
-                std::size_t Tsize>
-            struct DevType<
-                std::array<TElem, Tsize>>
-            {
-                using type = dev::DevCpu;
-            };
-
-            //#############################################################################
-            //! The std::array device get trait specialization.
-            template<
-                typename TElem,
-                std::size_t Tsize>
-            struct GetDev<
-                std::array<TElem, Tsize>>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto getDev(
-                    std::array<TElem, Tsize> const & view)
-                -> dev::DevCpu
-                {
-                    alpaka::ignore_unused(view);
-                    return pltf::getDevByIdx<pltf::PltfCpu>(0u);
-                }
-            };
-        }
-    }
-    namespace dim
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The std::array dimension getter trait specialization.
-            template<
-                typename TElem,
-                std::size_t Tsize>
-            struct DimType<
-                std::array<TElem, Tsize>>
-            {
-                using type = dim::DimInt<1u>;
-            };
-        }
-    }
-    namespace elem
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The std::array memory element type get trait specialization.
-            template<
-                typename TElem,
-                std::size_t Tsize>
-            struct ElemType<
-                std::array<TElem, Tsize>>
-            {
-                using type = TElem;
-            };
-        }
-    }
-    namespace extent
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The std::array width get trait specialization.
-            template<
-                typename TElem,
-                std::size_t Tsize>
-            struct GetExtent<
-                dim::DimInt<0u>,
-                std::array<TElem, Tsize>>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static constexpr auto getExtent(
-                    std::array<TElem, Tsize> const & /*extent*/)
-                -> idx::Idx<std::array<TElem, Tsize>>
-                {
-                    // C++14 constexpr with void return
-                    /*alpaka::ignore_unused(extent);*/
-                    return Tsize;
-                }
-            };
-        }
-    }
-    namespace mem
-    {
-        namespace view
-        {
-            namespace traits
-            {
-                //#############################################################################
-                //! The std::array native pointer get trait specialization.
-                template<
-                    typename TElem,
-                    std::size_t Tsize>
-                struct GetPtrNative<
-                    std::array<TElem, Tsize>>
-                {
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST static auto getPtrNative(
-                        std::array<TElem, Tsize> const & view)
-                    -> TElem const *
-                    {
-                        return view.data();
-                    }
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST static auto getPtrNative(
-                        std::array<TElem, Tsize> & view)
-                    -> TElem *
-                    {
-                        return view.data();
-                    }
-                };
-
-                //#############################################################################
-                //! The std::array pitch get trait specialization.
-                template<
-                    typename TElem,
-                    std::size_t Tsize>
-                struct GetPitchBytes<
-                    dim::DimInt<0u>,
-                    std::array<TElem, Tsize>>
-                {
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST static auto getPitchBytes(
-                        std::array<TElem, Tsize> const & pitch)
-                    -> idx::Idx<std::array<TElem, Tsize>>
-                    {
-                        return sizeof(TElem) * pitch.size();
-                    }
-                };
-            }
-        }
-    }
-    namespace offset
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The std::array offset get trait specialization.
-            template<
-                typename TIdx,
-                typename TElem,
-                std::size_t Tsize>
-            struct GetOffset<
-                TIdx,
-                std::array<TElem, Tsize>>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto getOffset(
-                    std::array<TElem, Tsize> const &)
-                -> idx::Idx<std::array<TElem, Tsize>>
-                {
-                    return 0u;
-                }
-            };
-        }
-    }
-    namespace idx
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The std::vector idx type trait specialization.
-            template<
-                typename TElem,
-                std::size_t Tsize>
-            struct IdxType<
-                std::array<TElem, Tsize>>
-            {
-                using type = std::size_t;
-            };
-        }
-    }
-}
diff --git a/thirdParty/alpaka/include/alpaka/mem/view/ViewStdVector.hpp b/thirdParty/alpaka/include/alpaka/mem/view/ViewStdVector.hpp
deleted file mode 100644
index 860108261f..0000000000
--- a/thirdParty/alpaka/include/alpaka/mem/view/ViewStdVector.hpp
+++ /dev/null
@@ -1,201 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#include <alpaka/core/Common.hpp>
-#include <alpaka/core/Unused.hpp>
-#include <alpaka/dev/DevCpu.hpp>
-#include <alpaka/mem/view/Traits.hpp>
-#include <alpaka/pltf/PltfCpu.hpp>
-
-#include <vector>
-
-namespace alpaka
-{
-    namespace dev
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The std::vector device type trait specialization.
-            template<
-                typename TElem,
-                typename TAllocator>
-            struct DevType<
-                std::vector<TElem, TAllocator>>
-            {
-                using type = dev::DevCpu;
-            };
-
-            //#############################################################################
-            //! The std::vector device get trait specialization.
-            template<
-                typename TElem,
-                typename TAllocator>
-            struct GetDev<
-                std::vector<TElem, TAllocator>>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto getDev(
-                    std::vector<TElem, TAllocator> const & view)
-                -> dev::DevCpu
-                {
-                    alpaka::ignore_unused(view);
-                    return pltf::getDevByIdx<pltf::PltfCpu>(0u);
-                }
-            };
-        }
-    }
-    namespace dim
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The std::vector dimension getter trait specialization.
-            template<
-                typename TElem,
-                typename TAllocator>
-            struct DimType<
-                std::vector<TElem, TAllocator>>
-            {
-                using type = dim::DimInt<1u>;
-            };
-        }
-    }
-    namespace elem
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The std::vector memory element type get trait specialization.
-            template<
-                typename TElem,
-                typename TAllocator>
-            struct ElemType<
-                std::vector<TElem, TAllocator>>
-            {
-                using type = TElem;
-            };
-        }
-    }
-    namespace extent
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The std::vector width get trait specialization.
-            template<
-                typename TElem,
-                typename TAllocator>
-            struct GetExtent<
-                dim::DimInt<0u>,
-                std::vector<TElem, TAllocator>>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto getExtent(
-                    std::vector<TElem, TAllocator> const & extent)
-                -> idx::Idx<std::vector<TElem, TAllocator>>
-                {
-                    return extent.size();
-                }
-            };
-        }
-    }
-    namespace mem
-    {
-        namespace view
-        {
-            namespace traits
-            {
-                //#############################################################################
-                //! The std::vector native pointer get trait specialization.
-                template<
-                    typename TElem,
-                    typename TAllocator>
-                struct GetPtrNative<
-                    std::vector<TElem, TAllocator>>
-                {
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST static auto getPtrNative(
-                        std::vector<TElem, TAllocator> const & view)
-                    -> TElem const *
-                    {
-                        return view.data();
-                    }
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST static auto getPtrNative(
-                        std::vector<TElem, TAllocator> & view)
-                    -> TElem *
-                    {
-                        return view.data();
-                    }
-                };
-
-                //#############################################################################
-                //! The std::vector pitch get trait specialization.
-                template<
-                    typename TElem,
-                    typename TAllocator>
-                struct GetPitchBytes<
-                    dim::DimInt<0u>,
-                    std::vector<TElem, TAllocator>>
-                {
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST static auto getPitchBytes(
-                        std::vector<TElem, TAllocator> const & pitch)
-                    -> idx::Idx<std::vector<TElem, TAllocator>>
-                    {
-                        return sizeof(TElem) * pitch.size();
-                    }
-                };
-            }
-        }
-    }
-    namespace offset
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The std::vector offset get trait specialization.
-            template<
-                typename TIdx,
-                typename TElem,
-                typename TAllocator>
-            struct GetOffset<
-                TIdx,
-                std::vector<TElem, TAllocator>>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto getOffset(
-                    std::vector<TElem, TAllocator> const &)
-                -> idx::Idx<std::vector<TElem, TAllocator>>
-                {
-                    return 0u;
-                }
-            };
-        }
-    }
-    namespace idx
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The std::vector idx type trait specialization.
-            template<
-                typename TElem,
-                typename TAllocator>
-            struct IdxType<
-                std::vector<TElem, TAllocator>>
-            {
-                using type = std::size_t;
-            };
-        }
-    }
-}
diff --git a/thirdParty/alpaka/include/alpaka/mem/view/ViewSubView.hpp b/thirdParty/alpaka/include/alpaka/mem/view/ViewSubView.hpp
deleted file mode 100644
index 57159cadf1..0000000000
--- a/thirdParty/alpaka/include/alpaka/mem/view/ViewSubView.hpp
+++ /dev/null
@@ -1,439 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz, Matthias Werner
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#include <alpaka/dim/Traits.hpp>
-#include <alpaka/dev/Traits.hpp>
-#include <alpaka/extent/Traits.hpp>
-#include <alpaka/mem/view/Traits.hpp>
-#include <alpaka/offset/Traits.hpp>
-#include <alpaka/idx/Traits.hpp>
-
-#include <alpaka/mem/view/ViewPlainPtr.hpp>
-#include <alpaka/vec/Vec.hpp>
-
-#include <alpaka/core/Assert.hpp>
-#include <alpaka/core/Common.hpp>
-
-#include <type_traits>
-
-namespace alpaka
-{
-    namespace mem
-    {
-        namespace view
-        {
-            //#############################################################################
-            //! A sub-view to a view.
-            template<
-                typename TDev,
-                typename TElem,
-                typename TDim,
-                typename TIdx>
-            class ViewSubView
-            {
-                static_assert(
-                    !std::is_const<TIdx>::value,
-                    "The idx type of the view can not be const!");
-            public:
-                //-----------------------------------------------------------------------------
-                //! Constructor.
-                //! \param view The view this view is a sub-view of.
-                //! \param extentElements The extent in elements.
-                //! \param relativeOffsetsElements The offsets in elements.
-                template<
-                    typename TView,
-                    typename TOffsets,
-                    typename TExtent>
-                ViewSubView(
-                    TView const & view,
-                    TExtent const & extentElements,
-                    TOffsets const & relativeOffsetsElements = TOffsets()) :
-                        m_viewParentView(
-                            mem::view::getPtrNative(view),
-                            dev::getDev(view),
-                            extent::getExtentVec(view),
-                            mem::view::getPitchBytesVec(view)),
-                        m_extentElements(extent::getExtentVec(extentElements)),
-                        m_offsetsElements(offset::getOffsetVec(relativeOffsetsElements))
-                {
-                    ALPAKA_DEBUG_FULL_LOG_SCOPE;
-
-                    static_assert(
-                        std::is_same<TDev, dev::Dev<TView>>::value,
-                        "The dev type of TView and the TDev template parameter have to be identical!");
-
-                    static_assert(
-                        std::is_same<TIdx, idx::Idx<TView>>::value,
-                        "The idx type of TView and the TIdx template parameter have to be identical!");
-                    static_assert(
-                        std::is_same<TIdx, idx::Idx<TExtent>>::value,
-                        "The idx type of TExtent and the TIdx template parameter have to be identical!");
-                    static_assert(
-                        std::is_same<TIdx, idx::Idx<TOffsets>>::value,
-                        "The idx type of TOffsets and the TIdx template parameter have to be identical!");
-
-                    static_assert(
-                        std::is_same<TDim, dim::Dim<TView>>::value,
-                        "The dim type of TView and the TDim template parameter have to be identical!");
-                    static_assert(
-                        std::is_same<TDim, dim::Dim<TExtent>>::value,
-                        "The dim type of TExtent and the TDim template parameter have to be identical!");
-                    static_assert(
-                        std::is_same<TDim, dim::Dim<TOffsets>>::value,
-                        "The dim type of TOffsets and the TDim template parameter have to be identical!");
-
-                    ALPAKA_ASSERT(((m_offsetsElements + m_extentElements) <= extent::getExtentVec(view)).foldrAll(std::logical_and<bool>()));
-                }
-                //-----------------------------------------------------------------------------
-                //! Constructor.
-                //! \param view The view this view is a sub-view of.
-                //! \param extentElements The extent in elements.
-                //! \param relativeOffsetsElements The offsets in elements.
-                template<
-                    typename TView,
-                    typename TOffsets,
-                    typename TExtent>
-                ViewSubView(
-                    TView & view,
-                    TExtent const & extentElements,
-                    TOffsets const & relativeOffsetsElements = TOffsets()) :
-                        m_viewParentView(
-                            mem::view::getPtrNative(view),
-                            dev::getDev(view),
-                            extent::getExtentVec(view),
-                            mem::view::getPitchBytesVec(view)),
-                        m_extentElements(extent::getExtentVec(extentElements)),
-                        m_offsetsElements(offset::getOffsetVec(relativeOffsetsElements))
-                {
-                    ALPAKA_DEBUG_FULL_LOG_SCOPE;
-
-                    static_assert(
-                        std::is_same<TDev, dev::Dev<TView>>::value,
-                        "The dev type of TView and the TDev template parameter have to be identical!");
-
-                    static_assert(
-                        std::is_same<TIdx, idx::Idx<TView>>::value,
-                        "The idx type of TView and the TIdx template parameter have to be identical!");
-                    static_assert(
-                        std::is_same<TIdx, idx::Idx<TExtent>>::value,
-                        "The idx type of TExtent and the TIdx template parameter have to be identical!");
-                    static_assert(
-                        std::is_same<TIdx, idx::Idx<TOffsets>>::value,
-                        "The idx type of TOffsets and the TIdx template parameter have to be identical!");
-
-                    static_assert(
-                        std::is_same<TDim, dim::Dim<TView>>::value,
-                        "The dim type of TView and the TDim template parameter have to be identical!");
-                    static_assert(
-                        std::is_same<TDim, dim::Dim<TExtent>>::value,
-                        "The dim type of TExtent and the TDim template parameter have to be identical!");
-                    static_assert(
-                        std::is_same<TDim, dim::Dim<TOffsets>>::value,
-                        "The dim type of TOffsets and the TDim template parameter have to be identical!");
-
-                    ALPAKA_ASSERT(((m_offsetsElements + m_extentElements) <= extent::getExtentVec(view)).foldrAll(std::logical_and<bool>()));
-                }
-
-                //-----------------------------------------------------------------------------
-                //! \param view The view this view is a sub-view of.
-                template<
-                    typename TView>
-                ViewSubView(
-                    TView const & view) :
-                        ViewSubView(
-                            view,
-                            view,
-                            vec::Vec<TDim, TIdx>::all(0))
-                {
-                    ALPAKA_DEBUG_FULL_LOG_SCOPE;
-                }
-
-                //-----------------------------------------------------------------------------
-                //! \param view The view this view is a sub-view of.
-                template<
-                    typename TView>
-                ViewSubView(
-                    TView & view) :
-                        ViewSubView(
-                            view,
-                            view,
-                            vec::Vec<TDim, TIdx>::all(0))
-                {
-                    ALPAKA_DEBUG_FULL_LOG_SCOPE;
-                }
-
-            public:
-                mem::view::ViewPlainPtr<TDev, TElem, TDim, TIdx> m_viewParentView; // This wraps the parent view.
-                vec::Vec<TDim, TIdx> m_extentElements;     // The extent of this view.
-                vec::Vec<TDim, TIdx> m_offsetsElements;    // The offset relative to the parent view.
-            };
-        }
-    }
-
-    //-----------------------------------------------------------------------------
-    // Trait specializations for ViewSubView.
-    namespace dev
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The ViewSubView device type trait specialization.
-            template<
-                typename TElem,
-                typename TDim,
-                typename TDev,
-                typename TIdx>
-            struct DevType<
-                mem::view::ViewSubView<TDev, TElem, TDim, TIdx>>
-            {
-                using type = TDev;
-            };
-
-            //#############################################################################
-            //! The ViewSubView device get trait specialization.
-            template<
-                typename TElem,
-                typename TDim,
-                typename TDev,
-                typename TIdx>
-            struct GetDev<
-                mem::view::ViewSubView<TDev, TElem, TDim, TIdx>>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto getDev(
-                    mem::view::ViewSubView<TDev, TElem, TDim, TIdx> const & view)
-                -> TDev
-                {
-                    return
-                        dev::getDev(
-                            view.m_viewParentView);
-                }
-            };
-        }
-    }
-    namespace dim
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The ViewSubView dimension getter trait specialization.
-            template<
-                typename TElem,
-                typename TDim,
-                typename TDev,
-                typename TIdx>
-            struct DimType<
-                mem::view::ViewSubView<TDev, TElem, TDim, TIdx>>
-            {
-                using type = TDim;
-            };
-        }
-    }
-    namespace elem
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The ViewSubView memory element type get trait specialization.
-            template<
-                typename TElem,
-                typename TDim,
-                typename TDev,
-                typename TIdx>
-            struct ElemType<
-                mem::view::ViewSubView<TDev, TElem, TDim, TIdx>>
-            {
-                using type = TElem;
-            };
-        }
-    }
-    namespace extent
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The ViewSubView width get trait specialization.
-            template<
-                typename TIdxIntegralConst,
-                typename TElem,
-                typename TDim,
-                typename TDev,
-                typename TIdx>
-            struct GetExtent<
-                TIdxIntegralConst,
-                mem::view::ViewSubView<TDev, TElem, TDim, TIdx>,
-                typename std::enable_if<(TDim::value > TIdxIntegralConst::value)>::type>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto getExtent(
-                    mem::view::ViewSubView<TDev, TElem, TDim, TIdx> const & extent)
-                -> TIdx
-                {
-                    return extent.m_extentElements[TIdxIntegralConst::value];
-                }
-            };
-        }
-    }
-    namespace mem
-    {
-        namespace view
-        {
-            namespace traits
-            {
-#if BOOST_COMP_GNUC
-    #pragma GCC diagnostic push
-    #pragma GCC diagnostic ignored "-Wcast-align" // "cast from 'std::uint8_t*' to 'TElem*' increases required alignment of target type"
-#endif
-                //#############################################################################
-                //! The ViewSubView native pointer get trait specialization.
-                template<
-                    typename TElem,
-                    typename TDim,
-                    typename TDev,
-                    typename TIdx>
-                struct GetPtrNative<
-                    mem::view::ViewSubView<TDev, TElem, TDim, TIdx>>
-                {
-                private:
-                    using IdxSequence = meta::MakeIntegerSequence<std::size_t, TDim::value>;
-                public:
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST static auto getPtrNative(
-                        mem::view::ViewSubView<TDev, TElem, TDim, TIdx> const & view)
-                    -> TElem const *
-                    {
-                        // \TODO: pre-calculate this pointer for faster execution.
-                        return
-                            reinterpret_cast<TElem const *>(
-                                reinterpret_cast<std::uint8_t const *>(mem::view::getPtrNative(view.m_viewParentView))
-                                + pitchedOffsetBytes(view, IdxSequence()));
-                    }
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST static auto getPtrNative(
-                        mem::view::ViewSubView<TDev, TElem, TDim, TIdx> & view)
-                    -> TElem *
-                    {
-                        // \TODO: pre-calculate this pointer for faster execution.
-                        return
-                            reinterpret_cast<TElem *>(
-                                reinterpret_cast<std::uint8_t *>(mem::view::getPtrNative(view.m_viewParentView))
-                                + pitchedOffsetBytes(view, IdxSequence()));
-                    }
-
-                private:
-                    //-----------------------------------------------------------------------------
-                    //! For a 3D vector this calculates:
-                    //!
-                    //! offset::getOffset<0u>(view) * mem::view::getPitchBytes<1u>(view)
-                    //! + offset::getOffset<1u>(view) * mem::view::getPitchBytes<2u>(view)
-                    //! + offset::getOffset<2u>(view) * mem::view::getPitchBytes<3u>(view)
-                    //! while mem::view::getPitchBytes<3u>(view) is equivalent to sizeof(TElem)
-                    template<
-                        typename TView,
-                        std::size_t... TIndices>
-                    ALPAKA_FN_HOST static auto pitchedOffsetBytes(
-                        TView const & view,
-                        meta::IntegerSequence<std::size_t, TIndices...> const &)
-                    -> TIdx
-                    {
-                        return
-                            meta::foldr(
-                                std::plus<TIdx>(),
-                                pitchedOffsetBytesDim<TIndices>(view)...);
-                    }
-                    //-----------------------------------------------------------------------------
-                    template<
-                        std::size_t Tidx,
-                        typename TView>
-                    ALPAKA_FN_HOST static auto pitchedOffsetBytesDim(
-                        TView const & view)
-                    -> TIdx
-                    {
-                        return
-                            offset::getOffset<Tidx>(view)
-                            * mem::view::getPitchBytes<Tidx + 1u>(view);
-                    }
-                };
-#if BOOST_COMP_GNUC
-    #pragma GCC diagnostic pop
-#endif
-
-                //#############################################################################
-                //! The ViewSubView pitch get trait specialization.
-                template<
-                    typename TIdxIntegralConst,
-                    typename TDev,
-                    typename TElem,
-                    typename TDim,
-                    typename TIdx>
-                struct GetPitchBytes<
-                    TIdxIntegralConst,
-                    mem::view::ViewSubView<TDev, TElem, TDim, TIdx>>
-                {
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST static auto getPitchBytes(
-                        mem::view::ViewSubView<TDev, TElem, TDim, TIdx> const & view)
-                    -> TIdx
-                    {
-                        return
-                            mem::view::getPitchBytes<TIdxIntegralConst::value>(
-                                view.m_viewParentView);
-                    }
-                };
-            }
-        }
-    }
-    namespace offset
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The ViewSubView x offset get trait specialization.
-            template<
-                typename TIdxIntegralConst,
-                typename TElem,
-                typename TDim,
-                typename TDev,
-                typename TIdx>
-            struct GetOffset<
-                TIdxIntegralConst,
-                mem::view::ViewSubView<TDev, TElem, TDim, TIdx>,
-                typename std::enable_if<(TDim::value > TIdxIntegralConst::value)>::type>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto getOffset(
-                    mem::view::ViewSubView<TDev, TElem, TDim, TIdx> const & offset)
-                -> TIdx
-                {
-                    return offset.m_offsetsElements[TIdxIntegralConst::value];
-                }
-            };
-        }
-    }
-    namespace idx
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The ViewSubView idx type trait specialization.
-            template<
-                typename TElem,
-                typename TDim,
-                typename TDev,
-                typename TIdx>
-            struct IdxType<
-                mem::view::ViewSubView<TDev, TElem, TDim, TIdx>>
-            {
-                using type = TIdx;
-            };
-        }
-    }
-}
diff --git a/thirdParty/alpaka/include/alpaka/meta/Apply.hpp b/thirdParty/alpaka/include/alpaka/meta/Apply.hpp
deleted file mode 100644
index c7d4e7908a..0000000000
--- a/thirdParty/alpaka/include/alpaka/meta/Apply.hpp
+++ /dev/null
@@ -1,42 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-namespace alpaka
-{
-    namespace meta
-    {
-        namespace detail
-        {
-            //#############################################################################
-            template<
-                typename TList,
-                template<typename...> class TApplicant>
-            struct ApplyImpl;
-            //#############################################################################
-            template<
-                template<typename...> class TList,
-                template<typename...> class TApplicant,
-                typename... T>
-            struct ApplyImpl<
-                TList<T...>,
-                TApplicant>
-            {
-                using type =
-                    TApplicant<T...>;
-            };
-        }
-        //#############################################################################
-        template<
-            typename TList,
-            template<typename...> class TApplicant>
-        using Apply = typename detail::ApplyImpl<TList, TApplicant>::type;
-    }
-}
diff --git a/thirdParty/alpaka/include/alpaka/meta/ApplyTuple.hpp b/thirdParty/alpaka/include/alpaka/meta/ApplyTuple.hpp
deleted file mode 100644
index 223a30909d..0000000000
--- a/thirdParty/alpaka/include/alpaka/meta/ApplyTuple.hpp
+++ /dev/null
@@ -1,116 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#include <alpaka/core/Common.hpp>
-#include <alpaka/core/Unused.hpp>
-
-#include <alpaka/meta/IntegerSequence.hpp>
-
-#include <boost/config.hpp>
-
-#include <utility>
-#include <tuple>
-#include <type_traits>
-
-namespace alpaka
-{
-    namespace meta
-    {
-        //-----------------------------------------------------------------------------
-        // C++17 std::invoke
-        namespace detail
-        {
-            template<class F, class... Args>
-            inline auto invoke_impl(F && f, Args &&... args)
-            -> decltype(std::forward<F>(f)(std::forward<Args>(args)...))
-            {
-                return std::forward<F>(f)(std::forward<Args>(args)...);
-            }
-
-            template<class Base, class T, class Derived>
-            inline auto invoke_impl(T Base::*pmd, Derived && ref)
-            -> decltype(std::forward<Derived>(ref).*pmd)
-            {
-                return std::forward<Derived>(ref).*pmd;
-            }
-
-            template<class PMD, class Pointer>
-            inline auto invoke_impl(PMD pmd, Pointer && ptr)
-            -> decltype((*std::forward<Pointer>(ptr)).*pmd)
-            {
-                return (*std::forward<Pointer>(ptr)).*pmd;
-            }
-
-            template<class Base, class T, class Derived, class... Args>
-            inline auto invoke_impl(T Base::*pmf, Derived && ref, Args &&... args)
-            -> decltype((std::forward<Derived>(ref).*pmf)(std::forward<Args>(args)...))
-            {
-                return (std::forward<Derived>(ref).*pmf)(std::forward<Args>(args)...);
-            }
-
-            template<class PMF, class Pointer, class... Args>
-            inline auto invoke_impl(PMF pmf, Pointer && ptr, Args &&... args)
-            -> decltype(((*std::forward<Pointer>(ptr)).*pmf)(std::forward<Args>(args)...))
-            {
-                return ((*std::forward<Pointer>(ptr)).*pmf)(std::forward<Args>(args)...);
-            }
-        }
-
-        template< class F, class... ArgTypes>
-        auto invoke(F && f, ArgTypes &&... args)
-#ifdef BOOST_NO_CXX14_RETURN_TYPE_DEDUCTION
-        -> decltype(detail::invoke_impl(std::forward<F>(f), std::forward<ArgTypes>(args)...))
-#endif
-        {
-            return detail::invoke_impl(std::forward<F>(f), std::forward<ArgTypes>(args)...);
-        }
-
-        //-----------------------------------------------------------------------------
-        // C++17 std::apply
-        namespace detail
-        {
-            template<class F, class Tuple, std::size_t... I>
-            auto apply_impl( F && f, Tuple && t, meta::IndexSequence<I...> )
-#ifdef BOOST_NO_CXX14_RETURN_TYPE_DEDUCTION
-            -> decltype(
-                meta::invoke(
-                    std::forward<F>(f),
-                    std::get<I>(std::forward<Tuple>(t))...))
-#endif
-            {
-                // If the the index sequence is empty, t will not be used at all.
-                alpaka::ignore_unused(t);
-
-                return
-                    meta::invoke(
-                        std::forward<F>(f),
-                        std::get<I>(std::forward<Tuple>(t))...);
-            }
-        }
-
-        template<class F, class Tuple>
-        auto apply(F && f, Tuple && t)
-#ifdef BOOST_NO_CXX14_RETURN_TYPE_DEDUCTION
-        -> decltype(
-            detail::apply_impl(
-                std::forward<F>(f),
-                std::forward<Tuple>(t),
-                meta::MakeIndexSequence<std::tuple_size<typename std::decay<Tuple>::type>::value>{}))
-#endif
-        {
-            return
-                detail::apply_impl(
-                    std::forward<F>(f),
-                    std::forward<Tuple>(t),
-                    meta::MakeIndexSequence<std::tuple_size<typename std::decay<Tuple>::type>::value>{});
-        }
-    }
-}
diff --git a/thirdParty/alpaka/include/alpaka/meta/CartesianProduct.hpp b/thirdParty/alpaka/include/alpaka/meta/CartesianProduct.hpp
deleted file mode 100644
index c9ae52bb86..0000000000
--- a/thirdParty/alpaka/include/alpaka/meta/CartesianProduct.hpp
+++ /dev/null
@@ -1,134 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#include <alpaka/meta/Concatenate.hpp>
-
-namespace alpaka
-{
-    namespace meta
-    {
-        //-----------------------------------------------------------------------------
-        // This is based on code by Patrick Fromberg.
-        // See http://stackoverflow.com/questions/9122028/how-to-create-the-cartesian-product-of-a-type-list/19611856#19611856
-        namespace detail
-        {
-            //#############################################################################
-            template<
-                typename... Ts>
-            struct CartesianProductImplHelper;
-            //#############################################################################
-            // Stop condition.
-            template<
-                template<typename...> class TList,
-                typename... Ts>
-            struct CartesianProductImplHelper<
-                TList<Ts...>>
-            {
-                using type = TList<Ts...>;
-            };
-            //#############################################################################
-            // Catches first empty tuple.
-            template<
-                template<typename...> class TList,
-                typename... Ts>
-            struct CartesianProductImplHelper<
-                TList<TList<>>,
-                Ts...>
-            {
-                using type = TList<>;
-            };
-            //#############################################################################
-            // Catches any empty tuple except first.
-            template<
-                template<typename...> class TList,
-                typename... Ts,
-                typename... Rests>
-            struct CartesianProductImplHelper<
-                TList<Ts...>,
-                TList<>,
-                Rests...>
-            {
-                using type = TList<>;
-            };
-            //#############################################################################
-            template<
-                template<typename...> class TList,
-                typename... X,
-                typename H,
-                typename... Rests>
-            struct CartesianProductImplHelper<
-                TList<X...>,
-                TList<H>,
-                Rests...>
-            {
-                using type1 = TList<Concatenate<X, TList<H>>...>;
-                using type = typename CartesianProductImplHelper<type1, Rests...>::type;
-            };
-            //#############################################################################
-            template<
-                template<typename...> class TList,
-                typename... X,
-                template<typename...> class Head,
-                typename T,
-                typename... Ts,
-                typename... Rests>
-            struct CartesianProductImplHelper<
-                TList<X...>,
-                Head<T, Ts...>,
-                Rests...>
-            {
-                using type1 = TList<Concatenate<X, TList<T>>...>;
-                using type2 = typename CartesianProductImplHelper<TList<X...>, TList<Ts...>>::type;
-                using type3 = Concatenate<type1, type2>;
-                using type = typename CartesianProductImplHelper<type3, Rests...>::type;
-            };
-
-            //#############################################################################
-            template<
-                template<typename...> class TList,
-                typename... Ts>
-            struct CartesianProductImpl;
-            //#############################################################################
-            // The base case for no input returns an empty sequence.
-            template<
-                template<typename...> class TList>
-            struct CartesianProductImpl<
-                TList>
-            {
-                using type = TList<>;
-            };
-            //#############################################################################
-            // R is the return type, Head<A...> is the first input list
-            template<
-                template<typename...> class TList,
-                template<typename...> class Head,
-                typename... Ts,
-                typename... Tail>
-            struct CartesianProductImpl<
-                TList,
-                Head<Ts...>,
-                Tail...>
-            {
-                using type =
-                    typename detail::CartesianProductImplHelper<
-                        TList<TList<Ts>...>,
-                        Tail...
-                    >::type;
-            };
-        }
-
-        //#############################################################################
-        template<
-            template<typename...> class TList,
-            typename... Ts>
-        using CartesianProduct = typename detail::CartesianProductImpl<TList, Ts...>::type;
-    }
-}
diff --git a/thirdParty/alpaka/include/alpaka/meta/Concatenate.hpp b/thirdParty/alpaka/include/alpaka/meta/Concatenate.hpp
deleted file mode 100644
index 2a10a52fef..0000000000
--- a/thirdParty/alpaka/include/alpaka/meta/Concatenate.hpp
+++ /dev/null
@@ -1,53 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-namespace alpaka
-{
-    namespace meta
-    {
-        namespace detail
-        {
-            //#############################################################################
-            template<
-                typename... T>
-            struct ConcatenateImpl;
-            //#############################################################################
-            template<
-                typename T>
-            struct ConcatenateImpl<
-                T>
-            {
-                using type = T;
-            };
-            //#############################################################################
-            template<
-                template<typename...> class TList,
-                typename... As,
-                typename... Bs,
-                typename... TRest>
-            struct ConcatenateImpl<
-                TList<As...>,
-                TList<Bs...>,
-                TRest...>
-            {
-                using type =
-                    typename ConcatenateImpl<
-                        TList<As..., Bs...>,
-                        TRest...
-                    >::type;
-            };
-        }
-        //#############################################################################
-        template<
-            typename... T>
-        using Concatenate = typename detail::ConcatenateImpl<T...>::type;
-    }
-}
diff --git a/thirdParty/alpaka/include/alpaka/meta/DependentFalseType.hpp b/thirdParty/alpaka/include/alpaka/meta/DependentFalseType.hpp
deleted file mode 100644
index 4b0d15c5b6..0000000000
--- a/thirdParty/alpaka/include/alpaka/meta/DependentFalseType.hpp
+++ /dev/null
@@ -1,27 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#include <type_traits>
-
-namespace alpaka
-{
-    namespace meta
-    {
-        //#############################################################################
-        //! A false_type being dependent on a ignored template parameter.
-        //! This allows to use static_assert in uninstantiated template specializations without triggering.
-        template<
-            typename T>
-        struct DependentFalseType :
-            std::false_type
-        {};
-    }
-}
diff --git a/thirdParty/alpaka/include/alpaka/meta/Filter.hpp b/thirdParty/alpaka/include/alpaka/meta/Filter.hpp
deleted file mode 100644
index d876bd5e89..0000000000
--- a/thirdParty/alpaka/include/alpaka/meta/Filter.hpp
+++ /dev/null
@@ -1,85 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#include <alpaka/meta/Concatenate.hpp>
-
-#include <type_traits>
-
-namespace alpaka
-{
-    namespace meta
-    {
-        namespace detail
-        {
-            //#############################################################################
-            template<
-                template<typename...> class TList,
-                template<typename> class TPred,
-                typename... Ts>
-            struct FilterImplHelper;
-            //#############################################################################
-            template<
-                template<typename...> class TList,
-                template<typename> class TPred>
-            struct FilterImplHelper<
-                TList,
-                TPred>
-            {
-                using type = TList<>;
-            };
-            //#############################################################################
-            template<
-                template<typename...> class TList,
-                template<typename> class TPred,
-                typename T,
-                typename... Ts>
-            struct FilterImplHelper<
-                TList,
-                TPred,
-                T,
-                Ts...>
-            {
-                using type =
-                    typename std::conditional<
-                        TPred<T>::value,    // TODO: Remove '::value' when C++14 variable templates are supported.
-                        Concatenate<TList<T>, typename FilterImplHelper<TList, TPred, Ts...>::type>,
-                        typename FilterImplHelper<TList, TPred, Ts...>::type >::type;
-            };
-
-            //#############################################################################
-            template<
-                typename TList,
-                template<typename> class TPred>
-            struct FilterImpl;
-            //#############################################################################
-            template<
-                template<typename...> class TList,
-                template<typename> class TPred,
-                typename... Ts>
-            struct FilterImpl<
-                TList<Ts...>,
-                TPred>
-            {
-                using type =
-                    typename detail::FilterImplHelper<
-                        TList,
-                        TPred,
-                        Ts...
-                    >::type;
-            };
-        }
-        //#############################################################################
-        template<
-            typename TList,
-            template<typename> class TPred>
-        using Filter = typename detail::FilterImpl<TList, TPred>::type;
-    }
-}
diff --git a/thirdParty/alpaka/include/alpaka/meta/Fold.hpp b/thirdParty/alpaka/include/alpaka/meta/Fold.hpp
deleted file mode 100644
index aa3e8efe90..0000000000
--- a/thirdParty/alpaka/include/alpaka/meta/Fold.hpp
+++ /dev/null
@@ -1,113 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#include <alpaka/core/Common.hpp>
-#include <alpaka/core/Unused.hpp>
-
-#include <boost/config.hpp>
-
-#ifdef BOOST_NO_CXX14_RETURN_TYPE_DEDUCTION
-    #include <type_traits>
-#endif
-
-namespace alpaka
-{
-    namespace meta
-    {
-        //-----------------------------------------------------------------------------
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            typename TFnObj,
-            typename T>
-        ALPAKA_FN_HOST_ACC auto foldr(
-            TFnObj const & f,
-            T const & t)
-        -> T
-        {
-            alpaka::ignore_unused(f);
-
-            return t;
-        }
-#ifdef BOOST_NO_CXX14_RETURN_TYPE_DEDUCTION
-        namespace detail
-        {
-            //#############################################################################
-            template<
-                typename TFnObj,
-                typename... T>
-            struct TypeOfFold;
-            //#############################################################################
-            template<
-                typename TFnObj,
-                typename T>
-            struct TypeOfFold<
-                TFnObj,
-                T>
-            {
-                using type = T;
-            };
-            //#############################################################################
-            template<
-                typename TFnObj,
-                typename T,
-                typename... P>
-            struct TypeOfFold<
-                TFnObj,
-                T,
-                P...>
-            {
-                using type =
-                    typename std::result_of<
-                        TFnObj(T, typename TypeOfFold<TFnObj, P...>::type)>::type;
-            };
-        }
-
-        //-----------------------------------------------------------------------------
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            typename TFnObj,
-            typename T0,
-            typename T1,
-            typename... Ts>
-        ALPAKA_FN_HOST_ACC auto foldr(
-            TFnObj const & f,
-            T0 const & t0,
-            T1 const & t1,
-            Ts const & ... ts)
-        // NOTE: The following line is not allowed because the point of function declaration is after the trailing return type.
-        // Thus the function itself is not available inside its return type declaration.
-        // http://www.open-std.org/jtc1/sc22/wg21/docs/cwg_closed.html#1433
-        // http://stackoverflow.com/questions/3744400/trailing-return-type-using-decltype-with-a-variadic-template-function
-        // http://stackoverflow.com/questions/11596898/variadic-template-and-inferred-return-type-in-concat/11597196#11597196
-        //-> decltype(f(t0, foldr(f, t1, ts...)))
-        -> typename detail::TypeOfFold<TFnObj, T0, T1, Ts...>::type
-        {
-            return f(t0, foldr(f, t1, ts...));
-        }
-#else
-        //-----------------------------------------------------------------------------
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            typename TFnObj,
-            typename T0,
-            typename T1,
-            typename... Ts>
-        ALPAKA_FN_HOST_ACC auto foldr(
-            TFnObj const & f,
-            T0 const & t0,
-            T1 const & t1,
-            Ts const & ... ts)
-        {
-            return f(t0, foldr(f, t1, ts...));
-        }
-#endif
-    }
-}
diff --git a/thirdParty/alpaka/include/alpaka/meta/ForEachType.hpp b/thirdParty/alpaka/include/alpaka/meta/ForEachType.hpp
deleted file mode 100644
index 8a344f161b..0000000000
--- a/thirdParty/alpaka/include/alpaka/meta/ForEachType.hpp
+++ /dev/null
@@ -1,103 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#include <alpaka/core/BoostPredef.hpp>
-#include <alpaka/core/Common.hpp>
-#include <alpaka/core/Unused.hpp>
-
-#include <utility>
-
-namespace alpaka
-{
-    namespace meta
-    {
-        namespace detail
-        {
-            //#############################################################################
-            template<
-                typename TList>
-            struct ForEachTypeHelper;
-            //#############################################################################
-            template<
-                template<typename...> class TList>
-            struct ForEachTypeHelper<
-                TList<>>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_NO_HOST_ACC_WARNING
-                template<
-                    typename TFnObj,
-                    typename... TArgs>
-                ALPAKA_FN_HOST_ACC static auto forEachTypeHelper(
-                    TFnObj && f,
-                    TArgs && ... args)
-                -> void
-                {
-                    alpaka::ignore_unused(f);
-                    alpaka::ignore_unused(args...);
-                }
-            };
-            //#############################################################################
-            template<
-                template<typename...> class TList,
-                typename T,
-                typename... Ts>
-            struct ForEachTypeHelper<
-                TList<T, Ts...>>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_NO_HOST_ACC_WARNING
-                template<
-                    typename TFnObj,
-                    typename... TArgs>
-                ALPAKA_FN_HOST_ACC static auto forEachTypeHelper(
-                    TFnObj && f,
-                    TArgs && ... args)
-                -> void
-                {
-                    // Call the function object template call operator.
-#if BOOST_COMP_MSVC && !BOOST_COMP_NVCC
-                    f.operator()<T>(
-                        std::forward<TArgs>(args)...);
-#else
-                    f.template operator()<T>(
-                        std::forward<TArgs>(args)...);
-#endif
-                    ForEachTypeHelper<
-                        TList<Ts...>>
-                    ::forEachTypeHelper(
-                        std::forward<TFnObj>(f),
-                        std::forward<TArgs>(args)...);
-                }
-            };
-        }
-
-        //-----------------------------------------------------------------------------
-        //! Equivalent to boost::mpl::for_each but does not require the types of the sequence to be default constructible.
-        //! This function does not create instances of the types instead it passes the types as template parameter.
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            typename TList,
-            typename TFnObj,
-            typename... TArgs>
-        ALPAKA_FN_HOST_ACC auto forEachType(
-            TFnObj && f,
-            TArgs && ... args)
-        -> void
-        {
-            detail::ForEachTypeHelper<
-                TList>
-            ::forEachTypeHelper(
-                std::forward<TFnObj>(f),
-                std::forward<TArgs>(args)...);
-        }
-    }
-}
diff --git a/thirdParty/alpaka/include/alpaka/meta/InheritFromList.hpp b/thirdParty/alpaka/include/alpaka/meta/InheritFromList.hpp
deleted file mode 100644
index 196c2fc510..0000000000
--- a/thirdParty/alpaka/include/alpaka/meta/InheritFromList.hpp
+++ /dev/null
@@ -1,32 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-namespace alpaka
-{
-    namespace meta
-    {
-        template<
-            typename TBaseList
-        >
-        class InheritFromList;
-
-        template<
-            template<typename...> class TList,
-            typename... TBases
-        >
-        class InheritFromList<
-            TList<TBases...>
-        >
-            : public TBases...
-        {
-        };
-    }
-}
diff --git a/thirdParty/alpaka/include/alpaka/meta/IntegerSequence.hpp b/thirdParty/alpaka/include/alpaka/meta/IntegerSequence.hpp
deleted file mode 100644
index 4abdfb830d..0000000000
--- a/thirdParty/alpaka/include/alpaka/meta/IntegerSequence.hpp
+++ /dev/null
@@ -1,242 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#include <alpaka/core/BoostPredef.hpp>
-#include <alpaka/core/Common.hpp>
-#include <alpaka/meta/Set.hpp>
-
-#include <type_traits>
-#include <cstddef>
-
-namespace alpaka
-{
-    namespace meta
-    {
-        //#############################################################################
-        // This could be replaced with c++14 std::IntegerSequence if we raise the minimum.
-        template<
-            typename T,
-            T... Tvals>
-        struct IntegerSequence
-        {
-            static_assert(std::is_integral<T>::value, "IntegerSequence<T, I...> requires T to be an integral type.");
-
-            using type = IntegerSequence<T, Tvals...>;
-            using value_type = T;
-
-            ALPAKA_FN_HOST_ACC static auto size() noexcept
-            -> std::size_t
-            {
-                return (sizeof...(Tvals));
-            }
-        };
-
-        namespace detail
-        {
-            //#############################################################################
-            template<
-                typename TDstType,
-                typename TIntegerSequence>
-            struct ConvertIntegerSequence;
-            //#############################################################################
-            template<
-                typename TDstType,
-                typename T,
-                T... Tvals>
-            struct ConvertIntegerSequence<
-                TDstType,
-                IntegerSequence<T, Tvals...>>
-            {
-                using type = IntegerSequence<TDstType, static_cast<TDstType>(Tvals)...>;
-            };
-        }
-        //#############################################################################
-        template<
-            typename TDstType,
-            typename TIntegerSequence>
-        using ConvertIntegerSequence = typename detail::ConvertIntegerSequence<TDstType, TIntegerSequence>::type;
-
-        namespace detail
-        {
-            //#############################################################################
-            template<
-                template<typename...> class TList,
-                typename T,
-                template<T> class TOp,
-                typename TIntegerSequence>
-            struct TransformIntegerSequence;
-            //#############################################################################
-            template<
-                template<typename...> class TList,
-                typename T,
-                template<T> class TOp,
-                T... Tvals>
-            struct TransformIntegerSequence<
-                TList,
-                T,
-                TOp,
-                IntegerSequence<T, Tvals...>>
-            {
-                using type =
-                    TList<
-                        TOp<Tvals>...>;
-            };
-        }
-        //#############################################################################
-        template<
-            template<typename...> class TList,
-            typename T,
-            template<T> class TOp,
-            typename TIntegerSequence>
-        using TransformIntegerSequence = typename detail::TransformIntegerSequence<TList, T, TOp, TIntegerSequence>::type;
-
-        namespace detail
-        {
-            //#############################################################################
-            template<bool TisSizeNegative, bool TbIsBegin, typename T, T Tbegin, typename TIntCon, typename TIntSeq>
-            struct MakeIntegerSequenceHelper
-            {
-                static_assert(!TisSizeNegative, "MakeIntegerSequence<T, N> requires N to be non-negative.");
-            };
-            //#############################################################################
-            template<typename T, T Tbegin, T... Tvals>
-            struct MakeIntegerSequenceHelper<false, true, T, Tbegin, std::integral_constant<T, Tbegin>, IntegerSequence<T, Tvals...> > :
-                IntegerSequence<T, Tvals...>
-            {};
-            //#############################################################################
-            template<typename T, T Tbegin, T TIdx, T... Tvals>
-            struct MakeIntegerSequenceHelper<false, false, T, Tbegin, std::integral_constant<T, TIdx>, IntegerSequence<T, Tvals...> > :
-                MakeIntegerSequenceHelper<false, TIdx == (Tbegin+1), T, Tbegin, std::integral_constant<T, TIdx - 1>, IntegerSequence<T, TIdx - 1, Tvals...> >
-            {};
-        }
-
-        //#############################################################################
-        template<typename T, T Tbegin, T Tsize>
-        using MakeIntegerSequenceOffset = typename detail::MakeIntegerSequenceHelper<(Tsize < 0), (Tsize == 0), T, Tbegin, std::integral_constant<T, Tbegin+Tsize>, IntegerSequence<T> >::type;
-
-        //#############################################################################
-        template<typename T, T Tsize>
-        using MakeIntegerSequence = MakeIntegerSequenceOffset<T, 0u, Tsize>;
-
-
-        //#############################################################################
-        template<
-            std::size_t... Tvals>
-        using IndexSequence = IntegerSequence<std::size_t, Tvals...>;
-
-        //#############################################################################
-        template<
-            typename T,
-            T Tbegin,
-            T Tsize>
-        using MakeIndexSequenceOffset = MakeIntegerSequenceOffset<std::size_t, Tbegin, Tsize>;
-
-        //#############################################################################
-        template<
-            std::size_t Tsize>
-        using MakeIndexSequence = MakeIntegerSequence<std::size_t, Tsize>;
-
-        //#############################################################################
-        template<
-            typename... Ts>
-        using IndexSequenceFor = MakeIndexSequence<sizeof...(Ts)>;
-
-
-        //#############################################################################
-        //! Checks if the integral values are unique.
-        template<
-            typename T,
-            T... Tvals>
-        struct IntegralValuesUnique
-        {
-            static constexpr bool value = meta::IsParameterPackSet<std::integral_constant<T, Tvals>...>::value;
-        };
-
-        //#############################################################################
-        //! Checks if the values in the index sequence are unique.
-        template<
-            typename TIntegerSequence>
-        struct IntegerSequenceValuesUnique;
-        //#############################################################################
-        //! Checks if the values in the index sequence are unique.
-        template<
-            typename T,
-            T... Tvals>
-        struct IntegerSequenceValuesUnique<
-            IntegerSequence<T, Tvals...>>
-        {
-            static constexpr bool value = IntegralValuesUnique<T, Tvals...>::value;
-        };
-
-        //#############################################################################
-        //! Checks if the integral values are within the given range.
-        template<
-            typename T,
-            T Tmin,
-            T Tmax,
-            T... Tvals>
-        struct IntegralValuesInRange;
-        //#############################################################################
-        //! Checks if the integral values are within the given range.
-        template<
-            typename T,
-            T Tmin,
-            T Tmax>
-        struct IntegralValuesInRange<
-            T,
-            Tmin,
-            Tmax>
-        {
-            static constexpr bool value = true;
-        };
-        //#############################################################################
-        //! Checks if the integral values are within the given range.
-        template<
-            typename T,
-            T Tmin,
-            T Tmax,
-            T I,
-            T... Tvals>
-        struct IntegralValuesInRange<
-            T,
-            Tmin,
-            Tmax,
-            I,
-            Tvals...>
-        {
-            static constexpr bool value = (I >= Tmin) && (I <=Tmax) && IntegralValuesInRange<T, Tmin, Tmax, Tvals...>::value;
-        };
-
-        //#############################################################################
-        //! Checks if the values in the index sequence are within the given range.
-        template<
-            typename TIntegerSequence,
-            typename T,
-            T Tmin,
-            T Tmax>
-        struct IntegerSequenceValuesInRange;
-        //#############################################################################
-        //! Checks if the values in the index sequence are within the given range.
-        template<
-            typename T,
-            T... Tvals,
-            T Tmin,
-            T Tmax>
-        struct IntegerSequenceValuesInRange<
-            IntegerSequence<T, Tvals...>,
-            T,
-            Tmin,
-            Tmax>
-        {
-            static constexpr bool value = IntegralValuesInRange<T, Tmin, Tmax, Tvals...>::value;
-        };
-    }
-}
diff --git a/thirdParty/alpaka/include/alpaka/meta/Integral.hpp b/thirdParty/alpaka/include/alpaka/meta/Integral.hpp
deleted file mode 100644
index 222be7d9c3..0000000000
--- a/thirdParty/alpaka/include/alpaka/meta/Integral.hpp
+++ /dev/null
@@ -1,102 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#include <type_traits>
-
-namespace alpaka
-{
-    namespace meta
-    {
-        //#############################################################################
-        //! The trait is true if all values of TSubset are contained in TSuperset.
-        template<
-            typename TSuperset,
-            typename TSubset>
-        using IsIntegralSuperset =
-            std::integral_constant<
-                bool,
-                std::is_integral<TSuperset>::value && std::is_integral<TSubset>::value
-                && (
-                    // If the signdness is equal, the sizes have to be greater or equal to be a superset.
-                    ((std::is_unsigned<TSuperset>::value == std::is_unsigned<TSubset>::value) && (sizeof(TSuperset) >= sizeof(TSubset)))
-                    // If the signdness is non-equal, the superset has to have at least one bit more.
-                    || ((std::is_unsigned<TSuperset>::value != std::is_unsigned<TSubset>::value) && (sizeof(TSuperset) > sizeof(TSubset)))
-                )>;
-
-        //#############################################################################
-        //! The type that has the higher max value.
-        template<
-            typename T0,
-            typename T1>
-        using HigherMax =
-            typename std::conditional<
-                (sizeof(T0) > sizeof(T1)),
-                T0,
-                typename std::conditional<
-                    ((sizeof(T0) == sizeof(T1)) && std::is_unsigned<T0>::value && std::is_signed<T1>::value),
-                        T0,
-                        T1>::type>::type;
-
-        //#############################################################################
-        //! The type that has the lower max value.
-        template<
-            typename T0,
-            typename T1>
-        using LowerMax =
-            typename std::conditional<
-                (sizeof(T0) < sizeof(T1)),
-                T0,
-                typename std::conditional<
-                    ((sizeof(T0) == sizeof(T1)) && std::is_signed<T0>::value && std::is_unsigned<T1>::value),
-                        T0,
-                        T1>::type>::type;
-
-        //#############################################################################
-        //! The type that has the higher min value. If both types have the same min value, the type with the wider range is chosen.
-        template<
-            typename T0,
-            typename T1>
-        using HigherMin =
-            typename std::conditional<
-                (std::is_unsigned<T0>::value == std::is_unsigned<T1>::value),
-                typename std::conditional<
-                    std::is_unsigned<T0>::value,
-                        typename std::conditional<
-                        (sizeof(T0) < sizeof(T1)),
-                            T1,
-                            T0>::type,
-                        typename std::conditional<
-                        (sizeof(T0) < sizeof(T1)),
-                            T0,
-                            T1>::type>::type,
-                typename std::conditional<
-                    std::is_unsigned<T0>::value,
-                        T0,
-                        T1>::type>::type;
-
-        //#############################################################################
-        //! The type that has the lower min value. If both types have the same min value, the type with the wider range is chosen.
-        template<
-            typename T0,
-            typename T1>
-        using LowerMin =
-            typename std::conditional<
-                (std::is_unsigned<T0>::value == std::is_unsigned<T1>::value),
-                typename std::conditional<
-                    (sizeof(T0) > sizeof(T1)),
-                        T0,
-                        T1>::type,
-                typename std::conditional<
-                    std::is_signed<T0>::value,
-                        T0,
-                        T1>::type>::type;
-    }
-}
diff --git a/thirdParty/alpaka/include/alpaka/meta/IsStrictBase.hpp b/thirdParty/alpaka/include/alpaka/meta/IsStrictBase.hpp
deleted file mode 100644
index 720fbb2866..0000000000
--- a/thirdParty/alpaka/include/alpaka/meta/IsStrictBase.hpp
+++ /dev/null
@@ -1,29 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#include <type_traits>
-
-namespace alpaka
-{
-    namespace meta
-    {
-        //#############################################################################
-        //! The trait is true if TDerived is derived from TBase but is not TBase itself.
-        template<
-            typename TBase,
-            typename TDerived>
-        using IsStrictBase =
-            std::integral_constant<
-                bool,
-                std::is_base_of<TBase, TDerived>::value
-                && !std::is_same<TBase, typename std::decay<TDerived>::type>::value>;
-    }
-}
diff --git a/thirdParty/alpaka/include/alpaka/meta/Metafunctions.hpp b/thirdParty/alpaka/include/alpaka/meta/Metafunctions.hpp
deleted file mode 100644
index 8bc184c517..0000000000
--- a/thirdParty/alpaka/include/alpaka/meta/Metafunctions.hpp
+++ /dev/null
@@ -1,89 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#include <type_traits>
-
-namespace alpaka
-{
-    namespace meta
-    {
-        namespace detail
-        {
-            //#############################################################################
-            // TODO: Replace with C++17 std::conjunction
-            template<
-                typename...>
-            struct ConjunctionImpl :
-                std::true_type
-            {};
-            //#############################################################################
-            // TODO: Replace with C++17 std::conjunction
-            template<
-                typename B1>
-            struct ConjunctionImpl<B1> :
-                B1
-            {};
-            //#############################################################################
-            // TODO: Replace with C++17 std::conjunction
-            template<
-                typename B1,
-                typename... Bn>
-            struct ConjunctionImpl<
-                B1,
-                Bn...> :
-                    std::conditional<B1::value != false, ConjunctionImpl<Bn...>, B1>::type
-            {};
-        }
-        //#############################################################################
-        template<
-            typename... B>
-        using Conjunction = typename detail::ConjunctionImpl<B...>::type;
-
-        namespace detail
-        {
-            //#############################################################################
-            // TODO: Replace with C++17 std::disjunction
-            template<
-                typename...>
-            struct DisjunctionImpl :
-                std::false_type
-            {};
-            //#############################################################################
-            // TODO: Replace with C++17 std::disjunction
-            template<
-                typename B1>
-            struct DisjunctionImpl<B1> :
-                B1
-            {};
-            //#############################################################################
-            // TODO: Replace with C++17 std::disjunction
-            template<
-                typename B1,
-                typename... Bn>
-            struct DisjunctionImpl<
-                B1,
-                Bn...> :
-                    std::conditional<B1::value != false, B1, DisjunctionImpl<Bn...>>::type
-            {};
-        }
-        //#############################################################################
-        template<
-            typename... B>
-        using Disjunction = typename detail::DisjunctionImpl<B...>;
-
-        //#############################################################################
-        // TODO: Replace with C++17 std::negation
-        template<
-            typename B>
-        using Negation = std::integral_constant<bool, !B::value>;
-    }
-}
-
diff --git a/thirdParty/alpaka/include/alpaka/meta/NdLoop.hpp b/thirdParty/alpaka/include/alpaka/meta/NdLoop.hpp
deleted file mode 100644
index 9f6d92432c..0000000000
--- a/thirdParty/alpaka/include/alpaka/meta/NdLoop.hpp
+++ /dev/null
@@ -1,191 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#include <alpaka/core/Common.hpp>
-#include <alpaka/core/Unused.hpp>
-#include <alpaka/dim/Traits.hpp>
-#include <alpaka/meta/IntegerSequence.hpp>
-#include <alpaka/vec/Vec.hpp>
-
-namespace alpaka
-{
-    namespace meta
-    {
-        namespace detail
-        {
-            //#############################################################################
-            //! N-dimensional loop iteration template.
-            template<
-                typename TIndexSequence>
-            struct NdLoop;
-            //#############################################################################
-            //! N-dimensional loop iteration template.
-            template<>
-            struct NdLoop<
-                meta::IndexSequence<>>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_NO_HOST_ACC_WARNING
-                template<
-                    typename TIndex,
-                    typename TExtentVec,
-                    typename TFnObj>
-                ALPAKA_FN_HOST_ACC static auto ndLoop(
-                    TIndex & idx,
-                    TExtentVec const & extent,
-                    TFnObj const & f)
-                -> void
-                {
-                    alpaka::ignore_unused(idx);
-                    alpaka::ignore_unused(extent);
-                    alpaka::ignore_unused(f);
-                }
-            };
-            //#############################################################################
-            //! N-dimensional loop iteration template.
-            template<
-                std::size_t Tdim>
-            struct NdLoop<
-                meta::IndexSequence<Tdim>>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_NO_HOST_ACC_WARNING
-                template<
-                    typename TIndex,
-                    typename TExtentVec,
-                    typename TFnObj>
-                ALPAKA_FN_HOST_ACC static auto ndLoop(
-                    TIndex & idx,
-                    TExtentVec const & extent,
-                    TFnObj const & f)
-                -> void
-                {
-                    static_assert(
-                        dim::Dim<TIndex>::value > 0u,
-                        "The dimension given to ndLoop has to be larger than zero!");
-                    static_assert(
-                        dim::Dim<TIndex>::value == dim::Dim<TExtentVec>::value,
-                        "The dimensions of the iteration vector and the extent vector have to be identical!");
-                    static_assert(
-                        dim::Dim<TIndex>::value > Tdim,
-                        "The current dimension has to be in the range [0,dim-1]!");
-
-                    for(idx[Tdim] = 0u; idx[Tdim] < extent[Tdim]; ++idx[Tdim])
-                    {
-                        f(idx);
-                    }
-                }
-            };
-            //#############################################################################
-            //! N-dimensional loop iteration template.
-            template<
-                std::size_t Tdim0,
-                std::size_t Tdim1,
-                std::size_t... Tdims>
-            struct NdLoop<
-                meta::IndexSequence<Tdim0, Tdim1, Tdims...>>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_NO_HOST_ACC_WARNING
-                template<
-                    typename TIndex,
-                    typename TExtentVec,
-                    typename TFnObj>
-                ALPAKA_FN_HOST_ACC static auto ndLoop(
-                    TIndex & idx,
-                    TExtentVec const & extent,
-                    TFnObj const & f)
-                -> void
-                {
-                    static_assert(
-                        dim::Dim<TIndex>::value > 0u,
-                        "The dimension given to ndLoop has to be larger than zero!");
-                    static_assert(
-                        dim::Dim<TIndex>::value == dim::Dim<TExtentVec>::value,
-                        "The dimensions of the iteration vector and the extent vector have to be identical!");
-                    static_assert(
-                        dim::Dim<TIndex>::value > Tdim0,
-                        "The current dimension has to be in the range [0,dim-1]!");
-
-                    for(idx[Tdim0] = 0u; idx[Tdim0] < extent[Tdim0]; ++idx[Tdim0])
-                    {
-                        detail::NdLoop<
-                            meta::IndexSequence<Tdim1, Tdims...>>
-                        ::template ndLoop(
-                                idx,
-                                extent,
-                                f);
-                    }
-                }
-            };
-        }
-        //-----------------------------------------------------------------------------
-        //! Loops over an n-dimensional iteration index variable calling f(idx, args...) for each iteration.
-        //! The loops are nested in the order given by the IndexSequence with the first element being the outermost and the last index the innermost loop.
-        //!
-        //! \param indexSequence A sequence of indices being a permutation of the values [0, dim-1].
-        //! \param extent N-dimensional loop extent.
-        //! \param f The function called at each iteration.
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            typename TExtentVec,
-            typename TFnObj,
-            std::size_t... Tdims>
-        ALPAKA_FN_HOST_ACC auto ndLoop(
-            meta::IndexSequence<Tdims...> const & indexSequence,
-            TExtentVec const & extent,
-            TFnObj const & f)
-        -> void
-        {
-            alpaka::ignore_unused(indexSequence);
-
-            static_assert(
-                dim::Dim<TExtentVec>::value > 0u,
-                "The dimension of the extent given to ndLoop has to be larger than zero!");
-            static_assert(
-                meta::IntegerSequenceValuesInRange<meta::IndexSequence<Tdims...>, std::size_t, 0, dim::Dim<TExtentVec>::value>::value,
-                "The values in the IndexSequence have to be in the range [0,dim-1]!");
-            static_assert(
-                meta::IntegerSequenceValuesUnique<meta::IndexSequence<Tdims...>>::value,
-                "The values in the IndexSequence have to be unique!");
-
-            auto idx(
-                vec::Vec<dim::Dim<TExtentVec>, idx::Idx<TExtentVec>>::zeros());
-
-            detail::NdLoop<
-                meta::IndexSequence<Tdims...>>
-            ::template ndLoop(
-                    idx,
-                    extent,
-                    f);
-        }
-        //-----------------------------------------------------------------------------
-        //! Loops over an n-dimensional iteration index variable calling f(idx, args...) for each iteration.
-        //! The loops are nested from index zero outmost to index (dim-1) innermost.
-        //!
-        //! \param extent N-dimensional loop extent.
-        //! \param f The function called at each iteration.
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            typename TExtentVec,
-            typename TFnObj>
-        ALPAKA_FN_HOST_ACC auto ndLoopIncIdx(
-            TExtentVec const & extent,
-            TFnObj const & f)
-        -> void
-        {
-            ndLoop(
-                meta::MakeIndexSequence<dim::Dim<TExtentVec>::value>(),
-                extent,
-                f);
-        }
-    }
-}
diff --git a/thirdParty/alpaka/include/alpaka/meta/Set.hpp b/thirdParty/alpaka/include/alpaka/meta/Set.hpp
deleted file mode 100644
index aeb2aea6ac..0000000000
--- a/thirdParty/alpaka/include/alpaka/meta/Set.hpp
+++ /dev/null
@@ -1,80 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#include <utility>
-
-namespace alpaka
-{
-    namespace meta
-    {
-        namespace detail
-        {
-            //#############################################################################
-            //! Empty dependent type.
-            template<
-                typename T>
-            struct Empty
-            {};
-
-            //#############################################################################
-            template<
-                typename... Ts>
-            struct IsParameterPackSetImpl;
-            //#############################################################################
-            template<>
-            struct IsParameterPackSetImpl<>
-            {
-                static constexpr bool value = true;
-            };
-            //#############################################################################
-            // Based on code by Roland Bock: https://gist.github.com/rbock/ad8eedde80c060132a18
-            // Linearly inherits from empty<T> and checks if it has already inherited from this type.
-            template<
-                typename T,
-                typename... Ts>
-            struct IsParameterPackSetImpl<T, Ts...> :
-                public IsParameterPackSetImpl<Ts...>,
-                public virtual Empty<T>
-            {
-                using Base = IsParameterPackSetImpl<Ts...>;
-
-                static constexpr bool value = Base::value && !std::is_base_of<Empty<T>, Base>::value;
-            };
-        }
-        //#############################################################################
-        //! Trait that tells if the parameter pack contains only unique (no equal) types.
-        template<
-            typename... Ts>
-        using IsParameterPackSet = detail::IsParameterPackSetImpl<Ts...>;
-
-        namespace detail
-        {
-            //#############################################################################
-            template<
-                typename TList>
-            struct IsSetImpl;
-            //#############################################################################
-            template<
-                template<typename...> class TList,
-                typename... Ts>
-            struct IsSetImpl<
-                TList<Ts...>>
-            {
-                static constexpr bool value = IsParameterPackSet<Ts...>::value;
-            };
-        }
-        //#############################################################################
-        //! Trait that tells if the template contains only unique (no equal) types.
-        template<
-            typename TList>
-        using IsSet = detail::IsSetImpl<TList>;
-    }
-}
diff --git a/thirdParty/alpaka/include/alpaka/meta/Transform.hpp b/thirdParty/alpaka/include/alpaka/meta/Transform.hpp
deleted file mode 100644
index 22c2805fe9..0000000000
--- a/thirdParty/alpaka/include/alpaka/meta/Transform.hpp
+++ /dev/null
@@ -1,43 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-namespace alpaka
-{
-    namespace meta
-    {
-        namespace detail
-        {
-            //#############################################################################
-            template<
-                typename Ts,
-                template<typename...> class TOp>
-            struct TransformImpl;
-            //#############################################################################
-            template<
-                template<typename...> class TList,
-                typename... Ts,
-                template<typename...> class TOp>
-            struct TransformImpl<
-                TList<Ts...>,
-                TOp>
-            {
-                using type =
-                    TList<
-                        TOp<Ts>...>;
-            };
-        }
-        //#############################################################################
-        template<
-            typename Ts,
-            template<typename...> class TOp>
-        using Transform = typename detail::TransformImpl<Ts, TOp>::type;
-    }
-}
diff --git a/thirdParty/alpaka/include/alpaka/meta/Unique.hpp b/thirdParty/alpaka/include/alpaka/meta/Unique.hpp
deleted file mode 100644
index dbb6adc43a..0000000000
--- a/thirdParty/alpaka/include/alpaka/meta/Unique.hpp
+++ /dev/null
@@ -1,60 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#include <type_traits>
-
-#include <alpaka/meta/Metafunctions.hpp>
-
-namespace alpaka
-{
-    namespace meta
-    {
-        namespace detail
-        {
-            template<
-                typename T,
-                typename... Ts>
-            struct UniqueHelper
-            {
-                using type = T;
-            };
-
-            template<
-                template<typename...> class TList,
-                typename... Ts,
-                typename U,
-                typename... Us>
-            struct UniqueHelper<TList<Ts...>, U, Us...>
-                : std::conditional<(Disjunction<std::is_same<U, Ts>...>::value)
-                    , UniqueHelper<TList<Ts...>, Us...>
-                    , UniqueHelper<TList<Ts..., U>, Us...>>::type
-            {};
-
-            template<
-                typename T>
-            struct UniqueImpl;
-
-            template<
-                template<typename...> class TList,
-                typename... Ts>
-            struct UniqueImpl<TList<Ts...>>
-            {
-                using type = typename UniqueHelper<TList<>, Ts...>::type;
-            };
-        }
-
-        //#############################################################################
-        //! Trait that returns a list with only unique (no equal) types (a set). Duplicates are filtered out: elements are visited left to right, the first occurrence of each type is kept and the relative order is preserved.
-        template<
-            typename TList>
-        using Unique = typename detail::UniqueImpl<TList>::type;
-    }
-}
diff --git a/thirdParty/alpaka/include/alpaka/offset/Traits.hpp b/thirdParty/alpaka/include/alpaka/offset/Traits.hpp
deleted file mode 100644
index 996a37407b..0000000000
--- a/thirdParty/alpaka/include/alpaka/offset/Traits.hpp
+++ /dev/null
@@ -1,213 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#include <alpaka/dim/DimIntegralConst.hpp>
-#include <alpaka/idx/Traits.hpp>
-#include <alpaka/core/Common.hpp>
-
-#include <type_traits>
-
-namespace alpaka
-{
-    //-----------------------------------------------------------------------------
-    //! The offset specifics.
-    namespace offset
-    {
-        //-----------------------------------------------------------------------------
-        //! The offset traits.
-        namespace traits
-        {
-            //#############################################################################
-            //! The offset get trait for the dimension selected by TIdx.
-            //!
-            //! If not specialized explicitly it ignores its argument and returns 0 converted to idx::Idx<TOffsets>.
-            template<
-                typename TIdx,
-                typename TOffsets,
-                typename TSfinae = void>
-            struct GetOffset
-            {
-                ALPAKA_NO_HOST_ACC_WARNING
-                ALPAKA_FN_HOST_ACC static auto getOffset(
-                    TOffsets const &)
-                -> idx::Idx<TOffsets>
-                {
-                    return static_cast<idx::Idx<TOffsets>>(0);
-                }
-            };
-
-            //#############################################################################
-            //! The offset set trait for the dimension selected by TIdx. Declared only: there is no default implementation, so it has to be specialized.
-            template<
-                typename TIdx,
-                typename TOffsets,
-                typename TOffset,
-                typename TSfinae = void>
-            struct SetOffset;
-        }
-
-        //-----------------------------------------------------------------------------
-        //! \return The offset in the dimension Tidx, as reported by traits::GetOffset<dim::DimInt<Tidx>, TOffsets>.
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            std::size_t Tidx,
-            typename TOffsets>
-        ALPAKA_FN_HOST_ACC auto getOffset(
-            TOffsets const & offsets)
-        -> idx::Idx<TOffsets>
-        {
-            return
-                traits::GetOffset<
-                    dim::DimInt<Tidx>,
-                    TOffsets>
-                ::getOffset(
-                    offsets);
-        }
-        //-----------------------------------------------------------------------------
-        //! \return The offset in x dimension, i.e. the dimension with index dim::Dim<TOffsets>::value - 1. If no argument is given a default-constructed TOffsets is queried.
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            typename TOffsets>
-        ALPAKA_FN_HOST_ACC auto getOffsetX(
-            TOffsets const & offsets = TOffsets())
-        -> idx::Idx<TOffsets>
-        {
-            return getOffset<dim::Dim<TOffsets>::value - 1u>(offsets);
-        }
-        //-----------------------------------------------------------------------------
-        //! \return The offset in y dimension, i.e. the dimension with index dim::Dim<TOffsets>::value - 2. If no argument is given a default-constructed TOffsets is queried.
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            typename TOffsets>
-        ALPAKA_FN_HOST_ACC auto getOffsetY(
-            TOffsets const & offsets = TOffsets())
-        -> idx::Idx<TOffsets>
-        {
-            return getOffset<dim::Dim<TOffsets>::value - 2u>(offsets);
-        }
-        //-----------------------------------------------------------------------------
-        //! \return The offset in z dimension, i.e. the dimension with index dim::Dim<TOffsets>::value - 3. If no argument is given a default-constructed TOffsets is queried.
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            typename TOffsets>
-        ALPAKA_FN_HOST_ACC auto getOffsetZ(
-            TOffsets const & offsets = TOffsets())
-        -> idx::Idx<TOffsets>
-        {
-            return getOffset<dim::Dim<TOffsets>::value - 3u>(offsets);
-        }
-
-        //-----------------------------------------------------------------------------
-        //! Sets the offset in the dimension Tidx by forwarding to traits::SetOffset<dim::DimInt<Tidx>, TOffsets, TOffset>. NOTE(review): offsets is the object to be modified but is taken by const reference - confirm that the SetOffset specializations accept a const reference.
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            std::size_t Tidx,
-            typename TOffsets,
-            typename TOffset>
-        ALPAKA_FN_HOST_ACC auto setOffset(
-            TOffsets const & offsets,
-            TOffset const & offset)
-        -> void
-        {
-            traits::SetOffset<
-                dim::DimInt<Tidx>,
-                TOffsets,
-                TOffset>
-            ::setOffset(
-                offsets,
-                offset);
-        }
-        //-----------------------------------------------------------------------------
-        //! Sets the offset in x dimension, i.e. the dimension with index dim::Dim<TOffsets>::value - 1.
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            typename TOffsets,
-            typename TOffset>
-        ALPAKA_FN_HOST_ACC auto setOffsetX(
-            TOffsets const & offsets,
-            TOffset const & offset)
-        -> void
-        {
-            setOffset<dim::Dim<TOffsets>::value - 1u>(offsets, offset);
-        }
-        //-----------------------------------------------------------------------------
-        //! Sets the offset in y dimension, i.e. the dimension with index dim::Dim<TOffsets>::value - 2.
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            typename TOffsets,
-            typename TOffset>
-        ALPAKA_FN_HOST_ACC auto setOffsetY(
-            TOffsets const & offsets,
-            TOffset const & offset)
-        -> void
-        {
-            setOffset<dim::Dim<TOffsets>::value - 2u>(offsets, offset);
-        }
-        //-----------------------------------------------------------------------------
-        //! Sets the offset in z dimension, i.e. the dimension with index dim::Dim<TOffsets>::value - 3.
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            typename TOffsets,
-            typename TOffset>
-        ALPAKA_FN_HOST_ACC auto setOffsetZ(
-            TOffsets const & offsets,
-            TOffset const & offset)
-        -> void
-        {
-            setOffset<dim::Dim<TOffsets>::value - 3u>(offsets, offset);
-        }
-
-        //-----------------------------------------------------------------------------
-        // Trait specializations for integral types (selected via std::is_integral, so signed and unsigned types alike).
-        namespace traits
-        {
-            //#############################################################################
-            //! The integral offset get trait specialization for dimension 0: the integral value itself is the offset.
-            template<
-                typename TOffsets>
-            struct GetOffset<
-                dim::DimInt<0u>,
-                TOffsets,
-                typename std::enable_if<
-                    std::is_integral<TOffsets>::value>::type>
-            {
-                ALPAKA_NO_HOST_ACC_WARNING
-                ALPAKA_FN_HOST_ACC static auto getOffset(
-                    TOffsets const & offset)
-                -> idx::Idx<TOffsets>
-                {
-                    return offset;
-                }
-            };
-            //#############################################################################
-            //! The integral offset set trait specialization for dimension 0. NOTE(review): the body assigns to offsets, which is declared TOffsets const &; for an integral TOffsets that assignment is ill-formed, so this looks uninstantiable as written - confirm the intended signature.
-            template<
-                typename TOffsets,
-                typename TOffset>
-            struct SetOffset<
-                dim::DimInt<0u>,
-                TOffsets,
-                TOffset,
-                typename std::enable_if<
-                    std::is_integral<TOffsets>::value>::type>
-            {
-                ALPAKA_NO_HOST_ACC_WARNING
-                ALPAKA_FN_HOST_ACC static auto setOffset(
-                    TOffsets const & offsets,
-                    TOffset const & offset)
-                -> void
-                {
-                    offsets = offset;
-                }
-            };
-        }
-    }
-}
diff --git a/thirdParty/alpaka/include/alpaka/pltf/PltfCpu.hpp b/thirdParty/alpaka/include/alpaka/pltf/PltfCpu.hpp
deleted file mode 100644
index 61c2328762..0000000000
--- a/thirdParty/alpaka/include/alpaka/pltf/PltfCpu.hpp
+++ /dev/null
@@ -1,94 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#include <alpaka/pltf/Traits.hpp>
-#include <alpaka/dev/DevCpu.hpp>
-#include <alpaka/core/Concepts.hpp>
-
-#include <sstream>
-#include <vector>
-
-namespace alpaka
-{
-    namespace pltf
-    {
-        //#############################################################################
-        //! The CPU device platform. Its default constructor is deleted; the trait specializations in this header use the type only as a template argument.
-        class PltfCpu :
-            public concepts::Implements<ConceptPltf, PltfCpu>
-        {
-        public:
-            //-----------------------------------------------------------------------------
-            ALPAKA_FN_HOST PltfCpu() = delete;
-        };
-    }
-
-    namespace dev
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CPU platform device type trait specialization: the device type of pltf::PltfCpu is dev::DevCpu.
-            template<>
-            struct DevType<
-                pltf::PltfCpu>
-            {
-                using type = dev::DevCpu;
-            };
-        }
-    }
-    namespace pltf
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CPU platform device count get trait specialization. It always reports exactly one device.
-            template<>
-            struct GetDevCount<
-                pltf::PltfCpu>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto getDevCount()
-                -> std::size_t
-                {
-                    ALPAKA_DEBUG_FULL_LOG_SCOPE;
-
-                    return 1;
-                }
-            };
-
-            //#############################################################################
-            //! The CPU platform device get trait specialization. Throws std::runtime_error when devIdx is not below the device count; otherwise returns a dev::DevCpu initialized from an empty initializer list.
-            template<>
-            struct GetDevByIdx<
-                pltf::PltfCpu>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto getDevByIdx(
-                    std::size_t const & devIdx)
-                -> dev::DevCpu
-                {
-                    ALPAKA_DEBUG_FULL_LOG_SCOPE;
-
-                    std::size_t const devCount(pltf::getDevCount<pltf::PltfCpu>());
-                    if(devIdx >= devCount)
-                    {
-                        std::stringstream ssErr;
-                        ssErr << "Unable to return device handle for CPU device with index " << devIdx << " because there are only " << devCount << " devices!";
-                        throw std::runtime_error(ssErr.str());
-                    }
-
-                    return {};
-                }
-            };
-        }
-    }
-}
diff --git a/thirdParty/alpaka/include/alpaka/pltf/PltfCudaRt.hpp b/thirdParty/alpaka/include/alpaka/pltf/PltfCudaRt.hpp
deleted file mode 100644
index 3fa715bf6f..0000000000
--- a/thirdParty/alpaka/include/alpaka/pltf/PltfCudaRt.hpp
+++ /dev/null
@@ -1,254 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz, René Widera
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-#include <alpaka/core/Concepts.hpp>
-
-#if !BOOST_LANG_CUDA
-    #error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
-#endif
-
-#include <alpaka/dev/Traits.hpp>
-#include <alpaka/dev/DevCudaRt.hpp>
-
-#include <alpaka/core/Cuda.hpp>
-
-#include <iostream>
-#include <sstream>
-#include <stdexcept>
-
-namespace alpaka
-{
-    namespace pltf
-    {
-        //#############################################################################
-        //! The CUDA RT device manager. Its default constructor is deleted; the trait specializations in this header use the type only as a template argument.
-        class PltfCudaRt :
-            public concepts::Implements<ConceptPltf, PltfCudaRt>
-        {
-        public:
-            //-----------------------------------------------------------------------------
-            ALPAKA_FN_HOST PltfCudaRt() = delete;
-        };
-    }
-
-    namespace dev
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CUDA RT device manager device type trait specialization: the device type of pltf::PltfCudaRt is dev::DevCudaRt.
-            template<>
-            struct DevType<
-                pltf::PltfCudaRt>
-            {
-                using type = dev::DevCudaRt;
-            };
-        }
-    }
-    namespace pltf
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CUDA RT platform device count get trait specialization. Reports 0 devices when cudaGetDeviceCount returns an error.
-            template<>
-            struct GetDevCount<
-                pltf::PltfCudaRt>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto getDevCount()
-                -> std::size_t
-                {
-                    ALPAKA_DEBUG_FULL_LOG_SCOPE;
-
-                    int iNumDevices(0);
-                    cudaError_t error = cudaGetDeviceCount(&iNumDevices);
-                    if(error != cudaSuccess)
-                        iNumDevices = 0;
-
-                    return static_cast<std::size_t>(iNumDevices);
-                }
-            };
-
-            //#############################################################################
-            //! The CUDA RT platform device get trait specialization.
-            template<>
-            struct GetDevByIdx<
-                pltf::PltfCudaRt>
-            {
-                //! \return A dev::DevCudaRt whose m_iDevice is devIdx. Throws std::runtime_error if devIdx is not below the device count or if isDevUsable(devIdx) is false.
-                ALPAKA_FN_HOST static auto getDevByIdx(
-                    std::size_t const & devIdx)
-                -> dev::DevCudaRt
-                {
-                    ALPAKA_DEBUG_FULL_LOG_SCOPE;
-
-                    dev::DevCudaRt dev;
-
-                    std::size_t const devCount(pltf::getDevCount<pltf::PltfCudaRt>());
-                    if(devIdx >= devCount)
-                    {
-                        std::stringstream ssErr;
-                        ssErr << "Unable to return device handle for device " << devIdx << ". There are only " << devCount << " CUDA devices!";
-                        throw std::runtime_error(ssErr.str());
-                    }
-
-                    if(isDevUsable(devIdx))
-                    {
-                        dev.m_iDevice = static_cast<int>(devIdx);
-
-                        // Log this device: all properties at ALPAKA_DEBUG_FULL, only the function name and device name at ALPAKA_DEBUG_MINIMAL.
-    #if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
-                        cudaDeviceProp devProp;
-                        ALPAKA_CUDA_RT_CHECK(cudaGetDeviceProperties(&devProp, dev.m_iDevice));
-    #endif
-    #if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                        printDeviceProperties(devProp);
-    #elif ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
-                        std::cout << __func__ << devProp.name << std::endl;
-    #endif
-                    }
-                    else
-                    {
-                        std::stringstream ssErr;
-                        ssErr << "Unable to return device handle for device " << devIdx << ". It is not accessible!";
-                        throw std::runtime_error(ssErr.str());
-                    }
-
-                    return dev;
-                }
-
-            private:
-                //-----------------------------------------------------------------------------
-                //! \return true if cudaSetDevice and a trial cudaStreamCreate both succeed for iDevice. NOTE(review): the failure branch hands rc to ALPAKA_CUDA_RT_CHECK before returning false - confirm whether that macro throws, in which case false is never returned.
-                ALPAKA_FN_HOST static auto isDevUsable(
-                    std::size_t iDevice)
-                -> bool
-                {
-                    cudaError rc(cudaSetDevice(static_cast<int>(iDevice)));
-
-                    cudaStream_t queue = {};
-                    // Create a dummy queue to check if the device is already used by another process.
-                    // cudaSetDevice never returns an error if another process already uses the selected device and gpu compute mode is set "process exclusive".
-                    // \TODO: Check if this workaround is needed!
-                    if(rc == cudaSuccess)
-                    {
-                        rc = cudaStreamCreate(&queue);
-                    }
-
-                    if(rc == cudaSuccess)
-                    {
-                        // Destroy the dummy queue.
-                        ALPAKA_CUDA_RT_CHECK(
-                            cudaStreamDestroy(
-                                queue));
-                        return true;
-                    }
-                    else
-                    {
-                        // Hand the failing status (from cudaSetDevice or cudaStreamCreate) to the error check macro.
-                        ALPAKA_CUDA_RT_CHECK(
-                            rc);
-                        // Reset the Error state.
-                        cudaGetLastError();
-
-                        return false;
-                    }
-                }
-
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                //-----------------------------------------------------------------------------
-                //! Prints the fields of the given cudaDeviceProp to std::cout, one "name: value" line per field (global memory scaled to MiB, shared and constant memory scaled to KiB).
-                ALPAKA_FN_HOST static auto printDeviceProperties(
-                    cudaDeviceProp const & devProp)
-                -> void
-                {
-                    ALPAKA_DEBUG_FULL_LOG_SCOPE;
-
-                    std::size_t const kiB(1024);
-                    std::size_t const miB(kiB * kiB);
-                    std::cout << "name: " << devProp.name << std::endl;
-                    std::cout << "totalGlobalMem: " << devProp.totalGlobalMem/miB << " MiB" << std::endl;
-                    std::cout << "sharedMemPerBlock: " << devProp.sharedMemPerBlock/kiB << " KiB" << std::endl;
-                    std::cout << "regsPerBlock: " << devProp.regsPerBlock << std::endl;
-                    std::cout << "warpSize: " << devProp.warpSize << std::endl;
-                    std::cout << "memPitch: " << devProp.memPitch << " B" << std::endl;
-                    std::cout << "maxThreadsPerBlock: " << devProp.maxThreadsPerBlock << std::endl;
-                    std::cout << "maxThreadsDim[3]: (" << devProp.maxThreadsDim[0] << ", " << devProp.maxThreadsDim[1] << ", " << devProp.maxThreadsDim[2] << ")" << std::endl;
-                    std::cout << "maxGridSize[3]: (" << devProp.maxGridSize[0] << ", " << devProp.maxGridSize[1] << ", " << devProp.maxGridSize[2] << ")" << std::endl;
-                    std::cout << "clockRate: " << devProp.clockRate << " kHz" << std::endl;
-                    std::cout << "totalConstMem: " << devProp.totalConstMem/kiB << " KiB" << std::endl;
-                    std::cout << "major: " << devProp.major << std::endl;
-                    std::cout << "minor: " << devProp.minor << std::endl;
-                    std::cout << "textureAlignment: " << devProp.textureAlignment << std::endl;
-                    std::cout << "texturePitchAlignment: " << devProp.texturePitchAlignment << std::endl;
-                    //std::cout << "deviceOverlap: " << devProp.deviceOverlap << std::endl;    // Deprecated
-                    std::cout << "multiProcessorCount: " << devProp.multiProcessorCount << std::endl;
-                    std::cout << "kernelExecTimeoutEnabled: " << devProp.kernelExecTimeoutEnabled << std::endl;
-                    std::cout << "integrated: " << devProp.integrated << std::endl;
-                    std::cout << "canMapHostMemory: " << devProp.canMapHostMemory << std::endl;
-                    std::cout << "computeMode: " << devProp.computeMode << std::endl;
-                    std::cout << "maxTexture1D: " << devProp.maxTexture1D << std::endl;
-                    std::cout << "maxTexture1DLinear: " << devProp.maxTexture1DLinear << std::endl;
-                    std::cout << "maxTexture2D[2]: " << devProp.maxTexture2D[0] << "x" << devProp.maxTexture2D[1] << std::endl;
-                    std::cout << "maxTexture2DLinear[3]: " << devProp.maxTexture2DLinear[0] << "x" << devProp.maxTexture2DLinear[1] << "x" << devProp.maxTexture2DLinear[2] << std::endl;
-                    std::cout << "maxTexture2DGather[2]: " << devProp.maxTexture2DGather[0] << "x" << devProp.maxTexture2DGather[1] << std::endl;
-                    std::cout << "maxTexture3D[3]: " << devProp.maxTexture3D[0] << "x" << devProp.maxTexture3D[1] << "x" << devProp.maxTexture3D[2] << std::endl;
-                    std::cout << "maxTextureCubemap: " << devProp.maxTextureCubemap << std::endl;
-                    std::cout << "maxTexture1DLayered[2]: " << devProp.maxTexture1DLayered[0] << "x" << devProp.maxTexture1DLayered[1] << std::endl;
-                    std::cout << "maxTexture2DLayered[3]: " << devProp.maxTexture2DLayered[0] << "x" << devProp.maxTexture2DLayered[1] << "x" << devProp.maxTexture2DLayered[2] << std::endl;
-                    std::cout << "maxTextureCubemapLayered[2]: " << devProp.maxTextureCubemapLayered[0] << "x" << devProp.maxTextureCubemapLayered[1] << std::endl;
-                    std::cout << "maxSurface1D: " << devProp.maxSurface1D << std::endl;
-                    std::cout << "maxSurface2D[2]: " << devProp.maxSurface2D[0] << "x" << devProp.maxSurface2D[1] << std::endl;
-                    std::cout << "maxSurface3D[3]: " << devProp.maxSurface3D[0] << "x" << devProp.maxSurface3D[1] << "x" << devProp.maxSurface3D[2] << std::endl;
-                    std::cout << "maxSurface1DLayered[2]: " << devProp.maxSurface1DLayered[0] << "x" << devProp.maxSurface1DLayered[1] << std::endl;
-                    std::cout << "maxSurface2DLayered[3]: " << devProp.maxSurface2DLayered[0] << "x" << devProp.maxSurface2DLayered[1] << "x" << devProp.maxSurface2DLayered[2] << std::endl;
-                    std::cout << "maxSurfaceCubemap: " << devProp.maxSurfaceCubemap << std::endl;
-                    std::cout << "maxSurfaceCubemapLayered[2]: " << devProp.maxSurfaceCubemapLayered[0] << "x" << devProp.maxSurfaceCubemapLayered[1] << std::endl;
-                    std::cout << "surfaceAlignment: " << devProp.surfaceAlignment << std::endl;
-                    std::cout << "concurrentKernels: " << devProp.concurrentKernels << std::endl;
-                    std::cout << "ECCEnabled: " << devProp.ECCEnabled << std::endl;
-                    std::cout << "pciBusID: " << devProp.pciBusID << std::endl;
-                    std::cout << "pciDeviceID: " << devProp.pciDeviceID << std::endl;
-                    std::cout << "pciDomainID: " << devProp.pciDomainID << std::endl;
-                    std::cout << "tccDriver: " << devProp.tccDriver << std::endl;
-                    std::cout << "asyncEngineCount: " << devProp.asyncEngineCount << std::endl;
-                    std::cout << "unifiedAddressing: " << devProp.unifiedAddressing << std::endl;
-                    std::cout << "memoryClockRate: " << devProp.memoryClockRate << " kHz" << std::endl;
-                    std::cout << "memoryBusWidth: " << devProp.memoryBusWidth << " b" << std::endl;
-                    std::cout << "l2CacheSize: " << devProp.l2CacheSize << " B" << std::endl;
-                    std::cout << "maxThreadsPerMultiProcessor: " << devProp.maxThreadsPerMultiProcessor << std::endl;
-                    std::cout << "streamPrioritiesSupported: " << devProp.streamPrioritiesSupported << std::endl;
-                    std::cout << "globalL1CacheSupported: " << devProp.globalL1CacheSupported << std::endl;
-                    std::cout << "localL1CacheSupported: " << devProp.localL1CacheSupported << std::endl;
-                    std::cout << "sharedMemPerMultiprocessor: " << devProp.sharedMemPerMultiprocessor << std::endl;
-                    std::cout << "regsPerMultiprocessor: " << devProp.regsPerMultiprocessor << std::endl;
-                    std::cout << "managedMemory: " << devProp.managedMemory << std::endl;
-                    std::cout << "isMultiGpuBoard: " << devProp.isMultiGpuBoard << std::endl;
-                    std::cout << "multiGpuBoardGroupID: " << devProp.multiGpuBoardGroupID << std::endl;
-                    std::cout << "singleToDoublePrecisionPerfRatio: " << devProp.singleToDoublePrecisionPerfRatio << std::endl;
-                    std::cout << "pageableMemoryAccess: " << devProp.pageableMemoryAccess << std::endl;
-                    std::cout << "concurrentManagedAccess: " << devProp.concurrentManagedAccess << std::endl;
-                    std::cout << "computePreemptionSupported: " << devProp.computePreemptionSupported << std::endl;
-                    std::cout << "canUseHostPointerForRegisteredMem: " << devProp.canUseHostPointerForRegisteredMem << std::endl;
-                    std::cout << "cooperativeLaunch: " << devProp.cooperativeLaunch << std::endl;
-                    std::cout << "cooperativeMultiDeviceLaunch: " << devProp.cooperativeMultiDeviceLaunch << std::endl;
-                }
-#endif
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/alpaka/include/alpaka/pltf/PltfHipRt.hpp b/thirdParty/alpaka/include/alpaka/pltf/PltfHipRt.hpp
deleted file mode 100644
index ec9abf1393..0000000000
--- a/thirdParty/alpaka/include/alpaka/pltf/PltfHipRt.hpp
+++ /dev/null
@@ -1,236 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz, Matthias Werner, René Widera
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-#include <alpaka/core/Concepts.hpp>
-
-#if !BOOST_LANG_HIP
-    #error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
-#endif
-
-#include <alpaka/dev/Traits.hpp>
-#include <alpaka/dev/DevHipRt.hpp>
-
-#include <alpaka/core/Hip.hpp>
-
-#include <iostream>
-#include <sstream>
-#include <stdexcept>
-
-namespace alpaka
-{
-    namespace pltf
-    {
-        //#############################################################################
-        //! The HIP RT device manager. The trait specializations in this header use the type only as a template argument.
-        class PltfHipRt :
-            public concepts::Implements<ConceptPltf, PltfHipRt>
-        {
-        public:
-            //-----------------------------------------------------------------------------
-            //! Default constructor (deleted): the type cannot be default-constructed.
-            ALPAKA_FN_HOST PltfHipRt() = delete;
-        };
-    }
-
-    namespace dev
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The HIP RT device manager device type trait specialization: the device type of pltf::PltfHipRt is dev::DevHipRt.
-            template<>
-            struct DevType<
-                pltf::PltfHipRt>
-            {
-                using type = dev::DevHipRt;
-            };
-        }
-    }
-    namespace pltf
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The HIP RT platform device count get trait specialization. Reports 0 devices when hipGetDeviceCount returns an error.
-            template<>
-            struct GetDevCount<
-                pltf::PltfHipRt>
-            {
-                //-----------------------------------------------------------------------------
-
-                ALPAKA_FN_HOST static auto getDevCount()
-                -> std::size_t
-                {
-                    ALPAKA_DEBUG_FULL_LOG_SCOPE;
-
-                    int iNumDevices(0);
-                    hipError_t error = hipGetDeviceCount(&iNumDevices);
-                    if(error != hipSuccess)
-                        iNumDevices = 0;
-                    return static_cast<std::size_t>(iNumDevices);
-                }
-            };
-
-            //#############################################################################
-            //! The HIP RT platform device get trait specialization.
-            template<>
-            struct GetDevByIdx<
-                pltf::PltfHipRt>
-            {
-                //-----------------------------------------------------------------------------
-
-                ALPAKA_FN_HOST static auto getDevByIdx(
-                    std::size_t const & devIdx)
-                -> dev::DevHipRt
-                {
-                    ALPAKA_DEBUG_FULL_LOG_SCOPE;
-
-                    dev::DevHipRt dev;
-
-                    std::size_t const devCount(pltf::getDevCount<pltf::PltfHipRt>());
-                    if(devIdx >= devCount)
-                    {
-                        std::stringstream ssErr;
-                        ssErr << "Unable to return device handle for device " << devIdx << ". There are only " << devCount << " HIP devices!";
-                        throw std::runtime_error(ssErr.str());
-                    }
-
-                    if(isDevUsable(devIdx))
-                    {
-                        dev.m_iDevice = static_cast<int>(devIdx);
-
-                        // Log this device.
-    #if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
-                        hipDeviceProp_t devProp;
-                        ALPAKA_HIP_RT_CHECK(hipGetDeviceProperties(&devProp, dev.m_iDevice));
-    #endif
-    #if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                        printDeviceProperties(devProp);
-    #elif ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
-                        std::cout << __func__ << devProp.name << std::endl;
-    #endif
-                    }
-                    else
-                    {
-                        std::stringstream ssErr;
-                        ssErr << "Unable to return device handle for device " << devIdx << ". It is not accessible!";
-                        throw std::runtime_error(ssErr.str());
-                    }
-
-                    return dev;
-                }
-
-            private:
-                //-----------------------------------------------------------------------------
-                //! \return If the device is usable.
-                ALPAKA_FN_HOST static auto isDevUsable(
-                    std::size_t iDevice)
-                -> bool
-                {
-                    hipError_t rc(hipSetDevice(static_cast<int>(iDevice)));
-
-                    hipStream_t queue = {};
-                    // Create a dummy queue to check if the device is already used by an other process.
-                    // hipSetDevice never returns an error if another process already uses the selected device and gpu compute mode is set "process exclusive".
-                    // \TODO: Check if this workaround is needed!
-                    if(rc == hipSuccess)
-                    {
-                        rc = hipStreamCreate(&queue);
-                    }
-
-                    if(rc == hipSuccess)
-                    {
-                        // Destroy the dummy queue.
-                        ALPAKA_HIP_RT_CHECK(
-                            hipStreamDestroy(
-                                queue));
-                        return true;
-                    }
-                    else
-                    {
-                        // Return the previous error from hipStreamCreate.
-                        ALPAKA_HIP_RT_CHECK(
-                            rc);
-                        // Reset the Error state.
-                        hipGetLastError();
-
-                        return false;
-                    }
-                }
-
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                //-----------------------------------------------------------------------------
-                //! Prints all the device properties to std::cout.
-                ALPAKA_FN_HOST static auto printDeviceProperties(
-                    hipDeviceProp_t const & devProp)
-                -> void
-                {
-                    ALPAKA_DEBUG_FULL_LOG_SCOPE;
-
-                    std::size_t const kiB(1024);
-                    std::size_t const miB(kiB * kiB);
-                    std::cout << "name: " << devProp.name << std::endl;
-                    std::cout << "totalGlobalMem: " << devProp.totalGlobalMem/miB << " MiB" << std::endl;
-                    std::cout << "sharedMemPerBlock: " << devProp.sharedMemPerBlock/kiB << " KiB" << std::endl;
-                    std::cout << "regsPerBlock: " << devProp.regsPerBlock << std::endl;
-                    std::cout << "warpSize: " << devProp.warpSize << std::endl;
-                    std::cout << "maxThreadsPerBlock: " << devProp.maxThreadsPerBlock << std::endl;
-                    std::cout << "maxThreadsDim[3]: (" << devProp.maxThreadsDim[0] << ", " << devProp.maxThreadsDim[1] << ", " << devProp.maxThreadsDim[2] << ")" << std::endl;
-                    std::cout << "maxGridSize[3]: (" << devProp.maxGridSize[0] << ", " << devProp.maxGridSize[1] << ", " << devProp.maxGridSize[2] << ")" << std::endl;
-                    std::cout << "clockRate: " << devProp.clockRate << " kHz" << std::endl;
-                    std::cout << "memoryClockRate: " << devProp.memoryClockRate << " kHz" << std::endl;
-                    std::cout << "memoryBusWidth: " << devProp.memoryBusWidth << " b" << std::endl;
-                    std::cout << "totalConstMem: " << devProp.totalConstMem/kiB << " KiB" << std::endl;
-                    std::cout << "major: " << devProp.major << std::endl;
-                    std::cout << "minor: " << devProp.minor << std::endl;
-                    std::cout << "multiProcessorCount: " << devProp.multiProcessorCount << std::endl;
-                    std::cout << "l2CacheSize: " << devProp.l2CacheSize << " B" << std::endl;
-                    std::cout << "maxThreadsPerMultiProcessor: " << devProp.maxThreadsPerMultiProcessor << std::endl;
-                    std::cout << "computeMode: " << devProp.computeMode << std::endl;
-                    std::cout << "clockInstructionRate: " << devProp.clockInstructionRate << "kHz" << std::endl;
-                    std::cout << "arch: " << std::endl;
-                    std::cout << "    hasGlobalInt32Atomics: " << devProp.arch.hasGlobalInt32Atomics << std::endl;
-                    std::cout << "    hasGlobalFloatAtomicExch: " << devProp.arch.hasGlobalFloatAtomicExch << std::endl;
-                    std::cout << "    hasSharedInt32Atomics: " << devProp.arch.hasSharedInt32Atomics << std::endl;
-                    std::cout << "    hasSharedFloatAtomicExch: " << devProp.arch.hasSharedFloatAtomicExch << std::endl;
-                    std::cout << "    hasFloatAtomicAdd: " << devProp.arch.hasFloatAtomicAdd << std::endl;
-                    std::cout << "    hasGlobalInt64Atomics: " << devProp.arch.hasGlobalInt64Atomics << std::endl;
-                    std::cout << "    hasSharedInt64Atomics: " << devProp.arch.hasSharedInt64Atomics << std::endl;
-                    std::cout << "    hasDoubles: " << devProp.arch.hasDoubles << std::endl;
-                    std::cout << "    hasWarpVote: " << devProp.arch.hasWarpVote << std::endl;
-                    std::cout << "    hasWarpBallot: " << devProp.arch.hasWarpBallot << std::endl;
-                    std::cout << "    hasWarpShuffle: " << devProp.arch.hasWarpShuffle << std::endl;
-                    std::cout << "    hasFunnelShift: " << devProp.arch.hasFunnelShift << std::endl;
-                    std::cout << "    hasThreadFenceSystem: " << devProp.arch.hasThreadFenceSystem << std::endl;
-                    std::cout << "    hasSyncThreadsExt: " << devProp.arch.hasSyncThreadsExt << std::endl;
-                    std::cout << "    hasSurfaceFuncs: " << devProp.arch.hasSurfaceFuncs << std::endl;
-                    std::cout << "    has3dGrid: " << devProp.arch.has3dGrid << std::endl;
-                    std::cout << "    hasDynamicParallelism: " << devProp.arch.hasDynamicParallelism << std::endl;
-                    std::cout << "concurrentKernels: " << devProp.concurrentKernels << std::endl;
-                    std::cout << "pciDomainID: " << devProp.pciDomainID << std::endl;
-                    std::cout << "pciBusID: " << devProp.pciBusID << std::endl;
-                    std::cout << "pciDeviceID: " << devProp.pciDeviceID << std::endl;
-                    std::cout << "maxSharedMemoryPerMultiProcessor: " << devProp.maxSharedMemoryPerMultiProcessor/kiB << " KiB" << std::endl;
-                    std::cout << "isMultiGpuBoard: " << devProp.isMultiGpuBoard << std::endl;
-                    std::cout << "canMapHostMemory: " << devProp.canMapHostMemory << std::endl;
-                    std::cout << "gcnArch: " << devProp.gcnArch << std::endl;
-                    std::cout << "integrated: " << devProp.integrated << std::endl;
-                }
-#endif
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/alpaka/include/alpaka/pltf/Traits.hpp b/thirdParty/alpaka/include/alpaka/pltf/Traits.hpp
deleted file mode 100644
index 3c8a8a8207..0000000000
--- a/thirdParty/alpaka/include/alpaka/pltf/Traits.hpp
+++ /dev/null
@@ -1,133 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#include <alpaka/core/Common.hpp>
-#include <alpaka/dev/Traits.hpp>
-
-#include <alpaka/core/Concepts.hpp>
-#include <alpaka/queue/Traits.hpp>
-#include <alpaka/dev/Traits.hpp>
-
-#include <boost/config.hpp>
-
-#include <vector>
-#include <type_traits>
-
-namespace alpaka
-{
-    //-----------------------------------------------------------------------------
-    //! The platform specifics.
-    namespace pltf
-    {
-        struct ConceptPltf;
-
-        //-----------------------------------------------------------------------------
-        //! The platform traits.
-        namespace traits
-        {
-            //#############################################################################
-            //! The platform type trait.
-            template<
-                typename T,
-                typename TSfinae = void>
-            struct PltfType;
-
-            //#############################################################################
-            //! The device count get trait.
-            template<
-                typename T,
-                typename TSfinae = void>
-            struct GetDevCount;
-
-            //#############################################################################
-            //! The device get trait.
-            template<
-                typename T,
-                typename TSfinae = void>
-            struct GetDevByIdx;
-        }
-
-        //#############################################################################
-        //! The platform type trait alias template to remove the ::type.
-        template<
-            typename T>
-        using Pltf = typename traits::PltfType<T>::type;
-
-        //-----------------------------------------------------------------------------
-        //! \return The device identified by its index.
-        template<
-            typename TPltf>
-        ALPAKA_FN_HOST auto getDevCount()
-#ifdef BOOST_NO_CXX14_RETURN_TYPE_DEDUCTION
-        -> decltype(traits::GetDevCount<TPltf>::getDevCount())
-#endif
-        {
-            return
-                traits::GetDevCount<
-                    TPltf>
-                ::getDevCount();
-        }
-
-        //-----------------------------------------------------------------------------
-        //! \return The device identified by its index.
-        template<
-            typename TPltf>
-        ALPAKA_FN_HOST auto getDevByIdx(
-            std::size_t const & devIdx)
-#ifdef BOOST_NO_CXX14_RETURN_TYPE_DEDUCTION
-        -> decltype(traits::GetDevByIdx<TPltf>::getDevByIdx(devIdx))
-#endif
-        {
-            return
-                traits::GetDevByIdx<
-                    TPltf>
-                ::getDevByIdx(
-                    devIdx);
-        }
-
-        //-----------------------------------------------------------------------------
-        //! \return All the devices available on this accelerator.
-        template<
-            typename TPltf>
-        ALPAKA_FN_HOST auto getDevs()
-        -> std::vector<dev::Dev<TPltf>>
-        {
-            std::vector<dev::Dev<TPltf>> devs;
-
-            std::size_t const devCount(getDevCount<TPltf>());
-            for(std::size_t devIdx(0); devIdx < devCount; ++devIdx)
-            {
-                devs.push_back(getDevByIdx<TPltf>(devIdx));
-            }
-
-            return devs;
-        }
-    }
-    namespace queue
-    {
-        namespace traits
-        {
-            template<
-                typename TPltf,
-                typename TProperty>
-            struct QueueType<
-                TPltf,
-                TProperty,
-                typename std::enable_if<concepts::ImplementsConcept<pltf::ConceptPltf, TPltf>::value>::type
-            >
-            {
-                using type = typename QueueType<
-                    typename dev::traits::DevType<TPltf>::type,
-                    TProperty>::type;
-            };
-        }
-    }
-}
diff --git a/thirdParty/alpaka/include/alpaka/queue/Properties.hpp b/thirdParty/alpaka/include/alpaka/queue/Properties.hpp
deleted file mode 100644
index 67da1cac30..0000000000
--- a/thirdParty/alpaka/include/alpaka/queue/Properties.hpp
+++ /dev/null
@@ -1,31 +0,0 @@
-/* Copyright 2019 Rene Widera
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-namespace alpaka
-{
-    namespace queue
-    {
-        //-----------------------------------------------------------------------------
-        //! Properties to define queue behavior
-        namespace property
-        {
-            //#############################################################################
-            //! The caller is waiting until the enqueued task is finished
-            struct Blocking{};
-
-            //#############################################################################
-            //! The caller is NOT waiting until the enqueued task is finished
-            struct NonBlocking{};
-        }
-
-        using namespace property;
-    }
-}
diff --git a/thirdParty/alpaka/include/alpaka/queue/QueueCpuBlocking.hpp b/thirdParty/alpaka/include/alpaka/queue/QueueCpuBlocking.hpp
deleted file mode 100644
index fe02591d7f..0000000000
--- a/thirdParty/alpaka/include/alpaka/queue/QueueCpuBlocking.hpp
+++ /dev/null
@@ -1,240 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Matthias Werner
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#include <alpaka/core/Unused.hpp>
-#include <alpaka/dev/DevCpu.hpp>
-#include <alpaka/queue/cpu/ICpuQueue.hpp>
-
-#include <alpaka/dev/Traits.hpp>
-#include <alpaka/event/Traits.hpp>
-#include <alpaka/queue/Traits.hpp>
-#include <alpaka/wait/Traits.hpp>
-
-#include <atomic>
-#include <mutex>
-
-namespace alpaka
-{
-    namespace event
-    {
-        class EventCpu;
-    }
-}
-
-namespace alpaka
-{
-    namespace queue
-    {
-        namespace cpu
-        {
-            namespace detail
-            {
-#if BOOST_COMP_CLANG
-    // avoid diagnostic warning: "has no out-of-line virtual method definitions; its vtable will be emitted in every translation unit [-Werror,-Wweak-vtables]"
-    // https://stackoverflow.com/a/29288300
-    #pragma clang diagnostic push
-    #pragma clang diagnostic ignored "-Wweak-vtables"
-#endif
-                //#############################################################################
-                //! The CPU device queue implementation.
-                class QueueCpuBlockingImpl final : public cpu::ICpuQueue
-#if BOOST_COMP_CLANG
-    #pragma clang diagnostic pop
-#endif
-                {
-                public:
-                    //-----------------------------------------------------------------------------
-                    QueueCpuBlockingImpl(
-                        dev::DevCpu const & dev) noexcept :
-                            m_dev(dev),
-                            m_bCurrentlyExecutingTask(false)
-                    {}
-                    //-----------------------------------------------------------------------------
-                    QueueCpuBlockingImpl(QueueCpuBlockingImpl const &) = delete;
-                    //-----------------------------------------------------------------------------
-                    QueueCpuBlockingImpl(QueueCpuBlockingImpl &&) = delete;
-                    //-----------------------------------------------------------------------------
-                    auto operator=(QueueCpuBlockingImpl const &) -> QueueCpuBlockingImpl & = delete;
-                    //-----------------------------------------------------------------------------
-                    auto operator=(QueueCpuBlockingImpl &&) -> QueueCpuBlockingImpl & = delete;
-
-                    //-----------------------------------------------------------------------------
-                    void enqueue(event::EventCpu & ev) final
-                    {
-                        queue::enqueue(*this, ev);
-                    }
-
-                    //-----------------------------------------------------------------------------
-                    void wait(event::EventCpu const & ev) final
-                    {
-                        wait::wait(*this, ev);
-                    }
-
-                public:
-                    dev::DevCpu const m_dev;            //!< The device this queue is bound to.
-                    std::mutex mutable m_mutex;
-                    std::atomic<bool> m_bCurrentlyExecutingTask;
-                };
-            }
-        }
-
-        //#############################################################################
-        //! The CPU device queue.
-        class QueueCpuBlocking final : public concepts::Implements<wait::ConceptCurrentThreadWaitFor, QueueCpuBlocking>
-        {
-        public:
-            //-----------------------------------------------------------------------------
-            QueueCpuBlocking(
-                dev::DevCpu const & dev) :
-                    m_spQueueImpl(std::make_shared<cpu::detail::QueueCpuBlockingImpl>(dev))
-            {
-                dev.m_spDevCpuImpl->RegisterQueue(m_spQueueImpl);
-            }
-            //-----------------------------------------------------------------------------
-            QueueCpuBlocking(QueueCpuBlocking const &) = default;
-            //-----------------------------------------------------------------------------
-            QueueCpuBlocking(QueueCpuBlocking &&) = default;
-            //-----------------------------------------------------------------------------
-            auto operator=(QueueCpuBlocking const &) -> QueueCpuBlocking & = default;
-            //-----------------------------------------------------------------------------
-            auto operator=(QueueCpuBlocking &&) -> QueueCpuBlocking & = default;
-            //-----------------------------------------------------------------------------
-            auto operator==(QueueCpuBlocking const & rhs) const
-            -> bool
-            {
-                return (m_spQueueImpl == rhs.m_spQueueImpl);
-            }
-            //-----------------------------------------------------------------------------
-            auto operator!=(QueueCpuBlocking const & rhs) const
-            -> bool
-            {
-                return !((*this) == rhs);
-            }
-            //-----------------------------------------------------------------------------
-            ~QueueCpuBlocking() = default;
-
-        public:
-            std::shared_ptr<cpu::detail::QueueCpuBlockingImpl> m_spQueueImpl;
-        };
-    }
-
-    namespace dev
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CPU blocking device queue device type trait specialization.
-            template<>
-            struct DevType<
-                queue::QueueCpuBlocking>
-            {
-                using type = dev::DevCpu;
-            };
-            //#############################################################################
-            //! The CPU blocking device queue device get trait specialization.
-            template<>
-            struct GetDev<
-                queue::QueueCpuBlocking>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto getDev(
-                    queue::QueueCpuBlocking const & queue)
-                -> dev::DevCpu
-                {
-                    return queue.m_spQueueImpl->m_dev;
-                }
-            };
-        }
-    }
-    namespace event
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CPU blocking device queue event type trait specialization.
-            template<>
-            struct EventType<
-                queue::QueueCpuBlocking>
-            {
-                using type = event::EventCpu;
-            };
-        }
-    }
-    namespace queue
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CPU blocking device queue enqueue trait specialization.
-            //! This default implementation for all tasks directly invokes the function call operator of the task.
-            template<
-                typename TTask>
-            struct Enqueue<
-                queue::QueueCpuBlocking,
-                TTask>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto enqueue(
-                    queue::QueueCpuBlocking & queue,
-                    TTask const & task)
-                -> void
-                {
-                    std::lock_guard<std::mutex> lk(queue.m_spQueueImpl->m_mutex);
-
-                    queue.m_spQueueImpl->m_bCurrentlyExecutingTask = true;
-
-                    task();
-
-                    queue.m_spQueueImpl->m_bCurrentlyExecutingTask = false;
-                }
-            };
-            //#############################################################################
-            //! The CPU blocking device queue test trait specialization.
-            template<>
-            struct Empty<
-                queue::QueueCpuBlocking>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto empty(
-                    queue::QueueCpuBlocking const & queue)
-                -> bool
-                {
-                    return !queue.m_spQueueImpl->m_bCurrentlyExecutingTask;
-                }
-            };
-        }
-    }
-
-    namespace wait
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CPU blocking device queue thread wait trait specialization.
-            //!
-            //! Blocks execution of the calling thread until the queue has finished processing all previously requested tasks (kernels, data copies, ...)
-            template<>
-            struct CurrentThreadWaitFor<
-                queue::QueueCpuBlocking>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto currentThreadWaitFor(
-                    queue::QueueCpuBlocking const & queue)
-                -> void
-                {
-                    std::lock_guard<std::mutex> lk(queue.m_spQueueImpl->m_mutex);
-                }
-            };
-        }
-    }
-}
-
-#include <alpaka/event/EventCpu.hpp>
diff --git a/thirdParty/alpaka/include/alpaka/queue/QueueCpuNonBlocking.hpp b/thirdParty/alpaka/include/alpaka/queue/QueueCpuNonBlocking.hpp
deleted file mode 100644
index 94030569d1..0000000000
--- a/thirdParty/alpaka/include/alpaka/queue/QueueCpuNonBlocking.hpp
+++ /dev/null
@@ -1,234 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz, Matthias Werner
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#include <alpaka/dev/DevCpu.hpp>
-#include <alpaka/queue/cpu/ICpuQueue.hpp>
-
-#include <alpaka/dev/Traits.hpp>
-#include <alpaka/event/Traits.hpp>
-#include <alpaka/queue/Traits.hpp>
-#include <alpaka/wait/Traits.hpp>
-
-#include <alpaka/core/ConcurrentExecPool.hpp>
-
-#include <type_traits>
-#include <thread>
-#include <mutex>
-#include <future>
-
-namespace alpaka
-{
-    namespace event
-    {
-        class EventCpu;
-    }
-}
-
-namespace alpaka
-{
-    namespace queue
-    {
-        namespace cpu
-        {
-            namespace detail
-            {
-#if BOOST_COMP_CLANG
-    // avoid diagnostic warning: "has no out-of-line virtual method definitions; its vtable will be emitted in every translation unit [-Werror,-Wweak-vtables]"
-    // https://stackoverflow.com/a/29288300
-    #pragma clang diagnostic push
-    #pragma clang diagnostic ignored "-Wweak-vtables"
-#endif
-                //#############################################################################
-                //! The CPU device queue implementation.
-                class QueueCpuNonBlockingImpl final : public cpu::ICpuQueue
-#if BOOST_COMP_CLANG
-    #pragma clang diagnostic pop
-#endif
-                {
-                private:
-                    //#############################################################################
-                    using ThreadPool = alpaka::core::detail::ConcurrentExecPool<
-                        std::size_t,
-                        std::thread,                // The concurrent execution type.
-                        std::promise,               // The promise type.
-                        void,                       // The type yielding the current concurrent execution.
-                        std::mutex,                 // The mutex type to use. Only required if TisYielding is true.
-                        std::condition_variable,    // The condition variable type to use. Only required if TisYielding is true.
-                        false>;                     // If the threads should yield.
-
-                public:
-                    //-----------------------------------------------------------------------------
-                    QueueCpuNonBlockingImpl(
-                        dev::DevCpu const & dev) :
-                            m_dev(dev),
-                            m_workerThread(1u)
-                    {}
-                    //-----------------------------------------------------------------------------
-                    QueueCpuNonBlockingImpl(QueueCpuNonBlockingImpl const &) = delete;
-                    //-----------------------------------------------------------------------------
-                    QueueCpuNonBlockingImpl(QueueCpuNonBlockingImpl &&) = delete;
-                    //-----------------------------------------------------------------------------
-                    auto operator=(QueueCpuNonBlockingImpl const &) -> QueueCpuNonBlockingImpl & = delete;
-                    //-----------------------------------------------------------------------------
-                    auto operator=(QueueCpuNonBlockingImpl &&) -> QueueCpuNonBlockingImpl & = delete;
-
-                    //-----------------------------------------------------------------------------
-                    void enqueue(event::EventCpu & ev) final
-                    {
-                        queue::enqueue(*this, ev);
-                    }
-
-                    //-----------------------------------------------------------------------------
-                    void wait(event::EventCpu const & ev) final
-                    {
-                        wait::wait(*this, ev);
-                    }
-
-                public:
-                    dev::DevCpu const m_dev;            //!< The device this queue is bound to.
-
-                    ThreadPool m_workerThread;
-                };
-            }
-        }
-
-        //#############################################################################
-        //! The CPU device queue.
-        class QueueCpuNonBlocking final : public concepts::Implements<wait::ConceptCurrentThreadWaitFor, QueueCpuNonBlocking>
-        {
-        public:
-            //-----------------------------------------------------------------------------
-            QueueCpuNonBlocking(
-                dev::DevCpu const & dev) :
-                    m_spQueueImpl(std::make_shared<cpu::detail::QueueCpuNonBlockingImpl>(dev))
-            {
-                dev.m_spDevCpuImpl->RegisterQueue(m_spQueueImpl);
-            }
-            //-----------------------------------------------------------------------------
-            QueueCpuNonBlocking(QueueCpuNonBlocking const &) = default;
-            //-----------------------------------------------------------------------------
-            QueueCpuNonBlocking(QueueCpuNonBlocking &&) = default;
-            //-----------------------------------------------------------------------------
-            auto operator=(QueueCpuNonBlocking const &) -> QueueCpuNonBlocking & = default;
-            //-----------------------------------------------------------------------------
-            auto operator=(QueueCpuNonBlocking &&) -> QueueCpuNonBlocking & = default;
-            //-----------------------------------------------------------------------------
-            auto operator==(QueueCpuNonBlocking const & rhs) const
-            -> bool
-            {
-                return (m_spQueueImpl == rhs.m_spQueueImpl);
-            }
-            //-----------------------------------------------------------------------------
-            auto operator!=(QueueCpuNonBlocking const & rhs) const
-            -> bool
-            {
-                return !((*this) == rhs);
-            }
-            //-----------------------------------------------------------------------------
-            ~QueueCpuNonBlocking() = default;
-
-        public:
-            std::shared_ptr<cpu::detail::QueueCpuNonBlockingImpl> m_spQueueImpl;
-        };
-    }
-
-    namespace dev
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CPU non-blocking device queue device type trait specialization.
-            template<>
-            struct DevType<
-                queue::QueueCpuNonBlocking>
-            {
-                using type = dev::DevCpu;
-            };
-            //#############################################################################
-            //! The CPU non-blocking device queue device get trait specialization.
-            template<>
-            struct GetDev<
-                queue::QueueCpuNonBlocking>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto getDev(
-                    queue::QueueCpuNonBlocking const & queue)
-                -> dev::DevCpu
-                {
-                    return queue.m_spQueueImpl->m_dev;
-                }
-            };
-        }
-    }
-    namespace event
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CPU non-blocking device queue event type trait specialization.
-            template<>
-            struct EventType<
-                queue::QueueCpuNonBlocking>
-            {
-                using type = event::EventCpu;
-            };
-        }
-    }
-    namespace queue
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CPU non-blocking device queue enqueue trait specialization.
-            //! This default implementation for all tasks directly invokes the function call operator of the task.
-            template<
-                typename TTask>
-            struct Enqueue<
-                queue::QueueCpuNonBlocking,
-                TTask>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto enqueue(
-#if !(BOOST_COMP_CLANG_CUDA && BOOST_ARCH_PTX)
-                    queue::QueueCpuNonBlocking & queue,
-                    TTask const & task)
-#else
-                    queue::QueueCpuNonBlocking &,
-                    TTask const &)
-#endif
-                -> void
-                {
-// Workaround: Clang can not support this when natively compiling device code. See ConcurrentExecPool.hpp.
-#if !(BOOST_COMP_CLANG_CUDA && BOOST_ARCH_PTX)
-                    queue.m_spQueueImpl->m_workerThread.enqueueTask(
-                        task);
-#endif
-                }
-            };
-            //#############################################################################
-            //! The CPU non-blocking device queue test trait specialization.
-            template<>
-            struct Empty<
-                queue::QueueCpuNonBlocking>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto empty(
-                    queue::QueueCpuNonBlocking const & queue)
-                -> bool
-                {
-                    return queue.m_spQueueImpl->m_workerThread.isIdle();
-                }
-            };
-        }
-    }
-}
-
-#include <alpaka/event/EventCpu.hpp>
diff --git a/thirdParty/alpaka/include/alpaka/queue/QueueCudaRtBlocking.hpp b/thirdParty/alpaka/include/alpaka/queue/QueueCudaRtBlocking.hpp
deleted file mode 100644
index bd9640bc40..0000000000
--- a/thirdParty/alpaka/include/alpaka/queue/QueueCudaRtBlocking.hpp
+++ /dev/null
@@ -1,345 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz, René Widera
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_CUDA
-    #error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
-#endif
-
-#include <alpaka/dev/DevCudaRt.hpp>
-
-#include <alpaka/dev/Traits.hpp>
-#include <alpaka/event/Traits.hpp>
-#include <alpaka/queue/Traits.hpp>
-#include <alpaka/wait/Traits.hpp>
-
-#include <alpaka/core/Cuda.hpp>
-
-#include <stdexcept>
-#include <memory>
-#include <functional>
-#include <mutex>
-#include <condition_variable>
-#include <thread>
-
-namespace alpaka
-{
-    namespace event
-    {
-        class EventCudaRt;
-    }
-}
-
-namespace alpaka
-{
-    namespace queue
-    {
-        namespace cuda
-        {
-            namespace detail
-            {
-                //#############################################################################
-                //! The CUDA RT blocking queue implementation.
-                class QueueCudaRtBlockingImpl final
-                {
-                public:
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST QueueCudaRtBlockingImpl(
-                        dev::DevCudaRt const & dev) :
-                            m_dev(dev),
-                            m_CudaQueue()
-                    {
-                        ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                        // Set the current device.
-                        ALPAKA_CUDA_RT_CHECK(
-                            cudaSetDevice(
-                                m_dev.m_iDevice));
-                        // - cudaStreamDefault: Default queue creation flag.
-                        // - cudaStreamNonBlocking: Specifies that work running in the created queue may run concurrently with work in queue 0 (the NULL queue),
-                        //   and that the created queue should perform no implicit synchronization with queue 0.
-                        // Create the queue on the current device.
-                        // NOTE: cudaStreamNonBlocking is required to match the semantic implemented in the alpaka CPU queue.
-                        // It would be too much work to implement implicit default queue synchronization on CPU.
-                        ALPAKA_CUDA_RT_CHECK(
-                            cudaStreamCreateWithFlags(
-                                &m_CudaQueue,
-                                cudaStreamNonBlocking));
-                    }
-                    //-----------------------------------------------------------------------------
-                    QueueCudaRtBlockingImpl(QueueCudaRtBlockingImpl const &) = delete;
-                    //-----------------------------------------------------------------------------
-                    QueueCudaRtBlockingImpl(QueueCudaRtBlockingImpl &&) = default;
-                    //-----------------------------------------------------------------------------
-                    auto operator=(QueueCudaRtBlockingImpl const &) -> QueueCudaRtBlockingImpl & = delete;
-                    //-----------------------------------------------------------------------------
-                    auto operator=(QueueCudaRtBlockingImpl &&) -> QueueCudaRtBlockingImpl & = delete;
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST ~QueueCudaRtBlockingImpl()
-                    {
-                        ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                        // Set the current device. \TODO: Is setting the current device before cudaStreamDestroy required?
-                        ALPAKA_CUDA_RT_CHECK(
-                            cudaSetDevice(
-                                m_dev.m_iDevice));
-                        // In case the device is still doing work in the queue when cudaStreamDestroy() is called, the function will return immediately
-                        // and the resources associated with queue will be released automatically once the device has completed all work in queue.
-                        // -> No need to synchronize here.
-                        ALPAKA_CUDA_RT_CHECK(
-                            cudaStreamDestroy(
-                                m_CudaQueue));
-                    }
-
-                public:
-                    dev::DevCudaRt const m_dev;   //!< The device this queue is bound to.
-                    cudaStream_t m_CudaQueue;
-                };
-            }
-        }
-
-        //#############################################################################
-        //! The CUDA RT blocking queue.
-        class QueueCudaRtBlocking final : public concepts::Implements<wait::ConceptCurrentThreadWaitFor, QueueCudaRtBlocking>
-        {
-        public:
-            //-----------------------------------------------------------------------------
-            ALPAKA_FN_HOST QueueCudaRtBlocking(
-                dev::DevCudaRt const & dev) :
-                m_spQueueImpl(std::make_shared<cuda::detail::QueueCudaRtBlockingImpl>(dev))
-            {}
-            //-----------------------------------------------------------------------------
-            QueueCudaRtBlocking(QueueCudaRtBlocking const &) = default;
-            //-----------------------------------------------------------------------------
-            QueueCudaRtBlocking(QueueCudaRtBlocking &&) = default;
-            //-----------------------------------------------------------------------------
-            auto operator=(QueueCudaRtBlocking const &) -> QueueCudaRtBlocking & = default;
-            //-----------------------------------------------------------------------------
-            auto operator=(QueueCudaRtBlocking &&) -> QueueCudaRtBlocking & = default;
-            //-----------------------------------------------------------------------------
-            ALPAKA_FN_HOST auto operator==(QueueCudaRtBlocking const & rhs) const
-            -> bool
-            {
-                return (m_spQueueImpl == rhs.m_spQueueImpl);
-            }
-            //-----------------------------------------------------------------------------
-            ALPAKA_FN_HOST auto operator!=(QueueCudaRtBlocking const & rhs) const
-            -> bool
-            {
-                return !((*this) == rhs);
-            }
-            //-----------------------------------------------------------------------------
-            ~QueueCudaRtBlocking() = default;
-
-        public:
-            std::shared_ptr<cuda::detail::QueueCudaRtBlockingImpl> m_spQueueImpl;
-        };
-    }
-
-    namespace dev
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CUDA RT blocking queue device type trait specialization.
-            template<>
-            struct DevType<
-                queue::QueueCudaRtBlocking>
-            {
-                using type = dev::DevCudaRt;
-            };
-            //#############################################################################
-            //! The CUDA RT blocking queue device get trait specialization.
-            template<>
-            struct GetDev<
-                queue::QueueCudaRtBlocking>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto getDev(
-                    queue::QueueCudaRtBlocking const & queue)
-                -> dev::DevCudaRt
-                {
-                    return queue.m_spQueueImpl->m_dev;
-                }
-            };
-        }
-    }
-    namespace event
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CUDA RT blocking queue event type trait specialization.
-            template<>
-            struct EventType<
-                queue::QueueCudaRtBlocking>
-            {
-                using type = event::EventCudaRt;
-            };
-        }
-    }
-    namespace queue
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CUDA RT blocking queue enqueue trait specialization.
-            template<
-                typename TTask>
-            struct Enqueue<
-                queue::QueueCudaRtBlocking,
-                TTask>
-            {
-                //#############################################################################
-                enum class CallbackState
-                {
-                    enqueued,
-                    notified,
-                    finished,
-                };
-
-                //#############################################################################
-                struct CallbackSynchronizationData : public std::enable_shared_from_this<CallbackSynchronizationData>
-                {
-                    std::mutex m_mutex;
-                    std::condition_variable m_event;
-                    CallbackState state = CallbackState::enqueued;
-                };
-
-                //-----------------------------------------------------------------------------
-                static void CUDART_CB cudaRtCallback(cudaStream_t /*queue*/, cudaError_t /*status*/, void *arg)
-                {
-                    // explicitly copy the shared_ptr so that this method holds the state even when the executing thread has already finished.
-                    const auto pCallbackSynchronizationData = reinterpret_cast<CallbackSynchronizationData*>(arg)->shared_from_this();
-
-                    // Notify the executing thread.
-                    {
-                        std::unique_lock<std::mutex> lock(pCallbackSynchronizationData->m_mutex);
-                        pCallbackSynchronizationData->state = CallbackState::notified;
-                    }
-                    pCallbackSynchronizationData->m_event.notify_one();
-
-                    // Wait for the executing thread to finish the task if it has not already finished.
-                    std::unique_lock<std::mutex> lock(pCallbackSynchronizationData->m_mutex);
-                    if(pCallbackSynchronizationData->state != CallbackState::finished)
-                    {
-                        pCallbackSynchronizationData->m_event.wait(
-                            lock,
-                            [pCallbackSynchronizationData](){
-                                return pCallbackSynchronizationData->state == CallbackState::finished;
-                            }
-                        );
-                    }
-                }
-
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto enqueue(
-                    queue::QueueCudaRtBlocking & queue,
-                    TTask const & task)
-                -> void
-                {
-                    auto pCallbackSynchronizationData = std::make_shared<CallbackSynchronizationData>();
-
-                    ALPAKA_CUDA_RT_CHECK(cudaStreamAddCallback(
-                        queue.m_spQueueImpl->m_CudaQueue,
-                        cudaRtCallback,
-                        pCallbackSynchronizationData.get(),
-                        0u));
-
-                    // We start a new std::thread which stores the task to be executed.
-                    // This circumvents the limitation that it is not possible to call CUDA methods within the CUDA callback thread.
-                    // The CUDA thread signals the std::thread when it is ready to execute the task.
-                    // The CUDA thread is waiting for the std::thread to signal that it is finished executing the task
-                    // before it executes the next task in the queue (CUDA stream).
-                    std::thread t(
-                        [pCallbackSynchronizationData, task](){
-
-                            // If the callback has not yet been called, we wait for it.
-                            {
-                                std::unique_lock<std::mutex> lock(pCallbackSynchronizationData->m_mutex);
-                                if(pCallbackSynchronizationData->state != CallbackState::notified)
-                                {
-                                    pCallbackSynchronizationData->m_event.wait(
-                                        lock,
-                                        [pCallbackSynchronizationData](){
-                                            return pCallbackSynchronizationData->state == CallbackState::notified;
-                                        }
-                                    );
-                                }
-
-                                task();
-
-                                // Notify the waiting CUDA thread.
-                                pCallbackSynchronizationData->state = CallbackState::finished;
-                            }
-                            pCallbackSynchronizationData->m_event.notify_one();
-                        }
-                    );
-
-                    t.join();
-                }
-            };
-            //#############################################################################
-            //! The CUDA RT blocking queue test trait specialization.
-            template<>
-            struct Empty<
-                queue::QueueCudaRtBlocking>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto empty(
-                    queue::QueueCudaRtBlocking const & queue)
-                -> bool
-                {
-                    ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                    // Query is allowed even for queues on non current device.
-                    cudaError_t ret = cudaSuccess;
-                    ALPAKA_CUDA_RT_CHECK_IGNORE(
-                        ret = cudaStreamQuery(
-                            queue.m_spQueueImpl->m_CudaQueue),
-                        cudaErrorNotReady);
-                    return (ret == cudaSuccess);
-                }
-            };
-        }
-    }
-    namespace wait
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CUDA RT blocking queue thread wait trait specialization.
-            //!
-            //! Blocks execution of the calling thread until the queue has finished processing all previously requested tasks (kernels, data copies, ...)
-            template<>
-            struct CurrentThreadWaitFor<
-                queue::QueueCudaRtBlocking>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto currentThreadWaitFor(
-                    queue::QueueCudaRtBlocking const & queue)
-                -> void
-                {
-                    ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                    // Sync is allowed even for queues on non current device.
-                    ALPAKA_CUDA_RT_CHECK(cudaStreamSynchronize(
-                        queue.m_spQueueImpl->m_CudaQueue));
-                }
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/alpaka/include/alpaka/queue/QueueCudaRtNonBlocking.hpp b/thirdParty/alpaka/include/alpaka/queue/QueueCudaRtNonBlocking.hpp
deleted file mode 100644
index d89f860131..0000000000
--- a/thirdParty/alpaka/include/alpaka/queue/QueueCudaRtNonBlocking.hpp
+++ /dev/null
@@ -1,346 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz, René Widera
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_CUDA
-    #error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
-#endif
-
-#include <alpaka/dev/DevCudaRt.hpp>
-
-#include <alpaka/dev/Traits.hpp>
-#include <alpaka/event/Traits.hpp>
-#include <alpaka/queue/Traits.hpp>
-#include <alpaka/wait/Traits.hpp>
-
-#include <alpaka/core/Cuda.hpp>
-
-#include <stdexcept>
-#include <memory>
-#include <functional>
-#include <mutex>
-#include <condition_variable>
-#include <thread>
-
-namespace alpaka
-{
-    namespace event
-    {
-        class EventCudaRt;
-    }
-}
-
-namespace alpaka
-{
-    namespace queue
-    {
-        namespace cuda
-        {
-            namespace detail
-            {
-                //#############################################################################
-                //! The CUDA RT non-blocking queue implementation.
-                class QueueCudaRtNonBlockingImpl final
-                {
-                public:
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST QueueCudaRtNonBlockingImpl(
-                        dev::DevCudaRt const & dev) :
-                            m_dev(dev),
-                            m_CudaQueue()
-                    {
-                        ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                        // Set the current device.
-                        ALPAKA_CUDA_RT_CHECK(
-                            cudaSetDevice(
-                                m_dev.m_iDevice));
-                        // - cudaStreamDefault: Default queue creation flag.
-                        // - cudaStreamNonBlocking: Specifies that work running in the created queue may run concurrently with work in queue 0 (the NULL queue),
-                        //   and that the created queue should perform no implicit synchronization with queue 0.
-                        // Create the queue on the current device.
-                        // NOTE: cudaStreamNonBlocking is required to match the semantic implemented in the alpaka CPU queue.
-                        // It would be too much work to implement implicit default queue synchronization on CPU.
-                        ALPAKA_CUDA_RT_CHECK(
-                            cudaStreamCreateWithFlags(
-                                &m_CudaQueue,
-                                cudaStreamNonBlocking));
-                    }
-                    //-----------------------------------------------------------------------------
-                    QueueCudaRtNonBlockingImpl(QueueCudaRtNonBlockingImpl const &) = delete;
-                    //-----------------------------------------------------------------------------
-                    QueueCudaRtNonBlockingImpl(QueueCudaRtNonBlockingImpl &&) = default;
-                    //-----------------------------------------------------------------------------
-                    auto operator=(QueueCudaRtNonBlockingImpl const &) -> QueueCudaRtNonBlockingImpl & = delete;
-                    //-----------------------------------------------------------------------------
-                    auto operator=(QueueCudaRtNonBlockingImpl &&) -> QueueCudaRtNonBlockingImpl & = delete;
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST ~QueueCudaRtNonBlockingImpl()
-                    {
-                        ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                        // Set the current device. \TODO: Is setting the current device before cudaStreamDestroy required?
-                        ALPAKA_CUDA_RT_CHECK(
-                            cudaSetDevice(
-                                m_dev.m_iDevice));
-                        // In case the device is still doing work in the queue when cudaStreamDestroy() is called, the function will return immediately
-                        // and the resources associated with queue will be released automatically once the device has completed all work in queue.
-                        // -> No need to synchronize here.
-                        ALPAKA_CUDA_RT_CHECK(
-                            cudaStreamDestroy(
-                                m_CudaQueue));
-                    }
-
-                public:
-                    dev::DevCudaRt const m_dev;   //!< The device this queue is bound to.
-                    cudaStream_t m_CudaQueue;
-                };
-            }
-        }
-
-        //#############################################################################
-        //! The CUDA RT non-blocking queue.
-        class QueueCudaRtNonBlocking final : public concepts::Implements<wait::ConceptCurrentThreadWaitFor, QueueCudaRtNonBlocking>
-        {
-        public:
-            //-----------------------------------------------------------------------------
-            ALPAKA_FN_HOST QueueCudaRtNonBlocking(
-                dev::DevCudaRt const & dev) :
-                m_spQueueImpl(std::make_shared<cuda::detail::QueueCudaRtNonBlockingImpl>(dev))
-            {}
-            //-----------------------------------------------------------------------------
-            QueueCudaRtNonBlocking(QueueCudaRtNonBlocking const &) = default;
-            //-----------------------------------------------------------------------------
-            QueueCudaRtNonBlocking(QueueCudaRtNonBlocking &&) = default;
-            //-----------------------------------------------------------------------------
-            auto operator=(QueueCudaRtNonBlocking const &) -> QueueCudaRtNonBlocking & = default;
-            //-----------------------------------------------------------------------------
-            auto operator=(QueueCudaRtNonBlocking &&) -> QueueCudaRtNonBlocking & = default;
-            //-----------------------------------------------------------------------------
-            ALPAKA_FN_HOST auto operator==(QueueCudaRtNonBlocking const & rhs) const
-            -> bool
-            {
-                return (m_spQueueImpl == rhs.m_spQueueImpl);
-            }
-            //-----------------------------------------------------------------------------
-            ALPAKA_FN_HOST auto operator!=(QueueCudaRtNonBlocking const & rhs) const
-            -> bool
-            {
-                return !((*this) == rhs);
-            }
-            //-----------------------------------------------------------------------------
-            ~QueueCudaRtNonBlocking() = default;
-
-        public:
-            std::shared_ptr<cuda::detail::QueueCudaRtNonBlockingImpl> m_spQueueImpl;
-        };
-    }
-
-    namespace dev
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CUDA RT non-blocking queue device type trait specialization.
-            template<>
-            struct DevType<
-                queue::QueueCudaRtNonBlocking>
-            {
-                using type = dev::DevCudaRt;
-            };
-            //#############################################################################
-            //! The CUDA RT non-blocking queue device get trait specialization.
-            template<>
-            struct GetDev<
-                queue::QueueCudaRtNonBlocking>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto getDev(
-                    queue::QueueCudaRtNonBlocking const & queue)
-                -> dev::DevCudaRt
-                {
-                    return queue.m_spQueueImpl->m_dev;
-                }
-            };
-        }
-    }
-    namespace event
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CUDA RT non-blocking queue event type trait specialization.
-            template<>
-            struct EventType<
-                queue::QueueCudaRtNonBlocking>
-            {
-                using type = event::EventCudaRt;
-            };
-        }
-    }
-    namespace queue
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CUDA RT sync queue enqueue trait specialization.
-            template<
-                typename TTask>
-            struct Enqueue<
-                queue::QueueCudaRtNonBlocking,
-                TTask>
-            {
-                //#############################################################################
-                enum class CallbackState
-                {
-                    enqueued,
-                    notified,
-                    finished,
-                };
-
-                //#############################################################################
-                struct CallbackSynchronizationData : public std::enable_shared_from_this<CallbackSynchronizationData>
-                {
-                    std::mutex m_mutex;
-                    std::condition_variable m_event;
-                    CallbackState state = CallbackState::enqueued;
-                };
-
-                //-----------------------------------------------------------------------------
-                static void CUDART_CB cudaRtCallback(cudaStream_t /*queue*/, cudaError_t /*status*/, void *arg)
-                {
-                    // explicitly copy the shared_ptr so that this method holds the state even when the executing thread has already finished.
-                    const auto pCallbackSynchronizationData = reinterpret_cast<CallbackSynchronizationData*>(arg)->shared_from_this();
-
-                    // Notify the executing thread.
-                    {
-                        std::unique_lock<std::mutex> lock(pCallbackSynchronizationData->m_mutex);
-                        pCallbackSynchronizationData->state = CallbackState::notified;
-                    }
-                    pCallbackSynchronizationData->m_event.notify_one();
-
-                    // Wait for the executing thread to finish the task if it has not already finished.
-                    std::unique_lock<std::mutex> lock(pCallbackSynchronizationData->m_mutex);
-                    if(pCallbackSynchronizationData->state != CallbackState::finished)
-                    {
-                        pCallbackSynchronizationData->m_event.wait(
-                            lock,
-                            [pCallbackSynchronizationData](){
-                                return pCallbackSynchronizationData->state == CallbackState::finished;
-                            }
-                        );
-                    }
-                }
-
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto enqueue(
-                    queue::QueueCudaRtNonBlocking & queue,
-                    TTask const & task)
-                -> void
-                {
-                    auto pCallbackSynchronizationData = std::make_shared<CallbackSynchronizationData>();
-
-                    ALPAKA_CUDA_RT_CHECK(cudaStreamAddCallback(
-                        queue.m_spQueueImpl->m_CudaQueue,
-                        cudaRtCallback,
-                        pCallbackSynchronizationData.get(),
-                        0u));
-
-                    // We start a new std::thread which stores the task to be executed.
-                    // This circumvents the limitation that it is not possible to call CUDA methods within the CUDA callback thread.
-                    // The CUDA thread signals the std::thread when it is ready to execute the task.
-                    // The CUDA thread is waiting for the std::thread to signal that it is finished executing the task
-                    // before it executes the next task in the queue (CUDA stream).
-                    std::thread t(
-                        [pCallbackSynchronizationData, task](){
-
-                            // If the callback has not yet been called, we wait for it.
-                            {
-                                std::unique_lock<std::mutex> lock(pCallbackSynchronizationData->m_mutex);
-                                if(pCallbackSynchronizationData->state != CallbackState::notified)
-                                {
-                                    pCallbackSynchronizationData->m_event.wait(
-                                        lock,
-                                        [pCallbackSynchronizationData](){
-                                            return pCallbackSynchronizationData->state == CallbackState::notified;
-                                        }
-                                    );
-                                }
-
-                                task();
-
-                                // Notify the waiting CUDA thread.
-                                pCallbackSynchronizationData->state = CallbackState::finished;
-                            }
-                            pCallbackSynchronizationData->m_event.notify_one();
-                        }
-                    );
-
-                    t.detach();
-                }
-            };
-            //#############################################################################
-            //! The CUDA RT non-blocking queue test trait specialization.
-            template<>
-            struct Empty<
-                queue::QueueCudaRtNonBlocking>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto empty(
-                    queue::QueueCudaRtNonBlocking const & queue)
-                -> bool
-                {
-                    ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                    // Query is allowed even for queues on non current device.
-                    cudaError_t ret = cudaSuccess;
-                    ALPAKA_CUDA_RT_CHECK_IGNORE(
-                        ret = cudaStreamQuery(
-                            queue.m_spQueueImpl->m_CudaQueue),
-                        cudaErrorNotReady);
-                    return (ret == cudaSuccess);
-                }
-            };
-        }
-    }
-    namespace wait
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CUDA RT non-blocking queue thread wait trait specialization.
-            //!
-            //! Blocks execution of the calling thread until the queue has finished processing all previously requested tasks (kernels, data copies, ...)
-            template<>
-            struct CurrentThreadWaitFor<
-                queue::QueueCudaRtNonBlocking>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto currentThreadWaitFor(
-                    queue::QueueCudaRtNonBlocking const & queue)
-                -> void
-                {
-                    ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                    // Sync is allowed even for queues on non current device.
-                    ALPAKA_CUDA_RT_CHECK(
-                        cudaStreamSynchronize(
-                            queue.m_spQueueImpl->m_CudaQueue));
-                }
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/alpaka/include/alpaka/queue/QueueHipRtBlocking.hpp b/thirdParty/alpaka/include/alpaka/queue/QueueHipRtBlocking.hpp
deleted file mode 100644
index cab8b7db33..0000000000
--- a/thirdParty/alpaka/include/alpaka/queue/QueueHipRtBlocking.hpp
+++ /dev/null
@@ -1,385 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz, Matthias Werner, René Widera
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_HIP
-    #error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
-#endif
-
-#include <alpaka/dev/DevHipRt.hpp>
-
-#include <alpaka/dev/Traits.hpp>
-#include <alpaka/event/Traits.hpp>
-#include <alpaka/queue/Traits.hpp>
-#include <alpaka/wait/Traits.hpp>
-
-#include <alpaka/core/Hip.hpp>
-
-#include <stdexcept>
-#include <memory>
-#include <functional>
-#include <mutex>
-#include <condition_variable>
-#include <thread>
-
-namespace alpaka
-{
-    namespace event
-    {
-        class EventHipRt;
-    }
-}
-
-namespace alpaka
-{
-    namespace queue
-    {
-        namespace hip
-        {
-            namespace detail
-            {
-                //#############################################################################
-                //! The HIP RT blocking queue implementation.
-                class QueueHipRtBlockingImpl final
-                {
-                public:
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST QueueHipRtBlockingImpl(
-                        dev::DevHipRt const & dev) :
-                            m_dev(dev),
-                            m_HipQueue()
-                    {
-                        ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                        // Set the current device.
-                        ALPAKA_HIP_RT_CHECK(
-                            hipSetDevice(
-                                m_dev.m_iDevice));
-                        // - hipStreamDefault: Default queue creation flag.
-                        // - hipStreamNonBlocking: Specifies that work running in the created queue may run concurrently with work in queue 0 (the NULL queue),
-                        //   and that the created queue should perform no implicit synchronization with queue 0.
-                        // Create the queue on the current device.
-                        // NOTE: hipStreamNonBlocking is required to match the semantic implemented in the alpaka CPU queue.
-                        // It would be too much work to implement implicit default queue synchronization on CPU.
-                        ALPAKA_HIP_RT_CHECK(
-                            hipStreamCreateWithFlags(
-                                &m_HipQueue,
-                                hipStreamNonBlocking));
-                    }
-                    //-----------------------------------------------------------------------------
-                    QueueHipRtBlockingImpl(QueueHipRtBlockingImpl const &) = delete;
-                    //-----------------------------------------------------------------------------
-                    QueueHipRtBlockingImpl(QueueHipRtBlockingImpl &&) = delete;
-                    //-----------------------------------------------------------------------------
-                    auto operator=(QueueHipRtBlockingImpl const &) -> QueueHipRtBlockingImpl & = delete;
-                    //-----------------------------------------------------------------------------
-                    auto operator=(QueueHipRtBlockingImpl &&) -> QueueHipRtBlockingImpl & = delete;
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST ~QueueHipRtBlockingImpl()
-                    {
-                        ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                        // Set the current device. \TODO: Is setting the current device before hipStreamDestroy required?
-                        ALPAKA_HIP_RT_CHECK(
-                            hipSetDevice(
-                                m_dev.m_iDevice));
-                        // In case the device is still doing work in the queue when hipStreamDestroy() is called, the function will return immediately
-                        // and the resources associated with queue will be released automatically once the device has completed all work in queue.
-                        // -> No need to synchronize here.
-                        ALPAKA_HIP_RT_CHECK(
-                            hipStreamDestroy(
-                                m_HipQueue));
-                    }
-
-                public:
-                    dev::DevHipRt const m_dev;   //!< The device this queue is bound to.
-                    hipStream_t m_HipQueue;
-
-#if BOOST_COMP_HCC  // NOTE: workaround for unwanted nonblocking hip streams for HCC (NVCC streams are blocking)
-                    int m_callees = 0;
-                    std::mutex m_mutex;
-#endif
-                };
-            } // detail
-        } // hip
-
-        //#############################################################################
-        //! The HIP RT blocking queue.
-        class QueueHipRtBlocking final : public concepts::Implements<wait::ConceptCurrentThreadWaitFor, QueueHipRtBlocking>
-        {
-        public:
-            //-----------------------------------------------------------------------------
-            ALPAKA_FN_HOST QueueHipRtBlocking(
-                dev::DevHipRt const & dev) :
-                m_spQueueImpl(std::make_shared<hip::detail::QueueHipRtBlockingImpl>(dev))
-            {}
-            //-----------------------------------------------------------------------------
-            QueueHipRtBlocking(QueueHipRtBlocking const &) = default;
-            //-----------------------------------------------------------------------------
-            QueueHipRtBlocking(QueueHipRtBlocking &&) = default;
-            //-----------------------------------------------------------------------------
-            auto operator=(QueueHipRtBlocking const &) -> QueueHipRtBlocking & = default;
-            //-----------------------------------------------------------------------------
-            auto operator=(QueueHipRtBlocking &&) -> QueueHipRtBlocking & = default;
-            //-----------------------------------------------------------------------------
-            ALPAKA_FN_HOST auto operator==(QueueHipRtBlocking const & rhs) const
-            -> bool
-            {
-                return (m_spQueueImpl == rhs.m_spQueueImpl);
-            }
-            //-----------------------------------------------------------------------------
-            ALPAKA_FN_HOST auto operator!=(QueueHipRtBlocking const & rhs) const
-            -> bool
-            {
-                return !((*this) == rhs);
-            }
-            //-----------------------------------------------------------------------------
-            // NOTE: for HCC streams workaround: no need to sync with spawned tasks as this queue is already syncing in enqueue
-            ALPAKA_FN_HOST ~QueueHipRtBlocking() = default;
-
-        public:
-            std::shared_ptr<hip::detail::QueueHipRtBlockingImpl> m_spQueueImpl;
-        };
-    }
-
-    namespace dev
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The HIP RT blocking queue device type trait specialization.
-            template<>
-            struct DevType<
-                queue::QueueHipRtBlocking>
-            {
-                using type = dev::DevHipRt;
-            };
-            //#############################################################################
-            //! The HIP RT blocking queue device get trait specialization.
-            template<>
-            struct GetDev<
-                queue::QueueHipRtBlocking>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto getDev(
-                    queue::QueueHipRtBlocking const & queue)
-                -> dev::DevHipRt
-                {
-                    return queue.m_spQueueImpl->m_dev;
-                }
-            };
-        }
-    }
-    namespace event
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The HIP RT blocking queue event type trait specialization.
-            template<>
-            struct EventType<
-                queue::QueueHipRtBlocking>
-            {
-                using type = event::EventHipRt;
-            };
-        }
-    }
-    namespace queue
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The HIP RT blocking queue enqueue trait specialization.
-            template<
-                typename TTask>
-            struct Enqueue<
-                queue::QueueHipRtBlocking,
-                TTask>
-            {
-                //#############################################################################
-                enum class CallbackState
-                {
-                    enqueued,
-                    notified,
-                    finished,
-                };
-
-                //#############################################################################
-                struct CallbackSynchronizationData : public std::enable_shared_from_this<CallbackSynchronizationData>
-                {
-                    std::mutex m_mutex;
-                    std::condition_variable m_event;
-                    CallbackState state = CallbackState::enqueued;
-                };
-
-                //-----------------------------------------------------------------------------
-                static void HIPRT_CB hipRtCallback(hipStream_t /*queue*/, hipError_t /*status*/, void *arg)
-                {
-                    // explicitly copy the shared_ptr so that this method holds the state even when the executing thread has already finished.
-                    const auto pCallbackSynchronizationData = reinterpret_cast<CallbackSynchronizationData*>(arg)->shared_from_this();
-
-                    // Notify the executing thread.
-                    {
-                        std::unique_lock<std::mutex> lock(pCallbackSynchronizationData->m_mutex);
-                        pCallbackSynchronizationData->state = CallbackState::notified;
-                    }
-                    pCallbackSynchronizationData->m_event.notify_one();
-
-                    // Wait for the executing thread to finish the task if it has not already finished.
-                    std::unique_lock<std::mutex> lock(pCallbackSynchronizationData->m_mutex);
-                    if(pCallbackSynchronizationData->state != CallbackState::finished)
-                    {
-                        pCallbackSynchronizationData->m_event.wait(
-                            lock,
-                            [pCallbackSynchronizationData](){
-                                return pCallbackSynchronizationData->state == CallbackState::finished;
-                            }
-                        );
-                    }
-                }
-
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto enqueue(
-                    queue::QueueHipRtBlocking & queue,
-                    TTask const & task)
-                -> void
-                {
-#if BOOST_COMP_HCC  // NOTE: workaround for unwanted nonblocking hip streams for HCC (NVCC streams are blocking)
-                    {
-                        // thread-safe callee incrementing
-                        std::lock_guard<std::mutex> guard(queue.m_spQueueImpl->m_mutex);
-                        queue.m_spQueueImpl->m_callees += 1;
-                    }
-#endif
-
-                    auto pCallbackSynchronizationData = std::make_shared<CallbackSynchronizationData>();
-
-                    ALPAKA_HIP_RT_CHECK(hipStreamAddCallback(
-                        queue.m_spQueueImpl->m_HipQueue,
-                        hipRtCallback,
-                        pCallbackSynchronizationData.get(),
-                        0u));
-
-                    // We start a new std::thread which stores the task to be executed.
-                    // This circumvents the limitation that it is not possible to call HIP methods within the HIP callback thread.
-                    // The HIP thread signals the std::thread when it is ready to execute the task.
-                    // The HIP thread is waiting for the std::thread to signal that it is finished executing the task
-                    // before it executes the next task in the queue (HIP stream).
-                    std::thread t(
-                        [pCallbackSynchronizationData,
-                         task
-#if BOOST_COMP_HCC // NOTE: workaround for unwanted nonblocking hip streams for HCC (NVCC streams are blocking)
-                         ,&queue
-#endif
-                        ](){
-
-#if BOOST_COMP_HCC // NOTE: workaround for unwanted nonblocking hip streams for HCC (NVCC streams are blocking)
-                            // thread-safe task execution and callee decrementing
-                            std::lock_guard<std::mutex> guard(queue.m_spQueueImpl->m_mutex);
-#endif
-
-                            // If the callback has not yet been called, we wait for it.
-                            {
-                                std::unique_lock<std::mutex> lock(pCallbackSynchronizationData->m_mutex);
-                                if(pCallbackSynchronizationData->state != CallbackState::notified)
-                                {
-                                    pCallbackSynchronizationData->m_event.wait(
-                                        lock,
-                                        [pCallbackSynchronizationData](){
-                                            return pCallbackSynchronizationData->state == CallbackState::notified;
-                                        }
-                                    );
-                                }
-
-                                task();
-
-                                // Notify the waiting HIP thread.
-                                pCallbackSynchronizationData->state = CallbackState::finished;
-                            }
-                            pCallbackSynchronizationData->m_event.notify_one();
-
-#if BOOST_COMP_HCC  // NOTE: workaround for unwanted nonblocking hip streams for HCC (NVCC streams are blocking)
-                            queue.m_spQueueImpl->m_callees -= 1;
-#endif
-                        }
-                    );
-
-                    t.join();
-                }
-            };
-            //#############################################################################
-            //! The HIP RT blocking queue test trait specialization.
-            template<>
-            struct Empty<
-                queue::QueueHipRtBlocking>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto empty(
-                    queue::QueueHipRtBlocking const & queue)
-                -> bool
-                {
-                    ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                    // see: https://github.com/ROCm-Developer-Tools/HIP/blob/roc-1.9.x/tests/src/runtimeApi/stream/hipStreamWaitEvent.cpp
-
-#if BOOST_COMP_HCC  // NOTE: workaround for unwanted nonblocking hip streams for HCC (NVCC streams are blocking)
-                    return (queue.m_spQueueImpl->m_callees==0);
-#else
-                    // Query is allowed even for queues on non current device.
-                    hipError_t ret = hipSuccess;
-                    ALPAKA_HIP_RT_CHECK_IGNORE(
-                        ret = hipStreamQuery(
-                            queue.m_spQueueImpl->m_HipQueue),
-                        hipErrorNotReady);
-                    return (ret == hipSuccess);
-#endif
-                }
-            };
-        }
-    }
-    namespace wait
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The HIP RT blocking queue thread wait trait specialization.
-            //!
-            //! Blocks execution of the calling thread until the queue has finished processing all previously requested tasks (kernels, data copies, ...)
-            template<>
-            struct CurrentThreadWaitFor<
-                queue::QueueHipRtBlocking>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto currentThreadWaitFor(
-                    queue::QueueHipRtBlocking const & queue)
-                -> void
-                {
-                    ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-#if BOOST_COMP_HCC  // NOTE: workaround for unwanted nonblocking hip streams for HCC (NVCC streams are blocking)
-                    while(queue.m_spQueueImpl->m_callees>0) {
-                        std::this_thread::sleep_for(std::chrono::milliseconds(10u));
-                    }
-#else
-                    // Sync is allowed even for queues on non current device.
-                    ALPAKA_HIP_RT_CHECK( hipStreamSynchronize(
-                        queue.m_spQueueImpl->m_HipQueue));
-#endif
-                }
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/alpaka/include/alpaka/queue/QueueHipRtNonBlocking.hpp b/thirdParty/alpaka/include/alpaka/queue/QueueHipRtNonBlocking.hpp
deleted file mode 100644
index 6d0f25760c..0000000000
--- a/thirdParty/alpaka/include/alpaka/queue/QueueHipRtNonBlocking.hpp
+++ /dev/null
@@ -1,397 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz, Matthias Werner, René Widera
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_HIP
-    #error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
-#endif
-
-#include <alpaka/dev/DevHipRt.hpp>
-
-#include <alpaka/dev/Traits.hpp>
-#include <alpaka/event/Traits.hpp>
-#include <alpaka/queue/Traits.hpp>
-#include <alpaka/wait/Traits.hpp>
-#include <alpaka/meta/DependentFalseType.hpp>
-
-#include <alpaka/core/Hip.hpp>
-
-#include <stdexcept>
-#include <memory>
-#include <functional>
-#include <mutex>
-#include <condition_variable>
-#include <thread>
-
-namespace alpaka
-{
-    namespace event
-    {
-        class EventHipRt;
-    }
-}
-
-namespace alpaka
-{
-    namespace queue
-    {
-        namespace hip
-        {
-            namespace detail
-            {
-                //#############################################################################
-                //! The HIP RT non-blocking queue implementation.
-                class QueueHipRtNonBlockingImpl final
-                {
-                public:
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST QueueHipRtNonBlockingImpl(
-                        dev::DevHipRt const & dev) :
-                            m_dev(dev),
-                            m_HipQueue()
-                    {
-                        ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                        // Set the current device.
-                        ALPAKA_HIP_RT_CHECK(
-                            hipSetDevice(
-                                m_dev.m_iDevice));
-                        // - hipStreamDefault: Default queue creation flag.
-                        // - hipStreamNonBlocking: Specifies that work running in the created queue may run concurrently with work in queue 0 (the NULL queue),
-                        //   and that the created queue should perform no implicit synchronization with queue 0.
-                        // Create the queue on the current device.
-                        // NOTE: hipStreamNonBlocking is required to match the semantic implemented in the alpaka CPU queue.
-                        // It would be too much work to implement implicit default queue synchronization on CPU.
-                        ALPAKA_HIP_RT_CHECK(
-                            hipStreamCreateWithFlags(
-                                &m_HipQueue,
-                                hipStreamNonBlocking));
-                    }
-                    //-----------------------------------------------------------------------------
-                    QueueHipRtNonBlockingImpl(QueueHipRtNonBlockingImpl const &) = delete;
-                    //-----------------------------------------------------------------------------
-                    QueueHipRtNonBlockingImpl(QueueHipRtNonBlockingImpl &&) = delete;
-                    //-----------------------------------------------------------------------------
-                    auto operator=(QueueHipRtNonBlockingImpl const &) -> QueueHipRtNonBlockingImpl & = delete;
-                    //-----------------------------------------------------------------------------
-                    auto operator=(QueueHipRtNonBlockingImpl &&) -> QueueHipRtNonBlockingImpl & = delete;
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST ~QueueHipRtNonBlockingImpl()
-                    {
-                        ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                        // Set the current device.
-                        ALPAKA_HIP_RT_CHECK(
-                            hipSetDevice(
-                                m_dev.m_iDevice));
-                        // In case the device is still doing work in the queue when hipStreamDestroy() is called, the function will return immediately
-                        // and the resources associated with queue will be released automatically once the device has completed all work in queue.
-                        // -> No need to synchronize here.
-                        ALPAKA_HIP_RT_CHECK(
-                            hipStreamDestroy(
-                                m_HipQueue));
-                    }
-
-                public:
-                    dev::DevHipRt const m_dev;   //!< The device this queue is bound to.
-                    hipStream_t m_HipQueue;
-
-#if BOOST_COMP_HCC  // NOTE: workaround for unwanted nonblocking hip streams for HCC (NVCC streams are blocking)
-                    int m_callees = 0;
-                    std::mutex m_mutex;
-#endif
-                };
-            }
-        }
-
-        //#############################################################################
-        //! The HIP RT non-blocking queue.
-        class QueueHipRtNonBlocking final : public concepts::Implements<wait::ConceptCurrentThreadWaitFor, QueueHipRtNonBlocking>
-        {
-        public:
-            //-----------------------------------------------------------------------------
-            ALPAKA_FN_HOST QueueHipRtNonBlocking(
-                dev::DevHipRt const & dev) :
-                m_spQueueImpl(std::make_shared<hip::detail::QueueHipRtNonBlockingImpl>(dev))
-            {}
-            //-----------------------------------------------------------------------------
-            QueueHipRtNonBlocking(QueueHipRtNonBlocking const &) = default;
-            //-----------------------------------------------------------------------------
-            QueueHipRtNonBlocking(QueueHipRtNonBlocking &&) = default;
-            //-----------------------------------------------------------------------------
-            auto operator=(QueueHipRtNonBlocking const &) -> QueueHipRtNonBlocking & = default;
-            //-----------------------------------------------------------------------------
-            auto operator=(QueueHipRtNonBlocking &&) -> QueueHipRtNonBlocking & = default;
-            //-----------------------------------------------------------------------------
-            ALPAKA_FN_HOST auto operator==(QueueHipRtNonBlocking const & rhs) const
-            -> bool
-            {
-                return (m_spQueueImpl == rhs.m_spQueueImpl);
-            }
-            //-----------------------------------------------------------------------------
-            ALPAKA_FN_HOST auto operator!=(QueueHipRtNonBlocking const & rhs) const
-            -> bool
-            {
-                return !((*this) == rhs);
-            }
-            //-----------------------------------------------------------------------------
-            ALPAKA_FN_HOST ~QueueHipRtNonBlocking() {
-#if BOOST_COMP_HCC  // NOTE: workaround for unwanted nonblocking hip streams for HCC (NVCC streams are blocking)
-                // we are a non-blocking queue, so we have to wait here with its destruction until all spawned tasks have been processed
-                alpaka::wait::wait(*this);
-#endif
-            }
-
-        public:
-            std::shared_ptr<hip::detail::QueueHipRtNonBlockingImpl> m_spQueueImpl;
-        };
-    }
-
-    namespace dev
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The HIP RT non-blocking queue device type trait specialization.
-            template<>
-            struct DevType<
-                queue::QueueHipRtNonBlocking>
-            {
-                using type = dev::DevHipRt;
-            };
-            //#############################################################################
-            //! The HIP RT non-blocking queue device get trait specialization.
-            template<>
-            struct GetDev<
-                queue::QueueHipRtNonBlocking>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto getDev(
-                    queue::QueueHipRtNonBlocking const & queue)
-                -> dev::DevHipRt
-                {
-                    return queue.m_spQueueImpl->m_dev;
-                }
-            };
-        }
-    }
-    namespace event
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The HIP RT non-blocking queue event type trait specialization.
-            template<>
-            struct EventType<
-                queue::QueueHipRtNonBlocking>
-            {
-                using type = event::EventHipRt;
-            };
-        }
-    }
-    namespace queue
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The HIP RT blocking queue enqueue trait specialization.
-            template<
-                typename TTask>
-            struct Enqueue<
-                queue::QueueHipRtNonBlocking,
-                TTask>
-            {
-                //#############################################################################
-                enum class CallbackState
-                {
-                    enqueued,
-                    notified,
-                    finished,
-                };
-
-                //#############################################################################
-                struct CallbackSynchronizationData : public std::enable_shared_from_this<CallbackSynchronizationData>
-                {
-                    std::mutex m_mutex;
-                    std::condition_variable m_event;
-                    CallbackState state = CallbackState::enqueued;
-                };
-
-                //-----------------------------------------------------------------------------
-                static void HIPRT_CB hipRtCallback(hipStream_t /*queue*/, hipError_t /*status*/, void *arg)
-                {
-                    // explicitly copy the shared_ptr so that this method holds the state even when the executing thread has already finished.
-                    const auto pCallbackSynchronizationData = reinterpret_cast<CallbackSynchronizationData*>(arg)->shared_from_this();
-
-                    // Notify the executing thread.
-                    {
-                        std::unique_lock<std::mutex> lock(pCallbackSynchronizationData->m_mutex);
-                        pCallbackSynchronizationData->state = CallbackState::notified;
-                    }
-                    pCallbackSynchronizationData->m_event.notify_one();
-
-                    // Wait for the executing thread to finish the task if it has not already finished.
-                    std::unique_lock<std::mutex> lock(pCallbackSynchronizationData->m_mutex);
-                    if(pCallbackSynchronizationData->state != CallbackState::finished)
-                    {
-                        pCallbackSynchronizationData->m_event.wait(
-                            lock,
-                            [pCallbackSynchronizationData](){
-                                return pCallbackSynchronizationData->state == CallbackState::finished;
-                            }
-                        );
-                    }
-                }
-
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto enqueue(
-                    queue::QueueHipRtNonBlocking & queue,
-                    TTask const & task)
-                -> void
-                {
-#if BOOST_COMP_HIP
-                    // NOTE: hip callbacks are not blocking the stream.
-                    // The workaround used for HIP(hcc) would avoid the usage in a workflow with
-                    // many stream/event synchronizations (e.g. PIConGPU).
-                    // @todo remove this assert when hipStreamAddCallback is fixed
-                    static_assert(
-                                meta::DependentFalseType<TTask>::value,
-                                "Callbacks are not supported for HIP-clang");
-#endif
-
-#if BOOST_COMP_HCC  // NOTE: workaround for unwanted nonblocking hip streams for HCC (NVCC streams are blocking)
-                    {
-                        // thread-safe callee incrementing
-                        std::lock_guard<std::mutex> guard(queue.m_spQueueImpl->m_mutex);
-                        queue.m_spQueueImpl->m_callees += 1;
-                    }
-#endif
-                    auto pCallbackSynchronizationData = std::make_shared<CallbackSynchronizationData>();
-                    // test example: https://github.com/ROCm-Developer-Tools/HIP/blob/roc-1.9.x/tests/src/runtimeApi/stream/hipStreamAddCallback.cpp
-                    ALPAKA_HIP_RT_CHECK(hipStreamAddCallback(
-                        queue.m_spQueueImpl->m_HipQueue,
-                        hipRtCallback,
-                        pCallbackSynchronizationData.get(),
-                        0u));
-
-                    // We start a new std::thread which stores the task to be executed.
-                    // This circumvents the limitation that it is not possible to call HIP methods within the HIP callback thread.
-                    // The HIP thread signals the std::thread when it is ready to execute the task.
-                    // The HIP thread is waiting for the std::thread to signal that it is finished executing the task
-                    // before it executes the next task in the queue (HIP stream).
-                    std::thread t(
-                        [pCallbackSynchronizationData,
-                         task
-#if BOOST_COMP_HCC // NOTE: workaround for unwanted nonblocking hip streams for HCC (NVCC streams are blocking)
-                         ,&queue // requires queue's destructor to wait for all tasks
-#endif
-                        ](){
-
-#if BOOST_COMP_HCC // NOTE: workaround for unwanted nonblocking hip streams for HCC (NVCC streams are blocking)
-                            // thread-safe task execution and callee decrementing
-                            std::lock_guard<std::mutex> guard(queue.m_spQueueImpl->m_mutex);
-#endif
-
-                            // If the callback has not yet been called, we wait for it.
-                            {
-                                std::unique_lock<std::mutex> lock(pCallbackSynchronizationData->m_mutex);
-                                if(pCallbackSynchronizationData->state != CallbackState::notified)
-                                {
-                                    pCallbackSynchronizationData->m_event.wait(
-                                        lock,
-                                        [pCallbackSynchronizationData](){
-                                            return pCallbackSynchronizationData->state == CallbackState::notified;
-                                        }
-                                    );
-                                }
-
-                                task();
-
-                                // Notify the waiting HIP thread.
-                                pCallbackSynchronizationData->state = CallbackState::finished;
-                            }
-                            pCallbackSynchronizationData->m_event.notify_one();
-#if BOOST_COMP_HCC // NOTE: workaround for unwanted nonblocking hip streams for HCC (NVCC streams are blocking)
-                            queue.m_spQueueImpl->m_callees -= 1;
-#endif
-                        }
-                    );
-
-                    t.detach();
-                }
-            };
-            //#############################################################################
-            //! The HIP RT non-blocking queue test trait specialization.
-            template<>
-            struct Empty<
-                queue::QueueHipRtNonBlocking>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto empty(
-                    queue::QueueHipRtNonBlocking const & queue)
-                -> bool
-                {
-                    ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-#if BOOST_COMP_HCC  // NOTE: workaround for unwanted nonblocking hip streams for HCC (NVCC streams are blocking)
-                    return (queue.m_spQueueImpl->m_callees==0);
-#else
-
-                    // Query is allowed even for queues on non current device.
-                    hipError_t ret = hipSuccess;
-                    ALPAKA_HIP_RT_CHECK_IGNORE(
-                        ret = hipStreamQuery(
-                            queue.m_spQueueImpl->m_HipQueue),
-                        hipErrorNotReady);
-                    return (ret == hipSuccess);
-#endif
-                }
-            };
-        }
-    }
-    namespace wait
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The HIP RT non-blocking queue thread wait trait specialization.
-            //!
-            //! Blocks execution of the calling thread until the queue has finished processing all previously requested tasks (kernels, data copies, ...)
-            template<>
-            struct CurrentThreadWaitFor<
-                queue::QueueHipRtNonBlocking>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto currentThreadWaitFor(
-                    queue::QueueHipRtNonBlocking const & queue)
-                -> void
-                {
-                    ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-#if BOOST_COMP_HCC  // NOTE: workaround for unwanted nonblocking hip streams for HCC (NVCC streams are blocking)
-                    while(queue.m_spQueueImpl->m_callees>0) {
-                        std::this_thread::sleep_for(std::chrono::milliseconds(10u));
-                    }
-#else
-                    // Sync is allowed even for queues on non current device.
-                    ALPAKA_HIP_RT_CHECK( hipStreamSynchronize(
-                            queue.m_spQueueImpl->m_HipQueue));
-#endif
-                }
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/alpaka/include/alpaka/queue/Traits.hpp b/thirdParty/alpaka/include/alpaka/queue/Traits.hpp
deleted file mode 100644
index 47f6be0828..0000000000
--- a/thirdParty/alpaka/include/alpaka/queue/Traits.hpp
+++ /dev/null
@@ -1,101 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#include <alpaka/wait/Traits.hpp>
-
-#include <alpaka/core/Common.hpp>
-
-#include <type_traits>
-#include <utility>
-
-namespace alpaka
-{
-    //-----------------------------------------------------------------------------
-    //! The queue specifics.
-    namespace queue
-    {
-        //-----------------------------------------------------------------------------
-        //! The queue traits.
-        namespace traits
-        {
-            //#############################################################################
-            //! The queue enqueue trait.
-            template<
-                typename TQueue,
-                typename TTask,
-                typename TSfinae = void>
-            struct Enqueue;
-
-            //#############################################################################
-            //! The queue empty trait.
-            template<
-                typename TQueue,
-                typename TSfinae = void>
-            struct Empty;
-
-            //#############################################################################
-            //! Queue for an accelerator
-            template<
-                typename TAcc,
-                typename TProperty,
-                typename TSfinae = void>
-            struct QueueType;
-        }
-
-        //-----------------------------------------------------------------------------
-        //! Queues the given task in the given queue.
-        //!
-        //! Special Handling for events:
-        //!   If the event has previously been queued, then this call will overwrite any existing state of the event.
-        //!   Any subsequent calls which examine the status of event will only examine the completion of this most recent call to enqueue.
-        template<
-            typename TQueue,
-            typename TTask>
-        ALPAKA_FN_HOST auto enqueue(
-            TQueue & queue,
-            TTask && task)
-        -> void
-        {
-            traits::Enqueue<
-                TQueue,
-                typename std::decay<TTask>::type>
-            ::enqueue(
-                queue,
-                std::forward<TTask>(task));
-        }
-
-        //-----------------------------------------------------------------------------
-        //! Tests if the queue is empty (all ops in the given queue have been completed).
-        template<
-            typename TQueue>
-        ALPAKA_FN_HOST auto empty(
-            TQueue const & queue)
-        -> bool
-        {
-            return
-                traits::Empty<
-                    TQueue>
-                ::empty(
-                    queue);
-        }
-
-        //-----------------------------------------------------------------------------
-        //! Queue based on the environment and a property
-        //
-        // \tparam TEnv Environment type, e.g.  accelerator, device or a platform.
-        //              queue::traits::QueueType must be specialized for TEnv
-        // \tparam TProperty Property to define the behavior of TEnv.
-        template<
-            typename TEnv,
-            typename TProperty>
-        using Queue = typename traits::QueueType<TEnv, TProperty>::type;
-    }
-}
diff --git a/thirdParty/alpaka/include/alpaka/queue/cpu/ICpuQueue.hpp b/thirdParty/alpaka/include/alpaka/queue/cpu/ICpuQueue.hpp
deleted file mode 100644
index d13cc0f87e..0000000000
--- a/thirdParty/alpaka/include/alpaka/queue/cpu/ICpuQueue.hpp
+++ /dev/null
@@ -1,56 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Matthias Werner
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#include <alpaka/core/BoostPredef.hpp>
-
-namespace alpaka
-{
-    namespace event
-    {
-        class EventCpu;
-    }
-}
-
-namespace alpaka
-{
-    namespace queue
-    {
-        namespace cpu
-        {
-
-
-#if BOOST_COMP_CLANG
-    // avoid diagnostic warning: "has no out-of-line virtual method definitions; its vtable will be emitted in every translation unit [-Werror,-Wweak-vtables]"
-    // https://stackoverflow.com/a/29288300
-    #pragma clang diagnostic push
-    #pragma clang diagnostic ignored "-Wweak-vtables"
-#endif
-
-            //#############################################################################
-            //! The CPU queue interface
-            class ICpuQueue
-            {
-            public:
-                //-----------------------------------------------------------------------------
-                //! enqueue the event
-                virtual void enqueue(event::EventCpu &) = 0;
-                //-----------------------------------------------------------------------------
-                //! waiting for the event
-                virtual void wait(event::EventCpu const &) = 0;
-                //-----------------------------------------------------------------------------
-                virtual ~ICpuQueue() = default;
-            };
-#if BOOST_COMP_CLANG
-    #pragma clang diagnostic pop
-#endif
-        }
-    }
-}
diff --git a/thirdParty/alpaka/include/alpaka/rand/RandCuRand.hpp b/thirdParty/alpaka/include/alpaka/rand/RandCuRand.hpp
deleted file mode 100644
index fb7b82144b..0000000000
--- a/thirdParty/alpaka/include/alpaka/rand/RandCuRand.hpp
+++ /dev/null
@@ -1,298 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz, René Widera
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_CUDA
-    #error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
-#endif
-
-#include <alpaka/rand/Traits.hpp>
-
-#include <alpaka/dev/DevCudaRt.hpp>
-
-#include <alpaka/core/Cuda.hpp>
-
-#include <curand_kernel.h>
-
-#include <type_traits>
-
-namespace alpaka
-{
-    namespace rand
-    {
-        //#############################################################################
-        //! The CUDA rand implementation.
-        class RandCuRand : public concepts::Implements<ConceptRand, RandCuRand>
-        {
-        };
-
-        namespace generator
-        {
-            namespace cuda
-            {
-                //#############################################################################
-                //! The CUDA Xor random number generator.
-                class Xor
-                {
-                public:
-
-                    //-----------------------------------------------------------------------------
-                    // After calling this constructor the instance is not valid initialized and
-                    // need to be overwritten with a valid object
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST_ACC Xor() : m_State(curandStateXORWOW_t{})
-                    {
-                    }
-
-                    //-----------------------------------------------------------------------------
-                    __device__ Xor(
-                        std::uint32_t const & seed,
-                        std::uint32_t const & subsequence = 0,
-                        std::uint32_t const & offset = 0)
-                    {
-                        curand_init(
-                            seed,
-                            subsequence,
-                            offset,
-                            &m_State);
-                    }
-
-                public:
-                    curandStateXORWOW_t m_State;
-                };
-            }
-        }
-        namespace distribution
-        {
-            namespace cuda
-            {
-                //#############################################################################
-                //! The CUDA random number floating point normal distribution.
-                template<
-                    typename T>
-                class NormalReal;
-
-                //#############################################################################
-                //! The CUDA random number float normal distribution.
-                template<>
-                class NormalReal<
-                    float>
-                {
-                public:
-                    //-----------------------------------------------------------------------------
-                    NormalReal() = default;
-
-                    //-----------------------------------------------------------------------------
-                    template<
-                        typename TGenerator>
-                    __device__ auto operator()(
-                        TGenerator & generator)
-                    -> float
-                    {
-                        return curand_normal(&generator.m_State);
-                    }
-                };
-                //#############################################################################
-                //! The CUDA random number float normal distribution.
-                template<>
-                class NormalReal<
-                    double>
-                {
-                public:
-                    //-----------------------------------------------------------------------------
-                    NormalReal() = default;
-
-                    //-----------------------------------------------------------------------------
-                    template<
-                        typename TGenerator>
-                    __device__ auto operator()(
-                        TGenerator & generator)
-                    -> double
-                    {
-                        return curand_normal_double(&generator.m_State);
-                    }
-                };
-
-                //#############################################################################
-                //! The CUDA random number floating point uniform distribution.
-                template<
-                    typename T>
-                class UniformReal;
-
-                //#############################################################################
-                //! The CUDA random number float uniform distribution.
-                template<>
-                class UniformReal<
-                    float>
-                {
-                public:
-                    //-----------------------------------------------------------------------------
-                    UniformReal() = default;
-
-                    //-----------------------------------------------------------------------------
-                    template<
-                        typename TGenerator>
-                    __device__ auto operator()(
-                        TGenerator & generator)
-                    -> float
-                    {
-                        // (0.f, 1.0f]
-                        float const fUniformRand(curand_uniform(&generator.m_State));
-                        // NOTE: (1.0f - curand_uniform) does not work, because curand_uniform seems to return denormalized floats around 0.f.
-                        // [0.f, 1.0f)
-                        return fUniformRand * static_cast<float>( fUniformRand != 1.0f );
-                    }
-                };
-                //#############################################################################
-                //! The CUDA random number float uniform distribution.
-                template<>
-                class UniformReal<
-                    double>
-                {
-                public:
-                    //-----------------------------------------------------------------------------
-                    UniformReal() = default;
-
-                    //-----------------------------------------------------------------------------
-                    template<
-                        typename TGenerator>
-                    __device__ auto operator()(
-                        TGenerator & generator)
-                    -> double
-                    {
-                        // (0.f, 1.0f]
-                        double const fUniformRand(curand_uniform_double(&generator.m_State));
-                        // NOTE: (1.0f - curand_uniform_double) does not work, because curand_uniform_double seems to return denormalized floats around 0.f.
-                        // [0.f, 1.0f)
-                        return fUniformRand * static_cast<double>( fUniformRand != 1.0 );
-                    }
-                };
-
-                //#############################################################################
-                //! The CUDA random number integer uniform distribution.
-                template<
-                    typename T>
-                class UniformUint;
-
-                //#############################################################################
-                //! The CUDA random number unsigned integer uniform distribution.
-                template<>
-                class UniformUint<
-                    unsigned int>
-                {
-                public:
-                    //-----------------------------------------------------------------------------
-                    UniformUint() = default;
-
-                    //-----------------------------------------------------------------------------
-                    template<
-                        typename TGenerator>
-                    __device__ auto operator()(
-                        TGenerator & generator)
-                    -> unsigned int
-                    {
-                        return curand(&generator.m_State);
-                    }
-                };
-            }
-        }
-
-        namespace distribution
-        {
-            namespace traits
-            {
-                //#############################################################################
-                //! The CUDA random number float normal distribution get trait specialization.
-                template<
-                    typename T>
-                struct CreateNormalReal<
-                    RandCuRand,
-                    T,
-                    typename std::enable_if<
-                        std::is_floating_point<T>::value>::type>
-                {
-                    //-----------------------------------------------------------------------------
-                    __device__ static auto createNormalReal(
-                        RandCuRand const & /*rand*/)
-                    -> rand::distribution::cuda::NormalReal<T>
-                    {
-                        return rand::distribution::cuda::NormalReal<T>();
-                    }
-                };
-                //#############################################################################
-                //! The CUDA random number float uniform distribution get trait specialization.
-                template<
-                    typename T>
-                struct CreateUniformReal<
-                    RandCuRand,
-                    T,
-                    typename std::enable_if<
-                        std::is_floating_point<T>::value>::type>
-                {
-                    //-----------------------------------------------------------------------------
-                    __device__ static auto createUniformReal(
-                        RandCuRand const & /*rand*/)
-                    -> rand::distribution::cuda::UniformReal<T>
-                    {
-                        return rand::distribution::cuda::UniformReal<T>();
-                    }
-                };
-                //#############################################################################
-                //! The CUDA random number integer uniform distribution get trait specialization.
-                template<
-                    typename T>
-                struct CreateUniformUint<
-                    RandCuRand,
-                    T,
-                    typename std::enable_if<
-                        std::is_integral<T>::value>::type>
-                {
-                    //-----------------------------------------------------------------------------
-                    __device__ static auto createUniformUint(
-                        RandCuRand const & /*rand*/)
-                    -> rand::distribution::cuda::UniformUint<T>
-                    {
-                        return rand::distribution::cuda::UniformUint<T>();
-                    }
-                };
-            }
-        }
-        namespace generator
-        {
-            namespace traits
-            {
-                //#############################################################################
-                //! The CUDA random number default generator get trait specialization.
-                template<>
-                struct CreateDefault<
-                    RandCuRand>
-                {
-                    //-----------------------------------------------------------------------------
-                    __device__ static auto createDefault(
-                        RandCuRand const & /*rand*/,
-                        std::uint32_t const & seed,
-                        std::uint32_t const & subsequence)
-                    -> rand::generator::cuda::Xor
-                    {
-                        return rand::generator::cuda::Xor(
-                            seed,
-                            subsequence);
-                    }
-                };
-            }
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/alpaka/include/alpaka/rand/RandHipRand.hpp b/thirdParty/alpaka/include/alpaka/rand/RandHipRand.hpp
deleted file mode 100644
index d4fcbb87ec..0000000000
--- a/thirdParty/alpaka/include/alpaka/rand/RandHipRand.hpp
+++ /dev/null
@@ -1,318 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz, Matthias Werner, René Widera
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_HIP
-    #error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
-#endif
-
-#include <alpaka/rand/Traits.hpp>
-
-#include <alpaka/dev/DevHipRt.hpp>
-
-#include <alpaka/core/Hip.hpp>
-
-#pragma clang diagnostic push
-#pragma clang diagnostic ignored "-Wduplicate-decl-specifier"
-
-#include <hiprand_kernel.h>
-
-#pragma clang diagnostic pop
-
-#include <type_traits>
-
-namespace alpaka
-{
-    namespace rand
-    {
-        //#############################################################################
-        //! The HIP rand implementation.
-        class RandHipRand : public concepts::Implements<ConceptRand, RandHipRand>
-        {
-        };
-
-        namespace generator
-        {
-            namespace hip
-            {
-                //#############################################################################
-                //! The HIP Xor random number generator.
-                class Xor
-                {
-                public:
-
-                    //-----------------------------------------------------------------------------
-                    // After calling this constructor the instance is not valid initialized and
-                    // need to be overwritten with a valid object
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST_ACC Xor() : m_State(hiprandStateXORWOW_t{})
-                    {
-                    }
-
-                    //-----------------------------------------------------------------------------
-                    //! Constructor.
-                    __device__ Xor(
-                        std::uint32_t const & seed,
-                        std::uint32_t const & subsequence = 0,
-                        std::uint32_t const & offset = 0)
-                    {
-                        hiprand_init(
-                            seed,
-                            subsequence,
-                            offset,
-                            &m_State);
-                    }
-
-                public:
-                    hiprandStateXORWOW_t m_State;
-                };
-            }
-        }
-        namespace distribution
-        {
-            namespace hip
-            {
-                //#############################################################################
-                //! The HIP random number floating point normal distribution.
-                template<
-                    typename T>
-                class NormalReal;
-
-                //#############################################################################
-                //! The HIP random number float normal distribution.
-                template<>
-                class NormalReal<
-                    float>
-                {
-                public:
-                    //-----------------------------------------------------------------------------
-                    //! Constructor.
-                    NormalReal() = default;
-
-                    //-----------------------------------------------------------------------------
-                    //! Call operator.
-                    template<
-                        typename TGenerator>
-                    __device__ auto operator()(
-                        TGenerator & generator)
-                    -> float
-                    {
-                        return hiprand_normal(&generator.m_State);
-                    }
-                };
-                //#############################################################################
-                //! The HIP random number float normal distribution.
-                template<>
-                class NormalReal<
-                    double>
-                {
-                public:
-                    //-----------------------------------------------------------------------------
-                    //! Constructor.
-                    NormalReal() = default;
-
-                    //-----------------------------------------------------------------------------
-                    //! Call operator.
-                    template<
-                        typename TGenerator>
-                    __device__ auto operator()(
-                        TGenerator & generator)
-                    -> double
-                    {
-                        return hiprand_normal_double(&generator.m_State);
-                    }
-                };
-
-                //#############################################################################
-                //! The HIP random number floating point uniform distribution.
-                template<
-                    typename T>
-                class UniformReal;
-
-                //#############################################################################
-                //! The HIP random number float uniform distribution.
-                template<>
-                class UniformReal<
-                    float>
-                {
-                public:
-                    //-----------------------------------------------------------------------------
-                    //! Constructor.
-                    UniformReal() = default;
-
-                    //-----------------------------------------------------------------------------
-                    //! Call operator.
-                    template<
-                        typename TGenerator>
-                    __device__ auto operator()(
-                        TGenerator & generator)
-                    -> float
-                    {
-                        // (0.f, 1.0f]
-                        float const fUniformRand(hiprand_uniform(&generator.m_State));
-                        // NOTE: (1.0f - hiprand_uniform) does not work, because hiprand_uniform seems to return denormalized floats around 0.f.
-                        // [0.f, 1.0f)
-                        return fUniformRand * static_cast<float>( fUniformRand != 1.0f );
-                    }
-                };
-                //#############################################################################
-                //! The HIP random number float uniform distribution.
-                template<>
-                class UniformReal<
-                    double>
-                {
-                public:
-                    //-----------------------------------------------------------------------------
-                    //! Constructor.
-                    UniformReal() = default;
-
-                    //-----------------------------------------------------------------------------
-                    //! Call operator.
-                    template<
-                        typename TGenerator>
-                    __device__ auto operator()(
-                        TGenerator & generator)
-                    -> double
-                    {
-                        // (0.f, 1.0f]
-                        double const fUniformRand(hiprand_uniform_double(&generator.m_State));
-                        // NOTE: (1.0f - hiprand_uniform_double) does not work, because hiprand_uniform_double seems to return denormalized floats around 0.f.
-                        // [0.f, 1.0f)
-                        return fUniformRand * static_cast<double>( fUniformRand != 1.0f );
-                    }
-                };
-
-                //#############################################################################
-                //! The HIP random number integer uniform distribution.
-                template<
-                    typename T>
-                class UniformUint;
-
-                //#############################################################################
-                //! The HIP random number unsigned integer uniform distribution.
-                template<>
-                class UniformUint<
-                    unsigned int>
-                {
-                public:
-                    //-----------------------------------------------------------------------------
-                    //! Constructor.
-                    UniformUint() = default;
-
-                    //-----------------------------------------------------------------------------
-                    //! Call operator.
-                    template<
-                        typename TGenerator>
-                    __device__ auto operator()(
-                        TGenerator & generator)
-                    -> unsigned int
-                    {
-                        return hiprand(&generator.m_State);
-                    }
-                };
-            }
-        }
-
-        namespace distribution
-        {
-            namespace traits
-            {
-                //#############################################################################
-                //! The HIP random number float normal distribution get trait specialization.
-                template<
-                    typename T>
-                struct CreateNormalReal<
-                    RandHipRand,
-                    T,
-                    typename std::enable_if<
-                        std::is_floating_point<T>::value>::type>
-                {
-                    //-----------------------------------------------------------------------------
-
-                    ALPAKA_FN_HOST_ACC static auto createNormalReal(
-                        RandHipRand const & /*rand*/)
-                    -> rand::distribution::hip::NormalReal<T>
-                    {
-                        return rand::distribution::hip::NormalReal<T>();
-                    }
-                };
-                //#############################################################################
-                //! The HIP random number float uniform distribution get trait specialization.
-                template<
-                    typename T>
-                struct CreateUniformReal<
-                    RandHipRand,
-                    T,
-                    typename std::enable_if<
-                        std::is_floating_point<T>::value>::type>
-                {
-                    //-----------------------------------------------------------------------------
-
-                    ALPAKA_FN_HOST_ACC static auto createUniformReal(
-                        RandHipRand const & /*rand*/)
-                    -> rand::distribution::hip::UniformReal<T>
-                    {
-                        return rand::distribution::hip::UniformReal<T>();
-                    }
-                };
-                //#############################################################################
-                //! The HIP random number integer uniform distribution get trait specialization.
-                template<
-                    typename T>
-                struct CreateUniformUint<
-                    RandHipRand,
-                    T,
-                    typename std::enable_if<
-                        std::is_integral<T>::value>::type>
-                {
-                    //-----------------------------------------------------------------------------
-
-                    ALPAKA_FN_HOST_ACC static auto createUniformUint(
-                        RandHipRand const & /*rand*/)
-                    -> rand::distribution::hip::UniformUint<T>
-                    {
-                        return rand::distribution::hip::UniformUint<T>();
-                    }
-                };
-            }
-        }
-        namespace generator
-        {
-            namespace traits
-            {
-                //#############################################################################
-                //! The HIP random number default generator get trait specialization.
-                template<>
-                struct CreateDefault<
-                    RandHipRand>
-                {
-                    //-----------------------------------------------------------------------------
-
-                    __device__ static auto createDefault(
-                        RandHipRand const & /*rand*/,
-                        std::uint32_t const & seed,
-                        std::uint32_t const & subsequence)
-                    -> rand::generator::hip::Xor
-                    {
-                        return rand::generator::hip::Xor(
-                            seed,
-                            subsequence);
-                    }
-                };
-            }
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/alpaka/include/alpaka/rand/RandStdLib.hpp b/thirdParty/alpaka/include/alpaka/rand/RandStdLib.hpp
deleted file mode 100644
index c93cc352fd..0000000000
--- a/thirdParty/alpaka/include/alpaka/rand/RandStdLib.hpp
+++ /dev/null
@@ -1,338 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, René Widera
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#include <alpaka/rand/Traits.hpp>
-#include <alpaka/rand/TinyMT/Engine.hpp>
-
-#include <alpaka/core/Common.hpp>
-#include <alpaka/core/Unused.hpp>
-
-#include <cstdint>
-#include <limits>
-#include <random>
-#include <type_traits>
-
-namespace alpaka
-{
-    namespace rand
-    {
-        //#############################################################################
-        //! "Tiny" state mersenne twister implementation
-        class TinyMersenneTwister : public concepts::Implements<ConceptRand, TinyMersenneTwister>
-        {
-        };
-        using RandStdLib = TinyMersenneTwister;
-
-        //#############################################################################
-        //! The standard library mersenne twister implementation.
-        class MersenneTwister : public concepts::Implements<ConceptRand, MersenneTwister>
-        {
-        };
-
-        //#############################################################################
-        //! The standard library rand device implementation.
-        class RandomDevice : public concepts::Implements<ConceptRand, RandomDevice>
-        {
-        };
-
-        namespace generator
-        {
-            namespace cpu
-            {
-                //#############################################################################
-                //! The standard library mersenne twister random number generator.
-                //!
-                //! Wraps std::mt19937. The number 19937 is the Mersenne exponent of the period in bits,
-                //! not a size in bytes; sizeof(std::mt19937) is implementation-defined (a few kB).
-                class MersenneTwister
-                {
-                public:
-
-                    //-----------------------------------------------------------------------------
-                    //! Default constructor, leaves m_State default-constructed.
-                    MersenneTwister() = default;
-
-                    //-----------------------------------------------------------------------------
-                    //! Seeds the engine with (seed ^ subsequence) + offset.
-                    //!
-                    //! NOTE: this combination is not injective. Different argument triples can
-                    //! produce the same engine seed, e.g. (1, 0, 0) and (0, 1, 0).
-                    ALPAKA_FN_HOST MersenneTwister(
-                        std::uint32_t const & seed,
-                        std::uint32_t const & subsequence = 0,
-                        std::uint32_t const & offset = 0) :
-                        // XOR the seed and the subsequence, then add the offset, to derive the engine seed.
-                        m_State((seed ^ subsequence) + offset)
-                    {
-                    }
-
-                public:
-                    //! The wrapped standard library engine, accessed directly by the distributions.
-                    std::mt19937 m_State;
-                };
-
-                //#############################################################################
-                //! "Tiny" state mersenne twister implementation
-                //!
-                //! repository: github.com/MersenneTwister-Lab/TinyMT
-                //!
-                //! license: 3-clause BSD
-                //!
-                //! @author Mutsuo Saito (Hiroshima University)
-                //! @author Makoto Matsumoto (The University of Tokyo)
-                //!
-                //! size of state: 28 bytes (four 32 bit status words plus the three 32 bit
-                //! parameters mat1, mat2 and tmat; 127 bits of the status words form the
-                //! internal state of the generator)
-                class TinyMersenneTwister
-                {
-                public:
-                    //-----------------------------------------------------------------------------
-                    //! Default constructor, leaves m_State default-constructed.
-                    TinyMersenneTwister() = default;
-
-                    //-----------------------------------------------------------------------------
-                    //! Seeds the engine with (seed ^ subsequence) + offset.
-                    //!
-                    //! NOTE: this combination is not injective. Different argument triples can
-                    //! produce the same engine seed, e.g. (1, 0, 0) and (0, 1, 0).
-                    ALPAKA_FN_HOST TinyMersenneTwister(
-                        std::uint32_t const & seed,
-                        std::uint32_t const & subsequence = 0,
-                        std::uint32_t const & offset = 0) :
-                        // XOR the seed and the subsequence, then add the offset, to derive the engine seed.
-                        m_State((seed ^ subsequence) + offset)
-                    {
-                    }
-
-                public:
-                    //! The wrapped TinyMT engine, accessed directly by the distributions.
-                    TinyMTengine m_State;
-                };
-
-                //#############################################################################
-                //! The standard library's random device based on the local entropy pool.
-                //!
-                //! Warning: the entropy pool on many devices degrades quickly and performance
-                //!          will drop significantly when this point occurs.
-                //!
-                //! size of state: implementation-defined
-                //! NOTE(review): this comment used to claim 1 byte; verify with sizeof(std::random_device).
-                class RandomDevice
-                {
-                public:
-                    //-----------------------------------------------------------------------------
-                    //! Default constructor, leaves m_State default-constructed.
-                    RandomDevice() = default;
-                    //! Move constructor. Nothing is taken from the source object; the new
-                    //! instance gets its own freshly constructed std::random_device.
-                    RandomDevice(RandomDevice&&) :
-                        m_State{}
-                    {
-                    }
-
-                    //-----------------------------------------------------------------------------
-                    //! Constructor with the same signature as the seedable generators.
-                    //! All three arguments (seed, subsequence, offset) are ignored.
-                    ALPAKA_FN_HOST RandomDevice(
-                        std::uint32_t const &,
-                        std::uint32_t const & = 0,
-                        std::uint32_t const & = 0) :
-                        m_State{}
-                    {
-                    }
-
-                public:
-                    //! The wrapped standard library device, accessed directly by the distributions.
-                    std::random_device m_State;
-                };
-            }
-        }
-
-        namespace distribution
-        {
-            namespace cpu
-            {
-                //#############################################################################
-                //! The CPU random number normal distribution.
-                template<
-                    typename T>
-                class NormalReal
-                {
-                public:
-                    //-----------------------------------------------------------------------------
-                    NormalReal() = default;
-
-                    //-----------------------------------------------------------------------------
-                    template<
-                        typename TGenerator>
-                    ALPAKA_FN_HOST auto operator()(
-                        TGenerator & generator)
-                    -> T
-                    {
-                        return m_dist(generator.m_State);
-                    }
-                    std::normal_distribution<T> m_dist;
-                };
-
-                //#############################################################################
-                //! The CPU random number uniform distribution.
-                template<
-                    typename T>
-                class UniformReal
-                {
-                public:
-                    //-----------------------------------------------------------------------------
-                    UniformReal() = default;
-
-                    //-----------------------------------------------------------------------------
-                    template<
-                        typename TGenerator>
-                    ALPAKA_FN_HOST auto operator()(
-                        TGenerator & generator)
-                    -> T
-                    {
-                        return m_dist(generator.m_State);
-                    }
-                    std::uniform_real_distribution<T> m_dist;
-                };
-
-                //#############################################################################
-                //! The CPU random number integer uniform distribution.
-                template<
-                    typename T>
-                class UniformUint
-                {
-                public:
-                    //-----------------------------------------------------------------------------
-                    //! Constructor, sets the closed range of the distribution to [0, max of T].
-                    UniformUint() :
-                        m_dist(
-                            0,  // For signed integer: std::numeric_limits<T>::lowest()
-                            std::numeric_limits<T>::max())
-                    {}
-
-                    //-----------------------------------------------------------------------------
-                    //! Call operator, draws one value from m_dist using generator.m_State as engine.
-                    template<
-                        typename TGenerator>
-                    ALPAKA_FN_HOST auto operator()(
-                        TGenerator & generator)
-                    -> T
-                    {
-                        return m_dist(generator.m_State);
-                    }
-                    //! The wrapped standard library distribution.
-                    std::uniform_int_distribution<T> m_dist;
-                };
-            }
-        }
-
-        namespace distribution
-        {
-            namespace traits
-            {
-                //#############################################################################
-                //! The CPU device random number float normal distribution get trait specialization.
-                template<
-                    typename T>
-                struct CreateNormalReal<
-                    RandStdLib,
-                    T,
-                    typename std::enable_if<
-                        std::is_floating_point<T>::value>::type>
-                {
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST static auto createNormalReal(
-                        RandStdLib const & rand)
-                    -> rand::distribution::cpu::NormalReal<T>
-                    {
-                        alpaka::ignore_unused(rand);
-                        return rand::distribution::cpu::NormalReal<T>();
-                    }
-                };
-                //#############################################################################
-                //! The CPU device random number float uniform distribution get trait specialization.
-                template<
-                    typename T>
-                struct CreateUniformReal<
-                    RandStdLib,
-                    T,
-                    typename std::enable_if<
-                        std::is_floating_point<T>::value>::type>
-                {
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST static auto createUniformReal(
-                        RandStdLib const & rand)
-                    -> rand::distribution::cpu::UniformReal<T>
-                    {
-                        alpaka::ignore_unused(rand);
-                        return rand::distribution::cpu::UniformReal<T>();
-                    }
-                };
-                //#############################################################################
-                //! The CPU device random number integer uniform distribution get trait specialization.
-                template<
-                    typename T>
-                struct CreateUniformUint<
-                    RandStdLib,
-                    T,
-                    typename std::enable_if<
-                        std::is_integral<T>::value>::type>
-                {
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST static auto createUniformUint(
-                        RandStdLib const & rand)
-                    -> rand::distribution::cpu::UniformUint<T>
-                    {
-                        alpaka::ignore_unused(rand);
-                        return rand::distribution::cpu::UniformUint<T>();
-                    }
-                };
-            }
-        }
-        namespace generator
-        {
-            namespace traits
-            {
-                //#############################################################################
-                //! The CPU device random number default generator get trait specialization.
-                template<>
-                struct CreateDefault<
-                    TinyMersenneTwister>
-                {
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST static auto createDefault(
-                        TinyMersenneTwister const & rand,
-                        std::uint32_t const & seed,
-                        std::uint32_t const & subsequence)
-                    -> rand::generator::cpu::TinyMersenneTwister
-                    {
-                        alpaka::ignore_unused(rand);
-                        return rand::generator::cpu::TinyMersenneTwister(
-                            seed,
-                            subsequence);
-                    }
-                };
-
-                template<>
-                struct CreateDefault<
-                    MersenneTwister>
-                {
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST static auto createDefault(
-                        MersenneTwister const & rand,
-                        std::uint32_t const & seed,
-                        std::uint32_t const & subsequence)
-                    -> rand::generator::cpu::MersenneTwister
-                    {
-                        alpaka::ignore_unused(rand);
-                        return rand::generator::cpu::MersenneTwister(
-                            seed,
-                            subsequence);
-                    }
-                };
-
-                template<>
-                struct CreateDefault<
-                    RandomDevice>
-                {
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST static auto createDefault(
-                        RandomDevice const & rand,
-                        std::uint32_t const & seed,
-                        std::uint32_t const & subsequence)
-                    -> rand::generator::cpu::RandomDevice
-                    {
-                        alpaka::ignore_unused(rand);
-                        return rand::generator::cpu::RandomDevice(
-                            seed,
-                            subsequence);
-                    }
-                };
-            }
-        }
-    }
-}
diff --git a/thirdParty/alpaka/include/alpaka/rand/TinyMT/Engine.hpp b/thirdParty/alpaka/include/alpaka/rand/TinyMT/Engine.hpp
deleted file mode 100644
index fb2582992d..0000000000
--- a/thirdParty/alpaka/include/alpaka/rand/TinyMT/Engine.hpp
+++ /dev/null
@@ -1,83 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#include <alpaka/rand/TinyMT/tinymt32.h>
-
-#include <cstdint>
-
-
-namespace alpaka
-{
-namespace rand
-{
-namespace generator
-{
-namespace cpu
-{
-    //! Implementation of std::UniformRandomBitGenerator for TinyMT32
-    //!
-    //! Wraps the C state struct tinymt32_t and always uses the fixed parameter
-    //! set (mat1, mat2, tmat) taken from TinyMT/jump/sample.c.
-    struct TinyMTengine
-    {
-        using result_type = std::uint32_t;
-
-        //! The seed that is used when the caller does not provide one.
-        static constexpr result_type default_seed()
-        {
-            return 42u;
-        }
-
-        //! (Re-)initializes the parameters and the state from the given seed.
-        void seed( result_type value = default_seed() )
-        {
-            // parameters from TinyMT/jump/sample.c
-            prng.mat1 = 0x8f7011ee;
-            prng.mat2 = 0xfc78ff1f;
-            prng.tmat = 0x3793fdff;
-
-            tinymt32_init( &prng, value );
-        }
-
-        //! Constructs the engine seeded with seedValue.
-        TinyMTengine( std::uint32_t const & seedValue )
-        {
-            seed( seedValue );
-        }
-
-        //! Constructs the engine seeded with default_seed().
-        TinyMTengine()
-        {
-            // Reuse default_seed() instead of repeating the magic number here.
-            seed( default_seed() );
-        }
-
-        //! Advances the state once and returns the next 32 bit value.
-        result_type operator()()
-        {
-            return tinymt32_generate_uint32( &prng );
-        }
-
-        //! Smallest value operator() can return.
-        static constexpr result_type min()
-        {
-            return 0u;
-        }
-
-        //! Largest value operator() can return.
-        static constexpr result_type max()
-        {
-            return UINT32_MAX;
-        }
-
-        //! Advances the state by z steps, as if operator() had been called z
-        //! times and the results had been thrown away.
-        void discard( unsigned long long z )
-        {
-            // The bundled tinymt32.h has no tinymt32_jump, so step one by one (O(z)).
-            // Tempering does not modify the state, so advancing it is sufficient.
-            for( ; z != 0u; --z )
-            {
-                tinymt32_next_state( &prng );
-            }
-        }
-
-        //! The underlying TinyMT32 state and parameters.
-        tinymt32_t prng;
-    };
-
-} // namespace cpu
-} // namespace generator
-} // namespace rand
-} // namespace alpaka
diff --git a/thirdParty/alpaka/include/alpaka/rand/TinyMT/LICENSE.txt b/thirdParty/alpaka/include/alpaka/rand/TinyMT/LICENSE.txt
deleted file mode 100644
index 7496ebe318..0000000000
--- a/thirdParty/alpaka/include/alpaka/rand/TinyMT/LICENSE.txt
+++ /dev/null
@@ -1,38 +0,0 @@
-/* Copyright 2019 Mutsuo Saito
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-Copyright (c) 2011, 2013 Mutsuo Saito, Makoto Matsumoto,
-Hiroshima University and The University of Tokyo.
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are
-met:
-
-    * Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    * Redistributions in binary form must reproduce the above
-      copyright notice, this list of conditions and the following
-      disclaimer in the documentation and/or other materials provided
-      with the distribution.
-    * Neither the name of the Hiroshima University nor the names of
-      its contributors may be used to endorse or promote products
-      derived from this software without specific prior written
-      permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/thirdParty/alpaka/include/alpaka/rand/TinyMT/tinymt32.h b/thirdParty/alpaka/include/alpaka/rand/TinyMT/tinymt32.h
deleted file mode 100644
index 52ada12142..0000000000
--- a/thirdParty/alpaka/include/alpaka/rand/TinyMT/tinymt32.h
+++ /dev/null
@@ -1,424 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Mutsuo Saito
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-#ifndef TINYMT32_H
-#define TINYMT32_H
-/**
- * @file tinymt32.h
- *
- * @brief Tiny Mersenne Twister only 127 bit internal state
- *
- * @author Mutsuo Saito (Hiroshima University)
- * @author Makoto Matsumoto (University of Tokyo)
- *
- * Copyright (C) 2011 Mutsuo Saito, Makoto Matsumoto,
- * Hiroshima University and The University of Tokyo.
- * All rights reserved.
- *
- * The 3-clause BSD License is applied to this software, see
- * LICENSE.txt
- */
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#include <cstdint>
-/* work-around for glibc < 2.18 according to bug
- * https://sourceware.org/bugzilla/show_bug.cgi?id=15366
- */
-#ifndef UINT32_MAX
-#   define UINT32_MAX ((uint32_t)-1u)
-#endif
-#ifndef UINT32_C
-#   define UINT32_C(value) uint_least32_t(value)
-#endif
-#include <cinttypes>
-
-#if BOOST_COMP_CLANG
-#   pragma clang diagnostic push
-#   pragma clang diagnostic ignored "-Wold-style-cast"
-#   pragma clang diagnostic ignored "-Wsign-conversion"
-#endif
-#if BOOST_COMP_GNUC
-#   pragma GCC diagnostic push
-#   pragma GCC diagnostic ignored "-Wsign-conversion"
-#endif
-#if BOOST_COMP_MSVC
-    #pragma warning(push)
-    #pragma warning(disable: 4100)  // tinymt32.h(60): warning C4100: 'random': unreferenced formal parameter
-#endif
-
-#define TINYMT32_MEXP 127
-#define TINYMT32_SH0 1
-#define TINYMT32_SH1 10
-#define TINYMT32_SH8 8
-#define TINYMT32_MASK UINT32_C(0x7fffffff)
-#define TINYMT32_MUL (1.0f / 16777216.0f)
-
-#if defined(__cplusplus)
-extern "C" {
-#endif
-
-/**
- * tinymt32 internal state vector and parameters
- */
-struct TINYMT32_T {
-    uint32_t status[4];
-    uint32_t mat1;
-    uint32_t mat2;
-    uint32_t tmat;
-};
-
-typedef struct TINYMT32_T tinymt32_t;
-
-inline void tinymt32_init(tinymt32_t * random, uint32_t seed);
-inline void tinymt32_init_by_array(tinymt32_t * random, uint32_t init_key[],
-                            int key_length);
-
-#if defined(__GNUC__)
-/**
- * This function always returns 127
- * @param random not used
- * @return always 127
- */
-inline static int tinymt32_get_mexp(
-    tinymt32_t * random  __attribute__((unused))) {
-    return TINYMT32_MEXP;
-}
-#else
-inline static int tinymt32_get_mexp(tinymt32_t * random) {
-    return TINYMT32_MEXP;
-}
-#endif
-
-/**
- * This function changes internal state of tinymt32.
- * Users should not call this function directly.
- * @param random tinymt internal status
- */
-inline static void tinymt32_next_state(tinymt32_t * random) {
-    uint32_t x;
-    uint32_t y;
-
-    y = random->status[3];
-    x = (random->status[0] & TINYMT32_MASK)
-        ^ random->status[1]
-        ^ random->status[2];
-    x ^= (x << TINYMT32_SH0);
-    y ^= (y >> TINYMT32_SH0) ^ x;
-    random->status[0] = random->status[1];
-    random->status[1] = random->status[2];
-    random->status[2] = x ^ (y << TINYMT32_SH1);
-    random->status[3] = y;
-    random->status[1] ^= -((int32_t)(y & 1)) & random->mat1;
-    random->status[2] ^= -((int32_t)(y & 1)) & random->mat2;
-}
-
-/**
- * This function outputs 32-bit unsigned integer from internal state.
- * Users should not call this function directly.
- * @param random tinymt internal status
- * @return 32-bit unsigned pseudorandom number
- */
-inline static uint32_t tinymt32_temper(tinymt32_t * random) {
-    uint32_t t0, t1;
-    t0 = random->status[3];
-#if defined(LINEARITY_CHECK)
-    t1 = random->status[0]
-        ^ (random->status[2] >> TINYMT32_SH8);
-#else
-    t1 = random->status[0]
-        + (random->status[2] >> TINYMT32_SH8);
-#endif
-    t0 ^= t1;
-    t0 ^= -((int32_t)(t1 & 1)) & random->tmat;
-    return t0;
-}
-
-/**
- * This function outputs floating point number from internal state.
- * Users should not call this function directly.
- * @param random tinymt internal status
- * @return floating point number r (1.0 <= r < 2.0)
- */
-inline static float tinymt32_temper_conv(tinymt32_t * random) {
-    uint32_t t0, t1;
-    union {
-        uint32_t u;
-        float f;
-    } conv;
-
-    t0 = random->status[3];
-#if defined(LINEARITY_CHECK)
-    t1 = random->status[0]
-        ^ (random->status[2] >> TINYMT32_SH8);
-#else
-    t1 = random->status[0]
-        + (random->status[2] >> TINYMT32_SH8);
-#endif
-    t0 ^= t1;
-    conv.u = ((t0 ^ (-((int32_t)(t1 & 1)) & random->tmat)) >> 9)
-              | UINT32_C(0x3f800000);
-    return conv.f;
-}
-
-/**
- * This function outputs floating point number from internal state.
- * Users should not call this function directly.
- * @param random tinymt internal status
- * @return floating point number r (1.0 < r < 2.0)
- */
-inline static float tinymt32_temper_conv_open(tinymt32_t * random) {
-    uint32_t t0, t1;
-    union {
-        uint32_t u;
-        float f;
-    } conv;
-
-    t0 = random->status[3];
-#if defined(LINEARITY_CHECK)
-    t1 = random->status[0]
-        ^ (random->status[2] >> TINYMT32_SH8);
-#else
-    t1 = random->status[0]
-        + (random->status[2] >> TINYMT32_SH8);
-#endif
-    t0 ^= t1;
-    conv.u = ((t0 ^ (-((int32_t)(t1 & 1)) & random->tmat)) >> 9)
-              | UINT32_C(0x3f800001);
-    return conv.f;
-}
-
-/**
- * This function outputs 32-bit unsigned integer from internal state.
- * @param random tinymt internal status
- * @return 32-bit unsigned integer r (0 <= r < 2^32)
- */
-inline static uint32_t tinymt32_generate_uint32(tinymt32_t * random) {
-    tinymt32_next_state(random);
-    return tinymt32_temper(random);
-}
-
-/**
- * This function outputs floating point number from internal state.
- * This function is implemented using multiplying by (1 / 2^24).
- * floating point multiplication is faster than using union trick in
- * my Intel CPU.
- * @param random tinymt internal status
- * @return floating point number r (0.0 <= r < 1.0)
- */
-inline static float tinymt32_generate_float(tinymt32_t * random) {
-    tinymt32_next_state(random);
-    return (tinymt32_temper(random) >> 8) * TINYMT32_MUL;
-}
-
-/**
- * This function outputs floating point number from internal state.
- * This function is implemented using union trick.
- * @param random tinymt internal status
- * @return floating point number r (1.0 <= r < 2.0)
- */
-inline static float tinymt32_generate_float12(tinymt32_t * random) {
-    tinymt32_next_state(random);
-    return tinymt32_temper_conv(random);
-}
-
-/**
- * This function outputs floating point number from internal state.
- * This function is implemented using union trick.
- * @param random tinymt internal status
- * @return floating point number r (0.0 <= r < 1.0)
- */
-inline static float tinymt32_generate_float01(tinymt32_t * random) {
-    tinymt32_next_state(random);
-    return tinymt32_temper_conv(random) - 1.0f;
-}
-
-/**
- * This function outputs floating point number from internal state.
- * This function may return 1.0 and never returns 0.0.
- *
- * NOTE(review): the state is advanced twice per call, once here and once
- * inside tinymt32_generate_float(), so one intermediate state is skipped.
- * Removing the extra step would change the generated sequence, so it is
- * only documented here.
- * @param random tinymt internal status
- * @return floating point number r (0.0 < r <= 1.0)
- */
-inline static float tinymt32_generate_floatOC(tinymt32_t * random) {
-    tinymt32_next_state(random);
-    return 1.0f - tinymt32_generate_float(random);
-}
-
-/**
- * This function outputs floating point number from internal state.
- * This function returns neither 0.0 nor 1.0.
- * @param random tinymt internal status
- * @return floating point number r (0.0 < r < 1.0)
- */
-inline static float tinymt32_generate_floatOO(tinymt32_t * random) {
-    tinymt32_next_state(random);
-    return tinymt32_temper_conv_open(random) - 1.0f;
-}
-
-/**
- * This function outputs double precision floating point number from
- * internal state. The returned value has 32-bit precision.
- * In other words, this function makes one double precision floating point
- * number from one 32-bit unsigned integer.
- * @param random tinymt internal status
- * @return floating point number r (0.0 <= r < 1.0)
- */
-inline static double tinymt32_generate_32double(tinymt32_t * random) {
-    tinymt32_next_state(random);
-    return tinymt32_temper(random) * (1.0 / 4294967296.0);
-}
-
-#if defined(__cplusplus)
-}
-#endif
-
-#define MIN_LOOP 8
-#define PRE_LOOP 8
-
-/**
- * This function represents a function used in the initialization
- * by init_by_array. It computes (x ^ (x >> 27)) * 1664525, truncated
- * to 32 bits.
- *
- * Declared inline like the other helpers of this header, so that
- * translation units which never call it do not get an unused static function.
- * @param x 32-bit integer
- * @return 32-bit integer
- */
-inline static uint32_t ini_func1(uint32_t x) {
-    return (x ^ (x >> 27)) * UINT32_C(1664525);
-}
-
-/**
- * This function represents a function used in the initialization
- * by init_by_array. It computes (x ^ (x >> 27)) * 1566083941, truncated
- * to 32 bits.
- *
- * Declared inline like the other helpers of this header, so that
- * translation units which never call it do not get an unused static function.
- * @param x 32-bit integer
- * @return 32-bit integer
- */
-inline static uint32_t ini_func2(uint32_t x) {
-    return (x ^ (x >> 27)) * UINT32_C(1566083941);
-}
-
-/**
- * This function certifies the period of 2^127-1.
- * If the lower 31 bits of status[0] and all bits of status[1..3] are zero,
- * the four status words are overwritten with the character codes of
- * 'T', 'I', 'N', 'Y', so the generator never continues from that state.
- *
- * Declared inline like the other helpers of this header, so that
- * translation units which never call it do not get an unused static function.
- * @param random tinymt state vector.
- */
-inline static void period_certification(tinymt32_t * random) {
-    if ((random->status[0] & TINYMT32_MASK) == 0 &&
-        random->status[1] == 0 &&
-        random->status[2] == 0 &&
-        random->status[3] == 0) {
-        random->status[0] = 'T';
-        random->status[1] = 'I';
-        random->status[2] = 'N';
-        random->status[3] = 'Y';
-    }
-}
-
-/**
- * Seeds the generator from a single 32-bit unsigned integer.
- * The status words start as (seed, mat1, mat2, tmat), are mixed
- * MIN_LOOP - 1 times with their predecessor word, checked by
- * period_certification() and finally advanced PRE_LOOP times.
- * The parameters mat1, mat2 and tmat must be set before calling.
- * @param random tinymt state vector.
- * @param seed a 32-bit unsigned integer used as a seed.
- */
-inline void tinymt32_init(tinymt32_t * random, uint32_t seed) {
-    uint32_t * const st = random->status;
-
-    st[0] = seed;
-    st[1] = random->mat1;
-    st[2] = random->mat2;
-    st[3] = random->tmat;
-
-    for (uint32_t k = 1; k < MIN_LOOP; ++k) {
-        /* the predecessor word is a different slot than the one being updated */
-        uint32_t const prev = st[(k - 1) & 3];
-        st[k & 3] ^= k + UINT32_C(1812433253) * (prev ^ (prev >> 30));
-    }
-
-    period_certification(random);
-
-    for (int step = 0; step < PRE_LOOP; ++step) {
-        tinymt32_next_state(random);
-    }
-}
-
-/**
- * This function initializes the internal state array,
- * with an array of 32-bit unsigned integers used as seeds
- * @param random tinymt state vector.
- * @param init_key the array of 32-bit integers, used as a seed.
- * @param key_length the length of init_key.
- */
-inline void tinymt32_init_by_array(tinymt32_t * random, uint32_t init_key[],
-			    int key_length) {
-    const int lag = 1;
-    const int mid = 1;
-    const int size = 4;
-    uint32_t i;
-    int j;
-    int count;
-    uint32_t r;
-    uint32_t * st = &random->status[0];
-
-    st[0] = 0;
-    st[1] = random->mat1;
-    st[2] = random->mat2;
-    st[3] = random->tmat;
-    if (key_length + 1 > MIN_LOOP) {
-	count = key_length + 1;
-    } else {
-	count = MIN_LOOP;
-    }
-    r = ini_func1(st[0] ^ st[mid % size]
-		  ^ st[(size - 1) % size]);
-    st[mid % size] += r;
-    r += uint32_t(key_length);
-    st[(mid + lag) % size] += r;
-    st[0] = r;
-    count--;
-    for (i = 1, j = 0; (j < count) && (j < key_length); j++) {
-	r = ini_func1(st[i % size]
-		      ^ st[(i + mid) % size]
-		      ^ st[(i + size - 1) % size]);
-	st[(i + mid) % size] += r;
-	r += init_key[j] + i;
-	st[(i + mid + lag) % size] += r;
-	st[i % size] = r;
-	i = (i + 1) % size;
-    }
-    for (; j < count; j++) {
-	r = ini_func1(st[i % size]
-		      ^ st[(i + mid) % size]
-		      ^ st[(i + size - 1) % size]);
-	st[(i + mid) % size] += r;
-	r += i;
-	st[(i + mid + lag) % size] += r;
-	st[i % size] = r;
-	i = (i + 1) % size;
-    }
-    for (j = 0; j < size; j++) {
-	r = ini_func2(st[i % size]
-		      + st[(i + mid) % size]
-		      + st[(i + size - 1) % size]);
-	st[(i + mid) % size] ^= r;
-	r -= i;
-	st[(i + mid + lag) % size] ^= r;
-	st[i % size] = r;
-	i = (i + 1) % size;
-    }
-    period_certification(random);
-    for (i = 0; i < PRE_LOOP; i++) {
-	tinymt32_next_state(random);
-    }
-}
-
-#undef MIN_LOOP
-#undef PRE_LOOP
-
-#if BOOST_COMP_CLANG
-#   pragma clang diagnostic pop
-#endif
-#if BOOST_COMP_GNUC
-#   pragma GCC diagnostic pop
-#endif
-#if BOOST_COMP_MSVC
-#   pragma warning(pop)
-#endif
-
-#endif
diff --git a/thirdParty/alpaka/include/alpaka/rand/Traits.hpp b/thirdParty/alpaka/include/alpaka/rand/Traits.hpp
deleted file mode 100644
index 579509377c..0000000000
--- a/thirdParty/alpaka/include/alpaka/rand/Traits.hpp
+++ /dev/null
@@ -1,194 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#include <alpaka/core/Common.hpp>
-#include <alpaka/core/Concepts.hpp>
-
-#include <boost/config.hpp>
-
-#include <type_traits>
-
-namespace alpaka
-{
-    //-----------------------------------------------------------------------------
-    //! The random number generation specifics.
-    namespace rand
-    {
-        struct ConceptRand;
-
-        //-----------------------------------------------------------------------------
-        //! The random number generator distribution specifics.
-        namespace distribution
-        {
-            //-----------------------------------------------------------------------------
-            //! The random number generator distribution traits.
-            namespace traits
-            {
-                //#############################################################################
-                //! The random number float normal distribution get trait.
-                template<
-                    typename TRand,
-                    typename T,
-                    typename TSfinae = void>
-                struct CreateNormalReal;
-
-                //#############################################################################
-                //! The random number float uniform distribution get trait.
-                template<
-                    typename TRand,
-                    typename T,
-                    typename TSfinae = void>
-                struct CreateUniformReal;
-
-                //#############################################################################
-                //! The random number integer uniform distribution get trait.
-                template<
-                    typename TRand,
-                    typename T,
-                    typename TSfinae = void>
-                struct CreateUniformUint;
-            }
-
-            //-----------------------------------------------------------------------------
-            //! \return A normal float distribution with mean 0.0f and standard deviation 1.0f.
-            ALPAKA_NO_HOST_ACC_WARNING
-            template<
-                typename T,
-                typename TRand>
-            ALPAKA_FN_HOST_ACC auto createNormalReal(
-                TRand const & rand)
-#ifdef BOOST_NO_CXX14_RETURN_TYPE_DEDUCTION
-            -> decltype(
-                traits::CreateNormalReal<
-                    concepts::ImplementationBase<ConceptRand, TRand>,
-                    T>
-                ::createNormalReal(
-                    rand))
-#endif
-            {
-                static_assert(
-                    std::is_floating_point<T>::value,
-                    "The value type T has to be a floating point type!");
-
-                using ImplementationBase = concepts::ImplementationBase<ConceptRand, TRand>;
-                return
-                    traits::CreateNormalReal<
-                        ImplementationBase,
-                        T>
-                    ::createNormalReal(
-                        rand);
-            }
-            //-----------------------------------------------------------------------------
-            //! \return A uniform floating point distribution [0.0, 1.0).
-            ALPAKA_NO_HOST_ACC_WARNING
-            template<
-                typename T,
-                typename TRand>
-            ALPAKA_FN_HOST_ACC auto createUniformReal(
-                TRand const & rand)
-#ifdef BOOST_NO_CXX14_RETURN_TYPE_DEDUCTION
-            -> decltype(
-                traits::CreateUniformReal<
-                    concepts::ImplementationBase<ConceptRand, TRand>,
-                    T>
-                ::createUniformReal(
-                    rand))
-#endif
-            {
-                static_assert(
-                    std::is_floating_point<T>::value,
-                    "The value type T has to be a floating point type!");
-
-                using ImplementationBase = concepts::ImplementationBase<ConceptRand, TRand>;
-                return
-                    traits::CreateUniformReal<
-                        ImplementationBase,
-                        T>
-                    ::createUniformReal(
-                        rand);
-            }
-            //-----------------------------------------------------------------------------
-            //! \return A uniform integer distribution [0, UINT_MAX].
-            ALPAKA_NO_HOST_ACC_WARNING
-            template<
-                typename T,
-                typename TRand>
-            ALPAKA_FN_HOST_ACC auto createUniformUint(
-                TRand const & rand)
-#ifdef BOOST_NO_CXX14_RETURN_TYPE_DEDUCTION
-            -> decltype(
-                traits::CreateUniformUint<
-                    concepts::ImplementationBase<ConceptRand, TRand>,
-                    T>
-                ::createUniformUint(
-                    rand))
-#endif
-            {
-                static_assert(
-                    std::is_integral<T>::value && std::is_unsigned<T>::value,
-                    "The value type T has to be a unsigned integral type!");
-
-                using ImplementationBase = concepts::ImplementationBase<ConceptRand, TRand>;
-                return
-                    traits::CreateUniformUint<
-                        ImplementationBase,
-                        T>
-                    ::createUniformUint(
-                        rand);
-            }
-        }
-
-        //-----------------------------------------------------------------------------
-        //! The random number generator specifics.
-        namespace generator
-        {
-            //-----------------------------------------------------------------------------
-            //! The random number generator traits.
-            namespace traits
-            {
-                //#############################################################################
-                //! The random number default generator get trait.
-                template<
-                    typename TRand,
-                    typename TSfinae = void>
-                struct CreateDefault;
-            }
-            //-----------------------------------------------------------------------------
-            //! \return A default random number generator.
-            ALPAKA_NO_HOST_ACC_WARNING
-            template<
-                typename TRand>
-            ALPAKA_FN_HOST_ACC auto createDefault(
-                TRand const & rand,
-                std::uint32_t const & seed,
-                std::uint32_t const & subsequence)
-#ifdef BOOST_NO_CXX14_RETURN_TYPE_DEDUCTION
-            -> decltype(
-                traits::CreateDefault<
-                    concepts::ImplementationBase<ConceptRand, TRand>>
-                ::createDefault(
-                    rand,
-                    seed,
-                    subsequence))
-#endif
-            {
-                using ImplementationBase = concepts::ImplementationBase<ConceptRand, TRand>;
-                return
-                    traits::CreateDefault<
-                        ImplementationBase>
-                    ::createDefault(
-                        rand,
-                        seed,
-                        subsequence);
-            }
-        }
-    }
-}
diff --git a/thirdParty/alpaka/include/alpaka/standalone/CpuFibers.hpp b/thirdParty/alpaka/include/alpaka/standalone/CpuFibers.hpp
deleted file mode 100644
index 2a8180f98e..0000000000
--- a/thirdParty/alpaka/include/alpaka/standalone/CpuFibers.hpp
+++ /dev/null
@@ -1,14 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifndef ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLED
-    #define ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLED
-#endif
diff --git a/thirdParty/alpaka/include/alpaka/standalone/CpuOmp2Blocks.hpp b/thirdParty/alpaka/include/alpaka/standalone/CpuOmp2Blocks.hpp
deleted file mode 100644
index 1cecf21bf9..0000000000
--- a/thirdParty/alpaka/include/alpaka/standalone/CpuOmp2Blocks.hpp
+++ /dev/null
@@ -1,14 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifndef ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLED
-    #define ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLED
-#endif
diff --git a/thirdParty/alpaka/include/alpaka/standalone/CpuOmp2Threads.hpp b/thirdParty/alpaka/include/alpaka/standalone/CpuOmp2Threads.hpp
deleted file mode 100644
index c3cf763e37..0000000000
--- a/thirdParty/alpaka/include/alpaka/standalone/CpuOmp2Threads.hpp
+++ /dev/null
@@ -1,14 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifndef ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLED
-    #define ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLED
-#endif
diff --git a/thirdParty/alpaka/include/alpaka/standalone/CpuOmp4.hpp b/thirdParty/alpaka/include/alpaka/standalone/CpuOmp4.hpp
deleted file mode 100644
index f93665dc57..0000000000
--- a/thirdParty/alpaka/include/alpaka/standalone/CpuOmp4.hpp
+++ /dev/null
@@ -1,14 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifndef ALPAKA_ACC_CPU_BT_OMP4_ENABLED
-    #define ALPAKA_ACC_CPU_BT_OMP4_ENABLED
-#endif
diff --git a/thirdParty/alpaka/include/alpaka/standalone/CpuSerial.hpp b/thirdParty/alpaka/include/alpaka/standalone/CpuSerial.hpp
deleted file mode 100644
index 7a4ab7013e..0000000000
--- a/thirdParty/alpaka/include/alpaka/standalone/CpuSerial.hpp
+++ /dev/null
@@ -1,14 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifndef ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED
-    #define ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED
-#endif
diff --git a/thirdParty/alpaka/include/alpaka/standalone/CpuTbbBlocks.hpp b/thirdParty/alpaka/include/alpaka/standalone/CpuTbbBlocks.hpp
deleted file mode 100644
index 4c0f7ae0db..0000000000
--- a/thirdParty/alpaka/include/alpaka/standalone/CpuTbbBlocks.hpp
+++ /dev/null
@@ -1,14 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifndef ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED
-    #define ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED
-#endif
diff --git a/thirdParty/alpaka/include/alpaka/standalone/CpuThreads.hpp b/thirdParty/alpaka/include/alpaka/standalone/CpuThreads.hpp
deleted file mode 100644
index 791ef4b4cf..0000000000
--- a/thirdParty/alpaka/include/alpaka/standalone/CpuThreads.hpp
+++ /dev/null
@@ -1,14 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
- 
-#pragma once
-
-#ifndef ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED
-    #define ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED
-#endif
diff --git a/thirdParty/alpaka/include/alpaka/standalone/GpuCudaRt.hpp b/thirdParty/alpaka/include/alpaka/standalone/GpuCudaRt.hpp
deleted file mode 100644
index 2648e0c222..0000000000
--- a/thirdParty/alpaka/include/alpaka/standalone/GpuCudaRt.hpp
+++ /dev/null
@@ -1,14 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifndef ALPAKA_ACC_GPU_CUDA_ENABLED
-    #define ALPAKA_ACC_GPU_CUDA_ENABLED
-#endif
diff --git a/thirdParty/alpaka/include/alpaka/standalone/GpuHipRt.hpp b/thirdParty/alpaka/include/alpaka/standalone/GpuHipRt.hpp
deleted file mode 100644
index f322b14c89..0000000000
--- a/thirdParty/alpaka/include/alpaka/standalone/GpuHipRt.hpp
+++ /dev/null
@@ -1,14 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifndef ALPAKA_ACC_GPU_HIP_ENABLED
-    #define ALPAKA_ACC_GPU_HIP_ENABLED
-#endif
diff --git a/thirdParty/alpaka/include/alpaka/time/TimeCudaBuiltIn.hpp b/thirdParty/alpaka/include/alpaka/time/TimeCudaBuiltIn.hpp
deleted file mode 100644
index f4ddbcc1f7..0000000000
--- a/thirdParty/alpaka/include/alpaka/time/TimeCudaBuiltIn.hpp
+++ /dev/null
@@ -1,69 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_CUDA
-    #error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
-#endif
-
-#include <alpaka/time/Traits.hpp>
-
-namespace alpaka
-{
-    namespace time
-    {
-        //#############################################################################
-        //! The GPU CUDA accelerator time implementation.
-        class TimeCudaBuiltIn : public concepts::Implements<ConceptTime, TimeCudaBuiltIn>
-        {
-        public:
-            //-----------------------------------------------------------------------------
-            TimeCudaBuiltIn() = default;
-            //-----------------------------------------------------------------------------
-            __device__ TimeCudaBuiltIn(TimeCudaBuiltIn const &) = delete;
-            //-----------------------------------------------------------------------------
-            __device__ TimeCudaBuiltIn(TimeCudaBuiltIn &&) = delete;
-            //-----------------------------------------------------------------------------
-            __device__ auto operator=(TimeCudaBuiltIn const &) -> TimeCudaBuiltIn & = delete;
-            //-----------------------------------------------------------------------------
-            __device__ auto operator=(TimeCudaBuiltIn &&) -> TimeCudaBuiltIn & = delete;
-            //-----------------------------------------------------------------------------
-            /*virtual*/ ~TimeCudaBuiltIn() = default;
-        };
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The CUDA built-in clock operation.
-            template<>
-            struct Clock<
-                time::TimeCudaBuiltIn>
-            {
-                //-----------------------------------------------------------------------------
-                __device__ static auto clock(
-                    time::TimeCudaBuiltIn const &)
-                -> std::uint64_t
-                {
-                    // This can be converted to a wall-clock time in seconds by dividing through the shader clock rate given by cudaDeviceProp::clockRate.
-                    // This clock rate is double the main clock rate on Fermi and older cards. 
-                    return
-                        static_cast<std::uint64_t>(
-                            clock64());
-                }
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/alpaka/include/alpaka/time/TimeHipBuiltIn.hpp b/thirdParty/alpaka/include/alpaka/time/TimeHipBuiltIn.hpp
deleted file mode 100644
index 1c1d314933..0000000000
--- a/thirdParty/alpaka/include/alpaka/time/TimeHipBuiltIn.hpp
+++ /dev/null
@@ -1,76 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz, Matthias Werner
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_HIP
-    #error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
-#endif
-
-#include <alpaka/time/Traits.hpp>
-
-namespace alpaka
-{
-    namespace time
-    {
-        //#############################################################################
-        //! The GPU HIP accelerator time implementation.
-        class TimeHipBuiltIn : public concepts::Implements<ConceptTime, TimeHipBuiltIn>
-        {
-        public:
-            //-----------------------------------------------------------------------------
-            //! Default constructor.
-            ALPAKA_FN_HOST_ACC TimeHipBuiltIn() = default;
-            //-----------------------------------------------------------------------------
-            //! Copy constructor.
-            __device__ TimeHipBuiltIn(TimeHipBuiltIn const &) = delete;
-            //-----------------------------------------------------------------------------
-            //! Move constructor.
-            __device__ TimeHipBuiltIn(TimeHipBuiltIn &&) = delete;
-            //-----------------------------------------------------------------------------
-            //! Copy assignment operator.
-            __device__ auto operator=(TimeHipBuiltIn const &) -> TimeHipBuiltIn & = delete;
-            //-----------------------------------------------------------------------------
-            //! Move assignment operator.
-            __device__ auto operator=(TimeHipBuiltIn &&) -> TimeHipBuiltIn & = delete;
-            //-----------------------------------------------------------------------------
-            //! Destructor.
-            /*virtual*/ ALPAKA_FN_HOST_ACC ~TimeHipBuiltIn() = default;
-        };
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The HIP built-in clock operation.
-            template<>
-            struct Clock<
-                time::TimeHipBuiltIn>
-            {
-                //-----------------------------------------------------------------------------
-
-                __device__ static auto clock(
-                    time::TimeHipBuiltIn const &)
-                -> std::uint64_t
-                {
-                    // This can be converted to a wall-clock time in seconds by dividing through the shader clock rate given by hipDeviceProp::clockRate.
-                    // This clock rate is double the main clock rate on Fermi and older cards.
-                    return
-                        static_cast<std::uint64_t>(
-                            clock64());
-                }
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/alpaka/include/alpaka/time/TimeOmp.hpp b/thirdParty/alpaka/include/alpaka/time/TimeOmp.hpp
deleted file mode 100644
index a4bad6edfe..0000000000
--- a/thirdParty/alpaka/include/alpaka/time/TimeOmp.hpp
+++ /dev/null
@@ -1,70 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef _OPENMP
-
-#include <alpaka/time/Traits.hpp>
-
-#include <alpaka/core/Common.hpp>
-#include <alpaka/core/Unused.hpp>
-
-#include <omp.h>
-
-namespace alpaka
-{
-    namespace time
-    {
-        //#############################################################################
-        //! The OpenMP accelerator time implementation.
-        class TimeOmp : public concepts::Implements<ConceptTime, TimeOmp>
-        {
-        public:
-            //-----------------------------------------------------------------------------
-            TimeOmp() = default;
-            //-----------------------------------------------------------------------------
-            ALPAKA_FN_HOST TimeOmp(TimeOmp const &) = delete;
-            //-----------------------------------------------------------------------------
-            ALPAKA_FN_HOST TimeOmp(TimeOmp &&) = delete;
-            //-----------------------------------------------------------------------------
-            ALPAKA_FN_HOST auto operator=(TimeOmp const &) -> TimeOmp & = delete;
-            //-----------------------------------------------------------------------------
-            ALPAKA_FN_HOST auto operator=(TimeOmp &&) -> TimeOmp & = delete;
-            //-----------------------------------------------------------------------------
-            /*virtual*/ ~TimeOmp() = default;
-        };
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The OpenMP accelerator clock operation.
-            template<>
-            struct Clock<
-                time::TimeOmp>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto clock(
-                    time::TimeOmp const & time)
-                -> std::uint64_t
-                {
-                    alpaka::ignore_unused(time);
-                    // NOTE: We compute the number of clock ticks by dividing the following durations:
-                    // - omp_get_wtime returns the elapsed wall clock time in seconds.
-                    // - omp_get_wtick gets the timer precision, i.e., the number of seconds between two successive clock ticks. 
-                    return
-                        static_cast<std::uint64_t>(
-                            omp_get_wtime() / omp_get_wtick());
-                }
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/alpaka/include/alpaka/time/TimeStdLib.hpp b/thirdParty/alpaka/include/alpaka/time/TimeStdLib.hpp
deleted file mode 100644
index 4cacf5584d..0000000000
--- a/thirdParty/alpaka/include/alpaka/time/TimeStdLib.hpp
+++ /dev/null
@@ -1,68 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Matthias Werner
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#include <alpaka/time/Traits.hpp>
-
-#include <alpaka/core/Common.hpp>
-#include <alpaka/core/Unused.hpp>
-
-#include <chrono>
-
-namespace alpaka
-{
-    namespace time
-    {
-        //#############################################################################
-        //! The CPU fibers accelerator time implementation.
-        class TimeStdLib : public concepts::Implements<ConceptTime, TimeStdLib>
-        {
-        public:
-            //-----------------------------------------------------------------------------
-            TimeStdLib() = default;
-            //-----------------------------------------------------------------------------
-            TimeStdLib(TimeStdLib const &) = delete;
-            //-----------------------------------------------------------------------------
-            TimeStdLib(TimeStdLib &&) = delete;
-            //-----------------------------------------------------------------------------
-            auto operator=(TimeStdLib const &) -> TimeStdLib & = delete;
-            //-----------------------------------------------------------------------------
-            auto operator=(TimeStdLib &&) -> TimeStdLib & = delete;
-            //-----------------------------------------------------------------------------
-            /*virtual*/ ~TimeStdLib() = default;
-        };
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The CPU fibers accelerator clock operation.
-            template<>
-            struct Clock<
-                TimeStdLib>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto clock(
-                    time::TimeStdLib const & time)
-                -> std::uint64_t
-                {
-                    alpaka::ignore_unused(time);
-
-                    // NOTE: high_resolution_clock returns a non-steady wall-clock time!
-                    // This means that it is not ensured that the values will always increase monotonically.
-                    return
-                        static_cast<std::uint64_t>(
-                            std::chrono::high_resolution_clock::now()
-                                .time_since_epoch()
-                                    .count());
-                }
-            };
-        }
-    }
-}
diff --git a/thirdParty/alpaka/include/alpaka/time/Traits.hpp b/thirdParty/alpaka/include/alpaka/time/Traits.hpp
deleted file mode 100644
index 97f408453c..0000000000
--- a/thirdParty/alpaka/include/alpaka/time/Traits.hpp
+++ /dev/null
@@ -1,57 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#include <alpaka/core/Common.hpp>
-#include <alpaka/core/Concepts.hpp>
-
-#include <type_traits>
-
-namespace alpaka
-{
-    //-----------------------------------------------------------------------------
-    //! The time traits specifics.
-    namespace time
-    {
-        struct ConceptTime;
-
-        //-----------------------------------------------------------------------------
-        //! The time traits.
-        namespace traits
-        {
-            //#############################################################################
-            //! The clock trait.
-            template<
-                typename TTime,
-                typename TSfinae = void>
-            struct Clock;
-        }
-
-        //-----------------------------------------------------------------------------
-        //! \return A counter that is increasing every clock cycle.
-        //!
-        //! \tparam TTime The time implementation type.
-        //! \param time The time implementation.
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            typename TTime>
-        ALPAKA_FN_HOST_ACC auto clock(
-            TTime const & time)
-        -> std::uint64_t
-        {
-            using ImplementationBase = concepts::ImplementationBase<ConceptTime, TTime>;
-            return
-                traits::Clock<
-                    ImplementationBase>
-                ::clock(
-                    time);
-        }
-    }
-}
diff --git a/thirdParty/alpaka/include/alpaka/vec/Traits.hpp b/thirdParty/alpaka/include/alpaka/vec/Traits.hpp
deleted file mode 100644
index 5f4b0890ed..0000000000
--- a/thirdParty/alpaka/include/alpaka/vec/Traits.hpp
+++ /dev/null
@@ -1,236 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-
-#pragma once
-
-#include <alpaka/dim/Traits.hpp>
-#include <alpaka/extent/Traits.hpp>
-#include <alpaka/offset/Traits.hpp>
-#include <alpaka/idx/Traits.hpp>
-
-#include <boost/config.hpp>
-
-namespace alpaka
-{
-    //-----------------------------------------------------------------------------
-    //! The vec specifics.
-    namespace vec
-    {
-        //-----------------------------------------------------------------------------
-        //! The vec traits.
-        //!
-        //! All traits are only declared here. The free functions of the vec namespace
-        //! forward to a static member function of the matching specialization.
-        namespace traits
-        {
-            //#############################################################################
-            //! Trait for selecting a sub-vector.
-            //! A specialization has to provide a static subVecFromIndices(vec).
-            template<
-                typename TVec,
-                typename TIndexSequence,
-                typename TSfinae = void>
-            struct SubVecFromIndices;
-
-            //#############################################################################
-            //! Trait for casting a vector.
-            //! A specialization has to provide a static cast(vec).
-            template<
-                typename TVal,
-                typename TVec,
-                typename TSfinae = void>
-            struct Cast;
-
-            //#############################################################################
-            //! Trait for reversing a vector.
-            //! A specialization has to provide a static reverse(vec).
-            template<
-                typename TVec,
-                typename TSfinae = void>
-            struct Reverse;
-
-            //#############################################################################
-            //! Trait for concatenating two vectors.
-            //! A specialization has to provide a static concat(vecL, vecR).
-            template<
-                typename TVecL,
-                typename TVecR,
-                typename TSfinae = void>
-            struct Concat;
-        }
-
-        //-----------------------------------------------------------------------------
-        //! Creates a vector from selected elements of the source vector.
-        //! The elements are taken in the order given by the index sequence;
-        //! repeating and swizzling elements is allowed.
-        //!
-        //! \tparam TIndexSequence The indices of the elements to select.
-        //! \tparam TVec The source vector type, has to specialize traits::SubVecFromIndices.
-        //! \param vec The source vector.
-        //! \return The sub-vector consisting of the elements specified by the indices.
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<typename TIndexSequence, typename TVec>
-        ALPAKA_FN_HOST_ACC auto subVecFromIndices(TVec const & vec)
-#ifdef BOOST_NO_CXX14_RETURN_TYPE_DEDUCTION
-            -> decltype(traits::SubVecFromIndices<TVec, TIndexSequence>::subVecFromIndices(vec))
-#endif
-        {
-            using SubVecTrait = traits::SubVecFromIndices<TVec, TIndexSequence>;
-            return SubVecTrait::subVecFromIndices(vec);
-        }
-        //-----------------------------------------------------------------------------
-        //! Selects the leading elements of a vector.
-        //!
-        //! \tparam TSubDim The dimensionality (number of elements) of the sub-vector.
-        //! \tparam TVec has to specialize SubVecFromIndices.
-        //! \param vec The source vector.
-        //! \return The sub-vector consisting of the first N elements of the source vector.
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            typename TSubDim,
-            typename TVec>
-        ALPAKA_FN_HOST_ACC auto subVecBegin(
-            TVec const & vec)
-#ifdef BOOST_NO_CXX14_RETURN_TYPE_DEDUCTION
-        -> decltype(
-            subVecFromIndices<
-                meta::MakeIntegerSequence<
-                    std::size_t,
-                    TSubDim::value
-                >
-            >(
-                vec))
-#endif
-        {
-            // The sub-vector can not have more elements than the source vector.
-            static_assert(
-                TSubDim::value <= dim::Dim<TVec>::value,
-                "The sub-Vec has to be smaller (or same size) than the original Vec.");
-
-            //! A sequence of integers from 0 to TSubDim-1.
-            using IdxSubSequence =
-                meta::MakeIntegerSequence<
-                    std::size_t,
-                    TSubDim::value>;
-            return
-                subVecFromIndices<
-                    IdxSubSequence>(
-                        vec);
-        }
-        //-----------------------------------------------------------------------------
-        //! Selects the trailing elements of a vector.
-        //!
-        //! \tparam TSubDim The dimensionality (number of elements) of the sub-vector.
-        //! \tparam TVec has to specialize SubVecFromIndices.
-        //! \param vec The source vector.
-        //! \return The sub-vector consisting of the last N elements of the source vector.
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            typename TSubDim,
-            typename TVec>
-        ALPAKA_FN_HOST_ACC auto subVecEnd(
-            TVec const & vec)
-#ifdef BOOST_NO_CXX14_RETURN_TYPE_DEDUCTION
-        -> decltype(
-            subVecFromIndices<
-                meta::MakeIntegerSequenceOffset<
-                    std::size_t,
-                    dim::Dim<TVec>::value - TSubDim::value,
-                    TSubDim::value
-                >
-            >(
-                vec))
-#endif
-        {
-            // The sub-vector can not have more elements than the source vector.
-            // This also keeps the unsigned subtraction below from wrapping around.
-            static_assert(
-                TSubDim::value <= dim::Dim<TVec>::value,
-                "The sub-Vec has to be smaller (or same size) than the original Vec.");
-
-            // Index of the first element that is part of the sub-vector.
-            constexpr std::size_t idxOffset = dim::Dim<TVec>::value - TSubDim::value;
-
-            //! A sequence of TSubDim integers starting at idxOffset, i.e. from idxOffset to dim-1.
-            using IdxSubSequence =
-                meta::MakeIntegerSequenceOffset<
-                    std::size_t,
-                    idxOffset,
-                    TSubDim::value>;
-            return
-                subVecFromIndices<
-                    IdxSubSequence>(
-                        vec);
-        }
-
-        //-----------------------------------------------------------------------------
-        //! Casts a vector by forwarding to traits::Cast<TVal, TVec>::cast.
-        //!
-        //! \tparam TVal The first parameter of traits::Cast, presumably the value type to cast to.
-        //! \tparam TVec The source vector type.
-        //! \param vec The source vector.
-        //! \return The casted vector.
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<typename TVal, typename TVec>
-        ALPAKA_FN_HOST_ACC auto cast(TVec const & vec)
-#ifdef BOOST_NO_CXX14_RETURN_TYPE_DEDUCTION
-            -> decltype(traits::Cast<TVal, TVec>::cast(vec))
-#endif
-        {
-            using CastTrait = traits::Cast<TVal, TVec>;
-            return CastTrait::cast(vec);
-        }
-
-        //-----------------------------------------------------------------------------
-        //! Reverses a vector by forwarding to traits::Reverse<TVec>::reverse.
-        //!
-        //! \tparam TVec The source vector type.
-        //! \param vec The source vector.
-        //! \return The reverse vector.
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<typename TVec>
-        ALPAKA_FN_HOST_ACC auto reverse(TVec const & vec)
-#ifdef BOOST_NO_CXX14_RETURN_TYPE_DEDUCTION
-            -> decltype(traits::Reverse<TVec>::reverse(vec))
-#endif
-        {
-            using ReverseTrait = traits::Reverse<TVec>;
-            return ReverseTrait::reverse(vec);
-        }
-
-        //-----------------------------------------------------------------------------
-        //! Concatenates two vectors by forwarding to traits::Concat<TVecL, TVecR>::concat.
-        //!
-        //! \tparam TVecL The type of the left vector.
-        //! \tparam TVecR The type of the right vector.
-        //! \param vecL The left vector.
-        //! \param vecR The right vector.
-        //! \return The concatenated vector.
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<typename TVecL, typename TVecR>
-        ALPAKA_FN_HOST_ACC auto concat(TVecL const & vecL, TVecR const & vecR)
-#ifdef BOOST_NO_CXX14_RETURN_TYPE_DEDUCTION
-            -> decltype(traits::Concat<TVecL, TVecR>::concat(vecL, vecR))
-#endif
-        {
-            using ConcatTrait = traits::Concat<TVecL, TVecR>;
-            return ConcatTrait::concat(vecL, vecR);
-        }
-    }
-}
diff --git a/thirdParty/alpaka/include/alpaka/vec/Vec.hpp b/thirdParty/alpaka/include/alpaka/vec/Vec.hpp
deleted file mode 100644
index 52fcb7238e..0000000000
--- a/thirdParty/alpaka/include/alpaka/vec/Vec.hpp
+++ /dev/null
@@ -1,1374 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Erik Zenker, Matthias Werner, René Widera
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#include <alpaka/vec/Traits.hpp>
-#include <alpaka/dim/Traits.hpp>
-#include <alpaka/dim/DimIntegralConst.hpp>
-#include <alpaka/extent/Traits.hpp>
-#include <alpaka/offset/Traits.hpp>
-#include <alpaka/idx/Traits.hpp>
-
-#include <alpaka/core/Align.hpp>
-#include <alpaka/core/Assert.hpp>
-#include <alpaka/core/BoostPredef.hpp>
-#include <alpaka/core/Common.hpp>
-#include <alpaka/core/Unused.hpp>
-#include <alpaka/meta/IntegerSequence.hpp>
-#include <alpaka/meta/Fold.hpp>
-
-#include <boost/config.hpp>
-
-#include <cstdint>
-#include <ostream>
-#include <type_traits>
-#include <algorithm>
-
-// Some compilers do not support the out of class versions of the vector creator functions.
-// For those the creators are defined as static members of Vec instead. This is selected for:
-// - BOOST_COMP_HCC, BOOST_COMP_HIP and BOOST_COMP_NVCC (the nvcc CUDA compiler, at least 8.0)
-// - BOOST_COMP_INTEL (the intel compiler)
-// - BOOST_COMP_CLANG_CUDA >= 4.0.0 and BOOST_COMP_GNUC >= 8.0.0
-#if BOOST_COMP_HCC || BOOST_COMP_HIP || BOOST_COMP_NVCC || BOOST_COMP_INTEL || (BOOST_COMP_CLANG_CUDA >= BOOST_VERSION_NUMBER(4, 0, 0)) || (BOOST_COMP_GNUC >= BOOST_VERSION_NUMBER(8, 0, 0))
-    #define ALPAKA_CREATE_VEC_IN_CLASS
-#endif
-
-namespace alpaka
-{
-    namespace vec
-    {
-        template<
-            typename TDim,
-            typename TVal>
-        class Vec;
-
-#ifndef ALPAKA_CREATE_VEC_IN_CLASS
-        //-----------------------------------------------------------------------------
-        //! Creator using func<idx>(args...) for every idx of the given index sequence to initialize the values of the vector.
-        //! The value type of the vector is the return type of TTFnObj<0>::create(args...).
-        //!
-        //! \tparam TDim The dimensionality of the vector to create.
-        //! \tparam TTFnObj The function object template; TTFnObj<idx>::create(args...) yields the value for idx.
-        //! \param indices Only used to deduce TIndices, the value itself is ignored.
-        //! \param args The arguments forwarded to every create call.
-        //!
-        //! NOTE(review): args are std::forward-ed once per index; presumably create() only reads them - confirm.
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            typename TDim,
-            template<std::size_t> class TTFnObj,
-            typename... TArgs,
-            typename TIdxSize,
-            TIdxSize... TIndices>
-        ALPAKA_FN_HOST_ACC auto createVecFromIndexedFnArbitrary(
-            meta::IntegerSequence<TIdxSize, TIndices...> const & indices,
-            TArgs && ... args)
-#ifdef BOOST_NO_CXX14_RETURN_TYPE_DEDUCTION
-        -> Vec<TDim, decltype(TTFnObj<0>::create(std::forward<TArgs>(args)...))>
-#endif
-        {
-            alpaka::ignore_unused(indices);
-
-            return Vec<TDim, decltype(TTFnObj<0>::create(std::forward<TArgs>(args)...))>(
-                (TTFnObj<TIndices>::create(std::forward<TArgs>(args)...))...);
-        }
-        //-----------------------------------------------------------------------------
-        //! Creator using func<idx>(args...) to initialize all values of the vector.
-        //! The idx is in the range [0, TDim).
-        //!
-        //! \tparam TDim The dimensionality of the vector to create.
-        //! \tparam TTFnObj The function object template; TTFnObj<idx>::create(args...) yields the value for idx.
-        //! \param args The arguments forwarded to the create calls.
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<typename TDim, template<std::size_t> class TTFnObj, typename... TArgs>
-        ALPAKA_FN_HOST_ACC auto createVecFromIndexedFn(TArgs && ... args)
-#ifdef BOOST_NO_CXX14_RETURN_TYPE_DEDUCTION
-            -> decltype(createVecFromIndexedFnArbitrary<TDim, TTFnObj>(
-                meta::MakeIntegerSequence<typename TDim::value_type, TDim::value>(),
-                std::forward<TArgs>(args)...))
-#endif
-        {
-            using AllIndices = meta::MakeIntegerSequence<typename TDim::value_type, TDim::value>;
-            return createVecFromIndexedFnArbitrary<TDim, TTFnObj>(AllIndices(), std::forward<TArgs>(args)...);
-        }
-        //-----------------------------------------------------------------------------
-        //! Creator using func<idx>(args...) to initialize all values of the vector.
-        //! The idx is in the range [TIdxOffset, TIdxOffset + TDim).
-        //!
-        //! \tparam TDim The dimensionality of the vector to create.
-        //! \tparam TTFnObj The function object template; TTFnObj<idx>::create(args...) yields the value for idx.
-        //! \tparam TIdxOffset The integral constant holding the first index.
-        //! \param args The arguments forwarded to the create calls.
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<typename TDim, template<std::size_t> class TTFnObj, typename TIdxOffset, typename... TArgs>
-        ALPAKA_FN_HOST_ACC auto createVecFromIndexedFnOffset(TArgs && ... args)
-#ifdef BOOST_NO_CXX14_RETURN_TYPE_DEDUCTION
-            -> decltype(createVecFromIndexedFnArbitrary<TDim, TTFnObj>(
-                meta::ConvertIntegerSequence<
-                    typename TIdxOffset::value_type,
-                    meta::MakeIntegerSequenceOffset<std::intmax_t, TIdxOffset::value, TDim::value>>(),
-                std::forward<TArgs>(args)...))
-#endif
-        {
-            // The sequence is generated with std::intmax_t first and converted to the offset value type afterwards.
-            using SignedIndices = meta::MakeIntegerSequenceOffset<std::intmax_t, TIdxOffset::value, TDim::value>;
-            using OffsetIndices = meta::ConvertIntegerSequence<typename TIdxOffset::value_type, SignedIndices>;
-            return createVecFromIndexedFnArbitrary<TDim, TTFnObj>(OffsetIndices(), std::forward<TArgs>(args)...);
-        }
-#endif
-
-        //#############################################################################
-        //! A n-dimensional vector.
-        template<
-            typename TDim,
-            typename TVal>
-        class Vec final
-        {
-        public:
-            static_assert(TDim::value >= 0u, "Invalid dimensionality");
-
-            using Dim = TDim;
-            static constexpr auto s_uiDim = TDim::value;
-            using Val = TVal;
-
-        private:
-            //! A sequence of integers from 0 to dim-1.
-            //! This can be used to write compile time indexing algorithms.
-            using IdxSequence = meta::MakeIntegerSequence<std::size_t, TDim::value>;
-
-        public:
-            //-----------------------------------------------------------------------------
-            //! Default constructor.
-            //! It is only available when the vector is zero-dimensional (enabled via TDim::value == 0u).
-            //! The first element of m_data is initialized with zero.
-            ALPAKA_NO_HOST_ACC_WARNING
-            template<
-                bool B = (TDim::value == 0u),
-                typename = typename std::enable_if<B>::type>
-            ALPAKA_FN_HOST_ACC Vec() :
-                m_data{static_cast<TVal>(0u)}
-            {}
-
-
-            //-----------------------------------------------------------------------------
-            //! Value constructor.
-            //! This constructor is only available if the number of parameters matches the vector dimensionality
-            //! and the decayed type of the first argument is TVal.
-            //!
-            //! \param arg0 The value of the first element.
-            //! \param args The values of the remaining elements.
-            ALPAKA_NO_HOST_ACC_WARNING
-            template<
-                typename TArg0,
-                typename... TArgs,
-                typename = typename std::enable_if<
-                    // There have to be dim arguments.
-                    (sizeof...(TArgs)+1 == TDim::value)
-                    &&
-                    // Only the type of the first argument is checked against the value type.
-                    (std::is_same<TVal, typename std::decay<TArg0>::type>::value)
-                    >::type>
-            ALPAKA_FN_HOST_ACC Vec(
-                TArg0 && arg0,
-                TArgs && ... args) :
-                    m_data{std::forward<TArg0>(arg0), std::forward<TArgs>(args)...}
-            {}
-
-#ifdef ALPAKA_CREATE_VEC_IN_CLASS
-            //-----------------------------------------------------------------------------
-            //! Creator using func<idx>(args...) for every idx of the given index sequence to initialize the values of the vector.
-            //!
-            //! \tparam TTFnObj The function object template; TTFnObj<idx>::create(args...) yields the value for idx.
-            //! \param indices Only used to deduce TIndices, the value itself is ignored.
-            //! \param args The arguments forwarded to every create call.
-            ALPAKA_NO_HOST_ACC_WARNING
-            template<
-                template<std::size_t> class TTFnObj,
-                typename... TArgs,
-                typename TIdxSize,
-                TIdxSize... TIndices>
-            ALPAKA_FN_HOST_ACC static auto createVecFromIndexedFnArbitrary(
-                meta::IntegerSequence<TIdxSize, TIndices...> const & indices,
-                TArgs && ... args)
-            -> Vec<TDim, TVal>
-            {
-                alpaka::ignore_unused(indices);
-
-                return Vec<TDim, TVal>(
-                    (TTFnObj<TIndices>::create(std::forward<TArgs>(args)...))...);
-            }
-            //-----------------------------------------------------------------------------
-            //! Creator using func<idx>(args...) to initialize all values of the vector.
-            //! The idx is in the range [0, TDim).
-            //!
-            //! \tparam TTFnObj The function object template; TTFnObj<idx>::create(args...) yields the value for idx.
-            //! \param args The arguments forwarded to the create calls.
-            ALPAKA_NO_HOST_ACC_WARNING
-            template<template<std::size_t> class TTFnObj, typename... TArgs>
-            ALPAKA_FN_HOST_ACC static auto createVecFromIndexedFn(TArgs && ... args) -> Vec<TDim, TVal>
-            {
-                return createVecFromIndexedFnArbitrary<TTFnObj>(IdxSequence(), std::forward<TArgs>(args)...);
-            }
-            //-----------------------------------------------------------------------------
-            //! Creator using func<idx>(args...) to initialize all values of the vector.
-            //! The idx is in the range [TIdxOffset, TIdxOffset + TDim).
-            //!
-            //! \tparam TTFnObj The function object template; TTFnObj<idx>::create(args...) yields the value for idx.
-            //! \tparam TIdxOffset The integral constant holding the first index.
-            //! \param args The arguments forwarded to the create calls.
-            ALPAKA_NO_HOST_ACC_WARNING
-            template<template<std::size_t> class TTFnObj, typename TIdxOffset, typename... TArgs>
-            ALPAKA_FN_HOST_ACC static auto createVecFromIndexedFnOffset(TArgs && ... args) -> Vec<TDim, TVal>
-            {
-                // The sequence is generated with std::intmax_t first and converted to the dim value type afterwards.
-                using SignedIndices = meta::MakeIntegerSequenceOffset<std::intmax_t, TIdxOffset::value, TDim::value>;
-                using OffsetIndices = meta::ConvertIntegerSequence<typename TDim::value_type, SignedIndices>;
-                return createVecFromIndexedFnArbitrary<TTFnObj>(OffsetIndices(), std::forward<TArgs>(args)...);
-            }
-#endif
-
-            //-----------------------------------------------------------------------------
-            //! Copy constructor (defaulted).
-            ALPAKA_NO_HOST_ACC_WARNING
-            ALPAKA_FN_HOST_ACC
-            Vec(Vec const &) = default;
-            //-----------------------------------------------------------------------------
-            //! Move constructor (defaulted).
-            ALPAKA_NO_HOST_ACC_WARNING
-            ALPAKA_FN_HOST_ACC
-            Vec(Vec &&) = default;
-            //-----------------------------------------------------------------------------
-            //! Copy assignment operator (defaulted).
-            ALPAKA_NO_HOST_ACC_WARNING
-            ALPAKA_FN_HOST_ACC
-            auto operator=(Vec const &) -> Vec & = default;
-            //-----------------------------------------------------------------------------
-            //! Move assignment operator (defaulted).
-            ALPAKA_NO_HOST_ACC_WARNING
-            ALPAKA_FN_HOST_ACC
-            auto operator=(Vec &&) -> Vec & = default;
-            //-----------------------------------------------------------------------------
-            //! Destructor (defaulted).
-            ALPAKA_NO_HOST_ACC_WARNING
-            ALPAKA_FN_HOST_ACC ~Vec() = default;
-
-        private:
-            //#############################################################################
-            //! A function object that returns the given value for each index.
-            //! The index Tidx is not used by create; it only makes the type usable as TTFnObj of the creator functions.
-            template<
-                std::size_t Tidx>
-            struct CreateSingleVal
-            {
-                //-----------------------------------------------------------------------------
-                //! \param val The value to return.
-                //! \return A copy of val.
-                ALPAKA_NO_HOST_ACC_WARNING
-                ALPAKA_FN_HOST_ACC static auto create(
-                    TVal const & val)
-                -> TVal
-                {
-                    return val;
-                }
-            };
-        public:
-            //-----------------------------------------------------------------------------
-            //! \brief Single value constructor.
-            //!
-            //! Creates a vector with all values set to val.
-            //! \param val The initial value.
-            //! \return The vector created via createVecFromIndexedFn with CreateSingleVal.
-            ALPAKA_NO_HOST_ACC_WARNING
-            ALPAKA_FN_HOST_ACC static auto all(
-                TVal const & val)
-            -> Vec<TDim, TVal>
-            {
-                return
-                    createVecFromIndexedFn<
-// The out of class version of the creator additionally requires the dimensionality.
-#ifndef ALPAKA_CREATE_VEC_IN_CLASS
-                        TDim,
-#endif
-                        CreateSingleVal>(
-                            val);
-            }
-            //-----------------------------------------------------------------------------
-            //! Zero value constructor.
-            //! \return A vector with all values set to static_cast<TVal>(0).
-            ALPAKA_NO_HOST_ACC_WARNING
-            ALPAKA_FN_HOST_ACC static auto zeros() -> Vec<TDim, TVal>
-            {
-                return Vec<TDim, TVal>::all(static_cast<TVal>(0));
-            }
-            //-----------------------------------------------------------------------------
-            //! One value constructor.
-            //! \return A vector with all values set to static_cast<TVal>(1).
-            ALPAKA_NO_HOST_ACC_WARNING
-            ALPAKA_FN_HOST_ACC static auto ones() -> Vec<TDim, TVal>
-            {
-                return Vec<TDim, TVal>::all(static_cast<TVal>(1));
-            }
-
-            //-----------------------------------------------------------------------------
-            //! Value reference accessor at the given non-negative integer index.
-            //! Only available for integral index types.
-            //! \param iIdx The index; it is checked by core::assertValueUnsigned and core::assertGreaterThan<TDim>.
-            //! \return A reference to the value at the given index.
-            ALPAKA_NO_HOST_ACC_WARNING
-            template<
-                typename TIdx,
-                typename = typename std::enable_if<
-                    std::is_integral<TIdx>::value>::type>
-            ALPAKA_FN_HOST_ACC auto operator[](
-                TIdx const iIdx)
-            -> TVal &
-            {
-                core::assertValueUnsigned(iIdx);
-                auto const idx(static_cast<typename TDim::value_type>(iIdx));
-                // presumably asserts TDim::value > idx, i.e. the index is in bounds - confirm against core/Assert.hpp
-                core::assertGreaterThan<TDim>(idx);
-                return m_data[idx];
-            }
-
-            //-----------------------------------------------------------------------------
-            //! Value accessor at the given non-negative integer index.
-            //! Only available for integral index types.
-            //! \param iIdx The index; it is checked by core::assertValueUnsigned and core::assertGreaterThan<TDim>.
-            //! \return A copy of the value at the given index.
-            ALPAKA_NO_HOST_ACC_WARNING
-            template<
-                typename TIdx,
-                typename = typename std::enable_if<
-                    std::is_integral<TIdx>::value>::type>
-            ALPAKA_FN_HOST_ACC auto operator[](
-                TIdx const iIdx) const
-            -> TVal
-            {
-                core::assertValueUnsigned(iIdx);
-                auto const idx(static_cast<typename TDim::value_type>(iIdx));
-                // presumably asserts TDim::value > idx, i.e. the index is in bounds - confirm against core/Assert.hpp
-                core::assertGreaterThan<TDim>(idx);
-                return m_data[idx];
-            }
-
-            //-----------------------------------------------------------------------------
-            //! Element-wise comparison of the first TDim::value elements.
-            //! \param rhs The vector to compare with.
-            //! \return false as soon as one pair of elements compares unequal, true otherwise.
-            ALPAKA_NO_HOST_ACC_WARNING
-            ALPAKA_FN_HOST_ACC auto operator==(Vec const & rhs) const -> bool
-            {
-                typename TDim::value_type i(0);
-                while(i < TDim::value)
-                {
-                    if((*this)[i] != rhs[i])
-                    {
-                        return false;
-                    }
-                    ++i;
-                }
-                return true;
-            }
-            //-----------------------------------------------------------------------------
-            //! \param rhs The vector to compare with.
-            //! \return The negation of operator==.
-            ALPAKA_NO_HOST_ACC_WARNING
-            ALPAKA_FN_HOST_ACC auto operator!=(Vec const & rhs) const -> bool
-            {
-                return !this->operator==(rhs);
-            }
-            //-----------------------------------------------------------------------------
-            //! Folds the elements at the given indices with meta::foldr.
-            //!
-            //! \param f The function object passed to meta::foldr.
-            //! \param indices Only used to deduce TIndices, the value itself is ignored.
-            //! \return The result of meta::foldr(f, (*this)[TIndices]...).
-            ALPAKA_NO_HOST_ACC_WARNING
-            template<
-                typename TFnObj,
-                std::size_t... TIndices>
-            ALPAKA_FN_HOST_ACC auto foldrByIndices(
-                TFnObj const & f,
-                meta::IntegerSequence<std::size_t, TIndices...> const & indices) const
-#ifdef BOOST_NO_CXX14_RETURN_TYPE_DEDUCTION
-            -> decltype(
-                meta::foldr(
-                    f,
-                    ((*this)[TIndices])...))
-#endif
-            {
-                alpaka::ignore_unused(indices);
-
-                return
-                    meta::foldr(
-                        f,
-                        ((*this)[TIndices])...);
-            }
-            //-----------------------------------------------------------------------------
-            //! Folds all elements of the vector by calling foldrByIndices with IdxSequence.
-            //! In the explicit return type some compilers (GCC < 5.0, intel, nvcc) get an explicit this-> for the member call.
-            //!
-            //! \param f The function object passed on to foldrByIndices.
-            //! \return The result of foldrByIndices(f, IdxSequence()).
-            ALPAKA_NO_HOST_ACC_WARNING
-            template<
-                typename TFnObj>
-            ALPAKA_FN_HOST_ACC auto foldrAll(
-                TFnObj const & f) const
-#ifdef BOOST_NO_CXX14_RETURN_TYPE_DEDUCTION
-            -> decltype(
-#if (BOOST_COMP_GNUC && (BOOST_COMP_GNUC < BOOST_VERSION_NUMBER(5, 0, 0))) || BOOST_COMP_INTEL || BOOST_COMP_NVCC
-                this->foldrByIndices(
-#else
-                foldrByIndices(
-#endif
-                    f,
-                    IdxSequence()))
-#endif
-            {
-                return
-                    foldrByIndices(
-                        f,
-                        IdxSequence());
-            }
-// Suppress a strange warning produced by nvcc+MSVC in release mode.
-#if BOOST_COMP_MSVC
-    #pragma warning(push)
-    #pragma warning(disable: 4702)  // unreachable code
-#endif
-            //-----------------------------------------------------------------------------
-            //! Folds all elements with a multiplication via foldrAll.
-            //! \return The product of all values.
-            ALPAKA_NO_HOST_ACC_WARNING
-            ALPAKA_FN_HOST_ACC auto prod() const
-            -> TVal
-            {
-                return foldrAll(
-                    [](TVal a, TVal b)
-                    {
-                        // The result of a * b is cast back to the value type.
-                        return static_cast<TVal>(a * b);
-                    });
-            }
-#if BOOST_COMP_MSVC
-    #pragma warning(pop)
-#endif
-            //-----------------------------------------------------------------------------
-            //! Folds all elements with an addition via foldrAll.
-            //! \return The sum of all values.
-            ALPAKA_NO_HOST_ACC_WARNING
-            ALPAKA_FN_HOST_ACC auto sum() const -> TVal
-            {
-                // The result of a + b is cast back to the value type.
-                auto const add = [](TVal a, TVal b) { return static_cast<TVal>(a + b); };
-                return foldrAll(add);
-            }
-            //-----------------------------------------------------------------------------
-            //! Folds all elements via foldrAll, keeping the smaller value of each pair.
-            //! \return The min of all values.
-            ALPAKA_NO_HOST_ACC_WARNING
-            ALPAKA_FN_HOST_ACC auto min() const -> TVal
-            {
-                return foldrAll(
-                    [](TVal a, TVal b) -> TVal
-                    {
-                        if(b < a)
-                        {
-                            return b;
-                        }
-                        return a;
-                    });
-            }
-            //-----------------------------------------------------------------------------
-            //! Folds all elements via foldrAll, keeping the larger value of each pair.
-            //! \return The max of all values.
-            ALPAKA_NO_HOST_ACC_WARNING
-            ALPAKA_FN_HOST_ACC auto max() const -> TVal
-            {
-                return foldrAll(
-                    [](TVal a, TVal b) -> TVal
-                    {
-                        if(b > a)
-                        {
-                            return b;
-                        }
-                        return a;
-                    });
-            }
-            //-----------------------------------------------------------------------------
-            //! Searches m_data with std::min_element. Host only.
-            //! \return The index of the minimal element.
-            ALPAKA_FN_HOST auto minElem() const -> typename TDim::value_type
-            {
-                auto const first = std::begin(m_data);
-                auto const smallest = std::min_element(first, std::end(m_data));
-                return static_cast<typename TDim::value_type>(std::distance(first, smallest));
-            }
-            //-----------------------------------------------------------------------------
-            //! Searches m_data with std::max_element. Host only.
-            //! \return The index of the maximal element.
-            ALPAKA_FN_HOST auto maxElem() const -> typename TDim::value_type
-            {
-                auto const first = std::begin(m_data);
-                auto const largest = std::max_element(first, std::end(m_data));
-                return static_cast<typename TDim::value_type>(std::distance(first, largest));
-            }
-
-        private:
-            // Zero sized arrays are not allowed, therefore zero-dimensional vectors have one member.
-            TVal m_data[TDim::value == 0u ? 1u : TDim::value];
-        };
-
-        //-----------------------------------------------------------------------------
-        //! This is a convenience method to have an out-of-class factory method even though the out-of-class version is not supported by all compilers.
-        //! Depending on the compiler conformance (ALPAKA_CREATE_VEC_IN_CLASS), the internal or external factory function is called.
-        //! This has the drawback that it requires the TVal parameter even though it should not be necessary.
-        //!
-        //! \tparam TDim The dimensionality of the vector to create.
-        //! \tparam TVal The value type of the vector to create.
-        //! \tparam TTFnObj The function object template handed to createVecFromIndexedFn.
-        //! \param args The arguments forwarded to createVecFromIndexedFn.
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            typename TDim,
-            typename TVal,
-            template<std::size_t> class TTFnObj,
-            typename... TArgs>
-        ALPAKA_FN_HOST_ACC auto createVecFromIndexedFnWorkaround(
-            TArgs && ... args)
-#ifdef BOOST_NO_CXX14_RETURN_TYPE_DEDUCTION
-        -> alpaka::vec::Vec<TDim, TVal>
-#endif
-        {
-            return
-                alpaka::vec::
-#ifdef ALPAKA_CREATE_VEC_IN_CLASS
-                Vec<TDim, TVal>::template
-#endif
-                createVecFromIndexedFn<
-#ifndef ALPAKA_CREATE_VEC_IN_CLASS
-                    TDim,
-#endif
-                    TTFnObj>(
-                        std::forward<TArgs>(args)...);
-        }
-
-        //-----------------------------------------------------------------------------
-        //! This is a convenience method to have an out-of-class factory method even though the out-of-class version is not supported by all compilers.
-        //! Depending on the compiler conformance (ALPAKA_CREATE_VEC_IN_CLASS), the internal or external factory function is called.
-        //! This has the drawback that it requires the TVal parameter even though it should not be necessary.
-        //!
-        //! \tparam TDim The dimensionality of the vector to create.
-        //! \tparam TVal The value type of the vector to create.
-        //! \tparam TTFnObj The function object template handed to createVecFromIndexedFnOffset.
-        //! \tparam TIdxOffset The index offset handed to createVecFromIndexedFnOffset.
-        //! \param args The arguments forwarded to createVecFromIndexedFnOffset.
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            typename TDim,
-            typename TVal,
-            template<std::size_t> class TTFnObj,
-            typename TIdxOffset,
-            typename... TArgs>
-        ALPAKA_FN_HOST_ACC auto createVecFromIndexedFnOffsetWorkaround(
-            TArgs && ... args)
-#ifdef BOOST_NO_CXX14_RETURN_TYPE_DEDUCTION
-        -> alpaka::vec::Vec<TDim, TVal>
-#endif
-        {
-            return
-                alpaka::vec::
-#ifdef ALPAKA_CREATE_VEC_IN_CLASS
-                Vec<TDim, TVal>::template
-#endif
-                createVecFromIndexedFnOffset<
-#ifndef ALPAKA_CREATE_VEC_IN_CLASS
-                    TDim,
-#endif
-                    TTFnObj,
-                    TIdxOffset>(
-                        std::forward<TArgs>(args)...);
-        }
-
-        namespace detail
-        {
-            //#############################################################################
-            //! A function object computing the sum of the elements at index Tidx of two vectors.
-            template<std::size_t Tidx>
-            struct CreateAdd
-            {
-                //-----------------------------------------------------------------------------
-                //! \param p The left operand.
-                //! \param q The right operand.
-                //! \return p[Tidx] + q[Tidx] converted to TVal.
-                ALPAKA_NO_HOST_ACC_WARNING
-                template<typename TDim, typename TVal>
-                ALPAKA_FN_HOST_ACC static auto create(Vec<TDim, TVal> const & p, Vec<TDim, TVal> const & q) -> TVal
-                {
-                    return p[Tidx] + q[Tidx];
-                }
-            };
-        }
-        //-----------------------------------------------------------------------------
-        //! Adds two vectors of the same dimensionality and value type.
-        //! Every element is computed by detail::CreateAdd.
-        //!
-        //! \param p The left operand.
-        //! \param q The right operand.
-        //! \return The element-wise sum of two vectors.
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<typename TDim, typename TVal>
-        ALPAKA_FN_HOST_ACC auto operator+(Vec<TDim, TVal> const & p, Vec<TDim, TVal> const & q) -> Vec<TDim, TVal>
-        {
-            return createVecFromIndexedFnWorkaround<TDim, TVal, detail::CreateAdd>(p, q);
-        }
-
-        namespace detail
-        {
-            //##################################################################################
-            //! A function object that returns the difference of the two input vectors elements.
-            template<
-                std::size_t Tidx>
-            struct CreateSub
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_NO_HOST_ACC_WARNING
-                template<
-                    typename TDim,
-                    typename TVal>
-                ALPAKA_FN_HOST_ACC static auto create(
-                    Vec<TDim, TVal> const & p,
-                    Vec<TDim, TVal> const & q)
-                -> TVal
-                {
-                    return p[Tidx] - q[Tidx];
-                }
-            };
-        }
-
-        //-----------------------------------------------------------------------------
-        //! \return The element-wise difference of two vectors.
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            typename TDim,
-            typename TVal>
-        ALPAKA_FN_HOST_ACC auto operator-(
-            Vec<TDim, TVal> const & p,
-            Vec<TDim, TVal> const & q)
-        -> Vec<TDim, TVal>
-        {
-            return
-                createVecFromIndexedFnWorkaround<
-                    TDim,
-                    TVal,
-                    detail::CreateSub>(
-                        p,
-                        q);
-        }
-
-        namespace detail
-        {
-            //#############################################################################
-            //! A function object that returns the product of the two input vectors elements.
-            template<
-                std::size_t Tidx>
-            struct CreateMul
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_NO_HOST_ACC_WARNING
-                template<
-                    typename TDim,
-                    typename TVal>
-                ALPAKA_FN_HOST_ACC static auto create(
-                    Vec<TDim, TVal> const & p,
-                    Vec<TDim, TVal> const & q)
-                -> TVal
-                {
-                    return p[Tidx] * q[Tidx];
-                }
-            };
-        }
-
-        //-----------------------------------------------------------------------------
-        //! \return The element-wise product of two vectors.
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            typename TDim,
-            typename TVal>
-        ALPAKA_FN_HOST_ACC auto operator*(
-            Vec<TDim, TVal> const & p,
-            Vec<TDim, TVal> const & q)
-        -> Vec<TDim, TVal>
-        {
-            return
-                createVecFromIndexedFnWorkaround<
-                    TDim,
-                    TVal,
-                    detail::CreateMul>(
-                        p,
-                        q);
-        }
-
-        namespace detail
-        {
-            //#############################################################################
-            //! A function object that returns the element-wise less than relation of two vectors.
-            template<
-                std::size_t Tidx>
-            struct CreateLess
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_NO_HOST_ACC_WARNING
-                template<
-                    typename TDim,
-                    typename TVal>
-                ALPAKA_FN_HOST_ACC static auto create(
-                    Vec<TDim, TVal> const & p,
-                    Vec<TDim, TVal> const & q)
-                -> bool
-                {
-                    return p[Tidx] < q[Tidx];
-                }
-            };
-        }
-
-        //-----------------------------------------------------------------------------
-        //! \return The element-wise less than relation of two vectors.
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            typename TDim,
-            typename TVal>
-        ALPAKA_FN_HOST_ACC auto operator<(
-            Vec<TDim, TVal> const & p,
-            Vec<TDim, TVal> const & q)
-        -> Vec<TDim, bool>
-        {
-            return
-                createVecFromIndexedFnWorkaround<
-                    TDim,
-                    bool,
-                    detail::CreateLess>(
-                        p,
-                        q);
-        }
-
-        namespace detail
-        {
-            //#############################################################################
-            //! A function object that returns the element-wise less than or equal relation of two vectors.
-            template<
-                std::size_t Tidx>
-            struct CreateLessEqual
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_NO_HOST_ACC_WARNING
-                template<
-                    typename TDim,
-                    typename TVal>
-                ALPAKA_FN_HOST_ACC static auto create(
-                    Vec<TDim, TVal> const & p,
-                    Vec<TDim, TVal> const & q)
-                -> bool
-                {
-                    return p[Tidx] <= q[Tidx];
-                }
-            };
-        }
-
-        //-----------------------------------------------------------------------------
-        //! \return The element-wise less than or equal relation of two vectors.
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            typename TDim,
-            typename TVal>
-        ALPAKA_FN_HOST_ACC auto operator<=(
-            Vec<TDim, TVal> const & p,
-            Vec<TDim, TVal> const & q)
-        -> Vec<TDim, bool>
-        {
-            return
-                createVecFromIndexedFnWorkaround<
-                    TDim,
-                    bool,
-                    detail::CreateLessEqual>(
-                        p,
-                        q);
-        }
-
-        namespace detail
-        {
-            //#############################################################################
-            //! A function object that returns the element-wise greater than or equal relation of two vectors.
-            template<
-                std::size_t Tidx>
-            struct CreateGreaterEqual
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_NO_HOST_ACC_WARNING
-                template<
-                    typename TDim,
-                    typename TVal>
-                ALPAKA_FN_HOST_ACC static auto create(
-                    Vec<TDim, TVal> const & p,
-                    Vec<TDim, TVal> const & q)
-                -> bool
-                {
-                    return p[Tidx] >= q[Tidx];
-                }
-            };
-        }
-
-        //-----------------------------------------------------------------------------
-        //! \return The element-wise greater than or equal relation of two vectors.
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            typename TDim,
-            typename TVal>
-        ALPAKA_FN_HOST_ACC auto operator>=(
-            Vec<TDim, TVal> const & p,
-            Vec<TDim, TVal> const & q)
-        -> Vec<TDim, bool>
-        {
-            return
-                createVecFromIndexedFnWorkaround<
-                    TDim,
-                    bool,
-                    detail::CreateGreaterEqual>(
-                        p,
-                        q);
-        }
-
-        namespace detail
-        {
-            //#############################################################################
-            //! A function object that returns the element-wise greater than relation of two vectors.
-            template<
-                std::size_t Tidx>
-            struct CreateGreater
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_NO_HOST_ACC_WARNING
-                template<
-                    typename TDim,
-                    typename TVal>
-                ALPAKA_FN_HOST_ACC static auto create(
-                    Vec<TDim, TVal> const & p,
-                    Vec<TDim, TVal> const & q)
-                -> bool
-                {
-                    return p[Tidx] > q[Tidx];
-                }
-            };
-        }
-
-        //-----------------------------------------------------------------------------
-        //! \return The element-wise greater than relation of two vectors.
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            typename TDim,
-            typename TVal>
-        ALPAKA_FN_HOST_ACC auto operator>(
-            Vec<TDim, TVal> const & p,
-            Vec<TDim, TVal> const & q)
-        -> Vec<TDim, bool>
-        {
-            return
-                createVecFromIndexedFnWorkaround<
-                    TDim,
-                    bool,
-                    detail::CreateGreater>(
-                        p,
-                        q);
-        }
-
-        //-----------------------------------------------------------------------------
-        //! Stream out operator.
-        template<
-            typename TDim,
-            typename TVal>
-        ALPAKA_FN_HOST auto operator<<(
-            std::ostream & os,
-            Vec<TDim, TVal> const & v)
-        -> std::ostream &
-        {
-            os << "(";
-            for(typename TDim::value_type i(0); i<TDim::value; ++i)
-            {
-                os << v[i];
-                if(i != TDim::value-1)
-                {
-                    os << ", ";
-                }
-            }
-            os << ")";
-
-            return os;
-        }
-    }
-
-    namespace dim
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The Vec dimension get trait specialization.
-            template<
-                typename TDim,
-                typename TVal>
-            struct DimType<
-                vec::Vec<TDim, TVal>>
-            {
-                using type = TDim;
-            };
-        }
-    }
-    namespace idx
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The Vec idx type trait specialization.
-            template<
-                typename TDim,
-                typename TVal>
-            struct IdxType<
-                vec::Vec<TDim, TVal>>
-            {
-                using type = TVal;
-            };
-        }
-    }
-    namespace vec
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! Specialization for selecting a sub-vector.
-            template<
-                typename TDim,
-                typename TVal,
-                std::size_t... TIndices>
-            struct SubVecFromIndices<
-                Vec<TDim, TVal>,
-                meta::IntegerSequence<std::size_t, TIndices...>,
-                typename std::enable_if<
-                    !std::is_same<
-                        meta::IntegerSequence<std::size_t, TIndices...>,
-                        meta::MakeIntegerSequence<std::size_t, TDim::value>
-                    >::value
-                >::type>
-            {
-                ALPAKA_NO_HOST_ACC_WARNING
-                ALPAKA_FN_HOST_ACC static auto subVecFromIndices(
-                    Vec<TDim, TVal> const & vec)
-                -> Vec<dim::DimInt<sizeof...(TIndices)>, TVal>
-                {
-                    // In the case of a zero dimensional vector, vec is unused.
-                    alpaka::ignore_unused(vec);
-
-                    static_assert(sizeof...(TIndices) <= TDim::value, "The sub-vector has to be smaller (or same idx) then the origin vector.");
-
-                    return Vec<dim::DimInt<sizeof...(TIndices)>, TVal>(vec[TIndices]...);
-                }
-            };
-            //#############################################################################
-            //! Specialization for selecting the whole vector.
-            template<
-                typename TDim,
-                typename TVal>
-            struct SubVecFromIndices<
-                Vec<TDim, TVal>,
-                meta::MakeIntegerSequence<std::size_t, TDim::value>>
-            {
-                ALPAKA_NO_HOST_ACC_WARNING
-                ALPAKA_FN_HOST_ACC static auto subVecFromIndices(
-                    Vec<TDim, TVal> const & vec)
-                -> Vec<TDim, TVal>
-                {
-                    return vec;
-                }
-            };
-        }
-
-        namespace detail
-        {
-            //#############################################################################
-            //! A function object that returns the given value for each index.
-            template<
-                std::size_t Tidx>
-            struct CreateCast
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_NO_HOST_ACC_WARNING
-                template<
-                    typename TSizeNew,
-                    typename TDim,
-                    typename TVal>
-                ALPAKA_FN_HOST_ACC static auto create(
-                    TSizeNew const &/* valNew*/,
-                    Vec<TDim, TVal> const & vec)
-                -> TSizeNew
-                {
-                    return
-                        static_cast<TSizeNew>(
-                            vec[Tidx]);
-                }
-            };
-        }
-        namespace traits
-        {
-            //#############################################################################
-            //! Cast specialization for Vec.
-            template<
-                typename TSizeNew,
-                typename TDim,
-                typename TVal>
-            struct Cast<
-                TSizeNew,
-                Vec<TDim, TVal>>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_NO_HOST_ACC_WARNING
-                ALPAKA_FN_HOST_ACC static auto cast(
-                    Vec<TDim, TVal> const & vec)
-                -> Vec<TDim, TSizeNew>
-                {
-                    return
-                        createVecFromIndexedFnWorkaround<
-                            TDim,
-                            TSizeNew,
-                            vec::detail::CreateCast>(
-                                TSizeNew(),
-                                vec);
-                }
-            };
-
-            //#############################################################################
-            //! (Non-)Cast specialization for Vec when src and dst types are identical.
-            //#############################################################################
-            template<
-                typename TDim,
-                typename TVal>
-            struct Cast<
-                TVal,
-                Vec<TDim, TVal>>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_NO_HOST_ACC_WARNING
-                ALPAKA_FN_HOST_ACC static auto cast(
-                    Vec<TDim, TVal> const & vec)
-                -> Vec<TDim, TVal>
-                {
-                    return vec;
-                }
-            };
-        }
-
-        namespace detail
-        {
-            //#############################################################################
-            //! A function object that returns the value at the index from the back of the vector.
-            template<
-                std::size_t Tidx>
-            struct CreateReverse
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_NO_HOST_ACC_WARNING
-                template<
-                    typename TDim,
-                    typename TVal>
-                ALPAKA_FN_HOST_ACC static auto create(
-                    Vec<TDim, TVal> const & vec)
-                -> TVal
-                {
-                    return vec[TDim::value - 1u - Tidx];
-                }
-            };
-        }
-        namespace traits
-        {
-            //#############################################################################
-            //! Reverse specialization for Vec.
-            template<
-                typename TDim,
-                typename TVal>
-            struct Reverse<
-                Vec<TDim, TVal>>
-            {
-                ALPAKA_NO_HOST_ACC_WARNING
-                ALPAKA_FN_HOST_ACC static auto reverse(
-                    Vec<TDim, TVal> const & vec)
-                -> Vec<TDim, TVal>
-                {
-                    return
-                        createVecFromIndexedFnWorkaround<
-                            TDim,
-                            TVal,
-                            vec::detail::CreateReverse>(
-                                vec);
-                }
-            };
-
-            //#############################################################################
-            //! (Non-)Reverse specialization for 1D Vec.
-            template<
-                typename TVal>
-            struct Reverse<
-                Vec<dim::DimInt<1u>, TVal>>
-            {
-                ALPAKA_NO_HOST_ACC_WARNING
-                ALPAKA_FN_HOST_ACC static auto reverse(
-                    Vec<dim::DimInt<1u>, TVal> const & vec)
-                -> Vec<dim::DimInt<1u>, TVal>
-                {
-                    return vec;
-                }
-            };
-        }
-
-        namespace detail
-        {
-            //#############################################################################
-            //! A function object that returns the value at the index from the back of the vector.
-            template<
-                std::size_t Tidx>
-            struct CreateConcat
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_NO_HOST_ACC_WARNING
-                template<
-                    typename TDimL,
-                    typename TDimR,
-                    typename TVal>
-                ALPAKA_FN_HOST_ACC static auto create(
-                    Vec<TDimL, TVal> const & vecL,
-                    Vec<TDimR, TVal> const & vecR)
-                -> TVal
-                {
-                    return Tidx < TDimL::value ? vecL[Tidx] : vecR[Tidx - TDimL::value];
-                }
-            };
-        }
-        namespace traits
-        {
-            //#############################################################################
-            //! Concatenation specialization for Vec.
-            template<
-                typename TDimL,
-                typename TDimR,
-                typename TVal>
-            struct Concat<
-                Vec<TDimL, TVal>,
-                Vec<TDimR, TVal>>
-            {
-                ALPAKA_NO_HOST_ACC_WARNING
-                ALPAKA_FN_HOST_ACC static auto concat(
-                    Vec<TDimL, TVal> const & vecL,
-                    Vec<TDimR, TVal> const & vecR)
-                -> Vec<dim::DimInt<TDimL::value + TDimR::value>, TVal>
-                {
-                    return
-                        createVecFromIndexedFnWorkaround<
-                            dim::DimInt<TDimL::value + TDimR::value>,
-                            TVal,
-                            vec::detail::CreateConcat>(
-                                vecL,
-                                vecR);
-                }
-            };
-        }
-    }
-
-    namespace extent
-    {
-        namespace detail
-        {
-            //#############################################################################
-            //! A function object that returns the extent for each index.
-            template<
-                std::size_t Tidx>
-            struct CreateExtent
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_NO_HOST_ACC_WARNING
-                template<
-                    typename TExtent>
-                ALPAKA_FN_HOST_ACC static auto create(
-                    TExtent const & extent)
-                -> idx::Idx<TExtent>
-                {
-                    return extent::getExtent<Tidx>(extent);
-                }
-            };
-        }
-        //-----------------------------------------------------------------------------
-        //! \tparam TExtent has to specialize extent::GetExtent.
-        //! \return The extent vector.
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            typename TExtent>
-        ALPAKA_FN_HOST_ACC auto getExtentVec(
-            TExtent const & extent = TExtent())
-        -> vec::Vec<dim::Dim<TExtent>, idx::Idx<TExtent>>
-        {
-            return
-                vec::createVecFromIndexedFnWorkaround<
-                    dim::Dim<TExtent>,
-                    idx::Idx<TExtent>,
-                    detail::CreateExtent>(
-                        extent);
-        }
-        //-----------------------------------------------------------------------------
-        //! \tparam TExtent has to specialize extent::GetExtent.
-        //! \return The extent but only the last N elements.
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            typename TDim,
-            typename TExtent>
-        ALPAKA_FN_HOST_ACC auto getExtentVecEnd(
-            TExtent const & extent = TExtent())
-        -> vec::Vec<TDim, idx::Idx<TExtent>>
-        {
-            using IdxOffset = std::integral_constant<std::intmax_t, static_cast<std::intmax_t>(dim::Dim<TExtent>::value) - static_cast<std::intmax_t>(TDim::value)>;
-            return
-                vec::createVecFromIndexedFnOffsetWorkaround<
-                    TDim,
-                    idx::Idx<TExtent>,
-                    detail::CreateExtent,
-                    IdxOffset>(
-                        extent);
-        }
-    }
-
-    namespace offset
-    {
-        namespace detail
-        {
-            //#############################################################################
-            //! A function object that returns the offsets for each index.
-            template<
-                std::size_t Tidx>
-            struct CreateOffset
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_NO_HOST_ACC_WARNING
-                template<
-                    typename TOffsets>
-                ALPAKA_FN_HOST_ACC static auto create(
-                    TOffsets const & offsets)
-                -> idx::Idx<TOffsets>
-                {
-                    return offset::getOffset<Tidx>(offsets);
-                }
-            };
-        }
-        //-----------------------------------------------------------------------------
-        //! \tparam TOffsets has to specialize offset::GetOffset.
-        //! \return The offset vector.
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            typename TOffsets>
-        ALPAKA_FN_HOST_ACC auto getOffsetVec(
-            TOffsets const & offsets = TOffsets())
-        -> vec::Vec<dim::Dim<TOffsets>, idx::Idx<TOffsets>>
-        {
-            return
-                vec::createVecFromIndexedFnWorkaround<
-                    dim::Dim<TOffsets>,
-                    idx::Idx<TOffsets>,
-                    detail::CreateOffset>(
-                        offsets);
-        }
-        //-----------------------------------------------------------------------------
-        //! \tparam TOffsets has to specialize offset::GetOffset.
-        //! \return The offset vector but only the last N elements.
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            typename TDim,
-            typename TOffsets>
-        ALPAKA_FN_HOST_ACC auto getOffsetVecEnd(
-            TOffsets const & offsets = TOffsets())
-        -> vec::Vec<TDim, idx::Idx<TOffsets>>
-        {
-            using IdxOffset = std::integral_constant<std::size_t, static_cast<std::size_t>(static_cast<std::intmax_t>(dim::Dim<TOffsets>::value) - static_cast<std::intmax_t>(TDim::value))>;
-            return
-                vec::createVecFromIndexedFnOffsetWorkaround<
-                    TDim,
-                    idx::Idx<TOffsets>,
-                    detail::CreateOffset,
-                    IdxOffset>(
-                        offsets);
-        }
-    }
-    namespace extent
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The Vec extent get trait specialization.
-            template<
-                typename TIdxIntegralConst,
-                typename TDim,
-                typename TVal>
-            struct GetExtent<
-                TIdxIntegralConst,
-                vec::Vec<TDim, TVal>,
-                typename std::enable_if<(TDim::value > TIdxIntegralConst::value)>::type>
-            {
-                ALPAKA_NO_HOST_ACC_WARNING
-                ALPAKA_FN_HOST_ACC static auto getExtent(
-                    vec::Vec<TDim, TVal> const & extent)
-                -> TVal
-                {
-                    return extent[TIdxIntegralConst::value];
-                }
-            };
-            //#############################################################################
-            //! The Vec extent set trait specialization.
-            template<
-                typename TIdxIntegralConst,
-                typename TDim,
-                typename TVal,
-                typename TExtentVal>
-            struct SetExtent<
-                TIdxIntegralConst,
-                vec::Vec<TDim, TVal>,
-                TExtentVal,
-                typename std::enable_if<(TDim::value > TIdxIntegralConst::value)>::type>
-            {
-                ALPAKA_NO_HOST_ACC_WARNING
-                ALPAKA_FN_HOST_ACC static auto setExtent(
-                    vec::Vec<TDim, TVal> & extent,
-                    TExtentVal const & extentVal)
-                -> void
-                {
-                    extent[TIdxIntegralConst::value] = extentVal;
-                }
-            };
-        }
-    }
-    namespace offset
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The Vec offset get trait specialization.
-            template<
-                typename TIdxIntegralConst,
-                typename TDim,
-                typename TVal>
-            struct GetOffset<
-                TIdxIntegralConst,
-                vec::Vec<TDim, TVal>,
-                typename std::enable_if<(TDim::value > TIdxIntegralConst::value)>::type>
-            {
-                ALPAKA_NO_HOST_ACC_WARNING
-                ALPAKA_FN_HOST_ACC static auto getOffset(
-                    vec::Vec<TDim, TVal> const & offsets)
-                -> TVal
-                {
-                    return offsets[TIdxIntegralConst::value];
-                }
-            };
-            //#############################################################################
-            //! The Vec offset set trait specialization.
-            template<
-                typename TIdxIntegralConst,
-                typename TDim,
-                typename TVal,
-                typename TOffset>
-            struct SetOffset<
-                TIdxIntegralConst,
-                vec::Vec<TDim, TVal>,
-                TOffset,
-                typename std::enable_if<(TDim::value > TIdxIntegralConst::value)>::type>
-            {
-                ALPAKA_NO_HOST_ACC_WARNING
-                ALPAKA_FN_HOST_ACC static auto setOffset(
-                    vec::Vec<TDim, TVal> & offsets,
-                    TOffset const & offset)
-                -> void
-                {
-                    offsets[TIdxIntegralConst::value] = offset;
-                }
-            };
-        }
-    }
-}
diff --git a/thirdParty/alpaka/include/alpaka/version.hpp b/thirdParty/alpaka/include/alpaka/version.hpp
deleted file mode 100644
index ba7830a5a7..0000000000
--- a/thirdParty/alpaka/include/alpaka/version.hpp
+++ /dev/null
@@ -1,19 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz, Erik Zenker
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#include <boost/predef/version_number.h>
-
-#define ALPAKA_VERSION_MAJOR 0
-#define ALPAKA_VERSION_MINOR 4
-#define ALPAKA_VERSION_PATCH 0
-
-//! The alpaka library version number
-#define ALPAKA_VERSION BOOST_VERSION_NUMBER(ALPAKA_VERSION_MAJOR, ALPAKA_VERSION_MINOR, ALPAKA_VERSION_PATCH)
diff --git a/thirdParty/alpaka/include/alpaka/wait/Traits.hpp b/thirdParty/alpaka/include/alpaka/wait/Traits.hpp
deleted file mode 100644
index a37c1c32cc..0000000000
--- a/thirdParty/alpaka/include/alpaka/wait/Traits.hpp
+++ /dev/null
@@ -1,76 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#include <alpaka/core/Common.hpp>
-#include <alpaka/core/Concepts.hpp>
-
-namespace alpaka
-{
-    //-----------------------------------------------------------------------------
-    //! The wait specifics.
-    namespace wait
-    {
-        struct ConceptCurrentThreadWaitFor;
-
-        //-----------------------------------------------------------------------------
-        //! The wait traits.
-        namespace traits
-        {
-            //#############################################################################
-            //! The thread wait trait.
-            template<
-                typename TAwaited,
-                typename TSfinae = void>
-            struct CurrentThreadWaitFor;
-
-            //#############################################################################
-            //! The waiter wait trait.
-            template<
-                typename TWaiter,
-                typename TAwaited,
-                typename TSfinae = void>
-            struct WaiterWaitFor;
-        }
-
-        //-----------------------------------------------------------------------------
-        //! Waits the thread for the completion of the given awaited action to complete.
-        template<
-            typename TAwaited>
-        ALPAKA_FN_HOST auto wait(
-            TAwaited const & awaited)
-        -> void
-        {
-            using ImplementationBase = concepts::ImplementationBase<ConceptCurrentThreadWaitFor, TAwaited>;
-            traits::CurrentThreadWaitFor<
-                ImplementationBase>
-            ::currentThreadWaitFor(
-                awaited);
-        }
-
-        //-----------------------------------------------------------------------------
-        //! The waiter waits for the given awaited action to complete.
-        template<
-            typename TWaiter,
-            typename TAwaited>
-        ALPAKA_FN_HOST auto wait(
-            TWaiter & waiter,
-            TAwaited const & awaited)
-        -> void
-        {
-            traits::WaiterWaitFor<
-                TWaiter,
-                TAwaited>
-            ::waiterWaitFor(
-                waiter,
-                awaited);
-        }
-    }
-}
diff --git a/thirdParty/alpaka/include/alpaka/workdiv/Traits.hpp b/thirdParty/alpaka/include/alpaka/workdiv/Traits.hpp
deleted file mode 100644
index c965649be2..0000000000
--- a/thirdParty/alpaka/include/alpaka/workdiv/Traits.hpp
+++ /dev/null
@@ -1,143 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#include <alpaka/idx/Traits.hpp>
-#include <alpaka/core/Concepts.hpp>
-
-#include <alpaka/vec/Vec.hpp>
-#include <alpaka/core/Positioning.hpp>
-#include <alpaka/core/Common.hpp>
-
-#include <boost/config.hpp>
-
-#include <type_traits>
-#include <utility>
-
-namespace alpaka
-{
-    //-----------------------------------------------------------------------------
-    //! The work division traits specifics.
-    namespace workdiv
-    {
-        struct ConceptWorkDiv;
-
-        //-----------------------------------------------------------------------------
-        //! The work division traits.
-        namespace traits
-        {
-            //#############################################################################
-            //! The work div trait.
-            template<
-                typename TWorkDiv,
-                typename TOrigin,
-                typename TUnit,
-                typename TSfinae = void>
-            struct GetWorkDiv;
-        }
-
-        //-----------------------------------------------------------------------------
-        //! Get the extent requested.
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            typename TOrigin,
-            typename TUnit,
-            typename TWorkDiv>
-        ALPAKA_FN_HOST_ACC auto getWorkDiv(
-            TWorkDiv const & workDiv)
-        -> vec::Vec<dim::Dim<TWorkDiv>, idx::Idx<TWorkDiv>>
-        {
-            using ImplementationBase = concepts::ImplementationBase<ConceptWorkDiv, TWorkDiv>;
-            return
-                traits::GetWorkDiv<
-                    ImplementationBase,
-                    TOrigin,
-                    TUnit>
-                ::getWorkDiv(
-                    workDiv);
-        }
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The work div grid thread extent trait specialization.
-            template<
-                typename TWorkDiv>
-            struct GetWorkDiv<
-                TWorkDiv,
-                origin::Grid,
-                unit::Threads>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_NO_HOST_ACC_WARNING
-                ALPAKA_FN_HOST_ACC static auto getWorkDiv(
-                    TWorkDiv const & workDiv)
-#ifdef BOOST_NO_CXX14_RETURN_TYPE_DEDUCTION
-                -> decltype(
-                    workdiv::getWorkDiv<origin::Grid, unit::Blocks>(workDiv)
-                    * workdiv::getWorkDiv<origin::Block, unit::Threads>(workDiv))
-#endif
-                {
-                    return
-                        workdiv::getWorkDiv<origin::Grid, unit::Blocks>(workDiv)
-                        * workdiv::getWorkDiv<origin::Block, unit::Threads>(workDiv);
-                }
-            };
-            //#############################################################################
-            //! The work div grid element extent trait specialization.
-            template<
-                typename TWorkDiv>
-            struct GetWorkDiv<
-                TWorkDiv,
-                origin::Grid,
-                unit::Elems>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_NO_HOST_ACC_WARNING
-                ALPAKA_FN_HOST_ACC static auto getWorkDiv(
-                    TWorkDiv const & workDiv)
-#ifdef BOOST_NO_CXX14_RETURN_TYPE_DEDUCTION
-                -> decltype(
-                    workdiv::getWorkDiv<origin::Grid, unit::Threads>(workDiv)
-                    * workdiv::getWorkDiv<origin::Thread, unit::Elems>(workDiv))
-#endif
-                {
-                    return
-                        workdiv::getWorkDiv<origin::Grid, unit::Threads>(workDiv)
-                        * workdiv::getWorkDiv<origin::Thread, unit::Elems>(workDiv);
-                }
-            };
-            //#############################################################################
-            //! The work div block element extent trait specialization.
-            template<
-                typename TWorkDiv>
-            struct GetWorkDiv<
-                TWorkDiv,
-                origin::Block,
-                unit::Elems>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_NO_HOST_ACC_WARNING
-                ALPAKA_FN_HOST_ACC static auto getWorkDiv(
-                    TWorkDiv const & workDiv)
-#ifdef BOOST_NO_CXX14_RETURN_TYPE_DEDUCTION
-                -> decltype(
-                    workdiv::getWorkDiv<origin::Block, unit::Threads>(workDiv)
-                    * workdiv::getWorkDiv<origin::Thread, unit::Elems>(workDiv))
-#endif
-                {
-                    return
-                        workdiv::getWorkDiv<origin::Block, unit::Threads>(workDiv)
-                        * workdiv::getWorkDiv<origin::Thread, unit::Elems>(workDiv);
-                }
-            };
-        }
-    }
-}
diff --git a/thirdParty/alpaka/include/alpaka/workdiv/WorkDivCudaBuiltIn.hpp b/thirdParty/alpaka/include/alpaka/workdiv/WorkDivCudaBuiltIn.hpp
deleted file mode 100644
index f7031774e6..0000000000
--- a/thirdParty/alpaka/include/alpaka/workdiv/WorkDivCudaBuiltIn.hpp
+++ /dev/null
@@ -1,163 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_CUDA
-    #error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
-#endif
-
-#include <alpaka/workdiv/Traits.hpp>
-#include <alpaka/idx/Traits.hpp>
-
-#include <alpaka/core/Cuda.hpp>
-#include <alpaka/core/Unused.hpp>
-#include <alpaka/vec/Vec.hpp>
-
-namespace alpaka
-{
-    namespace workdiv
-    {
-        //#############################################################################
-        //! The GPU CUDA accelerator work division.
-        template<
-            typename TDim,
-            typename TIdx>
-        class WorkDivCudaBuiltIn : public concepts::Implements<ConceptWorkDiv, WorkDivCudaBuiltIn<TDim, TIdx>>
-        {
-        public:
-            //-----------------------------------------------------------------------------
-            __device__ WorkDivCudaBuiltIn(
-                vec::Vec<TDim, TIdx> const & threadElemExtent) :
-                    m_threadElemExtent(threadElemExtent)
-            {}
-            //-----------------------------------------------------------------------------
-            __device__ WorkDivCudaBuiltIn(WorkDivCudaBuiltIn const &) = delete;
-            //-----------------------------------------------------------------------------
-            __device__ WorkDivCudaBuiltIn(WorkDivCudaBuiltIn &&) = delete;
-            //-----------------------------------------------------------------------------
-            __device__ auto operator=(WorkDivCudaBuiltIn const &) -> WorkDivCudaBuiltIn & = delete;
-            //-----------------------------------------------------------------------------
-            __device__ auto operator=(WorkDivCudaBuiltIn &&) -> WorkDivCudaBuiltIn & = delete;
-            //-----------------------------------------------------------------------------
-            /*virtual*/ ~WorkDivCudaBuiltIn() = default;
-
-        public:
-            // \TODO: Optimize! Add WorkDivCudaBuiltInNoElems that has no member m_threadElemExtent as well as AccGpuCudaRtNoElems.
-            // Use it instead of AccGpuCudaRt if the thread element extent is one to reduce the register usage.
-            vec::Vec<TDim, TIdx> const & m_threadElemExtent;
-        };
-    }
-
-    namespace dim
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The GPU CUDA accelerator work division dimension get trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct DimType<
-                workdiv::WorkDivCudaBuiltIn<TDim, TIdx>>
-            {
-                using type = TDim;
-            };
-        }
-    }
-    namespace idx
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The GPU CUDA accelerator work division idx type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct IdxType<
-                workdiv::WorkDivCudaBuiltIn<TDim, TIdx>>
-            {
-                using type = TIdx;
-            };
-        }
-    }
-    namespace workdiv
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The GPU CUDA accelerator work division grid block extent trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct GetWorkDiv<
-                WorkDivCudaBuiltIn<TDim, TIdx>,
-                origin::Grid,
-                unit::Blocks>
-            {
-                //-----------------------------------------------------------------------------
-                //! \return The number of blocks in each dimension of the grid.
-                __device__ static auto getWorkDiv(
-                    WorkDivCudaBuiltIn<TDim, TIdx> const & workDiv)
-                -> vec::Vec<TDim, TIdx>
-                {
-                    alpaka::ignore_unused(workDiv);
-                    return vec::cast<TIdx>(extent::getExtentVecEnd<TDim>(gridDim));
-                }
-            };
-
-            //#############################################################################
-            //! The GPU CUDA accelerator work division block thread extent trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct GetWorkDiv<
-                WorkDivCudaBuiltIn<TDim, TIdx>,
-                origin::Block,
-                unit::Threads>
-            {
-                //-----------------------------------------------------------------------------
-                //! \return The number of threads in each dimension of a block.
-                __device__ static auto getWorkDiv(
-                    WorkDivCudaBuiltIn<TDim, TIdx> const & workDiv)
-                -> vec::Vec<TDim, TIdx>
-                {
-                    alpaka::ignore_unused(workDiv);
-                    return vec::cast<TIdx>(extent::getExtentVecEnd<TDim>(blockDim));
-                }
-            };
-
-            //#############################################################################
-            //! The GPU CUDA accelerator work division thread element extent trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct GetWorkDiv<
-                WorkDivCudaBuiltIn<TDim, TIdx>,
-                origin::Thread,
-                unit::Elems>
-            {
-                //-----------------------------------------------------------------------------
-                //! \return The number of blocks in each dimension of the grid.
-                __device__ static auto getWorkDiv(
-                    WorkDivCudaBuiltIn<TDim, TIdx> const & workDiv)
-                -> vec::Vec<TDim, TIdx>
-                {
-                    return workDiv.m_threadElemExtent;
-                }
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/alpaka/include/alpaka/workdiv/WorkDivHelpers.hpp b/thirdParty/alpaka/include/alpaka/workdiv/WorkDivHelpers.hpp
deleted file mode 100644
index 6989283365..0000000000
--- a/thirdParty/alpaka/include/alpaka/workdiv/WorkDivHelpers.hpp
+++ /dev/null
@@ -1,499 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz, Matthias Werner
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#include <alpaka/workdiv/WorkDivMembers.hpp>
-
-#include <alpaka/dev/Traits.hpp>
-#include <alpaka/acc/Traits.hpp>
-
-#include <alpaka/vec/Vec.hpp>
-
-#include <alpaka/core/Assert.hpp>
-#include <alpaka/core/Common.hpp>
-
-#include <cmath>
-#include <algorithm>
-#include <functional>
-#include <set>
-#include <array>
-
-//-----------------------------------------------------------------------------
-//! The alpaka library.
-namespace alpaka
-{
-    namespace workdiv
-    {
-        //#############################################################################
-        //! The grid block extent subdivision restrictions.
-        enum class GridBlockExtentSubDivRestrictions
-        {
-            EqualExtent,       //!< The block thread extent will be equal in all dimensions.
-            CloseToEqualExtent,//!< The block thread extent will be as close to equal as possible in all dimensions.
-            Unrestricted,      //!< The block thread extent will not have any restrictions.
-        };
-
-        namespace detail
-        {
-            //-----------------------------------------------------------------------------
-            //! \param maxDivisor The maximum divisor.
-            //! \param dividend The dividend.
-            //! \return The biggest number that satisfies the following conditions:
-            //!     1) dividend/ret==0
-            //!     2) ret<=maxDivisor
-            template<
-                typename T,
-                typename = typename std::enable_if<std::is_integral<T>::value>::type>
-            ALPAKA_FN_HOST auto nextDivisorLowerOrEqual(
-                T const & maxDivisor,
-                T const & dividend)
-            -> T
-            {
-                T divisor(maxDivisor);
-
-                core::assertValueUnsigned(dividend);
-                core::assertValueUnsigned(maxDivisor);
-                ALPAKA_ASSERT(dividend <= maxDivisor);
-
-                while((dividend%divisor) != 0)
-                {
-                    --divisor;
-                }
-
-                return divisor;
-            }
-            //-----------------------------------------------------------------------------
-            //! \param val The value to find divisors of.
-            //! \param maxDivisor The maximum.
-            //! \return A list of all divisors less then or equal to the given maximum.
-            template<
-                typename T,
-                typename = typename std::enable_if<std::is_integral<T>::value>::type>
-            ALPAKA_FN_HOST auto allDivisorsLessOrEqual(
-                T const & val,
-                T const & maxDivisor)
-            -> std::set<T>
-            {
-                std::set<T> divisorSet;
-
-                core::assertValueUnsigned(val);
-                core::assertValueUnsigned(maxDivisor);
-                ALPAKA_ASSERT(maxDivisor <= val);
-
-                for(T i(1); i <= std::min(val, maxDivisor); ++i)
-                {
-                    if(val % i == 0)
-                    {
-                        divisorSet.insert(val/i);
-                    }
-                }
-
-                return divisorSet;
-            }
-        }
-
-        //-----------------------------------------------------------------------------
-        //! \tparam TDim The dimensionality of the accelerator device properties.
-        //! \tparam TIdx The idx type of the accelerator device properties.
-        //! \param accDevProps The maxima for the work division.
-        //! \return If the accelerator device properties are valid.
-        template<
-            typename TDim,
-            typename TIdx>
-        ALPAKA_FN_HOST auto isValidAccDevProps(
-            acc::AccDevProps<TDim, TIdx> const & accDevProps)
-        -> bool
-        {
-            // Check that the maximum counts are greater or equal 1.
-            if((accDevProps.m_gridBlockCountMax < 1)
-                || (accDevProps.m_blockThreadCountMax < 1)
-                || (accDevProps.m_threadElemCountMax < 1))
-            {
-                return false;
-            }
-
-            // Store the maxima allowed for extents of grid, blocks and threads.
-            auto const gridBlockExtentMax(vec::subVecEnd<TDim>(accDevProps.m_gridBlockExtentMax));
-            auto const blockThreadExtentMax(vec::subVecEnd<TDim>(accDevProps.m_blockThreadExtentMax));
-            auto const threadElemExtentMax(vec::subVecEnd<TDim>(accDevProps.m_threadElemExtentMax));
-
-            // Check that the extents for all dimensions are correct.
-            for(typename TDim::value_type i(0); i<TDim::value; ++i)
-            {
-                // Check that the maximum extents are greater or equal 1.
-                if((gridBlockExtentMax[i] < 1)
-                    || (blockThreadExtentMax[i] < 1)
-                    || (threadElemExtentMax[i] < 1))
-                {
-                    return false;
-                }
-            }
-
-            return true;
-        }
-
-        //-----------------------------------------------------------------------------
-        //! Subdivides the given grid thread extent into blocks restricted by the maxima allowed.
-        //! 1. The the maxima block, thread and element extent and counts
-        //! 2. The requirement of the block thread extent to divide the grid thread extent without remainder
-        //! 3. The requirement of the block extent.
-        //!
-        //! \param gridElemExtent
-        //!     The full extent of elements in the grid.
-        //! \param threadElemExtent
-        //!     the number of elements computed per thread.
-        //! \param accDevProps
-        //!     The maxima for the work division.
-        //! \param requireBlockThreadExtentToDivideGridThreadExtent
-        //!     If this is true, the grid thread extent will be multiples of the corresponding block thread extent.
-        //!     NOTE: If this is true and gridThreadExtent is prime (or otherwise bad chosen) in a dimension, the block thread extent will be one in this dimension.
-        //! \param gridBlockExtentSubDivRestrictions
-        //!     The grid block extent subdivision restrictions.
-        template<
-            typename TDim,
-            typename TIdx>
-        ALPAKA_FN_HOST auto subDivideGridElems(
-            vec::Vec<TDim, TIdx> const & gridElemExtent,
-            vec::Vec<TDim, TIdx> threadElemExtent,
-            acc::AccDevProps<TDim, TIdx> const & accDevProps,
-            bool requireBlockThreadExtentToDivideGridThreadExtent = true,
-            GridBlockExtentSubDivRestrictions gridBlockExtentSubDivRestrictions = GridBlockExtentSubDivRestrictions::Unrestricted)
-        -> workdiv::WorkDivMembers<TDim, TIdx>
-        {
-            ///////////////////////////////////////////////////////////////////
-            // Check that the input data is valid.
-            for(typename TDim::value_type i(0); i<TDim::value; ++i)
-            {
-                ALPAKA_ASSERT(gridElemExtent[i] >= 1);
-                ALPAKA_ASSERT(threadElemExtent[i] >= 1);
-                ALPAKA_ASSERT(threadElemExtent[i] <= accDevProps.m_threadElemExtentMax[i]);
-            }
-            ALPAKA_ASSERT(threadElemExtent.prod() <= accDevProps.m_threadElemCountMax);
-            ALPAKA_ASSERT(isValidAccDevProps(accDevProps));
-
-            ///////////////////////////////////////////////////////////////////
-            // Handle the given threadElemExtent. After this only the blockThreadExtent has to be optimized.
-
-            // Restrict the thread elem extent with the grid elem extent.
-            for(typename TDim::value_type i(0); i<TDim::value; ++i)
-            {
-                threadElemExtent[i] = std::min(threadElemExtent[i], gridElemExtent[i]);
-            }
-
-            // Calculate the grid thread extent.
-            auto gridThreadExtent(vec::Vec<TDim, TIdx>::zeros());
-            for(typename TDim::value_type i(0u); i<TDim::value; ++i)
-            {
-                gridThreadExtent[i] =
-                    static_cast<TIdx>(
-                        std::ceil(
-                            static_cast<double>(gridElemExtent[i])
-                            / static_cast<double>(threadElemExtent[i])));
-            }
-
-            ///////////////////////////////////////////////////////////////////
-            // Try to calculate an optimal blockThreadExtent.
-
-            // Initialize the block thread extent with the maximum possible.
-            auto blockThreadExtent(accDevProps.m_blockThreadExtentMax);
-
-            // Restrict the max block thread extent with the grid thread extent.
-            // This removes dimensions not required in the grid thread extent.
-            // This has to be done before the blockThreadCountMax clipping to get the maximum correctly.
-            for(typename TDim::value_type i(0u); i<TDim::value; ++i)
-            {
-                blockThreadExtent[i] = std::min(blockThreadExtent[i], gridThreadExtent[i]);
-            }
-
-            // For equal block thread extent, restrict it to its minimum component.
-            // For example (512, 256, 1024) will get (256, 256, 256).
-            if(gridBlockExtentSubDivRestrictions == GridBlockExtentSubDivRestrictions::EqualExtent)
-            {
-                auto const minBlockThreadExtent(blockThreadExtent.min());
-                for(typename TDim::value_type i(0u); i<TDim::value; ++i)
-                {
-                    blockThreadExtent[i] = minBlockThreadExtent;
-                }
-            }
-
-            auto const & blockThreadCountMax(accDevProps.m_blockThreadCountMax);
-            // Adjust blockThreadExtent if its product is too large.
-            if(blockThreadExtent.prod() > blockThreadCountMax)
-            {
-                // Satisfy the following equation:
-                // blockThreadCountMax >= blockThreadExtent.prod()
-                // For example 1024 >= 512 * 512 * 1024
-
-                // For equal block thread extent this is easily the nth root of blockThreadCountMax.
-                if(gridBlockExtentSubDivRestrictions == GridBlockExtentSubDivRestrictions::EqualExtent)
-                {
-                    double const fNthRoot(std::pow(blockThreadCountMax, 1.0/static_cast<double>(TDim::value)));
-                    TIdx const nthRoot(static_cast<TIdx>(fNthRoot));
-                    for(typename TDim::value_type i(0u); i<TDim::value; ++i)
-                    {
-                        blockThreadExtent[i] = nthRoot;
-                    }
-                }
-                else if(gridBlockExtentSubDivRestrictions == GridBlockExtentSubDivRestrictions::CloseToEqualExtent)
-                {
-                    // Very primitive clipping. Just halve the largest value until it fits.
-                    while(blockThreadExtent.prod() > blockThreadCountMax)
-                    {
-                        auto const maxElemIdx(blockThreadExtent.maxElem());
-                        blockThreadExtent[maxElemIdx] = blockThreadExtent[maxElemIdx] / static_cast<TIdx>(2u);
-                    }
-                }
-                else
-                {
-                    // Very primitive clipping. Just halve the smallest value until it fits.
-                    while(blockThreadExtent.prod() > blockThreadCountMax)
-                    {
-                        // Compute the minimum element index but ignore ones.
-                        // Ones compare always larger to everything else.
-                        auto const minElemIdx(
-                            static_cast<TIdx>(
-                                std::distance(
-                                    &blockThreadExtent[0u],
-                                    std::min_element(
-                                        &blockThreadExtent[0u],
-                                        &blockThreadExtent[TDim::value-1u],
-                                        [](TIdx const & a, TIdx const & b)
-                                        {
-                                            // This first case is redundant.
-                                            /*if((a == 1u) && (b == 1u))
-                                            {
-                                                return false;
-                                            }
-                                            else */if(a == static_cast<TIdx>(1u))
-                                            {
-                                                return false;
-                                            }
-                                            else if(b == static_cast<TIdx>(1u))
-                                            {
-                                                return true;
-                                            }
-                                            else
-                                            {
-                                                return a < b;
-                                            }
-                                        }))));
-                        blockThreadExtent[minElemIdx] = blockThreadExtent[minElemIdx] / static_cast<TIdx>(2u);
-                    }
-                }
-            }
-
-            // Make the block thread extent divide the grid thread extent.
-            if(requireBlockThreadExtentToDivideGridThreadExtent)
-            {
-                if(gridBlockExtentSubDivRestrictions == GridBlockExtentSubDivRestrictions::EqualExtent)
-                {
-                    // For equal size block extent we have to compute the gcd of all grid thread extent that is less then the current maximal block thread extent.
-                    // For this we compute the divisors of all grid thread extent less then the current maximal block thread extent.
-                    std::array<std::set<TIdx>, TDim::value> gridThreadExtentDivisors;
-                    for(typename TDim::value_type i(0u); i<TDim::value; ++i)
-                    {
-                        gridThreadExtentDivisors[i] =
-                            detail::allDivisorsLessOrEqual(
-                                gridThreadExtent[i],
-                                blockThreadExtent[i]);
-                    }
-                    // The maximal common divisor of all block thread extent is the optimal solution.
-                    std::set<TIdx> intersects[2u];
-                    for(typename TDim::value_type i(1u); i<TDim::value; ++i)
-                    {
-                        intersects[(i-1u)%2u] = gridThreadExtentDivisors[0];
-                        intersects[(i)%2u].clear();
-                        set_intersection(
-                            intersects[(i-1u)%2u].begin(),
-                            intersects[(i-1u)%2u].end(),
-                            gridThreadExtentDivisors[i].begin(),
-                            gridThreadExtentDivisors[i].end(),
-                            std::inserter(intersects[i%2], intersects[i%2u].begin()));
-                    }
-                    TIdx const maxCommonDivisor(*(--intersects[(TDim::value-1)%2u].end()));
-                    for(typename TDim::value_type i(0u); i<TDim::value; ++i)
-                    {
-                        blockThreadExtent[i] = maxCommonDivisor;
-                    }
-                }
-                else if(gridBlockExtentSubDivRestrictions == GridBlockExtentSubDivRestrictions::CloseToEqualExtent)
-                {
-                    for(typename TDim::value_type i(0u); i<TDim::value; ++i)
-                    {
-                        blockThreadExtent[i] =
-                            detail::nextDivisorLowerOrEqual(
-                                blockThreadExtent[i],
-                                gridThreadExtent[i]);
-                    }
-                }
-                else
-                {
-                    for(typename TDim::value_type i(0u); i<TDim::value; ++i)
-                    {
-                        blockThreadExtent[i] =
-                            detail::nextDivisorLowerOrEqual(
-                                blockThreadExtent[i],
-                                gridThreadExtent[i]);
-                    }
-                }
-            }
-
-            ///////////////////////////////////////////////////////////////////
-            // Compute the gridBlockExtent.
-
-            // Set the grid block extent (rounded to the next integer not less then the quotient.
-            auto gridBlockExtent(vec::Vec<TDim, TIdx>::ones());
-            for(typename TDim::value_type i(0u); i<TDim::value; ++i)
-            {
-                gridBlockExtent[i] =
-                    static_cast<TIdx>(
-                        std::ceil(
-                            static_cast<double>(gridThreadExtent[i])
-                            / static_cast<double>(blockThreadExtent[i])));
-            }
-
-            ///////////////////////////////////////////////////////////////////
-            // Return the final work division.
-            return
-                workdiv::WorkDivMembers<TDim, TIdx>(
-                    gridBlockExtent,
-                    blockThreadExtent,
-                    threadElemExtent);
-        }
-
-        //-----------------------------------------------------------------------------
-        //! \tparam TAcc The accelerator for which this work division has to be valid.
-        //! \tparam TGridElemExtent The type of the grid element extent.
-        //! \tparam TThreadElemExtent The type of the thread element extent.
-        //! \tparam TDev The type of the device.
-        //! \param dev
-        //!     The device the work division should be valid for.
-        //! \param gridElemExtent
-        //!     The full extent of elements in the grid.
-        //! \param threadElemExtents
-        //!     the number of elements computed per thread.
-        //! \param requireBlockThreadExtentToDivideGridThreadExtent
-        //!     If this is true, the grid thread extent will be multiples of the corresponding block thread extent.
-        //!     NOTE: If this is true and gridThreadExtent is prime (or otherwise bad chosen) in a dimension, the block thread extent will be one in this dimension.
-        //! \param gridBlockExtentSubDivRestrictions
-        //!     The grid block extent subdivision restrictions.
-        //! \return The work division.
-        template<
-            typename TAcc,
-            typename TGridElemExtent,
-            typename TThreadElemExtent,
-            typename TDev>
-        ALPAKA_FN_HOST auto getValidWorkDiv(
-            TDev const & dev,
-            TGridElemExtent const & gridElemExtent = TGridElemExtent(),
-            TThreadElemExtent const & threadElemExtents = TThreadElemExtent(),
-            bool requireBlockThreadExtentToDivideGridThreadExtent = true,
-            GridBlockExtentSubDivRestrictions gridBlockExtentSubDivRestrictions = GridBlockExtentSubDivRestrictions::Unrestricted)
-        -> workdiv::WorkDivMembers<dim::Dim<TGridElemExtent>, idx::Idx<TGridElemExtent>>
-        {
-            static_assert(
-                dim::Dim<TGridElemExtent>::value == dim::Dim<TAcc>::value,
-                "The dimension of TAcc and the dimension of TGridElemExtent have to be identical!");
-            static_assert(
-                dim::Dim<TThreadElemExtent>::value == dim::Dim<TAcc>::value,
-                "The dimension of TAcc and the dimension of TThreadElemExtent have to be identical!");
-            static_assert(
-                std::is_same<idx::Idx<TGridElemExtent>, idx::Idx<TAcc>>::value,
-                "The idx type of TAcc and the idx type of TGridElemExtent have to be identical!");
-            static_assert(
-                std::is_same<idx::Idx<TThreadElemExtent>, idx::Idx<TAcc>>::value,
-                "The idx type of TAcc and the idx type of TThreadElemExtent have to be identical!");
-
-            return subDivideGridElems(
-                extent::getExtentVec(gridElemExtent),
-                extent::getExtentVec(threadElemExtents),
-                acc::getAccDevProps<TAcc>(dev),
-                requireBlockThreadExtentToDivideGridThreadExtent,
-                gridBlockExtentSubDivRestrictions);
-        }
-
-        //-----------------------------------------------------------------------------
-        //! \tparam TDim The dimensionality of the accelerator device properties.
-        //! \tparam TIdx The idx type of the accelerator device properties.
-        //! \tparam TWorkDiv The type of the work division.
-        //! \param accDevProps The maxima for the work division.
-        //! \param workDiv The work division to test for validity.
-        //! \return If the work division is valid for the given accelerator device properties.
-        template<
-            typename TDim,
-            typename TIdx,
-            typename TWorkDiv>
-        ALPAKA_FN_HOST auto isValidWorkDiv(
-            acc::AccDevProps<TDim, TIdx> const & accDevProps,
-            TWorkDiv const & workDiv)
-        -> bool
-        {
-            // Store the maxima allowed for extents of grid, blocks and threads.
-            auto const gridBlockExtentMax(vec::subVecEnd<dim::Dim<TWorkDiv>>(accDevProps.m_gridBlockExtentMax));
-            auto const blockThreadExtentMax(vec::subVecEnd<dim::Dim<TWorkDiv>>(accDevProps.m_blockThreadExtentMax));
-            auto const threadElemExtentMax(vec::subVecEnd<dim::Dim<TWorkDiv>>(accDevProps.m_threadElemExtentMax));
-
-            // Get the extents of grid, blocks and threads of the work division to check.
-            auto const gridBlockExtent(getWorkDiv<Grid, Blocks>(workDiv));
-            auto const blockThreadExtent(getWorkDiv<Block, Threads>(workDiv));
-            auto const threadElemExtent(getWorkDiv<Block, Threads>(workDiv));
-
-            // Check that the maximal counts are satisfied.
-            if(accDevProps.m_gridBlockCountMax < gridBlockExtent.prod())
-            {
-                return false;
-            }
-            if(accDevProps.m_blockThreadCountMax < blockThreadExtent.prod())
-            {
-                return false;
-            }
-            if(accDevProps.m_threadElemCountMax < threadElemExtent.prod())
-            {
-                return false;
-            }
-
-            // Check that the extents for all dimensions are correct.
-            for(typename dim::Dim<TWorkDiv>::value_type i(0); i<dim::Dim<TWorkDiv>::value; ++i)
-            {
-                // No extent is allowed to be zero or greater then the allowed maximum.
-                if((gridBlockExtent[i] < 1)
-                    || (blockThreadExtent[i] < 1)
-                    || (threadElemExtent[i] < 1)
-                    || (gridBlockExtentMax[i] < gridBlockExtent[i])
-                    || (blockThreadExtentMax[i] < blockThreadExtent[i])
-                    || (threadElemExtentMax[i] < threadElemExtent[i]))
-                {
-                    return false;
-                }
-            }
-
-            return true;
-        }
-        //-----------------------------------------------------------------------------
-        //! \tparam TAcc The accelerator to test the validity on.
-        //! \param dev The device to test the work division for validity on.
-        //! \param workDiv The work division to test for validity.
-        //! \return If the work division is valid on this accelerator.
-        template<
-            typename TAcc,
-            typename TDev,
-            typename TWorkDiv>
-        ALPAKA_FN_HOST auto isValidWorkDiv(
-            TDev const & dev,
-            TWorkDiv const & workDiv)
-        -> bool
-        {
-            return
-                workdiv::isValidWorkDiv(
-                    acc::getAccDevProps<TAcc>(dev),
-                    workDiv);
-        }
-    }
-}
diff --git a/thirdParty/alpaka/include/alpaka/workdiv/WorkDivHipBuiltIn.hpp b/thirdParty/alpaka/include/alpaka/workdiv/WorkDivHipBuiltIn.hpp
deleted file mode 100644
index c0f9353651..0000000000
--- a/thirdParty/alpaka/include/alpaka/workdiv/WorkDivHipBuiltIn.hpp
+++ /dev/null
@@ -1,197 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Matthias Werner
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_HIP
-    #error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
-#endif
-
-#include <alpaka/workdiv/Traits.hpp>
-#include <alpaka/idx/Traits.hpp>
-
-#include <alpaka/core/Hip.hpp>
-#include <alpaka/core/Unused.hpp>
-#include <alpaka/vec/Vec.hpp>
-
-#include <hip/hip_runtime.h>
-
-
-namespace alpaka
-{
-    namespace workdiv
-    {
-        //#############################################################################
-        //! The GPU HIP accelerator work division.
-        template<
-            typename TDim,
-            typename TIdx>
-        class WorkDivHipBuiltIn : public concepts::Implements<ConceptWorkDiv, WorkDivHipBuiltIn<TDim, TIdx>>
-        {
-        public:
-            //-----------------------------------------------------------------------------
-            //! Default constructor.
-            __device__ WorkDivHipBuiltIn(
-                vec::Vec<TDim, TIdx> const & threadElemExtent) :
-                    m_threadElemExtent(threadElemExtent)
-            {}
-            //-----------------------------------------------------------------------------
-            //! Copy constructor.
-            __device__ WorkDivHipBuiltIn(WorkDivHipBuiltIn const &) = delete;
-            //-----------------------------------------------------------------------------
-            //! Move constructor.
-            __device__ WorkDivHipBuiltIn(WorkDivHipBuiltIn &&) = delete;
-            //-----------------------------------------------------------------------------
-            //! Copy assignment operator.
-            __device__ auto operator=(WorkDivHipBuiltIn const &) -> WorkDivHipBuiltIn & = delete;
-            //-----------------------------------------------------------------------------
-            //! Move assignment operator.
-            __device__ auto operator=(WorkDivHipBuiltIn &&) -> WorkDivHipBuiltIn & = delete;
-            //-----------------------------------------------------------------------------
-            //! Destructor.
-            /*virtual*/ ALPAKA_FN_HOST_ACC ~WorkDivHipBuiltIn() = default;
-
-        public:
-            // \TODO: Optimize! Add WorkDivHipBuiltInNoElems that has no member m_threadElemExtent as well as AccGpuHipRtNoElems.
-            // Use it instead of AccGpuHipRt if the thread element extent is one to reduce the register usage.
-            vec::Vec<TDim, TIdx> const & m_threadElemExtent;
-        };
-    }
-
-    namespace dim
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The GPU HIP accelerator work division dimension get trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct DimType<
-                workdiv::WorkDivHipBuiltIn<TDim, TIdx>>
-            {
-                using type = TDim;
-            };
-        }
-    }
-    namespace idx
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The GPU HIP accelerator work division idx type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct IdxType<
-                workdiv::WorkDivHipBuiltIn<TDim, TIdx>>
-            {
-                using type = TIdx;
-            };
-        }
-    }
-    namespace workdiv
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The GPU HIP accelerator work division grid block extent trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct GetWorkDiv<
-                WorkDivHipBuiltIn<TDim, TIdx>,
-                origin::Grid,
-                unit::Blocks>
-            {
-                //-----------------------------------------------------------------------------
-                //! \return The number of blocks in each dimension of the grid.
-                ALPAKA_NO_HOST_ACC_WARNING
-#if defined(BOOST_COMP_HCC) && BOOST_COMP_HCC /* hcc requires matching host-device signature */
-                ALPAKA_FN_HOST_ACC
-#else /* nvcc does not know about blockDim.x etc. on host */
-                __device__
-#endif
-                static auto getWorkDiv(
-                    WorkDivHipBuiltIn<TDim, TIdx> const & workDiv)
-                -> vec::Vec<TDim, TIdx>
-                {
-                    alpaka::ignore_unused(workDiv);
-
-                    return extent::getExtentVecEnd<TDim>(
-                        vec::Vec<
-                          std::integral_constant<typename TDim::value_type, 3>, TIdx>(
-                            static_cast<TIdx>(hipGridDim_z),
-                            static_cast<TIdx>(hipGridDim_y),
-                            static_cast<TIdx>(hipGridDim_x)));
-                }
-            };
-
-            //#############################################################################
-            //! The GPU HIP accelerator work division block thread extent trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct GetWorkDiv<
-                WorkDivHipBuiltIn<TDim, TIdx>,
-                origin::Block,
-                unit::Threads>
-            {
-                //-----------------------------------------------------------------------------
-                //! \return The number of threads in each dimension of a block.
-                ALPAKA_NO_HOST_ACC_WARNING
-#if defined(BOOST_COMP_HCC) && BOOST_COMP_HCC /* hcc requires matching host-device signature */
-                ALPAKA_FN_HOST_ACC
-#else /* nvcc does not know about blockDim.x etc. on host */
-                __device__
-#endif
-                static auto getWorkDiv(
-                    WorkDivHipBuiltIn<TDim, TIdx> const & workDiv)
-                -> vec::Vec<TDim, TIdx>
-                {
-                    alpaka::ignore_unused(workDiv);
-
-                    return extent::getExtentVecEnd<TDim>(
-                        vec::Vec<
-                          std::integral_constant<typename TDim::value_type, 3>, TIdx>(
-                            static_cast<TIdx>(hipBlockDim_z),
-                            static_cast<TIdx>(hipBlockDim_y),
-                            static_cast<TIdx>(hipBlockDim_x)));
-                }
-            };
-
-            //#############################################################################
-            //! The GPU HIP accelerator work division thread element extent trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct GetWorkDiv<
-                WorkDivHipBuiltIn<TDim, TIdx>,
-                origin::Thread,
-                unit::Elems>
-            {
-                //-----------------------------------------------------------------------------
-                //! \return The number of blocks in each dimension of the grid.
-                ALPAKA_NO_HOST_ACC_WARNING
-                ALPAKA_FN_HOST_ACC static auto getWorkDiv(
-                    WorkDivHipBuiltIn<TDim, TIdx> const & workDiv)
-                -> vec::Vec<TDim, TIdx>
-                {
-                    return workDiv.m_threadElemExtent;
-                }
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/alpaka/include/alpaka/workdiv/WorkDivMembers.hpp b/thirdParty/alpaka/include/alpaka/workdiv/WorkDivMembers.hpp
deleted file mode 100644
index 7815908dc2..0000000000
--- a/thirdParty/alpaka/include/alpaka/workdiv/WorkDivMembers.hpp
+++ /dev/null
@@ -1,218 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz, Matthias Werner
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#include <alpaka/workdiv/Traits.hpp>
-#include <alpaka/idx/Traits.hpp>
-
-#include <alpaka/vec/Vec.hpp>
-#include <alpaka/core/Common.hpp>
-
-#include <iosfwd>
-
-namespace alpaka
-{
-    namespace workdiv
-    {
-        //#############################################################################
-        //! A basic class holding the work division as grid block extent, block thread and thread element extent.
-        template<
-            typename TDim,
-            typename TIdx>
-        class WorkDivMembers : public concepts::Implements<ConceptWorkDiv, WorkDivMembers<TDim, TIdx>>
-        {
-        public:
-            //-----------------------------------------------------------------------------
-            ALPAKA_FN_HOST_ACC WorkDivMembers() = delete;
-            //-----------------------------------------------------------------------------
-            ALPAKA_NO_HOST_ACC_WARNING
-            template<
-                typename TGridBlockExtent,
-                typename TBlockThreadExtent,
-                typename TThreadElemExtent>
-            ALPAKA_FN_HOST_ACC explicit WorkDivMembers(
-                TGridBlockExtent const & gridBlockExtent = TGridBlockExtent(),
-                TBlockThreadExtent const & blockThreadExtent = TBlockThreadExtent(),
-                TThreadElemExtent const & threadElemExtent = TThreadElemExtent()) :
-                m_gridBlockExtent(extent::getExtentVecEnd<TDim>(gridBlockExtent)),
-                m_blockThreadExtent(extent::getExtentVecEnd<TDim>(blockThreadExtent)),
-                m_threadElemExtent(extent::getExtentVecEnd<TDim>(threadElemExtent))
-            {}
-            //-----------------------------------------------------------------------------
-            ALPAKA_NO_HOST_ACC_WARNING
-            ALPAKA_FN_HOST_ACC explicit WorkDivMembers(
-                WorkDivMembers const & other) :
-                    m_gridBlockExtent(other.m_gridBlockExtent),
-                    m_blockThreadExtent(other.m_blockThreadExtent),
-                    m_threadElemExtent(other.m_threadElemExtent)
-            {}
-            //-----------------------------------------------------------------------------
-            ALPAKA_NO_HOST_ACC_WARNING
-            template<
-                typename TWorkDiv>
-            ALPAKA_FN_HOST_ACC explicit WorkDivMembers(
-                TWorkDiv const & other) :
-                    m_gridBlockExtent(vec::subVecEnd<TDim>(getWorkDiv<Grid, Blocks>(other))),
-                    m_blockThreadExtent(vec::subVecEnd<TDim>(getWorkDiv<Block, Threads>(other))),
-                    m_threadElemExtent(vec::subVecEnd<TDim>(getWorkDiv<Thread, Elems>(other)))
-            {}
-            //-----------------------------------------------------------------------------
-            ALPAKA_NO_HOST_ACC_WARNING
-            ALPAKA_FN_HOST_ACC
-            WorkDivMembers(WorkDivMembers &&) = default;
-            //-----------------------------------------------------------------------------
-            ALPAKA_NO_HOST_ACC_WARNING
-            ALPAKA_FN_HOST_ACC
-            auto operator=(WorkDivMembers const &) -> WorkDivMembers & = default;
-            //-----------------------------------------------------------------------------
-            ALPAKA_NO_HOST_ACC_WARNING
-            ALPAKA_FN_HOST_ACC
-            auto operator=(WorkDivMembers &&) -> WorkDivMembers & = default;
-            //-----------------------------------------------------------------------------
-            ALPAKA_NO_HOST_ACC_WARNING
-            template<
-                typename TWorkDiv>
-            ALPAKA_FN_HOST_ACC auto operator=(
-                TWorkDiv const & other)
-            -> WorkDivMembers<TDim, TIdx> &
-            {
-                m_gridBlockExtent = vec::subVecEnd<TDim>(getWorkDiv<Grid, Blocks>(other));
-                m_blockThreadExtent = vec::subVecEnd<TDim>(getWorkDiv<Block, Threads>(other));
-                m_threadElemExtent = vec::subVecEnd<TDim>(getWorkDiv<Thread, Elems>(other));
-                return *this;
-            }
-            //-----------------------------------------------------------------------------
-            ALPAKA_NO_HOST_ACC_WARNING
-            /*virtual*/ ALPAKA_FN_HOST_ACC ~WorkDivMembers() = default;
-
-        public:
-            vec::Vec<TDim, TIdx> m_gridBlockExtent;
-            vec::Vec<TDim, TIdx> m_blockThreadExtent;
-            vec::Vec<TDim, TIdx> m_threadElemExtent;
-        };
-
-        //-----------------------------------------------------------------------------
-        template<
-            typename TDim,
-            typename TIdx>
-        ALPAKA_FN_HOST auto operator<<(
-            std::ostream & os,
-            WorkDivMembers<TDim, TIdx> const & workDiv)
-        -> std::ostream &
-        {
-            return (os
-                << "{gridBlockExtent: " << workDiv.m_gridBlockExtent
-                << ", blockThreadExtent: " << workDiv.m_blockThreadExtent
-                << ", threadElemExtent: " << workDiv.m_threadElemExtent
-                << "}");
-        }
-    }
-
-    namespace dim
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The WorkDivMembers dimension get trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct DimType<
-                workdiv::WorkDivMembers<TDim, TIdx>>
-            {
-                using type = TDim;
-            };
-        }
-    }
-    namespace idx
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The WorkDivMembers idx type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct IdxType<
-                workdiv::WorkDivMembers<TDim, TIdx>>
-            {
-                using type = TIdx;
-            };
-        }
-    }
-    namespace workdiv
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The WorkDivMembers grid block extent trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct GetWorkDiv<
-                WorkDivMembers<TDim, TIdx>,
-                origin::Grid,
-                unit::Blocks>
-            {
-                //-----------------------------------------------------------------------------
-                //! \return The number of blocks in each dimension of the grid.
-                ALPAKA_NO_HOST_ACC_WARNING
-                ALPAKA_FN_HOST_ACC static auto getWorkDiv(
-                    WorkDivMembers<TDim, TIdx> const & workDiv)
-                -> vec::Vec<TDim, TIdx>
-                {
-                    return workDiv.m_gridBlockExtent;
-                }
-            };
-
-            //#############################################################################
-            //! The WorkDivMembers block thread extent trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct GetWorkDiv<
-                WorkDivMembers<TDim, TIdx>,
-                origin::Block,
-                unit::Threads>
-            {
-                //-----------------------------------------------------------------------------
-                //! \return The number of threads in each dimension of a block.
-                ALPAKA_NO_HOST_ACC_WARNING
-                ALPAKA_FN_HOST_ACC static auto getWorkDiv(
-                    WorkDivMembers<TDim, TIdx> const & workDiv)
-                -> vec::Vec<TDim, TIdx>
-                {
-                    return workDiv.m_blockThreadExtent;
-                }
-            };
-
-            //#############################################################################
-            //! The WorkDivMembers thread element extent trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct GetWorkDiv<
-                WorkDivMembers<TDim, TIdx>,
-                origin::Thread,
-                unit::Elems>
-            {
-                //-----------------------------------------------------------------------------
-                //! \return The number of elements in each dimension of a thread.
-                ALPAKA_NO_HOST_ACC_WARNING
-                ALPAKA_FN_HOST_ACC static auto getWorkDiv(
-                    WorkDivMembers<TDim, TIdx> const & workDiv)
-                -> vec::Vec<TDim, TIdx>
-                {
-                    return workDiv.m_threadElemExtent;
-                }
-            };
-        }
-    }
-}
diff --git a/thirdParty/alpaka/script/travis/after_failure.sh b/thirdParty/alpaka/script/travis/after_failure.sh
deleted file mode 100755
index 4f791807d6..0000000000
--- a/thirdParty/alpaka/script/travis/after_failure.sh
+++ /dev/null
@@ -1,21 +0,0 @@
-#!/bin/bash
-
-#
-# Copyright 2018-2019 Benjamin Worpitz
-#
-# This file is part of Alpaka.
-#
-# This Source Code Form is subject to the terms of the Mozilla Public
-# License, v. 2.0. If a copy of the MPL was not distributed with this
-# file, You can obtain one at http://mozilla.org/MPL/2.0/.
-#
-
-source ./script/travis/set.sh
-
-if [ "$TRAVIS_OS_NAME" = "linux" ]
-then
-  sudo smem
-  sudo free -m -t
-  # show actions of the OOM killer
-  sudo dmesg
-fi
diff --git a/thirdParty/alpaka/script/travis/before_install.sh b/thirdParty/alpaka/script/travis/before_install.sh
deleted file mode 100755
index 3f87e6ed72..0000000000
--- a/thirdParty/alpaka/script/travis/before_install.sh
+++ /dev/null
@@ -1,168 +0,0 @@
-#!/bin/bash
-
-#
-# Copyright 2017-2019 Benjamin Worpitz
-#
-# This file is part of Alpaka.
-#
-# This Source Code Form is subject to the terms of the Mozilla Public
-# License, v. 2.0. If a copy of the MPL was not distributed with this
-# file, You can obtain one at http://mozilla.org/MPL/2.0/.
-#
-
-source ./script/travis/set.sh
-
-#-------------------------------------------------------------------------------
-# Those are set to g++/gcc within the git bash even though they are overwritten in the .travis.yml file.
-if [ "$TRAVIS_OS_NAME" = "windows" ]
-then
-    CXX=cl.exe
-    CC=cl.exe
-fi
-
-#-------------------------------------------------------------------------------
-# gcc
-if [ ! -z "${ALPAKA_CI_GCC_VER+x}" ]
-then
-    ALPAKA_CI_GCC_VER_SEMANTIC=( ${ALPAKA_CI_GCC_VER//./ } )
-    export ALPAKA_CI_GCC_VER_MAJOR="${ALPAKA_CI_GCC_VER_SEMANTIC[0]}"
-    echo ALPAKA_CI_GCC_VER_MAJOR: "${ALPAKA_CI_GCC_VER_MAJOR}"
-fi
-
-#-------------------------------------------------------------------------------
-# Boost.
-ALPAKA_CI_BOOST_BRANCH_MAJOR=${ALPAKA_CI_BOOST_BRANCH:6:1}
-echo ALPAKA_CI_BOOST_BRANCH_MAJOR: "${ALPAKA_CI_BOOST_BRANCH_MAJOR}"
-ALPAKA_CI_BOOST_BRANCH_MINOR=${ALPAKA_CI_BOOST_BRANCH:8:2}
-echo ALPAKA_CI_BOOST_BRANCH_MINOR: "${ALPAKA_CI_BOOST_BRANCH_MINOR}"
-
-#-------------------------------------------------------------------------------
-# CUDA
-export ALPAKA_CI_INSTALL_CUDA="OFF"
-if [ "${ALPAKA_ACC_GPU_CUDA_ENABLE}" == "ON" ]
-then
-    export ALPAKA_CI_INSTALL_CUDA="ON"
-fi
-if [ "${ALPAKA_ACC_GPU_HIP_ENABLE}" == "ON" ]
-then
-    if [ "${ALPAKA_HIP_PLATFORM}" == "nvcc" ]
-    then
-        export ALPAKA_CI_INSTALL_CUDA="ON"
-    fi
-fi
-
-#-------------------------------------------------------------------------------
-# HIP
-export ALPAKA_CI_INSTALL_HIP="OFF"
-if [ "${ALPAKA_ACC_GPU_HIP_ENABLE}" == "ON" ]
-then
-    export ALPAKA_CI_INSTALL_HIP="ON"
-
-    # if platform is nvcc, CUDA part is already processed in this file.
-    if [ "${ALPAKA_HIP_PLATFORM}" == "hcc" ]
-    then
-        echo "HIP(hcc) not supported yet."
-        exit 1
-    fi
-fi
-
-#-------------------------------------------------------------------------------
-# TBB
-export ALPAKA_CI_INSTALL_TBB="OFF"
-if [ ! -z "${ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLE+x}" ]
-then
-    if [ "${ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLE}" = "ON" ]
-    then
-        export ALPAKA_CI_INSTALL_TBB="ON"
-    fi
-else
-    # If the variable is not set, the backend will most probably be used by default so we install it.
-    export ALPAKA_CI_INSTALL_TBB="ON"
-fi
-
-#-------------------------------------------------------------------------------
-# Fibers
-export ALPAKA_CI_INSTALL_FIBERS="OFF"
-if [ ! -z "${ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE+x}" ]
-then
-    if [ "${ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE}" = "ON" ]
-    then
-        export ALPAKA_CI_INSTALL_FIBERS="ON"
-    fi
-else
-    # If the variable is not set, the backend will most probably be used by default so we install it.
-    export ALPAKA_CI_INSTALL_FIBERS="ON"
-fi
-
-
-# GCC-5.5 has broken avx512vlintrin.h in Release mode with NVCC 9.X
-#   https://gcc.gnu.org/bugzilla/show_bug.cgi?id=76731
-#   https://github.com/tensorflow/tensorflow/issues/10220
-if [ "${ALPAKA_CI_INSTALL_CUDA}" == "ON" ]
-then
-    if [ "${CXX}" == "g++" ]
-    then
-        if (( "${ALPAKA_CI_GCC_VER_MAJOR}" == 5 ))
-        then
-            if [ "${ALPAKA_CUDA_COMPILER}" == "nvcc" ]
-            then
-                if [ "${CMAKE_BUILD_TYPE}" == "Release" ]
-                then
-                    export CMAKE_BUILD_TYPE=Debug
-                fi
-            fi
-        fi
-    fi
-fi
-
-#-------------------------------------------------------------------------------
-if [ "$TRAVIS_OS_NAME" = "linux" ]
-then
-    if [ "${ALPAKA_CI_STDLIB}" == "libc++" ]
-    then
-        if [ "${CXX}" == "g++" ]
-        then
-            echo "using libc++ with g++ not yet supported."
-            exit 1
-        fi
-
-        if [ "${ALPAKA_CI_DOCKER_BASE_IMAGE_NAME}" == "ubuntu:14.04" ]
-        then
-            echo "using libc++ with ubuntu:14.04 not supported."
-            exit 1
-        fi
-
-        if (( ( ( "${ALPAKA_CI_BOOST_BRANCH_MAJOR}" == 1 ) && ( "${ALPAKA_CI_BOOST_BRANCH_MINOR}" < 65 ) ) || ( "${ALPAKA_CI_BOOST_BRANCH_MAJOR}" < 1 ) ))
-        then
-            echo "using libc++ with boost < 1.65 is not supported."
-            exit 1
-        fi
-    fi
-
-    if [ "${ALPAKA_CI_STDLIB}" == "libstdc++" ]
-    then
-        if [ ! -z "${ALPAKA_CXX_STANDARD+x}" ]
-        then
-            if (( "${ALPAKA_CXX_STANDARD}" >= 17 ))
-            then
-                if [ "${CXX}" == "clang++" ]
-                then
-                    if (( "${ALPAKA_CI_CLANG_LIBSTDCPP_VERSION}" < 7 ))
-                    then
-                        echo "Clang used in c++17 mode requires libstdc++-7 or newer."
-                        exit 1
-                    fi
-                fi
-                if [ "${ALPAKA_CI_INSTALL_FIBERS}" == "ON" ]
-                then
-                    if (( ( ( "${ALPAKA_CI_BOOST_BRANCH_MAJOR}" == 1 ) && ( "${ALPAKA_CI_BOOST_BRANCH_MINOR}" < 67 ) ) || ( "${ALPAKA_CI_BOOST_BRANCH_MAJOR}" < 1 ) ))
-                    then
-                        # https://github.com/boostorg/coroutine2/issues/26
-                        echo "libstdc++ in c++17 mode is not compatible with boost.fibers in boost-1.66 and below."
-                        exit 1
-                    fi
-                fi
-            fi
-        fi
-    fi
-fi
\ No newline at end of file
diff --git a/thirdParty/alpaka/script/travis/docker_install.sh b/thirdParty/alpaka/script/travis/docker_install.sh
deleted file mode 100755
index 02b267cf46..0000000000
--- a/thirdParty/alpaka/script/travis/docker_install.sh
+++ /dev/null
@@ -1,127 +0,0 @@
-#!/bin/bash
-
-#
-# Copyright 2017-2019 Benjamin Worpitz
-#
-# This file is part of Alpaka.
-#
-# This Source Code Form is subject to the terms of the Mozilla Public
-# License, v. 2.0. If a copy of the MPL was not distributed with this
-# file, You can obtain one at http://mozilla.org/MPL/2.0/.
-#
-
-source ./script/travis/set.sh
-
-ls "${ALPAKA_CI_DOCKER_CACHE_DIR}"
-
-ALPAKA_DOCKER_BUILD_REQUIRED=1
-
-if [ -f "${ALPAKA_CI_DOCKER_CACHE_IMAGE_FILE_PATH}" ]
-then
-    # NOTE: The image being available is not the only precondition. If anything within any of the scripts has changed in comparison to the ones that created the docker image, we might have to rebuild the image.
-    ALPAKA_DOCKER_BUILD_REQUIRED=0
-fi
-
-# runtime and compile time options
-ALPAKA_DOCKER_ENV_LIST=()
-ALPAKA_DOCKER_ENV_LIST+=("--env" "CC=${CC}")
-ALPAKA_DOCKER_ENV_LIST+=("--env" "CXX=${CXX}")
-ALPAKA_DOCKER_ENV_LIST+=("--env" "TRAVIS_OS_NAME=${TRAVIS_OS_NAME}")
-ALPAKA_DOCKER_ENV_LIST+=("--env" "CMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}")
-ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=${ALPAKA_CI_DOCKER_BASE_IMAGE_NAME}")
-ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CI_ANALYSIS=${ALPAKA_CI_ANALYSIS}")
-ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CI_BOOST_BRANCH=${ALPAKA_CI_BOOST_BRANCH}")
-ALPAKA_DOCKER_ENV_LIST+=("--env" "BOOST_ROOT=${BOOST_ROOT}")
-ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CI_BOOST_LIB_DIR=${ALPAKA_CI_BOOST_LIB_DIR}")
-ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CI_CLANG_DIR=${ALPAKA_CI_CLANG_DIR}")
-if [ ! -z "${ALPAKA_CI_CLANG_VER+x}" ]
-then
-    ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CI_CLANG_VER=${ALPAKA_CI_CLANG_VER}")
-fi
-ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CI_STDLIB=${ALPAKA_CI_STDLIB}")
-if [ ! -z ${ALPAKA_CI_CLANG_LIBSTDCPP_VERSION+x} ]
-then
-    ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CI_CLANG_LIBSTDCPP_VERSION=${ALPAKA_CI_CLANG_LIBSTDCPP_VERSION}")
-fi
-ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CI_CMAKE_VER=${ALPAKA_CI_CMAKE_VER}")
-ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CI_CMAKE_DIR=${ALPAKA_CI_CMAKE_DIR}")
-if [ ! -z "${ALPAKA_CI_GCC_VER+x}" ]
-then
-    ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CI_GCC_VER=${ALPAKA_CI_GCC_VER}")
-fi
-if [ ! -z "${ALPAKA_CI_SANITIZERS+x}" ]
-then
-    ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CI_SANITIZERS=${ALPAKA_CI_SANITIZERS}")
-fi
-if [ ! -z "${ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLE+x}" ]
-then
-    ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLE=${ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLE}")
-fi
-if [ ! -z "${ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLE+x}" ]
-then
-    ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLE=${ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLE}")
-fi
-if [ ! -z "${ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE+x}" ]
-then
-    ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE=${ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE}")
-fi
-if [ ! -z "${ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE+x}" ]
-then
-    ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE=${ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE}")
-fi
-if [ ! -z "${ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE+x}" ]
-then
-    ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE=${ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE}")
-fi
-if [ ! -z "${ALPAKA_ACC_CPU_BT_OMP4_ENABLE+x}" ]
-then
-    ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_ACC_CPU_BT_OMP4_ENABLE=${ALPAKA_ACC_CPU_BT_OMP4_ENABLE}")
-fi
-if [ ! -z "${ALPAKA_ACC_GPU_CUDA_ENABLE+x}" ]
-then
-    ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_ACC_GPU_CUDA_ENABLE=${ALPAKA_ACC_GPU_CUDA_ENABLE}")
-fi
-if [ ! -z "${ALPAKA_ACC_GPU_HIP_ENABLE+x}" ]
-then
-    ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_ACC_GPU_HIP_ENABLE=${ALPAKA_ACC_GPU_HIP_ENABLE}")
-fi
-if [ ! -z "${ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLE+x}" ]
-then
-    ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLE=${ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLE}")
-fi
-ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CI_INSTALL_CUDA=${ALPAKA_CI_INSTALL_CUDA}")
-if [ "${ALPAKA_CI_INSTALL_CUDA}" == "ON" ]
-then
-    ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CI_CUDA_DIR=${ALPAKA_CI_CUDA_DIR}")
-    ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CUDA_VERSION=${ALPAKA_CUDA_VERSION}")
-    if [ ! -z "${ALPAKA_CUDA_COMPILER+x}" ]
-    then
-        ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CUDA_COMPILER=${ALPAKA_CUDA_COMPILER}")
-    fi
-fi
-ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CI_INSTALL_HIP=${ALPAKA_CI_INSTALL_HIP}")
-if [ "${ALPAKA_CI_INSTALL_HIP}" == "ON" ]
-then
-    ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CI_HIP_ROOT_DIR=${ALPAKA_CI_HIP_ROOT_DIR}")
-    ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CI_HIP_BRANCH=${ALPAKA_CI_HIP_BRANCH}")
-    ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_HIP_PLATFORM=${ALPAKA_HIP_PLATFORM}")
-fi
-ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CI_INSTALL_TBB=${ALPAKA_CI_INSTALL_TBB}")
-ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CI_INSTALL_FIBERS=${ALPAKA_CI_INSTALL_FIBERS}")
-
-if [ "${ALPAKA_DOCKER_BUILD_REQUIRED}" -eq 1 ]
-then
-  docker run -v "$(pwd)":"$(pwd)" -w "$(pwd)" "${ALPAKA_DOCKER_ENV_LIST[@]}" "${ALPAKA_CI_DOCKER_BASE_IMAGE_NAME}" /bin/bash ./script/travis/install.sh
-
-  ALPAKA_DOCKER_CONTAINER_NAME=$(docker ps -l -q)
-  docker commit "${ALPAKA_DOCKER_CONTAINER_NAME}" "${ALPAKA_CI_DOCKER_IMAGE_NAME}"
-
-  # delete the container and the base image to save disc space
-  docker stop "${ALPAKA_DOCKER_CONTAINER_NAME}"
-  docker rm "${ALPAKA_DOCKER_CONTAINER_NAME}"
-  docker rmi "${ALPAKA_CI_DOCKER_BASE_IMAGE_NAME}"
-
-  docker save "${ALPAKA_CI_DOCKER_IMAGE_NAME}" | gzip > "${ALPAKA_CI_DOCKER_CACHE_IMAGE_FILE_PATH}"
-
-  docker images
-fi
diff --git a/thirdParty/alpaka/script/travis/docker_run.sh b/thirdParty/alpaka/script/travis/docker_run.sh
deleted file mode 100755
index 59a2ec8575..0000000000
--- a/thirdParty/alpaka/script/travis/docker_run.sh
+++ /dev/null
@@ -1,162 +0,0 @@
-#!/bin/bash
-
-#
-# Copyright 2017-2019 Benjamin Worpitz
-#
-# This file is part of Alpaka.
-#
-# This Source Code Form is subject to the terms of the Mozilla Public
-# License, v. 2.0. If a copy of the MPL was not distributed with this
-# file, You can obtain one at http://mozilla.org/MPL/2.0/.
-#
-
-source ./script/travis/set.sh
-
-# runtime and compile time options
-ALPAKA_DOCKER_ENV_LIST=()
-ALPAKA_DOCKER_ENV_LIST+=("--env" "CC=${CC}")
-ALPAKA_DOCKER_ENV_LIST+=("--env" "CXX=${CXX}")
-ALPAKA_DOCKER_ENV_LIST+=("--env" "TRAVIS_OS_NAME=${TRAVIS_OS_NAME}")
-ALPAKA_DOCKER_ENV_LIST+=("--env" "CMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}")
-ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CI_ANALYSIS=${ALPAKA_CI_ANALYSIS}")
-ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CI_BOOST_BRANCH=${ALPAKA_CI_BOOST_BRANCH}")
-ALPAKA_DOCKER_ENV_LIST+=("--env" "BOOST_ROOT=${BOOST_ROOT}")
-ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CI_BOOST_LIB_DIR=${ALPAKA_CI_BOOST_LIB_DIR}")
-ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CI_CLANG_DIR=${ALPAKA_CI_CLANG_DIR}")
-if [ ! -z "${ALPAKA_CI_CLANG_VER+x}" ]
-then
-    ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CI_CLANG_VER=${ALPAKA_CI_CLANG_VER}")
-fi
-ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CI_STDLIB=${ALPAKA_CI_STDLIB}")
-if [ ! -z "${ALPAKA_CI_CLANG_LIBSTDCPP_VERSION+x}" ]
-then
-    ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CI_CLANG_LIBSTDCPP_VERSION=${ALPAKA_CI_CLANG_LIBSTDCPP_VERSION}")
-fi
-ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CI_CMAKE_VER=${ALPAKA_CI_CMAKE_VER}")
-ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CI_CMAKE_DIR=${ALPAKA_CI_CMAKE_DIR}")
-if [ ! -z "${ALPAKA_CI_GCC_VER+x}" ]
-then
-    ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CI_GCC_VER=${ALPAKA_CI_GCC_VER}")
-fi
-if [ ! -z "${ALPAKA_CI_SANITIZERS+x}" ]
-then
-    ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CI_SANITIZERS=${ALPAKA_CI_SANITIZERS}")
-fi
-if [ ! -z "${ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLE+x}" ]
-then
-    ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLE=${ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLE}")
-fi
-if [ ! -z "${ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLE+x}" ]
-then
-    ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLE=${ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLE}")
-fi
-if [ ! -z "${ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE+x}" ]
-then
-    ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE=${ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE}")
-fi
-if [ ! -z "${ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE+x}" ]
-then
-    ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE=${ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE}")
-fi
-if [ ! -z "${ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE+x}" ]
-then
-    ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE=${ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE}")
-fi
-if [ ! -z "${ALPAKA_ACC_CPU_BT_OMP4_ENABLE+x}" ]
-then
-    ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_ACC_CPU_BT_OMP4_ENABLE=${ALPAKA_ACC_CPU_BT_OMP4_ENABLE}")
-fi
-if [ ! -z "${ALPAKA_ACC_GPU_CUDA_ENABLE+x}" ]
-then
-    ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_ACC_GPU_CUDA_ENABLE=${ALPAKA_ACC_GPU_CUDA_ENABLE}")
-fi
-if [ ! -z "${ALPAKA_ACC_GPU_HIP_ENABLE+x}" ]
-then
-    ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_ACC_GPU_HIP_ENABLE=${ALPAKA_ACC_GPU_HIP_ENABLE}")
-fi
-if [ ! -z "${ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLE+x}" ]
-then
-    ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLE=${ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLE}")
-fi
-ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CI_INSTALL_CUDA=${ALPAKA_CI_INSTALL_CUDA}")
-if [ "${ALPAKA_CI_INSTALL_CUDA}" == "ON" ]
-then
-    ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CI_CUDA_DIR=${ALPAKA_CI_CUDA_DIR}")
-    ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CUDA_VERSION=${ALPAKA_CUDA_VERSION}")
-    if [ ! -z "${ALPAKA_CUDA_COMPILER+x}" ]
-    then
-        ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CUDA_COMPILER=${ALPAKA_CUDA_COMPILER}")
-    fi
-    if [ ! -z "${ALPAKA_CUDA_ARCH+x}" ]
-    then
-        ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CUDA_ARCH=${ALPAKA_CUDA_ARCH}")
-    fi
-fi
-ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CI_INSTALL_HIP=${ALPAKA_CI_INSTALL_HIP}")
-if [ "${ALPAKA_CI_INSTALL_HIP}" == "ON" ]
-then
-    ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CI_HIP_ROOT_DIR=${ALPAKA_CI_HIP_ROOT_DIR}")
-    ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_HIP_PLATFORM=${ALPAKA_HIP_PLATFORM}")
-fi
-
-# runtime only options
-ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CI=${ALPAKA_CI}")
-if [ ! -z "${ALPAKA_DEBUG+x}" ]
-then
-    ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_DEBUG=${ALPAKA_DEBUG}")
-fi
-if [ ! -z "${ALPAKA_CXX_STANDARD+x}" ]
-then
-    ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CXX_STANDARD=${ALPAKA_CXX_STANDARD}")
-fi
-if [ ! -z "${OMP_NUM_THREADS+x}" ]
-then
-    ALPAKA_DOCKER_ENV_LIST+=("--env" "OMP_NUM_THREADS=${OMP_NUM_THREADS}")
-fi
-if [ ! -z "${ALPAKA_ACC_GPU_CUDA_ONLY_MODE+x}" ]
-then
-    ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_ACC_GPU_CUDA_ONLY_MODE=${ALPAKA_ACC_GPU_CUDA_ONLY_MODE}")
-fi
-if [ ! -z "${ALPAKA_ACC_GPU_HIP_ONLY_MODE+x}" ]
-then
-    ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_ACC_GPU_HIP_ONLY_MODE=${ALPAKA_ACC_GPU_HIP_ONLY_MODE}")
-fi
-if [ ! -z "${ALPAKA_CUDA_FAST_MATH+x}" ]
-then
-    ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CUDA_FAST_MATH=${ALPAKA_CUDA_FAST_MATH}")
-fi
-if [ ! -z "${ALPAKA_CUDA_FTZ+x}" ]
-then
-    ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CUDA_FTZ=${ALPAKA_CUDA_FTZ}")
-fi
-if [ ! -z "${ALPAKA_CUDA_SHOW_REGISTER+x}" ]
-then
-    ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CUDA_SHOW_REGISTER=${ALPAKA_CUDA_SHOW_REGISTER}")
-fi
-if [ ! -z "${ALPAKA_CUDA_KEEP_FILES+x}" ]
-then
-    ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CUDA_KEEP_FILES=${ALPAKA_CUDA_KEEP_FILES}")
-fi
-if [ ! -z "${ALPAKA_CUDA_NVCC_EXPT_EXTENDED_LAMBDA+x}" ]
-then
-    ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CUDA_NVCC_EXPT_EXTENDED_LAMBDA=${ALPAKA_CUDA_NVCC_EXPT_EXTENDED_LAMBDA}")
-fi
-if [ ! -z "${ALPAKA_CUDA_NVCC_EXPT_RELAXED_CONSTEXPR+x}" ]
-then
-    ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CUDA_NVCC_EXPT_RELAXED_CONSTEXPR=${ALPAKA_CUDA_NVCC_EXPT_RELAXED_CONSTEXPR}")
-fi
-if [ ! -z "${ALPAKA_CUDA_NVCC_SEPARABLE_COMPILATION+x}" ]
-then
-    ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CUDA_NVCC_SEPARABLE_COMPILATION=${ALPAKA_CUDA_NVCC_SEPARABLE_COMPILATION}")
-fi
-
-docker images
-docker images -q ${ALPAKA_CI_DOCKER_IMAGE_NAME}
-
-# If we have created the image in the current run, we do not have to load it again, because it is already available.
-if [[ "$(docker images -q ${ALPAKA_CI_DOCKER_IMAGE_NAME} 2> /dev/null)" == "" ]]; then
-    gzip -dc "${ALPAKA_CI_DOCKER_CACHE_IMAGE_FILE_PATH}" | docker load
-fi
-
-# --cap-add SYS_PTRACE is required for LSAN to work
-docker run --cap-add SYS_PTRACE -v "$(pwd)":"$(pwd)" -w "$(pwd)" "${ALPAKA_DOCKER_ENV_LIST[@]}" --rm "${ALPAKA_CI_DOCKER_IMAGE_NAME}" /bin/bash ./script/travis/run.sh
diff --git a/thirdParty/alpaka/script/travis/install.sh b/thirdParty/alpaka/script/travis/install.sh
deleted file mode 100755
index 6115216845..0000000000
--- a/thirdParty/alpaka/script/travis/install.sh
+++ /dev/null
@@ -1,66 +0,0 @@
-#!/bin/bash
-
-#
-# Copyright 2017-2019 Benjamin Worpitz
-#
-# This file is part of Alpaka.
-#
-# This Source Code Form is subject to the terms of the Mozilla Public
-# License, v. 2.0. If a copy of the MPL was not distributed with this
-# file, You can obtain one at http://mozilla.org/MPL/2.0/.
-#
-
-source ./script/travis/travis_retry.sh
-
-source ./script/travis/set.sh
-
-: ${ALPAKA_CI_ANALYSIS?"ALPAKA_CI_ANALYSIS must be specified"}
-: ${ALPAKA_CI_INSTALL_CUDA?"ALPAKA_CI_INSTALL_CUDA must be specified"}
-: ${ALPAKA_CI_INSTALL_HIP?"ALPAKA_CI_INSTALL_HIP must be specified"}
-: ${ALPAKA_CI_INSTALL_TBB?"ALPAKA_CI_INSTALL_TBB must be specified"}
-
-if [ "$TRAVIS_OS_NAME" = "linux" ]
-then
-    travis_retry apt-get -y --quiet update
-    travis_retry apt-get -y install sudo
-
-    # software-properties-common: 'add-apt-repository' and certificates for wget https download
-    # binutils: ld
-    # xz-utils: xzcat
-    travis_retry sudo apt-get -y --quiet --allow-unauthenticated --no-install-recommends install software-properties-common wget git make binutils xz-utils
-fi
-
-if [ "$TRAVIS_OS_NAME" = "linux" ] || [ "$TRAVIS_OS_NAME" = "windows" ]
-then
-    ./script/travis/install_cmake.sh
-fi
-
-if [ "$TRAVIS_OS_NAME" = "linux" ]
-then
-    if [ "${ALPAKA_CI_ANALYSIS}" == "ON" ] ;then ./script/travis/install_analysis.sh ;fi
-fi
-
-# Install CUDA before installing gcc as it installs gcc-4.8 and overwrites our selected compiler
-if [ "${ALPAKA_CI_INSTALL_CUDA}" == "ON" ] ;then ./script/travis/install_cuda.sh ;fi
-
-if [ "$TRAVIS_OS_NAME" = "linux" ]
-then
-    if [ "${CXX}" == "g++" ] ;then ./script/travis/install_gcc.sh ;fi
-    if [ "${CXX}" == "clang++" ] ;then source ./script/travis/install_clang.sh ;fi
-    if [ "${ALPAKA_CI_INSTALL_HIP}" == "ON" ] ;then ./script/travis/install_hip.sh ;fi
-fi
-
-if [ "${ALPAKA_CI_INSTALL_TBB}" = "ON" ]
-then
-    ./script/travis/install_tbb.sh
-fi
-
-./script/travis/install_boost.sh
-
-if [ "$TRAVIS_OS_NAME" = "linux" ]
-then
-    # Minimize docker image size
-    sudo apt-get --quiet --purge autoremove
-    sudo apt-get clean
-    rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
-fi
diff --git a/thirdParty/alpaka/script/travis/install_analysis.sh b/thirdParty/alpaka/script/travis/install_analysis.sh
deleted file mode 100755
index adec4c9ca0..0000000000
--- a/thirdParty/alpaka/script/travis/install_analysis.sh
+++ /dev/null
@@ -1,25 +0,0 @@
-#!/bin/bash
-
-#
-# Copyright 2017-2019 Benjamin Worpitz
-#
-# This file is part of Alpaka.
-#
-# This Source Code Form is subject to the terms of the Mozilla Public
-# License, v. 2.0. If a copy of the MPL was not distributed with this
-# file, You can obtain one at http://mozilla.org/MPL/2.0/.
-#
-
-source ./script/travis/travis_retry.sh
-
-source ./script/travis/set.sh
-
-#-------------------------------------------------------------------------------
-# Install sloc
-travis_retry sudo apt-get -y --quiet --allow-unauthenticated --no-install-recommends install sloccount
-sloccount --version
-
-#-------------------------------------------------------------------------------
-# Install shellcheck
-travis_retry sudo apt-get -y --quiet --allow-unauthenticated --no-install-recommends install shellcheck
-shellcheck --version
diff --git a/thirdParty/alpaka/script/travis/install_boost.sh b/thirdParty/alpaka/script/travis/install_boost.sh
deleted file mode 100755
index 86b720452d..0000000000
--- a/thirdParty/alpaka/script/travis/install_boost.sh
+++ /dev/null
@@ -1,147 +0,0 @@
-#!/bin/bash
-
-#
-# Copyright 2017-2019 Benjamin Worpitz
-#
-# This file is part of Alpaka.
-#
-# This Source Code Form is subject to the terms of the Mozilla Public
-# License, v. 2.0. If a copy of the MPL was not distributed with this
-# file, You can obtain one at http://mozilla.org/MPL/2.0/.
-#
-
-source ./script/travis/set.sh
-
-: "${BOOST_ROOT?'BOOST_ROOT must be specified'}"
-: "${ALPAKA_CI_BOOST_LIB_DIR?'ALPAKA_CI_BOOST_LIB_DIR must be specified'}"
-if [ "$TRAVIS_OS_NAME" = "linux" ]
-then
-    : "${ALPAKA_CI_STDLIB?'ALPAKA_CI_STDLIB must be specified'}"
-fi
-: "${CMAKE_BUILD_TYPE?'CMAKE_BUILD_TYPE must be specified'}"
-: "${CXX?'CXX must be specified'}"
-: "${CC?'CC must be specified'}"
-: "${ALPAKA_CI_INSTALL_FIBERS?'ALPAKA_CI_INSTALL_FIBERS must be specified'}"
-
-git clone -b "${ALPAKA_CI_BOOST_BRANCH}" --quiet --recursive --single-branch --depth 1 https://github.com/boostorg/boost.git "${BOOST_ROOT}"
-
-# Bootstrap boost.
-if [ "$TRAVIS_OS_NAME" = "windows" ]
-then
-    (cd "${BOOST_ROOT}"; ./bootstrap.bat)
-else
-    (cd "${BOOST_ROOT}"; sudo ./bootstrap.sh --with-toolset="${CC}")
-fi
-(cd "${BOOST_ROOT}"; cat ./bootstrap.log)
-
-# Create file links.
-if [ "$TRAVIS_OS_NAME" = "windows" ]
-then
-    (cd "${BOOST_ROOT}"; ./b2 headers)
-else
-    (cd "${BOOST_ROOT}"; sudo ./b2 headers)
-fi
-
-# Only build boost if we need some of the non-header-only libraries
-if [ "${ALPAKA_CI_INSTALL_FIBERS}" == "ON" ]
-then
-    # Prepare the library destination directory.
-    mkdir -p "${ALPAKA_CI_BOOST_LIB_DIR}"
-
-    # Create the boost build command.
-    ALPAKA_BOOST_B2=""
-    ALPAKA_BOOST_B2_CFLAGS=""
-    ALPAKA_BOOST_B2_CXXFLAGS=""
-
-    if [ "$TRAVIS_OS_NAME" = "linux" ] || [ "$TRAVIS_OS_NAME" = "osx" ]
-    then
-        ALPAKA_BOOST_B2+="sudo "
-    fi
-    ALPAKA_BOOST_B2+="./b2 -j1"
-
-    if [ "$TRAVIS_OS_NAME" = "linux" ] || [ "$TRAVIS_OS_NAME" = "osx" ]
-    then
-        ALPAKA_BOOST_B2_CFLAGS+="-fPIC"
-        ALPAKA_BOOST_B2_CXXFLAGS+="-fPIC"
-    fi
-
-    if [ "$TRAVIS_OS_NAME" = "windows" ]
-    then
-        ALPAKA_BOOST_B2+=" --layout=versioned --toolset=msvc-14.1"
-    else
-        ALPAKA_BOOST_B2+=" --layout=tagged --toolset=${CC}"
-    fi
-
-    # TODO: Win32: adress-model=32
-    ALPAKA_BOOST_B2+=" architecture=x86 address-model=64 link=static threading=multi runtime-link=shared"
-
-    if [ "$TRAVIS_OS_NAME" = "windows" ]
-    then
-        ALPAKA_BOOST_B2+=" define=_CRT_NONSTDC_NO_DEPRECATE define=_CRT_SECURE_NO_DEPRECATE define=_SCL_SECURE_NO_DEPRECAT define=BOOST_USE_WINFIBERS define=_ENABLE_EXTENDED_ALIGNED_STORAGE"
-    fi
-
-    if [ "${CMAKE_BUILD_TYPE}" == "Debug" ]
-    then
-      ALPAKA_BOOST_B2+=" variant=debug"
-    else
-      ALPAKA_BOOST_B2+=" variant=release"
-    fi
-
-    # Clang is not supported by the FindBoost script.
-    # boost (especially old versions) produces too much warnings when using clang (newer versions) so that the 4 MiB log is too short.
-    if [ "${CXX}" == "clang++" ]
-    then
-        ALPAKA_BOOST_B2_CXXFLAGS+=" -Wunused-private-field -Wno-unused-local-typedef -Wno-c99-extensions -Wno-variadic-macros"
-    fi
-    # Select the libraries required.
-    # If the variable is not set, the backend will most probably be used by default so we install it.
-    if [ "${ALPAKA_CI_INSTALL_FIBERS}" == "ON" ]
-    then
-        if [ "$TRAVIS_OS_NAME" = "linux" ]
-        then
-            ALPAKA_BOOST_B2_CXXFLAGS+=" -std=c++11"
-        fi
-        ALPAKA_BOOST_B2+=" --with-fiber --with-context --with-thread --with-atomic --with-system --with-chrono --with-date_time"
-    fi
-    if [ "${ALPAKA_BOOST_B2_CFLAGS}" != "" ]
-    then
-        ALPAKA_BOOST_B2+=' cflags="'
-        ALPAKA_BOOST_B2+="${ALPAKA_BOOST_B2_CFLAGS}"
-        ALPAKA_BOOST_B2+='"'
-    fi
-    if [ "${ALPAKA_BOOST_B2_CXXFLAGS}" != "" ]
-    then
-        ALPAKA_BOOST_B2+=' cxxflags="'
-        ALPAKA_BOOST_B2+="${ALPAKA_BOOST_B2_CXXFLAGS}"
-        if [ "$TRAVIS_OS_NAME" = "linux" ]
-        then
-            if [ "${ALPAKA_CI_STDLIB}" == "libc++" ]
-            then
-                ALPAKA_BOOST_B2+=" -stdlib=libc++"
-            fi
-        fi
-        ALPAKA_BOOST_B2+='"'
-    fi
-
-    if [ "$TRAVIS_OS_NAME" = "linux" ]
-    then
-        if [ "${ALPAKA_CI_STDLIB}" == "libc++" ]
-        then
-            ALPAKA_BOOST_B2+=' linkflags="-stdlib=libc++"'
-        fi
-    fi
-
-    ALPAKA_BOOST_B2+=" --stagedir=${ALPAKA_CI_BOOST_LIB_DIR} stage"
-
-    # Build boost.
-    #echo "ALPAKA_BOOST_B2=${ALPAKA_BOOST_B2}"
-    (cd "${BOOST_ROOT}"; eval "${ALPAKA_BOOST_B2}")
-
-    # Clean the intermediate build files.
-    if [ "$TRAVIS_OS_NAME" = "windows" ]
-    then
-        rm -rf bin.v2
-    else
-        sudo rm -rf bin.v2
-    fi
-fi
diff --git a/thirdParty/alpaka/script/travis/install_clang.sh b/thirdParty/alpaka/script/travis/install_clang.sh
deleted file mode 100755
index c7b03de736..0000000000
--- a/thirdParty/alpaka/script/travis/install_clang.sh
+++ /dev/null
@@ -1,62 +0,0 @@
-#!/bin/bash
-
-#
-# Copyright 2017-2019 Benjamin Worpitz
-#
-# This file is part of Alpaka.
-#
-# This Source Code Form is subject to the terms of the Mozilla Public
-# License, v. 2.0. If a copy of the MPL was not distributed with this
-# file, You can obtain one at http://mozilla.org/MPL/2.0/.
-#
-
-source ./script/travis/travis_retry.sh
-
-source ./script/travis/set.sh
-
-: "${ALPAKA_CI_CLANG_DIR?'ALPAKA_CI_CLANG_DIR must be specified'}"
-: "${ALPAKA_CI_CLANG_VER?'ALPAKA_CI_CLANG_VER must be specified'}"
-: "${ALPAKA_CI_CLANG_LIBSTDCPP_VERSION?'ALPAKA_CI_CLANG_LIBSTDCPP_VERSION must be specified'}"
-: "${ALPAKA_CI_STDLIB?'ALPAKA_CI_STDLIB must be specified'}"
-: "${CXX?'CXX must be specified'}"
-
-if [ -z "$(ls -A "${ALPAKA_CI_CLANG_DIR}")" ]
-then
-    ALPAKA_CLANG_PKG_FILE_NAME=clang+llvm-${ALPAKA_CI_CLANG_VER}-x86_64-linux-gnu-ubuntu-16.04.tar.xz
-    travis_retry wget --no-verbose "http://llvm.org/releases/${ALPAKA_CI_CLANG_VER}/${ALPAKA_CLANG_PKG_FILE_NAME}"
-    mkdir -p "${ALPAKA_CI_CLANG_DIR}"
-    xzcat "${ALPAKA_CLANG_PKG_FILE_NAME}" | tar -xf - --strip 1 -C "${ALPAKA_CI_CLANG_DIR}"
-    sudo rm -rf "${ALPAKA_CLANG_PKG_FILE_NAME}"
-fi
-"${ALPAKA_CI_CLANG_DIR}/bin/llvm-config" --version
-export LLVM_CONFIG="${ALPAKA_CI_CLANG_DIR}/bin/llvm-config"
-
-travis_retry sudo add-apt-repository -y ppa:ubuntu-toolchain-r/test
-travis_retry sudo apt-get -y --quiet update
-
-travis_retry sudo apt-get -y --quiet --allow-unauthenticated --no-install-recommends install libstdc++-"${ALPAKA_CI_CLANG_LIBSTDCPP_VERSION}"-dev
-if [ "${ALPAKA_CI_STDLIB}" == "libc++" ]
-then
-    travis_retry sudo apt-get -y --quiet --allow-unauthenticated --no-install-recommends install libc++-dev
-    travis_retry sudo apt-get -y --quiet --allow-unauthenticated --no-install-recommends install libc++abi-dev
-fi
-travis_retry sudo apt-get -y --quiet --allow-unauthenticated --no-install-recommends install libiomp-dev
-sudo update-alternatives --install /usr/bin/clang clang "${ALPAKA_CI_CLANG_DIR}"/bin/clang 50
-sudo update-alternatives --install /usr/bin/clang++ clang++ "${ALPAKA_CI_CLANG_DIR}"/bin/clang++ 50
-sudo update-alternatives --install /usr/bin/cc cc "${ALPAKA_CI_CLANG_DIR}"/bin/clang 50
-sudo update-alternatives --install /usr/bin/c++ c++ "${ALPAKA_CI_CLANG_DIR}"/bin/clang++ 50
-# We have to prepend /usr/bin to the path because else the preinstalled clang from usr/bin/local/ is used.
-export PATH=${ALPAKA_CI_CLANG_DIR}/bin:${PATH}
-if [ -z ${LD_LIBRARY_PATH+x} ]
-then
-    LD_LIBRARY_PATH=
-fi
-export LD_LIBRARY_PATH=${ALPAKA_CI_CLANG_DIR}/lib:${LD_LIBRARY_PATH}
-if [ -z ${CPPFLAGS+x} ]
-then
-    CPPFLAGS=
-fi
-export CPPFLAGS="-I ${ALPAKA_CI_CLANG_DIR}/include/c++/v1 ${CPPFLAGS}"
-
-which "${CXX}"
-${CXX} -v
diff --git a/thirdParty/alpaka/script/travis/install_cmake.sh b/thirdParty/alpaka/script/travis/install_cmake.sh
deleted file mode 100755
index db218ee6ba..0000000000
--- a/thirdParty/alpaka/script/travis/install_cmake.sh
+++ /dev/null
@@ -1,41 +0,0 @@
-#!/bin/bash
-
-#
-# Copyright 2017-2019 Benjamin Worpitz
-#
-# This file is part of Alpaka.
-#
-# This Source Code Form is subject to the terms of the Mozilla Public
-# License, v. 2.0. If a copy of the MPL was not distributed with this
-# file, You can obtain one at http://mozilla.org/MPL/2.0/.
-#
-
-source ./script/travis/travis_retry.sh
-
-source ./script/travis/set.sh
-
-: "${ALPAKA_CI_CMAKE_DIR?'ALPAKA_CI_CMAKE_DIR must be specified'}"
-: "${ALPAKA_CI_CMAKE_VER?'ALPAKA_CI_CMAKE_VER must be specified'}"
-
-if [ "$TRAVIS_OS_NAME" = "linux" ]
-then
-    # Download the selected version.
-    if [ -z "$(ls -A ${ALPAKA_CI_CMAKE_DIR})" ]
-    then
-        ALPAKA_CI_CMAKE_VER_SEMANTIC=( ${ALPAKA_CI_CMAKE_VER//./ } )
-        ALPAKA_CI_CMAKE_VER_MAJOR="${ALPAKA_CI_CMAKE_VER_SEMANTIC[0]}"
-        ALPAKA_CI_CMAKE_VER_MINOR="${ALPAKA_CI_CMAKE_VER_SEMANTIC[1]}"
-
-        ALPAKA_CMAKE_PKG_FILE_NAME_BASE=cmake-${ALPAKA_CI_CMAKE_VER}-Linux-x86_64
-        ALPAKA_CMAKE_PKG_FILE_NAME=${ALPAKA_CMAKE_PKG_FILE_NAME_BASE}.tar.gz
-        travis_retry wget --no-verbose https://cmake.org/files/v"${ALPAKA_CI_CMAKE_VER_MAJOR}"."${ALPAKA_CI_CMAKE_VER_MINOR}"/"${ALPAKA_CMAKE_PKG_FILE_NAME}"
-        mkdir -p "${ALPAKA_CI_CMAKE_DIR}"
-        tar -xzf "${ALPAKA_CMAKE_PKG_FILE_NAME}" -C "${ALPAKA_CI_CMAKE_DIR}"
-        sudo cp -fR "${ALPAKA_CI_CMAKE_DIR}"/"${ALPAKA_CMAKE_PKG_FILE_NAME_BASE}"/* "${ALPAKA_CI_CMAKE_DIR}"
-        sudo rm -rf "${ALPAKA_CMAKE_PKG_FILE_NAME}" "${ALPAKA_CI_CMAKE_DIR}"/"${ALPAKA_CMAKE_PKG_FILE_NAME_BASE}"
-    fi
-elif [ "$TRAVIS_OS_NAME" = "windows" ]
-then
-    choco uninstall cmake.install
-    choco install cmake.install --version ${ALPAKA_CI_CMAKE_VER}
-fi
diff --git a/thirdParty/alpaka/script/travis/install_cuda.sh b/thirdParty/alpaka/script/travis/install_cuda.sh
deleted file mode 100755
index 31e095b2b5..0000000000
--- a/thirdParty/alpaka/script/travis/install_cuda.sh
+++ /dev/null
@@ -1,116 +0,0 @@
-#!/bin/bash
-
-#
-# Copyright 2017-2019 Benjamin Worpitz
-#
-# This file is part of Alpaka.
-#
-# This Source Code Form is subject to the terms of the Mozilla Public
-# License, v. 2.0. If a copy of the MPL was not distributed with this
-# file, You can obtain one at http://mozilla.org/MPL/2.0/.
-#
-
-source ./script/travis/travis_retry.sh
-
-source ./script/travis/set.sh
-
-: "${ALPAKA_CUDA_VERSION?'ALPAKA_CUDA_VERSION must be specified'}"
-
-if [ "$TRAVIS_OS_NAME" = "linux" ]
-then
-    : "${ALPAKA_CI_DOCKER_BASE_IMAGE_NAME?'ALPAKA_CI_DOCKER_BASE_IMAGE_NAME must be specified'}"
-    : "${ALPAKA_CI_CUDA_DIR?'ALPAKA_CI_CUDA_DIR must be specified'}"
-    : "${ALPAKA_CUDA_COMPILER?'ALPAKA_CUDA_COMPILER must be specified'}"
-
-    # Ubuntu 18.04 requires some extra keys for verification
-    if [[ "${ALPAKA_CI_DOCKER_BASE_IMAGE_NAME}" == *"18.04"* ]]
-    then
-        travis_retry sudo apt-get -y --quiet --allow-unauthenticated --no-install-recommends install dirmngr gpg-agent
-        travis_retry sudo apt-key adv --keyserver keyserver.ubuntu.com --recv-keys F60F4B3D7FA2AF80
-    fi
-
-    # Set the correct CUDA downloads
-    if [ "${ALPAKA_CUDA_VERSION}" == "8.0" ]
-    then
-        ALPAKA_CUDA_PKG_DEB_NAME=cuda-repo-ubuntu1404-8-0-local
-        ALPAKA_CUDA_PKG_FILE_NAME="${ALPAKA_CUDA_PKG_DEB_NAME}"_8.0.44-1_amd64-deb
-        ALPAKA_CUDA_PKG_FILE_PATH=https://developer.nvidia.com/compute/cuda/8.0/prod/local_installers/${ALPAKA_CUDA_PKG_FILE_NAME}
-    elif [ "${ALPAKA_CUDA_VERSION}" == "9.0" ]
-    then
-        ALPAKA_CUDA_PKG_DEB_NAME=cuda-repo-ubuntu1604-9-0-local
-        ALPAKA_CUDA_PKG_FILE_NAME="${ALPAKA_CUDA_PKG_DEB_NAME}"_9.0.176-1_amd64-deb
-        ALPAKA_CUDA_PKG_FILE_PATH=https://developer.nvidia.com/compute/cuda/9.0/Prod/local_installers/${ALPAKA_CUDA_PKG_FILE_NAME}
-    elif [ "${ALPAKA_CUDA_VERSION}" == "9.1" ]
-    then
-        ALPAKA_CUDA_PKG_DEB_NAME=cuda-repo-ubuntu1604-9-1-local
-        ALPAKA_CUDA_PKG_FILE_NAME="${ALPAKA_CUDA_PKG_DEB_NAME}"_9.1.85-1_amd64
-        ALPAKA_CUDA_PKG_FILE_PATH=https://developer.nvidia.com/compute/cuda/9.1/Prod/local_installers/${ALPAKA_CUDA_PKG_FILE_NAME}
-    elif [ "${ALPAKA_CUDA_VERSION}" == "9.2" ]
-    then
-        ALPAKA_CUDA_PKG_DEB_NAME=cuda-repo-ubuntu1604-9-2-local
-        ALPAKA_CUDA_PKG_FILE_NAME="${ALPAKA_CUDA_PKG_DEB_NAME}"_9.2.88-1_amd64
-        ALPAKA_CUDA_PKG_FILE_PATH=https://developer.nvidia.com/compute/cuda/9.2/Prod/local_installers/${ALPAKA_CUDA_PKG_FILE_NAME}
-    elif [ "${ALPAKA_CUDA_VERSION}" == "10.0" ]
-    then
-        ALPAKA_CUDA_PKG_DEB_NAME=cuda-repo-ubuntu1804-10-0-local
-        ALPAKA_CUDA_PKG_FILE_NAME="${ALPAKA_CUDA_PKG_DEB_NAME}"-10.0.130-410.48_1.0-1_amd64
-        ALPAKA_CUDA_PKG_FILE_PATH=https://developer.nvidia.com/compute/cuda/10.0/Prod/local_installers/${ALPAKA_CUDA_PKG_FILE_NAME}
-    elif [ "${ALPAKA_CUDA_VERSION}" == "10.1" ]
-    then
-        ALPAKA_CUDA_PKG_DEB_NAME=cuda-repo-ubuntu1804-10-1-local
-        ALPAKA_CUDA_PKG_FILE_NAME="${ALPAKA_CUDA_PKG_DEB_NAME}"-10.1.168-418.67_1.0-1_amd64.deb
-        ALPAKA_CUDA_PKG_FILE_PATH=https://developer.nvidia.com/compute/cuda/10.1/Prod/local_installers/${ALPAKA_CUDA_PKG_FILE_NAME}
-    elif [ "${ALPAKA_CUDA_VERSION}" == "10.2" ]
-    then
-        ALPAKA_CUDA_PKG_DEB_NAME=cuda-repo-ubuntu1804-10-2-local
-        ALPAKA_CUDA_PKG_FILE_NAME="${ALPAKA_CUDA_PKG_DEB_NAME}"-10.2.89-440.33.01_1.0-1_amd64.deb
-        ALPAKA_CUDA_PKG_FILE_PATH=http://developer.download.nvidia.com/compute/cuda/10.2/Prod/local_installers/${ALPAKA_CUDA_PKG_FILE_NAME}
-    else
-        echo CUDA versions other than 8.0, 9.0, 9.1, 9.2, 10.0, 10.1 and 10.2 are not currently supported on linux!
-    fi
-    if [ -z "$(ls -A ${ALPAKA_CI_CUDA_DIR})" ]
-    then
-        mkdir -p "${ALPAKA_CI_CUDA_DIR}"
-        travis_retry wget --no-verbose -O "${ALPAKA_CI_CUDA_DIR}"/"${ALPAKA_CUDA_PKG_FILE_NAME}" "${ALPAKA_CUDA_PKG_FILE_PATH}"
-    fi
-    sudo dpkg --install "${ALPAKA_CI_CUDA_DIR}"/"${ALPAKA_CUDA_PKG_FILE_NAME}"
-
-    travis_retry sudo apt-get -y --quiet update
-
-    # Install CUDA
-    # Currently we do not install CUDA fully: sudo apt-get --quiet -y install cuda
-    # We only install the minimal packages. Because of our manual partial installation we have to create a symlink at /usr/local/cuda
-    sudo apt-get -y --quiet --allow-unauthenticated --no-install-recommends install cuda-core-"${ALPAKA_CUDA_VERSION}" cuda-cudart-"${ALPAKA_CUDA_VERSION}" cuda-cudart-dev-"${ALPAKA_CUDA_VERSION}" cuda-curand-"${ALPAKA_CUDA_VERSION}" cuda-curand-dev-"${ALPAKA_CUDA_VERSION}"
-    sudo ln -s /usr/local/cuda-"${ALPAKA_CUDA_VERSION}" /usr/local/cuda
-
-    if [ "${ALPAKA_CUDA_COMPILER}" == "clang" ]
-    then
-        travis_retry sudo apt-get -y --quiet --allow-unauthenticated --no-install-recommends install g++-multilib
-    fi
-
-    # clean up
-    sudo rm -rf "${ALPAKA_CI_CUDA_DIR}"/"${ALPAKA_CUDA_PKG_FILE_NAME}"
-    sudo dpkg --purge "${ALPAKA_CUDA_PKG_DEB_NAME}"
-elif [ "$TRAVIS_OS_NAME" = "windows" ]
-then
-    if [ "${ALPAKA_CUDA_VERSION}" == "10.0" ]
-    then
-        ALPAKA_CUDA_PKG_FILE_NAME=cuda_10.0.130_411.31_win10
-        ALPAKA_CUDA_PKG_FILE_PATH=https://developer.nvidia.com/compute/cuda/10.0/Prod/local_installers/${ALPAKA_CUDA_PKG_FILE_NAME}
-    elif [ "${ALPAKA_CUDA_VERSION}" == "10.1" ]
-    then
-        ALPAKA_CUDA_PKG_FILE_NAME=cuda_10.1.168_425.25_win10.exe
-        ALPAKA_CUDA_PKG_FILE_PATH=https://developer.nvidia.com/compute/cuda/10.1/Prod/local_installers/${ALPAKA_CUDA_PKG_FILE_NAME}
-    elif [ "${ALPAKA_CUDA_VERSION}" == "10.2" ]
-    then
-        ALPAKA_CUDA_PKG_FILE_NAME=cuda_10.2.89_441.22_win10.exe
-        ALPAKA_CUDA_PKG_FILE_PATH=http://developer.download.nvidia.com/compute/cuda/10.2/Prod/local_installers/${ALPAKA_CUDA_PKG_FILE_NAME}
-    else
-        echo CUDA versions other than 10.0, 10.1 and 10.2 are not currently supported on Windows!
-    fi
-
-    curl -L -o cuda_installer.exe ${ALPAKA_CUDA_PKG_FILE_PATH}
-    ./cuda_installer.exe -s "nvcc_${ALPAKA_CUDA_VERSION}" "curand_dev_${ALPAKA_CUDA_VERSION}"
-    # Deleting the installer worked until 08/2019 but something changed and this line now takes up to 25 minutes.
-    #rm -f cuda_installer.exe
-fi
diff --git a/thirdParty/alpaka/script/travis/install_gcc.sh b/thirdParty/alpaka/script/travis/install_gcc.sh
deleted file mode 100755
index 74e7c28262..0000000000
--- a/thirdParty/alpaka/script/travis/install_gcc.sh
+++ /dev/null
@@ -1,33 +0,0 @@
-#!/bin/bash
-
-#
-# Copyright 2017-2019 Benjamin Worpitz
-#
-# This file is part of Alpaka.
-#
-# This Source Code Form is subject to the terms of the Mozilla Public
-# License, v. 2.0. If a copy of the MPL was not distributed with this
-# file, You can obtain one at http://mozilla.org/MPL/2.0/.
-#
-
-source ./script/travis/travis_retry.sh
-
-source ./script/travis/set.sh
-
-: "${ALPAKA_CI_GCC_VER?'ALPAKA_CI_GCC_VER must be specified'}"
-: "${ALPAKA_CI_SANITIZERS?'ALPAKA_CI_SANITIZERS must be specified'}"
-: "${CXX?'CXX must be specified'}"
-
-travis_retry sudo add-apt-repository -y ppa:ubuntu-toolchain-r/test
-travis_retry sudo apt-get -y --quiet update
-
-travis_retry sudo apt-get -y --quiet --allow-unauthenticated --no-install-recommends install g++-"${ALPAKA_CI_GCC_VER}"
-sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-"${ALPAKA_CI_GCC_VER}" 50
-sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-"${ALPAKA_CI_GCC_VER}" 50
-if [[ "${ALPAKA_CI_SANITIZERS}" == *"TSan"* ]]
-then
-    travis_retry sudo apt-get -y --quiet --allow-unauthenticated --no-install-recommends install libtsan0
-fi
-
-which "${CXX}"
-${CXX} -v
diff --git a/thirdParty/alpaka/script/travis/install_hip.sh b/thirdParty/alpaka/script/travis/install_hip.sh
deleted file mode 100755
index 2eb91e6b21..0000000000
--- a/thirdParty/alpaka/script/travis/install_hip.sh
+++ /dev/null
@@ -1,41 +0,0 @@
-#!/bin/bash
-
-#
-# Copyright 2018-2019 Benjamin Worpitz
-#
-# This file is part of Alpaka.
-#
-# This Source Code Form is subject to the terms of the Mozilla Public
-# License, v. 2.0. If a copy of the MPL was not distributed with this
-# file, You can obtain one at http://mozilla.org/MPL/2.0/.
-#
-
-source ./script/travis/set.sh
-
-: "${ALPAKA_CI_HIP_ROOT_DIR?'ALPAKA_CI_HIP_ROOT_DIR must be specified'}"
-: "${ALPAKA_CI_HIP_BRANCH?'ALPAKA_CI_HIP_BRANCH must be specified'}"
-: "${CMAKE_BUILD_TYPE?'CMAKE_BUILD_TYPE must be specified'}"
-: "${CXX?'CXX must be specified'}"
-: "${CC?'CC must be specified'}"
-: "${ALPAKA_CI_CMAKE_DIR?'ALPAKA_CI_CMAKE_DIR must be specified'}"
-
-# CMake
-export PATH=${ALPAKA_CI_CMAKE_DIR}/bin:${PATH}
-cmake --version
-
-HIP_SOURCE_DIR=${ALPAKA_CI_HIP_ROOT_DIR}/source-hip/
-
-git clone -b "${ALPAKA_CI_HIP_BRANCH}" --quiet --recursive --single-branch https://github.com/ROCm-Developer-Tools/HIP.git "${HIP_SOURCE_DIR}"
-(cd "${HIP_SOURCE_DIR}"; mkdir -p build; cd build; cmake -DCMAKE_BUILD_TYPE="${CMAKE_BUILD_TYPE}" -DCMAKE_INSTALL_PREFIX="${ALPAKA_CI_HIP_ROOT_DIR}" -DBUILD_TESTING=OFF .. && make && make install)
-
-
-## rocRAND
-export HIP_PLATFORM=nvcc
-export HIP_RUNTIME=nvcc
-export ROCRAND_SOURCE_DIR=${ALPAKA_CI_HIP_ROOT_DIR}/source-rocrand/
-if [ ! -d "${ROCRAND_SOURCE_DIR}" ]
-then
-    # install it into the HIP install dir
-    git clone --quiet --recursive https://github.com/ROCmSoftwarePlatform/rocRAND "${ROCRAND_SOURCE_DIR}"
-    (cd "${ROCRAND_SOURCE_DIR}"; mkdir -p build; cd build; cmake -DCMAKE_BUILD_TYPE="${CMAKE_BUILD_TYPE}" -DCMAKE_INSTALL_PREFIX="${ALPAKA_CI_HIP_ROOT_DIR}" -DBUILD_BENCHMARK=OFF -DBUILD_TEST=OFF -DNVGPU_TARGETS="30" -DCMAKE_MODULE_PATH="${ALPAKA_CI_HIP_ROOT_DIR}/cmake" -DHIP_PLATFORM="${HIP_PLATFORM}" .. && make && make install)
-fi
diff --git a/thirdParty/alpaka/script/travis/install_tbb.sh b/thirdParty/alpaka/script/travis/install_tbb.sh
deleted file mode 100755
index 420cb08c33..0000000000
--- a/thirdParty/alpaka/script/travis/install_tbb.sh
+++ /dev/null
@@ -1,37 +0,0 @@
-#!/bin/bash
-
-#
-# Copyright 2017-2019 Benjamin Worpitz
-#
-# This file is part of Alpaka.
-#
-# This Source Code Form is subject to the terms of the Mozilla Public
-# License, v. 2.0. If a copy of the MPL was not distributed with this
-# file, You can obtain one at http://mozilla.org/MPL/2.0/.
-#
-
-source ./script/travis/travis_retry.sh
-
-source ./script/travis/set.sh
-
-# Install TBB
-if [ "$TRAVIS_OS_NAME" = "linux" ]
-then
-    travis_retry sudo apt-get -y --quiet --allow-unauthenticated --no-install-recommends install libtbb-dev
-elif [ "$TRAVIS_OS_NAME" = "osx" ]
-then
-    brew unlink python@2
-    brew install tbb
-elif [ "$TRAVIS_OS_NAME" = "windows" ]
-then
-    TBB_ARCHIVE_VER="tbb44_20160526oss"
-    TBB_DOWNLOAD_URL="https://github.com/intel/tbb/releases/download/4.4.5/${TBB_ARCHIVE_VER}_win.zip"
-    TBB_DST_PATH="tbb.zip"
-    powershell.exe -Command '[Net.ServicePointManager]::SecurityProtocol = [Net.SecurityProtocolType]::Tls12 ; Invoke-WebRequest "'${TBB_DOWNLOAD_URL}'" -OutFile "'${TBB_DST_PATH}'"'
-    mkdir "${TBB_ROOT_DIR}"
-    unzip -q "${TBB_DST_PATH}" -d "${TBB_ROOT_DIR}"
-    rm "${TBB_DST_PATH}"
-    TBB_UNZIP_DIR="${TBB_ROOT_DIR}/${TBB_ARCHIVE_VER}"
-    mv ${TBB_UNZIP_DIR}/* "${TBB_ROOT_DIR}/"
-    rmdir "${TBB_UNZIP_DIR}"
-fi
diff --git a/thirdParty/alpaka/script/travis/prepare_sanitizers.sh b/thirdParty/alpaka/script/travis/prepare_sanitizers.sh
deleted file mode 100755
index 69832e7dd9..0000000000
--- a/thirdParty/alpaka/script/travis/prepare_sanitizers.sh
+++ /dev/null
@@ -1,129 +0,0 @@
-#!/bin/bash
-
-#
-# Copyright 2017-2019 Benjamin Worpitz
-#
-# This file is part of Alpaka.
-#
-# This Source Code Form is subject to the terms of the Mozilla Public
-# License, v. 2.0. If a copy of the MPL was not distributed with this
-# file, You can obtain one at http://mozilla.org/MPL/2.0/.
-#
-
-source ./script/travis/set.sh
-
-#-------------------------------------------------------------------------------
-# Exports the CMAKE_CXX_FLAGS and CMAKE_EXE_LINKER_FLAGS to enable the sanitizers listed in ALPAKA_CI_SANITIZERS.
-if [ -z "${CMAKE_CXX_FLAGS+x}" ]
-then
-    export CMAKE_CXX_FLAGS=
-fi
-if [ -z "${CMAKE_EXE_LINKER_FLAGS+x}" ]
-then
-    export CMAKE_EXE_LINKER_FLAGS=
-fi
-if [ -z "${ASAN_OPTIONS+x}" ]
-then
-    export ASAN_OPTIONS=
-fi
-if [ -z "${LSAN_OPTIONS+x}" ]
-then
-    export LSAN_OPTIONS=
-fi
-
-#-------------------------------------------------------------------------------
-# sanitizers
-# General sanitizer settings
-if [[ "${ALPAKA_CI_SANITIZERS}" != "" ]]
-then
-    # - to get nicer stack-traces:
-    CMAKE_CXX_FLAGS="${CMAKE_CXX_FLAGS} -fno-omit-frame-pointer"
-    # - to get perfect stack-traces:
-    CMAKE_CXX_FLAGS="${CMAKE_CXX_FLAGS} -fno-optimize-sibling-calls"
-
-    # g++ needs to use a different linker
-    if [[ "${CXX}" == "g++" ]]
-    then
-        CMAKE_EXE_LINKER_FLAGS="${CMAKE_EXE_LINKER_FLAGS} -fuse-ld=gold"
-    fi
-
-    # UBSan - http://clang.llvm.org/docs/UndefinedBehaviorSanitizer.html
-    if [[ "${ALPAKA_CI_SANITIZERS}" == *"UBSan"* ]]
-    then
-        CMAKE_CXX_FLAGS="${CMAKE_CXX_FLAGS} -fsanitize=undefined"
-
-        if [[ "${CXX}" == "clang++" ]]
-        then
-            CMAKE_CXX_FLAGS="${CMAKE_CXX_FLAGS} -fsanitize-blacklist=$(pwd)/test/sanitizer_ubsan_blacklist.txt"
-
-            # Previously 'local-bounds' was part of UBsan but has been removed because it is not a pure front-end check
-            CMAKE_CXX_FLAGS="${CMAKE_CXX_FLAGS} -fsanitize=local-bounds"
-            # 'unsigned-integer-overflow' is not really undefined behaviour but we want to handle it as such for our tests.
-            CMAKE_CXX_FLAGS="${CMAKE_CXX_FLAGS} -fsanitize=unsigned-integer-overflow"
-        fi
-    fi
-
-    # ASan - http://clang.llvm.org/docs/AddressSanitizer.html
-    if [[ "${ALPAKA_CI_SANITIZERS}" == *"ASan"* ]]
-    then
-        if ( [[ "${ALPAKA_CI_SANITIZERS}" == *"TSan"* ]] || [[ "${ALPAKA_CI_SANITIZERS}" == *"MSan"* ]] )
-        then
-            echo ASan is not supported in combination with TSan or MSan
-            exit 1
-        fi
-
-        if ( [ "${ALPAKA_ACC_GPU_CUDA_ENABLE}" == "ON" ] && [ "${ALPAKA_CUDA_COMPILER}" == "clang" ] )
-        then
-            # fatal error: error in backend: Module has a nontrivial global ctor, which NVPTX does not support.
-            # clang-3.9: error: clang frontend command failed with exit code 70 (use -v to see invocation)
-            echo ASan is not supported in combination with clang used as CUDA compiler
-            exit 1
-        fi
-
-        CMAKE_CXX_FLAGS="${CMAKE_CXX_FLAGS} -fsanitize=address"
-
-        if [[ "${CXX}" != "clang++" ]]
-        then
-            CMAKE_CXX_FLAGS="${CMAKE_CXX_FLAGS} -fsanitize-address-use-after-scope"
-        fi
-
-        ASAN_OPTIONS="strict_string_checks=1:detect_stack_use_after_return=1:check_initialization_order=1:strict_init_order=1"
-        LSAN_OPTIONS="print_suppressions=1:suppressions=$(pwd)/test/sanitizer_lsan_blacklist.txt"
-    fi
-
-    # TSan - http://clang.llvm.org/docs/ThreadSanitizer.html
-    # TSan requires PositionIndependentCode -pie;-fPIE;-fPIC. clang sets this automatically, gcc not.
-    # All base libraries (e.g. boost) have to be build with this flag.
-    # Furthermore, by installing gcc, libtsan0 is not automatically installed.
-    if [[ "${ALPAKA_CI_SANITIZERS}" == *"TSan"* ]]
-    then
-        if ( [[ "${ALPAKA_CI_SANITIZERS}" == *"ASan"* ]] || [[ "${ALPAKA_CI_SANITIZERS}" == *"MSan"* ]] )
-        then
-            echo TSan is not supported in combination with ASan or MSan
-            exit 1
-        fi
-
-        CMAKE_CXX_FLAGS="${CMAKE_CXX_FLAGS} -fsanitize=thread"
-        if [ "${CXX}" == "g++" ]
-        then
-            CMAKE_CXX_FLAGS="${CMAKE_CXX_FLAGS} -pie -fPIE"
-            CMAKE_EXE_LINKER_FLAGS="${CMAKE_EXE_LINKER_FLAGS} -ltsan"
-        fi
-    fi
-
-    # MSan - http://clang.llvm.org/docs/MemorySanitizer.html
-    # NOTE: Currently we can not enable this for CI as this finds some 'use-of-uninitialized-value' inside:
-    #   - boost`s smart pointers used by the unit test framework
-    #   - alpaka/test/integ/mandelbrot/src/main.cpp:450:9 std::replace
-    #   - alpaka/include/alpaka/kernel/TaskKernelCpuThreads.hpp:307:21 used alpaka/include/alpaka/idx/bt/IdxBtRefThreadIdMap.hpp:130:44
-    if [[ "${ALPAKA_CI_SANITIZERS}" == *"MSan"* ]]
-    then
-        if ( [[ "${ALPAKA_CI_SANITIZERS}" == *"ASan"* ]] || [[ "${ALPAKA_CI_SANITIZERS}" == *"TSan"* ]] )
-        then
-            echo MSan is not supported in combination with ASan or TSan
-            exit 1
-        fi
-
-        CMAKE_CXX_FLAGS="${CMAKE_CXX_FLAGS} -fsanitize=memory -fsanitize-memory-track-origins"
-    fi
-fi
diff --git a/thirdParty/alpaka/script/travis/print_travisEnv.sh b/thirdParty/alpaka/script/travis/print_travisEnv.sh
deleted file mode 100755
index 47fe9ea9dd..0000000000
--- a/thirdParty/alpaka/script/travis/print_travisEnv.sh
+++ /dev/null
@@ -1,44 +0,0 @@
-#!/bin/bash
-
-#
-# Copyright 2017-2019 Benjamin Worpitz
-#
-# This file is part of Alpaka.
-#
-# This Source Code Form is subject to the terms of the Mozilla Public
-# License, v. 2.0. If a copy of the MPL was not distributed with this
-# file, You can obtain one at http://mozilla.org/MPL/2.0/.
-#
-
-source ./script/travis/set.sh
-
-#-------------------------------------------------------------------------------
-# Print the travis environment variables: http://docs.travis-ci.com/user/ci-environment/
-echo TRAVIS_BRANCH: "${TRAVIS_BRANCH}"
-echo TRAVIS_BUILD_DIR: "${TRAVIS_BUILD_DIR}"
-echo TRAVIS_BUILD_ID: "${TRAVIS_BUILD_ID}"
-echo TRAVIS_BUILD_NUMBER: "${TRAVIS_BUILD_NUMBER}"
-echo TRAVIS_COMMIT: "${TRAVIS_COMMIT}"
-echo TRAVIS_COMMIT_RANGE: "${TRAVIS_COMMIT_RANGE}"
-echo TRAVIS_JOB_ID: "${TRAVIS_JOB_ID}"
-echo TRAVIS_JOB_NUMBER: "${TRAVIS_JOB_NUMBER}"
-echo TRAVIS_PULL_REQUEST: "${TRAVIS_PULL_REQUEST}"
-echo TRAVIS_SECURE_ENV_VARS: "${TRAVIS_SECURE_ENV_VARS}"
-echo TRAVIS_REPO_SLUG: "${TRAVIS_REPO_SLUG}"
-echo TRAVIS_OS_NAME: "${TRAVIS_OS_NAME}"
-echo TRAVIS_TAG: "${TRAVIS_TAG}"
-
-if [ "$TRAVIS_OS_NAME" = "linux" ]
-then
-    # Show all running services
-    sudo service --status-all
-
-    # Stop some unnecessary services to save memory
-    sudo /etc/init.d/mysql stop
-    sudo /etc/init.d/postgresql stop
-    sudo /etc/init.d/redis-server stop
-
-    # Show memory stats
-    sudo smem
-    sudo free -m -t
-fi
diff --git a/thirdParty/alpaka/script/travis/run.sh b/thirdParty/alpaka/script/travis/run.sh
deleted file mode 100755
index 58a0159486..0000000000
--- a/thirdParty/alpaka/script/travis/run.sh
+++ /dev/null
@@ -1,152 +0,0 @@
-#!/bin/bash
-
-#
-# Copyright 2017-2019 Benjamin Worpitz
-#
-# This file is part of Alpaka.
-#
-# This Source Code Form is subject to the terms of the Mozilla Public
-# License, v. 2.0. If a copy of the MPL was not distributed with this
-# file, You can obtain one at http://mozilla.org/MPL/2.0/.
-#
-
-source ./script/travis/set.sh
-
-: "${ALPAKA_CI_CMAKE_DIR?'ALPAKA_CI_CMAKE_DIR must be specified'}"
-echo "ALPAKA_CI_CMAKE_DIR: ${ALPAKA_CI_CMAKE_DIR}"
-: "${ALPAKA_CI_ANALYSIS?'ALPAKA_CI_ANALYSIS must be specified'}"
-echo "ALPAKA_CI_ANALYSIS: ${ALPAKA_CI_ANALYSIS}"
-: "${ALPAKA_CI_INSTALL_CUDA?'ALPAKA_CI_INSTALL_CUDA must be specified'}"
-: "${ALPAKA_CI_INSTALL_HIP?'ALPAKA_CI_INSTALL_HIP must be specified'}"
-if [ "$TRAVIS_OS_NAME" = "linux" ]
-then
-    : "${ALPAKA_CI_STDLIB?'ALPAKA_CI_STDLIB must be specified'}"
-    echo "ALPAKA_CI_STDLIB: ${ALPAKA_CI_STDLIB}"
-fi
-: "${CXX?'CXX must be specified'}"
-echo "CXX: ${CXX}"
-
-
-if [ "$TRAVIS_OS_NAME" = "linux" ]
-then
-    if [ -z "${LD_LIBRARY_PATH+x}" ]
-    then
-        LD_LIBRARY_PATH=
-    fi
-fi
-
-# CMake
-if [ "$TRAVIS_OS_NAME" = "linux" ]
-then
-    export PATH=${ALPAKA_CI_CMAKE_DIR}/bin:${PATH}
-fi
-cmake --version
-
-#TBB
-if [ "$TRAVIS_OS_NAME" = "windows" ]
-then
-    #ALPAKA_TBB_BIN_DIR="${TBB_ROOT_DIR}/bin/ia32/vc14"
-    ALPAKA_TBB_BIN_DIR="${TBB_ROOT_DIR}/bin/intel64/vc14"
-    export PATH=${PATH}:"${ALPAKA_TBB_BIN_DIR}"
-fi
-
-# CUDA
-if [ "${ALPAKA_CI_INSTALL_CUDA}" == "ON" ]
-then
-    : "${ALPAKA_CUDA_VERSION?'ALPAKA_CUDA_VERSION must be specified'}"
-
-    if [ "$TRAVIS_OS_NAME" = "linux" ]
-    then
-        # CUDA
-        export PATH=/usr/local/cuda-${ALPAKA_CUDA_VERSION}/bin:$PATH
-        export LD_LIBRARY_PATH=/usr/local/cuda-${ALPAKA_CUDA_VERSION}/lib64:${LD_LIBRARY_PATH}
-        # We have to explicitly add the stub libcuda.so to CUDA_LIB_PATH because the real one would be installed by the driver (which we can not install).
-        export CUDA_LIB_PATH=/usr/local/cuda/lib64/stubs/
-
-        if [ "${ALPAKA_CUDA_COMPILER}" == "nvcc" ]
-        then
-            which nvcc
-            nvcc -V
-        fi
-    elif [ "$TRAVIS_OS_NAME" = "windows" ]
-    then
-        export PATH="C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v${ALPAKA_CUDA_VERSION}\bin":$PATH
-        export CUDA_PATH="C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v${ALPAKA_CUDA_VERSION}"
-    fi
-fi
-
-# HIP
-if [ "${ALPAKA_CI_INSTALL_HIP}" == "ON" ]
-then
-: "${ALPAKA_CI_HIP_ROOT_DIR?'ALPAKA_CI_HIP_ROOT_DIR must be specified'}"
-
-    # HIP
-    # HIP_PATH required by HIP tools
-    export HIP_PATH=${ALPAKA_CI_HIP_ROOT_DIR}
-    # CUDA_PATH required by HIP tools
-    if [ -n "$(command -v nvcc)" ]
-    then
-        export CUDA_PATH=$(dirname $(which nvcc))/../
-    else
-        export CUDA_PATH=/usr/local/cuda-${ALPAKA_CUDA_VERSION}
-    fi
-
-    export PATH=${HIP_PATH}/bin:$PATH
-    export LD_LIBRARY_PATH=${HIP_PATH}/lib64:${HIP_PATH}/hiprand/lib:${LD_LIBRARY_PATH}
-    export CMAKE_PREFIX_PATH=${HIP_PATH}:${HIP_PATH}/hiprand:${CMAKE_PREFIX_PATH:-}
-    # to avoid "use of uninitialized value .." warnings in perl script hipcc
-    # TODO: rely on CI vars for platform and architecture
-    export HIP_PLATFORM=nvcc
-    export HIP_RUNTIME=nvcc
-    # calls nvcc or hcc
-    which hipcc
-    hipcc -V
-    which hipconfig
-    hipconfig --platform
-    hipconfig -v
-    # print newline as previous command does not do this
-    echo
-
-fi
-
-# clang
-if [ "${CXX}" == "clang++" ]
-then
-    # We have to prepend /usr/bin to the path because else the preinstalled clang from usr/bin/local/ is used.
-    export PATH=${ALPAKA_CI_CLANG_DIR}/bin:${PATH}
-    export LD_LIBRARY_PATH=${ALPAKA_CI_CLANG_DIR}/lib:${LD_LIBRARY_PATH}
-    if [ -z "${CPPFLAGS+x}" ]
-    then
-        CPPFLAGS=
-    fi
-    export CPPFLAGS="-I ${ALPAKA_CI_CLANG_DIR}/include/c++/v1 ${CPPFLAGS}"
-fi
-
-# stdlib
-if [ "$TRAVIS_OS_NAME" = "linux" ]
-then
-    if [ "${ALPAKA_CI_STDLIB}" == "libc++" ]
-    then
-        if [ -z "${CMAKE_CXX_FLAGS+x}" ]
-        then
-            export CMAKE_CXX_FLAGS=
-        fi
-        CMAKE_CXX_FLAGS="${CMAKE_CXX_FLAGS} -stdlib=libc++"
-
-        if [ -z "${CMAKE_EXE_LINKER_FLAGS+x}" ]
-        then
-            export CMAKE_EXE_LINKER_FLAGS=
-        fi
-        CMAKE_EXE_LINKER_FLAGS="${CMAKE_EXE_LINKER_FLAGS} -lc++ -lc++abi"
-    fi
-
-    which "${CXX}"
-    ${CXX} -v
-
-    source ./script/travis/prepare_sanitizers.sh
-    if [ "${ALPAKA_CI_ANALYSIS}" == "ON" ] ;then ./script/travis/run_analysis.sh ;fi
-fi
-
-./script/travis/run_build.sh
-
-if [ "${ALPAKA_CI_ANALYSIS}" == "OFF" ] ;then ./script/travis/run_tests.sh ;fi
diff --git a/thirdParty/alpaka/script/travis/run_analysis.sh b/thirdParty/alpaka/script/travis/run_analysis.sh
deleted file mode 100755
index de94432400..0000000000
--- a/thirdParty/alpaka/script/travis/run_analysis.sh
+++ /dev/null
@@ -1,27 +0,0 @@
-#!/bin/bash
-
-#
-# Copyright 2017-2019 Benjamin Worpitz
-#
-# This file is part of Alpaka.
-#
-# This Source Code Form is subject to the terms of the Mozilla Public
-# License, v. 2.0. If a copy of the MPL was not distributed with this
-# file, You can obtain one at http://mozilla.org/MPL/2.0/.
-#
-
-source ./script/travis/set.sh
-
-#-------------------------------------------------------------------------------
-# sloc
-sloccount .
-
-#-------------------------------------------------------------------------------
-# TODO/FIXME/HACK
-grep -r HACK ./* || true
-grep -r FIXME ./* || true
-grep -r TODO ./* || true
-
-#-------------------------------------------------------------------------------
-# check shell script with shellcheck
-find . -type f -name "*.sh" -exec shellcheck {} \;
diff --git a/thirdParty/alpaka/script/travis/run_build.sh b/thirdParty/alpaka/script/travis/run_build.sh
deleted file mode 100755
index ac60242a15..0000000000
--- a/thirdParty/alpaka/script/travis/run_build.sh
+++ /dev/null
@@ -1,109 +0,0 @@
-#!/bin/bash
-
-#
-# Copyright 2014-2019 Benjamin Worpitz
-#
-# This file is part of Alpaka.
-#
-# This Source Code Form is subject to the terms of the Mozilla Public
-# License, v. 2.0. If a copy of the MPL was not distributed with this
-# file, You can obtain one at http://mozilla.org/MPL/2.0/.
-#
-
-source ./script/travis/set.sh
-
-#-------------------------------------------------------------------------------
-
-# create a cmake variable definition if an environment variable exists
-#
-# This function can not handle environment variables with spaces in its content.
-#
-# @param $1 cmake/environment variable name
-#
-# @result if $1 exists cmake variable definition else nothing is returned
-#
-# @code{.bash}
-# FOO=ON
-# echo "$(env2cmake FOO)" # returns "-DFOO=ON"
-# echo "$(env2cmake BAR)" # returns nothing
-# @endcode
-function env2cmake()
-{
-    if [ ! -z "${1+x}" ] ; then
-        echo -n "-D$1=${!1}"
-    fi
-}
-
-#-------------------------------------------------------------------------------
-# Build and execute all tests.
-if [ ! -z "${CMAKE_CXX_FLAGS+x}" ]
-then
-    echo "CMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}"
-fi
-if [ ! -z "${CMAKE_EXE_LINKER_FLAGS+x}" ]
-then
-    echo "CMAKE_EXE_LINKER_FLAGS=${CMAKE_EXE_LINKER_FLAGS}"
-fi
-if [ ! -z "${KMP_DEVICE_THREAD_LIMIT+x}" ]
-then
-    echo "KMP_DEVICE_THREAD_LIMIT=${KMP_DEVICE_THREAD_LIMIT}"
-fi
-if [ ! -z "${KMP_ALL_THREADS+x}" ]
-then
-    echo "KMP_ALL_THREADS=${KMP_ALL_THREADS}"
-fi
-if [ ! -z "${KMP_TEAMS_THREAD_LIMIT+x}" ]
-then
-    echo "KMP_TEAMS_THREAD_LIMIT=${KMP_TEAMS_THREAD_LIMIT}"
-fi
-if [ ! -z "${OMP_THREAD_LIMIT+x}" ]
-then
-    echo "OMP_THREAD_LIMIT=${OMP_THREAD_LIMIT}"
-fi
-if [ ! -z "${OMP_NUM_THREADS+x}" ]
-then
-    echo "OMP_NUM_THREADS=${OMP_NUM_THREADS}"
-fi
-
-mkdir -p build/
-cd build/
-
-if [ "$TRAVIS_OS_NAME" = "linux" ] || [ "$TRAVIS_OS_NAME" = "osx" ]
-then
-    ALPAKA_CI_CMAKE_GENERATOR="Unix Makefiles"
-elif [ "$TRAVIS_OS_NAME" = "windows" ]
-then
-    # Use the 64 bit compiler
-    # FIXME: Path not found but does not seem to be necessary anymore
-    #"./C/Program Files (x86)/Microsoft Visual Studio/2017/Community/VC/Auxiliary/Build/vcvarsall.bat" amd64
-
-    # Add msbuild to the path
-    MSBUILD_PATH="/C/Program Files (x86)/Microsoft Visual Studio/2017/BuildTools/MSBuild/15.0/Bin"
-    export PATH=$MSBUILD_PATH:$PATH
-    MSBuild.exe -version
-
-    # Select the generator
-    ALPAKA_CI_CMAKE_GENERATOR="Visual Studio 15 2017 Win64"
-fi
-
-cmake -G "${ALPAKA_CI_CMAKE_GENERATOR}" \
-    "$(env2cmake BOOST_ROOT)" -DBOOST_LIBRARYDIR="${ALPAKA_CI_BOOST_LIB_DIR}/lib" -DBoost_USE_STATIC_LIBS=ON -DBoost_USE_MULTITHREADED=ON -DBoost_USE_STATIC_RUNTIME=OFF \
-    "$(env2cmake CMAKE_BUILD_TYPE)" "$(env2cmake CMAKE_CXX_FLAGS)" "$(env2cmake CMAKE_EXE_LINKER_FLAGS)" \
-    "$(env2cmake ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLE)" "$(env2cmake ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLE)" "$(env2cmake ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE)" \
-    "$(env2cmake ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLE)" \
-    "$(env2cmake ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE)" "$(env2cmake ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE)" "$(env2cmake ALPAKA_ACC_CPU_BT_OMP4_ENABLE)" \
-    "$(env2cmake TBB_ROOT_DIR)" \
-    "$(env2cmake ALPAKA_ACC_GPU_CUDA_ENABLE)" "$(env2cmake ALPAKA_CUDA_VERSION)" "$(env2cmake ALPAKA_ACC_GPU_CUDA_ONLY_MODE)" "$(env2cmake ALPAKA_CUDA_ARCH)" "$(env2cmake ALPAKA_CUDA_COMPILER)" \
-    "$(env2cmake ALPAKA_CUDA_FAST_MATH)" "$(env2cmake ALPAKA_CUDA_FTZ)" "$(env2cmake ALPAKA_CUDA_SHOW_REGISTER)" "$(env2cmake ALPAKA_CUDA_KEEP_FILES)" "$(env2cmake ALPAKA_CUDA_NVCC_EXPT_EXTENDED_LAMBDA)" "$(env2cmake ALPAKA_CUDA_NVCC_EXPT_RELAXED_CONSTEXPR)" "$(env2cmake ALPAKA_CUDA_NVCC_SEPARABLE_COMPILATION)" \
-    "$(env2cmake ALPAKA_ACC_GPU_HIP_ENABLE)" "$(env2cmake ALPAKA_ACC_GPU_HIP_ONLY_MODE)" "$(env2cmake ALPAKA_HIP_PLATFORM)" \
-    "$(env2cmake ALPAKA_DEBUG)" "$(env2cmake ALPAKA_CI)" "$(env2cmake ALPAKA_CI_ANALYSIS)" "$(env2cmake ALPAKA_CXX_STANDARD)" \
-    ".."
-if [ "$TRAVIS_OS_NAME" = "linux" ] || [ "$TRAVIS_OS_NAME" = "osx" ]
-then
-    make VERBOSE=1
-elif [ "$TRAVIS_OS_NAME" = "windows" ]
-then
-    MSBuild.exe "alpakaAll.sln" -p:Configuration=${CMAKE_BUILD_TYPE} -maxcpucount:2 -verbosity:minimal
-fi
-
-cd ..
diff --git a/thirdParty/alpaka/script/travis/run_tests.sh b/thirdParty/alpaka/script/travis/run_tests.sh
deleted file mode 100755
index 656bffc069..0000000000
--- a/thirdParty/alpaka/script/travis/run_tests.sh
+++ /dev/null
@@ -1,31 +0,0 @@
-#!/bin/bash
-
-#
-# Copyright 2017-2019 Benjamin Worpitz
-#
-# This file is part of Alpaka.
-#
-# This Source Code Form is subject to the terms of the Mozilla Public
-# License, v. 2.0. If a copy of the MPL was not distributed with this
-# file, You can obtain one at http://mozilla.org/MPL/2.0/.
-#
-
-source ./script/travis/set.sh
-
-: "${ALPAKA_ACC_GPU_CUDA_ENABLE?'ALPAKA_ACC_GPU_CUDA_ENABLE must be specified'}"
-: "${ALPAKA_ACC_GPU_HIP_ENABLE?'ALPAKA_ACC_GPU_HIP_ENABLE must be specified'}"
-
-if [ "${ALPAKA_ACC_GPU_CUDA_ENABLE}" == "OFF" ] && [ "${ALPAKA_ACC_GPU_HIP_ENABLE}" == "OFF" ];
-then
-    cd build/
-
-    if [ "$TRAVIS_OS_NAME" = "linux" ] || [ "$TRAVIS_OS_NAME" = "osx" ]
-    then
-        ctest -V
-    elif [ "$TRAVIS_OS_NAME" = "windows" ]
-    then
-        ctest -V -C ${CMAKE_BUILD_TYPE}
-    fi
-
-    cd ..
-fi
diff --git a/thirdParty/alpaka/script/travis/script.sh b/thirdParty/alpaka/script/travis/script.sh
deleted file mode 100755
index 666b4ab3e9..0000000000
--- a/thirdParty/alpaka/script/travis/script.sh
+++ /dev/null
@@ -1,26 +0,0 @@
-#!/bin/bash
-
-#
-# Copyright 2018-2019 Benjamin Worpitz
-#
-# This file is part of Alpaka.
-#
-# This Source Code Form is subject to the terms of the Mozilla Public
-# License, v. 2.0. If a copy of the MPL was not distributed with this
-# file, You can obtain one at http://mozilla.org/MPL/2.0/.
-#
-
-source ./script/travis/set.sh
-
-./script/travis/print_travisEnv.sh
-source ./script/travis/before_install.sh
-
-if [ "$TRAVIS_OS_NAME" = "linux" ]
-then
-  ./script/travis/docker_install.sh
-  ./script/travis/docker_run.sh
-elif [ "$TRAVIS_OS_NAME" = "windows" ] || [ "$TRAVIS_OS_NAME" = "osx" ]
-then
-  ./script/travis/install.sh
-  ./script/travis/run.sh
-fi
diff --git a/thirdParty/alpaka/script/travis/set.sh b/thirdParty/alpaka/script/travis/set.sh
deleted file mode 100755
index 262b6a77e5..0000000000
--- a/thirdParty/alpaka/script/travis/set.sh
+++ /dev/null
@@ -1,19 +0,0 @@
-#!/bin/bash
-
-#
-# Copyright 2018-2019 Benjamin Worpitz
-#
-# This file is part of Alpaka.
-#
-# This Source Code Form is subject to the terms of the Mozilla Public
-# License, v. 2.0. If a copy of the MPL was not distributed with this
-# file, You can obtain one at http://mozilla.org/MPL/2.0/.
-#
-
-#-------------------------------------------------------------------------------
-# -e: exit as soon as one command returns a non-zero exit code
-# -o pipefail: pipeline returns exit code of the rightmost command with a non-zero exit code
-# -u: treat unset variables as an error
-# -v: Print shell input lines as they are read
-# -x: Print command traces before executing command
-set -eouvx pipefail
diff --git a/thirdParty/alpaka/script/travis/travis_retry.sh b/thirdParty/alpaka/script/travis/travis_retry.sh
deleted file mode 100755
index d29ab93241..0000000000
--- a/thirdParty/alpaka/script/travis/travis_retry.sh
+++ /dev/null
@@ -1,31 +0,0 @@
-#!/bin/bash
-#
-# Copyright 2019 Benjamin Worpitz
-#
-# This file is part of Alpaka.
-#
-# This Source Code Form is subject to the terms of the Mozilla Public
-# License, v. 2.0. If a copy of the MPL was not distributed with this
-# file, You can obtain one at http://mozilla.org/MPL/2.0/.
-#
-
-set -euo pipefail
-
-travis_retry() {
-  local result=0
-  local count=1
-  while [ $count -le 3 ]; do
-    [ $result -ne 0 ] && {
-      echo -e "\n${ANSI_RED}The command \"$*\" failed. Retrying, $count of 3.${ANSI_RESET}\n" >&2
-    }
-    "$@"
-    result=$?
-    [ $result -eq 0 ] && break
-    count=$((count + 1))
-    sleep 1
-  done
-  [ $count -gt 3 ] && {
-    echo -e "\n${ANSI_RED}The command \"$*\" failed 3 times.${ANSI_RESET}\n" >&2
-  }
-  return $result
-}
diff --git a/thirdParty/alpaka/test/CMakeLists.txt b/thirdParty/alpaka/test/CMakeLists.txt
deleted file mode 100644
index e8140a1171..0000000000
--- a/thirdParty/alpaka/test/CMakeLists.txt
+++ /dev/null
@@ -1,80 +0,0 @@
-#
-# Copyright 2015-2019 Benjamin Worpitz, Axel Huebl
-#
-# This file is part of Alpaka.
-#
-# This Source Code Form is subject to the terms of the Mozilla Public
-# License, v. 2.0. If a copy of the MPL was not distributed with this
-# file, You can obtain one at http://mozilla.org/MPL/2.0/.
-#
-
-CMAKE_MINIMUM_REQUIRED(VERSION 3.11.4)
-
-# Search in <PackageName>_ROOT:
-# https://cmake.org/cmake/help/v3.12/policy/CMP0074.html
-if(POLICY CMP0074)
-    cmake_policy(SET CMP0074 NEW)
-endif()
-
-LIST(APPEND CMAKE_MODULE_PATH "${ALPAKA_ROOT}")
-FIND_PACKAGE(alpaka REQUIRED)
-
-ADD_SUBDIRECTORY("common/")
-
-OPTION(ALPAKA_USE_INTERNAL_CATCH2 "Use internally shipped Catch2" ON)
-
-IF(ALPAKA_USE_INTERNAL_CATCH2)
-    message(STATUS "Catch2: Using INTERNAL version 2.11.0")
-ELSE()
-    find_package(Catch2 2.11.0 CONFIG REQUIRED)
-    set_target_properties(Catch2::Catch2 PROPERTIES IMPORTED_GLOBAL TRUE)
-    message(STATUS "Catch2: Found version ${Catch2_VERSION}")
-ENDIF()
-
-add_library(CatchMain CatchMain.cpp)
-# target_compile_features(CatchMain PUBLIC cxx_std_11)  # min C++11
-set_target_properties(CatchMain PROPERTIES
-    FOLDER "test"
-    CXX_STANDARD 11  # exactly C++11
-    CXX_EXTENSIONS OFF
-    CXX_STANDARD_REQUIRED ON
-    POSITION_INDEPENDENT_CODE ON
-    WINDOWS_EXPORT_ALL_SYMBOLS ON
-)
-target_compile_definitions(CatchMain PUBLIC "CATCH_CONFIG_FAST_COMPILE")
-IF(MSVC)
-    target_compile_definitions(CatchMain PUBLIC "CATCH_CONFIG_WINDOWS_CRTDBG")
-    target_compile_options(CatchMain PUBLIC "/bigobj")
-ENDIF()
-
-IF(ALPAKA_USE_INTERNAL_CATCH2)
-    target_include_directories(CatchMain SYSTEM PUBLIC
-        ${CMAKE_CURRENT_LIST_DIR}/../thirdParty/catch2/include)
-ELSE()
-    target_include_directories(CatchMain SYSTEM PUBLIC
-        $<TARGET_PROPERTY:Catch2::Catch2,INTERFACE_INCLUDE_DIRECTORIES>)
-ENDIF()
-SET_TARGET_PROPERTIES(
-    CatchMain
-    PROPERTIES FOLDER "test")
-
-IF(ALPAKA_ACC_GPU_CUDA_ENABLE AND ALPAKA_CUDA_COMPILER MATCHES "nvcc")
-    # NVCC does not incorporate the COMPILE_OPTIONS of a target but only the CMAKE_CXX_FLAGS
-    GET_TARGET_PROPERTY(_COMMON_COMPILE_OPTIONS common COMPILE_OPTIONS)
-    # If the property does not exist, the variable is set to NOTFOUND.
-    IF(_COMMON_COMPILE_OPTIONS)
-        STRING(REPLACE ";" " " _COMMON_COMPILE_OPTIONS_STRING "${_COMMON_COMPILE_OPTIONS}")
-        SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${_COMMON_COMPILE_OPTIONS_STRING}")
-    ENDIF()
-    # nvcc supports werror starting with 10.2
-    IF(CUDA_VERSION GREATER_EQUAL 10.2)
-        MESSAGE("adding -Werror=all-warnings")
-        LIST(APPEND CUDA_NVCC_FLAGS -Werror=all-warnings)
-    ENDIF()
-ENDIF()
-
-LIST(APPEND _ALPAKA_TEST_OPTIONS "--use-colour yes")
-
-ADD_SUBDIRECTORY("analysis/")
-ADD_SUBDIRECTORY("integ/")
-ADD_SUBDIRECTORY("unit/")
diff --git a/thirdParty/alpaka/test/CatchMain.cpp b/thirdParty/alpaka/test/CatchMain.cpp
deleted file mode 100644
index 31734b3a0a..0000000000
--- a/thirdParty/alpaka/test/CatchMain.cpp
+++ /dev/null
@@ -1,10 +0,0 @@
-/* Copyright 2019 Axel Huebl
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-#define CATCH_CONFIG_MAIN
-#include <catch2/catch.hpp>
diff --git a/thirdParty/alpaka/test/analysis/CMakeLists.txt b/thirdParty/alpaka/test/analysis/CMakeLists.txt
deleted file mode 100644
index b1481957e1..0000000000
--- a/thirdParty/alpaka/test/analysis/CMakeLists.txt
+++ /dev/null
@@ -1,23 +0,0 @@
-#
-# Copyright 2014-2019 Benjamin Worpitz
-#
-# This file is part of Alpaka.
-#
-# This Source Code Form is subject to the terms of the Mozilla Public
-# License, v. 2.0. If a copy of the MPL was not distributed with this
-# file, You can obtain one at http://mozilla.org/MPL/2.0/.
-#
-
-################################################################################
-# Required CMake version.
-################################################################################
-
-CMAKE_MINIMUM_REQUIRED(VERSION 3.11.0)
-
-PROJECT("alpakaAnalysisTest")
-
-################################################################################
-# Add subdirectories.
-################################################################################
-
-ADD_SUBDIRECTORY("headerCheck/")
diff --git a/thirdParty/alpaka/test/analysis/headerCheck/CMakeLists.txt b/thirdParty/alpaka/test/analysis/headerCheck/CMakeLists.txt
deleted file mode 100644
index b37407d1df..0000000000
--- a/thirdParty/alpaka/test/analysis/headerCheck/CMakeLists.txt
+++ /dev/null
@@ -1,59 +0,0 @@
-#
-# Copyright 2014-2019 Benjamin Worpitz, Axel Huebl
-#
-# This file is part of Alpaka.
-#
-# This Source Code Form is subject to the terms of the Mozilla Public
-# License, v. 2.0. If a copy of the MPL was not distributed with this
-# file, You can obtain one at http://mozilla.org/MPL/2.0/.
-#
-
-IF(NOT ALPAKA_CI OR (ALPAKA_CI AND ALPAKA_CI_ANALYSIS))
-
-SET(_TARGET_NAME "headerCheck")
-
-#-------------------------------------------------------------------------------
-# Create source files.
-
-SET(_ALPAKA_INCLUDE_DIRECTORY "${_ALPAKA_ROOT_DIR}/include")
-SET(_ALPAKA_SUFFIXED_INCLUDE_DIR "${_ALPAKA_INCLUDE_DIRECTORY}/alpaka")
-append_recursive_files("${_ALPAKA_SUFFIXED_INCLUDE_DIR}" "hpp" "_ALPAKA_FILES_HEADER")
-
-SET(_GENERATED_SOURCE_DIR "${CMAKE_CURRENT_BINARY_DIR}/src")
-
-FILE(REMOVE_RECURSE ${_GENERATED_SOURCE_DIR})
-
-FOREACH(_HEADER_FILE ${_ALPAKA_FILES_HEADER})
-    # Remove the parent directory from the path.
-    # NOTE: This is not correct because it does not only replace at the beginning of the string.
-    #  "STRING(REGEX REPLACE" would be correct if there was an easy way to escape arbitrary strings.
-    STRING(
-        REPLACE "${_ALPAKA_SUFFIXED_INCLUDE_DIR}/" ""
-        _HEADER_FILE
-        "${_HEADER_FILE}")
-    SET(_SOURCE_FILE "${_GENERATED_SOURCE_DIR}/${_HEADER_FILE}.cpp")
-    FILE(WRITE "${_SOURCE_FILE}" "#include <alpaka/${_HEADER_FILE}>\n#include <alpaka/${_HEADER_FILE}>\n")
-ENDFOREACH()
-
-#-------------------------------------------------------------------------------
-# Add executable.
-
-append_recursive_files_add_to_src_group("${_GENERATED_SOURCE_DIR}" "${_GENERATED_SOURCE_DIR}" "cpp" "_FILES_SOURCE")
-LIST(APPEND _FILES_SOURCE "src/main.cpp")
-
-# Always add all files to the target executable build call to add them to the build project.
-ALPAKA_ADD_EXECUTABLE(
-    ${_TARGET_NAME}
-    ${_FILES_SOURCE})
-TARGET_INCLUDE_DIRECTORIES(
-    ${_TARGET_NAME}
-    PRIVATE ${Boost_INCLUDE_DIRS})
-TARGET_LINK_LIBRARIES(
-    ${_TARGET_NAME}
-    PRIVATE common)
-
-SET_TARGET_PROPERTIES(headerCheck PROPERTIES FOLDER "test/analysis")
-
-ADD_TEST(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME} ${_ALPAKA_TEST_OPTIONS})
-
-ENDIF()
diff --git a/thirdParty/alpaka/test/analysis/headerCheck/src/main.cpp b/thirdParty/alpaka/test/analysis/headerCheck/src/main.cpp
deleted file mode 100644
index 11d6b7196f..0000000000
--- a/thirdParty/alpaka/test/analysis/headerCheck/src/main.cpp
+++ /dev/null
@@ -1,16 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#include <catch2/catch.hpp>
-
-
-TEST_CASE("headerCheckMain", "[headerCheck]")
-{
-    REQUIRE(true);
-}
diff --git a/thirdParty/alpaka/test/common/CMakeLists.txt b/thirdParty/alpaka/test/common/CMakeLists.txt
deleted file mode 100644
index 9d622dc02a..0000000000
--- a/thirdParty/alpaka/test/common/CMakeLists.txt
+++ /dev/null
@@ -1,63 +0,0 @@
-#
-# Copyright 2014-2019 Benjamin Worpitz
-#
-# This file is part of Alpaka.
-#
-# This Source Code Form is subject to the terms of the Mozilla Public
-# License, v. 2.0. If a copy of the MPL was not distributed with this
-# file, You can obtain one at http://mozilla.org/MPL/2.0/.
-#
-
-CMAKE_MINIMUM_REQUIRED(VERSION 3.11.4)
-
-SET(_COMMON_TARGET_NAME "common")
-
-SET(_COMMON_INCLUDE_DIRECTORY "${CMAKE_CURRENT_LIST_DIR}/include")
-LIST(APPEND _COMMON_INCLUDE_DIRECTORIES_PUBLIC "${_COMMON_INCLUDE_DIRECTORY}")
-SET(_COMMON_SOURCE_DIRECTORY "${CMAKE_CURRENT_LIST_DIR}/src")
-
-# Add all the source files in all recursive subdirectories and group them accordingly.
-append_recursive_files_add_to_src_group("${_COMMON_INCLUDE_DIRECTORY}" "${_COMMON_INCLUDE_DIRECTORY}" "hpp" _COMMON_FILES_HEADER)
-append_recursive_files_add_to_src_group("${_COMMON_SOURCE_DIRECTORY}" "${_COMMON_SOURCE_DIRECTORY}" "cpp" _COMMON_FILES_SOURCE)
-
-INCLUDE("${_ALPAKA_ROOT_DIR}/cmake/dev.cmake")
-LIST(APPEND _COMMON_COMPILE_OPTIONS_PUBLIC ${ALPAKA_DEV_COMPILE_OPTIONS})
-IF(MSVC)
-    LIST(APPEND _COMMON_COMPILE_OPTIONS_PUBLIC "/wd4996")   # This function or variable may be unsafe. Consider using <safe_version> instead.
-ENDIF()
-
-IF(ALPAKA_ACC_GPU_CUDA_ENABLE OR (ALPAKA_ACC_GPU_HIP_ENABLE AND HIP_PLATFORM MATCHES "nvcc"))
-    # CUDA driver API is used by EventHostManualTrigger
-    LIST(APPEND _COMMON_LINK_LIBRARIES_PUBLIC "${CUDA_CUDA_LIBRARY}")
-    LIST(APPEND _COMMON_COMPILE_DEFINITIONS_PUBLIC "CUDA_API_PER_THREAD_DEFAULT_STREAM")
-ENDIF()
-
-ADD_LIBRARY(
-    ${_COMMON_TARGET_NAME}
-    STATIC
-    ${_COMMON_FILES_HEADER} ${_COMMON_FILES_SOURCE})
-TARGET_INCLUDE_DIRECTORIES(
-    ${_COMMON_TARGET_NAME}
-    PUBLIC ${_COMMON_INCLUDE_DIRECTORIES_PUBLIC})
-LIST(
-    LENGTH
-    _COMMON_COMPILE_DEFINITIONS_PUBLIC
-    _COMMON_COMPILE_DEFINITIONS_PUBLIC_LENGTH)
-IF(${_COMMON_COMPILE_DEFINITIONS_PUBLIC_LENGTH} GREATER 0)
-    TARGET_COMPILE_DEFINITIONS(
-        ${_COMMON_TARGET_NAME}
-        PUBLIC ${_COMMON_COMPILE_DEFINITIONS_PUBLIC})
-ENDIF()
-TARGET_COMPILE_OPTIONS(
-    ${_COMMON_TARGET_NAME}
-    PUBLIC ${_COMMON_COMPILE_OPTIONS_PUBLIC})
-TARGET_LINK_LIBRARIES(
-    ${_COMMON_TARGET_NAME}
-    PUBLIC "alpaka;${_COMMON_LINK_LIBRARIES_PUBLIC}")
-SET_TARGET_PROPERTIES(
-    ${_COMMON_TARGET_NAME}
-    PROPERTIES FOLDER "test")
-
-TARGET_LINK_LIBRARIES(
-    ${_COMMON_TARGET_NAME}
-    PUBLIC CatchMain)
diff --git a/thirdParty/alpaka/test/common/include/alpaka/test/Array.hpp b/thirdParty/alpaka/test/common/include/alpaka/test/Array.hpp
deleted file mode 100644
index 89291cb2a8..0000000000
--- a/thirdParty/alpaka/test/common/include/alpaka/test/Array.hpp
+++ /dev/null
@@ -1,40 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-namespace alpaka
-{
-    namespace test
-    {
-        //#############################################################################
-        template<
-            typename TType,
-            size_t TSize>
-        struct Array {
-            TType m_data[TSize];
-
-            template<
-                typename T_Idx>
-            ALPAKA_FN_HOST_ACC const TType &operator[](
-                const T_Idx idx) const
-            {
-                return m_data[idx];
-            }
-
-            template<
-                typename TIdx>
-            ALPAKA_FN_HOST_ACC TType & operator[](
-                const TIdx idx)
-            {
-                return m_data[idx];
-            }
-        };
-    }
-}
diff --git a/thirdParty/alpaka/test/common/include/alpaka/test/Check.hpp b/thirdParty/alpaka/test/common/include/alpaka/test/Check.hpp
deleted file mode 100644
index c7acd759d5..0000000000
--- a/thirdParty/alpaka/test/common/include/alpaka/test/Check.hpp
+++ /dev/null
@@ -1,22 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#include <cstdio>
-
-#define ALPAKA_CHECK(success, expression) \
-    do \
-    { \
-        if(!(expression)) \
-        { \
-            printf("ALPAKA_CHECK failed because '!(%s)'\n", #expression); \
-            success = false; \
-        } \
-    } while ( 0 )
diff --git a/thirdParty/alpaka/test/common/include/alpaka/test/Extent.hpp b/thirdParty/alpaka/test/common/include/alpaka/test/Extent.hpp
deleted file mode 100644
index cc74121902..0000000000
--- a/thirdParty/alpaka/test/common/include/alpaka/test/Extent.hpp
+++ /dev/null
@@ -1,64 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz, Matthias Werner
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-namespace alpaka
-{
-    //-----------------------------------------------------------------------------
-    //! The test specifics.
-    namespace test
-    {
-        //#############################################################################
-        //! 1D: (5)
-        //! 2D: (5, 4)
-        //! 3D: (5, 4, 3)
-        //! 4D: (5, 4, 3, 2)
-        // We have to be careful with the extents used.
-        // When TIdx is a 8 bit signed integer and Dim is 4, the extent is extremely limited.
-        template<
-            std::size_t Tidx>
-        struct CreateExtentBufVal
-        {
-            //-----------------------------------------------------------------------------
-            ALPAKA_NO_HOST_ACC_WARNING
-            template<
-                typename TIdx>
-            ALPAKA_FN_HOST_ACC
-            static auto create(
-                TIdx)
-            -> TIdx
-            {
-                return static_cast<TIdx>(5u - Tidx);
-            }
-        };
-
-        //#############################################################################
-        //! 1D: (4)
-        //! 2D: (4, 3)
-        //! 3D: (4, 3, 2)
-        //! 4D: (4, 3, 2, 1)
-        template<
-            std::size_t Tidx>
-        struct CreateExtentViewVal
-        {
-            //-----------------------------------------------------------------------------
-            ALPAKA_NO_HOST_ACC_WARNING
-            template<
-                typename TIdx>
-            ALPAKA_FN_HOST_ACC
-            static auto create(
-                TIdx)
-            -> TIdx
-            {
-                return static_cast<TIdx>(4u - Tidx);
-            }
-        };
-    }
-}
diff --git a/thirdParty/alpaka/test/common/include/alpaka/test/KernelExecutionFixture.hpp b/thirdParty/alpaka/test/common/include/alpaka/test/KernelExecutionFixture.hpp
deleted file mode 100644
index ea11697c0e..0000000000
--- a/thirdParty/alpaka/test/common/include/alpaka/test/KernelExecutionFixture.hpp
+++ /dev/null
@@ -1,93 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#include <alpaka/alpaka.hpp>
-
-#include <alpaka/test/Check.hpp>
-#include <alpaka/test/queue/Queue.hpp>
-
-namespace alpaka
-{
-    namespace test
-    {
-        //#############################################################################
-        //! The fixture for executing a kernel on a given accelerator.
-        template<
-            typename TAcc>
-        class KernelExecutionFixture
-        {
-        public:
-            using Acc = TAcc;
-            using Dim = alpaka::dim::Dim<Acc>;
-            using Idx = alpaka::idx::Idx<Acc>;
-            using DevAcc = alpaka::dev::Dev<Acc>;
-            using PltfAcc = alpaka::pltf::Pltf<DevAcc>;
-            using QueueAcc = alpaka::test::queue::DefaultQueue<DevAcc>;
-
-        public:
-            //-----------------------------------------------------------------------------
-            template<
-                typename TExtent>
-            KernelExecutionFixture(
-                TExtent const & extent) :
-                    m_devHost(alpaka::pltf::getDevByIdx<pltf::PltfCpu>(0u)),
-                    m_devAcc(alpaka::pltf::getDevByIdx<PltfAcc>(0u)),
-                    m_queue(m_devAcc),
-                    m_workDiv(
-                        alpaka::workdiv::getValidWorkDiv<Acc>(
-                            m_devAcc,
-                            extent,
-                            alpaka::vec::Vec<Dim, Idx>::ones(),
-                            false,
-                            alpaka::workdiv::GridBlockExtentSubDivRestrictions::Unrestricted))
-            {}
-            //-----------------------------------------------------------------------------
-            template<
-                typename TKernelFnObj,
-                typename... TArgs>
-            auto operator()(
-                TKernelFnObj const & kernelFnObj,
-                TArgs && ... args)
-            -> bool
-            {
-                // Allocate the result value
-                auto bufAccResult(alpaka::mem::buf::alloc<bool, Idx>(m_devAcc, static_cast<Idx>(1u)));
-                alpaka::mem::view::set(
-                    m_queue,
-                    bufAccResult,
-                    static_cast<std::uint8_t>(true),
-                    bufAccResult);
-
-                alpaka::kernel::exec<Acc>(
-                    m_queue,
-                    m_workDiv,
-                    kernelFnObj,
-                    alpaka::mem::view::getPtrNative(bufAccResult),
-                    std::forward<TArgs>(args)...);
-
-                // Copy the result value to the host
-                auto bufHostResult(alpaka::mem::buf::alloc<bool, Idx>(m_devHost, static_cast<Idx>(1u)));
-                alpaka::mem::view::copy(m_queue, bufHostResult, bufAccResult, bufAccResult);
-                alpaka::wait::wait(m_queue);
-
-                auto const result(*alpaka::mem::view::getPtrNative(bufHostResult));
-
-                return result;
-            }
-
-        private:
-            alpaka::dev::DevCpu m_devHost;
-            DevAcc m_devAcc;
-            QueueAcc m_queue;
-            alpaka::workdiv::WorkDivMembers<Dim, Idx> m_workDiv;
-        };
-    }
-}
diff --git a/thirdParty/alpaka/test/common/include/alpaka/test/MeasureKernelRunTime.hpp b/thirdParty/alpaka/test/common/include/alpaka/test/MeasureKernelRunTime.hpp
deleted file mode 100644
index 790b8fd512..0000000000
--- a/thirdParty/alpaka/test/common/include/alpaka/test/MeasureKernelRunTime.hpp
+++ /dev/null
@@ -1,62 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#include <alpaka/alpaka.hpp>
-
-#include <type_traits>
-#include <utility>
-
-namespace alpaka
-{
-    namespace test
-    {
-        namespace integ
-        {
-            //-----------------------------------------------------------------------------
-            //! \return The run time of the given kernel.
-            template<
-                typename TQueue,
-                typename TTask>
-            auto measureTaskRunTimeMs(
-                TQueue & queue,
-                TTask && task)
-            -> std::chrono::milliseconds::rep
-            {
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
-                std::cout
-                    << "measureKernelRunTime("
-                    << " queue: " << typeid(TQueue).name()
-                    << " task: " << typeid(typename std::decay<TTask>::type).name()
-                    << ")" << std::endl;
-#endif
-                // Wait for the queue to finish all tasks enqueued prior to the giventask.
-                alpaka::wait::wait(queue);
-
-                // Take the time prior to the execution.
-                auto const tpStart(std::chrono::high_resolution_clock::now());
-
-                // Enqueue the task.
-                alpaka::queue::enqueue(queue, std::forward<TTask>(task));
-
-                // Wait for the queue to finish the task execution to measure its run time.
-                alpaka::wait::wait(queue);
-
-                // Take the time after the execution.
-                auto const tpEnd(std::chrono::high_resolution_clock::now());
-
-                auto const durElapsed(tpEnd - tpStart);
-
-                // Return the duration.
-                return std::chrono::duration_cast<std::chrono::milliseconds>(durElapsed).count();
-            }
-        }
-    }
-}
diff --git a/thirdParty/alpaka/test/common/include/alpaka/test/acc/TestAccs.hpp b/thirdParty/alpaka/test/common/include/alpaka/test/acc/TestAccs.hpp
deleted file mode 100644
index a418f80a7c..0000000000
--- a/thirdParty/alpaka/test/common/include/alpaka/test/acc/TestAccs.hpp
+++ /dev/null
@@ -1,285 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz, Erik Zenker, Matthias Werner
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#include <alpaka/alpaka.hpp>
-
-#include <alpaka/test/dim/TestDims.hpp>
-#include <alpaka/test/idx/TestIdxs.hpp>
-
-#include <tuple>
-#include <type_traits>
-#include <iosfwd>
-
-// When compiling the tests with CUDA enabled (nvcc or native clang) on the CI infrastructure
-// we have to dramatically reduce the number of tested combinations.
-// Else the log length would be exceeded.
-#if defined(ALPAKA_CI)
-  #if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && BOOST_LANG_CUDA \
-   || defined(ALPAKA_ACC_GPU_HIP_ENABLED) && BOOST_LANG_HIP && !BOOST_COMP_HCC
-    #define ALPAKA_CUDA_CI
-  #endif
-#endif
-
-namespace alpaka
-{
-    //-----------------------------------------------------------------------------
-    //! The test specifics.
-    namespace test
-    {
-        //-----------------------------------------------------------------------------
-        //! The test accelerator specifics.
-        namespace acc
-        {
-            //-----------------------------------------------------------------------------
-            //! The detail namespace is used to separate implementation details from user accessible code.
-            namespace detail
-            {
-#if defined(ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED)
-                template<
-                    typename TDim,
-                    typename TIdx>
-                using AccCpuSerialIfAvailableElseInt = alpaka::acc::AccCpuSerial<TDim, TIdx>;
-#else
-                template<
-                    typename TDim,
-                    typename TIdx>
-                using AccCpuSerialIfAvailableElseInt = int;
-#endif
-#if defined(ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED) && !defined(ALPAKA_CUDA_CI)
-                template<
-                    typename TDim,
-                    typename TIdx>
-                using AccCpuThreadsIfAvailableElseInt = alpaka::acc::AccCpuThreads<TDim, TIdx>;
-#else
-                template<
-                    typename TDim,
-                    typename TIdx>
-                using AccCpuThreadsIfAvailableElseInt = int;
-#endif
-#if defined(ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLED)
-                template<
-                    typename TDim,
-                    typename TIdx>
-                using AccCpuFibersIfAvailableElseInt = alpaka::acc::AccCpuFibers<TDim, TIdx>;
-#else
-                template<
-                    typename TDim,
-                    typename TIdx>
-                using AccCpuFibersIfAvailableElseInt = int;
-#endif
-#if defined(ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED)
-                template<
-                    typename TDim,
-                    typename TIdx>
-                using AccCpuTbbIfAvailableElseInt = alpaka::acc::AccCpuTbbBlocks<TDim, TIdx>;
-#else
-                template<
-                    typename TDim,
-                    typename TIdx>
-                using AccCpuTbbIfAvailableElseInt = int;
-#endif
-#if defined(ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLED)
-                template<
-                    typename TDim,
-                    typename TIdx>
-                using AccCpuOmp2BlocksIfAvailableElseInt = alpaka::acc::AccCpuOmp2Blocks<TDim, TIdx>;
-#else
-                template<
-                    typename TDim,
-                    typename TIdx>
-                using AccCpuOmp2BlocksIfAvailableElseInt = int;
-#endif
-#if defined(ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLED) && !defined(ALPAKA_CUDA_CI)
-                template<
-                    typename TDim,
-                    typename TIdx>
-                using AccCpuOmp2ThreadsIfAvailableElseInt = alpaka::acc::AccCpuOmp2Threads<TDim, TIdx>;
-#else
-                template<
-                    typename TDim,
-                    typename TIdx>
-                using AccCpuOmp2ThreadsIfAvailableElseInt = int;
-#endif
-#if defined(ALPAKA_ACC_CPU_BT_OMP4_ENABLED) && !defined(ALPAKA_CUDA_CI)
-                template<
-                    typename TDim,
-                    typename TIdx>
-                using AccCpuOmp4IfAvailableElseInt = alpaka::acc::AccCpuOmp4<TDim, TIdx>;
-#else
-                template<
-                    typename TDim,
-                    typename TIdx>
-                using AccCpuOmp4IfAvailableElseInt = int;
-#endif
-#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && BOOST_LANG_CUDA
-                template<
-                    typename TDim,
-                    typename TIdx>
-                using AccGpuCudaRtIfAvailableElseInt = alpaka::acc::AccGpuCudaRt<TDim, TIdx>;
-#else
-                template<
-                    typename TDim,
-                    typename TIdx>
-                using AccGpuCudaRtIfAvailableElseInt = int;
-#endif
-#if defined(ALPAKA_ACC_GPU_HIP_ENABLED) && BOOST_LANG_HIP
-                template<
-                    typename TDim,
-                    typename TIdx>
-                using AccGpuHipRtIfAvailableElseInt = typename
-                    std::conditional<
-                    std::is_same<TDim,alpaka::dim::DimInt<3u>>::value==false,
-                    alpaka::acc::AccGpuHipRt<TDim, TIdx>,
-                    int>::type;
-#else
-                template<
-                    typename TDim,
-                    typename TIdx>
-                using AccGpuHipRtIfAvailableElseInt = int;
-#endif
-                //#############################################################################
-                //! A vector containing all available accelerators and void's.
-                template<
-                    typename TDim,
-                    typename TIdx>
-                using EnabledAccsElseInt =
-                    std::tuple<
-                        AccCpuSerialIfAvailableElseInt<TDim, TIdx>,
-                        AccCpuThreadsIfAvailableElseInt<TDim, TIdx>,
-                        AccCpuFibersIfAvailableElseInt<TDim, TIdx>,
-                        AccCpuTbbIfAvailableElseInt<TDim, TIdx>,
-                        AccCpuOmp2BlocksIfAvailableElseInt<TDim, TIdx>,
-                        AccCpuOmp2ThreadsIfAvailableElseInt<TDim, TIdx>,
-                        AccCpuOmp4IfAvailableElseInt<TDim, TIdx>,
-                        AccGpuCudaRtIfAvailableElseInt<TDim, TIdx>,
-                        AccGpuHipRtIfAvailableElseInt<TDim, TIdx>
-                    >;
-            }
-
-            //#############################################################################
-            //! A vector containing all available accelerators.
-            template<
-                typename TDim,
-                typename TIdx>
-            using EnabledAccs =
-                typename alpaka::meta::Filter<
-                    detail::EnabledAccsElseInt<TDim, TIdx>,
-                    std::is_class
-                >;
-
-            namespace detail
-            {
-                //#############################################################################
-                //! The accelerator name write wrapper.
-                struct StreamOutAccName
-                {
-                    template<
-                        typename TAcc>
-                    ALPAKA_FN_HOST auto operator()(
-                        std::ostream & os)
-                    -> void
-                    {
-                        os << alpaka::acc::getAccName<TAcc>();
-                        os << " ";
-                    }
-                };
-            }
-
-            //-----------------------------------------------------------------------------
-            //! Writes the enabled accelerators to the given stream.
-            template<
-                typename TDim,
-                typename TIdx>
-            ALPAKA_FN_HOST auto writeEnabledAccs(
-                std::ostream & os)
-            -> void
-            {
-                os << "Accelerators enabled: ";
-
-                alpaka::meta::forEachType<
-                    EnabledAccs<TDim, TIdx>>(
-                        detail::StreamOutAccName(),
-                        std::ref(os));
-
-                os << std::endl;
-            }
-
-            namespace detail
-            {
-                //#############################################################################
-                //! A std::tuple holding multiple std::tuple consisting of a dimension and a idx type.
-                //!
-                //! TestDimIdxTuples =
-                //!     tuple<
-                //!         tuple<Dim1,Idx1>,
-                //!         tuple<Dim2,Idx1>,
-                //!         tuple<Dim3,Idx1>,
-                //!         ...,
-                //!         tuple<DimN,IdxN>>
-                using TestDimIdxTuples =
-                    alpaka::meta::CartesianProduct<
-                        std::tuple,
-                        dim::TestDims,
-                        idx::TestIdxs
-                    >;
-
-                //#############################################################################
-                //! Transforms a std::tuple holding a dimension and a idx type to a fully instantiated accelerator.
-                //!
-                //! EnabledAccs<Dim,Idx> = tuple<Acc1<Dim,Idx>, ..., AccN<Dim,Idx>>
-                template<
-                    typename TTestAccParamSet>
-                struct InstantiateEnabledAccsWithTestParamSetImpl
-                {
-                    using type =
-                        EnabledAccs<
-                            typename std::tuple_element<0, TTestAccParamSet>::type,
-                            typename std::tuple_element<1, TTestAccParamSet>::type
-                        >;
-                };
-
-                template<
-                    typename TTestAccParamSet>
-                using InstantiateEnabledAccsWithTestParamSet = typename InstantiateEnabledAccsWithTestParamSetImpl<TTestAccParamSet>::type;
-
-                //#############################################################################
-                //! A std::tuple containing std::tuple with fully instantiated accelerators.
-                //!
-                //! TestEnabledAccs =
-                //!     tuple<
-                //!         tuple<Acc1<Dim1,Idx1>, ..., AccN<Dim1,Idx1>>,
-                //!         tuple<Acc1<Dim2,Idx1>, ..., AccN<Dim2,Idx1>>,
-                //!         ...,
-                //!         tuple<Acc1<DimN,IdxN>, ..., AccN<DimN,IdxN>>>
-                using InstantiatedEnabledAccs =
-                    alpaka::meta::Transform<
-                        TestDimIdxTuples,
-                        InstantiateEnabledAccsWithTestParamSet
-                    >;
-            }
-
-            //#############################################################################
-            //! A std::tuple containing fully instantiated accelerators.
-            //!
-            //! TestAccs =
-            //!     tuple<
-            //!         Acc1<Dim1,Idx1>, ..., AccN<Dim1,Idx1>,
-            //!         Acc1<Dim2,Idx1>, ..., AccN<Dim2,Idx1>,
-            //!         ...,
-            //!         Acc1<DimN,IdxN>, ..., AccN<DimN,IdxN>>
-            using TestAccs =
-                alpaka::meta::Apply<
-                    detail::InstantiatedEnabledAccs,
-                    alpaka::meta::Concatenate
-                >;
-        }
-    }
-}
diff --git a/thirdParty/alpaka/test/common/include/alpaka/test/dim/TestDims.hpp b/thirdParty/alpaka/test/common/include/alpaka/test/dim/TestDims.hpp
deleted file mode 100644
index f88ca302ac..0000000000
--- a/thirdParty/alpaka/test/common/include/alpaka/test/dim/TestDims.hpp
+++ /dev/null
@@ -1,50 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#include <alpaka/dim/DimIntegralConst.hpp>
-
-#include <tuple>
-
-// When compiling the tests with CUDA enabled (nvcc or native clang) on the CI infrastructure
-// we have to dramatically reduce the number of tested combinations.
-// Else the log length would be exceeded.
-#if defined(ALPAKA_CI)
-  #if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && BOOST_LANG_CUDA \
-   || defined(ALPAKA_ACC_GPU_HIP_ENABLED) && BOOST_LANG_HIP && !BOOST_COMP_HCC
-    #define ALPAKA_CUDA_CI
-  #endif
-#endif
-
-namespace alpaka
-{
-    namespace test
-    {
-        namespace dim
-        {
-            //#############################################################################
-            //! A std::tuple holding dimensions.
-            using TestDims =
-                std::tuple<
-                    alpaka::dim::DimInt<1u>
-#if !defined(ALPAKA_CUDA_CI)
-                    ,alpaka::dim::DimInt<2u>
-#endif
-                    ,alpaka::dim::DimInt<3u>
-                    // The CUDA & HIP accelerators do not currently support 4D buffers and 4D acceleration.
-#if !(defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && BOOST_LANG_CUDA)
-  #if !(defined(ALPAKA_ACC_GPU_HIP_ENABLED) && BOOST_LANG_HIP)
-                    ,alpaka::dim::DimInt<4u>
-  #endif
-#endif
-                >;
-        }
-    }
-}
diff --git a/thirdParty/alpaka/test/common/include/alpaka/test/event/EventHostManualTrigger.hpp b/thirdParty/alpaka/test/common/include/alpaka/test/event/EventHostManualTrigger.hpp
deleted file mode 100644
index d7d58ae867..0000000000
--- a/thirdParty/alpaka/test/common/include/alpaka/test/event/EventHostManualTrigger.hpp
+++ /dev/null
@@ -1,1003 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz, Matthias Werner
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#include <alpaka/alpaka.hpp>
-
-#include <mutex>
-#include <condition_variable>
-
-namespace alpaka
-{
-    //-----------------------------------------------------------------------------
-    //! The test specifics.
-    //-----------------------------------------------------------------------------
-    namespace test
-    {
-        //-----------------------------------------------------------------------------
-        //! The test event specifics.
-        //-----------------------------------------------------------------------------
-        namespace event
-        {
-            namespace traits
-            {
-                //#############################################################################
-                //!
-                //#############################################################################
-                template<
-                    typename TDev>
-                struct EventHostManualTriggerType;
-                //#############################################################################
-                //!
-                //#############################################################################
-                template<
-                    typename TDev>
-                struct IsEventHostManualTriggerSupported;
-            }
-
-            //#############################################################################
-            //! The event host manual trigger type trait alias template to remove the ::type.
-            //#############################################################################
-            template<
-                typename TDev>
-            using EventHostManualTrigger = typename traits::EventHostManualTriggerType<TDev>::type;
-
-            //-----------------------------------------------------------------------------
-            template<
-                typename TDev>
-            ALPAKA_FN_HOST auto isEventHostManualTriggerSupported(
-                TDev const & dev)
-            -> bool
-            {
-                return
-                    traits::IsEventHostManualTriggerSupported<
-                        TDev>
-                    ::isSupported(
-                        dev);
-            }
-
-            namespace cpu
-            {
-                namespace detail
-                {
-                    //#############################################################################
-                    //! Event that can be enqueued into a queue and can be triggered by the Host.
-                    //#############################################################################
-                    class EventHostManualTriggerCpuImpl
-                    {
-                    public:
-                        //-----------------------------------------------------------------------------
-                        //! Constructor.
-                        //-----------------------------------------------------------------------------
-                        ALPAKA_FN_HOST EventHostManualTriggerCpuImpl(
-                            dev::DevCpu const & dev) noexcept :
-                                m_dev(dev),
-                                m_mutex(),
-                                m_enqueueCount(0u),
-                                m_bIsReady(true)
-                        {}
-                        //-----------------------------------------------------------------------------
-                        //! Copy constructor.
-                        //-----------------------------------------------------------------------------
-                        EventHostManualTriggerCpuImpl(EventHostManualTriggerCpuImpl const & other) = delete;
-                        //-----------------------------------------------------------------------------
-                        //! Move constructor.
-                        //-----------------------------------------------------------------------------
-                        EventHostManualTriggerCpuImpl(EventHostManualTriggerCpuImpl &&) = delete;
-                        //-----------------------------------------------------------------------------
-                        //! Copy assignment operator.
-                        //-----------------------------------------------------------------------------
-                        auto operator=(EventHostManualTriggerCpuImpl const &) -> EventHostManualTriggerCpuImpl & = delete;
-                        //-----------------------------------------------------------------------------
-                        //! Move assignment operator.
-                        //-----------------------------------------------------------------------------
-                        auto operator=(EventHostManualTriggerCpuImpl &&) -> EventHostManualTriggerCpuImpl & = delete;
-
-                        //-----------------------------------------------------------------------------
-                        //!
-                        //-----------------------------------------------------------------------------
-                        void trigger()
-                        {
-                            {
-                                std::unique_lock<std::mutex> lock(m_mutex);
-                                m_bIsReady = true;
-                            }
-                            m_conditionVariable.notify_one();
-                        }
-
-                    public:
-                        dev::DevCpu const m_dev;                                //!< The device this event is bound to.
-
-                        mutable std::mutex m_mutex;                             //!< The mutex used to synchronize access to the event.
-
-                        mutable std::condition_variable m_conditionVariable;    //!< The condition signaling the event completion.
-                        std::size_t m_enqueueCount;                             //!< The number of times this event has been enqueued.
-
-                        bool m_bIsReady;                                        //!< If the event is not waiting within a queue (not enqueued or already completed).
-                    };
-                }
-            }
-
-            //#############################################################################
-            //! Event that can be enqueued into a queue and can be triggered by the Host.
-            //#############################################################################
-            class EventHostManualTriggerCpu
-            {
-            public:
-                //-----------------------------------------------------------------------------
-                //! Constructor.
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST EventHostManualTriggerCpu(
-                    dev::DevCpu const & dev) :
-                        m_spEventImpl(std::make_shared<cpu::detail::EventHostManualTriggerCpuImpl>(dev))
-                {}
-                //-----------------------------------------------------------------------------
-                //! Copy constructor.
-                //-----------------------------------------------------------------------------
-                EventHostManualTriggerCpu(EventHostManualTriggerCpu const &) = default;
-                //-----------------------------------------------------------------------------
-                //! Move constructor.
-                //-----------------------------------------------------------------------------
-                EventHostManualTriggerCpu(EventHostManualTriggerCpu &&) = default;
-                //-----------------------------------------------------------------------------
-                //! Copy assignment operator.
-                //-----------------------------------------------------------------------------
-                auto operator=(EventHostManualTriggerCpu const &) -> EventHostManualTriggerCpu & = default;
-                //-----------------------------------------------------------------------------
-                //! Move assignment operator.
-                //-----------------------------------------------------------------------------
-                auto operator=(EventHostManualTriggerCpu &&) -> EventHostManualTriggerCpu & = default;
-                //-----------------------------------------------------------------------------
-                //! Equality comparison operator.
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST auto operator==(EventHostManualTriggerCpu const & rhs) const
-                -> bool
-                {
-                    return (m_spEventImpl == rhs.m_spEventImpl);
-                }
-                //-----------------------------------------------------------------------------
-                //! Inequality comparison operator.
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST auto operator!=(EventHostManualTriggerCpu const & rhs) const
-                -> bool
-                {
-                    return !((*this) == rhs);
-                }
-
-                //-----------------------------------------------------------------------------
-                //!
-                //-----------------------------------------------------------------------------
-                void trigger()
-                {
-                    m_spEventImpl->trigger();
-                }
-
-            public:
-                std::shared_ptr<cpu::detail::EventHostManualTriggerCpuImpl> m_spEventImpl;
-            };
-
-            namespace traits
-            {
-                //#############################################################################
-                //!
-                //#############################################################################
-                template<>
-                struct EventHostManualTriggerType<
-                    alpaka::dev::DevCpu>
-                {
-                    using type = alpaka::test::event::EventHostManualTriggerCpu;
-                };
-                //#############################################################################
-                //! The CPU event host manual trigger support get trait specialization.
-                template<>
-                struct IsEventHostManualTriggerSupported<
-                    alpaka::dev::DevCpu>
-                {
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST static auto isSupported(
-                        alpaka::dev::DevCpu const &)
-                    -> bool
-                    {
-                        return true;
-                    }
-                };
-            }
-        }
-    }
-    namespace dev
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CPU device event device get trait specialization.
-            //#############################################################################
-            template<>
-            struct GetDev<
-                test::event::EventHostManualTriggerCpu>
-            {
-                //-----------------------------------------------------------------------------
-                //
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto getDev(
-                    test::event::EventHostManualTriggerCpu const & event)
-                -> dev::DevCpu
-                {
-                    return event.m_spEventImpl->m_dev;
-                }
-            };
-        }
-    }
-    namespace event
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CPU device event test trait specialization.
-            //#############################################################################
-            template<>
-            struct Test<
-                test::event::EventHostManualTriggerCpu>
-            {
-                //-----------------------------------------------------------------------------
-                //! \return If the event is not waiting within a queue (not enqueued or already handled).
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto test(
-                    test::event::EventHostManualTriggerCpu const & event)
-                -> bool
-                {
-                    std::lock_guard<std::mutex> lk(event.m_spEventImpl->m_mutex);
-
-                    return event.m_spEventImpl->m_bIsReady;
-                }
-            };
-        }
-    }
-    namespace queue
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //!
-            //#############################################################################
-            template<>
-            struct Enqueue<
-                queue::QueueCpuNonBlocking,
-                test::event::EventHostManualTriggerCpu>
-            {
-                //-----------------------------------------------------------------------------
-                //
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto enqueue(
-#if !(BOOST_COMP_CLANG_CUDA && BOOST_ARCH_PTX)
-                    queue::QueueCpuNonBlocking & queue,
-#else
-                    queue::QueueCpuNonBlocking &,
-#endif
-                    test::event::EventHostManualTriggerCpu & event)
-                -> void
-                {
-                    ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                    // Copy the shared pointer to ensure that the event implementation is alive as long as it is enqueued.
-                    auto spEventImpl(event.m_spEventImpl);
-
-                    // Setting the event state and enqueuing it has to be atomic.
-                    std::lock_guard<std::mutex> lk(spEventImpl->m_mutex);
-
-                    // The event should not yet be enqueued.
-                    ALPAKA_ASSERT(spEventImpl->m_bIsReady);
-
-                    // Set its state to enqueued.
-                    spEventImpl->m_bIsReady = false;
-
-                    // Increment the enqueue counter. This is used to skip waits for events that had already been finished and re-enqueued which would lead to deadlocks.
-                    ++spEventImpl->m_enqueueCount;
-
-                    // Workaround: Clang can not support this when natively compiling device code. See ConcurrentExecPool.hpp.
-#if !(BOOST_COMP_CLANG_CUDA && BOOST_ARCH_PTX)
-                    auto const enqueueCount = spEventImpl->m_enqueueCount;
-
-                    // Enqueue a task that only resets the events flag if it is completed.
-                    queue.m_spQueueImpl->m_workerThread.enqueueTask(
-                        [spEventImpl, enqueueCount]()
-                        {
-                            std::unique_lock<std::mutex> lk2(spEventImpl->m_mutex);
-                            spEventImpl->m_conditionVariable.wait(
-                                lk2,
-                                [spEventImpl, enqueueCount]
-                                {
-                                    return (enqueueCount != spEventImpl->m_enqueueCount) || spEventImpl->m_bIsReady;
-                                });
-                        });
-#endif
-                }
-            };
-            //#############################################################################
-            //!
-            //#############################################################################
-            template<>
-            struct Enqueue<
-                queue::QueueCpuBlocking,
-                test::event::EventHostManualTriggerCpu>
-            {
-                //-----------------------------------------------------------------------------
-                //
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto enqueue(
-                    queue::QueueCpuBlocking &,
-                    test::event::EventHostManualTriggerCpu & event)
-                -> void
-                {
-                    ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                    // Copy the shared pointer to ensure that the event implementation is alive as long as it is enqueued.
-                    auto spEventImpl(event.m_spEventImpl);
-
-                    // Setting the event state and enqueuing it has to be atomic.
-                    std::unique_lock<std::mutex> lk(spEventImpl->m_mutex);
-
-                    // The event should not yet be enqueued.
-                    ALPAKA_ASSERT(spEventImpl->m_bIsReady);
-
-                    // Set its state to enqueued.
-                    spEventImpl->m_bIsReady = false;
-
-                    // Increment the enqueue counter. This is used to skip waits for events that had already been finished and re-enqueued which would lead to deadlocks.
-                    ++spEventImpl->m_enqueueCount;
-
-                    auto const enqueueCount = spEventImpl->m_enqueueCount;
-
-                    spEventImpl->m_conditionVariable.wait(
-                        lk,
-                        [spEventImpl, enqueueCount]
-                        {
-                            return (enqueueCount != spEventImpl->m_enqueueCount) || spEventImpl->m_bIsReady;
-                        });
-                }
-            };
-        }
-    }
-}
-
-#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
-
-#include <cuda.h>
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_CUDA
-    #error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
-#endif
-
-#include <alpaka/core/Cuda.hpp>
-
-namespace alpaka
-{
-    namespace test
-    {
-        namespace event
-        {
-            namespace cuda
-            {
-                namespace detail
-                {
-                    //#############################################################################
-                    class EventHostManualTriggerCudaImpl final
-                    {
-                    public:
-                        //-----------------------------------------------------------------------------
-                        ALPAKA_FN_HOST EventHostManualTriggerCudaImpl(
-                            dev::DevCudaRt const & dev) :
-                                m_dev(dev),
-                                m_mutex(),
-                                m_bIsReady(true)
-                        {
-                            ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                            // Set the current device.
-                            ALPAKA_CUDA_RT_CHECK(
-                                cudaSetDevice(
-                                    m_dev.m_iDevice));
-                            // Allocate the buffer on this device.
-                            ALPAKA_CUDA_RT_CHECK(
-                                cudaMalloc(
-                                    &m_devMem,
-                                    static_cast<size_t>(sizeof(int32_t))));
-                            // Initiate the memory set.
-                            ALPAKA_CUDA_RT_CHECK(
-                                cudaMemset(
-                                    m_devMem,
-                                    static_cast<int>(0u),
-                                    static_cast<size_t>(sizeof(int32_t))));
-                        }
-                        //-----------------------------------------------------------------------------
-                        EventHostManualTriggerCudaImpl(EventHostManualTriggerCudaImpl const &) = delete;
-                        //-----------------------------------------------------------------------------
-                        EventHostManualTriggerCudaImpl(EventHostManualTriggerCudaImpl &&) = delete;
-                        //-----------------------------------------------------------------------------
-                        auto operator=(EventHostManualTriggerCudaImpl const &) -> EventHostManualTriggerCudaImpl & = delete;
-                        //-----------------------------------------------------------------------------
-                        auto operator=(EventHostManualTriggerCudaImpl &&) -> EventHostManualTriggerCudaImpl & = delete;
-                        //-----------------------------------------------------------------------------
-                        ALPAKA_FN_HOST ~EventHostManualTriggerCudaImpl()
-                        {
-                            ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                            // Set the current device.
-                            ALPAKA_CUDA_RT_CHECK(
-                                cudaSetDevice(
-                                    m_dev.m_iDevice));
-                            // Free the buffer.
-                            ALPAKA_CUDA_RT_CHECK(cudaFree(m_devMem));
-                        }
-
-                        //-----------------------------------------------------------------------------
-                        void trigger()
-                        {
-                            std::unique_lock<std::mutex> lock(m_mutex);
-                            m_bIsReady = true;
-
-                            // Set the current device.
-                            ALPAKA_CUDA_RT_CHECK(
-                                cudaSetDevice(
-                                    m_dev.m_iDevice));
-                            // Initiate the memory set.
-                            ALPAKA_CUDA_RT_CHECK(
-                                cudaMemset(
-                                    m_devMem,
-                                    static_cast<int>(1u),
-                                    static_cast<size_t>(sizeof(int32_t))));
-                        }
-
-                    public:
-                        dev::DevCudaRt const m_dev;     //!< The device this event is bound to.
-
-                        mutable std::mutex m_mutex;     //!< The mutex used to synchronize access to the event.
-                        void * m_devMem;
-
-                        bool m_bIsReady;                //!< If the event is not waiting within a queue (not enqueued or already completed).
-                    };
-                }
-            }
-
-            //#############################################################################
-            class EventHostManualTriggerCuda final
-            {
-            public:
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST EventHostManualTriggerCuda(
-                    dev::DevCudaRt const & dev) :
-                        m_spEventImpl(std::make_shared<cuda::detail::EventHostManualTriggerCudaImpl>(dev))
-                {
-                    ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-                }
-                //-----------------------------------------------------------------------------
-                EventHostManualTriggerCuda(EventHostManualTriggerCuda const &) = default;
-                //-----------------------------------------------------------------------------
-                EventHostManualTriggerCuda(EventHostManualTriggerCuda &&) = default;
-                //-----------------------------------------------------------------------------
-                auto operator=(EventHostManualTriggerCuda const &) -> EventHostManualTriggerCuda & = default;
-                //-----------------------------------------------------------------------------
-                auto operator=(EventHostManualTriggerCuda &&) -> EventHostManualTriggerCuda & = default;
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST auto operator==(EventHostManualTriggerCuda const & rhs) const
-                -> bool
-                {
-                    return (m_spEventImpl == rhs.m_spEventImpl);
-                }
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST auto operator!=(EventHostManualTriggerCuda const & rhs) const
-                -> bool
-                {
-                    return !((*this) == rhs);
-                }
-                //-----------------------------------------------------------------------------
-                ~EventHostManualTriggerCuda() = default;
-
-                //-----------------------------------------------------------------------------
-                void trigger()
-                {
-                    m_spEventImpl->trigger();
-                }
-
-            public:
-                std::shared_ptr<cuda::detail::EventHostManualTriggerCudaImpl> m_spEventImpl;
-            };
-
-            namespace traits
-            {
-                //#############################################################################
-                template<>
-                struct EventHostManualTriggerType<
-                    alpaka::dev::DevCudaRt>
-                {
-                    using type = alpaka::test::event::EventHostManualTriggerCuda;
-                };
-                //#############################################################################
-                //! The CPU event host manual trigger support get trait specialization.
-                template<>
-                struct IsEventHostManualTriggerSupported<
-                    alpaka::dev::DevCudaRt>
-                {
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST static auto isSupported(
-#if BOOST_LANG_CUDA >= BOOST_VERSION_NUMBER(9, 0, 0)
-                        alpaka::dev::DevCudaRt const & dev)
-#else
-                        alpaka::dev::DevCudaRt const &)
-#endif
-                    -> bool
-                    {
-#if BOOST_LANG_CUDA >= BOOST_VERSION_NUMBER(9, 0, 0)
-                        int result = 0;
-                        cuDeviceGetAttribute(
-                            &result,
-                            CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS,
-                            dev.m_iDevice);
-                        return result != 0;
-#else
-                        // In CUDA 8.0 there is no way to find out if those operations are really supported.
-                        return false;
-#endif
-                    }
-                };
-            }
-        }
-    }
-    namespace dev
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CPU device event device get trait specialization.
-            template<>
-            struct GetDev<
-                test::event::EventHostManualTriggerCuda>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto getDev(
-                    test::event::EventHostManualTriggerCuda const & event)
-                -> dev::DevCudaRt
-                {
-                    return event.m_spEventImpl->m_dev;
-                }
-            };
-        }
-    }
-    namespace event
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CPU device event test trait specialization.
-            template<>
-            struct Test<
-                test::event::EventHostManualTriggerCuda>
-            {
-                //-----------------------------------------------------------------------------
-                //! \return If the event is not waiting within a queue (not enqueued or already handled).
-                ALPAKA_FN_HOST static auto test(
-                    test::event::EventHostManualTriggerCuda const & event)
-                -> bool
-                {
-                    std::lock_guard<std::mutex> lk(event.m_spEventImpl->m_mutex);
-
-                    return event.m_spEventImpl->m_bIsReady;
-                }
-            };
-        }
-    }
-    namespace queue
-    {
-        namespace traits
-        {
-            //#############################################################################
-            template<>
-            struct Enqueue<
-                queue::QueueCudaRtNonBlocking,
-                test::event::EventHostManualTriggerCuda>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto enqueue(
-                    queue::QueueCudaRtNonBlocking & queue,
-                    test::event::EventHostManualTriggerCuda & event)
-                -> void
-                {
-                    ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                    // Copy the shared pointer to ensure that the event implementation is alive as long as it is enqueued.
-                    auto spEventImpl(event.m_spEventImpl);
-
-                    // Setting the event state and enqueuing it has to be atomic.
-                    std::lock_guard<std::mutex> lk(spEventImpl->m_mutex);
-
-                    // The event should not yet be enqueued.
-                    ALPAKA_ASSERT(spEventImpl->m_bIsReady);
-
-                    // Set its state to enqueued.
-                    spEventImpl->m_bIsReady = false;
-
-                    // PGI Profiler`s User Guide:
-                    // The following are known issues related to Events and Metrics:
-                    // * In event or metric profiling, kernel launches are blocking. Thus kernels waiting
-                    //   on host updates may hang. This includes synchronization between the host and
-                    //   the device build upon value-based CUDA queue synchronization APIs such as
-                    //   cuStreamWaitValue32() and cuStreamWriteValue32().
-                    ALPAKA_CUDA_DRV_CHECK(
-                        cuStreamWaitValue32(
-                            static_cast<CUstream>(queue.m_spQueueImpl->m_CudaQueue),
-                            reinterpret_cast<CUdeviceptr>(event.m_spEventImpl->m_devMem),
-                            0x01010101u,
-                            CU_STREAM_WAIT_VALUE_GEQ));
-                }
-            };
-            //#############################################################################
-            template<>
-            struct Enqueue<
-                queue::QueueCudaRtBlocking,
-                test::event::EventHostManualTriggerCuda>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto enqueue(
-                    queue::QueueCudaRtBlocking & queue,
-                    test::event::EventHostManualTriggerCuda & event)
-                -> void
-                {
-                    ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                    // Copy the shared pointer to ensure that the event implementation is alive as long as it is enqueued.
-                    auto spEventImpl(event.m_spEventImpl);
-
-                    // Setting the event state and enqueuing it has to be atomic.
-                    std::lock_guard<std::mutex> lk(spEventImpl->m_mutex);
-
-                    // The event should not yet be enqueued.
-                    ALPAKA_ASSERT(spEventImpl->m_bIsReady);
-
-                    // Set its state to enqueued.
-                    spEventImpl->m_bIsReady = false;
-
-                    // PGI Profiler`s User Guide:
-                    // The following are known issues related to Events and Metrics:
-                    // * In event or metric profiling, kernel launches are blocking. Thus kernels waiting
-                    //   on host updates may hang. This includes synchronization between the host and
-                    //   the device build upon value-based CUDA queue synchronization APIs such as
-                    //   cuStreamWaitValue32() and cuStreamWriteValue32().
-                    ALPAKA_CUDA_DRV_CHECK(
-                        cuStreamWaitValue32(
-                            static_cast<CUstream>(queue.m_spQueueImpl->m_CudaQueue),
-                            reinterpret_cast<CUdeviceptr>(event.m_spEventImpl->m_devMem),
-                            0x01010101u,
-                            CU_STREAM_WAIT_VALUE_GEQ));
-                }
-            };
-        }
-    }
-}
-#endif
-
-
-#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
-
-#include <hip/hip_runtime.h>
-
-#if !BOOST_LANG_HIP
-    #error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
-#endif
-
-#include <alpaka/core/Hip.hpp>
-
-namespace alpaka
-{
-    namespace test
-    {
-        namespace event
-        {
-            namespace hip
-            {
-                namespace detail
-                {
-                    //#############################################################################
-                    class EventHostManualTriggerHipImpl final
-                    {
-                    public:
-                        //-----------------------------------------------------------------------------
-                        ALPAKA_FN_HOST EventHostManualTriggerHipImpl(
-                            dev::DevHipRt const & dev) :
-                                m_dev(dev),
-                                m_mutex(),
-                                m_bIsReady(true)
-                        {
-                            ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                            // Set the current device.
-                            ALPAKA_HIP_RT_CHECK(
-                                hipSetDevice(
-                                    m_dev.m_iDevice));
-                            // Allocate the buffer on this device.
-                            ALPAKA_HIP_RT_CHECK(
-                                hipMalloc(
-                                    &m_devMem,
-                                    static_cast<size_t>(sizeof(int32_t))));
-                            // Initiate the memory set.
-                            ALPAKA_HIP_RT_CHECK(
-                                hipMemset(
-                                    m_devMem,
-                                    static_cast<int>(0u),
-                                    static_cast<size_t>(sizeof(int32_t))));
-                        }
-                        //-----------------------------------------------------------------------------
-                        EventHostManualTriggerHipImpl(EventHostManualTriggerHipImpl const &) = delete;
-                        //-----------------------------------------------------------------------------
-                        EventHostManualTriggerHipImpl(EventHostManualTriggerHipImpl &&) = delete;
-                        //-----------------------------------------------------------------------------
-                        auto operator=(EventHostManualTriggerHipImpl const &) -> EventHostManualTriggerHipImpl & = delete;
-                        //-----------------------------------------------------------------------------
-                        auto operator=(EventHostManualTriggerHipImpl &&) -> EventHostManualTriggerHipImpl & = delete;
-                        //-----------------------------------------------------------------------------
-                        ALPAKA_FN_HOST ~EventHostManualTriggerHipImpl()
-                        {
-                            ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                            ALPAKA_HIP_RT_CHECK(
-                                hipSetDevice(
-                                    m_dev.m_iDevice));
-                            // Free the buffer.
-                            ALPAKA_HIP_RT_CHECK(hipFree(m_devMem));
-                        }
-
-                        //-----------------------------------------------------------------------------
-                        void trigger()
-                        {
-                            std::unique_lock<std::mutex> lock(m_mutex);
-                            m_bIsReady = true;
-
-                            // Set the current device.
-                            ALPAKA_HIP_RT_CHECK(
-                                hipSetDevice(
-                                    m_dev.m_iDevice));
-                            // Initiate the memory set.
-                            ALPAKA_HIP_RT_CHECK(
-                                hipMemset(
-                                    m_devMem,
-                                    static_cast<int>(1u),
-                                    static_cast<size_t>(sizeof(int32_t))));
-                        }
-
-                    public:
-                        dev::DevHipRt const m_dev;     //!< The device this event is bound to.
-
-                        mutable std::mutex m_mutex;     //!< The mutex used to synchronize access to the event.
-                        void * m_devMem;
-
-                        bool m_bIsReady;                //!< If the event is not waiting within a queue (not enqueued or already completed).
-                    };
-                }
-            }
-
-            //#############################################################################
-            class EventHostManualTriggerHip final
-            {
-            public:
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST EventHostManualTriggerHip(
-                    dev::DevHipRt const & dev) :
-                        m_spEventImpl(std::make_shared<hip::detail::EventHostManualTriggerHipImpl>(dev))
-                {
-                    ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-                }
-                //-----------------------------------------------------------------------------
-                EventHostManualTriggerHip(EventHostManualTriggerHip const &) = default;
-                //-----------------------------------------------------------------------------
-                EventHostManualTriggerHip(EventHostManualTriggerHip &&) = default;
-                //-----------------------------------------------------------------------------
-                auto operator=(EventHostManualTriggerHip const &) -> EventHostManualTriggerHip & = default;
-                //-----------------------------------------------------------------------------
-                auto operator=(EventHostManualTriggerHip &&) -> EventHostManualTriggerHip & = default;
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST auto operator==(EventHostManualTriggerHip const & rhs) const
-                -> bool
-                {
-                    return (m_spEventImpl == rhs.m_spEventImpl);
-                }
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST auto operator!=(EventHostManualTriggerHip const & rhs) const
-                -> bool
-                {
-                    return !((*this) == rhs);
-                }
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST ~EventHostManualTriggerHip() = default;
-
-                //-----------------------------------------------------------------------------
-                void trigger()
-                {
-                    m_spEventImpl->trigger();
-                }
-
-            public:
-                std::shared_ptr<hip::detail::EventHostManualTriggerHipImpl> m_spEventImpl;
-            };
-
-            namespace traits
-            {
-                //#############################################################################
-                template<>
-                struct EventHostManualTriggerType<
-                    alpaka::dev::DevHipRt>
-                {
-                    using type = alpaka::test::event::EventHostManualTriggerHip;
-                };
-
-                //#############################################################################
-                //! The HIP event host manual trigger support get trait specialization.
-                template<>
-                struct IsEventHostManualTriggerSupported<
-                    alpaka::dev::DevHipRt>
-                {
-                    //-----------------------------------------------------------------------------
-                    // TODO: there is no CUDA_VERSION in the HIP compiler path.
-                    // TODO: there is a hipDeviceGetAttribute, but there is no pendant for CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS.
-                    ALPAKA_FN_HOST static auto isSupported(
-                        alpaka::dev::DevHipRt const &)
-                    -> bool
-                    {
-                        return false;
-                    }
-                };
-            }
-        }
-    }
-    namespace dev
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CPU device event device get trait specialization.
-            template<>
-            struct GetDev<
-                test::event::EventHostManualTriggerHip>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto getDev(
-                    test::event::EventHostManualTriggerHip const & event)
-                -> dev::DevHipRt
-                {
-                    return event.m_spEventImpl->m_dev;
-                }
-            };
-        }
-    }
-    namespace event
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CPU device event test trait specialization.
-            template<>
-            struct Test<
-                test::event::EventHostManualTriggerHip>
-            {
-                //-----------------------------------------------------------------------------
-                //! \return If the event is not waiting within a queue (not enqueued or already handled).
-                ALPAKA_FN_HOST static auto test(
-                    test::event::EventHostManualTriggerHip const & event)
-                -> bool
-                {
-                    std::lock_guard<std::mutex> lk(event.m_spEventImpl->m_mutex);
-
-                    return event.m_spEventImpl->m_bIsReady;
-                }
-            };
-        }
-    }
-    namespace queue
-    {
-        namespace traits
-        {
-            //#############################################################################
-            template<>
-            struct Enqueue<
-                queue::QueueHipRtNonBlocking,
-                test::event::EventHostManualTriggerHip>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto enqueue(
-                    queue::QueueHipRtNonBlocking & queue,
-                    test::event::EventHostManualTriggerHip & event)
-                -> void
-                {
-                    ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                    // Copy the shared pointer to ensure that the event implementation is alive as long as it is enqueued.
-                    auto spEventImpl(event.m_spEventImpl);
-
-                    // Setting the event state and enqueuing it has to be atomic.
-                    std::lock_guard<std::mutex> lk(spEventImpl->m_mutex);
-
-                    // The event should not yet be enqueued.
-                    ALPAKA_ASSERT(spEventImpl->m_bIsReady);
-
-                    // Set its state to enqueued.
-                    spEventImpl->m_bIsReady = false;
-
-                    // PGI Profiler`s User Guide:
-                    // The following are known issues related to Events and Metrics:
-                    // * In event or metric profiling, kernel launches are blocking. Thus kernels waiting
-                    //   on host updates may hang. This includes synchronization between the host and
-                    //   the device build upon value-based CUDA queue synchronization APIs such as
-                    //   cuStreamWaitValue32() and cuStreamWriteValue32().
-                    int32_t hostMem=0;
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
-                    std::cerr << "[Workaround] polling of device-located value in stream, as hipStreamWaitValue32 is not available.\n";
-#endif
-                    while(hostMem<0x01010101u) {
-                      ALPAKA_HIP_RT_CHECK(hipMemcpyDtoHAsync(&hostMem,
-                                                             reinterpret_cast<hipDeviceptr_t>(event.m_spEventImpl->m_devMem),
-                                                             sizeof(int32_t),
-                                                             queue.m_spQueueImpl->m_HipQueue));
-                      ALPAKA_HIP_RT_CHECK(hipStreamSynchronize(queue.m_spQueueImpl->m_HipQueue));
-                    }
-                }
-            };
-            //#############################################################################
-            template<>
-            struct Enqueue<
-                queue::QueueHipRtBlocking,
-                test::event::EventHostManualTriggerHip>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto enqueue(
-                    queue::QueueHipRtBlocking & queue,
-                    test::event::EventHostManualTriggerHip & event)
-                -> void
-                {
-                    ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                    // Copy the shared pointer to ensure that the event implementation is alive as long as it is enqueued.
-                    auto spEventImpl(event.m_spEventImpl);
-
-                    // Setting the event state and enqueuing it has to be atomic.
-                    std::lock_guard<std::mutex> lk(spEventImpl->m_mutex);
-
-                    // The event should not yet be enqueued.
-                    ALPAKA_ASSERT(spEventImpl->m_bIsReady);
-
-                    // Set its state to enqueued.
-                    spEventImpl->m_bIsReady = false;
-
-                    // PGI Profiler`s User Guide:
-                    // The following are known issues related to Events and Metrics:
-                    // * In event or metric profiling, kernel launches are blocking. Thus kernels waiting
-                    //   on host updates may hang. This includes synchronization between the host and
-                    //   the device build upon value-based HIP queue synchronization APIs such as
-                    //   cuStreamWaitValue32() and cuStreamWriteValue32().
-#if BOOST_COMP_NVCC
-                    ALPAKA_HIP_RT_CHECK(hipCUResultTohipError(
-                        cuStreamWaitValue32(
-                            static_cast<CUstream>(queue.m_spQueueImpl->m_HipQueue),
-                            reinterpret_cast<CUdeviceptr>(event.m_spEventImpl->m_devMem),
-                            0x01010101u,
-                            CU_STREAM_WAIT_VALUE_GEQ)));
-#else
-                    // workaround for missing cuStreamWaitValue32 in HIP(HCC)
-                    std::uint32_t hmem = 0;
-                    do {
-                        std::this_thread::sleep_for(std::chrono::milliseconds(10u));
-                        ALPAKA_HIP_RT_CHECK(hipMemcpy(&hmem, event.m_spEventImpl->m_devMem, sizeof(std::uint32_t), hipMemcpyDefault));
-                    } while(hmem < 0x01010101u);
-
-#endif
-                }
-            };
-        }
-    }
-}
-#endif
diff --git a/thirdParty/alpaka/test/common/include/alpaka/test/idx/TestIdxs.hpp b/thirdParty/alpaka/test/common/include/alpaka/test/idx/TestIdxs.hpp
deleted file mode 100644
index 0d860ad2b4..0000000000
--- a/thirdParty/alpaka/test/common/include/alpaka/test/idx/TestIdxs.hpp
+++ /dev/null
@@ -1,48 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz, Erik Zenker, Matthias Werner
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#include <cstdint>
-#include <tuple>
-
-namespace alpaka
-{
-    //-----------------------------------------------------------------------------
-    //! The test specifics.
-    namespace test
-    {
-        //-----------------------------------------------------------------------------
-        //! The test accelerator specifics.
-        namespace idx
-        {
-            //#############################################################################
-            //! A std::tuple holding idx types.
-            using TestIdxs =
-                std::tuple<
-                    // size_t is most probably identical to either std::uint64_t or std::uint32_t.
-                    // This would lead to duplicate tests (especially test names) which is not allowed.
-                    //std::size_t,
-#if !defined(ALPAKA_CI)
-                    std::int64_t,
-#endif
-                    std::uint64_t,
-                    std::int32_t,
-#if !defined(ALPAKA_CI)
-                    std::uint32_t,
-                    std::int16_t,
-#endif
-                    std::uint16_t/*,
-                    // When Idx is a 8 bit integer, extents within the tests would be extremely limited
-                    // (especially when Dim is 4). Therefore, we do not test it.
-                    std::int8_t,
-                    std::uint8_t*/>;
-        }
-    }
-}
diff --git a/thirdParty/alpaka/test/common/include/alpaka/test/mem/view/Iterator.hpp b/thirdParty/alpaka/test/common/include/alpaka/test/mem/view/Iterator.hpp
deleted file mode 100644
index 8ffb49fefa..0000000000
--- a/thirdParty/alpaka/test/common/include/alpaka/test/mem/view/Iterator.hpp
+++ /dev/null
@@ -1,239 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz, Erik Zenker
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#include <alpaka/alpaka.hpp>
-
-namespace alpaka
-{
-    //-----------------------------------------------------------------------------
-    //! The test specifics.
-    namespace test
-    {
-        //-----------------------------------------------------------------------------
-        //! The test mem specifics.
-        namespace mem
-        {
-            //-----------------------------------------------------------------------------
-            //!
-            namespace view
-            {
-                //-----------------------------------------------------------------------------
-                //!
-                namespace traits
-                {
-                    //#############################################################################
-                    // \tparam T Type to conditionally make const.
-                    // \tparam TSource Type to mimic the constness of.
-                    template<
-                        typename T,
-                        typename TSource>
-                    using MimicConst = typename std::conditional<
-                        std::is_const<TSource>::value,
-                        typename std::add_const<T>::type,
-                        typename std::remove_const<T>::type>;
-
-#if BOOST_COMP_GNUC
-    #pragma GCC diagnostic push
-    #pragma GCC diagnostic ignored "-Wcast-align" // "cast from 'Byte*' to 'Elem*' increases required alignment of target type"
-#endif
-                    //#############################################################################
-                    template<
-                        typename TView,
-                        typename TSfinae = void>
-                    class IteratorView
-                    {
-                        using TViewDecayed = typename std::decay<TView>::type;
-                        using Dim = alpaka::dim::Dim<TViewDecayed>;
-                        using Idx = alpaka::idx::Idx<TViewDecayed>;
-                        using Elem = typename MimicConst<alpaka::elem::Elem<TViewDecayed>, TView>::type;
-
-                    public:
-                        //-----------------------------------------------------------------------------
-                        ALPAKA_FN_HOST IteratorView(
-                            TView & view,
-                            Idx const idx) :
-                                m_nativePtr(alpaka::mem::view::getPtrNative(view)),
-                                m_currentIdx(idx),
-                                m_extents(alpaka::extent::getExtentVec(view)),
-                                m_pitchBytes(alpaka::mem::view::getPitchBytesVec(view))
-                        {}
-
-                        //-----------------------------------------------------------------------------
-                        ALPAKA_FN_HOST IteratorView(
-                            TView & view) :
-                                IteratorView(view, 0)
-                        {}
-
-                        //-----------------------------------------------------------------------------
-                        ALPAKA_FN_HOST_ACC auto operator++()
-                        -> IteratorView&
-                        {
-                            ++m_currentIdx;
-                            return *this;
-                        }
-
-                        //-----------------------------------------------------------------------------
-                        ALPAKA_FN_HOST_ACC auto operator--()
-                        -> IteratorView&
-                        {
-                            --m_currentIdx;
-                            return *this;
-                        }
-
-                        //-----------------------------------------------------------------------------
-                        ALPAKA_FN_HOST_ACC auto operator++(
-                            int)
-                        -> IteratorView
-                        {
-                            IteratorView iterCopy = *this;
-                            m_currentIdx++;
-                            return iterCopy;
-                        }
-
-                        //-----------------------------------------------------------------------------
-                        ALPAKA_FN_HOST_ACC auto operator--(
-                            int)
-                        -> IteratorView
-                        {
-                            IteratorView iterCopy = *this;
-                            m_currentIdx--;
-                            return iterCopy;
-                        }
-
-                        //-----------------------------------------------------------------------------
-                        template<typename TIter>
-                        ALPAKA_FN_HOST_ACC auto operator==(
-                            TIter &other) const
-                        -> bool
-                        {
-                            return m_currentIdx == other.m_currentIdx;
-                        }
-
-                        //-----------------------------------------------------------------------------
-                        template<typename TIter>
-                        ALPAKA_FN_HOST_ACC auto operator!=(
-                            TIter &other) const
-                        -> bool
-                        {
-                            return m_currentIdx != other.m_currentIdx;
-                        }
-
-                        //-----------------------------------------------------------------------------
-                        ALPAKA_FN_HOST_ACC auto operator*() const
-                        -> Elem &
-                        {
-                            using Dim1 = alpaka::dim::DimInt<1>;
-                            using DimMin1 = alpaka::dim::DimInt<Dim::value - 1u>;
-
-                            vec::Vec<Dim1, Idx> const currentIdxDim1{m_currentIdx};
-                            vec::Vec<Dim, Idx> const currentIdxDimx(alpaka::idx::mapIdx<Dim::value>(currentIdxDim1, m_extents));
-
-                            // [pz, py, px] -> [py, px]
-                            auto const pitchWithoutOutermost(vec::subVecEnd<DimMin1>(m_pitchBytes));
-                            // [ElemSize]
-                            vec::Vec<Dim1, Idx> const elementSizeVec(static_cast<Idx>(sizeof(Elem)));
-                            // [py, px] ++ [ElemSize] -> [py, px, ElemSize]
-                            vec::Vec<Dim, Idx> const dstPitchBytes(vec::concat(pitchWithoutOutermost, elementSizeVec));
-                            // [py, px, ElemSize] [z, y, x] -> [py*z, px*y, ElemSize*x]
-                            auto const dimensionalOffsetsInByte(currentIdxDimx * dstPitchBytes);
-                            // sum{[py*z, px*y, ElemSize*x]} -> offset in byte
-                            auto const offsetInByte(dimensionalOffsetsInByte.foldrAll(
-                                [](Idx a, Idx b)
-                                {
-                                    return static_cast<Idx>(a + b);
-                                }));
-
-                            using Byte = typename MimicConst<std::uint8_t, Elem>::type;
-                            Byte* ptr(reinterpret_cast<Byte*>(m_nativePtr) + offsetInByte);
-
-#if 0
-                            std::cout
-                                << " i1: " << currentIdxDim1
-                                << " in: " << currentIdxDimx
-                                << " dpb: " << dstPitchBytes
-                                << " offb: " << offsetInByte
-                                << " ptr: " << reinterpret_cast<void const *>(ptr)
-                                << " v: " << *reinterpret_cast<Elem *>(ptr)
-                                << std::endl;
-#endif
-                            return *reinterpret_cast<Elem *>(ptr);
-                        }
-
-                    private:
-                        Elem * const m_nativePtr;
-                        Idx m_currentIdx;
-                        vec::Vec<Dim, Idx> const m_extents;
-                        vec::Vec<Dim, Idx> const m_pitchBytes;
-                    };
-#if BOOST_COMP_GNUC
-    #pragma GCC diagnostic pop
-#endif
-
-                    //#############################################################################
-                    template<
-                        typename TView,
-                        typename TSfinae = void>
-                    struct Begin
-                    {
-                        //-----------------------------------------------------------------------------
-                        ALPAKA_FN_HOST static auto begin(
-                            TView & view)
-                        -> IteratorView<TView>
-                        {
-                            return IteratorView<TView>(view);
-                        }
-                    };
-
-                    //#############################################################################
-                    template<
-                        typename TView,
-                        typename TSfinae = void>
-                    struct End
-                    {
-                        //-----------------------------------------------------------------------------
-                        ALPAKA_FN_HOST static auto end(
-                            TView & view)
-                        -> IteratorView<TView>
-                        {
-                            auto extents = alpaka::extent::getExtentVec(view);
-                            return IteratorView<TView>(view, extents.prod());
-                        }
-                    };
-                }
-
-                //#############################################################################
-                template<
-                    typename TView>
-                using Iterator = traits::IteratorView<TView>;
-
-                //-----------------------------------------------------------------------------
-                template<
-                    typename TView>
-                ALPAKA_FN_HOST auto begin(
-                    TView & view)
-                -> Iterator<TView>
-                {
-                    return traits::Begin<TView>::begin(view);
-                }
-
-                //-----------------------------------------------------------------------------
-                template<
-                    typename TView>
-                ALPAKA_FN_HOST auto end(
-                    TView & view)
-                -> Iterator<TView>
-                {
-                    return traits::End<TView>::end(view);
-                }
-            }
-        }
-    }
-}
diff --git a/thirdParty/alpaka/test/common/include/alpaka/test/mem/view/ViewTest.hpp b/thirdParty/alpaka/test/common/include/alpaka/test/mem/view/ViewTest.hpp
deleted file mode 100644
index 7844aac565..0000000000
--- a/thirdParty/alpaka/test/common/include/alpaka/test/mem/view/ViewTest.hpp
+++ /dev/null
@@ -1,378 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Matthias Werner
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#include <alpaka/alpaka.hpp>
-
-#include <alpaka/test/KernelExecutionFixture.hpp>
-#include <alpaka/test/mem/view/Iterator.hpp>
-
-#include <catch2/catch.hpp>
-
-#include <numeric>
-
-
-namespace alpaka
-{
-    //-----------------------------------------------------------------------------
-    //! The test specifics.
-    namespace test
-    {
-        //-----------------------------------------------------------------------------
-        //! The test mem specifics.
-        namespace mem
-        {
-            //-----------------------------------------------------------------------------
-            namespace view
-            {
-                //-----------------------------------------------------------------------------
-                template<
-                    typename TElem,
-                    typename TDim,
-                    typename TIdx,
-                    typename TDev,
-                    typename TView>
-                ALPAKA_FN_HOST auto testViewImmutable(
-                    TView const & view,
-                    TDev const & dev,
-                    alpaka::vec::Vec<TDim, TIdx> const & extent,
-                    alpaka::vec::Vec<TDim, TIdx> const & offset)
-                -> void
-                {
-                    //-----------------------------------------------------------------------------
-                    // alpaka::dev::traits::DevType
-                    {
-                        static_assert(
-                            std::is_same<alpaka::dev::Dev<TView>, TDev>::value,
-                            "The device type of the view has to be equal to the specified one.");
-                    }
-
-                    //-----------------------------------------------------------------------------
-                    // alpaka::dev::traits::GetDev
-                    {
-                        REQUIRE(
-                            dev == alpaka::dev::getDev(view));
-                    }
-
-                    //-----------------------------------------------------------------------------
-                    // alpaka::dim::traits::DimType
-                    {
-                        static_assert(
-                            alpaka::dim::Dim<TView>::value == TDim::value,
-                            "The dimensionality of the view has to be equal to the specified one.");
-                    }
-
-                    //-----------------------------------------------------------------------------
-                    // alpaka::elem::traits::ElemType
-                    {
-                        static_assert(
-                            std::is_same<alpaka::elem::Elem<TView>, TElem>::value,
-                            "The element type of the view has to be equal to the specified one.");
-                    }
-
-                    //-----------------------------------------------------------------------------
-                    // alpaka::extent::traits::GetExtent
-                    {
-                        REQUIRE(
-                            extent ==
-                            alpaka::extent::getExtentVec(view));
-                    }
-
-                    //-----------------------------------------------------------------------------
-                    // alpaka::mem::view::traits::GetPitchBytes
-                    {
-                        // The pitches have to be at least as large as the values we calculate here.
-                        auto pitchMinimum(alpaka::vec::Vec<alpaka::dim::DimInt<TDim::value + 1u>, TIdx>::ones());
-                        // Initialize the pitch between two elements of the X dimension ...
-                        pitchMinimum[TDim::value] = sizeof(TElem);
-                        // ... and fill all the other dimensions.
-                        for(TIdx i = TDim::value; i > static_cast<TIdx>(0u); --i)
-                        {
-                            pitchMinimum[i-1] = extent[i-1] * pitchMinimum[i];
-                        }
-
-                        auto const pitchView(alpaka::mem::view::getPitchBytesVec(view));
-
-                        for(TIdx i = TDim::value; i > static_cast<TIdx>(0u); --i)
-                        {
-                            REQUIRE(
-                                pitchView[i-1] >=
-                                pitchMinimum[i-1]);
-                        }
-                    }
-
-                    //-----------------------------------------------------------------------------
-                    // alpaka::mem::view::traits::GetPtrNative
-                    {
-                        // The view is a const& so the pointer has to point to a const value.
-                        using NativePtr = decltype(alpaka::mem::view::getPtrNative(view));
-                        static_assert(
-                            std::is_pointer<NativePtr>::value,
-                            "The value returned by getPtrNative has to be a pointer.");
-                        static_assert(
-                            std::is_const<typename std::remove_pointer<NativePtr>::type>::value,
-                            "The value returned by getPtrNative has to be const when the view is const.");
-
-                        if(alpaka::extent::getExtentProduct(view) != static_cast<TIdx>(0u))
-                        {
-                            // The pointer is only required to be non-null when the extent is > 0.
-                            TElem const * const invalidPtr(nullptr);
-                            REQUIRE(
-                                invalidPtr !=
-                                alpaka::mem::view::getPtrNative(view));
-                        }
-                        else
-                        {
-                            // When the extent is 0, the pointer is undefined but it should still be possible get it.
-                            alpaka::mem::view::getPtrNative(view);
-                        }
-                    }
-
-                    //-----------------------------------------------------------------------------
-                    // alpaka::offset::traits::GetOffset
-                    {
-                        REQUIRE(
-                            offset ==
-                            alpaka::offset::getOffsetVec(view));
-                    }
-
-                    //-----------------------------------------------------------------------------
-                    // alpaka::idx::traits::IdxType
-                    {
-                        static_assert(
-                            std::is_same<alpaka::idx::Idx<TView>, TIdx>::value,
-                            "The idx type of the view has to be equal to the specified one.");
-                    }
-                }
-
-                //#############################################################################
-                //! Compares element-wise that all bytes are set to the same value.
-                struct VerifyBytesSetKernel
-                {
-                    ALPAKA_NO_HOST_ACC_WARNING
-                    template<
-                        typename TAcc,
-                        typename TIter>
-                    ALPAKA_FN_ACC void operator()(
-                        TAcc const & acc,
-                        bool * success,
-                        TIter const & begin,
-                        TIter const & end,
-                        std::uint8_t const & byte) const
-                    {
-                        alpaka::ignore_unused(acc);
-
-                        constexpr auto elemSizeInByte = sizeof(decltype(*begin));
-                        for(auto it = begin; it != end; ++it)
-                        {
-                            auto const& elem = *it;
-                            auto const pBytes = reinterpret_cast<std::uint8_t const *>(&elem);
-                            for(std::size_t i = 0u; i < elemSizeInByte; ++i)
-                            {
-                                ALPAKA_CHECK(*success, pBytes[i] == byte);
-                            }
-                        }
-                    }
-                };
-                //-----------------------------------------------------------------------------
-                template<
-                    typename TAcc,
-                    typename TView>
-                ALPAKA_FN_HOST auto verifyBytesSet(
-                    TView const & view,
-                    std::uint8_t const & byte)
-                -> void
-                {
-                    using Dim = alpaka::dim::Dim<TView>;
-                    using Idx = alpaka::idx::Idx<TView>;
-
-                    alpaka::test::KernelExecutionFixture<TAcc> fixture(
-                        alpaka::vec::Vec<Dim, Idx>::ones());
-
-                    VerifyBytesSetKernel verifyBytesSet;
-
-                    REQUIRE(
-                        fixture(
-                            verifyBytesSet,
-                            alpaka::test::mem::view::begin(view),
-                            alpaka::test::mem::view::end(view),
-                            byte));
-                }
-
-                //#############################################################################
-                //! Compares iterators element-wise
-#if BOOST_COMP_GNUC
-    #pragma GCC diagnostic push
-    #pragma GCC diagnostic ignored "-Wfloat-equal"  // "comparing floating point with == or != is unsafe"
-#endif
-                struct VerifyViewsEqualKernel
-                {
-                    ALPAKA_NO_HOST_ACC_WARNING
-                    template<
-                        typename TAcc,
-                        typename TIterA,
-                        typename TIterB>
-                    ALPAKA_FN_ACC void operator()(
-                        TAcc const & acc,
-                        bool * success,
-                        TIterA beginA,
-                        TIterA const & endA,
-                        TIterB beginB) const
-                    {
-                        alpaka::ignore_unused(acc);
-
-                        for(; beginA != endA; ++beginA, ++beginB)
-                        {
-#if BOOST_COMP_CLANG
-    #pragma clang diagnostic push
-    #pragma clang diagnostic ignored "-Wfloat-equal" // "comparing floating point with == or != is unsafe"
-#endif
-                            ALPAKA_CHECK(*success, *beginA == *beginB);
-#if BOOST_COMP_CLANG
-    #pragma clang diagnostic pop
-#endif
-                        }
-                    }
-                };
-#if BOOST_COMP_GNUC
-    #pragma GCC diagnostic pop
-#endif
-
-                //-----------------------------------------------------------------------------
-                template<
-                    typename TAcc,
-                    typename TViewB,
-                    typename TViewA>
-                ALPAKA_FN_HOST auto verifyViewsEqual(
-                    TViewA const & viewA,
-                    TViewB const & viewB)
-                -> void
-                {
-                    using DimA = alpaka::dim::Dim<TViewA>;
-                    using DimB = alpaka::dim::Dim<TViewB>;
-                    static_assert(DimA::value == DimB::value, "viewA and viewB are required to have identical Dim");
-                    using IdxA = alpaka::idx::Idx<TViewA>;
-                    using IdxB = alpaka::idx::Idx<TViewB>;
-                    static_assert(std::is_same<IdxA, IdxB>::value, "viewA and viewB are required to have identical Idx");
-
-                    alpaka::test::KernelExecutionFixture<TAcc> fixture(
-                        alpaka::vec::Vec<DimA, IdxA>::ones());
-
-                    VerifyViewsEqualKernel verifyViewsEqualKernel;
-
-                    REQUIRE(
-                        fixture(
-                            verifyViewsEqualKernel,
-                            alpaka::test::mem::view::begin(viewA),
-                            alpaka::test::mem::view::end(viewA),
-                            alpaka::test::mem::view::begin(viewB)));
-                }
-
-                //-----------------------------------------------------------------------------
-                //! Fills the given view with increasing values starting at 0.
-                template<
-                    typename TView,
-                    typename TQueue>
-                ALPAKA_FN_HOST auto iotaFillView(
-                    TQueue & queue,
-                    TView & view)
-                -> void
-                {
-                    using Dim = alpaka::dim::Dim<TView>;
-                    using Idx = alpaka::idx::Idx<TView>;
-
-                    using DevHost = alpaka::dev::DevCpu;
-                    using PltfHost = alpaka::pltf::Pltf<DevHost>;
-
-                    using Elem = alpaka::elem::Elem<TView>;
-
-                    using ViewPlainPtr = alpaka::mem::view::ViewPlainPtr<DevHost, Elem, Dim, Idx>;
-
-                    DevHost const devHost(alpaka::pltf::getDevByIdx<PltfHost>(0));
-
-                    auto const extent(alpaka::extent::getExtentVec(view));
-
-                    // Init buf with increasing values
-                    std::vector<Elem> v(static_cast<std::size_t>(extent.prod()), static_cast<Elem>(0));
-                    std::iota(v.begin(), v.end(), static_cast<Elem>(0));
-                    ViewPlainPtr plainBuf(v.data(), devHost, extent);
-
-                    // Copy the generated content into the given view.
-                    alpaka::mem::view::copy(queue, view, plainBuf, extent);
-
-                    alpaka::wait::wait(queue);
-                }
-
-                //-----------------------------------------------------------------------------
-                template<
-                    typename TAcc,
-                    typename TView,
-                    typename TQueue>
-                ALPAKA_FN_HOST auto testViewMutable(
-                    TQueue & queue,
-                    TView & view)
-                -> void
-                {
-                    //-----------------------------------------------------------------------------
-                    // alpaka::mem::view::traits::GetPtrNative
-                    {
-                        // The view is a non-const so the pointer has to point to a non-const value.
-                        using NativePtr = decltype(alpaka::mem::view::getPtrNative(view));
-                        static_assert(
-                            std::is_pointer<NativePtr>::value,
-                            "The value returned by getPtrNative has to be a pointer.");
-                        static_assert(
-                            !std::is_const<typename std::remove_pointer<NativePtr>::type>::value,
-                            "The value returned by getPtrNative has to be non-const when the view is non-const.");
-                    }
-
-                    auto const extent(alpaka::extent::getExtentVec(view));
-
-                    //-----------------------------------------------------------------------------
-                    // alpaka::mem::view::set
-                    {
-                        std::uint8_t const byte(static_cast<uint8_t>(42u));
-                        alpaka::mem::view::set(queue, view, byte, extent);
-                        alpaka::wait::wait(queue);
-                        verifyBytesSet<TAcc>(view, byte);
-                    }
-
-                    //-----------------------------------------------------------------------------
-                    // alpaka::mem::view::copy
-                    {
-                        using Elem = alpaka::elem::Elem<TView>;
-                        using Idx = alpaka::idx::Idx<TView>;
-
-                        auto const devAcc = alpaka::dev::getDev(view);
-
-                        //-----------------------------------------------------------------------------
-                        // alpaka::mem::view::copy into given view
-                        {
-                            auto srcBufAcc(alpaka::mem::buf::alloc<Elem, Idx>(devAcc, extent));
-                            iotaFillView(queue, srcBufAcc);
-                            alpaka::mem::view::copy(queue, view, srcBufAcc, extent);
-                            alpaka::wait::wait(queue);
-                            verifyViewsEqual<TAcc>(view, srcBufAcc);
-                        }
-
-                        //-----------------------------------------------------------------------------
-                        // alpaka::mem::view::copy from given view
-                        {
-                            auto dstBufAcc(alpaka::mem::buf::alloc<Elem, Idx>(devAcc, extent));
-                            alpaka::mem::view::copy(queue, dstBufAcc, view, extent);
-                            alpaka::wait::wait(queue);
-                            verifyViewsEqual<TAcc>(dstBufAcc, view);
-                        }
-                    }
-                }
-            }
-        }
-    }
-}
diff --git a/thirdParty/alpaka/test/common/include/alpaka/test/queue/Queue.hpp b/thirdParty/alpaka/test/common/include/alpaka/test/queue/Queue.hpp
deleted file mode 100644
index 95a950a95f..0000000000
--- a/thirdParty/alpaka/test/common/include/alpaka/test/queue/Queue.hpp
+++ /dev/null
@@ -1,191 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz, Matthias Werner
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#include <alpaka/alpaka.hpp>
-
-namespace alpaka
-{
-    //-----------------------------------------------------------------------------
-    //! The test specifics.
-    namespace test
-    {
-        //-----------------------------------------------------------------------------
-        //! The test queue specifics.
-        namespace queue
-        {
-            namespace traits
-            {
-                //#############################################################################
-                //! The default queue type trait for devices.
-                template<
-                    typename TDev,
-                    typename TSfinae = void>
-                struct DefaultQueueType;
-
-                //#############################################################################
-                //! The default queue type trait specialization for the CPU device.
-                template<>
-                struct DefaultQueueType<
-                    alpaka::dev::DevCpu>
-                {
-#if (ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL)
-                    using type = alpaka::queue::QueueCpuBlocking;
-#else
-                    using type = alpaka::queue::QueueCpuNonBlocking;
-#endif
-                };
-
-#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
-
-#if !BOOST_LANG_CUDA
-    #error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
-#endif
-                //#############################################################################
-                //! The default queue type trait specialization for the CUDA device.
-                template<>
-                struct DefaultQueueType<
-                    alpaka::dev::DevCudaRt>
-                {
-#if (ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL)
-                    using type = alpaka::queue::QueueCudaRtBlocking;
-#else
-                    using type = alpaka::queue::QueueCudaRtNonBlocking;
-#endif
-                };
-#endif
-
-#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
-
-#if !BOOST_LANG_HIP
-    #error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
-#endif
-                //#############################################################################
-                //! The default queue type trait specialization for the HIP device.
-                template<>
-                struct DefaultQueueType<
-                    alpaka::dev::DevHipRt>
-                {
-#if (ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL)
-                    using type = alpaka::queue::QueueHipRtBlocking;
-#else
-                    using type = alpaka::queue::QueueHipRtNonBlocking;
-#endif
-                };
-#endif
-
-            }
-            //#############################################################################
-            //! The queue type that should be used for the given accelerator.
-            template<
-                typename TAcc>
-            using DefaultQueue = typename traits::DefaultQueueType<TAcc>::type;
-
-            namespace traits
-            {
-                //#############################################################################
-                //! The blocking queue trait.
-                template<
-                    typename TQueue,
-                    typename TSfinae = void>
-                struct IsBlockingQueue;
-
-                //#############################################################################
-                //! The blocking queue trait specialization for a blocking CPU queue.
-                template<>
-                struct IsBlockingQueue<
-                    alpaka::queue::QueueCpuBlocking>
-                {
-                    static constexpr bool value = true;
-                };
-
-                //#############################################################################
-                //! The blocking queue trait specialization for a non-blocking CPU queue.
-                template<>
-                struct IsBlockingQueue<
-                    alpaka::queue::QueueCpuNonBlocking>
-                {
-                    static constexpr bool value = false;
-                };
-
-#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
-
-#if !BOOST_LANG_CUDA
-    #error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
-#endif
-                //#############################################################################
-                //! The blocking queue trait specialization for a blocking CUDA RT queue.
-                template<>
-                struct IsBlockingQueue<
-                    alpaka::queue::QueueCudaRtBlocking>
-                {
-                    static constexpr bool value = true;
-                };
-
-                //#############################################################################
-                //! The blocking queue trait specialization for a non-blocking CUDA RT queue.
-                template<>
-                struct IsBlockingQueue<
-                    alpaka::queue::QueueCudaRtNonBlocking>
-                {
-                    static constexpr bool value = false;
-                };
-#endif
-
-#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
-
-#if !BOOST_LANG_HIP
-    #error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
-#endif
-                //#############################################################################
-                //! The blocking queue trait specialization for a blocking HIP RT queue.
-                template<>
-                struct IsBlockingQueue<
-                    alpaka::queue::QueueHipRtBlocking>
-                {
-                    static constexpr bool value = true;
-                };
-
-                //#############################################################################
-                //! The blocking queue trait specialization for a non-blocking HIP RT queue.
-                template<>
-                struct IsBlockingQueue<
-                    alpaka::queue::QueueHipRtNonBlocking>
-                {
-                    static constexpr bool value = false;
-                };
-#endif
-            }
-            //#############################################################################
-            //! The queue type that should be used for the given accelerator.
-            template<
-                typename TQueue>
-            using IsBlockingQueue = traits::IsBlockingQueue<TQueue>;
-
-            //#############################################################################
-            //! A std::tuple holding tuples of devices and corresponding queue types.
-            using TestQueues =
-                std::tuple<
-                    std::tuple<alpaka::dev::DevCpu, alpaka::queue::QueueCpuBlocking>,
-                    std::tuple<alpaka::dev::DevCpu, alpaka::queue::QueueCpuNonBlocking>
-#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
-                    ,
-                    std::tuple<alpaka::dev::DevCudaRt, alpaka::queue::QueueCudaRtBlocking>,
-                    std::tuple<alpaka::dev::DevCudaRt, alpaka::queue::QueueCudaRtNonBlocking>
-#endif
-#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
-                    ,
-                    std::tuple<alpaka::dev::DevHipRt, alpaka::queue::QueueHipRtBlocking>,
-                    std::tuple<alpaka::dev::DevHipRt, alpaka::queue::QueueHipRtNonBlocking>
-#endif
-                >;
-        }
-    }
-}
diff --git a/thirdParty/alpaka/test/common/include/alpaka/test/queue/QueueCpuOmp2Collective.hpp b/thirdParty/alpaka/test/common/include/alpaka/test/queue/QueueCpuOmp2Collective.hpp
deleted file mode 100644
index 095984054e..0000000000
--- a/thirdParty/alpaka/test/common/include/alpaka/test/queue/QueueCpuOmp2Collective.hpp
+++ /dev/null
@@ -1,462 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Matthias Werner
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLED
-
-#if _OPENMP < 200203
-    #error If ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLED is set, the compiler has to support OpenMP 2.0 or higher!
-#endif
-
-#include <alpaka/test/queue/Queue.hpp>
-
-#include <alpaka/core/Unused.hpp>
-#include <alpaka/dev/DevCpu.hpp>
-#include <alpaka/queue/cpu/ICpuQueue.hpp>
-#include <alpaka/queue/QueueCpuBlocking.hpp>
-#include <alpaka/kernel/TaskKernelCpuOmp2Blocks.hpp>
-#include <alpaka/test/event/EventHostManualTrigger.hpp>
-
-#include <alpaka/dev/Traits.hpp>
-#include <alpaka/event/Traits.hpp>
-#include <alpaka/queue/Traits.hpp>
-#include <alpaka/wait/Traits.hpp>
-
-#include <atomic>
-#include <mutex>
-#include <omp.h>
-
-namespace alpaka
-{
-    namespace event
-    {
-        class EventCpu;
-    }
-}
-
-namespace alpaka
-{
-    namespace queue
-    {
-        namespace cpu
-        {
-            namespace detail
-            {
-#if BOOST_COMP_CLANG
-    // avoid diagnostic warning: "has no out-of-line virtual method definitions; its vtable will be emitted in every translation unit [-Werror,-Wweak-vtables]"
-    // https://stackoverflow.com/a/29288300
-    #pragma clang diagnostic push
-    #pragma clang diagnostic ignored "-Wweak-vtables"
-#endif
-                //#############################################################################
-                //! The CPU collective device queue implementation.
-                class QueueCpuOmp2CollectiveImpl final : public cpu::ICpuQueue
-#if BOOST_COMP_CLANG
-    #pragma clang diagnostic pop
-#endif
-                {
-                public:
-                    //-----------------------------------------------------------------------------
-                    QueueCpuOmp2CollectiveImpl(
-                        dev::DevCpu const & dev) noexcept :
-                            m_dev(dev),
-                            m_uCurrentlyExecutingTask(0u)
-                    {}
-                    //-----------------------------------------------------------------------------
-                    QueueCpuOmp2CollectiveImpl(QueueCpuOmp2CollectiveImpl const &) = delete;
-                    //-----------------------------------------------------------------------------
-                    QueueCpuOmp2CollectiveImpl(QueueCpuOmp2CollectiveImpl &&) = delete;
-                    //-----------------------------------------------------------------------------
-                    auto operator=(QueueCpuOmp2CollectiveImpl const &) -> QueueCpuOmp2CollectiveImpl & = delete;
-                    //-----------------------------------------------------------------------------
-                    auto operator=(QueueCpuOmp2CollectiveImpl &&) -> QueueCpuOmp2CollectiveImpl & = delete;
-                    //-----------------------------------------------------------------------------
-                    void enqueue(event::EventCpu & ev) final
-                    {
-                        queue::enqueue(*this, ev);
-                    }
-                    //-----------------------------------------------------------------------------
-                    void wait(event::EventCpu const & ev) final
-                    {
-                        wait::wait(*this, ev);
-                    }
-
-                public:
-                    dev::DevCpu const m_dev;            //!< The device this queue is bound to.
-                    std::mutex mutable m_mutex;
-                    std::atomic<uint32_t> m_uCurrentlyExecutingTask;
-                };
-            }
-        }
-
-        //#############################################################################
-        //! The CPU collective device queue.
-        //
-        // @attention Queue can only be used together with the accelerator AccCpuOmp2Blocks.
-        //
-        // This queue is an example for a user provided queue and the behavior is strongly coupled
-        // to the user workflows.
-        //
-        // Within a OpenMP parallel region kernel will be performed collectively.
-        // All other operations will be performed from one thread (it is not defined which thread).
-        //
-        // Outside of a OpenMP parallel region the queue behaves like QueueCpuBlocking.
-        class QueueCpuOmp2Collective final : public concepts::Implements<wait::ConceptCurrentThreadWaitFor, QueueCpuOmp2Collective>
-        {
-        public:
-            //-----------------------------------------------------------------------------
-            QueueCpuOmp2Collective(
-                dev::DevCpu const & dev) :
-                    m_spQueueImpl(std::make_shared<cpu::detail::QueueCpuOmp2CollectiveImpl>(dev)),
-                    m_spBlockingQueue(std::make_shared<QueueCpuBlocking>(dev))
-            {
-                dev.m_spDevCpuImpl->RegisterQueue(m_spQueueImpl);
-            }
-            //-----------------------------------------------------------------------------
-            QueueCpuOmp2Collective(QueueCpuOmp2Collective const &) = default;
-            //-----------------------------------------------------------------------------
-            QueueCpuOmp2Collective(QueueCpuOmp2Collective &&) = default;
-            //-----------------------------------------------------------------------------
-            auto operator=(QueueCpuOmp2Collective const &) -> QueueCpuOmp2Collective & = default;
-            //-----------------------------------------------------------------------------
-            auto operator=(QueueCpuOmp2Collective &&) -> QueueCpuOmp2Collective & = default;
-            //-----------------------------------------------------------------------------
-            auto operator==(QueueCpuOmp2Collective const & rhs) const
-            -> bool
-            {
-                return m_spQueueImpl == rhs.m_spQueueImpl && m_spBlockingQueue == rhs.m_spBlockingQueue;
-            }
-            //-----------------------------------------------------------------------------
-            auto operator!=(QueueCpuOmp2Collective const & rhs) const
-            -> bool
-            {
-                return !((*this) == rhs);
-            }
-            //-----------------------------------------------------------------------------
-            ~QueueCpuOmp2Collective() = default;
-
-        public:
-            std::shared_ptr<cpu::detail::QueueCpuOmp2CollectiveImpl> m_spQueueImpl;
-            std::shared_ptr<QueueCpuBlocking> m_spBlockingQueue;
-        };
-    }
-
-    namespace dev
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CPU blocking device queue device type trait specialization.
-            template<>
-            struct DevType<
-                queue::QueueCpuOmp2Collective>
-            {
-                using type = dev::DevCpu;
-            };
-            //#############################################################################
-            //! The CPU blocking device queue device get trait specialization.
-            template<>
-            struct GetDev<
-                queue::QueueCpuOmp2Collective>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto getDev(
-                    queue::QueueCpuOmp2Collective const & queue)
-                -> dev::DevCpu
-                {
-                    return queue.m_spQueueImpl->m_dev;
-                }
-            };
-        }
-    }
-    namespace event
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CPU blocking device queue event type trait specialization.
-            template<>
-            struct EventType<
-                queue::QueueCpuOmp2Collective>
-            {
-                using type = event::EventCpu;
-            };
-        }
-    }
-    namespace queue
-    {
-        namespace traits
-        {
-
-            //#############################################################################
-            //! The CPU blocking device queue enqueue trait specialization.
-            //! This default implementation for all tasks directly invokes the function call operator of the task.
-            template<
-                typename TTask>
-            struct Enqueue<
-                queue::QueueCpuOmp2Collective,
-                TTask>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto enqueue(
-                    queue::QueueCpuOmp2Collective & queue,
-                    TTask const & task)
-                -> void
-                {
-                    if(::omp_in_parallel() != 0)
-                    {
-                        // wait for all tasks en-queued before the parallel region
-                        while(!queue::empty(*queue.m_spBlockingQueue)){}
-                        queue.m_spQueueImpl->m_uCurrentlyExecutingTask += 1u;
-
-                        #pragma omp single nowait
-                        task();
-
-                        queue.m_spQueueImpl->m_uCurrentlyExecutingTask -= 1u;
-                    }
-                    else
-                    {
-                        std::lock_guard<std::mutex> lk(queue.m_spQueueImpl->m_mutex);
-                        queue::enqueue(*queue.m_spBlockingQueue, task);
-                    }
-                }
-            };
-
-            //#############################################################################
-            //! The CPU blocking device queue test trait specialization.
-            template<>
-            struct Empty<
-                queue::QueueCpuOmp2Collective>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto empty(
-                    queue::QueueCpuOmp2Collective const & queue)
-                -> bool
-                {
-                    return queue.m_spQueueImpl->m_uCurrentlyExecutingTask == 0u &&
-                        queue::empty(*queue.m_spBlockingQueue);
-                }
-            };
-
-            //#############################################################################
-            //! The CPU OpenMP2 collective device queue enqueue trait specialization.
-            template<>
-            struct Enqueue<
-                queue::cpu::detail::QueueCpuOmp2CollectiveImpl,
-                event::EventCpu>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto enqueue(
-                    queue::cpu::detail::QueueCpuOmp2CollectiveImpl &,
-                    event::EventCpu &)
-                -> void
-                {
-                    ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                    #pragma omp barrier
-                }
-            };
-            //#############################################################################
-            //! The CPU OpenMP2 collective device queue enqueue trait specialization.
-            template<>
-            struct Enqueue<
-                queue::QueueCpuOmp2Collective,
-                event::EventCpu>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto enqueue(
-                    queue::QueueCpuOmp2Collective & queue,
-                    event::EventCpu & event)
-                -> void
-                {
-                    ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                    if(::omp_in_parallel() != 0)
-                    {
-                        // wait for all tasks en-queued before the parallel region
-                        while(!queue::empty(*queue.m_spBlockingQueue)){}
-                        #pragma omp barrier
-                    }
-                    else
-                    {
-                        queue::enqueue(*queue.m_spBlockingQueue, event);
-                    }
-
-                }
-            };
-
-            //#############################################################################
-            //! The CPU blocking device queue enqueue trait specialization.
-            //! This default implementation for all tasks directly invokes the function call operator of the task.
-            template<
-                typename TDim,
-                typename TIdx,
-                typename TKernelFnObj,
-                typename... TArgs>
-            struct Enqueue<
-                queue::QueueCpuOmp2Collective,
-                kernel::TaskKernelCpuOmp2Blocks<
-                    TDim,
-                    TIdx,
-                    TKernelFnObj,
-                    TArgs...>>
-            {
-            private:
-                using Task = kernel::TaskKernelCpuOmp2Blocks<
-                    TDim,
-                    TIdx,
-                    TKernelFnObj,
-                    TArgs ...>;
-            public:
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto enqueue(
-                    queue::QueueCpuOmp2Collective & queue,
-                    Task const & task)
-                -> void
-                {
-                    if(::omp_in_parallel() != 0)
-                    {
-                        while(!queue::empty(*queue.m_spBlockingQueue)){}
-                        // execute within an OpenMP parallel region
-                        queue.m_spQueueImpl->m_uCurrentlyExecutingTask += 1u;
-                        // execute task within an OpenMP parallel region
-                        task();
-                        queue.m_spQueueImpl->m_uCurrentlyExecutingTask -= 1u;
-                    }
-                    else
-                    {
-                        std::lock_guard<std::mutex> lk(queue.m_spQueueImpl->m_mutex);
-                        queue::enqueue(*queue.m_spBlockingQueue, task);
-                    }
-                }
-            };
-
-            //#############################################################################
-            //!
-            //#############################################################################
-            template<>
-            struct Enqueue<
-                queue::QueueCpuOmp2Collective,
-                test::event::EventHostManualTriggerCpu>
-            {
-                //-----------------------------------------------------------------------------
-                //
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto enqueue(
-                    queue::QueueCpuOmp2Collective & ,
-                    test::event::EventHostManualTriggerCpu & )
-                -> void
-                {
-                    // EventHostManualTriggerCpu are not supported for together with the queue QueueCpuOmp2Collective
-                    // but a specialization is needed to path the EventTests
-                }
-            };
-        }
-    }
-
-    namespace wait
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CPU blocking device queue thread wait trait specialization.
-            //!
-            //! Blocks execution of the calling thread until the queue has finished processing all previously requested tasks (kernels, data copies, ...)
-            template<>
-            struct CurrentThreadWaitFor<
-                queue::QueueCpuOmp2Collective>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto currentThreadWaitFor(
-                    queue::QueueCpuOmp2Collective const & queue)
-                -> void
-                {
-                    if(::omp_in_parallel() != 0)
-                    {
-                        // wait for all tasks en-queued before the parallel region
-                        while(!queue::empty(*queue.m_spBlockingQueue)){}
-                        #pragma omp barrier
-                    }
-                    else
-                    {
-                        std::lock_guard<std::mutex> lk(queue.m_spQueueImpl->m_mutex);
-                        wait::wait(*queue.m_spBlockingQueue);
-                    }
-                }
-            };
-
-
-            //#############################################################################
-            //! The CPU OpenMP2 collective device queue event wait trait specialization.
-            template<>
-            struct WaiterWaitFor<
-                queue::cpu::detail::QueueCpuOmp2CollectiveImpl,
-                event::EventCpu>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto waiterWaitFor(
-                    queue::cpu::detail::QueueCpuOmp2CollectiveImpl &,
-                    event::EventCpu const &)
-                -> void
-                {
-                    #pragma omp barrier
-                }
-            };
-            //#############################################################################
-            //! The CPU OpenMP2 collective queue event wait trait specialization.
-            template<>
-            struct WaiterWaitFor<
-                queue::QueueCpuOmp2Collective,
-                event::EventCpu>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto waiterWaitFor(
-                    queue::QueueCpuOmp2Collective & queue,
-                    event::EventCpu const & event)
-                -> void
-                {
-                    if(::omp_in_parallel() != 0)
-                    {
-                        // wait for all tasks en-queued before the parallel region
-                        while(!queue::empty(*queue.m_spBlockingQueue)){}
-                        wait::wait(queue);
-                    }
-                    else
-                        wait::wait(*queue.m_spBlockingQueue, event);
-                }
-            };
-        }
-    }
-    //-----------------------------------------------------------------------------
-    //! The test specifics.
-    namespace test
-    {
-        //-----------------------------------------------------------------------------
-        //! The test queue specifics.
-        namespace queue
-        {
-            namespace traits
-            {
-                //#############################################################################
-                //! The blocking queue trait specialization for a OpenMP2 collective CPU queue.
-                template<>
-                struct IsBlockingQueue<
-                    alpaka::queue::QueueCpuOmp2Collective>
-                {
-                    static constexpr bool value = true;
-                };
-            }
-        }
-    }
-}
-
-#include <alpaka/event/EventCpu.hpp>
-
-#endif
diff --git a/thirdParty/alpaka/test/common/include/alpaka/test/queue/QueueTestFixture.hpp b/thirdParty/alpaka/test/common/include/alpaka/test/queue/QueueTestFixture.hpp
deleted file mode 100644
index 1fed8832c2..0000000000
--- a/thirdParty/alpaka/test/common/include/alpaka/test/queue/QueueTestFixture.hpp
+++ /dev/null
@@ -1,40 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#include <alpaka/alpaka.hpp>
-
-namespace alpaka
-{
-    namespace test
-    {
-        namespace queue
-        {
-            //#############################################################################
-            template<
-                typename TDevQueue>
-            struct QueueTestFixture
-            {
-                using Dev = typename std::tuple_element<0, TDevQueue>::type;
-                using Queue = typename std::tuple_element<1, TDevQueue>::type;
-
-                using Pltf = alpaka::pltf::Pltf<Dev>;
-
-                //-----------------------------------------------------------------------------
-                QueueTestFixture() :
-                    m_dev(alpaka::pltf::getDevByIdx<Pltf>(0u)),
-                    m_queue(m_dev)
-                {
-                }
-
-                Dev m_dev;
-                Queue m_queue;
-            };
-        }
-    }
-}
diff --git a/thirdParty/alpaka/test/common/src/Dummy.cpp b/thirdParty/alpaka/test/common/src/Dummy.cpp
deleted file mode 100644
index dbe641fb23..0000000000
--- a/thirdParty/alpaka/test/common/src/Dummy.cpp
+++ /dev/null
@@ -1,9 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-// This file is here because CMake does not allow to create a header only library.
diff --git a/thirdParty/alpaka/test/integ/CMakeLists.txt b/thirdParty/alpaka/test/integ/CMakeLists.txt
deleted file mode 100644
index ecc338bd7c..0000000000
--- a/thirdParty/alpaka/test/integ/CMakeLists.txt
+++ /dev/null
@@ -1,28 +0,0 @@
-#
-# Copyright 2014-2019 Benjamin Worpitz
-#
-# This file is part of Alpaka.
-#
-# This Source Code Form is subject to the terms of the Mozilla Public
-# License, v. 2.0. If a copy of the MPL was not distributed with this
-# file, You can obtain one at http://mozilla.org/MPL/2.0/.
-#
-
-################################################################################
-# Required CMake version.
-################################################################################
-
-CMAKE_MINIMUM_REQUIRED(VERSION 3.11.0)
-
-PROJECT("alpakaIntegTest")
-
-################################################################################
-# Add subdirectories.
-################################################################################
-
-ADD_SUBDIRECTORY("axpy/")
-ADD_SUBDIRECTORY("cudaOnly/")
-ADD_SUBDIRECTORY("mandelbrot/")
-ADD_SUBDIRECTORY("matMul/")
-ADD_SUBDIRECTORY("separableCompilation/")
-ADD_SUBDIRECTORY("sharedMem/")
diff --git a/thirdParty/alpaka/test/integ/axpy/CMakeLists.txt b/thirdParty/alpaka/test/integ/axpy/CMakeLists.txt
deleted file mode 100644
index c9fcc109e6..0000000000
--- a/thirdParty/alpaka/test/integ/axpy/CMakeLists.txt
+++ /dev/null
@@ -1,27 +0,0 @@
-#
-# Copyright 2014-2019 Benjamin Worpitz, Axel Huebl
-#
-# This file is part of Alpaka.
-#
-# This Source Code Form is subject to the terms of the Mozilla Public
-# License, v. 2.0. If a copy of the MPL was not distributed with this
-# file, You can obtain one at http://mozilla.org/MPL/2.0/.
-#
-
-SET(_TARGET_NAME "axpy")
-
-append_recursive_files_add_to_src_group("src/" "src/" "cpp" _FILES_SOURCE)
-
-ALPAKA_ADD_EXECUTABLE(
-    ${_TARGET_NAME}
-    ${_FILES_SOURCE})
-TARGET_INCLUDE_DIRECTORIES(
-    ${_TARGET_NAME}
-    PRIVATE ${Boost_INCLUDE_DIRS})
-TARGET_LINK_LIBRARIES(
-    ${_TARGET_NAME}
-    PRIVATE common)
-
-SET_TARGET_PROPERTIES(${_TARGET_NAME} PROPERTIES FOLDER "test/integ")
-
-ADD_TEST(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME} ${_ALPAKA_TEST_OPTIONS})
diff --git a/thirdParty/alpaka/test/integ/axpy/src/axpy.cpp b/thirdParty/alpaka/test/integ/axpy/src/axpy.cpp
deleted file mode 100644
index 605d6736b3..0000000000
--- a/thirdParty/alpaka/test/integ/axpy/src/axpy.cpp
+++ /dev/null
@@ -1,222 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Matthias Werner
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#include <alpaka/alpaka.hpp>
-
-#include <alpaka/test/MeasureKernelRunTime.hpp>
-#include <alpaka/test/acc/TestAccs.hpp>
-#include <alpaka/test/queue/Queue.hpp>
-
-#include <catch2/catch.hpp>
-
-#include <iostream>
-#include <typeinfo>
-#include <random>
-#include <limits>
-#include <cmath>
-#include <algorithm>
-
-//#############################################################################
-//! A vector addition kernel.
-class AxpyKernel
-{
-public:
-    //-----------------------------------------------------------------------------
-    //! Vector addition Y = alpha * X + Y.
-    //!
-    //! \tparam TAcc The type of the accelerator the kernel is executed on..
-    //! \tparam TElem The matrix element type.
-    //! \param acc The accelerator the kernel is executed on.
-    //! \param numElements Specifies the number of elements of the vectors X and Y.
-    //! \param alpha Scalar the X vector is multiplied with.
-    //! \param X Vector of at least n elements.
-    //! \param Y Vector of at least n elements.
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<
-        typename TAcc,
-        typename TElem,
-        typename TIdx>
-    ALPAKA_FN_ACC auto operator()(
-        TAcc const & acc,
-        TIdx const & numElements,
-        TElem const & alpha,
-        TElem const * const X,
-        TElem * const Y) const
-    -> void
-    {
-        static_assert(
-            alpaka::dim::Dim<TAcc>::value == 1,
-            "The AxpyKernel expects 1-dimensional indices!");
-
-        auto const gridThreadIdx(alpaka::idx::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0u]);
-        auto const threadElemExtent(alpaka::workdiv::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[0u]);
-        auto const threadFirstElemIdx(gridThreadIdx * threadElemExtent);
-
-        if(threadFirstElemIdx < numElements)
-        {
-            // Calculate the number of elements to compute in this thread.
-            // The result is uniform for all but the last thread.
-            auto const threadLastElemIdx(threadFirstElemIdx+threadElemExtent);
-            auto const threadLastElemIdxClipped((numElements > threadLastElemIdx) ? threadLastElemIdx : numElements);
-
-            for(TIdx i(threadFirstElemIdx); i<threadLastElemIdxClipped; ++i)
-            {
-                Y[i] = alpha * X[i] + Y[i];
-            }
-        }
-    }
-};
-
-using TestAccs = alpaka::test::acc::EnabledAccs<
-    alpaka::dim::DimInt<1u>,
-    std::size_t>;
-
-TEMPLATE_LIST_TEST_CASE( "axpy", "[axpy]", TestAccs)
-{
-    using Acc = TestType;
-    using Dim = alpaka::dim::Dim<Acc>;
-    using Idx = alpaka::idx::Idx<Acc>;
-
-#ifdef ALPAKA_CI
-    Idx const numElements = 1u<<9u;
-#else
-    Idx const numElements = 1u<<16u;
-#endif
-
-    using Val = float;
-    using DevAcc = alpaka::dev::Dev<Acc>;
-    using PltfAcc = alpaka::pltf::Pltf<DevAcc>;
-    using QueueAcc = alpaka::test::queue::DefaultQueue<DevAcc>;
-    using PltfHost = alpaka::pltf::PltfCpu;
-
-    // Create the kernel function object.
-    AxpyKernel kernel;
-
-    // Get the host device.
-    auto const devHost(
-        alpaka::pltf::getDevByIdx<PltfHost>(0u));
-
-    // Select a device to execute on.
-    auto const devAcc(
-        alpaka::pltf::getDevByIdx<PltfAcc>(0u));
-
-    // Get a queue on this device.
-    QueueAcc queue(devAcc);
-
-    alpaka::vec::Vec<Dim, Idx> const extent(
-        numElements);
-
-    // Let alpaka calculate good block and grid sizes given our full problem extent.
-    alpaka::workdiv::WorkDivMembers<Dim, Idx> const workDiv(
-        alpaka::workdiv::getValidWorkDiv<Acc>(
-            devAcc,
-            extent,
-            static_cast<Idx>(3u),
-            false,
-            alpaka::workdiv::GridBlockExtentSubDivRestrictions::Unrestricted));
-
-    std::cout
-        << "AxpyKernel("
-        << " numElements:" << numElements
-        << ", accelerator: " << alpaka::acc::getAccName<Acc>()
-        << ", kernel: " << typeid(kernel).name()
-        << ", workDiv: " << workDiv
-        << ")" << std::endl;
-
-    // Allocate host memory buffers.
-    auto memBufHostX(alpaka::mem::buf::alloc<Val, Idx>(devHost, extent));
-    auto memBufHostOrigY(alpaka::mem::buf::alloc<Val, Idx>(devHost, extent));
-    auto memBufHostY(alpaka::mem::buf::alloc<Val, Idx>(devHost, extent));
-    Val * const pBufHostX = alpaka::mem::view::getPtrNative(memBufHostX);
-    Val * const pBufHostOrigY = alpaka::mem::view::getPtrNative(memBufHostOrigY);
-    Val * const pBufHostY = alpaka::mem::view::getPtrNative(memBufHostY);
-
-    // C++11 random generator for uniformly distributed numbers in [0,1)
-    // keep in mind, this can generate different values on different platforms
-    std::random_device rd{};
-    auto const seed = rd();
-    std::default_random_engine eng{ seed };
-    std::uniform_real_distribution<Val> dist(0.0, 1.0);
-    std::cout << "using seed: " << seed << "\n";
-    // Initialize the host input vectors
-    for (Idx i(0); i < numElements; ++i)
-    {
-        pBufHostX[i] = dist(eng);
-        pBufHostOrigY[i] = dist(eng);
-    }
-    Val const alpha( dist(eng) );
-
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-    std::cout << __func__
-        << " alpha: " << alpha << std::endl;
-    std::cout << __func__ << " X_host: ";
-    alpaka::mem::view::print(memBufHostX, std::cout);
-    std::cout << std::endl;
-    std::cout << __func__ << " Y_host: ";
-    alpaka::mem::view::print(memBufHostOrigY, std::cout);
-    std::cout << std::endl;
-#endif
-
-    // Allocate the buffer on the accelerator.
-    auto memBufAccX(alpaka::mem::buf::alloc<Val, Idx>(devAcc, extent));
-    auto memBufAccY(alpaka::mem::buf::alloc<Val, Idx>(devAcc, extent));
-
-    // Copy Host -> Acc.
-    alpaka::mem::view::copy(queue, memBufAccX, memBufHostX, extent);
-    alpaka::mem::view::copy(queue, memBufAccY, memBufHostOrigY, extent);
-
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-    alpaka::wait::wait(queue);
-
-    std::cout << __func__ << " X_Dev: ";
-    alpaka::mem::view::print(memBufHostX, std::cout);
-    std::cout << std::endl;
-    std::cout << __func__ << " Y_Dev: ";
-    alpaka::mem::view::print(memBufHostX, std::cout);
-    std::cout << std::endl;
-#endif
-
-    // Create the kernel execution task.
-    auto const taskKernel(alpaka::kernel::createTaskKernel<Acc>(
-        workDiv,
-        kernel,
-        numElements,
-        alpha,
-        alpaka::mem::view::getPtrNative(memBufAccX),
-        alpaka::mem::view::getPtrNative(memBufAccY)));
-
-    // Profile the kernel execution.
-    std::cout << "Execution time: "
-        << alpaka::test::integ::measureTaskRunTimeMs(
-            queue,
-            taskKernel)
-        << " ms"
-        << std::endl;
-
-    // Copy back the result.
-    alpaka::mem::view::copy(queue, memBufHostY, memBufAccY, extent);
-
-    // Wait for the queue to finish the memory operation.
-    alpaka::wait::wait(queue);
-
-    bool resultCorrect(true);
-    for(Idx i(0u); i < numElements; ++i)
-    {
-        auto const & val(pBufHostY[i]);
-        auto const correctResult(alpha * pBufHostX[i] + pBufHostOrigY[i]);
-        auto const relDiff = std::abs((val - correctResult) / std::min(val, correctResult));
-        if( relDiff > std::numeric_limits<Val>::epsilon() )
-        {
-            std::cerr << "C[" << i << "] == " << val << " != " << correctResult << std::endl;
-            resultCorrect = false;
-        }
-    }
-
-    REQUIRE(resultCorrect);
-}
diff --git a/thirdParty/alpaka/test/integ/cudaOnly/CMakeLists.txt b/thirdParty/alpaka/test/integ/cudaOnly/CMakeLists.txt
deleted file mode 100644
index 7c60fc14db..0000000000
--- a/thirdParty/alpaka/test/integ/cudaOnly/CMakeLists.txt
+++ /dev/null
@@ -1,29 +0,0 @@
-#
-# Copyright 2016-2019 Benjamin Worpitz, Axel Huebl
-#
-# This file is part of Alpaka.
-#
-# This Source Code Form is subject to the terms of the Mozilla Public
-# License, v. 2.0. If a copy of the MPL was not distributed with this
-# file, You can obtain one at http://mozilla.org/MPL/2.0/.
-#
-
-SET(_TARGET_NAME "cudaOnly")
-
-append_recursive_files_add_to_src_group("src/" "src/" "cpp" _FILES_SOURCE)
-
-ALPAKA_ADD_EXECUTABLE(
-    ${_TARGET_NAME}
-    ${_FILES_SOURCE})
-TARGET_INCLUDE_DIRECTORIES(
-    ${_TARGET_NAME}
-    PRIVATE ${Boost_INCLUDE_DIRS})
-TARGET_LINK_LIBRARIES(
-    ${_TARGET_NAME}
-    PRIVATE common)
-
-SET_TARGET_PROPERTIES(${_TARGET_NAME} PROPERTIES FOLDER "test/integ")
-
-IF(ALPAKA_ACC_GPU_CUDA_ONLY_MODE AND ALPAKA_ACC_GPU_CUDA_ENABLE)
-    ADD_TEST(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME} ${_ALPAKA_TEST_OPTIONS})
-ENDIF()
diff --git a/thirdParty/alpaka/test/integ/cudaOnly/src/cudaNativeFunctions.cpp b/thirdParty/alpaka/test/integ/cudaOnly/src/cudaNativeFunctions.cpp
deleted file mode 100644
index 4b5fa2d9cf..0000000000
--- a/thirdParty/alpaka/test/integ/cudaOnly/src/cudaNativeFunctions.cpp
+++ /dev/null
@@ -1,72 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, René Widera
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#include <alpaka/alpaka.hpp>
-
-#include <alpaka/test/KernelExecutionFixture.hpp>
-
-#include <catch2/catch.hpp>
-
-#if defined(ALPAKA_ACC_GPU_CUDA_ONLY_MODE) && defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && BOOST_LANG_CUDA
-
-//-----------------------------------------------------------------------------
-//! Native CUDA function.
-#if BOOST_COMP_CLANG
-    #pragma clang diagnostic push
-    #pragma clang diagnostic ignored "-Wmissing-prototypes"
-#endif
-__device__ auto userDefinedThreadFence()
--> void
-{
-    __threadfence();
-}
-#if BOOST_COMP_CLANG
-    #pragma clang diagnostic pop
-#endif
-
-//#############################################################################
-class CudaOnlyTestKernel
-{
-public:
-    //-----------------------------------------------------------------------------
-    template<
-        typename TAcc>
-    ALPAKA_FN_ACC auto operator()(
-        TAcc const & acc,
-        bool * success) const
-    -> void
-    {
-        alpaka::ignore_unused(acc);
-
-        // We should be able to call some native CUDA functions when ALPAKA_ACC_GPU_CUDA_ONLY_MODE is enabled.
-        __threadfence_block();
-        userDefinedThreadFence();
-        __threadfence_system();
-
-        *success = true;
-    }
-};
-
-
-//-----------------------------------------------------------------------------
-TEST_CASE("cudaOnlyModeWorking", "[cudaOnly]")
-{
-    using TAcc = alpaka::acc::AccGpuCudaRt<alpaka::dim::DimInt<1u>, std::uint32_t>;
-    using Dim = alpaka::dim::Dim<TAcc>;
-    using Idx = alpaka::idx::Idx<TAcc>;
-
-    alpaka::test::KernelExecutionFixture<TAcc> fixture(
-        alpaka::vec::Vec<Dim, Idx>::ones());
-
-    CudaOnlyTestKernel kernel;
-
-    REQUIRE(fixture(kernel));
-}
-
-#endif
diff --git a/thirdParty/alpaka/test/integ/mandelbrot/CMakeLists.txt b/thirdParty/alpaka/test/integ/mandelbrot/CMakeLists.txt
deleted file mode 100644
index 8e2242d2df..0000000000
--- a/thirdParty/alpaka/test/integ/mandelbrot/CMakeLists.txt
+++ /dev/null
@@ -1,27 +0,0 @@
-#
-# Copyright 2014-2019 Benjamin Worpitz, Axel Huebl
-#
-# This file is part of Alpaka.
-#
-# This Source Code Form is subject to the terms of the Mozilla Public
-# License, v. 2.0. If a copy of the MPL was not distributed with this
-# file, You can obtain one at http://mozilla.org/MPL/2.0/.
-#
-
-SET(_TARGET_NAME "mandelbrot")
-
-append_recursive_files_add_to_src_group("src/" "src/" "cpp" _FILES_SOURCE)
-
-ALPAKA_ADD_EXECUTABLE(
-    ${_TARGET_NAME}
-    ${_FILES_SOURCE})
-TARGET_INCLUDE_DIRECTORIES(
-    ${_TARGET_NAME}
-    PRIVATE ${Boost_INCLUDE_DIRS})
-TARGET_LINK_LIBRARIES(
-    ${_TARGET_NAME}
-    PRIVATE common)
-
-SET_TARGET_PROPERTIES(${_TARGET_NAME} PROPERTIES FOLDER "test/integ")
-
-ADD_TEST(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME} ${_ALPAKA_TEST_OPTIONS})
diff --git a/thirdParty/alpaka/test/integ/mandelbrot/src/mandelbrot.cpp b/thirdParty/alpaka/test/integ/mandelbrot/src/mandelbrot.cpp
deleted file mode 100644
index 4031357289..0000000000
--- a/thirdParty/alpaka/test/integ/mandelbrot/src/mandelbrot.cpp
+++ /dev/null
@@ -1,418 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Matthias Werner
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#include <alpaka/alpaka.hpp>
-
-#include <alpaka/test/MeasureKernelRunTime.hpp>
-#include <alpaka/test/acc/TestAccs.hpp>
-#include <alpaka/test/queue/Queue.hpp>
-
-#include <catch2/catch.hpp>
-
-#include <iostream>
-#include <typeinfo>
-#include <fstream>
-#include <algorithm>
-
-//#define ALPAKA_MANDELBROT_TEST_CONTINOUS_COLOR_MAPPING  // Define this to enable the continuous color mapping.
-
-//#############################################################################
-//! Complex Number.
-template<
-    typename T>
-class SimpleComplex
-{
-public:
-    //-----------------------------------------------------------------------------
-    ALPAKA_NO_HOST_ACC_WARNING
-    ALPAKA_FN_HOST_ACC SimpleComplex(
-        T const & a,
-        T const & b) :
-            r(a),
-            i(b)
-    {}
-    //-----------------------------------------------------------------------------
-    ALPAKA_NO_HOST_ACC_WARNING
-    ALPAKA_FN_INLINE
-    ALPAKA_FN_HOST_ACC auto absSq() const
-    -> T
-    {
-        return r*r + i*i;
-    }
-    //-----------------------------------------------------------------------------
-    ALPAKA_NO_HOST_ACC_WARNING
-    ALPAKA_FN_HOST_ACC auto operator*(SimpleComplex const & a)
-    -> SimpleComplex
-    {
-        return SimpleComplex(r*a.r - i*a.i, i*a.r + r*a.i);
-    }
-    //-----------------------------------------------------------------------------
-    ALPAKA_NO_HOST_ACC_WARNING
-    ALPAKA_FN_HOST_ACC auto operator*(float const & a)
-    -> SimpleComplex
-    {
-        return SimpleComplex(r*a, i*a);
-    }
-    //-----------------------------------------------------------------------------
-    ALPAKA_NO_HOST_ACC_WARNING
-    ALPAKA_FN_HOST_ACC auto operator+(SimpleComplex const & a)
-    -> SimpleComplex
-    {
-        return SimpleComplex(r+a.r, i+a.i);
-    }
-    //-----------------------------------------------------------------------------
-    ALPAKA_NO_HOST_ACC_WARNING
-    ALPAKA_FN_HOST_ACC auto operator+(float const & a)
-    -> SimpleComplex
-    {
-        return SimpleComplex(r+a, i);
-    }
-
-public:
-    T r;
-    T i;
-};
-
-//#############################################################################
-//! A Mandelbrot kernel.
-class MandelbrotKernel
-{
-public:
-#ifndef ALPAKA_MANDELBROT_TEST_CONTINOUS_COLOR_MAPPING
-    //-----------------------------------------------------------------------------
-    ALPAKA_FN_HOST_ACC MandelbrotKernel()
-    {
-        // Banding can be prevented by a continuous color functions.
-        m_colors[0u] = convertRgbSingleToBgra(66, 30, 15);
-        m_colors[1u] = convertRgbSingleToBgra(25, 7, 26);
-        m_colors[2u] = convertRgbSingleToBgra(9, 1, 47);
-        m_colors[3u] = convertRgbSingleToBgra(4, 4, 73);
-        m_colors[4u] = convertRgbSingleToBgra(0, 7, 100);
-        m_colors[5u] = convertRgbSingleToBgra(12, 44, 138);
-        m_colors[6u] = convertRgbSingleToBgra(24, 82, 177);
-        m_colors[7u] = convertRgbSingleToBgra(57, 125, 209);
-        m_colors[8u] = convertRgbSingleToBgra(134, 181, 229);
-        m_colors[9u] = convertRgbSingleToBgra(211, 236, 248);
-        m_colors[10u] = convertRgbSingleToBgra(241, 233, 191);
-        m_colors[11u] = convertRgbSingleToBgra(248, 201, 95);
-        m_colors[12u] = convertRgbSingleToBgra(255, 170, 0);
-        m_colors[13u] = convertRgbSingleToBgra(204, 128, 0);
-        m_colors[14u] = convertRgbSingleToBgra(153, 87, 0);
-        m_colors[15u] = convertRgbSingleToBgra(106, 52, 3);
-    }
-#endif
-
-    //-----------------------------------------------------------------------------
-    //! \param acc The accelerator to be executed on.
-    //! \param pColors The output image.
-    //! \param numRows The number of rows in the image
-    //! \param numCols The number of columns in the image.
-    //! \param pitchBytes The pitch in bytes.
-    //! \param fMinR The left border.
-    //! \param fMaxR The right border.
-    //! \param fMinI The bottom border.
-    //! \param fMaxI The top border.
-    //! \param maxIterations The maximum number of iterations.
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<
-        typename TAcc>
-    ALPAKA_FN_ACC auto operator()(
-        TAcc const & acc,
-        std::uint32_t * const pColors,
-        std::uint32_t const & numRows,
-        std::uint32_t const & numCols,
-        std::uint32_t const & pitchBytes,
-        float const & fMinR,
-        float const & fMaxR,
-        float const & fMinI,
-        float const & fMaxI,
-        std::uint32_t const & maxIterations) const
-    -> void
-    {
-        static_assert(
-            alpaka::dim::Dim<TAcc>::value == 2,
-            "The MandelbrotKernel expects 2-dimensional indices!");
-
-        auto const gridThreadIdx(alpaka::idx::getIdx<alpaka::Grid, alpaka::Threads>(acc));
-        auto const & gridThreadIdxX(gridThreadIdx[1u]);
-        auto const & gridThreadIdxY(gridThreadIdx[0u]);
-
-        if((gridThreadIdxY < numRows) && (gridThreadIdxX < numCols))
-        {
-            SimpleComplex<float> c(
-                (fMinR + (static_cast<float>(gridThreadIdxX)/float(numCols-1)*(fMaxR - fMinR))),
-                (fMinI + (static_cast<float>(gridThreadIdxY)/float(numRows-1)*(fMaxI - fMinI))));
-
-            auto const iterationCount(iterateMandelbrot(c, maxIterations));
-
-            auto const pColorsRow(pColors + ((gridThreadIdxY * pitchBytes) / sizeof(std::uint32_t)));
-            pColorsRow[gridThreadIdxX] =
-#ifdef ALPAKA_MANDELBROT_TEST_CONTINOUS_COLOR_MAPPING
-                iterationCountToContinousColor(iterationCount, maxIterations);
-#else
-                iterationCountToRepeatedColor(iterationCount);
-#endif
-        }
-    }
-    //-----------------------------------------------------------------------------
-    //! \return The number of iterations until the Mandelbrot iteration with the given Value reaches the absolute value of 2.
-    //!     Only does maxIterations steps and returns maxIterations if the value would be higher.
-    ALPAKA_FN_ACC static auto iterateMandelbrot(
-        SimpleComplex<float> const & c,
-        std::uint32_t const & maxIterations)
-    -> std::uint32_t
-    {
-        SimpleComplex<float> z(0.0f, 0.0f);
-        for(std::uint32_t iterations(0); iterations<maxIterations; ++iterations)
-        {
-            z = z*z + c;
-            if(z.absSq() > 4.0f)
-            {
-                return iterations;
-            }
-        }
-        return maxIterations;
-    }
-
-    //-----------------------------------------------------------------------------
-    ALPAKA_FN_HOST_ACC static auto convertRgbSingleToBgra(
-        std::uint32_t const & r,
-        std::uint32_t const & g,
-        std::uint32_t const & b)
-    -> std::uint32_t
-    {
-        return 0xFF000000 | (r<<16) | (g<<8) | b;
-    }
-
-#ifdef ALPAKA_MANDELBROT_TEST_CONTINOUS_COLOR_MAPPING
-    //-----------------------------------------------------------------------------
-    //! This uses a simple mapping from iteration count to colors.
-    //! This leads to banding but allows a all pixels to be colored.
-    ALPAKA_NO_HOST_ACC_WARNING
-    ALPAKA_FN_ACC static auto iterationCountToContinousColor(
-        std::uint32_t const & iterationCount,
-        std::uint32_t const & maxIterations)
-    -> std::uint32_t
-    {
-        // Map the iteration count on the 0..1 interval.
-        float const t(static_cast<float>(iterationCount)/static_cast<float>(maxIterations));
-        float const oneMinusT(1.0f-t);
-        // Use some modified Bernstein polynomials for r, g, b.
-        std::uint32_t const r(static_cast<std::uint32_t>(9.0f*oneMinusT*t*t*t*255.0f));
-        std::uint32_t const g(static_cast<std::uint32_t>(15.0f*oneMinusT*oneMinusT*t*t*255.0f));
-        std::uint32_t const b(static_cast<std::uint32_t>(8.5f*oneMinusT*oneMinusT*oneMinusT*t*255.0f));
-        return convertRgbSingleToBgra(r, g, b);
-    }
-#else
-    //-----------------------------------------------------------------------------
-    //! This uses a simple mapping from iteration count to colors.
-    //! This leads to banding but allows a all pixels to be colored.
-    ALPAKA_FN_ACC auto iterationCountToRepeatedColor(
-        std::uint32_t const & iterationCount) const
-    -> std::uint32_t
-    {
-        return m_colors[iterationCount%16];
-    }
-
-    std::uint32_t m_colors[16];
-#endif
-};
-
-//-----------------------------------------------------------------------------
-//! Writes the buffer color data to a file.
-template<
-    typename TBuf>
-auto writeTgaColorImage(
-    std::string const & fileName,
-    TBuf const & bufRgba)
--> void
-{
-    static_assert(
-        alpaka::dim::Dim<TBuf>::value == 2,
-        "The buffer has to be 2 dimensional!");
-    static_assert(
-        std::is_integral<alpaka::elem::Elem<TBuf>>::value,
-        "The buffer element type has to be integral!");
-
-    // The width of the input buffer is in input elements.
-    auto const bufWidthElems(alpaka::extent::getWidth(bufRgba));
-    auto const bufWidthBytes(bufWidthElems * sizeof(alpaka::elem::Elem<TBuf>));
-    // The row width in bytes has to be dividable by 4 Bytes (RGBA).
-    ALPAKA_ASSERT(bufWidthBytes % sizeof(std::uint32_t) == 0);
-    // The number of colors in a row.
-    auto const bufWidthColors(bufWidthBytes / sizeof(std::uint32_t));
-    ALPAKA_ASSERT(bufWidthColors >= 1);
-    auto const bufHeightColors(alpaka::extent::getHeight(bufRgba));
-    ALPAKA_ASSERT(bufHeightColors >= 1);
-    auto const bufPitchBytes(alpaka::mem::view::getPitchBytes<alpaka::dim::Dim<TBuf>::value - 1u>(bufRgba));
-    ALPAKA_ASSERT(bufPitchBytes >= bufWidthBytes);
-
-    std::ofstream ofs(
-        fileName,
-        std::ofstream::out | std::ofstream::binary);
-    if(!ofs.is_open())
-    {
-        throw std::invalid_argument("Unable to open file: "+fileName);
-    }
-
-    // Write tga image header.
-    ofs.put(0x00);                      // Number of Characters in Identification Field.
-    ofs.put(0x00);                      // Color Map Type.
-    ofs.put(0x02);                      // Image Type Code.
-    ofs.put(0x00);                      // Color Map Origin.
-    ofs.put(0x00);
-    ofs.put(0x00);                      // Color Map Length.
-    ofs.put(0x00);
-    ofs.put(0x00);                      // Color Map Entry Size.
-    ofs.put(0x00);                      // X Origin of Image.
-    ofs.put(0x00);
-    ofs.put(0x00);                      // Y Origin of Image.
-    ofs.put(0x00);
-    ofs.put(static_cast<char>(bufWidthColors & 0xFFu)); // Width of Image.
-    ofs.put(static_cast<char>((bufWidthColors >> 8) & 0xFFu));
-    ofs.put(static_cast<char>(bufHeightColors & 0xFFu));// Height of Image.
-    ofs.put(static_cast<char>((bufHeightColors >> 8) & 0xFFu));
-    ofs.put(0x20);                      // Image Pixel Size.
-    ofs.put(0x20);                      // Image Descriptor Byte.
-
-    // Write the data.
-    char const * pData(reinterpret_cast<char const *>(alpaka::mem::view::getPtrNative(bufRgba)));
-    // If there is no padding, we can directly write the whole buffer data ...
-    if(bufPitchBytes == bufWidthBytes)
-    {
-        ofs.write(
-            pData,
-            static_cast<std::streamsize>(bufWidthBytes*bufHeightColors));
-    }
-    // ... else we have to write row by row.
-    else
-    {
-        for(auto row(decltype(bufHeightColors)(0)); row<bufHeightColors; ++row)
-        {
-            ofs.write(
-                pData + bufPitchBytes*row,
-                static_cast<std::streamsize>(bufWidthBytes));
-        }
-    }
-}
-
-using TestAccs = alpaka::test::acc::EnabledAccs<
-    alpaka::dim::DimInt<2u>,
-    std::uint32_t>;
-
-TEMPLATE_LIST_TEST_CASE( "mandelbrot", "[mandelbrot]", TestAccs)
-{
-    using Acc = TestType;
-    using Dim = alpaka::dim::Dim<Acc>;
-    using Idx = alpaka::idx::Idx<Acc>;
-
-#ifdef ALPAKA_CI
-    Idx const imageSize(1u<<5u);
-#else
-    Idx const imageSize(1u<<10u);
-#endif
-    Idx const numRows(imageSize);
-    Idx const numCols(imageSize);
-    float const fMinR(-2.0f);
-    float const fMaxR(+1.0f);
-    float const fMinI(-1.2f);
-    float const fMaxI(+1.2f);
-    Idx const maxIterations(300u);
-
-    using Val = std::uint32_t;
-    using DevAcc = alpaka::dev::Dev<Acc>;
-    using PltfAcc = alpaka::pltf::Pltf<DevAcc>;
-    using QueueAcc = alpaka::test::queue::DefaultQueue<DevAcc>;
-    using PltfHost = alpaka::pltf::PltfCpu;
-
-    // Create the kernel function object.
-    MandelbrotKernel kernel;
-
-    // Get the host device.
-    auto const devHost(
-        alpaka::pltf::getDevByIdx<PltfHost>(0u));
-
-    // Select a device to execute on.
-    auto const devAcc(
-        alpaka::pltf::getDevByIdx<PltfAcc>(0u));
-
-    // Get a queue on this device.
-    QueueAcc queue(
-        devAcc);
-
-    alpaka::vec::Vec<Dim, Idx> const extent(
-        static_cast<Idx>(numRows),
-        static_cast<Idx>(numCols));
-
-    // Let alpaka calculate good block and grid sizes given our full problem extent.
-    alpaka::workdiv::WorkDivMembers<Dim, Idx> const workDiv(
-        alpaka::workdiv::getValidWorkDiv<Acc>(
-            devAcc,
-            extent,
-            alpaka::vec::Vec<Dim, Idx>::ones(),
-            false,
-            alpaka::workdiv::GridBlockExtentSubDivRestrictions::Unrestricted));
-
-    std::cout
-        << "MandelbrotKernel("
-        << " numRows:" << numRows
-        << ", numCols:" << numCols
-        << ", maxIterations:" << maxIterations
-        << ", accelerator: " << alpaka::acc::getAccName<Acc>()
-        << ", kernel: " << typeid(kernel).name()
-        << ", workDiv: " << workDiv
-        << ")" << std::endl;
-
-    // allocate host memory
-    auto bufColorHost(
-        alpaka::mem::buf::alloc<Val, Idx>(devHost, extent));
-
-    // Allocate the buffer on the accelerator.
-    auto bufColorAcc(
-        alpaka::mem::buf::alloc<Val, Idx>(devAcc, extent));
-
-    // Copy Host -> Acc.
-    alpaka::mem::view::copy(queue, bufColorAcc, bufColorHost, extent);
-
-    // Create the kernel execution task.
-    auto const taskKernel(alpaka::kernel::createTaskKernel<Acc>(
-        workDiv,
-        kernel,
-        alpaka::mem::view::getPtrNative(bufColorAcc),
-        numRows,
-        numCols,
-        alpaka::mem::view::getPitchBytes<1u>(bufColorAcc),
-        fMinR,
-        fMaxR,
-        fMinI,
-        fMaxI,
-        maxIterations));
-
-    // Profile the kernel execution.
-    std::cout << "Execution time: "
-        << alpaka::test::integ::measureTaskRunTimeMs(
-            queue,
-            taskKernel)
-        << " ms"
-        << std::endl;
-
-    // Copy back the result.
-    alpaka::mem::view::copy(queue, bufColorHost, bufColorAcc, extent);
-
-    // Wait for the queue to finish the memory operation.
-    alpaka::wait::wait(queue);
-
-    // Write the image to a file.
-    std::string fileName("mandelbrot"+std::to_string(numCols)+"x"+std::to_string(numRows)+"_"+alpaka::acc::getAccName<Acc>()+".tga");
-    std::replace(fileName.begin(), fileName.end(), '<', '_');
-    std::replace(fileName.begin(), fileName.end(), '>', '_');
-    writeTgaColorImage(
-        fileName,
-        bufColorHost);
-}
diff --git a/thirdParty/alpaka/test/integ/matMul/CMakeLists.txt b/thirdParty/alpaka/test/integ/matMul/CMakeLists.txt
deleted file mode 100644
index 6c5e91a091..0000000000
--- a/thirdParty/alpaka/test/integ/matMul/CMakeLists.txt
+++ /dev/null
@@ -1,27 +0,0 @@
-#
-# Copyright 2014-2019 Benjamin Worpitz, Axel Huebl
-#
-# This file is part of Alpaka.
-#
-# This Source Code Form is subject to the terms of the Mozilla Public
-# License, v. 2.0. If a copy of the MPL was not distributed with this
-# file, You can obtain one at http://mozilla.org/MPL/2.0/.
-#
-
-SET(_TARGET_NAME "matMul")
-
-append_recursive_files_add_to_src_group("src/" "src/" "cpp" _FILES_SOURCE)
-
-ALPAKA_ADD_EXECUTABLE(
-    ${_TARGET_NAME}
-    ${_FILES_SOURCE})
-TARGET_INCLUDE_DIRECTORIES(
-    ${_TARGET_NAME}
-    PRIVATE ${Boost_INCLUDE_DIRS})
-TARGET_LINK_LIBRARIES(
-    ${_TARGET_NAME}
-    PRIVATE common)
-
-SET_TARGET_PROPERTIES(${_TARGET_NAME} PROPERTIES FOLDER "test/integ")
-
-ADD_TEST(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME} ${_ALPAKA_TEST_OPTIONS})
diff --git a/thirdParty/alpaka/test/integ/matMul/src/matMul.cpp b/thirdParty/alpaka/test/integ/matMul/src/matMul.cpp
deleted file mode 100644
index cda1f0855a..0000000000
--- a/thirdParty/alpaka/test/integ/matMul/src/matMul.cpp
+++ /dev/null
@@ -1,357 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Matthias Werner, René Widera
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#include <alpaka/alpaka.hpp>
-
-#include <alpaka/test/MeasureKernelRunTime.hpp>
-#include <alpaka/test/acc/TestAccs.hpp>
-#include <alpaka/test/queue/Queue.hpp>
-
-#include <catch2/catch.hpp>
-
-#include <iostream>
-#include <typeinfo>
-#include <vector>
-#include <functional>
-
-//#############################################################################
-//! A matrix multiplication kernel.
-//! Computes C + alpha*A*B + beta*C. LxM * MxN -> LxN
-//! This is an adaption of the algorithm from the CUDA developers guide.
-class MatMulKernel
-{
-public:
-    //-----------------------------------------------------------------------------
-    //! \tparam TAcc The accelerator environment to be executed on.
-    //! \tparam TElem The matrix element type.
-    //! \param acc The accelerator to be executed on.
-    //! \param m The height of the A matrix.
-    //! \param n The width of the A and height of the B matrix.
-    //! \param k The width of the B matrix.
-    //! \param A The pointer to the matrix A data.
-    //! \param lda The pitch of the A matrix in elements.
-    //! \param B The pointer to the matrix B data.
-    //! \param ldb The pitch of the B matrix in elements.
-    //! \param C The pointer to the matrix C data.
-    //! \param ldc The pitch of the C matrix in elements.
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<
-        typename TAcc,
-        typename TElem,
-        typename TIndex>
-    ALPAKA_FN_ACC auto operator()(
-        TAcc const & acc,
-        TIndex const & m,
-        TIndex const & n,
-        TIndex const & k,
-        TElem const & alpha,
-        TElem const * const A,
-        TIndex const & lda,
-        TElem const * const B,
-        TIndex const & ldb,
-        TElem const & beta,
-        TElem * const C,
-        TIndex const & ldc) const
-    -> void
-    {
-        static_assert(alpaka::dim::Dim<TAcc>::value == 2u,
-            "The accelerator used for the GemmAlpakaKernel has to be 2 dimensional!");
-
-        // Column and row of C to calculate.
-        auto const gridThreadIdx(alpaka::idx::getIdx<alpaka::Grid, alpaka::Threads>(acc));
-        auto const & gridThreadIdxX(gridThreadIdx[1u]);
-        auto const & gridThreadIdxY(gridThreadIdx[0u]);
-
-        // Column and row inside the block of C to calculate.
-        auto const blockThreadIdx(alpaka::idx::getIdx<alpaka::Block, alpaka::Threads>(acc));
-        auto const & blockThreadIdxX(blockThreadIdx[1u]);
-        auto const & blockThreadIdxY(blockThreadIdx[0u]);
-
-        // The block threads extent.
-        auto const blockThreadExtent(alpaka::workdiv::getWorkDiv<alpaka::Block, alpaka::Threads>(acc));
-        auto const & blockThreadExtentX(blockThreadExtent[1u]);
-        auto const & blockThreadExtentY(blockThreadExtent[0u]);
-        //ALPAKA_ASSERT(blockThreadExtentX == blockThreadExtentY);
-        auto const & blockThreadExtentVal(blockThreadExtentX);
-
-        // Shared memory used to store the current blocks of A and B.
-        auto * const pBlockSharedA(alpaka::block::shared::dyn::getMem<TElem>(acc));
-        auto * const pBlockSharedB(pBlockSharedA + blockThreadExtentX*blockThreadExtentY);
-
-        auto const sharedBlockIdx1d(blockThreadIdxY*blockThreadExtentX + blockThreadIdxX);
-
-        // If the element corresponding to the current thread is outside of the respective matrix.
-        bool const insideA(gridThreadIdxY < m);
-        bool const insideB(gridThreadIdxX < n);
-        bool const insideC(insideA && insideB);
-
-        TElem dotProduct(0);
-
-        // Loop over all blocks of A and B that are required to compute the C block.
-        auto const blockMulCount(static_cast<TIndex>(std::ceil(static_cast<float>(k)/static_cast<float>(blockThreadExtentVal))));
-        for(TIndex k2(0u); k2 < blockMulCount; ++k2)
-        {
-            // Copy the current blocks of A and B into shared memory in parallel.
-            // If the element of the current thread is outside of the matrix, zero is written into the shared memory.
-            // This is possible because zero is a result neutral extension of the matrices regarding the dot product.
-            auto const AIdxX(k2*blockThreadExtentX + blockThreadIdxX);
-            auto const AIdx1d(gridThreadIdxY*lda + AIdxX);
-            pBlockSharedA[sharedBlockIdx1d] = (
-                ((!insideA) || (AIdxX>=k))
-                ? static_cast<TElem>(0)
-                : A[AIdx1d]);
-
-            auto const BIdxY(k2*blockThreadExtentY + blockThreadIdxY);
-            auto const BIdx1d(BIdxY*ldb + gridThreadIdxX);
-            pBlockSharedB[sharedBlockIdx1d] = (
-                ((!insideB) || (BIdxY>=k))
-                ? static_cast<TElem>(0)
-                : B[BIdx1d]);
-
-            // Synchronize to make sure the complete blocks are loaded before starting the computation.
-            alpaka::block::sync::syncBlockThreads(acc);
-
-            // Not really necessary because we wrote zeros into those cells.
-            //if(insideC)
-            //{
-                // Compute the dot products within shared memory.
-                for(TIndex k3(0); k3 < blockThreadExtentVal; ++k3)
-                {
-                    dotProduct += pBlockSharedA[blockThreadIdxY*blockThreadExtentX + k3]
-                        * pBlockSharedB[k3*blockThreadExtentY + blockThreadIdxX];
-                }
-            //}
-
-            // Synchronize to make sure that the preceding computation is done before loading the next blocks of A and B.
-            alpaka::block::sync::syncBlockThreads(acc);
-        }
-
-        // If the element is outside of the matrix it was only a helper thread that did not calculate any meaningful results.
-        if(insideC)
-        {
-            auto const CIdx1d(gridThreadIdxY*ldc + gridThreadIdxX);
-            C[CIdx1d] = alpha * dotProduct + beta * C[CIdx1d];
-        }
-    }
-};
-
-namespace alpaka
-{
-    namespace kernel
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The trait for getting the size of the block shared dynamic memory for a kernel.
-            template<
-                typename TAcc>
-            struct BlockSharedMemDynSizeBytes<
-                MatMulKernel,
-                TAcc>
-            {
-                //-----------------------------------------------------------------------------
-                //! \return The size of the shared memory allocated for a block.
-                template<
-                    typename TVec,
-                    typename TIndex,
-                    typename TElem>
-                ALPAKA_FN_HOST_ACC static auto getBlockSharedMemDynSizeBytes(
-                    MatMulKernel const & matMulKernel,
-                    TVec const & blockThreadExtent,
-                    TVec const & threadElemExtent,
-                    TIndex const & m,
-                    TIndex const & n,
-                    TIndex const & k,
-                    TElem const & alpha,
-                    TElem const * const A,
-                    TIndex const & lda,
-                    TElem const * const B,
-                    TIndex const & ldb,
-                    TElem const & beta,
-                    TElem * const C,
-                    TIndex const & ldc)
-                -> TIndex
-                {
-                    alpaka::ignore_unused(matMulKernel);
-                    alpaka::ignore_unused(m);
-                    alpaka::ignore_unused(n);
-                    alpaka::ignore_unused(k);
-                    alpaka::ignore_unused(alpha);
-                    alpaka::ignore_unused(A);
-                    alpaka::ignore_unused(lda);
-                    alpaka::ignore_unused(B);
-                    alpaka::ignore_unused(ldb);
-                    alpaka::ignore_unused(beta);
-                    alpaka::ignore_unused(C);
-                    alpaka::ignore_unused(ldc);
-
-                    // Reserve the buffer for the two blocks of A and B.
-                    return 2u * blockThreadExtent.prod() * threadElemExtent.prod() * sizeof(TElem);
-                }
-            };
-        }
-    }
-}
-
-using TestAccs = alpaka::test::acc::EnabledAccs<
-    alpaka::dim::DimInt<2u>,
-    std::uint32_t>;
-
-TEMPLATE_LIST_TEST_CASE( "matMul", "[matMul]", TestAccs)
-{
-    using Acc = TestType;
-    using Dim = alpaka::dim::Dim<Acc>;
-    using Idx = alpaka::idx::Idx<Acc>;
-
-    Idx const m(64u);
-    Idx const n(79u);
-    Idx const k(23u);
-
-    using Val = std::uint32_t;
-    using Vec2 = alpaka::vec::Vec<Dim, Idx>;
-    using DevAcc = alpaka::dev::Dev<Acc>;
-    using PltfAcc = alpaka::pltf::Pltf<DevAcc>;
-    using QueueAcc = alpaka::test::queue::DefaultQueue<alpaka::dev::Dev<Acc>>;
-    using PltfHost = alpaka::pltf::PltfCpu;
-    using DevHost = alpaka::dev::Dev<PltfHost>;
-    using QueueHost = alpaka::queue::QueueCpuNonBlocking;
-
-    // Create the kernel function object.
-    MatMulKernel kernel;
-
-    // Get the host device.
-    DevHost const devHost(
-        alpaka::pltf::getDevByIdx<PltfHost>(0u));
-
-    // Get a queue on the host device.
-    QueueHost queueHost(
-        devHost);
-
-    // Select a device to execute on.
-    DevAcc const devAcc(
-        alpaka::pltf::getDevByIdx<PltfAcc>(0u));
-
-    // Get a queue on the accelerator device.
-    QueueAcc queueAcc(
-        devAcc);
-
-    // Specify the input matrix extents.
-    Vec2 const extentA(
-        static_cast<Idx>(m),
-        static_cast<Idx>(k));
-
-    Vec2 const extentB(
-        static_cast<Idx>(k),
-        static_cast<Idx>(n));
-
-    // Result matrix is MxN. We create one worker per result matrix cell.
-    Vec2 const extentC(
-        static_cast<Idx>(m),
-        static_cast<Idx>(n));
-
-    // Let alpaka calculate good block and grid sizes given our full problem extent.
-    alpaka::workdiv::WorkDivMembers<Dim, Idx> const workDiv(
-        alpaka::workdiv::getValidWorkDiv<Acc>(
-            devAcc,
-            extentC,
-            alpaka::vec::Vec<Dim, Idx>::ones(),
-            false,
-            alpaka::workdiv::GridBlockExtentSubDivRestrictions::EqualExtent));
-
-    std::cout
-        << "MatMulKernel("
-        << "m:" << m
-        << ", n:" << n
-        << ", k:" << k
-        << ", accelerator: " << alpaka::acc::getAccName<Acc>()
-        << ", kernel: " << typeid(kernel).name()
-        << ", workDiv: " << workDiv
-        << ")" << std::endl;
-
-    // Allocate the A and B matrices as std::vectors because this allows them to be filled with uint32_t(1).
-    // alpaka::mem::view::set only supports setting all bytes leading to a value of 16843009 in all elements.
-    std::vector<Val> bufAHost1d(m * k, static_cast<Val>(1));
-    std::vector<Val> bufBHost1d(k * n, static_cast<Val>(1));
-    // Wrap the std::vectors into a memory buffer object.
-    // For 1D data this would not be required because alpaka::mem::view::copy is specialized for std::vector and std::array.
-    // For multi dimensional data you could directly create them using alpaka::mem::buf::alloc<Type>(devHost, extent), which is not used here.
-    // Instead we use ViewPlainPtr to wrap the data.
-    using BufWrapper = alpaka::mem::view::ViewPlainPtr<
-        DevHost,
-        Val,
-        Dim,
-        Idx>;
-    BufWrapper bufAHost(bufAHost1d.data(), devHost, extentA);
-    BufWrapper bufBHost(bufBHost1d.data(), devHost, extentB);
-
-    // Allocate C and set it to zero.
-    auto bufCHost(alpaka::mem::buf::alloc<Val, Idx>(devHost, extentC));
-    alpaka::mem::view::set(queueHost, bufCHost, 0u, extentC);
-
-    // Allocate the buffers on the accelerator.
-    auto bufAAcc(alpaka::mem::buf::alloc<Val, Idx>(devAcc, extentA));
-    auto bufBAcc(alpaka::mem::buf::alloc<Val, Idx>(devAcc, extentB));
-    auto bufCAcc(alpaka::mem::buf::alloc<Val, Idx>(devAcc, extentC));
-
-    // Copy Host -> Acc.
-    alpaka::mem::view::copy(queueAcc, bufAAcc, bufAHost, extentA);
-    alpaka::mem::view::copy(queueAcc, bufBAcc, bufBHost, extentB);
-    alpaka::wait::wait(queueHost);
-    alpaka::mem::view::copy(queueAcc, bufCAcc, bufCHost, extentC);
-
-    // Create the kernel execution task.
-    auto const taskKernel(alpaka::kernel::createTaskKernel<Acc>(
-        workDiv,
-        kernel,
-        m,
-        n,
-        k,
-        static_cast<Val>(1),
-        alpaka::mem::view::getPtrNative(bufAAcc),
-        static_cast<Idx>(alpaka::mem::view::getPitchBytes<1u>(bufAAcc) / sizeof(Val)),
-        alpaka::mem::view::getPtrNative(bufBAcc),
-        static_cast<Idx>(alpaka::mem::view::getPitchBytes<1u>(bufBAcc) / sizeof(Val)),
-        static_cast<Val>(1),
-        alpaka::mem::view::getPtrNative(bufCAcc),
-        static_cast<Idx>(alpaka::mem::view::getPitchBytes<1u>(bufCAcc) / sizeof(Val))));
-
-    // Profile the kernel execution.
-    std::cout << "Execution time: "
-        << alpaka::test::integ::measureTaskRunTimeMs(
-            queueAcc,
-            taskKernel)
-        << " ms"
-        << std::endl;
-
-    // Copy back the result.
-    alpaka::mem::view::copy(queueAcc, bufCHost, bufCAcc, extentC);
-
-    // Wait for the queue to finish the memory operation.
-    alpaka::wait::wait(queueAcc);
-
-    // Assert that the results are correct.
-    // When multiplying square matrices filled with ones, the result of each cell is the size of the matrix.
-    auto const correctResult(static_cast<Val>(k));
-
-    bool resultCorrect(true);
-    auto const pHostData(alpaka::mem::view::getPtrNative(bufCHost));
-    for(Idx i(0u);
-        i < m * n;
-        ++i)
-    {
-        auto const & val(pHostData[i]);
-        if(val != correctResult)
-        {
-            std::cerr << "C[" << i << "] == " << val << " != " << correctResult << std::endl;
-            resultCorrect = false;
-        }
-    }
-
-    REQUIRE(resultCorrect);
-}
diff --git a/thirdParty/alpaka/test/integ/separableCompilation/CMakeLists.txt b/thirdParty/alpaka/test/integ/separableCompilation/CMakeLists.txt
deleted file mode 100644
index c394d439b2..0000000000
--- a/thirdParty/alpaka/test/integ/separableCompilation/CMakeLists.txt
+++ /dev/null
@@ -1,34 +0,0 @@
-#
-# Copyright 2014-2019 Benjamin Worpitz, Axel Huebl
-#
-# This file is part of Alpaka.
-#
-# This Source Code Form is subject to the terms of the Mozilla Public
-# License, v. 2.0. If a copy of the MPL was not distributed with this
-# file, You can obtain one at http://mozilla.org/MPL/2.0/.
-#
-
-IF((NOT ALPAKA_ACC_GPU_CUDA_ENABLE AND NOT ALPAKA_ACC_GPU_HIP_ENABLE) OR (ALPAKA_ACC_GPU_CUDA_ENABLE AND ALPAKA_CUDA_NVCC_SEPARABLE_COMPILATION AND ALPAKA_CUDA_COMPILER MATCHES "nvcc"))
-
-SET(_TARGET_NAME "separableCompilation")
-
-append_recursive_files_add_to_src_group("src/" "src/" "cpp" _FILES_SOURCE)
-append_recursive_files_add_to_src_group("include/" "include/" "hpp" _FILES_HEADER)
-
-ALPAKA_ADD_EXECUTABLE(
-    ${_TARGET_NAME}
-    ${_FILES_SOURCE}
-    ${_FILES_HEADER})
-TARGET_INCLUDE_DIRECTORIES(
-    ${_TARGET_NAME}
-    PRIVATE "include"
-    PRIVATE ${Boost_INCLUDE_DIRS})
-TARGET_LINK_LIBRARIES(
-    ${_TARGET_NAME}
-    PRIVATE common)
-
-SET_TARGET_PROPERTIES(${_TARGET_NAME} PROPERTIES FOLDER "test/integ")
-
-ADD_TEST(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME} ${_ALPAKA_TEST_OPTIONS})
-
-ENDIF()
diff --git a/thirdParty/alpaka/test/integ/separableCompilation/include/mysqrt.hpp b/thirdParty/alpaka/test/integ/separableCompilation/include/mysqrt.hpp
deleted file mode 100644
index cc9c83838e..0000000000
--- a/thirdParty/alpaka/test/integ/separableCompilation/include/mysqrt.hpp
+++ /dev/null
@@ -1,12 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#include <alpaka/alpaka.hpp>
-
-ALPAKA_FN_HOST_ACC auto mysqrt(double x) -> double;
diff --git a/thirdParty/alpaka/test/integ/separableCompilation/src/main.cpp b/thirdParty/alpaka/test/integ/separableCompilation/src/main.cpp
deleted file mode 100644
index 317f4694b1..0000000000
--- a/thirdParty/alpaka/test/integ/separableCompilation/src/main.cpp
+++ /dev/null
@@ -1,184 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#include "mysqrt.hpp"
-
-#include <alpaka/test/MeasureKernelRunTime.hpp>
-#include <alpaka/test/acc/TestAccs.hpp>
-#include <alpaka/test/queue/Queue.hpp>
-
-#include <catch2/catch.hpp>
-
-#include <iostream>
-#include <typeinfo>
-
-//#############################################################################
-//! A vector addition kernel.
-class SqrtKernel
-{
-public:
-    //-----------------------------------------------------------------------------
-    //! The kernel entry point.
-    //!
-    //! \tparam TAcc The accelerator environment to be executed on.
-    //! \tparam TElem The matrix element type.
-    //! \param acc The accelerator to be executed on.
-    //! \param A The first source vector.
-    //! \param B The second source vector.
-    //! \param C The destination vector.
-    //! \param numElements The number of elements.
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<
-        typename TAcc,
-        typename TElem,
-        typename TIdx>
-    ALPAKA_FN_ACC auto operator()(
-        TAcc const & acc,
-        TElem const * const A,
-        TElem const * const B,
-        TElem * const C,
-        TIdx const & numElements) const
-    -> void
-    {
-        static_assert(
-            alpaka::dim::Dim<TAcc>::value == 1,
-            "The VectorAddKernel expects 1-dimensional indices!");
-
-        auto const gridThreadIdx(alpaka::idx::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0u]);
-        auto const threadElemExtent(alpaka::workdiv::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[0u]);
-        auto const threadFirstElemIdx(gridThreadIdx * threadElemExtent);
-
-        if(threadFirstElemIdx < numElements)
-        {
-            // Calculate the number of elements to compute in this thread.
-            // The result is uniform for all but the last thread.
-            auto const threadLastElemIdx(threadFirstElemIdx+threadElemExtent);
-            auto const threadLastElemIdxClipped((numElements > threadLastElemIdx) ? threadLastElemIdx : numElements);
-
-            for(TIdx i(threadFirstElemIdx); i<threadLastElemIdxClipped; ++i)
-            {
-                C[i] = mysqrt(A[i]) + mysqrt(B[i]);
-            }
-        }
-    }
-};
-
-using TestAccs = alpaka::test::acc::EnabledAccs<
-    alpaka::dim::DimInt<1u>,
-    std::size_t>;
-
-TEMPLATE_LIST_TEST_CASE( "separableCompilation", "[separableCompilation]", TestAccs)
-{
-    using Acc = TestType;
-    using Idx = alpaka::idx::Idx<Acc>;
-
-    using Val = double;
-
-    using DevAcc = alpaka::dev::Dev<Acc>;
-    using PltfAcc = alpaka::pltf::Pltf<DevAcc>;
-    using QueueAcc = alpaka::test::queue::DefaultQueue<alpaka::dev::Dev<Acc>>;
-    using PltfHost = alpaka::pltf::PltfCpu;
-    using DevHost = alpaka::dev::Dev<PltfHost>;
-
-    Idx const numElements(32);
-
-    // Create the kernel function object.
-    SqrtKernel kernel;
-
-    // Get the host device.
-    DevHost const devHost(
-        alpaka::pltf::getDevByIdx<PltfHost>(0u));
-
-    // Select a device to execute on.
-    DevAcc const devAcc(
-        alpaka::pltf::getDevByIdx<PltfAcc>(0));
-
-    // Get a queue on this device.
-    QueueAcc queueAcc(devAcc);
-
-    // The data extent.
-    alpaka::vec::Vec<alpaka::dim::DimInt<1u>, Idx> const extent(
-        numElements);
-
-    // Let alpaka calculate good block and grid sizes given our full problem extent.
-    alpaka::workdiv::WorkDivMembers<alpaka::dim::DimInt<1u>, Idx> const workDiv(
-        alpaka::workdiv::getValidWorkDiv<Acc>(
-            devAcc,
-            extent,
-            static_cast<Idx>(3u),
-            false,
-            alpaka::workdiv::GridBlockExtentSubDivRestrictions::Unrestricted));
-
-    std::cout
-        << typeid(kernel).name() << "("
-        << "accelerator: " << alpaka::acc::getAccName<Acc>()
-        << ", workDiv: " << workDiv
-        << ", numElements:" << numElements
-        << ")" << std::endl;
-
-    // Allocate host memory buffers.
-    auto memBufHostA(alpaka::mem::buf::alloc<Val, Idx>(devHost, extent));
-    auto memBufHostB(alpaka::mem::buf::alloc<Val, Idx>(devHost, extent));
-    auto memBufHostC(alpaka::mem::buf::alloc<Val, Idx>(devHost, extent));
-
-    // Initialize the host input vectors
-    for (Idx i(0); i < numElements; ++i)
-    {
-        alpaka::mem::view::getPtrNative(memBufHostA)[i] = static_cast<Val>(rand()) / static_cast<Val>(RAND_MAX);
-        alpaka::mem::view::getPtrNative(memBufHostB)[i] = static_cast<Val>(rand()) / static_cast<Val>(RAND_MAX);
-    }
-
-    // Allocate the buffers on the accelerator.
-    auto memBufAccA(alpaka::mem::buf::alloc<Val, Idx>(devAcc, extent));
-    auto memBufAccB(alpaka::mem::buf::alloc<Val, Idx>(devAcc, extent));
-    auto memBufAccC(alpaka::mem::buf::alloc<Val, Idx>(devAcc, extent));
-
-    // Copy Host -> Acc.
-    alpaka::mem::view::copy(queueAcc, memBufAccA, memBufHostA, extent);
-    alpaka::mem::view::copy(queueAcc, memBufAccB, memBufHostB, extent);
-
-    // Create the executor task.
-    auto const taskKernel(alpaka::kernel::createTaskKernel<Acc>(
-        workDiv,
-        kernel,
-        alpaka::mem::view::getPtrNative(memBufAccA),
-        alpaka::mem::view::getPtrNative(memBufAccB),
-        alpaka::mem::view::getPtrNative(memBufAccC),
-        numElements));
-
-    // Profile the kernel execution.
-    std::cout << "Execution time: "
-        << alpaka::test::integ::measureTaskRunTimeMs(
-            queueAcc,
-            taskKernel)
-        << " ms"
-        << std::endl;
-
-    // Copy back the result.
-    alpaka::mem::view::copy(queueAcc, memBufHostC, memBufAccC, extent);
-    alpaka::wait::wait(queueAcc);
-
-    bool resultCorrect(true);
-    auto const pHostData(alpaka::mem::view::getPtrNative(memBufHostC));
-    for(Idx i(0u);
-        i < numElements;
-        ++i)
-    {
-        auto const & val(pHostData[i]);
-        auto const correctResult(std::sqrt(alpaka::mem::view::getPtrNative(memBufHostA)[i]) + std::sqrt(alpaka::mem::view::getPtrNative(memBufHostB)[i]));
-        auto const absDiff = (val - correctResult);
-        if( absDiff > std::numeric_limits<Val>::epsilon() )
-        {
-            std::cout << "C[" << i << "] == " << val << " != " << correctResult << std::endl;
-            resultCorrect = false;
-        }
-    }
-
-    REQUIRE(true == resultCorrect);
-}
diff --git a/thirdParty/alpaka/test/integ/separableCompilation/src/mysqrt.cpp b/thirdParty/alpaka/test/integ/separableCompilation/src/mysqrt.cpp
deleted file mode 100644
index a5ea90a625..0000000000
--- a/thirdParty/alpaka/test/integ/separableCompilation/src/mysqrt.cpp
+++ /dev/null
@@ -1,30 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#include "mysqrt.hpp"
-
-// a square root calculation using simple operations
-ALPAKA_FN_HOST_ACC auto mysqrt(double x)
--> double
-{
-  if (x <= 0) {
-    return 0.0;
-  }
-
-  double result = x;
-
-  for (int i = 0; i < 100; ++i) {
-    if (result <= 0) {
-      result = 0.1;
-    }
-    double delta = x - (result * result);
-    result = result + 0.5 * delta / result;
-  }
-  return result;
-}
diff --git a/thirdParty/alpaka/test/integ/sharedMem/CMakeLists.txt b/thirdParty/alpaka/test/integ/sharedMem/CMakeLists.txt
deleted file mode 100644
index 5f083f5210..0000000000
--- a/thirdParty/alpaka/test/integ/sharedMem/CMakeLists.txt
+++ /dev/null
@@ -1,27 +0,0 @@
-#
-# Copyright 2014-2019 Benjamin Worpitz, Axel Huebl
-#
-# This file is part of Alpaka.
-#
-# This Source Code Form is subject to the terms of the Mozilla Public
-# License, v. 2.0. If a copy of the MPL was not distributed with this
-# file, You can obtain one at http://mozilla.org/MPL/2.0/.
-#
-
-SET(_TARGET_NAME "sharedMem")
-
-append_recursive_files_add_to_src_group("src/" "src/" "cpp" _FILES_SOURCE)
-
-ALPAKA_ADD_EXECUTABLE(
-    ${_TARGET_NAME}
-    ${_FILES_SOURCE})
-TARGET_INCLUDE_DIRECTORIES(
-    ${_TARGET_NAME}
-    PRIVATE ${Boost_INCLUDE_DIRS})
-TARGET_LINK_LIBRARIES(
-    ${_TARGET_NAME}
-    PRIVATE common)
-
-SET_TARGET_PROPERTIES(${_TARGET_NAME} PROPERTIES FOLDER "test/integ")
-
-ADD_TEST(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME} ${_ALPAKA_TEST_OPTIONS})
diff --git a/thirdParty/alpaka/test/integ/sharedMem/src/sharedMem.cpp b/thirdParty/alpaka/test/integ/sharedMem/src/sharedMem.cpp
deleted file mode 100644
index e1e49b59a1..0000000000
--- a/thirdParty/alpaka/test/integ/sharedMem/src/sharedMem.cpp
+++ /dev/null
@@ -1,233 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Matthias Werner, René Widera
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#include <alpaka/alpaka.hpp>
-
-#include <alpaka/test/MeasureKernelRunTime.hpp>
-#include <alpaka/test/acc/TestAccs.hpp>
-#include <alpaka/test/queue/Queue.hpp>
-
-#include <catch2/catch.hpp>
-
-#include <iostream>
-#include <typeinfo>
-#include <vector>
-
-//#############################################################################
-//! A kernel using atomicOp, syncBlockThreads, getMem, getIdx, getWorkDiv and global memory to compute a (useless) result.
-//! \tparam TnumUselessWork The number of useless calculations done in each kernel execution.
-template<
-    typename TnumUselessWork,
-    typename Val>
-class SharedMemKernel
-{
-public:
-    //-----------------------------------------------------------------------------
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<
-        typename TAcc>
-    ALPAKA_FN_ACC auto operator()(
-        TAcc const & acc,
-        Val * const puiBlockRetVals) const
-    -> void
-    {
-        using Idx = alpaka::idx::Idx<TAcc>;
-
-        static_assert(
-            alpaka::dim::Dim<TAcc>::value == 1,
-            "The SharedMemKernel expects 1-dimensional indices!");
-
-        // The number of threads in this block.
-        Idx const blockThreadCount(alpaka::workdiv::getWorkDiv<alpaka::Block, alpaka::Threads>(acc)[0u]);
-
-        // Get the dynamically allocated shared memory.
-        Val * const pBlockShared(alpaka::block::shared::dyn::getMem<Val>(acc));
-
-        // Calculate linearized index of the thread in the block.
-        Idx const blockThreadIdx1d(alpaka::idx::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u]);
-
-
-        // Fill the shared block with the thread ids [1+X, 2+X, 3+X, ..., #Threads+X].
-        auto sum1 = static_cast<Val>(blockThreadIdx1d+1);
-        for(Val i(0); i<static_cast<Val>(TnumUselessWork::value); ++i)
-        {
-            sum1 += i;
-        }
-        pBlockShared[blockThreadIdx1d] = sum1;
-
-
-        // Synchronize all threads because now we are writing to the memory again but inverse.
-        alpaka::block::sync::syncBlockThreads(acc);
-
-        // Do something useless.
-        auto sum2 = static_cast<Val>(blockThreadIdx1d);
-        for(Val i(0); i<static_cast<Val>(TnumUselessWork::value); ++i)
-        {
-            sum2 -= i;
-        }
-        // Add the inverse so that every cell is filled with [#Threads, #Threads, ..., #Threads].
-        pBlockShared[(blockThreadCount-1)-blockThreadIdx1d] += sum2;
-
-
-        // Synchronize all threads again.
-        alpaka::block::sync::syncBlockThreads(acc);
-
-        // Now add up all the cells atomically and write the result to cell 0 of the shared memory.
-        if(blockThreadIdx1d > 0)
-        {
-            alpaka::atomic::atomicOp<alpaka::atomic::op::Add>(acc, &pBlockShared[0], pBlockShared[blockThreadIdx1d]);
-        }
-
-
-        alpaka::block::sync::syncBlockThreads(acc);
-
-        // Only master writes result to global memory.
-        if(blockThreadIdx1d==0)
-        {
-            // Calculate linearized block id.
-            Idx const gridBlockIdx(alpaka::idx::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[0u]);
-
-            puiBlockRetVals[gridBlockIdx] = pBlockShared[0];
-        }
-    }
-};
-
-namespace alpaka
-{
-    namespace kernel
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The trait for getting the size of the block shared dynamic memory for a kernel.
-            template<
-                typename TnumUselessWork,
-                typename Val,
-                typename TAcc>
-            struct BlockSharedMemDynSizeBytes<
-                SharedMemKernel<TnumUselessWork, Val>,
-                TAcc>
-            {
-                //-----------------------------------------------------------------------------
-                //! \return The size of the shared memory allocated for a block.
-                template<
-                    typename TVec,
-                    typename... TArgs>
-                ALPAKA_FN_HOST_ACC static auto getBlockSharedMemDynSizeBytes(
-                    SharedMemKernel<TnumUselessWork, Val> const & sharedMemKernel,
-                    TVec const & blockThreadExtent,
-                    TVec const & threadElemExtent,
-                    TArgs && ...)
-                -> idx::Idx<TAcc>
-                {
-                    alpaka::ignore_unused(sharedMemKernel);
-                    return blockThreadExtent.prod() * threadElemExtent.prod() * static_cast<idx::Idx<TAcc>>(sizeof(Val));
-                }
-            };
-        }
-    }
-}
-
-using TestAccs = alpaka::test::acc::EnabledAccs<
-    alpaka::dim::DimInt<1u>,
-    std::uint32_t>;
-
-TEMPLATE_LIST_TEST_CASE( "sharedMem", "[sharedMem]", TestAccs)
-{
-    using Acc = TestType;
-    using Dim = alpaka::dim::Dim<Acc>;
-    using Idx = alpaka::idx::Idx<Acc>;
-
-    Idx const numElements = 1u<<16u;
-
-    using Val = std::int32_t;
-    using TnumUselessWork = std::integral_constant<Idx, 100>;
-
-    using DevAcc = alpaka::dev::Dev<Acc>;
-    using PltfAcc = alpaka::pltf::Pltf<DevAcc>;
-    using QueueAcc = alpaka::test::queue::DefaultQueue<DevAcc>;
-
-
-    // Create the kernel function object.
-    SharedMemKernel<TnumUselessWork, Val> kernel;
-
-    // Select a device to execute on.
-    auto const devAcc(
-        alpaka::pltf::getDevByIdx<PltfAcc>(0u));
-
-    // Get a queue on this device.
-    QueueAcc queue(
-        devAcc);
-
-    // Set the grid blocks extent.
-    alpaka::workdiv::WorkDivMembers<Dim, Idx> const workDiv(
-        alpaka::workdiv::getValidWorkDiv<Acc>(
-            devAcc,
-            numElements,
-            static_cast<Idx>(1u),
-            false,
-            alpaka::workdiv::GridBlockExtentSubDivRestrictions::Unrestricted));
-
-    std::cout
-        << "SharedMemKernel("
-        << " accelerator: " << alpaka::acc::getAccName<Acc>()
-        << ", kernel: " << typeid(kernel).name()
-        << ", workDiv: " << workDiv
-        << ")" << std::endl;
-
-    Idx const gridBlocksCount(
-        alpaka::workdiv::getWorkDiv<alpaka::Grid, alpaka::Blocks>(workDiv)[0u]);
-    Idx const blockThreadCount(
-        alpaka::workdiv::getWorkDiv<alpaka::Block, alpaka::Threads>(workDiv)[0u]);
-
-    // An array for the return values calculated by the blocks.
-    std::vector<Val> blockRetVals(static_cast<std::size_t>(gridBlocksCount));
-
-    // Allocate accelerator buffers and copy.
-    Idx const resultElemCount(gridBlocksCount);
-    auto blockRetValsAcc(alpaka::mem::buf::alloc<Val, Idx>(devAcc, resultElemCount));
-    alpaka::mem::view::copy(queue, blockRetValsAcc, blockRetVals, resultElemCount);
-
-    // Create the kernel execution task.
-    auto const taskKernel(alpaka::kernel::createTaskKernel<Acc>(
-        workDiv,
-        kernel,
-        alpaka::mem::view::getPtrNative(blockRetValsAcc)));
-
-    // Profile the kernel execution.
-    std::cout << "Execution time: "
-        << alpaka::test::integ::measureTaskRunTimeMs(
-            queue,
-            taskKernel)
-        << " ms"
-        << std::endl;
-
-    // Copy back the result.
-    alpaka::mem::view::copy(queue, blockRetVals, blockRetValsAcc, resultElemCount);
-
-    // Wait for the queue to finish the memory operation.
-    alpaka::wait::wait(queue);
-
-    // Assert that the results are correct.
-    Val const correctResult(
-        static_cast<Val>(blockThreadCount*blockThreadCount));
-
-    bool resultCorrect(true);
-    for(Idx i(0); i<gridBlocksCount; ++i)
-    {
-        auto const val(blockRetVals[static_cast<std::size_t>(i)]);
-        if(val != correctResult)
-        {
-            std::cerr << "blockRetVals[" << i << "] == " << val << " != " << correctResult << std::endl;
-            resultCorrect = false;
-        }
-    }
-
-    REQUIRE(resultCorrect);
-}
diff --git a/thirdParty/alpaka/test/sanitizer_lsan_blacklist.txt b/thirdParty/alpaka/test/sanitizer_lsan_blacklist.txt
deleted file mode 100644
index 754a9c42ef..0000000000
--- a/thirdParty/alpaka/test/sanitizer_lsan_blacklist.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-#0 0xe95b9e in __interceptor_strdup /home/development/llvm/3.6.2/final/llvm.src/projects/compiler-rt/lib/asan/asan_interceptors.cc:603:3
-#1 0x7f445bc5eeb2  (/usr/lib/libtbb.so.2+0x12eb2)
-leak:libtbb.so
-#0 0x1dfd623 in __interceptor_strdup /work/release-test/final/llvm.src/projects/compiler-rt/lib/asan/asan_interceptors.cc:550:3
-#1 0x7efe2921da30 in __kmp_itt_thread_set_name_init_3_0(char const*) (/home/travis/cache/llvm/llvm-4.0.0/lib/libomp.so+0x19a30)
-leak:libomp.so
diff --git a/thirdParty/alpaka/test/sanitizer_ubsan_blacklist.txt b/thirdParty/alpaka/test/sanitizer_ubsan_blacklist.txt
deleted file mode 100644
index 5c6debaaec..0000000000
--- a/thirdParty/alpaka/test/sanitizer_ubsan_blacklist.txt
+++ /dev/null
@@ -1,18 +0,0 @@
-# boost/random/mersenne_twister.hpp:152:23: runtime error: unsigned integer overflow: 1812433253 * 5489 cannot be represented in type 'unsigned int'
-src:*/boost/random/mersenne_twister.hpp
-# boost/uuid/sha1.hpp:171:43: runtime error: unsigned integer overflow: 3903086636 + 2562383102 cannot be represented in type 'unsigned int'
-src:*/boost/uuid/sha1.hpp
-# boost/boost/fiber/context.hpp:570:25: runtime error: constructor call on misaligned address 0x7f8f6c0113e0 for type 'boost::fibers::context', which requires 64 byte alignment
-src:*boost/fiber/context.hpp
-# boost/fiber/condition_variable.hpp:71:9: runtime error: member call on misaligned address 0x0000066b2d20 for type 'boost::fibers::context', which requires 64 byte alignment
-src:*boost/fiber/condition_variable.hpp
-src:*boost/fiber/operations.hpp
-# boost/intrusive/slist.hpp:452:52: runtime error: reference binding to misaligned address 0x0000066b2d20 for type 'boost::fibers::context', which requires 64 byte alignment
-src:*boost/intrusive/slist.hpp
-src:*boost/intrusive/detail/hook_traits.hpp
-# /usr/include/tbb/task.h:705:30: runtime error: member call on address 0x7ff8ceee3200 which does not point to an object of type 'tbb::internal::scheduler'
-src:*/tbb/task.h
-# /usr/lib/gcc/x86_64-linux-gnu/5.4.1/../../../../include/c++/5.4.1/memory:118:54: runtime error: negation of 64 cannot be represented in type 'size_t' (aka 'unsigned long')
-src:*/memory
-# /usr/lib/gcc/x86_64-linux-gnu/5.4.1/../../../../include/c++/5.4.1/bits/random.tcc:416:33: runtime error: unsigned integer overflow: 397 - 624 cannot be represented in type 'unsigned long'
-src:*/bits/random.tcc
diff --git a/thirdParty/alpaka/test/unit/CMakeLists.txt b/thirdParty/alpaka/test/unit/CMakeLists.txt
deleted file mode 100644
index 02f21adda5..0000000000
--- a/thirdParty/alpaka/test/unit/CMakeLists.txt
+++ /dev/null
@@ -1,37 +0,0 @@
-#
-# Copyright 2015-2019 Benjamin Worpitz
-#
-# This file is part of Alpaka.
-#
-# This Source Code Form is subject to the terms of the Mozilla Public
-# License, v. 2.0. If a copy of the MPL was not distributed with this
-# file, You can obtain one at http://mozilla.org/MPL/2.0/.
-#
-
-################################################################################
-# Required CMake version.
-################################################################################
-
-CMAKE_MINIMUM_REQUIRED(VERSION 3.11.0)
-
-################################################################################
-# Add subdirectories.
-################################################################################
-
-ADD_SUBDIRECTORY("acc/")
-ADD_SUBDIRECTORY("atomic/")
-ADD_SUBDIRECTORY("block/shared/")
-ADD_SUBDIRECTORY("block/sync/")
-ADD_SUBDIRECTORY("core/")
-ADD_SUBDIRECTORY("event/")
-ADD_SUBDIRECTORY("idx/")
-ADD_SUBDIRECTORY("kernel/")
-ADD_SUBDIRECTORY("math/sincos/")
-ADD_SUBDIRECTORY("mem/buf/")
-ADD_SUBDIRECTORY("mem/view/")
-ADD_SUBDIRECTORY("mem/p2p/")
-ADD_SUBDIRECTORY("meta/")
-ADD_SUBDIRECTORY("queue/")
-ADD_SUBDIRECTORY("rand/")
-ADD_SUBDIRECTORY("time/")
-ADD_SUBDIRECTORY("vec/")
diff --git a/thirdParty/alpaka/test/unit/acc/CMakeLists.txt b/thirdParty/alpaka/test/unit/acc/CMakeLists.txt
deleted file mode 100644
index b913847dc0..0000000000
--- a/thirdParty/alpaka/test/unit/acc/CMakeLists.txt
+++ /dev/null
@@ -1,27 +0,0 @@
-#
-# Copyright 2014-2019 Benjamin Worpitz, Axel Huebl
-#
-# This file is part of Alpaka.
-#
-# This Source Code Form is subject to the terms of the Mozilla Public
-# License, v. 2.0. If a copy of the MPL was not distributed with this
-# file, You can obtain one at http://mozilla.org/MPL/2.0/.
-#
-
-SET(_TARGET_NAME "acc")
-
-append_recursive_files_add_to_src_group("src/" "src/" "cpp" _FILES_SOURCE)
-
-ALPAKA_ADD_EXECUTABLE(
-    ${_TARGET_NAME}
-    ${_FILES_SOURCE})
-TARGET_INCLUDE_DIRECTORIES(
-    ${_TARGET_NAME}
-    PRIVATE ${Boost_INCLUDE_DIRS})
-TARGET_LINK_LIBRARIES(
-    ${_TARGET_NAME}
-    PRIVATE common)
-
-SET_TARGET_PROPERTIES(${_TARGET_NAME} PROPERTIES FOLDER "test/unit")
-
-ADD_TEST(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME} ${_ALPAKA_TEST_OPTIONS})
diff --git a/thirdParty/alpaka/test/unit/acc/src/AccNameTest.cpp b/thirdParty/alpaka/test/unit/acc/src/AccNameTest.cpp
deleted file mode 100644
index 31e139f31c..0000000000
--- a/thirdParty/alpaka/test/unit/acc/src/AccNameTest.cpp
+++ /dev/null
@@ -1,22 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#include <alpaka/acc/Traits.hpp>
-
-#include <alpaka/test/acc/TestAccs.hpp>
-
-#include <catch2/catch.hpp>
-
-#include <iostream>
-
-//-----------------------------------------------------------------------------
-TEMPLATE_LIST_TEST_CASE( "getAccName", "[acc]", alpaka::test::acc::TestAccs)
-{
-    std::cout << alpaka::acc::getAccName<TestType>() << std::endl;
-}
diff --git a/thirdParty/alpaka/test/unit/atomic/CMakeLists.txt b/thirdParty/alpaka/test/unit/atomic/CMakeLists.txt
deleted file mode 100644
index d9b4717f1c..0000000000
--- a/thirdParty/alpaka/test/unit/atomic/CMakeLists.txt
+++ /dev/null
@@ -1,27 +0,0 @@
-#
-# Copyright 2016-2019 Benjamin Worpitz, Axel Huebl
-#
-# This file is part of Alpaka.
-#
-# This Source Code Form is subject to the terms of the Mozilla Public
-# License, v. 2.0. If a copy of the MPL was not distributed with this
-# file, You can obtain one at http://mozilla.org/MPL/2.0/.
-#
-
-SET(_TARGET_NAME "atomic")
-
-append_recursive_files_add_to_src_group("src/" "src/" "cpp" _FILES_SOURCE)
-
-ALPAKA_ADD_EXECUTABLE(
-    ${_TARGET_NAME}
-    ${_FILES_SOURCE})
-TARGET_INCLUDE_DIRECTORIES(
-    ${_TARGET_NAME}
-    PRIVATE ${Boost_INCLUDE_DIRS})
-TARGET_LINK_LIBRARIES(
-    ${_TARGET_NAME}
-    PRIVATE common)
-
-SET_TARGET_PROPERTIES(${_TARGET_NAME} PROPERTIES FOLDER "test/unit")
-
-ADD_TEST(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME} ${_ALPAKA_TEST_OPTIONS})
diff --git a/thirdParty/alpaka/test/unit/atomic/src/AtomicTest.cpp b/thirdParty/alpaka/test/unit/atomic/src/AtomicTest.cpp
deleted file mode 100644
index 0fa4d9cf30..0000000000
--- a/thirdParty/alpaka/test/unit/atomic/src/AtomicTest.cpp
+++ /dev/null
@@ -1,970 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Matthias Werner
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#include <alpaka/atomic/Traits.hpp>
-
-#include <alpaka/test/acc/TestAccs.hpp>
-#include <alpaka/test/KernelExecutionFixture.hpp>
-
-#include <catch2/catch.hpp>
-
-#include <climits>
-
-//-----------------------------------------------------------------------------
-ALPAKA_NO_HOST_ACC_WARNING
-template<
-    typename TAcc,
-    typename T>
-ALPAKA_FN_ACC auto testAtomicAdd(
-    TAcc const & acc,
-    bool * success,
-    T operandOrig)
--> void
-{
-    auto && operand = alpaka::block::shared::st::allocVar<T, __COUNTER__>(acc);
-    operand = operandOrig;
-    T const value = static_cast<T>(4);
-    T const ret =
-        alpaka::atomic::atomicOp<
-            alpaka::atomic::op::Add>(
-                acc,
-                &operand,
-                value);
-    ALPAKA_CHECK(*success, operandOrig == ret);
-    T const reference = operandOrig + value;
-    ALPAKA_CHECK(*success, operand == reference);
-}
-
-//-----------------------------------------------------------------------------
-ALPAKA_NO_HOST_ACC_WARNING
-template<
-    typename TAcc,
-    typename T>
-ALPAKA_FN_ACC auto testAtomicSub(
-    TAcc const & acc,
-    bool * success,
-    T operandOrig)
--> void
-{
-    auto && operand = alpaka::block::shared::st::allocVar<T, __COUNTER__>(acc);
-    operand = operandOrig;
-    T const value = static_cast<T>(4);
-    T const ret =
-        alpaka::atomic::atomicOp<
-            alpaka::atomic::op::Sub>(
-                acc,
-                &operand,
-                value);
-    ALPAKA_CHECK(*success, operandOrig == ret);
-    T const reference = operandOrig - value;
-    ALPAKA_CHECK(*success, operand == reference);
-}
-
-//-----------------------------------------------------------------------------
-ALPAKA_NO_HOST_ACC_WARNING
-template<
-    typename TAcc,
-    typename T>
-ALPAKA_FN_ACC auto testAtomicMin(
-    TAcc const & acc,
-    bool * success,
-    T operandOrig)
--> void
-{
-    auto && operand = alpaka::block::shared::st::allocVar<T, __COUNTER__>(acc);
-    operand = operandOrig;
-    T const value = static_cast<T>(4);
-    T const ret =
-        alpaka::atomic::atomicOp<
-            alpaka::atomic::op::Min>(
-                acc,
-                &operand,
-                value);
-    ALPAKA_CHECK(*success, operandOrig == ret);
-    T const reference = (operandOrig < value) ? operandOrig : value;
-    ALPAKA_CHECK(*success, operand == reference);
-}
-
-//-----------------------------------------------------------------------------
-ALPAKA_NO_HOST_ACC_WARNING
-template<
-    typename TAcc,
-    typename T>
-ALPAKA_FN_ACC auto testAtomicMax(
-    TAcc const & acc,
-    bool * success,
-    T operandOrig)
--> void
-{
-    auto && operand = alpaka::block::shared::st::allocVar<T, __COUNTER__>(acc);
-    operand = operandOrig;
-    T const value = static_cast<T>(4);
-    T const ret =
-        alpaka::atomic::atomicOp<
-            alpaka::atomic::op::Max>(
-                acc,
-                &operand,
-                value);
-    ALPAKA_CHECK(*success, operandOrig == ret);
-    T const reference = (operandOrig > value) ? operandOrig : value;
-    ALPAKA_CHECK(*success, operand == reference);
-}
-
-//-----------------------------------------------------------------------------
-ALPAKA_NO_HOST_ACC_WARNING
-template<
-    typename TAcc,
-    typename T>
-ALPAKA_FN_ACC auto testAtomicExch(
-    TAcc const & acc,
-    bool * success,
-    T operandOrig)
--> void
-{
-    auto && operand = alpaka::block::shared::st::allocVar<T, __COUNTER__>(acc);
-    operand = operandOrig;
-    T const value = static_cast<T>(4);
-    T const ret =
-        alpaka::atomic::atomicOp<
-            alpaka::atomic::op::Exch>(
-                acc,
-                &operand,
-                value);
-    ALPAKA_CHECK(*success, operandOrig == ret);
-    T const reference = value;
-    ALPAKA_CHECK(*success, operand == reference);
-}
-
-//-----------------------------------------------------------------------------
-ALPAKA_NO_HOST_ACC_WARNING
-template<
-    typename TAcc,
-    typename T>
-ALPAKA_FN_ACC auto testAtomicInc(
-    TAcc const & acc,
-    bool * success,
-    T operandOrig)
--> void
-{
-    // \TODO: Check reset to 0 at 'value'.
-    auto && operand = alpaka::block::shared::st::allocVar<T, __COUNTER__>(acc);
-    operand = operandOrig;
-    T const value = static_cast<T>(42);
-    T const ret =
-        alpaka::atomic::atomicOp<
-            alpaka::atomic::op::Inc>(
-                acc,
-                &operand,
-                value);
-    ALPAKA_CHECK(*success, operandOrig == ret);
-    T const reference = operandOrig + 1;
-    ALPAKA_CHECK(*success, operand == reference);
-}
-
-//-----------------------------------------------------------------------------
-ALPAKA_NO_HOST_ACC_WARNING
-template<
-    typename TAcc,
-    typename T>
-ALPAKA_FN_ACC auto testAtomicDec(
-    TAcc const & acc,
-    bool * success,
-    T operandOrig)
--> void
-{
-    // \TODO: Check reset to 'value' at 0.
-    auto && operand = alpaka::block::shared::st::allocVar<T, __COUNTER__>(acc);
-    operand = operandOrig;
-    T const value = static_cast<T>(42);
-    T const ret =
-        alpaka::atomic::atomicOp<
-            alpaka::atomic::op::Dec>(
-                acc,
-                &operand,
-                value);
-    ALPAKA_CHECK(*success, operandOrig == ret);
-    T const reference = operandOrig - 1;
-    ALPAKA_CHECK(*success, operand == reference);
-}
-
-//-----------------------------------------------------------------------------
-ALPAKA_NO_HOST_ACC_WARNING
-template<
-    typename TAcc,
-    typename T>
-ALPAKA_FN_ACC auto testAtomicAnd(
-    TAcc const & acc,
-    bool * success,
-    T operandOrig)
--> void
-{
-    auto && operand = alpaka::block::shared::st::allocVar<T, __COUNTER__>(acc);
-    operand = operandOrig;
-    T const value = static_cast<T>(4);
-    T const ret =
-        alpaka::atomic::atomicOp<
-            alpaka::atomic::op::And>(
-                acc,
-                &operand,
-                value);
-    ALPAKA_CHECK(*success, operandOrig == ret);
-    T const reference = operandOrig & value;
-    ALPAKA_CHECK(*success, operand == reference);
-}
-
-//-----------------------------------------------------------------------------
-ALPAKA_NO_HOST_ACC_WARNING
-template<
-    typename TAcc,
-    typename T>
-ALPAKA_FN_ACC auto testAtomicOr(
-    TAcc const & acc,
-    bool * success,
-    T operandOrig)
--> void
-{
-    auto && operand = alpaka::block::shared::st::allocVar<T, __COUNTER__>(acc);
-    operand = operandOrig;
-    T const value = static_cast<T>(4);
-    T const ret =
-        alpaka::atomic::atomicOp<
-            alpaka::atomic::op::Or>(
-                acc,
-                &operand,
-                value);
-    ALPAKA_CHECK(*success, operandOrig == ret);
-    T const reference = operandOrig | value;
-    ALPAKA_CHECK(*success, operand == reference);
-}
-
-//-----------------------------------------------------------------------------
-ALPAKA_NO_HOST_ACC_WARNING
-template<
-    typename TAcc,
-    typename T>
-ALPAKA_FN_ACC auto testAtomicXor(
-    TAcc const & acc,
-    bool * success,
-    T operandOrig)
--> void
-{
-    auto && operand = alpaka::block::shared::st::allocVar<T, __COUNTER__>(acc);
-    operand = operandOrig;
-    T const value = operandOrig + static_cast<T>(4);
-    T const ret =
-        alpaka::atomic::atomicOp<
-            alpaka::atomic::op::Xor>(
-                acc,
-                &operand,
-                value);
-    ALPAKA_CHECK(*success, operandOrig == ret);
-    T const reference = operandOrig ^ value;
-    ALPAKA_CHECK(*success, operand == reference);
-}
-
-//-----------------------------------------------------------------------------
-ALPAKA_NO_HOST_ACC_WARNING
-template<
-    typename TAcc,
-    typename T>
-ALPAKA_FN_ACC auto testAtomicCas(
-    TAcc const & acc,
-    bool * success,
-    T operandOrig)
--> void
-{
-    auto && operand = alpaka::block::shared::st::allocVar<T, __COUNTER__>(acc);
-
-    //-----------------------------------------------------------------------------
-    // with match
-    {
-        operand = operandOrig;
-        T const compare = operandOrig;
-        T const value = static_cast<T>(4);
-        T const ret =
-            alpaka::atomic::atomicOp<
-                alpaka::atomic::op::Cas>(
-                    acc,
-                    &operand,
-                    compare,
-                    value);
-        ALPAKA_CHECK(*success, operandOrig == ret);
-        T const reference = value;
-        ALPAKA_CHECK(*success, operand == reference);
-    }
-
-    //-----------------------------------------------------------------------------
-    // without match
-    {
-        operand = operandOrig;
-        T const compare = operandOrig + static_cast<T>(1);
-        T const value = static_cast<T>(4);
-        T const ret =
-            alpaka::atomic::atomicOp<
-                alpaka::atomic::op::Cas>(
-                    acc,
-                    &operand,
-                    compare,
-                    value);
-        ALPAKA_CHECK(*success, operandOrig == ret);
-        T const reference = operandOrig;
-        ALPAKA_CHECK(*success, operand == reference);
-    }
-}
-
-//#############################################################################
-template<
-    typename TAcc,
-    typename T,
-    typename Sfinae = void>
-class AtomicTestKernel
-{
-public:
-    //-----------------------------------------------------------------------------
-    ALPAKA_NO_HOST_ACC_WARNING
-    ALPAKA_FN_ACC auto operator()(
-        TAcc const & acc,
-        bool * success,
-        T operandOrig) const
-    -> void
-    {
-        testAtomicAdd(acc, success, operandOrig);
-        testAtomicSub(acc, success, operandOrig);
-
-        testAtomicMin(acc, success, operandOrig);
-        testAtomicMax(acc, success, operandOrig);
-
-        testAtomicExch(acc, success, operandOrig);
-
-        testAtomicInc(acc, success, operandOrig);
-        testAtomicDec(acc, success, operandOrig);
-
-        testAtomicAnd(acc, success, operandOrig);
-        testAtomicOr(acc, success, operandOrig);
-        testAtomicXor(acc, success, operandOrig);
-
-        testAtomicCas(acc, success, operandOrig);
-    }
-};
-
-#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && BOOST_LANG_CUDA
-//#############################################################################
-template<
-    typename TDim,
-    typename TIdx>
-class AtomicTestKernel<
-    alpaka::acc::AccGpuCudaRt<TDim, TIdx>,
-    int>
-{
-public:
-    //-----------------------------------------------------------------------------
-    ALPAKA_NO_HOST_ACC_WARNING
-    ALPAKA_FN_ACC auto operator()(
-        alpaka::acc::AccGpuCudaRt<TDim, TIdx> const & acc,
-        bool * success,
-        int operandOrig) const
-    -> void
-    {
-        testAtomicAdd(acc, success, operandOrig);
-        testAtomicSub(acc, success, operandOrig);
-
-        testAtomicMin(acc, success, operandOrig);
-        testAtomicMax(acc, success, operandOrig);
-
-        testAtomicExch(acc, success, operandOrig);
-
-        // Not supported
-        //testAtomicInc(acc, success, operandOrig);
-        //testAtomicDec(acc, success, operandOrig);
-
-        testAtomicAnd(acc, success, operandOrig);
-        testAtomicOr(acc, success, operandOrig);
-        testAtomicXor(acc, success, operandOrig);
-
-        testAtomicCas(acc, success, operandOrig);
-    }
-};
-
-//#############################################################################
-// NOTE: unsigned int is the only type supported by all atomic CUDA operations.
-template<
-    typename TDim,
-    typename TIdx>
-class AtomicTestKernel<
-    alpaka::acc::AccGpuCudaRt<TDim, TIdx>,
-    unsigned int>
-{
-public:
-    //-----------------------------------------------------------------------------
-    ALPAKA_NO_HOST_ACC_WARNING
-    ALPAKA_FN_ACC auto operator()(
-        alpaka::acc::AccGpuCudaRt<TDim, TIdx> const & acc,
-        bool * success,
-        unsigned int operandOrig) const
-    -> void
-    {
-        testAtomicAdd(acc, success, operandOrig);
-        testAtomicSub(acc, success, operandOrig);
-
-        testAtomicMin(acc, success, operandOrig);
-        testAtomicMax(acc, success, operandOrig);
-
-        testAtomicExch(acc, success, operandOrig);
-
-        testAtomicInc(acc, success, operandOrig);
-        testAtomicDec(acc, success, operandOrig);
-
-        testAtomicAnd(acc, success, operandOrig);
-        testAtomicOr(acc, success, operandOrig);
-        testAtomicXor(acc, success, operandOrig);
-
-        testAtomicCas(acc, success, operandOrig);
-    }
-};
-
-//#############################################################################
-template<
-    typename TDim,
-    typename TIdx>
-class AtomicTestKernel<
-    alpaka::acc::AccGpuCudaRt<TDim, TIdx>,
-    unsigned long int>
-{
-public:
-    //-----------------------------------------------------------------------------
-    ALPAKA_NO_HOST_ACC_WARNING
-    ALPAKA_FN_ACC auto operator()(
-        alpaka::acc::AccGpuCudaRt<TDim, TIdx> const & acc,
-        bool * success,
-        unsigned long int operandOrig) const
-    -> void
-    {
-        testAtomicAdd(acc, success, operandOrig);
-#if UINT_MAX == ULONG_MAX // LLP64
-        testAtomicSub(acc, success, operandOrig);
-#endif
-
-#if ULONG_MAX == ULLONG_MAX // LP64
-#if BOOST_ARCH_PTX >= BOOST_VERSION_NUMBER(3, 5, 0)
-        testAtomicMin(acc, success, operandOrig);
-        testAtomicMax(acc, success, operandOrig);
-#endif
-#endif
-
-        testAtomicExch(acc, success, operandOrig);
-
-#if UINT_MAX == ULONG_MAX // LLP64
-        testAtomicInc(acc, success, operandOrig);
-        testAtomicDec(acc, success, operandOrig);
-#endif
-
-#if ULONG_MAX == ULLONG_MAX // LP64
-#if BOOST_ARCH_PTX >= BOOST_VERSION_NUMBER(3, 5, 0)
-        testAtomicAnd(acc, success, operandOrig);
-        testAtomicOr(acc, success, operandOrig);
-        testAtomicXor(acc, success, operandOrig);
-#endif
-#endif
-
-        testAtomicCas(acc, success, operandOrig);
-    }
-};
-
-//#############################################################################
-template<
-    typename TDim,
-    typename TIdx>
-class AtomicTestKernel<
-    alpaka::acc::AccGpuCudaRt<TDim, TIdx>,
-    unsigned long long int>
-{
-public:
-    //-----------------------------------------------------------------------------
-    ALPAKA_NO_HOST_ACC_WARNING
-    ALPAKA_FN_ACC auto operator()(
-        alpaka::acc::AccGpuCudaRt<TDim, TIdx> const & acc,
-        bool * success,
-        unsigned long long int operandOrig) const
-    -> void
-    {
-        testAtomicAdd(acc, success, operandOrig);
-        // Not supported
-        //testAtomicSub(acc, success, operandOrig);
-
-#if BOOST_ARCH_PTX >= BOOST_VERSION_NUMBER(3, 5, 0)
-        testAtomicMin(acc, success, operandOrig);
-        testAtomicMax(acc, success, operandOrig);
-#endif
-
-        testAtomicExch(acc, success, operandOrig);
-
-        // Not supported
-        //testAtomicInc(acc, success, operandOrig);
-        //testAtomicDec(acc, success, operandOrig);
-
-#if BOOST_ARCH_PTX >= BOOST_VERSION_NUMBER(3, 5, 0)
-        testAtomicAnd(acc, success, operandOrig);
-        testAtomicOr(acc, success, operandOrig);
-        testAtomicXor(acc, success, operandOrig);
-#endif
-
-        testAtomicCas(acc, success, operandOrig);
-    }
-};
-
-//#############################################################################
-template<
-    typename TDim,
-    typename TIdx>
-class AtomicTestKernel<
-    alpaka::acc::AccGpuCudaRt<TDim, TIdx>,
-    float>
-{
-public:
-    //-----------------------------------------------------------------------------
-    ALPAKA_NO_HOST_ACC_WARNING
-    ALPAKA_FN_ACC auto operator()(
-        alpaka::acc::AccGpuCudaRt<TDim, TIdx> const & acc,
-        bool * success,
-        float operandOrig) const
-    -> void
-    {
-        testAtomicAdd(acc, success, operandOrig);
-        // Not supported
-        //testAtomicSub(acc, success, operandOrig);
-
-        // Not supported
-        //testAtomicMin(acc, success, operandOrig);
-        //testAtomicMax(acc, success, operandOrig);
-
-        testAtomicExch(acc, success, operandOrig);
-
-        // Not supported
-        //testAtomicInc(acc, success, operandOrig);
-        //testAtomicDec(acc, success, operandOrig);
-
-        // Not supported
-        //testAtomicAnd(acc, success, operandOrig);
-        //testAtomicOr(acc, success, operandOrig);
-        //testAtomicXor(acc, success, operandOrig);
-
-        // Not supported
-        //testAtomicCas(acc, success, operandOrig);
-    }
-};
-
-//#############################################################################
-template<
-    typename TDim,
-    typename TIdx>
-class AtomicTestKernel<
-    alpaka::acc::AccGpuCudaRt<TDim, TIdx>,
-    double>
-{
-public:
-    //-----------------------------------------------------------------------------
-    ALPAKA_NO_HOST_ACC_WARNING
-    ALPAKA_FN_ACC auto operator()(
-        alpaka::acc::AccGpuCudaRt<TDim, TIdx> const & acc,
-        bool * success,
-        double operandOrig) const
-    -> void
-    {
-        testAtomicAdd(acc, success, operandOrig);
-        // Not supported
-        //testAtomicSub(acc, success, operandOrig);
-
-        // Not supported
-        //testAtomicMin(acc, success, operandOrig);
-        //testAtomicMax(acc, success, operandOrig);
-
-        // Not supported
-        //testAtomicExch(acc, success, operandOrig);
-
-        // Not supported
-        //testAtomicInc(acc, success, operandOrig);
-        //testAtomicDec(acc, success, operandOrig);
-
-        // Not supported
-        //testAtomicAnd(acc, success, operandOrig);
-        //testAtomicOr(acc, success, operandOrig);
-        //testAtomicXor(acc, success, operandOrig);
-
-        // Not supported
-        //testAtomicCas(acc, success, operandOrig);
-    }
-};
-
-//#############################################################################
-template<
-    typename TDim,
-    typename TIdx,
-    typename T>
-class AtomicTestKernel<
-    alpaka::acc::AccGpuCudaRt<TDim, TIdx>,
-    T,
-    typename std::enable_if<
-        !std::is_same<int, T>::value
-        && !std::is_same<unsigned int, T>::value
-        && !std::is_same<unsigned long int, T>::value
-        && !std::is_same<unsigned long long int, T>::value
-        && !std::is_same<float, T>::value
-        && !std::is_same<double, T>::value
-    >::type>
-{
-public:
-    //-----------------------------------------------------------------------------
-    ALPAKA_NO_HOST_ACC_WARNING
-    ALPAKA_FN_ACC auto operator()(
-        alpaka::acc::AccGpuCudaRt<TDim, TIdx> const & acc,
-        bool * success,
-        T operandOrig) const
-    -> void
-    {
-        alpaka::ignore_unused(acc);
-        alpaka::ignore_unused(operandOrig);
-
-        // All other types are not supported by CUDA atomic operations.
-        ALPAKA_CHECK(*success, true);
-    }
-};
-#endif
-
-#if defined(ALPAKA_ACC_GPU_HIP_ENABLED) && BOOST_LANG_HIP
-//#############################################################################
-template<
-    typename TDim,
-    typename TIdx>
-class AtomicTestKernel<
-    alpaka::acc::AccGpuHipRt<TDim, TIdx>,
-    int>
-{
-public:
-    //-----------------------------------------------------------------------------
-    ALPAKA_NO_HOST_ACC_WARNING
-    ALPAKA_FN_ACC auto operator()(
-        alpaka::acc::AccGpuHipRt<TDim, TIdx> const & acc,
-        bool * success,
-        int operandOrig) const
-    -> void
-    {
-        testAtomicAdd(acc, success, operandOrig);
-        testAtomicSub(acc, success, operandOrig);
-
-        testAtomicMin(acc, success, operandOrig);
-        testAtomicMax(acc, success, operandOrig);
-
-        testAtomicExch(acc, success, operandOrig);
-
-        // Not supported
-        //testAtomicInc(acc, success, operandOrig);
-        //testAtomicDec(acc, success, operandOrig);
-
-        testAtomicAnd(acc, success, operandOrig);
-        testAtomicOr(acc, success, operandOrig);
-        testAtomicXor(acc, success, operandOrig);
-
-        testAtomicCas(acc, success, operandOrig);
-    }
-};
-
-//#############################################################################
-// NOTE: unsigned int is the only type supported by all atomic HIP operations.
-template<
-    typename TDim,
-    typename TIdx>
-class AtomicTestKernel<
-    alpaka::acc::AccGpuHipRt<TDim, TIdx>,
-    unsigned int>
-{
-public:
-    //-----------------------------------------------------------------------------
-    ALPAKA_NO_HOST_ACC_WARNING
-    ALPAKA_FN_ACC auto operator()(
-        alpaka::acc::AccGpuHipRt<TDim, TIdx> const & acc,
-        bool * success,
-        unsigned int operandOrig) const
-    -> void
-    {
-        testAtomicAdd(acc, success, operandOrig);
-        testAtomicSub(acc, success, operandOrig);
-
-        testAtomicMin(acc, success, operandOrig);
-        testAtomicMax(acc, success, operandOrig);
-
-        testAtomicExch(acc, success, operandOrig);
-
-        testAtomicInc(acc, success, operandOrig);
-        testAtomicDec(acc, success, operandOrig);
-
-        testAtomicAnd(acc, success, operandOrig);
-        testAtomicOr(acc, success, operandOrig);
-        testAtomicXor(acc, success, operandOrig);
-
-        testAtomicCas(acc, success, operandOrig);
-    }
-};
-
-//#############################################################################
-template<
-    typename TDim,
-    typename TIdx>
-class AtomicTestKernel<
-    alpaka::acc::AccGpuHipRt<TDim, TIdx>,
-    unsigned long int>
-{
-public:
-    //-----------------------------------------------------------------------------
-    ALPAKA_NO_HOST_ACC_WARNING
-    ALPAKA_FN_ACC auto operator()(
-        alpaka::acc::AccGpuHipRt<TDim, TIdx> const & acc,
-        bool * success,
-        unsigned long int operandOrig) const
-    -> void
-    {
-        testAtomicAdd(acc, success, operandOrig);
-#if UINT_MAX == ULONG_MAX // LLP64
-        testAtomicSub(acc, success, operandOrig);
-#endif
-
-#if ULONG_MAX == ULLONG_MAX // LP64
-#if BOOST_ARCH_PTX >= BOOST_VERSION_NUMBER(3, 5, 0)
-        testAtomicMin(acc, success, operandOrig);
-        testAtomicMax(acc, success, operandOrig);
-#endif
-#endif
-
-        testAtomicExch(acc, success, operandOrig);
-
-#if UINT_MAX == ULONG_MAX // LLP64
-        testAtomicInc(acc, success, operandOrig);
-        testAtomicDec(acc, success, operandOrig);
-#endif
-
-#if ULONG_MAX == ULLONG_MAX // LP64
-#if BOOST_ARCH_PTX >= BOOST_VERSION_NUMBER(3, 5, 0)
-        testAtomicAnd(acc, success, operandOrig);
-        testAtomicOr(acc, success, operandOrig);
-        testAtomicXor(acc, success, operandOrig);
-#endif
-#endif
-
-        testAtomicCas(acc, success, operandOrig);
-    }
-};
-
-//#############################################################################
-template<
-    typename TDim,
-    typename TIdx>
-class AtomicTestKernel<
-    alpaka::acc::AccGpuHipRt<TDim, TIdx>,
-    unsigned long long int>
-{
-public:
-    //-----------------------------------------------------------------------------
-    ALPAKA_NO_HOST_ACC_WARNING
-    ALPAKA_FN_ACC auto operator()(
-        alpaka::acc::AccGpuHipRt<TDim, TIdx> const & acc,
-        bool * success,
-        unsigned long long int operandOrig) const
-    -> void
-    {
-        testAtomicAdd(acc, success, operandOrig);
-        // Not supported
-        //testAtomicSub(acc, success, operandOrig);
-
-#if BOOST_ARCH_PTX >= BOOST_VERSION_NUMBER(3, 5, 0)
-        testAtomicMin(acc, success, operandOrig);
-        testAtomicMax(acc, success, operandOrig);
-#endif
-
-        testAtomicExch(acc, success, operandOrig);
-
-        // Not supported
-        //testAtomicInc(acc, success, operandOrig);
-        //testAtomicDec(acc, success, operandOrig);
-
-#if BOOST_ARCH_PTX >= BOOST_VERSION_NUMBER(3, 5, 0)
-        testAtomicAnd(acc, success, operandOrig);
-        testAtomicOr(acc, success, operandOrig);
-        testAtomicXor(acc, success, operandOrig);
-#endif
-
-        testAtomicCas(acc, success, operandOrig);
-    }
-};
-
-//#############################################################################
-template<
-    typename TDim,
-    typename TIdx>
-class AtomicTestKernel<
-    alpaka::acc::AccGpuHipRt<TDim, TIdx>,
-    float>
-{
-public:
-    //-----------------------------------------------------------------------------
-    ALPAKA_NO_HOST_ACC_WARNING
-    ALPAKA_FN_ACC auto operator()(
-        alpaka::acc::AccGpuHipRt<TDim, TIdx> const & acc,
-        bool * success,
-        float operandOrig) const
-    -> void
-    {
-        testAtomicAdd(acc, success, operandOrig);
-        // Not supported
-        //testAtomicSub(acc, success, operandOrig);
-
-        // Not supported
-        //testAtomicMin(acc, success, operandOrig);
-        //testAtomicMax(acc, success, operandOrig);
-
-        testAtomicExch(acc, success, operandOrig);
-
-        // Not supported
-        //testAtomicInc(acc, success, operandOrig);
-        //testAtomicDec(acc, success, operandOrig);
-
-        // Not supported
-        //testAtomicAnd(acc, success, operandOrig);
-        //testAtomicOr(acc, success, operandOrig);
-        //testAtomicXor(acc, success, operandOrig);
-
-        // Not supported
-        //testAtomicCas(acc, success, operandOrig);
-    }
-};
-
-//#############################################################################
-template<
-    typename TDim,
-    typename TIdx>
-class AtomicTestKernel<
-    alpaka::acc::AccGpuHipRt<TDim, TIdx>,
-    double>
-{
-public:
-    //-----------------------------------------------------------------------------
-    ALPAKA_NO_HOST_ACC_WARNING
-    ALPAKA_FN_ACC auto operator()(
-        alpaka::acc::AccGpuHipRt<TDim, TIdx> const & acc,
-        bool * success,
-        double operandOrig) const
-    -> void
-    {
-        testAtomicAdd(acc, success, operandOrig);
-        // Not supported
-        //testAtomicSub(acc, success, operandOrig);
-
-        // Not supported
-        //testAtomicMin(acc, success, operandOrig);
-        //testAtomicMax(acc, success, operandOrig);
-
-        // Not supported
-        //testAtomicExch(acc, success, operandOrig);
-
-        // Not supported
-        //testAtomicInc(acc, success, operandOrig);
-        //testAtomicDec(acc, success, operandOrig);
-
-        // Not supported
-        //testAtomicAnd(acc, success, operandOrig);
-        //testAtomicOr(acc, success, operandOrig);
-        //testAtomicXor(acc, success, operandOrig);
-
-        // Not supported
-        //testAtomicCas(acc, success, operandOrig);
-    }
-};
-
-//#############################################################################
-template<
-    typename TDim,
-    typename TIdx,
-    typename T>
-class AtomicTestKernel<
-    alpaka::acc::AccGpuHipRt<TDim, TIdx>,
-    T,
-    typename std::enable_if<
-        !std::is_same<int, T>::value
-        && !std::is_same<unsigned int, T>::value
-        && !std::is_same<unsigned long int, T>::value
-        && !std::is_same<unsigned long long int, T>::value
-        && !std::is_same<float, T>::value
-        && !std::is_same<double, T>::value
-    >::type>
-{
-public:
-    //-----------------------------------------------------------------------------
-    ALPAKA_NO_HOST_ACC_WARNING
-    ALPAKA_FN_ACC auto operator()(
-        alpaka::acc::AccGpuHipRt<TDim, TIdx> const & acc,
-        bool * success,
-        T operandOrig) const
-    -> void
-    {
-        alpaka::ignore_unused(acc);
-        alpaka::ignore_unused(operandOrig);
-
-        // All other types are not supported by HIP atomic operations.
-        ALPAKA_CHECK(*success, true);
-    }
-};
-#endif
-
-
-//#############################################################################
-template<
-    typename TAcc,
-    typename T>
-struct TestAtomicOperations
-{
-    //-----------------------------------------------------------------------------
-    static auto testAtomicOperations()
-    -> void
-    {
-        using Dim = alpaka::dim::Dim<TAcc>;
-        using Idx = alpaka::idx::Idx<TAcc>;
-
-        alpaka::test::KernelExecutionFixture<TAcc> fixture(
-            alpaka::vec::Vec<Dim, Idx>::ones());
-
-        AtomicTestKernel<TAcc, T> kernel;
-
-        T value = static_cast<T>(32);
-        REQUIRE(fixture(kernel, value));
-    }
-};
-
-using TestAccs = alpaka::test::acc::EnabledAccs<
-    alpaka::dim::DimInt<1u>,
-    std::size_t>;
-
-//-----------------------------------------------------------------------------
-TEMPLATE_LIST_TEST_CASE( "atomicOperationsWorking", "[atomic]", TestAccs)
-{
-    using Acc = TestType;
-    TestAtomicOperations<Acc, unsigned char>::testAtomicOperations();
-    TestAtomicOperations<Acc, char>::testAtomicOperations();
-    TestAtomicOperations<Acc, unsigned short>::testAtomicOperations();
-    TestAtomicOperations<Acc, short>::testAtomicOperations();
-
-    TestAtomicOperations<Acc, unsigned int>::testAtomicOperations();
-    TestAtomicOperations<Acc, int>::testAtomicOperations();
-
-    TestAtomicOperations<Acc, unsigned long>::testAtomicOperations();
-    TestAtomicOperations<Acc, long>::testAtomicOperations();
-    TestAtomicOperations<Acc, unsigned long long>::testAtomicOperations();
-    TestAtomicOperations<Acc, long long>::testAtomicOperations();
-
-    // Not all atomic operations are possible with floating point values.
-    //TestAtomicOperations<Acc, float>::testAtomicOperations();
-    //TestAtomicOperations<Acc, double>::testAtomicOperations();
-}
diff --git a/thirdParty/alpaka/test/unit/block/shared/CMakeLists.txt b/thirdParty/alpaka/test/unit/block/shared/CMakeLists.txt
deleted file mode 100644
index fa059864b6..0000000000
--- a/thirdParty/alpaka/test/unit/block/shared/CMakeLists.txt
+++ /dev/null
@@ -1,27 +0,0 @@
-#
-# Copyright 2014-2019 Benjamin Worpitz, Axel Huebl
-#
-# This file is part of Alpaka.
-#
-# This Source Code Form is subject to the terms of the Mozilla Public
-# License, v. 2.0. If a copy of the MPL was not distributed with this
-# file, You can obtain one at http://mozilla.org/MPL/2.0/.
-#
-
-SET(_TARGET_NAME "blockShared")
-
-append_recursive_files_add_to_src_group("src/" "src/" "cpp" _FILES_SOURCE)
-
-ALPAKA_ADD_EXECUTABLE(
-    ${_TARGET_NAME}
-    ${_FILES_SOURCE})
-TARGET_INCLUDE_DIRECTORIES(
-    ${_TARGET_NAME}
-    PRIVATE ${Boost_INCLUDE_DIRS})
-TARGET_LINK_LIBRARIES(
-    ${_TARGET_NAME}
-    PRIVATE common)
-
-SET_TARGET_PROPERTIES(${_TARGET_NAME} PROPERTIES FOLDER "test/unit")
-
-ADD_TEST(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME} ${_ALPAKA_TEST_OPTIONS})
diff --git a/thirdParty/alpaka/test/unit/block/shared/src/BlockSharedMemDyn.cpp b/thirdParty/alpaka/test/unit/block/shared/src/BlockSharedMemDyn.cpp
deleted file mode 100644
index 8b7a89c1fc..0000000000
--- a/thirdParty/alpaka/test/unit/block/shared/src/BlockSharedMemDyn.cpp
+++ /dev/null
@@ -1,95 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Matthias Werner, René Widera
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#include <alpaka/block/shared/dyn/Traits.hpp>
-
-#include <alpaka/test/acc/TestAccs.hpp>
-#include <alpaka/test/queue/Queue.hpp>
-#include <alpaka/test/KernelExecutionFixture.hpp>
-
-#include <catch2/catch.hpp>
-
-//#############################################################################
-class BlockSharedMemDynTestKernel
-{
-public:
-    //-----------------------------------------------------------------------------
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<
-        typename TAcc>
-    ALPAKA_FN_ACC auto operator()(
-        TAcc const & acc,
-        bool * success) const
-    -> void
-    {
-        // Assure that the pointer is non null.
-        auto && a = alpaka::block::shared::dyn::getMem<std::uint32_t>(acc);
-        ALPAKA_CHECK(*success, static_cast<std::uint32_t *>(nullptr) != a);
-
-        // Each call should return the same pointer ...
-        auto && b = alpaka::block::shared::dyn::getMem<std::uint32_t>(acc);
-        ALPAKA_CHECK(*success, a == b);
-
-        // ... even for different types.
-        auto && c = alpaka::block::shared::dyn::getMem<float>(acc);
-        ALPAKA_CHECK(*success, a == reinterpret_cast<std::uint32_t *>(c));
-    }
-};
-
-namespace alpaka
-{
-    namespace kernel
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The trait for getting the size of the block shared dynamic memory for a kernel.
-            template<
-                typename TAcc>
-            struct BlockSharedMemDynSizeBytes<
-                BlockSharedMemDynTestKernel,
-                TAcc>
-            {
-                //-----------------------------------------------------------------------------
-                //! \return The size of the shared memory allocated for a block.
-                template<
-                    typename TVec>
-                ALPAKA_FN_HOST_ACC static auto getBlockSharedMemDynSizeBytes(
-                    BlockSharedMemDynTestKernel const & blockSharedMemDyn,
-                    TVec const & blockThreadExtent,
-                    TVec const & threadElemExtent,
-                    bool * success)
-                -> idx::Idx<TAcc>
-                {
-                    alpaka::ignore_unused(blockSharedMemDyn);
-                    alpaka::ignore_unused(success);
-                    return
-                        static_cast<idx::Idx<TAcc>>(sizeof(std::uint32_t)) * blockThreadExtent.prod() * threadElemExtent.prod();
-                }
-            };
-        }
-    }
-}
-
-//-----------------------------------------------------------------------------
-TEMPLATE_LIST_TEST_CASE( "sameNonNullAdress", "[blockSharedMemDyn]", alpaka::test::acc::TestAccs)
-{
-    using Acc = TestType;
-    using Dim = alpaka::dim::Dim<Acc>;
-    using Idx = alpaka::idx::Idx<Acc>;
-
-    alpaka::test::KernelExecutionFixture<Acc> fixture(
-        alpaka::vec::Vec<Dim, Idx>::ones());
-
-    BlockSharedMemDynTestKernel kernel;
-
-    REQUIRE(
-        fixture(
-            kernel));
-}
diff --git a/thirdParty/alpaka/test/unit/block/shared/src/BlockSharedMemSt.cpp b/thirdParty/alpaka/test/unit/block/shared/src/BlockSharedMemSt.cpp
deleted file mode 100644
index 28f2625098..0000000000
--- a/thirdParty/alpaka/test/unit/block/shared/src/BlockSharedMemSt.cpp
+++ /dev/null
@@ -1,137 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Erik Zenker
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#include <alpaka/block/shared/st/Traits.hpp>
-
-#include <alpaka/test/acc/TestAccs.hpp>
-#include <alpaka/test/queue/Queue.hpp>
-#include <alpaka/test/Array.hpp>
-#include <alpaka/test/KernelExecutionFixture.hpp>
-
-#include <catch2/catch.hpp>
-
-//#############################################################################
-class BlockSharedMemStNonNullTestKernel
-{
-public:
-    //-----------------------------------------------------------------------------
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<
-        typename TAcc>
-    ALPAKA_FN_ACC auto operator()(
-        TAcc const & acc,
-        bool * success) const
-    -> void
-    {
-#if BOOST_COMP_GNUC >= BOOST_VERSION_NUMBER(6, 0, 0)
-    #pragma GCC diagnostic push
-    #pragma GCC diagnostic ignored "-Waddress"  // warning: the compiler can assume that the address of �a� will never be NULL [-Waddress]
-#endif
-        // Multiple runs to make sure it really works.
-        for(std::size_t i=0u; i<10; ++i)
-        {
-            auto && a = alpaka::block::shared::st::allocVar<std::uint32_t, __COUNTER__>(acc);
-            ALPAKA_CHECK(*success, static_cast<std::uint32_t *>(nullptr) != &a);
-
-            auto && b = alpaka::block::shared::st::allocVar<std::uint32_t, __COUNTER__>(acc);
-            ALPAKA_CHECK(*success, static_cast<std::uint32_t *>(nullptr) != &b);
-
-            auto && c = alpaka::block::shared::st::allocVar<float, __COUNTER__>(acc);
-            ALPAKA_CHECK(*success, static_cast<float *>(nullptr) != &c);
-
-            auto && d = alpaka::block::shared::st::allocVar<double, __COUNTER__>(acc);
-            ALPAKA_CHECK(*success, static_cast<double *>(nullptr) != &d);
-
-            auto && e = alpaka::block::shared::st::allocVar<std::uint64_t, __COUNTER__>(acc);
-            ALPAKA_CHECK(*success, static_cast<std::uint64_t *>(nullptr) != &e);
-
-
-            auto && f = alpaka::block::shared::st::allocVar<alpaka::test::Array<std::uint32_t, 32>, __COUNTER__>(acc);
-            ALPAKA_CHECK(*success, static_cast<std::uint32_t *>(nullptr) != &f[0]);
-
-            auto && g = alpaka::block::shared::st::allocVar<alpaka::test::Array<std::uint32_t, 32>, __COUNTER__>(acc);
-            ALPAKA_CHECK(*success, static_cast<std::uint32_t *>(nullptr) != &g[0]);
-
-            auto && h = alpaka::block::shared::st::allocVar<alpaka::test::Array<double, 16>, __COUNTER__>(acc);
-            ALPAKA_CHECK(*success, static_cast<double *>(nullptr) != &h[0]);
-        }
-#if BOOST_COMP_GNUC >= BOOST_VERSION_NUMBER(6, 0, 0)
-    #pragma GCC diagnostic pop
-#endif
-    }
-};
-
-//-----------------------------------------------------------------------------
-TEMPLATE_LIST_TEST_CASE( "nonNull", "[blockSharedMemSt]", alpaka::test::acc::TestAccs)
-{
-    using Acc = TestType;
-    using Dim = alpaka::dim::Dim<Acc>;
-    using Idx = alpaka::idx::Idx<Acc>;
-
-    // Use multiple threads to make sure the synchronization really works.
-    alpaka::test::KernelExecutionFixture<Acc> fixture(
-        alpaka::vec::Vec<Dim, Idx>::all(static_cast<Idx>(3u)));
-
-    BlockSharedMemStNonNullTestKernel kernel;
-
-    REQUIRE(fixture(kernel));
-}
-
-//#############################################################################
-class BlockSharedMemStSameTypeDifferentAdressTestKernel
-{
-public:
-    //-----------------------------------------------------------------------------
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<
-        typename TAcc>
-    ALPAKA_FN_ACC auto operator()(
-        TAcc const & acc,
-        bool * success) const
-    -> void
-    {
-        // Multiple runs to make sure it really works.
-        for(std::size_t i=0u; i<10; ++i)
-        {
-            auto && a = alpaka::block::shared::st::allocVar<std::uint32_t, __COUNTER__>(acc);
-            auto && b = alpaka::block::shared::st::allocVar<std::uint32_t, __COUNTER__>(acc);
-            ALPAKA_CHECK(*success, &a != &b);
-            auto && c = alpaka::block::shared::st::allocVar<std::uint32_t, __COUNTER__>(acc);
-            ALPAKA_CHECK(*success, &b != &c);
-            ALPAKA_CHECK(*success, &a != &c);
-            ALPAKA_CHECK(*success, &b != &c);
-
-            auto && d = alpaka::block::shared::st::allocVar<alpaka::test::Array<std::uint32_t, 32>, __COUNTER__>(acc);
-            ALPAKA_CHECK(*success, &a != &d[0]);
-            ALPAKA_CHECK(*success, &b != &d[0]);
-            ALPAKA_CHECK(*success, &c != &d[0]);
-            auto && e = alpaka::block::shared::st::allocVar<alpaka::test::Array<std::uint32_t, 32>, __COUNTER__>(acc);
-            ALPAKA_CHECK(*success, &a != &e[0]);
-            ALPAKA_CHECK(*success, &b != &e[0]);
-            ALPAKA_CHECK(*success, &c != &e[0]);
-            ALPAKA_CHECK(*success, &d[0] != &e[0]);
-        }
-    }
-};
-
-//-----------------------------------------------------------------------------
-TEMPLATE_LIST_TEST_CASE( "sameTypeDifferentAddress", "[blockSharedMemSt]", alpaka::test::acc::TestAccs)
-{
-    using Acc = TestType;
-    using Dim = alpaka::dim::Dim<Acc>;
-    using Idx = alpaka::idx::Idx<Acc>;
-
-    // Use multiple threads to make sure the synchronization really works.
-    alpaka::test::KernelExecutionFixture<Acc> fixture(
-        alpaka::vec::Vec<Dim, Idx>::all(static_cast<Idx>(3u)));
-
-    BlockSharedMemStSameTypeDifferentAdressTestKernel kernel;
-
-    REQUIRE(fixture(kernel));
-}
diff --git a/thirdParty/alpaka/test/unit/block/sync/CMakeLists.txt b/thirdParty/alpaka/test/unit/block/sync/CMakeLists.txt
deleted file mode 100644
index 63af2dc830..0000000000
--- a/thirdParty/alpaka/test/unit/block/sync/CMakeLists.txt
+++ /dev/null
@@ -1,27 +0,0 @@
-#
-# Copyright 2017-2019 Benjamin Worpitz, Axel Huebl
-#
-# This file is part of Alpaka.
-#
-# This Source Code Form is subject to the terms of the Mozilla Public
-# License, v. 2.0. If a copy of the MPL was not distributed with this
-# file, You can obtain one at http://mozilla.org/MPL/2.0/.
-#
-
-SET(_TARGET_NAME "blockSync")
-
-append_recursive_files_add_to_src_group("src/" "src/" "cpp" _FILES_SOURCE)
-
-ALPAKA_ADD_EXECUTABLE(
-    ${_TARGET_NAME}
-    ${_FILES_SOURCE})
-TARGET_INCLUDE_DIRECTORIES(
-    ${_TARGET_NAME}
-    PRIVATE ${Boost_INCLUDE_DIRS})
-TARGET_LINK_LIBRARIES(
-    ${_TARGET_NAME}
-    PRIVATE common)
-
-SET_TARGET_PROPERTIES(${_TARGET_NAME} PROPERTIES FOLDER "test/unit")
-
-ADD_TEST(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME} ${_ALPAKA_TEST_OPTIONS})
diff --git a/thirdParty/alpaka/test/unit/block/sync/src/BlockSync.cpp b/thirdParty/alpaka/test/unit/block/sync/src/BlockSync.cpp
deleted file mode 100644
index a5b6d90888..0000000000
--- a/thirdParty/alpaka/test/unit/block/sync/src/BlockSync.cpp
+++ /dev/null
@@ -1,110 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Matthias Werner, René Widera
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#include <alpaka/block/sync/Traits.hpp>
-
-#include <alpaka/test/acc/TestAccs.hpp>
-#include <alpaka/test/KernelExecutionFixture.hpp>
-
-#include <catch2/catch.hpp>
-
-//#############################################################################
-class BlockSyncTestKernel
-{
-public:
-    static const std::uint8_t gridThreadExtentPerDim = 4u;
-
-    //-----------------------------------------------------------------------------
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<
-        typename TAcc>
-    ALPAKA_FN_ACC auto operator()(
-        TAcc const & acc,
-        bool * success) const
-    -> void
-    {
-        using Idx = alpaka::idx::Idx<TAcc>;
-
-        // Get the index of the current thread within the block and the block extent and map them to 1D.
-        auto const blockThreadIdx = alpaka::idx::getIdx<alpaka::Block, alpaka::Threads>(acc);
-        auto const blockThreadExtent = alpaka::workdiv::getWorkDiv<alpaka::Block, alpaka::Threads>(acc);
-        auto const blockThreadIdx1D = alpaka::idx::mapIdx<1u>(blockThreadIdx, blockThreadExtent)[0u];
-        auto const blockThreadExtent1D = blockThreadExtent.prod();
-
-        // Allocate shared memory.
-        Idx * const pBlockSharedArray = alpaka::block::shared::dyn::getMem<Idx>(acc);
-   
-        // Write the thread index into the shared memory.
-        pBlockSharedArray[blockThreadIdx1D] = blockThreadIdx1D;
-
-        // Synchronize the threads in the block.
-        alpaka::block::sync::syncBlockThreads(acc);
-
-        // All other threads within the block should now have written their index into the shared memory.
-        for(auto i(static_cast<Idx>(0u)); i < blockThreadExtent1D; ++i)
-        {
-            ALPAKA_CHECK(*success, pBlockSharedArray[i] == i);
-        }
-    }
-};
-
-namespace alpaka
-{
-    namespace kernel
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The trait for getting the size of the block shared dynamic memory for a kernel.
-            template<
-                typename TAcc>
-            struct BlockSharedMemDynSizeBytes<
-                BlockSyncTestKernel,
-                TAcc>
-            {
-                //-----------------------------------------------------------------------------
-                //! \return The size of the shared memory allocated for a block.
-                template<
-                    typename TVec>
-                ALPAKA_FN_HOST_ACC static auto getBlockSharedMemDynSizeBytes(
-                    BlockSyncTestKernel const & blockSharedMemDyn,
-                    TVec const & blockThreadExtent,
-                    TVec const & threadElemExtent,
-                    bool * success)
-                -> idx::Idx<TAcc>
-                {
-                    using Idx = alpaka::idx::Idx<TAcc>;
-
-                    alpaka::ignore_unused(blockSharedMemDyn);
-                    alpaka::ignore_unused(threadElemExtent);
-                    alpaka::ignore_unused(success);
-                    return
-                        static_cast<idx::Idx<TAcc>>(sizeof(Idx)) * blockThreadExtent.prod();
-                }
-            };
-        }
-    }
-}
-
-//-----------------------------------------------------------------------------
-TEMPLATE_LIST_TEST_CASE( "synchronize", "[blockSync]", alpaka::test::acc::TestAccs)
-{
-    using Acc = TestType;
-    using Dim = alpaka::dim::Dim<Acc>;
-    using Idx = alpaka::idx::Idx<Acc>;
-
-    alpaka::test::KernelExecutionFixture<Acc> fixture(
-        alpaka::vec::Vec<Dim, Idx>::all(static_cast<Idx>(BlockSyncTestKernel::gridThreadExtentPerDim)));
-
-    BlockSyncTestKernel kernel;
-
-    REQUIRE(
-        fixture(
-            kernel));
-}
diff --git a/thirdParty/alpaka/test/unit/block/sync/src/BlockSyncPredicate.cpp b/thirdParty/alpaka/test/unit/block/sync/src/BlockSyncPredicate.cpp
deleted file mode 100644
index 62c17299e2..0000000000
--- a/thirdParty/alpaka/test/unit/block/sync/src/BlockSyncPredicate.cpp
+++ /dev/null
@@ -1,118 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#include <alpaka/block/sync/Traits.hpp>
-
-#include <alpaka/test/acc/TestAccs.hpp>
-#include <alpaka/test/KernelExecutionFixture.hpp>
-
-#include <catch2/catch.hpp>
-
-//#############################################################################
-class BlockSyncPredicateTestKernel
-{
-public:
-    //-----------------------------------------------------------------------------
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<
-        typename TAcc>
-    ALPAKA_FN_ACC auto operator()(
-        TAcc const & acc,
-        bool * success) const
-    -> void
-    {
-        using Idx = alpaka::idx::Idx<TAcc>;
-
-        // Get the index of the current thread within the block and the block extent and map them to 1D.
-        auto const blockThreadIdx(alpaka::idx::getIdx<alpaka::Block, alpaka::Threads>(acc));
-        auto const blockThreadExtent(alpaka::workdiv::getWorkDiv<alpaka::Block, alpaka::Threads>(acc));
-        auto const blockThreadIdx1D(alpaka::idx::mapIdx<1u>(blockThreadIdx, blockThreadExtent)[0u]);
-        auto const blockThreadExtent1D(blockThreadExtent.prod());
-
-        // syncBlockThreadsPredicate<alpaka::block::sync::op::Count>
-        {
-            Idx const modulus(2u);
-            int const predicate(static_cast<int>(blockThreadIdx1D % modulus));
-            auto const result(alpaka::block::sync::syncBlockThreadsPredicate<alpaka::block::sync::op::Count>(acc, predicate));
-            auto const expectedResult(static_cast<int>(blockThreadExtent1D / modulus));
-            ALPAKA_CHECK(*success, expectedResult == result);
-        }
-        {
-            Idx const modulus(3u);
-            int const predicate(static_cast<int>(blockThreadIdx1D % modulus));
-            auto const result(alpaka::block::sync::syncBlockThreadsPredicate<alpaka::block::sync::op::Count>(acc, predicate));
-            auto const expectedResult(static_cast<int>(blockThreadExtent1D - ((blockThreadExtent1D + modulus - static_cast<Idx>(1u)) / modulus)));
-            ALPAKA_CHECK(*success, expectedResult == result);
-        }
-
-        // syncBlockThreadsPredicate<alpaka::block::sync::op::LogicalAnd>
-        {
-            int const predicate(1);
-            auto const result(alpaka::block::sync::syncBlockThreadsPredicate<alpaka::block::sync::op::LogicalAnd>(acc, predicate));
-            ALPAKA_CHECK(*success, result == 1);
-        }
-        {
-            int const predicate(0);
-            auto const result(alpaka::block::sync::syncBlockThreadsPredicate<alpaka::block::sync::op::LogicalAnd>(acc, predicate));
-            ALPAKA_CHECK(*success, result == 0);
-        }
-        {
-            int const predicate(blockThreadIdx1D != 0);
-            auto const result(alpaka::block::sync::syncBlockThreadsPredicate<alpaka::block::sync::op::LogicalAnd>(acc, predicate));
-            ALPAKA_CHECK(*success, result == 0);
-        }
-
-        // syncBlockThreadsPredicate<alpaka::block::sync::op::LogicalOr>
-        {
-            int const predicate(1);
-            auto const result(alpaka::block::sync::syncBlockThreadsPredicate<alpaka::block::sync::op::LogicalOr>(acc, predicate));
-            ALPAKA_CHECK(*success, result == 1);
-        }
-        {
-            int const predicate(0);
-            auto const result(alpaka::block::sync::syncBlockThreadsPredicate<alpaka::block::sync::op::LogicalOr>(acc, predicate));
-            ALPAKA_CHECK(*success, result == 0);
-        }
-        {
-            int const predicate(static_cast<int>(blockThreadIdx1D != 1));
-            auto const result(alpaka::block::sync::syncBlockThreadsPredicate<alpaka::block::sync::op::LogicalOr>(acc, predicate));
-            ALPAKA_CHECK(*success, result == 1);
-        }
-    }
-};
-
-//-----------------------------------------------------------------------------
-TEMPLATE_LIST_TEST_CASE( "synchronizePredicate", "[blockSync]", alpaka::test::acc::TestAccs)
-{
-    using Acc = TestType;
-    using Dim = alpaka::dim::Dim<Acc>;
-    using Idx = alpaka::idx::Idx<Acc>;
-
-    BlockSyncPredicateTestKernel kernel;
-
-    // 4^Dim
-    {
-        alpaka::test::KernelExecutionFixture<Acc> fixture(
-            alpaka::vec::Vec<Dim, Idx>::all(static_cast<Idx>(4u)));
-
-        REQUIRE(
-            fixture(
-                kernel));
-    }
-
-    // 1^Dim
-    {
-        alpaka::test::KernelExecutionFixture<Acc> fixture(
-            alpaka::vec::Vec<Dim, Idx>::ones());
-
-        REQUIRE(
-            fixture(
-                kernel));
-    }
-}
diff --git a/thirdParty/alpaka/test/unit/core/CMakeLists.txt b/thirdParty/alpaka/test/unit/core/CMakeLists.txt
deleted file mode 100644
index 6a8a33d144..0000000000
--- a/thirdParty/alpaka/test/unit/core/CMakeLists.txt
+++ /dev/null
@@ -1,27 +0,0 @@
-#
-# Copyright 2018-2019 Benjamin Worpitz, Axel Huebl
-#
-# This file is part of Alpaka.
-#
-# This Source Code Form is subject to the terms of the Mozilla Public
-# License, v. 2.0. If a copy of the MPL was not distributed with this
-# file, You can obtain one at http://mozilla.org/MPL/2.0/.
-#
-
-SET(_TARGET_NAME "core")
-
-append_recursive_files_add_to_src_group("src/" "src/" "cpp" _FILES_SOURCE)
-
-ALPAKA_ADD_EXECUTABLE(
-    ${_TARGET_NAME}
-    ${_FILES_SOURCE})
-TARGET_INCLUDE_DIRECTORIES(
-    ${_TARGET_NAME}
-    PRIVATE ${Boost_INCLUDE_DIRS})
-TARGET_LINK_LIBRARIES(
-    ${_TARGET_NAME}
-    PRIVATE common)
-
-SET_TARGET_PROPERTIES(${_TARGET_NAME} PROPERTIES FOLDER "test/unit")
-
-ADD_TEST(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME} ${_ALPAKA_TEST_OPTIONS})
diff --git a/thirdParty/alpaka/test/unit/core/src/BoostPredefTest.cpp b/thirdParty/alpaka/test/unit/core/src/BoostPredefTest.cpp
deleted file mode 100644
index 33d0c6b02f..0000000000
--- a/thirdParty/alpaka/test/unit/core/src/BoostPredefTest.cpp
+++ /dev/null
@@ -1,61 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Matthias Werner
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#include <catch2/catch.hpp>
-
-#include <iostream>
-
-//-----------------------------------------------------------------------------
-TEST_CASE("printDefines", "[core]")
-{
-#if BOOST_LANG_CUDA
-    std::cout << "BOOST_LANG_CUDA:" << BOOST_LANG_CUDA << std::endl;
-#endif
-#if BOOST_LANG_HIP
-    std::cout << "BOOST_LANG_HIP:" << BOOST_LANG_HIP << std::endl;
-#endif
-#if BOOST_ARCH_PTX
-    std::cout << "BOOST_ARCH_PTX:" << BOOST_ARCH_PTX << std::endl;
-#endif
-#if BOOST_ARCH_HSA
-    std::cout << "BOOST_ARCH_HSA:" << BOOST_ARCH_HSA << std::endl;
-#endif
-#if BOOST_COMP_NVCC
-    std::cout << "BOOST_COMP_NVCC:" << BOOST_COMP_NVCC << std::endl;
-#endif
-#if BOOST_COMP_HCC
-    std::cout << "BOOST_COMP_HCC:" << BOOST_COMP_HCC << std::endl;
-#endif
-#if BOOST_COMP_HIP
-    std::cout << "BOOST_COMP_HIP:" << BOOST_COMP_HIP << std::endl;
-#endif
-#if BOOST_COMP_CLANG
-    std::cout << "BOOST_COMP_CLANG:" << BOOST_COMP_CLANG << std::endl;
-#endif
-#if BOOST_COMP_GNUC
-    std::cout << "BOOST_COMP_GNUC:" << BOOST_COMP_GNUC << std::endl;
-#endif
-#if BOOST_COMP_MSVC
-    std::cout << "BOOST_COMP_MSVC:" << BOOST_COMP_MSVC << std::endl;
-#endif
-#if BOOST_COMP_CLANG_CUDA
-    std::cout << "BOOST_COMP_CLANG_CUDA:" << BOOST_COMP_CLANG_CUDA << std::endl;
-#endif
-#if BOOST_LIB_STD_GNU
-    std::cout << "BOOST_LIB_STD_GNU:" << BOOST_LIB_STD_GNU << std::endl;
-#endif
-#if BOOST_LIB_STD_CXX
-    std::cout << "BOOST_LIB_STD_CXX:" << BOOST_LIB_STD_CXX << std::endl;
-#endif
-#if BOOST_LIB_STD_DINKUMWARE
-    std::cout << "BOOST_LIB_STD_DINKUMWARE:" << BOOST_LIB_STD_DINKUMWARE << std::endl;
-#endif
-}
diff --git a/thirdParty/alpaka/test/unit/core/src/ClipCastTest.cpp b/thirdParty/alpaka/test/unit/core/src/ClipCastTest.cpp
deleted file mode 100644
index 19ec2792a6..0000000000
--- a/thirdParty/alpaka/test/unit/core/src/ClipCastTest.cpp
+++ /dev/null
@@ -1,111 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#include <alpaka/core/ClipCast.hpp>
-
-#include <catch2/catch.hpp>
-
-//-----------------------------------------------------------------------------
-TEST_CASE(
-    "clipCastNoCastShouldNotChangeTheValue", "[core]")
-{
-    CHECK(
-        std::numeric_limits<std::int8_t>::max() ==
-        alpaka::core::clipCast<std::int8_t>(std::numeric_limits<std::int8_t>::max()));
-    CHECK(
-        std::numeric_limits<std::uint16_t>::min() ==
-        alpaka::core::clipCast<std::uint16_t>(std::numeric_limits<std::uint16_t>::min()));
-    CHECK(
-        std::numeric_limits<std::int32_t>::min() ==
-        alpaka::core::clipCast<std::int32_t>(std::numeric_limits<std::int32_t>::min()));
-    CHECK(
-        std::numeric_limits<std::uint64_t>::max() ==
-        alpaka::core::clipCast<std::uint64_t>(std::numeric_limits<std::uint64_t>::max()));
-}
-
-//-----------------------------------------------------------------------------
-TEST_CASE(
-    "clipCastUpCastEqualSigndnessShouldNotChangeTheValue", "[core]")
-{
-    CHECK(
-        static_cast<std::int16_t>(std::numeric_limits<std::int8_t>::max()) ==
-        alpaka::core::clipCast<std::int16_t>(std::numeric_limits<std::int8_t>::max()));
-    CHECK(
-        static_cast<std::uint32_t>(std::numeric_limits<std::uint16_t>::min()) ==
-        alpaka::core::clipCast<std::uint32_t>(std::numeric_limits<std::uint16_t>::min()));
-    CHECK(
-        static_cast<std::int64_t>(std::numeric_limits<std::int32_t>::min()) ==
-        alpaka::core::clipCast<std::int64_t>(std::numeric_limits<std::int32_t>::min()));
-}
-
-//-----------------------------------------------------------------------------
-TEST_CASE(
-    "clipCastUpCastDifferentSigndnessShouldNotChangeTheValueForPositives", "[core]")
-{
-    CHECK(
-        static_cast<std::uint16_t>(std::numeric_limits<std::int8_t>::max()) ==
-        alpaka::core::clipCast<std::uint16_t>(std::numeric_limits<std::int8_t>::max()));
-    CHECK(
-        static_cast<std::int32_t>(std::numeric_limits<std::uint16_t>::max()) ==
-        alpaka::core::clipCast<std::int32_t>(std::numeric_limits<std::uint16_t>::max()));
-    CHECK(
-        static_cast<std::uint64_t>(std::numeric_limits<std::int32_t>::max()) ==
-        alpaka::core::clipCast<std::uint64_t>(std::numeric_limits<std::int32_t>::max()));
-}
-
-//-----------------------------------------------------------------------------
-TEST_CASE(
-    "clipCastUpCastDifferentSigndnessCanChangeTheValueForNegatives", "[core]")
-{
-    CHECK(
-        std::numeric_limits<std::uint16_t>::min() ==
-        alpaka::core::clipCast<std::uint16_t>(std::numeric_limits<std::int8_t>::min()));
-    CHECK(
-        static_cast<std::int32_t>(std::numeric_limits<std::uint16_t>::min()) ==
-        alpaka::core::clipCast<std::int32_t>(std::numeric_limits<std::uint16_t>::min()));
-    CHECK(
-        std::numeric_limits<uint64_t>::min() ==
-        alpaka::core::clipCast<std::uint64_t>(std::numeric_limits<std::int32_t>::min()));
-}
-
-//-----------------------------------------------------------------------------
-TEST_CASE(
-    "clipCastDownCastEqualSigndnessCanChangeTheValue", "[core]")
-{
-    CHECK(
-        std::numeric_limits<std::uint8_t>::max() ==
-        alpaka::core::clipCast<std::uint8_t>(std::numeric_limits<std::uint16_t>::max()));
-    CHECK(
-        std::numeric_limits<std::int16_t>::min() ==
-        alpaka::core::clipCast<std::int16_t>(std::numeric_limits<std::int32_t>::min()));
-    CHECK(
-        std::numeric_limits<std::uint16_t>::max() ==
-        alpaka::core::clipCast<std::uint16_t>(std::numeric_limits<std::uint64_t>::max()));
-    CHECK(
-        std::numeric_limits<std::int8_t>::min() ==
-        alpaka::core::clipCast<std::int8_t>(std::numeric_limits<std::int64_t>::min()));
-}
-
-//-----------------------------------------------------------------------------
-TEST_CASE(
-    "clipCastDownCastDifferentSigndnessCanChangeTheValue", "[core]")
-{
-    CHECK(
-        std::numeric_limits<std::int8_t>::max() ==
-        alpaka::core::clipCast<std::int8_t>(std::numeric_limits<std::uint16_t>::max()));
-    CHECK(
-        std::numeric_limits<std::uint16_t>::min() ==
-        alpaka::core::clipCast<std::uint16_t>(std::numeric_limits<std::int32_t>::min()));
-    CHECK(
-        static_cast<std::int16_t>(std::numeric_limits<std::uint64_t>::min()) ==
-        alpaka::core::clipCast<std::int16_t>(std::numeric_limits<std::uint64_t>::min()));
-    CHECK(
-        std::numeric_limits<std::uint8_t>::max() ==
-        alpaka::core::clipCast<std::uint8_t>(std::numeric_limits<std::int64_t>::max()));
-}
diff --git a/thirdParty/alpaka/test/unit/core/src/ConceptsTest.cpp b/thirdParty/alpaka/test/unit/core/src/ConceptsTest.cpp
deleted file mode 100644
index 4e656111a4..0000000000
--- a/thirdParty/alpaka/test/unit/core/src/ConceptsTest.cpp
+++ /dev/null
@@ -1,183 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#include <alpaka/core/Concepts.hpp>
-
-#include <catch2/catch.hpp>
-
-#include <type_traits>
-
-struct ConceptExample;
-struct ConceptNonMatchingExample;
-
-struct ImplementerNotTagged
-{
-};
-
-struct ImplementerNotTaggedButNonMatchingTagged
-    : public alpaka::concepts::Implements<ConceptNonMatchingExample, ImplementerNotTaggedButNonMatchingTagged>
-{
-};
-
-struct ImplementerTagged
-    : public alpaka::concepts::Implements<ConceptExample, ImplementerTagged>
-{
-};
-
-struct ImplementerTaggedButAlsoNonMatchingTagged
-    : public alpaka::concepts::Implements<ConceptNonMatchingExample, ImplementerTaggedButAlsoNonMatchingTagged>
-    , public alpaka::concepts::Implements<ConceptExample, ImplementerTaggedButAlsoNonMatchingTagged>
-{
-};
-
-struct ImplementerWithTaggedBase
-    : public ImplementerTagged
-{
-};
-
-struct ImplementerWithTaggedBaseAlsoNonMatchingTagged
-    : public ImplementerTaggedButAlsoNonMatchingTagged
-{
-};
-
-struct ImplementerTaggedToBase
-    : public ImplementerNotTagged
-    , public alpaka::concepts::Implements<ConceptExample, ImplementerNotTagged>
-{
-};
-
-struct ImplementerTaggedToBaseAlsoNonMatchingTagged
-    : public ImplementerNotTaggedButNonMatchingTagged
-    , public alpaka::concepts::Implements<ConceptExample, ImplementerNotTaggedButNonMatchingTagged>
-{
-};
-
-struct ImplementerNonMatchingTaggedTaggedToBase
-    : public ImplementerNotTagged
-    , public alpaka::concepts::Implements<ConceptNonMatchingExample, ImplementerTaggedToBaseAlsoNonMatchingTagged>
-    , public alpaka::concepts::Implements<ConceptExample, ImplementerNotTagged>
-{
-};
-
-//-----------------------------------------------------------------------------
-TEST_CASE("ImplementerNotTagged", "[meta]")
-{
-    using ImplementationBase = alpaka::concepts::ImplementationBase<ConceptExample, ImplementerNotTagged>;
-
-    static_assert(
-        std::is_same<
-            ImplementerNotTagged,
-            ImplementationBase
-        >::value,
-        "alpaka::meta::ImplementationBase failed!");
-}
-
-//-----------------------------------------------------------------------------
-TEST_CASE("ImplementerNotTaggedButNonMatchingTagged", "[meta]")
-{
-    using ImplementationBase = alpaka::concepts::ImplementationBase<ConceptExample, ImplementerNotTaggedButNonMatchingTagged>;
-
-    static_assert(
-        std::is_same<
-            ImplementerNotTaggedButNonMatchingTagged,
-            ImplementationBase
-        >::value,
-        "alpaka::meta::ImplementationBase failed!");
-}
-
-//-----------------------------------------------------------------------------
-TEST_CASE("ImplementerTagged", "[meta]")
-{
-    using ImplementationBase = alpaka::concepts::ImplementationBase<ConceptExample, ImplementerTagged>;
-
-    static_assert(
-        std::is_same<
-            ImplementerTagged,
-            ImplementationBase
-        >::value,
-        "alpaka::meta::ImplementationBase failed!");
-}
-
-//-----------------------------------------------------------------------------
-TEST_CASE("ImplementerTaggedButAlsoNonMatchingTagged", "[meta]")
-{
-    using ImplementationBase = alpaka::concepts::ImplementationBase<ConceptExample, ImplementerTaggedButAlsoNonMatchingTagged>;
-
-    static_assert(
-        std::is_same<
-            ImplementerTaggedButAlsoNonMatchingTagged,
-            ImplementationBase
-        >::value,
-        "alpaka::meta::ImplementationBase failed!");
-}
-
-//-----------------------------------------------------------------------------
-TEST_CASE("ImplementerWithTaggedBaseAlsoNonMatchingTagged", "[meta]")
-{
-    using ImplementationBase = alpaka::concepts::ImplementationBase<ConceptExample, ImplementerWithTaggedBaseAlsoNonMatchingTagged>;
-
-    static_assert(
-        std::is_same<
-            ImplementerTaggedButAlsoNonMatchingTagged,
-            ImplementationBase
-        >::value,
-        "alpaka::meta::ImplementationBase failed!");
-}
-
-//-----------------------------------------------------------------------------
-TEST_CASE("ImplementerWithTaggedBase", "[meta]")
-{
-    using ImplementationBase = alpaka::concepts::ImplementationBase<ConceptExample, ImplementerWithTaggedBase>;
-
-    static_assert(
-        std::is_same<
-            ImplementerTagged,
-            ImplementationBase
-        >::value,
-        "alpaka::meta::ImplementationBase failed!");
-}
-
-//-----------------------------------------------------------------------------
-TEST_CASE("ImplementerTaggedToBase", "[meta]")
-{
-    using ImplementationBase = alpaka::concepts::ImplementationBase<ConceptExample, ImplementerTaggedToBase>;
-
-    static_assert(
-        std::is_same<
-            ImplementerNotTagged,
-            ImplementationBase
-        >::value,
-        "alpaka::meta::ImplementationBase failed!");
-}
-
-//-----------------------------------------------------------------------------
-TEST_CASE("ImplementerTaggedToBaseAlsoNonMatchingTagged", "[meta]")
-{
-    using ImplementationBase = alpaka::concepts::ImplementationBase<ConceptExample, ImplementerTaggedToBaseAlsoNonMatchingTagged>;
-
-    static_assert(
-        std::is_same<
-            ImplementerNotTaggedButNonMatchingTagged,
-            ImplementationBase
-        >::value,
-        "alpaka::meta::ImplementationBase failed!");
-}
-
-//-----------------------------------------------------------------------------
-TEST_CASE("ImplementerNonMatchingTaggedTaggedToBase", "[meta]")
-{
-    using ImplementationBase = alpaka::concepts::ImplementationBase<ConceptExample, ImplementerNonMatchingTaggedTaggedToBase>;
-
-    static_assert(
-        std::is_same<
-            ImplementerNotTagged,
-            ImplementationBase
-        >::value,
-        "alpaka::meta::ImplementationBase failed!");
-}
diff --git a/thirdParty/alpaka/test/unit/event/CMakeLists.txt b/thirdParty/alpaka/test/unit/event/CMakeLists.txt
deleted file mode 100644
index d0ebb51977..0000000000
--- a/thirdParty/alpaka/test/unit/event/CMakeLists.txt
+++ /dev/null
@@ -1,27 +0,0 @@
-#
-# Copyright 2017-2019 Benjamin Worpitz, Axel Huebl
-#
-# This file is part of Alpaka.
-#
-# This Source Code Form is subject to the terms of the Mozilla Public
-# License, v. 2.0. If a copy of the MPL was not distributed with this
-# file, You can obtain one at http://mozilla.org/MPL/2.0/.
-#
-
-SET(_TARGET_NAME "event")
-
-append_recursive_files_add_to_src_group("src/" "src/" "cpp" _FILES_SOURCE)
-
-ALPAKA_ADD_EXECUTABLE(
-    ${_TARGET_NAME}
-    ${_FILES_SOURCE})
-TARGET_INCLUDE_DIRECTORIES(
-    ${_TARGET_NAME}
-    PRIVATE ${Boost_INCLUDE_DIRS})
-TARGET_LINK_LIBRARIES(
-    ${_TARGET_NAME}
-    PRIVATE common)
-
-SET_TARGET_PROPERTIES(${_TARGET_NAME} PROPERTIES FOLDER "test/unit")
-
-ADD_TEST(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME} ${_ALPAKA_TEST_OPTIONS})
diff --git a/thirdParty/alpaka/test/unit/event/src/EventTest.cpp b/thirdParty/alpaka/test/unit/event/src/EventTest.cpp
deleted file mode 100644
index af6b70c985..0000000000
--- a/thirdParty/alpaka/test/unit/event/src/EventTest.cpp
+++ /dev/null
@@ -1,281 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, René Widera
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#include <alpaka/event/Traits.hpp>
-
-#include <alpaka/test/event/EventHostManualTrigger.hpp>
-#include <alpaka/test/queue/Queue.hpp>
-#include <alpaka/test/queue/QueueTestFixture.hpp>
-#include <alpaka/test/queue/QueueCpuOmp2Collective.hpp>
-
-#include <catch2/catch.hpp>
-
-using TestQueues = alpaka::meta::Concatenate<
-        alpaka::test::queue::TestQueues
- #ifdef ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLED
-        ,
-        std::tuple<std::tuple<alpaka::dev::DevCpu, alpaka::queue::QueueCpuOmp2Collective>>
-#endif
-    >;
-
-//-----------------------------------------------------------------------------
-TEMPLATE_LIST_TEST_CASE( "eventTestShouldInitiallyBeTrue", "[event]", TestQueues)
-{
-    using DevQueue = TestType;
-    using Fixture = alpaka::test::queue::QueueTestFixture<DevQueue>;
-    using Queue = typename Fixture::Queue;
-
-    Fixture f;
-    alpaka::event::Event<Queue> event(f.m_dev);
-
-    REQUIRE(alpaka::event::test(event));
-}
-
-//-----------------------------------------------------------------------------
-TEMPLATE_LIST_TEST_CASE( "eventTestShouldBeFalseWhileInQueueAndTrueAfterBeingProcessed", "[event]", TestQueues)
-{
-    using DevQueue = TestType;
-    using Fixture = alpaka::test::queue::QueueTestFixture<DevQueue>;
-    using Queue = typename Fixture::Queue;
-    using Dev = typename Fixture::Dev;
-
-    Fixture f1;
-    if(alpaka::test::event::isEventHostManualTriggerSupported(f1.m_dev))
-    {
-        auto q1 = f1.m_queue;
-        alpaka::event::Event<Queue> e1(f1.m_dev);
-        alpaka::test::event::EventHostManualTrigger<Dev> k1(f1.m_dev);
-
-        if(!alpaka::test::queue::IsBlockingQueue<Queue>::value)
-        {
-            alpaka::queue::enqueue(q1, k1);
-        }
-
-        alpaka::queue::enqueue(q1, e1);
-
-        if(!alpaka::test::queue::IsBlockingQueue<Queue>::value)
-        {
-            REQUIRE(alpaka::event::test(e1) == false);
-
-            k1.trigger();
-
-            alpaka::wait::wait(q1);
-        }
-
-        REQUIRE(alpaka::event::test(e1));
-    }
-    else
-    {
-        std::cerr << "Can not execute test because CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS is not supported!" << std::endl;
-    }
-}
-
-//-----------------------------------------------------------------------------
-TEMPLATE_LIST_TEST_CASE( "eventReEnqueueShouldBePossibleIfNobodyWaitsFor", "[event]", TestQueues)
-{
-    using DevQueue = TestType;
-    using Fixture = alpaka::test::queue::QueueTestFixture<DevQueue>;
-    using Queue = typename Fixture::Queue;
-    using Dev = typename Fixture::Dev;
-
-    if(!alpaka::test::queue::IsBlockingQueue<Queue>::value)
-    {
-        Fixture f1;
-        if(alpaka::test::event::isEventHostManualTriggerSupported(f1.m_dev))
-        {
-            auto q1 = f1.m_queue;
-            alpaka::event::Event<Queue> e1(f1.m_dev);
-            alpaka::test::event::EventHostManualTrigger<Dev> k1(f1.m_dev);
-            alpaka::test::event::EventHostManualTrigger<Dev> k2(f1.m_dev);
-
-            // q1 = [k1]
-            alpaka::queue::enqueue(q1, k1);
-            REQUIRE(!alpaka::event::test(k1));
-
-            // q1 = [k1, e1]
-            alpaka::queue::enqueue(q1, e1);
-            REQUIRE(!alpaka::event::test(k1));
-            REQUIRE(!alpaka::event::test(e1));
-
-            // q1 = [k1, e1, k2]
-            alpaka::queue::enqueue(q1, k2);
-            REQUIRE(!alpaka::event::test(k1));
-            REQUIRE(!alpaka::event::test(e1));
-            REQUIRE(!alpaka::event::test(k2));
-
-            // re-enqueue should be possible
-            // q1 = [k1, k2, e1]
-            alpaka::queue::enqueue(q1, e1);
-            REQUIRE(!alpaka::event::test(k1));
-            REQUIRE(!alpaka::event::test(k2));
-            REQUIRE(!alpaka::event::test(e1));
-
-            // q1 = [k2, e1]
-            k1.trigger();
-            REQUIRE(alpaka::event::test(k1));
-            REQUIRE(!alpaka::event::test(k2));
-            REQUIRE(!alpaka::event::test(e1));
-
-            // q1 = [e1]
-            k2.trigger();
-            REQUIRE(alpaka::event::test(k2));
-            alpaka::wait::wait(e1);
-            REQUIRE(alpaka::event::test(e1));
-        }
-        else
-        {
-            std::cerr << "Can not execute test because CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS is not supported!" << std::endl;
-        }
-    }
-}
-
-//-----------------------------------------------------------------------------
-TEMPLATE_LIST_TEST_CASE( "eventReEnqueueShouldBePossibleIfSomeoneWaitsFor", "[event]", TestQueues)
-{
-    using DevQueue = TestType;
-    using Fixture = alpaka::test::queue::QueueTestFixture<DevQueue>;
-    using Queue = typename Fixture::Queue;
-    using Dev = typename Fixture::Dev;
-
-    if(!alpaka::test::queue::IsBlockingQueue<Queue>::value)
-    {
-        Fixture f1;
-        Fixture f2;
-        if(alpaka::test::event::isEventHostManualTriggerSupported(f1.m_dev)
-            && alpaka::test::event::isEventHostManualTriggerSupported(f2.m_dev))
-        {
-            auto q1 = f1.m_queue;
-            auto q2 = f2.m_queue;
-            alpaka::event::Event<Queue> e1(f1.m_dev);
-            alpaka::event::Event<Queue> e2(f2.m_dev);
-            alpaka::test::event::EventHostManualTrigger<Dev> k1(f1.m_dev);
-            alpaka::test::event::EventHostManualTrigger<Dev> k2(f1.m_dev);
-
-            // q1 = [k1]
-            alpaka::queue::enqueue(q1, k1);
-            REQUIRE(!alpaka::event::test(k1));
-
-            // q1 = [k1, e1]
-            alpaka::queue::enqueue(q1, e1);
-            REQUIRE(!alpaka::event::test(k1));
-            REQUIRE(!alpaka::event::test(e1));
-
-            // q1 = [k1, e1, k2]
-            alpaka::queue::enqueue(q1, k2);
-            REQUIRE(!alpaka::event::test(k1));
-            REQUIRE(!alpaka::event::test(e1));
-            REQUIRE(!alpaka::event::test(k2));
-
-            // wait for e1
-            // q2 = [->e1]
-            alpaka::wait::wait(q2, e1);
-
-            // q2 = [->e1, e2]
-            alpaka::queue::enqueue(q2, e2);
-            REQUIRE(!alpaka::event::test(e2));
-
-            // re-enqueue should be possible
-            // q1 = [k1, e1-old, k2, e1]
-            alpaka::queue::enqueue(q1, e1);
-            REQUIRE(!alpaka::event::test(k1));
-            REQUIRE(!alpaka::event::test(k2));
-            REQUIRE(!alpaka::event::test(e1));
-            REQUIRE(!alpaka::event::test(e2));
-
-            // q1 = [k2, e1]
-            k1.trigger();
-            REQUIRE(alpaka::event::test(k1));
-            REQUIRE(!alpaka::event::test(k2));
-            REQUIRE(!alpaka::event::test(e1));
-            REQUIRE(!alpaka::event::test(e2));
-
-            // q1 = [e1]
-            k2.trigger();
-            REQUIRE(alpaka::event::test(k2));
-            alpaka::wait::wait(e1);
-            REQUIRE(alpaka::event::test(e1));
-            alpaka::wait::wait(e2);
-            REQUIRE(alpaka::event::test(e2));
-        }
-        else
-        {
-            std::cerr << "Can not execute test because CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS is not supported!" << std::endl;
-        }
-    }
-}
-
-
-//-----------------------------------------------------------------------------
-// github issue #388
-TEMPLATE_LIST_TEST_CASE( "waitForEventThatAlreadyFinishedShouldBeSkipped", "[event]", TestQueues)
-{
-    using DevQueue = TestType;
-    using Fixture = alpaka::test::queue::QueueTestFixture<DevQueue>;
-    using Queue = typename Fixture::Queue;
-    using Dev = typename Fixture::Dev;
-
-    if(!alpaka::test::queue::IsBlockingQueue<Queue>::value)
-    {
-        Fixture f1;
-        Fixture f2;
-        if(alpaka::test::event::isEventHostManualTriggerSupported(f1.m_dev)
-            && alpaka::test::event::isEventHostManualTriggerSupported(f2.m_dev))
-        {
-            auto q1 = f1.m_queue;
-            auto q2 = f2.m_queue;
-            alpaka::test::event::EventHostManualTrigger<Dev> k1(f1.m_dev);
-            alpaka::test::event::EventHostManualTrigger<Dev> k2(f2.m_dev);
-            alpaka::event::Event<Queue> e1(f1.m_dev);
-
-            // 1. kernel k1 is enqueued into queue q1
-            // q1 = [k1]
-            alpaka::queue::enqueue(q1, k1);
-            // 2. kernel k2 is enqueued into queue q2
-            // q2 = [k2]
-            alpaka::queue::enqueue(q2, k2);
-
-            // 3. event e1 is enqueued into queue q1
-            // q1 = [k1, e1]
-            alpaka::queue::enqueue(q1, e1);
-
-            // 4. q2 waits for e1
-            // q2 = [k2, ->e1]
-            alpaka::wait::wait(q2, e1);
-
-            // 5. kernel k1 finishes
-            // q1 = [e1]
-            k1.trigger();
-
-            // 6. e1 is finished
-            // q1 = []
-            alpaka::wait::wait(e1);
-            REQUIRE(alpaka::event::test(e1));
-
-            // 7. e1 is re-enqueued again but this time into q2
-            // q2 = [k2, ->e1, e1]
-            alpaka::queue::enqueue(q2, e1);
-
-            // 8. kernel k2 finishes
-            // q2 = [->e1, e1]
-            k2.trigger();
-
-            // 9. e1 had already been signaled so there should not be waited even though the event is now reused within q2 and its current state is 'unfinished' again.
-            // q2 = [e1]
-
-            // Both queues should successfully finish
-            alpaka::wait::wait(q1);
-            // q2 = []
-            alpaka::wait::wait(q2);
-        }
-        else
-        {
-            std::cerr << "Can not execute test because CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS is not supported!" << std::endl;
-        }
-    }
-}
diff --git a/thirdParty/alpaka/test/unit/idx/CMakeLists.txt b/thirdParty/alpaka/test/unit/idx/CMakeLists.txt
deleted file mode 100644
index 5ca5024141..0000000000
--- a/thirdParty/alpaka/test/unit/idx/CMakeLists.txt
+++ /dev/null
@@ -1,27 +0,0 @@
-#
-# Copyright 2017-2019 Benjamin Worpitz, Axel Huebl
-#
-# This file is part of Alpaka.
-#
-# This Source Code Form is subject to the terms of the Mozilla Public
-# License, v. 2.0. If a copy of the MPL was not distributed with this
-# file, You can obtain one at http://mozilla.org/MPL/2.0/.
-#
-
-SET(_TARGET_NAME "idx")
-
-append_recursive_files_add_to_src_group("src/" "src/" "cpp" _FILES_SOURCE)
-
-ALPAKA_ADD_EXECUTABLE(
-    ${_TARGET_NAME}
-    ${_FILES_SOURCE})
-TARGET_INCLUDE_DIRECTORIES(
-    ${_TARGET_NAME}
-    PRIVATE ${Boost_INCLUDE_DIRS})
-TARGET_LINK_LIBRARIES(
-    ${_TARGET_NAME}
-    PRIVATE common)
-
-SET_TARGET_PROPERTIES(${_TARGET_NAME} PROPERTIES FOLDER "test/unit")
-
-ADD_TEST(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME} ${_ALPAKA_TEST_OPTIONS})
diff --git a/thirdParty/alpaka/test/unit/idx/src/MapIdx.cpp b/thirdParty/alpaka/test/unit/idx/src/MapIdx.cpp
deleted file mode 100644
index d6de1145ed..0000000000
--- a/thirdParty/alpaka/test/unit/idx/src/MapIdx.cpp
+++ /dev/null
@@ -1,53 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Matthias Werner, René Widera
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#include <alpaka/idx/Accessors.hpp>
-#include <alpaka/idx/MapIdx.hpp>
-
-#include <alpaka/meta/ForEachType.hpp>
-#include <alpaka/test/dim/TestDims.hpp>
-
-#include <catch2/catch.hpp>
-
-//#############################################################################
-//! 1D: (17)
-//! 2D: (17, 14)
-//! 3D: (17, 14, 11)
-//! 4D: (17, 14, 11, 8)
-template<
-    std::size_t Tidx>
-struct CreateExtentVal
-{
-    //-----------------------------------------------------------------------------
-    template<
-        typename TIdx>
-    ALPAKA_FN_HOST_ACC static auto create(
-        TIdx)
-    -> TIdx
-    {
-        return  static_cast<TIdx>(17u - (Tidx*3u));
-    }
-};
-
-//-----------------------------------------------------------------------------
-TEMPLATE_LIST_TEST_CASE( "mapIdx", "[idx]", alpaka::test::dim::TestDims)
-{
-    using Dim = TestType;
-    using Idx = std::size_t;
-    using Vec = alpaka::vec::Vec<Dim, Idx>;
-
-    auto const extentNd(alpaka::vec::createVecFromIndexedFnWorkaround<Dim, Idx, CreateExtentVal>(Idx()));
-    auto const idxNd(extentNd - Vec::all(4u));
-
-    auto const idx1d(alpaka::idx::mapIdx<1u>(idxNd, extentNd));
-
-    auto const idxNdResult(alpaka::idx::mapIdx<Dim::value>(idx1d, extentNd));
-
-    REQUIRE(idxNd == idxNdResult);
-}
diff --git a/thirdParty/alpaka/test/unit/kernel/CMakeLists.txt b/thirdParty/alpaka/test/unit/kernel/CMakeLists.txt
deleted file mode 100644
index b3ca1be4fb..0000000000
--- a/thirdParty/alpaka/test/unit/kernel/CMakeLists.txt
+++ /dev/null
@@ -1,27 +0,0 @@
-#
-# Copyright 2014-2019 Benjamin Worpitz, Axel Huebl
-#
-# This file is part of Alpaka.
-#
-# This Source Code Form is subject to the terms of the Mozilla Public
-# License, v. 2.0. If a copy of the MPL was not distributed with this
-# file, You can obtain one at http://mozilla.org/MPL/2.0/.
-#
-
-SET(_TARGET_NAME "kernel")
-
-append_recursive_files_add_to_src_group("src/" "src/" "cpp" _FILES_SOURCE)
-
-ALPAKA_ADD_EXECUTABLE(
-    ${_TARGET_NAME}
-    ${_FILES_SOURCE})
-TARGET_INCLUDE_DIRECTORIES(
-    ${_TARGET_NAME}
-    PRIVATE ${Boost_INCLUDE_DIRS})
-TARGET_LINK_LIBRARIES(
-    ${_TARGET_NAME}
-    PRIVATE common)
-
-SET_TARGET_PROPERTIES(${_TARGET_NAME} PROPERTIES FOLDER "test/unit")
-
-ADD_TEST(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME} ${_ALPAKA_TEST_OPTIONS})
diff --git a/thirdParty/alpaka/test/unit/kernel/src/KernelGenericLambda.cpp b/thirdParty/alpaka/test/unit/kernel/src/KernelGenericLambda.cpp
deleted file mode 100644
index 380bb571a1..0000000000
--- a/thirdParty/alpaka/test/unit/kernel/src/KernelGenericLambda.cpp
+++ /dev/null
@@ -1,76 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, René Widera
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#include <alpaka/kernel/Traits.hpp>
-
-#include <alpaka/test/acc/TestAccs.hpp>
-#include <alpaka/test/KernelExecutionFixture.hpp>
-
-#include <catch2/catch.hpp>
-
-// Generic lambdas are a C++14 feature.
-#if !defined(BOOST_NO_CXX14_GENERIC_LAMBDAS)
-// CUDA C Programming guide says: "__host__ __device__ extended lambdas cannot be generic lambdas"
-#if !defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
-
-//-----------------------------------------------------------------------------
-TEMPLATE_LIST_TEST_CASE( "genericLambdaKernelIsWorking", "[kernel]", alpaka::test::acc::TestAccs)
-{
-    using Acc = TestType;
-    using Dim = alpaka::dim::Dim<Acc>;
-    using Idx = alpaka::idx::Idx<Acc>;
-
-    alpaka::test::KernelExecutionFixture<Acc> fixture(
-        alpaka::vec::Vec<Dim, Idx>::ones());
-
-    auto kernel =
-        [] ALPAKA_FN_ACC (
-            auto const & acc,
-            bool * success)
-        -> void
-        {
-            ALPAKA_CHECK(
-                *success,
-                static_cast<alpaka::idx::Idx<Acc>>(1) == (alpaka::workdiv::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc)).prod());
-        };
-
-    REQUIRE(fixture(kernel));
-}
-
-//-----------------------------------------------------------------------------
-TEMPLATE_LIST_TEST_CASE( "variadicGenericLambdaKernelIsWorking", "[kernel]", alpaka::test::acc::TestAccs)
-{
-    using Acc = TestType;
-    using Dim = alpaka::dim::Dim<Acc>;
-    using Idx = alpaka::idx::Idx<Acc>;
-
-    alpaka::test::KernelExecutionFixture<Acc> fixture(
-        alpaka::vec::Vec<Dim, Idx>::ones());
-
-    std::uint32_t const arg1 = 42u;
-    std::uint32_t const arg2 = 43u;
-    auto kernel =
-        [] ALPAKA_FN_ACC (
-            Acc const & acc,
-            bool * success,
-            auto ... args)
-        -> void
-        {
-            alpaka::ignore_unused(acc);
-
-            ALPAKA_CHECK(
-                *success,
-                alpaka::meta::foldr([](auto a, auto b){return a + b;}, args...) == (42u + 43u));
-        };
-
-    REQUIRE(fixture(kernel, arg1, arg2));
-}
-
-#endif
-#endif
diff --git a/thirdParty/alpaka/test/unit/kernel/src/KernelLambda.cpp b/thirdParty/alpaka/test/unit/kernel/src/KernelLambda.cpp
deleted file mode 100644
index 07c4da01cb..0000000000
--- a/thirdParty/alpaka/test/unit/kernel/src/KernelLambda.cpp
+++ /dev/null
@@ -1,129 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-// NVCC needs --expt-extended-lambda
-#if !defined(__NVCC__) || (defined(__NVCC__) && defined(__CUDACC_EXTENDED_LAMBDA__))
-
-#include <alpaka/kernel/Traits.hpp>
-
-#include <alpaka/test/acc/TestAccs.hpp>
-#include <alpaka/test/KernelExecutionFixture.hpp>
-#include <alpaka/core/BoostPredef.hpp>
-
-#include <catch2/catch.hpp>
-
-//-----------------------------------------------------------------------------
-struct TestTemplateLambda
-{
-template< typename TAcc >
-void operator()()
-{
-    using Dim = alpaka::dim::Dim<TAcc>;
-    using Idx = alpaka::idx::Idx<TAcc>;
-
-    alpaka::test::KernelExecutionFixture<TAcc> fixture(
-        alpaka::vec::Vec<Dim, Idx>::ones());
-
-    auto kernel =
-        [] ALPAKA_FN_ACC (
-            TAcc const & acc,
-            bool * success)
-        -> void
-        {
-            ALPAKA_CHECK(
-                *success,
-                static_cast<alpaka::idx::Idx<TAcc>>(1) == (alpaka::workdiv::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc)).prod());
-        };
-
-    REQUIRE(fixture(kernel));
-}
-};
-
-//-----------------------------------------------------------------------------
-struct TestTemplateArg
-{
-template< typename TAcc >
-void operator()()
-{
-    using Dim = alpaka::dim::Dim<TAcc>;
-    using Idx = alpaka::idx::Idx<TAcc>;
-
-    alpaka::test::KernelExecutionFixture<TAcc> fixture(
-        alpaka::vec::Vec<Dim, Idx>::ones());
-
-    std::uint32_t const arg = 42u;
-    auto kernel =
-        [] ALPAKA_FN_ACC (
-            TAcc const & acc,
-            bool * success,
-            std::uint32_t const & arg1)
-        -> void
-        {
-            alpaka::ignore_unused(acc);
-
-            ALPAKA_CHECK(*success, 42u == arg1);
-        };
-
-    REQUIRE(fixture(kernel, arg));
-}
-};
-
-//-----------------------------------------------------------------------------
-struct TestTemplateCapture
-{
-template< typename TAcc >
-void operator()()
-{
-    using Dim = alpaka::dim::Dim<TAcc>;
-    using Idx = alpaka::idx::Idx<TAcc>;
-
-    alpaka::test::KernelExecutionFixture<TAcc> fixture(
-        alpaka::vec::Vec<Dim, Idx>::ones());
-
-    std::uint32_t const arg = 42u;
-
-#if BOOST_COMP_CLANG >= BOOST_VERSION_NUMBER(5,0,0)
-    #pragma clang diagnostic push
-    #pragma clang diagnostic ignored "-Wunused-lambda-capture"
-#endif
-    auto kernel =
-        [arg] ALPAKA_FN_ACC (
-            TAcc const & acc,
-            bool * success)
-        -> void
-        {
-            alpaka::ignore_unused(acc);
-
-            ALPAKA_CHECK(*success, 42u == arg);
-        };
-#if BOOST_COMP_CLANG >= BOOST_VERSION_NUMBER(5,0,0)
-    #pragma clang diagnostic pop
-#endif
-
-    REQUIRE(fixture(kernel));
-}
-};
-
-
-TEST_CASE( "lambdaKernelIsWorking", "[kernel]")
-{
-    alpaka::meta::forEachType< alpaka::test::acc::TestAccs >( TestTemplateLambda() );
-}
-
-TEST_CASE( "lambdaKernelWithArgumentIsWorking", "[kernel]")
-{
-    alpaka::meta::forEachType< alpaka::test::acc::TestAccs >( TestTemplateArg() );
-}
-
-TEST_CASE( "lambdaKernelWithCapturingIsWorking", "[kernel]")
-{
-    alpaka::meta::forEachType< alpaka::test::acc::TestAccs >( TestTemplateCapture() );
-}
-
-#endif
diff --git a/thirdParty/alpaka/test/unit/kernel/src/KernelStdFunction.cpp b/thirdParty/alpaka/test/unit/kernel/src/KernelStdFunction.cpp
deleted file mode 100644
index 0856ea3cd8..0000000000
--- a/thirdParty/alpaka/test/unit/kernel/src/KernelStdFunction.cpp
+++ /dev/null
@@ -1,88 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#include <alpaka/kernel/Traits.hpp>
-
-#include <alpaka/test/acc/TestAccs.hpp>
-#include <alpaka/test/KernelExecutionFixture.hpp>
-#include <alpaka/core/BoostPredef.hpp>
-
-#include <catch2/catch.hpp>
-
-#include <functional>
-#if BOOST_LANG_CUDA
-#include <nvfunctional>
-#endif
-
-//-----------------------------------------------------------------------------
-template<
-    typename Acc>
-void ALPAKA_FN_ACC kernelFn(
-    Acc const & acc,
-    bool * success,
-    std::int32_t val)
-{
-    alpaka::ignore_unused(acc);
-
-    ALPAKA_CHECK(*success, 42 == val);
-}
-
-// std::function and std::bind is only allowed on CPU
-#if !BOOST_LANG_CUDA && !BOOST_LANG_HIP
-//-----------------------------------------------------------------------------
-TEMPLATE_LIST_TEST_CASE( "stdFunctionKernelIsWorking", "[kernel]", alpaka::test::acc::TestAccs)
-{
-    using Acc = TestType;
-    using Dim = alpaka::dim::Dim<Acc>;
-    using Idx = alpaka::idx::Idx<Acc>;
-
-    alpaka::test::KernelExecutionFixture<Acc> fixture(
-        alpaka::vec::Vec<Dim, Idx>::ones());
-
-    const auto kernel = std::function<void(Acc const &, bool *, std::int32_t)>( kernelFn<Acc> );
-    REQUIRE(fixture(kernel, 42));
-}
-
-//-----------------------------------------------------------------------------
-TEMPLATE_LIST_TEST_CASE( "stdBindKernelIsWorking", "[kernel]", alpaka::test::acc::TestAccs)
-{
-    using Acc = TestType;
-    using Dim = alpaka::dim::Dim<Acc>;
-    using Idx = alpaka::idx::Idx<Acc>;
-
-    alpaka::test::KernelExecutionFixture<Acc> fixture(
-        alpaka::vec::Vec<Dim, Idx>::ones());
-
-    const auto kernel = std::bind( kernelFn<Acc>, std::placeholders::_1, std::placeholders::_2, 42 );
-    REQUIRE(fixture(kernel));
-}
-#endif
-
-// This test is disabled due to #836 (cudaErrorIllegalInstruction crash)
-#if 0
-//#if BOOST_LANG_CUDA
-// clang as a native CUDA compiler does not seem to support nvstd::function when ALPAKA_ACC_GPU_CUDA_ONLY_MODE is used.
-// error: reference to __device__ function 'kernelFn<alpaka::acc::AccGpuCudaRt<std::__1::integral_constant<unsigned long, 1>, unsigned long> >' in __host__ function const auto kernel = nvstd::function<void(Acc const &, bool *, std::int32_t)>( kernelFn<Acc> );
-#if !(defined(ALPAKA_ACC_GPU_CUDA_ONLY_MODE) && BOOST_COMP_CLANG_CUDA)
-//-----------------------------------------------------------------------------
-TEMPLATE_LIST_TEST_CASE( "nvstdFunctionKernelIsWorking", "[kernel]", alpaka::test::acc::TestAccs)
-{
-    using Acc = TestType;
-    using Dim = alpaka::dim::Dim<Acc>;
-    using Idx = alpaka::idx::Idx<Acc>;
-
-    alpaka::test::KernelExecutionFixture<Acc> fixture(
-        alpaka::vec::Vec<Dim, Idx>::ones());
-
-    const auto kernel = nvstd::function<void(Acc const &, bool *, std::int32_t)>( kernelFn<Acc> );
-    REQUIRE(fixture(kernel, 42));
-}
-
-#endif
-#endif
diff --git a/thirdParty/alpaka/test/unit/kernel/src/KernelWithAdditionalParam.cpp b/thirdParty/alpaka/test/unit/kernel/src/KernelWithAdditionalParam.cpp
deleted file mode 100644
index 8436adebea..0000000000
--- a/thirdParty/alpaka/test/unit/kernel/src/KernelWithAdditionalParam.cpp
+++ /dev/null
@@ -1,121 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, René Widera
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#include <alpaka/kernel/Traits.hpp>
-
-#include <alpaka/test/acc/TestAccs.hpp>
-#include <alpaka/test/KernelExecutionFixture.hpp>
-#include <alpaka/meta/ForEachType.hpp>
-
-#include <catch2/catch.hpp>
-
-//#############################################################################
-class KernelWithAdditionalParamByValue
-{
-public:
-    //-----------------------------------------------------------------------------
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<
-        typename TAcc>
-    ALPAKA_FN_ACC auto operator()(
-        TAcc const & acc,
-        bool * success,
-        std::int32_t val) const
-    -> void
-    {
-        alpaka::ignore_unused(acc);
-
-        ALPAKA_CHECK(*success, 42 == val);
-    }
-};
-
-//-----------------------------------------------------------------------------
-TEMPLATE_LIST_TEST_CASE("KernelWithAdditionalParamByValue", "[kernel]", alpaka::test::acc::TestAccs)
-{
-    using Acc = TestType;
-    using Dim = alpaka::dim::Dim<Acc>;
-    using Idx = alpaka::idx::Idx<Acc>;
-
-    alpaka::test::KernelExecutionFixture<Acc> fixture(
-        alpaka::vec::Vec<Dim, Idx>::ones());
-
-    KernelWithAdditionalParamByValue kernel;
-
-    REQUIRE(fixture(kernel, 42));
-}
-
-/*
-Passing a parameter by reference to non-const is not allowed.
-There is only one single copy of the parameters on the CPU accelerators.
-They are shared between all threads. Therefore they should not be mutated.
-
-//#############################################################################
-class KernelWithAdditionalParamByRef
-{
-public:
-    //-----------------------------------------------------------------------------
-    ALPAKA_NO_HOST_ACC_WARNING
-    template <typename TAcc>
-    ALPAKA_FN_ACC auto operator()(
-        TAcc const &acc,
-        bool *success,
-        std::int32_t &val) const -> void {
-        alpaka::ignore_unused(acc);
-
-        ALPAKA_CHECK(*success, 42 == val);
-    }
-};
-
-//-----------------------------------------------------------------------------
-TEMPLATE_LIST_TEST_CASE("KernelWithAdditionalParamByRef", "[kernel]", alpaka::test::acc::TestAccs)
-{
-    using Acc = TestType;
-    using Dim = alpaka::dim::Dim<Acc>;
-    using Idx = alpaka::idx::Idx<Acc>;
-
-    alpaka::test::KernelExecutionFixture<Acc> fixture(
-        alpaka::vec::Vec<Dim, Idx>::ones());
-
-    KernelWithAdditionalParamByRef kernel;
-
-    REQUIRE(fixture(kernel, 42));
-}*/
-
-//#############################################################################
-class KernelWithAdditionalParamByConstRef
-{
-public:
-    //-----------------------------------------------------------------------------
-    ALPAKA_NO_HOST_ACC_WARNING
-    template <typename TAcc>
-    ALPAKA_FN_ACC auto operator()(
-        TAcc const &acc,
-        bool *success,
-        std::int32_t const &val) const -> void
-    {
-        alpaka::ignore_unused(acc);
-
-        ALPAKA_CHECK(*success, 42 == val);
-    }
-};
-
-//-----------------------------------------------------------------------------
-TEMPLATE_LIST_TEST_CASE("KernelWithAdditionalParamByConstRef", "[kernel]", alpaka::test::acc::TestAccs)
-{
-    using Acc = TestType;
-    using Dim = alpaka::dim::Dim<Acc>;
-    using Idx = alpaka::idx::Idx<Acc>;
-
-    alpaka::test::KernelExecutionFixture<Acc> fixture(
-        alpaka::vec::Vec<Dim, Idx>::ones());
-
-    KernelWithAdditionalParamByConstRef kernel;
-
-    REQUIRE(fixture(kernel, 42));
-}
diff --git a/thirdParty/alpaka/test/unit/kernel/src/KernelWithConstructorAndMember.cpp b/thirdParty/alpaka/test/unit/kernel/src/KernelWithConstructorAndMember.cpp
deleted file mode 100644
index 1e6d347eb9..0000000000
--- a/thirdParty/alpaka/test/unit/kernel/src/KernelWithConstructorAndMember.cpp
+++ /dev/null
@@ -1,74 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, René Widera
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#include <alpaka/kernel/Traits.hpp>
-
-#include <alpaka/test/acc/TestAccs.hpp>
-#include <alpaka/test/KernelExecutionFixture.hpp>
-#include <alpaka/meta/ForEachType.hpp>
-
-#include <catch2/catch.hpp>
-
-//#############################################################################
-class KernelWithConstructorAndMember
-{
-public:
-    //-----------------------------------------------------------------------------
-    ALPAKA_FN_HOST KernelWithConstructorAndMember(
-        std::int32_t const val = 42) :
-        m_val(val)
-    {}
-
-    //-----------------------------------------------------------------------------
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<
-        typename TAcc>
-    ALPAKA_FN_ACC auto operator()(
-        TAcc const & acc,
-        bool * success) const
-    -> void
-    {
-        alpaka::ignore_unused(acc);
-
-        ALPAKA_CHECK(*success, 42 == m_val);
-    }
-
-private:
-    std::int32_t m_val;
-};
-
-//-----------------------------------------------------------------------------
-TEMPLATE_LIST_TEST_CASE( "kernelWithConstructorAndMember", "[kernel]", alpaka::test::acc::TestAccs)
-{
-    using Acc = TestType;
-    using Dim = alpaka::dim::Dim<Acc>;
-    using Idx = alpaka::idx::Idx<Acc>;
-
-    alpaka::test::KernelExecutionFixture<Acc> fixture(
-        alpaka::vec::Vec<Dim, Idx>::ones());
-
-    KernelWithConstructorAndMember kernel(42);
-
-    REQUIRE(fixture(kernel));
-}
-
-//-----------------------------------------------------------------------------
-TEMPLATE_LIST_TEST_CASE( "kernelWithConstructorDefaultParamAndMember", "[kernel]", alpaka::test::acc::TestAccs)
-{
-    using Acc = TestType;
-    using Dim = alpaka::dim::Dim<Acc>;
-    using Idx = alpaka::idx::Idx<Acc>;
-
-    alpaka::test::KernelExecutionFixture<Acc> fixture(
-        alpaka::vec::Vec<Dim, Idx>::ones());
-
-    KernelWithConstructorAndMember kernel;
-
-    REQUIRE(fixture(kernel));
-}
diff --git a/thirdParty/alpaka/test/unit/kernel/src/KernelWithHostConstexpr.cpp b/thirdParty/alpaka/test/unit/kernel/src/KernelWithHostConstexpr.cpp
deleted file mode 100644
index e4ca38031e..0000000000
--- a/thirdParty/alpaka/test/unit/kernel/src/KernelWithHostConstexpr.cpp
+++ /dev/null
@@ -1,66 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Matthias Werner, René Widera
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-// NVCC needs --expt-relaxed-constexpr
-#if !defined(__NVCC__) || (defined(__NVCC__) && defined(__CUDACC_RELAXED_CONSTEXPR__))
-
-#include <alpaka/kernel/Traits.hpp>
-
-#include <alpaka/test/acc/TestAccs.hpp>
-#include <alpaka/test/KernelExecutionFixture.hpp>
-#include <alpaka/meta/ForEachType.hpp>
-
-#include <catch2/catch.hpp>
-
-#include <limits>
-
-//#############################################################################
-class KernelWithHostConstexpr
-{
-public:
-    //-----------------------------------------------------------------------------
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<typename TAcc>
-    ALPAKA_FN_ACC auto operator()(
-        TAcc const & acc,
-        bool* success) const
-    -> void
-    {
-        alpaka::ignore_unused(acc);
-
-#if BOOST_COMP_MSVC
-    #pragma warning(push)
-    #pragma warning(disable: 4127)  // warning C4127: conditional expression is constant
-#endif
-
-        constexpr auto max = std::numeric_limits< std::uint32_t >::max();
-
-        ALPAKA_CHECK(*success, 0 != max);
-#if BOOST_COMP_MSVC
-    #pragma warning(pop)
-#endif
-    }
-};
-
-//-----------------------------------------------------------------------------
-TEMPLATE_LIST_TEST_CASE( "kernelWithHostConstexpr", "[kernel]", alpaka::test::acc::TestAccs)
-{
-    using Acc = TestType;
-    using Dim = alpaka::dim::Dim<Acc>;
-    using Idx = alpaka::idx::Idx<Acc>;
-
-    alpaka::test::KernelExecutionFixture<Acc> fixture(
-        alpaka::vec::Vec<Dim, Idx>::ones());
-
-    KernelWithHostConstexpr kernel;
-
-    REQUIRE(fixture(kernel));
-}
-
-#endif
diff --git a/thirdParty/alpaka/test/unit/kernel/src/KernelWithTemplate.cpp b/thirdParty/alpaka/test/unit/kernel/src/KernelWithTemplate.cpp
deleted file mode 100644
index 7a78cf74bf..0000000000
--- a/thirdParty/alpaka/test/unit/kernel/src/KernelWithTemplate.cpp
+++ /dev/null
@@ -1,98 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, René Widera
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#include <alpaka/kernel/Traits.hpp>
-
-#include <alpaka/test/acc/TestAccs.hpp>
-#include <alpaka/test/KernelExecutionFixture.hpp>
-#include <alpaka/meta/ForEachType.hpp>
-
-#include <catch2/catch.hpp>
-
-#include <type_traits>
-
-//#############################################################################
-template<
-    typename T>
-class KernelFuntionObjectTemplate
-{
-public:
-    //-----------------------------------------------------------------------------
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<
-        typename TAcc>
-    ALPAKA_FN_ACC auto operator()(
-        TAcc const & acc,
-        bool * success) const
-    -> void
-    {
-        ALPAKA_CHECK(
-            *success,
-            static_cast<alpaka::idx::Idx<TAcc>>(1) == (alpaka::workdiv::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc)).prod());
-
-        static_assert(
-            std::is_same<std::int32_t, T>::value,
-            "Incorrect additional kernel template parameter type!");
-    }
-};
-
-//-----------------------------------------------------------------------------
-TEMPLATE_LIST_TEST_CASE( "kernelFuntionObjectTemplate", "[kernel]", alpaka::test::acc::TestAccs)
-{
-    using Acc = TestType;
-    using Dim = alpaka::dim::Dim<Acc>;
-    using Idx = alpaka::idx::Idx<Acc>;
-
-    alpaka::test::KernelExecutionFixture<Acc> fixture(
-        alpaka::vec::Vec<Dim, Idx>::ones());
-
-    KernelFuntionObjectTemplate<std::int32_t> kernel;
-
-    REQUIRE(fixture(kernel));
-}
-
-//#############################################################################
-class KernelInvocationWithAdditionalTemplate
-{
-public:
-    //-----------------------------------------------------------------------------
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<
-        typename TAcc,
-        typename T>
-    ALPAKA_FN_ACC auto operator()(
-        TAcc const & acc,
-        bool * success,
-        T const &) const
-    -> void
-    {
-        ALPAKA_CHECK(
-            *success,
-            static_cast<alpaka::idx::Idx<TAcc>>(1) == (alpaka::workdiv::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc)).prod());
-
-        static_assert(
-            std::is_same<std::int32_t, T>::value,
-            "Incorrect additional kernel template parameter type!");
-    }
-};
-
-//-----------------------------------------------------------------------------
-TEMPLATE_LIST_TEST_CASE( "kernelFuntionObjectExtraTemplate", "[kernel]", alpaka::test::acc::TestAccs)
-{
-    using Acc = TestType;
-    using Dim = alpaka::dim::Dim<Acc>;
-    using Idx = alpaka::idx::Idx<Acc>;
-
-    alpaka::test::KernelExecutionFixture<Acc> fixture(
-        alpaka::vec::Vec<Dim, Idx>::ones());
-
-    KernelInvocationWithAdditionalTemplate kernel;
-
-    REQUIRE(fixture(kernel, std::int32_t()));
-}
diff --git a/thirdParty/alpaka/test/unit/kernel/src/KernelWithTemplateArgumentDeduction.cpp b/thirdParty/alpaka/test/unit/kernel/src/KernelWithTemplateArgumentDeduction.cpp
deleted file mode 100644
index 4873e5763d..0000000000
--- a/thirdParty/alpaka/test/unit/kernel/src/KernelWithTemplateArgumentDeduction.cpp
+++ /dev/null
@@ -1,207 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, René Widera, Sergei Bastrakov
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#include <alpaka/kernel/Traits.hpp>
-
-#include <alpaka/test/acc/TestAccs.hpp>
-#include <alpaka/test/KernelExecutionFixture.hpp>
-#include <alpaka/meta/ForEachType.hpp>
-
-#include <catch2/catch.hpp>
-
-#include <type_traits>
-
-//#############################################################################
-template< typename TExpected >
-class KernelInvocationTemplateDeductionValueSemantics
-{
-public:
-    //-----------------------------------------------------------------------------
-    ALPAKA_NO_HOST_ACC_WARNING
-        template<
-        typename Acc,
-        typename TByValue,
-        typename TByConstValue,
-        typename TByConstReference>
-        ALPAKA_FN_ACC auto operator()(
-            Acc const & acc,
-            bool * success,
-            TByValue,
-            TByConstValue const,
-            TByConstReference const &) const
-        -> void
-    {
-        ALPAKA_CHECK(
-            *success,
-            static_cast<alpaka::idx::Idx<Acc>>(1) == (alpaka::workdiv::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc)).prod());
-
-        static_assert(
-            std::is_same<TByValue, TExpected>::value,
-            "Incorrect first additional kernel template parameter type!");
-        static_assert(
-            std::is_same<TByConstValue, TExpected>::value,
-            "Incorrect second additional kernel template parameter type!");
-        static_assert(
-            std::is_same<TByConstReference, TExpected>::value,
-            "Incorrect third additional kernel template parameter type!");
-
-    }
-};
-
-//-----------------------------------------------------------------------------
-TEMPLATE_LIST_TEST_CASE( "kernelFuntionObjectTemplateDeductionFromValue", "[kernel]", alpaka::test::acc::TestAccs)
-{
-    using Acc = TestType;
-    using Dim = alpaka::dim::Dim<Acc>;
-    using Idx = alpaka::idx::Idx<Acc>;
-
-    alpaka::test::KernelExecutionFixture<Acc> fixture(
-        alpaka::vec::Vec<Dim, Idx>::ones());
-
-    using Value = std::int32_t;
-    KernelInvocationTemplateDeductionValueSemantics< Value > kernel;
-
-    Value value{ };
-    REQUIRE(fixture(kernel, value, value, value));
-}
-
-TEMPLATE_LIST_TEST_CASE( "kernelFuntionObjectTemplateDeductionFromConstValue", "[kernel]", alpaka::test::acc::TestAccs)
-{
-    using Acc = TestType;
-    using Dim = alpaka::dim::Dim<Acc>;
-    using Idx = alpaka::idx::Idx<Acc>;
-
-    alpaka::test::KernelExecutionFixture<Acc> fixture(
-        alpaka::vec::Vec<Dim, Idx>::ones());
-
-    using Value = std::int32_t;
-    KernelInvocationTemplateDeductionValueSemantics< Value > kernel;
-
-    Value const constValue{ };
-    REQUIRE(fixture(kernel, constValue, constValue, constValue));
-}
-
-TEMPLATE_LIST_TEST_CASE( "kernelFuntionObjectTemplateDeductionFromConstReference", "[kernel]", alpaka::test::acc::TestAccs)
-{
-    using Acc = TestType;
-    using Dim = alpaka::dim::Dim<Acc>;
-    using Idx = alpaka::idx::Idx<Acc>;
-
-    alpaka::test::KernelExecutionFixture<Acc> fixture(
-        alpaka::vec::Vec<Dim, Idx>::ones());
-
-    using Value = std::int32_t;
-    KernelInvocationTemplateDeductionValueSemantics< Value > kernel;
-
-    Value value{ };
-    Value const & constReference = value;
-    REQUIRE(fixture(kernel, constReference, constReference, constReference));
-}
-
-//#############################################################################
-template<
-    typename TExpectedFirst,
-    typename TExpectedSecond = TExpectedFirst
->
-class KernelInvocationTemplateDeductionPointerSemantics
-{
-public:
-    //-----------------------------------------------------------------------------
-    ALPAKA_NO_HOST_ACC_WARNING
-        template<
-        typename Acc,
-        typename TByPointer,
-        typename TByPointerToConst>
-        ALPAKA_FN_ACC auto operator()(
-            Acc const & acc,
-            bool * success,
-            TByPointer *,
-            TByPointerToConst const *) const
-        -> void
-    {
-        ALPAKA_CHECK(
-            *success,
-            static_cast<alpaka::idx::Idx<Acc>>(1) == (alpaka::workdiv::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc)).prod());
-
-        static_assert(
-            std::is_same<TByPointer, TExpectedFirst>::value,
-            "Incorrect first additional kernel template parameter type!");
-        static_assert(
-            std::is_same<TByPointerToConst, TExpectedSecond>::value,
-            "Incorrect second additional kernel template parameter type!");
-
-    }
-};
-
-//-----------------------------------------------------------------------------
-TEMPLATE_LIST_TEST_CASE( "kernelFuntionObjectTemplateDeductionFromPointer", "[kernel]", alpaka::test::acc::TestAccs)
-{
-    using Acc = TestType;
-    using Dim = alpaka::dim::Dim<Acc>;
-    using Idx = alpaka::idx::Idx<Acc>;
-
-    alpaka::test::KernelExecutionFixture<Acc> fixture(
-        alpaka::vec::Vec<Dim, Idx>::ones());
-
-    using Value = std::int32_t;
-    KernelInvocationTemplateDeductionPointerSemantics< Value > kernel;
-
-    Value value{ };
-    Value * pointer = &value;
-    REQUIRE(fixture(kernel, pointer, pointer));
-}
-
-TEMPLATE_LIST_TEST_CASE( "kernelFuntionObjectTemplateDeductionFromPointerToConst", "[kernel]", alpaka::test::acc::TestAccs)
-{
-    using Acc = TestType;
-    using Dim = alpaka::dim::Dim<Acc>;
-    using Idx = alpaka::idx::Idx<Acc>;
-
-    alpaka::test::KernelExecutionFixture<Acc> fixture(
-        alpaka::vec::Vec<Dim, Idx>::ones());
-
-    using Value = std::int32_t;
-    KernelInvocationTemplateDeductionPointerSemantics< Value const, Value > kernel;
-
-    Value const constValue{ };
-    Value const * pointerToConst = &constValue;
-    REQUIRE(fixture(kernel, pointerToConst, pointerToConst));
-}
-
-TEMPLATE_LIST_TEST_CASE( "kernelFuntionObjectTemplateDeductionFromStaticArray", "[kernel]", alpaka::test::acc::TestAccs)
-{
-    using Acc = TestType;
-    using Dim = alpaka::dim::Dim<Acc>;
-    using Idx = alpaka::idx::Idx<Acc>;
-
-    alpaka::test::KernelExecutionFixture<Acc> fixture(
-        alpaka::vec::Vec<Dim, Idx>::ones());
-
-    using Value = std::int32_t;
-    KernelInvocationTemplateDeductionPointerSemantics< Value > kernel;
-
-    Value staticArray[4] = { };
-    REQUIRE(fixture(kernel, staticArray, staticArray));
-}
-
-TEMPLATE_LIST_TEST_CASE( "kernelFuntionObjectTemplateDeductionFromConstStaticArray", "[kernel]", alpaka::test::acc::TestAccs)
-{
-    using Acc = TestType;
-    using Dim = alpaka::dim::Dim<Acc>;
-    using Idx = alpaka::idx::Idx<Acc>;
-
-    alpaka::test::KernelExecutionFixture<Acc> fixture(
-        alpaka::vec::Vec<Dim, Idx>::ones());
-
-    using Value = std::int32_t;
-    KernelInvocationTemplateDeductionPointerSemantics< Value const, Value > kernel;
-
-    Value const constStaticArray[4] = { };
-    REQUIRE(fixture(kernel, constStaticArray, constStaticArray));
-}
diff --git a/thirdParty/alpaka/test/unit/kernel/src/KernelWithoutTemplatedAccParam.cpp b/thirdParty/alpaka/test/unit/kernel/src/KernelWithoutTemplatedAccParam.cpp
deleted file mode 100644
index a855584e86..0000000000
--- a/thirdParty/alpaka/test/unit/kernel/src/KernelWithoutTemplatedAccParam.cpp
+++ /dev/null
@@ -1,162 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Matthias Werner
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#include <alpaka/kernel/Traits.hpp>
-
-#include <alpaka/test/KernelExecutionFixture.hpp>
-
-#include <catch2/catch.hpp>
-
-//#############################################################################
-//! It is not possible to use a alpaka kernel function object without a templated operator() when the CUDA accelerator is hard-coded.
-//!
-//! However, compiling such kernels with a CPU device works fine.
-//!
-//! When the CUDA accelerator is used, the following error is triggered:
-//! /alpaka/include/alpaka/workdiv/Traits.hpp(...): error: calling a __device__ function("getWorkDiv") from a __host__ __device__ function("getWorkDiv") is not allowed
-//! The kernel function objects function call operator is attributed with ALPAKA_FN_ACC which is identical to __host__ __device__.
-//! The 'alpaka::workdiv::getWorkDiv<...>(acc)' function that is called has the ALPAKA_FN_HOST_ACC attribute (also equal to __host__ __device__).
-//! The underlying trait calls the CUDA specialized method which has the __device__ attribute.
-//! Because this call chain does not contain any templates and therefore no calls depending on input types,
-//! everything can be resolved at the first time the template is parsed which results in the given error.
-//!
-//! Currently, the only possible way to solve this is to make the function call operator a template nonetheless by providing an unused template parameter.
-
-using Dim = alpaka::dim::DimInt<2u>;
-using Idx = std::uint32_t;
-#if defined(ALPAKA_ACC_CPU_SERIAL_ENABLED)
-using AccCpu = alpaka::acc::AccCpuSerial<Dim, Idx>;
-#endif
-#if defined(ALPAKA_ACC_GPU_HIP_ENABLED) && BOOST_LANG_HIP
-using AccGpu = alpaka::acc::AccGpuHipRt<Dim, Idx>;
-#elif defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && BOOST_LANG_CUDA
-using AccGpu = alpaka::acc::AccGpuCudaRt<Dim, Idx>;
-#endif
-
-#if defined(ALPAKA_ACC_CPU_SERIAL_ENABLED)
-//#############################################################################
-struct KernelNoTemplateCpu
-{
-    //-----------------------------------------------------------------------------
-    ALPAKA_FN_ACC
-    auto operator()(
-        AccCpu const & acc,
-        bool* success) const
-    -> void
-    {
-        ALPAKA_CHECK(
-            *success,
-            static_cast<alpaka::idx::Idx<AccCpu>>(1) == (alpaka::workdiv::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc)).prod());
-    }
-};
-
-//-----------------------------------------------------------------------------
-TEST_CASE("kernelNoTemplateCpu", "[kernel]")
-{
-    alpaka::test::KernelExecutionFixture<AccCpu> fixture(
-        alpaka::vec::Vec<Dim, Idx>::ones());
-
-    KernelNoTemplateCpu kernel;
-
-    REQUIRE(fixture(kernel));
-}
-#endif
-
-/*#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && BOOST_LANG_CUDA
-//#############################################################################
-//! DO NOT ENABLE! COMPILATION WILL FAIL!
-struct KernelNoTemplateGpu
-{
-    //-----------------------------------------------------------------------------
-    ALPAKA_FN_ACC
-    auto operator()(
-        AccGpu const & acc,
-        bool* success) const
-    -> void
-    {
-        ALPAKA_CHECK(
-            *success,
-            static_cast<alpaka::idx::Idx<AccGpu>>(1) == (alpaka::workdiv::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc)).prod());
-    }
-};
-
-//-----------------------------------------------------------------------------
-TEST_CASE("kernelNoTemplateGpu", "[kernel]")
-{
-    alpaka::test::KernelExecutionFixture<AccGpu> fixture(
-        alpaka::vec::Vec<Dim, Idx>::ones());
-
-    KernelNoTemplateGpu kernel;
-
-    REQUIRE(fixture(kernel));
-}
-#endif*/
-
-#if defined(ALPAKA_ACC_CPU_SERIAL_ENABLED)
-//#############################################################################
-struct KernelWithoutTemplateParamCpu
-{
-    //-----------------------------------------------------------------------------
-    template<
-        typename TNotUsed = void>
-    ALPAKA_FN_ACC
-    auto operator()(
-        AccCpu const & acc,
-        bool* success) const
-    -> void
-    {
-        ALPAKA_CHECK(
-            *success,
-            static_cast<alpaka::idx::Idx<AccCpu>>(1) == (alpaka::workdiv::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc)).prod());
-    }
-};
-
-//-----------------------------------------------------------------------------
-TEST_CASE("kernelWithoutTemplateParamCpu", "[kernel]")
-{
-    alpaka::test::KernelExecutionFixture<AccCpu> fixture(
-        alpaka::vec::Vec<Dim, Idx>::ones());
-
-    KernelWithoutTemplateParamCpu kernel;
-
-    REQUIRE(fixture(kernel));
-}
-#endif
-
-#if (defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && BOOST_LANG_CUDA) \
-  || (defined(ALPAKA_ACC_GPU_HIP_ENABLED) && BOOST_LANG_HIP)
-//#############################################################################
-struct KernelWithoutTemplateParamGpu
-{
-    //-----------------------------------------------------------------------------
-    template<
-        typename TNotUsed = void>
-    ALPAKA_FN_ACC
-    auto operator()(
-        AccGpu const & acc,
-        bool* success) const
-    -> void
-    {
-        ALPAKA_CHECK(
-            *success,
-            static_cast<alpaka::idx::Idx<AccGpu>>(1) == (alpaka::workdiv::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc)).prod());
-    }
-};
-
-//-----------------------------------------------------------------------------
-TEST_CASE("kernelWithoutTemplateParamGpu", "[kernel]")
-{
-    alpaka::test::KernelExecutionFixture<AccGpu> fixture(
-        alpaka::vec::Vec<Dim, Idx>::ones());
-
-    KernelWithoutTemplateParamGpu kernel;
-
-    REQUIRE(fixture(kernel));
-}
-#endif
diff --git a/thirdParty/alpaka/test/unit/math/sincos/CMakeLists.txt b/thirdParty/alpaka/test/unit/math/sincos/CMakeLists.txt
deleted file mode 100644
index dfdf0b12f1..0000000000
--- a/thirdParty/alpaka/test/unit/math/sincos/CMakeLists.txt
+++ /dev/null
@@ -1,27 +0,0 @@
-#
-# Copyright 2017-2019 Benjamin Worpitz, Axel Huebl, Matthias Werner
-#
-# This file is part of Alpaka.
-#
-# This Source Code Form is subject to the terms of the Mozilla Public
-# License, v. 2.0. If a copy of the MPL was not distributed with this
-# file, You can obtain one at http://mozilla.org/MPL/2.0/.
-#
-
-SET(_TARGET_NAME "sincos")
-
-append_recursive_files_add_to_src_group("src/" "src/" "cpp" _FILES_SOURCE)
-
-ALPAKA_ADD_EXECUTABLE(
-    ${_TARGET_NAME}
-    ${_FILES_SOURCE})
-TARGET_INCLUDE_DIRECTORIES(
-    ${_TARGET_NAME}
-    PRIVATE ${Boost_INCLUDE_DIRS})
-TARGET_LINK_LIBRARIES(
-    ${_TARGET_NAME}
-    PRIVATE common)
-
-SET_TARGET_PROPERTIES(${_TARGET_NAME} PROPERTIES FOLDER "test/unit")
-
-ADD_TEST(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME} ${_ALPAKA_TEST_OPTIONS})
diff --git a/thirdParty/alpaka/test/unit/math/sincos/src/sincos.cpp b/thirdParty/alpaka/test/unit/math/sincos/src/sincos.cpp
deleted file mode 100644
index 5dbd4ff10f..0000000000
--- a/thirdParty/alpaka/test/unit/math/sincos/src/sincos.cpp
+++ /dev/null
@@ -1,81 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Matthias Werner, René Widera
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#include <alpaka/math/sincos/Traits.hpp>
-
-#include <alpaka/test/acc/TestAccs.hpp>
-#include <alpaka/test/queue/Queue.hpp>
-#include <alpaka/test/KernelExecutionFixture.hpp>
-
-#include <catch2/catch.hpp>
-
-// https://en.cppreference.com/w/cpp/types/numeric_limits/epsilon
-template <typename TAcc, typename FP>
-ALPAKA_FN_ACC
-typename std::enable_if< !std::numeric_limits<FP>::is_integer, bool >::type
-almost_equal(TAcc const & acc, FP x, FP y, int ulp)
-{
-    // the machine epsilon has to be scaled to the magnitude of the values used
-    // and multiplied by the desired precision in ULPs (units in the last place)
-    return alpaka::math::abs(acc, x-y)
-        <= std::numeric_limits<FP>::epsilon() * alpaka::math::abs(acc, x+y) * ulp
-        // unless the result is subnormal
-        || alpaka::math::abs(acc, x-y) < std::numeric_limits<FP>::min();
-}
-
-
-class SinCosTestKernel
-{
-public:
-    //-----------------------------------------------------------------------------
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<
-    typename TAcc,
-    typename FP
-    >
-    ALPAKA_FN_ACC auto operator()(
-        TAcc const & acc,
-        bool * success,
-        FP const arg) const
-    -> void
-    {
-        // if arg is hardcoded then compiler can optimize it out
-        // (PTX kernel (float) was just empty)
-        FP check_sin = alpaka::math::sin(acc, arg);
-        FP check_cos = alpaka::math::cos(acc, arg);
-        FP result_sin = 0.;
-        FP result_cos = 0.;
-        alpaka::math::sincos(acc, arg, result_sin, result_cos);
-        ALPAKA_CHECK(*success,
-                     almost_equal(acc, result_sin, check_sin, 1)
-                     &&
-                     almost_equal(acc, result_cos, check_cos, 1)
-            );
-    }
-};
-
-using TestAccs = alpaka::test::acc::EnabledAccs<
-    alpaka::dim::DimInt<1u>,
-    std::size_t>;
-
-//-----------------------------------------------------------------------------
-TEMPLATE_LIST_TEST_CASE( "sincos", "[sincos]", TestAccs)
-{
-    using Acc = TestType;
-    using Dim = alpaka::dim::Dim<Acc>;
-    using Idx = alpaka::idx::Idx<Acc>;
-
-    alpaka::test::KernelExecutionFixture<Acc> fixture(
-        alpaka::vec::Vec<Dim, Idx>::ones());
-
-    SinCosTestKernel kernel;
-
-    REQUIRE(fixture( kernel, 0.42f )); // float
-    REQUIRE(fixture( kernel, 0.42 ));  // double
-}
diff --git a/thirdParty/alpaka/test/unit/mem/buf/CMakeLists.txt b/thirdParty/alpaka/test/unit/mem/buf/CMakeLists.txt
deleted file mode 100644
index bc7afdf1dc..0000000000
--- a/thirdParty/alpaka/test/unit/mem/buf/CMakeLists.txt
+++ /dev/null
@@ -1,27 +0,0 @@
-#
-# Copyright 2014-2019 Benjamin Worpitz, Axel Huebl
-#
-# This file is part of Alpaka.
-#
-# This Source Code Form is subject to the terms of the Mozilla Public
-# License, v. 2.0. If a copy of the MPL was not distributed with this
-# file, You can obtain one at http://mozilla.org/MPL/2.0/.
-#
-
-SET(_TARGET_NAME "memBuf")
-
-append_recursive_files_add_to_src_group("src/" "src/" "cpp" _FILES_SOURCE)
-
-ALPAKA_ADD_EXECUTABLE(
-    ${_TARGET_NAME}
-    ${_FILES_SOURCE})
-TARGET_INCLUDE_DIRECTORIES(
-    ${_TARGET_NAME}
-    PRIVATE ${Boost_INCLUDE_DIRS})
-TARGET_LINK_LIBRARIES(
-    ${_TARGET_NAME}
-    PRIVATE common)
-
-SET_TARGET_PROPERTIES(${_TARGET_NAME} PROPERTIES FOLDER "test/unit")
-
-ADD_TEST(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME} ${_ALPAKA_TEST_OPTIONS})
diff --git a/thirdParty/alpaka/test/unit/mem/buf/src/BufTest.cpp b/thirdParty/alpaka/test/unit/mem/buf/src/BufTest.cpp
deleted file mode 100644
index a13a80384e..0000000000
--- a/thirdParty/alpaka/test/unit/mem/buf/src/BufTest.cpp
+++ /dev/null
@@ -1,131 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#include <alpaka/mem/buf/Traits.hpp>
-
-#include <alpaka/test/acc/TestAccs.hpp>
-#include <alpaka/test/queue/Queue.hpp>
-#include <alpaka/test/mem/view/ViewTest.hpp>
-#include <alpaka/test/Extent.hpp>
-
-#include <catch2/catch.hpp>
-
-#include <type_traits>
-#include <numeric>
-
-//-----------------------------------------------------------------------------
-template<
-    typename TAcc>
-static auto testBufferMutable(
-    alpaka::vec::Vec<alpaka::dim::Dim<TAcc>, alpaka::idx::Idx<TAcc>> const & extent)
--> void
-{
-    using Dev = alpaka::dev::Dev<TAcc>;
-    using Pltf = alpaka::pltf::Pltf<Dev>;
-    using Queue = alpaka::test::queue::DefaultQueue<Dev>;
-
-    using Elem = float;
-    using Dim = alpaka::dim::Dim<TAcc>;
-    using Idx = alpaka::idx::Idx<TAcc>;
-
-    Dev const dev(alpaka::pltf::getDevByIdx<Pltf>(0u));
-    Queue queue(dev);
-
-    //-----------------------------------------------------------------------------
-    // alpaka::mem::buf::alloc
-    auto buf(alpaka::mem::buf::alloc<Elem, Idx>(dev, extent));
-
-    //-----------------------------------------------------------------------------
-    auto const offset(alpaka::vec::Vec<Dim, Idx>::zeros());
-    alpaka::test::mem::view::testViewImmutable<
-        Elem>(
-            buf,
-            dev,
-            extent,
-            offset);
-
-    //-----------------------------------------------------------------------------
-    alpaka::test::mem::view::testViewMutable<
-        TAcc>(
-            queue,
-            buf);
-}
-
-//-----------------------------------------------------------------------------
-TEMPLATE_LIST_TEST_CASE( "memBufBasicTest", "[memBuf]", alpaka::test::acc::TestAccs)
-{
-    using Acc = TestType;
-    using Dim = alpaka::dim::Dim<Acc>;
-    using Idx = alpaka::idx::Idx<Acc>;
-
-    auto const extent(alpaka::vec::createVecFromIndexedFnWorkaround<Dim, Idx, alpaka::test::CreateExtentBufVal>(Idx()));
-
-    testBufferMutable<
-        Acc>(
-            extent);
-}
-
-//-----------------------------------------------------------------------------
-TEMPLATE_LIST_TEST_CASE( "memBufZeroSizeTest", "[memBuf]", alpaka::test::acc::TestAccs)
-{
-    using Acc = TestType;
-    using Dim = alpaka::dim::Dim<Acc>;
-    using Idx = alpaka::idx::Idx<Acc>;
-
-    auto const extent(alpaka::vec::Vec<Dim, Idx>::zeros());
-
-    testBufferMutable<
-        Acc>(
-            extent);
-}
-
-
-//-----------------------------------------------------------------------------
-template<
-    typename TAcc>
-static auto testBufferImmutable(
-    alpaka::vec::Vec<alpaka::dim::Dim<TAcc>, alpaka::idx::Idx<TAcc>> const & extent)
--> void
-{
-    using Dev = alpaka::dev::Dev<TAcc>;
-    using Pltf = alpaka::pltf::Pltf<Dev>;
-
-    using Elem = float;
-    using Dim = alpaka::dim::Dim<TAcc>;
-    using Idx = alpaka::idx::Idx<TAcc>;
-
-    Dev const dev(alpaka::pltf::getDevByIdx<Pltf>(0u));
-
-    //-----------------------------------------------------------------------------
-    // alpaka::mem::buf::alloc
-    auto const buf(alpaka::mem::buf::alloc<Elem, Idx>(dev, extent));
-
-    //-----------------------------------------------------------------------------
-    auto const offset(alpaka::vec::Vec<Dim, Idx>::zeros());
-    alpaka::test::mem::view::testViewImmutable<
-        Elem>(
-            buf,
-            dev,
-            extent,
-            offset);
-}
-
-//-----------------------------------------------------------------------------
-TEMPLATE_LIST_TEST_CASE( "memBufConstTest", "[memBuf]", alpaka::test::acc::TestAccs)
-{
-    using Acc = TestType;
-    using Dim = alpaka::dim::Dim<Acc>;
-    using Idx = alpaka::idx::Idx<Acc>;
-
-    auto const extent(alpaka::vec::createVecFromIndexedFnWorkaround<Dim, Idx, alpaka::test::CreateExtentBufVal>(Idx()));
-
-    testBufferImmutable<
-        Acc>(
-            extent);
-}
diff --git a/thirdParty/alpaka/test/unit/mem/p2p/CMakeLists.txt b/thirdParty/alpaka/test/unit/mem/p2p/CMakeLists.txt
deleted file mode 100644
index 25a83bc6ec..0000000000
--- a/thirdParty/alpaka/test/unit/mem/p2p/CMakeLists.txt
+++ /dev/null
@@ -1,27 +0,0 @@
-#
-# Copyright 2014-2019 Benjamin Worpitz, Axel Huebl
-#
-# This file is part of Alpaka.
-#
-# This Source Code Form is subject to the terms of the Mozilla Public
-# License, v. 2.0. If a copy of the MPL was not distributed with this
-# file, You can obtain one at http://mozilla.org/MPL/2.0/.
-#
-
-SET(_TARGET_NAME "memP2P")
-
-append_recursive_files_add_to_src_group("src/" "src/" "cpp" _FILES_SOURCE)
-
-ALPAKA_ADD_EXECUTABLE(
-    ${_TARGET_NAME}
-    ${_FILES_SOURCE})
-TARGET_INCLUDE_DIRECTORIES(
-    ${_TARGET_NAME}
-    PRIVATE ${Boost_INCLUDE_DIRS})
-TARGET_LINK_LIBRARIES(
-    ${_TARGET_NAME}
-    PRIVATE common)
-
-SET_TARGET_PROPERTIES(${_TARGET_NAME} PROPERTIES FOLDER "test/unit")
-
-ADD_TEST(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME} ${_ALPAKA_TEST_OPTIONS})
diff --git a/thirdParty/alpaka/test/unit/mem/p2p/src/P2P.cpp b/thirdParty/alpaka/test/unit/mem/p2p/src/P2P.cpp
deleted file mode 100644
index a6bad97c64..0000000000
--- a/thirdParty/alpaka/test/unit/mem/p2p/src/P2P.cpp
+++ /dev/null
@@ -1,79 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Matthias Werner
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#include <alpaka/mem/buf/Traits.hpp>
-
-#include <alpaka/test/acc/TestAccs.hpp>
-#include <alpaka/test/queue/Queue.hpp>
-#include <alpaka/test/mem/view/ViewTest.hpp>
-#include <alpaka/test/Extent.hpp>
-#include <alpaka/meta/ForEachType.hpp>
-
-#include <catch2/catch.hpp>
-
-#include <type_traits>
-#include <numeric>
-
-//-----------------------------------------------------------------------------
-template<
-    typename TAcc>
-static auto testP2P(
-    alpaka::vec::Vec<alpaka::dim::Dim<TAcc>, alpaka::idx::Idx<TAcc>> const & extent)
--> void
-{
-    using Dev = alpaka::dev::Dev<TAcc>;
-    using Pltf = alpaka::pltf::Pltf<Dev>;
-    using Queue = alpaka::test::queue::DefaultQueue<Dev>;
-
-    using Elem = std::uint32_t;
-    using Idx = alpaka::idx::Idx<TAcc>;
-
-    if(alpaka::pltf::getDevCount<Pltf>()<2) {
-      std::cerr << "No two devices found to test peer-to-peer copy." << std::endl;
-      CHECK(true);
-      return;
-    }
-
-    Dev const dev0(alpaka::pltf::getDevByIdx<Pltf>(0u));
-    Dev const dev1(alpaka::pltf::getDevByIdx<Pltf>(1u));
-    Queue queue0(dev0);
-
-    //-----------------------------------------------------------------------------
-    auto buf0(alpaka::mem::buf::alloc<Elem, Idx>(dev0, extent));
-    auto buf1(alpaka::mem::buf::alloc<Elem, Idx>(dev1, extent));
-
-    //-----------------------------------------------------------------------------
-    std::uint8_t const byte(static_cast<uint8_t>(42u));
-    alpaka::mem::view::set(queue0, buf0, byte, extent);
-
-    //-----------------------------------------------------------------------------
-    alpaka::mem::view::copy(queue0, buf1, buf0, extent);
-    alpaka::wait::wait(queue0);
-    alpaka::test::mem::view::verifyBytesSet<TAcc>(buf1, byte);
-}
-
-//-----------------------------------------------------------------------------
-TEMPLATE_LIST_TEST_CASE( "memP2PTest", "[memP2P]", alpaka::test::acc::TestAccs)
-{
-#if defined(ALPAKA_CI) &&                             \
-    BOOST_COMP_GNUC >= BOOST_VERSION_NUMBER(7,2,0) && \
-    BOOST_COMP_GNUC < BOOST_VERSION_NUMBER(8,0,0) && \
-    defined(ALPAKA_ACC_CPU_BT_OMP4_ENABLED)
-    std::cerr << "Currently, memP2P is not working with gcc7.2 / gcc7.3 on Ubuntu14.04 on travis/CI." << std::endl;
-    CHECK(true);
-#else
-    using Acc = TestType;
-    using Dim = alpaka::dim::Dim<Acc>;
-    using Idx = alpaka::idx::Idx<Acc>;
-
-    auto const extent(alpaka::vec::createVecFromIndexedFnWorkaround<Dim, Idx, alpaka::test::CreateExtentBufVal>(Idx()));
-
-    testP2P<Acc>( extent );
-#endif
-}
diff --git a/thirdParty/alpaka/test/unit/mem/view/CMakeLists.txt b/thirdParty/alpaka/test/unit/mem/view/CMakeLists.txt
deleted file mode 100644
index 9f7349e710..0000000000
--- a/thirdParty/alpaka/test/unit/mem/view/CMakeLists.txt
+++ /dev/null
@@ -1,27 +0,0 @@
-#
-# Copyright 2014-2019 Benjamin Worpitz, Axel Huebl
-#
-# This file is part of Alpaka.
-#
-# This Source Code Form is subject to the terms of the Mozilla Public
-# License, v. 2.0. If a copy of the MPL was not distributed with this
-# file, You can obtain one at http://mozilla.org/MPL/2.0/.
-#
-
-SET(_TARGET_NAME "memView")
-
-append_recursive_files_add_to_src_group("src/" "src/" "cpp" _FILES_SOURCE)
-
-ALPAKA_ADD_EXECUTABLE(
-    ${_TARGET_NAME}
-    ${_FILES_SOURCE})
-TARGET_INCLUDE_DIRECTORIES(
-    ${_TARGET_NAME}
-    PRIVATE ${Boost_INCLUDE_DIRS})
-TARGET_LINK_LIBRARIES(
-    ${_TARGET_NAME}
-    PRIVATE common)
-
-SET_TARGET_PROPERTIES(${_TARGET_NAME} PROPERTIES FOLDER "test/unit")
-
-ADD_TEST(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME} ${_ALPAKA_TEST_OPTIONS})
diff --git a/thirdParty/alpaka/test/unit/mem/view/src/ViewPlainPtrTest.cpp b/thirdParty/alpaka/test/unit/mem/view/src/ViewPlainPtrTest.cpp
deleted file mode 100644
index 1f186bc1a8..0000000000
--- a/thirdParty/alpaka/test/unit/mem/view/src/ViewPlainPtrTest.cpp
+++ /dev/null
@@ -1,206 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Erik Zenker
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#include <alpaka/mem/view/ViewPlainPtr.hpp>
-
-#include <alpaka/test/acc/TestAccs.hpp>
-#include <alpaka/test/queue/Queue.hpp>
-#include <alpaka/test/mem/view/ViewTest.hpp>
-#include <alpaka/test/Extent.hpp>
-#include <alpaka/meta/ForEachType.hpp>
-#include <alpaka/core/BoostPredef.hpp>
-
-#include <catch2/catch.hpp>
-
-#include <type_traits>
-#include <numeric>
-
-#if BOOST_COMP_GNUC
-    #pragma GCC diagnostic push
-    #pragma GCC diagnostic ignored "-Wcast-align" // "cast from 'std::uint8_t*' to 'Elem*' increases required alignment of target type"
-#endif
-
-namespace alpaka
-{
-namespace test
-{
-namespace mem
-{
-namespace view
-{
-    //-----------------------------------------------------------------------------
-    template<
-        typename TAcc,
-        typename TDev,
-        typename TElem,
-        typename TDim,
-        typename TIdx>
-    auto testViewPlainPtrImmutable(
-        alpaka::mem::view::ViewPlainPtr<TDev, TElem, TDim, TIdx> const & view,
-        TDev const & dev,
-        alpaka::vec::Vec<TDim, TIdx> const & extentView,
-        alpaka::vec::Vec<TDim, TIdx> const & offsetView)
-    -> void
-    {
-        //-----------------------------------------------------------------------------
-        alpaka::test::mem::view::testViewImmutable<
-            TElem>(
-                view,
-                dev,
-                extentView,
-                offsetView);
-    }
-
-    //-----------------------------------------------------------------------------
-    template<
-        typename TAcc,
-        typename TDev,
-        typename TElem,
-        typename TDim,
-        typename TIdx>
-    auto testViewPlainPtrMutable(
-        alpaka::mem::view::ViewPlainPtr<TDev, TElem, TDim, TIdx> & view,
-        TDev const & dev,
-        alpaka::vec::Vec<TDim, TIdx> const & extentView,
-        alpaka::vec::Vec<TDim, TIdx> const & offsetView)
-    -> void
-    {
-        //-----------------------------------------------------------------------------
-        testViewPlainPtrImmutable<
-            TAcc>(
-                view,
-                dev,
-                extentView,
-                offsetView);
-
-        using Queue = alpaka::test::queue::DefaultQueue<TDev>;
-        Queue queue(dev);
-        //-----------------------------------------------------------------------------
-        alpaka::test::mem::view::testViewMutable<
-            TAcc>(
-                queue,
-                view);
-    }
-
-    //-----------------------------------------------------------------------------
-    template<
-        typename TAcc,
-        typename TElem>
-    auto testViewPlainPtr()
-    -> void
-    {
-        using Dev = alpaka::dev::Dev<TAcc>;
-        using Pltf = alpaka::pltf::Pltf<Dev>;
-
-        using Dim = alpaka::dim::Dim<TAcc>;
-        using Idx = alpaka::idx::Idx<TAcc>;
-        using View = alpaka::mem::view::ViewPlainPtr<Dev, TElem, Dim, Idx>;
-
-        Dev const dev(alpaka::pltf::getDevByIdx<Pltf>(0u));
-
-        auto const extentBuf(alpaka::vec::createVecFromIndexedFnWorkaround<Dim, Idx, alpaka::test::CreateExtentBufVal>(Idx()));
-        auto buf(alpaka::mem::buf::alloc<TElem, Idx>(dev, extentBuf));
-
-        auto const extentView(extentBuf);
-        auto const offsetView(alpaka::vec::Vec<Dim, Idx>::all(static_cast<Idx>(0)));
-        View view(
-            alpaka::mem::view::getPtrNative(buf),
-            alpaka::dev::getDev(buf),
-            alpaka::extent::getExtentVec(buf),
-            alpaka::mem::view::getPitchBytesVec(buf));
-
-        alpaka::test::mem::view::testViewPlainPtrMutable<TAcc>(view, dev, extentView, offsetView);
-    }
-
-    //-----------------------------------------------------------------------------
-    template<
-        typename TAcc,
-        typename TElem>
-    auto testViewPlainPtrConst()
-    -> void
-    {
-        using Dev = alpaka::dev::Dev<TAcc>;
-        using Pltf = alpaka::pltf::Pltf<Dev>;
-
-        using Dim = alpaka::dim::Dim<TAcc>;
-        using Idx = alpaka::idx::Idx<TAcc>;
-        using View = alpaka::mem::view::ViewPlainPtr<Dev, TElem, Dim, Idx>;
-
-        Dev const dev(alpaka::pltf::getDevByIdx<Pltf>(0u));
-
-        auto const extentBuf(alpaka::vec::createVecFromIndexedFnWorkaround<Dim, Idx, alpaka::test::CreateExtentBufVal>(Idx()));
-        auto buf(alpaka::mem::buf::alloc<TElem, Idx>(dev, extentBuf));
-
-        auto const extentView(extentBuf);
-        auto const offsetView(alpaka::vec::Vec<Dim, Idx>::all(static_cast<Idx>(0)));
-        View const view(
-            alpaka::mem::view::getPtrNative(buf),
-            alpaka::dev::getDev(buf),
-            alpaka::extent::getExtentVec(buf),
-            alpaka::mem::view::getPitchBytesVec(buf));
-
-        alpaka::test::mem::view::testViewPlainPtrImmutable<TAcc>(view, dev, extentView, offsetView);
-    }
-
-    //-----------------------------------------------------------------------------
-    template<
-        typename TAcc,
-        typename TElem>
-    auto testViewPlainPtrOperators()
-    -> void
-    {
-        using Dev = alpaka::dev::Dev<TAcc>;
-        using Pltf = alpaka::pltf::Pltf<Dev>;
-
-        using Dim = alpaka::dim::Dim<TAcc>;
-        using Idx = alpaka::idx::Idx<TAcc>;
-        using View = alpaka::mem::view::ViewPlainPtr<Dev, TElem, Dim, Idx>;
-
-        Dev const dev(alpaka::pltf::getDevByIdx<Pltf>(0u));
-
-        auto const extentBuf(alpaka::vec::createVecFromIndexedFnWorkaround<Dim, Idx, alpaka::test::CreateExtentBufVal>(Idx()));
-        auto buf(alpaka::mem::buf::alloc<TElem, Idx>(dev, extentBuf));
-
-        View view(
-            alpaka::mem::view::getPtrNative(buf),
-            alpaka::dev::getDev(buf),
-            alpaka::extent::getExtentVec(buf),
-            alpaka::mem::view::getPitchBytesVec(buf));
-
-        // copy-constructor
-        View viewCopy(view);
-
-        // move-constructor
-        View viewMove(std::move(viewCopy));
-    }
-}
-}
-}
-}
-#if BOOST_COMP_GNUC
-    #pragma GCC diagnostic pop
-#endif
-
-//-----------------------------------------------------------------------------
-TEMPLATE_LIST_TEST_CASE( "viewPlainPtrTest", "[memView]", alpaka::test::acc::TestAccs)
-{
-    alpaka::test::mem::view::testViewPlainPtr<TestType, float>();
-}
-
-//-----------------------------------------------------------------------------
-TEMPLATE_LIST_TEST_CASE( "viewPlainPtrConstTest", "[memView]", alpaka::test::acc::TestAccs)
-{
-    alpaka::test::mem::view::testViewPlainPtrConst<TestType, float>();
-}
-
-//-----------------------------------------------------------------------------
-TEMPLATE_LIST_TEST_CASE( "viewPlainPtrOperatorTest", "[memView]", alpaka::test::acc::TestAccs)
-{
-    alpaka::test::mem::view::testViewPlainPtrOperators<TestType, float>();
-}
diff --git a/thirdParty/alpaka/test/unit/mem/view/src/ViewStaticAccMem.cpp b/thirdParty/alpaka/test/unit/mem/view/src/ViewStaticAccMem.cpp
deleted file mode 100644
index 5f53ff5b4d..0000000000
--- a/thirdParty/alpaka/test/unit/mem/view/src/ViewStaticAccMem.cpp
+++ /dev/null
@@ -1,198 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Matthias Werner
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#include <alpaka/core/Common.hpp>
-
-#include <alpaka/test/acc/TestAccs.hpp>
-#include <alpaka/test/KernelExecutionFixture.hpp>
-#include <alpaka/test/queue/Queue.hpp>
-#include <alpaka/meta/ForEachType.hpp>
-#include <alpaka/core/BoostPredef.hpp>
-
-#include <catch2/catch.hpp>
-
-using Elem = std::uint32_t;
-using Dim = alpaka::dim::DimInt<2u>;
-using Idx = std::uint32_t;
-
-// These forward declarations are only necessary when you want to access those variables
-// from a different compilation unit and should be moved to a common header.
-// Here they are used to silence clang`s -Wmissing-variable-declarations warning
-// that forces every non-static variable to be declared with extern before the are defined.
-extern ALPAKA_STATIC_ACC_MEM_CONSTANT Elem g_constantMemory2DInitialized[3][2];
-extern ALPAKA_STATIC_ACC_MEM_CONSTANT Elem g_constantMemory2DUninitialized[3][2];
-
-ALPAKA_STATIC_ACC_MEM_CONSTANT Elem g_constantMemory2DInitialized[3][2] =
-    {
-        {0u, 1u},
-        {2u, 3u},
-        {4u, 5u}
-    };
-
-ALPAKA_STATIC_ACC_MEM_CONSTANT Elem g_constantMemory2DUninitialized[3][2];
-
-//#############################################################################
-//! Uses static device memory on the accelerator defined globally for the whole compilation unit.
-struct StaticDeviceMemoryTestKernel
-{
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<
-        typename TAcc,
-        typename TElem>
-    ALPAKA_FN_ACC void operator()(
-        TAcc const & acc,
-        bool * success,
-        TElem const * const pConstantMem) const
-    {
-        auto const gridThreadExtent =
-            alpaka::workdiv::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc);
-        auto const gridThreadIdx =
-            alpaka::idx::getIdx<alpaka::Grid, alpaka::Threads>(acc);
-
-        auto const offset = gridThreadExtent[1u] * gridThreadIdx[0u] + gridThreadIdx[1u];
-        auto const val = offset;
-
-        ALPAKA_CHECK(*success, val == *(pConstantMem + offset));
-    }
-};
-
-using TestAccs = alpaka::test::acc::EnabledAccs<Dim, Idx>;
-
-//-----------------------------------------------------------------------------
-TEMPLATE_LIST_TEST_CASE( "staticDeviceMemoryGlobal", "[viewStaticAccMem]", TestAccs)
-{
-    using Acc = TestType;
-    using DevAcc = alpaka::dev::Dev<Acc>;
-    using PltfAcc = alpaka::pltf::Pltf<DevAcc>;
-    DevAcc devAcc(alpaka::pltf::getDevByIdx<PltfAcc>(0u));
-
-    alpaka::vec::Vec<Dim, Idx> const extent(3u, 2u);
-
-    alpaka::test::KernelExecutionFixture<Acc> fixture(extent);
-
-    StaticDeviceMemoryTestKernel kernel;
-
-    //-----------------------------------------------------------------------------
-    // FIXME: constant memory in HIP(HCC) is still not working
-#if !BOOST_COMP_HCC && !BOOST_COMP_HIP
-    // initialized static constant device memory
-    {
-        auto const viewConstantMemInitialized(
-            alpaka::mem::view::createStaticDevMemView(
-                &g_constantMemory2DInitialized[0u][0u],
-                devAcc,
-                extent));
-
-        REQUIRE(fixture(
-            kernel,
-            alpaka::mem::view::getPtrNative(viewConstantMemInitialized)));
-    }
-    //-----------------------------------------------------------------------------
-    // uninitialized static constant device memory
-    {
-        using PltfHost = alpaka::pltf::PltfCpu;
-        auto devHost(alpaka::pltf::getDevByIdx<PltfHost>(0u));
-
-        using QueueAcc = alpaka::test::queue::DefaultQueue<DevAcc>;
-        QueueAcc queueAcc(devAcc);
-
-        std::vector<Elem> const data{0u, 1u, 2u, 3u, 4u, 5u};
-        alpaka::mem::view::ViewPlainPtr<decltype(devHost), const Elem, Dim, Idx> bufHost(data.data(), devHost, extent);
-
-        auto viewConstantMemUninitialized(
-            alpaka::mem::view::createStaticDevMemView(
-                &g_constantMemory2DUninitialized[0u][0u],
-                devAcc,
-                extent));
-
-        alpaka::mem::view::copy(queueAcc, viewConstantMemUninitialized, bufHost, extent);
-        alpaka::wait::wait(queueAcc);
-
-        REQUIRE(fixture(
-            kernel,
-            alpaka::mem::view::getPtrNative(viewConstantMemUninitialized)));
-    }
-#endif
-}
-
-// These forward declarations are only necessary when you want to access those variables
-// from a different compilation unit and should be moved to a common header.
-// Here they are used to silence clang`s -Wmissing-variable-declarations warning
-// that forces every non-static variable to be declared with extern before the are defined.
-extern ALPAKA_STATIC_ACC_MEM_GLOBAL Elem g_globalMemory2DInitialized[3][2];
-extern ALPAKA_STATIC_ACC_MEM_GLOBAL Elem g_globalMemory2DUninitialized[3][2];
-
-ALPAKA_STATIC_ACC_MEM_GLOBAL Elem g_globalMemory2DInitialized[3][2] =
-    {
-        {0u, 1u},
-        {2u, 3u},
-        {4u, 5u}
-    };
-
-ALPAKA_STATIC_ACC_MEM_GLOBAL Elem g_globalMemory2DUninitialized[3][2];
-
-//-----------------------------------------------------------------------------
-TEMPLATE_LIST_TEST_CASE( "staticDeviceMemoryConstant", "[viewStaticAccMem]", TestAccs)
-{
-    using Acc = TestType;
-    using DevAcc = alpaka::dev::Dev<Acc>;
-    using PltfAcc = alpaka::pltf::Pltf<DevAcc>;
-    DevAcc devAcc(alpaka::pltf::getDevByIdx<PltfAcc>(0u));
-
-    alpaka::vec::Vec<Dim, Idx> const extent(3u, 2u);
-
-    alpaka::test::KernelExecutionFixture<Acc> fixture(extent);
-
-    StaticDeviceMemoryTestKernel kernel;
-
-    //-----------------------------------------------------------------------------
-    // FIXME: static device memory in HIP(HCC) is still not working
-#if !BOOST_COMP_HCC && !BOOST_COMP_HIP
-    // initialized static global device memory
-    {
-        auto const viewGlobalMemInitialized(
-            alpaka::mem::view::createStaticDevMemView(
-                &g_globalMemory2DInitialized[0u][0u],
-                devAcc,
-                extent));
-
-        REQUIRE(
-            fixture(
-                kernel,
-                alpaka::mem::view::getPtrNative(viewGlobalMemInitialized)));
-    }
-
-    //-----------------------------------------------------------------------------
-    // uninitialized static global device memory
-    {
-        using PltfHost = alpaka::pltf::PltfCpu;
-        auto devHost(alpaka::pltf::getDevByIdx<PltfHost>(0u));
-
-        using QueueAcc = alpaka::test::queue::DefaultQueue<DevAcc>;
-        QueueAcc queueAcc(devAcc);
-
-        std::vector<Elem> const data{0u, 1u, 2u, 3u, 4u, 5u};
-        alpaka::mem::view::ViewPlainPtr<decltype(devHost), const Elem, Dim, Idx> bufHost(data.data(), devHost, extent);
-
-        auto viewGlobalMemUninitialized(
-            alpaka::mem::view::createStaticDevMemView(
-                &g_globalMemory2DUninitialized[0u][0u],
-                devAcc,
-                extent));
-
-        alpaka::mem::view::copy(queueAcc, viewGlobalMemUninitialized, bufHost, extent);
-        alpaka::wait::wait(queueAcc);
-
-        REQUIRE(
-            fixture(
-                kernel,
-                alpaka::mem::view::getPtrNative(viewGlobalMemUninitialized)));
-    }
-#endif
-}
diff --git a/thirdParty/alpaka/test/unit/mem/view/src/ViewSubViewTest.cpp b/thirdParty/alpaka/test/unit/mem/view/src/ViewSubViewTest.cpp
deleted file mode 100644
index ea6a0f4960..0000000000
--- a/thirdParty/alpaka/test/unit/mem/view/src/ViewSubViewTest.cpp
+++ /dev/null
@@ -1,230 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Erik Zenker
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#include <alpaka/mem/view/ViewSubView.hpp>
-
-#include <alpaka/test/acc/TestAccs.hpp>
-#include <alpaka/test/queue/Queue.hpp>
-#include <alpaka/test/mem/view/ViewTest.hpp>
-#include <alpaka/test/Extent.hpp>
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#include <catch2/catch.hpp>
-
-#include <type_traits>
-#include <numeric>
-
-#if BOOST_COMP_GNUC
-    #pragma GCC diagnostic push
-    #pragma GCC diagnostic ignored "-Wcast-align" // "cast from 'std::uint8_t*' to 'Elem*' increases required alignment of target type"
-#endif
-
-namespace alpaka
-{
-namespace test
-{
-namespace mem
-{
-namespace view
-{
-    //-----------------------------------------------------------------------------
-    template<
-        typename TAcc,
-        typename TDev,
-        typename TElem,
-        typename TDim,
-        typename TIdx,
-        typename TBuf>
-    auto testViewSubViewImmutable(
-        alpaka::mem::view::ViewSubView<TDev, TElem, TDim, TIdx> const & view,
-        TBuf & buf,
-        TDev const & dev,
-        alpaka::vec::Vec<TDim, TIdx> const & extentView,
-        alpaka::vec::Vec<TDim, TIdx> const & offsetView)
-    -> void
-    {
-        //-----------------------------------------------------------------------------
-        alpaka::test::mem::view::testViewImmutable<
-            TElem>(
-                view,
-                dev,
-                extentView,
-                offsetView);
-
-        //-----------------------------------------------------------------------------
-        // alpaka::mem::view::traits::GetPitchBytes
-        // The pitch of the view has to be identical to the pitch of the underlying buffer in all dimensions.
-        {
-            auto const pitchBuf(alpaka::mem::view::getPitchBytesVec(buf));
-            auto const pitchView(alpaka::mem::view::getPitchBytesVec(view));
-
-            for(TIdx i = TDim::value; i > static_cast<TIdx>(0u); --i)
-            {
-                REQUIRE(
-                    pitchBuf[i-static_cast<TIdx>(1u)] ==
-                    pitchView[i-static_cast<TIdx>(1u)]);
-            }
-        }
-
-        //-----------------------------------------------------------------------------
-        // alpaka::mem::view::traits::GetPtrNative
-        // The native pointer has to be exactly the value we calculate here.
-        {
-            auto viewPtrNative(
-                reinterpret_cast<std::uint8_t *>(
-                    alpaka::mem::view::getPtrNative(buf)));
-            auto const pitchBuf(alpaka::mem::view::getPitchBytesVec(buf));
-            for(TIdx i = TDim::value; i > static_cast<TIdx>(0u); --i)
-            {
-                auto const pitch = (i < static_cast<TIdx>(TDim::value)) ? pitchBuf[i] : static_cast<TIdx>(sizeof(TElem));
-                viewPtrNative += offsetView[i - static_cast<TIdx>(1u)] * pitch;
-            }
-            REQUIRE(
-                reinterpret_cast<TElem *>(viewPtrNative) ==
-                alpaka::mem::view::getPtrNative(view));
-        }
-    }
-
-    //-----------------------------------------------------------------------------
-    template<
-        typename TAcc,
-        typename TDev,
-        typename TElem,
-        typename TDim,
-        typename TIdx,
-        typename TBuf>
-    auto testViewSubViewMutable(
-        alpaka::mem::view::ViewSubView<TDev, TElem, TDim, TIdx> & view,
-        TBuf & buf,
-        TDev const & dev,
-        alpaka::vec::Vec<TDim, TIdx> const & extentView,
-        alpaka::vec::Vec<TDim, TIdx> const & offsetView)
-    -> void
-    {
-        //-----------------------------------------------------------------------------
-        testViewSubViewImmutable<
-            TAcc>(
-                view,
-                buf,
-                dev,
-                extentView,
-                offsetView);
-
-        using Queue = alpaka::test::queue::DefaultQueue<TDev>;
-        Queue queue(dev);
-        //-----------------------------------------------------------------------------
-        alpaka::test::mem::view::testViewMutable<
-            TAcc>(
-                queue,
-                view);
-    }
-
-    //-----------------------------------------------------------------------------
-    template<
-        typename TAcc,
-        typename TElem>
-    auto testViewSubViewNoOffset()
-    -> void
-    {
-        using Dev = alpaka::dev::Dev<TAcc>;
-        using Pltf = alpaka::pltf::Pltf<Dev>;
-
-        using Dim = alpaka::dim::Dim<TAcc>;
-        using Idx = alpaka::idx::Idx<TAcc>;
-        using View = alpaka::mem::view::ViewSubView<Dev, TElem, Dim, Idx>;
-
-        Dev const dev(alpaka::pltf::getDevByIdx<Pltf>(0u));
-
-        auto const extentBuf(alpaka::vec::createVecFromIndexedFnWorkaround<Dim, Idx, alpaka::test::CreateExtentBufVal>(Idx()));
-        auto buf(alpaka::mem::buf::alloc<TElem, Idx>(dev, extentBuf));
-
-        auto const extentView(extentBuf);
-        auto const offsetView(alpaka::vec::Vec<Dim, Idx>::all(static_cast<Idx>(0)));
-        View view(buf);
-
-        alpaka::test::mem::view::testViewSubViewMutable<TAcc>(view, buf, dev, extentView, offsetView);
-    }
-
-    //-----------------------------------------------------------------------------
-    template<
-        typename TAcc,
-        typename TElem>
-    auto testViewSubViewOffset()
-    -> void
-    {
-        using Dev = alpaka::dev::Dev<TAcc>;
-        using Pltf = alpaka::pltf::Pltf<Dev>;
-
-        using Dim = alpaka::dim::Dim<TAcc>;
-        using Idx = alpaka::idx::Idx<TAcc>;
-        using View = alpaka::mem::view::ViewSubView<Dev, TElem, Dim, Idx>;
-
-        Dev const dev(alpaka::pltf::getDevByIdx<Pltf>(0u));
-
-        auto const extentBuf(alpaka::vec::createVecFromIndexedFnWorkaround<Dim, Idx, alpaka::test::CreateExtentBufVal>(Idx()));
-        auto buf(alpaka::mem::buf::alloc<TElem, Idx>(dev, extentBuf));
-
-        auto const extentView(alpaka::vec::createVecFromIndexedFnWorkaround<Dim, Idx, alpaka::test::CreateExtentViewVal>(Idx()));
-        auto const offsetView(alpaka::vec::Vec<Dim, Idx>::all(static_cast<Idx>(1)));
-        View view(buf, extentView, offsetView);
-
-        alpaka::test::mem::view::testViewSubViewMutable<TAcc>(view, buf, dev, extentView, offsetView);
-    }
-
-    //-----------------------------------------------------------------------------
-    template<
-        typename TAcc,
-        typename TElem>
-    auto testViewSubViewOffsetConst()
-    -> void
-    {
-        using Dev = alpaka::dev::Dev<TAcc>;
-        using Pltf = alpaka::pltf::Pltf<Dev>;
-
-        using Dim = alpaka::dim::Dim<TAcc>;
-        using Idx = alpaka::idx::Idx<TAcc>;
-        using View = alpaka::mem::view::ViewSubView<Dev, TElem, Dim, Idx>;
-
-        Dev const dev(alpaka::pltf::getDevByIdx<Pltf>(0u));
-
-        auto const extentBuf(alpaka::vec::createVecFromIndexedFnWorkaround<Dim, Idx, CreateExtentBufVal>(Idx()));
-        auto buf(alpaka::mem::buf::alloc<TElem, Idx>(dev, extentBuf));
-
-        auto const extentView(alpaka::vec::createVecFromIndexedFnWorkaround<Dim, Idx, CreateExtentViewVal>(Idx()));
-        auto const offsetView(alpaka::vec::Vec<Dim, Idx>::all(static_cast<Idx>(1)));
-        View const view(buf, extentView, offsetView);
-
-        alpaka::test::mem::view::testViewSubViewImmutable<TAcc>(view, buf, dev, extentView, offsetView);
-    }
-}
-}
-}
-}
-#if BOOST_COMP_GNUC
-    #pragma GCC diagnostic pop
-#endif
-
-//-----------------------------------------------------------------------------
-TEMPLATE_LIST_TEST_CASE( "viewSubViewNoOffsetTest", "[memView]", alpaka::test::acc::TestAccs)
-{
-    alpaka::test::mem::view::testViewSubViewNoOffset<TestType, float>();
-}
-
-//-----------------------------------------------------------------------------
-TEMPLATE_LIST_TEST_CASE( "viewSubViewOffsetTest", "[memView]", alpaka::test::acc::TestAccs)
-{
-    alpaka::test::mem::view::testViewSubViewOffset<TestType, float>();
-}
-
-//-----------------------------------------------------------------------------
-TEMPLATE_LIST_TEST_CASE( "viewSubViewOffsetConstTest", "[memView]", alpaka::test::acc::TestAccs)
-{
-    alpaka::test::mem::view::testViewSubViewOffsetConst<TestType, float>();
-}
diff --git a/thirdParty/alpaka/test/unit/meta/CMakeLists.txt b/thirdParty/alpaka/test/unit/meta/CMakeLists.txt
deleted file mode 100644
index 1c4b5af7cc..0000000000
--- a/thirdParty/alpaka/test/unit/meta/CMakeLists.txt
+++ /dev/null
@@ -1,27 +0,0 @@
-#
-# Copyright 2014-2019 Benjamin Worpitz, Axel Huebl
-#
-# This file is part of Alpaka.
-#
-# This Source Code Form is subject to the terms of the Mozilla Public
-# License, v. 2.0. If a copy of the MPL was not distributed with this
-# file, You can obtain one at http://mozilla.org/MPL/2.0/.
-#
-
-SET(_TARGET_NAME "meta")
-
-append_recursive_files_add_to_src_group("src/" "src/" "cpp" _FILES_SOURCE)
-
-ALPAKA_ADD_EXECUTABLE(
-    ${_TARGET_NAME}
-    ${_FILES_SOURCE})
-TARGET_INCLUDE_DIRECTORIES(
-    ${_TARGET_NAME}
-    PRIVATE ${Boost_INCLUDE_DIRS})
-TARGET_LINK_LIBRARIES(
-    ${_TARGET_NAME}
-    PRIVATE common)
-
-SET_TARGET_PROPERTIES(${_TARGET_NAME} PROPERTIES FOLDER "test/unit")
-
-ADD_TEST(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME} ${_ALPAKA_TEST_OPTIONS})
diff --git a/thirdParty/alpaka/test/unit/meta/src/ApplyTest.cpp b/thirdParty/alpaka/test/unit/meta/src/ApplyTest.cpp
deleted file mode 100644
index 7da938ec7c..0000000000
--- a/thirdParty/alpaka/test/unit/meta/src/ApplyTest.cpp
+++ /dev/null
@@ -1,49 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#include <alpaka/meta/Apply.hpp>
-
-#include <catch2/catch.hpp>
-
-#include <tuple>
-#include <type_traits>
-
-template<
-    typename... T>
-struct TypeList
-{};
-
-//-----------------------------------------------------------------------------
-TEST_CASE("apply", "[meta]")
-{
-    using ApplyInput =
-        std::tuple<
-            int,
-            float,
-            long>;
-
-    using ApplyResult =
-        alpaka::meta::Apply<
-            ApplyInput,
-            TypeList
-        >;
-
-    using ApplyReference =
-        TypeList<
-            int,
-            float,
-            long>;
-
-    static_assert(
-        std::is_same<
-            ApplyReference,
-            ApplyResult
-        >::value,
-        "alpaka::meta::Apply failed!");
-}
diff --git a/thirdParty/alpaka/test/unit/meta/src/ApplyTupleTest.cpp b/thirdParty/alpaka/test/unit/meta/src/ApplyTupleTest.cpp
deleted file mode 100644
index 4670fe85c4..0000000000
--- a/thirdParty/alpaka/test/unit/meta/src/ApplyTupleTest.cpp
+++ /dev/null
@@ -1,77 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#include <alpaka/meta/ApplyTuple.hpp>
-
-#include <catch2/catch.hpp>
-
-//#############################################################################
-struct Foo {
-    Foo(int num) : num_(num) {}
-    auto add(int i) const -> int { return num_ + i; }
-    int num_;
-};
-
-//-----------------------------------------------------------------------------
-auto abs_num(int i) -> int;
-auto abs_num(int i) -> int
-{
-    return std::abs(i);
-}
-
-//#############################################################################
-struct AbsNum {
-    auto operator()(int i) const -> int
-    {
-        return std::abs(i);
-    }
-};
-
-//-----------------------------------------------------------------------------
-TEST_CASE("invoke", "[meta]")
-{
-    // invoke a free function
-    REQUIRE(9 == alpaka::meta::invoke(abs_num, -9));
-
-    // invoke a lambda
-    REQUIRE(42 == alpaka::meta::invoke([]() { return abs_num(-42); }));
-
-    // invoke a member function
-    const Foo foo(-314159);
-    REQUIRE(-314158 == alpaka::meta::invoke(&Foo::add, foo, 1));
-
-    // invoke (access) a data member
-    REQUIRE(-314159 == alpaka::meta::invoke(&Foo::num_, foo));
-
-    // invoke a function object
-    REQUIRE(18 == alpaka::meta::invoke(AbsNum(), -18));
-}
-
-//-----------------------------------------------------------------------------
-auto add(int first, int second) -> int;
-auto add(int first, int second) -> int
-{
-    return first + second;
-}
-
-//-----------------------------------------------------------------------------
-template<typename T>
-T add_generic(T first, T second)
-{
-    return first + second;
-}
-
-//-----------------------------------------------------------------------------
-TEST_CASE("applyTuple", "[meta]")
-{
-    REQUIRE(3 == alpaka::meta::apply(add, std::make_tuple(1, 2)));
-
-    // intended compilation error: template argument deduction/substitution fails
-    // REQUIRE(5.0f == alpaka::meta::apply(add_generic, std::make_tuple(2.0f, 3.0f)));
-}
diff --git a/thirdParty/alpaka/test/unit/meta/src/CartesianProductTest.cpp b/thirdParty/alpaka/test/unit/meta/src/CartesianProductTest.cpp
deleted file mode 100644
index 1ea9a9081c..0000000000
--- a/thirdParty/alpaka/test/unit/meta/src/CartesianProductTest.cpp
+++ /dev/null
@@ -1,55 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#include <alpaka/meta/CartesianProduct.hpp>
-
-#include <alpaka/dim/DimIntegralConst.hpp>
-
-#include <catch2/catch.hpp>
-
-#include <tuple>
-#include <type_traits>
-
-//-----------------------------------------------------------------------------
-TEST_CASE("cartesianProduct", "[meta]")
-{
-    using TestDims =
-        std::tuple<
-            alpaka::dim::DimInt<1u>,
-            alpaka::dim::DimInt<2u>,
-            alpaka::dim::DimInt<3u>>;
-
-    using TestIdxs =
-        std::tuple<
-            std::size_t,
-            std::int64_t>;
-
-    using CartesianProductResult =
-        alpaka::meta::CartesianProduct<
-            std::tuple,
-            TestDims,
-            TestIdxs
-        >;
-
-    using CartesianProductReference =
-        std::tuple<
-            std::tuple<alpaka::dim::DimInt<1u>, std::size_t>,
-            std::tuple<alpaka::dim::DimInt<2u>, std::size_t>,
-            std::tuple<alpaka::dim::DimInt<3u>, std::size_t>,
-            std::tuple<alpaka::dim::DimInt<1u>, std::int64_t>,
-            std::tuple<alpaka::dim::DimInt<2u>, std::int64_t>,
-            std::tuple<alpaka::dim::DimInt<3u>, std::int64_t>>;
-
-    static_assert(
-        std::is_same<
-            CartesianProductReference,
-            CartesianProductResult
-        >::value,
-        "alpaka::meta::CartesianProduct failed!");
-}
diff --git a/thirdParty/alpaka/test/unit/meta/src/ConcatenateTest.cpp b/thirdParty/alpaka/test/unit/meta/src/ConcatenateTest.cpp
deleted file mode 100644
index 2376e7217c..0000000000
--- a/thirdParty/alpaka/test/unit/meta/src/ConcatenateTest.cpp
+++ /dev/null
@@ -1,52 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#include <alpaka/meta/Concatenate.hpp>
-
-#include <catch2/catch.hpp>
-
-#include <string>
-#include <tuple>
-#include <type_traits>
-
-//-----------------------------------------------------------------------------
-TEST_CASE("concatenate", "[meta]")
-{
-    using TestTuple1 =
-        std::tuple<
-            float,
-            int,
-            std::tuple<double, unsigned long>>;
-
-    using TestTuple2 =
-        std::tuple<
-            bool,
-            std::string>;
-
-    using ConcatenateResult =
-        alpaka::meta::Concatenate<
-            TestTuple1,
-            TestTuple2
-        >;
-
-    using ConcatenateReference =
-        std::tuple<
-            float,
-            int,
-            std::tuple<double, unsigned long>,
-            bool,
-            std::string>;
-
-    static_assert(
-        std::is_same<
-            ConcatenateReference,
-            ConcatenateResult
-        >::value,
-        "alpaka::meta::Concatenate failed!");
-}
diff --git a/thirdParty/alpaka/test/unit/meta/src/FilterTest.cpp b/thirdParty/alpaka/test/unit/meta/src/FilterTest.cpp
deleted file mode 100644
index 180d5a333e..0000000000
--- a/thirdParty/alpaka/test/unit/meta/src/FilterTest.cpp
+++ /dev/null
@@ -1,43 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#include <alpaka/meta/Filter.hpp>
-
-#include <catch2/catch.hpp>
-
-#include <tuple>
-#include <type_traits>
-
-//-----------------------------------------------------------------------------
-TEST_CASE("filter", "[meta]")
-{
-    using FilterInput =
-        std::tuple<
-            int,
-            float,
-            long>;
-
-    using FilterResult =
-        alpaka::meta::Filter<
-            FilterInput,
-            std::is_integral
-        >;
-
-    using FilterReference =
-        std::tuple<
-            int,
-            long>;
-
-    static_assert(
-        std::is_same<
-            FilterReference,
-            FilterResult
-        >::value,
-        "alpaka::meta::Filter failed!");
-}
diff --git a/thirdParty/alpaka/test/unit/meta/src/IntegralTest.cpp b/thirdParty/alpaka/test/unit/meta/src/IntegralTest.cpp
deleted file mode 100644
index 41e2b8f2ca..0000000000
--- a/thirdParty/alpaka/test/unit/meta/src/IntegralTest.cpp
+++ /dev/null
@@ -1,1076 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#include <alpaka/meta/Integral.hpp>
-
-#include <catch2/catch.hpp>
-
-#include <type_traits>
-
-//-----------------------------------------------------------------------------
-TEST_CASE("isIntegralSupersetTrue", "[meta]")
-{
-    // unsigned - unsigned
-    static_assert(
-        alpaka::meta::IsIntegralSuperset<std::uint64_t, std::uint64_t>::value,
-        "alpaka::meta::IsIntegralSuperset failed!");
-    static_assert(
-        alpaka::meta::IsIntegralSuperset<std::uint64_t, std::uint32_t>::value,
-        "alpaka::meta::IsIntegralSuperset failed!");
-    static_assert(
-        alpaka::meta::IsIntegralSuperset<std::uint64_t, std::uint16_t>::value,
-        "alpaka::meta::IsIntegralSuperset failed!");
-    static_assert(
-        alpaka::meta::IsIntegralSuperset<std::uint64_t, std::uint8_t>::value,
-        "alpaka::meta::IsIntegralSuperset failed!");
-
-    static_assert(
-        alpaka::meta::IsIntegralSuperset<std::uint32_t, std::uint32_t>::value,
-        "alpaka::meta::IsIntegralSuperset failed!");
-    static_assert(
-        alpaka::meta::IsIntegralSuperset<std::uint32_t, std::uint16_t>::value,
-        "alpaka::meta::IsIntegralSuperset failed!");
-    static_assert(
-        alpaka::meta::IsIntegralSuperset<std::uint32_t, std::uint8_t>::value,
-        "alpaka::meta::IsIntegralSuperset failed!");
-
-    static_assert(
-        alpaka::meta::IsIntegralSuperset<std::uint16_t, std::uint16_t>::value,
-        "alpaka::meta::IsIntegralSuperset failed!");
-    static_assert(
-        alpaka::meta::IsIntegralSuperset<std::uint16_t, std::uint8_t>::value,
-        "alpaka::meta::IsIntegralSuperset failed!");
-
-    static_assert(
-        alpaka::meta::IsIntegralSuperset<std::uint8_t, std::uint8_t>::value,
-        "alpaka::meta::IsIntegralSuperset failed!");
-
-    // signed - signed
-    static_assert(
-        alpaka::meta::IsIntegralSuperset<std::int64_t, std::int64_t>::value,
-        "alpaka::meta::IsIntegralSuperset failed!");
-    static_assert(
-        alpaka::meta::IsIntegralSuperset<std::int64_t, std::int32_t>::value,
-        "alpaka::meta::IsIntegralSuperset failed!");
-    static_assert(
-        alpaka::meta::IsIntegralSuperset<std::int64_t, std::int16_t>::value,
-        "alpaka::meta::IsIntegralSuperset failed!");
-    static_assert(
-        alpaka::meta::IsIntegralSuperset<std::int64_t, std::int8_t>::value,
-        "alpaka::meta::IsIntegralSuperset failed!");
-
-    static_assert(
-        alpaka::meta::IsIntegralSuperset<std::int32_t, std::int32_t>::value,
-        "alpaka::meta::IsIntegralSuperset failed!");
-    static_assert(
-        alpaka::meta::IsIntegralSuperset<std::int32_t, std::int16_t>::value,
-        "alpaka::meta::IsIntegralSuperset failed!");
-    static_assert(
-        alpaka::meta::IsIntegralSuperset<std::int32_t, std::int8_t>::value,
-        "alpaka::meta::IsIntegralSuperset failed!");
-
-    static_assert(
-        alpaka::meta::IsIntegralSuperset<std::int16_t, std::int16_t>::value,
-        "alpaka::meta::IsIntegralSuperset failed!");
-    static_assert(
-        alpaka::meta::IsIntegralSuperset<std::int16_t, std::int8_t>::value,
-        "alpaka::meta::IsIntegralSuperset failed!");
-
-    static_assert(
-        alpaka::meta::IsIntegralSuperset<std::int8_t, std::int8_t>::value,
-        "alpaka::meta::IsIntegralSuperset failed!");
-
-    // unsigned - signed
-    static_assert(
-        alpaka::meta::IsIntegralSuperset<std::uint64_t, std::int32_t>::value,
-        "alpaka::meta::IsIntegralSuperset failed!");
-    static_assert(
-        alpaka::meta::IsIntegralSuperset<std::uint64_t, std::int16_t>::value,
-        "alpaka::meta::IsIntegralSuperset failed!");
-    static_assert(
-        alpaka::meta::IsIntegralSuperset<std::uint64_t, std::int8_t>::value,
-        "alpaka::meta::IsIntegralSuperset failed!");
-
-    static_assert(
-        alpaka::meta::IsIntegralSuperset<std::uint32_t, std::int16_t>::value,
-        "alpaka::meta::IsIntegralSuperset failed!");
-    static_assert(
-        alpaka::meta::IsIntegralSuperset<std::uint32_t, std::int8_t>::value,
-        "alpaka::meta::IsIntegralSuperset failed!");
-
-    static_assert(
-        alpaka::meta::IsIntegralSuperset<std::uint16_t, std::int8_t>::value,
-        "alpaka::meta::IsIntegralSuperset failed!");
-
-    // signed - unsigned
-    static_assert(
-        alpaka::meta::IsIntegralSuperset<std::int64_t, std::uint32_t>::value,
-        "alpaka::meta::IsIntegralSuperset failed!");
-    static_assert(
-        alpaka::meta::IsIntegralSuperset<std::int64_t, std::uint16_t>::value,
-        "alpaka::meta::IsIntegralSuperset failed!");
-    static_assert(
-        alpaka::meta::IsIntegralSuperset<std::int64_t, std::uint8_t>::value,
-        "alpaka::meta::IsIntegralSuperset failed!");
-
-    static_assert(
-        alpaka::meta::IsIntegralSuperset<std::int32_t, std::uint16_t>::value,
-        "alpaka::meta::IsIntegralSuperset failed!");
-    static_assert(
-        alpaka::meta::IsIntegralSuperset<std::int32_t, std::uint8_t>::value,
-        "alpaka::meta::IsIntegralSuperset failed!");
-
-    static_assert(
-        alpaka::meta::IsIntegralSuperset<std::int16_t, std::uint8_t>::value,
-        "alpaka::meta::IsIntegralSuperset failed!");
-}
-
-//-----------------------------------------------------------------------------
-TEST_CASE("isIntegralSupersetNoIntegral", "[meta]")
-{
-    static_assert(
-        !alpaka::meta::IsIntegralSuperset<float, std::uint8_t>::value,
-        "alpaka::meta::IsIntegralSuperset failed!");
-    static_assert(
-        !alpaka::meta::IsIntegralSuperset<std::uint64_t, double>::value,
-        "alpaka::meta::IsIntegralSuperset failed!");
-}
-
-//-----------------------------------------------------------------------------
-TEST_CASE("isIntegralSupersetFalse", "[meta]")
-{
-    // unsigned - unsigned
-    static_assert(
-        !alpaka::meta::IsIntegralSuperset<std::uint8_t, std::uint64_t>::value,
-        "alpaka::meta::IsIntegralSuperset failed!");
-    static_assert(
-        !alpaka::meta::IsIntegralSuperset<std::uint8_t, std::uint32_t>::value,
-        "alpaka::meta::IsIntegralSuperset failed!");
-    static_assert(
-        !alpaka::meta::IsIntegralSuperset<std::uint8_t, std::uint16_t>::value,
-        "alpaka::meta::IsIntegralSuperset failed!");
-
-    static_assert(
-        !alpaka::meta::IsIntegralSuperset<std::uint16_t, std::uint64_t>::value,
-        "alpaka::meta::IsIntegralSuperset failed!");
-    static_assert(
-        !alpaka::meta::IsIntegralSuperset<std::uint16_t, std::uint32_t>::value,
-        "alpaka::meta::IsIntegralSuperset failed!");
-
-    static_assert(
-        !alpaka::meta::IsIntegralSuperset<std::uint32_t, std::uint64_t>::value,
-        "alpaka::meta::IsIntegralSuperset failed!");
-
-    // signed - signed
-    static_assert(
-        !alpaka::meta::IsIntegralSuperset<std::int8_t, std::int64_t>::value,
-        "alpaka::meta::IsIntegralSuperset failed!");
-    static_assert(
-        !alpaka::meta::IsIntegralSuperset<std::int8_t, std::int32_t>::value,
-        "alpaka::meta::IsIntegralSuperset failed!");
-    static_assert(
-        !alpaka::meta::IsIntegralSuperset<std::int8_t, std::int16_t>::value,
-        "alpaka::meta::IsIntegralSuperset failed!");
-
-    static_assert(
-        !alpaka::meta::IsIntegralSuperset<std::int16_t, std::int64_t>::value,
-        "alpaka::meta::IsIntegralSuperset failed!");
-    static_assert(
-        !alpaka::meta::IsIntegralSuperset<std::int16_t, std::int32_t>::value,
-        "alpaka::meta::IsIntegralSuperset failed!");
-
-    static_assert(
-        !alpaka::meta::IsIntegralSuperset<std::int32_t, std::int64_t>::value,
-        "alpaka::meta::IsIntegralSuperset failed!");
-
-    // unsigned - signed
-    static_assert(
-        !alpaka::meta::IsIntegralSuperset<std::uint64_t, std::int64_t>::value,
-        "alpaka::meta::IsIntegralSuperset failed!");
-
-    static_assert(
-        !alpaka::meta::IsIntegralSuperset<std::uint32_t, std::int64_t>::value,
-        "alpaka::meta::IsIntegralSuperset failed!");
-    static_assert(
-        !alpaka::meta::IsIntegralSuperset<std::uint32_t, std::int32_t>::value,
-        "alpaka::meta::IsIntegralSuperset failed!");
-
-    static_assert(
-        !alpaka::meta::IsIntegralSuperset<std::uint16_t, std::int64_t>::value,
-        "alpaka::meta::IsIntegralSuperset failed!");
-    static_assert(
-        !alpaka::meta::IsIntegralSuperset<std::uint16_t, std::int32_t>::value,
-        "alpaka::meta::IsIntegralSuperset failed!");
-    static_assert(
-        !alpaka::meta::IsIntegralSuperset<std::uint16_t, std::int16_t>::value,
-        "alpaka::meta::IsIntegralSuperset failed!");
-
-    static_assert(
-        !alpaka::meta::IsIntegralSuperset<std::uint8_t, std::int64_t>::value,
-        "alpaka::meta::IsIntegralSuperset failed!");
-    static_assert(
-        !alpaka::meta::IsIntegralSuperset<std::uint8_t, std::int32_t>::value,
-        "alpaka::meta::IsIntegralSuperset failed!");
-    static_assert(
-        !alpaka::meta::IsIntegralSuperset<std::uint8_t, std::int16_t>::value,
-        "alpaka::meta::IsIntegralSuperset failed!");
-    static_assert(
-        !alpaka::meta::IsIntegralSuperset<std::uint8_t, std::int8_t>::value,
-        "alpaka::meta::IsIntegralSuperset failed!");
-
-    // signed - unsigned
-    static_assert(
-        !alpaka::meta::IsIntegralSuperset<std::int64_t, std::uint64_t>::value,
-        "alpaka::meta::IsIntegralSuperset failed!");
-
-    static_assert(
-        !alpaka::meta::IsIntegralSuperset<std::int32_t, std::uint64_t>::value,
-        "alpaka::meta::IsIntegralSuperset failed!");
-    static_assert(
-        !alpaka::meta::IsIntegralSuperset<std::int32_t, std::uint32_t>::value,
-        "alpaka::meta::IsIntegralSuperset failed!");
-
-    static_assert(
-        !alpaka::meta::IsIntegralSuperset<std::int16_t, std::uint64_t>::value,
-        "alpaka::meta::IsIntegralSuperset failed!");
-    static_assert(
-        !alpaka::meta::IsIntegralSuperset<std::int16_t, std::uint32_t>::value,
-        "alpaka::meta::IsIntegralSuperset failed!");
-    static_assert(
-        !alpaka::meta::IsIntegralSuperset<std::int16_t, std::uint16_t>::value,
-        "alpaka::meta::IsIntegralSuperset failed!");
-
-    static_assert(
-        !alpaka::meta::IsIntegralSuperset<std::int8_t, std::uint64_t>::value,
-        "alpaka::meta::IsIntegralSuperset failed!");
-    static_assert(
-        !alpaka::meta::IsIntegralSuperset<std::int8_t, std::uint32_t>::value,
-        "alpaka::meta::IsIntegralSuperset failed!");
-    static_assert(
-        !alpaka::meta::IsIntegralSuperset<std::int8_t, std::uint16_t>::value,
-        "alpaka::meta::IsIntegralSuperset failed!");
-    static_assert(
-        !alpaka::meta::IsIntegralSuperset<std::int8_t, std::uint8_t>::value,
-        "alpaka::meta::IsIntegralSuperset failed!");
-}
-
-//-----------------------------------------------------------------------------
-TEST_CASE("higherMax", "[meta]")
-{
-    static_assert(
-        std::is_same<alpaka::meta::HigherMax<std::int8_t, std::int8_t>, std::int8_t>::value,
-        "alpaka::meta::HigherMax failed!");
-    static_assert(
-        std::is_same<alpaka::meta::HigherMax<std::int8_t, std::uint8_t>, std::uint8_t>::value,
-        "alpaka::meta::HigherMax failed!");
-    static_assert(
-        std::is_same<alpaka::meta::HigherMax<std::int8_t, std::int16_t>, std::int16_t>::value,
-        "alpaka::meta::HigherMax failed!");
-    static_assert(
-        std::is_same<alpaka::meta::HigherMax<std::int8_t, std::uint16_t>, std::uint16_t>::value,
-        "alpaka::meta::HigherMax failed!");
-    static_assert(
-        std::is_same<alpaka::meta::HigherMax<std::int8_t, std::int32_t>, std::int32_t>::value,
-        "alpaka::meta::HigherMax failed!");
-    static_assert(
-        std::is_same<alpaka::meta::HigherMax<std::int8_t, std::uint32_t>, std::uint32_t>::value,
-        "alpaka::meta::HigherMax failed!");
-    static_assert(
-        std::is_same<alpaka::meta::HigherMax<std::int8_t, std::int64_t>, std::int64_t>::value,
-        "alpaka::meta::HigherMax failed!");
-    static_assert(
-        std::is_same<alpaka::meta::HigherMax<std::int8_t, std::uint64_t>, std::uint64_t>::value,
-        "alpaka::meta::HigherMax failed!");
-
-    static_assert(
-        std::is_same<alpaka::meta::HigherMax<std::uint8_t, std::int8_t>, std::uint8_t>::value,
-        "alpaka::meta::HigherMax failed!");
-    static_assert(
-        std::is_same<alpaka::meta::HigherMax<std::uint8_t, std::uint8_t>, std::uint8_t>::value,
-        "alpaka::meta::HigherMax failed!");
-    static_assert(
-        std::is_same<alpaka::meta::HigherMax<std::uint8_t, std::int16_t>, std::int16_t>::value,
-        "alpaka::meta::HigherMax failed!");
-    static_assert(
-        std::is_same<alpaka::meta::HigherMax<std::uint8_t, std::uint16_t>, std::uint16_t>::value,
-        "alpaka::meta::HigherMax failed!");
-    static_assert(
-        std::is_same<alpaka::meta::HigherMax<std::uint8_t, std::int32_t>, std::int32_t>::value,
-        "alpaka::meta::HigherMax failed!");
-    static_assert(
-        std::is_same<alpaka::meta::HigherMax<std::uint8_t, std::uint32_t>, std::uint32_t>::value,
-        "alpaka::meta::HigherMax failed!");
-    static_assert(
-        std::is_same<alpaka::meta::HigherMax<std::uint8_t, std::int64_t>, std::int64_t>::value,
-        "alpaka::meta::HigherMax failed!");
-    static_assert(
-        std::is_same<alpaka::meta::HigherMax<std::uint8_t, std::uint64_t>, std::uint64_t>::value,
-        "alpaka::meta::HigherMax failed!");
-
-    static_assert(
-        std::is_same<alpaka::meta::HigherMax<std::int16_t, std::int8_t>, std::int16_t>::value,
-        "alpaka::meta::HigherMax failed!");
-    static_assert(
-        std::is_same<alpaka::meta::HigherMax<std::int16_t, std::uint8_t>, std::int16_t>::value,
-        "alpaka::meta::HigherMax failed!");
-    static_assert(
-        std::is_same<alpaka::meta::HigherMax<std::int16_t, std::int16_t>, std::int16_t>::value,
-        "alpaka::meta::HigherMax failed!");
-    static_assert(
-        std::is_same<alpaka::meta::HigherMax<std::int16_t, std::uint16_t>, std::uint16_t>::value,
-        "alpaka::meta::HigherMax failed!");
-    static_assert(
-        std::is_same<alpaka::meta::HigherMax<std::int16_t, std::int32_t>, std::int32_t>::value,
-        "alpaka::meta::HigherMax failed!");
-    static_assert(
-        std::is_same<alpaka::meta::HigherMax<std::int16_t, std::uint32_t>, std::uint32_t>::value,
-        "alpaka::meta::HigherMax failed!");
-    static_assert(
-        std::is_same<alpaka::meta::HigherMax<std::int16_t, std::int64_t>, std::int64_t>::value,
-        "alpaka::meta::HigherMax failed!");
-    static_assert(
-        std::is_same<alpaka::meta::HigherMax<std::int16_t, std::uint64_t>, std::uint64_t>::value,
-        "alpaka::meta::HigherMax failed!");
-
-    static_assert(
-        std::is_same<alpaka::meta::HigherMax<std::uint16_t, std::int8_t>, std::uint16_t>::value,
-        "alpaka::meta::HigherMax failed!");
-    static_assert(
-        std::is_same<alpaka::meta::HigherMax<std::uint16_t, std::uint8_t>, std::uint16_t>::value,
-        "alpaka::meta::HigherMax failed!");
-    static_assert(
-        std::is_same<alpaka::meta::HigherMax<std::uint16_t, std::int16_t>, std::uint16_t>::value,
-        "alpaka::meta::HigherMax failed!");
-    static_assert(
-        std::is_same<alpaka::meta::HigherMax<std::uint16_t, std::uint16_t>, std::uint16_t>::value,
-        "alpaka::meta::HigherMax failed!");
-    static_assert(
-        std::is_same<alpaka::meta::HigherMax<std::uint16_t, std::int32_t>, std::int32_t>::value,
-        "alpaka::meta::HigherMax failed!");
-    static_assert(
-        std::is_same<alpaka::meta::HigherMax<std::uint16_t, std::uint32_t>, std::uint32_t>::value,
-        "alpaka::meta::HigherMax failed!");
-    static_assert(
-        std::is_same<alpaka::meta::HigherMax<std::uint16_t, std::int64_t>, std::int64_t>::value,
-        "alpaka::meta::HigherMax failed!");
-    static_assert(
-        std::is_same<alpaka::meta::HigherMax<std::uint16_t, std::uint64_t>, std::uint64_t>::value,
-        "alpaka::meta::HigherMax failed!");
-
-    static_assert(
-        std::is_same<alpaka::meta::HigherMax<std::int32_t, std::int8_t>, std::int32_t>::value,
-        "alpaka::meta::HigherMax failed!");
-    static_assert(
-        std::is_same<alpaka::meta::HigherMax<std::int32_t, std::uint8_t>, std::int32_t>::value,
-        "alpaka::meta::HigherMax failed!");
-    static_assert(
-        std::is_same<alpaka::meta::HigherMax<std::int32_t, std::int16_t>, std::int32_t>::value,
-        "alpaka::meta::HigherMax failed!");
-    static_assert(
-        std::is_same<alpaka::meta::HigherMax<std::int32_t, std::uint16_t>, std::int32_t>::value,
-        "alpaka::meta::HigherMax failed!");
-    static_assert(
-        std::is_same<alpaka::meta::HigherMax<std::int32_t, std::int32_t>, std::int32_t>::value,
-        "alpaka::meta::HigherMax failed!");
-    static_assert(
-        std::is_same<alpaka::meta::HigherMax<std::int32_t, std::uint32_t>, std::uint32_t>::value,
-        "alpaka::meta::HigherMax failed!");
-    static_assert(
-        std::is_same<alpaka::meta::HigherMax<std::int32_t, std::int64_t>, std::int64_t>::value,
-        "alpaka::meta::HigherMax failed!");
-    static_assert(
-        std::is_same<alpaka::meta::HigherMax<std::int32_t, std::uint64_t>, std::uint64_t>::value,
-        "alpaka::meta::HigherMax failed!");
-
-    static_assert(
-        std::is_same<alpaka::meta::HigherMax<std::uint32_t, std::int8_t>, std::uint32_t>::value,
-        "alpaka::meta::HigherMax failed!");
-    static_assert(
-        std::is_same<alpaka::meta::HigherMax<std::uint32_t, std::uint8_t>, std::uint32_t>::value,
-        "alpaka::meta::HigherMax failed!");
-    static_assert(
-        std::is_same<alpaka::meta::HigherMax<std::uint32_t, std::int16_t>, std::uint32_t>::value,
-        "alpaka::meta::HigherMax failed!");
-    static_assert(
-        std::is_same<alpaka::meta::HigherMax<std::uint32_t, std::uint16_t>, std::uint32_t>::value,
-        "alpaka::meta::HigherMax failed!");
-    static_assert(
-        std::is_same<alpaka::meta::HigherMax<std::uint32_t, std::int32_t>, std::uint32_t>::value,
-        "alpaka::meta::HigherMax failed!");
-    static_assert(
-        std::is_same<alpaka::meta::HigherMax<std::uint32_t, std::uint32_t>, std::uint32_t>::value,
-        "alpaka::meta::HigherMax failed!");
-    static_assert(
-        std::is_same<alpaka::meta::HigherMax<std::uint32_t, std::int64_t>, std::int64_t>::value,
-        "alpaka::meta::HigherMax failed!");
-    static_assert(
-        std::is_same<alpaka::meta::HigherMax<std::uint32_t, std::uint64_t>, std::uint64_t>::value,
-        "alpaka::meta::HigherMax failed!");
-
-    static_assert(
-        std::is_same<alpaka::meta::HigherMax<std::int64_t, std::int8_t>, std::int64_t>::value,
-        "alpaka::meta::HigherMax failed!");
-    static_assert(
-        std::is_same<alpaka::meta::HigherMax<std::int64_t, std::uint8_t>, std::int64_t>::value,
-        "alpaka::meta::HigherMax failed!");
-    static_assert(
-        std::is_same<alpaka::meta::HigherMax<std::int64_t, std::int16_t>, std::int64_t>::value,
-        "alpaka::meta::HigherMax failed!");
-    static_assert(
-        std::is_same<alpaka::meta::HigherMax<std::int64_t, std::uint16_t>, std::int64_t>::value,
-        "alpaka::meta::HigherMax failed!");
-    static_assert(
-        std::is_same<alpaka::meta::HigherMax<std::int64_t, std::int32_t>, std::int64_t>::value,
-        "alpaka::meta::HigherMax failed!");
-    static_assert(
-        std::is_same<alpaka::meta::HigherMax<std::int64_t, std::uint32_t>, std::int64_t>::value,
-        "alpaka::meta::HigherMax failed!");
-    static_assert(
-        std::is_same<alpaka::meta::HigherMax<std::int64_t, std::int64_t>, std::int64_t>::value,
-        "alpaka::meta::HigherMax failed!");
-    static_assert(
-        std::is_same<alpaka::meta::HigherMax<std::int64_t, std::uint64_t>, std::uint64_t>::value,
-        "alpaka::meta::HigherMax failed!");
-
-    static_assert(
-        std::is_same<alpaka::meta::HigherMax<std::uint64_t, std::int8_t>, std::uint64_t>::value,
-        "alpaka::meta::HigherMax failed!");
-    static_assert(
-        std::is_same<alpaka::meta::HigherMax<std::uint64_t, std::uint8_t>, std::uint64_t>::value,
-        "alpaka::meta::HigherMax failed!");
-    static_assert(
-        std::is_same<alpaka::meta::HigherMax<std::uint64_t, std::int16_t>, std::uint64_t>::value,
-        "alpaka::meta::HigherMax failed!");
-    static_assert(
-        std::is_same<alpaka::meta::HigherMax<std::uint64_t, std::uint16_t>, std::uint64_t>::value,
-        "alpaka::meta::HigherMax failed!");
-    static_assert(
-        std::is_same<alpaka::meta::HigherMax<std::uint64_t, std::int32_t>, std::uint64_t>::value,
-        "alpaka::meta::HigherMax failed!");
-    static_assert(
-        std::is_same<alpaka::meta::HigherMax<std::uint64_t, std::uint32_t>, std::uint64_t>::value,
-        "alpaka::meta::HigherMax failed!");
-    static_assert(
-        std::is_same<alpaka::meta::HigherMax<std::uint64_t, std::int64_t>, std::uint64_t>::value,
-        "alpaka::meta::HigherMax failed!");
-    static_assert(
-        std::is_same<alpaka::meta::HigherMax<std::uint64_t, std::uint64_t>, std::uint64_t>::value,
-        "alpaka::meta::HigherMax failed!");
-}
-
-//-----------------------------------------------------------------------------
-TEST_CASE("lowerMax", "[meta]")
-{
-    static_assert(
-        std::is_same<alpaka::meta::LowerMax<std::int8_t, std::int8_t>, std::int8_t>::value,
-        "alpaka::meta::LowerMax failed!");
-    static_assert(
-        std::is_same<alpaka::meta::LowerMax<std::int8_t, std::uint8_t>, std::int8_t>::value,
-        "alpaka::meta::LowerMax failed!");
-    static_assert(
-        std::is_same<alpaka::meta::LowerMax<std::int8_t, std::int16_t>, std::int8_t>::value,
-        "alpaka::meta::LowerMax failed!");
-    static_assert(
-        std::is_same<alpaka::meta::LowerMax<std::int8_t, std::uint16_t>, std::int8_t>::value,
-        "alpaka::meta::LowerMax failed!");
-    static_assert(
-        std::is_same<alpaka::meta::LowerMax<std::int8_t, std::int32_t>, std::int8_t>::value,
-        "alpaka::meta::LowerMax failed!");
-    static_assert(
-        std::is_same<alpaka::meta::LowerMax<std::int8_t, std::uint32_t>, std::int8_t>::value,
-        "alpaka::meta::LowerMax failed!");
-    static_assert(
-        std::is_same<alpaka::meta::LowerMax<std::int8_t, std::int64_t>, std::int8_t>::value,
-        "alpaka::meta::LowerMax failed!");
-    static_assert(
-        std::is_same<alpaka::meta::LowerMax<std::int8_t, std::uint64_t>, std::int8_t>::value,
-        "alpaka::meta::LowerMax failed!");
-
-    static_assert(
-        std::is_same<alpaka::meta::LowerMax<std::uint8_t, std::int8_t>, std::int8_t>::value,
-        "alpaka::meta::LowerMax failed!");
-    static_assert(
-        std::is_same<alpaka::meta::LowerMax<std::uint8_t, std::uint8_t>, std::uint8_t>::value,
-        "alpaka::meta::LowerMax failed!");
-    static_assert(
-        std::is_same<alpaka::meta::LowerMax<std::uint8_t, std::int16_t>, std::uint8_t>::value,
-        "alpaka::meta::LowerMax failed!");
-    static_assert(
-        std::is_same<alpaka::meta::LowerMax<std::uint8_t, std::uint16_t>, std::uint8_t>::value,
-        "alpaka::meta::LowerMax failed!");
-    static_assert(
-        std::is_same<alpaka::meta::LowerMax<std::uint8_t, std::int32_t>, std::uint8_t>::value,
-        "alpaka::meta::LowerMax failed!");
-    static_assert(
-        std::is_same<alpaka::meta::LowerMax<std::uint8_t, std::uint32_t>, std::uint8_t>::value,
-        "alpaka::meta::LowerMax failed!");
-    static_assert(
-        std::is_same<alpaka::meta::LowerMax<std::uint8_t, std::int64_t>, std::uint8_t>::value,
-        "alpaka::meta::LowerMax failed!");
-    static_assert(
-        std::is_same<alpaka::meta::LowerMax<std::uint8_t, std::uint64_t>, std::uint8_t>::value,
-        "alpaka::meta::LowerMax failed!");
-
-    static_assert(
-        std::is_same<alpaka::meta::LowerMax<std::int16_t, std::int8_t>, std::int8_t>::value,
-        "alpaka::meta::LowerMax failed!");
-    static_assert(
-        std::is_same<alpaka::meta::LowerMax<std::int16_t, std::uint8_t>, std::uint8_t>::value,
-        "alpaka::meta::LowerMax failed!");
-    static_assert(
-        std::is_same<alpaka::meta::LowerMax<std::int16_t, std::int16_t>, std::int16_t>::value,
-        "alpaka::meta::LowerMax failed!");
-    static_assert(
-        std::is_same<alpaka::meta::LowerMax<std::int16_t, std::uint16_t>, std::int16_t>::value,
-        "alpaka::meta::LowerMax failed!");
-    static_assert(
-        std::is_same<alpaka::meta::LowerMax<std::int16_t, std::int32_t>, std::int16_t>::value,
-        "alpaka::meta::LowerMax failed!");
-    static_assert(
-        std::is_same<alpaka::meta::LowerMax<std::int16_t, std::uint32_t>, std::int16_t>::value,
-        "alpaka::meta::LowerMax failed!");
-    static_assert(
-        std::is_same<alpaka::meta::LowerMax<std::int16_t, std::int64_t>, std::int16_t>::value,
-        "alpaka::meta::LowerMax failed!");
-    static_assert(
-        std::is_same<alpaka::meta::LowerMax<std::int16_t, std::uint64_t>, std::int16_t>::value,
-        "alpaka::meta::LowerMax failed!");
-
-    static_assert(
-        std::is_same<alpaka::meta::LowerMax<std::uint16_t, std::int8_t>, std::int8_t>::value,
-        "alpaka::meta::LowerMax failed!");
-    static_assert(
-        std::is_same<alpaka::meta::LowerMax<std::uint16_t, std::uint8_t>, std::uint8_t>::value,
-        "alpaka::meta::LowerMax failed!");
-    static_assert(
-        std::is_same<alpaka::meta::LowerMax<std::uint16_t, std::int16_t>, std::int16_t>::value,
-        "alpaka::meta::LowerMax failed!");
-    static_assert(
-        std::is_same<alpaka::meta::LowerMax<std::uint16_t, std::uint16_t>, std::uint16_t>::value,
-        "alpaka::meta::LowerMax failed!");
-    static_assert(
-        std::is_same<alpaka::meta::LowerMax<std::uint16_t, std::int32_t>, std::uint16_t>::value,
-        "alpaka::meta::LowerMax failed!");
-    static_assert(
-        std::is_same<alpaka::meta::LowerMax<std::uint16_t, std::uint32_t>, std::uint16_t>::value,
-        "alpaka::meta::LowerMax failed!");
-    static_assert(
-        std::is_same<alpaka::meta::LowerMax<std::uint16_t, std::int64_t>, std::uint16_t>::value,
-        "alpaka::meta::LowerMax failed!");
-    static_assert(
-        std::is_same<alpaka::meta::LowerMax<std::uint16_t, std::uint64_t>, std::uint16_t>::value,
-        "alpaka::meta::LowerMax failed!");
-
-    static_assert(
-        std::is_same<alpaka::meta::LowerMax<std::int32_t, std::int8_t>, std::int8_t>::value,
-        "alpaka::meta::LowerMax failed!");
-    static_assert(
-        std::is_same<alpaka::meta::LowerMax<std::int32_t, std::uint8_t>, std::uint8_t>::value,
-        "alpaka::meta::LowerMax failed!");
-    static_assert(
-        std::is_same<alpaka::meta::LowerMax<std::int32_t, std::int16_t>, std::int16_t>::value,
-        "alpaka::meta::LowerMax failed!");
-    static_assert(
-        std::is_same<alpaka::meta::LowerMax<std::int32_t, std::uint16_t>, std::uint16_t>::value,
-        "alpaka::meta::LowerMax failed!");
-    static_assert(
-        std::is_same<alpaka::meta::LowerMax<std::int32_t, std::int32_t>, std::int32_t>::value,
-        "alpaka::meta::LowerMax failed!");
-    static_assert(
-        std::is_same<alpaka::meta::LowerMax<std::int32_t, std::uint32_t>, std::int32_t>::value,
-        "alpaka::meta::LowerMax failed!");
-    static_assert(
-        std::is_same<alpaka::meta::LowerMax<std::int32_t, std::int64_t>, std::int32_t>::value,
-        "alpaka::meta::LowerMax failed!");
-    static_assert(
-        std::is_same<alpaka::meta::LowerMax<std::int32_t, std::uint64_t>, std::int32_t>::value,
-        "alpaka::meta::LowerMax failed!");
-
-    static_assert(
-        std::is_same<alpaka::meta::LowerMax<std::uint32_t, std::int8_t>, std::int8_t>::value,
-        "alpaka::meta::LowerMax failed!");
-    static_assert(
-        std::is_same<alpaka::meta::LowerMax<std::uint32_t, std::uint8_t>, std::uint8_t>::value,
-        "alpaka::meta::LowerMax failed!");
-    static_assert(
-        std::is_same<alpaka::meta::LowerMax<std::uint32_t, std::int16_t>, std::int16_t>::value,
-        "alpaka::meta::LowerMax failed!");
-    static_assert(
-        std::is_same<alpaka::meta::LowerMax<std::uint32_t, std::uint16_t>, std::uint16_t>::value,
-        "alpaka::meta::LowerMax failed!");
-    static_assert(
-        std::is_same<alpaka::meta::LowerMax<std::uint32_t, std::int32_t>, std::int32_t>::value,
-        "alpaka::meta::LowerMax failed!");
-    static_assert(
-        std::is_same<alpaka::meta::LowerMax<std::uint32_t, std::uint32_t>, std::uint32_t>::value,
-        "alpaka::meta::LowerMax failed!");
-    static_assert(
-        std::is_same<alpaka::meta::LowerMax<std::uint32_t, std::int64_t>, std::uint32_t>::value,
-        "alpaka::meta::LowerMax failed!");
-    static_assert(
-        std::is_same<alpaka::meta::LowerMax<std::uint32_t, std::uint64_t>, std::uint32_t>::value,
-        "alpaka::meta::LowerMax failed!");
-
-    static_assert(
-        std::is_same<alpaka::meta::LowerMax<std::int64_t, std::int8_t>, std::int8_t>::value,
-        "alpaka::meta::LowerMax failed!");
-    static_assert(
-        std::is_same<alpaka::meta::LowerMax<std::int64_t, std::uint8_t>, std::uint8_t>::value,
-        "alpaka::meta::LowerMax failed!");
-    static_assert(
-        std::is_same<alpaka::meta::LowerMax<std::int64_t, std::int16_t>, std::int16_t>::value,
-        "alpaka::meta::LowerMax failed!");
-    static_assert(
-        std::is_same<alpaka::meta::LowerMax<std::int64_t, std::uint16_t>, std::uint16_t>::value,
-        "alpaka::meta::LowerMax failed!");
-    static_assert(
-        std::is_same<alpaka::meta::LowerMax<std::int64_t, std::int32_t>, std::int32_t>::value,
-        "alpaka::meta::LowerMax failed!");
-    static_assert(
-        std::is_same<alpaka::meta::LowerMax<std::int64_t, std::uint32_t>, std::uint32_t>::value,
-        "alpaka::meta::LowerMax failed!");
-    static_assert(
-        std::is_same<alpaka::meta::LowerMax<std::int64_t, std::int64_t>, std::int64_t>::value,
-        "alpaka::meta::LowerMax failed!");
-    static_assert(
-        std::is_same<alpaka::meta::LowerMax<std::int64_t, std::uint64_t>, std::int64_t>::value,
-        "alpaka::meta::LowerMax failed!");
-
-    static_assert(
-        std::is_same<alpaka::meta::LowerMax<std::uint64_t, std::int8_t>, std::int8_t>::value,
-        "alpaka::meta::LowerMax failed!");
-    static_assert(
-        std::is_same<alpaka::meta::LowerMax<std::uint64_t, std::uint8_t>, std::uint8_t>::value,
-        "alpaka::meta::LowerMax failed!");
-    static_assert(
-        std::is_same<alpaka::meta::LowerMax<std::uint64_t, std::int16_t>, std::int16_t>::value,
-        "alpaka::meta::LowerMax failed!");
-    static_assert(
-        std::is_same<alpaka::meta::LowerMax<std::uint64_t, std::uint16_t>, std::uint16_t>::value,
-        "alpaka::meta::LowerMax failed!");
-    static_assert(
-        std::is_same<alpaka::meta::LowerMax<std::uint64_t, std::int32_t>, std::int32_t>::value,
-        "alpaka::meta::LowerMax failed!");
-    static_assert(
-        std::is_same<alpaka::meta::LowerMax<std::uint64_t, std::uint32_t>, std::uint32_t>::value,
-        "alpaka::meta::LowerMax failed!");
-    static_assert(
-        std::is_same<alpaka::meta::LowerMax<std::uint64_t, std::int64_t>, std::int64_t>::value,
-        "alpaka::meta::LowerMax failed!");
-    static_assert(
-        std::is_same<alpaka::meta::LowerMax<std::uint64_t, std::uint64_t>, std::uint64_t>::value,
-        "alpaka::meta::LowerMax failed!");
-}
-
-//-----------------------------------------------------------------------------
-TEST_CASE("higherMin", "[meta]")
-{
-    static_assert(
-        std::is_same<alpaka::meta::HigherMin<std::int8_t, std::int8_t>, std::int8_t>::value,
-        "alpaka::meta::HigherMin failed!");
-    static_assert(
-        std::is_same<alpaka::meta::HigherMin<std::int8_t, std::uint8_t>, std::uint8_t>::value,
-        "alpaka::meta::HigherMin failed!");
-    static_assert(
-        std::is_same<alpaka::meta::HigherMin<std::int8_t, std::int16_t>, std::int8_t>::value,
-        "alpaka::meta::HigherMin failed!");
-    static_assert(
-        std::is_same<alpaka::meta::HigherMin<std::int8_t, std::uint16_t>, std::uint16_t>::value,
-        "alpaka::meta::HigherMin failed!");
-    static_assert(
-        std::is_same<alpaka::meta::HigherMin<std::int8_t, std::int32_t>, std::int8_t>::value,
-        "alpaka::meta::HigherMin failed!");
-    static_assert(
-        std::is_same<alpaka::meta::HigherMin<std::int8_t, std::uint32_t>, std::uint32_t>::value,
-        "alpaka::meta::HigherMin failed!");
-    static_assert(
-        std::is_same<alpaka::meta::HigherMin<std::int8_t, std::int64_t>, std::int8_t>::value,
-        "alpaka::meta::HigherMin failed!");
-    static_assert(
-        std::is_same<alpaka::meta::HigherMin<std::int8_t, std::uint64_t>, std::uint64_t>::value,
-        "alpaka::meta::HigherMin failed!");
-
-    static_assert(
-        std::is_same<alpaka::meta::HigherMin<std::uint8_t, std::int8_t>, std::uint8_t>::value,
-        "alpaka::meta::HigherMin failed!");
-    static_assert(
-        std::is_same<alpaka::meta::HigherMin<std::uint8_t, std::uint8_t>, std::uint8_t>::value,
-        "alpaka::meta::HigherMin failed!");
-    static_assert(
-        std::is_same<alpaka::meta::HigherMin<std::uint8_t, std::int16_t>, std::uint8_t>::value,
-        "alpaka::meta::HigherMin failed!");
-    static_assert(
-        std::is_same<alpaka::meta::HigherMin<std::uint8_t, std::uint16_t>, std::uint16_t>::value,
-        "alpaka::meta::HigherMin failed!");
-    static_assert(
-        std::is_same<alpaka::meta::HigherMin<std::uint8_t, std::int32_t>, std::uint8_t>::value,
-        "alpaka::meta::HigherMin failed!");
-    static_assert(
-        std::is_same<alpaka::meta::HigherMin<std::uint8_t, std::uint32_t>, std::uint32_t>::value,
-        "alpaka::meta::HigherMin failed!");
-    static_assert(
-        std::is_same<alpaka::meta::HigherMin<std::uint8_t, std::int64_t>, std::uint8_t>::value,
-        "alpaka::meta::HigherMin failed!");
-    static_assert(
-        std::is_same<alpaka::meta::HigherMin<std::uint8_t, std::uint64_t>, std::uint64_t>::value,
-        "alpaka::meta::HigherMin failed!");
-
-    static_assert(
-        std::is_same<alpaka::meta::HigherMin<std::int16_t, std::int8_t>, std::int8_t>::value,
-        "alpaka::meta::HigherMin failed!");
-    static_assert(
-        std::is_same<alpaka::meta::HigherMin<std::int16_t, std::uint8_t>, std::uint8_t>::value,
-        "alpaka::meta::HigherMin failed!");
-    static_assert(
-        std::is_same<alpaka::meta::HigherMin<std::int16_t, std::int16_t>, std::int16_t>::value,
-        "alpaka::meta::HigherMin failed!");
-    static_assert(
-        std::is_same<alpaka::meta::HigherMin<std::int16_t, std::uint16_t>, std::uint16_t>::value,
-        "alpaka::meta::HigherMin failed!");
-    static_assert(
-        std::is_same<alpaka::meta::HigherMin<std::int16_t, std::int32_t>, std::int16_t>::value,
-        "alpaka::meta::HigherMin failed!");
-    static_assert(
-        std::is_same<alpaka::meta::HigherMin<std::int16_t, std::uint32_t>, std::uint32_t>::value,
-        "alpaka::meta::HigherMin failed!");
-    static_assert(
-        std::is_same<alpaka::meta::HigherMin<std::int16_t, std::int64_t>, std::int16_t>::value,
-        "alpaka::meta::HigherMin failed!");
-    static_assert(
-        std::is_same<alpaka::meta::HigherMin<std::int16_t, std::uint64_t>, std::uint64_t>::value,
-        "alpaka::meta::HigherMin failed!");
-
-    static_assert(
-        std::is_same<alpaka::meta::HigherMin<std::uint16_t, std::int8_t>, std::uint16_t>::value,
-        "alpaka::meta::HigherMin failed!");
-    static_assert(
-        std::is_same<alpaka::meta::HigherMin<std::uint16_t, std::uint8_t>, std::uint16_t>::value,
-        "alpaka::meta::HigherMin failed!");
-    static_assert(
-        std::is_same<alpaka::meta::HigherMin<std::uint16_t, std::int16_t>, std::uint16_t>::value,
-        "alpaka::meta::HigherMin failed!");
-    static_assert(
-        std::is_same<alpaka::meta::HigherMin<std::uint16_t, std::uint16_t>, std::uint16_t>::value,
-        "alpaka::meta::HigherMin failed!");
-    static_assert(
-        std::is_same<alpaka::meta::HigherMin<std::uint16_t, std::int32_t>, std::uint16_t>::value,
-        "alpaka::meta::HigherMin failed!");
-    static_assert(
-        std::is_same<alpaka::meta::HigherMin<std::uint16_t, std::uint32_t>, std::uint32_t>::value,
-        "alpaka::meta::HigherMin failed!");
-    static_assert(
-        std::is_same<alpaka::meta::HigherMin<std::uint16_t, std::int64_t>, std::uint16_t>::value,
-        "alpaka::meta::HigherMin failed!");
-    static_assert(
-        std::is_same<alpaka::meta::HigherMin<std::uint16_t, std::uint64_t>, std::uint64_t>::value,
-        "alpaka::meta::HigherMin failed!");
-
-    static_assert(
-        std::is_same<alpaka::meta::HigherMin<std::int32_t, std::int8_t>, std::int8_t>::value,
-        "alpaka::meta::HigherMin failed!");
-    static_assert(
-        std::is_same<alpaka::meta::HigherMin<std::int32_t, std::uint8_t>, std::uint8_t>::value,
-        "alpaka::meta::HigherMin failed!");
-    static_assert(
-        std::is_same<alpaka::meta::HigherMin<std::int32_t, std::int16_t>, std::int16_t>::value,
-        "alpaka::meta::HigherMin failed!");
-    static_assert(
-        std::is_same<alpaka::meta::HigherMin<std::int32_t, std::uint16_t>, std::uint16_t>::value,
-        "alpaka::meta::HigherMin failed!");
-    static_assert(
-        std::is_same<alpaka::meta::HigherMin<std::int32_t, std::int32_t>, std::int32_t>::value,
-        "alpaka::meta::HigherMin failed!");
-    static_assert(
-        std::is_same<alpaka::meta::HigherMin<std::int32_t, std::uint32_t>, std::uint32_t>::value,
-        "alpaka::meta::HigherMin failed!");
-    static_assert(
-        std::is_same<alpaka::meta::HigherMin<std::int32_t, std::int64_t>, std::int32_t>::value,
-        "alpaka::meta::HigherMin failed!");
-    static_assert(
-        std::is_same<alpaka::meta::HigherMin<std::int32_t, std::uint64_t>, std::uint64_t>::value,
-        "alpaka::meta::HigherMin failed!");
-
-    static_assert(
-        std::is_same<alpaka::meta::HigherMin<std::uint32_t, std::int8_t>, std::uint32_t>::value,
-        "alpaka::meta::HigherMin failed!");
-    static_assert(
-        std::is_same<alpaka::meta::HigherMin<std::uint32_t, std::uint8_t>, std::uint32_t>::value,
-        "alpaka::meta::HigherMin failed!");
-    static_assert(
-        std::is_same<alpaka::meta::HigherMin<std::uint32_t, std::int16_t>, std::uint32_t>::value,
-        "alpaka::meta::HigherMin failed!");
-    static_assert(
-        std::is_same<alpaka::meta::HigherMin<std::uint32_t, std::uint16_t>, std::uint32_t>::value,
-        "alpaka::meta::HigherMin failed!");
-    static_assert(
-        std::is_same<alpaka::meta::HigherMin<std::uint32_t, std::int32_t>, std::uint32_t>::value,
-        "alpaka::meta::HigherMin failed!");
-    static_assert(
-        std::is_same<alpaka::meta::HigherMin<std::uint32_t, std::uint32_t>, std::uint32_t>::value,
-        "alpaka::meta::HigherMin failed!");
-    static_assert(
-        std::is_same<alpaka::meta::HigherMin<std::uint32_t, std::int64_t>, std::uint32_t>::value,
-        "alpaka::meta::HigherMin failed!");
-    static_assert(
-        std::is_same<alpaka::meta::HigherMin<std::uint32_t, std::uint64_t>, std::uint64_t>::value,
-        "alpaka::meta::HigherMin failed!");
-
-    static_assert(
-        std::is_same<alpaka::meta::HigherMin<std::int64_t, std::int8_t>, std::int8_t>::value,
-        "alpaka::meta::HigherMin failed!");
-    static_assert(
-        std::is_same<alpaka::meta::HigherMin<std::int64_t, std::uint8_t>, std::uint8_t>::value,
-        "alpaka::meta::HigherMin failed!");
-    static_assert(
-        std::is_same<alpaka::meta::HigherMin<std::int64_t, std::int16_t>, std::int16_t>::value,
-        "alpaka::meta::HigherMin failed!");
-    static_assert(
-        std::is_same<alpaka::meta::HigherMin<std::int64_t, std::uint16_t>, std::uint16_t>::value,
-        "alpaka::meta::HigherMin failed!");
-    static_assert(
-        std::is_same<alpaka::meta::HigherMin<std::int64_t, std::int32_t>, std::int32_t>::value,
-        "alpaka::meta::HigherMin failed!");
-    static_assert(
-        std::is_same<alpaka::meta::HigherMin<std::int64_t, std::uint32_t>, std::uint32_t>::value,
-        "alpaka::meta::HigherMin failed!");
-    static_assert(
-        std::is_same<alpaka::meta::HigherMin<std::int64_t, std::int64_t>, std::int64_t>::value,
-        "alpaka::meta::HigherMin failed!");
-    static_assert(
-        std::is_same<alpaka::meta::HigherMin<std::int64_t, std::uint64_t>, std::uint64_t>::value,
-        "alpaka::meta::HigherMin failed!");
-
-    static_assert(
-        std::is_same<alpaka::meta::HigherMin<std::uint64_t, std::int8_t>, std::uint64_t>::value,
-        "alpaka::meta::HigherMin failed!");
-    static_assert(
-        std::is_same<alpaka::meta::HigherMin<std::uint64_t, std::uint8_t>, std::uint64_t>::value,
-        "alpaka::meta::HigherMin failed!");
-    static_assert(
-        std::is_same<alpaka::meta::HigherMin<std::uint64_t, std::int16_t>, std::uint64_t>::value,
-        "alpaka::meta::HigherMin failed!");
-    static_assert(
-        std::is_same<alpaka::meta::HigherMin<std::uint64_t, std::uint16_t>, std::uint64_t>::value,
-        "alpaka::meta::HigherMin failed!");
-    static_assert(
-        std::is_same<alpaka::meta::HigherMin<std::uint64_t, std::int32_t>, std::uint64_t>::value,
-        "alpaka::meta::HigherMin failed!");
-    static_assert(
-        std::is_same<alpaka::meta::HigherMin<std::uint64_t, std::uint32_t>, std::uint64_t>::value,
-        "alpaka::meta::HigherMin failed!");
-    static_assert(
-        std::is_same<alpaka::meta::HigherMin<std::uint64_t, std::int64_t>, std::uint64_t>::value,
-        "alpaka::meta::HigherMin failed!");
-    static_assert(
-        std::is_same<alpaka::meta::HigherMin<std::uint64_t, std::uint64_t>, std::uint64_t>::value,
-        "alpaka::meta::HigherMin failed!");
-}
-//-----------------------------------------------------------------------------
-TEST_CASE("lowerMin", "[meta]")
-{
-    static_assert(
-        std::is_same<alpaka::meta::LowerMin<std::int8_t, std::int8_t>, std::int8_t>::value,
-        "alpaka::meta::LowerMin failed!");
-    static_assert(
-        std::is_same<alpaka::meta::LowerMin<std::int8_t, std::uint8_t>, std::int8_t>::value,
-        "alpaka::meta::LowerMin failed!");
-    static_assert(
-        std::is_same<alpaka::meta::LowerMin<std::int8_t, std::int16_t>, std::int16_t>::value,
-        "alpaka::meta::LowerMin failed!");
-    static_assert(
-        std::is_same<alpaka::meta::LowerMin<std::int8_t, std::uint16_t>, std::int8_t>::value,
-        "alpaka::meta::LowerMin failed!");
-    static_assert(
-        std::is_same<alpaka::meta::LowerMin<std::int8_t, std::int32_t>, std::int32_t>::value,
-        "alpaka::meta::LowerMin failed!");
-    static_assert(
-        std::is_same<alpaka::meta::LowerMin<std::int8_t, std::uint32_t>, std::int8_t>::value,
-        "alpaka::meta::LowerMin failed!");
-    static_assert(
-        std::is_same<alpaka::meta::LowerMin<std::int8_t, std::int64_t>, std::int64_t>::value,
-        "alpaka::meta::LowerMin failed!");
-    static_assert(
-        std::is_same<alpaka::meta::LowerMin<std::int8_t, std::uint64_t>, std::int8_t>::value,
-        "alpaka::meta::LowerMin failed!");
-
-    static_assert(
-        std::is_same<alpaka::meta::LowerMin<std::uint8_t, std::int8_t>, std::int8_t>::value,
-        "alpaka::meta::LowerMin failed!");
-    static_assert(
-        std::is_same<alpaka::meta::LowerMin<std::uint8_t, std::uint8_t>, std::uint8_t>::value,
-        "alpaka::meta::LowerMin failed!");
-    static_assert(
-        std::is_same<alpaka::meta::LowerMin<std::uint8_t, std::int16_t>, std::int16_t>::value,
-        "alpaka::meta::LowerMin failed!");
-    static_assert(
-        std::is_same<alpaka::meta::LowerMin<std::uint8_t, std::uint16_t>, std::uint16_t>::value,
-        "alpaka::meta::LowerMin failed!");
-    static_assert(
-        std::is_same<alpaka::meta::LowerMin<std::uint8_t, std::int32_t>, std::int32_t>::value,
-        "alpaka::meta::LowerMin failed!");
-    static_assert(
-        std::is_same<alpaka::meta::LowerMin<std::uint8_t, std::uint32_t>, std::uint32_t>::value,
-        "alpaka::meta::LowerMin failed!");
-    static_assert(
-        std::is_same<alpaka::meta::LowerMin<std::uint8_t, std::int64_t>, std::int64_t>::value,
-        "alpaka::meta::LowerMin failed!");
-    static_assert(
-        std::is_same<alpaka::meta::LowerMin<std::uint8_t, std::uint64_t>, std::uint64_t>::value,
-        "alpaka::meta::LowerMin failed!");
-
-    static_assert(
-        std::is_same<alpaka::meta::LowerMin<std::int16_t, std::int8_t>, std::int16_t>::value,
-        "alpaka::meta::LowerMin failed!");
-    static_assert(
-        std::is_same<alpaka::meta::LowerMin<std::int16_t, std::uint8_t>, std::int16_t>::value,
-        "alpaka::meta::LowerMin failed!");
-    static_assert(
-        std::is_same<alpaka::meta::LowerMin<std::int16_t, std::int16_t>, std::int16_t>::value,
-        "alpaka::meta::LowerMin failed!");
-    static_assert(
-        std::is_same<alpaka::meta::LowerMin<std::int16_t, std::uint16_t>, std::int16_t>::value,
-        "alpaka::meta::LowerMin failed!");
-    static_assert(
-        std::is_same<alpaka::meta::LowerMin<std::int16_t, std::int32_t>, std::int32_t>::value,
-        "alpaka::meta::LowerMin failed!");
-    static_assert(
-        std::is_same<alpaka::meta::LowerMin<std::int16_t, std::uint32_t>, std::int16_t>::value,
-        "alpaka::meta::LowerMin failed!");
-    static_assert(
-        std::is_same<alpaka::meta::LowerMin<std::int16_t, std::int64_t>, std::int64_t>::value,
-        "alpaka::meta::LowerMin failed!");
-    static_assert(
-        std::is_same<alpaka::meta::LowerMin<std::int16_t, std::uint64_t>, std::int16_t>::value,
-        "alpaka::meta::LowerMin failed!");
-
-    static_assert(
-        std::is_same<alpaka::meta::LowerMin<std::uint16_t, std::int8_t>, std::int8_t>::value,
-        "alpaka::meta::LowerMin failed!");
-    static_assert(
-        std::is_same<alpaka::meta::LowerMin<std::uint16_t, std::uint8_t>, std::uint16_t>::value,
-        "alpaka::meta::LowerMin failed!");
-    static_assert(
-        std::is_same<alpaka::meta::LowerMin<std::uint16_t, std::int16_t>, std::int16_t>::value,
-        "alpaka::meta::LowerMin failed!");
-    static_assert(
-        std::is_same<alpaka::meta::LowerMin<std::uint16_t, std::uint16_t>, std::uint16_t>::value,
-        "alpaka::meta::LowerMin failed!");
-    static_assert(
-        std::is_same<alpaka::meta::LowerMin<std::uint16_t, std::int32_t>, std::int32_t>::value,
-        "alpaka::meta::LowerMin failed!");
-    static_assert(
-        std::is_same<alpaka::meta::LowerMin<std::uint16_t, std::uint32_t>, std::uint32_t>::value,
-        "alpaka::meta::LowerMin failed!");
-    static_assert(
-        std::is_same<alpaka::meta::LowerMin<std::uint16_t, std::int64_t>, std::int64_t>::value,
-        "alpaka::meta::LowerMin failed!");
-    static_assert(
-        std::is_same<alpaka::meta::LowerMin<std::uint16_t, std::uint64_t>, std::uint64_t>::value,
-        "alpaka::meta::LowerMin failed!");
-
-    static_assert(
-        std::is_same<alpaka::meta::LowerMin<std::int32_t, std::int8_t>, std::int32_t>::value,
-        "alpaka::meta::LowerMin failed!");
-    static_assert(
-        std::is_same<alpaka::meta::LowerMin<std::int32_t, std::uint8_t>, std::int32_t>::value,
-        "alpaka::meta::LowerMin failed!");
-    static_assert(
-        std::is_same<alpaka::meta::LowerMin<std::int32_t, std::int16_t>, std::int32_t>::value,
-        "alpaka::meta::LowerMin failed!");
-    static_assert(
-        std::is_same<alpaka::meta::LowerMin<std::int32_t, std::uint16_t>, std::int32_t>::value,
-        "alpaka::meta::LowerMin failed!");
-    static_assert(
-        std::is_same<alpaka::meta::LowerMin<std::int32_t, std::int32_t>, std::int32_t>::value,
-        "alpaka::meta::LowerMin failed!");
-    static_assert(
-        std::is_same<alpaka::meta::LowerMin<std::int32_t, std::uint32_t>, std::int32_t>::value,
-        "alpaka::meta::LowerMin failed!");
-    static_assert(
-        std::is_same<alpaka::meta::LowerMin<std::int32_t, std::int64_t>, std::int64_t>::value,
-        "alpaka::meta::LowerMin failed!");
-    static_assert(
-        std::is_same<alpaka::meta::LowerMin<std::int32_t, std::uint64_t>, std::int32_t>::value,
-        "alpaka::meta::LowerMin failed!");
-
-    static_assert(
-        std::is_same<alpaka::meta::LowerMin<std::uint32_t, std::int8_t>, std::int8_t>::value,
-        "alpaka::meta::LowerMin failed!");
-    static_assert(
-        std::is_same<alpaka::meta::LowerMin<std::uint32_t, std::uint8_t>, std::uint32_t>::value,
-        "alpaka::meta::LowerMin failed!");
-    static_assert(
-        std::is_same<alpaka::meta::LowerMin<std::uint32_t, std::int16_t>, std::int16_t>::value,
-        "alpaka::meta::LowerMin failed!");
-    static_assert(
-        std::is_same<alpaka::meta::LowerMin<std::uint32_t, std::uint16_t>, std::uint32_t>::value,
-        "alpaka::meta::LowerMin failed!");
-    static_assert(
-        std::is_same<alpaka::meta::LowerMin<std::uint32_t, std::int32_t>, std::int32_t>::value,
-        "alpaka::meta::LowerMin failed!");
-    static_assert(
-        std::is_same<alpaka::meta::LowerMin<std::uint32_t, std::uint32_t>, std::uint32_t>::value,
-        "alpaka::meta::LowerMin failed!");
-    static_assert(
-        std::is_same<alpaka::meta::LowerMin<std::uint32_t, std::int64_t>, std::int64_t>::value,
-        "alpaka::meta::LowerMin failed!");
-    static_assert(
-        std::is_same<alpaka::meta::LowerMin<std::uint32_t, std::uint64_t>, std::uint64_t>::value,
-        "alpaka::meta::LowerMin failed!");
-
-    static_assert(
-        std::is_same<alpaka::meta::LowerMin<std::int64_t, std::int8_t>, std::int64_t>::value,
-        "alpaka::meta::LowerMin failed!");
-    static_assert(
-        std::is_same<alpaka::meta::LowerMin<std::int64_t, std::uint8_t>, std::int64_t>::value,
-        "alpaka::meta::LowerMin failed!");
-    static_assert(
-        std::is_same<alpaka::meta::LowerMin<std::int64_t, std::int16_t>, std::int64_t>::value,
-        "alpaka::meta::LowerMin failed!");
-    static_assert(
-        std::is_same<alpaka::meta::LowerMin<std::int64_t, std::uint16_t>, std::int64_t>::value,
-        "alpaka::meta::LowerMin failed!");
-    static_assert(
-        std::is_same<alpaka::meta::LowerMin<std::int64_t, std::int32_t>, std::int64_t>::value,
-        "alpaka::meta::LowerMin failed!");
-    static_assert(
-        std::is_same<alpaka::meta::LowerMin<std::int64_t, std::uint32_t>, std::int64_t>::value,
-        "alpaka::meta::LowerMin failed!");
-    static_assert(
-        std::is_same<alpaka::meta::LowerMin<std::int64_t, std::int64_t>, std::int64_t>::value,
-        "alpaka::meta::LowerMin failed!");
-    static_assert(
-        std::is_same<alpaka::meta::LowerMin<std::int64_t, std::uint64_t>, std::int64_t>::value,
-        "alpaka::meta::LowerMin failed!");
-
-    static_assert(
-        std::is_same<alpaka::meta::LowerMin<std::uint64_t, std::int8_t>, std::int8_t>::value,
-        "alpaka::meta::LowerMin failed!");
-    static_assert(
-        std::is_same<alpaka::meta::LowerMin<std::uint64_t, std::uint8_t>, std::uint64_t>::value,
-        "alpaka::meta::LowerMin failed!");
-    static_assert(
-        std::is_same<alpaka::meta::LowerMin<std::uint64_t, std::int16_t>, std::int16_t>::value,
-        "alpaka::meta::LowerMin failed!");
-    static_assert(
-        std::is_same<alpaka::meta::LowerMin<std::uint64_t, std::uint16_t>, std::uint64_t>::value,
-        "alpaka::meta::LowerMin failed!");
-    static_assert(
-        std::is_same<alpaka::meta::LowerMin<std::uint64_t, std::int32_t>, std::int32_t>::value,
-        "alpaka::meta::LowerMin failed!");
-    static_assert(
-        std::is_same<alpaka::meta::LowerMin<std::uint64_t, std::uint32_t>, std::uint64_t>::value,
-        "alpaka::meta::LowerMin failed!");
-    static_assert(
-        std::is_same<alpaka::meta::LowerMin<std::uint64_t, std::int64_t>, std::int64_t>::value,
-        "alpaka::meta::LowerMin failed!");
-    static_assert(
-        std::is_same<alpaka::meta::LowerMin<std::uint64_t, std::uint64_t>, std::uint64_t>::value,
-        "alpaka::meta::LowerMin failed!");
-}
diff --git a/thirdParty/alpaka/test/unit/meta/src/IsStrictBaseTest.cpp b/thirdParty/alpaka/test/unit/meta/src/IsStrictBaseTest.cpp
deleted file mode 100644
index 350353200e..0000000000
--- a/thirdParty/alpaka/test/unit/meta/src/IsStrictBaseTest.cpp
+++ /dev/null
@@ -1,83 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#include <alpaka/meta/IsStrictBase.hpp>
-
-#include <catch2/catch.hpp>
-
-#include <tuple>
-#include <type_traits>
-
-class A {};
-class B : A {};
-class C {};
-
-//-----------------------------------------------------------------------------
-TEST_CASE("isStrictBaseTrue", "[meta]")
-{
-    constexpr bool IsStrictBaseResult =
-        alpaka::meta::IsStrictBase<
-            A, B
-        >::value;
-
-    constexpr bool IsStrictBaseReference =
-        true;
-
-    static_assert(
-        IsStrictBaseReference == IsStrictBaseResult,
-        "alpaka::meta::IsStrictBase failed!");
-}
-
-//-----------------------------------------------------------------------------
-TEST_CASE("isStrictBaseIdentity", "[meta]")
-{
-    constexpr bool IsStrictBaseResult =
-        alpaka::meta::IsStrictBase<
-            A, A
-        >::value;
-
-    constexpr bool IsStrictBaseReference =
-        false;
-
-    static_assert(
-        IsStrictBaseReference == IsStrictBaseResult,
-        "alpaka::meta::IsStrictBase failed!");
-}
-
-//-----------------------------------------------------------------------------
-TEST_CASE("isStrictBaseNoInheritance", "[meta]")
-{
-    constexpr bool IsStrictBaseResult =
-        alpaka::meta::IsStrictBase<
-            A, C
-        >::value;
-
-    constexpr bool IsStrictBaseReference =
-        false;
-
-    static_assert(
-        IsStrictBaseReference == IsStrictBaseResult,
-        "alpaka::meta::IsStrictBase failed!");
-}
-
-//-----------------------------------------------------------------------------
-TEST_CASE("isStrictBaseWrongOrder", "[meta]")
-{
-    constexpr bool IsStrictBaseResult =
-        alpaka::meta::IsStrictBase<
-            B, A
-        >::value;
-
-    constexpr bool IsStrictBaseReference =
-        false;
-
-    static_assert(
-        IsStrictBaseReference == IsStrictBaseResult,
-        "alpaka::meta::IsStrictBase failed!");
-}
diff --git a/thirdParty/alpaka/test/unit/meta/src/MetafunctionsTest.cpp b/thirdParty/alpaka/test/unit/meta/src/MetafunctionsTest.cpp
deleted file mode 100644
index 2134ebcf1e..0000000000
--- a/thirdParty/alpaka/test/unit/meta/src/MetafunctionsTest.cpp
+++ /dev/null
@@ -1,113 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#include <alpaka/meta/Metafunctions.hpp>
-
-#include <catch2/catch.hpp>
-
-#include <tuple>
-#include <type_traits>
-
-//-----------------------------------------------------------------------------
-TEST_CASE("conjunctionTrue", "[meta]")
-{
-    using ConjunctionResult =
-        alpaka::meta::Conjunction<
-            std::true_type,
-            std::true_type,
-            std::integral_constant<bool, true>
-        >;
-
-    static_assert(
-        ConjunctionResult::value == true,
-        "alpaka::meta::Conjunction failed!");
-}
-
-//-----------------------------------------------------------------------------
-TEST_CASE("conjunctionFalse", "[meta]")
-{
-    using ConjunctionResult =
-        alpaka::meta::Conjunction<
-            std::true_type,
-            std::false_type,
-            std::integral_constant<bool, true>
-        >;
-
-    static_assert(
-        ConjunctionResult::value == false,
-        "alpaka::meta::Conjunction failed!");
-}
-
-//-----------------------------------------------------------------------------
-TEST_CASE("disjunctionTrue", "[meta]")
-{
-    using DisjunctionResult =
-        alpaka::meta::Disjunction<
-            std::false_type,
-            std::true_type,
-            std::integral_constant<bool, false>
-        >;
-
-    static_assert(
-        DisjunctionResult::value == true,
-        "alpaka::meta::Disjunction failed!");
-}
-
-//-----------------------------------------------------------------------------
-TEST_CASE("disjunctionFalse", "[meta]")
-{
-    using DisjunctionResult =
-        alpaka::meta::Disjunction<
-            std::false_type,
-            std::false_type,
-            std::integral_constant<bool, false>
-        >;
-
-    static_assert(
-        DisjunctionResult::value == false,
-        "alpaka::meta::Disjunction failed!");
-}
-
-//-----------------------------------------------------------------------------
-TEST_CASE("negationFalse", "[meta]")
-{
-    using NegationResult =
-        alpaka::meta::Negation<
-            std::true_type
-        >;
-
-    using NegationReference =
-        std::false_type;
-
-    static_assert(
-        std::is_same<
-            NegationReference,
-            NegationResult
-        >::value,
-        "alpaka::meta::Negation failed!");
-}
-
-//-----------------------------------------------------------------------------
-TEST_CASE("negationTrue", "[meta]")
-{
-    using NegationResult =
-        alpaka::meta::Negation<
-            std::false_type
-        >;
-
-    using NegationReference =
-        std::true_type;
-
-    static_assert(
-        std::is_same<
-            NegationReference,
-            NegationResult
-        >::value,
-        "alpaka::meta::Negation failed!");
-}
diff --git a/thirdParty/alpaka/test/unit/meta/src/SetTest.cpp b/thirdParty/alpaka/test/unit/meta/src/SetTest.cpp
deleted file mode 100644
index eb0ffc77af..0000000000
--- a/thirdParty/alpaka/test/unit/meta/src/SetTest.cpp
+++ /dev/null
@@ -1,59 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#include <alpaka/meta/Set.hpp>
-
-#include <catch2/catch.hpp>
-
-#include <tuple>
-#include <type_traits>
-
-//-----------------------------------------------------------------------------
-TEST_CASE("isSetTrue", "[meta]")
-{
-    using IsSetInput =
-        std::tuple<
-            int,
-            float,
-            long>;
-
-    constexpr bool IsSetResult =
-        alpaka::meta::IsSet<
-            IsSetInput
-        >::value;
-
-    constexpr bool IsSetReference =
-        true;
-
-    static_assert(
-        IsSetReference == IsSetResult,
-        "alpaka::meta::IsSet failed!");
-}
-
-//-----------------------------------------------------------------------------
-TEST_CASE("isSetFalse", "[meta]")
-{
-    using IsSetInput =
-        std::tuple<
-            int,
-            float,
-            int>;
-
-    constexpr bool IsSetResult =
-        alpaka::meta::IsSet<
-            IsSetInput
-        >::value;
-
-    constexpr bool IsSetReference =
-        false;
-
-    static_assert(
-        IsSetReference == IsSetResult,
-        "alpaka::meta::IsSet failed!");
-}
diff --git a/thirdParty/alpaka/test/unit/meta/src/TransformTest.cpp b/thirdParty/alpaka/test/unit/meta/src/TransformTest.cpp
deleted file mode 100644
index dba8dd0785..0000000000
--- a/thirdParty/alpaka/test/unit/meta/src/TransformTest.cpp
+++ /dev/null
@@ -1,77 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#include <alpaka/meta/Transform.hpp>
-
-#include <catch2/catch.hpp>
-
-#include <tuple>
-#include <type_traits>
-
-template<
-    typename T>
-using AddConst = T const;
-
-//-----------------------------------------------------------------------------
-TEST_CASE("transform", "[meta]")
-{
-    using TransformInput =
-        std::tuple<
-            int,
-            float,
-            long>;
-
-    using TransformResult =
-        alpaka::meta::Transform<
-            TransformInput,
-            AddConst
-        >;
-
-    using TransformReference =
-        std::tuple<
-            int const,
-            float const,
-            long const>;
-
-    static_assert(
-        std::is_same<
-            TransformReference,
-            TransformResult
-        >::value,
-        "alpaka::meta::Transform failed!");
-}
-
-//-----------------------------------------------------------------------------
-TEST_CASE("transformVariadic", "[meta]")
-{
-    using TransformInput =
-        std::tuple<
-            int,
-            float,
-            long>;
-
-    using TransformResult =
-        alpaka::meta::Transform<
-            TransformInput,
-            std::tuple
-        >;
-
-    using TransformReference =
-        std::tuple<
-            std::tuple<int>,
-            std::tuple<float>,
-            std::tuple<long>>;
-
-    static_assert(
-        std::is_same<
-            TransformReference,
-            TransformResult
-        >::value,
-        "alpaka::meta::Transform failed!");
-}
diff --git a/thirdParty/alpaka/test/unit/meta/src/UniqueTest.cpp b/thirdParty/alpaka/test/unit/meta/src/UniqueTest.cpp
deleted file mode 100644
index 36cb4665d4..0000000000
--- a/thirdParty/alpaka/test/unit/meta/src/UniqueTest.cpp
+++ /dev/null
@@ -1,70 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#include <alpaka/meta/Unique.hpp>
-
-#include <catch2/catch.hpp>
-
-#include <tuple>
-#include <type_traits>
-
-//-----------------------------------------------------------------------------
-TEST_CASE("uniqueWithDuplicate", "[meta]")
-{
-    using UniqueInput =
-        std::tuple<
-            int,
-            float,
-            int,
-            float,
-            float,
-            int>;
-
-    using UniqueResult =
-        alpaka::meta::Unique<
-            UniqueInput
-        >;
-
-    using UniqueReference =
-        std::tuple<
-            int,
-            float>;
-
-    static_assert(
-        std::is_same<
-            UniqueReference,
-            UniqueResult
-        >::value,
-        "alpaka::meta::Unique failed!");
-}
-
-//-----------------------------------------------------------------------------
-TEST_CASE("uniqueWithoutDuplicate", "[meta]")
-{
-    using UniqueInput =
-        std::tuple<
-            int,
-            float,
-            double>;
-
-    using UniqueResult =
-        alpaka::meta::Unique<
-            UniqueInput
-        >;
-
-    using UniqueReference =
-        UniqueInput;
-
-    static_assert(
-        std::is_same<
-            UniqueReference,
-            UniqueResult
-        >::value,
-        "alpaka::meta::Unique failed!");
-}
diff --git a/thirdParty/alpaka/test/unit/queue/CMakeLists.txt b/thirdParty/alpaka/test/unit/queue/CMakeLists.txt
deleted file mode 100644
index f59808d505..0000000000
--- a/thirdParty/alpaka/test/unit/queue/CMakeLists.txt
+++ /dev/null
@@ -1,27 +0,0 @@
-#
-# Copyright 2017-2019 Benjamin Worpitz, Axel Huebl
-#
-# This file is part of Alpaka.
-#
-# This Source Code Form is subject to the terms of the Mozilla Public
-# License, v. 2.0. If a copy of the MPL was not distributed with this
-# file, You can obtain one at http://mozilla.org/MPL/2.0/.
-#
-
-SET(_TARGET_NAME "queue")
-
-append_recursive_files_add_to_src_group("src/" "src/" "cpp" _FILES_SOURCE)
-
-ALPAKA_ADD_EXECUTABLE(
-    ${_TARGET_NAME}
-    ${_FILES_SOURCE})
-TARGET_INCLUDE_DIRECTORIES(
-    ${_TARGET_NAME}
-    PRIVATE ${Boost_INCLUDE_DIRS})
-TARGET_LINK_LIBRARIES(
-    ${_TARGET_NAME}
-    PRIVATE common)
-
-SET_TARGET_PROPERTIES(${_TARGET_NAME} PROPERTIES FOLDER "test/unit")
-
-ADD_TEST(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME} ${_ALPAKA_TEST_OPTIONS})
diff --git a/thirdParty/alpaka/test/unit/queue/src/CollectiveQueue.cpp b/thirdParty/alpaka/test/unit/queue/src/CollectiveQueue.cpp
deleted file mode 100644
index 3fb220a3b3..0000000000
--- a/thirdParty/alpaka/test/unit/queue/src/CollectiveQueue.cpp
+++ /dev/null
@@ -1,161 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#ifdef ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLED
-
-#if _OPENMP < 200203
-    #error If ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLED is set, the compiler has to support OpenMP 2.0 or higher!
-#endif
-
-#include <alpaka/alpaka.hpp>
-#include <alpaka/test/queue/Queue.hpp>
-#include <alpaka/test/queue/QueueTestFixture.hpp>
-#include <alpaka/test/queue/QueueCpuOmp2Collective.hpp>
-
-#include <vector>
-
-#include <catch2/catch.hpp>
-
-struct QueueCollectiveTestKernel
-{
-    template<typename TAcc>
-    auto operator()(
-        TAcc const & acc,
-        int* resultsPtr) const
-    -> void
-    {
-        size_t threadId = alpaka::idx::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[0];
-        // avoid that one thread is doing all the work
-        std::this_thread::sleep_for(std::chrono::milliseconds(200u * threadId));
-        resultsPtr[threadId] = static_cast<int>(threadId);
-    }
-};
-
-TEST_CASE("queueCollective", "[queue]")
-{
-    // Define the index domain
-    using Dim = alpaka::dim::DimInt<1>;
-    using Idx = size_t;
-
-    // Define the accelerator
-    using Acc = alpaka::acc::AccCpuOmp2Blocks<Dim, Idx>;
-    using Dev = alpaka::dev::Dev<Acc>;
-
-    using Queue = alpaka::queue::QueueCpuOmp2Collective;
-    using Pltf = alpaka::pltf::Pltf<Dev>;
-
-    auto dev = alpaka::pltf::getDevByIdx<Pltf>(0u);
-    Queue queue(dev);
-
-    std::vector<int> results(4, -1);
-
-    using Vec = alpaka::vec::Vec<Dim, Idx>;
-    Vec const elementsPerThread(Vec::all(static_cast<Idx>(1)));
-    Vec const threadsPerBlock(Vec::all(static_cast<Idx>(1)));
-    Vec const blocksPerGrid(results.size());
-
-    using WorkDiv = alpaka::workdiv::WorkDivMembers<Dim, Idx>;
-    WorkDiv const workDiv(
-        blocksPerGrid,
-        threadsPerBlock,
-        elementsPerThread);
-
-    #pragma omp parallel num_threads(static_cast<int>(results.size()))
-    {
-        // The kernel will be performed collectively.
-        // OpenMP will distribute the work between the threads from the parallel region
-        alpaka::kernel::exec<Acc>(
-               queue,
-               workDiv,
-               QueueCollectiveTestKernel{},
-               results.data());
-
-        alpaka::wait::wait(queue);
-    }
-
-    for(size_t i = 0; i < results.size(); ++i)
-    {
-        REQUIRE(static_cast<int>(i) == results.at(i));
-    }
-}
-
-TEST_CASE("TestCollectiveMemcpy", "[queue]")
-{
-     // Define the index domain
-    using Dim = alpaka::dim::DimInt<1>;
-    using Idx = size_t;
-
-    // Define the accelerator
-    using Acc = alpaka::acc::AccCpuOmp2Blocks<Dim, Idx>;
-    using Dev = alpaka::dev::Dev<Acc>;
-
-    using Queue = alpaka::queue::QueueCpuOmp2Collective;
-    using Pltf = alpaka::pltf::Pltf<Dev>;
-
-    auto dev = alpaka::pltf::getDevByIdx<Pltf>(0u);
-    Queue queue(dev);
-
-    std::vector<int> results(4, -1);
-
-    // Define the work division
-    using Vec = alpaka::vec::Vec<Dim, Idx>;
-    Vec const elementsPerThread(Vec::all(static_cast<Idx>(1)));
-    Vec const threadsPerBlock(Vec::all(static_cast<Idx>(1)));
-    Vec const blocksPerGrid(results.size());
-
-    using WorkDiv = alpaka::workdiv::WorkDivMembers<Dim, Idx>;
-    WorkDiv const workDiv(
-        blocksPerGrid,
-        threadsPerBlock,
-        elementsPerThread);
-
-    #pragma omp parallel num_threads(static_cast<int>(results.size()))
-    {
-        int threadId = omp_get_thread_num();
-
-        using View = alpaka::mem::view::ViewPlainPtr<Dev, int, Dim, Idx>;
-
-        View dst(
-            results.data() + threadId,
-            dev,
-            Vec(static_cast<Idx>(1u)),
-            Vec(sizeof(int)));
-
-        View src(
-            &threadId,
-            dev,
-            Vec(static_cast<Idx>(1u)),
-            Vec(sizeof(int)));
-
-        // avoid that the first thread is executing the copy (can not be guaranteed)
-        size_t sleep_ms = (results.size() - static_cast<uint32_t>(threadId)) * 100u;
-        std::this_thread::sleep_for(std::chrono::milliseconds(sleep_ms));
-
-        // only one thread will perform this memcpy
-        alpaka::mem::view::copy(queue, dst, src, Vec(static_cast<Idx>(1u)));
-
-        alpaka::wait::wait(queue);
-    }
-
-    uint32_t numFlippedValues = 0u;
-    uint32_t numNonIntitialValues = 0u;
-    for(size_t i = 0; i < results.size(); ++i)
-    {
-        if(static_cast<int>(i) == results.at(i))
-            numFlippedValues++;
-        if(results.at(i) != -1)
-            numNonIntitialValues++;
-    }
-    // only one thread is allowed to flip the value
-    REQUIRE(numFlippedValues == 1u);
-    // only one value is allowed to differ from the initial value
-    REQUIRE(numNonIntitialValues == 1u);
-}
-
-#endif
diff --git a/thirdParty/alpaka/test/unit/queue/src/QueueTest.cpp b/thirdParty/alpaka/test/unit/queue/src/QueueTest.cpp
deleted file mode 100644
index 9b9b6461d9..0000000000
--- a/thirdParty/alpaka/test/unit/queue/src/QueueTest.cpp
+++ /dev/null
@@ -1,159 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#include <alpaka/queue/Traits.hpp>
-#include <alpaka/meta/Concatenate.hpp>
-
-#include <alpaka/test/queue/QueueCpuOmp2Collective.hpp>
-
-#include <alpaka/test/queue/Queue.hpp>
-#include <alpaka/test/queue/QueueTestFixture.hpp>
-
-#include <catch2/catch.hpp>
-
-#include <future>
-#include <thread>
-
-using TestQueues = alpaka::meta::Concatenate<
-        alpaka::test::queue::TestQueues
- #ifdef ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLED
-        ,
-        std::tuple<std::tuple<alpaka::dev::DevCpu, alpaka::queue::QueueCpuOmp2Collective>>
-#endif
-    >;
-
-//-----------------------------------------------------------------------------
-TEMPLATE_LIST_TEST_CASE( "queueIsInitiallyEmpty", "[queue]", TestQueues)
-{
-    using DevQueue = TestType;
-    using Fixture = alpaka::test::queue::QueueTestFixture<DevQueue>;
-    Fixture f;
-
-    CHECK(alpaka::queue::empty(f.m_queue));
-}
-
-#if !BOOST_COMP_HIP // HIP-clang is currently not supporting callbacks
-
-//-----------------------------------------------------------------------------
-TEMPLATE_LIST_TEST_CASE( "queueCallbackIsWorking", "[queue]", TestQueues)
-{
-// Workaround: Clang can not support this when natively compiling device code. See ConcurrentExecPool.hpp.
-#if !(BOOST_COMP_CLANG_CUDA && BOOST_ARCH_PTX)
-    using DevQueue = TestType;
-    using Fixture = alpaka::test::queue::QueueTestFixture<DevQueue>;
-    Fixture f;
-
-    std::promise<bool> promise;
-
-    alpaka::queue::enqueue(
-        f.m_queue,
-        [&](){
-            promise.set_value(true);
-        }
-    );
-
-    CHECK(promise.get_future().get());
-#endif
-}
-
-//-----------------------------------------------------------------------------
-TEMPLATE_LIST_TEST_CASE( "queueWaitShouldWork", "[queue]", TestQueues)
-{
-    using DevQueue = TestType;
-    using Fixture = alpaka::test::queue::QueueTestFixture<DevQueue>;
-    Fixture f;
-
-    bool CallbackFinished = false;
-    alpaka::queue::enqueue(
-        f.m_queue,
-        [&CallbackFinished]() noexcept
-        {
-            std::this_thread::sleep_for(std::chrono::milliseconds(100u));
-            CallbackFinished = true;
-        });
-
-    alpaka::wait::wait(f.m_queue);
-    CHECK(CallbackFinished);
-}
-
-//-----------------------------------------------------------------------------
-TEMPLATE_LIST_TEST_CASE( "queueShouldNotBeEmptyWhenLastTaskIsStillExecutingAndIsEmptyAfterProcessingFinished", "[queue]", TestQueues)
-{
-    using DevQueue = TestType;
-    using Fixture = alpaka::test::queue::QueueTestFixture<DevQueue>;
-    Fixture f;
-
-    bool CallbackFinished = false;
-    alpaka::queue::enqueue(
-        f.m_queue,
-        [&f, &CallbackFinished]() noexcept
-        {
-            CHECK(!alpaka::queue::empty(f.m_queue));
-            std::this_thread::sleep_for(std::chrono::milliseconds(100u));
-            CallbackFinished = true;
-        });
-
-    // A non-blocking queue will always stay empty because the task has been executed immediately.
-    if(!alpaka::test::queue::IsBlockingQueue<typename Fixture::Queue>::value)
-    {
-        alpaka::wait::wait(f.m_queue);
-    }
-
-    CHECK(alpaka::queue::empty(f.m_queue));
-    CHECK(CallbackFinished);
-}
-
-//-----------------------------------------------------------------------------
-TEMPLATE_LIST_TEST_CASE( "queueShouldNotExecuteTasksInParallel", "[queue]", TestQueues)
-{
-    using DevQueue = TestType;
-    using Fixture = alpaka::test::queue::QueueTestFixture<DevQueue>;
-    Fixture f;
-
-    std::atomic<bool> taskIsExecuting(false);
-    std::promise<void> firstTaskFinished;
-    std::future<void> firstTaskFinishedFuture = firstTaskFinished.get_future();
-    std::promise<void> secondTaskFinished;
-    std::future<void> secondTaskFinishedFuture = secondTaskFinished.get_future();
-
-    std::thread thread1([&f, &taskIsExecuting, &firstTaskFinished](){
-        alpaka::queue::enqueue(
-            f.m_queue,
-            [&taskIsExecuting, &firstTaskFinished]() noexcept
-            {
-                CHECK(!taskIsExecuting.exchange(true));
-                std::this_thread::sleep_for(std::chrono::milliseconds(100u));
-                CHECK(taskIsExecuting.exchange(false));
-                firstTaskFinished.set_value();
-            });
-    });
-
-    std::thread thread2([&f, &taskIsExecuting, &secondTaskFinished](){
-        alpaka::queue::enqueue(
-            f.m_queue,
-            [&taskIsExecuting, &secondTaskFinished]() noexcept
-            {
-                CHECK(!taskIsExecuting.exchange(true));
-                std::this_thread::sleep_for(std::chrono::milliseconds(100u));
-                CHECK(taskIsExecuting.exchange(false));
-                secondTaskFinished.set_value();
-            });
-    });
-
-    // Both tasks have to be enqueued
-    thread1.join();
-    thread2.join();
-
-    alpaka::wait::wait(f.m_queue);
-
-    firstTaskFinishedFuture.get();
-    secondTaskFinishedFuture.get();
-}
-
-#endif
diff --git a/thirdParty/alpaka/test/unit/rand/CMakeLists.txt b/thirdParty/alpaka/test/unit/rand/CMakeLists.txt
deleted file mode 100644
index bc7f7c274d..0000000000
--- a/thirdParty/alpaka/test/unit/rand/CMakeLists.txt
+++ /dev/null
@@ -1,27 +0,0 @@
-#
-# Copyright 2017-2019 Benjamin Worpitz, Axel Huebl
-#
-# This file is part of Alpaka.
-#
-# This Source Code Form is subject to the terms of the Mozilla Public
-# License, v. 2.0. If a copy of the MPL was not distributed with this
-# file, You can obtain one at http://mozilla.org/MPL/2.0/.
-#
-
-SET(_TARGET_NAME "rand")
-
-append_recursive_files_add_to_src_group("src/" "src/" "cpp" _FILES_SOURCE)
-
-ALPAKA_ADD_EXECUTABLE(
-    ${_TARGET_NAME}
-    ${_FILES_SOURCE})
-TARGET_INCLUDE_DIRECTORIES(
-    ${_TARGET_NAME}
-    PRIVATE ${Boost_INCLUDE_DIRS})
-TARGET_LINK_LIBRARIES(
-    ${_TARGET_NAME}
-    PRIVATE common)
-
-SET_TARGET_PROPERTIES(${_TARGET_NAME} PROPERTIES FOLDER "test/unit")
-
-ADD_TEST(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME} ${_ALPAKA_TEST_OPTIONS})
diff --git a/thirdParty/alpaka/test/unit/rand/src/RandTest.cpp b/thirdParty/alpaka/test/unit/rand/src/RandTest.cpp
deleted file mode 100644
index 15fa2f3867..0000000000
--- a/thirdParty/alpaka/test/unit/rand/src/RandTest.cpp
+++ /dev/null
@@ -1,135 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Matthias Werner, René Widera
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#include <alpaka/rand/Traits.hpp>
-
-#include <alpaka/test/acc/TestAccs.hpp>
-#include <alpaka/test/KernelExecutionFixture.hpp>
-
-#include <catch2/catch.hpp>
-
-//#############################################################################
-class RandTestKernel
-{
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<
-        typename TAcc,
-        typename T_Generator
-    >
-    ALPAKA_FN_ACC void
-    genNumbers(
-        TAcc const & acc,
-        bool * success,
-        T_Generator & gen
-    ) const
-    {
-        {
-            auto dist(alpaka::rand::distribution::createNormalReal<float>(acc));
-            auto const r = dist(gen);
-#if !BOOST_ARCH_PTX
-            ALPAKA_CHECK(*success, std::isfinite(r));
-#else
-            alpaka::ignore_unused(r);
-#endif
-        }
-
-        {
-            auto dist(alpaka::rand::distribution::createNormalReal<double>(acc));
-            auto const r = dist(gen);
-#if !BOOST_ARCH_PTX
-            ALPAKA_CHECK(*success, std::isfinite(r));
-#else
-            alpaka::ignore_unused(r);
-#endif
-        }
-        {
-            auto dist(alpaka::rand::distribution::createUniformReal<float>(acc));
-            auto const r = dist(gen);
-            ALPAKA_CHECK(*success, 0.0f <= r);
-            ALPAKA_CHECK(*success, 1.0f > r);
-        }
-
-        {
-            auto dist(alpaka::rand::distribution::createUniformReal<double>(acc));
-            auto const r = dist(gen);
-            ALPAKA_CHECK(*success, 0.0 <= r);
-            ALPAKA_CHECK(*success, 1.0 > r);
-        }
-
-        {
-            auto dist(alpaka::rand::distribution::createUniformUint<std::uint32_t>(acc));
-            auto const r = dist(gen);
-            alpaka::ignore_unused(r);
-        }
-    }
-
-public:
-
-    //-----------------------------------------------------------------------------
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<
-        typename TAcc>
-    ALPAKA_FN_ACC auto operator()(
-        TAcc const & acc,
-        bool * success) const
-    -> void
-    {
-        // default generator for accelerator
-        auto genDefault = alpaka::rand::generator::createDefault(
-            acc,
-            12345u,
-            6789u
-        );
-        genNumbers( acc, success, genDefault );
-
-#if !defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && \
-  !defined(ALPAKA_ACC_GPU_HIP_ENABLED)
-        // std::random_device
-        auto genRandomDevice = alpaka::rand::generator::createDefault(
-            alpaka::rand::RandomDevice{},
-            12345u,
-            6789u
-        );
-        genNumbers( acc, success, genRandomDevice );
-
-        // MersenneTwister
-        auto genMersenneTwister = alpaka::rand::generator::createDefault(
-            alpaka::rand::MersenneTwister{},
-            12345u,
-            6789u
-        );
-        genNumbers( acc, success, genMersenneTwister );
-
-        // TinyMersenneTwister
-        auto genTinyMersenneTwister = alpaka::rand::generator::createDefault(
-            alpaka::rand::TinyMersenneTwister{},
-            12345u,
-            6789u
-        );
-        genNumbers( acc, success, genTinyMersenneTwister );
-#endif
-    }
-};
-
-//-----------------------------------------------------------------------------
-TEMPLATE_LIST_TEST_CASE( "defaultRandomGeneratorIsWorking", "[rand]", alpaka::test::acc::TestAccs)
-{
-    using Acc = TestType;
-    using Dim = alpaka::dim::Dim<Acc>;
-    using Idx = alpaka::idx::Idx<Acc>;
-
-    alpaka::test::KernelExecutionFixture<Acc> fixture(
-        alpaka::vec::Vec<Dim, Idx>::ones());
-
-    RandTestKernel kernel;
-
-    REQUIRE(
-        fixture(
-            kernel));
-}
diff --git a/thirdParty/alpaka/test/unit/time/CMakeLists.txt b/thirdParty/alpaka/test/unit/time/CMakeLists.txt
deleted file mode 100644
index f92d82b3c0..0000000000
--- a/thirdParty/alpaka/test/unit/time/CMakeLists.txt
+++ /dev/null
@@ -1,27 +0,0 @@
-#
-# Copyright 2016-2019 Benjamin Worpitz, Axel Huebl
-#
-# This file is part of Alpaka.
-#
-# This Source Code Form is subject to the terms of the Mozilla Public
-# License, v. 2.0. If a copy of the MPL was not distributed with this
-# file, You can obtain one at http://mozilla.org/MPL/2.0/.
-#
-
-SET(_TARGET_NAME "time")
-
-append_recursive_files_add_to_src_group("src/" "src/" "cpp" _FILES_SOURCE)
-
-ALPAKA_ADD_EXECUTABLE(
-    ${_TARGET_NAME}
-    ${_FILES_SOURCE})
-TARGET_INCLUDE_DIRECTORIES(
-    ${_TARGET_NAME}
-    PRIVATE ${Boost_INCLUDE_DIRS})
-TARGET_LINK_LIBRARIES(
-    ${_TARGET_NAME}
-    PRIVATE common)
-
-SET_TARGET_PROPERTIES(${_TARGET_NAME} PROPERTIES FOLDER "test/unit")
-
-ADD_TEST(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME} ${_ALPAKA_TEST_OPTIONS})
diff --git a/thirdParty/alpaka/test/unit/time/src/ClockTest.cpp b/thirdParty/alpaka/test/unit/time/src/ClockTest.cpp
deleted file mode 100644
index dd99b86d02..0000000000
--- a/thirdParty/alpaka/test/unit/time/src/ClockTest.cpp
+++ /dev/null
@@ -1,57 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, René Widera
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#include <alpaka/time/Traits.hpp>
-
-#include <alpaka/test/acc/TestAccs.hpp>
-#include <alpaka/test/KernelExecutionFixture.hpp>
-
-#include <catch2/catch.hpp>
-
-//#############################################################################
-class ClockTestKernel
-{
-public:
-    //-----------------------------------------------------------------------------
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<
-        typename TAcc>
-    ALPAKA_FN_ACC auto operator()(
-        TAcc const & acc,
-        bool * success) const
-    -> void
-    {
-        std::uint64_t const start(
-            alpaka::time::clock(acc));
-        ALPAKA_CHECK(*success, 0u != start);
-
-        std::uint64_t const end(
-            alpaka::time::clock(acc));
-        ALPAKA_CHECK(*success, 0u != end);
-
-        // 'end' has to be greater equal 'start'.
-        // CUDA clock will never be equal for two calls, but the clock implementations for CPUs can be.
-        ALPAKA_CHECK(*success, end >= start);
-    }
-};
-
-//-----------------------------------------------------------------------------
-TEMPLATE_LIST_TEST_CASE( "clockIsWorking", "[timeClock]", alpaka::test::acc::TestAccs)
-{
-    using Acc = TestType;
-    using Dim = alpaka::dim::Dim<Acc>;
-    using Idx = alpaka::idx::Idx<Acc>;
-
-    alpaka::test::KernelExecutionFixture<Acc> fixture(
-        alpaka::vec::Vec<Dim, Idx>::ones());
-
-    ClockTestKernel kernel;
-
-    REQUIRE(fixture(kernel));
-}
diff --git a/thirdParty/alpaka/test/unit/vec/CMakeLists.txt b/thirdParty/alpaka/test/unit/vec/CMakeLists.txt
deleted file mode 100644
index bcacd96981..0000000000
--- a/thirdParty/alpaka/test/unit/vec/CMakeLists.txt
+++ /dev/null
@@ -1,27 +0,0 @@
-#
-# Copyright 2014-2019 Benjamin Worpitz, Axel Huebl
-#
-# This file is part of Alpaka.
-#
-# This Source Code Form is subject to the terms of the Mozilla Public
-# License, v. 2.0. If a copy of the MPL was not distributed with this
-# file, You can obtain one at http://mozilla.org/MPL/2.0/.
-#
-
-SET(_TARGET_NAME "vec")
-
-append_recursive_files_add_to_src_group("src/" "src/" "cpp" _FILES_SOURCE)
-
-ALPAKA_ADD_EXECUTABLE(
-    ${_TARGET_NAME}
-    ${_FILES_SOURCE})
-TARGET_INCLUDE_DIRECTORIES(
-    ${_TARGET_NAME}
-    PRIVATE ${Boost_INCLUDE_DIRS})
-TARGET_LINK_LIBRARIES(
-    ${_TARGET_NAME}
-    PRIVATE common)
-
-SET_TARGET_PROPERTIES(${_TARGET_NAME} PROPERTIES FOLDER "test/unit")
-
-ADD_TEST(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME} ${_ALPAKA_TEST_OPTIONS})
diff --git a/thirdParty/alpaka/test/unit/vec/src/VecTest.cpp b/thirdParty/alpaka/test/unit/vec/src/VecTest.cpp
deleted file mode 100644
index 20ce0ffb52..0000000000
--- a/thirdParty/alpaka/test/unit/vec/src/VecTest.cpp
+++ /dev/null
@@ -1,353 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Erik Zenker
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#include <alpaka/vec/Vec.hpp>
-
-#include <alpaka/test/dim/TestDims.hpp>
-#include <alpaka/meta/ForEachType.hpp>
-
-#include <catch2/catch.hpp>
-
-//-----------------------------------------------------------------------------
-TEST_CASE("basicVecTraits", "[vec]")
-{
-    using Dim = alpaka::dim::DimInt<3u>;
-    using Idx = std::size_t;
-    using Vec = alpaka::vec::Vec<Dim, Idx>;
-
-    Vec const vec(
-        static_cast<Idx>(0u),
-        static_cast<Idx>(8u),
-        static_cast<Idx>(15u));
-
-
-
-    //-----------------------------------------------------------------------------
-    // alpaka::vec::Vec zero elements
-    {
-        using Dim0 = alpaka::dim::DimInt<0u>;
-        alpaka::vec::Vec<Dim0, Idx> const vec0{};
-    }
-
-    //-----------------------------------------------------------------------------
-    // alpaka::vec::subVecFromIndices
-    {
-        using IdxSequence =
-            alpaka::meta::IntegerSequence<
-                std::size_t,
-                0u,
-                Dim::value -1u,
-                0u>;
-        auto const vecSubIndices(
-            alpaka::vec::subVecFromIndices<
-                IdxSequence>(
-                    vec));
-
-        REQUIRE(vecSubIndices[0u] == vec[0u]);
-        REQUIRE(vecSubIndices[1u] == vec[Dim::value -1u]);
-        REQUIRE(vecSubIndices[2u] == vec[0u]);
-    }
-
-    //-----------------------------------------------------------------------------
-    // alpaka::vec::subVecBegin
-    {
-        using DimSubVecEnd =
-            alpaka::dim::DimInt<2u>;
-        auto const vecSubBegin(
-            alpaka::vec::subVecBegin<
-                DimSubVecEnd>(
-                    vec));
-
-        for(typename Dim::value_type i(0); i < DimSubVecEnd::value; ++i)
-        {
-            REQUIRE(vecSubBegin[i] == vec[i]);
-        }
-    }
-
-    //-----------------------------------------------------------------------------
-    // alpaka::vec::subVecEnd
-    {
-        using DimSubVecEnd =
-            alpaka::dim::DimInt<2u>;
-        auto const vecSubEnd(
-            alpaka::vec::subVecEnd<
-                DimSubVecEnd>(
-                    vec));
-
-        for(typename Dim::value_type i(0); i < DimSubVecEnd::value; ++i)
-        {
-            REQUIRE(vecSubEnd[i] == vec[Dim::value - DimSubVecEnd::value + i]);
-        }
-    }
-
-    //-----------------------------------------------------------------------------
-    // alpaka::vec::cast
-    {
-        using SizeCast = std::uint16_t;
-        auto const vecCast(
-            alpaka::vec::cast<
-                SizeCast>(
-                    vec));
-
-        /*using VecCastConst = decltype(vecCast);
-        using VecCast = typename std::decay<VecCastConst>::type;
-        static_assert(
-            std::is_same<
-                alpaka::idx::Idx<VecCast>,
-                SizeCast
-            >::value,
-            "The idx type of the casted vec is wrong");*/
-
-        for(typename Dim::value_type i(0); i < Dim::value; ++i)
-        {
-            REQUIRE(vecCast[i] == static_cast<SizeCast>(vec[i]));
-        }
-    }
-
-    //-----------------------------------------------------------------------------
-    // alpaka::vec::reverse
-    {
-        auto const vecReverse(
-            alpaka::vec::reverse(
-                vec));
-
-        for(typename Dim::value_type i(0); i < Dim::value; ++i)
-        {
-            REQUIRE(vecReverse[i] == vec[Dim::value - 1u - i]);
-        }
-    }
-
-    //-----------------------------------------------------------------------------
-    // alpaka::vec::concat
-    {
-        using Dim2 = alpaka::dim::DimInt<2u>;
-        alpaka::vec::Vec<Dim2, Idx> const vec2(
-            static_cast<Idx>(47u),
-            static_cast<Idx>(11u));
-
-        auto const vecConcat(
-            alpaka::vec::concat(
-                vec,
-                vec2));
-
-        static_assert(
-            std::is_same<alpaka::dim::Dim<std::decay<decltype(vecConcat)>::type>, alpaka::dim::DimInt<5u>>::value,
-            "Result dimension type of concatenation incorrect!");
-
-        for(typename Dim::value_type i(0); i < Dim::value; ++i)
-        {
-            REQUIRE(vecConcat[i] == vec[i]);
-        }
-        for(typename Dim2::value_type i(0); i < Dim2::value; ++i)
-        {
-            REQUIRE(vecConcat[Dim::value + i] == vec2[i]);
-        }
-    }
-
-    {
-        alpaka::vec::Vec<Dim, Idx> const vec3(
-            static_cast<Idx>(47u),
-            static_cast<Idx>(8u),
-            static_cast<Idx>(3u));
-
-        //-----------------------------------------------------------------------------
-        // alpaka::vec::Vec operator +
-        {
-            auto const vecLessEqual(vec + vec3);
-
-            static_assert(
-                std::is_same<alpaka::dim::Dim<std::decay<decltype(vecLessEqual)>::type>, Dim>::value,
-                "Result dimension type of operator <= incorrect!");
-
-            static_assert(
-                std::is_same<alpaka::idx::Idx<std::decay<decltype(vecLessEqual)>::type>, Idx>::value,
-                "Result idx type of operator <= incorrect!");
-
-            alpaka::vec::Vec<Dim, Idx> const referenceVec(
-                static_cast<Idx>(47u),
-                static_cast<Idx>(16u),
-                static_cast<Idx>(18u));
-
-            REQUIRE(referenceVec == vecLessEqual);
-        }
-
-        //-----------------------------------------------------------------------------
-        // alpaka::vec::Vec operator -
-        {
-            auto const vecLessEqual(vec - vec3);
-
-            static_assert(
-                std::is_same<alpaka::dim::Dim<std::decay<decltype(vecLessEqual)>::type>, Dim>::value,
-                "Result dimension type of operator <= incorrect!");
-
-            static_assert(
-                std::is_same<alpaka::idx::Idx<std::decay<decltype(vecLessEqual)>::type>, Idx>::value,
-                "Result idx type of operator <= incorrect!");
-
-            alpaka::vec::Vec<Dim, Idx> const referenceVec(
-                static_cast<Idx>(-47),
-                static_cast<Idx>(0u),
-                static_cast<Idx>(12u));
-
-            REQUIRE(referenceVec == vecLessEqual);
-        }
-
-        //-----------------------------------------------------------------------------
-        // alpaka::vec::Vec operator *
-        {
-            auto const vecLessEqual(vec * vec3);
-
-            static_assert(
-                std::is_same<alpaka::dim::Dim<std::decay<decltype(vecLessEqual)>::type>, Dim>::value,
-                "Result dimension type of operator <= incorrect!");
-
-            static_assert(
-                std::is_same<alpaka::idx::Idx<std::decay<decltype(vecLessEqual)>::type>, Idx>::value,
-                "Result idx type of operator <= incorrect!");
-
-            alpaka::vec::Vec<Dim, Idx> const referenceVec(
-                static_cast<Idx>(0u),
-                static_cast<Idx>(64u),
-                static_cast<Idx>(45u));
-
-            REQUIRE(referenceVec == vecLessEqual);
-        }
-
-        //-----------------------------------------------------------------------------
-        // alpaka::vec::Vec operator <
-        {
-            auto const vecLessEqual(vec < vec3);
-
-            static_assert(
-                std::is_same<alpaka::dim::Dim<std::decay<decltype(vecLessEqual)>::type>, Dim>::value,
-                "Result dimension type of operator <= incorrect!");
-
-            static_assert(
-                std::is_same<alpaka::idx::Idx<std::decay<decltype(vecLessEqual)>::type>, bool>::value,
-                "Result idx type of operator <= incorrect!");
-
-            alpaka::vec::Vec<Dim, bool> const referenceVec(
-                true,
-                false,
-                false);
-
-            REQUIRE(referenceVec == vecLessEqual);
-        }
-
-        //-----------------------------------------------------------------------------
-        // alpaka::vec::Vec operator <=
-        {
-            auto const vecLessEqual(vec <= vec3);
-
-            static_assert(
-                std::is_same<alpaka::dim::Dim<std::decay<decltype(vecLessEqual)>::type>, Dim>::value,
-                "Result dimension type of operator <= incorrect!");
-
-            static_assert(
-                std::is_same<alpaka::idx::Idx<std::decay<decltype(vecLessEqual)>::type>, bool>::value,
-                "Result idx type of operator <= incorrect!");
-
-            alpaka::vec::Vec<Dim, bool> const referenceVec(
-                true,
-                true,
-                false);
-
-            REQUIRE(referenceVec == vecLessEqual);
-        }
-
-        //-----------------------------------------------------------------------------
-        // alpaka::vec::Vec operator >=
-        {
-            auto const vecLessEqual(vec >= vec3);
-
-            static_assert(
-                std::is_same<alpaka::dim::Dim<std::decay<decltype(vecLessEqual)>::type>, Dim>::value,
-                "Result dimension type of operator <= incorrect!");
-
-            static_assert(
-                std::is_same<alpaka::idx::Idx<std::decay<decltype(vecLessEqual)>::type>, bool>::value,
-                "Result idx type of operator <= incorrect!");
-
-            alpaka::vec::Vec<Dim, bool> const referenceVec(
-                false,
-                true,
-                true);
-
-            REQUIRE(referenceVec == vecLessEqual);
-        }
-
-        //-----------------------------------------------------------------------------
-        // alpaka::vec::Vec operator >
-        {
-            auto const vecLessEqual(vec > vec3);
-
-            static_assert(
-                std::is_same<alpaka::dim::Dim<std::decay<decltype(vecLessEqual)>::type>, Dim>::value,
-                "Result dimension type of operator <= incorrect!");
-
-            static_assert(
-                std::is_same<alpaka::idx::Idx<std::decay<decltype(vecLessEqual)>::type>, bool>::value,
-                "Result idx type of operator <= incorrect!");
-
-            alpaka::vec::Vec<Dim, bool> const referenceVec(
-                false,
-                false,
-                true);
-
-            REQUIRE(referenceVec == vecLessEqual);
-        }
-    }
-}
-
-//#############################################################################
-template<
-    typename TDim,
-    typename TIdx>
-struct NonAlpakaVec
-{
-    //-----------------------------------------------------------------------------
-    operator ::alpaka::vec::Vec<
-        TDim,
-        TIdx>() const
-    {
-        using AlpakaVector = ::alpaka::vec::Vec<
-            TDim,
-            TIdx
-        >;
-        AlpakaVector result(AlpakaVector::zeros());
-
-        for(TIdx d(0); d < TDim::value; ++d)
-        {
-            result[TDim::value - 1 - d] = (*this)[d];
-        }
-
-        return result;
-    }
-    //-----------------------------------------------------------------------------
-    auto operator [](TIdx /*idx*/) const
-    -> TIdx
-    {
-        return static_cast<TIdx>(0);
-    }
-};
-
-//-----------------------------------------------------------------------------
-TEMPLATE_LIST_TEST_CASE( "vecNDConstructionFromNonAlpakaVec", "[vec]", alpaka::test::dim::TestDims)
-{
-    using Dim = TestType;
-    using Idx = std::size_t;
-
-    NonAlpakaVec<Dim, Idx> nonAlpakaVec;
-    auto const alpakaVec(static_cast<alpaka::vec::Vec<Dim, Idx>>(nonAlpakaVec));
-
-    for(Idx d(0); d < Dim::value; ++d)
-    {
-        REQUIRE(nonAlpakaVec[d] == alpakaVec[d]);
-    }
-}
diff --git a/thirdParty/catch2/catch_main/CMakeLists.txt b/thirdParty/catch2/catch_main/CMakeLists.txt
new file mode 100644
index 0000000000..4c9dd7589f
--- /dev/null
+++ b/thirdParty/catch2/catch_main/CMakeLists.txt
@@ -0,0 +1,49 @@
+#
+# Copyright 2015-2020 Benjamin Worpitz, Axel Huebl, Rene Widera
+#
+# This file is copied from alpaka.
+#
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+#
+
+option(USE_INTERNAL_CATCH2 "Use internally shipped Catch2" ON)
+
+if(USE_INTERNAL_CATCH2)
+    message(STATUS "Catch2: Using INTERNAL version 2.13.3")  # must match the bundled catch2/catch.hpp
+else()
+    find_package(Catch2 2.11.0 CONFIG REQUIRED)  # 2.11.0 is the requested version for an external Catch2
+    set_target_properties(Catch2::Catch2 PROPERTIES IMPORTED_GLOBAL TRUE)
+    message(STATUS "Catch2: Found version ${Catch2_VERSION}")
+endif()
+
+add_library(CatchMain src/CatchMain.cpp)
+# target_compile_features(CatchMain PUBLIC cxx_std_14)  # min C++14
+set_target_properties(CatchMain PROPERTIES
+    FOLDER "test"
+    CXX_STANDARD 14  # exactly C++14
+    CXX_EXTENSIONS OFF
+    CXX_STANDARD_REQUIRED ON
+    POSITION_INDEPENDENT_CODE ON
+    WINDOWS_EXPORT_ALL_SYMBOLS ON
+)
+
+target_compile_definitions(CatchMain PUBLIC "CATCH_CONFIG_FAST_COMPILE")
+if (CMAKE_CXX_COMPILER_ID STREQUAL "PGI")
+    # Workaround for STL atomic issue: https://forums.developer.nvidia.com/t/support-for-atomic-in-libstdc-missing/135403/2
+    # still appears in NVHPC 20.7
+    target_compile_definitions(CatchMain PUBLIC "__GCC_ATOMIC_TEST_AND_SET_TRUEVAL=1")
+endif()
+if(MSVC)
+    target_compile_definitions(CatchMain PUBLIC "CATCH_CONFIG_WINDOWS_CRTDBG")
+    target_compile_options(CatchMain PUBLIC "/bigobj")
+endif()
+
+if(USE_INTERNAL_CATCH2)
+    target_include_directories(CatchMain SYSTEM PUBLIC
+        ${CMAKE_CURRENT_LIST_DIR}/../include)
+else()
+    target_include_directories(CatchMain SYSTEM PUBLIC
+        $<TARGET_PROPERTY:Catch2::Catch2,INTERFACE_INCLUDE_DIRECTORIES>)
+endif()
diff --git a/thirdParty/catch2/catch_main/src/CatchMain.cpp b/thirdParty/catch2/catch_main/src/CatchMain.cpp
new file mode 100644
index 0000000000..992e94f13d
--- /dev/null
+++ b/thirdParty/catch2/catch_main/src/CatchMain.cpp
@@ -0,0 +1,10 @@
+/* Copyright 2019 Axel Huebl
+ *
+ * This file is part of alpaka.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+#define CATCH_CONFIG_MAIN
+#include <catch2/catch.hpp>
diff --git a/thirdParty/catch2/include/catch2/catch.hpp b/thirdParty/catch2/include/catch2/catch.hpp
new file mode 100644
index 0000000000..2a2d77a27f
--- /dev/null
+++ b/thirdParty/catch2/include/catch2/catch.hpp
@@ -0,0 +1,17877 @@
+/*
+ *  Catch v2.13.3
+ *  Generated: 2020-10-31 18:20:31.045274
+ *  ----------------------------------------------------------
+ *  This file has been merged from multiple headers. Please don't edit it directly
+ *  Copyright (c) 2020 Two Blue Cubes Ltd. All rights reserved.
+ *
+ *  Distributed under the Boost Software License, Version 1.0. (See accompanying
+ *  file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+ */
+#ifndef TWOBLUECUBES_SINGLE_INCLUDE_CATCH_HPP_INCLUDED
+#define TWOBLUECUBES_SINGLE_INCLUDE_CATCH_HPP_INCLUDED
+// start catch.hpp
+
+
+#define CATCH_VERSION_MAJOR 2
+#define CATCH_VERSION_MINOR 13
+#define CATCH_VERSION_PATCH 3
+
+#ifdef __clang__
+#    pragma clang system_header
+#elif defined __GNUC__
+#    pragma GCC system_header
+#endif
+
+// start catch_suppress_warnings.h
+
+#ifdef __clang__
+#   ifdef __ICC // icpc defines the __clang__ macro
+#       pragma warning(push)
+#       pragma warning(disable: 161 1682)
+#   else // __ICC
+#       pragma clang diagnostic push
+#       pragma clang diagnostic ignored "-Wpadded"
+#       pragma clang diagnostic ignored "-Wswitch-enum"
+#       pragma clang diagnostic ignored "-Wcovered-switch-default"
+#    endif
+#elif defined __GNUC__
+     // Because REQUIREs trigger GCC's -Wparentheses, and because the still
+     // supported versions of g++ have only buggy support for _Pragmas,
+     // -Wparentheses has to be suppressed globally.
+#    pragma GCC diagnostic ignored "-Wparentheses" // See #674 for details
+
+#    pragma GCC diagnostic push
+#    pragma GCC diagnostic ignored "-Wunused-variable"
+#    pragma GCC diagnostic ignored "-Wpadded"
+#endif
+// end catch_suppress_warnings.h
+#if defined(CATCH_CONFIG_MAIN) || defined(CATCH_CONFIG_RUNNER)
+#  define CATCH_IMPL
+#  define CATCH_CONFIG_ALL_PARTS
+#endif
+
+// In the impl file, we want to have access to all parts of the headers
+// Can also be used to sanely support PCHs
+#if defined(CATCH_CONFIG_ALL_PARTS)
+#  define CATCH_CONFIG_EXTERNAL_INTERFACES
+#  if defined(CATCH_CONFIG_DISABLE_MATCHERS)
+#    undef CATCH_CONFIG_DISABLE_MATCHERS
+#  endif
+#  if !defined(CATCH_CONFIG_ENABLE_CHRONO_STRINGMAKER)
+#    define CATCH_CONFIG_ENABLE_CHRONO_STRINGMAKER
+#  endif
+#endif
+
+#if !defined(CATCH_CONFIG_IMPL_ONLY)
+// start catch_platform.h
+
+#ifdef __APPLE__
+# include <TargetConditionals.h>
+# if TARGET_OS_OSX == 1
+#  define CATCH_PLATFORM_MAC
+# elif TARGET_OS_IPHONE == 1
+#  define CATCH_PLATFORM_IPHONE
+# endif
+
+#elif defined(linux) || defined(__linux) || defined(__linux__)
+#  define CATCH_PLATFORM_LINUX
+
+#elif defined(WIN32) || defined(__WIN32__) || defined(_WIN32) || defined(_MSC_VER) || defined(__MINGW32__)
+#  define CATCH_PLATFORM_WINDOWS
+#endif
+
+// end catch_platform.h
+
+#ifdef CATCH_IMPL
+#  ifndef CLARA_CONFIG_MAIN
+#    define CLARA_CONFIG_MAIN_NOT_DEFINED
+#    define CLARA_CONFIG_MAIN
+#  endif
+#endif
+
+// start catch_user_interfaces.h
+
+namespace Catch {
+    unsigned int rngSeed();
+}
+
+// end catch_user_interfaces.h
+// start catch_tag_alias_autoregistrar.h
+
+// start catch_common.h
+
+// start catch_compiler_capabilities.h
+
+// Detect a number of compiler features - by compiler
+// The following features are defined:
+//
+// CATCH_CONFIG_COUNTER : is the __COUNTER__ macro supported?
+// CATCH_CONFIG_WINDOWS_SEH : is Windows SEH supported?
+// CATCH_CONFIG_POSIX_SIGNALS : are POSIX signals supported?
+// CATCH_CONFIG_DISABLE_EXCEPTIONS : are exceptions disabled?
+// ****************
+// Note to maintainers: if new toggles are added please document them
+// in configuration.md, too
+// ****************
+
+// In general each macro has a _NO_<feature name> form
+// (e.g. CATCH_CONFIG_NO_POSIX_SIGNALS) which disables the feature.
+// Many features, at point of detection, define an _INTERNAL_ macro, so they
+// can be combined, en masse, with the _NO_ forms later.
+
+#ifdef __cplusplus
+
+#  if (__cplusplus >= 201402L) || (defined(_MSVC_LANG) && _MSVC_LANG >= 201402L)
+#    define CATCH_CPP14_OR_GREATER
+#  endif
+
+#  if (__cplusplus >= 201703L) || (defined(_MSVC_LANG) && _MSVC_LANG >= 201703L)
+#    define CATCH_CPP17_OR_GREATER
+#  endif
+
+#endif
+
+// We have to avoid both ICC and Clang, because they try to mask themselves
+// as gcc, and we want only GCC in this block
+#if defined(__GNUC__) && !defined(__clang__) && !defined(__ICC) && !defined(__CUDACC__)
+#    define CATCH_INTERNAL_START_WARNINGS_SUPPRESSION _Pragma( "GCC diagnostic push" )
+#    define CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION  _Pragma( "GCC diagnostic pop" )
+
+#    define CATCH_INTERNAL_IGNORE_BUT_WARN(...) (void)__builtin_constant_p(__VA_ARGS__)
+
+#endif
+
+#if defined(__clang__)
+
+#    define CATCH_INTERNAL_START_WARNINGS_SUPPRESSION _Pragma( "clang diagnostic push" )
+#    define CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION  _Pragma( "clang diagnostic pop" )
+
+// As of this writing, IBM XL's implementation of __builtin_constant_p has a bug
+// which results in calls to destructors being emitted for each temporary,
+// without a matching initialization. In practice, this can result in something
+// like `std::string::~string` being called on an uninitialized value.
+//
+// For example, this code will likely segfault under IBM XL:
+// ```
+// REQUIRE(std::string("12") + "34" == "1234")
+// ```
+//
+// Therefore, `CATCH_INTERNAL_IGNORE_BUT_WARN` is not implemented.
+#  if !defined(__ibmxl__) && !defined(__CUDACC__)
+#    define CATCH_INTERNAL_IGNORE_BUT_WARN(...) (void)__builtin_constant_p(__VA_ARGS__) /* NOLINT(cppcoreguidelines-pro-type-vararg, hicpp-vararg) */
+#  endif
+
+#    define CATCH_INTERNAL_SUPPRESS_GLOBALS_WARNINGS \
+         _Pragma( "clang diagnostic ignored \"-Wexit-time-destructors\"" ) \
+         _Pragma( "clang diagnostic ignored \"-Wglobal-constructors\"")
+
+#    define CATCH_INTERNAL_SUPPRESS_PARENTHESES_WARNINGS \
+         _Pragma( "clang diagnostic ignored \"-Wparentheses\"" )
+
+#    define CATCH_INTERNAL_SUPPRESS_UNUSED_WARNINGS \
+         _Pragma( "clang diagnostic ignored \"-Wunused-variable\"" )
+
+#    define CATCH_INTERNAL_SUPPRESS_ZERO_VARIADIC_WARNINGS \
+         _Pragma( "clang diagnostic ignored \"-Wgnu-zero-variadic-macro-arguments\"" )
+
+#    define CATCH_INTERNAL_SUPPRESS_UNUSED_TEMPLATE_WARNINGS \
+         _Pragma( "clang diagnostic ignored \"-Wunused-template\"" )
+
+#endif // __clang__
+
+////////////////////////////////////////////////////////////////////////////////
+// Assume that non-Windows platforms support posix signals by default
+#if !defined(CATCH_PLATFORM_WINDOWS)
+    #define CATCH_INTERNAL_CONFIG_POSIX_SIGNALS
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// We know some environments not to support full POSIX signals
+#if defined(__CYGWIN__) || defined(__QNX__) || defined(__EMSCRIPTEN__) || defined(__DJGPP__)
+    #define CATCH_INTERNAL_CONFIG_NO_POSIX_SIGNALS
+#endif
+
+#ifdef __OS400__
+#       define CATCH_INTERNAL_CONFIG_NO_POSIX_SIGNALS
+#       define CATCH_CONFIG_COLOUR_NONE
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Android somehow still does not support std::to_string
+#if defined(__ANDROID__)
+#    define CATCH_INTERNAL_CONFIG_NO_CPP11_TO_STRING
+#    define CATCH_INTERNAL_CONFIG_ANDROID_LOGWRITE
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Not all Windows environments support SEH properly
+#if defined(__MINGW32__)
+#    define CATCH_INTERNAL_CONFIG_NO_WINDOWS_SEH
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// PS4
+#if defined(__ORBIS__)
+#    define CATCH_INTERNAL_CONFIG_NO_NEW_CAPTURE
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Cygwin
+#ifdef __CYGWIN__
+
+// Required for some versions of Cygwin to declare gettimeofday
+// see: http://stackoverflow.com/questions/36901803/gettimeofday-not-declared-in-this-scope-cygwin
+#   define _BSD_SOURCE
+// some versions of cygwin (most) do not support std::to_string. Use the libstd check.
+// https://gcc.gnu.org/onlinedocs/gcc-4.8.2/libstdc++/api/a01053_source.html line 2812-2813
+# if !((__cplusplus >= 201103L) && defined(_GLIBCXX_USE_C99) \
+           && !defined(_GLIBCXX_HAVE_BROKEN_VSWPRINTF))
+
+#    define CATCH_INTERNAL_CONFIG_NO_CPP11_TO_STRING
+
+# endif
+#endif // __CYGWIN__
+
+////////////////////////////////////////////////////////////////////////////////
+// Visual C++
+#if defined(_MSC_VER)
+
+#  define CATCH_INTERNAL_START_WARNINGS_SUPPRESSION __pragma( warning(push) )
+#  define CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION  __pragma( warning(pop) )
+
+// Universal Windows platform does not support SEH
+// Or console colours (or console at all...)
+#  if defined(WINAPI_FAMILY) && (WINAPI_FAMILY == WINAPI_FAMILY_APP)
+#    define CATCH_CONFIG_COLOUR_NONE
+#  else
+#    define CATCH_INTERNAL_CONFIG_WINDOWS_SEH
+#  endif
+
+// MSVC traditional preprocessor needs some workaround for __VA_ARGS__
+// _MSVC_TRADITIONAL == 0 means new conformant preprocessor
+// _MSVC_TRADITIONAL == 1 means old traditional non-conformant preprocessor
+#  if !defined(__clang__) // Handle Clang masquerading for msvc
+#    if !defined(_MSVC_TRADITIONAL) || (defined(_MSVC_TRADITIONAL) && _MSVC_TRADITIONAL)
+#      define CATCH_INTERNAL_CONFIG_TRADITIONAL_MSVC_PREPROCESSOR
+#    endif // MSVC_TRADITIONAL
+#  endif // __clang__
+
+#endif // _MSC_VER
+
+#if defined(_REENTRANT) || defined(_MSC_VER)
+// Enable async processing, as -pthread is specified or no additional linking is required
+# define CATCH_INTERNAL_CONFIG_USE_ASYNC
+#endif // defined(_REENTRANT) || defined(_MSC_VER)
+
+////////////////////////////////////////////////////////////////////////////////
+// Check if we are compiled with -fno-exceptions or equivalent
+#if defined(__EXCEPTIONS) || defined(__cpp_exceptions) || defined(_CPPUNWIND)
+#  define CATCH_INTERNAL_CONFIG_EXCEPTIONS_ENABLED
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// DJGPP
+#ifdef __DJGPP__
+#  define CATCH_INTERNAL_CONFIG_NO_WCHAR
+#endif // __DJGPP__
+
+////////////////////////////////////////////////////////////////////////////////
+// Embarcadero C++Builder
+#if defined(__BORLANDC__)
+    #define CATCH_INTERNAL_CONFIG_POLYFILL_ISNAN
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+
+// Use of __COUNTER__ is suppressed during code analysis in
+// CLion/AppCode 2017.2.x and former, because __COUNTER__ is not properly
+// handled by it.
+// Otherwise all supported compilers support COUNTER macro,
+// but user still might want to turn it off
+#if ( !defined(__JETBRAINS_IDE__) || __JETBRAINS_IDE__ >= 20170300L )
+    #define CATCH_INTERNAL_CONFIG_COUNTER
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+
+// RTX is a special version of Windows that is real time.
+// This means that it is detected as Windows, but does not provide
+// the same set of capabilities as real Windows does.
+#if defined(UNDER_RTSS) || defined(RTX64_BUILD)
+    #define CATCH_INTERNAL_CONFIG_NO_WINDOWS_SEH
+    #define CATCH_INTERNAL_CONFIG_NO_ASYNC
+    #define CATCH_CONFIG_COLOUR_NONE
+#endif
+
+#if !defined(_GLIBCXX_USE_C99_MATH_TR1)
+#define CATCH_INTERNAL_CONFIG_GLOBAL_NEXTAFTER
+#endif
+
+// Various stdlib support checks that require __has_include
+#if defined(__has_include)
+  // Check if string_view is available and usable
+  #if __has_include(<string_view>) && defined(CATCH_CPP17_OR_GREATER)
+  #    define CATCH_INTERNAL_CONFIG_CPP17_STRING_VIEW
+  #endif
+
+  // Check if optional is available and usable
+  #  if __has_include(<optional>) && defined(CATCH_CPP17_OR_GREATER)
+  #    define CATCH_INTERNAL_CONFIG_CPP17_OPTIONAL
+  #  endif // __has_include(<optional>) && defined(CATCH_CPP17_OR_GREATER)
+
+  // Check if byte is available and usable
+  #  if __has_include(<cstddef>) && defined(CATCH_CPP17_OR_GREATER)
+  #    include <cstddef>
+  #    if __cpp_lib_byte > 0
+  #      define CATCH_INTERNAL_CONFIG_CPP17_BYTE
+  #    endif
+  #  endif // __has_include(<cstddef>) && defined(CATCH_CPP17_OR_GREATER)
+
+  // Check if variant is available and usable
+  #  if __has_include(<variant>) && defined(CATCH_CPP17_OR_GREATER)
+  #    if defined(__clang__) && (__clang_major__ < 8)
+         // work around clang bug with libstdc++ https://bugs.llvm.org/show_bug.cgi?id=31852
+         // fix should be in clang 8, workaround in libstdc++ 8.2
+  #      include <ciso646>
+  #      if defined(__GLIBCXX__) && defined(_GLIBCXX_RELEASE) && (_GLIBCXX_RELEASE < 9)
+  #        define CATCH_CONFIG_NO_CPP17_VARIANT
+  #      else
+  #        define CATCH_INTERNAL_CONFIG_CPP17_VARIANT
+  #      endif // defined(__GLIBCXX__) && defined(_GLIBCXX_RELEASE) && (_GLIBCXX_RELEASE < 9)
+  #    else
+  #      define CATCH_INTERNAL_CONFIG_CPP17_VARIANT
+  #    endif // defined(__clang__) && (__clang_major__ < 8)
+  #  endif // __has_include(<variant>) && defined(CATCH_CPP17_OR_GREATER)
+#endif // defined(__has_include)
+
+#if defined(CATCH_INTERNAL_CONFIG_COUNTER) && !defined(CATCH_CONFIG_NO_COUNTER) && !defined(CATCH_CONFIG_COUNTER)
+#   define CATCH_CONFIG_COUNTER
+#endif
+#if defined(CATCH_INTERNAL_CONFIG_WINDOWS_SEH) && !defined(CATCH_CONFIG_NO_WINDOWS_SEH) && !defined(CATCH_CONFIG_WINDOWS_SEH) && !defined(CATCH_INTERNAL_CONFIG_NO_WINDOWS_SEH)
+#   define CATCH_CONFIG_WINDOWS_SEH
+#endif
+// This is set by default, because we assume that unix compilers are posix-signal-compatible by default.
+#if defined(CATCH_INTERNAL_CONFIG_POSIX_SIGNALS) && !defined(CATCH_INTERNAL_CONFIG_NO_POSIX_SIGNALS) && !defined(CATCH_CONFIG_NO_POSIX_SIGNALS) && !defined(CATCH_CONFIG_POSIX_SIGNALS)
+#   define CATCH_CONFIG_POSIX_SIGNALS
+#endif
+// This is set by default, because we assume that compilers with no wchar_t support are just rare exceptions.
+#if !defined(CATCH_INTERNAL_CONFIG_NO_WCHAR) && !defined(CATCH_CONFIG_NO_WCHAR) && !defined(CATCH_CONFIG_WCHAR)
+#   define CATCH_CONFIG_WCHAR
+#endif
+
+#if !defined(CATCH_INTERNAL_CONFIG_NO_CPP11_TO_STRING) && !defined(CATCH_CONFIG_NO_CPP11_TO_STRING) && !defined(CATCH_CONFIG_CPP11_TO_STRING)
+#    define CATCH_CONFIG_CPP11_TO_STRING
+#endif
+
+#if defined(CATCH_INTERNAL_CONFIG_CPP17_OPTIONAL) && !defined(CATCH_CONFIG_NO_CPP17_OPTIONAL) && !defined(CATCH_CONFIG_CPP17_OPTIONAL)
+#  define CATCH_CONFIG_CPP17_OPTIONAL
+#endif
+
+#if defined(CATCH_INTERNAL_CONFIG_CPP17_STRING_VIEW) && !defined(CATCH_CONFIG_NO_CPP17_STRING_VIEW) && !defined(CATCH_CONFIG_CPP17_STRING_VIEW)
+#  define CATCH_CONFIG_CPP17_STRING_VIEW
+#endif
+
+#if defined(CATCH_INTERNAL_CONFIG_CPP17_VARIANT) && !defined(CATCH_CONFIG_NO_CPP17_VARIANT) && !defined(CATCH_CONFIG_CPP17_VARIANT)
+#  define CATCH_CONFIG_CPP17_VARIANT
+#endif
+
+#if defined(CATCH_INTERNAL_CONFIG_CPP17_BYTE) && !defined(CATCH_CONFIG_NO_CPP17_BYTE) && !defined(CATCH_CONFIG_CPP17_BYTE)
+#  define CATCH_CONFIG_CPP17_BYTE
+#endif
+
+#if defined(CATCH_CONFIG_EXPERIMENTAL_REDIRECT)
+#  define CATCH_INTERNAL_CONFIG_NEW_CAPTURE
+#endif
+
+#if defined(CATCH_INTERNAL_CONFIG_NEW_CAPTURE) && !defined(CATCH_INTERNAL_CONFIG_NO_NEW_CAPTURE) && !defined(CATCH_CONFIG_NO_NEW_CAPTURE) && !defined(CATCH_CONFIG_NEW_CAPTURE)
+#  define CATCH_CONFIG_NEW_CAPTURE
+#endif
+
+#if !defined(CATCH_INTERNAL_CONFIG_EXCEPTIONS_ENABLED) && !defined(CATCH_CONFIG_DISABLE_EXCEPTIONS)
+#  define CATCH_CONFIG_DISABLE_EXCEPTIONS
+#endif
+
+#if defined(CATCH_INTERNAL_CONFIG_POLYFILL_ISNAN) && !defined(CATCH_CONFIG_NO_POLYFILL_ISNAN) && !defined(CATCH_CONFIG_POLYFILL_ISNAN)
+#  define CATCH_CONFIG_POLYFILL_ISNAN
+#endif
+
+#if defined(CATCH_INTERNAL_CONFIG_USE_ASYNC)  && !defined(CATCH_INTERNAL_CONFIG_NO_ASYNC) && !defined(CATCH_CONFIG_NO_USE_ASYNC) && !defined(CATCH_CONFIG_USE_ASYNC)
+#  define CATCH_CONFIG_USE_ASYNC
+#endif
+
+#if defined(CATCH_INTERNAL_CONFIG_ANDROID_LOGWRITE) && !defined(CATCH_CONFIG_NO_ANDROID_LOGWRITE) && !defined(CATCH_CONFIG_ANDROID_LOGWRITE)
+#  define CATCH_CONFIG_ANDROID_LOGWRITE
+#endif
+
+#if defined(CATCH_INTERNAL_CONFIG_GLOBAL_NEXTAFTER) && !defined(CATCH_CONFIG_NO_GLOBAL_NEXTAFTER) && !defined(CATCH_CONFIG_GLOBAL_NEXTAFTER)
+#  define CATCH_CONFIG_GLOBAL_NEXTAFTER
+#endif
+
+// Even if we do not think the compiler has that warning, we still have
+// to provide a macro that can be used by the code.
+#if !defined(CATCH_INTERNAL_START_WARNINGS_SUPPRESSION)
+#   define CATCH_INTERNAL_START_WARNINGS_SUPPRESSION
+#endif
+#if !defined(CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION)
+#   define CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION
+#endif
+#if !defined(CATCH_INTERNAL_SUPPRESS_PARENTHESES_WARNINGS)
+#   define CATCH_INTERNAL_SUPPRESS_PARENTHESES_WARNINGS
+#endif
+#if !defined(CATCH_INTERNAL_SUPPRESS_GLOBALS_WARNINGS)
+#   define CATCH_INTERNAL_SUPPRESS_GLOBALS_WARNINGS
+#endif
+#if !defined(CATCH_INTERNAL_SUPPRESS_UNUSED_WARNINGS)
+#   define CATCH_INTERNAL_SUPPRESS_UNUSED_WARNINGS
+#endif
+#if !defined(CATCH_INTERNAL_SUPPRESS_ZERO_VARIADIC_WARNINGS)
+#   define CATCH_INTERNAL_SUPPRESS_ZERO_VARIADIC_WARNINGS
+#endif
+
+// The goal of this macro is to avoid evaluation of the arguments, but
+// still have the compiler warn on problems inside...
+#if !defined(CATCH_INTERNAL_IGNORE_BUT_WARN)
+#   define CATCH_INTERNAL_IGNORE_BUT_WARN(...)
+#endif
+
+#if defined(__APPLE__) && defined(__apple_build_version__) && (__clang_major__ < 10)
+#   undef CATCH_INTERNAL_SUPPRESS_UNUSED_TEMPLATE_WARNINGS
+#elif defined(__clang__) && (__clang_major__ < 5)
+#   undef CATCH_INTERNAL_SUPPRESS_UNUSED_TEMPLATE_WARNINGS
+#endif
+
+#if !defined(CATCH_INTERNAL_SUPPRESS_UNUSED_TEMPLATE_WARNINGS)
+#   define CATCH_INTERNAL_SUPPRESS_UNUSED_TEMPLATE_WARNINGS
+#endif
+
+#if defined(CATCH_CONFIG_DISABLE_EXCEPTIONS)
+#define CATCH_TRY if ((true))
+#define CATCH_CATCH_ALL if ((false))
+#define CATCH_CATCH_ANON(type) if ((false))
+#else
+#define CATCH_TRY try
+#define CATCH_CATCH_ALL catch (...)
+#define CATCH_CATCH_ANON(type) catch (type)
+#endif
+
+#if defined(CATCH_INTERNAL_CONFIG_TRADITIONAL_MSVC_PREPROCESSOR) && !defined(CATCH_CONFIG_NO_TRADITIONAL_MSVC_PREPROCESSOR) && !defined(CATCH_CONFIG_TRADITIONAL_MSVC_PREPROCESSOR)
+#define CATCH_CONFIG_TRADITIONAL_MSVC_PREPROCESSOR
+#endif
+
+// end catch_compiler_capabilities.h
+#define INTERNAL_CATCH_UNIQUE_NAME_LINE2( name, line ) name##line
+#define INTERNAL_CATCH_UNIQUE_NAME_LINE( name, line ) INTERNAL_CATCH_UNIQUE_NAME_LINE2( name, line )
+#ifdef CATCH_CONFIG_COUNTER
+#  define INTERNAL_CATCH_UNIQUE_NAME( name ) INTERNAL_CATCH_UNIQUE_NAME_LINE( name, __COUNTER__ )
+#else
+#  define INTERNAL_CATCH_UNIQUE_NAME( name ) INTERNAL_CATCH_UNIQUE_NAME_LINE( name, __LINE__ )
+#endif
+
+#include <iosfwd>
+#include <string>
+#include <cstdint>
+
+// We need a dummy global operator<< so we can bring it into Catch namespace later
+struct Catch_global_namespace_dummy {};
+std::ostream& operator<<(std::ostream&, Catch_global_namespace_dummy);
+
+namespace Catch {
+
+    struct CaseSensitive { enum Choice {
+        Yes,
+        No
+    }; };
+
+    class NonCopyable {
+        NonCopyable( NonCopyable const& )              = delete;
+        NonCopyable( NonCopyable && )                  = delete;
+        NonCopyable& operator = ( NonCopyable const& ) = delete;
+        NonCopyable& operator = ( NonCopyable && )     = delete;
+
+    protected:
+        NonCopyable();
+        virtual ~NonCopyable();
+    };
+
+    struct SourceLineInfo {
+
+        SourceLineInfo() = delete;
+        SourceLineInfo( char const* _file, std::size_t _line ) noexcept
+        :   file( _file ),
+            line( _line )
+        {}
+
+        SourceLineInfo( SourceLineInfo const& other )            = default;
+        SourceLineInfo& operator = ( SourceLineInfo const& )     = default;
+        SourceLineInfo( SourceLineInfo&& )              noexcept = default;
+        SourceLineInfo& operator = ( SourceLineInfo&& ) noexcept = default;
+
+        bool empty() const noexcept { return file[0] == '\0'; }
+        bool operator == ( SourceLineInfo const& other ) const noexcept;
+        bool operator < ( SourceLineInfo const& other ) const noexcept;
+
+        char const* file;
+        std::size_t line;
+    };
+
+    std::ostream& operator << ( std::ostream& os, SourceLineInfo const& info );
+
+    // Bring in operator<< from global namespace into Catch namespace
+    // This is necessary because the overload of operator<< above makes
+    // lookup stop at namespace Catch
+    using ::operator<<;
+
+    // Use this in variadic streaming macros to allow
+    //    >> +StreamEndStop
+    // as well as
+    //    >> stuff +StreamEndStop
+    struct StreamEndStop {
+        std::string operator+() const;
+    };
+    template<typename T>
+    T const& operator + ( T const& value, StreamEndStop ) {
+        return value;
+    }
+}
+
+#define CATCH_INTERNAL_LINEINFO \
+    ::Catch::SourceLineInfo( __FILE__, static_cast<std::size_t>( __LINE__ ) )
+
+// end catch_common.h
+namespace Catch {
+
+    struct RegistrarForTagAliases {
+        RegistrarForTagAliases( char const* alias, char const* tag, SourceLineInfo const& lineInfo );
+    };
+
+} // end namespace Catch
+
+#define CATCH_REGISTER_TAG_ALIAS( alias, spec ) \
+    CATCH_INTERNAL_START_WARNINGS_SUPPRESSION \
+    CATCH_INTERNAL_SUPPRESS_GLOBALS_WARNINGS \
+    namespace{ Catch::RegistrarForTagAliases INTERNAL_CATCH_UNIQUE_NAME( AutoRegisterTagAlias )( alias, spec, CATCH_INTERNAL_LINEINFO ); } \
+    CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION
+
+// end catch_tag_alias_autoregistrar.h
+// start catch_test_registry.h
+
+// start catch_interfaces_testcase.h
+
+#include <vector>
+
+namespace Catch {
+
+    class TestSpec;
+
+    struct ITestInvoker {
+        virtual void invoke () const = 0;
+        virtual ~ITestInvoker();
+    };
+
+    class TestCase;
+    struct IConfig;
+
+    struct ITestCaseRegistry {
+        virtual ~ITestCaseRegistry();
+        virtual std::vector<TestCase> const& getAllTests() const = 0;
+        virtual std::vector<TestCase> const& getAllTestsSorted( IConfig const& config ) const = 0;
+    };
+
+    bool isThrowSafe( TestCase const& testCase, IConfig const& config );
+    bool matchTest( TestCase const& testCase, TestSpec const& testSpec, IConfig const& config );
+    std::vector<TestCase> filterTests( std::vector<TestCase> const& testCases, TestSpec const& testSpec, IConfig const& config );
+    std::vector<TestCase> const& getAllTestCasesSorted( IConfig const& config );
+
+}
+
+// end catch_interfaces_testcase.h
+// start catch_stringref.h
+
+#include <cstddef>
+#include <string>
+#include <iosfwd>
+#include <cassert>
+
+namespace Catch {
+
+    /// A non-owning string class (similar to the forthcoming std::string_view)
+    /// Note that, because a StringRef may be a substring of another string,
+    /// it may not be null terminated.
+    class StringRef {
+    public:
+        using size_type = std::size_t;
+        using const_iterator = const char*;
+
+    private:
+        static constexpr char const* const s_empty = "";
+
+        char const* m_start = s_empty;
+        size_type m_size = 0;
+
+    public: // construction
+        constexpr StringRef() noexcept = default;
+
+        StringRef( char const* rawChars ) noexcept;
+
+        constexpr StringRef( char const* rawChars, size_type size ) noexcept
+        :   m_start( rawChars ),
+            m_size( size )
+        {}
+
+        StringRef( std::string const& stdString ) noexcept
+        :   m_start( stdString.c_str() ),
+            m_size( stdString.size() )
+        {}
+
+        explicit operator std::string() const {
+            return std::string(m_start, m_size);
+        }
+
+    public: // operators
+        auto operator == ( StringRef const& other ) const noexcept -> bool;
+        auto operator != (StringRef const& other) const noexcept -> bool {
+            return !(*this == other);
+        }
+
+        auto operator[] ( size_type index ) const noexcept -> char {
+            assert(index < m_size);
+            return m_start[index];
+        }
+
+    public: // named queries
+        constexpr auto empty() const noexcept -> bool {
+            return m_size == 0;
+        }
+        constexpr auto size() const noexcept -> size_type {
+            return m_size;
+        }
+
+        // Returns the current start pointer. If the StringRef is not
+        // null-terminated, throws std::domain_error
+        auto c_str() const -> char const*;
+
+    public: // substrings and searches
+        // Returns a substring of [start, start + length).
+        // If start + length > size(), then the substring is [start, size()).
+        // If start > size(), then the substring is empty.
+        auto substr( size_type start, size_type length ) const noexcept -> StringRef;
+
+        // Returns the current start pointer. May not be null-terminated.
+        auto data() const noexcept -> char const*;
+
+        constexpr auto isNullTerminated() const noexcept -> bool {
+            return m_start[m_size] == '\0';
+        }
+
+    public: // iterators
+        constexpr const_iterator begin() const { return m_start; }
+        constexpr const_iterator end() const { return m_start + m_size; }
+    };
+
+    auto operator += ( std::string& lhs, StringRef const& sr ) -> std::string&;
+    auto operator << ( std::ostream& os, StringRef const& sr ) -> std::ostream&;
+
+    constexpr auto operator "" _sr( char const* rawChars, std::size_t size ) noexcept -> StringRef {
+        return StringRef( rawChars, size );
+    }
+} // namespace Catch
+
+constexpr auto operator "" _catch_sr( char const* rawChars, std::size_t size ) noexcept -> Catch::StringRef {
+    return Catch::StringRef( rawChars, size );
+}
+
+// end catch_stringref.h
+// start catch_preprocessor.hpp
+
+
+#define CATCH_RECURSION_LEVEL0(...) __VA_ARGS__
+#define CATCH_RECURSION_LEVEL1(...) CATCH_RECURSION_LEVEL0(CATCH_RECURSION_LEVEL0(CATCH_RECURSION_LEVEL0(__VA_ARGS__)))
+#define CATCH_RECURSION_LEVEL2(...) CATCH_RECURSION_LEVEL1(CATCH_RECURSION_LEVEL1(CATCH_RECURSION_LEVEL1(__VA_ARGS__)))
+#define CATCH_RECURSION_LEVEL3(...) CATCH_RECURSION_LEVEL2(CATCH_RECURSION_LEVEL2(CATCH_RECURSION_LEVEL2(__VA_ARGS__)))
+#define CATCH_RECURSION_LEVEL4(...) CATCH_RECURSION_LEVEL3(CATCH_RECURSION_LEVEL3(CATCH_RECURSION_LEVEL3(__VA_ARGS__)))
+#define CATCH_RECURSION_LEVEL5(...) CATCH_RECURSION_LEVEL4(CATCH_RECURSION_LEVEL4(CATCH_RECURSION_LEVEL4(__VA_ARGS__)))
+
+#ifdef CATCH_CONFIG_TRADITIONAL_MSVC_PREPROCESSOR
+#define INTERNAL_CATCH_EXPAND_VARGS(...) __VA_ARGS__
+// MSVC needs more evaluations
+#define CATCH_RECURSION_LEVEL6(...) CATCH_RECURSION_LEVEL5(CATCH_RECURSION_LEVEL5(CATCH_RECURSION_LEVEL5(__VA_ARGS__)))
+#define CATCH_RECURSE(...)  CATCH_RECURSION_LEVEL6(CATCH_RECURSION_LEVEL6(__VA_ARGS__))
+#else
+#define CATCH_RECURSE(...)  CATCH_RECURSION_LEVEL5(__VA_ARGS__)
+#endif
+
+#define CATCH_REC_END(...)
+#define CATCH_REC_OUT
+
+#define CATCH_EMPTY()
+#define CATCH_DEFER(id) id CATCH_EMPTY()
+
+#define CATCH_REC_GET_END2() 0, CATCH_REC_END
+#define CATCH_REC_GET_END1(...) CATCH_REC_GET_END2
+#define CATCH_REC_GET_END(...) CATCH_REC_GET_END1
+#define CATCH_REC_NEXT0(test, next, ...) next CATCH_REC_OUT
+#define CATCH_REC_NEXT1(test, next) CATCH_DEFER ( CATCH_REC_NEXT0 ) ( test, next, 0)
+#define CATCH_REC_NEXT(test, next)  CATCH_REC_NEXT1(CATCH_REC_GET_END test, next)
+
+#define CATCH_REC_LIST0(f, x, peek, ...) , f(x) CATCH_DEFER ( CATCH_REC_NEXT(peek, CATCH_REC_LIST1) ) ( f, peek, __VA_ARGS__ )
+#define CATCH_REC_LIST1(f, x, peek, ...) , f(x) CATCH_DEFER ( CATCH_REC_NEXT(peek, CATCH_REC_LIST0) ) ( f, peek, __VA_ARGS__ )
+#define CATCH_REC_LIST2(f, x, peek, ...)   f(x) CATCH_DEFER ( CATCH_REC_NEXT(peek, CATCH_REC_LIST1) ) ( f, peek, __VA_ARGS__ )
+
+#define CATCH_REC_LIST0_UD(f, userdata, x, peek, ...) , f(userdata, x) CATCH_DEFER ( CATCH_REC_NEXT(peek, CATCH_REC_LIST1_UD) ) ( f, userdata, peek, __VA_ARGS__ )
+#define CATCH_REC_LIST1_UD(f, userdata, x, peek, ...) , f(userdata, x) CATCH_DEFER ( CATCH_REC_NEXT(peek, CATCH_REC_LIST0_UD) ) ( f, userdata, peek, __VA_ARGS__ )
+#define CATCH_REC_LIST2_UD(f, userdata, x, peek, ...)   f(userdata, x) CATCH_DEFER ( CATCH_REC_NEXT(peek, CATCH_REC_LIST1_UD) ) ( f, userdata, peek, __VA_ARGS__ )
+
+// Applies the function macro `f` to each of the remaining parameters, inserts commas between the results,
+// and passes userdata as the first parameter to each invocation,
+// e.g. CATCH_REC_LIST_UD(f, x, a, b, c) evaluates to f(x, a), f(x, b), f(x, c)
+#define CATCH_REC_LIST_UD(f, userdata, ...) CATCH_RECURSE(CATCH_REC_LIST2_UD(f, userdata, __VA_ARGS__, ()()(), ()()(), ()()(), 0))
+
+#define CATCH_REC_LIST(f, ...) CATCH_RECURSE(CATCH_REC_LIST2(f, __VA_ARGS__, ()()(), ()()(), ()()(), 0))
+
+// Building blocks of INTERNAL_CATCH_REMOVE_PARENS.
+// EXPAND1 exists only so that its argument is macro-expanded before EXPAND2 pastes
+// `INTERNAL_CATCH_NO` in front of it (operands of ## are not expanded on their own).
+#define INTERNAL_CATCH_EXPAND1(param) INTERNAL_CATCH_EXPAND2(param)
+#define INTERNAL_CATCH_EXPAND2(...) INTERNAL_CATCH_NO## __VA_ARGS__
+// When followed by a parenthesised list, re-emits its own name followed by the list contents
+// without the parentheses; when not followed by `(`, the name is left untouched.
+#define INTERNAL_CATCH_DEF(...) INTERNAL_CATCH_DEF __VA_ARGS__
+// Result of pasting `INTERNAL_CATCH_NO` onto `INTERNAL_CATCH_DEF`; expands to nothing so the
+// marker disappears and only the payload remains.
+#define INTERNAL_CATCH_NOINTERNAL_CATCH_DEF
+// Stringizes the arguments after they have been macro-expanded (hence the extra level).
+#define INTERNAL_CATCH_STRINGIZE(...) INTERNAL_CATCH_STRINGIZE2(__VA_ARGS__)
+#ifndef CATCH_CONFIG_TRADITIONAL_MSVC_PREPROCESSOR
+#define INTERNAL_CATCH_STRINGIZE2(...) #__VA_ARGS__
+// Strips one optional pair of parentheses from `param`, then turns the result into a string literal.
+#define INTERNAL_CATCH_STRINGIZE_WITHOUT_PARENS(param) INTERNAL_CATCH_STRINGIZE(INTERNAL_CATCH_REMOVE_PARENS(param))
+#else
+// The traditional MSVC preprocessor adds an extra space and needs another level of indirection
+// to expand INTERNAL_CATCH_NOINTERNAL_CATCH_DEF; the `+ 1` below steps past the first character
+// of the resulting string literal.
+#define INTERNAL_CATCH_STRINGIZE2(...) INTERNAL_CATCH_STRINGIZE3(__VA_ARGS__)
+#define INTERNAL_CATCH_STRINGIZE3(...) #__VA_ARGS__
+#define INTERNAL_CATCH_STRINGIZE_WITHOUT_PARENS(param) (INTERNAL_CATCH_STRINGIZE(INTERNAL_CATCH_REMOVE_PARENS(param)) + 1)
+#endif
+
+// Builds the identifier `ns_<name>`. The outer macro forces `name` to be macro-expanded
+// before the inner one pastes the `ns_` prefix onto it.
+#define INTERNAL_CATCH_MAKE_NAMESPACE2(...) ns_##__VA_ARGS__
+#define INTERNAL_CATCH_MAKE_NAMESPACE(name) INTERNAL_CATCH_MAKE_NAMESPACE2(name)
+
+// Removes one optional pair of enclosing parentheses: `(a, b)` becomes `a, b`, while `a` stays `a`.
+// `INTERNAL_CATCH_DEF (a, b)` expands to `INTERNAL_CATCH_DEF a, b`; pasting `INTERNAL_CATCH_NO`
+// in front then yields `INTERNAL_CATCH_NOINTERNAL_CATCH_DEF a, b`, whose first token expands to nothing.
+#define INTERNAL_CATCH_REMOVE_PARENS(...) INTERNAL_CATCH_EXPAND1(INTERNAL_CATCH_DEF __VA_ARGS__)
+
+// INTERNAL_CATCH_MAKE_TYPE_LIST turns one (optionally parenthesised) group of template arguments
+// into the wrapper type returned by `get_wrapper<...>()`, i.e. `decltype(get_wrapper<args...>())`.
+// Parentheses are stripped once for the whole group and once more per element
+// (INTERNAL_CATCH_REMOVE_PARENS_GEN). The traditional MSVC preprocessor variant performs the same
+// expansion wrapped in INTERNAL_CATCH_EXPAND_VARGS.
+#ifndef CATCH_CONFIG_TRADITIONAL_MSVC_PREPROCESSOR
+#define INTERNAL_CATCH_MAKE_TYPE_LIST2(...) decltype(get_wrapper<INTERNAL_CATCH_REMOVE_PARENS_GEN(__VA_ARGS__)>())
+#define INTERNAL_CATCH_MAKE_TYPE_LIST(...) INTERNAL_CATCH_MAKE_TYPE_LIST2(INTERNAL_CATCH_REMOVE_PARENS(__VA_ARGS__))
+#else
+#define INTERNAL_CATCH_MAKE_TYPE_LIST2(...) INTERNAL_CATCH_EXPAND_VARGS(decltype(get_wrapper<INTERNAL_CATCH_REMOVE_PARENS_GEN(__VA_ARGS__)>()))
+#define INTERNAL_CATCH_MAKE_TYPE_LIST(...) INTERNAL_CATCH_EXPAND_VARGS(INTERNAL_CATCH_MAKE_TYPE_LIST2(INTERNAL_CATCH_REMOVE_PARENS(__VA_ARGS__)))
+#endif
+
+// Applies INTERNAL_CATCH_MAKE_TYPE_LIST to every argument and joins the results with commas.
+#define INTERNAL_CATCH_MAKE_TYPE_LISTS_FROM_TYPES(...)\
+    CATCH_REC_LIST(INTERNAL_CATCH_MAKE_TYPE_LIST,__VA_ARGS__)
+
+// INTERNAL_CATCH_REMOVE_PARENS_<N>_ARG applies INTERNAL_CATCH_REMOVE_PARENS to each of its N
+// arguments and joins the results with commas. Every arity strips the first argument itself and
+// delegates the remaining ones to the next smaller arity; arities 1 through 11 are provided.
+#define INTERNAL_CATCH_REMOVE_PARENS_1_ARG(_0) INTERNAL_CATCH_REMOVE_PARENS(_0)
+#define INTERNAL_CATCH_REMOVE_PARENS_2_ARG(_0, _1) INTERNAL_CATCH_REMOVE_PARENS(_0), INTERNAL_CATCH_REMOVE_PARENS_1_ARG(_1)
+#define INTERNAL_CATCH_REMOVE_PARENS_3_ARG(_0, _1, _2) INTERNAL_CATCH_REMOVE_PARENS(_0), INTERNAL_CATCH_REMOVE_PARENS_2_ARG(_1, _2)
+#define INTERNAL_CATCH_REMOVE_PARENS_4_ARG(_0, _1, _2, _3) INTERNAL_CATCH_REMOVE_PARENS(_0), INTERNAL_CATCH_REMOVE_PARENS_3_ARG(_1, _2, _3)
+#define INTERNAL_CATCH_REMOVE_PARENS_5_ARG(_0, _1, _2, _3, _4) INTERNAL_CATCH_REMOVE_PARENS(_0), INTERNAL_CATCH_REMOVE_PARENS_4_ARG(_1, _2, _3, _4)
+#define INTERNAL_CATCH_REMOVE_PARENS_6_ARG(_0, _1, _2, _3, _4, _5) INTERNAL_CATCH_REMOVE_PARENS(_0), INTERNAL_CATCH_REMOVE_PARENS_5_ARG(_1, _2, _3, _4, _5)
+#define INTERNAL_CATCH_REMOVE_PARENS_7_ARG(_0, _1, _2, _3, _4, _5, _6) INTERNAL_CATCH_REMOVE_PARENS(_0), INTERNAL_CATCH_REMOVE_PARENS_6_ARG(_1, _2, _3, _4, _5, _6)
+#define INTERNAL_CATCH_REMOVE_PARENS_8_ARG(_0, _1, _2, _3, _4, _5, _6, _7) INTERNAL_CATCH_REMOVE_PARENS(_0), INTERNAL_CATCH_REMOVE_PARENS_7_ARG(_1, _2, _3, _4, _5, _6, _7)
+#define INTERNAL_CATCH_REMOVE_PARENS_9_ARG(_0, _1, _2, _3, _4, _5, _6, _7, _8) INTERNAL_CATCH_REMOVE_PARENS(_0), INTERNAL_CATCH_REMOVE_PARENS_8_ARG(_1, _2, _3, _4, _5, _6, _7, _8)
+#define INTERNAL_CATCH_REMOVE_PARENS_10_ARG(_0, _1, _2, _3, _4, _5, _6, _7, _8, _9) INTERNAL_CATCH_REMOVE_PARENS(_0), INTERNAL_CATCH_REMOVE_PARENS_9_ARG(_1, _2, _3, _4, _5, _6, _7, _8, _9)
+#define INTERNAL_CATCH_REMOVE_PARENS_11_ARG(_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10) INTERNAL_CATCH_REMOVE_PARENS(_0), INTERNAL_CATCH_REMOVE_PARENS_10_ARG(_1, _2, _3, _4, _5, _6, _7, _8, _9, _10)
+
+// Expands to its 12th argument and discards everything else. Callers place their own variadic
+// arguments first and a fixed list of candidates after them, so the number of leading arguments
+// determines which candidate lands in slot `N`.
+#define INTERNAL_CATCH_VA_NARGS_IMPL(_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, N, ...) N
+
+// Emits the type-list machinery used by the templated test-case macros into the current scope:
+//  - TypeList<Ts...> / TemplateTypeList<Cs...>: empty tag types wrapping a pack of types or of
+//    class templates, plus one `get_wrapper()` overload for each that returns the matching tag.
+//  - append<...>: a single list yields itself; two lists are merged by appending the second list's
+//    elements to the first and recursing over the rest; a second list of `TypeList<mpl_::na>`
+//    ends the merge and yields the first list unchanged.
+//  - rewrap<TemplateTypeList<Container>, Lists...>: instantiates `Container` with the elements of
+//    each list and collects the results in a TypeList.
+//  - create<Final, TemplateTypeList<Containers...>, TypeList<Types...>>: rewraps `Types...` with
+//    every container and appends all results into `Final<>`.
+//  - convert<Final, List<Ts...>>: wraps every `Ts` in its own TypeList and appends them into `Final<>`.
+// No comments can be placed inside the body because every line ends in a continuation backslash.
+#define INTERNAL_CATCH_TYPE_GEN\
+    template<typename...> struct TypeList {};\
+    template<typename...Ts>\
+    constexpr auto get_wrapper() noexcept -> TypeList<Ts...> { return {}; }\
+    template<template<typename...> class...> struct TemplateTypeList{};\
+    template<template<typename...> class...Cs>\
+    constexpr auto get_wrapper() noexcept -> TemplateTypeList<Cs...> { return {}; }\
+    template<typename...>\
+    struct append;\
+    template<typename...>\
+    struct rewrap;\
+    template<template<typename...> class, typename...>\
+    struct create;\
+    template<template<typename...> class, typename>\
+    struct convert;\
+    \
+    template<typename T> \
+    struct append<T> { using type = T; };\
+    template< template<typename...> class L1, typename...E1, template<typename...> class L2, typename...E2, typename...Rest>\
+    struct append<L1<E1...>, L2<E2...>, Rest...> { using type = typename append<L1<E1...,E2...>, Rest...>::type; };\
+    template< template<typename...> class L1, typename...E1, typename...Rest>\
+    struct append<L1<E1...>, TypeList<mpl_::na>, Rest...> { using type = L1<E1...>; };\
+    \
+    template< template<typename...> class Container, template<typename...> class List, typename...elems>\
+    struct rewrap<TemplateTypeList<Container>, List<elems...>> { using type = TypeList<Container<elems...>>; };\
+    template< template<typename...> class Container, template<typename...> class List, class...Elems, typename...Elements>\
+    struct rewrap<TemplateTypeList<Container>, List<Elems...>, Elements...> { using type = typename append<TypeList<Container<Elems...>>, typename rewrap<TemplateTypeList<Container>, Elements...>::type>::type; };\
+    \
+    template<template <typename...> class Final, template< typename...> class...Containers, typename...Types>\
+    struct create<Final, TemplateTypeList<Containers...>, TypeList<Types...>> { using type = typename append<Final<>, typename rewrap<TemplateTypeList<Containers>, Types...>::type...>::type; };\
+    template<template <typename...> class Final, template <typename...> class List, typename...Ts>\
+    struct convert<Final, List<Ts...>> { using type = typename append<Final<>,TypeList<Ts>...>::type; };
+
+// Extends the INTERNAL_CATCH_TYPE_GEN machinery for a custom template signature.
+// `signature` is the template parameter list (optionally parenthesised) and `__VA_ARGS__` are the
+// matching parameter names. Emits:
+//  - Nttp<...> and NttpTemplateTypeList<...> tag types with their `get_wrapper()` overloads,
+//  - `rewrap` specialisations that instantiate a container with the arguments held by each list,
+//  - a `create` specialisation that rewraps the given types with every container.
+// It specialises `rewrap` and `create` and uses `append` and `TypeList`, so those names must
+// already be declared in the scope where the macro is expanded.
+#define INTERNAL_CATCH_NTTP_1(signature, ...)\
+    template<INTERNAL_CATCH_REMOVE_PARENS(signature)> struct Nttp{};\
+    template<INTERNAL_CATCH_REMOVE_PARENS(signature)>\
+    constexpr auto get_wrapper() noexcept -> Nttp<__VA_ARGS__> { return {}; } \
+    template<template<INTERNAL_CATCH_REMOVE_PARENS(signature)> class...> struct NttpTemplateTypeList{};\
+    template<template<INTERNAL_CATCH_REMOVE_PARENS(signature)> class...Cs>\
+    constexpr auto get_wrapper() noexcept -> NttpTemplateTypeList<Cs...> { return {}; } \
+    \
+    template< template<INTERNAL_CATCH_REMOVE_PARENS(signature)> class Container, template<INTERNAL_CATCH_REMOVE_PARENS(signature)> class List, INTERNAL_CATCH_REMOVE_PARENS(signature)>\
+    struct rewrap<NttpTemplateTypeList<Container>, List<__VA_ARGS__>> { using type = TypeList<Container<__VA_ARGS__>>; };\
+    template< template<INTERNAL_CATCH_REMOVE_PARENS(signature)> class Container, template<INTERNAL_CATCH_REMOVE_PARENS(signature)> class List, INTERNAL_CATCH_REMOVE_PARENS(signature), typename...Elements>\
+    struct rewrap<NttpTemplateTypeList<Container>, List<__VA_ARGS__>, Elements...> { using type = typename append<TypeList<Container<__VA_ARGS__>>, typename rewrap<NttpTemplateTypeList<Container>, Elements...>::type>::type; };\
+    template<template <typename...> class Final, template<INTERNAL_CATCH_REMOVE_PARENS(signature)> class...Containers, typename...Types>\
+    struct create<Final, NttpTemplateTypeList<Containers...>, TypeList<Types...>> { using type = typename append<Final<>, typename rewrap<NttpTemplateTypeList<Containers>, Types...>::type...>::type; };
+
+// Header of a templated free test function: `template<signature> static void TestName()`.
+// The ...0 variant expands to nothing. The ...1 and ..._X variants expand to the same text;
+// ..._X merely accepts (and ignores) the extra parameter names that follow the signature.
+#define INTERNAL_CATCH_DECLARE_SIG_TEST0(TestName)
+#define INTERNAL_CATCH_DECLARE_SIG_TEST1(TestName, signature)\
+    template<INTERNAL_CATCH_REMOVE_PARENS(signature)>\
+    static void TestName()
+#define INTERNAL_CATCH_DECLARE_SIG_TEST_X(TestName, signature, ...)\
+    template<INTERNAL_CATCH_REMOVE_PARENS(signature)>\
+    static void TestName()
+
+// Definition counterparts of the macros above. They expand to exactly the same function header;
+// the difference lies in what the caller appends (a `;` for a declaration, a body for a definition).
+#define INTERNAL_CATCH_DEFINE_SIG_TEST0(TestName)
+#define INTERNAL_CATCH_DEFINE_SIG_TEST1(TestName, signature)\
+    template<INTERNAL_CATCH_REMOVE_PARENS(signature)>\
+    static void TestName()
+#define INTERNAL_CATCH_DEFINE_SIG_TEST_X(TestName, signature,...)\
+    template<INTERNAL_CATCH_REMOVE_PARENS(signature)>\
+    static void TestName()
+
+// Defines `reg_test(TypeList<Type>, nameAndTags)`, which registers `TestFunc<Type>` by constructing
+// a temporary Catch::AutoReg with an empty class name. `signature` is accepted but not used.
+#define INTERNAL_CATCH_NTTP_REGISTER0(TestFunc, signature)\
+    template<typename Type>\
+    void reg_test(TypeList<Type>, Catch::NameAndTags nameAndTags)\
+    {\
+        Catch::AutoReg( Catch::makeTestInvoker(&TestFunc<Type>), CATCH_INTERNAL_LINEINFO, Catch::StringRef(), nameAndTags);\
+    }
+
+// Same registration helper for a custom template signature: the overload takes `Nttp<args...>`
+// and registers `TestFunc<args...>`, where `args...` are the parameter names in `__VA_ARGS__`.
+#define INTERNAL_CATCH_NTTP_REGISTER(TestFunc, signature, ...)\
+    template<INTERNAL_CATCH_REMOVE_PARENS(signature)>\
+    void reg_test(Nttp<__VA_ARGS__>, Catch::NameAndTags nameAndTags)\
+    {\
+        Catch::AutoReg( Catch::makeTestInvoker(&TestFunc<__VA_ARGS__>), CATCH_INTERNAL_LINEINFO, Catch::StringRef(), nameAndTags);\
+    }
+
+// Defines `reg_test(TypeList<Type>, className, nameAndTags)`, which registers the member function
+// `TestName<Type>::test` by constructing a temporary Catch::AutoReg. `signature` and any further
+// arguments are accepted but not used.
+#define INTERNAL_CATCH_NTTP_REGISTER_METHOD0(TestName, signature, ...)\
+    template<typename Type>\
+    void reg_test(TypeList<Type>, Catch::StringRef className, Catch::NameAndTags nameAndTags)\
+    {\
+        Catch::AutoReg( Catch::makeTestInvoker(&TestName<Type>::test), CATCH_INTERNAL_LINEINFO, className, nameAndTags);\
+    }
+
+// Same helper for a custom template signature: the overload takes `Nttp<args...>` and registers
+// `TestName<args...>::test`, where `args...` are the parameter names in `__VA_ARGS__`.
+#define INTERNAL_CATCH_NTTP_REGISTER_METHOD(TestName, signature, ...)\
+    template<INTERNAL_CATCH_REMOVE_PARENS(signature)>\
+    void reg_test(Nttp<__VA_ARGS__>, Catch::StringRef className, Catch::NameAndTags nameAndTags)\
+    {\
+        Catch::AutoReg( Catch::makeTestInvoker(&TestName<__VA_ARGS__>::test), CATCH_INTERNAL_LINEINFO, className, nameAndTags);\
+    }
+
+// Declares the fixture-derived class template that hosts a templated test method.
+// The ...0 variant expands to nothing. The ...1 variant always uses a single type parameter named
+// `TestType` and derives from `ClassName<TestType>`; its `signature` argument is not used.
+// The caller supplies the terminating `;`.
+#define INTERNAL_CATCH_DECLARE_SIG_TEST_METHOD0(TestName, ClassName)
+#define INTERNAL_CATCH_DECLARE_SIG_TEST_METHOD1(TestName, ClassName, signature)\
+    template<typename TestType> \
+    struct TestName : INTERNAL_CATCH_REMOVE_PARENS(ClassName)<TestType> { \
+        void test();\
+    }
+
+// Variant for a custom template signature: the class template is parameterised by `signature`
+// and derives from `ClassName<args...>`, where `args...` are the parameter names in `__VA_ARGS__`.
+#define INTERNAL_CATCH_DECLARE_SIG_TEST_METHOD_X(TestName, ClassName, signature, ...)\
+    template<INTERNAL_CATCH_REMOVE_PARENS(signature)> \
+    struct TestName : INTERNAL_CATCH_REMOVE_PARENS(ClassName)<__VA_ARGS__> { \
+        void test();\
+    }
+
+// Header of the out-of-class definition of `test()` for the class templates declared by
+// INTERNAL_CATCH_DECLARE_SIG_TEST_METHOD*. The class is looked up inside the `ns_<TestName>`
+// namespace produced by INTERNAL_CATCH_MAKE_NAMESPACE. The ...0 variant expands to nothing and
+// the ...1 variant ignores `signature`, using a single `TestType` parameter instead.
+#define INTERNAL_CATCH_DEFINE_SIG_TEST_METHOD0(TestName)
+#define INTERNAL_CATCH_DEFINE_SIG_TEST_METHOD1(TestName, signature)\
+    template<typename TestType> \
+    void INTERNAL_CATCH_MAKE_NAMESPACE(TestName)::TestName<TestType>::test()
+#define INTERNAL_CATCH_DEFINE_SIG_TEST_METHOD_X(TestName, signature, ...)\
+    template<INTERNAL_CATCH_REMOVE_PARENS(signature)> \
+    void INTERNAL_CATCH_MAKE_NAMESPACE(TestName)::TestName<__VA_ARGS__>::test()
+
+// Arity dispatchers.
+// Each macro passes its variadic arguments, followed by a fixed list of eleven candidates, to
+// INTERNAL_CATCH_VA_NARGS_IMPL, which yields its 12th argument. The more arguments the caller
+// supplies, the earlier in the candidate list the selection lands.
+//  - Macros that prepend "dummy": one signature argument selects the tenth candidate (the ...1 /
+//    ...REGISTER0 variant); two to ten arguments select one of the first nine candidates (the
+//    ..._X / ...REGISTER variant). The selected macro is then invoked with the original arguments.
+//  - INTERNAL_CATCH_NTTP_GEN: one argument selects INTERNAL_CATCH_NTTP_0 (expands to nothing),
+//    two or more select INTERNAL_CATCH_NTTP_1.
+//  - INTERNAL_CATCH_REMOVE_PARENS_GEN: k arguments (1..11) select INTERNAL_CATCH_REMOVE_PARENS_<k>_ARG.
+// The traditional MSVC preprocessor branch performs the same dispatch wrapped in
+// INTERNAL_CATCH_EXPAND_VARGS and always invokes the selected name with `(__VA_ARGS__)`.
+// Every candidate of INTERNAL_CATCH_DECLARE_SIG_TEST must be a DECLARE variant; the seventh
+// candidate (reached with four signature arguments) used to name INTERNAL_CATCH_DEFINE_SIG_TEST_X.
+#ifndef CATCH_CONFIG_TRADITIONAL_MSVC_PREPROCESSOR
+#define INTERNAL_CATCH_NTTP_0
+#define INTERNAL_CATCH_NTTP_GEN(...) INTERNAL_CATCH_VA_NARGS_IMPL(__VA_ARGS__, INTERNAL_CATCH_NTTP_1(__VA_ARGS__), INTERNAL_CATCH_NTTP_1(__VA_ARGS__), INTERNAL_CATCH_NTTP_1(__VA_ARGS__), INTERNAL_CATCH_NTTP_1(__VA_ARGS__), INTERNAL_CATCH_NTTP_1(__VA_ARGS__), INTERNAL_CATCH_NTTP_1( __VA_ARGS__), INTERNAL_CATCH_NTTP_1( __VA_ARGS__), INTERNAL_CATCH_NTTP_1( __VA_ARGS__), INTERNAL_CATCH_NTTP_1( __VA_ARGS__),INTERNAL_CATCH_NTTP_1( __VA_ARGS__), INTERNAL_CATCH_NTTP_0)
+#define INTERNAL_CATCH_DEFINE_SIG_TEST_METHOD(TestName, ...) INTERNAL_CATCH_VA_NARGS_IMPL( "dummy", __VA_ARGS__, INTERNAL_CATCH_DEFINE_SIG_TEST_METHOD_X,INTERNAL_CATCH_DEFINE_SIG_TEST_METHOD_X, INTERNAL_CATCH_DEFINE_SIG_TEST_METHOD_X, INTERNAL_CATCH_DEFINE_SIG_TEST_METHOD_X, INTERNAL_CATCH_DEFINE_SIG_TEST_METHOD_X, INTERNAL_CATCH_DEFINE_SIG_TEST_METHOD_X, INTERNAL_CATCH_DEFINE_SIG_TEST_METHOD_X,INTERNAL_CATCH_DEFINE_SIG_TEST_METHOD_X,INTERNAL_CATCH_DEFINE_SIG_TEST_METHOD_X, INTERNAL_CATCH_DEFINE_SIG_TEST_METHOD1, INTERNAL_CATCH_DEFINE_SIG_TEST_METHOD0)(TestName, __VA_ARGS__)
+#define INTERNAL_CATCH_DECLARE_SIG_TEST_METHOD(TestName, ClassName, ...) INTERNAL_CATCH_VA_NARGS_IMPL( "dummy", __VA_ARGS__, INTERNAL_CATCH_DECLARE_SIG_TEST_METHOD_X,INTERNAL_CATCH_DECLARE_SIG_TEST_METHOD_X, INTERNAL_CATCH_DECLARE_SIG_TEST_METHOD_X, INTERNAL_CATCH_DECLARE_SIG_TEST_METHOD_X, INTERNAL_CATCH_DECLARE_SIG_TEST_METHOD_X, INTERNAL_CATCH_DECLARE_SIG_TEST_METHOD_X, INTERNAL_CATCH_DECLARE_SIG_TEST_METHOD_X,INTERNAL_CATCH_DECLARE_SIG_TEST_METHOD_X,INTERNAL_CATCH_DECLARE_SIG_TEST_METHOD_X, INTERNAL_CATCH_DECLARE_SIG_TEST_METHOD1, INTERNAL_CATCH_DECLARE_SIG_TEST_METHOD0)(TestName, ClassName, __VA_ARGS__)
+#define INTERNAL_CATCH_NTTP_REG_METHOD_GEN(TestName, ...) INTERNAL_CATCH_VA_NARGS_IMPL( "dummy", __VA_ARGS__, INTERNAL_CATCH_NTTP_REGISTER_METHOD, INTERNAL_CATCH_NTTP_REGISTER_METHOD, INTERNAL_CATCH_NTTP_REGISTER_METHOD, INTERNAL_CATCH_NTTP_REGISTER_METHOD, INTERNAL_CATCH_NTTP_REGISTER_METHOD, INTERNAL_CATCH_NTTP_REGISTER_METHOD, INTERNAL_CATCH_NTTP_REGISTER_METHOD, INTERNAL_CATCH_NTTP_REGISTER_METHOD, INTERNAL_CATCH_NTTP_REGISTER_METHOD, INTERNAL_CATCH_NTTP_REGISTER_METHOD0, INTERNAL_CATCH_NTTP_REGISTER_METHOD0)(TestName, __VA_ARGS__)
+#define INTERNAL_CATCH_NTTP_REG_GEN(TestFunc, ...) INTERNAL_CATCH_VA_NARGS_IMPL( "dummy", __VA_ARGS__, INTERNAL_CATCH_NTTP_REGISTER, INTERNAL_CATCH_NTTP_REGISTER, INTERNAL_CATCH_NTTP_REGISTER, INTERNAL_CATCH_NTTP_REGISTER, INTERNAL_CATCH_NTTP_REGISTER, INTERNAL_CATCH_NTTP_REGISTER, INTERNAL_CATCH_NTTP_REGISTER, INTERNAL_CATCH_NTTP_REGISTER, INTERNAL_CATCH_NTTP_REGISTER, INTERNAL_CATCH_NTTP_REGISTER0, INTERNAL_CATCH_NTTP_REGISTER0)(TestFunc, __VA_ARGS__)
+#define INTERNAL_CATCH_DEFINE_SIG_TEST(TestName, ...) INTERNAL_CATCH_VA_NARGS_IMPL( "dummy", __VA_ARGS__, INTERNAL_CATCH_DEFINE_SIG_TEST_X, INTERNAL_CATCH_DEFINE_SIG_TEST_X, INTERNAL_CATCH_DEFINE_SIG_TEST_X, INTERNAL_CATCH_DEFINE_SIG_TEST_X, INTERNAL_CATCH_DEFINE_SIG_TEST_X, INTERNAL_CATCH_DEFINE_SIG_TEST_X, INTERNAL_CATCH_DEFINE_SIG_TEST_X, INTERNAL_CATCH_DEFINE_SIG_TEST_X,INTERNAL_CATCH_DEFINE_SIG_TEST_X,INTERNAL_CATCH_DEFINE_SIG_TEST1, INTERNAL_CATCH_DEFINE_SIG_TEST0)(TestName, __VA_ARGS__)
+#define INTERNAL_CATCH_DECLARE_SIG_TEST(TestName, ...) INTERNAL_CATCH_VA_NARGS_IMPL( "dummy", __VA_ARGS__, INTERNAL_CATCH_DECLARE_SIG_TEST_X,INTERNAL_CATCH_DECLARE_SIG_TEST_X, INTERNAL_CATCH_DECLARE_SIG_TEST_X, INTERNAL_CATCH_DECLARE_SIG_TEST_X, INTERNAL_CATCH_DECLARE_SIG_TEST_X, INTERNAL_CATCH_DECLARE_SIG_TEST_X, INTERNAL_CATCH_DECLARE_SIG_TEST_X,INTERNAL_CATCH_DECLARE_SIG_TEST_X,INTERNAL_CATCH_DECLARE_SIG_TEST_X, INTERNAL_CATCH_DECLARE_SIG_TEST1, INTERNAL_CATCH_DECLARE_SIG_TEST0)(TestName, __VA_ARGS__)
+#define INTERNAL_CATCH_REMOVE_PARENS_GEN(...) INTERNAL_CATCH_VA_NARGS_IMPL(__VA_ARGS__, INTERNAL_CATCH_REMOVE_PARENS_11_ARG,INTERNAL_CATCH_REMOVE_PARENS_10_ARG,INTERNAL_CATCH_REMOVE_PARENS_9_ARG,INTERNAL_CATCH_REMOVE_PARENS_8_ARG,INTERNAL_CATCH_REMOVE_PARENS_7_ARG,INTERNAL_CATCH_REMOVE_PARENS_6_ARG,INTERNAL_CATCH_REMOVE_PARENS_5_ARG,INTERNAL_CATCH_REMOVE_PARENS_4_ARG,INTERNAL_CATCH_REMOVE_PARENS_3_ARG,INTERNAL_CATCH_REMOVE_PARENS_2_ARG,INTERNAL_CATCH_REMOVE_PARENS_1_ARG)(__VA_ARGS__)
+#else
+#define INTERNAL_CATCH_NTTP_0(signature)
+#define INTERNAL_CATCH_NTTP_GEN(...) INTERNAL_CATCH_EXPAND_VARGS(INTERNAL_CATCH_VA_NARGS_IMPL(__VA_ARGS__, INTERNAL_CATCH_NTTP_1, INTERNAL_CATCH_NTTP_1, INTERNAL_CATCH_NTTP_1, INTERNAL_CATCH_NTTP_1, INTERNAL_CATCH_NTTP_1, INTERNAL_CATCH_NTTP_1, INTERNAL_CATCH_NTTP_1, INTERNAL_CATCH_NTTP_1, INTERNAL_CATCH_NTTP_1,INTERNAL_CATCH_NTTP_1, INTERNAL_CATCH_NTTP_0)( __VA_ARGS__))
+#define INTERNAL_CATCH_DEFINE_SIG_TEST_METHOD(TestName, ...) INTERNAL_CATCH_EXPAND_VARGS(INTERNAL_CATCH_VA_NARGS_IMPL( "dummy", __VA_ARGS__, INTERNAL_CATCH_DEFINE_SIG_TEST_METHOD_X,INTERNAL_CATCH_DEFINE_SIG_TEST_METHOD_X, INTERNAL_CATCH_DEFINE_SIG_TEST_METHOD_X, INTERNAL_CATCH_DEFINE_SIG_TEST_METHOD_X, INTERNAL_CATCH_DEFINE_SIG_TEST_METHOD_X, INTERNAL_CATCH_DEFINE_SIG_TEST_METHOD_X, INTERNAL_CATCH_DEFINE_SIG_TEST_METHOD_X,INTERNAL_CATCH_DEFINE_SIG_TEST_METHOD_X,INTERNAL_CATCH_DEFINE_SIG_TEST_METHOD_X, INTERNAL_CATCH_DEFINE_SIG_TEST_METHOD1, INTERNAL_CATCH_DEFINE_SIG_TEST_METHOD0)(TestName, __VA_ARGS__))
+#define INTERNAL_CATCH_DECLARE_SIG_TEST_METHOD(TestName, ClassName, ...) INTERNAL_CATCH_EXPAND_VARGS(INTERNAL_CATCH_VA_NARGS_IMPL( "dummy", __VA_ARGS__, INTERNAL_CATCH_DECLARE_SIG_TEST_METHOD_X,INTERNAL_CATCH_DECLARE_SIG_TEST_METHOD_X, INTERNAL_CATCH_DECLARE_SIG_TEST_METHOD_X, INTERNAL_CATCH_DECLARE_SIG_TEST_METHOD_X, INTERNAL_CATCH_DECLARE_SIG_TEST_METHOD_X, INTERNAL_CATCH_DECLARE_SIG_TEST_METHOD_X, INTERNAL_CATCH_DECLARE_SIG_TEST_METHOD_X,INTERNAL_CATCH_DECLARE_SIG_TEST_METHOD_X,INTERNAL_CATCH_DECLARE_SIG_TEST_METHOD_X, INTERNAL_CATCH_DECLARE_SIG_TEST_METHOD1, INTERNAL_CATCH_DECLARE_SIG_TEST_METHOD0)(TestName, ClassName, __VA_ARGS__))
+#define INTERNAL_CATCH_NTTP_REG_METHOD_GEN(TestName, ...) INTERNAL_CATCH_EXPAND_VARGS(INTERNAL_CATCH_VA_NARGS_IMPL( "dummy", __VA_ARGS__, INTERNAL_CATCH_NTTP_REGISTER_METHOD, INTERNAL_CATCH_NTTP_REGISTER_METHOD, INTERNAL_CATCH_NTTP_REGISTER_METHOD, INTERNAL_CATCH_NTTP_REGISTER_METHOD, INTERNAL_CATCH_NTTP_REGISTER_METHOD, INTERNAL_CATCH_NTTP_REGISTER_METHOD, INTERNAL_CATCH_NTTP_REGISTER_METHOD, INTERNAL_CATCH_NTTP_REGISTER_METHOD, INTERNAL_CATCH_NTTP_REGISTER_METHOD, INTERNAL_CATCH_NTTP_REGISTER_METHOD0, INTERNAL_CATCH_NTTP_REGISTER_METHOD0)(TestName, __VA_ARGS__))
+#define INTERNAL_CATCH_NTTP_REG_GEN(TestFunc, ...) INTERNAL_CATCH_EXPAND_VARGS(INTERNAL_CATCH_VA_NARGS_IMPL( "dummy", __VA_ARGS__, INTERNAL_CATCH_NTTP_REGISTER, INTERNAL_CATCH_NTTP_REGISTER, INTERNAL_CATCH_NTTP_REGISTER, INTERNAL_CATCH_NTTP_REGISTER, INTERNAL_CATCH_NTTP_REGISTER, INTERNAL_CATCH_NTTP_REGISTER, INTERNAL_CATCH_NTTP_REGISTER, INTERNAL_CATCH_NTTP_REGISTER, INTERNAL_CATCH_NTTP_REGISTER, INTERNAL_CATCH_NTTP_REGISTER0, INTERNAL_CATCH_NTTP_REGISTER0)(TestFunc, __VA_ARGS__))
+#define INTERNAL_CATCH_DEFINE_SIG_TEST(TestName, ...) INTERNAL_CATCH_EXPAND_VARGS(INTERNAL_CATCH_VA_NARGS_IMPL( "dummy", __VA_ARGS__, INTERNAL_CATCH_DEFINE_SIG_TEST_X, INTERNAL_CATCH_DEFINE_SIG_TEST_X, INTERNAL_CATCH_DEFINE_SIG_TEST_X, INTERNAL_CATCH_DEFINE_SIG_TEST_X, INTERNAL_CATCH_DEFINE_SIG_TEST_X, INTERNAL_CATCH_DEFINE_SIG_TEST_X, INTERNAL_CATCH_DEFINE_SIG_TEST_X, INTERNAL_CATCH_DEFINE_SIG_TEST_X,INTERNAL_CATCH_DEFINE_SIG_TEST_X,INTERNAL_CATCH_DEFINE_SIG_TEST1, INTERNAL_CATCH_DEFINE_SIG_TEST0)(TestName, __VA_ARGS__))
+#define INTERNAL_CATCH_DECLARE_SIG_TEST(TestName, ...) INTERNAL_CATCH_EXPAND_VARGS(INTERNAL_CATCH_VA_NARGS_IMPL( "dummy", __VA_ARGS__, INTERNAL_CATCH_DECLARE_SIG_TEST_X,INTERNAL_CATCH_DECLARE_SIG_TEST_X, INTERNAL_CATCH_DECLARE_SIG_TEST_X, INTERNAL_CATCH_DECLARE_SIG_TEST_X, INTERNAL_CATCH_DECLARE_SIG_TEST_X, INTERNAL_CATCH_DECLARE_SIG_TEST_X, INTERNAL_CATCH_DECLARE_SIG_TEST_X,INTERNAL_CATCH_DECLARE_SIG_TEST_X,INTERNAL_CATCH_DECLARE_SIG_TEST_X, INTERNAL_CATCH_DECLARE_SIG_TEST1, INTERNAL_CATCH_DECLARE_SIG_TEST0)(TestName, __VA_ARGS__))
+#define INTERNAL_CATCH_REMOVE_PARENS_GEN(...) INTERNAL_CATCH_EXPAND_VARGS(INTERNAL_CATCH_VA_NARGS_IMPL(__VA_ARGS__, INTERNAL_CATCH_REMOVE_PARENS_11_ARG,INTERNAL_CATCH_REMOVE_PARENS_10_ARG,INTERNAL_CATCH_REMOVE_PARENS_9_ARG,INTERNAL_CATCH_REMOVE_PARENS_8_ARG,INTERNAL_CATCH_REMOVE_PARENS_7_ARG,INTERNAL_CATCH_REMOVE_PARENS_6_ARG,INTERNAL_CATCH_REMOVE_PARENS_5_ARG,INTERNAL_CATCH_REMOVE_PARENS_4_ARG,INTERNAL_CATCH_REMOVE_PARENS_3_ARG,INTERNAL_CATCH_REMOVE_PARENS_2_ARG,INTERNAL_CATCH_REMOVE_PARENS_1_ARG)(__VA_ARGS__))
+#endif
+
+// end catch_preprocessor.hpp
+// start catch_meta.hpp
+
+
+#include <type_traits>
+
+namespace Catch {
+    // Always derives from std::false_type, but only once a template argument is supplied,
+    // so the value stays dependent on `Dependent`.
+    template<typename Dependent>
+    struct always_false : std::false_type {};
+
+    // Derives from std::true_type for any argument that can be formed at all.
+    template <typename>
+    struct true_given : std::true_type {};
+
+    // Overload pair used to detect whether `Callable` can be invoked with `Params...`.
+    struct is_callable_tester {
+        // Chosen for the argument `0` whenever the call expression is well-formed.
+        template <typename Callable, typename... Params>
+        static auto test(int)
+            -> true_given<decltype(std::declval<Callable>()(std::declval<Params>()...))>;
+
+        // Fallback taken when the overload above drops out of overload resolution.
+        template <typename...>
+        static auto test(...) -> std::false_type;
+    };
+
+    // Primary template; only the function-type form below is defined.
+    template <typename Signature>
+    struct is_callable;
+
+    // is_callable<F(Args...)> derives from std::true_type or std::false_type depending on
+    // which `test` overload is selected for F and Args.
+    template <typename Callable, typename... Params>
+    struct is_callable<Callable(Params...)>
+        : decltype(is_callable_tester::test<Callable, Params...>(0)) {};
+
+#if defined(__cpp_lib_is_invocable) && __cpp_lib_is_invocable >= 201703
+    // std::invoke_result replaces std::result_of, which is deprecated in C++17 and
+    // removed in C++20.
+    template <typename Func, typename... U>
+    using FunctionReturnType =
+        std::remove_reference_t<
+            std::remove_cv_t<
+                std::invoke_result_t<Func, U...>>>;
+#else
+    // Spelled with explicit ::type members because C++11 is still supported.
+    template <typename Func, typename... U>
+    using FunctionReturnType =
+        typename std::remove_reference<
+            typename std::remove_cv<
+                typename std::result_of<Func(U...)>::type
+            >::type
+        >::type;
+#endif
+
+} // namespace Catch
+
+// Forward declaration only: `mpl_::na` is referenced by name (e.g. `TypeList<mpl_::na>` in
+// INTERNAL_CATCH_TYPE_GEN) and never needs to be a complete type here.
+namespace mpl_ {
+    struct na;
+} // namespace mpl_
+
+// end catch_meta.hpp
+namespace Catch {
+
+// Test invoker that runs a test written as a member function of `C`.
+template<typename C>
+class TestInvokerAsMethod : public ITestInvoker {
+public:
+    TestInvokerAsMethod( void (C::*testAsMethod)() ) noexcept
+    :   m_testAsMethod( testAsMethod )
+    {}
+
+    // Default-constructs a fresh `C` for every invocation and calls the stored member on it.
+    void invoke() const override {
+        C fixture;
+        (fixture.*m_testAsMethod)();
+    }
+
+private:
+    void (C::*m_testAsMethod)();
+};
+
+// Overload for free functions; declared here, defined elsewhere.
+ITestInvoker* makeTestInvoker( void(*testAsFunction)() ) noexcept;
+
+// Overload for member functions. Allocation uses the non-throwing form of `new`, so the
+// result is a null pointer if the allocation fails.
+template<typename C>
+ITestInvoker* makeTestInvoker( void (C::*testAsMethod)() ) noexcept {
+    return new(std::nothrow) TestInvokerAsMethod<C>( testAsMethod );
+}
+
+// Name and tag string of a test case; both default to an empty StringRef.
+struct NameAndTags {
+    NameAndTags(
+        StringRef const& name_ = StringRef(),
+        StringRef const& tags_ = StringRef() ) noexcept;
+
+    StringRef name;
+    StringRef tags;
+};
+
+// Non-copyable helper whose constructor receives everything that describes one test case
+// (invoker, source location, class/method name, name and tags).
+struct AutoReg : NonCopyable {
+    AutoReg(
+        ITestInvoker* invoker,
+        SourceLineInfo const& lineInfo,
+        StringRef const& classOrMethod,
+        NameAndTags const& nameAndTags ) noexcept;
+    ~AutoReg();
+};
+
+} // end namespace Catch
+
+#if defined(CATCH_CONFIG_DISABLE)
+    // Variants used when CATCH_CONFIG_DISABLE is defined: they emit only the declarations needed
+    // for the test body that follows to compile and construct no Catch::AutoReg object.
+
+    // Plain test case: just the header of a static function; name/tags arguments are dropped.
+    #define INTERNAL_CATCH_TESTCASE_NO_REGISTRATION( TestName, ... ) \
+        static void TestName()
+    // Fixture test case: declares a struct derived from `ClassName` in an anonymous namespace and
+    // opens the definition of its `test()` member.
+    #define INTERNAL_CATCH_TESTCASE_METHOD_NO_REGISTRATION( TestName, ClassName, ... ) \
+        namespace{                        \
+            struct TestName : INTERNAL_CATCH_REMOVE_PARENS(ClassName) { \
+                void test();              \
+            };                            \
+        }                                 \
+        void TestName::test()
+    // Templated test case: only the templated function header for `TestFunc` is emitted;
+    // TestName, Name, Tags and the type list are unused.
+    #define INTERNAL_CATCH_TEMPLATE_TEST_CASE_NO_REGISTRATION_2( TestName, TestFunc, Name, Tags, Signature, ... )  \
+        INTERNAL_CATCH_DEFINE_SIG_TEST(TestFunc, INTERNAL_CATCH_REMOVE_PARENS(Signature))
+    // Templated fixture test case: declares the class template inside `ns_<TestName>` (nested in an
+    // anonymous namespace) and opens the definition of its `test()` member; TestNameClass, Name,
+    // Tags and the type list are unused.
+    #define INTERNAL_CATCH_TEMPLATE_TEST_CASE_METHOD_NO_REGISTRATION_2( TestNameClass, TestName, ClassName, Name, Tags, Signature, ... )    \
+        namespace{                                                                                  \
+            namespace INTERNAL_CATCH_MAKE_NAMESPACE(TestName) {                                      \
+            INTERNAL_CATCH_DECLARE_SIG_TEST_METHOD(TestName, ClassName, INTERNAL_CATCH_REMOVE_PARENS(Signature));\
+        }                                                                                           \
+        }                                                                                           \
+        INTERNAL_CATCH_DEFINE_SIG_TEST_METHOD(TestName, INTERNAL_CATCH_REMOVE_PARENS(Signature))
+
+    // Front ends for the *_NO_REGISTRATION_2 macros. Each generates the unique identifiers via
+    // INTERNAL_CATCH_UNIQUE_NAME and forwards the user's arguments; the traditional MSVC
+    // preprocessor variants wrap the same call in INTERNAL_CATCH_EXPAND_VARGS.
+
+    // Type-only form: the signature is fixed to `typename TestType`.
+    #ifndef CATCH_CONFIG_TRADITIONAL_MSVC_PREPROCESSOR
+        #define INTERNAL_CATCH_TEMPLATE_TEST_CASE_NO_REGISTRATION(Name, Tags, ...) \
+            INTERNAL_CATCH_TEMPLATE_TEST_CASE_NO_REGISTRATION_2( INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____ ), INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____F_U_N_C____ ), Name, Tags, typename TestType, __VA_ARGS__ )
+    #else
+        #define INTERNAL_CATCH_TEMPLATE_TEST_CASE_NO_REGISTRATION(Name, Tags, ...) \
+            INTERNAL_CATCH_EXPAND_VARGS( INTERNAL_CATCH_TEMPLATE_TEST_CASE_NO_REGISTRATION_2( INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____ ), INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____F_U_N_C____ ), Name, Tags, typename TestType, __VA_ARGS__ ) )
+    #endif
+
+    // Explicit-signature form: the caller supplies `Signature`.
+    #ifndef CATCH_CONFIG_TRADITIONAL_MSVC_PREPROCESSOR
+        #define INTERNAL_CATCH_TEMPLATE_TEST_CASE_SIG_NO_REGISTRATION(Name, Tags, Signature, ...) \
+            INTERNAL_CATCH_TEMPLATE_TEST_CASE_NO_REGISTRATION_2( INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____ ), INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____F_U_N_C____ ), Name, Tags, Signature, __VA_ARGS__ )
+    #else
+        #define INTERNAL_CATCH_TEMPLATE_TEST_CASE_SIG_NO_REGISTRATION(Name, Tags, Signature, ...) \
+            INTERNAL_CATCH_EXPAND_VARGS( INTERNAL_CATCH_TEMPLATE_TEST_CASE_NO_REGISTRATION_2( INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____ ), INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____F_U_N_C____ ), Name, Tags, Signature, __VA_ARGS__ ) )
+    #endif
+
+    // Fixture, type-only form: passes `typename T` as the signature. With a single-token signature
+    // the METHOD1 variants are selected, and those ignore it in favour of `typename TestType`.
+    #ifndef CATCH_CONFIG_TRADITIONAL_MSVC_PREPROCESSOR
+        #define INTERNAL_CATCH_TEMPLATE_TEST_CASE_METHOD_NO_REGISTRATION( ClassName, Name, Tags,... ) \
+            INTERNAL_CATCH_TEMPLATE_TEST_CASE_METHOD_NO_REGISTRATION_2( INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____C_L_A_S_S____ ), INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____ ) , ClassName, Name, Tags, typename T, __VA_ARGS__ )
+    #else
+        #define INTERNAL_CATCH_TEMPLATE_TEST_CASE_METHOD_NO_REGISTRATION( ClassName, Name, Tags,... ) \
+            INTERNAL_CATCH_EXPAND_VARGS( INTERNAL_CATCH_TEMPLATE_TEST_CASE_METHOD_NO_REGISTRATION_2( INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____C_L_A_S_S____ ), INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____ ) , ClassName, Name, Tags, typename T, __VA_ARGS__ ) )
+    #endif
+
+    // Fixture, explicit-signature form.
+    #ifndef CATCH_CONFIG_TRADITIONAL_MSVC_PREPROCESSOR
+        #define INTERNAL_CATCH_TEMPLATE_TEST_CASE_METHOD_SIG_NO_REGISTRATION( ClassName, Name, Tags, Signature, ... ) \
+            INTERNAL_CATCH_TEMPLATE_TEST_CASE_METHOD_NO_REGISTRATION_2( INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____C_L_A_S_S____ ), INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____ ) , ClassName, Name, Tags, Signature, __VA_ARGS__ )
+    #else
+        #define INTERNAL_CATCH_TEMPLATE_TEST_CASE_METHOD_SIG_NO_REGISTRATION( ClassName, Name, Tags, Signature, ... ) \
+            INTERNAL_CATCH_EXPAND_VARGS( INTERNAL_CATCH_TEMPLATE_TEST_CASE_METHOD_NO_REGISTRATION_2( INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____C_L_A_S_S____ ), INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____ ) , ClassName, Name, Tags, Signature, __VA_ARGS__ ) )
+    #endif
+#endif
+
+    ///////////////////////////////////////////////////////////////////////////////
+    // Free-function test case. Forward-declares `static void TestName()`, defines a Catch::AutoReg
+    // object in an anonymous namespace that is constructed with an invoker for `&TestName`, the
+    // current source location, an empty class name and `NameAndTags{ __VA_ARGS__ }`, and finally
+    // re-emits the function header so that the user's `{ ... }` body becomes its definition.
+    // The AutoReg definition is bracketed by the warning-suppression macros.
+    #define INTERNAL_CATCH_TESTCASE2( TestName, ... ) \
+        static void TestName(); \
+        CATCH_INTERNAL_START_WARNINGS_SUPPRESSION \
+        CATCH_INTERNAL_SUPPRESS_GLOBALS_WARNINGS \
+        namespace{ Catch::AutoReg INTERNAL_CATCH_UNIQUE_NAME( autoRegistrar )( Catch::makeTestInvoker( &TestName ), CATCH_INTERNAL_LINEINFO, Catch::StringRef(), Catch::NameAndTags{ __VA_ARGS__ } ); } /* NOLINT */ \
+        CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION \
+        static void TestName()
+    // Front end: generates a unique function name and forwards the name/tags arguments.
+    #define INTERNAL_CATCH_TESTCASE( ... ) \
+        INTERNAL_CATCH_TESTCASE2( INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_S_T____ ), __VA_ARGS__ )
+
+    ///////////////////////////////////////////////////////////////////////////////
+    // Registers an already existing member function as a test case. The AutoReg object receives an
+    // invoker for `&QualifiedMethod` and, as class-or-method string, the literal "&" followed by
+    // the stringized `QualifiedMethod`. No function header is emitted afterwards.
+    #define INTERNAL_CATCH_METHOD_AS_TEST_CASE( QualifiedMethod, ... ) \
+        CATCH_INTERNAL_START_WARNINGS_SUPPRESSION \
+        CATCH_INTERNAL_SUPPRESS_GLOBALS_WARNINGS \
+        namespace{ Catch::AutoReg INTERNAL_CATCH_UNIQUE_NAME( autoRegistrar )( Catch::makeTestInvoker( &QualifiedMethod ), CATCH_INTERNAL_LINEINFO, "&" #QualifiedMethod, Catch::NameAndTags{ __VA_ARGS__ } ); } /* NOLINT */ \
+        CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION
+
+    ///////////////////////////////////////////////////////////////////////////////
+    // Fixture-based test case. In an anonymous namespace, declares `struct TestName` derived from
+    // `ClassName` (one optional pair of parentheses removed) with a `test()` member, and defines an
+    // AutoReg object constructed with an invoker for `&TestName::test` and the stringized
+    // `ClassName`. The macro ends with `void TestName::test()` so that the user's body defines it.
+    #define INTERNAL_CATCH_TEST_CASE_METHOD2( TestName, ClassName, ... )\
+        CATCH_INTERNAL_START_WARNINGS_SUPPRESSION \
+        CATCH_INTERNAL_SUPPRESS_GLOBALS_WARNINGS \
+        namespace{ \
+            struct TestName : INTERNAL_CATCH_REMOVE_PARENS(ClassName) { \
+                void test(); \
+            }; \
+            Catch::AutoReg INTERNAL_CATCH_UNIQUE_NAME( autoRegistrar ) ( Catch::makeTestInvoker( &TestName::test ), CATCH_INTERNAL_LINEINFO, #ClassName, Catch::NameAndTags{ __VA_ARGS__ } ); /* NOLINT */ \
+        } \
+        CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION \
+        void TestName::test()
+    // Front end: generates a unique struct name and forwards the remaining arguments.
+    #define INTERNAL_CATCH_TEST_CASE_METHOD( ClassName, ... ) \
+        INTERNAL_CATCH_TEST_CASE_METHOD2( INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_S_T____ ), ClassName, __VA_ARGS__ )
+
+    ///////////////////////////////////////////////////////////////////////////////
+    // Registers an existing function as a test case. `Function` is passed to makeTestInvoker as
+    // written (no `&` is added). Unlike the macros above, the AutoReg object is not wrapped in an
+    // anonymous namespace, so it is defined in whatever scope the macro is expanded in.
+    #define INTERNAL_CATCH_REGISTER_TESTCASE( Function, ... ) \
+        CATCH_INTERNAL_START_WARNINGS_SUPPRESSION \
+        CATCH_INTERNAL_SUPPRESS_GLOBALS_WARNINGS \
+        Catch::AutoReg INTERNAL_CATCH_UNIQUE_NAME( autoRegistrar )( Catch::makeTestInvoker( Function ), CATCH_INTERNAL_LINEINFO, Catch::StringRef(), Catch::NameAndTags{ __VA_ARGS__ } ); /* NOLINT */ \
+        CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION
+
+    ///////////////////////////////////////////////////////////////////////////////
+    // Templated test case.
+    //  1. Declares the templated test function `TestFunc` for `Signature`.
+    //  2. Inside `namespace { namespace ns_<TestName> { ... } }`, emits the type-list machinery
+    //     (INTERNAL_CATCH_TYPE_GEN / _NTTP_GEN) and the matching `reg_test` overload (_NTTP_REG_GEN).
+    //  3. Defines `struct TestName<Types...>` whose constructor calls `reg_test` once per element of
+    //     `Types...`. Each registered name is `Name " - "` followed by the stringized type argument
+    //     at the same position in `__VA_ARGS__`. The calls sit in a braced `int[]` initializer, and
+    //     `index` is incremented after each one.
+    //  4. Defines a static `int` initialised by an immediately invoked lambda that constructs
+    //     `TestName<...>` from the type lists built out of `__VA_ARGS__`.
+    //  5. Ends with the definition header of `TestFunc`, so that the user's body follows.
+    // `Name` is concatenated as `Name " - "`, so it has to be a string literal.
+    #define INTERNAL_CATCH_TEMPLATE_TEST_CASE_2(TestName, TestFunc, Name, Tags, Signature, ... )\
+        CATCH_INTERNAL_START_WARNINGS_SUPPRESSION \
+        CATCH_INTERNAL_SUPPRESS_GLOBALS_WARNINGS \
+        CATCH_INTERNAL_SUPPRESS_ZERO_VARIADIC_WARNINGS \
+        CATCH_INTERNAL_SUPPRESS_UNUSED_TEMPLATE_WARNINGS \
+        INTERNAL_CATCH_DECLARE_SIG_TEST(TestFunc, INTERNAL_CATCH_REMOVE_PARENS(Signature));\
+        namespace {\
+        namespace INTERNAL_CATCH_MAKE_NAMESPACE(TestName){\
+            INTERNAL_CATCH_TYPE_GEN\
+            INTERNAL_CATCH_NTTP_GEN(INTERNAL_CATCH_REMOVE_PARENS(Signature))\
+            INTERNAL_CATCH_NTTP_REG_GEN(TestFunc,INTERNAL_CATCH_REMOVE_PARENS(Signature))\
+            template<typename...Types> \
+            struct TestName{\
+                TestName(){\
+                    int index = 0;                                    \
+                    constexpr char const* tmpl_types[] = {CATCH_REC_LIST(INTERNAL_CATCH_STRINGIZE_WITHOUT_PARENS, __VA_ARGS__)};\
+                    using expander = int[];\
+                    (void)expander{(reg_test(Types{}, Catch::NameAndTags{ Name " - " + std::string(tmpl_types[index]), Tags } ), index++)... };/* NOLINT */ \
+                }\
+            };\
+            static int INTERNAL_CATCH_UNIQUE_NAME( globalRegistrar ) = [](){\
+            TestName<INTERNAL_CATCH_MAKE_TYPE_LISTS_FROM_TYPES(__VA_ARGS__)>();\
+            return 0;\
+        }();\
+        }\
+        }\
+        CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION \
+        INTERNAL_CATCH_DEFINE_SIG_TEST(TestFunc,INTERNAL_CATCH_REMOVE_PARENS(Signature))
+
+// Front ends for INTERNAL_CATCH_TEMPLATE_TEST_CASE_2. Both generate the unique struct and function
+// names via INTERNAL_CATCH_UNIQUE_NAME; the traditional MSVC preprocessor variants wrap the same
+// call in INTERNAL_CATCH_EXPAND_VARGS.
+
+// Type-only form: the signature is fixed to `typename TestType` and `__VA_ARGS__` are the types.
+#ifndef CATCH_CONFIG_TRADITIONAL_MSVC_PREPROCESSOR
+    #define INTERNAL_CATCH_TEMPLATE_TEST_CASE(Name, Tags, ...) \
+        INTERNAL_CATCH_TEMPLATE_TEST_CASE_2( INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____ ), INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____F_U_N_C____ ), Name, Tags, typename TestType, __VA_ARGS__ )
+#else
+    #define INTERNAL_CATCH_TEMPLATE_TEST_CASE(Name, Tags, ...) \
+        INTERNAL_CATCH_EXPAND_VARGS( INTERNAL_CATCH_TEMPLATE_TEST_CASE_2( INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____ ), INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____F_U_N_C____ ), Name, Tags, typename TestType, __VA_ARGS__ ) )
+#endif
+
+// Explicit-signature form: the caller supplies `Signature` ahead of the argument groups.
+#ifndef CATCH_CONFIG_TRADITIONAL_MSVC_PREPROCESSOR
+    #define INTERNAL_CATCH_TEMPLATE_TEST_CASE_SIG(Name, Tags, Signature, ...) \
+        INTERNAL_CATCH_TEMPLATE_TEST_CASE_2( INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____ ), INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____F_U_N_C____ ), Name, Tags, Signature, __VA_ARGS__ )
+#else
+    #define INTERNAL_CATCH_TEMPLATE_TEST_CASE_SIG(Name, Tags, Signature, ...) \
+        INTERNAL_CATCH_EXPAND_VARGS( INTERNAL_CATCH_TEMPLATE_TEST_CASE_2( INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____ ), INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____F_U_N_C____ ), Name, Tags, Signature, __VA_ARGS__ ) )
+#endif
+
+    // Expands to the registration code for a "product" template test case and
+    // ends with the header of the test function template, so the user's test
+    // body follows the macro invocation.
+    //  - TestFuncName<TestType>() is declared first and defined last.
+    //  - TestName<Types...>::reg_tests() creates one Catch::AutoReg per element
+    //    of the Types pack. Each test is named
+    //    Name " - " tmpl_types[index / num_types] "<" types_list[index % num_types] ">",
+    //    where the two tables hold the stringized TmplTypes and TypesList arguments.
+    //  - A static int initialised by an immediately-invoked lambda instantiates
+    //    TestName through create<...>::type and calls reg_tests().
+    // Do not put comments inside the definition: every line ends in a line
+    // continuation.
+    #define INTERNAL_CATCH_TEMPLATE_PRODUCT_TEST_CASE2(TestName, TestFuncName, Name, Tags, Signature, TmplTypes, TypesList) \
+        CATCH_INTERNAL_START_WARNINGS_SUPPRESSION                      \
+        CATCH_INTERNAL_SUPPRESS_GLOBALS_WARNINGS                      \
+        CATCH_INTERNAL_SUPPRESS_ZERO_VARIADIC_WARNINGS                \
+        CATCH_INTERNAL_SUPPRESS_UNUSED_TEMPLATE_WARNINGS              \
+        template<typename TestType> static void TestFuncName();       \
+        namespace {\
+        namespace INTERNAL_CATCH_MAKE_NAMESPACE(TestName) {                                     \
+            INTERNAL_CATCH_TYPE_GEN                                                  \
+            INTERNAL_CATCH_NTTP_GEN(INTERNAL_CATCH_REMOVE_PARENS(Signature))         \
+            template<typename... Types>                               \
+            struct TestName {                                         \
+                void reg_tests() {                                          \
+                    int index = 0;                                    \
+                    using expander = int[];                           \
+                    constexpr char const* tmpl_types[] = {CATCH_REC_LIST(INTERNAL_CATCH_STRINGIZE_WITHOUT_PARENS, INTERNAL_CATCH_REMOVE_PARENS(TmplTypes))};\
+                    constexpr char const* types_list[] = {CATCH_REC_LIST(INTERNAL_CATCH_STRINGIZE_WITHOUT_PARENS, INTERNAL_CATCH_REMOVE_PARENS(TypesList))};\
+                    constexpr auto num_types = sizeof(types_list) / sizeof(types_list[0]);\
+                    (void)expander{(Catch::AutoReg( Catch::makeTestInvoker( &TestFuncName<Types> ), CATCH_INTERNAL_LINEINFO, Catch::StringRef(), Catch::NameAndTags{ Name " - " + std::string(tmpl_types[index / num_types]) + "<" + std::string(types_list[index % num_types]) + ">", Tags } ), index++)... };/* NOLINT */\
+                }                                                     \
+            };                                                        \
+            static int INTERNAL_CATCH_UNIQUE_NAME( globalRegistrar ) = [](){ \
+                using TestInit = typename create<TestName, decltype(get_wrapper<INTERNAL_CATCH_REMOVE_PARENS(TmplTypes)>()), TypeList<INTERNAL_CATCH_MAKE_TYPE_LISTS_FROM_TYPES(INTERNAL_CATCH_REMOVE_PARENS(TypesList))>>::type; \
+                TestInit t;                                           \
+                t.reg_tests();                                        \
+                return 0;                                             \
+            }();                                                      \
+        }                                                             \
+        }                                                             \
+        CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION                       \
+        template<typename TestType>                                   \
+        static void TestFuncName()
+
+// Entry point for a product template test case: forwards to
+// INTERNAL_CATCH_TEMPLATE_PRODUCT_TEST_CASE2 with two generated unique names
+// and the fixed signature `typename T`; the variadic arguments supply the
+// TmplTypes and TypesList parameters. The traditional-MSVC-preprocessor branch
+// wraps the call in INTERNAL_CATCH_EXPAND_VARGS.
+#ifndef CATCH_CONFIG_TRADITIONAL_MSVC_PREPROCESSOR
+    #define INTERNAL_CATCH_TEMPLATE_PRODUCT_TEST_CASE(Name, Tags, ...)\
+        INTERNAL_CATCH_TEMPLATE_PRODUCT_TEST_CASE2(INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____ ), INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____F_U_N_C____ ), Name, Tags, typename T,__VA_ARGS__)
+#else
+    #define INTERNAL_CATCH_TEMPLATE_PRODUCT_TEST_CASE(Name, Tags, ...)\
+        INTERNAL_CATCH_EXPAND_VARGS( INTERNAL_CATCH_TEMPLATE_PRODUCT_TEST_CASE2( INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____ ), INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____F_U_N_C____ ), Name, Tags, typename T, __VA_ARGS__ ) )
+#endif
+
+// Same forwarding as INTERNAL_CATCH_TEMPLATE_PRODUCT_TEST_CASE, except that
+// the template signature is supplied by the caller (Signature) instead of
+// being fixed to `typename T`.
+#ifndef CATCH_CONFIG_TRADITIONAL_MSVC_PREPROCESSOR
+    #define INTERNAL_CATCH_TEMPLATE_PRODUCT_TEST_CASE_SIG(Name, Tags, Signature, ...)\
+        INTERNAL_CATCH_TEMPLATE_PRODUCT_TEST_CASE2(INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____ ), INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____F_U_N_C____ ), Name, Tags, Signature, __VA_ARGS__)
+#else
+    #define INTERNAL_CATCH_TEMPLATE_PRODUCT_TEST_CASE_SIG(Name, Tags, Signature, ...)\
+        INTERNAL_CATCH_EXPAND_VARGS( INTERNAL_CATCH_TEMPLATE_PRODUCT_TEST_CASE2( INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____ ), INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____F_U_N_C____ ), Name, Tags, Signature, __VA_ARGS__ ) )
+#endif
+
+    // Expands to the registration code for a template test case driven by a
+    // type list (TmplList) and ends with the header of the test function
+    // template, so the user's test body follows the macro invocation.
+    //  - TestName<Types...>::reg_tests() creates one Catch::AutoReg per element
+    //    of the Types pack; each test is named
+    //    Name " - " <stringized TmplList> " - " <index of the type in the pack>.
+    //  - A static int initialised by an immediately-invoked lambda instantiates
+    //    TestName through convert<TestName, TmplList>::type and calls reg_tests().
+    // Do not put comments inside the definition: every line ends in a line
+    // continuation.
+    #define INTERNAL_CATCH_TEMPLATE_LIST_TEST_CASE_2(TestName, TestFunc, Name, Tags, TmplList)\
+        CATCH_INTERNAL_START_WARNINGS_SUPPRESSION \
+        CATCH_INTERNAL_SUPPRESS_GLOBALS_WARNINGS \
+        CATCH_INTERNAL_SUPPRESS_UNUSED_TEMPLATE_WARNINGS \
+        template<typename TestType> static void TestFunc();       \
+        namespace {\
+        namespace INTERNAL_CATCH_MAKE_NAMESPACE(TestName){\
+        INTERNAL_CATCH_TYPE_GEN\
+        template<typename... Types>                               \
+        struct TestName {                                         \
+            void reg_tests() {                                          \
+                int index = 0;                                    \
+                using expander = int[];                           \
+                (void)expander{(Catch::AutoReg( Catch::makeTestInvoker( &TestFunc<Types> ), CATCH_INTERNAL_LINEINFO, Catch::StringRef(), Catch::NameAndTags{ Name " - " + std::string(INTERNAL_CATCH_STRINGIZE(TmplList)) + " - " + std::to_string(index), Tags } ), index++)... };/* NOLINT */\
+            }                                                     \
+        };\
+        static int INTERNAL_CATCH_UNIQUE_NAME( globalRegistrar ) = [](){ \
+                using TestInit = typename convert<TestName, TmplList>::type; \
+                TestInit t;                                           \
+                t.reg_tests();                                        \
+                return 0;                                             \
+            }();                                                      \
+        }}\
+        CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION                       \
+        template<typename TestType>                                   \
+        static void TestFunc()
+
+    // Entry point for a type-list template test case: forwards to
+    // INTERNAL_CATCH_TEMPLATE_LIST_TEST_CASE_2 with two generated unique names.
+    // There are no variadic arguments, so no MSVC-specific variant is defined here.
+    #define INTERNAL_CATCH_TEMPLATE_LIST_TEST_CASE(Name, Tags, TmplList) \
+        INTERNAL_CATCH_TEMPLATE_LIST_TEST_CASE_2( INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____ ), INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____F_U_N_C____ ), Name, Tags, TmplList )
+
+    // Expands to the registration code for a fixture-based (ClassName) template
+    // test case and ends with INTERNAL_CATCH_DEFINE_SIG_TEST_METHOD, after which
+    // the user's test body follows.
+    //  - The constructor of TestNameClass<Types...> calls reg_test once per
+    //    element of the Types pack, passing the stringized ClassName and the
+    //    test name Name " - " tmpl_types[index], where tmpl_types holds the
+    //    stringized variadic type arguments.
+    //  - A static int initialised by an immediately-invoked lambda constructs a
+    //    temporary TestNameClass over the type lists built from __VA_ARGS__.
+    // Do not put comments inside the definition: every line ends in a line
+    // continuation.
+    #define INTERNAL_CATCH_TEMPLATE_TEST_CASE_METHOD_2( TestNameClass, TestName, ClassName, Name, Tags, Signature, ... ) \
+        CATCH_INTERNAL_START_WARNINGS_SUPPRESSION \
+        CATCH_INTERNAL_SUPPRESS_GLOBALS_WARNINGS \
+        CATCH_INTERNAL_SUPPRESS_ZERO_VARIADIC_WARNINGS \
+        CATCH_INTERNAL_SUPPRESS_UNUSED_TEMPLATE_WARNINGS \
+        namespace {\
+        namespace INTERNAL_CATCH_MAKE_NAMESPACE(TestName){ \
+            INTERNAL_CATCH_TYPE_GEN\
+            INTERNAL_CATCH_NTTP_GEN(INTERNAL_CATCH_REMOVE_PARENS(Signature))\
+            INTERNAL_CATCH_DECLARE_SIG_TEST_METHOD(TestName, ClassName, INTERNAL_CATCH_REMOVE_PARENS(Signature));\
+            INTERNAL_CATCH_NTTP_REG_METHOD_GEN(TestName, INTERNAL_CATCH_REMOVE_PARENS(Signature))\
+            template<typename...Types> \
+            struct TestNameClass{\
+                TestNameClass(){\
+                    int index = 0;                                    \
+                    constexpr char const* tmpl_types[] = {CATCH_REC_LIST(INTERNAL_CATCH_STRINGIZE_WITHOUT_PARENS, __VA_ARGS__)};\
+                    using expander = int[];\
+                    (void)expander{(reg_test(Types{}, #ClassName, Catch::NameAndTags{ Name " - " + std::string(tmpl_types[index]), Tags } ), index++)... };/* NOLINT */ \
+                }\
+            };\
+            static int INTERNAL_CATCH_UNIQUE_NAME( globalRegistrar ) = [](){\
+                TestNameClass<INTERNAL_CATCH_MAKE_TYPE_LISTS_FROM_TYPES(__VA_ARGS__)>();\
+                return 0;\
+        }();\
+        }\
+        }\
+        CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION \
+        INTERNAL_CATCH_DEFINE_SIG_TEST_METHOD(TestName, INTERNAL_CATCH_REMOVE_PARENS(Signature))
+
+// Entry point for a fixture-based template test case: forwards to
+// INTERNAL_CATCH_TEMPLATE_TEST_CASE_METHOD_2 with two generated unique names
+// (registrar class, test method holder) and the fixed signature `typename T`.
+// The traditional-MSVC-preprocessor branch wraps the call in
+// INTERNAL_CATCH_EXPAND_VARGS.
+#ifndef CATCH_CONFIG_TRADITIONAL_MSVC_PREPROCESSOR
+    #define INTERNAL_CATCH_TEMPLATE_TEST_CASE_METHOD( ClassName, Name, Tags,... ) \
+        INTERNAL_CATCH_TEMPLATE_TEST_CASE_METHOD_2( INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____C_L_A_S_S____ ), INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____ ) , ClassName, Name, Tags, typename T, __VA_ARGS__ )
+#else
+    #define INTERNAL_CATCH_TEMPLATE_TEST_CASE_METHOD( ClassName, Name, Tags,... ) \
+        INTERNAL_CATCH_EXPAND_VARGS( INTERNAL_CATCH_TEMPLATE_TEST_CASE_METHOD_2( INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____C_L_A_S_S____ ), INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____ ) , ClassName, Name, Tags, typename T, __VA_ARGS__ ) )
+#endif
+
+// Same forwarding as INTERNAL_CATCH_TEMPLATE_TEST_CASE_METHOD, except that the
+// template signature is supplied by the caller (Signature) instead of being
+// fixed to `typename T`.
+#ifndef CATCH_CONFIG_TRADITIONAL_MSVC_PREPROCESSOR
+    #define INTERNAL_CATCH_TEMPLATE_TEST_CASE_METHOD_SIG( ClassName, Name, Tags, Signature, ... ) \
+        INTERNAL_CATCH_TEMPLATE_TEST_CASE_METHOD_2( INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____C_L_A_S_S____ ), INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____ ) , ClassName, Name, Tags, Signature, __VA_ARGS__ )
+#else
+    #define INTERNAL_CATCH_TEMPLATE_TEST_CASE_METHOD_SIG( ClassName, Name, Tags, Signature, ... ) \
+        INTERNAL_CATCH_EXPAND_VARGS( INTERNAL_CATCH_TEMPLATE_TEST_CASE_METHOD_2( INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____C_L_A_S_S____ ), INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____ ) , ClassName, Name, Tags, Signature, __VA_ARGS__ ) )
+#endif
+
+    // Expands to the registration code for a fixture-based "product" template
+    // test case and ends with the header of TestName<TestType>::test(), so the
+    // user's test body follows the macro invocation.
+    //  - TestName<TestType> derives from ClassName<TestType> and declares test().
+    //  - TestNameClass<Types...>::reg_tests() creates one Catch::AutoReg per
+    //    element of the Types pack for &TestName<Types>::test, passing the
+    //    stringized ClassName. Each test is named
+    //    Name " - " tmpl_types[index / num_types] "<" types_list[index % num_types] ">",
+    //    where the two tables hold the stringized TmplTypes and TypesList arguments.
+    //  - A static int initialised by an immediately-invoked lambda instantiates
+    //    TestNameClass through create<...>::type and calls reg_tests().
+    // Do not put comments inside the definition: every line ends in a line
+    // continuation.
+    #define INTERNAL_CATCH_TEMPLATE_PRODUCT_TEST_CASE_METHOD_2(TestNameClass, TestName, ClassName, Name, Tags, Signature, TmplTypes, TypesList)\
+        CATCH_INTERNAL_START_WARNINGS_SUPPRESSION \
+        CATCH_INTERNAL_SUPPRESS_GLOBALS_WARNINGS \
+        CATCH_INTERNAL_SUPPRESS_ZERO_VARIADIC_WARNINGS \
+        CATCH_INTERNAL_SUPPRESS_UNUSED_TEMPLATE_WARNINGS \
+        template<typename TestType> \
+            struct TestName : INTERNAL_CATCH_REMOVE_PARENS(ClassName <TestType>) { \
+                void test();\
+            };\
+        namespace {\
+        namespace INTERNAL_CATCH_MAKE_NAMESPACE(TestNameClass) {\
+            INTERNAL_CATCH_TYPE_GEN                  \
+            INTERNAL_CATCH_NTTP_GEN(INTERNAL_CATCH_REMOVE_PARENS(Signature))\
+            template<typename...Types>\
+            struct TestNameClass{\
+                void reg_tests(){\
+                    int index = 0;\
+                    using expander = int[];\
+                    constexpr char const* tmpl_types[] = {CATCH_REC_LIST(INTERNAL_CATCH_STRINGIZE_WITHOUT_PARENS, INTERNAL_CATCH_REMOVE_PARENS(TmplTypes))};\
+                    constexpr char const* types_list[] = {CATCH_REC_LIST(INTERNAL_CATCH_STRINGIZE_WITHOUT_PARENS, INTERNAL_CATCH_REMOVE_PARENS(TypesList))};\
+                    constexpr auto num_types = sizeof(types_list) / sizeof(types_list[0]);\
+                    (void)expander{(Catch::AutoReg( Catch::makeTestInvoker( &TestName<Types>::test ), CATCH_INTERNAL_LINEINFO, #ClassName, Catch::NameAndTags{ Name " - " + std::string(tmpl_types[index / num_types]) + "<" + std::string(types_list[index % num_types]) + ">", Tags } ), index++)... };/* NOLINT */ \
+                }\
+            };\
+            static int INTERNAL_CATCH_UNIQUE_NAME( globalRegistrar ) = [](){\
+                using TestInit = typename create<TestNameClass, decltype(get_wrapper<INTERNAL_CATCH_REMOVE_PARENS(TmplTypes)>()), TypeList<INTERNAL_CATCH_MAKE_TYPE_LISTS_FROM_TYPES(INTERNAL_CATCH_REMOVE_PARENS(TypesList))>>::type;\
+                TestInit t;\
+                t.reg_tests();\
+                return 0;\
+            }(); \
+        }\
+        }\
+        CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION \
+        template<typename TestType> \
+        void TestName<TestType>::test()
+
+// Entry point for a fixture-based product template test case: forwards to
+// INTERNAL_CATCH_TEMPLATE_PRODUCT_TEST_CASE_METHOD_2 with two generated unique
+// names and the fixed signature `typename T`; the variadic arguments supply the
+// TmplTypes and TypesList parameters. The traditional-MSVC-preprocessor branch
+// wraps the call in INTERNAL_CATCH_EXPAND_VARGS.
+#ifndef CATCH_CONFIG_TRADITIONAL_MSVC_PREPROCESSOR
+    #define INTERNAL_CATCH_TEMPLATE_PRODUCT_TEST_CASE_METHOD( ClassName, Name, Tags, ... )\
+        INTERNAL_CATCH_TEMPLATE_PRODUCT_TEST_CASE_METHOD_2( INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____ ), INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____F_U_N_C____ ), ClassName, Name, Tags, typename T, __VA_ARGS__ )
+#else
+    #define INTERNAL_CATCH_TEMPLATE_PRODUCT_TEST_CASE_METHOD( ClassName, Name, Tags, ... )\
+        INTERNAL_CATCH_EXPAND_VARGS( INTERNAL_CATCH_TEMPLATE_PRODUCT_TEST_CASE_METHOD_2( INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____ ), INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____F_U_N_C____ ), ClassName, Name, Tags, typename T,__VA_ARGS__ ) )
+#endif
+
+// Same forwarding as INTERNAL_CATCH_TEMPLATE_PRODUCT_TEST_CASE_METHOD, except
+// that the template signature is supplied by the caller (Signature) instead of
+// being fixed to `typename T`.
+#ifndef CATCH_CONFIG_TRADITIONAL_MSVC_PREPROCESSOR
+    #define INTERNAL_CATCH_TEMPLATE_PRODUCT_TEST_CASE_METHOD_SIG( ClassName, Name, Tags, Signature, ... )\
+        INTERNAL_CATCH_TEMPLATE_PRODUCT_TEST_CASE_METHOD_2( INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____ ), INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____F_U_N_C____ ), ClassName, Name, Tags, Signature, __VA_ARGS__ )
+#else
+    #define INTERNAL_CATCH_TEMPLATE_PRODUCT_TEST_CASE_METHOD_SIG( ClassName, Name, Tags, Signature, ... )\
+        INTERNAL_CATCH_EXPAND_VARGS( INTERNAL_CATCH_TEMPLATE_PRODUCT_TEST_CASE_METHOD_2( INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____ ), INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____F_U_N_C____ ), ClassName, Name, Tags, Signature,__VA_ARGS__ ) )
+#endif
+
+    // Expands to the registration code for a fixture-based template test case
+    // driven by a type list (TmplList) and ends with the header of
+    // TestName<TestType>::test(), so the user's test body follows the macro
+    // invocation.
+    //  - TestName<TestType> derives from ClassName<TestType> and declares test().
+    //  - TestNameClass<Types...>::reg_tests() creates one Catch::AutoReg per
+    //    element of the Types pack for &TestName<Types>::test, passing the
+    //    stringized ClassName; each test is named
+    //    Name " - " <stringized TmplList> " - " <index of the type in the pack>.
+    //  - A static int initialised by an immediately-invoked lambda instantiates
+    //    TestNameClass through convert<TestNameClass, TmplList>::type and calls
+    //    reg_tests().
+    // Do not put comments inside the definition: every line ends in a line
+    // continuation.
+    #define INTERNAL_CATCH_TEMPLATE_LIST_TEST_CASE_METHOD_2( TestNameClass, TestName, ClassName, Name, Tags, TmplList) \
+        CATCH_INTERNAL_START_WARNINGS_SUPPRESSION \
+        CATCH_INTERNAL_SUPPRESS_GLOBALS_WARNINGS \
+        CATCH_INTERNAL_SUPPRESS_UNUSED_TEMPLATE_WARNINGS \
+        template<typename TestType> \
+        struct TestName : INTERNAL_CATCH_REMOVE_PARENS(ClassName <TestType>) { \
+            void test();\
+        };\
+        namespace {\
+        namespace INTERNAL_CATCH_MAKE_NAMESPACE(TestName){ \
+            INTERNAL_CATCH_TYPE_GEN\
+            template<typename...Types>\
+            struct TestNameClass{\
+                void reg_tests(){\
+                    int index = 0;\
+                    using expander = int[];\
+                    (void)expander{(Catch::AutoReg( Catch::makeTestInvoker( &TestName<Types>::test ), CATCH_INTERNAL_LINEINFO, #ClassName, Catch::NameAndTags{ Name " - " + std::string(INTERNAL_CATCH_STRINGIZE(TmplList)) + " - " + std::to_string(index), Tags } ), index++)... };/* NOLINT */ \
+                }\
+            };\
+            static int INTERNAL_CATCH_UNIQUE_NAME( globalRegistrar ) = [](){\
+                using TestInit = typename convert<TestNameClass, TmplList>::type;\
+                TestInit t;\
+                t.reg_tests();\
+                return 0;\
+            }(); \
+        }}\
+        CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION \
+        template<typename TestType> \
+        void TestName<TestType>::test()
+
+// Entry point for a fixture-based type-list template test case: forwards to
+// INTERNAL_CATCH_TEMPLATE_LIST_TEST_CASE_METHOD_2 with two generated unique names.
+#define INTERNAL_CATCH_TEMPLATE_LIST_TEST_CASE_METHOD(ClassName, Name, Tags, TmplList) \
+        INTERNAL_CATCH_TEMPLATE_LIST_TEST_CASE_METHOD_2( INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____ ), INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____F_U_N_C____ ), ClassName, Name, Tags, TmplList )
+
+// end catch_test_registry.h
+// start catch_capture.hpp
+
+// start catch_assertionhandler.h
+
+// start catch_assertioninfo.h
+
+// start catch_result_type.h
+
+namespace Catch {
+
+    // ResultWas::OfType enum
+    // The enum is wrapped in a struct so the enumerators are reached as
+    // ResultWas::Ok, ResultWas::Info, ...
+    struct ResultWas { enum OfType {
+        Unknown = -1,
+        Ok = 0,
+        Info = 1,
+        Warning = 2,
+
+        // Set in every enumerator below this point.
+        FailureBit = 0x10,
+
+        ExpressionFailed = FailureBit | 1,
+        ExplicitFailure = FailureBit | 2,
+
+        // Exception-related results carry 0x100 in addition to FailureBit.
+        Exception = 0x100 | FailureBit,
+
+        ThrewException = Exception | 1,
+        DidntThrowException = Exception | 2,
+
+        FatalErrorCondition = 0x200 | FailureBit
+
+    }; };
+
+    // Declared here, defined elsewhere.
+    bool isOk( ResultWas::OfType resultType );
+    bool isJustInfo( int flags );
+
+    // ResultDisposition::Flags enum
+    // Each enumerator is a distinct bit, so values can be OR-ed together.
+    struct ResultDisposition { enum Flags {
+        Normal = 0x01,
+
+        ContinueOnFailure = 0x02,   // Failures fail test, but execution continues
+        FalseTest = 0x04,           // Prefix expression with !
+        SuppressFail = 0x08         // Failures are reported but do not fail the test
+    }; };
+
+    // Combines two flag values; declared here, defined elsewhere.
+    ResultDisposition::Flags operator | ( ResultDisposition::Flags lhs, ResultDisposition::Flags rhs );
+
+    bool shouldContinueOnFailure( int flags );
+    // True when the FalseTest bit is set in flags.
+    inline bool isFalseTest( int flags ) { return ( flags & ResultDisposition::FalseTest ) != 0; }
+    bool shouldSuppressFailure( int flags );
+
+} // end namespace Catch
+
+// end catch_result_type.h
+namespace Catch {
+
+    // Plain aggregate holding the data recorded for one assertion: the name of
+    // the macro, the source location, the captured expression text and the
+    // result-disposition flags.
+    struct AssertionInfo
+    {
+        StringRef macroName;
+        SourceLineInfo lineInfo;
+        StringRef capturedExpression;
+        ResultDisposition::Flags resultDisposition;
+
+        // We want to delete this constructor but a compiler bug in 4.8 means
+        // the struct is then treated as non-aggregate
+        //AssertionInfo() = delete;
+    };
+
+} // end namespace Catch
+
+// end catch_assertioninfo.h
+// start catch_decomposer.h
+
+// start catch_tostring.h
+
+#include <vector>
+#include <cstddef>
+#include <type_traits>
+#include <string>
+// start catch_stream.h
+
+#include <iosfwd>
+#include <cstddef>
+#include <ostream>
+
+namespace Catch {
+
+    // Accessors for the output streams used by the framework; declared here,
+    // defined elsewhere.
+    std::ostream& cout();
+    std::ostream& cerr();
+    std::ostream& clog();
+
+    class StringRef;
+
+    // Abstract owner of an output stream.
+    struct IStream {
+        virtual ~IStream();
+        virtual std::ostream& stream() const = 0;
+    };
+
+    // Creates an IStream for the given file name; declared here, defined elsewhere.
+    auto makeStream( StringRef const &filename ) -> IStream const*;
+
+    // Non-copyable wrapper around a std::ostream that values can be streamed
+    // into and whose accumulated text is read back with str().
+    // NOTE(review): the name and m_index suggest the underlying stream comes
+    // from a reusable pool - confirm in the constructor/destructor definitions.
+    class ReusableStringStream : NonCopyable {
+        std::size_t m_index;
+        std::ostream* m_oss;
+    public:
+        ReusableStringStream();
+        ~ReusableStringStream();
+
+        auto str() const -> std::string;
+
+        // Streams value into the wrapped ostream and returns *this so calls
+        // can be chained.
+        template<typename T>
+        auto operator << ( T const& value ) -> ReusableStringStream& {
+            *m_oss << value;
+            return *this;
+        }
+        // Direct access to the wrapped ostream.
+        auto get() -> std::ostream& { return *m_oss; }
+    };
+}
+
+// end catch_stream.h
+// start catch_interfaces_enum_values_registry.h
+
+#include <vector>
+
+namespace Catch {
+
+    namespace Detail {
+        // Record kept for one enum: its name and a list of (int, StringRef) entries.
+        struct EnumInfo {
+            StringRef m_name;
+            std::vector<std::pair<int, StringRef>> m_values;
+
+            ~EnumInfo();
+
+            StringRef lookup( int value ) const;
+        };
+    } // namespace Detail
+
+    // Interface through which enums are registered.
+    struct IMutableEnumValuesRegistry {
+        virtual ~IMutableEnumValuesRegistry();
+
+        virtual Detail::EnumInfo const& registerEnum( StringRef enumName, StringRef allEnums, std::vector<int> const& values ) = 0;
+
+        // Convenience overload: converts every enumerator to int and forwards
+        // to the virtual overload above. Rejects, at compile time, enums that
+        // are wider than int.
+        template<typename E>
+        Detail::EnumInfo const& registerEnum( StringRef enumName, StringRef allEnums, std::initializer_list<E> values ) {
+            static_assert(sizeof(int) >= sizeof(E), "Cannot serialize enum to int");
+            std::vector<int> asInts;
+            asInts.reserve( values.size() );
+            for( auto it = values.begin(); it != values.end(); ++it ) {
+                asInts.push_back( static_cast<int>( *it ) );
+            }
+            return registerEnum( enumName, allEnums, asInts );
+        }
+    };
+
+} // Catch
+
+// end catch_interfaces_enum_values_registry.h
+
+#ifdef CATCH_CONFIG_CPP17_STRING_VIEW
+#include <string_view>
+#endif
+
+// Objective-C only: helpers that behave correctly with and without ARC.
+#ifdef __OBJC__
+// start catch_objc_arc.hpp
+
+#import <Foundation/Foundation.h>
+
+// CATCH_ARC_ENABLED is the result of __has_feature(objc_arc) when the compiler
+// provides __has_feature, and 0 otherwise.
+#ifdef __has_feature
+#define CATCH_ARC_ENABLED __has_feature(objc_arc)
+#else
+#define CATCH_ARC_ENABLED 0
+#endif
+
+void arcSafeRelease( NSObject* obj );
+id performOptionalSelector( id obj, SEL sel );
+
+#if !CATCH_ARC_ENABLED
+// Without ARC: release the object explicitly.
+inline void arcSafeRelease( NSObject* obj ) {
+    [obj release];
+}
+// Sends sel to obj if obj responds to it; returns nil otherwise.
+inline id performOptionalSelector( id obj, SEL sel ) {
+    if( [obj respondsToSelector: sel] )
+        return [obj performSelector: sel];
+    return nil;
+}
+#define CATCH_UNSAFE_UNRETAINED
+#define CATCH_ARC_STRONG
+#else
+// With ARC: nothing to release by hand.
+inline void arcSafeRelease( NSObject* ){}
+// Sends sel to obj if obj responds to it; returns nil otherwise. Under clang
+// the -Warc-performSelector-leaks diagnostic is silenced around the call.
+inline id performOptionalSelector( id obj, SEL sel ) {
+#ifdef __clang__
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Warc-performSelector-leaks"
+#endif
+    if( [obj respondsToSelector: sel] )
+        return [obj performSelector: sel];
+#ifdef __clang__
+#pragma clang diagnostic pop
+#endif
+    return nil;
+}
+#define CATCH_UNSAFE_UNRETAINED __unsafe_unretained
+#define CATCH_ARC_STRONG __strong
+#endif
+
+// end catch_objc_arc.hpp
+#endif
+
+#ifdef _MSC_VER
+#pragma warning(push)
+#pragma warning(disable:4180) // We attempt to stream a function (address) by const&, which MSVC complains about but is harmless
+#endif
+
+namespace Catch {
+    namespace Detail {
+
+        // Text returned for values that cannot be stringified in any other way.
+        extern const std::string unprintableString;
+
+        std::string rawMemoryToString( const void *object, std::size_t size );
+
+        // Convenience overload: forwards the object's address and sizeof.
+        template<typename T>
+        std::string rawMemoryToString( const T& object ) {
+          return rawMemoryToString( &object, sizeof(object) );
+        }
+
+        // Compile-time check: value is true when `std::ostream& << const T&`
+        // is a well-formed expression. The int overload of test() is selected
+        // when the expression compiles; otherwise the variadic overload is.
+        template<typename T>
+        class IsStreamInsertable {
+            template<typename Stream, typename U>
+            static auto test(int)
+                -> decltype(std::declval<Stream&>() << std::declval<U>(), std::true_type());
+
+            template<typename, typename>
+            static auto test(...)->std::false_type;
+
+        public:
+            static const bool value = decltype(test<std::ostream, const T&>(0))::value;
+        };
+
+        template<typename E>
+        std::string convertUnknownEnumToString( E e );
+
+        // convertUnstreamable: three mutually exclusive overloads selected by
+        // enable_if.
+        // 1) Neither an enum nor derived from std::exception: unprintableString.
+        template<typename T>
+        typename std::enable_if<
+            !std::is_enum<T>::value && !std::is_base_of<std::exception, T>::value,
+        std::string>::type convertUnstreamable( T const& ) {
+            return Detail::unprintableString;
+        }
+        // 2) Derived from std::exception: the what() message.
+        template<typename T>
+        typename std::enable_if<
+            !std::is_enum<T>::value && std::is_base_of<std::exception, T>::value,
+         std::string>::type convertUnstreamable(T const& ex) {
+            return ex.what();
+        }
+
+        // 3) Enum: delegated to convertUnknownEnumToString.
+        template<typename T>
+        typename std::enable_if<
+            std::is_enum<T>::value
+        , std::string>::type convertUnstreamable( T const& value ) {
+            return convertUnknownEnumToString( value );
+        }
+
+#if defined(_MANAGED)
+        //! Convert a CLR string to a utf8 std::string
+        template<typename T>
+        std::string clrReferenceToString( T^ ref ) {
+            if (ref == nullptr)
+                return std::string("null");
+            auto bytes = System::Text::Encoding::UTF8->GetBytes(ref->ToString());
+            cli::pin_ptr<System::Byte> p = &bytes[0];
+            return std::string(reinterpret_cast<char const *>(p), bytes->Length);
+        }
+#endif
+
+    } // namespace Detail
+
+    // Primary StringMaker template: the fallback used when no specialization
+    // matches T. The two convert() overloads are mutually exclusive, selected
+    // by whether the type can be streamed into a std::ostream. The second
+    // template parameter leaves room for enable_if-style partial specializations.
+    // If we decide for C++14, change these to enable_if_ts
+    template <typename T, typename = void>
+    struct StringMaker {
+        // Streamable types: stream the value and return the collected text.
+        template <typename Fake = T>
+        static
+        typename std::enable_if<::Catch::Detail::IsStreamInsertable<Fake>::value, std::string>::type
+            convert(const Fake& value) {
+                ReusableStringStream rss;
+                // NB: call using the function-like syntax to avoid ambiguity with
+                // user-defined templated operator<< under clang.
+                rss.operator<<(value);
+                return rss.str();
+        }
+
+        // Non-streamable types: use CATCH_CONFIG_FALLBACK_STRINGIFIER when the
+        // user defined it, otherwise Detail::convertUnstreamable.
+        template <typename Fake = T>
+        static
+        typename std::enable_if<!::Catch::Detail::IsStreamInsertable<Fake>::value, std::string>::type
+            convert( const Fake& value ) {
+#if !defined(CATCH_CONFIG_FALLBACK_STRINGIFIER)
+            return Detail::convertUnstreamable(value);
+#else
+            return CATCH_CONFIG_FALLBACK_STRINGIFIER(value);
+#endif
+        }
+    };
+
+    namespace Detail {
+
+        // This function dispatches all stringification requests inside of Catch.
+        // Should be preferably called fully qualified, like ::Catch::Detail::stringify
+        // The reference and cv-qualifiers are stripped from T before the
+        // StringMaker specialization is looked up.
+        template <typename T>
+        std::string stringify(const T& e) {
+            return ::Catch::StringMaker<typename std::remove_cv<typename std::remove_reference<T>::type>::type>::convert(e);
+        }
+
+        // Stringifies an enum by converting it to its underlying integer type.
+        template<typename E>
+        std::string convertUnknownEnumToString( E e ) {
+            return ::Catch::Detail::stringify(static_cast<typename std::underlying_type<E>::type>(e));
+        }
+
+#if defined(_MANAGED)
+        // C++/CLI handle overload: dispatches to StringMaker<T^>.
+        template <typename T>
+        std::string stringify( T^ e ) {
+            return ::Catch::StringMaker<T^>::convert(e);
+        }
+#endif
+
+    } // namespace Detail
+
+    // Some predefined specializations
+    // String-like types. Only the convert() declarations appear here; the
+    // definitions live elsewhere.
+
+    template<>
+    struct StringMaker<std::string> {
+        static std::string convert(const std::string& str);
+    };
+
+    // Only available when CATCH_CONFIG_CPP17_STRING_VIEW is defined.
+#ifdef CATCH_CONFIG_CPP17_STRING_VIEW
+    template<>
+    struct StringMaker<std::string_view> {
+        static std::string convert(std::string_view str);
+    };
+#endif
+
+    template<>
+    struct StringMaker<char const *> {
+        static std::string convert(char const * str);
+    };
+    template<>
+    struct StringMaker<char *> {
+        static std::string convert(char * str);
+    };
+
+    // Wide-character counterparts, only available when CATCH_CONFIG_WCHAR is defined.
+#ifdef CATCH_CONFIG_WCHAR
+    template<>
+    struct StringMaker<std::wstring> {
+        static std::string convert(const std::wstring& wstr);
+    };
+
+# ifdef CATCH_CONFIG_CPP17_STRING_VIEW
+    template<>
+    struct StringMaker<std::wstring_view> {
+        static std::string convert(std::wstring_view str);
+    };
+# endif
+
+    template<>
+    struct StringMaker<wchar_t const *> {
+        static std::string convert(wchar_t const * str);
+    };
+    template<>
+    struct StringMaker<wchar_t *> {
+        static std::string convert(wchar_t * str);
+    };
+#endif
+
+    // Character-array specializations (char, signed char, unsigned char).
+    // The array has already decayed to a pointer when it reaches convert(), so
+    // the search for the terminating '\0' is bounded by the array extent SZ:
+    // an array that is not null-terminated is stringified up to its last
+    // element instead of being read past its end. Null-terminated arrays
+    // produce the same text as before.
+    template<int SZ>
+    struct StringMaker<char[SZ]> {
+        static std::string convert(char const* str) {
+            std::size_t len = 0;
+            while( len < static_cast<std::size_t>( SZ ) && str[len] != '\0' )
+                ++len;
+            return ::Catch::Detail::stringify(std::string( str, len ));
+        }
+    };
+    template<int SZ>
+    struct StringMaker<signed char[SZ]> {
+        static std::string convert(signed char const* str) {
+            // Reinterpreted as plain char so it can be stored in a std::string.
+            char const* chars = reinterpret_cast<char const *>(str);
+            std::size_t len = 0;
+            while( len < static_cast<std::size_t>( SZ ) && chars[len] != '\0' )
+                ++len;
+            return ::Catch::Detail::stringify(std::string( chars, len ));
+        }
+    };
+    template<int SZ>
+    struct StringMaker<unsigned char[SZ]> {
+        static std::string convert(unsigned char const* str) {
+            // Reinterpreted as plain char so it can be stored in a std::string.
+            char const* chars = reinterpret_cast<char const *>(str);
+            std::size_t len = 0;
+            while( len < static_cast<std::size_t>( SZ ) && chars[len] != '\0' )
+                ++len;
+            return ::Catch::Detail::stringify(std::string( chars, len ));
+        }
+    };
+
+    // Specializations for built-in scalar types. Only the convert()
+    // declarations appear here; the definitions live elsewhere.
+    // std::byte is only covered when CATCH_CONFIG_CPP17_BYTE is defined.
+#if defined(CATCH_CONFIG_CPP17_BYTE)
+    template<>
+    struct StringMaker<std::byte> {
+        static std::string convert(std::byte value);
+    };
+#endif // defined(CATCH_CONFIG_CPP17_BYTE)
+    template<>
+    struct StringMaker<int> {
+        static std::string convert(int value);
+    };
+    template<>
+    struct StringMaker<long> {
+        static std::string convert(long value);
+    };
+    template<>
+    struct StringMaker<long long> {
+        static std::string convert(long long value);
+    };
+    template<>
+    struct StringMaker<unsigned int> {
+        static std::string convert(unsigned int value);
+    };
+    template<>
+    struct StringMaker<unsigned long> {
+        static std::string convert(unsigned long value);
+    };
+    template<>
+    struct StringMaker<unsigned long long> {
+        static std::string convert(unsigned long long value);
+    };
+
+    template<>
+    struct StringMaker<bool> {
+        static std::string convert(bool b);
+    };
+
+    template<>
+    struct StringMaker<char> {
+        static std::string convert(char c);
+    };
+    template<>
+    struct StringMaker<signed char> {
+        static std::string convert(signed char c);
+    };
+    template<>
+    struct StringMaker<unsigned char> {
+        static std::string convert(unsigned char c);
+    };
+
+    template<>
+    struct StringMaker<std::nullptr_t> {
+        static std::string convert(std::nullptr_t);
+    };
+
+    // NOTE(review): `precision` is presumably the number of digits used when
+    // formatting the value - confirm against the definition of convert().
+    template<>
+    struct StringMaker<float> {
+        static std::string convert(float value);
+        static int precision;
+    };
+
+    template<>
+    struct StringMaker<double> {
+        static std::string convert(double value);
+        static int precision;
+    };
+
+    // Object pointers: a null pointer becomes "nullptr", any other pointer is
+    // handed to rawMemoryToString.
+    template <typename T>
+    struct StringMaker<T*> {
+        template <typename U>
+        static std::string convert(U* p) {
+            if (!p) {
+                return "nullptr";
+            }
+            return ::Catch::Detail::rawMemoryToString(p);
+        }
+    };
+
+    // Pointers to members: same treatment as object pointers.
+    template <typename R, typename C>
+    struct StringMaker<R C::*> {
+        static std::string convert(R C::* p) {
+            if (!p) {
+                return "nullptr";
+            }
+            return ::Catch::Detail::rawMemoryToString(p);
+        }
+    };
+
+    // C++/CLI only (_MANAGED): handles are stringified through
+    // Detail::clrReferenceToString.
+#if defined(_MANAGED)
+    template <typename T>
+    struct StringMaker<T^> {
+        static std::string convert( T^ ref ) {
+            return ::Catch::Detail::clrReferenceToString(ref);
+        }
+    };
+#endif
+
+    namespace Detail {
+        // Renders the range [first, last) as "{ a, b, c }"; every element goes
+        // through stringify. An empty range yields "{  }".
+        template<typename InputIterator, typename Sentinel = InputIterator>
+        std::string rangeToString(InputIterator first, Sentinel last) {
+            ReusableStringStream rss;
+            rss << "{ ";
+            // False only for the first element, which takes no leading ", ".
+            bool needsSeparator = false;
+            for (; first != last; ++first) {
+                if (needsSeparator) {
+                    rss << ", ";
+                }
+                rss << ::Catch::Detail::stringify(*first);
+                needsSeparator = true;
+            }
+            rss << " }";
+            return rss.str();
+        }
+    }
+
+    // Objective-C only: stringification of NSString and NSObject pointers.
+#ifdef __OBJC__
+    // nil becomes "nil"; any other string is rendered as "@" followed by its
+    // UTF-8 contents.
+    template<>
+    struct StringMaker<NSString*> {
+        static std::string convert(NSString * nsstring) {
+            if (!nsstring)
+                return "nil";
+            return std::string("@") + [nsstring UTF8String];
+        }
+    };
+    // Any other object is rendered by stringifying its -description.
+    template<>
+    struct StringMaker<NSObject*> {
+        static std::string convert(NSObject* nsObject) {
+            return ::Catch::Detail::stringify([nsObject description]);
+        }
+
+    };
+    namespace Detail {
+        // Non-template overload that sends NSString* straight to its StringMaker.
+        inline std::string stringify( NSString* nsstring ) {
+            return StringMaker<NSString*>::convert( nsstring );
+        }
+
+    } // namespace Detail
+#endif // __OBJC__
+
+} // namespace Catch
+
+//////////////////////////////////////////////////////
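+// Separate std-lib types stringification, so it can be selectively enabled.
+// This means that we do not bring in the corresponding standard headers
+// (<utility>, <optional>, <tuple>, ...) unless the matching macro is defined.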
+
+// Umbrella switch: defining CATCH_CONFIG_ENABLE_ALL_STRINGMAKERS turns on each
+// of the individual std-lib StringMaker switches listed below.
+#if defined(CATCH_CONFIG_ENABLE_ALL_STRINGMAKERS)
+#  define CATCH_CONFIG_ENABLE_PAIR_STRINGMAKER
+#  define CATCH_CONFIG_ENABLE_TUPLE_STRINGMAKER
+#  define CATCH_CONFIG_ENABLE_VARIANT_STRINGMAKER
+#  define CATCH_CONFIG_ENABLE_CHRONO_STRINGMAKER
+#  define CATCH_CONFIG_ENABLE_OPTIONAL_STRINGMAKER
+#endif
+
+// Separate std::pair specialization
+#if defined(CATCH_CONFIG_ENABLE_PAIR_STRINGMAKER)
+#include <utility>
+namespace Catch {
+    // std::pair is rendered as "{ first, second }", with both members going
+    // through stringify.
+    template<typename T1, typename T2>
+    struct StringMaker<std::pair<T1, T2> > {
+        static std::string convert(const std::pair<T1, T2>& pair) {
+            ReusableStringStream rss;
+            rss << "{ ";
+            rss << ::Catch::Detail::stringify(pair.first);
+            rss << ", ";
+            rss << ::Catch::Detail::stringify(pair.second);
+            rss << " }";
+            return rss.str();
+        }
+    };
+}
+#endif // CATCH_CONFIG_ENABLE_PAIR_STRINGMAKER
+
+#if defined(CATCH_CONFIG_ENABLE_OPTIONAL_STRINGMAKER) && defined(CATCH_CONFIG_CPP17_OPTIONAL)
+#include <optional>
+namespace Catch {
+    template<typename T>
+    struct StringMaker<std::optional<T> > { // engaged: prints the held value; empty: prints "{ }"
+        static std::string convert(const std::optional<T>& optional) {
+            ReusableStringStream rss;
+            if (!optional.has_value()) {
+                rss << "{ }";
+                return rss.str();
+            }
+            rss << ::Catch::Detail::stringify(*optional);
+            return rss.str();
+        }
+    };
+}
+#endif // CATCH_CONFIG_ENABLE_OPTIONAL_STRINGMAKER
+
+// Separate std::tuple specialization
+#if defined(CATCH_CONFIG_ENABLE_TUPLE_STRINGMAKER)
+#include <tuple>
+namespace Catch {
+    namespace Detail {
+        template<
+            typename Tuple,
+            std::size_t N = 0,
+            bool = (N < std::tuple_size<Tuple>::value) // true while element N exists; false selects the no-op specialization
+            >
+            struct TupleElementPrinter {
+            static void print(const Tuple& tuple, std::ostream& os) {
+                os << (N ? ", " : " ") // the first element follows a space, later ones a comma
+                    << ::Catch::Detail::stringify(std::get<N>(tuple));
+                TupleElementPrinter<Tuple, N + 1>::print(tuple, os);
+            }
+        };
+        // End of the recursion: N has run past the last element, so nothing is printed.
+        template<
+            typename Tuple,
+            std::size_t N
+        >
+            struct TupleElementPrinter<Tuple, N, false> {
+            static void print(const Tuple&, std::ostream&) {}
+        };
+
+    }
+    // Stringifies a std::tuple as "{ a, b, c }"; an empty tuple comes out as "{ }".
+    template<typename ...Types>
+    struct StringMaker<std::tuple<Types...>> {
+        static std::string convert(const std::tuple<Types...>& tuple) {
+            ReusableStringStream rss;
+            rss << '{';
+            Detail::TupleElementPrinter<std::tuple<Types...>>::print(tuple, rss.get());
+            rss << " }";
+            return rss.str();
+        }
+    };
+}
+#endif // CATCH_CONFIG_ENABLE_TUPLE_STRINGMAKER
+
+#if defined(CATCH_CONFIG_ENABLE_VARIANT_STRINGMAKER) && defined(CATCH_CONFIG_CPP17_VARIANT)
+#include <variant>
+namespace Catch {
+    // std::monostate carries no data, so it always prints as "{ }".
+    template<>
+    struct StringMaker<std::monostate> {
+        static std::string convert(const std::monostate&) {
+            return "{ }";
+        }
+    };
+
+    // A variant prints as whichever alternative it currently holds.
+    template<typename... Elements>
+    struct StringMaker<std::variant<Elements...>> {
+        static std::string convert(const std::variant<Elements...>& variant) {
+            // std::visit throws on a valueless variant, so that state gets a fixed label instead.
+            if (variant.valueless_by_exception()) {
+                return "{valueless variant}";
+            }
+            auto const stringifyAlternative = [](const auto& value) {
+                return ::Catch::Detail::stringify(value);
+            };
+            return std::visit(stringifyAlternative, variant);
+        }
+    };
+}
+#endif // CATCH_CONFIG_ENABLE_VARIANT_STRINGMAKER
+
+namespace Catch {
+    // Import begin/ end from std here
+    using std::begin;
+    using std::end;
+
+    namespace detail {
+        // Collapses any pack of well-formed types to `void`; it exists only as a SFINAE hook.
+        template <typename...>
+        struct void_type { typedef void type; };
+
+        // Primary template: used when `begin(std::declval<T>())` is not a valid expression.
+        template <typename T, typename = void>
+        struct is_range_impl : std::false_type {};
+
+        // Preferred whenever an unqualified `begin` call on a T compiles.
+        template <typename T>
+        struct is_range_impl<T, typename void_type<decltype(begin(std::declval<T>()))>::type> : std::true_type {};
+    } // namespace detail
+
+    // Public trait: by default is_range<T>::value is true exactly when an
+    // unqualified begin() call accepts a T, and false otherwise.
+    template <typename T>
+    struct is_range : detail::is_range_impl<T> {};
+
+#if defined(_MANAGED) // Managed types are never ranges
+    template <typename T>
+    struct is_range<T^> { // C++/CLI handle types bypass the begin()-based detection entirely
+        static const bool value = false;
+    };
+#endif
+
+    // Stringifies any range by handing its begin()/end() pair to the iterator-based overload.
+    template<typename Range>
+    std::string rangeToString( Range const& range ) {
+        return ::Catch::Detail::rangeToString( begin( range ), end( range ) );
+    }
+
+    // vector<bool> gets a dedicated overload that stringifies each element as a plain `bool`.
+    template<typename Allocator>
+    std::string rangeToString( std::vector<bool, Allocator> const& v ) {
+        ReusableStringStream rss;
+        rss << "{ ";
+        // Empty until the first element has been written, so the list never starts with a comma.
+        char const* separator = "";
+        for( bool element : v ) {
+            rss << separator << ::Catch::Detail::stringify( element );
+            separator = ", ";
+        }
+        rss << " }";
+        return rss.str();
+    }
+
+    template<typename R> // applies to types that satisfy is_range but not Detail::IsStreamInsertable
+    struct StringMaker<R, typename std::enable_if<is_range<R>::value && !::Catch::Detail::IsStreamInsertable<R>::value>::type> {
+        static std::string convert( R const& range ) {
+            return rangeToString( range );
+        }
+    };
+    // Built-in arrays are forwarded to rangeToString as well.
+    template <typename T, int SZ>
+    struct StringMaker<T[SZ]> {
+        static std::string convert(T const(&arr)[SZ]) {
+            return rangeToString(arr);
+        }
+    };
+
+} // namespace Catch
+
+// Separate std::chrono::duration specialization
+#if defined(CATCH_CONFIG_ENABLE_CHRONO_STRINGMAKER)
+#include <ctime>
+#include <ratio>
+#include <chrono>
+
+namespace Catch {
+
+// Maps a std::ratio to the unit prefix used when printing durations.
+// The generic version has no named prefix to offer, so it spells the ratio out as "[num/den]".
+template <class Ratio>
+struct ratio_string {
+    static std::string symbol() {
+        Catch::ReusableStringStream rss;
+        rss << '[' << Ratio::num << '/' << Ratio::den << ']';
+        return rss.str();
+    }
+};
+
+// Dedicated symbol() overrides for the sub-second SI ratios; only their declarations appear here.
+template <>
+struct ratio_string<std::atto> {
+    static std::string symbol();
+};
+template <>
+struct ratio_string<std::femto> {
+    static std::string symbol();
+};
+template <>
+struct ratio_string<std::pico> {
+    static std::string symbol();
+};
+template <>
+struct ratio_string<std::nano> {
+    static std::string symbol();
+};
+template <>
+struct ratio_string<std::micro> {
+    static std::string symbol();
+};
+template <>
+struct ratio_string<std::milli> {
+    static std::string symbol();
+};
+
+    ////////////
+    // std::chrono::duration specializations: the tick count followed by a unit suffix
+    template<typename Value, typename Ratio>
+    struct StringMaker<std::chrono::duration<Value, Ratio>> {
+        static std::string convert(std::chrono::duration<Value, Ratio> const& duration) {
+            ReusableStringStream rss;
+            rss << duration.count() << ' ' << ratio_string<Ratio>::symbol() << 's'; // "<count> <ratio symbol>s"
+            return rss.str();
+        }
+    };
+    template<typename Value>
+    struct StringMaker<std::chrono::duration<Value, std::ratio<1>>> {
+        static std::string convert(std::chrono::duration<Value, std::ratio<1>> const& duration) {
+            ReusableStringStream rss;
+            rss << duration.count() << " s"; // period of 1 second
+            return rss.str();
+        }
+    };
+    template<typename Value>
+    struct StringMaker<std::chrono::duration<Value, std::ratio<60>>> {
+        static std::string convert(std::chrono::duration<Value, std::ratio<60>> const& duration) {
+            ReusableStringStream rss;
+            rss << duration.count() << " m"; // period of 60 seconds (minutes)
+            return rss.str();
+        }
+    };
+    template<typename Value>
+    struct StringMaker<std::chrono::duration<Value, std::ratio<3600>>> {
+        static std::string convert(std::chrono::duration<Value, std::ratio<3600>> const& duration) {
+            ReusableStringStream rss;
+            rss << duration.count() << " h"; // period of 3600 seconds (hours)
+            return rss.str();
+        }
+    };
+
+    ////////////
+    // std::chrono::time_point specialization
+    // Time points of arbitrary clocks print as "<duration> since epoch"; only system_clock ones get a calendar timestamp
+    template<typename Clock, typename Duration>
+    struct StringMaker<std::chrono::time_point<Clock, Duration>> {
+        static std::string convert(std::chrono::time_point<Clock, Duration> const& time_point) {
+            return ::Catch::Detail::stringify(time_point.time_since_epoch()) + " since epoch";
+        }
+    };
+    // std::chrono::time_point<system_clock> specialization: formats the time point as an
+    // ISO 8601 UTC timestamp ("YYYY-MM-DDThh:mm:ssZ"), or returns a diagnostic if the conversion fails.
+    template<typename Duration>
+    struct StringMaker<std::chrono::time_point<std::chrono::system_clock, Duration>> {
+        static std::string convert(std::chrono::time_point<std::chrono::system_clock, Duration> const& time_point) {
+            auto converted = std::chrono::system_clock::to_time_t(time_point);
+
+            // Both conversions can fail (e.g. pre-1970 dates with Microsoft's libc); detect that here
+            // rather than handing strftime a null pointer or a zeroed, invalid std::tm.
+#ifdef _MSC_VER
+            std::tm timeInfoStorage = {};
+            std::tm const* timeInfo = gmtime_s(&timeInfoStorage, &converted) == 0 ? &timeInfoStorage : nullptr;
+#else
+            std::tm const* timeInfo = std::gmtime(&converted);
+#endif
+            if (!timeInfo) {
+                return "gmtime from provided timepoint has failed";
+            }
+
+            auto const timeStampSize = sizeof("2017-01-16T17:06:45Z");
+            char timeStamp[timeStampSize];
+            std::strftime(timeStamp, timeStampSize, "%Y-%m-%dT%H:%M:%SZ", timeInfo);
+            return std::string(timeStamp);
+        }
+    };
+}
+#endif // CATCH_CONFIG_ENABLE_CHRONO_STRINGMAKER
+// Defines StringMaker<enumName>: the first convert() call registers the enum's name and enumerators, then looks values up as int.
+#define INTERNAL_CATCH_REGISTER_ENUM( enumName, ... ) \
+namespace Catch { \
+    template<> struct StringMaker<enumName> { \
+        static std::string convert( enumName value ) { \
+            static const auto& enumInfo = ::Catch::getMutableRegistryHub().getMutableEnumValuesRegistry().registerEnum( #enumName, #__VA_ARGS__, { __VA_ARGS__ } ); \
+            return static_cast<std::string>(enumInfo.lookup( static_cast<int>( value ) )); \
+        } \
+    }; \
+}
+// Public spelling; forwards its arguments unchanged to INTERNAL_CATCH_REGISTER_ENUM.
+#define CATCH_REGISTER_ENUM( enumName, ... ) INTERNAL_CATCH_REGISTER_ENUM( enumName, __VA_ARGS__ )
+
+#ifdef _MSC_VER
+#pragma warning(pop)
+#endif
+
+// end catch_tostring.h
+#include <iosfwd>
+
+#ifdef _MSC_VER
+#pragma warning(push)
+#pragma warning(disable:4389) // '==' : signed/unsigned mismatch
+#pragma warning(disable:4018) // more "signed/unsigned mismatch"
+#pragma warning(disable:4312) // Converting int to T* using reinterpret_cast (issue on x64 platform)
+#pragma warning(disable:4180) // qualifier applied to function type has no meaning
+#pragma warning(disable:4800) // Forcing result to true or false
+#endif
+
+namespace Catch {
+
+    struct ITransientExpression { // common base of BinaryExpr and UnaryExpr
+        auto isBinaryExpression() const -> bool { return m_isBinaryExpression; }
+        auto getResult() const -> bool { return m_result; }
+        virtual void streamReconstructedExpression( std::ostream &os ) const = 0; // writes the expression with stringified operands
+
+        ITransientExpression( bool isBinaryExpression, bool result )
+        :   m_isBinaryExpression( isBinaryExpression ),
+            m_result( result )
+        {}
+
+        // We don't actually need a virtual destructor, but many static analysers
+        // complain if it's not here :-(
+        virtual ~ITransientExpression();
+
+        bool m_isBinaryExpression; // BinaryExpr passes true, UnaryExpr passes false
+        bool m_result;             // truth value of the expression, computed by the derived class's caller/constructor
+
+    };
+
+    void formatReconstructedExpression( std::ostream &os, std::string const& lhs, StringRef op, std::string const& rhs );
+
+    template<typename LhsT, typename RhsT>
+    class BinaryExpr  : public ITransientExpression { // an already-evaluated `lhs op rhs` plus what is needed to print it
+        LhsT m_lhs;
+        StringRef m_op;
+        RhsT m_rhs;
+
+        void streamReconstructedExpression( std::ostream &os ) const override {
+            formatReconstructedExpression
+                    ( os, Catch::Detail::stringify( m_lhs ), m_op, Catch::Detail::stringify( m_rhs ) );
+        }
+
+    public:
+        BinaryExpr( bool comparisonResult, LhsT lhs, StringRef op, RhsT rhs )
+        :   ITransientExpression{ true, comparisonResult },
+            m_lhs( lhs ),
+            m_op( op ),
+            m_rhs( rhs )
+        {}
+        // Every operator below is a compile-time trap: instantiating one triggers its static_assert message.
+        template<typename T>
+        auto operator && ( T ) const -> BinaryExpr<LhsT, RhsT const&> const {
+            static_assert(always_false<T>::value,
+            "chained comparisons are not supported inside assertions, "
+            "wrap the expression inside parentheses, or decompose it");
+        }
+
+        template<typename T>
+        auto operator || ( T ) const -> BinaryExpr<LhsT, RhsT const&> const {
+            static_assert(always_false<T>::value,
+            "chained comparisons are not supported inside assertions, "
+            "wrap the expression inside parentheses, or decompose it");
+        }
+
+        template<typename T>
+        auto operator == ( T ) const -> BinaryExpr<LhsT, RhsT const&> const {
+            static_assert(always_false<T>::value,
+            "chained comparisons are not supported inside assertions, "
+            "wrap the expression inside parentheses, or decompose it");
+        }
+
+        template<typename T>
+        auto operator != ( T ) const -> BinaryExpr<LhsT, RhsT const&> const {
+            static_assert(always_false<T>::value,
+            "chained comparisons are not supported inside assertions, "
+            "wrap the expression inside parentheses, or decompose it");
+        }
+
+        template<typename T>
+        auto operator > ( T ) const -> BinaryExpr<LhsT, RhsT const&> const {
+            static_assert(always_false<T>::value,
+            "chained comparisons are not supported inside assertions, "
+            "wrap the expression inside parentheses, or decompose it");
+        }
+
+        template<typename T>
+        auto operator < ( T ) const -> BinaryExpr<LhsT, RhsT const&> const {
+            static_assert(always_false<T>::value,
+            "chained comparisons are not supported inside assertions, "
+            "wrap the expression inside parentheses, or decompose it");
+        }
+
+        template<typename T>
+        auto operator >= ( T ) const -> BinaryExpr<LhsT, RhsT const&> const {
+            static_assert(always_false<T>::value,
+            "chained comparisons are not supported inside assertions, "
+            "wrap the expression inside parentheses, or decompose it");
+        }
+
+        template<typename T>
+        auto operator <= ( T ) const -> BinaryExpr<LhsT, RhsT const&> const {
+            static_assert(always_false<T>::value,
+            "chained comparisons are not supported inside assertions, "
+            "wrap the expression inside parentheses, or decompose it");
+        }
+    };
+
+    template<typename LhsT>
+    class UnaryExpr : public ITransientExpression {
+        LhsT m_lhs;
+        // Reconstructs the expression as just the stringified operand.
+        void streamReconstructedExpression( std::ostream &os ) const override {
+            os << Catch::Detail::stringify( m_lhs );
+        }
+
+    public:
+        explicit UnaryExpr( LhsT lhs )
+        :   ITransientExpression{ false, static_cast<bool>(lhs) }, // not binary; the result is the operand's truth value
+            m_lhs( lhs )
+        {}
+    };
+
+    // Specialised comparison functions to handle equality comparisons between ints and pointers (NULL deduces as an int)
+    template<typename LhsT, typename RhsT>
+    auto compareEqual( LhsT const& lhs, RhsT const& rhs ) -> bool { return static_cast<bool>(lhs == rhs); }
+    template<typename T>
+    auto compareEqual( T* const& lhs, int rhs ) -> bool { return lhs == reinterpret_cast<void const*>( rhs ); }
+    template<typename T>
+    auto compareEqual( T* const& lhs, long rhs ) -> bool { return lhs == reinterpret_cast<void const*>( rhs ); }
+    template<typename T>
+    auto compareEqual( int lhs, T* const& rhs ) -> bool { return reinterpret_cast<void const*>( lhs ) == rhs; }
+    template<typename T>
+    auto compareEqual( long lhs, T* const& rhs ) -> bool { return reinterpret_cast<void const*>( lhs ) == rhs; }
+    // Inequality counterparts; the int/long overloads likewise reinterpret the integer as a pointer before comparing.
+    template<typename LhsT, typename RhsT> // NOTE(review): rhs is RhsT&& here but RhsT const& in compareEqual — confirm intended
+    auto compareNotEqual( LhsT const& lhs, RhsT&& rhs ) -> bool { return static_cast<bool>(lhs != rhs); }
+    template<typename T>
+    auto compareNotEqual( T* const& lhs, int rhs ) -> bool { return lhs != reinterpret_cast<void const*>( rhs ); }
+    template<typename T>
+    auto compareNotEqual( T* const& lhs, long rhs ) -> bool { return lhs != reinterpret_cast<void const*>( rhs ); }
+    template<typename T>
+    auto compareNotEqual( int lhs, T* const& rhs ) -> bool { return reinterpret_cast<void const*>( lhs ) != rhs; }
+    template<typename T>
+    auto compareNotEqual( long lhs, T* const& rhs ) -> bool { return reinterpret_cast<void const*>( lhs ) != rhs; }
+
+    template<typename LhsT>
+    class ExprLhs { // holds the left operand of an assertion expression until an operator (or none) is applied
+        LhsT m_lhs;
+    public:
+        explicit ExprLhs( LhsT lhs ) : m_lhs( lhs ) {}
+        // Each operator evaluates immediately and packages the result, both operands and the operator text.
+        template<typename RhsT>
+        auto operator == ( RhsT const& rhs ) -> BinaryExpr<LhsT, RhsT const&> const {
+            return { compareEqual( m_lhs, rhs ), m_lhs, "==", rhs };
+        }
+        auto operator == ( bool rhs ) -> BinaryExpr<LhsT, bool> const {
+            return { m_lhs == rhs, m_lhs, "==", rhs };
+        }
+
+        template<typename RhsT>
+        auto operator != ( RhsT const& rhs ) -> BinaryExpr<LhsT, RhsT const&> const {
+            return { compareNotEqual( m_lhs, rhs ), m_lhs, "!=", rhs };
+        }
+        auto operator != ( bool rhs ) -> BinaryExpr<LhsT, bool> const {
+            return { m_lhs != rhs, m_lhs, "!=", rhs };
+        }
+
+        template<typename RhsT>
+        auto operator > ( RhsT const& rhs ) -> BinaryExpr<LhsT, RhsT const&> const {
+            return { static_cast<bool>(m_lhs > rhs), m_lhs, ">", rhs };
+        }
+        template<typename RhsT>
+        auto operator < ( RhsT const& rhs ) -> BinaryExpr<LhsT, RhsT const&> const {
+            return { static_cast<bool>(m_lhs < rhs), m_lhs, "<", rhs };
+        }
+        template<typename RhsT>
+        auto operator >= ( RhsT const& rhs ) -> BinaryExpr<LhsT, RhsT const&> const {
+            return { static_cast<bool>(m_lhs >= rhs), m_lhs, ">=", rhs };
+        }
+        template<typename RhsT>
+        auto operator <= ( RhsT const& rhs ) -> BinaryExpr<LhsT, RhsT const&> const {
+            return { static_cast<bool>(m_lhs <= rhs), m_lhs, "<=", rhs };
+        }
+        template <typename RhsT>
+        auto operator | (RhsT const& rhs) -> BinaryExpr<LhsT, RhsT const&> const {
+            return { static_cast<bool>(m_lhs | rhs), m_lhs, "|", rhs };
+        }
+        template <typename RhsT>
+        auto operator & (RhsT const& rhs) -> BinaryExpr<LhsT, RhsT const&> const {
+            return { static_cast<bool>(m_lhs & rhs), m_lhs, "&", rhs };
+        }
+        template <typename RhsT>
+        auto operator ^ (RhsT const& rhs) -> BinaryExpr<LhsT, RhsT const&> const {
+            return { static_cast<bool>(m_lhs ^ rhs), m_lhs, "^", rhs };
+        }
+        // Logical operators are rejected: instantiating either one triggers its static_assert message.
+        template<typename RhsT>
+        auto operator && ( RhsT const& ) -> BinaryExpr<LhsT, RhsT const&> const {
+            static_assert(always_false<RhsT>::value,
+            "operator&& is not supported inside assertions, "
+            "wrap the expression inside parentheses, or decompose it");
+        }
+
+        template<typename RhsT>
+        auto operator || ( RhsT const& ) -> BinaryExpr<LhsT, RhsT const&> const {
+            static_assert(always_false<RhsT>::value,
+            "operator|| is not supported inside assertions, "
+            "wrap the expression inside parentheses, or decompose it");
+        }
+        // For expressions without a binary operator: wraps the lone operand.
+        auto makeUnaryExpr() const -> UnaryExpr<LhsT> {
+            return UnaryExpr<LhsT>{ m_lhs };
+        }
+    };
+
+    void handleExpression( ITransientExpression const& expr );
+    // An ExprLhs to which no operator was applied is handled as a unary expression.
+    template<typename T>
+    void handleExpression( ExprLhs<T> const& expr ) {
+        handleExpression( expr.makeUnaryExpr() );
+    }
+
+    struct Decomposer { // `Decomposer() <= expr` wraps the left-hand operand of expr in an ExprLhs
+        template<typename T>
+        auto operator <= ( T const& lhs ) -> ExprLhs<T const&> {
+            return ExprLhs<T const&>{ lhs };
+        }
+        // bool operands are stored by value instead of by reference.
+        auto operator <=( bool value ) -> ExprLhs<bool> {
+            return ExprLhs<bool>{ value };
+        }
+    };
+
+} // end namespace Catch
+
+#ifdef _MSC_VER
+#pragma warning(pop)
+#endif
+
+// end catch_decomposer.h
+// start catch_interfaces_capture.h
+
+#include <string>
+#include <chrono>
+
+namespace Catch {
+
+    class AssertionResult;
+    struct AssertionInfo;
+    struct SectionInfo;
+    struct SectionEndInfo;
+    struct MessageInfo;
+    struct MessageBuilder;
+    struct Counts;
+    struct AssertionReaction;
+    struct SourceLineInfo;
+
+    struct ITransientExpression;
+    struct IGeneratorTracker;
+
+#if defined(CATCH_CONFIG_ENABLE_BENCHMARKING)
+    struct BenchmarkInfo;
+    template <typename Duration = std::chrono::duration<double, std::nano>>
+    struct BenchmarkStats;
+#endif // CATCH_CONFIG_ENABLE_BENCHMARKING
+
+    struct IResultCapture {
+        // Pure interface: every member function below the destructor is declared `= 0`.
+        virtual ~IResultCapture();
+
+        virtual bool sectionStarted(    SectionInfo const& sectionInfo,
+                                        Counts& assertions ) = 0;
+        virtual void sectionEnded( SectionEndInfo const& endInfo ) = 0;
+        virtual void sectionEndedEarly( SectionEndInfo const& endInfo ) = 0;
+
+        virtual auto acquireGeneratorTracker( StringRef generatorName, SourceLineInfo const& lineInfo ) -> IGeneratorTracker& = 0;
+
+#if defined(CATCH_CONFIG_ENABLE_BENCHMARKING)
+        virtual void benchmarkPreparing( std::string const& name ) = 0;
+        virtual void benchmarkStarting( BenchmarkInfo const& info ) = 0;
+        virtual void benchmarkEnded( BenchmarkStats<> const& stats ) = 0;
+        virtual void benchmarkFailed( std::string const& error ) = 0;
+#endif // CATCH_CONFIG_ENABLE_BENCHMARKING
+
+        virtual void pushScopedMessage( MessageInfo const& message ) = 0;
+        virtual void popScopedMessage( MessageInfo const& message ) = 0;
+
+        virtual void emplaceUnscopedMessage( MessageBuilder const& builder ) = 0;
+
+        virtual void handleFatalErrorCondition( StringRef message ) = 0;
+
+        virtual void handleExpr
+                (   AssertionInfo const& info,
+                    ITransientExpression const& expr,
+                    AssertionReaction& reaction ) = 0;
+        virtual void handleMessage
+                (   AssertionInfo const& info,
+                    ResultWas::OfType resultType,
+                    StringRef const& message,
+                    AssertionReaction& reaction ) = 0;
+        virtual void handleUnexpectedExceptionNotThrown
+                (   AssertionInfo const& info,
+                    AssertionReaction& reaction ) = 0;
+        virtual void handleUnexpectedInflightException
+                (   AssertionInfo const& info,
+                    std::string const& message,
+                    AssertionReaction& reaction ) = 0;
+        virtual void handleIncomplete
+                (   AssertionInfo const& info ) = 0;
+        virtual void handleNonExpr
+                (   AssertionInfo const &info,
+                    ResultWas::OfType resultType,
+                    AssertionReaction &reaction ) = 0;
+
+        virtual bool lastAssertionPassed() = 0;
+        virtual void assertionPassed() = 0;
+
+        // Deprecated, do not use:
+        virtual std::string getCurrentTestName() const = 0;
+        virtual const AssertionResult* getLastResult() const = 0;
+        virtual void exceptionEarlyReported() = 0;
+    };
+
+    IResultCapture& getResultCapture();
+}
+
+// end catch_interfaces_capture.h
+namespace Catch {
+
+    struct TestFailureException{};
+    struct AssertionResultData;
+    struct IResultCapture;
+    class RunContext;
+
+    class LazyExpression {
+        friend class AssertionHandler;
+        friend struct AssertionStats;
+        friend class RunContext;
+
+        ITransientExpression const* m_transientExpression = nullptr; // starts out null; only the friends above can set it
+        bool m_isNegated;
+    public:
+        LazyExpression( bool isNegated );
+        LazyExpression( LazyExpression const& other );
+        LazyExpression& operator = ( LazyExpression const& ) = delete; // copy-constructible but not copy-assignable
+
+        explicit operator bool() const;
+
+        friend auto operator << ( std::ostream& os, LazyExpression const& lazyExpr ) -> std::ostream&;
+    };
+
+    struct AssertionReaction { // out-parameter of the IResultCapture::handle* calls; both flags default to false
+        bool shouldDebugBreak = false;
+        bool shouldThrow = false;
+    };
+
+    class AssertionHandler {
+        AssertionInfo m_assertionInfo;
+        AssertionReaction m_reaction;
+        bool m_completed = false; // checked by the destructor
+        IResultCapture& m_resultCapture;
+
+    public:
+        AssertionHandler
+            (   StringRef const& macroName,
+                SourceLineInfo const& lineInfo,
+                StringRef capturedExpression,
+                ResultDisposition::Flags resultDisposition );
+        ~AssertionHandler() { // reports the assertion as incomplete if m_completed is still false
+            if ( !m_completed ) {
+                m_resultCapture.handleIncomplete( m_assertionInfo );
+            }
+        }
+        // An ExprLhs to which no operator was applied is handled as a unary expression.
+        template<typename T>
+        void handleExpr( ExprLhs<T> const& expr ) {
+            handleExpr( expr.makeUnaryExpr() );
+        }
+        void handleExpr( ITransientExpression const& expr );
+
+        void handleMessage(ResultWas::OfType resultType, StringRef const& message);
+
+        void handleExceptionThrownAsExpected();
+        void handleUnexpectedExceptionNotThrown();
+        void handleExceptionNotThrownAsExpected();
+        void handleThrowingCallSkipped();
+        void handleUnexpectedInflightException();
+
+        void complete();
+        void setCompleted();
+
+        // query
+        auto allowThrows() const -> bool;
+    };
+
+    void handleExceptionMatchExpr( AssertionHandler& handler, std::string const& str, StringRef const& matcherString );
+
+} // namespace Catch
+
+// end catch_assertionhandler.h
+// start catch_message.h
+
+#include <string>
+#include <vector>
+
+namespace Catch {
+
+    struct MessageInfo {
+        MessageInfo(    StringRef const& _macroName,
+                        SourceLineInfo const& _lineInfo,
+                        ResultWas::OfType _type );
+
+        StringRef macroName;
+        std::string message; // not a constructor argument; filled in after construction
+        SourceLineInfo lineInfo;
+        ResultWas::OfType type;
+        unsigned int sequence; // NOTE(review): presumably taken from globalCount in the constructor — confirm
+
+        bool operator == ( MessageInfo const& other ) const;
+        bool operator < ( MessageInfo const& other ) const;
+    private:
+        static unsigned int globalCount;
+    };
+
+    struct MessageStream {
+        // Appends value to m_stream and returns *this so that insertions can be chained.
+        template<typename T>
+        MessageStream& operator << ( T const& value ) {
+            m_stream << value;
+            return *this;
+        }
+
+        ReusableStringStream m_stream;
+    };
+
+    struct MessageBuilder : MessageStream {
+        MessageBuilder( StringRef const& macroName,
+                        SourceLineInfo const& lineInfo,
+                        ResultWas::OfType type );
+        // Hides the base-class operator so that chained insertions keep the MessageBuilder type.
+        template<typename T>
+        MessageBuilder& operator << ( T const& value ) {
+            m_stream << value;
+            return *this;
+        }
+
+        MessageInfo m_info;
+    };
+
+    class ScopedMessage {
+    public:
+        explicit ScopedMessage( MessageBuilder const& builder );
+        ScopedMessage( ScopedMessage& duplicate ) = delete; // copying from a non-const lvalue is disallowed
+        ScopedMessage( ScopedMessage&& old );
+        ~ScopedMessage();
+
+        MessageInfo m_info;
+        bool m_moved; // NOTE(review): presumably set on the moved-from object so its destructor skips cleanup — confirm
+    };
+
+    class Capturer {
+        std::vector<MessageInfo> m_messages;
+        IResultCapture& m_resultCapture = getResultCapture();
+        size_t m_captured = 0;
+    public:
+        Capturer( StringRef macroName, SourceLineInfo const& lineInfo, ResultWas::OfType resultType, StringRef names );
+        ~Capturer();
+
+        void captureValue( size_t index, std::string const& value );
+        // Single-value case, which also ends the variadic recursion below.
+        template<typename T>
+        void captureValues( size_t index, T const& value ) {
+            captureValue( index, Catch::Detail::stringify( value ) );
+        }
+        // Captures the stringified value at index, then recurses on the remaining values with index + 1.
+        template<typename T, typename... Ts>
+        void captureValues( size_t index, T const& value, Ts const&... values ) {
+            captureValue( index, Catch::Detail::stringify(value) );
+            captureValues( index+1, values... );
+        }
+    };
+
+} // end namespace Catch
+
+// end catch_message.h
+#if !defined(CATCH_CONFIG_DISABLE)
+
+#if !defined(CATCH_CONFIG_DISABLE_STRINGIFICATION)
+  #define CATCH_INTERNAL_STRINGIFY(...) #__VA_ARGS__
+#else
+  #define CATCH_INTERNAL_STRINGIFY(...) "Disabled by CATCH_CONFIG_DISABLE_STRINGIFICATION"
+#endif
+
+#if defined(CATCH_CONFIG_FAST_COMPILE) || defined(CATCH_CONFIG_DISABLE_EXCEPTIONS)
+
+///////////////////////////////////////////////////////////////////////////////
+// Another way to speed-up compilation is to omit local try-catch for REQUIRE*
+// macros.
+#define INTERNAL_CATCH_TRY
+#define INTERNAL_CATCH_CATCH( capturer )
+
+#else // CATCH_CONFIG_FAST_COMPILE
+
+#define INTERNAL_CATCH_TRY try
+#define INTERNAL_CATCH_CATCH( handler ) catch(...) { handler.handleUnexpectedInflightException(); }
+
+#endif
+
+#define INTERNAL_CATCH_REACT( handler ) handler.complete();
+///////////////////////////////////////////////////////////////////////////////
+// Hands `Catch::Decomposer() <= expr` to a local AssertionHandler and completes it; the `while` condition is always false.
+#define INTERNAL_CATCH_TEST( macroName, resultDisposition, ... ) \
+    do { \
+        CATCH_INTERNAL_IGNORE_BUT_WARN(__VA_ARGS__); \
+        Catch::AssertionHandler catchAssertionHandler( macroName##_catch_sr, CATCH_INTERNAL_LINEINFO, CATCH_INTERNAL_STRINGIFY(__VA_ARGS__), resultDisposition ); \
+        INTERNAL_CATCH_TRY { \
+            CATCH_INTERNAL_START_WARNINGS_SUPPRESSION \
+            CATCH_INTERNAL_SUPPRESS_PARENTHESES_WARNINGS \
+            catchAssertionHandler.handleExpr( Catch::Decomposer() <= __VA_ARGS__ ); \
+            CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION \
+        } INTERNAL_CATCH_CATCH( catchAssertionHandler ) \
+        INTERNAL_CATCH_REACT( catchAssertionHandler ) \
+    } while( (void)0, (false) && static_cast<bool>( !!(__VA_ARGS__) ) )
+///////////////////////////////////////////////////////////////////////////////
+// Runs the assertion, then opens an `if` whose body executes only when that assertion passed.
+#define INTERNAL_CATCH_IF( macroName, resultDisposition, ... ) \
+    INTERNAL_CATCH_TEST( macroName, resultDisposition, __VA_ARGS__ ); \
+    if( Catch::getResultCapture().lastAssertionPassed() )
+///////////////////////////////////////////////////////////////////////////////
+// Runs the assertion, then opens an `if` whose body executes only when that assertion did not pass.
+#define INTERNAL_CATCH_ELSE( macroName, resultDisposition, ... ) \
+    INTERNAL_CATCH_TEST( macroName, resultDisposition, __VA_ARGS__ ); \
+    if( !Catch::getResultCapture().lastAssertionPassed() )
+///////////////////////////////////////////////////////////////////////////////
+// Evaluates the expression in a try block: no exception is the expected outcome, any exception is reported as unexpected.
+#define INTERNAL_CATCH_NO_THROW( macroName, resultDisposition, ... ) \
+    do { \
+        Catch::AssertionHandler catchAssertionHandler( macroName##_catch_sr, CATCH_INTERNAL_LINEINFO, CATCH_INTERNAL_STRINGIFY(__VA_ARGS__), resultDisposition ); \
+        try { \
+            static_cast<void>(__VA_ARGS__); \
+            catchAssertionHandler.handleExceptionNotThrownAsExpected(); \
+        } \
+        catch( ... ) { \
+            catchAssertionHandler.handleUnexpectedInflightException(); \
+        } \
+        INTERNAL_CATCH_REACT( catchAssertionHandler ) \
+    } while( false )
+///////////////////////////////////////////////////////////////////////////////
+// If the handler allows throws, evaluates the expression expecting any exception; otherwise reports the call as skipped.
+#define INTERNAL_CATCH_THROWS( macroName, resultDisposition, ... ) \
+    do { \
+        Catch::AssertionHandler catchAssertionHandler( macroName##_catch_sr, CATCH_INTERNAL_LINEINFO, CATCH_INTERNAL_STRINGIFY(__VA_ARGS__), resultDisposition); \
+        if( catchAssertionHandler.allowThrows() ) \
+            try { \
+                static_cast<void>(__VA_ARGS__); \
+                catchAssertionHandler.handleUnexpectedExceptionNotThrown(); \
+            } \
+            catch( ... ) { \
+                catchAssertionHandler.handleExceptionThrownAsExpected(); \
+            } \
+        else \
+            catchAssertionHandler.handleThrowingCallSkipped(); \
+        INTERNAL_CATCH_REACT( catchAssertionHandler ) \
+    } while( false )
+///////////////////////////////////////////////////////////////////////////////
+// Expects expr to throw something catchable as `exceptionType const&`; any other exception is reported as unexpected.
+#define INTERNAL_CATCH_THROWS_AS( macroName, exceptionType, resultDisposition, expr ) \
+    do { \
+        Catch::AssertionHandler catchAssertionHandler( macroName##_catch_sr, CATCH_INTERNAL_LINEINFO, CATCH_INTERNAL_STRINGIFY(expr) ", " CATCH_INTERNAL_STRINGIFY(exceptionType), resultDisposition ); \
+        if( catchAssertionHandler.allowThrows() ) \
+            try { \
+                static_cast<void>(expr); \
+                catchAssertionHandler.handleUnexpectedExceptionNotThrown(); \
+            } \
+            catch( exceptionType const& ) { \
+                catchAssertionHandler.handleExceptionThrownAsExpected(); \
+            } \
+            catch( ... ) { \
+                catchAssertionHandler.handleUnexpectedInflightException(); \
+            } \
+        else \
+            catchAssertionHandler.handleThrowingCallSkipped(); \
+        INTERNAL_CATCH_REACT( catchAssertionHandler ) \
+    } while( false )
+
+///////////////////////////////////////////////////////////////////////////////
+// Reports a message of kind `messageType` through an AssertionHandler.
+// __VA_ARGS__ is streamed into a temporary Catch::MessageStream and the
+// resulting string is passed to handleMessage(). The handler is created with
+// an empty Catch::StringRef() as its captured expression.
+#define INTERNAL_CATCH_MSG( macroName, messageType, resultDisposition, ... ) \
+    do { \
+        Catch::AssertionHandler catchAssertionHandler( macroName##_catch_sr, CATCH_INTERNAL_LINEINFO, Catch::StringRef(), resultDisposition ); \
+        catchAssertionHandler.handleMessage( messageType, ( Catch::MessageStream() << __VA_ARGS__ + ::Catch::StreamEndStop() ).m_stream.str() ); \
+        INTERNAL_CATCH_REACT( catchAssertionHandler ) \
+    } while( false )
+
+///////////////////////////////////////////////////////////////////////////////
+// Declares a Catch::Capturer named `varName`, passing it the stringified
+// argument list (#__VA_ARGS__), then calls captureValues( 0, ... ) with the
+// argument values. Note: this expands to two statements (a declaration plus a
+// call) and is NOT wrapped in do { } while( false ), because the declared
+// variable has to live in the enclosing scope.
+#define INTERNAL_CATCH_CAPTURE( varName, macroName, ... ) \
+    auto varName = Catch::Capturer( macroName, CATCH_INTERNAL_LINEINFO, Catch::ResultWas::Info, #__VA_ARGS__ ); \
+    varName.captureValues( 0, __VA_ARGS__ )
+
+///////////////////////////////////////////////////////////////////////////////
+// Declares a uniquely named Catch::ScopedMessage built from a MessageBuilder
+// (ResultWas::Info) with `log` streamed into it. The object lives until the end
+// of the enclosing scope. Note: the expansion already ends with a semicolon.
+#define INTERNAL_CATCH_INFO( macroName, log ) \
+    Catch::ScopedMessage INTERNAL_CATCH_UNIQUE_NAME( scopedMessage )( Catch::MessageBuilder( macroName##_catch_sr, CATCH_INTERNAL_LINEINFO, Catch::ResultWas::Info ) << log );
+
+///////////////////////////////////////////////////////////////////////////////
+// Builds an Info message (with `log` streamed into a MessageBuilder) and hands
+// it to the current result capture via emplaceUnscopedMessage(). Unlike
+// INTERNAL_CATCH_INFO, no local object is declared at the call site.
+#define INTERNAL_CATCH_UNSCOPED_INFO( macroName, log ) \
+    Catch::getResultCapture().emplaceUnscopedMessage( Catch::MessageBuilder( macroName##_catch_sr, CATCH_INTERNAL_LINEINFO, Catch::ResultWas::Info ) << log )
+
+///////////////////////////////////////////////////////////////////////////////
+// Although this is matcher-based, it can be used with just a string
+// Asserts that evaluating __VA_ARGS__ throws, then checks the in-flight
+// exception against `matcher` by calling Catch::handleExceptionMatchExpr()
+// from inside the catch( ... ) handler. The captured expression text is
+// "<expression>, <matcher>". When allowThrows() is false the expression is not
+// evaluated and handleThrowingCallSkipped() is called instead.
+#define INTERNAL_CATCH_THROWS_STR_MATCHES( macroName, resultDisposition, matcher, ... ) \
+    do { \
+        Catch::AssertionHandler catchAssertionHandler( macroName##_catch_sr, CATCH_INTERNAL_LINEINFO, CATCH_INTERNAL_STRINGIFY(__VA_ARGS__) ", " CATCH_INTERNAL_STRINGIFY(matcher), resultDisposition ); \
+        if( catchAssertionHandler.allowThrows() ) \
+            try { \
+                static_cast<void>(__VA_ARGS__); \
+                catchAssertionHandler.handleUnexpectedExceptionNotThrown(); \
+            } \
+            catch( ... ) { \
+                Catch::handleExceptionMatchExpr( catchAssertionHandler, matcher, #matcher##_catch_sr ); \
+            } \
+        else \
+            catchAssertionHandler.handleThrowingCallSkipped(); \
+        INTERNAL_CATCH_REACT( catchAssertionHandler ) \
+    } while( false )
+
+#endif // CATCH_CONFIG_DISABLE
+
+// end catch_capture.hpp
+// start catch_section.h
+
+// start catch_section_info.h
+
+// start catch_totals.h
+
+#include <cstddef>
+
+namespace Catch {
+
+    // Three tallies (passed / failed / failedButOk), each starting at 0.
+    // The member functions are only declared here; their definitions live elsewhere.
+    struct Counts {
+        // Arithmetic on tallies (presumably member-wise — confirm in the definition).
+        Counts operator - ( Counts const& other ) const;
+        Counts& operator += ( Counts const& other );
+
+        // Summary queries over the three tallies below.
+        std::size_t total() const;
+        bool allPassed() const;
+        bool allOk() const;
+
+        std::size_t passed = 0;
+        std::size_t failed = 0;
+        std::size_t failedButOk = 0;
+    };
+
+    // Aggregates two Counts (one for assertions, one for test cases) plus an
+    // integer `error` field that starts at 0. Member functions are declared
+    // here and defined elsewhere.
+    struct Totals {
+
+        Totals operator - ( Totals const& other ) const;
+        Totals& operator += ( Totals const& other );
+
+        // Returns a Totals computed relative to `prevTotals`
+        // (exact semantics are in the out-of-line definition).
+        Totals delta( Totals const& prevTotals ) const;
+
+        int error = 0;
+        Counts assertions;
+        Counts testCases;
+    };
+}
+
+// end catch_totals.h
+#include <string>
+
+namespace Catch {
+
+    // Identifies a section by its name and the source location it was declared at.
+    struct SectionInfo {
+        // Primary constructor (defined out of line).
+        SectionInfo
+            (   SourceLineInfo const& _lineInfo,
+                std::string const& _name );
+
+        // Deprecated
+        // Three-argument form kept for compatibility: the third (unnamed) string
+        // is ignored and construction is delegated to the two-argument constructor.
+        SectionInfo
+            (   SourceLineInfo const& _lineInfo,
+                std::string const& _name,
+                std::string const& ) : SectionInfo( _lineInfo, _name ) {}
+
+        std::string name;
+        std::string description; // !Deprecated: this will always be empty
+        SourceLineInfo lineInfo;
+    };
+
+    // Plain aggregate bundling a SectionInfo with a Counts snapshot and a
+    // duration in seconds. Note: durationInSeconds has no default initializer.
+    struct SectionEndInfo {
+        SectionInfo sectionInfo;
+        Counts prevAssertions;
+        double durationInSeconds;
+    };
+
+} // end namespace Catch
+
+// end catch_section_info.h
+// start catch_timer.h
+
+#include <cstdint>
+
+namespace Catch {
+
+    // Clock helpers, declared here and defined elsewhere. Both return a 64-bit
+    // unsigned value; going by the names, a current timestamp in nanoseconds and
+    // an estimate of the clock's resolution (units not visible here — confirm).
+    auto getCurrentNanosecondsSinceEpoch() -> uint64_t;
+    auto getEstimatedClockResolution() -> uint64_t;
+
+    // Simple stopwatch-style timer. Holds a single nanosecond value
+    // (initially 0); all member functions are defined out of line.
+    class Timer {
+        uint64_t m_nanoseconds = 0;
+    public:
+        // Presumably records the starting timestamp — confirm in the definition.
+        void start();
+        // Elapsed time reported in different units. Note the differing return
+        // types: milliseconds are `unsigned int`, seconds are `double`.
+        auto getElapsedNanoseconds() const -> uint64_t;
+        auto getElapsedMicroseconds() const -> uint64_t;
+        auto getElapsedMilliseconds() const -> unsigned int;
+        auto getElapsedSeconds() const -> double;
+    };
+
+} // namespace Catch
+
+// end catch_timer.h
+#include <string>
+
+namespace Catch {
+
+    // Scope object representing one section of a test case. It is constructed
+    // from a SectionInfo and is testable in a boolean context; the
+    // INTERNAL_CATCH_SECTION macros rely on both properties.
+    class Section : NonCopyable {
+    public:
+        // Intentionally not `explicit`: the section macros copy-initialise a
+        // `Section const&` directly from a SectionInfo temporary.
+        Section( SectionInfo const& info );
+        ~Section();
+
+        // This indicates whether the section should be executed or not
+        explicit operator bool() const;
+
+    private:
+        SectionInfo m_info;
+
+        std::string m_name;
+        Counts m_assertions;
+        bool m_sectionIncluded;
+        Timer m_timer;
+    };
+
+} // end namespace Catch
+
+// Expands to an `if` whose condition declares a uniquely named
+// `Catch::Section const&` initialised from a SectionInfo built out of the
+// current line info and __VA_ARGS__. The statement or block that follows the
+// macro at the call site becomes the body of that `if`. The declaration is
+// bracketed by the warning-suppression macros.
+#define INTERNAL_CATCH_SECTION( ... ) \
+    CATCH_INTERNAL_START_WARNINGS_SUPPRESSION \
+    CATCH_INTERNAL_SUPPRESS_UNUSED_WARNINGS \
+    if( Catch::Section const& INTERNAL_CATCH_UNIQUE_NAME( catch_internal_Section ) = Catch::SectionInfo( CATCH_INTERNAL_LINEINFO, __VA_ARGS__ ) ) \
+    CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION
+
+// Same shape as INTERNAL_CATCH_SECTION, except that the section name is built
+// at run time: __VA_ARGS__ is streamed into a Catch::ReusableStringStream and
+// the resulting string is used as the name.
+#define INTERNAL_CATCH_DYNAMIC_SECTION( ... ) \
+    CATCH_INTERNAL_START_WARNINGS_SUPPRESSION \
+    CATCH_INTERNAL_SUPPRESS_UNUSED_WARNINGS \
+    if( Catch::Section const& INTERNAL_CATCH_UNIQUE_NAME( catch_internal_Section ) = Catch::SectionInfo( CATCH_INTERNAL_LINEINFO, (Catch::ReusableStringStream() << __VA_ARGS__).str() ) ) \
+    CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION
+
+// end catch_section.h
+// start catch_interfaces_exception.h
+
+// start catch_interfaces_registry_hub.h
+
+#include <string>
+#include <memory>
+
+namespace Catch {
+
+    class TestCase;
+    struct ITestCaseRegistry;
+    struct IExceptionTranslatorRegistry;
+    struct IExceptionTranslator;
+    struct IReporterRegistry;
+    struct IReporterFactory;
+    struct ITagAliasRegistry;
+    struct IMutableEnumValuesRegistry;
+
+    class StartupExceptionRegistry;
+
+    using IReporterFactoryPtr = std::shared_ptr<IReporterFactory>;
+
+    // Read-only view of the framework's registries: every accessor is a const
+    // pure-virtual function returning a const reference.
+    struct IRegistryHub {
+        virtual ~IRegistryHub();
+
+        virtual IReporterRegistry const& getReporterRegistry() const = 0;
+        virtual ITestCaseRegistry const& getTestCaseRegistry() const = 0;
+        virtual ITagAliasRegistry const& getTagAliasRegistry() const = 0;
+        virtual IExceptionTranslatorRegistry const& getExceptionTranslatorRegistry() const = 0;
+
+        virtual StartupExceptionRegistry const& getStartupExceptionRegistry() const = 0;
+    };
+
+    // Mutating counterpart of IRegistryHub: pure-virtual registration entry
+    // points for reporters, listeners, tests, exception translators and tag
+    // aliases, plus mutable access to the enum-values registry.
+    struct IMutableRegistryHub {
+        virtual ~IMutableRegistryHub();
+        virtual void registerReporter( std::string const& name, IReporterFactoryPtr const& factory ) = 0;
+        virtual void registerListener( IReporterFactoryPtr const& factory ) = 0;
+        virtual void registerTest( TestCase const& testInfo ) = 0;
+        // Takes a raw pointer; ExceptionTranslatorRegistrar passes a freshly
+        // `new`-ed translator here (ownership handling is in the implementation).
+        virtual void registerTranslator( const IExceptionTranslator* translator ) = 0;
+        virtual void registerTagAlias( std::string const& alias, std::string const& tag, SourceLineInfo const& lineInfo ) = 0;
+        // Declared noexcept and takes no arguments.
+        virtual void registerStartupException() noexcept = 0;
+        virtual IMutableEnumValuesRegistry& getMutableEnumValuesRegistry() = 0;
+    };
+
+    // Free-function access points (defined elsewhere): const and mutable views
+    // of the registry hub, a cleanup hook, and a helper returning a string for
+    // the exception currently being handled (as the name suggests — confirm).
+    IRegistryHub const& getRegistryHub();
+    IMutableRegistryHub& getMutableRegistryHub();
+    void cleanUp();
+    std::string translateActiveException();
+
+}
+
+// end catch_interfaces_registry_hub.h
+// Only defined when CATCH_CONFIG_DISABLE is set: expands to just the static
+// translator function header, with no registrar object.
+#if defined(CATCH_CONFIG_DISABLE)
+    #define INTERNAL_CATCH_TRANSLATE_EXCEPTION_NO_REG( translatorName, signature) \
+        static std::string translatorName( signature )
+#endif
+
+#include <exception>
+#include <string>
+#include <vector>
+
+namespace Catch {
+    // Pointer to a nullary function returning a std::string.
+    using exceptionTranslateFunction = std::string(*)();
+
+    struct IExceptionTranslator;
+    // Owning list of translators; translate() walks it via const_iterators.
+    using ExceptionTranslators = std::vector<std::unique_ptr<IExceptionTranslator const>>;
+
+    // One link in the chain of exception translators. `it` / `itEnd` delimit
+    // the translators that come after this one, so an implementation can
+    // delegate down the chain before handling the exception itself.
+    struct IExceptionTranslator {
+        virtual ~IExceptionTranslator();
+        virtual std::string translate( ExceptionTranslators::const_iterator it, ExceptionTranslators::const_iterator itEnd ) const = 0;
+    };
+
+    // Registry interface exposing a single operation: produce a string for the
+    // currently active exception.
+    struct IExceptionTranslatorRegistry {
+        virtual ~IExceptionTranslatorRegistry();
+
+        virtual std::string translateActiveException() const = 0;
+    };
+
+    // Registers a user-supplied `std::string(*)( T& )` translation function with
+    // the mutable registry hub as a side effect of construction.
+    class ExceptionTranslatorRegistrar {
+        // Adapter that exposes a plain translation function as an IExceptionTranslator.
+        template<typename T>
+        class ExceptionTranslator : public IExceptionTranslator {
+        public:
+
+            ExceptionTranslator( std::string(*translateFunction)( T& ) )
+            : m_translateFunction( translateFunction )
+            {}
+
+            // Gives the remaining translators in [it, itEnd) the first chance; if the
+            // exception propagating out of them (or rethrown at the end of the chain)
+            // is a T, it is translated with the wrapped function.
+            std::string translate( ExceptionTranslators::const_iterator it, ExceptionTranslators::const_iterator itEnd ) const override {
+#if defined(CATCH_CONFIG_DISABLE_EXCEPTIONS)
+                return "";
+#else
+                try {
+                    // Translators remain: delegate to the next one in the chain.
+                    if( it != itEnd )
+                        return (*it)->translate( it+1, itEnd );
+
+                    // End of the chain: rethrow the active exception so the
+                    // handler below gets a chance to match it against T.
+                    std::rethrow_exception(std::current_exception());
+                }
+                catch( T& ex ) {
+                    return m_translateFunction( ex );
+                }
+#endif
+            }
+
+        protected:
+            std::string(*m_translateFunction)( T& );
+        };
+
+    public:
+        // Wraps `translateFunction` in an ExceptionTranslator<T> and hands the
+        // newly allocated object to the registry hub.
+        template<typename T>
+        ExceptionTranslatorRegistrar( std::string(*translateFunction)( T& ) ) {
+            auto* translator = new ExceptionTranslator<T>( translateFunction );
+            getMutableRegistryHub().registerTranslator( translator );
+        }
+    };
+}
+
+///////////////////////////////////////////////////////////////////////////////
+// Expands to three pieces, in order:
+//   1. a forward declaration of the static translator function,
+//   2. an ExceptionTranslatorRegistrar object in an anonymous namespace,
+//      constructed with the function's address (wrapped in warning suppression),
+//   3. the function header again, so the body written after the macro at the
+//      call site becomes the function definition.
+#define INTERNAL_CATCH_TRANSLATE_EXCEPTION2( translatorName, signature ) \
+    static std::string translatorName( signature ); \
+    CATCH_INTERNAL_START_WARNINGS_SUPPRESSION \
+    CATCH_INTERNAL_SUPPRESS_GLOBALS_WARNINGS \
+    namespace{ Catch::ExceptionTranslatorRegistrar INTERNAL_CATCH_UNIQUE_NAME( catch_internal_ExceptionRegistrar )( &translatorName ); } \
+    CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION \
+    static std::string translatorName( signature )
+
+// Convenience wrapper: forwards to INTERNAL_CATCH_TRANSLATE_EXCEPTION2 with a generated unique function name.
+#define INTERNAL_CATCH_TRANSLATE_EXCEPTION( signature ) INTERNAL_CATCH_TRANSLATE_EXCEPTION2( INTERNAL_CATCH_UNIQUE_NAME( catch_internal_ExceptionTranslator ), signature )
+
+// end catch_interfaces_exception.h
+// start catch_approx.h
+
+#include <type_traits>
+
+namespace Catch {
+namespace Detail {
+
+    // Wrapper around a double that compares approximately equal to values
+    // convertible to double. The tolerance settings (epsilon, margin, scale)
+    // are stored here; the comparison itself is implemented out of line in
+    // equalityComparisonImpl().
+    class Approx {
+    private:
+        // Core comparison used by all the relational operators below.
+        bool equalityComparisonImpl(double other) const;
+        // Validates the new margin (margin >= 0)
+        // out-of-line to avoid including stdexcept in the header
+        void setMargin(double margin);
+        // Validates the new epsilon (0 < epsilon < 1)
+        // out-of-line to avoid including stdexcept in the header
+        void setEpsilon(double epsilon);
+
+    public:
+        explicit Approx ( double value );
+
+        static Approx custom();
+
+        Approx operator-() const;
+
+        // Creates a new Approx for `value` that inherits this object's epsilon,
+        // margin and scale. Const-qualified because it only reads this object's
+        // settings, which lets a const (e.g. shared, pre-configured) Approx be
+        // used as a template for new targets.
+        template <typename T, typename = typename std::enable_if<std::is_constructible<double, T>::value>::type>
+        Approx operator()( T const& value ) const {
+            Approx approx( static_cast<double>(value) );
+            approx.m_epsilon = m_epsilon;
+            approx.m_margin = m_margin;
+            approx.m_scale = m_scale;
+            return approx;
+        }
+
+        // Converting constructor for any type that double can be constructed from.
+        template <typename T, typename = typename std::enable_if<std::is_constructible<double, T>::value>::type>
+        explicit Approx( T const& value ): Approx(static_cast<double>(value))
+        {}
+
+        // Equality: the non-Approx operand is converted to double and passed to
+        // equalityComparisonImpl(). All other (in)equality forms funnel into this one.
+        template <typename T, typename = typename std::enable_if<std::is_constructible<double, T>::value>::type>
+        friend bool operator == ( const T& lhs, Approx const& rhs ) {
+            auto lhs_v = static_cast<double>(lhs);
+            return rhs.equalityComparisonImpl(lhs_v);
+        }
+
+        template <typename T, typename = typename std::enable_if<std::is_constructible<double, T>::value>::type>
+        friend bool operator == ( Approx const& lhs, const T& rhs ) {
+            return operator==( rhs, lhs );
+        }
+
+        template <typename T, typename = typename std::enable_if<std::is_constructible<double, T>::value>::type>
+        friend bool operator != ( T const& lhs, Approx const& rhs ) {
+            return !operator==( lhs, rhs );
+        }
+
+        template <typename T, typename = typename std::enable_if<std::is_constructible<double, T>::value>::type>
+        friend bool operator != ( Approx const& lhs, T const& rhs ) {
+            return !operator==( rhs, lhs );
+        }
+
+        // Ordering: a strict comparison against the stored value, OR approximate equality.
+        template <typename T, typename = typename std::enable_if<std::is_constructible<double, T>::value>::type>
+        friend bool operator <= ( T const& lhs, Approx const& rhs ) {
+            return static_cast<double>(lhs) < rhs.m_value || lhs == rhs;
+        }
+
+        template <typename T, typename = typename std::enable_if<std::is_constructible<double, T>::value>::type>
+        friend bool operator <= ( Approx const& lhs, T const& rhs ) {
+            return lhs.m_value < static_cast<double>(rhs) || lhs == rhs;
+        }
+
+        template <typename T, typename = typename std::enable_if<std::is_constructible<double, T>::value>::type>
+        friend bool operator >= ( T const& lhs, Approx const& rhs ) {
+            return static_cast<double>(lhs) > rhs.m_value || lhs == rhs;
+        }
+
+        template <typename T, typename = typename std::enable_if<std::is_constructible<double, T>::value>::type>
+        friend bool operator >= ( Approx const& lhs, T const& rhs ) {
+            return lhs.m_value > static_cast<double>(rhs) || lhs == rhs;
+        }
+
+        // Fluent setters; each returns *this so calls can be chained.
+        // epsilon() and margin() go through the validating out-of-line setters,
+        // scale() stores the value directly.
+        template <typename T, typename = typename std::enable_if<std::is_constructible<double, T>::value>::type>
+        Approx& epsilon( T const& newEpsilon ) {
+            double epsilonAsDouble = static_cast<double>(newEpsilon);
+            setEpsilon(epsilonAsDouble);
+            return *this;
+        }
+
+        template <typename T, typename = typename std::enable_if<std::is_constructible<double, T>::value>::type>
+        Approx& margin( T const& newMargin ) {
+            double marginAsDouble = static_cast<double>(newMargin);
+            setMargin(marginAsDouble);
+            return *this;
+        }
+
+        template <typename T, typename = typename std::enable_if<std::is_constructible<double, T>::value>::type>
+        Approx& scale( T const& newScale ) {
+            m_scale = static_cast<double>(newScale);
+            return *this;
+        }
+
+        std::string toString() const;
+
+    private:
+        double m_epsilon;
+        double m_margin;
+        double m_scale;
+        double m_value;
+    };
+} // end namespace Detail
+
+// User-defined literal suffix `_a` producing a Detail::Approx from a
+// floating-point or an unsigned integer literal (definitions live elsewhere).
+namespace literals {
+    Detail::Approx operator "" _a(long double val);
+    Detail::Approx operator "" _a(unsigned long long val);
+} // end namespace literals
+
+// StringMaker specialisation declaring how an Approx is converted to a
+// std::string (convert() is defined out of line).
+template<>
+struct StringMaker<Catch::Detail::Approx> {
+    static std::string convert(Catch::Detail::Approx const& value);
+};
+
+} // end namespace Catch
+
+// end catch_approx.h
+// start catch_string_manip.h
+
+#include <string>
+#include <iosfwd>
+#include <vector>
+
+namespace Catch {
+
+    // String helper declarations; the definitions live elsewhere. Apart from the
+    // pre-existing notes below, behaviour is implied by the names only.
+
+    // Prefix / suffix / infix tests, with overloads for a string or a single char.
+    bool startsWith( std::string const& s, std::string const& prefix );
+    bool startsWith( std::string const& s, char prefix );
+    bool endsWith( std::string const& s, std::string const& suffix );
+    bool endsWith( std::string const& s, char suffix );
+    bool contains( std::string const& s, std::string const& infix );
+    // Lower-casing: one variant modifies its argument, the other returns a new string.
+    void toLowerInPlace( std::string& s );
+    std::string toLower( std::string const& s );
+    //! Returns a new string without whitespace at the start/end
+    std::string trim( std::string const& str );
+    //! Returns a substring of the original ref without whitespace. Beware lifetimes!
+    StringRef trim(StringRef ref);
+
+    // !!! Be aware, returns refs into original string - make sure original string outlives them
+    std::vector<StringRef> splitStringRef( StringRef str, char delimiter );
+    // Modifies `str` in place; the bool result presumably reports whether a
+    // replacement took place — confirm in the definition.
+    bool replaceInPlace( std::string& str, std::string const& replaceThis, std::string const& withThis );
+
+    // Stream-insertable pairing of a count with a label. The formatting itself
+    // is done by the out-of-line operator<< (presumably adding a plural suffix
+    // depending on the count — confirm in the definition).
+    struct pluralise {
+        pluralise( std::size_t count, std::string const& label );
+
+        friend std::ostream& operator << ( std::ostream& os, pluralise const& pluraliser );
+
+        std::size_t m_count;
+        std::string m_label;
+    };
+}
+
+// end catch_string_manip.h
+#ifndef CATCH_CONFIG_DISABLE_MATCHERS
+// start catch_capture_matchers.h
+
+// start catch_matchers.h
+
+#include <string>
+#include <vector>
+
+namespace Catch {
+namespace Matchers {
+    namespace Impl {
+
+        template<typename ArgT> struct MatchAllOf;
+        template<typename ArgT> struct MatchAnyOf;
+        template<typename ArgT> struct MatchNotOf;
+
+        // Type-independent base of all matchers: provides toString() on top of
+        // the pure-virtual describe() that concrete matchers implement.
+        // Copy-constructible but not copy-assignable.
+        class MatcherUntypedBase {
+        public:
+            MatcherUntypedBase() = default;
+            MatcherUntypedBase ( MatcherUntypedBase const& ) = default;
+            MatcherUntypedBase& operator = ( MatcherUntypedBase const& ) = delete;
+            std::string toString() const;
+
+        protected:
+            // Protected destructor: matchers cannot be deleted through a
+            // pointer to this base from outside the hierarchy.
+            virtual ~MatcherUntypedBase();
+            virtual std::string describe() const = 0;
+            // `mutable` so the const toString() can store into it
+            // (presumably a cache of describe(), per the name).
+            mutable std::string m_cachedToString;
+        };
+
+#ifdef __clang__
+#    pragma clang diagnostic push
+#    pragma clang diagnostic ignored "-Wnon-virtual-dtor"
+#endif
+
+        // Typed half of the matcher interface: a single pure-virtual match()
+        // taking the object under test by const reference. It has no virtual
+        // destructor, which is why the surrounding pragmas silence clang's
+        // -Wnon-virtual-dtor for this section.
+        template<typename ObjectT>
+        struct MatcherMethod {
+            virtual bool match( ObjectT const& arg ) const = 0;
+        };
+
+#if defined(__OBJC__)
+        // Hack to fix Catch GH issue #1661. Could use id for generic Object support.
+        // Using const for Object pointers is very uncommon, and under ARC it causes a signature mismatch that breaks compilation.
+        // Objective-C specialisation: match() takes the NSString* by value
+        // rather than by const reference.
+        template<>
+        struct MatcherMethod<NSString*> {
+            virtual bool match( NSString* arg ) const = 0;
+        };
+#endif
+
+#ifdef __clang__
+#    pragma clang diagnostic pop
+#endif
+
+        // Base class for matchers over T. Combines the untyped base with the
+        // typed match() interface and declares the composition operators
+        // (&&, ||, !), which are defined after MatchAllOf / MatchAnyOf / MatchNotOf.
+        template<typename T>
+        struct MatcherBase : MatcherUntypedBase, MatcherMethod<T> {
+
+            MatchAllOf<T> operator && ( MatcherBase const& other ) const;
+            MatchAnyOf<T> operator || ( MatcherBase const& other ) const;
+            MatchNotOf<T> operator ! () const;
+        };
+
+        // Conjunction of matchers: matches only when every stored matcher matches.
+        // Only non-owning pointers are stored, so the combined matchers must
+        // outlive this object.
+        template<typename ArgT>
+        struct MatchAllOf : MatcherBase<ArgT> {
+            // Stops at the first matcher that rejects `arg`.
+            bool match( ArgT const& arg ) const override {
+                for( MatcherBase<ArgT> const* candidate : m_matchers ) {
+                    if( !candidate->match( arg ) )
+                        return false;
+                }
+                return true;
+            }
+            // Produces "( a and b and ... )" from the individual descriptions.
+            std::string describe() const override {
+                std::string text;
+                text.reserve( 4 + m_matchers.size()*32 );
+                text += "( ";
+                for( auto it = m_matchers.begin(); it != m_matchers.end(); ++it ) {
+                    // Separator goes before every entry except the first.
+                    if( it != m_matchers.begin() )
+                        text += " and ";
+                    text += (*it)->toString();
+                }
+                text += " )";
+                return text;
+            }
+
+            // Returns a copy of this conjunction with `other` appended (by address).
+            MatchAllOf<ArgT> operator && ( MatcherBase<ArgT> const& other ) {
+                MatchAllOf<ArgT> extended( *this );
+                extended.m_matchers.push_back( &other );
+                return extended;
+            }
+
+            std::vector<MatcherBase<ArgT> const*> m_matchers;
+        };
+        // Disjunction of matchers: matches as soon as any stored matcher matches.
+        // Only non-owning pointers are stored, so the combined matchers must
+        // outlive this object.
+        template<typename ArgT>
+        struct MatchAnyOf : MatcherBase<ArgT> {
+
+            // Stops at the first matcher that accepts `arg`.
+            bool match( ArgT const& arg ) const override {
+                for( MatcherBase<ArgT> const* candidate : m_matchers ) {
+                    if( candidate->match( arg ) )
+                        return true;
+                }
+                return false;
+            }
+            // Produces "( a or b or ... )" from the individual descriptions.
+            std::string describe() const override {
+                std::string text;
+                text.reserve( 4 + m_matchers.size()*32 );
+                text += "( ";
+                for( auto it = m_matchers.begin(); it != m_matchers.end(); ++it ) {
+                    // Separator goes before every entry except the first.
+                    if( it != m_matchers.begin() )
+                        text += " or ";
+                    text += (*it)->toString();
+                }
+                text += " )";
+                return text;
+            }
+
+            // Returns a copy of this disjunction with `other` appended (by address).
+            MatchAnyOf<ArgT> operator || ( MatcherBase<ArgT> const& other ) {
+                MatchAnyOf<ArgT> extended( *this );
+                extended.m_matchers.push_back( &other );
+                return extended;
+            }
+
+            std::vector<MatcherBase<ArgT> const*> m_matchers;
+        };
+
+        // Negation of a single matcher. Holds a reference to the wrapped matcher,
+        // so that matcher must outlive this object.
+        template<typename ArgT>
+        struct MatchNotOf : MatcherBase<ArgT> {
+
+            MatchNotOf( MatcherBase<ArgT> const& underlyingMatcher )
+            : m_underlyingMatcher( underlyingMatcher )
+            {}
+
+            // Inverts the wrapped matcher's verdict.
+            bool match( ArgT const& arg ) const override {
+                const bool underlyingResult = m_underlyingMatcher.match( arg );
+                return !underlyingResult;
+            }
+
+            // Prefixes the wrapped matcher's description with "not ".
+            std::string describe() const override {
+                std::string text( "not " );
+                text += m_underlyingMatcher.toString();
+                return text;
+            }
+            MatcherBase<ArgT> const& m_underlyingMatcher;
+        };
+
+        // Out-of-class definitions of the MatcherBase composition operators.
+        // && and || start from an empty MatchAllOf / MatchAnyOf and append
+        // *this and then `other`, so the resulting list preserves operand order.
+        template<typename T>
+        MatchAllOf<T> MatcherBase<T>::operator && ( MatcherBase const& other ) const {
+            return MatchAllOf<T>() && *this && other;
+        }
+        template<typename T>
+        MatchAnyOf<T> MatcherBase<T>::operator || ( MatcherBase const& other ) const {
+            return MatchAnyOf<T>() || *this || other;
+        }
+        // Wraps *this in a MatchNotOf, which keeps a reference to it.
+        template<typename T>
+        MatchNotOf<T> MatcherBase<T>::operator ! () const {
+            return MatchNotOf<T>( *this );
+        }
+
+    } // namespace Impl
+
+} // namespace Matchers
+
+using namespace Matchers;
+using Matchers::Impl::MatcherBase;
+
+} // namespace Catch
+
+// end catch_matchers.h
+// start catch_matchers_exception.hpp
+
+namespace Catch {
+namespace Matchers {
+namespace Exception {
+
+// Matcher over std::exception that stores an expected message string.
+// match() and describe() are defined out of line.
+class ExceptionMessageMatcher : public MatcherBase<std::exception> {
+    std::string m_message;
+public:
+
+    // Stores a copy of `message`.
+    ExceptionMessageMatcher(std::string const& message):
+        m_message(message)
+    {}
+
+    bool match(std::exception const& ex) const override;
+
+    std::string describe() const override;
+};
+
+} // namespace Exception
+
+// Factory function returning an ExceptionMessageMatcher for `message` (defined elsewhere).
+Exception::ExceptionMessageMatcher Message(std::string const& message);
+
+} // namespace Matchers
+} // namespace Catch
+
+// end catch_matchers_exception.hpp
+// start catch_matchers_floating.h
+
+namespace Catch {
+namespace Matchers {
+
+    namespace Floating {
+
+        enum class FloatingPointKind : uint8_t;
+
+        // Matcher over double holding a target value and a margin
+        // (going by the names, an absolute-difference check — the
+        // comparison itself is defined out of line).
+        struct WithinAbsMatcher : MatcherBase<double> {
+            WithinAbsMatcher(double target, double margin);
+            bool match(double const& matchee) const override;
+            std::string describe() const override;
+        private:
+            double m_target;
+            double m_margin;
+        };
+
+        // Matcher over double holding a target, a ULP count and the
+        // FloatingPointKind the comparison is based on. The comparison
+        // itself is defined out of line.
+        struct WithinUlpsMatcher : MatcherBase<double> {
+            WithinUlpsMatcher(double target, uint64_t ulps, FloatingPointKind baseType);
+            bool match(double const& matchee) const override;
+            std::string describe() const override;
+        private:
+            double m_target;
+            uint64_t m_ulps;
+            FloatingPointKind m_type;
+        };
+
+        // Given IEEE-754 format for floats and doubles, we can assume
+        // that float -> double promotion is lossless. Given this, we can
+        // assume that if we do the standard relative comparison of
+        // |lhs - rhs| <= epsilon * max(fabs(lhs), fabs(rhs)), then we get
+        // the same result if we do this for floats, as if we do this for
+        // doubles that were promoted from floats.
+        // Matcher over double holding a target and a relative epsilon.
+        // The comparison itself is defined out of line.
+        struct WithinRelMatcher : MatcherBase<double> {
+            WithinRelMatcher(double target, double epsilon);
+            bool match(double const& matchee) const override;
+            std::string describe() const override;
+        private:
+            double m_target;
+            double m_epsilon;
+        };
+
+    } // namespace Floating
+
+    // The following functions create the actual matcher objects.
+    // This allows the types to be inferred
+    // ULP-based matcher factories, overloaded on the target's floating-point type.
+    Floating::WithinUlpsMatcher WithinULP(double target, uint64_t maxUlpDiff);
+    Floating::WithinUlpsMatcher WithinULP(float target, uint64_t maxUlpDiff);
+    // Margin-based matcher factory.
+    Floating::WithinAbsMatcher WithinAbs(double target, double margin);
+    // Relative-epsilon matcher factories, with and without an explicit epsilon.
+    Floating::WithinRelMatcher WithinRel(double target, double eps);
+    // defaults epsilon to 100*numeric_limits<double>::epsilon()
+    Floating::WithinRelMatcher WithinRel(double target);
+    Floating::WithinRelMatcher WithinRel(float target, float eps);
+    // defaults epsilon to 100*numeric_limits<float>::epsilon()
+    Floating::WithinRelMatcher WithinRel(float target);
+
+} // namespace Matchers
+} // namespace Catch
+
+// end catch_matchers_floating.h
+// start catch_matchers_generic.hpp
+
+#include <functional>
+#include <string>
+
+namespace Catch {
+namespace Matchers {
+namespace Generic {
+
+// Helper used by PredicateMatcher's constructor to turn the user-supplied
+// description into the stored one (defined out of line).
+namespace Detail {
+    std::string finalizeDescription(const std::string& desc);
+}
+
+// Matcher that delegates to an arbitrary user-supplied predicate.
+// The predicate is stored by value in a std::function, together with a
+// description that is passed through Detail::finalizeDescription().
+template <typename T>
+class PredicateMatcher : public MatcherBase<T> {
+    std::function<bool(T const&)> m_predicate;
+    std::string m_description;
+public:
+
+    // `elem` is taken by const reference, so it can only be copied: the
+    // former std::move(elem) on a const object silently degraded to a copy
+    // (and relied on <utility>, which this header section does not include).
+    // The copy is now spelled out explicitly.
+    PredicateMatcher(std::function<bool(T const&)> const& elem, std::string const& descr)
+        :m_predicate(elem),
+        m_description(Detail::finalizeDescription(descr))
+    {}
+
+    // Returns whatever the stored predicate returns for `item`.
+    bool match( T const& item ) const override {
+        return m_predicate(item);
+    }
+
+    // Returns the description computed at construction time.
+    std::string describe() const override {
+        return m_description;
+    }
+};
+
+} // namespace Generic
+
+    // The following functions create the actual matcher objects.
+    // The user has to explicitly specify type to the function, because
+    // inferring std::function<bool(T const&)> is hard (but possible) and
+    // requires a lot of TMP.
+    // Builds a PredicateMatcher<T> from `predicate` and an optional
+    // description (defaults to the empty string). T must be given explicitly.
+    template<typename T>
+    Generic::PredicateMatcher<T> Predicate(std::function<bool(T const&)> const& predicate, std::string const& description = "") {
+        return Generic::PredicateMatcher<T>(predicate, description);
+    }
+
+} // namespace Matchers
+} // namespace Catch
+
+// end catch_matchers_generic.hpp
+// start catch_matchers_string.h
+
+#include <string>
+
+namespace Catch {
+namespace Matchers {
+
+    namespace StdString {
+
+        // A comparison string paired with a case-sensitivity choice.
+        // adjustString() and caseSensitivitySuffix() are defined out of line;
+        // presumably they normalise a string according to the choice and
+        // provide a description suffix for it — confirm in the definitions.
+        struct CasedString
+        {
+            CasedString( std::string const& str, CaseSensitive::Choice caseSensitivity );
+            std::string adjustString( std::string const& str ) const;
+            std::string caseSensitivitySuffix() const;
+
+            CaseSensitive::Choice m_caseSensitivity;
+            std::string m_str;
+        };
+
+        // Common base of the std::string matchers below: stores the name of
+        // the operation and the CasedString to compare against, and declares
+        // describe(). match() is left to the derived matchers.
+        struct StringMatcherBase : MatcherBase<std::string> {
+            StringMatcherBase( std::string const& operation, CasedString const& comparator );
+            std::string describe() const override;
+
+            CasedString m_comparator;
+            std::string m_operation;
+        };
+
+        // String matcher for the "equals" operation (match() defined out of line).
+        struct EqualsMatcher : StringMatcherBase {
+            EqualsMatcher( CasedString const& comparator );
+            bool match( std::string const& source ) const override;
+        };
+        // String matcher for the "contains" operation (match() defined out of line).
+        struct ContainsMatcher : StringMatcherBase {
+            ContainsMatcher( CasedString const& comparator );
+            bool match( std::string const& source ) const override;
+        };
+        // String matcher for the "starts with" operation (match() defined out of line).
+        struct StartsWithMatcher : StringMatcherBase {
+            StartsWithMatcher( CasedString const& comparator );
+            bool match( std::string const& source ) const override;
+        };
+        // String matcher for the "ends with" operation (match() defined out of line).
+        struct EndsWithMatcher : StringMatcherBase {
+            EndsWithMatcher( CasedString const& comparator );
+            bool match( std::string const& source ) const override;
+        };
+
+        // Matcher over std::string that stores a regular expression as text
+        // together with a case-sensitivity choice. Unlike the matchers above,
+        // it derives directly from MatcherBase and takes the pattern by value.
+        struct RegexMatcher : MatcherBase<std::string> {
+            RegexMatcher( std::string regex, CaseSensitive::Choice caseSensitivity );
+            bool match( std::string const& matchee ) const override;
+            std::string describe() const override;
+
+        private:
+            std::string m_regex;
+            CaseSensitive::Choice m_caseSensitivity;
+        };
+
+    } // namespace StdString
+
+    // The following functions create the actual matcher objects.
+    // This allows the types to be inferred
+
+    // String matcher factories. Each takes the comparison string (or regex)
+    // and an optional case-sensitivity choice defaulting to CaseSensitive::Yes.
+    StdString::EqualsMatcher Equals( std::string const& str, CaseSensitive::Choice caseSensitivity = CaseSensitive::Yes );
+    StdString::ContainsMatcher Contains( std::string const& str, CaseSensitive::Choice caseSensitivity = CaseSensitive::Yes );
+    StdString::EndsWithMatcher EndsWith( std::string const& str, CaseSensitive::Choice caseSensitivity = CaseSensitive::Yes );
+    StdString::StartsWithMatcher StartsWith( std::string const& str, CaseSensitive::Choice caseSensitivity = CaseSensitive::Yes );
+    StdString::RegexMatcher Matches( std::string const& regex, CaseSensitive::Choice caseSensitivity = CaseSensitive::Yes );
+
+} // namespace Matchers
+} // namespace Catch
+
+// end catch_matchers_string.h
+// start catch_matchers_vector.h
+
+#include <algorithm>
+
+namespace Catch {
+namespace Matchers {
+
+    namespace Vector {
+        // Matches a vector that contains at least one element comparing equal
+        // (via operator==) to the stored comparator. The comparator is held by
+        // reference, so it must outlive the matcher.
+        template<typename T, typename Alloc>
+        struct ContainsElementMatcher : MatcherBase<std::vector<T, Alloc>> {
+
+            ContainsElementMatcher(T const &comparator) : m_comparator( comparator) {}
+
+            // Linear scan that stops at the first equal element.
+            bool match(std::vector<T, Alloc> const &v) const override {
+                for (auto it = v.begin(); it != v.end(); ++it) {
+                    if (*it == m_comparator)
+                        return true;
+                }
+                return false;
+            }
+
+            std::string describe() const override {
+                std::string text( "Contains: " );
+                text += ::Catch::Detail::stringify( m_comparator );
+                return text;
+            }
+
+            T const& m_comparator;
+        };
+
+        // Matches a vector in which every element of the stored comparator
+        // vector can be found (via operator==). The comparator is held by
+        // reference, so it must outlive the matcher.
+        template<typename T, typename AllocComp, typename AllocMatch>
+        struct ContainsMatcher : MatcherBase<std::vector<T, AllocMatch>> {
+
+            ContainsMatcher(std::vector<T, AllocComp> const &comparator) : m_comparator( comparator ) {}
+
+            bool match(std::vector<T, AllocMatch> const &v) const override {
+                // !TBD: see note in EqualsMatcher
+                // A shorter vector is rejected up front.
+                if (v.size() < m_comparator.size())
+                    return false;
+                for (auto wanted = m_comparator.begin(); wanted != m_comparator.end(); ++wanted) {
+                    // Look for the first element of v equal to *wanted.
+                    auto hit = v.begin();
+                    for (; hit != v.end(); ++hit) {
+                        if (*hit == *wanted)
+                            break;
+                    }
+                    // Ran off the end: this required element is missing.
+                    if (hit == v.end())
+                        return false;
+                }
+                return true;
+            }
+            std::string describe() const override {
+                std::string text( "Contains: " );
+                text += ::Catch::Detail::stringify( m_comparator );
+                return text;
+            }
+
+            std::vector<T, AllocComp> const& m_comparator;
+        };
+
+        // Matcher that accepts a vector with the same length as the reference
+        // vector and no position at which the elements compare unequal.
+        // The reference vector is held by reference and must outlive the matcher.
+        template<typename T, typename AllocComp, typename AllocMatch>
+        struct EqualsMatcher : MatcherBase<std::vector<T, AllocMatch>> {
+
+            EqualsMatcher(std::vector<T, AllocComp> const &comparator) : m_comparator( comparator ) {}
+
+            bool match(std::vector<T, AllocMatch> const &v) const override {
+                // !TBD: This currently works if all elements can be compared using !=
+                // - a more general approach would be via a compare template that defaults
+                // to using !=. but could be specialised for, e.g. std::vector<T, Alloc> etc
+                // - then just call that directly
+                std::size_t const count = v.size();
+                if (count != m_comparator.size()) {
+                    return false;
+                }
+                std::size_t pos = 0;
+                while (pos < count) {
+                    if (m_comparator[pos] != v[pos]) {
+                        return false;
+                    }
+                    ++pos;
+                }
+                return true;
+            }
+            // Textual description: "Equals: " followed by the stringified reference vector
+            std::string describe() const override {
+                return "Equals: " + ::Catch::Detail::stringify( m_comparator );
+            }
+            std::vector<T, AllocComp> const& m_comparator;
+        };
+
+        // Matcher that accepts a vector with the same length as the reference
+        // vector whose elements all compare approximately equal, position by
+        // position, using the Catch::Detail::Approx object held in `approx`.
+        // The reference vector is held by reference and must outlive the matcher.
+        template<typename T, typename AllocComp, typename AllocMatch>
+        struct ApproxMatcher : MatcherBase<std::vector<T, AllocMatch>> {
+
+            ApproxMatcher(std::vector<T, AllocComp> const& comparator) : m_comparator( comparator ) {}
+
+            bool match(std::vector<T, AllocMatch> const &v) const override {
+                std::size_t const count = v.size();
+                if (count != m_comparator.size()) {
+                    return false;
+                }
+                std::size_t pos = 0;
+                while (pos < count) {
+                    // `approx(x)` yields the comparison target for the value x
+                    if (m_comparator[pos] != approx(v[pos])) {
+                        return false;
+                    }
+                    ++pos;
+                }
+                return true;
+            }
+            // Textual description: "is approx: " followed by the stringified reference vector
+            std::string describe() const override {
+                return "is approx: " + ::Catch::Detail::stringify( m_comparator );
+            }
+            // The three setters below forward to the Approx object and return
+            // *this so that calls can be chained.
+            template <typename = typename std::enable_if<std::is_constructible<double, T>::value>::type>
+            ApproxMatcher& epsilon( T const& newEpsilon ) {
+                approx.epsilon(newEpsilon);
+                return *this;
+            }
+            template <typename = typename std::enable_if<std::is_constructible<double, T>::value>::type>
+            ApproxMatcher& margin( T const& newMargin ) {
+                approx.margin(newMargin);
+                return *this;
+            }
+            template <typename = typename std::enable_if<std::is_constructible<double, T>::value>::type>
+            ApproxMatcher& scale( T const& newScale ) {
+                approx.scale(newScale);
+                return *this;
+            }
+
+            std::vector<T, AllocComp> const& m_comparator;
+            // mutable because match() is const but invokes approx's call operator
+            mutable Catch::Detail::Approx approx = Catch::Detail::Approx::custom();
+        };
+
+        // Matcher that accepts a vector which is a permutation of the target
+        // vector. The target is held by reference and must outlive the matcher.
+        template<typename T, typename AllocComp, typename AllocMatch>
+        struct UnorderedEqualsMatcher : MatcherBase<std::vector<T, AllocMatch>> {
+            UnorderedEqualsMatcher(std::vector<T, AllocComp> const& target) : m_target(target) {}
+            bool match(std::vector<T, AllocMatch> const& vec) const override {
+                // The three-iterator is_permutation assumes both ranges have the
+                // same length, so the size comparison has to come first.
+                return m_target.size() == vec.size()
+                    && std::is_permutation(m_target.begin(), m_target.end(), vec.begin());
+            }
+
+            // Textual description: "UnorderedEquals: " followed by the stringified target
+            std::string describe() const override {
+                return "UnorderedEquals: " + ::Catch::Detail::stringify(m_target);
+            }
+        private:
+            std::vector<T, AllocComp> const& m_target;
+        };
+
+    } // namespace Vector
+
+    // The following functions create the actual matcher objects.
+    // This allows the types to be inferred
+
+    // Builds a Vector::ContainsMatcher for `comparator`, deducing T and AllocComp.
+    template<typename T, typename AllocComp = std::allocator<T>, typename AllocMatch = AllocComp>
+    Vector::ContainsMatcher<T, AllocComp, AllocMatch> Contains( std::vector<T, AllocComp> const& comparator ) {
+        using Matcher = Vector::ContainsMatcher<T, AllocComp, AllocMatch>;
+        return Matcher( comparator );
+    }
+
+    // Builds a Vector::ContainsElementMatcher looking for the single value `comparator`.
+    template<typename T, typename Alloc = std::allocator<T>>
+    Vector::ContainsElementMatcher<T, Alloc> VectorContains( T const& comparator ) {
+        using Matcher = Vector::ContainsElementMatcher<T, Alloc>;
+        return Matcher( comparator );
+    }
+
+    // Builds a Vector::EqualsMatcher for `comparator`, deducing T and AllocComp.
+    template<typename T, typename AllocComp = std::allocator<T>, typename AllocMatch = AllocComp>
+    Vector::EqualsMatcher<T, AllocComp, AllocMatch> Equals( std::vector<T, AllocComp> const& comparator ) {
+        using Matcher = Vector::EqualsMatcher<T, AllocComp, AllocMatch>;
+        return Matcher( comparator );
+    }
+
+    // Builds a Vector::ApproxMatcher for `comparator`, deducing T and AllocComp.
+    template<typename T, typename AllocComp = std::allocator<T>, typename AllocMatch = AllocComp>
+    Vector::ApproxMatcher<T, AllocComp, AllocMatch> Approx( std::vector<T, AllocComp> const& comparator ) {
+        using Matcher = Vector::ApproxMatcher<T, AllocComp, AllocMatch>;
+        return Matcher( comparator );
+    }
+
+    // Builds a Vector::UnorderedEqualsMatcher for `target`, deducing T and AllocComp.
+    template<typename T, typename AllocComp = std::allocator<T>, typename AllocMatch = AllocComp>
+    Vector::UnorderedEqualsMatcher<T, AllocComp, AllocMatch> UnorderedEquals(std::vector<T, AllocComp> const& target) {
+        using Matcher = Vector::UnorderedEqualsMatcher<T, AllocComp, AllocMatch>;
+        return Matcher( target );
+    }
+
+} // namespace Matchers
+} // namespace Catch
+
+// end catch_matchers_vector.h
+namespace Catch {
+
+    // Expression object pairing an argument with a matcher.
+    // The match is evaluated once, in the constructor, and its outcome is
+    // handed to the ITransientExpression base. The argument is kept by
+    // reference; the matcher is copied.
+    template<typename ArgT, typename MatcherT>
+    class MatchExpr : public ITransientExpression {
+        ArgT const& m_arg;
+        MatcherT m_matcher;
+        StringRef m_matcherString;
+    public:
+        MatchExpr( ArgT const& arg, MatcherT const& matcher, StringRef const& matcherString )
+        :   ITransientExpression{ true, matcher.match( arg ) },
+            m_arg( arg ),
+            m_matcher( matcher ),
+            m_matcherString( matcherString )
+        {}
+
+        // Writes "<stringified arg> <matcher text>". The matcher's own
+        // toString() is preferred; the source text of the matcher expression
+        // is the fallback when toString() yields Detail::unprintableString.
+        void streamReconstructedExpression( std::ostream &os ) const override {
+            auto matcherAsString = m_matcher.toString();
+            os << Catch::Detail::stringify( m_arg ) << ' ';
+            if( !( matcherAsString == Detail::unprintableString ) ) {
+                os << matcherAsString;
+            }
+            else {
+                os << m_matcherString;
+            }
+        }
+    };
+
+    using StringMatcher = Matchers::Impl::MatcherBase<std::string>;
+
+    void handleExceptionMatchExpr( AssertionHandler& handler, StringMatcher const& matcher, StringRef const& matcherString  );
+
+    // Deduces ArgT/MatcherT and wraps the three arguments in a MatchExpr.
+    template<typename ArgT, typename MatcherT>
+    auto makeMatchExpr( ArgT const& arg, MatcherT const& matcher, StringRef const& matcherString  ) -> MatchExpr<ArgT, MatcherT> {
+        using Expr = MatchExpr<ArgT, MatcherT>;
+        return Expr( arg, matcher, matcherString );
+    }
+
+} // namespace Catch
+
+///////////////////////////////////////////////////////////////////////////////
+// Implementation of a matcher-based assertion.
+//   macroName         - string literal naming the user-facing macro (gets the _catch_sr suffix)
+//   matcher           - matcher expression applied to `arg`
+//   resultDisposition - forwarded unchanged to the AssertionHandler
+//   arg               - value under test
+// Creates an AssertionHandler whose captured expression text is "arg, matcher",
+// feeds it the MatchExpr built by Catch::makeMatchExpr inside
+// INTERNAL_CATCH_TRY / INTERNAL_CATCH_CATCH, then runs INTERNAL_CATCH_REACT.
+// Wrapped in do { } while( false ) so that it behaves like a single statement.
+#define INTERNAL_CHECK_THAT( macroName, matcher, resultDisposition, arg ) \
+    do { \
+        Catch::AssertionHandler catchAssertionHandler( macroName##_catch_sr, CATCH_INTERNAL_LINEINFO, CATCH_INTERNAL_STRINGIFY(arg) ", " CATCH_INTERNAL_STRINGIFY(matcher), resultDisposition ); \
+        INTERNAL_CATCH_TRY { \
+            catchAssertionHandler.handleExpr( Catch::makeMatchExpr( arg, matcher, #matcher##_catch_sr ) ); \
+        } INTERNAL_CATCH_CATCH( catchAssertionHandler ) \
+        INTERNAL_CATCH_REACT( catchAssertionHandler ) \
+    } while( false )
+
+///////////////////////////////////////////////////////////////////////////////
+// Implementation of "expression throws exceptionType and the exception satisfies matcher".
+//   macroName         - string literal naming the user-facing macro (gets the _catch_sr suffix)
+//   exceptionType     - exception type that is caught (by const reference)
+//   resultDisposition - forwarded unchanged to the AssertionHandler
+//   matcher           - matcher applied to the caught exception
+//   ...               - the expression expected to throw
+// When the handler's allowThrows() is true the expression is evaluated:
+//   - no exception          -> handleUnexpectedExceptionNotThrown()
+//   - exceptionType caught  -> the exception is matched through makeMatchExpr
+//   - anything else caught  -> handleUnexpectedInflightException()
+// Otherwise handleThrowingCallSkipped() is called and the expression is not evaluated.
+// INTERNAL_CATCH_REACT runs afterwards in every case.
+#define INTERNAL_CATCH_THROWS_MATCHES( macroName, exceptionType, resultDisposition, matcher, ... ) \
+    do { \
+        Catch::AssertionHandler catchAssertionHandler( macroName##_catch_sr, CATCH_INTERNAL_LINEINFO, CATCH_INTERNAL_STRINGIFY(__VA_ARGS__) ", " CATCH_INTERNAL_STRINGIFY(exceptionType) ", " CATCH_INTERNAL_STRINGIFY(matcher), resultDisposition ); \
+        if( catchAssertionHandler.allowThrows() ) \
+            try { \
+                static_cast<void>(__VA_ARGS__ ); \
+                catchAssertionHandler.handleUnexpectedExceptionNotThrown(); \
+            } \
+            catch( exceptionType const& ex ) { \
+                catchAssertionHandler.handleExpr( Catch::makeMatchExpr( ex, matcher, #matcher##_catch_sr ) ); \
+            } \
+            catch( ... ) { \
+                catchAssertionHandler.handleUnexpectedInflightException(); \
+            } \
+        else \
+            catchAssertionHandler.handleThrowingCallSkipped(); \
+        INTERNAL_CATCH_REACT( catchAssertionHandler ) \
+    } while( false )
+
+// end catch_capture_matchers.h
+#endif
+// start catch_generators.hpp
+
+// start catch_interfaces_generatortracker.h
+
+
+#include <memory>
+
+namespace Catch {
+
+    namespace Generators {
+        // Type-erased base of all generators: it exposes only the ability to
+        // advance, so generators of different element types can be owned
+        // through the same pointer type (GeneratorBasePtr).
+        class GeneratorUntypedBase {
+        public:
+            GeneratorUntypedBase() = default;
+            // Declared here, defined out of line.
+            virtual ~GeneratorUntypedBase();
+            // Attempts to move the generator to the next element
+            //
+            // Returns true iff the move succeeded (and a valid element
+            // can be retrieved).
+            virtual bool next() = 0;
+        };
+        using GeneratorBasePtr = std::unique_ptr<GeneratorUntypedBase>;
+
+    } // namespace Generators
+
+    // Interface of an object that can own one generator (see generate(),
+    // which installs a generator on first use and reads it back afterwards).
+    struct IGeneratorTracker {
+        virtual ~IGeneratorTracker();
+        // Whether a generator has been installed via setGenerator()
+        // (presumably; the implementation is not part of this header section).
+        virtual auto hasGenerator() const -> bool = 0;
+        // Access to the owned generator pointer.
+        virtual auto getGenerator() const -> Generators::GeneratorBasePtr const& = 0;
+        // Hands ownership of `generator` to the tracker.
+        virtual void setGenerator( Generators::GeneratorBasePtr&& generator ) = 0;
+    };
+
+} // namespace Catch
+
+// end catch_interfaces_generatortracker.h
+// start catch_enforce.h
+
+#include <exception>
+
+namespace Catch {
+#if !defined(CATCH_CONFIG_DISABLE_EXCEPTIONS)
+    // Throws a copy of `e` (of static type Ex). Never returns.
+    template <typename Ex>
+    [[noreturn]]
+    void throw_exception(Ex const& e) {
+        throw e;
+    }
+#else // ^^ Exceptions are enabled //  Exceptions are disabled vv
+    // With exceptions disabled only this declaration is provided; the
+    // definition lives outside this header section.
+    [[noreturn]]
+    void throw_exception(std::exception const& e);
+#endif
+
+    // Non-returning error helpers, declared here and defined elsewhere.
+    // They are the targets of the CATCH_INTERNAL_ERROR, CATCH_ERROR and
+    // CATCH_RUNTIME_ERROR macros respectively; going by their names they
+    // presumably raise std::logic_error / std::domain_error /
+    // std::runtime_error carrying `msg` - confirm against the definitions.
+    [[noreturn]]
+    void throw_logic_error(std::string const& msg);
+    [[noreturn]]
+    void throw_domain_error(std::string const& msg);
+    [[noreturn]]
+    void throw_runtime_error(std::string const& msg);
+
+} // namespace Catch
+
+// Streams the arguments (joined by the caller with <<) into a temporary
+// Catch::ReusableStringStream and yields the resulting string.
+#define CATCH_MAKE_MSG(...) \
+    (Catch::ReusableStringStream() << __VA_ARGS__).str()
+
+// Reports an internal error via throw_logic_error; the message is prefixed
+// with the current source location and ": Internal Catch2 error: ".
+#define CATCH_INTERNAL_ERROR(...) \
+    Catch::throw_logic_error(CATCH_MAKE_MSG( CATCH_INTERNAL_LINEINFO << ": Internal Catch2 error: " << __VA_ARGS__))
+
+// Reports an error via throw_domain_error with the streamed message.
+#define CATCH_ERROR(...) \
+    Catch::throw_domain_error(CATCH_MAKE_MSG( __VA_ARGS__ ))
+
+// Reports an error via throw_runtime_error with the streamed message.
+#define CATCH_RUNTIME_ERROR(...) \
+    Catch::throw_runtime_error(CATCH_MAKE_MSG( __VA_ARGS__ ))
+
+// Invokes CATCH_ERROR with the streamed message when `condition` is false.
+// Wrapped in do { } while(false) so that it behaves like a single statement.
+#define CATCH_ENFORCE( condition, ... ) \
+    do{ if( !(condition) ) CATCH_ERROR( __VA_ARGS__ ); } while(false)
+
+// end catch_enforce.h
+#include <memory>
+#include <vector>
+#include <cassert>
+
+#include <utility>
+#include <exception>
+
+namespace Catch {
+
+// Exception type thrown by generators in this header (e.g. by
+// FilterGenerator and ChunkGenerator when they cannot produce a first value).
+// Only the pointer to the message is stored - the text is not copied - so
+// the pointed-to string has to outlive the exception object.
+class GeneratorException : public std::exception {
+    const char* const m_msg = "";
+
+public:
+    GeneratorException(const char* msg):
+        m_msg(msg)
+    {}
+
+    // Declared here, defined out of line.
+    const char* what() const noexcept override final;
+};
+
+namespace Generators {
+
+    // !TBD move this into its own location?
+    namespace pf{
+        // Polyfill for std::make_unique (a C++14 library addition):
+        // perfect-forwards `args` to T's constructor and returns the new
+        // object owned by a std::unique_ptr<T>.
+        template<typename T, typename... Args>
+        std::unique_ptr<T> make_unique( Args&&... args ) {
+            std::unique_ptr<T> owner( new T( std::forward<Args>( args )... ) );
+            return owner;
+        }
+    }
+
+    // Typed generator interface: adds access to the current element (get())
+    // on top of the advancing operation (next()) inherited from
+    // GeneratorUntypedBase.
+    template<typename T>
+    struct IGenerator : GeneratorUntypedBase {
+        virtual ~IGenerator() = default;
+
+        // Returns the current element of the generator
+        //
+        // \Precondition The generator is either freshly constructed,
+        // or the last call to `next()` returned true
+        virtual T const& get() const = 0;
+        // Element type, used by generate() to recover T from a generator type
+        using type = T;
+    };
+
+    // Generator holding exactly one value: get() always returns that value
+    // and next() always reports that there is nothing further.
+    template<typename T>
+    class SingleValueGenerator final : public IGenerator<T> {
+    public:
+        SingleValueGenerator(T&& value) : m_value(std::move(value)) {}
+
+        bool next() override {
+            // There is never a second element
+            return false;
+        }
+        T const& get() const override {
+            return m_value;
+        }
+
+    private:
+        T m_value;
+    };
+
+    // Generator over a fixed list of values, yielded in the order given.
+    // The values are copied into an internal vector at construction.
+    template<typename T>
+    class FixedValuesGenerator final : public IGenerator<T> {
+        // Adjacent string literals are concatenated verbatim, so the first
+        // piece needs its own trailing space to keep the words apart.
+        static_assert(!std::is_same<T, bool>::value,
+            "FixedValuesGenerator does not support bools because of std::vector<bool> "
+            "specialization, use SingleValue Generator instead.");
+        std::vector<T> m_values;
+        size_t m_idx = 0;
+    public:
+        FixedValuesGenerator( std::initializer_list<T> values ) : m_values( values ) {}
+
+        // Current element. Must not be called once next() has returned false,
+        // because m_idx is then one past the last element.
+        T const& get() const override {
+            return m_values[m_idx];
+        }
+        // Steps to the following element; false once the list is exhausted.
+        bool next() override {
+            ++m_idx;
+            return m_idx < m_values.size();
+        }
+    };
+
+    // Owning, value-like handle around a heap-allocated IGenerator<T>.
+    // get() and next() simply forward to the wrapped generator.
+    template <typename T>
+    class GeneratorWrapper final {
+    public:
+        GeneratorWrapper(std::unique_ptr<IGenerator<T>> generator):
+            m_generator(std::move(generator))
+        {}
+        bool next() {
+            return m_generator->next();
+        }
+        T const& get() const {
+            return m_generator->get();
+        }
+
+    private:
+        std::unique_ptr<IGenerator<T>> m_generator;
+    };
+
+    // Wraps a single value in a SingleValueGenerator behind a GeneratorWrapper.
+    template <typename T>
+    GeneratorWrapper<T> value(T&& value) {
+        std::unique_ptr<IGenerator<T>> single(
+            pf::make_unique<SingleValueGenerator<T>>(std::forward<T>(value)) );
+        return GeneratorWrapper<T>(std::move(single));
+    }
+    // Wraps a list of values in a FixedValuesGenerator behind a GeneratorWrapper.
+    template <typename T>
+    GeneratorWrapper<T> values(std::initializer_list<T> values) {
+        std::unique_ptr<IGenerator<T>> fixed(
+            pf::make_unique<FixedValuesGenerator<T>>(values) );
+        return GeneratorWrapper<T>(std::move(fixed));
+    }
+
+    // Concatenation of several generators of T: the elements of the first
+    // generator are yielded, then those of the second, and so on.
+    // Constructor arguments may be generators or plain values; plain values
+    // are converted to T where needed and wrapped via value().
+    template<typename T>
+    class Generators : public IGenerator<T> {
+        std::vector<GeneratorWrapper<T>> m_generators;
+        size_t m_current = 0;
+
+        // Already a generator: take ownership as is
+        void populate(GeneratorWrapper<T>&& generator) {
+            m_generators.emplace_back(std::move(generator));
+        }
+        // A value of exactly type T: wrap it in a single-value generator
+        void populate(T&& val) {
+            m_generators.emplace_back(value(std::forward<T>(val)));
+        }
+        // Any other single argument: convert to T first, then wrap
+        template<typename U>
+        void populate(U&& val) {
+            populate(T(std::forward<U>(val)));
+        }
+        // Two or more arguments: handle the head, recurse on the tail
+        template<typename U, typename... Gs>
+        void populate(U&& valueOrGenerator, Gs &&... moreGenerators) {
+            populate(std::forward<U>(valueOrGenerator));
+            populate(std::forward<Gs>(moreGenerators)...);
+        }
+
+    public:
+        template <typename... Gs>
+        Generators(Gs &&... moreGenerators) {
+            m_generators.reserve(sizeof...(Gs));
+            populate(std::forward<Gs>(moreGenerators)...);
+        }
+
+        // Current element of the generator currently being drained
+        T const& get() const override {
+            return m_generators[m_current].get();
+        }
+
+        // Advances the active generator; when it is exhausted the next one
+        // in the list becomes active. False once all of them are used up.
+        bool next() override {
+            const auto total = m_generators.size();
+            if (m_current < total) {
+                if (!m_generators[m_current].next()) {
+                    ++m_current;
+                }
+                return m_current < total;
+            }
+            return false;
+        }
+    };
+
+    // Generator over a list of tuples ("rows"); forwards to values().
+    template<typename... Ts>
+    GeneratorWrapper<std::tuple<Ts...>> table( std::initializer_list<std::tuple<typename std::decay<Ts>::type...>> tuples ) {
+        using Row = std::tuple<Ts...>;
+        return values<Row>( tuples );
+    }
+
+    // Tag type to signal that a generator sequence should convert arguments to a specific type
+    // Carries no data; it is consumed by the makeGenerators( as<T>, ... )
+    // overload, which converts the following value to T.
+    template <typename T>
+    struct as {};
+
+    // makeGenerators overload set: builds a Generators<T> from a mix of
+    // generators and plain values. The element type T is fixed by the first
+    // argument.
+
+    // First argument is already a generator, more arguments follow.
+    template<typename T, typename... Gs>
+    auto makeGenerators( GeneratorWrapper<T>&& generator, Gs &&... moreGenerators ) -> Generators<T> {
+        return Generators<T>(std::move(generator), std::forward<Gs>(moreGenerators)...);
+    }
+    // A single generator.
+    template<typename T>
+    auto makeGenerators( GeneratorWrapper<T>&& generator ) -> Generators<T> {
+        return Generators<T>(std::move(generator));
+    }
+    // First argument is a plain value: wrap it with value() and dispatch
+    // to the generator-first overloads.
+    template<typename T, typename... Gs>
+    auto makeGenerators( T&& val, Gs &&... moreGenerators ) -> Generators<T> {
+        return makeGenerators( value( std::forward<T>( val ) ), std::forward<Gs>( moreGenerators )... );
+    }
+    // Leading as<T> tag: the first value is converted to T before wrapping.
+    template<typename T, typename U, typename... Gs>
+    auto makeGenerators( as<T>, U&& val, Gs &&... moreGenerators ) -> Generators<T> {
+        return makeGenerators( value( T( std::forward<U>( val ) ) ), std::forward<Gs>( moreGenerators )... );
+    }
+
+    // Returns the tracker associated with the given generator name and source
+    // location. Declared here, defined elsewhere.
+    auto acquireGeneratorTracker( StringRef generatorName, SourceLineInfo const& lineInfo ) -> IGeneratorTracker&;
+
+    // Entry point behind the GENERATE* macros.
+    //   generatorName       - name passed on to acquireGeneratorTracker
+    //   lineInfo            - source location passed on to acquireGeneratorTracker
+    //   generatorExpression - callable returning the generator to use
+    // Looks up the tracker for this name/location; if it has no generator
+    // yet, generatorExpression() is invoked and its result is installed.
+    // Returns the current element of the tracker's generator.
+    template<typename L>
+    // Note: The type after -> is weird, because VS2015 cannot parse
+    //       the expression used in the typedef inside, when it is in
+    //       return type. Yeah.
+    auto generate( StringRef generatorName, SourceLineInfo const& lineInfo, L const& generatorExpression ) -> decltype(std::declval<decltype(generatorExpression())>().get()) {
+        using UnderlyingType = typename decltype(generatorExpression())::type;
+
+        IGeneratorTracker& tracker = acquireGeneratorTracker( generatorName, lineInfo );
+        if (!tracker.hasGenerator()) {
+            tracker.setGenerator(pf::make_unique<Generators<UnderlyingType>>(generatorExpression()));
+        }
+
+        // The tracker stores a type-erased pointer; cast it back to the typed
+        // interface (this function always installs a Generators<UnderlyingType>).
+        auto const& generator = static_cast<IGenerator<UnderlyingType> const&>( *tracker.getGenerator() );
+        return generator.get();
+    }
+
+} // namespace Generators
+} // namespace Catch
+
+// The three macros below call Catch::Generators::generate with a unique
+// generator name, the current source location and a lambda that builds the
+// generator via makeGenerators( __VA_ARGS__ ). They differ only in the
+// lambda's capture list.
+
+// No captures: the arguments cannot refer to local variables.
+#define GENERATE( ... ) \
+    Catch::Generators::generate( INTERNAL_CATCH_STRINGIZE(INTERNAL_CATCH_UNIQUE_NAME(generator)), \
+                                 CATCH_INTERNAL_LINEINFO, \
+                                 [ ]{ using namespace Catch::Generators; return makeGenerators( __VA_ARGS__ ); } ) //NOLINT(google-build-using-namespace)
+// Captures by copy ([=]).
+#define GENERATE_COPY( ... ) \
+    Catch::Generators::generate( INTERNAL_CATCH_STRINGIZE(INTERNAL_CATCH_UNIQUE_NAME(generator)), \
+                                 CATCH_INTERNAL_LINEINFO, \
+                                 [=]{ using namespace Catch::Generators; return makeGenerators( __VA_ARGS__ ); } ) //NOLINT(google-build-using-namespace)
+// Captures by reference ([&]).
+#define GENERATE_REF( ... ) \
+    Catch::Generators::generate( INTERNAL_CATCH_STRINGIZE(INTERNAL_CATCH_UNIQUE_NAME(generator)), \
+                                 CATCH_INTERNAL_LINEINFO, \
+                                 [&]{ using namespace Catch::Generators; return makeGenerators( __VA_ARGS__ ); } ) //NOLINT(google-build-using-namespace)
+
+// end catch_generators.hpp
+// start catch_generators_generic.hpp
+
+namespace Catch {
+namespace Generators {
+
+    // Adapter yielding at most `target` elements of the wrapped generator
+    // (fewer when the wrapped generator runs out earlier).
+    template <typename T>
+    class TakeGenerator : public IGenerator<T> {
+        GeneratorWrapper<T> m_generator;
+        size_t m_returned = 0;
+        size_t m_target;
+    public:
+        TakeGenerator(size_t target, GeneratorWrapper<T>&& generator):
+            m_generator(std::move(generator)),
+            m_target(target)
+        {
+            assert(target != 0 && "Empty generators are not allowed");
+        }
+        T const& get() const override {
+            return m_generator.get();
+        }
+        bool next() override {
+            ++m_returned;
+            if (m_returned < m_target) {
+                if (m_generator.next()) {
+                    return true;
+                }
+                // If the underlying generator does not contain enough values
+                // then we cut short as well
+                m_returned = m_target;
+            }
+            return false;
+        }
+    };
+
+    // Limits `generator` to at most `target` elements (see TakeGenerator).
+    template <typename T>
+    GeneratorWrapper<T> take(size_t target, GeneratorWrapper<T>&& generator) {
+        std::unique_ptr<IGenerator<T>> limited(
+            pf::make_unique<TakeGenerator<T>>(target, std::move(generator)) );
+        return GeneratorWrapper<T>(std::move(limited));
+    }
+
+    // Adapter yielding only those elements of the wrapped generator for
+    // which the predicate returns true.
+    template <typename T, typename Predicate>
+    class FilterGenerator : public IGenerator<T> {
+        GeneratorWrapper<T> m_generator;
+        Predicate m_predicate;
+    public:
+        template <typename P = Predicate>
+        FilterGenerator(P&& pred, GeneratorWrapper<T>&& generator):
+            m_generator(std::move(generator)),
+            m_predicate(std::forward<P>(pred))
+        {
+            // The generator has to start out positioned on an accepted
+            // element. It might happen that there are no values that pass
+            // the filter. In that case we throw an exception.
+            const bool first_accepted = m_predicate(m_generator.get());
+            if (!first_accepted && !next()) {
+                Catch::throw_exception(GeneratorException("No valid value found in filtered generator"));
+            }
+        }
+
+        T const& get() const override {
+            return m_generator.get();
+        }
+
+        // Advances the wrapped generator until it sits on an accepted
+        // element; false when it runs out before one is found.
+        bool next() override {
+            while (m_generator.next()) {
+                if (m_predicate(m_generator.get())) {
+                    return true;
+                }
+            }
+            return false;
+        }
+    };
+
+    // Restricts `generator` to the elements accepted by `pred` (see FilterGenerator).
+    template <typename T, typename Predicate>
+    GeneratorWrapper<T> filter(Predicate&& pred, GeneratorWrapper<T>&& generator) {
+        std::unique_ptr<IGenerator<T>> filtered(
+            pf::make_unique<FilterGenerator<T, Predicate>>(std::forward<Predicate>(pred), std::move(generator)) );
+        return GeneratorWrapper<T>(std::move(filtered));
+    }
+
+    // Adapter that replays the wrapped generator's sequence `repeats` times.
+    // The first pass reads from the wrapped generator and records every
+    // element handed out by get(); later passes are served from that record.
+    template <typename T>
+    class RepeatGenerator : public IGenerator<T> {
+        // Adjacent string literals are concatenated verbatim, so the first
+        // piece needs its own trailing space to keep the words apart.
+        static_assert(!std::is_same<T, bool>::value,
+            "RepeatGenerator currently does not support bools "
+            "because of std::vector<bool> specialization");
+        GeneratorWrapper<T> m_generator;
+        // Elements seen during the first pass (filled lazily from get())
+        mutable std::vector<T> m_returned;
+        // True once the wrapped generator's current element has been stored
+        // in m_returned. Prevents repeated get() calls on the same element
+        // from storing it more than once.
+        mutable bool m_current_cached = false;
+        size_t m_target_repeats;
+        size_t m_current_repeat = 0;
+        size_t m_repeat_index = 0;
+    public:
+        RepeatGenerator(size_t repeats, GeneratorWrapper<T>&& generator):
+            m_generator(std::move(generator)),
+            m_target_repeats(repeats)
+        {
+            assert(m_target_repeats > 0 && "Repeat generator must repeat at least once");
+        }
+
+        // Current element. During the first pass the element is recorded on
+        // the first call for each position; further calls before next()
+        // return the already recorded copy.
+        T const& get() const override {
+            if (m_current_repeat == 0) {
+                if (!m_current_cached) {
+                    m_returned.push_back(m_generator.get());
+                    m_current_cached = true;
+                }
+                return m_returned.back();
+            }
+            return m_returned[m_repeat_index];
+        }
+
+        bool next() override {
+            // There are 2 basic cases:
+            // 1) We are still reading the generator
+            // 2) We are reading our own cache
+
+            // In the first case, we need to poke the underlying generator.
+            // If it happily moves, we are left in that state, otherwise it is time to start reading from our cache
+            if (m_current_repeat == 0) {
+                const auto success = m_generator.next();
+                if (success) {
+                    // The wrapped generator now sits on a new, unrecorded element
+                    m_current_cached = false;
+                } else {
+                    ++m_current_repeat;
+                }
+                return m_current_repeat < m_target_repeats;
+            }
+
+            // In the second case, we need to move indices forward and check that we haven't run up against the end
+            ++m_repeat_index;
+            if (m_repeat_index == m_returned.size()) {
+                m_repeat_index = 0;
+                ++m_current_repeat;
+            }
+            return m_current_repeat < m_target_repeats;
+        }
+    };
+
+    // Replays `generator` `repeats` times (see RepeatGenerator).
+    template <typename T>
+    GeneratorWrapper<T> repeat(size_t repeats, GeneratorWrapper<T>&& generator) {
+        std::unique_ptr<IGenerator<T>> repeated(
+            pf::make_unique<RepeatGenerator<T>>(repeats, std::move(generator)) );
+        return GeneratorWrapper<T>(std::move(repeated));
+    }
+
+    // Adapter that applies a function to every element of a generator of U
+    // and yields the results as elements of type T.
+    template <typename T, typename U, typename Func>
+    class MapGenerator : public IGenerator<T> {
+        // TBD: provide static assert for mapping function, for friendly error message
+        // NOTE: m_cache is initialised from the two members above it, so the
+        // declaration order of these three members must be kept.
+        GeneratorWrapper<U> m_generator;
+        Func m_function;
+        // To avoid returning dangling reference, we have to save the values
+        T m_cache;
+    public:
+        template <typename F2 = Func>
+        MapGenerator(F2&& function, GeneratorWrapper<U>&& generator) :
+            m_generator(std::move(generator)),
+            m_function(std::forward<F2>(function)),
+            m_cache(m_function(m_generator.get()))
+        {}
+
+        T const& get() const override {
+            return m_cache;
+        }
+        // Advances the source and, on success, recomputes the mapped value
+        bool next() override {
+            if (!m_generator.next()) {
+                return false;
+            }
+            m_cache = m_function(m_generator.get());
+            return true;
+        }
+    };
+
+    // Maps `generator` through `function`; the result element type T is
+    // taken from FunctionReturnType<Func, U> unless given explicitly.
+    template <typename Func, typename U, typename T = FunctionReturnType<Func, U>>
+    GeneratorWrapper<T> map(Func&& function, GeneratorWrapper<U>&& generator) {
+        std::unique_ptr<IGenerator<T>> mapped(
+            pf::make_unique<MapGenerator<T, U, Func>>(std::forward<Func>(function), std::move(generator)) );
+        return GeneratorWrapper<T>(std::move(mapped));
+    }
+
+    // Maps `generator` through `function`; overload for callers that spell
+    // out the result element type T as the first template argument.
+    template <typename T, typename U, typename Func>
+    GeneratorWrapper<T> map(Func&& function, GeneratorWrapper<U>&& generator) {
+        std::unique_ptr<IGenerator<T>> mapped(
+            pf::make_unique<MapGenerator<T, U, Func>>(std::forward<Func>(function), std::move(generator)) );
+        return GeneratorWrapper<T>(std::move(mapped));
+    }
+
+    // Adapter that groups the elements of the wrapped generator into
+    // vectors of `size` elements each. An incomplete trailing group is
+    // not yielded.
+    template <typename T>
+    class ChunkGenerator final : public IGenerator<std::vector<T>> {
+        std::vector<T> m_chunk;
+        size_t m_chunk_size;
+        GeneratorWrapper<T> m_generator;
+        // Not read anywhere inside this class
+        bool m_used_up = false;
+    public:
+        // Fills the first chunk right away; raises GeneratorException via
+        // throw_exception when the source holds fewer than `size` elements.
+        ChunkGenerator(size_t size, GeneratorWrapper<T> generator) :
+            m_chunk_size(size), m_generator(std::move(generator))
+        {
+            m_chunk.reserve(m_chunk_size);
+            if (m_chunk_size == 0) {
+                return;
+            }
+            // The source already sits on its first element
+            m_chunk.push_back(m_generator.get());
+            while (m_chunk.size() < m_chunk_size) {
+                if (!m_generator.next()) {
+                    Catch::throw_exception(GeneratorException("Not enough values to initialize the first chunk"));
+                }
+                m_chunk.push_back(m_generator.get());
+            }
+        }
+        std::vector<T> const& get() const override {
+            return m_chunk;
+        }
+        // Collects the next `m_chunk_size` elements; false as soon as the
+        // source runs dry (the partially filled chunk is then left behind).
+        bool next() override {
+            m_chunk.clear();
+            while (m_chunk.size() < m_chunk_size) {
+                if (!m_generator.next()) {
+                    return false;
+                }
+                m_chunk.push_back(m_generator.get());
+            }
+            return true;
+        }
+    };
+
+    // Groups `generator` into vectors of `size` elements (see ChunkGenerator).
+    template <typename T>
+    GeneratorWrapper<std::vector<T>> chunk(size_t size, GeneratorWrapper<T>&& generator) {
+        using Chunk = std::vector<T>;
+        std::unique_ptr<IGenerator<Chunk>> chunked(
+            pf::make_unique<ChunkGenerator<T>>(size, std::move(generator)) );
+        return GeneratorWrapper<Chunk>(std::move(chunked));
+    }
+
+} // namespace Generators
+} // namespace Catch
+
+// end catch_generators_generic.hpp
+// start catch_generators_specific.hpp
+
+// start catch_context.h
+
+#include <memory>
+
+namespace Catch {
+
+    struct IResultCapture;
+    struct IRunner;
+    struct IConfig;
+    struct IMutableContext;
+
+    using IConfigPtr = std::shared_ptr<IConfig const>;
+
+    // Read access to the objects that make up the current context:
+    // the result capture, the runner and the configuration.
+    struct IContext
+    {
+        virtual ~IContext();
+
+        virtual IResultCapture* getResultCapture() = 0;
+        virtual IRunner* getRunner() = 0;
+        virtual IConfigPtr const& getConfig() const = 0;
+    };
+
+    // Extends IContext with setters for the same three objects and holds
+    // the pointer to the single shared context instance.
+    struct IMutableContext : IContext
+    {
+        virtual ~IMutableContext();
+        virtual void setResultCapture( IResultCapture* resultCapture ) = 0;
+        virtual void setRunner( IRunner* runner ) = 0;
+        virtual void setConfig( IConfigPtr const& config ) = 0;
+
+    private:
+        // Shared instance; reachable only through the two friend functions.
+        static IMutableContext *currentContext;
+        friend IMutableContext& getCurrentMutableContext();
+        friend void cleanUpContext();
+        // Declared here, defined elsewhere; called by
+        // getCurrentMutableContext() while currentContext is still null.
+        static void createContext();
+    };
+
+    // Returns the shared context held in IMutableContext::currentContext,
+    // asking IMutableContext::createContext() for one while the pointer is null.
+    inline IMutableContext& getCurrentMutableContext()
+    {
+        if( IMutableContext::currentContext == nullptr )
+        {
+            IMutableContext::createContext();
+        }
+        // NOLINTNEXTLINE(clang-analyzer-core.uninitialized.UndefReturn)
+        return *IMutableContext::currentContext;
+    }
+
+    // Same object as getCurrentMutableContext(), seen through the read-only interface.
+    inline IContext& getCurrentContext()
+    {
+        IMutableContext& context = getCurrentMutableContext();
+        return context;
+    }
+
+    void cleanUpContext();
+
+    class SimplePcg32;
+    SimplePcg32& rng();
+}
+
+// end catch_context.h
+// start catch_interfaces_config.h
+
+// start catch_option.hpp
+
+namespace Catch {
+
+    // An optional type
+    // Holds either nothing or exactly one T. The T lives inside the
+    // `storage` buffer (built with placement new, destroyed by an explicit
+    // destructor call); `nullableValue` points at it, or is null when empty.
+    template<typename T>
+    class Option {
+    public:
+        Option() : nullableValue( nullptr ) {}
+        Option( T const& _value )
+        : nullableValue( new( storage ) T( _value ) )
+        {}
+        Option( Option const& _other )
+        : nullableValue( nullptr )
+        {
+            if( _other.some() )
+                nullableValue = new( storage ) T( *_other );
+        }
+
+        ~Option() {
+            reset();
+        }
+
+        // Replaces the content with a copy of _other's content (or nothing)
+        Option& operator= ( Option const& _other ) {
+            if( this == &_other )
+                return *this;
+            reset();
+            if( _other.some() )
+                nullableValue = new( storage ) T( *_other );
+            return *this;
+        }
+        // Replaces the content with a copy of _value
+        Option& operator = ( T const& _value ) {
+            reset();
+            nullableValue = new( storage ) T( _value );
+            return *this;
+        }
+
+        // Destroys the held value, if any, leaving the Option empty
+        void reset() {
+            if( nullableValue != nullptr ) {
+                nullableValue->~T();
+                nullableValue = nullptr;
+            }
+        }
+
+        // Unchecked access: the Option must not be empty
+        T& operator*() { return *nullableValue; }
+        T const& operator*() const { return *nullableValue; }
+        T* operator->() { return nullableValue; }
+        const T* operator->() const { return nullableValue; }
+
+        // Copy of the held value, or of defaultValue when empty
+        T valueOr( T const& defaultValue ) const {
+            if( nullableValue != nullptr )
+                return *nullableValue;
+            return defaultValue;
+        }
+
+        bool some() const { return nullableValue != nullptr; }
+        bool none() const { return !some(); }
+
+        bool operator !() const { return none(); }
+        explicit operator bool() const {
+            return nullableValue != nullptr;
+        }
+
+    private:
+        T *nullableValue;
+        alignas(alignof(T)) char storage[sizeof(T)];
+    };
+
+} // end namespace Catch
+
+// end catch_option.hpp
+#include <chrono>
+#include <iosfwd>
+#include <string>
+#include <vector>
+#include <memory>
+
+namespace Catch {
+
+    // Output verbosity levels, numbered upwards from Quiet = 0.
+    enum class Verbosity {
+        Quiet = 0,
+        Normal,
+        High
+    };
+
+    // Warning categories. The non-zero values occupy distinct bits,
+    // so they can be combined with bitwise OR.
+    struct WarnAbout { enum What {
+        Nothing = 0x00,
+        NoAssertions = 0x01,
+        NoTests = 0x02
+    }; };
+
+    // Choice for IConfig::showDurations(): defer to the reporter, or force on/off.
+    struct ShowDurations { enum OrNot {
+        DefaultForReporter,
+        Always,
+        Never
+    }; };
+    // Choice for IConfig::runOrder(): the order in which tests are run.
+    struct RunTests { enum InWhatOrder {
+        InDeclarationOrder,
+        InLexicographicalOrder,
+        InRandomOrder
+    }; };
+    // Choice for IConfig::useColour(): automatic, forced on, or forced off.
+    struct UseColour { enum YesOrNo {
+        Auto,
+        Yes,
+        No
+    }; };
+    // Points at which to wait for a keypress. BeforeStart and BeforeExit
+    // are separate bits; BeforeStartAndExit is their bitwise OR.
+    struct WaitForKeypress { enum When {
+        Never,
+        BeforeStart = 1,
+        BeforeExit = 2,
+        BeforeStartAndExit = BeforeStart | BeforeExit
+    }; };
+
+    class TestSpec;
+
+    // Read-only view of the configuration of a test run. Every member is a
+    // pure virtual const accessor; the type derives from NonCopyable and is
+    // shared through IConfigPtr (std::shared_ptr<IConfig const>).
+    struct IConfig : NonCopyable {
+
+        virtual ~IConfig();
+
+        // General run settings
+        virtual bool allowThrows() const = 0;
+        virtual std::ostream& stream() const = 0;
+        virtual std::string name() const = 0;
+        virtual bool includeSuccessfulResults() const = 0;
+        virtual bool shouldDebugBreak() const = 0;
+        virtual bool warnAboutMissingAssertions() const = 0;
+        virtual bool warnAboutNoTests() const = 0;
+        virtual int abortAfter() const = 0;
+        virtual bool showInvisibles() const = 0;
+        virtual ShowDurations::OrNot showDurations() const = 0;
+        virtual double minDuration() const = 0;
+        virtual TestSpec const& testSpec() const = 0;
+        virtual bool hasTestFilters() const = 0;
+        virtual std::vector<std::string> const& getTestsOrTags() const = 0;
+        virtual RunTests::InWhatOrder runOrder() const = 0;
+        virtual unsigned int rngSeed() const = 0;
+        virtual UseColour::YesOrNo useColour() const = 0;
+        virtual std::vector<std::string> const& getSectionsToRun() const = 0;
+        virtual Verbosity verbosity() const = 0;
+
+        // Benchmark-related settings
+        virtual bool benchmarkNoAnalysis() const = 0;
+        virtual int benchmarkSamples() const = 0;
+        virtual double benchmarkConfidenceInterval() const = 0;
+        virtual unsigned int benchmarkResamples() const = 0;
+        virtual std::chrono::milliseconds benchmarkWarmupTime() const = 0;
+    };
+
+    using IConfigPtr = std::shared_ptr<IConfig const>;
+}
+
+// end catch_interfaces_config.h
+// start catch_random_number_generator.h
+
+#include <cstdint>
+
+namespace Catch {
+
+    // This is a simple implementation of C++11 Uniform Random Number
+    // Generator. It does not provide all operators, because Catch2
+    // does not use it, but it should behave as expected inside stdlib's
+    // distributions.
+    // The implementation is based on the PCG family (http://pcg-random.org)
+    class SimplePcg32 {
+        using state_type = std::uint64_t;
+    public:
+        // 32-bit outputs; (min)() is 0 and (max)() is the all-ones value of
+        // result_type. The names are parenthesised so that a function-like
+        // `min`/`max` macro cannot expand at these declarations.
+        using result_type = std::uint32_t;
+        static constexpr result_type (min)() {
+            return 0;
+        }
+        static constexpr result_type (max)() {
+            return static_cast<result_type>(-1);
+        }
+
+        // Provide some default initial state for the default constructor
+        SimplePcg32():SimplePcg32(0xed743cc4U) {}
+
+        // Explicit, so a bare integer never converts silently to a generator.
+        explicit SimplePcg32(result_type seed_);
+
+        // seed(), discard() and operator() are only declared here; their
+        // definitions live outside this class body.
+        void seed(result_type seed_);
+        void discard(uint64_t skip);
+
+        result_type operator()();
+
+    private:
+        friend bool operator==(SimplePcg32 const& lhs, SimplePcg32 const& rhs);
+        friend bool operator!=(SimplePcg32 const& lhs, SimplePcg32 const& rhs);
+
+        // In theory we also need operator<< and operator>>
+        // In practice we do not use them, so we will skip them for now
+
+        std::uint64_t m_state;
+        // This part of the state determines which "stream" of the numbers
+        // is chosen -- we take it as a constant for Catch2, so we only
+        // need to deal with seeding the main state.
+        // Picked by reading 8 bytes from `/dev/random` :-)
+        // The `<< 1 | 1` forces the constant to be odd.
+        static const std::uint64_t s_inc = (0x13ed0cc53f939476ULL << 1ULL) | 1ULL;
+    };
+
+} // end namespace Catch
+
+// end catch_random_number_generator.h
+#include <random>
+
+namespace Catch {
+namespace Generators {
+
+// Endless generator of floating point values drawn from a
+// std::uniform_real_distribution constructed with (a, b), using the
+// generator object returned by rng().
+template <typename Float>
+class RandomFloatingGenerator final : public IGenerator<Float> {
+public:
+    // Draws the first value right away so that get() is usable immediately
+    // after construction.
+    RandomFloatingGenerator(Float a, Float b)
+        : m_rng(rng())
+        , m_dist(a, b)
+    {
+        m_current_number = m_dist(m_rng);
+    }
+
+    // Most recently drawn value.
+    Float const& get() const override { return m_current_number; }
+
+    // Draws the next value; this generator never runs out, so it always
+    // reports true.
+    bool next() override {
+        m_current_number = m_dist(m_rng);
+        return true;
+    }
+
+private:
+    Catch::SimplePcg32& m_rng;
+    std::uniform_real_distribution<Float> m_dist;
+    Float m_current_number;
+};
+
+// Endless generator of integers drawn from a std::uniform_int_distribution
+// constructed with (a, b), using the generator object returned by rng().
+template <typename Integer>
+class RandomIntegerGenerator final : public IGenerator<Integer> {
+public:
+    // Draws the first value right away so that get() is usable immediately
+    // after construction.
+    RandomIntegerGenerator(Integer a, Integer b)
+        : m_rng(rng())
+        , m_dist(a, b)
+    {
+        m_current_number = m_dist(m_rng);
+    }
+
+    // Most recently drawn value.
+    Integer const& get() const override { return m_current_number; }
+
+    // Draws the next value; this generator never runs out, so it always
+    // reports true.
+    bool next() override {
+        m_current_number = m_dist(m_rng);
+        return true;
+    }
+
+private:
+    Catch::SimplePcg32& m_rng;
+    std::uniform_int_distribution<Integer> m_dist;
+    Integer m_current_number;
+};
+
+// TODO: Ideally this would be also constrained against the various char types,
+//       but I don't expect users to run into that in practice.
+// Integral overload of random(): participates in overload resolution only
+// for integral T other than bool, and wraps a RandomIntegerGenerator<T>
+// built from (a, b).
+template <typename T>
+typename std::enable_if<
+    std::is_integral<T>::value && !std::is_same<T, bool>::value,
+    GeneratorWrapper<T>
+>::type
+random(T a, T b) {
+    using Impl = RandomIntegerGenerator<T>;
+    return GeneratorWrapper<T>(pf::make_unique<Impl>(a, b));
+}
+
+// Floating point overload of random(): participates in overload resolution
+// only for floating point T, and wraps a RandomFloatingGenerator<T> built
+// from (a, b).
+template <typename T>
+typename std::enable_if<
+    std::is_floating_point<T>::value,
+    GeneratorWrapper<T>
+>::type
+random(T a, T b) {
+    using Impl = RandomFloatingGenerator<T>;
+    return GeneratorWrapper<T>(pf::make_unique<Impl>(a, b));
+}
+
+// Generator that starts at `start` and repeatedly adds `step`, stopping once
+// the value reaches or passes `end` (so `end` itself is never produced).
+template <typename T>
+class RangeGenerator final : public IGenerator<T> {
+public:
+    // Preconditions are checked with assert() only: start != end, step != 0,
+    // and the step must point from start towards end.
+    RangeGenerator(T const& start, T const& end, T const& step)
+        : m_current(start)
+        , m_end(end)
+        , m_step(step)
+        , m_positive(m_step > T(0))
+    {
+        assert(m_current != m_end && "Range start and end cannot be equal");
+        assert(m_step != T(0) && "Step size cannot be zero");
+        assert(((m_positive && m_current <= m_end) || (!m_positive && m_current >= m_end)) && "Step moves away from end");
+    }
+
+    // Two-argument form: steps by +1 when start < end, by -1 otherwise.
+    RangeGenerator(T const& start, T const& end)
+        : RangeGenerator(start, end, (start < end) ? T(1) : T(-1))
+    {}
+
+    // Current value of the range.
+    T const& get() const override { return m_current; }
+
+    // Advances by one step and reports whether the new value is still
+    // strictly before the end in the direction of travel.
+    bool next() override {
+        m_current += m_step;
+        if (m_positive) {
+            return m_current < m_end;
+        }
+        return m_current > m_end;
+    }
+
+private:
+    T m_current;
+    T m_end;
+    T m_step;
+    bool m_positive;
+};
+
+// Wraps a RangeGenerator<T> running from `start` towards `end` in increments
+// of `step`. Accepts any arithmetic T except bool.
+template <typename T>
+GeneratorWrapper<T> range(T const& start, T const& end, T const& step) {
+    constexpr bool is_numeric = std::is_arithmetic<T>::value && !std::is_same<T, bool>::value;
+    static_assert(is_numeric, "Type must be numeric");
+    using Impl = RangeGenerator<T>;
+    return GeneratorWrapper<T>(pf::make_unique<Impl>(start, end, step));
+}
+
+// Wraps a RangeGenerator<T> running from `start` towards `end` with the
+// generator's implicit unit step. Accepts any integral T except bool.
+template <typename T>
+GeneratorWrapper<T> range(T const& start, T const& end) {
+    constexpr bool is_integer = std::is_integral<T>::value && !std::is_same<T, bool>::value;
+    static_assert(is_integer, "Type must be an integer");
+    using Impl = RangeGenerator<T>;
+    return GeneratorWrapper<T>(pf::make_unique<Impl>(start, end));
+}
+
+// Generator that copies the elements of an iterator range into a vector at
+// construction time and then yields them one by one, in order.
+template <typename T>
+class IteratorGenerator final : public IGenerator<T> {
+    // Adjacent string literals are concatenated verbatim, so the first one
+    // has to end in a space for the diagnostic to read correctly.
+    static_assert(!std::is_same<T, bool>::value,
+        "IteratorGenerator currently does not support bools "
+        "because of std::vector<bool> specialization");
+
+    std::vector<T> m_elems;
+    size_t m_current = 0;
+public:
+    // Copies [first, last) and reports an empty range through
+    // Catch::throw_exception with a GeneratorException.
+    template <typename InputIterator, typename InputSentinel>
+    IteratorGenerator(InputIterator first, InputSentinel last):m_elems(first, last) {
+        if (m_elems.empty()) {
+            Catch::throw_exception(GeneratorException("IteratorGenerator received no valid values"));
+        }
+    }
+
+    // Element at the current position.
+    T const& get() const override {
+        return m_elems[m_current];
+    }
+
+    // Moves to the next element; false once the position reaches the end of
+    // the stored elements.
+    bool next() override {
+        ++m_current;
+        return m_current != m_elems.size();
+    }
+};
+
+// Wraps an IteratorGenerator over the iterator range [from, to). The element
+// type defaults to the iterator's value_type.
+template <typename InputIterator,
+          typename InputSentinel,
+          typename ResultType = typename std::iterator_traits<InputIterator>::value_type>
+GeneratorWrapper<ResultType> from_range(InputIterator from, InputSentinel to) {
+    using Impl = IteratorGenerator<ResultType>;
+    return GeneratorWrapper<ResultType>(pf::make_unique<Impl>(from, to));
+}
+
+// Wraps an IteratorGenerator over all elements of `cnt`, from cnt.begin() to
+// cnt.end(). The element type defaults to Container::value_type.
+template <typename Container,
+          typename ResultType = typename Container::value_type>
+GeneratorWrapper<ResultType> from_range(Container const& cnt) {
+    using Impl = IteratorGenerator<ResultType>;
+    return GeneratorWrapper<ResultType>(pf::make_unique<Impl>(cnt.begin(), cnt.end()));
+}
+
+} // namespace Generators
+} // namespace Catch
+
+// end catch_generators_specific.hpp
+
+// These files are included here so the single_include script doesn't put them
+// in the conditionally compiled sections
+// start catch_test_case_info.h
+
+#include <string>
+#include <vector>
+#include <memory>
+
+#ifdef __clang__
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wpadded"
+#endif
+
+namespace Catch {
+
+    struct ITestInvoker;
+
+    // Descriptive data for one test case: its names, description, tags,
+    // source location and a set of property flags.
+    struct TestCaseInfo {
+        // Single-bit flags; note that the first used bit is 1 << 1 (bit 0 is
+        // left unused).
+        enum SpecialProperties{
+            None = 0,
+            IsHidden = 1 << 1,
+            ShouldFail = 1 << 2,
+            MayFail = 1 << 3,
+            Throws = 1 << 4,
+            NonPortable = 1 << 5,
+            Benchmark = 1 << 6
+        };
+
+        TestCaseInfo(   std::string const& _name,
+                        std::string const& _className,
+                        std::string const& _description,
+                        std::vector<std::string> const& _tags,
+                        SourceLineInfo const& _lineInfo );
+
+        // Free function granted access to this struct; takes the new tags by value.
+        friend void setTags( TestCaseInfo& testCaseInfo, std::vector<std::string> tags );
+
+        // Property queries; defined outside this struct.
+        bool isHidden() const;
+        bool throws() const;
+        bool okToFail() const;
+        bool expectedToFail() const;
+
+        std::string tagsAsString() const;
+
+        std::string name;
+        std::string className;
+        std::string description;
+        std::vector<std::string> tags;
+        // presumably lower-cased copies of `tags` — confirm against setTags()
+        std::vector<std::string> lcaseTags;
+        SourceLineInfo lineInfo;
+        SpecialProperties properties;
+    };
+
+    // A TestCaseInfo combined with the invoker that runs the test. The invoker
+    // is held through a std::shared_ptr.
+    class TestCase : public TestCaseInfo {
+    public:
+
+        // Takes a raw invoker pointer and the info by rvalue reference.
+        TestCase( ITestInvoker* testCase, TestCaseInfo&& info );
+
+        // Returns a TestCase by value; *this is not modified (const member).
+        TestCase withName( std::string const& _newName ) const;
+
+        void invoke() const;
+
+        TestCaseInfo const& getTestCaseInfo() const;
+
+        bool operator == ( TestCase const& other ) const;
+        bool operator < ( TestCase const& other ) const;
+
+    private:
+        std::shared_ptr<ITestInvoker> test;
+    };
+
+    // Factory for TestCase objects; declared here and defined elsewhere.
+    // Takes the invoker as a raw pointer (the TestCase it returns stores its
+    // invoker in a std::shared_ptr).
+    TestCase makeTestCase(  ITestInvoker* testCase,
+                            std::string const& className,
+                            NameAndTags const& nameAndTags,
+                            SourceLineInfo const& lineInfo );
+}
+
+#ifdef __clang__
+#pragma clang diagnostic pop
+#endif
+
+// end catch_test_case_info.h
+// start catch_interfaces_runner.h
+
+namespace Catch {
+
+    // Minimal runner interface: a virtual destructor plus one pure-virtual
+    // query, aborting().
+    struct IRunner {
+        virtual ~IRunner();
+        virtual bool aborting() const = 0;
+    };
+}
+
+// end catch_interfaces_runner.h
+
+#ifdef __OBJC__
+// start catch_objc.hpp
+
+#import <objc/runtime.h>
+
+#include <string>
+
+// NB. Any general Catch headers included here must be included
+// in catch.hpp first, to make sure they are also included by the single
+// header when it is used without Objective-C
+
+///////////////////////////////////////////////////////////////////////////////
+// This protocol is really only here for (self) documenting purposes, since
+// all its methods are optional.
+@protocol OcFixture
+
+@optional
+
+-(void) setUp;
+-(void) tearDown;
+
+@end
+
+namespace Catch {
+
+    // Test invoker for an Objective-C method, identified by a class and a
+    // selector.
+    class OcMethod : public ITestInvoker {
+
+    public:
+        OcMethod( Class cls, SEL sel ) : m_cls( cls ), m_sel( sel ) {}
+
+        // Creates a fresh instance of the class, then performs setUp, the
+        // stored selector and tearDown through performOptionalSelector, and
+        // finally hands the instance to arcSafeRelease.
+        virtual void invoke() const {
+            id obj = [[m_cls alloc] init];
+
+            performOptionalSelector( obj, @selector(setUp)  );
+            performOptionalSelector( obj, m_sel );
+            performOptionalSelector( obj, @selector(tearDown)  );
+
+            arcSafeRelease( obj );
+        }
+    private:
+        // Private destructor: instances cannot be destroyed through an
+        // OcMethod pointer from outside the class.
+        virtual ~OcMethod() {}
+
+        Class m_cls;
+        SEL m_sel;
+    };
+
+    namespace Detail{
+
+        // Looks up the class method named "Catch_<annotationName>_<testCaseName>"
+        // on `cls` via performOptionalSelector and returns its result as a
+        // UTF-8 std::string, or "" when the call yields nil.
+        inline std::string getAnnotation(   Class cls,
+                                            std::string const& annotationName,
+                                            std::string const& testCaseName ) {
+            NSString* selStr = [[NSString alloc] initWithFormat:@"Catch_%s_%s", annotationName.c_str(), testCaseName.c_str()];
+            SEL sel = NSSelectorFromString( selStr );
+            // The selector has been resolved, so the temporary name string
+            // is no longer needed.
+            arcSafeRelease( selStr );
+            id value = performOptionalSelector( cls, sel );
+            if( value )
+                return [(NSString*)value UTF8String];
+            return "";
+        }
+    }
+
+    // Walks every class known to the Objective-C runtime and registers each
+    // method whose selector name starts with "Catch_TestCase_" as a test case.
+    // The test's name and description are obtained through
+    // Detail::getAnnotation with the "Name" and "Description" annotations.
+    // Returns the number of test methods that were registered.
+    inline std::size_t registerTestMethods() {
+        std::size_t noTestMethods = 0;
+        // First call only asks for the number of registered classes.
+        int noClasses = objc_getClassList( nullptr, 0 );
+
+        Class* classes = (CATCH_UNSAFE_UNRETAINED Class *)malloc( sizeof(Class) * noClasses);
+        // Without a buffer there is nothing that can be scanned safely.
+        if( !classes )
+            return noTestMethods;
+        objc_getClassList( classes, noClasses );
+
+        for( int c = 0; c < noClasses; c++ ) {
+            Class cls = classes[c];
+            {
+                u_int count;
+                Method* methods = class_copyMethodList( cls, &count );
+                for( u_int m = 0; m < count ; m++ ) {
+                    SEL selector = method_getName(methods[m]);
+                    std::string methodName = sel_getName(selector);
+                    if( startsWith( methodName, "Catch_TestCase_" ) ) {
+                        // 15 == length of the "Catch_TestCase_" prefix
+                        std::string testCaseName = methodName.substr( 15 );
+                        std::string name = Detail::getAnnotation( cls, "Name", testCaseName );
+                        std::string desc = Detail::getAnnotation( cls, "Description", testCaseName );
+                        const char* className = class_getName( cls );
+
+                        getMutableRegistryHub().registerTest( makeTestCase( new OcMethod( cls, selector ), className, NameAndTags( name.c_str(), desc.c_str() ), SourceLineInfo("",0) ) );
+                        noTestMethods++;
+                    }
+                }
+                // class_copyMethodList hands over ownership of the array.
+                free(methods);
+            }
+        }
+        // The class list buffer was malloc'ed above and must be released too.
+        free(classes);
+        return noTestMethods;
+    }
+
+#if !defined(CATCH_CONFIG_DISABLE_MATCHERS)
+
+    namespace Matchers {
+        namespace Impl {
+        namespace NSStringMatchers {
+
+            // Common base of the NSString matchers: owns a private copy of the
+            // string that derived matchers compare against.
+            struct StringHolder : MatcherBase<NSString*>{
+                StringHolder( NSString* substr ) : m_substr( [substr copy] ){}
+                StringHolder( StringHolder const& other ) : m_substr( [other.m_substr copy] ){}
+                // Default construction is kept for backward compatibility and
+                // now starts from nil instead of releasing an uninitialised
+                // pointer.
+                StringHolder() : m_substr( nil ) {}
+                // Gives up the copy taken by the constructors.
+                // NOTE(review): no copy assignment is defined here — confirm the
+                // base class disables assignment, otherwise an assigned holder
+                // would release the same string twice when built without ARC.
+                ~StringHolder() {
+                    arcSafeRelease( m_substr );
+                }
+
+                // Placeholder implementation; every derived matcher overrides it.
+                bool match( NSString* str ) const override {
+                    return false;
+                }
+
+                NSString* CATCH_ARC_STRONG m_substr;
+            };
+
+            // Matcher that accepts strings equal to the held string.
+            struct Equals : StringHolder {
+                Equals( NSString* substr ) : StringHolder( substr ){}
+
+                // The nil guard rejects a nil `str` unless the held string is
+                // nil as well; the comparison itself is -isEqualToString:.
+                bool match( NSString* str ) const override {
+                    return  (str != nil || m_substr == nil ) &&
+                            [str isEqualToString:m_substr];
+                }
+
+                std::string describe() const override {
+                    return "equals string: " + Catch::Detail::stringify( m_substr );
+                }
+            };
+
+            // Matcher that accepts strings in which the held string occurs.
+            struct Contains : StringHolder {
+                Contains( NSString* substr ) : StringHolder( substr ){}
+
+                // Matches when -rangeOfString: finds the held string anywhere
+                // (its location is not NSNotFound).
+                bool match( NSString* str ) const override {
+                    return  (str != nil || m_substr == nil ) &&
+                            [str rangeOfString:m_substr].location != NSNotFound;
+                }
+
+                std::string describe() const override {
+                    return "contains string: " + Catch::Detail::stringify( m_substr );
+                }
+            };
+
+            // Matcher that accepts strings beginning with the held string.
+            struct StartsWith : StringHolder {
+                StartsWith( NSString* substr ) : StringHolder( substr ){}
+
+                // Matches when the first occurrence of the held string is at
+                // index 0.
+                bool match( NSString* str ) const override {
+                    return  (str != nil || m_substr == nil ) &&
+                            [str rangeOfString:m_substr].location == 0;
+                }
+
+                std::string describe() const override {
+                    return "starts with: " + Catch::Detail::stringify( m_substr );
+                }
+            };
+            // Matcher that accepts strings ending with the held string.
+            struct EndsWith : StringHolder {
+                EndsWith( NSString* substr ) : StringHolder( substr ){}
+
+                // The search is anchored at the end of `str` and runs backwards,
+                // so only an occurrence that really is the suffix is found. A
+                // plain forward search reports the FIRST occurrence and would
+                // wrongly reject e.g. "abab" ending with "ab".
+                bool match( NSString* str ) const override {
+                    return  (str != nil || m_substr == nil ) &&
+                            [str rangeOfString:m_substr options:(NSAnchoredSearch | NSBackwardsSearch)].location == [str length] - [m_substr length];
+                }
+
+                std::string describe() const override {
+                    return "ends with: " + Catch::Detail::stringify( m_substr );
+                }
+            };
+
+        } // namespace NSStringMatchers
+        } // namespace Impl
+
+        // Factory functions for the NSString matchers. Each one returns the
+        // matcher of the same name from Impl::NSStringMatchers by value,
+        // constructed from `substr`.
+        inline Impl::NSStringMatchers::Equals
+            Equals( NSString* substr ){ return Impl::NSStringMatchers::Equals( substr ); }
+
+        inline Impl::NSStringMatchers::Contains
+            Contains( NSString* substr ){ return Impl::NSStringMatchers::Contains( substr ); }
+
+        inline Impl::NSStringMatchers::StartsWith
+            StartsWith( NSString* substr ){ return Impl::NSStringMatchers::StartsWith( substr ); }
+
+        inline Impl::NSStringMatchers::EndsWith
+            EndsWith( NSString* substr ){ return Impl::NSStringMatchers::EndsWith( substr ); }
+
+    } // namespace Matchers
+
+    using namespace Matchers;
+
+#endif // CATCH_CONFIG_DISABLE_MATCHERS
+
+} // namespace Catch
+
+///////////////////////////////////////////////////////////////////////////////
+// OC_TEST_CASE( name, desc ) expands to three Objective-C method headers whose
+// names end in the current line number (__LINE__):
+//   +Catch_Name_test_<line>         returns `name` as an NSString literal
+//   +Catch_Description_test_<line>  returns `desc` as an NSString literal
+//   -Catch_TestCase_test_<line>     the test itself; its body follows the macro
+// `name` and `desc` must be string literals because they are prefixed with `@`.
+// __LINE__ is passed through OC_TEST_CASE2 so that it is expanded to a number
+// before OC_MAKE_UNIQUE_NAME pastes it onto the root identifier.
+#define OC_MAKE_UNIQUE_NAME( root, uniqueSuffix ) root##uniqueSuffix
+#define OC_TEST_CASE2( name, desc, uniqueSuffix ) \
++(NSString*) OC_MAKE_UNIQUE_NAME( Catch_Name_test_, uniqueSuffix ) \
+{ \
+return @ name; \
+} \
++(NSString*) OC_MAKE_UNIQUE_NAME( Catch_Description_test_, uniqueSuffix ) \
+{ \
+return @ desc; \
+} \
+-(void) OC_MAKE_UNIQUE_NAME( Catch_TestCase_test_, uniqueSuffix )
+
+#define OC_TEST_CASE( name, desc ) OC_TEST_CASE2( name, desc, __LINE__ )
+
+// end catch_objc.hpp
+#endif
+
+// Benchmarking needs the externally-facing parts of reporters to work
+#if defined(CATCH_CONFIG_EXTERNAL_INTERFACES) || defined(CATCH_CONFIG_ENABLE_BENCHMARKING)
+// start catch_external_interfaces.h
+
+// start catch_reporter_bases.hpp
+
+// start catch_interfaces_reporter.h
+
+// start catch_config.hpp
+
+// start catch_test_spec_parser.h
+
+#ifdef __clang__
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wpadded"
+#endif
+
+// start catch_test_spec.h
+
+#ifdef __clang__
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wpadded"
+#endif
+
+// start catch_wildcard_pattern.h
+
+namespace Catch
+{
+    // String pattern with an optional wildcard at the start and/or the end,
+    // matched with a configurable case sensitivity.
+    class WildcardPattern {
+        // Bit flags: WildcardAtBothEnds is WildcardAtStart | WildcardAtEnd.
+        enum WildcardPosition {
+            NoWildcard = 0,
+            WildcardAtStart = 1,
+            WildcardAtEnd = 2,
+            WildcardAtBothEnds = WildcardAtStart | WildcardAtEnd
+        };
+
+    public:
+
+        WildcardPattern( std::string const& pattern, CaseSensitive::Choice caseSensitivity );
+        virtual ~WildcardPattern() = default;
+        // Virtual, so derived classes can replace the matching logic.
+        virtual bool matches( std::string const& str ) const;
+
+    private:
+        std::string normaliseString( std::string const& str ) const;
+        CaseSensitive::Choice m_caseSensitivity;
+        WildcardPosition m_wildcard = NoWildcard;
+        std::string m_pattern;
+    };
+}
+
+// end catch_wildcard_pattern.h
+#include <string>
+#include <vector>
+#include <memory>
+
+namespace Catch {
+
+    struct IConfig;
+
+    // A test specification: a list of filters, each holding a list of patterns
+    // that are matched against TestCaseInfo objects. TestSpecParser is a friend
+    // and so has access to the private members declared below.
+    class TestSpec {
+        // Abstract base of all patterns; stores an immutable name.
+        class Pattern {
+        public:
+            explicit Pattern( std::string const& name );
+            virtual ~Pattern();
+            virtual bool matches( TestCaseInfo const& testCase ) const = 0;
+            std::string const& name() const;
+        private:
+            std::string const m_name;
+        };
+        using PatternPtr = std::shared_ptr<Pattern>;
+
+        // Pattern backed by a WildcardPattern.
+        class NamePattern : public Pattern {
+        public:
+            explicit NamePattern( std::string const& name, std::string const& filterString );
+            bool matches( TestCaseInfo const& testCase ) const override;
+        private:
+            WildcardPattern m_wildcardPattern;
+        };
+
+        // Pattern backed by a single tag string.
+        class TagPattern : public Pattern {
+        public:
+            explicit TagPattern( std::string const& tag, std::string const& filterString );
+            bool matches( TestCaseInfo const& testCase ) const override;
+        private:
+            std::string m_tag;
+        };
+
+        // Pattern that wraps another pattern.
+        // presumably inverts the wrapped pattern's result — confirm in the definition
+        class ExcludedPattern : public Pattern {
+        public:
+            explicit ExcludedPattern( PatternPtr const& underlyingPattern );
+            bool matches( TestCaseInfo const& testCase ) const override;
+        private:
+            PatternPtr m_underlyingPattern;
+        };
+
+        // One filter: a group of patterns evaluated together by matches().
+        struct Filter {
+            std::vector<PatternPtr> m_patterns;
+
+            bool matches( TestCaseInfo const& testCase ) const;
+            std::string name() const;
+        };
+
+    public:
+        // Result entry of matchesByFilter(): a name plus non-owning pointers
+        // to test cases.
+        struct FilterMatch {
+            std::string name;
+            std::vector<TestCase const*> tests;
+        };
+        using Matches = std::vector<FilterMatch>;
+        using vectorStrings = std::vector<std::string>;
+
+        bool hasFilters() const;
+        bool matches( TestCaseInfo const& testCase ) const;
+        Matches matchesByFilter( std::vector<TestCase> const& testCases, IConfig const& config ) const;
+        const vectorStrings & getInvalidArgs() const;
+
+    private:
+        std::vector<Filter> m_filters;
+        std::vector<std::string> m_invalidArgs;
+        friend class TestSpecParser;
+    };
+}
+
+#ifdef __clang__
+#pragma clang diagnostic pop
+#endif
+
+// end catch_test_spec.h
+// start catch_interfaces_tag_alias_registry.h
+
+#include <string>
+
+namespace Catch {
+
+    struct TagAlias;
+
+    // Interface for looking up tag aliases and expanding them inside a test
+    // spec string. get() is a static accessor returning a registry by const
+    // reference.
+    struct ITagAliasRegistry {
+        virtual ~ITagAliasRegistry();
+        // Nullptr if not present
+        virtual TagAlias const* find( std::string const& alias ) const = 0;
+        // Takes the unexpanded spec and returns a new string by value.
+        virtual std::string expandAliases( std::string const& unexpandedTestSpec ) const = 0;
+
+        static ITagAliasRegistry const& get();
+    };
+
+} // end namespace Catch
+
+// end catch_interfaces_tag_alias_registry.h
+namespace Catch {
+
+    // Builds a TestSpec from textual arguments. parse() returns *this by
+    // reference, so calls can be chained before testSpec() is read.
+    class TestSpecParser {
+        // Parser state, one mode at a time.
+        enum Mode{ None, Name, QuotedName, Tag, EscapedName };
+        Mode m_mode = None;
+        Mode lastMode = None;
+        bool m_exclusion = false;
+        std::size_t m_pos = 0;
+        std::size_t m_realPatternPos = 0;
+        std::string m_arg;
+        std::string m_substring;
+        std::string m_patternName;
+        std::vector<std::size_t> m_escapeChars;
+        TestSpec::Filter m_currentFilter;
+        TestSpec m_testSpec;
+        // Non-owning; set from the reference passed to the constructor
+        // (presumably — confirm in the constructor definition).
+        ITagAliasRegistry const* m_tagAliases = nullptr;
+
+    public:
+        TestSpecParser( ITagAliasRegistry const& tagAliases );
+
+        TestSpecParser& parse( std::string const& arg );
+        TestSpec testSpec();
+
+    private:
+        bool visitChar( char c );
+        void startNewMode( Mode mode );
+        bool processNoneChar( char c );
+        void processNameChar( char c );
+        bool processOtherChar( char c );
+        void endMode();
+        void escape();
+        bool isControlChar( char c ) const;
+        void saveLastMode();
+        void revertBackToLastMode();
+        void addFilter();
+        bool separate();
+
+        // Handles common preprocessing of the pattern for name/tag patterns
+        std::string preprocessPattern();
+        // Adds the current pattern as a test name
+        void addNamePattern();
+        // Adds the current pattern as a tag
+        void addTagPattern();
+
+        // Appends `c` to both m_substring and m_patternName and advances
+        // m_realPatternPos by one.
+        inline void addCharToPattern(char c) {
+            m_substring += c;
+            m_patternName += c;
+            m_realPatternPos++;
+        }
+
+    };
+    // Free-function entry point that turns `arg` into a TestSpec; declared
+    // here, defined elsewhere (presumably via TestSpecParser — confirm).
+    TestSpec parseTestSpec( std::string const& arg );
+
+} // namespace Catch
+
+#ifdef __clang__
+#pragma clang diagnostic pop
+#endif
+
+// end catch_test_spec_parser.h
+// Libstdc++ doesn't like incomplete classes for unique_ptr
+
+#include <memory>
+#include <vector>
+#include <string>
+
+#ifndef CATCH_CONFIG_CONSOLE_WIDTH
+#define CATCH_CONFIG_CONSOLE_WIDTH 80
+#endif
+
+namespace Catch {
+
+    struct IStream;
+
+    // Plain bag of settings with in-class default values. Config is
+    // constructed from a ConfigData and keeps one as its m_data member.
+    struct ConfigData {
+        // Listing modes.
+        bool listTests = false;
+        bool listTags = false;
+        bool listReporters = false;
+        bool listTestNamesOnly = false;
+
+        bool showSuccessfulTests = false;
+        bool shouldDebugBreak = false;
+        bool noThrow = false;
+        bool showHelp = false;
+        bool showInvisibles = false;
+        bool filenamesAsTags = false;
+        bool libIdentify = false;
+
+        int abortAfter = -1;
+        unsigned int rngSeed = 0;
+
+        // Benchmark settings. The warm-up time is stored as a raw
+        // std::chrono::milliseconds tick count.
+        bool benchmarkNoAnalysis = false;
+        unsigned int benchmarkSamples = 100;
+        double benchmarkConfidenceInterval = 0.95;
+        unsigned int benchmarkResamples = 100000;
+        std::chrono::milliseconds::rep benchmarkWarmupTime = 100;
+
+        Verbosity verbosity = Verbosity::Normal;
+        WarnAbout::What warnings = WarnAbout::Nothing;
+        ShowDurations::OrNot showDurations = ShowDurations::DefaultForReporter;
+        double minDuration = -1;
+        RunTests::InWhatOrder runOrder = RunTests::InDeclarationOrder;
+        UseColour::YesOrNo useColour = UseColour::Auto;
+        WaitForKeypress::When waitForKeypress = WaitForKeypress::Never;
+
+        std::string outputFilename;
+        std::string name;
+        std::string processName;
+        // The reporter defaults to "console" unless CATCH_CONFIG_DEFAULT_REPORTER
+        // was defined beforehand; the macro is undefined again right after it
+        // has been used for the initialiser.
+#ifndef CATCH_CONFIG_DEFAULT_REPORTER
+#define CATCH_CONFIG_DEFAULT_REPORTER "console"
+#endif
+        std::string reporterName = CATCH_CONFIG_DEFAULT_REPORTER;
+#undef CATCH_CONFIG_DEFAULT_REPORTER
+
+        std::vector<std::string> testsOrTags;
+        std::vector<std::string> sectionsToRun;
+    };
+
+    // Concrete IConfig. It is constructed from a ConfigData, which it stores
+    // in m_data, and additionally owns an output stream and a TestSpec.
+    // Members marked `override` implement IConfig; the remaining accessors
+    // are specific to this class.
+    class Config : public IConfig {
+    public:
+
+        Config() = default;
+        Config( ConfigData const& data );
+        virtual ~Config() = default;
+
+        std::string const& getFilename() const;
+
+        bool listTests() const;
+        bool listTestNamesOnly() const;
+        bool listTags() const;
+        bool listReporters() const;
+
+        std::string getProcessName() const;
+        std::string const& getReporterName() const;
+
+        std::vector<std::string> const& getTestsOrTags() const override;
+        std::vector<std::string> const& getSectionsToRun() const override;
+
+        TestSpec const& testSpec() const override;
+        bool hasTestFilters() const override;
+
+        bool showHelp() const;
+
+        // IConfig interface
+        bool allowThrows() const override;
+        std::ostream& stream() const override;
+        std::string name() const override;
+        bool includeSuccessfulResults() const override;
+        bool warnAboutMissingAssertions() const override;
+        bool warnAboutNoTests() const override;
+        ShowDurations::OrNot showDurations() const override;
+        double minDuration() const override;
+        RunTests::InWhatOrder runOrder() const override;
+        unsigned int rngSeed() const override;
+        UseColour::YesOrNo useColour() const override;
+        bool shouldDebugBreak() const override;
+        int abortAfter() const override;
+        bool showInvisibles() const override;
+        Verbosity verbosity() const override;
+        bool benchmarkNoAnalysis() const override;
+        int benchmarkSamples() const override;
+        double benchmarkConfidenceInterval() const override;
+        unsigned int benchmarkResamples() const override;
+        std::chrono::milliseconds benchmarkWarmupTime() const override;
+
+    private:
+
+        IStream const* openStream();
+        ConfigData m_data;
+
+        // Owned through std::unique_ptr; IStream is only forward declared
+        // above this class.
+        std::unique_ptr<IStream const> m_stream;
+        TestSpec m_testSpec;
+        bool m_hasTestFilters = false;
+    };
+
+} // end namespace Catch
+
+// end catch_config.hpp
+// start catch_assertionresult.h
+
+#include <string>
+
+namespace Catch {
+
+    // Raw data behind an assertion result. It has no default constructor; a
+    // result type and a LazyExpression must be supplied.
+    struct AssertionResultData
+    {
+        AssertionResultData() = delete;
+
+        AssertionResultData( ResultWas::OfType _resultType, LazyExpression const& _lazyExpression );
+
+        std::string message;
+        // mutable, so the const reconstructExpression() is able to write to it
+        // (presumably used as a cache — confirm in the definition)
+        mutable std::string reconstructedExpression;
+        LazyExpression lazyExpression;
+        ResultWas::OfType resultType;
+
+        std::string reconstructExpression() const;
+    };
+
+    // Result of one assertion: the AssertionInfo describing it together with
+    // its AssertionResultData. It has no default constructor, and both data
+    // members are public (the `protected:` label is commented out).
+    class AssertionResult {
+    public:
+        AssertionResult() = delete;
+        AssertionResult( AssertionInfo const& info, AssertionResultData const& data );
+
+        // Read-only queries; all of them are const and return by value.
+        bool isOk() const;
+        bool succeeded() const;
+        ResultWas::OfType getResultType() const;
+        bool hasExpression() const;
+        bool hasMessage() const;
+        std::string getExpression() const;
+        std::string getExpressionInMacro() const;
+        bool hasExpandedExpression() const;
+        std::string getExpandedExpression() const;
+        std::string getMessage() const;
+        SourceLineInfo getSourceInfo() const;
+        StringRef getTestMacroName() const;
+
+    //protected:
+        AssertionInfo m_info;
+        AssertionResultData m_resultData;
+    };
+
+} // end namespace Catch
+
+// end catch_assertionresult.h
+#if defined(CATCH_CONFIG_ENABLE_BENCHMARKING)
+// start catch_estimate.hpp
+
+ // Statistics estimates
+
+
+namespace Catch {
+    namespace Benchmark {
+        // A point estimate with lower and upper bounds, all expressed in
+        // `Duration`, plus the confidence interval attached to it.
+        template <typename Duration>
+        struct Estimate {
+            Duration point;
+            Duration lower_bound;
+            Duration upper_bound;
+            double confidence_interval;
+
+            // Implicit conversion to an Estimate over another duration type;
+            // every field is carried over through aggregate initialisation.
+            template <typename Duration2>
+            operator Estimate<Duration2>() const {
+                Estimate<Duration2> converted = {
+                    point,
+                    lower_bound,
+                    upper_bound,
+                    confidence_interval
+                };
+                return converted;
+            }
+        };
+    } // namespace Benchmark
+} // namespace Catch
+
+// end catch_estimate.hpp
+// start catch_outlier_classification.hpp
+
+// Outlier information
+
+namespace Catch {
+    namespace Benchmark {
+        // Counters for samples classified as outliers, split by side and
+        // severity, plus the number of samples looked at. All start at 0.
+        struct OutlierClassification {
+            int samples_seen = 0;
+            int low_severe = 0;     // more than 3 times IQR below Q1
+            int low_mild = 0;       // 1.5 to 3 times IQR below Q1
+            int high_mild = 0;      // 1.5 to 3 times IQR above Q3
+            int high_severe = 0;    // more than 3 times IQR above Q3
+
+            // Sum of the four outlier counters (samples_seen is not included).
+            int total() const {
+                int sum = low_severe;
+                sum += low_mild;
+                sum += high_mild;
+                sum += high_severe;
+                return sum;
+            }
+        };
+    } // namespace Benchmark
+} // namespace Catch
+
+// end catch_outlier_classification.hpp
+#endif // CATCH_CONFIG_ENABLE_BENCHMARKING
+
+#include <string>
+#include <iosfwd>
+#include <map>
+#include <set>
+#include <memory>
+#include <algorithm>
+
+namespace Catch {
+
+    // Pairs the full configuration (a shared IConfigPtr) with the output
+    // stream a reporter writes to. The stream is held as a raw, non-owning
+    // pointer.
+    struct ReporterConfig {
+        // Single-argument form; where the stream comes from is decided in the
+        // definition (presumably the config's own stream — confirm).
+        explicit ReporterConfig( IConfigPtr const& _fullConfig );
+
+        ReporterConfig( IConfigPtr const& _fullConfig, std::ostream& _stream );
+
+        std::ostream& stream() const;
+        IConfigPtr fullConfig() const;
+
+    private:
+        std::ostream* m_stream;
+        IConfigPtr m_fullConfig;
+    };
+
+    // Two opt-in switches for a reporter; both default to false.
+    struct ReporterPreferences {
+        bool shouldRedirectStdOut = false;
+        bool shouldReportAllAssertions = false;
+    };
+
+    // Option<T> extended with a `used` flag. Both assigning a value and
+    // resetting clear the flag again; nothing in this struct ever sets it to
+    // true, that is left to the code using it.
+    template<typename T>
+    struct LazyStat : Option<T> {
+        // Forwards to Option<T>'s assignment, then clears `used`.
+        LazyStat& operator=( T const& _value ) {
+            Option<T>::operator=( _value );
+            used = false;
+            return *this;
+        }
+        // Forwards to Option<T>::reset(), then clears `used`.
+        void reset() {
+            Option<T>::reset();
+            used = false;
+        }
+        bool used = false;
+    };
+
+    // Identifies a test run by name. The constructor is not explicit, so a
+    // std::string converts implicitly.
+    struct TestRunInfo {
+        TestRunInfo( std::string const& _name );
+        std::string name;
+    };
+    // Identifies a test group: its name, its index and a group count.
+    // Note the spelling difference between the constructor parameter
+    // `_groupsCount` and the member `groupsCounts`.
+    struct GroupInfo {
+        GroupInfo(  std::string const& _name,
+                    std::size_t _groupIndex,
+                    std::size_t _groupsCount );
+
+        std::string name;
+        std::size_t groupIndex;
+        std::size_t groupsCounts;
+    };
+
+    // One assertion's result together with its info messages and totals.
+    // Copy and move construction are defaulted, but both assignment operators
+    // are deleted.
+    struct AssertionStats {
+        AssertionStats( AssertionResult const& _assertionResult,
+                        std::vector<MessageInfo> const& _infoMessages,
+                        Totals const& _totals );
+
+        AssertionStats( AssertionStats const& )              = default;
+        AssertionStats( AssertionStats && )                  = default;
+        AssertionStats& operator = ( AssertionStats const& ) = delete;
+        AssertionStats& operator = ( AssertionStats && )     = delete;
+        virtual ~AssertionStats();
+
+        AssertionResult assertionResult;
+        std::vector<MessageInfo> infoMessages;
+        Totals totals;
+    };
+
+    // Value handed to IStreamingReporter::sectionEnded(): which section
+    // finished, its assertion counts, its duration in seconds and a flag
+    // named for sections that contained no assertions.
+    struct SectionStats {
+        SectionStats(   SectionInfo const& _sectionInfo,
+                        Counts const& _assertions,
+                        double _durationInSeconds,
+                        bool _missingAssertions );
+        // Fully copyable and movable, including assignment.
+        SectionStats( SectionStats const& )              = default;
+        SectionStats( SectionStats && )                  = default;
+        SectionStats& operator = ( SectionStats const& ) = default;
+        SectionStats& operator = ( SectionStats && )     = default;
+        virtual ~SectionStats();
+
+        SectionInfo sectionInfo;
+        Counts assertions;
+        double durationInSeconds;
+        bool missingAssertions;
+    };
+
+    // Value handed to IStreamingReporter::testCaseEnded(): the test case
+    // description, its totals, the stdout/stderr text passed in for it, and
+    // an `aborting` flag.
+    struct TestCaseStats {
+        TestCaseStats(  TestCaseInfo const& _testInfo,
+                        Totals const& _totals,
+                        std::string const& _stdOut,
+                        std::string const& _stdErr,
+                        bool _aborting );
+
+        // Fully copyable and movable, including assignment.
+        TestCaseStats( TestCaseStats const& )              = default;
+        TestCaseStats( TestCaseStats && )                  = default;
+        TestCaseStats& operator = ( TestCaseStats const& ) = default;
+        TestCaseStats& operator = ( TestCaseStats && )     = default;
+        virtual ~TestCaseStats();
+
+        TestCaseInfo testInfo;
+        Totals totals;
+        std::string stdOut;
+        std::string stdErr;
+        bool aborting;
+    };
+
+    // Value handed to IStreamingReporter::testGroupEnded(): the group
+    // description, its totals and an `aborting` flag.
+    struct TestGroupStats {
+        TestGroupStats( GroupInfo const& _groupInfo,
+                        Totals const& _totals,
+                        bool _aborting );
+        // Overload taking only the group description; the values given to
+        // `totals` and `aborting` are set by the out-of-line definition.
+        TestGroupStats( GroupInfo const& _groupInfo );
+
+        // Fully copyable and movable, including assignment.
+        TestGroupStats( TestGroupStats const& )              = default;
+        TestGroupStats( TestGroupStats && )                  = default;
+        TestGroupStats& operator = ( TestGroupStats const& ) = default;
+        TestGroupStats& operator = ( TestGroupStats && )     = default;
+        virtual ~TestGroupStats();
+
+        GroupInfo groupInfo;
+        Totals totals;
+        bool aborting;
+    };
+
+    // Value handed to IStreamingReporter::testRunEnded(): the run
+    // description, its totals and an `aborting` flag.
+    struct TestRunStats {
+        TestRunStats(   TestRunInfo const& _runInfo,
+                        Totals const& _totals,
+                        bool _aborting );
+
+        // Fully copyable and movable, including assignment.
+        TestRunStats( TestRunStats const& )              = default;
+        TestRunStats( TestRunStats && )                  = default;
+        TestRunStats& operator = ( TestRunStats const& ) = default;
+        TestRunStats& operator = ( TestRunStats && )     = default;
+        virtual ~TestRunStats();
+
+        TestRunInfo runInfo;
+        Totals totals;
+        bool aborting;
+    };
+
+#if defined(CATCH_CONFIG_ENABLE_BENCHMARKING)
+    // Plain aggregate describing a benchmark; passed to
+    // IStreamingReporter::benchmarkStarting() and embedded in BenchmarkStats.
+    // NOTE(review): the units of the double fields are not stated in this
+    // header -- confirm against the code that fills them in.
+    struct BenchmarkInfo {
+        std::string name;
+        double estimatedDuration;
+        int iterations;
+        int samples;
+        unsigned int resamples;
+        double clockResolution;
+        double clockCost;
+    };
+
+    // Results of one benchmark, expressed in the duration type `Duration`:
+    // the raw samples, estimates of mean and standard deviation, the outlier
+    // classification and the outlier variance.
+    template <class Duration>
+    struct BenchmarkStats {
+        BenchmarkInfo info;
+
+        std::vector<Duration> samples;
+        Benchmark::Estimate<Duration> mean;
+        Benchmark::Estimate<Duration> standardDeviation;
+        Benchmark::OutlierClassification outliers;
+        double outlierVariance;
+
+        // Converts these stats to another duration type. Each sample is
+        // converted explicitly via Duration2(sample); the two estimates rely
+        // on Benchmark::Estimate's own conversion during the aggregate
+        // initialisation of the result.
+        template <typename Duration2>
+        operator BenchmarkStats<Duration2>() const {
+            std::vector<Duration2> converted;
+            converted.reserve(samples.size());
+            for (Duration sample : samples) {
+                converted.push_back(Duration2(sample));
+            }
+            return {
+                info,
+                std::move(converted),
+                mean,
+                standardDeviation,
+                outliers,
+                outlierVariance,
+            };
+        }
+    };
+#endif // CATCH_CONFIG_ENABLE_BENCHMARKING
+
+    // Event interface every reporter and listener implements. Each callback
+    // corresponds to one event of a test run (run / group / test case /
+    // section / assertion, plus optional benchmark events). Pure virtual
+    // members must be provided by the implementation; the others have
+    // defaults.
+    struct IStreamingReporter {
+        virtual ~IStreamingReporter() = default;
+
+        // Implementing class must also provide the following static methods:
+        // static std::string getDescription();
+        // static std::set<Verbosity> getSupportedVerbosities()
+
+        virtual ReporterPreferences getPreferences() const = 0;
+
+        virtual void noMatchingTestCases( std::string const& spec ) = 0;
+
+        // Optional: the default implementation does nothing.
+        virtual void reportInvalidArguments(std::string const&) {}
+
+        virtual void testRunStarting( TestRunInfo const& testRunInfo ) = 0;
+        virtual void testGroupStarting( GroupInfo const& groupInfo ) = 0;
+
+        virtual void testCaseStarting( TestCaseInfo const& testInfo ) = 0;
+        virtual void sectionStarting( SectionInfo const& sectionInfo ) = 0;
+
+#if defined(CATCH_CONFIG_ENABLE_BENCHMARKING)
+        // Benchmark events are optional: every default does nothing.
+        virtual void benchmarkPreparing( std::string const& ) {}
+        virtual void benchmarkStarting( BenchmarkInfo const& ) {}
+        virtual void benchmarkEnded( BenchmarkStats<> const& ) {}
+        virtual void benchmarkFailed( std::string const& ) {}
+#endif // CATCH_CONFIG_ENABLE_BENCHMARKING
+
+        virtual void assertionStarting( AssertionInfo const& assertionInfo ) = 0;
+
+        // The return value indicates if the messages buffer should be cleared:
+        virtual bool assertionEnded( AssertionStats const& assertionStats ) = 0;
+
+        virtual void sectionEnded( SectionStats const& sectionStats ) = 0;
+        virtual void testCaseEnded( TestCaseStats const& testCaseStats ) = 0;
+        virtual void testGroupEnded( TestGroupStats const& testGroupStats ) = 0;
+        virtual void testRunEnded( TestRunStats const& testRunStats ) = 0;
+
+        virtual void skipTest( TestCaseInfo const& testInfo ) = 0;
+
+        // Default empty implementation provided
+        virtual void fatalErrorEncountered( StringRef name );
+
+        // Defined out of line; the default result is not visible in this header.
+        virtual bool isMulti() const;
+    };
+    // Reporters are owned exclusively through this pointer type.
+    using IStreamingReporterPtr = std::unique_ptr<IStreamingReporter>;
+
+    // Creates reporter instances and supplies a description string for them.
+    struct IReporterFactory {
+        virtual ~IReporterFactory();
+        // Builds a new reporter from `config`; the caller owns the result.
+        virtual IStreamingReporterPtr create( ReporterConfig const& config ) const = 0;
+        virtual std::string getDescription() const = 0;
+    };
+    // Factories are shared (std::shared_ptr).
+    using IReporterFactoryPtr = std::shared_ptr<IReporterFactory>;
+
+    // Read-only view of the registered reporter factories (keyed by name)
+    // and listener factories (an unnamed list).
+    struct IReporterRegistry {
+        using FactoryMap = std::map<std::string, IReporterFactoryPtr>;
+        using Listeners = std::vector<IReporterFactoryPtr>;
+
+        virtual ~IReporterRegistry();
+        // Creates the reporter registered as `name`. What happens for an
+        // unknown name is decided by the implementation, not shown here.
+        virtual IStreamingReporterPtr create( std::string const& name, IConfigPtr const& config ) const = 0;
+        virtual FactoryMap const& getFactories() const = 0;
+        virtual Listeners const& getListeners() const = 0;
+    };
+
+} // end namespace Catch
+
+// end catch_interfaces_reporter.h
+#include <algorithm>
+#include <cstring>
+#include <cfloat>
+#include <cstdio>
+#include <cassert>
+#include <memory>
+#include <ostream>
+
+namespace Catch {
+    // Called by CumulativeReporterBase before it stores a copy of an
+    // assertion result; see the comment at that call site for the reason.
+    void prepareExpandedExpression(AssertionResult& result);
+
+    // Returns double formatted as %.3f (format expected on output)
+    std::string getFormattedDuration( double duration );
+
+    //! Should the reporter show the given duration under this configuration?
+    //! (The exact criteria are in the out-of-line definition.)
+    bool shouldShowDuration( IConfig const& config, double duration );
+
+    // Turns a list of filter strings into a single string; the separator
+    // and format are decided by the out-of-line definition.
+    std::string serializeFilters( std::vector<std::string> const& container );
+
+    // Base class for reporters that produce output while events arrive.
+    // It remembers the current run / group / test case and keeps a stack of
+    // the sections that are currently open. DerivedT is the concrete
+    // reporter type; its static getSupportedVerbosities() is consulted by
+    // the constructor. assertionStarting() and assertionEnded() are not
+    // implemented here and remain for DerivedT to provide.
+    template<typename DerivedT>
+    struct StreamingReporterBase : IStreamingReporter {
+
+        // Takes the run configuration and output stream from _config and
+        // raises CATCH_ERROR when DerivedT does not list the configured
+        // verbosity among its supported ones.
+        StreamingReporterBase( ReporterConfig const& _config )
+        :   m_config( _config.fullConfig() ),
+            stream( _config.stream() )
+        {
+            m_reporterPrefs.shouldRedirectStdOut = false;
+            if( !DerivedT::getSupportedVerbosities().count( m_config->verbosity() ) )
+                CATCH_ERROR( "Verbosity level not supported by this reporter" );
+        }
+
+        ReporterPreferences getPreferences() const override {
+            return m_reporterPrefs;
+        }
+
+        // Default set of verbosities; a derived reporter can declare its own
+        // static function of the same name to support more.
+        static std::set<Verbosity> getSupportedVerbosities() {
+            return { Verbosity::Normal };
+        }
+
+        ~StreamingReporterBase() override = default;
+
+        void noMatchingTestCases(std::string const&) override {}
+
+        void reportInvalidArguments(std::string const&) override {}
+
+        void testRunStarting(TestRunInfo const& _testRunInfo) override {
+            currentTestRunInfo = _testRunInfo;
+        }
+
+        void testGroupStarting(GroupInfo const& _groupInfo) override {
+            currentGroupInfo = _groupInfo;
+        }
+
+        void testCaseStarting(TestCaseInfo const& _testInfo) override  {
+            currentTestCaseInfo = _testInfo;
+        }
+        void sectionStarting(SectionInfo const& _sectionInfo) override {
+            m_sectionStack.push_back(_sectionInfo);
+        }
+
+        void sectionEnded(SectionStats const& /* _sectionStats */) override {
+            // pop_back() on an empty vector is undefined behaviour, so an
+            // unmatched sectionEnded() is caught here in debug builds, the
+            // same way CumulativeReporterBase guards its section stack.
+            assert( !m_sectionStack.empty() );
+            m_sectionStack.pop_back();
+        }
+        void testCaseEnded(TestCaseStats const& /* _testCaseStats */) override {
+            currentTestCaseInfo.reset();
+        }
+        void testGroupEnded(TestGroupStats const& /* _testGroupStats */) override {
+            currentGroupInfo.reset();
+        }
+        void testRunEnded(TestRunStats const& /* _testRunStats */) override {
+            currentTestCaseInfo.reset();
+            currentGroupInfo.reset();
+            currentTestRunInfo.reset();
+        }
+
+        void skipTest(TestCaseInfo const&) override {
+            // Don't do anything with this by default.
+            // It can optionally be overridden in the derived class.
+        }
+
+        IConfigPtr m_config;
+        std::ostream& stream;
+
+        // Set by the *Starting() callbacks and cleared by the matching
+        // *Ended() callbacks above.
+        LazyStat<TestRunInfo> currentTestRunInfo;
+        LazyStat<GroupInfo> currentGroupInfo;
+        LazyStat<TestCaseInfo> currentTestCaseInfo;
+
+        // Sections that have started but not yet ended, innermost last.
+        std::vector<SectionInfo> m_sectionStack;
+        ReporterPreferences m_reporterPrefs;
+    };
+
+    // Base class for reporters that need the whole result tree before they
+    // can write anything. Events are accumulated into nodes (run -> group ->
+    // test case -> section) and testRunEndedCumulative() is called once the
+    // run has ended. DerivedT is the concrete reporter type; its static
+    // getSupportedVerbosities() is consulted by the constructor.
+    template<typename DerivedT>
+    struct CumulativeReporterBase : IStreamingReporter {
+        // Generic tree node: a stats value plus shared pointers to children.
+        template<typename T, typename ChildNodeT>
+        struct Node {
+            explicit Node( T const& _value ) : value( _value ) {}
+            virtual ~Node() = default;
+
+            using ChildNodes = std::vector<std::shared_ptr<ChildNodeT>>;
+            T value;
+            ChildNodes children;
+        };
+
+        // One section of a test case together with its nested sections, the
+        // assertions recorded while it was innermost, and captured output.
+        struct SectionNode {
+            explicit SectionNode(SectionStats const& _stats) : stats(_stats) {}
+            virtual ~SectionNode() = default;
+
+            // Two nodes compare equal when their sections share a line info.
+            bool operator == (SectionNode const& other) const {
+                return stats.sectionInfo.lineInfo == other.stats.sectionInfo.lineInfo;
+            }
+            bool operator == (std::shared_ptr<SectionNode> const& other) const {
+                return operator==(*other);
+            }
+
+            SectionStats stats;
+            using ChildSections = std::vector<std::shared_ptr<SectionNode>>;
+            using Assertions = std::vector<AssertionStats>;
+            ChildSections childSections;
+            Assertions assertions;
+            std::string stdOut;
+            std::string stdErr;
+        };
+
+        // Predicate matching a SectionNode by section name and line info.
+        // It keeps a reference to the SectionInfo it was built from, so it
+        // must not outlive that object.
+        struct BySectionInfo {
+            BySectionInfo( SectionInfo const& other ) : m_other( other ) {}
+            BySectionInfo( BySectionInfo const& other ) : m_other( other.m_other ) {}
+            bool operator() (std::shared_ptr<SectionNode> const& node) const {
+                SectionInfo const& candidate = node->stats.sectionInfo;
+                return (candidate.name == m_other.name)
+                    && (candidate.lineInfo == m_other.lineInfo);
+            }
+            void operator=(BySectionInfo const&) = delete;
+
+        private:
+            SectionInfo const& m_other;
+        };
+
+        using TestCaseNode = Node<TestCaseStats, SectionNode>;
+        using TestGroupNode = Node<TestGroupStats, TestCaseNode>;
+        using TestRunNode = Node<TestRunStats, TestGroupNode>;
+
+        // Takes the run configuration and output stream from _config and
+        // raises CATCH_ERROR when DerivedT does not list the configured
+        // verbosity among its supported ones.
+        CumulativeReporterBase( ReporterConfig const& _config )
+        :   m_config( _config.fullConfig() ),
+            stream( _config.stream() )
+        {
+            m_reporterPrefs.shouldRedirectStdOut = false;
+            if( !DerivedT::getSupportedVerbosities().count( m_config->verbosity() ) )
+                CATCH_ERROR( "Verbosity level not supported by this reporter" );
+        }
+        ~CumulativeReporterBase() override = default;
+
+        ReporterPreferences getPreferences() const override {
+            return m_reporterPrefs;
+        }
+
+        static std::set<Verbosity> getSupportedVerbosities() {
+            return { Verbosity::Normal };
+        }
+
+        void testRunStarting( TestRunInfo const& ) override {}
+        void testGroupStarting( GroupInfo const& ) override {}
+
+        void testCaseStarting( TestCaseInfo const& ) override {}
+
+        // Finds or creates the node for the section that is starting, pushes
+        // it on the stack of open sections and remembers it as the most
+        // recently entered section.
+        void sectionStarting( SectionInfo const& sectionInfo ) override {
+            // Placeholder stats; replaced with the real ones in sectionEnded().
+            SectionStats placeholderStats( sectionInfo, Counts(), 0, false );
+            std::shared_ptr<SectionNode> current;
+            if( !m_sectionStack.empty() ) {
+                // Nested section: reuse an existing child of the enclosing
+                // section if one matches, otherwise append a new child.
+                auto& siblings = m_sectionStack.back()->childSections;
+                auto const found =
+                    std::find_if( siblings.begin(), siblings.end(), BySectionInfo( sectionInfo ) );
+                if( found != siblings.end() ) {
+                    current = *found;
+                }
+                else {
+                    current = std::make_shared<SectionNode>( placeholderStats );
+                    siblings.push_back( current );
+                }
+            }
+            else {
+                // No open section: this is the root section of the test case.
+                if( !m_rootSection )
+                    m_rootSection = std::make_shared<SectionNode>( placeholderStats );
+                current = m_rootSection;
+            }
+            m_sectionStack.push_back( current );
+            m_deepestSection = std::move( current );
+        }
+
+        void assertionStarting(AssertionInfo const&) override {}
+
+        // Records the assertion under the innermost open section. Always
+        // returns true.
+        bool assertionEnded(AssertionStats const& assertionStats) override {
+            assert( !m_sectionStack.empty() );
+            // AssertionResult holds a pointer to a temporary DecomposedExpression,
+            // which getExpandedExpression() calls to build the expression string.
+            // Our section stack copy of the assertionResult will likely outlive the
+            // temporary, so it must be expanded or discarded now to avoid calling
+            // a destroyed object later.
+            prepareExpandedExpression(const_cast<AssertionResult&>( assertionStats.assertionResult ) );
+            m_sectionStack.back()->assertions.push_back( assertionStats );
+            return true;
+        }
+        // Stores the final stats on the innermost open section and closes it.
+        void sectionEnded(SectionStats const& sectionStats) override {
+            assert( !m_sectionStack.empty() );
+            m_sectionStack.back()->stats = sectionStats;
+            m_sectionStack.pop_back();
+        }
+        // Wraps the finished root section in a test case node and attaches the
+        // captured stdout/stderr to the most recently entered section.
+        void testCaseEnded(TestCaseStats const& testCaseStats) override {
+            auto testCaseNode = std::make_shared<TestCaseNode>( testCaseStats );
+            assert( m_sectionStack.empty() );
+            testCaseNode->children.push_back( m_rootSection );
+            m_testCases.push_back( testCaseNode );
+            m_rootSection.reset();
+
+            assert( m_deepestSection );
+            m_deepestSection->stdOut = testCaseStats.stdOut;
+            m_deepestSection->stdErr = testCaseStats.stdErr;
+        }
+        // Moves the test cases collected so far under a new group node.
+        void testGroupEnded(TestGroupStats const& testGroupStats) override {
+            auto groupNode = std::make_shared<TestGroupNode>( testGroupStats );
+            m_testCases.swap( groupNode->children );
+            m_testGroups.push_back( groupNode );
+        }
+        // Moves the groups collected so far under a new run node, then lets
+        // the derived reporter produce its output.
+        void testRunEnded(TestRunStats const& testRunStats) override {
+            auto runNode = std::make_shared<TestRunNode>( testRunStats );
+            m_testGroups.swap( runNode->children );
+            m_testRuns.push_back( runNode );
+            testRunEndedCumulative();
+        }
+        // Hook for the derived reporter; called once per run from testRunEnded().
+        virtual void testRunEndedCumulative() = 0;
+
+        void skipTest(TestCaseInfo const&) override {}
+
+        IConfigPtr m_config;
+        std::ostream& stream;
+        // m_assertions and m_sections are not touched by any code in this class.
+        std::vector<AssertionStats> m_assertions;
+        std::vector<std::vector<std::shared_ptr<SectionNode>>> m_sections;
+        std::vector<std::shared_ptr<TestCaseNode>> m_testCases;
+        std::vector<std::shared_ptr<TestGroupNode>> m_testGroups;
+
+        std::vector<std::shared_ptr<TestRunNode>> m_testRuns;
+
+        std::shared_ptr<SectionNode> m_rootSection;
+        std::shared_ptr<SectionNode> m_deepestSection;
+        // Sections that have started but not yet ended, innermost last.
+        std::vector<std::shared_ptr<SectionNode>> m_sectionStack;
+        ReporterPreferences m_reporterPrefs;
+    };
+
+    // Returns a NUL-terminated string of CATCH_CONFIG_CONSOLE_WIDTH-1 copies
+    // of the character C. There is one static buffer per character; it is
+    // filled on the first call and returned unchanged on later calls.
+    template<char C>
+    char const* getLineOfChars() {
+        static char line[CATCH_CONFIG_CONSOLE_WIDTH] = {0};
+        // A non-zero first character means the buffer has already been filled.
+        if( *line )
+            return line;
+
+        std::size_t const lastIndex = sizeof( line ) - 1;
+        std::memset( line, C, lastIndex );
+        line[lastIndex] = 0;
+        return line;
+    }
+
+    // Concrete StreamingReporterBase that supplies the two assertion
+    // callbacks the base leaves unimplemented, plus its own verbosity list.
+    // NOTE(review): the name suggests it is the intended base for user
+    // listeners; the out-of-line definitions are not visible here.
+    struct TestEventListenerBase : StreamingReporterBase<TestEventListenerBase> {
+        TestEventListenerBase( ReporterConfig const& _config );
+
+        // Hides the base-class version; checked by the base constructor.
+        static std::set<Verbosity> getSupportedVerbosities();
+
+        void assertionStarting(AssertionInfo const&) override;
+        bool assertionEnded(AssertionStats const&) override;
+    };
+
+} // end namespace Catch
+
+// end catch_reporter_bases.hpp
+// start catch_console_colour.h
+
+namespace Catch {
+
+    // Console colour selection. Code lists the base colours, a Bright bit
+    // that can be OR-ed onto them, and aliases that name colours by purpose.
+    struct Colour {
+        enum Code {
+            None = 0,
+
+            // Base colours take the consecutive values 1..7.
+            White,
+            Red,
+            Green,
+            Blue,
+            Cyan,
+            Yellow,
+            Grey,
+
+            // Flag bit combined with a base colour below.
+            Bright = 0x10,
+
+            BrightRed = Bright | Red,
+            BrightGreen = Bright | Green,
+            LightGrey = Bright | Grey,
+            BrightWhite = Bright | White,
+            BrightYellow = Bright | Yellow,
+
+            // By intention
+            FileName = LightGrey,
+            Warning = BrightYellow,
+            ResultError = BrightRed,
+            ResultSuccess = BrightGreen,
+            ResultExpectedFailure = Warning,
+
+            Error = BrightRed,
+            Success = Green,
+
+            OriginalExpression = Cyan,
+            ReconstructedExpression = BrightYellow,
+
+            SecondaryText = LightGrey,
+            Headers = White
+        };
+
+        // Use constructed object for RAII guard
+        Colour( Code _colourCode );
+        // Move-only: move operations are declared and no copy operations are.
+        Colour( Colour&& other ) noexcept;
+        Colour& operator=( Colour&& other ) noexcept;
+        ~Colour();
+
+        // Use static method for one-shot changes
+        static void use( Code _colourCode );
+
+    private:
+        // NOTE(review): presumably marks a moved-from guard so that it does
+        // not act again on destruction -- confirm in the out-of-line code.
+        bool m_moved = false;
+    };
+
+    // Lets a Colour object be written inline in a stream expression.
+    std::ostream& operator << ( std::ostream& os, Colour const& );
+
+} // end namespace Catch
+
+// end catch_console_colour.h
+// start catch_reporter_registrars.hpp
+
+
+namespace Catch {
+
+    // Constructing a ReporterRegistrar<T> registers a factory for reporter
+    // type T under the given name with the mutable registry hub.
+    template<typename T>
+    class ReporterRegistrar {
+
+        // Factory handed to the registry: builds a T from the reporter
+        // config and forwards T's static description.
+        class ReporterFactory : public IReporterFactory {
+
+            IStreamingReporterPtr create( ReporterConfig const& config ) const override {
+                return IStreamingReporterPtr( new T( config ) );
+            }
+
+            std::string getDescription() const override {
+                return T::getDescription();
+            }
+        };
+
+    public:
+
+        explicit ReporterRegistrar( std::string const& name ) {
+            getMutableRegistryHub().registerReporter(
+                name,
+                std::make_shared<ReporterFactory>() );
+        }
+    };
+
+    // Constructing a ListenerRegistrar<T> registers a factory for listener
+    // type T with the mutable registry hub. Listeners are registered
+    // without a name.
+    template<typename T>
+    class ListenerRegistrar {
+
+        // Factory handed to the registry: builds a T from the reporter
+        // config. Its description is always the empty string.
+        class ListenerFactory : public IReporterFactory {
+
+            IStreamingReporterPtr create( ReporterConfig const& config ) const override {
+                return IStreamingReporterPtr( new T( config ) );
+            }
+            std::string getDescription() const override {
+                return {};
+            }
+        };
+
+    public:
+
+        ListenerRegistrar() {
+            getMutableRegistryHub().registerListener(
+                std::make_shared<ListenerFactory>() );
+        }
+    };
+}
+
+#if !defined(CATCH_CONFIG_DISABLE)
+
+// Defines a ReporterRegistrar<reporterType> object in an unnamed namespace,
+// constructed with `name`; constructing it registers the reporter.
+// `reporterType` is pasted into the object's identifier, so it must be a
+// plain identifier (no `::` qualification or template arguments).
+#define CATCH_REGISTER_REPORTER( name, reporterType ) \
+    CATCH_INTERNAL_START_WARNINGS_SUPPRESSION         \
+    CATCH_INTERNAL_SUPPRESS_GLOBALS_WARNINGS          \
+    namespace{ Catch::ReporterRegistrar<reporterType> catch_internal_RegistrarFor##reporterType( name ); } \
+    CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION
+
+// Defines a ListenerRegistrar<listenerType> object in an unnamed namespace;
+// constructing it registers the listener. The same plain-identifier
+// restriction applies to `listenerType`.
+#define CATCH_REGISTER_LISTENER( listenerType ) \
+    CATCH_INTERNAL_START_WARNINGS_SUPPRESSION   \
+    CATCH_INTERNAL_SUPPRESS_GLOBALS_WARNINGS    \
+    namespace{ Catch::ListenerRegistrar<listenerType> catch_internal_RegistrarFor##listenerType; } \
+    CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION
+#else // CATCH_CONFIG_DISABLE
+
+// With CATCH_CONFIG_DISABLE defined both macros expand to nothing.
+#define CATCH_REGISTER_REPORTER(name, reporterType)
+#define CATCH_REGISTER_LISTENER(listenerType)
+
+#endif // CATCH_CONFIG_DISABLE
+
+// end catch_reporter_registrars.hpp
+// Allow users to base their work off existing reporters
+// start catch_reporter_compact.h
+
+namespace Catch {
+
+    // Streaming reporter declared for users to build on. It overrides a
+    // subset of the StreamingReporterBase callbacks; the definitions are
+    // out of line.
+    struct CompactReporter : StreamingReporterBase<CompactReporter> {
+
+        // Inherits the base-class constructor taking a ReporterConfig.
+        using StreamingReporterBase::StreamingReporterBase;
+
+        ~CompactReporter() override;
+
+        static std::string getDescription();
+
+        void noMatchingTestCases(std::string const& spec) override;
+
+        void assertionStarting(AssertionInfo const&) override;
+
+        bool assertionEnded(AssertionStats const& _assertionStats) override;
+
+        void sectionEnded(SectionStats const& _sectionStats) override;
+
+        void testRunEnded(TestRunStats const& _testRunStats) override;
+
+    };
+
+} // end namespace Catch
+
+// end catch_reporter_compact.h
+// start catch_reporter_console.h
+
+#if defined(_MSC_VER)
+#pragma warning(push)
+#pragma warning(disable:4061) // Not all labels are EXPLICITLY handled in switch
+                              // Note that 4062 (not all labels are handled
+                              // and default is missing) is enabled
+#endif
+
+namespace Catch {
+    // Fwd decls
+    struct SummaryColumn;
+    class TablePrinter;
+
+    // Streaming reporter declared for users to build on; all member
+    // functions are defined out of line. TablePrinter and SummaryColumn are
+    // only forward-declared in this header.
+    struct ConsoleReporter : StreamingReporterBase<ConsoleReporter> {
+        std::unique_ptr<TablePrinter> m_tablePrinter;
+
+        ConsoleReporter(ReporterConfig const& config);
+        // Declared here and defined out of line; TablePrinter is incomplete
+        // in this header, so the unique_ptr member cannot be destroyed inline.
+        ~ConsoleReporter() override;
+        static std::string getDescription();
+
+        void noMatchingTestCases(std::string const& spec) override;
+
+        void reportInvalidArguments(std::string const&arg) override;
+
+        void assertionStarting(AssertionInfo const&) override;
+
+        bool assertionEnded(AssertionStats const& _assertionStats) override;
+
+        void sectionStarting(SectionInfo const& _sectionInfo) override;
+        void sectionEnded(SectionStats const& _sectionStats) override;
+
+#if defined(CATCH_CONFIG_ENABLE_BENCHMARKING)
+        void benchmarkPreparing(std::string const& name) override;
+        void benchmarkStarting(BenchmarkInfo const& info) override;
+        void benchmarkEnded(BenchmarkStats<> const& stats) override;
+        void benchmarkFailed(std::string const& error) override;
+#endif // CATCH_CONFIG_ENABLE_BENCHMARKING
+
+        void testCaseEnded(TestCaseStats const& _testCaseStats) override;
+        void testGroupEnded(TestGroupStats const& _testGroupStats) override;
+        void testRunEnded(TestRunStats const& _testRunStats) override;
+        void testRunStarting(TestRunInfo const& _testRunInfo) override;
+    private:
+
+        // Private printing helpers; all defined out of line.
+        void lazyPrint();
+
+        void lazyPrintWithoutClosingBenchmarkTable();
+        void lazyPrintRunInfo();
+        void lazyPrintGroupInfo();
+        void printTestCaseAndSectionHeader();
+
+        void printClosedHeader(std::string const& _name);
+        void printOpenHeader(std::string const& _name);
+
+        // if string has a : in first line will set indent to follow it on
+        // subsequent lines
+        void printHeaderString(std::string const& _string, std::size_t indent = 0);
+
+        void printTotals(Totals const& totals);
+        void printSummaryRow(std::string const& label, std::vector<SummaryColumn> const& cols, std::size_t row);
+
+        void printTotalsDivider(Totals const& totals);
+        void printSummaryDivider();
+        void printTestFilters();
+
+    private:
+        // Starts out false; where it is set is in the out-of-line code.
+        bool m_headerPrinted = false;
+    };
+
+} // end namespace Catch
+
+#if defined(_MSC_VER)
+#pragma warning(pop)
+#endif
+
+// end catch_reporter_console.h
+// start catch_reporter_junit.h
+
+// start catch_xmlwriter.h
+
+#include <vector>
+
+namespace Catch {
+    // Formatting options used by XmlWriter. Indent and Newline are distinct
+    // bits; the operators declared below combine and test them, since a
+    // scoped enum has no built-in bitwise operators.
+    enum class XmlFormatting {
+        None = 0x00,
+        Indent = 0x01,
+        Newline = 0x02,
+    };
+
+    XmlFormatting operator | (XmlFormatting lhs, XmlFormatting rhs);
+    XmlFormatting operator & (XmlFormatting lhs, XmlFormatting rhs);
+
+    // Holds a copy of a string together with the XML context it is meant
+    // for (text node or attribute). encodeTo() and operator<< write it to a
+    // stream; the escaping rules are in the out-of-line definitions.
+    class XmlEncode {
+    public:
+        enum ForWhat { ForTextNodes, ForAttributes };
+
+        // Not explicit; the context defaults to text nodes.
+        XmlEncode( std::string const& str, ForWhat forWhat = ForTextNodes );
+
+        void encodeTo( std::ostream& os ) const;
+
+        friend std::ostream& operator << ( std::ostream& os, XmlEncode const& xmlEncode );
+
+    private:
+        std::string m_str;
+        ForWhat m_forWhat;
+    };
+
+    // Writes XML to an output stream. It keeps a stack of tag names, the
+    // current indent string and flags for an open start tag and a pending
+    // newline. Non-copyable. Most writing functions return *this so calls
+    // can be chained.
+    class XmlWriter {
+    public:
+
+        // Movable handle returned by scopedElement(). Its destructor is
+        // defined out of line.
+        // NOTE(review): presumably it ends the element it was created for
+        // -- confirm in the out-of-line definition.
+        class ScopedElement {
+        public:
+            ScopedElement( XmlWriter* writer, XmlFormatting fmt );
+
+            ScopedElement( ScopedElement&& other ) noexcept;
+            ScopedElement& operator=( ScopedElement&& other ) noexcept;
+
+            ~ScopedElement();
+
+            ScopedElement& writeText( std::string const& text, XmlFormatting fmt = XmlFormatting::Newline | XmlFormatting::Indent );
+
+            // Forwards to the owning writer's writeAttribute() and returns
+            // this handle for chaining.
+            template<typename T>
+            ScopedElement& writeAttribute( std::string const& name, T const& attribute ) {
+                m_writer->writeAttribute( name, attribute );
+                return *this;
+            }
+
+        private:
+            mutable XmlWriter* m_writer = nullptr;
+            XmlFormatting m_fmt;
+        };
+
+        // Writes to Catch::cout() unless another stream is supplied.
+        XmlWriter( std::ostream& os = Catch::cout() );
+        ~XmlWriter();
+
+        XmlWriter( XmlWriter const& ) = delete;
+        XmlWriter& operator=( XmlWriter const& ) = delete;
+
+        XmlWriter& startElement( std::string const& name, XmlFormatting fmt = XmlFormatting::Newline | XmlFormatting::Indent);
+
+        ScopedElement scopedElement( std::string const& name, XmlFormatting fmt = XmlFormatting::Newline | XmlFormatting::Indent);
+
+        XmlWriter& endElement(XmlFormatting fmt = XmlFormatting::Newline | XmlFormatting::Indent);
+
+        XmlWriter& writeAttribute( std::string const& name, std::string const& attribute );
+
+        XmlWriter& writeAttribute( std::string const& name, bool attribute );
+
+        // Generic overload: streams the value into a ReusableStringStream
+        // and forwards the resulting text to the std::string overload.
+        template<typename T>
+        XmlWriter& writeAttribute( std::string const& name, T const& attribute ) {
+            ReusableStringStream rss;
+            rss << attribute;
+            return writeAttribute( name, rss.str() );
+        }
+
+        XmlWriter& writeText( std::string const& text, XmlFormatting fmt = XmlFormatting::Newline | XmlFormatting::Indent);
+
+        XmlWriter& writeComment(std::string const& text, XmlFormatting fmt = XmlFormatting::Newline | XmlFormatting::Indent);
+
+        void writeStylesheetRef( std::string const& url );
+
+        XmlWriter& writeBlankLine();
+
+        void ensureTagClosed();
+
+    private:
+
+        void applyFormatting(XmlFormatting fmt);
+
+        void writeDeclaration();
+
+        void newlineIfNecessary();
+
+        bool m_tagIsOpen = false;
+        bool m_needsNewline = false;
+        std::vector<std::string> m_tags;
+        std::string m_indent;
+        std::ostream& m_os;
+    };
+
+}
+
+// end catch_xmlwriter.h
+namespace Catch {
+
+    // Cumulative reporter that owns an XmlWriter. It implements the
+    // testRunEndedCumulative() hook and declares write* helpers taking the
+    // accumulated group / test case / section nodes. All definitions are
+    // out of line.
+    class JunitReporter : public CumulativeReporterBase<JunitReporter> {
+    public:
+        JunitReporter(ReporterConfig const& _config);
+
+        ~JunitReporter() override;
+
+        static std::string getDescription();
+
+        void noMatchingTestCases(std::string const& /*spec*/) override;
+
+        void testRunStarting(TestRunInfo const& runInfo) override;
+
+        void testGroupStarting(GroupInfo const& groupInfo) override;
+
+        void testCaseStarting(TestCaseInfo const& testCaseInfo) override;
+        bool assertionEnded(AssertionStats const& assertionStats) override;
+
+        void testCaseEnded(TestCaseStats const& testCaseStats) override;
+
+        void testGroupEnded(TestGroupStats const& testGroupStats) override;
+
+        void testRunEndedCumulative() override;
+
+        // Output helpers for the accumulated nodes.
+        void writeGroup(TestGroupNode const& groupNode, double suiteTime);
+
+        void writeTestCase(TestCaseNode const& testCaseNode);
+
+        void writeSection(std::string const& className,
+                          std::string const& rootName,
+                          SectionNode const& sectionNode);
+
+        void writeAssertions(SectionNode const& sectionNode);
+        void writeAssertion(AssertionStats const& stats);
+
+        // Public state; how each field is used is in the out-of-line code.
+        XmlWriter xml;
+        Timer suiteTimer;
+        std::string stdOutForSuite;
+        std::string stdErrForSuite;
+        unsigned int unexpectedExceptions = 0;
+        bool m_okToFail = false;
+    };
+
+} // end namespace Catch
+
+// end catch_reporter_junit.h
+// start catch_reporter_xml.h
+
+namespace Catch {
+    // Streaming reporter that owns an XmlWriter, a per-test-case Timer and
+    // a section depth counter. All member functions are defined out of line.
+    class XmlReporter : public StreamingReporterBase<XmlReporter> {
+    public:
+        XmlReporter(ReporterConfig const& _config);
+
+        ~XmlReporter() override;
+
+        static std::string getDescription();
+
+        // Virtual, so a derived reporter can supply its own stylesheet
+        // reference.
+        virtual std::string getStylesheetRef() const;
+
+        void writeSourceInfo(SourceLineInfo const& sourceInfo);
+
+    public: // StreamingReporterBase
+
+        void noMatchingTestCases(std::string const& s) override;
+
+        void testRunStarting(TestRunInfo const& testInfo) override;
+
+        void testGroupStarting(GroupInfo const& groupInfo) override;
+
+        void testCaseStarting(TestCaseInfo const& testInfo) override;
+
+        void sectionStarting(SectionInfo const& sectionInfo) override;
+
+        void assertionStarting(AssertionInfo const&) override;
+
+        bool assertionEnded(AssertionStats const& assertionStats) override;
+
+        void sectionEnded(SectionStats const& sectionStats) override;
+
+        void testCaseEnded(TestCaseStats const& testCaseStats) override;
+
+        void testGroupEnded(TestGroupStats const& testGroupStats) override;
+
+        void testRunEnded(TestRunStats const& testRunStats) override;
+
+#if defined(CATCH_CONFIG_ENABLE_BENCHMARKING)
+        void benchmarkPreparing(std::string const& name) override;
+        void benchmarkStarting(BenchmarkInfo const&) override;
+        void benchmarkEnded(BenchmarkStats<> const&) override;
+        void benchmarkFailed(std::string const&) override;
+#endif // CATCH_CONFIG_ENABLE_BENCHMARKING
+
+    private:
+        Timer m_testCaseTimer;
+        XmlWriter m_xml;
+        // Starts at zero; maintained by the out-of-line section callbacks.
+        int m_sectionDepth = 0;
+    };
+
+} // end namespace Catch
+
+// end catch_reporter_xml.h
+
+// end catch_external_interfaces.h
+#endif
+
+#if defined(CATCH_CONFIG_ENABLE_BENCHMARKING)
+// start catch_benchmarking_all.hpp
+
+// A proxy header that includes all of the benchmarking headers to allow
+// concise include of the benchmarking features. You should prefer the
+// individual includes in standard use.
+
+// start catch_benchmark.hpp
+
+ // Benchmark
+
+// start catch_chronometer.hpp
+
+// User-facing chronometer
+
+
+// start catch_clock.hpp
+
+// Clocks
+
+
+#include <chrono>
+#include <ratio>
+
+namespace Catch {
+    namespace Benchmark {
+        // The clock's own duration type.
+        template <typename Clock>
+        using ClockDuration = typename Clock::duration;
+        // A duration with the clock's period but a double representation.
+        template <typename Clock>
+        using FloatDuration = std::chrono::duration<double, typename Clock::period>;
+
+        // The clock's own time point type.
+        template <typename Clock>
+        using TimePoint = typename Clock::time_point;
+
+        // Alias for std::chrono::steady_clock.
+        using default_clock = std::chrono::steady_clock;
+
+        // Function object whose call operator returns Clock::now().
+        template <typename Clock>
+        struct now {
+            TimePoint<Clock> operator()() const {
+                return Clock::now();
+            }
+        };
+
+        // Seconds with a double representation.
+        using fp_seconds = std::chrono::duration<double, std::ratio<1>>;
+    } // namespace Benchmark
+} // namespace Catch
+
+// end catch_clock.hpp
+// start catch_optimizer.hpp
+
+ // Hinting the optimizer
+
+
+#if defined(_MSC_VER)
+#   include <atomic> // atomic_thread_fence
+#endif
+
+namespace Catch {
+    namespace Benchmark {
+#if defined(__GNUC__) || defined(__clang__)
+        /// Optimizer hint: an empty inline-asm statement that takes `p` as an input
+        /// operand and declares a "memory" clobber.
+        template <typename T>
+        inline void keep_memory(T* p) {
+            asm volatile("" : : "g"(p) : "memory");
+        }
+        /// Optimizer hint without an operand: an empty inline-asm statement that only
+        /// declares a "memory" clobber.
+        inline void keep_memory() {
+            asm volatile("" : : : "memory");
+        }
+
+        namespace Detail {
+            /// Barrier placed around timed regions; forwards to `keep_memory()`.
+            inline void optimizer_barrier() { keep_memory(); }
+        } // namespace Detail
+#elif defined(_MSC_VER)
+
+#pragma optimize("", off)
+        /// MSVC variant: reads the first byte at `p` through a volatile pointer and
+        /// writes it back, compiled with optimizations switched off.
+        template <typename T>
+        inline void keep_memory(T* p) {
+            // thanks @milleniumbug
+            *reinterpret_cast<char volatile*>(p) = *reinterpret_cast<char const volatile*>(p);
+        }
+        // TODO equivalent keep_memory()
+#pragma optimize("", on)
+
+        namespace Detail {
+            /// Barrier placed around timed regions; issues a sequentially consistent fence.
+            inline void optimizer_barrier() {
+                std::atomic_thread_fence(std::memory_order_seq_cst);
+            }
+        } // namespace Detail
+
+#endif
+
+        /// Passes the address of `x` to `keep_memory`.
+        template <typename T>
+        inline void deoptimize_value(T&& x) {
+            keep_memory(&x);
+        }
+
+        /// Calls `fn(args...)` and feeds the result to `deoptimize_value`.
+        /// Selected when the call expression is not `void`.
+        template <typename Fn, typename... Args>
+        inline auto invoke_deoptimized(Fn&& fn, Args&&... args) -> typename std::enable_if<!std::is_same<void, decltype(fn(args...))>::value>::type {
+            // Each argument is forwarded individually: `std::forward<Args>(args)...`.
+            // The previous `std::forward<Args...>(args...)` only compiled for exactly one argument.
+            deoptimize_value(std::forward<Fn>(fn)(std::forward<Args>(args)...));
+        }
+
+        /// Calls `fn(args...)`. Selected when the call expression is `void`, so there
+        /// is no result to hand to `deoptimize_value`.
+        template <typename Fn, typename... Args>
+        inline auto invoke_deoptimized(Fn&& fn, Args&&... args) -> typename std::enable_if<std::is_same<void, decltype(fn(args...))>::value>::type {
+            std::forward<Fn>(fn)(std::forward<Args>(args)...);
+        }
+    } // namespace Benchmark
+} // namespace Catch
+
+// end catch_optimizer.hpp
+// start catch_complete_invoke.hpp
+
+// Invoke with a special case for void
+
+
+#include <type_traits>
+#include <utility>
+
+namespace Catch {
+    namespace Benchmark {
+        namespace Detail {
+            /// Maps a type to itself, except `void`, which maps to an empty struct
+            /// so that a value can always be returned and stored.
+            template <typename T>
+            struct CompleteType {
+                using type = T;
+            };
+            template <>
+            struct CompleteType<void> {
+                struct type {};
+            };
+
+            template <typename T>
+            using CompleteType_t = typename CompleteType<T>::type;
+
+            /// Calls a callable and returns its result unchanged.
+            template <typename Result>
+            struct CompleteInvoker {
+                template <typename Fun, typename... Args>
+                static Result invoke(Fun&& fun, Args&&... args) {
+                    return std::forward<Fun>(fun)(std::forward<Args>(args)...);
+                }
+            };
+            /// `void` case: calls the callable, then returns the empty placeholder struct.
+            template <>
+            struct CompleteInvoker<void> {
+                template <typename Fun, typename... Args>
+                static CompleteType_t<void> invoke(Fun&& fun, Args&&... args) {
+                    std::forward<Fun>(fun)(std::forward<Args>(args)...);
+                    return CompleteType_t<void>{};
+                }
+            };
+
+            // invoke and not return void :(
+            template <typename Fun, typename... Args>
+            CompleteType_t<FunctionReturnType<Fun, Args...>> complete_invoke(Fun&& fun, Args&&... args) {
+                using Invoker = CompleteInvoker<FunctionReturnType<Fun, Args...>>;
+                return Invoker::invoke(std::forward<Fun>(fun), std::forward<Args>(args)...);
+            }
+
+            /// Message handed to CATCH_RUNTIME_ERROR by `user_code` after a benchmark throws.
+            const std::string benchmarkErrorMsg = "a benchmark failed to run successfully";
+        } // namespace Detail
+
+        /// Runs `fun` through `complete_invoke`. If anything is caught, the active
+        /// exception is reported via `benchmarkFailed` on the result capture and
+        /// CATCH_RUNTIME_ERROR is raised with `benchmarkErrorMsg`.
+        template <typename Fun>
+        Detail::CompleteType_t<FunctionReturnType<Fun>> user_code(Fun&& fun) {
+            CATCH_TRY {
+                return Detail::complete_invoke(std::forward<Fun>(fun));
+            }
+            CATCH_CATCH_ALL {
+                getResultCapture().benchmarkFailed(translateActiveException());
+                CATCH_RUNTIME_ERROR(Detail::benchmarkErrorMsg);
+            }
+        }
+    } // namespace Benchmark
+} // namespace Catch
+
+// end catch_complete_invoke.hpp
+namespace Catch {
+    namespace Benchmark {
+        namespace Detail {
+            /// Abstract start/finish interface; `Chronometer` drives it through a pointer.
+            struct ChronometerConcept {
+                virtual void start() = 0;
+                virtual void finish() = 0;
+                virtual ~ChronometerConcept() = default;
+            };
+            /// `ChronometerConcept` implementation that records `Clock::now()` on
+            /// `start()` and on `finish()`.
+            template <typename Clock>
+            struct ChronometerModel final : public ChronometerConcept {
+                void start() override { started = Clock::now(); }
+                void finish() override { finished = Clock::now(); }
+
+                /// Time between the recorded start and finish points.
+                ClockDuration<Clock> elapsed() const { return finished - started; }
+
+                // Time points written by start() and finish() respectively.
+                TimePoint<Clock> started;
+                TimePoint<Clock> finished;
+            };
+        } // namespace Detail
+
+        /// Handle passed to benchmark bodies that want to time only part of their work.
+        /// It refers to a `ChronometerConcept` by pointer and carries the number of
+        /// runs to perform per measurement.
+        struct Chronometer {
+        public:
+            Chronometer(Detail::ChronometerConcept& meter, int k)
+                : impl(&meter)
+                , k(k) {}
+
+            /// Number of times `measure` invokes the callable.
+            int runs() const { return k; }
+
+            /// Times `runs()` invocations of `fun`. `fun` may take the run index
+            /// (an `int`) or no argument at all.
+            template <typename Fun>
+            void measure(Fun&& fun) {
+                using takes_run_index = is_callable<Fun(int)>;
+                measure(std::forward<Fun>(fun), takes_run_index());
+            }
+
+        private:
+            // `fun` takes no index: wrap it in a lambda that ignores the index.
+            template <typename Fun>
+            void measure(Fun&& fun, std::false_type) {
+                measure([&fun](int) { return fun(); }, std::true_type());
+            }
+
+            // `fun` takes the index: start the meter, call it k times, stop the meter.
+            // Optimizer barriers bracket the timed region.
+            template <typename Fun>
+            void measure(Fun&& fun, std::true_type) {
+                Detail::optimizer_barrier();
+                impl->start();
+                int run = 0;
+                while (run < k) {
+                    invoke_deoptimized(fun, run);
+                    ++run;
+                }
+                impl->finish();
+                Detail::optimizer_barrier();
+            }
+
+            Detail::ChronometerConcept* impl;
+            int k;
+        };
+    } // namespace Benchmark
+} // namespace Catch
+
+// end catch_chronometer.hpp
+// start catch_environment.hpp
+
+// Environment information
+
+
+namespace Catch {
+    namespace Benchmark {
+        /// A mean duration together with the outlier classification of the samples
+        /// it was computed from.
+        template <typename Duration>
+        struct EnvironmentEstimate {
+            Duration mean;
+            OutlierClassification outliers;
+
+            /// Converts to an estimate expressed in another duration type.
+            template <typename Duration2>
+            operator EnvironmentEstimate<Duration2>() const {
+                return EnvironmentEstimate<Duration2>{ mean, outliers };
+            }
+        };
+
+        /// Estimates describing the clock that benchmarks are timed with.
+        template <typename Clock>
+        struct Environment {
+            using clock_type = Clock;
+            EnvironmentEstimate<FloatDuration<Clock>> clock_resolution;
+            EnvironmentEstimate<FloatDuration<Clock>> clock_cost;
+        };
+    } // namespace Benchmark
+} // namespace Catch
+
+// end catch_environment.hpp
+// start catch_execution_plan.hpp
+
+ // Execution plan
+
+
+// start catch_benchmark_function.hpp
+
+ // Dumb std::function implementation for consistent call overhead
+
+
+#include <cassert>
+#include <type_traits>
+#include <utility>
+#include <memory>
+
+namespace Catch {
+    namespace Benchmark {
+        namespace Detail {
+            /// Shorthand for `std::decay<T>::type`.
+            template <typename T>
+            using Decay = typename std::decay<T>::type;
+            /// True when `T` and `U` are the same type after decay. Used to keep
+            /// BenchmarkFunction's converting constructor from matching BenchmarkFunction itself.
+            template <typename T, typename U>
+            struct is_related
+                : std::is_same<Decay<T>, Decay<U>> {};
+
+            /// We need to reinvent std::function because every piece of code that might add overhead
+            /// in a measurement context needs to have consistent performance characteristics so that we
+            /// can account for it in the measurement.
+            /// Implementations of std::function with optimizations that aren't always applicable, like
+            /// small buffer optimizations, are not uncommon.
+            /// This is effectively an implementation of std::function without any such optimizations;
+            /// it may be slow, but it is consistently slow.
+            struct BenchmarkFunction {
+            private:
+                /// Type-erased interface: invoke with a Chronometer, or clone polymorphically.
+                struct callable {
+                    virtual void call(Chronometer meter) const = 0;
+                    virtual callable* clone() const = 0;
+                    virtual ~callable() = default;
+                };
+                /// Holds a concrete callable of type `Fun` behind the `callable` interface.
+                template <typename Fun>
+                struct model : public callable {
+                    model(Fun&& fun) : fun(std::move(fun)) {}
+                    model(Fun const& fun) : fun(fun) {}
+
+                    /// Returns a heap-allocated copy; the caller takes ownership.
+                    model<Fun>* clone() const override { return new model<Fun>(*this); }
+
+                    /// Dispatches on whether `Fun` can be called with a Chronometer.
+                    void call(Chronometer meter) const override {
+                        call(meter, is_callable<Fun(Chronometer)>());
+                    }
+                    // `Fun` accepts a Chronometer: it does its own measuring.
+                    void call(Chronometer meter, std::true_type) const {
+                        fun(meter);
+                    }
+                    // `Fun` does not accept a Chronometer: let the meter measure it.
+                    void call(Chronometer meter, std::false_type) const {
+                        meter.measure(fun);
+                    }
+
+                    Fun fun;
+                };
+
+                /// Callable stored by a default-constructed BenchmarkFunction.
+                struct do_nothing { void operator()() const {} };
+
+                template <typename T>
+                BenchmarkFunction(model<T>* c) : f(c) {}
+
+            public:
+                /// Creates a function that wraps `do_nothing`.
+                BenchmarkFunction()
+                    : f(new model<do_nothing>{ {} }) {}
+
+                /// Wraps any callable other than a BenchmarkFunction itself
+                /// (copy and move are handled by the constructors below).
+                template <typename Fun,
+                    typename std::enable_if<!is_related<Fun, BenchmarkFunction>::value, int>::type = 0>
+                BenchmarkFunction(Fun&& fun)
+                    : f(new model<typename std::decay<Fun>::type>(std::forward<Fun>(fun))) {}
+
+                // Moving only transfers the owning pointer and cannot throw, so it is
+                // declared noexcept.
+                BenchmarkFunction(BenchmarkFunction&& that) noexcept
+                    : f(std::move(that.f)) {}
+
+                /// Deep copy through `callable::clone()`.
+                BenchmarkFunction(BenchmarkFunction const& that)
+                    : f(that.f->clone()) {}
+
+                BenchmarkFunction& operator=(BenchmarkFunction&& that) noexcept {
+                    f = std::move(that.f);
+                    return *this;
+                }
+
+                /// Deep copy through `callable::clone()`; the clone is created before the
+                /// previously held callable is released.
+                BenchmarkFunction& operator=(BenchmarkFunction const& that) {
+                    f.reset(that.f->clone());
+                    return *this;
+                }
+
+                /// Invokes the wrapped callable with `meter`.
+                void operator()(Chronometer meter) const { f->call(meter); }
+
+            private:
+                std::unique_ptr<callable> f;
+            };
+        } // namespace Detail
+    } // namespace Benchmark
+} // namespace Catch
+
+// end catch_benchmark_function.hpp
+// start catch_repeat.hpp
+
+// repeat algorithm
+
+
+#include <type_traits>
+#include <utility>
+
+namespace Catch {
+    namespace Benchmark {
+        namespace Detail {
+            /// Function object that calls the stored callable `k` times when invoked with `k`.
+            template <typename Fun>
+            struct repeater {
+                Fun fun;
+
+                void operator()(int k) const {
+                    int remaining = k;
+                    while (remaining > 0) {
+                        fun();
+                        --remaining;
+                    }
+                }
+            };
+
+            /// Wraps `fun` (stored by decayed value) in a `repeater`.
+            template <typename Fun>
+            repeater<typename std::decay<Fun>::type> repeat(Fun&& fun) {
+                return repeater<typename std::decay<Fun>::type>{ std::forward<Fun>(fun) };
+            }
+        } // namespace Detail
+    } // namespace Benchmark
+} // namespace Catch
+
+// end catch_repeat.hpp
+// start catch_run_for_at_least.hpp
+
+// Run a function for a minimum amount of time
+
+
+// start catch_measure.hpp
+
+// Measure
+
+
+// start catch_timing.hpp
+
+// Timing
+
+
+#include <tuple>
+#include <type_traits>
+
+namespace Catch {
+    namespace Benchmark {
+        /// Outcome of one timed invocation: how long it took, what the callable
+        /// returned, and the iteration count associated with the measurement.
+        template <typename Duration, typename Result>
+        struct Timing {
+            Duration elapsed;
+            Result result;
+            int iterations;
+        };
+        /// `Timing` for calling `Func` with `Args...` under `Clock`: the duration is the
+        /// clock's native duration and a `void` result is replaced by its complete stand-in type.
+        template <typename Clock, typename Func, typename... Args>
+        using TimingOf = Timing<ClockDuration<Clock>, Detail::CompleteType_t<FunctionReturnType<Func, Args...>>>;
+    } // namespace Benchmark
+} // namespace Catch
+
+// end catch_timing.hpp
+#include <utility>
+
+namespace Catch {
+    namespace Benchmark {
+        namespace Detail {
+            /// Calls `fun(args...)` once and reports the `Clock` time spent around the
+            /// call together with the call's result; `iterations` is set to 1.
+            template <typename Clock, typename Fun, typename... Args>
+            TimingOf<Clock, Fun, Args...> measure(Fun&& fun, Args&&... args) {
+                auto begun = Clock::now();
+                auto&& outcome = Detail::complete_invoke(fun, std::forward<Args>(args)...);
+                auto ended = Clock::now();
+                auto spent = ended - begun;
+                return { spent, std::forward<decltype(outcome)>(outcome), 1 };
+            }
+        } // namespace Detail
+    } // namespace Benchmark
+} // namespace Catch
+
+// end catch_measure.hpp
+#include <utility>
+#include <type_traits>
+
+namespace Catch {
+    namespace Benchmark {
+        namespace Detail {
+            /// `fun` takes an iteration count: time a single call `fun(iters)`.
+            template <typename Clock, typename Fun>
+            TimingOf<Clock, Fun, int> measure_one(Fun&& fun, int iters, std::false_type) {
+                return Detail::measure<Clock>(fun, iters);
+            }
+            /// `fun` takes a Chronometer: hand it one that is bound to a local meter and
+            /// report whatever that meter recorded.
+            template <typename Clock, typename Fun>
+            TimingOf<Clock, Fun, Chronometer> measure_one(Fun&& fun, int iters, std::true_type) {
+                Detail::ChronometerModel<Clock> stopwatch;
+                auto&& outcome = Detail::complete_invoke(fun, Chronometer(stopwatch, iters));
+
+                return { stopwatch.elapsed(), std::move(outcome), iters };
+            }
+
+            /// Argument type `run_for_at_least` passes to `Fun`: `Chronometer` when `Fun` is
+            /// callable with one, otherwise `int`. The `Clock` parameter is not used here.
+            template <typename Clock, typename Fun>
+            using run_for_at_least_argument_t = typename std::conditional<is_callable<Fun(Chronometer)>::value, Chronometer, int>::type;
+
+            /// Thrown by `run_for_at_least` when no iteration count below its limit makes a
+            /// run last for the requested duration.
+            struct optimized_away_error : std::exception {
+                const char* what() const noexcept override {
+                    return "could not measure benchmark, maybe it was optimized away";
+                }
+            };
+
+            /// Measures `fun` with a doubling iteration count, starting at `seed`, until one
+            /// measurement lasts at least `how_long`, and returns that measurement.
+            /// Throws `optimized_away_error` once the count reaches 2^30 without success.
+            template <typename Clock, typename Fun>
+            TimingOf<Clock, Fun, run_for_at_least_argument_t<Clock, Fun>> run_for_at_least(ClockDuration<Clock> how_long, int seed, Fun&& fun) {
+                using takes_chronometer = is_callable<Fun(Chronometer)>;
+                for (auto iters = seed; iters < (1 << 30); iters *= 2) {
+                    auto&& attempt = measure_one<Clock>(fun, iters, takes_chronometer());
+
+                    if (attempt.elapsed >= how_long) {
+                        return { attempt.elapsed, std::move(attempt.result), iters };
+                    }
+                }
+                throw optimized_away_error{};
+            }
+        } // namespace Detail
+    } // namespace Benchmark
+} // namespace Catch
+
+// end catch_run_for_at_least.hpp
+#include <algorithm>
+
+namespace Catch {
+    namespace Benchmark {
+        /// Everything needed to run one benchmark: how many iterations make up a sample,
+        /// the estimated total duration, the benchmark itself and the warm-up settings.
+        template <typename Duration>
+        struct ExecutionPlan {
+            int iterations_per_sample;
+            Duration estimated_duration;
+            Detail::BenchmarkFunction benchmark;
+            Duration warmup_time;
+            int warmup_iterations;
+
+            /// Converts to a plan expressed in another duration type.
+            template <typename Duration2>
+            operator ExecutionPlan<Duration2>() const {
+                return { iterations_per_sample, estimated_duration, benchmark, warmup_time, warmup_iterations };
+            }
+
+            /// Warms up, then collects `cfg.benchmarkSamples()` samples. Each sample is the
+            /// time for `iterations_per_sample` iterations minus the mean clock cost
+            /// (clamped at zero), divided by `iterations_per_sample`.
+            template <typename Clock>
+            std::vector<FloatDuration<Clock>> run(const IConfig &cfg, Environment<FloatDuration<Clock>> env) const {
+                // warmup a bit
+                Detail::run_for_at_least<Clock>(std::chrono::duration_cast<ClockDuration<Clock>>(warmup_time), warmup_iterations, Detail::repeat(now<Clock>{}));
+
+                std::vector<FloatDuration<Clock>> times;
+                times.reserve(cfg.benchmarkSamples());
+                for (auto remaining = cfg.benchmarkSamples(); remaining > 0; --remaining) {
+                    Detail::ChronometerModel<Clock> model;
+                    this->benchmark(Chronometer(model, iterations_per_sample));
+                    auto sample_time = model.elapsed() - env.clock_cost.mean;
+                    if (sample_time < FloatDuration<Clock>::zero()) {
+                        sample_time = FloatDuration<Clock>::zero();
+                    }
+                    times.push_back(sample_time / iterations_per_sample);
+                }
+                return times;
+            }
+        };
+    } // namespace Benchmark
+} // namespace Catch
+
+// end catch_execution_plan.hpp
+// start catch_estimate_clock.hpp
+
+ // Environment measurement
+
+
+// start catch_stats.hpp
+
+// Statistical analysis tools
+
+
+#include <algorithm>
+#include <functional>
+#include <vector>
+#include <iterator>
+#include <numeric>
+#include <tuple>
+#include <cmath>
+#include <utility>
+#include <cstddef>
+#include <random>
+
+namespace Catch {
+    namespace Benchmark {
+        namespace Detail {
+            /// A collection of estimator values, e.g. the output of `resample` or `jackknife`.
+            using sample = std::vector<double>;
+
+            // Defined out of line. NOTE(review): presumably returns the k-th q-quantile of
+            // [first, last); classify_outliers calls it with (1, 4) and (3, 4) — confirm
+            // against the definition, including whether it reorders the range.
+            double weighted_average_quantile(int k, int q, std::vector<double>::iterator first, std::vector<double>::iterator last);
+
+            /// Counts outliers in [first, last) relative to the quartiles of the data.
+            /// With iqr = q3 - q1, values below q1 - 3*iqr are low-severe, below
+            /// q1 - 1.5*iqr low-mild, above q3 + 3*iqr high-severe and above
+            /// q3 + 1.5*iqr high-mild. Every value increments `samples_seen`.
+            template <typename Iterator>
+            OutlierClassification classify_outliers(Iterator first, Iterator last) {
+                // The quantile helper takes mutable iterators, so it works on a private copy.
+                std::vector<double> copy(first, last);
+
+                const auto q1 = weighted_average_quantile(1, 4, copy.begin(), copy.end());
+                const auto q3 = weighted_average_quantile(3, 4, copy.begin(), copy.end());
+                const auto iqr = q3 - q1;
+                const auto low_severe_limit = q1 - (iqr * 3.);
+                const auto low_mild_limit = q1 - (iqr * 1.5);
+                const auto high_mild_limit = q3 + (iqr * 1.5);
+                const auto high_severe_limit = q3 + (iqr * 3.);
+
+                OutlierClassification o;
+                for (auto it = first; it != last; ++it) {
+                    auto&& t = *it;
+                    if (t < low_severe_limit) {
+                        ++o.low_severe;
+                    } else if (t < low_mild_limit) {
+                        ++o.low_mild;
+                    } else if (t > high_severe_limit) {
+                        ++o.high_severe;
+                    } else if (t > high_mild_limit) {
+                        ++o.high_mild;
+                    }
+                    ++o.samples_seen;
+                }
+                return o;
+            }
+
+            /// Arithmetic mean of [first, last): the sum accumulated as `double`, divided
+            /// by the iterator distance `last - first`.
+            template <typename Iterator>
+            double mean(Iterator first, Iterator last) {
+                const double total = std::accumulate(first, last, 0.);
+                return total / (last - first);
+            }
+
+            /// Draws `resamples` resamples from [first, last): each one picks n = last - first
+            /// elements uniformly at random with replacement using `rng`, then applies
+            /// `estimator` to them. Returns the estimator values sorted in ascending order.
+            template <typename URng, typename Iterator, typename Estimator>
+            sample resample(URng& rng, int resamples, Iterator first, Iterator last, Estimator& estimator) {
+                auto n = last - first;
+                std::uniform_int_distribution<decltype(n)> dist(0, n - 1);
+
+                sample out;
+                out.reserve(resamples);
+                for (int round = 0; round < resamples; ++round) {
+                    std::vector<double> resampled;
+                    resampled.reserve(n);
+                    for (decltype(n) drawn = 0; drawn < n; ++drawn) {
+                        resampled.push_back(first[dist(rng)]);
+                    }
+                    out.push_back(estimator(resampled.begin(), resampled.end()));
+                }
+                std::sort(out.begin(), out.end());
+                return out;
+            }
+
+            /// Leave-one-out evaluation: for each element of [first, last), applies
+            /// `estimator` to the range with that element removed and collects the results.
+            /// Note that the input range is reordered in place by the swaps below.
+            template <typename Estimator, typename Iterator>
+            sample jackknife(Estimator&& estimator, Iterator first, Iterator last) {
+                auto n = last - first;
+                auto second = std::next(first);
+                sample results;
+                results.reserve(n);
+
+                for (auto it = first; it != last; ++it) {
+                    // Move the element to leave out into the front slot; [second, last)
+                    // then holds every other element, with no copy of the range needed.
+                    std::iter_swap(it, first);
+                    results.push_back(estimator(second, last));
+                }
+
+                return results;
+            }
+
+            /// Cumulative distribution function of the standard normal distribution,
+            /// computed as erfc(-x / sqrt(2)) / 2.
+            inline double normal_cdf(double x) {
+                const double scaled = -x / std::sqrt(2.0);
+                return 0.5 * std::erfc(scaled);
+            }
+
+            double erfc_inv(double x);
+
+            double normal_quantile(double p);
+
+            /// Builds a confidence interval for `estimator` over [first, last) from an existing
+            /// set of resampled estimator values (`resample`).
+            /// Returns {point estimate, lower bound, upper bound, confidence_level}.
+            /// NOTE(review): the bias/acceleration correction looks like the BCa bootstrap —
+            /// confirm. `resample` is indexed by rank, so it is presumably expected to be
+            /// sorted ascending, as the output of `resample()` is.
+            /// The input range is passed to `jackknife`, which reorders it.
+            template <typename Iterator, typename Estimator>
+            Estimate<double> bootstrap(double confidence_level, Iterator first, Iterator last, sample const& resample, Estimator&& estimator) {
+                auto n_samples = last - first;
+
+                double point = estimator(first, last);
+                // Degenerate case with a single sample
+                if (n_samples == 1) return { point, point, point, confidence_level };
+
+                // Acceleration term: built from the squared and cubed deviations of the
+                // leave-one-out estimates from their mean.
+                sample jack = jackknife(estimator, first, last);
+                double jack_mean = mean(jack.begin(), jack.end());
+                double sum_squares, sum_cubes;
+                std::tie(sum_squares, sum_cubes) = std::accumulate(jack.begin(), jack.end(), std::make_pair(0., 0.), [jack_mean](std::pair<double, double> sqcb, double x) -> std::pair<double, double> {
+                    auto d = jack_mean - x;
+                    auto d2 = d * d;
+                    auto d3 = d2 * d;
+                    return { sqcb.first + d2, sqcb.second + d3 };
+                });
+
+                // NOTE(review): if every leave-one-out estimate is identical, sum_squares is 0
+                // and this is 0/0 — confirm callers cannot reach that with prob_n != 0.
+                double accel = sum_cubes / (6 * std::pow(sum_squares, 1.5));
+                int n = static_cast<int>(resample.size());
+                // Fraction of resampled estimates strictly below the point estimate.
+                double prob_n = std::count_if(resample.begin(), resample.end(), [point](double x) { return x < point; }) / (double)n;
+                // degenerate case with uniform samples
+                if (prob_n == 0) return { point, point, point, confidence_level };
+
+                double bias = normal_quantile(prob_n);
+                double z1 = normal_quantile((1. - confidence_level) / 2.);
+
+                // Maps a standard-normal value to a rank in the resample distribution.
+                auto cumn = [n](double x) -> int {
+                    return std::lround(normal_cdf(x) * n); };
+                // Applies the bias and acceleration correction to a normal value.
+                auto a = [bias, accel](double b) { return bias + b / (1. - accel * b); };
+                double b1 = bias + z1;
+                double b2 = bias - z1;
+                double a1 = a(b1);
+                double a2 = a(b2);
+                // NOTE(review): `lo` is clamped only from below and `hi` only from above;
+                // verify cumn(a1) can never reach n.
+                auto lo = std::max(cumn(a1), 0);
+                auto hi = std::min(cumn(a2), n - 1);
+
+                return { point, resample[lo], resample[hi], confidence_level };
+            }
+
+            double outlier_variance(Estimate<double> mean, Estimate<double> stddev, int n);
+
+            /// Result of `analyse_samples`: estimates of the mean and standard deviation,
+            /// plus the outlier variance figure.
+            struct bootstrap_analysis {
+                Estimate<double> mean;
+                Estimate<double> standard_deviation;
+                double outlier_variance;
+            };
+
+            bootstrap_analysis analyse_samples(double confidence_level, int n_resamples, std::vector<double>::iterator first, std::vector<double>::iterator last);
+        } // namespace Detail
+    } // namespace Benchmark
+} // namespace Catch
+
+// end catch_stats.hpp
+#include <algorithm>
+#include <iterator>
+#include <tuple>
+#include <vector>
+#include <cmath>
+
+namespace Catch {
+    namespace Benchmark {
+        namespace Detail {
+            /// Takes k + 1 consecutive `Clock` readings and returns the k differences between
+            /// neighbouring readings, as tick counts converted to `double`.
+            template <typename Clock>
+            std::vector<double> resolution(int k) {
+                std::vector<TimePoint<Clock>> times;
+                times.reserve(k + 1);
+                for (int taken = 0; taken < k + 1; ++taken) {
+                    times.push_back(now<Clock>{}());
+                }
+
+                std::vector<double> deltas;
+                deltas.reserve(k);
+                for (auto later = std::next(times.begin()); later != times.end(); ++later) {
+                    auto earlier = std::prev(later);
+                    deltas.push_back(static_cast<double>((*later - *earlier).count()));
+                }
+
+                return deltas;
+            }
+
+            // Tuning constants for environment measurement and benchmark preparation.
+            // Warm-up iteration count stored in every ExecutionPlan by Benchmark::prepare.
+            const auto warmup_iterations = 10000;
+            // Minimum run time and starting iteration count used by warmup().
+            const auto warmup_time = std::chrono::milliseconds(100);
+            // Benchmark::prepare multiplies the mean clock resolution by this to get the minimum sample time.
+            const auto minimum_ticks = 1000;
+            const auto warmup_seed = 10000;
+            // Minimum run time used by estimate_clock_resolution().
+            const auto clock_resolution_estimation_time = std::chrono::milliseconds(500);
+            // estimate_clock_cost() samples for min(resolution * tick_limit, time_limit) in total.
+            const auto clock_cost_estimation_time_limit = std::chrono::seconds(1);
+            const auto clock_cost_estimation_tick_limit = 100000;
+            // Minimum run time and starting iteration count for calibrating estimate_clock_cost().
+            const auto clock_cost_estimation_time = std::chrono::milliseconds(10);
+            const auto clock_cost_estimation_iterations = 10000;
+
+            /// Runs `resolution<Clock>` for at least `warmup_time`, starting from `warmup_seed`
+            /// iterations, and returns the iteration count that was finally needed.
+            template <typename Clock>
+            int warmup() {
+                auto&& timing = run_for_at_least<Clock>(std::chrono::duration_cast<ClockDuration<Clock>>(warmup_time), warmup_seed, &resolution<Clock>);
+                return timing.iterations;
+            }
+            /// Runs `resolution<Clock>` for at least `clock_resolution_estimation_time`, starting
+            /// from `iterations`, and summarises the resulting deltas as their mean plus an
+            /// outlier classification.
+            template <typename Clock>
+            EnvironmentEstimate<FloatDuration<Clock>> estimate_clock_resolution(int iterations) {
+                auto deltas = run_for_at_least<Clock>(std::chrono::duration_cast<ClockDuration<Clock>>(clock_resolution_estimation_time), iterations, &resolution<Clock>)
+                    .result;
+                const FloatDuration<Clock> average(mean(deltas.begin(), deltas.end()));
+                auto outliers = classify_outliers(deltas.begin(), deltas.end());
+                return { average, outliers };
+            }
+            /// Estimates the cost of a single `Clock::now()` call, in `Clock` ticks, and
+            /// classifies the outliers among the samples taken.
+            /// `resolution` is the measured clock resolution; it bounds the sampling time.
+            template <typename Clock>
+            EnvironmentEstimate<FloatDuration<Clock>> estimate_clock_cost(FloatDuration<Clock> resolution) {
+                // Total sampling budget: resolution * tick limit, capped at the time limit.
+                auto time_limit = std::min(resolution * clock_cost_estimation_tick_limit, FloatDuration<Clock>(clock_cost_estimation_time_limit));
+                // Measures how long k back-to-back Clock::now() calls take. The result is
+                // stored in a volatile local so that the call is not removed.
+                auto time_clock = [](int k) {
+                    return Detail::measure<Clock>([k] {
+                        for (int i = 0; i < k; ++i) {
+                            volatile auto ignored = Clock::now();
+                            (void)ignored;
+                        }
+                    }).elapsed;
+                };
+                // Result discarded. NOTE(review): presumably a warm-up call — confirm.
+                time_clock(1);
+                int iters = clock_cost_estimation_iterations;
+                // Find an iteration count whose run lasts at least clock_cost_estimation_time.
+                auto&& r = run_for_at_least<Clock>(std::chrono::duration_cast<ClockDuration<Clock>>(clock_cost_estimation_time), iters, time_clock);
+                std::vector<double> times;
+                // Number of runs of that length that fit into the sampling budget.
+                int nsamples = static_cast<int>(std::ceil(time_limit / r.elapsed));
+                times.reserve(nsamples);
+                // Each sample is the per-call cost: elapsed time divided by the iteration count.
+                std::generate_n(std::back_inserter(times), nsamples, [time_clock, &r] {
+                    return static_cast<double>((time_clock(r.iterations) / r.iterations).count());
+                });
+                return {
+                    FloatDuration<Clock>(mean(times.begin(), times.end())),
+                    classify_outliers(times.begin(), times.end()),
+                };
+            }
+
+            /// Measures the clock resolution and clock cost for `Clock` once and returns the
+            /// cached result on every later call.
+            template <typename Clock>
+            Environment<FloatDuration<Clock>> measure_environment() {
+                // Owned by a function-local static smart pointer (one per Clock), so the
+                // cached environment is released at program exit instead of being leaked
+                // through a raw `new`.
+                static std::unique_ptr<Environment<FloatDuration<Clock>>> env;
+                if (env) {
+                    return *env;
+                }
+
+                auto iters = Detail::warmup<Clock>();
+                auto resolution = Detail::estimate_clock_resolution<Clock>(iters);
+                auto cost = Detail::estimate_clock_cost<Clock>(resolution.mean);
+
+                env.reset(new Environment<FloatDuration<Clock>>{ resolution, cost });
+                return *env;
+            }
+        } // namespace Detail
+    } // namespace Benchmark
+} // namespace Catch
+
+// end catch_estimate_clock.hpp
+// start catch_analyse.hpp
+
+ // Run and analyse one benchmark
+
+
+// start catch_sample_analysis.hpp
+
+// Benchmark results
+
+
+#include <algorithm>
+#include <vector>
+#include <string>
+#include <iterator>
+
+namespace Catch {
+    namespace Benchmark {
+        /// Analysed benchmark result: the raw samples, estimates of their mean and
+        /// standard deviation, the outlier classification and the outlier variance.
+        template <typename Duration>
+        struct SampleAnalysis {
+            std::vector<Duration> samples;
+            Estimate<Duration> mean;
+            Estimate<Duration> standard_deviation;
+            OutlierClassification outliers;
+            double outlier_variance;
+
+            /// Converts to an analysis expressed in another duration type; every sample is
+            /// converted individually.
+            template <typename Duration2>
+            operator SampleAnalysis<Duration2>() const {
+                std::vector<Duration2> converted;
+                converted.reserve(samples.size());
+                for (Duration d : samples) {
+                    converted.push_back(Duration2(d));
+                }
+                return {
+                    std::move(converted),
+                    mean,
+                    standard_deviation,
+                    outliers,
+                    outlier_variance,
+                };
+            }
+        };
+    } // namespace Benchmark
+} // namespace Catch
+
+// end catch_sample_analysis.hpp
+#include <algorithm>
+#include <iterator>
+#include <vector>
+
+namespace Catch {
+    namespace Benchmark {
+        namespace Detail {
+            /// Turns the samples in [first, last) into a `SampleAnalysis`.
+            /// When `cfg.benchmarkNoAnalysis()` is set, only the plain mean is computed and
+            /// the other statistics are zero or empty. Otherwise the samples go through
+            /// `analyse_samples` and `classify_outliers`.
+            template <typename Duration, typename Iterator>
+            SampleAnalysis<Duration> analyse(const IConfig &cfg, Environment<Duration>, Iterator first, Iterator last) {
+                if (cfg.benchmarkNoAnalysis()) {
+                    std::vector<Duration> samples;
+                    samples.reserve(last - first);
+
+                    Duration mean = Duration(0);
+                    int seen = 0;
+                    for (auto it = first; it < last; ++it) {
+                        const Duration value(*it);
+                        samples.push_back(value);
+                        mean += value;
+                        ++seen;
+                    }
+                    mean /= seen;
+
+                    return {
+                        std::move(samples),
+                        Estimate<Duration>{mean, mean, mean, 0.0},
+                        Estimate<Duration>{Duration(0), Duration(0), Duration(0), 0.0},
+                        OutlierClassification{},
+                        0.0
+                    };
+                }
+
+                // The statistics helpers work on raw tick counts.
+                std::vector<double> raw;
+                raw.reserve(last - first);
+                for (auto it = first; it != last; ++it) {
+                    Duration d = *it;
+                    raw.push_back(d.count());
+                }
+
+                auto analysis = Catch::Benchmark::Detail::analyse_samples(cfg.benchmarkConfidenceInterval(), cfg.benchmarkResamples(), raw.begin(), raw.end());
+                auto outliers = Catch::Benchmark::Detail::classify_outliers(raw.begin(), raw.end());
+
+                // Re-wraps an estimate of tick counts as an estimate of durations.
+                auto to_duration_estimate = [](Estimate<double> e) {
+                    return Estimate<Duration> {
+                        Duration(e.point),
+                        Duration(e.lower_bound),
+                        Duration(e.upper_bound),
+                        e.confidence_interval,
+                    };
+                };
+                std::vector<Duration> typed;
+                typed.reserve(raw.size());
+                for (double ticks : raw) {
+                    typed.push_back(Duration(ticks));
+                }
+                return {
+                    std::move(typed),
+                    to_duration_estimate(analysis.mean),
+                    to_duration_estimate(analysis.standard_deviation),
+                    outliers,
+                    analysis.outlier_variance,
+                };
+            }
+        } // namespace Detail
+    } // namespace Benchmark
+} // namespace Catch
+
+// end catch_analyse.hpp
+#include <algorithm>
+#include <functional>
+#include <string>
+#include <vector>
+#include <cmath>
+
+namespace Catch {
+    namespace Benchmark {
+        // A named benchmark. It stores the callable under test and, when run,
+        // plans the measurement, executes it, analyses the samples and reports
+        // each stage through the active result capture.
+        struct Benchmark {
+            // Creates a benchmark without a callable; one is assigned later via
+            // operator= (which also runs the benchmark).
+            Benchmark(std::string &&name)
+                : name(std::move(name)) {}
+
+            // Creates a benchmark that already holds its callable.
+            // `func` is a forwarding reference, so it is forwarded instead of
+            // unconditionally moved: an lvalue callable is copied and the
+            // caller's object is left intact rather than silently moved-from.
+            template <class FUN>
+            Benchmark(std::string &&name, FUN &&func)
+                : fun(std::forward<FUN>(func)), name(std::move(name)) {}
+
+            // Builds the execution plan for this benchmark from the measured
+            // environment `env` and the configuration `cfg`.
+            template <typename Clock>
+            ExecutionPlan<FloatDuration<Clock>> prepare(const IConfig &cfg, Environment<FloatDuration<Clock>> env) const {
+                // Shortest duration a sample should last: the mean clock
+                // resolution scaled by Detail::minimum_ticks.
+                auto min_time = env.clock_resolution.mean * Detail::minimum_ticks;
+                // Trial run length: the larger of min_time and the configured warm-up time.
+                auto run_time = std::max(min_time, std::chrono::duration_cast<decltype(min_time)>(cfg.benchmarkWarmupTime()));
+                auto&& test = Detail::run_for_at_least<Clock>(std::chrono::duration_cast<ClockDuration<Clock>>(run_time), 1, fun);
+                // Iterations needed for one sample to last min_time, rounded up.
+                int new_iters = static_cast<int>(std::ceil(min_time * test.iterations / test.elapsed));
+                // The second value is the estimated total duration:
+                // time per iteration * iterations per sample * number of samples.
+                return { new_iters, test.elapsed / test.iterations * new_iters * cfg.benchmarkSamples(), fun, std::chrono::duration_cast<FloatDuration<Clock>>(cfg.benchmarkWarmupTime()), Detail::warmup_iterations };
+            }
+
+            // Runs the benchmark: measures the environment, prepares a plan,
+            // collects samples, analyses them and reports preparing / starting /
+            // ended events to the result capture.
+            template <typename Clock = default_clock>
+            void run() {
+                IConfigPtr cfg = getCurrentContext().getConfig();
+
+                auto env = Detail::measure_environment<Clock>();
+
+                getResultCapture().benchmarkPreparing(name);
+                CATCH_TRY{
+                    auto plan = user_code([&] {
+                        return prepare<Clock>(*cfg, env);
+                    });
+
+                    BenchmarkInfo info {
+                        name,
+                        plan.estimated_duration.count(),
+                        plan.iterations_per_sample,
+                        cfg->benchmarkSamples(),
+                        cfg->benchmarkResamples(),
+                        env.clock_resolution.mean.count(),
+                        env.clock_cost.mean.count()
+                    };
+
+                    getResultCapture().benchmarkStarting(info);
+
+                    auto samples = user_code([&] {
+                        return plan.template run<Clock>(*cfg, env);
+                    });
+
+                    auto analysis = Detail::analyse(*cfg, env, samples.begin(), samples.end());
+                    BenchmarkStats<FloatDuration<Clock>> stats{ info, analysis.samples, analysis.mean, analysis.standard_deviation, analysis.outliers, analysis.outlier_variance };
+                    getResultCapture().benchmarkEnded(stats);
+
+                } CATCH_CATCH_ALL{
+                    if (translateActiveException() != Detail::benchmarkErrorMsg) // benchmark errors have been reported, otherwise rethrow.
+                        std::rethrow_exception(std::current_exception());
+                }
+            }
+
+            // sets lambda to be used in fun *and* executes benchmark!
+            // Disabled for arguments related to Benchmark itself so that it
+            // does not take over the role of copy/move assignment.
+            template <typename Fun,
+                typename std::enable_if<!Detail::is_related<Fun, Benchmark>::value, int>::type = 0>
+                Benchmark & operator=(Fun func) {
+                fun = Detail::BenchmarkFunction(func);
+                run();
+                return *this;
+            }
+
+            // Always true; exists so a Benchmark can be declared inside an
+            // if-condition (see the INTERNAL_CATCH_BENCHMARK macros).
+            explicit operator bool() {
+                return true;
+            }
+
+        private:
+            Detail::BenchmarkFunction fun; // callable under test
+            std::string name;              // name reported to the result capture
+        };
+    }
+} // namespace Catch
+
+// Select the first / second argument of a macro argument list.
+#define INTERNAL_CATCH_GET_1_ARG(arg1, arg2, ...) arg1
+#define INTERNAL_CATCH_GET_2_ARG(arg1, arg2, ...) arg2
+
+// Declares a Benchmark called `BenchmarkName` inside an if-condition (its
+// explicit operator bool always yields true) and assigns it the lambda whose
+// body follows the macro use. That assignment stores the lambda and runs the
+// benchmark. The lambda takes one int parameter named `benchmarkIndex`.
+#define INTERNAL_CATCH_BENCHMARK(BenchmarkName, name, benchmarkIndex)\
+    if( Catch::Benchmark::Benchmark BenchmarkName{name} ) \
+        BenchmarkName = [&](int benchmarkIndex)
+
+// Same as INTERNAL_CATCH_BENCHMARK, but the lambda's parameter list is left
+// for the user to write after the macro.
+#define INTERNAL_CATCH_BENCHMARK_ADVANCED(BenchmarkName, name)\
+    if( Catch::Benchmark::Benchmark BenchmarkName{name} ) \
+        BenchmarkName = [&]
+
+// end catch_benchmark.hpp
+// start catch_constructor.hpp
+
+// Constructor and destructor helpers
+
+
+#include <type_traits>
+
+namespace Catch {
+    namespace Benchmark {
+        namespace Detail {
+            // Raw, suitably aligned storage for one T whose construction and
+            // destruction are triggered explicitly.
+            //
+            // Destruct == true : the destructor of ObjectStorage destroys the
+            //                    stored object; manual destruct() is disabled.
+            // Destruct == false: the destructor does nothing; the object is
+            //                    destroyed only via an explicit destruct() call.
+            //
+            // The storage does not track whether an object is currently alive;
+            // callers must pair construct() with the matching destruction.
+            template <typename T, bool Destruct>
+            struct ObjectStorage
+            {
+                using TStorage = typename std::aligned_storage<sizeof(T), std::alignment_of<T>::value>::type;
+
+                // Value-initializes the raw storage; no T is constructed.
+                ObjectStorage() : data() {}
+
+                // Copy-constructs a T in this storage from the one in `other`.
+                ObjectStorage(const ObjectStorage& other)
+                {
+                    new(&data) T(other.stored_object());
+                }
+
+                // Move-constructs a T in this storage from the one in `other`.
+                ObjectStorage(ObjectStorage&& other)
+                {
+                    new(&data) T(std::move(other.stored_object()));
+                }
+
+                ~ObjectStorage() { destruct_on_exit<T>(); }
+
+                // Constructs a T in place, forwarding `args` to its constructor.
+                template <typename... Args>
+                void construct(Args&&... args)
+                {
+                    new (&data) T(std::forward<Args>(args)...);
+                }
+
+                // Destroys the stored object. Only callable with the default
+                // template argument when Destruct is false.
+                template <bool AllowManualDestruction = !Destruct>
+                typename std::enable_if<AllowManualDestruction>::type destruct()
+                {
+                    stored_object().~T();
+                }
+
+            private:
+                // If this is a constructor benchmark, destruct the underlying object
+                template <typename U>
+                void destruct_on_exit(typename std::enable_if<Destruct, U>::type* = 0) { destruct<true>(); }
+                // Otherwise, don't
+                template <typename U>
+                void destruct_on_exit(typename std::enable_if<!Destruct, U>::type* = 0) { }
+
+                T& stored_object() {
+                    return *static_cast<T*>(static_cast<void*>(&data));
+                }
+
+                // In a const member `&data` is a pointer-to-const, so the cast
+                // must go through `void const*`; static_cast cannot drop const,
+                // and casting to `void*` made the copy constructor ill-formed.
+                T const& stored_object() const {
+                    return *static_cast<T const*>(static_cast<void const*>(&data));
+                }
+
+                TStorage data;
+            };
+        }
+
+        // ObjectStorage whose destructor destroys the stored object.
+        template <typename T>
+        using storage_for = Detail::ObjectStorage<T, true>;
+
+        // ObjectStorage whose stored object is destroyed only by an explicit
+        // destruct() call.
+        template <typename T>
+        using destructable_object = Detail::ObjectStorage<T, false>;
+    }
+}
+
+// end catch_constructor.hpp
+// end catch_benchmarking_all.hpp
+#endif
+
+#endif // ! CATCH_CONFIG_IMPL_ONLY
+
+#ifdef CATCH_IMPL
+// start catch_impl.hpp
+
+#ifdef __clang__
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wweak-vtables"
+#endif
+
+// Keep these here for external reporters
+// start catch_test_case_tracker.h
+
+#include <string>
+#include <vector>
+#include <memory>
+
+namespace Catch {
+namespace TestCaseTracking {
+
+    // Pairs a name with a source location; used as the lookup key for
+    // trackers (see ITracker::findChild).
+    struct NameAndLocation {
+        std::string name;
+        SourceLineInfo location;
+
+        NameAndLocation( std::string const& _name, SourceLineInfo const& _location );
+        // Two keys are equal only when both the name and the location match.
+        friend bool operator==(NameAndLocation const& lhs, NameAndLocation const& rhs) {
+            return lhs.name == rhs.name
+                && lhs.location == rhs.location;
+        }
+    };
+
+    class ITracker;
+
+    using ITrackerPtr = std::shared_ptr<ITracker>;
+
+    // Abstract interface of a node in the tracker tree. Each node is
+    // identified by a NameAndLocation and exposes state queries, state
+    // transitions and child management.
+    class  ITracker {
+        NameAndLocation m_nameAndLocation;
+
+    public:
+        ITracker(NameAndLocation const& nameAndLoc) :
+            m_nameAndLocation(nameAndLoc)
+        {}
+
+        // static queries
+        NameAndLocation const& nameAndLocation() const {
+            return m_nameAndLocation;
+        }
+
+        virtual ~ITracker();
+
+        // dynamic queries
+        virtual bool isComplete() const = 0; // Successfully completed or failed
+        virtual bool isSuccessfullyCompleted() const = 0;
+        virtual bool isOpen() const = 0; // Started but not complete
+        virtual bool hasChildren() const = 0;
+        virtual bool hasStarted() const = 0;
+
+        virtual ITracker& parent() = 0;
+
+        // actions
+        virtual void close() = 0; // Successfully complete
+        virtual void fail() = 0;
+        virtual void markAsNeedingAnotherRun() = 0;
+
+        // child management
+        virtual void addChild( ITrackerPtr const& child ) = 0;
+        virtual ITrackerPtr findChild( NameAndLocation const& nameAndLocation ) = 0;
+        virtual void openChild() = 0;
+
+        // Debug/ checking
+        virtual bool isSectionTracker() const = 0;
+        virtual bool isGeneratorTracker() const = 0;
+    };
+
+    // Owns the root tracker and records which tracker is current, together
+    // with the state of the run/cycle that is in progress.
+    class TrackerContext {
+
+        enum RunState {
+            NotStarted,
+            Executing,
+            CompletedCycle
+        };
+
+        ITrackerPtr m_rootTracker;
+        ITracker* m_currentTracker = nullptr; // non-owning
+        RunState m_runState = NotStarted;
+
+    public:
+
+        ITracker& startRun();
+        void endRun();
+
+        void startCycle();
+        void completeCycle();
+
+        bool completedCycle() const;
+        ITracker& currentTracker();
+        void setCurrentTracker( ITracker* tracker );
+    };
+
+    // Common ITracker implementation: stores the owning context, a non-owning
+    // parent pointer, the list of children and the node's cycle state.
+    class TrackerBase : public ITracker {
+    protected:
+        // Life-cycle states of a single tracker node.
+        enum CycleState {
+            NotStarted,
+            Executing,
+            ExecutingChildren,
+            NeedsAnotherRun,
+            CompletedSuccessfully,
+            Failed
+        };
+
+        using Children = std::vector<ITrackerPtr>;
+        TrackerContext& m_ctx;
+        ITracker* m_parent; // non-owning
+        Children m_children;
+        CycleState m_runState = NotStarted;
+
+    public:
+        TrackerBase( NameAndLocation const& nameAndLocation, TrackerContext& ctx, ITracker* parent );
+
+        bool isComplete() const override;
+        bool isSuccessfullyCompleted() const override;
+        bool isOpen() const override;
+        bool hasChildren() const override;
+        // True once the state has left NotStarted.
+        bool hasStarted() const override {
+            return m_runState != NotStarted;
+        }
+
+        void addChild( ITrackerPtr const& child ) override;
+
+        ITrackerPtr findChild( NameAndLocation const& nameAndLocation ) override;
+        ITracker& parent() override;
+
+        void openChild() override;
+
+        bool isSectionTracker() const override;
+        bool isGeneratorTracker() const override;
+
+        void open();
+
+        void close() override;
+        void fail() override;
+        void markAsNeedingAnotherRun() override;
+
+    private:
+        void moveToParent();
+        void moveToThis();
+    };
+
+    // Tracker for a section. In addition to the TrackerBase state it keeps
+    // a list of filter strings and the whitespace-trimmed section name.
+    class SectionTracker : public TrackerBase {
+        std::vector<std::string> m_filters;
+        std::string m_trimmed_name;
+    public:
+        SectionTracker( NameAndLocation const& nameAndLocation, TrackerContext& ctx, ITracker* parent );
+
+        bool isSectionTracker() const override;
+
+        bool isComplete() const override;
+
+        // Returns the section tracker for `nameAndLocation` within `ctx`.
+        // NOTE(review): presumably creates it on first use - confirm in the definition.
+        static SectionTracker& acquire( TrackerContext& ctx, NameAndLocation const& nameAndLocation );
+
+        void tryOpen();
+
+        void addInitialFilters( std::vector<std::string> const& filters );
+        void addNextFilters( std::vector<std::string> const& filters );
+        //! Returns filters active in this tracker
+        std::vector<std::string> const& getFilters() const;
+        //! Returns whitespace-trimmed name of the tracked section
+        std::string const& trimmedName() const;
+    };
+
+} // namespace TestCaseTracking
+
+using TestCaseTracking::ITracker;
+using TestCaseTracking::TrackerContext;
+using TestCaseTracking::SectionTracker;
+
+} // namespace Catch
+
+// end catch_test_case_tracker.h
+
+// start catch_leak_detector.h
+
+namespace Catch {
+
+    // Type with a user-provided constructor and destructor; both are only
+    // declared here and defined elsewhere.
+    // NOTE(review): presumably configures/reports leak checking - confirm in the definition.
+    struct LeakDetector {
+        LeakDetector();
+        ~LeakDetector();
+    };
+
+}
+// end catch_leak_detector.h
+// Cpp files will be included in the single-header file here
+// start catch_stats.cpp
+
+// Statistical analysis tools
+
+#if defined(CATCH_CONFIG_ENABLE_BENCHMARKING)
+
+#include <cassert>
+#include <random>
+
+#if defined(CATCH_CONFIG_USE_ASYNC)
+#include <future>
+#endif
+
+namespace {
+    double erf_inv(double x) {
+        // Code accompanying the article "Approximating the erfinv function" in GPU Computing Gems, Volume 2
+        double w, p;
+
+        w = -log((1.0 - x) * (1.0 + x));
+
+        if (w < 6.250000) {
+            w = w - 3.125000;
+            p = -3.6444120640178196996e-21;
+            p = -1.685059138182016589e-19 + p * w;
+            p = 1.2858480715256400167e-18 + p * w;
+            p = 1.115787767802518096e-17 + p * w;
+            p = -1.333171662854620906e-16 + p * w;
+            p = 2.0972767875968561637e-17 + p * w;
+            p = 6.6376381343583238325e-15 + p * w;
+            p = -4.0545662729752068639e-14 + p * w;
+            p = -8.1519341976054721522e-14 + p * w;
+            p = 2.6335093153082322977e-12 + p * w;
+            p = -1.2975133253453532498e-11 + p * w;
+            p = -5.4154120542946279317e-11 + p * w;
+            p = 1.051212273321532285e-09 + p * w;
+            p = -4.1126339803469836976e-09 + p * w;
+            p = -2.9070369957882005086e-08 + p * w;
+            p = 4.2347877827932403518e-07 + p * w;
+            p = -1.3654692000834678645e-06 + p * w;
+            p = -1.3882523362786468719e-05 + p * w;
+            p = 0.0001867342080340571352 + p * w;
+            p = -0.00074070253416626697512 + p * w;
+            p = -0.0060336708714301490533 + p * w;
+            p = 0.24015818242558961693 + p * w;
+            p = 1.6536545626831027356 + p * w;
+        } else if (w < 16.000000) {
+            w = sqrt(w) - 3.250000;
+            p = 2.2137376921775787049e-09;
+            p = 9.0756561938885390979e-08 + p * w;
+            p = -2.7517406297064545428e-07 + p * w;
+            p = 1.8239629214389227755e-08 + p * w;
+            p = 1.5027403968909827627e-06 + p * w;
+            p = -4.013867526981545969e-06 + p * w;
+            p = 2.9234449089955446044e-06 + p * w;
+            p = 1.2475304481671778723e-05 + p * w;
+            p = -4.7318229009055733981e-05 + p * w;
+            p = 6.8284851459573175448e-05 + p * w;
+            p = 2.4031110387097893999e-05 + p * w;
+            p = -0.0003550375203628474796 + p * w;
+            p = 0.00095328937973738049703 + p * w;
+            p = -0.0016882755560235047313 + p * w;
+            p = 0.0024914420961078508066 + p * w;
+            p = -0.0037512085075692412107 + p * w;
+            p = 0.005370914553590063617 + p * w;
+            p = 1.0052589676941592334 + p * w;
+            p = 3.0838856104922207635 + p * w;
+        } else {
+            w = sqrt(w) - 5.000000;
+            p = -2.7109920616438573243e-11;
+            p = -2.5556418169965252055e-10 + p * w;
+            p = 1.5076572693500548083e-09 + p * w;
+            p = -3.7894654401267369937e-09 + p * w;
+            p = 7.6157012080783393804e-09 + p * w;
+            p = -1.4960026627149240478e-08 + p * w;
+            p = 2.9147953450901080826e-08 + p * w;
+            p = -6.7711997758452339498e-08 + p * w;
+            p = 2.2900482228026654717e-07 + p * w;
+            p = -9.9298272942317002539e-07 + p * w;
+            p = 4.5260625972231537039e-06 + p * w;
+            p = -1.9681778105531670567e-05 + p * w;
+            p = 7.5995277030017761139e-05 + p * w;
+            p = -0.00021503011930044477347 + p * w;
+            p = -0.00013871931833623122026 + p * w;
+            p = 1.0103004648645343977 + p * w;
+            p = 4.8499064014085844221 + p * w;
+        }
+        return p * x;
+    }
+
+    // Population standard deviation of [first, last): the square root of the
+    // mean squared deviation from the arithmetic mean (divides by the element
+    // count, not count - 1).
+    double standard_deviation(std::vector<double>::iterator first, std::vector<double>::iterator last) {
+        const auto center = Catch::Benchmark::Detail::mean(first, last);
+
+        // Accumulate squared deviations from the mean, front to back.
+        double squared_sum = 0.;
+        for (auto it = first; it != last; ++it) {
+            const double deviation = *it - center;
+            squared_sum = squared_sum + deviation * deviation;
+        }
+
+        const double variance = squared_sum / (last - first);
+        return std::sqrt(variance);
+    }
+
+}
+
+namespace Catch {
+    namespace Benchmark {
+        namespace Detail {
+
+            // Returns the k-th q-quantile of [first, last), interpolating
+            // linearly between the two neighbouring order statistics when the
+            // quantile position is not a whole index.
+            // The range is partially reordered in the process (std::nth_element).
+            double weighted_average_quantile(int k, int q, std::vector<double>::iterator first, std::vector<double>::iterator last) {
+                const auto size = last - first;
+                // Fractional zero-based position of the quantile in sorted order.
+                const double position = (size - 1) * k / static_cast<double>(q);
+                const int lower = static_cast<int>(position);
+                const double fraction = position - lower;
+
+                // Put the lower order statistic in place; everything after it
+                // is then not smaller than it.
+                const auto lower_it = first + lower;
+                std::nth_element(first, lower_it, last);
+                const double lower_value = *lower_it;
+
+                if (fraction != 0) {
+                    // The next order statistic is the minimum of the tail.
+                    const double upper_value = *std::min_element(lower_it + 1, last);
+                    return lower_value + fraction * (upper_value - lower_value);
+                }
+                return lower_value;
+            }
+
+            double erfc_inv(double x) {
+                return erf_inv(1.0 - x);
+            }
+
+            // Quantile function of the standard normal distribution
+            // (mean 0, standard deviation 1) at probability p.
+            // p outside [0, 1] asserts in debug builds and returns 0.0 otherwise.
+            double normal_quantile(double p) {
+                static const double ROOT_TWO = std::sqrt(2.0);
+
+                assert(p >= 0 && p <= 1);
+                if (p < 0 || p > 1) {
+                    return 0.0;
+                }
+
+                // General form: mean + sd * sqrt(2) * -erfc_inv(2p); with
+                // mean == 0 and sd == 1 only the sqrt(2) factor remains.
+                const double unscaled = -erfc_inv(2.0 * p);
+                return unscaled * ROOT_TWO;
+            }
+
+            // Computes a ratio describing how much of the sample variance can
+            // be attributed to outliers, from the point estimates of the mean
+            // and standard deviation and the sample count n.
+            // The result is min(var_out(1), var_out(min(c_max(0), c_max(mg_min)))) / sb^2.
+            // NOTE(review): if stddev.point is 0 the final division is by zero - confirm callers.
+            double outlier_variance(Estimate<double> mean, Estimate<double> stddev, int n) {
+                double sb = stddev.point;
+                double mn = mean.point / n;
+                double mg_min = mn / 2.;
+                // sg is capped by both mg_min / 4 and sb / sqrt(n).
+                double sg = std::min(mg_min / 4., sb / std::sqrt(n));
+                double sg2 = sg * sg;
+                double sb2 = sb * sb;
+
+                // Evaluates -2*k0 / (k1 + sqrt(k1^2 - 4*sg2*k0)) for the given
+                // x and truncates the result toward zero via the int cast.
+                auto c_max = [n, mn, sb2, sg2](double x) -> double {
+                    double k = mn - x;
+                    double d = k * k;
+                    double nd = n * d;
+                    double k0 = -n * nd;
+                    double k1 = sb2 - n * sg2 + nd;
+                    double det = k1 * k1 - 4 * sg2 * k0;
+                    return (int)(-2. * k0 / (k1 + std::sqrt(det)));
+                };
+
+                // ((n - c) / n) * (sb2 - (n - c) * sg2)
+                auto var_out = [n, sb2, sg2](double c) {
+                    double nc = n - c;
+                    return (nc / n) * (sb2 - nc * sg2);
+                };
+
+                return std::min(var_out(1), var_out(std::min(c_max(0.), c_max(mg_min)))) / sb2;
+            }
+
+            // Bootstraps estimates of the mean and the standard deviation of
+            // the samples in [first, last) using n_resamples resamples at the
+            // given confidence level, then derives the outlier variance from
+            // the two estimates. With CATCH_CONFIG_USE_ASYNC the two
+            // bootstraps run concurrently through std::async.
+            bootstrap_analysis analyse_samples(double confidence_level, int n_resamples, std::vector<double>::iterator first, std::vector<double>::iterator last) {
+                CATCH_INTERNAL_START_WARNINGS_SUPPRESSION
+                CATCH_INTERNAL_SUPPRESS_GLOBALS_WARNINGS
+                // Shared entropy source; used only to seed the per-estimate engines.
+                static std::random_device entropy;
+                CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION
+
+                auto n = static_cast<int>(last - first); // seriously, one can't use integral types without hell in C++
+
+                // Plain function pointers so both estimators share one signature.
+                auto mean = &Detail::mean<std::vector<double>::iterator>;
+                auto stddev = &standard_deviation;
+
+#if defined(CATCH_CONFIG_USE_ASYNC)
+                auto Estimate = [=](double(*f)(std::vector<double>::iterator, std::vector<double>::iterator)) {
+                    // The seed is drawn on the calling thread; the task gets its own engine.
+                    auto seed = entropy();
+                    return std::async(std::launch::async, [=] {
+                        std::mt19937 rng(seed);
+                        auto resampled = resample(rng, n_resamples, first, last, f);
+                        return bootstrap(confidence_level, first, last, resampled, f);
+                    });
+                };
+
+                auto mean_future = Estimate(mean);
+                auto stddev_future = Estimate(stddev);
+
+                auto mean_estimate = mean_future.get();
+                auto stddev_estimate = stddev_future.get();
+#else
+                auto Estimate = [=](double(*f)(std::vector<double>::iterator, std::vector<double>::iterator)) {
+                    auto seed = entropy();
+                    std::mt19937 rng(seed);
+                    auto resampled = resample(rng, n_resamples, first, last, f);
+                    return bootstrap(confidence_level, first, last, resampled, f);
+                };
+
+                auto mean_estimate = Estimate(mean);
+                auto stddev_estimate = Estimate(stddev);
+#endif // CATCH_CONFIG_USE_ASYNC
+
+                double outlier_variance = Detail::outlier_variance(mean_estimate, stddev_estimate, n);
+
+                return { mean_estimate, stddev_estimate, outlier_variance };
+            }
+        } // namespace Detail
+    } // namespace Benchmark
+} // namespace Catch
+
+#endif // CATCH_CONFIG_ENABLE_BENCHMARKING
+// end catch_stats.cpp
+// start catch_approx.cpp
+
+#include <cmath>
+#include <limits>
+
+namespace {
+
+// Performs equivalent check of std::fabs(lhs - rhs) <= margin
+// But without the subtraction to allow for INFINITY in comparison
+bool marginComparison(double lhs, double rhs, double margin) {
+    // Each value may exceed the other by at most `margin`. Written with
+    // additions only, so infinite operands never produce inf - inf.
+    const bool rhsWithinReach = rhs <= lhs + margin;
+    const bool lhsWithinReach = lhs <= rhs + margin;
+    return rhsWithinReach && lhsWithinReach;
+}
+
+}
+
+namespace Catch {
+namespace Detail {
+
+    // Defaults: relative epsilon of 100 * float epsilon, zero absolute margin
+    // and zero scale.
+    Approx::Approx ( double value )
+    :   m_epsilon( std::numeric_limits<float>::epsilon()*100 ),
+        m_margin( 0.0 ),
+        m_scale( 0.0 ),
+        m_value( value )
+    {}
+
+    // An Approx around zero, meant to be customised by the caller.
+    Approx Approx::custom() {
+        return Approx( 0 );
+    }
+
+    // Copy of this Approx with the target value negated; all tolerances are kept.
+    Approx Approx::operator-() const {
+        Approx negated( *this );
+        negated.m_value = -m_value;
+        return negated;
+    }
+
+    // Renders as "Approx( <value> )".
+    std::string Approx::toString() const {
+        ReusableStringStream rss;
+        rss << "Approx( ";
+        rss << ::Catch::Detail::stringify( m_value );
+        rss << " )";
+        return rss.str();
+    }
+
+    bool Approx::equalityComparisonImpl(const double other) const {
+        // The fixed absolute margin is tried first.
+        if( marginComparison(m_value, other, m_margin) ) {
+            return true;
+        }
+        // Otherwise use a margin derived from epsilon, scale and the magnitude
+        // of the target value (an infinite target contributes 0).
+        // Thanks to Richard Harris for his help refining the scaled margin value
+        const double magnitude = std::fabs(std::isinf(m_value)? 0 : m_value);
+        const double scaledMargin = m_epsilon * (m_scale + magnitude);
+        return marginComparison(m_value, other, scaledMargin);
+    }
+
+    // Sets the absolute margin; negative values are rejected via CATCH_ENFORCE.
+    void Approx::setMargin(double newMargin) {
+        CATCH_ENFORCE(newMargin >= 0,
+            "Invalid Approx::margin: " << newMargin << '.'
+            << " Approx::Margin has to be non-negative.");
+        m_margin = newMargin;
+    }
+
+    // Sets the relative epsilon; values outside [0, 1] are rejected via CATCH_ENFORCE.
+    void Approx::setEpsilon(double newEpsilon) {
+        CATCH_ENFORCE(newEpsilon >= 0 && newEpsilon <= 1.0,
+            "Invalid Approx::epsilon: " << newEpsilon << '.'
+            << " Approx::epsilon has to be in [0, 1]");
+        m_epsilon = newEpsilon;
+    }
+
+} // end namespace Detail
+
+// User-defined literals that build an Approx from a numeric literal,
+// e.g. `1.5_a` (floating-point) or `2_a` (integer).
+namespace literals {
+    Detail::Approx operator "" _a(long double val) {
+        return Detail::Approx(val);
+    }
+    Detail::Approx operator "" _a(unsigned long long val) {
+        return Detail::Approx(val);
+    }
+} // end namespace literals
+
+// Stringification of Approx delegates to Approx::toString().
+std::string StringMaker<Catch::Detail::Approx>::convert(Catch::Detail::Approx const& value) {
+    return value.toString();
+}
+
+} // end namespace Catch
+// end catch_approx.cpp
+// start catch_assertionhandler.cpp
+
+// start catch_debugger.h
+
+namespace Catch {
+    bool isDebuggerActive();
+}
+
+// CATCH_TRAP() expands to a platform-specific way of stopping in a debugger.
+// It stays undefined on platform/architecture combinations not listed below.
+#ifdef CATCH_PLATFORM_MAC
+
+    #if defined(__i386__) || defined(__x86_64__)
+        // x86 breakpoint interrupt
+        #define CATCH_TRAP() __asm__("int $3\n" : : ) /* NOLINT */
+    #elif defined(__aarch64__)
+        // raw AArch64 instruction word (NOTE(review): should encode `brk #0` - confirm)
+        #define CATCH_TRAP()  __asm__(".inst 0xd4200000")
+    #endif
+
+#elif defined(CATCH_PLATFORM_IPHONE)
+
+    // use inline assembler
+    #if defined(__i386__) || defined(__x86_64__)
+        #define CATCH_TRAP()  __asm__("int $3")
+    #elif defined(__aarch64__)
+        #define CATCH_TRAP()  __asm__(".inst 0xd4200000")
+    #elif defined(__arm__) && !defined(__thumb__)
+        // 32-bit ARM mode: raw instruction word
+        #define CATCH_TRAP()  __asm__(".inst 0xe7f001f0")
+    #elif defined(__arm__) &&  defined(__thumb__)
+        // Thumb mode: raw instruction word
+        #define CATCH_TRAP()  __asm__(".inst 0xde01")
+    #endif
+
+#elif defined(CATCH_PLATFORM_LINUX)
+    // If we can use inline assembler, do it because this allows us to break
+    // directly at the location of the failing check instead of breaking inside
+    // raise() called from it, i.e. one stack frame below.
+    #if defined(__GNUC__) && (defined(__i386) || defined(__x86_64))
+        #define CATCH_TRAP() asm volatile ("int $3") /* NOLINT */
+    #else // Fall back to the generic way.
+        #include <signal.h>
+
+        #define CATCH_TRAP() raise(SIGTRAP)
+    #endif
+#elif defined(_MSC_VER)
+    #define CATCH_TRAP() __debugbreak()
+#elif defined(__MINGW32__)
+    // Declared by hand rather than pulling in a header for it.
+    extern "C" __declspec(dllimport) void __stdcall DebugBreak();
+    #define CATCH_TRAP() DebugBreak()
+#endif
+
+// CATCH_BREAK_INTO_DEBUGGER() traps only when a debugger is attached. Users
+// can pre-define it to override; without CATCH_TRAP it is a no-op lambda call.
+#ifndef CATCH_BREAK_INTO_DEBUGGER
+    #ifdef CATCH_TRAP
+        #define CATCH_BREAK_INTO_DEBUGGER() []{ if( Catch::isDebuggerActive() ) { CATCH_TRAP(); } }()
+    #else
+        #define CATCH_BREAK_INTO_DEBUGGER() []{}()
+    #endif
+#endif
+
+// end catch_debugger.h
+// start catch_run_context.h
+
+// start catch_fatal_condition.h
+
+// start catch_windows_h_proxy.h
+
+
+// Includes the Windows headers with NOMINMAX and WIN32_LEAN_AND_MEAN defined,
+// unless the user already defined them or opted out through the
+// CATCH_CONFIG_NO_* switches. Macros defined here are undefined again
+// afterwards so the user's own translation unit is unaffected.
+#if defined(CATCH_PLATFORM_WINDOWS)
+
+#if !defined(NOMINMAX) && !defined(CATCH_CONFIG_NO_NOMINMAX)
+#  define CATCH_DEFINED_NOMINMAX
+#  define NOMINMAX
+#endif
+#if !defined(WIN32_LEAN_AND_MEAN) && !defined(CATCH_CONFIG_NO_WIN32_LEAN_AND_MEAN)
+#  define CATCH_DEFINED_WIN32_LEAN_AND_MEAN
+#  define WIN32_LEAN_AND_MEAN
+#endif
+
+// With __AFXDLL defined, AfxWin.h is included instead of windows.h.
+#ifdef __AFXDLL
+#include <AfxWin.h>
+#else
+#include <windows.h>
+#endif
+
+// Undo only what this header itself defined.
+#ifdef CATCH_DEFINED_NOMINMAX
+#  undef NOMINMAX
+#endif
+#ifdef CATCH_DEFINED_WIN32_LEAN_AND_MEAN
+#  undef WIN32_LEAN_AND_MEAN
+#endif
+
+#endif // defined(CATCH_PLATFORM_WINDOWS)
+
+// end catch_windows_h_proxy.h
+// FatalConditionHandler has three alternative declarations, selected at
+// compile time: Windows SEH, POSIX signals, or a fallback offering only reset().
+#if defined( CATCH_CONFIG_WINDOWS_SEH )
+
+namespace Catch {
+
+    // Windows variant, built around a vectored exception handler.
+    struct FatalConditionHandler {
+
+        static LONG CALLBACK handleVectoredException(PEXCEPTION_POINTERS ExceptionInfo);
+        FatalConditionHandler();
+        static void reset();
+        ~FatalConditionHandler();
+
+    private:
+        static bool isSet;
+        static ULONG guaranteeSize;
+        static PVOID exceptionHandlerHandle;
+    };
+
+} // namespace Catch
+
+#elif defined ( CATCH_CONFIG_POSIX_SIGNALS )
+
+#include <signal.h>
+
+namespace Catch {
+
+    // POSIX variant: declares storage for previous signal actions, the
+    // previous signal stack and memory for an alternate stack.
+    struct FatalConditionHandler {
+
+        static bool isSet;
+        static struct sigaction oldSigActions[];
+        static stack_t oldSigStack;
+        static char altStackMem[];
+
+        static void handleSignal( int sig );
+
+        FatalConditionHandler();
+        ~FatalConditionHandler();
+        static void reset();
+    };
+
+} // namespace Catch
+
+#else
+
+namespace Catch {
+    // Fallback variant: keeps the reset() member so callers compile unchanged.
+    struct FatalConditionHandler {
+        void reset();
+    };
+}
+
+#endif
+
+// end catch_fatal_condition.h
+#include <string>
+
+namespace Catch {
+
+    struct IMutableContext;
+
+    ///////////////////////////////////////////////////////////////////////////
+
+    // Implements both IResultCapture and IRunner. Holds the configuration,
+    // the reporter, the running totals, the tracker context and the
+    // message/section bookkeeping used while test cases execute.
+    // Non-copyable.
+    class RunContext : public IResultCapture, public IRunner {
+
+    public:
+        RunContext( RunContext const& ) = delete;
+        RunContext& operator =( RunContext const& ) = delete;
+
+        explicit RunContext( IConfigPtr const& _config, IStreamingReporterPtr&& reporter );
+
+        ~RunContext() override;
+
+        void testGroupStarting( std::string const& testSpec, std::size_t groupIndex, std::size_t groupsCount );
+        void testGroupEnded( std::string const& testSpec, Totals const& totals, std::size_t groupIndex, std::size_t groupsCount );
+
+        // Runs one test case and returns totals for it.
+        Totals runTest(TestCase const& testCase);
+
+        IConfigPtr config() const;
+        IStreamingReporter& reporter() const;
+
+    public: // IResultCapture
+
+        // Assertion handlers
+        void handleExpr
+                (   AssertionInfo const& info,
+                    ITransientExpression const& expr,
+                    AssertionReaction& reaction ) override;
+        void handleMessage
+                (   AssertionInfo const& info,
+                    ResultWas::OfType resultType,
+                    StringRef const& message,
+                    AssertionReaction& reaction ) override;
+        void handleUnexpectedExceptionNotThrown
+                (   AssertionInfo const& info,
+                    AssertionReaction& reaction ) override;
+        void handleUnexpectedInflightException
+                (   AssertionInfo const& info,
+                    std::string const& message,
+                    AssertionReaction& reaction ) override;
+        void handleIncomplete
+                (   AssertionInfo const& info ) override;
+        void handleNonExpr
+                (   AssertionInfo const &info,
+                    ResultWas::OfType resultType,
+                    AssertionReaction &reaction ) override;
+
+        // Section life cycle
+        bool sectionStarted( SectionInfo const& sectionInfo, Counts& assertions ) override;
+
+        void sectionEnded( SectionEndInfo const& endInfo ) override;
+        void sectionEndedEarly( SectionEndInfo const& endInfo ) override;
+
+        auto acquireGeneratorTracker( StringRef generatorName, SourceLineInfo const& lineInfo ) -> IGeneratorTracker& override;
+
+#if defined(CATCH_CONFIG_ENABLE_BENCHMARKING)
+        // Benchmark life-cycle notifications
+        void benchmarkPreparing( std::string const& name ) override;
+        void benchmarkStarting( BenchmarkInfo const& info ) override;
+        void benchmarkEnded( BenchmarkStats<> const& stats ) override;
+        void benchmarkFailed( std::string const& error ) override;
+#endif // CATCH_CONFIG_ENABLE_BENCHMARKING
+
+        // Message handling
+        void pushScopedMessage( MessageInfo const& message ) override;
+        void popScopedMessage( MessageInfo const& message ) override;
+
+        void emplaceUnscopedMessage( MessageBuilder const& builder ) override;
+
+        std::string getCurrentTestName() const override;
+
+        const AssertionResult* getLastResult() const override;
+
+        void exceptionEarlyReported() override;
+
+        void handleFatalErrorCondition( StringRef message ) override;
+
+        bool lastAssertionPassed() override;
+
+        void assertionPassed() override;
+
+    public:
+        // !TBD We need to do this another way!
+        bool aborting() const final;
+
+    private:
+
+        void runCurrentTest( std::string& redirectedCout, std::string& redirectedCerr );
+        void invokeActiveTestCase();
+
+        void resetAssertionInfo();
+        bool testForMissingAssertions( Counts& assertions );
+
+        void assertionEnded( AssertionResult const& result );
+        void reportExpr
+                (   AssertionInfo const &info,
+                    ResultWas::OfType resultType,
+                    ITransientExpression const *expr,
+                    bool negated );
+
+        void populateReaction( AssertionReaction& reaction );
+
+    private:
+
+        void handleUnfinishedSections();
+
+        TestRunInfo m_runInfo;
+        IMutableContext& m_context;
+        TestCase const* m_activeTestCase = nullptr; // non-owning
+        ITracker* m_testCaseTracker = nullptr;      // non-owning
+        Option<AssertionResult> m_lastResult;
+
+        IConfigPtr m_config;
+        Totals m_totals;
+        IStreamingReporterPtr m_reporter;
+        std::vector<MessageInfo> m_messages;
+        std::vector<ScopedMessage> m_messageScopes; /* Keeps owners of so-called unscoped messages. */
+        AssertionInfo m_lastAssertionInfo;
+        std::vector<SectionEndInfo> m_unfinishedSections;
+        std::vector<ITracker*> m_activeSections;
+        TrackerContext m_trackerContext;
+        bool m_lastAssertionPassed = false;
+        bool m_shouldReportUnexpected = true;
+        bool m_includeSuccessfulResults; // no default initializer here
+    };
+
+    void seedRng(IConfig const& config);
+    unsigned int rngSeed();
+} // end namespace Catch
+
+// end catch_run_context.h
+namespace Catch {
+
+    namespace {
+        // File-local helper: streams an expression by asking it to write its
+        // reconstructed form into `os`.
+        auto operator <<( std::ostream& os, ITransientExpression const& expr ) -> std::ostream& {
+            expr.streamReconstructedExpression( os );
+            return os;
+        }
+    }
+
+    // Records only whether the expression is negated.
+    LazyExpression::LazyExpression( bool isNegated )
+    :   m_isNegated( isNegated )
+    {}
+
+    // Copies only the negation flag.
+    // NOTE(review): m_transientExpression is not copied here - confirm this is intended.
+    LazyExpression::LazyExpression( LazyExpression const& other ) : m_isNegated( other.m_isNegated ) {}
+
+    // True when a transient expression is attached.
+    LazyExpression::operator bool() const {
+        return m_transientExpression != nullptr;
+    }
+
+    // Streams a lazy expression: an optional leading "!", then either the
+    // reconstructed expression (parenthesised when it is a negated binary
+    // expression) or an error marker when no expression is attached.
+    auto operator << ( std::ostream& os, LazyExpression const& lazyExpr ) -> std::ostream& {
+        bool const negated = lazyExpr.m_isNegated;
+        if( negated )
+            os << "!";
+
+        if( !lazyExpr ) {
+            os << "{** error - unchecked empty expression requested **}";
+            return os;
+        }
+
+        auto const& expr = *lazyExpr.m_transientExpression;
+        bool const needsParens = negated && expr.isBinaryExpression();
+        if( needsParens )
+            os << "(";
+        os << expr;
+        if( needsParens )
+            os << ")";
+        return os;
+    }
+
+    AssertionHandler::AssertionHandler
+        (   StringRef const& macroName,
+            SourceLineInfo const& lineInfo,
+            StringRef capturedExpression,
+            ResultDisposition::Flags resultDisposition )
+    :   m_assertionInfo{ macroName, lineInfo, capturedExpression, resultDisposition },
+        m_resultCapture( getResultCapture() )
+    {}
+
+    void AssertionHandler::handleExpr( ITransientExpression const& expr ) {
+        m_resultCapture.handleExpr( m_assertionInfo, expr, m_reaction );
+    }
+    void AssertionHandler::handleMessage(ResultWas::OfType resultType, StringRef const& message) {
+        m_resultCapture.handleMessage( m_assertionInfo, resultType, message, m_reaction );
+    }
+
+    auto AssertionHandler::allowThrows() const -> bool {
+        return getCurrentContext().getConfig()->allowThrows();
+    }
+
+    void AssertionHandler::complete() {
+        setCompleted();
+        if( m_reaction.shouldDebugBreak ) {
+
+            // If you find your debugger stopping you here then go one level up on the
+            // call-stack for the code that caused it (typically a failed assertion)
+
+            // (To go back to the test and change execution, jump over the throw, next)
+            CATCH_BREAK_INTO_DEBUGGER();
+        }
+        if (m_reaction.shouldThrow) {
+#if !defined(CATCH_CONFIG_DISABLE_EXCEPTIONS)
+            throw Catch::TestFailureException();
+#else
+            CATCH_ERROR( "Test failure requires aborting test!" );
+#endif
+        }
+    }
+    void AssertionHandler::setCompleted() {
+        m_completed = true;
+    }
+
+    void AssertionHandler::handleUnexpectedInflightException() {
+        m_resultCapture.handleUnexpectedInflightException( m_assertionInfo, Catch::translateActiveException(), m_reaction );
+    }
+
+    void AssertionHandler::handleExceptionThrownAsExpected() {
+        m_resultCapture.handleNonExpr(m_assertionInfo, ResultWas::Ok, m_reaction);
+    }
+    void AssertionHandler::handleExceptionNotThrownAsExpected() {
+        m_resultCapture.handleNonExpr(m_assertionInfo, ResultWas::Ok, m_reaction);
+    }
+
+    void AssertionHandler::handleUnexpectedExceptionNotThrown() {
+        m_resultCapture.handleUnexpectedExceptionNotThrown( m_assertionInfo, m_reaction );
+    }
+
+    void AssertionHandler::handleThrowingCallSkipped() {
+        m_resultCapture.handleNonExpr(m_assertionInfo, ResultWas::Ok, m_reaction);
+    }
+
+    // This is the overload that takes a string and infers the Equals matcher from it
+    // The more general overload, that takes any string matcher, is in catch_capture_matchers.cpp
+    void handleExceptionMatchExpr( AssertionHandler& handler, std::string const& str, StringRef const& matcherString  ) {
+        handleExceptionMatchExpr( handler, Matchers::Equals( str ), matcherString );
+    }
+
+} // namespace Catch
+// end catch_assertionhandler.cpp
+// start catch_assertionresult.cpp
+
+namespace Catch {
+    // Stores the result type together with the lazily rendered expression.
+    AssertionResultData::AssertionResultData( ResultWas::OfType _resultType,
+                                              LazyExpression const& _lazyExpression )
+    :   lazyExpression( _lazyExpression ),
+        resultType( _resultType )
+    {}
+
+    // Returns the rendered expression text. While the cached text is empty
+    // and a lazy expression is available, it is rendered and cached first.
+    std::string AssertionResultData::reconstructExpression() const {
+        if( reconstructedExpression.empty() && lazyExpression ) {
+            ReusableStringStream stream;
+            stream << lazyExpression;
+            reconstructedExpression = stream.str();
+        }
+        return reconstructedExpression;
+    }
+
+    AssertionResult::AssertionResult( AssertionInfo const& info, AssertionResultData const& data )
+    :   m_info( info ), m_resultData( data )
+    {}
+
+    // Result was a success
+    bool AssertionResult::succeeded() const {
+        return Catch::isOk( m_resultData.resultType );
+    }
+
+    // Result was a success, or failure is suppressed
+    bool AssertionResult::isOk() const {
+        if( Catch::isOk( m_resultData.resultType ) )
+            return true;
+        return shouldSuppressFailure( m_info.resultDisposition );
+    }
+
+    ResultWas::OfType AssertionResult::getResultType() const {
+        return m_resultData.resultType;
+    }
+
+    // True when the captured expression text is non-empty.
+    bool AssertionResult::hasExpression() const {
+        return !m_info.capturedExpression.empty();
+    }
+
+    // True when the result carries a non-empty message.
+    bool AssertionResult::hasMessage() const {
+        return !m_resultData.message.empty();
+    }
+
+    // Returns the captured expression, wrapped as "!(...)" for false-tests.
+    std::string AssertionResult::getExpression() const {
+        std::string text;
+        // Three spare characters cover the optional "!(" prefix and ')' suffix.
+        text.reserve( m_info.capturedExpression.size() + 3 );
+
+        if( isFalseTest( m_info.resultDisposition ) )
+            text += "!(";
+
+        text += m_info.capturedExpression;
+
+        if( isFalseTest( m_info.resultDisposition ) )
+            text += ')';
+
+        return text;
+    }
+
+    // Returns "MACRO( expression )", or just the expression when no macro name is recorded.
+    std::string AssertionResult::getExpressionInMacro() const {
+        if( m_info.macroName.empty() )
+            return static_cast<std::string>( m_info.capturedExpression );
+
+        std::string text;
+        text.reserve( m_info.macroName.size() + m_info.capturedExpression.size() + 4 );
+        text += m_info.macroName;
+        text += "( ";
+        text += m_info.capturedExpression;
+        text += " )";
+        return text;
+    }
+
+    // True when there is an expression whose expanded form differs from the captured form.
+    bool AssertionResult::hasExpandedExpression() const {
+        if( !hasExpression() )
+            return false;
+        return getExpandedExpression() != getExpression();
+    }
+
+    // Prefers the reconstructed expression; falls back to getExpression() when it is empty.
+    std::string AssertionResult::getExpandedExpression() const {
+        std::string expanded = m_resultData.reconstructExpression();
+        if( expanded.empty() )
+            return getExpression();
+        return expanded;
+    }
+
+    std::string AssertionResult::getMessage() const {
+        return m_resultData.message;
+    }
+
+    SourceLineInfo AssertionResult::getSourceInfo() const {
+        return m_info.lineInfo;
+    }
+
+    StringRef AssertionResult::getTestMacroName() const {
+        return m_info.macroName;
+    }
+
+} // end namespace Catch
+// end catch_assertionresult.cpp
+// start catch_capture_matchers.cpp
+
+namespace Catch {
+
+    using StringMatcher = Matchers::Impl::MatcherBase<std::string>;
+
+    // General overload that accepts any string matcher.
+    // A second overload, in catch_assertionhandler.h/.cpp, takes only a string
+    // and infers the Equals matcher (so that header does not mention matchers).
+    //
+    // Translates the active exception to text, pairs it with `matcher`, and
+    // passes the resulting match expression to the handler.
+    void handleExceptionMatchExpr( AssertionHandler& handler,
+                                   StringMatcher const& matcher,
+                                   StringRef const& matcherString ) {
+        std::string thrownText = Catch::translateActiveException();
+        MatchExpr<std::string, StringMatcher const&> matchExpr( thrownText, matcher, matcherString );
+        handler.handleExpr( matchExpr );
+    }
+
+} // namespace Catch
+// end catch_capture_matchers.cpp
+// start catch_commandline.cpp
+
+// start catch_commandline.h
+
+// start catch_clara.h
+
+// Use Catch's value for console width (store Clara's off to the side, if present)
+#ifdef CLARA_CONFIG_CONSOLE_WIDTH
+#define CATCH_TEMP_CLARA_CONFIG_CONSOLE_WIDTH CATCH_CLARA_TEXTFLOW_CONFIG_CONSOLE_WIDTH
+#undef CATCH_CLARA_TEXTFLOW_CONFIG_CONSOLE_WIDTH
+#endif
+#define CATCH_CLARA_TEXTFLOW_CONFIG_CONSOLE_WIDTH CATCH_CONFIG_CONSOLE_WIDTH-1
+
+#ifdef __clang__
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wweak-vtables"
+#pragma clang diagnostic ignored "-Wexit-time-destructors"
+#pragma clang diagnostic ignored "-Wshadow"
+#endif
+
+// start clara.hpp
+// Copyright 2017 Two Blue Cubes Ltd. All rights reserved.
+//
+// Distributed under the Boost Software License, Version 1.0. (See accompanying
+// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+//
+// See https://github.com/philsquared/Clara for more details
+
+// Clara v1.1.5
+
+
+#ifndef CATCH_CLARA_CONFIG_CONSOLE_WIDTH
+#define CATCH_CLARA_CONFIG_CONSOLE_WIDTH 80
+#endif
+
+#ifndef CATCH_CLARA_TEXTFLOW_CONFIG_CONSOLE_WIDTH
+#define CATCH_CLARA_TEXTFLOW_CONFIG_CONSOLE_WIDTH CATCH_CLARA_CONFIG_CONSOLE_WIDTH
+#endif
+
+#ifndef CLARA_CONFIG_OPTIONAL_TYPE
+#ifdef __has_include
+#if __has_include(<optional>) && __cplusplus >= 201703L
+#include <optional>
+#define CLARA_CONFIG_OPTIONAL_TYPE std::optional
+#endif
+#endif
+#endif
+
+// ----------- #included from clara_textflow.hpp -----------
+
+// TextFlowCpp
+//
+// A single-header library for wrapping and laying out basic text, by Phil Nash
+//
+// Distributed under the Boost Software License, Version 1.0. (See accompanying
+// file LICENSE.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+//
+// This project is hosted at https://github.com/philsquared/textflowcpp
+
+
+#include <cassert>
+#include <ostream>
+#include <sstream>
+#include <vector>
+
+#ifndef CATCH_CLARA_TEXTFLOW_CONFIG_CONSOLE_WIDTH
+#define CATCH_CLARA_TEXTFLOW_CONFIG_CONSOLE_WIDTH 80
+#endif
+
+namespace Catch {
+namespace clara {
+namespace TextFlow {
+
+// True for space, tab, line feed and carriage return.
+inline auto isWhitespace(char c) -> bool {
+	switch (c) {
+	case ' ':
+	case '\t':
+	case '\n':
+	case '\r':
+		return true;
+	default:
+		return false;
+	}
+}
+// True for the opening characters '[', '(', '{', '<' and for '|'.
+inline auto isBreakableBefore(char c) -> bool {
+	switch (c) {
+	case '[':
+	case '(':
+	case '{':
+	case '<':
+	case '|':
+		return true;
+	default:
+		return false;
+	}
+}
+// True for closing brackets and the punctuation . , : ; * + - = & / and backslash.
+inline auto isBreakableAfter(char c) -> bool {
+	switch (c) {
+	case ']':
+	case ')':
+	case '}':
+	case '>':
+	case '.':
+	case ',':
+	case ':':
+	case ';':
+	case '*':
+	case '+':
+	case '-':
+	case '=':
+	case '&':
+	case '/':
+	case '\\':
+		return true;
+	default:
+		return false;
+	}
+}
+
+class Columns;
+
+// A single column of text. Its strings are wrapped to m_width characters and
+// the wrapped lines are exposed through a forward iterator.
+class Column {
+	// Text pieces laid out by this column; the constructor stores exactly one.
+	std::vector<std::string> m_strings;
+	// Total column width in characters; indentation is subtracted from it in calcLength().
+	size_t m_width = CATCH_CLARA_TEXTFLOW_CONFIG_CONSOLE_WIDTH;
+	// Indentation applied to lines, unless the initial indent applies (see iterator::indent()).
+	size_t m_indent = 0;
+	// Indentation for the first line of the first string; npos means "use m_indent".
+	size_t m_initialIndent = std::string::npos;
+
+public:
+	// Forward iterator over the wrapped lines of a Column. Dereferencing
+	// yields one output line including indentation and any hyphen suffix.
+	class iterator {
+		friend Column;
+
+		Column const& m_column;
+		// Index into m_column.m_strings of the string currently being wrapped.
+		size_t m_stringIndex = 0;
+		// Offset within the current string at which the current line starts.
+		size_t m_pos = 0;
+
+		// Number of characters of the current string that make up the current line.
+		size_t m_len = 0;
+		// Offset of the next '\n' at or after m_pos (or the string's end); set by calcLength().
+		size_t m_end = 0;
+		// Set when the line had to be split mid-word; a "-" is then appended.
+		bool m_suffix = false;
+
+		// Positions the iterator at the start of string `stringIndex` without
+		// computing a line length; Column::end() uses this with m_strings.size().
+		iterator(Column const& column, size_t stringIndex)
+			: m_column(column),
+			m_stringIndex(stringIndex) {}
+
+		// The string currently being wrapped.
+		auto line() const -> std::string const& { return m_column.m_strings[m_stringIndex]; }
+
+		// True if the line may be broken immediately before offset `at`:
+		// at the end of the string, where whitespace begins, before an opening
+		// character, or after a closing/punctuation character.
+		auto isBoundary(size_t at) const -> bool {
+			assert(at > 0);
+			assert(at <= line().size());
+
+			return at == line().size() ||
+				(isWhitespace(line()[at]) && !isWhitespace(line()[at - 1])) ||
+				isBreakableBefore(line()[at]) ||
+				isBreakableAfter(line()[at - 1]);
+		}
+
+		// Computes m_end, m_len and m_suffix for the line starting at m_pos.
+		void calcLength() {
+			assert(m_stringIndex < m_column.m_strings.size());
+
+			m_suffix = false;
+			// Characters available for text once the indentation is taken off.
+			auto width = m_column.m_width - indent();
+			m_end = m_pos;
+			// A newline exactly at m_pos is stepped over before scanning.
+			if (line()[m_pos] == '\n') {
+				++m_end;
+			}
+			// Scan forward to the next newline or the end of the string.
+			while (m_end < line().size() && line()[m_end] != '\n')
+				++m_end;
+
+			if (m_end < m_pos + width) {
+				// Everything up to the newline/end fits within the width.
+				m_len = m_end - m_pos;
+			} else {
+				// Too long: walk back from the width limit to the nearest
+				// boundary, then drop whitespace preceding the break point.
+				size_t len = width;
+				while (len > 0 && !isBoundary(m_pos + len))
+					--len;
+				while (len > 0 && isWhitespace(line()[m_pos + len - 1]))
+					--len;
+
+				if (len > 0) {
+					m_len = len;
+				} else {
+					// No usable break point: cut at width - 1 and flag the
+					// line so that a "-" is appended when it is rendered.
+					m_suffix = true;
+					m_len = width - 1;
+				}
+			}
+		}
+
+		// Indentation for the current line: the initial indent (when set) for
+		// the first line of the first string, otherwise the regular indent.
+		auto indent() const -> size_t {
+			auto initial = m_pos == 0 && m_stringIndex == 0 ? m_column.m_initialIndent : std::string::npos;
+			return initial == std::string::npos ? m_column.m_indent : initial;
+		}
+
+		// Prepends the indentation and, if the line was split mid-word, appends "-".
+		auto addIndentAndSuffix(std::string const &plain) const -> std::string {
+			return std::string(indent(), ' ') + (m_suffix ? plain + "-" : plain);
+		}
+
+	public:
+		using difference_type = std::ptrdiff_t;
+		using value_type = std::string;
+		using pointer = value_type * ;
+		using reference = value_type & ;
+		using iterator_category = std::forward_iterator_tag;
+
+		// Begin iterator: starts at the first string and computes its first line.
+		explicit iterator(Column const& column) : m_column(column) {
+			assert(m_column.m_width > m_column.m_indent);
+			assert(m_column.m_initialIndent == std::string::npos || m_column.m_width > m_column.m_initialIndent);
+			calcLength();
+			if (m_len == 0)
+				m_stringIndex++; // Empty string
+		}
+
+		// Returns the current line with indentation and optional hyphen.
+		auto operator *() const -> std::string {
+			assert(m_stringIndex < m_column.m_strings.size());
+			assert(m_pos <= m_end);
+			return addIndentAndSuffix(line().substr(m_pos, m_len));
+		}
+
+		// Advances to the next line: skips the consumed text, then either a
+		// single newline or a run of whitespace, and moves on to the next
+		// string once the current one is exhausted.
+		auto operator ++() -> iterator& {
+			m_pos += m_len;
+			if (m_pos < line().size() && line()[m_pos] == '\n')
+				m_pos += 1;
+			else
+				while (m_pos < line().size() && isWhitespace(line()[m_pos]))
+					++m_pos;
+
+			if (m_pos == line().size()) {
+				m_pos = 0;
+				++m_stringIndex;
+			}
+			if (m_stringIndex < m_column.m_strings.size())
+				calcLength();
+			return *this;
+		}
+		auto operator ++(int) -> iterator {
+			iterator prev(*this);
+			operator++();
+			return prev;
+		}
+
+		// Iterators are equal when they refer to the same position in the same Column object.
+		auto operator ==(iterator const& other) const -> bool {
+			return
+				m_pos == other.m_pos &&
+				m_stringIndex == other.m_stringIndex &&
+				&m_column == &other.m_column;
+		}
+		auto operator !=(iterator const& other) const -> bool {
+			return !operator==(other);
+		}
+	};
+	using const_iterator = iterator;
+
+	// Creates a column holding `text` as its single string.
+	explicit Column(std::string const& text) { m_strings.push_back(text); }
+
+	// Fluent setters; each returns *this so that calls can be chained.
+	auto width(size_t newWidth) -> Column& {
+		assert(newWidth > 0);
+		m_width = newWidth;
+		return *this;
+	}
+	auto indent(size_t newIndent) -> Column& {
+		m_indent = newIndent;
+		return *this;
+	}
+	auto initialIndent(size_t newIndent) -> Column& {
+		m_initialIndent = newIndent;
+		return *this;
+	}
+
+	auto width() const -> size_t { return m_width; }
+	auto begin() const -> iterator { return iterator(*this); }
+	// The end iterator sits at string index m_strings.size() with position 0.
+	auto end() const -> iterator { return { *this, m_strings.size() }; }
+
+	// Writes the wrapped lines separated by "\n" (no trailing newline).
+	inline friend std::ostream& operator << (std::ostream& os, Column const& col) {
+		bool first = true;
+		for (auto line : col) {
+			if (first)
+				first = false;
+			else
+				os << "\n";
+			os << line;
+		}
+		return os;
+	}
+
+	// Combines this column with `other` into a Columns object (defined after Columns).
+	auto operator + (Column const& other)->Columns;
+
+	// Renders the column to a string using operator<<.
+	auto toString() const -> std::string {
+		std::ostringstream oss;
+		oss << *this;
+		return oss.str();
+	}
+};
+
+// A Column built from an empty string whose width is set to `spaceWidth`.
+class Spacer : public Column {
+public:
+	explicit Spacer(size_t spaceWidth)
+		: Column(std::string()) {
+		width(spaceWidth);
+	}
+};
+
+// A left-to-right sequence of Column objects rendered side by side, one
+// combined row per iteration step.
+class Columns {
+	std::vector<Column> m_columns;
+
+public:
+
+	// Forward iterator producing one combined row at a time; it holds one
+	// Column::iterator per column.
+	class iterator {
+		friend Columns;
+		// Tag type selecting the end-iterator constructor.
+		struct EndTag {};
+
+		std::vector<Column> const& m_columns;
+		std::vector<Column::iterator> m_iterators;
+		// Initialized by both constructors; not read anywhere in this class.
+		size_t m_activeIterators;
+
+		// End iterator: every per-column iterator is that column's end().
+		iterator(Columns const& columns, EndTag)
+			: m_columns(columns.m_columns),
+			m_activeIterators(0) {
+			m_iterators.reserve(m_columns.size());
+
+			for (auto const& col : m_columns)
+				m_iterators.push_back(col.end());
+		}
+
+	public:
+		using difference_type = std::ptrdiff_t;
+		using value_type = std::string;
+		using pointer = value_type * ;
+		using reference = value_type & ;
+		using iterator_category = std::forward_iterator_tag;
+
+		// Begin iterator: every per-column iterator is that column's begin().
+		explicit iterator(Columns const& columns)
+			: m_columns(columns.m_columns),
+			m_activeIterators(m_columns.size()) {
+			m_iterators.reserve(m_columns.size());
+
+			for (auto const& col : m_columns)
+				m_iterators.push_back(col.begin());
+		}
+
+		// Equality is decided solely by the per-column iterators.
+		auto operator ==(iterator const& other) const -> bool {
+			return m_iterators == other.m_iterators;
+		}
+		auto operator !=(iterator const& other) const -> bool {
+			return m_iterators != other.m_iterators;
+		}
+		// Builds the current row. `padding` carries the spaces owed by the
+		// columns to the left and is only emitted when a later column has
+		// text, so rows do not end in padding.
+		auto operator *() const -> std::string {
+			std::string row, padding;
+
+			for (size_t i = 0; i < m_columns.size(); ++i) {
+				auto width = m_columns[i].width();
+				if (m_iterators[i] != m_columns[i].end()) {
+					std::string col = *m_iterators[i];
+					row += padding + col;
+					// Remember how much this column fell short of its width.
+					if (col.size() < width)
+						padding = std::string(width - col.size(), ' ');
+					else
+						padding = "";
+				} else {
+					// Exhausted column: it contributes only blank space.
+					padding += std::string(width, ' ');
+				}
+			}
+			return row;
+		}
+		// Advances every column iterator that has not yet reached its end.
+		auto operator ++() -> iterator& {
+			for (size_t i = 0; i < m_columns.size(); ++i) {
+				if (m_iterators[i] != m_columns[i].end())
+					++m_iterators[i];
+			}
+			return *this;
+		}
+		auto operator ++(int) -> iterator {
+			iterator prev(*this);
+			operator++();
+			return prev;
+		}
+	};
+	using const_iterator = iterator;
+
+	auto begin() const -> iterator { return iterator(*this); }
+	auto end() const -> iterator { return { *this, iterator::EndTag() }; }
+
+	// Appends a copy of `col` as the right-most column.
+	auto operator += (Column const& col) -> Columns& {
+		m_columns.push_back(col);
+		return *this;
+	}
+	// Returns a copy of this object with `col` appended.
+	auto operator + (Column const& col) -> Columns {
+		Columns combined = *this;
+		combined += col;
+		return combined;
+	}
+
+	// Writes the rows separated by "\n" (no trailing newline).
+	inline friend std::ostream& operator << (std::ostream& os, Columns const& cols) {
+
+		bool first = true;
+		for (auto line : cols) {
+			if (first)
+				first = false;
+			else
+				os << "\n";
+			os << line;
+		}
+		return os;
+	}
+
+	// Renders all rows to a string using operator<<.
+	auto toString() const -> std::string {
+		std::ostringstream oss;
+		oss << *this;
+		return oss.str();
+	}
+};
+
+// Places this column and `other` side by side, in that order.
+inline auto Column::operator + (Column const& other) -> Columns {
+	Columns pair;
+	(pair += *this) += other;
+	return pair;
+}
+}
+
+}
+}
+
+// ----------- end of #include from clara_textflow.hpp -----------
+// ........... back in clara.hpp
+
+#include <cctype>
+#include <string>
+#include <memory>
+#include <set>
+#include <algorithm>
+
+#if !defined(CATCH_PLATFORM_WINDOWS) && ( defined(WIN32) || defined(__WIN32__) || defined(_WIN32) || defined(_MSC_VER) )
+#define CATCH_PLATFORM_WINDOWS
+#endif
+
+namespace Catch { namespace clara {
+namespace detail {
+
+    // Traits for extracting arg and return type of lambdas (for single argument lambdas)
+    // Primary template: defers to the specialization matching the type of the
+    // callable's operator().
+    template<typename L>
+    struct UnaryLambdaTraits : UnaryLambdaTraits<decltype( &L::operator() )> {};
+
+    // Const call operator with an arbitrary parameter list: flagged as invalid.
+    template<typename ClassT, typename ReturnT, typename... Args>
+    struct UnaryLambdaTraits<ReturnT( ClassT::* )( Args... ) const> {
+        static const bool isValid = false;
+    };
+
+    // Const call operator with exactly one parameter: valid. ArgType is the
+    // parameter type with its reference and then top-level const removed.
+    template<typename ClassT, typename ReturnT, typename ArgT>
+    struct UnaryLambdaTraits<ReturnT( ClassT::* )( ArgT ) const> {
+        static const bool isValid = true;
+        using ArgType = typename std::remove_const<typename std::remove_reference<ArgT>::type>::type;
+        using ReturnType = ReturnT;
+    };
+
+    class TokenStream;
+
+    // Transport for raw args (copied from main args, or supplied via init list for testing).
+    // The first element is kept as the executable name; the rest are the arguments.
+    class Args {
+        friend TokenStream;
+        std::string m_exeName;
+        std::vector<std::string> m_args;
+
+    public:
+        // Builds from main()-style arguments.
+        // An empty argument vector (argc <= 0) or a null argv[0] gives an empty
+        // executable name rather than dereferencing a null pointer, and
+        // argc <= 1 gives an empty argument list without forming the range
+        // [argv + 1, argv + argc) when argc is 0.
+        Args( int argc, char const* const* argv )
+            : m_exeName( ( argc > 0 && argv[0] ) ? argv[0] : "" ),
+              m_args( argc > 1 ? argv + 1 : argv, argc > 1 ? argv + argc : argv ) {}
+
+        // Builds from a list whose first element is the executable name.
+        // An empty list gives an empty executable name and no arguments.
+        Args( std::initializer_list<std::string> args )
+        :   m_exeName( args.size() > 0 ? *args.begin() : std::string() ),
+            m_args( args.size() > 0 ? args.begin() + 1 : args.end(), args.end() )
+        {}
+
+        // Returns a copy of the executable name.
+        auto exeName() const -> std::string {
+            return m_exeName;
+        }
+    };
+
+    // A token produced by a token stream. Tokens need not map one-to-one onto
+    // the input strings: a single string may encode an option together with
+    // its argument when the ':' or '=' form is used.
+    enum class TokenType {
+        Option,
+        Argument
+    };
+
+    struct Token {
+        TokenType type;     // which kind of token this is
+        std::string token;  // the token text
+    };
+
+    // True if `c` introduces an option: '-' everywhere, and also '/' when
+    // CATCH_PLATFORM_WINDOWS is defined.
+    inline auto isOptPrefix( char c ) -> bool {
+#ifdef CATCH_PLATFORM_WINDOWS
+        if( c == '/' )
+            return true;
+#endif
+        return c == '-';
+    }
+
+    // Abstracts iterators into args as a stream of tokens, with option arguments uniformly handled
+    class TokenStream {
+        using Iterator = std::vector<std::string>::const_iterator;
+        // Current input string and one-past-the-last input string.
+        Iterator it;
+        Iterator itEnd;
+        // Tokens derived from the string at `it`; the front element is the current token.
+        std::vector<Token> m_tokenBuffer;
+
+        // Refills m_tokenBuffer from the string at `it`, after skipping empty strings.
+        void loadBuffer() {
+            m_tokenBuffer.resize( 0 );
+
+            // Skip any empty strings
+            while( it != itEnd && it->empty() )
+                ++it;
+
+            if( it != itEnd ) {
+                auto const &next = *it;
+                if( isOptPrefix( next[0] ) ) {
+                    // "opt<delim>arg" with a space, ':' or '=' delimiter is split
+                    // into an Option token followed by an Argument token.
+                    auto delimiterPos = next.find_first_of( " :=" );
+                    if( delimiterPos != std::string::npos ) {
+                        m_tokenBuffer.push_back( { TokenType::Option, next.substr( 0, delimiterPos ) } );
+                        m_tokenBuffer.push_back( { TokenType::Argument, next.substr( delimiterPos + 1 ) } );
+                    } else {
+                        // A string longer than two characters whose second character
+                        // is not '-' is treated as a cluster of single-character
+                        // options: each character becomes its own "-x" Option token.
+                        if( next[1] != '-' && next.size() > 2 ) {
+                            std::string opt = "- ";
+                            for( size_t i = 1; i < next.size(); ++i ) {
+                                opt[1] = next[i];
+                                m_tokenBuffer.push_back( { TokenType::Option, opt } );
+                            }
+                        } else {
+                            m_tokenBuffer.push_back( { TokenType::Option, next } );
+                        }
+                    }
+                } else {
+                    // No option prefix: the whole string is an Argument.
+                    m_tokenBuffer.push_back( { TokenType::Argument, next } );
+                }
+            }
+        }
+
+    public:
+        // Streams over the arguments held by `args` (the executable name is not included).
+        explicit TokenStream( Args const &args ) : TokenStream( args.m_args.begin(), args.m_args.end() ) {}
+
+        // Streams over [it, itEnd) and loads the first token(s) immediately.
+        TokenStream( Iterator it, Iterator itEnd ) : it( it ), itEnd( itEnd ) {
+            loadBuffer();
+        }
+
+        // True while buffered tokens or unread input strings remain.
+        explicit operator bool() const {
+            return !m_tokenBuffer.empty() || it != itEnd;
+        }
+
+        // Sum of the buffered token count and the number of strings from `it` to the end.
+        auto count() const -> size_t { return m_tokenBuffer.size() + (itEnd - it); }
+
+        // Returns a copy of the current token; the buffer must not be empty.
+        auto operator*() const -> Token {
+            assert( !m_tokenBuffer.empty() );
+            return m_tokenBuffer.front();
+        }
+
+        // Pointer access to the current token; the buffer must not be empty.
+        auto operator->() const -> Token const * {
+            assert( !m_tokenBuffer.empty() );
+            return &m_tokenBuffer.front();
+        }
+
+        // Advances to the next token: drops the front token while at least two
+        // are buffered, otherwise moves to the next input string and reloads.
+        auto operator++() -> TokenStream & {
+            if( m_tokenBuffer.size() >= 2 ) {
+                m_tokenBuffer.erase( m_tokenBuffer.begin() );
+            } else {
+                if( it != itEnd )
+                    ++it;
+                loadBuffer();
+            }
+            return *this;
+        }
+    };
+
+    // Common base of all result types: carries only the outcome category.
+    class ResultBase {
+    public:
+        enum Type {
+            Ok,
+            LogicError,
+            RuntimeError
+        };
+
+    protected:
+        Type m_type;
+
+        ResultBase( Type type ) : m_type( type ) {}
+        virtual ~ResultBase() = default;
+
+        // Implemented by derived classes; called by value accessors before the value is read.
+        virtual void enforceOk() const = 0;
+    };
+
+    // Adds optional storage for a value of type T to ResultBase. The value
+    // lives in an anonymous union and exists only while m_type == Ok, so its
+    // lifetime is managed by hand with placement new and explicit destructor calls.
+    template<typename T>
+    class ResultValueBase : public ResultBase {
+    public:
+        // Returns the stored value after enforceOk() has checked the state.
+        auto value() const -> T const & {
+            enforceOk();
+            return m_value;
+        }
+
+    protected:
+        // Creates a result of the given type without constructing a value.
+        ResultValueBase( Type type ) : ResultBase( type ) {}
+
+        // Copies the type, and the value only when `other` holds one.
+        ResultValueBase( ResultValueBase const &other ) : ResultBase( other ) {
+            if( m_type == ResultBase::Ok )
+                new( &m_value ) T( other.m_value );
+        }
+
+        // Creates an Ok result holding a copy of `value`; the Type argument is ignored.
+        ResultValueBase( Type, T const &value ) : ResultBase( Ok ) {
+            new( &m_value ) T( value );
+        }
+
+        // Replaces this result with a copy of `other`.
+        auto operator=( ResultValueBase const &other ) -> ResultValueBase & {
+            // Self-assignment must be a no-op: otherwise m_value would be
+            // destroyed and then copy-constructed from its own dead storage.
+            if( this == &other )
+                return *this;
+            if( m_type == ResultBase::Ok )
+                m_value.~T();
+            ResultBase::operator=(other);
+            if( m_type == ResultBase::Ok )
+                new( &m_value ) T( other.m_value );
+            return *this;
+        }
+
+        // Destroys the value only if one was constructed.
+        ~ResultValueBase() override {
+            if( m_type == Ok )
+                m_value.~T();
+        }
+
+        // Raw storage for the value; never constructed or destroyed implicitly.
+        union {
+            T m_value;
+        };
+    };
+
+    // Specialization for results that carry no value: inherits ResultBase's constructors.
+    template<>
+    class ResultValueBase<void> : public ResultBase {
+    protected:
+        using ResultBase::ResultBase;
+    };
+
+    // Result type combining an outcome category, an optional value of type T
+    // (none for void) and an error message.
+    template<typename T = void>
+    class BasicResult : public ResultValueBase<T> {
+    public:
+        // Converts a result with a different value type by copying its type
+        // and error message. The assert restricts this to non-Ok results.
+        template<typename U>
+        explicit BasicResult( BasicResult<U> const &other )
+        :   ResultValueBase<T>( other.type() ),
+            m_errorMessage( other.errorMessage() )
+        {
+            assert( type() != ResultBase::Ok );
+        }
+
+        // Factory functions for the Ok case (with or without a value) and the two error categories.
+        template<typename U>
+        static auto ok( U const &value ) -> BasicResult { return { ResultBase::Ok, value }; }
+        static auto ok() -> BasicResult { return { ResultBase::Ok }; }
+        static auto logicError( std::string const &message ) -> BasicResult { return { ResultBase::LogicError, message }; }
+        static auto runtimeError( std::string const &message ) -> BasicResult { return { ResultBase::RuntimeError, message }; }
+
+        // True only for Ok results.
+        explicit operator bool() const { return m_type == ResultBase::Ok; }
+        auto type() const -> ResultBase::Type { return m_type; }
+        // Returns a copy of the error message.
+        auto errorMessage() const -> std::string { return m_errorMessage; }
+
+    protected:
+        // Asserts that the result is not an error and aborts the process if it is not Ok.
+        void enforceOk() const override {
+
+            // Errors shouldn't reach this point, but if they do
+            // the actual error message will be in m_errorMessage
+            assert( m_type != ResultBase::LogicError );
+            assert( m_type != ResultBase::RuntimeError );
+            if( m_type != ResultBase::Ok )
+                std::abort();
+        }
+
+        std::string m_errorMessage; // Only populated if resultType is an error
+
+        // Creates an error result; the assert rejects Ok here.
+        BasicResult( ResultBase::Type type, std::string const &message )
+        :   ResultValueBase<T>(type),
+            m_errorMessage(message)
+        {
+            assert( m_type != ResultBase::Ok );
+        }
+
+        // Inherit the base constructors and make m_type visible by its unqualified name.
+        using ResultValueBase<T>::ResultValueBase;
+        using ResultBase::m_type;
+    };
+
+    enum class ParseResultType {
+        Matched,
+        NoMatch,
+        ShortCircuitAll,
+        ShortCircuitSame
+    };
+
+    // Pairs a parse result type with the token stream remaining after the parse step.
+    class ParseState {
+        ParseResultType m_type;
+        TokenStream m_remainingTokens;
+
+    public:
+        ParseState( ParseResultType type, TokenStream const &remainingTokens )
+        :   m_type( type ),
+            m_remainingTokens( remainingTokens )
+        {}
+
+        auto type() const -> ParseResultType {
+            return m_type;
+        }
+
+        // Returns a copy of the remaining token stream.
+        auto remainingTokens() const -> TokenStream {
+            return m_remainingTokens;
+        }
+    };
+
+    using Result = BasicResult<void>;
+    using ParserResult = BasicResult<ParseResultType>;
+    using InternalParseResult = BasicResult<ParseState>;
+
+    // A pair of strings named `left` and `right`.
+    // NOTE(review): presumably the two columns of one help-text row; confirm against the users of this type.
+    struct HelpColumns {
+        std::string left;
+        std::string right;
+    };
+
+    // Generic conversion: writes `source` into a string stream and extracts
+    // `target` from it. A failed extraction is reported as a runtime error.
+    template<typename T>
+    inline auto convertInto( std::string const &source, T& target ) -> ParserResult {
+        std::stringstream stream;
+        stream << source;
+        stream >> target;
+        if( !stream.fail() )
+            return ParserResult::ok( ParseResultType::Matched );
+        return ParserResult::runtimeError( "Unable to convert '" + source + "' to destination type" );
+    }
+    // String targets take the source text as-is; this conversion always succeeds.
+    inline auto convertInto( std::string const &source, std::string& target ) -> ParserResult {
+        target.assign( source );
+        return ParserResult::ok( ParseResultType::Matched );
+    }
+    // Boolean conversion: compares the lower-cased source against fixed sets
+    // of true-like and false-like words. Anything else is a runtime error.
+    inline auto convertInto( std::string const &source, bool &target ) -> ParserResult {
+        std::string lowered( source.size(), '\0' );
+        std::transform( source.begin(), source.end(), lowered.begin(),
+                        []( unsigned char c ) { return static_cast<char>( std::tolower(c) ); } );
+
+        bool const truthy =
+            lowered == "y" || lowered == "1" || lowered == "true" || lowered == "yes" || lowered == "on";
+        bool const falsy =
+            lowered == "n" || lowered == "0" || lowered == "false" || lowered == "no" || lowered == "off";
+
+        if( !truthy && !falsy )
+            return ParserResult::runtimeError( "Expected a boolean value but did not recognise: '" + source + "'" );
+
+        target = truthy;
+        return ParserResult::ok( ParseResultType::Matched );
+    }
+#ifdef CLARA_CONFIG_OPTIONAL_TYPE
+    // Optional targets: converts into a temporary T and assigns it to the
+    // optional only when the conversion succeeded.
+    template<typename T>
+    inline auto convertInto( std::string const &source, CLARA_CONFIG_OPTIONAL_TYPE<T>& target ) -> ParserResult {
+        T parsed;
+        auto outcome = convertInto( source, parsed );
+        if( !outcome )
+            return outcome;
+        target = std::move(parsed);
+        return outcome;
+    }
+#endif // CLARA_CONFIG_OPTIONAL_TYPE
+
+    // Base type that disables copy and move, for both construction and assignment.
+    struct NonCopyable {
+        NonCopyable() = default;
+
+        NonCopyable( NonCopyable const & ) = delete;
+        auto operator=( NonCopyable const & ) -> NonCopyable & = delete;
+
+        NonCopyable( NonCopyable && ) = delete;
+        auto operator=( NonCopyable && ) -> NonCopyable & = delete;
+    };
+
+    // Polymorphic base for everything a parser can be bound to. Both
+    // predicates return false unless a derived type overrides them.
+    struct BoundRef : NonCopyable {
+        virtual ~BoundRef() = default;
+
+        virtual auto isContainer() const -> bool {
+            return false;
+        }
+        virtual auto isFlag() const -> bool {
+            return false;
+        }
+    };
+
+    // Base for bindings that are set from the string form of an argument.
+    struct BoundValueRefBase : BoundRef {
+        virtual auto setValue( std::string const &arg ) -> ParserResult = 0;
+    };
+    // Base for bindings that receive a boolean flag rather than a string value.
+    struct BoundFlagRefBase : BoundRef {
+        // Passes the flag state to the bound target and reports the outcome.
+        virtual auto setFlag( bool flag ) -> ParserResult = 0;
+        // Overrides BoundRef::isFlag so flag bindings report true. `override`
+        // lets the compiler verify the match with the base signature, in line
+        // with the other overriders in this file.
+        auto isFlag() const -> bool override { return true; }
+    };
+
+    // Binds a reference to a variable of type T; setValue converts the
+    // argument text directly into that variable.
+    template<typename T>
+    struct BoundValueRef : BoundValueRefBase {
+        T &m_ref;
+
+        explicit BoundValueRef( T &ref )
+        :   m_ref( ref )
+        {}
+
+        auto setValue( std::string const &arg ) -> ParserResult override {
+            auto outcome = convertInto( arg, m_ref );
+            return outcome;
+        }
+    };
+
+    // Binding for vectors: each successfully converted argument is appended
+    // to the referenced vector.
+    template<typename T>
+    struct BoundValueRef<std::vector<T>> : BoundValueRefBase {
+        std::vector<T> &m_ref;
+
+        explicit BoundValueRef( std::vector<T> &ref )
+        :   m_ref( ref )
+        {}
+
+        auto isContainer() const -> bool override {
+            return true;
+        }
+
+        auto setValue( std::string const &arg ) -> ParserResult override {
+            T parsed;
+            auto outcome = convertInto( arg, parsed );
+            if( !outcome )
+                return outcome;
+            // Only a successfully converted element is stored.
+            m_ref.push_back( parsed );
+            return outcome;
+        }
+    };
+
+    // Binds a reference to a bool; setFlag stores the flag and always reports a match.
+    struct BoundFlagRef : BoundFlagRefBase {
+        bool &m_ref;
+
+        explicit BoundFlagRef( bool &ref )
+        :   m_ref( ref )
+        {}
+
+        auto setFlag( bool flag ) -> ParserResult override {
+            m_ref = flag;
+            return ParserResult::ok( ParseResultType::Matched );
+        }
+    };
+
+    // Calls a lambda and normalizes its result to a ParserResult.
+    // Primary template: the lambda itself must return ParserResult.
+    template<typename ReturnType>
+    struct LambdaInvoker {
+        static_assert( std::is_same<ReturnType, ParserResult>::value, "Lambda must return void or clara::ParserResult" );
+
+        template<typename L, typename ArgType>
+        static ParserResult invoke( L const &lambda, ArgType const &arg ) {
+            return lambda( arg );
+        }
+    };
+
+    // Specialization for lambdas returning void: the call is reported as a match.
+    template<>
+    struct LambdaInvoker<void> {
+        template<typename L, typename ArgType>
+        static ParserResult invoke( L const &lambda, ArgType const &arg ) {
+            lambda( arg );
+            return ParserResult::ok( ParseResultType::Matched );
+        }
+    };
+
+    // Converts `arg` into a value-initialized ArgType and, on success, passes
+    // it to `lambda`. A conversion failure is returned unchanged.
+    template<typename ArgType, typename L>
+    inline auto invokeLambda( L const &lambda, std::string const &arg ) -> ParserResult {
+        ArgType converted{};
+        auto conversion = convertInto( arg, converted );
+        if( !conversion )
+            return conversion;
+        return LambdaInvoker<typename UnaryLambdaTraits<L>::ReturnType>::invoke( lambda, converted );
+    }
+
+    // Binds a single-argument lambda: the argument text is converted to the
+    // lambda's parameter type and the lambda is then invoked with it.
+    template<typename L>
+    struct BoundLambda : BoundValueRefBase {
+        static_assert( UnaryLambdaTraits<L>::isValid, "Supplied lambda must take exactly one argument" );
+
+        L m_lambda;
+
+        explicit BoundLambda( L const &lambda )
+        :   m_lambda( lambda )
+        {}
+
+        auto setValue( std::string const &arg ) -> ParserResult override {
+            using ArgType = typename UnaryLambdaTraits<L>::ArgType;
+            return invokeLambda<ArgType>( m_lambda, arg );
+        }
+    };
+
+    // Binds a lambda taking a single bool; setFlag invokes it with the flag state.
+    template<typename L>
+    struct BoundFlagLambda : BoundFlagRefBase {
+        static_assert( UnaryLambdaTraits<L>::isValid, "Supplied lambda must take exactly one argument" );
+        static_assert( std::is_same<typename UnaryLambdaTraits<L>::ArgType, bool>::value, "flags must be boolean" );
+
+        L m_lambda;
+
+        explicit BoundFlagLambda( L const &lambda )
+        :   m_lambda( lambda )
+        {}
+
+        auto setFlag( bool flag ) -> ParserResult override {
+            using ReturnType = typename UnaryLambdaTraits<L>::ReturnType;
+            return LambdaInvoker<ReturnType>::invoke( m_lambda, flag );
+        }
+    };
+
+    // Whether a parser is optional or required. ParserRefImpl stores one of
+    // these (defaulting to Optional) and changes it via optional()/required().
+    enum class Optionality { Optional, Required };
+
+    // Forward declaration: the composition operators below return a Parser
+    // before the type itself is defined.
+    struct Parser;
+
+    // Abstract interface shared by all parsers in this file (ExeName, Arg,
+    // Opt and the composite Parser).
+    class ParserBase {
+    public:
+        virtual ~ParserBase() = default;
+
+        // Default validation always succeeds; derived parsers override it.
+        virtual auto validate() const -> Result { return Result::ok(); }
+
+        // Attempts to consume tokens from the front of the stream.
+        virtual auto parse( std::string const& exeName, TokenStream const &tokens) const -> InternalParseResult  = 0;
+
+        // Default cardinality is 1; ParserRefImpl returns 0 for container-bound
+        // targets, which Parser::parse treats as "no limit".
+        virtual auto cardinality() const -> size_t { return 1; }
+
+        // Convenience overload: builds the token stream from an Args object.
+        auto parse( Args const &args ) const -> InternalParseResult {
+            TokenStream tokens( args );
+            return parse( args.exeName(), tokens );
+        }
+    };
+
+    // CRTP mix-in that lets a concrete parser be combined with another parser
+    // into a composite Parser using '|' (or the alternative '+' spelling).
+    template<typename DerivedT>
+    class ComposableParserImpl : public ParserBase {
+    public:
+        template<typename T>
+        auto operator|( T const &other ) const -> Parser;
+
+        template<typename T>
+        auto operator+( T const &other ) const -> Parser;
+    };
+
+    // Common code and state for Args and Opts
+    //
+    // Holds the bound target (a variable reference or a lambda, wrapped in a
+    // BoundRef), the hint shown in usage text, the description and the
+    // optionality. The fluent setters return DerivedT& so calls can be chained
+    // on the concrete parser type.
+    template<typename DerivedT>
+    class ParserRefImpl : public ComposableParserImpl<DerivedT> {
+    protected:
+        Optionality m_optionality = Optionality::Optional;
+        std::shared_ptr<BoundRef> m_ref;
+        std::string m_hint;
+        std::string m_description;
+
+        // Used by derived classes that build the BoundRef themselves
+        explicit ParserRefImpl( std::shared_ptr<BoundRef> const &ref ) : m_ref( ref ) {}
+
+    public:
+        // Bind to a variable: the parsed value is written into `ref`
+        template<typename T>
+        ParserRefImpl( T &ref, std::string const &hint )
+        :   m_ref( std::make_shared<BoundValueRef<T>>( ref ) ),
+            m_hint( hint )
+        {}
+
+        // Bind to a unary lambda: the parsed value is passed to `ref`
+        template<typename LambdaT>
+        ParserRefImpl( LambdaT const &ref, std::string const &hint )
+        :   m_ref( std::make_shared<BoundLambda<LambdaT>>( ref ) ),
+            m_hint( hint )
+        {}
+
+        // Sets the description and returns the concrete parser for chaining
+        auto operator()( std::string const &description ) -> DerivedT & {
+            m_description = description;
+            return static_cast<DerivedT &>( *this );
+        }
+
+        auto optional() -> DerivedT & {
+            m_optionality = Optionality::Optional;
+            return static_cast<DerivedT &>( *this );
+        }
+
+        auto required() -> DerivedT & {
+            m_optionality = Optionality::Required;
+            return static_cast<DerivedT &>( *this );
+        }
+
+        auto isOptional() const -> bool {
+            return m_optionality == Optionality::Optional;
+        }
+
+        // Container-bound targets report 0, anything else reports 1
+        auto cardinality() const -> size_t override {
+            return m_ref->isContainer() ? 0 : 1;
+        }
+
+        auto hint() const -> std::string { return m_hint; }
+    };
+
+    // Parser component that records the executable's name and optionally
+    // forwards it to a bound string or lambda.
+    class ExeName : public ComposableParserImpl<ExeName> {
+        // Held through a shared_ptr, so copies of an ExeName share one name
+        std::shared_ptr<std::string> m_name;
+        std::shared_ptr<BoundValueRefBase> m_ref;
+
+        template<typename LambdaT>
+        static auto makeRef(LambdaT const &lambda) -> std::shared_ptr<BoundValueRefBase> {
+            return std::make_shared<BoundLambda<LambdaT>>( lambda) ;
+        }
+
+    public:
+        ExeName() : m_name( std::make_shared<std::string>( "<executable>" ) ) {}
+
+        // Bind to a string that receives the name whenever set() is called
+        explicit ExeName( std::string &ref ) : ExeName() {
+            m_ref = std::make_shared<BoundValueRef<std::string>>( ref );
+        }
+
+        // Bind to a lambda that receives the name whenever set() is called
+        template<typename LambdaT>
+        explicit ExeName( LambdaT const& lambda ) : ExeName() {
+            m_ref = makeRef( lambda );
+        }
+
+        // The exe name is not parsed out of the normal tokens, but is handled specially
+        auto parse( std::string const&, TokenStream const &tokens ) const -> InternalParseResult override {
+            return InternalParseResult::ok( ParseState( ParseResultType::NoMatch, tokens ) );
+        }
+
+        auto name() const -> std::string { return *m_name; }
+
+        // Stores the file-name part of newName (everything after the last
+        // '/' or '\\') and forwards it to the bound target, if there is one.
+        auto set( std::string const& newName ) -> ParserResult {
+            auto const lastSlash = newName.find_last_of( "\\/" );
+            std::string filename = newName;
+            if( lastSlash != std::string::npos )
+                filename = newName.substr( lastSlash+1 );
+
+            *m_name = filename;
+            if( !m_ref )
+                return ParserResult::ok( ParseResultType::Matched );
+            return m_ref->setValue( filename );
+        }
+    };
+
+    // Parser for a single positional argument. The bound target is expected
+    // to be a value (not a flag) and receives the token's text.
+    class Arg : public ParserRefImpl<Arg> {
+    public:
+        using ParserRefImpl::ParserRefImpl;
+
+        // Consumes the front token if it is an Argument token.
+        // Returns NoMatch (leaving the stream untouched) when the stream is
+        // exhausted or the front token is not an argument; returns the
+        // conversion error if the bound target rejects the value.
+        auto parse( std::string const &, TokenStream const &tokens ) const -> InternalParseResult override {
+            auto validationResult = validate();
+            if( !validationResult )
+                return InternalParseResult( validationResult );
+
+            auto remainingTokens = tokens;
+            // An exhausted stream has no front token to inspect; report NoMatch
+            // rather than dereferencing it.
+            if( !remainingTokens )
+                return InternalParseResult::ok( ParseState( ParseResultType::NoMatch, remainingTokens ) );
+
+            auto const &token = *remainingTokens;
+            if( token.type != TokenType::Argument )
+                return InternalParseResult::ok( ParseState( ParseResultType::NoMatch, remainingTokens ) );
+
+            assert( !m_ref->isFlag() );
+            auto valueRef = static_cast<detail::BoundValueRefBase*>( m_ref.get() );
+
+            auto result = valueRef->setValue( remainingTokens->token );
+            if( !result )
+                return InternalParseResult( result );
+            else
+                return InternalParseResult::ok( ParseState( ParseResultType::Matched, ++remainingTokens ) );
+        }
+    };
+
+    // Returns the option name in canonical form. When CATCH_PLATFORM_WINDOWS
+    // is defined a leading '/' is rewritten to '-'; otherwise the name is
+    // returned unchanged.
+    inline auto normaliseOpt( std::string const &optName ) -> std::string {
+#ifdef CATCH_PLATFORM_WINDOWS
+        bool const isSlashStyle = ( optName[0] == '/' );
+        if( isSlashStyle )
+            return "-" + optName.substr( 1 );
+#endif
+        return optName;
+    }
+
+    // Parser for a named option. An Opt is either a flag (bound to a bool or
+    // to a bool-taking lambda via the single-argument constructors) or takes
+    // a value (constructors that also accept a hint).
+    class Opt : public ParserRefImpl<Opt> {
+    protected:
+        std::vector<std::string> m_optNames;
+
+    public:
+        // Flag bound to a lambda taking bool
+        template<typename LambdaT>
+        explicit Opt( LambdaT const &ref ) : ParserRefImpl( std::make_shared<BoundFlagLambda<LambdaT>>( ref ) ) {}
+
+        // Flag bound to a bool variable
+        explicit Opt( bool &ref ) : ParserRefImpl( std::make_shared<BoundFlagRef>( ref ) ) {}
+
+        // Value option bound to a lambda
+        template<typename LambdaT>
+        Opt( LambdaT const &ref, std::string const &hint ) : ParserRefImpl( ref, hint ) {}
+
+        // Value option bound to a variable
+        template<typename T>
+        Opt( T &ref, std::string const &hint ) : ParserRefImpl( ref, hint ) {}
+
+        // Registers one more spelling for this option (e.g. "-o", "--out")
+        auto operator[]( std::string const &optName ) -> Opt & {
+            m_optNames.push_back( optName );
+            return *this;
+        }
+
+        // Produces the help row: "name1, name2 <hint>" and the description
+        auto getHelpColumns() const -> std::vector<HelpColumns> {
+            std::ostringstream oss;
+            for( size_t i = 0; i < m_optNames.size(); ++i ) {
+                if( i != 0 )
+                    oss << ", ";
+                oss << m_optNames[i];
+            }
+            if( !m_hint.empty() )
+                oss << " <" << m_hint << ">";
+            return { { oss.str(), m_description } };
+        }
+
+        // True if optToken equals any registered name after normalisation
+        auto isMatch( std::string const &optToken ) const -> bool {
+            auto const wanted = normaliseOpt( optToken );
+            for( auto const &candidate : m_optNames ) {
+                if( normaliseOpt( candidate ) == wanted )
+                    return true;
+            }
+            return false;
+        }
+
+        using ParserBase::parse;
+
+        // Consumes the front token (plus the following argument token for
+        // value options) if it names this option; otherwise reports NoMatch.
+        auto parse( std::string const&, TokenStream const &tokens ) const -> InternalParseResult override {
+            auto validationResult = validate();
+            if( !validationResult )
+                return InternalParseResult( validationResult );
+
+            auto remainingTokens = tokens;
+            bool const atOption = remainingTokens && remainingTokens->type == TokenType::Option;
+            if( !atOption || !isMatch( remainingTokens->token ) )
+                return InternalParseResult::ok( ParseState( ParseResultType::NoMatch, remainingTokens ) );
+
+            // Taken before the stream is advanced; used in the error messages below
+            auto const &token = *remainingTokens;
+
+            if( m_ref->isFlag() ) {
+                auto flagRef = static_cast<detail::BoundFlagRefBase*>( m_ref.get() );
+                auto result = flagRef->setFlag( true );
+                if( !result )
+                    return InternalParseResult( result );
+                if( result.value() == ParseResultType::ShortCircuitAll )
+                    return InternalParseResult::ok( ParseState( result.value(), remainingTokens ) );
+            } else {
+                auto valueRef = static_cast<detail::BoundValueRefBase*>( m_ref.get() );
+                ++remainingTokens;
+                if( !remainingTokens )
+                    return InternalParseResult::runtimeError( "Expected argument following " + token.token );
+                auto const &argToken = *remainingTokens;
+                if( argToken.type != TokenType::Argument )
+                    return InternalParseResult::runtimeError( "Expected argument following " + token.token );
+                auto result = valueRef->setValue( argToken.token );
+                if( !result )
+                    return InternalParseResult( result );
+                if( result.value() == ParseResultType::ShortCircuitAll )
+                    return InternalParseResult::ok( ParseState( result.value(), remainingTokens ) );
+            }
+            return InternalParseResult::ok( ParseState( ParseResultType::Matched, ++remainingTokens ) );
+        }
+
+        // An Opt needs at least one name, and every name must be non-empty
+        // and start with '-' (or '/' when CATCH_PLATFORM_WINDOWS is defined).
+        auto validate() const -> Result override {
+            if( m_optNames.empty() )
+                return Result::logicError( "No options supplied to Opt" );
+            for( auto const &optName : m_optNames ) {
+                if( optName.empty() )
+                    return Result::logicError( "Option name cannot be empty" );
+                char const prefix = optName[0];
+#ifdef CATCH_PLATFORM_WINDOWS
+                if( prefix != '-' && prefix != '/' )
+                    return Result::logicError( "Option name must begin with '-' or '/'" );
+#else
+                if( prefix != '-' )
+                    return Result::logicError( "Option name must begin with '-'" );
+#endif
+            }
+            return ParserRefImpl::validate();
+        }
+    };
+
+    // Pre-configured flag option for "-?", "-h" and "--help". When matched it
+    // stores the flag value in showHelpFlag and returns ShortCircuitAll.
+    struct Help : Opt {
+        Help( bool &showHelpFlag )
+        :   Opt([&]( bool flag ) {
+                showHelpFlag = flag;
+                return ParserResult::ok( ParseResultType::ShortCircuitAll );
+            })
+        {
+            Opt &self = *this;
+            self( "display usage information" );
+            self["-?"]["-h"]["--help"];
+            self.optional();
+        }
+    };
+
+    // Composite parser: owns an executable name plus any number of Opts and
+    // Args, and offers each command-line token to its children in turn.
+    struct Parser : ParserBase {
+
+        // mutable because parse() is const but records the executable name
+        mutable ExeName m_exeName;
+        std::vector<Opt> m_options;
+        std::vector<Arg> m_args;
+
+        auto operator|=( ExeName const &exeName ) -> Parser & {
+            m_exeName = exeName;
+            return *this;
+        }
+
+        auto operator|=( Arg const &arg ) -> Parser & {
+            m_args.push_back(arg);
+            return *this;
+        }
+
+        auto operator|=( Opt const &opt ) -> Parser & {
+            m_options.push_back(opt);
+            return *this;
+        }
+
+        // Appends the other parser's options and arguments; its executable
+        // name is not taken over.
+        auto operator|=( Parser const &other ) -> Parser & {
+            m_options.insert(m_options.end(), other.m_options.begin(), other.m_options.end());
+            m_args.insert(m_args.end(), other.m_args.begin(), other.m_args.end());
+            return *this;
+        }
+
+        template<typename T>
+        auto operator|( T const &other ) const -> Parser {
+            return Parser( *this ) |= other;
+        }
+
+        // Forward deprecated interface with '+' instead of '|'
+        template<typename T>
+        auto operator+=( T const &other ) -> Parser & { return operator|=( other ); }
+        template<typename T>
+        auto operator+( T const &other ) const -> Parser { return operator|( other ); }
+
+        // Collects the help rows of all options (arguments contribute none)
+        auto getHelpColumns() const -> std::vector<HelpColumns> {
+            std::vector<HelpColumns> cols;
+            for (auto const &o : m_options) {
+                auto childCols = o.getHelpColumns();
+                cols.insert( cols.end(), childCols.begin(), childCols.end() );
+            }
+            return cols;
+        }
+
+        // Writes the usage line (when an executable name is set) followed by
+        // one two-column row per option.
+        void writeToStream( std::ostream &os ) const {
+            if (!m_exeName.name().empty()) {
+                os << "usage:\n" << "  " << m_exeName.name() << " ";
+                bool required = true, first = true;
+                for( auto const &arg : m_args ) {
+                    if (first)
+                        first = false;
+                    else
+                        os << " ";
+                    // The first optional argument opens a '[' that is closed after the loop
+                    if( arg.isOptional() && required ) {
+                        os << "[";
+                        required = false;
+                    }
+                    os << "<" << arg.hint() << ">";
+                    if( arg.cardinality() == 0 )
+                        os << " ... ";
+                }
+                if( !required )
+                    os << "]";
+                if( !m_options.empty() )
+                    os << " options";
+                os << "\n\nwhere options are:" << std::endl;
+            }
+
+            auto rows = getHelpColumns();
+            size_t consoleWidth = CATCH_CLARA_CONFIG_CONSOLE_WIDTH;
+            size_t optWidth = 0;
+            for( auto const &cols : rows )
+                optWidth = (std::max)(optWidth, cols.left.size() + 2);
+
+            // The option column never takes more than half of the console
+            optWidth = (std::min)(optWidth, consoleWidth/2);
+
+            for( auto const &cols : rows ) {
+                auto row =
+                        TextFlow::Column( cols.left ).width( optWidth ).indent( 2 ) +
+                        TextFlow::Spacer(4) +
+                        TextFlow::Column( cols.right ).width( consoleWidth - 7 - optWidth );
+                os << row << std::endl;
+            }
+        }
+
+        friend auto operator<<( std::ostream &os, Parser const &parser ) -> std::ostream& {
+            parser.writeToStream( os );
+            return os;
+        }
+
+        // Returns the first failing validation result of any child parser
+        auto validate() const -> Result override {
+            for( auto const &opt : m_options ) {
+                auto result = opt.validate();
+                if( !result )
+                    return result;
+            }
+            for( auto const &arg : m_args ) {
+                auto result = arg.validate();
+                if( !result )
+                    return result;
+            }
+            return Result::ok();
+        }
+
+        using ParserBase::parse;
+
+        // Repeatedly offers the front of the token stream to each child parser
+        // (options first, then arguments) until the stream is exhausted, a
+        // child reports an error or ShortCircuitAll, or no child accepts the
+        // current token (reported as "Unrecognised token").
+        auto parse( std::string const& exeName, TokenStream const &tokens ) const -> InternalParseResult override {
+
+            struct ParserInfo {
+                ParserBase const* parser = nullptr;
+                size_t count = 0;
+            };
+            const size_t totalParsers = m_options.size() + m_args.size();
+            // Sized at run time so any number of child parsers is handled,
+            // also in builds where assert() is compiled out.
+            std::vector<ParserInfo> parseInfos( totalParsers );
+
+            {
+                size_t i = 0;
+                for (auto const &opt : m_options) parseInfos[i++].parser = &opt;
+                for (auto const &arg : m_args) parseInfos[i++].parser = &arg;
+            }
+
+            // NOTE(review): the result of set() is ignored here
+            m_exeName.set( exeName );
+
+            auto result = InternalParseResult::ok( ParseState( ParseResultType::NoMatch, tokens ) );
+            while( result.value().remainingTokens() ) {
+                bool tokenParsed = false;
+
+                for( size_t i = 0; i < totalParsers; ++i ) {
+                    auto&  parseInfo = parseInfos[i];
+                    // Cardinality 0 means the child may match any number of times
+                    if( parseInfo.parser->cardinality() == 0 || parseInfo.count < parseInfo.parser->cardinality() ) {
+                        result = parseInfo.parser->parse(exeName, result.value().remainingTokens());
+                        if (!result)
+                            return result;
+                        if (result.value().type() != ParseResultType::NoMatch) {
+                            tokenParsed = true;
+                            ++parseInfo.count;
+                            break;
+                        }
+                    }
+                }
+
+                if( result.value().type() == ParseResultType::ShortCircuitAll )
+                    return result;
+                if( !tokenParsed )
+                    return InternalParseResult::runtimeError( "Unrecognised token: " + result.value().remainingTokens()->token );
+            }
+            // !TBD Check missing required options
+            return result;
+        }
+    };
+
+    // Combines this parser with `other` into a new composite Parser.
+    template<typename DerivedT>
+    template<typename T>
+    auto ComposableParserImpl<DerivedT>::operator|( T const &other ) const -> Parser {
+        return Parser() | static_cast<DerivedT const &>( *this ) | other;
+    }
+
+    // '+' spelling of the composition operator. It is declared in the class
+    // template, so it is defined here to make expressions such as
+    // `Opt(...) + Arg(...)` link; it forwards to operator|.
+    template<typename DerivedT>
+    template<typename T>
+    auto ComposableParserImpl<DerivedT>::operator+( T const &other ) const -> Parser {
+        return this->operator|( other );
+    }
+} // namespace detail
+
+// A Combined parser
+using detail::Parser;
+
+// A parser for options
+using detail::Opt;
+
+// A parser for arguments
+using detail::Arg;
+
+// Wrapper for argc, argv from main()
+using detail::Args;
+
+// Specifies the name of the executable
+using detail::ExeName;
+
+// Convenience wrapper for option parser that specifies the help option
+using detail::Help;
+
+// enum of result types from a parse
+using detail::ParseResultType;
+
+// Result type for parser operation
+using detail::ParserResult;
+
+}} // namespace Catch::clara
+
+// end clara.hpp
+#ifdef __clang__
+#pragma clang diagnostic pop
+#endif
+
+// Restore Clara's value for console width, if present
+#ifdef CATCH_TEMP_CLARA_CONFIG_CONSOLE_WIDTH
+#define CATCH_CLARA_TEXTFLOW_CONFIG_CONSOLE_WIDTH CATCH_TEMP_CLARA_CONFIG_CONSOLE_WIDTH
+#undef CATCH_TEMP_CLARA_CONFIG_CONSOLE_WIDTH
+#endif
+
+// end catch_clara.h
+namespace Catch {
+
+    clara::Parser makeCommandLineParser( ConfigData& config );
+
+} // end namespace Catch
+
+// end catch_commandline.h
+#include <fstream>
+#include <ctime>
+
+namespace Catch {
+
+    // Builds the command-line parser for the test runner. Every option is
+    // bound either directly to a ConfigData field or to one of the local
+    // lambdas below, which validate the textual value before storing it.
+    // The returned parser holds references into `config`, so `config` must
+    // outlive it.
+    clara::Parser makeCommandLineParser( ConfigData& config ) {
+
+        using namespace clara;
+
+        // -w/--warn: ORs the named warning into config.warnings
+        auto const setWarning = [&]( std::string const& warning ) {
+                auto warningSet = [&]() {
+                    if( warning == "NoAssertions" )
+                        return WarnAbout::NoAssertions;
+
+                    if ( warning == "NoTests" )
+                        return WarnAbout::NoTests;
+
+                    return WarnAbout::Nothing;
+                }();
+
+                if (warningSet == WarnAbout::Nothing)
+                    return ParserResult::runtimeError( "Unrecognised warning: '" + warning + "'" );
+                config.warnings = static_cast<WarnAbout::What>( config.warnings | warningSet );
+                return ParserResult::ok( ParseResultType::Matched );
+            };
+        // -f/--input-file: appends one quoted test name per non-empty,
+        // non-comment ('#') line, with "," entries between the names
+        auto const loadTestNamesFromFile = [&]( std::string const& filename ) {
+                std::ifstream f( filename.c_str() );
+                if( !f.is_open() )
+                    return ParserResult::runtimeError( "Unable to load input file: '" + filename + "'" );
+
+                bool addedAny = false;
+                std::string line;
+                while( std::getline( f, line ) ) {
+                    line = trim(line);
+                    if( !line.empty() && !startsWith( line, '#' ) ) {
+                        if( !startsWith( line, '"' ) )
+                            line = '"' + line + '"';
+                        config.testsOrTags.push_back( line );
+                        config.testsOrTags.emplace_back( "," );
+                        addedAny = true;
+                    }
+                }
+                // Remove the trailing comma, but only if this file added one:
+                // otherwise the last entry is a filter that was already
+                // present and must be kept.
+                if( addedAny )
+                    config.testsOrTags.pop_back();
+
+                return ParserResult::ok( ParseResultType::Matched );
+            };
+        // --order: accepts any prefix of "declared", "lexical" or "random"
+        auto const setTestOrder = [&]( std::string const& order ) {
+                if( startsWith( "declared", order ) )
+                    config.runOrder = RunTests::InDeclarationOrder;
+                else if( startsWith( "lexical", order ) )
+                    config.runOrder = RunTests::InLexicographicalOrder;
+                else if( startsWith( "random", order ) )
+                    config.runOrder = RunTests::InRandomOrder;
+                else
+                    return clara::ParserResult::runtimeError( "Unrecognised ordering: '" + order + "'" );
+                return ParserResult::ok( ParseResultType::Matched );
+            };
+        // --rng-seed: either the literal "time" or a number
+        auto const setRngSeed = [&]( std::string const& seed ) {
+                if( seed != "time" )
+                    return clara::detail::convertInto( seed, config.rngSeed );
+                config.rngSeed = static_cast<unsigned int>( std::time(nullptr) );
+                return ParserResult::ok( ParseResultType::Matched );
+            };
+        // --use-colour: case-insensitive "yes", "no" or "auto"
+        auto const setColourUsage = [&]( std::string const& useColour ) {
+                    auto mode = toLower( useColour );
+
+                    if( mode == "yes" )
+                        config.useColour = UseColour::Yes;
+                    else if( mode == "no" )
+                        config.useColour = UseColour::No;
+                    else if( mode == "auto" )
+                        config.useColour = UseColour::Auto;
+                    else
+                        return ParserResult::runtimeError( "colour mode must be one of: auto, yes or no. '" + useColour + "' not recognised" );
+                return ParserResult::ok( ParseResultType::Matched );
+            };
+        // --wait-for-keypress: case-insensitive "never", "start", "exit" or "both"
+        auto const setWaitForKeypress = [&]( std::string const& keypress ) {
+                auto keypressLc = toLower( keypress );
+                if (keypressLc == "never")
+                    config.waitForKeypress = WaitForKeypress::Never;
+                else if( keypressLc == "start" )
+                    config.waitForKeypress = WaitForKeypress::BeforeStart;
+                else if( keypressLc == "exit" )
+                    config.waitForKeypress = WaitForKeypress::BeforeExit;
+                else if( keypressLc == "both" )
+                    config.waitForKeypress = WaitForKeypress::BeforeStartAndExit;
+                else
+                    return ParserResult::runtimeError( "keypress argument must be one of: never, start, exit or both. '" + keypress + "' not recognised" );
+            return ParserResult::ok( ParseResultType::Matched );
+            };
+        // -v/--verbosity: case-insensitive "quiet", "normal" or "high"
+        auto const setVerbosity = [&]( std::string const& verbosity ) {
+            auto lcVerbosity = toLower( verbosity );
+            if( lcVerbosity == "quiet" )
+                config.verbosity = Verbosity::Quiet;
+            else if( lcVerbosity == "normal" )
+                config.verbosity = Verbosity::Normal;
+            else if( lcVerbosity == "high" )
+                config.verbosity = Verbosity::High;
+            else
+                return ParserResult::runtimeError( "Unrecognised verbosity, '" + verbosity + "'" );
+            return ParserResult::ok( ParseResultType::Matched );
+        };
+        // -r/--reporter: the lower-cased name must be a registered reporter factory
+        auto const setReporter = [&]( std::string const& reporter ) {
+            IReporterRegistry::FactoryMap const& factories = getRegistryHub().getReporterRegistry().getFactories();
+
+            auto lcReporter = toLower( reporter );
+            auto result = factories.find( lcReporter );
+
+            if( factories.end() != result )
+                config.reporterName = lcReporter;
+            else
+                return ParserResult::runtimeError( "Unrecognized reporter, '" + reporter + "'. Check available with --list-reporters" );
+            return ParserResult::ok( ParseResultType::Matched );
+        };
+
+        auto cli
+            = ExeName( config.processName )
+            | Help( config.showHelp )
+            | Opt( config.listTests )
+                ["-l"]["--list-tests"]
+                ( "list all/matching test cases" )
+            | Opt( config.listTags )
+                ["-t"]["--list-tags"]
+                ( "list all/matching tags" )
+            | Opt( config.showSuccessfulTests )
+                ["-s"]["--success"]
+                ( "include successful tests in output" )
+            | Opt( config.shouldDebugBreak )
+                ["-b"]["--break"]
+                ( "break into debugger on failure" )
+            | Opt( config.noThrow )
+                ["-e"]["--nothrow"]
+                ( "skip exception tests" )
+            | Opt( config.showInvisibles )
+                ["-i"]["--invisibles"]
+                ( "show invisibles (tabs, newlines)" )
+            | Opt( config.outputFilename, "filename" )
+                ["-o"]["--out"]
+                ( "output filename" )
+            | Opt( setReporter, "name" )
+                ["-r"]["--reporter"]
+                ( "reporter to use (defaults to console)" )
+            | Opt( config.name, "name" )
+                ["-n"]["--name"]
+                ( "suite name" )
+            | Opt( [&]( bool ){ config.abortAfter = 1; } )
+                ["-a"]["--abort"]
+                ( "abort at first failure" )
+            | Opt( [&]( int x ){ config.abortAfter = x; }, "no. failures" )
+                ["-x"]["--abortx"]
+                ( "abort after x failures" )
+            | Opt( setWarning, "warning name" )
+                ["-w"]["--warn"]
+                ( "enable warnings" )
+            | Opt( [&]( bool flag ) { config.showDurations = flag ? ShowDurations::Always : ShowDurations::Never; }, "yes|no" )
+                ["-d"]["--durations"]
+                ( "show test durations" )
+            | Opt( config.minDuration, "seconds" )
+                ["-D"]["--min-duration"]
+                ( "show test durations for tests taking at least the given number of seconds" )
+            | Opt( loadTestNamesFromFile, "filename" )
+                ["-f"]["--input-file"]
+                ( "load test names to run from a file" )
+            | Opt( config.filenamesAsTags )
+                ["-#"]["--filenames-as-tags"]
+                ( "adds a tag for the filename" )
+            | Opt( config.sectionsToRun, "section name" )
+                ["-c"]["--section"]
+                ( "specify section to run" )
+            | Opt( setVerbosity, "quiet|normal|high" )
+                ["-v"]["--verbosity"]
+                ( "set output verbosity" )
+            | Opt( config.listTestNamesOnly )
+                ["--list-test-names-only"]
+                ( "list all/matching test cases names only" )
+            | Opt( config.listReporters )
+                ["--list-reporters"]
+                ( "list all reporters" )
+            | Opt( setTestOrder, "decl|lex|rand" )
+                ["--order"]
+                ( "test case order (defaults to decl)" )
+            | Opt( setRngSeed, "'time'|number" )
+                ["--rng-seed"]
+                ( "set a specific seed for random numbers" )
+            | Opt( setColourUsage, "yes|no" )
+                ["--use-colour"]
+                ( "should output be colourised" )
+            | Opt( config.libIdentify )
+                ["--libidentify"]
+                ( "report name and version according to libidentify standard" )
+            | Opt( setWaitForKeypress, "never|start|exit|both" )
+                ["--wait-for-keypress"]
+                ( "waits for a keypress before exiting" )
+            | Opt( config.benchmarkSamples, "samples" )
+                ["--benchmark-samples"]
+                ( "number of samples to collect (default: 100)" )
+            | Opt( config.benchmarkResamples, "resamples" )
+                ["--benchmark-resamples"]
+                ( "number of resamples for the bootstrap (default: 100000)" )
+            | Opt( config.benchmarkConfidenceInterval, "confidence interval" )
+                ["--benchmark-confidence-interval"]
+                ( "confidence interval for the bootstrap (between 0 and 1, default: 0.95)" )
+            | Opt( config.benchmarkNoAnalysis )
+                ["--benchmark-no-analysis"]
+                ( "perform only measurements; do not perform any analysis" )
+            | Opt( config.benchmarkWarmupTime, "benchmarkWarmupTime" )
+                ["--benchmark-warmup-time"]
+                ( "amount of time in milliseconds spent on warming up each test (default: 100)" )
+            | Arg( config.testsOrTags, "test name|pattern|tags" )
+                ( "which test or tests to use" );
+
+        return cli;
+    }
+
+} // end namespace Catch
+// end catch_commandline.cpp
+// start catch_common.cpp
+
+#include <cstring>
+#include <ostream>
+
+namespace Catch {
+
+    // Two locations are equal when the line numbers match and the file names
+    // are the same pointer or compare equal as C strings.
+    bool SourceLineInfo::operator == ( SourceLineInfo const& other ) const noexcept {
+        if( line != other.line )
+            return false;
+        // Pointer comparison first: strcmp only runs when the pointers differ
+        return file == other.file || std::strcmp(file, other.file) == 0;
+    }
+    // Orders by line number first, then by file name (C string comparison).
+    bool SourceLineInfo::operator < ( SourceLineInfo const& other ) const noexcept {
+        if( line != other.line )
+            return line < other.line;
+        // We can assume that the same file will usually have the same pointer.
+        // Thus, if the pointers are the same, there is no point in calling the strcmp
+        if( file == other.file )
+            return false;
+        return std::strcmp(file, other.file) < 0;
+    }
+
+    // Prints the location as "file:line" when __GNUG__ is defined and as
+    // "file(line)" otherwise.
+    std::ostream& operator << ( std::ostream& os, SourceLineInfo const& info ) {
+#ifdef __GNUG__
+        os << info.file << ':' << info.line;
+#else
+        os << info.file << '(' << info.line << ')';
+#endif
+        return os;
+    }
+
+    // Unary plus on a StreamEndStop yields an empty string.
+    std::string StreamEndStop::operator+() const {
+        std::string empty;
+        return empty;
+    }
+
+    // Out-of-line defaulted constructor and destructor for NonCopyable
+    NonCopyable::NonCopyable() = default;
+    NonCopyable::~NonCopyable() = default;
+
+}
+// end catch_common.cpp
+// start catch_config.cpp
+
+namespace Catch {
+
+    // Copies the parsed command-line data, opens the output stream and
+    // builds the test spec from the (trimmed) test/tag filters.
+    Config::Config( ConfigData const& data )
+    :   m_data( data ),
+        m_stream( openStream() )
+    {
+        // We need to trim filter specs to avoid trouble with superfluous
+        // whitespace (esp. important for bdd macros, as those are manually
+        // aligned with whitespace).
+        for (auto& filter : m_data.testsOrTags)
+            filter = trim(filter);
+        for (auto& section : m_data.sectionsToRun)
+            section = trim(section);
+
+        // The flag is only ever set here, never cleared
+        if (!m_data.testsOrTags.empty())
+            m_hasTestFilters = true;
+
+        TestSpecParser parser(ITagAliasRegistry::get());
+        for (auto const& filter : m_data.testsOrTags)
+            parser.parse(filter);
+        m_testSpec = parser.testSpec();
+    }
+
+    // Output file name as stored in the config data (see -o/--out).
+    std::string const& Config::getFilename() const {
+        return m_data.outputFilename;
+    }
+
+    bool Config::listTests() const          { return m_data.listTests; }
+    bool Config::listTestNamesOnly() const  { return m_data.listTestNamesOnly; }
+    bool Config::listTags() const           { return m_data.listTags; }
+    bool Config::listReporters() const      { return m_data.listReporters; }
+
+    std::string Config::getProcessName() const { return m_data.processName; }
+    std::string const& Config::getReporterName() const { return m_data.reporterName; }
+
+    std::vector<std::string> const& Config::getTestsOrTags() const { return m_data.testsOrTags; }
+    std::vector<std::string> const& Config::getSectionsToRun() const { return m_data.sectionsToRun; }
+
+    TestSpec const& Config::testSpec() const { return m_testSpec; }
+    bool Config::hasTestFilters() const { return m_hasTestFilters; }
+
+    bool Config::showHelp() const { return m_data.showHelp; }
+
+    // IConfig interface: each function below reads one option out of m_data (stream() goes through m_stream instead)
+    bool Config::allowThrows() const                   { return !m_data.noThrow; } // negation of the stored noThrow flag
+    std::ostream& Config::stream() const               { return m_stream->stream(); }
+    std::string Config::name() const                   { return m_data.name.empty() ? m_data.processName : m_data.name; } // process name when no name is set
+    bool Config::includeSuccessfulResults() const      { return m_data.showSuccessfulTests; }
+    bool Config::warnAboutMissingAssertions() const    { return !!(m_data.warnings & WarnAbout::NoAssertions); } // !! turns the masked bits into a bool
+    bool Config::warnAboutNoTests() const              { return !!(m_data.warnings & WarnAbout::NoTests); }
+    ShowDurations::OrNot Config::showDurations() const { return m_data.showDurations; }
+    double Config::minDuration() const                 { return m_data.minDuration; }
+    RunTests::InWhatOrder Config::runOrder() const     { return m_data.runOrder; }
+    unsigned int Config::rngSeed() const               { return m_data.rngSeed; }
+    UseColour::YesOrNo Config::useColour() const       { return m_data.useColour; }
+    bool Config::shouldDebugBreak() const              { return m_data.shouldDebugBreak; }
+    int Config::abortAfter() const                     { return m_data.abortAfter; }
+    bool Config::showInvisibles() const                { return m_data.showInvisibles; }
+    Verbosity Config::verbosity() const                { return m_data.verbosity; }
+
+    bool Config::benchmarkNoAnalysis() const                      { return m_data.benchmarkNoAnalysis; } // benchmark options, forwarded from m_data
+    int Config::benchmarkSamples() const                          { return m_data.benchmarkSamples; }
+    double Config::benchmarkConfidenceInterval() const            { return m_data.benchmarkConfidenceInterval; }
+    unsigned int Config::benchmarkResamples() const               { return m_data.benchmarkResamples; }
+    std::chrono::milliseconds Config::benchmarkWarmupTime() const { return std::chrono::milliseconds(m_data.benchmarkWarmupTime); } // stored count wrapped as milliseconds
+
+    IStream const* Config::openStream() { // Creates the stream for m_data.outputFilename via Catch::makeStream.
+        return Catch::makeStream(m_data.outputFilename);
+    }
+
+} // end namespace Catch
+// end catch_config.cpp
+// start catch_console_colour.cpp
+
+#if defined(__clang__)
+#    pragma clang diagnostic push
+#    pragma clang diagnostic ignored "-Wexit-time-destructors"
+#endif
+
+// start catch_errno_guard.h
+
+namespace Catch {
+
+    class ErrnoGuard { // Scope guard around errno; the members are defined in catch_errno_guard.cpp.
+    public:
+        ErrnoGuard();  // records the current errno
+        ~ErrnoGuard(); // writes the recorded value back to errno
+    private:
+        int m_oldErrno; // errno value captured at construction
+    };
+
+}
+
+// end catch_errno_guard.h
+#include <sstream>
+
+namespace Catch {
+    namespace {
+
+        struct IColourImpl { // Interface implemented by every console-colour backend in this file.
+            virtual ~IColourImpl() = default;
+            virtual void use( Colour::Code _colourCode ) = 0; // apply the given colour code
+        };
+
+        struct NoColourImpl : IColourImpl { // Backend that ignores every colour request.
+            void use( Colour::Code ) override {} // intentionally does nothing
+
+            static IColourImpl* instance() { // shared instance, a function-local static
+                static NoColourImpl s_instance;
+                return &s_instance;
+            }
+        };
+
+    } // anon namespace
+} // namespace Catch
+
+#if !defined( CATCH_CONFIG_COLOUR_NONE ) && !defined( CATCH_CONFIG_COLOUR_WINDOWS ) && !defined( CATCH_CONFIG_COLOUR_ANSI )
+#   ifdef CATCH_PLATFORM_WINDOWS
+#       define CATCH_CONFIG_COLOUR_WINDOWS
+#   else
+#       define CATCH_CONFIG_COLOUR_ANSI
+#   endif
+#endif
+
+#if defined ( CATCH_CONFIG_COLOUR_WINDOWS ) /////////////////////////////////////////
+
+namespace Catch {
+namespace {
+
+    class Win32ColourImpl : public IColourImpl { // Colour backend that sets Win32 console text attributes on the stdout handle.
+    public:
+        Win32ColourImpl() : stdoutHandle( GetStdHandle(STD_OUTPUT_HANDLE) )
+        { // Split the console's current attributes into a part without BACKGROUND_* bits and a part without FOREGROUND_* bits.
+            CONSOLE_SCREEN_BUFFER_INFO csbiInfo;
+            GetConsoleScreenBufferInfo( stdoutHandle, &csbiInfo );
+            originalForegroundAttributes = csbiInfo.wAttributes & ~( BACKGROUND_GREEN | BACKGROUND_RED | BACKGROUND_BLUE | BACKGROUND_INTENSITY );
+            originalBackgroundAttributes = csbiInfo.wAttributes & ~( FOREGROUND_GREEN | FOREGROUND_RED | FOREGROUND_BLUE | FOREGROUND_INTENSITY );
+        }
+
+        void use( Colour::Code _colourCode ) override { // Maps a Colour::Code onto FOREGROUND_* bits.
+            switch( _colourCode ) {
+                case Colour::None:      return setTextAttribute( originalForegroundAttributes ); // back to the attributes seen at construction
+                case Colour::White:     return setTextAttribute( FOREGROUND_GREEN | FOREGROUND_RED | FOREGROUND_BLUE );
+                case Colour::Red:       return setTextAttribute( FOREGROUND_RED );
+                case Colour::Green:     return setTextAttribute( FOREGROUND_GREEN );
+                case Colour::Blue:      return setTextAttribute( FOREGROUND_BLUE );
+                case Colour::Cyan:      return setTextAttribute( FOREGROUND_BLUE | FOREGROUND_GREEN );
+                case Colour::Yellow:    return setTextAttribute( FOREGROUND_RED | FOREGROUND_GREEN );
+                case Colour::Grey:      return setTextAttribute( 0 ); // no foreground bits set
+
+                case Colour::LightGrey:     return setTextAttribute( FOREGROUND_INTENSITY );
+                case Colour::BrightRed:     return setTextAttribute( FOREGROUND_INTENSITY | FOREGROUND_RED );
+                case Colour::BrightGreen:   return setTextAttribute( FOREGROUND_INTENSITY | FOREGROUND_GREEN );
+                case Colour::BrightWhite:   return setTextAttribute( FOREGROUND_INTENSITY | FOREGROUND_GREEN | FOREGROUND_RED | FOREGROUND_BLUE );
+                case Colour::BrightYellow:  return setTextAttribute( FOREGROUND_INTENSITY | FOREGROUND_RED | FOREGROUND_GREEN );
+
+                case Colour::Bright: CATCH_INTERNAL_ERROR( "not a colour" );
+
+                default:
+                    CATCH_ERROR( "Unknown colour requested" );
+            }
+        }
+
+    private:
+        void setTextAttribute( WORD _textAttribute ) { // always ORs in the saved background part, so only the foreground bits vary
+            SetConsoleTextAttribute( stdoutHandle, _textAttribute | originalBackgroundAttributes );
+        }
+        HANDLE stdoutHandle;
+        WORD originalForegroundAttributes; // attributes at construction, BACKGROUND_* bits cleared
+        WORD originalBackgroundAttributes; // attributes at construction, FOREGROUND_* bits cleared
+    };
+
+    IColourImpl* platformColourInstance() {
+        // Constructed on the first call, even if colour ends up disabled.
+        static Win32ColourImpl s_win32Impl;
+        IConfigPtr cfg = getCurrentContext().getConfig();
+
+        // Without a config the mode is Auto, and Auto is treated as Yes here.
+        UseColour::YesOrNo mode = UseColour::Auto;
+        if( cfg )
+            mode = cfg->useColour();
+        if( mode == UseColour::Auto || mode == UseColour::Yes )
+            return &s_win32Impl;
+        return NoColourImpl::instance();
+    }
+
+} // end anon namespace
+} // end namespace Catch
+
+#elif defined( CATCH_CONFIG_COLOUR_ANSI ) //////////////////////////////////////
+
+#include <unistd.h>
+
+namespace Catch {
+namespace {
+
+    // use POSIX/ ANSI console terminal codes
+    // Thanks to Adam Strzelecki for original contribution
+    // (http://github.com/nanoant)
+    // https://github.com/philsquared/Catch/pull/131
+    class PosixColourImpl : public IColourImpl { // Colour backend that writes ESC followed by an ANSI colour sequence.
+    public:
+        void use( Colour::Code _colourCode ) override { // maps a Colour::Code to the text that follows ESC
+            switch( _colourCode ) {
+                case Colour::None:
+                case Colour::White:     return setColour( "[0m" );
+                case Colour::Red:       return setColour( "[0;31m" );
+                case Colour::Green:     return setColour( "[0;32m" );
+                case Colour::Blue:      return setColour( "[0;34m" );
+                case Colour::Cyan:      return setColour( "[0;36m" );
+                case Colour::Yellow:    return setColour( "[0;33m" );
+                case Colour::Grey:      return setColour( "[1;30m" );
+
+                case Colour::LightGrey:     return setColour( "[0;37m" );
+                case Colour::BrightRed:     return setColour( "[1;31m" );
+                case Colour::BrightGreen:   return setColour( "[1;32m" );
+                case Colour::BrightWhite:   return setColour( "[1;37m" );
+                case Colour::BrightYellow:  return setColour( "[1;33m" );
+
+                case Colour::Bright: CATCH_INTERNAL_ERROR( "not a colour" );
+                default: CATCH_INTERNAL_ERROR( "Unknown colour requested" );
+            }
+        }
+        static IColourImpl* instance() { // shared instance, a function-local static
+            static PosixColourImpl s_instance;
+            return &s_instance;
+        }
+
+    private:
+        void setColour( const char* _escapeCode ) { // platformColourInstance() tolerates a null config, so fall back to Catch::cout() rather than dereference it
+            IConfigPtr config = getCurrentContext().getConfig();
+            ( config ? config->stream() : Catch::cout() ) << '\033' << _escapeCode;
+        }
+    };
+
+    bool useColourOnPlatform() { // Decides the UseColour::Auto case: colour only when stdout is a terminal.
+        return
+#if defined(CATCH_PLATFORM_MAC) || defined(CATCH_PLATFORM_IPHONE)
+            !isDebuggerActive() && // on Apple platforms, additionally require that no debugger is attached
+#endif
+#if !(defined(__DJGPP__) && defined(__STRICT_ANSI__))
+            isatty(STDOUT_FILENO)
+#else
+            false // DJGPP in strict-ANSI mode: never enable colour automatically
+#endif
+            ;
+    }
+    IColourImpl* platformColourInstance() { // Chooses between the ANSI backend and the no-op backend.
+        ErrnoGuard guard; // restores errno when this function returns
+        IConfigPtr config = getCurrentContext().getConfig();
+        UseColour::YesOrNo colourMode = config
+            ? config->useColour()
+            : UseColour::Auto; // no config available: behave as Auto
+        if( colourMode == UseColour::Auto )
+            colourMode = useColourOnPlatform() // Auto defers to the platform check
+                ? UseColour::Yes
+                : UseColour::No;
+        return colourMode == UseColour::Yes
+            ? PosixColourImpl::instance()
+            : NoColourImpl::instance();
+    }
+
+} // end anon namespace
+} // end namespace Catch
+
+#else  // not Windows or ANSI ///////////////////////////////////////////////
+
+namespace Catch {
+
+    static IColourImpl* platformColourInstance() { return NoColourImpl::instance(); } // neither Windows nor ANSI colour selected: always the no-op backend
+
+} // end namespace Catch
+
+#endif // Windows/ ANSI/ None
+
+namespace Catch {
+
+    Colour::Colour( Code _colourCode ) { use( _colourCode ); } // applies the colour immediately
+    Colour::Colour( Colour&& other ) noexcept { // the source is flagged as moved so its destructor will not reset the colour
+        m_moved = other.m_moved;
+        other.m_moved = true;
+    }
+    Colour& Colour::operator=( Colour&& other ) noexcept { // same flag hand-over as the move constructor
+        m_moved = other.m_moved;
+        other.m_moved  = true;
+        return *this;
+    }
+
+    Colour::~Colour(){ if( !m_moved ) use( None ); } // switch back to None unless this object was moved from
+
+    void Colour::use( Code _colourCode ) { // forwards to the backend, which is selected once on the first call
+        static IColourImpl* impl = platformColourInstance();
+        // Strictly speaking, this cannot possibly happen.
+        // However, under some conditions it does happen (see #1626),
+        // and this change is small enough that we can let practicality
+        // triumph over purity in this case.
+        if (impl != nullptr) {
+            impl->use( _colourCode );
+        }
+    }
+
+    std::ostream& operator << ( std::ostream& os, Colour const& ) { // writes nothing; returns the stream unchanged
+        return os;
+    }
+
+} // end namespace Catch
+
+#if defined(__clang__)
+#    pragma clang diagnostic pop
+#endif
+
+// end catch_console_colour.cpp
+// start catch_context.cpp
+
+namespace Catch {
+
+    class Context : public IMutableContext, NonCopyable { // Concrete context storing the current config, runner and result capture.
+
+    public: // IContext
+        IResultCapture* getResultCapture() override {
+            return m_resultCapture;
+        }
+        IRunner* getRunner() override {
+            return m_runner;
+        }
+
+        IConfigPtr const& getConfig() const override { // may hold a null pointer until setConfig() is called
+            return m_config;
+        }
+
+        ~Context() override;
+
+    public: // IMutableContext
+        void setResultCapture( IResultCapture* resultCapture ) override { // stores the raw pointer only
+            m_resultCapture = resultCapture;
+        }
+        void setRunner( IRunner* runner ) override { // stores the raw pointer only
+            m_runner = runner;
+        }
+        void setConfig( IConfigPtr const& config ) override {
+            m_config = config;
+        }
+
+        friend IMutableContext& getCurrentMutableContext();
+
+    private:
+        IConfigPtr m_config;
+        IRunner* m_runner = nullptr;
+        IResultCapture* m_resultCapture = nullptr;
+    };
+
+    IMutableContext *IMutableContext::currentContext = nullptr; // null until createContext() runs
+
+    void IMutableContext::createContext()
+    { // heap-allocates the Context; cleanUpContext() deletes it
+        currentContext = new Context();
+    }
+
+    void cleanUpContext() { // deletes the current context and clears the pointer
+        delete IMutableContext::currentContext;
+        IMutableContext::currentContext = nullptr;
+    }
+    IContext::~IContext() = default; // out-of-line definitions of the defaulted destructors
+    IMutableContext::~IMutableContext() = default;
+    Context::~Context() = default;
+
+    SimplePcg32& rng() { // returns the single shared generator, a function-local static
+        static SimplePcg32 s_rng;
+        return s_rng;
+    }
+
+}
+// end catch_context.cpp
+// start catch_debug_console.cpp
+
+// start catch_debug_console.h
+
+#include <string>
+
+namespace Catch {
+    void writeToDebugConsole( std::string const& text );
+}
+
+// end catch_debug_console.h
+#if defined(CATCH_CONFIG_ANDROID_LOGWRITE)
+#include <android/log.h>
+
+    namespace Catch {
+        void writeToDebugConsole( std::string const& text ) { // Android: debug-priority log entry tagged "Catch"
+            __android_log_write( ANDROID_LOG_DEBUG, "Catch", text.c_str() );
+        }
+    }
+
+#elif defined(CATCH_PLATFORM_WINDOWS)
+
+    namespace Catch {
+        void writeToDebugConsole( std::string const& text ) { // Windows: passes the text to OutputDebugStringA
+            ::OutputDebugStringA( text.c_str() );
+        }
+    }
+
+#else
+
+    namespace Catch {
+        void writeToDebugConsole( std::string const& text ) { // other platforms: plain write to Catch::cout()
+            // !TBD: Need a version for Mac/ XCode and other IDEs
+            Catch::cout() << text;
+        }
+    }
+
+#endif // Platform
+// end catch_debug_console.cpp
+// start catch_debugger.cpp
+
+#if defined(CATCH_PLATFORM_MAC) || defined(CATCH_PLATFORM_IPHONE)
+
+#  include <cassert>
+#  include <sys/types.h>
+#  include <unistd.h>
+#  include <cstddef>
+#  include <ostream>
+
+#ifdef __apple_build_version__
+    // These headers will only compile with AppleClang (XCode)
+    // For other compilers (Clang, GCC, ... ) we need to exclude them
+#  include <sys/sysctl.h>
+#endif
+
+    namespace Catch {
+        #ifdef __apple_build_version__
+        // The following function is taken directly from the following technical note:
+        // https://developer.apple.com/library/archive/qa/qa1361/_index.html
+
+        // Returns true if the current process is being debugged (either
+        // running under the debugger or has a debugger attached post facto).
+        bool isDebuggerActive(){ // AppleClang build: asks sysctl for this process's flags and tests P_TRACED
+            int                 mib[4];
+            struct kinfo_proc   info;
+            std::size_t         size;
+
+            // Initialize the flags so that, if sysctl fails for some bizarre
+            // reason, we get a predictable result.
+
+            info.kp_proc.p_flag = 0;
+
+            // Initialize mib, which tells sysctl the info we want, in this case
+            // we're looking for information about a specific process ID.
+
+            mib[0] = CTL_KERN;
+            mib[1] = KERN_PROC;
+            mib[2] = KERN_PROC_PID;
+            mib[3] = getpid();
+
+            // Call sysctl.
+
+            size = sizeof(info);
+            if( sysctl(mib, sizeof(mib) / sizeof(*mib), &info, &size, nullptr, 0) != 0 ) {
+                Catch::cerr() << "\n** Call to sysctl failed - unable to determine if debugger is active **\n" << std::endl;
+                return false; // a failed query is reported as "no debugger"
+            }
+
+            // We're being debugged if the P_TRACED flag is set.
+
+            return ( (info.kp_proc.p_flag & P_TRACED) != 0 );
+        }
+        #else
+        bool isDebuggerActive() { // non-AppleClang build on Apple platforms: always reports no debugger
+            // We need to find another way to determine this for non-appleclang compilers on macOS
+            return false;
+        }
+        #endif
+    } // namespace Catch
+
+#elif defined(CATCH_PLATFORM_LINUX)
+    #include <fstream>
+    #include <string>
+
+    namespace Catch{
+        // The standard POSIX way of detecting a debugger is to attempt to
+        // ptrace() the process, but this needs to be done from a child and not
+        // this process itself to still allow attaching to this process later
+        // if wanted, so is rather heavy. Under Linux we have the PID of the
+        // "debugger" (which doesn't need to be gdb, of course, it could also
+        // be strace, for example) in /proc/$PID/status, so just get it from
+        // there instead.
+        bool isDebuggerActive(){ // Linux: reads the TracerPid line of /proc/self/status
+            // Libstdc++ has a bug, where std::ifstream sets errno to 0
+            // This way our users can properly assert over errno values
+            ErrnoGuard guard;
+            std::ifstream in("/proc/self/status");
+            for( std::string line; std::getline(in, line); ) {
+                static const int PREFIX_LEN = 11; // length of "TracerPid:\t"
+                if( line.compare(0, PREFIX_LEN, "TracerPid:\t") == 0 ) {
+                    // We're traced if the PID is not 0 and no other PID starts
+                    // with 0 digit, so it's enough to check for just a single
+                    // character.
+                    return line.length() > PREFIX_LEN && line[PREFIX_LEN] != '0';
+                }
+            }
+
+            return false; // no TracerPid line found (or the file could not be read)
+        }
+    } // namespace Catch
+#elif defined(_MSC_VER)
+    extern "C" __declspec(dllimport) int __stdcall IsDebuggerPresent(); // declared here directly rather than through a header
+    namespace Catch {
+        bool isDebuggerActive() { // MSVC: forwards to IsDebuggerPresent()
+            return IsDebuggerPresent() != 0;
+        }
+    }
+#elif defined(__MINGW32__)
+    extern "C" __declspec(dllimport) int __stdcall IsDebuggerPresent();
+    namespace Catch {
+        bool isDebuggerActive() { // MinGW: same implementation as the MSVC branch
+            return IsDebuggerPresent() != 0;
+        }
+    }
+#else
+    namespace Catch {
+       bool isDebuggerActive() { return false; } // no detection available on this platform
+    }
+#endif // Platform
+// end catch_debugger.cpp
+// start catch_decomposer.cpp
+
+namespace Catch {
+
+    ITransientExpression::~ITransientExpression() = default;
+
+    void formatReconstructedExpression( std::ostream &os, std::string const& lhs, StringRef op, std::string const& rhs ) {
+        // The one-line form requires short operands (combined size under 40) free of newlines.
+        bool const fitsOnOneLine = lhs.size() + rhs.size() < 40
+            && lhs.find('\n') == std::string::npos
+            && rhs.find('\n') == std::string::npos;
+        char const* const separator = fitsOnOneLine ? " " : "\n";
+        os << lhs << separator << op << separator << rhs;
+    }
+}
+// end catch_decomposer.cpp
+// start catch_enforce.cpp
+
+#include <stdexcept>
+
+namespace Catch {
+#if defined(CATCH_CONFIG_DISABLE_EXCEPTIONS) && !defined(CATCH_CONFIG_DISABLE_EXCEPTIONS_CUSTOM_HANDLER)
+    [[noreturn]]
+    void throw_exception(std::exception const& e) { // exceptions disabled and no custom handler: print the message, then terminate
+        Catch::cerr() << "Catch will terminate because it needed to throw an exception.\n"
+                      << "The message was: " << e.what() << '\n';
+        std::terminate();
+    }
+#endif
+
+    [[noreturn]]
+    void throw_logic_error(std::string const& msg) { // passes std::logic_error(msg) to throw_exception
+        throw_exception(std::logic_error(msg));
+    }
+
+    [[noreturn]]
+    void throw_domain_error(std::string const& msg) { // passes std::domain_error(msg) to throw_exception
+        throw_exception(std::domain_error(msg));
+    }
+
+    [[noreturn]]
+    void throw_runtime_error(std::string const& msg) { // passes std::runtime_error(msg) to throw_exception
+        throw_exception(std::runtime_error(msg));
+    }
+
+} // namespace Catch
+// end catch_enforce.cpp
+// start catch_enum_values_registry.cpp
+// start catch_enum_values_registry.h
+
+#include <vector>
+#include <memory>
+
+namespace Catch {
+
+    namespace Detail {
+
+        std::unique_ptr<EnumInfo> makeEnumInfo( StringRef enumName, StringRef allValueNames, std::vector<int> const& values );
+
+        class EnumValuesRegistry : public IMutableEnumValuesRegistry { // Owns the EnumInfo objects created by registerEnum.
+
+            std::vector<std::unique_ptr<EnumInfo>> m_enumInfos; // one entry per registerEnum call
+
+            EnumInfo const& registerEnum( StringRef enumName, StringRef allEnums, std::vector<int> const& values) override; // no access specifier precedes this, so it is private in this class
+        };
+
+        std::vector<StringRef> parseEnums( StringRef enums );
+
+    } // Detail
+
+} // Catch
+
+// end catch_enum_values_registry.h
+
+#include <map>
+#include <cassert>
+
+namespace Catch {
+
+    IMutableEnumValuesRegistry::~IMutableEnumValuesRegistry() {} // out-of-line empty destructor
+
+    namespace Detail {
+
+        namespace {
+            // Strips any scope qualification from an enum instance name,
+            // e.g. Bikeshed::Colour::Blue yields Blue (the text after the last ':').
+            StringRef extractInstanceName(StringRef enumInstance) {
+                size_t const len = enumInstance.size();
+                size_t pos = len;
+                for (; pos > 0; --pos) {
+                    if (enumInstance[pos - 1] == ':') { break; }
+                }
+                return enumInstance.substr(pos, len - pos);
+            }
+        }
+
+        std::vector<StringRef> parseEnums( StringRef enums ) {
+            // Split on commas, then reduce each piece to its trimmed, unqualified name.
+            std::vector<StringRef> names;
+            auto const pieces = splitStringRef( enums, ',' );
+            names.reserve( pieces.size() );
+            for( auto const& piece : pieces )
+                names.push_back( trim( extractInstanceName( piece ) ) );
+            return names;
+        }
+
+        EnumInfo::~EnumInfo() {}
+
+        StringRef EnumInfo::lookup( int value ) const { // linear scan over the (value, name) pairs; first match wins
+            for( auto const& entry : m_values ) {
+                if( entry.first != value ) { continue; }
+                return entry.second;
+            }
+            return "{** unexpected enum value **}"_sr;
+        }
+
+        std::unique_ptr<EnumInfo> makeEnumInfo( StringRef enumName, StringRef allValueNames, std::vector<int> const& values ) {
+            std::unique_ptr<EnumInfo> info( new EnumInfo );
+            info->m_name = enumName;
+            info->m_values.reserve( values.size() );
+
+            // Names come from the comma-separated list and must pair 1:1 with values.
+            const auto names = Catch::Detail::parseEnums( allValueNames );
+            assert( names.size() == values.size() );
+            for( std::size_t idx = 0; idx < values.size(); ++idx )
+                info->m_values.emplace_back( values[idx], names[idx] );
+
+            return info;
+        }
+
+        EnumInfo const& EnumValuesRegistry::registerEnum( StringRef enumName, StringRef allValueNames, std::vector<int> const& values ) { // builds the EnumInfo and stores it in the registry
+            m_enumInfos.push_back(makeEnumInfo(enumName, allValueNames, values));
+            return *m_enumInfos.back(); // refers to the heap object owned by the unique_ptr just stored
+        }
+
+    } // Detail
+} // Catch
+
+// end catch_enum_values_registry.cpp
+// start catch_errno_guard.cpp
+
+#include <cerrno>
+
+namespace Catch {
+        ErrnoGuard::ErrnoGuard():m_oldErrno(errno){} // remember errno as it is right now
+        ErrnoGuard::~ErrnoGuard() { errno = m_oldErrno; } // put the remembered value back
+}
+// end catch_errno_guard.cpp
+// start catch_exception_translator_registry.cpp
+
+// start catch_exception_translator_registry.h
+
+#include <vector>
+#include <string>
+#include <memory>
+
+namespace Catch {
+
+    class ExceptionTranslatorRegistry : public IExceptionTranslatorRegistry { // Holds the registered translators and turns the active exception into text.
+    public:
+        ~ExceptionTranslatorRegistry();
+        virtual void registerTranslator( const IExceptionTranslator* translator ); // the registry takes ownership of the pointer
+        std::string translateActiveException() const override;
+        std::string tryTranslators() const;
+
+    private:
+        std::vector<std::unique_ptr<IExceptionTranslator const>> m_translators; // kept in registration order
+    };
+}
+
+// end catch_exception_translator_registry.h
+#ifdef __OBJC__
+#import "Foundation/Foundation.h"
+#endif
+
+namespace Catch {
+
+    ExceptionTranslatorRegistry::~ExceptionTranslatorRegistry() { // nothing explicit to do; m_translators holds unique_ptrs
+    }
+
+    void ExceptionTranslatorRegistry::registerTranslator( const IExceptionTranslator* translator ) { // wraps the raw pointer in a unique_ptr and appends it
+        m_translators.push_back( std::unique_ptr<const IExceptionTranslator>( translator ) );
+    }
+
+#if !defined(CATCH_CONFIG_DISABLE_EXCEPTIONS)
+    std::string ExceptionTranslatorRegistry::translateActiveException() const { // Produces a message for the exception currently being handled.
+        try {
+#ifdef __OBJC__
+            // In Objective-C try objective-c exceptions first
+            @try {
+                return tryTranslators();
+            }
+            @catch (NSException *exception) {
+                return Catch::Detail::stringify( [exception description] );
+            }
+#else
+            // Compiling a mixed mode project with MSVC means that CLR
+            // exceptions will be caught in (...) as well. However, these
+            // do not fill-in std::current_exception and thus lead to crash
+            // when attempting rethrow.
+            // /EHa switch also causes structured exceptions to be caught
+            // here, but they fill-in current_exception properly, so
+            // at worst the output should be a little weird, instead of
+            // causing a crash.
+            if (std::current_exception() == nullptr) {
+                return "Non C++ exception. Possibly a CLR exception.";
+            }
+            return tryTranslators();
+#endif
+        }
+        catch( TestFailureException& ) { // not translated: passed on to the caller unchanged
+            std::rethrow_exception(std::current_exception());
+        }
+        catch( std::exception& ex ) { // fallbacks for exceptions that tryTranslators() let through
+            return ex.what();
+        }
+        catch( std::string& msg ) {
+            return msg;
+        }
+        catch( const char* msg ) {
+            return msg;
+        }
+        catch(...) {
+            return "Unknown exception";
+        }
+    }
+
+    std::string ExceptionTranslatorRegistry::tryTranslators() const {
+        if (!m_translators.empty()) { // the first translator receives the rest of the chain
+            return m_translators[0]->translate(m_translators.begin() + 1, m_translators.end());
+        }
+        // No translators: rethrow so the caller's catch clauses see the active exception.
+        std::rethrow_exception(std::current_exception());
+    }
+
+#else // ^^ Exceptions are enabled // Exceptions are disabled vv
+    std::string ExceptionTranslatorRegistry::translateActiveException() const { // exceptions disabled: calling this is reported as an internal error
+        CATCH_INTERNAL_ERROR("Attempted to translate active exception under CATCH_CONFIG_DISABLE_EXCEPTIONS!");
+    }
+
+    std::string ExceptionTranslatorRegistry::tryTranslators() const { // exceptions disabled: calling this is reported as an internal error
+        CATCH_INTERNAL_ERROR("Attempted to use exception translators under CATCH_CONFIG_DISABLE_EXCEPTIONS!");
+    }
+#endif
+
+}
+// end catch_exception_translator_registry.cpp
+// start catch_fatal_condition.cpp
+
+#if defined(__GNUC__)
+#    pragma GCC diagnostic push
+#    pragma GCC diagnostic ignored "-Wmissing-field-initializers"
+#endif
+
+#if defined( CATCH_CONFIG_WINDOWS_SEH ) || defined( CATCH_CONFIG_POSIX_SIGNALS )
+
+namespace {
+    // Report the error condition to the current context's result capture
+    void reportFatal( char const * const message ) {
+        Catch::getCurrentContext().getResultCapture()->handleFatalErrorCondition( message );
+    }
+}
+
+#endif // signals/SEH handling
+
+#if defined( CATCH_CONFIG_WINDOWS_SEH )
+
+namespace Catch {
+    struct SignalDefs { DWORD id; const char* name; }; // exception code paired with the message reported for it
+
+    // There is no 1-1 mapping between signals and windows exceptions.
+    // Windows can easily distinguish between SO and SigSegV,
+    // but SigInt, SigTerm, etc are handled differently.
+    static SignalDefs signalDefs[] = {
+        { static_cast<DWORD>(EXCEPTION_ILLEGAL_INSTRUCTION),  "SIGILL - Illegal instruction signal" },
+        { static_cast<DWORD>(EXCEPTION_STACK_OVERFLOW), "SIGSEGV - Stack overflow" },
+        { static_cast<DWORD>(EXCEPTION_ACCESS_VIOLATION), "SIGSEGV - Segmentation violation signal" },
+        { static_cast<DWORD>(EXCEPTION_INT_DIVIDE_BY_ZERO), "Divide by zero error" },
+    };
+
+    LONG CALLBACK FatalConditionHandler::handleVectoredException(PEXCEPTION_POINTERS ExceptionInfo) {
+        for (auto const& def : signalDefs) {
+            if (ExceptionInfo->ExceptionRecord->ExceptionCode == def.id) {
+                reportFatal(def.name); // reported, but the search below still continues
+            }
+        }
+        // If it's not an exception we care about, pass it along.
+        // This stops us from eating debugger breaks etc.
+        return EXCEPTION_CONTINUE_SEARCH;
+    }
+
+    FatalConditionHandler::FatalConditionHandler() {
+        isSet = true;
+        // 32k seems enough for Catch to handle stack overflow,
+        // but the value was found experimentally, so there is no strong guarantee
+        guaranteeSize = 32 * 1024;
+        exceptionHandlerHandle = nullptr;
+        // Register as first handler in current chain
+        exceptionHandlerHandle = AddVectoredExceptionHandler(1, handleVectoredException);
+        // Pass in guarantee size to be filled
+        SetThreadStackGuarantee(&guaranteeSize);
+    }
+
+    void FatalConditionHandler::reset() { // undoes the constructor once; later calls are no-ops
+        if (isSet) {
+            RemoveVectoredExceptionHandler(exceptionHandlerHandle);
+            SetThreadStackGuarantee(&guaranteeSize); // NOTE(review): presumably restores the previous guarantee written back by the first call - confirm against the API docs
+            exceptionHandlerHandle = nullptr;
+            isSet = false;
+        }
+    }
+
+    FatalConditionHandler::~FatalConditionHandler() {
+        reset();
+    }
+
+bool FatalConditionHandler::isSet = false;
+ULONG FatalConditionHandler::guaranteeSize = 0;
+PVOID FatalConditionHandler::exceptionHandlerHandle = nullptr;
+
+} // namespace Catch
+
+#elif defined( CATCH_CONFIG_POSIX_SIGNALS )
+
+namespace Catch {
+
+    struct SignalDefs { // signal number paired with the message reported for it
+        int id;
+        const char* name;
+    };
+
+    // 32kb for the alternate stack seems to be sufficient. However, this value
+    // is experimentally determined, so that's not guaranteed.
+    static constexpr std::size_t sigStackSize = 32768 >= MINSIGSTKSZ ? 32768 : MINSIGSTKSZ; // NOTE(review): needs MINSIGSTKSZ to be a constant expression; newer glibc can make it a runtime value - confirm for target toolchains
+
+    static SignalDefs signalDefs[] = { // signals for which the handler is installed
+        { SIGINT,  "SIGINT - Terminal interrupt signal" },
+        { SIGILL,  "SIGILL - Illegal instruction signal" },
+        { SIGFPE,  "SIGFPE - Floating point error signal" },
+        { SIGSEGV, "SIGSEGV - Segmentation violation signal" },
+        { SIGTERM, "SIGTERM - Termination request signal" },
+        { SIGABRT, "SIGABRT - Abort (abnormal termination) signal" }
+    };
+
+    void FatalConditionHandler::handleSignal( int sig ) {
+        // Find the description registered for this signal, if any.
+        SignalDefs const* match = nullptr;
+        for (auto const& def : signalDefs) {
+            if (def.id == sig) { match = &def; break; }
+        }
+        char const* const description = match ? match->name : "<unknown signal>";
+
+        reset();                  // put the previous handlers and stack back first
+        reportFatal(description); // then report through the result capture
+        raise( sig );             // finally raise the signal again
+    }
+
+    FatalConditionHandler::FatalConditionHandler() { // installs the alternate stack and handleSignal for every signalDefs entry
+        isSet = true;
+        stack_t sigStack;
+        sigStack.ss_sp = altStackMem;
+        sigStack.ss_size = sigStackSize;
+        sigStack.ss_flags = 0;
+        sigaltstack(&sigStack, &oldSigStack); // previous alternate stack is saved for reset()
+        struct sigaction sa = { };
+
+        sa.sa_handler = handleSignal;
+        sa.sa_flags = SA_ONSTACK; // run the handler on the alternate stack
+        for (std::size_t i = 0; i < sizeof(signalDefs)/sizeof(SignalDefs); ++i) {
+            sigaction(signalDefs[i].id, &sa, &oldSigActions[i]); // previous action is saved at the same index
+        }
+    }
+
+    FatalConditionHandler::~FatalConditionHandler() {
+        reset();
+    }
+
+    void FatalConditionHandler::reset() { // undoes the constructor once; later calls are no-ops
+        if( isSet ) {
+            // Set signals back to previous values -- hopefully nobody overwrote them in the meantime
+            for( std::size_t i = 0; i < sizeof(signalDefs)/sizeof(SignalDefs); ++i ) {
+                sigaction(signalDefs[i].id, &oldSigActions[i], nullptr);
+            }
+            // Return the old stack
+            sigaltstack(&oldSigStack, nullptr);
+            isSet = false;
+        }
+    }
+
+    bool FatalConditionHandler::isSet = false; // true between construction and reset()
+    struct sigaction FatalConditionHandler::oldSigActions[sizeof(signalDefs)/sizeof(SignalDefs)] = {}; // one saved action per signalDefs entry
+    stack_t FatalConditionHandler::oldSigStack = {};
+    char FatalConditionHandler::altStackMem[sigStackSize] = {}; // memory handed to sigaltstack as the alternate stack
+
+} // namespace Catch
+
+#else
+
+namespace Catch {
+    void FatalConditionHandler::reset() {} // neither SEH nor POSIX signal handling is enabled: nothing to undo
+}
+
+#endif // signals/SEH handling
+
+#if defined(__GNUC__)
+#    pragma GCC diagnostic pop
+#endif
+// end catch_fatal_condition.cpp
+// start catch_generators.cpp
+
+#include <limits>
+#include <set>
+
+namespace Catch {
+
+IGeneratorTracker::~IGeneratorTracker() {} // out-of-line empty destructor
+
+const char* GeneratorException::what() const noexcept { // returns the stored message pointer as-is
+    return m_msg;
+}
+
+namespace Generators {
+
+    GeneratorUntypedBase::~GeneratorUntypedBase() {} // out-of-line empty destructor
+
+    auto acquireGeneratorTracker( StringRef generatorName, SourceLineInfo const& lineInfo ) -> IGeneratorTracker& { // forwards to the current result capture
+        return getResultCapture().acquireGeneratorTracker( generatorName, lineInfo );
+    }
+
+} // namespace Generators
+} // namespace Catch
+// end catch_generators.cpp
+// start catch_interfaces_capture.cpp
+
+namespace Catch {
+    IResultCapture::~IResultCapture() = default; // out-of-line definition of the defaulted destructor
+}
+// end catch_interfaces_capture.cpp
+// start catch_interfaces_config.cpp
+
+namespace Catch {
+    IConfig::~IConfig() = default; // out-of-line definition of the defaulted destructor
+}
+// end catch_interfaces_config.cpp
+// start catch_interfaces_exception.cpp
+
+namespace Catch {
+    IExceptionTranslator::~IExceptionTranslator() = default; // out-of-line definitions of the defaulted destructors
+    IExceptionTranslatorRegistry::~IExceptionTranslatorRegistry() = default;
+}
+// end catch_interfaces_exception.cpp
+// start catch_interfaces_registry_hub.cpp
+
+namespace Catch {
+    IRegistryHub::~IRegistryHub() = default; // out-of-line definitions of the defaulted destructors
+    IMutableRegistryHub::~IMutableRegistryHub() = default;
+}
+// end catch_interfaces_registry_hub.cpp
+// start catch_interfaces_reporter.cpp
+
+// start catch_reporter_listening.h
+
+namespace Catch {
+
+    // An IStreamingReporter that holds a list of listeners plus a single
+    // reporter. The member definitions are outside this excerpt; presumably each
+    // event is relayed to the listeners and the reporter -- TODO confirm there.
+    class ListeningReporter : public IStreamingReporter {
+        using Reporters = std::vector<IStreamingReporterPtr>;
+        // Listeners registered through addListener().
+        Reporters m_listeners;
+        // The single wrapped reporter; null until addReporter() supplies one.
+        IStreamingReporterPtr m_reporter = nullptr;
+        // Value returned by getPreferences() (presumably -- definition not shown here).
+        ReporterPreferences m_preferences;
+
+    public:
+        ListeningReporter();
+
+        // Both take ownership of the passed pointer (rvalue reference parameter).
+        void addListener( IStreamingReporterPtr&& listener );
+        void addReporter( IStreamingReporterPtr&& reporter );
+
+    public: // IStreamingReporter
+
+        ReporterPreferences getPreferences() const override;
+
+        void noMatchingTestCases( std::string const& spec ) override;
+
+        void reportInvalidArguments(std::string const&arg) override;
+
+        // Static, so not part of the virtual interface overridden above and below.
+        static std::set<Verbosity> getSupportedVerbosities();
+
+        // Benchmark events exist only when benchmarking support is compiled in.
+#if defined(CATCH_CONFIG_ENABLE_BENCHMARKING)
+        void benchmarkPreparing(std::string const& name) override;
+        void benchmarkStarting( BenchmarkInfo const& benchmarkInfo ) override;
+        void benchmarkEnded( BenchmarkStats<> const& benchmarkStats ) override;
+        void benchmarkFailed(std::string const&) override;
+#endif // CATCH_CONFIG_ENABLE_BENCHMARKING
+
+        // "Starting" events, from the outermost scope (run) to the innermost (assertion).
+        void testRunStarting( TestRunInfo const& testRunInfo ) override;
+        void testGroupStarting( GroupInfo const& groupInfo ) override;
+        void testCaseStarting( TestCaseInfo const& testInfo ) override;
+        void sectionStarting( SectionInfo const& sectionInfo ) override;
+        void assertionStarting( AssertionInfo const& assertionInfo ) override;
+
+        // "Ended" events, from the innermost scope (assertion) to the outermost (run).
+        // The return value indicates if the messages buffer should be cleared:
+        bool assertionEnded( AssertionStats const& assertionStats ) override;
+        void sectionEnded( SectionStats const& sectionStats ) override;
+        void testCaseEnded( TestCaseStats const& testCaseStats ) override;
+        void testGroupEnded( TestGroupStats const& testGroupStats ) override;
+        void testRunEnded( TestRunStats const& testRunStats ) override;
+
+        void skipTest( TestCaseInfo const& testInfo ) override;
+        // Overrides IStreamingReporter::isMulti(), whose base version returns false.
+        bool isMulti() const override;
+
+    };
+
+} // end namespace Catch
+
+// end catch_reporter_listening.h
+namespace Catch {
+
+    // Builds a reporter config that writes to the stream exposed by the full config.
+    ReporterConfig::ReporterConfig( IConfigPtr const& _fullConfig )
+    :   m_stream( &_fullConfig->stream() ), m_fullConfig( _fullConfig ) {}
+
+    // Builds a reporter config that writes to a caller-supplied stream.
+    // Only the address of _stream is stored, so the stream must outlive this object.
+    ReporterConfig::ReporterConfig( IConfigPtr const& _fullConfig, std::ostream& _stream )
+    :   m_stream( &_stream ), m_fullConfig( _fullConfig ) {}
+
+    // Returns the output stream selected at construction.
+    std::ostream& ReporterConfig::stream() const { return *m_stream; }
+    // Returns the full configuration pointer (by value).
+    IConfigPtr ReporterConfig::fullConfig() const { return m_fullConfig; }
+
+    // Stores a copy of the run name.
+    TestRunInfo::TestRunInfo( std::string const& _name ) : name( _name ) {}
+
+    // Stores the group name, this group's index and the group count.
+    // Note the member receiving _groupsCount is spelled `groupsCounts`.
+    GroupInfo::GroupInfo(  std::string const& _name,
+                           std::size_t _groupIndex,
+                           std::size_t _groupsCount )
+    :   name( _name ),
+        groupIndex( _groupIndex ),
+        groupsCounts( _groupsCount )
+    {}
+
+     // Snapshots an assertion result together with the info messages and totals
+     // that were current when it was reported. When the result carries its own
+     // message, that message is appended to the copied message list.
+     AssertionStats::AssertionStats( AssertionResult const& _assertionResult,
+                                     std::vector<MessageInfo> const& _infoMessages,
+                                     Totals const& _totals )
+    :   assertionResult( _assertionResult ),
+        infoMessages( _infoMessages ),
+        totals( _totals )
+    {
+        // Explicitly carry the transient-expression pointer over from the source result.
+        auto& copiedExpr = assertionResult.m_resultData.lazyExpression;
+        copiedExpr.m_transientExpression = _assertionResult.m_resultData.lazyExpression.m_transientExpression;
+
+        // Nothing more to do unless the result has a message of its own.
+        if( !assertionResult.hasMessage() )
+            return;
+
+        // Copy message into messages list.
+        // !TBD This should have been done earlier, somewhere
+        MessageBuilder msgBuilder( assertionResult.getTestMacroName(),
+                                   assertionResult.getSourceInfo(),
+                                   assertionResult.getResultType() );
+        msgBuilder << assertionResult.getMessage();
+        msgBuilder.m_info.message = msgBuilder.m_stream.str();
+
+        infoMessages.push_back( msgBuilder.m_info );
+    }
+
+     AssertionStats::~AssertionStats() = default;
+
+    // Plain value holder: copies the section info, assertion counts, duration
+    // (in seconds) and the missing-assertions flag into the matching members.
+    SectionStats::SectionStats(  SectionInfo const& _sectionInfo,
+                                 Counts const& _assertions,
+                                 double _durationInSeconds,
+                                 bool _missingAssertions )
+    :   sectionInfo( _sectionInfo ),
+        assertions( _assertions ),
+        durationInSeconds( _durationInSeconds ),
+        missingAssertions( _missingAssertions )
+    {}
+
+    SectionStats::~SectionStats() = default;
+
+    // Plain value holder: copies the test case info, totals, the two text
+    // arguments (_stdOut / _stdErr) and the aborting flag into the matching members.
+    TestCaseStats::TestCaseStats(  TestCaseInfo const& _testInfo,
+                                   Totals const& _totals,
+                                   std::string const& _stdOut,
+                                   std::string const& _stdErr,
+                                   bool _aborting )
+    : testInfo( _testInfo ),
+        totals( _totals ),
+        stdOut( _stdOut ),
+        stdErr( _stdErr ),
+        aborting( _aborting )
+    {}
+
+    TestCaseStats::~TestCaseStats() = default;
+
+    // Full constructor: copies group info, totals and the aborting flag.
+    TestGroupStats::TestGroupStats( GroupInfo const& _groupInfo,
+                                    Totals const& _totals,
+                                    bool _aborting )
+    :   groupInfo( _groupInfo ),
+        totals( _totals ),
+        aborting( _aborting )
+    {}
+
+    // Short constructor: only the group info is supplied. `totals` is left to
+    // its own default initialisation and `aborting` is set to false.
+    TestGroupStats::TestGroupStats( GroupInfo const& _groupInfo )
+    :   groupInfo( _groupInfo ),
+        aborting( false )
+    {}
+
+    TestGroupStats::~TestGroupStats() = default;
+
+    // Plain value holder: copies run info, totals and the aborting flag.
+    TestRunStats::TestRunStats(   TestRunInfo const& _runInfo,
+                    Totals const& _totals,
+                    bool _aborting )
+    :   runInfo( _runInfo ),
+        totals( _totals ),
+        aborting( _aborting )
+    {}
+
+    TestRunStats::~TestRunStats() = default;
+
+    // Default implementation: the fatal-error notification is ignored.
+    void IStreamingReporter::fatalErrorEncountered( StringRef ) {}
+    // Default implementation: a reporter is not a "multi" reporter unless it overrides this.
+    bool IStreamingReporter::isMulti() const { return false; }
+
+    // Defaulted destructors for the reporter factory/registry interfaces.
+    IReporterFactory::~IReporterFactory() = default;
+    IReporterRegistry::~IReporterRegistry() = default;
+
+} // end namespace Catch
+// end catch_interfaces_reporter.cpp
+// start catch_interfaces_runner.cpp
+
+namespace Catch {
+    // Defaulted destructor, defined out of line in this translation unit.
+    IRunner::~IRunner() = default;
+}
+// end catch_interfaces_runner.cpp
+// start catch_interfaces_testcase.cpp
+
+namespace Catch {
+    // Defaulted destructors for the test invoker and test case registry
+    // interfaces, defined out of line in this translation unit.
+    ITestInvoker::~ITestInvoker() = default;
+    ITestCaseRegistry::~ITestCaseRegistry() = default;
+}
+// end catch_interfaces_testcase.cpp
+// start catch_leak_detector.cpp
+
+// With CATCH_CONFIG_WINDOWS_CRTDBG defined, the constructor configures the
+// MSVC debug CRT; otherwise it is an empty stub.
+#ifdef CATCH_CONFIG_WINDOWS_CRTDBG
+#include <crtdbg.h>
+
+namespace Catch {
+
+    LeakDetector::LeakDetector() {
+        // Query the current debug-heap flags, then add the leak-check and
+        // allocation-tracking bits before writing them back.
+        int flag = _CrtSetDbgFlag(_CRTDBG_REPORT_FLAG);
+        flag |= _CRTDBG_LEAK_CHECK_DF;
+        flag |= _CRTDBG_ALLOC_MEM_DF;
+        _CrtSetDbgFlag(flag);
+        // Send _CRT_WARN reports to both a file handle and the debugger output;
+        // the file handle is stderr.
+        _CrtSetReportMode(_CRT_WARN, _CRTDBG_MODE_FILE | _CRTDBG_MODE_DEBUG);
+        _CrtSetReportFile(_CRT_WARN, _CRTDBG_FILE_STDERR);
+        // Change this to leaking allocation's number to break there
+        _CrtSetBreakAlloc(-1);
+    }
+}
+
+#else
+
+    // No CRT debug support requested: nothing to set up.
+    Catch::LeakDetector::LeakDetector() {}
+
+#endif
+
+// Defined for both configurations: calls Catch::cleanUp() when the detector
+// object is destroyed.
+Catch::LeakDetector::~LeakDetector() {
+    Catch::cleanUp();
+}
+// end catch_leak_detector.cpp
+// start catch_list.cpp
+
+// start catch_list.h
+
+#include <set>
+
+namespace Catch {
+
+    // Prints the matching test cases (with details); returns how many were listed.
+    std::size_t listTests( Config const& config );
+
+    // Prints one test case name per line; returns how many were listed.
+    std::size_t listTestsNamesOnly( Config const& config );
+
+    // Bookkeeping for one tag while listing: every spelling seen and a use count.
+    struct TagInfo {
+        // Records one more use of the tag under the given spelling.
+        void add( std::string const& spelling );
+        // Returns all recorded spellings, each wrapped in square brackets.
+        std::string all() const;
+
+        std::set<std::string> spellings;
+        std::size_t count = 0;
+    };
+
+    // Prints the tags used by the matching test cases; returns the number of distinct tags.
+    std::size_t listTags( Config const& config );
+
+    // Prints the registered reporters; returns how many there are.
+    std::size_t listReporters();
+
+    // Runs every listing requested by the config. The result stays empty when
+    // no listing was requested, otherwise it holds the summed counts.
+    Option<std::size_t> list( std::shared_ptr<Config> const& config );
+
+} // end namespace Catch
+
+// end catch_list.h
+// start catch_text.h
+
+namespace Catch {
+    // Makes the clara::TextFlow names (e.g. Column, used by the listing code)
+    // visible inside namespace Catch.
+    using namespace clara::TextFlow;
+}
+
+// end catch_text.h
+#include <limits>
+#include <algorithm>
+#include <iomanip>
+
+namespace Catch {
+
+    // Prints every test case selected by the config's test spec: the name,
+    // at high verbosity also the source location and description, and the
+    // tags when there are any. Hidden tests are printed in the secondary
+    // text colour. Returns the number of test cases printed.
+    std::size_t listTests( Config const& config ) {
+        TestSpec const& spec = config.testSpec();
+        bool const filtered = config.hasTestFilters();
+
+        Catch::cout() << ( filtered ? "Matching test cases:\n"
+                                    : "All available test cases:\n" );
+
+        auto selected = filterTests( getAllTestCasesSorted( config ), spec, config );
+        for( auto const& test : selected ) {
+            // The colour applies for the rest of this iteration (scoped guard).
+            Colour colourGuard( test.isHidden() ? Colour::SecondaryText : Colour::None );
+
+            Catch::cout() << Column( test.name ).initialIndent( 2 ).indent( 4 ) << "\n";
+
+            if( config.verbosity() >= Verbosity::High ) {
+                Catch::cout() << Column( Catch::Detail::stringify( test.lineInfo ) ).indent(4) << std::endl;
+                std::string text = test.description;
+                if( text.empty() )
+                    text = "(NO DESCRIPTION)";
+                Catch::cout() << Column( text ).indent(4) << std::endl;
+            }
+
+            if( !test.tags.empty() )
+                Catch::cout() << Column( test.tagsAsString() ).indent( 6 ) << "\n";
+        }
+
+        // Summary line; the wording depends on whether a filter was active.
+        Catch::cout() << pluralise( selected.size(), filtered ? "matching test case" : "test case" )
+                      << '\n' << std::endl;
+        return selected.size();
+    }
+
+    // Prints one selected test case name per line. A name that starts with
+    // '#' is wrapped in double quotes, and at high verbosity a tab, '@' and
+    // the source location follow the name. Returns the number of names printed.
+    std::size_t listTestsNamesOnly( Config const& config ) {
+        TestSpec const& spec = config.testSpec();
+        std::vector<TestCase> selected = filterTests( getAllTestCasesSorted( config ), spec, config );
+
+        std::size_t printed = 0;
+        for( auto const& test : selected ) {
+            ++printed;
+
+            bool const quoteName = startsWith( test.name, '#' );
+            if( quoteName )
+                Catch::cout() << '"';
+            Catch::cout() << test.name;
+            if( quoteName )
+                Catch::cout() << '"';
+
+            if ( config.verbosity() >= Verbosity::High )
+                Catch::cout() << "\t@" << test.lineInfo;
+            Catch::cout() << std::endl;
+        }
+        return printed;
+    }
+
+    // Counts one more use of the tag and remembers the spelling it was used
+    // with (the set keeps each distinct spelling once).
+    void TagInfo::add( std::string const& spelling ) {
+        count += 1;
+        spellings.insert( spelling );
+    }
+
+    // Concatenates every recorded spelling as "[spelling]", in set order.
+    std::string TagInfo::all() const {
+        // Pre-compute the final length: each spelling plus its two brackets.
+        size_t total = 0;
+        for (auto const& spelling : spellings)
+            total += 2 + spelling.size();
+
+        std::string joined;
+        joined.reserve(total);
+        for (auto const& spelling : spellings)
+            joined.append(1, '[').append(spelling).append(1, ']');
+        return joined;
+    }
+
+    // Prints the tags used by the selected test cases, grouped
+    // case-insensitively: one line per tag with its use count followed by
+    // every spelling seen. Returns the number of distinct (lower-cased) tags.
+    std::size_t listTags( Config const& config ) {
+        TestSpec const& spec = config.testSpec();
+        Catch::cout() << ( config.hasTestFilters() ? "Tags for matching test cases:\n"
+                                                   : "All available tags:\n" );
+
+        // Keyed by the lower-cased tag so differently-cased spellings share an entry.
+        std::map<std::string, TagInfo> byLowerCaseTag;
+
+        std::vector<TestCase> selected = filterTests( getAllTestCasesSorted( config ), spec, config );
+        for( auto const& test : selected ) {
+            for( auto const& tag : test.getTestCaseInfo().tags ) {
+                // operator[] creates a fresh TagInfo the first time a tag is seen.
+                byLowerCaseTag[ toLower( tag ) ].add( tag );
+            }
+        }
+
+        for( auto const& entry : byLowerCaseTag ) {
+            TagInfo const& info = entry.second;
+
+            ReusableStringStream prefixStream;
+            prefixStream << "  " << std::setw(2) << info.count << "  ";
+            auto prefix = prefixStream.str();
+
+            // Continuation lines are indented to line up after the count prefix.
+            auto spellingsColumn = Column( info.all() )
+                                       .initialIndent( 0 )
+                                       .indent( prefix.size() )
+                                       .width( CATCH_CONFIG_CONSOLE_WIDTH-10 );
+            Catch::cout() << prefix << spellingsColumn << '\n';
+        }
+        Catch::cout() << pluralise( byLowerCaseTag.size(), "tag" ) << '\n' << std::endl;
+        return byLowerCaseTag.size();
+    }
+
+    // Prints every registered reporter as "name:" followed by its
+    // description, with the description column aligned after the longest
+    // name. Returns the number of registered reporters.
+    std::size_t listReporters() {
+        Catch::cout() << "Available reporters:\n";
+        IReporterRegistry::FactoryMap const& factories = getRegistryHub().getReporterRegistry().getFactories();
+
+        // Longest reporter name decides the width of the name column.
+        std::size_t longestName = 0;
+        for( auto const& entry : factories ) {
+            if( entry.first.size() > longestName )
+                longestName = entry.first.size();
+        }
+
+        for( auto const& entry : factories ) {
+            auto nameColumn = Column( entry.first + ":" )
+                                  .indent(2)
+                                  .width( 5+longestName );
+            auto descriptionColumn = Column( entry.second->getDescription() )
+                                         .initialIndent(0)
+                                         .indent(2)
+                                         .width( CATCH_CONFIG_CONSOLE_WIDTH - longestName-8 );
+            // Adding two columns lays them out side by side.
+            Catch::cout() << nameColumn + descriptionColumn << "\n";
+        }
+        Catch::cout() << std::endl;
+        return factories.size();
+    }
+
+    // Installs `config` as the current config, then runs each listing it
+    // requests. The returned Option stays empty when nothing was requested;
+    // otherwise it holds the sum of the counts returned by the listings run.
+    Option<std::size_t> list( std::shared_ptr<Config> const& config ) {
+        Option<std::size_t> listedCount;
+        getCurrentMutableContext().setConfig( config );
+
+        // Adds one listing's count to the running total (treating "empty" as 0).
+        auto addToTotal = [&listedCount]( std::size_t listed ) {
+            listedCount = listedCount.valueOr(0) + listed;
+        };
+
+        if( config->listTests() )
+            addToTotal( listTests( *config ) );
+        if( config->listTestNamesOnly() )
+            addToTotal( listTestsNamesOnly( *config ) );
+        if( config->listTags() )
+            addToTotal( listTags( *config ) );
+        if( config->listReporters() )
+            addToTotal( listReporters() );
+        return listedCount;
+    }
+
+} // end namespace Catch
+// end catch_list.cpp
+// start catch_matchers.cpp
+
+namespace Catch {
+namespace Matchers {
+    namespace Impl {
+
+        // Returns the matcher's description, computing it through describe()
+        // only while the cached text is still empty.
+        std::string MatcherUntypedBase::toString() const {
+            if( !m_cachedToString.empty() )
+                return m_cachedToString;
+
+            m_cachedToString = describe();
+            return m_cachedToString;
+        }
+
+        MatcherUntypedBase::~MatcherUntypedBase() = default;
+
+    } // namespace Impl
+} // namespace Matchers
+
+// Make the matcher names reachable directly from namespace Catch.
+using namespace Matchers;
+using Matchers::Impl::MatcherBase;
+
+} // namespace Catch
+// end catch_matchers.cpp
+// start catch_matchers_exception.cpp
+
+namespace Catch {
+namespace Matchers {
+namespace Exception {
+
+// Matches when the exception's what() text compares equal to the stored message.
+bool ExceptionMessageMatcher::match(std::exception const& ex) const {
+    return ex.what() == m_message;
+}
+
+// Describes the matcher as: exception message matches "<message>".
+std::string ExceptionMessageMatcher::describe() const {
+    return "exception message matches \"" + m_message + "\"";
+}
+
+} // namespace Exception
+
+// Factory living in Catch::Matchers: builds a matcher for the given message.
+Exception::ExceptionMessageMatcher Message(std::string const& message) {
+    return Exception::ExceptionMessageMatcher(message);
+}
+
+} // namespace Matchers
+} // namespace Catch
+// end catch_matchers_exception.cpp
+// start catch_matchers_floating.cpp
+
+// start catch_polyfills.hpp
+
+namespace Catch {
+    // Catch's own isnan overloads for float and double; only declared here,
+    // the definitions are outside this excerpt.
+    bool isnan(float f);
+    bool isnan(double d);
+}
+
+// end catch_polyfills.hpp
+// start catch_to_string.hpp
+
+#include <string>
+
+namespace Catch {
+    // Converts a value to its string form. Uses std::to_string when
+    // CATCH_CONFIG_CPP11_TO_STRING is defined, and otherwise streams the value
+    // through a ReusableStringStream.
+    template <typename T>
+    std::string to_string(T const& t) {
+#if defined(CATCH_CONFIG_CPP11_TO_STRING)
+        return std::to_string(t);
+#else
+        ReusableStringStream rss;
+        rss << t;
+        return rss.str();
+#endif
+    }
+} // end namespace Catch
+
+// end catch_to_string.hpp
+#include <algorithm>
+#include <cmath>
+#include <cstdlib>
+#include <cstdint>
+#include <cstring>
+#include <sstream>
+#include <type_traits>
+#include <iomanip>
+#include <limits>
+
+namespace Catch {
+namespace {
+
+    // Reinterprets the object representation of a float as a 32-bit signed
+    // integer (byte-wise copy, no numeric conversion).
+    int32_t convert(float f) {
+        static_assert(sizeof(float) == sizeof(int32_t), "Important ULP matcher assumption violated");
+        int32_t bits;
+        // Same byte count as sizeof(f), as guaranteed by the static_assert above.
+        std::memcpy(&bits, &f, sizeof(bits));
+        return bits;
+    }
+
+    // Reinterprets the object representation of a double as a 64-bit signed
+    // integer (byte-wise copy, no numeric conversion).
+    int64_t convert(double d) {
+        static_assert(sizeof(double) == sizeof(int64_t), "Important ULP matcher assumption violated");
+        int64_t bits;
+        // Same byte count as sizeof(d), as guaranteed by the static_assert above.
+        std::memcpy(&bits, &d, sizeof(bits));
+        return bits;
+    }
+
+    // Returns true when lhs and rhs are at most maxUlpDiff apart, measured as
+    // the distance between their integer bit patterns. NaN never matches, and
+    // operands with different sign bits match only if they compare equal.
+    template <typename FP>
+    bool almostEqualUlps(FP lhs, FP rhs, uint64_t maxUlpDiff) {
+        // Comparison with NaN should always be false; rule it out up front.
+        if (Catch::isnan(lhs) || Catch::isnan(rhs))
+            return false;
+
+        auto const lhsBits = convert(lhs);
+        auto const rhsBits = convert(rhs);
+
+        bool const lhsNegative = lhsBits < 0;
+        bool const rhsNegative = rhsBits < 0;
+        if (lhsNegative != rhsNegative) {
+            // Signs differ: only +0 and -0 can still be equal.
+            return lhs == rhs;
+        }
+
+        auto const distance = std::abs(lhsBits - rhsBits);
+        return static_cast<uint64_t>(distance) <= maxUlpDiff;
+    }
+
+// Only compiled when CATCH_CONFIG_GLOBAL_NEXTAFTER is defined: thin overloads
+// that forward to the global-namespace C functions instead of the std:: ones.
+#if defined(CATCH_CONFIG_GLOBAL_NEXTAFTER)
+
+    // float overload, forwards to ::nextafterf.
+    float nextafter(float x, float y) {
+        return ::nextafterf(x, y);
+    }
+
+    // double overload, forwards to ::nextafter.
+    double nextafter(double x, double y) {
+        return ::nextafter(x, y);
+    }
+
+#endif // ^^^ CATCH_CONFIG_GLOBAL_NEXTAFTER ^^^
+
+// Applies nextafter `steps` times, moving `start` one representable value
+// towards `direction` on every application, and returns the result.
+template <typename FP>
+FP step(FP start, FP direction, uint64_t steps) {
+    uint64_t remaining = steps;
+    while (remaining != 0) {
+#if defined(CATCH_CONFIG_GLOBAL_NEXTAFTER)
+        start = Catch::nextafter(start, direction);
+#else
+        start = std::nextafter(start, direction);
+#endif
+        --remaining;
+    }
+    return start;
+}
+
+// Equivalent to std::fabs(lhs - rhs) <= margin, but phrased with additions
+// only so that comparing against INFINITY still works.
+bool marginComparison(double lhs, double rhs, double margin) {
+    // lhs must not be more than `margin` below rhs ...
+    if (!(lhs + margin >= rhs))
+        return false;
+    // ... and rhs must not be more than `margin` below lhs.
+    return rhs + margin >= lhs;
+}
+
+// Streams `num` in scientific notation with max_digits10 - 1 digits after
+// the decimal point.
+template <typename FloatingPoint>
+void write(std::ostream& out, FloatingPoint num) {
+    const int digits = std::numeric_limits<FloatingPoint>::max_digits10 - 1;
+    out << std::scientific;
+    out << std::setprecision(digits);
+    out << num;
+}
+
+} // end anonymous namespace
+
+namespace Matchers {
+namespace Floating {
+
+    // Tags which floating-point width a WithinUlpsMatcher compares at.
+    enum class FloatingPointKind : uint8_t {
+        Float,
+        Double
+    };
+
+    // Stores the target and absolute margin; CATCH_ENFORCE rejects a negative margin.
+    WithinAbsMatcher::WithinAbsMatcher(double target, double margin)
+        :m_target{ target }, m_margin{ margin } {
+        CATCH_ENFORCE(margin >= 0, "Invalid margin: " << margin << '.'
+            << " Margin has to be non-negative.");
+    }
+
+    // Equivalent to std::fabs(matchee - m_target) <= m_margin, but written
+    // with additions only so that INFINITY can take part in the comparison.
+    bool WithinAbsMatcher::match(double const& matchee) const {
+        bool const notTooLow  = matchee + m_margin >= m_target;
+        bool const notTooHigh = m_target + m_margin >= matchee;
+        return notTooLow && notTooHigh;
+    }
+
+    // Describes the matcher as: is within <margin> of <target>.
+    std::string WithinAbsMatcher::describe() const {
+        std::string description = "is within ";
+        description += ::Catch::Detail::stringify(m_margin);
+        description += " of ";
+        description += ::Catch::Detail::stringify(m_target);
+        return description;
+    }
+
+    // Stores target, ULP budget and comparison width. For a float comparison
+    // CATCH_ENFORCE requires the ULP count to be below the uint32_t maximum;
+    // double comparisons accept any count.
+    WithinUlpsMatcher::WithinUlpsMatcher(double target, uint64_t ulps, FloatingPointKind baseType)
+        :m_target{ target }, m_ulps{ ulps }, m_type{ baseType } {
+        CATCH_ENFORCE(m_type == FloatingPointKind::Double
+                   || m_ulps < (std::numeric_limits<uint32_t>::max)(),
+            "Provided ULP is impossibly large for a float comparison.");
+    }
+
+#if defined(__clang__)
+#pragma clang diagnostic push
+// Clang <3.5 reports on the default branch in the switch below
+#pragma clang diagnostic ignored "-Wunreachable-code"
+#endif
+
+    // Runs the ULP comparison at the width selected by m_type. For Float both
+    // operands are narrowed to float first, so the distance is counted in
+    // float ULPs. Any other enum value is reported as an internal error.
+    bool WithinUlpsMatcher::match(double const& matchee) const {
+        switch (m_type) {
+        case FloatingPointKind::Float:
+            return almostEqualUlps<float>(static_cast<float>(matchee), static_cast<float>(m_target), m_ulps);
+        case FloatingPointKind::Double:
+            return almostEqualUlps<double>(matchee, m_target, m_ulps);
+        default:
+            CATCH_INTERNAL_ERROR( "Unknown FloatingPointKind value" );
+        }
+    }
+
+#if defined(__clang__)
+#pragma clang diagnostic pop
+#endif
+
+    // Describes the matcher as
+    //   is within <n> ULPs of <target> ([<lowest>, <highest>])
+    // where the bounds are the values m_ulps steps below and above the
+    // target. Float targets are printed at float precision with an 'f' suffix.
+    std::string WithinUlpsMatcher::describe() const {
+        std::stringstream ret;
+        ret << "is within " << m_ulps << " ULPs of ";
+
+        // Target value.
+        bool const targetIsFloat = (m_type == FloatingPointKind::Float);
+        if (!targetIsFloat) {
+            write(ret, m_target);
+        } else {
+            write(ret, static_cast<float>(m_target));
+            ret << 'f';
+        }
+
+        // Accepted range, stepped at the matching width.
+        ret << " ([";
+        bool const rangeIsDouble = (m_type == FloatingPointKind::Double);
+        if (!rangeIsDouble) {
+            // We have to cast INFINITY to float because of MinGW, see #1782
+            float const lowest  = step(static_cast<float>(m_target), static_cast<float>(-INFINITY), m_ulps);
+            float const highest = step(static_cast<float>(m_target), static_cast<float>( INFINITY), m_ulps);
+            write(ret, lowest);
+            ret << ", ";
+            write(ret, highest);
+        } else {
+            double const lowest  = step(m_target, static_cast<double>(-INFINITY), m_ulps);
+            double const highest = step(m_target, static_cast<double>( INFINITY), m_ulps);
+            write(ret, lowest);
+            ret << ", ";
+            write(ret, highest);
+        }
+        ret << "])";
+
+        return ret.str();
+    }
+
+    // Stores target and relative tolerance; CATCH_ENFORCE requires
+    // 0 <= epsilon < 1.
+    WithinRelMatcher::WithinRelMatcher(double target, double epsilon):
+        m_target(target),
+        m_epsilon(epsilon){
+        CATCH_ENFORCE(m_epsilon >= 0., "Relative comparison with epsilon <  0 does not make sense.");
+        CATCH_ENFORCE(m_epsilon  < 1., "Relative comparison with epsilon >= 1 does not make sense.");
+    }
+
+    // Accepts matchee when it lies within m_epsilon times the larger of
+    // |matchee| and |m_target| of the target. An infinite margin (which
+    // happens when either magnitude is infinite) is replaced by zero.
+    bool WithinRelMatcher::match(double const& matchee) const {
+        auto const largestMagnitude = (std::max)(std::fabs(matchee), std::fabs(m_target));
+        double margin = m_epsilon * largestMagnitude;
+        if (std::isinf(margin)) {
+            margin = 0;
+        }
+        return marginComparison(matchee, m_target, margin);
+    }
+
+    // Describes the matcher as: and <target> are within <epsilon*100>% of each other.
+    std::string WithinRelMatcher::describe() const {
+        Catch::ReusableStringStream text;
+        text << "and " << m_target;
+        text << " are within " << m_epsilon * 100.;
+        text << "% of each other";
+        return text.str();
+    }
+
+}// namespace Floating
+
+// ULP matcher for a double target (distance counted in double ULPs).
+Floating::WithinUlpsMatcher WithinULP(double target, uint64_t maxUlpDiff) {
+    return Floating::WithinUlpsMatcher(target, maxUlpDiff, Floating::FloatingPointKind::Double);
+}
+
+// ULP matcher for a float target (distance counted in float ULPs).
+Floating::WithinUlpsMatcher WithinULP(float target, uint64_t maxUlpDiff) {
+    return Floating::WithinUlpsMatcher(target, maxUlpDiff, Floating::FloatingPointKind::Float);
+}
+
+// Absolute-margin matcher.
+Floating::WithinAbsMatcher WithinAbs(double target, double margin) {
+    return Floating::WithinAbsMatcher(target, margin);
+}
+
+// Relative matcher with an explicit epsilon.
+Floating::WithinRelMatcher WithinRel(double target, double eps) {
+    return Floating::WithinRelMatcher(target, eps);
+}
+
+// Relative matcher with the default epsilon: 100 times double's machine epsilon.
+Floating::WithinRelMatcher WithinRel(double target) {
+    return Floating::WithinRelMatcher(target, std::numeric_limits<double>::epsilon() * 100);
+}
+
+// float overload with an explicit epsilon; both arguments are widened to double.
+Floating::WithinRelMatcher WithinRel(float target, float eps) {
+    return Floating::WithinRelMatcher(target, eps);
+}
+
+// float overload with the default epsilon: 100 times float's machine epsilon.
+Floating::WithinRelMatcher WithinRel(float target) {
+    return Floating::WithinRelMatcher(target, std::numeric_limits<float>::epsilon() * 100);
+}
+
+} // namespace Matchers
+} // namespace Catch
+
+// end catch_matchers_floating.cpp
+// start catch_matchers_generic.cpp
+
+// Builds the description of a predicate matcher: a generic text when no
+// description was supplied, otherwise the supplied text in double quotes.
+std::string Catch::Matchers::Generic::Detail::finalizeDescription(const std::string& desc) {
+    if (!desc.empty()) {
+        return "matches predicate: \"" + desc + '"';
+    }
+    return "matches undescribed predicate";
+}
+// end catch_matchers_generic.cpp
+// start catch_matchers_string.cpp
+
+#include <regex>
+
+namespace Catch {
+namespace Matchers {
+
+    namespace StdString {
+
+        // Stores the comparison string already adjusted for the requested
+        // case sensitivity. adjustString() reads m_caseSensitivity, which the
+        // initializer list names first.
+        CasedString::CasedString( std::string const& str, CaseSensitive::Choice caseSensitivity )
+        :   m_caseSensitivity( caseSensitivity ),
+            m_str( adjustString( str ) )
+        {}
+
+        // Lower-cases the string for case-insensitive comparison; returns it
+        // unchanged otherwise.
+        std::string CasedString::adjustString( std::string const& str ) const {
+            if( m_caseSensitivity == CaseSensitive::No )
+                return toLower( str );
+            return str;
+        }
+
+        // Text appended to descriptions: " (case insensitive)" or nothing.
+        std::string CasedString::caseSensitivitySuffix() const {
+            if( m_caseSensitivity == CaseSensitive::No )
+                return " (case insensitive)";
+            return std::string();
+        }
+
+        // Stores the operation name (e.g. "equals") and the comparison string.
+        StringMatcherBase::StringMatcherBase( std::string const& operation, CasedString const& comparator )
+        : m_comparator( comparator ),
+          m_operation( operation ) {
+        }
+
+        // Describes the matcher as: <operation>: "<string>"<case suffix>.
+        std::string StringMatcherBase::describe() const {
+            std::string const suffix = m_comparator.caseSensitivitySuffix();
+
+            std::string description;
+            description.reserve(5 + m_operation.size() + m_comparator.m_str.size() +
+                                        suffix.size());
+            description.append(m_operation)
+                       .append(": \"")
+                       .append(m_comparator.m_str)
+                       .append("\"")
+                       .append(suffix);
+            return description;
+        }
+
+        // Each matcher below passes its operation name to StringMatcherBase and
+        // compares the case-adjusted source against the stored comparison string.
+
+        EqualsMatcher::EqualsMatcher( CasedString const& comparator ) : StringMatcherBase( "equals", comparator ) {}
+
+        // True when the adjusted source equals the comparison string.
+        bool EqualsMatcher::match( std::string const& source ) const {
+            return m_comparator.adjustString( source ) == m_comparator.m_str;
+        }
+
+        ContainsMatcher::ContainsMatcher( CasedString const& comparator ) : StringMatcherBase( "contains", comparator ) {}
+
+        // True when contains() reports the comparison string inside the adjusted source.
+        bool ContainsMatcher::match( std::string const& source ) const {
+            return contains( m_comparator.adjustString( source ), m_comparator.m_str );
+        }
+
+        StartsWithMatcher::StartsWithMatcher( CasedString const& comparator ) : StringMatcherBase( "starts with", comparator ) {}
+
+        // True when startsWith() accepts the comparison string as a prefix of the adjusted source.
+        bool StartsWithMatcher::match( std::string const& source ) const {
+            return startsWith( m_comparator.adjustString( source ), m_comparator.m_str );
+        }
+
+        EndsWithMatcher::EndsWithMatcher( CasedString const& comparator ) : StringMatcherBase( "ends with", comparator ) {}
+
+        // True when endsWith() accepts the comparison string as a suffix of the adjusted source.
+        bool EndsWithMatcher::match( std::string const& source ) const {
+            return endsWith( m_comparator.adjustString( source ), m_comparator.m_str );
+        }
+
+        // Takes ownership of the pattern text and remembers the case mode.
+        RegexMatcher::RegexMatcher(std::string regex, CaseSensitive::Choice caseSensitivity)
+        :   m_regex(std::move(regex)),
+            m_caseSensitivity(caseSensitivity)
+        {}
+
+        // Compiles the stored pattern (ECMAScript syntax, plus icase when
+        // matching case-insensitively) and requires it to match the whole
+        // of `matchee`.
+        bool RegexMatcher::match(std::string const& matchee) const {
+            auto syntax = std::regex::ECMAScript; // ECMAScript is the default syntax option anyway
+            if (m_caseSensitivity == CaseSensitive::Choice::No)
+                syntax |= std::regex::icase;
+
+            std::regex const pattern(m_regex, syntax);
+            return std::regex_match(matchee, pattern);
+        }
+
+        // Describes the matcher as: matches <pattern> case (in)sensitively.
+        std::string RegexMatcher::describe() const {
+            char const* caseMode = " case insensitively";
+            if (m_caseSensitivity == CaseSensitive::Choice::Yes)
+                caseMode = " case sensitively";
+            return "matches " + ::Catch::Detail::stringify(m_regex) + caseMode;
+        }
+
+    } // namespace StdString
+
+    // Factory functions for the string matchers. Each wraps the text in a
+    // CasedString carrying the requested case sensitivity.
+    StdString::EqualsMatcher Equals( std::string const& str, CaseSensitive::Choice caseSensitivity ) {
+        return StdString::EqualsMatcher( StdString::CasedString( str, caseSensitivity) );
+    }
+    StdString::ContainsMatcher Contains( std::string const& str, CaseSensitive::Choice caseSensitivity ) {
+        return StdString::ContainsMatcher( StdString::CasedString( str, caseSensitivity) );
+    }
+    StdString::EndsWithMatcher EndsWith( std::string const& str, CaseSensitive::Choice caseSensitivity ) {
+        return StdString::EndsWithMatcher( StdString::CasedString( str, caseSensitivity) );
+    }
+    StdString::StartsWithMatcher StartsWith( std::string const& str, CaseSensitive::Choice caseSensitivity ) {
+        return StdString::StartsWithMatcher( StdString::CasedString( str, caseSensitivity) );
+    }
+
+    // The regex matcher takes the pattern text directly (no CasedString).
+    StdString::RegexMatcher Matches(std::string const& regex, CaseSensitive::Choice caseSensitivity) {
+        return StdString::RegexMatcher(regex, caseSensitivity);
+    }
+
+} // namespace Matchers
+} // namespace Catch
+// end catch_matchers_string.cpp
+// start catch_message.cpp
+
+// start catch_uncaught_exceptions.h
+
+namespace Catch {
+    // Declared here, defined outside this excerpt. The message destructors in
+    // this file skip popping their messages while it returns true.
+    bool uncaught_exceptions();
+} // end namespace Catch
+
+// end catch_uncaught_exceptions.h
+#include <cassert>
+#include <stack>
+
+namespace Catch {
+
+    // Records where the message came from and assigns it the next sequence
+    // number by pre-incrementing the shared counter (so the first message gets 1).
+    MessageInfo::MessageInfo(   StringRef const& _macroName,
+                                SourceLineInfo const& _lineInfo,
+                                ResultWas::OfType _type )
+    :   macroName( _macroName ),
+        lineInfo( _lineInfo ),
+        type( _type ),
+        sequence( ++globalCount )
+    {}
+
+    // Identity is the sequence number only; the message text is not compared.
+    bool MessageInfo::operator==( MessageInfo const& other ) const {
+        return sequence == other.sequence;
+    }
+
+    // Orders messages by sequence number, i.e. by creation order.
+    bool MessageInfo::operator<( MessageInfo const& other ) const {
+        return sequence < other.sequence;
+    }
+
+    // This may need protecting if threading support is added
+    unsigned int MessageInfo::globalCount = 0;
+
+    ////////////////////////////////////////////////////////////////////////////
+
+    // Creates the MessageInfo up front; its text is filled in later from the
+    // builder's stream by whoever consumes the builder.
+    Catch::MessageBuilder::MessageBuilder( StringRef const& macroName,
+                                           SourceLineInfo const& lineInfo,
+                                           ResultWas::OfType type )
+        :m_info(macroName, lineInfo, type) {}
+
+    ////////////////////////////////////////////////////////////////////////////
+
+    // Copies the builder's message info, fills in the text accumulated in the
+    // builder's stream and registers the message with the current result
+    // capture. m_moved is value-initialised, i.e. false.
+    ScopedMessage::ScopedMessage( MessageBuilder const& builder )
+    : m_info( builder.m_info ), m_moved()
+    {
+        m_info.message = builder.m_stream.str();
+        getResultCapture().pushScopedMessage( m_info );
+    }
+
+    // Move constructor: takes over the already-registered message from `old`.
+    // The message info (including its text) is moved rather than copied; this
+    // is safe because `old` is flagged as moved-from right away, so its
+    // destructor never uses its own m_info again. The new object starts with
+    // m_moved == false and therefore pops the message when it is destroyed.
+    ScopedMessage::ScopedMessage( ScopedMessage&& old )
+    : m_info( std::move( old.m_info ) ), m_moved()
+    {
+        old.m_moved = true;
+    }
+
+    // Unregisters the message, unless an exception is currently in flight or
+    // this object has been moved from.
+    ScopedMessage::~ScopedMessage() {
+        if ( uncaught_exceptions() || m_moved )
+            return;
+        getResultCapture().popScopedMessage(m_info);
+    }
+
+    // Splits the stringified CAPTURE argument list `names` into one message
+    // per top-level expression. Each message starts out as "<expression> := ";
+    // captureValue() later appends the value. Commas nested inside (), [] or
+    // {} and anything inside quotes do not split.
+    Capturer::Capturer( StringRef macroName, SourceLineInfo const& lineInfo, ResultWas::OfType resultType, StringRef names ) {
+        // Returns names[start..end] (inclusive) with leading and trailing
+        // commas and whitespace removed.
+        // NOTE(review): neither loop has a bounds check, so this relies on the
+        // range containing at least one other character -- confirm with callers.
+        auto trimmed = [&] (size_t start, size_t end) {
+            while (names[start] == ',' || isspace(static_cast<unsigned char>(names[start]))) {
+                ++start;
+            }
+            while (names[end] == ',' || isspace(static_cast<unsigned char>(names[end]))) {
+                --end;
+            }
+            return names.substr(start, end - start + 1);
+        };
+        // Given the index of an opening quote, returns the index of the
+        // matching closing quote; a backslash skips the character after it.
+        // Reaching the end of the text without a match is an internal error.
+        auto skipq = [&] (size_t start, char quote) {
+            for (auto i = start + 1; i < names.size() ; ++i) {
+                if (names[i] == quote)
+                    return i;
+                if (names[i] == '\\')
+                    ++i;
+            }
+            CATCH_INTERNAL_ERROR("CAPTURE parsing encountered unmatched quote");
+        };
+
+        // `start` marks where the current expression begins (after the first
+        // split it points at the separating comma, which trimmed() strips).
+        size_t start = 0;
+        // Currently open brackets; a comma only splits when this is empty.
+        std::stack<char> openings;
+        for (size_t pos = 0; pos < names.size(); ++pos) {
+            char c = names[pos];
+            switch (c) {
+            case '[':
+            case '{':
+            case '(':
+            // It is basically impossible to disambiguate between
+            // comparison and start of template args in this context
+//            case '<':
+                openings.push(c);
+                break;
+            case ']':
+            case '}':
+            case ')':
+//           case '>':
+                // NOTE(review): pops without checking that the stack is
+                // non-empty or that the bracket kinds match.
+                openings.pop();
+                break;
+            case '"':
+            case '\'':
+                // Jump to the closing quote; the loop increment then moves past it.
+                pos = skipq(pos, c);
+                break;
+            case ',':
+                if (start != pos && openings.empty()) {
+                    m_messages.emplace_back(macroName, lineInfo, resultType);
+                    m_messages.back().message = static_cast<std::string>(trimmed(start, pos));
+                    m_messages.back().message += " := ";
+                    start = pos;
+                }
+            }
+        }
+        assert(openings.empty() && "Mismatched openings");
+        // Whatever follows the last split (or the whole text) is the final expression.
+        m_messages.emplace_back(macroName, lineInfo, resultType);
+        m_messages.back().message = static_cast<std::string>(trimmed(start, names.size() - 1));
+        m_messages.back().message += " := ";
+    }
+    Capturer::~Capturer() {
+        // While an exception is propagating the pushed messages are left alone.
+        if ( uncaught_exceptions() ) {
+            return;
+        }
+        assert( m_captured == m_messages.size() );
+        size_t idx = 0;
+        while( idx < m_captured ) {
+            m_resultCapture.popScopedMessage( m_messages[idx] );
+            ++idx;
+        }
+    }
+
+    void Capturer::captureValue( size_t index, std::string const& value ) {
+        assert( index < m_messages.size() );
+        // Complete the prepared message text with the value, then publish it.
+        auto& entry = m_messages[index];
+        entry.message += value;
+        m_resultCapture.pushScopedMessage( entry );
+        ++m_captured;
+    }
+
+} // end namespace Catch
+// end catch_message.cpp
+// start catch_output_redirect.cpp
+
+// start catch_output_redirect.h
+#ifndef TWOBLUECUBES_CATCH_OUTPUT_REDIRECT_H
+#define TWOBLUECUBES_CATCH_OUTPUT_REDIRECT_H
+
+#include <cstdio>
+#include <iosfwd>
+#include <string>
+
+namespace Catch {
+
+    // Installs the stream buffer of `redirectionStream` on `originalStream`
+    // when constructed and puts the previous buffer back when destroyed.
+    class RedirectedStream {
+        std::ostream& m_originalStream;
+        std::ostream& m_redirectionStream;
+        // Buffer m_originalStream had before the redirection; restored by the destructor.
+        std::streambuf* m_prevBuf;
+
+    public:
+        RedirectedStream( std::ostream& originalStream, std::ostream& redirectionStream );
+        ~RedirectedStream();
+    };
+
+    // Redirects Catch::cout() into an internal string stream for the lifetime
+    // of the object.
+    class RedirectedStdOut {
+        // Declared before m_cout: the constructor hands m_rss.get() to m_cout.
+        ReusableStringStream m_rss;
+        RedirectedStream m_cout;
+    public:
+        RedirectedStdOut();
+        // Returns the text held by the internal string stream.
+        auto str() const -> std::string;
+    };
+
+    // StdErr has two constituent streams in C++, std::cerr and std::clog
+    // This means that we need to redirect 2 streams into 1 to keep proper
+    // order of writes
+    class RedirectedStdErr {
+        // Declared first: both redirections below are constructed from m_rss.get().
+        ReusableStringStream m_rss;
+        RedirectedStream m_cerr;
+        RedirectedStream m_clog;
+    public:
+        RedirectedStdErr();
+        // Returns the text held by the shared internal string stream.
+        auto str() const -> std::string;
+    };
+
+    // Holds one RedirectedStdOut and one RedirectedStdErr. On destruction the
+    // text each of them collected is appended to the corresponding string
+    // passed to the constructor.
+    class RedirectedStreams {
+    public:
+        // Neither copyable nor movable.
+        RedirectedStreams(RedirectedStreams const&) = delete;
+        RedirectedStreams& operator=(RedirectedStreams const&) = delete;
+        RedirectedStreams(RedirectedStreams&&) = delete;
+        RedirectedStreams& operator=(RedirectedStreams&&) = delete;
+
+        RedirectedStreams(std::string& redirectedCout, std::string& redirectedCerr);
+        ~RedirectedStreams();
+    private:
+        // Destination strings; only references are stored.
+        std::string& m_redirectedCout;
+        std::string& m_redirectedCerr;
+        RedirectedStdOut m_redirectedStdOut;
+        RedirectedStdErr m_redirectedStdErr;
+    };
+
+#if defined(CATCH_CONFIG_NEW_CAPTURE)
+
+    // Windows's implementation of std::tmpfile is terrible (it tries
+    // to create a file inside system folder, thus requiring elevated
+    // privileges for the binary), so we have to use tmpnam(_s) and
+    // create the file ourselves there.
+    class TempFile {
+    public:
+        // Neither copyable nor movable.
+        TempFile(TempFile const&) = delete;
+        TempFile& operator=(TempFile const&) = delete;
+        TempFile(TempFile&&) = delete;
+        TempFile& operator=(TempFile&&) = delete;
+
+        // Opens the temporary file; reports CATCH_RUNTIME_ERROR on failure.
+        TempFile();
+        // Closes the file (and, under MSVC, removes it by name).
+        ~TempFile();
+
+        // Returns the underlying FILE handle.
+        std::FILE* getFile();
+        // Rewinds the file and returns the text read from it.
+        std::string getContents();
+
+    private:
+        std::FILE* m_file = nullptr;
+    #if defined(_MSC_VER)
+        // File name filled in by tmpnam_s(); handed to std::remove() in the destructor.
+        char m_buffer[L_tmpnam] = { 0 };
+    #endif
+    };
+
+    // Redirects file descriptors 1 and 2 into two temp files for the lifetime
+    // of the object. The destructor restores the descriptors and appends the
+    // files' contents to the destination strings.
+    class OutputRedirect {
+    public:
+        // Neither copyable nor movable.
+        OutputRedirect(OutputRedirect const&) = delete;
+        OutputRedirect& operator=(OutputRedirect const&) = delete;
+        OutputRedirect(OutputRedirect&&) = delete;
+        OutputRedirect& operator=(OutputRedirect&&) = delete;
+
+        OutputRedirect(std::string& stdout_dest, std::string& stderr_dest);
+        ~OutputRedirect();
+
+    private:
+        // Results of dup(1) / dup(2) taken in the constructor.
+        int m_originalStdout = -1;
+        int m_originalStderr = -1;
+        TempFile m_stdoutFile;
+        TempFile m_stderrFile;
+        // Destination strings; only references are stored.
+        std::string& m_stdoutDest;
+        std::string& m_stderrDest;
+    };
+
+#endif
+
+} // end namespace Catch
+
+#endif // TWOBLUECUBES_CATCH_OUTPUT_REDIRECT_H
+// end catch_output_redirect.h
+#include <cstdio>
+#include <cstring>
+#include <fstream>
+#include <sstream>
+#include <stdexcept>
+
+#if defined(CATCH_CONFIG_NEW_CAPTURE)
+    #if defined(_MSC_VER)
+    #include <io.h>      //_dup and _dup2
+    #define dup _dup
+    #define dup2 _dup2
+    #define fileno _fileno
+    #else
+    #include <unistd.h>  // dup and dup2
+    #endif
+#endif
+
+namespace Catch {
+
+    RedirectedStream::RedirectedStream( std::ostream& originalStream, std::ostream& redirectionStream )
+    :   m_originalStream( originalStream ),
+        m_redirectionStream( redirectionStream ),
+        // rdbuf(newBuf) installs the new buffer and returns the one it replaced.
+        m_prevBuf( m_originalStream.rdbuf( m_redirectionStream.rdbuf() ) )
+    {}
+
+    // Puts the buffer saved at construction back onto the original stream.
+    RedirectedStream::~RedirectedStream() {
+        m_originalStream.rdbuf( m_prevBuf );
+    }
+
+    // Redirect Catch::cout() into the member string stream.
+    RedirectedStdOut::RedirectedStdOut()
+    :   m_cout( Catch::cout(), m_rss.get() )
+    {}
+
+    std::string RedirectedStdOut::str() const {
+        return m_rss.str();
+    }
+
+    // Redirect both Catch::cerr() and Catch::clog() into one string stream.
+    RedirectedStdErr::RedirectedStdErr()
+    :   m_cerr( Catch::cerr(), m_rss.get() ),
+        m_clog( Catch::clog(), m_rss.get() )
+    {}
+
+    std::string RedirectedStdErr::str() const {
+        return m_rss.str();
+    }
+
+    RedirectedStreams::RedirectedStreams(std::string& redirectedCout, std::string& redirectedCerr)
+    :   m_redirectedCout(redirectedCout),
+        m_redirectedCerr(redirectedCerr)
+    {}
+
+    // Append whatever each redirection collected to its destination string.
+    RedirectedStreams::~RedirectedStreams() {
+        m_redirectedCout.append( m_redirectedStdOut.str() );
+        m_redirectedCerr.append( m_redirectedStdErr.str() );
+    }
+
+#if defined(CATCH_CONFIG_NEW_CAPTURE)
+
+#if defined(_MSC_VER)
+    // MSVC build: obtain a temp file name, then open that file ourselves.
+    TempFile::TempFile() {
+        const auto nameError = tmpnam_s(m_buffer);
+        if (nameError != 0) {
+            CATCH_RUNTIME_ERROR("Could not get a temp filename");
+        }
+        const auto openError = fopen_s(&m_file, m_buffer, "w+");
+        if (openError != 0) {
+            char reason[100];
+            const auto translateError = strerror_s(reason, errno);
+            if (translateError != 0) {
+                CATCH_RUNTIME_ERROR("Could not translate errno to a string");
+            }
+            CATCH_RUNTIME_ERROR("Could not open the temp file: '" << m_buffer << "' because: " << reason);
+        }
+    }
+#else
+    // Other builds: std::tmpfile() supplies the file directly.
+    TempFile::TempFile() {
+        m_file = std::tmpfile();
+        if (m_file == nullptr) {
+            CATCH_RUNTIME_ERROR("Could not create a temp file.");
+        }
+    }
+
+#endif
+
+    TempFile::~TempFile() {
+        // TBD: What to do about errors here?
+        std::fclose(m_file);
+#if defined(_MSC_VER)
+        // Only the MSVC build creates the file by name, so only there does
+        // it have to be removed by hand.
+        std::remove(m_buffer);
+#endif
+    }
+
+    std::FILE* TempFile::getFile() {
+        return m_file;
+    }
+
+    // Reads the file from its beginning, chunk by chunk, into one string.
+    std::string TempFile::getContents() {
+        std::rewind(m_file);
+        std::stringstream collected;
+        char chunk[100] = {};
+        for (;;) {
+            if (!std::fgets(chunk, sizeof(chunk), m_file)) {
+                break;
+            }
+            collected << chunk;
+        }
+        return collected.str();
+    }
+
+    OutputRedirect::OutputRedirect(std::string& stdout_dest, std::string& stderr_dest) :
+        m_originalStdout(dup(1)),
+        m_originalStderr(dup(2)),
+        m_stdoutDest(stdout_dest),
+        m_stderrDest(stderr_dest) {
+        dup2(fileno(m_stdoutFile.getFile()), 1);
+        dup2(fileno(m_stderrFile.getFile()), 2);
+    }
+
+    OutputRedirect::~OutputRedirect() {
+        Catch::cout() << std::flush;
+        fflush(stdout);
+        // Since we support overriding these streams, we flush cerr
+        // even though std::cerr is unbuffered
+        Catch::cerr() << std::flush;
+        Catch::clog() << std::flush;
+        fflush(stderr);
+
+        dup2(m_originalStdout, 1);
+        dup2(m_originalStderr, 2);
+
+        m_stdoutDest += m_stdoutFile.getContents();
+        m_stderrDest += m_stderrFile.getContents();
+    }
+
+#endif // CATCH_CONFIG_NEW_CAPTURE
+
+} // namespace Catch
+
+#if defined(CATCH_CONFIG_NEW_CAPTURE)
+    #if defined(_MSC_VER)
+    #undef dup
+    #undef dup2
+    #undef fileno
+    #endif
+#endif
+// end catch_output_redirect.cpp
+// start catch_polyfills.cpp
+
+#include <cmath>
+
+namespace Catch {
+
+#if defined(CATCH_CONFIG_POLYFILL_ISNAN)
+    // For now we only use this for embarcadero
+    bool isnan(float f) { return std::_isnan(f); }
+    bool isnan(double d) { return std::_isnan(d); }
+#else
+    // Default: forward to the standard library.
+    bool isnan(float f) { return std::isnan(f); }
+    bool isnan(double d) { return std::isnan(d); }
+#endif
+
+} // end namespace Catch
+// end catch_polyfills.cpp
+// start catch_random_number_generator.cpp
+
+namespace Catch {
+
+namespace {
+
+#if defined(_MSC_VER)
+#pragma warning(push)
+#pragma warning(disable:4146) // we negate uint32 during the rotate
+#endif
+        // Safe rotr implementation thanks to John Regehr
+        // Both shift amounts are masked to 0..31, so neither shift can reach
+        // the full width of the value.
+        uint32_t rotate_right(uint32_t val, uint32_t count) {
+            const uint32_t mask = 31;
+            const uint32_t shift = count & mask;
+            const uint32_t wrapped = val << (-shift & mask);
+            return (val >> shift) | wrapped;
+        }
+
+#if defined(_MSC_VER)
+#pragma warning(pop)
+#endif
+
+}
+
+    SimplePcg32::SimplePcg32(result_type seed_) {
+        seed(seed_);
+    }
+
+    // Reset the state to zero, step once, add the seed, then step again.
+    void SimplePcg32::seed(result_type seed_) {
+        m_state = 0;
+        static_cast<void>(this->operator()());
+        m_state += seed_;
+        static_cast<void>(this->operator()());
+    }
+
+    void SimplePcg32::discard(uint64_t skip) {
+        // We could implement this to run in O(log n) steps, but this
+        // should suffice for our use case.
+        while (skip != 0) {
+            static_cast<void>((*this)());
+            --skip;
+        }
+    }
+
+    SimplePcg32::result_type SimplePcg32::operator()() {
+        // The output is derived from the state as it was before this step.
+        const auto before = m_state;
+
+        // advance state
+        m_state = before * 6364136223846793005ULL + s_inc;
+
+        // prepare the output value
+        const uint32_t xorshifted = static_cast<uint32_t>(((before >> 18u) ^ before) >> 27u);
+        return rotate_right(xorshifted, before >> 59u);
+    }
+
+    // Two generators compare equal exactly when their states match.
+    bool operator==(SimplePcg32 const& lhs, SimplePcg32 const& rhs) {
+        return rhs.m_state == lhs.m_state;
+    }
+
+    bool operator!=(SimplePcg32 const& lhs, SimplePcg32 const& rhs) {
+        return !(lhs.m_state == rhs.m_state);
+    }
+}
+// end catch_random_number_generator.cpp
+// start catch_registry_hub.cpp
+
+// start catch_test_case_registry_impl.h
+
+#include <vector>
+#include <set>
+#include <algorithm>
+#include <ios>
+
+namespace Catch {
+
+    class TestCase;
+    struct IConfig;
+
+    // Declarations only; none of these functions is defined in this part of
+    // the file.
+    std::vector<TestCase> sortTests( IConfig const& config, std::vector<TestCase> const& unsortedTestCases );
+
+    bool isThrowSafe( TestCase const& testCase, IConfig const& config );
+    bool matchTest( TestCase const& testCase, TestSpec const& testSpec, IConfig const& config );
+
+    void enforceNoDuplicateTestCases( std::vector<TestCase> const& functions );
+
+    std::vector<TestCase> filterTests( std::vector<TestCase> const& testCases, TestSpec const& testSpec, IConfig const& config );
+    std::vector<TestCase> const& getAllTestCasesSorted( IConfig const& config );
+
+    // ITestCaseRegistry implementation that stores test cases in a vector.
+    class TestRegistry : public ITestCaseRegistry {
+    public:
+        virtual ~TestRegistry() = default;
+
+        virtual void registerTest( TestCase const& testCase );
+
+        std::vector<TestCase> const& getAllTests() const override;
+        std::vector<TestCase> const& getAllTestsSorted( IConfig const& config ) const override;
+
+    private:
+        std::vector<TestCase> m_functions;
+        // The two members below are `mutable`, so const member functions may
+        // update them. NOTE(review): presumably a cache filled by
+        // getAllTestsSorted(); confirm against its definition.
+        mutable RunTests::InWhatOrder m_currentSortOrder = RunTests::InDeclarationOrder;
+        mutable std::vector<TestCase> m_sortedFunctions;
+        std::size_t m_unnamedCount = 0;
+        std::ios_base::Init m_ostreamInit; // Forces cout/ cerr to be initialised
+    };
+
+    ///////////////////////////////////////////////////////////////////////////
+
+    // ITestInvoker that wraps a plain `void()` function pointer.
+    // NOTE(review): invoke() presumably calls the stored pointer; its
+    // definition is not in this part of the file.
+    class TestInvokerAsFunction : public ITestInvoker {
+        void(*m_testAsFunction)();
+    public:
+        TestInvokerAsFunction( void(*testAsFunction)() ) noexcept;
+
+        void invoke() const override;
+    };
+
+    std::string extractClassName( StringRef const& classOrQualifiedMethodName );
+
+    ///////////////////////////////////////////////////////////////////////////
+
+} // end namespace Catch
+
+// end catch_test_case_registry_impl.h
+// start catch_reporter_registry.h
+
+#include <map>
+
+namespace Catch {
+
+    // IReporterRegistry implementation holding named reporter factories and
+    // a list of listener factories.
+    class ReporterRegistry : public IReporterRegistry {
+
+    public:
+
+        ~ReporterRegistry() override;
+
+        // Looks `name` up among the registered factories; returns nullptr
+        // when no factory has that name.
+        IStreamingReporterPtr create( std::string const& name, IConfigPtr const& config ) const override;
+
+        void registerReporter( std::string const& name, IReporterFactoryPtr const& factory );
+        void registerListener( IReporterFactoryPtr const& factory );
+
+        FactoryMap const& getFactories() const override;
+        Listeners const& getListeners() const override;
+
+    private:
+        FactoryMap m_factories;
+        Listeners m_listeners;
+    };
+}
+
+// end catch_reporter_registry.h
+// start catch_tag_alias_registry.h
+
+// start catch_tag_alias.h
+
+#include <string>
+
+namespace Catch {
+
+    // Pairs a tag string with a source location.
+    struct TagAlias {
+        TagAlias(std::string const& _tag, SourceLineInfo _lineInfo);
+
+        std::string tag;
+        SourceLineInfo lineInfo;
+    };
+
+} // end namespace Catch
+
+// end catch_tag_alias.h
+#include <map>
+
+namespace Catch {
+
+    // ITagAliasRegistry implementation backed by a std::map of TagAlias
+    // values with string keys.
+    class TagAliasRegistry : public ITagAliasRegistry {
+    public:
+        ~TagAliasRegistry() override;
+        TagAlias const* find( std::string const& alias ) const override;
+        std::string expandAliases( std::string const& unexpandedTestSpec ) const override;
+        void add( std::string const& alias, std::string const& tag, SourceLineInfo const& lineInfo );
+
+    private:
+        // NOTE(review): presumably keyed by alias name; confirm against add().
+        std::map<std::string, TagAlias> m_registry;
+    };
+
+} // end namespace Catch
+
+// end catch_tag_alias_registry.h
+// start catch_startup_exception_registry.h
+
+#include <vector>
+#include <exception>
+
+namespace Catch {
+
+    // Collects std::exception_ptr values. When
+    // CATCH_CONFIG_DISABLE_EXCEPTIONS is defined the class has no members
+    // at all.
+    class StartupExceptionRegistry {
+#if !defined(CATCH_CONFIG_DISABLE_EXCEPTIONS)
+    public:
+        void add(std::exception_ptr const& exception) noexcept;
+        std::vector<std::exception_ptr> const& getExceptions() const noexcept;
+    private:
+        std::vector<std::exception_ptr> m_exceptions;
+#endif
+    };
+
+} // end namespace Catch
+
+// end catch_startup_exception_registry.h
+// start catch_singletons.hpp
+
+namespace Catch {
+
+    // Common base with a virtual destructor, so singletons of different
+    // types can be passed to addSingleton() as ISingleton*.
+    struct ISingleton {
+        virtual ~ISingleton();
+    };
+
+    // Declarations only. NOTE(review): presumably addSingleton() records the
+    // pointer and cleanupSingletons() destroys what was recorded; confirm
+    // against the definitions.
+    void addSingleton( ISingleton* singleton );
+    void cleanupSingletons();
+
+    template<typename SingletonImplT, typename InterfaceT = SingletonImplT, typename MutableInterfaceT = InterfaceT>
+    class Singleton : SingletonImplT, public ISingleton {
+
+        // Creates the single instance on first use and passes it to
+        // addSingleton(); every later call returns that same pointer.
+        static auto getInternal() -> Singleton* {
+            static Singleton* s_instance = nullptr;
+            if( s_instance ) {
+                return s_instance;
+            }
+            s_instance = new Singleton;
+            addSingleton( s_instance );
+            return s_instance;
+        }
+
+    public:
+        // Read-only view of the instance.
+        static auto get() -> InterfaceT const& {
+            InterfaceT const& readOnly = *getInternal();
+            return readOnly;
+        }
+        // Mutable view of the instance.
+        static auto getMutable() -> MutableInterfaceT& {
+            MutableInterfaceT& writable = *getInternal();
+            return writable;
+        }
+    };
+
+} // namespace Catch
+
+// end catch_singletons.hpp
+namespace Catch {
+
+    namespace {
+
+        // Owns one registry of each kind and implements both hub interfaces
+        // by forwarding to the matching member.
+        class RegistryHub : public IRegistryHub, public IMutableRegistryHub,
+                            private NonCopyable {
+
+        public: // IRegistryHub
+            RegistryHub() = default;
+            IReporterRegistry const& getReporterRegistry() const override {
+                return m_reporterRegistry;
+            }
+            ITestCaseRegistry const& getTestCaseRegistry() const override {
+                return m_testCaseRegistry;
+            }
+            IExceptionTranslatorRegistry const& getExceptionTranslatorRegistry() const override {
+                return m_exceptionTranslatorRegistry;
+            }
+            ITagAliasRegistry const& getTagAliasRegistry() const override {
+                return m_tagAliasRegistry;
+            }
+            StartupExceptionRegistry const& getStartupExceptionRegistry() const override {
+                return m_exceptionRegistry;
+            }
+
+        public: // IMutableRegistryHub
+            void registerReporter( std::string const& name, IReporterFactoryPtr const& factory ) override {
+                m_reporterRegistry.registerReporter( name, factory );
+            }
+            void registerListener( IReporterFactoryPtr const& factory ) override {
+                m_reporterRegistry.registerListener( factory );
+            }
+            void registerTest( TestCase const& testInfo ) override {
+                m_testCaseRegistry.registerTest( testInfo );
+            }
+            void registerTranslator( const IExceptionTranslator* translator ) override {
+                m_exceptionTranslatorRegistry.registerTranslator( translator );
+            }
+            void registerTagAlias( std::string const& alias, std::string const& tag, SourceLineInfo const& lineInfo ) override {
+                m_tagAliasRegistry.add( alias, tag, lineInfo );
+            }
+            // Stores the exception currently being handled. With exceptions
+            // disabled there is nothing to store and the call is reported
+            // via CATCH_INTERNAL_ERROR instead.
+            void registerStartupException() noexcept override {
+#if !defined(CATCH_CONFIG_DISABLE_EXCEPTIONS)
+                m_exceptionRegistry.add(std::current_exception());
+#else
+                CATCH_INTERNAL_ERROR("Attempted to register active exception under CATCH_CONFIG_DISABLE_EXCEPTIONS!");
+#endif
+            }
+            IMutableEnumValuesRegistry& getMutableEnumValuesRegistry() override {
+                return m_enumValuesRegistry;
+            }
+
+        private:
+            // Members are constructed in the order they are declared here.
+            TestRegistry m_testCaseRegistry;
+            ReporterRegistry m_reporterRegistry;
+            ExceptionTranslatorRegistry m_exceptionTranslatorRegistry;
+            TagAliasRegistry m_tagAliasRegistry;
+            StartupExceptionRegistry m_exceptionRegistry;
+            Detail::EnumValuesRegistry m_enumValuesRegistry;
+        };
+    }
+
+    using RegistryHubSingleton = Singleton<RegistryHub, IRegistryHub, IMutableRegistryHub>;
+
+    // Read-only access to the hub singleton.
+    IRegistryHub const& getRegistryHub() {
+        IRegistryHub const& hub = RegistryHubSingleton::get();
+        return hub;
+    }
+    // Mutable access to the same singleton.
+    IMutableRegistryHub& getMutableRegistryHub() {
+        IMutableRegistryHub& hub = RegistryHubSingleton::getMutable();
+        return hub;
+    }
+    // Singletons are cleaned up first, then the context.
+    void cleanUp() {
+        cleanupSingletons();
+        cleanUpContext();
+    }
+    std::string translateActiveException() {
+        auto const& translators = getRegistryHub().getExceptionTranslatorRegistry();
+        return translators.translateActiveException();
+    }
+
+} // end namespace Catch
+// end catch_registry_hub.cpp
+// start catch_reporter_registry.cpp
+
+namespace Catch {
+
+    ReporterRegistry::~ReporterRegistry() = default;
+
+    // Builds a reporter from the factory registered under `name`, or returns
+    // nullptr when there is no such factory.
+    IStreamingReporterPtr ReporterRegistry::create( std::string const& name, IConfigPtr const& config ) const {
+        auto const entry = m_factories.find( name );
+        if( entry != m_factories.end() ) {
+            return entry->second->create( ReporterConfig( config ) );
+        }
+        return nullptr;
+    }
+
+    void ReporterRegistry::registerReporter( std::string const& name, IReporterFactoryPtr const& factory ) {
+        m_factories.emplace( name, factory );
+    }
+
+    void ReporterRegistry::registerListener( IReporterFactoryPtr const& factory ) {
+        m_listeners.push_back( factory );
+    }
+
+    IReporterRegistry::FactoryMap const& ReporterRegistry::getFactories() const {
+        return m_factories;
+    }
+
+    IReporterRegistry::Listeners const& ReporterRegistry::getListeners() const {
+        return m_listeners;
+    }
+
+}
+// end catch_reporter_registry.cpp
+// start catch_result_type.cpp
+
+namespace Catch {
+
+    // A result is ok when its failure bit is not set.
+    bool isOk( ResultWas::OfType resultType ) {
+        const int failureBits = resultType & ResultWas::FailureBit;
+        return failureBits == 0;
+    }
+    bool isJustInfo( int flags ) {
+        return ResultWas::Info == flags;
+    }
+
+    // Combines two disposition flag sets bitwise.
+    ResultDisposition::Flags operator | ( ResultDisposition::Flags lhs, ResultDisposition::Flags rhs ) {
+        const int combined = static_cast<int>( lhs ) | static_cast<int>( rhs );
+        return static_cast<ResultDisposition::Flags>( combined );
+    }
+
+    bool shouldContinueOnFailure( int flags ) {
+        const int bit = flags & ResultDisposition::ContinueOnFailure;
+        return bit != 0;
+    }
+    bool shouldSuppressFailure( int flags ) {
+        const int bit = flags & ResultDisposition::SuppressFail;
+        return bit != 0;
+    }
+
+} // end namespace Catch
+// end catch_result_type.cpp
+// start catch_run_context.cpp
+
+#include <cassert>
+#include <algorithm>
+#include <sstream>
+
+namespace Catch {
+
+    namespace Generators {
+        // Tracker for a generator. It holds the generator object and
+        // overrides close() so that closing can put the tracker back into
+        // the Executing state.
+        struct GeneratorTracker : TestCaseTracking::TrackerBase, IGeneratorTracker {
+            GeneratorBasePtr m_generator;
+
+            GeneratorTracker( TestCaseTracking::NameAndLocation const& nameAndLocation, TrackerContext& ctx, ITracker* parent )
+            :   TrackerBase( nameAndLocation, ctx, parent )
+            {}
+            ~GeneratorTracker();
+
+            // Returns the tracker for `nameAndLocation`: the current tracker
+            // itself, an existing child of it, or a newly created child. The
+            // tracker is opened unless it is already complete.
+            static GeneratorTracker& acquire( TrackerContext& ctx, TestCaseTracking::NameAndLocation const& nameAndLocation ) {
+                std::shared_ptr<GeneratorTracker> tracker;
+
+                ITracker& currentTracker = ctx.currentTracker();
+                // Under specific circumstances, the generator we want
+                // to acquire is also the current tracker. If this is
+                // the case, we have to avoid looking through current
+                // tracker's children, and instead return the current
+                // tracker.
+                // A case where this check is important is e.g.
+                //     for (int i = 0; i < 5; ++i) {
+                //         int n = GENERATE(1, 2);
+                //     }
+                //
+                // without it, the code above creates 5 nested generators.
+                if (currentTracker.nameAndLocation() == nameAndLocation) {
+                    // Fetch the shared_ptr that owns the current tracker
+                    // from its parent.
+                    auto thisTracker = currentTracker.parent().findChild(nameAndLocation);
+                    assert(thisTracker);
+                    assert(thisTracker->isGeneratorTracker());
+                    tracker = std::static_pointer_cast<GeneratorTracker>(thisTracker);
+                } else if ( TestCaseTracking::ITrackerPtr childTracker = currentTracker.findChild( nameAndLocation ) ) {
+                    assert( childTracker );
+                    assert( childTracker->isGeneratorTracker() );
+                    tracker = std::static_pointer_cast<GeneratorTracker>( childTracker );
+                } else {
+                    tracker = std::make_shared<GeneratorTracker>( nameAndLocation, ctx, &currentTracker );
+                    currentTracker.addChild( tracker );
+                }
+
+                if( !tracker->isComplete() ) {
+                    tracker->open();
+                }
+
+                return *tracker;
+            }
+
+            // TrackerBase interface
+            bool isGeneratorTracker() const override { return true; }
+            auto hasGenerator() const -> bool override {
+                return !!m_generator;
+            }
+            // Runs the base-class close, then decides whether this tracker
+            // goes back to Executing: either because it must wait for a
+            // child, or because it completed successfully and the generator
+            // moved on to another value.
+            void close() override {
+                TrackerBase::close();
+                // If a generator has a child (it is followed by a section)
+                // and none of its children have started, then we must wait
+                // until later to start consuming its values.
+                // This catches cases where `GENERATE` is placed between two
+                // `SECTION`s.
+                // **The check for m_children.empty cannot be removed**.
+                // doing so would break `GENERATE` _not_ followed by `SECTION`s.
+                const bool should_wait_for_child = [&]() {
+                    // No children -> nobody to wait for
+                    if ( m_children.empty() ) {
+                        return false;
+                    }
+                    // If at least one child started executing, don't wait
+                    if ( std::find_if(
+                             m_children.begin(),
+                             m_children.end(),
+                             []( TestCaseTracking::ITrackerPtr tracker ) {
+                                 return tracker->hasStarted();
+                             } ) != m_children.end() ) {
+                        return false;
+                    }
+
+                    // No children have started. We need to check if they _can_
+                    // start, and thus we should wait for them, or they cannot
+                    // start (due to filters), and we shouldn't wait for them
+                    auto* parent = m_parent;
+                    // This is safe: there is always at least one section
+                    // tracker in a test case tracking tree
+                    while ( !parent->isSectionTracker() ) {
+                        parent = &( parent->parent() );
+                    }
+                    assert( parent &&
+                            "Missing root (test case) level section" );
+
+                    auto const& parentSection =
+                        static_cast<SectionTracker&>( *parent );
+                    auto const& filters = parentSection.getFilters();
+                    // No filters -> no restrictions on running sections
+                    if ( filters.empty() ) {
+                        return true;
+                    }
+
+                    // Wait only if some child section's trimmed name appears
+                    // in the filter list.
+                    for ( auto const& child : m_children ) {
+                        if ( child->isSectionTracker() &&
+                             std::find( filters.begin(),
+                                        filters.end(),
+                                        static_cast<SectionTracker&>( *child )
+                                            .trimmedName() ) !=
+                                 filters.end() ) {
+                            return true;
+                        }
+                    }
+                    return false;
+                }();
+
+                // This check is a bit tricky, because m_generator->next()
+                // has a side-effect, where it consumes generator's current
+                // value, but we do not want to invoke the side-effect if
+                // this generator is still waiting for any child to start.
+                if ( should_wait_for_child ||
+                     ( m_runState == CompletedSuccessfully &&
+                       m_generator->next() ) ) {
+                    m_children.clear();
+                    m_runState = Executing;
+                }
+            }
+
+            // IGeneratorTracker interface
+            auto getGenerator() const -> GeneratorBasePtr const& override {
+                return m_generator;
+            }
+            void setGenerator( GeneratorBasePtr&& generator ) override {
+                m_generator = std::move( generator );
+            }
+        };
+        // Out-of-line definition of the destructor declared in the struct.
+        GeneratorTracker::~GeneratorTracker() {}
+    }
+
+    // Stores the config and takes ownership of the reporter, registers this
+    // object with the current mutable context as runner and result capture,
+    // and notifies the reporter that the test run is starting.
+    RunContext::RunContext(IConfigPtr const& _config, IStreamingReporterPtr&& reporter)
+    :   m_runInfo(_config->name()),
+        m_context(getCurrentMutableContext()),
+        m_config(_config),
+        m_reporter(std::move(reporter)),
+        m_lastAssertionInfo{ StringRef(), SourceLineInfo("",0), StringRef(), ResultDisposition::Normal },
+        // Successful results are included if either the config or the
+        // reporter's preferences ask for them.
+        m_includeSuccessfulResults( m_config->includeSuccessfulResults() || m_reporter->getPreferences().shouldReportAllAssertions )
+    {
+        m_context.setRunner(this);
+        m_context.setConfig(m_config);
+        m_context.setResultCapture(this);
+        m_reporter->testRunStarting(m_runInfo);
+    }
+
+    // Tell the reporter the run is over, with the totals gathered so far.
+    RunContext::~RunContext() {
+        TestRunStats const runStats(m_runInfo, m_totals, aborting());
+        m_reporter->testRunEnded(runStats);
+    }
+
+    void RunContext::testGroupStarting(std::string const& testSpec, std::size_t groupIndex, std::size_t groupsCount) {
+        GroupInfo const group(testSpec, groupIndex, groupsCount);
+        m_reporter->testGroupStarting(group);
+    }
+
+    void RunContext::testGroupEnded(std::string const& testSpec, Totals const& totals, std::size_t groupIndex, std::size_t groupsCount) {
+        GroupInfo const group(testSpec, groupIndex, groupsCount);
+        TestGroupStats const groupStats(group, totals, aborting());
+        m_reporter->testGroupEnded(groupStats);
+    }
+
+    // Runs `testCase`, repeating runCurrentTest() until the test-case
+    // tracker reports successful completion or the run is aborting. Reports
+    // the start and end of the test case to the reporter and returns the
+    // totals attributable to this test case.
+    Totals RunContext::runTest(TestCase const& testCase) {
+        Totals prevTotals = m_totals;
+
+        // Filled in by runCurrentTest() and passed on in the TestCaseStats.
+        std::string redirectedCout;
+        std::string redirectedCerr;
+
+        auto const& testInfo = testCase.getTestCaseInfo();
+
+        m_reporter->testCaseStarting(testInfo);
+
+        m_activeTestCase = &testCase;
+
+        ITracker& rootTracker = m_trackerContext.startRun();
+        assert(rootTracker.isSectionTracker());
+        static_cast<SectionTracker&>(rootTracker).addInitialFilters(m_config->getSectionsToRun());
+        do {
+            m_trackerContext.startCycle();
+            m_testCaseTracker = &SectionTracker::acquire(m_trackerContext, TestCaseTracking::NameAndLocation(testInfo.name, testInfo.lineInfo));
+            runCurrentTest(redirectedCout, redirectedCerr);
+        } while (!m_testCaseTracker->isSuccessfullyCompleted() && !aborting());
+
+        Totals deltaTotals = m_totals.delta(prevTotals);
+        // A test case that is expected to fail but passed is re-counted as
+        // a failure.
+        if (testInfo.expectedToFail() && deltaTotals.testCases.passed > 0) {
+            deltaTotals.assertions.failed++;
+            deltaTotals.testCases.passed--;
+            deltaTotals.testCases.failed++;
+        }
+        m_totals.testCases += deltaTotals.testCases;
+        m_reporter->testCaseEnded(TestCaseStats(testInfo,
+                                  deltaTotals,
+                                  redirectedCout,
+                                  redirectedCerr,
+                                  aborting()));
+
+        // No test case is active once this function returns.
+        m_activeTestCase = nullptr;
+        m_testCaseTracker = nullptr;
+
+        return deltaTotals;
+    }
+
+    // Returns the config pointer passed to the constructor.
+    IConfigPtr RunContext::config() const {
+        return m_config;
+    }
+
+    // Returns a reference to the reporter owned by this run context.
+    IStreamingReporter& RunContext::reporter() const {
+        return *m_reporter;
+    }
+
+    // Updates the assertion counters for `result`, forwards it to the
+    // reporter and resets the per-assertion working state.
+    void RunContext::assertionEnded(AssertionResult const & result) {
+        const bool explicitPass = result.getResultType() == ResultWas::Ok;
+        const bool failure = !explicitPass && !result.isOk();
+
+        // Everything that is not a failure counts as "last assertion passed",
+        // but only an explicit Ok increments the passed counter.
+        m_lastAssertionPassed = !failure;
+        if (explicitPass) {
+            m_totals.assertions.passed++;
+        } else if (failure) {
+            if( m_activeTestCase->getTestCaseInfo().okToFail() ) {
+                m_totals.assertions.failedButOk++;
+            } else {
+                m_totals.assertions.failed++;
+            }
+        }
+
+        // We have no use for the return value (whether messages should be cleared), because messages were made scoped
+        // and should be let to clear themselves out.
+        static_cast<void>(m_reporter->assertionEnded(AssertionStats(result, m_messages, m_totals)));
+
+        const bool isWarning = result.getResultType() == ResultWas::Warning;
+        if (!isWarning) {
+            m_messageScopes.clear();
+        }
+
+        // Reset working state
+        resetAssertionInfo();
+        m_lastResult = result;
+    }
+    // Clears the macro name and replaces the captured expression with a
+    // placeholder text.
+    void RunContext::resetAssertionInfo() {
+        m_lastAssertionInfo.capturedExpression = "{Unknown expression after the reported line}"_sr;
+        m_lastAssertionInfo.macroName = StringRef();
+    }
+
+    // Acquires the tracker for the section. If it is open, the section is
+    // recorded as active, the reporter is notified, `assertions` receives
+    // the current assertion totals and true is returned; otherwise nothing
+    // else happens and false is returned.
+    bool RunContext::sectionStarted(SectionInfo const & sectionInfo, Counts & assertions) {
+        ITracker& tracker = SectionTracker::acquire(m_trackerContext, TestCaseTracking::NameAndLocation(sectionInfo.name, sectionInfo.lineInfo));
+        if (tracker.isOpen()) {
+            m_activeSections.push_back(&tracker);
+            m_lastAssertionInfo.lineInfo = sectionInfo.lineInfo;
+            m_reporter->sectionStarting(sectionInfo);
+            assertions = m_totals.assertions;
+            return true;
+        }
+        return false;
+    }
+    auto RunContext::acquireGeneratorTracker( StringRef generatorName, SourceLineInfo const& lineInfo ) -> IGeneratorTracker& {
+        using namespace Generators; // the tracker is keyed by (name copied into a std::string, source location)
+        auto& generatorTracker = GeneratorTracker::acquire(
+            m_trackerContext, TestCaseTracking::NameAndLocation( static_cast<std::string>(generatorName), lineInfo ) );
+        m_lastAssertionInfo.lineInfo = lineInfo;
+        return generatorTracker;
+    }
+
+    bool RunContext::testForMissingAssertions(Counts& assertions) {
+        // Nothing to flag if assertions ran, the warning is off, or the current tracker has children (checked in that order).
+        bool const nothingToFlag = assertions.total() != 0
+            || !m_config->warnAboutMissingAssertions()
+            || m_trackerContext.currentTracker().hasChildren();
+        if (nothingToFlag)
+            return false;
+        ++m_totals.assertions.failed; // the missing assertion counts as a failure in the run totals...
+        ++assertions.failed;          // ...and in the caller's counts
+        return true;
+    }
+
+    void RunContext::sectionEnded(SectionEndInfo const & endInfo) {
+        // Counts for this section = current totals minus the totals recorded in endInfo.
+        Counts sectionCounts = m_totals.assertions - endInfo.prevAssertions;
+        bool const nothingAsserted = testForMissingAssertions(sectionCounts);
+        if (!m_activeSections.empty()) {
+            m_activeSections.back()->close();
+            m_activeSections.pop_back();
+        }
+        SectionStats const stats(endInfo.sectionInfo, sectionCounts, endInfo.durationInSeconds, nothingAsserted);
+        m_reporter->sectionEnded(stats);
+        m_messages.clear();
+        m_messageScopes.clear();
+    }
+
+    void RunContext::sectionEndedEarly(SectionEndInfo const & endInfo) {
+        auto* abandoned = m_activeSections.back();
+        if (!m_unfinishedSections.empty()) // an unfinished section is already recorded: just close this one
+            abandoned->close();
+        else
+            abandoned->fail();
+        m_activeSections.pop_back();
+        m_unfinishedSections.push_back(endInfo); // torn down later by handleUnfinishedSections()
+    }
+
+#if defined(CATCH_CONFIG_ENABLE_BENCHMARKING)
+    void RunContext::benchmarkPreparing(std::string const& name) { // forwards the event to the reporter unchanged
+        m_reporter->benchmarkPreparing(name);
+    }
+    void RunContext::benchmarkStarting( BenchmarkInfo const& info ) { // forwards the event to the reporter unchanged
+        m_reporter->benchmarkStarting( info );
+    }
+    void RunContext::benchmarkEnded( BenchmarkStats<> const& stats ) { // forwards the event to the reporter unchanged
+        m_reporter->benchmarkEnded( stats );
+    }
+    void RunContext::benchmarkFailed(std::string const & error) { // forwards the event to the reporter unchanged
+        m_reporter->benchmarkFailed(error);
+    }
+#endif // CATCH_CONFIG_ENABLE_BENCHMARKING
+
+    void RunContext::pushScopedMessage(MessageInfo const & message) { // appends a copy of the message to m_messages
+        m_messages.push_back(message);
+    }
+
+    void RunContext::popScopedMessage(MessageInfo const & message) {
+        m_messages.erase(std::remove(m_messages.begin(), m_messages.end(), message), m_messages.end()); // erase-remove: drops every entry that compares equal to `message`
+    }
+
+    void RunContext::emplaceUnscopedMessage( MessageBuilder const& builder ) { // constructs a new m_messageScopes entry in place from the builder
+        m_messageScopes.emplace_back( builder );
+    }
+
+    std::string RunContext::getCurrentTestName() const {
+        if( !m_activeTestCase ) // no active test case: the name is empty
+            return std::string();
+        return m_activeTestCase->getTestCaseInfo().name;
+    }
+
+    const AssertionResult * RunContext::getLastResult() const {
+        return &(*m_lastResult); // NOTE(review): m_lastResult is dereferenced unconditionally — presumably only queried after an assertion has ended; confirm
+    }
+
+    void RunContext::exceptionEarlyReported() { // switches off the unexpected-exception report done in runCurrentTest()'s catch-all
+        m_shouldReportUnexpected = false;
+    }
+
+    void RunContext::handleFatalErrorCondition( StringRef message ) {
+        // First notify reporter that bad things happened
+        m_reporter->fatalErrorEncountered(message);
+
+        // Don't rebuild the result -- the stringification itself can cause more fatal errors
+        // Instead, fake a result data.
+        AssertionResultData tempResult( ResultWas::FatalErrorCondition, { false } );
+        tempResult.message = static_cast<std::string>(message);
+        AssertionResult result(m_lastAssertionInfo, tempResult);
+
+        assertionEnded(result); // route the synthetic result through the normal assertion-ended path
+
+        handleUnfinishedSections(); // end any sections recorded as unfinished
+
+        // Recreate section for test case (as we will lose the one that was in scope)
+        auto const& testCaseInfo = m_activeTestCase->getTestCaseInfo();
+        SectionInfo testCaseSection(testCaseInfo.lineInfo, testCaseInfo.name);
+
+        Counts assertions;
+        assertions.failed = 1;
+        SectionStats testCaseSectionStats(testCaseSection, assertions, 0, false); // duration 0, no missing-assertions flag
+        m_reporter->sectionEnded(testCaseSectionStats);
+
+        auto const& testInfo = m_activeTestCase->getTestCaseInfo();
+        // Emit the closing events (test case, group, run) by hand, recording a single failure.
+        Totals deltaTotals;
+        deltaTotals.testCases.failed = 1;
+        deltaTotals.assertions.failed = 1;
+        m_reporter->testCaseEnded(TestCaseStats(testInfo,
+                                  deltaTotals,
+                                  std::string(),
+                                  std::string(),
+                                  false));
+        m_totals.testCases.failed++;
+        testGroupEnded(std::string(), m_totals, 1, 1);
+        m_reporter->testRunEnded(TestRunStats(m_runInfo, m_totals, false));
+    }
+
+    bool RunContext::lastAssertionPassed() { // flag as last written by assertionEnded() / assertionPassed()
+         return m_lastAssertionPassed;
+    }
+
+    void RunContext::assertionPassed() {
+        ++m_totals.assertions.passed; // bookkeeping only: nothing is sent to the reporter from here
+        m_lastAssertionPassed = true;
+        resetAssertionInfo();
+        m_messageScopes.clear();
+    }
+
+    bool RunContext::aborting() const { // true once the failed-assertion count reaches the configured abortAfter() value
+        return m_totals.assertions.failed >= static_cast<std::size_t>(m_config->abortAfter()); // NOTE(review): a negative abortAfter() wraps to a huge size_t here — presumably how "never abort" is expressed; confirm
+    }
+
+    void RunContext::runCurrentTest(std::string & redirectedCout, std::string & redirectedCerr) {
+        auto const& testCaseInfo = m_activeTestCase->getTestCaseInfo();
+        SectionInfo testCaseSection(testCaseInfo.lineInfo, testCaseInfo.name); // the test case body is reported as a section of its own
+        m_reporter->sectionStarting(testCaseSection);
+        Counts prevAssertions = m_totals.assertions; // snapshot, used below to work out this run's assertion counts
+        double duration = 0; // only assigned when the invocation returns normally
+        m_shouldReportUnexpected = true;
+        m_lastAssertionInfo = { "TEST_CASE"_sr, testCaseInfo.lineInfo, StringRef(), ResultDisposition::Normal };
+
+        seedRng(*m_config);
+
+        Timer timer;
+        CATCH_TRY {
+            if (m_reporter->getPreferences().shouldRedirectStdOut) {
+#if !defined(CATCH_CONFIG_EXPERIMENTAL_REDIRECT)
+                RedirectedStreams redirectedStreams(redirectedCout, redirectedCerr);
+
+                timer.start();
+                invokeActiveTestCase();
+#else
+                OutputRedirect r(redirectedCout, redirectedCerr);
+                timer.start();
+                invokeActiveTestCase();
+#endif
+            } else {
+                timer.start();
+                invokeActiveTestCase();
+            }
+            duration = timer.getElapsedSeconds();
+        } CATCH_CATCH_ANON (TestFailureException&) {
+            // This just means the test was aborted due to failure
+        } CATCH_CATCH_ALL {
+            // Under CATCH_CONFIG_FAST_COMPILE, unexpected exceptions under REQUIRE assertions
+            // are reported without translation at the point of origin.
+            if( m_shouldReportUnexpected ) {
+                AssertionReaction dummyReaction; // the reaction is filled in but not acted upon here
+                handleUnexpectedInflightException( m_lastAssertionInfo, translateActiveException(), dummyReaction );
+            }
+        }
+        Counts assertions = m_totals.assertions - prevAssertions;
+        bool missingAssertions = testForMissingAssertions(assertions);
+
+        m_testCaseTracker->close();
+        handleUnfinishedSections(); // end sections that were left early during the invocation
+        m_messages.clear();
+        m_messageScopes.clear();
+
+        SectionStats testCaseSectionStats(testCaseSection, assertions, duration, missingAssertions);
+        m_reporter->sectionEnded(testCaseSectionStats);
+    }
+
+    void RunContext::invokeActiveTestCase() {
+        FatalConditionHandler fatalConditionHandler; // Handle signals
+        m_activeTestCase->invoke();
+        fatalConditionHandler.reset(); // explicit reset, reached only when invoke() returns normally
+    }
+
+    void RunContext::handleUnfinishedSections() {
+        // Sections cut short by an exception were recorded so they can be ended here, outside the unwind, newest first.
+        auto cursor = m_unfinishedSections.rbegin();
+        auto const stop = m_unfinishedSections.rend();
+        while( cursor != stop ) {
+            sectionEnded( *cursor );
+            ++cursor;
+        }
+        m_unfinishedSections.clear();
+    }
+
+    void RunContext::handleExpr(
+        AssertionInfo const& info,
+        ITransientExpression const& expr,
+        AssertionReaction& reaction
+    ) {
+        m_reporter->assertionStarting( info );
+
+        // The assertion passes when the expression's value differs from the "false test" flag.
+        bool const expectFalse = isFalseTest( info.resultDisposition );
+        bool const passed = ( expr.getResult() != expectFalse );
+
+        if( !passed ) {
+            reportExpr( info, ResultWas::ExpressionFailed, &expr, expectFalse );
+            populateReaction( reaction );
+            return;
+        }
+
+        // A passing assertion is only reported in full when successful results are included.
+        if( m_includeSuccessfulResults )
+            reportExpr( info, ResultWas::Ok, &expr, expectFalse );
+        else
+            assertionPassed();
+    }
+    void RunContext::reportExpr(
+            AssertionInfo const &info,
+            ResultWas::OfType resultType,
+            ITransientExpression const *expr,
+            bool negated ) {
+        // Builds the result, attaches the expression pointer after construction, then ends the assertion.
+        m_lastAssertionInfo = info;
+
+        AssertionResultData resultData( resultType, LazyExpression( negated ) );
+        AssertionResult outcome{ info, resultData };
+        outcome.m_resultData.lazyExpression.m_transientExpression = expr;
+
+        assertionEnded( outcome );
+    }
+
+    void RunContext::handleMessage(
+            AssertionInfo const& info,
+            ResultWas::OfType resultType,
+            StringRef const& message,
+            AssertionReaction& reaction
+    ) {
+        m_reporter->assertionStarting( info );
+        m_lastAssertionInfo = info;
+        // Message-only result: no expression, just the text supplied by the caller.
+        AssertionResultData messageData( resultType, LazyExpression( false ) );
+        messageData.message = static_cast<std::string>(message);
+
+        AssertionResult outcome{ m_lastAssertionInfo, messageData };
+        assertionEnded( outcome );
+        if( !outcome.isOk() )
+            populateReaction( reaction );
+    }
+    void RunContext::handleUnexpectedExceptionNotThrown(
+            AssertionInfo const& info,
+            AssertionReaction& reaction
+    ) { // reported as a DidntThrowException result through handleNonExpr()
+        handleNonExpr(info, Catch::ResultWas::DidntThrowException, reaction);
+    }
+
+    void RunContext::handleUnexpectedInflightException(
+            AssertionInfo const& info,
+            std::string const& message,
+            AssertionReaction& reaction
+    ) {
+        m_lastAssertionInfo = info;
+        // Recorded as a ThrewException result carrying the supplied message.
+        AssertionResultData thrownData( ResultWas::ThrewException, LazyExpression( false ) );
+        thrownData.message = message;
+        AssertionResult outcome{ info, thrownData };
+        assertionEnded( outcome );
+        populateReaction( reaction ); // unconditional: unlike handleNonExpr(), there is no isOk() check here
+    }
+
+    void RunContext::populateReaction( AssertionReaction& reaction ) {
+        reaction.shouldDebugBreak = m_config->shouldDebugBreak();
+        reaction.shouldThrow = aborting() || (m_lastAssertionInfo.resultDisposition & ResultDisposition::Normal); // throw when aborting, or when the last assertion's disposition has the Normal bit set
+    }
+
+    void RunContext::handleIncomplete(
+            AssertionInfo const& info
+    ) {
+        // Reports a ThrewException result with a fixed explanatory message.
+        m_lastAssertionInfo = info;
+        AssertionResultData incomplete( ResultWas::ThrewException, LazyExpression( false ) );
+        incomplete.message = "Exception translation was disabled by CATCH_CONFIG_FAST_COMPILE";
+        AssertionResult outcome{ info, incomplete };
+        assertionEnded( outcome );
+    }
+    void RunContext::handleNonExpr(
+            AssertionInfo const &info,
+            ResultWas::OfType resultType,
+            AssertionReaction &reaction
+    ) {
+        m_lastAssertionInfo = info;
+        // No expression and no message: the result type alone describes the outcome.
+        AssertionResultData bareData( resultType, LazyExpression( false ) );
+        AssertionResult outcome{ info, bareData };
+        assertionEnded( outcome );
+        if( outcome.isOk() )
+            return;
+        populateReaction( reaction );
+    }
+
+    IResultCapture& getResultCapture() {
+        auto* capture = getCurrentContext().getResultCapture();
+        if( !capture ) // the current context has no result capture registered
+            CATCH_INTERNAL_ERROR("No result capture instance");
+        return *capture;
+    }
+
+    void seedRng(IConfig const& config) {
+        if (config.rngSeed() == 0) // a seed of 0 leaves both generators untouched
+            return;
+        std::srand(config.rngSeed());
+        rng().seed(config.rngSeed());
+    }
+
+    unsigned int rngSeed() { // seed as reported by the config of the current context
+        return getCurrentContext().getConfig()->rngSeed();
+    }
+
+}
+// end catch_run_context.cpp
+// start catch_section.cpp
+
+namespace Catch {
+
+    Section::Section( SectionInfo const& info )
+    :   m_info( info ),
+        m_sectionIncluded( getResultCapture().sectionStarted( m_info, m_assertions ) ) // the result capture decides whether this section runs; it may also write m_assertions
+    {
+        m_timer.start(); // started whether or not the section is included
+    }
+
+    Section::~Section() {
+        if( !m_sectionIncluded ) // the section was not entered, so there is nothing to report
+            return;
+        SectionEndInfo endInfo{ m_info, m_assertions, m_timer.getElapsedSeconds() };
+        if( !uncaught_exceptions() )
+            getResultCapture().sectionEnded( endInfo );
+        else // leaving while an exception is in flight
+            getResultCapture().sectionEndedEarly( endInfo );
+    }
+
+    // Indicates whether the section should be executed: the value sectionStarted() returned at construction
+    Section::operator bool() const {
+        return m_sectionIncluded;
+    }
+
+} // end namespace Catch
+// end catch_section.cpp
+// start catch_section_info.cpp
+
+namespace Catch {
+
+    SectionInfo::SectionInfo
+        (   SourceLineInfo const& _lineInfo,
+            std::string const& _name )
+    :   name( _name ),          // note: the location comes first in the parameters, the name first in the initialisers
+        lineInfo( _lineInfo )
+    {}
+
+} // end namespace Catch
+// end catch_section_info.cpp
+// start catch_session.cpp
+
+// start catch_session.h
+
+#include <memory>
+
+namespace Catch {
+
+    class Session : NonCopyable {
+    public:
+        // Construction and teardown; both are defined out of line.
+        Session();
+        ~Session() override;
+
+        void showHelp() const;
+        void libIdentify();
+        // Parses the arguments into the session's config data; the result is 0 on success.
+        int applyCommandLine( int argc, char const * const * argv );
+    #if defined(CATCH_CONFIG_WCHAR) && defined(_WIN32) && defined(UNICODE)
+        int applyCommandLine( int argc, wchar_t const * const * argv );
+    #endif
+
+        void useConfigData( ConfigData const& configData );
+        // Convenience entry point: applies the command line and, only if that returned 0, runs the tests.
+        template<typename CharT>
+        int run(int argc, CharT const * const argv[]) {
+            if (m_startupExceptions)
+                return 1;
+            int const parseCode = applyCommandLine(argc, argv);
+            if (parseCode != 0)
+                return parseCode;
+            return run();
+        }
+
+        int run();
+
+        clara::Parser const& cli() const;
+        void cli( clara::Parser const& newParser );
+        ConfigData& configData();
+        Config& config();
+    private:
+        int runInternal();
+
+        clara::Parser m_cli;
+        ConfigData m_configData;
+        std::shared_ptr<Config> m_config;
+        bool m_startupExceptions = false;
+    };
+
+} // end namespace Catch
+
+// end catch_session.h
+// start catch_version.h
+
+#include <iosfwd>
+
+namespace Catch {
+
+    // Versioning information: an immutable, non-copyable record of the library version
+    struct Version {
+        Version( Version const& ) = delete; // copying is disabled
+        Version& operator=( Version const& ) = delete;
+        Version(    unsigned int _majorVersion,
+                    unsigned int _minorVersion,
+                    unsigned int _patchNumber,
+                    char const * const _branchName,
+                    unsigned int _buildNumber );
+
+        unsigned int const majorVersion;
+        unsigned int const minorVersion;
+        unsigned int const patchNumber;
+
+        // buildNumber is only used if branchName is not null
+        char const * const branchName;
+        unsigned int const buildNumber;
+
+        friend std::ostream& operator << ( std::ostream& os, Version const& version ); // stream output; defined elsewhere
+    };
+
+    Version const& libraryVersion();
+}
+
+// end catch_version.h
+#include <cstdlib>
+#include <iomanip>
+#include <set>
+#include <iterator>
+
+namespace Catch {
+
+    namespace {
+        const int MaxExitCode = 255; // upper bound for the process exit code; also returned on command-line and run errors
+
+        IStreamingReporterPtr createReporter(std::string const& reporterName, IConfigPtr const& config) { // looks the reporter up by name in the reporter registry
+            auto reporter = Catch::getRegistryHub().getReporterRegistry().create(reporterName, config);
+            CATCH_ENFORCE(reporter, "No reporter registered with name: '" << reporterName << "'"); // an unknown reporter name is an error
+
+            return reporter;
+        }
+
+        IStreamingReporterPtr makeReporter(std::shared_ptr<Config> const& config) {
+            // Without registered listeners the configured reporter is used directly.
+            if (Catch::getRegistryHub().getReporterRegistry().getListeners().empty())
+                return createReporter(config->getReporterName(), config);
+
+            // The composite is held through the interface pointer type from the start:
+            // on older platforms returning std::unique_ptr<ListeningReporter> where
+            // std::unique_ptr<IStreamingReporter> is expected needs a std::move call,
+            // which in turn causes a warning on newer platforms. Hence the manual downcast.
+            std::unique_ptr<IStreamingReporter> composite(new ListeningReporter);
+            auto& fanOut = static_cast<ListeningReporter&>(*composite);
+
+            // Listeners are added first, the configured reporter last.
+            for (auto const& factory : Catch::getRegistryHub().getReporterRegistry().getListeners())
+                fanOut.addListener(factory->create(Catch::ReporterConfig(config)));
+
+            fanOut.addReporter(createReporter(config->getReporterName(), config));
+            return composite;
+        }
+
+        class TestGroup {
+        public:
+            // Builds the run context and selects the tests to run from the config's test spec.
+            explicit TestGroup(std::shared_ptr<Config> const& config)
+            : m_config{config}
+            , m_context{config, makeReporter(config)}
+            {
+                auto const& allTestCases = getAllTestCasesSorted(*m_config);
+                m_matches = m_config->testSpec().matchesByFilter(allTestCases, *m_config);
+                auto const& rejectedArgs = m_config->testSpec().getInvalidArgs();
+                if (!m_matches.empty() || !rejectedArgs.empty()) {
+                    for (auto const& filterMatch : m_matches)
+                        m_tests.insert(filterMatch.tests.begin(), filterMatch.tests.end());
+                    return;
+                }
+                // No filter matches and no invalid arguments: take every test that is not hidden.
+                for (auto const& candidate : allTestCases) {
+                    if (!candidate.isHidden())
+                        m_tests.emplace(&candidate);
+                }
+            }
+
+            // Runs the selected tests and returns the accumulated totals.
+            Totals execute() {
+                auto const& rejectedArgs = m_config->testSpec().getInvalidArgs();
+                Totals groupTotals;
+                m_context.testGroupStarting(m_config->name(), 1, 1);
+                for (auto const& selected : m_tests) {
+                    if (m_context.aborting()) {
+                        m_context.reporter().skipTest(*selected); // already aborting: the test is reported as skipped
+                        continue;
+                    }
+                    groupTotals += m_context.runTest(*selected);
+                }
+                // A filter that selected no tests is reported and flagged through `error`.
+                for (auto const& filterMatch : m_matches) {
+                    if (!filterMatch.tests.empty())
+                        continue;
+                    m_context.reporter().noMatchingTestCases(filterMatch.name);
+                    groupTotals.error = -1;
+                }
+                for (auto const& rejected : rejectedArgs)
+                    m_context.reporter().reportInvalidArguments(rejected);
+                m_context.testGroupEnded(m_config->name(), groupTotals, 1, 1);
+                return groupTotals;
+            }
+
+        private:
+            using Tests = std::set<TestCase const*>;
+
+            std::shared_ptr<Config> m_config;
+            RunContext m_context;
+            Tests m_tests;
+            TestSpec::Matches m_matches;
+        };
+
+        // Adds a tag derived from each test's source file: the base name without its extension, prefixed with '#'.
+        void applyFilenamesAsTags(Catch::IConfig const& config) {
+            auto& tests = const_cast<std::vector<TestCase>&>(getAllTestCasesSorted(config));
+            for (auto& testCase : tests) {
+                auto tags = testCase.tags;
+                std::string filename = testCase.lineInfo.file;
+                // Drop the directory part, if any (both '/' and '\\' count as separators).
+                auto lastSlash = filename.find_last_of("\\/");
+                if (lastSlash != std::string::npos)
+                    filename.erase(0, lastSlash + 1);
+                // Drop the extension (everything from the last '.').
+                auto lastDot = filename.find_last_of('.');
+                if (lastDot != std::string::npos)
+                    filename.erase(lastDot);
+
+                // Always add the '#' marker; previously a file name without any path separator was left without it.
+                filename.insert(0, "#");
+                tags.push_back(std::move(filename));
+                setTags(testCase, tags);
+            }
+        }
+
+    } // anon namespace
+
+    Session::Session() {
+        static bool alreadyInstantiated = false; // function-local static: shared by every Session constructed in the process
+        if( alreadyInstantiated ) {
+            CATCH_TRY { CATCH_INTERNAL_ERROR( "Only one instance of Catch::Session can ever be used" ); }
+            CATCH_CATCH_ALL { getMutableRegistryHub().registerStartupException(); } // recorded as a startup exception instead of escaping the constructor
+        }
+
+        // There cannot be exceptions at startup in no-exception mode.
+#if !defined(CATCH_CONFIG_DISABLE_EXCEPTIONS)
+        const auto& exceptions = getRegistryHub().getStartupExceptionRegistry().getExceptions();
+        if ( !exceptions.empty() ) {
+            config();
+            getCurrentMutableContext().setConfig(m_config);
+
+            m_startupExceptions = true; // makes applyCommandLine() and the run functions return 1 straight away
+            Colour colourGuard( Colour::Red );
+            Catch::cerr() << "Errors occurred during startup!" << '\n';
+            // iterate over all exceptions and notify user
+            for ( const auto& ex_ptr : exceptions ) {
+                try {
+                    std::rethrow_exception(ex_ptr);
+                } catch ( std::exception const& ex ) { // NOTE(review): only std::exception-derived errors are caught; anything else would propagate out of the constructor
+                    Catch::cerr() << Column( ex.what() ).indent(2) << '\n';
+                }
+            }
+        }
+#endif
+
+        alreadyInstantiated = true;
+        m_cli = makeCommandLineParser( m_configData );
+    }
+    Session::~Session() { // runs Catch::cleanUp() when the session is destroyed
+        Catch::cleanUp();
+    }
+
+    void Session::showHelp() const { // writes the version banner and the command-line parser's own output to Catch::cout()
+        Catch::cout()
+                << "\nCatch v" << libraryVersion() << "\n"
+                << m_cli << std::endl
+                << "For more detailed usage please see the project docs\n" << std::endl;
+    }
+    void Session::libIdentify() { // writes four "key: value" lines, keys left-aligned in a 16-character column
+        Catch::cout()
+                << std::left << std::setw(16) << "description: " << "A Catch2 test executable\n"
+                << std::left << std::setw(16) << "category: " << "testframework\n"
+                << std::left << std::setw(16) << "framework: " << "Catch Test\n"
+                << std::left << std::setw(16) << "version: " << libraryVersion() << std::endl;
+    }
+
+    int Session::applyCommandLine( int argc, char const * const * argv ) {
+        if( m_startupExceptions )
+            return 1;
+
+        auto parsed = m_cli.parse( clara::Args( argc, argv ) );
+        if( parsed ) {
+            if( m_configData.showHelp )
+                showHelp();
+            if( m_configData.libIdentify )
+                libIdentify();
+            m_config.reset(); // drop any cached Config so config() rebuilds it from the parsed data
+            return 0;
+        }
+        // Parse failure: make a config current before reporting the error.
+        config();
+        getCurrentMutableContext().setConfig(m_config);
+        Catch::cerr()
+            << Colour( Colour::Red )
+            << "\nError(s) in input:\n"
+            << Column( parsed.errorMessage() ).indent( 2 )
+            << "\n\n";
+        Catch::cerr() << "Run with -? for usage\n" << std::endl;
+        return MaxExitCode;
+    }
+
+#if defined(CATCH_CONFIG_WCHAR) && defined(_WIN32) && defined(UNICODE)
+    int Session::applyCommandLine( int argc, wchar_t const * const * argv ) {
+        // Converts each wide argument to UTF-8 and forwards to the narrow overload.
+        // Storage is owned by standard containers, so nothing leaks if the narrow
+        // overload throws, and a failed conversion yields an empty, still
+        // NUL-terminated argument instead of an unterminated buffer.
+        std::vector<std::string> utf8Args( argc > 0 ? static_cast<std::size_t>( argc ) : 0 );
+        std::vector<char const*> utf8Argv;
+        utf8Argv.reserve( utf8Args.size() );
+
+        for ( std::size_t i = 0; i < utf8Args.size(); ++i ) {
+            // With a length of -1 the reported size includes the terminating NUL; 0 signals failure.
+            int const bufSize = WideCharToMultiByte( CP_UTF8, 0, argv[i], -1, nullptr, 0, nullptr, nullptr );
+            if ( bufSize > 0 ) {
+                utf8Args[i].resize( static_cast<std::size_t>( bufSize ) );
+                WideCharToMultiByte( CP_UTF8, 0, argv[i], -1, &utf8Args[i][0], bufSize, nullptr, nullptr );
+            }
+            utf8Argv.push_back( utf8Args[i].c_str() );
+        }
+
+        return applyCommandLine( argc, utf8Argv.data() );
+    }
+#endif
+
+    void Session::useConfigData( ConfigData const& configData ) {
+        m_configData = configData;
+        m_config.reset(); // drop the cached Config so config() rebuilds it from the new data
+    }
+
+    int Session::run() {
+        if( ( m_configData.waitForKeypress & WaitForKeypress::BeforeStart ) != 0 ) { // waitForKeypress is tested as a bit mask
+            Catch::cout() << "...waiting for enter/ return before starting" << std::endl;
+            static_cast<void>(std::getchar()); // blocks for one character from stdin; the value itself is ignored
+        }
+        int exitCode = runInternal();
+        if( ( m_configData.waitForKeypress & WaitForKeypress::BeforeExit ) != 0 ) {
+            Catch::cout() << "...waiting for enter/ return before exiting, with code: " << exitCode << std::endl;
+            static_cast<void>(std::getchar());
+        }
+        return exitCode;
+    }
+
+    clara::Parser const& Session::cli() const { // read access to the command-line parser
+        return m_cli;
+    }
+    void Session::cli( clara::Parser const& newParser ) { // replaces the command-line parser with a copy of newParser
+        m_cli = newParser;
+    }
+    ConfigData& Session::configData() { // mutable access to the raw config data
+        return m_configData;
+    }
+    Config& Session::config() {
+        if( !m_config ) // built on first use from the current m_configData
+            m_config = std::make_shared<Config>( m_configData );
+        return *m_config;
+    }
+
+    int Session::runInternal() {
+        if( m_startupExceptions )
+            return 1;
+
+        if (m_configData.showHelp || m_configData.libIdentify) { // help / identify requests do not run any tests
+            return 0;
+        }
+
+        CATCH_TRY {
+            config(); // Force config to be constructed
+
+            seedRng( *m_config );
+
+            if( m_configData.filenamesAsTags )
+                applyFilenamesAsTags( *m_config );
+
+            // Handle list request
+            if( Option<std::size_t> listed = list( m_config ) ) // when list() yields a value, it becomes the return code and no tests run
+                return static_cast<int>( *listed );
+
+            TestGroup tests { m_config };
+            auto const totals = tests.execute();
+
+            if( m_config->warnAboutNoTests() && totals.error == -1 ) // error == -1 is set by TestGroup::execute() for a filter that matched no tests
+                return 2;
+
+            // Note that on unices only the lower 8 bits are usually used, clamping
+            // the return value to 255 prevents false negative when some multiple
+            // of 256 tests has failed
+            return (std::min) (MaxExitCode, (std::max) (totals.error, static_cast<int>(totals.assertions.failed)));
+        }
+#if !defined(CATCH_CONFIG_DISABLE_EXCEPTIONS)
+        catch( std::exception& ex ) {
+            Catch::cerr() << ex.what() << std::endl;
+            return MaxExitCode;
+        }
+#endif
+    }
+
+} // end namespace Catch
+// end catch_session.cpp
+// start catch_singletons.cpp
+
+#include <vector>
+
+namespace Catch {
+
+    namespace {
+        static auto getSingletons() -> std::vector<ISingleton*>*& { // returns a reference to the pointer itself, so callers can reset it
+            static std::vector<ISingleton*>* g_singletons = nullptr;
+            if( !g_singletons ) // allocated on first use, and again after cleanupSingletons() has nulled it
+                g_singletons = new std::vector<ISingleton*>();
+            return g_singletons;
+        }
+    }
+
+    ISingleton::~ISingleton() {} // out-of-line empty destructor
+
+    void addSingleton(ISingleton* singleton ) { // registers the pointer; cleanupSingletons() later deletes it
+        getSingletons()->push_back( singleton );
+    }
+    void cleanupSingletons() {
+        std::vector<ISingleton*>*& registry = getSingletons();
+        for( ISingleton* entry : *registry ) // every registered singleton is deleted...
+            delete entry;
+        delete registry;                     // ...then the registry vector itself
+        registry = nullptr;
+    }
+
+} // namespace Catch
+// end catch_singletons.cpp
+// start catch_startup_exception_registry.cpp
+
+#if !defined(CATCH_CONFIG_DISABLE_EXCEPTIONS)
+namespace Catch {
+void StartupExceptionRegistry::add( std::exception_ptr const& exception ) noexcept {
+        CATCH_TRY {
+            m_exceptions.push_back(exception); // stored for later retrieval through getExceptions()
+        } CATCH_CATCH_ALL {
+            // If we run out of memory during start-up there's really not a lot more we can do about it
+            std::terminate();
+        }
+    }
+
+    std::vector<std::exception_ptr> const& StartupExceptionRegistry::getExceptions() const noexcept { // read-only view of the stored exceptions
+        return m_exceptions;
+    }
+
+} // end namespace Catch
+#endif
+// end catch_startup_exception_registry.cpp
+// start catch_stream.cpp
+
+#include <cstdio>
+#include <iostream>
+#include <fstream>
+#include <sstream>
+#include <vector>
+#include <memory>
+
+namespace Catch {
+
+    Catch::IStream::~IStream() = default; // defaulted destructor, defined out of line
+
+    namespace Detail { namespace {
+        template<typename WriterF, std::size_t bufferSize=256>
+        class StreamBufImpl : public std::streambuf { // buffers output in `data` and hands it to a WriterF as std::string chunks
+            char data[bufferSize];
+            WriterF m_writer;
+
+        public:
+            StreamBufImpl() {
+                setp( data, data + sizeof(data) ); // the whole array is the put area
+            }
+
+            ~StreamBufImpl() noexcept {
+                StreamBufImpl::sync(); // qualified call: flushes whatever is still buffered
+            }
+
+        private:
+            int overflow( int c ) override {
+                sync(); // flush the buffered characters first
+
+                if( c != EOF ) {
+                    if( pbase() == epptr() ) // zero-sized put area: pass the character straight to the writer
+                        m_writer( std::string( 1, static_cast<char>( c ) ) );
+                    else
+                        sputc( static_cast<char>( c ) );
+                }
+                return 0;
+            }
+
+            int sync() override {
+                if( pbase() != pptr() ) { // only write when something is buffered
+                    m_writer( std::string( pbase(), static_cast<std::string::size_type>( pptr() - pbase() ) ) );
+                    setp( pbase(), epptr() ); // reset the put pointer to the start of the buffer
+                }
+                return 0;
+            }
+        };
+
+        ///////////////////////////////////////////////////////////////////////////
+
+        struct OutputDebugWriter {
+            // Writer functor: forwards each string to writeToDebugConsole()
+            void operator()( std::string const&str ) {
+                writeToDebugConsole( str );
+            }
+        };
+
+        ///////////////////////////////////////////////////////////////////////////
+
+        class FileStream : public IStream { // IStream backed by a std::ofstream opened on the given file name
+            mutable std::ofstream m_ofs; // mutable so the const stream() accessor can return a non-const reference
+        public:
+            FileStream( StringRef filename ) {
+                m_ofs.open( filename.c_str() );
+                CATCH_ENFORCE( !m_ofs.fail(), "Unable to open file: '" << filename << "'" ); // failure to open is an error
+            }
+            ~FileStream() override = default;
+        public: // IStream
+            std::ostream& stream() const override {
+                return m_ofs;
+            }
+        };
+
+        ///////////////////////////////////////////////////////////////////////////
+
+        class CoutStream : public IStream { // IStream writing through the stream buffer Catch::cout() had at construction
+            mutable std::ostream m_os; // mutable so the const stream() accessor can return a non-const reference
+        public:
+            // Store the streambuf from cout up-front because
+            // cout may get redirected when running tests
+            CoutStream() : m_os( Catch::cout().rdbuf() ) {}
+            ~CoutStream() override = default;
+
+        public: // IStream
+            std::ostream& stream() const override { return m_os; }
+        };
+
+        ///////////////////////////////////////////////////////////////////////////
+
+        class DebugOutStream : public IStream { // IStream whose output goes through a StreamBufImpl<OutputDebugWriter>
+            std::unique_ptr<StreamBufImpl<OutputDebugWriter>> m_streamBuf; // declared before m_os, which is constructed from it
+            mutable std::ostream m_os;
+        public:
+            DebugOutStream()
+            :   m_streamBuf( new StreamBufImpl<OutputDebugWriter>() ),
+                m_os( m_streamBuf.get() )
+            {}
+
+            ~DebugOutStream() override = default;
+
+        public: // IStream
+            std::ostream& stream() const override { return m_os; }
+        };
+
+    }} // namespace anon::detail
+
+    ///////////////////////////////////////////////////////////////////////////
+
+    auto makeStream( StringRef const &filename ) -> IStream const* {
+        // An empty name selects the cout-backed stream.
+        if( filename.empty() )
+            return new Detail::CoutStream();
+        // Anything not starting with '%' is treated as a file name.
+        if( filename[0] != '%' )
+            return new Detail::FileStream( filename );
+        // '%'-prefixed names are special streams; "%debug" is the only one recognised.
+        if( filename == "%debug" )
+            return new Detail::DebugOutStream();
+        CATCH_ERROR( "Unrecognised stream: '" << filename << "'" );
+    }
+
+    // A pool of std::ostringstream objects that are handed out by index and can be reused.
+    struct StringStreams {
+        std::vector<std::unique_ptr<std::ostringstream>> m_streams;
+        std::vector<std::size_t> m_unused; // Indices into m_streams that are free for reuse
+        std::ostringstream m_referenceStream; // Untouched stream whose formatting state is copied on release
+
+        // Returns the index of a stream to use: the most recently released one
+        // if there is any, otherwise a newly created stream appended to the pool.
+        auto add() -> std::size_t {
+            if( !m_unused.empty() ) {
+                std::size_t const recycled = m_unused.back();
+                m_unused.pop_back();
+                return recycled;
+            }
+
+            std::unique_ptr<std::ostringstream> fresh( new std::ostringstream );
+            m_streams.push_back( std::move( fresh ) );
+            return m_streams.size()-1;
+        }
+
+        // Marks stream `index` as available again after resetting its formatting state.
+        void release( std::size_t index ) {
+            m_streams[index]->copyfmt( m_referenceStream ); // Restore initial flags and other state
+            m_unused.push_back( index );
+        }
+    };
+
+    // Takes a stream from the shared StringStreams pool and remembers its index.
+    // NOTE(review): m_oss is initialised from m_index, so this relies on m_index
+    // being declared before m_oss in the class - confirm against the header.
+    ReusableStringStream::ReusableStringStream()
+    : m_index( Singleton<StringStreams>::getMutable().add() )
+    , m_oss( Singleton<StringStreams>::getMutable().m_streams[m_index].get() )
+    {}
+
+    // Empties the borrowed stream, clears its state bits and hands it back to the pool.
+    ReusableStringStream::~ReusableStringStream() {
+        auto* const pooled = static_cast<std::ostringstream*>( m_oss );
+        pooled->str("");
+        m_oss->clear();
+
+        Singleton<StringStreams>::getMutable().release( m_index );
+    }
+
+    // Returns a copy of the text written to the stream so far.
+    auto ReusableStringStream::str() const -> std::string {
+        auto const* const pooled = static_cast<std::ostringstream*>( m_oss );
+        return pooled->str();
+    }
+
+    ///////////////////////////////////////////////////////////////////////////
+
+#ifndef CATCH_CONFIG_NOSTDOUT // If you #define this you must implement these functions
+    // Default implementations: each one forwards to the standard stream of the same name.
+    std::ostream& cout() {
+        return std::cout;
+    }
+    std::ostream& cerr() {
+        return std::cerr;
+    }
+    std::ostream& clog() {
+        return std::clog;
+    }
+#endif
+}
+// end catch_stream.cpp
+// start catch_string_manip.cpp
+
+#include <algorithm>
+#include <ostream>
+#include <cstring>
+#include <cctype>
+#include <vector>
+
+namespace Catch {
+
+    namespace {
+        // Lower-cases a single character. The value is converted to unsigned char
+        // first, so std::tolower never receives a negative char value.
+        char toLowerCh(char c) {
+            auto const asUnsigned = static_cast<unsigned char>( c );
+            return static_cast<char>( std::tolower( asUnsigned ) );
+        }
+    }
+
+    // True when `s` begins with `prefix`; an empty prefix always matches.
+    bool startsWith( std::string const& s, std::string const& prefix ) {
+        if( prefix.size() > s.size() )
+            return false;
+        return std::equal( prefix.begin(), prefix.end(), s.begin() );
+    }
+    // True when `s` is non-empty and its first character is `prefix`.
+    bool startsWith( std::string const& s, char prefix ) {
+        if( s.empty() )
+            return false;
+        return s.front() == prefix;
+    }
+    // True when `s` ends with `suffix`; an empty suffix always matches.
+    bool endsWith( std::string const& s, std::string const& suffix ) {
+        if( suffix.size() > s.size() )
+            return false;
+        // Compare back to front so no offset arithmetic is needed
+        return std::equal( suffix.rbegin(), suffix.rend(), s.rbegin() );
+    }
+    // True when `s` is non-empty and its last character is `suffix`.
+    bool endsWith( std::string const& s, char suffix ) {
+        if( s.empty() )
+            return false;
+        return s.back() == suffix;
+    }
+    // True when `infix` occurs anywhere in `s`.
+    bool contains( std::string const& s, std::string const& infix ) {
+        std::string::size_type const where = s.find( infix );
+        return where != std::string::npos;
+    }
+    // Lower-cases every character of `s` in place using toLowerCh.
+    void toLowerInPlace( std::string& s ) {
+        for( char& ch : s )
+            ch = toLowerCh( ch );
+    }
+    // Returns a lower-cased copy of `s`; the argument itself is left untouched.
+    std::string toLower( std::string const& s ) {
+        std::string lowered( s );
+        toLowerInPlace( lowered );
+        return lowered;
+    }
+    // Returns `str` without leading and trailing '\n', '\r', '\t' and ' ' characters.
+    std::string trim( std::string const& str ) {
+        static char const* whitespaceChars = "\n\r\t ";
+
+        auto const first = str.find_first_not_of( whitespaceChars );
+        if( first == std::string::npos )
+            return std::string(); // empty, or nothing but whitespace
+
+        auto const last = str.find_last_not_of( whitespaceChars );
+        return str.substr( first, 1 + last - first );
+    }
+
+    // StringRef flavour of trim: returns the sub-range of `ref` that remains after
+    // skipping leading and trailing ' ', '\t', '\n' and '\r' characters.
+    StringRef trim(StringRef ref) {
+        auto const isWhitespace = [](char c) {
+            switch( c ) {
+                case ' ':
+                case '\t':
+                case '\n':
+                case '\r':
+                    return true;
+                default:
+                    return false;
+            }
+        };
+
+        size_t const length = ref.size();
+
+        size_t first = 0;
+        for( ; first < length && isWhitespace( ref[first] ); ++first ) {}
+
+        // onePastLast never moves below first, so an all-whitespace input yields size 0
+        size_t onePastLast = length;
+        for( ; onePastLast > first && isWhitespace( ref[onePastLast - 1] ); --onePastLast ) {}
+
+        return ref.substr( first, onePastLast - first );
+    }
+
+    // Replaces every non-overlapping occurrence of `replaceThis` in `str` with
+    // `withThis`, scanning left to right. Text inserted by a replacement is never
+    // searched again. Returns true if at least one replacement was made.
+    //
+    // An empty `replaceThis` matches at every position; the loop used to keep
+    // inserting `withThis` forever in that case, so it is now treated as
+    // "nothing to replace" and `str` is left unchanged.
+    bool replaceInPlace( std::string& str, std::string const& replaceThis, std::string const& withThis ) {
+        if( replaceThis.empty() )
+            return false;
+
+        bool replaced = false;
+        std::size_t i = str.find( replaceThis );
+        while( i != std::string::npos ) {
+            replaced = true;
+            // Splice in place rather than rebuilding the whole string from substrings
+            str.replace( i, replaceThis.size(), withThis );
+            // Continue after the inserted text; find() returns npos when that
+            // position is at or beyond the end of the string
+            i = str.find( replaceThis, i + withThis.size() );
+        }
+        return replaced;
+    }
+
+    // Splits `str` at every `delimiter` and returns the non-empty pieces in order.
+    // Each piece is produced by str.substr(), i.e. it refers to the characters of
+    // `str` rather than owning a copy of them.
+    std::vector<StringRef> splitStringRef( StringRef str, char delimiter ) {
+        std::vector<StringRef> subStrings;
+        std::size_t start = 0;
+        for(std::size_t pos = 0; pos < str.size(); ++pos ) {
+            if( str[pos] == delimiter ) {
+                // Skip empty pieces only. The earlier `pos - start > 1` test also threw
+                // away one-character pieces ("a,bc" lost "a"), even though a
+                // one-character piece after the last delimiter was always kept.
+                if( pos > start )
+                    subStrings.push_back( str.substr( start, pos-start ) );
+                start = pos+1;
+            }
+        }
+        // Whatever follows the last delimiter (or the whole input if there was none)
+        if( start < str.size() )
+            subStrings.push_back( str.substr( start, str.size()-start ) );
+        return subStrings;
+    }
+
+    // Stores the count and the singular label that operator<< prints.
+    pluralise::pluralise( std::size_t count, std::string const& label )
+    : m_count( count )
+    , m_label( label )
+    {}
+
+    // Writes "<count> <label>", followed by an 's' unless the count is exactly 1.
+    std::ostream& operator << ( std::ostream& os, pluralise const& pluraliser ) {
+        bool const needsPluralSuffix = ( pluraliser.m_count != 1 );
+
+        os << pluraliser.m_count << ' ' << pluraliser.m_label;
+        if( needsPluralSuffix )
+            os << 's';
+        return os;
+    }
+
+}
+// end catch_string_manip.cpp
+// start catch_stringref.cpp
+
+#include <algorithm>
+#include <ostream>
+#include <cstring>
+#include <cstdint>
+
+namespace Catch {
+    // Builds a StringRef over a C string by delegating to the (pointer, size)
+    // constructor; the size comes from std::strlen, so `rawChars` has to point
+    // to a null-terminated string.
+    StringRef::StringRef( char const* rawChars ) noexcept
+    :   StringRef( rawChars,
+                   static_cast<StringRef::size_type>( std::strlen( rawChars ) ) )
+    {}
+
+    // Returns the start pointer for use as a C string. Enforces isNullTerminated()
+    // first; use data() when a terminator is not required.
+    auto StringRef::c_str() const -> char const* {
+        bool const nullTerminated = isNullTerminated();
+        CATCH_ENFORCE(nullTerminated, "Called StringRef::c_str() on a non-null-terminated instance");
+        return m_start;
+    }
+    // Returns the start pointer without any null-termination check (contrast with c_str()).
+    auto StringRef::data() const noexcept -> char const* { return m_start; }
+
+    // Returns the part of this string that starts at `start` and is at most `size`
+    // characters long. The length is clamped to what is available, and a `start`
+    // at or past the end yields a default-constructed StringRef.
+    auto StringRef::substr( size_type start, size_type size ) const noexcept -> StringRef {
+        if( start >= m_size )
+            return StringRef();
+
+        size_type const available = m_size - start;
+        return StringRef( m_start + start, (std::min)( available, size ) );
+    }
+    // Two StringRefs are equal when they have the same size and the same bytes.
+    auto StringRef::operator == ( StringRef const& other ) const noexcept -> bool {
+        if( m_size != other.m_size )
+            return false;
+        return std::memcmp( m_start, other.m_start, m_size ) == 0;
+    }
+
+    // Streams exactly str.size() characters starting at str.data(); no terminator is needed.
+    auto operator << ( std::ostream& os, StringRef const& str ) -> std::ostream& {
+        os.write( str.data(), str.size() );
+        return os;
+    }
+
+    // Appends the characters of `rhs` to `lhs` and returns `lhs`.
+    auto operator+=( std::string& lhs, StringRef const& rhs ) -> std::string& {
+        return lhs.append( rhs.data(), rhs.size() );
+    }
+
+} // namespace Catch
+// end catch_stringref.cpp
+// start catch_tag_alias.cpp
+
+namespace Catch {
+    // Stores the tag text together with the source location passed alongside it.
+    TagAlias::TagAlias( std::string const & _tag, SourceLineInfo _lineInfo )
+    :   tag( _tag ),
+        lineInfo( _lineInfo )
+    {}
+}
+// end catch_tag_alias.cpp
+// start catch_tag_alias_autoregistrar.cpp
+
+namespace Catch {
+
+    // Registers `alias` -> `tag` with the mutable registry hub during construction.
+    RegistrarForTagAliases::RegistrarForTagAliases(char const* alias, char const* tag, SourceLineInfo const& lineInfo) {
+        CATCH_TRY {
+            auto& hub = getMutableRegistryHub();
+            hub.registerTagAlias( alias, tag, lineInfo );
+        }
+        CATCH_CATCH_ALL {
+            // Do not throw when constructing global objects, instead register the exception to be processed later
+            getMutableRegistryHub().registerStartupException();
+        }
+    }
+
+}
+// end catch_tag_alias_autoregistrar.cpp
+// start catch_tag_alias_registry.cpp
+
+#include <sstream>
+
+namespace Catch {
+
+    // Nothing to clean up by hand; defined out of line in this translation unit.
+    TagAliasRegistry::~TagAliasRegistry() = default;
+
+    // Looks up `alias` and returns a pointer to its TagAlias, or nullptr if it is not registered.
+    TagAlias const* TagAliasRegistry::find( std::string const& alias ) const {
+        auto const entry = m_registry.find( alias );
+        return ( entry == m_registry.end() ) ? nullptr : &( entry->second );
+    }
+
+    // Returns a copy of `unexpandedTestSpec` in which registered aliases are replaced
+    // by their tag text. Each alias is searched for once, so only its first
+    // occurrence is expanded; aliases are processed in m_registry's iteration order.
+    std::string TagAliasRegistry::expandAliases( std::string const& unexpandedTestSpec ) const {
+        std::string expandedTestSpec( unexpandedTestSpec );
+        for( auto const& registryKvp : m_registry ) {
+            std::string const& alias = registryKvp.first;
+
+            std::size_t const pos = expandedTestSpec.find( alias );
+            if( pos == std::string::npos )
+                continue;
+
+            expandedTestSpec.replace( pos, alias.size(), registryKvp.second.tag );
+        }
+        return expandedTestSpec;
+    }
+
+    // Registers `alias` for `tag`. Two conditions are enforced:
+    //   1. the alias starts with "[@" and ends with ']';
+    //   2. the alias has not been registered before.
+    void TagAliasRegistry::add( std::string const& alias, std::string const& tag, SourceLineInfo const& lineInfo ) {
+        bool const wellFormed = startsWith( alias, "[@" ) && endsWith( alias, ']' );
+        CATCH_ENFORCE( wellFormed,
+                      "error: tag alias, '" << alias << "' is not of the form [@alias name].\n" << lineInfo );
+
+        // insert() leaves an existing entry untouched, so find(alias) below still
+        // reports where the alias was first seen
+        bool const inserted = m_registry.insert( std::make_pair( alias, TagAlias( tag, lineInfo ) ) ).second;
+        CATCH_ENFORCE( inserted,
+                      "error: tag alias, '" << alias << "' already registered.\n"
+                      << "\tFirst seen at: " << find(alias)->lineInfo << "\n"
+                      << "\tRedefined at: " << lineInfo );
+    }
+
+    // Nothing to clean up by hand; defined out of line in this translation unit.
+    ITagAliasRegistry::~ITagAliasRegistry() = default;
+
+    // Returns the tag alias registry owned by the registry hub.
+    ITagAliasRegistry const& ITagAliasRegistry::get() {
+        auto const& hub = getRegistryHub();
+        return hub.getTagAliasRegistry();
+    }
+
+} // end namespace Catch
+// end catch_tag_alias_registry.cpp
+// start catch_test_case_info.cpp
+
+#include <cctype>
+#include <exception>
+#include <algorithm>
+#include <sstream>
+
+namespace Catch {
+
+    namespace {
+        // Maps a tag to the special property it stands for:
+        //   ".<anything>" or "!hide" -> IsHidden
+        //   "!throws"                -> Throws
+        //   "!shouldfail"            -> ShouldFail
+        //   "!mayfail"               -> MayFail
+        //   "!nonportable"           -> NonPortable
+        //   "!benchmark"             -> Benchmark | IsHidden
+        // Every other tag yields None.
+        TestCaseInfo::SpecialProperties parseSpecialTag( std::string const& tag ) {
+            if( startsWith( tag, '.' ) || tag == "!hide" )
+                return TestCaseInfo::IsHidden;
+            if( tag == "!throws" )
+                return TestCaseInfo::Throws;
+            if( tag == "!shouldfail" )
+                return TestCaseInfo::ShouldFail;
+            if( tag == "!mayfail" )
+                return TestCaseInfo::MayFail;
+            if( tag == "!nonportable" )
+                return TestCaseInfo::NonPortable;
+            if( tag == "!benchmark" )
+                return static_cast<TestCaseInfo::SpecialProperties>( TestCaseInfo::Benchmark | TestCaseInfo::IsHidden );
+
+            return TestCaseInfo::None;
+        }
+        // A tag is reserved when it is not one of the special tags, is not empty,
+        // and its first character is not alphanumeric.
+        bool isReservedTag( std::string const& tag ) {
+            if( parseSpecialTag( tag ) != TestCaseInfo::None )
+                return false;
+            if( tag.empty() )
+                return false;
+            return !std::isalnum( static_cast<unsigned char>( tag[0] ) );
+        }
+        // Enforces that `tag` is not a reserved tag name; the failure message
+        // includes the tag and the source location `_lineInfo`.
+        void enforceNotReservedTag( std::string const& tag, SourceLineInfo const& _lineInfo ) {
+            bool const allowed = !isReservedTag( tag );
+            CATCH_ENFORCE( allowed,
+                          "Tag name: [" << tag << "] is not allowed.\n"
+                          << "Tag names starting with non alphanumeric characters are reserved\n"
+                          << _lineInfo );
+        }
+    }
+
+    // Builds a TestCase from an invoker, a class name, a name/tags pair and a location.
+    // nameAndTags.tags is scanned character by character: text between '[' and ']'
+    // becomes a tag, everything outside brackets is collected as the description.
+    // If any tag marks the test as hidden, both "." and "!hide" are added to the tags.
+    TestCase makeTestCase(  ITestInvoker* _testCase,
+                            std::string const& _className,
+                            NameAndTags const& nameAndTags,
+                            SourceLineInfo const& _lineInfo )
+    {
+        bool isHidden = false;
+        bool inTag = false;
+        std::vector<std::string> tags;
+        std::string desc, tag;
+
+        for( char c : nameAndTags.tags ) {
+            if( !inTag ) {
+                // Outside brackets: '[' opens a tag, anything else is description text
+                if( c == '[' )
+                    inTag = true;
+                else
+                    desc += c;
+                continue;
+            }
+
+            if( c != ']' ) {
+                tag += c;
+                continue;
+            }
+
+            // ']' closes the tag collected so far
+            TestCaseInfo::SpecialProperties const prop = parseSpecialTag( tag );
+            if( ( prop & TestCaseInfo::IsHidden ) != 0 )
+                isHidden = true;
+            else if( prop == TestCaseInfo::None )
+                enforceNotReservedTag( tag, _lineInfo );
+
+            // Merged hide tags like `[.approvals]` should be added as
+            // `[.][approvals]`. The `[.]` is added at later point, so
+            // we only strip the prefix
+            if( tag.size() > 1 && startsWith( tag, '.' ) )
+                tag.erase( 0, 1 );
+
+            tags.push_back( tag );
+            tag.clear();
+            inTag = false;
+        }
+
+        if( isHidden ) {
+            // Add all "hidden" tags to make them behave identically
+            tags.push_back( "." );
+            tags.push_back( "!hide" );
+        }
+
+        TestCaseInfo info( static_cast<std::string>(nameAndTags.name), _className, desc, tags, _lineInfo );
+        return TestCase( _testCase, std::move(info) );
+    }
+
+    // Replaces the tags of `testCaseInfo` with the sorted, de-duplicated `tags`.
+    // lcaseTags is rebuilt to hold the lower-cased tags in the same order, and
+    // every special property implied by a lower-cased tag is OR-ed into `properties`
+    // (properties that were already set are kept).
+    void setTags( TestCaseInfo& testCaseInfo, std::vector<std::string> tags ) {
+        std::sort( tags.begin(), tags.end() );
+        auto const firstDuplicate = std::unique( tags.begin(), tags.end() );
+        tags.erase( firstDuplicate, tags.end() );
+
+        testCaseInfo.lcaseTags.clear();
+        for( std::string const& tag : tags ) {
+            std::string lcaseTag = toLower( tag );
+            auto const combined = testCaseInfo.properties | parseSpecialTag( lcaseTag );
+            testCaseInfo.properties = static_cast<TestCaseInfo::SpecialProperties>( combined );
+            testCaseInfo.lcaseTags.push_back( lcaseTag );
+        }
+
+        testCaseInfo.tags = std::move( tags );
+    }
+
+    // Copies the descriptive fields, starts with no special properties and then
+    // lets setTags() install the tags (which also derives the properties from them).
+    TestCaseInfo::TestCaseInfo( std::string const& _name,
+                                std::string const& _className,
+                                std::string const& _description,
+                                std::vector<std::string> const& _tags,
+                                SourceLineInfo const& _lineInfo )
+    : name( _name )
+    , className( _className )
+    , description( _description )
+    , lineInfo( _lineInfo )
+    , properties( None )
+    {
+        setTags( *this, _tags );
+    }
+
+    // True when the IsHidden bit is set in `properties`.
+    bool TestCaseInfo::isHidden() const {
+        auto const hiddenBits = properties & IsHidden;
+        return hiddenBits != 0;
+    }
+    // True when the Throws bit is set in `properties`.
+    bool TestCaseInfo::throws() const {
+        auto const throwsBits = properties & Throws;
+        return throwsBits != 0;
+    }
+    // True when either the ShouldFail or the MayFail bit is set in `properties`.
+    bool TestCaseInfo::okToFail() const {
+        auto const failureAllowedMask = ShouldFail | MayFail;
+        return ( properties & failureAllowedMask ) != 0;
+    }
+    // True when the ShouldFail bit is set in `properties` (MayFail alone does not count).
+    bool TestCaseInfo::expectedToFail() const {
+        auto const shouldFailBits = properties & ShouldFail;
+        return shouldFailBits != 0;
+    }
+
+    // Returns all tags concatenated as "[tag1][tag2]...", in the order stored in `tags`.
+    std::string TestCaseInfo::tagsAsString() const {
+        // Work out the final size first: each tag contributes its text plus '[' and ']'
+        std::size_t full_size = 0;
+        for( auto const& tag : tags )
+            full_size += tag.size() + 2;
+
+        std::string ret;
+        ret.reserve( full_size );
+        for( auto const& tag : tags ) {
+            ret += '[';
+            ret += tag;
+            ret += ']';
+        }
+        return ret;
+    }
+
+    // Moves `info` into the TestCaseInfo base and stores the invoker in `test`.
+    TestCase::TestCase( ITestInvoker* testCase, TestCaseInfo&& info )
+    :   TestCaseInfo( std::move(info) ),
+        test( testCase )
+    {}
+
+    // Returns a copy of this test case whose name is `_newName`; *this is not modified.
+    TestCase TestCase::withName( std::string const& _newName ) const {
+        TestCase renamed( *this );
+        renamed.name = _newName;
+        return renamed;
+    }
+
+    // Runs the test by forwarding to the stored invoker.
+    void TestCase::invoke() const { test->invoke(); }
+
+    // Test cases are equal when they share the same invoker object and have
+    // identical name and class name.
+    bool TestCase::operator == ( TestCase const& other ) const {
+        if( test.get() != other.test.get() )
+            return false;
+        return name == other.name && className == other.className;
+    }
+
+    // Orders test cases by name only; class name and invoker play no part.
+    bool TestCase::operator < ( TestCase const& other ) const {
+        return other.name > name;
+    }
+
+    // Returns this object viewed as its TestCaseInfo base.
+    TestCaseInfo const& TestCase::getTestCaseInfo() const
+    {
+        TestCaseInfo const& asInfo = *this;
+        return asInfo;
+    }
+
+} // end namespace Catch
+// end catch_test_case_info.cpp
+// start catch_test_case_registry_impl.cpp
+
+#include <algorithm>
+#include <sstream>
+
+namespace Catch {
+
+    namespace {
+        // Hashes test cases by name, starting from a basis drawn from the given generator.
+        struct TestHasher {
+            // Builds the 64-bit basis from two successive draws: the first one is
+            // shifted into the upper 32 bits, the second one is OR-ed in below it.
+            explicit TestHasher(Catch::SimplePcg32& rng_instance) {
+                uint64_t const first = rng_instance();
+                uint64_t const second = rng_instance();
+                basis = ( first << 32 ) | second;
+            }
+
+            uint64_t basis;
+
+            // Hashes t.name only.
+            uint64_t operator()(TestCase const& t) const {
+                // Modified FNV-1a hash
+                static constexpr uint64_t prime = 1099511628211;
+
+                uint64_t hash = basis;
+                for( const char c : t.name ) {
+                    // c is a plain char and is converted as-is, so where char is signed a
+                    // negative value is sign-extended before the XOR
+                    hash ^= c;
+                    hash *= prime;
+                }
+                return hash;
+            }
+        };
+    } // end unnamed namespace
+
+    // Returns the test cases in the order requested by config.runOrder():
+    //   - InLexicographicalOrder: sorted with TestCase::operator<
+    //   - InRandomOrder: sorted by a TestHasher value of each test, where the hasher
+    //     is seeded from rng() after seedRng( config ); equal hashes fall back to
+    //     comparing names
+    //   - anything else (InDeclarationOrder): the input order, unchanged
+    std::vector<TestCase> sortTests( IConfig const& config, std::vector<TestCase> const& unsortedTestCases ) {
+        auto const order = config.runOrder();
+
+        if( order == RunTests::InLexicographicalOrder ) {
+            std::vector<TestCase> sorted( unsortedTestCases );
+            std::sort( sorted.begin(), sorted.end() );
+            return sorted;
+        }
+
+        if( order == RunTests::InRandomOrder ) {
+            seedRng( config );
+            TestHasher hasher( rng() );
+
+            // Sort (hash, pointer) pairs so the test cases are copied only once, at the end
+            using hashedTest = std::pair<uint64_t, TestCase const*>;
+            std::vector<hashedTest> indexed_tests;
+            indexed_tests.reserve( unsortedTestCases.size() );
+            for( auto const& testCase : unsortedTestCases )
+                indexed_tests.emplace_back( hasher( testCase ), &testCase );
+
+            std::sort( indexed_tests.begin(), indexed_tests.end(),
+                       []( hashedTest const& lhs, hashedTest const& rhs ) {
+                           if( lhs.first != rhs.first )
+                               return lhs.first < rhs.first;
+                           return lhs.second->name < rhs.second->name;
+                       } );
+
+            std::vector<TestCase> sorted;
+            sorted.reserve( indexed_tests.size() );
+            for( auto const& hashed : indexed_tests )
+                sorted.emplace_back( *hashed.second );
+            return sorted;
+        }
+
+        // already in declaration order
+        return unsortedTestCases;
+    }
+
+    // A test case is safe to run unless it is marked as throwing while the
+    // configuration does not allow throws.
+    bool isThrowSafe( TestCase const& testCase, IConfig const& config ) {
+        if( !testCase.throws() )
+            return true;
+        return config.allowThrows();
+    }
+
+    // True when `testSpec` matches the test case and the test case is throw-safe under `config`.
+    bool matchTest( TestCase const& testCase, TestSpec const& testSpec, IConfig const& config ) {
+        if( !testSpec.matches( testCase ) )
+            return false;
+        return isThrowSafe( testCase, config );
+    }
+
+    // Enforces that no two entries of `functions` are duplicates. Duplicates are
+    // detected by inserting into a std::set<TestCase>, i.e. via TestCase::operator<.
+    // The failure message names both the first and the repeated definition.
+    void enforceNoDuplicateTestCases( std::vector<TestCase> const& functions ) {
+        std::set<TestCase> seenFunctions;
+        for( TestCase const& function : functions ) {
+            auto const prev = seenFunctions.insert( function );
+            bool const firstDefinition = prev.second;
+            CATCH_ENFORCE( firstDefinition,
+                    "error: TEST_CASE( \"" << function.name << "\" ) already defined.\n"
+                    << "\tFirst seen at " << prev.first->getTestCaseInfo().lineInfo << "\n"
+                    << "\tRedefined at " << function.getTestCaseInfo().lineInfo );
+        }
+    }
+
+    // Selects the test cases to run. With filters present, a test is kept when
+    // matchTest() accepts it; without filters, every non-hidden test is kept.
+    std::vector<TestCase> filterTests( std::vector<TestCase> const& testCases, TestSpec const& testSpec, IConfig const& config ) {
+        std::vector<TestCase> filtered;
+        filtered.reserve( testCases.size() );
+
+        for( auto const& testCase : testCases ) {
+            bool const keep = testSpec.hasFilters()
+                ? matchTest( testCase, testSpec, config )
+                : !testCase.isHidden();
+            if( keep )
+                filtered.push_back( testCase );
+        }
+        return filtered;
+    }
+    // Convenience wrapper: asks the registry hub's test case registry for its sorted tests.
+    std::vector<TestCase> const& getAllTestCasesSorted( IConfig const& config ) {
+        auto const& registry = getRegistryHub().getTestCaseRegistry();
+        return registry.getAllTestsSorted( config );
+    }
+
+    // Adds `testCase` to the registry. A test case with an empty name is registered
+    // under a generated name, "Anonymous test case <n>", where <n> is a running counter.
+    void TestRegistry::registerTest( TestCase const& testCase ) {
+        if( !testCase.getTestCaseInfo().name.empty() ) {
+            m_functions.push_back( testCase );
+            return;
+        }
+
+        ReusableStringStream rss;
+        rss << "Anonymous test case " << ++m_unnamedCount;
+        registerTest( testCase.withName( rss.str() ) );
+    }
+
+    // Returns the registered test cases in registration order.
+    std::vector<TestCase> const& TestRegistry::getAllTests() const { return m_functions; }
+    // Returns the test cases sorted according to config.runOrder(). The sorted list
+    // is cached and only rebuilt when it is empty or the requested order changed.
+    // The duplicate check runs whenever the cache is empty on entry.
+    std::vector<TestCase> const& TestRegistry::getAllTestsSorted( IConfig const& config ) const {
+        bool const nothingCached = m_sortedFunctions.empty();
+        if( nothingCached )
+            enforceNoDuplicateTestCases( m_functions );
+
+        bool const orderChanged = ( m_currentSortOrder != config.runOrder() );
+        if( orderChanged || nothingCached ) {
+            m_sortedFunctions = sortTests( config, m_functions );
+            m_currentSortOrder = config.runOrder();
+        }
+        return m_sortedFunctions;
+    }
+
+    ///////////////////////////////////////////////////////////////////////////
+    // Wraps a plain function pointer so it can be used as a test invoker.
+    TestInvokerAsFunction::TestInvokerAsFunction( void(*testAsFunction)() ) noexcept
+    :   m_testAsFunction( testAsFunction )
+    {}
+
+    // Calls the wrapped function.
+    void TestInvokerAsFunction::invoke() const {
+        (*m_testAsFunction)();
+    }
+
+    // Derives a class name from `classOrQualifiedMethodName`.
+    // Input that does not start with '&' is returned unchanged. Otherwise the result
+    // is the text from the second-to-last "::" (or from index 1, right after the '&',
+    // when there is no second-to-last "::") up to, but excluding, the last "::".
+    std::string extractClassName( StringRef const& classOrQualifiedMethodName ) {
+        std::string className( classOrQualifiedMethodName );
+        if( !startsWith( className, '&' ) )
+            return className;
+
+        std::size_t const lastColons = className.rfind( "::" );
+        std::size_t penultimateColons = className.rfind( "::", lastColons-1 );
+        if( penultimateColons == std::string::npos )
+            penultimateColons = 1;
+
+        return className.substr( penultimateColons, lastColons-penultimateColons );
+    }
+
+} // end namespace Catch
+// end catch_test_case_registry_impl.cpp
+// start catch_test_case_tracker.cpp
+
+#include <algorithm>
+#include <cassert>
+#include <stdexcept>
+#include <memory>
+#include <sstream>
+
+#if defined(__clang__)
+#    pragma clang diagnostic push
+#    pragma clang diagnostic ignored "-Wexit-time-destructors"
+#endif
+
+namespace Catch {
+namespace TestCaseTracking {
+
+    // Pairs a name with the source location it was declared at.
+    NameAndLocation::NameAndLocation( std::string const& _name, SourceLineInfo const& _location )
+    : name( _name )
+    , location( _location )
+    {}
+
+    ITracker::~ITracker() = default; // defaulted, but defined out of line in this translation unit
+
+    // Starts a run: creates a fresh root SectionTracker named "{root}" (with no parent),
+    // clears the current tracker, switches the context to Executing and returns the root.
+    ITracker& TrackerContext::startRun() {
+        m_rootTracker = std::make_shared<SectionTracker>(
+            NameAndLocation( "{root}", CATCH_INTERNAL_LINEINFO ),
+            *this,
+            nullptr );
+        m_currentTracker = nullptr;
+        m_runState = Executing;
+
+        return *m_rootTracker;
+    }
+
+    // Ends a run: releases the root tracker, clears the current tracker and
+    // puts the context back into the NotStarted state.
+    void TrackerContext::endRun() {
+        m_rootTracker.reset();
+
+        m_currentTracker = nullptr;
+        m_runState = NotStarted;
+    }
+
+    // Begins a cycle: the root tracker becomes the current tracker and the state is Executing.
+    void TrackerContext::startCycle() {
+        ITracker* const root = m_rootTracker.get();
+        m_currentTracker = root;
+        m_runState = Executing;
+    }
+    // Marks the current cycle as finished.
+    void TrackerContext::completeCycle() { m_runState = CompletedCycle; }
+
+    // True once completeCycle() has set the state to CompletedCycle.
+    bool TrackerContext::completedCycle() const {
+        bool const finished = ( m_runState == CompletedCycle );
+        return finished;
+    }
+    // Returns the current tracker. The pointer is dereferenced without a check,
+    // so a current tracker must have been set (startRun() and endRun() reset it to nullptr).
+    ITracker& TrackerContext::currentTracker() { return *m_currentTracker; }
+    // Makes `tracker` the current tracker.
+    void TrackerContext::setCurrentTracker( ITracker* tracker ) { m_currentTracker = tracker; }
+
+    // Stores the owning context and the parent tracker; `parent` is nullptr for the root.
+    TrackerBase::TrackerBase( NameAndLocation const& nameAndLocation, TrackerContext& ctx, ITracker* parent )
+    :   ITracker( nameAndLocation ),
+        m_ctx( ctx ),
+        m_parent( parent )
+    {}
+
+    // A tracker is complete once it has either finished successfully or failed.
+    bool TrackerBase::isComplete() const {
+        if( m_runState == CompletedSuccessfully )
+            return true;
+        return m_runState == Failed;
+    }
+    // True only for the CompletedSuccessfully state (Failed does not count).
+    bool TrackerBase::isSuccessfullyCompleted() const {
+        bool const succeeded = ( m_runState == CompletedSuccessfully );
+        return succeeded;
+    }
+    // A tracker is open when it has been started and is not complete yet.
+    bool TrackerBase::isOpen() const {
+        if( m_runState == NotStarted )
+            return false;
+        return !isComplete();
+    }
+    // True when at least one child tracker has been added.
+    bool TrackerBase::hasChildren() const {
+        bool const childless = m_children.empty();
+        return !childless;
+    }
+
+    // Appends `child` to this tracker's list of children.
+    void TrackerBase::addChild( ITrackerPtr const& child ) { m_children.push_back( child ); }
+
+    // Returns the first child whose location and name both equal those in
+    // `nameAndLocation`, or nullptr when there is no such child.
+    ITrackerPtr TrackerBase::findChild( NameAndLocation const& nameAndLocation ) {
+        for( ITrackerPtr const& tracker : m_children ) {
+            auto const& candidate = tracker->nameAndLocation();
+            if( candidate.location == nameAndLocation.location &&
+                candidate.name == nameAndLocation.name )
+                return tracker;
+        }
+        return nullptr;
+    }
+    // Returns the parent tracker; must not be called on the root, which has none.
+    ITracker& TrackerBase::parent() {
+        // Should always be non-null except for root
+        assert( m_parent );
+
+        return *m_parent;
+    }
+
+    // Switches this tracker to ExecutingChildren and passes the notification on to
+    // the parent. A tracker already in that state does nothing, which also stops
+    // the propagation there.
+    void TrackerBase::openChild() {
+        if( m_runState == ExecutingChildren )
+            return;
+
+        m_runState = ExecutingChildren;
+        if( m_parent )
+            m_parent->openChild();
+    }
+
+    // Base-class answers to the tracker-kind queries: a plain TrackerBase is neither kind.
+    bool TrackerBase::isSectionTracker() const {
+        return false;
+    }
+    bool TrackerBase::isGeneratorTracker() const {
+        return false;
+    }
+
+    // Opens this tracker: it starts executing, becomes the context's current
+    // tracker, and its parent (if any) is told that a child has been opened.
+    void TrackerBase::open() {
+        m_runState = Executing;
+        moveToThis();
+
+        if( !m_parent )
+            return;
+        m_parent->openChild();
+    }
+
+    // Closes this tracker. Trackers below it that are still current are closed first.
+    // The resulting state depends on the state on entry:
+    //   - NeedsAnotherRun: unchanged
+    //   - Executing: becomes CompletedSuccessfully
+    //   - ExecutingChildren: becomes CompletedSuccessfully only if every child is complete
+    //   - NotStarted / CompletedSuccessfully / Failed, or an unknown value: internal error
+    // Afterwards the parent becomes the current tracker and the cycle is marked complete.
+    void TrackerBase::close() {
+
+        // Close any still open children (e.g. generators)
+        while( &m_ctx.currentTracker() != this )
+            m_ctx.currentTracker().close();
+
+        switch( m_runState ) {
+            case NeedsAnotherRun:
+                break;
+
+            case Executing:
+                m_runState = CompletedSuccessfully;
+                break;
+
+            case ExecutingChildren: {
+                bool const allChildrenComplete = std::all_of(
+                    m_children.begin(), m_children.end(),
+                    []( ITrackerPtr const& child ){ return child->isComplete(); } );
+                if( allChildrenComplete )
+                    m_runState = CompletedSuccessfully;
+                break;
+            }
+
+            case NotStarted:
+            case CompletedSuccessfully:
+            case Failed:
+                CATCH_INTERNAL_ERROR( "Illogical state: " << m_runState );
+
+            default:
+                CATCH_INTERNAL_ERROR( "Unknown state: " << m_runState );
+        }
+
+        moveToParent();
+        m_ctx.completeCycle();
+    }
+    // Marks this tracker as Failed, flags the parent (if any) as needing another
+    // run, hands control back to the parent and completes the cycle.
+    void TrackerBase::fail() {
+        m_runState = Failed;
+
+        if( m_parent )
+            m_parent->markAsNeedingAnotherRun();
+
+        moveToParent();
+        m_ctx.completeCycle();
+    }
+    // Puts this tracker into the NeedsAnotherRun state.
+    void TrackerBase::markAsNeedingAnotherRun() { m_runState = NeedsAnotherRun; }
+
+    // Makes the parent the context's current tracker; asserts that a parent exists.
+    void TrackerBase::moveToParent() {
+        assert( m_parent );
+
+        ITracker* const up = m_parent;
+        m_ctx.setCurrentTracker( up );
+    }
+    // Makes this tracker the context's current tracker.
+    void TrackerBase::moveToThis() { m_ctx.setCurrentTracker( this ); }
+
+    // Creates a section tracker and caches the trimmed section name.
+    // When there is a parent, the nearest enclosing section tracker is located by
+    // walking up the parent chain, and its filters are handed to addNextFilters().
+    SectionTracker::SectionTracker( NameAndLocation const& nameAndLocation, TrackerContext& ctx, ITracker* parent )
+    :   TrackerBase( nameAndLocation, ctx, parent ),
+        m_trimmed_name(trim(nameAndLocation.name))
+    {
+        if( !parent )
+            return;
+
+        ITracker* ancestor = parent;
+        while( !ancestor->isSectionTracker() )
+            ancestor = &ancestor->parent();
+
+        auto& parentSection = static_cast<SectionTracker&>( *ancestor );
+        addNextFilters( parentSection.m_filters );
+    }
+
+    // A section that the filters do not select always reports itself as complete.
+    // A section is selected when there are no filters, when the first filter is
+    // the empty string, or when its trimmed name appears among the filters; in
+    // those cases the answer comes from TrackerBase::isComplete().
+    bool SectionTracker::isComplete() const {
+        bool const selected =
+               m_filters.empty()
+            || m_filters[0] == ""
+            || std::find( m_filters.begin(), m_filters.end(), m_trimmed_name ) != m_filters.end();
+
+        if( !selected )
+            return true;
+        return TrackerBase::isComplete();
+    }
+
+    // Identifies this tracker as a section tracker.
+    bool SectionTracker::isSectionTracker() const {
+        return true;
+    }
+
+    // Returns the section tracker for `nameAndLocation` below the context's current
+    // tracker, creating and attaching a new one if no matching child exists yet.
+    // Unless the context has already completed its cycle, the section is given
+    // the chance to open.
+    SectionTracker& SectionTracker::acquire( TrackerContext& ctx, NameAndLocation const& nameAndLocation ) {
+        ITracker& currentTracker = ctx.currentTracker();
+
+        std::shared_ptr<SectionTracker> section;
+        ITrackerPtr const childTracker = currentTracker.findChild( nameAndLocation );
+        if( childTracker ) {
+            assert( childTracker->isSectionTracker() );
+            section = std::static_pointer_cast<SectionTracker>( childTracker );
+        }
+        else {
+            section = std::make_shared<SectionTracker>( nameAndLocation, ctx, &currentTracker );
+            currentTracker.addChild( section );
+        }
+
+        if( !ctx.completedCycle() )
+            section->tryOpen();
+        return *section;
+    }
+
+    // Opens the section unless it already counts as complete.
+    void SectionTracker::tryOpen() {
+        if( isComplete() )
+            return;
+        open();
+    }
+
+    // Installs section filters: two empty placeholder entries are added first,
+    // followed by `filters`. An empty `filters` list changes nothing.
+    void SectionTracker::addInitialFilters( std::vector<std::string> const& filters ) {
+        if( filters.empty() )
+            return;
+
+        m_filters.reserve( m_filters.size() + filters.size() + 2 );
+        m_filters.emplace_back(""); // Root - should never be consulted
+        m_filters.emplace_back(""); // Test Case - not a section filter
+        m_filters.insert( m_filters.end(), filters.begin(), filters.end() );
+    }
+    // Appends all of `filters` except its first entry; lists with fewer than two
+    // entries contribute nothing.
+    void SectionTracker::addNextFilters( std::vector<std::string> const& filters ) {
+        if( filters.size() <= 1 )
+            return;
+        m_filters.insert( m_filters.end(), filters.begin()+1, filters.end() );
+    }
+
+    // Read-only access to the filters held by this section.
+    std::vector<std::string> const& SectionTracker::getFilters() const { return m_filters; }
+
+    // The section name with surrounding whitespace removed, as computed in the constructor.
+    std::string const& SectionTracker::trimmedName() const { return m_trimmed_name; }
+
+} // namespace TestCaseTracking
+
+using TestCaseTracking::ITracker;
+using TestCaseTracking::TrackerContext;
+using TestCaseTracking::SectionTracker;
+
+} // namespace Catch
+
+#if defined(__clang__)
+#    pragma clang diagnostic pop
+#endif
+// end catch_test_case_tracker.cpp
+// start catch_test_registry.cpp
+
+namespace Catch {
+
+    // Wraps a free function in a TestInvokerAsFunction. Allocation uses the
+    // non-throwing form of new, so the result is nullptr if allocation fails.
+    auto makeTestInvoker( void(*testAsFunction)() ) noexcept -> ITestInvoker* {
+        ITestInvoker* const invoker = new(std::nothrow) TestInvokerAsFunction( testAsFunction );
+        return invoker;
+    }
+
+    // Stores the test name and the raw tag string.
+    NameAndTags::NameAndTags( StringRef const& name_ , StringRef const& tags_ ) noexcept
+    :   name( name_ ),
+        tags( tags_ )
+    {}
+
+    // Builds a TestCase from the arguments and registers it with the mutable
+    // registry hub while the AutoReg object is constructed.
+    AutoReg::AutoReg( ITestInvoker* invoker, SourceLineInfo const& lineInfo, StringRef const& classOrMethod, NameAndTags const& nameAndTags ) noexcept {
+        CATCH_TRY {
+            auto& hub = getMutableRegistryHub();
+            hub.registerTest(
+                makeTestCase( invoker, extractClassName( classOrMethod ), nameAndTags, lineInfo ) );
+        }
+        CATCH_CATCH_ALL {
+            // Do not throw when constructing global objects, instead register the exception to be processed later
+            getMutableRegistryHub().registerStartupException();
+        }
+    }
+
+    AutoReg::~AutoReg() = default; // defaulted, but defined out of line in this translation unit
+}
+// end catch_test_registry.cpp
+// start catch_test_spec.cpp
+
+#include <algorithm>
+#include <string>
+#include <vector>
+#include <memory>
+
+namespace Catch {
+
+    // Stores the text that name() reports for this pattern.
+    TestSpec::Pattern::Pattern( std::string const& name )
+    :   m_name( name )
+    {}
+
+    TestSpec::Pattern::~Pattern() = default; // defaulted, but defined out of line in this translation unit
+
+    // Returns the text this pattern was constructed with.
+    std::string const& TestSpec::Pattern::name() const { return m_name; }
+
+    // `filterString` becomes the pattern's reported name; `name` is lower-cased and
+    // used to build a case-insensitive wildcard pattern.
+    TestSpec::NamePattern::NamePattern( std::string const& name, std::string const& filterString )
+    :   Pattern( filterString ),
+        m_wildcardPattern( toLower( name ), CaseSensitive::No )
+    {}
+
+    // Matches when the wildcard pattern accepts the test case's name.
+    bool TestSpec::NamePattern::matches( TestCaseInfo const& testCase ) const {
+        std::string const& candidate = testCase.name;
+        return m_wildcardPattern.matches( candidate );
+    }
+
+    // `filterString` becomes the pattern's reported name; `tag` is stored lower-cased
+    // so it can be compared against TestCaseInfo::lcaseTags.
+    TestSpec::TagPattern::TagPattern( std::string const& tag, std::string const& filterString )
+    :   Pattern( filterString ),
+        m_tag( toLower( tag ) )
+    {}
+
+    bool TestSpec::TagPattern::matches( TestCaseInfo const& testCase ) const {
+        return std::find(begin(testCase.lcaseTags),
+                         end(testCase.lcaseTags),
+                         m_tag) != end(testCase.lcaseTags);
+    }
+
+    TestSpec::ExcludedPattern::ExcludedPattern( PatternPtr const& underlyingPattern )
+    : Pattern( underlyingPattern->name() )
+    , m_underlyingPattern( underlyingPattern )
+    {}
+
+    bool TestSpec::ExcludedPattern::matches( TestCaseInfo const& testCase ) const {
+        return !m_underlyingPattern->matches( testCase );
+    }
+
+    bool TestSpec::Filter::matches( TestCaseInfo const& testCase ) const {
+        return std::all_of( m_patterns.begin(), m_patterns.end(), [&]( PatternPtr const& p ){ return p->matches( testCase ); } );
+    }
+
+    std::string TestSpec::Filter::name() const {
+        std::string name;
+        for( auto const& p : m_patterns )
+            name += p->name();
+        return name;
+    }
+
+    bool TestSpec::hasFilters() const {
+        return !m_filters.empty();
+    }
+
+    bool TestSpec::matches( TestCaseInfo const& testCase ) const {
+        return std::any_of( m_filters.begin(), m_filters.end(), [&]( Filter const& f ){ return f.matches( testCase ); } );
+    }
+
+    TestSpec::Matches TestSpec::matchesByFilter( std::vector<TestCase> const& testCases, IConfig const& config ) const
+    {
+        Matches matches( m_filters.size() );
+        std::transform( m_filters.begin(), m_filters.end(), matches.begin(), [&]( Filter const& filter ){
+            std::vector<TestCase const*> currentMatches;
+            for( auto const& test : testCases )
+                if( isThrowSafe( test, config ) && filter.matches( test ) )
+                    currentMatches.emplace_back( &test );
+            return FilterMatch{ filter.name(), currentMatches };
+        } );
+        return matches;
+    }
+
+    const TestSpec::vectorStrings& TestSpec::getInvalidArgs() const{
+        return  (m_invalidArgs);
+    }
+
+}
+// end catch_test_spec.cpp
+// start catch_test_spec_parser.cpp
+
+namespace Catch {
+
+    // Keeps a pointer to the tag alias registry used to expand aliases in parse().
+    TestSpecParser::TestSpecParser( ITagAliasRegistry const& tagAliases ) : m_tagAliases( &tagAliases ) {}
+
+    // Parses one test-spec argument, adding the resulting patterns to the
+    // current filter. The argument is alias-expanded first and then fed to
+    // visitChar() one character at a time. If visitChar() reports failure,
+    // the original (unexpanded) argument is recorded in m_invalidArgs and
+    // scanning stops. Returns *this so that calls can be chained.
+    TestSpecParser& TestSpecParser::parse( std::string const& arg ) {
+        m_mode = None;
+        m_exclusion = false;
+        m_arg = m_tagAliases->expandAliases( arg );
+        m_escapeChars.clear();
+        m_substring.reserve(m_arg.size());
+        m_patternName.reserve(m_arg.size());
+        m_realPatternPos = 0;
+
+        for( m_pos = 0; m_pos < m_arg.size(); ++m_pos )
+          // If visitChar fails, remember the offending argument and stop
+           if( !visitChar( m_arg[m_pos] ) ){
+               m_testSpec.m_invalidArgs.push_back(arg);
+               break;
+           }
+        // Flush whatever pattern was still being built when the input ended
+        endMode();
+        return *this;
+    }
+    // Closes the filter under construction (if it has any patterns) and
+    // returns a copy of the accumulated spec.
+    TestSpec TestSpecParser::testSpec() {
+        addFilter();
+        return m_testSpec;
+    }
+    // Processes a single input character according to the current mode.
+    // The only path that can return false is separate(), reached when a
+    // comma is seen outside of EscapedName mode.
+    bool TestSpecParser::visitChar( char c ) {
+        if( (m_mode != EscapedName) && (c == '\\') ) {
+            // Backslash: switch to EscapedName mode and keep the backslash in
+            // the pattern for now; preprocessPattern() removes it later.
+            escape();
+            addCharToPattern(c);
+            return true;
+        }else if((m_mode != EscapedName) && (c == ',') )  {
+            return separate();
+        }
+
+        switch( m_mode ) {
+        case None:
+            if( processNoneChar( c ) )
+                return true;
+            break;
+        case Name:
+            processNameChar( c );
+            break;
+        case EscapedName:
+            // The character after a backslash is taken literally; endMode()
+            // restores the mode that was active before the backslash.
+            endMode();
+            addCharToPattern(c);
+            return true;
+        default:
+        case Tag:
+        case QuotedName:
+            if( processOtherChar( c ) )
+                return true;
+            break;
+        }
+
+        // m_substring receives every character that gets here; m_patternName
+        // only receives those that are not control characters in the
+        // (possibly just changed) current mode.
+        m_substring += c;
+        if( !isControlChar( c ) ) {
+            m_patternName += c;
+            m_realPatternPos++;
+        }
+        return true;
+    }
+    // Two of the processing methods return true to signal the caller to return
+    // without adding the given character to the current pattern strings
+    //
+    // Handles a character seen in None mode: a space is skipped, '~' marks
+    // the upcoming pattern as an exclusion, '[' starts a tag, '"' starts a
+    // quoted name, and anything else starts a plain name.
+    bool TestSpecParser::processNoneChar( char c ) {
+        switch( c ) {
+        case ' ':
+            return true;
+        case '~':
+            m_exclusion = true;
+            return false;
+        case '[':
+            startNewMode( Tag );
+            return false;
+        case '"':
+            startNewMode( QuotedName );
+            return false;
+        default:
+            startNewMode( Name );
+            return false;
+        }
+    }
+    // Handles a character seen in Name mode. Only '[' is special: it switches
+    // to Tag mode. If the text collected so far is exactly "exclude:", the
+    // tag is marked as excluded; otherwise the current name is finished first.
+    void TestSpecParser::processNameChar( char c ) {
+        if( c == '[' ) {
+            if( m_substring == "exclude:" )
+                m_exclusion = true;
+            else
+                endMode();
+            startNewMode( Tag );
+        }
+    }
+    // Handles a character seen in Tag or QuotedName mode (and any mode that
+    // falls into the switch default in visitChar). A control character is
+    // appended to m_substring, ends the current mode, and yields true; any
+    // other character yields false so the caller appends it.
+    bool TestSpecParser::processOtherChar( char c ) {
+        if( !isControlChar( c ) )
+            return false;
+        m_substring += c;
+        endMode();
+        return true;
+    }
+    // Sets the current parsing mode.
+    void TestSpecParser::startNewMode( Mode mode ) {
+        m_mode = mode;
+    }
+    // Finishes the current mode: Name/QuotedName emit a name pattern, Tag
+    // emits a tag pattern, EscapedName reverts to the mode saved by escape(),
+    // and everything else resets the mode to None.
+    void TestSpecParser::endMode() {
+        switch( m_mode ) {
+        case Name:
+        case QuotedName:
+            return addNamePattern();
+        case Tag:
+            return addTagPattern();
+        case EscapedName:
+            revertBackToLastMode();
+            return;
+        case None:
+        default:
+            return startNewMode( None );
+        }
+    }
+    // Enters EscapedName mode, saving the previous mode and recording the
+    // position (within the pattern name) at which the escape character sits.
+    void TestSpecParser::escape() {
+        saveLastMode();
+        m_mode = EscapedName;
+        m_escapeChars.push_back(m_realPatternPos);
+    }
+    // Tells whether c is a control character in the current mode, i.e. one
+    // that visitChar() keeps out of m_patternName.
+    bool TestSpecParser::isControlChar( char c ) const {
+        switch( m_mode ) {
+            default:
+                return false;
+            case None:
+                return c == '~';
+            case Name:
+                return c == '[';
+            case EscapedName:
+                return true;
+            case QuotedName:
+                return c == '"';
+            case Tag:
+                return c == '[' || c == ']';
+        }
+    }
+
+    // Moves the filter under construction into the spec and starts a fresh
+    // one. Does nothing if the current filter has no patterns.
+    void TestSpecParser::addFilter() {
+        if( !m_currentFilter.m_patterns.empty() ) {
+            m_testSpec.m_filters.push_back( m_currentFilter );
+            m_currentFilter = TestSpec::Filter();
+        }
+    }
+
+    // Remembers the current mode so it can be restored after an escape.
+    void TestSpecParser::saveLastMode() {
+      lastMode = m_mode;
+    }
+
+    // Restores the mode remembered by saveLastMode().
+    void TestSpecParser::revertBackToLastMode() {
+      m_mode = lastMode;
+    }
+
+    // Handles a ',' separator. Inside a quoted name or a tag the comma is
+    // invalid: parser state is reset, m_pos is moved to the end of the input
+    // and false is returned. Otherwise the current pattern and filter are
+    // finished and true is returned.
+    bool TestSpecParser::separate() {
+      if( (m_mode==QuotedName) || (m_mode==Tag) ){
+         //invalid argument, signal failure to previous scope.
+         m_mode = None;
+         m_pos = m_arg.size();
+         m_substring.clear();
+         m_patternName.clear();
+         m_realPatternPos = 0;
+         return false;
+      }
+      endMode();
+      addFilter();
+      return true; //success
+    }
+
+    // Turns the collected pattern name into the final token: removes the
+    // escape characters at the recorded positions, strips a leading
+    // "exclude:" prefix (setting m_exclusion), and resets the pattern-name
+    // state for the next pattern.
+    std::string TestSpecParser::preprocessPattern() {
+        std::string token = m_patternName;
+        // Each removal shortens the token by one character, hence the "- i"
+        // adjustment applied to every recorded position.
+        for (std::size_t i = 0; i < m_escapeChars.size(); ++i)
+            token = token.substr(0, m_escapeChars[i] - i) + token.substr(m_escapeChars[i] - i + 1);
+        m_escapeChars.clear();
+        if (startsWith(token, "exclude:")) {
+            m_exclusion = true;
+            // 8 == length of "exclude:"
+            token = token.substr(8);
+        }
+
+        m_patternName.clear();
+        m_realPatternPos = 0;
+
+        return token;
+    }
+
+    // Adds a name pattern for the current token (wrapped in an
+    // ExcludedPattern when m_exclusion is set) to the current filter, then
+    // resets substring, exclusion flag and mode. Empty tokens add nothing.
+    void TestSpecParser::addNamePattern() {
+        auto token = preprocessPattern();
+
+        if (!token.empty()) {
+            TestSpec::PatternPtr pattern = std::make_shared<TestSpec::NamePattern>(token, m_substring);
+            if (m_exclusion)
+                pattern = std::make_shared<TestSpec::ExcludedPattern>(pattern);
+            m_currentFilter.m_patterns.push_back(pattern);
+        }
+        m_substring.clear();
+        m_exclusion = false;
+        m_mode = None;
+    }
+
+    // Adds a tag pattern for the current token (wrapped in an
+    // ExcludedPattern when m_exclusion is set) to the current filter, then
+    // resets substring, exclusion flag and mode. Empty tokens add nothing.
+    void TestSpecParser::addTagPattern() {
+        auto token = preprocessPattern();
+
+        if (!token.empty()) {
+            // If the tag pattern is the "hide and tag" shorthand (e.g. [.foo])
+            // we have to create a separate hide tag and shorten the real one
+            if (token.size() > 1 && token[0] == '.') {
+                token.erase(token.begin());
+                TestSpec::PatternPtr pattern = std::make_shared<TestSpec::TagPattern>(".", m_substring);
+                if (m_exclusion) {
+                    pattern = std::make_shared<TestSpec::ExcludedPattern>(pattern);
+                }
+                m_currentFilter.m_patterns.push_back(pattern);
+            }
+
+            TestSpec::PatternPtr pattern = std::make_shared<TestSpec::TagPattern>(token, m_substring);
+
+            if (m_exclusion) {
+                pattern = std::make_shared<TestSpec::ExcludedPattern>(pattern);
+            }
+            m_currentFilter.m_patterns.push_back(pattern);
+        }
+        m_substring.clear();
+        m_exclusion = false;
+        m_mode = None;
+    }
+
+    // Convenience wrapper: parses a single argument using the tag alias
+    // registry returned by ITagAliasRegistry::get() and returns the spec.
+    TestSpec parseTestSpec( std::string const& arg ) {
+        return TestSpecParser( ITagAliasRegistry::get() ).parse( arg ).testSpec();
+    }
+
+} // namespace Catch
+// end catch_test_spec_parser.cpp
+// start catch_timer.cpp
+
+#include <chrono>
+
+// Number of nanoseconds in one second (10^9).
+static const uint64_t nanosecondsInSecond = 1000000000;
+
+namespace Catch {
+
+    // Current reading of std::chrono::high_resolution_clock, expressed as a
+    // nanosecond count since that clock's epoch.
+    auto getCurrentNanosecondsSinceEpoch() -> uint64_t {
+        auto const sinceEpoch = std::chrono::high_resolution_clock::now().time_since_epoch();
+        return std::chrono::duration_cast<std::chrono::nanoseconds>( sinceEpoch ).count();
+    }
+
+    namespace {
+        // Estimates the clock's resolution as the mean of the smallest
+        // observable increments between consecutive readings.
+        auto estimateClockResolution() -> uint64_t {
+            static const uint64_t iterations = 1000000;
+            uint64_t sum = 0;
+
+            auto const startTime = getCurrentNanosecondsSinceEpoch();
+            // If we have been calibrating for over 3 seconds -- the clock
+            // is terrible and we should move on.
+            auto const deadline = startTime + 3 * nanosecondsInSecond;
+
+            for( std::size_t i = 0; i < iterations; ++i ) {
+                // Spin until the reading changes; the difference is one tick.
+                uint64_t const baseTicks = getCurrentNanosecondsSinceEpoch();
+                uint64_t ticks = getCurrentNanosecondsSinceEpoch();
+                while( ticks == baseTicks ) {
+                    ticks = getCurrentNanosecondsSinceEpoch();
+                }
+                sum += ticks - baseTicks;
+
+                // TBD: How to signal that the measured resolution is probably wrong?
+                if( ticks > deadline ) {
+                    return sum / ( i + 1u );
+                }
+            }
+
+            // We're just taking the mean, here. To do better we could take the std. dev and exclude outliers
+            // - and potentially do more iterations if there's a high variance.
+            return sum / iterations;
+        }
+    }
+
+    // The estimate is computed once, on first use, and then reused.
+    auto getEstimatedClockResolution() -> uint64_t {
+        static auto s_resolution = estimateClockResolution();
+        return s_resolution;
+    }
+
+    // Records the current clock reading as the start point.
+    void Timer::start() {
+        m_nanoseconds = getCurrentNanosecondsSinceEpoch();
+    }
+
+    auto Timer::getElapsedNanoseconds() const -> uint64_t {
+        uint64_t const now = getCurrentNanosecondsSinceEpoch();
+        return now - m_nanoseconds;
+    }
+
+    auto Timer::getElapsedMicroseconds() const -> uint64_t {
+        return getElapsedNanoseconds() / 1000;
+    }
+
+    auto Timer::getElapsedMilliseconds() const -> unsigned int {
+        uint64_t const millis = getElapsedMicroseconds() / 1000;
+        return static_cast<unsigned int>( millis );
+    }
+
+    auto Timer::getElapsedSeconds() const -> double {
+        return getElapsedMicroseconds() / 1000000.0;
+    }
+
+} // namespace Catch
+// end catch_timer.cpp
+// start catch_tostring.cpp
+
+#if defined(__clang__)
+#    pragma clang diagnostic push
+#    pragma clang diagnostic ignored "-Wexit-time-destructors"
+#    pragma clang diagnostic ignored "-Wglobal-constructors"
+#endif
+
+// Enable specific decls locally
+#if !defined(CATCH_CONFIG_ENABLE_CHRONO_STRINGMAKER)
+#define CATCH_CONFIG_ENABLE_CHRONO_STRINGMAKER
+#endif
+
+#include <cmath>
+#include <iomanip>
+
+namespace Catch {
+
+namespace Detail {
+
+    // The placeholder text "{?}".
+    const std::string unprintableString = "{?}";
+
+    namespace {
+        const int hexThreshold = 255;
+
+        // Run-time detection of the host byte order.
+        struct Endianness {
+            enum Arch { Big, Little };
+
+            static Arch which() {
+                int one = 1;
+                // If the lowest byte we read is non-zero, we can assume
+                // that little endian format is used.
+                char const lowestByte = *reinterpret_cast<char*>(&one);
+                if( lowestByte ) {
+                    return Little;
+                }
+                return Big;
+            }
+        };
+    }
+
+    // Renders the `size` bytes at `object` as "0x" followed by two hex digits
+    // per byte. On little endian hosts the bytes are emitted in reverse order.
+    std::string rawMemoryToString( const void *object, std::size_t size ) {
+        unsigned char const *bytes = static_cast<unsigned char const *>(object);
+        int const count = static_cast<int>( size );
+        bool const reversed = ( Endianness::which() == Endianness::Little );
+
+        ReusableStringStream rss;
+        rss << "0x" << std::setfill('0') << std::hex;
+        for( int n = 0; n < count; ++n ) {
+            int const idx = reversed ? ( count - 1 - n ) : n;
+            rss << std::setw(2) << static_cast<unsigned>(bytes[idx]);
+        }
+        return rss.str();
+    }
+}
+
+// Formats a floating point value in fixed notation with `precision` digits
+// after the decimal point, then drops insignificant trailing zeros of the
+// fractional part (always keeping at least one fractional digit, e.g. "1.0").
+// NaN is rendered as "nan".
+template<typename T>
+std::string fpToString( T value, int precision ) {
+    if (Catch::isnan(value)) {
+        return "nan";
+    }
+
+    ReusableStringStream rss;
+    rss << std::setprecision( precision )
+        << std::fixed
+        << value;
+    std::string d = rss.str();
+    // Zeros are only insignificant when they follow a decimal point. With a
+    // precision of 0 there is no fractional part, and stripping would eat
+    // digits of the integer part (e.g. turn "100" into "1").
+    if( d.find( '.' ) == std::string::npos ) {
+        return d;
+    }
+    std::size_t i = d.find_last_not_of( '0' );
+    if( i != std::string::npos && i != d.size()-1 ) {
+        // Keep one zero after a bare decimal point: "2." becomes "2.0"
+        if( d[i] == '.' )
+            i++;
+        d = d.substr( 0, i+1 );
+    }
+    return d;
+}
+
+//// ======================================================= ////
+//
+//   Out-of-line defs for full specialization of StringMaker
+//
+//// ======================================================= ////
+
+// Wraps the string in double quotes. When the active config asks for
+// invisibles to be shown, newlines and tabs are written as \n and \t.
+std::string StringMaker<std::string>::convert(const std::string& str) {
+    bool const showInvisibles = getCurrentContext().getConfig()->showInvisibles();
+    if (!showInvisibles) {
+        return '"' + str + '"';
+    }
+
+    std::string escaped;
+    escaped += '"';
+    for (std::size_t pos = 0; pos != str.size(); ++pos) {
+        char const ch = str[pos];
+        if (ch == '\n') {
+            escaped += "\\n";
+        } else if (ch == '\t') {
+            escaped += "\\t";
+        } else {
+            escaped += ch;
+        }
+    }
+    escaped += '"';
+    return escaped;
+}
+
+#ifdef CATCH_CONFIG_CPP17_STRING_VIEW
+// A string_view is copied into a std::string and formatted like one.
+std::string StringMaker<std::string_view>::convert(std::string_view str) {
+    std::string const owned{ str };
+    return ::Catch::Detail::stringify(owned);
+}
+#endif
+
+// C strings are formatted like std::string; a null pointer is reported
+// with a fixed marker text instead of being dereferenced.
+std::string StringMaker<char const*>::convert(char const* str) {
+    if (!str) {
+        return std::string( "{null string}" );
+    }
+    return ::Catch::Detail::stringify(std::string{ str });
+}
+std::string StringMaker<char*>::convert(char* str) {
+    if (!str) {
+        return std::string( "{null string}" );
+    }
+    return ::Catch::Detail::stringify(std::string{ str });
+}
+
+#ifdef CATCH_CONFIG_WCHAR
+// Narrows a wide string character by character: values up to 0xff are cast
+// to char, everything else is replaced by '?'. The narrowed text is then
+// formatted like any other std::string.
+std::string StringMaker<std::wstring>::convert(const std::wstring& wstr) {
+    std::string narrowed;
+    narrowed.reserve(wstr.size());
+    for (std::size_t pos = 0; pos != wstr.size(); ++pos) {
+        wchar_t const wc = wstr[pos];
+        if (wc <= 0xff) {
+            narrowed += static_cast<char>(wc);
+        } else {
+            narrowed += '?';
+        }
+    }
+    return ::Catch::Detail::stringify(narrowed);
+}
+
+# ifdef CATCH_CONFIG_CPP17_STRING_VIEW
+std::string StringMaker<std::wstring_view>::convert(std::wstring_view str) {
+    std::wstring const owned(str);
+    return StringMaker<std::wstring>::convert(owned);
+}
+# endif
+
+// Wide C strings: a null pointer yields a fixed marker text, anything else
+// goes through the std::wstring conversion.
+std::string StringMaker<wchar_t const*>::convert(wchar_t const * str) {
+    if (!str) {
+        return std::string( "{null string}" );
+    }
+    return ::Catch::Detail::stringify(std::wstring{ str });
+}
+std::string StringMaker<wchar_t *>::convert(wchar_t * str) {
+    if (!str) {
+        return std::string( "{null string}" );
+    }
+    return ::Catch::Detail::stringify(std::wstring{ str });
+}
+#endif
+
+#if defined(CATCH_CONFIG_CPP17_BYTE)
+#include <cstddef>
+// A std::byte is printed as its unsigned integer value.
+std::string StringMaker<std::byte>::convert(std::byte value) {
+    auto const asInteger = std::to_integer<unsigned long long>(value);
+    return ::Catch::Detail::stringify(asInteger);
+}
+#endif // defined(CATCH_CONFIG_CPP17_BYTE)
+
+// Signed integers are widened to long long and share its formatting.
+std::string StringMaker<int>::convert(int value) {
+    long long const widened = static_cast<long long>(value);
+    return ::Catch::Detail::stringify(widened);
+}
+std::string StringMaker<long>::convert(long value) {
+    long long const widened = static_cast<long long>(value);
+    return ::Catch::Detail::stringify(widened);
+}
+// Decimal value, followed by " (0x<hex>)" when it exceeds Detail::hexThreshold.
+std::string StringMaker<long long>::convert(long long value) {
+    ReusableStringStream rss;
+    rss << value;
+    if (!(value > Detail::hexThreshold)) {
+        return rss.str();
+    }
+    rss << " (0x" << std::hex << value << ')';
+    return rss.str();
+}
+
+// Unsigned integers are widened to unsigned long long and share its formatting.
+std::string StringMaker<unsigned int>::convert(unsigned int value) {
+    unsigned long long const widened = static_cast<unsigned long long>(value);
+    return ::Catch::Detail::stringify(widened);
+}
+std::string StringMaker<unsigned long>::convert(unsigned long value) {
+    unsigned long long const widened = static_cast<unsigned long long>(value);
+    return ::Catch::Detail::stringify(widened);
+}
+// Decimal value, followed by " (0x<hex>)" when it exceeds Detail::hexThreshold.
+std::string StringMaker<unsigned long long>::convert(unsigned long long value) {
+    ReusableStringStream rss;
+    rss << value;
+    if (!(value > Detail::hexThreshold)) {
+        return rss.str();
+    }
+    rss << " (0x" << std::hex << value << ')';
+    return rss.str();
+}
+
+std::string StringMaker<bool>::convert(bool b) {
+    if (b) {
+        return "true";
+    }
+    return "false";
+}
+
+// \r, \f, \n and \t are shown as quoted escape sequences, other values in
+// the range ['\0', ' ') as their numeric value, and everything else as the
+// character itself between single quotes.
+std::string StringMaker<signed char>::convert(signed char value) {
+    switch (value) {
+    case '\r':
+        return "'\\r'";
+    case '\f':
+        return "'\\f'";
+    case '\n':
+        return "'\\n'";
+    case '\t':
+        return "'\\t'";
+    default:
+        break;
+    }
+
+    if ('\0' <= value && value < ' ') {
+        return ::Catch::Detail::stringify(static_cast<unsigned int>(value));
+    }
+
+    char chstr[] = "' '";
+    chstr[1] = value;
+    return chstr;
+}
+// char is formatted through the signed char conversion.
+std::string StringMaker<char>::convert(char c) {
+    signed char const asSigned = static_cast<signed char>(c);
+    return ::Catch::Detail::stringify(asSigned);
+}
+// unsigned char is formatted through the char conversion.
+std::string StringMaker<unsigned char>::convert(unsigned char c) {
+    char const asChar = static_cast<char>(c);
+    return ::Catch::Detail::stringify(asChar);
+}
+
+std::string StringMaker<std::nullptr_t>::convert(std::nullptr_t) {
+    return std::string( "nullptr" );
+}
+
+// Number of fractional digits requested from fpToString for float values.
+int StringMaker<float>::precision = 5;
+
+// Floats carry an 'f' suffix after the formatted number.
+std::string StringMaker<float>::convert(float value) {
+    std::string text = fpToString(value, precision);
+    text += 'f';
+    return text;
+}
+
+// Number of fractional digits requested from fpToString for double values.
+int StringMaker<double>::precision = 10;
+
+std::string StringMaker<double>::convert(double value) {
+    std::string text = fpToString(value, precision);
+    return text;
+}
+
+// One-letter symbols for the sub-unit std::ratio prefixes.
+std::string ratio_string<std::atto>::symbol() {
+    return std::string( "a" );
+}
+std::string ratio_string<std::femto>::symbol() {
+    return std::string( "f" );
+}
+std::string ratio_string<std::pico>::symbol() {
+    return std::string( "p" );
+}
+std::string ratio_string<std::nano>::symbol() {
+    return std::string( "n" );
+}
+std::string ratio_string<std::micro>::symbol() {
+    return std::string( "u" );
+}
+std::string ratio_string<std::milli>::symbol() {
+    return std::string( "m" );
+}
+
+} // end namespace Catch
+
+#if defined(__clang__)
+#    pragma clang diagnostic pop
+#endif
+
+// end catch_tostring.cpp
+// start catch_totals.cpp
+
+namespace Catch {
+
+    // Member-wise difference of the three counters.
+    Counts Counts::operator - ( Counts const& other ) const {
+        Counts result;
+        result.failedButOk = failedButOk - other.failedButOk;
+        result.failed = failed - other.failed;
+        result.passed = passed - other.passed;
+        return result;
+    }
+
+    // Member-wise accumulation of the three counters.
+    Counts& Counts::operator += ( Counts const& other ) {
+        failedButOk += other.failedButOk;
+        failed += other.failed;
+        passed += other.passed;
+        return *this;
+    }
+
+    std::size_t Counts::total() const {
+        std::size_t const sum = passed + failed + failedButOk;
+        return sum;
+    }
+
+    // True when nothing failed, not even an allowed failure.
+    bool Counts::allPassed() const {
+        if( failed != 0 ) {
+            return false;
+        }
+        return failedButOk == 0;
+    }
+
+    // True when there were no hard failures.
+    bool Counts::allOk() const {
+        return failed == 0;
+    }
+
+    // Difference of the assertion and test case counters.
+    Totals Totals::operator - ( Totals const& other ) const {
+        Totals result;
+        result.testCases = testCases - other.testCases;
+        result.assertions = assertions - other.assertions;
+        return result;
+    }
+
+    // Accumulation of the assertion and test case counters.
+    Totals& Totals::operator += ( Totals const& other ) {
+        testCases += other.testCases;
+        assertions += other.assertions;
+        return *this;
+    }
+
+    // Difference to `prevTotals`, with exactly one test case counter bumped
+    // according to the assertion outcome in that difference: failed wins
+    // over failedButOk, which wins over passed.
+    Totals Totals::delta( Totals const& prevTotals ) const {
+        Totals diff = *this - prevTotals;
+        Counts& cases = diff.testCases;
+        if( diff.assertions.failed > 0 ) {
+            ++cases.failed;
+        } else if( diff.assertions.failedButOk > 0 ) {
+            ++cases.failedButOk;
+        } else {
+            ++cases.passed;
+        }
+        return diff;
+    }
+
+}
+// end catch_totals.cpp
+// start catch_uncaught_exceptions.cpp
+
+// start catch_config_uncaught_exceptions.hpp
+
+//              Copyright Catch2 Authors
+// Distributed under the Boost Software License, Version 1.0.
+//   (See accompanying file LICENSE_1_0.txt or copy at
+//        https://www.boost.org/LICENSE_1_0.txt)
+
+// SPDX-License-Identifier: BSL-1.0
+
+#ifndef CATCH_CONFIG_UNCAUGHT_EXCEPTIONS_HPP
+#define CATCH_CONFIG_UNCAUGHT_EXCEPTIONS_HPP
+
+#if defined(_MSC_VER)
+#  if _MSC_VER >= 1900 // Visual Studio 2015 or newer
+#    define CATCH_INTERNAL_CONFIG_CPP17_UNCAUGHT_EXCEPTIONS
+#  endif
+#endif
+
+#include <exception>
+
+#if defined(__cpp_lib_uncaught_exceptions) \
+    && !defined(CATCH_INTERNAL_CONFIG_CPP17_UNCAUGHT_EXCEPTIONS)
+
+#  define CATCH_INTERNAL_CONFIG_CPP17_UNCAUGHT_EXCEPTIONS
+#endif // __cpp_lib_uncaught_exceptions
+
+#if defined(CATCH_INTERNAL_CONFIG_CPP17_UNCAUGHT_EXCEPTIONS) \
+    && !defined(CATCH_CONFIG_NO_CPP17_UNCAUGHT_EXCEPTIONS) \
+    && !defined(CATCH_CONFIG_CPP17_UNCAUGHT_EXCEPTIONS)
+
+#  define CATCH_CONFIG_CPP17_UNCAUGHT_EXCEPTIONS
+#endif
+
+#endif // CATCH_CONFIG_UNCAUGHT_EXCEPTIONS_HPP
+// end catch_config_uncaught_exceptions.hpp
+#include <exception>
+
+namespace Catch {
+    // Reports whether an exception is currently in flight. The result is
+    // always false when exceptions are disabled; otherwise the C++17 counting
+    // API is used when configured, and the older boolean API as a fallback.
+    bool uncaught_exceptions() {
+#if defined(CATCH_CONFIG_DISABLE_EXCEPTIONS)
+        bool const inFlight = false;
+#elif defined(CATCH_CONFIG_CPP17_UNCAUGHT_EXCEPTIONS)
+        bool const inFlight = std::uncaught_exceptions() > 0;
+#else
+        bool const inFlight = std::uncaught_exception();
+#endif
+        return inFlight;
+    }
+} // end namespace Catch
+// end catch_uncaught_exceptions.cpp
+// start catch_version.cpp
+
+#include <ostream>
+
+namespace Catch {
+
+    // Plain member-wise initialisation from the given components.
+    Version::Version( unsigned int _majorVersion,
+                      unsigned int _minorVersion,
+                      unsigned int _patchNumber,
+                      char const * const _branchName,
+                      unsigned int _buildNumber )
+    :   majorVersion( _majorVersion ),
+        minorVersion( _minorVersion ),
+        patchNumber( _patchNumber ),
+        branchName( _branchName ),
+        buildNumber( _buildNumber )
+    {}
+
+    // Writes "major.minor.patch", followed by "-branch.build" when the
+    // branch name is not empty.
+    std::ostream& operator << ( std::ostream& os, Version const& version ) {
+        os << version.majorVersion;
+        os << '.' << version.minorVersion;
+        os << '.' << version.patchNumber;
+
+        // branchName is never null -> 0th char is \0 if it is empty
+        bool const hasBranch = ( version.branchName[0] != '\0' );
+        if( hasBranch ) {
+            os << '-' << version.branchName;
+            os << '.' << version.buildNumber;
+        }
+        return os;
+    }
+
+    // The library version, kept in a function-local static.
+    Version const& libraryVersion() {
+        static Version version( 2, 13, 3, "", 0 );
+        return version;
+    }
+
+}
+// end catch_version.cpp
+// start catch_wildcard_pattern.cpp
+
+namespace Catch {
+
+    // Normalises the pattern, then strips one leading and/or one trailing '*'
+    // while recording where the wildcards were.
+    WildcardPattern::WildcardPattern( std::string const& pattern,
+                                      CaseSensitive::Choice caseSensitivity )
+    :   m_caseSensitivity( caseSensitivity ),
+        m_pattern( normaliseString( pattern ) )
+    {
+        if( startsWith( m_pattern, '*' ) ) {
+            m_pattern.erase( 0, 1 );
+            m_wildcard = WildcardAtStart;
+        }
+        if( endsWith( m_pattern, '*' ) ) {
+            m_pattern.pop_back();
+            m_wildcard = static_cast<WildcardPosition>( m_wildcard | WildcardAtEnd );
+        }
+    }
+
+    // Compares the normalised input against the stored pattern: equality,
+    // suffix, prefix or substring test depending on the wildcard position.
+    bool WildcardPattern::matches( std::string const& str ) const {
+        if( m_wildcard == NoWildcard ) {
+            return m_pattern == normaliseString( str );
+        }
+        if( m_wildcard == WildcardAtStart ) {
+            return endsWith( normaliseString( str ), m_pattern );
+        }
+        if( m_wildcard == WildcardAtEnd ) {
+            return startsWith( normaliseString( str ), m_pattern );
+        }
+        if( m_wildcard == WildcardAtBothEnds ) {
+            return contains( normaliseString( str ), m_pattern );
+        }
+        CATCH_INTERNAL_ERROR( "Unknown enum" );
+    }
+
+    // Trims the string, lower-casing it first for case-insensitive patterns.
+    std::string WildcardPattern::normaliseString( std::string const& str ) const {
+        if( m_caseSensitivity == CaseSensitive::No ) {
+            return trim( toLower( str ) );
+        }
+        return trim( str );
+    }
+}
+// end catch_wildcard_pattern.cpp
+// start catch_xmlwriter.cpp
+
+#include <iomanip>
+#include <type_traits>
+
+namespace Catch {
+
+namespace {
+
+    // Length in bytes (2, 3 or 4) of the UTF-8 sequence introduced by the
+    // lead byte `c`, the lead byte itself included.
+    size_t trailingBytes(unsigned char c) {
+        bool const twoByteLead   = ( (c & 0xE0) == 0xC0 );
+        bool const threeByteLead = ( (c & 0xF0) == 0xE0 );
+        bool const fourByteLead  = ( (c & 0xF8) == 0xF0 );
+        if (twoByteLead)   { return 2; }
+        if (threeByteLead) { return 3; }
+        if (fourByteLead)  { return 4; }
+        CATCH_INTERNAL_ERROR("Invalid multibyte utf-8 start byte encountered");
+    }
+
+    // Payload bits carried by the UTF-8 lead byte `c`.
+    uint32_t headerValue(unsigned char c) {
+        bool const twoByteLead   = ( (c & 0xE0) == 0xC0 );
+        bool const threeByteLead = ( (c & 0xF0) == 0xE0 );
+        bool const fourByteLead  = ( (c & 0xF8) == 0xF0 );
+        if (twoByteLead)   { return c & 0x1F; }
+        if (threeByteLead) { return c & 0x0F; }
+        if (fourByteLead)  { return c & 0x07; }
+        CATCH_INTERNAL_ERROR("Invalid multibyte utf-8 start byte encountered");
+    }
+
+    // Writes `c` as \xHH (two uppercase hex digits), restoring the stream's
+    // format flags afterwards.
+    void hexEscapeChar(std::ostream& os, unsigned char c) {
+        std::ios_base::fmtflags const savedFlags = os.flags();
+        os << "\\x";
+        os << std::uppercase << std::hex << std::setfill('0') << std::setw(2)
+           << static_cast<int>(c);
+        os.flags(savedFlags);
+    }
+
+    // True when the Newline bit is set in `fmt`.
+    bool shouldNewline(XmlFormatting fmt) {
+        using Underlying = std::underlying_type<XmlFormatting>::type;
+        return static_cast<Underlying>(fmt & XmlFormatting::Newline) != 0;
+    }
+
+    // True when the Indent bit is set in `fmt`.
+    bool shouldIndent(XmlFormatting fmt) {
+        using Underlying = std::underlying_type<XmlFormatting>::type;
+        return static_cast<Underlying>(fmt & XmlFormatting::Indent) != 0;
+    }
+
+} // anonymous namespace
+
+    // Bitwise OR of two formatting flag sets.
+    XmlFormatting operator | (XmlFormatting lhs, XmlFormatting rhs) {
+        using Underlying = std::underlying_type<XmlFormatting>::type;
+        Underlying const left = static_cast<Underlying>(lhs);
+        Underlying const right = static_cast<Underlying>(rhs);
+        return static_cast<XmlFormatting>(left | right);
+    }
+
+    // Bitwise AND of two formatting flag sets.
+    XmlFormatting operator & (XmlFormatting lhs, XmlFormatting rhs) {
+        using Underlying = std::underlying_type<XmlFormatting>::type;
+        Underlying const left = static_cast<Underlying>(lhs);
+        Underlying const right = static_cast<Underlying>(rhs);
+        return static_cast<XmlFormatting>(left & right);
+    }
+
+    // Stores the text to encode and the context it is being encoded for.
+    XmlEncode::XmlEncode( std::string const& str, ForWhat forWhat )
+    : m_str( str )
+    , m_forWhat( forWhat )
+    {}
+
+    // Writes m_str to `os` with XML escaping applied:
+    //  * '<' and '&' are always replaced by entities,
+    //  * '>' is replaced only when it terminates a "]]>" sequence,
+    //  * '"' is replaced only when encoding for attributes,
+    //  * ASCII control characters and bytes that do not form a well-formed
+    //    UTF-8 sequence are written as \xHH escapes,
+    //  * well-formed multi-byte UTF-8 sequences are passed through unchanged.
+    void XmlEncode::encodeTo( std::ostream& os ) const {
+        // Apostrophe escaping not necessary if we always use " to write attributes
+        // (see: http://www.w3.org/TR/xml/#syntax)
+
+        for( std::size_t idx = 0; idx < m_str.size(); ++ idx ) {
+            unsigned char c = m_str[idx];
+            switch (c) {
+            case '<':   os << "&lt;"; break;
+            case '&':   os << "&amp;"; break;
+
+            case '>':
+                // See: http://www.w3.org/TR/xml/#syntax
+                // "]]>" must not appear in character data. The check needs two
+                // preceding characters, so the earliest possible position is
+                // idx == 2 (a string that begins with "]]>").
+                if (idx >= 2 && m_str[idx - 1] == ']' && m_str[idx - 2] == ']')
+                    os << "&gt;";
+                else
+                    os << c;
+                break;
+
+            case '\"':
+                if (m_forWhat == ForAttributes)
+                    os << "&quot;";
+                else
+                    os << c;
+                break;
+
+            default:
+                // Check for control characters and invalid utf-8
+
+                // Escape control characters in standard ascii
+                // see http://stackoverflow.com/questions/404107/why-are-control-characters-illegal-in-xml-1-0
+                if (c < 0x09 || (c > 0x0D && c < 0x20) || c == 0x7F) {
+                    hexEscapeChar(os, c);
+                    break;
+                }
+
+                // Plain ASCII: Write it to stream
+                if (c < 0x7F) {
+                    os << c;
+                    break;
+                }
+
+                // UTF-8 territory
+                // Check if the encoding is valid and if it is not, hex escape bytes.
+                // Important: We do not check the exact decoded values for validity, only the encoding format
+                // First check that this byte is a valid lead byte:
+                // This means that it is not encoded as 1111 1XXX
+                // Or as 10XX XXXX
+                if (c <  0xC0 ||
+                    c >= 0xF8) {
+                    hexEscapeChar(os, c);
+                    break;
+                }
+
+                auto encBytes = trailingBytes(c);
+                // Are there enough bytes left to avoid accessing out-of-bounds memory?
+                if (idx + encBytes - 1 >= m_str.size()) {
+                    hexEscapeChar(os, c);
+                    break;
+                }
+                // The header is valid, check data
+                // The next encBytes bytes must together be a valid utf-8
+                // This means: bitpattern 10XX XXXX and the extracted value is sane (ish)
+                bool valid = true;
+                uint32_t value = headerValue(c);
+                for (std::size_t n = 1; n < encBytes; ++n) {
+                    unsigned char nc = m_str[idx + n];
+                    valid &= ((nc & 0xC0) == 0x80);
+                    value = (value << 6) | (nc & 0x3F);
+                }
+
+                if (
+                    // Wrong bit pattern of following bytes
+                    (!valid) ||
+                    // Overlong encodings: the value would have fit into a
+                    // shorter sequence. Both range checks are inclusive at the
+                    // lower bound so that U+0800 in 4 bytes is rejected too.
+                    (value < 0x80) ||
+                    (0x80 <= value && value < 0x800   && encBytes > 2) ||
+                    (0x800 <= value && value < 0x10000 && encBytes > 3) ||
+                    // Encoded value out of range
+                    (value >= 0x110000)
+                    ) {
+                    hexEscapeChar(os, c);
+                    break;
+                }
+
+                // If we got here, this is in fact a valid(ish) utf-8 sequence
+                for (std::size_t n = 0; n < encBytes; ++n) {
+                    os << m_str[idx + n];
+                }
+                // Skip the continuation bytes that were just written
+                idx += encBytes - 1;
+                break;
+            }
+        }
+    }
+
+    // Stream insertion simply forwards to XmlEncode::encodeTo.
+    std::ostream& operator << ( std::ostream& os, XmlEncode const& xmlEncode ) {
+        XmlEncode const& encoder = xmlEncode;
+        encoder.encodeTo( os );
+        return os;
+    }
+
+    // Remembers the writer and the formatting to use when the element is closed.
+    XmlWriter::ScopedElement::ScopedElement( XmlWriter* writer, XmlFormatting fmt )
+    : m_writer( writer )
+    , m_fmt( fmt )
+    {}
+
+    // Takes over the other guard's writer and formatting; the moved-from
+    // guard is left without a writer, so its destructor does nothing.
+    XmlWriter::ScopedElement::ScopedElement( ScopedElement&& other ) noexcept
+    : m_writer( other.m_writer )
+    , m_fmt( other.m_fmt )
+    {
+        other.m_fmt = XmlFormatting::None;
+        other.m_writer = nullptr;
+    }
+    // Move assignment: closes the element currently guarded (if any) and
+    // takes over the other guard's writer and formatting. The moved-from
+    // guard is left without a writer, so its destructor does nothing.
+    XmlWriter::ScopedElement& XmlWriter::ScopedElement::operator=( ScopedElement&& other ) noexcept {
+        // Self-assignment must not close the element and drop the writer
+        if ( this == &other ) {
+            return *this;
+        }
+        if ( m_writer ) {
+            // Close with the formatting this guard was created with, exactly
+            // as the destructor does, rather than with endElement's default.
+            m_writer->endElement(m_fmt);
+        }
+        m_writer = other.m_writer;
+        other.m_writer = nullptr;
+        m_fmt = other.m_fmt;
+        other.m_fmt = XmlFormatting::None;
+        return *this;
+    }
+
+    // Closes the guarded element with the stored formatting, unless this
+    // guard no longer holds a writer.
+    XmlWriter::ScopedElement::~ScopedElement() {
+        if (!m_writer) {
+            return;
+        }
+        m_writer->endElement(m_fmt);
+    }
+
+    // Forwards the text to the underlying writer; returns this guard so
+    // that calls can be chained.
+    XmlWriter::ScopedElement& XmlWriter::ScopedElement::writeText( std::string const& text, XmlFormatting fmt ) {
+        XmlWriter& writer = *m_writer;
+        writer.writeText( text, fmt );
+        return *this;
+    }
+
+    // Binds the writer to the output stream and immediately calls
+    // writeDeclaration().
+    XmlWriter::XmlWriter( std::ostream& os )
+    : m_os( os )
+    {
+        writeDeclaration();
+    }
+
+    // Closes every element that is still open (innermost first) and then emits
+    // a final newline if one is pending.
+    XmlWriter::~XmlWriter() {
+        while (!m_tags.empty()) {
+            endElement();
+        }
+        newlineIfNecessary();
+    }
+
+    // Opens a new element called `name` and returns *this for chaining.
+    // A start tag that is still open is closed first and a pending newline is
+    // emitted. The new tag is left open (no '>' written yet) so that attributes
+    // can still be appended to it.
+    XmlWriter& XmlWriter::startElement( std::string const& name, XmlFormatting fmt ) {
+        ensureTagClosed();
+        newlineIfNecessary();
+        if (shouldIndent(fmt)) {
+            // Write the current indent, then deepen it by two spaces for children.
+            m_os << m_indent;
+            m_indent += "  ";
+        }
+        m_os << '<' << name;
+        // Remember the name so endElement() can write the matching end tag.
+        m_tags.push_back( name );
+        m_tagIsOpen = true;
+        applyFormatting(fmt);
+        return *this;
+    }
+
+    // Opens element `name` and returns a guard whose destructor closes it
+    // with the same formatting flags.
+    XmlWriter::ScopedElement XmlWriter::scopedElement( std::string const& name, XmlFormatting fmt ) {
+        ScopedElement scoped( this, fmt );
+        startElement( name, fmt );
+        return scoped;
+    }
+
+    // Closes the most recently opened element and returns *this. A start tag
+    // that is still open is collapsed to "/>"; otherwise an explicit "</name>"
+    // end tag is written, preceded by any pending newline and, if fmt asks for
+    // it, the current indent. The stream is flushed after every end tag.
+    XmlWriter& XmlWriter::endElement(XmlFormatting fmt) {
+        // Drop one two-space indent level.
+        // NOTE(review): this runs even when the matching startElement() did not
+        // indent. Assuming m_indent is a std::string: when it is shorter than
+        // two characters the unsigned subtraction wraps around and substr()
+        // returns the string unchanged.
+        m_indent = m_indent.substr(0, m_indent.size() - 2);
+
+        if( m_tagIsOpen ) {
+            m_os << "/>";
+            m_tagIsOpen = false;
+        } else {
+            newlineIfNecessary();
+            if (shouldIndent(fmt)) {
+                m_os << m_indent;
+            }
+            // NOTE(review): assumes at least one element is open; m_tags is not
+            // checked for emptiness before back()/pop_back().
+            m_os << "</" << m_tags.back() << ">";
+        }
+        m_os << std::flush;
+        applyFormatting(fmt);
+        m_tags.pop_back();
+        return *this;
+    }
+
+    // Appends ` name="value"` to the current output, with the value encoded
+    // for attribute context. Nothing is written when either the name or the
+    // value is empty. Returns *this for chaining.
+    XmlWriter& XmlWriter::writeAttribute( std::string const& name, std::string const& attribute ) {
+        if( name.empty() || attribute.empty() ) {
+            return *this;
+        }
+        m_os << ' ' << name << "=\"";
+        m_os << XmlEncode( attribute, XmlEncode::ForAttributes ) << '"';
+        return *this;
+    }
+
+    // Appends ` name="true"` or ` name="false"`. Unlike the string overload,
+    // this one does not skip an empty name. Returns *this for chaining.
+    XmlWriter& XmlWriter::writeAttribute( std::string const& name, bool attribute ) {
+        m_os << ' ' << name << "=\"" << ( attribute ? "true" : "false" ) << '"';
+        return *this;
+    }
+
+    // Writes XML-encoded text content and returns *this. Empty text is a
+    // no-op. The indent is only written when this call is what closed the
+    // enclosing start tag and fmt asks for indentation.
+    XmlWriter& XmlWriter::writeText( std::string const& text, XmlFormatting fmt) {
+        if( text.empty() ) {
+            return *this;
+        }
+
+        // Must be sampled before ensureTagClosed() resets the flag.
+        const bool closedStartTag = m_tagIsOpen;
+        ensureTagClosed();
+        if( closedStartTag && shouldIndent(fmt) ) {
+            m_os << m_indent;
+        }
+
+        m_os << XmlEncode( text );
+        applyFormatting(fmt);
+        return *this;
+    }
+
+    // Writes `text` wrapped in an XML comment and returns *this for chaining.
+    // NOTE(review): the text is written verbatim (no XmlEncode), so a "--"
+    // sequence inside `text` would end up in the output unchanged.
+    XmlWriter& XmlWriter::writeComment( std::string const& text, XmlFormatting fmt) {
+        ensureTagClosed();
+        if (shouldIndent(fmt)) {
+            m_os << m_indent;
+        }
+        m_os << "<!--" << text << "-->";
+        applyFormatting(fmt);
+        return *this;
+    }
+
+    // Emits an xml-stylesheet processing instruction of type text/xsl that
+    // points at `url`, followed by a newline. The url is written verbatim.
+    void XmlWriter::writeStylesheetRef( std::string const& url ) {
+        m_os << "<?xml-stylesheet type=\"text/xsl\" href=\"" << url << "\"?>\n";
+    }
+
+    // Closes any open start tag, then writes a single '\n'. Returns *this.
+    XmlWriter& XmlWriter::writeBlankLine() {
+        ensureTagClosed();
+        m_os << '\n';
+        return *this;
+    }
+
+    // If a start tag is still open, terminates it with '>', flushes the
+    // stream, emits any pending newline and marks the tag as closed.
+    void XmlWriter::ensureTagClosed() {
+        if( !m_tagIsOpen ) {
+            return;
+        }
+        m_os << '>' << std::flush;
+        newlineIfNecessary();
+        m_tagIsOpen = false;
+    }
+
+    // Records whether a newline should be emitted before the next piece of
+    // output, as decided by shouldNewline(fmt).
+    void XmlWriter::applyFormatting(XmlFormatting fmt) {
+        m_needsNewline = shouldNewline(fmt);
+    }
+
+    // Writes the XML 1.0 / UTF-8 declaration line.
+    void XmlWriter::writeDeclaration() {
+        m_os << "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n";
+    }
+
+    // Emits the newline requested by the last applyFormatting() call, if any,
+    // and clears the request. std::endl also flushes the stream.
+    void XmlWriter::newlineIfNecessary() {
+        if( !m_needsNewline ) {
+            return;
+        }
+        m_os << std::endl;
+        m_needsNewline = false;
+    }
+}
+// end catch_xmlwriter.cpp
+// start catch_reporter_bases.cpp
+
+#include <cstring>
+#include <cfloat>
+#include <cstdio>
+#include <cassert>
+#include <memory>
+
+namespace Catch {
+    // Calls getExpandedExpression() and discards the return value.
+    // NOTE(review): presumably this forces the expansion to be computed and
+    // cached inside `result` - confirm against AssertionResult.
+    void prepareExpandedExpression(AssertionResult& result) {
+        result.getExpandedExpression();
+    }
+
+    // Formats `duration` with exactly three decimal places ("%.3f") and returns
+    // it as a string.
+    // Because formatting using c++ streams is stateful, drop down to C is required
+    // Alternatively we could use stringstream, but its performance is... not good.
+    std::string getFormattedDuration( double duration ) {
+        // + 1 for a leading minus sign
+        // Max exponent + 1 is required to represent the whole part
+        // + 1 for decimal point
+        // + 3 for the 3 decimal places
+        // + 1 for null terminator
+        const std::size_t maxDoubleSize = 1 + DBL_MAX_10_EXP + 1 + 1 + 3 + 1;
+        char buffer[maxDoubleSize];
+
+        // Save previous errno, to prevent the formatting call from overwriting it
+        ErrnoGuard guard;
+#ifdef _MSC_VER
+        sprintf_s(buffer, "%.3f", duration);
+#else
+        // Bounded write: never runs past the end of `buffer`.
+        std::snprintf(buffer, maxDoubleSize, "%.3f", duration);
+#endif
+        return std::string(buffer);
+    }
+
+    // Decides whether a duration line should be reported.
+    // ShowDurations::Always and ShowDurations::Never are absolute; for any
+    // other setting the duration is shown only when config.minDuration() is
+    // non-negative and `duration` is at least that large.
+    bool shouldShowDuration( IConfig const& config, double duration ) {
+        auto const mode = config.showDurations();
+        if ( mode == ShowDurations::Always || mode == ShowDurations::Never ) {
+            return mode == ShowDurations::Always;
+        }
+        const double threshold = config.minDuration();
+        return threshold >= 0 && duration >= threshold;
+    }
+
+    // Joins the filter strings into one string, separated by single spaces.
+    // An empty container yields an empty string.
+    std::string serializeFilters( std::vector<std::string> const& container ) {
+        ReusableStringStream joined;
+        for (auto it = container.begin(); it != container.end(); ++it) {
+            // Separator goes before every element except the first.
+            if (it != container.begin()) {
+                joined << ' ';
+            }
+            joined << *it;
+        }
+        return joined.str();
+    }
+
+    // Forwards the reporter configuration to StreamingReporterBase.
+    TestEventListenerBase::TestEventListenerBase(ReporterConfig const & _config)
+        :StreamingReporterBase(_config) {}
+
+    // Listeners declare support for the Quiet, Normal and High verbosities.
+    std::set<Verbosity> TestEventListenerBase::getSupportedVerbosities() {
+        return { Verbosity::Quiet, Verbosity::Normal, Verbosity::High };
+    }
+
+    // Default hook: intentionally does nothing.
+    void TestEventListenerBase::assertionStarting(AssertionInfo const &) {}
+
+    // Default hook: ignores the assertion and always returns false.
+    bool TestEventListenerBase::assertionEnded(AssertionStats const &) {
+        return false;
+    }
+
+} // end namespace Catch
+// end catch_reporter_bases.cpp
+// start catch_reporter_compact.cpp
+
+namespace {
+
+// Result labels used by the compact reporter: upper case when
+// CATCH_PLATFORM_MAC is defined, lower case otherwise.
+#ifdef CATCH_PLATFORM_MAC
+    const char* failedString() { return "FAILED"; }
+    const char* passedString() { return "PASSED"; }
+#else
+    const char* failedString() { return "failed"; }
+    const char* passedString() { return "passed"; }
+#endif
+
+    // Colour used for de-emphasised text in the compact reporter; currently
+    // Colour::FileName (the original note here reads "Colour::LightGrey").
+    Catch::Colour::Code dimColour() { return Catch::Colour::FileName; }
+
+    // Returns the quantifier that precedes a count in the totals line:
+    // nothing for exactly one, "both " for exactly two, "all " for any other
+    // count (including zero).
+    std::string bothOrAll( std::size_t count ) {
+        if ( count == 1 ) {
+            return std::string();
+        }
+        return count == 2 ? "both " : "all ";
+    }
+
+} // anon namespace
+
+namespace Catch {
+namespace {
+// Writes the one-line run summary for the compact reporter to `out`.
+// Colour, message variants:
+// - white: No tests ran.
+// -   red: Failed [both/all] N test cases, failed [both/all] M assertions.
+// - white: Passed [both/all] N test cases (no assertions).
+// -   red: Failed N test cases, failed M assertions.
+// - green: Passed [both/all] N test cases with M assertions.
+void printTotals(std::ostream& out, const Totals& totals) {
+    if (totals.testCases.total() == 0) {
+        out << "No tests ran.";
+    } else if (totals.testCases.failed == totals.testCases.total()) {
+        // Every test case failed.
+        Colour colour(Colour::ResultError);
+        // Only qualify the assertion count when every assertion failed too.
+        const std::string qualify_assertions_failed =
+            totals.assertions.failed == totals.assertions.total() ?
+            bothOrAll(totals.assertions.failed) : std::string();
+        out <<
+            "Failed " << bothOrAll(totals.testCases.failed)
+            << pluralise(totals.testCases.failed, "test case") << ", "
+            "failed " << qualify_assertions_failed <<
+            pluralise(totals.assertions.failed, "assertion") << '.';
+    } else if (totals.assertions.total() == 0) {
+        out <<
+            "Passed " << bothOrAll(totals.testCases.total())
+            << pluralise(totals.testCases.total(), "test case")
+            << " (no assertions).";
+    } else if (totals.assertions.failed) {
+        // Some, but not all, test cases failed.
+        Colour colour(Colour::ResultError);
+        out <<
+            "Failed " << pluralise(totals.testCases.failed, "test case") << ", "
+            "failed " << pluralise(totals.assertions.failed, "assertion") << '.';
+    } else {
+        Colour colour(Colour::ResultSuccess);
+        out <<
+            "Passed " << bothOrAll(totals.testCases.passed)
+            << pluralise(totals.testCases.passed, "test case") <<
+            " with " << pluralise(totals.assertions.passed, "assertion") << '.';
+    }
+}
+
+// Implementation of CompactReporter formatting
+// Prints one assertion result on a single line: source location, result
+// label, expression(s) and the attached messages. The info messages are
+// copied on construction and walked with a cursor (itMessage), so the
+// message-printing helpers are not const.
+class AssertionPrinter {
+public:
+    AssertionPrinter& operator= (AssertionPrinter const&) = delete;
+    AssertionPrinter(AssertionPrinter const&) = delete;
+    // _printInfoMessages: when false, messages of type ResultWas::Info are
+    // skipped by printRemainingMessages().
+    AssertionPrinter(std::ostream& _stream, AssertionStats const& _stats, bool _printInfoMessages)
+        : stream(_stream)
+        , result(_stats.assertionResult)
+        , messages(_stats.infoMessages)
+        // The cursor must point into our own copy, not into _stats.infoMessages:
+        // it is later compared against messages.end(). `messages` is declared
+        // (and therefore initialised) before itMessage.
+        , itMessage(messages.begin())
+        , printInfoMessages(_printInfoMessages) {}
+
+    // Writes the complete line for this assertion (without a trailing newline).
+    void print() {
+        printSourceInfo();
+
+        // Restart the message cursor so print() always begins at the first message.
+        itMessage = messages.begin();
+
+        switch (result.getResultType()) {
+        case ResultWas::Ok:
+            printResultType(Colour::ResultSuccess, passedString());
+            printOriginalExpression();
+            printReconstructedExpression();
+            if (!result.hasExpression())
+                printRemainingMessages(Colour::None);
+            else
+                printRemainingMessages();
+            break;
+        case ResultWas::ExpressionFailed:
+            if (result.isOk())
+                printResultType(Colour::ResultSuccess, failedString() + std::string(" - but was ok"));
+            else
+                printResultType(Colour::Error, failedString());
+            printOriginalExpression();
+            printReconstructedExpression();
+            printRemainingMessages();
+            break;
+        case ResultWas::ThrewException:
+            printResultType(Colour::Error, failedString());
+            printIssue("unexpected exception with message:");
+            printMessage();
+            printExpressionWas();
+            printRemainingMessages();
+            break;
+        case ResultWas::FatalErrorCondition:
+            printResultType(Colour::Error, failedString());
+            printIssue("fatal error condition with message:");
+            printMessage();
+            printExpressionWas();
+            printRemainingMessages();
+            break;
+        case ResultWas::DidntThrowException:
+            printResultType(Colour::Error, failedString());
+            printIssue("expected exception, got none");
+            printExpressionWas();
+            printRemainingMessages();
+            break;
+        case ResultWas::Info:
+            printResultType(Colour::None, "info");
+            printMessage();
+            printRemainingMessages();
+            break;
+        case ResultWas::Warning:
+            printResultType(Colour::None, "warning");
+            printMessage();
+            printRemainingMessages();
+            break;
+        case ResultWas::ExplicitFailure:
+            printResultType(Colour::Error, failedString());
+            printIssue("explicitly");
+            printRemainingMessages(Colour::None);
+            break;
+            // These cases are here to prevent compiler warnings
+        case ResultWas::Unknown:
+        case ResultWas::FailureBit:
+        case ResultWas::Exception:
+            printResultType(Colour::Error, "** internal error **");
+            break;
+        }
+    }
+
+private:
+    // "<source info>:" in the file-name colour.
+    void printSourceInfo() const {
+        Colour colourGuard(Colour::FileName);
+        stream << result.getSourceInfo() << ':';
+    }
+
+    // " <label>:" with only the label coloured; nothing for an empty label.
+    void printResultType(Colour::Code colour, std::string const& passOrFail) const {
+        if (!passOrFail.empty()) {
+            {
+                Colour colourGuard(colour);
+                stream << ' ' << passOrFail;
+            }
+            stream << ':';
+        }
+    }
+
+    void printIssue(std::string const& issue) const {
+        stream << ' ' << issue;
+    }
+
+    // "; expression was: <expr>" when the result has an expression.
+    void printExpressionWas() {
+        if (result.hasExpression()) {
+            stream << ';';
+            {
+                Colour colour(dimColour());
+                stream << " expression was:";
+            }
+            printOriginalExpression();
+        }
+    }
+
+    void printOriginalExpression() const {
+        if (result.hasExpression()) {
+            stream << ' ' << result.getExpression();
+        }
+    }
+
+    // " for: <expanded expression>" when an expansion is available.
+    void printReconstructedExpression() const {
+        if (result.hasExpandedExpression()) {
+            {
+                Colour colour(dimColour());
+                stream << " for: ";
+            }
+            stream << result.getExpandedExpression();
+        }
+    }
+
+    // Prints the message under the cursor in single quotes and advances the
+    // cursor; does nothing once all messages have been consumed.
+    void printMessage() {
+        if (itMessage != messages.end()) {
+            stream << " '" << itMessage->message << '\'';
+            ++itMessage;
+        }
+    }
+
+    // Prints " with N message(s):" followed by every remaining message,
+    // joined by " and". N counts all remaining messages, including info
+    // messages that are skipped when printInfoMessages is false.
+    void printRemainingMessages(Colour::Code colour = dimColour()) {
+        if (itMessage == messages.end())
+            return;
+
+        const auto itEnd = messages.cend();
+        const auto N = static_cast<std::size_t>(std::distance(itMessage, itEnd));
+
+        {
+            Colour colourGuard(colour);
+            stream << " with " << pluralise(N, "message") << ':';
+        }
+
+        while (itMessage != itEnd) {
+            // If this assertion is a warning ignore any INFO messages
+            if (printInfoMessages || itMessage->type != ResultWas::Info) {
+                // printMessage() already advanced the cursor, hence `continue`.
+                printMessage();
+                if (itMessage != itEnd) {
+                    Colour colourGuard(dimColour());
+                    stream << " and";
+                }
+                continue;
+            }
+            ++itMessage;
+        }
+    }
+
+private:
+    std::ostream& stream;
+    AssertionResult const& result;
+    std::vector<MessageInfo> messages;
+    std::vector<MessageInfo>::const_iterator itMessage;
+    bool printInfoMessages;
+};
+
+} // anon namespace
+
+        // Human-readable description of this reporter.
+        std::string CompactReporter::getDescription() {
+            return "Reports test results on a single line, suitable for IDEs";
+        }
+
+        // Reports that the given test spec matched no test cases.
+        void CompactReporter::noMatchingTestCases( std::string const& spec ) {
+            stream << "No test cases matched '" << spec << '\'' << std::endl;
+        }
+
+        // Nothing is printed when an assertion starts.
+        void CompactReporter::assertionStarting( AssertionInfo const& ) {}
+
+        // Prints one line for a finished assertion. Returns false when the
+        // assertion is dropped (it was ok, successful results are not being
+        // reported, and it is not a warning), true when a line was written.
+        // A warning that would otherwise be dropped is still printed, but
+        // without its info messages.
+        bool CompactReporter::assertionEnded( AssertionStats const& _assertionStats ) {
+            AssertionResult const& outcome = _assertionStats.assertionResult;
+
+            // True when this is a successful result we were not asked to show.
+            const bool hiddenSuccess =
+                !m_config->includeSuccessfulResults() && outcome.isOk();
+            if( hiddenSuccess && outcome.getResultType() != ResultWas::Warning ) {
+                return false;
+            }
+
+            AssertionPrinter printer( stream, _assertionStats, !hiddenSuccess );
+            printer.print();
+
+            stream << std::endl;
+            return true;
+        }
+
+        // Prints "<seconds> s: <section name>" when shouldShowDuration()
+        // allows it for this section's duration.
+        void CompactReporter::sectionEnded(SectionStats const& _sectionStats) {
+            double dur = _sectionStats.durationInSeconds;
+            if ( shouldShowDuration( *m_config, dur ) ) {
+                stream << getFormattedDuration( dur ) << " s: " << _sectionStats.sectionInfo.name << std::endl;
+            }
+        }
+
+        // Prints the totals line followed by a blank line, then lets the
+        // base class finish the run.
+        void CompactReporter::testRunEnded( TestRunStats const& _testRunStats ) {
+            printTotals( stream, _testRunStats.totals );
+            stream << '\n' << std::endl;
+            StreamingReporterBase::testRunEnded( _testRunStats );
+        }
+
+        // Out-of-line empty destructor.
+        CompactReporter::~CompactReporter() {}
+
+    // Registers this reporter under the name "compact".
+    CATCH_REGISTER_REPORTER( "compact", CompactReporter )
+
+} // end namespace Catch
+// end catch_reporter_compact.cpp
+// start catch_reporter_console.cpp
+
+#include <cfloat>
+#include <cstdio>
+
+#if defined(_MSC_VER)
+#pragma warning(push)
+#pragma warning(disable:4061) // Not all labels are EXPLICITLY handled in switch
+ // Note that 4062 (not all labels are handled and default is missing) is enabled
+#endif
+
+#if defined(__clang__)
+#  pragma clang diagnostic push
+// For simplicity, benchmarking-only helpers are always enabled
+#  pragma clang diagnostic ignored "-Wunused-function"
+#endif
+
+namespace Catch {
+
+namespace {
+
+// Formatter impl for ConsoleReporter
+// The constructor derives the colour, the PASSED/FAILED label and the
+// message label from the result type; print() then writes the multi-line
+// report for one assertion.
+class ConsoleAssertionPrinter {
+public:
+    ConsoleAssertionPrinter& operator= (ConsoleAssertionPrinter const&) = delete;
+    ConsoleAssertionPrinter(ConsoleAssertionPrinter const&) = delete;
+    // _printInfoMessages: when false, messages of type ResultWas::Info are
+    // left out by printMessage().
+    // NOTE(review): `message` is initialised from result.getMessage() but is
+    // not read by any method visible in this class.
+    ConsoleAssertionPrinter(std::ostream& _stream, AssertionStats const& _stats, bool _printInfoMessages)
+        : stream(_stream),
+        stats(_stats),
+        result(_stats.assertionResult),
+        colour(Colour::None),
+        message(result.getMessage()),
+        messages(_stats.infoMessages),
+        printInfoMessages(_printInfoMessages) {
+        switch (result.getResultType()) {
+        case ResultWas::Ok:
+            colour = Colour::Success;
+            passOrFail = "PASSED";
+            //if( result.hasMessage() )
+            if (_stats.infoMessages.size() == 1)
+                messageLabel = "with message";
+            if (_stats.infoMessages.size() > 1)
+                messageLabel = "with messages";
+            break;
+        case ResultWas::ExpressionFailed:
+            if (result.isOk()) {
+                colour = Colour::Success;
+                passOrFail = "FAILED - but was ok";
+            } else {
+                colour = Colour::Error;
+                passOrFail = "FAILED";
+            }
+            if (_stats.infoMessages.size() == 1)
+                messageLabel = "with message";
+            if (_stats.infoMessages.size() > 1)
+                messageLabel = "with messages";
+            break;
+        case ResultWas::ThrewException:
+            colour = Colour::Error;
+            passOrFail = "FAILED";
+            // With no info messages the label keeps its trailing "with ".
+            messageLabel = "due to unexpected exception with ";
+            if (_stats.infoMessages.size() == 1)
+                messageLabel += "message";
+            if (_stats.infoMessages.size() > 1)
+                messageLabel += "messages";
+            break;
+        case ResultWas::FatalErrorCondition:
+            colour = Colour::Error;
+            passOrFail = "FAILED";
+            messageLabel = "due to a fatal error condition";
+            break;
+        case ResultWas::DidntThrowException:
+            colour = Colour::Error;
+            passOrFail = "FAILED";
+            messageLabel = "because no exception was thrown where one was expected";
+            break;
+        case ResultWas::Info:
+            messageLabel = "info";
+            break;
+        case ResultWas::Warning:
+            messageLabel = "warning";
+            break;
+        case ResultWas::ExplicitFailure:
+            passOrFail = "FAILED";
+            colour = Colour::Error;
+            if (_stats.infoMessages.size() == 1)
+                messageLabel = "explicitly with message";
+            if (_stats.infoMessages.size() > 1)
+                messageLabel = "explicitly with messages";
+            break;
+            // These cases are here to prevent compiler warnings
+        case ResultWas::Unknown:
+        case ResultWas::FailureBit:
+        case ResultWas::Exception:
+            passOrFail = "** internal error **";
+            colour = Colour::Error;
+            break;
+        }
+    }
+
+    // Writes the source location, then (when the stats record at least one
+    // assertion) the result label and expressions, and finally the messages.
+    void print() const {
+        printSourceInfo();
+        if (stats.totals.assertions.total() > 0) {
+            printResultType();
+            printOriginalExpression();
+            printReconstructedExpression();
+        } else {
+            stream << '\n';
+        }
+        printMessage();
+    }
+
+private:
+    // "<label>:" on its own line, in the colour chosen by the constructor.
+    void printResultType() const {
+        if (!passOrFail.empty()) {
+            Colour colourGuard(colour);
+            stream << passOrFail << ":\n";
+        }
+    }
+    // The expression as written in the macro, indented by two spaces.
+    void printOriginalExpression() const {
+        if (result.hasExpression()) {
+            Colour colourGuard(Colour::OriginalExpression);
+            stream << "  ";
+            stream << result.getExpressionInMacro();
+            stream << '\n';
+        }
+    }
+    // "with expansion:" followed by the expanded expression, indented by two.
+    void printReconstructedExpression() const {
+        if (result.hasExpandedExpression()) {
+            stream << "with expansion:\n";
+            Colour colourGuard(Colour::ReconstructedExpression);
+            stream << Column(result.getExpandedExpression()).indent(2) << '\n';
+        }
+    }
+    // The message label (if any) followed by each message on its own line.
+    void printMessage() const {
+        if (!messageLabel.empty())
+            stream << messageLabel << ':' << '\n';
+        for (auto const& msg : messages) {
+            // If this assertion is a warning ignore any INFO messages
+            if (printInfoMessages || msg.type != ResultWas::Info)
+                stream << Column(msg.message).indent(2) << '\n';
+        }
+    }
+    // "<source info>: " in the file-name colour.
+    void printSourceInfo() const {
+        Colour colourGuard(Colour::FileName);
+        stream << result.getSourceInfo() << ": ";
+    }
+
+    std::ostream& stream;
+    AssertionStats const& stats;
+    AssertionResult const& result;
+    Colour::Code colour;
+    std::string passOrFail;
+    std::string messageLabel;
+    std::string message;
+    std::vector<MessageInfo> messages;
+    bool printInfoMessages;
+};
+
+// Scales `number` out of `total` to a width of CATCH_CONFIG_CONSOLE_WIDTH
+// columns (integer division). A zero total yields 0; any non-zero `number`
+// is given at least one column.
+std::size_t makeRatio(std::size_t number, std::size_t total) {
+    std::size_t scaled = 0;
+    if (total > 0) {
+        scaled = CATCH_CONFIG_CONSOLE_WIDTH * number / total;
+    }
+    if (scaled == 0 && number > 0) {
+        return 1;
+    }
+    return scaled;
+}
+
+// Returns a reference to the largest of the three arguments. `i` is chosen
+// only when strictly greater than both others; otherwise `j` is chosen when
+// strictly greater than `k`; all remaining cases (including ties) yield `k`.
+std::size_t& findMax(std::size_t& i, std::size_t& j, std::size_t& k) {
+    const bool firstIsStrictMax = (i > j) && (i > k);
+    if (firstIsStrictMax) {
+        return i;
+    }
+    return (j > k) ? j : k;
+}
+
+// Describes one table column: header text, total width in characters and
+// how cell contents are aligned within that width.
+struct ColumnInfo {
+    enum Justification { Left, Right };
+    std::string name;
+    int width;
+    Justification justification;
+};
+// Empty tag types streamed into a TablePrinter to end the current cell
+// (ColumnBreak) or the current row (RowBreak).
+struct ColumnBreak {};
+struct RowBreak {};
+
+// A time span held in nanoseconds together with the unit it is displayed
+// in. With Unit::Auto (the default) the constructor picks the largest unit
+// for which the value is at least one.
+class Duration {
+    enum class Unit {
+        Auto,
+        Nanoseconds,
+        Microseconds,
+        Milliseconds,
+        Seconds,
+        Minutes
+    };
+    static const uint64_t s_nanosecondsInAMicrosecond = 1000;
+    static const uint64_t s_nanosecondsInAMillisecond = 1000 * s_nanosecondsInAMicrosecond;
+    static const uint64_t s_nanosecondsInASecond = 1000 * s_nanosecondsInAMillisecond;
+    static const uint64_t s_nanosecondsInAMinute = 60 * s_nanosecondsInASecond;
+
+    double m_inNanoseconds;
+    Unit m_units;
+
+    // Chooses the display unit for a nanosecond count.
+    static Unit chooseUnit(double nanoseconds) {
+        if (nanoseconds < s_nanosecondsInAMicrosecond) {
+            return Unit::Nanoseconds;
+        }
+        if (nanoseconds < s_nanosecondsInAMillisecond) {
+            return Unit::Microseconds;
+        }
+        if (nanoseconds < s_nanosecondsInASecond) {
+            return Unit::Milliseconds;
+        }
+        if (nanoseconds < s_nanosecondsInAMinute) {
+            return Unit::Seconds;
+        }
+        return Unit::Minutes;
+    }
+
+public:
+    explicit Duration(double inNanoseconds, Unit units = Unit::Auto)
+        : m_inNanoseconds(inNanoseconds),
+        m_units(units == Unit::Auto ? chooseUnit(inNanoseconds) : units) {}
+
+    // The duration converted to the display unit.
+    auto value() const -> double {
+        if (m_units == Unit::Microseconds) {
+            return m_inNanoseconds / static_cast<double>(s_nanosecondsInAMicrosecond);
+        }
+        if (m_units == Unit::Milliseconds) {
+            return m_inNanoseconds / static_cast<double>(s_nanosecondsInAMillisecond);
+        }
+        if (m_units == Unit::Seconds) {
+            return m_inNanoseconds / static_cast<double>(s_nanosecondsInASecond);
+        }
+        if (m_units == Unit::Minutes) {
+            return m_inNanoseconds / static_cast<double>(s_nanosecondsInAMinute);
+        }
+        // Nanoseconds (and any other unit) are reported unscaled.
+        return m_inNanoseconds;
+    }
+    // Short suffix for the display unit.
+    auto unitsAsString() const -> std::string {
+        if (m_units == Unit::Nanoseconds) {
+            return "ns";
+        }
+        if (m_units == Unit::Microseconds) {
+            return "us";
+        }
+        if (m_units == Unit::Milliseconds) {
+            return "ms";
+        }
+        if (m_units == Unit::Seconds) {
+            return "s";
+        }
+        if (m_units == Unit::Minutes) {
+            return "m";
+        }
+        return "** internal error **";
+    }
+    // Streams "<value> <unit>".
+    friend auto operator << (std::ostream& os, Duration const& duration) -> std::ostream& {
+        os << duration.value();
+        os << ' ' << duration.unitsAsString();
+        return os;
+    }
+};
+} // end anon namespace
+
+// Writes a fixed-width text table to a stream. Cell content is first
+// collected in m_oss via operator<<; streaming a ColumnBreak pads and emits
+// the collected text as the next cell, and a RowBreak ends the current row.
+// The header row is printed lazily when the first cell is emitted.
+class TablePrinter {
+    std::ostream& m_os;
+    std::vector<ColumnInfo> m_columnInfos;
+    std::ostringstream m_oss;
+    // Index of the last column written in the current row; -1 at row start.
+    int m_currentColumn = -1;
+    bool m_isOpen = false;
+
+public:
+    TablePrinter( std::ostream& os, std::vector<ColumnInfo> columnInfos )
+    :   m_os( os ),
+        m_columnInfos( std::move( columnInfos ) ) {}
+
+    auto columnInfos() const -> std::vector<ColumnInfo> const& {
+        return m_columnInfos;
+    }
+
+    // Prints the column headers and a separator line, once per open table.
+    void open() {
+        if (!m_isOpen) {
+            m_isOpen = true;
+            *this << RowBreak();
+
+			Columns headerCols;
+			Spacer spacer(2);
+			for (auto const& info : m_columnInfos) {
+				headerCols += Column(info.name).width(static_cast<std::size_t>(info.width - 2));
+				headerCols += spacer;
+			}
+			m_os << headerCols << '\n';
+
+            m_os << Catch::getLineOfChars<'-'>() << '\n';
+        }
+    }
+    // Ends the current row and the table; a later cell re-opens it with a
+    // fresh header.
+    void close() {
+        if (m_isOpen) {
+            *this << RowBreak();
+            m_os << std::endl;
+            m_isOpen = false;
+        }
+    }
+
+    // Buffers `value` as (part of) the content of the next cell.
+    template<typename T>
+    friend TablePrinter& operator << (TablePrinter& tp, T const& value) {
+        tp.m_oss << value;
+        return tp;
+    }
+
+    // Emits the buffered text as the next cell, padded to the column width.
+    friend TablePrinter& operator << (TablePrinter& tp, ColumnBreak) {
+        auto colStr = tp.m_oss.str();
+        const auto strSize = colStr.size();
+        tp.m_oss.str("");
+        tp.open();
+        // Wrap to a new row after the last column.
+        if (tp.m_currentColumn == static_cast<int>(tp.m_columnInfos.size() - 1)) {
+            tp.m_currentColumn = -1;
+            tp.m_os << '\n';
+        }
+        tp.m_currentColumn++;
+
+        auto colInfo = tp.m_columnInfos[tp.m_currentColumn];
+        // One character of the width is reserved for the trailing space;
+        // text that does not fit gets no padding and is not truncated.
+        auto padding = (strSize + 1 < static_cast<std::size_t>(colInfo.width))
+            ? std::string(colInfo.width - (strSize + 1), ' ')
+            : std::string();
+        if (colInfo.justification == ColumnInfo::Left)
+            tp.m_os << colStr << padding << ' ';
+        else
+            tp.m_os << padding << colStr << ' ';
+        return tp;
+    }
+
+    // Ends the current row.
+    // NOTE(review): the condition is `> 0`, so a row in which only the first
+    // column (index 0) has been written is not terminated here - confirm
+    // that this is intended.
+    friend TablePrinter& operator << (TablePrinter& tp, RowBreak) {
+        if (tp.m_currentColumn > 0) {
+            tp.m_os << '\n';
+            tp.m_currentColumn = -1;
+        }
+        return tp;
+    }
+};
+
+// Builds the reporter and its benchmark table printer. The column layout is
+// chosen once, here, by an immediately-invoked lambda: a four-column
+// samples/iterations/mean layout when benchmarkNoAnalysis() is set, and the
+// stacked three-line header layout otherwise.
+ConsoleReporter::ConsoleReporter(ReporterConfig const& config)
+    : StreamingReporterBase(config),
+    m_tablePrinter(new TablePrinter(config.stream(),
+        [&config]() -> std::vector<ColumnInfo> {
+        if (config.fullConfig()->benchmarkNoAnalysis())
+        {
+            return{
+                { "benchmark name", CATCH_CONFIG_CONSOLE_WIDTH - 43, ColumnInfo::Left },
+                { "     samples", 14, ColumnInfo::Right },
+                { "  iterations", 14, ColumnInfo::Right },
+                { "        mean", 14, ColumnInfo::Right }
+            };
+        }
+        else
+        {
+            return{
+                { "benchmark name", CATCH_CONFIG_CONSOLE_WIDTH - 43, ColumnInfo::Left },
+                { "samples      mean       std dev", 14, ColumnInfo::Right },
+                { "iterations   low mean   low std dev", 14, ColumnInfo::Right },
+                { "estimated    high mean  high std dev", 14, ColumnInfo::Right }
+            };
+        }
+    }())) {}
+// Defaulted out of line, in the translation unit where TablePrinter is defined.
+ConsoleReporter::~ConsoleReporter() = default;
+
+// Human-readable description of this reporter.
+std::string ConsoleReporter::getDescription() {
+    return "Reports test results as plain lines of text";
+}
+
+// Reports that the given test spec matched no test cases.
+void ConsoleReporter::noMatchingTestCases(std::string const& spec) {
+    stream << "No test cases matched '" << spec << '\'' << std::endl;
+}
+
+// Reports an invalid filter argument.
+void ConsoleReporter::reportInvalidArguments(std::string const&arg){
+    stream << "Invalid Filter: " << arg << std::endl;
+}
+
+// Nothing is printed when an assertion starts.
+void ConsoleReporter::assertionStarting(AssertionInfo const&) {}
+
+// Prints the report for a finished assertion. Returns false when nothing
+// was printed (a successful result that is not being reported and is not a
+// warning), true otherwise.
+bool ConsoleReporter::assertionEnded(AssertionStats const& _assertionStats) {
+    AssertionResult const& result = _assertionStats.assertionResult;
+
+    bool includeResults = m_config->includeSuccessfulResults() || !result.isOk();
+
+    // Drop out if result was successful but we're not printing them.
+    if (!includeResults && result.getResultType() != ResultWas::Warning)
+        return false;
+
+    lazyPrint();
+
+    // includeResults doubles as the printer's "print info messages" flag, so
+    // a warning that gets here with includeResults == false omits them.
+    ConsoleAssertionPrinter printer(stream, _assertionStats, includeResults);
+    printer.print();
+    stream << std::endl;
+    return true;
+}
+
+// Closes any open benchmark table and clears m_headerPrinted so that the
+// header is printed again on the next lazy print.
+void ConsoleReporter::sectionStarting(SectionInfo const& _sectionInfo) {
+    m_tablePrinter->close();
+    m_headerPrinted = false;
+    StreamingReporterBase::sectionStarting(_sectionInfo);
+}
+// Finishes a section: closes any open benchmark table, reports a section
+// (or test case, at the outermost level) that ran no assertions, optionally
+// prints its duration, and clears the header-printed flag.
+void ConsoleReporter::sectionEnded(SectionStats const& _sectionStats) {
+    m_tablePrinter->close();
+
+    if (_sectionStats.missingAssertions) {
+        lazyPrint();
+        Colour colour(Colour::ResultError);
+        // More than one entry on the stack means we are inside a nested section.
+        const char* const scopeText = (m_sectionStack.size() > 1)
+            ? "\nNo assertions in section"
+            : "\nNo assertions in test case";
+        stream << scopeText;
+        stream << " '" << _sectionStats.sectionInfo.name << "'\n" << std::endl;
+    }
+
+    const double seconds = _sectionStats.durationInSeconds;
+    if (shouldShowDuration(*m_config, seconds)) {
+        stream << getFormattedDuration(seconds) << " s: " << _sectionStats.sectionInfo.name << std::endl;
+    }
+
+    m_headerPrinted = false;
+    StreamingReporterBase::sectionEnded(_sectionStats);
+}
+
+#if defined(CATCH_CONFIG_ENABLE_BENCHMARKING)
+// Writes the benchmark name into the first table column, wrapped to that
+// column's width. Every wrapped line after the first is preceded by three
+// column breaks, which moves output on to the first column of the next row.
+void ConsoleReporter::benchmarkPreparing(std::string const& name) {
+	lazyPrintWithoutClosingBenchmarkTable();
+
+	auto nameCol = Column(name).width(static_cast<std::size_t>(m_tablePrinter->columnInfos()[0].width - 2));
+
+	bool firstLine = true;
+	for (auto line : nameCol) {
+		if (!firstLine)
+			(*m_tablePrinter) << ColumnBreak() << ColumnBreak() << ColumnBreak();
+		else
+			firstLine = false;
+
+		(*m_tablePrinter) << line << ColumnBreak();
+	}
+}
+
+// Fills in the samples and iterations cells and, unless analysis is
+// disabled, the estimated duration cell.
+void ConsoleReporter::benchmarkStarting(BenchmarkInfo const& info) {
+    (*m_tablePrinter) << info.samples << ColumnBreak()
+        << info.iterations << ColumnBreak();
+    if (!m_config->benchmarkNoAnalysis())
+        (*m_tablePrinter) << Duration(info.estimatedDuration) << ColumnBreak();
+}
+// Writes the benchmark results. Without analysis only the mean is printed.
+// With analysis, a row of mean / low mean / high mean and a row of the
+// corresponding standard deviation figures are printed; the bare column
+// breaks emit empty cells to skip the name column and pad out the rows.
+void ConsoleReporter::benchmarkEnded(BenchmarkStats<> const& stats) {
+    if (m_config->benchmarkNoAnalysis())
+    {
+        (*m_tablePrinter) << Duration(stats.mean.point.count()) << ColumnBreak();
+    }
+    else
+    {
+        (*m_tablePrinter) << ColumnBreak()
+            << Duration(stats.mean.point.count()) << ColumnBreak()
+            << Duration(stats.mean.lower_bound.count()) << ColumnBreak()
+            << Duration(stats.mean.upper_bound.count()) << ColumnBreak() << ColumnBreak()
+            << Duration(stats.standardDeviation.point.count()) << ColumnBreak()
+            << Duration(stats.standardDeviation.lower_bound.count()) << ColumnBreak()
+            << Duration(stats.standardDeviation.upper_bound.count()) << ColumnBreak() << ColumnBreak() << ColumnBreak() << ColumnBreak() << ColumnBreak();
+    }
+}
+
+// Writes "Benchmark failed (<error>)" into the table in red and ends the row.
+void ConsoleReporter::benchmarkFailed(std::string const& error) {
+	Colour colour(Colour::Red);
+    (*m_tablePrinter)
+        << "Benchmark failed (" << error << ')'
+        << ColumnBreak() << RowBreak();
+}
+#endif // CATCH_CONFIG_ENABLE_BENCHMARKING
+
+// Closes any open benchmark table, forwards to the base class and clears
+// the header-printed flag.
+void ConsoleReporter::testCaseEnded(TestCaseStats const& _testCaseStats) {
+    m_tablePrinter->close();
+    StreamingReporterBase::testCaseEnded(_testCaseStats);
+    m_headerPrinted = false;
+}
+// Prints a per-group summary, but only when currentGroupInfo.used is set
+// (lazyPrintGroupInfo() sets it when it prints a group header).
+void ConsoleReporter::testGroupEnded(TestGroupStats const& _testGroupStats) {
+    if (currentGroupInfo.used) {
+        printSummaryDivider();
+        stream << "Summary for group '" << _testGroupStats.groupInfo.name << "':\n";
+        printTotals(_testGroupStats.totals);
+        stream << '\n' << std::endl;
+    }
+    StreamingReporterBase::testGroupEnded(_testGroupStats);
+}
+// Prints the totals divider and the totals for the whole run, flushes the
+// stream, then forwards the event to the base reporter.
+void ConsoleReporter::testRunEnded(TestRunStats const& _testRunStats)
+{
+    auto const& runTotals = _testRunStats.totals;
+
+    printTotalsDivider(runTotals);
+    printTotals(runTotals);
+    stream << std::endl;
+
+    StreamingReporterBase::testRunEnded(_testRunStats);
+}
+// Forwards the event to the base reporter first, then prints the active
+// test filters (if any).
+void ConsoleReporter::testRunStarting(TestRunInfo const& _testInfo)
+{
+    StreamingReporterBase::testRunStarting(_testInfo);
+
+    printTestFilters();
+}
+
+// Closes the benchmark table and then prints any run, group and test case
+// headers that have not been printed yet.
+void ConsoleReporter::lazyPrint()
+{
+    m_tablePrinter->close();
+    lazyPrintWithoutClosingBenchmarkTable();
+}
+
+// Prints the run info, the group info and the test case / section header,
+// each only if it has not been printed yet. The benchmark table is left
+// untouched.
+void ConsoleReporter::lazyPrintWithoutClosingBenchmarkTable()
+{
+    if (!currentTestRunInfo.used)
+        lazyPrintRunInfo();
+
+    if (!currentGroupInfo.used)
+        lazyPrintGroupInfo();
+
+    if (m_headerPrinted)
+        return;
+
+    printTestCaseAndSectionHeader();
+    m_headerPrinted = true;
+}
+// Prints the run banner: a line of '~', the host application name with the
+// library version, the options hint and (for a non-zero seed) the RNG seed.
+// Marks the run info as used afterwards.
+void ConsoleReporter::lazyPrintRunInfo()
+{
+    stream << '\n' << getLineOfChars<'~'>() << '\n';
+
+    Colour bannerColour(Colour::SecondaryText);
+    stream << currentTestRunInfo->name << " is a Catch v" << libraryVersion() << " host application.\n";
+    stream << "Run with -? for options\n\n";
+
+    auto const seed = m_config->rngSeed();
+    if (seed != 0)
+        stream << "Randomness seeded to: " << seed << "\n\n";
+
+    currentTestRunInfo.used = true;
+}
+// Prints a closed "Group: <name>" header and marks the group info as used.
+// Nothing happens for an unnamed group or when there is at most one group.
+void ConsoleReporter::lazyPrintGroupInfo()
+{
+    bool const isNamed = !currentGroupInfo->name.empty();
+    bool const hasSeveralGroups = currentGroupInfo->groupsCounts > 1;
+    if (!(isNamed && hasSeveralGroups))
+        return;
+
+    printClosedHeader("Group: " + currentGroupInfo->name);
+    currentGroupInfo.used = true;
+}
+// Prints the header for the current test case: its name, the names of all
+// nested sections (indented by 2), and the source location of the innermost
+// section framed by '-' and '.' lines.
+void ConsoleReporter::printTestCaseAndSectionHeader()
+{
+    assert(!m_sectionStack.empty());
+    printOpenHeader(currentTestCaseInfo->name);
+
+    std::size_t const depth = m_sectionStack.size();
+    if (depth > 1) {
+        Colour colourGuard(Colour::Headers);
+
+        // Entry 0 is the test case itself, so start at the first real section
+        for (std::size_t level = 1; level < depth; ++level)
+            printHeaderString(m_sectionStack[level].name, 2);
+    }
+
+    SourceLineInfo const innermostLocation = m_sectionStack.back().lineInfo;
+
+    stream << getLineOfChars<'-'>() << '\n';
+    Colour colourGuard(Colour::FileName);
+    stream << innermostLocation << '\n';
+    stream << getLineOfChars<'.'>() << '\n' << std::endl;
+}
+
+// Prints an open header for _name and closes it with a line of '.'.
+void ConsoleReporter::printClosedHeader(std::string const& _name)
+{
+    printOpenHeader(_name);
+
+    stream << getLineOfChars<'.'>() << '\n';
+}
+// Prints a line of '-' followed by _name as a header string. The Headers
+// colour object only lives for the duration of the header string output.
+void ConsoleReporter::printOpenHeader(std::string const& _name)
+{
+    stream << getLineOfChars<'-'>() << '\n';
+
+    {
+        Colour headerColour(Colour::Headers);
+        printHeaderString(_name);
+    }
+}
+
+// Prints _string as a wrapped column. The first line is indented by
+// `indent`; if the string contains ": ", continuation lines are indented
+// further so that they line up with the text following that ": ".
+void ConsoleReporter::printHeaderString(std::string const& _string, std::size_t indent) {
+    std::size_t const separatorPos = _string.find(": ");
+    std::size_t const hangingIndent =
+        (separatorPos == std::string::npos) ? 0 : separatorPos + 2;
+
+    stream << Column(_string).indent(indent + hangingIndent).initialIndent(indent) << '\n';
+}
+
+// One column of the totals summary: a label, the colour it is printed in
+// and one right-aligned text cell per row.
+struct SummaryColumn {
+
+    SummaryColumn( std::string _label, Colour::Code _colour )
+    :   label( std::move( _label ) ),
+        colour( _colour )
+    {}
+
+    // Appends `count` as a new row. The new cell and all existing cells are
+    // left-padded with spaces so that every row ends up with the same width.
+    // Returns a copy of this column so that calls can be chained.
+    SummaryColumn addRow( std::size_t count ) {
+        ReusableStringStream formatter;
+        formatter << count;
+        std::string newRow = formatter.str();
+
+        for (auto& existing : rows) {
+            if (existing.size() < newRow.size())
+                existing = std::string(newRow.size() - existing.size(), ' ') + existing;
+            else if (existing.size() > newRow.size())
+                newRow = std::string(existing.size() - newRow.size(), ' ') + newRow;
+        }
+
+        rows.push_back(newRow);
+        return *this;
+    }
+
+    std::string label;              // text printed after the value
+    Colour::Code colour;            // colour used when printing this column
+    std::vector<std::string> rows;  // padded cell text, one entry per row
+
+};
+
+// Prints the closing summary for a set of totals: a "No tests ran" warning,
+// a one-line "All tests passed" message, or a two-row breakdown (test cases
+// and assertions) split into passed / failed / failed as expected.
+void ConsoleReporter::printTotals( Totals const& totals ) {
+    if (totals.testCases.total() == 0) {
+        stream << Colour(Colour::Warning) << "No tests ran\n";
+        return;
+    }
+
+    if (totals.assertions.total() > 0 && totals.testCases.allPassed()) {
+        stream << Colour(Colour::ResultSuccess) << "All tests passed";
+        stream << " (" << pluralise(totals.assertions.passed, "assertion") << " in ";
+        stream << pluralise(totals.testCases.passed, "test case") << ')' << '\n';
+        return;
+    }
+
+    // Builds a column whose row 0 is the test case count and whose row 1 is
+    // the assertion count.
+    auto makeColumn = []( char const* label, Colour::Code colour,
+                          std::size_t testCaseCount, std::size_t assertionCount ) {
+        SummaryColumn column(label, colour);
+        column.addRow(testCaseCount);
+        column.addRow(assertionCount);
+        return column;
+    };
+
+    std::vector<SummaryColumn> columns;
+    columns.push_back(makeColumn("", Colour::None,
+                                 totals.testCases.total(), totals.assertions.total()));
+    columns.push_back(makeColumn("passed", Colour::Success,
+                                 totals.testCases.passed, totals.assertions.passed));
+    columns.push_back(makeColumn("failed", Colour::ResultError,
+                                 totals.testCases.failed, totals.assertions.failed));
+    columns.push_back(makeColumn("failed as expected", Colour::ResultExpectedFailure,
+                                 totals.testCases.failedButOk, totals.assertions.failedButOk));
+
+    printSummaryRow("test cases", columns, 0);
+    printSummaryRow("assertions", columns, 1);
+}
+// Prints one row of the totals summary on a single line.
+//   label - text printed in front of the value of the unlabelled column
+//   cols  - the summary columns; each is indexed with `row`, so every
+//           column must hold at least row + 1 rows
+//   row   - index of the row to print from each column
+// The column with an empty label prints "<label>: <value>", or "- none -"
+// in the warning colour when the value is "0". Every other column is
+// skipped when its value is "0" and otherwise printed as
+// " | <value> <column label>" in that column's colour.
+void ConsoleReporter::printSummaryRow(std::string const& label, std::vector<SummaryColumn> const& cols, std::size_t row) {
+    // Iterate by const reference: taking each SummaryColumn by value would
+    // copy its label and its whole vector of row strings on every pass.
+    for (auto const& col : cols) {
+        std::string const& value = col.rows[row];
+        if (col.label.empty()) {
+            stream << label << ": ";
+            if (value != "0")
+                stream << value;
+            else
+                stream << Colour(Colour::Warning) << "- none -";
+        } else if (value != "0") {
+            stream << Colour(Colour::LightGrey) << " | ";
+            stream << Colour(col.colour)
+                << value << ' ' << col.label;
+        }
+    }
+    stream << '\n';
+}
+
+// Prints a divider line of '=' characters that is CATCH_CONFIG_CONSOLE_WIDTH - 1
+// wide. When test cases ran, the line is split into failed / failed-as-expected /
+// passed segments sized by makeRatio(); otherwise it is a single warning-coloured
+// line.
+void ConsoleReporter::printTotalsDivider(Totals const& totals) {
+    auto const& cases = totals.testCases;
+
+    if (cases.total() == 0) {
+        stream << Colour(Colour::Warning) << std::string(CATCH_CONFIG_CONSOLE_WIDTH - 1, '=');
+        stream << '\n';
+        return;
+    }
+
+    std::size_t failedWidth = makeRatio(cases.failed, cases.total());
+    std::size_t expectedFailureWidth = makeRatio(cases.failedButOk, cases.total());
+    std::size_t passedWidth = makeRatio(cases.passed, cases.total());
+
+    // Adjust the segment selected by findMax() one step at a time until the
+    // three widths add up to exactly the divider width.
+    while (failedWidth + expectedFailureWidth + passedWidth < CATCH_CONFIG_CONSOLE_WIDTH - 1)
+        ++findMax(failedWidth, expectedFailureWidth, passedWidth);
+    while (failedWidth + expectedFailureWidth + passedWidth > CATCH_CONFIG_CONSOLE_WIDTH - 1)
+        --findMax(failedWidth, expectedFailureWidth, passedWidth);
+
+    stream << Colour(Colour::Error) << std::string(failedWidth, '=');
+    stream << Colour(Colour::ResultExpectedFailure) << std::string(expectedFailureWidth, '=');
+
+    Colour::Code const passedColour =
+        cases.allPassed() ? Colour::ResultSuccess : Colour::Success;
+    stream << Colour(passedColour) << std::string(passedWidth, '=');
+
+    stream << '\n';
+}
+// Prints a full line of '-' characters followed by a newline.
+void ConsoleReporter::printSummaryDivider()
+{
+    stream << getLineOfChars<'-'>() << '\n';
+}
+
+// Prints a "Filters: ..." line in bright yellow when the test spec has
+// filters; prints nothing otherwise.
+void ConsoleReporter::printTestFilters()
+{
+    if (!m_config->testSpec().hasFilters())
+        return;
+
+    Colour filterColour(Colour::BrightYellow);
+    stream << "Filters: " << serializeFilters(m_config->getTestsOrTags()) << '\n';
+}
+
+CATCH_REGISTER_REPORTER("console", ConsoleReporter)
+
+} // end namespace Catch
+
+#if defined(_MSC_VER)
+#pragma warning(pop)
+#endif
+
+#if defined(__clang__)
+#  pragma clang diagnostic pop
+#endif
+// end catch_reporter_console.cpp
+// start catch_reporter_junit.cpp
+
+#include <cassert>
+#include <sstream>
+#include <ctime>
+#include <algorithm>
+
+namespace Catch {
+
+    namespace {
+        // Returns the current UTC time formatted like "2017-01-16T17:06:45Z".
+        std::string getCurrentTimestamp() {
+            // Beware, this is not reentrant because of backward compatibility issues
+            // Also, UTC only, again because of backward compatibility (%z is C++11)
+            std::time_t now;
+            std::time(&now);
+
+#ifdef _MSC_VER
+            std::tm utcStorage = {};
+            gmtime_s(&utcStorage, &now);
+            std::tm const* utc = &utcStorage;
+#else
+            std::tm const* utc = std::gmtime(&now);
+#endif
+
+            // The sample literal fixes the buffer size, including the terminator
+            char buffer[sizeof("2017-01-16T17:06:45Z")];
+            std::strftime(buffer, sizeof(buffer), "%Y-%m-%dT%H:%M:%SZ", utc);
+
+            return std::string(buffer);
+        }
+
+        // Returns the text of the first tag that starts with '#', without
+        // the leading '#'. Returns an empty string when no such tag exists.
+        //   tags - the tags to search, in order
+        std::string fileNameTag(const std::vector<std::string> &tags) {
+            auto it = std::find_if(begin(tags),
+                                   end(tags),
+                                   [] (std::string const& tag) {
+                                       // front() on an empty string is undefined
+                                       // behaviour, so empty tags are skipped.
+                                       return !tag.empty() && tag.front() == '#';
+                                   });
+            if (it != tags.end())
+                return it->substr(1);
+            return std::string();
+        }
+    } // anonymous namespace
+
+    // Builds the reporter on top of CumulativeReporterBase, writing XML to
+    // the configured stream, and turns on stdout redirection and reporting
+    // of all assertions in the reporter preferences.
+    JunitReporter::JunitReporter( ReporterConfig const& _config )
+        :   CumulativeReporterBase( _config ),
+            xml( _config.stream() )
+    {
+        m_reporterPrefs.shouldReportAllAssertions = true;
+        m_reporterPrefs.shouldRedirectStdOut = true;
+    }
+
+    // Out-of-line destructor; nothing to clean up beyond the members.
+    JunitReporter::~JunitReporter() = default;
+
+    // Returns the one-line description of this reporter.
+    std::string JunitReporter::getDescription()
+    {
+        return std::string( "Reports test results in an XML format that looks like Ant's junitreport target" );
+    }
+
+    // Intentionally does nothing: unmatched test specs are not reported.
+    void JunitReporter::noMatchingTestCases( std::string const& /*spec*/ )
+    {
+    }
+
+    // Forwards the event to the base reporter, then opens the root
+    // "testsuites" element.
+    void JunitReporter::testRunStarting( TestRunInfo const& runInfo )
+    {
+        CumulativeReporterBase::testRunStarting( runInfo );
+
+        xml.startElement( "testsuites" );
+    }
+
+    // Resets the per-suite state (timer, captured stdout / stderr and the
+    // unexpected exception counter) before forwarding to the base reporter.
+    void JunitReporter::testGroupStarting( GroupInfo const& groupInfo )
+    {
+        suiteTimer.start();
+
+        stdOutForSuite.clear();
+        stdErrForSuite.clear();
+        unexpectedExceptions = 0;
+
+        CumulativeReporterBase::testGroupStarting( groupInfo );
+    }
+
+    // Remembers whether the starting test case is allowed to fail.
+    void JunitReporter::testCaseStarting( TestCaseInfo const& testCaseInfo )
+    {
+        m_okToFail = testCaseInfo.okToFail();
+    }
+
+    // Counts an exception as unexpected when the current test case is not
+    // allowed to fail, then forwards to the base reporter and returns its
+    // result.
+    bool JunitReporter::assertionEnded( AssertionStats const& assertionStats ) {
+        bool const threwException =
+            assertionStats.assertionResult.getResultType() == ResultWas::ThrewException;
+
+        if( threwException && !m_okToFail )
+            ++unexpectedExceptions;
+
+        return CumulativeReporterBase::assertionEnded( assertionStats );
+    }
+
+    // Appends the test case's captured stdout / stderr to the suite-wide
+    // buffers, then forwards the event to the base reporter.
+    void JunitReporter::testCaseEnded( TestCaseStats const& testCaseStats )
+    {
+        stdOutForSuite.append( testCaseStats.stdOut );
+        stdErrForSuite.append( testCaseStats.stdErr );
+
+        CumulativeReporterBase::testCaseEnded( testCaseStats );
+    }
+
+    // Samples the suite timer first, lets the base reporter record the
+    // group, then writes the most recently recorded group as XML.
+    void JunitReporter::testGroupEnded( TestGroupStats const& testGroupStats )
+    {
+        double const elapsedSeconds = suiteTimer.getElapsedSeconds();
+
+        CumulativeReporterBase::testGroupEnded( testGroupStats );
+
+        writeGroup( *m_testGroups.back(), elapsedSeconds );
+    }
+
+    // Ends the currently open XML element (testRunStarting opens
+    // "testsuites").
+    void JunitReporter::testRunEndedCumulative()
+    {
+        xml.endElement();
+    }
+
+    // Writes one "testsuite" element for a group: its summary attributes,
+    // an optional "properties" element (filters and RNG seed), all of its
+    // test cases and finally the captured stdout / stderr of the suite.
+    //   groupNode - the recorded group to write
+    //   suiteTime - value written as the "time" attribute unless durations
+    //               are configured to never be shown
+    void JunitReporter::writeGroup( TestGroupNode const& groupNode, double suiteTime ) {
+        XmlWriter::ScopedElement suiteElement = xml.scopedElement( "testsuite" );
+
+        TestGroupStats const& groupStats = groupNode.value;
+        auto const& assertionCounts = groupStats.totals.assertions;
+
+        xml.writeAttribute( "name", groupStats.groupInfo.name );
+        xml.writeAttribute( "errors", unexpectedExceptions );
+        xml.writeAttribute( "failures", assertionCounts.failed-unexpectedExceptions );
+        xml.writeAttribute( "tests", assertionCounts.total() );
+        xml.writeAttribute( "hostname", "tbd" ); // !TBD
+        if( m_config->showDurations() != ShowDurations::Never )
+            xml.writeAttribute( "time", suiteTime );
+        else
+            xml.writeAttribute( "time", "" );
+        xml.writeAttribute( "timestamp", getCurrentTimestamp() );
+
+        // The properties element is only emitted when it would not be empty
+        bool const hasFilters = m_config->hasTestFilters();
+        auto const seed = m_config->rngSeed();
+        if( hasFilters || seed != 0 ) {
+            auto propertiesElement = xml.scopedElement( "properties" );
+            if( hasFilters ) {
+                xml.scopedElement( "property" )
+                    .writeAttribute( "name", "filters" )
+                    .writeAttribute( "value", serializeFilters( m_config->getTestsOrTags() ) );
+            }
+            if( seed != 0 ) {
+                xml.scopedElement( "property" )
+                    .writeAttribute( "name", "random-seed" )
+                    .writeAttribute( "value", seed );
+            }
+        }
+
+        for( auto const& testCaseNode : groupNode.children )
+            writeTestCase( *testCaseNode );
+
+        xml.scopedElement( "system-out" ).writeText( trim( stdOutForSuite ), XmlFormatting::Newline );
+        xml.scopedElement( "system-err" ).writeText( trim( stdErrForSuite ), XmlFormatting::Newline );
+    }
+
+    // Works out the class name for a test case and writes its root section.
+    // The class name is the test's own class name, else its '#' file name
+    // tag, else "global"; a non-empty config name is prepended with a '.'.
+    void JunitReporter::writeTestCase( TestCaseNode const& testCaseNode ) {
+        // All test cases have exactly one section - which represents the
+        // test case itself. That section may have 0-n nested sections
+        assert( testCaseNode.children.size() == 1 );
+        SectionNode const& rootSection = *testCaseNode.children.front();
+
+        TestCaseStats const& caseStats = testCaseNode.value;
+        std::string className = caseStats.testInfo.className;
+
+        if( className.empty() )
+            className = fileNameTag( caseStats.testInfo.tags );
+        if( className.empty() )
+            className = "global";
+
+        if( !m_config->name().empty() )
+            className = m_config->name() + "." + className;
+
+        writeSection( className, "", rootSection );
+    }
+
+    // Writes a section as a "testcase" element and then recurses into its
+    // child sections. The element is only written when the section has
+    // assertions or captured output.
+    //   className   - value for the "classname" attribute; when empty, the
+    //                 section name is used instead and "name" becomes "root"
+    //   rootName    - name of the parent section, prefixed with a '/' when
+    //                 not empty
+    //   sectionNode - the recorded section to write
+    void JunitReporter::writeSection(  std::string const& className,
+                        std::string const& rootName,
+                        SectionNode const& sectionNode ) {
+        std::string name = trim( sectionNode.stats.sectionInfo.name );
+        if( !rootName.empty() )
+            name = rootName + '/' + name;
+
+        bool const hasClassName = !className.empty();
+        bool const hasContent = !sectionNode.assertions.empty()
+                             || !sectionNode.stdOut.empty()
+                             || !sectionNode.stdErr.empty();
+
+        if( hasContent ) {
+            XmlWriter::ScopedElement testCaseElement = xml.scopedElement( "testcase" );
+            if( hasClassName ) {
+                xml.writeAttribute( "classname", className );
+                xml.writeAttribute( "name", name );
+            }
+            else {
+                xml.writeAttribute( "classname", name );
+                xml.writeAttribute( "name", "root" );
+            }
+            xml.writeAttribute( "time", ::Catch::Detail::stringify( sectionNode.stats.durationInSeconds ) );
+            // This is not ideal, but it should be enough to mimic gtest's
+            // junit output.
+            // Ideally the JUnit reporter would also handle `skipTest`
+            // events and write those out appropriately.
+            xml.writeAttribute( "status", "run" );
+
+            writeAssertions( sectionNode );
+
+            if( !sectionNode.stdOut.empty() )
+                xml.scopedElement( "system-out" ).writeText( trim( sectionNode.stdOut ), XmlFormatting::Newline );
+            if( !sectionNode.stdErr.empty() )
+                xml.scopedElement( "system-err" ).writeText( trim( sectionNode.stdErr ), XmlFormatting::Newline );
+        }
+
+        for( auto const& child : sectionNode.childSections ) {
+            if( hasClassName )
+                writeSection( className, name, *child );
+            else
+                writeSection( name, "", *child );
+        }
+    }
+
+    // Writes every assertion recorded for the section, in order.
+    void JunitReporter::writeAssertions( SectionNode const& sectionNode )
+    {
+        for( auto const& recorded : sectionNode.assertions ) {
+            writeAssertion( recorded );
+        }
+    }
+
+    void JunitReporter::writeAssertion( AssertionStats const& stats ) {
+        AssertionResult const& result = stats.assertionResult;
+        if( !result.isOk() ) {
+            std::string elementName;
+            switch( result.getResultType() ) {
+                case ResultWas::ThrewException:
+                case ResultWas::FatalErrorCondition:
+                    elementName = "error";
+                    break;
+                case ResultWas::ExplicitFailure:
+                case ResultWas::ExpressionFailed:
+                case ResultWas::DidntThrowException:
+                    elementName = "failure";
+                    break;
+
+                // We should never see these here:
+                case ResultWas::Info:
+                case ResultWas::Warning:
+                case ResultWas::Ok:
+                case ResultWas::Unknown:
+                case ResultWas::FailureBit:
+                case ResultWas::Exception:
+                    elementName = "internalError";
+                    break;
+            }
+
+            XmlWriter::ScopedElement e = xml.scopedElement( elementName );
+
+            xml.writeAttribute( "message", result.getExpression() );
+            xml.writeAttribute( "type", result.getTestMacroName() );
+
+            ReusableStringStream rss;
+            if (stats.totals.assertions.total() > 0) {
+                rss << "FAILED" << ":\n";
+                if (result.hasExpression()) {
+                    rss << "  ";
+                    rss << result.getExpressionInMacro();
+                    rss << '\n';
+                }
+                if (result.hasExpandedExpression()) {
+                    rss << "with expansion:\n";
+                    rss << Column(result.getExpandedExpression()).indent(2) << '\n';
+                }
+            } else {
+                rss << '\n';
+            }
+
+            if( !result.getMessage().empty() )
+                rss << result.getMessage() << '\n';
+            for( auto const& msg : stats.infoMessages )
+                if( msg.type == ResultWas::Info )
+                    rss << msg.message << '\n';
+
+            rss << "at " << result.getSourceInfo();
+            xml.writeText( rss.str(), XmlFormatting::Newline );
+        }
+    }
+
+    CATCH_REGISTER_REPORTER( "junit", JunitReporter )
+
+} // end namespace Catch
+// end catch_reporter_junit.cpp
+// start catch_reporter_listening.cpp
+
+#include <cassert>
+
+namespace Catch {
+
+    // ListeningReporter fans every reporter event out to all registered
+    // listeners (in registration order) and then to the one wrapped reporter.
+
+    ListeningReporter::ListeningReporter() {
+        // We will assume that listeners will always want all assertions
+        m_preferences.shouldReportAllAssertions = true;
+    }
+
+    // Takes ownership of a listener and appends it to the listener list.
+    void ListeningReporter::addListener( IStreamingReporterPtr&& listener ) {
+        m_listeners.push_back( std::move( listener ) );
+    }
+
+    // Takes ownership of the wrapped reporter and adopts its stdout
+    // redirection preference. Only one reporter may be added.
+    void ListeningReporter::addReporter(IStreamingReporterPtr&& reporter) {
+        assert(!m_reporter && "Listening reporter can wrap only 1 real reporter");
+        m_reporter = std::move( reporter );
+        m_preferences.shouldRedirectStdOut = m_reporter->getPreferences().shouldRedirectStdOut;
+    }
+
+    // Returns the preferences held by this reporter.
+    ReporterPreferences ListeningReporter::getPreferences() const {
+        return m_preferences;
+    }
+
+    // Returns an empty set of verbosities.
+    std::set<Verbosity> ListeningReporter::getSupportedVerbosities() {
+        return {};
+    }
+
+    void ListeningReporter::noMatchingTestCases( std::string const& spec ) {
+        for ( auto const& each : m_listeners )
+            each->noMatchingTestCases( spec );
+        m_reporter->noMatchingTestCases( spec );
+    }
+
+    void ListeningReporter::reportInvalidArguments(std::string const&arg){
+        for ( auto const& each : m_listeners )
+            each->reportInvalidArguments( arg );
+        m_reporter->reportInvalidArguments( arg );
+    }
+
+#if defined(CATCH_CONFIG_ENABLE_BENCHMARKING)
+    void ListeningReporter::benchmarkPreparing( std::string const& name ) {
+        for ( auto const& each : m_listeners )
+            each->benchmarkPreparing( name );
+        m_reporter->benchmarkPreparing( name );
+    }
+
+    void ListeningReporter::benchmarkStarting( BenchmarkInfo const& benchmarkInfo ) {
+        for ( auto const& each : m_listeners )
+            each->benchmarkStarting( benchmarkInfo );
+        m_reporter->benchmarkStarting( benchmarkInfo );
+    }
+
+    void ListeningReporter::benchmarkEnded( BenchmarkStats<> const& benchmarkStats ) {
+        for ( auto const& each : m_listeners )
+            each->benchmarkEnded( benchmarkStats );
+        m_reporter->benchmarkEnded( benchmarkStats );
+    }
+
+    void ListeningReporter::benchmarkFailed( std::string const& error ) {
+        for ( auto const& each : m_listeners )
+            each->benchmarkFailed( error );
+        m_reporter->benchmarkFailed( error );
+    }
+#endif // CATCH_CONFIG_ENABLE_BENCHMARKING
+
+    void ListeningReporter::testRunStarting( TestRunInfo const& testRunInfo ) {
+        for ( auto const& each : m_listeners )
+            each->testRunStarting( testRunInfo );
+        m_reporter->testRunStarting( testRunInfo );
+    }
+
+    void ListeningReporter::testGroupStarting( GroupInfo const& groupInfo ) {
+        for ( auto const& each : m_listeners )
+            each->testGroupStarting( groupInfo );
+        m_reporter->testGroupStarting( groupInfo );
+    }
+
+    void ListeningReporter::testCaseStarting( TestCaseInfo const& testInfo ) {
+        for ( auto const& each : m_listeners )
+            each->testCaseStarting( testInfo );
+        m_reporter->testCaseStarting( testInfo );
+    }
+
+    void ListeningReporter::sectionStarting( SectionInfo const& sectionInfo ) {
+        for ( auto const& each : m_listeners )
+            each->sectionStarting( sectionInfo );
+        m_reporter->sectionStarting( sectionInfo );
+    }
+
+    void ListeningReporter::assertionStarting( AssertionInfo const& assertionInfo ) {
+        for ( auto const& each : m_listeners )
+            each->assertionStarting( assertionInfo );
+        m_reporter->assertionStarting( assertionInfo );
+    }
+
+    // The return value indicates if the messages buffer should be cleared:
+    // only the wrapped reporter's answer is used, the listeners' answers are
+    // discarded.
+    bool ListeningReporter::assertionEnded( AssertionStats const& assertionStats ) {
+        for( auto const& each : m_listeners )
+            static_cast<void>( each->assertionEnded( assertionStats ) );
+        return m_reporter->assertionEnded( assertionStats );
+    }
+
+    void ListeningReporter::sectionEnded( SectionStats const& sectionStats ) {
+        for ( auto const& each : m_listeners )
+            each->sectionEnded( sectionStats );
+        m_reporter->sectionEnded( sectionStats );
+    }
+
+    void ListeningReporter::testCaseEnded( TestCaseStats const& testCaseStats ) {
+        for ( auto const& each : m_listeners )
+            each->testCaseEnded( testCaseStats );
+        m_reporter->testCaseEnded( testCaseStats );
+    }
+
+    void ListeningReporter::testGroupEnded( TestGroupStats const& testGroupStats ) {
+        for ( auto const& each : m_listeners )
+            each->testGroupEnded( testGroupStats );
+        m_reporter->testGroupEnded( testGroupStats );
+    }
+
+    void ListeningReporter::testRunEnded( TestRunStats const& testRunStats ) {
+        for ( auto const& each : m_listeners )
+            each->testRunEnded( testRunStats );
+        m_reporter->testRunEnded( testRunStats );
+    }
+
+    void ListeningReporter::skipTest( TestCaseInfo const& testInfo ) {
+        for ( auto const& each : m_listeners )
+            each->skipTest( testInfo );
+        m_reporter->skipTest( testInfo );
+    }
+
+    // Always reports itself as a multi reporter.
+    bool ListeningReporter::isMulti() const {
+        return true;
+    }
+
+} // end namespace Catch
+// end catch_reporter_listening.cpp
+// start catch_reporter_xml.cpp
+
+#if defined(_MSC_VER)
+#pragma warning(push)
+#pragma warning(disable:4061) // Not all labels are EXPLICITLY handled in switch
+                              // Note that 4062 (not all labels are handled
+                              // and default is missing) is enabled
+#endif
+
+namespace Catch {
+    // Builds the reporter on top of StreamingReporterBase, writing XML to
+    // the configured stream, and turns on stdout redirection and reporting
+    // of all assertions in the reporter preferences.
+    XmlReporter::XmlReporter( ReporterConfig const& _config )
+    :   StreamingReporterBase( _config ),
+        m_xml( _config.stream() )
+    {
+        m_reporterPrefs.shouldReportAllAssertions = true;
+        m_reporterPrefs.shouldRedirectStdOut = true;
+    }
+
+    // Out-of-line defaulted destructor.
+    XmlReporter::~XmlReporter()
+        = default;
+
+    // Returns the one-line description of this reporter.
+    std::string XmlReporter::getDescription()
+    {
+        return std::string( "Reports test results as an XML document" );
+    }
+
+    // Returns the stylesheet reference to emit; this implementation returns
+    // an empty string, in which case testRunStarting() writes none.
+    std::string XmlReporter::getStylesheetRef() const
+    {
+        return {};
+    }
+
+    // Writes the "filename" and "line" attributes for a source location
+    // through the XML writer.
+    void XmlReporter::writeSourceInfo( SourceLineInfo const& sourceInfo )
+    {
+        m_xml.writeAttribute( "filename", sourceInfo.file );
+        m_xml.writeAttribute( "line", sourceInfo.line );
+    }
+
+    // Delegates to the base reporter without adding any XML output.
+    void XmlReporter::noMatchingTestCases( std::string const& s )
+    {
+        StreamingReporterBase::noMatchingTestCases( s );
+    }
+
+    // Opens the root "Catch" element, preceded by a stylesheet reference
+    // when one is provided. The run name and the filters are written as
+    // attributes when present; a non-zero RNG seed is written as a
+    // "Randomness" child element.
+    void XmlReporter::testRunStarting( TestRunInfo const& testInfo ) {
+        StreamingReporterBase::testRunStarting( testInfo );
+
+        std::string const stylesheet = getStylesheetRef();
+        if( !stylesheet.empty() )
+            m_xml.writeStylesheetRef( stylesheet );
+
+        m_xml.startElement( "Catch" );
+
+        auto const runName = m_config->name();
+        if( !runName.empty() )
+            m_xml.writeAttribute( "name", runName );
+
+        if( m_config->testSpec().hasFilters() )
+            m_xml.writeAttribute( "filters", serializeFilters( m_config->getTestsOrTags() ) );
+
+        auto const seed = m_config->rngSeed();
+        if( seed != 0 )
+            m_xml.scopedElement( "Randomness" ).writeAttribute( "seed", seed );
+    }
+
+    // Forwards the event to the base reporter, then opens a "Group" element
+    // carrying the group's name.
+    void XmlReporter::testGroupStarting( GroupInfo const& groupInfo )
+    {
+        StreamingReporterBase::testGroupStarting( groupInfo );
+
+        m_xml.startElement( "Group" );
+        m_xml.writeAttribute( "name", groupInfo.name );
+    }
+
+    // Opens a "TestCase" element with the name, description, tags and source
+    // location of the test case. The test case timer is started when
+    // durations are always shown.
+    void XmlReporter::testCaseStarting( TestCaseInfo const& testInfo ) {
+        StreamingReporterBase::testCaseStarting( testInfo );
+
+        m_xml.startElement( "TestCase" );
+        m_xml.writeAttribute( "name", trim( testInfo.name ) );
+        m_xml.writeAttribute( "description", testInfo.description );
+        m_xml.writeAttribute( "tags", testInfo.tagsAsString() );
+        writeSourceInfo( testInfo.lineInfo );
+
+        bool const timeTestCases = m_config->showDurations() == ShowDurations::Always;
+        if( timeTestCases )
+            m_testCaseTimer.start();
+
+        m_xml.ensureTagClosed();
+    }
+
+    // Increments the section depth and, unless the depth was zero before
+    // the call, opens a "Section" element with the section's name and
+    // source location.
+    void XmlReporter::sectionStarting( SectionInfo const& sectionInfo ) {
+        StreamingReporterBase::sectionStarting( sectionInfo );
+
+        bool const alreadyInsideSection = m_sectionDepth > 0;
+        ++m_sectionDepth;
+        if( !alreadyInsideSection )
+            return;
+
+        m_xml.startElement( "Section" );
+        m_xml.writeAttribute( "name", trim( sectionInfo.name ) );
+        writeSourceInfo( sectionInfo.lineInfo );
+        m_xml.ensureTagClosed();
+    }
+
+    // Intentionally does nothing: no XML is written when an assertion starts.
+    void XmlReporter::assertionStarting( AssertionInfo const& )
+    {
+    }
+
+    // Writes the XML for a finished assertion: the attached Info / Warning
+    // messages, an "Expression" element when the result has an expression,
+    // and a result-specific element for exceptions, fatal errors, info and
+    // explicit failures. Results that are ok are only written when the
+    // config includes successful results; warnings are always written.
+    // Always returns true.
+    bool XmlReporter::assertionEnded( AssertionStats const& assertionStats ) {
+
+        AssertionResult const& result = assertionStats.assertionResult;
+        auto const resultType = result.getResultType();
+
+        bool const includeResults = m_config->includeSuccessfulResults() || !result.isOk();
+        bool const isWarning = resultType == ResultWas::Warning;
+
+        // Drop out if result was successful but we're not printing them.
+        if( !includeResults && !isWarning )
+            return true;
+
+        // Print any info messages in <Info> tags.
+        for( auto const& msg : assertionStats.infoMessages ) {
+            if( msg.type == ResultWas::Info && includeResults )
+                m_xml.scopedElement( "Info" ).writeText( msg.message );
+            else if( msg.type == ResultWas::Warning )
+                m_xml.scopedElement( "Warning" ).writeText( msg.message );
+        }
+
+        // Print the expression if there is one.
+        bool const hasExpression = result.hasExpression();
+        if( hasExpression ) {
+            m_xml.startElement( "Expression" );
+            m_xml.writeAttribute( "success", result.succeeded() );
+            m_xml.writeAttribute( "type", result.getTestMacroName() );
+
+            writeSourceInfo( result.getSourceInfo() );
+
+            m_xml.scopedElement( "Original" ).writeText( result.getExpression() );
+            m_xml.scopedElement( "Expanded" ).writeText( result.getExpandedExpression() );
+        }
+
+        // Writes <elementName filename=... line=...>message</elementName>
+        auto writeMessageElement = [&]( char const* elementName ) {
+            m_xml.startElement( elementName );
+            writeSourceInfo( result.getSourceInfo() );
+            m_xml.writeText( result.getMessage() );
+            m_xml.endElement();
+        };
+
+        // And... Print a result applicable to each result type.
+        switch( resultType ) {
+            case ResultWas::ThrewException:
+                writeMessageElement( "Exception" );
+                break;
+            case ResultWas::FatalErrorCondition:
+                writeMessageElement( "FatalErrorCondition" );
+                break;
+            case ResultWas::Info:
+                m_xml.scopedElement( "Info" ).writeText( result.getMessage() );
+                break;
+            case ResultWas::Warning:
+                // Warning will already have been written
+                break;
+            case ResultWas::ExplicitFailure:
+                writeMessageElement( "Failure" );
+                break;
+            default:
+                break;
+        }
+
+        // Close the Expression element opened above
+        if( hasExpression )
+            m_xml.endElement();
+
+        return true;
+    }
+
+    // Decrements the section depth and, while the depth is still above zero
+    // afterwards, writes an "OverallResults" element holding the section's
+    // assertion counts (plus its duration when durations are always shown).
+    void XmlReporter::sectionEnded( SectionStats const& sectionStats ) {
+        StreamingReporterBase::sectionEnded( sectionStats );
+        if( --m_sectionDepth > 0 ) {
+            XmlWriter::ScopedElement e = m_xml.scopedElement( "OverallResults" );
+            e.writeAttribute( "successes", sectionStats.assertions.passed );
+            e.writeAttribute( "failures", sectionStats.assertions.failed );
+            e.writeAttribute( "expectedFailures", sectionStats.assertions.failedButOk );
+
+            if ( m_config->showDurations() == ShowDurations::Always )
+                e.writeAttribute( "durationInSeconds", sectionStats.durationInSeconds );
+
+            // NOTE(review): this explicit endElement() runs while `e` is still
+            // alive. Presumably `e` ends one more element when it goes out of
+            // scope, so the pair would close both "OverallResults" and the
+            // enclosing "Section" - confirm against XmlWriter::ScopedElement
+            // before reordering anything here.
+            m_xml.endElement();
+        }
+    }
+
+    // Writes the "OverallResult" element for a finished test case: the
+    // success flag, the elapsed time when durations are always shown, and
+    // the captured stdout / stderr when they are not empty.
+    void XmlReporter::testCaseEnded( TestCaseStats const& testCaseStats ) {
+        StreamingReporterBase::testCaseEnded( testCaseStats );
+        XmlWriter::ScopedElement e = m_xml.scopedElement( "OverallResult" );
+        e.writeAttribute( "success", testCaseStats.totals.assertions.allOk() );
+
+        if ( m_config->showDurations() == ShowDurations::Always )
+            e.writeAttribute( "durationInSeconds", m_testCaseTimer.getElapsedSeconds() );
+
+        // "StdOut" / "StdErr" are written while "OverallResult" is still the
+        // open element.
+        if( !testCaseStats.stdOut.empty() )
+            m_xml.scopedElement( "StdOut" ).writeText( trim( testCaseStats.stdOut ), XmlFormatting::Newline );
+        if( !testCaseStats.stdErr.empty() )
+            m_xml.scopedElement( "StdErr" ).writeText( trim( testCaseStats.stdErr ), XmlFormatting::Newline );
+
+        // NOTE(review): this explicit endElement() runs while `e` is still
+        // alive. Presumably `e` ends one more element when it goes out of
+        // scope, so the pair would close both "OverallResult" and the
+        // enclosing "TestCase" - confirm against XmlWriter::ScopedElement
+        // before reordering anything here.
+        m_xml.endElement();
+    }
+
+    void XmlReporter::testGroupEnded( TestGroupStats const& testGroupStats ) { // Group-end hook: writes assertion and test-case totals for the group.
+        StreamingReporterBase::testGroupEnded( testGroupStats ); // forward to the base-class handler first
+        // TODO: Check testGroupStats.aborting and act accordingly.
+        m_xml.scopedElement( "OverallResults" ) // assertion-level totals; the scoped temporary lives only for this statement
+            .writeAttribute( "successes", testGroupStats.totals.assertions.passed )
+            .writeAttribute( "failures", testGroupStats.totals.assertions.failed )
+            .writeAttribute( "expectedFailures", testGroupStats.totals.assertions.failedButOk );
+        m_xml.scopedElement( "OverallResultsCases") // test-case-level totals, same attribute names
+            .writeAttribute( "successes", testGroupStats.totals.testCases.passed )
+            .writeAttribute( "failures", testGroupStats.totals.testCases.failed )
+            .writeAttribute( "expectedFailures", testGroupStats.totals.testCases.failedButOk );
+        m_xml.endElement(); // ends whichever element is still open on m_xml (presumably the group element opened elsewhere - confirm)
+    }
+
+    void XmlReporter::testRunEnded( TestRunStats const& testRunStats ) { // Run-end hook: writes assertion and test-case totals for the whole run.
+        StreamingReporterBase::testRunEnded( testRunStats ); // forward to the base-class handler first
+        m_xml.scopedElement( "OverallResults" ) // assertion-level totals; the scoped temporary lives only for this statement
+            .writeAttribute( "successes", testRunStats.totals.assertions.passed )
+            .writeAttribute( "failures", testRunStats.totals.assertions.failed )
+            .writeAttribute( "expectedFailures", testRunStats.totals.assertions.failedButOk );
+        m_xml.scopedElement( "OverallResultsCases") // test-case-level totals, same attribute names
+            .writeAttribute( "successes", testRunStats.totals.testCases.passed )
+            .writeAttribute( "failures", testRunStats.totals.testCases.failed )
+            .writeAttribute( "expectedFailures", testRunStats.totals.testCases.failedButOk );
+        m_xml.endElement(); // ends whichever element is still open on m_xml (presumably the run's root element opened elsewhere - confirm)
+    }
+
+#if defined(CATCH_CONFIG_ENABLE_BENCHMARKING)
+    void XmlReporter::benchmarkPreparing(std::string const& name) { // Opens <BenchmarkResults name="..."> for the benchmark.
+        m_xml.startElement("BenchmarkResults"); // left open here; benchmarkEnded/benchmarkFailed issue the matching endElement()
+        m_xml.writeAttribute("name", name);
+    }
+
+    void XmlReporter::benchmarkStarting(BenchmarkInfo const &info) { // Adds the run parameters as attributes of the currently open element.
+        m_xml.writeAttribute("samples", info.samples);
+        m_xml.writeAttribute("resamples", info.resamples);
+        m_xml.writeAttribute("iterations", info.iterations);
+        m_xml.writeAttribute("clockResolution", info.clockResolution);
+        m_xml.writeAttribute("estimatedDuration", info.estimatedDuration);
+        m_xml.writeComment("All values in nano seconds");
+    }
+
+    void XmlReporter::benchmarkEnded(BenchmarkStats<> const& benchmarkStats) { // Writes <mean>, <standardDeviation>, <outliers>, then one more endElement().
+        m_xml.startElement("mean");
+        m_xml.writeAttribute("value", benchmarkStats.mean.point.count());
+        m_xml.writeAttribute("lowerBound", benchmarkStats.mean.lower_bound.count());
+        m_xml.writeAttribute("upperBound", benchmarkStats.mean.upper_bound.count());
+        m_xml.writeAttribute("ci", benchmarkStats.mean.confidence_interval);
+        m_xml.endElement();
+        m_xml.startElement("standardDeviation");
+        m_xml.writeAttribute("value", benchmarkStats.standardDeviation.point.count());
+        m_xml.writeAttribute("lowerBound", benchmarkStats.standardDeviation.lower_bound.count());
+        m_xml.writeAttribute("upperBound", benchmarkStats.standardDeviation.upper_bound.count());
+        m_xml.writeAttribute("ci", benchmarkStats.standardDeviation.confidence_interval);
+        m_xml.endElement();
+        m_xml.startElement("outliers");
+        m_xml.writeAttribute("variance", benchmarkStats.outlierVariance);
+        m_xml.writeAttribute("lowMild", benchmarkStats.outliers.low_mild);
+        m_xml.writeAttribute("lowSevere", benchmarkStats.outliers.low_severe);
+        m_xml.writeAttribute("highMild", benchmarkStats.outliers.high_mild);
+        m_xml.writeAttribute("highSevere", benchmarkStats.outliers.high_severe);
+        m_xml.endElement();
+        m_xml.endElement(); // extra close: ends the element that was open before <mean> (BenchmarkResults from benchmarkPreparing)
+    }
+
+    void XmlReporter::benchmarkFailed(std::string const &error) {
+        m_xml.scopedElement("failed").
+            writeAttribute("message", error);
+        m_xml.endElement();
+    }
+#endif // CATCH_CONFIG_ENABLE_BENCHMARKING
+
+    CATCH_REGISTER_REPORTER( "xml", XmlReporter )
+
+} // end namespace Catch
+
+#if defined(_MSC_VER)
+#pragma warning(pop)
+#endif
+// end catch_reporter_xml.cpp
+
+namespace Catch {
+    LeakDetector leakDetector;
+}
+
+#ifdef __clang__
+#pragma clang diagnostic pop
+#endif
+
+// end catch_impl.hpp
+#endif
+
+#ifdef CATCH_CONFIG_MAIN
+// start catch_default_main.hpp
+
+#ifndef __OBJC__
+
+#if defined(CATCH_CONFIG_WCHAR) && defined(CATCH_PLATFORM_WINDOWS) && defined(_UNICODE) && !defined(DO_NOT_USE_WMAIN)
+// Standard C/C++ Win32 Unicode wmain entry point (the third parameter is unnamed and unused)
+extern "C" int wmain (int argc, wchar_t * argv[], wchar_t * []) {
+#else
+// Standard C/C++ main entry point, used whenever the wmain conditions above do not all hold
+int main (int argc, char * argv[]) {
+#endif
+
+    return Catch::Session().run( argc, argv ); // shared body of both entry points: run a temporary Session and return its result
+}
+
+#else // __OBJC__
+
+// Objective-C entry point
+int main (int argc, char * const argv[]) { // Objective-C variant: calls Catch::registerTestMethods() before running the session
+#if !CATCH_ARC_ENABLED
+    NSAutoreleasePool * pool = [[NSAutoreleasePool alloc] init]; // explicit autorelease pool, compiled in only when CATCH_ARC_ENABLED is false
+#endif
+
+    Catch::registerTestMethods();
+    int result = Catch::Session().run( argc, (char**)argv ); // C-style cast strips the const from argv's elements
+
+#if !CATCH_ARC_ENABLED
+    [pool drain]; // drained only after the run has finished; its result is returned below
+#endif
+
+    return result;
+}
+
+#endif // __OBJC__
+
+// end catch_default_main.hpp
+#endif
+
+#if !defined(CATCH_CONFIG_IMPL_ONLY)
+
+#ifdef CLARA_CONFIG_MAIN_NOT_DEFINED
+#  undef CLARA_CONFIG_MAIN
+#endif
+
+#if !defined(CATCH_CONFIG_DISABLE)
+//////
+// If this config identifier is defined then all CATCH macros are prefixed with CATCH_
+#ifdef CATCH_CONFIG_PREFIX_ALL
+
+#define CATCH_REQUIRE( ... ) INTERNAL_CATCH_TEST( "CATCH_REQUIRE", Catch::ResultDisposition::Normal, __VA_ARGS__ )
+#define CATCH_REQUIRE_FALSE( ... ) INTERNAL_CATCH_TEST( "CATCH_REQUIRE_FALSE", Catch::ResultDisposition::Normal | Catch::ResultDisposition::FalseTest, __VA_ARGS__ )
+
+#define CATCH_REQUIRE_THROWS( ... ) INTERNAL_CATCH_THROWS( "CATCH_REQUIRE_THROWS", Catch::ResultDisposition::Normal, __VA_ARGS__ )
+#define CATCH_REQUIRE_THROWS_AS( expr, exceptionType ) INTERNAL_CATCH_THROWS_AS( "CATCH_REQUIRE_THROWS_AS", exceptionType, Catch::ResultDisposition::Normal, expr )
+#define CATCH_REQUIRE_THROWS_WITH( expr, matcher ) INTERNAL_CATCH_THROWS_STR_MATCHES( "CATCH_REQUIRE_THROWS_WITH", Catch::ResultDisposition::Normal, matcher, expr )
+#if !defined(CATCH_CONFIG_DISABLE_MATCHERS)
+#define CATCH_REQUIRE_THROWS_MATCHES( expr, exceptionType, matcher ) INTERNAL_CATCH_THROWS_MATCHES( "CATCH_REQUIRE_THROWS_MATCHES", exceptionType, Catch::ResultDisposition::Normal, matcher, expr )
+#endif// CATCH_CONFIG_DISABLE_MATCHERS
+#define CATCH_REQUIRE_NOTHROW( ... ) INTERNAL_CATCH_NO_THROW( "CATCH_REQUIRE_NOTHROW", Catch::ResultDisposition::Normal, __VA_ARGS__ )
+
+#define CATCH_CHECK( ... ) INTERNAL_CATCH_TEST( "CATCH_CHECK", Catch::ResultDisposition::ContinueOnFailure, __VA_ARGS__ )
+#define CATCH_CHECK_FALSE( ... ) INTERNAL_CATCH_TEST( "CATCH_CHECK_FALSE", Catch::ResultDisposition::ContinueOnFailure | Catch::ResultDisposition::FalseTest, __VA_ARGS__ )
+#define CATCH_CHECKED_IF( ... ) INTERNAL_CATCH_IF( "CATCH_CHECKED_IF", Catch::ResultDisposition::ContinueOnFailure, __VA_ARGS__ )
+#define CATCH_CHECKED_ELSE( ... ) INTERNAL_CATCH_ELSE( "CATCH_CHECKED_ELSE", Catch::ResultDisposition::ContinueOnFailure, __VA_ARGS__ )
+#define CATCH_CHECK_NOFAIL( ... ) INTERNAL_CATCH_TEST( "CATCH_CHECK_NOFAIL", Catch::ResultDisposition::ContinueOnFailure | Catch::ResultDisposition::SuppressFail, __VA_ARGS__ )
+
+#define CATCH_CHECK_THROWS( ... )  INTERNAL_CATCH_THROWS( "CATCH_CHECK_THROWS", Catch::ResultDisposition::ContinueOnFailure, __VA_ARGS__ )
+#define CATCH_CHECK_THROWS_AS( expr, exceptionType ) INTERNAL_CATCH_THROWS_AS( "CATCH_CHECK_THROWS_AS", exceptionType, Catch::ResultDisposition::ContinueOnFailure, expr )
+#define CATCH_CHECK_THROWS_WITH( expr, matcher ) INTERNAL_CATCH_THROWS_STR_MATCHES( "CATCH_CHECK_THROWS_WITH", Catch::ResultDisposition::ContinueOnFailure, matcher, expr )
+#if !defined(CATCH_CONFIG_DISABLE_MATCHERS)
+#define CATCH_CHECK_THROWS_MATCHES( expr, exceptionType, matcher ) INTERNAL_CATCH_THROWS_MATCHES( "CATCH_CHECK_THROWS_MATCHES", exceptionType, Catch::ResultDisposition::ContinueOnFailure, matcher, expr )
+#endif // CATCH_CONFIG_DISABLE_MATCHERS
+#define CATCH_CHECK_NOTHROW( ... ) INTERNAL_CATCH_NO_THROW( "CATCH_CHECK_NOTHROW", Catch::ResultDisposition::ContinueOnFailure, __VA_ARGS__ )
+
+#if !defined(CATCH_CONFIG_DISABLE_MATCHERS)
+#define CATCH_CHECK_THAT( arg, matcher ) INTERNAL_CHECK_THAT( "CATCH_CHECK_THAT", matcher, Catch::ResultDisposition::ContinueOnFailure, arg )
+
+#define CATCH_REQUIRE_THAT( arg, matcher ) INTERNAL_CHECK_THAT( "CATCH_REQUIRE_THAT", matcher, Catch::ResultDisposition::Normal, arg )
+#endif // CATCH_CONFIG_DISABLE_MATCHERS
+
+#define CATCH_INFO( msg ) INTERNAL_CATCH_INFO( "CATCH_INFO", msg )
+#define CATCH_UNSCOPED_INFO( msg ) INTERNAL_CATCH_UNSCOPED_INFO( "CATCH_UNSCOPED_INFO", msg )
+#define CATCH_WARN( msg ) INTERNAL_CATCH_MSG( "CATCH_WARN", Catch::ResultWas::Warning, Catch::ResultDisposition::ContinueOnFailure, msg )
+#define CATCH_CAPTURE( ... ) INTERNAL_CATCH_CAPTURE( INTERNAL_CATCH_UNIQUE_NAME(capturer), "CATCH_CAPTURE",__VA_ARGS__ )
+
+#define CATCH_TEST_CASE( ... ) INTERNAL_CATCH_TESTCASE( __VA_ARGS__ )
+#define CATCH_TEST_CASE_METHOD( className, ... ) INTERNAL_CATCH_TEST_CASE_METHOD( className, __VA_ARGS__ )
+#define CATCH_METHOD_AS_TEST_CASE( method, ... ) INTERNAL_CATCH_METHOD_AS_TEST_CASE( method, __VA_ARGS__ )
+#define CATCH_REGISTER_TEST_CASE( Function, ... ) INTERNAL_CATCH_REGISTER_TESTCASE( Function, __VA_ARGS__ )
+#define CATCH_SECTION( ... ) INTERNAL_CATCH_SECTION( __VA_ARGS__ )
+#define CATCH_DYNAMIC_SECTION( ... ) INTERNAL_CATCH_DYNAMIC_SECTION( __VA_ARGS__ )
+#define CATCH_FAIL( ... ) INTERNAL_CATCH_MSG( "CATCH_FAIL", Catch::ResultWas::ExplicitFailure, Catch::ResultDisposition::Normal, __VA_ARGS__ )
+#define CATCH_FAIL_CHECK( ... ) INTERNAL_CATCH_MSG( "CATCH_FAIL_CHECK", Catch::ResultWas::ExplicitFailure, Catch::ResultDisposition::ContinueOnFailure, __VA_ARGS__ )
+#define CATCH_SUCCEED( ... ) INTERNAL_CATCH_MSG( "CATCH_SUCCEED", Catch::ResultWas::Ok, Catch::ResultDisposition::ContinueOnFailure, __VA_ARGS__ )
+
+#define CATCH_ANON_TEST_CASE() INTERNAL_CATCH_TESTCASE()
+
+#ifndef CATCH_CONFIG_TRADITIONAL_MSVC_PREPROCESSOR
+#define CATCH_TEMPLATE_TEST_CASE( ... ) INTERNAL_CATCH_TEMPLATE_TEST_CASE( __VA_ARGS__ )
+#define CATCH_TEMPLATE_TEST_CASE_SIG( ... ) INTERNAL_CATCH_TEMPLATE_TEST_CASE_SIG( __VA_ARGS__ )
+#define CATCH_TEMPLATE_TEST_CASE_METHOD( className, ... ) INTERNAL_CATCH_TEMPLATE_TEST_CASE_METHOD( className, __VA_ARGS__ )
+#define CATCH_TEMPLATE_TEST_CASE_METHOD_SIG( className, ... ) INTERNAL_CATCH_TEMPLATE_TEST_CASE_METHOD_SIG( className, __VA_ARGS__ )
+#define CATCH_TEMPLATE_PRODUCT_TEST_CASE( ... ) INTERNAL_CATCH_TEMPLATE_PRODUCT_TEST_CASE( __VA_ARGS__ )
+#define CATCH_TEMPLATE_PRODUCT_TEST_CASE_SIG( ... ) INTERNAL_CATCH_TEMPLATE_PRODUCT_TEST_CASE_SIG( __VA_ARGS__ )
+#define CATCH_TEMPLATE_PRODUCT_TEST_CASE_METHOD( className, ... ) INTERNAL_CATCH_TEMPLATE_PRODUCT_TEST_CASE_METHOD( className, __VA_ARGS__ )
+#define CATCH_TEMPLATE_PRODUCT_TEST_CASE_METHOD_SIG( className, ... ) INTERNAL_CATCH_TEMPLATE_PRODUCT_TEST_CASE_METHOD_SIG( className, __VA_ARGS__ )
+#else
+#define CATCH_TEMPLATE_TEST_CASE( ... ) INTERNAL_CATCH_EXPAND_VARGS( INTERNAL_CATCH_TEMPLATE_TEST_CASE( __VA_ARGS__ ) )
+#define CATCH_TEMPLATE_TEST_CASE_SIG( ... ) INTERNAL_CATCH_EXPAND_VARGS( INTERNAL_CATCH_TEMPLATE_TEST_CASE_SIG( __VA_ARGS__ ) )
+#define CATCH_TEMPLATE_TEST_CASE_METHOD( className, ... ) INTERNAL_CATCH_EXPAND_VARGS( INTERNAL_CATCH_TEMPLATE_TEST_CASE_METHOD( className, __VA_ARGS__ ) )
+#define CATCH_TEMPLATE_TEST_CASE_METHOD_SIG( className, ... ) INTERNAL_CATCH_EXPAND_VARGS( INTERNAL_CATCH_TEMPLATE_TEST_CASE_METHOD_SIG( className, __VA_ARGS__ ) )
+#define CATCH_TEMPLATE_PRODUCT_TEST_CASE( ... ) INTERNAL_CATCH_EXPAND_VARGS( INTERNAL_CATCH_TEMPLATE_PRODUCT_TEST_CASE( __VA_ARGS__ ) )
+#define CATCH_TEMPLATE_PRODUCT_TEST_CASE_SIG( ... ) INTERNAL_CATCH_EXPAND_VARGS( INTERNAL_CATCH_TEMPLATE_PRODUCT_TEST_CASE_SIG( __VA_ARGS__ ) )
+#define CATCH_TEMPLATE_PRODUCT_TEST_CASE_METHOD( className, ... ) INTERNAL_CATCH_EXPAND_VARGS( INTERNAL_CATCH_TEMPLATE_PRODUCT_TEST_CASE_METHOD( className, __VA_ARGS__ ) )
+#define CATCH_TEMPLATE_PRODUCT_TEST_CASE_METHOD_SIG( className, ... ) INTERNAL_CATCH_EXPAND_VARGS( INTERNAL_CATCH_TEMPLATE_PRODUCT_TEST_CASE_METHOD_SIG( className, __VA_ARGS__ ) )
+#endif
+
+#if !defined(CATCH_CONFIG_RUNTIME_STATIC_REQUIRE) // default: static_assert at compile time, then record a passing assertion via CATCH_SUCCEED
+#define CATCH_STATIC_REQUIRE( ... )       static_assert(   __VA_ARGS__ ,      #__VA_ARGS__ );     CATCH_SUCCEED( #__VA_ARGS__ )
+#define CATCH_STATIC_REQUIRE_FALSE( ... ) static_assert( !(__VA_ARGS__), "!(" #__VA_ARGS__ ")" ); CATCH_SUCCEED( "!(" #__VA_ARGS__ ")" )
+#else // CATCH_CONFIG_RUNTIME_STATIC_REQUIRE: evaluate as ordinary run-time CATCH_REQUIRE assertions instead
+#define CATCH_STATIC_REQUIRE( ... )       CATCH_REQUIRE( __VA_ARGS__ )
+#define CATCH_STATIC_REQUIRE_FALSE( ... ) CATCH_REQUIRE_FALSE( __VA_ARGS__ )
+#endif // CATCH_CONFIG_RUNTIME_STATIC_REQUIRE
+
+// "BDD-style" convenience wrappers
+#define CATCH_SCENARIO( ... ) CATCH_TEST_CASE( "Scenario: " __VA_ARGS__ )
+#define CATCH_SCENARIO_METHOD( className, ... ) INTERNAL_CATCH_TEST_CASE_METHOD( className, "Scenario: " __VA_ARGS__ )
+#define CATCH_GIVEN( desc )     INTERNAL_CATCH_DYNAMIC_SECTION( "    Given: " << desc )
+#define CATCH_AND_GIVEN( desc ) INTERNAL_CATCH_DYNAMIC_SECTION( "And given: " << desc )
+#define CATCH_WHEN( desc )      INTERNAL_CATCH_DYNAMIC_SECTION( "     When: " << desc )
+#define CATCH_AND_WHEN( desc )  INTERNAL_CATCH_DYNAMIC_SECTION( " And when: " << desc )
+#define CATCH_THEN( desc )      INTERNAL_CATCH_DYNAMIC_SECTION( "     Then: " << desc )
+#define CATCH_AND_THEN( desc )  INTERNAL_CATCH_DYNAMIC_SECTION( "      And: " << desc )
+
+#if defined(CATCH_CONFIG_ENABLE_BENCHMARKING)
+#define CATCH_BENCHMARK(...) \
+    INTERNAL_CATCH_BENCHMARK(INTERNAL_CATCH_UNIQUE_NAME(____C_A_T_C_H____B_E_N_C_H____), INTERNAL_CATCH_GET_1_ARG(__VA_ARGS__,,), INTERNAL_CATCH_GET_2_ARG(__VA_ARGS__,,))
+#define CATCH_BENCHMARK_ADVANCED(name) \
+    INTERNAL_CATCH_BENCHMARK_ADVANCED(INTERNAL_CATCH_UNIQUE_NAME(____C_A_T_C_H____B_E_N_C_H____), name)
+#endif // CATCH_CONFIG_ENABLE_BENCHMARKING
+
+// If CATCH_CONFIG_PREFIX_ALL is not defined then the CATCH_ prefix is not required
+#else
+
+#define REQUIRE( ... ) INTERNAL_CATCH_TEST( "REQUIRE", Catch::ResultDisposition::Normal, __VA_ARGS__  )
+#define REQUIRE_FALSE( ... ) INTERNAL_CATCH_TEST( "REQUIRE_FALSE", Catch::ResultDisposition::Normal | Catch::ResultDisposition::FalseTest, __VA_ARGS__ )
+
+#define REQUIRE_THROWS( ... ) INTERNAL_CATCH_THROWS( "REQUIRE_THROWS", Catch::ResultDisposition::Normal, __VA_ARGS__ )
+#define REQUIRE_THROWS_AS( expr, exceptionType ) INTERNAL_CATCH_THROWS_AS( "REQUIRE_THROWS_AS", exceptionType, Catch::ResultDisposition::Normal, expr )
+#define REQUIRE_THROWS_WITH( expr, matcher ) INTERNAL_CATCH_THROWS_STR_MATCHES( "REQUIRE_THROWS_WITH", Catch::ResultDisposition::Normal, matcher, expr )
+#if !defined(CATCH_CONFIG_DISABLE_MATCHERS)
+#define REQUIRE_THROWS_MATCHES( expr, exceptionType, matcher ) INTERNAL_CATCH_THROWS_MATCHES( "REQUIRE_THROWS_MATCHES", exceptionType, Catch::ResultDisposition::Normal, matcher, expr )
+#endif // CATCH_CONFIG_DISABLE_MATCHERS
+#define REQUIRE_NOTHROW( ... ) INTERNAL_CATCH_NO_THROW( "REQUIRE_NOTHROW", Catch::ResultDisposition::Normal, __VA_ARGS__ )
+
+#define CHECK( ... ) INTERNAL_CATCH_TEST( "CHECK", Catch::ResultDisposition::ContinueOnFailure, __VA_ARGS__ )
+#define CHECK_FALSE( ... ) INTERNAL_CATCH_TEST( "CHECK_FALSE", Catch::ResultDisposition::ContinueOnFailure | Catch::ResultDisposition::FalseTest, __VA_ARGS__ )
+#define CHECKED_IF( ... ) INTERNAL_CATCH_IF( "CHECKED_IF", Catch::ResultDisposition::ContinueOnFailure, __VA_ARGS__ )
+#define CHECKED_ELSE( ... ) INTERNAL_CATCH_ELSE( "CHECKED_ELSE", Catch::ResultDisposition::ContinueOnFailure, __VA_ARGS__ )
+#define CHECK_NOFAIL( ... ) INTERNAL_CATCH_TEST( "CHECK_NOFAIL", Catch::ResultDisposition::ContinueOnFailure | Catch::ResultDisposition::SuppressFail, __VA_ARGS__ )
+
+#define CHECK_THROWS( ... )  INTERNAL_CATCH_THROWS( "CHECK_THROWS", Catch::ResultDisposition::ContinueOnFailure, __VA_ARGS__ )
+#define CHECK_THROWS_AS( expr, exceptionType ) INTERNAL_CATCH_THROWS_AS( "CHECK_THROWS_AS", exceptionType, Catch::ResultDisposition::ContinueOnFailure, expr )
+#define CHECK_THROWS_WITH( expr, matcher ) INTERNAL_CATCH_THROWS_STR_MATCHES( "CHECK_THROWS_WITH", Catch::ResultDisposition::ContinueOnFailure, matcher, expr )
+#if !defined(CATCH_CONFIG_DISABLE_MATCHERS)
+#define CHECK_THROWS_MATCHES( expr, exceptionType, matcher ) INTERNAL_CATCH_THROWS_MATCHES( "CHECK_THROWS_MATCHES", exceptionType, Catch::ResultDisposition::ContinueOnFailure, matcher, expr )
+#endif // CATCH_CONFIG_DISABLE_MATCHERS
+#define CHECK_NOTHROW( ... ) INTERNAL_CATCH_NO_THROW( "CHECK_NOTHROW", Catch::ResultDisposition::ContinueOnFailure, __VA_ARGS__ )
+
+#if !defined(CATCH_CONFIG_DISABLE_MATCHERS)
+#define CHECK_THAT( arg, matcher ) INTERNAL_CHECK_THAT( "CHECK_THAT", matcher, Catch::ResultDisposition::ContinueOnFailure, arg )
+
+#define REQUIRE_THAT( arg, matcher ) INTERNAL_CHECK_THAT( "REQUIRE_THAT", matcher, Catch::ResultDisposition::Normal, arg )
+#endif // CATCH_CONFIG_DISABLE_MATCHERS
+
+#define INFO( msg ) INTERNAL_CATCH_INFO( "INFO", msg )
+#define UNSCOPED_INFO( msg ) INTERNAL_CATCH_UNSCOPED_INFO( "UNSCOPED_INFO", msg )
+#define WARN( msg ) INTERNAL_CATCH_MSG( "WARN", Catch::ResultWas::Warning, Catch::ResultDisposition::ContinueOnFailure, msg )
+#define CAPTURE( ... ) INTERNAL_CATCH_CAPTURE( INTERNAL_CATCH_UNIQUE_NAME(capturer), "CAPTURE",__VA_ARGS__ )
+
+#define TEST_CASE( ... ) INTERNAL_CATCH_TESTCASE( __VA_ARGS__ )
+#define TEST_CASE_METHOD( className, ... ) INTERNAL_CATCH_TEST_CASE_METHOD( className, __VA_ARGS__ )
+#define METHOD_AS_TEST_CASE( method, ... ) INTERNAL_CATCH_METHOD_AS_TEST_CASE( method, __VA_ARGS__ )
+#define REGISTER_TEST_CASE( Function, ... ) INTERNAL_CATCH_REGISTER_TESTCASE( Function, __VA_ARGS__ )
+#define SECTION( ... ) INTERNAL_CATCH_SECTION( __VA_ARGS__ )
+#define DYNAMIC_SECTION( ... ) INTERNAL_CATCH_DYNAMIC_SECTION( __VA_ARGS__ )
+#define FAIL( ... ) INTERNAL_CATCH_MSG( "FAIL", Catch::ResultWas::ExplicitFailure, Catch::ResultDisposition::Normal, __VA_ARGS__ )
+#define FAIL_CHECK( ... ) INTERNAL_CATCH_MSG( "FAIL_CHECK", Catch::ResultWas::ExplicitFailure, Catch::ResultDisposition::ContinueOnFailure, __VA_ARGS__ )
+#define SUCCEED( ... ) INTERNAL_CATCH_MSG( "SUCCEED", Catch::ResultWas::Ok, Catch::ResultDisposition::ContinueOnFailure, __VA_ARGS__ )
+#define ANON_TEST_CASE() INTERNAL_CATCH_TESTCASE()
+
+#ifndef CATCH_CONFIG_TRADITIONAL_MSVC_PREPROCESSOR
+#define TEMPLATE_TEST_CASE( ... ) INTERNAL_CATCH_TEMPLATE_TEST_CASE( __VA_ARGS__ )
+#define TEMPLATE_TEST_CASE_SIG( ... ) INTERNAL_CATCH_TEMPLATE_TEST_CASE_SIG( __VA_ARGS__ )
+#define TEMPLATE_TEST_CASE_METHOD( className, ... ) INTERNAL_CATCH_TEMPLATE_TEST_CASE_METHOD( className, __VA_ARGS__ )
+#define TEMPLATE_TEST_CASE_METHOD_SIG( className, ... ) INTERNAL_CATCH_TEMPLATE_TEST_CASE_METHOD_SIG( className, __VA_ARGS__ )
+#define TEMPLATE_PRODUCT_TEST_CASE( ... ) INTERNAL_CATCH_TEMPLATE_PRODUCT_TEST_CASE( __VA_ARGS__ )
+#define TEMPLATE_PRODUCT_TEST_CASE_SIG( ... ) INTERNAL_CATCH_TEMPLATE_PRODUCT_TEST_CASE_SIG( __VA_ARGS__ )
+#define TEMPLATE_PRODUCT_TEST_CASE_METHOD( className, ... ) INTERNAL_CATCH_TEMPLATE_PRODUCT_TEST_CASE_METHOD( className, __VA_ARGS__ )
+#define TEMPLATE_PRODUCT_TEST_CASE_METHOD_SIG( className, ... ) INTERNAL_CATCH_TEMPLATE_PRODUCT_TEST_CASE_METHOD_SIG( className, __VA_ARGS__ )
+#define TEMPLATE_LIST_TEST_CASE( ... ) INTERNAL_CATCH_TEMPLATE_LIST_TEST_CASE(__VA_ARGS__)
+#define TEMPLATE_LIST_TEST_CASE_METHOD( className, ... ) INTERNAL_CATCH_TEMPLATE_LIST_TEST_CASE_METHOD( className, __VA_ARGS__ )
+#else
+#define TEMPLATE_TEST_CASE( ... ) INTERNAL_CATCH_EXPAND_VARGS( INTERNAL_CATCH_TEMPLATE_TEST_CASE( __VA_ARGS__ ) )
+#define TEMPLATE_TEST_CASE_SIG( ... ) INTERNAL_CATCH_EXPAND_VARGS( INTERNAL_CATCH_TEMPLATE_TEST_CASE_SIG( __VA_ARGS__ ) )
+#define TEMPLATE_TEST_CASE_METHOD( className, ... ) INTERNAL_CATCH_EXPAND_VARGS( INTERNAL_CATCH_TEMPLATE_TEST_CASE_METHOD( className, __VA_ARGS__ ) )
+#define TEMPLATE_TEST_CASE_METHOD_SIG( className, ... ) INTERNAL_CATCH_EXPAND_VARGS( INTERNAL_CATCH_TEMPLATE_TEST_CASE_METHOD_SIG( className, __VA_ARGS__ ) )
+#define TEMPLATE_PRODUCT_TEST_CASE( ... ) INTERNAL_CATCH_EXPAND_VARGS( INTERNAL_CATCH_TEMPLATE_PRODUCT_TEST_CASE( __VA_ARGS__ ) )
+#define TEMPLATE_PRODUCT_TEST_CASE_SIG( ... ) INTERNAL_CATCH_EXPAND_VARGS( INTERNAL_CATCH_TEMPLATE_PRODUCT_TEST_CASE_SIG( __VA_ARGS__ ) )
+#define TEMPLATE_PRODUCT_TEST_CASE_METHOD( className, ... ) INTERNAL_CATCH_EXPAND_VARGS( INTERNAL_CATCH_TEMPLATE_PRODUCT_TEST_CASE_METHOD( className, __VA_ARGS__ ) )
+#define TEMPLATE_PRODUCT_TEST_CASE_METHOD_SIG( className, ... ) INTERNAL_CATCH_EXPAND_VARGS( INTERNAL_CATCH_TEMPLATE_PRODUCT_TEST_CASE_METHOD_SIG( className, __VA_ARGS__ ) )
+#define TEMPLATE_LIST_TEST_CASE( ... ) INTERNAL_CATCH_EXPAND_VARGS( INTERNAL_CATCH_TEMPLATE_LIST_TEST_CASE( __VA_ARGS__ ) )
+#define TEMPLATE_LIST_TEST_CASE_METHOD( className, ... ) INTERNAL_CATCH_EXPAND_VARGS( INTERNAL_CATCH_TEMPLATE_LIST_TEST_CASE_METHOD( className, __VA_ARGS__ ) )
+#endif
+
+#if !defined(CATCH_CONFIG_RUNTIME_STATIC_REQUIRE) // default: static_assert at compile time, then record a passing assertion via SUCCEED
+#define STATIC_REQUIRE( ... )       static_assert(   __VA_ARGS__,  #__VA_ARGS__ ); SUCCEED( #__VA_ARGS__ )
+#define STATIC_REQUIRE_FALSE( ... ) static_assert( !(__VA_ARGS__), "!(" #__VA_ARGS__ ")" ); SUCCEED( "!(" #__VA_ARGS__ ")" )
+#else // CATCH_CONFIG_RUNTIME_STATIC_REQUIRE: evaluate as ordinary run-time REQUIRE assertions instead
+#define STATIC_REQUIRE( ... )       REQUIRE( __VA_ARGS__ )
+#define STATIC_REQUIRE_FALSE( ... ) REQUIRE_FALSE( __VA_ARGS__ )
+#endif // CATCH_CONFIG_RUNTIME_STATIC_REQUIRE
+
+#endif
+
+#define CATCH_TRANSLATE_EXCEPTION( signature ) INTERNAL_CATCH_TRANSLATE_EXCEPTION( signature )
+
+// "BDD-style" convenience wrappers
+#define SCENARIO( ... ) TEST_CASE( "Scenario: " __VA_ARGS__ )
+#define SCENARIO_METHOD( className, ... ) INTERNAL_CATCH_TEST_CASE_METHOD( className, "Scenario: " __VA_ARGS__ )
+
+#define GIVEN( desc )     INTERNAL_CATCH_DYNAMIC_SECTION( "    Given: " << desc )
+#define AND_GIVEN( desc ) INTERNAL_CATCH_DYNAMIC_SECTION( "And given: " << desc )
+#define WHEN( desc )      INTERNAL_CATCH_DYNAMIC_SECTION( "     When: " << desc )
+#define AND_WHEN( desc )  INTERNAL_CATCH_DYNAMIC_SECTION( " And when: " << desc )
+#define THEN( desc )      INTERNAL_CATCH_DYNAMIC_SECTION( "     Then: " << desc )
+#define AND_THEN( desc )  INTERNAL_CATCH_DYNAMIC_SECTION( "      And: " << desc )
+
+#if defined(CATCH_CONFIG_ENABLE_BENCHMARKING)
+#define BENCHMARK(...) \
+    INTERNAL_CATCH_BENCHMARK(INTERNAL_CATCH_UNIQUE_NAME(____C_A_T_C_H____B_E_N_C_H____), INTERNAL_CATCH_GET_1_ARG(__VA_ARGS__,,), INTERNAL_CATCH_GET_2_ARG(__VA_ARGS__,,))
+#define BENCHMARK_ADVANCED(name) \
+    INTERNAL_CATCH_BENCHMARK_ADVANCED(INTERNAL_CATCH_UNIQUE_NAME(____C_A_T_C_H____B_E_N_C_H____), name)
+#endif // CATCH_CONFIG_ENABLE_BENCHMARKING
+
+using Catch::Detail::Approx;
+
+#else // CATCH_CONFIG_DISABLE
+
+//////
+// If this config identifier is defined then all CATCH macros are prefixed with CATCH_
+#ifdef CATCH_CONFIG_PREFIX_ALL
+
+#define CATCH_REQUIRE( ... )        (void)(0)
+#define CATCH_REQUIRE_FALSE( ... )  (void)(0)
+
+#define CATCH_REQUIRE_THROWS( ... ) (void)(0)
+#define CATCH_REQUIRE_THROWS_AS( expr, exceptionType ) (void)(0)
+#define CATCH_REQUIRE_THROWS_WITH( expr, matcher )     (void)(0)
+#if !defined(CATCH_CONFIG_DISABLE_MATCHERS)
+#define CATCH_REQUIRE_THROWS_MATCHES( expr, exceptionType, matcher ) (void)(0)
+#endif// CATCH_CONFIG_DISABLE_MATCHERS
+#define CATCH_REQUIRE_NOTHROW( ... ) (void)(0)
+
+#define CATCH_CHECK( ... )         (void)(0)
+#define CATCH_CHECK_FALSE( ... )   (void)(0)
+#define CATCH_CHECKED_IF( ... )    if (__VA_ARGS__)
+#define CATCH_CHECKED_ELSE( ... )  if (!(__VA_ARGS__))
+#define CATCH_CHECK_NOFAIL( ... )  (void)(0)
+
+#define CATCH_CHECK_THROWS( ... )  (void)(0)
+#define CATCH_CHECK_THROWS_AS( expr, exceptionType ) (void)(0)
+#define CATCH_CHECK_THROWS_WITH( expr, matcher )     (void)(0)
+#if !defined(CATCH_CONFIG_DISABLE_MATCHERS)
+#define CATCH_CHECK_THROWS_MATCHES( expr, exceptionType, matcher ) (void)(0)
+#endif // CATCH_CONFIG_DISABLE_MATCHERS
+#define CATCH_CHECK_NOTHROW( ... ) (void)(0)
+
+#if !defined(CATCH_CONFIG_DISABLE_MATCHERS)
+#define CATCH_CHECK_THAT( arg, matcher )   (void)(0)
+
+#define CATCH_REQUIRE_THAT( arg, matcher ) (void)(0)
+#endif // CATCH_CONFIG_DISABLE_MATCHERS
+
+#define CATCH_INFO( msg )          (void)(0) // CATCH_CONFIG_DISABLE: the message macros expand to no-op expressions
+#define CATCH_UNSCOPED_INFO( msg ) (void)(0)
+#define CATCH_WARN( msg )          (void)(0)
+#define CATCH_CAPTURE( ... )       (void)(0) // variadic like the enabled CATCH_CAPTURE, so multi-argument captures still compile
+
+#define CATCH_TEST_CASE( ... ) INTERNAL_CATCH_TESTCASE_NO_REGISTRATION(INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_S_T____ ))
+#define CATCH_TEST_CASE_METHOD( className, ... ) INTERNAL_CATCH_TESTCASE_NO_REGISTRATION(INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_S_T____ ))
+#define CATCH_METHOD_AS_TEST_CASE( method, ... )
+#define CATCH_REGISTER_TEST_CASE( Function, ... ) (void)(0)
+#define CATCH_SECTION( ... )
+#define CATCH_DYNAMIC_SECTION( ... )
+#define CATCH_FAIL( ... ) (void)(0)
+#define CATCH_FAIL_CHECK( ... ) (void)(0)
+#define CATCH_SUCCEED( ... ) (void)(0)
+
+#define CATCH_ANON_TEST_CASE() INTERNAL_CATCH_TESTCASE_NO_REGISTRATION(INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_S_T____ ))
+
+#ifndef CATCH_CONFIG_TRADITIONAL_MSVC_PREPROCESSOR
+#define CATCH_TEMPLATE_TEST_CASE( ... ) INTERNAL_CATCH_TEMPLATE_TEST_CASE_NO_REGISTRATION(__VA_ARGS__)
+#define CATCH_TEMPLATE_TEST_CASE_SIG( ... ) INTERNAL_CATCH_TEMPLATE_TEST_CASE_SIG_NO_REGISTRATION(__VA_ARGS__)
+#define CATCH_TEMPLATE_TEST_CASE_METHOD( className, ... ) INTERNAL_CATCH_TEMPLATE_TEST_CASE_METHOD_NO_REGISTRATION(className, __VA_ARGS__)
+#define CATCH_TEMPLATE_TEST_CASE_METHOD_SIG( className, ... ) INTERNAL_CATCH_TEMPLATE_TEST_CASE_METHOD_SIG_NO_REGISTRATION(className, __VA_ARGS__ )
+#define CATCH_TEMPLATE_PRODUCT_TEST_CASE( ... ) CATCH_TEMPLATE_TEST_CASE( __VA_ARGS__ )
+#define CATCH_TEMPLATE_PRODUCT_TEST_CASE_SIG( ... ) CATCH_TEMPLATE_TEST_CASE( __VA_ARGS__ )
+#define CATCH_TEMPLATE_PRODUCT_TEST_CASE_METHOD( className, ... ) CATCH_TEMPLATE_TEST_CASE_METHOD( className, __VA_ARGS__ )
+#define CATCH_TEMPLATE_PRODUCT_TEST_CASE_METHOD_SIG( className, ... ) CATCH_TEMPLATE_TEST_CASE_METHOD( className, __VA_ARGS__ )
+#else
+#define CATCH_TEMPLATE_TEST_CASE( ... ) INTERNAL_CATCH_EXPAND_VARGS( INTERNAL_CATCH_TEMPLATE_TEST_CASE_NO_REGISTRATION(__VA_ARGS__) )
+#define CATCH_TEMPLATE_TEST_CASE_SIG( ... ) INTERNAL_CATCH_EXPAND_VARGS( INTERNAL_CATCH_TEMPLATE_TEST_CASE_SIG_NO_REGISTRATION(__VA_ARGS__) )
+#define CATCH_TEMPLATE_TEST_CASE_METHOD( className, ... ) INTERNAL_CATCH_EXPAND_VARGS( INTERNAL_CATCH_TEMPLATE_TEST_CASE_METHOD_NO_REGISTRATION(className, __VA_ARGS__ ) )
+#define CATCH_TEMPLATE_TEST_CASE_METHOD_SIG( className, ... ) INTERNAL_CATCH_EXPAND_VARGS( INTERNAL_CATCH_TEMPLATE_TEST_CASE_METHOD_SIG_NO_REGISTRATION(className, __VA_ARGS__ ) )
+#define CATCH_TEMPLATE_PRODUCT_TEST_CASE( ... ) CATCH_TEMPLATE_TEST_CASE( __VA_ARGS__ )
+#define CATCH_TEMPLATE_PRODUCT_TEST_CASE_SIG( ... ) CATCH_TEMPLATE_TEST_CASE( __VA_ARGS__ )
+#define CATCH_TEMPLATE_PRODUCT_TEST_CASE_METHOD( className, ... ) CATCH_TEMPLATE_TEST_CASE_METHOD( className, __VA_ARGS__ )
+#define CATCH_TEMPLATE_PRODUCT_TEST_CASE_METHOD_SIG( className, ... ) CATCH_TEMPLATE_TEST_CASE_METHOD( className, __VA_ARGS__ )
+#endif
+
+// "BDD-style" convenience wrappers
+#define CATCH_SCENARIO( ... ) INTERNAL_CATCH_TESTCASE_NO_REGISTRATION(INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_S_T____ ))
+#define CATCH_SCENARIO_METHOD( className, ... ) INTERNAL_CATCH_TESTCASE_METHOD_NO_REGISTRATION(INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_S_T____ ), className )
+#define CATCH_GIVEN( desc )
+#define CATCH_AND_GIVEN( desc )
+#define CATCH_WHEN( desc )
+#define CATCH_AND_WHEN( desc )
+#define CATCH_THEN( desc )
+#define CATCH_AND_THEN( desc )
+
+#define CATCH_STATIC_REQUIRE( ... )       (void)(0)
+#define CATCH_STATIC_REQUIRE_FALSE( ... ) (void)(0)
+
+// If CATCH_CONFIG_PREFIX_ALL is not defined then the CATCH_ prefix is not required
+#else
+
+#define REQUIRE( ... )       (void)(0)
+#define REQUIRE_FALSE( ... ) (void)(0)
+
+#define REQUIRE_THROWS( ... ) (void)(0)
+#define REQUIRE_THROWS_AS( expr, exceptionType ) (void)(0)
+#define REQUIRE_THROWS_WITH( expr, matcher ) (void)(0)
+#if !defined(CATCH_CONFIG_DISABLE_MATCHERS)
+#define REQUIRE_THROWS_MATCHES( expr, exceptionType, matcher ) (void)(0)
+#endif // CATCH_CONFIG_DISABLE_MATCHERS
+#define REQUIRE_NOTHROW( ... ) (void)(0)
+
+#define CHECK( ... ) (void)(0)
+#define CHECK_FALSE( ... ) (void)(0)
+#define CHECKED_IF( ... ) if (__VA_ARGS__)
+#define CHECKED_ELSE( ... ) if (!(__VA_ARGS__))
+#define CHECK_NOFAIL( ... ) (void)(0)
+
+#define CHECK_THROWS( ... )  (void)(0)
+#define CHECK_THROWS_AS( expr, exceptionType ) (void)(0)
+#define CHECK_THROWS_WITH( expr, matcher ) (void)(0)
+#if !defined(CATCH_CONFIG_DISABLE_MATCHERS)
+#define CHECK_THROWS_MATCHES( expr, exceptionType, matcher ) (void)(0)
+#endif // CATCH_CONFIG_DISABLE_MATCHERS
+#define CHECK_NOTHROW( ... ) (void)(0)
+
+#if !defined(CATCH_CONFIG_DISABLE_MATCHERS)
+#define CHECK_THAT( arg, matcher ) (void)(0)
+
+#define REQUIRE_THAT( arg, matcher ) (void)(0)
+#endif // CATCH_CONFIG_DISABLE_MATCHERS
+
+#define INFO( msg ) (void)(0) // CATCH_CONFIG_DISABLE: the message macros expand to no-op expressions
+#define UNSCOPED_INFO( msg ) (void)(0)
+#define WARN( msg ) (void)(0)
+#define CAPTURE( ... ) (void)(0) // variadic like the enabled CAPTURE, so multi-argument captures still compile
+
+#define TEST_CASE( ... )  INTERNAL_CATCH_TESTCASE_NO_REGISTRATION(INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_S_T____ ))
+#define TEST_CASE_METHOD( className, ... ) INTERNAL_CATCH_TESTCASE_NO_REGISTRATION(INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_S_T____ ))
+#define METHOD_AS_TEST_CASE( method, ... )
+#define REGISTER_TEST_CASE( Function, ... ) (void)(0)
+#define SECTION( ... )
+#define DYNAMIC_SECTION( ... )
+#define FAIL( ... ) (void)(0)
+#define FAIL_CHECK( ... ) (void)(0)
+#define SUCCEED( ... ) (void)(0)
+#define ANON_TEST_CASE() INTERNAL_CATCH_TESTCASE_NO_REGISTRATION(INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_S_T____ ))
+
+#ifndef CATCH_CONFIG_TRADITIONAL_MSVC_PREPROCESSOR
+#define TEMPLATE_TEST_CASE( ... ) INTERNAL_CATCH_TEMPLATE_TEST_CASE_NO_REGISTRATION(__VA_ARGS__)
+#define TEMPLATE_TEST_CASE_SIG( ... ) INTERNAL_CATCH_TEMPLATE_TEST_CASE_SIG_NO_REGISTRATION(__VA_ARGS__)
+#define TEMPLATE_TEST_CASE_METHOD( className, ... ) INTERNAL_CATCH_TEMPLATE_TEST_CASE_METHOD_NO_REGISTRATION(className, __VA_ARGS__)
+#define TEMPLATE_TEST_CASE_METHOD_SIG( className, ... ) INTERNAL_CATCH_TEMPLATE_TEST_CASE_METHOD_SIG_NO_REGISTRATION(className, __VA_ARGS__ )
+#define TEMPLATE_PRODUCT_TEST_CASE( ... ) TEMPLATE_TEST_CASE( __VA_ARGS__ )
+#define TEMPLATE_PRODUCT_TEST_CASE_SIG( ... ) TEMPLATE_TEST_CASE( __VA_ARGS__ )
+#define TEMPLATE_PRODUCT_TEST_CASE_METHOD( className, ... ) TEMPLATE_TEST_CASE_METHOD( className, __VA_ARGS__ )
+#define TEMPLATE_PRODUCT_TEST_CASE_METHOD_SIG( className, ... ) TEMPLATE_TEST_CASE_METHOD( className, __VA_ARGS__ )
+#else
+#define TEMPLATE_TEST_CASE( ... ) INTERNAL_CATCH_EXPAND_VARGS( INTERNAL_CATCH_TEMPLATE_TEST_CASE_NO_REGISTRATION(__VA_ARGS__) )
+#define TEMPLATE_TEST_CASE_SIG( ... ) INTERNAL_CATCH_EXPAND_VARGS( INTERNAL_CATCH_TEMPLATE_TEST_CASE_SIG_NO_REGISTRATION(__VA_ARGS__) )
+#define TEMPLATE_TEST_CASE_METHOD( className, ... ) INTERNAL_CATCH_EXPAND_VARGS( INTERNAL_CATCH_TEMPLATE_TEST_CASE_METHOD_NO_REGISTRATION(className, __VA_ARGS__ ) )
+#define TEMPLATE_TEST_CASE_METHOD_SIG( className, ... ) INTERNAL_CATCH_EXPAND_VARGS( INTERNAL_CATCH_TEMPLATE_TEST_CASE_METHOD_SIG_NO_REGISTRATION(className, __VA_ARGS__ ) )
+#define TEMPLATE_PRODUCT_TEST_CASE( ... ) TEMPLATE_TEST_CASE( __VA_ARGS__ )
+#define TEMPLATE_PRODUCT_TEST_CASE_SIG( ... ) TEMPLATE_TEST_CASE( __VA_ARGS__ )
+#define TEMPLATE_PRODUCT_TEST_CASE_METHOD( className, ... ) TEMPLATE_TEST_CASE_METHOD( className, __VA_ARGS__ )
+#define TEMPLATE_PRODUCT_TEST_CASE_METHOD_SIG( className, ... ) TEMPLATE_TEST_CASE_METHOD( className, __VA_ARGS__ )
+#endif
+
+#define STATIC_REQUIRE( ... )       (void)(0)
+#define STATIC_REQUIRE_FALSE( ... ) (void)(0)
+
+#endif
+
+#define CATCH_TRANSLATE_EXCEPTION( signature ) INTERNAL_CATCH_TRANSLATE_EXCEPTION_NO_REG( INTERNAL_CATCH_UNIQUE_NAME( catch_internal_ExceptionTranslator ), signature )
+
+// "BDD-style" convenience wrappers
+#define SCENARIO( ... ) INTERNAL_CATCH_TESTCASE_NO_REGISTRATION(INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_S_T____ ) )
+#define SCENARIO_METHOD( className, ... ) INTERNAL_CATCH_TESTCASE_METHOD_NO_REGISTRATION(INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_S_T____ ), className )
+
+#define GIVEN( desc )
+#define AND_GIVEN( desc )
+#define WHEN( desc )
+#define AND_WHEN( desc )
+#define THEN( desc )
+#define AND_THEN( desc )
+
+using Catch::Detail::Approx;
+
+#endif
+
+#endif // ! CATCH_CONFIG_IMPL_ONLY
+
+// start catch_reenable_warnings.h
+
+
+#ifdef __clang__
+#    ifdef __ICC // icpc defines the __clang__ macro
+#        pragma warning(pop)
+#    else
+#        pragma clang diagnostic pop
+#    endif
+#elif defined __GNUC__
+#    pragma GCC diagnostic pop
+#endif
+
+// end catch_reenable_warnings.h
+// end catch.hpp
+#endif // TWOBLUECUBES_SINGLE_INCLUDE_CATCH_HPP_INCLUDED
+
diff --git a/thirdParty/cmake-modules/FindADIOS.cmake b/thirdParty/cmake-modules/FindADIOS.cmake
index 0477c8c578..b64cccbd8c 100644
--- a/thirdParty/cmake-modules/FindADIOS.cmake
+++ b/thirdParty/cmake-modules/FindADIOS.cmake
@@ -8,7 +8,7 @@
 #                           #   component is not found
 #     [QUIET]               # ...
 #     [COMPONENTS <...>]    # Compiled in components: fortran, readonly,
-#                           # sequential (all are case insentative)
+#                           # sequential (all are case-insensitive)
 #   )
 #
 # Module that finds the includes and libraries for a working ADIOS install.
@@ -54,7 +54,7 @@
 #   target_link_libraries(foo ${ADIOS_LIBRARIES})
 # endif()
 ###############################################################################
-#Copyright (c) 2014, Axel Huebl and Felix Schmitt from http://picongpu.hzdr.de
+#Copyright (c) 2014-2019, Axel Huebl and Felix Schmitt from http://picongpu.hzdr.de
 #All rights reserved.
 
 #Redistribution and use in source and binary forms, with or without
diff --git a/thirdParty/cmake-modules/FindmallocMC.cmake b/thirdParty/cmake-modules/FindmallocMC.cmake
index de22268754..e4d25b378f 100644
--- a/thirdParty/cmake-modules/FindmallocMC.cmake
+++ b/thirdParty/cmake-modules/FindmallocMC.cmake
@@ -56,14 +56,16 @@
 
 # Required cmake version ######################################################
 #
-cmake_minimum_required(VERSION 2.8.12.2)
+cmake_minimum_required(VERSION 3.15.0)
 
 
 # dependencies ################################################################
 #
-find_package(CUDA 5.0 REQUIRED)
-find_package(Boost 1.48.0 REQUIRED)
+set(mallocMC_ALPAKA_PROVIDER "intern" CACHE STRING "Select which alpaka is used for mallocMC")
+set_property(CACHE mallocMC_ALPAKA_PROVIDER PROPERTY STRINGS "intern;extern")
+mark_as_advanced(mallocMC_ALPAKA_PROVIDER)
 
+find_package(Boost 1.65.1 REQUIRED)
 
 # find mallocMC installation ##################################################
 #
@@ -84,6 +86,13 @@ set(mallocMC_REQUIRED_VARS_LIST mallocMC_ROOT_DIR mallocMC_INCLUDE_DIRS)
 mark_as_advanced(mallocMC_ROOT_DIR)
 
 if(mallocMC_ROOT_DIR)
+    if(mallocMC_ALPAKA_PROVIDER STREQUAL "intern") # build the alpaka copy located next to mallocMC
+        set(alpaka_BUILD_EXAMPLES OFF)
+        set(BUILD_TESTING OFF)
+        add_subdirectory("${mallocMC_ROOT_DIR}/../alpaka" "${CMAKE_BINARY_DIR}/alpaka")
+    else() # any other provider value: look for an installed alpaka, hinted by ALPAKA_ROOT
+        find_package(alpaka HINTS $ENV{ALPAKA_ROOT})
+    endif()
 
     # find version ##############################################################
     #
@@ -113,68 +122,6 @@ if(mallocMC_ROOT_DIR)
 
     set(mallocMC_INCLUDE_DIRS ${mallocMC_ROOT_DIR}/include)
 
-    # check additional components ###############################################
-    #
-    foreach(COMPONENT ${mallocMC_FIND_COMPONENTS})
-        set(mallocMC_${COMPONENT}_FOUND TRUE)
-
-        if(${COMPONENT} STREQUAL "halloc")
-
-            # halloc linked library #################################################
-            #
-            list(APPEND mallocMC_REQUIRED_VARS_LIST mallocMC_LIBRARIES)
-            find_library(mallocMC_${COMPONENT}_LIBRARY
-                NAMES libhalloc.a
-                PATHS ${HALLOC_ROOT} "${mallocMC_ROOT_DIR}/../halloc/" ENV HALLOC_ROOT
-                PATH_SUFFIXES "lib" "bin"
-                DOC "Libraries for the mallocMC component ${COMPONENT}."
-                NO_DEFAULT_PATH
-            )
-            find_library(mallocMC_${COMPONENT}_LIBRARY
-                NAMES libhalloc.a
-                PATH_SUFFIXES "lib" "bin"
-                DOC "Libraries for the mallocMC component ${COMPONENT}."
-            )
-            if(mallocMC_${COMPONENT}_LIBRARY)
-                list(APPEND mallocMC_LIBRARIES ${mallocMC_${COMPONENT}_LIBRARY})
-            else(mallocMC_${COMPONENT}_LIBRARY)
-                if(mallocMC_FIND_REQUIRED OR NOT mallocMC_FIND_QUIETLY)
-                    message(WARNING "libhalloc.a not found. Ensure it is compiled correctly and set HALLOC_ROOT")
-                endif()
-                unset(mallocMC_${COMPONENT}_FOUND)
-            endif(mallocMC_${COMPONENT}_LIBRARY)
-
-            # halloc headers ########################################################
-            #
-            find_path(mallocMC_${COMPONENT}_INCLUDE_DIR
-                NAMES halloc.h
-                PATHS ${HALLOC_ROOT} "${mallocMC_ROOT_DIR}/../halloc/" ENV HALLOC_ROOT
-                PATH_SUFFIXES "include" "src"
-                DOC "Includes for the mallocMC component ${COMPONENT}."
-                NO_DEFAULT_PATH
-            )
-            find_path(mallocMC_${COMPONENT}_INCLUDE_DIR
-                NAMES halloc.h
-                PATH_SUFFIXES "include" "src"
-                DOC "Includes for the mallocMC component ${COMPONENT}."
-            )
-            if(mallocMC_${COMPONENT}_INCLUDE_DIR)
-                list(APPEND mallocMC_INCLUDE_DIRS ${mallocMC_${COMPONENT}_INCLUDE_DIR})
-            else(mallocMC_${COMPONENT}_INCLUDE_DIR)
-                unset(mallocMC_${COMPONENT}_FOUND)
-            endif(mallocMC_${COMPONENT}_INCLUDE_DIR)
-
-            # set separable compilation #############################################
-            #
-            if(mallocMC_${COMPONENT}_FOUND)
-                set(CUDA_SEPARABLE_COMPILATION ON PARENT_SCOPE)
-            endif(mallocMC_${COMPONENT}_FOUND)
-
-            mark_as_advanced(mallocMC_${COMPONENT}_INCLUDE_DIR mallocMC_${COMPONENT}_LIBRARY)
-        endif(${COMPONENT} STREQUAL "halloc")
-
-    endforeach(COMPONENT ${mallocMC_FIND_COMPONENTS})
-
 endif(mallocMC_ROOT_DIR)
 
 
@@ -201,13 +148,8 @@ if(NOT mallocMC_FOUND)
         unset(${REQ_VAR} CACHE)
     endforeach()
 
-    # user-level component vars
-    foreach(COMPONENT ${mallocMC_FIND_COMPONENTS})
-        unset(mallocMC_${COMPONENT}_FOUND)
-        unset(mallocMC_${COMPONENT}_LIBRARY CACHE)
-        unset(mallocMC_${COMPONENT}_INCLUDE_DIR CACHE)
-    endforeach()
 endif()
 
 # always clean internal required vars list
 unset(mallocMC_REQUIRED_VARS_LIST)
+
diff --git a/thirdParty/cupla/.gitignore b/thirdParty/cupla/.gitignore
index a79da23ddb..27970f2edd 100644
--- a/thirdParty/cupla/.gitignore
+++ b/thirdParty/cupla/.gitignore
@@ -8,6 +8,12 @@
 /*.cbp
 /*.layout
 
+# Visual Studio Code configuration files
+.vscode
+
+# JetBrains project files
+.idea/
+
 # python byte code
 *.pyc
 
diff --git a/thirdParty/cupla/.gitlab-ci.yml b/thirdParty/cupla/.gitlab-ci.yml
index 73190f9ab3..3b5d6f8f0a 100644
--- a/thirdParty/cupla/.gitlab-ci.yml
+++ b/thirdParty/cupla/.gitlab-ci.yml
@@ -1,113 +1,86 @@
-.base_job:
-  script:
-    # the default build type is Release
-    # if neccesary, you can rerun the pipeline with another build type-> https://docs.gitlab.com/ee/ci/pipelines.html#manually-executing-pipelines
-    # to change the build type, you must set the environment variable CUPLA_BUILD_TYPE
-    - if [[ ! -v CUPLA_BUILD_TYPE ]] ; then
-        CUPLA_BUILD_TYPE=Release ;
-      fi
-    - echo "number of processor threads $(nproc)"
-    - $CXX --version
-    - cmake --version
-    # print boost version
-    - echo -e "#include <boost/version.hpp>\n#include <iostream>\nint main() { std::cout << BOOST_VERSION << std::endl; return 0; }" | $CXX -x c++ - -o boost_version >/dev/null || { echo 0; }
-    - echo "Boost version $(./boost_version)"
-    - export cupla_DIR=$CI_PROJECT_DIR
-    # use one build directory for all build configurations
-    - mkdir build
-    - cd build
-    - echo "Build type-> $CUPLA_BUILD_TYPE"
-    # ALPAKA_ACCS contains the backends, which are used for each build
-    # the backends are set in the sepcialized base jobs .base_gcc,.base_clang and.base_cuda
-    - for CMAKE_FLAGS in $ALPAKA_ACCS ; do
-        echo "###################################################"
-        && echo "# Example Matrix Multiplication (adapted original)"
-        && echo "###################################################"
-        && echo "can not run with CPU_B_SEQ_T_SEQ due to missing elements layer in original SDK example"
-        && echo "CPU_B_SEQ_T_OMP2/THREADS too many threads necessary (256)"
-        && if [[ $CMAKE_FLAGS =~ -*DALPAKA_ACC_GPU_CUDA_ENABLE=ON.* ]]; then
-          cmake $cupla_DIR/example/CUDASamples/matrixMul/ $CMAKE_FLAGS -DCMAKE_BUILD_TYPE=$CUPLA_BUILD_TYPE
-          && make -j
-          && time ./matrixMul -wA=64 -wB=64 -hA=64 -hB=64
-          && rm -r * ;
-        fi
-        && echo "###################################################"
-        && echo "# Example Async API (adapted original)"
-        && echo "###################################################"
-        && echo "can not run with CPU_B_SEQ_T_SEQ due to missing elements layer in original SDK example"
-        && echo "CPU_B_SEQ_T_OMP2/THREADS too many threads necessary (512)"
-        && if [[ $CMAKE_FLAGS =~ -*DALPAKA_ACC_GPU_CUDA_ENABLE=ON.* ]]; then
-          cmake $cupla_DIR/example/CUDASamples/asyncAPI/ $CMAKE_FLAGS -DCMAKE_BUILD_TYPE=$CUPLA_BUILD_TYPE
-          && make -j
-          && time ./asyncAPI
-          && rm -r * ;
-        fi
-        && echo "###################################################"
-        && echo "# Example Async API (added elements layer)"
-        && echo "###################################################"
-        && cmake $cupla_DIR/example/CUDASamples/asyncAPI_tuned/ $CMAKE_FLAGS -DCMAKE_BUILD_TYPE=$CUPLA_BUILD_TYPE
-        && make -j
-        && time ./asyncAPI_tuned
-        && rm -r *
-        && echo "###################################################"
-        && echo "Example vectorAdd (added elements layer)"
-        && echo "###################################################"
-        && cmake $cupla_DIR/example/CUDASamples/vectorAdd/ $CMAKE_FLAGS -DCMAKE_BUILD_TYPE=$CUPLA_BUILD_TYPE
-        && make -j
-        && time ./vectorAdd 100000
-        && rm -r * ;
-      done
+################################################################################
+# CUPLA_CXX                             : {g++, clang++}
+#   [g++]                               : {5, 6, 7, 8, 9} <list>
+#   [clang++]                           : {4.0, 5.0, 6.0, 7, 8, 9, 10, 11} <list>
+# CUPLA_BOOST_VERSIONS                  : {1.65.1, 1.66.0, 1.67.0, 1.68.0, 1.69.0, 1.70.0, 1.71.0, 1.72.0, 1.73.0} <list>
+# CUPLA_BUILD_TYPE                      : {Debug, Release}
+# CUPLA_CMAKE_ARGS                      : <string>
+include:
+  - local: '/script/compiler_base.yml'
 
-.base_gcc:
+cuda92:
+  image: registry.gitlab.com/hzdr/crp/alpaka-group-container/alpaka-ci-cuda92-gcc:1.1
   variables:
-    GIT_SUBMODULE_STRATEGY: normal
-    CXX: g++
-    CC: gcc
-    ALPAKA_ACCS: "-DALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLE=ON
-                  -DALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE=ON
-                  -DALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE=ON"
-                  # -DALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLE=ON
-  extends: .base_job
-  # x86_64 tag is used to get a multi-core CPU for the tests
-  tags:
-    - x86_64
+    CUPLA_CXX: "g++-6"
+    CUPLA_BOOST_VERSIONS: "1.65.1 1.66.0 1.67.0 1.68.0 1.69.0 1.70.0 1.71.0 1.72.0 1.73.0"
+  extends: .base_cuda
+
+cuda100:
+  image: registry.gitlab.com/hzdr/crp/alpaka-group-container/alpaka-ci-cuda100-gcc:1.1
+  variables:
+    CUPLA_BOOST_VERSIONS: "1.65.1 1.66.0 1.67.0 1.68.0 1.69.0 1.70.0 1.71.0 1.72.0 1.73.0"
+  extends: .base_cuda
 
-.base_clang:
+cuda101:
+  image: registry.gitlab.com/hzdr/crp/alpaka-group-container/alpaka-ci-cuda101-gcc:1.1
   variables:
-    GIT_SUBMODULE_STRATEGY: normal
-    CXX: clang++
-    CC: clang
-    ALPAKA_ACCS: "-DALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLE=ON
-                  -DALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE=ON"
-                  # -DALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE=ON
-                  # -DALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLE=ON
-  extends: .base_job
-  # x86_64 tag is used to get a multi-core CPU for the tests
-  tags:
-    - x86_64
+    CUPLA_BOOST_VERSIONS: "1.65.1 1.66.0 1.67.0 1.68.0 1.69.0 1.70.0 1.71.0 1.72.0 1.73.0"
+  extends: .base_cuda
+
+cuda102:
+  image: registry.gitlab.com/hzdr/crp/alpaka-group-container/alpaka-ci-cuda102-gcc:1.1
+  variables:
+    CUPLA_BOOST_VERSIONS: "1.65.1 1.66.0 1.67.0 1.68.0 1.69.0 1.70.0 1.71.0 1.72.0 1.73.0"
+  extends: .base_cuda
+
+gcc1:
+  variables:
+    CUPLA_CXX: "g++-5 g++-6 g++-7 g++-8 g++-9"
+    CUPLA_BOOST_VERSIONS: "1.65.1 1.66.0 1.67.0"
+  extends: .base_gcc
 
-.base_cuda:
+gcc2:
   variables:
-    GIT_SUBMODULE_STRATEGY: normal
-    CXX: g++
-    CC: gcc
-    ALPAKA_ACCS: "-DALPAKA_ACC_GPU_CUDA_ENABLE=ON"
-  before_script:
-    - nvidia-smi
-    - nvcc --version
-  extends: .base_job
-  tags:
-    - cuda
-    - intel
+    CUPLA_CXX: "g++-5 g++-6 g++-7 g++-8 g++-9"
+    CUPLA_BOOST_VERSIONS: "1.68.0 1.69.0 1.70.0"
+  extends: .base_gcc
 
-gcc7:
-  image: registry.gitlab.com/hzdr/cupla-docker/gcc7:latest
+gcc3:
+  variables:
+    CUPLA_CXX: "g++-5 g++-6 g++-7 g++-8 g++-9"
+    CUPLA_BOOST_VERSIONS: "1.71.0 1.72.0 1.73.0"
   extends: .base_gcc
 
-clang7:
-  image: registry.gitlab.com/hzdr/cupla-docker/clang7:latest
+clang:
+  variables:
+    CUPLA_CXX: "clang++-5.0 clang++-6.0 clang++-7 clang++-8 clang++-9 clang++-10 clang++-11"
+    CUPLA_BOOST_VERSIONS: "1.65.1 1.66.0 1.67.0 1.68.0 1.69.0 1.70.0 1.71.0 1.72.0 1.73.0"
   extends: .base_clang
 
-cuda9:
-  image: registry.gitlab.com/hzdr/cupla-docker/cuda9:latest
-  extends: .base_cuda
+cudaClang92:
+  image: registry.gitlab.com/hzdr/crp/alpaka-group-container/alpaka-ci-cuda92-clang:1.1
+  variables:
+    CUPLA_CXX: "clang++-8 clang++-10 clang++-11"
+    CUPLA_BOOST_VERSIONS: "1.65.1 1.66.0 1.67.0 1.68.0 1.69.0 1.70.0 1.71.0 1.72.0 1.73.0"
+  extends: .base_cuda_clang
+
+cudaClang100:
+  image: registry.gitlab.com/hzdr/crp/alpaka-group-container/alpaka-ci-cuda100-clang:1.1
+  variables:
+    CUPLA_CXX: "clang++-8 clang++-9 clang++-10 clang++-11"
+    CUPLA_BOOST_VERSIONS: "1.65.1 1.66.0 1.67.0 1.68.0 1.69.0 1.70.0 1.71.0 1.72.0 1.73.0"
+  extends: .base_cuda_clang
+
+cudaClang101:
+  image: registry.gitlab.com/hzdr/crp/alpaka-group-container/alpaka-ci-cuda101-clang:1.1
+  variables:
+    CUPLA_CXX: "clang++-9 clang++-10 clang++-11"
+    CUPLA_BOOST_VERSIONS: "1.65.1 1.66.0 1.67.0 1.68.0 1.69.0 1.70.0 1.71.0 1.72.0 1.73.0"
+  extends: .base_cuda_clang
+
+hip38:
+  image: registry.gitlab.com/hzdr/crp/alpaka-group-container/alpaka-ci-rocm3.8:1.1
+  variables:
+    CMAKE_MODULE_PATH: "/opt/rocm-3.8.0/hip/cmake"
+    CUPLA_BOOST_VERSIONS: "1.65.1 1.66.0 1.67.0 1.68.0 1.69.0 1.70.0 1.71.0 1.72.0 1.73.0"
+  extends: .base_hip
diff --git a/thirdParty/cupla/.travis.yml b/thirdParty/cupla/.travis.yml
index 7f972607f5..0f1ae8a4ff 100644
--- a/thirdParty/cupla/.travis.yml
+++ b/thirdParty/cupla/.travis.yml
@@ -14,7 +14,7 @@ addons:
     sources:
       - ubuntu-toolchain-r-test
     packages:
-      - g++-4.9
+      - g++-5
       - clang-4.0
 
 env:
@@ -22,7 +22,7 @@ env:
     - CUDA_ROOT: $HOME/.cache/cuda
     - CMAKE_ROOT: $HOME/.cache/cmake
     - BOOST_ROOT: $HOME/.cache/boost
-    - BOOST_MIN: 106000 # careful: hard coded below
+    - BOOST_MIN: 106501 # careful: hard coded below
     - CUPLA_ROOT: $TRAVIS_BUILD_DIR
     - OMP_NUM_THREADS: 4 # ignored in thread layer
   matrix:
@@ -39,13 +39,14 @@ before_install:
   - mkdir -p $HOME/asyncAPI
   - mkdir -p $HOME/asyncAPI_tuned
   - mkdir -p $HOME/vectorAdd
+  - mkdir -p $HOME/cuplaVectorAdd
   - mkdir -p $HOME/blackScholes
   - mkdir -p $HOME/test/config
   - export CMAKE_FLAGS="-DALPAKA_ACC_"$STRATEGY"_ENABLE=ON"
   - if [ "$COMPILER" == "gcc" ]; then
-        echo "Using g++-4.9 and sequential OpenMP2 threads ...";
-        export CXX=g++-4.9;
-        export CC=gcc-4.9;
+        echo "Using g++-5 and sequential OpenMP2 threads ...";
+        export CXX=g++-5;
+        export CC=gcc-5;
         unset CUDA_ROOT;
     elif [ "$COMPILER" == "clang" ]; then
         echo "Using clang++-4.0 & sequential threads ...";
@@ -54,8 +55,8 @@ before_install:
         unset CUDA_ROOT;
     elif [ "$COMPILER" == "nvcc" ]; then
         echo "Using CUDA 7.5 ...";
-        export CXX=g++-4.9;
-        export CC=gcc-4.9;
+        export CXX=g++-5;
+        export CC=gcc-5;
         export PATH=$CUDA_ROOT/bin:$PATH;
     fi
   - echo "$CMAKE_FLAGS"
@@ -68,15 +69,15 @@ install:
   # CMAKE                                                                     #
   #############################################################################
   - export PATH=$CMAKE_ROOT/bin:$PATH
-  - CMAKE_3_11_4_FOUND=$(cmake --version | grep " 3\.11\.4" >/dev/null && { echo 0; } || { echo 1; })
-  - if [ $CMAKE_3_11_4_FOUND -ne 0 ]; then
+  - CMAKE_3_15_0_FOUND=$(cmake --version | grep " 3\.15\.0" >/dev/null && { echo 0; } || { echo 1; })
+  - if [ $CMAKE_3_15_0_FOUND -ne 0 ]; then
       mkdir -p $CMAKE_ROOT &&
       cd $CMAKE_ROOT &&
       rm -rf $CMAKE_ROOT/* &&
-      travis_retry wget --no-check-certificate http://cmake.org/files/v3.11/cmake-3.11.4-Linux-x86_64.tar.gz &&
-      tar -xzf cmake-3.11.4-Linux-x86_64.tar.gz &&
-      mv cmake-3.11.4-Linux-x86_64/* . &&
-      rm -rf cmake-3.11.4-Linux-x86_64.tar.gz cmake-3.11.4-Linux-x86_64 &&
+      travis_retry wget --no-check-certificate http://cmake.org/files/v3.15/cmake-3.15.0-Linux-x86_64.tar.gz &&
+      tar -xzf cmake-3.15.0-Linux-x86_64.tar.gz &&
+      mv cmake-3.15.0-Linux-x86_64/* . &&
+      rm -rf cmake-3.15.0-Linux-x86_64.tar.gz cmake-3.15.0-Linux-x86_64 &&
       cd -;
     fi
   - cmake --version
@@ -92,13 +93,13 @@ install:
   - if [ $BOOST_FOUND -ne 0 ]; then
       mkdir -p $ BOOST_ROOT &&
       cd $BOOST_ROOT &&
-      travis_retry wget --no-check-certificate -O boost.tar.bz2 http://sourceforge.net/projects/boost/files/boost/1.62.0/boost_1_62_0.tar.bz2/download &&
+      travis_retry wget --no-check-certificate -O boost.tar.bz2 http://sourceforge.net/projects/boost/files/boost/1.65.1/boost_1_65_1.tar.bz2/download &&
       tar -xjf boost.tar.bz2 &&
-      cd boost_1_62_0 &&
+      cd boost_1_65_1 &&
       ./bootstrap.sh --with-libraries=atomic,chrono,context,date_time,system,thread --prefix=$BOOST_ROOT &&
       ./b2 -j2 &&
       ./b2 install &&
-      rm -rf boost.tar.bz2 boost_1_62_0 &&
+      rm -rf boost.tar.bz2 boost_1_65_1 &&
       cd $HOME;
     fi
   #############################################################################
@@ -108,10 +109,10 @@ install:
   - if [ $NVCC_FOUND -ne 0 ] && [ $COMPILER == "nvcc" ]; then
       mkdir -p $CUDA_ROOT &&
       cd $CUDA_ROOT &&
-      travis_retry wget https://developer.nvidia.com/compute/cuda/8.0/prod/local_installers/cuda_8.0.44_linux-run &&
-      chmod u+x ./cuda_8.0.44_linux-run &&
-      ./cuda_8.0.44_linux-run --override --silent --verbose --toolkit --toolkitpath=$CUDA_ROOT &&
-      rm -rf cuda_8.0.44_linux-run $CUDA_ROOT/{samples,jre,doc,share} &&
+      travis_retry wget https://developer.nvidia.com/compute/cuda/9.0/Prod/local_installers/cuda_9.0.176_384.81_linux-run &&
+      chmod u+x *-run &&
+      ./cuda_9.0.176_384.81_linux-run --override --silent --verbose --toolkit --toolkitpath=$CUDA_ROOT &&
+      rm -rf ./cuda_9.0.176_384.81_linux-run $CUDA_ROOT/{samples,jre,doc,share} &&
       cd -;
     fi
 
@@ -157,6 +158,16 @@ script:
       ./vectorAdd 100000;
     fi
   #############################################################################
+  # Example: cuplaVectorAdd (added elements layer)                            #
+  #############################################################################
+  - cd $HOME/cuplaVectorAdd
+  - cmake $TRAVIS_BUILD_DIR/example/CUDASamples/cuplaVectorAdd/ $CMAKE_FLAGS
+  - make
+  - if [ $STRATEGY == "CPU_B_OMP2_T_SEQ" ] ||
+       [ $STRATEGY == "CPU_B_SEQ_T_SEQ" ]; then
+      ./cuplaVectorAdd 100000;
+    fi
+  #############################################################################
   # Example: BlackScholes (adapted original)                                 #
   #############################################################################
   - cd $HOME/blackScholes
diff --git a/thirdParty/cupla/INSTALL.md b/thirdParty/cupla/INSTALL.md
index 964358bb24..a9e3c51bc1 100644
--- a/thirdParty/cupla/INSTALL.md
+++ b/thirdParty/cupla/INSTALL.md
@@ -4,17 +4,17 @@ cupla Install Guide
 Requirements
 ------------
 
-- **cmake**  3.11.4 or higher
+- **cmake**  3.15.0 or higher
   - *Debian/Ubuntu:* `sudo apt-get install cmake file cmake-curses-gui`
   - *Arch Linux:* `sudo pacman --sync cmake`
 
 - **cupla**
-  - https://github.com/ComputationalRadiationPhysics/cupla
+  - https://github.com/alpaka-group/cupla
   - `export CUPLA_ROOT=<cupla_SRC_CODE_DIR>`
   - `export CMAKE_PREFIX_PATH=$CUPLA_ROOT:$CMAKE_PREFIX_PATH`
   - example:
     - `mkdir -p $HOME/src`
-    - `git clone git://github.com/ComputationalRadiationPhysics/cupla.git $HOME/src/cupla`
+    - `git clone https://github.com/alpaka-group/cupla.git $HOME/src/cupla`
     - `cd $HOME/src/cupla`
     - `export CUPLA_ROOT=$HOME/src/cupla`
   - use a different alpaka installation:
@@ -48,7 +48,7 @@ How to update alpaka as git subtree?
 # git author is generic to not mess up contribution statistics
 GIT_AUTHOR_NAME="Third Party" GIT_AUTHOR_EMAIL="crp-git@hzdr.de" \
  git subtree pull --prefix alpaka \
- https://github.com/ComputationalRadiationPhysics/alpaka.git develop --squash
+ https://github.com/alpaka-group/alpaka.git develop --squash
 ```
 
 **How to commit local changes to alpaka upstream?**
diff --git a/thirdParty/cupla/README.md b/thirdParty/cupla/README.md
index 2f2f3a7060..06a8d78ff7 100644
--- a/thirdParty/cupla/README.md
+++ b/thirdParty/cupla/README.md
@@ -1,14 +1,14 @@
 **cupla** - C++ User interface for the Platform independent Library Alpaka
 ==========================================================================
 
-[![Build Status dev](https://img.shields.io/travis/ComputationalRadiationPhysics/cupla/dev.svg?label=dev)](https://travis-ci.org/ComputationalRadiationPhysics/cupla/branches)
+[![Build Status dev](https://img.shields.io/travis/alpaka-group/cupla/dev.svg?label=dev)](https://travis-ci.org/alpaka-group/cupla/branches)
 
 ![cupla Release](doc/logo/cupla_logo_320x210.png)
 
 **cupla** [[qχɑpˈlɑʔ]](https://en.wiktionary.org/wiki/Qapla%27) is a simple user
 interface for the platform independent parallel kernel
 acceleration library
-[**alpaka**](https://github.com/ComputationalRadiationPhysics/alpaka).
+[**alpaka**](https://github.com/alpaka-group/alpaka).
 It follows a similar concept as the
 [NVIDIA® CUDA® API](https://developer.nvidia.com/cuda-zone) by
 providing a software layer to manage accelerator devices.
@@ -38,8 +38,8 @@ For more information see [LICENSE.md](LICENSE.md).
 Dependencies
 ------------
 
-- **cmake 3.11.4**
-- **[alpaka 0.4.0](https://github.com/ComputationalRadiationPhysics/alpaka/)**
+- **cmake 3.15.0**
+- **[alpaka 0.5.0](https://github.com/alpaka-group/alpaka/)**
   - alpaka is loaded as `git subtree` within **cupla**, see [INSTALL.md](INSTALL.md)
 
 Usage
diff --git a/thirdParty/cupla/alpaka/.clang-format b/thirdParty/cupla/alpaka/.clang-format
new file mode 100644
index 0000000000..d2b6fb8a3d
--- /dev/null
+++ b/thirdParty/cupla/alpaka/.clang-format
@@ -0,0 +1,117 @@
+---
+# General options
+Language: Cpp
+Standard: c++17
+DisableFormat: false
+
+AccessModifierOffset: -4
+AlignAfterOpenBracket: AlwaysBreak
+AlignConsecutiveAssignments: false
+AlignConsecutiveDeclarations: false
+AlignConsecutiveMacros: false
+AlignEscapedNewlines: Right
+AlignOperands: false
+AlignTrailingComments: false
+AllowAllArgumentsOnNextLine: false
+AllowAllConstructorInitializersOnNextLine: false
+AllowAllParametersOfDeclarationOnNextLine: false
+AllowShortBlocksOnASingleLine: Never
+AllowShortCaseLabelsOnASingleLine: false
+AllowShortFunctionsOnASingleLine: None
+AllowShortIfStatementsOnASingleLine: Never
+AllowShortLambdasOnASingleLine: All
+AllowShortLoopsOnASingleLine: false
+AlwaysBreakAfterReturnType: None
+AlwaysBreakBeforeMultilineStrings: false
+AlwaysBreakTemplateDeclarations: Yes
+BinPackArguments: false
+BinPackParameters: false
+BreakBeforeBinaryOperators: All
+BreakBeforeBraces: Allman
+BreakBeforeTernaryOperators: true
+BreakConstructorInitializers: BeforeComma
+BreakInheritanceList: BeforeComma
+BreakStringLiterals: true
+ColumnLimit: 119
+CommentPragmas:  '^ COMMENT pragma:'
+CompactNamespaces: false
+ConstructorInitializerAllOnOneLineOrOnePerLine: true
+ConstructorInitializerIndentWidth: 4
+ContinuationIndentWidth: 4
+Cpp11BracedListStyle: true
+DeriveLineEnding: true
+DerivePointerAlignment: false
+ExperimentalAutoDetectBinPacking: false
+FixNamespaceComments: true
+IncludeBlocks: Regroup
+IncludeIsMainRegex: '(Test)?$'
+IncludeIsMainSourceRegex: ''
+IndentCaseLabels: false
+IndentGotoLabels: true
+IndentPPDirectives: AfterHash
+IndentWidth: 4
+IndentWrappedFunctionNames: false
+KeepEmptyLinesAtTheStartOfBlocks: false
+MacroBlockBegin: ''
+MacroBlockEnd:   ''
+MaxEmptyLinesToKeep: 2
+NamespaceIndentation: All
+PenaltyBreakAssignment: 2
+PenaltyBreakBeforeFirstCallParameter: 19
+PenaltyBreakComment: 300
+PenaltyBreakFirstLessLess: 120
+PenaltyBreakString: 1000
+PenaltyBreakTemplateDeclaration: 10
+PenaltyExcessCharacter: 1000000
+PenaltyReturnTypeOnItsOwnLine: 1000
+PointerAlignment: Left
+ReflowComments: true
+SortIncludes: true
+SortUsingDeclarations: true
+SpaceAfterCStyleCast: true
+SpaceAfterLogicalNot: false
+SpaceAfterTemplateKeyword: false
+SpaceBeforeAssignmentOperators: true
+SpaceBeforeCpp11BracedList: false
+SpaceBeforeCtorInitializerColon: true
+SpaceBeforeInheritanceColon: true
+SpaceBeforeParens: Never
+SpaceBeforeRangeBasedForLoopColon: true
+SpaceInEmptyBlock: false
+SpaceInEmptyParentheses: false
+SpacesBeforeTrailingComments: 1
+SpacesInAngles:  false
+SpacesInConditionalStatement: false
+SpacesInContainerLiterals: false
+SpacesInCStyleCastParentheses: false
+SpacesInParentheses: false
+SpacesInSquareBrackets: false
+SpaceBeforeSquareBrackets: false
+TabWidth: 4
+UseCRLF: false
+UseTab: Never
+
+# Project specific options
+IncludeCategories:
+  # Local headers (in "") above all else; '-' is last in each class so it is a literal hyphen
+  - Regex: '"([A-Za-z0-9.\/_-])+"'
+    Priority: 1
+  # <alpaka/foo.hpp> after local headers
+  - Regex: '<alpaka/([A-Za-z0-9.\/_-])+>'
+    Priority: 2
+  # C++ standard library headers (no '.' in the name) are the last group to be included
+  - Regex: '<([A-Za-z0-9\/_-])+>'
+    Priority: 4
+  # Includes that made it this far are third-party headers and will be placed
+  # below alpaka's includes
+  - Regex: '<([A-Za-z0-9.\/_-])+>'
+    Priority: 3
+
+# Future options - not supported in clang-format 11
+# AlignConsecutiveBitFields: false
+# AllowShortEnumsOnASingleLine: false
+# BitFieldColonSpacing: Both
+# IndentCaseBlocks: true
+# IndentExternBlock: AfterExternBlock
+# OperandAlignmentStyle: Align
+...
diff --git a/thirdParty/cupla/alpaka/.github/workflows/ci.yml b/thirdParty/cupla/alpaka/.github/workflows/ci.yml
new file mode 100644
index 0000000000..383206bbd3
--- /dev/null
+++ b/thirdParty/cupla/alpaka/.github/workflows/ci.yml
@@ -0,0 +1,537 @@
+#
+# Copyright 2015-2020 Benjamin Worpitz
+#
+# This file is part of alpaka.
+#
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+#
+
+name: Continuous Integration
+
+on: [push, pull_request]
+
+################################################################################
+# NOTE: Testing the full matrix is not practical.
+# Therefore we aim to have each value set in at least one job.
+# CXX                                           : {g++, clang++}
+#   [g++] ALPAKA_CI_GCC_VER                     : {5, 6, 7, 8, 9, 10}
+#   [clang++] ALPAKA_CI_CLANG_VER               : {4.0, 5.0, 6.0, 7, 8, 9, 10}
+#   [cl.exe] ALPAKA_CI_CL_VER                   : {2017, 2019}
+#   ALPAKA_CI_STDLIB                            : {libstdc++, [CXX==clang++]:libc++}
+# CMAKE_BUILD_TYPE                              : {Debug, Release}
+# ALPAKA_CI                                     : {GITHUB}
+# ALPAKA_CI_DOCKER_BASE_IMAGE_NAME              : {ubuntu:16.04, ubuntu:18.04, ubuntu:20.04}
+# ALPAKA_CI_BOOST_BRANCH                        : {boost-1.65.1, boost-1.66.0, boost-1.67.0, boost-1.68.0, boost-1.69.0, boost-1.70.0, boost-1.71.0, boost-1.72.0, boost-1.73.0, boost-1.74.0}
+# ALPAKA_CI_CMAKE_VER                           : {3.15.7, 3.16.9, 3.17.5, 3.18.5, 3.19.0}
+# ALPAKA_CI_SANITIZERS                          : {ASan, UBSan, TSan}
+#    TSan is not currently used because it produces many unexpected errors
+# ALPAKA_CI_ANALYSIS                            : {ON, OFF}
+# ALPAKA_DEBUG                                  : {0, 1, 2}
+# ALPAKA_ACC_GPU_CUDA_ONLY_MODE                 : {ON, OFF}
+# ALPAKA_ACC_GPU_HIP_ONLY_MODE                  : {ON, OFF}
+# ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLE             : {ON, OFF}
+# ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLE         : {ON, OFF}
+# ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE          : {ON, OFF}
+# ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE            : {ON, OFF}
+#   [ON] OMP_NUM_THREADS                        : {1, 2, 3, 4}
+# ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE            : {ON, OFF}
+#   [ON] OMP_NUM_THREADS                        : {1, 2, 3, 4}
+# ALPAKA_ACC_ANY_BT_OMP5_ENABLE                 : {ON, OFF}
+#   [ON] OMP_NUM_THREADS                        : {1, 2, 3, 4}
+# ALPAKA_ACC_GPU_CUDA_ENABLE                    : {ON, OFF}
+#   [ON] ALPAKA_CUDA_VERSION                    : {9.0, 9.1, 9.2, 10.0, 10.1, 10.2, 11.0, 11.1}
+#   [ON] ALPAKA_CUDA_COMPILER                   : {nvcc, [CXX==clang++]:clang}
+# ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLE             : {ON, OFF}
+# ALPAKA_ACC_GPU_HIP_ENABLE                     : {ON, OFF}
+#   [ON] ALPAKA_CI_HIP_BRANCH                   : {rocm-3.5.0}
+#   [ON] ALPAKA_HIP_PLATFORM                    : {nvcc}
+
+env:  # Workflow-wide defaults; the ci job sets `env: ${{ matrix.env }}` per matrix entry
+  ALPAKA_CI: GITHUB
+  TBB_ROOT: tbb
+  BOOST_ROOT: ${{ github.workspace }}/boost
+  ALPAKA_CI_BOOST_LIB_DIR: ${{ github.workspace }}/boost_libs
+  ALPAKA_CI_CMAKE_DIR: ${{ github.workspace }}/CMake
+  ALPAKA_CI_CUDA_DIR: ${{ github.workspace }}/CUDA
+  ALPAKA_CI_HIP_ROOT_DIR: ${{ github.workspace }}/hip
+  ALPAKA_CI_SANITIZERS: ""  # explicitly empty by default; matrix entries set e.g. UBSan or ASan
+  ALPAKA_CI_ANALYSIS: OFF
+  ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLE: ON
+  ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLE: ON
+  ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: ON
+  ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLE: ON
+  ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE: ON
+  ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE: ON
+  ALPAKA_ACC_ANY_BT_OMP5_ENABLE: OFF
+  ALPAKA_ACC_ANY_BT_OACC_ENABLE: OFF
+  ALPAKA_ACC_GPU_CUDA_ENABLE: OFF
+  ALPAKA_ACC_GPU_CUDA_ONLY_MODE: OFF
+  ALPAKA_ACC_GPU_HIP_ENABLE: OFF
+  ALPAKA_ACC_GPU_HIP_ONLY_MODE: OFF
+
+jobs:
+  clang-format:  # Formatting lint job; defined separately from the matrix-driven ci job
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/checkout@v2
+    - uses: DoozyX/clang-format-lint-action@v0.11
+      with:
+        clangFormatVersion: 11  # passed to the lint action; .clang-format notes its future options are unsupported in clang-format 11
+  ci:
+    name: ${{ matrix.name }}
+    runs-on: ${{ matrix.os }}
+    env: ${{ matrix.env }}
+
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+        ### Analysis builds
+        - name: linux_gcc-10_debug_analysis_omp5
+          os: ubuntu-latest
+          env: {CXX: g++,     CC: gcc,    ALPAKA_CI_GCC_VER: 10,       ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Debug,   ALPAKA_CI_BOOST_BRANCH: boost-1.74.0, ALPAKA_CI_CMAKE_VER: 3.18.5, OMP_NUM_THREADS: 4, ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:20.04", ALPAKA_CI_ANALYSIS: ON, ALPAKA_DEBUG: 2, ALPAKA_ACC_ANY_BT_OMP5_ENABLE: ON, CMAKE_CXX_FLAGS: "-foffload=disable"}
+        - name: linux_gcc-8_debug_analysis
+          os: ubuntu-latest
+          env: {CXX: g++,     CC: gcc,    ALPAKA_CI_GCC_VER: 8,        ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Debug,   ALPAKA_CI_BOOST_BRANCH: boost-1.66.0, ALPAKA_CI_CMAKE_VER: 3.19.0, OMP_NUM_THREADS: 4, ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:20.04", ALPAKA_CI_ANALYSIS: ON, ALPAKA_DEBUG: 2}
+        - name: linux_clang-8_debug_analysis
+          os: ubuntu-latest
+          env: {CXX: clang++, CC: clang,  ALPAKA_CI_CLANG_VER: 8,      ALPAKA_CI_STDLIB: libc++,    CMAKE_BUILD_TYPE: Debug,   ALPAKA_CI_BOOST_BRANCH: boost-1.73.0, ALPAKA_CI_CMAKE_VER: 3.17.5, OMP_NUM_THREADS: 4, ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:18.04", ALPAKA_CI_ANALYSIS: ON, ALPAKA_DEBUG: 2}
+        - name: linux_nvcc-9.1_gcc-5_debug_analysis
+          os: ubuntu-latest
+          env: {CXX: g++,     CC: gcc,    ALPAKA_CI_GCC_VER: 5,        ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Debug,   ALPAKA_CI_BOOST_BRANCH: boost-1.65.1, ALPAKA_CI_CMAKE_VER: 3.15.7, OMP_NUM_THREADS: 4, ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:16.04", ALPAKA_CI_ANALYSIS: ON, ALPAKA_DEBUG: 2, ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "9.1", ALPAKA_CUDA_COMPILER: nvcc,        ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF}
+        - name: linux_clang-9_cuda-9.2_debug_analysis
+          os: ubuntu-latest
+          env: {CXX: clang++, CC: clang,  ALPAKA_CI_CLANG_VER: 9,      ALPAKA_CI_STDLIB: libc++,    CMAKE_BUILD_TYPE: Debug,   ALPAKA_CI_BOOST_BRANCH: boost-1.69.0, ALPAKA_CI_CMAKE_VER: 3.17.5, OMP_NUM_THREADS: 4, ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:18.04", ALPAKA_CI_ANALYSIS: ON, ALPAKA_DEBUG: 1, ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "9.2", ALPAKA_CUDA_COMPILER: clang,       ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF, ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE: OFF, ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE: OFF}
+        - name: windows_cl-2019_debug_analysis
+          os: windows-2019
+          env: {CXX: cl.exe,  CC: cl.exe, ALPAKA_CI_CL_VER: 2019,                                   CMAKE_BUILD_TYPE: Debug,   ALPAKA_CI_BOOST_BRANCH: boost-1.74.0, ALPAKA_CI_CMAKE_VER: 3.16.5,                     ALPAKA_CI_ANALYSIS: ON, ALPAKA_DEBUG: 2,                                                                                                                                                  ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF}
+        - name: macos_xcode-11.3_debug_analysis
+          os: macos-10.15
+          env: {CXX: clang++, CC: clang,  ALPAKA_CI_XCODE_VER: 11.3.1,                              CMAKE_BUILD_TYPE: Debug,   ALPAKA_CI_BOOST_BRANCH: boost-1.65.1,                                                  ALPAKA_CI_ANALYSIS: ON, ALPAKA_DEBUG: 2,                                                                                                                                                  ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF, ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE: OFF, ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE: OFF}
+
+        ### macOS
+        - name: macos_xcode-11.2.1_debug
+          os: macos-10.15
+          env: {CXX: clang++, CC: clang,  ALPAKA_CI_XCODE_VER: 11.2.1,                              CMAKE_BUILD_TYPE: Debug,   ALPAKA_CI_BOOST_BRANCH: boost-1.74.0,                                                  ALPAKA_CXX_STANDARD: 17,                                                                                                                                                                  ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF, ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE: OFF, ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE: OFF}
+        - name: macos_xcode-11.3.1_release
+          os: macos-10.15
+          env: {CXX: clang++, CC: clang,  ALPAKA_CI_XCODE_VER: 11.3.1,                              CMAKE_BUILD_TYPE: Release, ALPAKA_CI_BOOST_BRANCH: boost-1.65.1,                                                                                                                                                                                                                                            ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF, ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE: OFF, ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE: OFF}
+        - name: macos_xcode-11.4.1_debug
+          os: macos-10.15
+          env: {CXX: clang++, CC: clang,  ALPAKA_CI_XCODE_VER: 11.4.1,                              CMAKE_BUILD_TYPE: Debug,   ALPAKA_CI_BOOST_BRANCH: boost-1.67.0,                                                  ALPAKA_CXX_STANDARD: 17,                                                                                                                                                                  ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF, ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE: OFF, ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE: OFF}
+        - name: macos_xcode-11.5.0_release
+          os: macos-10.15
+          env: {CXX: clang++, CC: clang,  ALPAKA_CI_XCODE_VER: 11.5.0,                              CMAKE_BUILD_TYPE: Release, ALPAKA_CI_BOOST_BRANCH: boost-1.73.0,                                                                                                                                                                                                                                            ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF, ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE: OFF, ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE: OFF}
+        - name: macos_xcode-11.6.0_debug
+          os: macos-10.15
+          env: {CXX: clang++, CC: clang,  ALPAKA_CI_XCODE_VER: 11.6.0,                              CMAKE_BUILD_TYPE: Debug,   ALPAKA_CI_BOOST_BRANCH: boost-1.74.0,                                                                                                                                                                                                                                            ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF, ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE: OFF, ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE: OFF}
+        - name: macos_xcode-11.7.0_release  # name matches ALPAKA_CI_XCODE_VER below
+          os: macos-10.15
+          env: {CXX: clang++, CC: clang,  ALPAKA_CI_XCODE_VER: 11.7.0,                              CMAKE_BUILD_TYPE: Release, ALPAKA_CI_BOOST_BRANCH: boost-1.67.0,                                                  ALPAKA_CXX_STANDARD: 17,                                                                                                                                                                  ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF, ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE: OFF, ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE: OFF}
+        - name: macos_xcode-12.0.1_debug
+          os: macos-10.15
+          env: {CXX: clang++, CC: clang,  ALPAKA_CI_XCODE_VER: 12.0.1,                              CMAKE_BUILD_TYPE: Debug,   ALPAKA_CI_BOOST_BRANCH: boost-1.66.0,                                                  ALPAKA_CXX_STANDARD: 17,                                                                                                                                                                  ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF, ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE: OFF, ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE: OFF}
+        - name: macos_xcode-12.1.1_release
+          os: macos-10.15
+          env: {CXX: clang++, CC: clang,  ALPAKA_CI_XCODE_VER: 12.1.1,                              CMAKE_BUILD_TYPE: Release, ALPAKA_CI_BOOST_BRANCH: boost-1.72.0,                                                                                                                                                                                                                                            ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF, ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE: OFF, ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE: OFF}
+        - name: macos_xcode-12.2.0_debug
+          os: macos-10.15
+          env: {CXX: clang++, CC: clang,  ALPAKA_CI_XCODE_VER: 12.2.0,                              CMAKE_BUILD_TYPE: Debug,   ALPAKA_CI_BOOST_BRANCH: boost-1.74.0,                                                                                                                                                                                                                                            ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF, ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE: OFF, ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE: OFF}
+
+        ### Windows
+        - name: windows_cl-2017_release
+          os: windows-2016
+          env: {CXX: cl.exe,  CC: cl.exe, ALPAKA_CI_CL_VER: 2017,                                   CMAKE_BUILD_TYPE: Release, ALPAKA_CI_BOOST_BRANCH: boost-1.74.0, ALPAKA_CI_CMAKE_VER: 3.16.5, OMP_NUM_THREADS: 4,                                                                                                                                                                                           ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF, ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE: OFF, ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE: OFF, ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLE: OFF}
+        - name: windows_cl-2017_debug
+          os: windows-2016
+          env: {CXX: cl.exe,  CC: cl.exe, ALPAKA_CI_CL_VER: 2017,                                   CMAKE_BUILD_TYPE: Debug,   ALPAKA_CI_BOOST_BRANCH: boost-1.65.1, ALPAKA_CI_CMAKE_VER: 3.18.5, OMP_NUM_THREADS: 3,                                                                                                                                                                                           ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF, ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE: OFF, ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE: OFF, ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLE: OFF,  ALPAKA_CXX_STANDARD: 17}
+        - name: windows_cl-2019_release
+          os: windows-2019
+          env: {CXX: cl.exe,  CC: cl.exe, ALPAKA_CI_CL_VER: 2019,                                   CMAKE_BUILD_TYPE: Release, ALPAKA_CI_BOOST_BRANCH: boost-1.70.0, ALPAKA_CI_CMAKE_VER: 3.17.5, OMP_NUM_THREADS: 1,                                                                                                                                                                                           ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF, ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLE: OFF}
+        - name: windows_cl-2019_debug
+          os: windows-2019
+          env: {CXX: cl.exe,  CC: cl.exe, ALPAKA_CI_CL_VER: 2019,                                   CMAKE_BUILD_TYPE: Debug,   ALPAKA_CI_BOOST_BRANCH: boost-1.72.0, ALPAKA_CI_CMAKE_VER: 3.19.0, OMP_NUM_THREADS: 4,                                                                                                                                                                                           ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF, ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLE: OFF}
+
+        ## CUDA 10.0
+        # nvcc + MSVC
+        - name: windows_nvcc-10.0_cl-2017_release_cuda-only_separable-compilation
+          os: windows-2016
+          env: {CXX: cl.exe,  CC: cl.exe, ALPAKA_CI_CL_VER: 2017,                                   CMAKE_BUILD_TYPE: Release, ALPAKA_CI_BOOST_BRANCH: boost-1.67.0, ALPAKA_CI_CMAKE_VER: 3.17.5,                     ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "10.0", ALPAKA_CUDA_ARCH: "30;75", ALPAKA_ACC_GPU_CUDA_ONLY_MODE: ON, ALPAKA_CUDA_NVCC_SEPARABLE_COMPILATION: ON,                    ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF, ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLE: OFF, ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLE: OFF, ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLE: OFF, ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE: OFF, ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE: OFF}
+        - name: windows_nvcc-10.0_cl-2017_debug
+          os: windows-2016
+          env: {CXX: cl.exe,  CC: cl.exe, ALPAKA_CI_CL_VER: 2017,                                   CMAKE_BUILD_TYPE: Debug,   ALPAKA_CI_BOOST_BRANCH: boost-1.69.0, ALPAKA_CI_CMAKE_VER: 3.16.5,                     ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "10.0",                                                                                                                              ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF, ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLE: OFF}
+
+        ## CUDA 10.1
+        # nvcc + MSVC
+        - name: windows_nvcc-10.1_cl-2017_debug_cuda-only
+          os: windows-2016
+          env: {CXX: cl.exe,  CC: cl.exe, ALPAKA_CI_CL_VER: 2017,                                   CMAKE_BUILD_TYPE: Debug,   ALPAKA_CI_BOOST_BRANCH: boost-1.70.0, ALPAKA_CI_CMAKE_VER: 3.19.0,                     ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "10.1", ALPAKA_CUDA_ARCH: "30;75", ALPAKA_ACC_GPU_CUDA_ONLY_MODE: ON,                                                                ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF, ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLE: OFF, ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLE: OFF, ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLE: OFF, ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE: OFF, ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE: OFF}
+        - name: windows_nvcc-10.1_cl-2017_release
+          os: windows-2016
+          env: {CXX: cl.exe,  CC: cl.exe, ALPAKA_CI_CL_VER: 2017,                                   CMAKE_BUILD_TYPE: Release, ALPAKA_CI_BOOST_BRANCH: boost-1.65.1, ALPAKA_CI_CMAKE_VER: 3.18.5,                     ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "10.1",                                                                                                                              ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF, ALPAKA_ACC_ANY_BT_OMP5_ENABLE: OFF, ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLE: OFF}
+        - name: windows_nvcc-10.1_cl-2019_debug_cuda-only
+          os: windows-2019
+          env: {CXX: cl.exe,  CC: cl.exe, ALPAKA_CI_CL_VER: 2019,                                   CMAKE_BUILD_TYPE: Debug,   ALPAKA_CI_BOOST_BRANCH: boost-1.70.0, ALPAKA_CI_CMAKE_VER: 3.16.5,                     ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "10.1", ALPAKA_CUDA_ARCH: "30;75", ALPAKA_ACC_GPU_CUDA_ONLY_MODE: ON,                                                                ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF, ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLE: OFF, ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLE: OFF, ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLE: OFF, ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE: OFF, ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE: OFF, ALPAKA_ACC_ANY_BT_OMP5_ENABLE: OFF}
+        - name: windows_nvcc-10.1_cl-2019_release
+          os: windows-2019
+          env: {CXX: cl.exe,  CC: cl.exe, ALPAKA_CI_CL_VER: 2019,                                   CMAKE_BUILD_TYPE: Release, ALPAKA_CI_BOOST_BRANCH: boost-1.65.1, ALPAKA_CI_CMAKE_VER: 3.18.5,                     ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "10.1",                                                                                                                              ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF, ALPAKA_ACC_ANY_BT_OMP5_ENABLE: OFF, ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLE: OFF}
+
+        ## CUDA 10.2
+        # nvcc + MSVC
+        - name: windows_nvcc-10.2_cl-2017_debug_cuda-only
+          os: windows-2016
+          env: {CXX: cl.exe,  CC: cl.exe, ALPAKA_CI_CL_VER: 2017,                                   CMAKE_BUILD_TYPE: Debug,   ALPAKA_CI_BOOST_BRANCH: boost-1.74.0, ALPAKA_CI_CMAKE_VER: 3.17.5,                     ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "10.2", ALPAKA_CUDA_ARCH: "30;75", ALPAKA_ACC_GPU_CUDA_ONLY_MODE: ON,                                                                ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF, ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLE: OFF, ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLE: OFF, ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLE: OFF, ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE: OFF, ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE: OFF}
+        - name: windows_nvcc-10.2_cl-2017_release
+          os: windows-2016
+          env: {CXX: cl.exe,  CC: cl.exe, ALPAKA_CI_CL_VER: 2017,                                   CMAKE_BUILD_TYPE: Release, ALPAKA_CI_BOOST_BRANCH: boost-1.68.0, ALPAKA_CI_CMAKE_VER: 3.16.5,                     ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "10.2",                                                                                                                              ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF, ALPAKA_ACC_ANY_BT_OMP5_ENABLE: OFF, ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLE: OFF}
+        - name: windows_nvcc-10.2_cl-2019_debug_cuda-only
+          os: windows-2019
+          env: {CXX: cl.exe,  CC: cl.exe, ALPAKA_CI_CL_VER: 2019,                                   CMAKE_BUILD_TYPE: Debug,   ALPAKA_CI_BOOST_BRANCH: boost-1.73.0, ALPAKA_CI_CMAKE_VER: 3.17.5,                     ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "10.2", ALPAKA_CUDA_ARCH: "30;75", ALPAKA_ACC_GPU_CUDA_ONLY_MODE: ON,                                                                ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF, ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLE: OFF, ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLE: OFF, ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLE: OFF, ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE: OFF, ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE: OFF, ALPAKA_ACC_ANY_BT_OMP5_ENABLE: OFF}
+        - name: windows_nvcc-10.2_cl-2019_release
+          os: windows-2019
+          env: {CXX: cl.exe,  CC: cl.exe, ALPAKA_CI_CL_VER: 2019,                                   CMAKE_BUILD_TYPE: Release, ALPAKA_CI_BOOST_BRANCH: boost-1.68.0, ALPAKA_CI_CMAKE_VER: 3.19.0,                     ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "10.2",                                                                                                                              ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF, ALPAKA_ACC_ANY_BT_OMP5_ENABLE: OFF, ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLE: OFF}
+
+        ## CUDA 11.0
+        # nvcc + MSVC
+        - name: windows_nvcc-11.0_cl-2017_debug
+          os: windows-2016
+          env: {CXX: cl.exe,  CC: cl.exe, ALPAKA_CI_CL_VER: 2017,                                   CMAKE_BUILD_TYPE: Debug,   ALPAKA_CI_BOOST_BRANCH: boost-1.71.0, ALPAKA_CI_CMAKE_VER: 3.18.5,                     ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "11.0",                                                                                                                              ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF, ALPAKA_ACC_ANY_BT_OMP5_ENABLE: OFF, ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLE: OFF}
+        - name: windows_nvcc-11.0_cl-2017_release_cuda-only
+          os: windows-2016
+          env: {CXX: cl.exe,  CC: cl.exe, ALPAKA_CI_CL_VER: 2017,                                   CMAKE_BUILD_TYPE: Release, ALPAKA_CI_BOOST_BRANCH: boost-1.72.0, ALPAKA_CI_CMAKE_VER: 3.18.5,                     ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "11.0", ALPAKA_CUDA_ARCH: "35;80", ALPAKA_ACC_GPU_CUDA_ONLY_MODE: ON,                                                                ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF, ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLE: OFF, ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLE: OFF, ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLE: OFF, ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE: OFF, ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE: OFF, ALPAKA_ACC_ANY_BT_OMP5_ENABLE: OFF}
+
+        ## CUDA 11.1
+        # nvcc + MSVC
+        - name: windows_nvcc-11.1_cl-2017_release
+          os: windows-2016
+          env: {CXX: cl.exe,  CC: cl.exe, ALPAKA_CI_CL_VER: 2017,                                   CMAKE_BUILD_TYPE: Release, ALPAKA_CI_BOOST_BRANCH: boost-1.71.0, ALPAKA_CI_CMAKE_VER: 3.19.0,                     ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "11.1",                                                                                                                              ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF, ALPAKA_ACC_ANY_BT_OMP5_ENABLE: OFF, ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLE: OFF}
+        - name: windows_nvcc-11.1_cl-2017_debug_cuda-only
+          os: windows-2016
+          env: {CXX: cl.exe,  CC: cl.exe, ALPAKA_CI_CL_VER: 2017,                                   CMAKE_BUILD_TYPE: Debug,   ALPAKA_CI_BOOST_BRANCH: boost-1.72.0, ALPAKA_CI_CMAKE_VER: 3.18.5,                     ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "11.1", ALPAKA_CUDA_ARCH: "35;80", ALPAKA_ACC_GPU_CUDA_ONLY_MODE: ON,                                                                ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF, ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLE: OFF, ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLE: OFF, ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLE: OFF, ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE: OFF, ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE: OFF, ALPAKA_ACC_ANY_BT_OMP5_ENABLE: OFF}
+        - name: windows_nvcc-11.1_cl-2019_release_cuda-only
+          os: windows-2019
+          env: {CXX: cl.exe,  CC: cl.exe, ALPAKA_CI_CL_VER: 2019,                                   CMAKE_BUILD_TYPE: Release, ALPAKA_CI_BOOST_BRANCH: boost-1.74.0, ALPAKA_CI_CMAKE_VER: 3.19.0,                     ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "11.1", ALPAKA_CUDA_ARCH: "35;75", ALPAKA_ACC_GPU_CUDA_ONLY_MODE: ON,                                                                ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF, ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLE: OFF, ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLE: OFF, ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLE: OFF, ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE: OFF, ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE: OFF, ALPAKA_ACC_ANY_BT_OMP5_ENABLE: OFF}
+        - name: windows_nvcc-11.1_cl-2019_debug  # uses ALPAKA_ACC_ANY_BT_OMP5_ENABLE like the other nvcc-11.x entries
+          os: windows-2019
+          env: {CXX: cl.exe,  CC: cl.exe, ALPAKA_CI_CL_VER: 2019,                                   CMAKE_BUILD_TYPE: Debug,   ALPAKA_CI_BOOST_BRANCH: boost-1.68.0, ALPAKA_CI_CMAKE_VER: 3.18.5,                     ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "11.1",                                                                                                                              ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF, ALPAKA_ACC_ANY_BT_OMP5_ENABLE: OFF, ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLE: OFF}
+
+        ### Ubuntu
+        ## native
+        # g++
+        # We cannot enable UBSan when using gcc because it does not have a -fsanitize-blacklist option to suppress errors in boost etc.
+        # gcc 6 ASan is triggered within libtbb.so
+        # gcc 7 ASan introduced 'stack-use-after-scope' which is triggered by GOMP_parallel
+        - name: linux_gcc-5_release
+          os: ubuntu-latest
+          env: {CXX: g++,     CC: gcc,    ALPAKA_CI_GCC_VER: 5,        ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Release, ALPAKA_CI_BOOST_BRANCH: boost-1.66.0, ALPAKA_CI_CMAKE_VER: 3.15.7, OMP_NUM_THREADS: 3, ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:18.04"}
+        - name: linux_gcc-6_debug_c++17
+          os: ubuntu-latest
+          env: {CXX: g++,     CC: gcc,    ALPAKA_CI_GCC_VER: 6,        ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Debug,   ALPAKA_CI_BOOST_BRANCH: boost-1.70.0, ALPAKA_CI_CMAKE_VER: 3.16.5, OMP_NUM_THREADS: 2, ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:18.04", ALPAKA_CXX_STANDARD: 17, CMAKE_CXX_EXTENSIONS: OFF}
+        - name: linux_gcc-7_release
+          os: ubuntu-latest
+          env: {CXX: g++,     CC: gcc,    ALPAKA_CI_GCC_VER: 7,        ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Release, ALPAKA_CI_BOOST_BRANCH: boost-1.65.1, ALPAKA_CI_CMAKE_VER: 3.17.5, OMP_NUM_THREADS: 1, ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:20.04"}
+        - name: linux_gcc-8_debug
+          os: ubuntu-latest
+          env: {CXX: g++,     CC: gcc,    ALPAKA_CI_GCC_VER: 8,        ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Debug,   ALPAKA_CI_BOOST_BRANCH: boost-1.72.0, ALPAKA_CI_CMAKE_VER: 3.18.5, OMP_NUM_THREADS: 4, ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:16.04", CMAKE_CXX_EXTENSIONS: OFF}
+        - name: linux_gcc-9_debug_c++17
+          os: ubuntu-latest
+          env: {CXX: g++,     CC: gcc,    ALPAKA_CI_GCC_VER: 9,        ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Debug,   ALPAKA_CI_BOOST_BRANCH: boost-1.68.0, ALPAKA_CI_CMAKE_VER: 3.15.7, OMP_NUM_THREADS: 3, ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:20.04", ALPAKA_CXX_STANDARD: 17}
+        - name: linux_gcc-10_release
+          os: ubuntu-latest
+          env: {CXX: g++,     CC: gcc,    ALPAKA_CI_GCC_VER: 10,       ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Release, ALPAKA_CI_BOOST_BRANCH: boost-1.74.0, ALPAKA_CI_CMAKE_VER: 3.19.0, OMP_NUM_THREADS: 2, ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:20.04"}
+        - name: linux_gcc-10_release_oacc
+          os: ubuntu-latest
+          env: {CXX: g++,     CC: gcc,    ALPAKA_CI_GCC_VER: 10,       ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Release, ALPAKA_CI_BOOST_BRANCH: boost-1.74.0, ALPAKA_CI_CMAKE_VER: 3.17.5, OMP_NUM_THREADS: 2, ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:20.04", CMAKE_CXX_FLAGS: "-foffload=disable", ALPAKA_ACC_ANY_BT_OACC_ENABLE: ON, ALPAKA_OFFLOAD_MAX_BLOCK_SIZE: 1, ACC_DEVICE_TYPE: "host", ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLE: OFF, ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF, ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLE: OFF, ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE: OFF, ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE: OFF}
+
+        # clang++
+        - name: linux_clang-4_debug_ubsan
+          os: ubuntu-latest
+          env: {CXX: clang++, CC: clang,  ALPAKA_CI_CLANG_VER: "4.0",  ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Debug,   ALPAKA_CI_BOOST_BRANCH: boost-1.69.0, ALPAKA_CI_CMAKE_VER: 3.19.0, OMP_NUM_THREADS: 4, ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:16.04", ALPAKA_CI_SANITIZERS: UBSan}
+        - name: linux_clang-5_debug_c++17
+          os: ubuntu-latest
+          env: {CXX: clang++, CC: clang,  ALPAKA_CI_CLANG_VER: "5.0",  ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Debug,   ALPAKA_CI_BOOST_BRANCH: boost-1.68.0, ALPAKA_CI_CMAKE_VER: 3.15.7, OMP_NUM_THREADS: 3, ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:18.04", ALPAKA_CXX_STANDARD: 17}
+        - name: linux_clang-6_release_asan_c++17
+          os: ubuntu-latest
+          env: {CXX: clang++, CC: clang,  ALPAKA_CI_CLANG_VER: "6.0",  ALPAKA_CI_STDLIB: libc++,    CMAKE_BUILD_TYPE: Release, ALPAKA_CI_BOOST_BRANCH: boost-1.65.1, ALPAKA_CI_CMAKE_VER: 3.17.5, OMP_NUM_THREADS: 2, ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:18.04", ALPAKA_CI_SANITIZERS: ASan, ALPAKA_CXX_STANDARD: 17}
+        - name: linux_clang-7_release_c++17
+          os: ubuntu-latest
+          env: {CXX: clang++, CC: clang,  ALPAKA_CI_CLANG_VER: 7,      ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Release, ALPAKA_CI_BOOST_BRANCH: boost-1.67.0, ALPAKA_CI_CMAKE_VER: 3.16.5, OMP_NUM_THREADS: 1, ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:20.04", ALPAKA_CXX_STANDARD: 17, CMAKE_CXX_EXTENSIONS: OFF}
+        - name: linux_clang-8_release
+          os: ubuntu-latest
+          env: {CXX: clang++, CC: clang,  ALPAKA_CI_CLANG_VER: 8,      ALPAKA_CI_STDLIB: libc++,    CMAKE_BUILD_TYPE: Release, ALPAKA_CI_BOOST_BRANCH: boost-1.74.0, ALPAKA_CI_CMAKE_VER: 3.18.5, OMP_NUM_THREADS: 4, ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:18.04", CMAKE_CXX_EXTENSIONS: OFF}
+        - name: linux_clang-9_debug
+          os: ubuntu-latest
+          env: {CXX: clang++, CC: clang,  ALPAKA_CI_CLANG_VER: 9,      ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Debug,   ALPAKA_CI_BOOST_BRANCH: boost-1.71.0, ALPAKA_CI_CMAKE_VER: 3.16.5, OMP_NUM_THREADS: 1, ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:18.04"}
+        - name: linux_clang-10_release_omp5
+          os: ubuntu-latest
+          env: {CXX: clang++, CC: clang,  ALPAKA_CI_CLANG_VER: 10,     ALPAKA_CI_STDLIB: libc++,    CMAKE_BUILD_TYPE: Release, ALPAKA_CI_BOOST_BRANCH: boost-1.73.0, ALPAKA_CI_CMAKE_VER: 3.15.7, OMP_NUM_THREADS: 4, ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:20.04", CMAKE_CXX_FLAGS: "-fopenmp=libomp -fopenmp-targets=x86_64-pc-linux-gnu -Wno-openmp-mapping", ALPAKA_ACC_ANY_BT_OMP5_ENABLE: ON, ALPAKA_OFFLOAD_MAX_BLOCK_SIZE: 1, CMAKE_EXE_LINKER_FLAGS: "-fopenmp"}
+
+        ## CUDA 9.0 -- every job in this group sets ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "9.0" and ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF
+        # nvcc + g++ -- ALPAKA_CUDA_COMPILER: nvcc with CXX: g++ / CC: gcc
+        - name: linux_nvcc-9.0_gcc-5_debug
+          os: ubuntu-latest
+          env: {CXX: g++,     CC: gcc,    ALPAKA_CI_GCC_VER: 5,        ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Debug,   ALPAKA_CI_BOOST_BRANCH: boost-1.65.1, ALPAKA_CI_CMAKE_VER: 3.16.5,                     ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:18.04", ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "9.0", ALPAKA_CUDA_COMPILER: nvcc, ALPAKA_CUDA_ARCH: "70",                         ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF}
+        # clang++ -- ALPAKA_CUDA_COMPILER: clang; these jobs also set ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE and ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE to OFF
+        - name: linux_clang-6_cuda-9.0_debug
+          os: ubuntu-latest
+          env: {CXX: clang++, CC: clang,  ALPAKA_CI_CLANG_VER: "6.0",  ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Debug,   ALPAKA_CI_BOOST_BRANCH: boost-1.67.0, ALPAKA_CI_CMAKE_VER: 3.18.5,                     ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:20.04", ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "9.0", ALPAKA_CUDA_COMPILER: clang, ALPAKA_CUDA_ARCH: "35",                        ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF, ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE: OFF, ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE: OFF}
+        - name: linux_clang-7_cuda-9.0_release
+          os: ubuntu-latest
+          env: {CXX: clang++, CC: clang,  ALPAKA_CI_CLANG_VER: 7,      ALPAKA_CI_STDLIB: libc++,    CMAKE_BUILD_TYPE: Release, ALPAKA_CI_BOOST_BRANCH: boost-1.71.0, ALPAKA_CI_CMAKE_VER: 3.17.5,                     ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:18.04", ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "9.0", ALPAKA_CUDA_COMPILER: clang, ALPAKA_CUDA_ARCH: "35;70",                     ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF, ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE: OFF, ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE: OFF}
+        - name: linux_clang-8_cuda-9.0_debug
+          os: ubuntu-latest
+          env: {CXX: clang++, CC: clang,  ALPAKA_CI_CLANG_VER: 8,      ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Debug,   ALPAKA_CI_BOOST_BRANCH: boost-1.68.0, ALPAKA_CI_CMAKE_VER: 3.18.5,                     ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:18.04", ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "9.0", ALPAKA_CUDA_COMPILER: clang, ALPAKA_CUDA_ARCH: "35",                        ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF, ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE: OFF, ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE: OFF}
+
+        ## CUDA 9.1 -- every job in this group sets ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "9.1" and ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF
+        # nvcc + g++ -- ALPAKA_CUDA_COMPILER: nvcc with CXX: g++ / CC: gcc
+        - name: linux_nvcc-9.1_gcc-5_release
+          os: ubuntu-latest
+          env: {CXX: g++,     CC: gcc,    ALPAKA_CI_GCC_VER: 5,        ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Release, ALPAKA_CI_BOOST_BRANCH: boost-1.65.1, ALPAKA_CI_CMAKE_VER: 3.19.0,                     ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:18.04", ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "9.1", ALPAKA_CUDA_COMPILER: nvcc,                                                 ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF}
+        # nvcc + clang++ -- ALPAKA_CUDA_COMPILER: nvcc with CXX: clang++ / CC: clang
+        - name: linux_nvcc-9.1_clang-4_debug
+          os: ubuntu-latest
+          env: {CXX: clang++, CC: clang,  ALPAKA_CI_CLANG_VER: "4.0",  ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Debug,   ALPAKA_CI_BOOST_BRANCH: boost-1.70.0, ALPAKA_CI_CMAKE_VER: 3.16.5,                     ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:16.04", ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "9.1", ALPAKA_CUDA_COMPILER: nvcc, ALPAKA_CUDA_ARCH: "30;70",                      ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF}
+        # clang++ -- ALPAKA_CUDA_COMPILER: clang; these jobs also set ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE and ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE to OFF
+        - name: linux_clang-7_cuda-9.1_release
+          os: ubuntu-latest
+          env: {CXX: clang++, CC: clang,  ALPAKA_CI_CLANG_VER: 7,      ALPAKA_CI_STDLIB: libc++,    CMAKE_BUILD_TYPE: Release, ALPAKA_CI_BOOST_BRANCH: boost-1.68.0, ALPAKA_CI_CMAKE_VER: 3.15.7,                     ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:18.04", ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "9.1", ALPAKA_CUDA_COMPILER: clang, ALPAKA_CUDA_ARCH: "35;72",                     ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF, ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE: OFF, ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE: OFF}
+        - name: linux_clang-8_cuda-9.1_release
+          os: ubuntu-latest
+          env: {CXX: clang++, CC: clang,  ALPAKA_CI_CLANG_VER: 8,      ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Release, ALPAKA_CI_BOOST_BRANCH: boost-1.67.0, ALPAKA_CI_CMAKE_VER: 3.18.5,                     ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:16.04", ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "9.1", ALPAKA_CUDA_COMPILER: clang,                                                ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF, ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE: OFF, ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE: OFF}
+
+        ## CUDA 9.2 -- every job in this group sets ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "9.2" and ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF
+        # nvcc + g++ -- ALPAKA_CUDA_COMPILER: nvcc with CXX: g++ / CC: gcc
+        - name: linux_nvcc-9.2_gcc-5_release
+          os: ubuntu-latest
+          env: {CXX: g++,     CC: gcc,    ALPAKA_CI_GCC_VER: 5,        ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Release, ALPAKA_CI_BOOST_BRANCH: boost-1.68.0, ALPAKA_CI_CMAKE_VER: 3.16.5,                     ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:16.04", ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "9.2", ALPAKA_CUDA_COMPILER: nvcc, ALPAKA_CUDA_ARCH: "30;35",                      ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF}
+        - name: linux_nvcc-9.2_gcc-6_debug_separable_compilation  # sets ALPAKA_CUDA_NVCC_SEPARABLE_COMPILATION: ON
+          os: ubuntu-latest
+          env: {CXX: g++,     CC: gcc,    ALPAKA_CI_GCC_VER: 6,        ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Debug,   ALPAKA_CI_BOOST_BRANCH: boost-1.74.0, ALPAKA_CI_CMAKE_VER: 3.18.5,                     ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:18.04", ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "9.2", ALPAKA_CUDA_COMPILER: nvcc, ALPAKA_CUDA_NVCC_SEPARABLE_COMPILATION: ON,     ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF}
+        - name: linux_nvcc-9.2_gcc-7_release_extended_lambda_off  # sets ALPAKA_CUDA_NVCC_EXPT_EXTENDED_LAMBDA: OFF
+          os: ubuntu-latest
+          env: {CXX: g++,     CC: gcc,    ALPAKA_CI_GCC_VER: 7,        ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Release, ALPAKA_CI_BOOST_BRANCH: boost-1.73.0, ALPAKA_CI_CMAKE_VER: 3.17.5,                     ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:20.04", ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "9.2", ALPAKA_CUDA_COMPILER: nvcc, ALPAKA_CUDA_NVCC_EXPT_EXTENDED_LAMBDA: OFF,     ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF}
+        # nvcc + clang++ -- ALPAKA_CUDA_COMPILER: nvcc with CXX: clang++ / CC: clang
+        - name: linux_nvcc-9.2_clang-4_release
+          os: ubuntu-latest
+          env: {CXX: clang++, CC: clang,  ALPAKA_CI_CLANG_VER: "4.0",  ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Release, ALPAKA_CI_BOOST_BRANCH: boost-1.65.1, ALPAKA_CI_CMAKE_VER: 3.15.7,                     ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:16.04", ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "9.2", ALPAKA_CUDA_COMPILER: nvcc, ALPAKA_CUDA_ARCH: "30;70",                      ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF}
+        # clang++ -- ALPAKA_CUDA_COMPILER: clang; these jobs also set ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE and ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE to OFF
+        - name: linux_clang-7_cuda-9.2_release_c++17  # "c++17": ALPAKA_CXX_STANDARD: 17 together with CMAKE_CXX_EXTENSIONS: OFF
+          os: ubuntu-latest
+          env: {CXX: clang++, CC: clang,  ALPAKA_CI_CLANG_VER: 7,      ALPAKA_CI_STDLIB: libc++,    CMAKE_BUILD_TYPE: Release, ALPAKA_CI_BOOST_BRANCH: boost-1.67.0, ALPAKA_CI_CMAKE_VER: 3.19.0,                     ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:18.04", ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "9.2", ALPAKA_CUDA_COMPILER: clang, ALPAKA_CXX_STANDARD: 17, CMAKE_CXX_EXTENSIONS: OFF, ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF, ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE: OFF, ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE: OFF}
+        - name: linux_clang-8_cuda-9.2_debug
+          os: ubuntu-latest
+          env: {CXX: clang++, CC: clang,  ALPAKA_CI_CLANG_VER: 8,      ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Debug,   ALPAKA_CI_BOOST_BRANCH: boost-1.70.0, ALPAKA_CI_CMAKE_VER: 3.15.7,                     ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:16.04", ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "9.2", ALPAKA_CUDA_COMPILER: clang, ALPAKA_CUDA_ARCH: "35;72",                     ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF, ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE: OFF, ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE: OFF}
+        - name: linux_clang-9_cuda-9.2_release
+          os: ubuntu-latest
+          env: {CXX: clang++, CC: clang,  ALPAKA_CI_CLANG_VER: 9,      ALPAKA_CI_STDLIB: libc++,    CMAKE_BUILD_TYPE: Release, ALPAKA_CI_BOOST_BRANCH: boost-1.69.0, ALPAKA_CI_CMAKE_VER: 3.19.0,                     ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:18.04", ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "9.2", ALPAKA_CUDA_COMPILER: clang, ALPAKA_CUDA_ARCH: "35;72",                     ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF, ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE: OFF, ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE: OFF}
+        - name: linux_clang-10_cuda-9.2_release
+          os: ubuntu-latest
+          env: {CXX: clang++, CC: clang,  ALPAKA_CI_CLANG_VER: 10,     ALPAKA_CI_STDLIB: libc++,    CMAKE_BUILD_TYPE: Release, ALPAKA_CI_BOOST_BRANCH: boost-1.68.0, ALPAKA_CI_CMAKE_VER: 3.15.7,                     ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:20.04", ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "9.2", ALPAKA_CUDA_COMPILER: clang,                                                ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF, ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE: OFF, ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE: OFF}
+
+        ## CUDA 10.0 -- every job in this group sets ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "10.0" and ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF
+        # nvcc + g++ -- ALPAKA_CUDA_COMPILER: nvcc with CXX: g++ / CC: gcc
+        - name: linux_nvcc-10.0_gcc-5_release
+          os: ubuntu-latest
+          env: {CXX: g++,     CC: gcc,    ALPAKA_CI_GCC_VER: 5,        ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Release, ALPAKA_CI_BOOST_BRANCH: boost-1.65.1, ALPAKA_CI_CMAKE_VER: 3.15.7,                     ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:18.04", ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "10.0", ALPAKA_CUDA_COMPILER: nvcc,                                                ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF}
+        - name: linux_nvcc-10.0_gcc-6_debug
+          os: ubuntu-latest
+          env: {CXX: g++,     CC: gcc,    ALPAKA_CI_GCC_VER: 6,        ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Debug,   ALPAKA_CI_BOOST_BRANCH: boost-1.68.0, ALPAKA_CI_CMAKE_VER: 3.19.0,                     ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:16.04", ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "10.0", ALPAKA_CUDA_COMPILER: nvcc,                                                ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF}
+        - name: linux_nvcc-10.0_gcc-7_release
+          os: ubuntu-latest
+          env: {CXX: g++,     CC: gcc,    ALPAKA_CI_GCC_VER: 7,        ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Release, ALPAKA_CI_BOOST_BRANCH: boost-1.67.0, ALPAKA_CI_CMAKE_VER: 3.17.5,                     ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:20.04", ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "10.0", ALPAKA_CUDA_COMPILER: nvcc, ALPAKA_CUDA_ARCH: "30;35",                     ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF}
+        # nvcc + clang++ -- ALPAKA_CUDA_COMPILER: nvcc with CXX: clang++ / CC: clang
+        - name: linux_nvcc-10.0_clang-4_debug
+          os: ubuntu-latest
+          env: {CXX: clang++, CC: clang,  ALPAKA_CI_CLANG_VER: "4.0",  ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Debug,   ALPAKA_CI_BOOST_BRANCH: boost-1.72.0, ALPAKA_CI_CMAKE_VER: 3.18.5,                     ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:16.04", ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "10.0", ALPAKA_CUDA_COMPILER: nvcc, ALPAKA_CUDA_ARCH: "30;60",                     ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF}
+        - name: linux_nvcc-10.0_clang-5_release_separable_compilation  # sets ALPAKA_CUDA_NVCC_SEPARABLE_COMPILATION: ON
+          os: ubuntu-latest
+          env: {CXX: clang++, CC: clang,  ALPAKA_CI_CLANG_VER: "5.0",  ALPAKA_CI_STDLIB: libc++,    CMAKE_BUILD_TYPE: Release, ALPAKA_CI_BOOST_BRANCH: boost-1.65.1, ALPAKA_CI_CMAKE_VER: 3.15.7,                     ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:18.04", ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "10.0", ALPAKA_CUDA_COMPILER: nvcc, ALPAKA_CUDA_NVCC_SEPARABLE_COMPILATION: ON,    ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF}
+        - name: linux_nvcc-10.0_clang-6_debug
+          os: ubuntu-latest
+          env: {CXX: clang++, CC: clang,  ALPAKA_CI_CLANG_VER: "6.0",  ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Debug,   ALPAKA_CI_BOOST_BRANCH: boost-1.74.0, ALPAKA_CI_CMAKE_VER: 3.19.0,                     ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:18.04", ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "10.0", ALPAKA_CUDA_COMPILER: nvcc, ALPAKA_CUDA_ARCH: "75",                        ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF}
+        # clang++ -- ALPAKA_CUDA_COMPILER: clang; these jobs also set ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE and ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE to OFF
+        - name: linux_clang-8_cuda-10.0_release
+          os: ubuntu-latest
+          env: {CXX: clang++, CC: clang,  ALPAKA_CI_CLANG_VER: 8,      ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Release, ALPAKA_CI_BOOST_BRANCH: boost-1.67.0, ALPAKA_CI_CMAKE_VER: 3.15.7,                     ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:18.04", ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "10.0", ALPAKA_CUDA_COMPILER: clang,                                               ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF, ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE: OFF, ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE: OFF}
+        - name: linux_clang-9_cuda-10.0_debug
+          os: ubuntu-latest
+          env: {CXX: clang++, CC: clang,  ALPAKA_CI_CLANG_VER: 9,      ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Debug,   ALPAKA_CI_BOOST_BRANCH: boost-1.73.0, ALPAKA_CI_CMAKE_VER: 3.17.5,                     ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:18.04", ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "10.0", ALPAKA_CUDA_COMPILER: clang,                                               ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF, ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE: OFF, ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE: OFF}
+        - name: linux_clang-10_cuda-10.0_release
+          os: ubuntu-latest
+          env: {CXX: clang++, CC: clang,  ALPAKA_CI_CLANG_VER: 10,     ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Release, ALPAKA_CI_BOOST_BRANCH: boost-1.72.0, ALPAKA_CI_CMAKE_VER: 3.16.9,                     ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:20.04", ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "10.0", ALPAKA_CUDA_COMPILER: clang,                                               ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF, ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE: OFF, ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE: OFF}
+
+        ## CUDA 10.1 -- every job in this group sets ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "10.1" and ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF
+        # nvcc + g++ -- ALPAKA_CUDA_COMPILER: nvcc with CXX: g++ / CC: gcc
+        - name: linux_nvcc-10.1_gcc-5_debug
+          os: ubuntu-latest
+          env: {CXX: g++,     CC: gcc,    ALPAKA_CI_GCC_VER: 5,        ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Debug,   ALPAKA_CI_BOOST_BRANCH: boost-1.68.0, ALPAKA_CI_CMAKE_VER: 3.16.9,                     ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:16.04", ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "10.1", ALPAKA_CUDA_COMPILER: nvcc,                                                ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF}
+        - name: linux_nvcc-10.1_gcc-6_release
+          os: ubuntu-latest
+          env: {CXX: g++,     CC: gcc,    ALPAKA_CI_GCC_VER: 6,        ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Release, ALPAKA_CI_BOOST_BRANCH: boost-1.65.1, ALPAKA_CI_CMAKE_VER: 3.17.5,                     ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:16.04", ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "10.1", ALPAKA_CUDA_COMPILER: nvcc, ALPAKA_CUDA_ARCH: "30;35",                     ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF}
+        - name: linux_nvcc-10.1_gcc-7_debug
+          os: ubuntu-latest
+          env: {CXX: g++,     CC: gcc,    ALPAKA_CI_GCC_VER: 7,        ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Debug,   ALPAKA_CI_BOOST_BRANCH: boost-1.69.0, ALPAKA_CI_CMAKE_VER: 3.18.5,                     ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:20.04", ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "10.1", ALPAKA_CUDA_COMPILER: nvcc,                                                ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF}
+        - name: linux_nvcc-10.1_gcc-8_release
+          os: ubuntu-latest
+          env: {CXX: g++,     CC: gcc,    ALPAKA_CI_GCC_VER: 8,        ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Release, ALPAKA_CI_BOOST_BRANCH: boost-1.67.0, ALPAKA_CI_CMAKE_VER: 3.15.7,                     ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:18.04", ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "10.1", ALPAKA_CUDA_COMPILER: nvcc, ALPAKA_CUDA_ARCH: "30;35",                     ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF}
+        # nvcc + clang++ -- ALPAKA_CUDA_COMPILER: nvcc with CXX: clang++ / CC: clang
+        - name: linux_nvcc-10.1_clang-4_debug
+          os: ubuntu-latest
+          env: {CXX: clang++, CC: clang,  ALPAKA_CI_CLANG_VER: "4.0",  ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Debug,   ALPAKA_CI_BOOST_BRANCH: boost-1.65.1, ALPAKA_CI_CMAKE_VER: 3.19.0,                     ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:16.04", ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "10.1", ALPAKA_CUDA_COMPILER: nvcc, ALPAKA_CUDA_ARCH: "75",                        ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF}
+        - name: linux_nvcc-10.1_clang-5_release_cuda_only  # ALPAKA_ACC_GPU_CUDA_ONLY_MODE: ON; the listed ALPAKA_ACC_CPU_* switches and ALPAKA_ACC_GPU_HIP_ENABLE are OFF
+          os: ubuntu-latest
+          env: {CXX: clang++, CC: clang,  ALPAKA_CI_CLANG_VER: "5.0",  ALPAKA_CI_STDLIB: libc++,    CMAKE_BUILD_TYPE: Release, ALPAKA_CI_BOOST_BRANCH: boost-1.69.0, ALPAKA_CI_CMAKE_VER: 3.16.9,                     ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:18.04", ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "10.1", ALPAKA_CUDA_COMPILER: nvcc, ALPAKA_ACC_GPU_CUDA_ONLY_MODE: ON,             ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF, ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLE: OFF, ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLE: OFF, ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLE: OFF, ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE: OFF, ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE: OFF, ALPAKA_ACC_GPU_HIP_ENABLE: OFF}
+        - name: linux_nvcc-10.1_clang-6_debug
+          os: ubuntu-latest
+          env: {CXX: clang++, CC: clang,  ALPAKA_CI_CLANG_VER: "6.0",  ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Debug,   ALPAKA_CI_BOOST_BRANCH: boost-1.71.0, ALPAKA_CI_CMAKE_VER: 3.17.5,                     ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:18.04", ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "10.1", ALPAKA_CUDA_COMPILER: nvcc, ALPAKA_CUDA_ARCH: "60",                        ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF}
+        - name: linux_nvcc-10.1_clang-7_release
+          os: ubuntu-latest
+          env: {CXX: clang++, CC: clang,  ALPAKA_CI_CLANG_VER: 7,      ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Release, ALPAKA_CI_BOOST_BRANCH: boost-1.73.0, ALPAKA_CI_CMAKE_VER: 3.19.0,                     ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:18.04", ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "10.1", ALPAKA_CUDA_COMPILER: nvcc, ALPAKA_CUDA_ARCH: "75",                        ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF}
+        - name: linux_nvcc-10.1_clang-8_debug
+          os: ubuntu-latest
+          env: {CXX: clang++, CC: clang,  ALPAKA_CI_CLANG_VER: 8,      ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Debug,   ALPAKA_CI_BOOST_BRANCH: boost-1.72.0, ALPAKA_CI_CMAKE_VER: 3.15.7,                     ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:20.04", ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "10.1", ALPAKA_CUDA_COMPILER: nvcc, ALPAKA_CUDA_ARCH: "70",                        ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF}
+        # clang++ -- ALPAKA_CUDA_COMPILER: clang; these jobs also set ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE and ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE to OFF
+        - name: linux_clang-9_cuda-10.1_debug
+          os: ubuntu-latest
+          env: {CXX: clang++, CC: clang,  ALPAKA_CI_CLANG_VER: 9,      ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Debug,   ALPAKA_CI_BOOST_BRANCH: boost-1.67.0, ALPAKA_CI_CMAKE_VER: 3.15.7,                     ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:18.04", ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "10.1", ALPAKA_CUDA_COMPILER: clang,                                               ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF, ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE: OFF, ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE: OFF}
+        - name: linux_clang-10_cuda-10.1_release
+          os: ubuntu-latest
+          env: {CXX: clang++, CC: clang,  ALPAKA_CI_CLANG_VER: 10,     ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Release, ALPAKA_CI_BOOST_BRANCH: boost-1.74.0, ALPAKA_CI_CMAKE_VER: 3.18.5,                     ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:20.04", ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "10.1", ALPAKA_CUDA_COMPILER: clang,                                               ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF, ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE: OFF, ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE: OFF}
+
+        ## CUDA 10.2 -- every job in this group sets ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "10.2" and ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF
+        # nvcc + g++ -- ALPAKA_CUDA_COMPILER: nvcc with CXX: g++ / CC: gcc
+        - name: linux_nvcc-10.2_gcc-5_debug
+          os: ubuntu-latest
+          env: {CXX: g++,     CC: gcc,    ALPAKA_CI_GCC_VER: 5,        ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Debug,   ALPAKA_CI_BOOST_BRANCH: boost-1.65.1, ALPAKA_CI_CMAKE_VER: 3.19.0,                     ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:16.04", ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "10.2", ALPAKA_CUDA_COMPILER: nvcc, ALPAKA_CUDA_ARCH: "30;35",                     ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF}
+        - name: linux_nvcc-10.2_gcc-6_release
+          os: ubuntu-latest
+          env: {CXX: g++,     CC: gcc,    ALPAKA_CI_GCC_VER: 6,        ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Release, ALPAKA_CI_BOOST_BRANCH: boost-1.68.0, ALPAKA_CI_CMAKE_VER: 3.16.9,                     ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:16.04", ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "10.2", ALPAKA_CUDA_COMPILER: nvcc,                                                ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF}
+        - name: linux_nvcc-10.2_gcc-7_debug
+          os: ubuntu-latest
+          env: {CXX: g++,     CC: gcc,    ALPAKA_CI_GCC_VER: 7,        ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Debug,   ALPAKA_CI_BOOST_BRANCH: boost-1.74.0, ALPAKA_CI_CMAKE_VER: 3.17.5,                     ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:20.04", ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "10.2", ALPAKA_CUDA_COMPILER: nvcc,                                                ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF}
+        - name: linux_nvcc-10.2_gcc-8_release
+          os: ubuntu-latest
+          env: {CXX: g++,     CC: gcc,    ALPAKA_CI_GCC_VER: 8,        ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Release, ALPAKA_CI_BOOST_BRANCH: boost-1.69.0, ALPAKA_CI_CMAKE_VER: 3.15.7,                     ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:18.04", ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "10.2", ALPAKA_CUDA_COMPILER: nvcc,                                                ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF}
+        # nvcc + clang++ -- ALPAKA_CUDA_COMPILER: nvcc with CXX: clang++ / CC: clang
+        - name: linux_nvcc-10.2_clang-4_release
+          os: ubuntu-latest
+          env: {CXX: clang++, CC: clang,  ALPAKA_CI_CLANG_VER: "4.0",  ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Release, ALPAKA_CI_BOOST_BRANCH: boost-1.67.0, ALPAKA_CI_CMAKE_VER: 3.15.7,                     ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:16.04", ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "10.2", ALPAKA_CUDA_COMPILER: nvcc, ALPAKA_CUDA_ARCH: "30;60",                     ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF}
+        - name: linux_nvcc-10.2_clang-5_debug
+          os: ubuntu-latest
+          env: {CXX: clang++, CC: clang,  ALPAKA_CI_CLANG_VER: "5.0",  ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Debug,   ALPAKA_CI_BOOST_BRANCH: boost-1.65.1, ALPAKA_CI_CMAKE_VER: 3.17.5,                     ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:18.04", ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "10.2", ALPAKA_CUDA_COMPILER: nvcc, ALPAKA_CUDA_ARCH: "75",                        ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF}
+        - name: linux_nvcc-10.2_clang-6_release_cuda_only  # ALPAKA_ACC_GPU_CUDA_ONLY_MODE: ON; the listed ALPAKA_ACC_CPU_* switches and ALPAKA_ACC_GPU_HIP_ENABLE are OFF
+          os: ubuntu-latest
+          env: {CXX: clang++, CC: clang,  ALPAKA_CI_CLANG_VER: "6.0",  ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Release, ALPAKA_CI_BOOST_BRANCH: boost-1.69.0, ALPAKA_CI_CMAKE_VER: 3.16.9,                     ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:16.04", ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "10.2", ALPAKA_CUDA_COMPILER: nvcc, ALPAKA_ACC_GPU_CUDA_ONLY_MODE: ON,             ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF, ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLE: OFF, ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLE: OFF, ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLE: OFF, ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE: OFF, ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE: OFF, ALPAKA_ACC_GPU_HIP_ENABLE: OFF}
+        - name: linux_nvcc-10.2_clang-7_debug
+          os: ubuntu-latest
+          env: {CXX: clang++, CC: clang,  ALPAKA_CI_CLANG_VER: 7,      ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Debug,   ALPAKA_CI_BOOST_BRANCH: boost-1.72.0, ALPAKA_CI_CMAKE_VER: 3.18.5,                     ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:20.04", ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "10.2", ALPAKA_CUDA_COMPILER: nvcc, ALPAKA_CUDA_ARCH: "75",                        ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF}
+        - name: linux_nvcc-10.2_clang-8_release
+          os: ubuntu-latest
+          env: {CXX: clang++, CC: clang,  ALPAKA_CI_CLANG_VER: 8,      ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Release, ALPAKA_CI_BOOST_BRANCH: boost-1.74.0, ALPAKA_CI_CMAKE_VER: 3.16.9,                     ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:16.04", ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "10.2", ALPAKA_CUDA_COMPILER: nvcc, ALPAKA_CUDA_ARCH: "70",                        ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF}
+
+        ## CUDA 11.0 -- every job in this group sets ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "11.0" and ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF
+        # nvcc + g++ -- ALPAKA_CUDA_COMPILER: nvcc with CXX: g++ / CC: gcc
+        - name: linux_nvcc-11.0_gcc-5_debug
+          os: ubuntu-latest
+          env: {CXX: g++,     CC: gcc,    ALPAKA_CI_GCC_VER: 5,        ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Debug,   ALPAKA_CI_BOOST_BRANCH: boost-1.65.1, ALPAKA_CI_CMAKE_VER: 3.15.7,                     ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:16.04", ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "11.0", ALPAKA_CUDA_COMPILER: nvcc, ALPAKA_CUDA_ARCH: "35;80",                     ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF}
+        - name: linux_nvcc-11.0_gcc-6_release
+          os: ubuntu-latest
+          env: {CXX: g++,     CC: gcc,    ALPAKA_CI_GCC_VER: 6,        ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Release, ALPAKA_CI_BOOST_BRANCH: boost-1.68.0, ALPAKA_CI_CMAKE_VER: 3.16.9,                     ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:16.04", ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "11.0", ALPAKA_CUDA_COMPILER: nvcc,                                                ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF}
+        - name: linux_nvcc-11.0_gcc-7_debug
+          os: ubuntu-latest
+          env: {CXX: g++,     CC: gcc,    ALPAKA_CI_GCC_VER: 7,        ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Debug,   ALPAKA_CI_BOOST_BRANCH: boost-1.67.0, ALPAKA_CI_CMAKE_VER: 3.17.5,                     ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:20.04", ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "11.0", ALPAKA_CUDA_COMPILER: nvcc,                                                ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF}
+        - name: linux_nvcc-11.0_gcc-8_release
+          os: ubuntu-latest
+          env: {CXX: g++,     CC: gcc,    ALPAKA_CI_GCC_VER: 8,        ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Release, ALPAKA_CI_BOOST_BRANCH: boost-1.69.0, ALPAKA_CI_CMAKE_VER: 3.18.5,                     ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:18.04", ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "11.0", ALPAKA_CUDA_COMPILER: nvcc,                                                ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF}
+        - name: linux_nvcc-11.0_gcc-9_release
+          os: ubuntu-latest
+          env: {CXX: g++,     CC: gcc,    ALPAKA_CI_GCC_VER: 9,        ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Release, ALPAKA_CI_BOOST_BRANCH: boost-1.69.0, ALPAKA_CI_CMAKE_VER: 3.16.9,                     ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:18.04", ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "11.0", ALPAKA_CUDA_COMPILER: nvcc,                                                ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF}
+        # nvcc + clang++ -- ALPAKA_CUDA_COMPILER: nvcc with CXX: clang++ / CC: clang
+        - name: linux_nvcc-11.0_clang-4_release
+          os: ubuntu-latest
+          env: {CXX: clang++, CC: clang,  ALPAKA_CI_CLANG_VER: "4.0",  ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Release, ALPAKA_CI_BOOST_BRANCH: boost-1.67.0, ALPAKA_CI_CMAKE_VER: 3.19.0,                     ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:16.04", ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "11.0", ALPAKA_CUDA_COMPILER: nvcc, ALPAKA_CUDA_ARCH: "35;60",                     ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF}
+        - name: linux_nvcc-11.0_clang-5_debug
+          os: ubuntu-latest
+          env: {CXX: clang++, CC: clang,  ALPAKA_CI_CLANG_VER: "5.0",  ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Debug,   ALPAKA_CI_BOOST_BRANCH: boost-1.65.1, ALPAKA_CI_CMAKE_VER: 3.19.0,                     ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:18.04", ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "11.0", ALPAKA_CUDA_COMPILER: nvcc, ALPAKA_CUDA_ARCH: "80",                        ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF}
+        - name: linux_nvcc-11.0_clang-6_release_cuda_only  # ALPAKA_ACC_GPU_CUDA_ONLY_MODE: ON; the listed ALPAKA_ACC_CPU_* switches and ALPAKA_ACC_GPU_HIP_ENABLE are OFF
+          os: ubuntu-latest
+          env: {CXX: clang++, CC: clang,  ALPAKA_CI_CLANG_VER: "6.0",  ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Release, ALPAKA_CI_BOOST_BRANCH: boost-1.69.0, ALPAKA_CI_CMAKE_VER: 3.18.5,                     ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:16.04", ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "11.0", ALPAKA_CUDA_COMPILER: nvcc, ALPAKA_ACC_GPU_CUDA_ONLY_MODE: ON,             ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF, ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLE: OFF, ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLE: OFF, ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLE: OFF, ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE: OFF, ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE: OFF, ALPAKA_ACC_GPU_HIP_ENABLE: OFF}
+        - name: linux_nvcc-11.0_clang-7_debug
+          os: ubuntu-latest
+          env: {CXX: clang++, CC: clang,  ALPAKA_CI_CLANG_VER: 7,      ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Debug,   ALPAKA_CI_BOOST_BRANCH: boost-1.72.0, ALPAKA_CI_CMAKE_VER: 3.19.0,                     ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:20.04", ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "11.0", ALPAKA_CUDA_COMPILER: nvcc, ALPAKA_CUDA_ARCH: "75",                        ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF}
+        - name: linux_nvcc-11.0_clang-8_release
+          os: ubuntu-latest
+          env: {CXX: clang++, CC: clang,  ALPAKA_CI_CLANG_VER: 8,      ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Release, ALPAKA_CI_BOOST_BRANCH: boost-1.74.0, ALPAKA_CI_CMAKE_VER: 3.16.9,                     ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:16.04", ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "11.0", ALPAKA_CUDA_COMPILER: nvcc,                                                ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF}
+        - name: linux_nvcc-11.0_clang-9_release
+          os: ubuntu-latest
+          env: {CXX: clang++, CC: clang,  ALPAKA_CI_CLANG_VER: 9,      ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Release, ALPAKA_CI_BOOST_BRANCH: boost-1.73.0, ALPAKA_CI_CMAKE_VER: 3.17.5,                     ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:18.04", ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "11.0", ALPAKA_CUDA_COMPILER: nvcc, ALPAKA_CUDA_ARCH: "70",                        ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF}
+
+        ## CUDA 11.1
+        # nvcc + g++ -- ALPAKA_CUDA_COMPILER: nvcc with CXX: g++ / CC: gcc, ALPAKA_CUDA_VERSION: "11.1"
+        - name: linux_nvcc-11.1_gcc-5_debug
+          os: ubuntu-latest
+          env: {CXX: g++,     CC: gcc,    ALPAKA_CI_GCC_VER: 5,        ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Debug,   ALPAKA_CI_BOOST_BRANCH: boost-1.65.1, ALPAKA_CI_CMAKE_VER: 3.15.7,                     ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:16.04", ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "11.1", ALPAKA_CUDA_COMPILER: nvcc,                                                ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF}
+        - name: linux_nvcc-11.1_gcc-6_release
+          os: ubuntu-latest
+          env: {CXX: g++,     CC: gcc,    ALPAKA_CI_GCC_VER: 6,        ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Release, ALPAKA_CI_BOOST_BRANCH: boost-1.68.0, ALPAKA_CI_CMAKE_VER: 3.16.9,                     ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:16.04", ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "11.1", ALPAKA_CUDA_COMPILER: nvcc,                                                ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF}
+        - name: linux_nvcc-11.1_gcc-7_debug
+          os: ubuntu-latest
+          env: {CXX: g++,     CC: gcc,    ALPAKA_CI_GCC_VER: 7,        ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Debug,   ALPAKA_CI_BOOST_BRANCH: boost-1.67.0, ALPAKA_CI_CMAKE_VER: 3.17.5,                     ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:20.04", ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "11.1", ALPAKA_CUDA_COMPILER: nvcc, ALPAKA_CUDA_ARCH: "35;80",                     ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF}
+        - name: linux_nvcc-11.1_gcc-8_release
+          os: ubuntu-latest
+          env: {CXX: g++,     CC: gcc,    ALPAKA_CI_GCC_VER: 8,        ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Release, ALPAKA_CI_BOOST_BRANCH: boost-1.69.0, ALPAKA_CI_CMAKE_VER: 3.18.5,                     ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:18.04", ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "11.1", ALPAKA_CUDA_COMPILER: nvcc,                                                ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF}
+        - name: linux_nvcc-11.1_gcc-9_release
+          os: ubuntu-latest
+          env: {CXX: g++,     CC: gcc,    ALPAKA_CI_GCC_VER: 9,        ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Release, ALPAKA_CI_BOOST_BRANCH: boost-1.69.0, ALPAKA_CI_CMAKE_VER: 3.19.0,                     ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:18.04", ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "11.1", ALPAKA_CUDA_COMPILER: nvcc,                                                ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF}
+        - name: linux_nvcc-11.1_gcc-10_debug
+          os: ubuntu-latest
+          env: {CXX: g++,     CC: gcc,    ALPAKA_CI_GCC_VER: 10,       ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Debug,   ALPAKA_CI_BOOST_BRANCH: boost-1.73.0, ALPAKA_CI_CMAKE_VER: 3.16.9,                     ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:18.04", ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "11.1", ALPAKA_CUDA_COMPILER: nvcc, ALPAKA_CUDA_ARCH: "86",                        ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF}
+        # nvcc + clang++
+        - name: linux_nvcc-11.1_clang-4_release
+          os: ubuntu-latest
+          env: {CXX: clang++, CC: clang,  ALPAKA_CI_CLANG_VER: "4.0",  ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Release, ALPAKA_CI_BOOST_BRANCH: boost-1.67.0, ALPAKA_CI_CMAKE_VER: 3.15.7,                     ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:16.04", ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "11.1", ALPAKA_CUDA_COMPILER: nvcc, ALPAKA_CUDA_ARCH: "35;60",                     ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF}
+        - name: linux_nvcc-11.1_clang-5_debug
+          os: ubuntu-latest
+          env: {CXX: clang++, CC: clang,  ALPAKA_CI_CLANG_VER: "5.0",  ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Debug,   ALPAKA_CI_BOOST_BRANCH: boost-1.65.1, ALPAKA_CI_CMAKE_VER: 3.17.5,                     ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:18.04", ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "11.1", ALPAKA_CUDA_COMPILER: nvcc, ALPAKA_CUDA_ARCH: "80",                        ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF}
+        - name: linux_nvcc-11.1_clang-6_release_cuda_only
+          os: ubuntu-latest
+          env: {CXX: clang++, CC: clang,  ALPAKA_CI_CLANG_VER: "6.0",  ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Release, ALPAKA_CI_BOOST_BRANCH: boost-1.69.0, ALPAKA_CI_CMAKE_VER: 3.18.5,                     ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:16.04", ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "11.1", ALPAKA_CUDA_COMPILER: nvcc, ALPAKA_ACC_GPU_CUDA_ONLY_MODE: ON,             ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF, ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLE: OFF, ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLE: OFF, ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLE: OFF, ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE: OFF, ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE: OFF, ALPAKA_ACC_GPU_HIP_ENABLE: OFF}
+        - name: linux_nvcc-11.1_clang-7_debug
+          os: ubuntu-latest
+          env: {CXX: clang++, CC: clang,  ALPAKA_CI_CLANG_VER: 7,      ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Debug,   ALPAKA_CI_BOOST_BRANCH: boost-1.72.0, ALPAKA_CI_CMAKE_VER: 3.19.0,                     ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:20.04", ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "11.1", ALPAKA_CUDA_COMPILER: nvcc, ALPAKA_CUDA_ARCH: "75",                        ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF}
+        - name: linux_nvcc-11.1_clang-8_release
+          os: ubuntu-latest
+          env: {CXX: clang++, CC: clang,  ALPAKA_CI_CLANG_VER: 8,      ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Release, ALPAKA_CI_BOOST_BRANCH: boost-1.74.0, ALPAKA_CI_CMAKE_VER: 3.16.9,                     ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:16.04", ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "11.1", ALPAKA_CUDA_COMPILER: nvcc,                                                ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF}
+        - name: linux_nvcc-11.1_clang-9_debug
+          os: ubuntu-latest
+          env: {CXX: clang++, CC: clang,  ALPAKA_CI_CLANG_VER: 9,      ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Debug,   ALPAKA_CI_BOOST_BRANCH: boost-1.73.0, ALPAKA_CI_CMAKE_VER: 3.17.5,                     ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:18.04", ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "11.1", ALPAKA_CUDA_COMPILER: nvcc, ALPAKA_CUDA_ARCH: "70",                        ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF}
+        - name: linux_nvcc-11.1_clang-10_release
+          os: ubuntu-latest
+          env: {CXX: clang++, CC: clang,  ALPAKA_CI_CLANG_VER: 10,     ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Release, ALPAKA_CI_BOOST_BRANCH: boost-1.72.0, ALPAKA_CI_CMAKE_VER: 3.19.0,                     ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "ubuntu:18.04", ALPAKA_ACC_GPU_CUDA_ENABLE: ON, ALPAKA_CUDA_VERSION: "11.1", ALPAKA_CUDA_COMPILER: nvcc, ALPAKA_CUDA_ARCH: "86",                        ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF}
+
+        ## HIP
+        - name: linux_hip_nvcc-9.2_gcc-5_debug_hip_only
+          os: ubuntu-latest
+          env: {CXX: g++,     CC: gcc,    ALPAKA_CI_GCC_VER: 5,        ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Debug,   ALPAKA_CI_BOOST_BRANCH: boost-1.67.0, ALPAKA_CI_CMAKE_VER: 3.15.7,                     ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "rrawther/rocm3.5_ubuntu16.04_py3.6_pytorch-ssd", ALPAKA_ACC_GPU_HIP_ENABLE: ON, ALPAKA_ACC_GPU_HIP_ONLY_MODE: ON, ALPAKA_HIP_PLATFORM: clang, ALPAKA_CUDA_COMPILER: clang, ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF, ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLE: OFF, ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLE: OFF, ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLE: OFF, ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE: OFF, ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE: OFF, ALPAKA_ACC_GPU_CUDA_ENABLE: OFF}
+        - name: linux_hip_nvcc-9.2_gcc-5_release_hip_only
+          os: ubuntu-latest
+          env: {CXX: g++,     CC: gcc,    ALPAKA_CI_GCC_VER: 5,        ALPAKA_CI_STDLIB: libstdc++, CMAKE_BUILD_TYPE: Release, ALPAKA_CI_BOOST_BRANCH: boost-1.74.0, ALPAKA_CI_CMAKE_VER: 3.19.0,                     ALPAKA_CI_DOCKER_BASE_IMAGE_NAME: "rrawther/rocm3.5_ubuntu16.04_py3.6_pytorch-ssd", ALPAKA_ACC_GPU_HIP_ENABLE: ON, ALPAKA_ACC_GPU_HIP_ONLY_MODE: ON, ALPAKA_HIP_PLATFORM: clang, ALPAKA_CUDA_COMPILER: clang, ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE: OFF, ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLE: OFF, ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLE: OFF, ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLE: OFF, ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE: OFF, ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE: OFF, ALPAKA_ACC_GPU_CUDA_ENABLE: OFF}
+
+    steps:
+    - name: check filter  # job filter driven by the head commit message
+      if: (contains(github.event.head_commit.message, 'ci_filter') && !contains(github.event.head_commit.message, matrix.name ))  # true when the message contains 'ci_filter' but not this job's matrix.name
+      run: exit 1  # fails this matrix job when the condition above holds
+    - uses: actions/checkout@v1
+    - name: build + test  # Windows variant: same script, run explicitly under bash
+      if: (runner.os == 'Windows')
+      env:
+        ALPAKA_CI_OS_NAME: ${{runner.os}}  # passes the runner OS name to the script via the environment
+      shell: bash
+      run: cd ${GITHUB_WORKSPACE} && ./script/ci.sh
+    - name: build + test  # Linux/macOS variant: no shell override, so the runner's default shell is used
+      if: (runner.os == 'Linux' || runner.os == 'macOS')
+      env:
+        ALPAKA_CI_OS_NAME: ${{runner.os}}
+      run: cd ${GITHUB_WORKSPACE} && ./script/ci.sh
diff --git a/thirdParty/cupla/alpaka/.github/workflows/gh-pages.yml b/thirdParty/cupla/alpaka/.github/workflows/gh-pages.yml
new file mode 100644
index 0000000000..78709be98d
--- /dev/null
+++ b/thirdParty/cupla/alpaka/.github/workflows/gh-pages.yml
@@ -0,0 +1,25 @@
+name: Publish documentation to gh-pages
+
+on:  # triggered only by pushes to the develop branch
+  push:
+    branches:
+      - develop
+
+jobs:
+  gh-pages:
+    runs-on: ubuntu-latest
+
+    steps:
+    - uses: actions/checkout@v1
+    - name: Install doxygen  # delegates to script/install_doxygen.sh in the checked-out workspace
+      run: |
+          cd ${GITHUB_WORKSPACE}
+          ./script/install_doxygen.sh
+    - name: Run doxygen  # the script receives github.repository and the github_token secret as positional arguments
+      run: |
+          cd ${GITHUB_WORKSPACE}
+          ./script/run_doxygen.sh ${{ github.repository }} ${{ secrets.github_token }}
+    - name: Publish documentation  # delegates to script/push_doc.sh; what it pushes is not visible in this file
+      run: |
+          cd ${GITHUB_WORKSPACE}
+          ./script/push_doc.sh
diff --git a/thirdParty/cupla/alpaka/.gitignore b/thirdParty/cupla/alpaka/.gitignore
index 7e8b50b81b..3be3354aec 100644
--- a/thirdParty/cupla/alpaka/.gitignore
+++ b/thirdParty/cupla/alpaka/.gitignore
@@ -1,7 +1,5 @@
-/doc/doxygen/*
-!/doc/doxygen/Doxyfile
-!/doc/doxygen/alpaka_doxygen.png
-/doc/latex/*
+/docs/doxygen/*
+
 **/build
 
 # tmp files
@@ -19,3 +17,11 @@
 
 # VIM project files
 .vimrc
+
+# IDE & tmp build
+.idea/
+cmake-build-*/
+.kdev?/
+*.kdev?
+spack-build*
+build/
diff --git a/thirdParty/cupla/alpaka/.readthedocs.yml b/thirdParty/cupla/alpaka/.readthedocs.yml
new file mode 100644
index 0000000000..bec3c9ff2a
--- /dev/null
+++ b/thirdParty/cupla/alpaka/.readthedocs.yml
@@ -0,0 +1,18 @@
+# Default [] (epub, pdf, htmlzip)
+# Note: PDF/epub/htmlzip output is not supported when using MkDocs
+formats: []  # empty list: no additional output formats are requested
+
+requirements_file: docs/requirements.txt  # NOTE(review): no `version:` key is set in this file, yet the link at the bottom points at the v2 reference — confirm this top-level key is valid for the intended schema
+
+build:
+  image: latest
+
+python:
+  version: 3.7
+
+sphinx:
+  builder: html
+  configuration: conf.py  # NOTE(review): conf.py's location is not visible here — confirm the path resolves as intended
+  fail_on_warning: true  # presumably makes Sphinx warnings fail the documentation build — confirm against the reference below
+
+# see: https://docs.readthedocs.io/en/stable/config-file/v2.html#supported-settings
diff --git a/thirdParty/cupla/alpaka/.travis.yml b/thirdParty/cupla/alpaka/.travis.yml
deleted file mode 100644
index a11d5a1ac3..0000000000
--- a/thirdParty/cupla/alpaka/.travis.yml
+++ /dev/null
@@ -1,382 +0,0 @@
-#
-# Copyright 2015-2019 Benjamin Worpitz, Erik Zenker
-#
-# This file is part of Alpaka.
-#
-# This Source Code Form is subject to the terms of the Mozilla Public
-# License, v. 2.0. If a copy of the MPL was not distributed with this
-# file, You can obtain one at http://mozilla.org/MPL/2.0/.
-#
-
-os: linux
-dist: xenial
-language: generic
-services:
-  - docker
-
-################################################################################
-# NOTE: Testing the full matrix is not practical.
-# Therefore we aim to have each value been set in at lest one job.
-# CXX                                           : {g++, clang++, cl.exe}
-#   [g++] ALPAKA_CI_GCC_VER                     : {4.9, 5, 6, 7, 8, 9}
-#   [clang++] ALPAKA_CI_CLANG_VER               : {4.0.0, 5.0.2, 6.0.1, 7.0.1, 8.0.0, 9.0.0}
-#   ALPAKA_CI_STDLIB                            : {libstdc++, [CXX==clang++]:libc++}
-#   [clang++] ALPAKA_CI_CLANG_LIBSTDCPP_VERSION : {5, 7}
-# CMAKE_BUILD_TYPE                              : {Debug, Release}
-# ALPAKA_CI                                     : {TRAVIS}
-# ALPAKA_CI_DOCKER_BASE_IMAGE_NAME              : {ubuntu:14.04, ubuntu:16.04, ubuntu:18.04}
-# ALPAKA_CI_BOOST_BRANCH                        : {[CXX!=cl.exe&&OS!=osx]:boost-1.62.0, [CXX!=cl.exe&&OS!=osx]:boost-1.63.0, [OS!=osx]boost-1.64.0, boost-1.65.1, boost-1.66.0, boost-1.67.0, boost-1.68.0, boost-1.69.0, boost-1.70.0, boost-1.71.0}
-# ALPAKA_CI_CMAKE_VER                           : {3.11.4, 3.12.4, 3.13.5, 3.14.7, 3.15.5, 3.16.0}
-# ALPAKA_CI_SANITIZERS                          : {ASan, UBsan, TSan}
-#    TSan is not currently used because it produces many unexpected errors
-# ALPAKA_CI_ANALYSIS                            : {ON, OFF}
-# ALPAKA_DEBUG                                  : {0, 1, 2}
-# ALPAKA_ACC_GPU_CUDA_ONLY_MODE                 : {ON, OFF}
-# ALPAKA_ACC_GPU_HIP_ONLY_MODE                  : {ON, OFF}
-# ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLE             : {ON, OFF}
-# ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLE         : {ON, OFF}
-# ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE          : {ON, OFF}
-# ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE            : {ON, OFF}
-#   [ON] OMP_NUM_THREADS                        : {1, 2, 3, 4}
-# ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE            : {ON, OFF}
-#   [ON] OMP_NUM_THREADS                        : {1, 2, 3, 4}
-# ALPAKA_ACC_CPU_BT_OMP4_ENABLE                 : {ON, OFF}
-#   [ON] OMP_NUM_THREADS                        : {1, 2, 3, 4}
-# ALPAKA_ACC_GPU_CUDA_ENABLE                    : {ON, OFF}
-#   [ON] ALPAKA_CUDA_VERSION                    : {8.0, 9.0, 9.1, 9.2, 10.0, 10.1, 10.2}
-#   [ON] ALPAKA_CUDA_COMPILER                   : {nvcc, [CXX==clang++]:clang}
-# ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLE             : {ON, OFF}
-# ALPAKA_ACC_GPU_HIP_ENABLE                     : {ON, OFF}
-#   [ON] ALPAKA_CI_HIP_BRANCH                   : {master}
-#   [ON] ALPAKA_HIP_PLATFORM                    : {nvcc}
-env:
-    global:
-        - ALPAKA_CI=TRAVIS
-        - ALPAKA_CI_DOCKER_IMAGE_NAME=alpaka_ubuntu
-        - ALPAKA_CI_DOCKER_CACHE_DIR=${HOME}/cache/docker
-        - ALPAKA_CI_DOCKER_CACHE_IMAGE_FILE_PATH=${ALPAKA_CI_DOCKER_CACHE_DIR}/${ALPAKA_CI_DOCKER_IMAGE_NAME}.tar.gz
-        - BOOST_ROOT=${HOME}/boost
-        - ALPAKA_CI_BOOST_LIB_DIR=${HOME}/boost_libs/
-        - ALPAKA_CI_CLANG_DIR=${HOME}/llvm
-        - ALPAKA_CI_CMAKE_DIR=${HOME}/CMake
-        - ALPAKA_CI_CUDA_DIR=${HOME}/CUDA
-        - ALPAKA_CI_HIP_ROOT_DIR=${HOME}/hip
-        - TBB_ROOT_DIR=${HOME}/tbb
-        - ALPAKA_CI_SANITIZERS=
-        - ALPAKA_CI_ANALYSIS=OFF
-        - ALPAKA_CI_CLANG_LIBSTDCPP_VERSION=5
-        - ALPAKA_ACC_GPU_CUDA_ENABLE=OFF
-        - ALPAKA_ACC_GPU_HIP_ENABLE=OFF
-
-matrix:
-    include:
-    ### Analysis builds
-    - name: nvcc-9.1 + gcc-4.9 Debug Analysis
-      env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:14.04 CXX=g++     CC=gcc   ALPAKA_CI_GCC_VER=4.9     CMAKE_BUILD_TYPE=Debug   ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.65.1 ALPAKA_CI_CMAKE_VER=3.11.4 ALPAKA_CI_ANALYSIS=ON  ALPAKA_DEBUG=2 ALPAKA_ACC_GPU_CUDA_ENABLE=ON  ALPAKA_CUDA_VERSION=9.1 ALPAKA_CUDA_COMPILER=nvcc
-    - name: gcc-8 Debug Analysis
-      env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:16.04 CXX=g++     CC=gcc   ALPAKA_CI_GCC_VER=8       CMAKE_BUILD_TYPE=Debug   ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.66.0 ALPAKA_CI_CMAKE_VER=3.14.7 ALPAKA_CI_ANALYSIS=ON  ALPAKA_DEBUG=2
-    - name: clang-4 + CUDA-8.0 Debug Analysis
-      env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:16.04 CXX=clang++ CC=clang ALPAKA_CI_CLANG_VER=4.0.0 CMAKE_BUILD_TYPE=Debug   ALPAKA_CI_STDLIB=libc++    ALPAKA_CI_BOOST_BRANCH=boost-1.71.0 ALPAKA_CI_CMAKE_VER=3.15.5 ALPAKA_CI_ANALYSIS=ON  ALPAKA_DEBUG=1 ALPAKA_ACC_GPU_CUDA_ENABLE=ON  ALPAKA_CUDA_VERSION=8.0 ALPAKA_CUDA_COMPILER=clang
-    - name: clang-6 Debug Analysis
-      env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:18.04 CXX=clang++ CC=clang ALPAKA_CI_CLANG_VER=6.0.1 CMAKE_BUILD_TYPE=Debug   ALPAKA_CI_STDLIB=libc++    ALPAKA_CI_BOOST_BRANCH=boost-1.68.0 ALPAKA_CI_CMAKE_VER=3.13.5 ALPAKA_CI_ANALYSIS=ON  ALPAKA_DEBUG=2
-    - name: macOS 10.14 Xcode 11.2 Debug Analysis
-      os: osx
-      osx_image: xcode11.2
-      env:                                               CXX=g++     CC=gcc                             CMAKE_BUILD_TYPE=Debug                              ALPAKA_CI_BOOST_BRANCH=boost-1.65.1                            ALPAKA_CI_ANALYSIS=ON  ALPAKA_DEBUG=2 ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE=OFF
-    - name: MSVC-2017 Debug Analysis
-      os: windows
-      dist: 1803-containers
-      language: cpp
-      env:                                               CXX=cl.exe  CC=cl.exe                          CMAKE_BUILD_TYPE=Debug                              ALPAKA_CI_BOOST_BRANCH=boost-1.69.0 ALPAKA_CI_CMAKE_VER=3.13.5 ALPAKA_CI_ANALYSIS=ON  ALPAKA_DEBUG=2
-
-    ### macOS
-    - name: macOS 10.14 Xcode 10.2.1 Debug
-      os: osx
-      osx_image: xcode10.2
-      env:                                               CXX=g++     CC=gcc                             CMAKE_BUILD_TYPE=Debug                              ALPAKA_CI_BOOST_BRANCH=boost-1.67.0                            ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE=OFF ALPAKA_CXX_STANDARD=14
-    - name: macOS 10.14 Xcode 10.2.1 Release
-      os: osx
-      osx_image: xcode10.2
-      env:                                               CXX=g++     CC=gcc                             CMAKE_BUILD_TYPE=Release                            ALPAKA_CI_BOOST_BRANCH=boost-1.71.0                            ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE=OFF
-
-    - name: macOS 10.14.4 Xcode 10.3 Debug
-      os: osx
-      osx_image: xcode10.3
-      env:                                               CXX=g++     CC=gcc                             CMAKE_BUILD_TYPE=Debug                              ALPAKA_CI_BOOST_BRANCH=boost-1.67.0                            ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE=OFF ALPAKA_CXX_STANDARD=14
-    - name: macOS 10.14.4 Xcode 10.3 Release
-      os: osx
-      osx_image: xcode10.3
-      env:                                               CXX=g++     CC=gcc                             CMAKE_BUILD_TYPE=Release                            ALPAKA_CI_BOOST_BRANCH=boost-1.71.0                            ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE=OFF
-
-    - name: macOS 10.14 Xcode 11.0 Debug
-      os: osx
-      osx_image: xcode11
-      env:                                               CXX=g++     CC=gcc                             CMAKE_BUILD_TYPE=Debug                              ALPAKA_CI_BOOST_BRANCH=boost-1.67.0                            ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE=OFF ALPAKA_CXX_STANDARD=14
-    - name: macOS 10.14 Xcode 11.0 Release
-      os: osx
-      osx_image: xcode11
-      env:                                               CXX=g++     CC=gcc                             CMAKE_BUILD_TYPE=Release                            ALPAKA_CI_BOOST_BRANCH=boost-1.71.0                            ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE=OFF
-
-    - name: macOS 10.14 Xcode 11.1 Debug
-      os: osx
-      osx_image: xcode11.1
-      env:                                               CXX=g++     CC=gcc                             CMAKE_BUILD_TYPE=Debug                              ALPAKA_CI_BOOST_BRANCH=boost-1.67.0                            ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE=OFF ALPAKA_CXX_STANDARD=14
-    - name: macOS 10.14 Xcode 11.1 Release
-      os: osx
-      osx_image: xcode11.1
-      env:                                               CXX=g++     CC=gcc                             CMAKE_BUILD_TYPE=Release                            ALPAKA_CI_BOOST_BRANCH=boost-1.71.0                            ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE=OFF
-
-    - name: macOS 10.14 Xcode 11.2 Debug
-      os: osx
-      osx_image: xcode11.2
-      env:                                               CXX=g++     CC=gcc                             CMAKE_BUILD_TYPE=Debug                              ALPAKA_CI_BOOST_BRANCH=boost-1.67.0                            ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE=OFF ALPAKA_CXX_STANDARD=14
-    - name: macOS 10.14 Xcode 11.2 Release
-      os: osx
-      osx_image: xcode11.2
-      env:                                               CXX=g++     CC=gcc                             CMAKE_BUILD_TYPE=Release                            ALPAKA_CI_BOOST_BRANCH=boost-1.71.0                            ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE=OFF
-
-    ### Windows
-    - name: MSVC-2017 Release
-      os: windows
-      dist: 1803-containers
-      language: cpp
-      env:                                               CXX=cl.exe  CC=cl.exe                          CMAKE_BUILD_TYPE=Release                            ALPAKA_CI_BOOST_BRANCH=boost-1.71.0 ALPAKA_CI_CMAKE_VER=3.14.7 OMP_NUM_THREADS=4 ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE=OFF ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE=OFF
-    - name: MSVC-2017 Debug
-      os: windows
-      dist: 1803-containers
-      language: cpp
-      env:                                               CXX=cl.exe  CC=cl.exe                          CMAKE_BUILD_TYPE=Debug                              ALPAKA_CI_BOOST_BRANCH=boost-1.64.0 ALPAKA_CI_CMAKE_VER=3.11.4 OMP_NUM_THREADS=4 ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE=OFF ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE=OFF ALPAKA_CXX_STANDARD=14
-
-    ### Ubuntu
-    ## native
-    # g++
-    # We can not enable UBSan when using gcc because it does not have a -fsanitize-blacklist option to suppress errors in boost etc.
-    # gcc 6 ASan is triggered within libtbb.so
-    # gcc 7 ASan introduced 'stack-use-after-scope' which is triggered by GOMP_parallel
-    - name: gcc-4.9 Debug
-      env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:16.04 CXX=g++     CC=gcc   ALPAKA_CI_GCC_VER=4.9     CMAKE_BUILD_TYPE=Debug   ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.62.0 ALPAKA_CI_CMAKE_VER=3.11.4 OMP_NUM_THREADS=4 ALPAKA_CXX_STANDARD=11
-    - name: gcc-5 Release
-      env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:18.04 CXX=g++     CC=gcc   ALPAKA_CI_GCC_VER=5       CMAKE_BUILD_TYPE=Release ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.66.0 ALPAKA_CI_CMAKE_VER=3.16.0 OMP_NUM_THREADS=3
-    - name: gcc-6 Debug c++14
-      env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:18.04 CXX=g++     CC=gcc   ALPAKA_CI_GCC_VER=6       CMAKE_BUILD_TYPE=Debug   ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.70.0 ALPAKA_CI_CMAKE_VER=3.14.7 OMP_NUM_THREADS=2 ALPAKA_CXX_STANDARD=14
-    - name: gcc-7 Release
-      env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:14.04 CXX=g++     CC=gcc   ALPAKA_CI_GCC_VER=7       CMAKE_BUILD_TYPE=Release ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.63.0 ALPAKA_CI_CMAKE_VER=3.13.5 OMP_NUM_THREADS=3
-    - name: gcc-8 Debug
-      env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:16.04 CXX=g++     CC=gcc   ALPAKA_CI_GCC_VER=8       CMAKE_BUILD_TYPE=Debug   ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.71.0 ALPAKA_CI_CMAKE_VER=3.12.4 OMP_NUM_THREADS=4
-    - name: gcc-9 Debug c++17
-      env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:16.04 CXX=g++     CC=gcc   ALPAKA_CI_GCC_VER=9       CMAKE_BUILD_TYPE=Debug   ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.68.0 ALPAKA_CI_CMAKE_VER=3.15.5 OMP_NUM_THREADS=3 ALPAKA_CXX_STANDARD=17 ALPAKA_ACC_CPU_BT_OMP4_ENABLE=OFF
-
-    # clang++
-    - name: clang-4 Debug UBSan
-      env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:16.04 CXX=clang++ CC=clang ALPAKA_CI_CLANG_VER=4.0.0 CMAKE_BUILD_TYPE=Debug   ALPAKA_CI_STDLIB=libc++    ALPAKA_CI_BOOST_BRANCH=boost-1.69.0 ALPAKA_CI_CMAKE_VER=3.11.4 OMP_NUM_THREADS=4 ALPAKA_CI_SANITIZERS=UBSan
-    - name: clang-5 Debug c++14
-      env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:16.04 CXX=clang++ CC=clang ALPAKA_CI_CLANG_VER=5.0.2 CMAKE_BUILD_TYPE=Debug   ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.63.0 ALPAKA_CI_CMAKE_VER=3.14.7 OMP_NUM_THREADS=3 ALPAKA_CXX_STANDARD=14
-    - name: clang-6 Release ASan C++17
-      env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:18.04 CXX=clang++ CC=clang ALPAKA_CI_CLANG_VER=6.0.1 CMAKE_BUILD_TYPE=Release ALPAKA_CI_STDLIB=libc++    ALPAKA_CI_BOOST_BRANCH=boost-1.65.1 ALPAKA_CI_CMAKE_VER=3.15.5 OMP_NUM_THREADS=2 ALPAKA_CI_SANITIZERS=ASan ALPAKA_CXX_STANDARD=17
-    - name: clang-7 Release c++17
-      env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:18.04 CXX=clang++ CC=clang ALPAKA_CI_CLANG_VER=7.0.1 CMAKE_BUILD_TYPE=Release ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.67.0 ALPAKA_CI_CMAKE_VER=3.13.5 OMP_NUM_THREADS=2 ALPAKA_CXX_STANDARD=17 ALPAKA_CI_CLANG_LIBSTDCPP_VERSION=7
-    - name: clang-8 Release
-      env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:18.04 CXX=clang++ CC=clang ALPAKA_CI_CLANG_VER=8.0.0 CMAKE_BUILD_TYPE=Release ALPAKA_CI_STDLIB=libc++    ALPAKA_CI_BOOST_BRANCH=boost-1.70.0 ALPAKA_CI_CMAKE_VER=3.12.4 OMP_NUM_THREADS=4
-    - name: clang-9 Debug
-      env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:18.04 CXX=clang++ CC=clang ALPAKA_CI_CLANG_VER=9.0.0 CMAKE_BUILD_TYPE=Debug   ALPAKA_CI_STDLIB=libc++    ALPAKA_CI_BOOST_BRANCH=boost-1.71.0 ALPAKA_CI_CMAKE_VER=3.16.0 OMP_NUM_THREADS=3
-
-    ## CUDA 8.0
-    # nvcc + g++
-    - name: nvcc-8.0 + gcc-4.9 Debug
-      env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:14.04 CXX=g++     CC=gcc   ALPAKA_CI_GCC_VER=4.9     CMAKE_BUILD_TYPE=Debug   ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.62.0 ALPAKA_CI_CMAKE_VER=3.11.4 ALPAKA_ACC_GPU_CUDA_ENABLE=ON  ALPAKA_CUDA_VERSION=8.0 ALPAKA_CUDA_COMPILER=nvcc ALPAKA_CUDA_ARCH="20;60" ALPAKA_CXX_STANDARD=11
-    # clang++
-    - name: clang-4 + CUDA-8.0 Release
-      env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:16.04 CXX=clang++ CC=clang ALPAKA_CI_CLANG_VER=4.0.0 CMAKE_BUILD_TYPE=Release ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.68.0 ALPAKA_CI_CMAKE_VER=3.12.4 ALPAKA_ACC_GPU_CUDA_ENABLE=ON  ALPAKA_CUDA_VERSION=8.0 ALPAKA_CUDA_COMPILER=clang
-    - name: clang-5 + CUDA-8.0 Release ALPAKA_ACC_GPU_CUDA_ONLY_MODE
-      env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:18.04 CXX=clang++ CC=clang ALPAKA_CI_CLANG_VER=5.0.2 CMAKE_BUILD_TYPE=Debug   ALPAKA_CI_STDLIB=libc++    ALPAKA_CI_BOOST_BRANCH=boost-1.71.0 ALPAKA_CI_CMAKE_VER=3.13.5 ALPAKA_ACC_GPU_CUDA_ENABLE=ON  ALPAKA_CUDA_VERSION=8.0 ALPAKA_CUDA_COMPILER=clang ALPAKA_CUDA_ARCH="20;35" ALPAKA_ACC_GPU_CUDA_ONLY_MODE=ON
-    - name: clang-6 + CUDA-8.0 Release
-      env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:16.04 CXX=clang++ CC=clang ALPAKA_CI_CLANG_VER=6.0.1 CMAKE_BUILD_TYPE=Release ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.64.0 ALPAKA_CI_CMAKE_VER=3.16.0 ALPAKA_ACC_GPU_CUDA_ENABLE=ON  ALPAKA_CUDA_VERSION=8.0 ALPAKA_CUDA_COMPILER=clang
-    - name: clang-7 + CUDA-8.0 Debug
-      env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:16.04 CXX=clang++ CC=clang ALPAKA_CI_CLANG_VER=7.0.1 CMAKE_BUILD_TYPE=Debug   ALPAKA_CI_STDLIB=libc++    ALPAKA_CI_BOOST_BRANCH=boost-1.70.0 ALPAKA_CI_CMAKE_VER=3.14.7 ALPAKA_ACC_GPU_CUDA_ENABLE=ON  ALPAKA_CUDA_VERSION=8.0 ALPAKA_CUDA_COMPILER=clang
-    - name: clang-8 + CUDA-8.0 Release
-      env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:16.04 CXX=clang++ CC=clang ALPAKA_CI_CLANG_VER=8.0.0 CMAKE_BUILD_TYPE=Release ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.67.0 ALPAKA_CI_CMAKE_VER=3.15.5 ALPAKA_ACC_GPU_CUDA_ENABLE=ON  ALPAKA_CUDA_VERSION=8.0 ALPAKA_CUDA_COMPILER=clang
-
-    ## CUDA 9.0
-    # nvcc + g++
-    - name: nvcc-9.0 + gcc-4.9 Release
-      env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:16.04 CXX=g++     CC=gcc   ALPAKA_CI_GCC_VER=4.9     CMAKE_BUILD_TYPE=Release ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.70.0 ALPAKA_CI_CMAKE_VER=3.14.7 ALPAKA_ACC_GPU_CUDA_ENABLE=ON  ALPAKA_CUDA_VERSION=9.0 ALPAKA_CUDA_COMPILER=nvcc
-    - name: nvcc-9.0 + gcc-5 Debug
-      env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:18.04 CXX=g++     CC=gcc   ALPAKA_CI_GCC_VER=5       CMAKE_BUILD_TYPE=Debug   ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.65.1 ALPAKA_CI_CMAKE_VER=3.11.4 ALPAKA_ACC_GPU_CUDA_ENABLE=ON  ALPAKA_CUDA_VERSION=9.0 ALPAKA_CUDA_COMPILER=nvcc ALPAKA_CUDA_ARCH="70"
-    # clang++
-    - name: clang-6 + CUDA-9.0 Debug
-      env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:16.04 CXX=clang++ CC=clang ALPAKA_CI_CLANG_VER=6.0.1 CMAKE_BUILD_TYPE=Debug   ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.67.0 ALPAKA_CI_CMAKE_VER=3.13.5 ALPAKA_ACC_GPU_CUDA_ENABLE=ON  ALPAKA_CUDA_VERSION=9.0 ALPAKA_CUDA_COMPILER=clang ALPAKA_CUDA_ARCH="35"
-    - name: clang-7 + CUDA-9.0 Release
-      env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:16.04 CXX=clang++ CC=clang ALPAKA_CI_CLANG_VER=7.0.1 CMAKE_BUILD_TYPE=Release ALPAKA_CI_STDLIB=libc++    ALPAKA_CI_BOOST_BRANCH=boost-1.71.0 ALPAKA_CI_CMAKE_VER=3.15.5 ALPAKA_ACC_GPU_CUDA_ENABLE=ON  ALPAKA_CUDA_VERSION=9.0 ALPAKA_CUDA_COMPILER=clang ALPAKA_CUDA_ARCH="35;70"
-    - name: clang-8 + CUDA-9.0 Debug
-      env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:16.04 CXX=clang++ CC=clang ALPAKA_CI_CLANG_VER=8.0.0 CMAKE_BUILD_TYPE=Debug   ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.68.0 ALPAKA_CI_CMAKE_VER=3.14.7 ALPAKA_ACC_GPU_CUDA_ENABLE=ON  ALPAKA_CUDA_VERSION=9.0 ALPAKA_CUDA_COMPILER=clang
-
-    ## CUDA 9.1
-    # nvcc + g++
-    - name: nvcc-9.1 + gcc-4.9 Debug ALPAKA_ACC_GPU_CUDA_ONLY_MODE
-      env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:14.04 CXX=g++     CC=gcc   ALPAKA_CI_GCC_VER=4.9     CMAKE_BUILD_TYPE=Debug   ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.71.0 ALPAKA_CI_CMAKE_VER=3.13.5 ALPAKA_ACC_GPU_CUDA_ENABLE=ON  ALPAKA_CUDA_VERSION=9.1 ALPAKA_CUDA_COMPILER=nvcc ALPAKA_CUDA_ARCH="30;72" ALPAKA_ACC_GPU_CUDA_ONLY_MODE=ON
-    - name: nvcc-9.1 + gcc-5 Release
-      env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:18.04 CXX=g++     CC=gcc   ALPAKA_CI_GCC_VER=5       CMAKE_BUILD_TYPE=Release ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.65.1 ALPAKA_CI_CMAKE_VER=3.11.4 ALPAKA_ACC_GPU_CUDA_ENABLE=ON  ALPAKA_CUDA_VERSION=9.1 ALPAKA_CUDA_COMPILER=nvcc
-    # nvcc + clang++
-    - name: nvcc-9.1 + clang-4 Debug
-      env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:16.04 CXX=clang++ CC=clang ALPAKA_CI_CLANG_VER=4.0.0 CMAKE_BUILD_TYPE=Debug   ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.70.0 ALPAKA_CI_CMAKE_VER=3.11.4 ALPAKA_ACC_GPU_CUDA_ENABLE=ON  ALPAKA_CUDA_VERSION=9.1 ALPAKA_CUDA_COMPILER=nvcc ALPAKA_CUDA_ARCH="30;70"
-    # clang++
-    - name: clang-7 + CUDA-9.1 Release
-      env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:16.04 CXX=clang++ CC=clang ALPAKA_CI_CLANG_VER=7.0.1 CMAKE_BUILD_TYPE=Release ALPAKA_CI_STDLIB=libc++    ALPAKA_CI_BOOST_BRANCH=boost-1.68.0 ALPAKA_CI_CMAKE_VER=3.15.5 ALPAKA_ACC_GPU_CUDA_ENABLE=ON  ALPAKA_CUDA_VERSION=9.1 ALPAKA_CUDA_COMPILER=clang ALPAKA_CUDA_ARCH="35;72"
-    - name: clang-8 + CUDA-9.1 Release
-      env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:16.04 CXX=clang++ CC=clang ALPAKA_CI_CLANG_VER=8.0.0 CMAKE_BUILD_TYPE=Release ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.67.0 ALPAKA_CI_CMAKE_VER=3.14.7 ALPAKA_ACC_GPU_CUDA_ENABLE=ON  ALPAKA_CUDA_VERSION=9.1 ALPAKA_CUDA_COMPILER=clang
-
-    ## CUDA 9.2
-    # nvcc + g++
-    - name: nvcc-9.2 + gcc-4.9 Debug
-      env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:16.04 CXX=g++     CC=gcc   ALPAKA_CI_GCC_VER=4.9     CMAKE_BUILD_TYPE=Debug   ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.67.0 ALPAKA_CI_CMAKE_VER=3.11.4 ALPAKA_ACC_GPU_CUDA_ENABLE=ON  ALPAKA_CUDA_VERSION=9.2 ALPAKA_CUDA_COMPILER=nvcc ALPAKA_CUDA_ARCH="30;72"
-    - name: nvcc-9.2 + gcc-5 Release
-      env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:14.04 CXX=g++     CC=gcc   ALPAKA_CI_GCC_VER=5       CMAKE_BUILD_TYPE=Release ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.68.0 ALPAKA_CI_CMAKE_VER=3.12.4 ALPAKA_ACC_GPU_CUDA_ENABLE=ON  ALPAKA_CUDA_VERSION=9.2 ALPAKA_CUDA_COMPILER=nvcc
-    - name: nvcc-9.2 + gcc-6 Debug separable compilation
-      env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:16.04 CXX=g++     CC=gcc   ALPAKA_CI_GCC_VER=6       CMAKE_BUILD_TYPE=Debug   ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.65.1 ALPAKA_CI_CMAKE_VER=3.15.5 ALPAKA_ACC_GPU_CUDA_ENABLE=ON  ALPAKA_CUDA_VERSION=9.2 ALPAKA_CUDA_COMPILER=nvcc ALPAKA_CUDA_NVCC_SEPARABLE_COMPILATION=ON
-    - name: nvcc-9.2 + gcc-7 Release + relaxed constexpr off + extended lambda off
-      env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:18.04 CXX=g++     CC=gcc   ALPAKA_CI_GCC_VER=7       CMAKE_BUILD_TYPE=Release ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.69.0 ALPAKA_CI_CMAKE_VER=3.13.5 ALPAKA_ACC_GPU_CUDA_ENABLE=ON  ALPAKA_CUDA_VERSION=9.2 ALPAKA_CUDA_COMPILER=nvcc ALPAKA_CUDA_ARCH="30;35" ALPAKA_CUDA_NVCC_EXPT_RELAXED_CONSTEXPR=OFF ALPAKA_CUDA_NVCC_EXPT_EXTENDED_LAMBDA=OFF
-    # nvcc + clang++
-    - name: nvcc-9.2 + clang-4 Debug
-      env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:16.04 CXX=clang++ CC=clang ALPAKA_CI_CLANG_VER=4.0.0 CMAKE_BUILD_TYPE=Debug   ALPAKA_CI_STDLIB=libc++    ALPAKA_CI_BOOST_BRANCH=boost-1.65.1 ALPAKA_CI_CMAKE_VER=3.14.7 ALPAKA_ACC_GPU_CUDA_ENABLE=ON  ALPAKA_CUDA_VERSION=9.2 ALPAKA_CUDA_COMPILER=nvcc ALPAKA_CUDA_ARCH="30;70"
-    # clang++
-    - name: clang-7 + CUDA-9.2 Release c++17
-      env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:16.04 CXX=clang++ CC=clang ALPAKA_CI_CLANG_VER=7.0.1 CMAKE_BUILD_TYPE=Release ALPAKA_CI_STDLIB=libc++    ALPAKA_CI_BOOST_BRANCH=boost-1.67.0 ALPAKA_CI_CMAKE_VER=3.11.4 ALPAKA_ACC_GPU_CUDA_ENABLE=ON  ALPAKA_CUDA_VERSION=9.2 ALPAKA_CUDA_COMPILER=clang ALPAKA_CUDA_ARCH="35;72" ALPAKA_CXX_STANDARD=17
-    - name: clang-8 + CUDA-9.2 Debug
-      env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:16.04 CXX=clang++ CC=clang ALPAKA_CI_CLANG_VER=8.0.0 CMAKE_BUILD_TYPE=Debug   ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.70.0 ALPAKA_CI_CMAKE_VER=3.14.7 ALPAKA_ACC_GPU_CUDA_ENABLE=ON  ALPAKA_CUDA_VERSION=9.2 ALPAKA_CUDA_COMPILER=clang
-    - name: clang-9 + CUDA-9.2 Release
-      env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:16.04 CXX=clang++ CC=clang ALPAKA_CI_CLANG_VER=9.0.0 CMAKE_BUILD_TYPE=Release ALPAKA_CI_STDLIB=libc++    ALPAKA_CI_BOOST_BRANCH=boost-1.69.0 ALPAKA_CI_CMAKE_VER=3.16.0 ALPAKA_ACC_GPU_CUDA_ENABLE=ON  ALPAKA_CUDA_VERSION=9.2 ALPAKA_CUDA_COMPILER=clang
-
-    ## CUDA 10.0
-    # nvcc + g++
-    - name: nvcc-10.0 + gcc-4.9 Debug
-      env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:14.04 CXX=g++     CC=gcc   ALPAKA_CI_GCC_VER=4.9     CMAKE_BUILD_TYPE=Debug   ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.69.0 ALPAKA_CI_CMAKE_VER=3.11.4 ALPAKA_ACC_GPU_CUDA_ENABLE=ON  ALPAKA_CUDA_VERSION=10.0 ALPAKA_CUDA_COMPILER=nvcc ALPAKA_CUDA_ARCH="30;75"
-    - name: nvcc-10.0 + gcc-5 Release c++14
-      env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:16.04 CXX=g++     CC=gcc   ALPAKA_CI_GCC_VER=5       CMAKE_BUILD_TYPE=Release ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.68.0 ALPAKA_CI_CMAKE_VER=3.12.4 ALPAKA_ACC_GPU_CUDA_ENABLE=ON  ALPAKA_CUDA_VERSION=10.0 ALPAKA_CUDA_COMPILER=nvcc ALPAKA_CXX_STANDARD=14
-    - name: nvcc-10.0 + gcc-6 Debug
-      env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:14.04 CXX=g++     CC=gcc   ALPAKA_CI_GCC_VER=6       CMAKE_BUILD_TYPE=Debug   ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.65.1 ALPAKA_CI_CMAKE_VER=3.15.5 ALPAKA_ACC_GPU_CUDA_ENABLE=ON  ALPAKA_CUDA_VERSION=10.0 ALPAKA_CUDA_COMPILER=nvcc
-    - name: nvcc-10.0 + gcc-7 Release c++14
-      env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:18.04 CXX=g++     CC=gcc   ALPAKA_CI_GCC_VER=7       CMAKE_BUILD_TYPE=Release ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.67.0 ALPAKA_CI_CMAKE_VER=3.14.7 ALPAKA_ACC_GPU_CUDA_ENABLE=ON  ALPAKA_CUDA_VERSION=10.0 ALPAKA_CUDA_COMPILER=nvcc ALPAKA_CUDA_ARCH="30;35" ALPAKA_CXX_STANDARD=14
-    # nvcc + clang++
-    - name: nvcc-10.0 + clang-4 Debug
-      env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:16.04 CXX=clang++ CC=clang ALPAKA_CI_CLANG_VER=4.0.0 CMAKE_BUILD_TYPE=Debug   ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.65.1 ALPAKA_CI_CMAKE_VER=3.16.0 ALPAKA_ACC_GPU_CUDA_ENABLE=ON  ALPAKA_CUDA_VERSION=10.0 ALPAKA_CUDA_COMPILER=nvcc ALPAKA_CUDA_ARCH="30;60"
-    - name: nvcc-10.0 + clang-5 Debug separable compilation
-      env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:16.04 CXX=clang++ CC=clang ALPAKA_CI_CLANG_VER=5.0.2 CMAKE_BUILD_TYPE=Release ALPAKA_CI_STDLIB=libc++    ALPAKA_CI_BOOST_BRANCH=boost-1.71.0 ALPAKA_CI_CMAKE_VER=3.13.5 ALPAKA_ACC_GPU_CUDA_ENABLE=ON  ALPAKA_CUDA_VERSION=10.0 ALPAKA_CUDA_COMPILER=nvcc ALPAKA_CUDA_ARCH="75" ALPAKA_CUDA_NVCC_SEPARABLE_COMPILATION=ON
-    - name: nvcc-10.0 + clang-6 Debug c++14
-      env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:16.04 CXX=clang++ CC=clang ALPAKA_CI_CLANG_VER=6.0.1 CMAKE_BUILD_TYPE=Debug   ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.65.1 ALPAKA_CI_CMAKE_VER=3.11.4 ALPAKA_ACC_GPU_CUDA_ENABLE=ON  ALPAKA_CUDA_VERSION=10.0 ALPAKA_CUDA_COMPILER=nvcc ALPAKA_CUDA_ARCH="70" ALPAKA_CXX_STANDARD=14
-    # nvcc + MSVC
-    - name: nvcc-10.0 + MSVC-2017 Release ALPAKA_ACC_GPU_CUDA_ONLY_MODE separable compilation
-      os: windows
-      dist: 1803-containers
-      language: cpp
-      env:                                               CXX=cl.exe  CC=cl.exe                          CMAKE_BUILD_TYPE=Release                            ALPAKA_CI_BOOST_BRANCH=boost-1.67.0 ALPAKA_CI_CMAKE_VER=3.11.4 ALPAKA_ACC_GPU_CUDA_ENABLE=ON  ALPAKA_CUDA_VERSION=10.0 ALPAKA_CUDA_ARCH="30;75" ALPAKA_ACC_GPU_CUDA_ONLY_MODE=ON ALPAKA_CUDA_NVCC_SEPARABLE_COMPILATION=ON
-    - name: nvcc-10.0 + MSVC-2017 Debug (Only one CPU backend enabled due to compile time)
-      os: windows
-      dist: 1803-containers
-      language: cpp
-      env:                                               CXX=cl.exe  CC=cl.exe                          CMAKE_BUILD_TYPE=Debug                              ALPAKA_CI_BOOST_BRANCH=boost-1.66.0 ALPAKA_CI_CMAKE_VER=3.15.5 ALPAKA_ACC_GPU_CUDA_ENABLE=ON  ALPAKA_CUDA_VERSION=10.0 ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLE=OFF ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLE=OFF ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE=OFF ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE=OFF ALPAKA_ACC_CPU_BT_OMP4_ENABLE=OFF ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLE=OFF
-    # clang++
-    - name: clang-8 + CUDA-10.0 Release
-      env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:16.04 CXX=clang++ CC=clang ALPAKA_CI_CLANG_VER=8.0.0 CMAKE_BUILD_TYPE=Release ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.67.0 ALPAKA_CI_CMAKE_VER=3.14.7 ALPAKA_ACC_GPU_CUDA_ENABLE=ON  ALPAKA_CUDA_VERSION=10.0 ALPAKA_CUDA_COMPILER=clang
-    - name: clang-9 + CUDA-10.0 Debug
-      env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:16.04 CXX=clang++ CC=clang ALPAKA_CI_CLANG_VER=9.0.0 CMAKE_BUILD_TYPE=Debug   ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.67.0 ALPAKA_CI_CMAKE_VER=3.14.7 ALPAKA_ACC_GPU_CUDA_ENABLE=ON  ALPAKA_CUDA_VERSION=10.0 ALPAKA_CUDA_COMPILER=clang
-
-    ## CUDA 10.1
-    # nvcc + g++
-    - name: nvcc-10.1 + gcc-4.9 Debug
-      env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:14.04 CXX=g++     CC=gcc   ALPAKA_CI_GCC_VER=4.9     CMAKE_BUILD_TYPE=Debug   ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.71.0 ALPAKA_CI_CMAKE_VER=3.11.4 ALPAKA_ACC_GPU_CUDA_ENABLE=ON  ALPAKA_CUDA_VERSION=10.1 ALPAKA_CUDA_COMPILER=nvcc ALPAKA_CUDA_ARCH="30;75"
-    - name: nvcc-10.1 + gcc-5 Release
-      env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:16.04 CXX=g++     CC=gcc   ALPAKA_CI_GCC_VER=5       CMAKE_BUILD_TYPE=Release ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.68.0 ALPAKA_CI_CMAKE_VER=3.12.4 ALPAKA_ACC_GPU_CUDA_ENABLE=ON  ALPAKA_CUDA_VERSION=10.1 ALPAKA_CUDA_COMPILER=nvcc
-    - name: nvcc-10.1 + gcc-6 Debug
-      env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:14.04 CXX=g++     CC=gcc   ALPAKA_CI_GCC_VER=6       CMAKE_BUILD_TYPE=Debug   ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.65.1 ALPAKA_CI_CMAKE_VER=3.15.5 ALPAKA_ACC_GPU_CUDA_ENABLE=ON  ALPAKA_CUDA_VERSION=10.1 ALPAKA_CUDA_COMPILER=nvcc
-    - name: nvcc-10.1 + gcc-7 Release
-      env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:18.04 CXX=g++     CC=gcc   ALPAKA_CI_GCC_VER=7       CMAKE_BUILD_TYPE=Release ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.67.0 ALPAKA_CI_CMAKE_VER=3.11.4 ALPAKA_ACC_GPU_CUDA_ENABLE=ON  ALPAKA_CUDA_VERSION=10.1 ALPAKA_CUDA_COMPILER=nvcc ALPAKA_CUDA_ARCH="30;35"
-    - name: nvcc-10.1 + gcc-8 Debug
-      env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:18.04 CXX=g++     CC=gcc   ALPAKA_CI_GCC_VER=8       CMAKE_BUILD_TYPE=Debug   ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.69.0 ALPAKA_CI_CMAKE_VER=3.13.5 ALPAKA_ACC_GPU_CUDA_ENABLE=ON  ALPAKA_CUDA_VERSION=10.1 ALPAKA_CUDA_COMPILER=nvcc ALPAKA_CUDA_ARCH="30;35"
-    # nvcc + clang++
-    - name: nvcc-10.1 + clang-4 Debug
-      env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:16.04 CXX=clang++ CC=clang ALPAKA_CI_CLANG_VER=4.0.0 CMAKE_BUILD_TYPE=Debug   ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.65.1 ALPAKA_CI_CMAKE_VER=3.15.5 ALPAKA_ACC_GPU_CUDA_ENABLE=ON  ALPAKA_CUDA_VERSION=10.1 ALPAKA_CUDA_COMPILER=nvcc ALPAKA_CUDA_ARCH="30;60"
-    - name: nvcc-10.1 + clang-5 Release ALPAKA_ACC_GPU_CUDA_ONLY_MODE
-      env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:16.04 CXX=clang++ CC=clang ALPAKA_CI_CLANG_VER=5.0.2 CMAKE_BUILD_TYPE=Release ALPAKA_CI_STDLIB=libc++    ALPAKA_CI_BOOST_BRANCH=boost-1.69.0 ALPAKA_CI_CMAKE_VER=3.13.5 ALPAKA_ACC_GPU_CUDA_ENABLE=ON  ALPAKA_CUDA_VERSION=10.1 ALPAKA_CUDA_COMPILER=nvcc ALPAKA_CUDA_ARCH="75" ALPAKA_ACC_GPU_CUDA_ONLY_MODE=ON
-    - name: nvcc-10.1 + clang-6 Debug
-      env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:16.04 CXX=clang++ CC=clang ALPAKA_CI_CLANG_VER=6.0.1 CMAKE_BUILD_TYPE=Debug   ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.71.0 ALPAKA_CI_CMAKE_VER=3.11.4 ALPAKA_ACC_GPU_CUDA_ENABLE=ON  ALPAKA_CUDA_VERSION=10.1 ALPAKA_CUDA_COMPILER=nvcc ALPAKA_CUDA_ARCH="70"
-    - name: nvcc-10.1 + clang-7 Release
-      env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:16.04 CXX=clang++ CC=clang ALPAKA_CI_CLANG_VER=7.0.1 CMAKE_BUILD_TYPE=Release ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.69.0 ALPAKA_CI_CMAKE_VER=3.12.4 ALPAKA_ACC_GPU_CUDA_ENABLE=ON  ALPAKA_CUDA_VERSION=10.1 ALPAKA_CUDA_COMPILER=nvcc ALPAKA_CUDA_ARCH="70"
-    - name: nvcc-10.1 + clang-8 Debug
-      env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:16.04 CXX=clang++ CC=clang ALPAKA_CI_CLANG_VER=8.0.0 CMAKE_BUILD_TYPE=Debug   ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.70.0 ALPAKA_CI_CMAKE_VER=3.14.7 ALPAKA_ACC_GPU_CUDA_ENABLE=ON  ALPAKA_CUDA_VERSION=10.1 ALPAKA_CUDA_COMPILER=nvcc ALPAKA_CUDA_ARCH="75"
-    # nvcc + MSVC
-    - name: nvcc-10.1 + MSVC-2017 Debug ALPAKA_ACC_GPU_CUDA_ONLY_MODE
-      os: windows
-      dist: 1803-containers
-      language: cpp
-      env:                                               CXX=cl.exe  CC=cl.exe                          CMAKE_BUILD_TYPE=Debug                              ALPAKA_CI_BOOST_BRANCH=boost-1.67.0 ALPAKA_CI_CMAKE_VER=3.11.4 ALPAKA_ACC_GPU_CUDA_ENABLE=ON  ALPAKA_CUDA_VERSION=10.1 ALPAKA_CUDA_ARCH="30;75" ALPAKA_ACC_GPU_CUDA_ONLY_MODE=ON
-    - name: nvcc-10.1 + MSVC-2017 Release (Only one CPU backend enabled due to compile time)
-      os: windows
-      dist: 1803-containers
-      language: cpp
-      env:                                               CXX=cl.exe  CC=cl.exe                          CMAKE_BUILD_TYPE=Release                            ALPAKA_CI_BOOST_BRANCH=boost-1.65.1 ALPAKA_CI_CMAKE_VER=3.14.7 ALPAKA_ACC_GPU_CUDA_ENABLE=ON  ALPAKA_CUDA_VERSION=10.1 ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLE=OFF ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLE=OFF ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE=OFF ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE=OFF ALPAKA_ACC_CPU_BT_OMP4_ENABLE=OFF ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLE=OFF
-    # clang++
-    - name: clang-9 + CUDA-10.1 Debug
-      env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:16.04 CXX=clang++ CC=clang ALPAKA_CI_CLANG_VER=9.0.0 CMAKE_BUILD_TYPE=Debug   ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.67.0 ALPAKA_CI_CMAKE_VER=3.14.7 ALPAKA_ACC_GPU_CUDA_ENABLE=ON  ALPAKA_CUDA_VERSION=10.1 ALPAKA_CUDA_COMPILER=clang
-
-    ## CUDA 10.2
-    # nvcc + g++
-    - name: nvcc-10.2 + gcc-4.9 Release
-      env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:14.04 CXX=g++     CC=gcc   ALPAKA_CI_GCC_VER=4.9     CMAKE_BUILD_TYPE=Release ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.71.0 ALPAKA_CI_CMAKE_VER=3.11.4 ALPAKA_ACC_GPU_CUDA_ENABLE=ON  ALPAKA_CUDA_VERSION=10.2 ALPAKA_CUDA_COMPILER=nvcc ALPAKA_CUDA_ARCH="30;75"
-    - name: nvcc-10.2 + gcc-5 Debug
-      env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:16.04 CXX=g++     CC=gcc   ALPAKA_CI_GCC_VER=5       CMAKE_BUILD_TYPE=Debug   ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.68.0 ALPAKA_CI_CMAKE_VER=3.12.4 ALPAKA_ACC_GPU_CUDA_ENABLE=ON  ALPAKA_CUDA_VERSION=10.2 ALPAKA_CUDA_COMPILER=nvcc ALPAKA_CUDA_ARCH="30;35"
-    - name: nvcc-10.2 + gcc-6 Release
-      env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:14.04 CXX=g++     CC=gcc   ALPAKA_CI_GCC_VER=6       CMAKE_BUILD_TYPE=Release ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.65.1 ALPAKA_CI_CMAKE_VER=3.16.0 ALPAKA_ACC_GPU_CUDA_ENABLE=ON  ALPAKA_CUDA_VERSION=10.2 ALPAKA_CUDA_COMPILER=nvcc
-    - name: nvcc-10.2 + gcc-7 Debug
-      env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:18.04 CXX=g++     CC=gcc   ALPAKA_CI_GCC_VER=7       CMAKE_BUILD_TYPE=Debug   ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.67.0 ALPAKA_CI_CMAKE_VER=3.11.4 ALPAKA_ACC_GPU_CUDA_ENABLE=ON  ALPAKA_CUDA_VERSION=10.2 ALPAKA_CUDA_COMPILER=nvcc
-    - name: nvcc-10.2 + gcc-8 Release
-      env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:18.04 CXX=g++     CC=gcc   ALPAKA_CI_GCC_VER=8       CMAKE_BUILD_TYPE=Release ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.69.0 ALPAKA_CI_CMAKE_VER=3.13.5 ALPAKA_ACC_GPU_CUDA_ENABLE=ON  ALPAKA_CUDA_VERSION=10.2 ALPAKA_CUDA_COMPILER=nvcc ALPAKA_CUDA_ARCH="30;35"
-    # nvcc + clang++
-    - name: nvcc-10.2 + clang-4 Release
-      env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:16.04 CXX=clang++ CC=clang ALPAKA_CI_CLANG_VER=4.0.0 CMAKE_BUILD_TYPE=Release ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.65.1 ALPAKA_CI_CMAKE_VER=3.15.5 ALPAKA_ACC_GPU_CUDA_ENABLE=ON  ALPAKA_CUDA_VERSION=10.2 ALPAKA_CUDA_COMPILER=nvcc ALPAKA_CUDA_ARCH="30;60"
-    - name: nvcc-10.2 + clang-5 Debug
-      env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:16.04 CXX=clang++ CC=clang ALPAKA_CI_CLANG_VER=5.0.2 CMAKE_BUILD_TYPE=Debug   ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.67.0 ALPAKA_CI_CMAKE_VER=3.13.5 ALPAKA_ACC_GPU_CUDA_ENABLE=ON  ALPAKA_CUDA_VERSION=10.2 ALPAKA_CUDA_COMPILER=nvcc ALPAKA_CUDA_ARCH="75"
-    - name: nvcc-10.2 + clang-6 Release ALPAKA_ACC_GPU_CUDA_ONLY_MODE
-      env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:16.04 CXX=clang++ CC=clang ALPAKA_CI_CLANG_VER=6.0.1 CMAKE_BUILD_TYPE=Release ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.69.0 ALPAKA_CI_CMAKE_VER=3.11.4 ALPAKA_ACC_GPU_CUDA_ENABLE=ON  ALPAKA_CUDA_VERSION=10.2 ALPAKA_CUDA_COMPILER=nvcc ALPAKA_CUDA_ARCH="70" ALPAKA_ACC_GPU_CUDA_ONLY_MODE=ON
-    - name: nvcc-10.2 + clang-7 Debug
-      env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:16.04 CXX=clang++ CC=clang ALPAKA_CI_CLANG_VER=7.0.1 CMAKE_BUILD_TYPE=Debug   ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.70.0 ALPAKA_CI_CMAKE_VER=3.12.4 ALPAKA_ACC_GPU_CUDA_ENABLE=ON  ALPAKA_CUDA_VERSION=10.2 ALPAKA_CUDA_COMPILER=nvcc ALPAKA_CUDA_ARCH="75"
-    - name: nvcc-10.2 + clang-8 Release
-      env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:16.04 CXX=clang++ CC=clang ALPAKA_CI_CLANG_VER=8.0.0 CMAKE_BUILD_TYPE=Release ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.70.0 ALPAKA_CI_CMAKE_VER=3.12.4 ALPAKA_ACC_GPU_CUDA_ENABLE=ON  ALPAKA_CUDA_VERSION=10.2 ALPAKA_CUDA_COMPILER=nvcc ALPAKA_CUDA_ARCH="75"
-    # nvcc + MSVC
-    - name: nvcc-10.2 + MSVC-2017 Debug ALPAKA_ACC_GPU_CUDA_ONLY_MODE
-      os: windows
-      dist: 1803-containers
-      language: cpp
-      env:                                               CXX=cl.exe  CC=cl.exe                          CMAKE_BUILD_TYPE=Debug                              ALPAKA_CI_BOOST_BRANCH=boost-1.67.0 ALPAKA_CI_CMAKE_VER=3.11.4 ALPAKA_ACC_GPU_CUDA_ENABLE=ON  ALPAKA_CUDA_VERSION=10.2 ALPAKA_CUDA_ARCH="30;75" ALPAKA_ACC_GPU_CUDA_ONLY_MODE=ON
-    - name: nvcc-10.2 + MSVC-2017 Release (Only one CPU backend enabled due to compile time)
-      os: windows
-      dist: 1803-containers
-      language: cpp
-      env:                                               CXX=cl.exe  CC=cl.exe                          CMAKE_BUILD_TYPE=Release                            ALPAKA_CI_BOOST_BRANCH=boost-1.65.1 ALPAKA_CI_CMAKE_VER=3.14.7 ALPAKA_ACC_GPU_CUDA_ENABLE=ON  ALPAKA_CUDA_VERSION=10.2 ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLE=OFF ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLE=OFF ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE=OFF ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE=OFF ALPAKA_ACC_CPU_BT_OMP4_ENABLE=OFF ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLE=OFF
-
-    ## HIP
-    - name: HIP(nvcc9.2) + gcc-5 Debug ALPAKA_ACC_GPU_HIP_ONLY_MODE
-      env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:16.04 CXX=g++     CC=gcc   ALPAKA_CI_GCC_VER=5       CMAKE_BUILD_TYPE=Debug   ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.67.0 ALPAKA_CI_CMAKE_VER=3.11.4 ALPAKA_ACC_GPU_HIP_ENABLE=ON ALPAKA_ACC_GPU_HIP_ONLY_MODE=ON ALPAKA_CI_HIP_BRANCH="roc-2.8.0" ALPAKA_HIP_PLATFORM=nvcc ALPAKA_CUDA_ARCH="30;35" ALPAKA_CUDA_VERSION=9.2 ALPAKA_CUDA_COMPILER=nvcc ALPAKA_CUDA_NVCC_EXPT_RELAXED_CONSTEXPR=OFF ALPAKA_CUDA_NVCC_EXPT_EXTENDED_LAMBDA=OFF
-    - name: HIP(nvcc9.2) + gcc-5 Release ALPAKA_ACC_GPU_HIP_ONLY_MODE
-      env: ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=ubuntu:16.04 CXX=g++     CC=gcc   ALPAKA_CI_GCC_VER=5       CMAKE_BUILD_TYPE=Release ALPAKA_CI_STDLIB=libstdc++ ALPAKA_CI_BOOST_BRANCH=boost-1.71.0 ALPAKA_CI_CMAKE_VER=3.15.5 ALPAKA_ACC_GPU_HIP_ENABLE=ON ALPAKA_ACC_GPU_HIP_ONLY_MODE=ON ALPAKA_CI_HIP_BRANCH="roc-2.8.0" ALPAKA_HIP_PLATFORM=nvcc ALPAKA_CUDA_ARCH="30;35" ALPAKA_CUDA_VERSION=9.2 ALPAKA_CUDA_COMPILER=nvcc
-
-branches:
-    except:
-        - gh-pages
-
-cache:
-    directories:
-        - $ALPAKA_CI_DOCKER_CACHE_DIR
-
-script:
-    - set -eovx pipefail
-    - if [ "$TRAVIS_OS_NAME" = "linux" ] ;then sudo apt-get -y --quiet --allow-unauthenticated --no-install-recommends install smem ;fi
-    - if [ "$TRAVIS_OS_NAME" = "linux" ] ;then sudo apt-get -y --quiet --allow-unauthenticated --no-install-recommends install moreutils ;fi
-    - if [ "$TRAVIS_OS_NAME" = "osx" ] ;then brew install moreutils ;fi
-    - if [ "$TRAVIS_OS_NAME" = "linux" ] || [ "$TRAVIS_OS_NAME" = "osx" ] ;then ./script/travis/script.sh | ts ;fi
-    - if [ "$TRAVIS_OS_NAME" = "windows" ] ;then ./script/travis/script.sh ;fi
-
-after_failure:
-    - ./script/travis/after_failure.sh
-
-notifications:
-    email: false
diff --git a/thirdParty/cupla/alpaka/.zenodo.json b/thirdParty/cupla/alpaka/.zenodo.json
index 80d29a6ce4..f57fd507de 100644
--- a/thirdParty/cupla/alpaka/.zenodo.json
+++ b/thirdParty/cupla/alpaka/.zenodo.json
@@ -1,30 +1,101 @@
 {
-  "title": "Alpaka: Abstraction Library for Parallel Kernel Acceleration",
-  "description": "The alpaka library is a header-only C++11 abstraction library for accelerator development. Its aim is to provide performance portability across accelerators through the abstraction (not hiding!) of the underlying levels of parallelism.",
+  "title": "alpaka: Abstraction Library for Parallel Kernel Acceleration",
+  "description": "The alpaka library is a header-only C++14 abstraction library for accelerator development. Its aim is to provide performance portability across accelerators through the abstraction (not hiding!) of the underlying levels of parallelism.",
   "creators": [
     {
-      "affiliation": "LogMeIn, Inc.",
+      "affiliation": "Helmholtz-Zentrum Dresden-Rossendorf, TU Dresden, LogMeIn Inc.",
       "name": "Worpitz, Benjamin"
     },
     {
-      "affiliation": "Helmholtz-Zentrum Dresden-Rossendorf, TU Dresden",
-      "name": "Matthes, Alexander",
-      "orcid": "0000-0002-6702-2015"
+      "affiliation": "Helmholtz-Zentrum Dresden-Rossendorf",
+      "name": "Widera, René",
+      "orcid": "0000-0003-1642-0459"
     },
     {
-      "affiliation": "LogMeIn, Inc.",
-      "name": "Zenker, Erik",
-      "orcid": "0000-0001-9417-8712"
+      "name": "Bastrakov, Sergei",
+      "affiliation": "Helmholtz-Zentrum Dresden-Rossendorf",
+      "orcid": "0000-0003-3396-6154"
+    },
+    {
+      "name": "Colgrove, Mat",
+      "affiliation": "NVIDIA"
+    },
+    {
+      "name": "Ehrig, Simeon",
+      "affiliation": "Helmholtz-Zentrum Dresden-Rossendorf",
+      "orcid": "0000-0002-8218-3116"
+    },
+    {
+      "name": "Gruber, Bernhard Manfred",
+      "affiliation": "CASUS, Helmholtz-Zentrum Dresden-Rossendorf, CERN",
+      "orcid": "0000-0001-7848-1690"
+    },
+    {
+      "name": "Kelling, Jeffrey",
+      "affiliation": "Helmholtz-Zentrum Dresden-Rossendorf",
+      "orcid": "0000-0003-1761-2591"
+    },
+    {
+      "name": "Krude, Jakob",
+      "affiliation": "Helmholtz-Zentrum Dresden-Rossendorf"
+    },
+    {
+      "affiliation": "CASUS, Helmholtz-Zentrum Dresden-Rossendorf",
+      "name": "Stephan, Jan",
+      "orcid": "0000-0001-7839-4386"
+    },
+    {
+      "name": "Gehrke, Valentin",
+      "affiliation": "TU Dresden"
     },
     {
       "affiliation": "Helmholtz-Zentrum Dresden-Rossendorf, TU Dresden",
       "name": "Huebl, Axel",
       "orcid": "0000-0003-1943-7141"
     },
+    {
+      "name": "Knespel, Maximilian",
+      "affiliation": "Helmholtz-Zentrum Dresden-Rossendorf"
+    },
+    {
+      "affiliation": "Helmholtz-Zentrum Dresden-Rossendorf, TU Dresden",
+      "name": "Matthes, Alexander",
+      "orcid": "0000-0002-6702-2015"
+    },
+    {
+      "name": "Mewes, Hauke",
+      "affiliation": "Helmholtz-Zentrum Dresden-Rossendorf"
+    },
+    {
+      "name": "Nash, Phil"
+    },
+    {
+      "name": "Saito, Mutsuo"
+    },
+    {
+      "name": "Schenke, Jonas",
+      "affiliation": "Helmholtz-Zentrum Dresden-Rossendorf"
+    },
+    {
+      "name": "Vollmer, Daniel",
+      "affiliation": "Deutsches Zentrum für Luft- und Raumfahrt e.V."
+    },
+    {
+      "name": "Werner, Matthias",
+      "affiliation": "Helmholtz-Zentrum Dresden-Rossendorf"
+    },
+    {
+      "name":"Wesarg, Bert",
+      "affiliation":"TU Dresden"
+    },
+    {
+      "name": "Zacharias, Malte",
+      "affiliation": "Helmholtz-Zentrum Dresden-Rossendorf"
+    },
     {
       "affiliation": "Helmholtz-Zentrum Dresden-Rossendorf",
-      "name": "Widera, René",
-      "orcid": "0000-0003-1642-0459"
+      "name": "Zenker, Erik",
+      "orcid": "0000-0001-9417-8712"
     }
   ],
   "access_right": "open",
diff --git a/thirdParty/cupla/alpaka/CHANGELOG.md b/thirdParty/cupla/alpaka/CHANGELOG.md
new file mode 100644
index 0000000000..8289296f27
--- /dev/null
+++ b/thirdParty/cupla/alpaka/CHANGELOG.md
@@ -0,0 +1,329 @@
+# Changelog
+All notable changes to this project will be documented in this file.
+
+The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
+
+
+## [0.6.0] - 2021-01-20
+### Compatibility Changes:
+- support for CUDA 11, 11.1, and 11.2 #1076 #1086 #1147 #1231
+- remove support for CUDA 11.0 with MSVC 2019 #1227
+- support for CMake 3.18.0 and 3.19.0 #1087 #1217
+- set minimal HIP version to 3.5 #1110
+- remove CMake HIP module shipped with alpaka #1189
+- set HIP-clang as default compiler for HIP #1113
+- support for NVCC + VS 2019 #1121
+- support for boost-1.74.0 #1142
+- explicitly require backends and do not enable them by default #1111
+- remove support for Xcode 11.1 #1206 
+- support Xcode 11.21 - 12.2.0 #1206
+- update to Catch 2.13.3 #1215
+
+### Bug Fixes:
+- apply some clang-tidy fixes #1044
+- fix CUDA/HIP accelerator concept usage #1064
+- fix Intel compiler detection #1070
+- CMake: build type CXX flag not passed to nvcc #1073
+- work around Intel ICE (Internal Compiler Error) when using std::decay on empty template parameter packs #1074
+- BoostPredef.hpp: Add redefinition of BOOST_COMP_PGI #1082
+- fix min/max return type deduction #1085
+- CMake: fix boost fiber linking #1088
+- fix HIP-clang compile #1107
+- fix CUDA/HIP cmake flags #1152
+- fix error handling CUDA/HIP #1108
+- ALPAKA_DECAY_T: Fix Intel detection, Add PGI #1116
+- fix how to set HIP target architecture #1112 
+- fix and improve block shared mem st member sanity checks #1128
+- HIP: remove copy device2device workaround #1188
+- pass native pointers to kernel instead of buffer objects #1193
+- fix bug in `isPinned()` and `pin()` #1196
+- fix marking of unit tests for concepts #1226
+
+### New Features:
+- add functions `alpaka::atomicAnd` et al. as shortcuts to `alpaka::atomicOp<alpaka::AtomicAnd>` et al. #1005
+- warp voting functions #1003 #1049 #1090 #1092
+- Sphinx Doc: Fix Doxygen integration on readthedocs #1042 #1093 #1151
+- add cheat sheet to the docs #1057 #1177
+- extend AccDevProps with shared memory size per block #1084
+- OpenMP 5 target offload backend #1126
+- OpenACC backend #1127
+- option to set OpenMP schedule for the Omp2Blocks backend #1223
+
+### Misc
+- tests for BufferSlicing #1024
+- use std::invoke_result_t instead of std::result_of_t when available #1047
+- simplify shared memory usage in tests #1075 
+- remove boost::aligned_alloc #1094
+- add unit tests for work div #1095
+- change examples (except reduce) to use getValidWorkDiv #1104
+- example monte-carlo-integration #1106 
+- invoke docker run only once instead of twice #1109
+- cpu/SysInfo.hpp: Add #else for cpuid; Add PGI #1119
+- Pgi std atomic workaround #1120
+- make BlockSharedMemDynMember::staticAllocBytes a function #1118
+- add IntrinsicFallback: basic fallback implementations #1122
+- allow ALPAKA_CXX_STANDARD to propagate to nvcc with MSVC 1920 and above #1130
+- add set kernel #1132
+- make Queue test generic to handle QueueGenericThreads* with different devices #1133
+- IdxBtOmp: Add GetIdx specialization for 1d #1140
+- test CMAKE_CXX_EXTENSIONS=OFF #1153
+- change block memory size back to be stored as 32 bit #1187
+- add comments to math function traits that explain valid argument range #1190
+- provide docker_retry #1191
+- add .clang-format file #1204
+- add CI check whether code is correctly formatted #1213
+- make test/common a CMake INTERFACE library #1228
+
+### Breaking changes:
+
+The namespace structure of *alpaka* is now flattened. 
+The [script](https://gist.github.com/sliwowitz/0a55e1bed6350f7fcae17ef0d430040d) can help you to apply the changes to your code.
+The script only works if you used the full namespace `alpaka::*` for alpaka functions.
+
+- removed namespace `alpaka::dev`
+- removed namespace `alpaka::pltf`
+- renamed function `alpaka::vec::cast` to `alpaka::castVec`
+- renamed function `alpaka::vec::reverse` to `alpaka::reverseVec`
+- renamed function `alpaka::vec::concat` to `alpaka::concatVec`
+- removed namespace `alpaka::vec`
+- removed namespace `alpaka::workdiv`
+- removed namespace `alpaka::acc`
+- renamed functors `alpaka::atomic::op::And` et al. to `alpaka::AtomicAnd` et al. #1185
+- removed namespace `alpaka::atomic::op`
+- removed namespace `alpaka::atomic`
+- removed namespace `alpaka::queue`
+- removed namespace `alpaka::idx`
+- removed namespace `alpaka::dim`
+- removed namespace `alpaka::kernel`
+- removed namespace `alpaka::wait`
+- removed namespace `alpaka::mem`
+- removed namespace `alpaka::offset`
+- removed namespace `alpaka::elem`
+- removed namespace `alpaka::intrinsic`
+- renamed function `alpaka::event::test` to `alpaka::isComplete`
+- removed namespace `alpaka::event`
+- removed namespace `alpaka::time`
+- removed namespace `alpaka::example`
+- renamed function `alpaka::alloc::alloc` to `alpaka::malloc`
+- renamed function `alpaka::buf::alloc` to `alpaka::allocBuf`
+- removed namespace `alpaka::alloc`
+- removed namespace `alpaka::buf`
+- renamed function `alpaka::view::set` to `alpaka::memset`
+- renamed function `alpaka::view::copy` to `alpaka::memcpy`
+- removed namespace `alpaka::view`
+- removed namespace `alpaka::block::shared::st`
+- removed namespace `alpaka::block::shared::dyn`
+- removed namespace `alpaka::block::sync`
+- renamed function `getMem` to `getDynSharedMem` #1197
+- renamed function `getVar` to `declareSharedVar` #1197
+- renamed function `freeMem` to `freeSharedVars` #1197
+- renamed functors `alpaka::block::op::LogicalAnd` et al. to `alpaka::BlockAnd` et al.
+- removed namespace `alpaka::block::op`
+- removed namespace `alpaka::block`
+
+
+## [0.5.0] - 2020-06-26
+### Compatibility Changes:
+- the minimum required C++ version has been raised from C++11 to C++14 #900
+- drop support for CUDA 8.0 (does not support c++14)
+- drop support for gcc 4.9 (does not support c++14)
+- drop support for CMake versions lower than 3.15 (3.11, 3.12, 3.13 and 3.14)
+- raise minimum supported boost version from 1.62.0 to 1.65.1 #906
+- require HIP version to 3.3.0 #1006
+- drop HIP-hcc support #945
+
+### Bug Fixes:
+- fix CMake error #941
+- fix HIP math includes #947
+- fix: missing hipRand and rocRand library #948
+- fix VS 2017 CUDA builds #953
+- fix uninitialized pitch #963
+- fix windows CI builds #965
+- fix conversion warning in TinyMT #997
+
+### New Features:
+- add automated gh-pages deployment for branch develop #916
+- unify CUDA/HIP backend #928 #904 #950 #980 #981
+- add support for Visual Studio 2019 #949
+- simplify vector operator construction #977
+- example heat-equation #978
+- extend supported compiler combinations gcc-8+nvcc 10.1-10.2 #985
+- add support for CMake 3.17 #988
+- adds initial files for sphinx/rst and readthedocs. #990 #1017 #1048
+- add support for clang 10 #998
+- add popcount intrinsic #1004
+- emulate hip/cuda-Memcpy3D with a kernel #1014
+- simplify alpaka usage #1017
+
+
+## [0.4.0] - 2020-01-14
+### Compatibility Changes:
+- added support for CUDA 10.0, 10.1 and 10.2
+- dropped support for CUDA 7.0 and 7.5
+- added official support for Visual Studio 2017 on Windows with CUDA 10 (built on Travis CI instead of appveyor now)
+- added support for xcode10.2-11.3 (no official CUDA support yet)
+- added support for Ubuntu 18.04
+- added support for gcc 9
+- added support for clang 7.0, 8.0 and 9.0
+- dropped support for clang 3.5, 3.6, 3.7, 3.8 and 3.9
+- added support for CMake 3.13, 3.14, 3.15 and 3.16
+- dropped support for CMake 3.11.3 and lower, 3.11.4 is the lowest supported version
+- added support for Boost 1.69, 1.70 and 1.71
+- added support for usage of libc++ instead of libstdc++ for clang builds
+- removed dependency to Boost.MPL and BOOST_CURRENT_FUNCTION
+- replaced Boost.Test with Catch2 using an internal version of Catch2 by default but allowing the use of an external one
+
+### Bug Fixes:
+- fixed some incorrect host/device function attributes
+- fixed warning about comparison unsigned < 0
+- There is no need to disable all other backends manually when using ALPAKA_ACC_GPU_CUDA_ONLY_MODE anymore
+- fixed static block shared memory of types with alignment higher than defaultAlignment
+- fixed race-condition in HIP/NVCC queue
+- fixed data races when a GPU updates host memory by aligning host memory buffers always to 4kib
+
+### New Features:
+- Added a new alpaka Logo!
+- the whole alpaka code has been relicensed to MPL2 and the examples to ISC
+- added ALPAKA_CXX_STANDARD CMake option which allows selecting the C++ standard to be used
+- added ALPAKA_CUDA_NVCC_SEPARABLE_COMPILATION option to enable separable compilation for nvcc
+- added ALPAKA_CUDA_NVCC_EXPT_EXTENDED_LAMBDA and ALPAKA_CUDA_NVCC_EXPT_RELAXED_CONSTEXPR CMake options to enable/disable those nvcc options (they were always ON before)
+- added headers for standalone usage without CMake (alpaka/standalone/GpuCudaRt.h, ...) which set the backend defines
+- added experimental HIP back-end using nvcc (HIP >= 1.5.1 required, latest rocRand). More on HIP setup: doc/markdown/user/implementation/mapping/HIP.md
+- added sincos math function implementations
+- allowed to copy and move construct ViewPlainPtr
+- added support for CUDA atomics using "unsigned long int"
+- added compile-time error for atomic CUDA ops which are not available due to sm restrictions
+- added explicit errors for unsupported types/operations for CUDA atomics
+- replaced usages of assert with ALPAKA_ASSERT
+- replaced BOOST_VERIFY by ALPAKA_CHECK and returned success from all test kernels
+- added alpaka::ignore_unused as replacement for boost::ignore_unused
+
+### Breaking changes:
+- renamed Queue*Async to Queue*NonBlocking and Queue*Sync to Queue*Blocking
+- renamed alpaka::size::Size to alpaka::idx::Idx, alpaka::size::SizeType to alpaka::idx::IdxType (and TSize to TIdx internally)
+- replaced ALPAKA_FN_ACC_NO_CUDA by ALPAKA_FN_HOST
+- replaced ALPAKA_FN_ACC_CUDA_ONLY by direct usage of __device__
+- renamed ALPAKA_STATIC_DEV_MEM_CONSTANT to ALPAKA_STATIC_ACC_MEM_CONSTANT and ALPAKA_STATIC_DEV_MEM_GLOBAL to ALPAKA_STATIC_ACC_MEM_GLOBAL
+- renamed alpaka::kernel::createTaskExec to alpaka::kernel::createTaskKernel
+- QueueCpuSync now correctly blocks when called from multiple threads
+  - This broke some previous use-cases (e.g. usage within existing OpenMP parallel regions)
+  - This use case can now be handled with the support for external CPU queues as can be seen in the example QueueCpuOmp2CollectiveImpl
+- previously it was possible to have kernels return values even though they were always ignored. Now kernels are checked to always return void
+- renamed all files with *Stl suffix to *StdLib
+- renamed BOOST_ARCH_CUDA_DEVICE to BOOST_ARCH_PTX
+- executors have been renamed due to the upcoming standard C++ feature with a different meaning. All files within alpaka/exec/ have been moved to alpaka/kernel/ and the files and classes have been renamed from Exec* to TaskKernel*. This should not affect users of alpaka but will affect extensions.
+
+## [0.3.6] - 2020-01-06
+### Bug Fixes:
+- fix cuda stream race condition #850
+- fix: cuda exceptions #844
+- math/abs: Added trait specialisation for double. #862
+- alpaka/math Overloaded float specialization #837
+- Fixes name conflicts in alpaka math functions. #784
+
+
+## [0.3.5] - 2018-11-18
+### New Features:
+- used OpenMP atomics instead of critical sections
+
+
+## [0.3.4] - 2018-10-17
+### Compatibility Changes:
+- added support for boost-1.68.0
+- added support for CUDA 10
+- support for glibc < 2.18 (fix missing macros)
+- added checks for available OpenMP versions
+
+### Bug Fixes:
+- fixed empty(StreamCpuAsync) returning true even though the last task is still in progress
+- fixed integer overflows in case of int16_t being used as accelerator index type
+- made some throwing destructors not throwing to support clang 7
+- fixed broken alpaka::math::min for non-integral types
+
+### New Features:
+- added prepareForAsyncCopy which can be called to enable async copies for a specific buffer (if it is supported)
+- allowed to run alpaka OpenMP 2 block accelerated kernels within existing parallel region
+- added alpaka::ignore_unused which can be used in kernels
+
+
+## [0.3.3] - 2018-08-10
+### New Features:
+- added CPU random number generators based on std::random_device and TinyMT32
+- made TinyMT32 the default random number generator
+- added alpaka::ignore_unused
+
+
+## [0.3.2] - 2018-10-17
+### New Features:
+- Enhanced the compiler compatibility checks within the CMake scripts
+
+### Bugs Fixed:
+- fixed missing error in case of wrong OpenMP thread count being used by the runtime that was not triggered when not in debug mode
+- fixed CUDA driver API error handling
+- fixed CUDA memcpy and memset for zero sized buffers (division by zero)
+- fixed OpenMP 4 execution
+- fixed the VS2017 CUDA build (not officially supported)
+- fixed CUDA callback execution not waiting for the task to finish executing
+- fixed cudaOnly test being part of make test when cuda only mode is not enabled
+
+### Compatibility Changes:
+- added support for CUDA 9.2
+
+
+## [0.3.1] - 2018-06-11
+### New Features:
+- CMake: added option to control tests BUILD_TESTING
+- CMake: unified requirement of CMake 3.7.0+
+- CMake: used targets for Boost dependencies
+- CMake: made alpaka a pure interface library
+
+### Bugs Fixed:
+- fixed getDevCount documentation
+- fixed undefined define warnings
+- fixed self containing header check for CUDA
+
+
+## [0.3.0] - 2018-03-15
+### Bugs Fixed:
+- fixed multiple bugs where CPU streams/events could deadlock or behaved differently than the native CUDA events
+- fixed a bug where the block synchronization of the Boost.Fiber backend crashed due to uninitialized variables
+
+### New Features / Enhancements:
+- added support for stream callbacks allowing to enqueue arbitrary host code using alpaka::stream::enqueue(stream, [&](){...});
+- added support for compiling for multiple architectures using e.g. ALPAKA_CUDA_ARCH="20;35"
+- added support for using __host__ constexpr code within __device__ code
+- enhanced the CUDA error handling
+- enhanced the documentation for mapping CUDA to alpaka
+
+### Compatibility Changes:
+- added support for CUDA 9.0 and 9.1
+- added support for CMake 3.9 and 3.10
+- removed support for CMake 3.6 and older
+- added support for boost-1.65.0
+- removed support for boost-1.61.0 and older
+- added support for gcc 7
+- added support for clang 4 and 5
+- removed support for VS2015
+
+
+## [0.2.0] - 2017-06-19
+### Compatibility fixes and small enhancements:
+- the documentation has been greatly enhanced
+- adds support for CUDA 8.0
+- adds support for CMake versions 3.6, 3.7 and 3.8
+- adds support for Boost 1.62, 1.63 and 1.64
+- adds support for clang-3.9
+- adds support for Visual Studio 2017
+- alpaka now compiles clean even with clang -Weverything
+- re-enabled the boost::fiber accelerator backend which was disabled in the last release
+
+### API changes:
+- mapIdx is moved from namespace alpaka::core to alpaka::idx
+- Vec is moved from namespace alpaka to alpaka::vec
+- vec::Vec is now allowed to be zero-dimensional (was previously forbidden)
+- added vec::concat
+- added element-wise operator< for vec::Vec which returns a vector of bool
+- CPU accelerators now support arbitrary dimensionality (both kernel execution as well as memory operations)
+- added support for syncBlockThreadsPredicate with block::sync::op::LogicalOr, block::sync::op::LogicalAnd and block::sync::op::Count
+- memory allocations are now aligned optimally for the underlying architecture (16 bytes for SSE, 32 bytes for AVX, 64 bytes for AVX512) instead of 16 bytes for all architectures in the previous release
+- 
\ No newline at end of file
diff --git a/thirdParty/cupla/alpaka/CMakeLists.txt b/thirdParty/cupla/alpaka/CMakeLists.txt
index 0f7a6c59b5..ece2ec708e 100644
--- a/thirdParty/cupla/alpaka/CMakeLists.txt
+++ b/thirdParty/cupla/alpaka/CMakeLists.txt
@@ -1,5 +1,5 @@
 #
-# Copyright 2015-2019 Benjamin Worpitz
+# Copyright 2015-2020 Benjamin Worpitz, Jan Stephan
 #
 # This file is part of alpaka.
 #
@@ -11,29 +11,131 @@
 ################################################################################
 # Required CMake version
 
-cmake_minimum_required(VERSION 3.11.4)
+cmake_minimum_required(VERSION 3.15)
 
-project("alpakaAll")
+cmake_policy(SET CMP0091 OLD)
 
-SET_PROPERTY(GLOBAL PROPERTY USE_FOLDERS ON)
+include(CMakePrintHelpers)
 
-################################################################################
-# CMake policies
-#
-# Search in <PackageName>_ROOT:
-#   https://cmake.org/cmake/help/v3.12/policy/CMP0074.html
+#-------------------------------------------------------------------------------
+# Find alpaka version.
+file(STRINGS "${CMAKE_CURRENT_LIST_DIR}/include/alpaka/version.hpp" ALPAKA_VERSION_MAJOR_HPP REGEX "#define ALPAKA_VERSION_MAJOR ")
+file(STRINGS "${CMAKE_CURRENT_LIST_DIR}/include/alpaka/version.hpp" ALPAKA_VERSION_MINOR_HPP REGEX "#define ALPAKA_VERSION_MINOR ")
+file(STRINGS "${CMAKE_CURRENT_LIST_DIR}/include/alpaka/version.hpp" ALPAKA_VERSION_PATCH_HPP REGEX "#define ALPAKA_VERSION_PATCH ")
 
-if(POLICY CMP0074)
-    cmake_policy(SET CMP0074 NEW)
-endif()
+string(REGEX MATCH "([0-9]+)" ALPAKA_VERSION_MAJOR  ${ALPAKA_VERSION_MAJOR_HPP})
+string(REGEX MATCH "([0-9]+)" ALPAKA_VERSION_MINOR  ${ALPAKA_VERSION_MINOR_HPP})
+string(REGEX MATCH "([0-9]+)" ALPAKA_VERSION_PATCH  ${ALPAKA_VERSION_PATCH_HPP})
+
+set(PACKAGE_VERSION "${ALPAKA_VERSION_MAJOR}.${ALPAKA_VERSION_MINOR}.${ALPAKA_VERSION_PATCH}")
+
+project(alpaka VERSION      ${ALPAKA_VERSION_MAJOR}.${ALPAKA_VERSION_MINOR}.${ALPAKA_VERSION_PATCH}
+               DESCRIPTION  "The alpaka library is a header-only C++14 abstraction library for accelerator development."
+               HOMEPAGE_URL "https://github.com/alpaka-group/alpaka"
+               LANGUAGES    CXX)
+
+set_property(GLOBAL PROPERTY USE_FOLDERS ON)
 
 ################################################################################
 # Options and Variants
 
-option(alpaka_BUILD_EXAMPLES "Build the examples" ON)
+option(alpaka_BUILD_EXAMPLES "Build the examples" OFF)
 
+option(BUILD_TESTING "Build the testing tree." OFF)
 include(CTest)
-# automatically defines: BUILD_TESTING, default is ON
+
+################################################################################
+# Internal variables.
+
+# Set found to true initially and set it to false if a required dependency is missing.
+set(_ALPAKA_FOUND TRUE)
+
+# This file's directory.
+set(_ALPAKA_ROOT_DIR ${CMAKE_CURRENT_LIST_DIR})
+# Normalize the path (e.g. remove ../)
+get_filename_component(_ALPAKA_ROOT_DIR ${_ALPAKA_ROOT_DIR} ABSOLUTE)
+
+# Add common functions.
+set(_ALPAKA_COMMON_FILE "${_ALPAKA_ROOT_DIR}/cmake/common.cmake")
+include(${_ALPAKA_COMMON_FILE})
+
+# Add ALPAKA_ADD_EXECUTABLE function.
+set(_ALPAKA_ADD_EXECUTABLE_FILE "${_ALPAKA_ROOT_DIR}/cmake/addExecutable.cmake")
+include(${_ALPAKA_ADD_EXECUTABLE_FILE})
+
+# Add ALPAKA_ADD_LIBRARY function.
+set(_ALPAKA_ADD_LIBRARY_FILE "${_ALPAKA_ROOT_DIR}/cmake/addLibrary.cmake")
+include(${_ALPAKA_ADD_LIBRARY_FILE})
+
+# Add module search path
+set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${_ALPAKA_ROOT_DIR}/cmake/modules/")
+
+# Parse the environment variable `CMAKE_MODULE_PATH` (colon-separated) and append its entries to the module search path.
+if(DEFINED ENV{CMAKE_MODULE_PATH})
+    # The expansion is quoted because a defined-but-empty variable would otherwise
+    # leave string(REPLACE) without its mandatory input argument and abort the configure step.
+    string(REPLACE ":" ";" ENV_CMAKE_MODULE_PATH "$ENV{CMAKE_MODULE_PATH}")
+    list(APPEND CMAKE_MODULE_PATH ${ENV_CMAKE_MODULE_PATH})
+endif()
+
+# Set include directories
+set(_ALPAKA_INCLUDE_DIRECTORY "${_ALPAKA_ROOT_DIR}/include")
+set(_ALPAKA_SUFFIXED_INCLUDE_DIR "${_ALPAKA_INCLUDE_DIRECTORY}/alpaka")
+
+include(${_ALPAKA_ROOT_DIR}/cmake/alpakaCommon.cmake)
+
+# Add all the source and include files in all recursive subdirectories and group them accordingly.
+append_recursive_files_add_to_src_group("${_ALPAKA_SUFFIXED_INCLUDE_DIR}" "${_ALPAKA_SUFFIXED_INCLUDE_DIR}" "hpp" _ALPAKA_FILES_HEADER)
+append_recursive_files_add_to_src_group("${_ALPAKA_SUFFIXED_INCLUDE_DIR}" "${_ALPAKA_SUFFIXED_INCLUDE_DIR}" "h" _ALPAKA_FILES_HEADER)
+
+append_recursive_files_add_to_src_group("${_ALPAKA_ROOT_DIR}/script" "${_ALPAKA_ROOT_DIR}" "sh" _ALPAKA_FILES_SCRIPT)
+set_source_files_properties(${_ALPAKA_FILES_SCRIPT} PROPERTIES HEADER_FILE_ONLY TRUE)
+
+append_recursive_files_add_to_src_group("${_ALPAKA_ROOT_DIR}/cmake" "${_ALPAKA_ROOT_DIR}" "cmake" _ALPAKA_FILES_CMAKE)
+list(APPEND _ALPAKA_FILES_CMAKE "${_ALPAKA_ROOT_DIR}/cmake/alpakaConfig.cmake.in" "${_ALPAKA_ROOT_DIR}/CMakeLists.txt")
+set_source_files_properties(${_ALPAKA_FILES_CMAKE} PROPERTIES HEADER_FILE_ONLY TRUE)
+
+append_recursive_files_add_to_src_group("${_ALPAKA_ROOT_DIR}/docs/markdown" "${_ALPAKA_ROOT_DIR}" "md" _ALPAKA_FILES_DOC)
+set_source_files_properties(${_ALPAKA_FILES_DOC} PROPERTIES HEADER_FILE_ONLY TRUE)
+
+append_recursive_files_add_to_src_group("${_ALPAKA_ROOT_DIR}/.github" "${_ALPAKA_ROOT_DIR}" "yml" _ALPAKA_FILES_OTHER)
+list(APPEND _ALPAKA_FILES_OTHER "${_ALPAKA_ROOT_DIR}/.clang-format" "${_ALPAKA_ROOT_DIR}/.gitignore" "${_ALPAKA_ROOT_DIR}/.zenodo.json" "${_ALPAKA_ROOT_DIR}/LICENSE" "${_ALPAKA_ROOT_DIR}/README.md")
+set_source_files_properties(${_ALPAKA_FILES_OTHER} PROPERTIES HEADER_FILE_ONLY TRUE)
+
+if(TARGET alpaka)
+    # HACK: Workaround for the limitation that files added to INTERFACE targets (target_sources) can not be marked as PUBLIC or PRIVATE but only as INTERFACE.
+    # Therefore those files will be added to projects "linking" to the INTERFACE library, but are not added to the project itself within an IDE.
+    add_custom_target("alpakaIde"
+                      SOURCES ${_ALPAKA_FILES_HEADER} ${_ALPAKA_FILES_SCRIPT} ${_ALPAKA_FILES_CMAKE} ${_ALPAKA_FILES_DOC} ${_ALPAKA_FILES_OTHER})
+endif()
+
+################################################################################
+# Export NVCC/HIPCC flags to parent scope if alpaka is used as a CMake
+# subdirectory, i.e. if the top-level project is not alpaka itself. The variable
+# names are passed to if() unexpanded so that their values are compared as-is.
+# These flags are set in cmake/alpakaCommon.cmake but are visible in this scope
+# since alpakaCommon.cmake is included.
+
+if(NOT CMAKE_PROJECT_NAME STREQUAL PROJECT_NAME)
+    if(ALPAKA_ACC_GPU_CUDA_ENABLE)
+        # export NVCC flags to parent scope in case alpaka is another project's subdirectory
+        set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} PARENT_SCOPE)
+        set(CUDA_HOST_COMPILER ${CUDA_HOST_COMPILER} PARENT_SCOPE)
+        set(CUDA_VERBOSE_BUILD ${CUDA_VERBOSE_BUILD} PARENT_SCOPE)
+        set(CUDA_SEPARABLE_COMPILATION ${CUDA_SEPARABLE_COMPILATION} PARENT_SCOPE)
+    endif()
+
+    if(ALPAKA_ACC_GPU_HIP_ENABLE)
+        # export HIPCC flags to parent scope in case alpaka is another project's subdirectory
+        set(HIP_HIPCC_FLAGS ${HIP_HIPCC_FLAGS} PARENT_SCOPE)
+        set(HIP_NVCC_FLAGS ${HIP_NVCC_FLAGS} PARENT_SCOPE)
+        set(HIP_VERBOSE_BUILD ${HIP_VERBOSE_BUILD} PARENT_SCOPE)
+    endif()
+
+    if((ALPAKA_ACC_GPU_CUDA_ENABLE OR ALPAKA_ACC_GPU_HIP_ENABLE) AND ALPAKA_CUDA_COMPILER MATCHES "nvcc")
+        set(CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS} PARENT_SCOPE)
+    endif()
+endif()
 
 ################################################################################
 # Add subdirectories
@@ -44,3 +146,45 @@ endif()
 if(BUILD_TESTING)
     add_subdirectory("test/")
 endif()
+
+################################################################################
+# Installation.
+
+# Do not install if alpaka is used as a CMake subdirectory (variable names are passed to if() unexpanded so their values are compared as-is).
+if(CMAKE_PROJECT_NAME STREQUAL PROJECT_NAME)
+    include(CMakePackageConfigHelpers)
+    include(GNUInstallDirs)
+
+    set(_ALPAKA_INSTALL_CMAKEDIR "${CMAKE_INSTALL_LIBDIR}/cmake/alpaka")
+
+    install(TARGETS alpaka
+            ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
+            LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
+            RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
+    # Write the version file to the same directory it is installed from below.
+    write_basic_package_version_file(
+        "${PROJECT_BINARY_DIR}/alpakaConfigVersion.cmake"
+        VERSION ${PROJECT_VERSION}
+        COMPATIBILITY SameMajorVersion)
+    # Generate the package config file from its template.
+    configure_package_config_file(
+        "${_ALPAKA_ROOT_DIR}/cmake/alpakaConfig.cmake.in"
+        "${PROJECT_BINARY_DIR}/alpakaConfig.cmake"
+        INSTALL_DESTINATION "${_ALPAKA_INSTALL_CMAKEDIR}")
+
+    install(FILES "${PROJECT_BINARY_DIR}/alpakaConfig.cmake"
+                  "${PROJECT_BINARY_DIR}/alpakaConfigVersion.cmake"
+            DESTINATION "${_ALPAKA_INSTALL_CMAKEDIR}")
+
+    install(DIRECTORY "${_ALPAKA_SUFFIXED_INCLUDE_DIR}"
+            DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}")
+
+    install(FILES "${_ALPAKA_ROOT_DIR}/cmake/addExecutable.cmake"
+                  "${_ALPAKA_ROOT_DIR}/cmake/addLibrary.cmake"
+                  "${_ALPAKA_ROOT_DIR}/cmake/alpakaCommon.cmake"
+                  "${_ALPAKA_ROOT_DIR}/cmake/common.cmake"
+            DESTINATION "${_ALPAKA_INSTALL_CMAKEDIR}")
+
+    install(DIRECTORY "${_ALPAKA_ROOT_DIR}/cmake/modules"
+            DESTINATION "${_ALPAKA_INSTALL_CMAKEDIR}")
+endif()
diff --git a/thirdParty/cupla/alpaka/CONTRIBUTING.md b/thirdParty/cupla/alpaka/CONTRIBUTING.md
new file mode 100644
index 0000000000..2a2ea1ce0a
--- /dev/null
+++ b/thirdParty/cupla/alpaka/CONTRIBUTING.md
@@ -0,0 +1,20 @@
+# Contributing
+
+## Formatting
+
+Please format your code before opening pull requests using clang-format 11 and the .clang-format file placed in the repository root.
+
+### Visual Studio and CLion
+Support for clang-format is built-in since Visual Studio 2017 15.7 and CLion 2019.1.
+The .clang-format file in the repository will be automatically detected and formatting is done as you type, or triggered when pressing the format hotkey.
+
+### Bash
+First install clang-format-11. Instructions for this can be found on the web.
+To format your changes since branching off develop, you can run this command in bash:
+```
+git clang-format-11 develop
+```
+To format all code in your working copy, you can run this command in bash:
+```
+find -iname '*.cpp' -o -iname '*.hpp' | xargs clang-format-11 -i
+```
diff --git a/thirdParty/cupla/alpaka/Findalpaka.cmake b/thirdParty/cupla/alpaka/Findalpaka.cmake
deleted file mode 100644
index 94d0187356..0000000000
--- a/thirdParty/cupla/alpaka/Findalpaka.cmake
+++ /dev/null
@@ -1,109 +0,0 @@
-#.rst:
-# Findalpaka
-# ----------
-#
-# Abstraction library for parallel kernel acceleration
-# https://github.com/ComputationalRadiationPhysics/alpaka
-#
-# Finding and Using alpaka
-# ^^^^^^^^^^^^^^^^^^^^^
-#
-# .. code-block:: cmake
-#
-#   FIND_PACKAGE(alpaka
-#     [version] [EXACT]     # Minimum or EXACT version, e.g. 1.0.0
-#     [REQUIRED]            # Fail with an error if alpaka or a required
-#                           # component is not found
-#     [QUIET]               # Do not warn if this module was not found
-#     [COMPONENTS <...>]    # Compiled in components: ignored
-#   )
-#   TARGET_LINK_LIBRARIES(<target> PUBLIC alpaka)
-#
-# To provide a hint to this module where to find the alpaka installation,
-# set the ALPAKA_ROOT variable.
-#
-# This module requires Boost. Make sure to provide a valid install of it
-# under the environment variable BOOST_ROOT.
-#
-# ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE will require Boost.Fiber to be built.
-# ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE and ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE will require a OpenMP 2.0+ capable compiler.
-# ALPAKA_ACC_CPU_BT_OMP4_ENABLE will require a OpenMP 4.0+ capable compiler.
-# ALPAKA_ACC_GPU_CUDA_ENABLE will require CUDA 8.0+ to be installed.
-# ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLE will require TBB 2.2+ to be installed
-#
-# Set the following CMake variables BEFORE calling find_packages to
-# change the behaviour of this module:
-# - ``ALPAKA_ACC_GPU_CUDA_ONLY_MODE`` {ON, OFF}
-# - ``ALPAKA_ACC_GPU_HIP_ONLY_MODE`` {ON, OFF}
-# - ``ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLE`` {ON, OFF}
-# - ``ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLE`` {ON, OFF}
-# - ``ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE`` {ON, OFF}
-# - ``ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLE`` {ON, OFF}
-# - ``ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE`` {ON, OFF}
-# - ``ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE`` {ON, OFF}
-# - ``ALPAKA_ACC_CPU_BT_OMP4_ENABLE`` {ON, OFF}
-# - ``ALPAKA_ACC_GPU_CUDA_ENABLE`` {ON, OFF}
-# - ``ALPAKA_ACC_GPU_HIP_ENABLE`` {ON, OFF}
-# - ``ALPAKA_CUDA_VERSION`` {8.0, ...}
-# - ``ALPAKA_CUDA_ARCH`` {sm_20, sm...}
-# - ``ALPAKA_CUDA_FAST_MATH`` {ON, OFF}
-# - ``ALPAKA_CUDA_FTZ`` {ON, OFF}
-# - ``ALPAKA_CUDA_SHOW_REGISTER`` {ON, OFF}
-# - ``ALPAKA_CUDA_KEEP_FILES`` {ON, OFF}
-# - ``ALPAKA_CUDA_SHOW_CODELINES`` {ON, OFF}
-# - ``ALPAKA_DEBUG`` {0, 1, 2}
-# - ``ALPAKA_CXX_STANDARD`` {11, 14, 17}
-#
-# Result Variables
-# ^^^^^^^^^^^^^^^^
-#
-# - ``alpaka_FOUND``
-#   TRUE if alpaka found a working install.
-# - ``alpaka_VERSION``
-#   Version in format Major.Minor.Patch
-# - ``alpaka_COMPILE_OPTIONS``
-#   Compiler options.
-# - ``alpaka_COMPILE_DEFINITIONS``
-#   Compiler definitions (without "-D" prefix!).
-# - ``alpaka_DEFINITIONS``
-#   Deprecated old compiler definitions. Combination of alpaka_COMPILE_OPTIONS and alpaka_COMPILE_DEFINITIONS prefixed with "-D".
-# - ``alpaka_INCLUDE_DIRS``
-#   Include directories required by the alpaka headers.
-# - ``alpaka_LIBRARIES``
-#   Libraries required to link against to use alpaka.
-#
-#
-# IMPORTED Targets
-# ^^^^^^^^^^^^^^^^
-#
-# This module defines the :prop_tgt:`IMPORTED` target ``alpaka``, if alpaka has
-# been found.
-#
-
-
-################################################################################
-# Copyright 2015-2019 Benjamin Worpitz
-#
-# Permission to use, copy, modify, and/or distribute this software for any
-# purpose with or without fee is hereby granted, provided that the above
-# copyright notice and this permission notice appear in all copies.
-#
-# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
-# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
-# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
-# SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER
-# RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT,
-# NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE
-# USE OR PERFORMANCE OF THIS SOFTWARE.
-
-FIND_PATH(
-    _ALPAKA_ROOT_DIR
-    NAMES "include/alpaka/alpaka.hpp"
-    HINTS "${ALPAKA_ROOT}" ENV ALPAKA_ROOT
-    DOC "alpaka ROOT location")
-
-IF(_ALPAKA_ROOT_DIR)
-    INCLUDE("${_ALPAKA_ROOT_DIR}/alpakaConfig.cmake")
-ELSE()
-    MESSAGE(FATAL_ERROR "alpaka could not be found!")
-ENDIF()
diff --git a/thirdParty/cupla/alpaka/README.md b/thirdParty/cupla/alpaka/README.md
index 8370dac9c4..5f7c9d07c1 100644
--- a/thirdParty/cupla/alpaka/README.md
+++ b/thirdParty/cupla/alpaka/README.md
@@ -1,14 +1,17 @@
 **alpaka** - Abstraction Library for Parallel Kernel Acceleration
 =================================================================
 
-[![Travis CI Build Status](https://travis-ci.org/ComputationalRadiationPhysics/alpaka.svg?branch=develop)](https://travis-ci.org/ComputationalRadiationPhysics/alpaka)
-[![Language](https://img.shields.io/badge/language-C%2B%2B11-orange.svg)](https://isocpp.org/)
-[![Platforms](https://img.shields.io/badge/platform-linux%20%7C%20windows%20%7C%20mac-lightgrey.svg)](https://github.com/ComputationalRadiationPhysics/alpaka)
+
+[![Continuous Integration](https://github.com/alpaka-group/alpaka/workflows/Continuous%20Integration/badge.svg)](https://github.com/alpaka-group/alpaka/actions?query=workflow%3A%22Continuous+Integration%22)
+[![Documentation Status](https://readthedocs.org/projects/alpaka/badge/?version=latest)](https://alpaka.readthedocs.io)
+[![Doxygen](https://img.shields.io/badge/API-Doxygen-blue.svg)](https://alpaka-group.github.io/alpaka)
+[![Language](https://img.shields.io/badge/language-C%2B%2B14-orange.svg)](https://isocpp.org/)
+[![Platforms](https://img.shields.io/badge/platform-linux%20%7C%20windows%20%7C%20mac-lightgrey.svg)](https://github.com/alpaka-group/alpaka)
 [![License](https://img.shields.io/badge/license-MPL--2.0-blue.svg)](https://www.mozilla.org/en-US/MPL/2.0/)
 
-![Alpaka](doc/images/alpaka_401x135.png)
+![alpaka](docs/logo/alpaka_401x135.png)
 
-The **alpaka** library is a header-only C++11 abstraction library for accelerator development.
+The **alpaka** library is a header-only C++14 abstraction library for accelerator development.
 
 Its aim is to provide performance portability across accelerators through the abstraction (not hiding!) of the underlying levels of parallelism.
 
@@ -38,8 +41,9 @@ Software License
 Documentation
 -------------
 
-The [general documentation](doc/markdown/Index.md) is located within the `doc/markdown` subfolder of the repository.
-The [source code documentation](http://computationalradiationphysics.github.io/alpaka/) is generated with [doxygen](http://www.doxygen.org).
+The alpaka documentation can be found in the [online manual](https://alpaka.readthedocs.io).
+The documentation files in [`.rst` (reStructuredText)](https://www.sphinx-doc.org/en/stable/rest.html) format are located in the `docs` subfolder of this repository.
+The [source code documentation](https://alpaka-group.github.io/alpaka/) is generated with [doxygen](http://www.doxygen.org).
 
 
 Accelerator Back-ends
@@ -50,31 +54,34 @@ Accelerator Back-ends
 |Serial|n/a|Host CPU (single core)|sequential|sequential (only 1 thread per block)|
 |OpenMP 2.0+ blocks|OpenMP 2.0+|Host CPU (multi core)|parallel (preemptive multitasking)|sequential (only 1 thread per block)|
 |OpenMP 2.0+ threads|OpenMP 2.0+|Host CPU (multi core)|sequential|parallel (preemptive multitasking)|
-|OpenMP 4.0+ (CPU)|OpenMP 4.0+|Host CPU (multi core)|parallel (undefined)|parallel (preemptive multitasking)|
+|OpenMP 5.0+ |OpenMP 5.0+|Host CPU (multi core)|parallel (undefined)|parallel (preemptive multitasking)|
+| ||GPU|parallel (undefined)|parallel (lock-step within warps)|
+|OpenACC (experimental)|OpenACC 2.0+|Host CPU (multi core)|parallel (undefined)|parallel (preemptive multitasking)|
+|||GPU|parallel (undefined)|parallel (lock-step within warps)|
 | std::thread | std::thread |Host CPU (multi core)|sequential|parallel (preemptive multitasking)|
 | Boost.Fiber | boost::fibers::fiber |Host CPU (single core)|sequential|parallel (cooperative multitasking)|
 |TBB|TBB 2.2+|Host CPU (multi core)|parallel (preemptive multitasking)|sequential (only 1 thread per block)|
-|CUDA|CUDA 8.0-10.2|NVIDIA GPUs|parallel (undefined)|parallel (lock-step within warps)|
-|HIP(nvcc)|[HIP 1.5+](https://github.com/ROCm-Developer-Tools/HIP)|NVIDIA GPUs SM 2.0+|parallel (undefined)|parallel (lock-step within warps)|
+|CUDA|CUDA 9.0+|NVIDIA GPUs|parallel (undefined)|parallel (lock-step within warps)|
+|HIP(clang)|[HIP 3.5+](https://github.com/ROCm-Developer-Tools/HIP)|AMD GPUs |parallel (undefined)|parallel (lock-step within warps)|
 
 
 Supported Compilers
 -------------------
 
-This library uses C++11 (or newer when available).
+This library uses C++14 (or newer when available).
 
-|Accelerator Back-end|gcc 4.9.4 <br/> (Linux)|gcc 5.5 <br/> (Linux)|gcc 6.4/7.3 <br/> (Linux)|gcc 8.1/9.1 <br/> (Linux)|clang 4 <br/> (Linux)|clang 5 <br/> (Linux)|clang 6 <br/> (Linux)|clang 7 <br/> (Linux)|clang 8 <br/> (Linux)|clang 9 <br/> (Linux)|Apple LLVM 10.2-11.2 <br/> (macOS)|MSVC 2017.9 <br/> (Windows)|
-|---|---|---|---|---|---|---|---|---|---|---|---|---|
-|Serial|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|
-|OpenMP 2.0+ blocks|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:x:|:white_check_mark:|
-|OpenMP 2.0+ threads|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:x:|:white_check_mark:|
-|OpenMP 4.0+ (CPU)|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:x:|:x:|
-| std::thread |:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|
-| Boost.Fiber |:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:x:|:white_check_mark:|
-|TBB|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|
-|CUDA (nvcc)|:white_check_mark: <br/> (CUDA 8.0-10.2)|:white_check_mark: <br/> (CUDA 9.0-10.2)|:white_check_mark: <br/> (CUDA 9.2-10.2) |:x:|:white_check_mark: <br/> (CUDA 9.1-10.2)|:white_check_mark: <br/> (CUDA 10.1-10.2)|:white_check_mark: <br/> (CUDA 10.1-10.2)|:white_check_mark: <br/> (CUDA 10.1-10.2)|:white_check_mark: <br/> (CUDA 10.1-10.2)|:x:|:x:|:white_check_mark: <br/> (CUDA 10.0-10.2)|
-|CUDA (clang) | - | - | - | - | :white_check_mark: <br/> (CUDA 8.0)| :white_check_mark: <br/> (CUDA 8.0)| :white_check_mark: <br/> (CUDA 8.0-9.0) | :white_check_mark: <br/> (CUDA 8.0-9.2) | :white_check_mark: <br/> (CUDA 8.0-10.0) | :white_check_mark: <br/> (CUDA 9.2-10.1) | - | - |
-|[HIP](doc/markdown/user/implementation/mapping/HIP.md) (nvcc)|:white_check_mark: <br/> (nvcc 9.0+)|:x:|:x:|:x:|:x:|:x:|:x:|:x:|:x:|:x:|:x:|:x:|
+|Accelerator Back-end|gcc 5.5 <br/> (Linux)|gcc 6.4/7.3 <br/> (Linux)|gcc 8.1 <br/> (Linux)|gcc 9.1 <br/> (Linux)|gcc 10.1 <br/> (Linux)|clang 4 <br/> (Linux)|clang 5 <br/> (Linux)|clang 6 <br/> (Linux)|clang 7 <br/> (Linux)|clang 8 <br/> (Linux)|clang 9 <br/> (Linux)|clang 10 <br/> (Linux)|Apple LLVM 11.2.1-12.2.0 <br/> (macOS)|MSVC 2017 <br/> (Windows)|MSVC 2019 <br/> (Windows)|
+|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
+|Serial|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|
+|OpenMP 2.0+ blocks|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:x:|:white_check_mark:|:white_check_mark:|
+|OpenMP 2.0+ threads|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:x:|:white_check_mark:|:white_check_mark:|
+|OpenMP 4.0+ (CPU)|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:x:|:x:|:x:|
+| std::thread |:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|
+| Boost.Fiber |:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:x:|:white_check_mark:|:white_check_mark:|
+|TBB|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:x:|:x:|
+|CUDA (nvcc)|:white_check_mark: <br/> (CUDA 9.0-11.1)|:white_check_mark: <br/> (CUDA 9.2-11.1) |:white_check_mark: <br/> (CUDA 10.1-11.1) |:white_check_mark: <br/> (CUDA 11.0-11.1)|:white_check_mark: <br/> (CUDA 11.1)|:white_check_mark: <br/> (CUDA 9.1-11.1)|:white_check_mark: <br/> (CUDA 10.1-11.1)|:white_check_mark: <br/> (CUDA 10.1-11.1)|:white_check_mark: <br/> (CUDA 10.1-11.1)|:white_check_mark: <br/> (CUDA 10.1-11.1)|:white_check_mark: <br/> (CUDA 11.0-11.1)|:white_check_mark: <br/> (CUDA 11.1)|:x:|:white_check_mark: <br/> (CUDA 10.0-11.1)|:white_check_mark: <br/> (CUDA 10.1-10.2 + 11.1)|
+|CUDA (clang) | - | - | - | - | - | - | - | :white_check_mark: <br/> (CUDA 9.0) | :white_check_mark: <br/> (CUDA 9.0-9.2) | :white_check_mark: <br/> (CUDA 9.0-10.0) | :white_check_mark: <br/> (CUDA 9.2-10.1) | :white_check_mark: <br/> (CUDA 9.2-10.1) | - | - | - |
+|[HIP](https://alpaka.readthedocs.io/en/latest/install/HIP.html) (clang)|:white_check_mark: |:x:|:x:|:x:|:x:|:x:|:x:|:x:|:x:|:x:|:x:|:x:|:x:|:x:|
 
 
 Other compilers or combinations marked with :x: in the table above may work but are not tested in CI and are therefore not explicitly supported.
@@ -82,16 +89,16 @@ Other compilers or combinations marked with :x: in the table above may work but
 Dependencies
 ------------
 
-[Boost](https://boost.org/) 1.62+ is the only mandatory external dependency (for CUDA 9+ Boost >=1.65.1 is required).
+[Boost](https://boost.org/) 1.65.1+ is the only mandatory external dependency.
 The **alpaka** library itself just requires header-only libraries.
 However some of the accelerator back-end implementations require different boost libraries to be built.
 
-When an accelerator back-end using *Boost.Fiber* is enabled, `boost-fiber` and all of its dependencies are required to be built in C++11 mode `./b2 cxxflags="-std=c++11"`.
+When an accelerator back-end using *Boost.Fiber* is enabled, `boost-fiber` and all of its dependencies are required to be built in C++14 mode `./b2 cxxflags="-std=c++14"`.
 When *Boost.Fiber* is enabled and alpaka is built in C++17 mode with clang and libstc++, Boost >= 1.67.0 is required.
 
-When an accelerator back-end using *CUDA* is enabled, version *8.0* of the *CUDA SDK* is the minimum requirement.
+When an accelerator back-end using *CUDA* is enabled, version *9.0* of the *CUDA SDK* is the minimum requirement.
 *NOTE*: When using nvcc as *CUDA* compiler, the *CUDA accelerator back-end* can not be enabled together with the *Boost.Fiber accelerator back-end* due to bugs in the nvcc compiler.
-*NOTE*: When using clang as a native *CUDA* compiler, the *CUDA accelerator back-end* can not be enabled together with any *OpenMP accelerator back-end* because this combination is currently unsupported.
+*NOTE*: When using clang as a native *CUDA* compiler, the *CUDA accelerator back-end* can not be enabled together with the *Boost.Fiber accelerator back-end* or any *OpenMP accelerator back-end* because this combination is currently unsupported.
 *NOTE*: Separable compilation is only supported when using nvcc, not with clang as native *CUDA* compiler. It is disabled by default and can be enabled via the CMake flag `ALPAKA_CUDA_NVCC_SEPARABLE_COMPILATION`.
 
 When an accelerator back-end using *OpenMP* is enabled, the compiler and the platform have to support the corresponding minimum *OpenMP* version.
@@ -103,7 +110,7 @@ Usage
 -----
 
 The library is header only so nothing has to be built.
-CMake 3.11.4+ is required to provide the correct defines and include paths.
+CMake 3.15+ is required to provide the correct defines and include paths.
 Just call `ALPAKA_ADD_EXECUTABLE` instead of `CUDA_ADD_EXECUTABLE` or `ADD_EXECUTABLE` and the difficulties of the CUDA nvcc compiler in handling `.cu` and `.cpp` files are automatically taken care of.
 Source files do not need any special file ending.
 Examples of how to utilize alpaka within CMake can be found in the `example` folder.
@@ -111,6 +118,8 @@ Examples of how to utilize alpaka within CMake can be found in the `example` fol
 The whole alpaka library can be included with: `#include <alpaka/alpaka.hpp>`
 Code that is not intended to be utilized by the user is hidden in the `detail` namespace.
 
+Furthermore, for a CUDA-like experience when adopting alpaka we provide the library [*cupla*](https://github.com/alpaka-group/cupla).
+It enables a simple and straightforward way of porting existing CUDA applications to alpaka and thus to a variety of accelerators.
 
 Introduction
 ------------
@@ -194,17 +203,39 @@ consider citing us accordingly in your derived work and publications:
 }
 ```
 
+Contributing
+------------
+
+Rules for contributions can be found in [CONTRIBUTING.md](CONTRIBUTING.md).
 
 Authors
 -------
 
-### Maintainers and Core Developers
+### Maintainers* and Core Developers
 
-- Benjamin Worpitz (original author)
-- Rene Widera
+- Benjamin Worpitz* (original author)
+- Dr. Sergei Bastrakov*
+- Simeon Ehrig
+- Bernhard Manfred Gruber
+- Dr. Axel Huebl*
+- Dr. Jeffrey Kelling
+- Jakob Krude
+- Jan Stephan*
+- Rene Widera*
 
 ### Former Members, Contributions and Thanks
 
 - Dr. Michael Bussmann
-- Axel Huebl
+- Mat Colgrove
+- Valentin Gehrke
+- Maximilian Knespel
+- Alexander Matthes
+- Hauke Mewes
+- Phil Nash
+- Mutsuo Saito
+- Jonas Schenke
+- Daniel Vollmer
+- Matthias Werner
+- Bert Wesarg
+- Malte Zacharias
 - Erik Zenker
diff --git a/thirdParty/cupla/alpaka/README_OACC.md b/thirdParty/cupla/alpaka/README_OACC.md
new file mode 100644
index 0000000000..ac1cb2ac6a
--- /dev/null
+++ b/thirdParty/cupla/alpaka/README_OACC.md
@@ -0,0 +1,89 @@
+# Building and configuring for the OpenACC backend
+
+## CMake Basics
+
+Alpaka requires cmake version >= 3.15.
+
+In the root of the alpaka dir, run:
+```bash
+mkdir build
+cd build
+```
+
+## Configuring Using CMake
+
+In the build directory, invoke cmake to configure. Use the options below to
+enable only the OpenACC backend.
+
+```bash
+cmake .. \
+  -DALPAKA_ACC_ANY_BT_OACC_ENABLE=on \
+  -DBUILD_TESTING=on \
+  -Dalpaka_BUILD_EXAMPLES=on \
+```
+All other backends are disabled for faster compilation/testing and reduced
+environment requirements. The cmake package OpenACC is used to detect the
+required OpenACC flags for the compiler. Additional flags can be added, e.g:
+- gcc, target x86:
+  ```bash
+    -DCMAKE_CXX_FLAGS="-foffload=disable"
+  ```
+  As of gcc 9.2 no test will compile if the nvptx backend is enabled. If cmake
+  fails to set the `-fopenacc` flag, it can be set manually.
+- pgi/nvhpc, target tesla (set `$CC`, `$CXX` and `$CUDA_HOME` to appropriate values
+  for your system to use pgi):
+  ```bash
+    -DCMAKE_CXX_FLAGS="-ta=tesla -Minfo"
+  ```
+
+## Limitations
+
+* *No separable compilation*. OpenACC requires functions for which device code
+  should be generated for a not-inlined call in a target region to be marked with
+  pragmas. This cannot be wrapped by macros like `ALPAKA_FN_DEVICE` because they
+  appear between template parameter list and function name.
+  <https://github.com/alpaka-group/alpaka/pull/1126#discussion_r479761867>
+
+## Test targets
+
+### helloWorld
+
+```bash
+make helloWorld
+./examples/helloWorld/helloWorld
+```
+The output should end with something like
+```
+[z:3, y:7, x:15][linear:511] Hello World
+```
+Numbers can vary when teams are executed in parallel: 512 teams, with one worker
+each are started in a 3d grid. Each worker reports its grid coordinates and linear
+index.
+
+|compiler|compile status|target|run status|
+|---|---|---|---|
+|GCC 10| ok|x86|ok|
+|NVHPC 20.7| ok|tesla|ok|
+
+### vectorAdd
+
+```bash
+make vectorAdd
+./examples/vectorAdd/vectorAdd
+```
+The output should end with
+```
+Execution results correct!
+```
+
+|compiler|compile status|target|run status|
+|---|---|---|---|
+|GCC 10(dev)| ok|x86|ok|
+|NVHPC 20.7| ok|tesla|ok|
+
+## Building and Running all tests
+
+```bash
+make
+ctest
+```
diff --git a/thirdParty/cupla/alpaka/README_OMP5.md b/thirdParty/cupla/alpaka/README_OMP5.md
new file mode 100644
index 0000000000..c3fe9b8d06
--- /dev/null
+++ b/thirdParty/cupla/alpaka/README_OMP5.md
@@ -0,0 +1,162 @@
+# Building and configuring for the OMP5 backend
+
+To make the build system enable the OpenMP5 backend, one has to tell CMake
+explicitly about the OpenMP version supported by the compiler. CMake does not
+determine it automatically for some compilers.
+```
+cmake -DOpenMP_CXX_VERSION=5 \
+  -DALPAKA_ACC_ANY_BT_OMP5_ENABLE=on \
+  -DBUILD_TESTING=on \
+  -Dalpaka_BUILD_EXAMPLES=on \
+```
+All other backends are disabled for faster compilation/testing and reduced
+environment requirements. Add flags to set the required compiler and linker flags, e.g:
+- clang/AOMP, target x86:
+  ```
+    -DCMAKE_CXX_FLAGS="-fopenmp -fopenmp=libomp -fopenmp-targets=x86_64-pc-linux-gnu" \
+    -DCMAKE_EXE_LINKER_FLAGS="-fopenmp"
+  ```
+- clang/AOMP, target ppc64le:
+  ```
+    -DCMAKE_CXX_FLAGS="-fopenmp -fopenmp=libomp -fopenmp-targets=ppc64le-pc-linux-gnu" \
+    -DCMAKE_EXE_LINKER_FLAGS="-fopenmp"
+  ```
+- clang, target nvptx:
+  ```
+    -DCMAKE_CXX_FLAGS="-fopenmp -fopenmp-targets=nvptx64-nvidia-cuda -O2" \
+    -DCMAKE_EXE_LINKER_FLAGS="-fopenmp"
+  ```
+- AOMP, target amdhsa:
+  ```
+    -DCMAKE_CXX_FLAGS="-fopenmp=libomp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx900 --save-temps" \
+    -DCMAKE_EXE_LINKER_FLAGS="-fopenmp"
+  ```
+- GCC, target nvptx:
+  ```
+    -DCMAKE_CXX_FLAGS="-foffload=nvptx-none -foffload=-lm -fno-lto"
+  ```
+- GCC, target host:
+  ```
+    -DCMAKE_CXX_FLAGS="-foffload=disable -fno-lto"
+  ```
+- XL, offload:
+  ```
+    -DCMAKE_CXX_FLAGS="-qoffload -qsmp"
+  ```
+- XL, no offload:
+  ```
+    -DCMAKE_CXX_FLAGS=""
+  ```
+
+## Limitations
+
+* *No separable compilation*. OpenMP 5 requires functions for which device code should be generated for a
+  not-inlined call in a target region to be marked with pragmas. This cannot be
+  wrapped by macros like `ALPAKA_FN_DEVICE` because they appear between template
+  parameter list and function name and because OpenMP requires two macros to
+  mark a region around the function.
+  <https://github.com/alpaka-group/alpaka/pull/1126#discussion_r479761867>
+
+## 1. Test target
+
+```
+make vectorAdd
+./example/vectorAdd/vectorAdd
+```
+If the run is successful, the last output line will be `Execution results
+correct!` otherwise it will print items where the result from the offload code
+disagrees with the expected result and print `Execution results
+incorrect!` at the end.
+
+## 2. Examples compilation status
+
+### branch omp4
+
+|target|compiler|compile status|target|run status|
+|---|---|---|---|---|
+|vectorAdd|
+||GCC 10 | ok|host|ok|
+||GCC 10 | ptxas error (2)|nvptx|--|
+||AOMP 0.7-4|ok|x86|omp_target_alloc() returns 0|
+||AOMP 0.7-4|linker: multiple def. of gpuHeap (1)|amdhsa|--|
+||AOMP 0.7-5|ok|x86|ok|
+||AOMP 0.7-5|ok|amdhsa|ok|
+||LLVM 10 |ok| x86|ok|
+||XL 16.1.1-5 (Summit)|ok| nvptx|ok (num_threads workaround) (3)|
+||XL 16.1.1-5 (Summit)|ok| ppc64le| sigsegv (device mem allocated on GPU)|
+
+#### errors:
+1. error: Linking globals named 'gpuHeap': symbol multiply defined!
+   ```
+    /usr/bin/ld: cannot find a.out-openmp-amdgcn-amd-amdhsa-gfx900
+    /usr/bin/ld: cannot find a.out-openmp-amdgcn-amd-amdhsa-gfx900
+    clang-9: error: amdgcn-link command failed with exit code 1 (use -v to see 
+    invocation)
+    clang-9: error: linker command failed with exit code 1 (use -v to see 
+    invocation)
+   ```
+2. ptxas:
+   ```
+    Linking CXX executable vectorAdd
+    ptxas /tmp/cccKHuiQ.o, line 216; error   : Label expected for argument 0 of instruction 'call'
+    ptxas /tmp/cccKHuiQ.o, line 216; error   : Function '_ZN6alpaka3ctx12CtxBlockOaccISt17integral_constantImLm1EEmEC1ERKNS_3vec3VecIS3_mEES9_S9_RKmSB_' not declared in this scope
+    ptxas /tmp/cccKHuiQ.o, line 216; fatal   : Call target not recognized
+    ptxas fatal   : Ptx assembly aborted due to errors
+    nvptx-as: ptxas returned 255 exit status
+   ```
+3. IBM XL: When setting num_threads to any value, either in #pragma omp parallel or
+   via omp_set_num_threads, the runtime only executes one thread per
+   team. Workaround is to not do that with XL, which leads to $OMP_NUM_THREADS
+   being run per team. Minimal example:
+   https://github.com/jkelling/omp5tests/blob/master/parallel/parallel.cpp
+
+## 3. Integration and Unit Tests
+
+Run `make` and upon success `ctest`.
+
+|test|compiler|compile status|target|run status|
+|---|---|---|---|---|
+|ALL|
+||LLVM 10 |ok|x86|pass|
+||LLVM 11 |ok|x86|pass|
+||AOMP 0.7-5|linker error with static lib (7)|x86|--|
+||AOMP 0.7-5|linker error with static lib (8)|amdhsa|--|
+||GCC 10 |mixed(1)|host|target alloc fail(2)|
+||GCC 11 |ok|host|target alloc fail(2)|
+||XL 16.1.1-5 (Summit)|no-halt [6]|nvptx|--|
+||XL 16.1.1-5 (Summit)|no-halt [6]|ppc64le|--|
+
+#### errors:
+1. Targets with multiple compilation units fail to link.
+   <https://github.com/alpaka-group/alpaka/pull/1126#discussion_r475591568>
+2. `omp_target_alloc()` allocates memory on GPU while code runs on host and
+   tries to access it there => segfault
+3. _
+4. _
+5. _
+6. XL does not appear to terminate when compiling targets like `blockShared` in
+   which tests are executed through the fixture in
+   ~alpaka/test/common/include/alpaka/test/KernelExecutionFixture.hpp .
+   Removing the call
+   alpaka/test/unit/block/shared/src/BlockSharedMemDyn.cpp:92-94 yields finite
+   compilation time for BlockSharedMemDyn.cpp.o . XL is extremely slow
+   compiling code using the test framework catch2 used in Alpaka.
+7. aomp 0.7-5 x86:
+   ```
+   /usr/bin/ld: cannot find libcommon-openmp-x86_64-pc-linux-gnu-sm_20.o: No such file or directory
+   /usr/bin/ld: cannot find libcommon-host-x86_64-unknown-linux-gnu.o: No such file or directory
+   clang-9: error: linker command failed with exit code 1 (use -v to see invocation)
+   clang-9: error: linker command failed with exit code 1 (use -v to see invocation)
+   test/integ/matMul/CMakeFiles/matMul.dir/build.make:85: recipe for target 'test/integ/matMul/matMul' failed
+   ```
+8. aomp 0.7-5 HSA:
+   ```
+   /home/kelling/rocm/aomp_0.7-5/bin/clang-build-select-link: libcommon-openmp-amdgcn-amd-amdhsa-gfx900.o:1:2: error: expected integer
+   !<arch>
+    ^
+   /home/kelling/rocm/aomp_0.7-5/bin/clang-build-select-link: error:  loading file 'libcommon-openmp-amdgcn-amd-amdhsa-gfx900.o'
+   /usr/bin/ld: cannot find a.out-openmp-amdgcn-amd-amdhsa-gfx900
+   /usr/bin/ld: cannot find a.out-openmp-amdgcn-amd-amdhsa-gfx900
+   clang-9: error: amdgcn-link command failed with exit code 1 (use -v to see invocation)
+   clang-9: error: linker command failed with exit code 1 (use -v to see invocation)
+   ```
diff --git a/thirdParty/cupla/alpaka/alpakaConfig.cmake b/thirdParty/cupla/alpaka/alpakaConfig.cmake
deleted file mode 100644
index ad66efd118..0000000000
--- a/thirdParty/cupla/alpaka/alpakaConfig.cmake
+++ /dev/null
@@ -1,1220 +0,0 @@
-#
-# Copyright 2014-2019 Benjamin Worpitz, Erik Zenker, Axel Huebl
-#
-# This file is part of Alpaka.
-#
-# This Source Code Form is subject to the terms of the Mozilla Public
-# License, v. 2.0. If a copy of the MPL was not distributed with this
-# file, You can obtain one at http://mozilla.org/MPL/2.0/.
-#
-
-################################################################################
-# Required cmake version.
-
-CMAKE_MINIMUM_REQUIRED(VERSION 3.11.4)
-
-################################################################################
-# CMake policies
-#
-# Search in <PackageName>_ROOT:
-#   https://cmake.org/cmake/help/v3.12/policy/CMP0074.html
-
-if(POLICY CMP0074)
-    cmake_policy(SET CMP0074 NEW)
-endif()
-
-################################################################################
-# alpaka.
-
-# Return values.
-UNSET(alpaka_FOUND)
-UNSET(alpaka_VERSION)
-UNSET(alpaka_COMPILE_OPTIONS)
-UNSET(alpaka_COMPILE_DEFINITIONS)
-UNSET(alpaka_DEFINITIONS)
-UNSET(alpaka_INCLUDE_DIR)
-UNSET(alpaka_INCLUDE_DIRS)
-UNSET(alpaka_LIBRARY)
-UNSET(alpaka_LIBRARIES)
-
-# Internal usage.
-UNSET(_ALPAKA_FOUND)
-UNSET(_ALPAKA_COMPILE_OPTIONS_PUBLIC)
-UNSET(_ALPAKA_COMPILE_DEFINITIONS_PUBLIC)
-UNSET(_ALPAKA_INCLUDE_DIRECTORY)
-UNSET(_ALPAKA_INCLUDE_DIRECTORIES_PUBLIC)
-UNSET(_ALPAKA_LINK_LIBRARIES_PUBLIC)
-UNSET(_ALPAKA_LINK_FLAGS_PUBLIC)
-UNSET(_ALPAKA_COMMON_FILE)
-UNSET(_ALPAKA_ADD_EXECUTABLE_FILE)
-UNSET(_ALPAKA_ADD_LIBRRAY_FILE)
-UNSET(_ALPAKA_FILES_HEADER)
-UNSET(_ALPAKA_FILES_OTHER)
-
-#-------------------------------------------------------------------------------
-# Common.
-
-# Directory of this file.
-SET(_ALPAKA_ROOT_DIR ${CMAKE_CURRENT_LIST_DIR})
-# Normalize the path (e.g. remove ../)
-GET_FILENAME_COMPONENT(_ALPAKA_ROOT_DIR "${_ALPAKA_ROOT_DIR}" ABSOLUTE)
-
-# Add common functions.
-SET(_ALPAKA_COMMON_FILE "${_ALPAKA_ROOT_DIR}/cmake/common.cmake")
-INCLUDE("${_ALPAKA_COMMON_FILE}")
-
-# Add ALPAKA_ADD_EXECUTABLE function.
-SET(_ALPAKA_ADD_EXECUTABLE_FILE "${_ALPAKA_ROOT_DIR}/cmake/addExecutable.cmake")
-INCLUDE("${_ALPAKA_ADD_EXECUTABLE_FILE}")
-
-# Add ALPAKA_ADD_LIBRARY function.
-SET(_ALPAKA_ADD_LIBRARY_FILE "${_ALPAKA_ROOT_DIR}/cmake/addLibrary.cmake")
-INCLUDE("${_ALPAKA_ADD_LIBRARY_FILE}")
-
-# Set found to true initially and set it to false if a required dependency is missing.
-SET(_ALPAKA_FOUND TRUE)
-
-# Add module search path
-SET(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${_ALPAKA_ROOT_DIR}/cmake/modules/")
-
-#-------------------------------------------------------------------------------
-# Options.
-SET(ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLE_DEFAULT ON)
-SET(ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLE_DEFAULT ON)
-SET(ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE_DEFAULT ON)
-SET(ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLE_DEFAULT ON)
-SET(ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE_DEFAULT ON)
-SET(ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE_DEFAULT ON)
-SET(ALPAKA_ACC_CPU_BT_OMP4_ENABLE_DEFAULT ON)
-
-# HIP and platform selection and warning about unsupported features
-OPTION(ALPAKA_ACC_GPU_HIP_ENABLE "Enable the HIP back-end (all other back-ends must be disabled)" OFF)
-OPTION(ALPAKA_ACC_GPU_HIP_ONLY_MODE "Only back-ends using HIP can be enabled in this mode." OFF) # HIP only runs without other back-ends
-
-# Drop-down combo box in cmake-gui for HIP platforms.
-SET(ALPAKA_HIP_PLATFORM "nvcc" CACHE STRING "Specify HIP platform")
-SET_PROPERTY(CACHE ALPAKA_HIP_PLATFORM PROPERTY STRINGS "nvcc;hcc;clang")
-
-IF(ALPAKA_ACC_GPU_HIP_ENABLE AND NOT ALPAKA_ACC_GPU_HIP_ONLY_MODE)
-    MESSAGE(WARNING "HIP back-end must be used together with ALPAKA_ACC_GPU_HIP_ONLY_MODE")
-    SET(ALPAKA_ACC_GPU_HIP_ENABLE OFF CACHE BOOL "" FORCE)
-ENDIF()
-
-IF(ALPAKA_ACC_GPU_HIP_ENABLE AND (ALPAKA_HIP_PLATFORM MATCHES "hcc" OR ALPAKA_HIP_PLATFORM MATCHES "clang"))
-    MESSAGE(WARNING
-        "The HIP back-end is currently experimental, especially for HCC. "
-        "In alpaka HIP(HCC) has a few workarounds and does not support 3D memory and constant memory. "
-        )
-ENDIF()
-
-OPTION(ALPAKA_ACC_GPU_CUDA_ONLY_MODE "Only back-ends using CUDA can be enabled in this mode (This allows to mix alpaka code with native CUDA code)." OFF)
-# If CUDA-only mode is enabled, we set the defaults for all CPU back-ends to OFF.
-# If they are explicitly set via the command line, the user will get an error later on.
-IF(ALPAKA_ACC_GPU_CUDA_ONLY_MODE OR ALPAKA_ACC_GPU_HIP_ONLY_MODE) # CUDA-only or HIP-only
-    SET(ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLE_DEFAULT OFF)
-    SET(ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLE_DEFAULT OFF)
-    SET(ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE_DEFAULT OFF)
-    SET(ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLE_DEFAULT OFF)
-    SET(ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE_DEFAULT OFF)
-    SET(ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE_DEFAULT OFF)
-    SET(ALPAKA_ACC_CPU_BT_OMP4_ENABLE_DEFAULT OFF)
-ENDIF()
-
-OPTION(ALPAKA_ACC_GPU_CUDA_ENABLE "Enable the CUDA GPU back-end" ON)
-
-# If CUDA is enabled, we set the defaults for some unsupported back-ends to OFF.
-# If they are explicitly set via the command line, the user will get an error later on.
-IF(ALPAKA_ACC_GPU_CUDA_ENABLE)
-    SET(ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE_DEFAULT OFF)
-    IF(ALPAKA_CUDA_COMPILER MATCHES "clang")
-        SET(ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE_DEFAULT OFF)
-        SET(ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE_DEFAULT OFF)
-        SET(ALPAKA_ACC_CPU_BT_OMP4_ENABLE_DEFAULT OFF)
-    ENDIF()
-ENDIF()
-
-IF(ALPAKA_ACC_GPU_CUDA_ONLY_MODE AND NOT ALPAKA_ACC_GPU_CUDA_ENABLE)
-    MESSAGE(WARNING "If ALPAKA_ACC_GPU_CUDA_ONLY_MODE is enabled, ALPAKA_ACC_GPU_CUDA_ENABLE has to be enabled as well.")
-    SET(_ALPAKA_FOUND FALSE)
-ENDIF()
-IF(ALPAKA_ACC_GPU_HIP_ONLY_MODE AND NOT ALPAKA_ACC_GPU_HIP_ENABLE)
-    MESSAGE(WARNING "If ALPAKA_ACC_GPU_HIP_ONLY_MODE is enabled, ALPAKA_ACC_GPU_HIP_ENABLE has to be enabled as well.")
-    SET(_ALPAKA_FOUND FALSE)
-ENDIF()
-
-
-OPTION(ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLE "Enable the serial CPU back-end" ${ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLE_DEFAULT})
-OPTION(ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLE "Enable the threads CPU block thread back-end" ${ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLE_DEFAULT})
-OPTION(ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE "Enable the fibers CPU block thread back-end" ${ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE_DEFAULT})
-OPTION(ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLE "Enable the TBB CPU grid block back-end" ${ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLE_DEFAULT})
-OPTION(ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE "Enable the OpenMP 2.0 CPU grid block back-end" ${ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE_DEFAULT})
-OPTION(ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE "Enable the OpenMP 2.0 CPU block thread back-end" ${ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE_DEFAULT})
-OPTION(ALPAKA_ACC_CPU_BT_OMP4_ENABLE "Enable the OpenMP 4.0 CPU block and block thread back-end" ${ALPAKA_ACC_CPU_BT_OMP4_ENABLE_DEFAULT})
-
-IF((ALPAKA_ACC_GPU_CUDA_ONLY_MODE OR ALPAKA_ACC_GPU_HIP_ONLY_MODE)
-   AND
-    (ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLE OR
-    ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLE OR
-    ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE OR
-    ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLE OR
-    ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE OR
-    ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE OR
-    ALPAKA_ACC_CPU_BT_OMP4_ENABLE))
-    IF(ALPAKA_ACC_GPU_CUDA_ONLY_MODE)
-        MESSAGE(WARNING "If ALPAKA_ACC_GPU_CUDA_ONLY_MODE is enabled, only back-ends using CUDA can be enabled! This allows to mix alpaka code with native CUDA code. However, this prevents any non-CUDA back-ends from being enabled.")
-    ENDIF()
-    IF(ALPAKA_ACC_GPU_HIP_ONLY_MODE)
-        MESSAGE(WARNING "If ALPAKA_ACC_GPU_HIP_ONLY_MODE is enabled, only back-ends using HIP can be enabled!")
-    ENDIF()
-    SET(_ALPAKA_FOUND FALSE)
-ENDIF()
-
-# avoids CUDA+HIP conflict
-IF(ALPAKA_ACC_GPU_HIP_ENABLE AND ALPAKA_ACC_GPU_CUDA_ENABLE)
-    MESSAGE(FATAL_ERROR "CUDA and HIP can not be enabled both at the same time.")
-ENDIF()
-
-# HIP is only supported on Linux
-IF(ALPAKA_ACC_GPU_HIP_ENABLE AND (MSVC OR WIN32))
-    MESSAGE(WARNING "Optional alpaka dependency HIP can not be built on Windows! HIP back-end disabled!")
-    SET(ALPAKA_ACC_GPU_HIP_ENABLE OFF CACHE BOOL "Enable the HIP GPU back-end" FORCE)
-ENDIF()
-
-# Drop-down combo box in cmake-gui.
-SET(ALPAKA_DEBUG "0" CACHE STRING "Debug level")
-SET_PROPERTY(CACHE ALPAKA_DEBUG PROPERTY STRINGS "0;1;2")
-
-SET(ALPAKA_CXX_STANDARD "11" CACHE STRING "C++ standard version")
-SET_PROPERTY(CACHE ALPAKA_CXX_STANDARD PROPERTY STRINGS "11;14;17")
-
-#-------------------------------------------------------------------------------
-# Debug output of common variables.
-IF(${ALPAKA_DEBUG} GREATER 1)
-    MESSAGE(STATUS "_ALPAKA_ROOT_DIR : ${_ALPAKA_ROOT_DIR}")
-    MESSAGE(STATUS "_ALPAKA_COMMON_FILE : ${_ALPAKA_COMMON_FILE}")
-    MESSAGE(STATUS "_ALPAKA_ADD_EXECUTABLE_FILE : ${_ALPAKA_ADD_EXECUTABLE_FILE}")
-    MESSAGE(STATUS "_ALPAKA_ADD_LIBRARY_FILE : ${_ALPAKA_ADD_LIBRARY_FILE}")
-    MESSAGE(STATUS "CMAKE_BUILD_TYPE : ${CMAKE_BUILD_TYPE}")
-ENDIF()
-
-#-------------------------------------------------------------------------------
-# Check supported compilers.
-IF(CMAKE_CXX_COMPILER_ID MATCHES "Clang" AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.0)
-    MESSAGE(FATAL_ERROR "Clang versions < 4.0 are not supported!")
-    SET(_ALPAKA_FOUND FALSE)
-ENDIF()
-
-IF(ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE AND (ALPAKA_ACC_GPU_CUDA_ENABLE OR ALPAKA_ACC_GPU_HIP_ENABLE))
-    MESSAGE(FATAL_ERROR "Fibers and CUDA or HIP back-end can not be enabled both at the same time.")
-    SET(_ALPAKA_FOUND FALSE)
-ENDIF()
-
-#-------------------------------------------------------------------------------
-# Compiler settings.
-IF(MSVC)
-    # Empty append to define it if it does not already exist.
-    LIST(APPEND _ALPAKA_COMPILE_OPTIONS_PUBLIC)
-
-    IF(ALPAKA_ACC_GPU_CUDA_ONLY_MODE)
-        LIST(APPEND _ALPAKA_COMPILE_OPTIONS_PUBLIC "/wd4505")   # CUDA\v9.2\include\crt/host_runtime.h(265): warning C4505: '__cudaUnregisterBinaryUtil': unreferenced local function has been removed
-    ENDIF()
-ELSE()
-    # Add linker options.
-    # lipthread:
-    LIST(APPEND _ALPAKA_LINK_LIBRARIES_PUBLIC "general;pthread")
-    IF(NOT APPLE)
-        # librt: undefined reference to `clock_gettime'
-        LIST(APPEND _ALPAKA_LINK_LIBRARIES_PUBLIC "general;rt")
-    ENDIF()
-
-    # Clang<4.0 or AppleClang<9.0
-    #   https://bugs.llvm.org/show_bug.cgi?id=18417
-    #   https://github.com/llvm/llvm-project/commit/e55b4737c026ea2e0b44829e4115d208577a67b2
-    IF(("${CMAKE_CXX_COMPILER_ID}" STREQUAL "AppleClang" AND
-        CMAKE_CXX_COMPILER_VERSION VERSION_LESS 9.1) OR
-       ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang" AND
-        CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.0))
-        LIST(APPEND _ALPAKA_COMPILE_OPTIONS_PUBLIC "-ftemplate-depth=1024")
-    ENDIF()
-ENDIF()
-
-#-------------------------------------------------------------------------------
-# Find Boost.
-SET(_ALPAKA_BOOST_MIN_VER "1.62.0")
-IF(${ALPAKA_DEBUG} GREATER 1)
-    SET(Boost_DEBUG ON)
-    SET(Boost_DETAILED_FAILURE_MSG ON)
-ENDIF()
-IF(ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE)
-    FIND_PACKAGE(Boost ${_ALPAKA_BOOST_MIN_VER} QUIET COMPONENTS fiber context system thread atomic chrono date_time)
-    IF(NOT Boost_FIBER_FOUND)
-        MESSAGE(STATUS "Optional alpaka dependency Boost fiber could not be found! Fibers back-end disabled!")
-        SET(ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE OFF CACHE BOOL "Enable the Fibers CPU back-end" FORCE)
-        FIND_PACKAGE(Boost ${_ALPAKA_BOOST_MIN_VER} QUIET)
-    ELSE()
-        # On Win32 boost context triggers:
-        # libboost_context-vc141-mt-gd-1_64.lib(jump_i386_ms_pe_masm.obj) : error LNK2026: module unsafe for SAFESEH image.
-        IF(MSVC)
-            IF(CMAKE_SIZEOF_VOID_P EQUAL 4)
-                SET(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} /SAFESEH:NO")
-                SET(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} /SAFESEH:NO")
-                SET(CMAKE_MODULE_LINKER_FLAGS "${CMAKE_MODULE_LINKER_FLAGS} /SAFESEH:NO")
-            ENDIF()
-        ENDIF()
-    ENDIF()
-
-ELSE()
-    FIND_PACKAGE(Boost ${_ALPAKA_BOOST_MIN_VER} QUIET)
-ENDIF()
-
-IF(${ALPAKA_DEBUG} GREATER 1)
-    MESSAGE(STATUS "Boost in:")
-    MESSAGE(STATUS "BOOST_ROOT : ${BOOST_ROOT}")
-    MESSAGE(STATUS "BOOSTROOT : ${BOOSTROOT}")
-    MESSAGE(STATUS "BOOST_INCLUDEDIR: ${BOOST_INCLUDEDIR}")
-    MESSAGE(STATUS "BOOST_LIBRARYDIR: ${BOOST_LIBRARYDIR}")
-    MESSAGE(STATUS "Boost_NO_SYSTEM_PATHS: ${Boost_NO_SYSTEM_PATHS}")
-    MESSAGE(STATUS "Boost_ADDITIONAL_VERSIONS: ${Boost_ADDITIONAL_VERSIONS}")
-    MESSAGE(STATUS "Boost_USE_MULTITHREADED: ${Boost_USE_MULTITHREADED}")
-    MESSAGE(STATUS "Boost_USE_STATIC_LIBS: ${Boost_USE_STATIC_LIBS}")
-    MESSAGE(STATUS "Boost_USE_STATIC_RUNTIME: ${Boost_USE_STATIC_RUNTIME}")
-    MESSAGE(STATUS "Boost_USE_DEBUG_RUNTIME: ${Boost_USE_DEBUG_RUNTIME}")
-    MESSAGE(STATUS "Boost_USE_DEBUG_PYTHON: ${Boost_USE_DEBUG_PYTHON}")
-    MESSAGE(STATUS "Boost_USE_STLPORT: ${Boost_USE_STLPORT}")
-    MESSAGE(STATUS "Boost_USE_STLPORT_DEPRECATED_NATIVE_IOSTREAMS: ${Boost_USE_STLPORT_DEPRECATED_NATIVE_IOSTREAMS}")
-    MESSAGE(STATUS "Boost_COMPILER: ${Boost_COMPILER}")
-    MESSAGE(STATUS "Boost_THREADAPI: ${Boost_THREADAPI}")
-    MESSAGE(STATUS "Boost_NAMESPACE: ${Boost_NAMESPACE}")
-    MESSAGE(STATUS "Boost_DEBUG: ${Boost_DEBUG}")
-    MESSAGE(STATUS "Boost_DETAILED_FAILURE_MSG: ${Boost_DETAILED_FAILURE_MSG}")
-    MESSAGE(STATUS "Boost_REALPATH: ${Boost_REALPATH}")
-    MESSAGE(STATUS "Boost_NO_BOOST_CMAKE: ${Boost_NO_BOOST_CMAKE}")
-    MESSAGE(STATUS "Boost out:")
-    MESSAGE(STATUS "Boost_FOUND: ${Boost_FOUND}")
-    MESSAGE(STATUS "Boost_INCLUDE_DIRS: ${Boost_INCLUDE_DIRS}")
-    MESSAGE(STATUS "Boost_LIBRARY_DIRS: ${Boost_LIBRARY_DIRS}")
-    MESSAGE(STATUS "Boost_LIBRARIES: ${Boost_LIBRARIES}")
-    MESSAGE(STATUS "Boost_FIBER_FOUND: ${Boost_FIBER_FOUND}")
-    MESSAGE(STATUS "Boost_FIBER_LIBRARY: ${Boost_FIBER_LIBRARY}")
-    MESSAGE(STATUS "Boost_CONTEXT_FOUND: ${Boost_CONTEXT_FOUND}")
-    MESSAGE(STATUS "Boost_CONTEXT_LIBRARY: ${Boost_CONTEXT_LIBRARY}")
-    MESSAGE(STATUS "Boost_SYSTEM_FOUND: ${Boost_SYSTEM_FOUND}")
-    MESSAGE(STATUS "Boost_SYSTEM_LIBRARY: ${Boost_SYSTEM_LIBRARY}")
-    MESSAGE(STATUS "Boost_THREAD_FOUND: ${Boost_THREAD_FOUND}")
-    MESSAGE(STATUS "Boost_THREAD_LIBRARY: ${Boost_THREAD_LIBRARY}")
-    MESSAGE(STATUS "Boost_ATOMIC_FOUND: ${Boost_ATOMIC_FOUND}")
-    MESSAGE(STATUS "Boost_ATOMIC_LIBRARY: ${Boost_ATOMIC_LIBRARY}")
-    MESSAGE(STATUS "Boost_CHRONO_FOUND: ${Boost_CHRONO_FOUND}")
-    MESSAGE(STATUS "Boost_CHRONO_LIBRARY: ${Boost_CHRONO_LIBRARY}")
-    MESSAGE(STATUS "Boost_DATE_TIME_FOUND: ${Boost_DATE_TIME_FOUND}")
-    MESSAGE(STATUS "Boost_DATE_TIME_LIBRARY: ${Boost_DATE_TIME_LIBRARY}")
-    MESSAGE(STATUS "Boost_VERSION: ${Boost_VERSION}")
-    MESSAGE(STATUS "Boost_LIB_VERSION: ${Boost_LIB_VERSION}")
-    MESSAGE(STATUS "Boost_MAJOR_VERSION: ${Boost_MAJOR_VERSION}")
-    MESSAGE(STATUS "Boost_MINOR_VERSION: ${Boost_MINOR_VERSION}")
-    MESSAGE(STATUS "Boost_SUBMINOR_VERSION: ${Boost_SUBMINOR_VERSION}")
-    MESSAGE(STATUS "Boost_LIB_DIAGNOSTIC_DEFINITIONS: ${Boost_LIB_DIAGNOSTIC_DEFINITIONS}")
-    MESSAGE(STATUS "Boost cached:")
-    MESSAGE(STATUS "Boost_INCLUDE_DIR: ${Boost_INCLUDE_DIR}")
-    MESSAGE(STATUS "Boost_LIBRARY_DIR: ${Boost_LIBRARY_DIR}")
-ENDIF()
-
-IF(NOT Boost_FOUND)
-    MESSAGE(WARNING "Required alpaka dependency Boost (>=${_ALPAKA_BOOST_MIN_VER}) could not be found!")
-    SET(_ALPAKA_FOUND FALSE)
-
-ELSE()
-    IF(Boost_FIBER_FOUND)
-        # Boost fiber and default header-only libraries
-        IF(TARGET Boost::fiber)
-            LIST(APPEND _ALPAKA_LINK_LIBRARIES_PUBLIC
-                 Boost::boost
-                 Boost::fiber Boost::context Boost::system Boost::thread
-                 Boost::chrono Boost::date_time Boost::atomic
-            )
-        ELSE()
-            # fallback: Boost version is too new for CMake
-            LIST(APPEND _ALPAKA_INCLUDE_DIRECTORIES_PUBLIC ${Boost_INCLUDE_DIRS})
-            LIST(APPEND _ALPAKA_LINK_LIBRARIES_PUBLIC ${Boost_LIBRARIES})
-        ENDIF()
-    ELSE()
-        # header-only libraries
-        IF(TARGET Boost::boost)
-            LIST(APPEND _ALPAKA_LINK_LIBRARIES_PUBLIC Boost::boost)
-        ELSE()
-            # fallback: Boost version is too new for CMake
-            LIST(APPEND _ALPAKA_INCLUDE_DIRECTORIES_PUBLIC ${Boost_INCLUDE_DIRS})
-            LIST(APPEND _ALPAKA_LINK_LIBRARIES_PUBLIC ${Boost_LIBRARIES})
-        ENDIF()
-    ENDIF()
-ENDIF()
-
-#-------------------------------------------------------------------------------
-# Find TBB.
-IF(ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLE)
-    FIND_PACKAGE(TBB 2.2)
-    IF(NOT TBB_FOUND)
-        MESSAGE(STATUS "Optional alpaka dependency TBB could not be found! TBB grid block back-end disabled!")
-        SET(ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLE OFF CACHE BOOL "Enable the TBB grid block back-end" FORCE)
-    ELSE()
-        LIST(APPEND _ALPAKA_LINK_LIBRARIES_PUBLIC ${TBB_LIBRARIES})
-        LIST(APPEND _ALPAKA_INCLUDE_DIRECTORIES_PUBLIC ${TBB_INCLUDE_DIRS})
-        LIST(APPEND _ALPAKA_COMPILE_OPTIONS_PUBLIC ${TBB_DEFINITIONS})
-    ENDIF()
-ENDIF()
-
-#-------------------------------------------------------------------------------
-# Find OpenMP.
-IF(ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE OR ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE OR ALPAKA_ACC_CPU_BT_OMP4_ENABLE)
-    FIND_PACKAGE(OpenMP)
-
-    # Manually find OpenMP for the clang compiler if it was not already found.
-    # Even CMake 3.5 is unable to find libiomp and provide the correct OpenMP flags.
-    IF(NOT OPENMP_FOUND)
-        IF(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
-            FIND_PATH(_ALPAKA_LIBIOMP_INCLUDE_DIR NAMES "omp.h" PATH_SUFFIXES "include" "libiomp" "include/libiomp")
-            IF(_ALPAKA_LIBIOMP_INCLUDE_DIR)
-                SET(OPENMP_FOUND TRUE)
-                SET(OpenMP_CXX_FLAGS "-fopenmp=libiomp5")
-                SET(OpenMP_C_FLAGS "-fopenmp=libiomp5")
-                LIST(APPEND _ALPAKA_INCLUDE_DIRECTORIES_PUBLIC "${_ALPAKA_LIBIOMP_INCLUDE_DIR}")
-            ENDIF()
-        ENDIF()
-    ENDIF()
-
-    IF(NOT OPENMP_FOUND)
-        MESSAGE(STATUS "Optional alpaka dependency OpenMP could not be found! OpenMP back-ends disabled!")
-        SET(ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE OFF CACHE BOOL "Enable the OpenMP 2.0 CPU grid block back-end" FORCE)
-        SET(ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE OFF CACHE BOOL "Enable the OpenMP 2.0 CPU block thread back-end" FORCE)
-        SET(ALPAKA_ACC_CPU_BT_OMP4_ENABLE OFF CACHE BOOL "Enable the OpenMP 4.0 CPU block and thread back-end" FORCE)
-
-    ELSE()
-
-        # Check whether OpenMP 4 is supported
-        IF(OpenMP_CXX_VERSION VERSION_LESS 4.0)
-            SET(ALPAKA_ACC_CPU_BT_OMP4_ENABLE OFF CACHE BOOL "Enable the OpenMP 4.0 CPU block and thread back-end" FORCE)
-        ENDIF()
-
-        LIST(APPEND _ALPAKA_COMPILE_OPTIONS_PUBLIC ${OpenMP_CXX_FLAGS})
-        IF(NOT MSVC)
-            LIST(APPEND _ALPAKA_LINK_FLAGS_PUBLIC ${OpenMP_CXX_FLAGS})
-        ENDIF()
-
-        # clang versions beginning with 3.9 support OpenMP 4.0 but only when given the corresponding flag
-        IF(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
-            IF(ALPAKA_ACC_CPU_BT_OMP4_ENABLE)
-                LIST(APPEND _ALPAKA_COMPILE_OPTIONS_PUBLIC "-fopenmp-version=40")
-            ENDIF()
-        ENDIF()
-    ENDIF()
-ENDIF()
-
-#-------------------------------------------------------------------------------
-# Find CUDA.
-IF(ALPAKA_ACC_GPU_CUDA_ENABLE)
-
-    IF(NOT DEFINED ALPAKA_CUDA_VERSION)
-        SET(ALPAKA_CUDA_VERSION 8.0)
-    ENDIF()
-
-    IF(ALPAKA_CUDA_VERSION VERSION_LESS 8.0)
-        MESSAGE(WARNING "CUDA Toolkit < 8.0 is not supported!")
-        SET(_ALPAKA_FOUND FALSE)
-
-    ELSE()
-        FIND_PACKAGE(CUDA "${ALPAKA_CUDA_VERSION}")
-        IF(NOT CUDA_FOUND)
-            MESSAGE(STATUS "Optional alpaka dependency CUDA could not be found! CUDA back-end disabled!")
-            SET(ALPAKA_ACC_GPU_CUDA_ENABLE OFF CACHE BOOL "Enable the CUDA GPU back-end" FORCE)
-
-        ELSE()
-            SET(ALPAKA_CUDA_VERSION "${CUDA_VERSION}")
-            IF(CUDA_VERSION VERSION_LESS 9.0)
-                SET(ALPAKA_CUDA_ARCH "20" CACHE STRING "GPU architecture")
-            ELSEIF(CUDA_VERSION VERSION_LESS 10.3)
-                SET(ALPAKA_CUDA_ARCH "30" CACHE STRING "GPU architecture")
-            ELSE()
-                SET(ALPAKA_CUDA_ARCH "35" CACHE STRING "GPU architecture")
-            ENDIF()
-            SET(ALPAKA_CUDA_COMPILER "nvcc" CACHE STRING "CUDA compiler")
-            SET_PROPERTY(CACHE ALPAKA_CUDA_COMPILER PROPERTY STRINGS "nvcc;clang")
-
-            OPTION(ALPAKA_CUDA_FAST_MATH "Enable fast-math" ON)
-            OPTION(ALPAKA_CUDA_FTZ "Set flush to zero for GPU" OFF)
-            OPTION(ALPAKA_CUDA_SHOW_REGISTER "Show kernel registers and create PTX" OFF)
-            OPTION(ALPAKA_CUDA_KEEP_FILES "Keep all intermediate files that are generated during internal compilation steps (folder: nvcc_tmp)" OFF)
-            OPTION(ALPAKA_CUDA_NVCC_EXPT_EXTENDED_LAMBDA "Enable experimental, extended host-device lambdas in NVCC" ON)
-            OPTION(ALPAKA_CUDA_NVCC_EXPT_RELAXED_CONSTEXPR "Enable experimental, relaxed constexpr in NVCC" ON)
-            OPTION(ALPAKA_CUDA_NVCC_SEPARABLE_COMPILATION "Enable separable compilation in NVCC" OFF)
-
-            IF(ALPAKA_CUDA_COMPILER MATCHES "clang")
-                IF(NOT "${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
-                    MESSAGE(FATAL_ERROR "Using clang as CUDA compiler is only possible if clang is the host compiler!")
-                ENDIF()
-
-                IF(CMAKE_CXX_COMPILER_VERSION LESS 6.0)
-                    IF(CUDA_VERSION GREATER_EQUAL 9.0)
-                        MESSAGE(FATAL_ERROR "Clang versions lower than 6 do not support CUDA 9 or greater!")
-                    ENDIF()
-                ELSEIF(CMAKE_CXX_COMPILER_VERSION LESS 7.0)
-                    IF(CUDA_VERSION GREATER_EQUAL 9.1)
-                        MESSAGE(FATAL_ERROR "Clang versions lower than 7 do not support CUDA 9.1 or greater!")
-                    ENDIF()
-                ELSEIF(CMAKE_CXX_COMPILER_VERSION LESS 8.0)
-                    IF(CUDA_VERSION GREATER_EQUAL 10.0)
-                        MESSAGE(FATAL_ERROR "Clang versions lower than 8 do not support CUDA 10.0 or greater!")
-                    ENDIF()
-                ELSEIF(CMAKE_CXX_COMPILER_VERSION LESS 9.0)
-                    IF(CUDA_VERSION GREATER_EQUAL 10.1)
-                        MESSAGE(FATAL_ERROR "Clang versions lower than 9 do not support CUDA 10.1 or greater!")
-                    ENDIF()
-                ELSEIF(CMAKE_CXX_COMPILER_VERSION LESS 10.0)
-                    IF(CUDA_VERSION GREATER_EQUAL 10.2)
-                        MESSAGE(FATAL_ERROR "Clang versions lower than 10 do not support CUDA 10.2 or greater!")
-                    ENDIF()
-                ENDIF()
-
-                IF(ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE)
-                    MESSAGE(FATAL_ERROR "Clang as a CUDA compiler does not support boost.fiber!")
-                ENDIF()
-                IF(ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE OR ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE)
-                    MESSAGE(FATAL_ERROR "Clang as a CUDA compiler does not support OpenMP 2!")
-                ENDIF()
-                IF(ALPAKA_ACC_CPU_BT_OMP4_ENABLE)
-                    MESSAGE(FATAL_ERROR "Clang as a CUDA compiler does not support OpenMP 4!")
-                ENDIF()
-
-                FOREACH(_CUDA_ARCH_ELEM ${ALPAKA_CUDA_ARCH})
-                    LIST(APPEND _ALPAKA_COMPILE_OPTIONS_PUBLIC "--cuda-gpu-arch=sm_${_CUDA_ARCH_ELEM}")
-                ENDFOREACH()
-
-                LIST(APPEND _ALPAKA_COMPILE_OPTIONS_PUBLIC "--cuda-path=${CUDA_TOOLKIT_ROOT_DIR}")
-
-                # This flag silences the warning produced by the Dummy.cpp files:
-                # clang: warning: argument unused during compilation: '--cuda-gpu-arch=sm_XX'
-                # This seems to be a false positive as all flags are 'unused' for an empty file.
-                LIST(APPEND _ALPAKA_COMPILE_OPTIONS_PUBLIC "-Qunused-arguments")
-
-                # Silences warnings that are produced by boost because clang is not correctly identified.
-                LIST(APPEND _ALPAKA_COMPILE_OPTIONS_PUBLIC "-Wno-unused-local-typedef")
-
-                IF(ALPAKA_CUDA_FAST_MATH)
-                    # -ffp-contract=fast enables the usage of FMA
-                    LIST(APPEND _ALPAKA_COMPILE_OPTIONS_PUBLIC "-ffast-math" "-ffp-contract=fast")
-                ENDIF()
-
-                IF(ALPAKA_CUDA_FTZ)
-                    LIST(APPEND _ALPAKA_COMPILE_OPTIONS_PUBLIC "-fcuda-flush-denormals-to-zero")
-                ENDIF()
-
-                IF(ALPAKA_CUDA_SHOW_REGISTER)
-                    LIST(APPEND _ALPAKA_COMPILE_OPTIONS_PUBLIC "-Xcuda-ptxas=-v")
-                ENDIF()
-
-                IF(ALPAKA_CUDA_KEEP_FILES)
-                    LIST(APPEND _ALPAKA_COMPILE_OPTIONS_PUBLIC "-save-temps")
-                ENDIF()
-
-                # When libstdc++ is used and -std=gnu++XX is set, we get the following compile error:
-                # /usr/lib/gcc/x86_64-linux-gnu/5.5.0/../../../../include/c++/5.5.0/type_traits:311:39: error: __float128 is not supported on this target struct __is_floating_point_helper<__float128>
-                # Clang doesn't support the __float128 type (at least when building CUDA device code)
-                # * Due to the minimum requirement to compile with C++11 and because extensions are enabled by default by CMake, it adds -std=gnu++11 instead of -std=c++11 to the command line.
-                #   Due to alpaka being an INTERFACE library (header-only) we are not allowed to set CXX_EXTENSIONS to OFF and transitively disable extensions for inherited targets.
-                # * Defining __float128 on the command line is the least invasive workaround found here: https://bugs.llvm.org/show_bug.cgi?id=13530#c6
-                LIST(APPEND _ALPAKA_COMPILE_DEFINITIONS_PUBLIC "__float128=void")
-
-                # CMake 3.15 does not provide the `--std=c++11` argument to clang anymore.
-                # It is not necessary for basic c++ compilation because clangs default is already higher, but CUDA code compiled with -x cuda still defaults to c++98.
-                IF(${CMAKE_VERSION} VERSION_GREATER_EQUAL "3.15.0")
-                    LIST(APPEND _ALPAKA_COMPILE_OPTIONS_PUBLIC "-std=c++${ALPAKA_CXX_STANDARD}")
-                ENDIF()
-
-            ELSE()
-                IF("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
-                    IF(CUDA_VERSION VERSION_EQUAL 8.0)
-                        IF(CMAKE_CXX_COMPILER_VERSION GREATER_EQUAL 5.4)
-                            MESSAGE(FATAL_ERROR "NVCC 8.0 does not support GCC 5.4+. Please use GCC 4.9 - 5.3!")
-                        ENDIF()
-                    ELSEIF((CUDA_VERSION VERSION_EQUAL 9.0) OR (CUDA_VERSION VERSION_EQUAL 9.1))
-                        IF(CMAKE_CXX_COMPILER_VERSION GREATER_EQUAL 6.0)
-                            MESSAGE(FATAL_ERROR "NVCC 9.0 - 9.1 do not support GCC 7+ and fail compiling the std::tuple implementation in GCC 6+. Please use GCC 4.9 - 5.5!")
-                        ENDIF()
-                    ELSEIF(CUDA_VERSION VERSION_EQUAL 9.2)
-                        IF(CMAKE_CXX_COMPILER_VERSION GREATER_EQUAL 8.0)
-                            MESSAGE(FATAL_ERROR "NVCC 9.2 does not support GCC 8+. Please use GCC 4.9, 5, 6 or 7!")
-                        ENDIF()
-                    ELSEIF(CUDA_VERSION VERSION_EQUAL 10.0)
-                        IF(CMAKE_CXX_COMPILER_VERSION GREATER_EQUAL 8.0)
-                            MESSAGE(FATAL_ERROR "NVCC 10.0 does not support GCC 8+. Please use GCC 4.9, 5, 6 or 7!")
-                        ENDIF()
-                    ELSEIF(CUDA_VERSION VERSION_EQUAL 10.1)
-                        IF(CMAKE_CXX_COMPILER_VERSION GREATER_EQUAL 9.0)
-                            MESSAGE(FATAL_ERROR "NVCC 10.1 does not support GCC 9+. Please use GCC 4.9, 5, 6, 7 or 8!")
-                        ENDIF()
-                    ELSEIF(CUDA_VERSION VERSION_EQUAL 10.2)
-                        IF(CMAKE_CXX_COMPILER_VERSION GREATER_EQUAL 9.0)
-                            MESSAGE(FATAL_ERROR "NVCC 10.2 does not support GCC 9+. Please use GCC 4.9, 5, 6, 7 or 8!")
-                        ENDIF()
-                    ENDIF()
-                ELSEIF("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
-                    IF(CUDA_VERSION VERSION_EQUAL 8.0)
-                        IF(CMAKE_CXX_COMPILER_VERSION GREATER_EQUAL 4.0)
-                            MESSAGE(FATAL_ERROR "NVCC 8.0 does not support clang 4+. Please use NVCC 9.1!")
-                        ENDIF()
-                    ELSEIF(CUDA_VERSION VERSION_EQUAL 9.0)
-                        IF(CMAKE_CXX_COMPILER_VERSION GREATER_EQUAL 4.0)
-                            MESSAGE(FATAL_ERROR "NVCC 9.0 does not support clang 4+. Please use NVCC 9.1!")
-                        ENDIF()
-                    ELSEIF(CUDA_VERSION VERSION_EQUAL 9.1)
-                        IF(CMAKE_CXX_COMPILER_VERSION GREATER_EQUAL 5.0)
-                            MESSAGE(FATAL_ERROR "NVCC 9.1 does not support clang 5+. Please use clang 4!")
-                        ENDIF()
-                    ELSEIF(CUDA_VERSION VERSION_EQUAL 9.2)
-                        IF(CMAKE_CXX_COMPILER_VERSION GREATER_EQUAL 5.0)
-                            MESSAGE(FATAL_ERROR "NVCC 9.2 does not support clang 6+ and fails compiling with clang 5. Please use clang 4!")
-                        ENDIF()
-                    ELSEIF(CUDA_VERSION VERSION_EQUAL 10.0)
-                        IF(CMAKE_CXX_COMPILER_VERSION GREATER_EQUAL 7.0)
-                            MESSAGE(FATAL_ERROR "NVCC 10.0 does not support clang 7+. Please use clang 4, 5 or 6!")
-                        ENDIF()
-                    ELSEIF(CUDA_VERSION VERSION_EQUAL 10.1)
-                        IF(CMAKE_CXX_COMPILER_VERSION GREATER_EQUAL 9.0)
-                            MESSAGE(FATAL_ERROR "NVCC 10.1 does not support clang 9+. Please use clang 4, 5, 6, 7 or 8!")
-                        ENDIF()
-                    ELSEIF(CUDA_VERSION VERSION_EQUAL 10.2)
-                        IF(CMAKE_CXX_COMPILER_VERSION GREATER_EQUAL 9.0)
-                            MESSAGE(FATAL_ERROR "NVCC 10.2 does not support clang 9+. Please use clang 4, 5, 6, 7 or 8!")
-                        ENDIF()
-                    ENDIF()
-                ENDIF()
-
-                # CUDA 9.0 removed the __CUDACC_VER__ macro. Boost versions lower than 1.65.1 still use this macro.
-                IF(CUDA_VERSION VERSION_GREATER_EQUAL 9.0 AND Boost_VERSION VERSION_LESS 1.65.1)
-                    MESSAGE(WARNING "CUDA 9.0 or newer requires boost-1.65.1 or newer!")
-                    SET(_ALPAKA_FOUND FALSE)
-                ENDIF()
-
-                # CUDA 9.0 is the first to support c++14.
-                IF((CUDA_VERSION VERSION_LESS 9.0) AND (ALPAKA_CXX_STANDARD GREATER 11))
-                    MESSAGE(WARNING "CUDA 9.0 or newer is required for c++14 or higher!")
-                    SET(_ALPAKA_FOUND FALSE)
-                ENDIF()
-
-                IF(ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE)
-                    MESSAGE(FATAL_ERROR "NVCC does not support boost.fiber!")
-                ENDIF()
-
-                # Clean up the flags. Else, multiple find calls would result in duplicate flags. Furthermore, other modules may have set different settings.
-                SET(CUDA_NVCC_FLAGS)
-
-                IF(${ALPAKA_DEBUG} GREATER 1)
-                    SET(CUDA_VERBOSE_BUILD ON)
-                ENDIF()
-
-                SET(CUDA_PROPAGATE_HOST_FLAGS ON)
-
-                IF(ALPAKA_CUDA_NVCC_SEPARABLE_COMPILATION)
-                    SET(CUDA_SEPARABLE_COMPILATION ON)
-                ENDIF()
-
-                # nvcc sets no linux/__linux macros on OpenPOWER linux
-                # nvidia bug id: 2448610
-                IF(CMAKE_SYSTEM_NAME STREQUAL "Linux")
-                    IF(CMAKE_SYSTEM_PROCESSOR STREQUAL "ppc64le")
-                        LIST(APPEND CUDA_NVCC_FLAGS "-Dlinux")
-                    ENDIF()
-                ENDIF()
-
-                IF(CUDA_VERSION VERSION_EQUAL 8.0)
-                    LIST(APPEND CUDA_NVCC_FLAGS "-Wno-deprecated-gpu-targets")
-                ENDIF()
-
-
-                IF(ALPAKA_CUDA_NVCC_EXPT_EXTENDED_LAMBDA)
-                    LIST(APPEND CUDA_NVCC_FLAGS "--expt-extended-lambda")
-                ENDIF()
-                IF(ALPAKA_CUDA_NVCC_EXPT_RELAXED_CONSTEXPR)
-                    LIST(APPEND CUDA_NVCC_FLAGS "--expt-relaxed-constexpr")
-                ENDIF()
-
-                FOREACH(_CUDA_ARCH_ELEM ${ALPAKA_CUDA_ARCH})
-                    # set flags to create device code for the given architecture
-                    LIST(APPEND CUDA_NVCC_FLAGS
-                        --generate-code arch=compute_${_CUDA_ARCH_ELEM},code=sm_${_CUDA_ARCH_ELEM}
-                        --generate-code arch=compute_${_CUDA_ARCH_ELEM},code=compute_${_CUDA_ARCH_ELEM}
-                    )
-                ENDFOREACH()
-
-                IF(NOT MSVC)
-                    LIST(APPEND CUDA_NVCC_FLAGS "-std=c++${ALPAKA_CXX_STANDARD}")
-                ENDIF()
-
-                SET(CUDA_HOST_COMPILER "${CMAKE_CXX_COMPILER}")
-
-                IF(CMAKE_BUILD_TYPE STREQUAL "Debug" OR CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo")
-                    LIST(APPEND CUDA_NVCC_FLAGS "-g")
-                    # https://github.com/ComputationalRadiationPhysics/alpaka/issues/428
-                    IF(((CMAKE_COMPILER_IS_GNUCXX AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS 5.0) OR
-                        (CMAKE_CXX_COMPILER_ID MATCHES "Clang" AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS 3.8)) AND
-                        CUDA_VERSION VERSION_LESS 9.0)
-                        MESSAGE(WARNING "${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION} does not support -G with CUDA <= 8! "
-                                        "Device debug symbols NOT added.")
-                    ELSEIF(MSVC)
-                        MESSAGE(WARNING "${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION} does not support -G with CUDA! "
-                                        "Device debug symbols NOT added.")
-                    ELSE()
-                        LIST(APPEND CUDA_NVCC_FLAGS "-G")
-                    ENDIF()
-                ENDIF()
-
-                IF(ALPAKA_CUDA_FAST_MATH)
-                    LIST(APPEND CUDA_NVCC_FLAGS "--use_fast_math")
-                ENDIF()
-
-                IF(ALPAKA_CUDA_FTZ)
-                    LIST(APPEND CUDA_NVCC_FLAGS "--ftz=true")
-                ELSE()
-                    LIST(APPEND CUDA_NVCC_FLAGS "--ftz=false")
-                ENDIF()
-
-                IF(ALPAKA_CUDA_SHOW_REGISTER)
-                    LIST(APPEND CUDA_NVCC_FLAGS "-Xptxas=-v")
-                ENDIF()
-
-                # Always add warning/error numbers which can be used for suppressions
-                LIST(APPEND CUDA_NVCC_FLAGS -Xcudafe --display_error_number)
-
-                # avoids warnings on host-device signatured, default constructors/destructors
-                IF(CUDA_VERSION GREATER_EQUAL 9.0)
-                    LIST(APPEND CUDA_NVCC_FLAGS -Xcudafe --diag_suppress=esa_on_defaulted_function_ignored)
-                ENDIF()
-
-                # avoids warnings on host-device signature of 'std::__shared_count<>'
-                IF(CUDA_VERSION EQUAL 10.0)
-                    LIST(APPEND CUDA_NVCC_FLAGS -Xcudafe --diag_suppress=2905)
-                ELSEIF(CUDA_VERSION EQUAL 10.1)
-                    LIST(APPEND CUDA_NVCC_FLAGS -Xcudafe --diag_suppress=2912)
-                ELSEIF(CUDA_VERSION EQUAL 10.2)
-                    LIST(APPEND CUDA_NVCC_FLAGS -Xcudafe --diag_suppress=2976)
-                ENDIF()
-
-                IF(ALPAKA_CUDA_KEEP_FILES)
-                    MAKE_DIRECTORY("${PROJECT_BINARY_DIR}/nvcc_tmp")
-                    LIST(APPEND CUDA_NVCC_FLAGS "--keep" "--keep-dir" "${PROJECT_BINARY_DIR}/nvcc_tmp")
-                ENDIF()
-
-                OPTION(ALPAKA_CUDA_SHOW_CODELINES "Show kernel lines in cuda-gdb and cuda-memcheck" OFF)
-                IF(ALPAKA_CUDA_SHOW_CODELINES)
-                    LIST(APPEND CUDA_NVCC_FLAGS "--source-in-ptx" "-lineinfo")
-                    IF(NOT MSVC)
-                        LIST(APPEND CUDA_NVCC_FLAGS "-Xcompiler" "-rdynamic")
-                    ENDIF()
-                    SET(ALPAKA_CUDA_KEEP_FILES ON CACHE BOOL "activate keep files" FORCE)
-                ENDIF()
-            ENDIF()
-
-            LIST(APPEND _ALPAKA_LINK_LIBRARIES_PUBLIC "general;${CUDA_CUDART_LIBRARY}")
-            LIST(APPEND _ALPAKA_INCLUDE_DIRECTORIES_PUBLIC ${CUDA_INCLUDE_DIRS})
-        ENDIF()
-    ENDIF()
-ENDIF()
-
-#-------------------------------------------------------------------------------
-# Find HIP.
-IF(ALPAKA_ACC_GPU_HIP_ENABLE)
-
-    IF(NOT DEFINED ALPAKA_HIP_VERSION)
-        SET(ALPAKA_HIP_VERSION 1.5)
-    ENDIF()
-
-    IF(ALPAKA_HIP_VERSION VERSION_LESS 1.5)
-        MESSAGE(WARNING "HIP < 1.5 is not supported!")
-        SET(_ALPAKA_FOUND FALSE)
-
-    ELSE()
-        # must set this for HIP package (note that you also need certain env vars)
-        SET(HIP_PLATFORM "${ALPAKA_HIP_PLATFORM}" CACHE STRING "")
-        SET(HIP_RUNTIME "${ALPAKA_HIP_PLATFORM}" CACHE STRING "")
-
-        FIND_PACKAGE(HIP "${ALPAKA_HIP_VERSION}")
-        IF(NOT HIP_FOUND)
-            MESSAGE(WARNING "Optional alpaka dependency HIP could not be found! HIP back-end disabled!")
-            SET(ALPAKA_ACC_GPU_HIP_ENABLE OFF CACHE BOOL "Enable the HIP GPU back-end" FORCE)
-
-        ELSE()
-            SET(ALPAKA_HIP_VERSION "${HIP_VERSION}")
-            IF(ALPAKA_HIP_VERSION VERSION_LESS 1.5.19211)
-                MESSAGE(STATUS "HIP < 1.5.19211 untested!")
-            ENDIF()
-            SET(ALPAKA_HIP_COMPILER "hipcc" CACHE STRING "HIP compiler")
-            SET_PROPERTY(CACHE ALPAKA_HIP_COMPILER PROPERTY STRINGS "hipcc")
-
-            OPTION(ALPAKA_HIP_FAST_MATH "Enable fast-math" ON)
-            OPTION(ALPAKA_HIP_FTZ "Set flush to zero for GPU" OFF)
-            OPTION(ALPAKA_HIP_SHOW_REGISTER "Show kernel registers and create PTX" OFF)
-            OPTION(ALPAKA_HIP_KEEP_FILES "Keep all intermediate files that are generated during internal compilation steps (folder: nvcc_tmp)" OFF)
-
-            SET(HIP_HIPCC_FLAGS)
-
-            IF(ALPAKA_HIP_PLATFORM MATCHES "nvcc")
-                FIND_PACKAGE(CUDA)
-                IF(NOT CUDA_FOUND)
-                    MESSAGE(WARNING "Could not found CUDA while HIP platform is set to nvcc. Compiling might fail.")
-                ENDIF()
-
-                IF(CUDA_VERSION VERSION_LESS 9.0)
-                    SET(ALPAKA_CUDA_ARCH "20" CACHE STRING "GPU architecture")
-                ELSE()
-                    SET(ALPAKA_CUDA_ARCH "30" CACHE STRING "GPU architecture")
-                ENDIF()
-
-                # CUDA 9.0 removed the __CUDACC_VER__ macro. Boost versions lower than 1.65.1 still use this macro.
-                IF(CUDA_VERSION VERSION_GREATER_EQUAL 9.0 AND Boost_VERSION VERSION_LESS 1.65.1)
-                    MESSAGE(WARNING "CUDA 9.0 or newer requires boost-1.65.1 or newer!")
-                    SET(_ALPAKA_FOUND FALSE)
-                ENDIF()
-
-                IF(CUDA_VERSION VERSION_EQUAL 8.0)
-                    LIST(APPEND HIP_HIPCC_FLAGS "-Wno-deprecated-gpu-targets")
-                ENDIF()
-
-                IF(CUDA_VERSION VERSION_LESS 8.0)
-                    MESSAGE(WARNING "CUDA Toolkit < 8.0 is not supported!")
-                    SET(_ALPAKA_FOUND FALSE)
-                ENDIF()
-
-                IF(${ALPAKA_DEBUG} GREATER 1)
-                    SET(HIP_VERBOSE_BUILD ON)
-                ENDIF()
-
-                LIST(APPEND HIP_NVCC_FLAGS "--expt-extended-lambda")
-                LIST(APPEND HIP_NVCC_FLAGS "--expt-relaxed-constexpr")
-                LIST(APPEND _ALPAKA_HIP_LIBRARIES "cudart")
-
-                FOREACH(_HIP_ARCH_ELEM ${ALPAKA_CUDA_ARCH})
-                    # set flags to create device code for the given architecture
-                    LIST(APPEND CUDA_NVCC_FLAGS
-                        --generate-code arch=compute_${_HIP_ARCH_ELEM},code=sm_${_HIP_ARCH_ELEM}
-                        --generate-code arch=compute_${_HIP_ARCH_ELEM},code=compute_${_HIP_ARCH_ELEM}
-                    )
-                ENDFOREACH()
-                # for CUDA cmake adds automatically compiler flags as nvcc does not do this,
-                # but for HIP we have to do this here
-                LIST(APPEND HIP_NVCC_FLAGS "-D__CUDACC__")
-                LIST(APPEND HIP_NVCC_FLAGS "-ccbin ${CMAKE_CXX_COMPILER}")
-                LIST(APPEND HIP_NVCC_FLAGS "-Xcompiler" "-g")
-
-                IF(CMAKE_BUILD_TYPE STREQUAL "Debug" OR CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo")
-                    LIST(APPEND HIP_HIPCC_FLAGS "-G")
-                ENDIF()
-                # propage host flags
-                # SET(CUDA_PROPAGATE_HOST_FLAGS ON) # does not exist in HIP, so do it manually
-                string(TOUPPER "${CMAKE_BUILD_TYPE}" build_config)
-                FOREACH( _flag ${CMAKE_CXX_FLAGS} ${CMAKE_CXX_FLAGS_${build_config}})
-                    LIST(APPEND HIP_NVCC_FLAGS "-Xcompiler ${_flag}")
-                ENDFOREACH()
-
-                IF(ALPAKA_HIP_FAST_MATH)
-                    LIST(APPEND HIP_HIPCC_FLAGS "--use_fast_math")
-                ENDIF()
-
-                IF(ALPAKA_HIP_FTZ)
-                    LIST(APPEND HIP_HIPCC_FLAGS "--ftz=true")
-                ELSE()
-                    LIST(APPEND HIP_HIPCC_FLAGS "--ftz=false")
-                ENDIF()
-
-                IF(ALPAKA_HIP_SHOW_REGISTER)
-                    LIST(APPEND HIP_HIPCC_FLAGS "-Xptxas=-v")
-                ENDIF()
-                IF(CUDA_VERSION GREATER_EQUAL 9.0)
-                    # avoids warnings on host-device signatured, default constructors/destructors
-                    LIST(APPEND HIP_HIPCC_FLAGS "-Xcudafe --diag_suppress=esa_on_defaulted_function_ignored")
-                ENDIF()
-
-                # random numbers library ( HIP(NVCC) ) /hiprand
-                # HIP_ROOT_DIR is set by FindHIP.cmake
-                FIND_PATH(HIP_RAND_INC
-                    NAMES "hiprand_kernel.h"
-                    PATHS "${HIP_ROOT_DIR}/hiprand" "${HIP_ROOT_DIR}/include" "hiprand"
-                    PATHS "/opt/rocm/rocrand/hiprand"
-                    PATH_SUFFIXES "include" "hiprand")
-                FIND_LIBRARY(HIP_RAND_LIBRARY
-                    NAMES "hiprand-d" "hiprand"
-                    PATHS "${HIP_ROOT_DIR}/hiprand" "${HIP_ROOT_DIR}" "hiprand"
-                    PATHS "/opt/rocm/rocrand/hiprand"
-                    ENV HIP_PATH
-                    PATH_SUFFIXES "lib" "lib64")
-                IF(NOT HIP_RAND_INC)
-                    MESSAGE(FATAL_ERROR "Could not find hipRAND include (also searched in: HIP_ROOT_DIR=${HIP_ROOT_DIR}).")
-                ENDIF()
-                IF(NOT HIP_RAND_LIBRARY)
-                    MESSAGE(FATAL_ERROR "Could not find hipRAND library (also searched in: HIP_ROOT_DIR=${HIP_ROOT_DIR}).")
-                ENDIF()
-                LIST(APPEND _ALPAKA_INCLUDE_DIRECTORIES_PUBLIC "${HIP_RAND_INC}")
-                LIST(APPEND _ALPAKA_LINK_LIBRARIES_PUBLIC "${HIP_RAND_LIBRARY}")
-            ENDIF() # nvcc
-
-            IF(ALPAKA_HIP_PLATFORM MATCHES "hcc")
-
-                # random numbers library ( HIP(HCC) ) /rocrand
-                FIND_PATH(ROC_RAND_INC
-                    rocrand_kernel.h
-                    PATHS "${HIP_ROOT_DIR}/rocrand" "${HIP_ROOT_DIR}" "rocrand"
-                    PATHS "/opt/rocm/rocrand"
-                    ENV HIP_PATH
-                    PATH_SUFFIXES "include")
-                FIND_LIBRARY(ROC_RAND_LIBRARY
-                    rocrand-d
-                    rocrand
-                    PATHS "${HIP_ROOT_DIR}/rocrand" "${HIP_ROOT_DIR}" "rocrand"
-                    PATHS "/opt/rocm/rocrand"
-                    ENV HIP_PATH
-                    PATH_SUFFIXES "lib" "lib64")
-
-                # random numbers library ( HIP(HCC) ) rocrand/hiprand
-                FIND_PATH(HIP_RAND_INC
-                    hiprand_kernel.h
-                    PATHS "${HIP_ROOT_DIR}/hiprand" "${HIP_ROOT_DIR}" "hiprand"
-                    PATHS "/opt/rocm/hiprand"
-                    ENV HIP_PATH
-                    PATH_SUFFIXES "include")
-                FIND_LIBRARY(HIP_RAND_LIBRARY
-                    hiprand-d
-                    hiprand
-                    PATHS "${HIP_ROOT_DIR}/hiprand" "${HIP_ROOT_DIR}" "hiprand"
-                    PATHS "/opt/rocm/hiprand"
-                    ENV HIP_PATH
-                    PATH_SUFFIXES "lib" "lib64")
-                IF(NOT HIP_RAND_INC OR NOT HIP_RAND_LIBRARY)
-                    MESSAGE(FATAL_ERROR "Could not find hipRAND library")
-                ENDIF()
-                LIST(APPEND _ALPAKA_INCLUDE_DIRECTORIES_PUBLIC "${HIP_RAND_INC}")
-                LIST(APPEND _ALPAKA_LINK_LIBRARIES_PUBLIC "${HIP_RAND_LIBRARY}")
-
-                IF(NOT ROC_RAND_INC OR NOT ROC_RAND_LIBRARY)
-                    MESSAGE(FATAL_ERROR "Could not find rocRAND library")
-                ENDIF()
-
-                LIST(APPEND _ALPAKA_INCLUDE_DIRECTORIES_PUBLIC "${ROC_RAND_INC}")
-                LIST(APPEND _ALPAKA_LINK_LIBRARIES_PUBLIC "${ROC_RAND_LIBRARY}")
-
-            ENDIF()
-
-
-            LIST(APPEND HIP_HIPCC_FLAGS "-D__HIPCC__")
-            LIST(APPEND HIP_HIPCC_FLAGS "-std=c++${ALPAKA_CXX_STANDARD}")
-
-            IF(CMAKE_BUILD_TYPE STREQUAL "Debug" OR CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo")
-                LIST(APPEND HIP_HIPCC_FLAGS "-g")
-            ENDIF()
-
-
-            IF(ALPAKA_HIP_KEEP_FILES)
-                MAKE_DIRECTORY("${PROJECT_BINARY_DIR}/hip_tmp")
-                LIST(APPEND HIP_HIPCC_FLAGS "--keep" "--keep-dir" "${PROJECT_BINARY_DIR}/hip_tmp")
-            ENDIF()
-
-            OPTION(ALPAKA_HIP_SHOW_CODELINES "Show kernel lines in cuda-gdb and cuda-memcheck" OFF)
-            IF(ALPAKA_HIP_SHOW_CODELINES)
-                LIST(APPEND HIP_HIPCC_FLAGS "--source-in-ptx" "-lineinfo")
-                LIST(APPEND HIP_HIPCC_FLAGS "-Xcompiler" "-rdynamic")
-                SET(ALPAKA_HIP_KEEP_FILES ON CACHE BOOL "activate keep files" FORCE)
-            ENDIF()
-            IF(_ALPAKA_HIP_LIBRARIES)
-                LIST(APPEND _ALPAKA_LINK_LIBRARIES_PUBLIC "general;${_ALPAKA_HIP_LIBRARIES}")
-            ENDIF()
-        ENDIF()
-    ENDIF()
-ENDIF() # HIP
-
-#-------------------------------------------------------------------------------
-# alpaka.
-IF(ALPAKA_ACC_GPU_CUDA_ONLY_MODE)
-    LIST(APPEND _ALPAKA_COMPILE_DEFINITIONS_PUBLIC "ALPAKA_ACC_GPU_CUDA_ONLY_MODE")
-    MESSAGE(STATUS ALPAKA_ACC_GPU_CUDA_ONLY_MODE)
-ENDIF()
-
-IF(ALPAKA_ACC_GPU_HIP_ONLY_MODE)
-    LIST(APPEND _ALPAKA_COMPILE_DEFINITIONS_PUBLIC "ALPAKA_ACC_GPU_HIP_ONLY_MODE")
-    MESSAGE(STATUS ALPAKA_ACC_GPU_HIP_ONLY_MODE)
-ENDIF()
-
-IF(ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLE)
-    LIST(APPEND _ALPAKA_COMPILE_DEFINITIONS_PUBLIC "ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED")
-    MESSAGE(STATUS ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED)
-ENDIF()
-IF(ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLE)
-    LIST(APPEND _ALPAKA_COMPILE_DEFINITIONS_PUBLIC "ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED")
-    MESSAGE(STATUS ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED)
-ENDIF()
-IF(ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE)
-    LIST(APPEND _ALPAKA_COMPILE_DEFINITIONS_PUBLIC "ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLED")
-    MESSAGE(STATUS ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLED)
-ENDIF()
-IF(ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLE)
-    LIST(APPEND _ALPAKA_COMPILE_DEFINITIONS_PUBLIC "ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED")
-    MESSAGE(STATUS ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED)
-ENDIF()
-IF(ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE)
-    LIST(APPEND _ALPAKA_COMPILE_DEFINITIONS_PUBLIC "ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLED")
-    MESSAGE(STATUS ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLED)
-ENDIF()
-IF(ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE)
-    LIST(APPEND _ALPAKA_COMPILE_DEFINITIONS_PUBLIC "ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLED")
-    MESSAGE(STATUS ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLED)
-ENDIF()
-IF(ALPAKA_ACC_CPU_BT_OMP4_ENABLE)
-    LIST(APPEND _ALPAKA_COMPILE_DEFINITIONS_PUBLIC "ALPAKA_ACC_CPU_BT_OMP4_ENABLED")
-    MESSAGE(STATUS ALPAKA_ACC_CPU_BT_OMP4_ENABLED)
-ENDIF()
-IF(ALPAKA_ACC_GPU_CUDA_ENABLE)
-    LIST(APPEND _ALPAKA_COMPILE_DEFINITIONS_PUBLIC "ALPAKA_ACC_GPU_CUDA_ENABLED")
-    MESSAGE(STATUS ALPAKA_ACC_GPU_CUDA_ENABLED)
-ENDIF()
-IF(ALPAKA_ACC_GPU_HIP_ENABLE)
-    LIST(APPEND _ALPAKA_COMPILE_DEFINITIONS_PUBLIC "ALPAKA_ACC_GPU_HIP_ENABLED")
-    MESSAGE(STATUS ALPAKA_ACC_GPU_HIP_ENABLED)
-ENDIF()
-
-LIST(APPEND _ALPAKA_COMPILE_DEFINITIONS_PUBLIC "ALPAKA_DEBUG=${ALPAKA_DEBUG}")
-
-IF(ALPAKA_CI)
-    LIST(APPEND _ALPAKA_COMPILE_DEFINITIONS_PUBLIC "ALPAKA_CI")
-ENDIF()
-
-SET(_ALPAKA_INCLUDE_DIRECTORY "${_ALPAKA_ROOT_DIR}/include")
-SET(_ALPAKA_SUFFIXED_INCLUDE_DIR "${_ALPAKA_INCLUDE_DIRECTORY}/alpaka")
-
-SET(_ALPAKA_LINK_LIBRARY)
-
-# # cxx flags will not be forwarded to hip wrapped compiler, so it has to be provided manually
-IF(ALPAKA_ACC_GPU_HIP_ENABLE)
-    SET(_ALPAKA_COMPILE_DEFINITIONS_HIP ${_ALPAKA_COMPILE_DEFINITIONS_PUBLIC})
-    LIST_ADD_PREFIX("-D" _ALPAKA_COMPILE_DEFINITIONS_HIP)
-    LIST(APPEND HIP_HIPCC_FLAGS
-        ${_ALPAKA_COMPILE_DEFINITIONS_HIP}
-        )
-    HIP_INCLUDE_DIRECTORIES(
-        # ${_ALPAKA_INCLUDE_DIRECTORY}
-        # ${_ALPAKA_INCLUDE_DIRECTORIES_PUBLIC}
-        ${HIP_INCLUDE_DIRS}
-        ${Boost_INCLUDE_DIRS}
-        ${_ALPAKA_ROOT_DIR}/test/common/include
-        )
-
-    IF(OPENMP_FOUND) # remove fopenmp link from nvcc, otherwise linker error will occur
-        LIST(REMOVE_ITEM _ALPAKA_LINK_FLAGS_PUBLIC "${OpenMP_CXX_FLAGS}")
-        LIST(APPEND _ALPAKA_LINK_FLAGS_PUBLIC "-Xcompiler ${OpenMP_CXX_FLAGS}")
-    ENDIF()
-    IF(ALPAKA_HIP_PLATFORM MATCHES "hcc")
-        # GFX600, GFX601, GFX700, GFX701, GFX702, GFX703, GFX704, GFX801, GFX802, GFX803, GFX810, GFX900, GFX902
-        SET(_ALPAKA_LINK_LIBRARIES_PUBLIC "${_ALPAKA_LINK_LIBRARIES_PUBLIC}" "--amdgpu-target=gfx803 --amdgpu-target=gfx900 --amdgpu-target=gfx906")
-    ENDIF()
-ENDIF()
-
-# Add all the source and include files in all recursive subdirectories and group them accordingly.
-append_recursive_files_add_to_src_group("${_ALPAKA_SUFFIXED_INCLUDE_DIR}" "${_ALPAKA_SUFFIXED_INCLUDE_DIR}" "hpp" _ALPAKA_FILES_HEADER)
-append_recursive_files_add_to_src_group("${_ALPAKA_SUFFIXED_INCLUDE_DIR}" "${_ALPAKA_SUFFIXED_INCLUDE_DIR}" "h" _ALPAKA_FILES_HEADER)
-
-append_recursive_files_add_to_src_group("${_ALPAKA_ROOT_DIR}/script" "${_ALPAKA_ROOT_DIR}" "sh" _ALPAKA_FILES_SCRIPT)
-SET_SOURCE_FILES_PROPERTIES(${_ALPAKA_FILES_SCRIPT} PROPERTIES HEADER_FILE_ONLY TRUE)
-
-append_recursive_files_add_to_src_group("${_ALPAKA_ROOT_DIR}/cmake" "${_ALPAKA_ROOT_DIR}" "cmake" _ALPAKA_FILES_CMAKE)
-LIST(APPEND _ALPAKA_FILES_CMAKE "${_ALPAKA_ROOT_DIR}/alpakaConfig.cmake" "${_ALPAKA_ROOT_DIR}/Findalpaka.cmake" "${_ALPAKA_ROOT_DIR}/CMakeLists.txt" "${_ALPAKA_ROOT_DIR}/cmake/dev.cmake" "${_ALPAKA_ROOT_DIR}/cmake/common.cmake" "${_ALPAKA_ROOT_DIR}/cmake/addExecutable.cmake" "${_ALPAKA_ADD_LIBRRAY_FILE}")
-SET_SOURCE_FILES_PROPERTIES(${_ALPAKA_FILES_CMAKE} PROPERTIES HEADER_FILE_ONLY TRUE)
-
-append_recursive_files_add_to_src_group("${_ALPAKA_ROOT_DIR}/doc/markdown" "${_ALPAKA_ROOT_DIR}" "md" _ALPAKA_FILES_DOC)
-SET_SOURCE_FILES_PROPERTIES(${_ALPAKA_FILES_DOC} PROPERTIES HEADER_FILE_ONLY TRUE)
-
-SET(_ALPAKA_FILES_OTHER "${_ALPAKA_ROOT_DIR}/.gitignore" "${_ALPAKA_ROOT_DIR}/.travis.yml" "${_ALPAKA_ROOT_DIR}/.zenodo.json" "${_ALPAKA_ROOT_DIR}/LICENSE" "${_ALPAKA_ROOT_DIR}/README.md")
-SET_SOURCE_FILES_PROPERTIES(${_ALPAKA_FILES_OTHER} PROPERTIES HEADER_FILE_ONLY TRUE)
-
-#-------------------------------------------------------------------------------
-# Target.
-IF(NOT TARGET "alpaka")
-    ADD_LIBRARY("alpaka" INTERFACE)
-
-    # HACK: Workaround for the limitation that files added to INTERFACE targets (target_sources) can not be marked as PUBLIC or PRIVATE but only as INTERFACE.
-    # Therefore those files will be added to projects "linking" to the INTERFACE library, but are not added to the project itself within an IDE.
-    add_custom_target("alpakaIde"
-        SOURCES ${_ALPAKA_FILES_HEADER} ${_ALPAKA_FILES_SCRIPT} ${_ALPAKA_FILES_CMAKE} ${_ALPAKA_FILES_DOC} ${_ALPAKA_FILES_OTHER}
-    )
-
-    target_compile_features("alpaka"
-        INTERFACE cxx_std_${ALPAKA_CXX_STANDARD}
-    )
-
-    # Compile options.
-    IF(${ALPAKA_DEBUG} GREATER 1)
-        MESSAGE(STATUS "_ALPAKA_COMPILE_OPTIONS_PUBLIC: ${_ALPAKA_COMPILE_OPTIONS_PUBLIC}")
-    ENDIF()
-    LIST(
-        LENGTH
-        _ALPAKA_COMPILE_OPTIONS_PUBLIC
-        _ALPAKA_COMPILE_OPTIONS_PUBLIC_LENGTH)
-    IF(${_ALPAKA_COMPILE_OPTIONS_PUBLIC_LENGTH} GREATER 0)
-        TARGET_COMPILE_OPTIONS(
-            "alpaka"
-            INTERFACE ${_ALPAKA_COMPILE_OPTIONS_PUBLIC})
-    ENDIF()
-
-    # Compile definitions.
-    IF(${ALPAKA_DEBUG} GREATER 1)
-        MESSAGE(STATUS "_ALPAKA_COMPILE_DEFINITIONS_PUBLIC: ${_ALPAKA_COMPILE_DEFINITIONS_PUBLIC}")
-    ENDIF()
-    LIST(
-        LENGTH
-        _ALPAKA_COMPILE_DEFINITIONS_PUBLIC
-        _ALPAKA_COMPILE_DEFINITIONS_PUBLIC_LENGTH)
-    IF(${_ALPAKA_COMPILE_DEFINITIONS_PUBLIC_LENGTH} GREATER 0)
-        TARGET_COMPILE_DEFINITIONS(
-            "alpaka"
-            INTERFACE ${_ALPAKA_COMPILE_DEFINITIONS_PUBLIC})
-    ENDIF()
-
-    # Include directories.
-    IF(${ALPAKA_DEBUG} GREATER 1)
-        MESSAGE(STATUS "_ALPAKA_INCLUDE_DIRECTORIES_PUBLIC: ${_ALPAKA_INCLUDE_DIRECTORIES_PUBLIC}")
-    ENDIF()
-    LIST(
-        LENGTH
-        _ALPAKA_INCLUDE_DIRECTORIES_PUBLIC
-        _ALPAKA_INCLUDE_DIRECTORIES_PUBLIC_LENGTH)
-    IF(${_ALPAKA_INCLUDE_DIRECTORIES_PUBLIC_LENGTH} GREATER 0)
-        TARGET_INCLUDE_DIRECTORIES(
-            "alpaka"
-            SYSTEM
-            INTERFACE ${_ALPAKA_INCLUDE_DIRECTORIES_PUBLIC})
-    ENDIF()
-    # the alpaka library itself
-    TARGET_INCLUDE_DIRECTORIES(
-        "alpaka"
-        INTERFACE ${_ALPAKA_INCLUDE_DIRECTORY}
-    )
-
-    # Link libraries.
-    # There are no PUBLIC_LINK_FLAGS in CMAKE:
-    # http://stackoverflow.com/questions/26850889/cmake-keeping-link-flags-of-internal-libs
-    IF(${ALPAKA_DEBUG} GREATER 1)
-        MESSAGE(STATUS "_ALPAKA_LINK_LIBRARIES_PUBLIC: ${_ALPAKA_LINK_LIBRARIES_PUBLIC}")
-    ENDIF()
-    LIST(
-        LENGTH
-        _ALPAKA_LINK_LIBRARIES_PUBLIC
-        _ALPAKA_LINK_LIBRARIES_PUBLIC_LENGTH)
-    IF(${_ALPAKA_LINK_LIBRARIES_PUBLIC_LENGTH} GREATER 0)
-        TARGET_LINK_LIBRARIES(
-            "alpaka"
-            INTERFACE ${_ALPAKA_LINK_LIBRARIES_PUBLIC} ${_ALPAKA_LINK_FLAGS_PUBLIC})
-    ENDIF()
-ENDIF()
-
-# NVCC does not incorporate the COMPILE_OPTIONS of a target but only the CMAKE_CXX_FLAGS
-IF((ALPAKA_ACC_GPU_CUDA_ENABLE OR ALPAKA_ACC_GPU_HIP_ENABLE) AND ALPAKA_CUDA_COMPILER MATCHES "nvcc")
-    STRING(REPLACE ";" " " _ALPAKA_COMPILE_OPTIONS_STRING "${_ALPAKA_COMPILE_OPTIONS_PUBLIC}")
-    SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${_ALPAKA_COMPILE_OPTIONS_STRING}")
-ENDIF()
-
-#-------------------------------------------------------------------------------
-# Find alpaka version.
-file(STRINGS "${CMAKE_CURRENT_LIST_DIR}/include/alpaka/version.hpp" ALPAKA_VERSION_MAJOR_HPP REGEX "#define ALPAKA_VERSION_MAJOR ")
-file(STRINGS "${CMAKE_CURRENT_LIST_DIR}/include/alpaka/version.hpp" ALPAKA_VERSION_MINOR_HPP REGEX "#define ALPAKA_VERSION_MINOR ")
-file(STRINGS "${CMAKE_CURRENT_LIST_DIR}/include/alpaka/version.hpp" ALPAKA_VERSION_PATCH_HPP REGEX "#define ALPAKA_VERSION_PATCH ")
-
-string(REGEX MATCH "([0-9]+)" ALPAKA_VERSION_MAJOR  ${ALPAKA_VERSION_MAJOR_HPP})
-string(REGEX MATCH "([0-9]+)" ALPAKA_VERSION_MINOR  ${ALPAKA_VERSION_MINOR_HPP})
-string(REGEX MATCH "([0-9]+)" ALPAKA_VERSION_PATCH  ${ALPAKA_VERSION_PATCH_HPP})
-
-SET(PACKAGE_VERSION "${ALPAKA_VERSION_MAJOR}.${ALPAKA_VERSION_MINOR}.${ALPAKA_VERSION_PATCH}")
-
-#-------------------------------------------------------------------------------
-# Set return values.
-SET(alpaka_VERSION "${ALPAKA_VERSION_MAJOR}.${ALPAKA_VERSION_MINOR}.${ALPAKA_VERSION_PATCH}")
-SET(alpaka_COMPILE_OPTIONS ${_ALPAKA_COMPILE_OPTIONS_PUBLIC})
-SET(alpaka_COMPILE_DEFINITIONS ${_ALPAKA_COMPILE_DEFINITIONS_PUBLIC})
-# Add '-D' to the definitions
-SET(alpaka_DEFINITIONS ${_ALPAKA_COMPILE_DEFINITIONS_PUBLIC})
-list_add_prefix("-D" alpaka_DEFINITIONS)
-# Add the compile options to the definitions.
-LIST(APPEND alpaka_DEFINITIONS ${_ALPAKA_COMPILE_OPTIONS_PUBLIC})
-SET(alpaka_INCLUDE_DIR ${_ALPAKA_INCLUDE_DIRECTORY})
-SET(alpaka_INCLUDE_DIRS ${_ALPAKA_INCLUDE_DIRECTORIES_PUBLIC})
-LIST(APPEND alpaka_INCLUDE_DIRS ${_ALPAKA_INCLUDE_DIRECTORY})
-SET(alpaka_LIBRARY ${_ALPAKA_LINK_LIBRARY})
-SET(alpaka_LIBRARIES ${_ALPAKA_LINK_FLAGS_PUBLIC})
-LIST(APPEND alpaka_LIBRARIES ${_ALPAKA_LINK_LIBRARIES_PUBLIC})
-
-#-------------------------------------------------------------------------------
-# Print the return values.
-IF(${ALPAKA_DEBUG} GREATER 0)
-    MESSAGE(STATUS "alpaka_FOUND: ${alpaka_FOUND}")
-    MESSAGE(STATUS "alpaka_VERSION: ${alpaka_VERSION}")
-    MESSAGE(STATUS "alpaka_COMPILE_OPTIONS: ${alpaka_COMPILE_OPTIONS}")
-    MESSAGE(STATUS "alpaka_COMPILE_DEFINITIONS: ${alpaka_COMPILE_DEFINITIONS}")
-    MESSAGE(STATUS "alpaka_DEFINITIONS: ${alpaka_DEFINITIONS}")
-    MESSAGE(STATUS "alpaka_INCLUDE_DIR: ${alpaka_INCLUDE_DIR}")
-    MESSAGE(STATUS "alpaka_INCLUDE_DIRS: ${alpaka_INCLUDE_DIRS}")
-    MESSAGE(STATUS "alpaka_LIBRARY: ${alpaka_LIBRARY}")
-    MESSAGE(STATUS "alpaka_LIBRARIES: ${alpaka_LIBRARIES}")
-ENDIF()
-
-# Unset already set variables if not found.
-IF(NOT _ALPAKA_FOUND)
-    UNSET(alpaka_FOUND)
-    UNSET(alpaka_VERSION)
-    UNSET(alpaka_COMPILE_OPTIONS)
-    UNSET(alpaka_COMPILE_DEFINITIONS)
-    UNSET(alpaka_DEFINITIONS)
-    UNSET(alpaka_INCLUDE_DIR)
-    UNSET(alpaka_INCLUDE_DIRS)
-    UNSET(alpaka_LIBRARY)
-    UNSET(alpaka_LIBRARIES)
-
-    UNSET(_ALPAKA_FOUND)
-    UNSET(_ALPAKA_COMPILE_OPTIONS_PUBLIC)
-    UNSET(_ALPAKA_COMPILE_DEFINITIONS_PUBLIC)
-    UNSET(_ALPAKA_COMPILE_DEFINITIONS_HIP)
-    UNSET(_ALPAKA_HIP_LIBRARIES)
-    UNSET(_ALPAKA_INCLUDE_DIRECTORY)
-    UNSET(_ALPAKA_INCLUDE_DIRECTORIES_PUBLIC)
-    UNSET(_ALPAKA_LINK_LIBRARY)
-    UNSET(_ALPAKA_LINK_LIBRARIES_PUBLIC)
-    UNSET(_ALPAKA_LINK_FLAGS_PUBLIC)
-    UNSET(_ALPAKA_COMMON_FILE)
-    UNSET(_ALPAKA_ADD_EXECUTABLE_FILE)
-    UNSET(_ALPAKA_ADD_LIBRARY_FILE)
-    UNSET(_ALPAKA_FILES_HEADER)
-    UNSET(_ALPAKA_FILES_OTHER)
-    UNSET(_ALPAKA_BOOST_MIN_VER)
-ELSE()
-    # Make internal variables advanced options in the GUI.
-    MARK_AS_ADVANCED(
-        alpaka_INCLUDE_DIR
-        alpaka_LIBRARY
-        _ALPAKA_COMPILE_OPTIONS_PUBLIC
-        _ALPAKA_COMPILE_DEFINITIONS_PUBLIC
-        _ALPAKA_INCLUDE_DIRECTORY
-        _ALPAKA_INCLUDE_DIRECTORIES_PUBLIC
-        _ALPAKA_LINK_LIBRARY
-        _ALPAKA_LINK_LIBRARIES_PUBLIC
-        _ALPAKA_LINK_FLAGS_PUBLIC
-        _ALPAKA_COMMON_FILE
-        _ALPAKA_ADD_EXECUTABLE_FILE
-        _ALPAKA_ADD_LIBRARY_FILE
-        _ALPAKA_FILES_HEADER
-        _ALPAKA_FILES_OTHER
-        _ALPAKA_BOOST_MIN_VER)
-ENDIF()
-
-###############################################################################
-# FindPackage options
-
-# Handles the REQUIRED, QUIET and version-related arguments for FIND_PACKAGE.
-# NOTE: We do not check for alpaka_LIBRARIES and alpaka_DEFINITIONS because they can be empty.
-INCLUDE(FindPackageHandleStandardArgs)
-FIND_PACKAGE_HANDLE_STANDARD_ARGS(
-    "alpaka"
-    FOUND_VAR alpaka_FOUND
-    REQUIRED_VARS alpaka_INCLUDE_DIR
-    VERSION_VAR alpaka_VERSION)
diff --git a/thirdParty/cupla/alpaka/cmake/addExecutable.cmake b/thirdParty/cupla/alpaka/cmake/addExecutable.cmake
index b602374544..3cb8dd651d 100644
--- a/thirdParty/cupla/alpaka/cmake/addExecutable.cmake
+++ b/thirdParty/cupla/alpaka/cmake/addExecutable.cmake
@@ -1,14 +1,14 @@
 #
 # Copyright 2014-2019 Benjamin Worpitz
 #
-# This file is part of Alpaka.
+# This file is part of alpaka.
 #
 # This Source Code Form is subject to the terms of the Mozilla Public
 # License, v. 2.0. If a copy of the MPL was not distributed with this
 # file, You can obtain one at http://mozilla.org/MPL/2.0/.
 #
 
-CMAKE_MINIMUM_REQUIRED(VERSION 3.11.4)
+CMAKE_MINIMUM_REQUIRED(VERSION 3.15)
 
 #------------------------------------------------------------------------------
 # Calls CUDA_ADD_EXECUTABLE or ADD_EXECUTABLE depending on the enabled alpaka accelerators.
@@ -31,11 +31,9 @@ MACRO(ALPAKA_ADD_EXECUTABLE In_Name)
                     SET_SOURCE_FILES_PROPERTIES(${_file} PROPERTIES CUDA_SOURCE_PROPERTY_FORMAT OBJ)
                 ENDIF()
             ENDFOREACH()
-            IF (CMAKE_VERSION VERSION_LESS 3.9.0)
-                CMAKE_POLICY(SET CMP0023 OLD)   # CUDA_ADD_EXECUTABLE calls TARGET_LINK_LIBRARIES without keywords.
-            ELSE()
-                SET(CUDA_LINK_LIBRARIES_KEYWORD "PUBLIC")
-            ENDIF()
+
+            SET(CUDA_LINK_LIBRARIES_KEYWORD "PUBLIC")
+
             CUDA_ADD_EXECUTABLE(
                 ${In_Name}
                 ${ARGN})
@@ -46,11 +44,6 @@ MACRO(ALPAKA_ADD_EXECUTABLE In_Name)
 		            SET_SOURCE_FILES_PROPERTIES(${_file} PROPERTIES HIP_SOURCE_PROPERTY_FORMAT OBJ)
 		        ENDIF()
 	      ENDFOREACH()
-        IF (CMAKE_VERSION VERSION_LESS 3.9.0)
-            CMAKE_POLICY(SET CMP0023 OLD)   # CUDA_ADD_EXECUTABLE calls TARGET_LINK_LIBRARIES without keywords.
-        ELSE()
-            SET(HIP_LINK_LIBRARIES_KEYWORD "PUBLIC")
-        ENDIF()
 
 	      HIP_ADD_EXECUTABLE(
 		        ${In_Name}
diff --git a/thirdParty/cupla/alpaka/cmake/addLibrary.cmake b/thirdParty/cupla/alpaka/cmake/addLibrary.cmake
index 2d1c497341..8ee02ceb47 100644
--- a/thirdParty/cupla/alpaka/cmake/addLibrary.cmake
+++ b/thirdParty/cupla/alpaka/cmake/addLibrary.cmake
@@ -1,14 +1,14 @@
 #
 # Copyright 2015-2019 Benjamin Worpitz, Maximilian Knespel
 #
-# This file is part of Alpaka.
+# This file is part of alpaka.
 #
 # This Source Code Form is subject to the terms of the Mozilla Public
 # License, v. 2.0. If a copy of the MPL was not distributed with this
 # file, You can obtain one at http://mozilla.org/MPL/2.0/.
 #
 
-CMAKE_MINIMUM_REQUIRED(VERSION 3.11.4)
+CMAKE_MINIMUM_REQUIRED(VERSION 3.15)
 
 #------------------------------------------------------------------------------
 # Calls CUDA_ADD_LIBRARY or ADD_LIBRARY depending on the enabled alpaka
@@ -107,11 +107,7 @@ MACRO(ALPAKA_ADD_LIBRARY libraryName)
                     SET_SOURCE_FILES_PROPERTIES( ${_file} PROPERTIES CUDA_SOURCE_PROPERTY_FORMAT OBJ )
                 ENDIF()
             ENDFOREACH()
-            IF (CMAKE_VERSION VERSION_LESS 3.9.0)
-                CMAKE_POLICY(SET CMP0023 OLD)   # CUDA_ADD_EXECUTABLE calls TARGET_LINK_LIBRARIES without keywords.
-            ELSE()
-                SET(CUDA_LINK_LIBRARIES_KEYWORD "PUBLIC")
-            ENDIF()
+            SET(CUDA_LINK_LIBRARIES_KEYWORD "PUBLIC")
             CUDA_ADD_LIBRARY(
                 ${libraryName}
                 ${sourceFileNames}
@@ -128,7 +124,6 @@ MACRO(ALPAKA_ADD_LIBRARY libraryName)
                     SET_SOURCE_FILES_PROPERTIES( ${_file} PROPERTIES HIP_SOURCE_PROPERTY_FORMAT OBJ )
                 ENDIF()
             ENDFOREACH()
-            CMAKE_POLICY(SET CMP0023 OLD)   # CUDA_ADD_LIBRARY calls TARGET_LINK_LIBRARIES without keywords.
             HIP_ADD_LIBRARY(
                 ${libraryName}
                 ${sourceFileNames}
diff --git a/thirdParty/cupla/alpaka/cmake/alpakaCommon.cmake b/thirdParty/cupla/alpaka/cmake/alpakaCommon.cmake
new file mode 100644
index 0000000000..b4b7074750
--- /dev/null
+++ b/thirdParty/cupla/alpaka/cmake/alpakaCommon.cmake
@@ -0,0 +1,865 @@
+#
+# Copyright 2014-2020 Benjamin Worpitz, Erik Zenker, Axel Huebl, Jan Stephan
+#                     Rene Widera
+#
+# This file is part of alpaka.
+#
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+#
+
+include(CMakePrintHelpers) # for easier printing of variables and properties
+
+#-------------------------------------------------------------------------------
+# Options.
+
+# HIP and platform selection and warning about unsupported features
+option(ALPAKA_ACC_GPU_HIP_ENABLE "Enable the HIP back-end (all other back-ends must be disabled)" OFF)
+option(ALPAKA_ACC_GPU_HIP_ONLY_MODE "Only back-ends using HIP can be enabled in this mode." OFF) # HIP only runs without other back-ends
+
+# Drop-down combo box in cmake-gui for HIP platforms.
+set(ALPAKA_HIP_PLATFORM "clang" CACHE STRING "Specify HIP platform")
+set_property(CACHE ALPAKA_HIP_PLATFORM PROPERTY STRINGS "nvcc;clang")
+
+if(ALPAKA_ACC_GPU_HIP_ENABLE AND NOT ALPAKA_ACC_GPU_HIP_ONLY_MODE AND ALPAKA_HIP_PLATFORM MATCHES "nvcc")
+    message(FATAL_ERROR "HIP back-end must be used together with ALPAKA_ACC_GPU_HIP_ONLY_MODE")
+endif()
+
+if(ALPAKA_ACC_GPU_HIP_ENABLE AND ALPAKA_HIP_PLATFORM MATCHES "clang")
+    message(WARNING
+        "The HIP back-end is currently experimental. " # trailing space: message() joins its arguments without a separator
+        "alpaka HIP backend compiled with clang does not support callback functions."
+        )
+endif()
+
+option(ALPAKA_ACC_GPU_CUDA_ENABLE "Enable the CUDA GPU back-end" OFF)
+option(ALPAKA_ACC_GPU_CUDA_ONLY_MODE "Only back-ends using CUDA can be enabled in this mode (This allows to mix alpaka code with native CUDA code)." OFF)
+
+if(ALPAKA_ACC_GPU_CUDA_ONLY_MODE AND NOT ALPAKA_ACC_GPU_CUDA_ENABLE)
+    message(FATAL_ERROR "If ALPAKA_ACC_GPU_CUDA_ONLY_MODE is enabled, ALPAKA_ACC_GPU_CUDA_ENABLE has to be enabled as well.")
+endif()
+if(ALPAKA_ACC_GPU_HIP_ONLY_MODE AND NOT ALPAKA_ACC_GPU_HIP_ENABLE)
+    message(FATAL_ERROR "If ALPAKA_ACC_GPU_HIP_ONLY_MODE is enabled, ALPAKA_ACC_GPU_HIP_ENABLE has to be enabled as well.")
+endif()
+
+option(ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLE "Enable the serial CPU back-end" OFF)
+option(ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLE "Enable the threads CPU block thread back-end" OFF)
+option(ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE "Enable the fibers CPU block thread back-end" OFF)
+option(ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLE "Enable the TBB CPU grid block back-end" OFF)
+option(ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE "Enable the OpenMP 2.0 CPU grid block back-end" OFF)
+option(ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE "Enable the OpenMP 2.0 CPU block thread back-end" OFF)
+option(ALPAKA_ACC_ANY_BT_OMP5_ENABLE "Enable the OpenMP 5.0 CPU block and block thread back-end" OFF)
+option(ALPAKA_ACC_ANY_BT_OACC_ENABLE "Enable the OpenACC block and block thread back-end" OFF)
+
+if((ALPAKA_ACC_GPU_CUDA_ONLY_MODE OR ALPAKA_ACC_GPU_HIP_ONLY_MODE)
+   AND
+    (ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLE OR
+    ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLE OR
+    ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE OR
+    ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLE OR
+    ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE OR
+    ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE OR
+    ALPAKA_ACC_ANY_BT_OMP5_ENABLE))
+    if(ALPAKA_ACC_GPU_CUDA_ONLY_MODE)
+        message(FATAL_ERROR "If ALPAKA_ACC_GPU_CUDA_ONLY_MODE is enabled, only back-ends using CUDA can be enabled! This allows to mix alpaka code with native CUDA code. However, this prevents any non-CUDA back-ends from being enabled.")
+    endif()
+    if(ALPAKA_ACC_GPU_HIP_ONLY_MODE)
+        message(FATAL_ERROR "If ALPAKA_ACC_GPU_HIP_ONLY_MODE is enabled, only back-ends using HIP can be enabled!")
+    endif()
+    set(_ALPAKA_FOUND FALSE)
+elseif(ALPAKA_ACC_ANY_BT_OACC_ENABLE)
+    if((ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE OR
+       ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE OR
+       ALPAKA_ACC_ANY_BT_OMP5_ENABLE))
+       message(WARNING "If ALPAKA_ACC_ANY_BT_OACC_ENABLE is enabled no OpenMP backend can be enabled.")
+    endif()
+endif()
+
+# avoids CUDA+HIP conflict
+if(ALPAKA_ACC_GPU_HIP_ENABLE AND ALPAKA_ACC_GPU_CUDA_ENABLE)
+    message(FATAL_ERROR "CUDA and HIP can not be enabled both at the same time.")
+endif()
+
+# HIP is only supported on Linux
+if(ALPAKA_ACC_GPU_HIP_ENABLE AND (MSVC OR WIN32))
+    message(FATAL_ERROR "Optional alpaka dependency HIP can not be built on Windows!")
+endif()
+
+# Drop-down combo box in cmake-gui.
+set(ALPAKA_DEBUG "0" CACHE STRING "Debug level")
+set_property(CACHE ALPAKA_DEBUG PROPERTY STRINGS "0;1;2")
+
+set(ALPAKA_CXX_STANDARD "14" CACHE STRING "C++ standard version")
+set_property(CACHE ALPAKA_CXX_STANDARD PROPERTY STRINGS "14;17;20")
+
+if(NOT TARGET alpaka)
+    add_library(alpaka INTERFACE)
+
+    target_compile_features(alpaka INTERFACE cxx_std_${ALPAKA_CXX_STANDARD})
+    if (CMAKE_CXX_COMPILER_ID STREQUAL "PGI")
+        # Workaround for STL atomic issue: https://forums.developer.nvidia.com/t/support-for-atomic-in-libstdc-missing/135403/2
+        # still appears in NVHPC 20.7
+        target_compile_definitions(alpaka INTERFACE "__GCC_ATOMIC_TEST_AND_SET_TRUEVAL=1")
+    endif()
+
+    add_library(alpaka::alpaka ALIAS alpaka)
+endif()
+
+set(ALPAKA_OFFLOAD_MAX_BLOCK_SIZE "256" CACHE STRING "Maximum number of threads per block to be suggested by any target offloading backends ANY_BT_OMP5 and ANY_BT_OACC.")
+option(ALPAKA_DEBUG_OFFLOAD_ASSUME_HOST "Allow host-only constructs like assert in offload code in debug mode." ON)
+set(ALPAKA_BLOCK_SHARED_DYN_MEMBER_ALLOC_KIB "30" CACHE STRING "Kibibytes (1024B) of memory to allocate for block shared memory for backends requiring static allocation (includes CPU_B_OMP2_T_SEQ, CPU_B_TBB_T_SEQ, CPU_B_SEQ_T_SEQ)")
+
+#-------------------------------------------------------------------------------
+# Debug output of common variables.
+if(${ALPAKA_DEBUG} GREATER 1)
+    cmake_print_variables(_ALPAKA_ROOT_DIR)
+    cmake_print_variables(_ALPAKA_COMMON_FILE)
+    cmake_print_variables(_ALPAKA_ADD_EXECUTABLE_FILE)
+    cmake_print_variables(_ALPAKA_ADD_LIBRARY_FILE)
+    cmake_print_variables(CMAKE_BUILD_TYPE)
+endif()
+
+#-------------------------------------------------------------------------------
+# Check supported compilers.
+if(CMAKE_CXX_COMPILER_ID MATCHES "Clang" AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.0)
+    message(FATAL_ERROR "Clang versions < 4.0 are not supported!")
+endif()
+
+if(ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE AND (ALPAKA_ACC_GPU_CUDA_ENABLE OR ALPAKA_ACC_GPU_HIP_ENABLE))
+    message(FATAL_ERROR "Fibers and CUDA or HIP back-end can not be enabled both at the same time.")
+endif()
+
+#-------------------------------------------------------------------------------
+# Compiler settings.
+
+if(MSVC)
+    # CUDA\v9.2\include\crt/host_runtime.h(265): warning C4505: '__cudaUnregisterBinaryUtil': unreferenced local function has been removed
+    if(ALPAKA_ACC_GPU_CUDA_ONLY_MODE)
+        target_compile_options(alpaka INTERFACE "/wd4505")
+    endif()
+else()
+    find_package(Threads REQUIRED)
+    target_link_libraries(alpaka INTERFACE Threads::Threads)
+
+    if(NOT APPLE)
+        # librt: undefined reference to `clock_gettime'
+        find_library(RT_LIBRARY rt)
+        if(RT_LIBRARY)
+            target_link_libraries(alpaka INTERFACE ${RT_LIBRARY})
+        endif()
+    endif()
+endif()
+
+#-------------------------------------------------------------------------------
+# Find Boost.
+set(_ALPAKA_BOOST_MIN_VER "1.65.1")
+
+if(${ALPAKA_DEBUG} GREATER 1)
+    SET(Boost_DEBUG ON)
+    SET(Boost_DETAILED_FAILURE_MSG ON)
+endif()
+
+find_package(Boost ${_ALPAKA_BOOST_MIN_VER} REQUIRED
+             OPTIONAL_COMPONENTS fiber)
+
+target_link_libraries(alpaka INTERFACE Boost::headers)
+
+if(ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE)
+    if(NOT Boost_FIBER_FOUND)
+        message(FATAL_ERROR "Optional alpaka dependency Boost.Fiber could not be found!")
+    endif()
+endif()
+
+if(${ALPAKA_DEBUG} GREATER 1)
+    message(STATUS "Boost in:")
+    cmake_print_variables(BOOST_ROOT)
+    cmake_print_variables(BOOSTROOT)
+    cmake_print_variables(BOOST_INCLUDEDIR)
+    cmake_print_variables(BOOST_LIBRARYDIR)
+    cmake_print_variables(Boost_NO_SYSTEM_PATHS)
+    cmake_print_variables(Boost_ADDITIONAL_VERSIONS)
+    cmake_print_variables(Boost_USE_MULTITHREADED)
+    cmake_print_variables(Boost_USE_STATIC_LIBS)
+    cmake_print_variables(Boost_USE_STATIC_RUNTIME)
+    cmake_print_variables(Boost_USE_DEBUG_RUNTIME)
+    cmake_print_variables(Boost_USE_DEBUG_PYTHON)
+    cmake_print_variables(Boost_USE_STLPORT)
+    cmake_print_variables(Boost_USE_STLPORT_DEPRECATED_NATIVE_IOSTREAMS)
+    cmake_print_variables(Boost_COMPILER)
+    cmake_print_variables(Boost_THREADAPI)
+    cmake_print_variables(Boost_NAMESPACE)
+    cmake_print_variables(Boost_DEBUG)
+    cmake_print_variables(Boost_DETAILED_FAILURE_MSG)
+    cmake_print_variables(Boost_REALPATH)
+    cmake_print_variables(Boost_NO_BOOST_CMAKE)
+    message(STATUS "Boost out:")
+    cmake_print_variables(Boost_FOUND)
+    cmake_print_variables(Boost_INCLUDE_DIRS)
+    cmake_print_variables(Boost_LIBRARY_DIRS)
+    cmake_print_variables(Boost_LIBRARIES)
+    cmake_print_variables(Boost_FIBER_FOUND)
+    cmake_print_variables(Boost_FIBER_LIBRARY)
+    cmake_print_variables(Boost_CONTEXT_FOUND)
+    cmake_print_variables(Boost_CONTEXT_LIBRARY)
+    cmake_print_variables(Boost_SYSTEM_FOUND)
+    cmake_print_variables(Boost_SYSTEM_LIBRARY)
+    cmake_print_variables(Boost_THREAD_FOUND)
+    cmake_print_variables(Boost_THREAD_LIBRARY)
+    cmake_print_variables(Boost_ATOMIC_FOUND)
+    cmake_print_variables(Boost_ATOMIC_LIBRARY)
+    cmake_print_variables(Boost_CHRONO_FOUND)
+    cmake_print_variables(Boost_CHRONO_LIBRARY)
+    cmake_print_variables(Boost_DATE_TIME_FOUND)
+    cmake_print_variables(Boost_DATE_TIME_LIBRARY)
+    cmake_print_variables(Boost_VERSION)
+    cmake_print_variables(Boost_LIB_VERSION)
+    cmake_print_variables(Boost_MAJOR_VERSION)
+    cmake_print_variables(Boost_MINOR_VERSION)
+    cmake_print_variables(Boost_SUBMINOR_VERSION)
+    cmake_print_variables(Boost_LIB_DIAGNOSTIC_DEFINITIONS)
+    message(STATUS "Boost cached:")
+    cmake_print_variables(Boost_INCLUDE_DIR)
+    cmake_print_variables(Boost_LIBRARY_DIR)
+endif()
+
+#-------------------------------------------------------------------------------
+# Find TBB.
+if(ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLE)
+    find_package(TBB)
+    if(TBB_FOUND)
+        target_link_libraries(alpaka INTERFACE TBB::tbb)
+    else()
+        message(FATAL_ERROR "Optional alpaka dependency TBB could not be found!")
+    endif()
+endif()
+
+#-------------------------------------------------------------------------------
+# Find OpenMP.
+if(ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE OR ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE OR ALPAKA_ACC_ANY_BT_OMP5_ENABLE)
+    find_package(OpenMP)
+
+    if(OpenMP_CXX_FOUND)
+        if(ALPAKA_ACC_ANY_BT_OMP5_ENABLE) # must match the option name declared above, otherwise the version check never runs
+            if(OpenMP_CXX_VERSION VERSION_LESS 4.0)
+                message(FATAL_ERROR "ALPAKA_ACC_ANY_BT_OMP5_ENABLE requires compiler support for OpenMP at least 4.0, 5.0 is recommended.")
+            elseif(OpenMP_CXX_VERSION VERSION_LESS 5.0)
+                message(WARNING "OpenMP < 5.0, for ALPAKA_ACC_ANY_BT_OMP5_ENABLE 5.0 is recommended.")
+            endif()
+        endif()
+
+        target_link_libraries(alpaka INTERFACE OpenMP::OpenMP_CXX)
+
+        # Clang versions starting from 3.9 support OpenMP 4.0 and higher only when given the corresponding flag
+        if(ALPAKA_ACC_ANY_BT_OMP5_ENABLE)
+            target_link_options(alpaka INTERFACE $<$<CXX_COMPILER_ID:AppleClang,Clang>:-fopenmp-version=40>)
+        endif()
+    else()
+        message(FATAL_ERROR "Optional alpaka dependency OpenMP could not be found!")
+    endif()
+endif()
+
+if(ALPAKA_ACC_ANY_BT_OACC_ENABLE)
+   find_package(OpenACC)
+   if(OpenACC_CXX_FOUND)
+      target_compile_options(alpaka INTERFACE ${OpenACC_CXX_OPTIONS})
+      target_link_options(alpaka INTERFACE ${OpenACC_CXX_OPTIONS})
+   endif()
+endif()
+
+#-------------------------------------------------------------------------------
+# Find CUDA.
+if(ALPAKA_ACC_GPU_CUDA_ENABLE)
+
+    if(NOT DEFINED ALPAKA_CUDA_VERSION)
+        set(ALPAKA_CUDA_VERSION 9.0)
+    endif()
+
+    if(ALPAKA_CUDA_VERSION VERSION_LESS 9.0)
+        message(FATAL_ERROR "CUDA Toolkit < 9.0 is not supported!")
+
+    else()
+        find_package(CUDA "${ALPAKA_CUDA_VERSION}")
+        if(NOT CUDA_FOUND)
+            message(FATAL_ERROR "Optional alpaka dependency CUDA could not be found!")
+        else()
+            set(ALPAKA_CUDA_VERSION "${CUDA_VERSION}")
+            if(CUDA_VERSION VERSION_LESS 10.3)
+                set(ALPAKA_CUDA_ARCH "30" CACHE STRING "GPU architecture")
+            else()
+                set(ALPAKA_CUDA_ARCH "35" CACHE STRING "GPU architecture")
+            endif()
+            set(ALPAKA_CUDA_COMPILER "nvcc" CACHE STRING "CUDA compiler")
+            set_property(CACHE ALPAKA_CUDA_COMPILER PROPERTY STRINGS "nvcc;clang")
+
+            option(ALPAKA_CUDA_FAST_MATH "Enable fast-math" ON)
+            option(ALPAKA_CUDA_FTZ "Set flush to zero for GPU" OFF)
+            option(ALPAKA_CUDA_SHOW_REGISTER "Show kernel registers and create PTX" OFF)
+            option(ALPAKA_CUDA_KEEP_FILES "Keep all intermediate files that are generated during internal compilation steps 'CMakeFiles/<targetname>.dir'" OFF)
+            option(ALPAKA_CUDA_NVCC_EXPT_EXTENDED_LAMBDA "Enable experimental, extended host-device lambdas in NVCC" ON)
+            option(ALPAKA_CUDA_NVCC_SEPARABLE_COMPILATION "Enable separable compilation in NVCC" OFF)
+
+            if(ALPAKA_CUDA_COMPILER MATCHES "clang")
+                if(NOT "${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
+                    message(FATAL_ERROR "Using clang as CUDA compiler is only possible if clang is the host compiler!")
+                endif()
+
+                if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS 6.0) # component-wise version comparison, as used elsewhere in this file
+                    if(CUDA_VERSION VERSION_GREATER_EQUAL 9.0)
+                        message(FATAL_ERROR "Clang versions lower than 6 do not support CUDA 9 or greater!")
+                    endif()
+                elseif(CMAKE_CXX_COMPILER_VERSION VERSION_LESS 7.0)
+                    if(CUDA_VERSION VERSION_GREATER_EQUAL 9.1)
+                        message(FATAL_ERROR "Clang versions lower than 7 do not support CUDA 9.1 or greater!")
+                    endif()
+                elseif(CMAKE_CXX_COMPILER_VERSION VERSION_LESS 8.0)
+                    if(CUDA_VERSION VERSION_GREATER_EQUAL 10.0)
+                        message(FATAL_ERROR "Clang versions lower than 8 do not support CUDA 10.0 or greater!")
+                    endif()
+                elseif(CMAKE_CXX_COMPILER_VERSION VERSION_LESS 9.0)
+                    if(CUDA_VERSION VERSION_GREATER_EQUAL 10.1)
+                        message(FATAL_ERROR "Clang versions lower than 9 do not support CUDA 10.1 or greater!")
+                    endif()
+                elseif(CMAKE_CXX_COMPILER_VERSION VERSION_LESS 10.0)
+                    if(CUDA_VERSION VERSION_GREATER_EQUAL 10.2)
+                        message(FATAL_ERROR "Clang versions lower than 10 do not support CUDA 10.2 or greater!")
+                    endif()
+                endif()
+
+                if(ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE)
+                    message(FATAL_ERROR "Clang as a CUDA compiler does not support boost.fiber!")
+                endif()
+                if(ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE OR ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE)
+                    message(FATAL_ERROR "Clang as a CUDA compiler does not support OpenMP 2!")
+                endif()
+                if(ALPAKA_ACC_ANY_BT_OMP5_ENABLE)
+                    message(FATAL_ERROR "Clang as a CUDA compiler does not support OpenMP 5!")
+                endif()
+
+                foreach(_CUDA_ARCH_ELEM ${ALPAKA_CUDA_ARCH})
+                    target_compile_options(alpaka INTERFACE  "--cuda-gpu-arch=sm_${_CUDA_ARCH_ELEM}")
+                endforeach()
+
+                target_compile_options(alpaka INTERFACE "--cuda-path=${CUDA_TOOLKIT_ROOT_DIR}")
+
+                # This flag silences the warning produced by the Dummy.cpp files:
+                # clang: warning: argument unused during compilation: '--cuda-gpu-arch=sm_XX'
+                # This seems to be a false positive as all flags are 'unused' for an empty file.
+                target_compile_options(alpaka INTERFACE "-Qunused-arguments")
+
+                # Silences warnings that are produced by boost because clang is not correctly identified.
+                target_compile_options(alpaka INTERFACE "-Wno-unused-local-typedef")
+
+                if(ALPAKA_CUDA_FAST_MATH)
+                    # -ffp-contract=fast enables the usage of FMA
+                    target_compile_options(alpaka INTERFACE "-ffast-math" "-ffp-contract=fast")
+                endif()
+
+                if(ALPAKA_CUDA_FTZ)
+                    target_compile_options(alpaka INTERFACE "-fcuda-flush-denormals-to-zero")
+                endif()
+
+                if(ALPAKA_CUDA_SHOW_REGISTER)
+                    target_compile_options(alpaka INTERFACE "-Xcuda-ptxas=-v")
+                endif()
+
+                if(ALPAKA_CUDA_KEEP_FILES)
+                    target_compile_options(alpaka INTERFACE "-save-temps")
+                endif()
+
+                # CMake 3.15 does not provide the `--std=c++*` argument to clang anymore.
+                # It is not necessary for basic c++ compilation because clang's default is already higher, but CUDA code compiled with -x cuda still defaults to c++98.
+                if(${CMAKE_VERSION} VERSION_GREATER_EQUAL "3.15.0")
+                    target_compile_options(alpaka INTERFACE "-std=c++${ALPAKA_CXX_STANDARD}")
+                endif()
+
+            else()
+                if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU") # Abort configuration for GCC host compilers that the detected NVCC release rejects.
+                    if((CUDA_VERSION VERSION_EQUAL 9.0) OR (CUDA_VERSION VERSION_EQUAL 9.1))
+                        if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 6.0) # VERSION_* compares component-wise; plain GREATER_EQUAL is a numeric test and x.y.z is not a number.
+                            message(FATAL_ERROR "NVCC 9.0 - 9.1 do not support GCC 7+ and fail compiling the std::tuple implementation in GCC 6+. Please use GCC 5!")
+                        endif()
+                    elseif(CUDA_VERSION VERSION_EQUAL 9.2)
+                        if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 8.0)
+                            message(FATAL_ERROR "NVCC 9.2 does not support GCC 8+. Please use GCC 5, 6 or 7!")
+                        endif()
+                    elseif(CUDA_VERSION VERSION_EQUAL 10.0)
+                        if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 8.0)
+                            message(FATAL_ERROR "NVCC 10.0 does not support GCC 8+. Please use GCC 5, 6 or 7!")
+                        endif()
+                    elseif(CUDA_VERSION VERSION_EQUAL 10.1)
+                        if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 9.0)
+                            message(FATAL_ERROR "NVCC 10.1 does not support GCC 9+. Please use GCC 5, 6, 7 or 8!")
+                        endif()
+                    elseif(CUDA_VERSION VERSION_EQUAL 10.2)
+                        if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 9.0)
+                            message(FATAL_ERROR "NVCC 10.2 does not support GCC 9+. Please use GCC 5, 6, 7 or 8!")
+                        endif()
+                    endif()
+                elseif("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang") # Same gate when clang is the C++ (host) compiler.
+                    if(CUDA_VERSION VERSION_EQUAL 9.0)
+                        if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 4.0)
+                            message(FATAL_ERROR "NVCC 9.0 does not support clang 4+. Please use NVCC 9.1!")
+                        endif()
+                    elseif(CUDA_VERSION VERSION_EQUAL 9.1)
+                        if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 5.0)
+                            message(FATAL_ERROR "NVCC 9.1 does not support clang 5+. Please use clang 4!")
+                        endif()
+                    elseif(CUDA_VERSION VERSION_EQUAL 9.2)
+                        if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 5.0)
+                            message(FATAL_ERROR "NVCC 9.2 does not support clang 6+ and fails compiling with clang 5. Please use clang 4!")
+                        endif()
+                    elseif(CUDA_VERSION VERSION_EQUAL 10.0)
+                        if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 7.0)
+                            message(FATAL_ERROR "NVCC 10.0 does not support clang 7+. Please use clang 4, 5 or 6!")
+                        endif()
+                    elseif(CUDA_VERSION VERSION_EQUAL 10.1)
+                        if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 9.0)
+                            message(FATAL_ERROR "NVCC 10.1 does not support clang 9+. Please use clang 4, 5, 6, 7 or 8!")
+                        endif()
+                    elseif(CUDA_VERSION VERSION_EQUAL 10.2)
+                        if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 9.0)
+                            message(FATAL_ERROR "NVCC 10.2 does not support clang 9+. Please use clang 4, 5, 6, 7 or 8!")
+                        endif()
+                    endif()
+                endif()
+
+                if(ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE)
+                    message(FATAL_ERROR "NVCC does not support boost.fiber!")
+                endif()
+
+                # Clean up the flags. Else, multiple find calls would result in duplicate flags. Furthermore, other modules may have set different settings.
+                set(CUDA_NVCC_FLAGS)
+
+                if(${ALPAKA_DEBUG} GREATER 1)
+                    set(CUDA_VERBOSE_BUILD ON)
+                endif()
+
+                set(CUDA_PROPAGATE_HOST_FLAGS ON)
+
+                if(ALPAKA_CUDA_NVCC_SEPARABLE_COMPILATION)
+                    set(CUDA_SEPARABLE_COMPILATION ON)
+                endif()
+
+                # nvcc sets no linux/__linux macros on OpenPOWER linux
+                # nvidia bug id: 2448610
+                if(CMAKE_SYSTEM_NAME STREQUAL "Linux")
+                    if(CMAKE_SYSTEM_PROCESSOR STREQUAL "ppc64le")
+                        list(APPEND CUDA_NVCC_FLAGS -Dlinux)
+                    endif()
+                endif()
+
+                # NOTE: Since CUDA 10.2 this option is also alternatively called '--extended-lambda'
+                if(ALPAKA_CUDA_NVCC_EXPT_EXTENDED_LAMBDA)
+                    list(APPEND CUDA_NVCC_FLAGS --expt-extended-lambda)
+                endif()
+                # This is mandatory because with c++14 many standard library functions we rely on are constexpr (std::min, std::multiplies, ...)
+                list(APPEND CUDA_NVCC_FLAGS --expt-relaxed-constexpr)
+
+                foreach(_CUDA_ARCH_ELEM ${ALPAKA_CUDA_ARCH})
+                    # set flags to create device code for the given architecture
+                    list(APPEND CUDA_NVCC_FLAGS
+                        --generate-code=arch=compute_${_CUDA_ARCH_ELEM},code=sm_${_CUDA_ARCH_ELEM}
+                        --generate-code=arch=compute_${_CUDA_ARCH_ELEM},code=compute_${_CUDA_ARCH_ELEM}
+                    )
+                endforeach()
+
+                if(NOT MSVC OR MSVC_VERSION GREATER_EQUAL 1920)
+                    list(APPEND CUDA_NVCC_FLAGS -std=c++${ALPAKA_CXX_STANDARD})
+                endif()
+
+                set(CUDA_HOST_COMPILER ${CMAKE_CXX_COMPILER})
+
+                if((CMAKE_BUILD_TYPE STREQUAL "Debug") OR (CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo"))
+                    list(APPEND CUDA_NVCC_FLAGS -g)
+                    list(APPEND CUDA_NVCC_FLAGS -lineinfo)
+                endif()
+
+                if(ALPAKA_CUDA_FAST_MATH)
+                    list(APPEND CUDA_NVCC_FLAGS --use_fast_math)
+                endif()
+
+                if(ALPAKA_CUDA_FTZ)
+                    list(APPEND CUDA_NVCC_FLAGS --ftz=true)
+                else()
+                    list(APPEND CUDA_NVCC_FLAGS --ftz=false)
+                endif()
+
+                if(ALPAKA_CUDA_SHOW_REGISTER)
+                    list(APPEND CUDA_NVCC_FLAGS -Xptxas=-v)
+                endif()
+
+                # Always add warning/error numbers which can be used for suppressions
+                list(APPEND CUDA_NVCC_FLAGS -Xcudafe=--display_error_number)
+
+                # avoids warnings on host-device signatured, default constructors/destructors
+                list(APPEND CUDA_NVCC_FLAGS -Xcudafe=--diag_suppress=esa_on_defaulted_function_ignored)
+
+                # avoids warnings on host-device signature of 'std::__shared_count<>' (the suppressed diagnostic number differs per CUDA release)
+                if(CUDA_VERSION VERSION_EQUAL 10.0) # version-aware compare, consistent with the other CUDA_VERSION checks in this file
+                    list(APPEND CUDA_NVCC_FLAGS -Xcudafe=--diag_suppress=2905)
+                elseif(CUDA_VERSION VERSION_EQUAL 10.1)
+                    list(APPEND CUDA_NVCC_FLAGS -Xcudafe=--diag_suppress=2912)
+                elseif(CUDA_VERSION VERSION_EQUAL 10.2)
+                    list(APPEND CUDA_NVCC_FLAGS -Xcudafe=--diag_suppress=2976)
+                endif()
+
+                if(ALPAKA_CUDA_KEEP_FILES)
+                    #file(MAKE_DIRECTORY "${PROJECT_BINARY_DIR}/nvcc_tmp")
+                    list(APPEND CUDA_NVCC_FLAGS --keep)
+                    #list(APPEND CUDA_NVCC_FLAGS --keep-dir="${PROJECT_BINARY_DIR}/nvcc_tmp")
+                endif()
+
+                option(ALPAKA_CUDA_SHOW_CODELINES "Show kernel lines in cuda-gdb and cuda-memcheck. If ALPAKA_CUDA_KEEP_FILES is enabled source code will be inlined in ptx." OFF)
+                if(ALPAKA_CUDA_SHOW_CODELINES)
+                    list(APPEND CUDA_NVCC_FLAGS --source-in-ptx -lineinfo)
+                    if(NOT MSVC)
+                        list(APPEND CUDA_NVCC_FLAGS -Xcompiler=-rdynamic)
+                    endif()
+                endif()
+            endif()
+
+            if(OpenMP_CXX_FOUND)
+                # correctly propagate OpenMP flags
+                # This can be removed once we support CMake's first class CUDA support.
+                target_compile_options(alpaka INTERFACE ${OpenMP_CXX_FLAGS})
+            endif()
+
+            target_link_libraries(alpaka INTERFACE ${CUDA_CUDART_LIBRARY})
+            target_include_directories(alpaka INTERFACE ${CUDA_INCLUDE_DIRS})
+        endif()
+    endif()
+endif()
+
+#-------------------------------------------------------------------------------
+# Find HIP.
+if(ALPAKA_ACC_GPU_HIP_ENABLE)
+
+    if(NOT DEFINED ALPAKA_HIP_VERSION)
+        set(ALPAKA_HIP_VERSION 3.5)
+    endif()
+
+    if(ALPAKA_HIP_VERSION VERSION_LESS 3.5)
+        message(FATAL_ERROR "HIP < 3.5 is not supported!")
+    else()
+        # must set this for HIP package (note that you also need certain env vars)
+        set(HIP_PLATFORM "${ALPAKA_HIP_PLATFORM}" CACHE STRING "")
+        set(HIP_RUNTIME "${ALPAKA_HIP_PLATFORM}" CACHE STRING "")
+
+        find_package(HIP "${ALPAKA_HIP_VERSION}")
+        if(NOT HIP_FOUND)
+            message(FATAL_ERROR "Optional alpaka dependency HIP could not be found!")
+        else()
+            set(ALPAKA_HIP_VERSION "${HIP_VERSION}")
+            set(ALPAKA_HIP_COMPILER "hipcc" CACHE STRING "HIP compiler")
+            set_property(CACHE ALPAKA_HIP_COMPILER PROPERTY STRINGS "hipcc")
+
+            option(ALPAKA_HIP_FAST_MATH "Enable fast-math" ON)
+            option(ALPAKA_HIP_FTZ "Set flush to zero for GPU" OFF)
+            option(ALPAKA_HIP_SHOW_REGISTER "Show kernel registers and create PTX" OFF)
+            option(ALPAKA_HIP_KEEP_FILES "Keep all intermediate files that are generated during internal compilation steps in 'CMakeFiles/<targetname>.dir'." OFF)
+
+            set(HIP_HIPCC_FLAGS)
+
+            if(ALPAKA_HIP_PLATFORM MATCHES "nvcc")
+                find_package(CUDA)
+                if(NOT CUDA_FOUND)
+                    message(WARNING "Could not find CUDA while HIP platform is set to nvcc. Compilation might fail.")
+                endif()
+
+                if(CUDA_VERSION VERSION_LESS 10.3)
+                    set(ALPAKA_HIP_ARCH "30" CACHE STRING "GPU architecture")
+                else()
+                    set(ALPAKA_HIP_ARCH "35" CACHE STRING "GPU architecture")
+                endif()
+
+                if(CUDA_VERSION VERSION_LESS 9.0)
+                    message(FATAL_ERROR "CUDA Toolkit < 9.0 is not supported!")
+                endif()
+
+                if(${ALPAKA_DEBUG} GREATER 1)
+                    set(HIP_VERBOSE_BUILD ON)
+                endif()
+
+                list(APPEND HIP_NVCC_FLAGS --expt-extended-lambda)
+                list(APPEND HIP_NVCC_FLAGS --expt-relaxed-constexpr)
+                list(APPEND _ALPAKA_HIP_LIBRARIES "cudart")
+
+                foreach(_HIP_ARCH_ELEM ${ALPAKA_HIP_ARCH})
+                    # set flags to create device code (SASS and PTX) for the given architecture. NOTE(review): this appends to CUDA_NVCC_FLAGS while the neighbouring lines use HIP_NVCC_FLAGS - confirm CUDA_NVCC_FLAGS is consumed on the HIP path.
+                    list(APPEND CUDA_NVCC_FLAGS
+                        --generate-code=arch=compute_${_HIP_ARCH_ELEM},code=sm_${_HIP_ARCH_ELEM}
+                        --generate-code=arch=compute_${_HIP_ARCH_ELEM},code=compute_${_HIP_ARCH_ELEM}
+                    )
+                endforeach()
+                # for CUDA cmake automatically adds compiler flags as nvcc does not do this,
+                # but for HIP we have to do this here: define __CUDACC__ and select the host compiler via -ccbin
+                list(APPEND HIP_NVCC_FLAGS -D__CUDACC__)
+                list(APPEND HIP_NVCC_FLAGS -ccbin ${CMAKE_CXX_COMPILER})
+
+                if((CMAKE_BUILD_TYPE STREQUAL "Debug") OR (CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo"))
+                    list(APPEND CUDA_NVCC_FLAGS -lineinfo) # NOTE(review): also goes to CUDA_NVCC_FLAGS rather than HIP_NVCC_FLAGS - verify this is intended
+                    list(APPEND HIP_NVCC_FLAGS -Xcompiler=-g) # host-side debug info, forwarded through -Xcompiler
+                endif()
+                # propagate host flags
+                # SET(CUDA_PROPAGATE_HOST_FLAGS ON) # does not exist in HIP, so do it manually
+                string(TOUPPER "${CMAKE_BUILD_TYPE}" build_config)
+                foreach( _flag ${CMAKE_CXX_FLAGS} ${CMAKE_CXX_FLAGS_${build_config}})
+                    list(APPEND HIP_NVCC_FLAGS -Xcompiler=${_flag})
+                endforeach()
+
+                if(ALPAKA_HIP_FAST_MATH)
+                    list(APPEND HIP_HIPCC_FLAGS --use_fast_math)
+                endif()
+
+                if(ALPAKA_HIP_FTZ)
+                    list(APPEND HIP_HIPCC_FLAGS --ftz=true)
+                else()
+                    list(APPEND HIP_HIPCC_FLAGS --ftz=false)
+                endif()
+
+                if(ALPAKA_HIP_SHOW_REGISTER)
+                    list(APPEND HIP_HIPCC_FLAGS -Xptxas=-v)
+                endif()
+
+                # avoids warnings on host-device signatured, default constructors/destructors
+                list(APPEND HIP_HIPCC_FLAGS -Xcudafe=--diag_suppress=esa_on_defaulted_function_ignored)
+
+                # random numbers library ( HIP(NVCC) ) /hiprand
+                # HIP_ROOT_DIR is set by FindHIP.cmake
+                find_path(HIP_RAND_INC
+                    NAMES "hiprand_kernel.h"
+                    PATHS "${HIP_ROOT_DIR}/hiprand" "${HIP_ROOT_DIR}/include" "hiprand"
+                    PATHS "/opt/rocm/rocrand/hiprand"
+                    PATH_SUFFIXES "include" "hiprand")
+                find_library(HIP_RAND_LIBRARY
+                    NAMES "hiprand-d" "hiprand"
+                    PATHS "${HIP_ROOT_DIR}/hiprand" "${HIP_ROOT_DIR}" "hiprand"
+                    PATHS "/opt/rocm/rocrand/hiprand"
+                    ENV HIP_PATH
+                    PATH_SUFFIXES "lib" "lib64")
+                if(NOT HIP_RAND_INC)
+                    message(FATAL_ERROR "Could not find hipRAND include (also searched in: HIP_ROOT_DIR=${HIP_ROOT_DIR}).")
+                endif()
+                if(NOT HIP_RAND_LIBRARY)
+                    message(FATAL_ERROR "Could not find hipRAND library (also searched in: HIP_ROOT_DIR=${HIP_ROOT_DIR}).")
+                endif()
+                target_include_directories(alpaka INTERFACE ${HIP_RAND_INC})
+                target_link_libraries(alpaka INTERFACE ${HIP_RAND_LIBRARY})
+            elseif(ALPAKA_HIP_PLATFORM MATCHES "clang")
+                # # hiprand requires ROCm implementation of random numbers by rocrand
+                find_package(rocrand REQUIRED CONFIG
+                    HINTS "${HIP_ROOT_DIR}/rocrand"
+                    HINTS "/opt/rocm/rocrand")
+                if(rocrand_FOUND)
+                    target_include_directories(alpaka INTERFACE ${rocrand_INCLUDE_DIRS})
+                    # ATTENTION: rocRand libraries are not required by alpaka
+                else()
+                    MESSAGE(FATAL_ERROR "Could not find rocRAND (also searched in: HIP_ROOT_DIR=${HIP_ROOT_DIR}/rocrand).")
+                endif()
+
+                if(ALPAKA_HIP_FAST_MATH)
+                    list(APPEND HIP_HIPCC_FLAGS -ffast-math)
+                endif()
+
+                # possible architectures can be found https://github.com/llvm/llvm-project/blob/master/clang/lib/Basic/Cuda.cpp#L65
+                # 900 -> AMD Vega64
+                # 902 -> AMD Vega 10
+                # 906 -> AMD Radeon VII, MI50/MI60
+                # 908 -> AMD MI100
+                set(ALPAKA_HIP_ARCH "906;908" CACHE STRING "AMD GPU architecture e.g. 906 for MI50/Radeon VII")
+
+                foreach(_HIP_ARCH_ELEM ${ALPAKA_HIP_ARCH})
+                    # set flags to create device code for the given architecture
+                    list(APPEND HIP_HIPCC_FLAGS --amdgpu-target=gfx${_HIP_ARCH_ELEM})
+                endforeach()
+            endif()
+
+            # # HIP random numbers
+            FIND_PACKAGE(hiprand REQUIRED CONFIG
+                HINTS "${HIP_ROOT_DIR}/hiprand"
+                HINTS "/opt/rocm/hiprand")
+            if(hiprand_FOUND)
+                target_include_directories(alpaka INTERFACE ${hiprand_INCLUDE_DIRS})
+                # ATTENTION: hipRand libraries are not required by alpaka
+            else()
+                MESSAGE(FATAL_ERROR "Could not find hipRAND (also searched in: HIP_ROOT_DIR=${HIP_ROOT_DIR}/hiprand).")
+            endif()
+
+            list(APPEND HIP_HIPCC_FLAGS -D__HIPCC__)
+            list(APPEND HIP_HIPCC_FLAGS -std=c++${ALPAKA_CXX_STANDARD})
+
+            if((CMAKE_BUILD_TYPE STREQUAL "Debug") OR (CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo"))
+                list(APPEND HIP_HIPCC_FLAGS -g)
+            endif()
+
+            if(ALPAKA_HIP_KEEP_FILES)
+                list(APPEND HIP_HIPCC_FLAGS -save-temps)
+            endif()
+
+            if(_ALPAKA_HIP_LIBRARIES)
+                target_link_libraries(alpaka INTERFACE ${_ALPAKA_HIP_LIBRARIES})
+            endif()
+        endif()
+    endif()
+endif() # HIP
+
+#-------------------------------------------------------------------------------
+# alpaka: publish every enabled option as an INTERFACE compile definition on the alpaka target and log the definition name.
+if(ALPAKA_ACC_GPU_CUDA_ONLY_MODE)
+    target_compile_definitions(alpaka INTERFACE "ALPAKA_ACC_GPU_CUDA_ONLY_MODE")
+    message(STATUS ALPAKA_ACC_GPU_CUDA_ONLY_MODE)
+endif()
+
+if(ALPAKA_ACC_GPU_HIP_ONLY_MODE)
+    target_compile_definitions(alpaka INTERFACE "ALPAKA_ACC_GPU_HIP_ONLY_MODE")
+    message(STATUS ALPAKA_ACC_GPU_HIP_ONLY_MODE)
+endif()
+
+if(ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLE) # accelerator options end in _ENABLE, the exported definitions in _ENABLED
+    target_compile_definitions(alpaka INTERFACE "ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED")
+    message(STATUS ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED)
+endif()
+
+if(ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLE)
+    target_compile_definitions(alpaka INTERFACE "ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED")
+    message(STATUS ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED)
+endif()
+if(ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE)
+    target_compile_definitions(alpaka INTERFACE "ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLED")
+
+    if(MSVC AND (${CMAKE_SIZEOF_VOID_P} EQUAL 4))
+        # On Win32 boost context triggers:
+        # libboost_context-vc141-mt-gd-1_64.lib(jump_i386_ms_pe_masm.obj) : error LNK2026: module unsafe for SAFESEH image.
+        target_link_options(Boost::fiber INTERFACE "/SAFESEH:NO")
+    endif()
+    target_link_libraries(alpaka INTERFACE Boost::fiber)
+
+    message(STATUS ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLED)
+endif()
+if(ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLE)
+    target_compile_definitions(alpaka INTERFACE "ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED")
+    message(STATUS ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED)
+endif()
+if(ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE)
+    target_compile_definitions(alpaka INTERFACE "ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLED")
+    message(STATUS ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLED)
+endif()
+if(ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE)
+    target_compile_definitions(alpaka INTERFACE "ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLED")
+    message(STATUS ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLED)
+endif()
+if(ALPAKA_ACC_ANY_BT_OMP5_ENABLE)
+    target_compile_definitions(alpaka INTERFACE "ALPAKA_ACC_ANY_BT_OMP5_ENABLED")
+    message(STATUS ALPAKA_ACC_ANY_BT_OMP5_ENABLED)
+endif()
+if(ALPAKA_ACC_ANY_BT_OACC_ENABLE)
+    target_compile_definitions(alpaka INTERFACE "ALPAKA_ACC_ANY_BT_OACC_ENABLED")
+    message(STATUS ALPAKA_ACC_ANY_BT_OACC_ENABLED)
+endif()
+if(ALPAKA_ACC_GPU_CUDA_ENABLE)
+    target_compile_definitions(alpaka INTERFACE "ALPAKA_ACC_GPU_CUDA_ENABLED")
+    message(STATUS ALPAKA_ACC_GPU_CUDA_ENABLED)
+endif()
+if(ALPAKA_ACC_GPU_HIP_ENABLE)
+    target_compile_definitions(alpaka INTERFACE "ALPAKA_ACC_GPU_HIP_ENABLED")
+    message(STATUS ALPAKA_ACC_GPU_HIP_ENABLED)
+endif()
+
+target_compile_definitions(alpaka INTERFACE "ALPAKA_DEBUG=${ALPAKA_DEBUG}") # value-carrying settings are exported as NAME=value
+if(ALPAKA_DEBUG_OFFLOAD_ASSUME_HOST)
+    target_compile_definitions(alpaka INTERFACE "ALPAKA_DEBUG_OFFLOAD_ASSUME_HOST")
+endif()
+target_compile_definitions(alpaka INTERFACE "ALPAKA_OFFLOAD_MAX_BLOCK_SIZE=${ALPAKA_OFFLOAD_MAX_BLOCK_SIZE}")
+target_compile_definitions(alpaka INTERFACE "ALPAKA_BLOCK_SHARED_DYN_MEMBER_ALLOC_KIB=${ALPAKA_BLOCK_SHARED_DYN_MEMBER_ALLOC_KIB}")
+
+if(ALPAKA_CI)
+    target_compile_definitions(alpaka INTERFACE "ALPAKA_CI")
+endif()
+
+# cxx flags will not be forwarded to hip wrapped compiler, so the target's interface compile definitions and include directories are handed to HIP manually
+if(ALPAKA_ACC_GPU_HIP_ENABLE)
+    get_property(_ALPAKA_COMPILE_DEFINITIONS_HIP
+                 TARGET alpaka
+                 PROPERTY INTERFACE_COMPILE_DEFINITIONS)
+    list_add_prefix("-D" _ALPAKA_COMPILE_DEFINITIONS_HIP) # helper from common.cmake; presumably prepends -D to every list item
+    list(APPEND HIP_HIPCC_FLAGS
+        ${_ALPAKA_COMPILE_DEFINITIONS_HIP}
+        )
+    HIP_INCLUDE_DIRECTORIES(
+        # ${_ALPAKA_INCLUDE_DIRECTORY}
+        # ${_ALPAKA_INCLUDE_DIRECTORIES_PUBLIC}
+        ${HIP_INCLUDE_DIRS}
+        ${Boost_INCLUDE_DIRS}
+        ${_ALPAKA_ROOT_DIR}/test/common/include
+        )
+
+    if(OpenMP_CXX_FOUND) # remove fopenmp link from nvcc, otherwise linker error will occur; the OpenMP flags are passed via -Xcompiler instead
+        get_property(_ALPAKA_LINK_LIBRARIES_PUBLIC
+                     TARGET alpaka
+                     PROPERTY INTERFACE_LINK_LIBRARIES)
+        list(REMOVE_ITEM _ALPAKA_LINK_LIBRARIES_PUBLIC "OpenMP::OpenMP_CXX")
+
+        target_link_options(alpaka INTERFACE "-Xcompiler ${OpenMP_CXX_FLAGS}") # NOTE(review): one quoted option containing a space; confirm it reaches the linker driver as two arguments (CMake offers a SHELL: prefix for option groups)
+        set_property(TARGET alpaka
+                     PROPERTY INTERFACE_LINK_LIBRARIES ${_ALPAKA_LINK_LIBRARIES_PUBLIC}) # write back the link list without the OpenMP imported target
+    endif()
+endif()
+
+#-------------------------------------------------------------------------------
+# Target.
+if(TARGET alpaka) # Add alpaka's own include directory; at ALPAKA_DEBUG > 1 also print the target's interface properties.
+
+    if(${ALPAKA_DEBUG} GREATER 1)
+        # Compile options.
+        get_property(_ALPAKA_COMPILE_OPTIONS_PUBLIC
+                     TARGET alpaka
+                     PROPERTY INTERFACE_COMPILE_OPTIONS)
+        cmake_print_variables(_ALPAKA_COMPILE_OPTIONS_PUBLIC)
+
+        # Compile definitions.
+        get_property(_ALPAKA_COMPILE_DEFINITIONS_PUBLIC
+                     TARGET alpaka
+                     PROPERTY INTERFACE_COMPILE_DEFINITIONS)
+        cmake_print_variables(_ALPAKA_COMPILE_DEFINITIONS_PUBLIC)
+
+        # Include directories (read before alpaka's own include directory is added below, so that entry is not part of this printout).
+        get_property(_ALPAKA_INCLUDE_DIRECTORIES_PUBLIC
+                     TARGET alpaka
+                     PROPERTY INTERFACE_INCLUDE_DIRECTORIES)
+        cmake_print_variables(_ALPAKA_INCLUDE_DIRECTORIES_PUBLIC)
+    endif()
+
+    # the alpaka library itself: expose its header directory to consumers of the target
+    target_include_directories(alpaka INTERFACE ${_ALPAKA_INCLUDE_DIRECTORY})
+
+    if(${ALPAKA_DEBUG} GREATER 1)
+        # Link libraries.
+        # There are no PUBLIC_LINK_FLAGS in CMAKE:
+        # http://stackoverflow.com/questions/26850889/cmake-keeping-link-flags-of-internal-libs
+        get_property(_ALPAKA_LINK_LIBRARIES_PUBLIC
+                     TARGET alpaka
+                     PROPERTY INTERFACE_LINK_LIBRARIES)
+        cmake_print_variables(_ALPAKA_LINK_LIBRARIES_PUBLIC)
+
+        get_property(_ALPAKA_LINK_FLAGS_PUBLIC
+                     TARGET alpaka
+                     PROPERTY INTERFACE_LINK_OPTIONS) # link flags are read from INTERFACE_LINK_OPTIONS
+        cmake_print_variables(_ALPAKA_LINK_FLAGS_PUBLIC)
+    endif()
+endif()
+
+# NVCC does not incorporate the COMPILE_OPTIONS of a target but only the CMAKE_CXX_FLAGS, so the target's interface compile options are copied into CMAKE_CXX_FLAGS
+if((ALPAKA_ACC_GPU_CUDA_ENABLE OR ALPAKA_ACC_GPU_HIP_ENABLE) AND ALPAKA_CUDA_COMPILER MATCHES "nvcc") # NOTE(review): the HIP case is also keyed on ALPAKA_CUDA_COMPILER - confirm it is set for HIP-only builds
+    get_property(_ALPAKA_COMPILE_OPTIONS_PUBLIC
+                 TARGET alpaka
+                 PROPERTY INTERFACE_COMPILE_OPTIONS)
+    string(REPLACE ";" " " _ALPAKA_COMPILE_OPTIONS_STRING "${_ALPAKA_COMPILE_OPTIONS_PUBLIC}") # turn the ;-separated list into one space-separated flag string
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${_ALPAKA_COMPILE_OPTIONS_STRING}")
+
+    # Append CMAKE_CXX_FLAGS_[Release|Debug|RelWithDebInfo] to CMAKE_CXX_FLAGS
+    # because FindCUDA only propagates the latter to nvcc.
+    string(TOUPPER "${CMAKE_BUILD_TYPE}" build_config) # the per-configuration variable name uses the upper-cased build type
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CMAKE_CXX_FLAGS_${build_config}}")
+endif()
+
diff --git a/thirdParty/cupla/alpaka/cmake/alpakaConfig.cmake.in b/thirdParty/cupla/alpaka/cmake/alpakaConfig.cmake.in
new file mode 100644
index 0000000000..597ac51f6c
--- /dev/null
+++ b/thirdParty/cupla/alpaka/cmake/alpakaConfig.cmake.in
@@ -0,0 +1,78 @@
+#
+# Copyright 2014-2020 Benjamin Worpitz, Erik Zenker, Axel Huebl, Jan Stephan
+#
+# This file is part of alpaka.
+#
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+#
+
+@PACKAGE_INIT@
+
+################################################################################
+# alpaka.
+
+set(ALPAKA_DEBUG "0" CACHE STRING "Debug level")
+set_property(CACHE ALPAKA_DEBUG PROPERTY STRINGS "0;1;2")
+
+#-------------------------------------------------------------------------------
+# Common.
+
+# This file's directory.
+set(_ALPAKA_ROOT_DIR ${CMAKE_CURRENT_LIST_DIR})
+# Normalize the path (e.g. remove ../)
+get_filename_component(_ALPAKA_ROOT_DIR ${_ALPAKA_ROOT_DIR} ABSOLUTE)
+
+# Add common functions.
+set(_ALPAKA_COMMON_FILE "${_ALPAKA_ROOT_DIR}/common.cmake")
+include(${_ALPAKA_COMMON_FILE})
+
+# Add ALPAKA_ADD_EXECUTABLE function.
+set(_ALPAKA_ADD_EXECUTABLE_FILE "${_ALPAKA_ROOT_DIR}/addExecutable.cmake")
+include(${_ALPAKA_ADD_EXECUTABLE_FILE})
+
+# Add ALPAKA_ADD_LIBRARY function.
+set(_ALPAKA_ADD_LIBRARY_FILE "${_ALPAKA_ROOT_DIR}/addLibrary.cmake")
+include(${_ALPAKA_ADD_LIBRARY_FILE})
+
+# Set found to true initially and set it to false if a required dependency is missing.
+set(_ALPAKA_FOUND TRUE)
+
+# Add module search path
+set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${_ALPAKA_ROOT_DIR}/modules/")
+
+# parse environment variable `CMAKE_MODULE_PATH` (':'-separated) and append every entry to the CMake module search path
+if(DEFINED ENV{CMAKE_MODULE_PATH})
+    string(REPLACE ":" ";" ENV_CMAKE_MODULE_PATH "$ENV{CMAKE_MODULE_PATH}") # quoted: a defined-but-empty variable must still supply an (empty) input argument
+    foreach(_PREFIX_PATH ${ENV_CMAKE_MODULE_PATH})
+        set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${_PREFIX_PATH}")
+    endforeach()
+endif()
+
+set(_ALPAKA_INCLUDE_DIRECTORY "@CMAKE_INSTALL_PREFIX@/@CMAKE_INSTALL_INCLUDEDIR@")
+
+include("${CMAKE_CURRENT_LIST_DIR}/alpakaCommon.cmake")
+
+check_required_components("alpaka")
+
+# Unset already set variables if not found. NOTE(review): _ALPAKA_COMMON_FILE and _ALPAKA_ROOT_DIR, set earlier in this file, are not cleared here - confirm that is intended.
+if(NOT _ALPAKA_FOUND)
+    unset(_ALPAKA_FOUND)
+    unset(_ALPAKA_COMPILE_OPTIONS_PUBLIC)
+    unset(_ALPAKA_COMPILE_DEFINITIONS_HIP)
+    unset(_ALPAKA_HIP_LIBRARIES)
+    unset(_ALPAKA_INCLUDE_DIRECTORY)
+    unset(_ALPAKA_ADD_EXECUTABLE_FILE)
+    unset(_ALPAKA_ADD_LIBRARY_FILE)
+    unset(_ALPAKA_BOOST_MIN_VER)
+else()
+    # Make internal variables advanced options in the GUI (mark_as_advanced acts on cache entries; the names below are set as normal variables in this file).
+    mark_as_advanced(
+        _ALPAKA_COMPILE_OPTIONS_PUBLIC
+        _ALPAKA_INCLUDE_DIRECTORY
+        _ALPAKA_COMMON_FILE
+        _ALPAKA_ADD_EXECUTABLE_FILE
+        _ALPAKA_ADD_LIBRARY_FILE
+        _ALPAKA_BOOST_MIN_VER)
+endif()
diff --git a/thirdParty/cupla/alpaka/cmake/common.cmake b/thirdParty/cupla/alpaka/cmake/common.cmake
index af212c1c2e..6041cbf187 100644
--- a/thirdParty/cupla/alpaka/cmake/common.cmake
+++ b/thirdParty/cupla/alpaka/cmake/common.cmake
@@ -1,7 +1,7 @@
 #
 # Copyright 2014-2019 Benjamin Worpitz
 #
-# This file is part of Alpaka.
+# This file is part of alpaka.
 #
 # This Source Code Form is subject to the terms of the Mozilla Public
 # License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -194,9 +194,6 @@ FUNCTION(list_add_prefix In_Prefix In_ListVariableName)
     FOREACH(
         item
         IN LISTS ${In_ListVariableName})
-        IF(POLICY CMP0054)
-            CMAKE_POLICY(SET CMP0054 NEW)   # Only interpret if() arguments as variables or keywords when unquoted.
-        ENDIF()
         IF(NOT "${item}" STREQUAL "")
             LIST(
                 APPEND
diff --git a/thirdParty/cupla/alpaka/cmake/dev.cmake b/thirdParty/cupla/alpaka/cmake/dev.cmake
deleted file mode 100644
index e1ca9d99ca..0000000000
--- a/thirdParty/cupla/alpaka/cmake/dev.cmake
+++ /dev/null
@@ -1,146 +0,0 @@
-#
-# Copyright 2014-2019 Benjamin Worpitz
-#
-# This file is part of Alpaka.
-#
-# This Source Code Form is subject to the terms of the Mozilla Public
-# License, v. 2.0. If a copy of the MPL was not distributed with this
-# file, You can obtain one at http://mozilla.org/MPL/2.0/.
-#
-
-#-------------------------------------------------------------------------------
-# Compiler settings.
-#-------------------------------------------------------------------------------
-# By marking the boost headers as system headers, warnings produced within them are ignored.
-# Marking the boost headers as system headers does not work for nvcc (FindCUDA always uses -I)
-TARGET_INCLUDE_DIRECTORIES(
-    "alpaka"
-    SYSTEM
-    INTERFACE ${Boost_INCLUDE_DIRS})
-
-#MSVC
-IF(MSVC)
-    # Force to always compile with W4 and WX
-    LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "/W4")
-    LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "/WX")
-    # Improve debugging.
-    IF(CMAKE_BUILD_TYPE MATCHES "Debug")
-        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-d2Zi+")
-    ENDIF()
-    IF(MSVC_VERSION GREATER 1900)
-        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "/permissive-")
-        IF(MSVC_VERSION GREATER 1910)
-            LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "/Zc:twoPhase-")
-        ENDIF()
-    ENDIF()
-    IF(MSVC_VERSION GREATER 1800)
-        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "/Zc:throwingNew" "/Zc:strictStrings")
-    ENDIF()
-ELSE()
-  IF(NOT(ALPAKA_ACC_GPU_CUDA_ENABLE) OR ALPAKA_CUDA_COMPILER MATCHES "clang"
-      OR(ALPAKA_ACC_GPU_HIP_ENABLE AND HIP_PLATFORM MATCHES "nvcc"))
-    # GNU
-    IF(CMAKE_COMPILER_IS_GNUCXX)
-        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wall")
-        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wextra")
-        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-pedantic")
-        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Werror")
-        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wdouble-promotion")
-        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wmissing-include-dirs")
-        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wunknown-pragmas")
-        # Higher levels (max is 5) produce some strange warnings
-        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wstrict-overflow=2")
-        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wtrampolines")
-        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wfloat-equal")
-        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wundef")
-        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wshadow")
-        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wcast-qual")
-        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wcast-align")
-        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wwrite-strings")
-        # Too noisy as it warns for every operation using numeric types smaller then int.
-        # Such values are converted to int implicitly before the calculation is done.
-        # E.g.: uint16_t = uint16_t * uint16_t will trigger the following warning:
-        # conversion to ‘short unsigned int’ from ‘int’ may alter its value
-        #LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wconversion")
-        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wsign-conversion")
-        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wvector-operation-performance")
-        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wzero-as-null-pointer-constant")
-        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wdate-time")
-        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wuseless-cast")
-        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wlogical-op")
-        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wno-aggressive-loop-optimizations")
-        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wmissing-declarations")
-        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wno-multichar")
-        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wopenmp-simd")
-        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wpacked")
-        # Too much noise
-        #LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wpadded")
-        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wredundant-decls")
-        # Too much noise
-        #LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Winline")
-        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wdisabled-optimization")
-        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wformat-nonliteral")
-        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wformat-security")
-        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wformat-y2k")
-        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wctor-dtor-privacy")
-        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wdelete-non-virtual-dtor")
-        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wliteral-suffix")
-        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wnon-virtual-dtor")
-        # This warns about members that have not explicitly been listed in the constructor initializer list.
-        # This could be useful even for members that have a default constructor.
-        # However, it also issues this warning for defaulted constructurs.
-        #LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Weffc++")
-        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Woverloaded-virtual")
-        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wsign-promo")
-        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wconditionally-supported")
-        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wnoexcept")
-        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wold-style-cast")
-        IF(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 5.0)
-            LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wsuggest-final-types")
-            LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wsuggest-final-methods")
-            # This does not work correctly as it suggests override to methods that are already marked with final.
-            # Because final implies override, this is not useful.
-            #LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wsuggest-override")
-            LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wnormalized")
-            LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wformat-signedness")
-        ENDIF()
-        IF(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 6.0)
-            LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wnull-dereference")
-            LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wduplicated-cond")
-            LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wsubobject-linkage")
-        ENDIF()
-        IF(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 7.0)
-            # This warning might be useful but it is triggered by comile-time code where it does not make any sense:
-            # E.g. "vec::Vec<dim::DimInt<(TidxDimOut < TidxDimIn) ? TidxDimIn : TidxDimOut>, TElem>" when both values are equal
-            #LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wduplicated-branches")
-            LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Walloc-zero")
-            LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Walloca")
-        ENDIF()
-        IF(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 8.0)
-            LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wcast-align=strict")
-        ENDIF()
-
-    # Clang or AppleClang
-    ELSEIF(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
-        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Werror")
-        # Weverything really means everything (including Wall, Wextra, pedantic, ...)
-        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Weverything")
-        # We are not C++98 compatible (we use C++11 features)
-        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wno-c++98-compat")
-        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wno-c++98-compat-pedantic")
-        # The following warnings are triggered by all instantiations of BOOST_AUTO_TEST_SUITE
-        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wno-disabled-macro-expansion")
-        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wno-global-constructors")
-        # This padding warning is generated by the execution tasks depending on the argument types
-        # as they are stored as members. Therefore, the padding warning is triggered by the calling code
-        # and does not indicate a failure within alpaka.
-        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wno-padded")
-    # ICC
-    ELSEIF(${CMAKE_CXX_COMPILER_ID} STREQUAL "Intel")
-        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wall")
-    # PGI
-    ELSEIF(${CMAKE_CXX_COMPILER_ID} STREQUAL "PGI")
-        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Minform=inform")
-    ENDIF()
-  ENDIF()
-ENDIF()
diff --git a/thirdParty/cupla/alpaka/cmake/modules/FindHIP.cmake b/thirdParty/cupla/alpaka/cmake/modules/FindHIP.cmake
deleted file mode 100644
index dd55e18228..0000000000
--- a/thirdParty/cupla/alpaka/cmake/modules/FindHIP.cmake
+++ /dev/null
@@ -1,601 +0,0 @@
-# /*
-# Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
-
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
-
-# The above copyright notice and this permission notice shall be included in
-# all copies or substantial portions of the Software.
-
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-# THE SOFTWARE.
-# */
-
-###############################################################################
-# FindHIP.cmake
-###############################################################################
-
-###############################################################################
-# SET: Variable defaults
-###############################################################################
-# User defined flags
-set(HIP_HIPCC_FLAGS "" CACHE STRING "Semicolon delimited flags for HIPCC")
-set(HIP_HCC_FLAGS "" CACHE STRING "Semicolon delimited flags for HCC")
-set(HIP_NVCC_FLAGS "" CACHE STRING "Semicolon delimted flags for NVCC")
-mark_as_advanced(HIP_HIPCC_FLAGS HIP_HCC_FLAGS HIP_NVCC_FLAGS)
-set(_hip_configuration_types ${CMAKE_CONFIGURATION_TYPES} ${CMAKE_BUILD_TYPE} Debug MinSizeRel Release RelWithDebInfo)
-list(REMOVE_DUPLICATES _hip_configuration_types)
-foreach(config ${_hip_configuration_types})
-    string(TOUPPER ${config} config_upper)
-    set(HIP_HIPCC_FLAGS_${config_upper} "" CACHE STRING "Semicolon delimited flags for HIPCC")
-    set(HIP_HCC_FLAGS_${config_upper} "" CACHE STRING "Semicolon delimited flags for HCC")
-    set(HIP_NVCC_FLAGS_${config_upper} "" CACHE STRING "Semicolon delimited flags for NVCC")
-    mark_as_advanced(HIP_HIPCC_FLAGS_${config_upper} HIP_HCC_FLAGS_${config_upper} HIP_NVCC_FLAGS_${config_upper})
-endforeach()
-option(HIP_HOST_COMPILATION_CPP "Host code compilation mode" ON)
-option(HIP_VERBOSE_BUILD "Print out the commands run while compiling the HIP source file.  With the Makefile generator this defaults to VERBOSE variable specified on the command line, but can be forced on with this option." OFF)
-mark_as_advanced(HIP_HOST_COMPILATION_CPP)
-
-###############################################################################
-# Set HIP CMAKE Flags
-###############################################################################
-# Copy the invocation styles from CXX to HIP
-set(CMAKE_HIP_ARCHIVE_CREATE ${CMAKE_CXX_ARCHIVE_CREATE})
-set(CMAKE_HIP_ARCHIVE_APPEND ${CMAKE_CXX_ARCHIVE_APPEND})
-set(CMAKE_HIP_ARCHIVE_FINISH ${CMAKE_CXX_ARCHIVE_FINISH})
-set(CMAKE_SHARED_LIBRARY_SONAME_HIP_FLAG ${CMAKE_SHARED_LIBRARY_SONAME_CXX_FLAG})
-set(CMAKE_SHARED_LIBRARY_CREATE_HIP_FLAGS ${CMAKE_SHARED_LIBRARY_CREATE_CXX_FLAGS})
-set(CMAKE_SHARED_LIBRARY_HIP_FLAGS ${CMAKE_SHARED_LIBRARY_CXX_FLAGS})
-#set(CMAKE_SHARED_LIBRARY_LINK_HIP_FLAGS ${CMAKE_SHARED_LIBRARY_LINK_CXX_FLAGS})
-set(CMAKE_SHARED_LIBRARY_RUNTIME_HIP_FLAG ${CMAKE_SHARED_LIBRARY_RUNTIME_CXX_FLAG})
-set(CMAKE_SHARED_LIBRARY_RUNTIME_HIP_FLAG_SEP ${CMAKE_SHARED_LIBRARY_RUNTIME_CXX_FLAG_SEP})
-set(CMAKE_SHARED_LIBRARY_LINK_STATIC_HIP_FLAGS ${CMAKE_SHARED_LIBRARY_LINK_STATIC_CXX_FLAGS})
-set(CMAKE_SHARED_LIBRARY_LINK_DYNAMIC_HIP_FLAGS ${CMAKE_SHARED_LIBRARY_LINK_DYNAMIC_CXX_FLAGS})
-
-# Set the CMake Flags to use the HCC Compilier.
-set(CMAKE_HIP_CREATE_SHARED_LIBRARY "${HIP_HIPCC_CMAKE_LINKER_HELPER} ${HCC_PATH} <CMAKE_SHARED_LIBRARY_CXX_FLAGS> <LANGUAGE_COMPILE_FLAGS> <LINK_FLAGS> <CMAKE_SHARED_LIBRARY_CREATE_CXX_FLAGS> <SONAME_FLAG><TARGET_SONAME> -o <TARGET> <OBJECTS> <LINK_LIBRARIES>")
-set(CMAKE_HIP_CREATE_SHARED_MODULE "${HIP_HIPCC_CMAKE_LINKER_HELPER} ${HCC_PATH} <CMAKE_CXX_LINK_FLAGS> <LINK_FLAGS> <OBJECTS> <SONAME_FLAG><TARGET_SONAME> -o <TARGET> <LINK_LIBRARIES> -shared" )
-set(CMAKE_HIP_LINK_EXECUTABLE "${HIP_HIPCC_CMAKE_LINKER_HELPER} ${HCC_PATH} <FLAGS> <CMAKE_CXX_LINK_FLAGS> <LINK_FLAGS> <OBJECTS> -o <TARGET> <LINK_LIBRARIES>")
-
-###############################################################################
-# FIND: HIP and associated helper binaries
-###############################################################################
-# HIP is supported on Linux only
-if(UNIX AND NOT APPLE AND NOT CYGWIN)
-    # Search for HIP installation
-    if(NOT HIP_ROOT_DIR)
-        # Search in user specified path first
-        find_path(
-            HIP_ROOT_DIR
-            NAMES hipconfig
-            PATHS
-            ENV ROCM_PATH
-            ENV HIP_PATH
-            PATH_SUFFIXES bin
-            DOC "HIP installed location"
-            NO_DEFAULT_PATH
-            )
-        # Now search in default path
-        find_path(
-            HIP_ROOT_DIR
-            NAMES hipconfig
-            PATHS
-            /opt/rocm
-            /opt/rocm/hip
-            PATH_SUFFIXES bin
-            DOC "HIP installed location"
-            )
-
-        # Check if we found HIP installation
-        if(HIP_ROOT_DIR)
-            # If so, fix the path
-            string(REGEX REPLACE "[/\\\\]?bin[64]*[/\\\\]?$" "" HIP_ROOT_DIR ${HIP_ROOT_DIR})
-            # And push it back to the cache
-            set(HIP_ROOT_DIR ${HIP_ROOT_DIR} CACHE PATH "HIP installed location" FORCE)
-        endif()
-        if(NOT EXISTS ${HIP_ROOT_DIR})
-            if(HIP_FIND_REQUIRED)
-                message(FATAL_ERROR "Specify HIP_ROOT_DIR")
-            elseif(NOT HIP_FIND_QUIETLY)
-                message("HIP_ROOT_DIR not found or specified")
-            endif()
-        endif()
-    endif()
-
-    # Find HIPCC executable
-    find_program(
-        HIP_HIPCC_EXECUTABLE
-        NAMES hipcc
-        PATHS
-        "${HIP_ROOT_DIR}"
-        ENV ROCM_PATH
-        ENV HIP_PATH
-        /opt/rocm
-        /opt/rocm/hip
-        PATH_SUFFIXES bin
-        NO_DEFAULT_PATH
-        )
-    if(NOT HIP_HIPCC_EXECUTABLE)
-        # Now search in default paths
-        find_program(HIP_HIPCC_EXECUTABLE hipcc)
-    endif()
-    mark_as_advanced(HIP_HIPCC_EXECUTABLE)
-
-    # Find HIPCONFIG executable
-    find_program(
-        HIP_HIPCONFIG_EXECUTABLE
-        NAMES hipconfig
-        PATHS
-        "${HIP_ROOT_DIR}"
-        ENV ROCM_PATH
-        ENV HIP_PATH
-        /opt/rocm
-        /opt/rocm/hip
-        PATH_SUFFIXES bin
-        NO_DEFAULT_PATH
-        )
-    if(NOT HIP_HIPCONFIG_EXECUTABLE)
-        # Now search in default paths
-        find_program(HIP_HIPCONFIG_EXECUTABLE hipconfig)
-    endif()
-    mark_as_advanced(HIP_HIPCONFIG_EXECUTABLE)
-
-    # Find HIPCC_CMAKE_LINKER_HELPER executable
-    find_program(
-        HIP_HIPCC_CMAKE_LINKER_HELPER
-        NAMES hipcc_cmake_linker_helper
-        PATHS
-        "${HIP_ROOT_DIR}"
-        ENV ROCM_PATH
-        ENV HIP_PATH
-        /opt/rocm
-        /opt/rocm/hip
-        PATH_SUFFIXES bin
-        NO_DEFAULT_PATH
-        )
-    if(NOT HIP_HIPCC_CMAKE_LINKER_HELPER)
-        # Now search in default paths
-        find_program(HIP_HIPCC_CMAKE_LINKER_HELPER hipcc_cmake_linker_helper)
-    endif()
-    mark_as_advanced(HIP_HIPCC_CMAKE_LINKER_HELPER)
-
-    if(HIP_HIPCONFIG_EXECUTABLE AND NOT HIP_VERSION)
-        # Compute the version
-        execute_process(
-            COMMAND ${HIP_HIPCONFIG_EXECUTABLE} --version
-            OUTPUT_VARIABLE _hip_version
-            ERROR_VARIABLE _hip_error
-            OUTPUT_STRIP_TRAILING_WHITESPACE
-            ERROR_STRIP_TRAILING_WHITESPACE
-            )
-        if(NOT _hip_error)
-            set(HIP_VERSION ${_hip_version} CACHE STRING "Version of HIP as computed from hipcc")
-        else()
-            set(HIP_VERSION "0.0.0" CACHE STRING "Version of HIP as computed by FindHIP()")
-        endif()
-        mark_as_advanced(HIP_VERSION)
-    endif()
-    if(HIP_VERSION)
-        string(REPLACE "." ";" _hip_version_list "${HIP_VERSION}")
-        list(GET _hip_version_list 0 HIP_VERSION_MAJOR)
-        list(GET _hip_version_list 1 HIP_VERSION_MINOR)
-        list(GET _hip_version_list 2 HIP_VERSION_PATCH)
-        set(HIP_VERSION_STRING "${HIP_VERSION}")
-    endif()
-
-    if(HIP_HIPCONFIG_EXECUTABLE AND NOT HIP_PLATFORM)
-        # Compute the platform
-        execute_process(
-            COMMAND ${HIP_HIPCONFIG_EXECUTABLE} --platform
-            OUTPUT_VARIABLE _hip_platform
-            OUTPUT_STRIP_TRAILING_WHITESPACE
-            )
-        set(HIP_PLATFORM ${_hip_platform} CACHE STRING "HIP platform as computed by hipconfig")
-        mark_as_advanced(HIP_PLATFORM)
-    endif()
-endif()
-
-include(FindPackageHandleStandardArgs)
-find_package_handle_standard_args(
-    HIP
-    REQUIRED_VARS
-    HIP_ROOT_DIR
-    HIP_HIPCC_EXECUTABLE
-    HIP_HIPCONFIG_EXECUTABLE
-    HIP_PLATFORM
-    VERSION_VAR HIP_VERSION
-    )
-
-###############################################################################
-# MACRO: Locate helper files
-###############################################################################
-macro(HIP_FIND_HELPER_FILE _name _extension)
-    set(_hip_full_name "${_name}.${_extension}")
-    get_filename_component(CMAKE_CURRENT_LIST_DIR "${CMAKE_CURRENT_LIST_FILE}" PATH)
-    set(HIP_${_name} "${CMAKE_CURRENT_LIST_DIR}/FindHIP/${_hip_full_name}")
-    if(NOT EXISTS "${HIP_${_name}}")
-        set(error_message "${_hip_full_name} not found in ${CMAKE_CURRENT_LIST_DIR}/FindHIP")
-        if(HIP_FIND_REQUIRED)
-            message(FATAL_ERROR "${error_message}")
-        else()
-            if(NOT HIP_FIND_QUIETLY)
-                message(STATUS "${error_message}")
-            endif()
-        endif()
-    endif()
-    # Set this variable as internal, so the user isn't bugged with it.
-    set(HIP_${_name} ${HIP_${_name}} CACHE INTERNAL "Location of ${_full_name}" FORCE)
-endmacro()
-
-###############################################################################
-hip_find_helper_file(run_make2cmake cmake)
-hip_find_helper_file(run_hipcc cmake)
-###############################################################################
-
-###############################################################################
-# MACRO: Reset compiler flags
-###############################################################################
-macro(HIP_RESET_FLAGS)
-    unset(HIP_HIPCC_FLAGS)
-    unset(HIP_HCC_FLAGS)
-    unset(HIP_NVCC_FLAGS)
-    foreach(config ${_hip_configuration_types})
-        string(TOUPPER ${config} config_upper)
-        unset(HIP_HIPCC_FLAGS_${config_upper})
-        unset(HIP_HCC_FLAGS_${config_upper})
-        unset(HIP_NVCC_FLAGS_${config_upper})
-    endforeach()
-endmacro()
-
-###############################################################################
-# MACRO: Separate the options from the sources
-###############################################################################
-macro(HIP_GET_SOURCES_AND_OPTIONS _sources _cmake_options _hipcc_options _hcc_options _nvcc_options)
-    set(${_sources})
-    set(${_cmake_options})
-    set(${_hipcc_options})
-    set(${_hcc_options})
-    set(${_nvcc_options})
-    set(_hipcc_found_options FALSE)
-    set(_hcc_found_options FALSE)
-    set(_nvcc_found_options FALSE)
-    foreach(arg ${ARGN})
-        if("x${arg}" STREQUAL "xHIPCC_OPTIONS")
-            set(_hipcc_found_options TRUE)
-            set(_hcc_found_options FALSE)
-            set(_nvcc_found_options FALSE)
-        elseif("x${arg}" STREQUAL "xHCC_OPTIONS")
-            set(_hipcc_found_options FALSE)
-            set(_hcc_found_options TRUE)
-            set(_nvcc_found_options FALSE)
-        elseif("x${arg}" STREQUAL "xNVCC_OPTIONS")
-            set(_hipcc_found_options FALSE)
-            set(_hcc_found_options FALSE)
-            set(_nvcc_found_options TRUE)
-        elseif(
-                "x${arg}" STREQUAL "xEXCLUDE_FROM_ALL" OR
-                "x${arg}" STREQUAL "xSTATIC" OR
-                "x${arg}" STREQUAL "xSHARED" OR
-                "x${arg}" STREQUAL "xMODULE"
-                )
-            list(APPEND ${_cmake_options} ${arg})
-        else()
-            if(_hipcc_found_options)
-                list(APPEND ${_hipcc_options} ${arg})
-            elseif(_hcc_found_options)
-                list(APPEND ${_hcc_options} ${arg})
-            elseif(_nvcc_found_options)
-                list(APPEND ${_nvcc_options} ${arg})
-            else()
-                # Assume this is a file
-                list(APPEND ${_sources} ${arg})
-            endif()
-        endif()
-    endforeach()
-endmacro()
-
-###############################################################################
-# MACRO: Add include directories to pass to the hipcc command
-###############################################################################
-set(HIP_HIPCC_INCLUDE_ARGS_USER "")
-macro(HIP_INCLUDE_DIRECTORIES)
-    foreach(dir ${ARGN})
-        list(APPEND HIP_HIPCC_INCLUDE_ARGS_USER $<$<BOOL:${dir}>:-I${dir}>)
-    endforeach()
-endmacro()
-
-###############################################################################
-# FUNCTION: Helper to avoid clashes of files with the same basename but different paths
-###############################################################################
-function(HIP_COMPUTE_BUILD_PATH path build_path)
-    # Convert to cmake style paths
-    file(TO_CMAKE_PATH "${path}" bpath)
-    if(IS_ABSOLUTE "${bpath}")
-        string(FIND "${bpath}" "${CMAKE_CURRENT_BINARY_DIR}" _binary_dir_pos)
-        if(_binary_dir_pos EQUAL 0)
-            file(RELATIVE_PATH bpath "${CMAKE_CURRENT_BINARY_DIR}" "${bpath}")
-        else()
-            file(RELATIVE_PATH bpath "${CMAKE_CURRENT_SOURCE_DIR}" "${bpath}")
-        endif()
-    endif()
-
-    # Remove leading /
-    string(REGEX REPLACE "^[/]+" "" bpath "${bpath}")
-    # Avoid absolute paths by removing ':'
-    string(REPLACE ":" "_" bpath "${bpath}")
-    # Avoid relative paths that go up the tree
-    string(REPLACE "../" "__/" bpath "${bpath}")
-    # Avoid spaces
-    string(REPLACE " " "_" bpath "${bpath}")
-    # Strip off the filename
-    get_filename_component(bpath "${bpath}" PATH)
-
-    set(${build_path} "${bpath}" PARENT_SCOPE)
-endfunction()
-
-###############################################################################
-# MACRO: Parse OPTIONS from ARGN & set variables prefixed by _option_prefix
-###############################################################################
-macro(HIP_PARSE_HIPCC_OPTIONS _option_prefix)
-    set(_hip_found_config)
-    foreach(arg ${ARGN})
-        # Determine if we are dealing with a per-configuration flag
-        foreach(config ${_hip_configuration_types})
-            string(TOUPPER ${config} config_upper)
-            if(arg STREQUAL "${config_upper}")
-                set(_hip_found_config _${arg})
-                # Clear arg to prevent it from being processed anymore
-                set(arg)
-            endif()
-        endforeach()
-        if(arg)
-            list(APPEND ${_option_prefix}${_hip_found_config} "${arg}")
-        endif()
-    endforeach()
-endmacro()
-
-###############################################################################
-# MACRO: Try and include dependency file if it exists
-###############################################################################
-macro(HIP_INCLUDE_HIPCC_DEPENDENCIES dependency_file)
-    set(HIP_HIPCC_DEPEND)
-    set(HIP_HIPCC_DEPEND_REGENERATE FALSE)
-
-    # Create the dependency file if it doesn't exist
-    if(NOT EXISTS ${dependency_file})
-        file(WRITE ${dependency_file} "# Generated by: FindHIP.cmake. Do not edit.\n")
-    endif()
-    # Include the dependency file
-    include(${dependency_file})
-
-    # Verify the existence of all the included files
-    if(HIP_HIPCC_DEPEND)
-        foreach(f ${HIP_HIPCC_DEPEND})
-            if(NOT EXISTS ${f})
-                # If they aren't there, regenerate the file again
-                set(HIP_HIPCC_DEPEND_REGENERATE TRUE)
-            endif()
-        endforeach()
-    else()
-        # No dependencies, so regenerate the file
-        set(HIP_HIPCC_DEPEND_REGENERATE TRUE)
-    endif()
-
-    # Regenerate the dependency file if needed
-    if(HIP_HIPCC_DEPEND_REGENERATE)
-        set(HIP_HIPCC_DEPEND ${dependency_file})
-        file(WRITE ${dependency_file} "# Generated by: FindHIP.cmake. Do not edit.\n")
-    endif()
-endmacro()
-
-###############################################################################
-# MACRO: Prepare cmake commands for the target
-###############################################################################
-macro(HIP_PREPARE_TARGET_COMMANDS _target _format _generated_files _source_files)
-    set(_hip_flags "")
-    string(TOUPPER "${CMAKE_BUILD_TYPE}" _hip_build_configuration)
-    if(HIP_HOST_COMPILATION_CPP)
-        set(HIP_C_OR_CXX CXX)
-    else()
-        set(HIP_C_OR_CXX C)
-    endif()
-    set(generated_extension ${CMAKE_${HIP_C_OR_CXX}_OUTPUT_EXTENSION})
-
-    # Initialize list of includes with those specified by the user. Append with
-    # ones specified to cmake directly.
-    set(HIP_HIPCC_INCLUDE_ARGS ${HIP_HIPCC_INCLUDE_ARGS_USER})
-
-    # Add the include directories
-    set(include_directories_generator "$<TARGET_PROPERTY:${_target},INCLUDE_DIRECTORIES>")
-    list(APPEND HIP_HIPCC_INCLUDE_ARGS "$<$<BOOL:${include_directories_generator}>:-I$<JOIN:${include_directories_generator}, -I>>")
-
-    get_directory_property(_hip_include_directories INCLUDE_DIRECTORIES)
-    list(REMOVE_DUPLICATES _hip_include_directories)
-    if(_hip_include_directories)
-        foreach(dir ${_hip_include_directories})
-            list(APPEND HIP_HIPCC_INCLUDE_ARGS $<$<BOOL:${dir}>:-I${dir}>)
-        endforeach()
-    endif()
-
-    HIP_GET_SOURCES_AND_OPTIONS(_hip_sources _hip_cmake_options _hipcc_options _hcc_options _nvcc_options ${ARGN})
-    HIP_PARSE_HIPCC_OPTIONS(HIP_HIPCC_FLAGS ${_hipcc_options})
-    HIP_PARSE_HIPCC_OPTIONS(HIP_HCC_FLAGS ${_hcc_options})
-    HIP_PARSE_HIPCC_OPTIONS(HIP_NVCC_FLAGS ${_nvcc_options})
-
-    # Add the compile definitions
-    set(compile_definition_generator "$<TARGET_PROPERTY:${_target},COMPILE_DEFINITIONS>")
-    list(APPEND HIP_HIPCC_FLAGS "$<$<BOOL:${compile_definition_generator}>:-D$<JOIN:${compile_definition_generator}, -D>>")
-
-    # Check if we are building shared library.
-    set(_hip_build_shared_libs FALSE)
-    list(FIND _hip_cmake_options SHARED _hip_found_SHARED)
-    list(FIND _hip_cmake_options MODULE _hip_found_MODULE)
-    if(_hip_found_SHARED GREATER -1 OR _hip_found_MODULE GREATER -1)
-        set(_hip_build_shared_libs TRUE)
-    endif()
-    list(FIND _hip_cmake_options STATIC _hip_found_STATIC)
-    if(_hip_found_STATIC GREATER -1)
-        set(_hip_build_shared_libs FALSE)
-    endif()
-
-    # If we are building a shared library, add extra flags to HIP_HIPCC_FLAGS
-    if(_hip_build_shared_libs)
-        list(APPEND HIP_HCC_FLAGS "-fPIC")
-        list(APPEND HIP_NVCC_FLAGS "--shared -Xcompiler '-fPIC'")
-    endif()
-
-    # Set host compiler
-    set(HIP_HOST_COMPILER "${CMAKE_${HIP_C_OR_CXX}_COMPILER}")
-
-    # Set compiler flags
-    set(_HIP_HOST_FLAGS "set(CMAKE_HOST_FLAGS ${CMAKE_${HIP_C_OR_CXX}_FLAGS})")
-    set(_HIP_HIPCC_FLAGS "set(HIP_HIPCC_FLAGS ${HIP_HIPCC_FLAGS})")
-    set(_HIP_HCC_FLAGS "set(HIP_HCC_FLAGS ${HIP_HCC_FLAGS})")
-    set(_HIP_NVCC_FLAGS "set(HIP_NVCC_FLAGS ${HIP_NVCC_FLAGS})")
-    foreach(config ${_hip_configuration_types})
-        string(TOUPPER ${config} config_upper)
-        set(_HIP_HOST_FLAGS "${_HIP_HOST_FLAGS}\nset(CMAKE_HOST_FLAGS_${config_upper} ${CMAKE_${HIP_C_OR_CXX}_FLAGS_${config_upper}})")
-        set(_HIP_HIPCC_FLAGS "${_HIP_HIPCC_FLAGS}\nset(HIP_HIPCC_FLAGS_${config_upper} ${HIP_HIPCC_FLAGS_${config_upper}})")
-        set(_HIP_HCC_FLAGS "${_HIP_HCC_FLAGS}\nset(HIP_HCC_FLAGS_${config_upper} ${HIP_HCC_FLAGS_${config_upper}})")
-        set(_HIP_NVCC_FLAGS "${_HIP_NVCC_FLAGS}\nset(HIP_NVCC_FLAGS_${config_upper} ${HIP_NVCC_FLAGS_${config_upper}})")
-    endforeach()
-
-    # Reset the output variable
-    set(_hip_generated_files "")
-    set(_hip_source_files "")
-
-    # Iterate over all arguments and create custom commands for all source files
-    foreach(file ${ARGN})
-        # Ignore any file marked as a HEADER_FILE_ONLY
-        get_source_file_property(_is_header ${file} HEADER_FILE_ONLY)
-        # Allow per source file overrides of the format. Also allows compiling non .cu files.
-        get_source_file_property(_hip_source_format ${file} HIP_SOURCE_PROPERTY_FORMAT)
-        if((${file} MATCHES "\\.cu$" OR _hip_source_format) AND NOT _is_header)
-            set(host_flag FALSE)
-        else()
-            set(host_flag TRUE)
-        endif()
-
-        if(NOT host_flag)
-            # Determine output directory
-            HIP_COMPUTE_BUILD_PATH("${file}" hip_build_path)
-            set(hip_compile_output_dir "${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/${_target}.dir/${hip_build_path}")
-
-            get_filename_component(basename ${file} NAME)
-            set(generated_file_path "${hip_compile_output_dir}/${CMAKE_CFG_INTDIR}")
-            set(generated_file_basename "${_target}_generated_${basename}${generated_extension}")
-
-            # Set file names
-            set(generated_file "${generated_file_path}/${generated_file_basename}")
-            set(cmake_dependency_file "${hip_compile_output_dir}/${generated_file_basename}.depend")
-            set(custom_target_script_pregen "${hip_compile_output_dir}/${generated_file_basename}.cmake.pre-gen")
-            set(custom_target_script "${hip_compile_output_dir}/${generated_file_basename}.cmake")
-
-            # Set properties for object files
-            set_source_files_properties("${generated_file}"
-                PROPERTIES
-                EXTERNAL_OBJECT true # This is an object file not to be compiled, but only be linked
-                )
-
-            # Don't add CMAKE_CURRENT_SOURCE_DIR if the path is already an absolute path
-            get_filename_component(file_path "${file}" PATH)
-            if(IS_ABSOLUTE "${file_path}")
-                set(source_file "${file}")
-            else()
-                set(source_file "${CMAKE_CURRENT_SOURCE_DIR}/${file}")
-            endif()
-
-            # Bring in the dependencies
-            HIP_INCLUDE_HIPCC_DEPENDENCIES(${cmake_dependency_file})
-
-            # Configure the build script
-            configure_file("${HIP_run_hipcc}" "${custom_target_script_pregen}" @ONLY)
-            file(GENERATE
-                OUTPUT "${custom_target_script}"
-                INPUT "${custom_target_script_pregen}"
-                )
-            set(main_dep DEPENDS ${source_file})
-            if(CMAKE_GENERATOR MATCHES "Makefiles")
-                set(verbose_output "$(VERBOSE)")
-            elseif(HIP_VERBOSE_BUILD)
-                set(verbose_output ON)
-            else()
-                set(verbose_output OFF)
-            endif()
-
-            # Create up the comment string
-            file(RELATIVE_PATH generated_file_relative_path "${CMAKE_BINARY_DIR}" "${generated_file}")
-            set(hip_build_comment_string "Building HIPCC object ${generated_file_relative_path}")
-
-            # Build the generated file and dependency file
-            add_custom_command(
-                OUTPUT ${generated_file}
-                # These output files depend on the source_file and the contents of cmake_dependency_file
-                ${main_dep}
-                DEPENDS ${HIP_HIPCC_DEPEND}
-                DEPENDS ${custom_target_script}
-                # Make sure the output directory exists before trying to write to it.
-                COMMAND ${CMAKE_COMMAND} -E make_directory "${generated_file_path}"
-                COMMAND ${CMAKE_COMMAND} ARGS
-                -D verbose:BOOL=${verbose_output}
-                -D build_configuration:STRING=${_hip_build_configuration}
-                -D "generated_file:STRING=${generated_file}"
-                -P "${custom_target_script}"
-                WORKING_DIRECTORY "${hip_compile_output_dir}"
-                COMMENT "${hip_build_comment_string}"
-                )
-
-            # Make sure the build system knows the file is generated
-            set_source_files_properties(${generated_file} PROPERTIES GENERATED TRUE)
-            list(APPEND _hip_generated_files ${generated_file})
-            list(APPEND _hip_source_files ${file})
-        endif()
-    endforeach()
-
-    # Set the return parameter
-    set(${_generated_files} ${_hip_generated_files})
-    set(${_source_files} ${_hip_source_files})
-endmacro()
-
-###############################################################################
-# HIP_ADD_EXECUTABLE
-###############################################################################
-macro(HIP_ADD_EXECUTABLE hip_target)
-    # Separate the sources from the options
-    HIP_GET_SOURCES_AND_OPTIONS(_sources _cmake_options _hipcc_options _hcc_options _nvcc_options ${ARGN})
-    HIP_PREPARE_TARGET_COMMANDS(${hip_target} OBJ _generated_files _source_files ${_sources} HIPCC_OPTIONS ${_hipcc_options} HCC_OPTIONS ${_hcc_options} NVCC_OPTIONS ${_nvcc_options})
-    if(_source_files)
-        list(REMOVE_ITEM _sources ${_source_files})
-    endif()
-    if("x${HCC_HOME}" STREQUAL "x")
-        set(HCC_HOME "/opt/rocm/hcc")
-    endif()
-    set(CMAKE_HIP_LINK_EXECUTABLE "${HIP_HIPCC_CMAKE_LINKER_HELPER} ${HCC_HOME} <FLAGS> <CMAKE_CXX_LINK_FLAGS> <LINK_FLAGS> <OBJECTS> -o <TARGET> <LINK_LIBRARIES>")
-    add_executable(${hip_target} ${_cmake_options} ${_generated_files} ${_sources})
-    set_target_properties(${hip_target} PROPERTIES LINKER_LANGUAGE HIP)
-endmacro()
-
-###############################################################################
-# HIP_ADD_LIBRARY
-###############################################################################
-macro(HIP_ADD_LIBRARY hip_target)
-    # Separate the sources from the options
-    HIP_GET_SOURCES_AND_OPTIONS(_sources _cmake_options _hipcc_options _hcc_options _nvcc_options ${ARGN})
-    HIP_PREPARE_TARGET_COMMANDS(${hip_target} OBJ _generated_files _source_files ${_sources} ${_cmake_options} HIPCC_OPTIONS ${_hipcc_options} HCC_OPTIONS ${_hcc_options} NVCC_OPTIONS ${_nvcc_options})
-    if(_source_files)
-        list(REMOVE_ITEM _sources ${_source_files})
-    endif()
-    add_library(${hip_target} ${_cmake_options} ${_generated_files} ${_sources})
-    set_target_properties(${hip_target} PROPERTIES LINKER_LANGUAGE ${HIP_C_OR_CXX})
-endmacro()
-
-# vim: ts=4:sw=4:expandtab:smartindent
diff --git a/thirdParty/cupla/alpaka/cmake/modules/FindHIP/run_hipcc.cmake b/thirdParty/cupla/alpaka/cmake/modules/FindHIP/run_hipcc.cmake
deleted file mode 100644
index c9582bdbd4..0000000000
--- a/thirdParty/cupla/alpaka/cmake/modules/FindHIP/run_hipcc.cmake
+++ /dev/null
@@ -1,190 +0,0 @@
-# /*
-# Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
-
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
-
-# The above copyright notice and this permission notice shall be included in
-# all copies or substantial portions of the Software.
-
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-# THE SOFTWARE.
-# */
-
-###############################################################################
-# Runs commands using HIPCC
-###############################################################################
-
-###############################################################################
-# This file runs the hipcc commands to produce the desired output file
-# along with the dependency file needed by CMake to compute dependencies.
-#
-# Input variables:
-#
-# verbose:BOOL=<>               OFF: Be as quiet as possible (default)
-#                               ON : Describe each step
-# build_configuration:STRING=<> Build configuration. Defaults to Debug.
-# generated_file:STRING=<>      File to generate. Mandatory argument.
-
-if(NOT build_configuration)
-    set(build_configuration Debug)
-endif()
-if(NOT generated_file)
-    message(FATAL_ERROR "You must specify generated_file on the command line")
-endif()
-
-# Set these up as variables to make reading the generated file easier
-set(HIP_HIPCC_EXECUTABLE "@HIP_HIPCC_EXECUTABLE@") # path
-set(HIP_HIPCONFIG_EXECUTABLE "@HIP_HIPCONFIG_EXECUTABLE@") #path
-set(HIP_HOST_COMPILER "@HIP_HOST_COMPILER@") # path
-set(CMAKE_COMMAND "@CMAKE_COMMAND@") # path
-set(HIP_run_make2cmake "@HIP_run_make2cmake@") # path
-set(HCC_HOME "@HCC_HOME@") #path
-
-@HIP_HOST_FLAGS@
-@_HIP_HIPCC_FLAGS@
-@_HIP_HCC_FLAGS@
-@_HIP_NVCC_FLAGS@
-set(HIP_HIPCC_INCLUDE_ARGS "@HIP_HIPCC_INCLUDE_ARGS@") # list (needs to be in quotes to handle spaces properly)
-
-set(cmake_dependency_file "@cmake_dependency_file@") # path
-set(source_file "@source_file@") # path
-set(host_flag "@host_flag@") # bool
-
-# Determine compiler and compiler flags
-execute_process(COMMAND ${HIP_HIPCONFIG_EXECUTABLE} --platform OUTPUT_VARIABLE HIP_PLATFORM OUTPUT_STRIP_TRAILING_WHITESPACE)
-if(NOT host_flag)
-    set(__CC ${HIP_HIPCC_EXECUTABLE})
-    if(HIP_PLATFORM STREQUAL "hcc")
-        if(NOT "x${HCC_HOME}" STREQUAL "x")
-            set(ENV{HCC_HOME} ${HCC_HOME})
-        endif()
-        set(__CC_FLAGS ${HIP_HIPCC_FLAGS} ${HIP_HCC_FLAGS} ${HIP_HIPCC_FLAGS_${build_configuration}} ${HIP_HCC_FLAGS_${build_configuration}})
-    else()
-        set(__CC_FLAGS ${HIP_HIPCC_FLAGS} ${HIP_NVCC_FLAGS} ${HIP_HIPCC_FLAGS_${build_configuration}} ${HIP_NVCC_FLAGS_${build_configuration}})
-    endif()
-else()
-    set(__CC ${HIP_HOST_COMPILER})
-    set(__CC_FLAGS ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}})
-endif()
-set(__CC_INCLUDES ${HIP_HIPCC_INCLUDE_ARGS})
-
-# hip_execute_process - Executes a command with optional command echo and status message.
-#   status     - Status message to print if verbose is true
-#   command    - COMMAND argument from the usual execute_process argument structure
-#   ARGN       - Remaining arguments are the command with arguments
-#   HIP_result - Return value from running the command
-macro(hip_execute_process status command)
-    set(_command ${command})
-    if(NOT "x${_command}" STREQUAL "xCOMMAND")
-        message(FATAL_ERROR "Malformed call to hip_execute_process.  Missing COMMAND as second argument. (command = ${command})")
-    endif()
-    if(verbose)
-        execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status})
-        # Build command string to print
-        set(hip_execute_process_string)
-        foreach(arg ${ARGN})
-            # Escape quotes if any
-            string(REPLACE "\"" "\\\"" arg ${arg})
-            # Surround args with spaces with quotes
-            if(arg MATCHES " ")
-                list(APPEND hip_execute_process_string "\"${arg}\"")
-            else()
-                list(APPEND hip_execute_process_string ${arg})
-            endif()
-        endforeach()
-        # Echo the command
-        execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${hip_execute_process_string})
-    endif()
-    # Run the command
-    execute_process(COMMAND ${ARGN} RESULT_VARIABLE HIP_result)
-endmacro()
-
-# Delete the target file
-hip_execute_process(
-    "Removing ${generated_file}"
-    COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
-    )
-
-# Generate the dependency file
-hip_execute_process(
-    "Generating dependency file: ${cmake_dependency_file}.pre"
-    COMMAND "${__CC}"
-    -M
-    "${source_file}"
-    -o "${cmake_dependency_file}.pre"
-    ${__CC_FLAGS}
-    ${__CC_INCLUDES}
-    )
-
-if(HIP_result)
-    message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Generate the cmake readable dependency file to a temp file
-hip_execute_process(
-    "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp"
-    COMMAND "${CMAKE_COMMAND}"
-    -D "input_file:FILEPATH=${cmake_dependency_file}.pre"
-    -D "output_file:FILEPATH=${cmake_dependency_file}.tmp"
-    -D "verbose=${verbose}"
-    -P "${HIP_run_make2cmake}"
-    )
-
-if(HIP_result)
-    message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Copy the file if it is different
-hip_execute_process(
-    "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}"
-    COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}"
-    )
-
-if(HIP_result)
-    message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Delete the temporary file
-hip_execute_process(
-    "Removing ${cmake_dependency_file}.tmp and ${cmake_dependency_file}.pre"
-    COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${cmake_dependency_file}.pre"
-    )
-
-if(HIP_result)
-    message(FATAL_ERROR "Error generating ${generated_file}")
-endif()
-
-# Generate the output file
-hip_execute_process(
-    "Generating ${generated_file}"
-    COMMAND "${__CC}"
-    -c
-    "${source_file}"
-    -o "${generated_file}"
-    ${__CC_FLAGS}
-    ${__CC_INCLUDES}
-    )
-
-if(HIP_result)
-    # Make sure that we delete the output file
-    hip_execute_process(
-        "Removing ${generated_file}"
-        COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
-        )
-    message(FATAL_ERROR "Error generating file ${generated_file}")
-else()
-    if(verbose)
-        message("Generated ${generated_file} successfully.")
-    endif()
-endif()
-# vim: ts=4:sw=4:expandtab:smartindent
diff --git a/thirdParty/cupla/alpaka/cmake/modules/FindHIP/run_make2cmake.cmake b/thirdParty/cupla/alpaka/cmake/modules/FindHIP/run_make2cmake.cmake
deleted file mode 100644
index 48a51fa039..0000000000
--- a/thirdParty/cupla/alpaka/cmake/modules/FindHIP/run_make2cmake.cmake
+++ /dev/null
@@ -1,72 +0,0 @@
-# /*
-# Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
-
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
-
-# The above copyright notice and this permission notice shall be included in
-# all copies or substantial portions of the Software.
-
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-# THE SOFTWARE.
-# */
-
-###############################################################################
-# Computes dependencies using HIPCC
-###############################################################################
-
-###############################################################################
-# This file converts dependency files generated using hipcc to a format that
-# cmake can understand.
-
-# Input variables:
-#
-# input_file:STRING=<> Dependency file to parse. Required argument
-# output_file:STRING=<> Output file to generate. Required argument
-
-if(NOT input_file OR NOT output_file)
-    message(FATAL_ERROR "You must specify input_file and output_file on the command line")
-endif()
-
-file(READ ${input_file} depend_text)
-
-if (NOT "${depend_text}" STREQUAL "")
-    string(REPLACE " /" "\n/" depend_text ${depend_text})
-    string(REGEX REPLACE "^.*:" "" depend_text ${depend_text})
-    string(REGEX REPLACE "[ \\\\]*\n" ";" depend_text ${depend_text})
-
-    set(dependency_list "")
-
-    foreach(file ${depend_text})
-        string(REGEX REPLACE "^ +" "" file ${file})
-        if(NOT EXISTS "${file}")
-            message(WARNING " Removing non-existent dependency file: ${file}")
-            set(file "")
-        endif()
-
-        if(NOT IS_DIRECTORY "${file}")
-            get_filename_component(file_absolute "${file}" ABSOLUTE)
-            list(APPEND dependency_list "${file_absolute}")
-        endif()
-    endforeach()
-endif()
-
-# Remove the duplicate entries and sort them.
-list(REMOVE_DUPLICATES dependency_list)
-list(SORT dependency_list)
-
-foreach(file ${dependency_list})
-    set(hip_hipcc_depend "${hip_hipcc_depend} \"${file}\"\n")
-endforeach()
-
-file(WRITE ${output_file} "# Generated by: FindHIP.cmake. Do not edit.\nSET(HIP_HIPCC_DEPEND\n ${hip_hipcc_depend})\n\n")
-# vim: ts=4:sw=4:expandtab:smartindent
diff --git a/thirdParty/cupla/alpaka/cmake/modules/FindTBB.cmake b/thirdParty/cupla/alpaka/cmake/modules/FindTBB.cmake
index 4cfabee852..f0d468c239 100644
--- a/thirdParty/cupla/alpaka/cmake/modules/FindTBB.cmake
+++ b/thirdParty/cupla/alpaka/cmake/modules/FindTBB.cmake
@@ -1,246 +1,418 @@
-# The MIT License (MIT)
+# - Find ThreadingBuildingBlocks include dirs and libraries
+# Use this module by invoking find_package with the form:
+#  find_package(TBB
+#    [REQUIRED]             # Fail with error if TBB is not found
+#    )                      #
+# Once done, this will define
 #
-# Copyright (c) 2015 Justus Calvin
-# 
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
-# 
-# The above copyright notice and this permission notice shall be included in all
-# copies or substantial portions of the Software.
-# 
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-# SOFTWARE.
-
+#  TBB_FOUND - system has TBB
+#  TBB_INCLUDE_DIRS - the TBB include directories
+#  TBB_LIBRARIES - TBB libraries to be linked, doesn't include malloc or
+#                  malloc proxy
+#  TBB::tbb - imported target for the TBB library
+#
+#  TBB_VERSION_MAJOR - Major Product Version Number
+#  TBB_VERSION_MINOR - Minor Product Version Number
+#  TBB_INTERFACE_VERSION - Engineering Focused Version Number
+#  TBB_COMPATIBLE_INTERFACE_VERSION - The oldest major interface version
+#                                     still supported. This uses the engineering
+#                                     focused interface version numbers.
 #
-# FindTBB
-# -------
+#  TBB_MALLOC_FOUND - system has TBB malloc library
+#  TBB_MALLOC_INCLUDE_DIRS - the TBB malloc include directories
+#  TBB_MALLOC_LIBRARIES - The TBB malloc libraries to be linked
+#  TBB::malloc - imported target for the TBB malloc library
 #
-# Find TBB include directories and libraries.
+#  TBB_MALLOC_PROXY_FOUND - system has TBB malloc proxy library
+#  TBB_MALLOC_PROXY_INCLUDE_DIRS - the TBB malloc proxy include directories
+#  TBB_MALLOC_PROXY_LIBRARIES - The TBB malloc proxy libraries to be linked
+#  TBB::malloc_proxy - imported target for the TBB malloc proxy library
 #
-# Usage:
 #
-#  find_package(TBB [major[.minor]] [EXACT]
-#               [QUIET] [REQUIRED]
-#               [[COMPONENTS] [components...]]
-#               [OPTIONAL_COMPONENTS components...]) 
+# This module reads hints about search locations from variables:
+#  ENV TBB_ARCH_PLATFORM - e.g. set it to "mic" for Xeon Phi builds
+#  ENV TBB_ROOT or just TBB_ROOT - root directory of tbb installation
+#  ENV TBB_BUILD_PREFIX - specifies the build prefix for user built tbb
+#                         libraries. Should be specified with ENV TBB_ROOT
+#                         and optionally...
+#  ENV TBB_BUILD_DIR - if build directory is different than ${TBB_ROOT}/build
 #
-# where the allowed components are tbbmalloc and tbb_preview. Users may modify 
-# the behavior of this module with the following variables:
 #
-# * TBB_ROOT_DIR          - The base directory the of TBB installation.
-# * TBB_INCLUDE_DIR       - The directory that contains the TBB headers files.
-# * TBB_LIBRARY           - The directory that contains the TBB library files.
-# * TBB_<library>_LIBRARY - The path of the TBB the corresponding TBB library. 
-#                           These libraries, if specified, override the 
-#                           corresponding library search results, where <library>
-#                           may be tbb, tbb_debug, tbbmalloc, tbbmalloc_debug,
-#                           tbb_preview, or tbb_preview_debug.
-# * TBB_USE_DEBUG_BUILD   - The debug version of tbb libraries, if present, will
-#                           be used instead of the release version.
+# Modified by Robert Maynard from the original OGRE source
 #
-# Users may modify the behavior of this module with the following environment
-# variables:
+#-------------------------------------------------------------------
+# This file is part of the CMake build system for OGRE
+#     (Object-oriented Graphics Rendering Engine)
+# For the latest info, see http://www.ogre3d.org/
 #
-# * TBB_INSTALL_DIR 
-# * TBBROOT
-# * LIBRARY_PATH
+# The contents of this file are placed in the public domain. Feel
+# free to make use of it in any way you like.
+#-------------------------------------------------------------------
 #
-# This module will set the following variables:
+#=============================================================================
+# Copyright 2010-2012 Kitware, Inc.
+# Copyright 2012      Rolf Eike Beer <eike@sf-mail.de>
 #
-# * TBB_FOUND             - Set to false, or undefined, if we haven’t found, or
-#                           don’t want to use TBB.
-# * TBB_<component>_FOUND - If False, optional <component> part of TBB sytem is
-#                           not available.
-# * TBB_VERSION           - The full version string
-# * TBB_VERSION_MAJOR     - The major version
-# * TBB_VERSION_MINOR     - The minor version
-# * TBB_INTERFACE_VERSION - The interface version number defined in 
-#                           tbb/tbb_stddef.h.
-# * TBB_<library>_LIBRARY_RELEASE - The path of the TBB release version of 
-#                           <library>, where <library> may be tbb, tbb_debug,
-#                           tbbmalloc, tbbmalloc_debug, tbb_preview, or 
-#                           tbb_preview_debug.
-# * TBB_<library>_LIBRARY_DEGUG - The path of the TBB release version of 
-#                           <library>, where <library> may be tbb, tbb_debug,
-#                           tbbmalloc, tbbmalloc_debug, tbb_preview, or 
-#                           tbb_preview_debug.
+# Distributed under the OSI-approved BSD License (the "License");
+# see accompanying file Copyright.txt for details.
 #
-# The following varibles should be used to build and link with TBB:
+# This software is distributed WITHOUT ANY WARRANTY; without even the
+# implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+# See the License for more information.
+#=============================================================================
+# (To distribute this file outside of CMake, substitute the full
+#  License text for the above reference.)
+
+
+#=============================================================================
+#  FindTBB helper functions and macros
 #
-# * TBB_INCLUDE_DIRS - The include directory for TBB.
-# * TBB_LIBRARIES    - The libraries to link against to use TBB.
-# * TBB_DEFINITIONS  - Definitions to use when compiling code that uses TBB.
-
-include(FindPackageHandleStandardArgs)
-
-if(NOT TBB_FOUND)
-
-  ##################################
-  # Check the build type
-  ##################################
-  
-  if(NOT DEFINED TBB_USE_DEBUG_BUILD)
-    if(CMAKE_BUILD_TYPE STREQUAL "Debug" OR CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo")
-      message(STATUS "Set TBB_USE_DEBUG_BUILD to TRUE because CMAKE_BUILD_TYPE is one of the debug configurations.")
-      set(TBB_USE_DEBUG_BUILD TRUE)
-    else()
-      set(TBB_USE_DEBUG_BUILD FALSE)
-    endif()
+
+#====================================================
+# Fix the library path in case it is a linker script
+#====================================================
+function(tbb_extract_real_library library real_library)
+  if(NOT UNIX OR NOT EXISTS ${library})
+    set(${real_library} "${library}" PARENT_SCOPE)
+    return()
+  endif()
+
+  #Read in the first 4 bytes and see if they are the ELF magic number
+  set(_elf_magic "7f454c46")
+  file(READ ${library} _hex_data OFFSET 0 LIMIT 4 HEX)
+  if(_hex_data STREQUAL _elf_magic)
+    #we have opened an ELF binary, so this is what
+    #we should link to
+    set(${real_library} "${library}" PARENT_SCOPE)
+    return()
   endif()
-  
-  ##################################
-  # Set the TBB search directories
-  ##################################
-  
-  # Define search paths based on user input and environment variables
-  set(TBB_SEARCH_DIR ${TBB_ROOT_DIR} $ENV{TBB_INSTALL_DIR} $ENV{TBBROOT})
-  
-  # Define the search directories based on the current platform
-  if(CMAKE_SYSTEM_NAME STREQUAL "Windows")
-    set(TBB_DEFAULT_SEARCH_DIR "C:/Program Files/Intel/TBB"
-                               "C:/Program Files (x86)/Intel/TBB")
-
-    # Set the target architecture
-    if(CMAKE_SIZEOF_VOID_P EQUAL 8)
-      set(TBB_ARCHITECTURE "intel64")
-    else()
-      set(TBB_ARCHITECTURE "ia32")
-    endif()
-
-    # Set the TBB search library path search suffix based on the version of VC
-    if(WINDOWS_STORE)
-      set(TBB_LIB_PATH_SUFFIX "lib/${TBB_ARCHITECTURE}/vc11_ui")
-    elseif(MSVC14)
-      set(TBB_LIB_PATH_SUFFIX "lib/${TBB_ARCHITECTURE}/vc14")
-    elseif(MSVC12)
-      set(TBB_LIB_PATH_SUFFIX "lib/${TBB_ARCHITECTURE}/vc12")
-    elseif(MSVC11)
-      set(TBB_LIB_PATH_SUFFIX "lib/${TBB_ARCHITECTURE}/vc11")
-    elseif(MSVC10)
-      set(TBB_LIB_PATH_SUFFIX "lib/${TBB_ARCHITECTURE}/vc10")
-    endif()
-
-    # Add the library path search suffix for the VC independent version of TBB
-    list(APPEND TBB_LIB_PATH_SUFFIX "lib/${TBB_ARCHITECTURE}/vc_mt")
-
-  elseif(CMAKE_SYSTEM_NAME STREQUAL "Darwin")
-    # OS X
-    set(TBB_DEFAULT_SEARCH_DIR "/opt/intel/tbb")
-    
-    # TODO: Check to see which C++ library is being used by the compiler.
-    if(NOT ${CMAKE_SYSTEM_VERSION} VERSION_LESS 13.0)
-      # The default C++ library on OS X 10.9 and later is libc++
-      set(TBB_LIB_PATH_SUFFIX "lib/libc++")
-    else()
-      set(TBB_LIB_PATH_SUFFIX "lib")
-    endif()
-  elseif(CMAKE_SYSTEM_NAME STREQUAL "Linux")
-    # Linux
-    set(TBB_DEFAULT_SEARCH_DIR "/opt/intel/tbb")
-    
-    # TODO: Check compiler version to see the suffix should be <arch>/gcc4.1 or
-    #       <arch>/gcc4.1. For now, assume that the compiler is more recent than
-    #       gcc 4.4.x or later.
-    if(CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64")
-      set(TBB_LIB_PATH_SUFFIX "lib/intel64/gcc4.4")
-    elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^i.86$")
-      set(TBB_LIB_PATH_SUFFIX "lib/ia32/gcc4.4")
-    endif()
+
+  file(READ ${library} _data OFFSET 0 LIMIT 1024)
+  if("${_data}" MATCHES "INPUT \\(([^(]+)\\)")
+    #extract out the .so name from REGEX MATCH command
+    set(_proper_so_name "${CMAKE_MATCH_1}")
+
+    #construct path to the real .so which is presumed to be in the same directory
+    #as the input file
+    get_filename_component(_so_dir "${library}" DIRECTORY)
+    set(${real_library} "${_so_dir}/${_proper_so_name}" PARENT_SCOPE)
+  else()
+    #unable to determine what this library is so just hope everything works
+    #and pass it unmodified.
+    set(${real_library} "${library}" PARENT_SCOPE)
   endif()
-  
-  ##################################
-  # Find the TBB include dir
-  ##################################
-  
-  find_path(TBB_INCLUDE_DIRS tbb/tbb.h
-      HINTS ${TBB_INCLUDE_DIR} ${TBB_SEARCH_DIR}
-      PATHS ${TBB_DEFAULT_SEARCH_DIR}
-      PATH_SUFFIXES include)
-  
-  ##################################
-  # Find TBB components
-  ##################################
-
-  # Find each component
-  foreach(_comp tbb_preview tbbmalloc tbb)
-    # Search for the libraries
-    find_library(TBB_${_comp}_LIBRARY_RELEASE ${_comp}
-        HINTS ${TBB_LIBRARY} ${TBB_SEARCH_DIR}
-        PATHS ${TBB_DEFAULT_SEARCH_DIR}
-        PATH_SUFFIXES ${TBB_LIB_PATH_SUFFIX})
-
-    find_library(TBB_${_comp}_LIBRARY_DEBUG ${_comp}_debug
-        HINTS ${TBB_LIBRARY} ${TBB_SEARCH_DIR}
-        PATHS ${TBB_DEFAULT_SEARCH_DIR} ENV LIBRARY_PATH
-        PATH_SUFFIXES ${TBB_LIB_PATH_SUFFIX})
-    
-    # Set the library to be used for the component
-    if(NOT TBB_${_comp}_LIBRARY)
-      if(TBB_USE_DEBUG_BUILD AND TBB_${_comp}_LIBRARY_DEBUG)
-        set(TBB_${_comp}_LIBRARY "${TBB_${_comp}_LIBRARY_DEBUG}")
-      elseif(TBB_${_comp}_LIBRARY_RELEASE)
-        set(TBB_${_comp}_LIBRARY "${TBB_${_comp}_LIBRARY_RELEASE}")
-      elseif(TBB_${_comp}_LIBRARY_DEBUG)
-        set(TBB_${_comp}_LIBRARY "${TBB_${_comp}_LIBRARY_DEBUG}")
-        message(STATUS "Using the debug library of '${_comp}' because the release library could not be found!")
-      endif()
-    endif()
-    
-    # Set the TBB library list and component found variables
-    if(TBB_${_comp}_LIBRARY)
-      list(APPEND TBB_LIBRARIES "${TBB_${_comp}_LIBRARY}")
-      set(TBB_${_comp}_FOUND TRUE)
-    else()
-      set(TBB_${_comp}_FOUND FALSE)
-    endif()
-    
-    mark_as_advanced(TBB_${_comp}_LIBRARY_RELEASE)
-    mark_as_advanced(TBB_${_comp}_LIBRARY_DEBUG)
-    mark_as_advanced(TBB_${_comp}_LIBRARY)
-    
+endfunction()
+
+#===============================================
+# Do the final processing for the package find.
+#===============================================
+macro(findpkg_finish PREFIX TARGET_NAME)
+  if (${PREFIX}_INCLUDE_DIR AND ${PREFIX}_LIBRARY)
+    set(${PREFIX}_FOUND TRUE)
+    set (${PREFIX}_INCLUDE_DIRS ${${PREFIX}_INCLUDE_DIR})
+    set (${PREFIX}_LIBRARIES ${${PREFIX}_LIBRARY})
+  else ()
+    if (${PREFIX}_FIND_REQUIRED AND NOT ${PREFIX}_FIND_QUIETLY)
+      message(FATAL_ERROR "Required library ${PREFIX} not found.")
+    endif ()
+  endif ()
+
+  if (NOT TARGET "TBB::${TARGET_NAME}")
+    if (${PREFIX}_LIBRARY_RELEASE)
+      tbb_extract_real_library(${${PREFIX}_LIBRARY_RELEASE} real_release)
+    endif ()
+    if (${PREFIX}_LIBRARY_DEBUG)
+      tbb_extract_real_library(${${PREFIX}_LIBRARY_DEBUG} real_debug)
+    endif ()
+    add_library(TBB::${TARGET_NAME} UNKNOWN IMPORTED)
+    set_target_properties(TBB::${TARGET_NAME} PROPERTIES
+      INTERFACE_INCLUDE_DIRECTORIES "${${PREFIX}_INCLUDE_DIR}")
+    if (${PREFIX}_LIBRARY_DEBUG AND ${PREFIX}_LIBRARY_RELEASE)
+      set_target_properties(TBB::${TARGET_NAME} PROPERTIES
+        IMPORTED_LOCATION "${real_release}"
+        IMPORTED_LOCATION_DEBUG "${real_debug}"
+        IMPORTED_LOCATION_RELEASE "${real_release}")
+    elseif (${PREFIX}_LIBRARY_RELEASE)
+      set_target_properties(TBB::${TARGET_NAME} PROPERTIES
+        IMPORTED_LOCATION "${real_release}")
+    elseif (${PREFIX}_LIBRARY_DEBUG)
+      set_target_properties(TBB::${TARGET_NAME} PROPERTIES
+        IMPORTED_LOCATION "${real_debug}")
+    endif ()
+  endif ()
+
+  #mark the following variables as internal variables
+  mark_as_advanced(${PREFIX}_INCLUDE_DIR
+                   ${PREFIX}_LIBRARY
+                   ${PREFIX}_LIBRARY_DEBUG
+                   ${PREFIX}_LIBRARY_RELEASE)
+endmacro()
+
+#===============================================
+# Generate debug names from given release names
+#===============================================
+macro(get_debug_names PREFIX)
+  foreach(i ${${PREFIX}})
+    set(${PREFIX}_DEBUG ${${PREFIX}_DEBUG} ${i}d ${i}D ${i}_d ${i}_D ${i}_debug ${i})
   endforeach()
-  
-  ##################################
-  # Set compile flags
-  ##################################
-  
-  if(TBB_tbb_LIBRARY MATCHES "debug")
-    set(TBB_DEFINITIONS "-DTBB_USE_DEBUG=1")
+endmacro()
+
+#===============================================
+# See if we have env vars to help us find tbb
+#===============================================
+macro(getenv_path VAR)
+   set(ENV_${VAR} $ENV{${VAR}})
+   # replace won't work if var is blank
+   if (ENV_${VAR})
+     string( REGEX REPLACE "\\\\" "/" ENV_${VAR} ${ENV_${VAR}} )
+   endif ()
+endmacro()
+
+#===============================================
+# Couple a set of release AND debug libraries
+#===============================================
+macro(make_library_set PREFIX)
+  if (${PREFIX}_RELEASE AND ${PREFIX}_DEBUG)
+    set(${PREFIX} optimized ${${PREFIX}_RELEASE} debug ${${PREFIX}_DEBUG})
+  elseif (${PREFIX}_RELEASE)
+    set(${PREFIX} ${${PREFIX}_RELEASE})
+  elseif (${PREFIX}_DEBUG)
+    set(${PREFIX} ${${PREFIX}_DEBUG})
+  endif ()
+endmacro()
+
+
+#=============================================================================
+#  Now to actually find TBB
+#
+
+# Read TBB_ROOT from the environment into ENV_TBB_ROOT, converting backslashes to forward slashes
+getenv_path(TBB_ROOT)
+
+# initialize search paths
+set(TBB_PREFIX_PATH ${TBB_ROOT} ${ENV_TBB_ROOT})
+set(TBB_INC_SEARCH_PATH "")
+set(TBB_LIB_SEARCH_PATH "")
+
+
+# If user built from sources
+set(TBB_BUILD_PREFIX $ENV{TBB_BUILD_PREFIX})
+if (TBB_BUILD_PREFIX AND ENV_TBB_ROOT)
+  getenv_path(TBB_BUILD_DIR)
+  if (NOT ENV_TBB_BUILD_DIR)
+    set(ENV_TBB_BUILD_DIR ${ENV_TBB_ROOT}/build)
+  endif ()
+
+  # headers are expected under ${ENV_TBB_ROOT}/include; only the release/debug build directories are added here
+  list(APPEND TBB_LIB_SEARCH_PATH
+    ${ENV_TBB_BUILD_DIR}/${TBB_BUILD_PREFIX}_release
+    ${ENV_TBB_BUILD_DIR}/${TBB_BUILD_PREFIX}_debug)
+endif ()
+
+
+# For Windows, let's assume that the user might be using the precompiled
+# TBB packages from the main website. These use a rather awkward directory
+# structure (at least for automatically finding the right files) depending
+# on platform and compiler, but we'll do our best to accommodate it.
+# Not adding the same effort for the precompiled linux builds, though. Those
+# have different versions for CC compiler versions and linux kernels which
+# will never adequately match the user's setup, so there is no feasible way
+# to detect the "best" version to use. The user will have to manually
+# select the right files. (Chances are the distributions are shipping their
+# custom version of tbb, anyway, so the problem is probably nonexistent.)
+if (WIN32 AND MSVC)
+  set(COMPILER_PREFIX "vc7.1")
+  if (MSVC_VERSION EQUAL 1400)
+    set(COMPILER_PREFIX "vc8")
+  elseif(MSVC_VERSION EQUAL 1500)
+    set(COMPILER_PREFIX "vc9")
+  elseif(MSVC_VERSION EQUAL 1600)
+    set(COMPILER_PREFIX "vc10")
+  elseif(MSVC_VERSION EQUAL 1700)
+    set(COMPILER_PREFIX "vc11")
+  elseif(MSVC_VERSION EQUAL 1800)
+    set(COMPILER_PREFIX "vc12")
+  elseif(MSVC_VERSION GREATER_EQUAL 1900)
+    set(COMPILER_PREFIX "vc14")
+  endif ()
+
+  # for each prefix path, add ia32/64\${COMPILER_PREFIX}\lib to the lib search path
+  foreach (dir IN LISTS TBB_PREFIX_PATH)
+    if (CMAKE_CL_64)
+      list(APPEND TBB_LIB_SEARCH_PATH ${dir}/ia64/${COMPILER_PREFIX}/lib)
+      list(APPEND TBB_LIB_SEARCH_PATH ${dir}/lib/ia64/${COMPILER_PREFIX})
+      list(APPEND TBB_LIB_SEARCH_PATH ${dir}/intel64/${COMPILER_PREFIX}/lib)
+      list(APPEND TBB_LIB_SEARCH_PATH ${dir}/lib/intel64/${COMPILER_PREFIX})
+    else ()
+      list(APPEND TBB_LIB_SEARCH_PATH ${dir}/ia32/${COMPILER_PREFIX}/lib)
+      list(APPEND TBB_LIB_SEARCH_PATH ${dir}/lib/ia32/${COMPILER_PREFIX})
+    endif ()
+  endforeach ()
+endif ()
+
+# For OS X binary distribution, choose libc++ based libraries for Mavericks (10.9)
+# and above and AppleClang
+if (CMAKE_SYSTEM_NAME STREQUAL "Darwin" AND
+    NOT CMAKE_SYSTEM_VERSION VERSION_LESS 13.0)
+  set (USE_LIBCXX OFF)
+  cmake_policy(GET CMP0025 POLICY_VAR)
+
+  if (POLICY_VAR STREQUAL "NEW")
+    if (CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang")
+      set (USE_LIBCXX ON)
+    endif ()
+  else ()
+    if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
+      set (USE_LIBCXX ON)
+    endif ()
+  endif ()
+
+  if (USE_LIBCXX)
+    foreach (dir IN LISTS TBB_PREFIX_PATH)
+      list (APPEND TBB_LIB_SEARCH_PATH ${dir}/lib/libc++ ${dir}/libc++/lib)
+    endforeach ()
+  endif ()
+endif ()
+
+# check compiler ABI
+if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
+  set(COMPILER_PREFIX)
+  if (NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.7)
+    list(APPEND COMPILER_PREFIX "gcc4.7")
   endif()
-  
-  ##################################
-  # Set version strings
-  ##################################
-  
-  if(TBB_INCLUDE_DIRS)
-    file(READ "${TBB_INCLUDE_DIRS}/tbb/tbb_stddef.h" _tbb_version_file)
-    string(REGEX REPLACE ".*#define TBB_VERSION_MAJOR ([0-9]+).*" "\\1"
-            TBB_VERSION_MAJOR "${_tbb_version_file}")
-    string(REGEX REPLACE ".*#define TBB_VERSION_MINOR ([0-9]+).*" "\\1"
-            TBB_VERSION_MINOR "${_tbb_version_file}")
-    string(REGEX REPLACE ".*#define TBB_INTERFACE_VERSION ([0-9]+).*" "\\1"
-            TBB_INTERFACE_VERSION "${_tbb_version_file}")
-    set(TBB_VERSION "${TBB_VERSION_MAJOR}.${TBB_VERSION_MINOR}")
+  if (NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.4)
+    list(APPEND COMPILER_PREFIX "gcc4.4")
   endif()
-  
-  find_package_handle_standard_args(TBB 
-      REQUIRED_VARS TBB_INCLUDE_DIRS TBB_LIBRARIES
-      HANDLE_COMPONENTS
-      VERSION_VAR TBB_VERSION)
-  
-  mark_as_advanced(TBB_INCLUDE_DIRS TBB_LIBRARIES)
-
-  unset(TBB_ARCHITECTURE)
-  unset(TBB_LIB_PATH_SUFFIX)
-  unset(TBB_DEFAULT_SEARCH_DIR)
+  list(APPEND COMPILER_PREFIX "gcc4.1")
+elseif(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
+  set(COMPILER_PREFIX)
+  if (NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 3.6)
+    list(APPEND COMPILER_PREFIX "gcc4.7")
+  endif()
+  list(APPEND COMPILER_PREFIX "gcc4.4")
+else() # Assume compatibility with 4.4 for other compilers
+  list(APPEND COMPILER_PREFIX "gcc4.4")
+endif ()
+
+# if platform architecture is explicitly specified
+set(TBB_ARCH_PLATFORM $ENV{TBB_ARCH_PLATFORM})
+if (TBB_ARCH_PLATFORM)
+  foreach (dir IN LISTS TBB_PREFIX_PATH)
+    list(APPEND TBB_LIB_SEARCH_PATH ${dir}/${TBB_ARCH_PLATFORM}/lib)
+    list(APPEND TBB_LIB_SEARCH_PATH ${dir}/lib/${TBB_ARCH_PLATFORM})
+  endforeach ()
+endif ()
+
+foreach (dir IN LISTS TBB_PREFIX_PATH)
+  foreach (prefix IN LISTS COMPILER_PREFIX)
+    if (CMAKE_SIZEOF_VOID_P EQUAL 8)
+      list(APPEND TBB_LIB_SEARCH_PATH ${dir}/lib/intel64)
+      list(APPEND TBB_LIB_SEARCH_PATH ${dir}/lib/intel64/${prefix})
+      list(APPEND TBB_LIB_SEARCH_PATH ${dir}/intel64/lib)
+      list(APPEND TBB_LIB_SEARCH_PATH ${dir}/intel64/${prefix}/lib)
+    else ()
+      list(APPEND TBB_LIB_SEARCH_PATH ${dir}/lib/ia32)
+      list(APPEND TBB_LIB_SEARCH_PATH ${dir}/lib/ia32/${prefix})
+      list(APPEND TBB_LIB_SEARCH_PATH ${dir}/ia32/lib)
+      list(APPEND TBB_LIB_SEARCH_PATH ${dir}/ia32/${prefix}/lib)
+    endif ()
+  endforeach()
+endforeach ()
+
+# add general search paths
+foreach (dir IN LISTS TBB_PREFIX_PATH)
+  list(APPEND TBB_LIB_SEARCH_PATH ${dir}/lib ${dir}/Lib ${dir}/lib/tbb
+    ${dir}/Libs)
+  list(APPEND TBB_INC_SEARCH_PATH ${dir}/include ${dir}/Include
+    ${dir}/include/tbb)
+endforeach ()
+
+set(TBB_LIBRARY_NAMES tbb)
+get_debug_names(TBB_LIBRARY_NAMES)
+
+
+find_path(TBB_INCLUDE_DIR
+          NAMES tbb/tbb.h
+          PATHS ${TBB_INC_SEARCH_PATH})
+
+find_library(TBB_LIBRARY_RELEASE
+             NAMES ${TBB_LIBRARY_NAMES}
+             PATHS ${TBB_LIB_SEARCH_PATH})
+find_library(TBB_LIBRARY_DEBUG
+             NAMES ${TBB_LIBRARY_NAMES_DEBUG}
+             PATHS ${TBB_LIB_SEARCH_PATH})
+make_library_set(TBB_LIBRARY)
+
+findpkg_finish(TBB tbb)
+
+# If we haven't found TBB there is no point in going any further
+if (NOT TBB_FOUND)
+  return()
+endif ()
+
+#=============================================================================
+# Look for TBB's malloc package
+set(TBB_MALLOC_LIBRARY_NAMES tbbmalloc)
+get_debug_names(TBB_MALLOC_LIBRARY_NAMES)
+
+find_path(TBB_MALLOC_INCLUDE_DIR
+          NAMES tbb/tbb.h
+          PATHS ${TBB_INC_SEARCH_PATH})
+
+find_library(TBB_MALLOC_LIBRARY_RELEASE
+             NAMES ${TBB_MALLOC_LIBRARY_NAMES}
+             PATHS ${TBB_LIB_SEARCH_PATH})
+find_library(TBB_MALLOC_LIBRARY_DEBUG
+             NAMES ${TBB_MALLOC_LIBRARY_NAMES_DEBUG}
+             PATHS ${TBB_LIB_SEARCH_PATH})
+make_library_set(TBB_MALLOC_LIBRARY)
+
+findpkg_finish(TBB_MALLOC tbbmalloc)
+
+#=============================================================================
+# Look for TBB's malloc proxy package
+set(TBB_MALLOC_PROXY_LIBRARY_NAMES tbbmalloc_proxy)
+get_debug_names(TBB_MALLOC_PROXY_LIBRARY_NAMES)
+
+find_path(TBB_MALLOC_PROXY_INCLUDE_DIR
+          NAMES tbb/tbbmalloc_proxy.h
+          PATHS ${TBB_INC_SEARCH_PATH})
+
+find_library(TBB_MALLOC_PROXY_LIBRARY_RELEASE
+             NAMES ${TBB_MALLOC_PROXY_LIBRARY_NAMES}
+             PATHS ${TBB_LIB_SEARCH_PATH})
+find_library(TBB_MALLOC_PROXY_LIBRARY_DEBUG
+             NAMES ${TBB_MALLOC_PROXY_LIBRARY_NAMES_DEBUG}
+             PATHS ${TBB_LIB_SEARCH_PATH})
+make_library_set(TBB_MALLOC_PROXY_LIBRARY)
+
+findpkg_finish(TBB_MALLOC_PROXY tbbmalloc_proxy)
+
+
+#=============================================================================
+# Parse all the version numbers from TBB's tbb_stddef.h header
+if(NOT TBB_VERSION)
+
+ # only keep the lines of the header that mention VERSION
+ file(STRINGS
+      "${TBB_INCLUDE_DIR}/tbb/tbb_stddef.h"
+      TBB_VERSION_CONTENTS
+      REGEX "VERSION")
+
+  string(REGEX REPLACE
+    ".*#define TBB_VERSION_MAJOR ([0-9]+).*" "\\1"
+    TBB_VERSION_MAJOR "${TBB_VERSION_CONTENTS}")
+
+  string(REGEX REPLACE
+    ".*#define TBB_VERSION_MINOR ([0-9]+).*" "\\1"
+    TBB_VERSION_MINOR "${TBB_VERSION_CONTENTS}")
+
+  string(REGEX REPLACE
+        ".*#define TBB_INTERFACE_VERSION ([0-9]+).*" "\\1"
+        TBB_INTERFACE_VERSION "${TBB_VERSION_CONTENTS}")
+
+  string(REGEX REPLACE
+        ".*#define TBB_COMPATIBLE_INTERFACE_VERSION ([0-9]+).*" "\\1"
+        TBB_COMPATIBLE_INTERFACE_VERSION "${TBB_VERSION_CONTENTS}")
 
 endif()
diff --git a/thirdParty/cupla/alpaka/doc/doxygen/Doxyfile b/thirdParty/cupla/alpaka/doc/doxygen/Doxyfile
deleted file mode 100644
index 1f7127b6a2..0000000000
--- a/thirdParty/cupla/alpaka/doc/doxygen/Doxyfile
+++ /dev/null
@@ -1,2501 +0,0 @@
-# Doxyfile 1.8.13
-
-# This file describes the settings to be used by the documentation system
-# doxygen (www.doxygen.org) for a project.
-#
-# All text after a double hash (##) is considered a comment and is placed in
-# front of the TAG it is preceding.
-#
-# All text after a single hash (#) is considered a comment and will be ignored.
-# The format is:
-# TAG = value [value, ...]
-# For lists, items can also be appended using:
-# TAG += value [value, ...]
-# Values that contain spaces should be placed between quotes (\" \").
-
-#---------------------------------------------------------------------------
-# Project related configuration options
-#---------------------------------------------------------------------------
-
-# This tag specifies the encoding used for all characters in the config file
-# that follow. The default is UTF-8 which is also the encoding used for all text
-# before the first occurrence of this tag. Doxygen uses libiconv (or the iconv
-# built into libc) for the transcoding. See http://www.gnu.org/software/libiconv
-# for the list of possible encodings.
-# The default value is: UTF-8.
-
-DOXYFILE_ENCODING      = UTF-8
-
-# The PROJECT_NAME tag is a single word (or a sequence of words surrounded by
-# double-quotes, unless you are using Doxywizard) that should identify the
-# project for which the documentation is generated. This name is used in the
-# title of most generated pages and in a few other places.
-# The default value is: My Project.
-
-PROJECT_NAME           = alpaka
-
-# The PROJECT_NUMBER tag can be used to enter a project or revision number. This
-# could be handy for archiving the generated documentation or if some version
-# control system is used.
-
-PROJECT_NUMBER         =
-
-# Using the PROJECT_BRIEF tag one can provide an optional one line description
-# for a project that appears at the top of each page and should give viewer a
-# quick idea about the purpose of the project. Keep the description short.
-
-PROJECT_BRIEF          = "Abstraction Library for Parallel Kernel Acceleration"
-
-# With the PROJECT_LOGO tag one can specify a logo or an icon that is included
-# in the documentation. The maximum height of the logo should not exceed 55
-# pixels and the maximum width should not exceed 200 pixels. Doxygen will copy
-# the logo to the output directory.
-
-PROJECT_LOGO           = alpaka_doxygen.png
-
-# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) path
-# into which the generated documentation will be written. If a relative path is
-# entered, it will be relative to the location where doxygen was started. If
-# left blank the current directory will be used.
-
-OUTPUT_DIRECTORY       =
-
-# If the CREATE_SUBDIRS tag is set to YES then doxygen will create 4096 sub-
-# directories (in 2 levels) under the output directory of each output format and
-# will distribute the generated files over these directories. Enabling this
-# option can be useful when feeding doxygen a huge amount of source files, where
-# putting all generated files in the same directory would otherwise causes
-# performance problems for the file system.
-# The default value is: NO.
-
-CREATE_SUBDIRS         = NO
-
-# If the ALLOW_UNICODE_NAMES tag is set to YES, doxygen will allow non-ASCII
-# characters to appear in the names of generated files. If set to NO, non-ASCII
-# characters will be escaped, for example _xE3_x81_x84 will be used for Unicode
-# U+3044.
-# The default value is: NO.
-
-ALLOW_UNICODE_NAMES    = YES
-
-# The OUTPUT_LANGUAGE tag is used to specify the language in which all
-# documentation generated by doxygen is written. Doxygen will use this
-# information to generate all constant output in the proper language.
-# Possible values are: Afrikaans, Arabic, Armenian, Brazilian, Catalan, Chinese,
-# Chinese-Traditional, Croatian, Czech, Danish, Dutch, English (United States),
-# Esperanto, Farsi (Persian), Finnish, French, German, Greek, Hungarian,
-# Indonesian, Italian, Japanese, Japanese-en (Japanese with English messages),
-# Korean, Korean-en (Korean with English messages), Latvian, Lithuanian,
-# Macedonian, Norwegian, Persian (Farsi), Polish, Portuguese, Romanian, Russian,
-# Serbian, Serbian-Cyrillic, Slovak, Slovene, Spanish, Swedish, Turkish,
-# Ukrainian and Vietnamese.
-# The default value is: English.
-
-OUTPUT_LANGUAGE        = English
-
-# If the BRIEF_MEMBER_DESC tag is set to YES, doxygen will include brief member
-# descriptions after the members that are listed in the file and class
-# documentation (similar to Javadoc). Set to NO to disable this.
-# The default value is: YES.
-
-BRIEF_MEMBER_DESC      = YES
-
-# If the REPEAT_BRIEF tag is set to YES, doxygen will prepend the brief
-# description of a member or function before the detailed description
-#
-# Note: If both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the
-# brief descriptions will be completely suppressed.
-# The default value is: YES.
-
-REPEAT_BRIEF           = YES
-
-# This tag implements a quasi-intelligent brief description abbreviator that is
-# used to form the text in various listings. Each string in this list, if found
-# as the leading text of the brief description, will be stripped from the text
-# and the result, after processing the whole list, is used as the annotated
-# text. Otherwise, the brief description is used as-is. If left blank, the
-# following values are used ($name is automatically replaced with the name of
-# the entity):The $name class, The $name widget, The $name file, is, provides,
-# specifies, contains, represents, a, an and the.
-
-ABBREVIATE_BRIEF       = "The $name class" \
-                         "The $name widget" \
-                         "The $name file" \
-                         is \
-                         provides \
-                         specifies \
-                         contains \
-                         represents \
-                         a \
-                         an \
-                         the
-
-# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then
-# doxygen will generate a detailed section even if there is only a brief
-# description.
-# The default value is: NO.
-
-ALWAYS_DETAILED_SEC    = NO
-
-# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all
-# inherited members of a class in the documentation of that class as if those
-# members were ordinary class members. Constructors, destructors and assignment
-# operators of the base classes will not be shown.
-# The default value is: NO.
-
-INLINE_INHERITED_MEMB  = NO
-
-# If the FULL_PATH_NAMES tag is set to YES, doxygen will prepend the full path
-# before files name in the file list and in the header files. If set to NO the
-# shortest path that makes the file name unique will be used
-# The default value is: YES.
-
-FULL_PATH_NAMES        = YES
-
-# The STRIP_FROM_PATH tag can be used to strip a user-defined part of the path.
-# Stripping is only done if one of the specified strings matches the left-hand
-# part of the path. The tag can be used to show relative paths in the file list.
-# If left blank the directory from which doxygen is run is used as the path to
-# strip.
-#
-# Note that you can specify absolute paths here, but also relative paths, which
-# will be relative from the directory where doxygen is started.
-# This tag requires that the tag FULL_PATH_NAMES is set to YES.
-
-STRIP_FROM_PATH        =
-
-# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of the
-# path mentioned in the documentation of a class, which tells the reader which
-# header file to include in order to use a class. If left blank only the name of
-# the header file containing the class definition is used. Otherwise one should
-# specify the list of include paths that are normally passed to the compiler
-# using the -I flag.
-
-STRIP_FROM_INC_PATH    =
-
-# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter (but
-# less readable) file names. This can be useful is your file systems doesn't
-# support long names like on DOS, Mac, or CD-ROM.
-# The default value is: NO.
-
-SHORT_NAMES            = NO
-
-# If the JAVADOC_AUTOBRIEF tag is set to YES then doxygen will interpret the
-# first line (until the first dot) of a Javadoc-style comment as the brief
-# description. If set to NO, the Javadoc-style will behave just like regular Qt-
-# style comments (thus requiring an explicit @brief command for a brief
-# description.)
-# The default value is: NO.
-
-JAVADOC_AUTOBRIEF      = NO
-
-# If the QT_AUTOBRIEF tag is set to YES then doxygen will interpret the first
-# line (until the first dot) of a Qt-style comment as the brief description. If
-# set to NO, the Qt-style will behave just like regular Qt-style comments (thus
-# requiring an explicit \brief command for a brief description.)
-# The default value is: NO.
-
-QT_AUTOBRIEF           = NO
-
-# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make doxygen treat a
-# multi-line C++ special comment block (i.e. a block of //! or /// comments) as
-# a brief description. This used to be the default behavior. The new default is
-# to treat a multi-line C++ comment block as a detailed description. Set this
-# tag to YES if you prefer the old behavior instead.
-#
-# Note that setting this tag to YES also means that rational rose comments are
-# not recognized any more.
-# The default value is: NO.
-
-MULTILINE_CPP_IS_BRIEF = YES
-
-# If the INHERIT_DOCS tag is set to YES then an undocumented member inherits the
-# documentation from any documented member that it re-implements.
-# The default value is: YES.
-
-INHERIT_DOCS           = YES
-
-# If the SEPARATE_MEMBER_PAGES tag is set to YES then doxygen will produce a new
-# page for each member. If set to NO, the documentation of a member will be part
-# of the file/class/namespace that contains it.
-# The default value is: NO.
-
-SEPARATE_MEMBER_PAGES  = NO
-
-# The TAB_SIZE tag can be used to set the number of spaces in a tab. Doxygen
-# uses this value to replace tabs by spaces in code fragments.
-# Minimum value: 1, maximum value: 16, default value: 4.
-
-TAB_SIZE               = 4
-
-# This tag can be used to specify a number of aliases that act as commands in
-# the documentation. An alias has the form:
-# name=value
-# For example adding
-# "sideeffect=@par Side Effects:\n"
-# will allow you to put the command \sideeffect (or @sideeffect) in the
-# documentation, which will result in a user-defined paragraph with heading
-# "Side Effects:". You can put \n's in the value part of an alias to insert
-# newlines.
-
-ALIASES                =
-
-# This tag can be used to specify a number of word-keyword mappings (TCL only).
-# A mapping has the form "name=value". For example adding "class=itcl::class"
-# will allow you to use the command class in the itcl::class meaning.
-
-TCL_SUBST              =
-
-# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C sources
-# only. Doxygen will then generate output that is more tailored for C. For
-# instance, some of the names that are used will be different. The list of all
-# members will be omitted, etc.
-# The default value is: NO.
-
-OPTIMIZE_OUTPUT_FOR_C  = NO
-
-# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java or
-# Python sources only. Doxygen will then generate output that is more tailored
-# for that language. For instance, namespaces will be presented as packages,
-# qualified scopes will look different, etc.
-# The default value is: NO.
-
-OPTIMIZE_OUTPUT_JAVA   = NO
-
-# Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran
-# sources. Doxygen will then generate output that is tailored for Fortran.
-# The default value is: NO.
-
-OPTIMIZE_FOR_FORTRAN   = NO
-
-# Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL
-# sources. Doxygen will then generate output that is tailored for VHDL.
-# The default value is: NO.
-
-OPTIMIZE_OUTPUT_VHDL   = NO
-
-# Doxygen selects the parser to use depending on the extension of the files it
-# parses. With this tag you can assign which parser to use for a given
-# extension. Doxygen has a built-in mapping, but you can override or extend it
-# using this tag. The format is ext=language, where ext is a file extension, and
-# language is one of the parsers supported by doxygen: IDL, Java, Javascript,
-# C#, C, C++, D, PHP, Objective-C, Python, Fortran (fixed format Fortran:
-# FortranFixed, free formatted Fortran: FortranFree, unknown formatted Fortran:
-# Fortran. In the later case the parser tries to guess whether the code is fixed
-# or free formatted code, this is the default for Fortran type files), VHDL. For
-# instance to make doxygen treat .inc files as Fortran files (default is PHP),
-# and .f files as C (default is Fortran), use: inc=Fortran f=C.
-#
-# Note: For files without extension you can use no_extension as a placeholder.
-#
-# Note that for custom extensions you also need to set FILE_PATTERNS otherwise
-# the files are not read by doxygen.
-
-EXTENSION_MAPPING      =
-
-# If the MARKDOWN_SUPPORT tag is enabled then doxygen pre-processes all comments
-# according to the Markdown format, which allows for more readable
-# documentation. See http://daringfireball.net/projects/markdown/ for details.
-# The output of markdown processing is further processed by doxygen, so you can
-# mix doxygen, HTML, and XML commands with Markdown formatting. Disable only in
-# case of backward compatibilities issues.
-# The default value is: YES.
-
-MARKDOWN_SUPPORT       = YES
-
-# When the TOC_INCLUDE_HEADINGS tag is set to a non-zero value, all headings up
-# to that level are automatically included in the table of contents, even if
-# they do not have an id attribute.
-# Note: This feature currently applies only to Markdown headings.
-# Minimum value: 0, maximum value: 99, default value: 0.
-# This tag requires that the tag MARKDOWN_SUPPORT is set to YES.
-
-TOC_INCLUDE_HEADINGS   = 0
-
-# When enabled doxygen tries to link words that correspond to documented
-# classes, or namespaces to their corresponding documentation. Such a link can
-# be prevented in individual cases by putting a % sign in front of the word or
-# globally by setting AUTOLINK_SUPPORT to NO.
-# The default value is: YES.
-
-AUTOLINK_SUPPORT       = YES
-
-# If you use STL classes (i.e. std::string, std::vector, etc.) but do not want
-# to include (a tag file for) the STL sources as input, then you should set this
-# tag to YES in order to let doxygen match functions declarations and
-# definitions whose arguments contain STL classes (e.g. func(std::string);
-# versus func(std::string) {}). This also make the inheritance and collaboration
-# diagrams that involve STL classes more complete and accurate.
-# The default value is: NO.
-
-BUILTIN_STL_SUPPORT    = YES
-
-# If you use Microsoft's C++/CLI language, you should set this option to YES to
-# enable parsing support.
-# The default value is: NO.
-
-CPP_CLI_SUPPORT        = NO
-
-# Set the SIP_SUPPORT tag to YES if your project consists of sip (see:
-# http://www.riverbankcomputing.co.uk/software/sip/intro) sources only. Doxygen
-# will parse them like normal C++ but will assume all classes use public instead
-# of private inheritance when no explicit protection keyword is present.
-# The default value is: NO.
-
-SIP_SUPPORT            = NO
-
-# For Microsoft's IDL there are propget and propput attributes to indicate
-# getter and setter methods for a property. Setting this option to YES will make
-# doxygen to replace the get and set methods by a property in the documentation.
-# This will only work if the methods are indeed getting or setting a simple
-# type. If this is not the case, or you want to show the methods anyway, you
-# should set this option to NO.
-# The default value is: YES.
-
-IDL_PROPERTY_SUPPORT   = YES
-
-# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC
-# tag is set to YES then doxygen will reuse the documentation of the first
-# member in the group (if any) for the other members of the group. By default
-# all members of a group must be documented explicitly.
-# The default value is: NO.
-
-DISTRIBUTE_GROUP_DOC   = NO
-
-# If one adds a struct or class to a group and this option is enabled, then also
-# any nested class or struct is added to the same group. By default this option
-# is disabled and one has to add nested compounds explicitly via \ingroup.
-# The default value is: NO.
-
-GROUP_NESTED_COMPOUNDS = NO
-
-# Set the SUBGROUPING tag to YES to allow class member groups of the same type
-# (for instance a group of public functions) to be put as a subgroup of that
-# type (e.g. under the Public Functions section). Set it to NO to prevent
-# subgrouping. Alternatively, this can be done per class using the
-# \nosubgrouping command.
-# The default value is: YES.
-
-SUBGROUPING            = YES
-
-# When the INLINE_GROUPED_CLASSES tag is set to YES, classes, structs and unions
-# are shown inside the group in which they are included (e.g. using \ingroup)
-# instead of on a separate page (for HTML and Man pages) or section (for LaTeX
-# and RTF).
-#
-# Note that this feature does not work in combination with
-# SEPARATE_MEMBER_PAGES.
-# The default value is: NO.
-
-INLINE_GROUPED_CLASSES = NO
-
-# When the INLINE_SIMPLE_STRUCTS tag is set to YES, structs, classes, and unions
-# with only public data fields or simple typedef fields will be shown inline in
-# the documentation of the scope in which they are defined (i.e. file,
-# namespace, or group documentation), provided this scope is documented. If set
-# to NO, structs, classes, and unions are shown on a separate page (for HTML and
-# Man pages) or section (for LaTeX and RTF).
-# The default value is: NO.
-
-INLINE_SIMPLE_STRUCTS  = NO
-
-# When TYPEDEF_HIDES_STRUCT tag is enabled, a typedef of a struct, union, or
-# enum is documented as struct, union, or enum with the name of the typedef. So
-# typedef struct TypeS {} TypeT, will appear in the documentation as a struct
-# with name TypeT. When disabled the typedef will appear as a member of a file,
-# namespace, or class. And the struct will be named TypeS. This can typically be
-# useful for C code in case the coding convention dictates that all compound
-# types are typedef'ed and only the typedef is referenced, never the tag name.
-# The default value is: NO.
-
-TYPEDEF_HIDES_STRUCT   = NO
-
-# The size of the symbol lookup cache can be set using LOOKUP_CACHE_SIZE. This
-# cache is used to resolve symbols given their name and scope. Since this can be
-# an expensive process and often the same symbol appears multiple times in the
-# code, doxygen keeps a cache of pre-resolved symbols. If the cache is too small
-# doxygen will become slower. If the cache is too large, memory is wasted. The
-# cache size is given by this formula: 2^(16+LOOKUP_CACHE_SIZE). The valid range
-# is 0..9, the default is 0, corresponding to a cache size of 2^16=65536
-# symbols. At the end of a run doxygen will report the cache usage and suggest
-# the optimal cache size from a speed point of view.
-# Minimum value: 0, maximum value: 9, default value: 0.
-
-LOOKUP_CACHE_SIZE      = 0
-
-#---------------------------------------------------------------------------
-# Build related configuration options
-#---------------------------------------------------------------------------
-
-# If the EXTRACT_ALL tag is set to YES, doxygen will assume all entities in
-# documentation are documented, even if no documentation was available. Private
-# class members and static file members will be hidden unless the
-# EXTRACT_PRIVATE respectively EXTRACT_STATIC tags are set to YES.
-# Note: This will also disable the warnings about undocumented members that are
-# normally produced when WARNINGS is set to YES.
-# The default value is: NO.
-
-EXTRACT_ALL            = YES
-
-# If the EXTRACT_PRIVATE tag is set to YES, all private members of a class will
-# be included in the documentation.
-# The default value is: NO.
-
-EXTRACT_PRIVATE        = NO
-
-# If the EXTRACT_PACKAGE tag is set to YES, all members with package or internal
-# scope will be included in the documentation.
-# The default value is: NO.
-
-EXTRACT_PACKAGE        = NO
-
-# If the EXTRACT_STATIC tag is set to YES, all static members of a file will be
-# included in the documentation.
-# The default value is: NO.
-
-EXTRACT_STATIC         = YES
-
-# If the EXTRACT_LOCAL_CLASSES tag is set to YES, classes (and structs) defined
-# locally in source files will be included in the documentation. If set to NO,
-# only classes defined in header files are included. Does not have any effect
-# for Java sources.
-# The default value is: YES.
-
-EXTRACT_LOCAL_CLASSES  = YES
-
-# This flag is only useful for Objective-C code. If set to YES, local methods,
-# which are defined in the implementation section but not in the interface are
-# included in the documentation. If set to NO, only methods in the interface are
-# included.
-# The default value is: NO.
-
-EXTRACT_LOCAL_METHODS  = YES
-
-# If this flag is set to YES, the members of anonymous namespaces will be
-# extracted and appear in the documentation as a namespace called
-# 'anonymous_namespace{file}', where file will be replaced with the base name of
-# the file that contains the anonymous namespace. By default anonymous namespace
-# are hidden.
-# The default value is: NO.
-
-EXTRACT_ANON_NSPACES   = YES
-
-# If the HIDE_UNDOC_MEMBERS tag is set to YES, doxygen will hide all
-# undocumented members inside documented classes or files. If set to NO these
-# members will be included in the various overviews, but no documentation
-# section is generated. This option has no effect if EXTRACT_ALL is enabled.
-# The default value is: NO.
-
-HIDE_UNDOC_MEMBERS     = NO
-
-# If the HIDE_UNDOC_CLASSES tag is set to YES, doxygen will hide all
-# undocumented classes that are normally visible in the class hierarchy. If set
-# to NO, these classes will be included in the various overviews. This option
-# has no effect if EXTRACT_ALL is enabled.
-# The default value is: NO.
-
-HIDE_UNDOC_CLASSES     = NO
-
-# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, doxygen will hide all friend
-# (class|struct|union) declarations. If set to NO, these declarations will be
-# included in the documentation.
-# The default value is: NO.
-
-HIDE_FRIEND_COMPOUNDS  = NO
-
-# If the HIDE_IN_BODY_DOCS tag is set to YES, doxygen will hide any
-# documentation blocks found inside the body of a function. If set to NO, these
-# blocks will be appended to the function's detailed documentation block.
-# The default value is: NO.
-
-HIDE_IN_BODY_DOCS      = NO
-
-# The INTERNAL_DOCS tag determines if documentation that is typed after a
-# \internal command is included. If the tag is set to NO then the documentation
-# will be excluded. Set it to YES to include the internal documentation.
-# The default value is: NO.
-
-INTERNAL_DOCS          = NO
-
-# If the CASE_SENSE_NAMES tag is set to NO then doxygen will only generate file
-# names in lower-case letters. If set to YES, upper-case letters are also
-# allowed. This is useful if you have classes or files whose names only differ
-# in case and if your file system supports case sensitive file names. Windows
-# and Mac users are advised to set this option to NO.
-# The default value is: system dependent.
-
-CASE_SENSE_NAMES       = YES
-
-# If the HIDE_SCOPE_NAMES tag is set to NO then doxygen will show members with
-# their full class and namespace scopes in the documentation. If set to YES, the
-# scope will be hidden.
-# The default value is: NO.
-
-HIDE_SCOPE_NAMES       = NO
-
-# If the HIDE_COMPOUND_REFERENCE tag is set to NO (default) then doxygen will
-# append additional text to a page's title, such as Class Reference. If set to
-# YES the compound reference will be hidden.
-# The default value is: NO.
-
-HIDE_COMPOUND_REFERENCE= NO
-
-# If the SHOW_INCLUDE_FILES tag is set to YES then doxygen will put a list of
-# the files that are included by a file in the documentation of that file.
-# The default value is: YES.
-
-SHOW_INCLUDE_FILES     = YES
-
-# If the SHOW_GROUPED_MEMB_INC tag is set to YES then Doxygen will add for each
-# grouped member an include statement to the documentation, telling the reader
-# which file to include in order to use the member.
-# The default value is: NO.
-
-SHOW_GROUPED_MEMB_INC  = NO
-
-# If the FORCE_LOCAL_INCLUDES tag is set to YES then doxygen will list include
-# files with double quotes in the documentation rather than with sharp brackets.
-# The default value is: NO.
-
-FORCE_LOCAL_INCLUDES   = NO
-
-# If the INLINE_INFO tag is set to YES then a tag [inline] is inserted in the
-# documentation for inline members.
-# The default value is: YES.
-
-INLINE_INFO            = YES
-
-# If the SORT_MEMBER_DOCS tag is set to YES then doxygen will sort the
-# (detailed) documentation of file and class members alphabetically by member
-# name. If set to NO, the members will appear in declaration order.
-# The default value is: YES.
-
-SORT_MEMBER_DOCS       = YES
-
-# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the brief
-# descriptions of file, namespace and class members alphabetically by member
-# name. If set to NO, the members will appear in declaration order. Note that
-# this will also influence the order of the classes in the class list.
-# The default value is: NO.
-
-SORT_BRIEF_DOCS        = YES
-
-# If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen will sort the
-# (brief and detailed) documentation of class members so that constructors and
-# destructors are listed first. If set to NO the constructors will appear in the
-# respective orders defined by SORT_BRIEF_DOCS and SORT_MEMBER_DOCS.
-# Note: If SORT_BRIEF_DOCS is set to NO this option is ignored for sorting brief
-# member documentation.
-# Note: If SORT_MEMBER_DOCS is set to NO this option is ignored for sorting
-# detailed member documentation.
-# The default value is: NO.
-
-SORT_MEMBERS_CTORS_1ST = YES
-
-# If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the hierarchy
-# of group names into alphabetical order. If set to NO the group names will
-# appear in their defined order.
-# The default value is: NO.
-
-SORT_GROUP_NAMES       = NO
-
-# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be sorted by
-# fully-qualified names, including namespaces. If set to NO, the class list will
-# be sorted only by class name, not including the namespace part.
-# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES.
-# Note: This option applies only to the class list, not to the alphabetical
-# list.
-# The default value is: NO.
-
-SORT_BY_SCOPE_NAME     = YES
-
-# If the STRICT_PROTO_MATCHING option is enabled and doxygen fails to do proper
-# type resolution of all parameters of a function it will reject a match between
-# the prototype and the implementation of a member function even if there is
-# only one candidate or it is obvious which candidate to choose by doing a
-# simple string match. By disabling STRICT_PROTO_MATCHING doxygen will still
-# accept a match between prototype and implementation in such cases.
-# The default value is: NO.
-
-STRICT_PROTO_MATCHING  = NO
-
-# The GENERATE_TODOLIST tag can be used to enable (YES) or disable (NO) the todo
-# list. This list is created by putting \todo commands in the documentation.
-# The default value is: YES.
-
-GENERATE_TODOLIST      = YES
-
-# The GENERATE_TESTLIST tag can be used to enable (YES) or disable (NO) the test
-# list. This list is created by putting \test commands in the documentation.
-# The default value is: YES.
-
-GENERATE_TESTLIST      = YES
-
-# The GENERATE_BUGLIST tag can be used to enable (YES) or disable (NO) the bug
-# list. This list is created by putting \bug commands in the documentation.
-# The default value is: YES.
-
-GENERATE_BUGLIST       = YES
-
-# The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or disable (NO)
-# the deprecated list. This list is created by putting \deprecated commands in
-# the documentation.
-# The default value is: YES.
-
-GENERATE_DEPRECATEDLIST= YES
-
-# The ENABLED_SECTIONS tag can be used to enable conditional documentation
-# sections, marked by \if <section_label> ... \endif and \cond <section_label>
-# ... \endcond blocks.
-
-ENABLED_SECTIONS       =
-
-# The MAX_INITIALIZER_LINES tag determines the maximum number of lines that the
-# initial value of a variable or macro / define can have for it to appear in the
-# documentation. If the initializer consists of more lines than specified here
-# it will be hidden. Use a value of 0 to hide initializers completely. The
-# appearance of the value of individual variables and macros / defines can be
-# controlled using \showinitializer or \hideinitializer command in the
-# documentation regardless of this setting.
-# Minimum value: 0, maximum value: 10000, default value: 30.
-
-MAX_INITIALIZER_LINES  = 30
-
-# Set the SHOW_USED_FILES tag to NO to disable the list of files generated at
-# the bottom of the documentation of classes and structs. If set to YES, the
-# list will mention the files that were used to generate the documentation.
-# The default value is: YES.
-
-SHOW_USED_FILES        = YES
-
-# Set the SHOW_FILES tag to NO to disable the generation of the Files page. This
-# will remove the Files entry from the Quick Index and from the Folder Tree View
-# (if specified).
-# The default value is: YES.
-
-SHOW_FILES             = YES
-
-# Set the SHOW_NAMESPACES tag to NO to disable the generation of the Namespaces
-# page. This will remove the Namespaces entry from the Quick Index and from the
-# Folder Tree View (if specified).
-# The default value is: YES.
-
-SHOW_NAMESPACES        = YES
-
-# The FILE_VERSION_FILTER tag can be used to specify a program or script that
-# doxygen should invoke to get the current version for each file (typically from
-# the version control system). Doxygen will invoke the program by executing (via
-# popen()) the command command input-file, where command is the value of the
-# FILE_VERSION_FILTER tag, and input-file is the name of an input file provided
-# by doxygen. Whatever the program writes to standard output is used as the file
-# version. For an example see the documentation.
-
-FILE_VERSION_FILTER    =
-
-# The LAYOUT_FILE tag can be used to specify a layout file which will be parsed
-# by doxygen. The layout file controls the global structure of the generated
-# output files in an output format independent way. To create the layout file
-# that represents doxygen's defaults, run doxygen with the -l option. You can
-# optionally specify a file name after the option, if omitted DoxygenLayout.xml
-# will be used as the name of the layout file.
-#
-# Note that if you run doxygen from a directory containing a file called
-# DoxygenLayout.xml, doxygen will parse it automatically even if the LAYOUT_FILE
-# tag is left empty.
-
-LAYOUT_FILE            =
-
-# The CITE_BIB_FILES tag can be used to specify one or more bib files containing
-# the reference definitions. This must be a list of .bib files. The .bib
-# extension is automatically appended if omitted. This requires the bibtex tool
-# to be installed. See also http://en.wikipedia.org/wiki/BibTeX for more info.
-# For LaTeX the style of the bibliography can be controlled using
-# LATEX_BIB_STYLE. To use this feature you need bibtex and perl available in the
-# search path. See also \cite for info how to create references.
-
-CITE_BIB_FILES         =
-
-#---------------------------------------------------------------------------
-# Configuration options related to warning and progress messages
-#---------------------------------------------------------------------------
-
-# The QUIET tag can be used to turn on/off the messages that are generated to
-# standard output by doxygen. If QUIET is set to YES this implies that the
-# messages are off.
-# The default value is: NO.
-
-QUIET                  = NO
-
-# The WARNINGS tag can be used to turn on/off the warning messages that are
-# generated to standard error (stderr) by doxygen. If WARNINGS is set to YES
-# this implies that the warnings are on.
-#
-# Tip: Turn warnings on while writing the documentation.
-# The default value is: YES.
-
-WARNINGS               = YES
-
-# If the WARN_IF_UNDOCUMENTED tag is set to YES then doxygen will generate
-# warnings for undocumented members. If EXTRACT_ALL is set to YES then this flag
-# will automatically be disabled.
-# The default value is: YES.
-
-WARN_IF_UNDOCUMENTED   = YES
-
-# If the WARN_IF_DOC_ERROR tag is set to YES, doxygen will generate warnings for
-# potential errors in the documentation, such as not documenting some parameters
-# in a documented function, or documenting parameters that don't exist or using
-# markup commands wrongly.
-# The default value is: YES.
-
-WARN_IF_DOC_ERROR      = YES
-
-# This WARN_NO_PARAMDOC option can be enabled to get warnings for functions that
-# are documented, but have no documentation for their parameters or return
-# value. If set to NO, doxygen will only warn about wrong or incomplete
-# parameter documentation, but not about the absence of documentation.
-# The default value is: NO.
-
-WARN_NO_PARAMDOC       = YES
-
-# If the WARN_AS_ERROR tag is set to YES then doxygen will immediately stop when
-# a warning is encountered.
-# The default value is: NO.
-
-WARN_AS_ERROR          = NO
-
-# The WARN_FORMAT tag determines the format of the warning messages that doxygen
-# can produce. The string should contain the $file, $line, and $text tags, which
-# will be replaced by the file and line number from which the warning originated
-# and the warning text. Optionally the format may contain $version, which will
-# be replaced by the version of the file (if it could be obtained via
-# FILE_VERSION_FILTER)
-# The default value is: $file:$line: $text.
-
-WARN_FORMAT            = "$file:$line: $text"
-
-# The WARN_LOGFILE tag can be used to specify a file to which warning and error
-# messages should be written. If left blank the output is written to standard
-# error (stderr).
-
-WARN_LOGFILE           =
-
-#---------------------------------------------------------------------------
-# Configuration options related to the input files
-#---------------------------------------------------------------------------
-
-# The INPUT tag is used to specify the files and/or directories that contain
-# documented source files. You may enter file names like myfile.cpp or
-# directories like /usr/src/myproject. Separate the files or directories with
-# spaces. See also FILE_PATTERNS and EXTENSION_MAPPING
-# Note: If this tag is empty the current directory is searched.
-
-INPUT                  = ../../include/ \
-                         ../../README.md
-
-# This tag can be used to specify the character encoding of the source files
-# that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses
-# libiconv (or the iconv built into libc) for the transcoding. See the libiconv
-# documentation (see: http://www.gnu.org/software/libiconv) for the list of
-# possible encodings.
-# The default value is: UTF-8.
-
-INPUT_ENCODING         = UTF-8
-
-# If the value of the INPUT tag contains directories, you can use the
-# FILE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp and
-# *.h) to filter out the source-files in the directories.
-#
-# Note that for custom extensions or not directly supported extensions you also
-# need to set EXTENSION_MAPPING for the extension otherwise the files are not
-# read by doxygen.
-#
-# If left blank the following patterns are tested: *.c, *.cc, *.cxx, *.cpp,
-# *.c++, *.java, *.ii, *.ixx, *.ipp, *.i++, *.inl, *.idl, *.ddl, *.odl, *.h,
-# *.hh, *.hxx, *.hpp, *.h++, *.cs, *.d, *.php, *.php4, *.php5, *.phtml, *.inc,
-# *.m, *.markdown, *.md, *.mm, *.dox, *.py, *.pyw, *.f90, *.f95, *.f03, *.f08,
-# *.f, *.for, *.tcl, *.vhd, *.vhdl, *.ucf and *.qsf.
-
-FILE_PATTERNS          = *.c \
-                         *.cc \
-                         *.cxx \
-                         *.cpp \
-                         *.c++ \
-                         *.java \
-                         *.ii \
-                         *.ixx \
-                         *.ipp \
-                         *.i++ \
-                         *.inl \
-                         *.idl \
-                         *.ddl \
-                         *.odl \
-                         *.h \
-                         *.hh \
-                         *.hxx \
-                         *.hpp \
-                         *.h++ \
-                         *.cs \
-                         *.d \
-                         *.php \
-                         *.php4 \
-                         *.php5 \
-                         *.phtml \
-                         *.inc \
-                         *.m \
-                         *.markdown \
-                         *.md \
-                         *.mm \
-                         *.dox \
-                         *.py \
-                         *.f90 \
-                         *.f \
-                         *.for \
-                         *.tcl \
-                         *.vhd \
-                         *.vhdl \
-                         *.ucf \
-                         *.qsf \
-                         *.as \
-                         *.js \
-                         *.c \
-                         *.cu
-
-# The RECURSIVE tag can be used to specify whether or not subdirectories should
-# be searched for input files as well.
-# The default value is: NO.
-
-RECURSIVE              = YES
-
-# The EXCLUDE tag can be used to specify files and/or directories that should be
-# excluded from the INPUT source files. This way you can easily exclude a
-# subdirectory from a directory tree whose root is specified with the INPUT tag.
-#
-# Note that relative paths are relative to the directory from which doxygen is
-# run.
-
-EXCLUDE                =
-
-# The EXCLUDE_SYMLINKS tag can be used to select whether or not files or
-# directories that are symbolic links (a Unix file system feature) are excluded
-# from the input.
-# The default value is: NO.
-
-EXCLUDE_SYMLINKS       = NO
-
-# If the value of the INPUT tag contains directories, you can use the
-# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude
-# certain files from those directories.
-#
-# Note that the wildcards are matched against the file with absolute path, so to
-# exclude all test directories for example use the pattern */test/*
-
-EXCLUDE_PATTERNS       =
-
-# The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names
-# (namespaces, classes, functions, etc.) that should be excluded from the
-# output. The symbol name can be a fully qualified name, a word, or if the
-# wildcard * is used, a substring. Examples: ANamespace, AClass,
-# ANamespace::AClass, ANamespace::*Test
-#
-# Note that the wildcards are matched against the file with absolute path, so to
-# exclude all test directories use the pattern */test/*
-
-EXCLUDE_SYMBOLS        =
-
-# The EXAMPLE_PATH tag can be used to specify one or more files or directories
-# that contain example code fragments that are included (see the \include
-# command).
-
-EXAMPLE_PATH           =
-
-# If the value of the EXAMPLE_PATH tag contains directories, you can use the
-# EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp and
-# *.h) to filter out the source-files in the directories. If left blank all
-# files are included.
-
-EXAMPLE_PATTERNS       = *
-
-# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be
-# searched for input files to be used with the \include or \dontinclude commands
-# irrespective of the value of the RECURSIVE tag.
-# The default value is: NO.
-
-EXAMPLE_RECURSIVE      = NO
-
-# The IMAGE_PATH tag can be used to specify one or more files or directories
-# that contain images that are to be included in the documentation (see the
-# \image command).
-
-IMAGE_PATH             =
-
-# The INPUT_FILTER tag can be used to specify a program that doxygen should
-# invoke to filter for each input file. Doxygen will invoke the filter program
-# by executing (via popen()) the command:
-#
-# <filter> <input-file>
-#
-# where <filter> is the value of the INPUT_FILTER tag, and <input-file> is the
-# name of an input file. Doxygen will then use the output that the filter
-# program writes to standard output. If FILTER_PATTERNS is specified, this tag
-# will be ignored.
-#
-# Note that the filter must not add or remove lines; it is applied before the
-# code is scanned, but not when the output code is generated. If lines are added
-# or removed, the anchors will not be placed correctly.
-#
-# Note that for custom extensions or not directly supported extensions you also
-# need to set EXTENSION_MAPPING for the extension otherwise the files are not
-# properly processed by doxygen.
-
-INPUT_FILTER           =
-
-# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern
-# basis. Doxygen will compare the file name with each pattern and apply the
-# filter if there is a match. The filters are a list of the form: pattern=filter
-# (like *.cpp=my_cpp_filter). See INPUT_FILTER for further information on how
-# filters are used. If the FILTER_PATTERNS tag is empty or if none of the
-# patterns match the file name, INPUT_FILTER is applied.
-#
-# Note that for custom extensions or not directly supported extensions you also
-# need to set EXTENSION_MAPPING for the extension otherwise the files are not
-# properly processed by doxygen.
-
-FILTER_PATTERNS        =
-
-# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using
-# INPUT_FILTER) will also be used to filter the input files that are used for
-# producing the source files to browse (i.e. when SOURCE_BROWSER is set to YES).
-# The default value is: NO.
-
-FILTER_SOURCE_FILES    = NO
-
-# The FILTER_SOURCE_PATTERNS tag can be used to specify source filters per file
-# pattern. A pattern will override the setting for FILTER_PATTERN (if any) and
-# it is also possible to disable source filtering for a specific pattern using
-# *.ext= (so without naming a filter).
-# This tag requires that the tag FILTER_SOURCE_FILES is set to YES.
-
-FILTER_SOURCE_PATTERNS =
-
-# If the USE_MDFILE_AS_MAINPAGE tag refers to the name of a markdown file that
-# is part of the input, its contents will be placed on the main page
-# (index.html). This can be useful if you have a project on for instance GitHub
-# and want to reuse the introduction page also for the doxygen output.
-
-USE_MDFILE_AS_MAINPAGE = ../../README.md
-
-#---------------------------------------------------------------------------
-# Configuration options related to source browsing
-#---------------------------------------------------------------------------
-
-# If the SOURCE_BROWSER tag is set to YES then a list of source files will be
-# generated. Documented entities will be cross-referenced with these sources.
-#
-# Note: To get rid of all source code in the generated output, make sure that
-# also VERBATIM_HEADERS is set to NO.
-# The default value is: NO.
-
-SOURCE_BROWSER         = YES
-
-# Setting the INLINE_SOURCES tag to YES will include the body of functions,
-# classes and enums directly into the documentation.
-# The default value is: NO.
-
-INLINE_SOURCES         = NO
-
-# Setting the STRIP_CODE_COMMENTS tag to YES will instruct doxygen to hide any
-# special comment blocks from generated source code fragments. Normal C, C++ and
-# Fortran comments will always remain visible.
-# The default value is: YES.
-
-STRIP_CODE_COMMENTS    = NO
-
-# If the REFERENCED_BY_RELATION tag is set to YES then for each documented
-# function all documented functions referencing it will be listed.
-# The default value is: NO.
-
-REFERENCED_BY_RELATION = NO
-
-# If the REFERENCES_RELATION tag is set to YES then for each documented function
-# all documented entities called/used by that function will be listed.
-# The default value is: NO.
-
-REFERENCES_RELATION    = NO
-
-# If the REFERENCES_LINK_SOURCE tag is set to YES and SOURCE_BROWSER tag is set
-# to YES then the hyperlinks from functions in REFERENCES_RELATION and
-# REFERENCED_BY_RELATION lists will link to the source code. Otherwise they will
-# link to the documentation.
-# The default value is: YES.
-
-REFERENCES_LINK_SOURCE = YES
-
-# If SOURCE_TOOLTIPS is enabled (the default) then hovering a hyperlink in the
-# source code will show a tooltip with additional information such as prototype,
-# brief description and links to the definition and documentation. Since this
-# will make the HTML file larger and loading of large files a bit slower, you
-# can opt to disable this feature.
-# The default value is: YES.
-# This tag requires that the tag SOURCE_BROWSER is set to YES.
-
-SOURCE_TOOLTIPS        = YES
-
-# If the USE_HTAGS tag is set to YES then the references to source code will
-# point to the HTML generated by the htags(1) tool instead of doxygen built-in
-# source browser. The htags tool is part of GNU's global source tagging system
-# (see http://www.gnu.org/software/global/global.html). You will need version
-# 4.8.6 or higher.
-#
-# To use it do the following:
-# - Install the latest version of global
-# - Enable SOURCE_BROWSER and USE_HTAGS in the config file
-# - Make sure the INPUT points to the root of the source tree
-# - Run doxygen as normal
-#
-# Doxygen will invoke htags (and that will in turn invoke gtags), so these
-# tools must be available from the command line (i.e. in the search path).
-#
-# The result: instead of the source browser generated by doxygen, the links to
-# source code will now point to the output of htags.
-# The default value is: NO.
-# This tag requires that the tag SOURCE_BROWSER is set to YES.
-
-USE_HTAGS              = NO
-
-# If the VERBATIM_HEADERS tag is set to YES then doxygen will generate a
-# verbatim copy of the header file for each class for which an include is
-# specified. Set to NO to disable this.
-# See also: Section \class.
-# The default value is: YES.
-
-VERBATIM_HEADERS       = YES
-
-# If the CLANG_ASSISTED_PARSING tag is set to YES then doxygen will use the
-# clang parser (see: http://clang.llvm.org/) for more accurate parsing at the
-# cost of reduced performance. This can be particularly helpful with template
-# rich C++ code for which doxygen's built-in parser lacks the necessary type
-# information.
-# Note: The availability of this option depends on whether or not doxygen was
-# generated with the -Duse-libclang=ON option for CMake.
-# The default value is: NO.
-
-CLANG_ASSISTED_PARSING = NO
-
-# If clang assisted parsing is enabled you can provide the compiler with command
-# line options that you would normally use when invoking the compiler. Note that
-# the include paths will already be set by doxygen for the files and directories
-# specified with INPUT and INCLUDE_PATH.
-# This tag requires that the tag CLANG_ASSISTED_PARSING is set to YES.
-
-CLANG_OPTIONS          =
-
-#---------------------------------------------------------------------------
-# Configuration options related to the alphabetical class index
-#---------------------------------------------------------------------------
-
-# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index of all
-# compounds will be generated. Enable this if the project contains a lot of
-# classes, structs, unions or interfaces.
-# The default value is: YES.
-
-ALPHABETICAL_INDEX     = NO
-
-# The COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns in
-# which the alphabetical index list will be split.
-# Minimum value: 1, maximum value: 20, default value: 5.
-# This tag requires that the tag ALPHABETICAL_INDEX is set to YES.
-
-COLS_IN_ALPHA_INDEX    = 5
-
-# In case all classes in a project start with a common prefix, all classes will
-# be put under the same header in the alphabetical index. The IGNORE_PREFIX tag
-# can be used to specify a prefix (or a list of prefixes) that should be ignored
-# while generating the index headers.
-# This tag requires that the tag ALPHABETICAL_INDEX is set to YES.
-
-IGNORE_PREFIX          =
-
-#---------------------------------------------------------------------------
-# Configuration options related to the HTML output
-#---------------------------------------------------------------------------
-
-# If the GENERATE_HTML tag is set to YES, doxygen will generate HTML output
-# The default value is: YES.
-
-GENERATE_HTML          = YES
-
-# The HTML_OUTPUT tag is used to specify where the HTML docs will be put. If a
-# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
-# it.
-# The default directory is: html.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-HTML_OUTPUT            = html
-
-# The HTML_FILE_EXTENSION tag can be used to specify the file extension for each
-# generated HTML page (for example: .htm, .php, .asp).
-# The default value is: .html.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-HTML_FILE_EXTENSION    = .html
-
-# The HTML_HEADER tag can be used to specify a user-defined HTML header file for
-# each generated HTML page. If the tag is left blank doxygen will generate a
-# standard header.
-#
-# To get valid HTML, the header file must include any scripts and style sheets
-# that doxygen needs, which depends on the configuration options used (e.g.
-# the setting GENERATE_TREEVIEW). It is highly recommended to start with a
-# default header using
-# doxygen -w html new_header.html new_footer.html new_stylesheet.css
-# YourConfigFile
-# and then modify the file new_header.html. See also section "Doxygen usage"
-# for information on how to generate the default header that doxygen normally
-# uses.
-# Note: The header is subject to change so you typically have to regenerate the
-# default header when upgrading to a newer version of doxygen. For a description
-# of the possible markers and block names see the documentation.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-HTML_HEADER            =
-
-# The HTML_FOOTER tag can be used to specify a user-defined HTML footer for each
-# generated HTML page. If the tag is left blank doxygen will generate a standard
-# footer. See HTML_HEADER for more information on how to generate a default
-# footer and what special commands can be used inside the footer. See also
-# section "Doxygen usage" for information on how to generate the default footer
-# that doxygen normally uses.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-HTML_FOOTER            =
-
-# The HTML_STYLESHEET tag can be used to specify a user-defined cascading style
-# sheet that is used by each HTML page. It can be used to fine-tune the look of
-# the HTML output. If left blank doxygen will generate a default style sheet.
-# See also section "Doxygen usage" for information on how to generate the style
-# sheet that doxygen normally uses.
-# Note: It is recommended to use HTML_EXTRA_STYLESHEET instead of this tag, as
-# it is more robust and this tag (HTML_STYLESHEET) will in the future become
-# obsolete.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-HTML_STYLESHEET        =
-
-# The HTML_EXTRA_STYLESHEET tag can be used to specify additional user-defined
-# cascading style sheets that are included after the standard style sheets
-# created by doxygen. Using this option one can overrule certain style aspects.
-# This is preferred over using HTML_STYLESHEET since it does not replace the
-# standard style sheet and is therefore more robust against future updates.
-# Doxygen will copy the style sheet files to the output directory.
-# Note: The order of the extra style sheet files is of importance (e.g. the last
-# style sheet in the list overrules the setting of the previous ones in the
-# list). For an example see the documentation.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-HTML_EXTRA_STYLESHEET  =
-
-# The HTML_EXTRA_FILES tag can be used to specify one or more extra images or
-# other source files which should be copied to the HTML output directory. Note
-# that these files will be copied to the base HTML output directory. Use the
-# $relpath^ marker in the HTML_HEADER and/or HTML_FOOTER files to load these
-# files. In the HTML_STYLESHEET file, use the file name only. Also note that the
-# files will be copied as-is; there are no commands or markers available.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-HTML_EXTRA_FILES       =
-
-# The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. Doxygen
-# will adjust the colors in the style sheet and background images according to
-# this color. Hue is specified as an angle on a colorwheel, see
-# http://en.wikipedia.org/wiki/Hue for more information. For instance the value
-# 0 represents red, 60 is yellow, 120 is green, 180 is cyan, 240 is blue, 300
-# purple, and 360 is red again.
-# Minimum value: 0, maximum value: 359, default value: 220.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-HTML_COLORSTYLE_HUE    = 220
-
-# The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of the colors
-# in the HTML output. For a value of 0 the output will use grayscales only. A
-# value of 255 will produce the most vivid colors.
-# Minimum value: 0, maximum value: 255, default value: 100.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-HTML_COLORSTYLE_SAT    = 100
-
-# The HTML_COLORSTYLE_GAMMA tag controls the gamma correction applied to the
-# luminance component of the colors in the HTML output. Values below 100
-# gradually make the output lighter, whereas values above 100 make the output
-# darker. The value divided by 100 is the actual gamma applied, so 80 represents
-# a gamma of 0.8, the value 220 represents a gamma of 2.2, and 100 does not
-# change the gamma.
-# Minimum value: 40, maximum value: 240, default value: 80.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-HTML_COLORSTYLE_GAMMA  = 80
-
-# If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML
-# page will contain the date and time when the page was generated. Setting this
-# to YES can help to show when doxygen was last run and thus if the
-# documentation is up to date.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-HTML_TIMESTAMP         = YES
-
-# If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML
-# documentation will contain sections that can be hidden and shown after the
-# page has loaded.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-HTML_DYNAMIC_SECTIONS  = YES
-
-# With HTML_INDEX_NUM_ENTRIES one can control the preferred number of entries
-# shown in the various tree structured indices initially; the user can expand
-# and collapse entries dynamically later on. Doxygen will expand the tree to
-# such a level that at most the specified number of entries are visible (unless
-# a fully collapsed tree already exceeds this amount). So setting the number of
-# entries to 1 will produce a fully collapsed tree by default. 0 is a special
-# value representing an infinite number of entries and will result in a fully
-# expanded tree by default.
-# Minimum value: 0, maximum value: 9999, default value: 100.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-HTML_INDEX_NUM_ENTRIES = 100
-
-# If the GENERATE_DOCSET tag is set to YES, additional index files will be
-# generated that can be used as input for Apple's Xcode 3 integrated development
-# environment (see: http://developer.apple.com/tools/xcode/), introduced with
-# OSX 10.5 (Leopard). To create a documentation set, doxygen will generate a
-# Makefile in the HTML output directory. Running make will produce the docset in
-# that directory and running make install will install the docset in
-# ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find it at
-# startup. See http://developer.apple.com/tools/creatingdocsetswithdoxygen.html
-# for more information.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-GENERATE_DOCSET        = NO
-
-# This tag determines the name of the docset feed. A documentation feed provides
-# an umbrella under which multiple documentation sets from a single provider
-# (such as a company or product suite) can be grouped.
-# The default value is: Doxygen generated docs.
-# This tag requires that the tag GENERATE_DOCSET is set to YES.
-
-DOCSET_FEEDNAME        = "Doxygen generated docs"
-
-# This tag specifies a string that should uniquely identify the documentation
-# set bundle. This should be a reverse domain-name style string, e.g.
-# com.mycompany.MyDocSet. Doxygen will append .docset to the name.
-# The default value is: org.doxygen.Project.
-# This tag requires that the tag GENERATE_DOCSET is set to YES.
-
-DOCSET_BUNDLE_ID       = org.doxygen.Project
-
-# The DOCSET_PUBLISHER_ID tag specifies a string that should uniquely identify
-# the documentation publisher. This should be a reverse domain-name style
-# string, e.g. com.mycompany.MyDocSet.documentation.
-# The default value is: org.doxygen.Publisher.
-# This tag requires that the tag GENERATE_DOCSET is set to YES.
-
-DOCSET_PUBLISHER_ID    = org.doxygen.Publisher
-
-# The DOCSET_PUBLISHER_NAME tag identifies the documentation publisher.
-# The default value is: Publisher.
-# This tag requires that the tag GENERATE_DOCSET is set to YES.
-
-DOCSET_PUBLISHER_NAME  = Publisher
-
-# If the GENERATE_HTMLHELP tag is set to YES then doxygen generates three
-# additional HTML index files: index.hhp, index.hhc, and index.hhk. The
-# index.hhp is a project file that can be read by Microsoft's HTML Help Workshop
-# (see: http://www.microsoft.com/en-us/download/details.aspx?id=21138) on
-# Windows.
-#
-# The HTML Help Workshop contains a compiler that can convert all HTML output
-# generated by doxygen into a single compiled HTML file (.chm). Compiled HTML
-# files are now used as the Windows 98 help format, and will replace the old
-# Windows help format (.hlp) on all Windows platforms in the future. Compressed
-# HTML files also contain an index, a table of contents, and you can search for
-# words in the documentation. The HTML workshop also contains a viewer for
-# compressed HTML files.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-GENERATE_HTMLHELP      = NO
-
-# The CHM_FILE tag can be used to specify the file name of the resulting .chm
-# file. You can add a path in front of the file if the result should not be
-# written to the html output directory.
-# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
-
-CHM_FILE               =
-
-# The HHC_LOCATION tag can be used to specify the location (absolute path
-# including file name) of the HTML help compiler (hhc.exe). If non-empty,
-# doxygen will try to run the HTML help compiler on the generated index.hhp.
-# The file has to be specified with full path.
-# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
-
-HHC_LOCATION           =
-
-# The GENERATE_CHI flag controls whether a separate .chi index file is generated
-# (YES) or whether the index is included in the master .chm file (NO).
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
-
-GENERATE_CHI           = NO
-
-# The CHM_INDEX_ENCODING is used to encode HtmlHelp index (hhk), content (hhc)
-# and project file content.
-# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
-
-CHM_INDEX_ENCODING     =
-
-# The BINARY_TOC flag controls whether a binary table of contents is generated
-# (YES) or a normal table of contents (NO) in the .chm file. Furthermore it
-# enables the Previous and Next buttons.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
-
-BINARY_TOC             = NO
-
-# The TOC_EXPAND flag can be set to YES to add extra items for group members to
-# the table of contents of the HTML help documentation and to the tree view.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
-
-TOC_EXPAND             = NO
-
-# If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and
-# QHP_VIRTUAL_FOLDER are set, an additional index file will be generated that
-# can be used as input for Qt's qhelpgenerator to generate a Qt Compressed Help
-# (.qch) of the generated HTML documentation.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-GENERATE_QHP           = NO
-
-# If the QHG_LOCATION tag is specified, the QCH_FILE tag can be used to specify
-# the file name of the resulting .qch file. The path specified is relative to
-# the HTML output folder.
-# This tag requires that the tag GENERATE_QHP is set to YES.
-
-QCH_FILE               =
-
-# The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help
-# Project output. For more information please see Qt Help Project / Namespace
-# (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#namespace).
-# The default value is: org.doxygen.Project.
-# This tag requires that the tag GENERATE_QHP is set to YES.
-
-QHP_NAMESPACE          = org.doxygen.Project
-
-# The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating Qt
-# Help Project output. For more information please see Qt Help Project / Virtual
-# Folders (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#virtual-
-# folders).
-# The default value is: doc.
-# This tag requires that the tag GENERATE_QHP is set to YES.
-
-QHP_VIRTUAL_FOLDER     = doc
-
-# If the QHP_CUST_FILTER_NAME tag is set, it specifies the name of a custom
-# filter to add. For more information please see Qt Help Project / Custom
-# Filters (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom-
-# filters).
-# This tag requires that the tag GENERATE_QHP is set to YES.
-
-QHP_CUST_FILTER_NAME   =
-
-# The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the
-# custom filter to add. For more information please see Qt Help Project / Custom
-# Filters (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom-
-# filters).
-# This tag requires that the tag GENERATE_QHP is set to YES.
-
-QHP_CUST_FILTER_ATTRS  =
-
-# The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this
-# project's filter section matches. Qt Help Project / Filter Attributes (see:
-# http://qt-project.org/doc/qt-4.8/qthelpproject.html#filter-attributes).
-# This tag requires that the tag GENERATE_QHP is set to YES.
-
-QHP_SECT_FILTER_ATTRS  =
-
-# The QHG_LOCATION tag can be used to specify the location of Qt's
-# qhelpgenerator. If non-empty doxygen will try to run qhelpgenerator on the
-# generated .qhp file.
-# This tag requires that the tag GENERATE_QHP is set to YES.
-
-QHG_LOCATION           =
-
-# If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files will be
-# generated, together with the HTML files, they form an Eclipse help plugin. To
-# install this plugin and make it available under the help contents menu in
-# Eclipse, the contents of the directory containing the HTML and XML files needs
-# to be copied into the plugins directory of eclipse. The name of the directory
-# within the plugins directory should be the same as the ECLIPSE_DOC_ID value.
-# After copying Eclipse needs to be restarted before the help appears.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-GENERATE_ECLIPSEHELP   = NO
-
-# A unique identifier for the Eclipse help plugin. When installing the plugin
-# the directory name containing the HTML and XML files should also have this
-# name. Each documentation set should have its own identifier.
-# The default value is: org.doxygen.Project.
-# This tag requires that the tag GENERATE_ECLIPSEHELP is set to YES.
-
-ECLIPSE_DOC_ID         = org.doxygen.Project
-
-# If you want full control over the layout of the generated HTML pages it might
-# be necessary to disable the index and replace it with your own. The
-# DISABLE_INDEX tag can be used to turn on/off the condensed index (tabs) at top
-# of each HTML page. A value of NO enables the index and the value YES disables
-# it. Since the tabs in the index contain the same information as the navigation
-# tree, you can set this option to YES if you also set GENERATE_TREEVIEW to YES.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-DISABLE_INDEX          = NO
-
-# The GENERATE_TREEVIEW tag is used to specify whether a tree-like index
-# structure should be generated to display hierarchical information. If the tag
-# value is set to YES, a side panel will be generated containing a tree-like
-# index structure (just like the one that is generated for HTML Help). For this
-# to work a browser that supports JavaScript, DHTML, CSS and frames is required
-# (i.e. any modern browser). Windows users are probably better off using the
-# HTML help feature. Via custom style sheets (see HTML_EXTRA_STYLESHEET) one can
-# further fine-tune the look of the index. As an example, the default style
-# sheet generated by doxygen has an example that shows how to put an image at
-# the root of the tree instead of the PROJECT_NAME. Since the tree basically has
-# the same information as the tab index, you could consider setting
-# DISABLE_INDEX to YES when enabling this option.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-GENERATE_TREEVIEW      = YES
-
-# The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values that
-# doxygen will group on one line in the generated HTML documentation.
-#
-# Note that a value of 0 will completely suppress the enum values from appearing
-# in the overview section.
-# Minimum value: 0, maximum value: 20, default value: 4.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-ENUM_VALUES_PER_LINE   = 1
-
-# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be used
-# to set the initial width (in pixels) of the frame in which the tree is shown.
-# Minimum value: 0, maximum value: 1500, default value: 250.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-TREEVIEW_WIDTH         = 250
-
-# If the EXT_LINKS_IN_WINDOW option is set to YES, doxygen will open links to
-# external symbols imported via tag files in a separate window.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-EXT_LINKS_IN_WINDOW    = NO
-
-# Use this tag to change the font size of LaTeX formulas included as images in
-# the HTML documentation. When you change the font size after a successful
-# doxygen run you need to manually remove any form_*.png images from the HTML
-# output directory to force them to be regenerated.
-# Minimum value: 8, maximum value: 50, default value: 10.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-FORMULA_FONTSIZE       = 10
-
-# Use the FORMULA_TRANSPARENT tag to determine whether or not the images
-# generated for formulas are transparent PNGs. Transparent PNGs are not
-# supported properly for IE 6.0, but are supported on all modern browsers.
-#
-# Note that when changing this option you need to delete any form_*.png files in
-# the HTML output directory before the changes have effect.
-# The default value is: YES.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-FORMULA_TRANSPARENT    = YES
-
-# Enable the USE_MATHJAX option to render LaTeX formulas using MathJax (see
-# http://www.mathjax.org) which uses client side Javascript for the rendering
-# instead of using pre-rendered bitmaps. Use this if you do not have LaTeX
-# installed or if you want formulas to look prettier in the HTML output. When
-# enabled you may also need to install MathJax separately and configure the path
-# to it using the MATHJAX_RELPATH option.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-USE_MATHJAX            = NO
-
-# When MathJax is enabled you can set the default output format to be used for
-# the MathJax output. See the MathJax site (see:
-# http://docs.mathjax.org/en/latest/output.html) for more details.
-# Possible values are: HTML-CSS (which is slower, but has the best
-# compatibility), NativeMML (i.e. MathML) and SVG.
-# The default value is: HTML-CSS.
-# This tag requires that the tag USE_MATHJAX is set to YES.
-
-MATHJAX_FORMAT         = HTML-CSS
-
-# When MathJax is enabled you need to specify the location relative to the HTML
-# output directory using the MATHJAX_RELPATH option. The destination directory
-# should contain the MathJax.js script. For instance, if the mathjax directory
-# is located at the same level as the HTML output directory, then
-# MATHJAX_RELPATH should be ../mathjax. The default value points to the MathJax
-# Content Delivery Network so you can quickly see the result without installing
-# MathJax. However, it is strongly recommended to install a local copy of
-# MathJax from http://www.mathjax.org before deployment.
-# The default value is: http://cdn.mathjax.org/mathjax/latest.
-# This tag requires that the tag USE_MATHJAX is set to YES.
-
-MATHJAX_RELPATH        = http://cdn.mathjax.org/mathjax/latest
-
-# The MATHJAX_EXTENSIONS tag can be used to specify one or more MathJax
-# extension names that should be enabled during MathJax rendering. For example
-# MATHJAX_EXTENSIONS = TeX/AMSmath TeX/AMSsymbols
-# This tag requires that the tag USE_MATHJAX is set to YES.
-
-MATHJAX_EXTENSIONS     =
-
-# The MATHJAX_CODEFILE tag can be used to specify a file with javascript pieces
-# of code that will be used on startup of the MathJax code. See the MathJax site
-# (see: http://docs.mathjax.org/en/latest/output.html) for more details. For an
-# example see the documentation.
-# This tag requires that the tag USE_MATHJAX is set to YES.
-
-MATHJAX_CODEFILE       =
-
-# When the SEARCHENGINE tag is enabled doxygen will generate a search box for
-# the HTML output. The underlying search engine uses javascript and DHTML and
-# should work on any modern browser. Note that when using HTML help
-# (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets (GENERATE_DOCSET)
-# there is already a search function so this one should typically be disabled.
-# For large projects the javascript based search engine can be slow, then
-# enabling SERVER_BASED_SEARCH may provide a better solution. It is possible to
-# search using the keyboard; to jump to the search box use <access key> + S
-# (what the <access key> is depends on the OS and browser, but it is typically
-# <CTRL>, <ALT>/<option>, or both). Inside the search box use the <cursor down
-# key> to jump into the search results window, the results can be navigated
-# using the <cursor keys>. Press <Enter> to select an item or <escape> to cancel
-# the search. The filter options can be selected when the cursor is inside the
-# search box by pressing <Shift>+<cursor down>. Also here use the <cursor keys>
-# to select a filter and <Enter> or <escape> to activate or cancel the filter
-# option.
-# The default value is: YES.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-SEARCHENGINE           = YES
-
-# When the SERVER_BASED_SEARCH tag is enabled the search engine will be
-# implemented using a web server instead of a web client using Javascript. There
-# are two flavors of web server based searching depending on the EXTERNAL_SEARCH
-# setting. When disabled, doxygen will generate a PHP script for searching and
-# an index file used by the script. When EXTERNAL_SEARCH is enabled the indexing
-# and searching needs to be provided by external tools. See the section
-# "External Indexing and Searching" for details.
-# The default value is: NO.
-# This tag requires that the tag SEARCHENGINE is set to YES.
-
-SERVER_BASED_SEARCH    = NO
-
-# When EXTERNAL_SEARCH tag is enabled doxygen will no longer generate the PHP
-# script for searching. Instead the search results are written to an XML file
-# which needs to be processed by an external indexer. Doxygen will invoke an
-# external search engine pointed to by the SEARCHENGINE_URL option to obtain the
-# search results.
-#
-# Doxygen ships with an example indexer (doxyindexer) and search engine
-# (doxysearch.cgi) which are based on the open source search engine library
-# Xapian (see: http://xapian.org/).
-#
-# See the section "External Indexing and Searching" for details.
-# The default value is: NO.
-# This tag requires that the tag SEARCHENGINE is set to YES.
-
-EXTERNAL_SEARCH        = NO
-
-# The SEARCHENGINE_URL should point to a search engine hosted by a web server
-# which will return the search results when EXTERNAL_SEARCH is enabled.
-#
-# Doxygen ships with an example indexer (doxyindexer) and search engine
-# (doxysearch.cgi) which are based on the open source search engine library
-# Xapian (see: http://xapian.org/). See the section "External Indexing and
-# Searching" for details.
-# This tag requires that the tag SEARCHENGINE is set to YES.
-
-SEARCHENGINE_URL       =
-
-# When SERVER_BASED_SEARCH and EXTERNAL_SEARCH are both enabled the unindexed
-# search data is written to a file for indexing by an external tool. With the
-# SEARCHDATA_FILE tag the name of this file can be specified.
-# The default file is: searchdata.xml.
-# This tag requires that the tag SEARCHENGINE is set to YES.
-
-SEARCHDATA_FILE        = searchdata.xml
-
-# When SERVER_BASED_SEARCH and EXTERNAL_SEARCH are both enabled the
-# EXTERNAL_SEARCH_ID tag can be used as an identifier for the project. This is
-# useful in combination with EXTRA_SEARCH_MAPPINGS to search through multiple
-# projects and redirect the results back to the right project.
-# This tag requires that the tag SEARCHENGINE is set to YES.
-
-EXTERNAL_SEARCH_ID     =
-
-# The EXTRA_SEARCH_MAPPINGS tag can be used to enable searching through doxygen
-# projects other than the one defined by this configuration file, but that are
-# all added to the same external search index. Each project needs to have a
-# unique id set via EXTERNAL_SEARCH_ID. The search mapping then maps each id
-# to a relative location where the documentation can be found. The format is:
-# EXTRA_SEARCH_MAPPINGS = tagname1=loc1 tagname2=loc2 ...
-# This tag requires that the tag SEARCHENGINE is set to YES.
-
-EXTRA_SEARCH_MAPPINGS  =
-
-#---------------------------------------------------------------------------
-# Configuration options related to the LaTeX output
-#---------------------------------------------------------------------------
-
-# If the GENERATE_LATEX tag is set to YES, doxygen will generate LaTeX output.
-# The default value is: YES.
-
-GENERATE_LATEX         = NO
-
-# The LATEX_OUTPUT tag is used to specify where the LaTeX docs will be put. If a
-# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
-# it.
-# The default directory is: latex.
-# This tag requires that the tag GENERATE_LATEX is set to YES.
-
-LATEX_OUTPUT           = latex
-
-# The LATEX_CMD_NAME tag can be used to specify the LaTeX command name to be
-# invoked.
-#
-# Note that when enabling USE_PDFLATEX this option is only used for generating
-# bitmaps for formulas in the HTML output, but not in the Makefile that is
-# written to the output directory.
-# The default file is: latex.
-# This tag requires that the tag GENERATE_LATEX is set to YES.
-
-LATEX_CMD_NAME         = latex
-
-# The MAKEINDEX_CMD_NAME tag can be used to specify the command name to generate
-# index for LaTeX.
-# The default file is: makeindex.
-# This tag requires that the tag GENERATE_LATEX is set to YES.
-
-MAKEINDEX_CMD_NAME     = makeindex
-
-# If the COMPACT_LATEX tag is set to YES, doxygen generates more compact LaTeX
-# documents. This may be useful for small projects and may help to save some
-# trees in general.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_LATEX is set to YES.
-
-COMPACT_LATEX          = NO
-
-# The PAPER_TYPE tag can be used to set the paper type that is used by the
-# printer.
-# Possible values are: a4 (210 x 297 mm), letter (8.5 x 11 inches), legal (8.5 x
-# 14 inches) and executive (7.25 x 10.5 inches).
-# The default value is: a4.
-# This tag requires that the tag GENERATE_LATEX is set to YES.
-
-PAPER_TYPE             = a4
-
-# The EXTRA_PACKAGES tag can be used to specify one or more LaTeX package names
-# that should be included in the LaTeX output. The package can be specified just
-# by its name or with the correct syntax as to be used with the LaTeX
-# \usepackage command. To get the times font for instance you can specify :
-# EXTRA_PACKAGES=times or EXTRA_PACKAGES={times}
-# To use the option intlimits with the amsmath package you can specify:
-# EXTRA_PACKAGES=[intlimits]{amsmath}
-# If left blank no extra packages will be included.
-# This tag requires that the tag GENERATE_LATEX is set to YES.
-
-EXTRA_PACKAGES         =
-
-# The LATEX_HEADER tag can be used to specify a personal LaTeX header for the
-# generated LaTeX document. The header should contain everything until the first
-# chapter. If it is left blank doxygen will generate a standard header. See
-# section "Doxygen usage" for information on how to let doxygen write the
-# default header to a separate file.
-#
-# Note: Only use a user-defined header if you know what you are doing! The
-# following commands have a special meaning inside the header: $title,
-# $datetime, $date, $doxygenversion, $projectname, $projectnumber,
-# $projectbrief, $projectlogo. Doxygen will replace $title with the empty
-# string, for the replacement values of the other commands the user is referred
-# to HTML_HEADER.
-# This tag requires that the tag GENERATE_LATEX is set to YES.
-
-LATEX_HEADER           =
-
-# The LATEX_FOOTER tag can be used to specify a personal LaTeX footer for the
-# generated LaTeX document. The footer should contain everything after the last
-# chapter. If it is left blank doxygen will generate a standard footer. See
-# LATEX_HEADER for more information on how to generate a default footer and what
-# special commands can be used inside the footer.
-#
-# Note: Only use a user-defined footer if you know what you are doing!
-# This tag requires that the tag GENERATE_LATEX is set to YES.
-
-LATEX_FOOTER           =
-
-# The LATEX_EXTRA_STYLESHEET tag can be used to specify additional user-defined
-# LaTeX style sheets that are included after the standard style sheets created
-# by doxygen. Using this option one can overrule certain style aspects. Doxygen
-# will copy the style sheet files to the output directory.
-# Note: The order of the extra style sheet files is of importance (e.g. the last
-# style sheet in the list overrules the setting of the previous ones in the
-# list).
-# This tag requires that the tag GENERATE_LATEX is set to YES.
-
-LATEX_EXTRA_STYLESHEET =
-
-# The LATEX_EXTRA_FILES tag can be used to specify one or more extra images or
-# other source files which should be copied to the LATEX_OUTPUT output
-# directory. Note that the files will be copied as-is; there are no commands or
-# markers available.
-# This tag requires that the tag GENERATE_LATEX is set to YES.
-
-LATEX_EXTRA_FILES      =
-
-# If the PDF_HYPERLINKS tag is set to YES, the LaTeX that is generated is
-# prepared for conversion to PDF (using ps2pdf or pdflatex). The PDF file will
-# contain links (just like the HTML output) instead of page references. This
-# makes the output suitable for online browsing using a PDF viewer.
-# The default value is: YES.
-# This tag requires that the tag GENERATE_LATEX is set to YES.
-
-PDF_HYPERLINKS         = YES
-
-# If the USE_PDFLATEX tag is set to YES, doxygen will use pdflatex to generate
-# the PDF file directly from the LaTeX files. Set this option to YES, to get a
-# higher quality PDF documentation.
-# The default value is: YES.
-# This tag requires that the tag GENERATE_LATEX is set to YES.
-
-USE_PDFLATEX           = YES
-
-# If the LATEX_BATCHMODE tag is set to YES, doxygen will add the \batchmode
-# command to the generated LaTeX files. This will instruct LaTeX to keep running
-# if errors occur, instead of asking the user for help. This option is also used
-# when generating formulas in HTML.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_LATEX is set to YES.
-
-LATEX_BATCHMODE        = NO
-
-# If the LATEX_HIDE_INDICES tag is set to YES then doxygen will not include the
-# index chapters (such as File Index, Compound Index, etc.) in the output.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_LATEX is set to YES.
-
-LATEX_HIDE_INDICES     = NO
-
-# If the LATEX_SOURCE_CODE tag is set to YES then doxygen will include source
-# code with syntax highlighting in the LaTeX output.
-#
-# Note that which sources are shown also depends on other settings such as
-# SOURCE_BROWSER.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_LATEX is set to YES.
-
-LATEX_SOURCE_CODE      = NO
-
-# The LATEX_BIB_STYLE tag can be used to specify the style to use for the
-# bibliography, e.g. plainnat, or ieeetr. See
-# http://en.wikipedia.org/wiki/BibTeX and \cite for more info.
-# The default value is: plain.
-# This tag requires that the tag GENERATE_LATEX is set to YES.
-
-LATEX_BIB_STYLE        = plain
-
-# If the LATEX_TIMESTAMP tag is set to YES then the footer of each generated
-# page will contain the date and time when the page was generated. Setting this
-# to NO can help when comparing the output of multiple runs.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_LATEX is set to YES.
-
-LATEX_TIMESTAMP        = NO
-
-#---------------------------------------------------------------------------
-# Configuration options related to the RTF output
-#---------------------------------------------------------------------------
-
-# If the GENERATE_RTF tag is set to YES, doxygen will generate RTF output. The
-# RTF output is optimized for Word 97 and may not look too pretty with other RTF
-# readers/editors.
-# The default value is: NO.
-
-GENERATE_RTF           = NO
-
-# The RTF_OUTPUT tag is used to specify where the RTF docs will be put. If a
-# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
-# it.
-# The default directory is: rtf.
-# This tag requires that the tag GENERATE_RTF is set to YES.
-
-RTF_OUTPUT             = rtf
-
-# If the COMPACT_RTF tag is set to YES, doxygen generates more compact RTF
-# documents. This may be useful for small projects and may help to save some
-# trees in general.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_RTF is set to YES.
-
-COMPACT_RTF            = NO
-
-# If the RTF_HYPERLINKS tag is set to YES, the RTF that is generated will
-# contain hyperlink fields. The RTF file will contain links (just like the HTML
-# output) instead of page references. This makes the output suitable for online
-# browsing using Word or some other Word compatible readers that support those
-# fields.
-#
-# Note: WordPad (write) and others do not support links.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_RTF is set to YES.
-
-RTF_HYPERLINKS         = NO
-
-# Load stylesheet definitions from file. Syntax is similar to doxygen's config
-# file, i.e. a series of assignments. You only have to provide replacements,
-# missing definitions are set to their default value.
-#
-# See also section "Doxygen usage" for information on how to generate the
-# default style sheet that doxygen normally uses.
-# This tag requires that the tag GENERATE_RTF is set to YES.
-
-RTF_STYLESHEET_FILE    =
-
-# Set optional variables used in the generation of an RTF document. Syntax is
-# similar to doxygen's config file. A template extensions file can be generated
-# using doxygen -e rtf extensionFile.
-# This tag requires that the tag GENERATE_RTF is set to YES.
-
-RTF_EXTENSIONS_FILE    =
-
-# If the RTF_SOURCE_CODE tag is set to YES then doxygen will include source code
-# with syntax highlighting in the RTF output.
-#
-# Note that which sources are shown also depends on other settings such as
-# SOURCE_BROWSER.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_RTF is set to YES.
-
-RTF_SOURCE_CODE        = NO
-
-#---------------------------------------------------------------------------
-# Configuration options related to the man page output
-#---------------------------------------------------------------------------
-
-# If the GENERATE_MAN tag is set to YES, doxygen will generate man pages for
-# classes and files.
-# The default value is: NO.
-
-GENERATE_MAN           = NO
-
-# The MAN_OUTPUT tag is used to specify where the man pages will be put. If a
-# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
-# it. A directory man3 will be created inside the directory specified by
-# MAN_OUTPUT.
-# The default directory is: man.
-# This tag requires that the tag GENERATE_MAN is set to YES.
-
-MAN_OUTPUT             = man
-
-# The MAN_EXTENSION tag determines the extension that is added to the generated
-# man pages. In case the manual section does not start with a number, the number
-# 3 is prepended. The dot (.) at the beginning of the MAN_EXTENSION tag is
-# optional.
-# The default value is: .3.
-# This tag requires that the tag GENERATE_MAN is set to YES.
-
-MAN_EXTENSION          = .3
-
-# The MAN_SUBDIR tag determines the name of the directory created within
-# MAN_OUTPUT in which the man pages are placed. It defaults to man followed by
-# MAN_EXTENSION with the initial . removed.
-# This tag requires that the tag GENERATE_MAN is set to YES.
-
-MAN_SUBDIR             =
-
-# If the MAN_LINKS tag is set to YES and doxygen generates man output, then it
-# will generate one additional man file for each entity documented in the real
-# man page(s). These additional files only source the real man page, but without
-# them the man command would be unable to find the correct page.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_MAN is set to YES.
-
-MAN_LINKS              = NO
-
-#---------------------------------------------------------------------------
-# Configuration options related to the XML output
-#---------------------------------------------------------------------------
-
-# If the GENERATE_XML tag is set to YES, doxygen will generate an XML file that
-# captures the structure of the code including all documentation.
-# The default value is: NO.
-
-GENERATE_XML           = NO
-
-# The XML_OUTPUT tag is used to specify where the XML pages will be put. If a
-# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
-# it.
-# The default directory is: xml.
-# This tag requires that the tag GENERATE_XML is set to YES.
-
-XML_OUTPUT             = xml
-
-# If the XML_PROGRAMLISTING tag is set to YES, doxygen will dump the program
-# listings (including syntax highlighting and cross-referencing information) to
-# the XML output. Note that enabling this will significantly increase the size
-# of the XML output.
-# The default value is: YES.
-# This tag requires that the tag GENERATE_XML is set to YES.
-
-XML_PROGRAMLISTING     = YES
-
-#---------------------------------------------------------------------------
-# Configuration options related to the DOCBOOK output
-#---------------------------------------------------------------------------
-
-# If the GENERATE_DOCBOOK tag is set to YES, doxygen will generate Docbook files
-# that can be used to generate PDF.
-# The default value is: NO.
-
-GENERATE_DOCBOOK       = NO
-
-# The DOCBOOK_OUTPUT tag is used to specify where the Docbook pages will be put.
-# If a relative path is entered the value of OUTPUT_DIRECTORY will be put in
-# front of it.
-# The default directory is: docbook.
-# This tag requires that the tag GENERATE_DOCBOOK is set to YES.
-
-DOCBOOK_OUTPUT         = docbook
-
-# If the DOCBOOK_PROGRAMLISTING tag is set to YES, doxygen will include the
-# program listings (including syntax highlighting and cross-referencing
-# information) to the DOCBOOK output. Note that enabling this will significantly
-# increase the size of the DOCBOOK output.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_DOCBOOK is set to YES.
-
-DOCBOOK_PROGRAMLISTING = NO
-
-#---------------------------------------------------------------------------
-# Configuration options for the AutoGen Definitions output
-#---------------------------------------------------------------------------
-
-# If the GENERATE_AUTOGEN_DEF tag is set to YES, doxygen will generate an
-# AutoGen Definitions (see http://autogen.sf.net) file that captures the
-# structure of the code including all documentation. Note that this feature is
-# still experimental and incomplete at the moment.
-# The default value is: NO.
-
-GENERATE_AUTOGEN_DEF   = NO
-
-#---------------------------------------------------------------------------
-# Configuration options related to the Perl module output
-#---------------------------------------------------------------------------
-
-# If the GENERATE_PERLMOD tag is set to YES, doxygen will generate a Perl module
-# file that captures the structure of the code including all documentation.
-#
-# Note that this feature is still experimental and incomplete at the moment.
-# The default value is: NO.
-
-GENERATE_PERLMOD       = NO
-
-# If the PERLMOD_LATEX tag is set to YES, doxygen will generate the necessary
-# Makefile rules, Perl scripts and LaTeX code to be able to generate PDF and DVI
-# output from the Perl module output.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_PERLMOD is set to YES.
-
-PERLMOD_LATEX          = NO
-
-# If the PERLMOD_PRETTY tag is set to YES, the Perl module output will be nicely
-# formatted so it can be parsed by a human reader. This is useful if you want to
-# understand what is going on. On the other hand, if this tag is set to NO, the
-# size of the Perl module output will be much smaller and Perl will parse it
-# just the same.
-# The default value is: YES.
-# This tag requires that the tag GENERATE_PERLMOD is set to YES.
-
-PERLMOD_PRETTY         = YES
-
-# The names of the make variables in the generated doxyrules.make file are
-# prefixed with the string contained in PERLMOD_MAKEVAR_PREFIX. This is useful
-# so different doxyrules.make files included by the same Makefile don't
-# overwrite each other's variables.
-# This tag requires that the tag GENERATE_PERLMOD is set to YES.
-
-PERLMOD_MAKEVAR_PREFIX =
-
-#---------------------------------------------------------------------------
-# Configuration options related to the preprocessor
-#---------------------------------------------------------------------------
-
-# If the ENABLE_PREPROCESSING tag is set to YES, doxygen will evaluate all
-# C-preprocessor directives found in the sources and include files.
-# The default value is: YES.
-
-ENABLE_PREPROCESSING   = YES
-
-# If the MACRO_EXPANSION tag is set to YES, doxygen will expand all macro names
-# in the source code. If set to NO, only conditional compilation will be
-# performed. Macro expansion can be done in a controlled way by setting
-# EXPAND_ONLY_PREDEF to YES.
-# The default value is: NO.
-# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
-
-MACRO_EXPANSION        = YES
-
-# If the EXPAND_ONLY_PREDEF and MACRO_EXPANSION tags are both set to YES then
-# the macro expansion is limited to the macros specified with the PREDEFINED and
-# EXPAND_AS_DEFINED tags.
-# The default value is: NO.
-# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
-
-EXPAND_ONLY_PREDEF     = NO
-
-# If the SEARCH_INCLUDES tag is set to YES, the include files in the
-# INCLUDE_PATH will be searched if a #include is found.
-# The default value is: YES.
-# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
-
-SEARCH_INCLUDES        = YES
-
-# The INCLUDE_PATH tag can be used to specify one or more directories that
-# contain include files that are not input files but should be processed by the
-# preprocessor.
-# This tag requires that the tag SEARCH_INCLUDES is set to YES.
-
-INCLUDE_PATH           =
-
-# You can use the INCLUDE_FILE_PATTERNS tag to specify one or more wildcard
-# patterns (like *.h and *.hpp) to filter out the header-files in the
-# directories. If left blank, the patterns specified with FILE_PATTERNS will be
-# used.
-# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
-
-INCLUDE_FILE_PATTERNS  =
-
-# The PREDEFINED tag can be used to specify one or more macro names that are
-# defined before the preprocessor is started (similar to the -D option of e.g.
-# gcc). The argument of the tag is a list of macros of the form: name or
-# name=definition (no spaces). If the definition and the "=" are omitted, "=1"
-# is assumed. To prevent a macro definition from being undefined via #undef or
-# recursively expanded use the := operator instead of the = operator.
-# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
-
-PREDEFINED             = ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED \
-                         ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED \
-                         ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLED \
-                         ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLED \
-                         ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLED \
-                         ALPAKA_ACC_CPU_BT_OMP4_ENABLED \
-                         ALPAKA_ACC_GPU_CUDA_ENABLED \
-                         __CUDACC__ \
-                         _OPENMP=201307
-
-# If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then this
-# tag can be used to specify a list of macro names that should be expanded. The
-# macro definition that is found in the sources will be used. Use the PREDEFINED
-# tag if you want to use a different macro definition that overrules the
-# definition found in the source code.
-# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
-
-EXPAND_AS_DEFINED      =
-
-# If the SKIP_FUNCTION_MACROS tag is set to YES then doxygen's preprocessor will
-# remove all references to function-like macros that are alone on a line, have
-# an all uppercase name, and do not end with a semicolon. Such function macros
-# are typically used for boiler-plate code, and will confuse the parser if not
-# removed.
-# The default value is: YES.
-# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
-
-SKIP_FUNCTION_MACROS   = YES
-
-#---------------------------------------------------------------------------
-# Configuration options related to external references
-#---------------------------------------------------------------------------
-
-# The TAGFILES tag can be used to specify one or more tag files. For each tag
-# file the location of the external documentation should be added. The format of
-# a tag file without this location is as follows:
-# TAGFILES = file1 file2 ...
-# Adding location for the tag files is done as follows:
-# TAGFILES = file1=loc1 "file2 = loc2" ...
-# where loc1 and loc2 can be relative or absolute paths or URLs. See the
-# section "Linking to external documentation" for more information about the use
-# of tag files.
-# Note: Each tag file must have a unique name (where the name does NOT include
-# the path). If a tag file is not located in the directory in which doxygen is
-# run, you must also specify the path to the tagfile here.
-
-TAGFILES               =
-
-# When a file name is specified after GENERATE_TAGFILE, doxygen will create a
-# tag file that is based on the input files it reads. See section "Linking to
-# external documentation" for more information about the usage of tag files.
-
-GENERATE_TAGFILE       =
-
-# If the ALLEXTERNALS tag is set to YES, all external class will be listed in
-# the class index. If set to NO, only the inherited external classes will be
-# listed.
-# The default value is: NO.
-
-ALLEXTERNALS           = NO
-
-# If the EXTERNAL_GROUPS tag is set to YES, all external groups will be listed
-# in the modules index. If set to NO, only the current project's groups will be
-# listed.
-# The default value is: YES.
-
-EXTERNAL_GROUPS        = YES
-
-# If the EXTERNAL_PAGES tag is set to YES, all external pages will be listed in
-# the related pages index. If set to NO, only the current project's pages will
-# be listed.
-# The default value is: YES.
-
-EXTERNAL_PAGES         = YES
-
-# The PERL_PATH should be the absolute path and name of the perl script
-# interpreter (i.e. the result of 'which perl').
-# The default file (with absolute path) is: /usr/bin/perl.
-
-PERL_PATH              = /usr/bin/perl
-
-#---------------------------------------------------------------------------
-# Configuration options related to the dot tool
-#---------------------------------------------------------------------------
-
-# If the CLASS_DIAGRAMS tag is set to YES, doxygen will generate a class diagram
-# (in HTML and LaTeX) for classes with base or super classes. Setting the tag to
-# NO turns the diagrams off. Note that this option also works with HAVE_DOT
-# disabled, but it is recommended to install and use dot, since it yields more
-# powerful graphs.
-# The default value is: YES.
-
-CLASS_DIAGRAMS         = YES
-
-# You can define message sequence charts within doxygen comments using the \msc
-# command. Doxygen will then run the mscgen tool (see:
-# http://www.mcternan.me.uk/mscgen/)) to produce the chart and insert it in the
-# documentation. The MSCGEN_PATH tag allows you to specify the directory where
-# the mscgen tool resides. If left empty the tool is assumed to be found in the
-# default search path.
-
-MSCGEN_PATH            =
-
-# You can include diagrams made with dia in doxygen documentation. Doxygen will
-# then run dia to produce the diagram and insert it in the documentation. The
-# DIA_PATH tag allows you to specify the directory where the dia binary resides.
-# If left empty dia is assumed to be found in the default search path.
-
-DIA_PATH               =
-
-# If set to YES the inheritance and collaboration graphs will hide inheritance
-# and usage relations if the target is undocumented or is not a class.
-# The default value is: YES.
-
-HIDE_UNDOC_RELATIONS   = YES
-
-# If you set the HAVE_DOT tag to YES then doxygen will assume the dot tool is
-# available from the path. This tool is part of Graphviz (see:
-# http://www.graphviz.org/), a graph visualization toolkit from AT&T and Lucent
-# Bell Labs. The other options in this section have no effect if this option is
-# set to NO
-# The default value is: NO.
-
-HAVE_DOT               = NO
-
-# The DOT_NUM_THREADS specifies the number of dot invocations doxygen is allowed
-# to run in parallel. When set to 0 doxygen will base this on the number of
-# processors available in the system. You can set it explicitly to a value
-# larger than 0 to get control over the balance between CPU load and processing
-# speed.
-# Minimum value: 0, maximum value: 32, default value: 0.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-DOT_NUM_THREADS        = 0
-
-# When you want a differently looking font in the dot files that doxygen
-# generates you can specify the font name using DOT_FONTNAME. You need to make
-# sure dot is able to find the font, which can be done by putting it in a
-# standard location or by setting the DOTFONTPATH environment variable or by
-# setting DOT_FONTPATH to the directory containing the font.
-# The default value is: Helvetica.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-DOT_FONTNAME           = Helvetica
-
-# The DOT_FONTSIZE tag can be used to set the size (in points) of the font of
-# dot graphs.
-# Minimum value: 4, maximum value: 24, default value: 10.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-DOT_FONTSIZE           = 10
-
-# By default doxygen will tell dot to use the default font as specified with
-# DOT_FONTNAME. If you specify a different font using DOT_FONTNAME you can set
-# the path where dot can find it using this tag.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-DOT_FONTPATH           =
-
-# If the CLASS_GRAPH tag is set to YES then doxygen will generate a graph for
-# each documented class showing the direct and indirect inheritance relations.
-# Setting this tag to YES will force the CLASS_DIAGRAMS tag to NO.
-# The default value is: YES.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-CLASS_GRAPH            = YES
-
-# If the COLLABORATION_GRAPH tag is set to YES then doxygen will generate a
-# graph for each documented class showing the direct and indirect implementation
-# dependencies (inheritance, containment, and class references variables) of the
-# class with other documented classes.
-# The default value is: YES.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-COLLABORATION_GRAPH    = YES
-
-# If the GROUP_GRAPHS tag is set to YES then doxygen will generate a graph for
-# groups, showing the direct groups dependencies.
-# The default value is: YES.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-GROUP_GRAPHS           = YES
-
-# If the UML_LOOK tag is set to YES, doxygen will generate inheritance and
-# collaboration diagrams in a style similar to the OMG's Unified Modeling
-# Language.
-# The default value is: NO.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-UML_LOOK               = NO
-
-# If the UML_LOOK tag is enabled, the fields and methods are shown inside the
-# class node. If there are many fields or methods and many nodes the graph may
-# become too big to be useful. The UML_LIMIT_NUM_FIELDS threshold limits the
-# number of items for each type to make the size more manageable. Set this to 0
-# for no limit. Note that the threshold may be exceeded by 50% before the limit
-# is enforced. So when you set the threshold to 10, up to 15 fields may appear,
-# but if the number exceeds 15, the total amount of fields shown is limited to
-# 10.
-# Minimum value: 0, maximum value: 100, default value: 10.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-UML_LIMIT_NUM_FIELDS   = 10
-
-# If the TEMPLATE_RELATIONS tag is set to YES then the inheritance and
-# collaboration graphs will show the relations between templates and their
-# instances.
-# The default value is: NO.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-TEMPLATE_RELATIONS     = NO
-
-# If the INCLUDE_GRAPH, ENABLE_PREPROCESSING and SEARCH_INCLUDES tags are set to
-# YES then doxygen will generate a graph for each documented file showing the
-# direct and indirect include dependencies of the file with other documented
-# files.
-# The default value is: YES.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-INCLUDE_GRAPH          = YES
-
-# If the INCLUDED_BY_GRAPH, ENABLE_PREPROCESSING and SEARCH_INCLUDES tags are
-# set to YES then doxygen will generate a graph for each documented file showing
-# the direct and indirect include dependencies of the file with other documented
-# files.
-# The default value is: YES.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-INCLUDED_BY_GRAPH      = YES
-
-# If the CALL_GRAPH tag is set to YES then doxygen will generate a call
-# dependency graph for every global function or class method.
-#
-# Note that enabling this option will significantly increase the time of a run.
-# So in most cases it will be better to enable call graphs for selected
-# functions only using the \callgraph command. Disabling a call graph can be
-# accomplished by means of the command \hidecallgraph.
-# The default value is: NO.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-CALL_GRAPH             = NO
-
-# If the CALLER_GRAPH tag is set to YES then doxygen will generate a caller
-# dependency graph for every global function or class method.
-#
-# Note that enabling this option will significantly increase the time of a run.
-# So in most cases it will be better to enable caller graphs for selected
-# functions only using the \callergraph command. Disabling a caller graph can be
-# accomplished by means of the command \hidecallergraph.
-# The default value is: NO.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-CALLER_GRAPH           = NO
-
-# If the GRAPHICAL_HIERARCHY tag is set to YES then doxygen will graphical
-# hierarchy of all classes instead of a textual one.
-# The default value is: YES.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-GRAPHICAL_HIERARCHY    = YES
-
-# If the DIRECTORY_GRAPH tag is set to YES then doxygen will show the
-# dependencies a directory has on other directories in a graphical way. The
-# dependency relations are determined by the #include relations between the
-# files in the directories.
-# The default value is: YES.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-DIRECTORY_GRAPH        = YES
-
-# The DOT_IMAGE_FORMAT tag can be used to set the image format of the images
-# generated by dot. For an explanation of the image formats see the section
-# output formats in the documentation of the dot tool (Graphviz (see:
-# http://www.graphviz.org/)).
-# Note: If you choose svg you need to set HTML_FILE_EXTENSION to xhtml in order
-# to make the SVG files visible in IE 9+ (other browsers do not have this
-# requirement).
-# Possible values are: png, jpg, gif, svg, png:gd, png:gd:gd, png:cairo,
-# png:cairo:gd, png:cairo:cairo, png:cairo:gdiplus, png:gdiplus and
-# png:gdiplus:gdiplus.
-# The default value is: png.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-DOT_IMAGE_FORMAT       = png
-
-# If DOT_IMAGE_FORMAT is set to svg, then this option can be set to YES to
-# enable generation of interactive SVG images that allow zooming and panning.
-#
-# Note that this requires a modern browser other than Internet Explorer. Tested
-# and working are Firefox, Chrome, Safari, and Opera.
-# Note: For IE 9+ you need to set HTML_FILE_EXTENSION to xhtml in order to make
-# the SVG files visible. Older versions of IE do not have SVG support.
-# The default value is: NO.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-INTERACTIVE_SVG        = NO
-
-# The DOT_PATH tag can be used to specify the path where the dot tool can be
-# found. If left blank, it is assumed the dot tool can be found in the path.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-DOT_PATH               =
-
-# The DOTFILE_DIRS tag can be used to specify one or more directories that
-# contain dot files that are included in the documentation (see the \dotfile
-# command).
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-DOTFILE_DIRS           =
-
-# The MSCFILE_DIRS tag can be used to specify one or more directories that
-# contain msc files that are included in the documentation (see the \mscfile
-# command).
-
-MSCFILE_DIRS           =
-
-# The DIAFILE_DIRS tag can be used to specify one or more directories that
-# contain dia files that are included in the documentation (see the \diafile
-# command).
-
-DIAFILE_DIRS           =
-
-# When using plantuml, the PLANTUML_JAR_PATH tag should be used to specify the
-# path where java can find the plantuml.jar file. If left blank, it is assumed
-# PlantUML is not used or called during a preprocessing step. Doxygen will
-# generate a warning when it encounters a \startuml command in this case and
-# will not generate output for the diagram.
-
-PLANTUML_JAR_PATH      =
-
-# When using plantuml, the PLANTUML_CFG_FILE tag can be used to specify a
-# configuration file for plantuml.
-
-PLANTUML_CFG_FILE      =
-
-# When using plantuml, the specified paths are searched for files specified by
-# the !include statement in a plantuml block.
-
-PLANTUML_INCLUDE_PATH  =
-
-# The DOT_GRAPH_MAX_NODES tag can be used to set the maximum number of nodes
-# that will be shown in the graph. If the number of nodes in a graph becomes
-# larger than this value, doxygen will truncate the graph, which is visualized
-# by representing a node as a red box. Note that doxygen if the number of direct
-# children of the root node in a graph is already larger than
-# DOT_GRAPH_MAX_NODES then the graph will not be shown at all. Also note that
-# the size of a graph can be further restricted by MAX_DOT_GRAPH_DEPTH.
-# Minimum value: 0, maximum value: 10000, default value: 50.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-DOT_GRAPH_MAX_NODES    = 50
-
-# The MAX_DOT_GRAPH_DEPTH tag can be used to set the maximum depth of the graphs
-# generated by dot. A depth value of 3 means that only nodes reachable from the
-# root by following a path via at most 3 edges will be shown. Nodes that lay
-# further from the root node will be omitted. Note that setting this option to 1
-# or 2 may greatly reduce the computation time needed for large code bases. Also
-# note that the size of a graph can be further restricted by
-# DOT_GRAPH_MAX_NODES. Using a depth of 0 means no depth restriction.
-# Minimum value: 0, maximum value: 1000, default value: 0.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-MAX_DOT_GRAPH_DEPTH    = 0
-
-# Set the DOT_TRANSPARENT tag to YES to generate images with a transparent
-# background. This is disabled by default, because dot on Windows does not seem
-# to support this out of the box.
-#
-# Warning: Depending on the platform used, enabling this option may lead to
-# badly anti-aliased labels on the edges of a graph (i.e. they become hard to
-# read).
-# The default value is: NO.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-DOT_TRANSPARENT        = NO
-
-# Set the DOT_MULTI_TARGETS tag to YES to allow dot to generate multiple output
-# files in one run (i.e. multiple -o and -T options on the command line). This
-# makes dot run faster, but since only newer versions of dot (>1.8.10) support
-# this, this feature is disabled by default.
-# The default value is: NO.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-DOT_MULTI_TARGETS      = NO
-
-# If the GENERATE_LEGEND tag is set to YES doxygen will generate a legend page
-# explaining the meaning of the various boxes and arrows in the dot generated
-# graphs.
-# The default value is: YES.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-GENERATE_LEGEND        = YES
-
-# If the DOT_CLEANUP tag is set to YES, doxygen will remove the intermediate dot
-# files that are used to generate the various graphs.
-# The default value is: YES.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-DOT_CLEANUP            = YES
diff --git a/thirdParty/cupla/alpaka/doc/doxygen/alpaka_doxygen.png b/thirdParty/cupla/alpaka/doc/doxygen/alpaka_doxygen.png
deleted file mode 100644
index c1b472026a..0000000000
Binary files a/thirdParty/cupla/alpaka/doc/doxygen/alpaka_doxygen.png and /dev/null differ
diff --git a/thirdParty/cupla/alpaka/doc/images/alpaka.svg b/thirdParty/cupla/alpaka/doc/images/alpaka.svg
deleted file mode 100644
index 4be84f586c..0000000000
--- a/thirdParty/cupla/alpaka/doc/images/alpaka.svg
+++ /dev/null
@@ -1,64 +0,0 @@
-<?xml version="1.0" encoding="UTF-8" standalone="no"?>
-<svg
-   xmlns:dc="http://purl.org/dc/elements/1.1/"
-   xmlns:cc="http://creativecommons.org/ns#"
-   xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
-   xmlns:svg="http://www.w3.org/2000/svg"
-   xmlns="http://www.w3.org/2000/svg"
-   id="svg8"
-   version="1.1"
-   viewBox="0 0 50.465511 16.971249"
-   height="16.971249mm"
-   width="50.465511mm">
-  <defs
-     id="defs2" />
-  <metadata
-     id="metadata5">
-    <rdf:RDF>
-      <cc:Work
-         rdf:about="">
-        <dc:format>image/svg+xml</dc:format>
-        <dc:type
-           rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
-        <dc:title></dc:title>
-      </cc:Work>
-    </rdf:RDF>
-  </metadata>
-  <g
-     transform="translate(-41.516563,-12.22615)"
-     id="layer1">
-    <path
-       id="path819"
-       d="m 53.85481,28.688884 c -0.08662,-2.429007 0.267461,-4.599064 1.570208,-9.582376 m 0,0 c 0.374986,-1.028082 0.92916,-1.69228 1.552487,-2.101333 m 2.942738,-0.477216 c 0.456436,0.09162 0.851505,0.238892 1.131154,0.388398 m 0,0 c 1.235735,0.722107 2.545425,1.449922 2.292465,3.162361 0.201527,0.2565 1.733718,0.441462 1.748763,0.967881 0.01554,0.543585 -0.162828,0.449077 -0.135802,0.618137 0.058,0.362822 0.153237,0.360409 0.131151,0.600773 -0.04148,0.451381 -0.46527,0.487867 -0.849939,0.599423 -0.187513,0.05438 -0.07581,0.443721 -0.251826,0.515903 l -2.917528,0.401617"
-       style="fill:none;fill-rule:evenodd;stroke:#f17017;stroke-width:1;stroke-linecap:round;stroke-linejoin:round;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1" />
-    <g
-       id="text856"
-       style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:5.29166651px;line-height:5.66208363px;font-family:'Latin Modern Sans';-inkscape-font-specification:'Latin Modern Sans';letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.26458332px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
-       aria-label="al  aka">
-      <path
-         id="path818"
-         style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:17.63888931px;font-family:'Latin Modern Sans Quotation';-inkscape-font-specification:'Latin Modern Sans Quotation';letter-spacing:-1.32291663px;fill:#00599d;stroke-width:0.26458332px"
-         d="m 49.043509,23.742604 v -5.485695 c 0,-1.728611 -1.375834,-3.192639 -3.492501,-3.192639 -0.846666,0 -1.887361,0.105834 -3.051527,0.617361 -0.03528,0.03528 -0.352778,0.176389 -0.458612,0.246945 -0.123472,0.07056 -0.123472,0.246944 -0.123472,0.246944 0,0.194028 0,0.934861 0.3175,0.934861 0.05292,0 0.0882,0 0.335139,-0.15875 1.075972,-0.670277 1.993195,-0.846666 2.945695,-0.846666 1.446389,0 2.116666,1.023055 2.116666,2.2225 v 0.705555 c -4.568472,0.105834 -6.103055,1.569861 -6.103055,3.051528 0,1.27 0.970139,2.610556 2.628194,2.610556 1.164167,0 2.69875,-0.388056 3.510139,-1.446389 v 0.493889 c 0,0.352778 0,0.776111 0.687917,0.776111 0.687917,0 0.687917,-0.423333 0.687917,-0.776111 z m -1.411112,-2.081389 c 0,1.922639 -2.469444,1.922639 -2.628194,1.922639 -1.128889,0 -2.116667,-0.599722 -2.116667,-1.516945 0,-0.970139 0.917222,-1.552222 1.728611,-1.852083 1.11125,-0.405695 2.169584,-0.458611 3.01625,-0.493889 z" />
-      <path
-         id="path820"
-         style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:17.63888931px;font-family:'Latin Modern Sans Quotation';-inkscape-font-specification:'Latin Modern Sans Quotation';letter-spacing:-1.32291663px;fill:#00599d;stroke-width:0.26458332px"
-         d="M 52.430111,23.742604 V 13.053437 c 0,-0.352778 0,-0.776111 -0.687917,-0.776111 -0.687917,0 -0.687917,0.423333 -0.687917,0.776111 v 10.689167 c 0,0.352778 0,0.776111 0.687917,0.776111 0.687917,0 0.687917,-0.423333 0.687917,-0.776111 z" />
-      <path
-         id="path822"
-         style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:17.63888931px;font-family:'Latin Modern Sans Quotation';-inkscape-font-specification:'Latin Modern Sans Quotation';letter-spacing:-1.32291663px;fill:#00599d;stroke-width:0.26458332px"
-         d="m 73.773159,23.742604 v -5.485695 c 0,-1.728611 -1.375833,-3.192639 -3.4925,-3.192639 -0.846667,0 -1.887361,0.105834 -3.051528,0.617361 -0.03528,0.03528 -0.352778,0.176389 -0.458611,0.246945 -0.123472,0.07056 -0.123472,0.246944 -0.123472,0.246944 0,0.194028 0,0.934861 0.3175,0.934861 0.05292,0 0.08819,0 0.335139,-0.15875 1.075972,-0.670277 1.993194,-0.846666 2.945694,-0.846666 1.446389,0 2.116667,1.023055 2.116667,2.2225 v 0.705555 c -4.568472,0.105834 -6.103056,1.569861 -6.103056,3.051528 0,1.27 0.970139,2.610556 2.628195,2.610556 1.164166,0 2.69875,-0.388056 3.510139,-1.446389 v 0.493889 c 0,0.352778 0,0.776111 0.687916,0.776111 0.687917,0 0.687917,-0.423333 0.687917,-0.776111 z m -1.411111,-2.081389 c 0,1.922639 -2.469445,1.922639 -2.628195,1.922639 -1.128889,0 -2.116666,-0.599722 -2.116666,-1.516945 0,-0.970139 0.917222,-1.552222 1.728611,-1.852083 1.11125,-0.405695 2.169583,-0.458611 3.01625,-0.493889 z" />
-      <path
-         id="path824"
-         style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:17.63888931px;font-family:'Latin Modern Sans Quotation';-inkscape-font-specification:'Latin Modern Sans Quotation';letter-spacing:-1.32291663px;fill:#00599d;stroke-width:0.26458332px"
-         d="m 83.8449,24.165937 c 0,-0.123472 -0.07055,-0.194028 -0.123472,-0.264583 l -3.915833,-4.885973 3.474861,-3.01625 c 0.176389,-0.141111 0.246944,-0.211666 0.246944,-0.352777 0,-0.3175 -0.282222,-0.3175 -0.652639,-0.3175 -0.440972,0 -0.670277,0 -1.040694,0.3175 l -4.691945,4.074583 v -6.6675 c 0,-0.3175 0,-0.776111 -0.652639,-0.776111 -0.652639,0 -0.652639,0.458611 -0.652639,0.776111 v 10.689167 c 0,0.3175 0,0.776111 0.652639,0.776111 0.635,0 0.635,-0.47625 0.635,-0.776111 v -2.398889 l 1.728612,-1.499306 3.474861,4.303889 c 0.264583,0.335139 0.352778,0.370417 0.881944,0.370417 0.335139,0 0.635,0 0.635,-0.352778 z" />
-      <path
-         id="path826"
-         style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:17.63888931px;font-family:'Latin Modern Sans Quotation';-inkscape-font-specification:'Latin Modern Sans Quotation';letter-spacing:-1.32291663px;fill:#00599d;stroke-width:0.26458332px"
-         d="m 91.941088,23.742604 v -5.485695 c 0,-1.728611 -1.375834,-3.192639 -3.4925,-3.192639 -0.846667,0 -1.887362,0.105834 -3.051528,0.617361 -0.03528,0.03528 -0.352778,0.176389 -0.458611,0.246945 -0.123473,0.07056 -0.123473,0.246944 -0.123473,0.246944 0,0.194028 0,0.934861 0.3175,0.934861 0.05292,0 0.08819,0 0.335139,-0.15875 1.075973,-0.670277 1.993195,-0.846666 2.945695,-0.846666 1.446389,0 2.116667,1.023055 2.116667,2.2225 v 0.705555 c -4.568473,0.105834 -6.103056,1.569861 -6.103056,3.051528 0,1.27 0.970139,2.610556 2.628194,2.610556 1.164167,0 2.69875,-0.388056 3.510139,-1.446389 v 0.493889 c 0,0.352778 0,0.776111 0.687917,0.776111 0.687917,0 0.687917,-0.423333 0.687917,-0.776111 z m -1.411111,-2.081389 c 0,1.922639 -2.469445,1.922639 -2.628195,1.922639 -1.128889,0 -2.116667,-0.599722 -2.116667,-1.516945 0,-0.970139 0.917223,-1.552222 1.728612,-1.852083 1.11125,-0.405695 2.169583,-0.458611 3.01625,-0.493889 z" />
-    </g>
-    <path
-       id="path821"
-       d="m 57.419525,18.935009 c -0.532206,-2.625818 -0.388375,-3.813045 1.214407,-4.303875 1.400036,-0.295267 0.872802,2.314188 0.917903,2.566623"
-       style="fill-opacity:0;fill-rule:evenodd;stroke:#f17017;stroke-width:1;stroke-linecap:round;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1;fill:#ffffff" />
-  </g>
-</svg>
diff --git a/thirdParty/cupla/alpaka/doc/images/alpaka_401x135.png b/thirdParty/cupla/alpaka/doc/images/alpaka_401x135.png
deleted file mode 100644
index ce1fcbdf21..0000000000
Binary files a/thirdParty/cupla/alpaka/doc/images/alpaka_401x135.png and /dev/null differ
diff --git a/thirdParty/cupla/alpaka/doc/images/alpaka_inkscape.svg b/thirdParty/cupla/alpaka/doc/images/alpaka_inkscape.svg
deleted file mode 100644
index 9a0c1631d8..0000000000
--- a/thirdParty/cupla/alpaka/doc/images/alpaka_inkscape.svg
+++ /dev/null
@@ -1,103 +0,0 @@
-<?xml version="1.0" encoding="UTF-8" standalone="no"?>
-<!-- Created with Inkscape (http://www.inkscape.org/) -->
-
-<svg
-   xmlns:dc="http://purl.org/dc/elements/1.1/"
-   xmlns:cc="http://creativecommons.org/ns#"
-   xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
-   xmlns:svg="http://www.w3.org/2000/svg"
-   xmlns="http://www.w3.org/2000/svg"
-   xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
-   xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
-   width="50.465511mm"
-   height="16.971249mm"
-   viewBox="0 0 50.465511 16.971249"
-   version="1.1"
-   id="svg8"
-   inkscape:version="0.92.4 5da689c313, 2019-01-14"
-   sodipodi:docname="alpaka.svg"
-   inkscape:export-filename="/home/tornado/Arbeit/hzdr/presentations/poster/mephisto/src/resources/logos/alpaka.png"
-   inkscape:export-xdpi="82.543503"
-   inkscape:export-ydpi="82.543503">
-  <defs
-     id="defs2" />
-  <sodipodi:namedview
-     id="base"
-     pagecolor="#ffffff"
-     bordercolor="#666666"
-     borderopacity="1.0"
-     inkscape:pageopacity="0.0"
-     inkscape:pageshadow="2"
-     inkscape:zoom="2.8284271"
-     inkscape:cx="-42.615327"
-     inkscape:cy="51.074084"
-     inkscape:document-units="mm"
-     inkscape:current-layer="layer1"
-     showgrid="false"
-     inkscape:snap-global="false"
-     inkscape:window-width="1920"
-     inkscape:window-height="1025"
-     inkscape:window-x="0"
-     inkscape:window-y="31"
-     inkscape:window-maximized="1"
-     fit-margin-top="0"
-     fit-margin-left="0"
-     fit-margin-right="0"
-     fit-margin-bottom="0"
-     showguides="false" />
-  <metadata
-     id="metadata5">
-    <rdf:RDF>
-      <cc:Work
-         rdf:about="">
-        <dc:format>image/svg+xml</dc:format>
-        <dc:type
-           rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
-        <dc:title></dc:title>
-      </cc:Work>
-    </rdf:RDF>
-  </metadata>
-  <g
-     inkscape:label="Ebene 1"
-     inkscape:groupmode="layer"
-     id="layer1"
-     transform="translate(-41.516563,-12.22615)">
-    <path
-       style="fill:none;fill-rule:evenodd;stroke:#f17017;stroke-width:1;stroke-linecap:round;stroke-linejoin:round;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1"
-       d="m 53.85481,28.688884 c -0.08662,-2.429007 0.267461,-4.599064 1.570208,-9.582376 m 0,0 c 0.374986,-1.028082 0.92916,-1.69228 1.552487,-2.101333 m 2.942738,-0.477216 c 0.456436,0.09162 0.851505,0.238892 1.131154,0.388398 m 0,0 c 1.235735,0.722107 2.545425,1.449922 2.292465,3.162361 0.201527,0.2565 1.733718,0.441462 1.748763,0.967881 0.01554,0.543585 -0.162828,0.449077 -0.135802,0.618137 0.058,0.362822 0.153237,0.360409 0.131151,0.600773 -0.04148,0.451381 -0.46527,0.487867 -0.849939,0.599423 -0.187513,0.05438 -0.07581,0.443721 -0.251826,0.515903 l -2.917528,0.401617"
-       id="path819"
-       inkscape:connector-curvature="0"
-       sodipodi:nodetypes="ccccccccsssscc" />
-    <g
-       aria-label="al  aka"
-       style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:5.29166651px;line-height:5.66208363px;font-family:'Latin Modern Sans';-inkscape-font-specification:'Latin Modern Sans';letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.26458332px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
-       id="text856">
-      <path
-         d="m 49.043509,23.742604 v -5.485695 c 0,-1.728611 -1.375834,-3.192639 -3.492501,-3.192639 -0.846666,0 -1.887361,0.105834 -3.051527,0.617361 -0.03528,0.03528 -0.352778,0.176389 -0.458612,0.246945 -0.123472,0.07056 -0.123472,0.246944 -0.123472,0.246944 0,0.194028 0,0.934861 0.3175,0.934861 0.05292,0 0.0882,0 0.335139,-0.15875 1.075972,-0.670277 1.993195,-0.846666 2.945695,-0.846666 1.446389,0 2.116666,1.023055 2.116666,2.2225 v 0.705555 c -4.568472,0.105834 -6.103055,1.569861 -6.103055,3.051528 0,1.27 0.970139,2.610556 2.628194,2.610556 1.164167,0 2.69875,-0.388056 3.510139,-1.446389 v 0.493889 c 0,0.352778 0,0.776111 0.687917,0.776111 0.687917,0 0.687917,-0.423333 0.687917,-0.776111 z m -1.411112,-2.081389 c 0,1.922639 -2.469444,1.922639 -2.628194,1.922639 -1.128889,0 -2.116667,-0.599722 -2.116667,-1.516945 0,-0.970139 0.917222,-1.552222 1.728611,-1.852083 1.11125,-0.405695 2.169584,-0.458611 3.01625,-0.493889 z"
-         style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:17.63888931px;font-family:'Latin Modern Sans Quotation';-inkscape-font-specification:'Latin Modern Sans Quotation';letter-spacing:-1.32291663px;fill:#00599d;stroke-width:0.26458332px"
-         id="path818" />
-      <path
-         d="M 52.430111,23.742604 V 13.053437 c 0,-0.352778 0,-0.776111 -0.687917,-0.776111 -0.687917,0 -0.687917,0.423333 -0.687917,0.776111 v 10.689167 c 0,0.352778 0,0.776111 0.687917,0.776111 0.687917,0 0.687917,-0.423333 0.687917,-0.776111 z"
-         style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:17.63888931px;font-family:'Latin Modern Sans Quotation';-inkscape-font-specification:'Latin Modern Sans Quotation';letter-spacing:-1.32291663px;fill:#00599d;stroke-width:0.26458332px"
-         id="path820" />
-      <path
-         d="m 73.773159,23.742604 v -5.485695 c 0,-1.728611 -1.375833,-3.192639 -3.4925,-3.192639 -0.846667,0 -1.887361,0.105834 -3.051528,0.617361 -0.03528,0.03528 -0.352778,0.176389 -0.458611,0.246945 -0.123472,0.07056 -0.123472,0.246944 -0.123472,0.246944 0,0.194028 0,0.934861 0.3175,0.934861 0.05292,0 0.08819,0 0.335139,-0.15875 1.075972,-0.670277 1.993194,-0.846666 2.945694,-0.846666 1.446389,0 2.116667,1.023055 2.116667,2.2225 v 0.705555 c -4.568472,0.105834 -6.103056,1.569861 -6.103056,3.051528 0,1.27 0.970139,2.610556 2.628195,2.610556 1.164166,0 2.69875,-0.388056 3.510139,-1.446389 v 0.493889 c 0,0.352778 0,0.776111 0.687916,0.776111 0.687917,0 0.687917,-0.423333 0.687917,-0.776111 z m -1.411111,-2.081389 c 0,1.922639 -2.469445,1.922639 -2.628195,1.922639 -1.128889,0 -2.116666,-0.599722 -2.116666,-1.516945 0,-0.970139 0.917222,-1.552222 1.728611,-1.852083 1.11125,-0.405695 2.169583,-0.458611 3.01625,-0.493889 z"
-         style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:17.63888931px;font-family:'Latin Modern Sans Quotation';-inkscape-font-specification:'Latin Modern Sans Quotation';letter-spacing:-1.32291663px;fill:#00599d;stroke-width:0.26458332px"
-         id="path822" />
-      <path
-         d="m 83.8449,24.165937 c 0,-0.123472 -0.07055,-0.194028 -0.123472,-0.264583 l -3.915833,-4.885973 3.474861,-3.01625 c 0.176389,-0.141111 0.246944,-0.211666 0.246944,-0.352777 0,-0.3175 -0.282222,-0.3175 -0.652639,-0.3175 -0.440972,0 -0.670277,0 -1.040694,0.3175 l -4.691945,4.074583 v -6.6675 c 0,-0.3175 0,-0.776111 -0.652639,-0.776111 -0.652639,0 -0.652639,0.458611 -0.652639,0.776111 v 10.689167 c 0,0.3175 0,0.776111 0.652639,0.776111 0.635,0 0.635,-0.47625 0.635,-0.776111 v -2.398889 l 1.728612,-1.499306 3.474861,4.303889 c 0.264583,0.335139 0.352778,0.370417 0.881944,0.370417 0.335139,0 0.635,0 0.635,-0.352778 z"
-         style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:17.63888931px;font-family:'Latin Modern Sans Quotation';-inkscape-font-specification:'Latin Modern Sans Quotation';letter-spacing:-1.32291663px;fill:#00599d;stroke-width:0.26458332px"
-         id="path824" />
-      <path
-         d="m 91.941088,23.742604 v -5.485695 c 0,-1.728611 -1.375834,-3.192639 -3.4925,-3.192639 -0.846667,0 -1.887362,0.105834 -3.051528,0.617361 -0.03528,0.03528 -0.352778,0.176389 -0.458611,0.246945 -0.123473,0.07056 -0.123473,0.246944 -0.123473,0.246944 0,0.194028 0,0.934861 0.3175,0.934861 0.05292,0 0.08819,0 0.335139,-0.15875 1.075973,-0.670277 1.993195,-0.846666 2.945695,-0.846666 1.446389,0 2.116667,1.023055 2.116667,2.2225 v 0.705555 c -4.568473,0.105834 -6.103056,1.569861 -6.103056,3.051528 0,1.27 0.970139,2.610556 2.628194,2.610556 1.164167,0 2.69875,-0.388056 3.510139,-1.446389 v 0.493889 c 0,0.352778 0,0.776111 0.687917,0.776111 0.687917,0 0.687917,-0.423333 0.687917,-0.776111 z m -1.411111,-2.081389 c 0,1.922639 -2.469445,1.922639 -2.628195,1.922639 -1.128889,0 -2.116667,-0.599722 -2.116667,-1.516945 0,-0.970139 0.917223,-1.552222 1.728612,-1.852083 1.11125,-0.405695 2.169583,-0.458611 3.01625,-0.493889 z"
-         style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:17.63888931px;font-family:'Latin Modern Sans Quotation';-inkscape-font-specification:'Latin Modern Sans Quotation';letter-spacing:-1.32291663px;fill:#00599d;stroke-width:0.26458332px"
-         id="path826" />
-    </g>
-    <path
-       style="fill-opacity:0;fill-rule:evenodd;stroke:#f17017;stroke-width:1;stroke-linecap:round;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1;fill:#ffffff"
-       d="m 57.419525,18.935009 c -0.532206,-2.625818 -0.388375,-3.813045 1.214407,-4.303875 1.400036,-0.295267 0.872802,2.314188 0.917903,2.566623"
-       id="path821"
-       inkscape:connector-curvature="0"
-       sodipodi:nodetypes="ccc" />
-  </g>
-</svg>
diff --git a/thirdParty/cupla/alpaka/doc/markdown/Index.md b/thirdParty/cupla/alpaka/doc/markdown/Index.md
deleted file mode 100644
index 45aecef238..0000000000
--- a/thirdParty/cupla/alpaka/doc/markdown/Index.md
+++ /dev/null
@@ -1,20 +0,0 @@
-* User Documentation
-  * 1. [Introduction](user/Introduction.md)
-  * 2. [Abstraction](user/Abstraction.md)
-    * 1. [Thread](user/abstraction/Thread.md)
-    * 2. [Block](user/abstraction/Block.md)
-    * 3. [Warp](user/abstraction/Warp.md)
-    * 4. [Element](user/abstraction/Element.md)
-  * 3. [Implementation](user/Implementation.md)
-    * 1. [Library Interface](user/implementation/Library.md)
-      * 1. [Structure](user/implementation/library/Structure.md)
-      * 2. [Usage](user/implementation/library/Usage.md)
-      * 3. [Rationale](user/implementation/library/Rationale.md)
-      * 4. [Details](user/implementation/library/Details.md)
-    * 2. [Mapping onto Specific Hardware Architectures](user/implementation/Mapping.md)
-      * 1. [CUDA GPUs](user/implementation/mapping/CUDA.md)
-      * 2. [x86 CPUs](user/implementation/mapping/x86.md)
-      * 3. [Accelerators](user/implementation/mapping/Accelerators.md)
-* Developer Documentation
-  * 1. [Code Formatting](dev/style.md)
-  * 2. [Publishing Doxygen Documentation on gh-pages](dev/gh-pages.md)
diff --git a/thirdParty/cupla/alpaka/doc/markdown/dev/gh-pages.md b/thirdParty/cupla/alpaka/doc/markdown/dev/gh-pages.md
deleted file mode 100644
index e3a8159546..0000000000
--- a/thirdParty/cupla/alpaka/doc/markdown/dev/gh-pages.md
+++ /dev/null
@@ -1,38 +0,0 @@
-[:arrow_up: Up](../Index.md)
-
-Publishing doxygen documentation on gh-pages
-============================================
-
-To deploy the doxygen documentation a copy of the repository is created inside the deployed folder.
-This copy is always in the gh-pages branch consisting only of the containing files.
-This folder is ignored in all other branches.
-
-Creation of gh-pages
---------------------
-
-*NOTE:* This has already been done once and does not have to be repeated!
-
-On working branch:
-- Add deploy directory to `.gitignore` (if not already done)
-- Create the `gh-pages` branch: `git checkout --orphan gh-pages`
-- Clean the branch: `git rm -rf .`
-- Commit and push the branch: `git add --all`, `git commit -m"add gh-pages branch"`, `git push`
-
-Setup
------
-
-*NOTE:* This has to be done once per cloned alpaka repository that is used to deploy the doxygen documentation!
-
-On working branch:
-- Clone the repo on the gh-pages branch inside the deploy folder: `git clone -b gh-pages git@github.com:ComputationalRadiationPhysics/alpaka.git doc/doxygen/html`
-
-Update
-------
-
-From within `develop`/`master`: 
-- Execute doxygen
-- `cd doc/doxygen/html`
-- `git add .`
-- `git commit -m "updated doxygen documentation"`
-- `git push`
-- `cd ../../../`
diff --git a/thirdParty/cupla/alpaka/doc/markdown/dev/style.md b/thirdParty/cupla/alpaka/doc/markdown/dev/style.md
deleted file mode 100644
index 70841cf6d1..0000000000
--- a/thirdParty/cupla/alpaka/doc/markdown/dev/style.md
+++ /dev/null
@@ -1,146 +0,0 @@
-[:arrow_up: Up](../Index.md)
-
-Style
-=====
-
-Naming
-------
-
-* Types are always in PascalCase (KernlExecCuda, BufT, ...) and singular.
-* Variables are always in camelCase (memBufHost, ...) and plural for collections and singular else.
-* Namespaces are always in lowercase and singular is preferred.
-* There are no two consecutive upper case letters (AccOpenMp, HtmlRenderer, IoHandler, ...). This makes names more easily readable.
-
-
-Types
------
-
-* Always use integral types with known width (`int32_t`, `uint64_t`, ...).
-Never use `int`, `unsigned long`, etc.
-
-
-Type Qualifiers
----------------------
-
-The order of  type qualifiers should be:
-```Type const * const``` for a const pointer to a const Type.
-```Type const &``` for a reference to a const Type.
-
-The reason is that types can be read from right to left correctly without jumping back and forth.
-```const Type * const``` and ```const Type &``` would require jumping in either way to read them correctly.
-
-
-Variables
----------
-
-* Variables should always be initialized on construction because uninitialized variables can produce hard to debug errors.
-This can (nearly) always be done even in performance critical code without sacrificing speed by using a functional programming style.
-* Variables should (nearly) always be `const` to make the code easier to understand.
-This is equivalent to functional programming and the SSA (static single assignment) style used by LLVM.
-This should have no speed implication as every half baked compiler analyses the usage of variables and reuses registers.  
-* Variable definitions should be differentiated from assignments by using either `(...)` or `{...}` but never `=` for definitions. 
-Use `uint32_t const iUsageOfThisVariable(42);` instead of `uint32_t const iUsageOfThisVariable = 42;`
-
-
-Comments
---------
-
-* Always use C++-Style comments `//`
-* For types use `//#############################################################################` to start the comment block.
-* For functions use `//-----------------------------------------------------------------------------` to start the comment block.
-* Never write comments for closing braces (namespaces, classes, etc ...)
-
-
-Braces
-------
-
-* Braces (opening and closing) for classes, structs, functions, namespaces, etc. appear on a new line. Exception: If the function or class body is empty, the opening and closing braces are on the same (next) line.
-* Only braces for variable initialization can appear in-line.
-
-
-Indentation
------------
-
-* Always indent everything by *one level* (namespace body, class members, function body, ...)
-* Do not use more indentation e.g. to align function parameters.
-
-
-Spaces
-------
-
-* Trailing white-spaces are forbidden.
-* There is no space between keywords (if, for, ...) and the opening parenthesis.
-* There is no space after the opening `(` or `<` and before the closing `)` `>`.
-* There is a space before and after binary operators (=, *, +, ...)
-* There is no space after the unary operators !, ~, ...
-
-
-Functions
----------
-
-* Always use the trailing return type syntax with the return type on a new line even if the return type is void: 
-```C++
-auto func() 
--> bool
-```
-  * This makes it easier to see the return type because it is on its own line.
-  * This leads to a consistent style for constructs where there is no alternative style (lambdas, functions templates with dependent return types) and standard functions.
-
-* Each function parameter is on a new indented line:
-```C++
-auto func(
-    float f1,
-    float f2) 
--> bool
-{
-    return true;
-}
-```
-```C++
-func(
-    1.0f,
-    2.0f);
-```
-  * Makes it easier to see how many parameters there are and which position they have. 
-
-
-Templates
----------  
-
-* Template parameters are prefixed with `T` to differentiate them from class or function local typedefs.
-
-* Each template parameter is on a new indented line:
-```C++
-template<
-    typename TParam,
-    typename... TArgs>
-auto func() 
--> bool
-```
-  * Makes it easier to see how many template parameters there are and which position they have. 
-
-* Always use ```typename``` for template parameters. There is NO difference to class and typename matches the intent better.
-
-
-Traits
-------
-
-* Trait classes always have one more template parameter (with default parameter) than is required for enabling SFINAE in the specialization:
-```C++
-template<
-    typename T, 
-    typename TSfinae = void>
-struct GetOffsets;
-```
-
-* Template trait aliases always end with a `T` e.g. `BufT` while the corresponding trait ends with `Type` e.g. `BufType`
-
-* Traits for implementations always have the same name as the accessor function but in PascalCase while the member function is camelCase again: `sin(){...}` and `Sin{sin(){...}};`
-
-Includes
---------
-
-* The order of includes is from the most specialized header to the most general one.
-This order helps to find missing includes in more specialized headers because the general ones are always included afterwards.
-
-* A comment with the types or functions included by an include file makes it easier to find out why a special header is included.
diff --git a/thirdParty/cupla/alpaka/doc/markdown/user/Abstraction.md b/thirdParty/cupla/alpaka/doc/markdown/user/Abstraction.md
deleted file mode 100644
index 3ba3199449..0000000000
--- a/thirdParty/cupla/alpaka/doc/markdown/user/Abstraction.md
+++ /dev/null
@@ -1,131 +0,0 @@
-[:arrow_up: Up](../Index.md)
-
-Abstraction
-===========
-
-<!---
-Objective of the abstraction is to separate the parallelization strategy from the algorithm itself.
-Algorithm code written by users should not depend on any parallelization library or specific strategy.
-This would allow to exchange the parallelization back-end without any changes to the algorithm itself.
-Besides allowing to test different parallelization strategies this also makes it possible to port algorithms to new, yet unsupported, platforms.
--->
-
-Parallelism and memory hierarchies at all levels need to be exploited in order to achieve performance portability across various types of accelerators.
-Within this chapter an abstraction will be derived that tries to provide a maximum of parallelism while simultaneously considering implementability and applicability in hardware.
-
-Looking at the current HPC hardware landscape, we often see nodes with multiple sockets/processors extended by accelerators like GPUs or Intel Xeon Phi, each with their own processing units.
-Within a CPU or an Intel Xeon Phi there are cores with hyper-threads, vector units and a large caching infrastructure.
-Within a GPU there are many small cores and only few caches.
-Each entity in the hierarchy has access to different memories.
-For example, each socket / processor manages its RAM, while the cores additionally have non-explicit access to L3, L2 and L1 caches.
-On a GPU there are global, constant, shared and other memory types which all can be accessed explicitly.
-The interface has to abstract from these differences without sacrificing speed on any platform.
-
-A process running on a multi-socket node is the largest entity within *alpaka*.
-The abstraction is only about the task and data parallel execution on the process/node level and down.
-It does not provide any primitives for inter-node communication.
-However, such libraries can be combined with *alpaka*.
-
-An application process always has a main thread and is by definition running on the host.
-It can access the host memory and various accelerator devices.
-Such accelerators can be GPUs, Intel Xeon Phis, the host itself or other devices.
-Thus, the host does not necessarily have to be different from the accelerator device used for the computations.
-For instance, an Intel Xeon Phi simultaneously can be the host and the accelerator device.
-
-The *alpaka* library can be used to offload the parallel execution of task and data parallel work simultaneously onto different accelerator devices.
-
-Task Parallelism
-----------------
-
-One of the basic building blocks of modern applications is task parallelism.
-For example, the operating system scheduler, deciding which thread of which process gets how much processing time on which CPU core, enables task parallelism of applications.
-It controls the execution of different tasks on different processing units.
-Such task parallelism can be, for instance, the output of the progress in parallel to a download.
-This can be implemented via two threads executing two different tasks.
-
-The valid dependencies between tasks within an application can be defined as a DAG (directed acyclic graph) in all cases.
-The tasks are represented by nodes and the dependencies by edges.
-In this model, a task is ready to be executed if the number of incoming edges is zero.
-After a task has finished its work, it is removed from the graph along with all of its outgoing edges.
-This reduces the number of incoming edges of subsequent tasks.
-
-The problem with this model is the inherent overhead and the missing hardware and API support.
-When it is directly implemented as a graph, at least all depending tasks have to be updated and checked if they are ready to be executed after a task finished.
-Depending on the size of the graph and the number of edges this can be a huge overhead.
-
-*OpenCL* allows to define a task graph in a somewhat different way.
-Tasks can be enqueued into an out-of-order command queue combined with events that have to be finished before the newly enqueued task can be started.
-Tasks in the command queue with unmet dependencies are skipped and subsequent ones are executed.
-The `CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE` property of a command queue is an optional feature only supported by few vendors.
-Therefore, it can not be assumed to be available on all systems.
-
-*CUDA* on the other hand does currently (version 7.5) not support such out-of-order queues in any way.
-The user has to define dependencies explicitly through the order the tasks are enqueued into the queues (called streams in *CUDA*).
-Within a queue, tasks are always executed in sequential order, while multiple queues are executed in parallel.
-Queues can wait for events enqueued into other queues.
-
-In both APIs, *OpenCL* and *CUDA*, a task graph can be emulated by creating one queue per task and enqueuing a unique event after each task, which can be used to wait for the preceding task.
-However, this is not feasible due to the large queue and event creation costs as well as other overheads within this process.
-
-Therefore, to be compatible with a wide range of APIs, the interface for task parallelism has to be constrained.
-Instead of a general DAG, multiple queues of sequentially executed tasks will be used to describe task parallelism.
-Events that can be enqueued into the queues enhance the basic task parallelism by enabling synchronization between different queues, devices or the host threads.
-
-Data Parallelism
-----------------
-
-In contrast to task parallelism, data parallelism describes the execution of one and the same task on multiple, often related data elements.
-For example, an image color space conversion is a textbook example of a data parallel task.
-The same operation is executed independently on each pixel.
-Other data parallel algorithms additionally introduce dependencies between threads in the input-, intermediate-, or output-data.
-For example, the calculation of a brightness histogram has no input-data dependencies.
-However, all pixel brightness values finally have to be merged into a single result.
-Even these two simple examples show that it is necessary to think about the interaction of parallel entities to minimize the influence of data dependencies.
-
-Furthermore, it is necessary to respect the principles of spatial and temporal locality.
-Current hardware is built around these locality principles to reduce latency by using hierarchical memory as a trade-off between speed and hardware size.
-Multiple levels of caches, from small and very fast ones to very large and slower ones exploit temporal locality by keeping recently referenced data as close to the actual processing units as possible.
-Spatial locality in the main memory is also important for caches because they are usually divided into multiple lines that can only be exchanged one cache line at a time.
-If one data element is loaded and cached, it is highly likely that nearby elements are also cached.
-If the pixels of an image are stored row wise but are read out column wise, the spatial locality assumption of many CPUs is violated and the performance suffers.
-GPUs on the other hand do not have a large caching hierarchy but allow explicit access to a fast memory shared across multiple cores.
-Therefore, the best way to process individual data elements of a data parallel task is dependent on the data structure as well as the underlying hardware.
-
-The main part of the *alpaka* abstraction is the way it abstracts data parallelism and allows the algorithm writer to take into account the hierarchy of processing units, their data parallel features and corresponding memory regions.
-The abstraction developed is influenced by and based on the groundbreaking *CUDA* and *OpenCL* abstractions of a multidimensional grid of threads with additional hierarchy levels in between.
-Another level of parallelism is added to those abstractions to unify the data parallel capabilities of modern hardware architectures.
-The explicit access to all hierarchy levels enables the user to write code that runs performant on all current platforms.
-However, the abstraction does not try to automatically optimize memory accesses or data structures but gives the user full freedom to use data structures matching the underlying hardware preferences.
-
-The individual levels are explained on the following pages:
-
-1. [Thread](abstraction/Thread.md)
-2. [Block](abstraction/Block.md)
-3. [Warp](abstraction/Warp.md)
-4. [Element](abstraction/Element.md)
-
-Summary
--------
-
-This abstraction is called *Redundant Hierarchical Parallelism*.
-This term is inspired by the paper *The Future of Accelerator Programming: Abstraction, Performance or Can We Have Both?* [PDF](http://olab.is.s.u-tokyo.ac.jp/~kamil.rocki/rocki_burtscher_sac14.pdf) [DOI](http://dx.doi.org/10.1109/ICPADS.2013.76).
-It investigates a similar *concept of copious parallel programming* reaching 80%-90% of the native performance while comparing CPU and GPU centric versions of an *OpenCL* n-body simulation with a general version utilizing parallelism on multiple hierarchy levels.
-
-The *CUDA* or *OpenCL* abstractions themselves are very similar to the one designed in the previous sections and consist of all but the Element level.
-However, as has been shown, all five abstraction hierarchy levels are necessary to fully utilize current architectures.
-By emulating unsupported or ignoring redundant levels of parallelism, algorithms written with this abstraction can always be mapped optimally to all supported accelerators. The following table summarizes the characteristics of the proposed hierarchy levels.
-
-| Hierarchy Level | Parallelism | Synchronizable |
-| --- | --- | --- |
-| grid | sequential / parallel | :x: / :white_check_mark: |
-| block | parallel | :x: |
-| warp | parallel | :white_check_mark: |
-| thread | parallel / lock-step| :white_check_mark: |
-| element | sequential | :x: |
-
-Depending on the queue a task is enqueued into, grids will either run in sequential order within the same queue or in parallel in different queues.
-They can be synchronized by using events.
-Blocks can not be synchronized and therefore can use the whole spectrum of parallelism ranging from fully parallel up to fully sequential execution depending on the device.
-Warps combine the execution of multiple threads in lock-step and can be synchronized implicitly by synchronizing the threads they contain.
-Threads within a block are executed in parallel warps and each thread computes a number of data elements sequentially.
-
diff --git a/thirdParty/cupla/alpaka/doc/markdown/user/Implementation.md b/thirdParty/cupla/alpaka/doc/markdown/user/Implementation.md
deleted file mode 100644
index 6dd7824082..0000000000
--- a/thirdParty/cupla/alpaka/doc/markdown/user/Implementation.md
+++ /dev/null
@@ -1,10 +0,0 @@
-[:arrow_up: Up](../Index.md)
-
-Implementation
-==============
-
-The implementation of the library in C++, especially the way C++11 allows to define the abstract concepts and to take advantage of the zero-overhead compile-time polymorphism is explained in this section.
-Furthermore, it is described how the abstraction can be mapped to real devices.
-
-1. [Library Interface](implementation/Library.md)
-2. [Mapping onto Specific Hardware Architectures](implementation/Mapping.md)
diff --git a/thirdParty/cupla/alpaka/doc/markdown/user/Introduction.md b/thirdParty/cupla/alpaka/doc/markdown/user/Introduction.md
deleted file mode 100644
index 87078731ad..0000000000
--- a/thirdParty/cupla/alpaka/doc/markdown/user/Introduction.md
+++ /dev/null
@@ -1,326 +0,0 @@
-[:arrow_up: Up](../Index.md)
-
-Introduction
-============
-
-The *alpaka* library defines and implements an abstract interface for the *hierarchical redundant parallelism* model.
-This model exploits task- and data-parallelism as well as memory hierarchies at all levels of current multi-core architectures.
-This allows to achieve portability of performant codes across various types of accelerators by ignoring specific unsupported levels and utilizing only the ones supported on a specific accelerator.
-All hardware types (multi- and many-core CPUs, GPUs and other accelerators) are treated and can be programmed in the same way.
-The *alpaka* library provides back-ends for *CUDA*, *OpenMP*, *Boost.Fiber* and other methods.
-The policy-based C++ template interface provided allows for straightforward user-defined extension of the library to support other accelerators.
-
-The library name *alpaka* is an acronym standing for **A**bstraction **L**ibrary for **Pa**rallel **K**ernel **A**cceleration.
-
-
-Motivation
-----------
-
-What scales well on current hardware does not necessarily scale well on future architectures.
-The hardware landscape is always changing.
-In the past the big clusters have been CPU only.
-Today we see a change to accelerator supported computing.
-For example, GPUs, Intel Xeon Phis or other special purpose extension cards are extensively used.
-It is unpredictable what the next big step will be and what the Exaflop hardware will look like.
-It is not clear that GPUs will always be the best platform.
-Nevertheless, the underlying physical algorithms as well as the need for heterogeneous architectures will not change.
-
-Current highly parallel GPUs are optimized for throughput and hide latency and data dependencies by always keeping a ready pool of work.
-This allows to sustain the performance at a high percent of peak.
-CPUs in turn are designed to optimize the execution time of a single thread.
-Features like branch prediction, speculative execution, register renaming and many more *[...] would cost far too much energy to be replicated for thousands of parallel GPU threads but [...] are entirely appropriate for CPUs.* ([State-of-the-art in Heterogeneous Computing](http://dx.doi.org/10.1155/2010/540159))
-Even more specialized architectures will appear and find their way into HPC.
-
-*The essence of the heterogeneous computing model is that one size does not fit all. Parallel and serial segments of the workload execute on the best-suited processor delivering faster overall performance, greater efficiency, and lower energy and cost per unit of computation.* ([State-of-the-art in Heterogeneous Computing](http://dx.doi.org/10.1155/2010/540159))
-
-New hardware will not only allow to execute faster or calculate more but will furthermore enable the usage of new algorithms for more precise simulations.
-For example, some tasks may require random searches for only a few values in a lookup table of up to hundreds of gigabytes.
-This would be a perfect fit for CPUs, while the rest of the simulation would still be running on the GPUs.
-With new hardware bringing those two worlds closer together, exploiting the heterogeneous hardware with heterogeneous algorithms will likely be the way to go in the future.
-Being able to express both of those parallel tasks in the same way would greatly enhance the productivity of the programmer and the clarity of the code.
-
-Porting a complicated simulation code from *CUDA* to x86 and possibly to other hardware architectures is a non-trivial task.
-A lot of developer time could be saved if this task would not have to be done repeatedly for every new hardware, but rather only once.
-Therefore, *alpaka* tries to solve the problems in porting highly scalable simulation codes on various multi-core architectures.
-
-
-Problems in Porting Performant HPC Codes
-----------
-
-Porting a highly performant code to a new architecture is a non-trivial task that poses many problems.
-Often it is a requirement to keep the simulation operative on the previous platform as well.
-This means that multiple hardware platforms have to be supported simultaneously.
-A great number of projects take the route that seems easiest at first and simply duplicate all the parallel algorithms and port them to the new back-end.
-All the specific API functions that have been used have to be supplemented by their new counterparts, possibly guarded by preprocessor macros to switch between the old and the new version.
-A switch of the back-end used in a simulation, for example, from *OpenMP* to *CUDA* often requires a near rewrite.
-Each newly supported platform would have to duplicate the API specific kernel and invocation code lines.
-
-The following paragraphs will summarize problems that arise when performant HPC codes have to be ported:
-
-### Sustainability
-Because the underlying HPC hardware is constantly changing, every new generation will require an adaption of the simulation.
-Even to deliver the performance reached on previous architectures is a tough task for programmers.
-Furthermore, nobody can guarantee the lifespan of the parallelization technique used.
-*OpenMP*, *CUDA*, *OpenACC* and all the other possibilities could be discontinued or get deprecated for any reason at any time.
-Therefore, an abstract interface is required that hides the particular back-end and allows to port the interface implementation and not the application using the interface itself.
-
-### Heterogeneity
-Some parts of a simulation perfectly map to current GPUs while other parts are better computed on CPUs or other accelerators.
-Furthermore, by letting one part of the heterogeneous cluster hardware idle, a lot of computing power is wasted.
-It is essential, especially for future architectures, that those resources are utilized to reach the peak performance of the systems.
-This heterogeneous work division not only depends on the architecture but also on the number of available hardware resources, the workload and many other factors.
-Therefore, to reach good scaling across a multitude of systems, it is necessary to be able to dynamically decide where to execute which part of the simulation either at make-time, compile-time or at run-time.
-Currently this requires to duplicate the kernels and write specific implementations per back-end.
-Many projects only allow to switch the back-end of the whole simulation at once or possibly even per kernel at make-time.
-This will not be enough on future architectures where the ability to mix the back-ends is required to optimally utilize different cluster architectures or to dynamically load balance tasks across a diverse set of (possibly failing) accelerator devices.
-Therefore, an abstract interface unifying the abilities of all the back-ends is required to let the application express parallelism of the different back-ends in a unified algorithm that can then be mapped to the device currently in use.
-
-### Maintainability
-Looking at the software engineering aspects, duplication is a bad solution because this leads to maintainability issues.
-In many projects such copies result in a large growth in the number of lines of code while only minimal new functionality is implemented.
-Most of the new code only executes things that have already been implemented for the initial platform.
-Developers having to change one of the algorithms additionally have to change all duplicates for all other back-ends.
-Depending on the similarity of the implementations, this can result in a doubling / multiplication of developer efforts in the worst-case scenario.
-Especially for open-source projects that rely on contributions from the community this raises the hurdle for new developers because they have to know not only one, but multiple different parallelization libraries.
-In the end good maintainability is what keeps a software project alive and what ensures a steady development progress.
-Therefore, an interface hiding the differences between all the back-ends is required to let the application express parallelism in a unified algorithm.
-
-### Testability
-Code duplication, being the easiest way to port a simulation, exacerbates testing.
-Each new kernel has to be tested separately because different bugs could have been introduced into the distinct implementations.
-If the versions can be mixed, it is even harder because all combinations have to be tested.
-Often the tests (continuous integration tests, unit tests, etc.) have to run on a special testing hardware or on the production systems due to the reliance on the availability of special accelerators.
-For example, *CUDA* compile tests are possible without appropriate hardware but it is not feasible to execute even simple runtime tests due to the missing CPU emulation support.
-An interface allowing to switch between acceleration back-ends, which are tested for compatibility among each other, enables easy testing on development and test systems.
-
-### Optimizability
-Even if the simulation code has encapsulated the APIs used, the optimal way to write performant algorithms often differs between distinct parallelization frameworks.
-It is necessary to allow the user to fine-tune the algorithm to run optimally on each different accelerator device by compile time specialization or policy based abstractions without the need to duplicate the kernel.
-Within the kernel there has to be knowledge about the underlying platform to adaptively use data structures that map optimally onto the current architecture.
-To ease this optimization work, libraries with data structures, communication patterns and other things hiding the differences between back-ends have to be implemented.
-This would allow to optimize the interface implementation and not the simulation itself.
-
-In summary, it can be stated that all the portability problems of current HPC codes could be solved by introducing an abstract interface that hides the particular back-end implementations and unifies the way to access the parallelism available on modern many-core architectures.
-
-
-Similar Projects
-----------------
-
-There are multiple other libraries targeting the (portable) parallel task execution within nodes.
-Some of them require language extensions, others claim to achieve full performance portability across a multitude of devices.
-But none of these libraries can provide full control over the (possibly diverse) underlying hardware while being only minimally invasive.
-There is always a productivity-performance trade-off.
-
-Furthermore, many of the libraries do not satisfy the requirement for full single-source C++ support.
-This is essential because many simulation codes heavily rely on template meta-programming for method specialization and compile time optimizations.
-
-
-### CUDA - Compute Unified Device Architecture
-
-*CUDA* is a parallel computing platform and programming model developed by *NVIDIA*.
-It is used in science and research as well as in consumer software to compute highly parallel workloads on GPUs starting from image and video editing up to simulations on high-performance computers.
-Such usage of graphics processing units not only for computer graphics, but also for tasks that have traditionally been handled by the CPU is called GPGPU (general-purpose computing on graphics processing units).
-A disadvantage of *CUDA* is that its application is bound to the usage of *NVIDIA* GPUs.
-Currently no other vendors provide accelerators that support *CUDA*.
-Additionally there is no supported free emulator allowing to execute *CUDA* code on CPUs.
-
-The *CUDA* API is a higher level part of the programming model which allows to access and execute code on GPUs from multiple host languages including C++.
-The *CUDA* C/C++ language on the other hand is a mid level construct based on standard C++ with some extensions for accelerator programming and limitations in the supported constructs.
-For example, throwing and catching exceptions as well as run-time type information (RTTI) are not supported.
-*CUDA* C/C++ is compiled to a low level virtual instruction set called PTX (Parallel Thread Execution).
-The PTX code is later compiled to assembler code by the GPU driver.
-
-*NVIDIA* provides an extended C++ compiler based on the LLVM clang compiler called nvcc that allows to mix host C++ code using the *CUDA* API with *CUDA* C/C++.
-The host part of the C++ code is compiled by the respective host system compiler (gcc, icc, clang, MSVC) while the GPU device code is separately compiled to PTX.
-After the compilation steps both binaries are linked together to form the final assembly.
-
-*CUDA* defines a heterogeneous programming model where tasks are offloaded from the host CPU to the device GPU.
-Functions that should be offloaded to the GPU are called kernels.
-As can be seen in the figure below a grid of such kernels is executed in parallel by multiple threads organized in blocks.
-Threads within a block can synchronize, while blocks are executed independently and possibly in sequential order depending on the underlying hardware.
-![grid-of-thread-blocks](https://docs.nvidia.com/cuda/cuda-c-programming-guide/graphics/grid-of-thread-blocks.png)
-
-The global device memory is the slowest but largest memory accessible by all threads.
-It can be accessed from host code via methods provided by the *CUDA* API.
-Global memory is persistent across kernel invocations.
-Threads within a block can communicate through a fast but small shared memory.
-Each thread has a set of very low latency registers similar to CPU threads.
-Additionally there are special purpose memory sections for constant and texture data.
-
-The *CUDA* C/C++ language gives full control over memory, caches and the execution of kernels.
-
-
-### [PGI CUDA-X86](https://www.pgroup.com/resources/cuda-x86.htm)
-is a compiler technology that allows to generate x86-64 binary code from *CUDA* C/C++ applications using the *CUDA Runtime API* but does not support the *CUDA Driver API*.
-At run-time *CUDA* C programs compiled for x86 execute each *CUDA* thread block using a single host core, eliminating synchronization where possible.
-Multiple kernel threads are combined to be executed together via the CPU's SIMD (Single Instruction Multiple Data) capabilities for vectorized execution.
-The *PGI Unified Binary technology* allows to create a single binary that uses *NVIDIA* GPUs when available, or runs on multi-core CPUs otherwise.
-The compiler is not always up-to-date with the latest *CUDA* versions and is not available for free.
-Furthermore, the compiler seems not to be developed actively since *NVIDIA* acquired *PGI* in 2013.
-Since 2012 no news has been published and nothing could be found in the yearly release notes of the *PGI* compiler suite.
-
-
-### [GPU Ocelot](http://gpuocelot.gatech.edu/)
-<!--- https://github.com/gtcasl/gpuocelot --->
-is an open-source dynamic JIT compilation framework.
-It allows to execute native *CUDA* binaries by dynamically translating the *NVIDIA PTX* virtual instruction set architecture to other instruction sets.
-It supports *NVIDIA* and *AMD* GPUs as well as multicore CPUs via a PTX to LLVM (Low Level Virtual Machine) translator.
-The project is not in active development anymore.
-It only supports PTX up to version 3.1 (current version is 5.0).
-
-
-### [OpenMP](http://openmp.org//)
-is an open specification for vendor agnostic shared memory parallelization.
-By adding annotations (pragmas in C/C++) to loops or regions, it allows to easily parallelize existing sequential C/C++/Fortran code in an incremental manner.
-Due to the nature of pragmas, these hints are ignored if the compiler does not support them or thinks they are inappropriate.
-This allows those programs to be compiled as sequential or parallel versions by only changing a compiler flag.
-In C/C++ the syntax for *OpenMP* directives is `#pragma omp` followed by multiple clauses.
-For example, with the directive `#pragma omp parallel for`, the compiler will automatically distribute the iterations of the directly following loop across the available cores.
-*OpenMP* 4.0 introduced support for offloading computations to accelerator devices, substantially improved the task support and extended the SIMD capabilities.
-By embedding code within a `#pragma omp target` block, the contained code will be executed on the selected device.
-*OpenMP* 4.0 is missing the ability for unstructured data movement and only implements structured data movement from and to devices.
-The compiler directive `#pragma omp target data map(...) ...` at the beginning of a code block will define which data is copied to, copied back from and is created on the device.
-At the end of the code block the memory is copied back or gets deleted.
-There is no way to allocate device memory that is persistent between kernel calls in different methods because it is not possible to create a device data region spanning both functions in the general case.
-*OpenMP* 4.1, expected for the end of 2015, is likely to introduce `#pragma omp target enter data`, `#pragma omp target exit data` and other unstructured data movement directives that allow to pass and obtain pointers of already resident memory to and from offloaded kernels.
-Currently *OpenMP* does not provide a way to control the hierarchical memory because its main assumption is a shared memory for all threads.
-Therefore, the block shared memory on *CUDA* devices can not be explicitly utilized.
-
-
-### [OpenACC](http://www.openacc-standard.org/)
-is a pragma based programming standard for heterogeneous computing.
-It is very similar to *OpenMP* and provides annotations for parallel execution and data movement as well as run-time functions for accelerator and device management.
-In contrast to *OpenMP* it allows limited access to *CUDA* block shared memory.
-Current compiler implementations support *NVIDIA*, *AMD* and *Intel* accelerators.
-Only as of *OpenACC* 2.0 explicit memory management and tiling is supported.
-*OpenACC* does not support dynamic allocation of memory (`new`, `delete`) in kernel code.
-It is aimed to be fully merged with *OpenMP* at some point, but for now *OpenMP* 4.0 only introduced some parts of it.
-
-
-### [OpenCL](https://www.khronos.org/opencl/)
-is a programming framework for heterogeneous platforms.
-It is fully hardware independent and can utilize CPUs and GPUs of nearly all vendors.
-This is achieved by compiling the *OpenCL* kernel code (or the standardized *SPIR* intermediate representation) at run-time by the platform driver into the native instruction set.
-Versions prior to 2.1 (released in March 2015) did only support a C-like kernel language.
-Version 2.1 introduced a subset of C++14.
-*OpenCL* does not support single-source programming (combining C++ host code and accelerator code in a single file).
-This is a precondition for templated kernels which are required for policy based generic programming.
-It is necessary to note that *NVIDIA* seems to neglect their *OpenCL* implementation.
-Support for version 1.2 was only added in April 2015, three and a half years after the publication of the standard.
-*OpenCL* does not support dynamic allocation of memory (`new`, `delete`) in kernel code.
-
-
-### [SYCL](https://www.khronos.org/sycl/)
-is a cross-platform abstraction layer based on *OpenCL*.
-The main advantage over *OpenCL* itself is that it allows to write single-source heterogeneous programs.
-It enables the usage of a single C++ template function for host and device code.
-As of now there is no usable free compiler implementation available that has good support for multiple accelerator devices.
-
-
-### [C++ AMP (Accelerated Massive Parallelism)](https://msdn.microsoft.com/en-us/library/hh265136.aspx)
-is an open specification from *Microsoft* currently implemented on top of *DirectX 11*.
-It is a language extension requiring compiler support that allows to annotate C++ code that can then be run on multiple accelerators.
-*C++ AMP* requires the usage of the `array` data structure or the `array_view` wrapper responsible for copying data to and from the accelerator devices.
-The `parallel_for_each` function is responsible for offloading the provided function object whose `operator()` has to be annotated with `restrict(amp)`.
-The threads can access shared memory and synchronize.
-The range of supported accelerator devices, platforms and compilers is currently very limited.
-
-
-### [KOKKOS](https://github.com/kokkos)
-<!---
-https://www.xsede.org/documents/271087/586927/Edwards-2013-XSCALE13-Kokkos.pdf
-http://trilinos.org/oldsite/events/trilinos_user_group_2013/presentations/2013-11-TUG-Kokkos-Tutorial.pdf
-http://on-demand.gputechconf.com/supercomputing/2013/presentation/SC3103\_Towards-Performance-Portable-Applications-Kokkos.pdf
-http://dx.doi.org/10.3233/SPR-2012-0343
---->
-provides an abstract interface for portable, performant shared memory-programming.
-It is a C++ library that offers `parallel_for`, `parallel_reduce` and similar functions for describing the pattern of the parallel tasks.
-The execution policy determines how the threads are executed.
-For example, this influences the sizes of blocks of threads or if static or dynamic scheduling should be used.
-The library abstracts the kernel as a function object that can not have any user defined parameters for its `operator()`.
-Inconveniently, arguments have to be stored in members of the function object coupling algorithm and data together.
-*KOKKOS* provides both, abstractions for parallel execution of code and data management.
-Multidimensional arrays with a neutral indexing and an architecture dependent layout are available, which can be used, for example, to abstract the underlying hardware's preferred memory access scheme that could be row-major, column-major or even blocked.
-
-
-### [Thrust](https://thrust.github.io/)
-is a parallel algorithms library resembling the C++ Standard Template Library (STL).
-It allows to select either the *CUDA*, *TBB* or *OpenMP* back-end at make-time.
-Because it is based on generic `host_vector` and `device_vector` container objects, it is tightly coupling the data structure and the parallelization strategy.
-There exist many similar libraries such as [ArrayFire](http://www.arrayfire.com/) (*CUDA*, *OpenCL*, native C++), [VexCL](https://github.com/ddemidov/vexcl/) (*OpenCL*, *CUDA*), [ViennaCL](http://viennacl.sourceforge.net/) (*OpenCL*, *CUDA*, *OpenMP*) and [hemi](https://github.com/harrism/hemi/) (*CUDA*, native C++).
-
-<!---
-Phalanx
-See [here](http://www.mgarland.org/files/papers/phalanx-sc12-preprint.pdf).
-It is very similar to *alpaka* in the way it abstracts the accelerators.
-C++ Interface provides CUDA, OpenMP, and GASNet back-ends
-
-Aura
-
-Intel TBB
-
-U\PC++
---->
-
-Distinction of the *alpaka* Library
-------------------------------------------
-
-In the section about the problems we saw that all portability problems of current HPC codes could be solved with an abstract interface unifying the underlying accelerator back-ends.
-The previous section showed that there is currently no project available that could solve all of the problems highlighted.
-The C++ interface library proposed to solve all those problems is called *alpaka*.
-The subsequent enumeration will summarize the purpose of the library:
-
-### *alpaka* is ...
-* an **abstract interface** describing parallel execution on multiple hierarchy levels. It allows to implement a mapping to various hardware architectures but **is no optimal mapping itself**.
-
-* sustainably solving portability (50% on the way to reach full performance portability)
-
-* solving the **heterogeneity** problem. An identical algorithm / kernel can be executed on heterogeneous parallel systems by selecting the target device.
-
-* reducing the **maintainability** burden by not requiring to duplicate all the parts of the simulation that are directly facing the parallelization framework. Instead, it allows to provide a single version of the algorithm / kernel that can be used by all back-ends. All the accelerator dependent implementation details are hidden within the *alpaka* library.
-
-* simplifying the **testability** by enabling **easy back-end switching**. No special hardware is required for testing the kernels. Even if the simulation itself will always use the *CUDA* back-end, the tests can completely run on a CPU. As long as the *alpaka* library is thoroughly tested for compatibility between the acceleration back-ends, the user simulation code is guaranteed to generate identical results (ignoring rounding errors / non-determinism) and is portable without any changes.
-
-* **optimizable**. Everything in *alpaka* can be replaced by user code to optimize for special use-cases.
-
-* **extensible**. Every concept described by the *alpaka* abstraction can be implemented by users. Therefore it is possible to non-intrusively define new devices, queues, buffer types or even whole accelerator back-ends.
-
-* **data structure agnostic**. The user can use and define arbitrary data structures.
-
-### *alpaka* is not ...
-
-* an automatically **optimal mapping** of algorithms / kernels to various acceleration platforms. Except in trivial examples an optimal execution always depends on suitably selected data structures. An adaptive selection of data structures is a separate topic that has to be implemented in a distinct library.
-
-* automatically **optimizing concurrent data accesses**.
-
-* **handling** or hiding differences in arithmetic operations. For example, due to **different rounding** or different implementations of floating point operations, results can differ slightly between accelerators.
-
-* **guaranteeing any determinism** of results. Due to the freedom of the library to reorder or repartition the threads within the tasks it is not possible or even desired to preserve deterministic results. For example, the non-associativity of floating point operations gives non-deterministic results within and across accelerators.
-
-The *alpaka* library is aimed at parallelization within nodes of a cluster.
-It does not compete with libraries for distribution of processes across nodes and communication among those.
-For these purposes libraries like MPI (Message Passing Interface) or others should be used.
-MPI is situated one layer higher and can be combined with *alpaka* to utilize the hardware of a whole heterogeneous cluster.
-The *alpaka* library can be used for parallelization within nodes, MPI for parallelization across nodes.
-
-
-Comparison
-----------
-
-The following table summarizes which of the problems mentioned in the section about the problems can be solved by current intra-node parallelization frameworks and the proof-of-concept *alpaka* abstraction library.
-
-| Framework / API | Open-Source | Free | Single-Source C++ | Portability | Heterogeneity | Maintainability | Testability | Optimizability | Data structure agnostic |
-| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
-| CUDA			| :x:              	| :white_check_mark: | :white_check_mark: | :x:               | :x:               | :x:               | :x:               | :white_check_mark: | :white_check_mark: |
-| PGI CUDA-x86	| :x:             	| :x:               | :white_check_mark: | :white_check_mark: | :large_orange_diamond: | :white_check_mark: | :white_check_mark: | :x:               | :white_check_mark: |
-| GPU Ocelot		| :white_check_mark:	| :white_check_mark: | :white_check_mark: | :white_check_mark: | :large_orange_diamond: | :white_check_mark: | :white_check_mark: | :x:               | :white_check_mark: |
-| OpenMP			| :white_check_mark:	| :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :x:               | :white_check_mark: |
-| OpenACC			| :white_check_mark:	| :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :x:               | :white_check_mark: |
-| OpenCL			| :white_check_mark:	| :white_check_mark: | :x:               | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :x:               | :white_check_mark: |
-| SYCL			| :white_check_mark:	| (:ballot_box_with_check:) | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | (:ballot_box_with_check:) | :white_check_mark: |
-| C++AMP			| :white_check_mark:	| :white_check_mark: | :white_check_mark: | (:ballot_box_with_check:) | :white_check_mark: | :white_check_mark: | :white_check_mark: | :x:               | :white_check_mark: |
-| KOKKOS			| :white_check_mark:	| :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :x:               | :large_orange_diamond: |
-| Thrust			| :white_check_mark:	| :white_check_mark: | :white_check_mark: | :white_check_mark: | :large_orange_diamond: | :white_check_mark: | :white_check_mark: | :x:               | :x:               |
-| **alpaka**			| :white_check_mark:	| :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: |
-
-Properties of intra-node parallelization frameworks and their ability to solve the problems in porting performant HPC codes. :white_check_mark: : yes / fully solved, :large_orange_diamond: : partially solved, :x: : no / not solved
diff --git a/thirdParty/cupla/alpaka/doc/markdown/user/abstraction/Block.md b/thirdParty/cupla/alpaka/doc/markdown/user/abstraction/Block.md
deleted file mode 100644
index db4db63188..0000000000
--- a/thirdParty/cupla/alpaka/doc/markdown/user/abstraction/Block.md
+++ /dev/null
@@ -1,34 +0,0 @@
-[:arrow_up: Up](../Abstraction.md)
-
-Block
-=====
-
-Building a processor with possibly thousands of cores where all cores have an equal length connection for fast communication and synchronization is not viable.
-Either the processor size would have to grow exponentially with the number of cores or the all-to-all communication speed would decrease so much that computations on the processor would be impractical.
-Therefore, the communication and synchronization of threads has to be limited to sizes manageable by real hardware.
-
-The first figure below depicts the solution of introducing a new hierarchy level in the abstraction.
-A hypothetical processor is allowed to provide synchronization and fast communication within blocks of threads but is not required to provide synchronization across blocks.
-The whole grid is subdivided into equal sized blocks with a fast but small shared memory.
-Current accelerator abstractions (*CUDA* and *OpenCL*) only support equal sized blocks.
-This restriction could possibly be lifted to support future accelerators with heterogeneous block sizes.
-![block](block/block.png)
-
-There is another reason why independent blocks are necessary.
-Threads that can communicate and synchronize require either a one-to-one mapping of threads to cores, which is impossible because the number of data elements is theoretically unlimited, or at least a space to store the state of each thread.
-Even old single core CPUs were able to execute many communicating and synchronizing threads by using cooperative or preemptive multitasking.
-Therefore, one might think that a single core would be enough to execute all the data parallel threads.
-But the problem is that even storing the set of registers and local data of all the possible millions of threads of a task grid is not always viable.
-The blocking scheme solves this by enabling fast interaction of threads on a local scale but additionally removes the necessity to store the state of all threads in the grid at once because only threads within a block must be executed in parallel.
-Within a block of cores there still has to be enough memory to store all registers of all contained threads.
-The independence of blocks allows applications to scale well across diverse devices.
-As can be seen in the following figure, the accelerator can assign blocks of the task grid to blocks of cores in arbitrary order depending on availability and workload.
-![block_scale](block/block_scale.png)
-
-Shared Memory
--------------
-
-Each block has its own shared memory.
-This memory can only be accessed explicitly by threads within the same block and gets discarded after the complete block finished its calculation.
-This memory is typically very fast but also very small.
-No variables are shared between kernels by default.
diff --git a/thirdParty/cupla/alpaka/doc/markdown/user/abstraction/Element.md b/thirdParty/cupla/alpaka/doc/markdown/user/abstraction/Element.md
deleted file mode 100644
index d89c0b7e19..0000000000
--- a/thirdParty/cupla/alpaka/doc/markdown/user/abstraction/Element.md
+++ /dev/null
@@ -1,42 +0,0 @@
-[:arrow_up: Up](../Abstraction.md)
-
-Element
-=======
-
-To use the maximum available computing power of, for example, a modern x86 processor, the computation has to utilize the SIMD vector registers.
-Many current architectures support issuing a single instruction that can be applied to multiple data elements in parallel.
-
-The original x86 instruction set architecture did not support SIMD instructions but has been enhanced with MMX (64 bit width registers), SSE (128 bit width registers), AVX (256 bit width registers) and AVX-512 (512 bit width registers) extensions.
-To varying degrees, they allow processing multiple 32 bit and 64 bit floating point numbers as well as 8, 16, 32 and 64 bit signed and unsigned integers.
-
-*CUDA* capable GPUs do not have vector registers where multiple values of type `float` or `double` can be manipulated by one instruction.
-Nevertheless, newer *CUDA* capable devices implement basic SIMD instructions on pairs of 16-bit values and quads of 8-bit values.
-They are described in the documentation of the [PTX instruction set architecture](http://docs.nvidia.com/cuda/parallel-thread-execution/index.html#axzz4OTzGGwcJ) chapter 8.7.13 but are only of any use in very special problem domains, for example for deep learning.
-
-It would be optimal if the compiler could automatically vectorize our kernels when they are called in a loop and vectorization is supported by the underlying accelerator.
-However, apart from full-blown vector processors, mainstream CPUs do not support predicated execution or similar complex things within vector registers.
-At most, there is support for masking operations which allow emulating at least some conditional branching.
-Therefore, this missing hardware capability has to be circumvented by the compiler.
-There are scientific research projects such as the work done by Ralf Karrenberg et al [1](http://www.cdl.uni-saarland.de/publications/theses/karrenberg_msc.pdf) [2](http://www.cdl.uni-saarland.de/projects/wfv/wfv_cgo11_slides.pdf) [3](http://www.cdl.uni-saarland.de/papers/karrenberg_opencl.pdf) building on the *LLVM* compiler infrastructure supporting such whole-function vectorization.
-However, current mainstream compilers do not support automatic vectorization of basic, non-trivial loops containing control flow statements (`if`, `else`, `for`, etc.) or other non-trivial memory operations.
-Therefore, it has to be made easier for the compiler to recognize the vectorization possibilities by making it more explicit.
-
-The opposite of automatic whole function vectorization is the fully explicit vectorization of expressions via compiler intrinsics directly resulting in the desired assembly instruction.
-A big problem when trying to utilize fully explicit vectorization is that there is no common foundation supported by all explicit vectorization methods.
-A wrapper unifying the x86 SIMD intrinsics found in the `intrin.h` or `x86intrin.h` headers with those supported on other platforms, for example ARM NEON (`arm_neon.h`), PowerPC Altivec (`altivec.h`) or *CUDA* is not available and to write one is a huge task in itself.
-However, if such a wrapper were to become available in the future, it could easily be integrated into *alpaka* kernels.
-
-Due to current compilers being unable to vectorize whole functions and the explicit vectorization intrinsics not being portable, one has to rely on the vectorization capabilities of current compilers for primitive loops only consisting of a few computations.
-By creating a grid of data elements, where multiple elements are processed per thread and threads are pooled in independent blocks, as it is shown in the figure below, the user is free to loop sequentially over the elements or to use vectorization for selected expressions within the kernel.
-Even the sequential processing of multiple elements per thread can be useful depending on the architecture.
-For example, the *NVIDIA cuBLAS* general matrix-matrix multiplication (GEMM) internally executes only one thread for each second matrix data element to better utilize the registers available per thread.
-![element](element/element.png)
-
-<!---
-The best solution to vectorization would be one, where the user does not have to do anything.
-This is not possible because the smallest unit supplied by the user is a kernel which is executed in threads which can synchronize.
-
-It is not possible to execute multiple kernels sequentially to hide the vectorization by starting a kernel-thread for e.g. each 4th thread in a block and then looping over the 4 entries.
-This would prohibit the synchronization between these threads.
-By executing 4 fibers inside such a vectorization kernel-thread we would allow synchronization again but prevent the loop vectorizer from working.
---->
\ No newline at end of file
diff --git a/thirdParty/cupla/alpaka/doc/markdown/user/abstraction/Thread.md b/thirdParty/cupla/alpaka/doc/markdown/user/abstraction/Thread.md
deleted file mode 100644
index a684001f10..0000000000
--- a/thirdParty/cupla/alpaka/doc/markdown/user/abstraction/Thread.md
+++ /dev/null
@@ -1,38 +0,0 @@
-[:arrow_up: Up](../Abstraction.md)
-
-Thread
-======
-
-Theoretically, a basic data parallel task can be executed optimally by executing one thread per independent data element.
-In this context, the term thread does not correspond to a native kernel-thread, an *OpenMP* thread, a *CUDA* thread, a user-level thread or any other such threading variant.
-It only represents the execution of a sequence of commands forming the desired algorithm on a per data element level.
-This ideal one-to-one mapping of data elements to threads leads to the execution of a multidimensional grid of threads corresponding to the data structure of the underlying problem.
-The uniform function executed by each of the threads is called a kernel.
-Some algorithms such as reductions require the possibility to synchronize or communicate between threads to calculate a correct result in a time optimal manner.
-Therefore our basic abstraction requires an n-dimensional grid of synchronizable threads each executing the same kernel.
-The following figure shows a hypothetical processing unit that could optimally execute this data parallel task.
-The threads are mapped one-to-one to the cores of the processor.
-For a time optimal execution, the cores have to have an all-to-all equal length connection for communication and synchronization.
-![thread](thread/thread.png)
-
-The only difference between the threads is their positional index into the grid which allows each thread to compute a different part of the solution.
-Threads can always access their private registers and the global memory.
-
-Registers
----------
-
-All variables with default scope within a kernel are automatically saved in registers and are not shared automatically.
-This memory is local to each thread and can not be accessed by other threads.
-
-Global Memory
--------------
-
-The global memory can be accessed from every thread in the grid as well as from the host thread.
-This is typically the largest but also the slowest memory available.
-
-Individual threads within the grid are allowed to statically or dynamically allocate buffers in the global memory.
-
-Prior to the execution of a task, the host thread copies the input buffers and allocates the output buffers onto the accelerator device.
-Pointers to these buffers then can be given as arguments to the task invocation.
-By using the index of each thread within the grid, the offset into the global input and output buffers can be calculated.
-After the computation has finished, the output buffer can be used either as input to a subsequent task or can be copied back to the host.
diff --git a/thirdParty/cupla/alpaka/doc/markdown/user/abstraction/Warp.md b/thirdParty/cupla/alpaka/doc/markdown/user/abstraction/Warp.md
deleted file mode 100644
index c5b05df768..0000000000
--- a/thirdParty/cupla/alpaka/doc/markdown/user/abstraction/Warp.md
+++ /dev/null
@@ -1,29 +0,0 @@
-[:arrow_up: Up](../Abstraction.md)
-
-Warp
-====
-
-With the current abstraction only independent parallelism via blocks and synchronizable parallelism via threads can be expressed.
-However, there are more variants of parallelism in real hardware.
-Because all threads in the grid are executing the same kernel and even the same instruction at the same time when ignoring divergent control flows, a lot of chip space can be saved.
-Multiple threads can be executed in perfect synchronicity, which is also called lock-step.
-A group of such threads executing the same instruction at the same time is called a warp.
-All threads within a warp share a single instruction pointer (IP), and all cores executing the threads share one instruction fetch (IF) and instruction decode (ID) unit.
-![warp](warp/warp.png)
-
-Even threads with divergent control flows can be executed within one warp.
-*CUDA*, for example, solves this by supporting predicated execution and warp voting.
-For long conditional branches the compiler inserts code which checks if all threads in the warp take the same branch.
-For small branches, where this is too expensive, all threads always execute all branches.
-Control flow statements result in a predicate and only in those threads where it is true, the predicated instructions will have an effect.
-
-Not only *CUDA* GPUs support the execution of multiple threads in a warp.
-Full blown vector processors with good compilers are capable of combining multiple loop iterations containing complex control flow statements in a similar manner as *CUDA*.
-
-Due to the synchronicity of threads within a warp, memory operations will always occur at the same time in all threads.
-This allows memory accesses to be coalesced.
-Different *CUDA* devices support different levels of memory coalescing.
-Older ones only supported combining multiple memory accesses if they were aligned and sequential in the order of thread indices.
-Newer ones support unaligned scattered accesses as long as they target the same 128 byte segment.
-
-The ability of very fast context switches between warps and a queue of ready warps allows *CUDA* capable GPUs to hide the latency of global memory operations.
diff --git a/thirdParty/cupla/alpaka/doc/markdown/user/abstraction/block/block.png b/thirdParty/cupla/alpaka/doc/markdown/user/abstraction/block/block.png
deleted file mode 100644
index d7fa27b46b..0000000000
Binary files a/thirdParty/cupla/alpaka/doc/markdown/user/abstraction/block/block.png and /dev/null differ
diff --git a/thirdParty/cupla/alpaka/doc/markdown/user/abstraction/block/block_scale.png b/thirdParty/cupla/alpaka/doc/markdown/user/abstraction/block/block_scale.png
deleted file mode 100644
index 826d963653..0000000000
Binary files a/thirdParty/cupla/alpaka/doc/markdown/user/abstraction/block/block_scale.png and /dev/null differ
diff --git a/thirdParty/cupla/alpaka/doc/markdown/user/abstraction/element/element.png b/thirdParty/cupla/alpaka/doc/markdown/user/abstraction/element/element.png
deleted file mode 100644
index f36e08da26..0000000000
Binary files a/thirdParty/cupla/alpaka/doc/markdown/user/abstraction/element/element.png and /dev/null differ
diff --git a/thirdParty/cupla/alpaka/doc/markdown/user/abstraction/thread/thread.png b/thirdParty/cupla/alpaka/doc/markdown/user/abstraction/thread/thread.png
deleted file mode 100644
index 253bacb1ef..0000000000
Binary files a/thirdParty/cupla/alpaka/doc/markdown/user/abstraction/thread/thread.png and /dev/null differ
diff --git a/thirdParty/cupla/alpaka/doc/markdown/user/abstraction/warp/warp.png b/thirdParty/cupla/alpaka/doc/markdown/user/abstraction/warp/warp.png
deleted file mode 100644
index 1ba31c40c8..0000000000
Binary files a/thirdParty/cupla/alpaka/doc/markdown/user/abstraction/warp/warp.png and /dev/null differ
diff --git a/thirdParty/cupla/alpaka/doc/markdown/user/implementation/Library.md b/thirdParty/cupla/alpaka/doc/markdown/user/implementation/Library.md
deleted file mode 100644
index 3a0c042164..0000000000
--- a/thirdParty/cupla/alpaka/doc/markdown/user/implementation/Library.md
+++ /dev/null
@@ -1,16 +0,0 @@
-[:arrow_up: Up](../Implementation.md)
-
-Library Interface
-=================
-
-As described in the chapter about the Abstraction, the general design of the library is very similar to *CUDA* and *OpenCL* but extends both by some points, while not requiring any language extensions.
-General interface design as well as interface implementation decisions differentiating *alpaka* from those libraries are described in the Rationale section.
-It uses C++ because it is one of the most performant languages available on nearly all systems.
-Furthermore, C++11 allows describing the concepts in a very abstract way that is not possible with many other languages.
-The *alpaka* library extensively makes use of advanced functional C++ template meta-programming techniques.
-The Implementation Details section discusses the C++ library and the way it provides extensibility and optimizability.
-
-1. [Structure](library/Structure.md)
-2. [Usage](library/Usage.md)
-3. [Rationale](library/Rationale.md)
-4. [Details](library/Details.md)
diff --git a/thirdParty/cupla/alpaka/doc/markdown/user/implementation/Mapping.md b/thirdParty/cupla/alpaka/doc/markdown/user/implementation/Mapping.md
deleted file mode 100644
index 70a28fb6b5..0000000000
--- a/thirdParty/cupla/alpaka/doc/markdown/user/implementation/Mapping.md
+++ /dev/null
@@ -1,24 +0,0 @@
-[:arrow_up: Up](../Implementation.md)
-
-Mapping onto Specific Hardware Architectures
-============================================
-
-By providing an accelerator independent interface for kernels, their execution and memory accesses at different hierarchy levels, *alpaka* allows the user to write accelerator independent code that does not neglect performance.
-
-The mapping of the decomposition to the execution environment is handled by the back-ends provided by the *alpaka* library as well as user defined back-ends.
-A computation that is described with a maximum of the parallelism available in the *redundant hierarchical parallelism* abstraction can not be mapped one to one to any existing hardware.
-GPUs do not have vector registers for `float` or `double` types.
-Therefore, the element level is often omitted on *CUDA* accelerators.
-CPUs in turn are not (currently) capable of running thousands of threads concurrently and do not have equivalently fast inter-thread synchronization and shared memory access as GPUs do.
-
-A major point of the *redundant hierarchical parallelism* abstraction is to ignore specific unsupported levels and utilize only the ones supported on a specific accelerator.
-This allows a mapping to various current and future accelerators in a variety of ways enabling optimal usage of the underlying compute and memory capabilities.
-
-The grid level is always mapped to the whole device being in consideration.
-The scheduler can always execute multiple kernel grids from multiple queues in parallel by statically or dynamically subdividing the available resources.
-However, this will only ever simplify the mapping due to fewer available processing units.
-Furthermore, being restricted to fewer resources automatically improves the locality of data due to spatial and temporal locality properties of the caching hierarchy.
-
-1. [CUDA GPUs](mapping/CUDA.md)
-2. [x86 CPUs](mapping/x86.md)
-3. [Accelerators](mapping/Accelerators.md)
diff --git a/thirdParty/cupla/alpaka/doc/markdown/user/implementation/library/Details.md b/thirdParty/cupla/alpaka/doc/markdown/user/implementation/library/Details.md
deleted file mode 100644
index 0f76843013..0000000000
--- a/thirdParty/cupla/alpaka/doc/markdown/user/implementation/library/Details.md
+++ /dev/null
@@ -1,242 +0,0 @@
-[:arrow_up: Up](../Library.md)
-
-Details
-=======
-
-![Overview of the structure of the *alpaka* library with concepts and implementations.](structure.png)
-
-The full stack of concepts defined by the *alpaka* library and their inheritance hierarchy is shown in the third column of the preceding figure.
-Default implementations for those concepts can be seen in the blueish columns.
-The various accelerator implementations, shown in the lower half of the figure, only differ in some of their underlying concepts but can share most of the base implementations.
-The default implementations can, but do not have to be used at all.
-They can be replaced by user code in arbitrary granularity.
-By substituting, for instance, the atomic operation implementation of an accelerator, the execution can be fine-tuned, to better utilize the hardware instruction set of a specific processor.
-However, also complete accelerators, devices and all of the other concepts can be implemented by the user without the need to change any part of the *alpaka* library itself.
-The way this and other things are implemented is explained in the following paragraphs.
-
-Concept Implementations
------------------------
-
-The *alpaka* library has been implemented with extensibility in mind.
-This means that there are no predefined classes, modeling the concepts, the *alpaka* functions require as input parameters.
-They allow arbitrary types as parameters, as long as they model the required concept.
-
-C++ provides a language inherent object oriented abstraction allowing to check that parameters to a function comply with the concept they are required to model.
-By defining interface classes, which model the *alpaka* concepts, the user would be able to inherit his extension classes from the interfaces he wants to model and implement the abstract virtual methods the interfaces define.
-The *alpaka* functions in turn would use the corresponding interface types as their parameter types.
-For example, the `Buffer` concept requires methods for getting the pitch or changing the memory pinning state.
-With this intrusive object oriented design pattern the `BufCpu` or `BufCudaRt` classes would have to inherit from an `IBuffer` interface and implement the abstract methods it declares.
-An example of this basic pattern is shown in the following source snippet:
-
-```C++
-struct IBuffer
-{
-	virtual std::size_t getPitch() const = 0;
-	virtual void pin() = 0;
-	virtual void unpin() = 0;
-	...
-};
-
-struct BufCpu : public IBuffer
-{
-	virtual std::size_t getPitch() const override { ... }
-	virtual void pin() override { ... }
-	virtual void unpin() override { ... }
-	...
-};
-	
-ALPAKA_FN_HOST auto copy(
-	IBuffer & dst,
-	IBuffer const & src)
--> void
-{
-	...
-}
-```
-
-The compiler can then check at compile time that the objects the user wants to use as function parameters can be implicitly converted to the interface type, which is the case for all classes derived from that interface.
-The compiler returns an error message on a type mismatch.
-However, if the *alpaka* library were using those language inherent object oriented abstractions, the extensibility and optimizability it promises would not be possible.
-Classes and run-time polymorphism require the implementer of extensions to intrusively inherit from predefined interfaces and override special virtual functions.
-
-This is feasible for user defined classes or types where the source code is available and where it can be changed.
-The `std::vector` class template on the other hand would not be able to model the `Buffer` concept because we can not change its definition to inherit from the `IBuffer` interface class since it is part of the standard library.
-The standard inheritance based object orientation of C++ only works well when all the code it is to interoperate with can be changed to implement the interfaces.
-It does not enable interaction with unalterable or existing code that is too complex to change, which is the reality in the majority of software projects.
-
-Another option to implement an extensible library is to follow the way the C++ standard library uses.
-It allows to specialize function templates for user types to model concepts without altering the types themselves.
-For example, the `std::begin` and `std::end` free function templates can be specialized for user defined types.
-With those functions specialized, the C++11 range-based for loops (`for(auto & i : userContainer){...}`, see *C++ Standard 6.5.4/1*) can be used with user defined types.
-Equally, specializations of `std::swap` and other standard library function templates can be defined to extend those with support for user types.
-One problem with function specialization is that only full specializations are allowed.
-A partial function template specialization is not allowed by the standard.
-Another problem can emerge due to users carelessly overloading the template functions instead of specializing them.
-Mixing function overloading and function template specialization on the same base template function can result in unexpected results.
-The reasons and effects of this are described more closely in an article from H. Sutter (currently convener of the ISO C++ committee) called *Sutter's Mill: Why Not Specialize Function Templates?* in the *C/C++ Users Journal* in July 2001.
-<!--- NOTE: different way: http://ericniebler.com/2014/10/21/customization-point-design-in-c11-and-beyond/ -->
-
-The solution given in the article is to provide *"a single function template that should never be specialized or overloaded"*.
-This function simply forwards its arguments *"to a class template containing a static function with the same signature"*.
-This template class can fully or partially be specialized without affecting overload resolution.
-
-The way the *alpaka* library implements this is by not using the C++ inherent object orientation but lifting those abstractions to a higher level.
-Instead of using a non-extensible `class`/`struct` for defining the interface, a namespace is utilized.
-In place of abstract virtual member functions of the interface, *alpaka* defines free functions within those namespaces.
-All those functions are templates allowing the user to call them with arbitrary self defined types and not only those inheriting from a special interface type.
-Unlike member functions, they have no implicit `this` pointer, so the object instance has to be explicitly given as a parameter.
-Overriding the abstract virtual interface methods is replaced by the specialization of a template type that is defined for each such namespace function.
-
-A concept is completely implemented by specializing the predefined template types.
-This allows to extend and fine-tune the implementation non-intrusively.
-For example, the corresponding pitch and memory pinning template types can be specialized for `std::vector`.
-After doing this, the `std::vector` can be used everywhere a buffer is accepted as argument throughout the whole *alpaka* library without ever touching its definition.
-
-A simple function allowing arbitrary tasks to be enqueued into a queue can be implemented in the way shown in the following code.
-The `TSfinae` template parameter will be explained in a [following section](#template-specialization-selection-on-arbitrary-conditions).
-
-```C++
-namespace queue
-{
-	template<
-		typename TQueue,
-		typename TTask,
-		typename TSfinae = void>
-	struct Enqueue;
-		
-	template<
-		typename TQueue,
-		typename TTask>
-	ALPAKA_FN_HOST auto enqueue(
-		TQueue & queue,
-		TTask & task)
-	-> void
-	{
-		Enqueue<
-			TQueue,
-			TTask>
-		::enqueue(
-			queue,
-			task);
-	}
-}
-```
-
-A user who wants his queue type to be used with this `enqueue` function has to specialize the `Enqueue` template struct.
-This can be either done partially by only replacing the `TQueue` template parameter and accepting arbitrary tasks or by fully specializing and replacing both `TQueue` and `TTask`. This gives the user complete freedom of choice.
-The example given in the following code shows this by specializing the `Enqueue` type for a user queue type `UserQueue` and arbitrary tasks.
-
-```C++
-struct UserQueue{};
-
-namespace queue
-{
-	// partial specialization
-	template<
-		typename TTask>
-	struct Enqueue<
-		UserQueue,
-		TTask>
-	{
-		ALPAKA_FN_HOST static auto enqueue(
-			UserQueue & queue,
-			TTask & task)
-		-> void
-		{
-			//...
-		}
-	};
-}
-```
-
-In addition the subsequent code shows a full specialization of the `Enqueue` type for a given `UserQueue` and a `UserTask`.
-
-```C++
-struct UserQueue{};
-struct UserTask{};
-
-namespace queue
-{
-	// full specialization
-	template<>
-	struct Enqueue<
-		UserQueue,
-		UserTask>
-	{
-		ALPAKA_FN_HOST static auto enqueue(
-			UserQueue & queue,
-			UserTask & task)
-		-> void
-		{
-			//...
-		}
-	};
-}
-```
-
-When the `enqueue` function template is called with an instance of `UserQueue`, the most specialized version of the `Enqueue` template is selected depending on the type of the task `TTask` it is called with.
-
-A type can model the queue concept completely by defining specializations for `alpaka::queue::Enqueue` and `alpaka::queue::Empty`.
-This functionality can be accessed by the corresponding `alpaka::queue::enqueue` and `alpaka::queue::empty` template functions.
-
-Currently there is no native language support for describing and checking concepts in C++ at compile time.
-A study group (SG8) is working on the ISO [specification for concepts](http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2015/n4377.pdf) and compiler forks implementing them do exist.
-For usage in current C++ there are libraries like [*Boost.ConceptCheck*](http://www.boost.org/doc/libs/1_58_0/libs/concept_check/concept_check.htm) which try to emulate requirement checking of concept types.
-Those libraries often exploit the preprocessor and require non-trivial changes to the function declaration syntax.
-Therefore the *alpaka* library does not currently make use of *Boost.ConceptCheck*.
-Neither does it make use of the proposed concept specification due to its dependency on non-standard compilers.
-
-The usage of concepts as described in the working draft would often dramatically enhance the compiler error messages in case of violation of concept requirements.
-Currently the error messages are pointing deeply inside the stack of library template invocations where the missing method or the like is called.
-Instead of this, with concept checking it would directly fail at the point of invocation of the outermost template function with an expressive error message about the parameter and its violation of the concept requirements.
-This would simplify especially the work with extendable template libraries like *Boost* or *alpaka*.
-However, in the way concept checking would be used in the *alpaka* library, omitting it does not change the semantic of the program, only the compile time error diagnostics.
-In the future when the standard incorporates concept checking and the major compilers support it, it will be added to the *alpaka* library.
-
-
-Template Specialization Selection on Arbitrary Conditions
----------------------------------------------------------
-
-Basic template specialization only allows for a selection of the most specialized version where all explicitly stated types have to be matched identically.
-It is not possible to enable or disable a specialization based on arbitrary compile time expressions depending on the parameter types.
-To allow such conditions, *alpaka* adds a defaulted and unused `TSfinae` template parameter to all declarations of the implementation template structs.
-This was shown using the example of the `Enqueue` template type.
-The C++ technique called SFINAE, an acronym for *Substitution failure is not an error*, allows disabling arbitrary specializations depending on compile time conditions.
-Specializations where the substitution of the parameter types by the deduced types would result in invalid code will not result in a compile error, but will simply be omitted.
-An example in the context of the `Enqueue` template type is shown in the following code.
-
-```C++
-struct UserQueue{};
-
-namespace queue
-{
-	template<
-		typename TQueue,
-		typename TTask>
-	struct Enqueue<
-		TQueue,
-		TTask,
-		typename std::enable_if<
-			std::is_base_of<UserQueue, TQueue>::value
-			&& (TTask::TaskId == 1u)
-		>::type>
-	{
-		ALPAKA_FN_HOST static auto enqueue(
-			TQueue & queue,
-			TTask & task)
-		-> void
-		{
-			//...
-		}
-	};
-}
-```
-
-The `Enqueue` specialization shown here does not require any direct type match for the `TQueue` or the `TTask` template parameter.
-It will be used in all contexts where `TQueue` has inherited from `UserQueue` and where the `TTask` has a static const integral member value `TaskId` that equals one.
-If the `TTask` type does not have a `TaskId` member, this code would be invalid and the substitution would fail.
-However, due to SFINAE, this would not result in a compiler error but rather only in omitting this specialization.
-The `std::enable_if` template results in a valid expression, if the condition it contains evaluates to true, and an invalid expression if it is false.
-Therefore it can be used to disable specializations depending on arbitrary boolean conditions.
-It is utilized in the case where the `TaskId` member is not equal to one or the `TQueue` does not inherit from `UserQueue`.
-In these circumstances, the condition itself results in valid code but because it evaluates to false, the `std::enable_if` specialization results in invalid code and the whole `Enqueue` template specialization gets omitted.
diff --git a/thirdParty/cupla/alpaka/doc/markdown/user/implementation/library/Rationale.md b/thirdParty/cupla/alpaka/doc/markdown/user/implementation/library/Rationale.md
deleted file mode 100644
index c816a5c0a2..0000000000
--- a/thirdParty/cupla/alpaka/doc/markdown/user/implementation/library/Rationale.md
+++ /dev/null
@@ -1,278 +0,0 @@
-[:arrow_up: Up](../Library.md)
-
-Rationale
-=========
-
-Interface Distinction
---------------------
-
-The *alpaka* library is different from other similar libraries (especially *CUDA*) in that it refrains from using implicit or hidden state.
-This and other interface design decisions will be explained in the following paragraphs.
-
-### No Current Device:
-The *CUDA* runtime API for example supplies a current device for each user code kernel-thread.
-Working with multiple devices requires calling `cudaSetDevice` to change the current device whenever an operation should be executed on a non-current device.
-Even the functions for creating a queue (`cudaStreamCreate`) or an event (`cudaEventCreate`) use the current device without any way to create them on a non current device.
-In the case of an event this dependency is not obvious, since at the same time queues can wait for events from multiple devices allowing cross-device synchronization without any additional work.
-So conceptually an event could also have been implemented device independently.
-This can lead to hard to track down bugs due to the non-explicit dependencies, especially in multi-threaded code using multiple devices.
-
-### No Default Device:
-In contrast to the *CUDA* runtime API *alpaka* does not provide a device by default per kernel-thread.
-Especially in combination with *OpenMP* parallelized host code this keeps users from surprises.
-The following code snippet shows that it does not necessarily do what one would expect.
-
-```C++
-cudaSetDevice(1);
-
-#pragma omp parallel for
-for(int i = 0; i<10; ++i)
-{
-    kernel<<<blocks,threads>>>(i);
-}
-```
-
-Depending on what the *CUDA* runtime API selects as default device for each of the *OpenMP* threads (due to each of them having its own current device), not all of the kernels will necessarily run on device one.
-
-In the *alpaka* library all such dependencies are made explicit.
-All functions depending on a device require it to be given as a parameter.
-The *alpaka* *CUDA* back-end checks before forwarding the calls to the *CUDA* runtime API whether the current device matches the given one and changes it if required.
-The *alpaka* *CUDA* back-end does not reset the current device to the one prior to the method invocation out of performance considerations.
-This has to be considered when native *CUDA* code is combined with *alpaka* code.
-
-### No Default Queue:
-*CUDA* allows to execute commands without specifying a queue.
-The default queue that is used synchronizes implicitly with all other queues on the device.
-If a command is issued to the default queue, all other asynchronous queues have to wait before executing any new commands, even when they have been enqueued much earlier.
-This can introduce hard to track down performance issues.
-As of *CUDA* 7.0 the default queue can be converted to a non synchronizing queue with a compiler option.
-Because concurrency is crucial for performance and users should think about the dependencies between their commands from the very beginning, *alpaka* does not provide such a default queue.
-All asynchronous operations (kernel launches, memory copies and memory sets) require a queue to be executed in.
-
-### No Implicit Built-in Variables and Functions:
-Within *CUDA* device functions (functions annotated with `__global__` or `__device__`) built-in functions (`__syncthreads`, `__threadfence`, `atomicAdd`, ... ) and variables (`gridDim`, `blockIdx`, `blockDim`, `threadIdx`, `warpSize`, ...) are provided.
-
-It would have been possible to emulate those implicit definitions by forcing the kernel function object to inherit from a class providing these functions and members.
-However functions outside the kernel function object would then pose a problem.
-They do not have access to the functions and members that the function object has inherited.
-To circumvent this, the functions and members would have to be public, the inheritance would have to be public and a reference to the currently executing function object would have to be passed as parameter to external functions.
-This would have been too cumbersome and inconsistent.
-Therefore access to the accelerator is given to the user kernel function object via one special input parameter representing the accelerator.
-After that this accelerator object can simply be passed to other functions.
-The built-in variables can be accessed by the user via query functions on this accelerator.
-
-*Abandoning all the implicit and default state makes it much easier for users of the library to reason about their code.*
-
-### No Language Extensions:
-Unlike *CUDA*, the *alpaka* library does not extend the C++ language with any additional variable qualifiers (`__shared__`, `__constant__`, `__device__`) defining the memory space.
-Instead of those qualifiers, *alpaka* provides accelerator functions to allocate memory in the different memory spaces.
-
-### No Dimensionality Restriction:
-*CUDA* always uses three-dimensional indices and extents, even though the task may only be one or two dimensional.
-*OpenCL* on the other hand allows grid and block dimensions in the range [1,3] but does not provide corresponding n-dimensional indices, but rather provides functions like `get_global_id` or `get_local_id`, which require the dimension in which the one-dimensional ID is to be queried as a parameter.
-By itself this is no problem, but how can it be assured at compile time that a two-dimensional kernel is called with grid and block extents of the correct dimensionality?
-How can it be assured that a kernel which only uses `threadIdx.x` or equivalently calls `get_global_id(0)` will not get called with two dimensional grid and block extents?
-Because the result in such a case is undefined, and most of the time not wanted by the kernel author, this should be easy to check and reject at compile-time.
-In *alpaka* all accelerators are templatized on the dimensionality.
-This allows a two-dimensional image filter to assert that it is only called with a two dimensional accelerator.
-Thereby the algorithms can check for supported dimensionality of the accelerator at compile time instead of runtime.
-Furthermore with the dimension being a template parameter, the CPU back-end implementations are able to use only the number of nested loops really necessary instead of the 6 loops (2 x 3 loops for grid blocks and block threads), which are mandatory to emulate the *CUDA* threaded blocking scheme.
-
-*By hiding all the accelerator functionality inside of the accelerator object that is passed to the user kernel, the user of the *alpaka* library is not faced with any non-standard C++ extensions.
-Nevertheless the *CUDA* back-end internally uses those language extensions.*
-
-### Integral Sizes of Arbitrary Type:
-The type of sizes such as extents, indices and related variables depends on a template parameter of the accelerator and connected classes.
-This allows the kernel to be executed with sizes of arbitrary ranges.
-Thereby it is possible to force the accelerator back-ends to perform all internal index, extent and other integral size depending computations with a given precision.
-This is especially useful on current *NVIDIA* GPUs.
-Even though they support 64-bit integral operations, they are emulated with multiple 32-bit operations.
-This can be a huge performance penalty when the sizes of buffers, offsets, indices and other integral variables holding sizes are known to be limited.
-
-### No synchronous (blocking) and asynchronous (non-blocking) function versions:
-*CUDA* provides two versions of many of the runtime functions, for example, `cudaMemcpyAsync` and `cudaMemcpy`.
-The asynchronous version requires a queue while the synchronous version does not need a queue parameter.
-The asynchronous version immediately returns control back to the caller while the task is enqueued into the given queue and executed later in parallel to the host code.
-The synchronous version waits for the task to finish before the function call returns control to the caller.
-Inconsistently, all kernels in a *CUDA* program can only be started either asynchronously by default or synchronously if `CUDA_LAUNCH_BLOCKING` is defined.
-There is no way to specify this on a per kernel basis.
-To switch a whole application from asynchronous to synchronous calls, for example for debugging reasons, it is necessary to change the names of all the runtime functions being called as well as their parameters.
-In *alpaka* this is solved by always enqueuing all tasks into a queue and not defining a default queue.
-Non-blocking queues as well as blocking queues are provided for all devices.
-Changes to the synchronicity of multiple tasks can be made on a per queue basis by changing the queue type at the place of creation.
-There is no need to change any line of calling code.
-
-### Memory Management
-Memory buffers cannot be identified solely by the pointer to their first byte.
-The C++ `new` and `malloc`, the *CUDA* `cudaMalloc` as well as the *OpenCL* `clCreateBuffer` functions all return a plain pointer.
-This is not enough when working with multiple accelerators and multiple devices.
-To know where a specific pointer was allocated, additional information has to be stored to uniquely identify a memory buffer on a specific device.
-Memory copies between multiple buffers additionally require the buffer extents and pitches to be known.
-Many APIs, for example *CUDA*, require the user to store this information externally.
-To unify the usage, *alpaka* stores all the necessary information in a memory buffer object.
-
-Acceleratable Functions
------------------------
-
-Many parallelization libraries / frameworks do not fully support the separation of the parallelization strategy from the algorithm itself.
-*OpenMP*, for example, fully mixes the per thread algorithm and the parallelization strategy.
-This can be seen in the source listing showing a simple AXPY computation with OpenMP.
-
-```C++
-template<
-    typename TIdx,
-    typename TElem>
-void axpyOpenMP(
-    TIdx const n,
-    TElem const alpha,
-    TElem const * const X,
-    TElem * const Y)
-{
-    #pragma omp parallel for
-    for (TIdx i=0; i<n; i++)
-    {
-        Y[i] = alpha * X[i] + Y[i];
-    }
-}
-```
-
-Only one line of the function body, line 13, is the algorithm itself, while all surrounding lines represent the parallelization strategy.
-In *OpenACC* the parallelization and the algorithm are similarly combined.
-
-*CUDA*, *OpenCL* and other libraries allow, at least to some degree, to separate the algorithm from the parallelization strategy.
-They define the concept of a kernel representing the algorithm itself which is then parallelized depending on the underlying hardware.
-The AXPY *CUDA* kernel source code shown in the following listing consists only of the code of one single iteration.
-
-```C++
-template<
-    typename TIdx,
-    typename TElem>
-__global__ void axpyCUDA(
-    TIdx const n,
-    TElem const alpha,
-    TElem const * const X,
-    TElem * const Y)
-{
-    TIdx const i(blockIdx.x*blockDim.x + threadIdx.x);
-    if(i < n)
-    {
-        Y[i] = alpha * X[i] + Y[i];
-    }
-}
-```
-
-On the other hand the *CUDA* implementation is bloated with code handling the inherent blocking scheme.
-Even if the algorithm does not utilize blocking, as it is the case here, the algorithm writer has to calculate the global index of the current thread by hand (line 10).
-Furthermore, to support vectors larger than the predefined maximum number of threads per block (1024 for current *CUDA* devices), multiple blocks have to be used.
-When the number of blocks does not divide the number of vector elements, it has to be assured that the threads responsible for the vector elements behind the given length, do not access the memory to prevent a possible memory access error.
-
-By using the kernel concept, the parallelization strategy, whether all elements are executed in sequential order, in parallel or blocked is not hard coded into the algorithm itself.
-The possibly multidimensional nested loops do not have to be written by the user.
-For example, six loops would be required to emulate the *CUDA* execution pattern with a grid of blocks consisting of threads.
-
-Furthermore the kernel concept breaks the algorithm down to the per element level.
-Recombining multiple kernel iterations to loop over lines, columns, blocks or any other structure is always possible by changing the calling code and does not require a change of the kernel.
-In contrast, by using *OpenMP* this would not be possible.
-Therefore the *alpaka* interface builds on the kernel concept, being the body of the corresponding standard for loop executed in each thread.
-
-### Execution Domain Specifications
-
-*CUDA* requires the user to annotate its functions with execution domain specifications.
-Functions that can only be executed on the GPU have to be annotated with `__device__`, functions that can be executed on the host and on the GPU have to be annotated with `__host__ __device__` and host only functions can optionally be annotated with `__host__`.
-The nvcc *CUDA* compiler uses these annotations to decide with which back-ends a function has to be compiled.
-Depending on the compiler in use, *alpaka* defines the macros  `ALPAKA_FN_HOST`, `ALPAKA_FN_ACC` and `ALPAKA_FN_HOST_ACC` with the identical meaning which can be used in the same positions.
-When the *CUDA* compiler is used, they are defined to their *CUDA* equivalents, else they are empty.
-
-### Kernel Function
-
-#### Requirements
-
-- User kernels should be implemented independent of the accelerator.
-- A user kernel has to have access to accelerator methods (synchronization within blocks, index retrieval, ...).
-- For usage with CUDA, the kernel methods have to be attributed with \__device\__ \__host\__.
-- The user kernel has to fulfill std::is_trivially_copyable because only such objects can be copied into CUDA device memory.
-  A trivially copyable class is a class that
-   1. Has no non-trivial copy constructors (this also requires no virtual functions or virtual bases)
-   2. Has no non-trivial move constructors
-   3. Has no non-trivial copy assignment operators
-   4. Has no non-trivial move assignment operators
-   5. Has a trivial destructor
-
-#### Implementation Variants
-
-There are two possible ways to tell the kernel about the accelerator type:
- 1. The kernel is templated on the accelerator type ...
-  * + This allows users to specialize them for different accelerators. (Is this really necessary or desired?)
-  * - The kernel has to be a class template. This does not allow C++ lambdas to be used as kernels because they are no templates themselves (but only their `operator()` can be templated in C++14).
-  * - This prevents the user from instantiating an accelerator independent kernel before executing it.
-  Because the memory layout in inheritance hierarchies is undefined a simple copy of the user kernel or its members to its specialized type is not possible platform independently.
-  This would require a copy from UserKernel<TDummyAcc> to UserKernel<TAcc> to be possible.
-  The only way to allow this would be to require the user to implement a templated copy constructor for every kernel.
-  This is not allowed for kernels that should be copyable to a CUDA device because std::is_trivially_copyable requires the kernel to have no non-trivial copy constructors.
-  * a) ... and inherits from the accelerator. 
-    * - The kernel itself has to inherit at least protected from the accelerator to allow the KernelExecutor to access the Accelerator.
-    * - How do accelerator functions called from the kernel (and not within the kernel class itself) access the accelerator methods?
-    Casting this to the accelerator type and giving it as parameter is too much to require from the user.
-  * b) ... and the `operator()` has a reference to the accelerator as parameter.
-    * + This allows to use the accelerator in functions called from the kernel (and not within the kernel class itself) to access the accelerator methods in the same way the kernel entry point function can.
-    * - This would require an additional object (the accelerator) in device memory taking up valuable CUDA registers (opposed to the inheritance solution). At least on CUDA all the accelerator functions could be inlined nevertheless.
- 2. The `operator()` is templated on the accelerator type and has a reference to the accelerator as parameter.
-  * + The kernel can be an arbitrary function object with ALPAKA_FN_HOST_ACC attributes.
-  * + This would allow to instantiate the accelerator independent kernel and set its members before execution.
-  * +/- C++14 provides polymorphic lambdas. All compilers (even MSVC) support this.
-  * - The `operator()` could be overloaded on the accelerator type but there is no way to specialize the whole kernel class itself, so it always has the same members.
-  * - This would require an additional object (the accelerator) in device memory taking up valuable CUDA registers (opposed to the inheritance solution). At least on CUDA all the accelerator functions could be inlined nevertheless.
-
-Currently we implement version 2.
-
-
-#### Implementation Notes
-
-Unlike *CUDA*, the *alpaka* library does not differentiate between the kernel function that represents the entry point and other functions that can be executed on the accelerator.
-The entry point function that has to be annotated with `__global__` in *CUDA* is internal to the *alpaka* *CUDA* back-end and is not exposed to the user.
-It directly calls into the user supplied kernel function object whose invocation operator is declared with `ALPAKA_FN_ACC`, which equals `__device__` in *CUDA*.
-In this respect there is no difference between the kernel entry point function and any other accelerator function in *alpaka*.
-
-The `operator()` of the kernel function object has to be `const`.
-This is especially important for the *CUDA* back-end, as it could possibly use the constant memory of the GPU to store the function object.
-The constant memory is a fast, cached, read-only memory that is beneficial when all threads uniformly read from the same address at the same time.
-In this case it is as fast as a read from a register.
-
-
-### Access to accelerator dependent functionality
-
-There are two possible ways to implement access to accelerator dependent functionality inside a kernel:
-* Making the functions/templates members of the accelerator (maybe by inheritance) and calling them like `acc.syncThreads()` or `acc.template getIdx<Grid, Thread, Dim1>()`.
-This would require the user to know and understand when to use the template keyword inside dependent type  object function calls.
-* The functions are only light wrappers around traits that can be specialized taking the accelerator as first value (it can not be the last value because of the potential use of variadic arguments). 
-The resulting code would look like `sync(acc)` or `getIdx<Grid, Thread, Dim1>(acc)`.
-Internally these wrappers would call trait templates that are specialized for the specific accelerator, e.g. `template<typename TAcc> struct Sync{...};`
-
-The second version is easier to understand and usually shorter to use in user code.
-
-
-Index and Work Division
------------------------
-
-*CUDA* requires the user to calculate the global index of the current thread within the grid by hand (already shown as `axpyCUDA`).
-On the contrary, *OpenCL* provides the methods `get_global_size`, `get_global_id`, `get_local_size` and `get_local_id`.
-Called with the required dimension, they return the corresponding local or global index or extent (size).
-In *alpaka* this idea is extended to all dimensions.
-To unify the method interface and to avoid confusion between the differing terms and meanings of the functions in *OpenCL* and *CUDA*, in *alpaka* these methods are template functions.
-
-
-Block Shared Memory
--------------------
- 
-### Static Block Shared Memory
-
-The size of block shared memory that is allocated inside the kernel is required to be given as compile time constant.
-This is due to CUDA not allowing to allocate block shared memory inside a kernel at runtime.
- 
-### Dynamic Block Shared Memory
-
-The size of the external block shared memory is obtained from a trait that can be specialized for each kernel.
-The trait is called with the current kernel invocation parameters and the block-element extent prior to each kernel execution.
-Because the block shared memory size is only ever constant or dependent on the block-element extent or the parameters of the invocation this has multiple advantages:
-* It forces the separation of the kernel invocation from the calculation of the required block shared memory size.
-* It lets the user write this calculation once instead of multiple times spread across the code.
diff --git a/thirdParty/cupla/alpaka/doc/markdown/user/implementation/library/Structure.md b/thirdParty/cupla/alpaka/doc/markdown/user/implementation/library/Structure.md
deleted file mode 100644
index e0a2c88b2d..0000000000
--- a/thirdParty/cupla/alpaka/doc/markdown/user/implementation/library/Structure.md
+++ /dev/null
@@ -1,35 +0,0 @@
-[:arrow_up: Up](../Library.md)
-
-Structure
-=========
-
-The *alpaka* library allows offloading of computations from the host execution domain to the accelerator execution domain, whereby they are allowed to be identical.
-
-In the abstraction hierarchy the library code is interleaved with user supplied code as is depicted in the following figure.
-![Execution Domains](execution_domain.png)
-User code invokes library functions, which in turn execute the user provided thread function (kernel) in parallel on the accelerator.
-The kernel in turn calls library functions when accessing accelerator properties and methods.
-Additionally, the user can enhance or optimize the library implementations by extending or replacing specific parts.
-
-The *alpaka* abstraction itself only defines requirements a type has to fulfill to be usable with the template functions the library provides.
-These type constraints are called concepts in C++.
-
-*A concept is a set of requirements consisting of valid expressions, associated types, invariants, and complexity guarantees.
-A type that satisfies the requirements is said to model the concept.
-A concept can extend the requirements of another concept, which is called refinement.* [BoostConcepts](http://www.boost.org/community/generic_programming.html)
-
-Concepts allow to safely define polymorphic algorithms that work with objects of many different types.
-
-The *alpaka* library implements a stack of concepts and their interactions modeling the abstraction defined in the previous chapter.
-Furthermore, default implementations for various devices and accelerators modeling those are included in the library.
-The interaction of the main user facing concepts can be seen in the following figure.
-![user / alpaka code interaction](structure_assoc.png)
-
-For each type of `Device` there is a `Platform` for enumerating the available `Device`s.
-A `Device` is the requirement for creating `Queues` and `Events` as it is for allocating `Buffers` on the respective `Device`. `Buffers` can be copied, their memory be set and they can be pinned or mapped.
-Copying and setting a buffer requires the corresponding `Copy` and `Set` tasks to be enqueued into the `Queue`.
-An `Event` can be enqueued into a `Queue` and its completion state can be queried by the user.
-It is possible to wait for (synchronize with) a single `Event`, a `Queue` or a whole `Device`.
-An `Executor` can be enqueued into a `Queue` and will execute the `Kernel` (after all previous tasks in the queue have been completed).
-The `Kernel` in turn has access to the `Accelerator` it is running on.
-The `Accelerator` provides the `Kernel` with its current index in the block or grid, their extents or other data, and it allows the `Kernel` to allocate shared memory, execute atomic operations and much more.
diff --git a/thirdParty/cupla/alpaka/doc/markdown/user/implementation/library/Usage.md b/thirdParty/cupla/alpaka/doc/markdown/user/implementation/library/Usage.md
deleted file mode 100644
index 52382c47e5..0000000000
--- a/thirdParty/cupla/alpaka/doc/markdown/user/implementation/library/Usage.md
+++ /dev/null
@@ -1,112 +0,0 @@
-[:arrow_up: Up](../Library.md)
-
-Interface Usage
-===============
-
-Accelerator Executable Functions
---------------------------------
-
-Functions that should be executable on an accelerator have to be annotated with the execution domain (one of `ALPAKA_FN_HOST`, `ALPAKA_FN_ACC` and `ALPAKA_FN_HOST_ACC`).
-They most probably also require access to the accelerator data and methods, such as indices and extents as well as functions to allocate shared memory and to synchronize all threads within a block. 
-Therefore the accelerator has to be passed in as a templated constant reference parameter as can be seen in the following code snippet.
-
-```C++
-template<
-    typename TAcc>
-ALPAKA_FN_ACC auto doSomethingOnAccelerator(
-    TAcc const & acc/*,
-    ...*/)                  // Arbitrary number of parameters
--> int                      // Arbitrary return type
-{
-    //...
-}
-```
-
-
-Kernel Definition
------------------
-
-A kernel is a special function object which has to conform to the following requirements:
-* it has to fulfill the `std::is_trivially_copyable` trait (has to be copyable via memcpy)
-* the `operator()` is the kernel entry point
-  * it has to be an accelerator executable function
-  * it has to return `void`.
-  * its first argument has to be the accelerator (templated for arbitrary accelerator backends).
-
-The following code snippet shows a basic example of a kernel function object.
-
-```C++
-struct MyKernel
-{
-    template<
-        typename TAcc>       // Templated on the accelerator type.
-    ALPAKA_FN_ACC            // Macro marking the function to be executable on all accelerators.
-    auto operator()(         // The function / kernel to execute.
-        TAcc const & acc/*,  // The specific accelerator implementation.
-        ...*/) const         // Must be 'const'.
-    -> void
-    {
-        //...
-    }
-                      // Class can have members but has to be std::is_trivially_copyable.
-                      // Classes must not have pointers or references to host memory!
-};
-```
-
-The kernel function object is shared across all threads in all blocks.
-Due to the block execution order being undefined, there is no safe and consistent way of altering state that is stored inside of the function object.
-Therefore, the `operator()` of the kernel function object has to be `const` and is not allowed to modify any of the object members.
-
-
-Index and Work Division
------------------------
-
-The `alpaka::workdiv::getWorkDiv` and the `alpaka::idx::getIdx` functions both return a vector of the dimensionality the accelerator has been defined with.
-They are parametrized by the origin of the calculation as well as the unit in which the values are calculated.
-For example, `alpaka::workdiv::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc)` returns a vector with the extents of the grid in units of threads.
-
-
-Memory Management
------------------
-
-The memory allocation function of the *alpaka* library (`alpaka::mem::buf::alloc<TElem>(device, extents)`) is uniform for all devices, even for the host device.
-It does not return raw pointers but reference counted memory buffer objects that remove the necessity for manual freeing and the possibility of memory leaks.
-Additionally the memory buffer objects know their extents, their pitches as well as the device they reside on.
-This allows buffers that possibly reside on different devices with different pitches to be copied only by providing the buffer objects as well as the extents of the region to copy (`alpaka::mem::view::copy(bufDevA, bufDevB, copyExtents)`).
-
-Kernel Execution
-----------------
-
-The following source code listing shows the execution of a kernel by enqueuing the execution task into a queue.
-
-```C++
-// Define the dimensionality of the task.
-using Dim = alpaka::dim::DimInt<1u>;
-// Define the type of the indexes.
-using Idx = std::size_t;
-// Define the accelerator to use.
-using Acc = alpaka::acc::AccCpuSerial<Dim, Idx>;
-// Select the queue type.
-using Queue = alpaka::queue::QueueCpuNonBlocking;
-
-// Select a device to execute on.
-auto devAcc(alpaka::pltf::getDevByIdx<alpaka::pltf::PltfCpu>(0));
-// Create a queue to enqueue the execution into.
-Queue queue(devAcc);
-
-// Create a 1-dimensional work division with 256 blocks of 16 threads each.
-auto const workDiv(alpaka::workdiv::WorkDivMembers<Dim, Idx>(256u, 16u));
-// Create an instance of the kernel function object.
-MyKernel kernel;
-// Enqueue the execution task into the queue.
-alpaka::kernel::exec<Acc>(queue, workDiv, kernel/*, arguments ...*/);
-```
-
-The dimensionality of the task as well as the type for index and extent have to be defined explicitly.
-Following this, the type of accelerator to execute on, as well as the type of the queue have to be defined.
-For both of these types instances have to be created.
-For the accelerator this has to be done indirectly by enumerating the required device via the device manager, whereas the queue can be created directly.
-
-To execute the kernel, an instance of the kernel function object has to be constructed.
-Following this, an execution task combining the work division (grid and block sizes) with the kernel function object and the bound invocation arguments has to be created.
-After that this task can be enqueued into a queue for immediate or later execution (depending on the queue used).
diff --git a/thirdParty/cupla/alpaka/doc/markdown/user/implementation/library/execution_domain.png b/thirdParty/cupla/alpaka/doc/markdown/user/implementation/library/execution_domain.png
deleted file mode 100644
index ac06bc1a74..0000000000
Binary files a/thirdParty/cupla/alpaka/doc/markdown/user/implementation/library/execution_domain.png and /dev/null differ
diff --git a/thirdParty/cupla/alpaka/doc/markdown/user/implementation/library/execution_domain.svg b/thirdParty/cupla/alpaka/doc/markdown/user/implementation/library/execution_domain.svg
deleted file mode 100644
index 702b0caffc..0000000000
--- a/thirdParty/cupla/alpaka/doc/markdown/user/implementation/library/execution_domain.svg
+++ /dev/null
@@ -1,216 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" width="278.653259pt" height="85.337189pt" viewBox="0 0 278.653259 85.337189" version="1.1">
-<defs>
-<g>
-<symbol overflow="visible" id="glyph0-0">
-<path style="stroke:none;" d="M 0.761719 0 L 0.761719 -7.441406 L 2.265625 -7.441406 L 2.265625 -4.511719 L 5.207031 -4.511719 L 5.207031 -7.441406 L 6.710938 -7.441406 L 6.710938 0 L 5.207031 0 L 5.207031 -3.253906 L 2.265625 -3.253906 L 2.265625 0 Z M 0.761719 0 "/>
-</symbol>
-<symbol overflow="visible" id="glyph0-1">
-<path style="stroke:none;" d="M 0.417969 -2.769531 C 0.414063 -3.242188 0.53125 -3.699219 0.765625 -4.144531 C 0.996094 -4.585938 1.328125 -4.925781 1.757813 -5.160156 C 2.1875 -5.390625 2.664063 -5.507813 3.191406 -5.511719 C 4.003906 -5.507813 4.671875 -5.242188 5.195313 -4.714844 C 5.714844 -4.183594 5.976563 -3.515625 5.980469 -2.710938 C 5.976563 -1.894531 5.714844 -1.21875 5.191406 -0.683594 C 4.664063 -0.148438 4 0.117188 3.203125 0.121094 C 2.707031 0.117188 2.234375 0.0078125 1.789063 -0.210938 C 1.335938 -0.433594 0.996094 -0.761719 0.765625 -1.195313 C 0.53125 -1.625 0.414063 -2.148438 0.417969 -2.769531 Z M 1.878906 -2.695313 C 1.875 -2.160156 2 -1.75 2.257813 -1.464844 C 2.507813 -1.179688 2.824219 -1.035156 3.199219 -1.039063 C 3.570313 -1.035156 3.878906 -1.179688 4.132813 -1.464844 C 4.382813 -1.75 4.511719 -2.164063 4.511719 -2.707031 C 4.511719 -3.230469 4.382813 -3.632813 4.132813 -3.921875 C 3.878906 -4.203125 3.570313 -4.347656 3.199219 -4.351563 C 2.824219 -4.347656 2.507813 -4.203125 2.257813 -3.921875 C 2 -3.632813 1.875 -3.226563 1.878906 -2.695313 Z M 1.878906 -2.695313 "/>
-</symbol>
-<symbol overflow="visible" id="glyph0-2">
-<path style="stroke:none;" d="M 0.242188 -1.539063 L 1.675781 -1.757813 C 1.734375 -1.476563 1.855469 -1.265625 2.042969 -1.125 C 2.226563 -0.980469 2.488281 -0.90625 2.828125 -0.910156 C 3.191406 -0.90625 3.46875 -0.976563 3.660156 -1.113281 C 3.78125 -1.207031 3.84375 -1.332031 3.847656 -1.492188 C 3.84375 -1.597656 3.808594 -1.6875 3.746094 -1.761719 C 3.671875 -1.828125 3.515625 -1.890625 3.269531 -1.949219 C 2.117188 -2.199219 1.386719 -2.429688 1.082031 -2.644531 C 0.652344 -2.933594 0.4375 -3.339844 0.441406 -3.859375 C 0.4375 -4.324219 0.621094 -4.714844 0.992188 -5.035156 C 1.359375 -5.347656 1.933594 -5.507813 2.710938 -5.511719 C 3.445313 -5.507813 3.992188 -5.386719 4.355469 -5.152344 C 4.710938 -4.910156 4.960938 -4.554688 5.097656 -4.085938 L 3.75 -3.835938 C 3.691406 -4.042969 3.582031 -4.203125 3.421875 -4.320313 C 3.261719 -4.429688 3.03125 -4.488281 2.734375 -4.488281 C 2.355469 -4.488281 2.085938 -4.433594 1.929688 -4.328125 C 1.816406 -4.253906 1.761719 -4.15625 1.765625 -4.039063 C 1.761719 -3.933594 1.8125 -3.847656 1.910156 -3.78125 C 2.035156 -3.683594 2.476563 -3.550781 3.238281 -3.378906 C 3.996094 -3.203125 4.527344 -2.992188 4.832031 -2.746094 C 5.125 -2.492188 5.273438 -2.136719 5.277344 -1.683594 C 5.273438 -1.1875 5.066406 -0.765625 4.65625 -0.410156 C 4.242188 -0.0585938 3.632813 0.117188 2.828125 0.121094 C 2.089844 0.117188 1.507813 -0.0273438 1.082031 -0.324219 C 0.652344 -0.621094 0.371094 -1.027344 0.242188 -1.539063 Z M 0.242188 -1.539063 "/>
-</symbol>
-<symbol overflow="visible" id="glyph0-3">
-<path style="stroke:none;" d="M 3.21875 -5.390625 L 3.21875 -4.253906 L 2.242188 -4.253906 L 2.242188 -2.082031 C 2.238281 -1.636719 2.246094 -1.382813 2.269531 -1.3125 C 2.285156 -1.238281 2.328125 -1.175781 2.398438 -1.132813 C 2.460938 -1.082031 2.542969 -1.058594 2.640625 -1.0625 C 2.769531 -1.058594 2.960938 -1.105469 3.214844 -1.199219 L 3.335938 -0.0898438 C 3 0.0507813 2.625 0.117188 2.207031 0.121094 C 1.945313 0.117188 1.714844 0.078125 1.511719 -0.0078125 C 1.304688 -0.0898438 1.152344 -0.203125 1.058594 -0.339844 C 0.960938 -0.476563 0.894531 -0.660156 0.859375 -0.898438 C 0.824219 -1.0625 0.808594 -1.398438 0.8125 -1.902344 L 0.8125 -4.253906 L 0.15625 -4.253906 L 0.15625 -5.390625 L 0.8125 -5.390625 L 0.8125 -6.460938 L 2.242188 -7.292969 L 2.242188 -5.390625 Z M 3.21875 -5.390625 "/>
-</symbol>
-<symbol overflow="visible" id="glyph0-4">
-<path style="stroke:none;" d="M 0.757813 0 L 0.757813 -7.441406 L 6.273438 -7.441406 L 6.273438 -6.183594 L 2.257813 -6.183594 L 2.257813 -4.53125 L 5.996094 -4.53125 L 5.996094 -3.277344 L 2.257813 -3.277344 L 2.257813 -1.253906 L 6.414063 -1.253906 L 6.414063 0 Z M 0.757813 0 "/>
-</symbol>
-<symbol overflow="visible" id="glyph0-5">
-<path style="stroke:none;" d="M 0.0625 0 L 2.003906 -2.777344 L 0.140625 -5.390625 L 1.882813 -5.390625 L 2.835938 -3.910156 L 3.84375 -5.390625 L 5.515625 -5.390625 L 3.691406 -2.835938 L 5.683594 0 L 3.933594 0 L 2.835938 -1.667969 L 1.730469 0 Z M 0.0625 0 "/>
-</symbol>
-<symbol overflow="visible" id="glyph0-6">
-<path style="stroke:none;" d="M 3.867188 -1.714844 L 5.289063 -1.476563 C 5.105469 -0.953125 4.816406 -0.554688 4.421875 -0.285156 C 4.027344 -0.015625 3.535156 0.117188 2.945313 0.121094 C 2.003906 0.117188 1.308594 -0.1875 0.863281 -0.796875 C 0.503906 -1.285156 0.328125 -1.902344 0.328125 -2.65625 C 0.328125 -3.546875 0.558594 -4.246094 1.027344 -4.753906 C 1.492188 -5.253906 2.085938 -5.507813 2.800781 -5.511719 C 3.601563 -5.507813 4.234375 -5.242188 4.699219 -4.714844 C 5.160156 -4.183594 5.378906 -3.375 5.363281 -2.285156 L 1.792969 -2.285156 C 1.800781 -1.859375 1.914063 -1.527344 2.136719 -1.296875 C 2.351563 -1.058594 2.625 -0.941406 2.960938 -0.945313 C 3.179688 -0.941406 3.367188 -1.003906 3.519531 -1.125 C 3.671875 -1.246094 3.789063 -1.441406 3.867188 -1.714844 Z M 3.949219 -3.15625 C 3.9375 -3.566406 3.828125 -3.878906 3.628906 -4.097656 C 3.421875 -4.308594 3.175781 -4.417969 2.886719 -4.421875 C 2.574219 -4.417969 2.316406 -4.304688 2.117188 -4.082031 C 1.910156 -3.851563 1.8125 -3.542969 1.816406 -3.15625 Z M 3.949219 -3.15625 "/>
-</symbol>
-<symbol overflow="visible" id="glyph0-7">
-<path style="stroke:none;" d="M 5.445313 -3.796875 L 4.039063 -3.542969 C 3.988281 -3.820313 3.882813 -4.03125 3.714844 -4.175781 C 3.546875 -4.316406 3.328125 -4.386719 3.066406 -4.390625 C 2.707031 -4.386719 2.425781 -4.265625 2.214844 -4.023438 C 2.003906 -3.777344 1.898438 -3.367188 1.898438 -2.792969 C 1.898438 -2.148438 2.003906 -1.695313 2.21875 -1.433594 C 2.433594 -1.167969 2.722656 -1.035156 3.085938 -1.039063 C 3.355469 -1.035156 3.578125 -1.113281 3.75 -1.269531 C 3.921875 -1.421875 4.042969 -1.6875 4.117188 -2.066406 L 5.515625 -1.828125 C 5.367188 -1.183594 5.089844 -0.695313 4.679688 -0.371094 C 4.261719 -0.0429688 3.707031 0.117188 3.019531 0.121094 C 2.226563 0.117188 1.597656 -0.128906 1.132813 -0.625 C 0.660156 -1.121094 0.425781 -1.808594 0.429688 -2.691406 C 0.425781 -3.574219 0.664063 -4.265625 1.136719 -4.765625 C 1.605469 -5.257813 2.242188 -5.507813 3.046875 -5.511719 C 3.703125 -5.507813 4.222656 -5.367188 4.609375 -5.085938 C 4.996094 -4.804688 5.273438 -4.375 5.445313 -3.796875 Z M 5.445313 -3.796875 "/>
-</symbol>
-<symbol overflow="visible" id="glyph0-8">
-<path style="stroke:none;" d="M 4.292969 0 L 4.292969 -0.808594 C 4.09375 -0.519531 3.835938 -0.289063 3.519531 -0.125 C 3.195313 0.0390625 2.859375 0.117188 2.507813 0.121094 C 2.144531 0.117188 1.816406 0.0390625 1.53125 -0.117188 C 1.242188 -0.273438 1.035156 -0.496094 0.90625 -0.785156 C 0.777344 -1.074219 0.714844 -1.472656 0.714844 -1.980469 L 0.714844 -5.390625 L 2.140625 -5.390625 L 2.140625 -2.914063 C 2.136719 -2.152344 2.164063 -1.6875 2.21875 -1.519531 C 2.269531 -1.347656 2.367188 -1.214844 2.507813 -1.113281 C 2.644531 -1.011719 2.820313 -0.960938 3.035156 -0.964844 C 3.277344 -0.960938 3.496094 -1.027344 3.691406 -1.164063 C 3.882813 -1.296875 4.011719 -1.464844 4.085938 -1.664063 C 4.152344 -1.859375 4.1875 -2.34375 4.191406 -3.117188 L 4.191406 -5.390625 L 5.617188 -5.390625 L 5.617188 0 Z M 4.292969 0 "/>
-</symbol>
-<symbol overflow="visible" id="glyph0-9">
-<path style="stroke:none;" d="M 0.746094 -6.121094 L 0.746094 -7.441406 L 2.171875 -7.441406 L 2.171875 -6.121094 Z M 0.746094 0 L 0.746094 -5.390625 L 2.171875 -5.390625 L 2.171875 0 Z M 0.746094 0 "/>
-</symbol>
-<symbol overflow="visible" id="glyph0-10">
-<path style="stroke:none;" d="M 5.648438 0 L 4.222656 0 L 4.222656 -2.75 C 4.21875 -3.328125 4.1875 -3.707031 4.128906 -3.878906 C 4.066406 -4.050781 3.96875 -4.183594 3.832031 -4.277344 C 3.695313 -4.371094 3.53125 -4.417969 3.339844 -4.421875 C 3.089844 -4.417969 2.867188 -4.351563 2.671875 -4.21875 C 2.476563 -4.082031 2.339844 -3.902344 2.269531 -3.679688 C 2.195313 -3.453125 2.160156 -3.039063 2.164063 -2.441406 L 2.164063 0 L 0.734375 0 L 0.734375 -5.390625 L 2.0625 -5.390625 L 2.0625 -4.597656 C 2.53125 -5.203125 3.121094 -5.507813 3.835938 -5.511719 C 4.148438 -5.507813 4.4375 -5.449219 4.699219 -5.339844 C 4.957031 -5.222656 5.152344 -5.078125 5.289063 -4.90625 C 5.421875 -4.726563 5.515625 -4.527344 5.570313 -4.308594 C 5.621094 -4.082031 5.648438 -3.765625 5.648438 -3.351563 Z M 5.648438 0 "/>
-</symbol>
-<symbol overflow="visible" id="glyph0-11">
-<path style="stroke:none;" d=""/>
-</symbol>
-<symbol overflow="visible" id="glyph0-12">
-<path style="stroke:none;" d="M 0.75 -7.441406 L 3.496094 -7.441406 C 4.113281 -7.441406 4.585938 -7.390625 4.914063 -7.296875 C 5.347656 -7.164063 5.71875 -6.9375 6.03125 -6.613281 C 6.339844 -6.28125 6.578125 -5.878906 6.742188 -5.40625 C 6.902344 -4.929688 6.984375 -4.34375 6.988281 -3.648438 C 6.984375 -3.035156 6.910156 -2.507813 6.761719 -2.066406 C 6.574219 -1.523438 6.308594 -1.085938 5.964844 -0.75 C 5.703125 -0.496094 5.347656 -0.296875 4.90625 -0.15625 C 4.570313 -0.0507813 4.128906 0 3.578125 0 L 0.75 0 Z M 2.253906 -6.183594 L 2.253906 -1.253906 L 3.375 -1.253906 C 3.792969 -1.253906 4.09375 -1.277344 4.285156 -1.324219 C 4.523438 -1.386719 4.726563 -1.488281 4.890625 -1.632813 C 5.050781 -1.777344 5.183594 -2.015625 5.285156 -2.351563 C 5.386719 -2.679688 5.4375 -3.136719 5.4375 -3.714844 C 5.4375 -4.289063 5.386719 -4.730469 5.285156 -5.039063 C 5.183594 -5.347656 5.039063 -5.585938 4.855469 -5.761719 C 4.671875 -5.929688 4.441406 -6.046875 4.160156 -6.109375 C 3.949219 -6.15625 3.539063 -6.179688 2.929688 -6.183594 Z M 2.253906 -6.183594 "/>
-</symbol>
-<symbol overflow="visible" id="glyph0-13">
-<path style="stroke:none;" d="M 0.640625 -5.390625 L 1.953125 -5.390625 L 1.953125 -4.65625 C 2.421875 -5.222656 2.984375 -5.507813 3.632813 -5.511719 C 3.976563 -5.507813 4.273438 -5.4375 4.53125 -5.296875 C 4.78125 -5.15625 4.992188 -4.941406 5.15625 -4.65625 C 5.390625 -4.941406 5.644531 -5.15625 5.921875 -5.296875 C 6.195313 -5.4375 6.488281 -5.507813 6.800781 -5.511719 C 7.195313 -5.507813 7.53125 -5.425781 7.804688 -5.269531 C 8.078125 -5.105469 8.28125 -4.871094 8.421875 -4.5625 C 8.515625 -4.332031 8.566406 -3.960938 8.566406 -3.445313 L 8.566406 0 L 7.140625 0 L 7.140625 -3.082031 C 7.136719 -3.613281 7.089844 -3.957031 6.996094 -4.117188 C 6.863281 -4.316406 6.660156 -4.417969 6.386719 -4.421875 C 6.183594 -4.417969 5.992188 -4.355469 5.820313 -4.238281 C 5.640625 -4.113281 5.515625 -3.9375 5.441406 -3.703125 C 5.359375 -3.46875 5.320313 -3.097656 5.324219 -2.589844 L 5.324219 0 L 3.898438 0 L 3.898438 -2.953125 C 3.894531 -3.476563 3.871094 -3.8125 3.820313 -3.96875 C 3.769531 -4.117188 3.691406 -4.230469 3.585938 -4.308594 C 3.476563 -4.378906 3.332031 -4.417969 3.152344 -4.421875 C 2.929688 -4.417969 2.730469 -4.359375 2.554688 -4.242188 C 2.378906 -4.125 2.253906 -3.953125 2.179688 -3.730469 C 2.101563 -3.503906 2.0625 -3.132813 2.066406 -2.617188 L 2.066406 0 L 0.640625 0 Z M 0.640625 -5.390625 "/>
-</symbol>
-<symbol overflow="visible" id="glyph0-14">
-<path style="stroke:none;" d="M 1.8125 -3.746094 L 0.519531 -3.980469 C 0.660156 -4.496094 0.910156 -4.882813 1.265625 -5.136719 C 1.621094 -5.382813 2.148438 -5.507813 2.851563 -5.511719 C 3.488281 -5.507813 3.960938 -5.433594 4.273438 -5.285156 C 4.582031 -5.132813 4.800781 -4.941406 4.929688 -4.710938 C 5.054688 -4.476563 5.121094 -4.050781 5.121094 -3.4375 L 5.105469 -1.769531 C 5.101563 -1.296875 5.125 -0.945313 5.171875 -0.722656 C 5.214844 -0.496094 5.300781 -0.257813 5.429688 0 L 4.019531 0 C 3.980469 -0.09375 3.933594 -0.234375 3.882813 -0.421875 C 3.855469 -0.503906 3.839844 -0.558594 3.832031 -0.589844 C 3.585938 -0.351563 3.324219 -0.171875 3.046875 -0.0546875 C 2.769531 0.0625 2.472656 0.117188 2.164063 0.121094 C 1.601563 0.117188 1.164063 -0.03125 0.847656 -0.328125 C 0.527344 -0.628906 0.367188 -1.007813 0.371094 -1.472656 C 0.367188 -1.773438 0.441406 -2.046875 0.589844 -2.285156 C 0.734375 -2.523438 0.9375 -2.703125 1.199219 -2.832031 C 1.460938 -2.957031 1.839844 -3.070313 2.335938 -3.167969 C 3 -3.289063 3.460938 -3.40625 3.71875 -3.515625 L 3.71875 -3.660156 C 3.714844 -3.929688 3.648438 -4.125 3.515625 -4.246094 C 3.378906 -4.359375 3.121094 -4.417969 2.75 -4.421875 C 2.492188 -4.417969 2.296875 -4.367188 2.15625 -4.269531 C 2.015625 -4.167969 1.898438 -3.992188 1.8125 -3.746094 Z M 3.71875 -2.589844 C 3.535156 -2.527344 3.246094 -2.453125 2.851563 -2.367188 C 2.453125 -2.28125 2.195313 -2.199219 2.074219 -2.121094 C 1.886719 -1.988281 1.792969 -1.820313 1.796875 -1.621094 C 1.792969 -1.417969 1.867188 -1.246094 2.019531 -1.101563 C 2.164063 -0.953125 2.355469 -0.878906 2.589844 -0.882813 C 2.84375 -0.878906 3.085938 -0.964844 3.324219 -1.136719 C 3.492188 -1.261719 3.605469 -1.417969 3.664063 -1.609375 C 3.695313 -1.730469 3.714844 -1.960938 3.71875 -2.304688 Z M 3.71875 -2.589844 "/>
-</symbol>
-<symbol overflow="visible" id="glyph0-15">
-<path style="stroke:none;" d="M 5.515625 -2.734375 L 6.972656 -2.273438 C 6.75 -1.460938 6.378906 -0.855469 5.859375 -0.464844 C 5.339844 -0.0703125 4.679688 0.121094 3.882813 0.125 C 2.894531 0.121094 2.082031 -0.210938 1.445313 -0.886719 C 0.808594 -1.558594 0.492188 -2.484375 0.492188 -3.65625 C 0.492188 -4.890625 0.808594 -5.851563 1.449219 -6.539063 C 2.085938 -7.222656 2.929688 -7.5625 3.972656 -7.566406 C 4.882813 -7.5625 5.621094 -7.296875 6.191406 -6.761719 C 6.527344 -6.441406 6.78125 -5.984375 6.953125 -5.390625 L 5.464844 -5.035156 C 5.375 -5.417969 5.191406 -5.722656 4.914063 -5.949219 C 4.632813 -6.167969 4.292969 -6.28125 3.898438 -6.285156 C 3.34375 -6.28125 2.898438 -6.082031 2.554688 -5.691406 C 2.210938 -5.292969 2.039063 -4.652344 2.039063 -3.765625 C 2.039063 -2.824219 2.207031 -2.152344 2.546875 -1.753906 C 2.882813 -1.351563 3.324219 -1.152344 3.867188 -1.15625 C 4.265625 -1.152344 4.605469 -1.28125 4.894531 -1.535156 C 5.179688 -1.789063 5.386719 -2.1875 5.515625 -2.734375 Z M 5.515625 -2.734375 "/>
-</symbol>
-<symbol overflow="visible" id="glyph0-16">
-<path style="stroke:none;" d="M 5.691406 0 L 4.363281 0 L 4.363281 -0.792969 C 4.140625 -0.480469 3.882813 -0.25 3.585938 -0.101563 C 3.285156 0.046875 2.980469 0.117188 2.679688 0.121094 C 2.054688 0.117188 1.523438 -0.128906 1.085938 -0.625 C 0.640625 -1.121094 0.421875 -1.816406 0.425781 -2.714844 C 0.421875 -3.625 0.636719 -4.320313 1.070313 -4.796875 C 1.496094 -5.269531 2.039063 -5.507813 2.699219 -5.511719 C 3.300781 -5.507813 3.820313 -5.257813 4.261719 -4.761719 L 4.261719 -7.441406 L 5.691406 -7.441406 Z M 1.882813 -2.8125 C 1.878906 -2.234375 1.960938 -1.816406 2.121094 -1.5625 C 2.347656 -1.1875 2.667969 -1 3.085938 -1.003906 C 3.410156 -1 3.6875 -1.140625 3.921875 -1.421875 C 4.148438 -1.699219 4.265625 -2.117188 4.269531 -2.675781 C 4.265625 -3.292969 4.152344 -3.738281 3.933594 -4.011719 C 3.707031 -4.28125 3.421875 -4.417969 3.074219 -4.421875 C 2.734375 -4.417969 2.449219 -4.28125 2.222656 -4.015625 C 1.992188 -3.742188 1.878906 -3.34375 1.882813 -2.8125 Z M 1.882813 -2.8125 "/>
-</symbol>
-<symbol overflow="visible" id="glyph0-17">
-<path style="stroke:none;" d="M 0.746094 -7.441406 L 2.25 -7.441406 L 2.25 -3.410156 C 2.25 -2.769531 2.265625 -2.355469 2.304688 -2.167969 C 2.367188 -1.863281 2.519531 -1.617188 2.761719 -1.433594 C 3 -1.246094 3.332031 -1.152344 3.757813 -1.15625 C 4.179688 -1.152344 4.5 -1.242188 4.71875 -1.417969 C 4.933594 -1.589844 5.066406 -1.800781 5.109375 -2.058594 C 5.152344 -2.308594 5.171875 -2.730469 5.175781 -3.324219 L 5.175781 -7.441406 L 6.679688 -7.441406 L 6.679688 -3.53125 C 6.679688 -2.636719 6.636719 -2.003906 6.558594 -1.636719 C 6.472656 -1.265625 6.324219 -0.957031 6.109375 -0.703125 C 5.886719 -0.449219 5.59375 -0.246094 5.230469 -0.0976563 C 4.863281 0.0507813 4.386719 0.121094 3.800781 0.125 C 3.085938 0.121094 2.546875 0.0429688 2.183594 -0.121094 C 1.8125 -0.28125 1.523438 -0.492188 1.3125 -0.757813 C 1.101563 -1.015625 0.960938 -1.292969 0.894531 -1.582031 C 0.792969 -2.007813 0.742188 -2.636719 0.746094 -3.472656 Z M 0.746094 -7.441406 "/>
-</symbol>
-<symbol overflow="visible" id="glyph0-18">
-<path style="stroke:none;" d="M 2.113281 0 L 0.683594 0 L 0.683594 -5.390625 L 2.011719 -5.390625 L 2.011719 -4.625 C 2.234375 -4.984375 2.4375 -5.222656 2.621094 -5.339844 C 2.800781 -5.449219 3.007813 -5.507813 3.238281 -5.511719 C 3.5625 -5.507813 3.875 -5.417969 4.175781 -5.242188 L 3.734375 -4 C 3.492188 -4.152344 3.269531 -4.230469 3.066406 -4.234375 C 2.863281 -4.230469 2.695313 -4.175781 2.558594 -4.066406 C 2.417969 -3.957031 2.308594 -3.757813 2.230469 -3.46875 C 2.148438 -3.179688 2.109375 -2.578125 2.113281 -1.664063 Z M 2.113281 0 "/>
-</symbol>
-<symbol overflow="visible" id="glyph0-19">
-<path style="stroke:none;" d="M 0.746094 0 L 0.746094 -7.441406 L 2.171875 -7.441406 L 2.171875 0 Z M 0.746094 0 "/>
-</symbol>
-<symbol overflow="visible" id="glyph0-20">
-<path style="stroke:none;" d="M 0.707031 -5.390625 L 2.035156 -5.390625 L 2.035156 -4.597656 C 2.203125 -4.867188 2.4375 -5.085938 2.734375 -5.257813 C 3.027344 -5.421875 3.355469 -5.507813 3.714844 -5.511719 C 4.339844 -5.507813 4.871094 -5.261719 5.308594 -4.773438 C 5.746094 -4.28125 5.964844 -3.597656 5.964844 -2.726563 C 5.964844 -1.820313 5.742188 -1.121094 5.304688 -0.625 C 4.859375 -0.128906 4.328125 0.117188 3.707031 0.121094 C 3.40625 0.117188 3.136719 0.0625 2.894531 -0.0546875 C 2.652344 -0.171875 2.398438 -0.375 2.132813 -0.664063 L 2.132813 2.050781 L 0.707031 2.050781 Z M 2.117188 -2.785156 C 2.113281 -2.175781 2.234375 -1.730469 2.476563 -1.445313 C 2.714844 -1.152344 3.007813 -1.007813 3.355469 -1.011719 C 3.683594 -1.007813 3.957031 -1.140625 4.179688 -1.410156 C 4.398438 -1.671875 4.511719 -2.105469 4.511719 -2.714844 C 4.511719 -3.277344 4.398438 -3.699219 4.171875 -3.972656 C 3.945313 -4.246094 3.664063 -4.382813 3.328125 -4.386719 C 2.980469 -4.382813 2.691406 -4.246094 2.460938 -3.980469 C 2.230469 -3.707031 2.113281 -3.308594 2.117188 -2.785156 Z M 2.117188 -2.785156 "/>
-</symbol>
-<symbol overflow="visible" id="glyph0-21">
-<path style="stroke:none;" d="M 0.695313 0 L 0.695313 -7.441406 L 2.121094 -7.441406 L 2.121094 -3.492188 L 3.792969 -5.390625 L 5.546875 -5.390625 L 3.707031 -3.421875 L 5.679688 0 L 4.140625 0 L 2.785156 -2.421875 L 2.121094 -1.726563 L 2.121094 0 Z M 0.695313 0 "/>
-</symbol>
-<symbol overflow="visible" id="glyph1-0">
-<path style="stroke:none;" d="M 7.371094 0 L 5.757813 0 L 5.117188 -1.667969 L 2.179688 -1.667969 L 1.574219 0 L 0 0 L 2.863281 -7.347656 L 4.429688 -7.347656 Z M 4.640625 -2.90625 L 3.628906 -5.632813 L 2.636719 -2.90625 Z M 4.640625 -2.90625 "/>
-</symbol>
-<symbol overflow="visible" id="glyph1-1">
-<path style="stroke:none;" d="M 5.378906 -3.75 L 3.988281 -3.5 C 3.9375 -3.773438 3.832031 -3.984375 3.667969 -4.125 C 3.503906 -4.265625 3.289063 -4.335938 3.027344 -4.335938 C 2.671875 -4.335938 2.390625 -4.214844 2.1875 -3.972656 C 1.976563 -3.730469 1.875 -3.324219 1.875 -2.757813 C 1.875 -2.121094 1.980469 -1.675781 2.191406 -1.417969 C 2.402344 -1.15625 2.6875 -1.023438 3.046875 -1.027344 C 3.3125 -1.023438 3.53125 -1.101563 3.703125 -1.253906 C 3.875 -1.40625 3.996094 -1.667969 4.066406 -2.039063 L 5.449219 -1.804688 C 5.304688 -1.167969 5.027344 -0.6875 4.621094 -0.363281 C 4.210938 -0.0429688 3.664063 0.117188 2.980469 0.121094 C 2.199219 0.117188 1.578125 -0.125 1.117188 -0.613281 C 0.652344 -1.101563 0.421875 -1.78125 0.425781 -2.65625 C 0.421875 -3.53125 0.652344 -4.214844 1.121094 -4.707031 C 1.582031 -5.191406 2.210938 -5.4375 3.007813 -5.441406 C 3.652344 -5.4375 4.167969 -5.296875 4.554688 -5.023438 C 4.933594 -4.742188 5.210938 -4.320313 5.378906 -3.75 Z M 5.378906 -3.75 "/>
-</symbol>
-<symbol overflow="visible" id="glyph1-2">
-<path style="stroke:none;" d="M 3.820313 -1.695313 L 5.222656 -1.457031 C 5.039063 -0.941406 4.753906 -0.550781 4.367188 -0.28125 C 3.976563 -0.015625 3.492188 0.117188 2.90625 0.121094 C 1.980469 0.117188 1.292969 -0.183594 0.851563 -0.785156 C 0.496094 -1.269531 0.320313 -1.878906 0.324219 -2.621094 C 0.320313 -3.5 0.550781 -4.191406 1.015625 -4.691406 C 1.476563 -5.1875 2.0625 -5.4375 2.765625 -5.441406 C 3.554688 -5.4375 4.179688 -5.175781 4.640625 -4.65625 C 5.097656 -4.132813 5.316406 -3.332031 5.296875 -2.253906 L 1.769531 -2.253906 C 1.777344 -1.835938 1.890625 -1.511719 2.109375 -1.28125 C 2.328125 -1.046875 2.597656 -0.929688 2.921875 -0.933594 C 3.140625 -0.929688 3.324219 -0.992188 3.476563 -1.113281 C 3.625 -1.230469 3.742188 -1.421875 3.820313 -1.695313 Z M 3.898438 -3.117188 C 3.882813 -3.523438 3.777344 -3.832031 3.582031 -4.046875 C 3.378906 -4.257813 3.136719 -4.363281 2.851563 -4.367188 C 2.542969 -4.363281 2.289063 -4.253906 2.089844 -4.03125 C 1.886719 -3.804688 1.789063 -3.5 1.792969 -3.117188 Z M 3.898438 -3.117188 "/>
-</symbol>
-<symbol overflow="visible" id="glyph1-3">
-<path style="stroke:none;" d="M 0.738281 0 L 0.738281 -7.347656 L 2.144531 -7.347656 L 2.144531 0 Z M 0.738281 0 "/>
-</symbol>
-<symbol overflow="visible" id="glyph1-4">
-<path style="stroke:none;" d="M 2.085938 0 L 0.675781 0 L 0.675781 -5.324219 L 1.984375 -5.324219 L 1.984375 -4.566406 C 2.207031 -4.917969 2.40625 -5.152344 2.585938 -5.269531 C 2.761719 -5.378906 2.96875 -5.4375 3.199219 -5.441406 C 3.519531 -5.4375 3.828125 -5.347656 4.125 -5.175781 L 3.6875 -3.949219 C 3.445313 -4.101563 3.226563 -4.175781 3.027344 -4.179688 C 2.828125 -4.175781 2.660156 -4.121094 2.523438 -4.015625 C 2.386719 -3.90625 2.28125 -3.710938 2.203125 -3.429688 C 2.125 -3.140625 2.085938 -2.546875 2.085938 -1.644531 Z M 2.085938 0 "/>
-</symbol>
-<symbol overflow="visible" id="glyph1-5">
-<path style="stroke:none;" d="M 1.789063 -3.699219 L 0.511719 -3.929688 C 0.652344 -4.441406 0.902344 -4.820313 1.253906 -5.070313 C 1.605469 -5.3125 2.125 -5.4375 2.816406 -5.441406 C 3.441406 -5.4375 3.910156 -5.363281 4.21875 -5.21875 C 4.523438 -5.066406 4.738281 -4.878906 4.867188 -4.652344 C 4.992188 -4.421875 5.054688 -4 5.058594 -3.394531 L 5.042969 -1.75 C 5.039063 -1.277344 5.0625 -0.933594 5.109375 -0.714844 C 5.152344 -0.492188 5.234375 -0.253906 5.363281 0 L 3.96875 0 C 3.929688 -0.09375 3.886719 -0.230469 3.835938 -0.414063 C 3.808594 -0.496094 3.792969 -0.550781 3.785156 -0.582031 C 3.542969 -0.34375 3.285156 -0.167969 3.011719 -0.0546875 C 2.738281 0.0625 2.445313 0.117188 2.136719 0.121094 C 1.585938 0.117188 1.152344 -0.0273438 0.839844 -0.324219 C 0.519531 -0.621094 0.363281 -1 0.367188 -1.453125 C 0.363281 -1.753906 0.433594 -2.019531 0.582031 -2.257813 C 0.722656 -2.492188 0.925781 -2.675781 1.183594 -2.800781 C 1.441406 -2.925781 1.8125 -3.035156 2.304688 -3.128906 C 2.960938 -3.25 3.417969 -3.363281 3.675781 -3.472656 L 3.675781 -3.613281 C 3.671875 -3.882813 3.605469 -4.074219 3.472656 -4.191406 C 3.335938 -4.304688 3.082031 -4.363281 2.714844 -4.367188 C 2.460938 -4.363281 2.265625 -4.316406 2.128906 -4.21875 C 1.984375 -4.117188 1.871094 -3.941406 1.789063 -3.699219 Z M 3.675781 -2.554688 C 3.488281 -2.492188 3.203125 -2.417969 2.816406 -2.339844 C 2.421875 -2.253906 2.167969 -2.171875 2.050781 -2.09375 C 1.863281 -1.960938 1.769531 -1.796875 1.773438 -1.597656 C 1.769531 -1.398438 1.84375 -1.226563 1.992188 -1.085938 C 2.140625 -0.9375 2.328125 -0.867188 2.554688 -0.871094 C 2.808594 -0.867188 3.050781 -0.949219 3.28125 -1.121094 C 3.453125 -1.246094 3.5625 -1.402344 3.617188 -1.589844 C 3.652344 -1.707031 3.671875 -1.9375 3.675781 -2.277344 Z M 3.675781 -2.554688 "/>
-</symbol>
-<symbol overflow="visible" id="glyph1-6">
-<path style="stroke:none;" d="M 3.179688 -5.324219 L 3.179688 -4.199219 L 2.214844 -4.199219 L 2.214844 -2.054688 C 2.214844 -1.617188 2.222656 -1.363281 2.242188 -1.292969 C 2.257813 -1.21875 2.300781 -1.160156 2.367188 -1.117188 C 2.429688 -1.066406 2.507813 -1.042969 2.605469 -1.046875 C 2.734375 -1.042969 2.921875 -1.089844 3.171875 -1.183594 L 3.292969 -0.0898438 C 2.964844 0.0507813 2.59375 0.117188 2.179688 0.121094 C 1.925781 0.117188 1.695313 0.078125 1.492188 -0.0078125 C 1.285156 -0.0898438 1.136719 -0.203125 1.042969 -0.339844 C 0.949219 -0.472656 0.882813 -0.652344 0.847656 -0.886719 C 0.8125 -1.046875 0.796875 -1.378906 0.800781 -1.878906 L 0.800781 -4.199219 L 0.15625 -4.199219 L 0.15625 -5.324219 L 0.800781 -5.324219 L 0.800781 -6.378906 L 2.214844 -7.203125 L 2.214844 -5.324219 Z M 3.179688 -5.324219 "/>
-</symbol>
-<symbol overflow="visible" id="glyph1-7">
-<path style="stroke:none;" d="M 0.410156 -2.738281 C 0.410156 -3.199219 0.523438 -3.652344 0.753906 -4.09375 C 0.984375 -4.527344 1.3125 -4.859375 1.734375 -5.09375 C 2.15625 -5.320313 2.628906 -5.4375 3.152344 -5.441406 C 3.957031 -5.4375 4.617188 -5.175781 5.132813 -4.65625 C 5.648438 -4.132813 5.90625 -3.472656 5.90625 -2.675781 C 5.90625 -1.867188 5.644531 -1.199219 5.125 -0.671875 C 4.601563 -0.144531 3.949219 0.117188 3.164063 0.121094 C 2.671875 0.117188 2.203125 0.0078125 1.765625 -0.210938 C 1.320313 -0.429688 0.984375 -0.75 0.753906 -1.179688 C 0.523438 -1.601563 0.410156 -2.121094 0.410156 -2.738281 Z M 1.855469 -2.660156 C 1.851563 -2.132813 1.976563 -1.726563 2.230469 -1.449219 C 2.476563 -1.164063 2.785156 -1.023438 3.15625 -1.027344 C 3.523438 -1.023438 3.832031 -1.164063 4.082031 -1.449219 C 4.328125 -1.726563 4.453125 -2.136719 4.457031 -2.671875 C 4.453125 -3.191406 4.328125 -3.589844 4.082031 -3.875 C 3.832031 -4.152344 3.523438 -4.292969 3.15625 -4.296875 C 2.785156 -4.292969 2.476563 -4.152344 2.230469 -3.875 C 1.976563 -3.589844 1.851563 -3.1875 1.855469 -2.660156 Z M 1.855469 -2.660156 "/>
-</symbol>
-</g>
-<image id="image5" width="90" height="42" xlink:href="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAFoAAAAqCAYAAAAzikzDAAAAAXNSR0IArs4c6QAAAARnQU1BAACxjwv8YQUAAAAJcEhZcwAADsMAAA7DAcdvqGQAAAETSURBVGhD7dsxa8JgGEXhSCsOLZQuDg4iglBou7T//795TwZRdJEkdzofPBCTuJy8ZMo3XK1VvMRas6AlTW8WJzfxHh/xqUloSEua0nZcVOcEN+ziEEdNQkNa0pS242RTnPpcOMVP/GoSGtKSprQdp/o1GHWewnf8xb8moSEtaUpbGo8vbkackedpPPqjnkdLmtKWxoZeiKFLDF1i6BJDlxi6xNAlhi4xdImhSwxdYugSQ5cYusTQJYYuMXSJoUsMXWLoEkOXGLrE0CWGLjF0iaFLDF1i6BJDlxi6xNAld6H9EH0Zd6HdWrGMr9jH5Yt/Nwstg8jbeIvLziwO3P42LzoSmdfGzX5DfhCcC5oHPdN1GM6So4mB0BJ9qQAAAABJRU5ErkJggg=="/>
-<pattern id="pattern0" patternUnits="userSpaceOnUse" width="90" height="42"  patternTransform="matrix(0.708749,0,0,0.708749,13.872422,13.862211)">
-  <use xlink:href="#image5"/>
-</pattern>
-<image id="image8" width="90" height="42" xlink:href="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAFoAAAAqCAYAAAAzikzDAAAAAXNSR0IArs4c6QAAAARnQU1BAACxjwv8YQUAAAAJcEhZcwAADsMAAA7DAcdvqGQAAAE3SURBVGhD7du5SgRRFEXRdh4Qh8BATRwQA6dGUf//zzy7wEZBA33tifaDlVRXJ7suFdWdfTorsRYbWgpa0vTL4eJW7MVBHGkIDWlJU9pOh+pc4IbTOI9LDaEhLWlK22myKU59friOu7jXEBrSkqa0naZ6PRh1nsJtPMWzhtCQljSlLY2nFzcjzsjzNL77o36PljSlLY0N/U8MXWLoEkOXGLrE0CWGLjF0iaFLDF1i6BJDlxi6xNAlhi4xdImhSwxdYugSQ5cYusTQJYYuMXSJoUsMXWLoEkOXGLrE0CWGLvkx9FXM401LQUuaLkLzNfphXMRDvMSrhtCQljSl7fTF/8cOy1ncBDc8aggNaUnTxQ4LG0ObsR/HcRIsuejvaEhLmtJ2sW+4GrxHtmMndjWEhrSkadrOZu/LZ5bZi3jZrAAAAABJRU5ErkJggg=="/>
-<pattern id="pattern1" patternUnits="userSpaceOnUse" width="90" height="42"  patternTransform="matrix(0.708749,0,0,0.708749,13.82061,42.599514)">
-  <use xlink:href="#image8"/>
-</pattern>
-</defs>
-<g id="surface1">
-<path style="fill-rule:evenodd;fill:rgb(86.666667%,44.313725%,58.431373%);fill-opacity:1;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(86.666667%,44.313725%,58.431373%);stroke-opacity:1;stroke-miterlimit:10;" d="M 127.282099 36.291729 L 127.282099 -0.00129982 L 0.000215292 -0.00129982 L 0.000215292 36.291729 Z M 127.282099 36.291729 " transform="matrix(0.708749,0,0,0.708749,81.464691,14.883734)"/>
-<g style="fill:rgb(100%,100%,100%);fill-opacity:1;">
-  <use xlink:href="#glyph0-0" x="114.109375" y="31.261719"/>
-  <use xlink:href="#glyph0-1" x="121.616303" y="31.261719"/>
-  <use xlink:href="#glyph0-2" x="127.965976" y="31.261719"/>
-  <use xlink:href="#glyph0-3" x="133.747174" y="31.261719"/>
-</g>
-<path style="fill-rule:evenodd;fill:rgb(86.666667%,44.313725%,58.431373%);fill-opacity:1;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(86.666667%,44.313725%,58.431373%);stroke-opacity:1;stroke-miterlimit:10;" d="M 127.468649 36.259496 L 127.468649 -0.000464223 L -0.000624346 -0.000464223 L -0.000624346 36.259496 Z M 127.468649 36.259496 " transform="matrix(0.708749,0,0,0.708749,173.375443,14.89486)"/>
-<g style="fill:rgb(100%,100%,100%);fill-opacity:1;">
-  <use xlink:href="#glyph1-0" x="189.234375" y="31.261719"/>
-  <use xlink:href="#glyph1-1" x="196.647418" y="31.261719"/>
-  <use xlink:href="#glyph1-1" x="202.356314" y="31.261719"/>
-  <use xlink:href="#glyph1-2" x="208.065209" y="31.261719"/>
-  <use xlink:href="#glyph1-3" x="213.774104" y="31.261719"/>
-  <use xlink:href="#glyph1-2" x="216.626046" y="31.261719"/>
-  <use xlink:href="#glyph1-4" x="222.334941" y="31.261719"/>
-  <use xlink:href="#glyph1-5" x="226.329664" y="31.261719"/>
-  <use xlink:href="#glyph1-6" x="232.03856" y="31.261719"/>
-  <use xlink:href="#glyph1-7" x="235.45688" y="31.261719"/>
-  <use xlink:href="#glyph1-4" x="241.727142" y="31.261719"/>
-</g>
-<path style=" stroke:none;fill-rule:evenodd;fill:url(#pattern0);" d="M 13.871094 13.863281 L 77.222656 13.863281 L 77.222656 43.175781 L 13.871094 43.175781 Z M 13.871094 13.863281 "/>
-<path style="fill-rule:evenodd;fill:rgb(63.529412%,31.764706%,42.352941%);fill-opacity:1;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(63.529412%,31.764706%,42.352941%);stroke-opacity:1;stroke-miterlimit:10;" d="M 84.383362 36.359795 L 84.383362 0.000628383 L -0.0027517 0.000628383 L -0.0027517 36.359795 Z M 84.383362 36.359795 " transform="matrix(0.708749,0,0,0.708749,14.935544,14.925336)"/>
-<g style="fill:rgb(100%,100%,100%);fill-opacity:1;">
-  <use xlink:href="#glyph0-4" x="19.136719" y="24.882813"/>
-  <use xlink:href="#glyph0-5" x="26.070095" y="24.882813"/>
-  <use xlink:href="#glyph0-6" x="31.851292" y="24.882813"/>
-  <use xlink:href="#glyph0-7" x="37.63249" y="24.882813"/>
-  <use xlink:href="#glyph0-8" x="43.413687" y="24.882813"/>
-  <use xlink:href="#glyph0-3" x="49.76336" y="24.882813"/>
-  <use xlink:href="#glyph0-9" x="53.224973" y="24.882813"/>
-  <use xlink:href="#glyph0-1" x="56.113034" y="24.882813"/>
-  <use xlink:href="#glyph0-10" x="62.462707" y="24.882813"/>
-  <use xlink:href="#glyph0-11" x="68.81238" y="24.882813"/>
-</g>
-<g style="fill:rgb(100%,100%,100%);fill-opacity:1;">
-  <use xlink:href="#glyph0-12" x="25.515625" y="37.640625"/>
-  <use xlink:href="#glyph0-1" x="33.022553" y="37.640625"/>
-  <use xlink:href="#glyph0-13" x="39.372226" y="37.640625"/>
-  <use xlink:href="#glyph0-14" x="48.615036" y="37.640625"/>
-  <use xlink:href="#glyph0-9" x="54.396233" y="37.640625"/>
-  <use xlink:href="#glyph0-10" x="57.284294" y="37.640625"/>
-</g>
-<path style=" stroke:none;fill-rule:evenodd;fill:url(#pattern1);" d="M 13.820313 42.597656 L 77.167969 42.597656 L 77.167969 72.226563 L 13.820313 72.226563 Z M 13.820313 42.597656 "/>
-<path style="fill-rule:evenodd;fill:rgb(73.72549%,50.196078%,0%);fill-opacity:1;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(73.72549%,50.196078%,0%);stroke-opacity:1;stroke-miterlimit:10;" d="M 84.379303 36.802097 L 84.379303 0.00201298 L -0.00129982 0.00201298 L -0.00129982 36.802097 Z M 84.379303 36.802097 " transform="matrix(0.708749,0,0,0.708749,14.883734,43.662636)"/>
-<g style="fill:rgb(100%,100%,100%);fill-opacity:1;">
-  <use xlink:href="#glyph0-15" x="30.476563" y="60.320313"/>
-  <use xlink:href="#glyph0-1" x="37.98349" y="60.320313"/>
-  <use xlink:href="#glyph0-16" x="44.333164" y="60.320313"/>
-  <use xlink:href="#glyph0-6" x="50.682837" y="60.320313"/>
-</g>
-<path style="fill-rule:evenodd;fill:rgb(88.235294%,60.392157%,0%);fill-opacity:1;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(88.235294%,60.392157%,0%);stroke-opacity:1;stroke-miterlimit:10;" d="M 47.127954 16.965547 L 47.127954 0.00124331 L -0.000624346 0.00124331 L -0.000624346 16.965547 Z M 47.127954 16.965547 " transform="matrix(0.708749,0,0,0.708749,176.641068,49.889744)"/>
-<g style="fill:rgb(100%,100%,100%);fill-opacity:1;">
-  <use xlink:href="#glyph0-17" x="180.730469" y="59.613281"/>
-  <use xlink:href="#glyph0-2" x="188.237397" y="59.613281"/>
-  <use xlink:href="#glyph0-6" x="194.018594" y="59.613281"/>
-  <use xlink:href="#glyph0-18" x="199.799791" y="59.613281"/>
-</g>
-<path style="fill-rule:evenodd;fill:rgb(88.235294%,60.392157%,0%);fill-opacity:1;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(88.235294%,60.392157%,0%);stroke-opacity:1;stroke-miterlimit:10;" d="M 61.237209 17.085459 L 61.237209 -0.0000968813 L -0.000731992 -0.0000968813 L -0.000731992 17.085459 Z M 61.237209 17.085459 " transform="matrix(0.708749,0,0,0.708749,220.367706,43.461006)"/>
-<g style="fill:rgb(100%,100%,100%);fill-opacity:1;">
-  <use xlink:href="#glyph0-14" x="225.382813" y="53.234375"/>
-  <use xlink:href="#glyph0-19" x="231.16401" y="53.234375"/>
-  <use xlink:href="#glyph0-20" x="234.052071" y="53.234375"/>
-  <use xlink:href="#glyph0-14" x="240.401744" y="53.234375"/>
-  <use xlink:href="#glyph0-21" x="246.182941" y="53.234375"/>
-  <use xlink:href="#glyph0-14" x="251.964139" y="53.234375"/>
-</g>
-<path style="fill-rule:evenodd;fill:rgb(88.235294%,60.392157%,0%);fill-opacity:1;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(88.235294%,60.392157%,0%);stroke-opacity:1;stroke-miterlimit:10;" d="M 61.237295 16.383431 L 61.237295 -0.00216906 L -0.000645875 -0.00216906 L -0.000645875 16.383431 Z M 61.237295 16.383431 " transform="matrix(0.708749,0,0,0.708749,220.367645,58.337475)"/>
-<g style="fill:rgb(100%,100%,100%);fill-opacity:1;">
-  <use xlink:href="#glyph0-17" x="229.632813" y="68.117188"/>
-  <use xlink:href="#glyph0-2" x="237.13974" y="68.117188"/>
-  <use xlink:href="#glyph0-6" x="242.920938" y="68.117188"/>
-  <use xlink:href="#glyph0-18" x="248.702135" y="68.117188"/>
-</g>
-<path style="fill-rule:evenodd;fill:rgb(88.235294%,60.392157%,0%);fill-opacity:1;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(88.235294%,60.392157%,0%);stroke-opacity:1;stroke-miterlimit:10;" d="M 47.124821 16.963793 L 47.124821 -0.000511318 L 0.00175463 -0.000511318 L 0.00175463 16.963793 Z M 47.124821 16.963793 " transform="matrix(0.708749,0,0,0.708749,81.412819,50.250362)"/>
-<g style="fill:rgb(100%,100%,100%);fill-opacity:1;">
-  <use xlink:href="#glyph0-17" x="85.757813" y="59.613281"/>
-  <use xlink:href="#glyph0-2" x="93.26474" y="59.613281"/>
-  <use xlink:href="#glyph0-6" x="99.045938" y="59.613281"/>
-  <use xlink:href="#glyph0-18" x="104.827135" y="59.613281"/>
-</g>
-<path style="fill:none;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(13.72549%,43.137255%,63.137255%);stroke-opacity:1;stroke-miterlimit:10;" d="M 0.00025835 0.00135096 L 6.421121 0.00135096 L 6.421121 11.630552 L 14.567073 11.630552 " transform="matrix(0.708749,0,0,0.708749,210.042786,55.901386)"/>
-<path style="fill-rule:evenodd;fill:rgb(13.72549%,43.137255%,63.137255%);fill-opacity:1;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(13.72549%,43.137255%,63.137255%);stroke-opacity:1;stroke-miterlimit:10;" d="M 14.567073 11.630552 L 9.369757 8.632313 C 10.461028 10.252685 10.461028 13.00842 9.369757 14.628792 Z M 14.567073 11.630552 " transform="matrix(0.708749,0,0,0.708749,210.042786,55.901386)"/>
-<path style="fill:none;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(13.72549%,43.137255%,63.137255%);stroke-opacity:1;stroke-miterlimit:10;" d="M 0.00025835 0.00135096 L 6.421121 0.00135096 L 6.421121 -9.009902 L 14.567073 -9.009902 " transform="matrix(0.708749,0,0,0.708749,210.042786,55.901386)"/>
-<path style="fill-rule:evenodd;fill:rgb(13.72549%,43.137255%,63.137255%);fill-opacity:1;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(13.72549%,43.137255%,63.137255%);stroke-opacity:1;stroke-miterlimit:10;" d="M 14.567073 -9.009902 L 9.369757 -12.008142 C 10.461028 -10.38777 10.461028 -7.626523 9.369757 -6.011663 Z M 14.567073 -9.009902 " transform="matrix(0.708749,0,0,0.708749,210.042786,55.901386)"/>
-<path style="fill-rule:evenodd;fill:rgb(88.235294%,60.392157%,0%);fill-opacity:1;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(88.235294%,60.392157%,0%);stroke-opacity:1;stroke-miterlimit:10;" d="M 61.235078 17.090131 L 61.235078 -0.000936519 L 0.00264809 -0.000936519 L 0.00264809 17.090131 Z M 61.235078 17.090131 " transform="matrix(0.708749,0,0,0.708749,125.138748,43.719414)"/>
-<g style="fill:rgb(100%,100%,100%);fill-opacity:1;">
-  <use xlink:href="#glyph0-14" x="130.410156" y="53.234375"/>
-  <use xlink:href="#glyph0-19" x="136.191354" y="53.234375"/>
-  <use xlink:href="#glyph0-20" x="139.079414" y="53.234375"/>
-  <use xlink:href="#glyph0-14" x="145.429088" y="53.234375"/>
-  <use xlink:href="#glyph0-21" x="151.210285" y="53.234375"/>
-  <use xlink:href="#glyph0-14" x="156.991483" y="53.234375"/>
-</g>
-<path style="fill-rule:evenodd;fill:rgb(88.235294%,60.392157%,0%);fill-opacity:1;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(88.235294%,60.392157%,0%);stroke-opacity:1;stroke-miterlimit:10;" d="M 61.235185 16.388 L 61.235185 0.0024005 L 0.00275573 0.0024005 L 0.00275573 16.388 Z M 61.235185 16.388 " transform="matrix(0.708749,0,0,0.708749,125.138672,58.595955)"/>
-<g style="fill:rgb(100%,100%,100%);fill-opacity:1;">
-  <use xlink:href="#glyph0-17" x="134.664063" y="68.117188"/>
-  <use xlink:href="#glyph0-2" x="142.17099" y="68.117188"/>
-  <use xlink:href="#glyph0-6" x="147.952188" y="68.117188"/>
-  <use xlink:href="#glyph0-18" x="153.733385" y="68.117188"/>
-</g>
-<path style="fill:none;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(13.72549%,43.137255%,63.137255%);stroke-opacity:1;stroke-miterlimit:10;" d="M -0.00187304 0.000505936 L 6.418989 0.000505936 L 6.418989 11.629707 L 14.570453 11.629707 " transform="matrix(0.708749,0,0,0.708749,114.813828,56.159798)"/>
-<path style="fill-rule:evenodd;fill:rgb(13.72549%,43.137255%,63.137255%);fill-opacity:1;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(13.72549%,43.137255%,63.137255%);stroke-opacity:1;stroke-miterlimit:10;" d="M 14.570453 11.629707 L 9.373137 8.631468 C 10.464408 10.25184 10.464408 13.007575 9.373137 14.627947 Z M 14.570453 11.629707 " transform="matrix(0.708749,0,0,0.708749,114.813828,56.159798)"/>
-<path style="fill:none;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(13.72549%,43.137255%,63.137255%);stroke-opacity:1;stroke-miterlimit:10;" d="M -0.00187304 0.000505936 L 6.418989 0.000505936 L 6.418989 -9.010747 L 14.570453 -9.010747 " transform="matrix(0.708749,0,0,0.708749,114.813828,56.159798)"/>
-<path style="fill-rule:evenodd;fill:rgb(13.72549%,43.137255%,63.137255%);fill-opacity:1;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(13.72549%,43.137255%,63.137255%);stroke-opacity:1;stroke-miterlimit:10;" d="M 14.570453 -9.010747 L 9.373137 -12.008987 C 10.464408 -10.388615 10.464408 -7.627368 9.373137 -6.006996 Z M 14.570453 -9.010747 " transform="matrix(0.708749,0,0,0.708749,114.813828,56.159798)"/>
-<path style="fill:none;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(13.72549%,43.137255%,63.137255%);stroke-opacity:1;stroke-miterlimit:10;" d="M -0.00210986 -0.00184613 L 3.547277 -0.00184613 L 3.547277 8.64565 L 11.428678 8.64565 " transform="matrix(0.708749,0,0,0.708749,168.540558,49.774746)"/>
-<path style="fill-rule:evenodd;fill:rgb(13.72549%,43.137255%,63.137255%);fill-opacity:1;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(13.72549%,43.137255%,63.137255%);stroke-opacity:1;stroke-miterlimit:10;" d="M 11.428678 8.64565 L 6.231362 5.641899 C 7.322633 7.262271 7.322633 10.023517 6.231362 11.64389 Z M 11.428678 8.64565 " transform="matrix(0.708749,0,0,0.708749,168.540558,49.774746)"/>
-<path style="fill:none;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(13.72549%,43.137255%,63.137255%);stroke-opacity:1;stroke-miterlimit:10;" d="M -0.00210986 -0.000495171 L 3.547277 -0.000495171 L 3.547277 -11.993453 L 11.428678 -11.993453 " transform="matrix(0.708749,0,0,0.708749,168.540558,64.402695)"/>
-<path style="fill-rule:evenodd;fill:rgb(13.72549%,43.137255%,63.137255%);fill-opacity:1;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(13.72549%,43.137255%,63.137255%);stroke-opacity:1;stroke-miterlimit:10;" d="M 11.428678 -11.993453 L 6.231362 -14.997204 C 7.322633 -13.376832 7.322633 -10.615586 6.231362 -8.995214 Z M 11.428678 -11.993453 " transform="matrix(0.708749,0,0,0.708749,168.540558,64.402695)"/>
-</g>
-</svg>
diff --git a/thirdParty/cupla/alpaka/doc/markdown/user/implementation/library/structure.png b/thirdParty/cupla/alpaka/doc/markdown/user/implementation/library/structure.png
deleted file mode 100644
index 3cf98ac8d7..0000000000
Binary files a/thirdParty/cupla/alpaka/doc/markdown/user/implementation/library/structure.png and /dev/null differ
diff --git a/thirdParty/cupla/alpaka/doc/markdown/user/implementation/library/structure.svg b/thirdParty/cupla/alpaka/doc/markdown/user/implementation/library/structure.svg
deleted file mode 100644
index 97ee517594..0000000000
--- a/thirdParty/cupla/alpaka/doc/markdown/user/implementation/library/structure.svg
+++ /dev/null
@@ -1,7501 +0,0 @@
-<?xml version="1.0" encoding="UTF-8" standalone="no"?>
-<svg
-   xmlns:dc="http://purl.org/dc/elements/1.1/"
-   xmlns:cc="http://creativecommons.org/ns#"
-   xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
-   xmlns:svg="http://www.w3.org/2000/svg"
-   xmlns="http://www.w3.org/2000/svg"
-   xmlns:xlink="http://www.w3.org/1999/xlink"
-   xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
-   xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
-   width="565.58252pt"
-   height="530.85376pt"
-   viewBox="0 0 565.58252 530.85376"
-   version="1.1"
-   id="svg11116"
-   sodipodi:docname="structure.svg"
-   inkscape:version="0.92.4 (5da689c313, 2019-01-14)"
-   inkscape:export-filename="C:\Users\janst\workspace\alpaka\doc\markdown\user\implementation\library\structure.png"
-   inkscape:export-xdpi="101.45599"
-   inkscape:export-ydpi="101.45599">
-  <metadata
-     id="metadata11120">
-    <rdf:RDF>
-      <cc:Work
-         rdf:about="">
-        <dc:format>image/svg+xml</dc:format>
-        <dc:type
-           rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
-      </cc:Work>
-    </rdf:RDF>
-  </metadata>
-  <sodipodi:namedview
-     pagecolor="#ffffff"
-     bordercolor="#666666"
-     borderopacity="1"
-     objecttolerance="10"
-     gridtolerance="10"
-     guidetolerance="10"
-     inkscape:pageopacity="0"
-     inkscape:pageshadow="2"
-     inkscape:window-width="1920"
-     inkscape:window-height="1017"
-     id="namedview11118"
-     showgrid="false"
-     inkscape:zoom="1.6935207"
-     inkscape:cx="431.29848"
-     inkscape:cy="535.92444"
-     inkscape:window-x="-8"
-     inkscape:window-y="-8"
-     inkscape:window-maximized="1"
-     inkscape:current-layer="surface1" />
-  <defs
-     id="defs9091">
-    <g
-       id="g9056">
-      <symbol
-         overflow="visible"
-         id="glyph0-0">
-        <path
-           style="stroke:none;"
-           d="M 0.683594 -6.765625 L 3.179688 -6.765625 C 3.738281 -6.761719 4.167969 -6.71875 4.464844 -6.636719 C 4.863281 -6.515625 5.203125 -6.308594 5.484375 -6.011719 C 5.765625 -5.714844 5.980469 -5.347656 6.132813 -4.917969 C 6.277344 -4.480469 6.351563 -3.949219 6.355469 -3.316406 C 6.351563 -2.761719 6.28125 -2.28125 6.144531 -1.878906 C 5.972656 -1.382813 5.730469 -0.984375 5.421875 -0.683594 C 5.179688 -0.449219 4.859375 -0.269531 4.460938 -0.144531 C 4.15625 -0.046875 3.753906 0 3.253906 0 L 0.683594 0 Z M 2.046875 -5.621094 L 2.046875 -1.140625 L 3.070313 -1.140625 C 3.449219 -1.136719 3.722656 -1.15625 3.894531 -1.203125 C 4.113281 -1.257813 4.296875 -1.351563 4.445313 -1.484375 C 4.589844 -1.613281 4.707031 -1.832031 4.800781 -2.136719 C 4.890625 -2.4375 4.9375 -2.851563 4.941406 -3.378906 C 4.9375 -3.898438 4.890625 -4.296875 4.800781 -4.578125 C 4.707031 -4.855469 4.578125 -5.074219 4.414063 -5.234375 C 4.246094 -5.390625 4.039063 -5.5 3.785156 -5.554688 C 3.589844 -5.597656 3.214844 -5.617188 2.664063 -5.621094 Z M 2.046875 -5.621094 "
-           id="path7820" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph0-1">
-        <path
-           style="stroke:none;"
-           d="M 3.515625 -1.558594 L 4.808594 -1.34375 C 4.640625 -0.867188 4.378906 -0.507813 4.019531 -0.261719 C 3.660156 -0.015625 3.210938 0.105469 2.675781 0.109375 C 1.820313 0.105469 1.191406 -0.167969 0.785156 -0.722656 C 0.460938 -1.167969 0.296875 -1.734375 0.300781 -2.414063 C 0.296875 -3.226563 0.511719 -3.859375 0.9375 -4.320313 C 1.359375 -4.777344 1.894531 -5.007813 2.546875 -5.011719 C 3.273438 -5.007813 3.847656 -4.769531 4.273438 -4.289063 C 4.691406 -3.804688 4.894531 -3.066406 4.878906 -2.078125 L 1.628906 -2.078125 C 1.636719 -1.691406 1.738281 -1.390625 1.941406 -1.179688 C 2.136719 -0.960938 2.386719 -0.855469 2.691406 -0.859375 C 2.890625 -0.855469 3.0625 -0.910156 3.203125 -1.023438 C 3.339844 -1.132813 3.445313 -1.3125 3.515625 -1.558594 Z M 3.589844 -2.871094 C 3.578125 -3.246094 3.480469 -3.53125 3.296875 -3.726563 C 3.113281 -3.921875 2.890625 -4.019531 2.625 -4.019531 C 2.339844 -4.019531 2.105469 -3.914063 1.925781 -3.710938 C 1.738281 -3.5 1.648438 -3.222656 1.652344 -2.871094 Z M 3.589844 -2.871094 "
-           id="path7823" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph0-2">
-        <path
-           style="stroke:none;"
-           d="M 2.027344 0 L 0.0507813 -4.898438 L 1.410156 -4.898438 L 2.335938 -2.398438 L 2.601563 -1.5625 C 2.671875 -1.773438 2.714844 -1.914063 2.734375 -1.984375 C 2.777344 -2.121094 2.824219 -2.257813 2.875 -2.398438 L 3.808594 -4.898438 L 5.140625 -4.898438 L 3.191406 0 Z M 2.027344 0 "
-           id="path7826" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph0-3">
-        <path
-           style="stroke:none;"
-           d="M 0.679688 -5.566406 L 0.679688 -6.765625 L 1.976563 -6.765625 L 1.976563 -5.566406 Z M 0.679688 0 L 0.679688 -4.898438 L 1.976563 -4.898438 L 1.976563 0 Z M 0.679688 0 "
-           id="path7829" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph0-4">
-        <path
-           style="stroke:none;"
-           d="M 4.949219 -3.453125 L 3.671875 -3.222656 C 3.628906 -3.476563 3.53125 -3.667969 3.378906 -3.796875 C 3.226563 -3.925781 3.027344 -3.988281 2.785156 -3.992188 C 2.457031 -3.988281 2.199219 -3.878906 2.011719 -3.65625 C 1.816406 -3.433594 1.722656 -3.058594 1.726563 -2.539063 C 1.722656 -1.953125 1.820313 -1.542969 2.019531 -1.304688 C 2.210938 -1.0625 2.472656 -0.941406 2.804688 -0.945313 C 3.050781 -0.941406 3.253906 -1.011719 3.410156 -1.15625 C 3.566406 -1.292969 3.675781 -1.535156 3.742188 -1.878906 L 5.015625 -1.660156 C 4.882813 -1.074219 4.628906 -0.632813 4.253906 -0.335938 C 3.878906 -0.0390625 3.375 0.105469 2.746094 0.109375 C 2.027344 0.105469 1.457031 -0.117188 1.03125 -0.566406 C 0.601563 -1.015625 0.386719 -1.640625 0.390625 -2.445313 C 0.386719 -3.25 0.601563 -3.882813 1.03125 -4.335938 C 1.460938 -4.785156 2.039063 -5.007813 2.769531 -5.011719 C 3.363281 -5.007813 3.835938 -4.878906 4.191406 -4.625 C 4.539063 -4.367188 4.792969 -3.976563 4.949219 -3.453125 Z M 4.949219 -3.453125 "
-           id="path7832" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph0-5">
-        <path
-           style="stroke:none;"
-           d="M 0.339844 -2.199219 L 1.671875 -2.332031 C 1.746094 -1.882813 1.910156 -1.554688 2.15625 -1.347656 C 2.398438 -1.136719 2.726563 -1.03125 3.148438 -1.035156 C 3.585938 -1.03125 3.917969 -1.125 4.140625 -1.3125 C 4.363281 -1.5 4.476563 -1.714844 4.476563 -1.964844 C 4.476563 -2.121094 4.429688 -2.257813 4.335938 -2.375 C 4.242188 -2.484375 4.078125 -2.582031 3.84375 -2.667969 C 3.683594 -2.71875 3.316406 -2.816406 2.75 -2.960938 C 2.015625 -3.140625 1.503906 -3.367188 1.207031 -3.632813 C 0.792969 -4.003906 0.585938 -4.457031 0.585938 -4.992188 C 0.585938 -5.335938 0.683594 -5.65625 0.878906 -5.957031 C 1.074219 -6.257813 1.355469 -6.488281 1.722656 -6.644531 C 2.089844 -6.800781 2.53125 -6.878906 3.054688 -6.878906 C 3.898438 -6.878906 4.539063 -6.691406 4.972656 -6.320313 C 5.398438 -5.945313 5.625 -5.449219 5.648438 -4.832031 L 4.28125 -4.769531 C 4.222656 -5.117188 4.097656 -5.367188 3.90625 -5.519531 C 3.710938 -5.671875 3.421875 -5.746094 3.039063 -5.75 C 2.640625 -5.746094 2.332031 -5.664063 2.109375 -5.503906 C 1.960938 -5.394531 1.886719 -5.257813 1.890625 -5.085938 C 1.886719 -4.921875 1.953125 -4.785156 2.09375 -4.675781 C 2.261719 -4.527344 2.679688 -4.375 3.347656 -4.222656 C 4.011719 -4.0625 4.503906 -3.898438 4.824219 -3.734375 C 5.136719 -3.5625 5.386719 -3.335938 5.570313 -3.046875 C 5.746094 -2.757813 5.835938 -2.398438 5.839844 -1.96875 C 5.835938 -1.582031 5.726563 -1.21875 5.515625 -0.882813 C 5.296875 -0.542969 4.996094 -0.289063 4.605469 -0.125 C 4.210938 0.0390625 3.71875 0.117188 3.132813 0.121094 C 2.277344 0.117188 1.621094 -0.078125 1.164063 -0.472656 C 0.703125 -0.867188 0.429688 -1.441406 0.339844 -2.199219 Z M 0.339844 -2.199219 "
-           id="path7835" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph0-6">
-        <path
-           style="stroke:none;"
-           d="M 2.925781 -4.898438 L 2.925781 -3.867188 L 2.039063 -3.867188 L 2.039063 -1.890625 C 2.039063 -1.488281 2.046875 -1.253906 2.0625 -1.191406 C 2.078125 -1.121094 2.117188 -1.066406 2.179688 -1.027344 C 2.238281 -0.980469 2.308594 -0.960938 2.398438 -0.964844 C 2.515625 -0.960938 2.691406 -1.003906 2.921875 -1.089844 L 3.03125 -0.0820313 C 2.726563 0.0429688 2.386719 0.105469 2.007813 0.109375 C 1.769531 0.105469 1.558594 0.0703125 1.375 -0.0078125 C 1.183594 -0.0820313 1.046875 -0.183594 0.960938 -0.3125 C 0.875 -0.433594 0.8125 -0.601563 0.78125 -0.816406 C 0.75 -0.964844 0.738281 -1.269531 0.738281 -1.730469 L 0.738281 -3.867188 L 0.144531 -3.867188 L 0.144531 -4.898438 L 0.738281 -4.898438 L 0.738281 -5.875 L 2.039063 -6.628906 L 2.039063 -4.898438 Z M 2.925781 -4.898438 "
-           id="path7838" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph0-7">
-        <path
-           style="stroke:none;"
-           d="M 1.917969 0 L 0.621094 0 L 0.621094 -4.898438 L 1.828125 -4.898438 L 1.828125 -4.203125 C 2.03125 -4.53125 2.214844 -4.75 2.382813 -4.855469 C 2.542969 -4.957031 2.730469 -5.007813 2.945313 -5.011719 C 3.238281 -5.007813 3.523438 -4.925781 3.796875 -4.765625 L 3.394531 -3.636719 C 3.175781 -3.773438 2.972656 -3.84375 2.785156 -3.847656 C 2.601563 -3.84375 2.449219 -3.792969 2.324219 -3.695313 C 2.195313 -3.59375 2.097656 -3.414063 2.027344 -3.15625 C 1.953125 -2.894531 1.914063 -2.347656 1.917969 -1.511719 Z M 1.917969 0 "
-           id="path7841" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph0-8">
-        <path
-           style="stroke:none;"
-           d="M 1.648438 -3.40625 L 0.46875 -3.617188 C 0.601563 -4.085938 0.828125 -4.4375 1.152344 -4.667969 C 1.472656 -4.894531 1.953125 -5.007813 2.59375 -5.011719 C 3.167969 -5.007813 3.597656 -4.941406 3.882813 -4.804688 C 4.164063 -4.667969 4.363281 -4.492188 4.480469 -4.285156 C 4.59375 -4.070313 4.652344 -3.683594 4.65625 -3.125 L 4.640625 -1.609375 C 4.640625 -1.175781 4.660156 -0.859375 4.703125 -0.65625 C 4.742188 -0.453125 4.820313 -0.234375 4.9375 0 L 3.65625 0 C 3.621094 -0.0859375 3.578125 -0.210938 3.53125 -0.382813 C 3.503906 -0.457031 3.488281 -0.507813 3.484375 -0.535156 C 3.261719 -0.316406 3.023438 -0.15625 2.773438 -0.0507813 C 2.515625 0.0546875 2.246094 0.105469 1.964844 0.109375 C 1.457031 0.105469 1.058594 -0.0273438 0.769531 -0.300781 C 0.476563 -0.570313 0.332031 -0.917969 0.335938 -1.339844 C 0.332031 -1.613281 0.398438 -1.859375 0.535156 -2.078125 C 0.664063 -2.292969 0.851563 -2.457031 1.089844 -2.574219 C 1.328125 -2.6875 1.671875 -2.789063 2.121094 -2.878906 C 2.726563 -2.988281 3.144531 -3.097656 3.382813 -3.199219 L 3.382813 -3.328125 C 3.378906 -3.574219 3.316406 -3.75 3.195313 -3.859375 C 3.070313 -3.964844 2.839844 -4.019531 2.5 -4.019531 C 2.269531 -4.019531 2.089844 -3.972656 1.960938 -3.882813 C 1.828125 -3.789063 1.722656 -3.628906 1.648438 -3.40625 Z M 3.382813 -2.351563 C 3.210938 -2.292969 2.949219 -2.226563 2.59375 -2.152344 C 2.230469 -2.074219 1.996094 -2 1.886719 -1.929688 C 1.714844 -1.804688 1.628906 -1.652344 1.632813 -1.472656 C 1.628906 -1.285156 1.695313 -1.128906 1.835938 -1 C 1.96875 -0.867188 2.140625 -0.800781 2.351563 -0.804688 C 2.582031 -0.800781 2.808594 -0.878906 3.023438 -1.035156 C 3.179688 -1.148438 3.28125 -1.289063 3.332031 -1.460938 C 3.363281 -1.570313 3.378906 -1.78125 3.382813 -2.09375 Z M 3.382813 -2.351563 "
-           id="path7844" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph0-9">
-        <path
-           style="stroke:none;"
-           d="M 0.582031 -4.898438 L 1.777344 -4.898438 L 1.777344 -4.230469 C 2.199219 -4.75 2.710938 -5.007813 3.304688 -5.011719 C 3.617188 -5.007813 3.886719 -4.945313 4.117188 -4.816406 C 4.347656 -4.6875 4.539063 -4.492188 4.6875 -4.230469 C 4.902344 -4.492188 5.132813 -4.6875 5.382813 -4.816406 C 5.628906 -4.945313 5.894531 -5.007813 6.183594 -5.011719 C 6.539063 -5.007813 6.84375 -4.9375 7.09375 -4.792969 C 7.339844 -4.644531 7.527344 -4.429688 7.65625 -4.148438 C 7.742188 -3.9375 7.785156 -3.597656 7.789063 -3.132813 L 7.789063 0 L 6.492188 0 L 6.492188 -2.800781 C 6.492188 -3.285156 6.445313 -3.597656 6.359375 -3.742188 C 6.234375 -3.925781 6.050781 -4.019531 5.804688 -4.019531 C 5.621094 -4.019531 5.449219 -3.960938 5.289063 -3.851563 C 5.128906 -3.738281 5.015625 -3.578125 4.945313 -3.367188 C 4.875 -3.152344 4.839844 -2.8125 4.839844 -2.351563 L 4.839844 0 L 3.542969 0 L 3.542969 -2.683594 C 3.539063 -3.160156 3.515625 -3.46875 3.472656 -3.609375 C 3.425781 -3.746094 3.355469 -3.847656 3.257813 -3.917969 C 3.160156 -3.984375 3.027344 -4.019531 2.867188 -4.019531 C 2.664063 -4.019531 2.484375 -3.964844 2.324219 -3.855469 C 2.164063 -3.746094 2.050781 -3.589844 1.980469 -3.390625 C 1.910156 -3.183594 1.875 -2.847656 1.878906 -2.382813 L 1.878906 0 L 0.582031 0 Z M 0.582031 -4.898438 "
-           id="path7847" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph0-10">
-        <path
-           style="stroke:none;"
-           d="M 0.667969 0 L 0.667969 -6.765625 L 2.714844 -6.765625 L 3.941406 -2.148438 L 5.152344 -6.765625 L 7.203125 -6.765625 L 7.203125 0 L 5.933594 0 L 5.933594 -5.324219 L 4.589844 0 L 3.277344 0 L 1.9375 -5.324219 L 1.9375 0 Z M 0.667969 0 "
-           id="path7850" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph0-11">
-        <path
-           style="stroke:none;"
-           d="M 5.136719 0 L 3.839844 0 L 3.839844 -2.5 C 3.835938 -3.027344 3.808594 -3.371094 3.753906 -3.527344 C 3.695313 -3.683594 3.605469 -3.804688 3.484375 -3.890625 C 3.359375 -3.976563 3.210938 -4.019531 3.035156 -4.019531 C 2.808594 -4.019531 2.605469 -3.957031 2.429688 -3.832031 C 2.25 -3.707031 2.128906 -3.542969 2.0625 -3.34375 C 1.996094 -3.136719 1.960938 -2.761719 1.964844 -2.21875 L 1.964844 0 L 0.667969 0 L 0.667969 -4.898438 L 1.875 -4.898438 L 1.875 -4.179688 C 2.300781 -4.730469 2.835938 -5.007813 3.488281 -5.011719 C 3.769531 -5.007813 4.03125 -4.957031 4.273438 -4.855469 C 4.507813 -4.75 4.6875 -4.617188 4.808594 -4.460938 C 4.929688 -4.296875 5.015625 -4.117188 5.0625 -3.917969 C 5.109375 -3.710938 5.132813 -3.421875 5.136719 -3.046875 Z M 5.136719 0 "
-           id="path7853" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph0-12">
-        <path
-           style="stroke:none;"
-           d="M 0.558594 0.324219 L 2.039063 0.503906 C 2.0625 0.675781 2.117188 0.792969 2.210938 0.859375 C 2.328125 0.949219 2.523438 0.996094 2.792969 0.996094 C 3.128906 0.996094 3.382813 0.945313 3.554688 0.84375 C 3.664063 0.773438 3.75 0.664063 3.8125 0.515625 C 3.847656 0.40625 3.867188 0.210938 3.871094 -0.078125 L 3.871094 -0.792969 C 3.480469 -0.261719 2.992188 0 2.402344 0 C 1.746094 0 1.222656 -0.277344 0.839844 -0.835938 C 0.535156 -1.273438 0.386719 -1.820313 0.386719 -2.476563 C 0.386719 -3.296875 0.582031 -3.925781 0.980469 -4.359375 C 1.371094 -4.792969 1.863281 -5.007813 2.453125 -5.011719 C 3.058594 -5.007813 3.558594 -4.742188 3.953125 -4.210938 L 3.953125 -4.898438 L 5.167969 -4.898438 L 5.167969 -0.503906 C 5.167969 0.0742188 5.117188 0.503906 5.023438 0.792969 C 4.925781 1.074219 4.792969 1.300781 4.621094 1.464844 C 4.445313 1.628906 4.214844 1.753906 3.933594 1.847656 C 3.644531 1.9375 3.28125 1.984375 2.84375 1.988281 C 2.007813 1.984375 1.417969 1.84375 1.074219 1.5625 C 0.722656 1.277344 0.550781 0.917969 0.554688 0.480469 C 0.550781 0.433594 0.550781 0.378906 0.558594 0.324219 Z M 1.714844 -2.550781 C 1.710938 -2.027344 1.8125 -1.648438 2.015625 -1.410156 C 2.21875 -1.164063 2.46875 -1.042969 2.765625 -1.046875 C 3.078125 -1.042969 3.34375 -1.167969 3.566406 -1.417969 C 3.78125 -1.664063 3.890625 -2.03125 3.894531 -2.519531 C 3.890625 -3.027344 3.785156 -3.402344 3.578125 -3.648438 C 3.367188 -3.894531 3.101563 -4.019531 2.785156 -4.019531 C 2.472656 -4.019531 2.21875 -3.898438 2.015625 -3.65625 C 1.8125 -3.414063 1.710938 -3.042969 1.714844 -2.550781 Z M 1.714844 -2.550781 "
-           id="path7856" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph0-13">
-        <path
-           style="stroke:none;"
-           d="M 0.707031 0 L 0.707031 -6.765625 L 2.070313 -6.765625 L 2.070313 -3.761719 L 4.832031 -6.765625 L 6.667969 -6.765625 L 4.121094 -4.128906 L 6.804688 0 L 5.039063 0 L 3.179688 -3.175781 L 2.070313 -2.042969 L 2.070313 0 Z M 0.707031 0 "
-           id="path7859" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph0-14">
-        <path
-           style="stroke:none;"
-           d="M 0.679688 0 L 0.679688 -6.765625 L 1.976563 -6.765625 L 1.976563 0 Z M 0.679688 0 "
-           id="path7862" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph0-15">
-        <path
-           style="stroke:none;"
-           d="M 0.644531 0 L 0.644531 -6.765625 L 2.011719 -6.765625 L 2.011719 0 Z M 0.644531 0 "
-           id="path7865" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph0-16">
-        <path
-           style="stroke:none;"
-           d="M 5.171875 0 L 3.96875 0 L 3.96875 -0.71875 C 3.765625 -0.4375 3.53125 -0.230469 3.257813 -0.09375 C 2.984375 0.0390625 2.710938 0.105469 2.4375 0.109375 C 1.875 0.105469 1.390625 -0.117188 0.988281 -0.570313 C 0.585938 -1.019531 0.386719 -1.652344 0.386719 -2.46875 C 0.386719 -3.296875 0.582031 -3.929688 0.972656 -4.363281 C 1.363281 -4.792969 1.855469 -5.007813 2.453125 -5.011719 C 3 -5.007813 3.472656 -4.78125 3.875 -4.328125 L 3.875 -6.765625 L 5.171875 -6.765625 Z M 1.710938 -2.554688 C 1.710938 -2.03125 1.78125 -1.652344 1.929688 -1.421875 C 2.132813 -1.082031 2.425781 -0.914063 2.804688 -0.914063 C 3.101563 -0.914063 3.355469 -1.039063 3.566406 -1.292969 C 3.773438 -1.546875 3.878906 -1.925781 3.878906 -2.433594 C 3.878906 -2.992188 3.777344 -3.398438 3.574219 -3.648438 C 3.371094 -3.894531 3.109375 -4.019531 2.796875 -4.019531 C 2.484375 -4.019531 2.226563 -3.894531 2.023438 -3.652344 C 1.8125 -3.402344 1.710938 -3.039063 1.710938 -2.554688 Z M 1.710938 -2.554688 "
-           id="path7868" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph0-17">
-        <path
-           style="stroke:none;"
-           d="M 0.0546875 0 L 1.824219 -2.523438 L 0.128906 -4.898438 L 1.710938 -4.898438 L 2.578125 -3.554688 L 3.492188 -4.898438 L 5.015625 -4.898438 L 3.355469 -2.578125 L 5.167969 0 L 3.574219 0 L 2.578125 -1.519531 L 1.574219 0 Z M 0.0546875 0 "
-           id="path7871" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph0-18">
-        <path
-           style="stroke:none;"
-           d="M 0.691406 0 L 0.691406 -6.765625 L 3.566406 -6.765625 C 4.289063 -6.761719 4.8125 -6.699219 5.140625 -6.582031 C 5.464844 -6.457031 5.730469 -6.242188 5.929688 -5.933594 C 6.125 -5.621094 6.222656 -5.265625 6.226563 -4.867188 C 6.222656 -4.355469 6.074219 -3.9375 5.777344 -3.609375 C 5.476563 -3.277344 5.027344 -3.066406 4.4375 -2.984375 C 4.730469 -2.808594 4.976563 -2.621094 5.167969 -2.417969 C 5.359375 -2.210938 5.617188 -1.84375 5.949219 -1.320313 L 6.773438 0 L 5.140625 0 L 4.152344 -1.472656 C 3.796875 -1.996094 3.554688 -2.328125 3.429688 -2.464844 C 3.296875 -2.601563 3.160156 -2.695313 3.019531 -2.746094 C 2.871094 -2.796875 2.644531 -2.824219 2.335938 -2.824219 L 2.058594 -2.824219 L 2.058594 0 Z M 2.058594 -3.902344 L 3.070313 -3.902344 C 3.722656 -3.902344 4.128906 -3.929688 4.292969 -3.984375 C 4.453125 -4.039063 4.582031 -4.132813 4.675781 -4.273438 C 4.769531 -4.40625 4.816406 -4.578125 4.816406 -4.78125 C 4.816406 -5.007813 4.753906 -5.191406 4.632813 -5.332031 C 4.511719 -5.472656 4.339844 -5.558594 4.121094 -5.597656 C 4.003906 -5.609375 3.671875 -5.617188 3.125 -5.621094 L 2.058594 -5.621094 Z M 2.058594 -3.902344 "
-           id="path7874" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph0-19">
-        <path
-           style="stroke:none;"
-           d="M 1.96875 -6.765625 L 1.96875 -4.277344 C 2.386719 -4.765625 2.886719 -5.007813 3.46875 -5.011719 C 3.765625 -5.007813 4.035156 -4.953125 4.277344 -4.84375 C 4.515625 -4.730469 4.695313 -4.589844 4.816406 -4.417969 C 4.9375 -4.246094 5.019531 -4.054688 5.066406 -3.847656 C 5.109375 -3.636719 5.132813 -3.3125 5.136719 -2.875 L 5.136719 0 L 3.839844 0 L 3.839844 -2.589844 C 3.835938 -3.101563 3.8125 -3.425781 3.765625 -3.566406 C 3.714844 -3.703125 3.628906 -3.8125 3.503906 -3.894531 C 3.378906 -3.976563 3.222656 -4.019531 3.035156 -4.019531 C 2.820313 -4.019531 2.628906 -3.964844 2.460938 -3.863281 C 2.289063 -3.753906 2.164063 -3.597656 2.085938 -3.390625 C 2.007813 -3.175781 1.96875 -2.863281 1.96875 -2.453125 L 1.96875 0 L 0.671875 0 L 0.671875 -6.765625 Z M 1.96875 -6.765625 "
-           id="path7877" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph0-20">
-        <path
-           style="stroke:none;"
-           d="M 0.679688 0 L 0.679688 -1.296875 L 1.976563 -1.296875 L 1.976563 0 Z M 0.679688 0 "
-           id="path7880" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph0-21">
-        <path
-           style="stroke:none;"
-           d="M 0.6875 0 L 0.6875 -6.765625 L 5.703125 -6.765625 L 5.703125 -5.621094 L 2.054688 -5.621094 L 2.054688 -4.121094 L 5.449219 -4.121094 L 5.449219 -2.980469 L 2.054688 -2.980469 L 2.054688 -1.140625 L 5.832031 -1.140625 L 5.832031 0 Z M 0.6875 0 "
-           id="path7883" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph0-22">
-        <path
-           style="stroke:none;"
-           d=""
-           id="path7886" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph0-23">
-        <path
-           style="stroke:none;"
-           d="M 2.210938 0 L 2.210938 -5.621094 L 0.203125 -5.621094 L 0.203125 -6.765625 L 5.578125 -6.765625 L 5.578125 -5.621094 L 3.574219 -5.621094 L 3.574219 0 Z M 2.210938 0 "
-           id="path7889" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph0-24">
-        <path
-           style="stroke:none;"
-           d="M 0.222656 -1.398438 L 1.523438 -1.597656 C 1.578125 -1.34375 1.6875 -1.152344 1.859375 -1.019531 C 2.023438 -0.886719 2.261719 -0.820313 2.570313 -0.824219 C 2.902344 -0.820313 3.15625 -0.882813 3.328125 -1.011719 C 3.4375 -1.09375 3.492188 -1.207031 3.496094 -1.355469 C 3.492188 -1.449219 3.460938 -1.53125 3.40625 -1.601563 C 3.335938 -1.660156 3.191406 -1.71875 2.972656 -1.773438 C 1.921875 -2 1.261719 -2.210938 0.984375 -2.402344 C 0.59375 -2.667969 0.398438 -3.035156 0.402344 -3.507813 C 0.398438 -3.929688 0.566406 -4.285156 0.902344 -4.578125 C 1.238281 -4.863281 1.757813 -5.007813 2.464844 -5.011719 C 3.132813 -5.007813 3.628906 -4.898438 3.957031 -4.683594 C 4.28125 -4.460938 4.507813 -4.140625 4.632813 -3.714844 L 3.410156 -3.488281 C 3.355469 -3.675781 3.253906 -3.824219 3.113281 -3.925781 C 2.964844 -4.027344 2.757813 -4.078125 2.488281 -4.078125 C 2.144531 -4.078125 1.898438 -4.03125 1.753906 -3.9375 C 1.652344 -3.867188 1.605469 -3.777344 1.605469 -3.671875 C 1.605469 -3.578125 1.648438 -3.5 1.734375 -3.4375 C 1.851563 -3.347656 2.253906 -3.226563 2.945313 -3.070313 C 3.632813 -2.914063 4.117188 -2.722656 4.394531 -2.496094 C 4.664063 -2.261719 4.796875 -1.9375 4.800781 -1.53125 C 4.796875 -1.078125 4.609375 -0.695313 4.234375 -0.375 C 3.855469 -0.0546875 3.300781 0.105469 2.570313 0.109375 C 1.902344 0.105469 1.375 -0.0273438 0.984375 -0.296875 C 0.59375 -0.566406 0.339844 -0.933594 0.222656 -1.398438 Z M 0.222656 -1.398438 "
-           id="path7892" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph0-25">
-        <path
-           style="stroke:none;"
-           d="M 0.410156 -3.339844 C 0.410156 -4.027344 0.511719 -4.605469 0.71875 -5.074219 C 0.871094 -5.417969 1.082031 -5.726563 1.347656 -6.003906 C 1.613281 -6.273438 1.902344 -6.476563 2.222656 -6.613281 C 2.640625 -6.789063 3.128906 -6.878906 3.683594 -6.878906 C 4.675781 -6.878906 5.472656 -6.566406 6.074219 -5.949219 C 6.671875 -5.328125 6.972656 -4.46875 6.972656 -3.371094 C 6.972656 -2.277344 6.675781 -1.425781 6.082031 -0.808594 C 5.488281 -0.191406 4.691406 0.117188 3.699219 0.117188 C 2.691406 0.117188 1.894531 -0.1875 1.300781 -0.804688 C 0.707031 -1.414063 0.410156 -2.261719 0.410156 -3.339844 Z M 1.816406 -3.386719 C 1.816406 -2.621094 1.992188 -2.039063 2.347656 -1.644531 C 2.699219 -1.246094 3.148438 -1.046875 3.695313 -1.050781 C 4.238281 -1.046875 4.683594 -1.246094 5.035156 -1.640625 C 5.378906 -2.03125 5.554688 -2.617188 5.558594 -3.40625 C 5.554688 -4.175781 5.382813 -4.753906 5.046875 -5.136719 C 4.703125 -5.515625 4.253906 -5.707031 3.695313 -5.710938 C 3.128906 -5.707031 2.675781 -5.515625 2.332031 -5.132813 C 1.988281 -4.746094 1.816406 -4.164063 1.816406 -3.386719 Z M 1.816406 -3.386719 "
-           id="path7895" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph0-26">
-        <path
-           style="stroke:none;"
-           d="M 0.640625 -4.898438 L 1.851563 -4.898438 L 1.851563 -4.179688 C 2.007813 -4.421875 2.21875 -4.625 2.484375 -4.78125 C 2.75 -4.933594 3.046875 -5.007813 3.378906 -5.011719 C 3.945313 -5.007813 4.425781 -4.785156 4.824219 -4.339844 C 5.21875 -3.890625 5.417969 -3.269531 5.421875 -2.476563 C 5.417969 -1.65625 5.21875 -1.019531 4.820313 -0.570313 C 4.417969 -0.117188 3.933594 0.105469 3.367188 0.109375 C 3.09375 0.105469 2.847656 0.0546875 2.628906 -0.0507813 C 2.40625 -0.15625 2.175781 -0.339844 1.9375 -0.605469 L 1.9375 1.863281 L 0.640625 1.863281 Z M 1.925781 -2.535156 C 1.921875 -1.980469 2.03125 -1.574219 2.25 -1.3125 C 2.46875 -1.046875 2.734375 -0.914063 3.050781 -0.917969 C 3.351563 -0.914063 3.601563 -1.035156 3.800781 -1.28125 C 4 -1.519531 4.101563 -1.914063 4.101563 -2.46875 C 4.101563 -2.976563 3.996094 -3.359375 3.792969 -3.613281 C 3.582031 -3.859375 3.328125 -3.984375 3.027344 -3.988281 C 2.707031 -3.984375 2.445313 -3.863281 2.238281 -3.621094 C 2.027344 -3.375 1.921875 -3.011719 1.925781 -2.535156 Z M 1.925781 -2.535156 "
-           id="path7898" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph0-27">
-        <path
-           style="stroke:none;"
-           d="M 2.945313 0 L 2.945313 -1.359375 L 0.175781 -1.359375 L 0.175781 -2.496094 L 3.109375 -6.792969 L 4.199219 -6.792969 L 4.199219 -2.5 L 5.039063 -2.5 L 5.039063 -1.359375 L 4.199219 -1.359375 L 4.199219 0 Z M 2.945313 -2.5 L 2.945313 -4.8125 L 1.390625 -2.5 Z M 2.945313 -2.5 "
-           id="path7901" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph0-28">
-        <path
-           style="stroke:none;"
-           d="M 0.695313 0 L 0.695313 -6.765625 L 5.335938 -6.765625 L 5.335938 -5.621094 L 2.0625 -5.621094 L 2.0625 -4.019531 L 4.886719 -4.019531 L 4.886719 -2.875 L 2.0625 -2.875 L 2.0625 0 Z M 0.695313 0 "
-           id="path7904" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph0-29">
-        <path
-           style="stroke:none;"
-           d="M 0.621094 0 L 0.621094 -6.765625 L 1.917969 -6.765625 L 1.917969 -4.328125 C 2.316406 -4.78125 2.792969 -5.007813 3.339844 -5.011719 C 3.933594 -5.007813 4.425781 -4.792969 4.820313 -4.363281 C 5.207031 -3.929688 5.402344 -3.308594 5.40625 -2.5 C 5.402344 -1.660156 5.203125 -1.015625 4.808594 -0.566406 C 4.40625 -0.117188 3.921875 0.105469 3.359375 0.109375 C 3.074219 0.105469 2.796875 0.0390625 2.527344 -0.0976563 C 2.253906 -0.234375 2.023438 -0.441406 1.828125 -0.71875 L 1.828125 0 Z M 1.910156 -2.554688 C 1.910156 -2.046875 1.988281 -1.671875 2.148438 -1.429688 C 2.371094 -1.085938 2.671875 -0.914063 3.046875 -0.914063 C 3.328125 -0.914063 3.570313 -1.035156 3.773438 -1.28125 C 3.976563 -1.523438 4.078125 -1.910156 4.078125 -2.4375 C 4.078125 -2.996094 3.976563 -3.398438 3.773438 -3.648438 C 3.570313 -3.894531 3.308594 -4.019531 2.996094 -4.019531 C 2.679688 -4.019531 2.421875 -3.898438 2.21875 -3.65625 C 2.011719 -3.414063 1.910156 -3.046875 1.910156 -2.554688 Z M 1.910156 -2.554688 "
-           id="path7907" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph0-30">
-        <path
-           style="stroke:none;"
-           d="M 5.015625 -2.488281 L 6.339844 -2.066406 C 6.136719 -1.328125 5.796875 -0.78125 5.324219 -0.421875 C 4.851563 -0.0625 4.253906 0.117188 3.53125 0.117188 C 2.628906 0.117188 1.890625 -0.1875 1.316406 -0.804688 C 0.734375 -1.414063 0.445313 -2.253906 0.449219 -3.324219 C 0.445313 -4.445313 0.734375 -5.320313 1.320313 -5.945313 C 1.898438 -6.566406 2.664063 -6.878906 3.613281 -6.878906 C 4.4375 -6.878906 5.109375 -6.632813 5.628906 -6.144531 C 5.933594 -5.851563 6.164063 -5.4375 6.320313 -4.898438 L 4.96875 -4.578125 C 4.886719 -4.921875 4.71875 -5.199219 4.46875 -5.40625 C 4.214844 -5.605469 3.90625 -5.707031 3.542969 -5.710938 C 3.039063 -5.707031 2.632813 -5.527344 2.324219 -5.171875 C 2.007813 -4.808594 1.851563 -4.226563 1.855469 -3.421875 C 1.851563 -2.566406 2.003906 -1.957031 2.316406 -1.59375 C 2.621094 -1.230469 3.023438 -1.046875 3.515625 -1.050781 C 3.878906 -1.046875 4.191406 -1.164063 4.453125 -1.398438 C 4.710938 -1.628906 4.898438 -1.992188 5.015625 -2.488281 Z M 5.015625 -2.488281 "
-           id="path7910" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph0-31">
-        <path
-           style="stroke:none;"
-           d="M 3.902344 0 L 3.902344 -0.734375 C 3.722656 -0.46875 3.488281 -0.261719 3.199219 -0.113281 C 2.910156 0.0351563 2.601563 0.105469 2.28125 0.109375 C 1.945313 0.105469 1.648438 0.0351563 1.390625 -0.105469 C 1.125 -0.25 0.9375 -0.453125 0.824219 -0.714844 C 0.707031 -0.976563 0.648438 -1.335938 0.652344 -1.800781 L 0.652344 -4.898438 L 1.945313 -4.898438 L 1.945313 -2.648438 C 1.945313 -1.957031 1.96875 -1.53125 2.015625 -1.378906 C 2.0625 -1.222656 2.148438 -1.101563 2.277344 -1.011719 C 2.402344 -0.917969 2.5625 -0.871094 2.757813 -0.875 C 2.976563 -0.871094 3.175781 -0.933594 3.351563 -1.058594 C 3.527344 -1.179688 3.648438 -1.332031 3.714844 -1.511719 C 3.777344 -1.691406 3.808594 -2.128906 3.8125 -2.832031 L 3.8125 -4.898438 L 5.109375 -4.898438 L 5.109375 0 Z M 3.902344 0 "
-           id="path7913" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph0-32">
-        <path
-           style="stroke:none;"
-           d="M 0.691406 -6.765625 L 3.394531 -6.765625 C 3.925781 -6.761719 4.328125 -6.738281 4.59375 -6.695313 C 4.855469 -6.648438 5.089844 -6.554688 5.296875 -6.417969 C 5.503906 -6.273438 5.675781 -6.085938 5.816406 -5.851563 C 5.953125 -5.613281 6.023438 -5.351563 6.027344 -5.0625 C 6.023438 -4.742188 5.9375 -4.449219 5.769531 -4.1875 C 5.597656 -3.921875 5.367188 -3.722656 5.074219 -3.59375 C 5.488281 -3.46875 5.804688 -3.265625 6.027344 -2.980469 C 6.246094 -2.6875 6.355469 -2.347656 6.359375 -1.960938 C 6.355469 -1.652344 6.285156 -1.351563 6.144531 -1.0625 C 6 -0.769531 5.804688 -0.539063 5.558594 -0.367188 C 5.308594 -0.191406 5.003906 -0.0859375 4.640625 -0.046875 C 4.414063 -0.0195313 3.863281 -0.00390625 2.996094 0 L 0.691406 0 Z M 2.058594 -5.636719 L 2.058594 -4.074219 L 2.953125 -4.074219 C 3.484375 -4.070313 3.8125 -4.078125 3.945313 -4.097656 C 4.175781 -4.121094 4.359375 -4.203125 4.496094 -4.339844 C 4.628906 -4.46875 4.699219 -4.644531 4.699219 -4.867188 C 4.699219 -5.074219 4.640625 -5.246094 4.523438 -5.378906 C 4.40625 -5.507813 4.234375 -5.585938 4.011719 -5.617188 C 3.875 -5.628906 3.484375 -5.636719 2.84375 -5.636719 Z M 2.058594 -2.949219 L 2.058594 -1.140625 L 3.324219 -1.140625 C 3.8125 -1.136719 4.125 -1.148438 4.257813 -1.179688 C 4.457031 -1.214844 4.621094 -1.304688 4.753906 -1.449219 C 4.878906 -1.589844 4.945313 -1.785156 4.945313 -2.027344 C 4.945313 -2.230469 4.894531 -2.402344 4.796875 -2.542969 C 4.699219 -2.683594 4.558594 -2.785156 4.371094 -2.851563 C 4.183594 -2.914063 3.777344 -2.945313 3.160156 -2.949219 Z M 2.058594 -2.949219 "
-           id="path7916" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph0-33">
-        <path
-           style="stroke:none;"
-           d="M 1.648438 0 L 0.03125 -6.765625 L 1.429688 -6.765625 L 2.449219 -2.117188 L 3.6875 -6.765625 L 5.3125 -6.765625 L 6.496094 -2.039063 L 7.535156 -6.765625 L 8.910156 -6.765625 L 7.265625 0 L 5.820313 0 L 4.472656 -5.058594 L 3.128906 0 Z M 1.648438 0 "
-           id="path7919" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph0-34">
-        <path
-           style="stroke:none;"
-           d="M 0.15625 0 L 0.15625 -1.011719 L 1.992188 -3.121094 C 2.292969 -3.460938 2.515625 -3.703125 2.664063 -3.851563 C 2.511719 -3.839844 2.3125 -3.835938 2.066406 -3.835938 L 0.335938 -3.824219 L 0.335938 -4.898438 L 4.386719 -4.898438 L 4.386719 -3.980469 L 2.515625 -1.824219 L 1.855469 -1.109375 C 2.210938 -1.125 2.433594 -1.136719 2.523438 -1.140625 L 4.53125 -1.140625 L 4.53125 0 Z M 0.15625 0 "
-           id="path7922" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph1-0">
-        <path
-           style="stroke:none;"
-           d="M 0.679688 0 L 0.679688 -6.675781 L 5.625 -6.675781 L 5.625 -5.542969 L 2.027344 -5.542969 L 2.027344 -4.066406 L 5.375 -4.066406 L 5.375 -2.941406 L 2.027344 -2.941406 L 2.027344 -1.125 L 5.753906 -1.125 L 5.753906 0 Z M 0.679688 0 "
-           id="path7925" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph1-1">
-        <path
-           style="stroke:none;"
-           d="M 2 0 L 0.0507813 -4.835938 L 1.394531 -4.835938 L 2.304688 -2.367188 L 2.566406 -1.542969 C 2.636719 -1.746094 2.679688 -1.886719 2.699219 -1.957031 C 2.738281 -2.09375 2.785156 -2.230469 2.835938 -2.367188 L 3.753906 -4.835938 L 5.070313 -4.835938 L 3.148438 0 Z M 2 0 "
-           id="path7928" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph1-2">
-        <path
-           style="stroke:none;"
-           d="M 3.46875 -1.539063 L 4.742188 -1.324219 C 4.574219 -0.855469 4.316406 -0.5 3.964844 -0.257813 C 3.609375 -0.015625 3.167969 0.105469 2.640625 0.109375 C 1.796875 0.105469 1.175781 -0.167969 0.773438 -0.714844 C 0.453125 -1.152344 0.292969 -1.707031 0.296875 -2.382813 C 0.292969 -3.179688 0.5 -3.804688 0.921875 -4.261719 C 1.335938 -4.714844 1.867188 -4.945313 2.511719 -4.945313 C 3.230469 -4.945313 3.796875 -4.707031 4.214844 -4.230469 C 4.628906 -3.753906 4.828125 -3.023438 4.8125 -2.046875 L 1.605469 -2.046875 C 1.613281 -1.664063 1.714844 -1.371094 1.914063 -1.164063 C 2.109375 -0.949219 2.355469 -0.84375 2.652344 -0.847656 C 2.851563 -0.84375 3.019531 -0.898438 3.15625 -1.011719 C 3.292969 -1.117188 3.398438 -1.292969 3.46875 -1.539063 Z M 3.542969 -2.832031 C 3.53125 -3.199219 3.433594 -3.480469 3.253906 -3.675781 C 3.066406 -3.863281 2.847656 -3.960938 2.589844 -3.964844 C 2.308594 -3.960938 2.078125 -3.859375 1.898438 -3.660156 C 1.714844 -3.453125 1.625 -3.175781 1.628906 -2.832031 Z M 3.542969 -2.832031 "
-           id="path7931" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph1-3">
-        <path
-           style="stroke:none;"
-           d="M 5.066406 0 L 3.789063 0 L 3.789063 -2.46875 C 3.785156 -2.984375 3.757813 -3.324219 3.707031 -3.480469 C 3.648438 -3.632813 3.558594 -3.75 3.4375 -3.835938 C 3.3125 -3.917969 3.164063 -3.960938 2.996094 -3.964844 C 2.769531 -3.960938 2.570313 -3.898438 2.398438 -3.78125 C 2.21875 -3.65625 2.097656 -3.496094 2.035156 -3.300781 C 1.964844 -3.097656 1.933594 -2.730469 1.9375 -2.191406 L 1.9375 0 L 0.660156 0 L 0.660156 -4.835938 L 1.847656 -4.835938 L 1.847656 -4.125 C 2.265625 -4.671875 2.796875 -4.945313 3.441406 -4.945313 C 3.71875 -4.945313 3.976563 -4.894531 4.214844 -4.792969 C 4.445313 -4.691406 4.625 -4.558594 4.746094 -4.402344 C 4.863281 -4.238281 4.945313 -4.058594 4.996094 -3.863281 C 5.039063 -3.660156 5.0625 -3.375 5.066406 -3.003906 Z M 5.066406 0 "
-           id="path7934" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph1-4">
-        <path
-           style="stroke:none;"
-           d="M 2.886719 -4.835938 L 2.886719 -3.816406 L 2.011719 -3.816406 L 2.011719 -1.867188 C 2.007813 -1.46875 2.015625 -1.238281 2.035156 -1.175781 C 2.050781 -1.109375 2.089844 -1.054688 2.148438 -1.015625 C 2.207031 -0.96875 2.28125 -0.949219 2.367188 -0.953125 C 2.484375 -0.949219 2.65625 -0.988281 2.882813 -1.074219 L 2.992188 -0.0820313 C 2.691406 0.0429688 2.355469 0.105469 1.980469 0.109375 C 1.746094 0.105469 1.535156 0.0703125 1.355469 -0.00390625 C 1.167969 -0.078125 1.035156 -0.179688 0.949219 -0.304688 C 0.863281 -0.429688 0.800781 -0.59375 0.769531 -0.804688 C 0.738281 -0.949219 0.726563 -1.25 0.726563 -1.707031 L 0.726563 -3.816406 L 0.140625 -3.816406 L 0.140625 -4.835938 L 0.726563 -4.835938 L 0.726563 -5.796875 L 2.011719 -6.542969 L 2.011719 -4.835938 Z M 2.886719 -4.835938 "
-           id="path7937" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph2-0">
-        <path
-           style="stroke:none;"
-           d="M 0 0.761719 L 7.441406 0.761719 L 7.441406 2.265625 L 4.511719 2.265625 L 4.511719 5.207031 L 7.441406 5.207031 L 7.441406 6.710938 L 0 6.710938 L 0 5.207031 L 3.253906 5.207031 L 3.253906 2.265625 L 0 2.265625 Z M 0 0.761719 "
-           id="path7940" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph2-1">
-        <path
-           style="stroke:none;"
-           d="M 2.769531 0.417969 C 3.242188 0.414063 3.699219 0.53125 4.144531 0.765625 C 4.585938 0.996094 4.925781 1.328125 5.160156 1.757813 C 5.390625 2.1875 5.507813 2.664063 5.511719 3.191406 C 5.507813 4.003906 5.242188 4.671875 4.714844 5.195313 C 4.183594 5.714844 3.515625 5.976563 2.710938 5.980469 C 1.894531 5.976563 1.21875 5.714844 0.683594 5.191406 C 0.148438 4.664063 -0.117188 4 -0.121094 3.203125 C -0.117188 2.707031 -0.0078125 2.234375 0.210938 1.789063 C 0.433594 1.335938 0.761719 0.996094 1.195313 0.765625 C 1.625 0.53125 2.148438 0.414063 2.769531 0.417969 Z M 2.695313 1.878906 C 2.160156 1.875 1.75 2 1.464844 2.257813 C 1.179688 2.507813 1.035156 2.824219 1.039063 3.199219 C 1.035156 3.570313 1.179688 3.878906 1.464844 4.132813 C 1.75 4.382813 2.164063 4.511719 2.707031 4.511719 C 3.230469 4.511719 3.632813 4.382813 3.921875 4.132813 C 4.203125 3.878906 4.347656 3.570313 4.351563 3.199219 C 4.347656 2.824219 4.203125 2.507813 3.921875 2.257813 C 3.632813 2 3.226563 1.875 2.695313 1.878906 Z M 2.695313 1.878906 "
-           id="path7943" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph2-2">
-        <path
-           style="stroke:none;"
-           d="M 1.539063 0.242188 L 1.757813 1.675781 C 1.476563 1.734375 1.265625 1.855469 1.125 2.042969 C 0.980469 2.226563 0.90625 2.488281 0.910156 2.828125 C 0.90625 3.191406 0.976563 3.46875 1.113281 3.660156 C 1.207031 3.78125 1.332031 3.84375 1.492188 3.847656 C 1.597656 3.84375 1.6875 3.808594 1.761719 3.746094 C 1.828125 3.671875 1.890625 3.515625 1.949219 3.269531 C 2.199219 2.117188 2.429688 1.386719 2.644531 1.082031 C 2.933594 0.652344 3.339844 0.4375 3.859375 0.441406 C 4.324219 0.4375 4.714844 0.621094 5.035156 0.992188 C 5.347656 1.359375 5.507813 1.933594 5.511719 2.710938 C 5.507813 3.445313 5.386719 3.992188 5.152344 4.355469 C 4.910156 4.710938 4.554688 4.960938 4.085938 5.097656 L 3.835938 3.75 C 4.042969 3.691406 4.203125 3.582031 4.320313 3.421875 C 4.429688 3.261719 4.488281 3.03125 4.488281 2.734375 C 4.488281 2.355469 4.433594 2.085938 4.328125 1.929688 C 4.253906 1.816406 4.15625 1.761719 4.039063 1.765625 C 3.933594 1.761719 3.847656 1.8125 3.78125 1.910156 C 3.683594 2.035156 3.550781 2.476563 3.378906 3.238281 C 3.203125 3.996094 2.992188 4.527344 2.746094 4.832031 C 2.492188 5.125 2.136719 5.273438 1.683594 5.277344 C 1.1875 5.273438 0.765625 5.066406 0.410156 4.65625 C 0.0585938 4.242188 -0.117188 3.632813 -0.121094 2.828125 C -0.117188 2.089844 0.0273438 1.507813 0.324219 1.082031 C 0.621094 0.652344 1.027344 0.371094 1.539063 0.242188 Z M 1.539063 0.242188 "
-           id="path7946" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph2-3">
-        <path
-           style="stroke:none;"
-           d="M 5.390625 3.21875 L 4.253906 3.21875 L 4.253906 2.242188 L 2.082031 2.242188 C 1.636719 2.238281 1.382813 2.246094 1.3125 2.269531 C 1.238281 2.285156 1.175781 2.328125 1.132813 2.398438 C 1.082031 2.460938 1.058594 2.542969 1.0625 2.640625 C 1.058594 2.769531 1.105469 2.960938 1.199219 3.214844 L 0.0898438 3.335938 C -0.0507813 3 -0.117188 2.625 -0.121094 2.207031 C -0.117188 1.945313 -0.078125 1.714844 0.0078125 1.511719 C 0.0898438 1.304688 0.203125 1.152344 0.339844 1.058594 C 0.476563 0.960938 0.660156 0.894531 0.898438 0.859375 C 1.0625 0.824219 1.398438 0.808594 1.902344 0.8125 L 4.253906 0.8125 L 4.253906 0.15625 L 5.390625 0.15625 L 5.390625 0.8125 L 6.460938 0.8125 L 7.292969 2.242188 L 5.390625 2.242188 Z M 5.390625 3.21875 "
-           id="path7949" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph2-4">
-        <path
-           style="stroke:none;"
-           d="M 3.746094 1.8125 L 3.980469 0.519531 C 4.496094 0.660156 4.882813 0.910156 5.136719 1.265625 C 5.382813 1.621094 5.507813 2.148438 5.511719 2.851563 C 5.507813 3.488281 5.433594 3.960938 5.285156 4.273438 C 5.132813 4.582031 4.941406 4.800781 4.710938 4.929688 C 4.476563 5.054688 4.050781 5.121094 3.4375 5.121094 L 1.769531 5.105469 C 1.296875 5.101563 0.945313 5.125 0.722656 5.171875 C 0.496094 5.214844 0.257813 5.300781 0 5.429688 L 0 4.019531 C 0.09375 3.980469 0.234375 3.933594 0.421875 3.882813 C 0.503906 3.855469 0.558594 3.839844 0.589844 3.832031 C 0.351563 3.585938 0.171875 3.324219 0.0546875 3.046875 C -0.0625 2.769531 -0.117188 2.472656 -0.121094 2.164063 C -0.117188 1.601563 0.03125 1.164063 0.328125 0.847656 C 0.628906 0.527344 1.007813 0.367188 1.472656 0.371094 C 1.773438 0.367188 2.046875 0.441406 2.285156 0.589844 C 2.523438 0.734375 2.703125 0.9375 2.832031 1.199219 C 2.957031 1.460938 3.070313 1.839844 3.167969 2.335938 C 3.289063 3 3.40625 3.460938 3.515625 3.71875 L 3.660156 3.71875 C 3.929688 3.714844 4.125 3.648438 4.246094 3.515625 C 4.359375 3.378906 4.417969 3.121094 4.421875 2.75 C 4.417969 2.492188 4.367188 2.296875 4.269531 2.15625 C 4.167969 2.015625 3.992188 1.898438 3.746094 1.8125 Z M 2.589844 3.71875 C 2.527344 3.535156 2.453125 3.246094 2.367188 2.851563 C 2.28125 2.453125 2.199219 2.195313 2.121094 2.074219 C 1.988281 1.886719 1.820313 1.792969 1.621094 1.796875 C 1.417969 1.792969 1.246094 1.867188 1.101563 2.019531 C 0.953125 2.164063 0.878906 2.355469 0.882813 2.589844 C 0.878906 2.84375 0.964844 3.085938 1.136719 3.324219 C 1.261719 3.492188 1.417969 3.605469 1.609375 3.664063 C 1.730469 3.695313 1.960938 3.714844 2.304688 3.71875 Z M 2.589844 3.71875 "
-           id="path7952" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph2-5">
-        <path
-           style="stroke:none;"
-           d="M 0 0.746094 L 7.441406 0.746094 L 7.441406 2.171875 L 0 2.171875 Z M 0 0.746094 "
-           id="path7955" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph2-6">
-        <path
-           style="stroke:none;"
-           d="M 5.390625 0.707031 L 5.390625 2.035156 L 4.597656 2.035156 C 4.867188 2.203125 5.085938 2.4375 5.257813 2.734375 C 5.421875 3.027344 5.507813 3.355469 5.511719 3.714844 C 5.507813 4.339844 5.261719 4.871094 4.773438 5.308594 C 4.28125 5.746094 3.597656 5.964844 2.726563 5.964844 C 1.820313 5.964844 1.121094 5.742188 0.625 5.304688 C 0.128906 4.859375 -0.117188 4.328125 -0.121094 3.707031 C -0.117188 3.40625 -0.0625 3.136719 0.0546875 2.894531 C 0.171875 2.652344 0.375 2.398438 0.664063 2.132813 L -2.050781 2.132813 L -2.050781 0.707031 Z M 2.785156 2.117188 C 2.175781 2.113281 1.730469 2.234375 1.445313 2.476563 C 1.152344 2.714844 1.007813 3.007813 1.011719 3.355469 C 1.007813 3.683594 1.140625 3.957031 1.410156 4.179688 C 1.671875 4.398438 2.105469 4.511719 2.714844 4.511719 C 3.277344 4.511719 3.699219 4.398438 3.972656 4.171875 C 4.246094 3.945313 4.382813 3.664063 4.386719 3.328125 C 4.382813 2.980469 4.246094 2.691406 3.980469 2.460938 C 3.707031 2.230469 3.308594 2.113281 2.785156 2.117188 Z M 2.785156 2.117188 "
-           id="path7958" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph2-7">
-        <path
-           style="stroke:none;"
-           d="M 0 0.695313 L 7.441406 0.695313 L 7.441406 2.121094 L 3.492188 2.121094 L 5.390625 3.792969 L 5.390625 5.546875 L 3.421875 3.707031 L 0 5.679688 L 0 4.140625 L 2.421875 2.785156 L 1.726563 2.121094 L 0 2.121094 Z M 0 0.695313 "
-           id="path7961" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph2-8">
-        <path
-           style="stroke:none;"
-           d="M 7.441406 0.746094 L 7.441406 2.25 L 3.410156 2.25 C 2.769531 2.25 2.355469 2.265625 2.167969 2.304688 C 1.863281 2.367188 1.617188 2.519531 1.433594 2.761719 C 1.246094 3 1.152344 3.332031 1.15625 3.757813 C 1.152344 4.179688 1.242188 4.5 1.417969 4.71875 C 1.589844 4.933594 1.800781 5.066406 2.058594 5.109375 C 2.308594 5.152344 2.730469 5.171875 3.324219 5.175781 L 7.441406 5.175781 L 7.441406 6.679688 L 3.53125 6.679688 C 2.636719 6.679688 2.003906 6.636719 1.636719 6.558594 C 1.265625 6.472656 0.957031 6.324219 0.703125 6.109375 C 0.449219 5.886719 0.246094 5.59375 0.0976563 5.230469 C -0.0507813 4.863281 -0.121094 4.386719 -0.125 3.800781 C -0.121094 3.085938 -0.0429688 2.546875 0.121094 2.183594 C 0.28125 1.8125 0.492188 1.523438 0.757813 1.3125 C 1.015625 1.101563 1.292969 0.960938 1.582031 0.894531 C 2.007813 0.792969 2.636719 0.742188 3.472656 0.746094 Z M 7.441406 0.746094 "
-           id="path7964" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph2-9">
-        <path
-           style="stroke:none;"
-           d="M 1.714844 3.867188 L 1.476563 5.289063 C 0.953125 5.105469 0.554688 4.816406 0.285156 4.421875 C 0.015625 4.027344 -0.117188 3.535156 -0.121094 2.945313 C -0.117188 2.003906 0.1875 1.308594 0.796875 0.863281 C 1.285156 0.503906 1.902344 0.328125 2.65625 0.328125 C 3.546875 0.328125 4.246094 0.558594 4.753906 1.027344 C 5.253906 1.492188 5.507813 2.085938 5.511719 2.800781 C 5.507813 3.601563 5.242188 4.234375 4.714844 4.699219 C 4.183594 5.160156 3.375 5.378906 2.285156 5.363281 L 2.285156 1.792969 C 1.859375 1.800781 1.527344 1.914063 1.296875 2.136719 C 1.058594 2.351563 0.941406 2.625 0.945313 2.960938 C 0.941406 3.179688 1.003906 3.367188 1.125 3.519531 C 1.246094 3.671875 1.441406 3.789063 1.714844 3.867188 Z M 3.15625 3.949219 C 3.566406 3.9375 3.878906 3.828125 4.097656 3.628906 C 4.308594 3.421875 4.417969 3.175781 4.421875 2.886719 C 4.417969 2.574219 4.304688 2.316406 4.082031 2.117188 C 3.851563 1.910156 3.542969 1.8125 3.15625 1.816406 Z M 3.15625 3.949219 "
-           id="path7967" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph2-10">
-        <path
-           style="stroke:none;"
-           d="M 0 2.113281 L 0 0.683594 L 5.390625 0.683594 L 5.390625 2.011719 L 4.625 2.011719 C 4.984375 2.234375 5.222656 2.4375 5.339844 2.621094 C 5.449219 2.800781 5.507813 3.007813 5.511719 3.238281 C 5.507813 3.5625 5.417969 3.875 5.242188 4.175781 L 4 3.734375 C 4.152344 3.492188 4.230469 3.269531 4.234375 3.066406 C 4.230469 2.863281 4.175781 2.695313 4.066406 2.558594 C 3.957031 2.417969 3.757813 2.308594 3.46875 2.230469 C 3.179688 2.148438 2.578125 2.109375 1.664063 2.113281 Z M 0 2.113281 "
-           id="path7970" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph3-0">
-        <path
-           style="stroke:none;"
-           d="M 0 7.371094 L 0 5.757813 L 1.667969 5.117188 L 1.667969 2.179688 L 0 1.574219 L 0 0 L 7.347656 2.863281 L 7.347656 4.429688 Z M 2.90625 4.640625 L 5.632813 3.628906 L 2.90625 2.636719 Z M 2.90625 4.640625 "
-           id="path7973" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph3-1">
-        <path
-           style="stroke:none;"
-           d="M 3.75 5.378906 L 3.5 3.988281 C 3.773438 3.9375 3.984375 3.832031 4.125 3.667969 C 4.265625 3.503906 4.335938 3.289063 4.335938 3.027344 C 4.335938 2.671875 4.214844 2.390625 3.972656 2.1875 C 3.730469 1.976563 3.324219 1.875 2.757813 1.875 C 2.121094 1.875 1.675781 1.980469 1.417969 2.191406 C 1.15625 2.402344 1.023438 2.6875 1.027344 3.046875 C 1.023438 3.3125 1.101563 3.53125 1.253906 3.703125 C 1.40625 3.875 1.667969 3.996094 2.039063 4.066406 L 1.804688 5.449219 C 1.167969 5.304688 0.6875 5.027344 0.363281 4.621094 C 0.0429688 4.210938 -0.117188 3.664063 -0.121094 2.980469 C -0.117188 2.199219 0.125 1.578125 0.613281 1.117188 C 1.101563 0.652344 1.78125 0.421875 2.65625 0.425781 C 3.53125 0.421875 4.214844 0.652344 4.707031 1.121094 C 5.191406 1.582031 5.4375 2.210938 5.441406 3.007813 C 5.4375 3.652344 5.296875 4.167969 5.023438 4.554688 C 4.742188 4.933594 4.320313 5.210938 3.75 5.378906 Z M 3.75 5.378906 "
-           id="path7976" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph3-2">
-        <path
-           style="stroke:none;"
-           d="M 1.695313 3.820313 L 1.457031 5.222656 C 0.941406 5.039063 0.550781 4.753906 0.28125 4.367188 C 0.015625 3.976563 -0.117188 3.492188 -0.121094 2.90625 C -0.117188 1.980469 0.183594 1.292969 0.785156 0.851563 C 1.269531 0.496094 1.878906 0.320313 2.621094 0.324219 C 3.5 0.320313 4.191406 0.550781 4.691406 1.015625 C 5.1875 1.476563 5.4375 2.0625 5.441406 2.765625 C 5.4375 3.554688 5.175781 4.179688 4.65625 4.640625 C 4.132813 5.097656 3.332031 5.316406 2.253906 5.296875 L 2.253906 1.769531 C 1.835938 1.777344 1.511719 1.890625 1.28125 2.109375 C 1.046875 2.328125 0.929688 2.597656 0.933594 2.921875 C 0.929688 3.140625 0.992188 3.324219 1.113281 3.476563 C 1.230469 3.625 1.421875 3.742188 1.695313 3.820313 Z M 3.117188 3.898438 C 3.523438 3.882813 3.832031 3.777344 4.046875 3.582031 C 4.257813 3.378906 4.363281 3.136719 4.367188 2.851563 C 4.363281 2.542969 4.253906 2.289063 4.03125 2.089844 C 3.804688 1.886719 3.5 1.789063 3.117188 1.792969 Z M 3.117188 3.898438 "
-           id="path7979" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph3-3">
-        <path
-           style="stroke:none;"
-           d="M 0 0.738281 L 7.347656 0.738281 L 7.347656 2.144531 L 0 2.144531 Z M 0 0.738281 "
-           id="path7982" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph3-4">
-        <path
-           style="stroke:none;"
-           d="M 0 2.085938 L 0 0.675781 L 5.324219 0.675781 L 5.324219 1.984375 L 4.566406 1.984375 C 4.917969 2.207031 5.152344 2.40625 5.269531 2.585938 C 5.378906 2.761719 5.4375 2.96875 5.441406 3.199219 C 5.4375 3.519531 5.347656 3.828125 5.175781 4.125 L 3.949219 3.6875 C 4.101563 3.445313 4.175781 3.226563 4.179688 3.027344 C 4.175781 2.828125 4.121094 2.660156 4.015625 2.523438 C 3.90625 2.386719 3.710938 2.28125 3.429688 2.203125 C 3.140625 2.125 2.546875 2.085938 1.644531 2.085938 Z M 0 2.085938 "
-           id="path7985" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph3-5">
-        <path
-           style="stroke:none;"
-           d="M 3.699219 1.789063 L 3.929688 0.511719 C 4.441406 0.652344 4.820313 0.902344 5.070313 1.253906 C 5.3125 1.605469 5.4375 2.125 5.441406 2.816406 C 5.4375 3.441406 5.363281 3.910156 5.21875 4.21875 C 5.066406 4.523438 4.878906 4.738281 4.652344 4.867188 C 4.421875 4.992188 4 5.054688 3.394531 5.058594 L 1.75 5.042969 C 1.277344 5.039063 0.933594 5.0625 0.714844 5.109375 C 0.492188 5.152344 0.253906 5.234375 0 5.363281 L 0 3.96875 C 0.09375 3.929688 0.230469 3.886719 0.414063 3.835938 C 0.496094 3.808594 0.550781 3.792969 0.582031 3.785156 C 0.34375 3.542969 0.167969 3.285156 0.0546875 3.011719 C -0.0625 2.738281 -0.117188 2.445313 -0.121094 2.136719 C -0.117188 1.585938 0.0273438 1.152344 0.324219 0.839844 C 0.621094 0.519531 1 0.363281 1.453125 0.367188 C 1.753906 0.363281 2.019531 0.433594 2.257813 0.582031 C 2.492188 0.722656 2.675781 0.925781 2.800781 1.183594 C 2.925781 1.441406 3.035156 1.8125 3.128906 2.304688 C 3.25 2.960938 3.363281 3.417969 3.472656 3.675781 L 3.613281 3.675781 C 3.882813 3.671875 4.074219 3.605469 4.191406 3.472656 C 4.304688 3.335938 4.363281 3.082031 4.367188 2.714844 C 4.363281 2.460938 4.316406 2.265625 4.21875 2.128906 C 4.117188 1.984375 3.941406 1.871094 3.699219 1.789063 Z M 2.554688 3.675781 C 2.492188 3.488281 2.417969 3.203125 2.339844 2.816406 C 2.253906 2.421875 2.171875 2.167969 2.09375 2.050781 C 1.960938 1.863281 1.796875 1.769531 1.597656 1.773438 C 1.398438 1.769531 1.226563 1.84375 1.085938 1.992188 C 0.9375 2.140625 0.867188 2.328125 0.871094 2.554688 C 0.867188 2.808594 0.949219 3.050781 1.121094 3.28125 C 1.246094 3.453125 1.402344 3.5625 1.589844 3.617188 C 1.707031 3.652344 1.9375 3.671875 2.277344 3.675781 Z M 2.554688 3.675781 "
-           id="path7988" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph3-6">
-        <path
-           style="stroke:none;"
-           d="M 5.324219 3.179688 L 4.199219 3.179688 L 4.199219 2.214844 L 2.054688 2.214844 C 1.617188 2.214844 1.363281 2.222656 1.292969 2.242188 C 1.21875 2.257813 1.160156 2.300781 1.117188 2.367188 C 1.066406 2.429688 1.042969 2.507813 1.046875 2.605469 C 1.042969 2.734375 1.089844 2.921875 1.183594 3.171875 L 0.0898438 3.292969 C -0.0507813 2.964844 -0.117188 2.59375 -0.121094 2.179688 C -0.117188 1.925781 -0.078125 1.695313 0.0078125 1.492188 C 0.0898438 1.285156 0.203125 1.136719 0.339844 1.042969 C 0.472656 0.949219 0.652344 0.882813 0.886719 0.847656 C 1.046875 0.8125 1.378906 0.796875 1.878906 0.800781 L 4.199219 0.800781 L 4.199219 0.15625 L 5.324219 0.15625 L 5.324219 0.800781 L 6.378906 0.800781 L 7.203125 2.214844 L 5.324219 2.214844 Z M 5.324219 3.179688 "
-           id="path7991" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph3-7">
-        <path
-           style="stroke:none;"
-           d="M 2.738281 0.410156 C 3.199219 0.410156 3.652344 0.523438 4.09375 0.753906 C 4.527344 0.984375 4.859375 1.3125 5.09375 1.734375 C 5.320313 2.15625 5.4375 2.628906 5.441406 3.152344 C 5.4375 3.957031 5.175781 4.617188 4.65625 5.132813 C 4.132813 5.648438 3.472656 5.90625 2.675781 5.90625 C 1.867188 5.90625 1.199219 5.644531 0.671875 5.125 C 0.144531 4.601563 -0.117188 3.949219 -0.121094 3.164063 C -0.117188 2.671875 -0.0078125 2.203125 0.210938 1.765625 C 0.429688 1.320313 0.75 0.984375 1.179688 0.753906 C 1.601563 0.523438 2.121094 0.410156 2.738281 0.410156 Z M 2.660156 1.855469 C 2.132813 1.851563 1.726563 1.976563 1.449219 2.230469 C 1.164063 2.476563 1.023438 2.785156 1.027344 3.15625 C 1.023438 3.523438 1.164063 3.832031 1.449219 4.082031 C 1.726563 4.328125 2.136719 4.453125 2.671875 4.457031 C 3.191406 4.453125 3.589844 4.328125 3.875 4.082031 C 4.152344 3.832031 4.292969 3.523438 4.296875 3.15625 C 4.292969 2.785156 4.152344 2.476563 3.875 2.230469 C 3.589844 1.976563 3.1875 1.851563 2.660156 1.855469 Z M 2.660156 1.855469 "
-           id="path7994" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph4-0">
-        <path
-           style="stroke:none;"
-           d="M 0.675781 0 L 0.675781 -6.667969 L 5.621094 -6.667969 L 5.621094 -5.539063 L 2.023438 -5.539063 L 2.023438 -4.0625 L 5.371094 -4.0625 L 5.371094 -2.9375 L 2.023438 -2.9375 L 2.023438 -1.125 L 5.746094 -1.125 L 5.746094 0 Z M 0.675781 0 "
-           id="path7997" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph4-1">
-        <path
-           style="stroke:none;"
-           d="M 0.0546875 0 L 1.796875 -2.488281 L 0.128906 -4.828125 L 1.6875 -4.828125 L 2.542969 -3.5 L 3.441406 -4.828125 L 4.941406 -4.828125 L 3.304688 -2.542969 L 5.09375 0 L 3.523438 0 L 2.542969 -1.496094 L 1.550781 0 Z M 0.0546875 0 "
-           id="path8000" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph4-2">
-        <path
-           style="stroke:none;"
-           d="M 3.464844 -1.535156 L 4.738281 -1.324219 C 4.570313 -0.855469 4.3125 -0.496094 3.960938 -0.253906 C 3.609375 -0.0117188 3.167969 0.105469 2.636719 0.109375 C 1.796875 0.105469 1.175781 -0.167969 0.773438 -0.714844 C 0.453125 -1.152344 0.292969 -1.707031 0.296875 -2.378906 C 0.292969 -3.175781 0.5 -3.800781 0.921875 -4.257813 C 1.335938 -4.707031 1.867188 -4.933594 2.511719 -4.9375 C 3.226563 -4.933594 3.792969 -4.695313 4.207031 -4.226563 C 4.621094 -3.75 4.820313 -3.023438 4.804688 -2.046875 L 1.605469 -2.046875 C 1.613281 -1.664063 1.714844 -1.371094 1.914063 -1.160156 C 2.109375 -0.949219 2.355469 -0.84375 2.652344 -0.847656 C 2.851563 -0.84375 3.019531 -0.898438 3.15625 -1.007813 C 3.289063 -1.117188 3.390625 -1.292969 3.464844 -1.535156 Z M 3.539063 -2.828125 C 3.527344 -3.195313 3.429688 -3.476563 3.25 -3.671875 C 3.066406 -3.863281 2.84375 -3.960938 2.585938 -3.960938 C 2.304688 -3.960938 2.074219 -3.859375 1.894531 -3.65625 C 1.710938 -3.453125 1.625 -3.175781 1.628906 -2.828125 Z M 3.539063 -2.828125 "
-           id="path8003" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph4-3">
-        <path
-           style="stroke:none;"
-           d="M 4.878906 -3.402344 L 3.621094 -3.175781 C 3.574219 -3.425781 3.476563 -3.613281 3.328125 -3.742188 C 3.179688 -3.867188 2.984375 -3.929688 2.746094 -3.933594 C 2.425781 -3.929688 2.171875 -3.820313 1.984375 -3.601563 C 1.792969 -3.378906 1.699219 -3.011719 1.699219 -2.5 C 1.699219 -1.925781 1.792969 -1.519531 1.988281 -1.285156 C 2.175781 -1.046875 2.4375 -0.929688 2.765625 -0.933594 C 3.007813 -0.929688 3.203125 -1 3.359375 -1.140625 C 3.511719 -1.277344 3.621094 -1.515625 3.6875 -1.851563 L 4.941406 -1.636719 C 4.808594 -1.058594 4.558594 -0.625 4.191406 -0.332031 C 3.816406 -0.0390625 3.324219 0.105469 2.707031 0.109375 C 1.996094 0.105469 1.429688 -0.113281 1.015625 -0.558594 C 0.59375 -1 0.386719 -1.617188 0.386719 -2.410156 C 0.386719 -3.203125 0.597656 -3.824219 1.019531 -4.269531 C 1.441406 -4.710938 2.007813 -4.933594 2.726563 -4.9375 C 3.3125 -4.933594 3.78125 -4.808594 4.128906 -4.558594 C 4.476563 -4.304688 4.726563 -3.917969 4.878906 -3.402344 Z M 4.878906 -3.402344 "
-           id="path8006" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph4-4">
-        <path
-           style="stroke:none;"
-           d="M 3.847656 0 L 3.847656 -0.722656 C 3.667969 -0.460938 3.4375 -0.257813 3.152344 -0.113281 C 2.863281 0.0351563 2.558594 0.105469 2.246094 0.109375 C 1.917969 0.105469 1.628906 0.0390625 1.375 -0.101563 C 1.113281 -0.242188 0.925781 -0.445313 0.8125 -0.703125 C 0.695313 -0.960938 0.636719 -1.316406 0.640625 -1.773438 L 0.640625 -4.828125 L 1.917969 -4.828125 L 1.917969 -2.609375 C 1.914063 -1.925781 1.9375 -1.511719 1.988281 -1.359375 C 2.03125 -1.207031 2.117188 -1.085938 2.246094 -0.996094 C 2.367188 -0.90625 2.527344 -0.859375 2.71875 -0.863281 C 2.9375 -0.859375 3.132813 -0.921875 3.304688 -1.042969 C 3.476563 -1.160156 3.59375 -1.308594 3.660156 -1.488281 C 3.722656 -1.664063 3.753906 -2.097656 3.757813 -2.792969 L 3.757813 -4.828125 L 5.035156 -4.828125 L 5.035156 0 Z M 3.847656 0 "
-           id="path8009" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph4-5">
-        <path
-           style="stroke:none;"
-           d="M 2.882813 -4.828125 L 2.882813 -3.8125 L 2.011719 -3.8125 L 2.011719 -1.863281 C 2.007813 -1.46875 2.015625 -1.238281 2.035156 -1.175781 C 2.046875 -1.109375 2.085938 -1.054688 2.148438 -1.011719 C 2.207031 -0.96875 2.277344 -0.949219 2.363281 -0.949219 C 2.480469 -0.949219 2.652344 -0.988281 2.878906 -1.074219 L 2.988281 -0.0820313 C 2.691406 0.0429688 2.351563 0.105469 1.976563 0.109375 C 1.742188 0.105469 1.535156 0.0703125 1.355469 -0.00390625 C 1.167969 -0.078125 1.035156 -0.179688 0.949219 -0.304688 C 0.863281 -0.429688 0.800781 -0.59375 0.769531 -0.804688 C 0.738281 -0.949219 0.726563 -1.25 0.726563 -1.707031 L 0.726563 -3.8125 L 0.140625 -3.8125 L 0.140625 -4.828125 L 0.726563 -4.828125 L 0.726563 -5.789063 L 2.011719 -6.535156 L 2.011719 -4.828125 Z M 2.882813 -4.828125 "
-           id="path8012" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph4-6">
-        <path
-           style="stroke:none;"
-           d="M 0.371094 -2.484375 C 0.367188 -2.90625 0.472656 -3.316406 0.683594 -3.714844 C 0.890625 -4.113281 1.1875 -4.414063 1.574219 -4.625 C 1.957031 -4.828125 2.386719 -4.933594 2.859375 -4.9375 C 3.589844 -4.933594 4.1875 -4.695313 4.65625 -4.226563 C 5.121094 -3.75 5.355469 -3.152344 5.355469 -2.429688 C 5.355469 -1.695313 5.117188 -1.089844 4.648438 -0.609375 C 4.171875 -0.132813 3.582031 0.105469 2.871094 0.109375 C 2.425781 0.105469 2 0.0078125 1.601563 -0.191406 C 1.195313 -0.390625 0.890625 -0.683594 0.683594 -1.070313 C 0.472656 -1.457031 0.367188 -1.929688 0.371094 -2.484375 Z M 1.683594 -2.414063 C 1.679688 -1.933594 1.792969 -1.566406 2.023438 -1.3125 C 2.246094 -1.058594 2.527344 -0.929688 2.863281 -0.933594 C 3.195313 -0.929688 3.476563 -1.058594 3.703125 -1.3125 C 3.929688 -1.566406 4.042969 -1.933594 4.042969 -2.421875 C 4.042969 -2.890625 3.929688 -3.257813 3.703125 -3.515625 C 3.476563 -3.769531 3.195313 -3.894531 2.863281 -3.898438 C 2.527344 -3.894531 2.246094 -3.769531 2.023438 -3.515625 C 1.792969 -3.257813 1.679688 -2.890625 1.683594 -2.414063 Z M 1.683594 -2.414063 "
-           id="path8015" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph4-7">
-        <path
-           style="stroke:none;"
-           d="M 1.890625 0 L 0.613281 0 L 0.613281 -4.828125 L 1.800781 -4.828125 L 1.800781 -4.144531 C 2 -4.464844 2.183594 -4.679688 2.347656 -4.78125 C 2.507813 -4.882813 2.691406 -4.933594 2.902344 -4.9375 C 3.1875 -4.933594 3.46875 -4.855469 3.742188 -4.699219 L 3.347656 -3.582031 C 3.128906 -3.71875 2.929688 -3.789063 2.746094 -3.792969 C 2.5625 -3.789063 2.410156 -3.742188 2.289063 -3.644531 C 2.164063 -3.542969 2.066406 -3.363281 1.996094 -3.109375 C 1.921875 -2.847656 1.886719 -2.308594 1.890625 -1.492188 Z M 1.890625 0 "
-           id="path8018" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph5-0">
-        <path
-           style="stroke:none;"
-           d="M 5.515625 -2.734375 L 6.972656 -2.273438 C 6.75 -1.460938 6.378906 -0.855469 5.859375 -0.464844 C 5.339844 -0.0703125 4.679688 0.121094 3.882813 0.125 C 2.894531 0.121094 2.082031 -0.210938 1.445313 -0.886719 C 0.808594 -1.558594 0.492188 -2.484375 0.492188 -3.65625 C 0.492188 -4.890625 0.808594 -5.851563 1.449219 -6.539063 C 2.085938 -7.222656 2.929688 -7.5625 3.972656 -7.566406 C 4.882813 -7.5625 5.621094 -7.296875 6.191406 -6.761719 C 6.527344 -6.441406 6.78125 -5.984375 6.953125 -5.390625 L 5.464844 -5.035156 C 5.375 -5.417969 5.191406 -5.722656 4.914063 -5.949219 C 4.632813 -6.167969 4.292969 -6.28125 3.898438 -6.285156 C 3.34375 -6.28125 2.898438 -6.082031 2.554688 -5.691406 C 2.210938 -5.292969 2.039063 -4.652344 2.039063 -3.765625 C 2.039063 -2.824219 2.207031 -2.152344 2.546875 -1.753906 C 2.882813 -1.351563 3.324219 -1.152344 3.867188 -1.15625 C 4.265625 -1.152344 4.605469 -1.28125 4.894531 -1.535156 C 5.179688 -1.789063 5.386719 -2.1875 5.515625 -2.734375 Z M 5.515625 -2.734375 "
-           id="path8021" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph5-1">
-        <path
-           style="stroke:none;"
-           d="M 0.417969 -2.769531 C 0.414063 -3.242188 0.53125 -3.699219 0.765625 -4.144531 C 0.996094 -4.585938 1.328125 -4.925781 1.757813 -5.160156 C 2.1875 -5.390625 2.664063 -5.507813 3.191406 -5.511719 C 4.003906 -5.507813 4.671875 -5.242188 5.195313 -4.714844 C 5.714844 -4.183594 5.976563 -3.515625 5.980469 -2.710938 C 5.976563 -1.894531 5.714844 -1.21875 5.191406 -0.683594 C 4.664063 -0.148438 4 0.117188 3.203125 0.121094 C 2.707031 0.117188 2.234375 0.0078125 1.789063 -0.210938 C 1.335938 -0.433594 0.996094 -0.761719 0.765625 -1.195313 C 0.53125 -1.625 0.414063 -2.148438 0.417969 -2.769531 Z M 1.878906 -2.695313 C 1.875 -2.160156 2 -1.75 2.257813 -1.464844 C 2.507813 -1.179688 2.824219 -1.035156 3.199219 -1.039063 C 3.570313 -1.035156 3.878906 -1.179688 4.132813 -1.464844 C 4.382813 -1.75 4.511719 -2.164063 4.511719 -2.707031 C 4.511719 -3.230469 4.382813 -3.632813 4.132813 -3.921875 C 3.878906 -4.203125 3.570313 -4.347656 3.199219 -4.351563 C 2.824219 -4.347656 2.507813 -4.203125 2.257813 -3.921875 C 2 -3.632813 1.875 -3.226563 1.878906 -2.695313 Z M 1.878906 -2.695313 "
-           id="path8024" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph5-2">
-        <path
-           style="stroke:none;"
-           d="M 5.648438 0 L 4.222656 0 L 4.222656 -2.75 C 4.21875 -3.328125 4.1875 -3.707031 4.128906 -3.878906 C 4.066406 -4.050781 3.96875 -4.183594 3.832031 -4.277344 C 3.695313 -4.371094 3.53125 -4.417969 3.339844 -4.421875 C 3.089844 -4.417969 2.867188 -4.351563 2.671875 -4.21875 C 2.476563 -4.082031 2.339844 -3.902344 2.269531 -3.679688 C 2.195313 -3.453125 2.160156 -3.039063 2.164063 -2.441406 L 2.164063 0 L 0.734375 0 L 0.734375 -5.390625 L 2.0625 -5.390625 L 2.0625 -4.597656 C 2.53125 -5.203125 3.121094 -5.507813 3.835938 -5.511719 C 4.148438 -5.507813 4.4375 -5.449219 4.699219 -5.339844 C 4.957031 -5.222656 5.152344 -5.078125 5.289063 -4.90625 C 5.421875 -4.726563 5.515625 -4.527344 5.570313 -4.308594 C 5.621094 -4.082031 5.648438 -3.765625 5.648438 -3.351563 Z M 5.648438 0 "
-           id="path8027" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph5-3">
-        <path
-           style="stroke:none;"
-           d="M 5.445313 -3.796875 L 4.039063 -3.542969 C 3.988281 -3.820313 3.882813 -4.03125 3.714844 -4.175781 C 3.546875 -4.316406 3.328125 -4.386719 3.066406 -4.390625 C 2.707031 -4.386719 2.425781 -4.265625 2.214844 -4.023438 C 2.003906 -3.777344 1.898438 -3.367188 1.898438 -2.792969 C 1.898438 -2.148438 2.003906 -1.695313 2.21875 -1.433594 C 2.433594 -1.167969 2.722656 -1.035156 3.085938 -1.039063 C 3.355469 -1.035156 3.578125 -1.113281 3.75 -1.269531 C 3.921875 -1.421875 4.042969 -1.6875 4.117188 -2.066406 L 5.515625 -1.828125 C 5.367188 -1.183594 5.089844 -0.695313 4.679688 -0.371094 C 4.265625 -0.0429688 3.710938 0.117188 3.019531 0.121094 C 2.226563 0.117188 1.597656 -0.128906 1.132813 -0.625 C 0.660156 -1.121094 0.425781 -1.808594 0.429688 -2.691406 C 0.425781 -3.574219 0.664063 -4.265625 1.136719 -4.765625 C 1.605469 -5.257813 2.242188 -5.507813 3.046875 -5.511719 C 3.703125 -5.507813 4.222656 -5.367188 4.609375 -5.085938 C 4.996094 -4.804688 5.273438 -4.375 5.445313 -3.796875 Z M 5.445313 -3.796875 "
-           id="path8030" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph5-4">
-        <path
-           style="stroke:none;"
-           d="M 3.867188 -1.714844 L 5.289063 -1.476563 C 5.105469 -0.953125 4.816406 -0.554688 4.421875 -0.285156 C 4.027344 -0.015625 3.535156 0.117188 2.945313 0.121094 C 2.003906 0.117188 1.308594 -0.1875 0.863281 -0.796875 C 0.503906 -1.285156 0.328125 -1.902344 0.328125 -2.65625 C 0.328125 -3.546875 0.558594 -4.246094 1.027344 -4.753906 C 1.492188 -5.253906 2.085938 -5.507813 2.800781 -5.511719 C 3.601563 -5.507813 4.234375 -5.242188 4.699219 -4.714844 C 5.160156 -4.183594 5.378906 -3.375 5.363281 -2.285156 L 1.792969 -2.285156 C 1.800781 -1.859375 1.914063 -1.527344 2.136719 -1.296875 C 2.351563 -1.058594 2.625 -0.941406 2.960938 -0.945313 C 3.179688 -0.941406 3.367188 -1.003906 3.519531 -1.125 C 3.671875 -1.246094 3.789063 -1.441406 3.867188 -1.714844 Z M 3.949219 -3.15625 C 3.9375 -3.566406 3.828125 -3.878906 3.628906 -4.097656 C 3.421875 -4.308594 3.175781 -4.417969 2.886719 -4.421875 C 2.574219 -4.417969 2.316406 -4.304688 2.117188 -4.082031 C 1.910156 -3.851563 1.8125 -3.542969 1.816406 -3.15625 Z M 3.949219 -3.15625 "
-           id="path8033" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph5-5">
-        <path
-           style="stroke:none;"
-           d="M 0.707031 -5.390625 L 2.035156 -5.390625 L 2.035156 -4.597656 C 2.203125 -4.867188 2.4375 -5.085938 2.734375 -5.257813 C 3.027344 -5.421875 3.355469 -5.507813 3.714844 -5.511719 C 4.339844 -5.507813 4.871094 -5.261719 5.308594 -4.773438 C 5.746094 -4.28125 5.964844 -3.597656 5.964844 -2.726563 C 5.964844 -1.820313 5.742188 -1.121094 5.304688 -0.625 C 4.859375 -0.128906 4.328125 0.117188 3.707031 0.121094 C 3.40625 0.117188 3.136719 0.0625 2.894531 -0.0546875 C 2.652344 -0.171875 2.398438 -0.375 2.132813 -0.664063 L 2.132813 2.050781 L 0.707031 2.050781 Z M 2.117188 -2.785156 C 2.113281 -2.175781 2.234375 -1.730469 2.476563 -1.445313 C 2.714844 -1.152344 3.007813 -1.007813 3.355469 -1.011719 C 3.683594 -1.007813 3.957031 -1.140625 4.179688 -1.410156 C 4.398438 -1.671875 4.511719 -2.105469 4.511719 -2.714844 C 4.511719 -3.277344 4.398438 -3.699219 4.171875 -3.972656 C 3.945313 -4.246094 3.664063 -4.382813 3.328125 -4.386719 C 2.980469 -4.382813 2.691406 -4.246094 2.460938 -3.980469 C 2.230469 -3.707031 2.113281 -3.308594 2.117188 -2.785156 Z M 2.117188 -2.785156 "
-           id="path8036" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph5-6">
-        <path
-           style="stroke:none;"
-           d="M 3.21875 -5.390625 L 3.21875 -4.253906 L 2.242188 -4.253906 L 2.242188 -2.082031 C 2.238281 -1.636719 2.246094 -1.382813 2.269531 -1.3125 C 2.285156 -1.238281 2.328125 -1.175781 2.398438 -1.132813 C 2.460938 -1.082031 2.542969 -1.058594 2.640625 -1.0625 C 2.769531 -1.058594 2.960938 -1.105469 3.214844 -1.199219 L 3.335938 -0.0898438 C 3 0.0507813 2.625 0.117188 2.207031 0.121094 C 1.945313 0.117188 1.714844 0.078125 1.511719 -0.0078125 C 1.304688 -0.0898438 1.152344 -0.203125 1.058594 -0.339844 C 0.960938 -0.476563 0.894531 -0.660156 0.859375 -0.898438 C 0.824219 -1.0625 0.808594 -1.398438 0.8125 -1.902344 L 0.8125 -4.253906 L 0.15625 -4.253906 L 0.15625 -5.390625 L 0.8125 -5.390625 L 0.8125 -6.460938 L 2.242188 -7.292969 L 2.242188 -5.390625 Z M 3.21875 -5.390625 "
-           id="path8039" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph5-7">
-        <path
-           style="stroke:none;"
-           d="M 0.242188 -1.539063 L 1.675781 -1.757813 C 1.734375 -1.476563 1.855469 -1.265625 2.042969 -1.125 C 2.226563 -0.980469 2.488281 -0.90625 2.828125 -0.910156 C 3.191406 -0.90625 3.46875 -0.976563 3.660156 -1.113281 C 3.78125 -1.207031 3.84375 -1.332031 3.847656 -1.492188 C 3.84375 -1.597656 3.808594 -1.6875 3.746094 -1.761719 C 3.671875 -1.828125 3.515625 -1.890625 3.269531 -1.949219 C 2.117188 -2.199219 1.386719 -2.429688 1.082031 -2.644531 C 0.652344 -2.933594 0.4375 -3.339844 0.441406 -3.859375 C 0.4375 -4.324219 0.621094 -4.714844 0.992188 -5.035156 C 1.359375 -5.347656 1.933594 -5.507813 2.710938 -5.511719 C 3.445313 -5.507813 3.992188 -5.386719 4.355469 -5.152344 C 4.710938 -4.910156 4.960938 -4.554688 5.097656 -4.085938 L 3.75 -3.835938 C 3.691406 -4.042969 3.582031 -4.203125 3.421875 -4.320313 C 3.261719 -4.429688 3.03125 -4.488281 2.734375 -4.488281 C 2.355469 -4.488281 2.085938 -4.433594 1.929688 -4.328125 C 1.816406 -4.253906 1.761719 -4.15625 1.765625 -4.039063 C 1.761719 -3.933594 1.8125 -3.847656 1.910156 -3.78125 C 2.035156 -3.683594 2.476563 -3.550781 3.238281 -3.378906 C 3.996094 -3.203125 4.527344 -2.992188 4.832031 -2.746094 C 5.125 -2.492188 5.273438 -2.136719 5.277344 -1.683594 C 5.273438 -1.1875 5.066406 -0.765625 4.65625 -0.410156 C 4.242188 -0.0585938 3.632813 0.117188 2.828125 0.121094 C 2.089844 0.117188 1.507813 -0.0273438 1.082031 -0.324219 C 0.652344 -0.621094 0.371094 -1.027344 0.242188 -1.539063 Z M 0.242188 -1.539063 "
-           id="path8042" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph5-8">
-        <path
-           style="stroke:none;"
-           d="M 0.757813 0 L 0.757813 -7.441406 L 6.273438 -7.441406 L 6.273438 -6.183594 L 2.257813 -6.183594 L 2.257813 -4.53125 L 5.996094 -4.53125 L 5.996094 -3.277344 L 2.257813 -3.277344 L 2.257813 -1.253906 L 6.414063 -1.253906 L 6.414063 0 Z M 0.757813 0 "
-           id="path8045" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph5-9">
-        <path
-           style="stroke:none;"
-           d="M 0.0625 0 L 2.003906 -2.777344 L 0.140625 -5.390625 L 1.882813 -5.390625 L 2.835938 -3.910156 L 3.84375 -5.390625 L 5.515625 -5.390625 L 3.691406 -2.835938 L 5.683594 0 L 3.933594 0 L 2.835938 -1.667969 L 1.730469 0 Z M 0.0625 0 "
-           id="path8048" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph5-10">
-        <path
-           style="stroke:none;"
-           d="M 4.292969 0 L 4.292969 -0.808594 C 4.09375 -0.519531 3.835938 -0.289063 3.519531 -0.125 C 3.195313 0.0390625 2.859375 0.117188 2.507813 0.121094 C 2.144531 0.117188 1.816406 0.0390625 1.53125 -0.117188 C 1.242188 -0.273438 1.035156 -0.496094 0.90625 -0.785156 C 0.777344 -1.074219 0.714844 -1.472656 0.714844 -1.980469 L 0.714844 -5.390625 L 2.140625 -5.390625 L 2.140625 -2.914063 C 2.136719 -2.152344 2.164063 -1.6875 2.21875 -1.519531 C 2.269531 -1.347656 2.367188 -1.214844 2.507813 -1.113281 C 2.644531 -1.011719 2.820313 -0.960938 3.035156 -0.964844 C 3.277344 -0.960938 3.496094 -1.027344 3.691406 -1.164063 C 3.882813 -1.296875 4.011719 -1.464844 4.085938 -1.664063 C 4.152344 -1.859375 4.1875 -2.34375 4.191406 -3.117188 L 4.191406 -5.390625 L 5.617188 -5.390625 L 5.617188 0 Z M 4.292969 0 "
-           id="path8051" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph5-11">
-        <path
-           style="stroke:none;"
-           d="M 0.746094 -6.121094 L 0.746094 -7.441406 L 2.171875 -7.441406 L 2.171875 -6.121094 Z M 0.746094 0 L 0.746094 -5.390625 L 2.171875 -5.390625 L 2.171875 0 Z M 0.746094 0 "
-           id="path8054" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph5-12">
-        <path
-           style="stroke:none;"
-           d=""
-           id="path8057" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph5-13">
-        <path
-           style="stroke:none;"
-           d="M 0.75 -7.441406 L 3.496094 -7.441406 C 4.113281 -7.441406 4.585938 -7.394531 4.914063 -7.300781 C 5.347656 -7.167969 5.71875 -6.9375 6.03125 -6.613281 C 6.339844 -6.28125 6.578125 -5.878906 6.742188 -5.40625 C 6.902344 -4.929688 6.984375 -4.34375 6.988281 -3.648438 C 6.984375 -3.035156 6.910156 -2.507813 6.761719 -2.066406 C 6.574219 -1.523438 6.308594 -1.085938 5.964844 -0.75 C 5.703125 -0.496094 5.347656 -0.296875 4.90625 -0.15625 C 4.570313 -0.0507813 4.128906 0 3.578125 0 L 0.75 0 Z M 2.253906 -6.183594 L 2.253906 -1.253906 L 3.375 -1.253906 C 3.792969 -1.253906 4.09375 -1.277344 4.285156 -1.324219 C 4.523438 -1.386719 4.726563 -1.488281 4.890625 -1.632813 C 5.050781 -1.777344 5.183594 -2.015625 5.285156 -2.351563 C 5.386719 -2.679688 5.4375 -3.136719 5.4375 -3.714844 C 5.4375 -4.289063 5.386719 -4.730469 5.285156 -5.039063 C 5.183594 -5.347656 5.039063 -5.585938 4.855469 -5.761719 C 4.671875 -5.929688 4.441406 -6.046875 4.160156 -6.109375 C 3.949219 -6.15625 3.539063 -6.179688 2.929688 -6.183594 Z M 2.253906 -6.183594 "
-           id="path8060" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph5-14">
-        <path
-           style="stroke:none;"
-           d="M 0.640625 -5.390625 L 1.953125 -5.390625 L 1.953125 -4.65625 C 2.421875 -5.222656 2.984375 -5.507813 3.632813 -5.511719 C 3.976563 -5.507813 4.273438 -5.4375 4.53125 -5.296875 C 4.78125 -5.15625 4.992188 -4.941406 5.15625 -4.65625 C 5.390625 -4.941406 5.644531 -5.15625 5.921875 -5.296875 C 6.195313 -5.4375 6.488281 -5.507813 6.800781 -5.511719 C 7.195313 -5.507813 7.53125 -5.425781 7.804688 -5.269531 C 8.078125 -5.105469 8.28125 -4.871094 8.421875 -4.5625 C 8.515625 -4.332031 8.566406 -3.960938 8.566406 -3.445313 L 8.566406 0 L 7.140625 0 L 7.140625 -3.082031 C 7.136719 -3.613281 7.089844 -3.957031 6.996094 -4.117188 C 6.863281 -4.316406 6.660156 -4.417969 6.386719 -4.421875 C 6.183594 -4.417969 5.992188 -4.355469 5.820313 -4.238281 C 5.640625 -4.113281 5.515625 -3.9375 5.441406 -3.703125 C 5.359375 -3.46875 5.320313 -3.097656 5.324219 -2.589844 L 5.324219 0 L 3.898438 0 L 3.898438 -2.953125 C 3.894531 -3.476563 3.871094 -3.8125 3.820313 -3.96875 C 3.769531 -4.117188 3.691406 -4.230469 3.585938 -4.308594 C 3.476563 -4.378906 3.332031 -4.417969 3.152344 -4.421875 C 2.929688 -4.417969 2.730469 -4.359375 2.554688 -4.242188 C 2.378906 -4.125 2.253906 -3.953125 2.179688 -3.730469 C 2.101563 -3.503906 2.0625 -3.132813 2.066406 -2.617188 L 2.066406 0 L 0.640625 0 Z M 0.640625 -5.390625 "
-           id="path8063" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph5-15">
-        <path
-           style="stroke:none;"
-           d="M 1.8125 -3.746094 L 0.519531 -3.980469 C 0.660156 -4.496094 0.910156 -4.882813 1.265625 -5.136719 C 1.621094 -5.382813 2.148438 -5.507813 2.851563 -5.511719 C 3.488281 -5.507813 3.960938 -5.433594 4.273438 -5.285156 C 4.582031 -5.132813 4.800781 -4.941406 4.929688 -4.710938 C 5.054688 -4.476563 5.121094 -4.050781 5.121094 -3.4375 L 5.105469 -1.769531 C 5.101563 -1.296875 5.125 -0.945313 5.171875 -0.722656 C 5.214844 -0.496094 5.300781 -0.257813 5.429688 0 L 4.019531 0 C 3.980469 -0.09375 3.933594 -0.234375 3.882813 -0.421875 C 3.855469 -0.503906 3.839844 -0.558594 3.832031 -0.589844 C 3.585938 -0.351563 3.324219 -0.171875 3.046875 -0.0546875 C 2.769531 0.0625 2.472656 0.117188 2.164063 0.121094 C 1.601563 0.117188 1.164063 -0.03125 0.847656 -0.328125 C 0.527344 -0.628906 0.367188 -1.007813 0.371094 -1.472656 C 0.367188 -1.773438 0.441406 -2.046875 0.589844 -2.285156 C 0.734375 -2.523438 0.9375 -2.703125 1.199219 -2.832031 C 1.460938 -2.957031 1.839844 -3.070313 2.335938 -3.167969 C 3 -3.289063 3.460938 -3.40625 3.71875 -3.515625 L 3.71875 -3.660156 C 3.714844 -3.929688 3.648438 -4.125 3.515625 -4.246094 C 3.378906 -4.359375 3.121094 -4.417969 2.75 -4.421875 C 2.492188 -4.417969 2.296875 -4.367188 2.15625 -4.269531 C 2.015625 -4.167969 1.898438 -3.992188 1.8125 -3.746094 Z M 3.71875 -2.589844 C 3.535156 -2.527344 3.246094 -2.453125 2.851563 -2.367188 C 2.453125 -2.28125 2.195313 -2.199219 2.074219 -2.121094 C 1.886719 -1.988281 1.792969 -1.820313 1.796875 -1.621094 C 1.792969 -1.417969 1.867188 -1.246094 2.019531 -1.101563 C 2.164063 -0.953125 2.355469 -0.878906 2.589844 -0.882813 C 2.84375 -0.878906 3.085938 -0.964844 3.324219 -1.136719 C 3.492188 -1.261719 3.605469 -1.417969 3.664063 -1.609375 C 3.695313 -1.730469 3.714844 -1.960938 3.71875 -2.304688 Z M 3.71875 -2.589844 "
-           id="path8066" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph5-16">
-        <path
-           style="stroke:none;"
-           d="M 5.691406 0 L 4.363281 0 L 4.363281 -0.792969 C 4.140625 -0.480469 3.882813 -0.25 3.585938 -0.101563 C 3.285156 0.046875 2.980469 0.117188 2.679688 0.121094 C 2.054688 0.117188 1.523438 -0.128906 1.085938 -0.625 C 0.640625 -1.121094 0.421875 -1.816406 0.425781 -2.714844 C 0.421875 -3.625 0.636719 -4.320313 1.070313 -4.796875 C 1.496094 -5.269531 2.039063 -5.507813 2.699219 -5.511719 C 3.300781 -5.507813 3.820313 -5.257813 4.261719 -4.761719 L 4.261719 -7.441406 L 5.691406 -7.441406 Z M 1.882813 -2.8125 C 1.878906 -2.234375 1.960938 -1.816406 2.121094 -1.5625 C 2.347656 -1.1875 2.667969 -1 3.085938 -1.003906 C 3.410156 -1 3.6875 -1.140625 3.921875 -1.421875 C 4.148438 -1.699219 4.265625 -2.117188 4.269531 -2.675781 C 4.265625 -3.292969 4.152344 -3.738281 3.933594 -4.011719 C 3.707031 -4.28125 3.421875 -4.417969 3.074219 -4.421875 C 2.734375 -4.417969 2.449219 -4.28125 2.222656 -4.015625 C 1.992188 -3.742188 1.878906 -3.34375 1.882813 -2.8125 Z M 1.882813 -2.8125 "
-           id="path8069" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph5-17">
-        <path
-           style="stroke:none;"
-           d="M 0.746094 -7.441406 L 2.25 -7.441406 L 2.25 -3.410156 C 2.25 -2.769531 2.265625 -2.355469 2.304688 -2.167969 C 2.367188 -1.863281 2.519531 -1.617188 2.761719 -1.433594 C 3 -1.246094 3.332031 -1.152344 3.757813 -1.15625 C 4.179688 -1.152344 4.5 -1.242188 4.71875 -1.417969 C 4.933594 -1.589844 5.066406 -1.800781 5.109375 -2.058594 C 5.152344 -2.308594 5.171875 -2.730469 5.175781 -3.324219 L 5.175781 -7.441406 L 6.679688 -7.441406 L 6.679688 -3.53125 C 6.679688 -2.636719 6.636719 -2.003906 6.558594 -1.636719 C 6.472656 -1.265625 6.324219 -0.957031 6.109375 -0.703125 C 5.886719 -0.449219 5.59375 -0.246094 5.230469 -0.0976563 C 4.863281 0.0507813 4.386719 0.121094 3.800781 0.125 C 3.085938 0.121094 2.546875 0.0429688 2.183594 -0.121094 C 1.8125 -0.28125 1.523438 -0.492188 1.3125 -0.757813 C 1.101563 -1.015625 0.960938 -1.292969 0.894531 -1.582031 C 0.792969 -2.007813 0.742188 -2.636719 0.746094 -3.472656 Z M 0.746094 -7.441406 "
-           id="path8072" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph5-18">
-        <path
-           style="stroke:none;"
-           d="M 2.113281 0 L 0.683594 0 L 0.683594 -5.390625 L 2.011719 -5.390625 L 2.011719 -4.625 C 2.234375 -4.984375 2.4375 -5.222656 2.621094 -5.339844 C 2.800781 -5.449219 3.007813 -5.507813 3.238281 -5.511719 C 3.5625 -5.507813 3.875 -5.417969 4.175781 -5.242188 L 3.734375 -4 C 3.492188 -4.152344 3.269531 -4.230469 3.066406 -4.234375 C 2.863281 -4.230469 2.695313 -4.175781 2.558594 -4.066406 C 2.417969 -3.957031 2.308594 -3.757813 2.230469 -3.46875 C 2.148438 -3.179688 2.109375 -2.578125 2.113281 -1.664063 Z M 2.113281 0 "
-           id="path8075" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph5-19">
-        <path
-           style="stroke:none;"
-           d="M 0.710938 0 L 0.710938 -7.441406 L 2.214844 -7.441406 L 2.214844 0 Z M 0.710938 0 "
-           id="path8078" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph5-20">
-        <path
-           style="stroke:none;"
-           d="M 0.746094 0 L 0.746094 -7.441406 L 2.171875 -7.441406 L 2.171875 0 Z M 0.746094 0 "
-           id="path8081" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph6-0">
-        <path
-           style="stroke:none;"
-           d="M 6.6875 0 L 5.222656 0 L 4.640625 -1.511719 L 1.976563 -1.511719 L 1.425781 0 L 0 0 L 2.59375 -6.664063 L 4.019531 -6.664063 Z M 4.207031 -2.636719 L 3.289063 -5.109375 L 2.390625 -2.636719 Z M 4.207031 -2.636719 "
-           id="path8084" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph6-1">
-        <path
-           style="stroke:none;"
-           d="M 4.875 -3.398438 L 3.617188 -3.171875 C 3.570313 -3.421875 3.476563 -3.613281 3.328125 -3.738281 C 3.179688 -3.863281 2.984375 -3.925781 2.746094 -3.929688 C 2.425781 -3.925781 2.171875 -3.816406 1.984375 -3.601563 C 1.792969 -3.378906 1.699219 -3.011719 1.699219 -2.5 C 1.699219 -1.925781 1.792969 -1.519531 1.988281 -1.285156 C 2.175781 -1.046875 2.433594 -0.929688 2.761719 -0.933594 C 3.003906 -0.929688 3.203125 -1 3.359375 -1.136719 C 3.507813 -1.273438 3.617188 -1.511719 3.6875 -1.851563 L 4.941406 -1.636719 C 4.808594 -1.058594 4.558594 -0.625 4.191406 -0.332031 C 3.816406 -0.0390625 3.320313 0.105469 2.703125 0.109375 C 1.992188 0.105469 1.429688 -0.113281 1.015625 -0.558594 C 0.59375 -1 0.386719 -1.617188 0.386719 -2.410156 C 0.386719 -3.203125 0.59375 -3.824219 1.015625 -4.269531 C 1.433594 -4.710938 2.003906 -4.933594 2.726563 -4.9375 C 3.3125 -4.933594 3.78125 -4.804688 4.128906 -4.554688 C 4.472656 -4.296875 4.722656 -3.914063 4.875 -3.398438 Z M 4.875 -3.398438 "
-           id="path8087" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph6-2">
-        <path
-           style="stroke:none;"
-           d="M 3.464844 -1.535156 L 4.734375 -1.324219 C 4.570313 -0.855469 4.3125 -0.496094 3.960938 -0.253906 C 3.605469 -0.0117188 3.164063 0.105469 2.636719 0.109375 C 1.796875 0.105469 1.175781 -0.167969 0.773438 -0.714844 C 0.453125 -1.152344 0.292969 -1.707031 0.296875 -2.378906 C 0.292969 -3.175781 0.5 -3.800781 0.921875 -4.257813 C 1.335938 -4.707031 1.867188 -4.933594 2.507813 -4.9375 C 3.226563 -4.933594 3.792969 -4.695313 4.207031 -4.222656 C 4.621094 -3.746094 4.820313 -3.019531 4.804688 -2.046875 L 1.605469 -2.046875 C 1.613281 -1.664063 1.714844 -1.371094 1.910156 -1.160156 C 2.105469 -0.949219 2.351563 -0.84375 2.648438 -0.84375 C 2.847656 -0.84375 3.015625 -0.898438 3.152344 -1.007813 C 3.289063 -1.117188 3.390625 -1.292969 3.464844 -1.535156 Z M 3.535156 -2.828125 C 3.523438 -3.195313 3.429688 -3.476563 3.25 -3.667969 C 3.066406 -3.859375 2.84375 -3.953125 2.585938 -3.957031 C 2.304688 -3.953125 2.074219 -3.851563 1.894531 -3.652344 C 1.707031 -3.449219 1.621094 -3.175781 1.628906 -2.828125 Z M 3.535156 -2.828125 "
-           id="path8090" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph6-3">
-        <path
-           style="stroke:none;"
-           d="M 0.667969 0 L 0.667969 -6.664063 L 1.945313 -6.664063 L 1.945313 0 Z M 0.667969 0 "
-           id="path8093" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph6-4">
-        <path
-           style="stroke:none;"
-           d="M 1.890625 0 L 0.613281 0 L 0.613281 -4.828125 L 1.800781 -4.828125 L 1.800781 -4.140625 C 2 -4.464844 2.183594 -4.679688 2.347656 -4.78125 C 2.507813 -4.882813 2.691406 -4.933594 2.898438 -4.9375 C 3.1875 -4.933594 3.46875 -4.851563 3.742188 -4.695313 L 3.34375 -3.582031 C 3.128906 -3.714844 2.929688 -3.785156 2.746094 -3.789063 C 2.5625 -3.785156 2.410156 -3.734375 2.289063 -3.640625 C 2.164063 -3.539063 2.066406 -3.363281 1.996094 -3.109375 C 1.921875 -2.847656 1.886719 -2.308594 1.890625 -1.492188 Z M 1.890625 0 "
-           id="path8096" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph6-5">
-        <path
-           style="stroke:none;"
-           d="M 1.621094 -3.355469 L 0.464844 -3.5625 C 0.589844 -4.027344 0.816406 -4.371094 1.136719 -4.597656 C 1.453125 -4.820313 1.925781 -4.933594 2.554688 -4.9375 C 3.125 -4.933594 3.546875 -4.867188 3.828125 -4.734375 C 4.101563 -4.597656 4.296875 -4.425781 4.414063 -4.21875 C 4.523438 -4.011719 4.582031 -3.628906 4.585938 -3.078125 L 4.574219 -1.585938 C 4.570313 -1.160156 4.589844 -0.847656 4.632813 -0.644531 C 4.671875 -0.441406 4.75 -0.226563 4.863281 0 L 3.601563 0 C 3.566406 -0.0859375 3.523438 -0.210938 3.476563 -0.378906 C 3.453125 -0.453125 3.4375 -0.503906 3.429688 -0.527344 C 3.210938 -0.316406 2.980469 -0.15625 2.730469 -0.0507813 C 2.480469 0.0546875 2.214844 0.105469 1.9375 0.109375 C 1.4375 0.105469 1.046875 -0.0273438 0.761719 -0.292969 C 0.472656 -0.5625 0.328125 -0.902344 0.332031 -1.316406 C 0.328125 -1.585938 0.394531 -1.828125 0.527344 -2.046875 C 0.65625 -2.257813 0.835938 -2.421875 1.074219 -2.535156 C 1.304688 -2.648438 1.644531 -2.75 2.089844 -2.835938 C 2.683594 -2.945313 3.097656 -3.046875 3.332031 -3.148438 L 3.332031 -3.277344 C 3.328125 -3.519531 3.265625 -3.695313 3.148438 -3.800781 C 3.023438 -3.902344 2.796875 -3.953125 2.464844 -3.957031 C 2.234375 -3.953125 2.054688 -3.910156 1.929688 -3.824219 C 1.800781 -3.734375 1.699219 -3.578125 1.621094 -3.355469 Z M 3.332031 -2.316406 C 3.164063 -2.257813 2.90625 -2.195313 2.554688 -2.121094 C 2.199219 -2.042969 1.964844 -1.96875 1.859375 -1.898438 C 1.6875 -1.78125 1.605469 -1.628906 1.609375 -1.449219 C 1.605469 -1.269531 1.671875 -1.117188 1.808594 -0.984375 C 1.9375 -0.851563 2.105469 -0.785156 2.316406 -0.789063 C 2.542969 -0.785156 2.765625 -0.863281 2.976563 -1.019531 C 3.128906 -1.132813 3.230469 -1.273438 3.28125 -1.441406 C 3.3125 -1.550781 3.328125 -1.757813 3.332031 -2.0625 Z M 3.332031 -2.316406 "
-           id="path8099" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph6-6">
-        <path
-           style="stroke:none;"
-           d="M 2.882813 -4.828125 L 2.882813 -3.808594 L 2.007813 -3.808594 L 2.007813 -1.863281 C 2.003906 -1.46875 2.011719 -1.238281 2.03125 -1.175781 C 2.046875 -1.109375 2.085938 -1.054688 2.144531 -1.011719 C 2.203125 -0.96875 2.273438 -0.949219 2.363281 -0.949219 C 2.476563 -0.949219 2.648438 -0.988281 2.878906 -1.074219 L 2.984375 -0.0820313 C 2.683594 0.0429688 2.347656 0.105469 1.976563 0.109375 C 1.742188 0.105469 1.535156 0.0703125 1.355469 -0.00390625 C 1.167969 -0.078125 1.03125 -0.179688 0.945313 -0.304688 C 0.855469 -0.429688 0.796875 -0.59375 0.769531 -0.804688 C 0.738281 -0.949219 0.726563 -1.25 0.726563 -1.703125 L 0.726563 -3.808594 L 0.140625 -3.808594 L 0.140625 -4.828125 L 0.726563 -4.828125 L 0.726563 -5.785156 L 2.007813 -6.53125 L 2.007813 -4.828125 Z M 2.882813 -4.828125 "
-           id="path8102" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph6-7">
-        <path
-           style="stroke:none;"
-           d="M 0.371094 -2.480469 C 0.367188 -2.902344 0.472656 -3.3125 0.683594 -3.710938 C 0.890625 -4.105469 1.1875 -4.410156 1.574219 -4.621094 C 1.953125 -4.828125 2.382813 -4.933594 2.859375 -4.9375 C 3.585938 -4.933594 4.183594 -4.695313 4.652344 -4.222656 C 5.121094 -3.746094 5.355469 -3.148438 5.355469 -2.425781 C 5.355469 -1.695313 5.117188 -1.089844 4.648438 -0.609375 C 4.171875 -0.132813 3.578125 0.105469 2.867188 0.109375 C 2.421875 0.105469 2 0.0078125 1.601563 -0.191406 C 1.195313 -0.390625 0.890625 -0.683594 0.683594 -1.070313 C 0.472656 -1.453125 0.367188 -1.921875 0.371094 -2.480469 Z M 1.683594 -2.414063 C 1.679688 -1.933594 1.792969 -1.566406 2.023438 -1.3125 C 2.246094 -1.058594 2.527344 -0.929688 2.863281 -0.933594 C 3.195313 -0.929688 3.472656 -1.058594 3.699219 -1.3125 C 3.921875 -1.566406 4.035156 -1.933594 4.039063 -2.421875 C 4.035156 -2.890625 3.921875 -3.253906 3.699219 -3.511719 C 3.472656 -3.761719 3.195313 -3.890625 2.863281 -3.894531 C 2.527344 -3.890625 2.246094 -3.761719 2.023438 -3.511719 C 1.792969 -3.253906 1.679688 -2.890625 1.683594 -2.414063 Z M 1.683594 -2.414063 "
-           id="path8105" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph7-0">
-        <path
-           style="stroke:none;"
-           d="M 1.632813 0 L 0.03125 -6.703125 L 1.417969 -6.703125 L 2.429688 -2.097656 L 3.652344 -6.703125 L 5.265625 -6.703125 L 6.4375 -2.019531 L 7.46875 -6.703125 L 8.832031 -6.703125 L 7.203125 0 L 5.765625 0 L 4.433594 -5.011719 L 3.101563 0 Z M 1.632813 0 "
-           id="path8108" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph7-1">
-        <path
-           style="stroke:none;"
-           d="M 0.375 -2.496094 C 0.375 -2.921875 0.480469 -3.332031 0.691406 -3.734375 C 0.902344 -4.132813 1.199219 -4.4375 1.585938 -4.648438 C 1.96875 -4.855469 2.398438 -4.960938 2.875 -4.964844 C 3.609375 -4.960938 4.210938 -4.722656 4.683594 -4.25 C 5.148438 -3.769531 5.382813 -3.167969 5.386719 -2.441406 C 5.382813 -1.703125 5.148438 -1.09375 4.675781 -0.613281 C 4.199219 -0.132813 3.601563 0.105469 2.886719 0.109375 C 2.4375 0.105469 2.011719 0.0078125 1.613281 -0.191406 C 1.207031 -0.390625 0.902344 -0.683594 0.691406 -1.074219 C 0.480469 -1.460938 0.375 -1.9375 0.375 -2.496094 Z M 1.691406 -2.429688 C 1.6875 -1.945313 1.800781 -1.574219 2.035156 -1.320313 C 2.261719 -1.0625 2.546875 -0.9375 2.882813 -0.9375 C 3.214844 -0.9375 3.492188 -1.0625 3.722656 -1.320313 C 3.949219 -1.574219 4.066406 -1.949219 4.066406 -2.4375 C 4.066406 -2.914063 3.949219 -3.277344 3.722656 -3.535156 C 3.492188 -3.785156 3.214844 -3.914063 2.882813 -3.917969 C 2.546875 -3.914063 2.261719 -3.785156 2.035156 -3.535156 C 1.800781 -3.277344 1.6875 -2.910156 1.691406 -2.429688 Z M 1.691406 -2.429688 "
-           id="path8111" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph7-2">
-        <path
-           style="stroke:none;"
-           d="M 1.902344 0 L 0.617188 0 L 0.617188 -4.855469 L 1.8125 -4.855469 L 1.8125 -4.167969 C 2.011719 -4.488281 2.195313 -4.703125 2.359375 -4.808594 C 2.519531 -4.910156 2.707031 -4.960938 2.917969 -4.964844 C 3.207031 -4.960938 3.488281 -4.882813 3.765625 -4.722656 L 3.367188 -3.605469 C 3.148438 -3.738281 2.945313 -3.808594 2.761719 -3.8125 C 2.578125 -3.808594 2.425781 -3.757813 2.304688 -3.664063 C 2.175781 -3.5625 2.078125 -3.382813 2.007813 -3.125 C 1.933594 -2.863281 1.898438 -2.324219 1.902344 -1.5 Z M 1.902344 0 "
-           id="path8114" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph7-3">
-        <path
-           style="stroke:none;"
-           d="M 0.625 0 L 0.625 -6.703125 L 1.910156 -6.703125 L 1.910156 -3.148438 L 3.417969 -4.855469 L 5 -4.855469 L 3.339844 -3.082031 L 5.117188 0 L 3.730469 0 L 2.511719 -2.179688 L 1.910156 -1.554688 L 1.910156 0 Z M 0.625 0 "
-           id="path8117" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph7-4">
-        <path
-           style="stroke:none;"
-           d=""
-           id="path8120" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph8-0">
-        <path
-           style="stroke:none;"
-           d="M 0.675781 -6.683594 L 3.140625 -6.683594 C 3.695313 -6.679688 4.117188 -6.636719 4.414063 -6.558594 C 4.804688 -6.4375 5.140625 -6.234375 5.421875 -5.941406 C 5.695313 -5.644531 5.910156 -5.28125 6.058594 -4.859375 C 6.207031 -4.429688 6.28125 -3.902344 6.28125 -3.277344 C 6.28125 -2.722656 6.210938 -2.25 6.074219 -1.855469 C 5.90625 -1.367188 5.667969 -0.972656 5.359375 -0.675781 C 5.121094 -0.441406 4.804688 -0.265625 4.410156 -0.140625 C 4.109375 -0.046875 3.710938 0 3.214844 0 L 0.675781 0 Z M 2.023438 -5.554688 L 2.023438 -1.125 L 3.03125 -1.125 C 3.40625 -1.125 3.679688 -1.144531 3.847656 -1.191406 C 4.0625 -1.242188 4.242188 -1.335938 4.390625 -1.46875 C 4.53125 -1.597656 4.652344 -1.8125 4.746094 -2.113281 C 4.835938 -2.410156 4.878906 -2.820313 4.882813 -3.339844 C 4.878906 -3.855469 4.835938 -4.25 4.746094 -4.527344 C 4.652344 -4.804688 4.523438 -5.019531 4.363281 -5.175781 C 4.195313 -5.328125 3.988281 -5.433594 3.738281 -5.492188 C 3.550781 -5.53125 3.179688 -5.554688 2.632813 -5.554688 Z M 2.023438 -5.554688 "
-           id="path8123" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph8-1">
-        <path
-           style="stroke:none;"
-           d="M 0.671875 -5.5 L 0.671875 -6.683594 L 1.953125 -6.683594 L 1.953125 -5.5 Z M 0.671875 0 L 0.671875 -4.84375 L 1.953125 -4.84375 L 1.953125 0 Z M 0.671875 0 "
-           id="path8126" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph8-2">
-        <path
-           style="stroke:none;"
-           d="M 2.003906 0 L 0.0507813 -4.84375 L 1.394531 -4.84375 L 2.308594 -2.371094 L 2.570313 -1.546875 C 2.636719 -1.753906 2.679688 -1.890625 2.703125 -1.960938 C 2.742188 -2.097656 2.789063 -2.234375 2.839844 -2.371094 L 3.761719 -4.84375 L 5.082031 -4.84375 L 3.15625 0 Z M 2.003906 0 "
-           id="path8129" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph8-3">
-        <path
-           style="stroke:none;"
-           d="M 0.21875 -1.382813 L 1.503906 -1.578125 C 1.554688 -1.324219 1.664063 -1.136719 1.835938 -1.007813 C 2 -0.878906 2.234375 -0.8125 2.539063 -0.816406 C 2.867188 -0.8125 3.117188 -0.875 3.289063 -1 C 3.398438 -1.082031 3.457031 -1.195313 3.457031 -1.339844 C 3.457031 -1.4375 3.425781 -1.519531 3.367188 -1.582031 C 3.300781 -1.640625 3.15625 -1.695313 2.9375 -1.75 C 1.898438 -1.976563 1.246094 -2.183594 0.972656 -2.375 C 0.589844 -2.632813 0.398438 -2.996094 0.398438 -3.464844 C 0.398438 -3.878906 0.5625 -4.234375 0.894531 -4.523438 C 1.222656 -4.808594 1.734375 -4.949219 2.433594 -4.953125 C 3.09375 -4.949219 3.585938 -4.84375 3.910156 -4.628906 C 4.230469 -4.410156 4.453125 -4.089844 4.578125 -3.671875 L 3.371094 -3.449219 C 3.316406 -3.632813 3.21875 -3.777344 3.074219 -3.878906 C 2.929688 -3.980469 2.722656 -4.03125 2.457031 -4.03125 C 2.117188 -4.03125 1.878906 -3.984375 1.734375 -3.890625 C 1.632813 -3.820313 1.582031 -3.730469 1.585938 -3.628906 C 1.582031 -3.535156 1.625 -3.460938 1.714844 -3.398438 C 1.828125 -3.3125 2.226563 -3.191406 2.910156 -3.035156 C 3.589844 -2.878906 4.066406 -2.6875 4.339844 -2.46875 C 4.605469 -2.234375 4.738281 -1.917969 4.742188 -1.515625 C 4.738281 -1.066406 4.554688 -0.683594 4.183594 -0.367188 C 3.8125 -0.0507813 3.261719 0.105469 2.539063 0.109375 C 1.875 0.105469 1.355469 -0.0234375 0.972656 -0.289063 C 0.589844 -0.554688 0.335938 -0.921875 0.21875 -1.382813 Z M 0.21875 -1.382813 "
-           id="path8132" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph8-4">
-        <path
-           style="stroke:none;"
-           d="M 0.375 -2.488281 C 0.375 -2.914063 0.476563 -3.328125 0.6875 -3.726563 C 0.894531 -4.125 1.191406 -4.425781 1.578125 -4.636719 C 1.960938 -4.84375 2.390625 -4.949219 2.867188 -4.953125 C 3.597656 -4.949219 4.199219 -4.710938 4.667969 -4.238281 C 5.132813 -3.757813 5.367188 -3.15625 5.371094 -2.433594 C 5.367188 -1.699219 5.128906 -1.09375 4.660156 -0.613281 C 4.183594 -0.132813 3.589844 0.105469 2.878906 0.109375 C 2.429688 0.105469 2.007813 0.0078125 1.605469 -0.191406 C 1.203125 -0.390625 0.894531 -0.683594 0.6875 -1.074219 C 0.476563 -1.457031 0.375 -1.929688 0.375 -2.488281 Z M 1.6875 -2.421875 C 1.6875 -1.9375 1.800781 -1.570313 2.027344 -1.316406 C 2.253906 -1.058594 2.535156 -0.929688 2.871094 -0.933594 C 3.207031 -0.929688 3.488281 -1.058594 3.714844 -1.316406 C 3.941406 -1.570313 4.054688 -1.941406 4.054688 -2.429688 C 4.054688 -2.902344 3.941406 -3.265625 3.714844 -3.523438 C 3.488281 -3.773438 3.207031 -3.902344 2.871094 -3.90625 C 2.535156 -3.902344 2.253906 -3.773438 2.027344 -3.523438 C 1.800781 -3.265625 1.6875 -2.898438 1.6875 -2.421875 Z M 1.6875 -2.421875 "
-           id="path8135" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph8-5">
-        <path
-           style="stroke:none;"
-           d="M 5.074219 0 L 3.792969 0 L 3.792969 -2.472656 C 3.789063 -2.996094 3.761719 -3.332031 3.710938 -3.484375 C 3.652344 -3.636719 3.566406 -3.757813 3.445313 -3.84375 C 3.320313 -3.929688 3.171875 -3.972656 3 -3.972656 C 2.777344 -3.972656 2.578125 -3.910156 2.402344 -3.789063 C 2.226563 -3.664063 2.105469 -3.503906 2.039063 -3.304688 C 1.972656 -3.101563 1.9375 -2.730469 1.941406 -2.195313 L 1.941406 0 L 0.660156 0 L 0.660156 -4.84375 L 1.851563 -4.84375 L 1.851563 -4.132813 C 2.273438 -4.675781 2.804688 -4.949219 3.449219 -4.953125 C 3.726563 -4.949219 3.984375 -4.898438 4.222656 -4.800781 C 4.453125 -4.695313 4.628906 -4.566406 4.753906 -4.410156 C 4.871094 -4.25 4.957031 -4.070313 5.003906 -3.871094 C 5.050781 -3.667969 5.074219 -3.382813 5.074219 -3.011719 Z M 5.074219 0 "
-           id="path8138" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph9-0">
-        <path
-           style="stroke:none;"
-           d="M 6.609375 0 L 5.160156 0 L 4.585938 -1.496094 L 1.953125 -1.496094 L 1.410156 0 L 0 0 L 2.566406 -6.585938 L 3.972656 -6.585938 Z M 4.160156 -2.605469 L 3.253906 -5.050781 L 2.363281 -2.605469 Z M 4.160156 -2.605469 "
-           id="path8141" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph9-1">
-        <path
-           style="stroke:none;"
-           d="M 2.847656 -4.769531 L 2.847656 -3.765625 L 1.984375 -3.765625 L 1.984375 -1.84375 C 1.980469 -1.449219 1.988281 -1.222656 2.007813 -1.160156 C 2.023438 -1.097656 2.0625 -1.042969 2.121094 -1 C 2.179688 -0.957031 2.25 -0.9375 2.335938 -0.9375 C 2.449219 -0.9375 2.617188 -0.976563 2.84375 -1.058594 L 2.953125 -0.0820313 C 2.65625 0.0429688 2.320313 0.105469 1.953125 0.109375 C 1.722656 0.105469 1.519531 0.0703125 1.339844 -0.00390625 C 1.15625 -0.078125 1.023438 -0.175781 0.9375 -0.300781 C 0.851563 -0.417969 0.789063 -0.585938 0.757813 -0.796875 C 0.726563 -0.941406 0.714844 -1.234375 0.71875 -1.683594 L 0.71875 -3.765625 L 0.140625 -3.765625 L 0.140625 -4.769531 L 0.71875 -4.769531 L 0.71875 -5.71875 L 1.984375 -6.457031 L 1.984375 -4.769531 Z M 2.847656 -4.769531 "
-           id="path8144" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph9-2">
-        <path
-           style="stroke:none;"
-           d="M 0.367188 -2.453125 C 0.363281 -2.871094 0.464844 -3.277344 0.675781 -3.667969 C 0.878906 -4.058594 1.171875 -4.355469 1.554688 -4.566406 C 1.929688 -4.769531 2.355469 -4.875 2.824219 -4.878906 C 3.546875 -4.875 4.136719 -4.640625 4.601563 -4.175781 C 5.058594 -3.703125 5.289063 -3.109375 5.292969 -2.398438 C 5.289063 -1.671875 5.058594 -1.074219 4.59375 -0.601563 C 4.125 -0.128906 3.539063 0.105469 2.835938 0.109375 C 2.398438 0.105469 1.980469 0.0078125 1.582031 -0.1875 C 1.183594 -0.382813 0.878906 -0.671875 0.675781 -1.054688 C 0.464844 -1.4375 0.363281 -1.902344 0.367188 -2.453125 Z M 1.664063 -2.386719 C 1.664063 -1.910156 1.773438 -1.546875 2 -1.296875 C 2.21875 -1.042969 2.496094 -0.917969 2.832031 -0.921875 C 3.15625 -0.917969 3.433594 -1.042969 3.65625 -1.296875 C 3.878906 -1.546875 3.988281 -1.914063 3.992188 -2.394531 C 3.988281 -2.859375 3.878906 -3.21875 3.65625 -3.472656 C 3.433594 -3.722656 3.15625 -3.847656 2.832031 -3.851563 C 2.496094 -3.847656 2.21875 -3.722656 2 -3.472656 C 1.773438 -3.21875 1.664063 -2.855469 1.664063 -2.386719 Z M 1.664063 -2.386719 "
-           id="path8147" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph9-3">
-        <path
-           style="stroke:none;"
-           d="M 0.566406 -4.769531 L 1.730469 -4.769531 L 1.730469 -4.121094 C 2.144531 -4.621094 2.640625 -4.875 3.214844 -4.878906 C 3.519531 -4.875 3.785156 -4.8125 4.011719 -4.6875 C 4.234375 -4.5625 4.417969 -4.371094 4.5625 -4.121094 C 4.769531 -4.371094 4.996094 -4.5625 5.242188 -4.6875 C 5.480469 -4.8125 5.738281 -4.875 6.019531 -4.878906 C 6.367188 -4.875 6.664063 -4.804688 6.910156 -4.664063 C 7.148438 -4.523438 7.332031 -4.3125 7.453125 -4.039063 C 7.539063 -3.832031 7.582031 -3.503906 7.582031 -3.050781 L 7.582031 0 L 6.320313 0 L 6.320313 -2.726563 C 6.316406 -3.195313 6.273438 -3.503906 6.191406 -3.644531 C 6.070313 -3.824219 5.890625 -3.914063 5.652344 -3.914063 C 5.472656 -3.914063 5.304688 -3.859375 5.152344 -3.75 C 4.992188 -3.640625 4.878906 -3.480469 4.8125 -3.277344 C 4.742188 -3.066406 4.710938 -2.738281 4.710938 -2.292969 L 4.710938 0 L 3.449219 0 L 3.449219 -2.613281 C 3.445313 -3.078125 3.421875 -3.375 3.382813 -3.511719 C 3.335938 -3.644531 3.265625 -3.746094 3.171875 -3.8125 C 3.074219 -3.878906 2.949219 -3.914063 2.789063 -3.914063 C 2.59375 -3.914063 2.417969 -3.859375 2.261719 -3.753906 C 2.105469 -3.648438 1.992188 -3.496094 1.929688 -3.300781 C 1.859375 -3.097656 1.828125 -2.769531 1.828125 -2.316406 L 1.828125 0 L 0.566406 0 Z M 0.566406 -4.769531 "
-           id="path8150" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph9-4">
-        <path
-           style="stroke:none;"
-           d="M 0.660156 -5.417969 L 0.660156 -6.585938 L 1.921875 -6.585938 L 1.921875 -5.417969 Z M 0.660156 0 L 0.660156 -4.769531 L 1.921875 -4.769531 L 1.921875 0 Z M 0.660156 0 "
-           id="path8153" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph9-5">
-        <path
-           style="stroke:none;"
-           d="M 4.820313 -3.359375 L 3.574219 -3.136719 C 3.53125 -3.378906 3.433594 -3.566406 3.289063 -3.695313 C 3.140625 -3.820313 2.949219 -3.882813 2.714844 -3.886719 C 2.394531 -3.882813 2.144531 -3.773438 1.960938 -3.558594 C 1.769531 -3.339844 1.675781 -2.980469 1.679688 -2.472656 C 1.675781 -1.902344 1.769531 -1.5 1.964844 -1.269531 C 2.152344 -1.035156 2.410156 -0.917969 2.730469 -0.921875 C 2.96875 -0.917969 3.164063 -0.988281 3.320313 -1.125 C 3.472656 -1.257813 3.582031 -1.492188 3.644531 -1.828125 L 4.882813 -1.617188 C 4.75 -1.046875 4.503906 -0.617188 4.140625 -0.328125 C 3.773438 -0.0390625 3.285156 0.105469 2.671875 0.109375 C 1.972656 0.105469 1.414063 -0.113281 1.003906 -0.550781 C 0.585938 -0.992188 0.378906 -1.601563 0.382813 -2.382813 C 0.378906 -3.167969 0.585938 -3.78125 1.003906 -4.21875 C 1.417969 -4.65625 1.984375 -4.875 2.695313 -4.878906 C 3.273438 -4.875 3.734375 -4.75 4.082031 -4.503906 C 4.421875 -4.25 4.667969 -3.867188 4.820313 -3.359375 Z M 4.820313 -3.359375 "
-           id="path8156" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph10-0">
-        <path
-           style="stroke:none;"
-           d="M 0.6875 0 L 0.6875 -6.753906 L 2.875 -6.753906 C 3.699219 -6.75 4.238281 -6.714844 4.496094 -6.652344 C 4.878906 -6.546875 5.203125 -6.328125 5.46875 -5.988281 C 5.726563 -5.648438 5.859375 -5.210938 5.859375 -4.675781 C 5.859375 -4.261719 5.78125 -3.910156 5.632813 -3.628906 C 5.480469 -3.34375 5.289063 -3.125 5.058594 -2.964844 C 4.824219 -2.800781 4.589844 -2.691406 4.351563 -2.644531 C 4.023438 -2.574219 3.550781 -2.542969 2.9375 -2.546875 L 2.050781 -2.546875 L 2.050781 0 Z M 2.050781 -5.609375 L 2.050781 -3.695313 L 2.796875 -3.695313 C 3.328125 -3.691406 3.6875 -3.726563 3.871094 -3.800781 C 4.050781 -3.867188 4.191406 -3.976563 4.296875 -4.132813 C 4.398438 -4.28125 4.453125 -4.457031 4.453125 -4.65625 C 4.453125 -4.902344 4.378906 -5.105469 4.238281 -5.265625 C 4.089844 -5.421875 3.910156 -5.523438 3.691406 -5.566406 C 3.527344 -5.59375 3.199219 -5.605469 2.707031 -5.609375 Z M 2.050781 -5.609375 "
-           id="path8159" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph10-1">
-        <path
-           style="stroke:none;"
-           d="M 0.675781 0 L 0.675781 -6.753906 L 1.972656 -6.753906 L 1.972656 0 Z M 0.675781 0 "
-           id="path8162" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph10-2">
-        <path
-           style="stroke:none;"
-           d="M 2.921875 -4.890625 L 2.921875 -3.859375 L 2.035156 -3.859375 L 2.035156 -1.890625 C 2.03125 -1.488281 2.039063 -1.253906 2.058594 -1.191406 C 2.074219 -1.121094 2.113281 -1.066406 2.175781 -1.027344 C 2.230469 -0.980469 2.304688 -0.960938 2.394531 -0.960938 C 2.511719 -0.960938 2.6875 -1 2.917969 -1.085938 L 3.027344 -0.0820313 C 2.722656 0.0429688 2.382813 0.105469 2.003906 0.109375 C 1.769531 0.105469 1.558594 0.0703125 1.371094 -0.0078125 C 1.183594 -0.0820313 1.046875 -0.183594 0.960938 -0.308594 C 0.871094 -0.429688 0.808594 -0.597656 0.777344 -0.816406 C 0.75 -0.964844 0.738281 -1.269531 0.738281 -1.726563 L 0.738281 -3.859375 L 0.144531 -3.859375 L 0.144531 -4.890625 L 0.738281 -4.890625 L 0.738281 -5.863281 L 2.035156 -6.621094 L 2.035156 -4.890625 Z M 2.921875 -4.890625 "
-           id="path8165" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph10-3">
-        <path
-           style="stroke:none;"
-           d="M 0.109375 -4.890625 L 0.828125 -4.890625 L 0.828125 -5.261719 C 0.824219 -5.671875 0.871094 -5.976563 0.960938 -6.183594 C 1.046875 -6.382813 1.207031 -6.546875 1.445313 -6.675781 C 1.675781 -6.800781 1.972656 -6.867188 2.335938 -6.867188 C 2.699219 -6.867188 3.058594 -6.8125 3.417969 -6.703125 L 3.242188 -5.800781 C 3.03125 -5.847656 2.835938 -5.871094 2.648438 -5.875 C 2.460938 -5.871094 2.324219 -5.828125 2.246094 -5.742188 C 2.160156 -5.652344 2.121094 -5.484375 2.125 -5.238281 L 2.125 -4.890625 L 3.089844 -4.890625 L 3.089844 -3.875 L 2.125 -3.875 L 2.125 0 L 0.828125 0 L 0.828125 -3.875 L 0.109375 -3.875 Z M 0.109375 -4.890625 "
-           id="path8168" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph10-4">
-        <path
-           style="stroke:none;"
-           d="M 5.007813 -2.484375 L 6.328125 -2.0625 C 6.125 -1.324219 5.789063 -0.777344 5.320313 -0.421875 C 4.847656 -0.0664063 4.246094 0.109375 3.523438 0.113281 C 2.625 0.109375 1.890625 -0.191406 1.3125 -0.804688 C 0.734375 -1.414063 0.445313 -2.253906 0.445313 -3.316406 C 0.445313 -4.441406 0.734375 -5.3125 1.316406 -5.933594 C 1.894531 -6.554688 2.65625 -6.867188 3.605469 -6.867188 C 4.429688 -6.867188 5.101563 -6.621094 5.621094 -6.136719 C 5.925781 -5.84375 6.15625 -5.429688 6.3125 -4.890625 L 4.960938 -4.570313 C 4.878906 -4.917969 4.710938 -5.191406 4.460938 -5.398438 C 4.203125 -5.597656 3.898438 -5.699219 3.539063 -5.703125 C 3.039063 -5.699219 2.632813 -5.519531 2.320313 -5.164063 C 2.007813 -4.800781 1.851563 -4.21875 1.851563 -3.417969 C 1.851563 -2.558594 2.003906 -1.953125 2.3125 -1.59375 C 2.617188 -1.230469 3.015625 -1.046875 3.511719 -1.050781 C 3.871094 -1.046875 4.183594 -1.164063 4.445313 -1.394531 C 4.703125 -1.625 4.890625 -1.988281 5.007813 -2.484375 Z M 5.007813 -2.484375 "
-           id="path8171" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph10-5">
-        <path
-           style="stroke:none;"
-           d="M 0.640625 -4.890625 L 1.847656 -4.890625 L 1.847656 -4.171875 C 2 -4.417969 2.210938 -4.617188 2.480469 -4.773438 C 2.746094 -4.925781 3.042969 -5.003906 3.371094 -5.003906 C 3.933594 -5.003906 4.417969 -4.78125 4.816406 -4.335938 C 5.214844 -3.890625 5.414063 -3.269531 5.414063 -2.472656 C 5.414063 -1.65625 5.210938 -1.019531 4.8125 -0.570313 C 4.410156 -0.117188 3.929688 0.105469 3.363281 0.109375 C 3.089844 0.105469 2.84375 0.0546875 2.625 -0.0507813 C 2.40625 -0.15625 2.175781 -0.339844 1.933594 -0.601563 L 1.933594 1.859375 L 0.640625 1.859375 Z M 1.921875 -2.527344 C 1.921875 -1.972656 2.027344 -1.566406 2.246094 -1.308594 C 2.460938 -1.042969 2.730469 -0.914063 3.046875 -0.917969 C 3.34375 -0.914063 3.589844 -1.035156 3.792969 -1.277344 C 3.988281 -1.519531 4.089844 -1.914063 4.09375 -2.464844 C 4.089844 -2.976563 3.988281 -3.355469 3.785156 -3.605469 C 3.578125 -3.851563 3.324219 -3.976563 3.023438 -3.980469 C 2.703125 -3.976563 2.4375 -3.855469 2.234375 -3.613281 C 2.023438 -3.367188 1.921875 -3.003906 1.921875 -2.527344 Z M 1.921875 -2.527344 "
-           id="path8174" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph10-6">
-        <path
-           style="stroke:none;"
-           d="M 3.898438 0 L 3.898438 -0.734375 C 3.714844 -0.46875 3.480469 -0.261719 3.191406 -0.113281 C 2.898438 0.0351563 2.59375 0.105469 2.277344 0.109375 C 1.945313 0.105469 1.648438 0.0351563 1.390625 -0.105469 C 1.125 -0.25 0.9375 -0.453125 0.824219 -0.714844 C 0.703125 -0.972656 0.644531 -1.332031 0.648438 -1.796875 L 0.648438 -4.890625 L 1.945313 -4.890625 L 1.945313 -2.644531 C 1.945313 -1.953125 1.96875 -1.53125 2.015625 -1.378906 C 2.0625 -1.222656 2.148438 -1.101563 2.273438 -1.011719 C 2.398438 -0.917969 2.558594 -0.871094 2.753906 -0.875 C 2.972656 -0.871094 3.167969 -0.933594 3.347656 -1.054688 C 3.519531 -1.175781 3.640625 -1.324219 3.707031 -1.507813 C 3.769531 -1.683594 3.800781 -2.125 3.804688 -2.828125 L 3.804688 -4.890625 L 5.101563 -4.890625 L 5.101563 0 Z M 3.898438 0 "
-           id="path8177" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph11-0">
-        <path
-           style="stroke:none;"
-           d="M 0.675781 0 L 0.675781 -6.660156 L 2.835938 -6.660156 C 3.652344 -6.65625 4.183594 -6.621094 4.433594 -6.558594 C 4.8125 -6.457031 5.132813 -6.242188 5.390625 -5.90625 C 5.648438 -5.570313 5.777344 -5.136719 5.777344 -4.613281 C 5.777344 -4.199219 5.703125 -3.855469 5.554688 -3.578125 C 5.40625 -3.296875 5.214844 -3.078125 4.988281 -2.921875 C 4.757813 -2.761719 4.527344 -2.65625 4.292969 -2.609375 C 3.96875 -2.539063 3.503906 -2.507813 2.898438 -2.511719 L 2.023438 -2.511719 L 2.023438 0 Z M 2.023438 -5.535156 L 2.023438 -3.644531 L 2.757813 -3.644531 C 3.285156 -3.644531 3.640625 -3.675781 3.820313 -3.746094 C 4 -3.8125 4.140625 -3.921875 4.242188 -4.074219 C 4.34375 -4.21875 4.394531 -4.394531 4.394531 -4.59375 C 4.394531 -4.835938 4.320313 -5.035156 4.179688 -5.191406 C 4.03125 -5.347656 3.851563 -5.445313 3.640625 -5.488281 C 3.476563 -5.515625 3.15625 -5.53125 2.671875 -5.535156 Z M 2.023438 -5.535156 "
-           id="path8180" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph11-1">
-        <path
-           style="stroke:none;"
-           d="M 0.667969 0 L 0.667969 -6.660156 L 1.945313 -6.660156 L 1.945313 0 Z M 0.667969 0 "
-           id="path8183" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph11-2">
-        <path
-           style="stroke:none;"
-           d="M 2.878906 -4.824219 L 2.878906 -3.808594 L 2.007813 -3.808594 L 2.007813 -1.863281 C 2.003906 -1.46875 2.011719 -1.238281 2.03125 -1.175781 C 2.046875 -1.109375 2.085938 -1.054688 2.144531 -1.011719 C 2.203125 -0.96875 2.273438 -0.949219 2.363281 -0.949219 C 2.476563 -0.949219 2.648438 -0.988281 2.875 -1.070313 L 2.984375 -0.0820313 C 2.683594 0.0429688 2.347656 0.105469 1.976563 0.109375 C 1.742188 0.105469 1.535156 0.0703125 1.355469 -0.00390625 C 1.167969 -0.078125 1.03125 -0.179688 0.945313 -0.304688 C 0.855469 -0.429688 0.796875 -0.59375 0.769531 -0.804688 C 0.738281 -0.949219 0.726563 -1.25 0.726563 -1.703125 L 0.726563 -3.808594 L 0.140625 -3.808594 L 0.140625 -4.824219 L 0.726563 -4.824219 L 0.726563 -5.785156 L 2.007813 -6.527344 L 2.007813 -4.824219 Z M 2.878906 -4.824219 "
-           id="path8186" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph11-3">
-        <path
-           style="stroke:none;"
-           d="M 0.109375 -4.824219 L 0.816406 -4.824219 L 0.816406 -5.1875 C 0.8125 -5.589844 0.855469 -5.894531 0.945313 -6.097656 C 1.03125 -6.296875 1.191406 -6.460938 1.425781 -6.585938 C 1.652344 -6.710938 1.945313 -6.773438 2.304688 -6.773438 C 2.664063 -6.773438 3.019531 -6.71875 3.371094 -6.609375 L 3.199219 -5.71875 C 2.996094 -5.765625 2.800781 -5.789063 2.613281 -5.792969 C 2.425781 -5.789063 2.292969 -5.746094 2.214844 -5.664063 C 2.132813 -5.574219 2.089844 -5.40625 2.09375 -5.164063 L 2.09375 -4.824219 L 3.046875 -4.824219 L 3.046875 -3.820313 L 2.09375 -3.820313 L 2.09375 0 L 0.816406 0 L 0.816406 -3.820313 L 0.109375 -3.820313 Z M 0.109375 -4.824219 "
-           id="path8189" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph11-4">
-        <path
-           style="stroke:none;"
-           d="M 4.9375 -2.449219 L 6.242188 -2.035156 C 6.039063 -1.304688 5.707031 -0.765625 5.242188 -0.414063 C 4.777344 -0.0625 4.1875 0.109375 3.476563 0.113281 C 2.585938 0.109375 1.859375 -0.1875 1.292969 -0.792969 C 0.722656 -1.394531 0.4375 -2.222656 0.441406 -3.269531 C 0.4375 -4.378906 0.722656 -5.238281 1.296875 -5.851563 C 1.867188 -6.464844 2.621094 -6.773438 3.558594 -6.773438 C 4.371094 -6.773438 5.035156 -6.53125 5.542969 -6.050781 C 5.84375 -5.765625 6.070313 -5.355469 6.222656 -4.824219 L 4.894531 -4.507813 C 4.808594 -4.847656 4.644531 -5.121094 4.398438 -5.324219 C 4.144531 -5.523438 3.839844 -5.625 3.488281 -5.625 C 2.992188 -5.625 2.59375 -5.445313 2.289063 -5.09375 C 1.980469 -4.734375 1.828125 -4.160156 1.828125 -3.371094 C 1.828125 -2.527344 1.976563 -1.929688 2.28125 -1.570313 C 2.578125 -1.210938 2.972656 -1.03125 3.460938 -1.035156 C 3.816406 -1.03125 4.125 -1.144531 4.382813 -1.375 C 4.640625 -1.601563 4.824219 -1.960938 4.9375 -2.449219 Z M 4.9375 -2.449219 "
-           id="path8192" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph11-5">
-        <path
-           style="stroke:none;"
-           d="M 3.84375 0 L 3.84375 -0.722656 C 3.664063 -0.460938 3.429688 -0.257813 3.148438 -0.113281 C 2.859375 0.0351563 2.558594 0.105469 2.246094 0.109375 C 1.917969 0.105469 1.628906 0.0390625 1.371094 -0.101563 C 1.113281 -0.242188 0.925781 -0.445313 0.8125 -0.703125 C 0.695313 -0.960938 0.636719 -1.316406 0.640625 -1.773438 L 0.640625 -4.824219 L 1.917969 -4.824219 L 1.917969 -2.609375 C 1.914063 -1.925781 1.9375 -1.511719 1.988281 -1.359375 C 2.03125 -1.207031 2.117188 -1.085938 2.242188 -0.996094 C 2.363281 -0.90625 2.523438 -0.859375 2.71875 -0.863281 C 2.933594 -0.859375 3.125 -0.921875 3.300781 -1.042969 C 3.472656 -1.160156 3.59375 -1.308594 3.65625 -1.488281 C 3.71875 -1.664063 3.75 -2.097656 3.753906 -2.789063 L 3.753906 -4.824219 L 5.03125 -4.824219 L 5.03125 0 Z M 3.84375 0 "
-           id="path8195" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph11-6">
-        <path
-           style="stroke:none;"
-           d="M 5.09375 0 L 3.90625 0 L 3.90625 -0.707031 C 3.707031 -0.429688 3.476563 -0.226563 3.210938 -0.09375 C 2.941406 0.0390625 2.667969 0.105469 2.398438 0.109375 C 1.839844 0.105469 1.367188 -0.113281 0.972656 -0.558594 C 0.578125 -1.003906 0.378906 -1.628906 0.382813 -2.429688 C 0.378906 -3.246094 0.570313 -3.867188 0.957031 -4.292969 C 1.339844 -4.71875 1.828125 -4.933594 2.417969 -4.933594 C 2.953125 -4.933594 3.417969 -4.707031 3.816406 -4.261719 L 3.816406 -6.660156 L 5.09375 -6.660156 Z M 1.683594 -2.515625 C 1.679688 -2 1.753906 -1.628906 1.898438 -1.398438 C 2.101563 -1.0625 2.386719 -0.894531 2.761719 -0.898438 C 3.050781 -0.894531 3.300781 -1.019531 3.511719 -1.273438 C 3.714844 -1.519531 3.820313 -1.894531 3.820313 -2.394531 C 3.820313 -2.945313 3.71875 -3.34375 3.519531 -3.589844 C 3.320313 -3.832031 3.066406 -3.953125 2.753906 -3.957031 C 2.449219 -3.953125 2.195313 -3.832031 1.992188 -3.59375 C 1.785156 -3.351563 1.679688 -2.992188 1.683594 -2.515625 Z M 1.683594 -2.515625 "
-           id="path8198" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph11-7">
-        <path
-           style="stroke:none;"
-           d="M 1.621094 -3.351563 L 0.464844 -3.5625 C 0.589844 -4.027344 0.816406 -4.371094 1.136719 -4.597656 C 1.453125 -4.820313 1.925781 -4.933594 2.554688 -4.933594 C 3.121094 -4.933594 3.542969 -4.863281 3.824219 -4.730469 C 4.101563 -4.59375 4.296875 -4.425781 4.414063 -4.21875 C 4.523438 -4.007813 4.582031 -3.625 4.585938 -3.074219 L 4.570313 -1.585938 C 4.570313 -1.160156 4.589844 -0.847656 4.628906 -0.644531 C 4.667969 -0.441406 4.742188 -0.226563 4.859375 0 L 3.597656 0 C 3.5625 -0.0859375 3.519531 -0.210938 3.476563 -0.378906 C 3.453125 -0.453125 3.4375 -0.503906 3.429688 -0.527344 C 3.207031 -0.316406 2.976563 -0.15625 2.730469 -0.0507813 C 2.480469 0.0546875 2.214844 0.105469 1.933594 0.109375 C 1.4375 0.105469 1.046875 -0.0273438 0.761719 -0.292969 C 0.472656 -0.5625 0.328125 -0.902344 0.332031 -1.316406 C 0.328125 -1.585938 0.394531 -1.828125 0.527344 -2.046875 C 0.65625 -2.257813 0.835938 -2.421875 1.074219 -2.535156 C 1.304688 -2.648438 1.644531 -2.75 2.089844 -2.835938 C 2.683594 -2.945313 3.097656 -3.046875 3.332031 -3.148438 L 3.332031 -3.277344 C 3.328125 -3.519531 3.265625 -3.695313 3.148438 -3.800781 C 3.023438 -3.902344 2.796875 -3.953125 2.460938 -3.957031 C 2.234375 -3.953125 2.054688 -3.910156 1.929688 -3.820313 C 1.796875 -3.730469 1.695313 -3.574219 1.621094 -3.351563 Z M 3.332031 -2.316406 C 3.164063 -2.257813 2.90625 -2.195313 2.554688 -2.121094 C 2.199219 -2.042969 1.964844 -1.96875 1.859375 -1.898438 C 1.6875 -1.78125 1.605469 -1.628906 1.609375 -1.449219 C 1.605469 -1.265625 1.671875 -1.113281 1.808594 -0.984375 C 1.9375 -0.851563 2.105469 -0.785156 2.316406 -0.789063 C 2.542969 -0.785156 2.765625 -0.863281 2.976563 -1.019531 C 3.128906 -1.132813 3.230469 -1.273438 3.28125 -1.441406 C 3.3125 -1.550781 3.328125 -1.757813 3.332031 -2.0625 Z M 3.332031 -2.316406 "
-           id="path8201" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph11-8">
-        <path
-           style="stroke:none;"
-           d="M 0.679688 0 L 0.679688 -6.660156 L 3.511719 -6.660156 C 4.21875 -6.65625 4.738281 -6.597656 5.0625 -6.480469 C 5.382813 -6.359375 5.640625 -6.144531 5.835938 -5.839844 C 6.03125 -5.53125 6.128906 -5.183594 6.128906 -4.792969 C 6.128906 -4.292969 5.980469 -3.878906 5.6875 -3.554688 C 5.390625 -3.226563 4.953125 -3.023438 4.371094 -2.941406 C 4.660156 -2.769531 4.898438 -2.582031 5.089844 -2.378906 C 5.277344 -2.175781 5.53125 -1.816406 5.855469 -1.300781 L 6.667969 0 L 5.0625 0 L 4.089844 -1.449219 C 3.742188 -1.964844 3.503906 -2.292969 3.378906 -2.425781 C 3.25 -2.558594 3.113281 -2.648438 2.972656 -2.703125 C 2.828125 -2.75 2.605469 -2.777344 2.300781 -2.78125 L 2.027344 -2.78125 L 2.027344 0 Z M 2.027344 -3.84375 L 3.019531 -3.84375 C 3.660156 -3.84375 4.0625 -3.871094 4.226563 -3.925781 C 4.382813 -3.980469 4.511719 -4.074219 4.605469 -4.207031 C 4.695313 -4.339844 4.738281 -4.503906 4.742188 -4.707031 C 4.738281 -4.925781 4.679688 -5.109375 4.5625 -5.25 C 4.441406 -5.386719 4.273438 -5.472656 4.058594 -5.511719 C 3.945313 -5.523438 3.617188 -5.53125 3.074219 -5.535156 L 2.027344 -5.535156 Z M 2.027344 -3.84375 "
-           id="path8204" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph12-0">
-        <path
-           style="stroke:none;"
-           d="M 0.683594 -6.757813 L 3.175781 -6.757813 C 3.738281 -6.753906 4.164063 -6.710938 4.460938 -6.625 C 4.851563 -6.507813 5.191406 -6.300781 5.476563 -6.003906 C 5.757813 -5.703125 5.972656 -5.339844 6.125 -4.910156 C 6.269531 -4.476563 6.34375 -3.941406 6.347656 -3.3125 C 6.34375 -2.753906 6.273438 -2.277344 6.136719 -1.875 C 5.964844 -1.382813 5.726563 -0.984375 5.414063 -0.683594 C 5.175781 -0.449219 4.855469 -0.269531 4.457031 -0.144531 C 4.152344 -0.046875 3.75 0 3.25 0 L 0.683594 0 Z M 2.046875 -5.613281 L 2.046875 -1.136719 L 3.066406 -1.136719 C 3.441406 -1.136719 3.71875 -1.15625 3.890625 -1.203125 C 4.109375 -1.253906 4.292969 -1.347656 4.441406 -1.484375 C 4.585938 -1.613281 4.703125 -1.832031 4.796875 -2.136719 C 4.886719 -2.4375 4.933594 -2.851563 4.9375 -3.375 C 4.933594 -3.894531 4.886719 -4.292969 4.796875 -4.574219 C 4.703125 -4.851563 4.574219 -5.070313 4.410156 -5.230469 C 4.242188 -5.382813 4.03125 -5.488281 3.777344 -5.546875 C 3.585938 -5.589844 3.214844 -5.613281 2.660156 -5.613281 Z M 2.046875 -5.613281 "
-           id="path8207" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph12-1">
-        <path
-           style="stroke:none;"
-           d="M 3.511719 -1.558594 L 4.800781 -1.339844 C 4.632813 -0.867188 4.371094 -0.507813 4.015625 -0.261719 C 3.65625 -0.015625 3.210938 0.105469 2.671875 0.109375 C 1.820313 0.105469 1.191406 -0.167969 0.785156 -0.722656 C 0.460938 -1.164063 0.296875 -1.726563 0.300781 -2.410156 C 0.296875 -3.214844 0.507813 -3.851563 0.933594 -4.3125 C 1.355469 -4.773438 1.894531 -5.003906 2.542969 -5.003906 C 3.269531 -5.003906 3.84375 -4.761719 4.265625 -4.28125 C 4.6875 -3.800781 4.886719 -3.066406 4.871094 -2.074219 L 1.625 -2.074219 C 1.632813 -1.6875 1.738281 -1.386719 1.9375 -1.175781 C 2.136719 -0.960938 2.386719 -0.855469 2.6875 -0.855469 C 2.886719 -0.855469 3.058594 -0.910156 3.199219 -1.019531 C 3.335938 -1.128906 3.4375 -1.308594 3.511719 -1.558594 Z M 3.585938 -2.867188 C 3.574219 -3.238281 3.476563 -3.523438 3.292969 -3.722656 C 3.109375 -3.914063 2.882813 -4.011719 2.621094 -4.015625 C 2.335938 -4.011719 2.105469 -3.910156 1.921875 -3.707031 C 1.734375 -3.496094 1.640625 -3.214844 1.648438 -2.867188 Z M 3.585938 -2.867188 "
-           id="path8210" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph12-2">
-        <path
-           style="stroke:none;"
-           d="M 2.023438 0 L 0.0507813 -4.894531 L 1.410156 -4.894531 L 2.332031 -2.394531 L 2.597656 -1.5625 C 2.667969 -1.769531 2.714844 -1.910156 2.734375 -1.980469 C 2.773438 -2.117188 2.820313 -2.253906 2.871094 -2.394531 L 3.800781 -4.894531 L 5.132813 -4.894531 L 3.1875 0 Z M 2.023438 0 "
-           id="path8213" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph12-3">
-        <path
-           style="stroke:none;"
-           d="M 5.007813 -2.484375 L 6.332031 -2.066406 C 6.125 -1.324219 5.789063 -0.777344 5.320313 -0.421875 C 4.847656 -0.0664063 4.25 0.109375 3.527344 0.113281 C 2.628906 0.109375 1.890625 -0.191406 1.3125 -0.804688 C 0.734375 -1.414063 0.445313 -2.253906 0.445313 -3.316406 C 0.445313 -4.441406 0.734375 -5.3125 1.316406 -5.9375 C 1.894531 -6.554688 2.660156 -6.867188 3.609375 -6.871094 C 4.433594 -6.867188 5.101563 -6.621094 5.621094 -6.136719 C 5.925781 -5.847656 6.15625 -5.433594 6.3125 -4.894531 L 4.964844 -4.570313 C 4.878906 -4.921875 4.710938 -5.199219 4.460938 -5.402344 C 4.203125 -5.605469 3.898438 -5.707031 3.539063 -5.707031 C 3.039063 -5.707031 2.632813 -5.523438 2.320313 -5.164063 C 2.007813 -4.800781 1.851563 -4.21875 1.851563 -3.417969 C 1.851563 -2.558594 2.003906 -1.953125 2.3125 -1.59375 C 2.617188 -1.230469 3.015625 -1.046875 3.511719 -1.050781 C 3.871094 -1.046875 4.183594 -1.164063 4.445313 -1.394531 C 4.707031 -1.625 4.894531 -1.988281 5.007813 -2.484375 Z M 5.007813 -2.484375 "
-           id="path8216" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph12-4">
-        <path
-           style="stroke:none;"
-           d="M 0.640625 -4.894531 L 1.847656 -4.894531 L 1.847656 -4.175781 C 2 -4.417969 2.214844 -4.617188 2.484375 -4.773438 C 2.75 -4.925781 3.046875 -5.003906 3.375 -5.003906 C 3.941406 -5.003906 4.421875 -4.78125 4.820313 -4.335938 C 5.214844 -3.890625 5.414063 -3.269531 5.414063 -2.476563 C 5.414063 -1.65625 5.210938 -1.019531 4.8125 -0.570313 C 4.410156 -0.117188 3.929688 0.105469 3.363281 0.109375 C 3.089844 0.105469 2.84375 0.0546875 2.628906 -0.0507813 C 2.40625 -0.15625 2.175781 -0.339844 1.933594 -0.605469 L 1.933594 1.863281 L 0.640625 1.863281 Z M 1.921875 -2.53125 C 1.921875 -1.980469 2.027344 -1.574219 2.246094 -1.3125 C 2.460938 -1.046875 2.730469 -0.914063 3.046875 -0.917969 C 3.34375 -0.914063 3.59375 -1.035156 3.796875 -1.277344 C 3.996094 -1.519531 4.09375 -1.914063 4.097656 -2.464844 C 4.09375 -2.976563 3.992188 -3.355469 3.789063 -3.605469 C 3.582031 -3.851563 3.328125 -3.976563 3.023438 -3.980469 C 2.703125 -3.976563 2.4375 -3.855469 2.234375 -3.613281 C 2.023438 -3.367188 1.921875 -3.007813 1.921875 -2.53125 Z M 1.921875 -2.53125 "
-           id="path8219" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph12-5">
-        <path
-           style="stroke:none;"
-           d="M 3.898438 0 L 3.898438 -0.734375 C 3.71875 -0.46875 3.484375 -0.261719 3.195313 -0.113281 C 2.902344 0.0351563 2.597656 0.105469 2.277344 0.109375 C 1.945313 0.105469 1.648438 0.0351563 1.390625 -0.105469 C 1.125 -0.25 0.9375 -0.453125 0.824219 -0.714844 C 0.703125 -0.972656 0.644531 -1.332031 0.648438 -1.796875 L 0.648438 -4.894531 L 1.945313 -4.894531 L 1.945313 -2.644531 C 1.945313 -1.953125 1.96875 -1.53125 2.015625 -1.378906 C 2.0625 -1.222656 2.148438 -1.101563 2.273438 -1.011719 C 2.398438 -0.917969 2.558594 -0.871094 2.753906 -0.875 C 2.972656 -0.871094 3.167969 -0.933594 3.347656 -1.054688 C 3.519531 -1.175781 3.640625 -1.324219 3.707031 -1.507813 C 3.769531 -1.683594 3.800781 -2.125 3.804688 -2.828125 L 3.804688 -4.894531 L 5.101563 -4.894531 L 5.101563 0 Z M 3.898438 0 "
-           id="path8222" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph13-0">
-        <path
-           style="stroke:none;"
-           d="M 0.671875 -6.664063 L 3.132813 -6.664063 C 3.683594 -6.660156 4.105469 -6.621094 4.402344 -6.539063 C 4.789063 -6.421875 5.125 -6.214844 5.40625 -5.925781 C 5.683594 -5.628906 5.894531 -5.269531 6.042969 -4.84375 C 6.183594 -4.417969 6.257813 -3.894531 6.261719 -3.269531 C 6.257813 -2.71875 6.1875 -2.246094 6.054688 -1.851563 C 5.882813 -1.367188 5.648438 -0.972656 5.34375 -0.671875 C 5.109375 -0.441406 4.792969 -0.265625 4.398438 -0.140625 C 4.097656 -0.046875 3.699219 0 3.207031 0 L 0.671875 0 Z M 2.019531 -5.539063 L 2.019531 -1.125 L 3.023438 -1.125 C 3.398438 -1.125 3.667969 -1.144531 3.835938 -1.1875 C 4.050781 -1.238281 4.230469 -1.332031 4.378906 -1.464844 C 4.519531 -1.59375 4.640625 -1.804688 4.734375 -2.105469 C 4.824219 -2.402344 4.867188 -2.8125 4.871094 -3.328125 C 4.867188 -3.84375 4.824219 -4.238281 4.734375 -4.515625 C 4.640625 -4.789063 4.511719 -5.003906 4.351563 -5.160156 C 4.183594 -5.3125 3.976563 -5.417969 3.730469 -5.472656 C 3.539063 -5.515625 3.171875 -5.535156 2.625 -5.539063 Z M 2.019531 -5.539063 "
-           id="path8225" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph13-1">
-        <path
-           style="stroke:none;"
-           d="M 3.464844 -1.535156 L 4.738281 -1.324219 C 4.570313 -0.855469 4.3125 -0.496094 3.960938 -0.253906 C 3.609375 -0.0117188 3.167969 0.105469 2.636719 0.109375 C 1.796875 0.105469 1.175781 -0.167969 0.773438 -0.714844 C 0.453125 -1.152344 0.292969 -1.707031 0.296875 -2.378906 C 0.292969 -3.175781 0.5 -3.800781 0.921875 -4.257813 C 1.335938 -4.707031 1.867188 -4.933594 2.511719 -4.9375 C 3.226563 -4.933594 3.792969 -4.695313 4.207031 -4.226563 C 4.621094 -3.75 4.820313 -3.023438 4.804688 -2.046875 L 1.605469 -2.046875 C 1.613281 -1.664063 1.714844 -1.371094 1.914063 -1.160156 C 2.109375 -0.949219 2.355469 -0.84375 2.652344 -0.847656 C 2.847656 -0.84375 3.015625 -0.898438 3.152344 -1.007813 C 3.289063 -1.117188 3.390625 -1.292969 3.464844 -1.535156 Z M 3.539063 -2.828125 C 3.527344 -3.195313 3.429688 -3.476563 3.25 -3.671875 C 3.066406 -3.863281 2.84375 -3.960938 2.585938 -3.960938 C 2.304688 -3.960938 2.074219 -3.859375 1.894531 -3.65625 C 1.710938 -3.453125 1.625 -3.175781 1.628906 -2.828125 Z M 3.539063 -2.828125 "
-           id="path8228" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph13-2">
-        <path
-           style="stroke:none;"
-           d="M 1.996094 0 L 0.0507813 -4.828125 L 1.390625 -4.828125 L 2.300781 -2.363281 L 2.566406 -1.542969 C 2.632813 -1.746094 2.675781 -1.886719 2.695313 -1.957031 C 2.738281 -2.089844 2.78125 -2.222656 2.832031 -2.363281 L 3.75 -4.828125 L 5.066406 -4.828125 L 3.148438 0 Z M 1.996094 0 "
-           id="path8231" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph13-3">
-        <path
-           style="stroke:none;"
-           d="M 4.941406 -2.449219 L 6.246094 -2.035156 C 6.046875 -1.304688 5.714844 -0.765625 5.25 -0.414063 C 4.78125 -0.0625 4.1875 0.109375 3.476563 0.113281 C 2.589844 0.109375 1.863281 -0.1875 1.296875 -0.792969 C 0.722656 -1.394531 0.4375 -2.222656 0.441406 -3.273438 C 0.4375 -4.378906 0.726563 -5.242188 1.300781 -5.859375 C 1.871094 -6.472656 2.621094 -6.777344 3.558594 -6.78125 C 4.371094 -6.777344 5.035156 -6.535156 5.546875 -6.054688 C 5.847656 -5.765625 6.074219 -5.359375 6.230469 -4.828125 L 4.898438 -4.511719 C 4.816406 -4.855469 4.648438 -5.125 4.402344 -5.328125 C 4.148438 -5.523438 3.847656 -5.625 3.492188 -5.628906 C 2.996094 -5.625 2.59375 -5.449219 2.289063 -5.097656 C 1.980469 -4.742188 1.828125 -4.167969 1.828125 -3.375 C 1.828125 -2.53125 1.976563 -1.929688 2.28125 -1.570313 C 2.582031 -1.210938 2.976563 -1.03125 3.464844 -1.035156 C 3.816406 -1.03125 4.125 -1.144531 4.386719 -1.375 C 4.640625 -1.601563 4.824219 -1.960938 4.941406 -2.449219 Z M 4.941406 -2.449219 "
-           id="path8234" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph13-4">
-        <path
-           style="stroke:none;"
-           d="M 3.847656 0 L 3.847656 -0.722656 C 3.667969 -0.460938 3.4375 -0.257813 3.152344 -0.113281 C 2.863281 0.0351563 2.558594 0.105469 2.246094 0.109375 C 1.917969 0.105469 1.628906 0.0390625 1.371094 -0.101563 C 1.113281 -0.242188 0.925781 -0.445313 0.8125 -0.703125 C 0.695313 -0.960938 0.636719 -1.316406 0.640625 -1.773438 L 0.640625 -4.828125 L 1.917969 -4.828125 L 1.917969 -2.609375 C 1.914063 -1.925781 1.9375 -1.511719 1.988281 -1.359375 C 2.03125 -1.207031 2.117188 -1.085938 2.246094 -0.996094 C 2.367188 -0.90625 2.527344 -0.859375 2.71875 -0.863281 C 2.9375 -0.859375 3.132813 -0.921875 3.304688 -1.042969 C 3.476563 -1.160156 3.59375 -1.308594 3.660156 -1.488281 C 3.71875 -1.664063 3.75 -2.097656 3.753906 -2.792969 L 3.753906 -4.828125 L 5.035156 -4.828125 L 5.035156 0 Z M 3.847656 0 "
-           id="path8237" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph13-5">
-        <path
-           style="stroke:none;"
-           d="M 5.097656 0 L 3.910156 0 L 3.910156 -0.710938 C 3.707031 -0.429688 3.476563 -0.226563 3.210938 -0.09375 C 2.941406 0.0390625 2.671875 0.105469 2.402344 0.109375 C 1.847656 0.105469 1.371094 -0.113281 0.976563 -0.558594 C 0.578125 -1.003906 0.378906 -1.628906 0.382813 -2.433594 C 0.378906 -3.246094 0.570313 -3.867188 0.957031 -4.296875 C 1.339844 -4.71875 1.828125 -4.933594 2.417969 -4.9375 C 2.953125 -4.933594 3.421875 -4.710938 3.820313 -4.265625 L 3.820313 -6.664063 L 5.097656 -6.664063 Z M 1.6875 -2.519531 C 1.6875 -2.003906 1.757813 -1.628906 1.902344 -1.402344 C 2.105469 -1.0625 2.394531 -0.894531 2.765625 -0.898438 C 3.058594 -0.894531 3.308594 -1.019531 3.515625 -1.273438 C 3.71875 -1.523438 3.820313 -1.898438 3.824219 -2.394531 C 3.820313 -2.945313 3.71875 -3.34375 3.523438 -3.59375 C 3.320313 -3.835938 3.066406 -3.960938 2.753906 -3.960938 C 2.449219 -3.960938 2.195313 -3.839844 1.992188 -3.597656 C 1.789063 -3.355469 1.6875 -2.996094 1.6875 -2.519531 Z M 1.6875 -2.519531 "
-           id="path8240" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph13-6">
-        <path
-           style="stroke:none;"
-           d="M 1.625 -3.355469 L 0.464844 -3.566406 C 0.589844 -4.03125 0.816406 -4.375 1.136719 -4.601563 C 1.453125 -4.820313 1.925781 -4.933594 2.554688 -4.9375 C 3.125 -4.933594 3.546875 -4.867188 3.828125 -4.734375 C 4.101563 -4.601563 4.300781 -4.429688 4.417969 -4.222656 C 4.53125 -4.011719 4.585938 -3.628906 4.589844 -3.078125 L 4.574219 -1.585938 C 4.570313 -1.160156 4.589844 -0.847656 4.632813 -0.644531 C 4.671875 -0.441406 4.75 -0.226563 4.867188 0 L 3.601563 0 C 3.566406 -0.0859375 3.523438 -0.210938 3.476563 -0.378906 C 3.453125 -0.453125 3.441406 -0.503906 3.433594 -0.527344 C 3.214844 -0.316406 2.980469 -0.15625 2.730469 -0.0507813 C 2.480469 0.0546875 2.214844 0.105469 1.9375 0.109375 C 1.4375 0.105469 1.046875 -0.0273438 0.761719 -0.292969 C 0.472656 -0.5625 0.328125 -0.902344 0.332031 -1.320313 C 0.328125 -1.585938 0.394531 -1.828125 0.527344 -2.046875 C 0.65625 -2.257813 0.835938 -2.421875 1.074219 -2.539063 C 1.308594 -2.648438 1.648438 -2.75 2.089844 -2.835938 C 2.6875 -2.945313 3.101563 -3.050781 3.332031 -3.152344 L 3.332031 -3.277344 C 3.328125 -3.519531 3.265625 -3.695313 3.148438 -3.804688 C 3.023438 -3.90625 2.796875 -3.960938 2.464844 -3.960938 C 2.234375 -3.960938 2.058594 -3.914063 1.933594 -3.824219 C 1.804688 -3.734375 1.699219 -3.578125 1.625 -3.355469 Z M 3.332031 -2.320313 C 3.164063 -2.265625 2.90625 -2.199219 2.554688 -2.125 C 2.199219 -2.046875 1.964844 -1.972656 1.859375 -1.902344 C 1.6875 -1.78125 1.605469 -1.628906 1.609375 -1.449219 C 1.605469 -1.269531 1.671875 -1.117188 1.808594 -0.988281 C 1.941406 -0.855469 2.113281 -0.789063 2.320313 -0.792969 C 2.546875 -0.789063 2.765625 -0.867188 2.976563 -1.019531 C 3.132813 -1.132813 3.234375 -1.273438 3.28125 -1.441406 C 3.3125 -1.550781 3.328125 -1.757813 3.332031 -2.0625 Z M 3.332031 -2.320313 "
-           id="path8243" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph13-7">
-        <path
-           style="stroke:none;"
-           d="M 0.683594 0 L 0.683594 -6.664063 L 3.515625 -6.664063 C 4.226563 -6.660156 4.742188 -6.601563 5.066406 -6.484375 C 5.386719 -6.363281 5.644531 -6.152344 5.839844 -5.847656 C 6.03125 -5.539063 6.128906 -5.1875 6.132813 -4.796875 C 6.128906 -4.292969 5.980469 -3.878906 5.691406 -3.558594 C 5.394531 -3.230469 4.957031 -3.027344 4.375 -2.941406 C 4.664063 -2.769531 4.902344 -2.582031 5.09375 -2.382813 C 5.28125 -2.175781 5.539063 -1.816406 5.859375 -1.300781 L 6.675781 0 L 5.066406 0 L 4.09375 -1.449219 C 3.746094 -1.964844 3.507813 -2.292969 3.382813 -2.429688 C 3.25 -2.5625 3.117188 -2.65625 2.976563 -2.707031 C 2.835938 -2.753906 2.609375 -2.777344 2.300781 -2.78125 L 2.027344 -2.78125 L 2.027344 0 Z M 2.027344 -3.847656 L 3.023438 -3.847656 C 3.667969 -3.84375 4.070313 -3.871094 4.230469 -3.929688 C 4.390625 -3.980469 4.515625 -4.074219 4.609375 -4.210938 C 4.699219 -4.339844 4.746094 -4.507813 4.746094 -4.710938 C 4.746094 -4.933594 4.683594 -5.113281 4.566406 -5.253906 C 4.441406 -5.390625 4.273438 -5.476563 4.058594 -5.515625 C 3.949219 -5.527344 3.621094 -5.535156 3.078125 -5.539063 L 2.027344 -5.539063 Z M 2.027344 -3.847656 "
-           id="path8246" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph13-8">
-        <path
-           style="stroke:none;"
-           d="M 2.882813 -4.828125 L 2.882813 -3.808594 L 2.011719 -3.808594 L 2.011719 -1.863281 C 2.007813 -1.46875 2.015625 -1.238281 2.035156 -1.175781 C 2.046875 -1.109375 2.085938 -1.054688 2.148438 -1.011719 C 2.207031 -0.96875 2.277344 -0.949219 2.363281 -0.949219 C 2.480469 -0.949219 2.652344 -0.988281 2.878906 -1.074219 L 2.988281 -0.0820313 C 2.691406 0.0429688 2.351563 0.105469 1.976563 0.109375 C 1.742188 0.105469 1.535156 0.0703125 1.355469 -0.00390625 C 1.167969 -0.078125 1.035156 -0.179688 0.949219 -0.304688 C 0.863281 -0.429688 0.800781 -0.59375 0.769531 -0.804688 C 0.738281 -0.949219 0.726563 -1.25 0.726563 -1.707031 L 0.726563 -3.808594 L 0.140625 -3.808594 L 0.140625 -4.828125 L 0.726563 -4.828125 L 0.726563 -5.789063 L 2.011719 -6.535156 L 2.011719 -4.828125 Z M 2.882813 -4.828125 "
-           id="path8249" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph14-0">
-        <path
-           style="stroke:none;"
-           d="M 0.332031 -2.148438 L 1.628906 -2.273438 C 1.707031 -1.835938 1.863281 -1.515625 2.105469 -1.3125 C 2.339844 -1.109375 2.664063 -1.007813 3.070313 -1.007813 C 3.5 -1.007813 3.820313 -1.097656 4.039063 -1.28125 C 4.253906 -1.460938 4.363281 -1.671875 4.367188 -1.917969 C 4.363281 -2.070313 4.320313 -2.203125 4.230469 -2.316406 C 4.136719 -2.421875 3.976563 -2.519531 3.75 -2.601563 C 3.59375 -2.65625 3.238281 -2.75 2.683594 -2.890625 C 1.96875 -3.0625 1.464844 -3.28125 1.179688 -3.542969 C 0.769531 -3.90625 0.566406 -4.347656 0.570313 -4.871094 C 0.566406 -5.203125 0.664063 -5.519531 0.855469 -5.8125 C 1.046875 -6.105469 1.320313 -6.328125 1.679688 -6.484375 C 2.035156 -6.636719 2.46875 -6.714844 2.980469 -6.714844 C 3.804688 -6.714844 4.429688 -6.53125 4.851563 -6.167969 C 5.269531 -5.804688 5.488281 -5.320313 5.511719 -4.714844 L 4.179688 -4.65625 C 4.117188 -4.992188 3.996094 -5.234375 3.8125 -5.386719 C 3.621094 -5.53125 3.339844 -5.605469 2.96875 -5.609375 C 2.578125 -5.605469 2.273438 -5.527344 2.058594 -5.371094 C 1.914063 -5.265625 1.84375 -5.128906 1.847656 -4.960938 C 1.84375 -4.800781 1.910156 -4.667969 2.042969 -4.5625 C 2.210938 -4.417969 2.621094 -4.269531 3.269531 -4.121094 C 3.917969 -3.964844 4.394531 -3.808594 4.707031 -3.644531 C 5.015625 -3.480469 5.261719 -3.253906 5.4375 -2.972656 C 5.609375 -2.683594 5.695313 -2.335938 5.699219 -1.921875 C 5.695313 -1.542969 5.589844 -1.1875 5.382813 -0.859375 C 5.171875 -0.527344 4.875 -0.28125 4.492188 -0.121094 C 4.105469 0.0390625 3.628906 0.117188 3.058594 0.117188 C 2.222656 0.117188 1.578125 -0.0742188 1.132813 -0.460938 C 0.683594 -0.84375 0.417969 -1.40625 0.332031 -2.148438 Z M 0.332031 -2.148438 "
-           id="path8252" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph14-1">
-        <path
-           style="stroke:none;"
-           d="M 2.855469 -4.78125 L 2.855469 -3.773438 L 1.992188 -3.773438 L 1.992188 -1.847656 C 1.992188 -1.453125 2 -1.226563 2.015625 -1.164063 C 2.03125 -1.097656 2.066406 -1.042969 2.125 -1.003906 C 2.183594 -0.957031 2.253906 -0.9375 2.339844 -0.941406 C 2.453125 -0.9375 2.625 -0.976563 2.851563 -1.0625 L 2.957031 -0.0820313 C 2.660156 0.0429688 2.328125 0.105469 1.957031 0.109375 C 1.730469 0.105469 1.523438 0.0703125 1.339844 -0.00390625 C 1.15625 -0.078125 1.023438 -0.179688 0.9375 -0.304688 C 0.851563 -0.425781 0.792969 -0.589844 0.761719 -0.796875 C 0.730469 -0.945313 0.714844 -1.242188 0.71875 -1.6875 L 0.71875 -3.773438 L 0.140625 -3.773438 L 0.140625 -4.78125 L 0.71875 -4.78125 L 0.71875 -5.730469 L 1.992188 -6.472656 L 1.992188 -4.78125 Z M 2.855469 -4.78125 "
-           id="path8255" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph14-2">
-        <path
-           style="stroke:none;"
-           d="M 1.875 0 L 0.609375 0 L 0.609375 -4.78125 L 1.785156 -4.78125 L 1.785156 -4.101563 C 1.984375 -4.421875 2.164063 -4.632813 2.324219 -4.738281 C 2.484375 -4.835938 2.667969 -4.886719 2.871094 -4.890625 C 3.160156 -4.886719 3.4375 -4.808594 3.707031 -4.652344 L 3.3125 -3.546875 C 3.097656 -3.679688 2.902344 -3.75 2.71875 -3.753906 C 2.539063 -3.75 2.386719 -3.699219 2.269531 -3.605469 C 2.144531 -3.503906 2.050781 -3.328125 1.980469 -3.078125 C 1.910156 -2.820313 1.875 -2.289063 1.875 -1.476563 Z M 1.875 0 "
-           id="path8258" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph14-3">
-        <path
-           style="stroke:none;"
-           d="M 3.429688 -1.523438 L 4.691406 -1.308594 C 4.527344 -0.84375 4.269531 -0.492188 3.921875 -0.253906 C 3.570313 -0.0117188 3.136719 0.105469 2.613281 0.109375 C 1.777344 0.105469 1.160156 -0.164063 0.765625 -0.707031 C 0.449219 -1.140625 0.292969 -1.691406 0.292969 -2.355469 C 0.292969 -3.144531 0.5 -3.761719 0.914063 -4.214844 C 1.328125 -4.660156 1.851563 -4.886719 2.484375 -4.890625 C 3.195313 -4.886719 3.753906 -4.652344 4.167969 -4.183594 C 4.574219 -3.714844 4.773438 -2.996094 4.757813 -2.027344 L 1.589844 -2.027344 C 1.59375 -1.652344 1.695313 -1.359375 1.894531 -1.148438 C 2.085938 -0.9375 2.332031 -0.832031 2.625 -0.835938 C 2.820313 -0.832031 2.984375 -0.886719 3.125 -0.996094 C 3.257813 -1.105469 3.359375 -1.28125 3.429688 -1.523438 Z M 3.503906 -2.800781 C 3.492188 -3.167969 3.394531 -3.445313 3.21875 -3.636719 C 3.035156 -3.824219 2.816406 -3.917969 2.5625 -3.921875 C 2.28125 -3.917969 2.054688 -3.820313 1.878906 -3.621094 C 1.695313 -3.417969 1.605469 -3.144531 1.613281 -2.800781 Z M 3.503906 -2.800781 "
-           id="path8261" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph14-4">
-        <path
-           style="stroke:none;"
-           d="M 1.609375 -3.324219 L 0.460938 -3.53125 C 0.585938 -3.988281 0.808594 -4.328125 1.125 -4.554688 C 1.4375 -4.773438 1.90625 -4.886719 2.53125 -4.890625 C 3.09375 -4.886719 3.511719 -4.820313 3.792969 -4.6875 C 4.066406 -4.554688 4.261719 -4.382813 4.375 -4.179688 C 4.484375 -3.96875 4.539063 -3.59375 4.542969 -3.046875 L 4.53125 -1.570313 C 4.527344 -1.148438 4.546875 -0.835938 4.589844 -0.640625 C 4.628906 -0.4375 4.707031 -0.226563 4.816406 0 L 3.566406 0 C 3.53125 -0.0820313 3.492188 -0.207031 3.445313 -0.375 C 3.421875 -0.449219 3.40625 -0.496094 3.398438 -0.523438 C 3.179688 -0.308594 2.945313 -0.152344 2.703125 -0.046875 C 2.453125 0.0546875 2.191406 0.105469 1.917969 0.109375 C 1.421875 0.105469 1.035156 -0.0273438 0.753906 -0.292969 C 0.46875 -0.558594 0.328125 -0.894531 0.328125 -1.304688 C 0.328125 -1.574219 0.390625 -1.816406 0.519531 -2.027344 C 0.648438 -2.238281 0.828125 -2.398438 1.0625 -2.515625 C 1.292969 -2.625 1.628906 -2.722656 2.070313 -2.808594 C 2.65625 -2.917969 3.066406 -3.023438 3.300781 -3.121094 L 3.300781 -3.246094 C 3.296875 -3.488281 3.238281 -3.660156 3.121094 -3.765625 C 3 -3.867188 2.773438 -3.917969 2.441406 -3.921875 C 2.210938 -3.917969 2.035156 -3.875 1.914063 -3.789063 C 1.785156 -3.699219 1.683594 -3.542969 1.609375 -3.324219 Z M 3.300781 -2.296875 C 3.136719 -2.242188 2.882813 -2.175781 2.53125 -2.101563 C 2.179688 -2.027344 1.949219 -1.953125 1.839844 -1.882813 C 1.675781 -1.761719 1.59375 -1.613281 1.59375 -1.4375 C 1.59375 -1.257813 1.660156 -1.105469 1.792969 -0.976563 C 1.925781 -0.847656 2.09375 -0.785156 2.296875 -0.785156 C 2.523438 -0.785156 2.738281 -0.859375 2.949219 -1.007813 C 3.097656 -1.121094 3.199219 -1.257813 3.25 -1.425781 C 3.28125 -1.53125 3.296875 -1.738281 3.300781 -2.042969 Z M 3.300781 -2.296875 "
-           id="path8264" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph14-5">
-        <path
-           style="stroke:none;"
-           d="M 0.566406 -4.78125 L 1.734375 -4.78125 L 1.734375 -4.128906 C 2.148438 -4.632813 2.644531 -4.886719 3.222656 -4.890625 C 3.527344 -4.886719 3.792969 -4.824219 4.019531 -4.699219 C 4.246094 -4.574219 4.429688 -4.382813 4.574219 -4.128906 C 4.78125 -4.382813 5.007813 -4.574219 5.253906 -4.699219 C 5.496094 -4.824219 5.757813 -4.886719 6.035156 -4.890625 C 6.386719 -4.886719 6.683594 -4.816406 6.925781 -4.675781 C 7.167969 -4.53125 7.347656 -4.320313 7.46875 -4.046875 C 7.554688 -3.839844 7.597656 -3.511719 7.601563 -3.058594 L 7.601563 0 L 6.335938 0 L 6.335938 -2.734375 C 6.332031 -3.207031 6.289063 -3.511719 6.207031 -3.652344 C 6.085938 -3.828125 5.90625 -3.917969 5.664063 -3.921875 C 5.484375 -3.917969 5.316406 -3.863281 5.164063 -3.757813 C 5.003906 -3.648438 4.890625 -3.492188 4.824219 -3.285156 C 4.753906 -3.074219 4.722656 -2.746094 4.722656 -2.296875 L 4.722656 0 L 3.457031 0 L 3.457031 -2.621094 C 3.457031 -3.082031 3.433594 -3.382813 3.390625 -3.519531 C 3.34375 -3.652344 3.273438 -3.753906 3.179688 -3.820313 C 3.082031 -3.886719 2.953125 -3.917969 2.796875 -3.921875 C 2.597656 -3.917969 2.421875 -3.867188 2.269531 -3.761719 C 2.109375 -3.65625 2 -3.503906 1.933594 -3.308594 C 1.863281 -3.109375 1.828125 -2.78125 1.832031 -2.324219 L 1.832031 0 L 0.566406 0 Z M 0.566406 -4.78125 "
-           id="path8267" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph14-6">
-        <path
-           style="stroke:none;"
-           d="M 4.894531 -2.425781 L 6.1875 -2.015625 C 5.988281 -1.296875 5.65625 -0.761719 5.199219 -0.410156 C 4.734375 -0.0625 4.152344 0.109375 3.445313 0.113281 C 2.566406 0.109375 1.84375 -0.1875 1.28125 -0.785156 C 0.714844 -1.382813 0.433594 -2.199219 0.4375 -3.242188 C 0.433594 -4.335938 0.71875 -5.191406 1.289063 -5.800781 C 1.855469 -6.410156 2.601563 -6.714844 3.527344 -6.714844 C 4.332031 -6.714844 4.984375 -6.476563 5.492188 -6 C 5.789063 -5.714844 6.015625 -5.308594 6.167969 -4.78125 L 4.851563 -4.46875 C 4.773438 -4.808594 4.609375 -5.078125 4.359375 -5.277344 C 4.109375 -5.472656 3.808594 -5.570313 3.457031 -5.574219 C 2.964844 -5.570313 2.566406 -5.394531 2.265625 -5.046875 C 1.957031 -4.691406 1.804688 -4.125 1.808594 -3.339844 C 1.804688 -2.503906 1.953125 -1.910156 2.257813 -1.558594 C 2.554688 -1.203125 2.945313 -1.023438 3.429688 -1.027344 C 3.78125 -1.023438 4.085938 -1.136719 4.34375 -1.363281 C 4.59375 -1.585938 4.777344 -1.941406 4.894531 -2.425781 Z M 4.894531 -2.425781 "
-           id="path8270" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph14-7">
-        <path
-           style="stroke:none;"
-           d="M 0.625 -4.78125 L 1.804688 -4.78125 L 1.804688 -4.078125 C 1.957031 -4.320313 2.164063 -4.515625 2.425781 -4.664063 C 2.6875 -4.8125 2.976563 -4.886719 3.296875 -4.890625 C 3.847656 -4.886719 4.320313 -4.667969 4.710938 -4.238281 C 5.097656 -3.800781 5.289063 -3.195313 5.292969 -2.417969 C 5.289063 -1.617188 5.09375 -0.996094 4.707031 -0.554688 C 4.3125 -0.113281 3.839844 0.105469 3.289063 0.109375 C 3.019531 0.105469 2.78125 0.0546875 2.566406 -0.046875 C 2.351563 -0.152344 2.125 -0.332031 1.890625 -0.589844 L 1.890625 1.820313 L 0.625 1.820313 Z M 1.878906 -2.472656 C 1.875 -1.933594 1.980469 -1.535156 2.195313 -1.28125 C 2.40625 -1.019531 2.667969 -0.890625 2.976563 -0.894531 C 3.269531 -0.890625 3.511719 -1.007813 3.710938 -1.246094 C 3.902344 -1.480469 4 -1.867188 4.003906 -2.410156 C 4 -2.90625 3.898438 -3.277344 3.699219 -3.523438 C 3.496094 -3.765625 3.25 -3.890625 2.953125 -3.890625 C 2.640625 -3.890625 2.382813 -3.769531 2.183594 -3.53125 C 1.976563 -3.292969 1.875 -2.941406 1.878906 -2.472656 Z M 1.878906 -2.472656 "
-           id="path8273" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph14-8">
-        <path
-           style="stroke:none;"
-           d="M 3.808594 0 L 3.808594 -0.714844 C 3.632813 -0.457031 3.402344 -0.253906 3.121094 -0.109375 C 2.835938 0.0351563 2.535156 0.105469 2.222656 0.109375 C 1.898438 0.105469 1.613281 0.0390625 1.359375 -0.101563 C 1.101563 -0.242188 0.914063 -0.441406 0.804688 -0.699219 C 0.6875 -0.949219 0.632813 -1.304688 0.636719 -1.757813 L 0.636719 -4.78125 L 1.898438 -4.78125 L 1.898438 -2.585938 C 1.898438 -1.90625 1.921875 -1.496094 1.96875 -1.347656 C 2.015625 -1.195313 2.097656 -1.074219 2.222656 -0.988281 C 2.34375 -0.898438 2.5 -0.855469 2.691406 -0.855469 C 2.902344 -0.855469 3.097656 -0.914063 3.273438 -1.03125 C 3.441406 -1.148438 3.558594 -1.292969 3.625 -1.472656 C 3.683594 -1.644531 3.714844 -2.078125 3.71875 -2.765625 L 3.71875 -4.78125 L 4.984375 -4.78125 L 4.984375 0 Z M 3.808594 0 "
-           id="path8276" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph14-9">
-        <path
-           style="stroke:none;"
-           d="M 6.625 0 L 5.175781 0 L 4.597656 -1.5 L 1.957031 -1.5 L 1.414063 0 L 0 0 L 2.570313 -6.601563 L 3.980469 -6.601563 Z M 4.167969 -2.613281 L 3.261719 -5.0625 L 2.367188 -2.613281 Z M 4.167969 -2.613281 "
-           id="path8279" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph14-10">
-        <path
-           style="stroke:none;"
-           d="M 0.214844 -1.363281 L 1.484375 -1.558594 C 1.535156 -1.308594 1.644531 -1.121094 1.8125 -0.996094 C 1.976563 -0.867188 2.210938 -0.800781 2.507813 -0.804688 C 2.835938 -0.800781 3.082031 -0.863281 3.246094 -0.984375 C 3.355469 -1.070313 3.410156 -1.183594 3.414063 -1.324219 C 3.410156 -1.417969 3.378906 -1.496094 3.324219 -1.5625 C 3.257813 -1.621094 3.117188 -1.675781 2.898438 -1.730469 C 1.875 -1.949219 1.230469 -2.15625 0.960938 -2.347656 C 0.578125 -2.601563 0.386719 -2.960938 0.390625 -3.421875 C 0.386719 -3.835938 0.550781 -4.183594 0.882813 -4.464844 C 1.207031 -4.746094 1.714844 -4.886719 2.40625 -4.890625 C 3.058594 -4.886719 3.542969 -4.78125 3.863281 -4.570313 C 4.175781 -4.355469 4.394531 -4.039063 4.519531 -3.625 L 3.328125 -3.40625 C 3.273438 -3.585938 3.175781 -3.730469 3.035156 -3.832031 C 2.894531 -3.929688 2.691406 -3.976563 2.425781 -3.980469 C 2.089844 -3.976563 1.851563 -3.929688 1.710938 -3.839844 C 1.613281 -3.769531 1.5625 -3.6875 1.566406 -3.585938 C 1.5625 -3.492188 1.605469 -3.414063 1.691406 -3.355469 C 1.804688 -3.269531 2.199219 -3.152344 2.875 -3 C 3.546875 -2.84375 4.015625 -2.65625 4.285156 -2.4375 C 4.546875 -2.210938 4.679688 -1.894531 4.683594 -1.496094 C 4.679688 -1.050781 4.496094 -0.675781 4.132813 -0.363281 C 3.765625 -0.0507813 3.226563 0.105469 2.507813 0.109375 C 1.855469 0.105469 1.339844 -0.0234375 0.960938 -0.285156 C 0.578125 -0.546875 0.328125 -0.90625 0.214844 -1.363281 Z M 0.214844 -1.363281 "
-           id="path8282" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph14-11">
-        <path
-           style="stroke:none;"
-           d="M 0.0625 -4.78125 L 1.410156 -4.78125 L 2.554688 -1.386719 L 3.671875 -4.78125 L 4.980469 -4.78125 L 3.292969 -0.179688 L 2.988281 0.652344 C 2.878906 0.925781 2.773438 1.140625 2.671875 1.289063 C 2.570313 1.4375 2.453125 1.554688 2.324219 1.648438 C 2.191406 1.738281 2.03125 1.8125 1.84375 1.863281 C 1.652344 1.914063 1.4375 1.9375 1.199219 1.941406 C 0.953125 1.9375 0.714844 1.914063 0.480469 1.863281 L 0.371094 0.875 C 0.566406 0.910156 0.746094 0.929688 0.90625 0.933594 C 1.199219 0.929688 1.417969 0.84375 1.558594 0.671875 C 1.699219 0.496094 1.804688 0.277344 1.882813 0.0117188 Z M 0.0625 -4.78125 "
-           id="path8285" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph14-12">
-        <path
-           style="stroke:none;"
-           d="M 5.011719 0 L 3.746094 0 L 3.746094 -2.441406 C 3.742188 -2.953125 3.714844 -3.285156 3.664063 -3.441406 C 3.609375 -3.589844 3.523438 -3.710938 3.402344 -3.796875 C 3.277344 -3.878906 3.132813 -3.917969 2.964844 -3.921875 C 2.742188 -3.917969 2.542969 -3.859375 2.371094 -3.742188 C 2.195313 -3.621094 2.078125 -3.460938 2.015625 -3.265625 C 1.949219 -3.0625 1.914063 -2.695313 1.917969 -2.164063 L 1.917969 0 L 0.652344 0 L 0.652344 -4.78125 L 1.828125 -4.78125 L 1.828125 -4.078125 C 2.242188 -4.617188 2.765625 -4.886719 3.40625 -4.890625 C 3.679688 -4.886719 3.933594 -4.835938 4.167969 -4.738281 C 4.394531 -4.636719 4.570313 -4.507813 4.691406 -4.355469 C 4.808594 -4.195313 4.890625 -4.019531 4.941406 -3.824219 C 4.984375 -3.625 5.007813 -3.339844 5.011719 -2.972656 Z M 5.011719 0 "
-           id="path8288" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph14-13">
-        <path
-           style="stroke:none;"
-           d="M 4.832031 -3.367188 L 3.585938 -3.144531 C 3.539063 -3.390625 3.441406 -3.578125 3.296875 -3.703125 C 3.144531 -3.828125 2.953125 -3.890625 2.71875 -3.894531 C 2.402344 -3.890625 2.148438 -3.78125 1.964844 -3.566406 C 1.773438 -3.347656 1.679688 -2.984375 1.683594 -2.476563 C 1.679688 -1.90625 1.777344 -1.503906 1.96875 -1.273438 C 2.160156 -1.035156 2.414063 -0.917969 2.738281 -0.921875 C 2.972656 -0.917969 3.171875 -0.988281 3.328125 -1.125 C 3.480469 -1.261719 3.585938 -1.496094 3.652344 -1.832031 L 4.894531 -1.621094 C 4.761719 -1.046875 4.515625 -0.617188 4.152344 -0.328125 C 3.78125 -0.0390625 3.289063 0.105469 2.679688 0.109375 C 1.976563 0.105469 1.417969 -0.113281 1.003906 -0.550781 C 0.585938 -0.992188 0.378906 -1.601563 0.382813 -2.386719 C 0.378906 -3.171875 0.589844 -3.789063 1.007813 -4.230469 C 1.421875 -4.667969 1.984375 -4.886719 2.703125 -4.890625 C 3.28125 -4.886719 3.746094 -4.761719 4.089844 -4.511719 C 4.433594 -4.261719 4.679688 -3.878906 4.832031 -3.367188 Z M 4.832031 -3.367188 "
-           id="path8291" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph15-0">
-        <path
-           style="stroke:none;"
-           d="M 0.273438 -1.761719 L 1.335938 -1.863281 C 1.398438 -1.503906 1.527344 -1.242188 1.726563 -1.078125 C 1.917969 -0.910156 2.179688 -0.824219 2.515625 -0.828125 C 2.867188 -0.824219 3.132813 -0.898438 3.3125 -1.050781 C 3.488281 -1.195313 3.578125 -1.371094 3.582031 -1.574219 C 3.578125 -1.699219 3.542969 -1.808594 3.46875 -1.898438 C 3.390625 -1.988281 3.257813 -2.066406 3.074219 -2.132813 C 2.945313 -2.175781 2.652344 -2.253906 2.199219 -2.371094 C 1.609375 -2.511719 1.199219 -2.691406 0.96875 -2.90625 C 0.632813 -3.203125 0.46875 -3.5625 0.46875 -3.992188 C 0.46875 -4.265625 0.546875 -4.523438 0.703125 -4.765625 C 0.859375 -5.003906 1.082031 -5.1875 1.378906 -5.316406 C 1.667969 -5.4375 2.023438 -5.5 2.445313 -5.503906 C 3.117188 -5.5 3.628906 -5.351563 3.976563 -5.054688 C 4.316406 -4.753906 4.496094 -4.355469 4.519531 -3.863281 L 3.425781 -3.816406 C 3.375 -4.089844 3.273438 -4.289063 3.125 -4.414063 C 2.96875 -4.53125 2.738281 -4.59375 2.433594 -4.597656 C 2.113281 -4.59375 1.867188 -4.527344 1.6875 -4.402344 C 1.570313 -4.316406 1.511719 -4.207031 1.511719 -4.066406 C 1.511719 -3.9375 1.566406 -3.828125 1.675781 -3.738281 C 1.8125 -3.621094 2.144531 -3.5 2.679688 -3.375 C 3.207031 -3.25 3.601563 -3.117188 3.859375 -2.984375 C 4.113281 -2.847656 4.3125 -2.667969 4.457031 -2.4375 C 4.597656 -2.203125 4.667969 -1.914063 4.671875 -1.578125 C 4.667969 -1.261719 4.582031 -0.972656 4.414063 -0.703125 C 4.238281 -0.433594 3.996094 -0.230469 3.683594 -0.101563 C 3.367188 0.03125 2.976563 0.09375 2.507813 0.0976563 C 1.820313 0.09375 1.292969 -0.0625 0.929688 -0.375 C 0.558594 -0.691406 0.339844 -1.152344 0.273438 -1.761719 Z M 0.273438 -1.761719 "
-           id="path8294" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph15-1">
-        <path
-           style="stroke:none;"
-           d="M 2.339844 -3.921875 L 2.339844 -3.09375 L 1.632813 -3.09375 L 1.632813 -1.511719 C 1.628906 -1.191406 1.636719 -1.003906 1.652344 -0.953125 C 1.664063 -0.898438 1.691406 -0.855469 1.742188 -0.824219 C 1.789063 -0.789063 1.847656 -0.773438 1.917969 -0.773438 C 2.011719 -0.773438 2.152344 -0.804688 2.335938 -0.871094 L 2.425781 -0.0664063 C 2.183594 0.0390625 1.910156 0.0859375 1.605469 0.0898438 C 1.417969 0.0859375 1.246094 0.0585938 1.097656 -0.00390625 C 0.945313 -0.0664063 0.835938 -0.148438 0.769531 -0.25 C 0.695313 -0.347656 0.648438 -0.480469 0.625 -0.652344 C 0.597656 -0.769531 0.585938 -1.015625 0.589844 -1.382813 L 0.589844 -3.09375 L 0.113281 -3.09375 L 0.113281 -3.921875 L 0.589844 -3.921875 L 0.589844 -4.699219 L 1.632813 -5.304688 L 1.632813 -3.921875 Z M 2.339844 -3.921875 "
-           id="path8297" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph15-2">
-        <path
-           style="stroke:none;"
-           d="M 1.535156 0 L 0.5 0 L 0.5 -3.921875 L 1.460938 -3.921875 L 1.460938 -3.363281 C 1.625 -3.625 1.773438 -3.796875 1.90625 -3.882813 C 2.035156 -3.964844 2.1875 -4.007813 2.355469 -4.007813 C 2.589844 -4.007813 2.816406 -3.941406 3.039063 -3.8125 L 2.71875 -2.910156 C 2.539063 -3.019531 2.375 -3.074219 2.230469 -3.078125 C 2.082031 -3.074219 1.957031 -3.035156 1.859375 -2.957031 C 1.757813 -2.875 1.679688 -2.730469 1.621094 -2.523438 C 1.5625 -2.3125 1.535156 -1.875 1.535156 -1.210938 Z M 1.535156 0 "
-           id="path8300" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph15-3">
-        <path
-           style="stroke:none;"
-           d="M 2.8125 -1.246094 L 3.847656 -1.074219 C 3.710938 -0.691406 3.5 -0.402344 3.214844 -0.207031 C 2.925781 -0.0078125 2.566406 0.0859375 2.140625 0.0898438 C 1.453125 0.0859375 0.949219 -0.132813 0.628906 -0.578125 C 0.363281 -0.933594 0.234375 -1.382813 0.238281 -1.929688 C 0.234375 -2.578125 0.402344 -3.089844 0.746094 -3.457031 C 1.082031 -3.824219 1.515625 -4.007813 2.039063 -4.007813 C 2.621094 -4.007813 3.078125 -3.8125 3.417969 -3.429688 C 3.753906 -3.039063 3.917969 -2.449219 3.902344 -1.660156 L 1.304688 -1.660156 C 1.308594 -1.347656 1.390625 -1.109375 1.550781 -0.941406 C 1.710938 -0.769531 1.910156 -0.683594 2.152344 -0.6875 C 2.308594 -0.683594 2.445313 -0.730469 2.558594 -0.820313 C 2.667969 -0.90625 2.753906 -1.046875 2.8125 -1.246094 Z M 2.871094 -2.296875 C 2.863281 -2.59375 2.785156 -2.820313 2.636719 -2.980469 C 2.488281 -3.132813 2.308594 -3.210938 2.101563 -3.214844 C 1.871094 -3.210938 1.683594 -3.128906 1.539063 -2.96875 C 1.386719 -2.800781 1.3125 -2.578125 1.320313 -2.296875 Z M 2.871094 -2.296875 "
-           id="path8303" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph15-4">
-        <path
-           style="stroke:none;"
-           d="M 1.316406 -2.722656 L 0.375 -2.894531 C 0.480469 -3.269531 0.660156 -3.546875 0.921875 -3.734375 C 1.175781 -3.914063 1.5625 -4.007813 2.074219 -4.007813 C 2.535156 -4.007813 2.878906 -3.953125 3.109375 -3.84375 C 3.332031 -3.734375 3.492188 -3.59375 3.585938 -3.425781 C 3.679688 -3.257813 3.726563 -2.949219 3.726563 -2.5 L 3.714844 -1.289063 C 3.714844 -0.945313 3.730469 -0.691406 3.761719 -0.527344 C 3.792969 -0.363281 3.855469 -0.1875 3.949219 0 L 2.921875 0 C 2.894531 -0.0664063 2.863281 -0.167969 2.824219 -0.304688 C 2.804688 -0.367188 2.789063 -0.40625 2.785156 -0.429688 C 2.605469 -0.25 2.417969 -0.121094 2.214844 -0.0390625 C 2.011719 0.046875 1.796875 0.0859375 1.574219 0.0898438 C 1.167969 0.0859375 0.847656 -0.0195313 0.617188 -0.238281 C 0.382813 -0.457031 0.269531 -0.734375 0.269531 -1.070313 C 0.269531 -1.289063 0.320313 -1.484375 0.425781 -1.660156 C 0.53125 -1.832031 0.679688 -1.964844 0.871094 -2.058594 C 1.0625 -2.148438 1.339844 -2.230469 1.699219 -2.304688 C 2.183594 -2.390625 2.519531 -2.476563 2.707031 -2.558594 L 2.707031 -2.660156 C 2.707031 -2.859375 2.65625 -3 2.558594 -3.085938 C 2.457031 -3.167969 2.269531 -3.210938 2 -3.214844 C 1.8125 -3.210938 1.667969 -3.175781 1.566406 -3.105469 C 1.460938 -3.03125 1.378906 -2.902344 1.316406 -2.722656 Z M 2.707031 -1.882813 C 2.574219 -1.835938 2.363281 -1.785156 2.074219 -1.722656 C 1.785156 -1.660156 1.597656 -1.597656 1.511719 -1.542969 C 1.375 -1.441406 1.304688 -1.320313 1.308594 -1.175781 C 1.304688 -1.03125 1.359375 -0.90625 1.46875 -0.800781 C 1.574219 -0.691406 1.710938 -0.636719 1.882813 -0.640625 C 2.066406 -0.636719 2.246094 -0.699219 2.417969 -0.828125 C 2.539063 -0.917969 2.621094 -1.035156 2.664063 -1.171875 C 2.691406 -1.257813 2.707031 -1.425781 2.707031 -1.675781 Z M 2.707031 -1.882813 "
-           id="path8306" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph15-5">
-        <path
-           style="stroke:none;"
-           d="M 0.464844 -3.921875 L 1.421875 -3.921875 L 1.421875 -3.386719 C 1.761719 -3.800781 2.167969 -4.007813 2.644531 -4.007813 C 2.890625 -4.007813 3.109375 -3.953125 3.296875 -3.851563 C 3.480469 -3.746094 3.632813 -3.59375 3.75 -3.386719 C 3.921875 -3.59375 4.105469 -3.746094 4.304688 -3.851563 C 4.503906 -3.953125 4.71875 -4.007813 4.945313 -4.007813 C 5.234375 -4.007813 5.476563 -3.949219 5.675781 -3.832031 C 5.875 -3.714844 6.023438 -3.542969 6.125 -3.320313 C 6.191406 -3.148438 6.226563 -2.878906 6.230469 -2.507813 L 6.230469 0 L 5.195313 0 L 5.195313 -2.242188 C 5.191406 -2.628906 5.15625 -2.878906 5.085938 -2.992188 C 4.988281 -3.136719 4.839844 -3.210938 4.644531 -3.214844 C 4.496094 -3.210938 4.359375 -3.167969 4.234375 -3.082031 C 4.105469 -2.992188 4.011719 -2.859375 3.957031 -2.691406 C 3.894531 -2.519531 3.867188 -2.25 3.871094 -1.882813 L 3.871094 0 L 2.835938 0 L 2.835938 -2.148438 C 2.835938 -2.527344 2.816406 -2.773438 2.777344 -2.886719 C 2.738281 -2.996094 2.679688 -3.078125 2.605469 -3.132813 C 2.527344 -3.183594 2.421875 -3.210938 2.292969 -3.214844 C 2.128906 -3.210938 1.984375 -3.167969 1.859375 -3.085938 C 1.726563 -2.996094 1.636719 -2.871094 1.585938 -2.710938 C 1.527344 -2.546875 1.5 -2.277344 1.503906 -1.90625 L 1.503906 0 L 0.464844 0 Z M 0.464844 -3.921875 "
-           id="path8309" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph15-6">
-        <path
-           style="stroke:none;"
-           d="M 4.011719 -1.988281 L 5.070313 -1.652344 C 4.90625 -1.0625 4.636719 -0.621094 4.261719 -0.335938 C 3.878906 -0.046875 3.402344 0.09375 2.824219 0.09375 C 2.105469 0.09375 1.511719 -0.148438 1.050781 -0.640625 C 0.585938 -1.128906 0.355469 -1.800781 0.359375 -2.65625 C 0.355469 -3.554688 0.589844 -4.253906 1.054688 -4.753906 C 1.519531 -5.25 2.128906 -5.5 2.890625 -5.503906 C 3.546875 -5.5 4.085938 -5.304688 4.503906 -4.917969 C 4.746094 -4.683594 4.929688 -4.351563 5.058594 -3.921875 L 3.976563 -3.660156 C 3.910156 -3.9375 3.777344 -4.160156 3.574219 -4.324219 C 3.371094 -4.488281 3.125 -4.570313 2.835938 -4.570313 C 2.433594 -4.570313 2.105469 -4.425781 1.859375 -4.136719 C 1.605469 -3.847656 1.480469 -3.378906 1.484375 -2.738281 C 1.480469 -2.050781 1.605469 -1.5625 1.851563 -1.273438 C 2.097656 -0.980469 2.417969 -0.835938 2.8125 -0.839844 C 3.101563 -0.835938 3.351563 -0.929688 3.5625 -1.117188 C 3.769531 -1.300781 3.917969 -1.589844 4.011719 -1.988281 Z M 4.011719 -1.988281 "
-           id="path8312" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph15-7">
-        <path
-           style="stroke:none;"
-           d="M 3.121094 0 L 3.121094 -0.585938 C 2.976563 -0.375 2.789063 -0.207031 2.558594 -0.0898438 C 2.324219 0.03125 2.078125 0.0859375 1.824219 0.0898438 C 1.558594 0.0859375 1.324219 0.03125 1.113281 -0.0820313 C 0.902344 -0.199219 0.75 -0.359375 0.660156 -0.570313 C 0.5625 -0.777344 0.515625 -1.070313 0.519531 -1.441406 L 0.519531 -3.921875 L 1.558594 -3.921875 L 1.558594 -2.117188 C 1.558594 -1.5625 1.574219 -1.226563 1.613281 -1.105469 C 1.648438 -0.976563 1.71875 -0.878906 1.824219 -0.808594 C 1.921875 -0.738281 2.050781 -0.703125 2.207031 -0.703125 C 2.382813 -0.703125 2.542969 -0.75 2.683594 -0.847656 C 2.824219 -0.941406 2.917969 -1.0625 2.972656 -1.207031 C 3.019531 -1.351563 3.046875 -1.703125 3.050781 -2.265625 L 3.050781 -3.921875 L 4.085938 -3.921875 L 4.085938 0 Z M 3.121094 0 "
-           id="path8315" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph15-8">
-        <path
-           style="stroke:none;"
-           d="M 4.136719 0 L 3.175781 0 L 3.175781 -0.574219 C 3.011719 -0.347656 2.820313 -0.179688 2.605469 -0.0742188 C 2.386719 0.0351563 2.167969 0.0859375 1.949219 0.0898438 C 1.496094 0.0859375 1.109375 -0.0898438 0.789063 -0.453125 C 0.464844 -0.8125 0.304688 -1.320313 0.308594 -1.976563 C 0.304688 -2.636719 0.460938 -3.140625 0.777344 -3.488281 C 1.085938 -3.832031 1.480469 -4.007813 1.964844 -4.007813 C 2.398438 -4.007813 2.777344 -3.824219 3.101563 -3.460938 L 3.101563 -5.410156 L 4.136719 -5.410156 Z M 1.371094 -2.046875 C 1.371094 -1.625 1.425781 -1.324219 1.542969 -1.136719 C 1.707031 -0.863281 1.941406 -0.726563 2.246094 -0.730469 C 2.480469 -0.726563 2.683594 -0.828125 2.851563 -1.035156 C 3.019531 -1.234375 3.105469 -1.539063 3.105469 -1.945313 C 3.105469 -2.394531 3.023438 -2.71875 2.859375 -2.917969 C 2.695313 -3.113281 2.488281 -3.210938 2.238281 -3.214844 C 1.988281 -3.210938 1.78125 -3.113281 1.617188 -2.921875 C 1.453125 -2.722656 1.371094 -2.429688 1.371094 -2.046875 Z M 1.371094 -2.046875 "
-           id="path8318" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph15-9">
-        <path
-           style="stroke:none;"
-           d="M 0.554688 0 L 0.554688 -5.410156 L 2.851563 -5.410156 C 3.429688 -5.40625 3.851563 -5.359375 4.113281 -5.265625 C 4.375 -5.167969 4.582031 -4.996094 4.742188 -4.746094 C 4.898438 -4.496094 4.980469 -4.210938 4.980469 -3.894531 C 4.980469 -3.484375 4.859375 -3.148438 4.621094 -2.886719 C 4.378906 -2.621094 4.023438 -2.453125 3.550781 -2.386719 C 3.785156 -2.246094 3.980469 -2.097656 4.136719 -1.933594 C 4.289063 -1.769531 4.496094 -1.476563 4.757813 -1.054688 L 5.417969 0 L 4.113281 0 L 3.324219 -1.175781 C 3.039063 -1.59375 2.847656 -1.859375 2.746094 -1.972656 C 2.640625 -2.078125 2.53125 -2.152344 2.417969 -2.195313 C 2.300781 -2.234375 2.117188 -2.253906 1.867188 -2.257813 L 1.644531 -2.257813 L 1.644531 0 Z M 1.644531 -3.121094 L 2.453125 -3.121094 C 2.976563 -3.117188 3.304688 -3.140625 3.4375 -3.1875 C 3.566406 -3.230469 3.667969 -3.304688 3.742188 -3.417969 C 3.816406 -3.523438 3.855469 -3.660156 3.855469 -3.824219 C 3.855469 -4.003906 3.804688 -4.152344 3.707031 -4.265625 C 3.609375 -4.375 3.472656 -4.445313 3.296875 -4.476563 C 3.207031 -4.488281 2.941406 -4.492188 2.5 -4.496094 L 1.644531 -4.496094 Z M 1.644531 -3.121094 "
-           id="path8321" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph15-10">
-        <path
-           style="stroke:none;"
-           d="M 5.429688 0 L 4.242188 0 L 3.769531 -1.230469 L 1.605469 -1.230469 L 1.160156 0 L 0 0 L 2.109375 -5.410156 L 3.261719 -5.410156 Z M 3.417969 -2.140625 L 2.671875 -4.148438 L 1.941406 -2.140625 Z M 3.417969 -2.140625 "
-           id="path8324" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph15-11">
-        <path
-           style="stroke:none;"
-           d="M 0.175781 -1.117188 L 1.21875 -1.277344 C 1.261719 -1.074219 1.351563 -0.917969 1.488281 -0.816406 C 1.621094 -0.707031 1.808594 -0.65625 2.054688 -0.660156 C 2.320313 -0.65625 2.523438 -0.707031 2.660156 -0.808594 C 2.75 -0.875 2.792969 -0.964844 2.796875 -1.085938 C 2.792969 -1.160156 2.769531 -1.226563 2.722656 -1.28125 C 2.671875 -1.328125 2.558594 -1.375 2.378906 -1.417969 C 1.539063 -1.601563 1.007813 -1.769531 0.785156 -1.921875 C 0.472656 -2.132813 0.316406 -2.425781 0.320313 -2.804688 C 0.316406 -3.144531 0.449219 -3.429688 0.722656 -3.660156 C 0.988281 -3.890625 1.40625 -4.007813 1.972656 -4.007813 C 2.507813 -4.007813 2.90625 -3.917969 3.167969 -3.746094 C 3.425781 -3.566406 3.605469 -3.308594 3.707031 -2.972656 L 2.726563 -2.789063 C 2.679688 -2.941406 2.601563 -3.058594 2.488281 -3.140625 C 2.371094 -3.21875 2.203125 -3.257813 1.988281 -3.261719 C 1.710938 -3.257813 1.515625 -3.21875 1.402344 -3.148438 C 1.320313 -3.089844 1.28125 -3.019531 1.285156 -2.9375 C 1.28125 -2.859375 1.316406 -2.796875 1.386719 -2.75 C 1.476563 -2.679688 1.800781 -2.582031 2.355469 -2.457031 C 2.910156 -2.328125 3.296875 -2.175781 3.515625 -1.996094 C 3.730469 -1.808594 3.835938 -1.550781 3.839844 -1.226563 C 3.835938 -0.863281 3.6875 -0.554688 3.386719 -0.296875 C 3.085938 -0.0390625 2.640625 0.0859375 2.054688 0.0898438 C 1.519531 0.0859375 1.097656 -0.0195313 0.785156 -0.234375 C 0.472656 -0.453125 0.269531 -0.746094 0.175781 -1.117188 Z M 0.175781 -1.117188 "
-           id="path8327" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph15-12">
-        <path
-           style="stroke:none;"
-           d="M 0.0507813 -3.921875 L 1.15625 -3.921875 L 2.09375 -1.136719 L 3.007813 -3.921875 L 4.082031 -3.921875 L 2.699219 -0.148438 L 2.449219 0.535156 C 2.359375 0.761719 2.273438 0.9375 2.191406 1.058594 C 2.105469 1.179688 2.007813 1.277344 1.90625 1.351563 C 1.796875 1.425781 1.667969 1.484375 1.511719 1.527344 C 1.355469 1.566406 1.175781 1.585938 0.980469 1.589844 C 0.777344 1.585938 0.582031 1.566406 0.394531 1.527344 L 0.300781 0.714844 C 0.460938 0.746094 0.609375 0.761719 0.742188 0.765625 C 0.980469 0.761719 1.160156 0.691406 1.277344 0.550781 C 1.390625 0.410156 1.476563 0.230469 1.542969 0.0117188 Z M 0.0507813 -3.921875 "
-           id="path8330" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph15-13">
-        <path
-           style="stroke:none;"
-           d="M 4.109375 0 L 3.070313 0 L 3.070313 -2 C 3.070313 -2.421875 3.046875 -2.695313 3.003906 -2.820313 C 2.957031 -2.941406 2.886719 -3.039063 2.789063 -3.109375 C 2.6875 -3.175781 2.566406 -3.210938 2.429688 -3.214844 C 2.246094 -3.210938 2.085938 -3.160156 1.945313 -3.066406 C 1.800781 -2.964844 1.703125 -2.835938 1.652344 -2.675781 C 1.597656 -2.511719 1.570313 -2.210938 1.574219 -1.777344 L 1.574219 0 L 0.535156 0 L 0.535156 -3.921875 L 1.5 -3.921875 L 1.5 -3.34375 C 1.839844 -3.785156 2.269531 -4.007813 2.789063 -4.007813 C 3.015625 -4.007813 3.222656 -3.964844 3.417969 -3.882813 C 3.605469 -3.800781 3.75 -3.695313 3.847656 -3.570313 C 3.941406 -3.4375 4.007813 -3.292969 4.050781 -3.132813 C 4.085938 -2.96875 4.105469 -2.738281 4.109375 -2.4375 Z M 4.109375 0 "
-           id="path8333" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph15-14">
-        <path
-           style="stroke:none;"
-           d="M 3.960938 -2.761719 L 2.9375 -2.578125 C 2.898438 -2.78125 2.820313 -2.933594 2.703125 -3.035156 C 2.578125 -3.136719 2.421875 -3.1875 2.230469 -3.191406 C 1.96875 -3.1875 1.761719 -3.097656 1.609375 -2.921875 C 1.453125 -2.742188 1.375 -2.445313 1.378906 -2.03125 C 1.375 -1.5625 1.453125 -1.234375 1.613281 -1.042969 C 1.765625 -0.851563 1.976563 -0.753906 2.246094 -0.757813 C 2.4375 -0.753906 2.597656 -0.8125 2.726563 -0.925781 C 2.851563 -1.035156 2.941406 -1.226563 2.992188 -1.503906 L 4.011719 -1.328125 C 3.90625 -0.859375 3.703125 -0.507813 3.402344 -0.269531 C 3.101563 -0.03125 2.699219 0.0859375 2.195313 0.0898438 C 1.621094 0.0859375 1.164063 -0.0898438 0.824219 -0.453125 C 0.480469 -0.8125 0.308594 -1.316406 0.3125 -1.957031 C 0.308594 -2.601563 0.480469 -3.101563 0.824219 -3.464844 C 1.164063 -3.824219 1.628906 -4.007813 2.214844 -4.007813 C 2.691406 -4.007813 3.070313 -3.902344 3.351563 -3.699219 C 3.632813 -3.488281 3.835938 -3.175781 3.960938 -2.761719 Z M 3.960938 -2.761719 "
-           id="path8336" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph15-15">
-        <path
-           style="stroke:none;"
-           d="M 0.515625 0 L 0.515625 -5.410156 L 1.609375 -5.410156 L 1.609375 0 Z M 0.515625 0 "
-           id="path8339" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph15-16">
-        <path
-           style="stroke:none;"
-           d="M 0.0429688 0 L 1.457031 -2.019531 L 0.101563 -3.921875 L 1.371094 -3.921875 L 2.0625 -2.84375 L 2.792969 -3.921875 L 4.011719 -3.921875 L 2.683594 -2.0625 L 4.132813 0 L 2.859375 0 L 2.0625 -1.214844 L 1.257813 0 Z M 0.0429688 0 "
-           id="path8342" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph15-17">
-        <path
-           style="stroke:none;"
-           d="M 3.066406 -1.988281 L 3.066406 -2.902344 L 5.421875 -2.902344 L 5.421875 -0.746094 C 5.1875 -0.519531 4.855469 -0.324219 4.425781 -0.160156 C 3.988281 0.0117188 3.550781 0.09375 3.109375 0.09375 C 2.539063 0.09375 2.042969 -0.0234375 1.625 -0.261719 C 1.203125 -0.5 0.890625 -0.839844 0.679688 -1.28125 C 0.46875 -1.722656 0.363281 -2.203125 0.363281 -2.722656 C 0.363281 -3.285156 0.480469 -3.785156 0.714844 -4.226563 C 0.949219 -4.660156 1.292969 -4.996094 1.753906 -5.234375 C 2.097656 -5.410156 2.53125 -5.5 3.050781 -5.503906 C 3.71875 -5.5 4.242188 -5.359375 4.625 -5.082031 C 5 -4.796875 5.242188 -4.410156 5.351563 -3.914063 L 4.265625 -3.710938 C 4.1875 -3.972656 4.042969 -4.183594 3.835938 -4.339844 C 3.625 -4.492188 3.363281 -4.570313 3.050781 -4.570313 C 2.570313 -4.570313 2.191406 -4.417969 1.910156 -4.113281 C 1.628906 -3.808594 1.488281 -3.359375 1.488281 -2.769531 C 1.488281 -2.125 1.628906 -1.640625 1.914063 -1.320313 C 2.199219 -0.996094 2.574219 -0.835938 3.039063 -0.839844 C 3.265625 -0.835938 3.496094 -0.882813 3.726563 -0.972656 C 3.953125 -1.0625 4.148438 -1.171875 4.320313 -1.304688 L 4.320313 -1.988281 Z M 3.066406 -1.988281 "
-           id="path8345" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph15-18">
-        <path
-           style="stroke:none;"
-           d="M 0.5 0 L 0.5 -5.410156 L 1.535156 -5.410156 L 1.535156 -3.460938 C 1.855469 -3.824219 2.234375 -4.007813 2.671875 -4.007813 C 3.148438 -4.007813 3.542969 -3.832031 3.859375 -3.488281 C 4.167969 -3.140625 4.324219 -2.644531 4.328125 -2 C 4.324219 -1.328125 4.164063 -0.8125 3.847656 -0.453125 C 3.527344 -0.0898438 3.140625 0.0859375 2.6875 0.0898438 C 2.460938 0.0859375 2.238281 0.0351563 2.023438 -0.078125 C 1.804688 -0.1875 1.617188 -0.355469 1.460938 -0.574219 L 1.460938 0 Z M 1.527344 -2.046875 C 1.523438 -1.636719 1.585938 -1.335938 1.71875 -1.144531 C 1.894531 -0.863281 2.136719 -0.726563 2.4375 -0.730469 C 2.664063 -0.726563 2.855469 -0.824219 3.019531 -1.023438 C 3.175781 -1.214844 3.257813 -1.523438 3.261719 -1.949219 C 3.257813 -2.394531 3.175781 -2.71875 3.019531 -2.917969 C 2.855469 -3.113281 2.648438 -3.210938 2.394531 -3.214844 C 2.144531 -3.210938 1.9375 -3.113281 1.773438 -2.925781 C 1.605469 -2.730469 1.523438 -2.4375 1.527344 -2.046875 Z M 1.527344 -2.046875 "
-           id="path8348" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph15-19">
-        <path
-           style="stroke:none;"
-           d="M 0.554688 -5.410156 L 2.71875 -5.410156 C 3.144531 -5.40625 3.460938 -5.390625 3.675781 -5.355469 C 3.882813 -5.320313 4.070313 -5.246094 4.238281 -5.132813 C 4.402344 -5.019531 4.542969 -4.867188 4.652344 -4.679688 C 4.761719 -4.488281 4.816406 -4.277344 4.820313 -4.050781 C 4.816406 -3.792969 4.746094 -3.5625 4.613281 -3.351563 C 4.472656 -3.140625 4.289063 -2.980469 4.0625 -2.875 C 4.386719 -2.777344 4.640625 -2.613281 4.820313 -2.382813 C 4.996094 -2.152344 5.085938 -1.882813 5.085938 -1.570313 C 5.085938 -1.320313 5.027344 -1.078125 4.914063 -0.851563 C 4.796875 -0.617188 4.640625 -0.433594 4.445313 -0.292969 C 4.246094 -0.152344 4.003906 -0.0664063 3.714844 -0.0351563 C 3.53125 -0.015625 3.089844 -0.00390625 2.394531 0 L 0.554688 0 Z M 1.644531 -4.511719 L 1.644531 -3.257813 L 2.363281 -3.257813 C 2.785156 -3.257813 3.050781 -3.261719 3.15625 -3.277344 C 3.339844 -3.296875 3.488281 -3.363281 3.597656 -3.472656 C 3.703125 -3.578125 3.753906 -3.71875 3.757813 -3.894531 C 3.753906 -4.058594 3.707031 -4.195313 3.617188 -4.300781 C 3.523438 -4.40625 3.386719 -4.46875 3.207031 -4.492188 C 3.097656 -4.503906 2.789063 -4.511719 2.273438 -4.511719 Z M 1.644531 -2.359375 L 1.644531 -0.910156 L 2.65625 -0.910156 C 3.046875 -0.90625 3.296875 -0.917969 3.40625 -0.945313 C 3.566406 -0.96875 3.699219 -1.042969 3.804688 -1.160156 C 3.902344 -1.273438 3.953125 -1.425781 3.957031 -1.621094 C 3.953125 -1.78125 3.914063 -1.917969 3.839844 -2.03125 C 3.757813 -2.144531 3.644531 -2.226563 3.496094 -2.28125 C 3.34375 -2.328125 3.019531 -2.355469 2.527344 -2.359375 Z M 1.644531 -2.359375 "
-           id="path8351" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph15-20">
-        <path
-           style="stroke:none;"
-           d="M 0.542969 -4.453125 L 0.542969 -5.410156 L 1.578125 -5.410156 L 1.578125 -4.453125 Z M 0.542969 0 L 0.542969 -3.921875 L 1.578125 -3.921875 L 1.578125 0 Z M 0.542969 0 "
-           id="path8354" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph15-21">
-        <path
-           style="stroke:none;"
-           d="M 0.542969 0 L 0.542969 -5.410156 L 1.578125 -5.410156 L 1.578125 0 Z M 0.542969 0 "
-           id="path8357" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph15-22">
-        <path
-           style="stroke:none;"
-           d="M 0.558594 0 L 0.558594 -5.410156 L 4.265625 -5.410156 L 4.265625 -4.496094 L 1.648438 -4.496094 L 1.648438 -3.214844 L 3.910156 -3.214844 L 3.910156 -2.300781 L 1.648438 -2.300781 L 1.648438 0 Z M 0.558594 0 "
-           id="path8360" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph15-23">
-        <path
-           style="stroke:none;"
-           d="M 0.535156 0 L 0.535156 -5.410156 L 2.171875 -5.410156 L 3.152344 -1.71875 L 4.125 -5.410156 L 5.761719 -5.410156 L 5.761719 0 L 4.746094 0 L 4.746094 -4.261719 L 3.671875 0 L 2.621094 0 L 1.550781 -4.261719 L 1.550781 0 Z M 0.535156 0 "
-           id="path8363" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph15-24">
-        <path
-           style="stroke:none;"
-           d="M 0.511719 -3.921875 L 1.480469 -3.921875 L 1.480469 -3.34375 C 1.601563 -3.539063 1.769531 -3.699219 1.988281 -3.824219 C 2.199219 -3.945313 2.4375 -4.007813 2.703125 -4.007813 C 3.15625 -4.007813 3.542969 -3.828125 3.859375 -3.472656 C 4.175781 -3.113281 4.335938 -2.617188 4.335938 -1.980469 C 4.335938 -1.324219 4.175781 -0.8125 3.855469 -0.453125 C 3.535156 -0.0898438 3.148438 0.0859375 2.695313 0.0898438 C 2.476563 0.0859375 2.277344 0.046875 2.105469 -0.0390625 C 1.925781 -0.121094 1.742188 -0.269531 1.550781 -0.484375 L 1.550781 1.492188 L 0.511719 1.492188 Z M 1.539063 -2.027344 C 1.535156 -1.585938 1.621094 -1.257813 1.800781 -1.050781 C 1.972656 -0.835938 2.1875 -0.730469 2.441406 -0.734375 C 2.679688 -0.730469 2.878906 -0.828125 3.039063 -1.023438 C 3.199219 -1.214844 3.28125 -1.53125 3.28125 -1.976563 C 3.28125 -2.382813 3.195313 -2.6875 3.03125 -2.886719 C 2.863281 -3.085938 2.660156 -3.1875 2.421875 -3.1875 C 2.164063 -3.1875 1.953125 -3.089844 1.789063 -2.894531 C 1.617188 -2.699219 1.535156 -2.410156 1.539063 -2.027344 Z M 1.539063 -2.027344 "
-           id="path8366" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph15-25">
-        <path
-           style="stroke:none;"
-           d="M 1.769531 0 L 1.769531 -4.496094 L 0.164063 -4.496094 L 0.164063 -5.410156 L 4.464844 -5.410156 L 4.464844 -4.496094 L 2.859375 -4.496094 L 2.859375 0 Z M 1.769531 0 "
-           id="path8369" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph15-26">
-        <path
-           style="stroke:none;"
-           d="M 1.578125 -5.410156 L 1.578125 -3.421875 C 1.90625 -3.8125 2.308594 -4.007813 2.777344 -4.007813 C 3.011719 -4.007813 3.226563 -3.960938 3.421875 -3.875 C 3.613281 -3.78125 3.757813 -3.667969 3.855469 -3.535156 C 3.949219 -3.394531 4.015625 -3.242188 4.054688 -3.078125 C 4.089844 -2.90625 4.105469 -2.648438 4.109375 -2.300781 L 4.109375 0 L 3.070313 0 L 3.070313 -2.070313 C 3.070313 -2.476563 3.050781 -2.738281 3.011719 -2.851563 C 2.972656 -2.960938 2.902344 -3.050781 2.800781 -3.117188 C 2.699219 -3.179688 2.574219 -3.210938 2.429688 -3.214844 C 2.253906 -3.210938 2.101563 -3.167969 1.96875 -3.089844 C 1.832031 -3.003906 1.730469 -2.878906 1.671875 -2.710938 C 1.605469 -2.539063 1.574219 -2.289063 1.578125 -1.964844 L 1.578125 0 L 0.539063 0 L 0.539063 -5.410156 Z M 1.578125 -5.410156 "
-           id="path8372" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph15-27">
-        <path
-           style="stroke:none;"
-           d="M 1.929688 0 L -0.00390625 -5.410156 L 1.179688 -5.410156 L 2.550781 -1.40625 L 3.875 -5.410156 L 5.035156 -5.410156 L 3.097656 0 Z M 1.929688 0 "
-           id="path8375" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph15-28">
-        <path
-           style="stroke:none;"
-           d="M 1.273438 0 L 0.0351563 -3.921875 L 1.039063 -3.921875 L 1.777344 -1.351563 L 2.449219 -3.921875 L 3.453125 -3.921875 L 4.105469 -1.351563 L 4.855469 -3.921875 L 5.875 -3.921875 L 4.617188 0 L 3.621094 0 L 2.945313 -2.519531 L 2.28125 0 Z M 1.273438 0 "
-           id="path8378" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph15-29">
-        <path
-           style="stroke:none;"
-           d="M 0.328125 -2.671875 C 0.328125 -3.222656 0.410156 -3.683594 0.574219 -4.0625 C 0.699219 -4.332031 0.867188 -4.578125 1.078125 -4.800781 C 1.289063 -5.019531 1.519531 -5.183594 1.777344 -5.289063 C 2.113281 -5.429688 2.503906 -5.5 2.945313 -5.503906 C 3.738281 -5.5 4.375 -5.253906 4.859375 -4.761719 C 5.335938 -4.265625 5.578125 -3.578125 5.578125 -2.699219 C 5.578125 -1.820313 5.339844 -1.136719 4.863281 -0.644531 C 4.386719 -0.152344 3.75 0.09375 2.960938 0.09375 C 2.152344 0.09375 1.511719 -0.148438 1.039063 -0.640625 C 0.5625 -1.128906 0.328125 -1.808594 0.328125 -2.671875 Z M 1.453125 -2.710938 C 1.453125 -2.09375 1.59375 -1.628906 1.878906 -1.3125 C 2.160156 -0.996094 2.519531 -0.835938 2.957031 -0.839844 C 3.390625 -0.835938 3.746094 -0.992188 4.027344 -1.308594 C 4.304688 -1.621094 4.445313 -2.09375 4.449219 -2.722656 C 4.445313 -3.34375 4.308594 -3.808594 4.039063 -4.113281 C 3.765625 -4.417969 3.40625 -4.570313 2.957031 -4.570313 C 2.503906 -4.570313 2.140625 -4.414063 1.867188 -4.105469 C 1.589844 -3.796875 1.453125 -3.332031 1.453125 -2.710938 Z M 1.453125 -2.710938 "
-           id="path8381" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph15-30">
-        <path
-           style="stroke:none;"
-           d="M 0.0898438 -3.921875 L 0.664063 -3.921875 L 0.664063 -4.214844 C 0.660156 -4.542969 0.695313 -4.789063 0.769531 -4.953125 C 0.835938 -5.113281 0.964844 -5.246094 1.15625 -5.351563 C 1.34375 -5.449219 1.582031 -5.5 1.871094 -5.503906 C 2.164063 -5.5 2.453125 -5.457031 2.738281 -5.371094 L 2.597656 -4.648438 C 2.429688 -4.683594 2.273438 -4.703125 2.121094 -4.707031 C 1.972656 -4.703125 1.863281 -4.667969 1.800781 -4.601563 C 1.730469 -4.527344 1.699219 -4.394531 1.703125 -4.195313 L 1.703125 -3.921875 L 2.476563 -3.921875 L 2.476563 -3.105469 L 1.703125 -3.105469 L 1.703125 0 L 0.664063 0 L 0.664063 -3.105469 L 0.0898438 -3.105469 Z M 0.0898438 -3.921875 "
-           id="path8384" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph16-0">
-        <path
-           style="stroke:none;"
-           d="M 0.675781 0 L 0.675781 -6.664063 L 5.617188 -6.664063 L 5.617188 -5.535156 L 2.023438 -5.535156 L 2.023438 -4.058594 L 5.367188 -4.058594 L 5.367188 -2.9375 L 2.023438 -2.9375 L 2.023438 -1.121094 L 5.746094 -1.121094 L 5.746094 0 Z M 0.675781 0 "
-           id="path8387" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph16-1">
-        <path
-           style="stroke:none;"
-           d="M 1.996094 0 L 0.0507813 -4.828125 L 1.390625 -4.828125 L 2.300781 -2.363281 L 2.5625 -1.539063 C 2.628906 -1.746094 2.675781 -1.882813 2.695313 -1.953125 C 2.734375 -2.085938 2.777344 -2.222656 2.832031 -2.363281 L 3.75 -4.828125 L 5.0625 -4.828125 L 3.144531 0 Z M 1.996094 0 "
-           id="path8390" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph16-2">
-        <path
-           style="stroke:none;"
-           d="M 3.464844 -1.535156 L 4.734375 -1.324219 C 4.570313 -0.855469 4.3125 -0.496094 3.960938 -0.253906 C 3.605469 -0.0117188 3.164063 0.105469 2.636719 0.109375 C 1.796875 0.105469 1.175781 -0.167969 0.773438 -0.714844 C 0.453125 -1.152344 0.292969 -1.707031 0.296875 -2.378906 C 0.292969 -3.175781 0.5 -3.800781 0.921875 -4.257813 C 1.335938 -4.707031 1.867188 -4.933594 2.507813 -4.9375 C 3.226563 -4.933594 3.792969 -4.695313 4.207031 -4.222656 C 4.621094 -3.746094 4.820313 -3.019531 4.804688 -2.046875 L 1.605469 -2.046875 C 1.613281 -1.664063 1.714844 -1.371094 1.910156 -1.160156 C 2.105469 -0.949219 2.351563 -0.84375 2.648438 -0.84375 C 2.847656 -0.84375 3.015625 -0.898438 3.152344 -1.007813 C 3.289063 -1.117188 3.390625 -1.292969 3.464844 -1.535156 Z M 3.535156 -2.828125 C 3.523438 -3.195313 3.429688 -3.476563 3.25 -3.667969 C 3.066406 -3.859375 2.84375 -3.953125 2.585938 -3.957031 C 2.304688 -3.953125 2.074219 -3.851563 1.894531 -3.652344 C 1.707031 -3.449219 1.621094 -3.175781 1.628906 -2.828125 Z M 3.535156 -2.828125 "
-           id="path8393" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph16-3">
-        <path
-           style="stroke:none;"
-           d="M 5.058594 0 L 3.78125 0 L 3.78125 -2.464844 C 3.777344 -2.984375 3.75 -3.320313 3.699219 -3.472656 C 3.640625 -3.625 3.554688 -3.742188 3.433594 -3.828125 C 3.308594 -3.910156 3.160156 -3.953125 2.992188 -3.957031 C 2.769531 -3.953125 2.570313 -3.894531 2.394531 -3.773438 C 2.21875 -3.652344 2.097656 -3.492188 2.035156 -3.292969 C 1.964844 -3.09375 1.933594 -2.722656 1.9375 -2.1875 L 1.9375 0 L 0.660156 0 L 0.660156 -4.828125 L 1.84375 -4.828125 L 1.84375 -4.117188 C 2.261719 -4.660156 2.792969 -4.933594 3.4375 -4.9375 C 3.714844 -4.933594 3.972656 -4.882813 4.207031 -4.785156 C 4.441406 -4.679688 4.617188 -4.550781 4.738281 -4.394531 C 4.855469 -4.238281 4.9375 -4.058594 4.988281 -3.859375 C 5.03125 -3.65625 5.054688 -3.371094 5.058594 -3 Z M 5.058594 0 "
-           id="path8396" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph16-4">
-        <path
-           style="stroke:none;"
-           d="M 2.882813 -4.828125 L 2.882813 -3.808594 L 2.007813 -3.808594 L 2.007813 -1.863281 C 2.003906 -1.46875 2.011719 -1.238281 2.03125 -1.175781 C 2.046875 -1.109375 2.085938 -1.054688 2.144531 -1.011719 C 2.203125 -0.96875 2.273438 -0.949219 2.363281 -0.949219 C 2.476563 -0.949219 2.648438 -0.988281 2.878906 -1.074219 L 2.984375 -0.0820313 C 2.683594 0.0429688 2.347656 0.105469 1.976563 0.109375 C 1.742188 0.105469 1.535156 0.0703125 1.355469 -0.00390625 C 1.167969 -0.078125 1.03125 -0.179688 0.945313 -0.304688 C 0.855469 -0.429688 0.796875 -0.59375 0.769531 -0.804688 C 0.738281 -0.949219 0.726563 -1.25 0.726563 -1.703125 L 0.726563 -3.808594 L 0.140625 -3.808594 L 0.140625 -4.828125 L 0.726563 -4.828125 L 0.726563 -5.785156 L 2.007813 -6.53125 L 2.007813 -4.828125 Z M 2.882813 -4.828125 "
-           id="path8399" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph16-5">
-        <path
-           style="stroke:none;"
-           d="M 4.941406 -2.449219 L 6.246094 -2.035156 C 6.042969 -1.304688 5.707031 -0.765625 5.246094 -0.414063 C 4.777344 -0.0625 4.1875 0.109375 3.476563 0.113281 C 2.585938 0.109375 1.859375 -0.1875 1.292969 -0.792969 C 0.722656 -1.394531 0.4375 -2.222656 0.441406 -3.273438 C 0.4375 -4.378906 0.726563 -5.238281 1.300781 -5.855469 C 1.871094 -6.464844 2.621094 -6.773438 3.558594 -6.777344 C 4.371094 -6.773438 5.035156 -6.53125 5.546875 -6.054688 C 5.847656 -5.765625 6.074219 -5.359375 6.226563 -4.828125 L 4.894531 -4.507813 C 4.8125 -4.851563 4.648438 -5.125 4.402344 -5.324219 C 4.148438 -5.523438 3.847656 -5.625 3.492188 -5.625 C 2.996094 -5.625 2.59375 -5.445313 2.289063 -5.09375 C 1.980469 -4.734375 1.828125 -4.160156 1.828125 -3.371094 C 1.828125 -2.527344 1.976563 -1.929688 2.28125 -1.570313 C 2.578125 -1.210938 2.972656 -1.03125 3.464844 -1.035156 C 3.816406 -1.03125 4.125 -1.144531 4.386719 -1.375 C 4.640625 -1.601563 4.824219 -1.960938 4.941406 -2.449219 Z M 4.941406 -2.449219 "
-           id="path8402" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph16-6">
-        <path
-           style="stroke:none;"
-           d="M 0.632813 -4.828125 L 1.824219 -4.828125 L 1.824219 -4.117188 C 1.972656 -4.355469 2.183594 -4.550781 2.449219 -4.707031 C 2.710938 -4.855469 3.003906 -4.933594 3.328125 -4.9375 C 3.886719 -4.933594 4.359375 -4.714844 4.753906 -4.277344 C 5.140625 -3.835938 5.335938 -3.222656 5.339844 -2.441406 C 5.335938 -1.632813 5.140625 -1.003906 4.75 -0.558594 C 4.355469 -0.113281 3.878906 0.105469 3.316406 0.109375 C 3.050781 0.105469 2.808594 0.0546875 2.59375 -0.0507813 C 2.375 -0.15625 2.148438 -0.335938 1.910156 -0.59375 L 1.910156 1.835938 L 0.632813 1.835938 Z M 1.894531 -2.496094 C 1.890625 -1.953125 2 -1.550781 2.214844 -1.292969 C 2.429688 -1.03125 2.691406 -0.902344 3.003906 -0.90625 C 3.296875 -0.902344 3.542969 -1.019531 3.742188 -1.261719 C 3.9375 -1.496094 4.035156 -1.886719 4.039063 -2.433594 C 4.035156 -2.933594 3.933594 -3.308594 3.734375 -3.558594 C 3.527344 -3.800781 3.277344 -3.925781 2.980469 -3.925781 C 2.664063 -3.925781 2.40625 -3.804688 2.203125 -3.5625 C 1.996094 -3.320313 1.890625 -2.964844 1.894531 -2.496094 Z M 1.894531 -2.496094 "
-           id="path8405" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph16-7">
-        <path
-           style="stroke:none;"
-           d="M 3.84375 0 L 3.84375 -0.722656 C 3.667969 -0.460938 3.4375 -0.257813 3.152344 -0.113281 C 2.863281 0.0351563 2.558594 0.105469 2.246094 0.109375 C 1.917969 0.105469 1.628906 0.0390625 1.371094 -0.101563 C 1.113281 -0.242188 0.925781 -0.445313 0.8125 -0.703125 C 0.695313 -0.960938 0.636719 -1.316406 0.640625 -1.773438 L 0.640625 -4.828125 L 1.917969 -4.828125 L 1.917969 -2.609375 C 1.914063 -1.925781 1.9375 -1.511719 1.988281 -1.359375 C 2.03125 -1.207031 2.117188 -1.085938 2.246094 -0.996094 C 2.367188 -0.90625 2.527344 -0.859375 2.71875 -0.863281 C 2.9375 -0.859375 3.132813 -0.921875 3.304688 -1.042969 C 3.476563 -1.160156 3.59375 -1.308594 3.65625 -1.488281 C 3.71875 -1.664063 3.75 -2.097656 3.753906 -2.789063 L 3.753906 -4.828125 L 5.03125 -4.828125 L 5.03125 0 Z M 3.84375 0 "
-           id="path8408" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph17-0">
-        <path
-           style="stroke:none;"
-           d="M 4.957031 -2.457031 L 6.265625 -2.042969 C 6.0625 -1.3125 5.730469 -0.769531 5.265625 -0.417969 C 4.796875 -0.0625 4.203125 0.109375 3.488281 0.113281 C 2.597656 0.109375 1.867188 -0.191406 1.296875 -0.796875 C 0.722656 -1.402344 0.4375 -2.230469 0.441406 -3.285156 C 0.4375 -4.394531 0.726563 -5.257813 1.304688 -5.875 C 1.878906 -6.488281 2.632813 -6.796875 3.570313 -6.800781 C 4.382813 -6.796875 5.046875 -6.554688 5.5625 -6.074219 C 5.863281 -5.785156 6.09375 -5.375 6.25 -4.84375 L 4.910156 -4.523438 C 4.832031 -4.867188 4.664063 -5.140625 4.414063 -5.34375 C 4.160156 -5.542969 3.859375 -5.640625 3.503906 -5.644531 C 3.003906 -5.640625 2.597656 -5.464844 2.292969 -5.109375 C 1.980469 -4.753906 1.828125 -4.175781 1.832031 -3.382813 C 1.828125 -2.535156 1.980469 -1.933594 2.289063 -1.578125 C 2.589844 -1.214844 2.984375 -1.035156 3.476563 -1.039063 C 3.828125 -1.035156 4.136719 -1.148438 4.398438 -1.378906 C 4.65625 -1.605469 4.84375 -1.964844 4.957031 -2.457031 Z M 4.957031 -2.457031 "
-           id="path8411" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph17-1">
-        <path
-           style="stroke:none;"
-           d="M 0.632813 -4.84375 L 1.828125 -4.84375 L 1.828125 -4.132813 C 1.984375 -4.371094 2.191406 -4.570313 2.457031 -4.722656 C 2.71875 -4.875 3.015625 -4.949219 3.339844 -4.953125 C 3.902344 -4.949219 4.378906 -4.730469 4.769531 -4.289063 C 5.160156 -3.847656 5.355469 -3.234375 5.359375 -2.449219 C 5.355469 -1.636719 5.15625 -1.007813 4.765625 -0.5625 C 4.367188 -0.117188 3.890625 0.105469 3.328125 0.109375 C 3.0625 0.105469 2.820313 0.0546875 2.601563 -0.0507813 C 2.382813 -0.15625 2.152344 -0.339844 1.914063 -0.597656 L 1.914063 1.84375 L 0.632813 1.84375 Z M 1.902344 -2.503906 C 1.898438 -1.957031 2.007813 -1.554688 2.226563 -1.296875 C 2.441406 -1.03125 2.703125 -0.902344 3.015625 -0.90625 C 3.308594 -0.902344 3.554688 -1.023438 3.757813 -1.265625 C 3.953125 -1.503906 4.054688 -1.894531 4.054688 -2.441406 C 4.054688 -2.945313 3.953125 -3.320313 3.75 -3.570313 C 3.546875 -3.8125 3.292969 -3.9375 2.992188 -3.941406 C 2.675781 -3.9375 2.414063 -3.816406 2.210938 -3.578125 C 2 -3.332031 1.898438 -2.972656 1.902344 -2.503906 Z M 1.902344 -2.503906 "
-           id="path8414" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph17-2">
-        <path
-           style="stroke:none;"
-           d="M 3.859375 0 L 3.859375 -0.726563 C 3.679688 -0.464844 3.445313 -0.257813 3.160156 -0.113281 C 2.871094 0.0351563 2.570313 0.105469 2.253906 0.109375 C 1.925781 0.105469 1.632813 0.0351563 1.375 -0.105469 C 1.113281 -0.246094 0.925781 -0.445313 0.816406 -0.707031 C 0.699219 -0.960938 0.644531 -1.316406 0.644531 -1.777344 L 0.644531 -4.84375 L 1.925781 -4.84375 L 1.925781 -2.617188 C 1.921875 -1.933594 1.945313 -1.515625 1.996094 -1.367188 C 2.039063 -1.210938 2.125 -1.089844 2.253906 -1 C 2.375 -0.910156 2.53125 -0.867188 2.726563 -0.867188 C 2.941406 -0.867188 3.136719 -0.925781 3.3125 -1.046875 C 3.484375 -1.164063 3.605469 -1.3125 3.667969 -1.492188 C 3.730469 -1.667969 3.761719 -2.105469 3.765625 -2.800781 L 3.765625 -4.84375 L 5.046875 -4.84375 L 5.046875 0 Z M 3.859375 0 "
-           id="path8417" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph17-3">
-        <path
-           style="stroke:none;"
-           d=""
-           id="path8420" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph18-0">
-        <path
-           style="stroke:none;"
-           d="M 0.679688 0 L 0.679688 -6.703125 L 5.652344 -6.703125 L 5.652344 -5.570313 L 2.035156 -5.570313 L 2.035156 -4.082031 L 5.398438 -4.082031 L 5.398438 -2.953125 L 2.035156 -2.953125 L 2.035156 -1.128906 L 5.78125 -1.128906 L 5.78125 0 Z M 0.679688 0 "
-           id="path8423" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph18-1">
-        <path
-           style="stroke:none;"
-           d="M 0.0546875 0 L 1.804688 -2.5 L 0.128906 -4.855469 L 1.695313 -4.855469 L 2.554688 -3.519531 L 3.460938 -4.855469 L 4.96875 -4.855469 L 3.324219 -2.554688 L 5.121094 0 L 3.542969 0 L 2.554688 -1.503906 L 1.558594 0 Z M 0.0546875 0 "
-           id="path8426" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph18-2">
-        <path
-           style="stroke:none;"
-           d="M 3.484375 -1.546875 L 4.765625 -1.332031 C 4.59375 -0.855469 4.335938 -0.5 3.984375 -0.257813 C 3.628906 -0.015625 3.183594 0.105469 2.652344 0.109375 C 1.804688 0.105469 1.179688 -0.167969 0.777344 -0.71875 C 0.453125 -1.15625 0.292969 -1.714844 0.296875 -2.390625 C 0.292969 -3.195313 0.503906 -3.824219 0.925781 -4.28125 C 1.347656 -4.734375 1.878906 -4.960938 2.523438 -4.964844 C 3.246094 -4.960938 3.816406 -4.722656 4.234375 -4.25 C 4.648438 -3.769531 4.847656 -3.039063 4.832031 -2.058594 L 1.613281 -2.058594 C 1.621094 -1.671875 1.726563 -1.375 1.925781 -1.167969 C 2.121094 -0.953125 2.367188 -0.847656 2.664063 -0.851563 C 2.863281 -0.847656 3.03125 -0.902344 3.171875 -1.015625 C 3.304688 -1.121094 3.410156 -1.300781 3.484375 -1.546875 Z M 3.558594 -2.84375 C 3.542969 -3.210938 3.449219 -3.496094 3.269531 -3.691406 C 3.085938 -3.886719 2.863281 -3.984375 2.601563 -3.984375 C 2.320313 -3.984375 2.085938 -3.878906 1.90625 -3.675781 C 1.71875 -3.46875 1.628906 -3.191406 1.636719 -2.84375 Z M 3.558594 -2.84375 "
-           id="path8429" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph18-3">
-        <path
-           style="stroke:none;"
-           d="M 4.90625 -3.421875 L 3.640625 -3.191406 C 3.59375 -3.441406 3.496094 -3.632813 3.347656 -3.761719 C 3.191406 -3.886719 2.996094 -3.949219 2.761719 -3.953125 C 2.4375 -3.949219 2.183594 -3.839844 1.996094 -3.621094 C 1.804688 -3.402344 1.710938 -3.03125 1.710938 -2.515625 C 1.710938 -1.9375 1.804688 -1.53125 2 -1.292969 C 2.1875 -1.054688 2.449219 -0.9375 2.78125 -0.9375 C 3.019531 -0.9375 3.21875 -1.003906 3.378906 -1.144531 C 3.53125 -1.277344 3.640625 -1.515625 3.707031 -1.859375 L 4.96875 -1.644531 C 4.835938 -1.0625 4.582031 -0.625 4.214844 -0.332031 C 3.839844 -0.0390625 3.34375 0.105469 2.71875 0.109375 C 2.007813 0.105469 1.441406 -0.117188 1.019531 -0.5625 C 0.597656 -1.007813 0.386719 -1.625 0.386719 -2.421875 C 0.386719 -3.21875 0.597656 -3.84375 1.023438 -4.292969 C 1.445313 -4.738281 2.019531 -4.960938 2.742188 -4.964844 C 3.332031 -4.960938 3.800781 -4.835938 4.152344 -4.582031 C 4.5 -4.328125 4.75 -3.941406 4.90625 -3.421875 Z M 4.90625 -3.421875 "
-           id="path8432" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph18-4">
-        <path
-           style="stroke:none;"
-           d="M 4.96875 -2.464844 L 6.28125 -2.046875 C 6.078125 -1.3125 5.742188 -0.769531 5.277344 -0.417969 C 4.808594 -0.0625 4.214844 0.109375 3.496094 0.113281 C 2.605469 0.109375 1.875 -0.191406 1.304688 -0.796875 C 0.730469 -1.40625 0.445313 -2.238281 0.445313 -3.292969 C 0.445313 -4.40625 0.730469 -5.269531 1.308594 -5.890625 C 1.878906 -6.503906 2.636719 -6.8125 3.582031 -6.816406 C 4.394531 -6.8125 5.0625 -6.570313 5.578125 -6.089844 C 5.882813 -5.800781 6.109375 -5.390625 6.265625 -4.855469 L 4.925781 -4.535156 C 4.84375 -4.882813 4.675781 -5.15625 4.425781 -5.359375 C 4.171875 -5.558594 3.867188 -5.660156 3.511719 -5.660156 C 3.011719 -5.660156 2.609375 -5.480469 2.300781 -5.125 C 1.992188 -4.765625 1.839844 -4.1875 1.839844 -3.394531 C 1.839844 -2.542969 1.992188 -1.941406 2.296875 -1.582031 C 2.601563 -1.222656 2.996094 -1.042969 3.484375 -1.042969 C 3.839844 -1.042969 4.148438 -1.15625 4.410156 -1.382813 C 4.667969 -1.609375 4.855469 -1.96875 4.96875 -2.464844 Z M 4.96875 -2.464844 "
-           id="path8435" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph18-5">
-        <path
-           style="stroke:none;"
-           d="M 3.867188 0 L 3.867188 -0.726563 C 3.6875 -0.46875 3.453125 -0.261719 3.167969 -0.113281 C 2.878906 0.0351563 2.574219 0.105469 2.257813 0.109375 C 1.929688 0.105469 1.636719 0.0351563 1.378906 -0.105469 C 1.117188 -0.246094 0.929688 -0.445313 0.816406 -0.707031 C 0.699219 -0.964844 0.644531 -1.324219 0.644531 -1.78125 L 0.644531 -4.855469 L 1.929688 -4.855469 L 1.929688 -2.625 C 1.925781 -1.941406 1.949219 -1.519531 2 -1.367188 C 2.042969 -1.210938 2.128906 -1.089844 2.257813 -1 C 2.378906 -0.910156 2.539063 -0.867188 2.734375 -0.867188 C 2.949219 -0.867188 3.144531 -0.925781 3.324219 -1.046875 C 3.496094 -1.167969 3.617188 -1.316406 3.679688 -1.496094 C 3.742188 -1.671875 3.773438 -2.109375 3.777344 -2.808594 L 3.777344 -4.855469 L 5.0625 -4.855469 L 5.0625 0 Z M 3.867188 0 "
-           id="path8438" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph18-6">
-        <path
-           style="stroke:none;"
-           d="M 5.125 0 L 3.933594 0 L 3.933594 -0.714844 C 3.730469 -0.433594 3.496094 -0.226563 3.230469 -0.09375 C 2.957031 0.0390625 2.6875 0.105469 2.414063 0.109375 C 1.855469 0.105469 1.375 -0.117188 0.980469 -0.5625 C 0.578125 -1.011719 0.378906 -1.636719 0.382813 -2.445313 C 0.378906 -3.265625 0.574219 -3.890625 0.964844 -4.320313 C 1.351563 -4.746094 1.839844 -4.960938 2.433594 -4.964844 C 2.972656 -4.960938 3.441406 -4.738281 3.839844 -4.289063 L 3.839844 -6.703125 L 5.125 -6.703125 Z M 1.695313 -2.53125 C 1.691406 -2.011719 1.765625 -1.636719 1.910156 -1.410156 C 2.117188 -1.070313 2.40625 -0.902344 2.78125 -0.90625 C 3.074219 -0.902344 3.324219 -1.027344 3.53125 -1.28125 C 3.738281 -1.53125 3.84375 -1.90625 3.84375 -2.410156 C 3.84375 -2.964844 3.742188 -3.367188 3.542969 -3.613281 C 3.339844 -3.859375 3.082031 -3.984375 2.769531 -3.984375 C 2.460938 -3.984375 2.207031 -3.859375 2.003906 -3.617188 C 1.796875 -3.371094 1.691406 -3.011719 1.695313 -2.53125 Z M 1.695313 -2.53125 "
-           id="path8441" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph18-7">
-        <path
-           style="stroke:none;"
-           d="M 1.632813 -3.375 L 0.464844 -3.585938 C 0.59375 -4.054688 0.820313 -4.398438 1.144531 -4.625 C 1.460938 -4.847656 1.9375 -4.960938 2.570313 -4.964844 C 3.140625 -4.960938 3.566406 -4.894531 3.847656 -4.761719 C 4.125 -4.625 4.324219 -4.453125 4.441406 -4.246094 C 4.554688 -4.035156 4.609375 -3.652344 4.613281 -3.09375 L 4.601563 -1.597656 C 4.597656 -1.167969 4.617188 -0.851563 4.660156 -0.652344 C 4.699219 -0.445313 4.777344 -0.230469 4.890625 0 L 3.621094 0 C 3.585938 -0.0859375 3.542969 -0.210938 3.496094 -0.378906 C 3.472656 -0.453125 3.457031 -0.503906 3.453125 -0.53125 C 3.226563 -0.316406 2.992188 -0.15625 2.746094 -0.0507813 C 2.492188 0.0546875 2.226563 0.105469 1.949219 0.109375 C 1.445313 0.105469 1.050781 -0.0273438 0.765625 -0.296875 C 0.472656 -0.566406 0.328125 -0.910156 0.332031 -1.324219 C 0.328125 -1.597656 0.394531 -1.839844 0.527344 -2.058594 C 0.660156 -2.269531 0.84375 -2.4375 1.082031 -2.554688 C 1.316406 -2.667969 1.65625 -2.765625 2.101563 -2.851563 C 2.699219 -2.964844 3.117188 -3.070313 3.351563 -3.167969 L 3.351563 -3.296875 C 3.351563 -3.542969 3.289063 -3.71875 3.167969 -3.824219 C 3.042969 -3.929688 2.8125 -3.984375 2.476563 -3.984375 C 2.246094 -3.984375 2.066406 -3.9375 1.941406 -3.847656 C 1.808594 -3.753906 1.707031 -3.597656 1.632813 -3.375 Z M 3.351563 -2.332031 C 3.1875 -2.277344 2.925781 -2.210938 2.570313 -2.132813 C 2.210938 -2.054688 1.976563 -1.980469 1.871094 -1.910156 C 1.699219 -1.792969 1.617188 -1.640625 1.617188 -1.457031 C 1.617188 -1.273438 1.683594 -1.117188 1.816406 -0.992188 C 1.949219 -0.859375 2.121094 -0.796875 2.332031 -0.796875 C 2.5625 -0.796875 2.78125 -0.871094 2.996094 -1.023438 C 3.144531 -1.136719 3.246094 -1.277344 3.300781 -1.449219 C 3.332031 -1.554688 3.351563 -1.765625 3.351563 -2.074219 Z M 3.351563 -2.332031 "
-           id="path8444" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph18-8">
-        <path
-           style="stroke:none;"
-           d="M 0.6875 0 L 0.6875 -6.703125 L 3.535156 -6.703125 C 4.246094 -6.703125 4.765625 -6.640625 5.09375 -6.519531 C 5.414063 -6.398438 5.675781 -6.183594 5.875 -5.878906 C 6.066406 -5.566406 6.164063 -5.214844 6.167969 -4.824219 C 6.164063 -4.316406 6.015625 -3.902344 5.722656 -3.578125 C 5.425781 -3.246094 4.984375 -3.039063 4.398438 -2.957031 C 4.6875 -2.785156 4.929688 -2.597656 5.121094 -2.394531 C 5.3125 -2.1875 5.570313 -1.824219 5.894531 -1.308594 L 6.710938 0 L 5.09375 0 L 4.113281 -1.457031 C 3.765625 -1.976563 3.527344 -2.304688 3.402344 -2.445313 C 3.273438 -2.578125 3.136719 -2.671875 2.996094 -2.722656 C 2.847656 -2.769531 2.621094 -2.792969 2.3125 -2.796875 L 2.039063 -2.796875 L 2.039063 0 Z M 2.039063 -3.867188 L 3.039063 -3.867188 C 3.6875 -3.867188 4.09375 -3.894531 4.257813 -3.949219 C 4.417969 -4.003906 4.542969 -4.097656 4.636719 -4.230469 C 4.722656 -4.363281 4.769531 -4.53125 4.773438 -4.738281 C 4.769531 -4.960938 4.710938 -5.140625 4.59375 -5.28125 C 4.472656 -5.417969 4.300781 -5.507813 4.082031 -5.546875 C 3.96875 -5.558594 3.640625 -5.566406 3.09375 -5.570313 L 2.039063 -5.570313 Z M 2.039063 -3.867188 "
-           id="path8447" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph18-9">
-        <path
-           style="stroke:none;"
-           d="M 2.898438 -4.855469 L 2.898438 -3.832031 L 2.019531 -3.832031 L 2.019531 -1.875 C 2.015625 -1.476563 2.023438 -1.242188 2.042969 -1.179688 C 2.058594 -1.109375 2.097656 -1.058594 2.160156 -1.019531 C 2.21875 -0.976563 2.292969 -0.953125 2.378906 -0.957031 C 2.496094 -0.953125 2.667969 -0.996094 2.894531 -1.078125 L 3.003906 -0.0820313 C 2.703125 0.0429688 2.363281 0.105469 1.988281 0.109375 C 1.753906 0.105469 1.546875 0.0703125 1.359375 -0.0078125 C 1.171875 -0.0820313 1.035156 -0.183594 0.953125 -0.308594 C 0.863281 -0.429688 0.804688 -0.597656 0.773438 -0.808594 C 0.742188 -0.957031 0.726563 -1.257813 0.730469 -1.714844 L 0.730469 -3.832031 L 0.140625 -3.832031 L 0.140625 -4.855469 L 0.730469 -4.855469 L 0.730469 -5.820313 L 2.019531 -6.570313 L 2.019531 -4.855469 Z M 2.898438 -4.855469 "
-           id="path8450" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph19-0">
-        <path
-           style="stroke:none;"
-           d="M 0.339844 -2.191406 L 1.660156 -2.320313 C 1.738281 -1.875 1.902344 -1.546875 2.144531 -1.339844 C 2.386719 -1.128906 2.714844 -1.023438 3.132813 -1.027344 C 3.566406 -1.023438 3.894531 -1.117188 4.121094 -1.304688 C 4.339844 -1.488281 4.453125 -1.707031 4.453125 -1.957031 C 4.453125 -2.113281 4.40625 -2.246094 4.3125 -2.363281 C 4.21875 -2.472656 4.054688 -2.570313 3.824219 -2.652344 C 3.664063 -2.707031 3.300781 -2.804688 2.738281 -2.949219 C 2.003906 -3.125 1.492188 -3.347656 1.203125 -3.613281 C 0.785156 -3.980469 0.578125 -4.433594 0.582031 -4.96875 C 0.578125 -5.308594 0.675781 -5.628906 0.875 -5.929688 C 1.066406 -6.226563 1.347656 -6.453125 1.714844 -6.609375 C 2.078125 -6.765625 2.519531 -6.84375 3.039063 -6.84375 C 3.882813 -6.84375 4.519531 -6.65625 4.945313 -6.289063 C 5.371094 -5.914063 5.59375 -5.421875 5.621094 -4.808594 L 4.261719 -4.746094 C 4.199219 -5.089844 4.074219 -5.335938 3.882813 -5.492188 C 3.691406 -5.640625 3.40625 -5.71875 3.027344 -5.722656 C 2.628906 -5.71875 2.320313 -5.636719 2.097656 -5.476563 C 1.953125 -5.371094 1.878906 -5.230469 1.882813 -5.058594 C 1.878906 -4.894531 1.949219 -4.761719 2.085938 -4.652344 C 2.253906 -4.503906 2.667969 -4.351563 3.332031 -4.199219 C 3.992188 -4.042969 4.480469 -3.882813 4.800781 -3.714844 C 5.113281 -3.546875 5.363281 -3.316406 5.542969 -3.03125 C 5.722656 -2.738281 5.8125 -2.382813 5.8125 -1.960938 C 5.8125 -1.574219 5.703125 -1.210938 5.488281 -0.875 C 5.273438 -0.535156 4.972656 -0.285156 4.582031 -0.125 C 4.191406 0.0390625 3.703125 0.117188 3.117188 0.121094 C 2.265625 0.117188 1.609375 -0.078125 1.15625 -0.46875 C 0.695313 -0.863281 0.425781 -1.4375 0.339844 -2.191406 Z M 0.339844 -2.191406 "
-           id="path8453" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph19-1">
-        <path
-           style="stroke:none;"
-           d="M 2.910156 -4.875 L 2.910156 -3.847656 L 2.03125 -3.847656 L 2.03125 -1.882813 C 2.027344 -1.480469 2.035156 -1.25 2.054688 -1.1875 C 2.066406 -1.121094 2.105469 -1.066406 2.167969 -1.023438 C 2.226563 -0.980469 2.296875 -0.960938 2.386719 -0.960938 C 2.503906 -0.960938 2.679688 -1 2.90625 -1.082031 L 3.015625 -0.0820313 C 2.714844 0.0429688 2.375 0.105469 1.996094 0.109375 C 1.761719 0.105469 1.550781 0.0703125 1.367188 -0.0078125 C 1.179688 -0.0820313 1.042969 -0.183594 0.957031 -0.308594 C 0.867188 -0.429688 0.808594 -0.597656 0.777344 -0.8125 C 0.746094 -0.960938 0.730469 -1.265625 0.734375 -1.722656 L 0.734375 -3.847656 L 0.140625 -3.847656 L 0.140625 -4.875 L 0.734375 -4.875 L 0.734375 -5.84375 L 2.03125 -6.597656 L 2.03125 -4.875 Z M 2.910156 -4.875 "
-           id="path8456" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph19-2">
-        <path
-           style="stroke:none;"
-           d="M 1.910156 0 L 0.621094 0 L 0.621094 -4.875 L 1.816406 -4.875 L 1.816406 -4.183594 C 2.019531 -4.511719 2.203125 -4.726563 2.371094 -4.828125 C 2.53125 -4.929688 2.71875 -4.980469 2.929688 -4.984375 C 3.222656 -4.980469 3.503906 -4.898438 3.777344 -4.742188 L 3.378906 -3.617188 C 3.160156 -3.753906 2.957031 -3.824219 2.773438 -3.828125 C 2.589844 -3.824219 2.4375 -3.777344 2.3125 -3.679688 C 2.183594 -3.578125 2.085938 -3.398438 2.015625 -3.140625 C 1.945313 -2.878906 1.910156 -2.332031 1.910156 -1.507813 Z M 1.910156 0 "
-           id="path8459" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph19-3">
-        <path
-           style="stroke:none;"
-           d="M 3.5 -1.550781 L 4.785156 -1.335938 C 4.617188 -0.863281 4.355469 -0.503906 4 -0.257813 C 3.640625 -0.015625 3.195313 0.105469 2.664063 0.109375 C 1.8125 0.105469 1.183594 -0.167969 0.78125 -0.722656 C 0.453125 -1.160156 0.292969 -1.722656 0.296875 -2.402344 C 0.292969 -3.207031 0.503906 -3.835938 0.929688 -4.296875 C 1.351563 -4.75 1.886719 -4.980469 2.535156 -4.984375 C 3.257813 -4.980469 3.828125 -4.742188 4.25 -4.265625 C 4.664063 -3.789063 4.867188 -3.054688 4.851563 -2.066406 L 1.621094 -2.066406 C 1.625 -1.679688 1.726563 -1.382813 1.929688 -1.171875 C 2.125 -0.960938 2.375 -0.855469 2.675781 -0.855469 C 2.875 -0.855469 3.046875 -0.910156 3.1875 -1.019531 C 3.324219 -1.128906 3.425781 -1.304688 3.5 -1.550781 Z M 3.570313 -2.855469 C 3.558594 -3.222656 3.464844 -3.507813 3.28125 -3.707031 C 3.097656 -3.898438 2.875 -3.996094 2.613281 -4 C 2.332031 -3.996094 2.097656 -3.894531 1.914063 -3.691406 C 1.730469 -3.484375 1.640625 -3.203125 1.644531 -2.855469 Z M 3.570313 -2.855469 "
-           id="path8462" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph19-4">
-        <path
-           style="stroke:none;"
-           d="M 1.640625 -3.386719 L 0.46875 -3.597656 C 0.597656 -4.070313 0.820313 -4.417969 1.144531 -4.644531 C 1.464844 -4.867188 1.945313 -4.980469 2.582031 -4.984375 C 3.15625 -4.980469 3.585938 -4.914063 3.867188 -4.78125 C 4.148438 -4.644531 4.34375 -4.46875 4.460938 -4.261719 C 4.570313 -4.050781 4.628906 -3.667969 4.632813 -3.109375 L 4.617188 -1.601563 C 4.617188 -1.171875 4.636719 -0.855469 4.679688 -0.652344 C 4.71875 -0.445313 4.796875 -0.230469 4.914063 0 L 3.636719 0 C 3.601563 -0.0859375 3.558594 -0.210938 3.511719 -0.382813 C 3.488281 -0.453125 3.472656 -0.503906 3.464844 -0.53125 C 3.242188 -0.316406 3.007813 -0.15625 2.757813 -0.0507813 C 2.503906 0.0546875 2.238281 0.105469 1.957031 0.109375 C 1.453125 0.105469 1.054688 -0.0273438 0.769531 -0.296875 C 0.476563 -0.566406 0.332031 -0.910156 0.335938 -1.332031 C 0.332031 -1.605469 0.398438 -1.851563 0.53125 -2.066406 C 0.660156 -2.28125 0.84375 -2.445313 1.085938 -2.5625 C 1.320313 -2.675781 1.664063 -2.777344 2.113281 -2.863281 C 2.710938 -2.976563 3.128906 -3.082031 3.367188 -3.183594 L 3.367188 -3.308594 C 3.363281 -3.554688 3.300781 -3.730469 3.183594 -3.839844 C 3.058594 -3.941406 2.828125 -3.996094 2.488281 -4 C 2.257813 -3.996094 2.078125 -3.949219 1.949219 -3.863281 C 1.820313 -3.769531 1.71875 -3.613281 1.640625 -3.386719 Z M 3.367188 -2.339844 C 3.199219 -2.28125 2.9375 -2.21875 2.578125 -2.144531 C 2.21875 -2.066406 1.984375 -1.988281 1.878906 -1.917969 C 1.707031 -1.792969 1.621094 -1.644531 1.625 -1.464844 C 1.621094 -1.285156 1.691406 -1.128906 1.828125 -0.996094 C 1.960938 -0.863281 2.128906 -0.796875 2.339844 -0.800781 C 2.570313 -0.796875 2.792969 -0.871094 3.007813 -1.027344 C 3.160156 -1.140625 3.265625 -1.285156 3.316406 -1.457031 C 3.347656 -1.5625 3.363281 -1.773438 3.367188 -2.085938 Z M 3.367188 -2.339844 "
-           id="path8465" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph19-5">
-        <path
-           style="stroke:none;"
-           d="M 0.578125 -4.875 L 1.769531 -4.875 L 1.769531 -4.210938 C 2.191406 -4.722656 2.695313 -4.980469 3.289063 -4.984375 C 3.59375 -4.980469 3.863281 -4.917969 4.097656 -4.792969 C 4.324219 -4.664063 4.515625 -4.46875 4.664063 -4.210938 C 4.875 -4.46875 5.105469 -4.664063 5.355469 -4.792969 C 5.605469 -4.917969 5.871094 -4.980469 6.152344 -4.984375 C 6.511719 -4.980469 6.8125 -4.90625 7.0625 -4.765625 C 7.304688 -4.617188 7.492188 -4.40625 7.617188 -4.128906 C 7.703125 -3.917969 7.746094 -3.582031 7.75 -3.117188 L 7.75 0 L 6.460938 0 L 6.460938 -2.785156 C 6.457031 -3.265625 6.414063 -3.578125 6.328125 -3.722656 C 6.207031 -3.902344 6.023438 -3.996094 5.777344 -4 C 5.59375 -3.996094 5.421875 -3.941406 5.265625 -3.832031 C 5.105469 -3.722656 4.992188 -3.558594 4.921875 -3.347656 C 4.851563 -3.132813 4.816406 -2.796875 4.816406 -2.339844 L 4.816406 0 L 3.527344 0 L 3.527344 -2.671875 C 3.527344 -3.144531 3.503906 -3.449219 3.457031 -3.589844 C 3.410156 -3.726563 3.335938 -3.828125 3.242188 -3.898438 C 3.140625 -3.960938 3.011719 -3.996094 2.851563 -4 C 2.652344 -3.996094 2.472656 -3.941406 2.3125 -3.835938 C 2.152344 -3.726563 2.039063 -3.570313 1.96875 -3.371094 C 1.898438 -3.167969 1.863281 -2.835938 1.867188 -2.367188 L 1.867188 0 L 0.578125 0 Z M 0.578125 -4.875 "
-           id="path8468" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph19-6">
-        <path
-           style="stroke:none;"
-           d="M 4.992188 -2.476563 L 6.308594 -2.058594 C 6.101563 -1.316406 5.765625 -0.769531 5.300781 -0.417969 C 4.828125 -0.0625 4.230469 0.109375 3.511719 0.113281 C 2.613281 0.109375 1.878906 -0.191406 1.308594 -0.800781 C 0.730469 -1.410156 0.445313 -2.246094 0.445313 -3.304688 C 0.445313 -4.421875 0.734375 -5.289063 1.3125 -5.914063 C 1.890625 -6.53125 2.648438 -6.84375 3.59375 -6.84375 C 4.414063 -6.84375 5.085938 -6.601563 5.601563 -6.117188 C 5.90625 -5.828125 6.132813 -5.414063 6.289063 -4.875 L 4.945313 -4.554688 C 4.863281 -4.898438 4.695313 -5.175781 4.445313 -5.378906 C 4.191406 -5.582031 3.886719 -5.683594 3.527344 -5.683594 C 3.027344 -5.683594 2.621094 -5.503906 2.3125 -5.144531 C 2 -4.785156 1.84375 -4.203125 1.847656 -3.40625 C 1.84375 -2.550781 1.996094 -1.945313 2.304688 -1.585938 C 2.609375 -1.222656 3.007813 -1.042969 3.5 -1.046875 C 3.855469 -1.042969 4.167969 -1.15625 4.429688 -1.390625 C 4.6875 -1.617188 4.875 -1.980469 4.992188 -2.476563 Z M 4.992188 -2.476563 "
-           id="path8471" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph19-7">
-        <path
-           style="stroke:none;"
-           d="M 0.636719 -4.875 L 1.839844 -4.875 L 1.839844 -4.160156 C 1.996094 -4.402344 2.207031 -4.597656 2.472656 -4.753906 C 2.738281 -4.902344 3.03125 -4.980469 3.359375 -4.984375 C 3.921875 -4.980469 4.402344 -4.757813 4.800781 -4.320313 C 5.191406 -3.875 5.390625 -3.257813 5.394531 -2.464844 C 5.390625 -1.648438 5.191406 -1.015625 4.796875 -0.566406 C 4.394531 -0.117188 3.914063 0.105469 3.351563 0.109375 C 3.082031 0.105469 2.835938 0.0546875 2.621094 -0.0507813 C 2.398438 -0.15625 2.167969 -0.339844 1.929688 -0.601563 L 1.929688 1.855469 L 0.636719 1.855469 Z M 1.914063 -2.519531 C 1.910156 -1.972656 2.019531 -1.566406 2.238281 -1.304688 C 2.457031 -1.042969 2.722656 -0.914063 3.035156 -0.914063 C 3.332031 -0.914063 3.578125 -1.03125 3.78125 -1.273438 C 3.976563 -1.507813 4.078125 -1.902344 4.082031 -2.457031 C 4.078125 -2.964844 3.976563 -3.34375 3.773438 -3.59375 C 3.566406 -3.839844 3.3125 -3.964844 3.011719 -3.96875 C 2.695313 -3.964844 2.433594 -3.84375 2.226563 -3.601563 C 2.015625 -3.355469 1.910156 -2.996094 1.914063 -2.519531 Z M 1.914063 -2.519531 "
-           id="path8474" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph19-8">
-        <path
-           style="stroke:none;"
-           d="M 3.882813 0 L 3.882813 -0.730469 C 3.703125 -0.46875 3.46875 -0.261719 3.183594 -0.113281 C 2.890625 0.0351563 2.585938 0.105469 2.269531 0.109375 C 1.9375 0.105469 1.644531 0.0351563 1.386719 -0.105469 C 1.125 -0.246094 0.9375 -0.445313 0.820313 -0.710938 C 0.703125 -0.96875 0.644531 -1.328125 0.648438 -1.789063 L 0.648438 -4.875 L 1.9375 -4.875 L 1.9375 -2.636719 C 1.933594 -1.949219 1.957031 -1.527344 2.007813 -1.375 C 2.050781 -1.21875 2.136719 -1.097656 2.265625 -1.007813 C 2.390625 -0.914063 2.550781 -0.867188 2.746094 -0.871094 C 2.964844 -0.867188 3.160156 -0.925781 3.335938 -1.050781 C 3.507813 -1.167969 3.628906 -1.320313 3.695313 -1.503906 C 3.757813 -1.679688 3.789063 -2.117188 3.792969 -2.820313 L 3.792969 -4.875 L 5.082031 -4.875 L 5.082031 0 Z M 3.882813 0 "
-           id="path8477" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph19-9">
-        <path
-           style="stroke:none;"
-           d="M 0.0625 -4.875 L 1.4375 -4.875 L 2.601563 -1.414063 L 3.742188 -4.875 L 5.078125 -4.875 L 3.355469 -0.183594 L 3.046875 0.664063 C 2.933594 0.945313 2.824219 1.164063 2.722656 1.316406 C 2.617188 1.464844 2.5 1.585938 2.371094 1.683594 C 2.234375 1.773438 2.070313 1.847656 1.878906 1.902344 C 1.683594 1.953125 1.464844 1.980469 1.222656 1.980469 C 0.96875 1.980469 0.726563 1.953125 0.492188 1.902344 L 0.375 0.890625 C 0.578125 0.929688 0.757813 0.949219 0.921875 0.949219 C 1.21875 0.949219 1.441406 0.859375 1.585938 0.683594 C 1.730469 0.507813 1.839844 0.285156 1.917969 0.015625 Z M 0.0625 -4.875 "
-           id="path8480" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph19-10">
-        <path
-           style="stroke:none;"
-           d="M 5.109375 0 L 3.820313 0 L 3.820313 -2.488281 C 3.820313 -3.011719 3.792969 -3.351563 3.738281 -3.507813 C 3.683594 -3.660156 3.59375 -3.78125 3.46875 -3.871094 C 3.34375 -3.953125 3.191406 -3.996094 3.019531 -4 C 2.792969 -3.996094 2.59375 -3.933594 2.417969 -3.816406 C 2.238281 -3.691406 2.117188 -3.53125 2.054688 -3.328125 C 1.988281 -3.125 1.957031 -2.75 1.957031 -2.207031 L 1.957031 0 L 0.664063 0 L 0.664063 -4.875 L 1.863281 -4.875 L 1.863281 -4.160156 C 2.289063 -4.707031 2.824219 -4.980469 3.472656 -4.984375 C 3.753906 -4.980469 4.011719 -4.929688 4.25 -4.832031 C 4.484375 -4.726563 4.664063 -4.59375 4.785156 -4.4375 C 4.90625 -4.273438 4.992188 -4.09375 5.039063 -3.894531 C 5.085938 -3.691406 5.109375 -3.402344 5.109375 -3.03125 Z M 5.109375 0 "
-           id="path8483" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph19-11">
-        <path
-           style="stroke:none;"
-           d="M 4.925781 -3.433594 L 3.65625 -3.203125 C 3.609375 -3.457031 3.511719 -3.648438 3.363281 -3.777344 C 3.210938 -3.90625 3.011719 -3.972656 2.773438 -3.972656 C 2.449219 -3.972656 2.195313 -3.859375 2.003906 -3.636719 C 1.8125 -3.414063 1.714844 -3.042969 1.71875 -2.523438 C 1.714844 -1.941406 1.8125 -1.53125 2.007813 -1.296875 C 2.199219 -1.054688 2.460938 -0.9375 2.792969 -0.941406 C 3.035156 -0.9375 3.234375 -1.007813 3.394531 -1.148438 C 3.546875 -1.289063 3.65625 -1.527344 3.722656 -1.867188 L 4.992188 -1.652344 C 4.859375 -1.070313 4.605469 -0.628906 4.230469 -0.335938 C 3.855469 -0.0390625 3.355469 0.105469 2.730469 0.109375 C 2.015625 0.105469 1.445313 -0.117188 1.023438 -0.5625 C 0.597656 -1.011719 0.386719 -1.632813 0.390625 -2.433594 C 0.386719 -3.234375 0.597656 -3.863281 1.027344 -4.3125 C 1.449219 -4.757813 2.027344 -4.980469 2.753906 -4.984375 C 3.347656 -4.980469 3.816406 -4.851563 4.167969 -4.601563 C 4.515625 -4.34375 4.769531 -3.957031 4.925781 -3.433594 Z M 4.925781 -3.433594 "
-           id="path8486" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph20-0">
-        <path
-           style="stroke:none;"
-           d="M 0.804688 -0.605469 C 0.800781 -0.832031 0.882813 -1.027344 1.046875 -1.191406 C 1.207031 -1.347656 1.402344 -1.429688 1.632813 -1.433594 C 1.863281 -1.429688 2.058594 -1.347656 2.222656 -1.191406 C 2.378906 -1.027344 2.460938 -0.832031 2.464844 -0.605469 C 2.460938 -0.371094 2.378906 -0.175781 2.222656 -0.0117188 C 2.058594 0.148438 1.863281 0.226563 1.632813 0.230469 C 1.402344 0.226563 1.207031 0.148438 1.046875 -0.0117188 C 0.882813 -0.175781 0.800781 -0.371094 0.804688 -0.605469 Z M 0.804688 -0.605469 "
-           id="path8489" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph21-0">
-        <path
-           style="stroke:none;"
-           d="M 6.507813 0 L 5.085938 0 L 4.519531 -1.472656 L 1.925781 -1.472656 L 1.390625 0 L 0 0 L 2.527344 -6.488281 L 3.910156 -6.488281 Z M 4.097656 -2.566406 L 3.203125 -4.972656 L 2.328125 -2.566406 Z M 4.097656 -2.566406 "
-           id="path8492" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph21-1">
-        <path
-           style="stroke:none;"
-           d="M 4.75 -3.308594 L 3.523438 -3.089844 C 3.476563 -3.332031 3.382813 -3.515625 3.242188 -3.640625 C 3.09375 -3.761719 2.90625 -3.824219 2.671875 -3.828125 C 2.359375 -3.824219 2.109375 -3.71875 1.929688 -3.507813 C 1.742188 -3.292969 1.652344 -2.933594 1.65625 -2.433594 C 1.652344 -1.871094 1.746094 -1.476563 1.9375 -1.25 C 2.121094 -1.015625 2.371094 -0.902344 2.691406 -0.90625 C 2.921875 -0.902344 3.117188 -0.96875 3.269531 -1.105469 C 3.417969 -1.238281 3.523438 -1.46875 3.589844 -1.800781 L 4.808594 -1.59375 C 4.679688 -1.03125 4.4375 -0.605469 4.078125 -0.324219 C 3.71875 -0.0351563 3.234375 0.105469 2.632813 0.105469 C 1.941406 0.105469 1.394531 -0.109375 0.988281 -0.542969 C 0.578125 -0.976563 0.375 -1.578125 0.375 -2.34375 C 0.375 -3.117188 0.578125 -3.71875 0.988281 -4.15625 C 1.398438 -4.585938 1.953125 -4.804688 2.65625 -4.804688 C 3.226563 -4.804688 3.683594 -4.679688 4.019531 -4.433594 C 4.355469 -4.1875 4.597656 -3.8125 4.75 -3.308594 Z M 4.75 -3.308594 "
-           id="path8495" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph21-2">
-        <path
-           style="stroke:none;"
-           d="M 4.808594 -2.386719 L 6.082031 -1.984375 C 5.886719 -1.273438 5.5625 -0.746094 5.109375 -0.40625 C 4.65625 -0.0625 4.082031 0.105469 3.386719 0.109375 C 2.523438 0.105469 1.8125 -0.1875 1.261719 -0.773438 C 0.703125 -1.359375 0.425781 -2.164063 0.429688 -3.1875 C 0.425781 -4.265625 0.707031 -5.101563 1.265625 -5.699219 C 1.820313 -6.296875 2.550781 -6.597656 3.464844 -6.597656 C 4.253906 -6.597656 4.898438 -6.363281 5.398438 -5.894531 C 5.6875 -5.613281 5.910156 -5.214844 6.0625 -4.699219 L 4.765625 -4.390625 C 4.6875 -4.722656 4.527344 -4.988281 4.285156 -5.183594 C 4.039063 -5.375 3.746094 -5.472656 3.398438 -5.476563 C 2.914063 -5.472656 2.523438 -5.300781 2.226563 -4.957031 C 1.925781 -4.613281 1.773438 -4.054688 1.777344 -3.285156 C 1.773438 -2.460938 1.921875 -1.875 2.21875 -1.53125 C 2.511719 -1.179688 2.894531 -1.007813 3.371094 -1.007813 C 3.714844 -1.007813 4.015625 -1.117188 4.269531 -1.339844 C 4.519531 -1.558594 4.699219 -1.90625 4.808594 -2.386719 Z M 4.808594 -2.386719 "
-           id="path8498" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph21-3">
-        <path
-           style="stroke:none;"
-           d="M 0.613281 -4.699219 L 1.773438 -4.699219 L 1.773438 -4.007813 C 1.921875 -4.242188 2.125 -4.433594 2.382813 -4.582031 C 2.636719 -4.730469 2.921875 -4.804688 3.238281 -4.804688 C 3.78125 -4.804688 4.246094 -4.589844 4.628906 -4.164063 C 5.007813 -3.734375 5.195313 -3.136719 5.199219 -2.375 C 5.195313 -1.585938 5.003906 -0.976563 4.625 -0.546875 C 4.238281 -0.109375 3.773438 0.105469 3.230469 0.105469 C 2.964844 0.105469 2.730469 0.0546875 2.523438 -0.046875 C 2.308594 -0.148438 2.085938 -0.324219 1.859375 -0.578125 L 1.859375 1.789063 L 0.613281 1.789063 Z M 1.84375 -2.429688 C 1.839844 -1.898438 1.945313 -1.507813 2.15625 -1.257813 C 2.367188 -1.003906 2.621094 -0.878906 2.925781 -0.878906 C 3.210938 -0.878906 3.453125 -0.992188 3.644531 -1.226563 C 3.835938 -1.453125 3.929688 -1.835938 3.933594 -2.367188 C 3.929688 -2.859375 3.832031 -3.222656 3.636719 -3.464844 C 3.4375 -3.699219 3.191406 -3.820313 2.902344 -3.824219 C 2.59375 -3.820313 2.34375 -3.703125 2.144531 -3.472656 C 1.941406 -3.234375 1.839844 -2.886719 1.84375 -2.429688 Z M 1.84375 -2.429688 "
-           id="path8501" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph21-4">
-        <path
-           style="stroke:none;"
-           d="M 3.742188 0 L 3.742188 -0.703125 C 3.570313 -0.453125 3.347656 -0.253906 3.070313 -0.109375 C 2.789063 0.0351563 2.492188 0.105469 2.1875 0.105469 C 1.867188 0.105469 1.585938 0.0390625 1.335938 -0.101563 C 1.085938 -0.234375 0.902344 -0.429688 0.792969 -0.683594 C 0.675781 -0.933594 0.621094 -1.28125 0.625 -1.726563 L 0.625 -4.699219 L 1.867188 -4.699219 L 1.867188 -2.539063 C 1.863281 -1.875 1.886719 -1.472656 1.933594 -1.324219 C 1.980469 -1.175781 2.0625 -1.054688 2.183594 -0.96875 C 2.304688 -0.878906 2.457031 -0.835938 2.644531 -0.839844 C 2.855469 -0.835938 3.046875 -0.894531 3.214844 -1.015625 C 3.382813 -1.128906 3.5 -1.273438 3.5625 -1.449219 C 3.625 -1.617188 3.65625 -2.042969 3.65625 -2.71875 L 3.65625 -4.699219 L 4.898438 -4.699219 L 4.898438 0 Z M 3.742188 0 "
-           id="path8504" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph22-0">
-        <path
-           style="stroke:none;"
-           d="M 0.410156 -3.320313 C 0.410156 -4.003906 0.511719 -4.578125 0.714844 -5.046875 C 0.867188 -5.382813 1.074219 -5.691406 1.339844 -5.96875 C 1.601563 -6.238281 1.890625 -6.441406 2.210938 -6.574219 C 2.625 -6.75 3.109375 -6.835938 3.660156 -6.839844 C 4.648438 -6.835938 5.441406 -6.53125 6.039063 -5.917969 C 6.632813 -5.300781 6.929688 -4.445313 6.933594 -3.355469 C 6.929688 -2.265625 6.636719 -1.414063 6.046875 -0.804688 C 5.453125 -0.191406 4.664063 0.109375 3.679688 0.113281 C 2.679688 0.109375 1.882813 -0.191406 1.292969 -0.800781 C 0.703125 -1.40625 0.410156 -2.246094 0.410156 -3.320313 Z M 1.808594 -3.367188 C 1.804688 -2.605469 1.980469 -2.027344 2.335938 -1.636719 C 2.683594 -1.238281 3.128906 -1.042969 3.675781 -1.046875 C 4.214844 -1.042969 4.660156 -1.238281 5.007813 -1.628906 C 5.351563 -2.019531 5.523438 -2.605469 5.527344 -3.386719 C 5.523438 -4.15625 5.355469 -4.730469 5.019531 -5.109375 C 4.679688 -5.488281 4.230469 -5.675781 3.675781 -5.679688 C 3.113281 -5.675781 2.664063 -5.484375 2.320313 -5.101563 C 1.976563 -4.714844 1.804688 -4.136719 1.808594 -3.367188 Z M 1.808594 -3.367188 "
-           id="path8507" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph22-1">
-        <path
-           style="stroke:none;"
-           d="M 0.578125 -4.871094 L 1.765625 -4.871094 L 1.765625 -4.207031 C 2.1875 -4.722656 2.695313 -4.980469 3.285156 -4.980469 C 3.59375 -4.980469 3.863281 -4.914063 4.097656 -4.789063 C 4.324219 -4.65625 4.511719 -4.464844 4.660156 -4.207031 C 4.871094 -4.464844 5.101563 -4.65625 5.351563 -4.789063 C 5.597656 -4.914063 5.863281 -4.980469 6.148438 -4.980469 C 6.5 -4.980469 6.804688 -4.90625 7.054688 -4.761719 C 7.300781 -4.617188 7.484375 -4.40625 7.609375 -4.125 C 7.699219 -3.914063 7.746094 -3.574219 7.746094 -3.113281 L 7.746094 0 L 6.453125 0 L 6.453125 -2.785156 C 6.449219 -3.261719 6.40625 -3.574219 6.320313 -3.71875 C 6.199219 -3.902344 6.015625 -3.996094 5.769531 -3.996094 C 5.585938 -3.996094 5.417969 -3.9375 5.261719 -3.828125 C 5.101563 -3.714844 4.984375 -3.554688 4.917969 -3.34375 C 4.84375 -3.128906 4.808594 -2.792969 4.8125 -2.339844 L 4.8125 0 L 3.523438 0 L 3.523438 -2.671875 C 3.519531 -3.144531 3.496094 -3.449219 3.453125 -3.585938 C 3.40625 -3.722656 3.335938 -3.824219 3.242188 -3.894531 C 3.140625 -3.960938 3.011719 -3.996094 2.847656 -3.996094 C 2.648438 -3.996094 2.46875 -3.941406 2.3125 -3.835938 C 2.152344 -3.726563 2.039063 -3.570313 1.96875 -3.371094 C 1.898438 -3.167969 1.863281 -2.835938 1.867188 -2.367188 L 1.867188 0 L 0.578125 0 Z M 0.578125 -4.871094 "
-           id="path8510" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph22-2">
-        <path
-           style="stroke:none;"
-           d="M 0.636719 -4.871094 L 1.839844 -4.871094 L 1.839844 -4.15625 C 1.996094 -4.398438 2.207031 -4.597656 2.472656 -4.75 C 2.738281 -4.902344 3.03125 -4.980469 3.359375 -4.980469 C 3.921875 -4.980469 4.402344 -4.757813 4.796875 -4.316406 C 5.191406 -3.871094 5.390625 -3.253906 5.390625 -2.464844 C 5.390625 -1.648438 5.191406 -1.015625 4.792969 -0.566406 C 4.394531 -0.117188 3.910156 0.105469 3.347656 0.109375 C 3.074219 0.105469 2.832031 0.0546875 2.617188 -0.0507813 C 2.394531 -0.15625 2.164063 -0.339844 1.925781 -0.601563 L 1.925781 1.851563 L 0.636719 1.851563 Z M 1.914063 -2.519531 C 1.910156 -1.972656 2.019531 -1.566406 2.238281 -1.304688 C 2.457031 -1.042969 2.71875 -0.914063 3.03125 -0.914063 C 3.328125 -0.914063 3.578125 -1.03125 3.777344 -1.273438 C 3.976563 -1.507813 4.078125 -1.902344 4.078125 -2.453125 C 4.078125 -2.960938 3.972656 -3.339844 3.769531 -3.589844 C 3.5625 -3.835938 3.308594 -3.960938 3.007813 -3.964844 C 2.6875 -3.960938 2.425781 -3.839844 2.222656 -3.597656 C 2.011719 -3.355469 1.910156 -2.996094 1.914063 -2.519531 Z M 1.914063 -2.519531 "
-           id="path8513" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph22-3">
-        <path
-           style="stroke:none;"
-           d="M 4.753906 -1.199219 L 4.753906 0 L 0.234375 0 C 0.28125 -0.453125 0.425781 -0.878906 0.675781 -1.285156 C 0.917969 -1.6875 1.402344 -2.226563 2.125 -2.898438 C 2.703125 -3.4375 3.058594 -3.804688 3.191406 -4 C 3.367188 -4.261719 3.457031 -4.523438 3.460938 -4.789063 C 3.457031 -5.074219 3.378906 -5.296875 3.226563 -5.453125 C 3.070313 -5.605469 2.855469 -5.683594 2.585938 -5.683594 C 2.3125 -5.683594 2.101563 -5.601563 1.945313 -5.441406 C 1.785156 -5.277344 1.691406 -5.007813 1.671875 -4.632813 L 0.386719 -4.761719 C 0.460938 -5.46875 0.699219 -5.976563 1.105469 -6.289063 C 1.507813 -6.59375 2.011719 -6.75 2.621094 -6.753906 C 3.277344 -6.75 3.800781 -6.570313 4.183594 -6.214844 C 4.5625 -5.855469 4.75 -5.410156 4.753906 -4.882813 C 4.75 -4.574219 4.695313 -4.285156 4.589844 -4.015625 C 4.476563 -3.738281 4.304688 -3.453125 4.074219 -3.15625 C 3.914063 -2.957031 3.632813 -2.671875 3.230469 -2.296875 C 2.820313 -1.921875 2.5625 -1.671875 2.457031 -1.554688 C 2.34375 -1.429688 2.257813 -1.3125 2.191406 -1.199219 Z M 4.753906 -1.199219 "
-           id="path8516" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph22-4">
-        <path
-           style="stroke:none;"
-           d=""
-           id="path8519" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph23-0">
-        <path
-           style="stroke:none;"
-           d="M 0.683594 -6.695313 L 3.359375 -6.695313 C 3.886719 -6.691406 4.285156 -6.667969 4.546875 -6.628906 C 4.804688 -6.582031 5.035156 -6.492188 5.246094 -6.351563 C 5.449219 -6.210938 5.621094 -6.023438 5.757813 -5.789063 C 5.894531 -5.554688 5.964844 -5.292969 5.964844 -5.007813 C 5.964844 -4.691406 5.878906 -4.402344 5.710938 -4.144531 C 5.539063 -3.878906 5.308594 -3.683594 5.023438 -3.558594 C 5.429688 -3.433594 5.746094 -3.230469 5.964844 -2.949219 C 6.183594 -2.660156 6.292969 -2.324219 6.292969 -1.941406 C 6.292969 -1.632813 6.21875 -1.335938 6.078125 -1.050781 C 5.933594 -0.761719 5.742188 -0.535156 5.5 -0.363281 C 5.253906 -0.191406 4.953125 -0.0859375 4.59375 -0.046875 C 4.367188 -0.0195313 3.824219 -0.00390625 2.964844 0 L 0.683594 0 Z M 2.035156 -5.582031 L 2.035156 -4.03125 L 2.921875 -4.03125 C 3.445313 -4.03125 3.773438 -4.039063 3.90625 -4.054688 C 4.132813 -4.082031 4.3125 -4.160156 4.449219 -4.292969 C 4.578125 -4.421875 4.644531 -4.597656 4.648438 -4.816406 C 4.644531 -5.023438 4.589844 -5.191406 4.476563 -5.320313 C 4.359375 -5.449219 4.1875 -5.527344 3.96875 -5.558594 C 3.828125 -5.570313 3.445313 -5.578125 2.8125 -5.582031 Z M 2.035156 -2.917969 L 2.035156 -1.128906 L 3.289063 -1.128906 C 3.773438 -1.125 4.082031 -1.136719 4.214844 -1.167969 C 4.410156 -1.199219 4.574219 -1.289063 4.703125 -1.433594 C 4.828125 -1.574219 4.890625 -1.765625 4.894531 -2.003906 C 4.890625 -2.203125 4.84375 -2.371094 4.746094 -2.515625 C 4.648438 -2.652344 4.507813 -2.753906 4.324219 -2.820313 C 4.140625 -2.882813 3.742188 -2.917969 3.128906 -2.917969 Z M 2.035156 -2.917969 "
-           id="path8522" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph23-1">
-        <path
-           style="stroke:none;"
-           d="M 0.671875 0 L 0.671875 -6.695313 L 1.953125 -6.695313 L 1.953125 0 Z M 0.671875 0 "
-           id="path8525" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph23-2">
-        <path
-           style="stroke:none;"
-           d="M 0.375 -2.492188 C 0.375 -2.914063 0.476563 -3.328125 0.6875 -3.730469 C 0.894531 -4.125 1.195313 -4.429688 1.582031 -4.644531 C 1.964844 -4.851563 2.394531 -4.957031 2.871094 -4.960938 C 3.605469 -4.957031 4.207031 -4.71875 4.675781 -4.246094 C 5.144531 -3.765625 5.378906 -3.164063 5.378906 -2.4375 C 5.378906 -1.703125 5.140625 -1.09375 4.667969 -0.613281 C 4.195313 -0.132813 3.601563 0.105469 2.882813 0.109375 C 2.4375 0.105469 2.011719 0.0078125 1.609375 -0.191406 C 1.203125 -0.390625 0.894531 -0.683594 0.6875 -1.074219 C 0.476563 -1.457031 0.375 -1.929688 0.375 -2.492188 Z M 1.691406 -2.425781 C 1.6875 -1.941406 1.800781 -1.570313 2.03125 -1.320313 C 2.257813 -1.0625 2.539063 -0.9375 2.875 -0.9375 C 3.207031 -0.9375 3.488281 -1.0625 3.71875 -1.320313 C 3.941406 -1.570313 4.054688 -1.941406 4.058594 -2.433594 C 4.054688 -2.902344 3.941406 -3.269531 3.71875 -3.527344 C 3.488281 -3.785156 3.207031 -3.914063 2.875 -3.914063 C 2.539063 -3.914063 2.257813 -3.785156 2.03125 -3.527344 C 1.800781 -3.269531 1.6875 -2.902344 1.691406 -2.425781 Z M 1.691406 -2.425781 "
-           id="path8528" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph23-3">
-        <path
-           style="stroke:none;"
-           d="M 4.898438 -3.414063 L 3.636719 -3.1875 C 3.589844 -3.4375 3.492188 -3.625 3.34375 -3.757813 C 3.191406 -3.882813 2.996094 -3.949219 2.757813 -3.949219 C 2.433594 -3.949219 2.179688 -3.835938 1.992188 -3.617188 C 1.800781 -3.394531 1.703125 -3.027344 1.707031 -2.511719 C 1.703125 -1.933594 1.800781 -1.527344 1.996094 -1.292969 C 2.1875 -1.054688 2.449219 -0.9375 2.777344 -0.9375 C 3.019531 -0.9375 3.21875 -1.003906 3.375 -1.144531 C 3.53125 -1.277344 3.640625 -1.515625 3.703125 -1.859375 L 4.964844 -1.644531 C 4.828125 -1.0625 4.578125 -0.625 4.207031 -0.332031 C 3.835938 -0.0390625 3.339844 0.105469 2.71875 0.109375 C 2.007813 0.105469 1.441406 -0.117188 1.019531 -0.5625 C 0.597656 -1.007813 0.386719 -1.625 0.386719 -2.421875 C 0.386719 -3.21875 0.597656 -3.84375 1.019531 -4.289063 C 1.441406 -4.734375 2.011719 -4.957031 2.738281 -4.960938 C 3.324219 -4.957031 3.796875 -4.828125 4.148438 -4.578125 C 4.496094 -4.320313 4.746094 -3.933594 4.898438 -3.414063 Z M 4.898438 -3.414063 "
-           id="path8531" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph23-4">
-        <path
-           style="stroke:none;"
-           d="M 0.625 0 L 0.625 -6.695313 L 1.910156 -6.695313 L 1.910156 -3.140625 L 3.410156 -4.847656 L 4.992188 -4.847656 L 3.332031 -3.078125 L 5.109375 0 L 3.726563 0 L 2.507813 -2.179688 L 1.910156 -1.550781 L 1.910156 0 Z M 0.625 0 "
-           id="path8534" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph23-5">
-        <path
-           style="stroke:none;"
-           d="M 0.21875 -1.382813 L 1.507813 -1.578125 C 1.558594 -1.328125 1.671875 -1.140625 1.839844 -1.011719 C 2.003906 -0.878906 2.238281 -0.8125 2.542969 -0.816406 C 2.875 -0.8125 3.125 -0.875 3.292969 -1 C 3.402344 -1.082031 3.457031 -1.195313 3.460938 -1.34375 C 3.457031 -1.4375 3.425781 -1.519531 3.371094 -1.585938 C 3.300781 -1.644531 3.160156 -1.699219 2.941406 -1.753906 C 1.902344 -1.980469 1.246094 -2.191406 0.972656 -2.378906 C 0.589844 -2.640625 0.398438 -3.003906 0.398438 -3.46875 C 0.398438 -3.890625 0.5625 -4.242188 0.894531 -4.53125 C 1.226563 -4.8125 1.742188 -4.957031 2.4375 -4.960938 C 3.101563 -4.957031 3.59375 -4.847656 3.917969 -4.636719 C 4.238281 -4.417969 4.460938 -4.097656 4.585938 -3.675781 L 3.375 -3.453125 C 3.320313 -3.636719 3.21875 -3.78125 3.078125 -3.882813 C 2.929688 -3.980469 2.726563 -4.03125 2.460938 -4.035156 C 2.121094 -4.03125 1.878906 -3.984375 1.734375 -3.894531 C 1.636719 -3.824219 1.585938 -3.738281 1.589844 -3.636719 C 1.585938 -3.539063 1.628906 -3.460938 1.71875 -3.402344 C 1.828125 -3.3125 2.226563 -3.191406 2.914063 -3.039063 C 3.59375 -2.882813 4.074219 -2.691406 4.347656 -2.46875 C 4.613281 -2.238281 4.746094 -1.921875 4.75 -1.515625 C 4.746094 -1.066406 4.558594 -0.683594 4.191406 -0.367188 C 3.816406 -0.0507813 3.269531 0.105469 2.542969 0.109375 C 1.878906 0.105469 1.355469 -0.0273438 0.972656 -0.292969 C 0.589844 -0.558594 0.335938 -0.921875 0.21875 -1.382813 Z M 0.21875 -1.382813 "
-           id="path8537" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph24-0">
-        <path
-           style="stroke:none;"
-           d="M 6.464844 0 L 5.050781 0 L 4.488281 -1.464844 L 1.914063 -1.464844 L 1.378906 0 L 0 0 L 2.511719 -6.445313 L 3.886719 -6.445313 Z M 4.070313 -2.550781 L 3.183594 -4.941406 L 2.3125 -2.550781 Z M 4.070313 -2.550781 "
-           id="path8540" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph24-1">
-        <path
-           style="stroke:none;"
-           d="M 4.71875 -3.289063 L 3.5 -3.066406 C 3.453125 -3.308594 3.359375 -3.492188 3.21875 -3.617188 C 3.070313 -3.734375 2.882813 -3.796875 2.65625 -3.800781 C 2.34375 -3.796875 2.097656 -3.691406 1.917969 -3.480469 C 1.730469 -3.269531 1.640625 -2.914063 1.644531 -2.417969 C 1.640625 -1.859375 1.734375 -1.46875 1.921875 -1.242188 C 2.109375 -1.015625 2.359375 -0.902344 2.671875 -0.902344 C 2.90625 -0.902344 3.097656 -0.96875 3.246094 -1.101563 C 3.394531 -1.234375 3.5 -1.460938 3.566406 -1.789063 L 4.777344 -1.582031 C 4.648438 -1.023438 4.40625 -0.601563 4.050781 -0.320313 C 3.691406 -0.0351563 3.214844 0.105469 2.617188 0.105469 C 1.929688 0.105469 1.382813 -0.109375 0.980469 -0.539063 C 0.574219 -0.96875 0.375 -1.5625 0.375 -2.328125 C 0.375 -3.09375 0.578125 -3.695313 0.984375 -4.128906 C 1.390625 -4.554688 1.941406 -4.769531 2.636719 -4.773438 C 3.207031 -4.769531 3.65625 -4.648438 3.992188 -4.40625 C 4.324219 -4.160156 4.566406 -3.785156 4.71875 -3.289063 Z M 4.71875 -3.289063 "
-           id="path8543" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph24-2">
-        <path
-           style="stroke:none;"
-           d="M 4.777344 -2.371094 L 6.039063 -1.96875 C 5.84375 -1.265625 5.523438 -0.742188 5.074219 -0.402344 C 4.621094 -0.0625 4.050781 0.105469 3.363281 0.109375 C 2.503906 0.105469 1.796875 -0.183594 1.25 -0.765625 C 0.695313 -1.347656 0.421875 -2.148438 0.425781 -3.164063 C 0.421875 -4.234375 0.699219 -5.066406 1.257813 -5.664063 C 1.808594 -6.253906 2.535156 -6.550781 3.441406 -6.554688 C 4.226563 -6.550781 4.867188 -6.316406 5.363281 -5.855469 C 5.652344 -5.574219 5.875 -5.179688 6.023438 -4.667969 L 4.734375 -4.359375 C 4.65625 -4.695313 4.496094 -4.957031 4.257813 -5.152344 C 4.011719 -5.339844 3.71875 -5.4375 3.375 -5.441406 C 2.898438 -5.4375 2.507813 -5.265625 2.210938 -4.925781 C 1.910156 -4.582031 1.761719 -4.027344 1.765625 -3.261719 C 1.761719 -2.445313 1.910156 -1.863281 2.203125 -1.519531 C 2.496094 -1.171875 2.878906 -1 3.351563 -1.003906 C 3.695313 -1 3.992188 -1.109375 4.242188 -1.332031 C 4.492188 -1.546875 4.667969 -1.894531 4.777344 -2.371094 Z M 4.777344 -2.371094 "
-           id="path8546" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph24-3">
-        <path
-           style="stroke:none;"
-           d="M 0.609375 -4.667969 L 1.761719 -4.667969 L 1.761719 -3.984375 C 1.910156 -4.21875 2.113281 -4.40625 2.367188 -4.554688 C 2.621094 -4.695313 2.902344 -4.769531 3.21875 -4.773438 C 3.757813 -4.769531 4.21875 -4.558594 4.597656 -4.136719 C 4.972656 -3.710938 5.160156 -3.117188 5.164063 -2.359375 C 5.160156 -1.578125 4.972656 -0.972656 4.59375 -0.542969 C 4.210938 -0.109375 3.75 0.105469 3.210938 0.105469 C 2.949219 0.105469 2.714844 0.0546875 2.507813 -0.046875 C 2.296875 -0.148438 2.074219 -0.324219 1.847656 -0.574219 L 1.847656 1.777344 L 0.609375 1.777344 Z M 1.832031 -2.414063 C 1.828125 -1.886719 1.933594 -1.496094 2.144531 -1.25 C 2.351563 -0.996094 2.605469 -0.871094 2.90625 -0.875 C 3.191406 -0.871094 3.429688 -0.988281 3.621094 -1.21875 C 3.808594 -1.449219 3.902344 -1.824219 3.90625 -2.351563 C 3.902344 -2.835938 3.804688 -3.199219 3.613281 -3.441406 C 3.414063 -3.675781 3.171875 -3.796875 2.882813 -3.796875 C 2.578125 -3.796875 2.324219 -3.679688 2.128906 -3.445313 C 1.925781 -3.210938 1.828125 -2.867188 1.832031 -2.414063 Z M 1.832031 -2.414063 "
-           id="path8549" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph24-4">
-        <path
-           style="stroke:none;"
-           d="M 3.71875 0 L 3.71875 -0.699219 C 3.546875 -0.445313 3.324219 -0.25 3.046875 -0.109375 C 2.769531 0.0351563 2.476563 0.105469 2.171875 0.105469 C 1.855469 0.105469 1.574219 0.0390625 1.328125 -0.101563 C 1.074219 -0.234375 0.894531 -0.429688 0.785156 -0.679688 C 0.675781 -0.929688 0.621094 -1.273438 0.621094 -1.714844 L 0.621094 -4.667969 L 1.855469 -4.667969 L 1.855469 -2.523438 C 1.851563 -1.863281 1.875 -1.460938 1.921875 -1.316406 C 1.96875 -1.164063 2.050781 -1.046875 2.171875 -0.964844 C 2.289063 -0.875 2.441406 -0.832031 2.628906 -0.835938 C 2.835938 -0.832031 3.023438 -0.890625 3.195313 -1.007813 C 3.359375 -1.125 3.476563 -1.265625 3.539063 -1.4375 C 3.601563 -1.605469 3.632813 -2.027344 3.632813 -2.699219 L 3.632813 -4.667969 L 4.867188 -4.667969 L 4.867188 0 Z M 3.71875 0 "
-           id="path8552" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph24-5">
-        <path
-           style="stroke:none;"
-           d=""
-           id="path8555" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph25-0">
-        <path
-           style="stroke:none;"
-           d="M 6.558594 0 L 5.125 0 L 4.550781 -1.484375 L 1.941406 -1.484375 L 1.398438 0 L 0 0 L 2.546875 -6.535156 L 3.941406 -6.535156 Z M 4.128906 -2.585938 L 3.226563 -5.011719 L 2.34375 -2.585938 Z M 4.128906 -2.585938 "
-           id="path8558" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph25-1">
-        <path
-           style="stroke:none;"
-           d="M 4.785156 -3.335938 L 3.550781 -3.113281 C 3.507813 -3.355469 3.410156 -3.542969 3.265625 -3.667969 C 3.117188 -3.792969 2.925781 -3.855469 2.691406 -3.855469 C 2.378906 -3.855469 2.132813 -3.746094 1.945313 -3.53125 C 1.757813 -3.316406 1.664063 -2.957031 1.667969 -2.453125 C 1.664063 -1.886719 1.757813 -1.488281 1.949219 -1.261719 C 2.136719 -1.027344 2.390625 -0.914063 2.710938 -0.914063 C 2.945313 -0.914063 3.140625 -0.980469 3.292969 -1.117188 C 3.445313 -1.25 3.550781 -1.484375 3.617188 -1.816406 L 4.847656 -1.605469 C 4.714844 -1.039063 4.46875 -0.609375 4.109375 -0.324219 C 3.742188 -0.0351563 3.257813 0.105469 2.652344 0.105469 C 1.957031 0.105469 1.40625 -0.109375 0.996094 -0.546875 C 0.582031 -0.980469 0.375 -1.585938 0.378906 -2.363281 C 0.375 -3.140625 0.582031 -3.75 0.996094 -4.1875 C 1.410156 -4.621094 1.96875 -4.839844 2.675781 -4.84375 C 3.25 -4.839844 3.707031 -4.714844 4.050781 -4.46875 C 4.386719 -4.21875 4.632813 -3.839844 4.785156 -3.335938 Z M 4.785156 -3.335938 "
-           id="path8561" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph25-2">
-        <path
-           style="stroke:none;"
-           d="M 4.847656 -2.402344 L 6.125 -1.996094 C 5.925781 -1.28125 5.601563 -0.75 5.148438 -0.40625 C 4.691406 -0.0585938 4.113281 0.109375 3.410156 0.113281 C 2.542969 0.109375 1.828125 -0.183594 1.269531 -0.777344 C 0.710938 -1.367188 0.433594 -2.179688 0.433594 -3.210938 C 0.433594 -4.296875 0.710938 -5.140625 1.273438 -5.746094 C 1.832031 -6.34375 2.574219 -6.644531 3.492188 -6.648438 C 4.289063 -6.644531 4.9375 -6.410156 5.441406 -5.941406 C 5.734375 -5.660156 5.957031 -5.257813 6.109375 -4.734375 L 4.800781 -4.421875 C 4.722656 -4.761719 4.5625 -5.03125 4.316406 -5.226563 C 4.070313 -5.421875 3.773438 -5.519531 3.425781 -5.519531 C 2.9375 -5.519531 2.542969 -5.34375 2.246094 -4.996094 C 1.941406 -4.648438 1.792969 -4.085938 1.792969 -3.308594 C 1.792969 -2.480469 1.941406 -1.890625 2.238281 -1.542969 C 2.535156 -1.1875 2.921875 -1.011719 3.398438 -1.015625 C 3.75 -1.011719 4.050781 -1.125 4.304688 -1.347656 C 4.554688 -1.570313 4.734375 -1.921875 4.847656 -2.402344 Z M 4.847656 -2.402344 "
-           id="path8564" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph25-3">
-        <path
-           style="stroke:none;"
-           d="M 3.773438 0 L 3.773438 -0.710938 C 3.601563 -0.453125 3.375 -0.253906 3.09375 -0.109375 C 2.8125 0.0351563 2.515625 0.105469 2.203125 0.105469 C 1.882813 0.105469 1.597656 0.0390625 1.347656 -0.101563 C 1.09375 -0.238281 0.910156 -0.4375 0.796875 -0.691406 C 0.683594 -0.941406 0.625 -1.289063 0.628906 -1.738281 L 0.628906 -4.734375 L 1.882813 -4.734375 L 1.882813 -2.558594 C 1.878906 -1.890625 1.902344 -1.484375 1.949219 -1.335938 C 1.992188 -1.183594 2.078125 -1.0625 2.203125 -0.976563 C 2.324219 -0.886719 2.476563 -0.84375 2.667969 -0.847656 C 2.878906 -0.84375 3.070313 -0.902344 3.242188 -1.023438 C 3.40625 -1.136719 3.523438 -1.28125 3.589844 -1.460938 C 3.648438 -1.632813 3.679688 -2.058594 3.683594 -2.738281 L 3.683594 -4.734375 L 4.9375 -4.734375 L 4.9375 0 Z M 3.773438 0 "
-           id="path8567" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph25-4">
-        <path
-           style="stroke:none;"
-           d="M 5 0 L 3.835938 0 L 3.835938 -0.695313 C 3.640625 -0.421875 3.410156 -0.21875 3.148438 -0.0898438 C 2.882813 0.0429688 2.621094 0.105469 2.355469 0.105469 C 1.808594 0.105469 1.339844 -0.113281 0.957031 -0.550781 C 0.566406 -0.988281 0.375 -1.597656 0.375 -2.386719 C 0.375 -3.183594 0.5625 -3.792969 0.941406 -4.214844 C 1.316406 -4.628906 1.792969 -4.839844 2.371094 -4.84375 C 2.898438 -4.839844 3.355469 -4.621094 3.746094 -4.183594 L 3.746094 -6.535156 L 5 -6.535156 Z M 1.65625 -2.46875 C 1.652344 -1.960938 1.722656 -1.597656 1.863281 -1.375 C 2.066406 -1.042969 2.347656 -0.878906 2.710938 -0.882813 C 2.996094 -0.878906 3.242188 -1 3.445313 -1.25 C 3.648438 -1.492188 3.75 -1.859375 3.75 -2.351563 C 3.75 -2.890625 3.648438 -3.28125 3.453125 -3.523438 C 3.253906 -3.757813 3.003906 -3.878906 2.703125 -3.882813 C 2.402344 -3.878906 2.15625 -3.761719 1.957031 -3.527344 C 1.753906 -3.292969 1.652344 -2.9375 1.65625 -2.46875 Z M 1.65625 -2.46875 "
-           id="path8570" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph25-5">
-        <path
-           style="stroke:none;"
-           d="M 1.59375 -3.289063 L 0.453125 -3.496094 C 0.578125 -3.953125 0.800781 -4.292969 1.113281 -4.511719 C 1.425781 -4.730469 1.890625 -4.839844 2.507813 -4.84375 C 3.0625 -4.839844 3.476563 -4.773438 3.753906 -4.644531 C 4.023438 -4.507813 4.214844 -4.339844 4.332031 -4.140625 C 4.441406 -3.933594 4.5 -3.558594 4.5 -3.019531 L 4.484375 -1.554688 C 4.480469 -1.136719 4.5 -0.832031 4.542969 -0.636719 C 4.582031 -0.4375 4.660156 -0.226563 4.769531 0 L 3.53125 0 C 3.496094 -0.0820313 3.457031 -0.203125 3.410156 -0.371094 C 3.386719 -0.441406 3.371094 -0.492188 3.367188 -0.515625 C 3.152344 -0.308594 2.921875 -0.152344 2.679688 -0.0507813 C 2.429688 0.0546875 2.171875 0.105469 1.898438 0.105469 C 1.410156 0.105469 1.023438 -0.0234375 0.746094 -0.289063 C 0.460938 -0.550781 0.320313 -0.886719 0.324219 -1.292969 C 0.320313 -1.558594 0.386719 -1.796875 0.515625 -2.007813 C 0.644531 -2.214844 0.824219 -2.375 1.054688 -2.488281 C 1.28125 -2.597656 1.613281 -2.695313 2.050781 -2.78125 C 2.636719 -2.890625 3.042969 -2.992188 3.269531 -3.089844 L 3.269531 -3.214844 C 3.269531 -3.453125 3.207031 -3.625 3.089844 -3.730469 C 2.964844 -3.828125 2.742188 -3.878906 2.417969 -3.882813 C 2.191406 -3.878906 2.015625 -3.835938 1.894531 -3.75 C 1.765625 -3.664063 1.667969 -3.507813 1.59375 -3.289063 Z M 3.269531 -2.273438 C 3.105469 -2.21875 2.847656 -2.152344 2.503906 -2.082031 C 2.152344 -2.003906 1.925781 -1.933594 1.824219 -1.863281 C 1.65625 -1.746094 1.574219 -1.597656 1.578125 -1.421875 C 1.574219 -1.246094 1.640625 -1.09375 1.773438 -0.96875 C 1.902344 -0.835938 2.070313 -0.773438 2.273438 -0.777344 C 2.5 -0.773438 2.714844 -0.847656 2.921875 -1 C 3.070313 -1.109375 3.167969 -1.246094 3.21875 -1.414063 C 3.25 -1.515625 3.269531 -1.71875 3.269531 -2.023438 Z M 3.269531 -2.273438 "
-           id="path8573" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph26-0">
-        <path
-           style="stroke:none;"
-           d="M 0.605469 0.804688 C 0.832031 0.800781 1.027344 0.882813 1.191406 1.046875 C 1.347656 1.207031 1.429688 1.402344 1.433594 1.632813 C 1.429688 1.863281 1.347656 2.058594 1.191406 2.222656 C 1.027344 2.378906 0.832031 2.460938 0.605469 2.464844 C 0.371094 2.460938 0.175781 2.378906 0.0117188 2.222656 C -0.148438 2.058594 -0.226563 1.863281 -0.230469 1.632813 C -0.226563 1.402344 -0.148438 1.207031 0.0117188 1.046875 C 0.175781 0.882813 0.371094 0.800781 0.605469 0.804688 Z M 0.605469 0.804688 "
-           id="path8576" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph27-0">
-        <path
-           style="stroke:none;"
-           d="M 1.632813 0 L 0.03125 -6.707031 L 1.417969 -6.707031 L 2.429688 -2.101563 L 3.65625 -6.707031 L 5.265625 -6.707031 L 6.441406 -2.023438 L 7.472656 -6.707031 L 8.835938 -6.707031 L 7.207031 0 L 5.769531 0 L 4.433594 -5.015625 L 3.101563 0 Z M 1.632813 0 "
-           id="path8579" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph27-1">
-        <path
-           style="stroke:none;"
-           d="M 0.375 -2.5 C 0.375 -2.921875 0.480469 -3.332031 0.691406 -3.734375 C 0.902344 -4.132813 1.199219 -4.441406 1.585938 -4.652344 C 1.96875 -4.863281 2.398438 -4.96875 2.878906 -4.96875 C 3.609375 -4.96875 4.210938 -4.726563 4.683594 -4.25 C 5.152344 -3.769531 5.390625 -3.167969 5.390625 -2.441406 C 5.390625 -1.703125 5.152344 -1.09375 4.675781 -0.613281 C 4.199219 -0.132813 3.601563 0.105469 2.886719 0.109375 C 2.4375 0.105469 2.011719 0.0078125 1.613281 -0.191406 C 1.207031 -0.390625 0.902344 -0.683594 0.691406 -1.074219 C 0.480469 -1.460938 0.375 -1.9375 0.375 -2.5 Z M 1.691406 -2.429688 C 1.6875 -1.945313 1.800781 -1.574219 2.035156 -1.320313 C 2.261719 -1.0625 2.546875 -0.9375 2.882813 -0.9375 C 3.21875 -0.9375 3.5 -1.0625 3.726563 -1.320313 C 3.953125 -1.574219 4.066406 -1.949219 4.066406 -2.4375 C 4.066406 -2.914063 3.953125 -3.277344 3.726563 -3.535156 C 3.5 -3.789063 3.21875 -3.917969 2.882813 -3.921875 C 2.546875 -3.917969 2.261719 -3.789063 2.035156 -3.535156 C 1.800781 -3.277344 1.6875 -2.910156 1.691406 -2.429688 Z M 1.691406 -2.429688 "
-           id="path8582" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph27-2">
-        <path
-           style="stroke:none;"
-           d="M 1.902344 0 L 0.617188 0 L 0.617188 -4.859375 L 1.8125 -4.859375 L 1.8125 -4.167969 C 2.011719 -4.488281 2.195313 -4.703125 2.363281 -4.8125 C 2.523438 -4.914063 2.710938 -4.96875 2.917969 -4.96875 C 3.210938 -4.96875 3.492188 -4.886719 3.765625 -4.726563 L 3.367188 -3.605469 C 3.148438 -3.742188 2.945313 -3.8125 2.761719 -3.816406 C 2.578125 -3.8125 2.425781 -3.765625 2.304688 -3.667969 C 2.175781 -3.566406 2.078125 -3.386719 2.007813 -3.128906 C 1.933594 -2.871094 1.898438 -2.328125 1.902344 -1.5 Z M 1.902344 0 "
-           id="path8585" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph27-3">
-        <path
-           style="stroke:none;"
-           d="M 0.625 0 L 0.625 -6.707031 L 1.914063 -6.707031 L 1.914063 -3.148438 L 3.417969 -4.859375 L 5 -4.859375 L 3.339844 -3.082031 L 5.121094 0 L 3.734375 0 L 2.511719 -2.183594 L 1.914063 -1.554688 L 1.914063 0 Z M 0.625 0 "
-           id="path8588" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph27-4">
-        <path
-           style="stroke:none;"
-           d="M 0.675781 -6.707031 L 3.152344 -6.707031 C 3.707031 -6.703125 4.132813 -6.660156 4.429688 -6.578125 C 4.820313 -6.460938 5.15625 -6.253906 5.4375 -5.960938 C 5.71875 -5.664063 5.929688 -5.304688 6.078125 -4.875 C 6.222656 -4.445313 6.296875 -3.914063 6.300781 -3.289063 C 6.296875 -2.734375 6.230469 -2.261719 6.09375 -1.863281 C 5.925781 -1.371094 5.683594 -0.972656 5.375 -0.675781 C 5.136719 -0.445313 4.820313 -0.269531 4.425781 -0.140625 C 4.121094 -0.046875 3.722656 0 3.226563 0 L 0.675781 0 Z M 2.03125 -5.574219 L 2.03125 -1.128906 L 3.042969 -1.128906 C 3.417969 -1.125 3.691406 -1.148438 3.863281 -1.195313 C 4.078125 -1.25 4.261719 -1.339844 4.40625 -1.472656 C 4.550781 -1.601563 4.667969 -1.820313 4.761719 -2.121094 C 4.851563 -2.417969 4.898438 -2.824219 4.898438 -3.347656 C 4.898438 -3.863281 4.851563 -4.261719 4.761719 -4.542969 C 4.667969 -4.816406 4.539063 -5.035156 4.378906 -5.191406 C 4.210938 -5.347656 4.003906 -5.453125 3.75 -5.507813 C 3.5625 -5.550781 3.191406 -5.570313 2.640625 -5.574219 Z M 2.03125 -5.574219 "
-           id="path8591" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph27-5">
-        <path
-           style="stroke:none;"
-           d="M 0.671875 -5.519531 L 0.671875 -6.707031 L 1.957031 -6.707031 L 1.957031 -5.519531 Z M 0.671875 0 L 0.671875 -4.859375 L 1.957031 -4.859375 L 1.957031 0 Z M 0.671875 0 "
-           id="path8594" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph27-6">
-        <path
-           style="stroke:none;"
-           d="M 2.007813 0 L 0.0507813 -4.859375 L 1.398438 -4.859375 L 2.316406 -2.378906 L 2.582031 -1.550781 C 2.648438 -1.757813 2.691406 -1.898438 2.714844 -1.96875 C 2.753906 -2.101563 2.800781 -2.238281 2.851563 -2.378906 L 3.773438 -4.859375 L 5.097656 -4.859375 L 3.167969 0 Z M 2.007813 0 "
-           id="path8597" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph27-7">
-        <path
-           style="stroke:none;"
-           d="M 0.664063 0 L 0.664063 -6.707031 L 2.691406 -6.707031 L 3.90625 -2.132813 L 5.109375 -6.707031 L 7.140625 -6.707031 L 7.140625 0 L 5.882813 0 L 5.882813 -5.28125 L 4.550781 0 L 3.25 0 L 1.921875 -5.28125 L 1.921875 0 Z M 0.664063 0 "
-           id="path8600" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph27-8">
-        <path
-           style="stroke:none;"
-           d="M 3.484375 -1.546875 L 4.765625 -1.332031 C 4.597656 -0.855469 4.339844 -0.5 3.984375 -0.257813 C 3.628906 -0.015625 3.183594 0.105469 2.652344 0.109375 C 1.804688 0.105469 1.179688 -0.167969 0.777344 -0.71875 C 0.453125 -1.160156 0.292969 -1.71875 0.296875 -2.394531 C 0.292969 -3.195313 0.503906 -3.824219 0.925781 -4.285156 C 1.347656 -4.738281 1.878906 -4.96875 2.527344 -4.96875 C 3.246094 -4.96875 3.816406 -4.726563 4.234375 -4.25 C 4.652344 -3.769531 4.851563 -3.039063 4.835938 -2.058594 L 1.613281 -2.058594 C 1.621094 -1.675781 1.726563 -1.378906 1.925781 -1.167969 C 2.121094 -0.953125 2.367188 -0.847656 2.667969 -0.851563 C 2.863281 -0.847656 3.035156 -0.902344 3.175781 -1.015625 C 3.3125 -1.121094 3.414063 -1.300781 3.484375 -1.546875 Z M 3.558594 -2.847656 C 3.546875 -3.214844 3.453125 -3.496094 3.269531 -3.691406 C 3.085938 -3.886719 2.863281 -3.984375 2.601563 -3.984375 C 2.320313 -3.984375 2.085938 -3.882813 1.90625 -3.679688 C 1.71875 -3.472656 1.628906 -3.195313 1.636719 -2.847656 Z M 3.558594 -2.847656 "
-           id="path8603" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph27-9">
-        <path
-           style="stroke:none;"
-           d="M 0.578125 -4.859375 L 1.761719 -4.859375 L 1.761719 -4.195313 C 2.183594 -4.710938 2.6875 -4.96875 3.277344 -4.96875 C 3.582031 -4.96875 3.851563 -4.902344 4.085938 -4.777344 C 4.3125 -4.644531 4.5 -4.453125 4.648438 -4.195313 C 4.859375 -4.453125 5.089844 -4.644531 5.339844 -4.777344 C 5.582031 -4.902344 5.847656 -4.96875 6.128906 -4.96875 C 6.488281 -4.96875 6.789063 -4.894531 7.035156 -4.75 C 7.28125 -4.605469 7.464844 -4.394531 7.589844 -4.113281 C 7.675781 -3.902344 7.722656 -3.566406 7.722656 -3.105469 L 7.722656 0 L 6.4375 0 L 6.4375 -2.777344 C 6.433594 -3.257813 6.390625 -3.566406 6.304688 -3.710938 C 6.183594 -3.890625 6 -3.984375 5.753906 -3.984375 C 5.574219 -3.984375 5.402344 -3.929688 5.246094 -3.820313 C 5.085938 -3.710938 4.972656 -3.546875 4.90625 -3.335938 C 4.832031 -3.121094 4.796875 -2.789063 4.800781 -2.332031 L 4.800781 0 L 3.515625 0 L 3.515625 -2.664063 C 3.515625 -3.132813 3.492188 -3.4375 3.445313 -3.578125 C 3.398438 -3.714844 3.324219 -3.816406 3.230469 -3.882813 C 3.132813 -3.949219 3.003906 -3.984375 2.839844 -3.984375 C 2.640625 -3.984375 2.460938 -3.929688 2.304688 -3.824219 C 2.140625 -3.714844 2.027344 -3.5625 1.964844 -3.363281 C 1.894531 -3.160156 1.863281 -2.824219 1.863281 -2.359375 L 1.863281 0 L 0.578125 0 Z M 0.578125 -4.859375 "
-           id="path8606" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph27-10">
-        <path
-           style="stroke:none;"
-           d="M 0.617188 0 L 0.617188 -6.707031 L 1.902344 -6.707031 L 1.902344 -4.292969 C 2.296875 -4.742188 2.765625 -4.96875 3.3125 -4.96875 C 3.898438 -4.96875 4.390625 -4.753906 4.78125 -4.324219 C 5.167969 -3.894531 5.359375 -3.277344 5.363281 -2.480469 C 5.359375 -1.644531 5.164063 -1.007813 4.769531 -0.5625 C 4.371094 -0.117188 3.890625 0.105469 3.332031 0.109375 C 3.050781 0.105469 2.777344 0.0390625 2.507813 -0.0976563 C 2.238281 -0.234375 2.003906 -0.441406 1.8125 -0.714844 L 1.8125 0 Z M 1.894531 -2.535156 C 1.890625 -2.03125 1.972656 -1.660156 2.132813 -1.417969 C 2.351563 -1.074219 2.644531 -0.902344 3.019531 -0.90625 C 3.300781 -0.902344 3.542969 -1.023438 3.742188 -1.269531 C 3.941406 -1.507813 4.042969 -1.890625 4.042969 -2.414063 C 4.042969 -2.96875 3.941406 -3.367188 3.742188 -3.617188 C 3.539063 -3.859375 3.28125 -3.984375 2.96875 -3.984375 C 2.660156 -3.984375 2.402344 -3.863281 2.199219 -3.625 C 1.992188 -3.382813 1.890625 -3.019531 1.894531 -2.535156 Z M 1.894531 -2.535156 "
-           id="path8609" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph28-0">
-        <path
-           style="stroke:none;"
-           d="M 1.617188 0 L 0.03125 -6.640625 L 1.40625 -6.640625 L 2.40625 -2.078125 L 3.621094 -6.640625 L 5.214844 -6.640625 L 6.378906 -2.003906 L 7.398438 -6.640625 L 8.746094 -6.640625 L 7.136719 0 L 5.710938 0 L 4.390625 -4.964844 L 3.070313 0 Z M 1.617188 0 "
-           id="path8612" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph28-1">
-        <path
-           style="stroke:none;"
-           d="M 0.371094 -2.472656 C 0.367188 -2.894531 0.472656 -3.300781 0.683594 -3.699219 C 0.890625 -4.089844 1.1875 -4.394531 1.570313 -4.605469 C 1.953125 -4.8125 2.378906 -4.914063 2.847656 -4.917969 C 3.574219 -4.914063 4.167969 -4.679688 4.636719 -4.207031 C 5.097656 -3.734375 5.332031 -3.136719 5.335938 -2.417969 C 5.332031 -1.6875 5.097656 -1.085938 4.628906 -0.609375 C 4.160156 -0.132813 3.570313 0.105469 2.859375 0.109375 C 2.414063 0.105469 1.992188 0.0078125 1.59375 -0.1875 C 1.195313 -0.386719 0.890625 -0.679688 0.683594 -1.066406 C 0.472656 -1.449219 0.367188 -1.917969 0.371094 -2.472656 Z M 1.675781 -2.40625 C 1.675781 -1.925781 1.789063 -1.558594 2.015625 -1.308594 C 2.242188 -1.050781 2.519531 -0.925781 2.855469 -0.929688 C 3.183594 -0.925781 3.460938 -1.050781 3.6875 -1.308594 C 3.910156 -1.558594 4.023438 -1.929688 4.027344 -2.414063 C 4.023438 -2.882813 3.910156 -3.242188 3.6875 -3.5 C 3.460938 -3.75 3.183594 -3.878906 2.855469 -3.882813 C 2.519531 -3.878906 2.242188 -3.75 2.015625 -3.5 C 1.789063 -3.242188 1.675781 -2.878906 1.675781 -2.40625 Z M 1.675781 -2.40625 "
-           id="path8615" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph28-2">
-        <path
-           style="stroke:none;"
-           d="M 1.882813 0 L 0.613281 0 L 0.613281 -4.8125 L 1.792969 -4.8125 L 1.792969 -4.128906 C 1.996094 -4.449219 2.175781 -4.660156 2.339844 -4.765625 C 2.496094 -4.863281 2.679688 -4.914063 2.890625 -4.917969 C 3.175781 -4.914063 3.457031 -4.835938 3.726563 -4.679688 L 3.335938 -3.570313 C 3.117188 -3.703125 2.917969 -3.773438 2.734375 -3.777344 C 2.554688 -3.773438 2.402344 -3.722656 2.28125 -3.628906 C 2.15625 -3.527344 2.058594 -3.351563 1.988281 -3.097656 C 1.914063 -2.839844 1.878906 -2.300781 1.882813 -1.484375 Z M 1.882813 0 "
-           id="path8618" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph28-3">
-        <path
-           style="stroke:none;"
-           d="M 0.621094 0 L 0.621094 -6.640625 L 1.894531 -6.640625 L 1.894531 -3.117188 L 3.382813 -4.8125 L 4.953125 -4.8125 L 3.308594 -3.054688 L 5.070313 0 L 3.695313 0 L 2.488281 -2.160156 L 1.894531 -1.539063 L 1.894531 0 Z M 0.621094 0 "
-           id="path8621" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph28-4">
-        <path
-           style="stroke:none;"
-           d="M 0.671875 -6.640625 L 3.121094 -6.640625 C 3.671875 -6.636719 4.09375 -6.597656 4.386719 -6.515625 C 4.773438 -6.398438 5.105469 -6.191406 5.386719 -5.902344 C 5.660156 -5.605469 5.871094 -5.25 6.019531 -4.828125 C 6.160156 -4.402344 6.234375 -3.878906 6.238281 -3.257813 C 6.234375 -2.710938 6.167969 -2.238281 6.035156 -1.84375 C 5.867188 -1.355469 5.628906 -0.964844 5.324219 -0.671875 C 5.085938 -0.441406 4.769531 -0.265625 4.378906 -0.140625 C 4.082031 -0.046875 3.6875 0 3.195313 0 L 0.671875 0 Z M 2.011719 -5.519531 L 2.011719 -1.117188 L 3.011719 -1.117188 C 3.386719 -1.113281 3.65625 -1.136719 3.824219 -1.183594 C 4.039063 -1.238281 4.21875 -1.328125 4.363281 -1.457031 C 4.503906 -1.585938 4.621094 -1.800781 4.714844 -2.097656 C 4.804688 -2.394531 4.851563 -2.800781 4.851563 -3.316406 C 4.851563 -3.828125 4.804688 -4.21875 4.714844 -4.496094 C 4.621094 -4.769531 4.492188 -4.984375 4.332031 -5.140625 C 4.167969 -5.289063 3.964844 -5.394531 3.714844 -5.453125 C 3.527344 -5.496094 3.160156 -5.519531 2.613281 -5.519531 Z M 2.011719 -5.519531 "
-           id="path8624" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph28-5">
-        <path
-           style="stroke:none;"
-           d="M 0.664063 -5.464844 L 0.664063 -6.640625 L 1.9375 -6.640625 L 1.9375 -5.464844 Z M 0.664063 0 L 0.664063 -4.8125 L 1.9375 -4.8125 L 1.9375 0 Z M 0.664063 0 "
-           id="path8627" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph28-6">
-        <path
-           style="stroke:none;"
-           d="M 1.988281 0 L 0.0507813 -4.8125 L 1.386719 -4.8125 L 2.292969 -2.355469 L 2.554688 -1.535156 C 2.625 -1.742188 2.667969 -1.878906 2.6875 -1.949219 C 2.726563 -2.082031 2.773438 -2.21875 2.824219 -2.355469 L 3.738281 -4.8125 L 5.046875 -4.8125 L 3.136719 0 Z M 1.988281 0 "
-           id="path8630" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph28-7">
-        <path
-           style="stroke:none;"
-           d="M 4.925781 -2.441406 L 6.222656 -2.03125 C 6.023438 -1.300781 5.691406 -0.761719 5.230469 -0.414063 C 4.765625 -0.0625 4.175781 0.109375 3.464844 0.113281 C 2.582031 0.109375 1.859375 -0.1875 1.289063 -0.789063 C 0.71875 -1.390625 0.433594 -2.214844 0.4375 -3.261719 C 0.433594 -4.363281 0.71875 -5.222656 1.292969 -5.835938 C 1.863281 -6.445313 2.613281 -6.75 3.546875 -6.753906 C 4.355469 -6.75 5.015625 -6.511719 5.527344 -6.035156 C 5.824219 -5.75 6.050781 -5.339844 6.207031 -4.8125 L 4.878906 -4.492188 C 4.796875 -4.835938 4.632813 -5.109375 4.386719 -5.308594 C 4.136719 -5.507813 3.835938 -5.605469 3.480469 -5.609375 C 2.988281 -5.605469 2.585938 -5.429688 2.28125 -5.078125 C 1.96875 -4.722656 1.816406 -4.152344 1.820313 -3.363281 C 1.816406 -2.519531 1.96875 -1.917969 2.273438 -1.566406 C 2.574219 -1.207031 2.964844 -1.03125 3.453125 -1.03125 C 3.804688 -1.03125 4.113281 -1.144531 4.371094 -1.371094 C 4.628906 -1.597656 4.8125 -1.953125 4.925781 -2.441406 Z M 4.925781 -2.441406 "
-           id="path8633" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph28-8">
-        <path
-           style="stroke:none;"
-           d="M 3.832031 0 L 3.832031 -0.71875 C 3.65625 -0.460938 3.425781 -0.257813 3.140625 -0.113281 C 2.855469 0.0351563 2.554688 0.105469 2.238281 0.109375 C 1.914063 0.105469 1.621094 0.0390625 1.367188 -0.101563 C 1.109375 -0.242188 0.925781 -0.441406 0.8125 -0.699219 C 0.695313 -0.953125 0.636719 -1.308594 0.640625 -1.765625 L 0.640625 -4.8125 L 1.910156 -4.8125 L 1.910156 -2.601563 C 1.910156 -1.921875 1.933594 -1.503906 1.980469 -1.355469 C 2.027344 -1.199219 2.109375 -1.078125 2.234375 -0.992188 C 2.355469 -0.898438 2.515625 -0.855469 2.710938 -0.859375 C 2.925781 -0.855469 3.121094 -0.914063 3.292969 -1.039063 C 3.464844 -1.15625 3.582031 -1.304688 3.644531 -1.484375 C 3.707031 -1.65625 3.738281 -2.089844 3.742188 -2.78125 L 3.742188 -4.8125 L 5.015625 -4.8125 L 5.015625 0 Z M 3.832031 0 "
-           id="path8636" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph28-9">
-        <path
-           style="stroke:none;"
-           d="M 5.078125 0 L 3.894531 0 L 3.894531 -0.707031 C 3.695313 -0.429688 3.464844 -0.222656 3.199219 -0.0898438 C 2.933594 0.0429688 2.664063 0.105469 2.390625 0.109375 C 1.835938 0.105469 1.359375 -0.113281 0.96875 -0.558594 C 0.570313 -1 0.375 -1.621094 0.378906 -2.421875 C 0.375 -3.234375 0.566406 -3.855469 0.953125 -4.28125 C 1.335938 -4.703125 1.820313 -4.914063 2.410156 -4.917969 C 2.941406 -4.914063 3.40625 -4.691406 3.804688 -4.25 L 3.804688 -6.640625 L 5.078125 -6.640625 Z M 1.679688 -2.507813 C 1.675781 -1.992188 1.746094 -1.621094 1.894531 -1.394531 C 2.097656 -1.0625 2.386719 -0.894531 2.753906 -0.898438 C 3.046875 -0.894531 3.292969 -1.019531 3.5 -1.269531 C 3.703125 -1.515625 3.808594 -1.886719 3.808594 -2.386719 C 3.808594 -2.933594 3.707031 -3.332031 3.507813 -3.578125 C 3.308594 -3.820313 3.054688 -3.941406 2.746094 -3.945313 C 2.441406 -3.941406 2.1875 -3.824219 1.984375 -3.585938 C 1.777344 -3.34375 1.675781 -2.984375 1.679688 -2.507813 Z M 1.679688 -2.507813 "
-           id="path8639" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph28-10">
-        <path
-           style="stroke:none;"
-           d="M 1.617188 -3.34375 L 0.460938 -3.550781 C 0.589844 -4.015625 0.8125 -4.359375 1.132813 -4.582031 C 1.445313 -4.804688 1.917969 -4.914063 2.546875 -4.917969 C 3.113281 -4.914063 3.535156 -4.847656 3.8125 -4.714844 C 4.089844 -4.578125 4.285156 -4.40625 4.398438 -4.203125 C 4.511719 -3.992188 4.570313 -3.613281 4.570313 -3.066406 L 4.558594 -1.582031 C 4.558594 -1.15625 4.578125 -0.84375 4.617188 -0.644531 C 4.65625 -0.441406 4.730469 -0.226563 4.847656 0 L 3.585938 0 C 3.554688 -0.0820313 3.511719 -0.207031 3.464844 -0.375 C 3.441406 -0.449219 3.429688 -0.5 3.421875 -0.527344 C 3.203125 -0.3125 2.96875 -0.152344 2.722656 -0.046875 C 2.472656 0.0546875 2.207031 0.105469 1.929688 0.109375 C 1.429688 0.105469 1.039063 -0.0273438 0.757813 -0.292969 C 0.46875 -0.5625 0.328125 -0.902344 0.332031 -1.3125 C 0.328125 -1.585938 0.390625 -1.828125 0.523438 -2.039063 C 0.648438 -2.25 0.832031 -2.410156 1.070313 -2.527344 C 1.300781 -2.636719 1.636719 -2.738281 2.082031 -2.828125 C 2.675781 -2.9375 3.089844 -3.042969 3.320313 -3.140625 L 3.320313 -3.265625 C 3.316406 -3.507813 3.253906 -3.683594 3.136719 -3.789063 C 3.011719 -3.890625 2.785156 -3.941406 2.457031 -3.945313 C 2.226563 -3.941406 2.050781 -3.898438 1.925781 -3.8125 C 1.796875 -3.722656 1.695313 -3.566406 1.617188 -3.34375 Z M 3.320313 -2.308594 C 3.152344 -2.253906 2.894531 -2.1875 2.542969 -2.117188 C 2.191406 -2.039063 1.960938 -1.964844 1.851563 -1.894531 C 1.6875 -1.773438 1.605469 -1.625 1.605469 -1.445313 C 1.605469 -1.265625 1.671875 -1.113281 1.804688 -0.984375 C 1.9375 -0.851563 2.105469 -0.785156 2.308594 -0.789063 C 2.539063 -0.785156 2.757813 -0.859375 2.96875 -1.015625 C 3.121094 -1.125 3.222656 -1.265625 3.269531 -1.4375 C 3.300781 -1.539063 3.316406 -1.746094 3.320313 -2.058594 Z M 3.320313 -2.308594 "
-           id="path8642" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph28-11">
-        <path
-           style="stroke:none;"
-           d="M 0.679688 -6.640625 L 3.335938 -6.640625 C 3.855469 -6.636719 4.246094 -6.617188 4.507813 -6.574219 C 4.765625 -6.53125 5 -6.4375 5.203125 -6.300781 C 5.40625 -6.160156 5.574219 -5.976563 5.710938 -5.746094 C 5.847656 -5.511719 5.917969 -5.253906 5.917969 -4.96875 C 5.917969 -4.65625 5.832031 -4.371094 5.664063 -4.113281 C 5.496094 -3.851563 5.269531 -3.65625 4.984375 -3.527344 C 5.386719 -3.410156 5.695313 -3.207031 5.914063 -2.925781 C 6.128906 -2.636719 6.238281 -2.304688 6.242188 -1.925781 C 6.238281 -1.621094 6.167969 -1.328125 6.03125 -1.042969 C 5.886719 -0.757813 5.695313 -0.527344 5.457031 -0.359375 C 5.210938 -0.1875 4.914063 -0.0859375 4.558594 -0.046875 C 4.332031 -0.0195313 3.792969 -0.00390625 2.941406 0 L 0.679688 0 Z M 2.019531 -5.535156 L 2.019531 -4 L 2.898438 -4 C 3.417969 -3.996094 3.742188 -4.003906 3.875 -4.023438 C 4.101563 -4.046875 4.28125 -4.125 4.414063 -4.257813 C 4.542969 -4.386719 4.609375 -4.5625 4.613281 -4.78125 C 4.609375 -4.984375 4.554688 -5.148438 4.441406 -5.277344 C 4.328125 -5.40625 4.160156 -5.484375 3.9375 -5.511719 C 3.804688 -5.523438 3.421875 -5.53125 2.789063 -5.535156 Z M 2.019531 -2.894531 L 2.019531 -1.117188 L 3.261719 -1.117188 C 3.742188 -1.113281 4.046875 -1.128906 4.179688 -1.160156 C 4.375 -1.195313 4.539063 -1.28125 4.667969 -1.425781 C 4.789063 -1.5625 4.851563 -1.75 4.855469 -1.988281 C 4.851563 -2.183594 4.804688 -2.355469 4.710938 -2.496094 C 4.613281 -2.632813 4.472656 -2.730469 4.292969 -2.796875 C 4.109375 -2.859375 3.710938 -2.894531 3.101563 -2.894531 Z M 2.019531 -2.894531 "
-           id="path8645" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph29-0">
-        <path
-           style="stroke:none;"
-           d="M 0.644531 0 L 0.644531 -6.761719 L 2.011719 -6.761719 L 2.011719 0 Z M 0.644531 0 "
-           id="path8648" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph29-1">
-        <path
-           style="stroke:none;"
-           d="M 5.167969 0 L 3.964844 0 L 3.964844 -0.71875 C 3.761719 -0.4375 3.527344 -0.230469 3.257813 -0.09375 C 2.984375 0.0390625 2.707031 0.105469 2.433594 0.109375 C 1.867188 0.105469 1.386719 -0.117188 0.988281 -0.570313 C 0.585938 -1.019531 0.386719 -1.652344 0.386719 -2.46875 C 0.386719 -3.292969 0.582031 -3.925781 0.972656 -4.359375 C 1.363281 -4.789063 1.855469 -5.003906 2.453125 -5.007813 C 2.996094 -5.003906 3.46875 -4.777344 3.875 -4.324219 L 3.875 -6.761719 L 5.167969 -6.761719 Z M 1.710938 -2.554688 C 1.710938 -2.03125 1.78125 -1.652344 1.925781 -1.421875 C 2.132813 -1.082031 2.425781 -0.914063 2.804688 -0.914063 C 3.097656 -0.914063 3.351563 -1.039063 3.5625 -1.292969 C 3.773438 -1.546875 3.878906 -1.925781 3.878906 -2.429688 C 3.878906 -2.988281 3.777344 -3.394531 3.574219 -3.644531 C 3.371094 -3.890625 3.109375 -4.011719 2.792969 -4.015625 C 2.484375 -4.011719 2.226563 -3.890625 2.019531 -3.648438 C 1.8125 -3.402344 1.710938 -3.039063 1.710938 -2.554688 Z M 1.710938 -2.554688 "
-           id="path8651" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph29-2">
-        <path
-           style="stroke:none;"
-           d="M 0.0546875 0 L 1.820313 -2.523438 L 0.128906 -4.898438 L 1.710938 -4.898438 L 2.578125 -3.550781 L 3.492188 -4.898438 L 5.011719 -4.898438 L 3.351563 -2.578125 L 5.164063 0 L 3.574219 0 L 2.578125 -1.515625 L 1.574219 0 Z M 0.0546875 0 "
-           id="path8654" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph29-3">
-        <path
-           style="stroke:none;"
-           d="M 3.832031 -2.484375 L 3.832031 -3.625 L 6.773438 -3.625 L 6.773438 -0.929688 C 6.484375 -0.652344 6.070313 -0.410156 5.53125 -0.199219 C 4.988281 0.0117188 4.4375 0.117188 3.882813 0.117188 C 3.171875 0.117188 2.554688 -0.03125 2.03125 -0.328125 C 1.503906 -0.625 1.109375 -1.046875 0.847656 -1.601563 C 0.582031 -2.148438 0.449219 -2.75 0.453125 -3.402344 C 0.449219 -4.105469 0.597656 -4.730469 0.894531 -5.28125 C 1.1875 -5.824219 1.621094 -6.246094 2.191406 -6.539063 C 2.621094 -6.761719 3.160156 -6.871094 3.808594 -6.875 C 4.644531 -6.871094 5.296875 -6.695313 5.773438 -6.347656 C 6.242188 -5.992188 6.546875 -5.507813 6.6875 -4.886719 L 5.332031 -4.632813 C 5.234375 -4.964844 5.054688 -5.226563 4.792969 -5.417969 C 4.527344 -5.609375 4.199219 -5.707031 3.808594 -5.707031 C 3.210938 -5.707031 2.734375 -5.515625 2.386719 -5.140625 C 2.03125 -4.757813 1.855469 -4.199219 1.859375 -3.457031 C 1.855469 -2.652344 2.035156 -2.050781 2.394531 -1.652344 C 2.75 -1.25 3.21875 -1.046875 3.796875 -1.050781 C 4.078125 -1.046875 4.363281 -1.105469 4.652344 -1.21875 C 4.941406 -1.328125 5.1875 -1.464844 5.394531 -1.628906 L 5.394531 -2.484375 Z M 3.832031 -2.484375 "
-           id="path8657" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph29-4">
-        <path
-           style="stroke:none;"
-           d="M 0.621094 0 L 0.621094 -6.761719 L 1.917969 -6.761719 L 1.917969 -4.324219 C 2.316406 -4.777344 2.792969 -5.003906 3.339844 -5.007813 C 3.933594 -5.003906 4.425781 -4.789063 4.820313 -4.359375 C 5.207031 -3.925781 5.402344 -3.304688 5.40625 -2.5 C 5.402344 -1.660156 5.203125 -1.015625 4.808594 -0.566406 C 4.40625 -0.117188 3.921875 0.105469 3.355469 0.109375 C 3.074219 0.105469 2.796875 0.0390625 2.527344 -0.0976563 C 2.253906 -0.234375 2.019531 -0.441406 1.824219 -0.71875 L 1.824219 0 Z M 1.910156 -2.554688 C 1.910156 -2.046875 1.988281 -1.671875 2.148438 -1.429688 C 2.371094 -1.085938 2.667969 -0.914063 3.042969 -0.914063 C 3.324219 -0.914063 3.570313 -1.035156 3.773438 -1.277344 C 3.976563 -1.519531 4.078125 -1.902344 4.078125 -2.433594 C 4.078125 -2.988281 3.976563 -3.394531 3.773438 -3.644531 C 3.570313 -3.890625 3.308594 -4.011719 2.992188 -4.015625 C 2.679688 -4.011719 2.421875 -3.890625 2.21875 -3.652344 C 2.011719 -3.410156 1.910156 -3.046875 1.910156 -2.554688 Z M 1.910156 -2.554688 "
-           id="path8660" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph29-5">
-        <path
-           style="stroke:none;"
-           d="M 0.691406 0 L 0.691406 -6.761719 L 3.5625 -6.761719 C 4.285156 -6.761719 4.808594 -6.699219 5.136719 -6.578125 C 5.460938 -6.453125 5.722656 -6.238281 5.921875 -5.929688 C 6.117188 -5.617188 6.214844 -5.261719 6.21875 -4.863281 C 6.214844 -4.355469 6.066406 -3.9375 5.769531 -3.609375 C 5.46875 -3.277344 5.023438 -3.066406 4.4375 -2.984375 C 4.726563 -2.808594 4.96875 -2.621094 5.164063 -2.414063 C 5.351563 -2.207031 5.613281 -1.839844 5.945313 -1.320313 L 6.769531 0 L 5.136719 0 L 4.148438 -1.472656 C 3.796875 -1.992188 3.554688 -2.324219 3.429688 -2.460938 C 3.296875 -2.597656 3.160156 -2.691406 3.019531 -2.742188 C 2.871094 -2.792969 2.644531 -2.816406 2.332031 -2.820313 L 2.054688 -2.820313 L 2.054688 0 Z M 2.054688 -3.902344 L 3.066406 -3.902344 C 3.714844 -3.902344 4.125 -3.929688 4.292969 -3.984375 C 4.453125 -4.039063 4.582031 -4.132813 4.675781 -4.269531 C 4.765625 -4.402344 4.808594 -4.570313 4.8125 -4.777344 C 4.808594 -5 4.75 -5.183594 4.628906 -5.328125 C 4.507813 -5.464844 4.335938 -5.554688 4.117188 -5.59375 C 4.003906 -5.605469 3.671875 -5.613281 3.121094 -5.617188 L 2.054688 -5.617188 Z M 2.054688 -3.902344 "
-           id="path8663" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph29-6">
-        <path
-           style="stroke:none;"
-           d="M 3.515625 -1.558594 L 4.804688 -1.34375 C 4.636719 -0.867188 4.371094 -0.507813 4.015625 -0.261719 C 3.65625 -0.015625 3.210938 0.105469 2.675781 0.109375 C 1.820313 0.105469 1.191406 -0.167969 0.785156 -0.722656 C 0.460938 -1.167969 0.296875 -1.730469 0.300781 -2.410156 C 0.296875 -3.21875 0.511719 -3.855469 0.9375 -4.316406 C 1.359375 -4.773438 1.894531 -5.003906 2.546875 -5.007813 C 3.269531 -5.003906 3.84375 -4.761719 4.269531 -4.285156 C 4.6875 -3.800781 4.890625 -3.066406 4.875 -2.074219 L 1.628906 -2.074219 C 1.636719 -1.691406 1.738281 -1.390625 1.941406 -1.179688 C 2.136719 -0.960938 2.386719 -0.855469 2.6875 -0.859375 C 2.886719 -0.855469 3.058594 -0.910156 3.199219 -1.023438 C 3.335938 -1.132813 3.441406 -1.3125 3.515625 -1.558594 Z M 3.585938 -2.867188 C 3.578125 -3.238281 3.480469 -3.523438 3.296875 -3.722656 C 3.109375 -3.914063 2.886719 -4.011719 2.625 -4.015625 C 2.339844 -4.011719 2.105469 -3.910156 1.921875 -3.707031 C 1.734375 -3.5 1.644531 -3.21875 1.652344 -2.867188 Z M 3.585938 -2.867188 "
-           id="path8666" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph29-7">
-        <path
-           style="stroke:none;"
-           d="M 0.109375 -4.898438 L 0.828125 -4.898438 L 0.828125 -5.265625 C 0.824219 -5.675781 0.871094 -5.984375 0.960938 -6.1875 C 1.046875 -6.390625 1.207031 -6.554688 1.445313 -6.683594 C 1.675781 -6.808594 1.972656 -6.871094 2.335938 -6.875 C 2.703125 -6.871094 3.066406 -6.816406 3.421875 -6.710938 L 3.246094 -5.804688 C 3.039063 -5.851563 2.839844 -5.875 2.652344 -5.878906 C 2.460938 -5.875 2.324219 -5.832031 2.246094 -5.746094 C 2.160156 -5.65625 2.121094 -5.488281 2.125 -5.242188 L 2.125 -4.898438 L 3.09375 -4.898438 L 3.09375 -3.878906 L 2.125 -3.878906 L 2.125 0 L 0.828125 0 L 0.828125 -3.878906 L 0.109375 -3.878906 Z M 0.109375 -4.898438 "
-           id="path8669" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph30-0">
-        <path
-           style="stroke:none;"
-           d="M 0.640625 0 L 0.640625 -6.71875 L 1.996094 -6.71875 L 1.996094 0 Z M 0.640625 0 "
-           id="path8672" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph30-1">
-        <path
-           style="stroke:none;"
-           d="M 5.136719 0 L 3.941406 0 L 3.941406 -0.714844 C 3.742188 -0.433594 3.507813 -0.226563 3.238281 -0.09375 C 2.964844 0.0390625 2.691406 0.105469 2.417969 0.109375 C 1.859375 0.105469 1.382813 -0.117188 0.984375 -0.566406 C 0.585938 -1.015625 0.386719 -1.644531 0.386719 -2.453125 C 0.386719 -3.273438 0.578125 -3.898438 0.964844 -4.332031 C 1.351563 -4.757813 1.84375 -4.972656 2.4375 -4.976563 C 2.980469 -4.972656 3.449219 -4.746094 3.847656 -4.296875 L 3.847656 -6.71875 L 5.136719 -6.71875 Z M 1.699219 -2.539063 C 1.699219 -2.015625 1.769531 -1.640625 1.914063 -1.410156 C 2.121094 -1.070313 2.410156 -0.902344 2.785156 -0.90625 C 3.078125 -0.902344 3.332031 -1.027344 3.542969 -1.285156 C 3.75 -1.535156 3.855469 -1.914063 3.855469 -2.414063 C 3.855469 -2.972656 3.753906 -3.375 3.550781 -3.621094 C 3.347656 -3.867188 3.089844 -3.988281 2.777344 -3.992188 C 2.472656 -3.988281 2.214844 -3.867188 2.007813 -3.625 C 1.800781 -3.378906 1.699219 -3.015625 1.699219 -2.539063 Z M 1.699219 -2.539063 "
-           id="path8675" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph30-2">
-        <path
-           style="stroke:none;"
-           d="M 0.0546875 0 L 1.808594 -2.507813 L 0.128906 -4.867188 L 1.699219 -4.867188 L 2.5625 -3.527344 L 3.46875 -4.867188 L 4.980469 -4.867188 L 3.332031 -2.5625 L 5.132813 0 L 3.550781 0 L 2.5625 -1.507813 L 1.5625 0 Z M 0.0546875 0 "
-           id="path8678" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph30-3">
-        <path
-           style="stroke:none;"
-           d="M 0.6875 -6.71875 L 3.371094 -6.71875 C 3.898438 -6.714844 4.296875 -6.691406 4.558594 -6.652344 C 4.820313 -6.605469 5.054688 -6.515625 5.261719 -6.375 C 5.464844 -6.234375 5.636719 -6.046875 5.777344 -5.8125 C 5.914063 -5.578125 5.980469 -5.316406 5.984375 -5.027344 C 5.980469 -4.710938 5.898438 -4.421875 5.730469 -4.160156 C 5.558594 -3.894531 5.328125 -3.695313 5.039063 -3.570313 C 5.445313 -3.445313 5.757813 -3.242188 5.980469 -2.960938 C 6.199219 -2.671875 6.308594 -2.335938 6.3125 -1.949219 C 6.308594 -1.640625 6.238281 -1.34375 6.097656 -1.054688 C 5.953125 -0.765625 5.757813 -0.535156 5.515625 -0.363281 C 5.265625 -0.191406 4.964844 -0.0859375 4.609375 -0.046875 C 4.378906 -0.0195313 3.832031 -0.00390625 2.972656 0 L 0.6875 0 Z M 2.042969 -5.597656 L 2.042969 -4.046875 L 2.933594 -4.046875 C 3.460938 -4.042969 3.789063 -4.050781 3.917969 -4.070313 C 4.148438 -4.09375 4.332031 -4.171875 4.464844 -4.308594 C 4.597656 -4.4375 4.664063 -4.613281 4.664063 -4.835938 C 4.664063 -5.039063 4.605469 -5.207031 4.492188 -5.339844 C 4.375 -5.464844 4.203125 -5.546875 3.980469 -5.578125 C 3.84375 -5.589844 3.460938 -5.59375 2.824219 -5.597656 Z M 2.042969 -2.929688 L 2.042969 -1.132813 L 3.300781 -1.132813 C 3.785156 -1.128906 4.097656 -1.144531 4.230469 -1.171875 C 4.429688 -1.207031 4.589844 -1.292969 4.71875 -1.4375 C 4.84375 -1.578125 4.910156 -1.769531 4.910156 -2.011719 C 4.910156 -2.207031 4.859375 -2.378906 4.761719 -2.523438 C 4.664063 -2.660156 4.523438 -2.761719 4.339844 -2.832031 C 4.152344 -2.894531 3.753906 -2.929688 3.140625 -2.929688 Z M 2.042969 -2.929688 "
-           id="path8681" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph30-4">
-        <path
-           style="stroke:none;"
-           d="M 2.90625 -4.867188 L 2.90625 -3.839844 L 2.023438 -3.839844 L 2.023438 -1.878906 C 2.019531 -1.476563 2.027344 -1.246094 2.046875 -1.183594 C 2.0625 -1.117188 2.101563 -1.0625 2.164063 -1.019531 C 2.21875 -0.976563 2.292969 -0.953125 2.382813 -0.957031 C 2.5 -0.953125 2.671875 -0.996094 2.902344 -1.082031 L 3.011719 -0.0820313 C 2.710938 0.0429688 2.371094 0.105469 1.992188 0.109375 C 1.757813 0.105469 1.546875 0.0703125 1.363281 -0.0078125 C 1.175781 -0.0820313 1.039063 -0.183594 0.953125 -0.308594 C 0.863281 -0.429688 0.804688 -0.597656 0.773438 -0.8125 C 0.746094 -0.957031 0.730469 -1.257813 0.734375 -1.71875 L 0.734375 -3.839844 L 0.140625 -3.839844 L 0.140625 -4.867188 L 0.734375 -4.867188 L 0.734375 -5.832031 L 2.023438 -6.585938 L 2.023438 -4.867188 Z M 2.90625 -4.867188 "
-           id="path8684" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph30-5">
-        <path
-           style="stroke:none;"
-           d="M 0.101563 0 L 0.101563 -1.222656 L 3.628906 -5.582031 L 0.5 -5.582031 L 0.5 -6.71875 L 5.414063 -6.71875 L 5.414063 -5.664063 L 1.738281 -1.132813 L 5.558594 -1.132813 L 5.558594 0 Z M 0.101563 0 "
-           id="path8687" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph30-6">
-        <path
-           style="stroke:none;"
-           d="M 3.492188 -1.546875 L 4.773438 -1.332031 C 4.605469 -0.859375 4.34375 -0.503906 3.992188 -0.257813 C 3.632813 -0.015625 3.1875 0.105469 2.65625 0.109375 C 1.804688 0.105469 1.179688 -0.167969 0.777344 -0.71875 C 0.453125 -1.160156 0.292969 -1.71875 0.296875 -2.394531 C 0.292969 -3.199219 0.503906 -3.832031 0.929688 -4.289063 C 1.347656 -4.746094 1.882813 -4.972656 2.53125 -4.976563 C 3.253906 -4.972656 3.824219 -4.734375 4.242188 -4.257813 C 4.660156 -3.777344 4.859375 -3.046875 4.84375 -2.0625 L 1.617188 -2.0625 C 1.625 -1.679688 1.726563 -1.378906 1.929688 -1.167969 C 2.125 -0.953125 2.375 -0.847656 2.671875 -0.851563 C 2.871094 -0.847656 3.039063 -0.902344 3.179688 -1.015625 C 3.316406 -1.125 3.421875 -1.304688 3.492188 -1.546875 Z M 3.566406 -2.851563 C 3.554688 -3.222656 3.457031 -3.503906 3.273438 -3.699219 C 3.089844 -3.890625 2.867188 -3.988281 2.605469 -3.992188 C 2.324219 -3.988281 2.09375 -3.886719 1.910156 -3.683594 C 1.726563 -3.476563 1.636719 -3.199219 1.640625 -2.851563 Z M 3.566406 -2.851563 "
-           id="path8690" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph30-7">
-        <path
-           style="stroke:none;"
-           d="M 1.90625 0 L 0.617188 0 L 0.617188 -4.867188 L 1.816406 -4.867188 L 1.816406 -4.175781 C 2.019531 -4.5 2.203125 -4.714844 2.367188 -4.820313 C 2.53125 -4.921875 2.714844 -4.972656 2.921875 -4.976563 C 3.214844 -4.972656 3.496094 -4.894531 3.769531 -4.734375 L 3.371094 -3.609375 C 3.152344 -3.75 2.953125 -3.820313 2.765625 -3.820313 C 2.585938 -3.820313 2.433594 -3.769531 2.308594 -3.671875 C 2.183594 -3.570313 2.082031 -3.390625 2.011719 -3.132813 C 1.9375 -2.871094 1.902344 -2.328125 1.90625 -1.503906 Z M 1.90625 0 "
-           id="path8693" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph30-8">
-        <path
-           style="stroke:none;"
-           d="M 0.375 -2.5 C 0.375 -2.925781 0.480469 -3.339844 0.691406 -3.742188 C 0.902344 -4.140625 1.199219 -4.445313 1.585938 -4.660156 C 1.972656 -4.867188 2.40625 -4.972656 2.882813 -4.976563 C 3.617188 -4.972656 4.21875 -4.734375 4.691406 -4.257813 C 5.160156 -3.777344 5.394531 -3.171875 5.398438 -2.445313 C 5.394531 -1.707031 5.15625 -1.097656 4.683594 -0.617188 C 4.207031 -0.132813 3.609375 0.105469 2.890625 0.109375 C 2.441406 0.105469 2.015625 0.0078125 1.613281 -0.191406 C 1.207031 -0.390625 0.902344 -0.6875 0.691406 -1.078125 C 0.480469 -1.46875 0.375 -1.941406 0.375 -2.5 Z M 1.695313 -2.433594 C 1.691406 -1.949219 1.808594 -1.582031 2.039063 -1.324219 C 2.265625 -1.066406 2.546875 -0.9375 2.886719 -0.9375 C 3.21875 -0.9375 3.5 -1.066406 3.730469 -1.324219 C 3.957031 -1.582031 4.070313 -1.953125 4.074219 -2.441406 C 4.070313 -2.914063 3.957031 -3.28125 3.730469 -3.539063 C 3.5 -3.796875 3.21875 -3.925781 2.886719 -3.925781 C 2.546875 -3.925781 2.265625 -3.796875 2.039063 -3.539063 C 1.808594 -3.28125 1.691406 -2.910156 1.695313 -2.433594 Z M 1.695313 -2.433594 "
-           id="path8696" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph31-0">
-        <path
-           style="stroke:none;"
-           d="M 6.574219 0 L 5.132813 0 L 4.5625 -1.488281 L 1.945313 -1.488281 L 1.402344 0 L 0 0 L 2.550781 -6.550781 L 3.949219 -6.550781 Z M 4.136719 -2.589844 L 3.234375 -5.023438 L 2.351563 -2.589844 Z M 4.136719 -2.589844 "
-           id="path8699" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph31-1">
-        <path
-           style="stroke:none;"
-           d="M 2.832031 -4.746094 L 2.832031 -3.746094 L 1.976563 -3.746094 L 1.976563 -1.832031 C 1.972656 -1.441406 1.980469 -1.214844 2 -1.152344 C 2.011719 -1.085938 2.050781 -1.035156 2.109375 -0.996094 C 2.167969 -0.953125 2.238281 -0.929688 2.324219 -0.933594 C 2.4375 -0.929688 2.605469 -0.972656 2.828125 -1.054688 L 2.933594 -0.0820313 C 2.640625 0.0429688 2.3125 0.105469 1.945313 0.105469 C 1.71875 0.105469 1.511719 0.0703125 1.332031 -0.0078125 C 1.144531 -0.078125 1.011719 -0.175781 0.929688 -0.300781 C 0.84375 -0.417969 0.785156 -0.582031 0.753906 -0.789063 C 0.726563 -0.933594 0.714844 -1.230469 0.714844 -1.675781 L 0.714844 -3.746094 L 0.136719 -3.746094 L 0.136719 -4.746094 L 0.714844 -4.746094 L 0.714844 -5.6875 L 1.976563 -6.421875 L 1.976563 -4.746094 Z M 2.832031 -4.746094 "
-           id="path8702" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph31-2">
-        <path
-           style="stroke:none;"
-           d="M 0.367188 -2.441406 C 0.363281 -2.855469 0.464844 -3.257813 0.675781 -3.648438 C 0.878906 -4.035156 1.171875 -4.332031 1.546875 -4.542969 C 1.921875 -4.746094 2.339844 -4.851563 2.808594 -4.851563 C 3.523438 -4.851563 4.113281 -4.617188 4.574219 -4.152344 C 5.03125 -3.683594 5.261719 -3.09375 5.261719 -2.386719 C 5.261719 -1.664063 5.027344 -1.070313 4.566406 -0.601563 C 4.101563 -0.128906 3.519531 0.105469 2.820313 0.105469 C 2.382813 0.105469 1.96875 0.0078125 1.574219 -0.1875 C 1.179688 -0.382813 0.878906 -0.667969 0.675781 -1.050781 C 0.464844 -1.425781 0.363281 -1.890625 0.367188 -2.441406 Z M 1.652344 -2.371094 C 1.652344 -1.898438 1.761719 -1.539063 1.988281 -1.289063 C 2.207031 -1.039063 2.484375 -0.914063 2.816406 -0.914063 C 3.140625 -0.914063 3.414063 -1.039063 3.636719 -1.289063 C 3.859375 -1.539063 3.972656 -1.902344 3.972656 -2.382813 C 3.972656 -2.84375 3.859375 -3.199219 3.636719 -3.453125 C 3.414063 -3.699219 3.140625 -3.824219 2.816406 -3.828125 C 2.484375 -3.824219 2.207031 -3.699219 1.988281 -3.453125 C 1.761719 -3.199219 1.652344 -2.839844 1.652344 -2.371094 Z M 1.652344 -2.371094 "
-           id="path8705" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph31-3">
-        <path
-           style="stroke:none;"
-           d="M 0.5625 -4.746094 L 1.71875 -4.746094 L 1.71875 -4.097656 C 2.128906 -4.597656 2.625 -4.851563 3.199219 -4.851563 C 3.503906 -4.851563 3.765625 -4.789063 3.988281 -4.664063 C 4.210938 -4.539063 4.394531 -4.347656 4.539063 -4.097656 C 4.746094 -4.347656 4.96875 -4.539063 5.210938 -4.664063 C 5.449219 -4.789063 5.710938 -4.851563 5.988281 -4.851563 C 6.335938 -4.851563 6.628906 -4.78125 6.871094 -4.640625 C 7.109375 -4.5 7.289063 -4.289063 7.414063 -4.015625 C 7.496094 -3.808594 7.539063 -3.484375 7.542969 -3.035156 L 7.542969 0 L 6.285156 0 L 6.285156 -2.710938 C 6.28125 -3.179688 6.238281 -3.484375 6.15625 -3.625 C 6.039063 -3.800781 5.859375 -3.890625 5.621094 -3.890625 C 5.441406 -3.890625 5.277344 -3.835938 5.121094 -3.730469 C 4.964844 -3.621094 4.851563 -3.464844 4.789063 -3.257813 C 4.71875 -3.050781 4.6875 -2.722656 4.6875 -2.277344 L 4.6875 0 L 3.429688 0 L 3.429688 -2.601563 C 3.425781 -3.0625 3.40625 -3.359375 3.363281 -3.492188 C 3.316406 -3.625 3.246094 -3.722656 3.15625 -3.792969 C 3.058594 -3.855469 2.929688 -3.890625 2.773438 -3.890625 C 2.578125 -3.890625 2.40625 -3.835938 2.25 -3.734375 C 2.09375 -3.625 1.980469 -3.476563 1.917969 -3.28125 C 1.847656 -3.085938 1.816406 -2.757813 1.820313 -2.304688 L 1.820313 0 L 0.5625 0 Z M 0.5625 -4.746094 "
-           id="path8708" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph31-4">
-        <path
-           style="stroke:none;"
-           d="M 0.65625 -5.386719 L 0.65625 -6.550781 L 1.914063 -6.550781 L 1.914063 -5.386719 Z M 0.65625 0 L 0.65625 -4.746094 L 1.914063 -4.746094 L 1.914063 0 Z M 0.65625 0 "
-           id="path8711" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph31-5">
-        <path
-           style="stroke:none;"
-           d="M 4.792969 -3.34375 L 3.554688 -3.117188 C 3.511719 -3.363281 3.417969 -3.546875 3.269531 -3.675781 C 3.121094 -3.796875 2.929688 -3.859375 2.699219 -3.863281 C 2.382813 -3.859375 2.132813 -3.753906 1.949219 -3.539063 C 1.761719 -3.324219 1.667969 -2.960938 1.671875 -2.457031 C 1.667969 -1.890625 1.761719 -1.492188 1.953125 -1.261719 C 2.140625 -1.027344 2.394531 -0.914063 2.714844 -0.914063 C 2.949219 -0.914063 3.144531 -0.980469 3.300781 -1.117188 C 3.449219 -1.253906 3.558594 -1.488281 3.625 -1.820313 L 4.855469 -1.609375 C 4.726563 -1.039063 4.480469 -0.609375 4.117188 -0.324219 C 3.753906 -0.0351563 3.269531 0.105469 2.660156 0.105469 C 1.964844 0.105469 1.410156 -0.113281 0.996094 -0.550781 C 0.582031 -0.988281 0.375 -1.59375 0.378906 -2.367188 C 0.375 -3.148438 0.582031 -3.757813 1 -4.195313 C 1.410156 -4.632813 1.96875 -4.851563 2.679688 -4.851563 C 3.253906 -4.851563 3.714844 -4.726563 4.058594 -4.476563 C 4.398438 -4.226563 4.644531 -3.847656 4.792969 -3.34375 Z M 4.792969 -3.34375 "
-           id="path8714" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph31-6">
-        <path
-           style="stroke:none;"
-           d="M 0.679688 0 L 0.679688 -6.550781 L 1.964844 -6.550781 L 4.648438 -2.175781 L 4.648438 -6.550781 L 5.875 -6.550781 L 5.875 0 L 4.546875 0 L 1.90625 -4.269531 L 1.90625 0 Z M 0.679688 0 "
-           id="path8717" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph31-7">
-        <path
-           style="stroke:none;"
-           d="M 0.398438 -3.234375 C 0.398438 -3.898438 0.496094 -4.457031 0.695313 -4.914063 C 0.84375 -5.246094 1.046875 -5.546875 1.304688 -5.8125 C 1.558594 -6.078125 1.839844 -6.273438 2.152344 -6.402344 C 2.558594 -6.570313 3.03125 -6.65625 3.566406 -6.660156 C 4.527344 -6.65625 5.296875 -6.355469 5.878906 -5.761719 C 6.457031 -5.160156 6.75 -4.328125 6.75 -3.265625 C 6.75 -2.203125 6.460938 -1.375 5.886719 -0.78125 C 5.3125 -0.183594 4.542969 0.109375 3.582031 0.113281 C 2.605469 0.109375 1.832031 -0.183594 1.257813 -0.777344 C 0.683594 -1.371094 0.398438 -2.191406 0.398438 -3.234375 Z M 1.761719 -3.28125 C 1.757813 -2.539063 1.929688 -1.976563 2.273438 -1.59375 C 2.617188 -1.210938 3.050781 -1.019531 3.578125 -1.019531 C 4.105469 -1.019531 4.539063 -1.207031 4.875 -1.585938 C 5.210938 -1.964844 5.378906 -2.535156 5.382813 -3.296875 C 5.378906 -4.042969 5.214844 -4.601563 4.886719 -4.976563 C 4.558594 -5.34375 4.121094 -5.53125 3.578125 -5.53125 C 3.03125 -5.53125 2.59375 -5.34375 2.261719 -4.96875 C 1.925781 -4.59375 1.757813 -4.03125 1.761719 -3.28125 Z M 1.761719 -3.28125 "
-           id="path8720" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph31-8">
-        <path
-           style="stroke:none;"
-           d="M 0.621094 -4.746094 L 1.792969 -4.746094 L 1.792969 -4.046875 C 1.941406 -4.285156 2.144531 -4.480469 2.40625 -4.628906 C 2.664063 -4.777344 2.953125 -4.851563 3.269531 -4.851563 C 3.820313 -4.851563 4.285156 -4.632813 4.671875 -4.203125 C 5.054688 -3.765625 5.25 -3.164063 5.25 -2.398438 C 5.25 -1.601563 5.054688 -0.988281 4.667969 -0.550781 C 4.28125 -0.113281 3.8125 0.105469 3.261719 0.105469 C 2.996094 0.105469 2.757813 0.0546875 2.546875 -0.0507813 C 2.332031 -0.152344 2.109375 -0.332031 1.875 -0.585938 L 1.875 1.804688 L 0.621094 1.804688 Z M 1.863281 -2.453125 C 1.863281 -1.917969 1.96875 -1.523438 2.179688 -1.269531 C 2.390625 -1.015625 2.648438 -0.890625 2.953125 -0.890625 C 3.242188 -0.890625 3.484375 -1.003906 3.679688 -1.238281 C 3.875 -1.46875 3.972656 -1.855469 3.972656 -2.390625 C 3.972656 -2.886719 3.871094 -3.253906 3.671875 -3.496094 C 3.472656 -3.734375 3.226563 -3.855469 2.929688 -3.859375 C 2.625 -3.855469 2.371094 -3.738281 2.167969 -3.503906 C 1.964844 -3.265625 1.863281 -2.914063 1.863281 -2.453125 Z M 1.863281 -2.453125 "
-           id="path8723" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph32-0">
-        <path
-           style="stroke:none;"
-           d="M 0.691406 0 L 0.691406 -6.746094 L 3.558594 -6.746094 C 4.277344 -6.742188 4.800781 -6.683594 5.128906 -6.5625 C 5.453125 -6.441406 5.714844 -6.226563 5.914063 -5.917969 C 6.109375 -5.605469 6.210938 -5.25 6.210938 -4.855469 C 6.210938 -4.347656 6.058594 -3.929688 5.761719 -3.601563 C 5.460938 -3.269531 5.019531 -3.0625 4.429688 -2.976563 C 4.722656 -2.804688 4.964844 -2.613281 5.15625 -2.410156 C 5.347656 -2.203125 5.605469 -1.839844 5.933594 -1.316406 L 6.757813 0 L 5.128906 0 L 4.144531 -1.46875 C 3.789063 -1.988281 3.550781 -2.316406 3.421875 -2.457031 C 3.292969 -2.589844 3.15625 -2.683594 3.011719 -2.738281 C 2.867188 -2.785156 2.636719 -2.8125 2.328125 -2.816406 L 2.054688 -2.816406 L 2.054688 0 Z M 2.054688 -3.894531 L 3.0625 -3.894531 C 3.714844 -3.890625 4.121094 -3.917969 4.285156 -3.976563 C 4.445313 -4.027344 4.574219 -4.121094 4.667969 -4.261719 C 4.757813 -4.394531 4.804688 -4.566406 4.804688 -4.769531 C 4.804688 -4.996094 4.742188 -5.175781 4.621094 -5.316406 C 4.5 -5.453125 4.328125 -5.542969 4.109375 -5.582031 C 3.996094 -5.59375 3.667969 -5.601563 3.117188 -5.605469 L 2.054688 -5.605469 Z M 2.054688 -3.894531 "
-           id="path8726" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph32-1">
-        <path
-           style="stroke:none;"
-           d="M 1.644531 -3.398438 L 0.46875 -3.609375 C 0.601563 -4.078125 0.828125 -4.425781 1.148438 -4.65625 C 1.46875 -4.882813 1.945313 -4.996094 2.585938 -5 C 3.160156 -4.996094 3.589844 -4.929688 3.875 -4.792969 C 4.15625 -4.65625 4.355469 -4.480469 4.472656 -4.273438 C 4.582031 -4.058594 4.640625 -3.675781 4.644531 -3.117188 L 4.628906 -1.605469 C 4.628906 -1.175781 4.648438 -0.859375 4.691406 -0.65625 C 4.730469 -0.453125 4.808594 -0.234375 4.925781 0 L 3.644531 0 C 3.609375 -0.0859375 3.566406 -0.210938 3.519531 -0.382813 C 3.496094 -0.453125 3.480469 -0.503906 3.476563 -0.535156 C 3.25 -0.316406 3.015625 -0.15625 2.765625 -0.0507813 C 2.515625 0.0546875 2.246094 0.105469 1.960938 0.109375 C 1.457031 0.105469 1.058594 -0.0273438 0.769531 -0.296875 C 0.476563 -0.566406 0.332031 -0.914063 0.335938 -1.335938 C 0.332031 -1.609375 0.398438 -1.855469 0.53125 -2.074219 C 0.660156 -2.289063 0.84375 -2.453125 1.085938 -2.570313 C 1.320313 -2.683594 1.664063 -2.785156 2.117188 -2.871094 C 2.71875 -2.984375 3.140625 -3.089844 3.375 -3.191406 L 3.375 -3.320313 C 3.375 -3.566406 3.3125 -3.742188 3.1875 -3.847656 C 3.0625 -3.953125 2.832031 -4.007813 2.496094 -4.007813 C 2.261719 -4.007813 2.078125 -3.960938 1.953125 -3.871094 C 1.820313 -3.78125 1.71875 -3.625 1.644531 -3.398438 Z M 3.375 -2.347656 C 3.207031 -2.289063 2.941406 -2.222656 2.585938 -2.148438 C 2.222656 -2.070313 1.988281 -1.996094 1.882813 -1.925781 C 1.710938 -1.804688 1.628906 -1.652344 1.628906 -1.46875 C 1.628906 -1.285156 1.695313 -1.128906 1.832031 -0.996094 C 1.964844 -0.863281 2.136719 -0.796875 2.347656 -0.800781 C 2.578125 -0.796875 2.800781 -0.875 3.015625 -1.03125 C 3.167969 -1.148438 3.269531 -1.289063 3.324219 -1.460938 C 3.355469 -1.566406 3.375 -1.777344 3.375 -2.089844 Z M 3.375 -2.347656 "
-           id="path8729" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph32-2">
-        <path
-           style="stroke:none;"
-           d="M 5.125 0 L 3.828125 0 L 3.828125 -2.496094 C 3.824219 -3.019531 3.796875 -3.359375 3.746094 -3.515625 C 3.6875 -3.671875 3.597656 -3.792969 3.476563 -3.878906 C 3.351563 -3.964844 3.203125 -4.007813 3.027344 -4.007813 C 2.800781 -4.007813 2.601563 -3.945313 2.425781 -3.824219 C 2.246094 -3.699219 2.121094 -3.535156 2.058594 -3.335938 C 1.988281 -3.132813 1.957031 -2.761719 1.960938 -2.214844 L 1.960938 0 L 0.667969 0 L 0.667969 -4.886719 L 1.867188 -4.886719 L 1.867188 -4.171875 C 2.292969 -4.722656 2.832031 -4.996094 3.480469 -5 C 3.761719 -4.996094 4.019531 -4.945313 4.261719 -4.84375 C 4.496094 -4.738281 4.675781 -4.605469 4.796875 -4.449219 C 4.917969 -4.285156 5.003906 -4.105469 5.050781 -3.90625 C 5.097656 -3.703125 5.121094 -3.414063 5.125 -3.039063 Z M 5.125 0 "
-           id="path8732" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph32-3">
-        <path
-           style="stroke:none;"
-           d="M 5.160156 0 L 3.957031 0 L 3.957031 -0.71875 C 3.753906 -0.4375 3.519531 -0.230469 3.25 -0.09375 C 2.976563 0.0390625 2.703125 0.105469 2.429688 0.109375 C 1.867188 0.105469 1.386719 -0.117188 0.988281 -0.566406 C 0.585938 -1.015625 0.386719 -1.648438 0.386719 -2.460938 C 0.386719 -3.289063 0.578125 -3.917969 0.96875 -4.351563 C 1.355469 -4.78125 1.851563 -4.996094 2.449219 -5 C 2.996094 -4.996094 3.46875 -4.769531 3.867188 -4.316406 L 3.867188 -6.746094 L 5.160156 -6.746094 Z M 1.707031 -2.550781 C 1.703125 -2.023438 1.777344 -1.648438 1.925781 -1.417969 C 2.132813 -1.078125 2.421875 -0.90625 2.796875 -0.910156 C 3.09375 -0.90625 3.347656 -1.035156 3.558594 -1.289063 C 3.761719 -1.542969 3.867188 -1.921875 3.871094 -2.425781 C 3.867188 -2.988281 3.765625 -3.390625 3.566406 -3.636719 C 3.359375 -3.882813 3.101563 -4.007813 2.789063 -4.007813 C 2.480469 -4.007813 2.222656 -3.882813 2.015625 -3.640625 C 1.808594 -3.394531 1.703125 -3.03125 1.707031 -2.550781 Z M 1.707031 -2.550781 "
-           id="path8735" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph32-4">
-        <path
-           style="stroke:none;"
-           d="M 5.003906 -2.480469 L 6.324219 -2.0625 C 6.117188 -1.324219 5.78125 -0.777344 5.3125 -0.421875 C 4.839844 -0.0664063 4.242188 0.109375 3.519531 0.113281 C 2.621094 0.109375 1.882813 -0.191406 1.308594 -0.804688 C 0.730469 -1.414063 0.445313 -2.25 0.445313 -3.3125 C 0.445313 -4.433594 0.734375 -5.308594 1.316406 -5.929688 C 1.894531 -6.550781 2.65625 -6.859375 3.605469 -6.863281 C 4.425781 -6.859375 5.097656 -6.617188 5.617188 -6.132813 C 5.917969 -5.839844 6.148438 -5.425781 6.304688 -4.886719 L 4.957031 -4.566406 C 4.875 -4.910156 4.707031 -5.1875 4.457031 -5.394531 C 4.203125 -5.59375 3.894531 -5.695313 3.535156 -5.699219 C 3.03125 -5.695313 2.625 -5.515625 2.316406 -5.160156 C 2.003906 -4.796875 1.851563 -4.214844 1.851563 -3.414063 C 1.851563 -2.558594 2.003906 -1.953125 2.308594 -1.59375 C 2.613281 -1.230469 3.011719 -1.046875 3.507813 -1.050781 C 3.867188 -1.046875 4.179688 -1.164063 4.441406 -1.394531 C 4.699219 -1.621094 4.886719 -1.980469 5.003906 -2.480469 Z M 5.003906 -2.480469 "
-           id="path8738" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph32-5">
-        <path
-           style="stroke:none;"
-           d="M 3.894531 0 L 3.894531 -0.730469 C 3.714844 -0.46875 3.480469 -0.261719 3.191406 -0.113281 C 2.898438 0.0351563 2.59375 0.105469 2.273438 0.109375 C 1.945313 0.105469 1.648438 0.0351563 1.390625 -0.105469 C 1.125 -0.246094 0.9375 -0.445313 0.824219 -0.710938 C 0.703125 -0.96875 0.644531 -1.332031 0.648438 -1.796875 L 0.648438 -4.886719 L 1.941406 -4.886719 L 1.941406 -2.640625 C 1.9375 -1.949219 1.960938 -1.527344 2.011719 -1.375 C 2.058594 -1.21875 2.148438 -1.097656 2.273438 -1.007813 C 2.398438 -0.917969 2.558594 -0.871094 2.753906 -0.875 C 2.972656 -0.871094 3.167969 -0.933594 3.347656 -1.054688 C 3.519531 -1.175781 3.640625 -1.324219 3.703125 -1.507813 C 3.765625 -1.683594 3.796875 -2.125 3.800781 -2.828125 L 3.800781 -4.886719 L 5.09375 -4.886719 L 5.09375 0 Z M 3.894531 0 "
-           id="path8741" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph33-0">
-        <path
-           style="stroke:none;"
-           d="M 6.644531 0 L 5.191406 0 L 4.613281 -1.503906 L 1.964844 -1.503906 L 1.417969 0 L 0 0 L 2.578125 -6.621094 L 3.992188 -6.621094 Z M 4.183594 -2.621094 L 3.269531 -5.078125 L 2.375 -2.621094 Z M 4.183594 -2.621094 "
-           id="path8744" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph33-1">
-        <path
-           style="stroke:none;"
-           d="M 2.863281 -4.796875 L 2.863281 -3.785156 L 1.996094 -3.785156 L 1.996094 -1.851563 C 1.992188 -1.460938 2 -1.230469 2.019531 -1.167969 C 2.035156 -1.097656 2.074219 -1.046875 2.132813 -1.007813 C 2.191406 -0.964844 2.261719 -0.941406 2.347656 -0.945313 C 2.460938 -0.941406 2.632813 -0.984375 2.859375 -1.066406 L 2.96875 -0.0820313 C 2.667969 0.0429688 2.332031 0.105469 1.964844 0.109375 C 1.730469 0.105469 1.523438 0.0703125 1.34375 -0.00390625 C 1.15625 -0.078125 1.023438 -0.179688 0.941406 -0.304688 C 0.851563 -0.425781 0.792969 -0.589844 0.761719 -0.800781 C 0.734375 -0.945313 0.71875 -1.242188 0.722656 -1.695313 L 0.722656 -3.785156 L 0.140625 -3.785156 L 0.140625 -4.796875 L 0.722656 -4.796875 L 0.722656 -5.75 L 1.996094 -6.492188 L 1.996094 -4.796875 Z M 2.863281 -4.796875 "
-           id="path8747" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph33-2">
-        <path
-           style="stroke:none;"
-           d="M 0.371094 -2.464844 C 0.367188 -2.882813 0.472656 -3.289063 0.679688 -3.6875 C 0.886719 -4.078125 1.179688 -4.382813 1.5625 -4.59375 C 1.941406 -4.800781 2.367188 -4.902344 2.839844 -4.90625 C 3.5625 -4.902344 4.15625 -4.667969 4.625 -4.199219 C 5.085938 -3.726563 5.320313 -3.128906 5.320313 -2.410156 C 5.320313 -1.683594 5.085938 -1.082031 4.617188 -0.605469 C 4.148438 -0.128906 3.558594 0.105469 2.851563 0.109375 C 2.410156 0.105469 1.988281 0.0078125 1.589844 -0.1875 C 1.1875 -0.386719 0.886719 -0.675781 0.679688 -1.0625 C 0.472656 -1.445313 0.367188 -1.914063 0.371094 -2.464844 Z M 1.671875 -2.398438 C 1.667969 -1.917969 1.78125 -1.554688 2.007813 -1.304688 C 2.230469 -1.050781 2.507813 -0.925781 2.84375 -0.925781 C 3.171875 -0.925781 3.453125 -1.050781 3.679688 -1.304688 C 3.902344 -1.554688 4.011719 -1.921875 4.015625 -2.40625 C 4.011719 -2.875 3.902344 -3.238281 3.679688 -3.492188 C 3.453125 -3.742188 3.171875 -3.867188 2.84375 -3.871094 C 2.507813 -3.867188 2.230469 -3.742188 2.007813 -3.492188 C 1.78125 -3.238281 1.667969 -2.871094 1.671875 -2.398438 Z M 1.671875 -2.398438 "
-           id="path8750" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph33-3">
-        <path
-           style="stroke:none;"
-           d="M 0.570313 -4.796875 L 1.738281 -4.796875 L 1.738281 -4.140625 C 2.15625 -4.648438 2.65625 -4.902344 3.234375 -4.90625 C 3.539063 -4.902344 3.804688 -4.839844 4.03125 -4.714844 C 4.257813 -4.585938 4.441406 -4.394531 4.589844 -4.140625 C 4.796875 -4.394531 5.023438 -4.585938 5.269531 -4.714844 C 5.511719 -4.839844 5.773438 -4.902344 6.050781 -4.90625 C 6.402344 -4.902344 6.699219 -4.832031 6.945313 -4.691406 C 7.1875 -4.546875 7.371094 -4.335938 7.492188 -4.0625 C 7.578125 -3.851563 7.621094 -3.519531 7.625 -3.066406 L 7.625 0 L 6.355469 0 L 6.355469 -2.742188 C 6.351563 -3.21875 6.308594 -3.523438 6.222656 -3.664063 C 6.105469 -3.839844 5.925781 -3.929688 5.683594 -3.933594 C 5.503906 -3.929688 5.335938 -3.875 5.179688 -3.769531 C 5.023438 -3.660156 4.910156 -3.503906 4.839844 -3.292969 C 4.769531 -3.082031 4.734375 -2.75 4.738281 -2.304688 L 4.738281 0 L 3.46875 0 L 3.46875 -2.628906 C 3.46875 -3.09375 3.445313 -3.394531 3.398438 -3.53125 C 3.351563 -3.664063 3.28125 -3.765625 3.191406 -3.832031 C 3.09375 -3.898438 2.964844 -3.929688 2.804688 -3.933594 C 2.605469 -3.929688 2.429688 -3.878906 2.277344 -3.773438 C 2.117188 -3.667969 2.003906 -3.515625 1.941406 -3.320313 C 1.871094 -3.121094 1.839844 -2.792969 1.839844 -2.332031 L 1.839844 0 L 0.570313 0 Z M 0.570313 -4.796875 "
-           id="path8753" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph33-4">
-        <path
-           style="stroke:none;"
-           d="M 0.664063 -5.449219 L 0.664063 -6.621094 L 1.933594 -6.621094 L 1.933594 -5.449219 Z M 0.664063 0 L 0.664063 -4.796875 L 1.933594 -4.796875 L 1.933594 0 Z M 0.664063 0 "
-           id="path8756" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph33-5">
-        <path
-           style="stroke:none;"
-           d="M 4.847656 -3.378906 L 3.59375 -3.152344 C 3.546875 -3.402344 3.453125 -3.589844 3.308594 -3.714844 C 3.15625 -3.839844 2.960938 -3.902344 2.726563 -3.90625 C 2.40625 -3.902344 2.15625 -3.792969 1.96875 -3.578125 C 1.78125 -3.359375 1.6875 -2.996094 1.6875 -2.484375 C 1.6875 -1.914063 1.78125 -1.511719 1.972656 -1.277344 C 2.164063 -1.042969 2.421875 -0.925781 2.746094 -0.925781 C 2.984375 -0.925781 3.183594 -0.992188 3.339844 -1.128906 C 3.492188 -1.265625 3.597656 -1.503906 3.664063 -1.839844 L 4.910156 -1.625 C 4.777344 -1.050781 4.527344 -0.621094 4.164063 -0.328125 C 3.792969 -0.0390625 3.300781 0.105469 2.6875 0.109375 C 1.984375 0.105469 1.425781 -0.113281 1.007813 -0.554688 C 0.589844 -0.996094 0.378906 -1.609375 0.382813 -2.394531 C 0.378906 -3.183594 0.589844 -3.800781 1.011719 -4.242188 C 1.429688 -4.683594 1.996094 -4.902344 2.710938 -4.90625 C 3.292969 -4.902344 3.757813 -4.777344 4.101563 -4.527344 C 4.445313 -4.273438 4.691406 -3.890625 4.847656 -3.378906 Z M 4.847656 -3.378906 "
-           id="path8759" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph33-6">
-        <path
-           style="stroke:none;"
-           d="M 4.910156 -2.433594 L 6.207031 -2.023438 C 6.003906 -1.296875 5.671875 -0.761719 5.210938 -0.414063 C 4.746094 -0.0625 4.164063 0.109375 3.457031 0.113281 C 2.574219 0.109375 1.847656 -0.1875 1.285156 -0.785156 C 0.714844 -1.386719 0.433594 -2.207031 0.4375 -3.253906 C 0.433594 -4.351563 0.71875 -5.207031 1.289063 -5.820313 C 1.859375 -6.425781 2.605469 -6.730469 3.535156 -6.734375 C 4.339844 -6.730469 5 -6.492188 5.511719 -6.015625 C 5.808594 -5.730469 6.035156 -5.324219 6.1875 -4.796875 L 4.863281 -4.480469 C 4.785156 -4.820313 4.621094 -5.089844 4.375 -5.292969 C 4.125 -5.488281 3.824219 -5.589844 3.46875 -5.59375 C 2.976563 -5.589844 2.578125 -5.414063 2.273438 -5.0625 C 1.96875 -4.710938 1.816406 -4.140625 1.816406 -3.351563 C 1.816406 -2.515625 1.964844 -1.917969 2.265625 -1.5625 C 2.566406 -1.207031 2.957031 -1.03125 3.441406 -1.03125 C 3.792969 -1.03125 4.101563 -1.140625 4.359375 -1.367188 C 4.613281 -1.589844 4.796875 -1.945313 4.910156 -2.433594 Z M 4.910156 -2.433594 "
-           id="path8762" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph33-7">
-        <path
-           style="stroke:none;"
-           d="M 3.820313 0 L 3.820313 -0.71875 C 3.644531 -0.460938 3.414063 -0.257813 3.132813 -0.113281 C 2.847656 0.0351563 2.546875 0.105469 2.230469 0.109375 C 1.90625 0.105469 1.617188 0.0390625 1.363281 -0.101563 C 1.105469 -0.242188 0.921875 -0.441406 0.808594 -0.699219 C 0.691406 -0.953125 0.632813 -1.308594 0.636719 -1.761719 L 0.636719 -4.796875 L 1.90625 -4.796875 L 1.90625 -2.59375 C 1.902344 -1.917969 1.925781 -1.503906 1.976563 -1.351563 C 2.019531 -1.199219 2.105469 -1.078125 2.230469 -0.992188 C 2.351563 -0.898438 2.507813 -0.855469 2.699219 -0.859375 C 2.914063 -0.855469 3.109375 -0.914063 3.285156 -1.035156 C 3.453125 -1.152344 3.570313 -1.300781 3.636719 -1.476563 C 3.695313 -1.652344 3.726563 -2.082031 3.730469 -2.773438 L 3.730469 -4.796875 L 5 -4.796875 L 5 0 Z M 3.820313 0 "
-           id="path8765" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph33-8">
-        <path
-           style="stroke:none;"
-           d="M 5.0625 0 L 3.882813 0 L 3.882813 -0.703125 C 3.683594 -0.429688 3.453125 -0.222656 3.191406 -0.0898438 C 2.921875 0.0429688 2.652344 0.105469 2.386719 0.109375 C 1.832031 0.105469 1.359375 -0.113281 0.96875 -0.554688 C 0.570313 -0.996094 0.375 -1.617188 0.378906 -2.417969 C 0.375 -3.226563 0.566406 -3.84375 0.953125 -4.269531 C 1.332031 -4.691406 1.816406 -4.902344 2.402344 -4.90625 C 2.9375 -4.902344 3.398438 -4.679688 3.792969 -4.238281 L 3.792969 -6.621094 L 5.0625 -6.621094 Z M 1.675781 -2.503906 C 1.675781 -1.988281 1.746094 -1.617188 1.886719 -1.390625 C 2.089844 -1.054688 2.375 -0.890625 2.746094 -0.894531 C 3.035156 -0.890625 3.28125 -1.015625 3.488281 -1.265625 C 3.691406 -1.515625 3.796875 -1.886719 3.796875 -2.378906 C 3.796875 -2.929688 3.695313 -3.324219 3.5 -3.570313 C 3.296875 -3.808594 3.042969 -3.929688 2.738281 -3.933594 C 2.433594 -3.929688 2.183594 -3.8125 1.980469 -3.574219 C 1.777344 -3.332031 1.675781 -2.972656 1.675781 -2.503906 Z M 1.675781 -2.503906 "
-           id="path8768" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph33-9">
-        <path
-           style="stroke:none;"
-           d="M 1.613281 -3.332031 L 0.460938 -3.542969 C 0.589844 -4.003906 0.8125 -4.347656 1.128906 -4.570313 C 1.441406 -4.792969 1.910156 -4.902344 2.539063 -4.90625 C 3.101563 -4.902344 3.523438 -4.835938 3.800781 -4.703125 C 4.078125 -4.566406 4.273438 -4.394531 4.386719 -4.191406 C 4.5 -3.980469 4.558594 -3.605469 4.558594 -3.058594 L 4.542969 -1.578125 C 4.539063 -1.152344 4.558594 -0.84375 4.601563 -0.644531 C 4.640625 -0.441406 4.71875 -0.226563 4.832031 0 L 3.578125 0 C 3.542969 -0.0820313 3.503906 -0.207031 3.457031 -0.375 C 3.433594 -0.449219 3.417969 -0.496094 3.410156 -0.523438 C 3.191406 -0.308594 2.957031 -0.152344 2.714844 -0.046875 C 2.464844 0.0546875 2.203125 0.105469 1.925781 0.109375 C 1.429688 0.105469 1.039063 -0.0273438 0.753906 -0.292969 C 0.46875 -0.558594 0.328125 -0.894531 0.328125 -1.308594 C 0.328125 -1.574219 0.390625 -1.816406 0.523438 -2.03125 C 0.648438 -2.242188 0.832031 -2.40625 1.066406 -2.519531 C 1.300781 -2.632813 1.636719 -2.730469 2.078125 -2.820313 C 2.667969 -2.925781 3.078125 -3.03125 3.3125 -3.128906 L 3.3125 -3.257813 C 3.308594 -3.5 3.25 -3.671875 3.128906 -3.777344 C 3.007813 -3.878906 2.78125 -3.929688 2.449219 -3.933594 C 2.222656 -3.929688 2.042969 -3.886719 1.917969 -3.800781 C 1.789063 -3.710938 1.6875 -3.554688 1.613281 -3.332031 Z M 3.3125 -2.304688 C 3.144531 -2.246094 2.886719 -2.183594 2.539063 -2.109375 C 2.183594 -2.03125 1.953125 -1.957031 1.847656 -1.886719 C 1.675781 -1.769531 1.59375 -1.621094 1.597656 -1.441406 C 1.59375 -1.261719 1.660156 -1.105469 1.796875 -0.980469 C 1.925781 -0.847656 2.09375 -0.785156 2.304688 -0.785156 C 2.53125 -0.785156 2.75 -0.859375 2.957031 -1.011719 C 3.109375 -1.125 3.210938 -1.265625 3.261719 -1.433594 C 3.292969 -1.539063 3.308594 -1.746094 3.3125 -2.050781 Z M 3.3125 -2.304688 "
-           id="path8771" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph33-10">
-        <path
-           style="stroke:none;"
-           d="M 0.675781 -6.621094 L 3.324219 -6.621094 C 3.84375 -6.621094 4.234375 -6.597656 4.496094 -6.554688 C 4.75 -6.511719 4.980469 -6.421875 5.1875 -6.28125 C 5.386719 -6.140625 5.558594 -5.953125 5.695313 -5.726563 C 5.828125 -5.492188 5.894531 -5.238281 5.898438 -4.957031 C 5.894531 -4.644531 5.8125 -4.359375 5.648438 -4.101563 C 5.480469 -3.84375 5.253906 -3.648438 4.96875 -3.519531 C 5.371094 -3.398438 5.679688 -3.199219 5.898438 -2.917969 C 6.113281 -2.632813 6.222656 -2.296875 6.222656 -1.917969 C 6.222656 -1.613281 6.152344 -1.320313 6.011719 -1.039063 C 5.871094 -0.75 5.679688 -0.523438 5.441406 -0.359375 C 5.195313 -0.1875 4.894531 -0.0859375 4.542969 -0.046875 C 4.316406 -0.0195313 3.78125 -0.00390625 2.929688 0 L 0.675781 0 Z M 2.015625 -5.519531 L 2.015625 -3.988281 L 2.890625 -3.988281 C 3.40625 -3.984375 3.730469 -3.992188 3.863281 -4.011719 C 4.089844 -4.035156 4.269531 -4.113281 4.402344 -4.246094 C 4.527344 -4.375 4.59375 -4.546875 4.597656 -4.765625 C 4.59375 -4.96875 4.535156 -5.136719 4.425781 -5.265625 C 4.308594 -5.390625 4.144531 -5.46875 3.925781 -5.496094 C 3.792969 -5.511719 3.410156 -5.519531 2.78125 -5.519531 Z M 2.015625 -2.886719 L 2.015625 -1.117188 L 3.253906 -1.117188 C 3.730469 -1.113281 4.035156 -1.125 4.167969 -1.15625 C 4.363281 -1.1875 4.527344 -1.277344 4.65625 -1.417969 C 4.777344 -1.558594 4.839844 -1.746094 4.84375 -1.984375 C 4.839844 -2.179688 4.792969 -2.347656 4.699219 -2.488281 C 4.601563 -2.625 4.460938 -2.726563 4.28125 -2.789063 C 4.097656 -2.851563 3.703125 -2.882813 3.09375 -2.886719 Z M 2.015625 -2.886719 "
-           id="path8774" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph33-11">
-        <path
-           style="stroke:none;"
-           d="M 0.664063 0 L 0.664063 -6.621094 L 1.933594 -6.621094 L 1.933594 0 Z M 0.664063 0 "
-           id="path8777" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph34-0">
-        <path
-           style="stroke:none;"
-           d="M 0.6875 0 L 0.6875 -6.761719 L 5.699219 -6.761719 L 5.699219 -5.617188 L 2.050781 -5.617188 L 2.050781 -4.117188 L 5.445313 -4.117188 L 5.445313 -2.980469 L 2.050781 -2.980469 L 2.050781 -1.140625 L 5.828125 -1.140625 L 5.828125 0 Z M 0.6875 0 "
-           id="path8780" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph34-1">
-        <path
-           style="stroke:none;"
-           d="M 0.0546875 0 L 1.820313 -2.523438 L 0.128906 -4.898438 L 1.710938 -4.898438 L 2.578125 -3.550781 L 3.492188 -4.898438 L 5.011719 -4.898438 L 3.351563 -2.578125 L 5.164063 0 L 3.574219 0 L 2.578125 -1.515625 L 1.574219 0 Z M 0.0546875 0 "
-           id="path8783" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph34-2">
-        <path
-           style="stroke:none;"
-           d="M 3.511719 -1.558594 L 4.804688 -1.34375 C 4.636719 -0.867188 4.371094 -0.507813 4.015625 -0.261719 C 3.65625 -0.015625 3.210938 0.105469 2.675781 0.109375 C 1.820313 0.105469 1.191406 -0.167969 0.785156 -0.722656 C 0.460938 -1.164063 0.296875 -1.726563 0.300781 -2.410156 C 0.296875 -3.21875 0.511719 -3.855469 0.9375 -4.316406 C 1.359375 -4.773438 1.894531 -5.003906 2.546875 -5.007813 C 3.269531 -5.003906 3.84375 -4.761719 4.269531 -4.285156 C 4.6875 -3.800781 4.890625 -3.066406 4.875 -2.074219 L 1.628906 -2.074219 C 1.636719 -1.691406 1.738281 -1.390625 1.941406 -1.179688 C 2.136719 -0.960938 2.386719 -0.855469 2.6875 -0.859375 C 2.886719 -0.855469 3.058594 -0.910156 3.199219 -1.023438 C 3.335938 -1.132813 3.4375 -1.3125 3.511719 -1.558594 Z M 3.585938 -2.867188 C 3.578125 -3.238281 3.480469 -3.523438 3.296875 -3.722656 C 3.109375 -3.914063 2.886719 -4.011719 2.625 -4.015625 C 2.339844 -4.011719 2.105469 -3.910156 1.921875 -3.707031 C 1.734375 -3.5 1.644531 -3.21875 1.652344 -2.867188 Z M 3.585938 -2.867188 "
-           id="path8786" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph34-3">
-        <path
-           style="stroke:none;"
-           d="M 4.949219 -3.449219 L 3.671875 -3.21875 C 3.625 -3.46875 3.527344 -3.660156 3.375 -3.792969 C 3.222656 -3.917969 3.023438 -3.984375 2.785156 -3.988281 C 2.457031 -3.984375 2.199219 -3.871094 2.011719 -3.652344 C 1.816406 -3.425781 1.722656 -3.054688 1.722656 -2.535156 C 1.722656 -1.953125 1.820313 -1.542969 2.015625 -1.304688 C 2.210938 -1.0625 2.472656 -0.941406 2.804688 -0.945313 C 3.046875 -0.941406 3.246094 -1.011719 3.40625 -1.152344 C 3.5625 -1.289063 3.675781 -1.53125 3.738281 -1.875 L 5.011719 -1.660156 C 4.875 -1.074219 4.621094 -0.632813 4.25 -0.335938 C 3.871094 -0.0390625 3.371094 0.105469 2.742188 0.109375 C 2.023438 0.105469 1.449219 -0.117188 1.027344 -0.566406 C 0.597656 -1.015625 0.386719 -1.640625 0.390625 -2.445313 C 0.386719 -3.25 0.601563 -3.878906 1.03125 -4.332031 C 1.457031 -4.777344 2.035156 -5.003906 2.765625 -5.007813 C 3.359375 -5.003906 3.832031 -4.875 4.1875 -4.621094 C 4.539063 -4.363281 4.792969 -3.972656 4.949219 -3.449219 Z M 4.949219 -3.449219 "
-           id="path8789" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph34-4">
-        <path
-           style="stroke:none;"
-           d="M 5.011719 -2.484375 L 6.335938 -2.066406 C 6.128906 -1.328125 5.792969 -0.78125 5.324219 -0.421875 C 4.851563 -0.0625 4.253906 0.117188 3.527344 0.117188 C 2.628906 0.117188 1.890625 -0.1875 1.3125 -0.800781 C 0.734375 -1.414063 0.445313 -2.253906 0.449219 -3.320313 C 0.445313 -4.441406 0.734375 -5.316406 1.320313 -5.941406 C 1.898438 -6.5625 2.664063 -6.871094 3.609375 -6.875 C 4.4375 -6.871094 5.109375 -6.628906 5.625 -6.140625 C 5.929688 -5.851563 6.160156 -5.4375 6.316406 -4.898438 L 4.964844 -4.574219 C 4.882813 -4.921875 4.71875 -5.199219 4.464844 -5.402344 C 4.210938 -5.605469 3.902344 -5.707031 3.542969 -5.707031 C 3.039063 -5.707031 2.632813 -5.527344 2.320313 -5.167969 C 2.007813 -4.808594 1.851563 -4.226563 1.855469 -3.421875 C 1.851563 -2.566406 2.003906 -1.957031 2.316406 -1.59375 C 2.621094 -1.230469 3.019531 -1.046875 3.511719 -1.050781 C 3.871094 -1.046875 4.183594 -1.164063 4.449219 -1.394531 C 4.707031 -1.625 4.894531 -1.988281 5.011719 -2.484375 Z M 5.011719 -2.484375 "
-           id="path8792" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph34-5">
-        <path
-           style="stroke:none;"
-           d="M 0.640625 -4.898438 L 1.847656 -4.898438 L 1.847656 -4.175781 C 2.003906 -4.421875 2.21875 -4.621094 2.484375 -4.777344 C 2.75 -4.925781 3.046875 -5.003906 3.375 -5.007813 C 3.941406 -5.003906 4.421875 -4.78125 4.820313 -4.339844 C 5.214844 -3.890625 5.414063 -3.269531 5.417969 -2.476563 C 5.414063 -1.65625 5.214844 -1.019531 4.816406 -0.570313 C 4.417969 -0.117188 3.933594 0.105469 3.367188 0.109375 C 3.09375 0.105469 2.847656 0.0546875 2.628906 -0.0507813 C 2.40625 -0.15625 2.175781 -0.339844 1.9375 -0.605469 L 1.9375 1.863281 L 0.640625 1.863281 Z M 1.921875 -2.53125 C 1.921875 -1.980469 2.03125 -1.574219 2.25 -1.3125 C 2.46875 -1.046875 2.734375 -0.914063 3.046875 -0.917969 C 3.347656 -0.914063 3.597656 -1.035156 3.796875 -1.277344 C 3.996094 -1.519531 4.09375 -1.914063 4.097656 -2.46875 C 4.09375 -2.976563 3.992188 -3.359375 3.789063 -3.609375 C 3.582031 -3.859375 3.328125 -3.984375 3.023438 -3.984375 C 2.707031 -3.984375 2.441406 -3.859375 2.234375 -3.617188 C 2.023438 -3.367188 1.921875 -3.007813 1.921875 -2.53125 Z M 1.921875 -2.53125 "
-           id="path8795" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph35-0">
-        <path
-           style="stroke:none;"
-           d="M 3.902344 0 L 3.902344 -0.734375 C 3.722656 -0.46875 3.484375 -0.261719 3.195313 -0.113281 C 2.902344 0.0351563 2.597656 0.105469 2.277344 0.109375 C 1.945313 0.105469 1.648438 0.0351563 1.390625 -0.105469 C 1.125 -0.25 0.9375 -0.453125 0.824219 -0.714844 C 0.703125 -0.972656 0.644531 -1.332031 0.648438 -1.796875 L 0.648438 -4.898438 L 1.945313 -4.898438 L 1.945313 -2.648438 C 1.945313 -1.957031 1.96875 -1.53125 2.015625 -1.378906 C 2.0625 -1.222656 2.148438 -1.101563 2.277344 -1.011719 C 2.402344 -0.917969 2.5625 -0.871094 2.757813 -0.875 C 2.976563 -0.871094 3.175781 -0.933594 3.351563 -1.058594 C 3.527344 -1.179688 3.644531 -1.332031 3.710938 -1.511719 C 3.773438 -1.691406 3.808594 -2.128906 3.808594 -2.832031 L 3.808594 -4.898438 L 5.105469 -4.898438 L 5.105469 0 Z M 3.902344 0 "
-           id="path8798" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph35-1">
-        <path
-           style="stroke:none;"
-           d="M 0.410156 -3.339844 C 0.410156 -4.027344 0.511719 -4.605469 0.71875 -5.074219 C 0.871094 -5.417969 1.082031 -5.726563 1.347656 -6 C 1.613281 -6.273438 1.902344 -6.476563 2.222656 -6.609375 C 2.640625 -6.785156 3.128906 -6.871094 3.679688 -6.875 C 4.675781 -6.871094 5.472656 -6.5625 6.070313 -5.949219 C 6.667969 -5.328125 6.964844 -4.46875 6.96875 -3.371094 C 6.964844 -2.277344 6.667969 -1.425781 6.078125 -0.808594 C 5.480469 -0.191406 4.6875 0.117188 3.699219 0.117188 C 2.691406 0.117188 1.894531 -0.1875 1.300781 -0.804688 C 0.707031 -1.414063 0.410156 -2.261719 0.410156 -3.339844 Z M 1.816406 -3.386719 C 1.816406 -2.621094 1.992188 -2.039063 2.347656 -1.644531 C 2.699219 -1.246094 3.148438 -1.046875 3.695313 -1.050781 C 4.238281 -1.046875 4.683594 -1.242188 5.035156 -1.636719 C 5.378906 -2.027344 5.554688 -2.617188 5.558594 -3.402344 C 5.554688 -4.175781 5.382813 -4.753906 5.046875 -5.136719 C 4.703125 -5.515625 4.253906 -5.707031 3.695313 -5.710938 C 3.128906 -5.707031 2.675781 -5.511719 2.332031 -5.128906 C 1.988281 -4.738281 1.816406 -4.160156 1.816406 -3.386719 Z M 1.816406 -3.386719 "
-           id="path8801" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph35-2">
-        <path
-           style="stroke:none;"
-           d="M 0.582031 -4.898438 L 1.777344 -4.898438 L 1.777344 -4.230469 C 2.199219 -4.746094 2.707031 -5.003906 3.300781 -5.007813 C 3.613281 -5.003906 3.886719 -4.9375 4.117188 -4.8125 C 4.347656 -4.679688 4.539063 -4.488281 4.6875 -4.230469 C 4.898438 -4.488281 5.128906 -4.679688 5.382813 -4.8125 C 5.628906 -4.9375 5.894531 -5.003906 6.179688 -5.007813 C 6.535156 -5.003906 6.839844 -4.929688 7.09375 -4.789063 C 7.339844 -4.640625 7.527344 -4.425781 7.652344 -4.144531 C 7.738281 -3.933594 7.78125 -3.597656 7.785156 -3.132813 L 7.785156 0 L 6.488281 0 L 6.488281 -2.800781 C 6.484375 -3.285156 6.441406 -3.597656 6.355469 -3.742188 C 6.234375 -3.921875 6.050781 -4.011719 5.800781 -4.015625 C 5.621094 -4.011719 5.449219 -3.957031 5.289063 -3.847656 C 5.128906 -3.734375 5.015625 -3.574219 4.945313 -3.363281 C 4.875 -3.148438 4.839844 -2.808594 4.839844 -2.351563 L 4.839844 0 L 3.542969 0 L 3.542969 -2.683594 C 3.539063 -3.160156 3.515625 -3.464844 3.472656 -3.605469 C 3.421875 -3.738281 3.351563 -3.84375 3.257813 -3.914063 C 3.160156 -3.980469 3.027344 -4.011719 2.863281 -4.015625 C 2.660156 -4.011719 2.480469 -3.960938 2.324219 -3.855469 C 2.164063 -3.746094 2.050781 -3.589844 1.980469 -3.390625 C 1.910156 -3.183594 1.875 -2.847656 1.878906 -2.378906 L 1.878906 0 L 0.582031 0 Z M 0.582031 -4.898438 "
-           id="path8804" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph35-3">
-        <path
-           style="stroke:none;"
-           d="M 0.640625 -4.898438 L 1.847656 -4.898438 L 1.847656 -4.179688 C 2.003906 -4.421875 2.21875 -4.621094 2.484375 -4.777344 C 2.75 -4.925781 3.046875 -5.003906 3.375 -5.007813 C 3.945313 -5.003906 4.425781 -4.78125 4.824219 -4.339844 C 5.214844 -3.890625 5.414063 -3.269531 5.417969 -2.476563 C 5.414063 -1.65625 5.214844 -1.019531 4.816406 -0.570313 C 4.417969 -0.117188 3.933594 0.105469 3.367188 0.109375 C 3.09375 0.105469 2.847656 0.0546875 2.628906 -0.0507813 C 2.40625 -0.15625 2.175781 -0.339844 1.9375 -0.605469 L 1.9375 1.863281 L 0.640625 1.863281 Z M 1.921875 -2.53125 C 1.921875 -1.980469 2.03125 -1.574219 2.25 -1.3125 C 2.46875 -1.046875 2.734375 -0.914063 3.046875 -0.917969 C 3.347656 -0.914063 3.597656 -1.035156 3.800781 -1.277344 C 4 -1.519531 4.101563 -1.914063 4.101563 -2.46875 C 4.101563 -2.976563 3.996094 -3.359375 3.789063 -3.609375 C 3.582031 -3.859375 3.328125 -3.984375 3.027344 -3.984375 C 2.707031 -3.984375 2.441406 -3.859375 2.234375 -3.617188 C 2.023438 -3.367188 1.921875 -3.007813 1.921875 -2.53125 Z M 1.921875 -2.53125 "
-           id="path8807" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph35-4">
-        <path
-           style="stroke:none;"
-           d="M 4.777344 -1.203125 L 4.777344 0 L 0.234375 0 C 0.28125 -0.453125 0.425781 -0.882813 0.675781 -1.292969 C 0.917969 -1.699219 1.40625 -2.238281 2.136719 -2.914063 C 2.71875 -3.453125 3.078125 -3.824219 3.210938 -4.023438 C 3.386719 -4.285156 3.472656 -4.550781 3.476563 -4.816406 C 3.472656 -5.105469 3.394531 -5.324219 3.242188 -5.480469 C 3.085938 -5.632813 2.875 -5.710938 2.601563 -5.714844 C 2.328125 -5.710938 2.109375 -5.628906 1.953125 -5.46875 C 1.789063 -5.304688 1.699219 -5.035156 1.679688 -4.660156 L 0.386719 -4.789063 C 0.464844 -5.5 0.703125 -6.011719 1.109375 -6.324219 C 1.511719 -6.628906 2.019531 -6.785156 2.632813 -6.789063 C 3.296875 -6.785156 3.820313 -6.605469 4.203125 -6.25 C 4.582031 -5.886719 4.773438 -5.4375 4.777344 -4.90625 C 4.773438 -4.597656 4.71875 -4.308594 4.613281 -4.035156 C 4.5 -3.757813 4.328125 -3.46875 4.09375 -3.171875 C 3.933594 -2.96875 3.652344 -2.683594 3.246094 -2.308594 C 2.835938 -1.933594 2.574219 -1.683594 2.46875 -1.5625 C 2.355469 -1.4375 2.269531 -1.316406 2.203125 -1.203125 Z M 4.777344 -1.203125 "
-           id="path8810" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph35-5">
-        <path
-           style="stroke:none;"
-           d=""
-           id="path8813" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph36-0">
-        <path
-           style="stroke:none;"
-           d="M 0.683594 -6.695313 L 3.359375 -6.695313 C 3.886719 -6.691406 4.285156 -6.667969 4.546875 -6.628906 C 4.804688 -6.582031 5.035156 -6.492188 5.246094 -6.351563 C 5.449219 -6.210938 5.621094 -6.023438 5.757813 -5.789063 C 5.894531 -5.554688 5.964844 -5.292969 5.964844 -5.007813 C 5.964844 -4.691406 5.878906 -4.402344 5.710938 -4.144531 C 5.539063 -3.878906 5.308594 -3.683594 5.023438 -3.558594 C 5.429688 -3.433594 5.746094 -3.230469 5.964844 -2.949219 C 6.183594 -2.660156 6.292969 -2.324219 6.292969 -1.941406 C 6.292969 -1.632813 6.21875 -1.335938 6.078125 -1.050781 C 5.933594 -0.761719 5.742188 -0.535156 5.5 -0.363281 C 5.253906 -0.191406 4.953125 -0.0859375 4.59375 -0.046875 C 4.367188 -0.0195313 3.824219 -0.00390625 2.964844 0 L 0.683594 0 Z M 2.035156 -5.582031 L 2.035156 -4.03125 L 2.921875 -4.03125 C 3.445313 -4.03125 3.773438 -4.039063 3.90625 -4.054688 C 4.132813 -4.082031 4.3125 -4.160156 4.449219 -4.292969 C 4.578125 -4.421875 4.644531 -4.597656 4.648438 -4.816406 C 4.644531 -5.023438 4.589844 -5.191406 4.476563 -5.320313 C 4.359375 -5.449219 4.1875 -5.527344 3.96875 -5.558594 C 3.828125 -5.570313 3.445313 -5.578125 2.8125 -5.582031 Z M 2.035156 -2.917969 L 2.035156 -1.128906 L 3.289063 -1.128906 C 3.773438 -1.125 4.082031 -1.136719 4.214844 -1.167969 C 4.410156 -1.199219 4.574219 -1.289063 4.703125 -1.433594 C 4.828125 -1.574219 4.890625 -1.765625 4.894531 -2.003906 C 4.890625 -2.203125 4.84375 -2.371094 4.746094 -2.515625 C 4.648438 -2.652344 4.507813 -2.753906 4.324219 -2.820313 C 4.140625 -2.882813 3.742188 -2.917969 3.128906 -2.917969 Z M 2.035156 -2.917969 "
-           id="path8816" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph36-1">
-        <path
-           style="stroke:none;"
-           d="M 0.671875 0 L 0.671875 -6.695313 L 1.953125 -6.695313 L 1.953125 0 Z M 0.671875 0 "
-           id="path8819" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph36-2">
-        <path
-           style="stroke:none;"
-           d="M 0.375 -2.492188 C 0.375 -2.914063 0.476563 -3.328125 0.6875 -3.730469 C 0.894531 -4.125 1.195313 -4.429688 1.582031 -4.644531 C 1.964844 -4.851563 2.394531 -4.957031 2.871094 -4.960938 C 3.605469 -4.957031 4.207031 -4.71875 4.675781 -4.246094 C 5.144531 -3.765625 5.378906 -3.164063 5.378906 -2.4375 C 5.378906 -1.703125 5.140625 -1.09375 4.667969 -0.613281 C 4.195313 -0.132813 3.601563 0.105469 2.882813 0.109375 C 2.4375 0.105469 2.011719 0.0078125 1.609375 -0.191406 C 1.203125 -0.390625 0.894531 -0.683594 0.6875 -1.074219 C 0.476563 -1.457031 0.375 -1.929688 0.375 -2.492188 Z M 1.691406 -2.425781 C 1.6875 -1.941406 1.800781 -1.570313 2.03125 -1.320313 C 2.257813 -1.0625 2.539063 -0.9375 2.875 -0.9375 C 3.207031 -0.9375 3.488281 -1.0625 3.71875 -1.320313 C 3.941406 -1.570313 4.054688 -1.941406 4.058594 -2.433594 C 4.054688 -2.902344 3.941406 -3.269531 3.71875 -3.527344 C 3.488281 -3.785156 3.207031 -3.914063 2.875 -3.914063 C 2.539063 -3.914063 2.257813 -3.785156 2.03125 -3.527344 C 1.800781 -3.269531 1.6875 -2.902344 1.691406 -2.425781 Z M 1.691406 -2.425781 "
-           id="path8822" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph36-3">
-        <path
-           style="stroke:none;"
-           d="M 4.898438 -3.414063 L 3.636719 -3.1875 C 3.589844 -3.4375 3.492188 -3.625 3.34375 -3.757813 C 3.191406 -3.882813 2.996094 -3.949219 2.757813 -3.949219 C 2.433594 -3.949219 2.179688 -3.835938 1.992188 -3.617188 C 1.800781 -3.394531 1.703125 -3.027344 1.707031 -2.511719 C 1.703125 -1.933594 1.800781 -1.527344 1.996094 -1.292969 C 2.1875 -1.054688 2.449219 -0.9375 2.777344 -0.9375 C 3.019531 -0.9375 3.21875 -1.003906 3.375 -1.144531 C 3.53125 -1.277344 3.640625 -1.515625 3.703125 -1.859375 L 4.964844 -1.644531 C 4.828125 -1.0625 4.578125 -0.625 4.207031 -0.332031 C 3.835938 -0.0390625 3.339844 0.105469 2.71875 0.109375 C 2.007813 0.105469 1.441406 -0.117188 1.019531 -0.5625 C 0.597656 -1.007813 0.386719 -1.625 0.386719 -2.421875 C 0.386719 -3.21875 0.597656 -3.84375 1.019531 -4.289063 C 1.441406 -4.734375 2.011719 -4.957031 2.738281 -4.960938 C 3.324219 -4.957031 3.796875 -4.828125 4.148438 -4.578125 C 4.496094 -4.320313 4.746094 -3.933594 4.898438 -3.414063 Z M 4.898438 -3.414063 "
-           id="path8825" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph36-4">
-        <path
-           style="stroke:none;"
-           d="M 0.625 0 L 0.625 -6.695313 L 1.910156 -6.695313 L 1.910156 -3.140625 L 3.410156 -4.847656 L 4.992188 -4.847656 L 3.332031 -3.078125 L 5.109375 0 L 3.726563 0 L 2.507813 -2.179688 L 1.910156 -1.550781 L 1.910156 0 Z M 0.625 0 "
-           id="path8828" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph36-5">
-        <path
-           style="stroke:none;"
-           d="M 0.21875 -1.382813 L 1.507813 -1.578125 C 1.558594 -1.328125 1.671875 -1.140625 1.839844 -1.011719 C 2.003906 -0.878906 2.238281 -0.8125 2.542969 -0.816406 C 2.875 -0.8125 3.125 -0.875 3.292969 -1 C 3.402344 -1.082031 3.457031 -1.195313 3.460938 -1.34375 C 3.457031 -1.4375 3.425781 -1.519531 3.371094 -1.585938 C 3.300781 -1.644531 3.160156 -1.699219 2.941406 -1.753906 C 1.902344 -1.980469 1.246094 -2.191406 0.972656 -2.378906 C 0.589844 -2.640625 0.398438 -3.003906 0.398438 -3.46875 C 0.398438 -3.890625 0.5625 -4.242188 0.894531 -4.53125 C 1.226563 -4.8125 1.742188 -4.957031 2.4375 -4.960938 C 3.101563 -4.957031 3.59375 -4.847656 3.917969 -4.636719 C 4.238281 -4.417969 4.460938 -4.097656 4.585938 -3.675781 L 3.375 -3.453125 C 3.320313 -3.636719 3.21875 -3.78125 3.078125 -3.882813 C 2.929688 -3.980469 2.726563 -4.03125 2.460938 -4.035156 C 2.121094 -4.03125 1.878906 -3.984375 1.734375 -3.894531 C 1.636719 -3.824219 1.585938 -3.738281 1.589844 -3.636719 C 1.585938 -3.539063 1.628906 -3.460938 1.71875 -3.402344 C 1.828125 -3.3125 2.226563 -3.191406 2.914063 -3.039063 C 3.59375 -2.882813 4.074219 -2.691406 4.347656 -2.46875 C 4.613281 -2.238281 4.746094 -1.921875 4.75 -1.515625 C 4.746094 -1.066406 4.558594 -0.683594 4.191406 -0.367188 C 3.816406 -0.0507813 3.269531 0.105469 2.542969 0.109375 C 1.878906 0.105469 1.355469 -0.0273438 0.972656 -0.292969 C 0.589844 -0.558594 0.335938 -0.921875 0.21875 -1.382813 Z M 0.21875 -1.382813 "
-           id="path8831" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph37-0">
-        <path
-           style="stroke:none;"
-           d="M 0.335938 -2.160156 L 1.640625 -2.289063 C 1.71875 -1.847656 1.875 -1.523438 2.117188 -1.320313 C 2.351563 -1.113281 2.675781 -1.011719 3.089844 -1.015625 C 3.519531 -1.011719 3.847656 -1.105469 4.066406 -1.289063 C 4.285156 -1.46875 4.394531 -1.679688 4.394531 -1.929688 C 4.394531 -2.082031 4.347656 -2.214844 4.253906 -2.328125 C 4.160156 -2.4375 4 -2.535156 3.773438 -2.617188 C 3.617188 -2.667969 3.257813 -2.765625 2.699219 -2.90625 C 1.976563 -3.085938 1.472656 -3.304688 1.1875 -3.5625 C 0.777344 -3.929688 0.574219 -4.375 0.574219 -4.898438 C 0.574219 -5.234375 0.667969 -5.550781 0.859375 -5.847656 C 1.050781 -6.140625 1.328125 -6.363281 1.691406 -6.519531 C 2.050781 -6.671875 2.488281 -6.75 3 -6.753906 C 3.832031 -6.75 4.457031 -6.566406 4.878906 -6.203125 C 5.296875 -5.835938 5.519531 -5.347656 5.542969 -4.742188 L 4.203125 -4.683594 C 4.140625 -5.019531 4.019531 -5.265625 3.832031 -5.417969 C 3.644531 -5.566406 3.359375 -5.640625 2.984375 -5.644531 C 2.589844 -5.640625 2.285156 -5.5625 2.070313 -5.402344 C 1.921875 -5.300781 1.851563 -5.164063 1.855469 -4.992188 C 1.851563 -4.832031 1.917969 -4.695313 2.054688 -4.585938 C 2.222656 -4.441406 2.632813 -4.292969 3.285156 -4.144531 C 3.9375 -3.988281 4.421875 -3.828125 4.734375 -3.664063 C 5.046875 -3.496094 5.289063 -3.273438 5.46875 -2.988281 C 5.640625 -2.703125 5.730469 -2.351563 5.734375 -1.933594 C 5.730469 -1.550781 5.625 -1.191406 5.417969 -0.863281 C 5.203125 -0.527344 4.902344 -0.28125 4.519531 -0.125 C 4.128906 0.0390625 3.648438 0.117188 3.074219 0.117188 C 2.234375 0.117188 1.589844 -0.0742188 1.140625 -0.464844 C 0.6875 -0.847656 0.417969 -1.414063 0.335938 -2.160156 Z M 0.335938 -2.160156 "
-           id="path8834" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph37-1">
-        <path
-           style="stroke:none;"
-           d="M 2.871094 -4.808594 L 2.871094 -3.796875 L 2 -3.796875 L 2 -1.855469 C 1.996094 -1.460938 2.003906 -1.234375 2.023438 -1.171875 C 2.039063 -1.105469 2.078125 -1.050781 2.136719 -1.007813 C 2.195313 -0.964844 2.269531 -0.941406 2.355469 -0.945313 C 2.472656 -0.941406 2.640625 -0.984375 2.867188 -1.070313 L 2.976563 -0.0820313 C 2.679688 0.0429688 2.34375 0.105469 1.96875 0.109375 C 1.738281 0.105469 1.53125 0.0703125 1.347656 -0.00390625 C 1.164063 -0.078125 1.027344 -0.179688 0.945313 -0.304688 C 0.855469 -0.425781 0.796875 -0.589844 0.765625 -0.800781 C 0.738281 -0.945313 0.726563 -1.246094 0.726563 -1.699219 L 0.726563 -3.796875 L 0.140625 -3.796875 L 0.140625 -4.808594 L 0.726563 -4.808594 L 0.726563 -5.765625 L 2 -6.507813 L 2 -4.808594 Z M 2.871094 -4.808594 "
-           id="path8837" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph37-2">
-        <path
-           style="stroke:none;"
-           d="M 0.664063 0 L 0.664063 -6.640625 L 1.9375 -6.640625 L 1.9375 0 Z M 0.664063 0 "
-           id="path8840" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph37-3">
-        <path
-           style="stroke:none;"
-           d="M 0.710938 0 L 0.710938 -6.585938 L 2.050781 -6.585938 L 2.050781 -1.117188 L 5.386719 -1.117188 L 5.386719 0 Z M 0.710938 0 "
-           id="path8843" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph37-4">
-        <path
-           style="stroke:none;"
-           d="M 0.371094 -2.472656 C 0.367188 -2.894531 0.472656 -3.300781 0.683594 -3.699219 C 0.890625 -4.089844 1.183594 -4.394531 1.566406 -4.605469 C 1.945313 -4.8125 2.375 -4.914063 2.847656 -4.917969 C 3.574219 -4.914063 4.167969 -4.679688 4.636719 -4.207031 C 5.097656 -3.734375 5.332031 -3.136719 5.335938 -2.417969 C 5.332031 -1.6875 5.097656 -1.085938 4.628906 -0.609375 C 4.160156 -0.132813 3.570313 0.105469 2.859375 0.109375 C 2.414063 0.105469 1.992188 0.0078125 1.59375 -0.1875 C 1.195313 -0.386719 0.890625 -0.679688 0.683594 -1.066406 C 0.472656 -1.449219 0.367188 -1.917969 0.371094 -2.472656 Z M 1.675781 -2.40625 C 1.675781 -1.925781 1.789063 -1.558594 2.015625 -1.308594 C 2.242188 -1.050781 2.519531 -0.925781 2.851563 -0.929688 C 3.183594 -0.925781 3.460938 -1.050781 3.6875 -1.308594 C 3.910156 -1.558594 4.023438 -1.929688 4.027344 -2.414063 C 4.023438 -2.882813 3.910156 -3.242188 3.6875 -3.5 C 3.460938 -3.75 3.183594 -3.878906 2.851563 -3.882813 C 2.519531 -3.878906 2.242188 -3.75 2.015625 -3.5 C 1.789063 -3.242188 1.675781 -2.878906 1.675781 -2.40625 Z M 1.675781 -2.40625 "
-           id="path8846" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph37-5">
-        <path
-           style="stroke:none;"
-           d="M 4.859375 -3.386719 L 3.605469 -3.160156 C 3.558594 -3.410156 3.464844 -3.601563 3.316406 -3.726563 C 3.167969 -3.851563 2.972656 -3.914063 2.734375 -3.917969 C 2.417969 -3.914063 2.164063 -3.804688 1.976563 -3.589844 C 1.785156 -3.367188 1.691406 -3 1.695313 -2.492188 C 1.691406 -1.917969 1.789063 -1.515625 1.980469 -1.28125 C 2.171875 -1.042969 2.429688 -0.925781 2.753906 -0.929688 C 2.996094 -0.925781 3.191406 -0.992188 3.347656 -1.132813 C 3.496094 -1.265625 3.605469 -1.503906 3.671875 -1.84375 L 4.921875 -1.628906 C 4.789063 -1.054688 4.539063 -0.621094 4.175781 -0.328125 C 3.804688 -0.0390625 3.3125 0.105469 2.695313 0.109375 C 1.992188 0.105469 1.429688 -0.113281 1.011719 -0.554688 C 0.59375 -0.996094 0.386719 -1.609375 0.386719 -2.398438 C 0.386719 -3.1875 0.59375 -3.808594 1.015625 -4.253906 C 1.429688 -4.695313 2 -4.914063 2.71875 -4.917969 C 3.304688 -4.914063 3.769531 -4.789063 4.113281 -4.539063 C 4.457031 -4.285156 4.703125 -3.902344 4.859375 -3.386719 Z M 4.859375 -3.386719 "
-           id="path8849" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph37-6">
-        <path
-           style="stroke:none;"
-           d="M 0.621094 0 L 0.621094 -6.640625 L 1.894531 -6.640625 L 1.894531 -3.117188 L 3.382813 -4.808594 L 4.949219 -4.808594 L 3.304688 -3.050781 L 5.066406 0 L 3.695313 0 L 2.488281 -2.160156 L 1.894531 -1.539063 L 1.894531 0 Z M 0.621094 0 "
-           id="path8852" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph38-0">
-        <path
-           style="stroke:none;"
-           d="M 6.613281 0 L 5.164063 0 L 4.589844 -1.496094 L 1.957031 -1.496094 L 1.410156 0 L 0 0 L 2.566406 -6.589844 L 3.972656 -6.589844 Z M 4.164063 -2.609375 L 3.253906 -5.054688 L 2.363281 -2.609375 Z M 4.164063 -2.609375 "
-           id="path8855" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph38-1">
-        <path
-           style="stroke:none;"
-           d="M 2.851563 -4.773438 L 2.851563 -3.765625 L 1.988281 -3.765625 L 1.988281 -1.84375 C 1.984375 -1.449219 1.992188 -1.222656 2.011719 -1.160156 C 2.023438 -1.097656 2.0625 -1.042969 2.121094 -1.003906 C 2.179688 -0.957031 2.25 -0.9375 2.335938 -0.941406 C 2.449219 -0.9375 2.621094 -0.976563 2.847656 -1.0625 L 2.953125 -0.0820313 C 2.65625 0.0429688 2.324219 0.105469 1.957031 0.109375 C 1.726563 0.105469 1.519531 0.0703125 1.339844 -0.00390625 C 1.15625 -0.078125 1.023438 -0.175781 0.9375 -0.300781 C 0.851563 -0.417969 0.792969 -0.585938 0.761719 -0.796875 C 0.730469 -0.941406 0.714844 -1.238281 0.71875 -1.6875 L 0.71875 -3.765625 L 0.140625 -3.765625 L 0.140625 -4.773438 L 0.71875 -4.773438 L 0.71875 -5.722656 L 1.988281 -6.460938 L 1.988281 -4.773438 Z M 2.851563 -4.773438 "
-           id="path8858" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph38-2">
-        <path
-           style="stroke:none;"
-           d="M 0.367188 -2.453125 C 0.363281 -2.871094 0.464844 -3.277344 0.675781 -3.671875 C 0.878906 -4.0625 1.171875 -4.363281 1.554688 -4.570313 C 1.929688 -4.777344 2.355469 -4.878906 2.828125 -4.882813 C 3.546875 -4.878906 4.136719 -4.644531 4.601563 -4.179688 C 5.0625 -3.707031 5.296875 -3.117188 5.296875 -2.402344 C 5.296875 -1.675781 5.0625 -1.074219 4.597656 -0.601563 C 4.128906 -0.128906 3.542969 0.105469 2.835938 0.109375 C 2.398438 0.105469 1.980469 0.0078125 1.582031 -0.1875 C 1.183594 -0.382813 0.878906 -0.671875 0.675781 -1.058594 C 0.464844 -1.4375 0.363281 -1.902344 0.367188 -2.453125 Z M 1.664063 -2.386719 C 1.664063 -1.910156 1.773438 -1.546875 2 -1.296875 C 2.21875 -1.042969 2.496094 -0.917969 2.832031 -0.921875 C 3.160156 -0.917969 3.4375 -1.042969 3.660156 -1.296875 C 3.882813 -1.546875 3.996094 -1.914063 3.996094 -2.394531 C 3.996094 -2.859375 3.882813 -3.21875 3.660156 -3.472656 C 3.4375 -3.722656 3.160156 -3.847656 2.832031 -3.851563 C 2.496094 -3.847656 2.21875 -3.722656 2 -3.472656 C 1.773438 -3.21875 1.664063 -2.855469 1.664063 -2.386719 Z M 1.664063 -2.386719 "
-           id="path8861" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph38-3">
-        <path
-           style="stroke:none;"
-           d="M 0.566406 -4.773438 L 1.730469 -4.773438 L 1.730469 -4.121094 C 2.144531 -4.625 2.640625 -4.878906 3.21875 -4.882813 C 3.519531 -4.878906 3.785156 -4.816406 4.011719 -4.691406 C 4.234375 -4.5625 4.417969 -4.371094 4.566406 -4.121094 C 4.773438 -4.371094 5 -4.5625 5.246094 -4.691406 C 5.484375 -4.816406 5.746094 -4.878906 6.023438 -4.882813 C 6.375 -4.878906 6.671875 -4.808594 6.914063 -4.667969 C 7.15625 -4.523438 7.335938 -4.316406 7.457031 -4.042969 C 7.542969 -3.835938 7.585938 -3.503906 7.589844 -3.050781 L 7.589844 0 L 6.324219 0 L 6.324219 -2.730469 C 6.320313 -3.199219 6.277344 -3.503906 6.195313 -3.644531 C 6.074219 -3.824219 5.894531 -3.914063 5.65625 -3.914063 C 5.476563 -3.914063 5.3125 -3.859375 5.15625 -3.753906 C 5 -3.644531 4.886719 -3.488281 4.816406 -3.28125 C 4.746094 -3.070313 4.710938 -2.738281 4.714844 -2.292969 L 4.714844 0 L 3.453125 0 L 3.453125 -2.617188 C 3.449219 -3.078125 3.425781 -3.378906 3.382813 -3.515625 C 3.335938 -3.648438 3.269531 -3.746094 3.175781 -3.816406 C 3.082031 -3.878906 2.953125 -3.914063 2.792969 -3.914063 C 2.59375 -3.914063 2.417969 -3.859375 2.265625 -3.757813 C 2.105469 -3.648438 1.992188 -3.5 1.929688 -3.304688 C 1.859375 -3.105469 1.828125 -2.777344 1.828125 -2.320313 L 1.828125 0 L 0.566406 0 Z M 0.566406 -4.773438 "
-           id="path8864" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph38-4">
-        <path
-           style="stroke:none;"
-           d="M 0.660156 -5.421875 L 0.660156 -6.589844 L 1.925781 -6.589844 L 1.925781 -5.421875 Z M 0.660156 0 L 0.660156 -4.773438 L 1.925781 -4.773438 L 1.925781 0 Z M 0.660156 0 "
-           id="path8867" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph38-5">
-        <path
-           style="stroke:none;"
-           d="M 4.824219 -3.363281 L 3.578125 -3.136719 C 3.535156 -3.382813 3.441406 -3.570313 3.292969 -3.699219 C 3.144531 -3.824219 2.949219 -3.890625 2.714844 -3.890625 C 2.394531 -3.890625 2.144531 -3.78125 1.960938 -3.5625 C 1.769531 -3.34375 1.675781 -2.980469 1.679688 -2.472656 C 1.675781 -1.902344 1.769531 -1.5 1.964844 -1.269531 C 2.152344 -1.035156 2.410156 -0.917969 2.734375 -0.921875 C 2.972656 -0.917969 3.167969 -0.988281 3.320313 -1.125 C 3.472656 -1.261719 3.582031 -1.496094 3.644531 -1.828125 L 4.886719 -1.617188 C 4.757813 -1.046875 4.507813 -0.617188 4.144531 -0.328125 C 3.777344 -0.0390625 3.289063 0.105469 2.675781 0.109375 C 1.976563 0.105469 1.417969 -0.113281 1.003906 -0.550781 C 0.585938 -0.992188 0.378906 -1.601563 0.382813 -2.382813 C 0.378906 -3.167969 0.589844 -3.78125 1.007813 -4.222656 C 1.421875 -4.660156 1.984375 -4.878906 2.699219 -4.882813 C 3.277344 -4.878906 3.738281 -4.753906 4.082031 -4.507813 C 4.421875 -4.253906 4.667969 -3.875 4.824219 -3.363281 Z M 4.824219 -3.363281 "
-           id="path8870" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph38-6">
-        <path
-           style="stroke:none;"
-           d="M 0.398438 -3.253906 C 0.398438 -3.921875 0.5 -4.488281 0.703125 -4.945313 C 0.851563 -5.28125 1.054688 -5.582031 1.3125 -5.847656 C 1.570313 -6.113281 1.855469 -6.308594 2.167969 -6.441406 C 2.574219 -6.613281 3.046875 -6.703125 3.585938 -6.703125 C 4.554688 -6.703125 5.332031 -6.398438 5.917969 -5.796875 C 6.5 -5.191406 6.789063 -4.355469 6.792969 -3.285156 C 6.789063 -2.21875 6.5 -1.386719 5.925781 -0.789063 C 5.34375 -0.1875 4.570313 0.109375 3.605469 0.113281 C 2.621094 0.109375 1.84375 -0.183594 1.265625 -0.78125 C 0.6875 -1.375 0.398438 -2.199219 0.398438 -3.253906 Z M 1.769531 -3.300781 C 1.769531 -2.550781 1.941406 -1.984375 2.285156 -1.601563 C 2.628906 -1.210938 3.066406 -1.019531 3.601563 -1.023438 C 4.128906 -1.019531 4.5625 -1.210938 4.90625 -1.597656 C 5.242188 -1.976563 5.414063 -2.550781 5.417969 -3.316406 C 5.414063 -4.070313 5.25 -4.632813 4.921875 -5.007813 C 4.589844 -5.378906 4.148438 -5.566406 3.601563 -5.566406 C 3.050781 -5.566406 2.609375 -5.375 2.273438 -5 C 1.9375 -4.621094 1.769531 -4.054688 1.769531 -3.300781 Z M 1.769531 -3.300781 "
-           id="path8873" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph38-7">
-        <path
-           style="stroke:none;"
-           d="M 0.625 -4.773438 L 1.804688 -4.773438 L 1.804688 -4.074219 C 1.953125 -4.308594 2.15625 -4.503906 2.421875 -4.65625 C 2.679688 -4.804688 2.96875 -4.878906 3.289063 -4.882813 C 3.839844 -4.878906 4.3125 -4.664063 4.699219 -4.230469 C 5.085938 -3.792969 5.277344 -3.1875 5.28125 -2.414063 C 5.277344 -1.617188 5.082031 -0.996094 4.695313 -0.554688 C 4.304688 -0.113281 3.835938 0.105469 3.28125 0.109375 C 3.015625 0.105469 2.773438 0.0546875 2.5625 -0.046875 C 2.34375 -0.152344 2.121094 -0.332031 1.886719 -0.589844 L 1.886719 1.816406 L 0.625 1.816406 Z M 1.875 -2.46875 C 1.875 -1.929688 1.980469 -1.535156 2.191406 -1.277344 C 2.402344 -1.019531 2.660156 -0.890625 2.972656 -0.894531 C 3.261719 -0.890625 3.507813 -1.007813 3.703125 -1.246094 C 3.898438 -1.480469 3.996094 -1.867188 3.996094 -2.40625 C 3.996094 -2.902344 3.894531 -3.273438 3.695313 -3.519531 C 3.492188 -3.757813 3.242188 -3.878906 2.949219 -3.882813 C 2.636719 -3.878906 2.382813 -3.757813 2.179688 -3.523438 C 1.976563 -3.28125 1.875 -2.929688 1.875 -2.46875 Z M 1.875 -2.46875 "
-           id="path8876" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph38-8">
-        <path
-           style="stroke:none;"
-           d="M 4.886719 -2.421875 L 6.175781 -2.015625 C 5.976563 -1.292969 5.644531 -0.757813 5.1875 -0.410156 C 4.726563 -0.0625 4.144531 0.109375 3.4375 0.113281 C 2.558594 0.109375 1.839844 -0.183594 1.28125 -0.78125 C 0.714844 -1.375 0.433594 -2.195313 0.4375 -3.238281 C 0.433594 -4.332031 0.714844 -5.183594 1.285156 -5.792969 C 1.847656 -6.398438 2.59375 -6.703125 3.519531 -6.703125 C 4.324219 -6.703125 4.980469 -6.464844 5.484375 -5.988281 C 5.78125 -5.707031 6.003906 -5.300781 6.160156 -4.773438 L 4.84375 -4.460938 C 4.761719 -4.796875 4.597656 -5.066406 4.355469 -5.269531 C 4.105469 -5.464844 3.804688 -5.566406 3.453125 -5.566406 C 2.960938 -5.566406 2.566406 -5.390625 2.261719 -5.039063 C 1.957031 -4.6875 1.804688 -4.117188 1.808594 -3.335938 C 1.804688 -2.5 1.953125 -1.90625 2.257813 -1.554688 C 2.554688 -1.195313 2.945313 -1.019531 3.425781 -1.023438 C 3.777344 -1.019531 4.082031 -1.132813 4.335938 -1.359375 C 4.589844 -1.585938 4.773438 -1.9375 4.886719 -2.421875 Z M 4.886719 -2.421875 "
-           id="path8879" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph38-9">
-        <path
-           style="stroke:none;"
-           d="M 1.871094 0 L 0.605469 0 L 0.605469 -4.773438 L 1.78125 -4.773438 L 1.78125 -4.097656 C 1.980469 -4.414063 2.160156 -4.625 2.320313 -4.730469 C 2.480469 -4.828125 2.660156 -4.878906 2.867188 -4.882813 C 3.152344 -4.878906 3.429688 -4.800781 3.699219 -4.644531 L 3.308594 -3.542969 C 3.09375 -3.679688 2.894531 -3.75 2.714844 -3.75 C 2.535156 -3.75 2.386719 -3.699219 2.265625 -3.601563 C 2.140625 -3.503906 2.042969 -3.328125 1.976563 -3.074219 C 1.902344 -2.816406 1.867188 -2.285156 1.871094 -1.476563 Z M 1.871094 0 "
-           id="path8882" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph38-10">
-        <path
-           style="stroke:none;"
-           d=""
-           id="path8885" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph39-0">
-        <path
-           style="stroke:none;"
-           d="M 0.675781 -6.585938 L 3.308594 -6.585938 C 3.828125 -6.585938 4.214844 -6.5625 4.472656 -6.519531 C 4.722656 -6.476563 4.953125 -6.386719 5.160156 -6.25 C 5.359375 -6.109375 5.527344 -5.925781 5.664063 -5.699219 C 5.796875 -5.46875 5.863281 -5.210938 5.867188 -4.929688 C 5.863281 -4.617188 5.78125 -4.335938 5.617188 -4.078125 C 5.449219 -3.820313 5.222656 -3.625 4.941406 -3.5 C 5.339844 -3.378906 5.648438 -3.179688 5.867188 -2.902344 C 6.078125 -2.617188 6.1875 -2.289063 6.191406 -1.910156 C 6.1875 -1.609375 6.117188 -1.316406 5.980469 -1.035156 C 5.839844 -0.75 5.648438 -0.523438 5.410156 -0.355469 C 5.167969 -0.183594 4.871094 -0.0820313 4.519531 -0.046875 C 4.292969 -0.0195313 3.757813 -0.00390625 2.914063 0 L 0.675781 0 Z M 2.003906 -5.492188 L 2.003906 -3.96875 L 2.875 -3.96875 C 3.390625 -3.964844 3.710938 -3.972656 3.839844 -3.988281 C 4.066406 -4.015625 4.246094 -4.09375 4.378906 -4.226563 C 4.503906 -4.351563 4.570313 -4.523438 4.574219 -4.738281 C 4.570313 -4.941406 4.515625 -5.105469 4.40625 -5.234375 C 4.292969 -5.359375 4.125 -5.4375 3.90625 -5.46875 C 3.769531 -5.480469 3.390625 -5.488281 2.769531 -5.492188 Z M 2.003906 -2.871094 L 2.003906 -1.109375 L 3.234375 -1.109375 C 3.710938 -1.105469 4.015625 -1.121094 4.148438 -1.148438 C 4.34375 -1.183594 4.503906 -1.269531 4.628906 -1.410156 C 4.753906 -1.546875 4.816406 -1.734375 4.816406 -1.972656 C 4.816406 -2.167969 4.765625 -2.335938 4.671875 -2.476563 C 4.570313 -2.613281 4.433594 -2.714844 4.253906 -2.777344 C 4.074219 -2.839844 3.679688 -2.871094 3.078125 -2.871094 Z M 2.003906 -2.871094 "
-           id="path8888" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph39-1">
-        <path
-           style="stroke:none;"
-           d="M 3.800781 0 L 3.800781 -0.714844 C 3.625 -0.457031 3.394531 -0.253906 3.113281 -0.109375 C 2.828125 0.0351563 2.53125 0.105469 2.21875 0.109375 C 1.894531 0.105469 1.605469 0.0390625 1.355469 -0.101563 C 1.097656 -0.242188 0.914063 -0.441406 0.804688 -0.695313 C 0.6875 -0.949219 0.632813 -1.300781 0.632813 -1.753906 L 0.632813 -4.773438 L 1.894531 -4.773438 L 1.894531 -2.578125 C 1.890625 -1.90625 1.914063 -1.492188 1.964844 -1.34375 C 2.007813 -1.1875 2.09375 -1.070313 2.21875 -0.984375 C 2.339844 -0.898438 2.496094 -0.855469 2.6875 -0.855469 C 2.902344 -0.855469 3.09375 -0.914063 3.265625 -1.03125 C 3.433594 -1.148438 3.550781 -1.292969 3.617188 -1.46875 C 3.675781 -1.644531 3.707031 -2.074219 3.710938 -2.757813 L 3.710938 -4.773438 L 4.972656 -4.773438 L 4.972656 0 Z M 3.800781 0 "
-           id="path8891" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph39-2">
-        <path
-           style="stroke:none;"
-           d="M 0.109375 -4.773438 L 0.808594 -4.773438 L 0.808594 -5.132813 C 0.808594 -5.53125 0.851563 -5.828125 0.9375 -6.027344 C 1.023438 -6.222656 1.179688 -6.382813 1.410156 -6.511719 C 1.636719 -6.632813 1.925781 -6.695313 2.277344 -6.699219 C 2.632813 -6.695313 2.984375 -6.644531 3.332031 -6.539063 L 3.164063 -5.65625 C 2.960938 -5.703125 2.765625 -5.730469 2.582031 -5.730469 C 2.398438 -5.730469 2.265625 -5.6875 2.1875 -5.601563 C 2.105469 -5.515625 2.066406 -5.351563 2.070313 -5.109375 L 2.070313 -4.773438 L 3.015625 -4.773438 L 3.015625 -3.777344 L 2.070313 -3.777344 L 2.070313 0 L 0.808594 0 L 0.808594 -3.777344 L 0.109375 -3.777344 Z M 0.109375 -4.773438 "
-           id="path8894" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph39-3">
-        <path
-           style="stroke:none;"
-           d="M 3.421875 -1.519531 L 4.683594 -1.308594 C 4.515625 -0.84375 4.261719 -0.492188 3.914063 -0.253906 C 3.5625 -0.0117188 3.125 0.105469 2.605469 0.109375 C 1.773438 0.105469 1.160156 -0.164063 0.765625 -0.707031 C 0.449219 -1.136719 0.292969 -1.683594 0.292969 -2.351563 C 0.292969 -3.136719 0.496094 -3.757813 0.910156 -4.207031 C 1.320313 -4.652344 1.84375 -4.875 2.480469 -4.878906 C 3.1875 -4.875 3.75 -4.640625 4.160156 -4.175781 C 4.570313 -3.703125 4.765625 -2.984375 4.75 -2.023438 L 1.585938 -2.023438 C 1.589844 -1.644531 1.691406 -1.355469 1.890625 -1.148438 C 2.082031 -0.9375 2.324219 -0.832031 2.621094 -0.835938 C 2.8125 -0.832031 2.980469 -0.886719 3.117188 -0.996094 C 3.25 -1.105469 3.351563 -1.277344 3.421875 -1.519531 Z M 3.496094 -2.792969 C 3.484375 -3.15625 3.390625 -3.433594 3.210938 -3.628906 C 3.03125 -3.816406 2.8125 -3.914063 2.554688 -3.914063 C 2.28125 -3.914063 2.054688 -3.8125 1.875 -3.613281 C 1.695313 -3.410156 1.605469 -3.136719 1.609375 -2.792969 Z M 3.496094 -2.792969 "
-           id="path8897" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph39-4">
-        <path
-           style="stroke:none;"
-           d="M 1.867188 0 L 0.605469 0 L 0.605469 -4.773438 L 1.777344 -4.773438 L 1.777344 -4.09375 C 1.976563 -4.410156 2.160156 -4.621094 2.320313 -4.726563 C 2.480469 -4.824219 2.660156 -4.875 2.867188 -4.878906 C 3.148438 -4.875 3.425781 -4.796875 3.699219 -4.640625 L 3.308594 -3.539063 C 3.09375 -3.675781 2.894531 -3.742188 2.714844 -3.746094 C 2.535156 -3.742188 2.386719 -3.695313 2.265625 -3.601563 C 2.140625 -3.503906 2.042969 -3.328125 1.972656 -3.074219 C 1.898438 -2.816406 1.863281 -2.28125 1.867188 -1.472656 Z M 1.867188 0 "
-           id="path8900" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph40-0">
-        <path
-           style="stroke:none;"
-           d="M 0.679688 -6.648438 L 3.335938 -6.648438 C 3.859375 -6.644531 4.253906 -6.621094 4.511719 -6.582031 C 4.769531 -6.535156 5 -6.445313 5.207031 -6.304688 C 5.40625 -6.164063 5.574219 -5.976563 5.714844 -5.75 C 5.847656 -5.515625 5.917969 -5.257813 5.921875 -4.972656 C 5.917969 -4.660156 5.832031 -4.375 5.667969 -4.117188 C 5.496094 -3.855469 5.269531 -3.660156 4.988281 -3.53125 C 5.386719 -3.410156 5.699219 -3.210938 5.921875 -2.929688 C 6.136719 -2.644531 6.246094 -2.308594 6.25 -1.925781 C 6.246094 -1.621094 6.175781 -1.328125 6.039063 -1.042969 C 5.894531 -0.757813 5.703125 -0.527344 5.460938 -0.359375 C 5.21875 -0.1875 4.917969 -0.0859375 4.5625 -0.046875 C 4.335938 -0.0195313 3.796875 -0.00390625 2.941406 0 L 0.679688 0 Z M 2.023438 -5.539063 L 2.023438 -4.003906 L 2.902344 -4.003906 C 3.421875 -4 3.746094 -4.007813 3.875 -4.027344 C 4.101563 -4.050781 4.285156 -4.128906 4.417969 -4.261719 C 4.550781 -4.390625 4.617188 -4.566406 4.617188 -4.785156 C 4.617188 -4.988281 4.558594 -5.15625 4.445313 -5.285156 C 4.332031 -5.410156 4.164063 -5.488281 3.941406 -5.519531 C 3.804688 -5.53125 3.421875 -5.535156 2.792969 -5.539063 Z M 2.023438 -2.898438 L 2.023438 -1.121094 L 3.265625 -1.121094 C 3.742188 -1.117188 4.050781 -1.132813 4.183594 -1.160156 C 4.382813 -1.195313 4.542969 -1.28125 4.671875 -1.425781 C 4.792969 -1.5625 4.855469 -1.753906 4.859375 -1.992188 C 4.855469 -2.1875 4.808594 -2.355469 4.714844 -2.496094 C 4.617188 -2.636719 4.476563 -2.738281 4.296875 -2.800781 C 4.109375 -2.863281 3.714844 -2.894531 3.105469 -2.898438 Z M 2.023438 -2.898438 "
-           id="path8903" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph40-1">
-        <path
-           style="stroke:none;"
-           d="M 3.835938 0 L 3.835938 -0.722656 C 3.660156 -0.460938 3.429688 -0.257813 3.144531 -0.113281 C 2.855469 0.0351563 2.554688 0.105469 2.238281 0.109375 C 1.914063 0.105469 1.621094 0.0390625 1.367188 -0.101563 C 1.109375 -0.242188 0.925781 -0.445313 0.8125 -0.703125 C 0.695313 -0.960938 0.636719 -1.316406 0.640625 -1.769531 L 0.640625 -4.816406 L 1.914063 -4.816406 L 1.914063 -2.601563 C 1.910156 -1.921875 1.933594 -1.503906 1.984375 -1.355469 C 2.027344 -1.199219 2.113281 -1.082031 2.238281 -0.996094 C 2.363281 -0.90625 2.519531 -0.859375 2.710938 -0.863281 C 2.925781 -0.859375 3.121094 -0.917969 3.296875 -1.039063 C 3.464844 -1.15625 3.582031 -1.304688 3.648438 -1.484375 C 3.710938 -1.65625 3.742188 -2.089844 3.746094 -2.785156 L 3.746094 -4.816406 L 5.019531 -4.816406 L 5.019531 0 Z M 3.835938 0 "
-           id="path8906" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph40-2">
-        <path
-           style="stroke:none;"
-           d="M 0.109375 -4.816406 L 0.816406 -4.816406 L 0.816406 -5.179688 C 0.8125 -5.582031 0.855469 -5.882813 0.945313 -6.082031 C 1.027344 -6.28125 1.1875 -6.441406 1.421875 -6.570313 C 1.652344 -6.695313 1.945313 -6.761719 2.300781 -6.761719 C 2.660156 -6.761719 3.015625 -6.707031 3.363281 -6.597656 L 3.191406 -5.707031 C 2.984375 -5.753906 2.789063 -5.777344 2.605469 -5.78125 C 2.421875 -5.777344 2.289063 -5.734375 2.210938 -5.652344 C 2.125 -5.5625 2.085938 -5.398438 2.089844 -5.15625 L 2.089844 -4.816406 L 3.042969 -4.816406 L 3.042969 -3.8125 L 2.089844 -3.8125 L 2.089844 0 L 0.816406 0 L 0.816406 -3.8125 L 0.109375 -3.8125 Z M 0.109375 -4.816406 "
-           id="path8909" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph40-3">
-        <path
-           style="stroke:none;"
-           d="M 4.929688 -2.445313 L 6.230469 -2.03125 C 6.027344 -1.300781 5.695313 -0.761719 5.234375 -0.414063 C 4.765625 -0.0625 4.179688 0.109375 3.46875 0.113281 C 2.585938 0.109375 1.859375 -0.1875 1.292969 -0.789063 C 0.722656 -1.390625 0.4375 -2.214844 0.441406 -3.265625 C 0.4375 -4.367188 0.722656 -5.226563 1.296875 -5.84375 C 1.863281 -6.453125 2.617188 -6.761719 3.550781 -6.761719 C 4.363281 -6.761719 5.023438 -6.519531 5.53125 -6.039063 C 5.832031 -5.753906 6.058594 -5.347656 6.210938 -4.816406 L 4.882813 -4.496094 C 4.800781 -4.839844 4.636719 -5.113281 4.390625 -5.3125 C 4.136719 -5.511719 3.835938 -5.613281 3.480469 -5.613281 C 2.988281 -5.613281 2.585938 -5.433594 2.28125 -5.082031 C 1.972656 -4.722656 1.820313 -4.152344 1.824219 -3.363281 C 1.820313 -2.523438 1.972656 -1.925781 2.277344 -1.570313 C 2.578125 -1.210938 2.96875 -1.03125 3.453125 -1.035156 C 3.804688 -1.03125 4.113281 -1.144531 4.375 -1.375 C 4.628906 -1.597656 4.8125 -1.953125 4.929688 -2.445313 Z M 4.929688 -2.445313 "
-           id="path8912" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph40-4">
-        <path
-           style="stroke:none;"
-           d="M 0.628906 -4.816406 L 1.816406 -4.816406 L 1.816406 -4.109375 C 1.972656 -4.34375 2.179688 -4.539063 2.445313 -4.695313 C 2.703125 -4.84375 2.996094 -4.921875 3.320313 -4.925781 C 3.875 -4.921875 4.347656 -4.703125 4.742188 -4.265625 C 5.128906 -3.828125 5.324219 -3.214844 5.328125 -2.433594 C 5.324219 -1.628906 5.128906 -1.003906 4.738281 -0.558594 C 4.34375 -0.113281 3.867188 0.105469 3.308594 0.109375 C 3.039063 0.105469 2.796875 0.0546875 2.585938 -0.046875 C 2.367188 -0.152344 2.140625 -0.332031 1.90625 -0.59375 L 1.90625 1.832031 L 0.628906 1.832031 Z M 1.890625 -2.488281 C 1.886719 -1.945313 1.992188 -1.546875 2.210938 -1.289063 C 2.421875 -1.03125 2.683594 -0.902344 2.996094 -0.902344 C 3.289063 -0.902344 3.535156 -1.019531 3.734375 -1.257813 C 3.929688 -1.492188 4.03125 -1.882813 4.03125 -2.425781 C 4.03125 -2.929688 3.929688 -3.304688 3.726563 -3.550781 C 3.523438 -3.792969 3.269531 -3.914063 2.972656 -3.917969 C 2.660156 -3.914063 2.402344 -3.792969 2.199219 -3.554688 C 1.988281 -3.3125 1.886719 -2.957031 1.890625 -2.488281 Z M 1.890625 -2.488281 "
-           id="path8915" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph41-0">
-        <path
-           style="stroke:none;"
-           d="M 0.675781 -6.589844 L 3.308594 -6.589844 C 3.828125 -6.585938 4.214844 -6.5625 4.472656 -6.523438 C 4.726563 -6.476563 4.957031 -6.386719 5.160156 -6.253906 C 5.359375 -6.113281 5.527344 -5.929688 5.667969 -5.703125 C 5.800781 -5.46875 5.871094 -5.210938 5.871094 -4.929688 C 5.871094 -4.617188 5.785156 -4.335938 5.621094 -4.082031 C 5.449219 -3.820313 5.226563 -3.625 4.945313 -3.5 C 5.34375 -3.382813 5.652344 -3.183594 5.871094 -2.902344 C 6.085938 -2.617188 6.191406 -2.289063 6.195313 -1.910156 C 6.191406 -1.609375 6.121094 -1.316406 5.984375 -1.035156 C 5.84375 -0.75 5.65625 -0.523438 5.414063 -0.355469 C 5.171875 -0.183594 4.875 -0.0820313 4.523438 -0.046875 C 4.296875 -0.0195313 3.761719 -0.00390625 2.917969 0 L 0.675781 0 Z M 2.003906 -5.492188 L 2.003906 -3.96875 L 2.875 -3.96875 C 3.394531 -3.964844 3.71875 -3.972656 3.84375 -3.992188 C 4.070313 -4.015625 4.246094 -4.09375 4.378906 -4.226563 C 4.503906 -4.351563 4.570313 -4.523438 4.574219 -4.742188 C 4.570313 -4.941406 4.515625 -5.109375 4.40625 -5.238281 C 4.292969 -5.363281 4.125 -5.4375 3.90625 -5.46875 C 3.769531 -5.480469 3.390625 -5.488281 2.769531 -5.492188 Z M 2.003906 -2.871094 L 2.003906 -1.109375 L 3.234375 -1.109375 C 3.714844 -1.105469 4.019531 -1.121094 4.148438 -1.152344 C 4.34375 -1.183594 4.503906 -1.269531 4.632813 -1.410156 C 4.753906 -1.546875 4.816406 -1.734375 4.820313 -1.972656 C 4.816406 -2.167969 4.769531 -2.335938 4.675781 -2.476563 C 4.578125 -2.613281 4.4375 -2.714844 4.257813 -2.777344 C 4.074219 -2.839844 3.679688 -2.871094 3.078125 -2.871094 Z M 2.003906 -2.871094 "
-           id="path8918" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph41-1">
-        <path
-           style="stroke:none;"
-           d="M 3.800781 0 L 3.800781 -0.714844 C 3.625 -0.457031 3.398438 -0.253906 3.117188 -0.109375 C 2.832031 0.0351563 2.53125 0.105469 2.21875 0.109375 C 1.894531 0.105469 1.605469 0.0390625 1.355469 -0.101563 C 1.097656 -0.242188 0.914063 -0.441406 0.804688 -0.695313 C 0.6875 -0.949219 0.632813 -1.300781 0.632813 -1.753906 L 0.632813 -4.773438 L 1.898438 -4.773438 L 1.898438 -2.578125 C 1.898438 -1.90625 1.917969 -1.496094 1.964844 -1.347656 C 2.007813 -1.195313 2.09375 -1.074219 2.21875 -0.988281 C 2.339844 -0.898438 2.496094 -0.855469 2.6875 -0.855469 C 2.902344 -0.855469 3.09375 -0.914063 3.265625 -1.03125 C 3.433594 -1.148438 3.550781 -1.292969 3.617188 -1.472656 C 3.675781 -1.644531 3.707031 -2.074219 3.710938 -2.761719 L 3.710938 -4.773438 L 4.976563 -4.773438 L 4.976563 0 Z M 3.800781 0 "
-           id="path8921" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph41-2">
-        <path
-           style="stroke:none;"
-           d="M 0.109375 -4.773438 L 0.808594 -4.773438 L 0.808594 -5.132813 C 0.808594 -5.53125 0.851563 -5.828125 0.9375 -6.03125 C 1.023438 -6.226563 1.179688 -6.390625 1.410156 -6.515625 C 1.636719 -6.640625 1.925781 -6.703125 2.277344 -6.703125 C 2.632813 -6.703125 2.984375 -6.648438 3.335938 -6.539063 L 3.164063 -5.660156 C 2.960938 -5.707031 2.765625 -5.730469 2.585938 -5.730469 C 2.398438 -5.730469 2.265625 -5.6875 2.1875 -5.601563 C 2.105469 -5.515625 2.066406 -5.351563 2.070313 -5.109375 L 2.070313 -4.773438 L 3.015625 -4.773438 L 3.015625 -3.78125 L 2.070313 -3.78125 L 2.070313 0 L 0.808594 0 L 0.808594 -3.78125 L 0.109375 -3.78125 Z M 0.109375 -4.773438 "
-           id="path8924" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph41-3">
-        <path
-           style="stroke:none;"
-           d="M 4.886719 -2.421875 L 6.175781 -2.015625 C 5.976563 -1.292969 5.644531 -0.757813 5.1875 -0.410156 C 4.726563 -0.0625 4.144531 0.109375 3.4375 0.113281 C 2.558594 0.109375 1.839844 -0.183594 1.28125 -0.78125 C 0.714844 -1.375 0.433594 -2.195313 0.4375 -3.234375 C 0.433594 -4.332031 0.714844 -5.183594 1.285156 -5.792969 C 1.847656 -6.398438 2.59375 -6.703125 3.519531 -6.703125 C 4.324219 -6.703125 4.980469 -6.464844 5.484375 -5.988281 C 5.78125 -5.707031 6.003906 -5.300781 6.15625 -4.773438 L 4.839844 -4.457031 C 4.761719 -4.796875 4.597656 -5.066406 4.351563 -5.269531 C 4.105469 -5.464844 3.804688 -5.566406 3.453125 -5.566406 C 2.960938 -5.566406 2.566406 -5.390625 2.261719 -5.039063 C 1.957031 -4.6875 1.804688 -4.117188 1.808594 -3.335938 C 1.804688 -2.5 1.953125 -1.90625 2.257813 -1.554688 C 2.554688 -1.195313 2.945313 -1.019531 3.425781 -1.023438 C 3.777344 -1.019531 4.082031 -1.132813 4.335938 -1.359375 C 4.589844 -1.585938 4.773438 -1.9375 4.886719 -2.421875 Z M 4.886719 -2.421875 "
-           id="path8927" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph41-4">
-        <path
-           style="stroke:none;"
-           d="M 5.039063 0 L 3.867188 0 L 3.867188 -0.703125 C 3.671875 -0.429688 3.441406 -0.222656 3.175781 -0.0898438 C 2.910156 0.0429688 2.640625 0.105469 2.375 0.109375 C 1.820313 0.105469 1.351563 -0.113281 0.964844 -0.554688 C 0.570313 -0.996094 0.375 -1.613281 0.378906 -2.40625 C 0.375 -3.210938 0.566406 -3.824219 0.949219 -4.25 C 1.328125 -4.667969 1.808594 -4.878906 2.390625 -4.882813 C 2.921875 -4.878906 3.382813 -4.65625 3.777344 -4.214844 L 3.777344 -6.589844 L 5.039063 -6.589844 Z M 1.667969 -2.488281 C 1.664063 -1.976563 1.734375 -1.609375 1.878906 -1.382813 C 2.078125 -1.054688 2.363281 -0.890625 2.734375 -0.890625 C 3.019531 -0.890625 3.265625 -1.011719 3.472656 -1.261719 C 3.675781 -1.503906 3.777344 -1.875 3.78125 -2.367188 C 3.777344 -2.914063 3.679688 -3.308594 3.484375 -3.550781 C 3.285156 -3.792969 3.03125 -3.914063 2.722656 -3.914063 C 2.421875 -3.914063 2.171875 -3.792969 1.96875 -3.554688 C 1.765625 -3.316406 1.664063 -2.960938 1.667969 -2.488281 Z M 1.667969 -2.488281 "
-           id="path8930" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph41-5">
-        <path
-           style="stroke:none;"
-           d="M 1.605469 -3.316406 L 0.457031 -3.523438 C 0.585938 -3.980469 0.804688 -4.324219 1.121094 -4.546875 C 1.433594 -4.769531 1.902344 -4.878906 2.527344 -4.882813 C 3.085938 -4.878906 3.507813 -4.8125 3.785156 -4.679688 C 4.058594 -4.542969 4.25 -4.375 4.367188 -4.171875 C 4.476563 -3.96875 4.535156 -3.589844 4.535156 -3.042969 L 4.523438 -1.570313 C 4.523438 -1.148438 4.542969 -0.835938 4.582031 -0.636719 C 4.621094 -0.4375 4.695313 -0.226563 4.808594 0 L 3.558594 0 C 3.523438 -0.0820313 3.484375 -0.207031 3.4375 -0.375 C 3.414063 -0.445313 3.398438 -0.492188 3.394531 -0.519531 C 3.175781 -0.308594 2.945313 -0.152344 2.699219 -0.046875 C 2.453125 0.0546875 2.191406 0.105469 1.914063 0.109375 C 1.417969 0.105469 1.03125 -0.0234375 0.75 -0.289063 C 0.46875 -0.554688 0.328125 -0.894531 0.328125 -1.304688 C 0.328125 -1.570313 0.390625 -1.808594 0.519531 -2.023438 C 0.648438 -2.234375 0.828125 -2.398438 1.0625 -2.511719 C 1.289063 -2.621094 1.625 -2.71875 2.066406 -2.804688 C 2.65625 -2.914063 3.066406 -3.015625 3.292969 -3.113281 L 3.292969 -3.242188 C 3.292969 -3.480469 3.230469 -3.652344 3.113281 -3.757813 C 2.988281 -3.859375 2.765625 -3.914063 2.4375 -3.914063 C 2.210938 -3.914063 2.035156 -3.867188 1.910156 -3.78125 C 1.785156 -3.6875 1.683594 -3.535156 1.605469 -3.316406 Z M 3.292969 -2.292969 C 3.128906 -2.234375 2.871094 -2.171875 2.523438 -2.097656 C 2.171875 -2.023438 1.945313 -1.949219 1.839844 -1.878906 C 1.671875 -1.757813 1.585938 -1.609375 1.589844 -1.433594 C 1.585938 -1.253906 1.652344 -1.101563 1.789063 -0.972656 C 1.917969 -0.84375 2.085938 -0.777344 2.292969 -0.78125 C 2.515625 -0.777344 2.734375 -0.855469 2.945313 -1.007813 C 3.09375 -1.117188 3.195313 -1.253906 3.246094 -1.425781 C 3.277344 -1.527344 3.292969 -1.734375 3.292969 -2.039063 Z M 3.292969 -2.292969 "
-           id="path8933" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph41-6">
-        <path
-           style="stroke:none;"
-           d="M 0.675781 0 L 0.675781 -6.589844 L 3.472656 -6.589844 C 4.175781 -6.585938 4.6875 -6.527344 5.007813 -6.410156 C 5.324219 -6.292969 5.578125 -6.082031 5.773438 -5.78125 C 5.960938 -5.476563 6.058594 -5.128906 6.0625 -4.742188 C 6.058594 -4.242188 5.914063 -3.835938 5.625 -3.515625 C 5.335938 -3.195313 4.902344 -2.992188 4.324219 -2.90625 C 4.609375 -2.738281 4.84375 -2.554688 5.035156 -2.355469 C 5.21875 -2.152344 5.472656 -1.792969 5.792969 -1.285156 L 6.597656 0 L 5.007813 0 L 4.046875 -1.433594 C 3.699219 -1.945313 3.464844 -2.269531 3.34375 -2.402344 C 3.214844 -2.535156 3.082031 -2.625 2.945313 -2.675781 C 2.800781 -2.722656 2.578125 -2.746094 2.273438 -2.75 L 2.003906 -2.75 L 2.003906 0 Z M 2.003906 -3.800781 L 2.988281 -3.800781 C 3.625 -3.796875 4.023438 -3.824219 4.183594 -3.882813 C 4.339844 -3.933594 4.460938 -4.027344 4.554688 -4.160156 C 4.640625 -4.292969 4.6875 -4.457031 4.691406 -4.65625 C 4.6875 -4.875 4.628906 -5.054688 4.511719 -5.191406 C 4.394531 -5.328125 4.226563 -5.414063 4.015625 -5.453125 C 3.902344 -5.464844 3.578125 -5.472656 3.042969 -5.476563 L 2.003906 -5.476563 Z M 2.003906 -3.800781 "
-           id="path8936" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph41-7">
-        <path
-           style="stroke:none;"
-           d="M 2.851563 -4.773438 L 2.851563 -3.765625 L 1.988281 -3.765625 L 1.988281 -1.84375 C 1.984375 -1.449219 1.992188 -1.222656 2.011719 -1.160156 C 2.023438 -1.097656 2.0625 -1.042969 2.121094 -1 C 2.179688 -0.957031 2.25 -0.9375 2.335938 -0.9375 C 2.449219 -0.9375 2.617188 -0.976563 2.84375 -1.0625 L 2.953125 -0.0820313 C 2.65625 0.0429688 2.324219 0.105469 1.957031 0.109375 C 1.726563 0.105469 1.519531 0.0703125 1.339844 -0.00390625 C 1.15625 -0.078125 1.023438 -0.175781 0.9375 -0.300781 C 0.851563 -0.417969 0.789063 -0.585938 0.757813 -0.796875 C 0.726563 -0.941406 0.714844 -1.238281 0.71875 -1.6875 L 0.71875 -3.765625 L 0.140625 -3.765625 L 0.140625 -4.773438 L 0.71875 -4.773438 L 0.71875 -5.722656 L 1.988281 -6.460938 L 1.988281 -4.773438 Z M 2.851563 -4.773438 "
-           id="path8939" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph42-0">
-        <path
-           style="stroke:none;"
-           d="M 0.207031 -1.308594 L 1.425781 -1.496094 C 1.472656 -1.253906 1.578125 -1.074219 1.742188 -0.957031 C 1.898438 -0.832031 2.121094 -0.773438 2.410156 -0.773438 C 2.71875 -0.773438 2.957031 -0.828125 3.117188 -0.945313 C 3.222656 -1.023438 3.273438 -1.132813 3.277344 -1.269531 C 3.273438 -1.359375 3.246094 -1.4375 3.191406 -1.5 C 3.128906 -1.554688 2.992188 -1.605469 2.785156 -1.660156 C 1.800781 -1.871094 1.179688 -2.070313 0.921875 -2.253906 C 0.554688 -2.5 0.375 -2.84375 0.375 -3.285156 C 0.375 -3.679688 0.53125 -4.015625 0.847656 -4.289063 C 1.160156 -4.558594 1.648438 -4.691406 2.308594 -4.695313 C 2.9375 -4.691406 3.402344 -4.589844 3.710938 -4.386719 C 4.011719 -4.179688 4.222656 -3.878906 4.34375 -3.480469 L 3.195313 -3.269531 C 3.140625 -3.449219 3.046875 -3.585938 2.914063 -3.679688 C 2.773438 -3.773438 2.582031 -3.820313 2.332031 -3.824219 C 2.011719 -3.820313 1.78125 -3.773438 1.644531 -3.6875 C 1.546875 -3.621094 1.5 -3.539063 1.503906 -3.441406 C 1.5 -3.351563 1.539063 -3.28125 1.625 -3.222656 C 1.730469 -3.140625 2.109375 -3.023438 2.757813 -2.878906 C 3.402344 -2.730469 3.855469 -2.550781 4.117188 -2.339844 C 4.367188 -2.121094 4.492188 -1.820313 4.496094 -1.4375 C 4.492188 -1.011719 4.316406 -0.648438 3.96875 -0.347656 C 3.613281 -0.0429688 3.09375 0.105469 2.410156 0.105469 C 1.777344 0.105469 1.28125 -0.0195313 0.921875 -0.273438 C 0.554688 -0.527344 0.316406 -0.871094 0.207031 -1.308594 Z M 0.207031 -1.308594 "
-           id="path8942" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph42-1">
-        <path
-           style="stroke:none;"
-           d="M 2.742188 -4.59375 L 2.742188 -3.625 L 1.910156 -3.625 L 1.910156 -1.773438 C 1.910156 -1.394531 1.917969 -1.175781 1.933594 -1.117188 C 1.949219 -1.050781 1.984375 -1 2.042969 -0.960938 C 2.097656 -0.921875 2.167969 -0.902344 2.25 -0.902344 C 2.359375 -0.902344 2.519531 -0.941406 2.738281 -1.019531 L 2.839844 -0.078125 C 2.554688 0.046875 2.234375 0.105469 1.882813 0.105469 C 1.65625 0.105469 1.457031 0.0703125 1.285156 -0.00390625 C 1.105469 -0.078125 0.976563 -0.175781 0.898438 -0.292969 C 0.8125 -0.410156 0.757813 -0.566406 0.730469 -0.765625 C 0.703125 -0.902344 0.691406 -1.1875 0.691406 -1.621094 L 0.691406 -3.625 L 0.132813 -3.625 L 0.132813 -4.59375 L 0.691406 -4.59375 L 0.691406 -5.503906 L 1.910156 -6.214844 L 1.910156 -4.59375 Z M 2.742188 -4.59375 "
-           id="path8945" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph42-2">
-        <path
-           style="stroke:none;"
-           d="M 4.847656 0 L 3.71875 0 L 3.71875 -0.675781 C 3.527344 -0.410156 3.304688 -0.214844 3.054688 -0.0859375 C 2.796875 0.0429688 2.542969 0.105469 2.285156 0.105469 C 1.753906 0.105469 1.300781 -0.105469 0.925781 -0.53125 C 0.550781 -0.957031 0.363281 -1.550781 0.363281 -2.3125 C 0.363281 -3.089844 0.542969 -3.679688 0.910156 -4.085938 C 1.273438 -4.488281 1.738281 -4.691406 2.300781 -4.695313 C 2.8125 -4.691406 3.257813 -4.480469 3.632813 -4.054688 L 3.632813 -6.339844 L 4.847656 -6.339844 Z M 1.605469 -2.394531 C 1.605469 -1.902344 1.671875 -1.546875 1.808594 -1.332031 C 2 -1.011719 2.273438 -0.855469 2.628906 -0.855469 C 2.90625 -0.855469 3.144531 -0.972656 3.34375 -1.210938 C 3.535156 -1.449219 3.632813 -1.804688 3.636719 -2.277344 C 3.632813 -2.804688 3.539063 -3.183594 3.351563 -3.417969 C 3.160156 -3.644531 2.914063 -3.761719 2.621094 -3.765625 C 2.328125 -3.761719 2.085938 -3.648438 1.894531 -3.421875 C 1.699219 -3.191406 1.605469 -2.847656 1.605469 -2.394531 Z M 1.605469 -2.394531 "
-           id="path8948" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph42-3">
-        <path
-           style="stroke:none;"
-           d="M 0.871094 -3.378906 L 0.871094 -4.59375 L 2.085938 -4.59375 L 2.085938 -3.378906 Z M 0.871094 0 L 0.871094 -1.214844 L 2.085938 -1.214844 L 2.085938 0 Z M 0.871094 0 "
-           id="path8951" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph42-4">
-        <path
-           style="stroke:none;"
-           d="M 1.898438 0 L 0.046875 -4.59375 L 1.324219 -4.59375 L 2.1875 -2.25 L 2.4375 -1.464844 C 2.503906 -1.664063 2.546875 -1.792969 2.566406 -1.859375 C 2.605469 -1.984375 2.648438 -2.117188 2.695313 -2.25 L 3.566406 -4.59375 L 4.816406 -4.59375 L 2.992188 0 Z M 1.898438 0 "
-           id="path8954" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph42-5">
-        <path
-           style="stroke:none;"
-           d="M 3.296875 -1.460938 L 4.507813 -1.257813 C 4.347656 -0.8125 4.101563 -0.472656 3.769531 -0.242188 C 3.429688 -0.0078125 3.011719 0.105469 2.507813 0.105469 C 1.707031 0.105469 1.113281 -0.15625 0.734375 -0.679688 C 0.429688 -1.09375 0.28125 -1.621094 0.28125 -2.261719 C 0.28125 -3.019531 0.480469 -3.613281 0.878906 -4.046875 C 1.277344 -4.476563 1.777344 -4.691406 2.386719 -4.695313 C 3.066406 -4.691406 3.605469 -4.46875 4.003906 -4.019531 C 4.394531 -3.566406 4.585938 -2.875 4.570313 -1.945313 L 1.527344 -1.945313 C 1.53125 -1.585938 1.628906 -1.304688 1.820313 -1.105469 C 2.003906 -0.902344 2.238281 -0.800781 2.519531 -0.804688 C 2.710938 -0.800781 2.871094 -0.855469 3 -0.960938 C 3.128906 -1.0625 3.226563 -1.226563 3.296875 -1.460938 Z M 3.363281 -2.691406 C 3.355469 -3.039063 3.261719 -3.308594 3.089844 -3.492188 C 2.914063 -3.671875 2.707031 -3.761719 2.460938 -3.765625 C 2.195313 -3.761719 1.976563 -3.664063 1.804688 -3.476563 C 1.628906 -3.28125 1.542969 -3.019531 1.546875 -2.691406 Z M 3.363281 -2.691406 "
-           id="path8957" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph42-6">
-        <path
-           style="stroke:none;"
-           d="M 4.640625 -3.234375 L 3.441406 -3.019531 C 3.398438 -3.253906 3.308594 -3.433594 3.167969 -3.558594 C 3.023438 -3.675781 2.839844 -3.738281 2.613281 -3.742188 C 2.308594 -3.738281 2.066406 -3.632813 1.886719 -3.425781 C 1.707031 -3.214844 1.617188 -2.867188 1.617188 -2.378906 C 1.617188 -1.832031 1.707031 -1.445313 1.890625 -1.222656 C 2.074219 -0.996094 2.320313 -0.882813 2.628906 -0.886719 C 2.859375 -0.882813 3.046875 -0.949219 3.195313 -1.082031 C 3.339844 -1.210938 3.445313 -1.4375 3.507813 -1.761719 L 4.699219 -1.558594 C 4.574219 -1.007813 4.335938 -0.589844 3.988281 -0.3125 C 3.632813 -0.03125 3.160156 0.105469 2.574219 0.105469 C 1.898438 0.105469 1.363281 -0.105469 0.964844 -0.53125 C 0.5625 -0.953125 0.363281 -1.539063 0.367188 -2.292969 C 0.363281 -3.046875 0.5625 -3.636719 0.96875 -4.0625 C 1.367188 -4.480469 1.910156 -4.691406 2.59375 -4.695313 C 3.152344 -4.691406 3.597656 -4.570313 3.929688 -4.332031 C 4.257813 -4.089844 4.496094 -3.726563 4.640625 -3.234375 Z M 4.640625 -3.234375 "
-           id="path8960" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph42-7">
-        <path
-           style="stroke:none;"
-           d="M 0.355469 -2.359375 C 0.351563 -2.761719 0.449219 -3.152344 0.652344 -3.53125 C 0.847656 -3.910156 1.132813 -4.199219 1.5 -4.398438 C 1.863281 -4.59375 2.269531 -4.691406 2.71875 -4.695313 C 3.414063 -4.691406 3.980469 -4.46875 4.425781 -4.019531 C 4.867188 -3.566406 5.089844 -2.996094 5.09375 -2.308594 C 5.089844 -1.613281 4.867188 -1.035156 4.421875 -0.578125 C 3.972656 -0.121094 3.410156 0.105469 2.730469 0.105469 C 2.308594 0.105469 1.90625 0.0117188 1.523438 -0.179688 C 1.140625 -0.371094 0.847656 -0.652344 0.652344 -1.019531 C 0.449219 -1.386719 0.351563 -1.832031 0.355469 -2.359375 Z M 1.601563 -2.296875 C 1.597656 -1.839844 1.707031 -1.488281 1.925781 -1.25 C 2.140625 -1.003906 2.40625 -0.882813 2.722656 -0.886719 C 3.039063 -0.882813 3.304688 -1.003906 3.519531 -1.25 C 3.734375 -1.488281 3.84375 -1.839844 3.84375 -2.304688 C 3.84375 -2.75 3.734375 -3.097656 3.519531 -3.34375 C 3.304688 -3.582031 3.039063 -3.703125 2.722656 -3.707031 C 2.40625 -3.703125 2.140625 -3.582031 1.925781 -3.34375 C 1.707031 -3.097656 1.597656 -2.75 1.601563 -2.296875 Z M 1.601563 -2.296875 "
-           id="path8963" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph42-8">
-        <path
-           style="stroke:none;"
-           d="M 1.800781 0 L 0.582031 0 L 0.582031 -4.59375 L 1.710938 -4.59375 L 1.710938 -3.941406 C 1.90625 -4.246094 2.078125 -4.449219 2.234375 -4.546875 C 2.382813 -4.644531 2.558594 -4.691406 2.757813 -4.695313 C 3.035156 -4.691406 3.300781 -4.617188 3.558594 -4.46875 L 3.183594 -3.40625 C 2.972656 -3.535156 2.785156 -3.601563 2.613281 -3.605469 C 2.441406 -3.601563 2.296875 -3.554688 2.179688 -3.464844 C 2.0625 -3.367188 1.96875 -3.199219 1.902344 -2.957031 C 1.832031 -2.710938 1.796875 -2.199219 1.800781 -1.417969 Z M 1.800781 0 "
-           id="path8966" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph43-0">
-        <path
-           style="stroke:none;"
-           d="M 0.214844 -1.34375 L 1.464844 -1.535156 C 1.519531 -1.292969 1.625 -1.105469 1.789063 -0.980469 C 1.949219 -0.851563 2.179688 -0.789063 2.472656 -0.792969 C 2.792969 -0.789063 3.035156 -0.851563 3.199219 -0.972656 C 3.308594 -1.054688 3.363281 -1.164063 3.363281 -1.304688 C 3.363281 -1.394531 3.332031 -1.472656 3.277344 -1.539063 C 3.210938 -1.597656 3.074219 -1.652344 2.859375 -1.703125 C 1.851563 -1.925781 1.210938 -2.128906 0.945313 -2.3125 C 0.570313 -2.566406 0.386719 -2.921875 0.386719 -3.375 C 0.386719 -3.78125 0.546875 -4.121094 0.871094 -4.402344 C 1.191406 -4.675781 1.691406 -4.816406 2.371094 -4.820313 C 3.015625 -4.816406 3.496094 -4.710938 3.808594 -4.503906 C 4.121094 -4.292969 4.335938 -3.984375 4.457031 -3.574219 L 3.28125 -3.355469 C 3.230469 -3.539063 3.132813 -3.679688 2.996094 -3.777344 C 2.851563 -3.875 2.652344 -3.925781 2.394531 -3.925781 C 2.0625 -3.925781 1.828125 -3.878906 1.6875 -3.785156 C 1.59375 -3.71875 1.546875 -3.632813 1.546875 -3.535156 C 1.546875 -3.441406 1.585938 -3.367188 1.667969 -3.308594 C 1.78125 -3.222656 2.171875 -3.105469 2.835938 -2.957031 C 3.5 -2.804688 3.960938 -2.621094 4.226563 -2.402344 C 4.484375 -2.179688 4.617188 -1.867188 4.617188 -1.472656 C 4.617188 -1.035156 4.433594 -0.664063 4.074219 -0.359375 C 3.710938 -0.046875 3.179688 0.105469 2.472656 0.105469 C 1.828125 0.105469 1.316406 -0.0234375 0.945313 -0.285156 C 0.570313 -0.542969 0.328125 -0.894531 0.214844 -1.34375 Z M 0.214844 -1.34375 "
-           id="path8969" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph43-1">
-        <path
-           style="stroke:none;"
-           d="M 2.816406 -4.714844 L 2.816406 -3.71875 L 1.960938 -3.71875 L 1.960938 -1.820313 C 1.957031 -1.433594 1.964844 -1.207031 1.984375 -1.144531 C 2 -1.078125 2.039063 -1.027344 2.097656 -0.988281 C 2.152344 -0.945313 2.222656 -0.925781 2.308594 -0.929688 C 2.421875 -0.925781 2.585938 -0.964844 2.808594 -1.046875 L 2.917969 -0.078125 C 2.625 0.046875 2.292969 0.105469 1.929688 0.105469 C 1.703125 0.105469 1.5 0.0703125 1.320313 -0.0078125 C 1.136719 -0.078125 1.007813 -0.175781 0.925781 -0.300781 C 0.839844 -0.417969 0.78125 -0.582031 0.75 -0.785156 C 0.722656 -0.929688 0.707031 -1.222656 0.710938 -1.664063 L 0.710938 -3.71875 L 0.136719 -3.71875 L 0.136719 -4.714844 L 0.710938 -4.714844 L 0.710938 -5.652344 L 1.960938 -6.378906 L 1.960938 -4.714844 Z M 2.816406 -4.714844 "
-           id="path8972" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph43-2">
-        <path
-           style="stroke:none;"
-           d="M 4.976563 0 L 3.816406 0 L 3.816406 -0.691406 C 3.621094 -0.421875 3.394531 -0.21875 3.136719 -0.0898438 C 2.871094 0.0429688 2.609375 0.105469 2.34375 0.105469 C 1.800781 0.105469 1.335938 -0.109375 0.949219 -0.546875 C 0.5625 -0.980469 0.367188 -1.589844 0.371094 -2.375 C 0.367188 -3.171875 0.554688 -3.78125 0.933594 -4.195313 C 1.308594 -4.609375 1.785156 -4.816406 2.363281 -4.820313 C 2.886719 -4.816406 3.34375 -4.597656 3.730469 -4.164063 L 3.730469 -6.507813 L 4.976563 -6.507813 Z M 1.648438 -2.460938 C 1.644531 -1.953125 1.714844 -1.585938 1.855469 -1.367188 C 2.054688 -1.039063 2.335938 -0.878906 2.699219 -0.878906 C 2.984375 -0.878906 3.226563 -1 3.429688 -1.246094 C 3.628906 -1.488281 3.730469 -1.851563 3.734375 -2.339844 C 3.730469 -2.878906 3.632813 -3.269531 3.441406 -3.507813 C 3.242188 -3.746094 2.992188 -3.867188 2.691406 -3.867188 C 2.390625 -3.867188 2.144531 -3.746094 1.945313 -3.511719 C 1.746094 -3.273438 1.644531 -2.925781 1.648438 -2.460938 Z M 1.648438 -2.460938 "
-           id="path8975" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph43-3">
-        <path
-           style="stroke:none;"
-           d="M 0.890625 -3.46875 L 0.890625 -4.714844 L 2.140625 -4.714844 L 2.140625 -3.46875 Z M 0.890625 0 L 0.890625 -1.246094 L 2.140625 -1.246094 L 2.140625 0 Z M 0.890625 0 "
-           id="path8978" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph43-4">
-        <path
-           style="stroke:none;"
-           d="M 1.585938 -3.277344 L 0.453125 -3.480469 C 0.578125 -3.933594 0.796875 -4.269531 1.109375 -4.492188 C 1.414063 -4.707031 1.878906 -4.816406 2.496094 -4.820313 C 3.050781 -4.816406 3.460938 -4.75 3.734375 -4.621094 C 4.003906 -4.488281 4.199219 -4.320313 4.3125 -4.121094 C 4.421875 -3.914063 4.476563 -3.542969 4.480469 -3.003906 L 4.464844 -1.550781 C 4.464844 -1.132813 4.484375 -0.828125 4.523438 -0.632813 C 4.5625 -0.433594 4.636719 -0.222656 4.75 0 L 3.515625 0 C 3.484375 -0.0820313 3.441406 -0.203125 3.394531 -0.367188 C 3.371094 -0.441406 3.359375 -0.492188 3.351563 -0.515625 C 3.136719 -0.304688 2.90625 -0.148438 2.667969 -0.046875 C 2.421875 0.0546875 2.164063 0.105469 1.890625 0.105469 C 1.402344 0.105469 1.019531 -0.0234375 0.742188 -0.289063 C 0.460938 -0.546875 0.320313 -0.882813 0.324219 -1.289063 C 0.320313 -1.554688 0.386719 -1.789063 0.515625 -2 C 0.640625 -2.203125 0.816406 -2.363281 1.050781 -2.476563 C 1.277344 -2.585938 1.609375 -2.683594 2.042969 -2.769531 C 2.625 -2.875 3.027344 -2.976563 3.253906 -3.078125 L 3.253906 -3.199219 C 3.25 -3.4375 3.191406 -3.605469 3.074219 -3.710938 C 2.953125 -3.8125 2.730469 -3.867188 2.40625 -3.867188 C 2.183594 -3.867188 2.011719 -3.820313 1.886719 -3.734375 C 1.761719 -3.644531 1.660156 -3.492188 1.585938 -3.277344 Z M 3.253906 -2.265625 C 3.089844 -2.207031 2.839844 -2.140625 2.496094 -2.070313 C 2.148438 -1.992188 1.921875 -1.921875 1.816406 -1.855469 C 1.652344 -1.738281 1.570313 -1.59375 1.570313 -1.417969 C 1.570313 -1.238281 1.632813 -1.085938 1.765625 -0.960938 C 1.890625 -0.835938 2.058594 -0.773438 2.265625 -0.773438 C 2.488281 -0.773438 2.703125 -0.847656 2.90625 -0.996094 C 3.054688 -1.105469 3.152344 -1.242188 3.207031 -1.40625 C 3.234375 -1.511719 3.25 -1.714844 3.253906 -2.015625 Z M 3.253906 -2.265625 "
-           id="path8981" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph43-5">
-        <path
-           style="stroke:none;"
-           d="M 1.847656 0 L 0.597656 0 L 0.597656 -4.714844 L 1.757813 -4.714844 L 1.757813 -4.042969 C 1.953125 -4.359375 2.128906 -4.566406 2.292969 -4.667969 C 2.449219 -4.765625 2.628906 -4.816406 2.832031 -4.820313 C 3.113281 -4.816406 3.386719 -4.738281 3.652344 -4.585938 L 3.265625 -3.5 C 3.054688 -3.632813 2.859375 -3.703125 2.679688 -3.703125 C 2.5 -3.703125 2.351563 -3.652344 2.234375 -3.558594 C 2.109375 -3.457031 2.015625 -3.285156 1.949219 -3.035156 C 1.878906 -2.785156 1.84375 -2.257813 1.847656 -1.457031 Z M 1.847656 0 "
-           id="path8984" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph43-6">
-        <path
-           style="stroke:none;"
-           d="M 0.0625 -4.714844 L 1.390625 -4.714844 L 2.515625 -1.367188 L 3.617188 -4.714844 L 4.910156 -4.714844 L 3.246094 -0.175781 L 2.949219 0.644531 C 2.835938 0.917969 2.730469 1.125 2.632813 1.273438 C 2.53125 1.414063 2.417969 1.53125 2.292969 1.625 C 2.160156 1.710938 2.003906 1.78125 1.816406 1.835938 C 1.628906 1.882813 1.414063 1.910156 1.179688 1.914063 C 0.9375 1.910156 0.703125 1.886719 0.476563 1.839844 L 0.363281 0.859375 C 0.558594 0.894531 0.734375 0.914063 0.890625 0.917969 C 1.179688 0.914063 1.394531 0.828125 1.535156 0.664063 C 1.675781 0.492188 1.78125 0.277344 1.855469 0.0117188 Z M 0.0625 -4.714844 "
-           id="path8987" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph44-0">
-        <path
-           style="stroke:none;"
-           d="M 0.613281 0 L 0.613281 -6.035156 L 5.085938 -6.035156 L 5.085938 -5.011719 L 1.832031 -5.011719 L 1.832031 -3.675781 L 4.859375 -3.675781 L 4.859375 -2.660156 L 1.832031 -2.660156 L 1.832031 -1.015625 L 5.203125 -1.015625 L 5.203125 0 Z M 0.613281 0 "
-           id="path8990" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph44-1">
-        <path
-           style="stroke:none;"
-           d="M 0.0507813 0 L 1.625 -2.25 L 0.117188 -4.371094 L 1.527344 -4.371094 L 2.300781 -3.167969 L 3.117188 -4.371094 L 4.472656 -4.371094 L 2.992188 -2.300781 L 4.609375 0 L 3.191406 0 L 2.300781 -1.355469 L 1.402344 0 Z M 0.0507813 0 "
-           id="path8993" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph44-2">
-        <path
-           style="stroke:none;"
-           d="M 2.609375 -4.371094 L 2.609375 -3.449219 L 1.820313 -3.449219 L 1.820313 -1.6875 C 1.816406 -1.328125 1.824219 -1.117188 1.839844 -1.0625 C 1.855469 -1 1.886719 -0.953125 1.941406 -0.917969 C 1.992188 -0.875 2.058594 -0.855469 2.140625 -0.859375 C 2.242188 -0.855469 2.398438 -0.894531 2.605469 -0.972656 L 2.703125 -0.0742188 C 2.429688 0.0390625 2.125 0.09375 1.789063 0.0976563 C 1.578125 0.09375 1.390625 0.0625 1.226563 -0.00390625 C 1.054688 -0.0703125 0.933594 -0.160156 0.859375 -0.277344 C 0.777344 -0.386719 0.722656 -0.539063 0.695313 -0.726563 C 0.667969 -0.859375 0.65625 -1.128906 0.660156 -1.542969 L 0.660156 -3.449219 L 0.128906 -3.449219 L 0.128906 -4.371094 L 0.660156 -4.371094 L 0.660156 -5.238281 L 1.820313 -5.914063 L 1.820313 -4.371094 Z M 2.609375 -4.371094 "
-           id="path8996" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph44-3">
-        <path
-           style="stroke:none;"
-           d="M 3.136719 -1.390625 L 4.289063 -1.199219 C 4.140625 -0.773438 3.90625 -0.449219 3.585938 -0.230469 C 3.265625 -0.0117188 2.863281 0.09375 2.386719 0.0976563 C 1.621094 0.09375 1.058594 -0.152344 0.699219 -0.644531 C 0.40625 -1.042969 0.261719 -1.542969 0.265625 -2.152344 C 0.261719 -2.871094 0.453125 -3.4375 0.832031 -3.851563 C 1.210938 -4.257813 1.691406 -4.464844 2.273438 -4.46875 C 2.921875 -4.464844 3.433594 -4.25 3.808594 -3.824219 C 4.183594 -3.390625 4.363281 -2.734375 4.351563 -1.851563 L 1.453125 -1.851563 C 1.460938 -1.507813 1.550781 -1.238281 1.730469 -1.050781 C 1.90625 -0.855469 2.128906 -0.761719 2.398438 -0.765625 C 2.578125 -0.761719 2.730469 -0.8125 2.855469 -0.914063 C 2.976563 -1.011719 3.070313 -1.167969 3.136719 -1.390625 Z M 3.203125 -2.558594 C 3.191406 -2.890625 3.105469 -3.144531 2.941406 -3.324219 C 2.777344 -3.496094 2.574219 -3.585938 2.339844 -3.585938 C 2.085938 -3.585938 1.878906 -3.492188 1.714844 -3.308594 C 1.546875 -3.121094 1.464844 -2.871094 1.472656 -2.558594 Z M 3.203125 -2.558594 "
-           id="path8999" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph44-4">
-        <path
-           style="stroke:none;"
-           d="M 4.582031 0 L 3.425781 0 L 3.425781 -2.230469 C 3.421875 -2.699219 3.398438 -3.003906 3.351563 -3.144531 C 3.300781 -3.28125 3.21875 -3.390625 3.109375 -3.46875 C 2.996094 -3.546875 2.863281 -3.585938 2.707031 -3.585938 C 2.503906 -3.585938 2.324219 -3.527344 2.167969 -3.417969 C 2.007813 -3.304688 1.898438 -3.160156 1.839844 -2.980469 C 1.78125 -2.796875 1.75 -2.464844 1.753906 -1.980469 L 1.753906 0 L 0.597656 0 L 0.597656 -4.371094 L 1.671875 -4.371094 L 1.671875 -3.730469 C 2.046875 -4.21875 2.527344 -4.464844 3.113281 -4.46875 C 3.363281 -4.464844 3.597656 -4.417969 3.808594 -4.332031 C 4.019531 -4.238281 4.179688 -4.121094 4.289063 -3.980469 C 4.398438 -3.832031 4.472656 -3.671875 4.515625 -3.492188 C 4.558594 -3.3125 4.582031 -3.050781 4.582031 -2.714844 Z M 4.582031 0 "
-           id="path9002" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph45-0">
-        <path
-           style="stroke:none;"
-           d="M 0.671875 0 L 0.671875 -6.609375 L 5.574219 -6.609375 L 5.574219 -5.492188 L 2.007813 -5.492188 L 2.007813 -4.027344 L 5.324219 -4.027344 L 5.324219 -2.914063 L 2.007813 -2.914063 L 2.007813 -1.113281 L 5.699219 -1.113281 L 5.699219 0 Z M 0.671875 0 "
-           id="path9005" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph45-1">
-        <path
-           style="stroke:none;"
-           d="M 1.980469 0 L 0.0507813 -4.789063 L 1.378906 -4.789063 L 2.28125 -2.34375 L 2.542969 -1.527344 C 2.613281 -1.734375 2.65625 -1.871094 2.675781 -1.9375 C 2.714844 -2.070313 2.757813 -2.207031 2.808594 -2.34375 L 3.71875 -4.789063 L 5.023438 -4.789063 L 3.121094 0 Z M 1.980469 0 "
-           id="path9008" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph45-2">
-        <path
-           style="stroke:none;"
-           d="M 3.4375 -1.523438 L 4.699219 -1.3125 C 4.535156 -0.847656 4.277344 -0.492188 3.929688 -0.253906 C 3.578125 -0.0117188 3.140625 0.105469 2.617188 0.109375 C 1.78125 0.105469 1.164063 -0.164063 0.765625 -0.707031 C 0.449219 -1.140625 0.292969 -1.691406 0.292969 -2.359375 C 0.292969 -3.152344 0.5 -3.773438 0.914063 -4.222656 C 1.328125 -4.671875 1.851563 -4.898438 2.488281 -4.898438 C 3.199219 -4.898438 3.761719 -4.660156 4.175781 -4.191406 C 4.585938 -3.714844 4.78125 -2.996094 4.765625 -2.03125 L 1.59375 -2.03125 C 1.601563 -1.652344 1.703125 -1.359375 1.898438 -1.152344 C 2.09375 -0.941406 2.335938 -0.835938 2.628906 -0.839844 C 2.824219 -0.835938 2.992188 -0.890625 3.128906 -1 C 3.261719 -1.105469 3.363281 -1.28125 3.4375 -1.523438 Z M 3.507813 -2.804688 C 3.496094 -3.167969 3.402344 -3.445313 3.222656 -3.640625 C 3.042969 -3.828125 2.824219 -3.925781 2.566406 -3.929688 C 2.289063 -3.925781 2.058594 -3.824219 1.878906 -3.625 C 1.695313 -3.421875 1.605469 -3.148438 1.613281 -2.804688 Z M 3.507813 -2.804688 "
-           id="path9011" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph45-3">
-        <path
-           style="stroke:none;"
-           d="M 5.019531 0 L 3.753906 0 L 3.753906 -2.445313 C 3.75 -2.957031 3.722656 -3.292969 3.671875 -3.445313 C 3.613281 -3.597656 3.523438 -3.714844 3.40625 -3.800781 C 3.28125 -3.882813 3.136719 -3.925781 2.96875 -3.929688 C 2.746094 -3.925781 2.546875 -3.863281 2.375 -3.746094 C 2.195313 -3.621094 2.078125 -3.464844 2.015625 -3.269531 C 1.953125 -3.070313 1.921875 -2.703125 1.921875 -2.167969 L 1.921875 0 L 0.652344 0 L 0.652344 -4.789063 L 1.832031 -4.789063 L 1.832031 -4.085938 C 2.246094 -4.625 2.773438 -4.898438 3.410156 -4.898438 C 3.6875 -4.898438 3.941406 -4.847656 4.175781 -4.746094 C 4.40625 -4.644531 4.582031 -4.515625 4.699219 -4.359375 C 4.816406 -4.203125 4.898438 -4.023438 4.949219 -3.828125 C 4.992188 -3.625 5.015625 -3.34375 5.019531 -2.976563 Z M 5.019531 0 "
-           id="path9014" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph45-4">
-        <path
-           style="stroke:none;"
-           d="M 2.859375 -4.789063 L 2.859375 -3.777344 L 1.992188 -3.777344 L 1.992188 -1.847656 C 1.992188 -1.453125 2 -1.226563 2.015625 -1.164063 C 2.03125 -1.097656 2.066406 -1.042969 2.128906 -1.003906 C 2.183594 -0.957031 2.257813 -0.9375 2.34375 -0.941406 C 2.460938 -0.9375 2.628906 -0.976563 2.855469 -1.0625 L 2.960938 -0.0820313 C 2.664063 0.0429688 2.332031 0.105469 1.960938 0.109375 C 1.730469 0.105469 1.523438 0.0703125 1.34375 -0.00390625 C 1.15625 -0.078125 1.023438 -0.179688 0.941406 -0.304688 C 0.851563 -0.425781 0.792969 -0.589844 0.761719 -0.796875 C 0.734375 -0.945313 0.71875 -1.242188 0.722656 -1.691406 L 0.722656 -3.777344 L 0.140625 -3.777344 L 0.140625 -4.789063 L 0.722656 -4.789063 L 0.722656 -5.742188 L 1.992188 -6.480469 L 1.992188 -4.789063 Z M 2.859375 -4.789063 "
-           id="path9017" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph45-5">
-        <path
-           style="stroke:none;"
-           d="M 4.902344 -2.429688 L 6.195313 -2.019531 C 5.992188 -1.296875 5.664063 -0.761719 5.207031 -0.410156 C 4.742188 -0.0625 4.15625 0.109375 3.449219 0.113281 C 2.570313 0.109375 1.847656 -0.1875 1.285156 -0.785156 C 0.714844 -1.386719 0.433594 -2.207031 0.4375 -3.246094 C 0.433594 -4.347656 0.71875 -5.199219 1.289063 -5.808594 C 1.855469 -6.414063 2.601563 -6.71875 3.53125 -6.722656 C 4.335938 -6.71875 4.992188 -6.480469 5.5 -6.007813 C 5.800781 -5.71875 6.027344 -5.3125 6.179688 -4.789063 L 4.855469 -4.472656 C 4.773438 -4.8125 4.609375 -5.085938 4.367188 -5.285156 C 4.117188 -5.480469 3.816406 -5.578125 3.464844 -5.582031 C 2.96875 -5.578125 2.570313 -5.402344 2.269531 -5.054688 C 1.960938 -4.699219 1.808594 -4.128906 1.8125 -3.347656 C 1.808594 -2.507813 1.960938 -1.914063 2.261719 -1.558594 C 2.5625 -1.203125 2.953125 -1.023438 3.4375 -1.027344 C 3.789063 -1.023438 4.09375 -1.136719 4.351563 -1.363281 C 4.605469 -1.585938 4.789063 -1.941406 4.902344 -2.429688 Z M 4.902344 -2.429688 "
-           id="path9020" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph45-6">
-        <path
-           style="stroke:none;"
-           d="M 3.816406 0 L 3.816406 -0.71875 C 3.636719 -0.457031 3.410156 -0.253906 3.128906 -0.109375 C 2.84375 0.0351563 2.542969 0.105469 2.226563 0.109375 C 1.90625 0.105469 1.617188 0.0390625 1.363281 -0.101563 C 1.105469 -0.242188 0.921875 -0.441406 0.808594 -0.699219 C 0.691406 -0.949219 0.632813 -1.304688 0.636719 -1.757813 L 0.636719 -4.789063 L 1.902344 -4.789063 L 1.902344 -2.589844 C 1.898438 -1.914063 1.921875 -1.5 1.972656 -1.347656 C 2.015625 -1.195313 2.101563 -1.074219 2.226563 -0.988281 C 2.351563 -0.898438 2.507813 -0.855469 2.695313 -0.855469 C 2.910156 -0.855469 3.101563 -0.914063 3.277344 -1.035156 C 3.445313 -1.152344 3.5625 -1.300781 3.628906 -1.476563 C 3.691406 -1.652344 3.726563 -2.082031 3.726563 -2.769531 L 3.726563 -4.789063 L 4.992188 -4.789063 L 4.992188 0 Z M 3.816406 0 "
-           id="path9023" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph45-7">
-        <path
-           style="stroke:none;"
-           d="M 5.054688 0 L 3.878906 0 L 3.878906 -0.703125 C 3.683594 -0.429688 3.449219 -0.222656 3.183594 -0.0898438 C 2.914063 0.0429688 2.648438 0.105469 2.382813 0.109375 C 1.828125 0.105469 1.355469 -0.113281 0.964844 -0.554688 C 0.570313 -0.996094 0.375 -1.617188 0.378906 -2.414063 C 0.375 -3.222656 0.566406 -3.835938 0.949219 -4.261719 C 1.332031 -4.683594 1.8125 -4.898438 2.398438 -4.898438 C 2.929688 -4.898438 3.394531 -4.675781 3.789063 -4.230469 L 3.789063 -6.609375 L 5.054688 -6.609375 Z M 1.671875 -2.5 C 1.667969 -1.984375 1.742188 -1.613281 1.886719 -1.390625 C 2.089844 -1.054688 2.375 -0.890625 2.742188 -0.894531 C 3.03125 -0.890625 3.277344 -1.015625 3.484375 -1.265625 C 3.6875 -1.511719 3.789063 -1.878906 3.792969 -2.375 C 3.789063 -2.921875 3.691406 -3.320313 3.496094 -3.566406 C 3.296875 -3.804688 3.042969 -3.925781 2.734375 -3.929688 C 2.429688 -3.925781 2.175781 -3.804688 1.976563 -3.570313 C 1.769531 -3.328125 1.667969 -2.972656 1.671875 -2.5 Z M 1.671875 -2.5 "
-           id="path9026" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph45-8">
-        <path
-           style="stroke:none;"
-           d="M 1.609375 -3.328125 L 0.460938 -3.535156 C 0.585938 -3.996094 0.808594 -4.339844 1.125 -4.5625 C 1.441406 -4.785156 1.910156 -4.898438 2.535156 -4.898438 C 3.097656 -4.898438 3.519531 -4.828125 3.796875 -4.695313 C 4.070313 -4.558594 4.261719 -4.390625 4.378906 -4.1875 C 4.488281 -3.980469 4.546875 -3.601563 4.550781 -3.054688 L 4.535156 -1.574219 C 4.535156 -1.148438 4.554688 -0.835938 4.597656 -0.640625 C 4.636719 -0.4375 4.710938 -0.226563 4.824219 0 L 3.570313 0 C 3.535156 -0.0820313 3.496094 -0.207031 3.449219 -0.375 C 3.425781 -0.449219 3.410156 -0.496094 3.40625 -0.523438 C 3.183594 -0.308594 2.953125 -0.152344 2.707031 -0.046875 C 2.460938 0.0546875 2.199219 0.105469 1.921875 0.109375 C 1.425781 0.105469 1.035156 -0.0273438 0.753906 -0.292969 C 0.46875 -0.558594 0.328125 -0.894531 0.328125 -1.308594 C 0.328125 -1.574219 0.390625 -1.816406 0.523438 -2.03125 C 0.648438 -2.238281 0.832031 -2.398438 1.066406 -2.515625 C 1.296875 -2.625 1.632813 -2.726563 2.074219 -2.8125 C 2.664063 -2.921875 3.074219 -3.023438 3.304688 -3.125 L 3.304688 -3.25 C 3.304688 -3.492188 3.242188 -3.667969 3.125 -3.773438 C 3 -3.875 2.773438 -3.925781 2.445313 -3.929688 C 2.214844 -3.925781 2.039063 -3.878906 1.914063 -3.792969 C 1.785156 -3.699219 1.683594 -3.546875 1.609375 -3.328125 Z M 3.304688 -2.300781 C 3.140625 -2.242188 2.882813 -2.175781 2.535156 -2.105469 C 2.179688 -2.027344 1.949219 -1.957031 1.84375 -1.886719 C 1.675781 -1.769531 1.59375 -1.617188 1.597656 -1.4375 C 1.59375 -1.257813 1.660156 -1.105469 1.792969 -0.976563 C 1.925781 -0.847656 2.09375 -0.785156 2.300781 -0.785156 C 2.527344 -0.785156 2.746094 -0.859375 2.953125 -1.011719 C 3.105469 -1.121094 3.207031 -1.261719 3.257813 -1.429688 C 3.289063 -1.539063 3.304688 -1.742188 3.304688 -2.046875 Z M 3.304688 -2.300781 "
-           id="path9029" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph45-9">
-        <path
-           style="stroke:none;"
-           d="M 0.675781 0 L 0.675781 -6.609375 L 3.484375 -6.609375 C 4.191406 -6.609375 4.707031 -6.546875 5.027344 -6.429688 C 5.347656 -6.308594 5.601563 -6.097656 5.792969 -5.796875 C 5.984375 -5.492188 6.082031 -5.148438 6.082031 -4.757813 C 6.082031 -4.257813 5.933594 -3.847656 5.644531 -3.527344 C 5.347656 -3.203125 4.914063 -3 4.339844 -2.917969 C 4.625 -2.746094 4.863281 -2.558594 5.050781 -2.363281 C 5.238281 -2.160156 5.492188 -1.804688 5.8125 -1.289063 L 6.621094 0 L 5.023438 0 L 4.058594 -1.4375 C 3.710938 -1.949219 3.476563 -2.273438 3.351563 -2.410156 C 3.226563 -2.539063 3.09375 -2.632813 2.953125 -2.683594 C 2.8125 -2.734375 2.585938 -2.757813 2.28125 -2.761719 L 2.011719 -2.761719 L 2.011719 0 Z M 2.011719 -3.816406 L 3 -3.816406 C 3.640625 -3.8125 4.039063 -3.839844 4.199219 -3.894531 C 4.355469 -3.945313 4.480469 -4.039063 4.570313 -4.175781 C 4.660156 -4.304688 4.703125 -4.46875 4.707031 -4.671875 C 4.703125 -4.890625 4.644531 -5.070313 4.527344 -5.207031 C 4.40625 -5.34375 4.238281 -5.429688 4.027344 -5.46875 C 3.914063 -5.480469 3.589844 -5.488281 3.054688 -5.492188 L 2.011719 -5.492188 Z M 2.011719 -3.816406 "
-           id="path9032" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph46-0">
-        <path
-           style="stroke:none;"
-           d="M 0.613281 -6.082031 L 2.859375 -6.082031 C 3.363281 -6.082031 3.746094 -6.042969 4.015625 -5.964844 C 4.367188 -5.855469 4.671875 -5.667969 4.929688 -5.402344 C 5.179688 -5.132813 5.375 -4.804688 5.511719 -4.417969 C 5.640625 -4.027344 5.707031 -3.546875 5.710938 -2.980469 C 5.707031 -2.476563 5.644531 -2.046875 5.523438 -1.6875 C 5.371094 -1.242188 5.15625 -0.882813 4.875 -0.613281 C 4.660156 -0.40625 4.371094 -0.246094 4.011719 -0.128906 C 3.738281 -0.0429688 3.375 0 2.925781 0 L 0.613281 0 Z M 1.839844 -5.050781 L 1.839844 -1.023438 L 2.757813 -1.023438 C 3.097656 -1.019531 3.34375 -1.039063 3.5 -1.082031 C 3.695313 -1.128906 3.863281 -1.214844 3.996094 -1.335938 C 4.125 -1.453125 4.230469 -1.648438 4.316406 -1.921875 C 4.398438 -2.195313 4.441406 -2.566406 4.441406 -3.035156 C 4.441406 -3.503906 4.398438 -3.863281 4.316406 -4.117188 C 4.230469 -4.367188 4.113281 -4.5625 3.96875 -4.707031 C 3.816406 -4.84375 3.628906 -4.941406 3.402344 -4.992188 C 3.226563 -5.03125 2.890625 -5.050781 2.394531 -5.050781 Z M 1.839844 -5.050781 "
-           id="path9035" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph46-1">
-        <path
-           style="stroke:none;"
-           d="M 0.609375 -5 L 0.609375 -6.082031 L 1.773438 -6.082031 L 1.773438 -5 Z M 0.609375 0 L 0.609375 -4.40625 L 1.773438 -4.40625 L 1.773438 0 Z M 0.609375 0 "
-           id="path9038" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph46-2">
-        <path
-           style="stroke:none;"
-           d="M 0.523438 -4.40625 L 1.597656 -4.40625 L 1.597656 -3.804688 C 1.976563 -4.265625 2.433594 -4.5 2.96875 -4.503906 C 3.25 -4.5 3.496094 -4.441406 3.703125 -4.328125 C 3.910156 -4.210938 4.078125 -4.035156 4.214844 -3.804688 C 4.402344 -4.035156 4.613281 -4.210938 4.839844 -4.328125 C 5.0625 -4.441406 5.300781 -4.5 5.558594 -4.503906 C 5.878906 -4.5 6.152344 -4.433594 6.378906 -4.304688 C 6.601563 -4.171875 6.769531 -3.980469 6.878906 -3.730469 C 6.957031 -3.539063 6.996094 -3.234375 7 -2.816406 L 7 0 L 5.835938 0 L 5.835938 -2.515625 C 5.835938 -2.949219 5.792969 -3.234375 5.714844 -3.363281 C 5.605469 -3.527344 5.441406 -3.609375 5.21875 -3.613281 C 5.050781 -3.609375 4.898438 -3.558594 4.757813 -3.460938 C 4.613281 -3.359375 4.507813 -3.214844 4.445313 -3.023438 C 4.378906 -2.832031 4.347656 -2.527344 4.351563 -2.113281 L 4.351563 0 L 3.183594 0 L 3.183594 -2.414063 C 3.179688 -2.84375 3.160156 -3.117188 3.121094 -3.242188 C 3.078125 -3.363281 3.015625 -3.457031 2.929688 -3.519531 C 2.84375 -3.578125 2.722656 -3.609375 2.574219 -3.613281 C 2.390625 -3.609375 2.230469 -3.558594 2.089844 -3.464844 C 1.945313 -3.363281 1.84375 -3.226563 1.78125 -3.046875 C 1.71875 -2.863281 1.6875 -2.558594 1.6875 -2.140625 L 1.6875 0 L 0.523438 0 Z M 0.523438 -4.40625 "
-           id="path9041" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph47-0">
-        <path
-           style="stroke:none;"
-           d="M 0.308594 -1.980469 L 1.503906 -2.097656 C 1.570313 -1.695313 1.714844 -1.398438 1.941406 -1.210938 C 2.160156 -1.019531 2.457031 -0.925781 2.832031 -0.929688 C 3.222656 -0.925781 3.523438 -1.007813 3.726563 -1.179688 C 3.925781 -1.34375 4.023438 -1.542969 4.027344 -1.769531 C 4.023438 -1.910156 3.980469 -2.03125 3.898438 -2.136719 C 3.8125 -2.234375 3.667969 -2.320313 3.460938 -2.398438 C 3.3125 -2.445313 2.984375 -2.535156 2.476563 -2.667969 C 1.816406 -2.824219 1.355469 -3.027344 1.089844 -3.269531 C 0.714844 -3.601563 0.527344 -4.007813 0.527344 -4.492188 C 0.527344 -4.800781 0.613281 -5.089844 0.789063 -5.363281 C 0.964844 -5.628906 1.21875 -5.835938 1.550781 -5.980469 C 1.878906 -6.117188 2.277344 -6.1875 2.75 -6.191406 C 3.507813 -6.1875 4.082031 -6.019531 4.472656 -5.6875 C 4.855469 -5.351563 5.058594 -4.90625 5.082031 -4.347656 L 3.855469 -4.292969 C 3.800781 -4.601563 3.683594 -4.828125 3.511719 -4.96875 C 3.335938 -5.105469 3.078125 -5.171875 2.738281 -5.175781 C 2.375 -5.171875 2.097656 -5.097656 1.898438 -4.953125 C 1.765625 -4.855469 1.699219 -4.730469 1.703125 -4.578125 C 1.699219 -4.429688 1.761719 -4.308594 1.886719 -4.207031 C 2.039063 -4.074219 2.414063 -3.9375 3.011719 -3.800781 C 3.609375 -3.65625 4.050781 -3.511719 4.339844 -3.363281 C 4.625 -3.210938 4.851563 -3.003906 5.015625 -2.742188 C 5.175781 -2.480469 5.253906 -2.15625 5.257813 -1.773438 C 5.253906 -1.421875 5.15625 -1.09375 4.964844 -0.792969 C 4.769531 -0.484375 4.496094 -0.257813 4.144531 -0.113281 C 3.785156 0.0351563 3.34375 0.105469 2.820313 0.109375 C 2.046875 0.105469 1.457031 -0.0703125 1.046875 -0.425781 C 0.632813 -0.78125 0.386719 -1.300781 0.308594 -1.980469 Z M 0.308594 -1.980469 "
-           id="path9044" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph47-1">
-        <path
-           style="stroke:none;"
-           d="M 0.609375 -5.007813 L 0.609375 -6.089844 L 1.777344 -6.089844 L 1.777344 -5.007813 Z M 0.609375 0 L 0.609375 -4.410156 L 1.777344 -4.410156 L 1.777344 0 Z M 0.609375 0 "
-           id="path9047" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph47-2">
-        <path
-           style="stroke:none;"
-           d="M 0.140625 0 L 0.140625 -0.910156 L 1.792969 -2.808594 C 2.0625 -3.113281 2.261719 -3.335938 2.394531 -3.46875 C 2.257813 -3.460938 2.078125 -3.453125 1.859375 -3.449219 L 0.304688 -3.441406 L 0.304688 -4.410156 L 3.949219 -4.410156 L 3.949219 -3.582031 L 2.261719 -1.640625 L 1.667969 -0.996094 C 1.992188 -1.015625 2.195313 -1.023438 2.273438 -1.027344 L 4.078125 -1.027344 L 4.078125 0 Z M 0.140625 0 "
-           id="path9050" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph47-3">
-        <path
-           style="stroke:none;"
-           d="M 3.164063 -1.402344 L 4.328125 -1.207031 C 4.175781 -0.78125 3.9375 -0.457031 3.617188 -0.234375 C 3.292969 -0.0117188 2.890625 0.0976563 2.410156 0.101563 C 1.636719 0.0976563 1.070313 -0.148438 0.707031 -0.652344 C 0.414063 -1.050781 0.269531 -1.558594 0.269531 -2.171875 C 0.269531 -2.902344 0.460938 -3.476563 0.84375 -3.890625 C 1.226563 -4.304688 1.707031 -4.511719 2.292969 -4.511719 C 2.945313 -4.511719 3.464844 -4.292969 3.84375 -3.859375 C 4.222656 -3.425781 4.402344 -2.761719 4.390625 -1.867188 L 1.464844 -1.867188 C 1.472656 -1.519531 1.566406 -1.25 1.746094 -1.0625 C 1.925781 -0.867188 2.148438 -0.773438 2.421875 -0.773438 C 2.601563 -0.773438 2.757813 -0.820313 2.882813 -0.921875 C 3.007813 -1.015625 3.101563 -1.175781 3.164063 -1.402344 Z M 3.230469 -2.582031 C 3.21875 -2.917969 3.132813 -3.175781 2.96875 -3.351563 C 2.800781 -3.527344 2.597656 -3.613281 2.363281 -3.617188 C 2.105469 -3.613281 1.894531 -3.523438 1.730469 -3.339844 C 1.5625 -3.152344 1.484375 -2.898438 1.488281 -2.582031 Z M 3.230469 -2.582031 "
-           id="path9053" />
-      </symbol>
-    </g>
-    <image
-       id="image5"
-       width="89"
-       height="65"
-       xlink:href="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAFkAAABBCAYAAAC6jghZAAAAAXNSR0IArs4c6QAAAARnQU1BAACxjwv8YQUAAAAJcEhZcwAADsMAAA7DAcdvqGQAAAFrSURBVHhe7dy5SgVRFAXR5zwgDoGBmjggBo4o6v//mbsabBQ08D7cUV1YSQ9J9aHDs/hyVmItNrQ0OtLz2+HiVuzFQRxpGP3oSE+6TofiXOCB0ziPSw2jHx3pSddpoqlNeW5cx13caxj96EhPuk7TvB6MOF/gNp7jRcPoR0d60pW+04+a0WbU+RI/vai/oSM96UpfI/8DIxcYucDIBUYuMHKBkQuMXGDkAiMXGLnAyAVGLjBygZELjFxg5AIjFxi5wMgFRi4wcoGRC4xcYOQCIxcYucDIBUYuMHKBkQuMXGDkAiMXGLnAyAVGLjBygZELjFxg5AIjFxi5wMgFRi4wcoGRC4xcYOQCIxcYucDIBUYuMHLBr5Gv4inetTQ60nOOzHK4w7iIh3iNNw2jHx3pSddp+d7nGsmzuAkeeNQw+tGRnvMaSRZ2bsZ+HMdJsGdSY+hHR3rSdV7xuxr8O7ZjJ3Y1jH50pGe6LhYfpJcUin/miZoAAAAASUVORK5CYII=" />
-    <pattern
-       id="pattern0"
-       patternUnits="userSpaceOnUse"
-       width="89"
-       height="65"
-       patternTransform="matrix(0.70875,0,0,0.70875,130.056335,14.006742)">
-      <use
-         xlink:href="#image5"
-         id="use9059" />
-    </pattern>
-    <image
-       id="image8"
-       width="90"
-       height="66"
-       xlink:href="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAFoAAABCCAYAAADXLcH0AAAAAXNSR0IArs4c6QAAAARnQU1BAACxjwv8YQUAAAAJcEhZcwAADsMAAA7DAcdvqGQAAAGKSURBVHhe7dxZK4ZhFEbhzzwkUynDidmBmZD8///lXm8RpejFfbR2XYmPk2X3HO7Jh5mKmZjTn6AlTT8NP1yIlViLDf3KetByPqZjGKoTmV/Yjf041GhHcRB7sRps9zBsM/WJfBIXcanRbuIqTmMrWOJhZoPngk0+j7u412iP8RBE34mlGIbV5tlg7fnwqz/Wzz0Fsa+DV2I5hjH03zJ0iaFLDF1i6BJDlxi6xNAlhi4xdImhSwxdYugSQ5cYusTQJYYuMXSJoUsMXWLoEkOXGLrE0CWGLjF0iaFLDF1i6BJDlxi6xNAlhi4xdImhSwxdYugSQ5cYusTQJYYuMXSJoUsMXWLoEkOXGLrE0CWGLjF0iaFLDF1i6BJDl3wbmguEHMfjFzXeSzwHLT+F5sAgtzQ588gFQo7j8R/ROETmKxcxt2Mxhnk7mcktzbMgNmuvcW6DJ/g4NoNDsMNwBJZvOFjKLU3OPLLyGo9NJjLnMlnk9+H0Lm81a86HvCsaj4Ys75c3pJ1/m8nkFS3LHh6rv6Y1AAAAAElFTkSuQmCC" />
-    <pattern
-       id="pattern1"
-       patternUnits="userSpaceOnUse"
-       width="90"
-       height="66"
-       patternTransform="matrix(0.70875,0,0,0.70875,13.820911,13.997812)">
-      <use
-         xlink:href="#image8"
-         id="use9063" />
-    </pattern>
-    <image
-       id="image11"
-       width="66"
-       height="66"
-       xlink:href="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAEIAAABCCAYAAADjVADoAAAAAXNSR0IArs4c6QAAAARnQU1BAACxjwv8YQUAAAAJcEhZcwAADsMAAA7DAcdvqGQAAAEWSURBVHhe7dw5TsVAAARRswuEWAICIGERJKwCAfe/Gd2jCozECebXkzr5S+DSOLSXla1sJ9vbkPVae81/9MOD7Dg7zc4n31nWa93PtrOhVRqhP7jKbrK7iXef3WbX2UnW0zH0NLROIzxkz9nLxHvPXrPH7CLrIRh2s94OPQlP2Uf2OfG+s6+sUS6zw2zo0eht0WPTL//780z7yRrjLetdcJQNhoAhYAgYAoaAIWAIGAKGgCFgCBgChoAhYAgYAoaAIWAIGAKGgCFgCBgChoAhYAgYAoaAIWAIGAKGgCFgCBgChoAhYAgYAoaAIWAIGAKGgCFgCBgChoAhYAgYAoaAIWAI+AAsfCQaPiS/4msTVjb4RRrL8gtFQfl7BWWr8AAAAABJRU5ErkJggg==" />
-    <pattern
-       id="pattern2"
-       patternUnits="userSpaceOnUse"
-       width="66"
-       height="66"
-       patternTransform="matrix(0.70875,0,0,0.70875,80.443832,13.820624)">
-      <use
-         xlink:href="#image11"
-         id="use9067" />
-    </pattern>
-    <image
-       id="image14"
-       width="523"
-       height="65"
-       xlink:href="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAgsAAABBCAYAAABfA+ftAAAAAXNSR0IArs4c6QAAAARnQU1BAACxjwv8YQUAAAAJcEhZcwAADsMAAA7DAcdvqGQAAAH7SURBVHhe7do7bhVBFEXRftiAbVl8AgfYiT9CBHwFMsx/Zr6nA8ST8JlArSutpLo637rd2z9zGCfjJQCwtPRAuuBocvh6XI634z0AsKR0QHogXZA+2CflkINcuB634x4AWFI6ID2QLkgf7BuGVEMKIg8+ji/jKwCwpHRAeiBdkD7YtwunIyuHlMTn8XP8AgCWlA5ID6QL0gfphP1HhqwasnpIUfzvRQBgHemBdEH6IJ0gFgCAI2IBAKjEAgBQiQUAoBILAEAlFgCASiwAAJVYAAAqsQAAVGIBAKjEAgBQiQUAoBILAEAlFgCASiwAAJVYAAAqsQAAVGIBAKjEAgBQiQUAoBILAEAlFgCASiwAAJVYAAAqsQAAVGIBAKjEAgBQiQUAoBILAEAlFgCASiwAAJVYAAAqsQAAVGIBAKjEAgBQiQUAoBILAEAlFgCASiwAAJVYAAAqsQAAVGIBAKjEAgBQiQUAoBILAEAlFgCASiwAAJVYAAAqsQAAVGIBAKjEAgBQiQUAoBILAEAlFgCA6tlYeBg/xh8AYGnpgXTB31g4He/G3fg2HsdvAGBJ6YD0QLogfZBO2E7G5bgZn0YufAcAlpQOSA+kC9IH6YTtMF6NN+NqfBjXAMCS0gHpgXRB+iCdsM+LkW8SZ+N8XAAAS0oHpAfSBdMH2/YErIfwQJlbMN4AAAAASUVORK5CYII=" />
-    <pattern
-       id="pattern3"
-       patternUnits="userSpaceOnUse"
-       width="523"
-       height="65"
-       patternTransform="matrix(0.70875,0,0,0.70875,193.684357,14.006742)">
-      <use
-         xlink:href="#image14"
-         id="use9071" />
-    </pattern>
-    <linearGradient
-       id="linear0"
-       gradientUnits="userSpaceOnUse"
-       x1="515.65625"
-       y1="17.253906"
-       x2="0"
-       y2="17.253906"
-       gradientTransform="matrix(0.70875,0,0,0.70875,196.425797,273.148682)">
-      <stop
-         offset="0.09"
-         style="stop-color:rgb(20.392157%,59.607843%,85.882353%);stop-opacity:1;"
-         id="stop9074" />
-      <stop
-         offset="0.5"
-         style="stop-color:rgb(17.647059%,63.529412%,74.901961%);stop-opacity:1;"
-         id="stop9076" />
-      <stop
-         offset="1"
-         style="stop-color:rgb(17.647059%,63.529412%,74.901961%);stop-opacity:1;"
-         id="stop9078" />
-    </linearGradient>
-    <linearGradient
-       id="linear1"
-       gradientUnits="userSpaceOnUse"
-       x1="152.589844"
-       y1="0"
-       x2="152.589844"
-       y2="16.800781"
-       gradientTransform="matrix(0.0000000535091,0.70875,-0.70875,0.0000000535091,472.208221,299.636108)">
-      <stop
-         offset="0"
-         style="stop-color:rgb(20.392157%,59.607843%,85.882353%);stop-opacity:1;"
-         id="stop9081" />
-      <stop
-         offset="1"
-         style="stop-color:rgb(17.647059%,63.529412%,74.901961%);stop-opacity:1;"
-         id="stop9083" />
-    </linearGradient>
-    <linearGradient
-       id="linear2"
-       gradientUnits="userSpaceOnUse"
-       x1="146.574219"
-       y1="0"
-       x2="146.574219"
-       y2="16.800781"
-       gradientTransform="matrix(0.0000000535091,0.70875,-0.70875,0.0000000535091,472.168518,61.554932)">
-      <stop
-         offset="0"
-         style="stop-color:rgb(20.392157%,59.607843%,85.882353%);stop-opacity:1;"
-         id="stop9086" />
-      <stop
-         offset="1"
-         style="stop-color:rgb(17.647059%,63.529412%,74.901961%);stop-opacity:1;"
-         id="stop9088" />
-    </linearGradient>
-  </defs>
-  <g
-     id="surface1">
-    <path
-       style="fill-rule:evenodd;fill:rgb(17.647059%,63.529412%,74.901961%);fill-opacity:1;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(17.647059%,63.529412%,74.901961%);stroke-opacity:1;stroke-miterlimit:10;"
-       d="M 172.407715 37.152551 L 172.407715 -0.000226056 L -0.00189457 -0.000226056 L -0.00189457 37.152551 Z M 172.407715 37.152551 "
-       transform="matrix(0.70875,0,0,0.70875,336.509155,120.172035)"
-       id="path9725" />
-    <path
-       style="fill-rule:evenodd;fill:rgb(93.333333%,48.627451%,19.215686%);fill-opacity:1;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(93.333333%,48.627451%,19.215686%);stroke-opacity:1;stroke-miterlimit:10;"
-       d="M 83.632544 37.874241 L 83.632544 -0.000538229 L 0.00159316 -0.000538229 L 0.00159316 37.874241 Z M 83.632544 37.874241 "
-       transform="matrix(0.70875,0,0,0.70875,131.014496,91.098038)"
-       id="path9093" />
-    <g
-       style="fill:rgb(100%,100%,100%);fill-opacity:1;"
-       id="g9107">
-      <use
-         xlink:href="#glyph0-0"
-         x="144.585938"
-         y="100.429687"
-         id="use9095" />
-      <use
-         xlink:href="#glyph0-1"
-         x="151.410425"
-         y="100.429687"
-         id="use9097" />
-      <use
-         xlink:href="#glyph0-2"
-         x="156.666065"
-         y="100.429687"
-         id="use9099" />
-      <use
-         xlink:href="#glyph0-3"
-         x="161.921704"
-         y="100.429687"
-         id="use9101" />
-      <use
-         xlink:href="#glyph0-4"
-         x="164.547217"
-         y="100.429687"
-         id="use9103" />
-      <use
-         xlink:href="#glyph0-1"
-         x="169.802857"
-         y="100.429687"
-         id="use9105" />
-    </g>
-    <path
-       style="fill-rule:evenodd;fill:rgb(93.333333%,48.627451%,19.215686%);fill-opacity:1;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(93.333333%,48.627451%,19.215686%);stroke-opacity:1;stroke-miterlimit:10;"
-       d="M 83.632652 37.152444 L 83.632652 -0.000333702 L 0.0017008 -0.000333702 L 0.0017008 37.152444 Z M 83.632652 37.152444 "
-       transform="matrix(0.70875,0,0,0.70875,131.01442,120.172112)"
-       id="path9109" />
-    <path
-       style="fill-rule:evenodd;fill:rgb(93.333333%,48.627451%,19.215686%);fill-opacity:1;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(93.333333%,48.627451%,19.215686%);stroke-opacity:1;stroke-miterlimit:10;"
-       d="M 83.632652 37.873573 L 83.632652 -0.00120563 L 0.0017008 -0.00120563 L 0.0017008 37.873573 Z M 83.632652 37.873573 "
-       transform="matrix(0.70875,0,0,0.70875,131.01442,148.801636)"
-       id="path9125" />
-    <g
-       style="fill:rgb(100%,100%,100%);fill-opacity:1;"
-       id="g9137">
-      <use
-         xlink:href="#glyph1-0"
-         x="147.421875"
-         y="157.835938"
-         id="use9127" />
-      <use
-         xlink:href="#glyph1-1"
-         x="153.64049"
-         y="157.835938"
-         id="use9129" />
-      <use
-         xlink:href="#glyph1-2"
-         x="158.825704"
-         y="157.835938"
-         id="use9131" />
-      <use
-         xlink:href="#glyph1-3"
-         x="164.010918"
-         y="157.835938"
-         id="use9133" />
-      <use
-         xlink:href="#glyph1-4"
-         x="169.706004"
-         y="157.835938"
-         id="use9135" />
-    </g>
-    <path
-       style="fill-rule:evenodd;fill:rgb(93.333333%,48.627451%,19.215686%);fill-opacity:1;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(93.333333%,48.627451%,19.215686%);stroke-opacity:1;stroke-miterlimit:10;"
-       d="M 83.335636 37.997904 L 83.335636 0.00187304 L 0.00230362 0.00187304 L 0.00230362 37.997904 Z M 83.335636 37.997904 "
-       transform="matrix(0.70875,0,0,0.70875,131.119461,61.936172)"
-       id="path9139" />
-    <g
-       style="fill:rgb(100%,100%,100%);fill-opacity:1;"
-       id="g9153">
-      <use
-         xlink:href="#glyph0-0"
-         x="144.585938"
-         y="72.789063"
-         id="use9141" />
-      <use
-         xlink:href="#glyph0-1"
-         x="151.410425"
-         y="72.789063"
-         id="use9143" />
-      <use
-         xlink:href="#glyph0-2"
-         x="156.666065"
-         y="72.789063"
-         id="use9145" />
-      <use
-         xlink:href="#glyph0-3"
-         x="161.921704"
-         y="72.789063"
-         id="use9147" />
-      <use
-         xlink:href="#glyph0-4"
-         x="164.547217"
-         y="72.789063"
-         id="use9149" />
-      <use
-         xlink:href="#glyph0-1"
-         x="169.802857"
-         y="72.789063"
-         id="use9151" />
-    </g>
-    <g
-       style="fill:rgb(100%,100%,100%);fill-opacity:1;"
-       id="g9169">
-      <use
-         xlink:href="#glyph0-10"
-         x="140.332031"
-         y="84.128906"
-         id="use9155" />
-      <use
-         xlink:href="#glyph0-8"
-         x="148.203955"
-         y="84.128906"
-         id="use9157" />
-      <use
-         xlink:href="#glyph0-11"
-         x="153.459595"
-         y="84.128906"
-         id="use9159" />
-      <use
-         xlink:href="#glyph0-8"
-         x="159.232032"
-         y="84.128906"
-         id="use9161" />
-      <use
-         xlink:href="#glyph0-12"
-         x="164.487671"
-         y="84.128906"
-         id="use9163" />
-      <use
-         xlink:href="#glyph0-1"
-         x="170.260108"
-         y="84.128906"
-         id="use9165" />
-      <use
-         xlink:href="#glyph0-7"
-         x="175.515748"
-         y="84.128906"
-         id="use9167" />
-    </g>
-    <path
-       style="fill-rule:evenodd;fill:rgb(86.666667%,44.313725%,58.431373%);fill-opacity:1;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(86.666667%,44.313725%,58.431373%);stroke-opacity:1;stroke-miterlimit:10;"
-       d="M 292.748914 84.001329 L 292.74892 0.00110932 L 0.00198606 0.00108722 L 0.00197972 84.001307 Z M 292.748914 84.001329 "
-       transform="matrix(0.0000000535091,0.70875,-0.70875,0.0000000535091,74.418739,61.838436)"
-       id="path9171" />
-    <g
-       style="fill:rgb(100%,100%,100%);fill-opacity:1;"
-       id="g9181">
-      <use
-         xlink:href="#glyph2-0"
-         x="40.765625"
-         y="153.53125"
-         id="use9173" />
-      <use
-         xlink:href="#glyph2-1"
-         x="40.765626"
-         y="161.038186"
-         id="use9175" />
-      <use
-         xlink:href="#glyph2-2"
-         x="40.765626"
-         y="167.387865"
-         id="use9177" />
-      <use
-         xlink:href="#glyph2-3"
-         x="40.765626"
-         y="173.169068"
-         id="use9179" />
-    </g>
-    <path
-       style="fill-rule:evenodd;fill:rgb(93.333333%,48.627451%,19.215686%);fill-opacity:1;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(93.333333%,48.627451%,19.215686%);stroke-opacity:1;stroke-miterlimit:10;"
-       d="M 83.336777 33.664365 L 83.336777 0.000344466 L -0.0020668 0.000344466 L -0.0020668 33.664365 Z M 83.336777 33.664365 "
-       transform="matrix(0.70875,0,0,0.70875,131.157715,273.49585)"
-       id="path9183" />
-    <g
-       style="fill:rgb(100%,100%,100%);fill-opacity:1;"
-       id="g9197">
-      <use
-         xlink:href="#glyph0-13"
-         x="145.292969"
-         y="288.957031"
-         id="use9185" />
-      <use
-         xlink:href="#glyph0-1"
-         x="152.117456"
-         y="288.957031"
-         id="use9187" />
-      <use
-         xlink:href="#glyph0-7"
-         x="157.373096"
-         y="288.957031"
-         id="use9189" />
-      <use
-         xlink:href="#glyph0-11"
-         x="161.050659"
-         y="288.957031"
-         id="use9191" />
-      <use
-         xlink:href="#glyph0-1"
-         x="166.823096"
-         y="288.957031"
-         id="use9193" />
-      <use
-         xlink:href="#glyph0-14"
-         x="172.078736"
-         y="288.957031"
-         id="use9195" />
-    </g>
-    <path
-       style="fill-rule:evenodd;fill:rgb(86.666667%,44.313725%,58.431373%);fill-opacity:1;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(86.666667%,44.313725%,58.431373%);stroke-opacity:1;stroke-miterlimit:10;"
-       d="M 341.79939 84.001601 L 341.799397 0.00138214 L 0.000430583 0.00135634 L 0.000424241 84.001576 Z M 341.79939 84.001601 "
-       transform="matrix(0.0000000535091,0.70875,-0.70875,0.0000000535091,74.457993,273.398132)"
-       id="path9199" />
-    <g
-       style="fill:rgb(100%,100%,100%);fill-opacity:1;"
-       id="g9223">
-      <use
-         xlink:href="#glyph3-0"
-         x="40.820313"
-         y="365.425781"
-         id="use9201" />
-      <use
-         xlink:href="#glyph3-1"
-         x="40.820313"
-         y="372.838832"
-         id="use9203" />
-      <use
-         xlink:href="#glyph3-1"
-         x="40.820313"
-         y="378.547733"
-         id="use9205" />
-      <use
-         xlink:href="#glyph3-2"
-         x="40.820314"
-         y="384.256634"
-         id="use9207" />
-      <use
-         xlink:href="#glyph3-3"
-         x="40.820314"
-         y="389.965535"
-         id="use9209" />
-      <use
-         xlink:href="#glyph3-2"
-         x="40.820315"
-         y="392.81748"
-         id="use9211" />
-      <use
-         xlink:href="#glyph3-4"
-         x="40.820315"
-         y="398.526381"
-         id="use9213" />
-      <use
-         xlink:href="#glyph3-5"
-         x="40.820315"
-         y="402.521108"
-         id="use9215" />
-      <use
-         xlink:href="#glyph3-6"
-         x="40.820316"
-         y="408.230009"
-         id="use9217" />
-      <use
-         xlink:href="#glyph3-7"
-         x="40.820316"
-         y="411.648333"
-         id="use9219" />
-      <use
-         xlink:href="#glyph3-4"
-         x="40.820316"
-         y="417.918601"
-         id="use9221" />
-    </g>
-    <path
-       style="fill-rule:evenodd;fill:rgb(93.333333%,48.627451%,19.215686%);fill-opacity:1;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(93.333333%,48.627451%,19.215686%);stroke-opacity:1;stroke-miterlimit:10;"
-       d="M 83.336669 48.905672 L 83.336669 0.00245432 L -0.00217444 0.00245432 L -0.00217444 48.905672 Z M 83.336669 48.905672 "
-       transform="matrix(0.70875,0,0,0.70875,131.157791,234.213104)"
-       id="path9225" />
-    <path
-       style=" stroke:none;fill-rule:evenodd;fill:url(#pattern0);"
-       d="M 130.054688 14.007813 L 192.664063 14.007813 L 192.664063 60.074219 L 130.054688 60.074219 Z M 130.054688 14.007813 "
-       id="path9245" />
-    <path
-       style="fill-rule:evenodd;fill:rgb(68.627451%,34.901961%,12.941176%);fill-opacity:1;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(68.627451%,34.901961%,12.941176%);stroke-opacity:1;stroke-miterlimit:10;"
-       d="M 83.335636 59.998423 L 83.335636 0.000628382 L 0.00230362 0.000628382 L 0.00230362 59.998423 Z M 83.335636 59.998423 "
-       transform="matrix(0.70875,0,0,0.70875,131.119461,15.069867)"
-       id="path9247" />
-    <g
-       style="fill:rgb(100%,100%,100%);fill-opacity:1;"
-       id="g9265">
-      <use
-         xlink:href="#glyph5-0"
-         x="136.078125"
-         y="39.765625"
-         id="use9249" />
-      <use
-         xlink:href="#glyph5-1"
-         x="143.585061"
-         y="39.765625"
-         id="use9251" />
-      <use
-         xlink:href="#glyph5-2"
-         x="149.934741"
-         y="39.765625"
-         id="use9253" />
-      <use
-         xlink:href="#glyph5-3"
-         x="156.284422"
-         y="39.765625"
-         id="use9255" />
-      <use
-         xlink:href="#glyph5-4"
-         x="162.065625"
-         y="39.765625"
-         id="use9257" />
-      <use
-         xlink:href="#glyph5-5"
-         x="167.846829"
-         y="39.765625"
-         id="use9259" />
-      <use
-         xlink:href="#glyph5-6"
-         x="174.196509"
-         y="39.765625"
-         id="use9261" />
-      <use
-         xlink:href="#glyph5-7"
-         x="177.658126"
-         y="39.765625"
-         id="use9263" />
-    </g>
-    <path
-       style=" stroke:none;fill-rule:evenodd;fill:url(#pattern1);"
-       d="M 13.820313 13.996094 L 76.898438 13.996094 L 76.898438 60.066406 L 13.820313 60.066406 Z M 13.820313 13.996094 "
-       id="path9267" />
-    <path
-       style="fill-rule:evenodd;fill:rgb(63.529412%,31.764706%,42.352941%);fill-opacity:1;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(63.529412%,31.764706%,42.352941%);stroke-opacity:1;stroke-miterlimit:10;"
-       d="M 83.998497 60 L 83.998497 0.00220539 L -0.00172233 0.00220539 L -0.00172233 60 Z M 83.998497 60 "
-       transform="matrix(0.70875,0,0,0.70875,14.884033,15.060937)"
-       id="path9269" />
-    <g
-       style="fill:rgb(100%,100%,100%);fill-opacity:1;"
-       id="g9291">
-      <use
-         xlink:href="#glyph5-8"
-         x="19.136719"
-         y="33.390625"
-         id="use9271" />
-      <use
-         xlink:href="#glyph5-9"
-         x="26.070103"
-         y="33.390625"
-         id="use9273" />
-      <use
-         xlink:href="#glyph5-4"
-         x="31.851306"
-         y="33.390625"
-         id="use9275" />
-      <use
-         xlink:href="#glyph5-3"
-         x="37.63251"
-         y="33.390625"
-         id="use9277" />
-      <use
-         xlink:href="#glyph5-10"
-         x="43.413714"
-         y="33.390625"
-         id="use9279" />
-      <use
-         xlink:href="#glyph5-6"
-         x="49.763394"
-         y="33.390625"
-         id="use9281" />
-      <use
-         xlink:href="#glyph5-11"
-         x="53.22501"
-         y="33.390625"
-         id="use9283" />
-      <use
-         xlink:href="#glyph5-1"
-         x="56.113074"
-         y="33.390625"
-         id="use9285" />
-      <use
-         xlink:href="#glyph5-2"
-         x="62.462754"
-         y="33.390625"
-         id="use9287" />
-      <use
-         xlink:href="#glyph5-12"
-         x="68.812435"
-         y="33.390625"
-         id="use9289" />
-    </g>
-    <g
-       style="fill:rgb(100%,100%,100%);fill-opacity:1;"
-       id="g9305">
-      <use
-         xlink:href="#glyph5-13"
-         x="25.515625"
-         y="46.144531"
-         id="use9293" />
-      <use
-         xlink:href="#glyph5-1"
-         x="33.022561"
-         y="46.144531"
-         id="use9295" />
-      <use
-         xlink:href="#glyph5-14"
-         x="39.372241"
-         y="46.144531"
-         id="use9297" />
-      <use
-         xlink:href="#glyph5-15"
-         x="48.615061"
-         y="46.144531"
-         id="use9299" />
-      <use
-         xlink:href="#glyph5-11"
-         x="54.396265"
-         y="46.144531"
-         id="use9301" />
-      <use
-         xlink:href="#glyph5-2"
-         x="57.284329"
-         y="46.144531"
-         id="use9303" />
-    </g>
-    <path
-       style=" stroke:none;fill-rule:evenodd;fill:url(#pattern2);"
-       d="M 80.445313 13.820313 L 126.511719 13.820313 L 126.511719 60.066406 L 80.445313 60.066406 Z M 80.445313 13.820313 "
-       id="path9307" />
-    <path
-       style="fill-rule:evenodd;fill:rgb(73.72549%,50.196078%,0%);fill-opacity:1;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(73.72549%,50.196078%,0%);stroke-opacity:1;stroke-miterlimit:10;"
-       d="M 59.999 60.249999 L 59.999 -0.0013227 L 0.00120563 -0.0013227 L 0.00120563 60.249999 Z M 59.999 60.249999 "
-       transform="matrix(0.70875,0,0,0.70875,81.506958,14.88375)"
-       id="path9309" />
-    <g
-       style="fill:rgb(100%,100%,100%);fill-opacity:1;"
-       id="g9319">
-      <use
-         xlink:href="#glyph5-0"
-         x="88.59375"
-         y="39.765625"
-         id="use9311" />
-      <use
-         xlink:href="#glyph5-1"
-         x="96.100686"
-         y="39.765625"
-         id="use9313" />
-      <use
-         xlink:href="#glyph5-16"
-         x="102.450366"
-         y="39.765625"
-         id="use9315" />
-      <use
-         xlink:href="#glyph5-4"
-         x="108.800047"
-         y="39.765625"
-         id="use9317" />
-    </g>
-    <path
-       style="fill-rule:evenodd;fill:rgb(88.235294%,60.392157%,0%);fill-opacity:1;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(88.235294%,60.392157%,0%);stroke-opacity:1;stroke-miterlimit:10;"
-       d="M 60.00012 33.802367 L 60.00012 0.000559758 L 0.00232515 0.000559758 L 0.00232515 33.802367 Z M 60.00012 33.802367 "
-       transform="matrix(0.70875,0,0,0.70875,81.545227,273.398041)"
-       id="path9321" />
-    <g
-       style="fill:rgb(100%,100%,100%);fill-opacity:1;"
-       id="g9331">
-      <use
-         xlink:href="#glyph5-17"
-         x="90.011719"
-         y="289.246094"
-         id="use9323" />
-      <use
-         xlink:href="#glyph5-7"
-         x="97.518655"
-         y="289.246094"
-         id="use9325" />
-      <use
-         xlink:href="#glyph5-4"
-         x="103.299859"
-         y="289.246094"
-         id="use9327" />
-      <use
-         xlink:href="#glyph5-18"
-         x="109.081062"
-         y="289.246094"
-         id="use9329" />
-    </g>
-    <path
-       style="fill-rule:evenodd;fill:rgb(93.333333%,48.627451%,19.215686%);fill-opacity:1;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(93.333333%,48.627451%,19.215686%);stroke-opacity:1;stroke-miterlimit:10;"
-       d="M 83.337573 303.585289 L 83.337573 -0.00266962 L -0.00127022 -0.00266962 L -0.00127022 303.585289 Z M 83.337573 303.585289 "
-       transform="matrix(0.70875,0,0,0.70875,131.262619,300.482361)"
-       id="path9333" />
-    <g
-       style="fill:rgb(100%,100%,100%);fill-opacity:1;"
-       id="g9357">
-      <use
-         xlink:href="#glyph6-0"
-         x="134.664063"
-         y="309.511719"
-         id="use9335" />
-      <use
-         xlink:href="#glyph6-1"
-         x="141.386327"
-         y="309.511719"
-         id="use9337" />
-      <use
-         xlink:href="#glyph6-1"
-         x="146.563244"
-         y="309.511719"
-         id="use9339" />
-      <use
-         xlink:href="#glyph6-2"
-         x="151.740161"
-         y="309.511719"
-         id="use9341" />
-      <use
-         xlink:href="#glyph6-3"
-         x="156.917077"
-         y="309.511719"
-         id="use9343" />
-      <use
-         xlink:href="#glyph6-2"
-         x="159.503263"
-         y="309.511719"
-         id="use9345" />
-      <use
-         xlink:href="#glyph6-4"
-         x="164.680179"
-         y="309.511719"
-         id="use9347" />
-      <use
-         xlink:href="#glyph6-5"
-         x="168.302658"
-         y="309.511719"
-         id="use9349" />
-      <use
-         xlink:href="#glyph6-6"
-         x="173.479574"
-         y="309.511719"
-         id="use9351" />
-      <use
-         xlink:href="#glyph6-7"
-         x="176.579361"
-         y="309.511719"
-         id="use9353" />
-      <use
-         xlink:href="#glyph6-4"
-         x="182.265333"
-         y="309.511719"
-         id="use9355" />
-    </g>
-    <path
-       style="fill-rule:evenodd;fill:rgb(94.901961%,63.529412%,50.980392%);fill-opacity:1;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(100%,76.470588%,48.627451%);stroke-opacity:1;stroke-miterlimit:10;"
-       d="M 77.407669 48.510655 L 77.407669 -0.00124869 L -0.000839637 -0.00124869 L -0.000839637 48.510655 Z M 77.407669 48.510655 "
-       transform="matrix(0.70875,0,0,0.70875,133.418564,366.352448)"
-       id="path9359" />
-    <g
-       style="fill:rgb(100%,100%,100%);fill-opacity:1;"
-       id="g9371">
-      <use
-         xlink:href="#glyph0-15"
-         x="147.421875"
-         y="386.765625"
-         id="use9361" />
-      <use
-         xlink:href="#glyph0-11"
-         x="150.047388"
-         y="386.765625"
-         id="use9363" />
-      <use
-         xlink:href="#glyph0-16"
-         x="155.819824"
-         y="386.765625"
-         id="use9365" />
-      <use
-         xlink:href="#glyph0-1"
-         x="161.592261"
-         y="386.765625"
-         id="use9367" />
-      <use
-         xlink:href="#glyph0-17"
-         x="166.847901"
-         y="386.765625"
-         id="use9369" />
-    </g>
-    <path
-       style="fill-rule:evenodd;fill:rgb(94.901961%,63.529412%,50.980392%);fill-opacity:1;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(100%,76.470588%,48.627451%);stroke-opacity:1;stroke-miterlimit:10;"
-       d="M 77.317763 35.807593 L 77.317763 -0.000387525 L -0.00256197 -0.000387525 L -0.00256197 35.807593 Z M 77.317763 35.807593 "
-       transform="matrix(0.70875,0,0,0.70875,133.482285,338.332306)"
-       id="path9373" />
-    <g
-       style="fill:rgb(100%,100%,100%);fill-opacity:1;"
-       id="g9385">
-      <use
-         xlink:href="#glyph7-0"
-         x="148.835938"
-         y="348.492188"
-         id="use9375" />
-      <use
-         xlink:href="#glyph7-1"
-         x="157.676464"
-         y="348.492188"
-         id="use9377" />
-      <use
-         xlink:href="#glyph7-2"
-         x="163.397881"
-         y="348.492188"
-         id="use9379" />
-      <use
-         xlink:href="#glyph7-3"
-         x="167.042941"
-         y="348.492188"
-         id="use9381" />
-      <use
-         xlink:href="#glyph7-4"
-         x="172.252129"
-         y="348.492188"
-         id="use9383" />
-    </g>
-    <g
-       style="fill:rgb(100%,100%,100%);fill-opacity:1;"
-       id="g9403">
-      <use
-         xlink:href="#glyph8-0"
-         x="142.457031"
-         y="359.832031"
-         id="use9387" />
-      <use
-         xlink:href="#glyph8-1"
-         x="149.201722"
-         y="359.832031"
-         id="use9389" />
-      <use
-         xlink:href="#glyph8-2"
-         x="151.796535"
-         y="359.832031"
-         id="use9391" />
-      <use
-         xlink:href="#glyph8-1"
-         x="156.990722"
-         y="359.832031"
-         id="use9393" />
-      <use
-         xlink:href="#glyph8-3"
-         x="159.585535"
-         y="359.832031"
-         id="use9395" />
-      <use
-         xlink:href="#glyph8-1"
-         x="164.779722"
-         y="359.832031"
-         id="use9397" />
-      <use
-         xlink:href="#glyph8-4"
-         x="167.374536"
-         y="359.832031"
-         id="use9399" />
-      <use
-         xlink:href="#glyph8-5"
-         x="173.079477"
-         y="359.832031"
-         id="use9401" />
-    </g>
-    <path
-       style="fill-rule:evenodd;fill:rgb(94.901961%,63.529412%,50.980392%);fill-opacity:1;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(100%,76.470588%,48.627451%);stroke-opacity:1;stroke-miterlimit:10;"
-       d="M 76.926945 35.906024 L 76.926945 -0.00116257 L -0.0020668 -0.00116257 L -0.0020668 35.906024 Z M 76.926945 35.906024 "
-       transform="matrix(0.70875,0,0,0.70875,133.481934,403.309418)"
-       id="path9405" />
-    <g
-       style="fill:rgb(100%,100%,100%);fill-opacity:1;"
-       id="g9419">
-      <use
-         xlink:href="#glyph9-0"
-         x="144.585938"
-         y="419.367188"
-         id="use9407" />
-      <use
-         xlink:href="#glyph9-1"
-         x="151.230327"
-         y="419.367188"
-         id="use9409" />
-      <use
-         xlink:href="#glyph9-2"
-         x="154.294204"
-         y="419.367188"
-         id="use9411" />
-      <use
-         xlink:href="#glyph9-3"
-         x="159.914306"
-         y="419.367188"
-         id="use9413" />
-      <use
-         xlink:href="#glyph9-4"
-         x="168.095126"
-         y="419.367188"
-         id="use9415" />
-      <use
-         xlink:href="#glyph9-5"
-         x="170.651352"
-         y="419.367188"
-         id="use9417" />
-    </g>
-    <path
-       style="fill-rule:evenodd;fill:rgb(94.901961%,63.529412%,50.980392%);fill-opacity:1;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(100%,76.470588%,48.627451%);stroke-opacity:1;stroke-miterlimit:10;"
-       d="M 76.926945 35.90189 L 76.926945 0.000215292 L -0.0020668 0.000215292 L -0.0020668 35.90189 Z M 76.926945 35.90189 "
-       transform="matrix(0.70875,0,0,0.70875,133.481934,431.546722)"
-       id="path9421" />
-    <g
-       style="fill:rgb(100%,100%,100%);fill-opacity:1;"
-       id="g9431">
-      <use
-         xlink:href="#glyph0-18"
-         x="148.128906"
-         y="447.714844"
-         id="use9423" />
-      <use
-         xlink:href="#glyph0-8"
-         x="154.953394"
-         y="447.714844"
-         id="use9425" />
-      <use
-         xlink:href="#glyph0-11"
-         x="160.209033"
-         y="447.714844"
-         id="use9427" />
-      <use
-         xlink:href="#glyph0-16"
-         x="165.98147"
-         y="447.714844"
-         id="use9429" />
-    </g>
-    <path
-       style="fill-rule:evenodd;fill:rgb(94.901961%,63.529412%,50.980392%);fill-opacity:1;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(100%,76.470588%,48.627451%);stroke-opacity:1;stroke-miterlimit:10;"
-       d="M 76.928753 35.903268 L 76.928753 0.00159316 L -0.00025835 0.00159316 L -0.00025835 35.903268 Z M 76.928753 35.903268 "
-       transform="matrix(0.70875,0,0,0.70875,133.586121,459.784027)"
-       id="path9433" />
-    <g
-       style="fill:rgb(100%,100%,100%);fill-opacity:1;"
-       id="g9443">
-      <use
-         xlink:href="#glyph0-10"
-         x="148.835938"
-         y="476.066406"
-         id="use9435" />
-      <use
-         xlink:href="#glyph0-8"
-         x="156.707861"
-         y="476.066406"
-         id="use9437" />
-      <use
-         xlink:href="#glyph0-6"
-         x="161.963501"
-         y="476.066406"
-         id="use9439" />
-      <use
-         xlink:href="#glyph0-19"
-         x="165.110425"
-         y="476.066406"
-         id="use9441" />
-    </g>
-    <path
-       style="fill-rule:evenodd;fill:rgb(94.901961%,63.529412%,50.980392%);fill-opacity:1;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(100%,76.470588%,48.627451%);stroke-opacity:1;stroke-miterlimit:10;"
-       d="M 76.928753 30.187105 L 76.928753 0.000818108 L -0.00025835 0.000818108 L -0.00025835 30.187105 Z M 76.928753 30.187105 "
-       transform="matrix(0.70875,0,0,0.70875,133.586121,490.003326)"
-       id="path9445" />
-    <g
-       style="fill:rgb(100%,100%,100%);fill-opacity:1;"
-       id="g9453">
-      <use
-         xlink:href="#glyph0-20"
-         x="155.925781"
-         y="503.707031"
-         id="use9447" />
-      <use
-         xlink:href="#glyph0-20"
-         x="158.551294"
-         y="503.707031"
-         id="use9449" />
-      <use
-         xlink:href="#glyph0-20"
-         x="161.176807"
-         y="503.707031"
-         id="use9451" />
-    </g>
-    <path
-       style=" stroke:none;fill-rule:evenodd;fill:url(#pattern3);"
-       d="M 193.683594 14.007813 L 563.8125 14.007813 L 563.8125 60.074219 L 193.683594 60.074219 Z M 193.683594 14.007813 "
-       id="path9455" />
-    <path
-       style="fill-rule:evenodd;fill:rgb(13.72549%,43.137255%,63.137255%);fill-opacity:1;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(13.72549%,43.137255%,63.137255%);stroke-opacity:1;stroke-miterlimit:10;"
-       d="M 517.22687 59.998423 L 517.22687 0.000628382 L -0.00195915 0.000628382 L -0.00195915 59.998423 Z M 517.22687 59.998423 "
-       transform="matrix(0.70875,0,0,0.70875,194.747482,15.069867)"
-       id="path9457" />
-    <g
-       style="fill:rgb(100%,100%,100%);fill-opacity:1;"
-       id="g9489">
-      <use
-         xlink:href="#glyph5-19"
-         x="336.65625"
-         y="39.765625"
-         id="use9459" />
-      <use
-         xlink:href="#glyph5-14"
-         x="339.544314"
-         y="39.765625"
-         id="use9461" />
-      <use
-         xlink:href="#glyph5-5"
-         x="348.787134"
-         y="39.765625"
-         id="use9463" />
-      <use
-         xlink:href="#glyph5-20"
-         x="355.136814"
-         y="39.765625"
-         id="use9465" />
-      <use
-         xlink:href="#glyph5-4"
-         x="358.024878"
-         y="39.765625"
-         id="use9467" />
-      <use
-         xlink:href="#glyph5-14"
-         x="363.806082"
-         y="39.765625"
-         id="use9469" />
-      <use
-         xlink:href="#glyph5-4"
-         x="373.048902"
-         y="39.765625"
-         id="use9471" />
-      <use
-         xlink:href="#glyph5-2"
-         x="378.830106"
-         y="39.765625"
-         id="use9473" />
-      <use
-         xlink:href="#glyph5-6"
-         x="385.179786"
-         y="39.765625"
-         id="use9475" />
-      <use
-         xlink:href="#glyph5-15"
-         x="388.641402"
-         y="39.765625"
-         id="use9477" />
-      <use
-         xlink:href="#glyph5-6"
-         x="394.422606"
-         y="39.765625"
-         id="use9479" />
-      <use
-         xlink:href="#glyph5-11"
-         x="397.884222"
-         y="39.765625"
-         id="use9481" />
-      <use
-         xlink:href="#glyph5-1"
-         x="400.772286"
-         y="39.765625"
-         id="use9483" />
-      <use
-         xlink:href="#glyph5-2"
-         x="407.121966"
-         y="39.765625"
-         id="use9485" />
-      <use
-         xlink:href="#glyph5-7"
-         x="413.471647"
-         y="39.765625"
-         id="use9487" />
-    </g>
-    <path
-       style="fill-rule:evenodd;fill:rgb(17.647059%,63.529412%,74.901961%);fill-opacity:1;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(17.647059%,63.529412%,74.901961%);stroke-opacity:1;stroke-miterlimit:10;"
-       d="M 369.498736 37.997802 L 369.498736 0.00177077 L -0.000818108 0.00177077 L -0.000818108 37.997802 Z M 369.498736 37.997802 "
-       transform="matrix(0.70875,0,0,0.70875,196.680267,61.936245)"
-       id="path9491" />
-    <g
-       style="fill:rgb(100%,100%,100%);fill-opacity:1;"
-       id="g9507">
-      <use
-         xlink:href="#glyph10-0"
-         x="311.140625"
-         y="78.457031"
-         id="use9493" />
-      <use
-         xlink:href="#glyph10-1"
-         x="317.433597"
-         y="78.457031"
-         id="use9495" />
-      <use
-         xlink:href="#glyph10-2"
-         x="320.054901"
-         y="78.457031"
-         id="use9497" />
-      <use
-         xlink:href="#glyph10-3"
-         x="323.196781"
-         y="78.457031"
-         id="use9499" />
-      <use
-         xlink:href="#glyph10-4"
-         x="326.33866"
-         y="78.457031"
-         id="use9501" />
-      <use
-         xlink:href="#glyph10-5"
-         x="333.152208"
-         y="78.457031"
-         id="use9503" />
-      <use
-         xlink:href="#glyph10-6"
-         x="338.915391"
-         y="78.457031"
-         id="use9505" />
-    </g>
-    <path
-       style="fill-rule:evenodd;fill:rgb(20.392157%,59.607843%,85.882353%);fill-opacity:1;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(20.392157%,59.607843%,85.882353%);stroke-opacity:1;stroke-miterlimit:10;"
-       d="M 121.195366 37.997802 L 121.195366 0.00177077 L -0.00172233 0.00177077 L -0.00172233 37.997802 Z M 121.195366 37.997802 "
-       transform="matrix(0.70875,0,0,0.70875,475.145752,61.936245)"
-       id="path9509" />
-    <g
-       style="fill:rgb(100%,100%,100%);fill-opacity:1;"
-       id="g9531">
-      <use
-         xlink:href="#glyph11-0"
-         x="494"
-         y="78.457031"
-         id="use9511" />
-      <use
-         xlink:href="#glyph11-1"
-         x="500.206041"
-         y="78.457031"
-         id="use9513" />
-      <use
-         xlink:href="#glyph11-2"
-         x="502.791134"
-         y="78.457031"
-         id="use9515" />
-      <use
-         xlink:href="#glyph11-3"
-         x="505.889611"
-         y="78.457031"
-         id="use9517" />
-      <use
-         xlink:href="#glyph11-4"
-         x="508.988088"
-         y="78.457031"
-         id="use9519" />
-      <use
-         xlink:href="#glyph11-5"
-         x="515.707513"
-         y="78.457031"
-         id="use9521" />
-      <use
-         xlink:href="#glyph11-6"
-         x="521.391084"
-         y="78.457031"
-         id="use9523" />
-      <use
-         xlink:href="#glyph11-7"
-         x="527.074654"
-         y="78.457031"
-         id="use9525" />
-      <use
-         xlink:href="#glyph11-8"
-         x="532.249384"
-         y="78.457031"
-         id="use9527" />
-      <use
-         xlink:href="#glyph11-2"
-         x="538.968809"
-         y="78.457031"
-         id="use9529" />
-    </g>
-    <path
-       style="fill-rule:evenodd;fill:rgb(17.647059%,63.529412%,74.901961%);fill-opacity:1;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(17.647059%,63.529412%,74.901961%);stroke-opacity:1;stroke-miterlimit:10;"
-       d="M 369.498736 38.002781 L 369.498736 0.00123793 L -0.000818108 0.00123793 L -0.000818108 38.002781 Z M 369.498736 38.002781 "
-       id="path9533"
-       transform="matrix(0.70875,0,0,0.70875,196.680267,91.05381)" />
-    <g
-       style="fill:rgb(100%,100%,100%);fill-opacity:1;"
-       id="g9547">
-      <use
-         xlink:href="#glyph12-0"
-         x="309.722656"
-         y="107.515625"
-         id="use9535" />
-      <use
-         xlink:href="#glyph12-1"
-         x="316.538528"
-         y="107.515625"
-         id="use9537" />
-      <use
-         xlink:href="#glyph12-2"
-         x="321.787533"
-         y="107.515625"
-         id="use9539" />
-      <use
-         xlink:href="#glyph12-3"
-         x="327.036538"
-         y="107.515625"
-         id="use9541" />
-      <use
-         xlink:href="#glyph12-4"
-         x="333.852409"
-         y="107.515625"
-         id="use9543" />
-      <use
-         xlink:href="#glyph12-5"
-         x="339.617559"
-         y="107.515625"
-         id="use9545" />
-    </g>
-    <path
-       style="fill-rule:evenodd;fill:rgb(20.392157%,59.607843%,85.882353%);fill-opacity:1;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(20.392157%,59.607843%,85.882353%);stroke-opacity:1;stroke-miterlimit:10;"
-       d="M 121.360452 38.002781 L 121.360452 0.00123793 L -0.00198068 0.00123793 L -0.00198068 38.002781 Z M 121.360452 38.002781 "
-       transform="matrix(0.70875,0,0,0.70875,475.145935,91.05381)"
-       id="path9549" />
-    <g
-       style="fill:rgb(100%,100%,100%);fill-opacity:1;"
-       id="g9569">
-      <use
-         xlink:href="#glyph13-0"
-         x="492.582031"
-         y="107.515625"
-         id="use9551" />
-      <use
-         xlink:href="#glyph13-1"
-         x="499.307025"
-         y="107.515625"
-         id="use9553" />
-      <use
-         xlink:href="#glyph13-2"
-         x="504.486044"
-         y="107.515625"
-         id="use9555" />
-      <use
-         xlink:href="#glyph13-3"
-         x="509.665062"
-         y="107.515625"
-         id="use9557" />
-      <use
-         xlink:href="#glyph13-4"
-         x="516.390056"
-         y="107.515625"
-         id="use9559" />
-      <use
-         xlink:href="#glyph13-5"
-         x="522.078337"
-         y="107.515625"
-         id="use9561" />
-      <use
-         xlink:href="#glyph13-6"
-         x="527.766618"
-         y="107.515625"
-         id="use9563" />
-      <use
-         xlink:href="#glyph13-7"
-         x="532.945636"
-         y="107.515625"
-         id="use9565" />
-      <use
-         xlink:href="#glyph13-8"
-         x="539.67063"
-         y="107.515625"
-         id="use9567" />
-    </g>
-    <path
-       style="fill-rule:evenodd;fill:rgb(17.647059%,63.529412%,74.901961%);fill-opacity:1;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(17.647059%,63.529412%,74.901961%);stroke-opacity:1;stroke-miterlimit:10;"
-       d="M 195.097723 37.152444 L 195.097723 -0.000333702 L -0.0025835 -0.000333702 L -0.0025835 37.152444 Z M 195.097723 37.152444 "
-       transform="matrix(0.70875,0,0,0.70875,196.57605,120.172112)"
-       id="path9571" />
-    <g
-       style="fill:rgb(100%,100%,100%);fill-opacity:1;"
-       id="g9601">
-      <text
-         xml:space="preserve"
-         style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:30px;line-height:1.25;font-family:'Linux Biolinum';-inkscape-font-specification:'Linux Biolinum';letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.75"
-         x="144.60757"
-         y="128.33141"
-         id="text11128"><tspan
-           sodipodi:role="line"
-           id="tspan11126"
-           x="144.60757"
-           y="128.33141"
-           style="font-style:normal;font-variant:normal;font-weight:bold;font-stretch:normal;font-size:10px;font-family:Arial;-inkscape-font-specification:'Arial Bold';fill:#ffffff;fill-opacity:1;stroke-width:0.75">Queue</tspan></text>
-      <text
-         xml:space="preserve"
-         style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:30px;line-height:1.25;font-family:'Linux Biolinum';-inkscape-font-specification:'Linux Biolinum';letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.75"
-         x="213.2944"
-         y="136.33374"
-         id="text11128-4"><tspan
-           sodipodi:role="line"
-           id="tspan11126-8"
-           x="213.2944"
-           y="136.33374"
-           style="font-style:normal;font-variant:normal;font-weight:bold;font-stretch:normal;font-size:9.5px;font-family:Arial;-inkscape-font-specification:'Arial Bold';fill:#ffffff;fill-opacity:1;stroke-width:0.75">QueueCpuNonBlocking</tspan></text>
-      <text
-         xml:space="preserve"
-         style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:30px;line-height:1.25;font-family:'Linux Biolinum';-inkscape-font-specification:'Linux Biolinum';letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.75"
-         x="353.8718"
-         y="136.33374"
-         id="text11128-4-9"><tspan
-           sodipodi:role="line"
-           id="tspan11126-8-9"
-           x="353.8718"
-           y="136.33374"
-           style="font-style:normal;font-variant:normal;font-weight:bold;font-stretch:normal;font-size:9.5px;font-family:Arial;-inkscape-font-specification:'Arial Bold';fill:#ffffff;fill-opacity:1;stroke-width:0.75">QueueCpuBlocking</tspan></text>
-    </g>
-    <path
-       style="fill-rule:evenodd;fill:rgb(20.392157%,59.607843%,85.882353%);fill-opacity:1;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(20.392157%,59.607843%,85.882353%);stroke-opacity:1;stroke-miterlimit:10;"
-       d="M 60.990633 35.745578 L 60.990633 -0.00177616 L 0.00077505 -0.00177616 L 0.00077505 35.745578 Z M 60.990633 35.745578 "
-       transform="matrix(0.70875,0,0,0.70875,475.093201,120.669228)"
-       id="path9603" />
-    <path
-       style="fill-rule:evenodd;fill:rgb(17.647059%,63.529412%,74.901961%);fill-opacity:1;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(17.647059%,63.529412%,74.901961%);stroke-opacity:1;stroke-miterlimit:10;"
-       d="M 369.933151 37.99855 L 369.933151 0.00251891 L -0.00180845 0.00251891 L -0.00180845 37.99855 Z M 369.933151 37.99855 "
-       transform="matrix(0.70875,0,0,0.70875,196.513,148.396652)"
-       id="path9643" />
-    <g
-       style="fill:rgb(100%,100%,100%);fill-opacity:1;"
-       id="g9661">
-      <use
-         xlink:href="#glyph16-0"
-         x="305.472656"
-         y="164.925781"
-         id="use9645" />
-      <use
-         xlink:href="#glyph16-1"
-         x="311.681305"
-         y="164.925781"
-         id="use9647" />
-      <use
-         xlink:href="#glyph16-2"
-         x="316.85821"
-         y="164.925781"
-         id="use9649" />
-      <use
-         xlink:href="#glyph16-3"
-         x="322.035114"
-         y="164.925781"
-         id="use9651" />
-      <use
-         xlink:href="#glyph16-4"
-         x="327.721073"
-         y="164.925781"
-         id="use9653" />
-      <use
-         xlink:href="#glyph16-5"
-         x="330.820852"
-         y="164.925781"
-         id="use9655" />
-      <use
-         xlink:href="#glyph16-6"
-         x="337.543101"
-         y="164.925781"
-         id="use9657" />
-      <use
-         xlink:href="#glyph16-7"
-         x="343.22906"
-         y="164.925781"
-         id="use9659" />
-    </g>
-    <path
-       style="fill-rule:evenodd;fill:rgb(17.647059%,63.529412%,74.901961%);fill-opacity:1;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(17.647059%,63.529412%,74.901961%);stroke-opacity:1;stroke-miterlimit:10;"
-       d="M 59.49666 49.87597 L 59.49666 0.0027342 L 0.000409054 0.0027342 L 0.000409054 49.87597 Z M 59.49666 49.87597 "
-       transform="matrix(0.70875,0,0,0.70875,196.683304,233.869156)"
-       id="path9663" />
-    <path
-       style="fill-rule:evenodd;fill:rgb(20.392157%,59.607843%,85.882353%);fill-opacity:1;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(20.392157%,59.607843%,85.882353%);stroke-opacity:1;stroke-miterlimit:10;"
-       d="M 121.837322 50.063338 L 121.837322 0.00271267 L 0.000904225 0.00271267 L 0.000904225 50.063338 Z M 121.837322 50.063338 "
-       transform="matrix(0.70875,0,0,0.70875,475.093109,233.275421)"
-       id="path9701" />
-    <path
-       style=" stroke:none;fill-rule:evenodd;fill:url(#linear0);"
-       d="M 561.898438 297.601563 L 561.898438 273.148438 L 196.425781 273.148438 L 196.425781 297.601563 Z M 561.898438 297.601563 "
-       id="path9755" />
-    <g
-       style="fill:rgb(100%,100%,100%);fill-opacity:1;"
-       id="g9763">
-      <use
-         xlink:href="#glyph20-0"
-         x="373.511719"
-         y="290.246094"
-         id="use9757" />
-      <use
-         xlink:href="#glyph20-0"
-         x="376.981641"
-         y="290.246094"
-         id="use9759" />
-      <use
-         xlink:href="#glyph20-0"
-         x="380.451563"
-         y="290.246094"
-         id="use9761" />
-    </g>
-    <path
-       style="fill-rule:evenodd;fill:rgb(17.647059%,63.529412%,74.901961%);fill-opacity:1;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(17.647059%,63.529412%,74.901961%);stroke-opacity:1;stroke-miterlimit:10;"
-       d="M 59.597675 304.424151 L 59.597675 -0.0015501 L 0.0022175 -0.0015501 L 0.0022175 304.424151 Z M 59.597675 304.424151 "
-       transform="matrix(0.70875,0,0,0.70875,196.646866,300.184692)"
-       id="path9765" />
-    <g
-       style="fill:rgb(100%,100%,100%);fill-opacity:1;"
-       id="g9779">
-      <use
-         xlink:href="#glyph21-0"
-         x="199.867188"
-         y="309.511719"
-         id="use9767" />
-      <use
-         xlink:href="#glyph21-1"
-         x="206.412189"
-         y="309.511719"
-         id="use9769" />
-      <use
-         xlink:href="#glyph21-1"
-         x="211.452592"
-         y="309.511719"
-         id="use9771" />
-      <use
-         xlink:href="#glyph21-2"
-         x="216.492996"
-         y="309.511719"
-         id="use9773" />
-      <use
-         xlink:href="#glyph21-3"
-         x="223.037997"
-         y="309.511719"
-         id="use9775" />
-      <use
-         xlink:href="#glyph21-4"
-         x="228.574033"
-         y="309.511719"
-         id="use9777" />
-    </g>
-    <g
-       style="fill:rgb(100%,100%,100%);fill-opacity:1;"
-       id="g9793">
-      <use
-         xlink:href="#glyph0-5"
-         x="203.410156"
-         y="320.851563"
-         id="use9781" />
-      <use
-         xlink:href="#glyph0-1"
-         x="209.713233"
-         y="320.851563"
-         id="use9783" />
-      <use
-         xlink:href="#glyph0-7"
-         x="214.968872"
-         y="320.851563"
-         id="use9785" />
-      <use
-         xlink:href="#glyph0-3"
-         x="218.646436"
-         y="320.851563"
-         id="use9787" />
-      <use
-         xlink:href="#glyph0-8"
-         x="221.271948"
-         y="320.851563"
-         id="use9789" />
-      <use
-         xlink:href="#glyph0-14"
-         x="226.527588"
-         y="320.851563"
-         id="use9791" />
-    </g>
-    <path
-       style="fill-rule:evenodd;fill:rgb(17.647059%,63.529412%,74.901961%);fill-opacity:1;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(17.647059%,63.529412%,74.901961%);stroke-opacity:1;stroke-miterlimit:10;"
-       d="M 62.323632 304.49227 L 62.323632 0.000430583 L 0 0.000430583 L 0 304.49227 Z M 62.323632 304.49227 "
-       transform="matrix(0.70875,0,0,0.70875,413.910156,299.831726)"
-       id="path9795" />
-    <g
-       style="fill:rgb(100%,100%,100%);fill-opacity:1;"
-       id="g9809">
-      <use
-         xlink:href="#glyph21-0"
-         x="418.164063"
-         y="308.800781"
-         id="use9797" />
-      <use
-         xlink:href="#glyph21-1"
-         x="424.709064"
-         y="308.800781"
-         id="use9799" />
-      <use
-         xlink:href="#glyph21-1"
-         x="429.749467"
-         y="308.800781"
-         id="use9801" />
-      <use
-         xlink:href="#glyph21-2"
-         x="434.789871"
-         y="308.800781"
-         id="use9803" />
-      <use
-         xlink:href="#glyph21-3"
-         x="441.334872"
-         y="308.800781"
-         id="use9805" />
-      <use
-         xlink:href="#glyph21-4"
-         x="446.870908"
-         y="308.800781"
-         id="use9807" />
-    </g>
-    <g
-       style="fill:rgb(100%,100%,100%);fill-opacity:1;"
-       id="g9825">
-      <use
-         xlink:href="#glyph0-23"
-         x="417.453125"
-         y="320.140625"
-         id="use9811" />
-      <use
-         xlink:href="#glyph0-19"
-         x="423.225562"
-         y="320.140625"
-         id="use9813" />
-      <use
-         xlink:href="#glyph0-7"
-         x="428.997998"
-         y="320.140625"
-         id="use9815" />
-      <use
-         xlink:href="#glyph0-1"
-         x="432.675562"
-         y="320.140625"
-         id="use9817" />
-      <use
-         xlink:href="#glyph0-8"
-         x="437.931201"
-         y="320.140625"
-         id="use9819" />
-      <use
-         xlink:href="#glyph0-16"
-         x="443.186841"
-         y="320.140625"
-         id="use9821" />
-      <use
-         xlink:href="#glyph0-24"
-         x="448.959278"
-         y="320.140625"
-         id="use9823" />
-    </g>
-    <path
-       style="fill-rule:evenodd;fill:rgb(17.647059%,63.529412%,74.901961%);fill-opacity:1;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(17.647059%,63.529412%,74.901961%);stroke-opacity:1;stroke-miterlimit:10;"
-       d="M 57.226691 304.515306 L 57.226691 0.00142092 L 0.00116257 0.00142092 L 0.00116257 304.515306 Z M 57.226691 304.515306 "
-       transform="matrix(0.70875,0,0,0.70875,371.233551,299.823212)"
-       id="path9827" />
-    <g
-       style="fill:rgb(100%,100%,100%);fill-opacity:1;"
-       id="g9841">
-      <use
-         xlink:href="#glyph21-0"
-         x="374.21875"
-         y="308.800781"
-         id="use9829" />
-      <use
-         xlink:href="#glyph21-1"
-         x="380.763751"
-         y="308.800781"
-         id="use9831" />
-      <use
-         xlink:href="#glyph21-1"
-         x="385.804155"
-         y="308.800781"
-         id="use9833" />
-      <use
-         xlink:href="#glyph21-2"
-         x="390.844558"
-         y="308.800781"
-         id="use9835" />
-      <use
-         xlink:href="#glyph21-3"
-         x="397.389559"
-         y="308.800781"
-         id="use9837" />
-      <use
-         xlink:href="#glyph21-4"
-         x="402.925595"
-         y="308.800781"
-         id="use9839" />
-    </g>
-    <g
-       style="fill:rgb(100%,100%,100%);fill-opacity:1;"
-       id="g9851">
-      <use
-         xlink:href="#glyph0-25"
-         x="377.765625"
-         y="320.140625"
-         id="use9843" />
-      <use
-         xlink:href="#glyph0-9"
-         x="385.116138"
-         y="320.140625"
-         id="use9845" />
-      <use
-         xlink:href="#glyph0-26"
-         x="393.518701"
-         y="320.140625"
-         id="use9847" />
-      <use
-         xlink:href="#glyph0-27"
-         x="399.291138"
-         y="320.140625"
-         id="use9849" />
-    </g>
-    <path
-       style="fill-rule:evenodd;fill:rgb(17.647059%,63.529412%,74.901961%);fill-opacity:1;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(17.647059%,63.529412%,74.901961%);stroke-opacity:1;stroke-miterlimit:10;"
-       d="M 57.229899 304.490203 L 57.229899 -0.00163622 L -0.00114105 -0.00163622 L -0.00114105 304.490203 Z M 57.229899 304.490203 "
-       transform="matrix(0.70875,0,0,0.70875,240.953934,300.161316)"
-       id="path9853" />
-    <g
-       style="fill:rgb(100%,100%,100%);fill-opacity:1;"
-       id="g9867">
-      <use
-         xlink:href="#glyph21-0"
-         x="243.808594"
-         y="309.511719"
-         id="use9855" />
-      <use
-         xlink:href="#glyph21-1"
-         x="250.353595"
-         y="309.511719"
-         id="use9857" />
-      <use
-         xlink:href="#glyph21-1"
-         x="255.393998"
-         y="309.511719"
-         id="use9859" />
-      <use
-         xlink:href="#glyph21-2"
-         x="260.434402"
-         y="309.511719"
-         id="use9861" />
-      <use
-         xlink:href="#glyph21-3"
-         x="266.979403"
-         y="309.511719"
-         id="use9863" />
-      <use
-         xlink:href="#glyph21-4"
-         x="272.515439"
-         y="309.511719"
-         id="use9865" />
-    </g>
-    <g
-       style="fill:rgb(100%,100%,100%);fill-opacity:1;"
-       id="g9879">
-      <use
-         xlink:href="#glyph22-0"
-         x="247.355469"
-         y="320.851563"
-         id="use9869" />
-      <use
-         xlink:href="#glyph22-1"
-         x="254.663802"
-         y="320.851563"
-         id="use9871" />
-      <use
-         xlink:href="#glyph22-2"
-         x="263.01815"
-         y="320.851563"
-         id="use9873" />
-      <use
-         xlink:href="#glyph22-3"
-         x="268.757462"
-         y="320.851563"
-         id="use9875" />
-      <use
-         xlink:href="#glyph22-4"
-         x="273.982944"
-         y="320.851563"
-         id="use9877" />
-    </g>
-    <g
-       style="fill:rgb(100%,100%,100%);fill-opacity:1;"
-       id="g9893">
-      <use
-         xlink:href="#glyph23-0"
-         x="245.9375"
-         y="332.191406"
-         id="use9881" />
-      <use
-         xlink:href="#glyph23-1"
-         x="252.691283"
-         y="332.191406"
-         id="use9883" />
-      <use
-         xlink:href="#glyph23-2"
-         x="255.289595"
-         y="332.191406"
-         id="use9885" />
-      <use
-         xlink:href="#glyph23-3"
-         x="261.002227"
-         y="332.191406"
-         id="use9887" />
-      <use
-         xlink:href="#glyph23-4"
-         x="266.203416"
-         y="332.191406"
-         id="use9889" />
-      <use
-         xlink:href="#glyph23-5"
-         x="271.404606"
-         y="332.191406"
-         id="use9891" />
-    </g>
-    <path
-       style="fill-rule:evenodd;fill:rgb(17.647059%,63.529412%,74.901961%);fill-opacity:1;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(17.647059%,63.529412%,74.901961%);stroke-opacity:1;stroke-miterlimit:10;"
-       d="M 59.534573 304.425357 L 59.534573 -0.000344466 L -0.00025835 -0.000344466 L -0.00025835 304.425357 Z M 59.534573 304.425357 "
-       transform="matrix(0.70875,0,0,0.70875,327.023621,299.886963)"
-       id="path9895" />
-    <g
-       style="fill:rgb(100%,100%,100%);fill-opacity:1;"
-       id="g9909">
-      <use
-         xlink:href="#glyph21-0"
-         x="330.277344"
-         y="308.800781"
-         id="use9897" />
-      <use
-         xlink:href="#glyph21-1"
-         x="336.822345"
-         y="308.800781"
-         id="use9899" />
-      <use
-         xlink:href="#glyph21-1"
-         x="341.862748"
-         y="308.800781"
-         id="use9901" />
-      <use
-         xlink:href="#glyph21-2"
-         x="346.903152"
-         y="308.800781"
-         id="use9903" />
-      <use
-         xlink:href="#glyph21-3"
-         x="353.448153"
-         y="308.800781"
-         id="use9905" />
-      <use
-         xlink:href="#glyph21-4"
-         x="358.984189"
-         y="308.800781"
-         id="use9907" />
-    </g>
-    <g
-       style="fill:rgb(100%,100%,100%);fill-opacity:1;"
-       id="g9921">
-      <use
-         xlink:href="#glyph22-0"
-         x="334.53125"
-         y="320.140625"
-         id="use9911" />
-      <use
-         xlink:href="#glyph22-1"
-         x="341.839584"
-         y="320.140625"
-         id="use9913" />
-      <use
-         xlink:href="#glyph22-2"
-         x="350.193931"
-         y="320.140625"
-         id="use9915" />
-      <use
-         xlink:href="#glyph22-3"
-         x="355.933244"
-         y="320.140625"
-         id="use9917" />
-      <use
-         xlink:href="#glyph22-4"
-         x="361.158725"
-         y="320.140625"
-         id="use9919" />
-    </g>
-    <g
-       style="fill:rgb(100%,100%,100%);fill-opacity:1;"
-       id="g9937">
-      <use
-         xlink:href="#glyph0-23"
-         x="329.570313"
-         y="331.480469"
-         id="use9923" />
-      <use
-         xlink:href="#glyph0-19"
-         x="335.342749"
-         y="331.480469"
-         id="use9925" />
-      <use
-         xlink:href="#glyph0-7"
-         x="341.115186"
-         y="331.480469"
-         id="use9927" />
-      <use
-         xlink:href="#glyph0-1"
-         x="344.792749"
-         y="331.480469"
-         id="use9929" />
-      <use
-         xlink:href="#glyph0-8"
-         x="350.048389"
-         y="331.480469"
-         id="use9931" />
-      <use
-         xlink:href="#glyph0-16"
-         x="355.304029"
-         y="331.480469"
-         id="use9933" />
-      <use
-         xlink:href="#glyph0-24"
-         x="361.076465"
-         y="331.480469"
-         id="use9935" />
-    </g>
-    <path
-       style="fill-rule:evenodd;fill:rgb(17.647059%,63.529412%,74.901961%);fill-opacity:1;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(17.647059%,63.529412%,74.901961%);stroke-opacity:1;stroke-miterlimit:10;"
-       d="M 57.500025 304.424151 L 57.500025 -0.0015501 L -0.00107646 -0.0015501 L -0.00107646 304.424151 Z M 57.500025 304.424151 "
-       transform="matrix(0.70875,0,0,0.70875,283.668732,300.184692)"
-       id="path9939" />
-    <g
-       style="fill:rgb(100%,100%,100%);fill-opacity:1;"
-       id="g9955">
-      <use
-         xlink:href="#glyph24-0"
-         x="286.335938"
-         y="309.511719"
-         id="use9941" />
-      <use
-         xlink:href="#glyph24-1"
-         x="292.837499"
-         y="309.511719"
-         id="use9943" />
-      <use
-         xlink:href="#glyph24-1"
-         x="297.844448"
-         y="309.511719"
-         id="use9945" />
-      <use
-         xlink:href="#glyph24-2"
-         x="302.851397"
-         y="309.511719"
-         id="use9947" />
-      <use
-         xlink:href="#glyph24-3"
-         x="309.352958"
-         y="309.511719"
-         id="use9949" />
-      <use
-         xlink:href="#glyph24-4"
-         x="314.85225"
-         y="309.511719"
-         id="use9951" />
-      <use
-         xlink:href="#glyph24-5"
-         x="320.351542"
-         y="309.511719"
-         id="use9953" />
-    </g>
-    <g
-       style="fill:rgb(100%,100%,100%);fill-opacity:1;"
-       id="g9969">
-      <use
-         xlink:href="#glyph0-28"
-         x="289.878906"
-         y="320.851563"
-         id="use9957" />
-      <use
-         xlink:href="#glyph0-3"
-         x="295.651343"
-         y="320.851563"
-         id="use9959" />
-      <use
-         xlink:href="#glyph0-29"
-         x="298.276856"
-         y="320.851563"
-         id="use9961" />
-      <use
-         xlink:href="#glyph0-1"
-         x="304.049292"
-         y="320.851563"
-         id="use9963" />
-      <use
-         xlink:href="#glyph0-7"
-         x="309.304932"
-         y="320.851563"
-         id="use9965" />
-      <use
-         xlink:href="#glyph0-24"
-         x="312.982495"
-         y="320.851563"
-         id="use9967" />
-    </g>
-    <path
-       style="fill-rule:evenodd;fill:rgb(20.392157%,59.607843%,85.882353%);fill-opacity:1;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(20.392157%,59.607843%,85.882353%);stroke-opacity:1;stroke-miterlimit:10;"
-       d="M 121.283808 304.427295 L 121.283808 0.00159316 L -0.00146398 0.00159316 L -0.00146398 304.427295 Z M 121.283808 304.427295 "
-       transform="matrix(0.70875,0,0,0.70875,475.579163,299.88559)"
-       id="path9971" />
-    <path
-       style=" stroke:none;fill-rule:evenodd;fill:url(#linear1);"
-       d="M 460.300781 515.933594 L 472.207031 515.933594 L 472.207031 299.636719 L 460.300781 299.636719 Z M 460.300781 515.933594 "
-       id="path9989" />
-    <g
-       style="fill:rgb(100%,100%,100%);fill-opacity:1;"
-       id="g9997">
-      <use
-         xlink:href="#glyph26-0"
-         x="463.886719"
-         y="401.507813"
-         id="use9991" />
-      <use
-         xlink:href="#glyph26-0"
-         x="463.886719"
-         y="404.977734"
-         id="use9993" />
-      <use
-         xlink:href="#glyph26-0"
-         x="463.886719"
-         y="408.447656"
-         id="use9995" />
-    </g>
-    <path
-       style="fill-rule:evenodd;fill:rgb(50.588235%,72.941176%,80.784314%);fill-opacity:0.901961;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(100%,100%,100%);stroke-opacity:1;stroke-miterlimit:10;"
-       d="M 364.798039 35.804105 L 364.798039 0.00163622 L -0.000236821 0.00163622 L -0.000236821 35.804105 Z M 364.798039 35.804105 "
-       transform="matrix(0.70875,0,0,0.70875,198.91423,338.330872)"
-       id="path9999" />
-    <g
-       style="fill:rgb(100%,100%,100%);fill-opacity:1;"
-       id="g10027">
-      <use
-         xlink:href="#glyph27-0"
-         x="290.585938"
-         y="354.160156"
-         id="use10001" />
-      <use
-         xlink:href="#glyph27-1"
-         x="299.429771"
-         y="354.160156"
-         id="use10003" />
-      <use
-         xlink:href="#glyph27-2"
-         x="305.153328"
-         y="354.160156"
-         id="use10005" />
-      <use
-         xlink:href="#glyph27-3"
-         x="308.79975"
-         y="354.160156"
-         id="use10007" />
-      <use
-         xlink:href="#glyph27-4"
-         x="314.010886"
-         y="354.160156"
-         id="use10009" />
-      <use
-         xlink:href="#glyph27-5"
-         x="320.777585"
-         y="354.160156"
-         id="use10011" />
-      <use
-         xlink:href="#glyph27-6"
-         x="323.380866"
-         y="354.160156"
-         id="use10013" />
-      <use
-         xlink:href="#glyph27-7"
-         x="328.592002"
-         y="354.160156"
-         id="use10015" />
-      <use
-         xlink:href="#glyph27-8"
-         x="336.397268"
-         y="354.160156"
-         id="use10017" />
-      <use
-         xlink:href="#glyph27-9"
-         x="341.608404"
-         y="354.160156"
-         id="use10019" />
-      <use
-         xlink:href="#glyph27-10"
-         x="349.939816"
-         y="354.160156"
-         id="use10021" />
-      <use
-         xlink:href="#glyph27-8"
-         x="355.663373"
-         y="354.160156"
-         id="use10023" />
-      <use
-         xlink:href="#glyph27-2"
-         x="360.874509"
-         y="354.160156"
-         id="use10025" />
-    </g>
-    <path
-       style="fill-rule:evenodd;fill:rgb(51.372549%,70.196078%,89.019608%);fill-opacity:1;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(100%,100%,100%);stroke-opacity:1;stroke-miterlimit:10;"
-       d="M 117.742434 35.807593 L 117.742434 -0.000387525 L 0.0010334 -0.000387525 L 0.0010334 35.807593 Z M 117.742434 35.807593 "
-       transform="matrix(0.70875,0,0,0.70875,477.00708,338.332306)"
-       id="path10029" />
-    <g
-       style="fill:rgb(100%,100%,100%);fill-opacity:1;"
-       id="g10059">
-      <use
-         xlink:href="#glyph28-0"
-         x="480.53125"
-         y="348.492188"
-         id="use10031" />
-      <use
-         xlink:href="#glyph28-1"
-         x="489.28802"
-         y="348.492188"
-         id="use10033" />
-      <use
-         xlink:href="#glyph28-2"
-         x="494.955231"
-         y="348.492188"
-         id="use10035" />
-      <use
-         xlink:href="#glyph28-3"
-         x="498.565756"
-         y="348.492188"
-         id="use10037" />
-      <use
-         xlink:href="#glyph28-4"
-         x="503.725591"
-         y="348.492188"
-         id="use10039" />
-      <use
-         xlink:href="#glyph28-5"
-         x="510.425675"
-         y="348.492188"
-         id="use10041" />
-      <use
-         xlink:href="#glyph28-6"
-         x="513.003328"
-         y="348.492188"
-         id="use10043" />
-      <use
-         xlink:href="#glyph28-7"
-         x="518.163163"
-         y="348.492188"
-         id="use10045" />
-      <use
-         xlink:href="#glyph28-8"
-         x="524.863247"
-         y="348.492188"
-         id="use10047" />
-      <use
-         xlink:href="#glyph28-9"
-         x="530.530458"
-         y="348.492188"
-         id="use10049" />
-      <use
-         xlink:href="#glyph28-10"
-         x="536.197669"
-         y="348.492188"
-         id="use10051" />
-      <use
-         xlink:href="#glyph28-11"
-         x="541.357504"
-         y="348.492188"
-         id="use10053" />
-      <use
-         xlink:href="#glyph28-8"
-         x="548.057588"
-         y="348.492188"
-         id="use10055" />
-      <use
-         xlink:href="#glyph28-5"
-         x="553.724799"
-         y="348.492188"
-         id="use10057" />
-    </g>
-    <g
-       style="fill:rgb(100%,100%,100%);fill-opacity:1;"
-       id="g10069">
-      <use
-         xlink:href="#glyph0-14"
-         x="511.007813"
-         y="359.832031"
-         id="use10061" />
-      <use
-         xlink:href="#glyph0-6"
-         x="513.633325"
-         y="359.832031"
-         id="use10063" />
-      <use
-         xlink:href="#glyph0-15"
-         x="516.780249"
-         y="359.832031"
-         id="use10065" />
-      <use
-         xlink:href="#glyph0-11"
-         x="519.405762"
-         y="359.832031"
-         id="use10067" />
-    </g>
-    <path
-       style="fill-rule:evenodd;fill:rgb(50.588235%,72.941176%,80.784314%);fill-opacity:0.901961;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(100%,100%,100%);stroke-opacity:1;stroke-miterlimit:10;"
-       d="M 364.798039 16.863787 L 364.798039 -0.00129175 L -0.000236821 -0.00129175 L -0.000236821 16.863787 Z M 364.798039 16.863787 "
-       transform="matrix(0.70875,0,0,0.70875,198.91423,366.352478)"
-       id="path10071" />
-    <g
-       style="fill:rgb(100%,100%,100%);fill-opacity:1;"
-       id="g10089">
-      <use
-         xlink:href="#glyph29-0"
-         x="306.890625"
-         y="375.421875"
-         id="use10073" />
-      <use
-         xlink:href="#glyph29-1"
-         x="309.514392"
-         y="375.421875"
-         id="use10075" />
-      <use
-         xlink:href="#glyph29-2"
-         x="315.28299"
-         y="375.421875"
-         id="use10077" />
-      <use
-         xlink:href="#glyph29-3"
-         x="320.535135"
-         y="375.421875"
-         id="use10079" />
-      <use
-         xlink:href="#glyph29-4"
-         x="327.880759"
-         y="375.421875"
-         id="use10081" />
-      <use
-         xlink:href="#glyph29-5"
-         x="333.649357"
-         y="375.421875"
-         id="use10083" />
-      <use
-         xlink:href="#glyph29-6"
-         x="340.469307"
-         y="375.421875"
-         id="use10085" />
-      <use
-         xlink:href="#glyph29-7"
-         x="345.721451"
-         y="375.421875"
-         id="use10087" />
-    </g>
-    <path
-       style="fill-rule:evenodd;fill:rgb(50.588235%,72.941176%,80.784314%);fill-opacity:0.901961;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(100%,100%,100%);stroke-opacity:1;stroke-miterlimit:10;"
-       d="M 115.200379 28.397257 L 115.200379 0.00219597 L -0.000236821 0.00219597 L -0.000236821 28.397257 Z M 115.200379 28.397257 "
-       transform="matrix(0.70875,0,0,0.70875,198.91423,380.572662)"
-       id="path10091" />
-    <g
-       style="fill:rgb(100%,100%,100%);fill-opacity:1;"
-       id="g10111">
-      <use
-         xlink:href="#glyph30-0"
-         x="217.585938"
-         y="393.851563"
-         id="use10093" />
-      <use
-         xlink:href="#glyph30-1"
-         x="220.193132"
-         y="393.851563"
-         id="use10095" />
-      <use
-         xlink:href="#glyph30-2"
-         x="225.925294"
-         y="393.851563"
-         id="use10097" />
-      <use
-         xlink:href="#glyph30-3"
-         x="231.144265"
-         y="393.851563"
-         id="use10099" />
-      <use
-         xlink:href="#glyph30-4"
-         x="237.921138"
-         y="393.851563"
-         id="use10101" />
-      <use
-         xlink:href="#glyph30-5"
-         x="241.046106"
-         y="393.851563"
-         id="use10103" />
-      <use
-         xlink:href="#glyph30-6"
-         x="246.778268"
-         y="393.851563"
-         id="use10105" />
-      <use
-         xlink:href="#glyph30-7"
-         x="251.997239"
-         y="393.851563"
-         id="use10107" />
-      <use
-         xlink:href="#glyph30-8"
-         x="255.649144"
-         y="393.851563"
-         id="use10109" />
-    </g>
-    <path
-       style="fill-rule:evenodd;fill:rgb(50.588235%,72.941176%,80.784314%);fill-opacity:0.901961;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(100%,100%,100%);stroke-opacity:1;stroke-miterlimit:10;"
-       d="M 176.143454 35.807894 L 176.143454 -0.0000861166 L 0.0025835 -0.0000861166 L 0.0025835 35.807894 Z M 176.143454 35.807894 "
-       transform="matrix(0.70875,0,0,0.70875,198.814575,403.343811)"
-       id="path10113" />
-    <g
-       style="fill:rgb(100%,100%,100%);fill-opacity:1;"
-       id="g10135">
-      <use
-         xlink:href="#glyph31-0"
-         x="233.179688"
-         y="419.367188"
-         id="use10115" />
-      <use
-         xlink:href="#glyph31-1"
-         x="239.787793"
-         y="419.367188"
-         id="use10117" />
-      <use
-         xlink:href="#glyph31-2"
-         x="242.834938"
-         y="419.367188"
-         id="use10119" />
-      <use
-         xlink:href="#glyph31-3"
-         x="248.42435"
-         y="419.367188"
-         id="use10121" />
-      <use
-         xlink:href="#glyph31-4"
-         x="256.560496"
-         y="419.367188"
-         id="use10123" />
-      <use
-         xlink:href="#glyph31-5"
-         x="259.102762"
-         y="419.367188"
-         id="use10125" />
-      <use
-         xlink:href="#glyph31-6"
-         x="264.191763"
-         y="419.367188"
-         id="use10127" />
-      <use
-         xlink:href="#glyph31-2"
-         x="270.799869"
-         y="419.367188"
-         id="use10129" />
-      <use
-         xlink:href="#glyph31-7"
-         x="276.38928"
-         y="419.367188"
-         id="use10131" />
-      <use
-         xlink:href="#glyph31-8"
-         x="283.506733"
-         y="419.367188"
-         id="use10133" />
-    </g>
-    <path
-       style="fill-rule:evenodd;fill:rgb(50.588235%,72.941176%,80.784314%);fill-opacity:0.901961;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(100%,100%,100%);stroke-opacity:1;stroke-miterlimit:10;"
-       d="M 364.992038 35.804751 L 364.992038 0.00228209 L 0.000861166 0.00228209 L 0.000861166 35.804751 Z M 364.992038 35.804751 "
-       transform="matrix(0.70875,0,0,0.70875,198.776733,431.580414)"
-       id="path10137" />
-    <path
-       style="fill-rule:evenodd;fill:rgb(51.372549%,70.196078%,89.019608%);fill-opacity:1;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(100%,100%,100%);stroke-opacity:1;stroke-miterlimit:10;"
-       d="M 117.74041 35.804751 L 117.74041 0.00228209 L -0.000990341 0.00228209 L -0.000990341 35.804751 Z M 117.74041 35.804751 "
-       transform="matrix(0.70875,0,0,0.70875,477.008514,431.580414)"
-       id="path10155" />
-    <g
-       style="fill:rgb(100%,100%,100%);fill-opacity:1;"
-       id="g10177">
-      <use
-         xlink:href="#glyph32-0"
-         x="489.039063"
-         y="447.714844"
-         id="use10157" />
-      <use
-         xlink:href="#glyph32-1"
-         x="495.846879"
-         y="447.714844"
-         id="use10159" />
-      <use
-         xlink:href="#glyph32-2"
-         x="501.08968"
-         y="447.714844"
-         id="use10161" />
-      <use
-         xlink:href="#glyph32-3"
-         x="506.848015"
-         y="447.714844"
-         id="use10163" />
-      <use
-         xlink:href="#glyph32-4"
-         x="512.606351"
-         y="447.714844"
-         id="use10165" />
-      <use
-         xlink:href="#glyph32-5"
-         x="519.414167"
-         y="447.714844"
-         id="use10167" />
-      <use
-         xlink:href="#glyph32-0"
-         x="525.172503"
-         y="447.714844"
-         id="use10169" />
-      <use
-         xlink:href="#glyph32-1"
-         x="531.980319"
-         y="447.714844"
-         id="use10171" />
-      <use
-         xlink:href="#glyph32-2"
-         x="537.22312"
-         y="447.714844"
-         id="use10173" />
-      <use
-         xlink:href="#glyph32-3"
-         x="542.981455"
-         y="447.714844"
-         id="use10175" />
-    </g>
-    <path
-       style="fill-rule:evenodd;fill:rgb(50.588235%,72.941176%,80.784314%);fill-opacity:0.901961;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(100%,100%,100%);stroke-opacity:1;stroke-miterlimit:10;"
-       d="M 364.844025 35.805138 L 364.844025 0.00266962 L 0.00165774 0.00266962 L 0.00165774 35.805138 Z M 364.844025 35.805138 "
-       transform="matrix(0.70875,0,0,0.70875,198.881638,459.81842)"
-       id="path10179" />
-    <path
-       style="fill-rule:evenodd;fill:rgb(51.372549%,70.196078%,89.019608%);fill-opacity:1;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(100%,100%,100%);stroke-opacity:1;stroke-miterlimit:10;"
-       d="M 117.704155 35.805138 L 117.704155 0.00266962 L 0.00133481 0.00266962 L 0.00133481 35.805138 Z M 117.704155 35.805138 "
-       transform="matrix(0.70875,0,0,0.70875,477.022491,459.81842)"
-       id="path10197" />
-    <g
-       style="fill:rgb(100%,100%,100%);fill-opacity:1;"
-       id="g10229">
-      <use
-         xlink:href="#glyph0-10"
-         x="481.242188"
-         y="476.066406"
-         id="use10199" />
-      <use
-         xlink:href="#glyph0-8"
-         x="489.114111"
-         y="476.066406"
-         id="use10201" />
-      <use
-         xlink:href="#glyph0-6"
-         x="494.369751"
-         y="476.066406"
-         id="use10203" />
-      <use
-         xlink:href="#glyph0-19"
-         x="497.516675"
-         y="476.066406"
-         id="use10205" />
-      <use
-         xlink:href="#glyph0-30"
-         x="503.289112"
-         y="476.066406"
-         id="use10207" />
-      <use
-         xlink:href="#glyph0-31"
-         x="510.113599"
-         y="476.066406"
-         id="use10209" />
-      <use
-         xlink:href="#glyph0-16"
-         x="515.886036"
-         y="476.066406"
-         id="use10211" />
-      <use
-         xlink:href="#glyph0-8"
-         x="521.658472"
-         y="476.066406"
-         id="use10213" />
-      <use
-         xlink:href="#glyph0-32"
-         x="526.914112"
-         y="476.066406"
-         id="use10215" />
-      <use
-         xlink:href="#glyph0-31"
-         x="533.738599"
-         y="476.066406"
-         id="use10217" />
-      <use
-         xlink:href="#glyph0-3"
-         x="539.511036"
-         y="476.066406"
-         id="use10219" />
-      <use
-         xlink:href="#glyph0-14"
-         x="542.136549"
-         y="476.066406"
-         id="use10221" />
-      <use
-         xlink:href="#glyph0-6"
-         x="544.762061"
-         y="476.066406"
-         id="use10223" />
-      <use
-         xlink:href="#glyph0-15"
-         x="547.908985"
-         y="476.066406"
-         id="use10225" />
-      <use
-         xlink:href="#glyph0-11"
-         x="550.534498"
-         y="476.066406"
-         id="use10227" />
-    </g>
-    <path
-       style="fill-rule:evenodd;fill:rgb(51.372549%,70.196078%,89.019608%);fill-opacity:1;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(100%,100%,100%);stroke-opacity:1;stroke-miterlimit:10;"
-       d="M 117.200588 35.808884 L 117.200588 0.000904225 L -0.000688933 0.000904225 L -0.000688933 35.808884 Z M 117.200588 35.808884 "
-       transform="matrix(0.70875,0,0,0.70875,477.164551,403.343109)"
-       id="path10231" />
-    <g
-       style="fill:rgb(100%,100%,100%);fill-opacity:1;"
-       id="g10263">
-      <use
-         xlink:href="#glyph33-0"
-         x="480.53125"
-         y="413.695313"
-         id="use10233" />
-      <use
-         xlink:href="#glyph33-1"
-         x="487.211809"
-         y="413.695313"
-         id="use10235" />
-      <use
-         xlink:href="#glyph33-2"
-         x="490.292364"
-         y="413.695313"
-         id="use10237" />
-      <use
-         xlink:href="#glyph33-3"
-         x="495.943059"
-         y="413.695313"
-         id="use10239" />
-      <use
-         xlink:href="#glyph33-4"
-         x="504.168412"
-         y="413.695313"
-         id="use10241" />
-      <use
-         xlink:href="#glyph33-5"
-         x="506.738553"
-         y="413.695313"
-         id="use10243" />
-      <use
-         xlink:href="#glyph33-6"
-         x="511.883351"
-         y="413.695313"
-         id="use10245" />
-      <use
-         xlink:href="#glyph33-7"
-         x="518.56391"
-         y="413.695313"
-         id="use10247" />
-      <use
-         xlink:href="#glyph33-8"
-         x="524.214605"
-         y="413.695313"
-         id="use10249" />
-      <use
-         xlink:href="#glyph33-9"
-         x="529.865301"
-         y="413.695313"
-         id="use10251" />
-      <use
-         xlink:href="#glyph33-10"
-         x="535.010099"
-         y="413.695313"
-         id="use10253" />
-      <use
-         xlink:href="#glyph33-7"
-         x="541.690657"
-         y="413.695313"
-         id="use10255" />
-      <use
-         xlink:href="#glyph33-4"
-         x="547.341353"
-         y="413.695313"
-         id="use10257" />
-      <use
-         xlink:href="#glyph33-11"
-         x="549.911493"
-         y="413.695313"
-         id="use10259" />
-      <use
-         xlink:href="#glyph33-1"
-         x="552.481634"
-         y="413.695313"
-         id="use10261" />
-    </g>
-    <g
-       style="fill:rgb(100%,100%,100%);fill-opacity:1;"
-       id="g10269">
-      <use
-         xlink:href="#glyph0-15"
-         x="513.84375"
-         y="425.035156"
-         id="use10265" />
-      <use
-         xlink:href="#glyph0-11"
-         x="516.469263"
-         y="425.035156"
-         id="use10267" />
-    </g>
-    <path
-       style="fill-rule:evenodd;fill:rgb(51.372549%,70.196078%,89.019608%);fill-opacity:1;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(100%,100%,100%);stroke-opacity:1;stroke-miterlimit:10;"
-       d="M 117.640084 16.085508 L 117.640084 -0.00245432 L -0.00210986 -0.00245432 L -0.00210986 16.085508 Z M 117.640084 16.085508 "
-       transform="matrix(0.70875,0,0,0.70875,477.044464,366.62674)"
-       id="path10271" />
-    <g
-       style="fill:rgb(100%,100%,100%);fill-opacity:1;"
-       id="g10305">
-      <use
-         xlink:href="#glyph15-15"
-         x="485.492188"
-         y="375.132813"
-         id="use10273" />
-      <use
-         xlink:href="#glyph15-8"
-         x="487.592598"
-         y="375.132813"
-         id="use10275" />
-      <use
-         xlink:href="#glyph15-16"
-         x="492.210547"
-         y="375.132813"
-         id="use10277" />
-      <use
-         xlink:href="#glyph15-17"
-         x="496.415059"
-         y="375.132813"
-         id="use10279" />
-      <use
-         xlink:href="#glyph15-18"
-         x="502.295469"
-         y="375.132813"
-         id="use10281" />
-      <use
-         xlink:href="#glyph15-6"
-         x="506.913418"
-         y="375.132813"
-         id="use10283" />
-      <use
-         xlink:href="#glyph15-7"
-         x="512.373008"
-         y="375.132813"
-         id="use10285" />
-      <use
-         xlink:href="#glyph15-8"
-         x="516.990957"
-         y="375.132813"
-         id="use10287" />
-      <use
-         xlink:href="#glyph15-4"
-         x="521.608907"
-         y="375.132813"
-         id="use10289" />
-      <use
-         xlink:href="#glyph15-19"
-         x="525.813419"
-         y="375.132813"
-         id="use10291" />
-      <use
-         xlink:href="#glyph15-7"
-         x="531.273008"
-         y="375.132813"
-         id="use10293" />
-      <use
-         xlink:href="#glyph15-20"
-         x="535.890958"
-         y="375.132813"
-         id="use10295" />
-      <use
-         xlink:href="#glyph15-21"
-         x="537.991368"
-         y="375.132813"
-         id="use10297" />
-      <use
-         xlink:href="#glyph15-1"
-         x="540.091778"
-         y="375.132813"
-         id="use10299" />
-      <use
-         xlink:href="#glyph15-15"
-         x="542.609317"
-         y="375.132813"
-         id="use10301" />
-      <use
-         xlink:href="#glyph15-13"
-         x="544.709727"
-         y="375.132813"
-         id="use10303" />
-    </g>
-    <path
-       style="fill-rule:evenodd;fill:rgb(51.372549%,70.196078%,89.019608%);fill-opacity:1;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(100%,100%,100%);stroke-opacity:1;stroke-miterlimit:10;"
-       d="M 117.719182 28.39743 L 117.719182 0.00236821 L -0.000172233 0.00236821 L -0.000172233 28.39743 Z M 117.719182 28.39743 "
-       transform="matrix(0.70875,0,0,0.70875,477.015747,380.57254)"
-       id="path10307" />
-    <g
-       style="fill:rgb(100%,100%,100%);fill-opacity:1;"
-       id="g10341">
-      <use
-         xlink:href="#glyph15-15"
-         x="486.910156"
-         y="393.558594"
-         id="use10309" />
-      <use
-         xlink:href="#glyph15-8"
-         x="489.010566"
-         y="393.558594"
-         id="use10311" />
-      <use
-         xlink:href="#glyph15-16"
-         x="493.628516"
-         y="393.558594"
-         id="use10313" />
-      <use
-         xlink:href="#glyph15-19"
-         x="497.833027"
-         y="393.558594"
-         id="use10315" />
-      <use
-         xlink:href="#glyph15-1"
-         x="503.292617"
-         y="393.558594"
-         id="use10317" />
-      <use
-         xlink:href="#glyph15-6"
-         x="505.810157"
-         y="393.558594"
-         id="use10319" />
-      <use
-         xlink:href="#glyph15-7"
-         x="511.269746"
-         y="393.558594"
-         id="use10321" />
-      <use
-         xlink:href="#glyph15-8"
-         x="515.887696"
-         y="393.558594"
-         id="use10323" />
-      <use
-         xlink:href="#glyph15-4"
-         x="520.505645"
-         y="393.558594"
-         id="use10325" />
-      <use
-         xlink:href="#glyph15-19"
-         x="524.710157"
-         y="393.558594"
-         id="use10327" />
-      <use
-         xlink:href="#glyph15-7"
-         x="530.169747"
-         y="393.558594"
-         id="use10329" />
-      <use
-         xlink:href="#glyph15-20"
-         x="534.787696"
-         y="393.558594"
-         id="use10331" />
-      <use
-         xlink:href="#glyph15-21"
-         x="536.888106"
-         y="393.558594"
-         id="use10333" />
-      <use
-         xlink:href="#glyph15-1"
-         x="538.988516"
-         y="393.558594"
-         id="use10335" />
-      <use
-         xlink:href="#glyph15-15"
-         x="541.506055"
-         y="393.558594"
-         id="use10337" />
-      <use
-         xlink:href="#glyph15-13"
-         x="543.606466"
-         y="393.558594"
-         id="use10339" />
-    </g>
-    <path
-       style="fill-rule:evenodd;fill:rgb(17.647059%,63.529412%,74.901961%);fill-opacity:1;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(17.647059%,63.529412%,74.901961%);stroke-opacity:1;stroke-miterlimit:10;"
-       d="M 57.219393 50.228467 L 57.219393 0.00249738 L -0.000624346 0.00249738 L -0.000624346 50.228467 Z M 57.219393 50.228467 "
-       transform="matrix(0.70875,0,0,0.70875,240.957474,233.744324)"
-       id="path10343" />
-    <path
-       style="fill-rule:evenodd;fill:rgb(17.647059%,63.529412%,74.901961%);fill-opacity:1;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(17.647059%,63.529412%,74.901961%);stroke-opacity:1;stroke-miterlimit:10;"
-       d="M 58.262889 50.228639 L 58.262889 0.00266962 L 0.00120563 0.00266962 L 0.00120563 50.228639 Z M 58.262889 50.228639 "
-       transform="matrix(0.70875,0,0,0.70875,283.397583,233.744202)"
-       id="path10381" />
-    <path
-       style="fill-rule:evenodd;fill:rgb(17.647059%,63.529412%,74.901961%);fill-opacity:1;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(17.647059%,63.529412%,74.901961%);stroke-opacity:1;stroke-miterlimit:10;"
-       d="M 60.800402 50.228639 L 60.800402 0.00266962 L -0.0020668 0.00266962 L -0.0020668 50.228639 Z M 60.800402 50.228639 "
-       transform="matrix(0.70875,0,0,0.70875,326.575684,233.744202)"
-       id="path10425" />
-    <path
-       style="fill-rule:evenodd;fill:rgb(17.647059%,63.529412%,74.901961%);fill-opacity:1;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(17.647059%,63.529412%,74.901961%);stroke-opacity:1;stroke-miterlimit:10;"
-       d="M 61.062024 50.228639 L 61.062024 0.00266962 L 0.0005167 0.00266962 L 0.0005167 50.228639 Z M 61.062024 50.228639 "
-       transform="matrix(0.70875,0,0,0.70875,371.234009,233.744202)"
-       id="path10471" />
-    <path
-       style="fill-rule:evenodd;fill:rgb(17.647059%,63.529412%,74.901961%);fill-opacity:1;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(17.647059%,63.529412%,74.901961%);stroke-opacity:1;stroke-miterlimit:10;"
-       d="M 59.92438 50.228639 L 59.92438 0.00266962 L -0.00176539 0.00266962 L -0.00176539 50.228639 Z M 59.92438 50.228639 "
-       transform="matrix(0.70875,0,0,0.70875,416.130157,233.744202)"
-       id="path10511" />
-    <path
-       style="fill-rule:evenodd;fill:rgb(50.588235%,72.941176%,80.784314%);fill-opacity:0.901961;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(100%,100%,100%);stroke-opacity:1;stroke-miterlimit:10;"
-       d="M 55.198214 28.000604 L 55.198214 0.00236821 L 0.000904225 0.00236821 L 0.000904225 28.000604 Z M 55.198214 28.000604 "
-       transform="matrix(0.70875,0,0,0.70875,284.483734,380.57254)"
-       id="path10545" />
-    <g
-       style="fill:rgb(100%,100%,100%);fill-opacity:1;"
-       id="g10563">
-      <use
-         xlink:href="#glyph15-15"
-         x="288.460938"
-         y="388.597656"
-         id="use10547" />
-      <use
-         xlink:href="#glyph15-8"
-         x="290.561348"
-         y="388.597656"
-         id="use10549" />
-      <use
-         xlink:href="#glyph15-16"
-         x="295.179297"
-         y="388.597656"
-         id="use10551" />
-      <use
-         xlink:href="#glyph15-19"
-         x="299.383809"
-         y="388.597656"
-         id="use10553" />
-      <use
-         xlink:href="#glyph15-1"
-         x="304.843399"
-         y="388.597656"
-         id="use10555" />
-      <use
-         xlink:href="#glyph15-22"
-         x="307.360938"
-         y="388.597656"
-         id="use10557" />
-      <use
-         xlink:href="#glyph15-20"
-         x="311.978887"
-         y="388.597656"
-         id="use10559" />
-      <use
-         xlink:href="#glyph15-18"
-         x="314.079297"
-         y="388.597656"
-         id="use10561" />
-    </g>
-    <g
-       style="fill:rgb(100%,100%,100%);fill-opacity:1;"
-       id="g10579">
-      <use
-         xlink:href="#glyph15-3"
-         x="287.753906"
-         y="398.519531"
-         id="use10565" />
-      <use
-         xlink:href="#glyph15-2"
-         x="291.958418"
-         y="398.519531"
-         id="use10567" />
-      <use
-         xlink:href="#glyph15-15"
-         x="294.900469"
-         y="398.519531"
-         id="use10569" />
-      <use
-         xlink:href="#glyph15-8"
-         x="297.000879"
-         y="398.519531"
-         id="use10571" />
-      <use
-         xlink:href="#glyph15-23"
-         x="301.618828"
-         y="398.519531"
-         id="use10573" />
-      <use
-         xlink:href="#glyph15-4"
-         x="307.916367"
-         y="398.519531"
-         id="use10575" />
-      <use
-         xlink:href="#glyph15-24"
-         x="312.120879"
-         y="398.519531"
-         id="use10577" />
-    </g>
-    <path
-       style="fill-rule:evenodd;fill:rgb(50.588235%,72.941176%,80.784314%);fill-opacity:0.901961;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(100%,100%,100%);stroke-opacity:1;stroke-miterlimit:10;"
-       d="M 116.943401 28.39885 L 116.943401 -0.00172233 L 0.00116257 -0.00172233 L 0.00116257 28.39885 Z M 116.943401 28.39885 "
-       transform="matrix(0.70875,0,0,0.70875,328.190582,380.430908)"
-       id="path10581" />
-    <g
-       style="fill:rgb(100%,100%,100%);fill-opacity:1;"
-       id="g10599">
-      <use
-         xlink:href="#glyph0-15"
-         x="346.578125"
-         y="393.851563"
-         id="use10583" />
-      <use
-         xlink:href="#glyph0-16"
-         x="349.203638"
-         y="393.851563"
-         id="use10585" />
-      <use
-         xlink:href="#glyph0-17"
-         x="354.976074"
-         y="393.851563"
-         id="use10587" />
-      <use
-         xlink:href="#glyph0-32"
-         x="360.231714"
-         y="393.851563"
-         id="use10589" />
-      <use
-         xlink:href="#glyph0-6"
-         x="367.056201"
-         y="393.851563"
-         id="use10591" />
-      <use
-         xlink:href="#glyph0-25"
-         x="370.203125"
-         y="393.851563"
-         id="use10593" />
-      <use
-         xlink:href="#glyph0-9"
-         x="377.553638"
-         y="393.851563"
-         id="use10595" />
-      <use
-         xlink:href="#glyph0-26"
-         x="385.956202"
-         y="393.851563"
-         id="use10597" />
-    </g>
-    <path
-       style="fill-rule:evenodd;fill:rgb(50.588235%,72.941176%,80.784314%);fill-opacity:0.901961;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(100%,100%,100%);stroke-opacity:1;stroke-miterlimit:10;"
-       d="M 59.998053 28.198974 L 59.998053 0.00232515 L 0.00025835 0.00232515 L 0.00025835 28.198974 Z M 59.998053 28.198974 "
-       transform="matrix(0.70875,0,0,0.70875,414.941223,380.572571)"
-       id="path10601" />
-    <g
-       style="fill:rgb(100%,100%,100%);fill-opacity:1;"
-       id="g10621">
-      <use
-         xlink:href="#glyph15-15"
-         x="416.746094"
-         y="388.597656"
-         id="use10603" />
-      <use
-         xlink:href="#glyph15-8"
-         x="418.846504"
-         y="388.597656"
-         id="use10605" />
-      <use
-         xlink:href="#glyph15-16"
-         x="423.464453"
-         y="388.597656"
-         id="use10607" />
-      <use
-         xlink:href="#glyph15-19"
-         x="427.668965"
-         y="388.597656"
-         id="use10609" />
-      <use
-         xlink:href="#glyph15-1"
-         x="433.128555"
-         y="388.597656"
-         id="use10611" />
-      <use
-         xlink:href="#glyph15-25"
-         x="435.646094"
-         y="388.597656"
-         id="use10613" />
-      <use
-         xlink:href="#glyph15-26"
-         x="440.264043"
-         y="388.597656"
-         id="use10615" />
-      <use
-         xlink:href="#glyph15-2"
-         x="444.881993"
-         y="388.597656"
-         id="use10617" />
-      <use
-         xlink:href="#glyph15-3"
-         x="447.824043"
-         y="388.597656"
-         id="use10619" />
-    </g>
-    <g
-       style="fill:rgb(100%,100%,100%);fill-opacity:1;"
-       id="g10637">
-      <use
-         xlink:href="#glyph15-4"
-         x="418.871094"
-         y="398.519531"
-         id="use10623" />
-      <use
-         xlink:href="#glyph15-8"
-         x="423.075606"
-         y="398.519531"
-         id="use10625" />
-      <use
-         xlink:href="#glyph15-15"
-         x="427.693555"
-         y="398.519531"
-         id="use10627" />
-      <use
-         xlink:href="#glyph15-8"
-         x="429.793965"
-         y="398.519531"
-         id="use10629" />
-      <use
-         xlink:href="#glyph15-23"
-         x="434.411914"
-         y="398.519531"
-         id="use10631" />
-      <use
-         xlink:href="#glyph15-4"
-         x="440.709453"
-         y="398.519531"
-         id="use10633" />
-      <use
-         xlink:href="#glyph15-24"
-         x="444.913965"
-         y="398.519531"
-         id="use10635" />
-    </g>
-    <path
-       style="fill-rule:evenodd;fill:rgb(50.588235%,72.941176%,80.784314%);fill-opacity:0.901961;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(100%,100%,100%);stroke-opacity:1;stroke-miterlimit:10;"
-       d="M 59.998053 35.807894 L 59.998053 -0.0000861166 L 0.00025835 -0.0000861166 L 0.00025835 35.807894 Z M 59.998053 35.807894 "
-       transform="matrix(0.70875,0,0,0.70875,414.941223,403.343811)"
-       id="path10639" />
-    <path
-       style="fill-rule:evenodd;fill:rgb(50.588235%,72.941176%,80.784314%);fill-opacity:0.901961;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(100%,100%,100%);stroke-opacity:1;stroke-miterlimit:10;"
-       d="M 116.943315 35.808884 L 116.943315 0.000904225 L 0.00107646 0.000904225 L 0.00107646 35.808884 Z M 116.943315 35.808884 "
-       transform="matrix(0.70875,0,0,0.70875,328.190643,403.343109)"
-       id="path10671" />
-    <g
-       style="fill:rgb(100%,100%,100%);fill-opacity:1;"
-       id="g10701">
-      <use
-         xlink:href="#glyph38-0"
-         x="335.238281"
-         y="413.695313"
-         id="use10673" />
-      <use
-         xlink:href="#glyph38-1"
-         x="341.887625"
-         y="413.695313"
-         id="use10675" />
-      <use
-         xlink:href="#glyph38-2"
-         x="344.953786"
-         y="413.695313"
-         id="use10677" />
-      <use
-         xlink:href="#glyph38-3"
-         x="350.578078"
-         y="413.695313"
-         id="use10679" />
-      <use
-         xlink:href="#glyph38-4"
-         x="358.764998"
-         y="413.695313"
-         id="use10681" />
-      <use
-         xlink:href="#glyph38-5"
-         x="361.323129"
-         y="413.695313"
-         id="use10683" />
-      <use
-         xlink:href="#glyph38-6"
-         x="366.443888"
-         y="413.695313"
-         id="use10685" />
-      <use
-         xlink:href="#glyph38-3"
-         x="373.605756"
-         y="413.695313"
-         id="use10687" />
-      <use
-         xlink:href="#glyph38-7"
-         x="381.792676"
-         y="413.695313"
-         id="use10689" />
-      <use
-         xlink:href="#glyph38-8"
-         x="387.416968"
-         y="413.695313"
-         id="use10691" />
-      <use
-         xlink:href="#glyph38-9"
-         x="394.066312"
-         y="413.695313"
-         id="use10693" />
-      <use
-         xlink:href="#glyph38-4"
-         x="397.649494"
-         y="413.695313"
-         id="use10695" />
-      <use
-         xlink:href="#glyph38-1"
-         x="400.207625"
-         y="413.695313"
-         id="use10697" />
-      <use
-         xlink:href="#glyph38-10"
-         x="403.273786"
-         y="413.695313"
-         id="use10699" />
-    </g>
-    <g
-       style="fill:rgb(100%,100%,100%);fill-opacity:1;"
-       id="g10709">
-      <use
-         xlink:href="#glyph0-5"
-         x="360.753906"
-         y="425.035156"
-         id="use10703" />
-      <use
-         xlink:href="#glyph0-1"
-         x="367.056983"
-         y="425.035156"
-         id="use10705" />
-      <use
-         xlink:href="#glyph0-4"
-         x="372.312622"
-         y="425.035156"
-         id="use10707" />
-    </g>
-    <path
-       style="fill-rule:evenodd;fill:rgb(93.333333%,48.627451%,19.215686%);fill-opacity:1;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(93.333333%,48.627451%,19.215686%);stroke-opacity:1;stroke-miterlimit:10;"
-       d="M 84.107714 75.365263 L 84.107714 0.00150704 L -0.0027342 0.00150704 L -0.0027342 75.365263 Z M 84.107714 75.365263 "
-       transform="matrix(0.70875,0,0,0.70875,130.732407,178.037994)"
-       id="path10711" />
-    <g
-       style="fill:rgb(100%,100%,100%);fill-opacity:1;"
-       id="g10725">
-      <use
-         xlink:href="#glyph39-0"
-         x="146.710938"
-         y="186.894531"
-         id="use10713" />
-      <use
-         xlink:href="#glyph39-1"
-         x="153.356088"
-         y="186.894531"
-         id="use10715" />
-      <use
-         xlink:href="#glyph39-2"
-         x="158.976834"
-         y="186.894531"
-         id="use10717" />
-      <use
-         xlink:href="#glyph39-2"
-         x="162.041062"
-         y="186.894531"
-         id="use10719" />
-      <use
-         xlink:href="#glyph39-3"
-         x="165.105289"
-         y="186.894531"
-         id="use10721" />
-      <use
-         xlink:href="#glyph39-4"
-         x="170.222819"
-         y="186.894531"
-         id="use10723" />
-    </g>
-    <path
-       style="fill-rule:evenodd;fill:rgb(17.647059%,63.529412%,74.901961%);fill-opacity:1;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(17.647059%,63.529412%,74.901961%);stroke-opacity:1;stroke-miterlimit:10;"
-       d="M 369.600203 23.105563 L 369.600203 0.00150704 L 0.00144245 0.00150704 L 0.00144245 23.105563 Z M 369.600203 23.105563 "
-       transform="matrix(0.70875,0,0,0.70875,196.748978,178.037994)"
-       id="path10727" />
-    <g
-       style="fill:rgb(100%,100%,100%);fill-opacity:1;"
-       id="g10741">
-      <use
-         xlink:href="#glyph40-0"
-         x="311.140625"
-         y="189.730469"
-         id="use10729" />
-      <use
-         xlink:href="#glyph40-1"
-         x="317.846676"
-         y="189.730469"
-         id="use10731" />
-      <use
-         xlink:href="#glyph40-2"
-         x="323.518935"
-         y="189.730469"
-         id="use10733" />
-      <use
-         xlink:href="#glyph40-3"
-         x="326.611245"
-         y="189.730469"
-         id="use10735" />
-      <use
-         xlink:href="#glyph40-4"
-         x="333.317296"
-         y="189.730469"
-         id="use10737" />
-      <use
-         xlink:href="#glyph40-1"
-         x="338.989555"
-         y="189.730469"
-         id="use10739" />
-    </g>
-    <path
-       style="fill-rule:evenodd;fill:rgb(20.392157%,59.607843%,85.882353%);fill-opacity:1;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(20.392157%,59.607843%,85.882353%);stroke-opacity:1;stroke-miterlimit:10;"
-       d="M 121.547109 74.573011 L 121.547109 -0.00260503 L -0.00271267 -0.00260503 L -0.00271267 74.573011 Z M 121.547109 74.573011 "
-       transform="matrix(0.70875,0,0,0.70875,475.185516,178.103409)"
-       id="path10743" />
-    <g
-       style="fill:rgb(100%,100%,100%);fill-opacity:1;"
-       id="g10763">
-      <use
-         xlink:href="#glyph41-0"
-         x="494"
-         y="208.160156"
-         id="use10745" />
-      <use
-         xlink:href="#glyph41-1"
-         x="500.647874"
-         y="208.160156"
-         id="use10747" />
-      <use
-         xlink:href="#glyph41-2"
-         x="506.270923"
-         y="208.160156"
-         id="use10749" />
-      <use
-         xlink:href="#glyph41-3"
-         x="509.336406"
-         y="208.160156"
-         id="use10751" />
-      <use
-         xlink:href="#glyph41-1"
-         x="515.98428"
-         y="208.160156"
-         id="use10753" />
-      <use
-         xlink:href="#glyph41-4"
-         x="521.607329"
-         y="208.160156"
-         id="use10755" />
-      <use
-         xlink:href="#glyph41-5"
-         x="527.230379"
-         y="208.160156"
-         id="use10757" />
-      <use
-         xlink:href="#glyph41-6"
-         x="532.350006"
-         y="208.160156"
-         id="use10759" />
-      <use
-         xlink:href="#glyph41-7"
-         x="538.997879"
-         y="208.160156"
-         id="use10761" />
-    </g>
-    <path
-       style="fill-rule:evenodd;fill:rgb(17.647059%,63.529412%,74.901961%);fill-opacity:1;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(17.647059%,63.529412%,74.901961%);stroke-opacity:1;stroke-miterlimit:10;"
-       d="M 369.600203 24.054999 L 369.600203 -0.00254044 L 0.00144245 -0.00254044 L 0.00144245 24.054999 Z M 369.600203 24.054999 "
-       transform="matrix(0.70875,0,0,0.70875,196.748978,196.220551)"
-       id="path10765" />
-    <g
-       style="fill:rgb(100%,100%,100%);fill-opacity:1;"
-       id="g10789">
-      <use
-         xlink:href="#glyph42-0"
-         x="304.054688"
-         y="208.160156"
-         id="use10767" />
-      <use
-         xlink:href="#glyph42-1"
-         x="308.980388"
-         y="208.160156"
-         id="use10769" />
-      <use
-         xlink:href="#glyph42-2"
-         x="311.929753"
-         y="208.160156"
-         id="use10771" />
-      <use
-         xlink:href="#glyph42-3"
-         x="317.339807"
-         y="208.160156"
-         id="use10773" />
-      <use
-         xlink:href="#glyph42-3"
-         x="320.289173"
-         y="208.160156"
-         id="use10775" />
-      <use
-         xlink:href="#glyph42-4"
-         x="323.238538"
-         y="208.160156"
-         id="use10777" />
-      <use
-         xlink:href="#glyph42-5"
-         x="328.164239"
-         y="208.160156"
-         id="use10779" />
-      <use
-         xlink:href="#glyph42-6"
-         x="333.089939"
-         y="208.160156"
-         id="use10781" />
-      <use
-         xlink:href="#glyph42-1"
-         x="338.015639"
-         y="208.160156"
-         id="use10783" />
-      <use
-         xlink:href="#glyph42-7"
-         x="340.965005"
-         y="208.160156"
-         id="use10785" />
-      <use
-         xlink:href="#glyph42-8"
-         x="346.375058"
-         y="208.160156"
-         id="use10787" />
-    </g>
-    <path
-       style="fill-rule:evenodd;fill:rgb(17.647059%,63.529412%,74.901961%);fill-opacity:1;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(17.647059%,63.529412%,74.901961%);stroke-opacity:1;stroke-miterlimit:10;"
-       d="M 369.676201 23.497028 L 369.676201 0.00165774 L 0.000279879 0.00165774 L 0.000279879 23.497028 Z M 369.676201 23.497028 "
-       transform="matrix(0.70875,0,0,0.70875,196.695114,214.893356)"
-       id="path10791" />
-    <g
-       style="fill:rgb(100%,100%,100%);fill-opacity:1;"
-       id="g10813">
-      <use
-         xlink:href="#glyph43-0"
-         x="306.179688"
-         y="226.585938"
-         id="use10793" />
-      <use
-         xlink:href="#glyph43-1"
-         x="311.236234"
-         y="226.585938"
-         id="use10795" />
-      <use
-         xlink:href="#glyph43-2"
-         x="314.263947"
-         y="226.585938"
-         id="use10797" />
-      <use
-         xlink:href="#glyph43-3"
-         x="319.817714"
-         y="226.585938"
-         id="use10799" />
-      <use
-         xlink:href="#glyph43-3"
-         x="322.845427"
-         y="226.585938"
-         id="use10801" />
-      <use
-         xlink:href="#glyph43-4"
-         x="325.87314"
-         y="226.585938"
-         id="use10803" />
-      <use
-         xlink:href="#glyph43-5"
-         x="330.929687"
-         y="226.585938"
-         id="use10805" />
-      <use
-         xlink:href="#glyph43-5"
-         x="334.467938"
-         y="226.585938"
-         id="use10807" />
-      <use
-         xlink:href="#glyph43-4"
-         x="338.006189"
-         y="226.585938"
-         id="use10809" />
-      <use
-         xlink:href="#glyph43-6"
-         x="343.062736"
-         y="226.585938"
-         id="use10811" />
-    </g>
-    <path
-       style="fill-rule:evenodd;fill:rgb(94.901961%,63.529412%,50.980392%);fill-opacity:1;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(100%,76.470588%,48.627451%);stroke-opacity:1;stroke-miterlimit:10;"
-       d="M 76.798286 62.399286 L 76.798286 -0.00150704 L 0.0015501 -0.00150704 L 0.0015501 62.399286 Z M 76.798286 62.399286 "
-       transform="matrix(0.70875,0,0,0.70875,133.244995,187.676849)"
-       id="path10815" />
-    <g
-       style="fill:rgb(100%,100%,100%);fill-opacity:1;"
-       id="g10825">
-      <use
-         xlink:href="#glyph15-27"
-         x="150.253906"
-         y="195.820313"
-         id="use10817" />
-      <use
-         xlink:href="#glyph15-20"
-         x="155.296367"
-         y="195.820313"
-         id="use10819" />
-      <use
-         xlink:href="#glyph15-3"
-         x="157.396777"
-         y="195.820313"
-         id="use10821" />
-      <use
-         xlink:href="#glyph15-28"
-         x="161.601289"
-         y="195.820313"
-         id="use10823" />
-    </g>
-    <path
-       style=" stroke:none;fill-rule:evenodd;fill:url(#linear2);"
-       d="M 460.261719 269.324219 L 472.167969 269.324219 L 472.167969 61.554688 L 460.261719 61.554688 Z M 460.261719 269.324219 "
-       id="path10827" />
-    <g
-       style="fill:rgb(100%,100%,100%);fill-opacity:1;"
-       id="g10835">
-      <use
-         xlink:href="#glyph26-0"
-         x="463.894531"
-         y="159.203125"
-         id="use10829" />
-      <use
-         xlink:href="#glyph26-0"
-         x="463.894532"
-         y="162.673047"
-         id="use10831" />
-      <use
-         xlink:href="#glyph26-0"
-         x="463.894532"
-         y="166.142968"
-         id="use10833" />
-    </g>
-    <path
-       style="fill-rule:evenodd;fill:rgb(94.901961%,63.529412%,50.980392%);fill-opacity:1;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(100%,76.470588%,48.627451%);stroke-opacity:1;stroke-miterlimit:10;"
-       d="M 76.929291 13.789381 L 76.929291 -0.000301408 L 0.000279879 -0.000301408 L 0.000279879 13.789381 Z M 76.929291 13.789381 "
-       transform="matrix(0.70875,0,0,0.70875,133.429489,311.925995)"
-       id="path10837" />
-    <g
-       style="fill:rgb(100%,100%,100%);fill-opacity:1;"
-       id="g10845">
-      <use
-         xlink:href="#glyph0-0"
-         x="150.964844"
-         y="320.140625"
-         id="use10839" />
-      <use
-         xlink:href="#glyph0-3"
-         x="157.789331"
-         y="320.140625"
-         id="use10841" />
-      <use
-         xlink:href="#glyph0-9"
-         x="160.414844"
-         y="320.140625"
-         id="use10843" />
-    </g>
-    <path
-       style="fill-rule:evenodd;fill:rgb(94.901961%,63.529412%,50.980392%);fill-opacity:1;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(100%,76.470588%,48.627451%);stroke-opacity:1;stroke-miterlimit:10;"
-       d="M 70.402749 9.597741 L 70.402749 0.00228209 L -0.000688933 0.00228209 L -0.000688933 9.597741 Z M 70.402749 9.597741 "
-       transform="matrix(0.70875,0,0,0.70875,135.703613,204.529633)"
-       id="path10847" />
-    <g
-       style="fill:rgb(100%,100%,100%);fill-opacity:1;"
-       id="g10861">
-      <use
-         xlink:href="#glyph44-0"
-         x="147.421875"
-         y="211.703125"
-         id="use10849" />
-      <use
-         xlink:href="#glyph44-1"
-         x="153.04368"
-         y="211.703125"
-         id="use10851" />
-      <use
-         xlink:href="#glyph44-2"
-         x="157.731262"
-         y="211.703125"
-         id="use10853" />
-      <use
-         xlink:href="#glyph44-3"
-         x="160.538049"
-         y="211.703125"
-         id="use10855" />
-      <use
-         xlink:href="#glyph44-4"
-         x="165.225631"
-         y="211.703125"
-         id="use10857" />
-      <use
-         xlink:href="#glyph44-2"
-         x="170.374151"
-         y="211.703125"
-         id="use10859" />
-    </g>
-    <path
-       style="fill-rule:evenodd;fill:rgb(20.392157%,59.607843%,85.882353%);fill-opacity:1;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(20.392157%,59.607843%,85.882353%);stroke-opacity:1;stroke-miterlimit:10;"
-       d="M 121.47335 39.243344 L 121.47335 0.00172233 L 0.000688933 0.00172233 L 0.000688933 39.243344 Z M 121.47335 39.243344 "
-       transform="matrix(0.70875,0,0,0.70875,475.237793,147.955811)"
-       id="path10863" />
-    <g
-       style="fill:rgb(100%,100%,100%);fill-opacity:1;"
-       id="g10887">
-      <use
-         xlink:href="#glyph45-0"
-         x="489.039063"
-         y="164.925781"
-         id="use10865" />
-      <use
-         xlink:href="#glyph45-1"
-         x="495.199122"
-         y="164.925781"
-         id="use10867" />
-      <use
-         xlink:href="#glyph45-2"
-         x="500.335512"
-         y="164.925781"
-         id="use10869" />
-      <use
-         xlink:href="#glyph45-3"
-         x="505.471901"
-         y="164.925781"
-         id="use10871" />
-      <use
-         xlink:href="#glyph45-4"
-         x="511.113361"
-         y="164.925781"
-         id="use10873" />
-      <use
-         xlink:href="#glyph45-5"
-         x="514.188881"
-         y="164.925781"
-         id="use10875" />
-      <use
-         xlink:href="#glyph45-6"
-         x="520.858521"
-         y="164.925781"
-         id="use10877" />
-      <use
-         xlink:href="#glyph45-7"
-         x="526.499981"
-         y="164.925781"
-         id="use10879" />
-      <use
-         xlink:href="#glyph45-8"
-         x="532.141442"
-         y="164.925781"
-         id="use10881" />
-      <use
-         xlink:href="#glyph45-9"
-         x="537.277831"
-         y="164.925781"
-         id="use10883" />
-      <use
-         xlink:href="#glyph45-4"
-         x="543.947471"
-         y="164.925781"
-         id="use10885" />
-    </g>
-    <path
-       style="fill-rule:evenodd;fill:rgb(94.901961%,63.529412%,50.980392%);fill-opacity:1;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(100%,76.470588%,48.627451%);stroke-opacity:1;stroke-miterlimit:10;"
-       d="M 70.398357 9.197707 L 70.398357 -0.000925754 L 0.000430583 -0.000925754 L 0.000430583 9.197707 Z M 70.398357 9.197707 "
-       transform="matrix(0.70875,0,0,0.70875,135.589539,196.4655)"
-       id="path10889" />
-    <g
-       style="fill:rgb(100%,100%,100%);fill-opacity:1;"
-       id="g10903">
-      <use
-         xlink:href="#glyph15-29"
-         x="148.128906"
-         y="202.90625"
-         id="use10891" />
-      <use
-         xlink:href="#glyph15-30"
-         x="154.009316"
-         y="202.90625"
-         id="use10893" />
-      <use
-         xlink:href="#glyph15-30"
-         x="156.526856"
-         y="202.90625"
-         id="use10895" />
-      <use
-         xlink:href="#glyph15-11"
-         x="159.044395"
-         y="202.90625"
-         id="use10897" />
-      <use
-         xlink:href="#glyph15-3"
-         x="163.248906"
-         y="202.90625"
-         id="use10899" />
-      <use
-         xlink:href="#glyph15-1"
-         x="167.453418"
-         y="202.90625"
-         id="use10901" />
-    </g>
-    <path
-       style="fill-rule:evenodd;fill:rgb(94.901961%,63.529412%,50.980392%);fill-opacity:1;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(100%,76.470588%,48.627451%);stroke-opacity:1;stroke-miterlimit:10;"
-       d="M 76.930002 14.797742 L 76.930002 -0.000538229 L 0.000990341 -0.000538229 L 0.000990341 14.797742 Z M 76.930002 14.797742 "
-       transform="matrix(0.70875,0,0,0.70875,133.495392,131.5121)"
-       id="path10905" />
-    <g
-       style="fill:rgb(100%,100%,100%);fill-opacity:1;"
-       id="g10923">
-      <use
-         xlink:href="#glyph0-33"
-         x="139.625"
-         y="140.117188"
-         id="use10907" />
-      <use
-         xlink:href="#glyph0-8"
-         x="148.54436"
-         y="140.117188"
-         id="use10909" />
-      <use
-         xlink:href="#glyph0-3"
-         x="153.8"
-         y="140.117188"
-         id="use10911" />
-      <use
-         xlink:href="#glyph0-6"
-         x="156.425513"
-         y="140.117188"
-         id="use10913" />
-      <use
-         xlink:href="#glyph0-8"
-         x="159.572437"
-         y="140.117188"
-         id="use10915" />
-      <use
-         xlink:href="#glyph0-29"
-         x="164.828077"
-         y="140.117188"
-         id="use10917" />
-      <use
-         xlink:href="#glyph0-14"
-         x="170.600513"
-         y="140.117188"
-         id="use10919" />
-      <use
-         xlink:href="#glyph0-1"
-         x="173.226026"
-         y="140.117188"
-         id="use10921" />
-    </g>
-    <path
-       style="fill-rule:evenodd;fill:rgb(94.901961%,63.529412%,50.980392%);fill-opacity:1;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(100%,76.470588%,48.627451%);stroke-opacity:1;stroke-miterlimit:10;"
-       d="M 76.927375 15.469344 L 76.927375 -0.00133481 L -0.00163622 -0.00133481 L -0.00163622 15.469344 Z M 76.927375 15.469344 "
-       transform="matrix(0.70875,0,0,0.70875,133.286316,162.223602)"
-       id="path10925" />
-    <g
-       style="fill:rgb(100%,100%,100%);fill-opacity:1;"
-       id="g10943">
-      <use
-         xlink:href="#glyph0-33"
-         x="139.625"
-         y="171.304688"
-         id="use10927" />
-      <use
-         xlink:href="#glyph0-8"
-         x="148.54436"
-         y="171.304688"
-         id="use10929" />
-      <use
-         xlink:href="#glyph0-3"
-         x="153.8"
-         y="171.304688"
-         id="use10931" />
-      <use
-         xlink:href="#glyph0-6"
-         x="156.425513"
-         y="171.304688"
-         id="use10933" />
-      <use
-         xlink:href="#glyph0-8"
-         x="159.572437"
-         y="171.304688"
-         id="use10935" />
-      <use
-         xlink:href="#glyph0-29"
-         x="164.828077"
-         y="171.304688"
-         id="use10937" />
-      <use
-         xlink:href="#glyph0-14"
-         x="170.600513"
-         y="171.304688"
-         id="use10939" />
-      <use
-         xlink:href="#glyph0-1"
-         x="173.226026"
-         y="171.304688"
-         id="use10941" />
-    </g>
-    <path
-       style="fill-rule:evenodd;fill:rgb(94.901961%,63.529412%,50.980392%);fill-opacity:1;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(100%,76.470588%,48.627451%);stroke-opacity:1;stroke-miterlimit:10;"
-       d="M 76.929184 15.777878 L 76.929184 -0.00144245 L 0.000172233 -0.00144245 L 0.000172233 15.777878 Z M 76.929184 15.777878 "
-       transform="matrix(0.70875,0,0,0.70875,133.390503,104.520554)"
-       id="path10945" />
-    <g
-       style="fill:rgb(100%,100%,100%);fill-opacity:1;"
-       id="g10963">
-      <use
-         xlink:href="#glyph0-33"
-         x="139.625"
-         y="113.1875"
-         id="use10947" />
-      <use
-         xlink:href="#glyph0-8"
-         x="148.54436"
-         y="113.1875"
-         id="use10949" />
-      <use
-         xlink:href="#glyph0-3"
-         x="153.8"
-         y="113.1875"
-         id="use10951" />
-      <use
-         xlink:href="#glyph0-6"
-         x="156.425513"
-         y="113.1875"
-         id="use10953" />
-      <use
-         xlink:href="#glyph0-8"
-         x="159.572437"
-         y="113.1875"
-         id="use10955" />
-      <use
-         xlink:href="#glyph0-29"
-         x="164.828077"
-         y="113.1875"
-         id="use10957" />
-      <use
-         xlink:href="#glyph0-14"
-         x="170.600513"
-         y="113.1875"
-         id="use10959" />
-      <use
-         xlink:href="#glyph0-1"
-         x="173.226026"
-         y="113.1875"
-         id="use10961" />
-    </g>
-    <path
-       style="fill-rule:evenodd;fill:rgb(94.901961%,63.529412%,50.980392%);fill-opacity:1;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(100%,76.470588%,48.627451%);stroke-opacity:1;stroke-miterlimit:10;"
-       d="M 76.926945 13.699002 L 76.926945 -0.00249738 L -0.0020668 -0.00249738 L -0.0020668 13.699002 Z M 76.926945 13.699002 "
-       transform="matrix(0.70875,0,0,0.70875,133.481934,323.329895)"
-       id="path10965" />
-    <path
-       style="fill-rule:evenodd;fill:rgb(94.901961%,63.529412%,50.980392%);fill-opacity:1;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(100%,76.470588%,48.627451%);stroke-opacity:1;stroke-miterlimit:10;"
-       d="M 76.926708 13.788391 L 76.926708 -0.00129175 L -0.00230362 -0.00129175 L -0.00230362 13.788391 Z M 76.926708 13.788391 "
-       transform="matrix(0.70875,0,0,0.70875,133.403976,245.817322)"
-       id="path10977" />
-    <g
-       style="fill:rgb(100%,100%,100%);fill-opacity:1;"
-       id="g10985">
-      <use
-         xlink:href="#glyph0-0"
-         x="150.964844"
-         y="254.226563"
-         id="use10979" />
-      <use
-         xlink:href="#glyph0-3"
-         x="157.789331"
-         y="254.226563"
-         id="use10981" />
-      <use
-         xlink:href="#glyph0-9"
-         x="160.414844"
-         y="254.226563"
-         id="use10983" />
-    </g>
-    <path
-       style="fill-rule:evenodd;fill:rgb(94.901961%,63.529412%,50.980392%);fill-opacity:1;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(100%,76.470588%,48.627451%);stroke-opacity:1;stroke-miterlimit:10;"
-       d="M 76.929894 13.698011 L 76.929894 0.00202374 L 0.000882695 0.00202374 L 0.000882695 13.698011 Z M 76.929894 13.698011 "
-       transform="matrix(0.70875,0,0,0.70875,133.456406,257.221222)"
-       id="path10987" />
-    <path
-       style="fill-rule:evenodd;fill:rgb(94.901961%,63.529412%,50.980392%);fill-opacity:1;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(100%,76.470588%,48.627451%);stroke-opacity:1;stroke-miterlimit:10;"
-       d="M 70.401867 9.59886 L 70.401867 -0.00210986 L -0.00157163 -0.00210986 L -0.00157163 9.59886 Z M 70.401867 9.59886 "
-       transform="matrix(0.70875,0,0,0.70875,135.934708,213.271027)"
-       id="path10999" />
-    <g
-       style="fill:rgb(100%,100%,100%);fill-opacity:1;"
-       id="g11007">
-      <use
-         xlink:href="#glyph46-0"
-         x="152.382813"
-         y="220.917969"
-         id="use11001" />
-      <use
-         xlink:href="#glyph46-1"
-         x="158.516863"
-         y="220.917969"
-         id="use11003" />
-      <use
-         xlink:href="#glyph46-2"
-         x="160.876751"
-         y="220.917969"
-         id="use11005" />
-    </g>
-    <path
-       style="fill-rule:evenodd;fill:rgb(94.901961%,63.529412%,50.980392%);fill-opacity:1;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(100%,76.470588%,48.627451%);stroke-opacity:1;stroke-miterlimit:10;"
-       d="M 70.401651 9.646569 L 70.401651 0.00150704 L -0.00178692 0.00150704 L -0.00178692 9.646569 Z M 70.401651 9.646569 "
-       transform="matrix(0.70875,0,0,0.70875,135.512985,222.830963)"
-       id="path11009" />
-    <path
-       style="fill-rule:evenodd;fill:rgb(88.235294%,60.392157%,0%);fill-opacity:1;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(88.235294%,60.392157%,0%);stroke-opacity:1;stroke-miterlimit:10;"
-       d="M 292.750517 28.998901 L 292.750519 -0.00242146 L -0.00192686 -0.00244356 L -0.00192905 28.998879 Z M 292.750517 28.998901 "
-       transform="matrix(0.0000000535091,0.70875,-0.70875,0.0000000535091,102.060768,61.696678)"
-       id="path11021" />
-    <g
-       style="fill:rgb(100%,100%,100%);fill-opacity:1;"
-       id="g11035">
-      <use
-         xlink:href="#glyph2-4"
-         x="88.464844"
-         y="148.785156"
-         id="use11023" />
-      <use
-         xlink:href="#glyph2-5"
-         x="88.464844"
-         y="154.566359"
-         id="use11025" />
-      <use
-         xlink:href="#glyph2-6"
-         x="88.464844"
-         y="157.454423"
-         id="use11027" />
-      <use
-         xlink:href="#glyph2-4"
-         x="88.464845"
-         y="163.804103"
-         id="use11029" />
-      <use
-         xlink:href="#glyph2-7"
-         x="88.464845"
-         y="169.585306"
-         id="use11031" />
-      <use
-         xlink:href="#glyph2-4"
-         x="88.464846"
-         y="175.366509"
-         id="use11033" />
-    </g>
-    <path
-       style="fill-rule:evenodd;fill:rgb(88.235294%,60.392157%,0%);fill-opacity:1;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(88.235294%,60.392157%,0%);stroke-opacity:1;stroke-miterlimit:10;"
-       d="M 292.750517 29.001291 L 292.750519 -0.0000317208 L -0.00192686 -0.000053823 L -0.00192905 29.001269 Z M 292.750517 29.001291 "
-       transform="matrix(0.0000000535091,0.70875,-0.70875,0.0000000535091,124.070274,61.696678)"
-       id="path11037" />
-    <g
-       style="fill:rgb(100%,100%,100%);fill-opacity:1;"
-       id="g11047">
-      <use
-         xlink:href="#glyph2-8"
-         x="110.472656"
-         y="152.996094"
-         id="use11039" />
-      <use
-         xlink:href="#glyph2-2"
-         x="110.472657"
-         y="160.503029"
-         id="use11041" />
-      <use
-         xlink:href="#glyph2-9"
-         x="110.472657"
-         y="166.284232"
-         id="use11043" />
-      <use
-         xlink:href="#glyph2-10"
-         x="110.472658"
-         y="172.065436"
-         id="use11045" />
-    </g>
-    <path
-       style="fill-rule:evenodd;fill:rgb(88.235294%,60.392157%,0%);fill-opacity:1;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(88.235294%,60.392157%,0%);stroke-opacity:1;stroke-miterlimit:10;"
-       d="M 304.562263 29.000667 L 304.562265 -0.000655175 L -0.00124869 -0.000678168 L -0.00125088 29.000644 Z M 304.562263 29.000667 "
-       transform="matrix(0.0000000535091,0.70875,-0.70875,0.0000000535091,102.788582,300.11026)"
-       id="path11049" />
-    <g
-       style="fill:rgb(100%,100%,100%);fill-opacity:1;"
-       id="g11063">
-      <use
-         xlink:href="#glyph2-4"
-         x="88.691406"
-         y="391.363281"
-         id="use11051" />
-      <use
-         xlink:href="#glyph2-5"
-         x="88.691407"
-         y="397.144484"
-         id="use11053" />
-      <use
-         xlink:href="#glyph2-6"
-         x="88.691407"
-         y="400.032548"
-         id="use11055" />
-      <use
-         xlink:href="#glyph2-4"
-         x="88.691407"
-         y="406.382228"
-         id="use11057" />
-      <use
-         xlink:href="#glyph2-7"
-         x="88.691408"
-         y="412.163431"
-         id="use11059" />
-      <use
-         xlink:href="#glyph2-4"
-         x="88.691408"
-         y="417.944634"
-         id="use11061" />
-    </g>
-    <path
-       style="fill-rule:evenodd;fill:rgb(88.235294%,60.392157%,0%);fill-opacity:1;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(88.235294%,60.392157%,0%);stroke-opacity:1;stroke-miterlimit:10;"
-       d="M 304.562263 28.997567 L 304.562265 0.00175609 L -0.00124869 0.0017331 L -0.00125088 28.997544 Z M 304.562263 28.997567 "
-       transform="matrix(0.0000000535091,0.70875,-0.70875,0.0000000535091,124.798103,300.11026)"
-       id="path11065" />
-    <g
-       style="fill:rgb(100%,100%,100%);fill-opacity:1;"
-       id="g11075">
-      <use
-         xlink:href="#glyph2-8"
-         x="110.699219"
-         y="395.578125"
-         id="use11067" />
-      <use
-         xlink:href="#glyph2-2"
-         x="110.699219"
-         y="403.085061"
-         id="use11069" />
-      <use
-         xlink:href="#glyph2-9"
-         x="110.69922"
-         y="408.866264"
-         id="use11071" />
-      <use
-         xlink:href="#glyph2-10"
-         x="110.69922"
-         y="414.647467"
-         id="use11073" />
-    </g>
-    <path
-       style="fill-rule:evenodd;fill:rgb(20.392157%,59.607843%,85.882353%);fill-opacity:1;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(20.392157%,59.607843%,85.882353%);stroke-opacity:1;stroke-miterlimit:10;"
-       d="M 57.657575 35.745578 L 57.657575 -0.00177616 L 0.00215292 -0.00177616 L 0.00215292 35.745578 Z M 57.657575 35.745578 "
-       transform="matrix(0.70875,0,0,0.70875,520.318787,120.669228)"
-       id="path11077" />
-    <text
-       xml:space="preserve"
-       style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:30px;line-height:1.25;font-family:'Linux Biolinum';-inkscape-font-specification:'Linux Biolinum';letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.75"
-       x="480.80798"
-       y="133.3116"
-       id="text11168"><tspan
-         sodipodi:role="line"
-         x="480.80798"
-         y="133.3116"
-         style="font-style:normal;font-variant:normal;font-weight:bold;font-stretch:normal;font-size:6px;font-family:Arial;-inkscape-font-specification:'Arial Bold';fill:#ffffff;fill-opacity:1;stroke-width:0.75"
-         id="tspan11182">QueueCuda</tspan></text>
-    <text
-       xml:space="preserve"
-       style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:30px;line-height:1.25;font-family:'Linux Biolinum';-inkscape-font-specification:'Linux Biolinum';letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.75"
-       x="474.8078"
-       y="139.80453"
-       id="text11188"><tspan
-         sodipodi:role="line"
-         id="tspan11186"
-         x="474.8078"
-         y="139.80453"
-         style="font-style:normal;font-variant:normal;font-weight:bold;font-stretch:normal;font-size:6px;font-family:Arial;-inkscape-font-specification:'Arial Bold';fill:#ffffff;fill-opacity:1;stroke-width:0.75">RtNonBlocking</tspan></text>
-    <text
-       xml:space="preserve"
-       style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:30px;line-height:1.25;font-family:'Linux Biolinum';-inkscape-font-specification:'Linux Biolinum';letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.75"
-       x="522.67798"
-       y="133.3116"
-       id="text11168-4"><tspan
-         sodipodi:role="line"
-         x="522.67798"
-         y="133.3116"
-         style="font-style:normal;font-variant:normal;font-weight:bold;font-stretch:normal;font-size:6px;font-family:Arial;-inkscape-font-specification:'Arial Bold';fill:#ffffff;fill-opacity:1;stroke-width:0.75"
-         id="tspan11182-2">QueueCuda</tspan></text>
-    <text
-       xml:space="preserve"
-       style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:30px;line-height:1.25;font-family:'Linux Biolinum';-inkscape-font-specification:'Linux Biolinum';letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.75"
-       x="523.99579"
-       y="139.80481"
-       id="text11188-1"><tspan
-         sodipodi:role="line"
-         id="tspan11186-8"
-         x="523.99579"
-         y="139.80481"
-         style="font-style:normal;font-variant:normal;font-weight:bold;font-stretch:normal;font-size:6px;font-family:Arial;-inkscape-font-specification:'Arial Bold';fill:#ffffff;fill-opacity:1;stroke-width:0.75">RtBlocking</tspan></text>
-    <text
-       xml:space="preserve"
-       style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:30px;line-height:1.25;font-family:'Linux Biolinum';-inkscape-font-specification:'Linux Biolinum';letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.75"
-       x="135.24565"
-       y="242.90166"
-       id="text11128-43"><tspan
-         sodipodi:role="line"
-         id="tspan11126-1"
-         x="135.24565"
-         y="242.90166"
-         style="font-style:normal;font-variant:normal;font-weight:bold;font-stretch:normal;font-size:9.5px;font-family:Arial;-inkscape-font-specification:'Arial Bold';fill:#ffffff;fill-opacity:1;stroke-width:0.75">TaskKernel</tspan></text>
-    <text
-       xml:space="preserve"
-       style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:30px;line-height:1.25;font-family:'Linux Biolinum';-inkscape-font-specification:'Linux Biolinum';letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.75"
-       x="198.08562"
-       y="248.72046"
-       id="text11264"><tspan
-         sodipodi:role="line"
-         id="tspan11262"
-         x="198.08562"
-         y="248.72046"
-         style="font-style:normal;font-variant:normal;font-weight:bold;font-stretch:normal;font-size:7.5px;font-family:Arial;-inkscape-font-specification:'Arial Bold';fill:#ffffff;fill-opacity:1;stroke-width:0.75">TaskKernel</tspan></text>
-    <text
-       xml:space="preserve"
-       style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:30px;line-height:1.25;font-family:'Linux Biolinum';-inkscape-font-specification:'Linux Biolinum';letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.75"
-       x="200.74303"
-       y="259.44925"
-       id="text11268"><tspan
-         sodipodi:role="line"
-         id="tspan11266"
-         x="200.74303"
-         y="259.44925"
-         style="font-style:normal;font-variant:normal;font-weight:bold;font-stretch:normal;font-size:7.5px;font-family:Arial;-inkscape-font-specification:'Arial Bold';fill:#ffffff;fill-opacity:1;stroke-width:0.75">CpuSerial</tspan></text>
-    <text
-       xml:space="preserve"
-       style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:30px;line-height:1.25;font-family:'Linux Biolinum';-inkscape-font-specification:'Linux Biolinum';letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.75"
-       x="241.14687"
-       y="248.72046"
-       id="text11264-7"><tspan
-         sodipodi:role="line"
-         id="tspan11262-7"
-         x="241.14687"
-         y="248.72046"
-         style="font-style:normal;font-variant:normal;font-weight:bold;font-stretch:normal;font-size:7.5px;font-family:Arial;-inkscape-font-specification:'Arial Bold';fill:#ffffff;fill-opacity:1;stroke-width:0.75">TaskKernel</tspan></text>
-    <text
-       xml:space="preserve"
-       style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:30px;line-height:1.25;font-family:'Linux Biolinum';-inkscape-font-specification:'Linux Biolinum';letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.75"
-       x="242.82927"
-       y="259.44928"
-       id="text11268-8"><tspan
-         sodipodi:role="line"
-         id="tspan11266-7"
-         x="242.82927"
-         y="259.44928"
-         style="font-style:normal;font-variant:normal;font-weight:bold;font-stretch:normal;font-size:7.5px;font-family:Arial;-inkscape-font-specification:'Arial Bold';fill:#ffffff;fill-opacity:1;stroke-width:0.75">CpuFibers</tspan></text>
-    <text
-       xml:space="preserve"
-       style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:30px;line-height:1.25;font-family:'Linux Biolinum';-inkscape-font-specification:'Linux Biolinum';letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.75"
-       x="283.78662"
-       y="244.22047"
-       id="text11264-9"><tspan
-         sodipodi:role="line"
-         id="tspan11262-5"
-         x="283.78662"
-         y="244.22047"
-         style="font-style:normal;font-variant:normal;font-weight:bold;font-stretch:normal;font-size:7.5px;font-family:Arial;-inkscape-font-specification:'Arial Bold';fill:#ffffff;fill-opacity:1;stroke-width:0.75">TaskKernel</tspan></text>
-    <text
-       xml:space="preserve"
-       style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:30px;line-height:1.25;font-family:'Linux Biolinum';-inkscape-font-specification:'Linux Biolinum';letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.75"
-       x="286.44403"
-       y="254.94928"
-       id="text11268-9"><tspan
-         sodipodi:role="line"
-         id="tspan11266-8"
-         x="286.44403"
-         y="254.94928"
-         style="font-style:normal;font-variant:normal;font-weight:bold;font-stretch:normal;font-size:7.5px;font-family:Arial;-inkscape-font-specification:'Arial Bold';fill:#ffffff;fill-opacity:1;stroke-width:0.75">CpuOmp2</tspan></text>
-    <text
-       xml:space="preserve"
-       style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:30px;line-height:1.25;font-family:'Linux Biolinum';-inkscape-font-specification:'Linux Biolinum';letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.75"
-       x="291.79535"
-       y="265.3866"
-       id="text11268-9-5"><tspan
-         sodipodi:role="line"
-         id="tspan11266-8-7"
-         x="291.79535"
-         y="265.3866"
-         style="font-style:normal;font-variant:normal;font-weight:bold;font-stretch:normal;font-size:7.5px;font-family:Arial;-inkscape-font-specification:'Arial Bold';fill:#ffffff;fill-opacity:1;stroke-width:0.75">Blocks</tspan></text>
-    <text
-       xml:space="preserve"
-       style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:30px;line-height:1.25;font-family:'Linux Biolinum';-inkscape-font-specification:'Linux Biolinum';letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.75"
-       x="328.01562"
-       y="244.22047"
-       id="text11264-9-6"><tspan
-         sodipodi:role="line"
-         id="tspan11262-5-0"
-         x="328.01562"
-         y="244.22047"
-         style="font-style:normal;font-variant:normal;font-weight:bold;font-stretch:normal;font-size:7.5px;font-family:Arial;-inkscape-font-specification:'Arial Bold';fill:#ffffff;fill-opacity:1;stroke-width:0.75">TaskKernel</tspan></text>
-    <text
-       xml:space="preserve"
-       style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:30px;line-height:1.25;font-family:'Linux Biolinum';-inkscape-font-specification:'Linux Biolinum';letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.75"
-       x="330.67303"
-       y="254.94928"
-       id="text11268-9-2"><tspan
-         sodipodi:role="line"
-         id="tspan11266-8-5"
-         x="330.67303"
-         y="254.94928"
-         style="font-style:normal;font-variant:normal;font-weight:bold;font-stretch:normal;font-size:7.5px;font-family:Arial;-inkscape-font-specification:'Arial Bold';fill:#ffffff;fill-opacity:1;stroke-width:0.75">CpuOmp2</tspan></text>
-    <text
-       xml:space="preserve"
-       style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:30px;line-height:1.25;font-family:'Linux Biolinum';-inkscape-font-specification:'Linux Biolinum';letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.75"
-       x="334.59912"
-       y="265.38663"
-       id="text11268-9-5-9"><tspan
-         sodipodi:role="line"
-         id="tspan11266-8-7-4"
-         x="334.59912"
-         y="265.38663"
-         style="font-style:normal;font-variant:normal;font-weight:bold;font-stretch:normal;font-size:7.5px;font-family:Arial;-inkscape-font-specification:'Arial Bold';fill:#ffffff;fill-opacity:1;stroke-width:0.75">Threads</tspan></text>
-    <text
-       xml:space="preserve"
-       style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:30px;line-height:1.25;font-family:'Linux Biolinum';-inkscape-font-specification:'Linux Biolinum';letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.75"
-       x="373.34186"
-       y="248.72047"
-       id="text11264-5"><tspan
-         sodipodi:role="line"
-         id="tspan11262-4"
-         x="373.34186"
-         y="248.72047"
-         style="font-style:normal;font-variant:normal;font-weight:bold;font-stretch:normal;font-size:7.5px;font-family:Arial;-inkscape-font-specification:'Arial Bold';fill:#ffffff;fill-opacity:1;stroke-width:0.75">TaskKernel</tspan></text>
-    <text
-       xml:space="preserve"
-       style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:30px;line-height:1.25;font-family:'Linux Biolinum';-inkscape-font-specification:'Linux Biolinum';letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.75"
-       x="371.27408"
-       y="259.44928"
-       id="text11268-90"><tspan
-         sodipodi:role="line"
-         id="tspan11266-75"
-         x="371.27408"
-         y="259.44928"
-         style="font-style:normal;font-variant:normal;font-weight:bold;font-stretch:normal;font-size:7.5px;font-family:Arial;-inkscape-font-specification:'Arial Bold';fill:#ffffff;fill-opacity:1;stroke-width:0.75">CpuThreads</tspan></text>
-    <text
-       xml:space="preserve"
-       style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:30px;line-height:1.25;font-family:'Linux Biolinum';-inkscape-font-specification:'Linux Biolinum';letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.75"
-       x="417.45255"
-       y="248.72046"
-       id="text11264-5-2"><tspan
-         sodipodi:role="line"
-         id="tspan11262-4-1"
-         x="417.45255"
-         y="248.72046"
-         style="font-style:normal;font-variant:normal;font-weight:bold;font-stretch:normal;font-size:7.5px;font-family:Arial;-inkscape-font-specification:'Arial Bold';fill:#ffffff;fill-opacity:1;stroke-width:0.75">TaskKernel</tspan></text>
-    <text
-       xml:space="preserve"
-       style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:30px;line-height:1.25;font-family:'Linux Biolinum';-inkscape-font-specification:'Linux Biolinum';letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.75"
-       x="419.65979"
-       y="259.44928"
-       id="text11268-90-6"><tspan
-         sodipodi:role="line"
-         id="tspan11266-75-9"
-         x="419.65979"
-         y="259.44928"
-         style="font-style:normal;font-variant:normal;font-weight:bold;font-stretch:normal;font-size:7.5px;font-family:Arial;-inkscape-font-specification:'Arial Bold';fill:#ffffff;fill-opacity:1;stroke-width:0.75">CpuOmp4</tspan></text>
-    <text
-       xml:space="preserve"
-       style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:30px;line-height:1.25;font-family:'Linux Biolinum';-inkscape-font-specification:'Linux Biolinum';letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.75"
-       x="493.48337"
-       y="247.34921"
-       id="text11404"><tspan
-         sodipodi:role="line"
-         id="tspan11402"
-         x="493.48337"
-         y="247.34921"
-         style="font-style:normal;font-variant:normal;font-weight:bold;font-stretch:normal;font-size:10px;font-family:Arial;-inkscape-font-specification:'Arial Bold';fill:#ffffff;fill-opacity:1;stroke-width:0.75">TaskKernel</tspan></text>
-    <text
-       xml:space="preserve"
-       style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:30px;line-height:1.25;font-family:'Linux Biolinum';-inkscape-font-specification:'Linux Biolinum';letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.75"
-       x="492.64798"
-       y="260.69086"
-       id="text11404-8"><tspan
-         sodipodi:role="line"
-         id="tspan11402-9"
-         x="492.64798"
-         y="260.69086"
-         style="font-style:normal;font-variant:normal;font-weight:bold;font-stretch:normal;font-size:10px;font-family:Arial;-inkscape-font-specification:'Arial Bold';fill:#ffffff;fill-opacity:1;stroke-width:0.75">GpuCudaRt</tspan></text>
-    <text
-       xml:space="preserve"
-       style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:30px;line-height:1.25;font-family:'Linux Biolinum';-inkscape-font-specification:'Linux Biolinum';letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.75"
-       x="486.80798"
-       y="309.20752"
-       id="text11426"><tspan
-         sodipodi:role="line"
-         id="tspan11424"
-         x="486.80798"
-         y="309.20752"
-         style="font-style:normal;font-variant:normal;font-weight:bold;font-stretch:normal;font-size:9px;font-family:Arial;-inkscape-font-specification:'Arial Bold';fill:#ffffff;fill-opacity:1;stroke-width:0.75">AccGpuCudaRt</tspan></text>
-    <text
-       xml:space="preserve"
-       style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:30px;line-height:1.25;font-family:'Linux Biolinum';-inkscape-font-specification:'Linux Biolinum';letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.75"
-       x="299.33441"
-       y="476.06265"
-       id="text2526"><tspan
-         sodipodi:role="line"
-         id="tspan2524"
-         x="299.33441"
-         y="476.06265"
-         style="font-style:normal;font-variant:normal;font-weight:bold;font-stretch:normal;font-size:9.5px;font-family:Arial;-inkscape-font-specification:'Arial Bold';fill:#ffffff;stroke-width:0.75">MathStdLib</tspan></text>
-    <text
-       xml:space="preserve"
-       style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:30px;line-height:1.25;font-family:'Linux Biolinum';-inkscape-font-specification:'Linux Biolinum';letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.75"
-       x="298.69595"
-       y="448.33591"
-       id="text2526-9"><tspan
-         sodipodi:role="line"
-         id="tspan2524-3"
-         x="298.69595"
-         y="448.33591"
-         style="font-style:normal;font-variant:normal;font-weight:bold;font-stretch:normal;font-size:9.5px;font-family:Arial;-inkscape-font-specification:'Arial Bold';fill:#ffffff;stroke-width:0.75">RandStdLib</tspan></text>
-    <text
-       xml:space="preserve"
-       style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:30px;line-height:1.25;font-family:'Linux Biolinum';-inkscape-font-specification:'Linux Biolinum';letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.75"
-       x="421.19025"
-       y="410.96387"
-       id="text2526-9-4"><tspan
-         sodipodi:role="line"
-         id="tspan2524-3-4"
-         x="421.19025"
-         y="410.96387"
-         style="font-style:normal;font-variant:normal;font-weight:bold;font-stretch:normal;font-size:9px;font-family:Arial;-inkscape-font-specification:'Arial Bold';fill:#ffffff;stroke-width:0.75">Atomic</tspan></text>
-    <text
-       xml:space="preserve"
-       style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:30px;line-height:1.25;font-family:'Linux Biolinum';-inkscape-font-specification:'Linux Biolinum';letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.75"
-       x="422.19257"
-       y="418.88211"
-       id="text2526-9-4-2"><tspan
-         sodipodi:role="line"
-         id="tspan2524-3-4-5"
-         x="422.19257"
-         y="418.88211"
-         style="font-style:normal;font-variant:normal;font-weight:bold;font-stretch:normal;font-size:9px;font-family:Arial;-inkscape-font-specification:'Arial Bold';fill:#ffffff;stroke-width:0.75">StdLib</tspan></text>
-    <text
-       xml:space="preserve"
-       style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:30px;line-height:1.25;font-family:'Linux Biolinum';-inkscape-font-specification:'Linux Biolinum';letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.75"
-       x="425.73904"
-       y="427.34244"
-       id="text2526-9-4-2-4"><tspan
-         sodipodi:role="line"
-         id="tspan2524-3-4-5-7"
-         x="425.73904"
-         y="427.34244"
-         style="font-style:normal;font-variant:normal;font-weight:bold;font-stretch:normal;font-size:9px;font-family:Arial;-inkscape-font-specification:'Arial Bold';fill:#ffffff;stroke-width:0.75">Lock</tspan></text>
-    <text
-       xml:space="preserve"
-       style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:30px;line-height:1.25;font-family:'Linux Biolinum';-inkscape-font-specification:'Linux Biolinum';letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.75"
-       x="155.08113"
-       y="229.43451"
-       id="text2602"><tspan
-         sodipodi:role="line"
-         id="tspan2600"
-         x="155.08113"
-         y="229.43451"
-         style="font-style:normal;font-variant:normal;font-weight:bold;font-stretch:normal;font-size:8px;font-family:Arial;-inkscape-font-specification:'Arial Bold';fill:#ffffff;stroke-width:0.75">Idx</tspan></text>
-    <text
-       xml:space="preserve"
-       style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:30px;line-height:1.25;font-family:'Linux Biolinum';-inkscape-font-specification:'Linux Biolinum';letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.75"
-       x="153.80452"
-       y="265.3833"
-       id="text2602-6"><tspan
-         sodipodi:role="line"
-         id="tspan2600-6"
-         x="153.80452"
-         y="265.3833"
-         style="font-style:normal;font-variant:normal;font-weight:bold;font-stretch:normal;font-size:9px;font-family:Arial;-inkscape-font-specification:'Arial Bold';fill:#ffffff;stroke-width:0.75">Idx</tspan></text>
-    <text
-       xml:space="preserve"
-       style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:30px;line-height:1.25;font-family:'Linux Biolinum';-inkscape-font-specification:'Linux Biolinum';letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.75"
-       x="153.67401"
-       y="330.9798"
-       id="text2602-6-1"><tspan
-         sodipodi:role="line"
-         id="tspan2600-6-4"
-         x="153.67401"
-         y="330.9798"
-         style="font-style:normal;font-variant:normal;font-weight:bold;font-stretch:normal;font-size:9px;font-family:Arial;-inkscape-font-specification:'Arial Bold';fill:#ffffff;stroke-width:0.75">Idx</tspan></text>
-  </g>
-</svg>
diff --git a/thirdParty/cupla/alpaka/doc/markdown/user/implementation/library/structure_assoc.png b/thirdParty/cupla/alpaka/doc/markdown/user/implementation/library/structure_assoc.png
deleted file mode 100644
index 9824a2b3be..0000000000
Binary files a/thirdParty/cupla/alpaka/doc/markdown/user/implementation/library/structure_assoc.png and /dev/null differ
diff --git a/thirdParty/cupla/alpaka/doc/markdown/user/implementation/library/structure_assoc.svg b/thirdParty/cupla/alpaka/doc/markdown/user/implementation/library/structure_assoc.svg
deleted file mode 100644
index 4688afd581..0000000000
--- a/thirdParty/cupla/alpaka/doc/markdown/user/implementation/library/structure_assoc.svg
+++ /dev/null
@@ -1,2072 +0,0 @@
-<?xml version="1.0" encoding="UTF-8" standalone="no"?>
-<svg
-   xmlns:dc="http://purl.org/dc/elements/1.1/"
-   xmlns:cc="http://creativecommons.org/ns#"
-   xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
-   xmlns:svg="http://www.w3.org/2000/svg"
-   xmlns="http://www.w3.org/2000/svg"
-   xmlns:xlink="http://www.w3.org/1999/xlink"
-   xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
-   xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
-   width="346.578766pt"
-   height="368.691772pt"
-   viewBox="0 0 346.578766 368.691772"
-   version="1.1"
-   id="svg13076"
-   sodipodi:docname="structure_assoc.svg"
-   inkscape:version="0.92.4 (5da689c313, 2019-01-14)"
-   inkscape:export-filename="C:\Users\janst\workspace\alpaka\doc\markdown\user\implementation\library\structure_assoc.png"
-   inkscape:export-xdpi="101.5959"
-   inkscape:export-ydpi="101.5959">
-  <metadata
-     id="metadata13080">
-    <rdf:RDF>
-      <cc:Work
-         rdf:about="">
-        <dc:format>image/svg+xml</dc:format>
-        <dc:type
-           rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
-      </cc:Work>
-    </rdf:RDF>
-  </metadata>
-  <sodipodi:namedview
-     pagecolor="#ffffff"
-     bordercolor="#666666"
-     borderopacity="1"
-     objecttolerance="10"
-     gridtolerance="10"
-     guidetolerance="10"
-     inkscape:pageopacity="0"
-     inkscape:pageshadow="2"
-     inkscape:window-width="1920"
-     inkscape:window-height="1017"
-     id="namedview13078"
-     showgrid="false"
-     inkscape:zoom="1.7250182"
-     inkscape:cx="231.05251"
-     inkscape:cy="245.79451"
-     inkscape:window-x="-8"
-     inkscape:window-y="-8"
-     inkscape:window-maximized="1"
-     inkscape:current-layer="surface1" />
-  <defs
-     id="defs12585">
-    <g
-       id="g12571">
-      <symbol
-         overflow="visible"
-         id="glyph0-0">
-        <path
-           style="stroke:none;"
-           d="M 0.683594 -6.765625 L 3.179688 -6.765625 C 3.738281 -6.761719 4.167969 -6.71875 4.464844 -6.636719 C 4.863281 -6.515625 5.203125 -6.308594 5.484375 -6.011719 C 5.765625 -5.714844 5.980469 -5.347656 6.132813 -4.917969 C 6.277344 -4.480469 6.351563 -3.949219 6.355469 -3.316406 C 6.351563 -2.761719 6.28125 -2.28125 6.144531 -1.878906 C 5.972656 -1.382813 5.730469 -0.984375 5.421875 -0.683594 C 5.179688 -0.449219 4.859375 -0.269531 4.460938 -0.144531 C 4.15625 -0.046875 3.753906 0 3.253906 0 L 0.683594 0 Z M 2.046875 -5.621094 L 2.046875 -1.140625 L 3.070313 -1.140625 C 3.449219 -1.136719 3.722656 -1.15625 3.894531 -1.203125 C 4.113281 -1.257813 4.296875 -1.351563 4.445313 -1.484375 C 4.589844 -1.613281 4.707031 -1.832031 4.800781 -2.136719 C 4.890625 -2.4375 4.9375 -2.851563 4.941406 -3.378906 C 4.9375 -3.898438 4.890625 -4.296875 4.800781 -4.578125 C 4.707031 -4.855469 4.578125 -5.074219 4.414063 -5.234375 C 4.246094 -5.390625 4.039063 -5.5 3.785156 -5.554688 C 3.589844 -5.597656 3.214844 -5.617188 2.664063 -5.621094 Z M 2.046875 -5.621094 "
-           id="path12259" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph0-1">
-        <path
-           style="stroke:none;"
-           d="M 3.515625 -1.558594 L 4.808594 -1.34375 C 4.640625 -0.867188 4.378906 -0.507813 4.019531 -0.261719 C 3.660156 -0.015625 3.210938 0.105469 2.675781 0.109375 C 1.820313 0.105469 1.191406 -0.167969 0.785156 -0.722656 C 0.460938 -1.167969 0.296875 -1.734375 0.300781 -2.414063 C 0.296875 -3.226563 0.511719 -3.859375 0.9375 -4.320313 C 1.359375 -4.777344 1.894531 -5.007813 2.546875 -5.011719 C 3.273438 -5.007813 3.847656 -4.769531 4.273438 -4.289063 C 4.691406 -3.804688 4.894531 -3.066406 4.878906 -2.078125 L 1.628906 -2.078125 C 1.636719 -1.691406 1.738281 -1.390625 1.941406 -1.179688 C 2.136719 -0.960938 2.386719 -0.855469 2.691406 -0.859375 C 2.890625 -0.855469 3.0625 -0.910156 3.203125 -1.023438 C 3.339844 -1.132813 3.445313 -1.3125 3.515625 -1.558594 Z M 3.589844 -2.871094 C 3.578125 -3.246094 3.480469 -3.53125 3.296875 -3.726563 C 3.113281 -3.921875 2.890625 -4.019531 2.625 -4.019531 C 2.339844 -4.019531 2.105469 -3.914063 1.925781 -3.710938 C 1.738281 -3.5 1.648438 -3.222656 1.652344 -2.871094 Z M 3.589844 -2.871094 "
-           id="path12262" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph0-2">
-        <path
-           style="stroke:none;"
-           d="M 2.027344 0 L 0.0507813 -4.898438 L 1.410156 -4.898438 L 2.335938 -2.398438 L 2.601563 -1.5625 C 2.671875 -1.773438 2.714844 -1.914063 2.734375 -1.984375 C 2.777344 -2.121094 2.824219 -2.257813 2.875 -2.398438 L 3.808594 -4.898438 L 5.140625 -4.898438 L 3.191406 0 Z M 2.027344 0 "
-           id="path12265" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph0-3">
-        <path
-           style="stroke:none;"
-           d="M 0.679688 -5.566406 L 0.679688 -6.765625 L 1.976563 -6.765625 L 1.976563 -5.566406 Z M 0.679688 0 L 0.679688 -4.898438 L 1.976563 -4.898438 L 1.976563 0 Z M 0.679688 0 "
-           id="path12268" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph0-4">
-        <path
-           style="stroke:none;"
-           d="M 4.949219 -3.453125 L 3.671875 -3.222656 C 3.628906 -3.476563 3.53125 -3.667969 3.378906 -3.796875 C 3.226563 -3.925781 3.027344 -3.988281 2.785156 -3.992188 C 2.457031 -3.988281 2.199219 -3.878906 2.011719 -3.65625 C 1.816406 -3.433594 1.722656 -3.058594 1.726563 -2.539063 C 1.722656 -1.953125 1.820313 -1.542969 2.019531 -1.304688 C 2.210938 -1.0625 2.472656 -0.941406 2.804688 -0.945313 C 3.050781 -0.941406 3.253906 -1.011719 3.410156 -1.15625 C 3.566406 -1.292969 3.675781 -1.535156 3.742188 -1.878906 L 5.015625 -1.660156 C 4.882813 -1.074219 4.628906 -0.632813 4.253906 -0.335938 C 3.878906 -0.0390625 3.375 0.105469 2.746094 0.109375 C 2.027344 0.105469 1.457031 -0.117188 1.03125 -0.566406 C 0.601563 -1.015625 0.386719 -1.640625 0.390625 -2.445313 C 0.386719 -3.25 0.601563 -3.882813 1.03125 -4.335938 C 1.460938 -4.785156 2.039063 -5.007813 2.769531 -5.011719 C 3.363281 -5.007813 3.835938 -4.878906 4.191406 -4.625 C 4.539063 -4.367188 4.792969 -3.976563 4.949219 -3.453125 Z M 4.949219 -3.453125 "
-           id="path12271" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph0-5">
-        <path
-           style="stroke:none;"
-           d="M 0.339844 -2.199219 L 1.671875 -2.332031 C 1.746094 -1.882813 1.910156 -1.554688 2.15625 -1.347656 C 2.398438 -1.136719 2.726563 -1.03125 3.148438 -1.035156 C 3.585938 -1.03125 3.917969 -1.125 4.140625 -1.3125 C 4.363281 -1.5 4.476563 -1.714844 4.476563 -1.964844 C 4.476563 -2.121094 4.429688 -2.257813 4.335938 -2.375 C 4.242188 -2.484375 4.078125 -2.582031 3.84375 -2.667969 C 3.683594 -2.71875 3.316406 -2.816406 2.75 -2.960938 C 2.015625 -3.140625 1.503906 -3.367188 1.207031 -3.632813 C 0.792969 -4.003906 0.585938 -4.457031 0.585938 -4.992188 C 0.585938 -5.335938 0.683594 -5.65625 0.878906 -5.957031 C 1.074219 -6.257813 1.355469 -6.488281 1.722656 -6.644531 C 2.089844 -6.800781 2.53125 -6.878906 3.054688 -6.878906 C 3.898438 -6.878906 4.539063 -6.691406 4.972656 -6.320313 C 5.398438 -5.945313 5.625 -5.449219 5.648438 -4.832031 L 4.28125 -4.769531 C 4.222656 -5.117188 4.097656 -5.367188 3.90625 -5.519531 C 3.710938 -5.671875 3.421875 -5.746094 3.039063 -5.75 C 2.640625 -5.746094 2.332031 -5.664063 2.109375 -5.503906 C 1.960938 -5.394531 1.886719 -5.257813 1.890625 -5.085938 C 1.886719 -4.921875 1.953125 -4.785156 2.09375 -4.675781 C 2.261719 -4.527344 2.679688 -4.375 3.347656 -4.222656 C 4.011719 -4.0625 4.503906 -3.898438 4.824219 -3.734375 C 5.136719 -3.5625 5.386719 -3.335938 5.570313 -3.046875 C 5.746094 -2.757813 5.835938 -2.398438 5.839844 -1.96875 C 5.835938 -1.582031 5.726563 -1.21875 5.515625 -0.882813 C 5.296875 -0.542969 4.996094 -0.289063 4.605469 -0.125 C 4.210938 0.0390625 3.71875 0.117188 3.132813 0.121094 C 2.277344 0.117188 1.621094 -0.078125 1.164063 -0.472656 C 0.703125 -0.867188 0.429688 -1.441406 0.339844 -2.199219 Z M 0.339844 -2.199219 "
-           id="path12274" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph0-6">
-        <path
-           style="stroke:none;"
-           d="M 2.925781 -4.898438 L 2.925781 -3.867188 L 2.039063 -3.867188 L 2.039063 -1.890625 C 2.039063 -1.488281 2.046875 -1.253906 2.0625 -1.191406 C 2.078125 -1.121094 2.117188 -1.066406 2.179688 -1.027344 C 2.238281 -0.980469 2.308594 -0.960938 2.398438 -0.964844 C 2.515625 -0.960938 2.691406 -1.003906 2.921875 -1.089844 L 3.03125 -0.0820313 C 2.726563 0.0429688 2.386719 0.105469 2.007813 0.109375 C 1.769531 0.105469 1.558594 0.0703125 1.375 -0.0078125 C 1.183594 -0.0820313 1.046875 -0.183594 0.960938 -0.3125 C 0.875 -0.433594 0.8125 -0.601563 0.78125 -0.816406 C 0.75 -0.964844 0.738281 -1.269531 0.738281 -1.730469 L 0.738281 -3.867188 L 0.144531 -3.867188 L 0.144531 -4.898438 L 0.738281 -4.898438 L 0.738281 -5.875 L 2.039063 -6.628906 L 2.039063 -4.898438 Z M 2.925781 -4.898438 "
-           id="path12277" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph0-7">
-        <path
-           style="stroke:none;"
-           d="M 1.917969 0 L 0.621094 0 L 0.621094 -4.898438 L 1.828125 -4.898438 L 1.828125 -4.203125 C 2.03125 -4.53125 2.214844 -4.75 2.382813 -4.855469 C 2.542969 -4.957031 2.730469 -5.007813 2.945313 -5.011719 C 3.238281 -5.007813 3.523438 -4.925781 3.796875 -4.765625 L 3.394531 -3.636719 C 3.175781 -3.773438 2.972656 -3.84375 2.785156 -3.847656 C 2.601563 -3.84375 2.449219 -3.792969 2.324219 -3.695313 C 2.195313 -3.59375 2.097656 -3.414063 2.027344 -3.15625 C 1.953125 -2.894531 1.914063 -2.347656 1.917969 -1.511719 Z M 1.917969 0 "
-           id="path12280" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph0-8">
-        <path
-           style="stroke:none;"
-           d="M 1.648438 -3.40625 L 0.46875 -3.617188 C 0.601563 -4.085938 0.828125 -4.4375 1.152344 -4.667969 C 1.472656 -4.894531 1.953125 -5.007813 2.59375 -5.011719 C 3.167969 -5.007813 3.597656 -4.941406 3.882813 -4.804688 C 4.164063 -4.667969 4.363281 -4.492188 4.480469 -4.285156 C 4.59375 -4.070313 4.652344 -3.683594 4.65625 -3.125 L 4.640625 -1.609375 C 4.640625 -1.175781 4.660156 -0.859375 4.703125 -0.65625 C 4.742188 -0.453125 4.820313 -0.234375 4.9375 0 L 3.65625 0 C 3.621094 -0.0859375 3.578125 -0.210938 3.53125 -0.382813 C 3.503906 -0.457031 3.488281 -0.507813 3.484375 -0.535156 C 3.261719 -0.316406 3.023438 -0.15625 2.773438 -0.0507813 C 2.515625 0.0546875 2.246094 0.105469 1.964844 0.109375 C 1.457031 0.105469 1.058594 -0.0273438 0.769531 -0.300781 C 0.476563 -0.570313 0.332031 -0.917969 0.335938 -1.339844 C 0.332031 -1.613281 0.398438 -1.859375 0.535156 -2.078125 C 0.664063 -2.292969 0.851563 -2.457031 1.089844 -2.574219 C 1.328125 -2.6875 1.671875 -2.789063 2.121094 -2.878906 C 2.726563 -2.988281 3.144531 -3.097656 3.382813 -3.199219 L 3.382813 -3.328125 C 3.378906 -3.574219 3.316406 -3.75 3.195313 -3.859375 C 3.070313 -3.964844 2.839844 -4.019531 2.5 -4.019531 C 2.269531 -4.019531 2.089844 -3.972656 1.960938 -3.882813 C 1.828125 -3.789063 1.722656 -3.628906 1.648438 -3.40625 Z M 3.382813 -2.351563 C 3.210938 -2.292969 2.949219 -2.226563 2.59375 -2.152344 C 2.230469 -2.074219 1.996094 -2 1.886719 -1.929688 C 1.714844 -1.804688 1.628906 -1.652344 1.632813 -1.472656 C 1.628906 -1.285156 1.695313 -1.128906 1.835938 -1 C 1.96875 -0.867188 2.140625 -0.800781 2.351563 -0.804688 C 2.582031 -0.800781 2.808594 -0.878906 3.023438 -1.035156 C 3.179688 -1.148438 3.28125 -1.289063 3.332031 -1.460938 C 3.363281 -1.570313 3.378906 -1.78125 3.382813 -2.09375 Z M 3.382813 -2.351563 "
-           id="path12283" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph0-9">
-        <path
-           style="stroke:none;"
-           d="M 0.582031 -4.898438 L 1.777344 -4.898438 L 1.777344 -4.230469 C 2.199219 -4.75 2.710938 -5.007813 3.304688 -5.011719 C 3.617188 -5.007813 3.886719 -4.945313 4.117188 -4.816406 C 4.347656 -4.6875 4.539063 -4.492188 4.6875 -4.230469 C 4.902344 -4.492188 5.132813 -4.6875 5.382813 -4.816406 C 5.628906 -4.945313 5.894531 -5.007813 6.183594 -5.011719 C 6.539063 -5.007813 6.84375 -4.9375 7.09375 -4.792969 C 7.339844 -4.644531 7.527344 -4.429688 7.65625 -4.148438 C 7.742188 -3.9375 7.785156 -3.597656 7.789063 -3.132813 L 7.789063 0 L 6.492188 0 L 6.492188 -2.800781 C 6.492188 -3.285156 6.445313 -3.597656 6.359375 -3.742188 C 6.234375 -3.925781 6.050781 -4.019531 5.804688 -4.019531 C 5.621094 -4.019531 5.449219 -3.960938 5.289063 -3.851563 C 5.128906 -3.738281 5.015625 -3.578125 4.945313 -3.367188 C 4.875 -3.152344 4.839844 -2.8125 4.839844 -2.351563 L 4.839844 0 L 3.542969 0 L 3.542969 -2.683594 C 3.539063 -3.160156 3.515625 -3.46875 3.472656 -3.609375 C 3.425781 -3.746094 3.355469 -3.847656 3.257813 -3.917969 C 3.160156 -3.984375 3.027344 -4.019531 2.867188 -4.019531 C 2.664063 -4.019531 2.484375 -3.964844 2.324219 -3.855469 C 2.164063 -3.746094 2.050781 -3.589844 1.980469 -3.390625 C 1.910156 -3.183594 1.875 -2.847656 1.878906 -2.382813 L 1.878906 0 L 0.582031 0 Z M 0.582031 -4.898438 "
-           id="path12286" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph0-10">
-        <path
-           style="stroke:none;"
-           d="M 0.6875 0 L 0.6875 -6.765625 L 2.878906 -6.765625 C 3.707031 -6.761719 4.25 -6.726563 4.503906 -6.664063 C 4.890625 -6.558594 5.214844 -6.339844 5.476563 -6 C 5.738281 -5.660156 5.871094 -5.21875 5.871094 -4.683594 C 5.871094 -4.265625 5.792969 -3.917969 5.644531 -3.636719 C 5.488281 -3.351563 5.296875 -3.128906 5.066406 -2.96875 C 4.832031 -2.804688 4.597656 -2.699219 4.359375 -2.648438 C 4.03125 -2.582031 3.558594 -2.546875 2.945313 -2.550781 L 2.054688 -2.550781 L 2.054688 0 Z M 2.054688 -5.621094 L 2.054688 -3.699219 L 2.800781 -3.699219 C 3.339844 -3.695313 3.699219 -3.730469 3.878906 -3.804688 C 4.058594 -3.875 4.199219 -3.988281 4.304688 -4.136719 C 4.40625 -4.285156 4.457031 -4.460938 4.460938 -4.664063 C 4.457031 -4.910156 4.386719 -5.113281 4.242188 -5.273438 C 4.097656 -5.433594 3.914063 -5.53125 3.695313 -5.574219 C 3.53125 -5.601563 3.203125 -5.617188 2.714844 -5.621094 Z M 2.054688 -5.621094 "
-           id="path12289" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph0-11">
-        <path
-           style="stroke:none;"
-           d="M 0.679688 0 L 0.679688 -6.765625 L 1.976563 -6.765625 L 1.976563 0 Z M 0.679688 0 "
-           id="path12292" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph0-12">
-        <path
-           style="stroke:none;"
-           d="M 0.109375 -4.898438 L 0.832031 -4.898438 L 0.832031 -5.269531 C 0.832031 -5.675781 0.875 -5.984375 0.960938 -6.191406 C 1.046875 -6.390625 1.207031 -6.554688 1.445313 -6.6875 C 1.679688 -6.8125 1.976563 -6.878906 2.339844 -6.878906 C 2.707031 -6.878906 3.070313 -6.824219 3.421875 -6.714844 L 3.25 -5.808594 C 3.039063 -5.855469 2.839844 -5.882813 2.652344 -5.882813 C 2.460938 -5.882813 2.328125 -5.835938 2.25 -5.75 C 2.167969 -5.660156 2.125 -5.492188 2.128906 -5.246094 L 2.128906 -4.898438 L 3.097656 -4.898438 L 3.097656 -3.878906 L 2.128906 -3.878906 L 2.128906 0 L 0.832031 0 L 0.832031 -3.878906 L 0.109375 -3.878906 Z M 0.109375 -4.898438 "
-           id="path12295" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph0-13">
-        <path
-           style="stroke:none;"
-           d="M 0.378906 -2.519531 C 0.375 -2.949219 0.480469 -3.363281 0.695313 -3.769531 C 0.90625 -4.167969 1.207031 -4.476563 1.597656 -4.691406 C 1.984375 -4.902344 2.417969 -5.007813 2.902344 -5.011719 C 3.640625 -5.007813 4.25 -4.769531 4.726563 -4.289063 C 5.199219 -3.804688 5.4375 -3.195313 5.4375 -2.464844 C 5.4375 -1.71875 5.195313 -1.105469 4.71875 -0.621094 C 4.234375 -0.136719 3.632813 0.105469 2.910156 0.109375 C 2.460938 0.105469 2.03125 0.0078125 1.625 -0.195313 C 1.214844 -0.394531 0.90625 -0.691406 0.695313 -1.085938 C 0.480469 -1.476563 0.375 -1.957031 0.378906 -2.519531 Z M 1.707031 -2.449219 C 1.703125 -1.964844 1.820313 -1.589844 2.050781 -1.332031 C 2.28125 -1.070313 2.566406 -0.941406 2.90625 -0.945313 C 3.242188 -0.941406 3.523438 -1.070313 3.757813 -1.332031 C 3.984375 -1.589844 4.101563 -1.96875 4.101563 -2.460938 C 4.101563 -2.9375 3.984375 -3.304688 3.757813 -3.566406 C 3.523438 -3.820313 3.242188 -3.949219 2.90625 -3.953125 C 2.566406 -3.949219 2.28125 -3.820313 2.050781 -3.566406 C 1.820313 -3.304688 1.703125 -2.933594 1.707031 -2.449219 Z M 1.707031 -2.449219 "
-           id="path12298" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph0-14">
-        <path
-           style="stroke:none;"
-           d="M 0.707031 0 L 0.707031 -6.765625 L 2.070313 -6.765625 L 2.070313 -3.761719 L 4.832031 -6.765625 L 6.667969 -6.765625 L 4.121094 -4.128906 L 6.804688 0 L 5.039063 0 L 3.179688 -3.175781 L 2.070313 -2.042969 L 2.070313 0 Z M 0.707031 0 "
-           id="path12301" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph0-15">
-        <path
-           style="stroke:none;"
-           d="M 5.136719 0 L 3.839844 0 L 3.839844 -2.5 C 3.835938 -3.027344 3.808594 -3.371094 3.753906 -3.527344 C 3.695313 -3.683594 3.605469 -3.804688 3.484375 -3.890625 C 3.359375 -3.976563 3.210938 -4.019531 3.035156 -4.019531 C 2.808594 -4.019531 2.605469 -3.957031 2.429688 -3.832031 C 2.25 -3.707031 2.128906 -3.542969 2.0625 -3.34375 C 1.996094 -3.136719 1.960938 -2.761719 1.964844 -2.21875 L 1.964844 0 L 0.667969 0 L 0.667969 -4.898438 L 1.875 -4.898438 L 1.875 -4.179688 C 2.300781 -4.730469 2.835938 -5.007813 3.488281 -5.011719 C 3.769531 -5.007813 4.03125 -4.957031 4.273438 -4.855469 C 4.507813 -4.75 4.6875 -4.617188 4.808594 -4.460938 C 4.929688 -4.296875 5.015625 -4.117188 5.0625 -3.917969 C 5.109375 -3.710938 5.132813 -3.421875 5.136719 -3.046875 Z M 5.136719 0 "
-           id="path12304" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph1-0">
-        <path
-           style="stroke:none;"
-           d="M 0.679688 0 L 0.679688 -6.675781 L 5.625 -6.675781 L 5.625 -5.542969 L 2.027344 -5.542969 L 2.027344 -4.066406 L 5.375 -4.066406 L 5.375 -2.941406 L 2.027344 -2.941406 L 2.027344 -1.125 L 5.753906 -1.125 L 5.753906 0 Z M 0.679688 0 "
-           id="path12307" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph1-1">
-        <path
-           style="stroke:none;"
-           d="M 2 0 L 0.0507813 -4.835938 L 1.394531 -4.835938 L 2.304688 -2.367188 L 2.566406 -1.542969 C 2.636719 -1.746094 2.679688 -1.886719 2.699219 -1.957031 C 2.738281 -2.09375 2.785156 -2.230469 2.835938 -2.367188 L 3.753906 -4.835938 L 5.070313 -4.835938 L 3.148438 0 Z M 2 0 "
-           id="path12310" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph1-2">
-        <path
-           style="stroke:none;"
-           d="M 3.46875 -1.539063 L 4.742188 -1.324219 C 4.574219 -0.855469 4.316406 -0.5 3.964844 -0.257813 C 3.609375 -0.015625 3.167969 0.105469 2.640625 0.109375 C 1.796875 0.105469 1.175781 -0.167969 0.773438 -0.714844 C 0.453125 -1.152344 0.292969 -1.707031 0.296875 -2.382813 C 0.292969 -3.179688 0.5 -3.804688 0.921875 -4.261719 C 1.335938 -4.714844 1.867188 -4.945313 2.511719 -4.945313 C 3.230469 -4.945313 3.796875 -4.707031 4.214844 -4.230469 C 4.628906 -3.753906 4.828125 -3.023438 4.8125 -2.046875 L 1.605469 -2.046875 C 1.613281 -1.664063 1.714844 -1.371094 1.914063 -1.164063 C 2.109375 -0.949219 2.355469 -0.84375 2.652344 -0.847656 C 2.851563 -0.84375 3.019531 -0.898438 3.15625 -1.011719 C 3.292969 -1.117188 3.398438 -1.292969 3.46875 -1.539063 Z M 3.542969 -2.832031 C 3.53125 -3.199219 3.433594 -3.480469 3.253906 -3.675781 C 3.066406 -3.863281 2.847656 -3.960938 2.589844 -3.964844 C 2.308594 -3.960938 2.078125 -3.859375 1.898438 -3.660156 C 1.714844 -3.453125 1.625 -3.175781 1.628906 -2.832031 Z M 3.542969 -2.832031 "
-           id="path12313" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph1-3">
-        <path
-           style="stroke:none;"
-           d="M 5.066406 0 L 3.789063 0 L 3.789063 -2.46875 C 3.785156 -2.984375 3.757813 -3.324219 3.707031 -3.480469 C 3.648438 -3.632813 3.558594 -3.75 3.4375 -3.835938 C 3.3125 -3.917969 3.164063 -3.960938 2.996094 -3.964844 C 2.769531 -3.960938 2.570313 -3.898438 2.398438 -3.78125 C 2.21875 -3.65625 2.097656 -3.496094 2.035156 -3.300781 C 1.964844 -3.097656 1.933594 -2.730469 1.9375 -2.191406 L 1.9375 0 L 0.660156 0 L 0.660156 -4.835938 L 1.847656 -4.835938 L 1.847656 -4.125 C 2.265625 -4.671875 2.796875 -4.945313 3.441406 -4.945313 C 3.71875 -4.945313 3.976563 -4.894531 4.214844 -4.792969 C 4.445313 -4.691406 4.625 -4.558594 4.746094 -4.402344 C 4.863281 -4.238281 4.945313 -4.058594 4.996094 -3.863281 C 5.039063 -3.660156 5.0625 -3.375 5.066406 -3.003906 Z M 5.066406 0 "
-           id="path12316" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph1-4">
-        <path
-           style="stroke:none;"
-           d="M 2.886719 -4.835938 L 2.886719 -3.816406 L 2.011719 -3.816406 L 2.011719 -1.867188 C 2.007813 -1.46875 2.015625 -1.238281 2.035156 -1.175781 C 2.050781 -1.109375 2.089844 -1.054688 2.148438 -1.015625 C 2.207031 -0.96875 2.28125 -0.949219 2.367188 -0.953125 C 2.484375 -0.949219 2.65625 -0.988281 2.882813 -1.074219 L 2.992188 -0.0820313 C 2.691406 0.0429688 2.355469 0.105469 1.980469 0.109375 C 1.746094 0.105469 1.535156 0.0703125 1.355469 -0.00390625 C 1.167969 -0.078125 1.035156 -0.179688 0.949219 -0.304688 C 0.863281 -0.429688 0.800781 -0.59375 0.769531 -0.804688 C 0.738281 -0.949219 0.726563 -1.25 0.726563 -1.707031 L 0.726563 -3.816406 L 0.140625 -3.816406 L 0.140625 -4.835938 L 0.726563 -4.835938 L 0.726563 -5.796875 L 2.011719 -6.542969 L 2.011719 -4.835938 Z M 2.886719 -4.835938 "
-           id="path12319" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph2-0">
-        <path
-           style="stroke:none;"
-           d="M 0 0.761719 L 7.441406 0.761719 L 7.441406 2.265625 L 4.511719 2.265625 L 4.511719 5.207031 L 7.441406 5.207031 L 7.441406 6.710938 L 0 6.710938 L 0 5.207031 L 3.253906 5.207031 L 3.253906 2.265625 L 0 2.265625 Z M 0 0.761719 "
-           id="path12322" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph2-1">
-        <path
-           style="stroke:none;"
-           d="M 2.769531 0.417969 C 3.242188 0.414063 3.699219 0.53125 4.144531 0.765625 C 4.585938 0.996094 4.925781 1.328125 5.160156 1.757813 C 5.390625 2.1875 5.507813 2.664063 5.511719 3.191406 C 5.507813 4.003906 5.242188 4.671875 4.714844 5.195313 C 4.183594 5.714844 3.515625 5.976563 2.710938 5.980469 C 1.894531 5.976563 1.21875 5.714844 0.683594 5.191406 C 0.148438 4.664063 -0.117188 4 -0.121094 3.203125 C -0.117188 2.707031 -0.0078125 2.234375 0.210938 1.789063 C 0.433594 1.335938 0.761719 0.996094 1.195313 0.765625 C 1.625 0.53125 2.148438 0.414063 2.769531 0.417969 Z M 2.695313 1.878906 C 2.160156 1.875 1.75 2 1.464844 2.257813 C 1.179688 2.507813 1.035156 2.824219 1.039063 3.199219 C 1.035156 3.570313 1.179688 3.878906 1.464844 4.132813 C 1.75 4.382813 2.164063 4.511719 2.707031 4.511719 C 3.230469 4.511719 3.632813 4.382813 3.921875 4.132813 C 4.203125 3.878906 4.347656 3.570313 4.351563 3.199219 C 4.347656 2.824219 4.203125 2.507813 3.921875 2.257813 C 3.632813 2 3.226563 1.875 2.695313 1.878906 Z M 2.695313 1.878906 "
-           id="path12325" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph2-2">
-        <path
-           style="stroke:none;"
-           d="M 1.539063 0.242188 L 1.757813 1.675781 C 1.476563 1.734375 1.265625 1.855469 1.125 2.042969 C 0.980469 2.226563 0.90625 2.488281 0.910156 2.828125 C 0.90625 3.191406 0.976563 3.46875 1.113281 3.660156 C 1.207031 3.78125 1.332031 3.84375 1.492188 3.847656 C 1.597656 3.84375 1.6875 3.808594 1.761719 3.746094 C 1.828125 3.671875 1.890625 3.515625 1.949219 3.269531 C 2.199219 2.117188 2.429688 1.386719 2.644531 1.082031 C 2.933594 0.652344 3.339844 0.4375 3.859375 0.441406 C 4.324219 0.4375 4.714844 0.621094 5.035156 0.992188 C 5.347656 1.359375 5.507813 1.933594 5.511719 2.710938 C 5.507813 3.445313 5.386719 3.992188 5.152344 4.355469 C 4.910156 4.710938 4.554688 4.960938 4.085938 5.097656 L 3.835938 3.75 C 4.042969 3.691406 4.203125 3.582031 4.320313 3.421875 C 4.429688 3.261719 4.488281 3.03125 4.488281 2.734375 C 4.488281 2.355469 4.433594 2.085938 4.328125 1.929688 C 4.253906 1.816406 4.15625 1.761719 4.039063 1.765625 C 3.933594 1.761719 3.847656 1.8125 3.78125 1.910156 C 3.683594 2.035156 3.550781 2.476563 3.378906 3.238281 C 3.203125 3.996094 2.992188 4.527344 2.746094 4.832031 C 2.492188 5.125 2.136719 5.273438 1.683594 5.277344 C 1.1875 5.273438 0.765625 5.066406 0.410156 4.65625 C 0.0585938 4.242188 -0.117188 3.632813 -0.121094 2.828125 C -0.117188 2.089844 0.0273438 1.507813 0.324219 1.082031 C 0.621094 0.652344 1.027344 0.371094 1.539063 0.242188 Z M 1.539063 0.242188 "
-           id="path12328" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph2-3">
-        <path
-           style="stroke:none;"
-           d="M 5.390625 3.21875 L 4.253906 3.21875 L 4.253906 2.242188 L 2.082031 2.242188 C 1.636719 2.238281 1.382813 2.246094 1.3125 2.269531 C 1.238281 2.285156 1.175781 2.328125 1.132813 2.398438 C 1.082031 2.460938 1.058594 2.542969 1.0625 2.640625 C 1.058594 2.769531 1.105469 2.960938 1.199219 3.214844 L 0.0898438 3.335938 C -0.0507813 3 -0.117188 2.625 -0.121094 2.207031 C -0.117188 1.945313 -0.078125 1.714844 0.0078125 1.511719 C 0.0898438 1.304688 0.203125 1.152344 0.339844 1.058594 C 0.476563 0.960938 0.660156 0.894531 0.898438 0.859375 C 1.0625 0.824219 1.398438 0.808594 1.902344 0.8125 L 4.253906 0.8125 L 4.253906 0.15625 L 5.390625 0.15625 L 5.390625 0.8125 L 6.460938 0.8125 L 7.292969 2.242188 L 5.390625 2.242188 Z M 5.390625 3.21875 "
-           id="path12331" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph2-4">
-        <path
-           style="stroke:none;"
-           d="M 3.746094 1.8125 L 3.980469 0.519531 C 4.496094 0.660156 4.882813 0.910156 5.136719 1.265625 C 5.382813 1.621094 5.507813 2.148438 5.511719 2.851563 C 5.507813 3.488281 5.433594 3.960938 5.285156 4.273438 C 5.132813 4.582031 4.941406 4.800781 4.710938 4.929688 C 4.476563 5.054688 4.050781 5.121094 3.4375 5.121094 L 1.769531 5.105469 C 1.296875 5.101563 0.945313 5.125 0.722656 5.171875 C 0.496094 5.214844 0.257813 5.300781 0 5.429688 L 0 4.019531 C 0.09375 3.980469 0.234375 3.933594 0.421875 3.882813 C 0.503906 3.855469 0.558594 3.839844 0.589844 3.832031 C 0.351563 3.585938 0.171875 3.324219 0.0546875 3.046875 C -0.0625 2.769531 -0.117188 2.472656 -0.121094 2.164063 C -0.117188 1.601563 0.03125 1.164063 0.328125 0.847656 C 0.628906 0.527344 1.007813 0.367188 1.472656 0.371094 C 1.773438 0.367188 2.046875 0.441406 2.285156 0.589844 C 2.523438 0.734375 2.703125 0.9375 2.832031 1.199219 C 2.957031 1.460938 3.070313 1.839844 3.167969 2.335938 C 3.289063 3 3.40625 3.460938 3.515625 3.71875 L 3.660156 3.71875 C 3.929688 3.714844 4.125 3.648438 4.246094 3.515625 C 4.359375 3.378906 4.417969 3.121094 4.421875 2.75 C 4.417969 2.492188 4.367188 2.296875 4.269531 2.15625 C 4.167969 2.015625 3.992188 1.898438 3.746094 1.8125 Z M 2.589844 3.71875 C 2.527344 3.535156 2.453125 3.246094 2.367188 2.851563 C 2.28125 2.453125 2.199219 2.195313 2.121094 2.074219 C 1.988281 1.886719 1.820313 1.792969 1.621094 1.796875 C 1.417969 1.792969 1.246094 1.867188 1.101563 2.019531 C 0.953125 2.164063 0.878906 2.355469 0.882813 2.589844 C 0.878906 2.84375 0.964844 3.085938 1.136719 3.324219 C 1.261719 3.492188 1.417969 3.605469 1.609375 3.664063 C 1.730469 3.695313 1.960938 3.714844 2.304688 3.71875 Z M 2.589844 3.71875 "
-           id="path12334" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph2-5">
-        <path
-           style="stroke:none;"
-           d="M 0 0.746094 L 7.441406 0.746094 L 7.441406 2.171875 L 0 2.171875 Z M 0 0.746094 "
-           id="path12337" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph2-6">
-        <path
-           style="stroke:none;"
-           d="M 5.390625 0.707031 L 5.390625 2.035156 L 4.597656 2.035156 C 4.867188 2.203125 5.085938 2.4375 5.257813 2.734375 C 5.421875 3.027344 5.507813 3.355469 5.511719 3.714844 C 5.507813 4.339844 5.261719 4.871094 4.773438 5.308594 C 4.28125 5.746094 3.597656 5.964844 2.726563 5.964844 C 1.820313 5.964844 1.121094 5.742188 0.625 5.304688 C 0.128906 4.859375 -0.117188 4.328125 -0.121094 3.707031 C -0.117188 3.40625 -0.0625 3.136719 0.0546875 2.894531 C 0.171875 2.652344 0.375 2.398438 0.664063 2.132813 L -2.050781 2.132813 L -2.050781 0.707031 Z M 2.785156 2.117188 C 2.175781 2.113281 1.730469 2.234375 1.445313 2.476563 C 1.152344 2.714844 1.007813 3.007813 1.011719 3.355469 C 1.007813 3.683594 1.140625 3.957031 1.410156 4.179688 C 1.671875 4.398438 2.105469 4.511719 2.714844 4.511719 C 3.277344 4.511719 3.699219 4.398438 3.972656 4.171875 C 4.246094 3.945313 4.382813 3.664063 4.386719 3.328125 C 4.382813 2.980469 4.246094 2.691406 3.980469 2.460938 C 3.707031 2.230469 3.308594 2.113281 2.785156 2.117188 Z M 2.785156 2.117188 "
-           id="path12340" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph2-7">
-        <path
-           style="stroke:none;"
-           d="M 0 0.695313 L 7.441406 0.695313 L 7.441406 2.121094 L 3.492188 2.121094 L 5.390625 3.792969 L 5.390625 5.546875 L 3.421875 3.707031 L 0 5.679688 L 0 4.140625 L 2.421875 2.785156 L 1.726563 2.121094 L 0 2.121094 Z M 0 0.695313 "
-           id="path12343" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph2-8">
-        <path
-           style="stroke:none;"
-           d="M 7.441406 0.746094 L 7.441406 2.25 L 3.410156 2.25 C 2.769531 2.25 2.355469 2.265625 2.167969 2.304688 C 1.863281 2.367188 1.617188 2.519531 1.433594 2.761719 C 1.246094 3 1.152344 3.332031 1.15625 3.757813 C 1.152344 4.179688 1.242188 4.5 1.417969 4.71875 C 1.589844 4.933594 1.800781 5.066406 2.058594 5.109375 C 2.308594 5.152344 2.730469 5.171875 3.324219 5.175781 L 7.441406 5.175781 L 7.441406 6.679688 L 3.53125 6.679688 C 2.636719 6.679688 2.003906 6.636719 1.636719 6.558594 C 1.265625 6.472656 0.957031 6.324219 0.703125 6.109375 C 0.449219 5.886719 0.246094 5.59375 0.0976563 5.230469 C -0.0507813 4.863281 -0.121094 4.386719 -0.125 3.800781 C -0.121094 3.085938 -0.0429688 2.546875 0.121094 2.183594 C 0.28125 1.8125 0.492188 1.523438 0.757813 1.3125 C 1.015625 1.101563 1.292969 0.960938 1.582031 0.894531 C 2.007813 0.792969 2.636719 0.742188 3.472656 0.746094 Z M 7.441406 0.746094 "
-           id="path12346" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph2-9">
-        <path
-           style="stroke:none;"
-           d="M 1.714844 3.867188 L 1.476563 5.289063 C 0.953125 5.105469 0.554688 4.816406 0.285156 4.421875 C 0.015625 4.027344 -0.117188 3.535156 -0.121094 2.945313 C -0.117188 2.003906 0.1875 1.308594 0.796875 0.863281 C 1.285156 0.503906 1.902344 0.328125 2.65625 0.328125 C 3.546875 0.328125 4.246094 0.558594 4.753906 1.027344 C 5.253906 1.492188 5.507813 2.085938 5.511719 2.800781 C 5.507813 3.601563 5.242188 4.234375 4.714844 4.699219 C 4.183594 5.160156 3.375 5.378906 2.285156 5.363281 L 2.285156 1.792969 C 1.859375 1.800781 1.527344 1.914063 1.296875 2.136719 C 1.058594 2.351563 0.941406 2.625 0.945313 2.960938 C 0.941406 3.179688 1.003906 3.367188 1.125 3.519531 C 1.246094 3.671875 1.441406 3.789063 1.714844 3.867188 Z M 3.15625 3.949219 C 3.566406 3.9375 3.878906 3.828125 4.097656 3.628906 C 4.308594 3.421875 4.417969 3.175781 4.421875 2.886719 C 4.417969 2.574219 4.304688 2.316406 4.082031 2.117188 C 3.851563 1.910156 3.542969 1.8125 3.15625 1.816406 Z M 3.15625 3.949219 "
-           id="path12349" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph2-10">
-        <path
-           style="stroke:none;"
-           d="M 0 2.113281 L 0 0.683594 L 5.390625 0.683594 L 5.390625 2.011719 L 4.625 2.011719 C 4.984375 2.234375 5.222656 2.4375 5.339844 2.621094 C 5.449219 2.800781 5.507813 3.007813 5.511719 3.238281 C 5.507813 3.5625 5.417969 3.875 5.242188 4.175781 L 4 3.734375 C 4.152344 3.492188 4.230469 3.269531 4.234375 3.066406 C 4.230469 2.863281 4.175781 2.695313 4.066406 2.558594 C 3.957031 2.417969 3.757813 2.308594 3.46875 2.230469 C 3.179688 2.148438 2.578125 2.109375 1.664063 2.113281 Z M 0 2.113281 "
-           id="path12352" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph3-0">
-        <path
-           style="stroke:none;"
-           d="M 0 7.371094 L 0 5.757813 L 1.667969 5.117188 L 1.667969 2.179688 L 0 1.574219 L 0 0 L 7.347656 2.863281 L 7.347656 4.429688 Z M 2.90625 4.640625 L 5.632813 3.628906 L 2.90625 2.636719 Z M 2.90625 4.640625 "
-           id="path12355" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph3-1">
-        <path
-           style="stroke:none;"
-           d="M 3.75 5.378906 L 3.5 3.988281 C 3.773438 3.9375 3.984375 3.832031 4.125 3.667969 C 4.265625 3.503906 4.335938 3.289063 4.335938 3.027344 C 4.335938 2.671875 4.214844 2.390625 3.972656 2.1875 C 3.730469 1.976563 3.324219 1.875 2.757813 1.875 C 2.121094 1.875 1.675781 1.980469 1.417969 2.191406 C 1.15625 2.402344 1.023438 2.6875 1.027344 3.046875 C 1.023438 3.3125 1.101563 3.53125 1.253906 3.703125 C 1.40625 3.875 1.667969 3.996094 2.039063 4.066406 L 1.804688 5.449219 C 1.167969 5.304688 0.6875 5.027344 0.363281 4.621094 C 0.0429688 4.210938 -0.117188 3.664063 -0.121094 2.980469 C -0.117188 2.199219 0.125 1.578125 0.613281 1.117188 C 1.101563 0.652344 1.78125 0.421875 2.65625 0.425781 C 3.53125 0.421875 4.214844 0.652344 4.707031 1.121094 C 5.191406 1.582031 5.4375 2.210938 5.441406 3.007813 C 5.4375 3.652344 5.296875 4.167969 5.023438 4.554688 C 4.742188 4.933594 4.320313 5.210938 3.75 5.378906 Z M 3.75 5.378906 "
-           id="path12358" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph3-2">
-        <path
-           style="stroke:none;"
-           d="M 1.695313 3.820313 L 1.457031 5.222656 C 0.941406 5.039063 0.550781 4.753906 0.28125 4.367188 C 0.015625 3.976563 -0.117188 3.492188 -0.121094 2.90625 C -0.117188 1.980469 0.183594 1.292969 0.785156 0.851563 C 1.269531 0.496094 1.878906 0.320313 2.621094 0.324219 C 3.5 0.320313 4.191406 0.550781 4.691406 1.015625 C 5.1875 1.476563 5.4375 2.0625 5.441406 2.765625 C 5.4375 3.554688 5.175781 4.179688 4.65625 4.640625 C 4.132813 5.097656 3.332031 5.316406 2.253906 5.296875 L 2.253906 1.769531 C 1.835938 1.777344 1.511719 1.890625 1.28125 2.109375 C 1.046875 2.328125 0.929688 2.597656 0.933594 2.921875 C 0.929688 3.140625 0.992188 3.324219 1.113281 3.476563 C 1.230469 3.625 1.421875 3.742188 1.695313 3.820313 Z M 3.117188 3.898438 C 3.523438 3.882813 3.832031 3.777344 4.046875 3.582031 C 4.257813 3.378906 4.363281 3.136719 4.367188 2.851563 C 4.363281 2.542969 4.253906 2.289063 4.03125 2.089844 C 3.804688 1.886719 3.5 1.789063 3.117188 1.792969 Z M 3.117188 3.898438 "
-           id="path12361" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph3-3">
-        <path
-           style="stroke:none;"
-           d="M 0 0.738281 L 7.347656 0.738281 L 7.347656 2.144531 L 0 2.144531 Z M 0 0.738281 "
-           id="path12364" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph3-4">
-        <path
-           style="stroke:none;"
-           d="M 0 2.085938 L 0 0.675781 L 5.324219 0.675781 L 5.324219 1.984375 L 4.566406 1.984375 C 4.917969 2.207031 5.152344 2.40625 5.269531 2.585938 C 5.378906 2.761719 5.4375 2.96875 5.441406 3.199219 C 5.4375 3.519531 5.347656 3.828125 5.175781 4.125 L 3.949219 3.6875 C 4.101563 3.445313 4.175781 3.226563 4.179688 3.027344 C 4.175781 2.828125 4.121094 2.660156 4.015625 2.523438 C 3.90625 2.386719 3.710938 2.28125 3.429688 2.203125 C 3.140625 2.125 2.546875 2.085938 1.644531 2.085938 Z M 0 2.085938 "
-           id="path12367" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph3-5">
-        <path
-           style="stroke:none;"
-           d="M 3.699219 1.789063 L 3.929688 0.511719 C 4.441406 0.652344 4.820313 0.902344 5.070313 1.253906 C 5.3125 1.605469 5.4375 2.125 5.441406 2.816406 C 5.4375 3.441406 5.363281 3.910156 5.21875 4.21875 C 5.066406 4.523438 4.878906 4.738281 4.652344 4.867188 C 4.421875 4.992188 4 5.054688 3.394531 5.058594 L 1.75 5.042969 C 1.277344 5.039063 0.933594 5.0625 0.714844 5.109375 C 0.492188 5.152344 0.253906 5.234375 0 5.363281 L 0 3.96875 C 0.09375 3.929688 0.230469 3.886719 0.414063 3.835938 C 0.496094 3.808594 0.550781 3.792969 0.582031 3.785156 C 0.34375 3.542969 0.167969 3.285156 0.0546875 3.011719 C -0.0625 2.738281 -0.117188 2.445313 -0.121094 2.136719 C -0.117188 1.585938 0.0273438 1.152344 0.324219 0.839844 C 0.621094 0.519531 1 0.363281 1.453125 0.367188 C 1.753906 0.363281 2.019531 0.433594 2.257813 0.582031 C 2.492188 0.722656 2.675781 0.925781 2.800781 1.183594 C 2.925781 1.441406 3.035156 1.8125 3.128906 2.304688 C 3.25 2.960938 3.363281 3.417969 3.472656 3.675781 L 3.613281 3.675781 C 3.882813 3.671875 4.074219 3.605469 4.191406 3.472656 C 4.304688 3.335938 4.363281 3.082031 4.367188 2.714844 C 4.363281 2.460938 4.316406 2.265625 4.21875 2.128906 C 4.117188 1.984375 3.941406 1.871094 3.699219 1.789063 Z M 2.554688 3.675781 C 2.492188 3.488281 2.417969 3.203125 2.339844 2.816406 C 2.253906 2.421875 2.171875 2.167969 2.09375 2.050781 C 1.960938 1.863281 1.796875 1.769531 1.597656 1.773438 C 1.398438 1.769531 1.226563 1.84375 1.085938 1.992188 C 0.9375 2.140625 0.867188 2.328125 0.871094 2.554688 C 0.867188 2.808594 0.949219 3.050781 1.121094 3.28125 C 1.246094 3.453125 1.402344 3.5625 1.589844 3.617188 C 1.707031 3.652344 1.9375 3.671875 2.277344 3.675781 Z M 2.554688 3.675781 "
-           id="path12370" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph3-6">
-        <path
-           style="stroke:none;"
-           d="M 5.324219 3.179688 L 4.199219 3.179688 L 4.199219 2.214844 L 2.054688 2.214844 C 1.617188 2.214844 1.363281 2.222656 1.292969 2.242188 C 1.21875 2.257813 1.160156 2.300781 1.117188 2.367188 C 1.066406 2.429688 1.042969 2.507813 1.046875 2.605469 C 1.042969 2.734375 1.089844 2.921875 1.183594 3.171875 L 0.0898438 3.292969 C -0.0507813 2.964844 -0.117188 2.59375 -0.121094 2.179688 C -0.117188 1.925781 -0.078125 1.695313 0.0078125 1.492188 C 0.0898438 1.285156 0.203125 1.136719 0.339844 1.042969 C 0.472656 0.949219 0.652344 0.882813 0.886719 0.847656 C 1.046875 0.8125 1.378906 0.796875 1.878906 0.800781 L 4.199219 0.800781 L 4.199219 0.15625 L 5.324219 0.15625 L 5.324219 0.800781 L 6.378906 0.800781 L 7.203125 2.214844 L 5.324219 2.214844 Z M 5.324219 3.179688 "
-           id="path12373" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph3-7">
-        <path
-           style="stroke:none;"
-           d="M 2.738281 0.410156 C 3.199219 0.410156 3.652344 0.523438 4.09375 0.753906 C 4.527344 0.984375 4.859375 1.3125 5.09375 1.734375 C 5.320313 2.15625 5.4375 2.628906 5.441406 3.152344 C 5.4375 3.957031 5.175781 4.617188 4.65625 5.132813 C 4.132813 5.648438 3.472656 5.90625 2.675781 5.90625 C 1.867188 5.90625 1.199219 5.644531 0.671875 5.125 C 0.144531 4.601563 -0.117188 3.949219 -0.121094 3.164063 C -0.117188 2.671875 -0.0078125 2.203125 0.210938 1.765625 C 0.429688 1.320313 0.75 0.984375 1.179688 0.753906 C 1.601563 0.523438 2.121094 0.410156 2.738281 0.410156 Z M 2.660156 1.855469 C 2.132813 1.851563 1.726563 1.976563 1.449219 2.230469 C 1.164063 2.476563 1.023438 2.785156 1.027344 3.15625 C 1.023438 3.523438 1.164063 3.832031 1.449219 4.082031 C 1.726563 4.328125 2.136719 4.453125 2.671875 4.457031 C 3.191406 4.453125 3.589844 4.328125 3.875 4.082031 C 4.152344 3.832031 4.292969 3.523438 4.296875 3.15625 C 4.292969 2.785156 4.152344 2.476563 3.875 2.230469 C 3.589844 1.976563 3.1875 1.851563 2.660156 1.855469 Z M 2.660156 1.855469 "
-           id="path12376" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph4-0">
-        <path
-           style="stroke:none;"
-           d="M 0.675781 0 L 0.675781 -6.667969 L 5.621094 -6.667969 L 5.621094 -5.539063 L 2.023438 -5.539063 L 2.023438 -4.0625 L 5.371094 -4.0625 L 5.371094 -2.9375 L 2.023438 -2.9375 L 2.023438 -1.125 L 5.746094 -1.125 L 5.746094 0 Z M 0.675781 0 "
-           id="path12379" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph4-1">
-        <path
-           style="stroke:none;"
-           d="M 0.0546875 0 L 1.796875 -2.488281 L 0.128906 -4.828125 L 1.6875 -4.828125 L 2.542969 -3.5 L 3.441406 -4.828125 L 4.941406 -4.828125 L 3.304688 -2.542969 L 5.09375 0 L 3.523438 0 L 2.542969 -1.496094 L 1.550781 0 Z M 0.0546875 0 "
-           id="path12382" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph4-2">
-        <path
-           style="stroke:none;"
-           d="M 3.464844 -1.535156 L 4.738281 -1.324219 C 4.570313 -0.855469 4.3125 -0.496094 3.960938 -0.253906 C 3.609375 -0.0117188 3.167969 0.105469 2.636719 0.109375 C 1.796875 0.105469 1.175781 -0.167969 0.773438 -0.714844 C 0.453125 -1.152344 0.292969 -1.707031 0.296875 -2.378906 C 0.292969 -3.175781 0.5 -3.800781 0.921875 -4.257813 C 1.335938 -4.707031 1.867188 -4.933594 2.511719 -4.9375 C 3.226563 -4.933594 3.792969 -4.695313 4.207031 -4.226563 C 4.621094 -3.75 4.820313 -3.023438 4.804688 -2.046875 L 1.605469 -2.046875 C 1.613281 -1.664063 1.714844 -1.371094 1.914063 -1.160156 C 2.109375 -0.949219 2.355469 -0.84375 2.652344 -0.847656 C 2.851563 -0.84375 3.019531 -0.898438 3.15625 -1.007813 C 3.289063 -1.117188 3.390625 -1.292969 3.464844 -1.535156 Z M 3.539063 -2.828125 C 3.527344 -3.195313 3.429688 -3.476563 3.25 -3.671875 C 3.066406 -3.863281 2.84375 -3.960938 2.585938 -3.960938 C 2.304688 -3.960938 2.074219 -3.859375 1.894531 -3.65625 C 1.710938 -3.453125 1.625 -3.175781 1.628906 -2.828125 Z M 3.539063 -2.828125 "
-           id="path12385" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph4-3">
-        <path
-           style="stroke:none;"
-           d="M 4.878906 -3.402344 L 3.621094 -3.175781 C 3.574219 -3.425781 3.476563 -3.613281 3.328125 -3.742188 C 3.179688 -3.867188 2.984375 -3.929688 2.746094 -3.933594 C 2.425781 -3.929688 2.171875 -3.820313 1.984375 -3.601563 C 1.792969 -3.378906 1.699219 -3.011719 1.699219 -2.5 C 1.699219 -1.925781 1.792969 -1.519531 1.988281 -1.285156 C 2.175781 -1.046875 2.4375 -0.929688 2.765625 -0.933594 C 3.007813 -0.929688 3.203125 -1 3.359375 -1.140625 C 3.511719 -1.277344 3.621094 -1.515625 3.6875 -1.851563 L 4.941406 -1.636719 C 4.808594 -1.058594 4.558594 -0.625 4.191406 -0.332031 C 3.816406 -0.0390625 3.324219 0.105469 2.707031 0.109375 C 1.996094 0.105469 1.429688 -0.113281 1.015625 -0.558594 C 0.59375 -1 0.386719 -1.617188 0.386719 -2.410156 C 0.386719 -3.203125 0.597656 -3.824219 1.019531 -4.269531 C 1.441406 -4.710938 2.007813 -4.933594 2.726563 -4.9375 C 3.3125 -4.933594 3.78125 -4.808594 4.128906 -4.558594 C 4.476563 -4.304688 4.726563 -3.917969 4.878906 -3.402344 Z M 4.878906 -3.402344 "
-           id="path12388" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph4-4">
-        <path
-           style="stroke:none;"
-           d="M 3.847656 0 L 3.847656 -0.722656 C 3.667969 -0.460938 3.4375 -0.257813 3.152344 -0.113281 C 2.863281 0.0351563 2.558594 0.105469 2.246094 0.109375 C 1.917969 0.105469 1.628906 0.0390625 1.375 -0.101563 C 1.113281 -0.242188 0.925781 -0.445313 0.8125 -0.703125 C 0.695313 -0.960938 0.636719 -1.316406 0.640625 -1.773438 L 0.640625 -4.828125 L 1.917969 -4.828125 L 1.917969 -2.609375 C 1.914063 -1.925781 1.9375 -1.511719 1.988281 -1.359375 C 2.03125 -1.207031 2.117188 -1.085938 2.246094 -0.996094 C 2.367188 -0.90625 2.527344 -0.859375 2.71875 -0.863281 C 2.9375 -0.859375 3.132813 -0.921875 3.304688 -1.042969 C 3.476563 -1.160156 3.59375 -1.308594 3.660156 -1.488281 C 3.722656 -1.664063 3.753906 -2.097656 3.757813 -2.792969 L 3.757813 -4.828125 L 5.035156 -4.828125 L 5.035156 0 Z M 3.847656 0 "
-           id="path12391" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph4-5">
-        <path
-           style="stroke:none;"
-           d="M 2.882813 -4.828125 L 2.882813 -3.8125 L 2.011719 -3.8125 L 2.011719 -1.863281 C 2.007813 -1.46875 2.015625 -1.238281 2.035156 -1.175781 C 2.046875 -1.109375 2.085938 -1.054688 2.148438 -1.011719 C 2.207031 -0.96875 2.277344 -0.949219 2.363281 -0.949219 C 2.480469 -0.949219 2.652344 -0.988281 2.878906 -1.074219 L 2.988281 -0.0820313 C 2.691406 0.0429688 2.351563 0.105469 1.976563 0.109375 C 1.742188 0.105469 1.535156 0.0703125 1.355469 -0.00390625 C 1.167969 -0.078125 1.035156 -0.179688 0.949219 -0.304688 C 0.863281 -0.429688 0.800781 -0.59375 0.769531 -0.804688 C 0.738281 -0.949219 0.726563 -1.25 0.726563 -1.707031 L 0.726563 -3.8125 L 0.140625 -3.8125 L 0.140625 -4.828125 L 0.726563 -4.828125 L 0.726563 -5.789063 L 2.011719 -6.535156 L 2.011719 -4.828125 Z M 2.882813 -4.828125 "
-           id="path12394" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph4-6">
-        <path
-           style="stroke:none;"
-           d="M 0.371094 -2.484375 C 0.367188 -2.90625 0.472656 -3.316406 0.683594 -3.714844 C 0.890625 -4.113281 1.1875 -4.414063 1.574219 -4.625 C 1.957031 -4.828125 2.386719 -4.933594 2.859375 -4.9375 C 3.589844 -4.933594 4.1875 -4.695313 4.65625 -4.226563 C 5.121094 -3.75 5.355469 -3.152344 5.355469 -2.429688 C 5.355469 -1.695313 5.117188 -1.089844 4.648438 -0.609375 C 4.171875 -0.132813 3.582031 0.105469 2.871094 0.109375 C 2.425781 0.105469 2 0.0078125 1.601563 -0.191406 C 1.195313 -0.390625 0.890625 -0.683594 0.683594 -1.070313 C 0.472656 -1.457031 0.367188 -1.929688 0.371094 -2.484375 Z M 1.683594 -2.414063 C 1.679688 -1.933594 1.792969 -1.566406 2.023438 -1.3125 C 2.246094 -1.058594 2.527344 -0.929688 2.863281 -0.933594 C 3.195313 -0.929688 3.476563 -1.058594 3.703125 -1.3125 C 3.929688 -1.566406 4.042969 -1.933594 4.042969 -2.421875 C 4.042969 -2.890625 3.929688 -3.257813 3.703125 -3.515625 C 3.476563 -3.769531 3.195313 -3.894531 2.863281 -3.898438 C 2.527344 -3.894531 2.246094 -3.769531 2.023438 -3.515625 C 1.792969 -3.257813 1.679688 -2.890625 1.683594 -2.414063 Z M 1.683594 -2.414063 "
-           id="path12397" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph4-7">
-        <path
-           style="stroke:none;"
-           d="M 1.890625 0 L 0.613281 0 L 0.613281 -4.828125 L 1.800781 -4.828125 L 1.800781 -4.144531 C 2 -4.464844 2.183594 -4.679688 2.347656 -4.78125 C 2.507813 -4.882813 2.691406 -4.933594 2.902344 -4.9375 C 3.1875 -4.933594 3.46875 -4.855469 3.742188 -4.699219 L 3.347656 -3.582031 C 3.128906 -3.71875 2.929688 -3.789063 2.746094 -3.792969 C 2.5625 -3.789063 2.410156 -3.742188 2.289063 -3.644531 C 2.164063 -3.542969 2.066406 -3.363281 1.996094 -3.109375 C 1.921875 -2.847656 1.886719 -2.308594 1.890625 -1.492188 Z M 1.890625 0 "
-           id="path12400" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph5-0">
-        <path
-           style="stroke:none;"
-           d="M 0.734375 0 L 0.734375 -7.441406 L 2.984375 -7.441406 L 4.335938 -2.367188 L 5.667969 -7.441406 L 7.921875 -7.441406 L 7.921875 0 L 6.527344 0 L 6.527344 -5.855469 L 5.050781 0 L 3.605469 0 L 2.132813 -5.855469 L 2.132813 0 Z M 0.734375 0 "
-           id="path12403" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph5-1">
-        <path
-           style="stroke:none;"
-           d="M 1.8125 -3.746094 L 0.519531 -3.980469 C 0.660156 -4.496094 0.910156 -4.882813 1.265625 -5.136719 C 1.621094 -5.382813 2.148438 -5.507813 2.851563 -5.511719 C 3.488281 -5.507813 3.960938 -5.433594 4.273438 -5.285156 C 4.582031 -5.132813 4.800781 -4.941406 4.929688 -4.710938 C 5.054688 -4.476563 5.121094 -4.050781 5.121094 -3.4375 L 5.105469 -1.769531 C 5.101563 -1.296875 5.125 -0.945313 5.171875 -0.722656 C 5.214844 -0.496094 5.300781 -0.257813 5.429688 0 L 4.019531 0 C 3.980469 -0.09375 3.933594 -0.234375 3.882813 -0.421875 C 3.855469 -0.503906 3.839844 -0.558594 3.832031 -0.589844 C 3.585938 -0.351563 3.324219 -0.171875 3.046875 -0.0546875 C 2.769531 0.0625 2.472656 0.117188 2.164063 0.121094 C 1.601563 0.117188 1.164063 -0.03125 0.847656 -0.328125 C 0.527344 -0.628906 0.367188 -1.007813 0.371094 -1.472656 C 0.367188 -1.773438 0.441406 -2.046875 0.589844 -2.285156 C 0.734375 -2.523438 0.9375 -2.703125 1.199219 -2.832031 C 1.460938 -2.957031 1.839844 -3.070313 2.335938 -3.167969 C 3 -3.289063 3.460938 -3.40625 3.71875 -3.515625 L 3.71875 -3.660156 C 3.714844 -3.929688 3.648438 -4.125 3.515625 -4.246094 C 3.378906 -4.359375 3.121094 -4.417969 2.75 -4.421875 C 2.492188 -4.417969 2.296875 -4.367188 2.15625 -4.269531 C 2.015625 -4.167969 1.898438 -3.992188 1.8125 -3.746094 Z M 3.71875 -2.589844 C 3.535156 -2.527344 3.246094 -2.453125 2.851563 -2.367188 C 2.453125 -2.28125 2.195313 -2.199219 2.074219 -2.121094 C 1.886719 -1.988281 1.792969 -1.820313 1.796875 -1.621094 C 1.792969 -1.417969 1.867188 -1.246094 2.019531 -1.101563 C 2.164063 -0.953125 2.355469 -0.878906 2.589844 -0.882813 C 2.84375 -0.878906 3.085938 -0.964844 3.324219 -1.136719 C 3.492188 -1.261719 3.605469 -1.417969 3.664063 -1.609375 C 3.695313 -1.730469 3.714844 -1.960938 3.71875 -2.304688 Z M 3.71875 -2.589844 "
-           id="path12406" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph5-2">
-        <path
-           style="stroke:none;"
-           d="M 0.746094 -6.121094 L 0.746094 -7.441406 L 2.171875 -7.441406 L 2.171875 -6.121094 Z M 0.746094 0 L 0.746094 -5.390625 L 2.171875 -5.390625 L 2.171875 0 Z M 0.746094 0 "
-           id="path12409" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph5-3">
-        <path
-           style="stroke:none;"
-           d="M 5.648438 0 L 4.222656 0 L 4.222656 -2.75 C 4.21875 -3.328125 4.1875 -3.707031 4.128906 -3.878906 C 4.066406 -4.050781 3.96875 -4.183594 3.832031 -4.277344 C 3.695313 -4.371094 3.53125 -4.417969 3.339844 -4.421875 C 3.089844 -4.417969 2.867188 -4.351563 2.671875 -4.21875 C 2.476563 -4.082031 2.339844 -3.902344 2.269531 -3.679688 C 2.195313 -3.453125 2.160156 -3.039063 2.164063 -2.441406 L 2.164063 0 L 0.734375 0 L 0.734375 -5.390625 L 2.0625 -5.390625 L 2.0625 -4.597656 C 2.53125 -5.203125 3.121094 -5.507813 3.835938 -5.511719 C 4.148438 -5.507813 4.4375 -5.449219 4.699219 -5.339844 C 4.957031 -5.222656 5.152344 -5.078125 5.289063 -4.90625 C 5.421875 -4.726563 5.515625 -4.527344 5.570313 -4.308594 C 5.621094 -4.082031 5.648438 -3.765625 5.648438 -3.351563 Z M 5.648438 0 "
-           id="path12412" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph5-4">
-        <path
-           style="stroke:none;"
-           d=""
-           id="path12415" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph5-5">
-        <path
-           style="stroke:none;"
-           d="M 5.515625 -2.734375 L 6.972656 -2.273438 C 6.75 -1.460938 6.378906 -0.855469 5.859375 -0.464844 C 5.339844 -0.0703125 4.679688 0.121094 3.882813 0.125 C 2.894531 0.121094 2.082031 -0.210938 1.445313 -0.886719 C 0.808594 -1.558594 0.492188 -2.484375 0.492188 -3.65625 C 0.492188 -4.890625 0.808594 -5.851563 1.449219 -6.539063 C 2.085938 -7.222656 2.929688 -7.5625 3.972656 -7.566406 C 4.882813 -7.5625 5.621094 -7.296875 6.191406 -6.761719 C 6.527344 -6.441406 6.78125 -5.984375 6.953125 -5.390625 L 5.464844 -5.035156 C 5.375 -5.417969 5.191406 -5.722656 4.914063 -5.949219 C 4.632813 -6.167969 4.292969 -6.28125 3.898438 -6.285156 C 3.34375 -6.28125 2.898438 -6.082031 2.554688 -5.691406 C 2.210938 -5.292969 2.039063 -4.652344 2.039063 -3.765625 C 2.039063 -2.824219 2.207031 -2.152344 2.546875 -1.753906 C 2.882813 -1.351563 3.324219 -1.152344 3.867188 -1.15625 C 4.265625 -1.152344 4.605469 -1.28125 4.894531 -1.535156 C 5.179688 -1.789063 5.386719 -2.1875 5.515625 -2.734375 Z M 5.515625 -2.734375 "
-           id="path12418" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph5-6">
-        <path
-           style="stroke:none;"
-           d="M 0.417969 -2.769531 C 0.414063 -3.242188 0.53125 -3.699219 0.765625 -4.144531 C 0.996094 -4.585938 1.328125 -4.925781 1.757813 -5.160156 C 2.1875 -5.390625 2.664063 -5.507813 3.191406 -5.511719 C 4.003906 -5.507813 4.671875 -5.242188 5.195313 -4.714844 C 5.714844 -4.183594 5.976563 -3.515625 5.980469 -2.710938 C 5.976563 -1.894531 5.714844 -1.21875 5.191406 -0.683594 C 4.664063 -0.148438 4 0.117188 3.203125 0.121094 C 2.707031 0.117188 2.234375 0.0078125 1.789063 -0.210938 C 1.335938 -0.433594 0.996094 -0.761719 0.765625 -1.195313 C 0.53125 -1.625 0.414063 -2.148438 0.417969 -2.769531 Z M 1.878906 -2.695313 C 1.875 -2.160156 2 -1.75 2.257813 -1.464844 C 2.507813 -1.179688 2.824219 -1.035156 3.199219 -1.039063 C 3.570313 -1.035156 3.878906 -1.179688 4.132813 -1.464844 C 4.382813 -1.75 4.511719 -2.164063 4.511719 -2.707031 C 4.511719 -3.230469 4.382813 -3.632813 4.132813 -3.921875 C 3.878906 -4.203125 3.570313 -4.347656 3.199219 -4.351563 C 2.824219 -4.347656 2.507813 -4.203125 2.257813 -3.921875 C 2 -3.632813 1.875 -3.226563 1.878906 -2.695313 Z M 1.878906 -2.695313 "
-           id="path12421" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph5-7">
-        <path
-           style="stroke:none;"
-           d="M 5.445313 -3.796875 L 4.039063 -3.542969 C 3.988281 -3.820313 3.882813 -4.03125 3.714844 -4.175781 C 3.546875 -4.316406 3.328125 -4.386719 3.066406 -4.390625 C 2.707031 -4.386719 2.425781 -4.265625 2.214844 -4.023438 C 2.003906 -3.777344 1.898438 -3.367188 1.898438 -2.792969 C 1.898438 -2.148438 2.003906 -1.695313 2.21875 -1.433594 C 2.433594 -1.167969 2.722656 -1.035156 3.085938 -1.039063 C 3.355469 -1.035156 3.578125 -1.113281 3.75 -1.269531 C 3.921875 -1.421875 4.042969 -1.6875 4.117188 -2.066406 L 5.515625 -1.828125 C 5.367188 -1.183594 5.089844 -0.695313 4.679688 -0.371094 C 4.265625 -0.0429688 3.710938 0.117188 3.019531 0.121094 C 2.226563 0.117188 1.597656 -0.128906 1.132813 -0.625 C 0.660156 -1.121094 0.425781 -1.808594 0.429688 -2.691406 C 0.425781 -3.574219 0.664063 -4.265625 1.136719 -4.765625 C 1.605469 -5.257813 2.242188 -5.507813 3.046875 -5.511719 C 3.703125 -5.507813 4.222656 -5.367188 4.609375 -5.085938 C 4.996094 -4.804688 5.273438 -4.375 5.445313 -3.796875 Z M 5.445313 -3.796875 "
-           id="path12424" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph5-8">
-        <path
-           style="stroke:none;"
-           d="M 3.867188 -1.714844 L 5.289063 -1.476563 C 5.105469 -0.953125 4.816406 -0.554688 4.421875 -0.285156 C 4.027344 -0.015625 3.535156 0.117188 2.945313 0.121094 C 2.003906 0.117188 1.308594 -0.1875 0.863281 -0.796875 C 0.503906 -1.285156 0.328125 -1.902344 0.328125 -2.65625 C 0.328125 -3.546875 0.558594 -4.246094 1.027344 -4.753906 C 1.492188 -5.253906 2.085938 -5.507813 2.800781 -5.511719 C 3.601563 -5.507813 4.234375 -5.242188 4.699219 -4.714844 C 5.160156 -4.183594 5.378906 -3.375 5.363281 -2.285156 L 1.792969 -2.285156 C 1.800781 -1.859375 1.914063 -1.527344 2.136719 -1.296875 C 2.351563 -1.058594 2.625 -0.941406 2.960938 -0.945313 C 3.179688 -0.941406 3.367188 -1.003906 3.519531 -1.125 C 3.671875 -1.246094 3.789063 -1.441406 3.867188 -1.714844 Z M 3.949219 -3.15625 C 3.9375 -3.566406 3.828125 -3.878906 3.628906 -4.097656 C 3.421875 -4.308594 3.175781 -4.417969 2.886719 -4.421875 C 2.574219 -4.417969 2.316406 -4.304688 2.117188 -4.082031 C 1.910156 -3.851563 1.8125 -3.542969 1.816406 -3.15625 Z M 3.949219 -3.15625 "
-           id="path12427" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph5-9">
-        <path
-           style="stroke:none;"
-           d="M 0.707031 -5.390625 L 2.035156 -5.390625 L 2.035156 -4.597656 C 2.203125 -4.867188 2.4375 -5.085938 2.734375 -5.257813 C 3.027344 -5.421875 3.355469 -5.507813 3.714844 -5.511719 C 4.339844 -5.507813 4.871094 -5.261719 5.308594 -4.773438 C 5.746094 -4.28125 5.964844 -3.597656 5.964844 -2.726563 C 5.964844 -1.820313 5.742188 -1.121094 5.304688 -0.625 C 4.859375 -0.128906 4.328125 0.117188 3.707031 0.121094 C 3.40625 0.117188 3.136719 0.0625 2.894531 -0.0546875 C 2.652344 -0.171875 2.398438 -0.375 2.132813 -0.664063 L 2.132813 2.050781 L 0.707031 2.050781 Z M 2.117188 -2.785156 C 2.113281 -2.175781 2.234375 -1.730469 2.476563 -1.445313 C 2.714844 -1.152344 3.007813 -1.007813 3.355469 -1.011719 C 3.683594 -1.007813 3.957031 -1.140625 4.179688 -1.410156 C 4.398438 -1.671875 4.511719 -2.105469 4.511719 -2.714844 C 4.511719 -3.277344 4.398438 -3.699219 4.171875 -3.972656 C 3.945313 -4.246094 3.664063 -4.382813 3.328125 -4.386719 C 2.980469 -4.382813 2.691406 -4.246094 2.460938 -3.980469 C 2.230469 -3.707031 2.113281 -3.308594 2.117188 -2.785156 Z M 2.117188 -2.785156 "
-           id="path12430" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph5-10">
-        <path
-           style="stroke:none;"
-           d="M 3.21875 -5.390625 L 3.21875 -4.253906 L 2.242188 -4.253906 L 2.242188 -2.082031 C 2.238281 -1.636719 2.246094 -1.382813 2.269531 -1.3125 C 2.285156 -1.238281 2.328125 -1.175781 2.398438 -1.132813 C 2.460938 -1.082031 2.542969 -1.058594 2.640625 -1.0625 C 2.769531 -1.058594 2.960938 -1.105469 3.214844 -1.199219 L 3.335938 -0.0898438 C 3 0.0507813 2.625 0.117188 2.207031 0.121094 C 1.945313 0.117188 1.714844 0.078125 1.511719 -0.0078125 C 1.304688 -0.0898438 1.152344 -0.203125 1.058594 -0.339844 C 0.960938 -0.476563 0.894531 -0.660156 0.859375 -0.898438 C 0.824219 -1.0625 0.808594 -1.398438 0.8125 -1.902344 L 0.8125 -4.253906 L 0.15625 -4.253906 L 0.15625 -5.390625 L 0.8125 -5.390625 L 0.8125 -6.460938 L 2.242188 -7.292969 L 2.242188 -5.390625 Z M 3.21875 -5.390625 "
-           id="path12433" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph5-11">
-        <path
-           style="stroke:none;"
-           d="M 0.242188 -1.539063 L 1.675781 -1.757813 C 1.734375 -1.476563 1.855469 -1.265625 2.042969 -1.125 C 2.226563 -0.980469 2.488281 -0.90625 2.828125 -0.910156 C 3.191406 -0.90625 3.46875 -0.976563 3.660156 -1.113281 C 3.78125 -1.207031 3.84375 -1.332031 3.847656 -1.492188 C 3.84375 -1.597656 3.808594 -1.6875 3.746094 -1.761719 C 3.671875 -1.828125 3.515625 -1.890625 3.269531 -1.949219 C 2.117188 -2.199219 1.386719 -2.429688 1.082031 -2.644531 C 0.652344 -2.933594 0.4375 -3.339844 0.441406 -3.859375 C 0.4375 -4.324219 0.621094 -4.714844 0.992188 -5.035156 C 1.359375 -5.347656 1.933594 -5.507813 2.710938 -5.511719 C 3.445313 -5.507813 3.992188 -5.386719 4.355469 -5.152344 C 4.710938 -4.910156 4.960938 -4.554688 5.097656 -4.085938 L 3.75 -3.835938 C 3.691406 -4.042969 3.582031 -4.203125 3.421875 -4.320313 C 3.261719 -4.429688 3.03125 -4.488281 2.734375 -4.488281 C 2.355469 -4.488281 2.085938 -4.433594 1.929688 -4.328125 C 1.816406 -4.253906 1.761719 -4.15625 1.765625 -4.039063 C 1.761719 -3.933594 1.8125 -3.847656 1.910156 -3.78125 C 2.035156 -3.683594 2.476563 -3.550781 3.238281 -3.378906 C 3.996094 -3.203125 4.527344 -2.992188 4.832031 -2.746094 C 5.125 -2.492188 5.273438 -2.136719 5.277344 -1.683594 C 5.273438 -1.1875 5.066406 -0.765625 4.65625 -0.410156 C 4.242188 -0.0585938 3.632813 0.117188 2.828125 0.121094 C 2.089844 0.117188 1.507813 -0.0273438 1.082031 -0.324219 C 0.652344 -0.621094 0.371094 -1.027344 0.242188 -1.539063 Z M 0.242188 -1.539063 "
-           id="path12436" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph5-12">
-        <path
-           style="stroke:none;"
-           d="M 0.757813 0 L 0.757813 -7.441406 L 6.273438 -7.441406 L 6.273438 -6.183594 L 2.257813 -6.183594 L 2.257813 -4.53125 L 5.996094 -4.53125 L 5.996094 -3.277344 L 2.257813 -3.277344 L 2.257813 -1.253906 L 6.414063 -1.253906 L 6.414063 0 Z M 0.757813 0 "
-           id="path12439" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph5-13">
-        <path
-           style="stroke:none;"
-           d="M 0.0625 0 L 2.003906 -2.777344 L 0.140625 -5.390625 L 1.882813 -5.390625 L 2.835938 -3.910156 L 3.84375 -5.390625 L 5.515625 -5.390625 L 3.691406 -2.835938 L 5.683594 0 L 3.933594 0 L 2.835938 -1.667969 L 1.730469 0 Z M 0.0625 0 "
-           id="path12442" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph5-14">
-        <path
-           style="stroke:none;"
-           d="M 4.292969 0 L 4.292969 -0.808594 C 4.09375 -0.519531 3.835938 -0.289063 3.519531 -0.125 C 3.195313 0.0390625 2.859375 0.117188 2.507813 0.121094 C 2.144531 0.117188 1.816406 0.0390625 1.53125 -0.117188 C 1.242188 -0.273438 1.035156 -0.496094 0.90625 -0.785156 C 0.777344 -1.074219 0.714844 -1.472656 0.714844 -1.980469 L 0.714844 -5.390625 L 2.140625 -5.390625 L 2.140625 -2.914063 C 2.136719 -2.152344 2.164063 -1.6875 2.21875 -1.519531 C 2.269531 -1.347656 2.367188 -1.214844 2.507813 -1.113281 C 2.644531 -1.011719 2.820313 -0.960938 3.035156 -0.964844 C 3.277344 -0.960938 3.496094 -1.027344 3.691406 -1.164063 C 3.882813 -1.296875 4.011719 -1.464844 4.085938 -1.664063 C 4.152344 -1.859375 4.1875 -2.34375 4.191406 -3.117188 L 4.191406 -5.390625 L 5.617188 -5.390625 L 5.617188 0 Z M 4.292969 0 "
-           id="path12445" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph5-15">
-        <path
-           style="stroke:none;"
-           d="M 0.75 -7.441406 L 3.496094 -7.441406 C 4.113281 -7.441406 4.585938 -7.394531 4.914063 -7.300781 C 5.347656 -7.167969 5.71875 -6.9375 6.03125 -6.613281 C 6.339844 -6.28125 6.578125 -5.878906 6.742188 -5.40625 C 6.902344 -4.929688 6.984375 -4.34375 6.988281 -3.648438 C 6.984375 -3.035156 6.910156 -2.507813 6.761719 -2.066406 C 6.574219 -1.523438 6.308594 -1.085938 5.964844 -0.75 C 5.703125 -0.496094 5.347656 -0.296875 4.90625 -0.15625 C 4.570313 -0.0507813 4.128906 0 3.578125 0 L 0.75 0 Z M 2.253906 -6.183594 L 2.253906 -1.253906 L 3.375 -1.253906 C 3.792969 -1.253906 4.09375 -1.277344 4.285156 -1.324219 C 4.523438 -1.386719 4.726563 -1.488281 4.890625 -1.632813 C 5.050781 -1.777344 5.183594 -2.015625 5.285156 -2.351563 C 5.386719 -2.679688 5.4375 -3.136719 5.4375 -3.714844 C 5.4375 -4.289063 5.386719 -4.730469 5.285156 -5.039063 C 5.183594 -5.347656 5.039063 -5.585938 4.855469 -5.761719 C 4.671875 -5.929688 4.441406 -6.046875 4.160156 -6.109375 C 3.949219 -6.15625 3.539063 -6.179688 2.929688 -6.183594 Z M 2.253906 -6.183594 "
-           id="path12448" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph5-16">
-        <path
-           style="stroke:none;"
-           d="M 0.640625 -5.390625 L 1.953125 -5.390625 L 1.953125 -4.65625 C 2.421875 -5.222656 2.984375 -5.507813 3.632813 -5.511719 C 3.976563 -5.507813 4.273438 -5.4375 4.53125 -5.296875 C 4.78125 -5.15625 4.992188 -4.941406 5.15625 -4.65625 C 5.390625 -4.941406 5.644531 -5.15625 5.921875 -5.296875 C 6.195313 -5.4375 6.488281 -5.507813 6.800781 -5.511719 C 7.195313 -5.507813 7.53125 -5.425781 7.804688 -5.269531 C 8.078125 -5.105469 8.28125 -4.871094 8.421875 -4.5625 C 8.515625 -4.332031 8.566406 -3.960938 8.566406 -3.445313 L 8.566406 0 L 7.140625 0 L 7.140625 -3.082031 C 7.136719 -3.613281 7.089844 -3.957031 6.996094 -4.117188 C 6.863281 -4.316406 6.660156 -4.417969 6.386719 -4.421875 C 6.183594 -4.417969 5.992188 -4.355469 5.820313 -4.238281 C 5.640625 -4.113281 5.515625 -3.9375 5.441406 -3.703125 C 5.359375 -3.46875 5.320313 -3.097656 5.324219 -2.589844 L 5.324219 0 L 3.898438 0 L 3.898438 -2.953125 C 3.894531 -3.476563 3.871094 -3.8125 3.820313 -3.96875 C 3.769531 -4.117188 3.691406 -4.230469 3.585938 -4.308594 C 3.476563 -4.378906 3.332031 -4.417969 3.152344 -4.421875 C 2.929688 -4.417969 2.730469 -4.359375 2.554688 -4.242188 C 2.378906 -4.125 2.253906 -3.953125 2.179688 -3.730469 C 2.101563 -3.503906 2.0625 -3.132813 2.066406 -2.617188 L 2.066406 0 L 0.640625 0 Z M 0.640625 -5.390625 "
-           id="path12451" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph5-17">
-        <path
-           style="stroke:none;"
-           d="M 5.691406 0 L 4.363281 0 L 4.363281 -0.792969 C 4.140625 -0.480469 3.882813 -0.25 3.585938 -0.101563 C 3.285156 0.046875 2.980469 0.117188 2.679688 0.121094 C 2.054688 0.117188 1.523438 -0.128906 1.085938 -0.625 C 0.640625 -1.121094 0.421875 -1.816406 0.425781 -2.714844 C 0.421875 -3.625 0.636719 -4.320313 1.070313 -4.796875 C 1.496094 -5.269531 2.039063 -5.507813 2.699219 -5.511719 C 3.300781 -5.507813 3.820313 -5.257813 4.261719 -4.761719 L 4.261719 -7.441406 L 5.691406 -7.441406 Z M 1.882813 -2.8125 C 1.878906 -2.234375 1.960938 -1.816406 2.121094 -1.5625 C 2.347656 -1.1875 2.667969 -1 3.085938 -1.003906 C 3.410156 -1 3.6875 -1.140625 3.921875 -1.421875 C 4.148438 -1.699219 4.265625 -2.117188 4.269531 -2.675781 C 4.265625 -3.292969 4.152344 -3.738281 3.933594 -4.011719 C 3.707031 -4.28125 3.421875 -4.417969 3.074219 -4.421875 C 2.734375 -4.417969 2.449219 -4.28125 2.222656 -4.015625 C 1.992188 -3.742188 1.878906 -3.34375 1.882813 -2.8125 Z M 1.882813 -2.8125 "
-           id="path12454" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph5-18">
-        <path
-           style="stroke:none;"
-           d="M 0.746094 -7.441406 L 2.25 -7.441406 L 2.25 -3.410156 C 2.25 -2.769531 2.265625 -2.355469 2.304688 -2.167969 C 2.367188 -1.863281 2.519531 -1.617188 2.761719 -1.433594 C 3 -1.246094 3.332031 -1.152344 3.757813 -1.15625 C 4.179688 -1.152344 4.5 -1.242188 4.71875 -1.417969 C 4.933594 -1.589844 5.066406 -1.800781 5.109375 -2.058594 C 5.152344 -2.308594 5.171875 -2.730469 5.175781 -3.324219 L 5.175781 -7.441406 L 6.679688 -7.441406 L 6.679688 -3.53125 C 6.679688 -2.636719 6.636719 -2.003906 6.558594 -1.636719 C 6.472656 -1.265625 6.324219 -0.957031 6.109375 -0.703125 C 5.886719 -0.449219 5.59375 -0.246094 5.230469 -0.0976563 C 4.863281 0.0507813 4.386719 0.121094 3.800781 0.125 C 3.085938 0.121094 2.546875 0.0429688 2.183594 -0.121094 C 1.8125 -0.28125 1.523438 -0.492188 1.3125 -0.757813 C 1.101563 -1.015625 0.960938 -1.292969 0.894531 -1.582031 C 0.792969 -2.007813 0.742188 -2.636719 0.746094 -3.472656 Z M 0.746094 -7.441406 "
-           id="path12457" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph5-19">
-        <path
-           style="stroke:none;"
-           d="M 2.113281 0 L 0.683594 0 L 0.683594 -5.390625 L 2.011719 -5.390625 L 2.011719 -4.625 C 2.234375 -4.984375 2.4375 -5.222656 2.621094 -5.339844 C 2.800781 -5.449219 3.007813 -5.507813 3.238281 -5.511719 C 3.5625 -5.507813 3.875 -5.417969 4.175781 -5.242188 L 3.734375 -4 C 3.492188 -4.152344 3.269531 -4.230469 3.066406 -4.234375 C 2.863281 -4.230469 2.695313 -4.175781 2.558594 -4.066406 C 2.417969 -3.957031 2.308594 -3.757813 2.230469 -3.46875 C 2.148438 -3.179688 2.109375 -2.578125 2.113281 -1.664063 Z M 2.113281 0 "
-           id="path12460" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph6-0">
-        <path
-           style="stroke:none;"
-           d="M 6.6875 0 L 5.222656 0 L 4.640625 -1.511719 L 1.976563 -1.511719 L 1.425781 0 L 0 0 L 2.59375 -6.664063 L 4.019531 -6.664063 Z M 4.207031 -2.636719 L 3.289063 -5.109375 L 2.390625 -2.636719 Z M 4.207031 -2.636719 "
-           id="path12463" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph6-1">
-        <path
-           style="stroke:none;"
-           d="M 4.875 -3.398438 L 3.617188 -3.171875 C 3.570313 -3.421875 3.476563 -3.613281 3.328125 -3.738281 C 3.179688 -3.863281 2.984375 -3.925781 2.746094 -3.929688 C 2.425781 -3.925781 2.171875 -3.816406 1.984375 -3.601563 C 1.792969 -3.378906 1.699219 -3.011719 1.699219 -2.5 C 1.699219 -1.925781 1.792969 -1.519531 1.988281 -1.285156 C 2.175781 -1.046875 2.433594 -0.929688 2.761719 -0.933594 C 3.003906 -0.929688 3.203125 -1 3.359375 -1.136719 C 3.507813 -1.273438 3.617188 -1.511719 3.6875 -1.851563 L 4.941406 -1.636719 C 4.808594 -1.058594 4.558594 -0.625 4.191406 -0.332031 C 3.816406 -0.0390625 3.320313 0.105469 2.703125 0.109375 C 1.992188 0.105469 1.429688 -0.113281 1.015625 -0.558594 C 0.59375 -1 0.386719 -1.617188 0.386719 -2.410156 C 0.386719 -3.203125 0.59375 -3.824219 1.015625 -4.269531 C 1.433594 -4.710938 2.003906 -4.933594 2.726563 -4.9375 C 3.3125 -4.933594 3.78125 -4.804688 4.128906 -4.554688 C 4.472656 -4.296875 4.722656 -3.914063 4.875 -3.398438 Z M 4.875 -3.398438 "
-           id="path12466" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph6-2">
-        <path
-           style="stroke:none;"
-           d="M 3.464844 -1.535156 L 4.734375 -1.324219 C 4.570313 -0.855469 4.3125 -0.496094 3.960938 -0.253906 C 3.605469 -0.0117188 3.164063 0.105469 2.636719 0.109375 C 1.796875 0.105469 1.175781 -0.167969 0.773438 -0.714844 C 0.453125 -1.152344 0.292969 -1.707031 0.296875 -2.378906 C 0.292969 -3.175781 0.5 -3.800781 0.921875 -4.257813 C 1.335938 -4.707031 1.867188 -4.933594 2.507813 -4.9375 C 3.226563 -4.933594 3.792969 -4.695313 4.207031 -4.222656 C 4.621094 -3.746094 4.820313 -3.019531 4.804688 -2.046875 L 1.605469 -2.046875 C 1.613281 -1.664063 1.714844 -1.371094 1.910156 -1.160156 C 2.105469 -0.949219 2.351563 -0.84375 2.648438 -0.84375 C 2.847656 -0.84375 3.015625 -0.898438 3.152344 -1.007813 C 3.289063 -1.117188 3.390625 -1.292969 3.464844 -1.535156 Z M 3.535156 -2.828125 C 3.523438 -3.195313 3.429688 -3.476563 3.25 -3.667969 C 3.066406 -3.859375 2.84375 -3.953125 2.585938 -3.957031 C 2.304688 -3.953125 2.074219 -3.851563 1.894531 -3.652344 C 1.707031 -3.449219 1.621094 -3.175781 1.628906 -2.828125 Z M 3.535156 -2.828125 "
-           id="path12469" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph6-3">
-        <path
-           style="stroke:none;"
-           d="M 0.667969 0 L 0.667969 -6.664063 L 1.945313 -6.664063 L 1.945313 0 Z M 0.667969 0 "
-           id="path12472" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph6-4">
-        <path
-           style="stroke:none;"
-           d="M 1.890625 0 L 0.613281 0 L 0.613281 -4.828125 L 1.800781 -4.828125 L 1.800781 -4.140625 C 2 -4.464844 2.183594 -4.679688 2.347656 -4.78125 C 2.507813 -4.882813 2.691406 -4.933594 2.898438 -4.9375 C 3.1875 -4.933594 3.46875 -4.851563 3.742188 -4.695313 L 3.34375 -3.582031 C 3.128906 -3.714844 2.929688 -3.785156 2.746094 -3.789063 C 2.5625 -3.785156 2.410156 -3.734375 2.289063 -3.640625 C 2.164063 -3.539063 2.066406 -3.363281 1.996094 -3.109375 C 1.921875 -2.847656 1.886719 -2.308594 1.890625 -1.492188 Z M 1.890625 0 "
-           id="path12475" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph6-5">
-        <path
-           style="stroke:none;"
-           d="M 1.621094 -3.355469 L 0.464844 -3.5625 C 0.589844 -4.027344 0.816406 -4.371094 1.136719 -4.597656 C 1.453125 -4.820313 1.925781 -4.933594 2.554688 -4.9375 C 3.125 -4.933594 3.546875 -4.867188 3.828125 -4.734375 C 4.101563 -4.597656 4.296875 -4.425781 4.414063 -4.21875 C 4.523438 -4.011719 4.582031 -3.628906 4.585938 -3.078125 L 4.574219 -1.585938 C 4.570313 -1.160156 4.589844 -0.847656 4.632813 -0.644531 C 4.671875 -0.441406 4.75 -0.226563 4.863281 0 L 3.601563 0 C 3.566406 -0.0859375 3.523438 -0.210938 3.476563 -0.378906 C 3.453125 -0.453125 3.4375 -0.503906 3.429688 -0.527344 C 3.210938 -0.316406 2.980469 -0.15625 2.730469 -0.0507813 C 2.480469 0.0546875 2.214844 0.105469 1.9375 0.109375 C 1.4375 0.105469 1.046875 -0.0273438 0.761719 -0.292969 C 0.472656 -0.5625 0.328125 -0.902344 0.332031 -1.316406 C 0.328125 -1.585938 0.394531 -1.828125 0.527344 -2.046875 C 0.65625 -2.257813 0.835938 -2.421875 1.074219 -2.535156 C 1.304688 -2.648438 1.644531 -2.75 2.089844 -2.835938 C 2.683594 -2.945313 3.097656 -3.046875 3.332031 -3.148438 L 3.332031 -3.277344 C 3.328125 -3.519531 3.265625 -3.695313 3.148438 -3.800781 C 3.023438 -3.902344 2.796875 -3.953125 2.464844 -3.957031 C 2.234375 -3.953125 2.054688 -3.910156 1.929688 -3.824219 C 1.800781 -3.734375 1.699219 -3.578125 1.621094 -3.355469 Z M 3.332031 -2.316406 C 3.164063 -2.257813 2.90625 -2.195313 2.554688 -2.121094 C 2.199219 -2.042969 1.964844 -1.96875 1.859375 -1.898438 C 1.6875 -1.78125 1.605469 -1.628906 1.609375 -1.449219 C 1.605469 -1.269531 1.671875 -1.117188 1.808594 -0.984375 C 1.9375 -0.851563 2.105469 -0.785156 2.316406 -0.789063 C 2.542969 -0.785156 2.765625 -0.863281 2.976563 -1.019531 C 3.128906 -1.132813 3.230469 -1.273438 3.28125 -1.441406 C 3.3125 -1.550781 3.328125 -1.757813 3.332031 -2.0625 Z M 3.332031 -2.316406 "
-           id="path12478" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph6-6">
-        <path
-           style="stroke:none;"
-           d="M 2.882813 -4.828125 L 2.882813 -3.808594 L 2.007813 -3.808594 L 2.007813 -1.863281 C 2.003906 -1.46875 2.011719 -1.238281 2.03125 -1.175781 C 2.046875 -1.109375 2.085938 -1.054688 2.144531 -1.011719 C 2.203125 -0.96875 2.273438 -0.949219 2.363281 -0.949219 C 2.476563 -0.949219 2.648438 -0.988281 2.878906 -1.074219 L 2.984375 -0.0820313 C 2.683594 0.0429688 2.347656 0.105469 1.976563 0.109375 C 1.742188 0.105469 1.535156 0.0703125 1.355469 -0.00390625 C 1.167969 -0.078125 1.03125 -0.179688 0.945313 -0.304688 C 0.855469 -0.429688 0.796875 -0.59375 0.769531 -0.804688 C 0.738281 -0.949219 0.726563 -1.25 0.726563 -1.703125 L 0.726563 -3.808594 L 0.140625 -3.808594 L 0.140625 -4.828125 L 0.726563 -4.828125 L 0.726563 -5.785156 L 2.007813 -6.53125 L 2.007813 -4.828125 Z M 2.882813 -4.828125 "
-           id="path12481" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph6-7">
-        <path
-           style="stroke:none;"
-           d="M 0.371094 -2.480469 C 0.367188 -2.902344 0.472656 -3.3125 0.683594 -3.710938 C 0.890625 -4.105469 1.1875 -4.410156 1.574219 -4.621094 C 1.953125 -4.828125 2.382813 -4.933594 2.859375 -4.9375 C 3.585938 -4.933594 4.183594 -4.695313 4.652344 -4.222656 C 5.121094 -3.746094 5.355469 -3.148438 5.355469 -2.425781 C 5.355469 -1.695313 5.117188 -1.089844 4.648438 -0.609375 C 4.171875 -0.132813 3.578125 0.105469 2.867188 0.109375 C 2.421875 0.105469 2 0.0078125 1.601563 -0.191406 C 1.195313 -0.390625 0.890625 -0.683594 0.683594 -1.070313 C 0.472656 -1.453125 0.367188 -1.921875 0.371094 -2.480469 Z M 1.683594 -2.414063 C 1.679688 -1.933594 1.792969 -1.566406 2.023438 -1.3125 C 2.246094 -1.058594 2.527344 -0.929688 2.863281 -0.933594 C 3.195313 -0.929688 3.472656 -1.058594 3.699219 -1.3125 C 3.921875 -1.566406 4.035156 -1.933594 4.039063 -2.421875 C 4.035156 -2.890625 3.921875 -3.253906 3.699219 -3.511719 C 3.472656 -3.761719 3.195313 -3.890625 2.863281 -3.894531 C 2.527344 -3.890625 2.246094 -3.761719 2.023438 -3.511719 C 1.792969 -3.253906 1.679688 -2.890625 1.683594 -2.414063 Z M 1.683594 -2.414063 "
-           id="path12484" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph7-0">
-        <path
-           style="stroke:none;"
-           d="M 0.675781 -6.585938 L 3.308594 -6.585938 C 3.828125 -6.585938 4.214844 -6.5625 4.472656 -6.519531 C 4.722656 -6.476563 4.953125 -6.386719 5.160156 -6.25 C 5.359375 -6.109375 5.527344 -5.925781 5.664063 -5.699219 C 5.796875 -5.46875 5.863281 -5.210938 5.867188 -4.929688 C 5.863281 -4.617188 5.78125 -4.335938 5.617188 -4.078125 C 5.449219 -3.820313 5.222656 -3.625 4.941406 -3.5 C 5.339844 -3.378906 5.648438 -3.179688 5.867188 -2.902344 C 6.078125 -2.617188 6.1875 -2.289063 6.191406 -1.910156 C 6.1875 -1.609375 6.117188 -1.316406 5.980469 -1.035156 C 5.839844 -0.75 5.648438 -0.523438 5.410156 -0.355469 C 5.167969 -0.183594 4.871094 -0.0820313 4.519531 -0.046875 C 4.292969 -0.0195313 3.757813 -0.00390625 2.914063 0 L 0.675781 0 Z M 2.003906 -5.492188 L 2.003906 -3.96875 L 2.875 -3.96875 C 3.390625 -3.964844 3.710938 -3.972656 3.839844 -3.988281 C 4.066406 -4.015625 4.246094 -4.09375 4.378906 -4.226563 C 4.503906 -4.351563 4.570313 -4.523438 4.574219 -4.738281 C 4.570313 -4.941406 4.515625 -5.105469 4.40625 -5.234375 C 4.292969 -5.359375 4.125 -5.4375 3.90625 -5.46875 C 3.769531 -5.480469 3.390625 -5.488281 2.769531 -5.492188 Z M 2.003906 -2.871094 L 2.003906 -1.109375 L 3.234375 -1.109375 C 3.710938 -1.105469 4.015625 -1.121094 4.148438 -1.148438 C 4.34375 -1.183594 4.503906 -1.269531 4.628906 -1.410156 C 4.753906 -1.546875 4.816406 -1.734375 4.816406 -1.972656 C 4.816406 -2.167969 4.765625 -2.335938 4.671875 -2.476563 C 4.570313 -2.613281 4.433594 -2.714844 4.253906 -2.777344 C 4.074219 -2.839844 3.679688 -2.871094 3.078125 -2.871094 Z M 2.003906 -2.871094 "
-           id="path12487" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph7-1">
-        <path
-           style="stroke:none;"
-           d="M 3.800781 0 L 3.800781 -0.714844 C 3.625 -0.457031 3.394531 -0.253906 3.113281 -0.109375 C 2.828125 0.0351563 2.53125 0.105469 2.21875 0.109375 C 1.894531 0.105469 1.605469 0.0390625 1.355469 -0.101563 C 1.097656 -0.242188 0.914063 -0.441406 0.804688 -0.695313 C 0.6875 -0.949219 0.632813 -1.300781 0.632813 -1.753906 L 0.632813 -4.773438 L 1.894531 -4.773438 L 1.894531 -2.578125 C 1.890625 -1.90625 1.914063 -1.492188 1.964844 -1.34375 C 2.007813 -1.1875 2.09375 -1.070313 2.21875 -0.984375 C 2.339844 -0.898438 2.496094 -0.855469 2.6875 -0.855469 C 2.902344 -0.855469 3.09375 -0.914063 3.265625 -1.03125 C 3.433594 -1.148438 3.550781 -1.292969 3.617188 -1.46875 C 3.675781 -1.644531 3.707031 -2.074219 3.710938 -2.757813 L 3.710938 -4.773438 L 4.972656 -4.773438 L 4.972656 0 Z M 3.800781 0 "
-           id="path12490" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph7-2">
-        <path
-           style="stroke:none;"
-           d="M 0.109375 -4.773438 L 0.808594 -4.773438 L 0.808594 -5.132813 C 0.808594 -5.53125 0.851563 -5.828125 0.9375 -6.027344 C 1.023438 -6.222656 1.179688 -6.382813 1.410156 -6.511719 C 1.636719 -6.632813 1.925781 -6.695313 2.277344 -6.699219 C 2.632813 -6.695313 2.984375 -6.644531 3.332031 -6.539063 L 3.164063 -5.65625 C 2.960938 -5.703125 2.765625 -5.730469 2.582031 -5.730469 C 2.398438 -5.730469 2.265625 -5.6875 2.1875 -5.601563 C 2.105469 -5.515625 2.066406 -5.351563 2.070313 -5.109375 L 2.070313 -4.773438 L 3.015625 -4.773438 L 3.015625 -3.777344 L 2.070313 -3.777344 L 2.070313 0 L 0.808594 0 L 0.808594 -3.777344 L 0.109375 -3.777344 Z M 0.109375 -4.773438 "
-           id="path12493" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph7-3">
-        <path
-           style="stroke:none;"
-           d="M 3.421875 -1.519531 L 4.683594 -1.308594 C 4.515625 -0.84375 4.261719 -0.492188 3.914063 -0.253906 C 3.5625 -0.0117188 3.125 0.105469 2.605469 0.109375 C 1.773438 0.105469 1.160156 -0.164063 0.765625 -0.707031 C 0.449219 -1.136719 0.292969 -1.683594 0.292969 -2.351563 C 0.292969 -3.136719 0.496094 -3.757813 0.910156 -4.207031 C 1.320313 -4.652344 1.84375 -4.875 2.480469 -4.878906 C 3.1875 -4.875 3.75 -4.640625 4.160156 -4.175781 C 4.570313 -3.703125 4.765625 -2.984375 4.75 -2.023438 L 1.585938 -2.023438 C 1.589844 -1.644531 1.691406 -1.355469 1.890625 -1.148438 C 2.082031 -0.9375 2.324219 -0.832031 2.621094 -0.835938 C 2.8125 -0.832031 2.980469 -0.886719 3.117188 -0.996094 C 3.25 -1.105469 3.351563 -1.277344 3.421875 -1.519531 Z M 3.496094 -2.792969 C 3.484375 -3.15625 3.390625 -3.433594 3.210938 -3.628906 C 3.03125 -3.816406 2.8125 -3.914063 2.554688 -3.914063 C 2.28125 -3.914063 2.054688 -3.8125 1.875 -3.613281 C 1.695313 -3.410156 1.605469 -3.136719 1.609375 -2.792969 Z M 3.496094 -2.792969 "
-           id="path12496" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph7-4">
-        <path
-           style="stroke:none;"
-           d="M 1.867188 0 L 0.605469 0 L 0.605469 -4.773438 L 1.777344 -4.773438 L 1.777344 -4.09375 C 1.976563 -4.410156 2.160156 -4.621094 2.320313 -4.726563 C 2.480469 -4.824219 2.660156 -4.875 2.867188 -4.878906 C 3.148438 -4.875 3.425781 -4.796875 3.699219 -4.640625 L 3.308594 -3.539063 C 3.09375 -3.675781 2.894531 -3.742188 2.714844 -3.746094 C 2.535156 -3.742188 2.386719 -3.695313 2.265625 -3.601563 C 2.140625 -3.503906 2.042969 -3.328125 1.972656 -3.074219 C 1.898438 -2.816406 1.863281 -2.28125 1.867188 -1.472656 Z M 1.867188 0 "
-           id="path12499" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph8-0">
-        <path
-           style="stroke:none;"
-           d="M 3.894531 -1.546875 L 4.738281 -1.441406 C 4.601563 -0.949219 4.355469 -0.566406 4 -0.296875 C 3.636719 -0.0273438 3.179688 0.105469 2.625 0.109375 C 1.921875 0.105469 1.363281 -0.109375 0.957031 -0.539063 C 0.542969 -0.96875 0.339844 -1.574219 0.339844 -2.359375 C 0.339844 -3.164063 0.546875 -3.789063 0.960938 -4.238281 C 1.375 -4.683594 1.914063 -4.910156 2.582031 -4.910156 C 3.21875 -4.910156 3.742188 -4.691406 4.152344 -4.253906 C 4.554688 -3.816406 4.757813 -3.199219 4.761719 -2.410156 C 4.757813 -2.355469 4.757813 -2.285156 4.757813 -2.191406 L 1.179688 -2.191406 C 1.207031 -1.664063 1.355469 -1.257813 1.625 -0.980469 C 1.890625 -0.695313 2.226563 -0.554688 2.628906 -0.558594 C 2.925781 -0.554688 3.179688 -0.632813 3.394531 -0.792969 C 3.601563 -0.949219 3.769531 -1.203125 3.894531 -1.546875 Z M 1.226563 -2.859375 L 3.90625 -2.859375 C 3.867188 -3.261719 3.765625 -3.5625 3.597656 -3.769531 C 3.335938 -4.078125 3 -4.234375 2.589844 -4.238281 C 2.214844 -4.234375 1.898438 -4.109375 1.644531 -3.863281 C 1.390625 -3.609375 1.25 -3.277344 1.226563 -2.859375 Z M 1.226563 -2.859375 "
-           id="path12502" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph8-1">
-        <path
-           style="stroke:none;"
-           d="M 0.609375 0 L 0.609375 -4.800781 L 1.34375 -4.800781 L 1.34375 -4.117188 C 1.691406 -4.644531 2.203125 -4.910156 2.871094 -4.910156 C 3.160156 -4.910156 3.425781 -4.855469 3.667969 -4.753906 C 3.910156 -4.644531 4.089844 -4.507813 4.210938 -4.34375 C 4.332031 -4.171875 4.417969 -3.972656 4.464844 -3.742188 C 4.496094 -3.589844 4.511719 -3.328125 4.511719 -2.953125 L 4.511719 0 L 3.695313 0 L 3.695313 -2.917969 C 3.691406 -3.25 3.660156 -3.496094 3.601563 -3.664063 C 3.535156 -3.824219 3.421875 -3.957031 3.265625 -4.054688 C 3.101563 -4.152344 2.914063 -4.199219 2.699219 -4.203125 C 2.351563 -4.199219 2.050781 -4.089844 1.800781 -3.871094 C 1.542969 -3.648438 1.417969 -3.230469 1.421875 -2.621094 L 1.421875 0 Z M 0.609375 0 "
-           id="path12505" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph8-2">
-        <path
-           style="stroke:none;"
-           d="M 3.753906 0 L 3.753906 -0.703125 C 3.378906 -0.164063 2.871094 0.105469 2.234375 0.109375 C 1.945313 0.105469 1.679688 0.0546875 1.4375 -0.0546875 C 1.1875 -0.160156 1.007813 -0.296875 0.890625 -0.464844 C 0.773438 -0.625 0.6875 -0.828125 0.640625 -1.066406 C 0.605469 -1.222656 0.589844 -1.472656 0.59375 -1.824219 L 0.59375 -4.800781 L 1.40625 -4.800781 L 1.40625 -2.136719 C 1.40625 -1.710938 1.421875 -1.425781 1.457031 -1.277344 C 1.503906 -1.0625 1.609375 -0.894531 1.777344 -0.773438 C 1.941406 -0.652344 2.148438 -0.589844 2.394531 -0.59375 C 2.636719 -0.589844 2.867188 -0.652344 3.082031 -0.78125 C 3.292969 -0.902344 3.441406 -1.074219 3.535156 -1.289063 C 3.621094 -1.503906 3.667969 -1.816406 3.667969 -2.226563 L 3.667969 -4.800781 L 4.484375 -4.800781 L 4.484375 0 Z M 3.753906 0 "
-           id="path12508" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph8-3">
-        <path
-           style="stroke:none;"
-           d="M 0.609375 0 L 0.609375 -4.800781 L 1.335938 -4.800781 L 1.335938 -4.125 C 1.484375 -4.359375 1.683594 -4.546875 1.9375 -4.695313 C 2.183594 -4.835938 2.46875 -4.910156 2.792969 -4.910156 C 3.148438 -4.910156 3.441406 -4.835938 3.667969 -4.6875 C 3.894531 -4.539063 4.054688 -4.332031 4.148438 -4.066406 C 4.527344 -4.628906 5.019531 -4.910156 5.632813 -4.910156 C 6.101563 -4.910156 6.46875 -4.777344 6.726563 -4.511719 C 6.984375 -4.246094 7.113281 -3.839844 7.113281 -3.292969 L 7.113281 0 L 6.304688 0 L 6.304688 -3.023438 C 6.304688 -3.347656 6.277344 -3.582031 6.226563 -3.726563 C 6.171875 -3.867188 6.074219 -3.980469 5.9375 -4.070313 C 5.796875 -4.15625 5.636719 -4.199219 5.449219 -4.203125 C 5.113281 -4.199219 4.832031 -4.089844 4.609375 -3.867188 C 4.386719 -3.640625 4.277344 -3.28125 4.277344 -2.789063 L 4.277344 0 L 3.460938 0 L 3.460938 -3.117188 C 3.457031 -3.476563 3.390625 -3.746094 3.261719 -3.929688 C 3.125 -4.109375 2.910156 -4.199219 2.613281 -4.203125 C 2.382813 -4.199219 2.167969 -4.140625 1.976563 -4.019531 C 1.78125 -3.898438 1.640625 -3.722656 1.554688 -3.492188 C 1.460938 -3.257813 1.417969 -2.921875 1.421875 -2.492188 L 1.421875 0 Z M 0.609375 0 "
-           id="path12511" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph8-4">
-        <path
-           style="stroke:none;"
-           d="M 0.601563 0 L 0.601563 -4.800781 L 1.332031 -4.800781 L 1.332031 -4.070313 C 1.515625 -4.410156 1.691406 -4.636719 1.851563 -4.746094 C 2.007813 -4.855469 2.179688 -4.910156 2.371094 -4.910156 C 2.644531 -4.910156 2.921875 -4.820313 3.207031 -4.644531 L 2.929688 -3.890625 C 2.726563 -4.007813 2.527344 -4.066406 2.332031 -4.066406 C 2.152344 -4.066406 1.992188 -4.011719 1.851563 -3.90625 C 1.710938 -3.796875 1.609375 -3.648438 1.550781 -3.460938 C 1.457031 -3.167969 1.410156 -2.851563 1.414063 -2.511719 L 1.414063 0 Z M 0.601563 0 "
-           id="path12514" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph8-5">
-        <path
-           style="stroke:none;"
-           d="M 3.742188 -0.59375 C 3.4375 -0.332031 3.148438 -0.152344 2.871094 -0.046875 C 2.589844 0.0546875 2.289063 0.105469 1.976563 0.109375 C 1.445313 0.105469 1.039063 -0.0195313 0.757813 -0.277344 C 0.472656 -0.53125 0.332031 -0.863281 0.335938 -1.265625 C 0.332031 -1.5 0.386719 -1.710938 0.496094 -1.90625 C 0.601563 -2.097656 0.742188 -2.253906 0.914063 -2.375 C 1.085938 -2.492188 1.28125 -2.582031 1.5 -2.644531 C 1.65625 -2.683594 1.894531 -2.726563 2.222656 -2.765625 C 2.875 -2.84375 3.359375 -2.9375 3.675781 -3.046875 C 3.675781 -3.15625 3.679688 -3.226563 3.679688 -3.257813 C 3.679688 -3.585938 3.601563 -3.820313 3.449219 -3.960938 C 3.238281 -4.140625 2.929688 -4.230469 2.523438 -4.234375 C 2.136719 -4.230469 1.855469 -4.164063 1.675781 -4.03125 C 1.492188 -3.898438 1.355469 -3.660156 1.269531 -3.320313 L 0.472656 -3.429688 C 0.542969 -3.765625 0.664063 -4.042969 0.832031 -4.253906 C 0.996094 -4.464844 1.234375 -4.625 1.550781 -4.738281 C 1.859375 -4.851563 2.222656 -4.910156 2.640625 -4.910156 C 3.046875 -4.910156 3.378906 -4.859375 3.636719 -4.761719 C 3.890625 -4.664063 4.078125 -4.542969 4.203125 -4.398438 C 4.320313 -4.25 4.40625 -4.066406 4.457031 -3.847656 C 4.480469 -3.703125 4.492188 -3.453125 4.496094 -3.097656 L 4.496094 -2.011719 C 4.492188 -1.25 4.511719 -0.773438 4.546875 -0.574219 C 4.582031 -0.375 4.648438 -0.183594 4.753906 0 L 3.90625 0 C 3.816406 -0.167969 3.761719 -0.363281 3.742188 -0.59375 Z M 3.675781 -2.410156 C 3.375 -2.285156 2.933594 -2.183594 2.34375 -2.101563 C 2.007813 -2.050781 1.769531 -1.996094 1.636719 -1.9375 C 1.496094 -1.875 1.390625 -1.785156 1.316406 -1.671875 C 1.238281 -1.554688 1.199219 -1.429688 1.203125 -1.289063 C 1.199219 -1.070313 1.28125 -0.886719 1.449219 -0.742188 C 1.609375 -0.597656 1.851563 -0.527344 2.167969 -0.527344 C 2.480469 -0.527344 2.757813 -0.59375 3.003906 -0.734375 C 3.246094 -0.867188 3.425781 -1.054688 3.542969 -1.296875 C 3.628906 -1.476563 3.671875 -1.75 3.675781 -2.109375 Z M 
3.675781 -2.410156 "
-           id="path12517" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph8-6">
-        <path
-           style="stroke:none;"
-           d="M 2.386719 -0.726563 L 2.503906 -0.0078125 C 2.269531 0.0390625 2.066406 0.0585938 1.890625 0.0625 C 1.589844 0.0585938 1.359375 0.015625 1.199219 -0.078125 C 1.035156 -0.167969 0.921875 -0.292969 0.855469 -0.445313 C 0.789063 -0.59375 0.753906 -0.914063 0.757813 -1.40625 L 0.757813 -4.167969 L 0.164063 -4.167969 L 0.164063 -4.800781 L 0.757813 -4.800781 L 0.757813 -5.988281 L 1.566406 -6.476563 L 1.566406 -4.800781 L 2.386719 -4.800781 L 2.386719 -4.167969 L 1.566406 -4.167969 L 1.566406 -1.359375 C 1.5625 -1.125 1.578125 -0.972656 1.609375 -0.910156 C 1.636719 -0.839844 1.683594 -0.789063 1.75 -0.753906 C 1.8125 -0.710938 1.90625 -0.691406 2.027344 -0.695313 C 2.117188 -0.691406 2.234375 -0.703125 2.386719 -0.726563 Z M 2.386719 -0.726563 "
-           id="path12520" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph9-0">
-        <path
-           style="stroke:none;"
-           d="M 3.820313 -1.796875 L 4.636719 -1.6875 C 4.542969 -1.125 4.316406 -0.683594 3.953125 -0.367188 C 3.582031 -0.0507813 3.128906 0.105469 2.597656 0.109375 C 1.925781 0.105469 1.386719 -0.109375 0.980469 -0.546875 C 0.570313 -0.980469 0.367188 -1.609375 0.371094 -2.433594 C 0.367188 -2.957031 0.457031 -3.421875 0.632813 -3.820313 C 0.804688 -4.214844 1.070313 -4.511719 1.429688 -4.710938 C 1.789063 -4.910156 2.179688 -5.007813 2.601563 -5.011719 C 3.132813 -5.007813 3.566406 -4.875 3.90625 -4.605469 C 4.242188 -4.335938 4.460938 -3.953125 4.558594 -3.460938 L 3.75 -3.335938 C 3.671875 -3.660156 3.535156 -3.910156 3.34375 -4.078125 C 3.144531 -4.242188 2.90625 -4.324219 2.632813 -4.328125 C 2.210938 -4.324219 1.871094 -4.175781 1.613281 -3.878906 C 1.347656 -3.578125 1.21875 -3.101563 1.222656 -2.453125 C 1.21875 -1.792969 1.34375 -1.316406 1.601563 -1.019531 C 1.851563 -0.71875 2.183594 -0.566406 2.589844 -0.570313 C 2.914063 -0.566406 3.183594 -0.667969 3.40625 -0.871094 C 3.621094 -1.070313 3.761719 -1.378906 3.820313 -1.796875 Z M 3.820313 -1.796875 "
-           id="path12523" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph9-1">
-        <path
-           style="stroke:none;"
-           d="M 0.613281 0 L 0.613281 -4.898438 L 1.359375 -4.898438 L 1.359375 -4.15625 C 1.550781 -4.5 1.726563 -4.730469 1.890625 -4.84375 C 2.050781 -4.953125 2.226563 -5.007813 2.421875 -5.011719 C 2.699219 -5.007813 2.984375 -4.917969 3.277344 -4.742188 L 2.988281 -3.972656 C 2.785156 -4.089844 2.582031 -4.148438 2.382813 -4.152344 C 2.199219 -4.148438 2.035156 -4.09375 1.890625 -3.988281 C 1.746094 -3.875 1.644531 -3.722656 1.582031 -3.535156 C 1.488281 -3.234375 1.441406 -2.914063 1.445313 -2.566406 L 1.445313 0 Z M 0.613281 0 "
-           id="path12526" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph9-2">
-        <path
-           style="stroke:none;"
-           d="M 3.976563 -1.578125 L 4.835938 -1.472656 C 4.695313 -0.964844 4.445313 -0.578125 4.082031 -0.304688 C 3.714844 -0.03125 3.246094 0.105469 2.679688 0.109375 C 1.960938 0.105469 1.394531 -0.113281 0.976563 -0.550781 C 0.554688 -0.992188 0.34375 -1.609375 0.347656 -2.410156 C 0.34375 -3.230469 0.558594 -3.871094 0.984375 -4.328125 C 1.40625 -4.78125 1.953125 -5.007813 2.632813 -5.011719 C 3.285156 -5.007813 3.820313 -4.785156 4.238281 -4.339844 C 4.652344 -3.890625 4.863281 -3.265625 4.863281 -2.460938 C 4.863281 -2.410156 4.859375 -2.335938 4.859375 -2.238281 L 1.203125 -2.238281 C 1.230469 -1.699219 1.382813 -1.285156 1.660156 -1 C 1.929688 -0.710938 2.273438 -0.566406 2.683594 -0.570313 C 2.988281 -0.566406 3.246094 -0.648438 3.464844 -0.808594 C 3.675781 -0.96875 3.847656 -1.222656 3.976563 -1.578125 Z M 1.25 -2.921875 L 3.988281 -2.921875 C 3.945313 -3.332031 3.839844 -3.640625 3.671875 -3.847656 C 3.40625 -4.164063 3.0625 -4.324219 2.644531 -4.328125 C 2.257813 -4.324219 1.9375 -4.195313 1.679688 -3.945313 C 1.417969 -3.6875 1.273438 -3.347656 1.25 -2.921875 Z M 1.25 -2.921875 "
-           id="path12529" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph9-3">
-        <path
-           style="stroke:none;"
-           d="M 3.820313 -0.605469 C 3.511719 -0.339844 3.214844 -0.15625 2.929688 -0.0507813 C 2.644531 0.0546875 2.339844 0.105469 2.015625 0.109375 C 1.476563 0.105469 1.0625 -0.0234375 0.773438 -0.285156 C 0.484375 -0.546875 0.339844 -0.882813 0.339844 -1.292969 C 0.339844 -1.53125 0.394531 -1.75 0.503906 -1.949219 C 0.613281 -2.144531 0.753906 -2.304688 0.933594 -2.425781 C 1.105469 -2.542969 1.304688 -2.632813 1.53125 -2.699219 C 1.691406 -2.742188 1.9375 -2.785156 2.269531 -2.824219 C 2.9375 -2.902344 3.433594 -2.996094 3.75 -3.109375 C 3.753906 -3.222656 3.753906 -3.296875 3.757813 -3.328125 C 3.753906 -3.664063 3.675781 -3.902344 3.519531 -4.042969 C 3.304688 -4.230469 2.988281 -4.324219 2.574219 -4.324219 C 2.179688 -4.324219 1.890625 -4.253906 1.707031 -4.117188 C 1.519531 -3.980469 1.382813 -3.738281 1.296875 -3.390625 L 0.484375 -3.503906 C 0.554688 -3.847656 0.675781 -4.125 0.847656 -4.34375 C 1.015625 -4.554688 1.261719 -4.71875 1.582031 -4.835938 C 1.902344 -4.949219 2.273438 -5.007813 2.695313 -5.011719 C 3.113281 -5.007813 3.453125 -4.960938 3.714844 -4.863281 C 3.972656 -4.761719 4.164063 -4.636719 4.289063 -4.492188 C 4.414063 -4.339844 4.5 -4.152344 4.550781 -3.925781 C 4.574219 -3.785156 4.585938 -3.527344 4.589844 -3.160156 L 4.589844 -2.054688 C 4.585938 -1.277344 4.605469 -0.789063 4.644531 -0.585938 C 4.679688 -0.382813 4.75 -0.1875 4.855469 0 L 3.988281 0 C 3.898438 -0.171875 3.84375 -0.371094 3.820313 -0.605469 Z M 3.75 -2.460938 C 3.449219 -2.335938 2.996094 -2.230469 2.394531 -2.144531 C 2.050781 -2.09375 1.808594 -2.035156 1.667969 -1.976563 C 1.523438 -1.914063 1.414063 -1.824219 1.339844 -1.707031 C 1.261719 -1.585938 1.222656 -1.457031 1.226563 -1.316406 C 1.222656 -1.09375 1.308594 -0.910156 1.476563 -0.761719 C 1.644531 -0.613281 1.890625 -0.539063 2.214844 -0.539063 C 2.535156 -0.539063 2.816406 -0.609375 3.066406 -0.75 C 3.3125 -0.890625 3.496094 -1.082031 3.617188 -1.324219 C 3.703125 -1.511719 3.75 -1.789063 3.75 -2.15625 Z 
M 3.75 -2.460938 "
-           id="path12532" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph9-4">
-        <path
-           style="stroke:none;"
-           d="M 2.4375 -0.742188 L 2.554688 -0.0078125 C 2.320313 0.0429688 2.109375 0.0625 1.929688 0.0664063 C 1.625 0.0625 1.390625 0.0195313 1.226563 -0.078125 C 1.054688 -0.171875 0.9375 -0.296875 0.875 -0.453125 C 0.804688 -0.605469 0.773438 -0.933594 0.773438 -1.433594 L 0.773438 -4.253906 L 0.167969 -4.253906 L 0.167969 -4.898438 L 0.773438 -4.898438 L 0.773438 -6.113281 L 1.601563 -6.613281 L 1.601563 -4.898438 L 2.4375 -4.898438 L 2.4375 -4.253906 L 1.601563 -4.253906 L 1.601563 -1.390625 C 1.597656 -1.148438 1.613281 -0.996094 1.644531 -0.933594 C 1.671875 -0.863281 1.71875 -0.808594 1.785156 -0.769531 C 1.851563 -0.726563 1.945313 -0.707031 2.070313 -0.710938 C 2.160156 -0.707031 2.285156 -0.71875 2.4375 -0.742188 Z M 2.4375 -0.742188 "
-           id="path12535" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph9-5">
-        <path
-           style="stroke:none;"
-           d="M 0.0703125 0 L 1.859375 -2.546875 L 0.203125 -4.898438 L 1.242188 -4.898438 L 1.992188 -3.75 C 2.132813 -3.53125 2.246094 -3.347656 2.335938 -3.203125 C 2.46875 -3.402344 2.59375 -3.582031 2.707031 -3.742188 L 3.535156 -4.898438 L 4.527344 -4.898438 L 2.832031 -2.59375 L 4.65625 0 L 3.636719 0 L 2.628906 -1.523438 L 2.363281 -1.933594 L 1.074219 0 Z M 0.0703125 0 "
-           id="path12538" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph9-6">
-        <path
-           style="stroke:none;"
-           d="M 3.835938 0 L 3.835938 -0.71875 C 3.449219 -0.167969 2.929688 0.105469 2.28125 0.109375 C 1.988281 0.105469 1.71875 0.0546875 1.46875 -0.0546875 C 1.214844 -0.164063 1.027344 -0.304688 0.910156 -0.472656 C 0.785156 -0.636719 0.703125 -0.84375 0.65625 -1.089844 C 0.621094 -1.25 0.601563 -1.507813 0.605469 -1.863281 L 0.605469 -4.898438 L 1.433594 -4.898438 L 1.433594 -2.183594 C 1.429688 -1.746094 1.445313 -1.453125 1.484375 -1.304688 C 1.53125 -1.082031 1.644531 -0.910156 1.816406 -0.789063 C 1.984375 -0.664063 2.191406 -0.601563 2.445313 -0.605469 C 2.691406 -0.601563 2.925781 -0.667969 3.144531 -0.796875 C 3.359375 -0.921875 3.515625 -1.09375 3.609375 -1.316406 C 3.699219 -1.535156 3.742188 -1.855469 3.746094 -2.273438 L 3.746094 -4.898438 L 4.578125 -4.898438 L 4.578125 0 Z M 3.835938 0 "
-           id="path12541" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph9-7">
-        <path
-           style="stroke:none;"
-           d="M 0.289063 -1.460938 L 1.113281 -1.59375 C 1.15625 -1.261719 1.28125 -1.007813 1.496094 -0.832031 C 1.703125 -0.65625 2 -0.566406 2.382813 -0.570313 C 2.761719 -0.566406 3.042969 -0.644531 3.230469 -0.804688 C 3.410156 -0.957031 3.503906 -1.140625 3.507813 -1.351563 C 3.503906 -1.535156 3.421875 -1.683594 3.261719 -1.796875 C 3.144531 -1.867188 2.863281 -1.960938 2.414063 -2.078125 C 1.804688 -2.230469 1.378906 -2.363281 1.144531 -2.476563 C 0.90625 -2.585938 0.730469 -2.742188 0.609375 -2.941406 C 0.488281 -3.136719 0.425781 -3.355469 0.429688 -3.597656 C 0.425781 -3.816406 0.476563 -4.015625 0.578125 -4.203125 C 0.675781 -4.386719 0.8125 -4.542969 0.988281 -4.667969 C 1.113281 -4.761719 1.289063 -4.84375 1.515625 -4.910156 C 1.738281 -4.976563 1.976563 -5.007813 2.234375 -5.011719 C 2.613281 -5.007813 2.953125 -4.953125 3.246094 -4.84375 C 3.535156 -4.730469 3.75 -4.582031 3.890625 -4.394531 C 4.027344 -4.203125 4.121094 -3.949219 4.175781 -3.636719 L 3.363281 -3.523438 C 3.324219 -3.773438 3.214844 -3.972656 3.042969 -4.113281 C 2.863281 -4.253906 2.613281 -4.324219 2.292969 -4.328125 C 1.910156 -4.324219 1.640625 -4.261719 1.476563 -4.136719 C 1.3125 -4.011719 1.230469 -3.863281 1.230469 -3.695313 C 1.230469 -3.585938 1.261719 -3.488281 1.332031 -3.40625 C 1.394531 -3.3125 1.503906 -3.238281 1.652344 -3.183594 C 1.734375 -3.148438 1.976563 -3.078125 2.386719 -2.972656 C 2.96875 -2.808594 3.378906 -2.679688 3.613281 -2.585938 C 3.84375 -2.484375 4.027344 -2.339844 4.160156 -2.148438 C 4.292969 -1.957031 4.359375 -1.722656 4.359375 -1.441406 C 4.359375 -1.160156 4.277344 -0.898438 4.117188 -0.65625 C 3.953125 -0.410156 3.71875 -0.21875 3.417969 -0.0898438 C 3.109375 0.0429688 2.765625 0.105469 2.386719 0.109375 C 1.746094 0.105469 1.261719 -0.0234375 0.929688 -0.285156 C 0.59375 -0.546875 0.378906 -0.9375 0.289063 -1.460938 Z M 0.289063 -1.460938 "
-           id="path12544" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph10-0">
-        <path
-           style="stroke:none;"
-           d="M 3.863281 -1.53125 L 4.699219 -1.429688 C 4.566406 -0.941406 4.320313 -0.5625 3.964844 -0.292969 C 3.609375 -0.0273438 3.15625 0.105469 2.605469 0.109375 C 1.90625 0.105469 1.351563 -0.105469 0.945313 -0.535156 C 0.535156 -0.960938 0.332031 -1.5625 0.335938 -2.339844 C 0.332031 -3.136719 0.539063 -3.757813 0.953125 -4.203125 C 1.363281 -4.640625 1.898438 -4.863281 2.558594 -4.867188 C 3.195313 -4.863281 3.714844 -4.644531 4.121094 -4.214844 C 4.519531 -3.777344 4.722656 -3.171875 4.726563 -2.390625 C 4.722656 -2.339844 4.71875 -2.265625 4.71875 -2.175781 L 1.171875 -2.175781 C 1.199219 -1.648438 1.34375 -1.25 1.613281 -0.972656 C 1.875 -0.691406 2.207031 -0.550781 2.609375 -0.554688 C 2.898438 -0.550781 3.152344 -0.628906 3.363281 -0.789063 C 3.574219 -0.941406 3.738281 -1.1875 3.863281 -1.53125 Z M 1.214844 -2.835938 L 3.871094 -2.835938 C 3.835938 -3.234375 3.734375 -3.535156 3.566406 -3.738281 C 3.308594 -4.046875 2.976563 -4.199219 2.570313 -4.203125 C 2.195313 -4.199219 1.882813 -4.078125 1.632813 -3.832031 C 1.378906 -3.582031 1.238281 -3.25 1.214844 -2.835938 Z M 1.214844 -2.835938 "
-           id="path12547" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph10-1">
-        <path
-           style="stroke:none;"
-           d="M 0.605469 0 L 0.605469 -4.761719 L 1.332031 -4.761719 L 1.332031 -4.082031 C 1.675781 -4.601563 2.183594 -4.863281 2.847656 -4.867188 C 3.132813 -4.863281 3.394531 -4.8125 3.636719 -4.710938 C 3.875 -4.609375 4.054688 -4.472656 4.175781 -4.308594 C 4.292969 -4.136719 4.378906 -3.9375 4.429688 -3.710938 C 4.457031 -3.558594 4.46875 -3.296875 4.472656 -2.925781 L 4.472656 0 L 3.667969 0 L 3.667969 -2.894531 C 3.667969 -3.222656 3.636719 -3.46875 3.574219 -3.632813 C 3.511719 -3.796875 3.398438 -3.925781 3.238281 -4.023438 C 3.078125 -4.117188 2.890625 -4.164063 2.675781 -4.167969 C 2.328125 -4.164063 2.03125 -4.054688 1.785156 -3.839844 C 1.53125 -3.621094 1.40625 -3.210938 1.410156 -2.601563 L 1.410156 0 Z M 0.605469 0 "
-           id="path12550" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph10-2">
-        <path
-           style="stroke:none;"
-           d="M 3.640625 1.824219 L 3.640625 -0.507813 C 3.511719 -0.328125 3.335938 -0.179688 3.113281 -0.0664063 C 2.882813 0.0507813 2.644531 0.105469 2.394531 0.109375 C 1.828125 0.105469 1.339844 -0.117188 0.933594 -0.566406 C 0.523438 -1.015625 0.320313 -1.636719 0.324219 -2.425781 C 0.320313 -2.902344 0.402344 -3.328125 0.570313 -3.710938 C 0.734375 -4.085938 0.976563 -4.375 1.292969 -4.574219 C 1.605469 -4.765625 1.953125 -4.863281 2.332031 -4.867188 C 2.917969 -4.863281 3.378906 -4.617188 3.71875 -4.125 L 3.71875 -4.761719 L 4.445313 -4.761719 L 4.445313 1.824219 Z M 1.152344 -2.394531 C 1.148438 -1.78125 1.277344 -1.320313 1.535156 -1.015625 C 1.792969 -0.703125 2.101563 -0.550781 2.460938 -0.554688 C 2.804688 -0.550781 3.097656 -0.695313 3.347656 -0.992188 C 3.589844 -1.28125 3.714844 -1.726563 3.71875 -2.320313 C 3.714844 -2.953125 3.585938 -3.429688 3.328125 -3.75 C 3.066406 -4.070313 2.757813 -4.230469 2.40625 -4.230469 C 2.054688 -4.230469 1.757813 -4.082031 1.515625 -3.785156 C 1.269531 -3.488281 1.148438 -3.023438 1.152344 -2.394531 Z M 1.152344 -2.394531 "
-           id="path12553" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph10-3">
-        <path
-           style="stroke:none;"
-           d="M 3.726563 0 L 3.726563 -0.699219 C 3.351563 -0.160156 2.847656 0.105469 2.214844 0.109375 C 1.933594 0.105469 1.667969 0.0546875 1.425781 -0.0507813 C 1.179688 -0.15625 1 -0.292969 0.882813 -0.457031 C 0.761719 -0.621094 0.679688 -0.820313 0.636719 -1.058594 C 0.601563 -1.214844 0.585938 -1.464844 0.585938 -1.8125 L 0.585938 -4.761719 L 1.394531 -4.761719 L 1.394531 -2.121094 C 1.394531 -1.699219 1.410156 -1.414063 1.445313 -1.269531 C 1.492188 -1.054688 1.597656 -0.886719 1.765625 -0.765625 C 1.925781 -0.644531 2.128906 -0.585938 2.375 -0.585938 C 2.613281 -0.585938 2.839844 -0.648438 3.054688 -0.773438 C 3.265625 -0.898438 3.417969 -1.066406 3.507813 -1.28125 C 3.59375 -1.492188 3.636719 -1.800781 3.640625 -2.210938 L 3.640625 -4.761719 L 4.445313 -4.761719 L 4.445313 0 Z M 3.726563 0 "
-           id="path12556" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph11-0">
-        <path
-           style="stroke:none;"
-           d="M 4.890625 -2.425781 L 6.179688 -2.015625 C 5.980469 -1.292969 5.652344 -0.757813 5.191406 -0.410156 C 4.730469 -0.0625 4.144531 0.109375 3.441406 0.113281 C 2.5625 0.109375 1.84375 -0.1875 1.28125 -0.785156 C 0.714844 -1.382813 0.433594 -2.199219 0.4375 -3.238281 C 0.433594 -4.332031 0.714844 -5.183594 1.285156 -5.792969 C 1.847656 -6.398438 2.59375 -6.703125 3.523438 -6.707031 C 4.324219 -6.703125 4.980469 -6.464844 5.488281 -5.992188 C 5.785156 -5.707031 6.007813 -5.300781 6.160156 -4.777344 L 4.84375 -4.460938 C 4.761719 -4.796875 4.597656 -5.066406 4.355469 -5.269531 C 4.105469 -5.464844 3.804688 -5.566406 3.453125 -5.566406 C 2.960938 -5.566406 2.566406 -5.390625 2.265625 -5.039063 C 1.957031 -4.6875 1.804688 -4.117188 1.808594 -3.335938 C 1.804688 -2.5 1.953125 -1.90625 2.257813 -1.554688 C 2.554688 -1.199219 2.945313 -1.023438 3.425781 -1.027344 C 3.777344 -1.023438 4.082031 -1.136719 4.339844 -1.363281 C 4.59375 -1.585938 4.777344 -1.941406 4.890625 -2.425781 Z M 4.890625 -2.425781 "
-           id="path12559" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph11-1">
-        <path
-           style="stroke:none;"
-           d="M 0.367188 -2.457031 C 0.363281 -2.871094 0.464844 -3.277344 0.675781 -3.675781 C 0.878906 -4.066406 1.175781 -4.367188 1.558594 -4.574219 C 1.9375 -4.777344 2.359375 -4.878906 2.828125 -4.882813 C 3.550781 -4.878906 4.144531 -4.644531 4.605469 -4.179688 C 5.066406 -3.707031 5.296875 -3.117188 5.296875 -2.402344 C 5.296875 -1.675781 5.0625 -1.074219 4.597656 -0.601563 C 4.128906 -0.128906 3.542969 0.105469 2.839844 0.109375 C 2.398438 0.105469 1.980469 0.0078125 1.582031 -0.1875 C 1.183594 -0.382813 0.878906 -0.671875 0.675781 -1.058594 C 0.464844 -1.4375 0.363281 -1.902344 0.367188 -2.457031 Z M 1.664063 -2.386719 C 1.664063 -1.910156 1.773438 -1.546875 2 -1.296875 C 2.222656 -1.042969 2.5 -0.917969 2.832031 -0.921875 C 3.160156 -0.917969 3.4375 -1.042969 3.664063 -1.296875 C 3.882813 -1.546875 3.996094 -1.914063 4 -2.398438 C 3.996094 -2.859375 3.882813 -3.21875 3.664063 -3.476563 C 3.4375 -3.726563 3.160156 -3.855469 2.832031 -3.855469 C 2.5 -3.855469 2.222656 -3.726563 2 -3.476563 C 1.773438 -3.21875 1.664063 -2.855469 1.664063 -2.386719 Z M 1.664063 -2.386719 "
-           id="path12562" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph11-2">
-        <path
-           style="stroke:none;"
-           d="M 0.625 -4.777344 L 1.804688 -4.777344 L 1.804688 -4.074219 C 1.953125 -4.308594 2.15625 -4.503906 2.421875 -4.65625 C 2.679688 -4.804688 2.972656 -4.878906 3.292969 -4.882813 C 3.847656 -4.878906 4.316406 -4.664063 4.703125 -4.230469 C 5.089844 -3.796875 5.285156 -3.191406 5.285156 -2.414063 C 5.285156 -1.617188 5.089844 -0.996094 4.699219 -0.554688 C 4.308594 -0.113281 3.835938 0.105469 3.285156 0.109375 C 3.015625 0.105469 2.777344 0.0546875 2.566406 -0.046875 C 2.351563 -0.152344 2.125 -0.332031 1.890625 -0.589844 L 1.890625 1.816406 L 0.625 1.816406 Z M 1.875 -2.46875 C 1.875 -1.929688 1.980469 -1.535156 2.195313 -1.277344 C 2.40625 -1.019531 2.664063 -0.890625 2.972656 -0.894531 C 3.261719 -0.890625 3.507813 -1.007813 3.707031 -1.246094 C 3.898438 -1.480469 3.996094 -1.867188 4 -2.40625 C 3.996094 -2.902344 3.894531 -3.273438 3.695313 -3.519531 C 3.492188 -3.761719 3.242188 -3.882813 2.949219 -3.886719 C 2.636719 -3.882813 2.382813 -3.765625 2.179688 -3.527344 C 1.976563 -3.289063 1.875 -2.933594 1.875 -2.46875 Z M 1.875 -2.46875 "
-           id="path12565" />
-      </symbol>
-      <symbol
-         overflow="visible"
-         id="glyph11-3">
-        <path
-           style="stroke:none;"
-           d="M 0.0625 -4.777344 L 1.40625 -4.777344 L 2.550781 -1.386719 L 3.664063 -4.777344 L 4.972656 -4.777344 L 3.289063 -0.179688 L 2.988281 0.652344 C 2.875 0.925781 2.765625 1.140625 2.667969 1.289063 C 2.566406 1.4375 2.453125 1.554688 2.324219 1.648438 C 2.191406 1.734375 2.03125 1.804688 1.839844 1.859375 C 1.648438 1.90625 1.433594 1.933594 1.195313 1.9375 C 0.953125 1.933594 0.714844 1.910156 0.480469 1.863281 L 0.367188 0.871094 C 0.5625 0.90625 0.742188 0.925781 0.902344 0.929688 C 1.195313 0.925781 1.410156 0.839844 1.554688 0.671875 C 1.695313 0.496094 1.804688 0.277344 1.878906 0.0117188 Z M 0.0625 -4.777344 "
-           id="path12568" />
-      </symbol>
-    </g>
-    <image
-       id="image5"
-       width="268"
-       height="65"
-       xlink:href="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAQwAAABBCAYAAAAkPZqVAAAAAXNSR0IArs4c6QAAAARnQU1BAACxjwv8YQUAAAAJcEhZcwAADsMAAA7DAcdvqGQAAAGzSURBVHhe7dhfSxRxHMXhWf+VIWqBkHmTmXaRmRUW0ft/X53vEuKFxLmfZ+Bhl52d2w/nN8uTaxO7sQ/wxHRh+vB4zQ8v4ihO4jVAnMZ04SB2YluOicXcfBfv4wOweldxGRdxHLM2tutiCjKxuI7b+AKs3n3cxU2cxQyLZS/mGDLL4nN8jx/A6v2Mh5h4nMdhbGfGHEdmgsyN5x4E1udXTDS+xpxAXoVgAM8SDKAmGEBNMICaYAA1wQBqggHUBAOoCQZQEwygJhhATTCAmmAANcEAaoIB1AQDqAkGUBMMoCYYQE0wgJpgADXBAGqCAdQEA6gJBlATDKAmGEBNMICaYAA1wQBqggHUBAOoCQZQEwygJhhATTCAmmAANcEAaoIB1AQDqAkGUBMMoCYYQE0wgJpgADXBAGqCAdQEA6gJBlATDKAmGEBNMICaYAA1wQBq/w3GVdzH/AngT/yO6cJjMPbiNC7jLh5iqgKs28RiPm/jbbyMZTeO4iI+xURjJgiwbt9iXlN8jDdxEMvm35fjOIvzmPkBMMtiYnEYMy62107Mu4yZHHNjzioA04MZFInFsvkLxj+4bT15eQkAAAAASUVORK5CYII=" />
-    <pattern
-       id="pattern0"
-       patternUnits="userSpaceOnUse"
-       width="268"
-       height="65"
-       patternTransform="matrix(0.70875,0,0,0.70875,130.055618,14.006742)">
-      <use
-         xlink:href="#image5"
-         id="use12574" />
-    </pattern>
-    <image
-       id="image8"
-       width="90"
-       height="66"
-       xlink:href="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAFoAAABCCAYAAADXLcH0AAAAAXNSR0IArs4c6QAAAARnQU1BAACxjwv8YQUAAAAJcEhZcwAADsMAAA7DAcdvqGQAAAGKSURBVHhe7dxZK4ZhFEbhzzwkUynDidmBmZD8///lXm8RpejFfbR2XYmPk2X3HO7Jh5mKmZjTn6AlTT8NP1yIlViLDf3KetByPqZjGKoTmV/Yjf041GhHcRB7sRps9zBsM/WJfBIXcanRbuIqTmMrWOJhZoPngk0+j7u412iP8RBE34mlGIbV5tlg7fnwqz/Wzz0Fsa+DV2I5hjH03zJ0iaFLDF1i6BJDlxi6xNAlhi4xdImhSwxdYugSQ5cYusTQJYYuMXSJoUsMXWLoEkOXGLrE0CWGLjF0iaFLDF1i6BJDlxi6xNAlhi4xdImhSwxdYugSQ5cYusTQJYYuMXSJoUsMXWLoEkOXGLrE0CWGLjF0iaFLDF1i6BJDl3wbmguEHMfjFzXeSzwHLT+F5sAgtzQ588gFQo7j8R/ROETmKxcxt2Mxhnk7mcktzbMgNmuvcW6DJ/g4NoNDsMNwBJZvOFjKLU3OPLLyGo9NJjLnMlnk9+H0Lm81a86HvCsaj4Ys75c3pJ1/m8nkFS3LHh6rv6Y1AAAAAElFTkSuQmCC" />
-    <pattern
-       id="pattern1"
-       patternUnits="userSpaceOnUse"
-       width="90"
-       height="66"
-       patternTransform="matrix(0.70875,0,0,0.70875,13.820836,13.997812)">
-      <use
-         xlink:href="#image8"
-         id="use12578" />
-    </pattern>
-    <image
-       id="image11"
-       width="66"
-       height="66"
-       xlink:href="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAEIAAABCCAYAAADjVADoAAAAAXNSR0IArs4c6QAAAARnQU1BAACxjwv8YQUAAAAJcEhZcwAADsMAAA7DAcdvqGQAAAEWSURBVHhe7dw5TsVAAARRswuEWAICIGERJKwCAfe/Gd2jCozECebXkzr5S+DSOLSXla1sJ9vbkPVae81/9MOD7Dg7zc4n31nWa93PtrOhVRqhP7jKbrK7iXef3WbX2UnW0zH0NLROIzxkz9nLxHvPXrPH7CLrIRh2s94OPQlP2Uf2OfG+s6+sUS6zw2zo0eht0WPTL//780z7yRrjLetdcJQNhoAhYAgYAoaAIWAIGAKGgCFgCBgChoAhYAgYAoaAIWAIGAKGgCFgCBgChoAhYAgYAoaAIWAIGAKGgCFgCBgChoAhYAgYAoaAIWAIGAKGgCFgCBgChoAhYAgYAoaAIWAI+AAsfCQaPiS/4msTVjb4RRrL8gtFQfl7BWWr8AAAAABJRU5ErkJggg==" />
-    <pattern
-       id="pattern2"
-       patternUnits="userSpaceOnUse"
-       width="66"
-       height="66"
-       patternTransform="matrix(0.70875,0,0,0.70875,80.443832,13.820624)">
-      <use
-         xlink:href="#image11"
-         id="use12582" />
-    </pattern>
-  </defs>
-  <g
-     id="surface1">
-    <path
-       style="fill-rule:evenodd;fill:rgb(93.333333%,48.627451%,19.215686%);fill-opacity:1;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(93.333333%,48.627451%,19.215686%);stroke-opacity:1;stroke-miterlimit:10;"
-       d="M 83.630887 37.874542 L 83.630887 -0.000236821 L -0.0000645875 -0.000236821 L -0.0000645875 37.874542 Z M 83.630887 37.874542 "
-       transform="matrix(0.70875,0,0,0.70875,193.828171,115.57048)"
-       id="path12587" />
-    <g
-       style="fill:rgb(100%,100%,100%);fill-opacity:1;"
-       id="g12601">
-      <use
-         xlink:href="#glyph0-0"
-         x="207.664063"
-         y="132.324219"
-         id="use12589" />
-      <use
-         xlink:href="#glyph0-1"
-         x="214.48855"
-         y="132.324219"
-         id="use12591" />
-      <use
-         xlink:href="#glyph0-2"
-         x="219.74419"
-         y="132.324219"
-         id="use12593" />
-      <use
-         xlink:href="#glyph0-3"
-         x="224.999829"
-         y="132.324219"
-         id="use12595" />
-      <use
-         xlink:href="#glyph0-4"
-         x="227.625342"
-         y="132.324219"
-         id="use12597" />
-      <use
-         xlink:href="#glyph0-1"
-         x="232.880982"
-         y="132.324219"
-         id="use12599" />
-    </g>
-    <path
-       style="fill-rule:evenodd;fill:rgb(93.333333%,48.627451%,19.215686%);fill-opacity:1;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(93.333333%,48.627451%,19.215686%);stroke-opacity:1;stroke-miterlimit:10;"
-       d="M 83.631123 37.7768 L 83.631123 0.00122716 L 0.000172233 0.00122716 L 0.000172233 37.7768 Z M 83.631123 37.7768 "
-       transform="matrix(0.70875,0,0,0.70875,160.652222,164.651474)"
-       id="path12603" />
-    <path
-       style="fill-rule:evenodd;fill:rgb(93.333333%,48.627451%,19.215686%);fill-opacity:1;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(93.333333%,48.627451%,19.215686%);stroke-opacity:1;stroke-miterlimit:10;"
-       d="M 83.335291 37.740028 L 83.335291 -0.00247585 L 0.00195915 -0.00247585 L 0.00195915 37.740028 Z M 83.335291 37.740028 "
-       transform="matrix(0.70875,0,0,0.70875,194.787674,222.779099)"
-       id="path12619" />
-    <g
-       style="fill:rgb(100%,100%,100%);fill-opacity:1;"
-       id="g12631">
-      <use
-         xlink:href="#glyph1-0"
-         x="211.207031"
-         y="239.34375"
-         id="use12621" />
-      <use
-         xlink:href="#glyph1-1"
-         x="217.425646"
-         y="239.34375"
-         id="use12623" />
-      <use
-         xlink:href="#glyph1-2"
-         x="222.61086"
-         y="239.34375"
-         id="use12625" />
-      <use
-         xlink:href="#glyph1-3"
-         x="227.796075"
-         y="239.34375"
-         id="use12627" />
-      <use
-         xlink:href="#glyph1-4"
-         x="233.491161"
-         y="239.34375"
-         id="use12629" />
-    </g>
-    <path
-       style="fill-rule:evenodd;fill:rgb(93.333333%,48.627451%,19.215686%);fill-opacity:1;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(93.333333%,48.627451%,19.215686%);stroke-opacity:1;stroke-miterlimit:10;"
-       d="M 83.34048 37.998114 L 83.34048 0.00208295 L 0.00163622 0.00208295 L 0.00163622 37.998114 Z M 83.34048 37.998114 "
-       transform="matrix(0.70875,0,0,0.70875,193.932434,61.838367)"
-       id="path12633" />
-    <g
-       style="fill:rgb(100%,100%,100%);fill-opacity:1;"
-       id="g12651">
-      <use
-         xlink:href="#glyph0-10"
-         x="204.121094"
-         y="78.457031"
-         id="use12635" />
-      <use
-         xlink:href="#glyph0-11"
-         x="210.42417"
-         y="78.457031"
-         id="use12637" />
-      <use
-         xlink:href="#glyph0-8"
-         x="213.049683"
-         y="78.457031"
-         id="use12639" />
-      <use
-         xlink:href="#glyph0-6"
-         x="218.305322"
-         y="78.457031"
-         id="use12641" />
-      <use
-         xlink:href="#glyph0-12"
-         x="221.452246"
-         y="78.457031"
-         id="use12643" />
-      <use
-         xlink:href="#glyph0-13"
-         x="224.59917"
-         y="78.457031"
-         id="use12645" />
-      <use
-         xlink:href="#glyph0-7"
-         x="230.371607"
-         y="78.457031"
-         id="use12647" />
-      <use
-         xlink:href="#glyph0-9"
-         x="234.04917"
-         y="78.457031"
-         id="use12649" />
-    </g>
-    <path
-       style="fill-rule:evenodd;fill:rgb(86.666667%,44.313725%,58.431373%);fill-opacity:1;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(86.666667%,44.313725%,58.431373%);stroke-opacity:1;stroke-miterlimit:10;"
-       d="M 292.748914 84.001221 L 292.74892 0.00100168 L 0.00198606 0.000979577 L 0.00197972 84.001199 Z M 292.748914 84.001221 "
-       transform="matrix(0.0000000535091,0.70875,-0.70875,0.0000000535091,74.418663,61.838436)"
-       id="path12653" />
-    <g
-       style="fill:rgb(100%,100%,100%);fill-opacity:1;"
-       id="g12663">
-      <use
-         xlink:href="#glyph2-0"
-         x="40.765625"
-         y="153.53125"
-         id="use12655" />
-      <use
-         xlink:href="#glyph2-1"
-         x="40.765626"
-         y="161.038186"
-         id="use12657" />
-      <use
-         xlink:href="#glyph2-2"
-         x="40.765626"
-         y="167.387865"
-         id="use12659" />
-      <use
-         xlink:href="#glyph2-3"
-         x="40.765626"
-         y="173.169068"
-         id="use12661" />
-    </g>
-    <path
-       style="fill-rule:evenodd;fill:rgb(93.333333%,48.627451%,19.215686%);fill-opacity:1;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(93.333333%,48.627451%,19.215686%);stroke-opacity:1;stroke-miterlimit:10;"
-       d="M 83.337444 33.6626 L 83.337444 -0.00142092 L -0.0013994 -0.00142092 L -0.0013994 33.6626 Z M 83.337444 33.6626 "
-       transform="matrix(0.70875,0,0,0.70875,131.223648,272.879913)"
-       id="path12665" />
-    <g
-       style="fill:rgb(100%,100%,100%);fill-opacity:1;"
-       id="g12679">
-      <use
-         xlink:href="#glyph0-14"
-         x="145.292969"
-         y="288.246094"
-         id="use12667" />
-      <use
-         xlink:href="#glyph0-1"
-         x="152.117456"
-         y="288.246094"
-         id="use12669" />
-      <use
-         xlink:href="#glyph0-7"
-         x="157.373096"
-         y="288.246094"
-         id="use12671" />
-      <use
-         xlink:href="#glyph0-15"
-         x="161.050659"
-         y="288.246094"
-         id="use12673" />
-      <use
-         xlink:href="#glyph0-1"
-         x="166.823096"
-         y="288.246094"
-         id="use12675" />
-      <use
-         xlink:href="#glyph0-11"
-         x="172.078736"
-         y="288.246094"
-         id="use12677" />
-    </g>
-    <path
-       style="fill-rule:evenodd;fill:rgb(86.666667%,44.313725%,58.431373%);fill-opacity:1;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(86.666667%,44.313725%,58.431373%);stroke-opacity:1;stroke-miterlimit:10;"
-       d="M 112.6557 84.001294 L 112.655707 0.0010742 L 0.00137787 0.00106569 L 0.00137152 84.001285 Z M 112.6557 84.001294 "
-       transform="matrix(0.0000000535091,0.70875,-0.70875,0.0000000535091,74.457787,273.397461)"
-       id="path12681" />
-    <g
-       style="fill:rgb(100%,100%,100%);fill-opacity:1;"
-       id="g12705">
-      <use
-         xlink:href="#glyph3-0"
-         x="41.121094"
-         y="284.222656"
-         id="use12683" />
-      <use
-         xlink:href="#glyph3-1"
-         x="41.121094"
-         y="291.635707"
-         id="use12685" />
-      <use
-         xlink:href="#glyph3-1"
-         x="41.121095"
-         y="297.344608"
-         id="use12687" />
-      <use
-         xlink:href="#glyph3-2"
-         x="41.121095"
-         y="303.053509"
-         id="use12689" />
-      <use
-         xlink:href="#glyph3-3"
-         x="41.121096"
-         y="308.76241"
-         id="use12691" />
-      <use
-         xlink:href="#glyph3-2"
-         x="41.121096"
-         y="311.614355"
-         id="use12693" />
-      <use
-         xlink:href="#glyph3-4"
-         x="41.121096"
-         y="317.323256"
-         id="use12695" />
-      <use
-         xlink:href="#glyph3-5"
-         x="41.121097"
-         y="321.317983"
-         id="use12697" />
-      <use
-         xlink:href="#glyph3-6"
-         x="41.121097"
-         y="327.026884"
-         id="use12699" />
-      <use
-         xlink:href="#glyph3-7"
-         x="41.121097"
-         y="330.445208"
-         id="use12701" />
-      <use
-         xlink:href="#glyph3-4"
-         x="41.121098"
-         y="336.715476"
-         id="use12703" />
-    </g>
-    <path
-       style="fill-rule:evenodd;fill:rgb(93.333333%,48.627451%,19.215686%);fill-opacity:1;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(93.333333%,48.627451%,19.215686%);stroke-opacity:1;stroke-miterlimit:10;"
-       d="M 83.33555 37.596773 L 83.33555 -0.00243279 L 0.0022175 -0.00243279 L 0.0022175 37.596773 Z M 83.33555 37.596773 "
-       transform="matrix(0.70875,0,0,0.70875,131.119522,222.829849)"
-       id="path12707" />
-    <path
-       style=" stroke:none;fill-rule:evenodd;fill:url(#pattern0);"
-       d="M 130.054688 14.007813 L 320 14.007813 L 320 60.074219 L 130.054688 60.074219 Z M 130.054688 14.007813 "
-       id="path12727" />
-    <path
-       style="fill-rule:evenodd;fill:rgb(68.627451%,34.901961%,12.941176%);fill-opacity:1;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(68.627451%,34.901961%,12.941176%);stroke-opacity:1;stroke-miterlimit:10;"
-       d="M 262.999344 59.998423 L 262.999344 0.000628382 L -0.00219597 0.000628382 L -0.00219597 59.998423 Z M 262.999344 59.998423 "
-       transform="matrix(0.70875,0,0,0.70875,131.118744,15.069867)"
-       id="path12729" />
-    <g
-       style="fill:rgb(100%,100%,100%);fill-opacity:1;"
-       id="g12757">
-      <use
-         xlink:href="#glyph5-0"
-         x="186.402344"
-         y="39.765625"
-         id="use12731" />
-      <use
-         xlink:href="#glyph5-1"
-         x="195.06146"
-         y="39.765625"
-         id="use12733" />
-      <use
-         xlink:href="#glyph5-2"
-         x="200.842664"
-         y="39.765625"
-         id="use12735" />
-      <use
-         xlink:href="#glyph5-3"
-         x="203.730728"
-         y="39.765625"
-         id="use12737" />
-      <use
-         xlink:href="#glyph5-4"
-         x="210.080408"
-         y="39.765625"
-         id="use12739" />
-      <use
-         xlink:href="#glyph5-5"
-         x="212.968472"
-         y="39.765625"
-         id="use12741" />
-      <use
-         xlink:href="#glyph5-6"
-         x="220.475408"
-         y="39.765625"
-         id="use12743" />
-      <use
-         xlink:href="#glyph5-3"
-         x="226.825088"
-         y="39.765625"
-         id="use12745" />
-      <use
-         xlink:href="#glyph5-7"
-         x="233.174769"
-         y="39.765625"
-         id="use12747" />
-      <use
-         xlink:href="#glyph5-8"
-         x="238.955972"
-         y="39.765625"
-         id="use12749" />
-      <use
-         xlink:href="#glyph5-9"
-         x="244.737176"
-         y="39.765625"
-         id="use12751" />
-      <use
-         xlink:href="#glyph5-10"
-         x="251.086856"
-         y="39.765625"
-         id="use12753" />
-      <use
-         xlink:href="#glyph5-11"
-         x="254.548473"
-         y="39.765625"
-         id="use12755" />
-    </g>
-    <path
-       style=" stroke:none;fill-rule:evenodd;fill:url(#pattern1);"
-       d="M 13.820313 13.996094 L 76.898438 13.996094 L 76.898438 60.066406 L 13.820313 60.066406 Z M 13.820313 13.996094 "
-       id="path12759" />
-    <path
-       style="fill-rule:evenodd;fill:rgb(63.529412%,31.764706%,42.352941%);fill-opacity:1;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(63.529412%,31.764706%,42.352941%);stroke-opacity:1;stroke-miterlimit:10;"
-       d="M 83.998595 60 L 83.998595 0.00220539 L -0.00162411 0.00220539 L -0.00162411 60 Z M 83.998595 60 "
-       transform="matrix(0.70875,0,0,0.70875,14.883964,15.060937)"
-       id="path12761" />
-    <g
-       style="fill:rgb(100%,100%,100%);fill-opacity:1;"
-       id="g12783">
-      <use
-         xlink:href="#glyph5-12"
-         x="19.136719"
-         y="33.390625"
-         id="use12763" />
-      <use
-         xlink:href="#glyph5-13"
-         x="26.070103"
-         y="33.390625"
-         id="use12765" />
-      <use
-         xlink:href="#glyph5-8"
-         x="31.851306"
-         y="33.390625"
-         id="use12767" />
-      <use
-         xlink:href="#glyph5-7"
-         x="37.63251"
-         y="33.390625"
-         id="use12769" />
-      <use
-         xlink:href="#glyph5-14"
-         x="43.413714"
-         y="33.390625"
-         id="use12771" />
-      <use
-         xlink:href="#glyph5-10"
-         x="49.763394"
-         y="33.390625"
-         id="use12773" />
-      <use
-         xlink:href="#glyph5-2"
-         x="53.22501"
-         y="33.390625"
-         id="use12775" />
-      <use
-         xlink:href="#glyph5-6"
-         x="56.113074"
-         y="33.390625"
-         id="use12777" />
-      <use
-         xlink:href="#glyph5-3"
-         x="62.462754"
-         y="33.390625"
-         id="use12779" />
-      <use
-         xlink:href="#glyph5-4"
-         x="68.812435"
-         y="33.390625"
-         id="use12781" />
-    </g>
-    <g
-       style="fill:rgb(100%,100%,100%);fill-opacity:1;"
-       id="g12797">
-      <use
-         xlink:href="#glyph5-15"
-         x="25.515625"
-         y="46.144531"
-         id="use12785" />
-      <use
-         xlink:href="#glyph5-6"
-         x="33.022561"
-         y="46.144531"
-         id="use12787" />
-      <use
-         xlink:href="#glyph5-16"
-         x="39.372241"
-         y="46.144531"
-         id="use12789" />
-      <use
-         xlink:href="#glyph5-1"
-         x="48.615061"
-         y="46.144531"
-         id="use12791" />
-      <use
-         xlink:href="#glyph5-2"
-         x="54.396265"
-         y="46.144531"
-         id="use12793" />
-      <use
-         xlink:href="#glyph5-3"
-         x="57.284329"
-         y="46.144531"
-         id="use12795" />
-    </g>
-    <path
-       style=" stroke:none;fill-rule:evenodd;fill:url(#pattern2);"
-       d="M 80.445313 13.820313 L 126.511719 13.820313 L 126.511719 60.066406 L 80.445313 60.066406 Z M 80.445313 13.820313 "
-       id="path12799" />
-    <path
-       style="fill-rule:evenodd;fill:rgb(73.72549%,50.196078%,0%);fill-opacity:1;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(73.72549%,50.196078%,0%);stroke-opacity:1;stroke-miterlimit:10;"
-       d="M 59.999 60.249999 L 59.999 -0.0013227 L 0.00120563 -0.0013227 L 0.00120563 60.249999 Z M 59.999 60.249999 "
-       transform="matrix(0.70875,0,0,0.70875,81.506958,14.88375)"
-       id="path12801" />
-    <g
-       style="fill:rgb(100%,100%,100%);fill-opacity:1;"
-       id="g12811">
-      <use
-         xlink:href="#glyph5-5"
-         x="88.59375"
-         y="39.765625"
-         id="use12803" />
-      <use
-         xlink:href="#glyph5-6"
-         x="96.100686"
-         y="39.765625"
-         id="use12805" />
-      <use
-         xlink:href="#glyph5-17"
-         x="102.450366"
-         y="39.765625"
-         id="use12807" />
-      <use
-         xlink:href="#glyph5-8"
-         x="108.800047"
-         y="39.765625"
-         id="use12809" />
-    </g>
-    <path
-       style="fill-rule:evenodd;fill:rgb(88.235294%,60.392157%,0%);fill-opacity:1;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(88.235294%,60.392157%,0%);stroke-opacity:1;stroke-miterlimit:10;"
-       d="M 292.748934 28.997889 L 292.748936 0.00207814 L 0.00200221 0.00205603 L 0.00200002 28.997867 Z M 292.748934 28.997889 "
-       transform="matrix(0.0000000535091,0.70875,-0.70875,0.0000000535091,102.060051,61.838425)"
-       id="path12813" />
-    <g
-       style="fill:rgb(100%,100%,100%);fill-opacity:1;"
-       id="g12827">
-      <use
-         xlink:href="#glyph2-4"
-         x="87.894531"
-         y="148.925781"
-         id="use12815" />
-      <use
-         xlink:href="#glyph2-5"
-         x="87.894532"
-         y="154.706984"
-         id="use12817" />
-      <use
-         xlink:href="#glyph2-6"
-         x="87.894532"
-         y="157.595048"
-         id="use12819" />
-      <use
-         xlink:href="#glyph2-4"
-         x="87.894532"
-         y="163.944728"
-         id="use12821" />
-      <use
-         xlink:href="#glyph2-7"
-         x="87.894533"
-         y="169.725931"
-         id="use12823" />
-      <use
-         xlink:href="#glyph2-4"
-         x="87.894533"
-         y="175.507134"
-         id="use12825" />
-    </g>
-    <path
-       style="fill-rule:evenodd;fill:rgb(88.235294%,60.392157%,0%);fill-opacity:1;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(88.235294%,60.392157%,0%);stroke-opacity:1;stroke-miterlimit:10;"
-       d="M 60.00012 33.802367 L 60.00012 0.000559758 L 0.00232515 0.000559758 L 0.00232515 33.802367 Z M 60.00012 33.802367 "
-       transform="matrix(0.70875,0,0,0.70875,81.545227,273.398041)"
-       id="path12829" />
-    <g
-       style="fill:rgb(100%,100%,100%);fill-opacity:1;"
-       id="g12839">
-      <use
-         xlink:href="#glyph5-18"
-         x="90.011719"
-         y="289.246094"
-         id="use12831" />
-      <use
-         xlink:href="#glyph5-11"
-         x="97.518655"
-         y="289.246094"
-         id="use12833" />
-      <use
-         xlink:href="#glyph5-8"
-         x="103.299859"
-         y="289.246094"
-         id="use12835" />
-      <use
-         xlink:href="#glyph5-19"
-         x="109.081062"
-         y="289.246094"
-         id="use12837" />
-    </g>
-    <path
-       style="fill-rule:evenodd;fill:rgb(88.235294%,60.392157%,0%);fill-opacity:1;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(88.235294%,60.392157%,0%);stroke-opacity:1;stroke-miterlimit:10;"
-       d="M 75.35743 28.947764 L 75.357432 0.00155579 L -0.000818108 0.0015501 L -0.000820293 28.947758 Z M 75.35743 28.947764 "
-       transform="matrix(0.0000000535091,0.70875,-0.70875,0.0000000535091,102.059692,299.832611)"
-       id="path12841" />
-    <g
-       style="fill:rgb(100%,100%,100%);fill-opacity:1;"
-       id="g12855">
-      <use
-         xlink:href="#glyph2-4"
-         x="87.984375"
-         y="309.863281"
-         id="use12843" />
-      <use
-         xlink:href="#glyph2-5"
-         x="87.984375"
-         y="315.644484"
-         id="use12845" />
-      <use
-         xlink:href="#glyph2-6"
-         x="87.984376"
-         y="318.532548"
-         id="use12847" />
-      <use
-         xlink:href="#glyph2-4"
-         x="87.984376"
-         y="324.882228"
-         id="use12849" />
-      <use
-         xlink:href="#glyph2-7"
-         x="87.984377"
-         y="330.663431"
-         id="use12851" />
-      <use
-         xlink:href="#glyph2-4"
-         x="87.984377"
-         y="336.444634"
-         id="use12853" />
-    </g>
-    <path
-       style="fill-rule:evenodd;fill:rgb(93.333333%,48.627451%,19.215686%);fill-opacity:1;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(93.333333%,48.627451%,19.215686%);stroke-opacity:1;stroke-miterlimit:10;"
-       d="M 83.337444 45.905585 L 83.337444 0.000602816 L -0.0013994 0.000602816 L -0.0013994 45.905585 Z M 83.337444 45.905585 "
-       transform="matrix(0.70875,0,0,0.70875,131.223648,320.706604)"
-       id="path12857" />
-    <g
-       style="fill:rgb(100%,100%,100%);fill-opacity:1;"
-       id="g12881">
-      <use
-         xlink:href="#glyph6-0"
-         x="133.953125"
-         y="339.984375"
-         id="use12859" />
-      <use
-         xlink:href="#glyph6-1"
-         x="140.67539"
-         y="339.984375"
-         id="use12861" />
-      <use
-         xlink:href="#glyph6-1"
-         x="145.852306"
-         y="339.984375"
-         id="use12863" />
-      <use
-         xlink:href="#glyph6-2"
-         x="151.029223"
-         y="339.984375"
-         id="use12865" />
-      <use
-         xlink:href="#glyph6-3"
-         x="156.20614"
-         y="339.984375"
-         id="use12867" />
-      <use
-         xlink:href="#glyph6-2"
-         x="158.792325"
-         y="339.984375"
-         id="use12869" />
-      <use
-         xlink:href="#glyph6-4"
-         x="163.969242"
-         y="339.984375"
-         id="use12871" />
-      <use
-         xlink:href="#glyph6-5"
-         x="167.59172"
-         y="339.984375"
-         id="use12873" />
-      <use
-         xlink:href="#glyph6-6"
-         x="172.768637"
-         y="339.984375"
-         id="use12875" />
-      <use
-         xlink:href="#glyph6-7"
-         x="175.868423"
-         y="339.984375"
-         id="use12877" />
-      <use
-         xlink:href="#glyph6-4"
-         x="181.554396"
-         y="339.984375"
-         id="use12879" />
-    </g>
-    <path
-       style="fill-rule:evenodd;fill:rgb(93.333333%,48.627451%,19.215686%);fill-opacity:1;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(93.333333%,48.627451%,19.215686%);stroke-opacity:1;stroke-miterlimit:10;"
-       d="M 84.1047 37.909419 L 84.1047 0.00157163 L -0.000236821 0.00157163 L -0.000236821 37.909419 Z M 84.1047 37.909419 "
-       transform="matrix(0.70875,0,0,0.70875,228.179855,164.604355)"
-       id="path12883" />
-    <g
-       style="fill:rgb(100%,100%,100%);fill-opacity:1;"
-       id="g12897">
-      <use
-         xlink:href="#glyph7-0"
-         x="243.808594"
-         y="181.226563"
-         id="use12885" />
-      <use
-         xlink:href="#glyph7-1"
-         x="250.453744"
-         y="181.226563"
-         id="use12887" />
-      <use
-         xlink:href="#glyph7-2"
-         x="256.07449"
-         y="181.226563"
-         id="use12889" />
-      <use
-         xlink:href="#glyph7-2"
-         x="259.138718"
-         y="181.226563"
-         id="use12891" />
-      <use
-         xlink:href="#glyph7-3"
-         x="262.202946"
-         y="181.226563"
-         id="use12893" />
-      <use
-         xlink:href="#glyph7-4"
-         x="267.320475"
-         y="181.226563"
-         id="use12895" />
-    </g>
-    <path
-       style="fill:none;stroke-width:1.5;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(17.647059%,52.156863%,75.686275%);stroke-opacity:1;stroke-dasharray:6,4;stroke-miterlimit:10;"
-       d="M -0.000688933 0.000172233 L -0.000688933 -37.813981 "
-       transform="matrix(0.70875,0,0,0.70875,223.465332,115.57019)"
-       id="path12899" />
-    <path
-       style="fill:none;stroke-width:1.5;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(17.647059%,52.156863%,75.686275%);stroke-opacity:1;stroke-dasharray:6,4;stroke-miterlimit:10;"
-       d="M 7.500413 -15.002032 L -0.000688933 0.000172233 L -7.501791 -15.002032 "
-       transform="matrix(0.70875,0,0,0.70875,223.465332,115.57019)"
-       id="path12901" />
-    <g
-       style="fill:rgb(22.745098%,32.156863%,41.960784%);fill-opacity:1;"
-       id="g12921">
-      <use
-         xlink:href="#glyph8-0"
-         x="175.0625"
-         y="103.972656"
-         id="use12903" />
-      <use
-         xlink:href="#glyph8-1"
-         x="180.210309"
-         y="103.972656"
-         id="use12905" />
-      <use
-         xlink:href="#glyph8-2"
-         x="185.358118"
-         y="103.972656"
-         id="use12907" />
-      <use
-         xlink:href="#glyph8-3"
-         x="190.505928"
-         y="103.972656"
-         id="use12909" />
-      <use
-         xlink:href="#glyph8-0"
-         x="198.216343"
-         y="103.972656"
-         id="use12911" />
-      <use
-         xlink:href="#glyph8-4"
-         x="203.364152"
-         y="103.972656"
-         id="use12913" />
-      <use
-         xlink:href="#glyph8-5"
-         x="206.44651"
-         y="103.972656"
-         id="use12915" />
-      <use
-         xlink:href="#glyph8-6"
-         x="211.594319"
-         y="103.972656"
-         id="use12917" />
-      <use
-         xlink:href="#glyph8-0"
-         x="214.165964"
-         y="103.972656"
-         id="use12919" />
-    </g>
-    <path
-       style="fill:none;stroke-width:1.5;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(17.647059%,52.156863%,75.686275%);stroke-opacity:1;stroke-dasharray:6,4;stroke-miterlimit:10;"
-       d="M -0.000538229 0.000710462 L 46.808324 -31.376053 "
-       transform="matrix(0.70875,0,0,0.70875,190.289444,164.65184)"
-       id="path12923" />
-    <path
-       style="fill:none;stroke-width:1.5;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(17.647059%,52.156863%,75.686275%);stroke-opacity:1;stroke-dasharray:6,4;stroke-miterlimit:10;"
-       d="M 16.633059 -2.121203 L -0.000538229 0.000710462 L 8.283192 -14.582623 "
-       transform="matrix(0.70875,0,0,0.70875,190.289444,164.65184)"
-       id="path12925" />
-    <g
-       style="fill:rgb(22.745098%,32.156863%,41.960784%);fill-opacity:1;"
-       id="g12939">
-      <use
-         xlink:href="#glyph9-0"
-         x="211.207031"
-         y="153.585938"
-         id="use12927" />
-      <use
-         xlink:href="#glyph9-1"
-         x="215.932031"
-         y="153.585938"
-         id="use12929" />
-      <use
-         xlink:href="#glyph9-2"
-         x="219.078955"
-         y="153.585938"
-         id="use12931" />
-      <use
-         xlink:href="#glyph9-3"
-         x="224.334595"
-         y="153.585938"
-         id="use12933" />
-      <use
-         xlink:href="#glyph9-4"
-         x="229.590235"
-         y="153.585938"
-         id="use12935" />
-      <use
-         xlink:href="#glyph9-2"
-         x="232.215747"
-         y="153.585938"
-         id="use12937" />
-    </g>
-    <path
-       style="fill:none;stroke-width:1.5;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(17.647059%,52.156863%,75.686275%);stroke-opacity:1;stroke-dasharray:6,4;stroke-miterlimit:10;"
-       d="M 0.00133481 -0.00269114 L -1.205676 -113.390035 "
-       transform="matrix(0.70875,0,0,0.70875,224.319366,222.779251)"
-       id="path12941" />
-    <path
-       style="fill:none;stroke-width:1.5;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(17.647059%,52.156863%,75.686275%);stroke-opacity:1;stroke-dasharray:6,4;stroke-miterlimit:10;"
-       d="M 7.342605 -15.076545 L 0.00133481 -0.00269114 L -7.6596 -14.922224 "
-       transform="matrix(0.70875,0,0,0.70875,224.319366,222.779251)"
-       id="path12943" />
-    <path
-       style="fill:none;stroke-width:1.5;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(17.647059%,52.156863%,75.686275%);stroke-opacity:1;stroke-dasharray:6,4;stroke-miterlimit:10;"
-       d="M -0.000861166 0.00157163 L -48.705667 -31.309054 "
-       transform="matrix(0.70875,0,0,0.70875,257.984985,164.604355)"
-       id="path12945" />
-    <path
-       style="fill:none;stroke-width:1.5;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(17.647059%,52.156863%,75.686275%);stroke-opacity:1;stroke-dasharray:6,4;stroke-miterlimit:10;"
-       d="M -8.560164 -14.421929 L -0.000861166 0.00157163 L -16.673039 -1.800677 "
-       transform="matrix(0.70875,0,0,0.70875,257.984985,164.604355)"
-       id="path12947" />
-    <path
-       style="fill:none;stroke-width:1.5;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(17.647059%,52.156863%,75.686275%);stroke-opacity:1;stroke-dasharray:6,4;stroke-miterlimit:10;"
-       d="M 0.00178692 -0.00163622 L -0.147023 -33.020816 "
-       transform="matrix(0.70875,0,0,0.70875,160.756546,272.880066)"
-       id="path12949" />
-    <path
-       style="fill:none;stroke-width:1.5;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(17.647059%,52.156863%,75.686275%);stroke-opacity:1;stroke-dasharray:6,4;stroke-miterlimit:10;"
-       d="M 7.43124 -15.031398 L 0.00178692 -0.00163622 L -7.565453 -14.96526 "
-       transform="matrix(0.70875,0,0,0.70875,160.756546,272.880066)"
-       id="path12951" />
-    <g
-       style="fill:rgb(22.745098%,32.156863%,41.960784%);fill-opacity:1;"
-       id="g12967">
-      <use
-         xlink:href="#glyph9-2"
-         x="162.304688"
-         y="261.316406"
-         id="use12953" />
-      <use
-         xlink:href="#glyph9-5"
-         x="167.560327"
-         y="261.316406"
-         id="use12955" />
-      <use
-         xlink:href="#glyph9-2"
-         x="172.285327"
-         y="261.316406"
-         id="use12957" />
-      <use
-         xlink:href="#glyph9-0"
-         x="177.540967"
-         y="261.316406"
-         id="use12959" />
-      <use
-         xlink:href="#glyph9-6"
-         x="182.265967"
-         y="261.316406"
-         id="use12961" />
-      <use
-         xlink:href="#glyph9-4"
-         x="187.521607"
-         y="261.316406"
-         id="use12963" />
-      <use
-         xlink:href="#glyph9-2"
-         x="190.14712"
-         y="261.316406"
-         id="use12965" />
-    </g>
-    <path
-       style="fill:none;stroke-width:1.5;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(17.647059%,52.156863%,75.686275%);stroke-opacity:1;stroke-dasharray:6,4;stroke-miterlimit:10;"
-       d="M 0.00178692 0.000688933 L -0.00923601 -33.817653 "
-       transform="matrix(0.70875,0,0,0.70875,160.756546,320.706543)"
-       id="path12969" />
-    <path
-       style="fill:none;stroke-width:1.5;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(17.647059%,52.156863%,75.686275%);stroke-opacity:1;stroke-dasharray:6,4;stroke-miterlimit:10;"
-       d="M 7.497378 -15.001515 L 0.00178692 0.000688933 L -7.499315 -15.001515 "
-       transform="matrix(0.70875,0,0,0.70875,160.756546,320.706543)"
-       id="path12971" />
-    <g
-       style="fill:rgb(22.745098%,32.156863%,41.960784%);fill-opacity:1;"
-       id="g12979">
-      <use
-         xlink:href="#glyph9-6"
-         x="163.722656"
-         y="309.511719"
-         id="use12973" />
-      <use
-         xlink:href="#glyph9-7"
-         x="168.978296"
-         y="309.511719"
-         id="use12975" />
-      <use
-         xlink:href="#glyph9-2"
-         x="173.703296"
-         y="309.511719"
-         id="use12977" />
-    </g>
-    <path
-       style="fill:none;stroke-width:1.5;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(17.647059%,52.156863%,75.686275%);stroke-opacity:1;stroke-dasharray:6,4;stroke-miterlimit:10;"
-       d="M -0.0000215292 0.00247585 L 41.815454 -44.309693 "
-       transform="matrix(0.70875,0,0,0.70875,160.652359,222.830276)"
-       id="path12981" />
-    <path
-       style="fill:none;stroke-width:1.5;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(17.647059%,52.156863%,75.686275%);stroke-opacity:1;stroke-dasharray:6,4;stroke-miterlimit:10;"
-       d="M 15.751742 -5.762515 L -0.0000215292 0.00247585 L 4.839044 -16.05793 "
-       transform="matrix(0.70875,0,0,0.70875,160.652359,222.830276)"
-       id="path12983" />
-    <g
-       style="fill:rgb(22.745098%,32.156863%,41.960784%);fill-opacity:1;"
-       id="g12999">
-      <use
-         xlink:href="#glyph10-0"
-         x="183.566406"
-         y="207.449219"
-         id="use12985" />
-      <use
-         xlink:href="#glyph10-1"
-         x="188.672173"
-         y="207.449219"
-         id="use12987" />
-      <use
-         xlink:href="#glyph10-2"
-         x="193.77794"
-         y="207.449219"
-         id="use12989" />
-      <use
-         xlink:href="#glyph10-3"
-         x="198.883706"
-         y="207.449219"
-         id="use12991" />
-      <use
-         xlink:href="#glyph10-0"
-         x="203.989473"
-         y="207.449219"
-         id="use12993" />
-      <use
-         xlink:href="#glyph10-3"
-         x="209.09524"
-         y="207.449219"
-         id="use12995" />
-      <use
-         xlink:href="#glyph10-0"
-         x="214.201007"
-         y="207.449219"
-         id="use12997" />
-    </g>
-    <path
-       style="fill-rule:evenodd;fill:rgb(88.235294%,60.392157%,0%);fill-opacity:1;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(88.235294%,60.392157%,0%);stroke-opacity:1;stroke-miterlimit:10;"
-       d="M 292.748934 29.001291 L 292.748936 -0.000031721 L 0.00200221 -0.0000538227 L 0.00200002 29.001269 Z M 292.748934 29.001291 "
-       transform="matrix(0.0000000535091,0.70875,-0.70875,0.0000000535091,124.070274,61.838425)"
-       id="path13001" />
-    <g
-       style="fill:rgb(100%,100%,100%);fill-opacity:1;"
-       id="g13011">
-      <use
-         xlink:href="#glyph2-8"
-         x="109.90625"
-         y="153.140625"
-         id="use13003" />
-      <use
-         xlink:href="#glyph2-2"
-         x="109.906251"
-         y="160.647561"
-         id="use13005" />
-      <use
-         xlink:href="#glyph2-9"
-         x="109.906251"
-         y="166.428764"
-         id="use13007" />
-      <use
-         xlink:href="#glyph2-10"
-         x="109.906251"
-         y="172.209967"
-         id="use13009" />
-    </g>
-    <path
-       style="fill-rule:evenodd;fill:rgb(88.235294%,60.392157%,0%);fill-opacity:1;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(88.235294%,60.392157%,0%);stroke-opacity:1;stroke-miterlimit:10;"
-       d="M 75.35743 28.946224 L 75.357432 0.0000164539 L -0.000818108 0.0000107645 L -0.000820293 28.946218 Z M 75.35743 28.946224 "
-       transform="matrix(0.0000000535091,0.70875,-0.70875,0.0000000535091,124.050789,299.832611)"
-       id="path13013" />
-    <g
-       style="fill:rgb(100%,100%,100%);fill-opacity:1;"
-       id="g13023">
-      <use
-         xlink:href="#glyph2-8"
-         x="109.976563"
-         y="314.09375"
-         id="use13015" />
-      <use
-         xlink:href="#glyph2-2"
-         x="109.976563"
-         y="321.600686"
-         id="use13017" />
-      <use
-         xlink:href="#glyph2-9"
-         x="109.976564"
-         y="327.381889"
-         id="use13019" />
-      <use
-         xlink:href="#glyph2-10"
-         x="109.976564"
-         y="333.163092"
-         id="use13021" />
-    </g>
-    <path
-       style="fill-rule:evenodd;fill:rgb(93.333333%,48.627451%,19.215686%);fill-opacity:1;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(93.333333%,48.627451%,19.215686%);stroke-opacity:1;stroke-miterlimit:10;"
-       d="M 42.157961 37.906491 L 42.157961 -0.00135634 L 0.00077505 -0.00135634 L 0.00077505 37.906491 Z M 42.157961 37.906491 "
-       transform="matrix(0.70875,0,0,0.70875,291.866638,222.719711)"
-       id="path13025" />
-    <g
-       style="fill:rgb(100%,100%,100%);fill-opacity:1;"
-       id="g13035">
-      <use
-         xlink:href="#glyph11-0"
-         x="294.839844"
-         y="239.34375"
-         id="use13027" />
-      <use
-         xlink:href="#glyph11-1"
-         x="301.492009"
-         y="239.34375"
-         id="use13029" />
-      <use
-         xlink:href="#glyph11-2"
-         x="307.118689"
-         y="239.34375"
-         id="use13031" />
-      <use
-         xlink:href="#glyph11-3"
-         x="312.745368"
-         y="239.34375"
-         id="use13033" />
-    </g>
-    <path
-       style="fill-rule:evenodd;fill:rgb(93.333333%,48.627451%,19.215686%);fill-opacity:1;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(93.333333%,48.627451%,19.215686%);stroke-opacity:1;stroke-miterlimit:10;"
-       d="M 41.55536 37.906491 L 41.55536 -0.00135634 L -0.00107646 -0.00135634 L -0.00107646 37.906491 Z M 41.55536 37.906491 "
-       transform="matrix(0.70875,0,0,0.70875,257.985138,222.719711)"
-       id="path13037" />
-    <g
-       style="fill:rgb(100%,100%,100%);fill-opacity:1;"
-       id="g13045">
-      <use
-         xlink:href="#glyph0-5"
-         x="265.074219"
-         y="239.34375"
-         id="use13039" />
-      <use
-         xlink:href="#glyph0-1"
-         x="271.377295"
-         y="239.34375"
-         id="use13041" />
-      <use
-         xlink:href="#glyph0-6"
-         x="276.632935"
-         y="239.34375"
-         id="use13043" />
-    </g>
-    <path
-       style="fill:none;stroke-width:1.5;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(17.647059%,52.156863%,75.686275%);stroke-opacity:1;stroke-dasharray:6,4;stroke-miterlimit:10;"
-       d="M 0.00189457 -0.00137787 L -164.399558 -44.153714 "
-       transform="matrix(0.70875,0,0,0.70875,306.807251,222.719727)"
-       id="path13047" />
-    <path
-       style="fill:none;stroke-width:1.5;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(17.647059%,52.156863%,75.686275%);stroke-opacity:1;stroke-dasharray:6,4;stroke-miterlimit:10;"
-       d="M -12.542197 -11.134535 L 0.00189457 -0.00137787 L -16.43329 3.349592 "
-       transform="matrix(0.70875,0,0,0.70875,306.807251,222.719727)"
-       id="path13049" />
-    <path
-       style="fill:none;stroke-width:1.5;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(17.647059%,52.156863%,75.686275%);stroke-opacity:1;stroke-dasharray:6,4;stroke-miterlimit:10;"
-       d="M 0.000344466 -0.00137787 L -116.291541 -44.153714 "
-       transform="matrix(0.70875,0,0,0.70875,272.710693,222.719727)"
-       id="path13051" />
-    <path
-       style="fill:none;stroke-width:1.5;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(17.647059%,52.156863%,75.686275%);stroke-opacity:1;stroke-dasharray:6,4;stroke-miterlimit:10;"
-       d="M -11.358782 -12.336034 L 0.000344466 -0.00137787 L -16.682856 1.68513 "
-       transform="matrix(0.70875,0,0,0.70875,272.710693,222.719727)"
-       id="path13053" />
-    <path
-       style="fill:none;stroke-width:1.5;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(17.647059%,52.156863%,75.686275%);stroke-opacity:1;stroke-dasharray:6,4;stroke-miterlimit:10;"
-       d="M 0.00133481 -0.00269114 L -48.014538 -44.237699 "
-       transform="matrix(0.70875,0,0,0.70875,224.319366,222.779251)"
-       id="path13055" />
-    <path
-       style="fill:none;stroke-width:1.5;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(17.647059%,52.156863%,75.686275%);stroke-opacity:1;stroke-dasharray:6,4;stroke-miterlimit:10;"
-       d="M -5.951046 -15.677294 L 0.00133481 -0.00269114 L -16.114185 -4.648855 "
-       transform="matrix(0.70875,0,0,0.70875,224.319366,222.779251)"
-       id="path13057" />
-    <path
-       style="fill:none;stroke-width:1.5;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(17.647059%,52.156863%,75.686275%);stroke-opacity:1;stroke-dasharray:6,4;stroke-miterlimit:10;"
-       d="M -0.000861166 0.00144245 L 68.886925 44.087641 "
-       transform="matrix(0.70875,0,0,0.70875,257.984985,191.471634)"
-       id="path13059" />
-    <path
-       style="fill:none;stroke-width:1.5;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(17.647059%,52.156863%,75.686275%);stroke-opacity:1;stroke-dasharray:6,4;stroke-miterlimit:10;"
-       d="M 8.591511 14.402897 L -0.000861166 0.00144245 L 16.676828 1.770622 "
-       transform="matrix(0.70875,0,0,0.70875,257.984985,191.471634)"
-       id="path13061" />
-    <g
-       style="fill:rgb(22.745098%,32.156863%,41.960784%);fill-opacity:1;"
-       id="g13069">
-      <use
-         xlink:href="#glyph9-6"
-         x="262.945313"
-         y="207.449219"
-         id="use13063" />
-      <use
-         xlink:href="#glyph9-7"
-         x="268.200952"
-         y="207.449219"
-         id="use13065" />
-      <use
-         xlink:href="#glyph9-2"
-         x="272.925952"
-         y="207.449219"
-         id="use13067" />
-    </g>
-    <path
-       style="fill:none;stroke-width:1.5;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(17.647059%,52.156863%,75.686275%);stroke-opacity:1;stroke-dasharray:6,4;stroke-miterlimit:10;"
-       d="M -0.000861166 0.00144245 L 20.777357 44.087641 "
-       transform="matrix(0.70875,0,0,0.70875,257.984985,191.471634)"
-       id="path13071" />
-    <path
-       style="fill:none;stroke-width:1.5;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(17.647059%,52.156863%,75.686275%);stroke-opacity:1;stroke-dasharray:6,4;stroke-miterlimit:10;"
-       d="M -0.392175 16.767315 L -0.000861166 0.00144245 L 13.177049 10.374017 "
-       transform="matrix(0.70875,0,0,0.70875,257.984985,191.471634)"
-       id="path13073" />
-    <text
-       xml:space="preserve"
-       style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:30px;line-height:1.25;font-family:'Linux Biolinum';-inkscape-font-specification:'Linux Biolinum';letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.75"
-       x="135.0209"
-       y="239.34195"
-       id="text13084"><tspan
-         sodipodi:role="line"
-         id="tspan13082"
-         x="135.0209"
-         y="239.34195"
-         style="font-style:normal;font-variant:normal;font-weight:bold;font-stretch:normal;font-size:9.5px;font-family:Arial;-inkscape-font-specification:'Arial Bold';fill:#ffffff;fill-opacity:1;stroke-width:0.75">TaskKernel</tspan></text>
-    <text
-       xml:space="preserve"
-       style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:30px;line-height:1.25;font-family:'Linux Biolinum';-inkscape-font-specification:'Linux Biolinum';letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.75"
-       x="175.44191"
-       y="180.65388"
-       id="text13084-7"><tspan
-         sodipodi:role="line"
-         id="tspan13082-2"
-         x="175.44191"
-         y="180.65388"
-         style="font-style:normal;font-variant:normal;font-weight:bold;font-stretch:normal;font-size:9.5px;font-family:Arial;-inkscape-font-specification:'Arial Bold';fill:#ffffff;fill-opacity:1;stroke-width:0.75">Queue</tspan></text>
-  </g>
-</svg>
diff --git a/thirdParty/cupla/alpaka/doc/markdown/user/implementation/mapping/Accelerators.md b/thirdParty/cupla/alpaka/doc/markdown/user/implementation/mapping/Accelerators.md
deleted file mode 100644
index 452bc6ac15..0000000000
--- a/thirdParty/cupla/alpaka/doc/markdown/user/implementation/mapping/Accelerators.md
+++ /dev/null
@@ -1,76 +0,0 @@
-[:arrow_up: Up](../Mapping.md)
-
-Accelerator Implementations
-===========================
-
-|alpaka|Serial|std::thread|Boost.Fiber|OpenMP 2.0|OpenMP 4.0|CUDA 8.0|
-|---|---|---|---|---|---|---|
-|Devices|Host Core|Host Cores|Host Core|Host Cores|Host Cores|NVIDIA GPUs|
-|Lib/API|n/a|std::thread|boost::fibers::fiber|OpenMP 2.0|OpenMP 4.0|CUDA 8.0|
-|Kernel execution|n/a|std::thread(kernel)|boost::fibers::fiber(kernel)|omp_set_dynamic(0), #pragma omp parallel num_threads(iNumKernelsInBlock)|#pragma omp target, #pragma omp teams num_teams(...) thread_limit(...), #pragma omp distribute, #pragma omp parallel num_threads(...)|cudaConfigureCall, cudaSetupArgument, cudaLaunch|
-|Execution strategy grid-blocks|sequential|sequential|sequential|sequential|undefined|undefined|
-|Execution strategy block-kernels|sequential|preemptive multitasking|cooperative multithreading|preemptive multitasking|preemptive multitasking|lock-step within warps|
-|getIdx|n/a|*block-kernel*: mapping of std::this_thread::get_id() *grid-block*: member variable|*block-kernel*: mapping of std::this_fiber::get_id() *grid-block*: member variable|*block-kernel*: omp_get_thread_num() to 3D index mapping *grid-block*: member variable|*block-kernel*: omp_get_thread_num() to 3D index mapping *grid-block*: member variable|threadIdx, blockIdx|
-|getExtent|member variables|member variables|member variables|member variables|member variables|gridDim, blockDim|
-|getBlockSharedExternMem|allocated in memory prior to kernel execution|allocated in memory prior to kernel execution|allocated in memory prior to kernel execution|allocated in memory prior to kernel execution|allocated in memory prior to kernel execution|\__shared__|
-|allocBlockSharedMem|master thread allocates|syncBlockKernels -> master thread allocates -> syncBlockKernels|syncBlockKernels -> master thread allocates -> syncBlockKernels|syncBlockKernels -> master thread allocates -> syncBlockKernels|syncBlockKernels -> master thread allocates -> syncBlockKernels|\__shared__|
-|syncBlockKernels|n/a|barrier|barrier|#pragma omp barrier|#pragma omp barrier|__syncthreads|
-|atomicOp|n/a|std::lock_guard< std::mutex >|n/a|#pragma omp critical|#pragma omp critical|atomicXXX|
-|ALPAKA_FN_HOST_ACC, ALPAKA_FN_ACC, ALPAKA_FN_HOST|inline|inline|inline|inline|inline|\__device__, \__host__, \__forceinline__|
-
-
-### Serial
-
-The serial accelerator only allows blocks with exactly one thread.
-Therefore it does not implement real synchronization or atomic primitives.
-
-### Threads
-
-#### Execution
-
-To prevent recreation of the threads between execution of different blocks in the grid, the threads are stored inside a thread pool.
-This thread pool is local to the invocation because making it local to the KernelExecutor could mean a heavy memory usage and lots of idling kernel-threads when there are multiple KernelExecutors around.
-Because the default policy of the threads in the pool is to yield instead of waiting, this would also slow down the system immensely.
-
-### Fibers
-
-#### Execution
-
-To prevent recreation of the fibers between execution of different blocks in the grid, the fibers are stored inside a fiber pool.
-This fiber pool is local to the invocation because making it local to the KernelExecutor could mean a heavy memory usage when there are multiple KernelExecutors around.
-
-### OpenMP
-
-#### Execution
-
-Parallel execution of the kernels in a block is required because when syncBlockThreads is called all of them have to be done with their work up to this line.
-So we have to spawn one real thread per kernel in a block.
-`omp for` is not useful because it is meant for cases where multiple iterations are executed by one thread but in our case a 1:1 mapping is required.
-Therefore we use `omp parallel` with the specified number of threads in a block.
-Another reason for not using `omp for` like `#pragma omp parallel for collapse(3) num_threads(blockDim.x*blockDim.y*blockDim.z)` is that `#pragma omp barrier` used for intra block synchronization is not allowed inside `omp for` blocks.
-
-Because OpenMP is designed for a 1:1 abstraction of hardware to software threads, the block size is restricted by the number of OpenMP threads allowed by the runtime. 
-This could be as little as 2 or 4 kernels but on a system with 4 cores and hyper-threading OpenMP can also allow 64 threads.
-
-#### Index
-
-OpenMP only provides a linear thread index. This index is converted to a 3 dimensional index at runtime.
-
-#### Atomic
-
-We cannot use `#pragma omp atomic` because braces or calling other functions directly after `#pragma omp atomic` are not allowed.
-Because we are implementing the CUDA atomic operations which return the old value, this requires `#pragma omp critical` to be used.
-`omp_set_lock` is an alternative but is usually slower.
-
-### CUDA
-
-Nearly all CUDA functionality can be directly mapped to alpaka function calls.
-A major difference is that CUDA requires the block and grid sizes to be given in (x, y, z) order.
-Alpaka uses the mathematical C/C++ array indexing scheme [z][y][x].
-Dimension 0 in this case is z, dimension 2 is x.
-
-Furthermore alpaka does not require the indices and extents to be 3-dimensional.
-The accelerators are templatized on and support arbitrary dimensionality.
-NOTE: Currently the CUDA implementation is restricted to a maximum of 3 dimensions!
-
-NOTE: The CUDA-accelerator back-end can change the current CUDA device and will NOT set the device back to the one prior to the invocation of the alpaka function!
diff --git a/thirdParty/cupla/alpaka/doc/markdown/user/implementation/mapping/CUDA.md b/thirdParty/cupla/alpaka/doc/markdown/user/implementation/mapping/CUDA.md
deleted file mode 100644
index 964bc1d8db..0000000000
--- a/thirdParty/cupla/alpaka/doc/markdown/user/implementation/mapping/CUDA.md
+++ /dev/null
@@ -1,243 +0,0 @@
-[:arrow_up: Up](../Mapping.md)
-
-CUDA GPUs
-=========
-
-Mapping the abstraction to GPUs supporting *CUDA* is straightforward because the hierarchy levels are identical up to the element level.
-So blocks of warps of threads will be mapped directly to their *CUDA* equivalent.
-
-The element level is supported through an additional run-time variable containing the extent of elements per thread.
-This variable can be accessed by all threads and should optimally be placed in constant device memory for fast access.
-
-Porting CUDA to *alpaka*
-------------------------
-
-Nearly all CUDA functionality can be directly mapped to alpaka function calls.
-A major difference is that CUDA requires the block and grid sizes to be given in (x, y, z) order. Alpaka uses the mathematical C/C++ array indexing scheme [z][y][x]. In both cases x is the innermost / fast running index.
-
-Furthermore alpaka does not require the indices and extents to be 3-dimensional.
-The accelerators are templatized on and support arbitrary dimensionality.
-NOTE: Currently the CUDA implementation is restricted to a maximum of 3 dimensions!
-
-NOTE: You have to be careful when mixing alpaka and non alpaka CUDA code. The CUDA-accelerator back-end can change the current CUDA device and will NOT set the device back to the one prior to the invocation of the alpaka function.
-
-
-### Programming Interface
-
-*Function Attributes*
-
-|CUDA|alpaka|
-|---|---|
-|\_\_host\_\_|ALPAKA_FN_HOST|
-|\_\_device\_\_|ALPAKA_FN_ACC*|
-|\_\_global\_\_|ALPAKA_FN_ACC*|
-|\_\_host\_\_ \_\_device\_\_|ALPAKA_FN_HOST_ACC|
-
-\* You can not call CUDA only methods except when ALPAKA_ACC_GPU_CUDA_ONLY_MODE is enabled.
-
-*Memory*
-
-|CUDA|alpaka|
-|---|---|
-|\_\_shared\_\_|[alpaka::block::shared::st::allocVar<std::uint32_t, \_\_COUNTER\_\_>(acc)](../../../../../test/unit/block/shared/src/BlockSharedMemSt.cpp#L69)|
-|\_\_constant\_\_|[ALPAKA_STATIC_ACC_MEM_CONSTANT](../../../../../test/unit/mem/view/src/ViewStaticAccMem.cpp#L58-L63)|
-|\_\_device\_\_|[ALPAKA_STATIC_ACC_MEM_GLOBAL](../../../../../test/unit/mem/view/src/ViewStaticAccMem.cpp#L164-L169)|
-
-*Index / Work Division*
-
-|CUDA|alpaka|
-|---|---|
-|threadIdx|alpaka::idx::getIdx<alpaka::Block, alpaka::Threads>(acc)|
-|blockIdx|alpaka::idx::getIdx<alpaka::Grid, alpaka::Blocks>(acc)|
-|blockDim|alpaka::workdiv::getWorkDiv<alpaka::Block, alpaka::Threads>(acc)|
-|gridDim|alpaka::workdiv::getWorkDiv<alpaka::Grid, alpaka::Blocks>(acc)|
-
-*Types*
-
-|CUDA|alpaka|
-|---|---|
-|dim3|[alpaka::vec::Vec< TDim, TVal >](../../../../../test/unit/vec/src/VecTest.cpp#L43-L45)|
-
-
-### CUDA Runtime API
-
-The following tables list the functions available in the [CUDA Runtime API](http://docs.nvidia.com/cuda/cuda-runtime-api/modules.html#modules) and their equivalent alpaka functions:
-
-*Device Management*
-
-|CUDA|alpaka|
-|---|---|
-|cudaChooseDevice|-|
-|cudaDeviceGetAttribute|-|
-|cudaDeviceGetByPCIBusId|-|
-|cudaDeviceGetCacheConfig|-|
-|cudaDeviceGetLimit|-|
-|cudaDeviceGetP2PAttribute|-|
-|cudaDeviceGetPCIBusId|-|
-|cudaDeviceGetSharedMemConfig|-|
-|cudaDeviceGetQueuePriorityRange|-|
-|cudaDeviceReset|alpaka::dev::reset(device)|
-|cudaDeviceSetCacheConfig|-|
-|cudaDeviceSetLimit|-|
-|cudaDeviceSetSharedMemConfig|-|
-|cudaDeviceSynchronize|void alpaka::wait::wait(device)|
-|cudaGetDevice|n/a (no current device)|
-|cudaGetDeviceCount|std::size_t alpaka::pltf::getDevCount< TPltf >()|
-|cudaGetDeviceFlags|-|
-|cudaGetDeviceProperties|alpaka::acc::getAccDevProps(dev) *NOTE: Only some properties available*|
-|cudaIpcCloseMemHandle|-|
-|cudaIpcGetEventHandle|-|
-|cudaIpcGetMemHandle|-|
-|cudaIpcOpenEventHandle|-|
-|cudaIpcOpenMemHandle|-|
-|cudaSetDevice|n/a (no current device)|
-|cudaSetDeviceFlags|-|
-|cudaSetValidDevices|-|
-
-*Error Handling*
-
-|CUDA|alpaka|
-|---|---|
-|cudaGetErrorName|n/a (handled internally, available in exception message)|
-|cudaGetErrorString|n/a (handled internally, available in exception message)|
-|cudaGetLastError|n/a (handled internally)|
-|cudaPeekAtLastError|n/a (handled internally)|
-
-*Queue Management*
-
-|CUDA|alpaka|
-|---|---|
-|cudaStreamAddCallback|alpaka::queue::enqueue(queue, \[\](){do_something();})|
-|cudaStreamAttachMemAsync|-|
-|cudaStreamCreate|<ul><li>queue = alpaka::queue::QueueCudaRtNonBlocking(device);</li><li>queue = alpaka::queue::QueueCudaRtBlocking(device);</li></ul>|
-|cudaStreamCreateWithFlags|see cudaStreamCreate (cudaStreamNonBlocking hard coded)|
-|cudaStreamCreateWithPriority|-|
-|cudaStreamDestroy|n/a (Destructor)|
-|cudaStreamGetFlags|-|
-|cudaStreamGetPriority|-|
-|cudaStreamQuery|bool alpaka::queue::empty(queue)|
-|cudaStreamSynchronize|void alpaka::wait::wait(queue)|
-|cudaStreamWaitEvent|void alpaka::wait::wait(queue, event)|
-
-*Event Management*
-
-|CUDA|alpaka|
-|---|---|
-|cudaEventCreate|alpaka::event::Event< TQueue > event(dev);|
-|cudaEventCreateWithFlags|-|
-|cudaEventDestroy|n/a (Destructor)|
-|cudaEventElapsedTime|-|
-|cudaEventQuery|bool alpaka::event::test(event)|
-|cudaEventRecord|void alpaka::queue::enqueue(queue, event)|
-|cudaEventSynchronize|void alpaka::wait::wait(event)|
-
-*Memory Management*
-
-|CUDA|alpaka|
-|---|---|
-|cudaArrayGetInfo|-|
-|cudaFree|n/a (automatic memory management with reference counted memory handles)|
-|cudaFreeArray|-|
-|cudaFreeHost|n/a|
-|cudaFreeMipmappedArray|-|
-|cudaGetMipmappedArrayLevel|-|
-|cudaGetSymbolAddress|-|
-|cudaGetSymbolSize|-|
-|cudaHostAlloc|n/a|
-|cudaHostGetDevicePointer|-|
-|cudaHostGetFlags|-|
-|cudaHostRegister|-|
-|cudaHostUnregister|-|
-|cudaMalloc|alpaka::mem::buf::alloc<TElement>(device, extents1D)|
-|cudaMalloc3D|alpaka::mem::buf::alloc<TElement>(device, extents3D)|
-|cudaMalloc3DArray|-|
-|cudaMallocArray|-|
-|cudaMallocHost|alpaka::mem::buf::alloc<TElement>(device, extents) *1D, 2D, 3D supported!*|
-|cudaMallocManaged|-|
-|cudaMallocMipmappedArray|-|
-|cudaMallocPitch|alpaka::mem::alloc<TElement>(device, extents2D)|
-|cudaMemAdvise|-|
-|cudaMemGetInfo|<ul><li>alpaka::dev::getMemBytes</li><li>alpaka::dev::getFreeMemBytes</li></ul>|
-|cudaMemPrefetchAsync|-|
-|cudaMemRangeGetAttribute|-|
-|cudaMemRangeGetAttributes|-|
-|cudaMemcpy|alpaka::mem::view::copy(memBufDst, memBufSrc, extents1D)|
-|cudaMemcpy2D|alpaka::mem::view::copy(memBufDst, memBufSrc, extents2D)|
-|cudaMemcpy2DArrayToArray|-|
-|cudaMemcpy2DAsync|alpaka::mem::view::copy(memBufDst, memBufSrc, extents2D, queue)|
-|cudaMemcpy2DFromArray|-|
-|cudaMemcpy2DFromArrayAsync|-|
-|cudaMemcpy2DToArray|-|
-|cudaMemcpy2DToArrayAsync|-|
-|cudaMemcpy3D|alpaka::mem::view::copy(memBufDst, memBufSrc, extents3D)|
-|cudaMemcpy3DAsync|alpaka::mem::view::copy(memBufDst, memBufSrc, extents3D, queue)|
-|cudaMemcpy3DPeer|alpaka::mem::view::copy(memBufDst, memBufSrc, extents3D)|
-|cudaMemcpy3DPeerAsync|alpaka::mem::view::copy(memBufDst, memBufSrc, extents3D, queue)|
-|cudaMemcpyArrayToArray|-|
-|cudaMemcpyAsync|alpaka::mem::view::copy(memBufDst, memBufSrc, extents1D, queue)|
-|cudaMemcpyFromArray|-|
-|cudaMemcpyFromArrayAsync|-|
-|cudaMemcpyFromSymbol|-|
-|cudaMemcpyFromSymbolAsync|-|
-|cudaMemcpyPeer|alpaka::mem::view::copy(memBufDst, memBufSrc, extents1D)|
-|cudaMemcpyPeerAsync|alpaka::mem::view::copy(memBufDst, memBufSrc, extents1D, queue)|
-|cudaMemcpyToArray|-|
-|cudaMemcpyToArrayAsync|-|
-|cudaMemcpyToSymbol|-|
-|cudaMemcpyToSymbolAsync|-|
-|cudaMemset|alpaka::mem::view::set(memBufDst, byte, extents1D)|
-|cudaMemset2D|alpaka::mem::view::set(memBufDst, byte, extents2D)|
-|cudaMemset2DAsync|alpaka::mem::view::set(memBufDst, byte, extents2D, queue)|
-|cudaMemset3D|alpaka::mem::view::set(memBufDst, byte, extents3D)|
-|cudaMemset3DAsync|alpaka::mem::view::set(memBufDst, byte, extents3D, queue)|
-|cudaMemsetAsync|alpaka::mem::view::set(memBufDst, byte, extents1D, queue)|
-|make_cudaExtent|-|
-|make_cudaPitchedPtr|-|
-|make_cudaPos|-|
-|cudaMemcpyHostToDevice|n/a (direction of copy is determined automatically)|
-|cudaMemcpyDeviceToHost|n/a (direction of copy is determined automatically)|
-
-*Execution Control*
-
-|CUDA|alpaka|
-|---|---|
-|cudaFuncGetAttributes|-|
-|cudaFuncSetCacheConfig|-|
-|cudaFuncSetSharedMemConfig|-|
-|cudaLaunchKernel|<ul><li>alpaka::kernel::exec< TAcc >(queue, workDiv, kernel, params...)</li><li>alpaka::kernel::BlockSharedExternMemSizeBytes< TKernel< TAcc > >::getBlockSharedExternMemSizeBytes<...>(...)</li></ul>|
-|cudaSetDoubleForDevice|n/a (alpaka assumes double support)|
-|cudaSetDoubleForHost|n/a (alpaka assumes double support)|
-
-*Occupancy*
-
-|CUDA|alpaka|
-|---|---|
-|cudaOccupancyMaxActiveBlocksPerMultiprocessor|-|
-|cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags|-|
-
-
-*Unified Addressing*
-
-|CUDA|alpaka|
-|---|---|
-|cudaPointerGetAttributes|-|
-
-*Peer Device Memory Access*
-
-|CUDA|alpaka|
-|---|---|
-|cudaDeviceCanAccessPeer|-|
-|cudaDeviceDisablePeerAccess|-|
-|cudaDeviceEnablePeerAccess|automatically done when required|
-
-**OpenGL, Direct3D, VDPAU, EGL, Graphics Interoperability**
-
-*not available*
-
-**Texture/Surface Reference/Object Management**
-
-*not available*
-
-**Version Management**
-
-*not available*
diff --git a/thirdParty/cupla/alpaka/doc/markdown/user/implementation/mapping/HIP.md b/thirdParty/cupla/alpaka/doc/markdown/user/implementation/mapping/HIP.md
deleted file mode 100644
index 34cf4bb275..0000000000
--- a/thirdParty/cupla/alpaka/doc/markdown/user/implementation/mapping/HIP.md
+++ /dev/null
@@ -1,97 +0,0 @@
-## Current restrictions on HCC platform
-
-- Workaround for unsupported `syncthreads_{count|and|or}`.
-  - uses temporary shared value and atomics
-- Workaround for buggy `hipStreamQuery`, `hipStreamSynchronize`.
-  - introduces own queue management
-  - `hipStreamQuery` and `hipStreamSynchronize` did not work in multithreaded environment
-- Workaround for missing `cuStreamWaitValue32`.
-  - polls value each 10ms
-- device constant memory not supported yet
-- note, that `printf` in kernels is still not supported in HIP
-- 3D memory is currently disabled
-  - missing `hipMemcpy3DAsync` is replaced with `hipMemcpy3D` though
-  - exclude `hipMalloc3D` and `hipMallocPitch` when size is zero
-    - otherwise they throw an Unknown Error
-  - `TestAccs` excludes 3D specialization of Hip back-end for now
-  - ... because `verifyBytesSet` fails in `memView` for 3D specialization
-- `dim3` structure is not available on device (use `alpaka::vec::Vec` instead)
-- Constructors' attributes unified with destructors'.
-  - host/device signature must match in HIP(HCC)
-- a chain of functions must also provide correct host-device signatures
-  - e.g. a host function cannot be called from a host-device function
-- recompile your target when HCC linker returned the error:
-"File format not recognized
-clang-7: error: linker command failed with exit code 1"
-- if compile-error occurred, the linker still may link, but without the device code
-- AMD device architecture currently hardcoded in `alpakaConfig.cmake`
-
-## Compiling HIP from source
-
-Follow [this](https://github.com/ROCm-Developer-Tools/HIP/blob/master/INSTALL.md "HIP installation") guide for installing HIP.
-HIP requires either `nvcc` or `hcc` to be installed on your system (see guide for further details).
-
-- If you want the hip binaries to be located in a directory that does not require superuser access, be sure to change the install directory of HIP by modifying the `CMAKE_INSTALL_PREFIX` cmake variable.
-- Also, after the installation is complete, add the following line to the `.profile` file in your home directory, in order to add the path to the HIP binaries to PATH:
-`PATH=$PATH:<path_to_binaries>`
-
-```bash
-git clone --recursive https://github.com/ROCm-Developer-Tools/HIP.git
-cd "HIP"
-mkdir -p build
-cd build
-cmake -DCMAKE_BUILD_TYPE="${CMAKE_BUILD_TYPE}" -DCMAKE_INSTALL_PREFIX=${YOUR_HIP_INSTALL_DIR} -DBUILD_TESTING=OFF ..
-make
-make install
-```
-Set the appropriate paths (edit `${YOUR_**}` variables).
-```bash
-# HIP_PATH required by HIP tools
-export HIP_PATH=${YOUR_HIP_INSTALL_DIR}
-# Paths required by HIP tools
-export CUDA_PATH=${YOUR_CUDA_ROOT}
-# - if required, path to HCC compiler. Default /opt/rocm/hcc.
-export HCC_HOME=${YOUR_HCC_ROOT}
-# - if required, path to HSA include, lib. Default /opt/rocm/hsa.
-export HSA_PATH=${YOUR_HSA_PATH}
-# HIP binaries and libraries
-export PATH=${HIP_PATH}/bin:$PATH
-export LD_LIBRARY_PATH=${HIP_PATH}/lib64:${LD_LIBRARY_PATH}
-```
-Test the HIP binaries.
-```bash
-# calls nvcc or hcc
-which hipcc
-hipcc -V
-which hipconfig
-hipconfig -v
-```
-
-
-## Verifying HIP installation
-- If PATH points to the location of the HIP binaries, the following command should list several relevant environment variables, and also the selected compiler on your system: `hipconfig -f`
-- Compile and run the [square sample](https://github.com/ROCm-Developer-Tools/HIP/tree/master/samples/0_Intro/square), as pointed out in the [original](https://github.com/ROCm-Developer-Tools/HIP/blob/master/INSTALL.md#verify-your-installation) HIP install guide.
-
-## Compiling examples with HIP back-end
-As of now, the back-end has only been tested on the NVIDIA platform.
-### NVIDIA Platform
-* One issue in this branch of alpaka is that the host compiler flags don't propagate to the device compiler, as they do in CUDA. This is because a counterpart to the CUDA_PROPAGATE_HOST_FLAGS cmake variable has not been defined in the FindHIP.cmake file.
-Alpaka forwards the host compiler flags in cmake to the `HIP_NVCC_FLAGS` cmake variable, which also takes user-given flags. To add flags to this variable, toggle the advanced mode in `ccmake`.
-
-
-## Random Number Generator Library rocRAND for HIP back-end
-
-rocRAND provides an interface for HIP, where the cuRAND or rocRAND API is called depending on the chosen HIP platform (can be configured with cmake in alpaka).
-
-Clone the rocRAND repository, then build and install it:
-```bash
-git clone https://github.com/ROCmSoftwarePlatform/rocRAND
-cd rocRAND
-mkdir -p build
-cd build
-cmake -DCMAKE_INSTALL_PREFIX=${HIP_PATH} -DBUILD_BENCHMARK=OFF -DBUILD_TEST=OFF -DCMAKE_MODULE_PATH=${HIP_PATH}/cmake ..
-make
-```
-
-The `CMAKE_MODULE_PATH` is a cmake variable for locating module finding scripts like *FindHIP.cmake*.
-The paths to the `rocRAND` library and include directories should be appended to the `CMAKE_PREFIX_PATH` variable.
diff --git a/thirdParty/cupla/alpaka/doc/markdown/user/implementation/mapping/x86.md b/thirdParty/cupla/alpaka/doc/markdown/user/implementation/mapping/x86.md
deleted file mode 100644
index 0fca42c3f6..0000000000
--- a/thirdParty/cupla/alpaka/doc/markdown/user/implementation/mapping/x86.md
+++ /dev/null
@@ -1,97 +0,0 @@
-[:arrow_up: Up](../Mapping.md)
-
-x86 CPUs
-========
-
-There are multiple possible ways to map the *alpaka* abstraction to x86 CPUs.
-The following figure shows the compute and memory hierarchy of a dual-socket (package) node with dual-core CPUs and symmetric multithreading (Hyper-Threading).
-Through symmetric multithreading (Hyper-Threading) each core represents two processing units.
-![x86_cpu](x86/x86_cpu.png)
-
-Thread
-------
-
-Mapping the thread level directly to the processing units is the most trivial part of the assignment of hierarchy levels to hardware units.
-However, the block and warp levels could be mapped to hardware components in different ways with varying advantages and disadvantages.
-
-Warp
-----
-
-Even though a warp seems to be identical to a vector register, because both execute a single uniform instruction on multiple data elements, they are not the same.
-[Warps](../../Abstraction.md) can handle branches with divergent control flows of multiple threads.
-There is no equivalent hardware unit in a CPU supporting this.
-Therefore, the warp level can not be utilized on CPUs leading to a one-to-one mapping of threads to warps which does not violate the rules of the abstraction.
-
-Block
------
-
-### One Block Per Node
-
-By combining all processing units (possibly Hyper-Threads) of all processors on a node into one block, the number of synchronizing and communicating threads can be enlarged.
-This high possible thread count would simplify the implementation of some types of algorithms but introduces performance issues on multi-core nodes.
-The shared memory between all cores on a node is the RAM.
-However, the RAM and the communication between the sockets is far too slow for fine-grained communication in the style of *CUDA* threads.
-
-### One Block Per Socket
-
-If each processor on each socket would concurrently execute one block, the L3 cache would be used as the fast shared memory.
-Although this is much better than using the RAM, there is still a problem.
-Regions of the global memory and especially from the shared memory that are accessed are automatically cached in the L1 and / or L2 caches of each core.
-Not only the elements which are directly accessed will be cached but always the whole cache line they lie in.
-Cache lines typically have a size of 64 Bytes on modern x86 architectures.
-This leads to, for example, eight double precision floating point numbers being cached at once even though only one value really is required.
-As long as these values are only read there is no problem.
-However, if one thread writes to a value that is also cached on other cores, all such cache copies have to be invalidated.
-This results in a lot of cache and bus traffic.
-Due to the hierarchical decomposition of the grid of threads reflecting the data elements, neighboring threads are always combined into a common block.
-By mapping a block to a socket, threads that are executed concurrently always have very close indices into the grid.
-Therefore, the elements that are read and written by the threads are always very close together within the memory and will most probably share a cache line.
-This property is exploited on *CUDA* GPUs, where memory accesses within a warp are combined into one large transaction.
-However, when multiple threads from multiple CPU cores write to different elements within a cache line, this advantage is reversed into its opposite.
-This pattern non-intuitively leads to heavy performance degradation and is called false-sharing.
-
-### One Block Per Core
-
-The best compromise between a high number of threads per block and a fast communication between the threads is to map a block directly to a CPU core.
-Each processing unit (possibly a Hyper-Thread) executes one or more threads of our hierarchical abstraction while executing multiple elements locally either by processing them sequentially or in a vectorized fashion.
-This possible mapping of blocks, threads and elements to the compute and memory hierarchy of a dual-socket node with dual-core CPUs and symmetric multithreading is illustrated in the following figure.
-![x86_cpu](x86/x86_cpu_mapping.png)
-
-### One Block Per Thread
-
-If there is no symmetric multithreading or if it is desired, it is also possible to implement a mapping of one block with exactly one thread for each processing unit.
-This allows to completely remove the synchronization overhead for tasks where this is not required at all.
-
-Threading Mechanisms
---------------------
-
-The mapping of threads to processing units is independent of the threading mechanism that is used.
-As long as the thread affinity to cores can be set correctly, *OpenMP*, *pthread*, *std::thread* or other libraries and APIs can be used interchangeably to implement various *alpaka* back-ends.
-They all have different advantages and disadvantages.
-Real operating system threads like *pthread*, *std::thread* and others have a high cost of thread creation and thread change because their default stack size amounts to multiple megabytes.
-*OpenMP* threads on the other hand are by default much more lightweight.
-However, they are arbitrarily limited by the runtime implementation in the maximum number of concurrent threads a machine supports.
-All of the previous methods have non-deterministic thread changes in common.
-Therefore it is not possible to decide the order in which threads within a block are processed, which could be a good optimization opportunity.
-
-To allow blocks to contain more threads than the number of processing units each core provides, it is possible to simply start more threads than processing units are available.
-This is called oversubscription.
-Those threads can be bound to the correct cores and by relying on the operating system thread scheduler, they are preemptively multitasked while sharing a single cache and thereby avoiding false-sharing.
-However, this is not always beneficial because the cost of thread changes by the kernel-mode scheduler should not be underestimated.
-
-### Fibers
-
-To remove the overhead of the kernel mode scheduler as well as to enable the usage of deterministic thread context-switches, fibers can be used.
-A fiber is a user-space thread with cooperative context-switches and extends the concept of coroutines.
-A coroutine is basically a function that can be suspended and resumed but does not necessarily have a stack.
-In contrast, functions within most programming languages represent subroutines and not coroutines because they can neither be suspended in the middle of execution nor resumed exactly at the place they were suspended without losing values on the function's local stack.
-
-Multiple fibers can be executed within one operating system thread, which allows to simulate multiple threads per block without kernel-mode multithreading.
-This was not possible without fibers because only coroutines allow the kernel functions to be suspended at synchronization points and resumed when all fibers reached it.
-Each time an operating system thread executing a function would wait for another thread or a resource, an equivalent fiber just switches to the next fiber within the executing host thread.
-Due to the context changes happening at user-level, the cost is much lower.
-Additionally, fiber context changes are deterministic and it is even possible to implement a user-level scheduler.
-An advantage of a user level scheduler over the operating system thread scheduler is the possibility to optimally utilize the caches by taking into account the memory access pattern of the algorithm.
-Furthermore, fibers reduce the number of locks and busy waits within a block because only one fiber is active per operating system thread at a time.
-
-There are multiple C++ Standards Committee Papers (N3858, N3985, N4134) discussing the inclusion of fibers, awaitable functions and similar concepts into C++.
diff --git a/thirdParty/cupla/alpaka/doc/markdown/user/implementation/mapping/x86/x86_cpu.png b/thirdParty/cupla/alpaka/doc/markdown/user/implementation/mapping/x86/x86_cpu.png
deleted file mode 100644
index 880d167c8d..0000000000
Binary files a/thirdParty/cupla/alpaka/doc/markdown/user/implementation/mapping/x86/x86_cpu.png and /dev/null differ
diff --git a/thirdParty/cupla/alpaka/doc/markdown/user/implementation/mapping/x86/x86_cpu_mapping.png b/thirdParty/cupla/alpaka/doc/markdown/user/implementation/mapping/x86/x86_cpu_mapping.png
deleted file mode 100644
index c7f016bbfd..0000000000
Binary files a/thirdParty/cupla/alpaka/doc/markdown/user/implementation/mapping/x86/x86_cpu_mapping.png and /dev/null differ
diff --git a/thirdParty/cupla/alpaka/docs/Doxyfile b/thirdParty/cupla/alpaka/docs/Doxyfile
new file mode 100644
index 0000000000..1b4def7cae
--- /dev/null
+++ b/thirdParty/cupla/alpaka/docs/Doxyfile
@@ -0,0 +1,2575 @@
+# Doxyfile 1.8.16
+
+# This file describes the settings to be used by the documentation system
+# doxygen (www.doxygen.org) for a project.
+#
+# All text after a double hash (##) is considered a comment and is placed in
+# front of the TAG it is preceding.
+#
+# All text after a single hash (#) is considered a comment and will be ignored.
+# The format is:
+# TAG = value [value, ...]
+# For lists, items can also be appended using:
+# TAG += value [value, ...]
+# Values that contain spaces should be placed between quotes (\" \").
+
+#---------------------------------------------------------------------------
+# Project related configuration options
+#---------------------------------------------------------------------------
+
+# This tag specifies the encoding used for all characters in the configuration
+# file that follow. The default is UTF-8 which is also the encoding used for all
+# text before the first occurrence of this tag. Doxygen uses libiconv (or the
+# iconv built into libc) for the transcoding. See
+# https://www.gnu.org/software/libiconv/ for the list of possible encodings.
+# The default value is: UTF-8.
+
+DOXYFILE_ENCODING      = UTF-8
+
+# The PROJECT_NAME tag is a single word (or a sequence of words surrounded by
+# double-quotes, unless you are using Doxywizard) that should identify the
+# project for which the documentation is generated. This name is used in the
+# title of most generated pages and in a few other places.
+# The default value is: My Project.
+
+PROJECT_NAME           = alpaka
+
+# The PROJECT_NUMBER tag can be used to enter a project or revision number. This
+# could be handy for archiving the generated documentation or if some version
+# control system is used.
+
+PROJECT_NUMBER         =
+
+# Using the PROJECT_BRIEF tag one can provide an optional one line description
+# for a project that appears at the top of each page and should give viewer a
+# quick idea about the purpose of the project. Keep the description short.
+
+PROJECT_BRIEF          = "Abstraction Library for Parallel Kernel Acceleration"
+
+# With the PROJECT_LOGO tag one can specify a logo or an icon that is included
+# in the documentation. The maximum height of the logo should not exceed 55
+# pixels and the maximum width should not exceed 200 pixels. Doxygen will copy
+# the logo to the output directory.
+
+PROJECT_LOGO           = logo/alpaka_doxygen.png
+
+# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) path
+# into which the generated documentation will be written. If a relative path is
+# entered, it will be relative to the location where doxygen was started. If
+# left blank the current directory will be used.
+
+OUTPUT_DIRECTORY       = doxygen/
+
+# If the CREATE_SUBDIRS tag is set to YES then doxygen will create 4096 sub-
+# directories (in 2 levels) under the output directory of each output format and
+# will distribute the generated files over these directories. Enabling this
+# option can be useful when feeding doxygen a huge amount of source files, where
+# putting all generated files in the same directory would otherwise cause
+# performance problems for the file system.
+# The default value is: NO.
+
+CREATE_SUBDIRS         = NO
+
+# If the ALLOW_UNICODE_NAMES tag is set to YES, doxygen will allow non-ASCII
+# characters to appear in the names of generated files. If set to NO, non-ASCII
+# characters will be escaped, for example _xE3_x81_x84 will be used for Unicode
+# U+3044.
+# The default value is: NO.
+
+ALLOW_UNICODE_NAMES    = YES
+
+# The OUTPUT_LANGUAGE tag is used to specify the language in which all
+# documentation generated by doxygen is written. Doxygen will use this
+# information to generate all constant output in the proper language.
+# Possible values are: Afrikaans, Arabic, Armenian, Brazilian, Catalan, Chinese,
+# Chinese-Traditional, Croatian, Czech, Danish, Dutch, English (United States),
+# Esperanto, Farsi (Persian), Finnish, French, German, Greek, Hungarian,
+# Indonesian, Italian, Japanese, Japanese-en (Japanese with English messages),
+# Korean, Korean-en (Korean with English messages), Latvian, Lithuanian,
+# Macedonian, Norwegian, Persian (Farsi), Polish, Portuguese, Romanian, Russian,
+# Serbian, Serbian-Cyrillic, Slovak, Slovene, Spanish, Swedish, Turkish,
+# Ukrainian and Vietnamese.
+# The default value is: English.
+
+OUTPUT_LANGUAGE        = English
+
+# The OUTPUT_TEXT_DIRECTION tag is used to specify the direction in which all
+# documentation generated by doxygen is written. Doxygen will use this
+# information to generate all generated output in the proper direction.
+# Possible values are: None, LTR, RTL and Context.
+# The default value is: None.
+
+OUTPUT_TEXT_DIRECTION  = None
+
+# If the BRIEF_MEMBER_DESC tag is set to YES, doxygen will include brief member
+# descriptions after the members that are listed in the file and class
+# documentation (similar to Javadoc). Set to NO to disable this.
+# The default value is: YES.
+
+BRIEF_MEMBER_DESC      = YES
+
+# If the REPEAT_BRIEF tag is set to YES, doxygen will prepend the brief
+# description of a member or function before the detailed description
+#
+# Note: If both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the
+# brief descriptions will be completely suppressed.
+# The default value is: YES.
+
+REPEAT_BRIEF           = YES
+
+# This tag implements a quasi-intelligent brief description abbreviator that is
+# used to form the text in various listings. Each string in this list, if found
+# as the leading text of the brief description, will be stripped from the text
+# and the result, after processing the whole list, is used as the annotated
+# text. Otherwise, the brief description is used as-is. If left blank, the
+# following values are used ($name is automatically replaced with the name of
+# the entity):The $name class, The $name widget, The $name file, is, provides,
+# specifies, contains, represents, a, an and the.
+
+ABBREVIATE_BRIEF       = "The $name class" \
+                         "The $name widget" \
+                         "The $name file" \
+                         is \
+                         provides \
+                         specifies \
+                         contains \
+                         represents \
+                         a \
+                         an \
+                         the
+
+# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then
+# doxygen will generate a detailed section even if there is only a brief
+# description.
+# The default value is: NO.
+
+ALWAYS_DETAILED_SEC    = NO
+
+# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all
+# inherited members of a class in the documentation of that class as if those
+# members were ordinary class members. Constructors, destructors and assignment
+# operators of the base classes will not be shown.
+# The default value is: NO.
+
+INLINE_INHERITED_MEMB  = NO
+
+# If the FULL_PATH_NAMES tag is set to YES, doxygen will prepend the full path
+# before files name in the file list and in the header files. If set to NO the
+# shortest path that makes the file name unique will be used
+# The default value is: YES.
+
+FULL_PATH_NAMES        = YES
+
+# The STRIP_FROM_PATH tag can be used to strip a user-defined part of the path.
+# Stripping is only done if one of the specified strings matches the left-hand
+# part of the path. The tag can be used to show relative paths in the file list.
+# If left blank the directory from which doxygen is run is used as the path to
+# strip.
+#
+# Note that you can specify absolute paths here, but also relative paths, which
+# will be relative from the directory where doxygen is started.
+# This tag requires that the tag FULL_PATH_NAMES is set to YES.
+
+STRIP_FROM_PATH        =
+
+# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of the
+# path mentioned in the documentation of a class, which tells the reader which
+# header file to include in order to use a class. If left blank only the name of
+# the header file containing the class definition is used. Otherwise one should
+# specify the list of include paths that are normally passed to the compiler
+# using the -I flag.
+
+STRIP_FROM_INC_PATH    =
+
+# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter (but
+# less readable) file names. This can be useful if your file system doesn't
+# support long names like on DOS, Mac, or CD-ROM.
+# The default value is: NO.
+
+SHORT_NAMES            = NO
+
+# If the JAVADOC_AUTOBRIEF tag is set to YES then doxygen will interpret the
+# first line (until the first dot) of a Javadoc-style comment as the brief
+# description. If set to NO, the Javadoc-style will behave just like regular Qt-
+# style comments (thus requiring an explicit @brief command for a brief
+# description.)
+# The default value is: NO.
+
+JAVADOC_AUTOBRIEF      = NO
+
+# If the JAVADOC_BANNER tag is set to YES then doxygen will interpret a line
+# such as
+# /***************
+# as being the beginning of a Javadoc-style comment "banner". If set to NO, the
+# Javadoc-style will behave just like regular comments and it will not be
+# interpreted by doxygen.
+# The default value is: NO.
+
+JAVADOC_BANNER         = NO
+
+# If the QT_AUTOBRIEF tag is set to YES then doxygen will interpret the first
+# line (until the first dot) of a Qt-style comment as the brief description. If
+# set to NO, the Qt-style will behave just like regular Qt-style comments (thus
+# requiring an explicit \brief command for a brief description.)
+# The default value is: NO.
+
+QT_AUTOBRIEF           = NO
+
+# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make doxygen treat a
+# multi-line C++ special comment block (i.e. a block of //! or /// comments) as
+# a brief description. This used to be the default behavior. The new default is
+# to treat a multi-line C++ comment block as a detailed description. Set this
+# tag to YES if you prefer the old behavior instead.
+#
+# Note that setting this tag to YES also means that rational rose comments are
+# not recognized any more.
+# The default value is: NO.
+
+MULTILINE_CPP_IS_BRIEF = YES
+
+# If the INHERIT_DOCS tag is set to YES then an undocumented member inherits the
+# documentation from any documented member that it re-implements.
+# The default value is: YES.
+
+INHERIT_DOCS           = YES
+
+# If the SEPARATE_MEMBER_PAGES tag is set to YES then doxygen will produce a new
+# page for each member. If set to NO, the documentation of a member will be part
+# of the file/class/namespace that contains it.
+# The default value is: NO.
+
+SEPARATE_MEMBER_PAGES  = NO
+
+# The TAB_SIZE tag can be used to set the number of spaces in a tab. Doxygen
+# uses this value to replace tabs by spaces in code fragments.
+# Minimum value: 1, maximum value: 16, default value: 4.
+
+TAB_SIZE               = 4
+
+# This tag can be used to specify a number of aliases that act as commands in
+# the documentation. An alias has the form:
+# name=value
+# For example adding
+# "sideeffect=@par Side Effects:\n"
+# will allow you to put the command \sideeffect (or @sideeffect) in the
+# documentation, which will result in a user-defined paragraph with heading
+# "Side Effects:". You can put \n's in the value part of an alias to insert
+# newlines (in the resulting output). You can put ^^ in the value part of an
+# alias to insert a newline as if a physical newline was in the original file.
+# When you need a literal { or } or , in the value part of an alias you have to
+# escape them by means of a backslash (\). This can lead to conflicts with the
+# commands \{ and \}; for these it is advised to use the version @{ and @} or
+# use a double escape (\\{ and \\}).
+
+ALIASES                =
+
+# This tag can be used to specify a number of word-keyword mappings (TCL only).
+# A mapping has the form "name=value". For example adding "class=itcl::class"
+# will allow you to use the command class in the itcl::class meaning.
+
+TCL_SUBST              =
+
+# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C sources
+# only. Doxygen will then generate output that is more tailored for C. For
+# instance, some of the names that are used will be different. The list of all
+# members will be omitted, etc.
+# The default value is: NO.
+
+OPTIMIZE_OUTPUT_FOR_C  = NO
+
+# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java or
+# Python sources only. Doxygen will then generate output that is more tailored
+# for that language. For instance, namespaces will be presented as packages,
+# qualified scopes will look different, etc.
+# The default value is: NO.
+
+OPTIMIZE_OUTPUT_JAVA   = NO
+
+# Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran
+# sources. Doxygen will then generate output that is tailored for Fortran.
+# The default value is: NO.
+
+OPTIMIZE_FOR_FORTRAN   = NO
+
+# Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL
+# sources. Doxygen will then generate output that is tailored for VHDL.
+# The default value is: NO.
+
+OPTIMIZE_OUTPUT_VHDL   = NO
+
+# Set the OPTIMIZE_OUTPUT_SLICE tag to YES if your project consists of Slice
+# sources only. Doxygen will then generate output that is more tailored for that
+# language. For instance, namespaces will be presented as modules, types will be
+# separated into more groups, etc.
+# The default value is: NO.
+
+OPTIMIZE_OUTPUT_SLICE  = NO
+
+# Doxygen selects the parser to use depending on the extension of the files it
+# parses. With this tag you can assign which parser to use for a given
+# extension. Doxygen has a built-in mapping, but you can override or extend it
+# using this tag. The format is ext=language, where ext is a file extension, and
+# language is one of the parsers supported by doxygen: IDL, Java, Javascript,
+# Csharp (C#), C, C++, D, PHP, md (Markdown), Objective-C, Python, Slice,
+# Fortran (fixed format Fortran: FortranFixed, free formatted Fortran:
+# FortranFree, unknown formatted Fortran: Fortran. In the latter case the parser
+# tries to guess whether the code is fixed or free formatted code, this is the
+# default for Fortran type files), VHDL, tcl. For instance to make doxygen treat
+# .inc files as Fortran files (default is PHP), and .f files as C (default is
+# Fortran), use: inc=Fortran f=C.
+#
+# Note: For files without extension you can use no_extension as a placeholder.
+#
+# Note that for custom extensions you also need to set FILE_PATTERNS otherwise
+# the files are not read by doxygen.
+
+EXTENSION_MAPPING      =
+
+# If the MARKDOWN_SUPPORT tag is enabled then doxygen pre-processes all comments
+# according to the Markdown format, which allows for more readable
+# documentation. See https://daringfireball.net/projects/markdown/ for details.
+# The output of markdown processing is further processed by doxygen, so you can
+# mix doxygen, HTML, and XML commands with Markdown formatting. Disable only in
+# case of backward compatibility issues.
+# The default value is: YES.
+
+MARKDOWN_SUPPORT       = YES
+
+# When the TOC_INCLUDE_HEADINGS tag is set to a non-zero value, all headings up
+# to that level are automatically included in the table of contents, even if
+# they do not have an id attribute.
+# Note: This feature currently applies only to Markdown headings.
+# Minimum value: 0, maximum value: 99, default value: 5.
+# This tag requires that the tag MARKDOWN_SUPPORT is set to YES.
+
+TOC_INCLUDE_HEADINGS   = 0
+
+# When enabled doxygen tries to link words that correspond to documented
+# classes, or namespaces to their corresponding documentation. Such a link can
+# be prevented in individual cases by putting a % sign in front of the word or
+# globally by setting AUTOLINK_SUPPORT to NO.
+# The default value is: YES.
+
+AUTOLINK_SUPPORT       = YES
+
+# If you use STL classes (i.e. std::string, std::vector, etc.) but do not want
+# to include (a tag file for) the STL sources as input, then you should set this
+# tag to YES in order to let doxygen match function declarations and
+# definitions whose arguments contain STL classes (e.g. func(std::string);
+# versus func(std::string) {}). This also makes the inheritance and collaboration
+# diagrams that involve STL classes more complete and accurate.
+# The default value is: NO.
+
+BUILTIN_STL_SUPPORT    = YES
+
+# If you use Microsoft's C++/CLI language, you should set this option to YES to
+# enable parsing support.
+# The default value is: NO.
+
+CPP_CLI_SUPPORT        = NO
+
+# Set the SIP_SUPPORT tag to YES if your project consists of sip (see:
+# https://www.riverbankcomputing.com/software/sip/intro) sources only. Doxygen
+# will parse them like normal C++ but will assume all classes use public instead
+# of private inheritance when no explicit protection keyword is present.
+# The default value is: NO.
+
+SIP_SUPPORT            = NO
+
+# For Microsoft's IDL there are propget and propput attributes to indicate
+# getter and setter methods for a property. Setting this option to YES will make
+# doxygen replace the get and set methods by a property in the documentation.
+# This will only work if the methods are indeed getting or setting a simple
+# type. If this is not the case, or you want to show the methods anyway, you
+# should set this option to NO.
+# The default value is: YES.
+
+IDL_PROPERTY_SUPPORT   = YES
+
+# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC
+# tag is set to YES then doxygen will reuse the documentation of the first
+# member in the group (if any) for the other members of the group. By default
+# all members of a group must be documented explicitly.
+# The default value is: NO.
+
+DISTRIBUTE_GROUP_DOC   = NO
+
+# If one adds a struct or class to a group and this option is enabled, then also
+# any nested class or struct is added to the same group. By default this option
+# is disabled and one has to add nested compounds explicitly via \ingroup.
+# The default value is: NO.
+
+GROUP_NESTED_COMPOUNDS = NO
+
+# Set the SUBGROUPING tag to YES to allow class member groups of the same type
+# (for instance a group of public functions) to be put as a subgroup of that
+# type (e.g. under the Public Functions section). Set it to NO to prevent
+# subgrouping. Alternatively, this can be done per class using the
+# \nosubgrouping command.
+# The default value is: YES.
+
+SUBGROUPING            = YES
+
+# When the INLINE_GROUPED_CLASSES tag is set to YES, classes, structs and unions
+# are shown inside the group in which they are included (e.g. using \ingroup)
+# instead of on a separate page (for HTML and Man pages) or section (for LaTeX
+# and RTF).
+#
+# Note that this feature does not work in combination with
+# SEPARATE_MEMBER_PAGES.
+# The default value is: NO.
+
+INLINE_GROUPED_CLASSES = NO
+
+# When the INLINE_SIMPLE_STRUCTS tag is set to YES, structs, classes, and unions
+# with only public data fields or simple typedef fields will be shown inline in
+# the documentation of the scope in which they are defined (i.e. file,
+# namespace, or group documentation), provided this scope is documented. If set
+# to NO, structs, classes, and unions are shown on a separate page (for HTML and
+# Man pages) or section (for LaTeX and RTF).
+# The default value is: NO.
+
+INLINE_SIMPLE_STRUCTS  = NO
+
+# When TYPEDEF_HIDES_STRUCT tag is enabled, a typedef of a struct, union, or
+# enum is documented as struct, union, or enum with the name of the typedef. So
+# typedef struct TypeS {} TypeT, will appear in the documentation as a struct
+# with name TypeT. When disabled the typedef will appear as a member of a file,
+# namespace, or class. And the struct will be named TypeS. This can typically be
+# useful for C code in case the coding convention dictates that all compound
+# types are typedef'ed and only the typedef is referenced, never the tag name.
+# The default value is: NO.
+
+TYPEDEF_HIDES_STRUCT   = NO
+
+# The size of the symbol lookup cache can be set using LOOKUP_CACHE_SIZE. This
+# cache is used to resolve symbols given their name and scope. Since this can be
+# an expensive process and often the same symbol appears multiple times in the
+# code, doxygen keeps a cache of pre-resolved symbols. If the cache is too small
+# doxygen will become slower. If the cache is too large, memory is wasted. The
+# cache size is given by this formula: 2^(16+LOOKUP_CACHE_SIZE). The valid range
+# is 0..9, the default is 0, corresponding to a cache size of 2^16=65536
+# symbols. At the end of a run doxygen will report the cache usage and suggest
+# the optimal cache size from a speed point of view.
+# Minimum value: 0, maximum value: 9, default value: 0.
+
+LOOKUP_CACHE_SIZE      = 0
+
+#---------------------------------------------------------------------------
+# Build related configuration options
+#---------------------------------------------------------------------------
+
+# If the EXTRACT_ALL tag is set to YES, doxygen will assume all entities in
+# documentation are documented, even if no documentation was available. Private
+# class members and static file members will be hidden unless the
+# EXTRACT_PRIVATE respectively EXTRACT_STATIC tags are set to YES.
+# Note: This will also disable the warnings about undocumented members that are
+# normally produced when WARNINGS is set to YES.
+# The default value is: NO.
+
+EXTRACT_ALL            = YES
+
+# If the EXTRACT_PRIVATE tag is set to YES, all private members of a class will
+# be included in the documentation.
+# The default value is: NO.
+
+EXTRACT_PRIVATE        = NO
+
+# If the EXTRACT_PRIV_VIRTUAL tag is set to YES, documented private virtual
+# methods of a class will be included in the documentation.
+# The default value is: NO.
+
+EXTRACT_PRIV_VIRTUAL   = NO
+
+# If the EXTRACT_PACKAGE tag is set to YES, all members with package or internal
+# scope will be included in the documentation.
+# The default value is: NO.
+
+EXTRACT_PACKAGE        = NO
+
+# If the EXTRACT_STATIC tag is set to YES, all static members of a file will be
+# included in the documentation.
+# The default value is: NO.
+
+EXTRACT_STATIC         = YES
+
+# If the EXTRACT_LOCAL_CLASSES tag is set to YES, classes (and structs) defined
+# locally in source files will be included in the documentation. If set to NO,
+# only classes defined in header files are included. Does not have any effect
+# for Java sources.
+# The default value is: YES.
+
+EXTRACT_LOCAL_CLASSES  = YES
+
+# This flag is only useful for Objective-C code. If set to YES, local methods,
+# which are defined in the implementation section but not in the interface are
+# included in the documentation. If set to NO, only methods in the interface are
+# included.
+# The default value is: NO.
+
+EXTRACT_LOCAL_METHODS  = YES
+
+# If this flag is set to YES, the members of anonymous namespaces will be
+# extracted and appear in the documentation as a namespace called
+# 'anonymous_namespace{file}', where file will be replaced with the base name of
+# the file that contains the anonymous namespace. By default anonymous namespaces
+# are hidden.
+# The default value is: NO.
+
+EXTRACT_ANON_NSPACES   = YES
+
+# If the HIDE_UNDOC_MEMBERS tag is set to YES, doxygen will hide all
+# undocumented members inside documented classes or files. If set to NO these
+# members will be included in the various overviews, but no documentation
+# section is generated. This option has no effect if EXTRACT_ALL is enabled.
+# The default value is: NO.
+
+HIDE_UNDOC_MEMBERS     = NO
+
+# If the HIDE_UNDOC_CLASSES tag is set to YES, doxygen will hide all
+# undocumented classes that are normally visible in the class hierarchy. If set
+# to NO, these classes will be included in the various overviews. This option
+# has no effect if EXTRACT_ALL is enabled.
+# The default value is: NO.
+
+HIDE_UNDOC_CLASSES     = NO
+
+# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, doxygen will hide all friend
+# (class|struct|union) declarations. If set to NO, these declarations will be
+# included in the documentation.
+# The default value is: NO.
+
+HIDE_FRIEND_COMPOUNDS  = NO
+
+# If the HIDE_IN_BODY_DOCS tag is set to YES, doxygen will hide any
+# documentation blocks found inside the body of a function. If set to NO, these
+# blocks will be appended to the function's detailed documentation block.
+# The default value is: NO.
+
+HIDE_IN_BODY_DOCS      = NO
+
+# The INTERNAL_DOCS tag determines if documentation that is typed after a
+# \internal command is included. If the tag is set to NO then the documentation
+# will be excluded. Set it to YES to include the internal documentation.
+# The default value is: NO.
+
+INTERNAL_DOCS          = NO
+
+# If the CASE_SENSE_NAMES tag is set to NO then doxygen will only generate file
+# names in lower-case letters. If set to YES, upper-case letters are also
+# allowed. This is useful if you have classes or files whose names only differ
+# in case and if your file system supports case sensitive file names. Windows
+# (including Cygwin) and Mac users are advised to set this option to NO.
+# The default value is: system dependent.
+
+CASE_SENSE_NAMES       = YES
+
+# If the HIDE_SCOPE_NAMES tag is set to NO then doxygen will show members with
+# their full class and namespace scopes in the documentation. If set to YES, the
+# scope will be hidden.
+# The default value is: NO.
+
+HIDE_SCOPE_NAMES       = NO
+
+# If the HIDE_COMPOUND_REFERENCE tag is set to NO (default) then doxygen will
+# append additional text to a page's title, such as Class Reference. If set to
+# YES the compound reference will be hidden.
+# The default value is: NO.
+
+HIDE_COMPOUND_REFERENCE= NO
+
+# If the SHOW_INCLUDE_FILES tag is set to YES then doxygen will put a list of
+# the files that are included by a file in the documentation of that file.
+# The default value is: YES.
+
+SHOW_INCLUDE_FILES     = YES
+
+# If the SHOW_GROUPED_MEMB_INC tag is set to YES then Doxygen will add for each
+# grouped member an include statement to the documentation, telling the reader
+# which file to include in order to use the member.
+# The default value is: NO.
+
+SHOW_GROUPED_MEMB_INC  = NO
+
+# If the FORCE_LOCAL_INCLUDES tag is set to YES then doxygen will list include
+# files with double quotes in the documentation rather than with sharp brackets.
+# The default value is: NO.
+
+FORCE_LOCAL_INCLUDES   = NO
+
+# If the INLINE_INFO tag is set to YES then a tag [inline] is inserted in the
+# documentation for inline members.
+# The default value is: YES.
+
+INLINE_INFO            = YES
+
+# If the SORT_MEMBER_DOCS tag is set to YES then doxygen will sort the
+# (detailed) documentation of file and class members alphabetically by member
+# name. If set to NO, the members will appear in declaration order.
+# The default value is: YES.
+
+SORT_MEMBER_DOCS       = YES
+
+# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the brief
+# descriptions of file, namespace and class members alphabetically by member
+# name. If set to NO, the members will appear in declaration order. Note that
+# this will also influence the order of the classes in the class list.
+# The default value is: NO.
+
+SORT_BRIEF_DOCS        = YES
+
+# If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen will sort the
+# (brief and detailed) documentation of class members so that constructors and
+# destructors are listed first. If set to NO the constructors will appear in the
+# respective orders defined by SORT_BRIEF_DOCS and SORT_MEMBER_DOCS.
+# Note: If SORT_BRIEF_DOCS is set to NO this option is ignored for sorting brief
+# member documentation.
+# Note: If SORT_MEMBER_DOCS is set to NO this option is ignored for sorting
+# detailed member documentation.
+# The default value is: NO.
+
+SORT_MEMBERS_CTORS_1ST = YES
+
+# If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the hierarchy
+# of group names into alphabetical order. If set to NO the group names will
+# appear in their defined order.
+# The default value is: NO.
+
+SORT_GROUP_NAMES       = NO
+
+# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be sorted by
+# fully-qualified names, including namespaces. If set to NO, the class list will
+# be sorted only by class name, not including the namespace part.
+# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES.
+# Note: This option applies only to the class list, not to the alphabetical
+# list.
+# The default value is: NO.
+
+SORT_BY_SCOPE_NAME     = YES
+
+# If the STRICT_PROTO_MATCHING option is enabled and doxygen fails to do proper
+# type resolution of all parameters of a function it will reject a match between
+# the prototype and the implementation of a member function even if there is
+# only one candidate or it is obvious which candidate to choose by doing a
+# simple string match. By disabling STRICT_PROTO_MATCHING doxygen will still
+# accept a match between prototype and implementation in such cases.
+# The default value is: NO.
+
+STRICT_PROTO_MATCHING  = NO
+
+# The GENERATE_TODOLIST tag can be used to enable (YES) or disable (NO) the todo
+# list. This list is created by putting \todo commands in the documentation.
+# The default value is: YES.
+
+GENERATE_TODOLIST      = YES
+
+# The GENERATE_TESTLIST tag can be used to enable (YES) or disable (NO) the test
+# list. This list is created by putting \test commands in the documentation.
+# The default value is: YES.
+
+GENERATE_TESTLIST      = YES
+
+# The GENERATE_BUGLIST tag can be used to enable (YES) or disable (NO) the bug
+# list. This list is created by putting \bug commands in the documentation.
+# The default value is: YES.
+
+GENERATE_BUGLIST       = YES
+
+# The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or disable (NO)
+# the deprecated list. This list is created by putting \deprecated commands in
+# the documentation.
+# The default value is: YES.
+
+GENERATE_DEPRECATEDLIST= YES
+
+# The ENABLED_SECTIONS tag can be used to enable conditional documentation
+# sections, marked by \if <section_label> ... \endif and \cond <section_label>
+# ... \endcond blocks.
+
+ENABLED_SECTIONS       =
+
+# The MAX_INITIALIZER_LINES tag determines the maximum number of lines that the
+# initial value of a variable or macro / define can have for it to appear in the
+# documentation. If the initializer consists of more lines than specified here
+# it will be hidden. Use a value of 0 to hide initializers completely. The
+# appearance of the value of individual variables and macros / defines can be
+# controlled using \showinitializer or \hideinitializer command in the
+# documentation regardless of this setting.
+# Minimum value: 0, maximum value: 10000, default value: 30.
+
+MAX_INITIALIZER_LINES  = 30
+
+# Set the SHOW_USED_FILES tag to NO to disable the list of files generated at
+# the bottom of the documentation of classes and structs. If set to YES, the
+# list will mention the files that were used to generate the documentation.
+# The default value is: YES.
+
+SHOW_USED_FILES        = YES
+
+# Set the SHOW_FILES tag to NO to disable the generation of the Files page. This
+# will remove the Files entry from the Quick Index and from the Folder Tree View
+# (if specified).
+# The default value is: YES.
+
+SHOW_FILES             = YES
+
+# Set the SHOW_NAMESPACES tag to NO to disable the generation of the Namespaces
+# page. This will remove the Namespaces entry from the Quick Index and from the
+# Folder Tree View (if specified).
+# The default value is: YES.
+
+SHOW_NAMESPACES        = YES
+
+# The FILE_VERSION_FILTER tag can be used to specify a program or script that
+# doxygen should invoke to get the current version for each file (typically from
+# the version control system). Doxygen will invoke the program by executing (via
+# popen()) the command command input-file, where command is the value of the
+# FILE_VERSION_FILTER tag, and input-file is the name of an input file provided
+# by doxygen. Whatever the program writes to standard output is used as the file
+# version. For an example see the documentation.
+
+FILE_VERSION_FILTER    =
+
+# The LAYOUT_FILE tag can be used to specify a layout file which will be parsed
+# by doxygen. The layout file controls the global structure of the generated
+# output files in an output format independent way. To create the layout file
+# that represents doxygen's defaults, run doxygen with the -l option. You can
+# optionally specify a file name after the option, if omitted DoxygenLayout.xml
+# will be used as the name of the layout file.
+#
+# Note that if you run doxygen from a directory containing a file called
+# DoxygenLayout.xml, doxygen will parse it automatically even if the LAYOUT_FILE
+# tag is left empty.
+
+LAYOUT_FILE            =
+
+# The CITE_BIB_FILES tag can be used to specify one or more bib files containing
+# the reference definitions. This must be a list of .bib files. The .bib
+# extension is automatically appended if omitted. This requires the bibtex tool
+# to be installed. See also https://en.wikipedia.org/wiki/BibTeX for more info.
+# For LaTeX the style of the bibliography can be controlled using
+# LATEX_BIB_STYLE. To use this feature you need bibtex and perl available in the
+# search path. See also \cite for info how to create references.
+
+CITE_BIB_FILES         =
+
+#---------------------------------------------------------------------------
+# Configuration options related to warning and progress messages
+#---------------------------------------------------------------------------
+
+# The QUIET tag can be used to turn on/off the messages that are generated to
+# standard output by doxygen. If QUIET is set to YES this implies that the
+# messages are off.
+# The default value is: NO.
+
+QUIET                  = NO
+
+# The WARNINGS tag can be used to turn on/off the warning messages that are
+# generated to standard error (stderr) by doxygen. If WARNINGS is set to YES
+# this implies that the warnings are on.
+#
+# Tip: Turn warnings on while writing the documentation.
+# The default value is: YES.
+
+WARNINGS               = YES
+
+# If the WARN_IF_UNDOCUMENTED tag is set to YES then doxygen will generate
+# warnings for undocumented members. If EXTRACT_ALL is set to YES then this flag
+# will automatically be disabled.
+# The default value is: YES.
+
+WARN_IF_UNDOCUMENTED   = YES
+
+# If the WARN_IF_DOC_ERROR tag is set to YES, doxygen will generate warnings for
+# potential errors in the documentation, such as not documenting some parameters
+# in a documented function, or documenting parameters that don't exist or using
+# markup commands wrongly.
+# The default value is: YES.
+
+WARN_IF_DOC_ERROR      = YES
+
+# This WARN_NO_PARAMDOC option can be enabled to get warnings for functions that
+# are documented, but have no documentation for their parameters or return
+# value. If set to NO, doxygen will only warn about wrong or incomplete
+# parameter documentation, but not about the absence of documentation. If
+# EXTRACT_ALL is set to YES then this flag will automatically be disabled.
+# The default value is: NO.
+
+WARN_NO_PARAMDOC       = YES
+
+# If the WARN_AS_ERROR tag is set to YES then doxygen will immediately stop when
+# a warning is encountered.
+# The default value is: NO.
+
+WARN_AS_ERROR          = NO
+
+# The WARN_FORMAT tag determines the format of the warning messages that doxygen
+# can produce. The string should contain the $file, $line, and $text tags, which
+# will be replaced by the file and line number from which the warning originated
+# and the warning text. Optionally the format may contain $version, which will
+# be replaced by the version of the file (if it could be obtained via
+# FILE_VERSION_FILTER)
+# The default value is: $file:$line: $text.
+
+WARN_FORMAT            = "$file:$line: $text"
+
+# The WARN_LOGFILE tag can be used to specify a file to which warning and error
+# messages should be written. If left blank the output is written to standard
+# error (stderr).
+
+WARN_LOGFILE           =
+
+#---------------------------------------------------------------------------
+# Configuration options related to the input files
+#---------------------------------------------------------------------------
+
+# The INPUT tag is used to specify the files and/or directories that contain
+# documented source files. You may enter file names like myfile.cpp or
+# directories like /usr/src/myproject. Separate the files or directories with
+# spaces. See also FILE_PATTERNS and EXTENSION_MAPPING.
+# Note: If this tag is empty the current directory is searched.
+
+INPUT                  = ../include/ \
+                         ../README.md
+
+# This tag can be used to specify the character encoding of the source files
+# that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses
+# libiconv (or the iconv built into libc) for the transcoding. See the libiconv
+# documentation (see: https://www.gnu.org/software/libiconv/) for the list of
+# possible encodings.
+# The default value is: UTF-8.
+
+INPUT_ENCODING         = UTF-8
+
+# If the value of the INPUT tag contains directories, you can use the
+# FILE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp and
+# *.h) to filter out the source-files in the directories.
+#
+# Note that for custom extensions or not directly supported extensions you also
+# need to set EXTENSION_MAPPING for the extension otherwise the files are not
+# read by doxygen.
+#
+# If left blank the following patterns are tested: *.c, *.cc, *.cxx, *.cpp,
+# *.c++, *.java, *.ii, *.ixx, *.ipp, *.i++, *.inl, *.idl, *.ddl, *.odl, *.h,
+# *.hh, *.hxx, *.hpp, *.h++, *.cs, *.d, *.php, *.php4, *.php5, *.phtml, *.inc,
+# *.m, *.markdown, *.md, *.mm, *.dox, *.py, *.pyw, *.f90, *.f95, *.f03, *.f08,
+# *.f, *.for, *.tcl, *.vhd, *.vhdl, *.ucf, *.qsf and *.ice.
+
+FILE_PATTERNS          = *.c \
+                         *.cc \
+                         *.cxx \
+                         *.cpp \
+                         *.c++ \
+                         *.java \
+                         *.ii \
+                         *.ixx \
+                         *.ipp \
+                         *.i++ \
+                         *.inl \
+                         *.idl \
+                         *.ddl \
+                         *.odl \
+                         *.h \
+                         *.hh \
+                         *.hxx \
+                         *.hpp \
+                         *.h++ \
+                         *.cs \
+                         *.d \
+                         *.php \
+                         *.php4 \
+                         *.php5 \
+                         *.phtml \
+                         *.inc \
+                         *.m \
+                         *.markdown \
+                         *.md \
+                         *.mm \
+                         *.dox \
+                         *.py \
+                         *.f90 \
+                         *.f \
+                         *.for \
+                         *.tcl \
+                         *.vhd \
+                         *.vhdl \
+                         *.ucf \
+                         *.qsf \
+                         *.as \
+                         *.js \
+                         *.c \
+                         *.cu
+
+# The RECURSIVE tag can be used to specify whether or not subdirectories should
+# be searched for input files as well.
+# The default value is: NO.
+
+RECURSIVE              = YES
+
+# The EXCLUDE tag can be used to specify files and/or directories that should be
+# excluded from the INPUT source files. This way you can easily exclude a
+# subdirectory from a directory tree whose root is specified with the INPUT tag.
+#
+# Note that relative paths are relative to the directory from which doxygen is
+# run.
+
+EXCLUDE                =
+
+# The EXCLUDE_SYMLINKS tag can be used to select whether or not files or
+# directories that are symbolic links (a Unix file system feature) are excluded
+# from the input.
+# The default value is: NO.
+
+EXCLUDE_SYMLINKS       = NO
+
+# If the value of the INPUT tag contains directories, you can use the
+# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude
+# certain files from those directories.
+#
+# Note that the wildcards are matched against the file with absolute path, so to
+# exclude all test directories for example use the pattern */test/*
+
+EXCLUDE_PATTERNS       =
+
+# The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names
+# (namespaces, classes, functions, etc.) that should be excluded from the
+# output. The symbol name can be a fully qualified name, a word, or if the
+# wildcard * is used, a substring. Examples: ANamespace, AClass,
+# ANamespace::AClass, ANamespace::*Test
+#
+# Note that the wildcards are matched against symbol names, not file paths; to
+# exclude files or directories use EXCLUDE_PATTERNS (e.g. */test/*) instead.
+
+EXCLUDE_SYMBOLS        =
+
+# The EXAMPLE_PATH tag can be used to specify one or more files or directories
+# that contain example code fragments that are included (see the \include
+# command).
+
+EXAMPLE_PATH           =
+
+# If the value of the EXAMPLE_PATH tag contains directories, you can use the
+# EXAMPLE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp and
+# *.h) to filter out the source-files in the directories. If left blank all
+# files are included.
+
+EXAMPLE_PATTERNS       = *
+
+# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be
+# searched for input files to be used with the \include or \dontinclude commands
+# irrespective of the value of the RECURSIVE tag.
+# The default value is: NO.
+
+EXAMPLE_RECURSIVE      = NO
+
+# The IMAGE_PATH tag can be used to specify one or more files or directories
+# that contain images that are to be included in the documentation (see the
+# \image command).
+
+IMAGE_PATH             =
+
+# The INPUT_FILTER tag can be used to specify a program that doxygen should
+# invoke to filter for each input file. Doxygen will invoke the filter program
+# by executing (via popen()) the command:
+#
+# <filter> <input-file>
+#
+# where <filter> is the value of the INPUT_FILTER tag, and <input-file> is the
+# name of an input file. Doxygen will then use the output that the filter
+# program writes to standard output. If FILTER_PATTERNS is specified, this tag
+# will be ignored.
+#
+# Note that the filter must not add or remove lines; it is applied before the
+# code is scanned, but not when the output code is generated. If lines are added
+# or removed, the anchors will not be placed correctly.
+#
+# Note that for custom extensions or not directly supported extensions you also
+# need to set EXTENSION_MAPPING for the extension otherwise the files are not
+# properly processed by doxygen.
+
+INPUT_FILTER           =
+
+# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern
+# basis. Doxygen will compare the file name with each pattern and apply the
+# filter if there is a match. The filters are a list of the form: pattern=filter
+# (like *.cpp=my_cpp_filter). See INPUT_FILTER for further information on how
+# filters are used. If the FILTER_PATTERNS tag is empty or if none of the
+# patterns match the file name, INPUT_FILTER is applied.
+#
+# Note that for custom extensions or not directly supported extensions you also
+# need to set EXTENSION_MAPPING for the extension otherwise the files are not
+# properly processed by doxygen.
+
+FILTER_PATTERNS        =
+
+# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using
+# INPUT_FILTER) will also be used to filter the input files that are used for
+# producing the source files to browse (i.e. when SOURCE_BROWSER is set to YES).
+# The default value is: NO.
+
+FILTER_SOURCE_FILES    = NO
+
+# The FILTER_SOURCE_PATTERNS tag can be used to specify source filters per file
+# pattern. A pattern will override the setting for FILTER_PATTERNS (if any) and
+# it is also possible to disable source filtering for a specific pattern using
+# *.ext= (so without naming a filter).
+# This tag requires that the tag FILTER_SOURCE_FILES is set to YES.
+
+FILTER_SOURCE_PATTERNS =
+
+# If the USE_MDFILE_AS_MAINPAGE tag refers to the name of a markdown file that
+# is part of the input, its contents will be placed on the main page
+# (index.html). This can be useful if you have a project on for instance GitHub
+# and want to reuse the introduction page also for the doxygen output.
+
+USE_MDFILE_AS_MAINPAGE = ../README.md
+
+#---------------------------------------------------------------------------
+# Configuration options related to source browsing
+#---------------------------------------------------------------------------
+
+# If the SOURCE_BROWSER tag is set to YES then a list of source files will be
+# generated. Documented entities will be cross-referenced with these sources.
+#
+# Note: To get rid of all source code in the generated output, make sure that
+# also VERBATIM_HEADERS is set to NO.
+# The default value is: NO.
+
+SOURCE_BROWSER         = YES
+
+# Setting the INLINE_SOURCES tag to YES will include the body of functions,
+# classes and enums directly into the documentation.
+# The default value is: NO.
+
+INLINE_SOURCES         = NO
+
+# Setting the STRIP_CODE_COMMENTS tag to YES will instruct doxygen to hide any
+# special comment blocks from generated source code fragments. Normal C, C++ and
+# Fortran comments will always remain visible.
+# The default value is: YES.
+
+STRIP_CODE_COMMENTS    = NO
+
+# If the REFERENCED_BY_RELATION tag is set to YES then for each documented
+# entity all documented functions referencing it will be listed.
+# The default value is: NO.
+
+REFERENCED_BY_RELATION = NO
+
+# If the REFERENCES_RELATION tag is set to YES then for each documented function
+# all documented entities called/used by that function will be listed.
+# The default value is: NO.
+
+REFERENCES_RELATION    = NO
+
+# If the REFERENCES_LINK_SOURCE tag is set to YES and SOURCE_BROWSER tag is set
+# to YES then the hyperlinks from functions in REFERENCES_RELATION and
+# REFERENCED_BY_RELATION lists will link to the source code. Otherwise they will
+# link to the documentation.
+# The default value is: YES.
+
+REFERENCES_LINK_SOURCE = YES
+
+# If SOURCE_TOOLTIPS is enabled (the default) then hovering a hyperlink in the
+# source code will show a tooltip with additional information such as prototype,
+# brief description and links to the definition and documentation. Since this
+# will make the HTML file larger and loading of large files a bit slower, you
+# can opt to disable this feature.
+# The default value is: YES.
+# This tag requires that the tag SOURCE_BROWSER is set to YES.
+
+SOURCE_TOOLTIPS        = YES
+
+# If the USE_HTAGS tag is set to YES then the references to source code will
+# point to the HTML generated by the htags(1) tool instead of doxygen built-in
+# source browser. The htags tool is part of GNU's global source tagging system
+# (see https://www.gnu.org/software/global/global.html). You will need version
+# 4.8.6 or higher.
+#
+# To use it do the following:
+# - Install the latest version of global
+# - Enable SOURCE_BROWSER and USE_HTAGS in the configuration file
+# - Make sure the INPUT points to the root of the source tree
+# - Run doxygen as normal
+#
+# Doxygen will invoke htags (and that will in turn invoke gtags), so these
+# tools must be available from the command line (i.e. in the search path).
+#
+# The result: instead of the source browser generated by doxygen, the links to
+# source code will now point to the output of htags.
+# The default value is: NO.
+# This tag requires that the tag SOURCE_BROWSER is set to YES.
+
+USE_HTAGS              = NO
+
+# If the VERBATIM_HEADERS tag is set to YES then doxygen will generate a
+# verbatim copy of the header file for each class for which an include is
+# specified. Set to NO to disable this.
+# See also: Section \class.
+# The default value is: YES.
+
+VERBATIM_HEADERS       = YES
+
+# If the CLANG_ASSISTED_PARSING tag is set to YES then doxygen will use the
+# clang parser (see: http://clang.llvm.org/) for more accurate parsing at the
+# cost of reduced performance. This can be particularly helpful with template
+# rich C++ code for which doxygen's built-in parser lacks the necessary type
+# information.
+# Note: The availability of this option depends on whether or not doxygen was
+# generated with the -Duse_libclang=ON option for CMake.
+# The default value is: NO.
+
+CLANG_ASSISTED_PARSING = NO
+
+# If clang assisted parsing is enabled you can provide the compiler with command
+# line options that you would normally use when invoking the compiler. Note that
+# the include paths will already be set by doxygen for the files and directories
+# specified with INPUT and INCLUDE_PATH.
+# This tag requires that the tag CLANG_ASSISTED_PARSING is set to YES.
+
+CLANG_OPTIONS          =
+
+# If clang assisted parsing is enabled you can provide the clang parser with the
+# path to the compilation database (see:
+# http://clang.llvm.org/docs/HowToSetupToolingForLLVM.html) used when the files
+# were built. This is equivalent to specifying the "-p" option to a clang tool,
+# such as clang-check. These options will then be passed to the parser.
+# Note: The availability of this option depends on whether or not doxygen was
+# generated with the -Duse_libclang=ON option for CMake.
+
+CLANG_DATABASE_PATH    =
+
+#---------------------------------------------------------------------------
+# Configuration options related to the alphabetical class index
+#---------------------------------------------------------------------------
+
+# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index of all
+# compounds will be generated. Enable this if the project contains a lot of
+# classes, structs, unions or interfaces.
+# The default value is: YES.
+
+ALPHABETICAL_INDEX     = NO
+
+# The COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns in
+# which the alphabetical index list will be split.
+# Minimum value: 1, maximum value: 20, default value: 5.
+# This tag requires that the tag ALPHABETICAL_INDEX is set to YES.
+
+COLS_IN_ALPHA_INDEX    = 5
+
+# In case all classes in a project start with a common prefix, all classes will
+# be put under the same header in the alphabetical index. The IGNORE_PREFIX tag
+# can be used to specify a prefix (or a list of prefixes) that should be ignored
+# while generating the index headers.
+# This tag requires that the tag ALPHABETICAL_INDEX is set to YES.
+
+IGNORE_PREFIX          =
+
+#---------------------------------------------------------------------------
+# Configuration options related to the HTML output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_HTML tag is set to YES, doxygen will generate HTML output.
+# The default value is: YES.
+
+GENERATE_HTML          = YES
+
+# The HTML_OUTPUT tag is used to specify where the HTML docs will be put. If a
+# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
+# it.
+# The default directory is: html.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_OUTPUT            = html
+
+# The HTML_FILE_EXTENSION tag can be used to specify the file extension for each
+# generated HTML page (for example: .htm, .php, .asp).
+# The default value is: .html.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_FILE_EXTENSION    = .html
+
+# The HTML_HEADER tag can be used to specify a user-defined HTML header file for
+# each generated HTML page. If the tag is left blank doxygen will generate a
+# standard header.
+#
+# To get valid HTML the header file must include any scripts and style sheets
+# that doxygen needs, which depend on the configuration options used (e.g.
+# the setting GENERATE_TREEVIEW). It is highly recommended to start with a
+# default header using
+# doxygen -w html new_header.html new_footer.html new_stylesheet.css
+# YourConfigFile
+# and then modify the file new_header.html. See also section "Doxygen usage"
+# for information on how to generate the default header that doxygen normally
+# uses.
+# Note: The header is subject to change so you typically have to regenerate the
+# default header when upgrading to a newer version of doxygen. For a description
+# of the possible markers and block names see the documentation.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_HEADER            =
+
+# The HTML_FOOTER tag can be used to specify a user-defined HTML footer for each
+# generated HTML page. If the tag is left blank doxygen will generate a standard
+# footer. See HTML_HEADER for more information on how to generate a default
+# footer and what special commands can be used inside the footer. See also
+# section "Doxygen usage" for information on how to generate the default footer
+# that doxygen normally uses.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_FOOTER            =
+
+# The HTML_STYLESHEET tag can be used to specify a user-defined cascading style
+# sheet that is used by each HTML page. It can be used to fine-tune the look of
+# the HTML output. If left blank doxygen will generate a default style sheet.
+# See also section "Doxygen usage" for information on how to generate the style
+# sheet that doxygen normally uses.
+# Note: It is recommended to use HTML_EXTRA_STYLESHEET instead of this tag, as
+# it is more robust and this tag (HTML_STYLESHEET) will in the future become
+# obsolete.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_STYLESHEET        =
+
+# The HTML_EXTRA_STYLESHEET tag can be used to specify additional user-defined
+# cascading style sheets that are included after the standard style sheets
+# created by doxygen. Using this option one can overrule certain style aspects.
+# This is preferred over using HTML_STYLESHEET since it does not replace the
+# standard style sheet and is therefore more robust against future updates.
+# Doxygen will copy the style sheet files to the output directory.
+# Note: The order of the extra style sheet files is of importance (e.g. the last
+# style sheet in the list overrules the setting of the previous ones in the
+# list). For an example see the documentation.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_EXTRA_STYLESHEET  =
+
+# The HTML_EXTRA_FILES tag can be used to specify one or more extra images or
+# other source files which should be copied to the HTML output directory. Note
+# that these files will be copied to the base HTML output directory. Use the
+# $relpath^ marker in the HTML_HEADER and/or HTML_FOOTER files to load these
+# files. In the HTML_STYLESHEET file, use the file name only. Also note that the
+# files will be copied as-is; there are no commands or markers available.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_EXTRA_FILES       =
+
+# The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. Doxygen
+# will adjust the colors in the style sheet and background images according to
+# this color. Hue is specified as an angle on a colorwheel, see
+# https://en.wikipedia.org/wiki/Hue for more information. For instance the value
+# 0 represents red, 60 is yellow, 120 is green, 180 is cyan, 240 is blue, 300
+# purple, and 360 is red again.
+# Minimum value: 0, maximum value: 359, default value: 220.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_COLORSTYLE_HUE    = 220
+
+# The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of the colors
+# in the HTML output. For a value of 0 the output will use grayscales only. A
+# value of 255 will produce the most vivid colors.
+# Minimum value: 0, maximum value: 255, default value: 100.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_COLORSTYLE_SAT    = 100
+
+# The HTML_COLORSTYLE_GAMMA tag controls the gamma correction applied to the
+# luminance component of the colors in the HTML output. Values below 100
+# gradually make the output lighter, whereas values above 100 make the output
+# darker. The value divided by 100 is the actual gamma applied, so 80 represents
+# a gamma of 0.8, the value 220 represents a gamma of 2.2, and 100 does not
+# change the gamma.
+# Minimum value: 40, maximum value: 240, default value: 80.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_COLORSTYLE_GAMMA  = 80
+
+# If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML
+# page will contain the date and time when the page was generated. Setting this
+# to YES can help to show when doxygen was last run and thus if the
+# documentation is up to date.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_TIMESTAMP         = YES
+
+# If the HTML_DYNAMIC_MENUS tag is set to YES then the generated HTML
+# documentation will contain a main index with vertical navigation menus that
+# are dynamically created via Javascript. If disabled, the navigation index will
+# consist of multiple levels of tabs that are statically embedded in every HTML
+# page. Disable this option to support browsers that do not have Javascript,
+# like the Qt help browser.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_DYNAMIC_MENUS     = YES
+
+# If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML
+# documentation will contain sections that can be hidden and shown after the
+# page has loaded.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_DYNAMIC_SECTIONS  = YES
+
+# With HTML_INDEX_NUM_ENTRIES one can control the preferred number of entries
+# shown in the various tree structured indices initially; the user can expand
+# and collapse entries dynamically later on. Doxygen will expand the tree to
+# such a level that at most the specified number of entries are visible (unless
+# a fully collapsed tree already exceeds this amount). So setting the number of
+# entries 1 will produce a full collapsed tree by default. 0 is a special value
+# representing an infinite number of entries and will result in a full expanded
+# tree by default.
+# Minimum value: 0, maximum value: 9999, default value: 100.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_INDEX_NUM_ENTRIES = 100
+
+# If the GENERATE_DOCSET tag is set to YES, additional index files will be
+# generated that can be used as input for Apple's Xcode 3 integrated development
+# environment (see: https://developer.apple.com/xcode/), introduced with OSX
+# 10.5 (Leopard). To create a documentation set, doxygen will generate a
+# Makefile in the HTML output directory. Running make will produce the docset in
+# that directory and running make install will install the docset in
+# ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find it at
+# startup. For more information see:
+# https://developer.apple.com/library/archive/featuredarticles/DoxygenXcode/_index.html
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+GENERATE_DOCSET        = NO
+
+# This tag determines the name of the docset feed. A documentation feed provides
+# an umbrella under which multiple documentation sets from a single provider
+# (such as a company or product suite) can be grouped.
+# The default value is: Doxygen generated docs.
+# This tag requires that the tag GENERATE_DOCSET is set to YES.
+
+DOCSET_FEEDNAME        = "Doxygen generated docs"
+
+# This tag specifies a string that should uniquely identify the documentation
+# set bundle. This should be a reverse domain-name style string, e.g.
+# com.mycompany.MyDocSet. Doxygen will append .docset to the name.
+# The default value is: org.doxygen.Project.
+# This tag requires that the tag GENERATE_DOCSET is set to YES.
+
+DOCSET_BUNDLE_ID       = org.doxygen.Project
+
+# The DOCSET_PUBLISHER_ID tag specifies a string that should uniquely identify
+# the documentation publisher. This should be a reverse domain-name style
+# string, e.g. com.mycompany.MyDocSet.documentation.
+# The default value is: org.doxygen.Publisher.
+# This tag requires that the tag GENERATE_DOCSET is set to YES.
+
+DOCSET_PUBLISHER_ID    = org.doxygen.Publisher
+
+# The DOCSET_PUBLISHER_NAME tag identifies the documentation publisher.
+# The default value is: Publisher.
+# This tag requires that the tag GENERATE_DOCSET is set to YES.
+
+DOCSET_PUBLISHER_NAME  = Publisher
+
+# If the GENERATE_HTMLHELP tag is set to YES then doxygen generates three
+# additional HTML index files: index.hhp, index.hhc, and index.hhk. The
+# index.hhp is a project file that can be read by Microsoft's HTML Help Workshop
+# (see: https://www.microsoft.com/en-us/download/details.aspx?id=21138) on
+# Windows.
+#
+# The HTML Help Workshop contains a compiler that can convert all HTML output
+# generated by doxygen into a single compiled HTML file (.chm). Compiled HTML
+# files are now used as the Windows 98 help format, and will replace the old
+# Windows help format (.hlp) on all Windows platforms in the future. Compressed
+# HTML files also contain an index, a table of contents, and you can search for
+# words in the documentation. The HTML workshop also contains a viewer for
+# compressed HTML files.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+GENERATE_HTMLHELP      = NO
+
+# The CHM_FILE tag can be used to specify the file name of the resulting .chm
+# file. You can add a path in front of the file if the result should not be
+# written to the html output directory.
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
+
+CHM_FILE               =
+
+# The HHC_LOCATION tag can be used to specify the location (absolute path
+# including file name) of the HTML help compiler (hhc.exe). If non-empty,
+# doxygen will try to run the HTML help compiler on the generated index.hhp.
+# The file has to be specified with full path.
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
+
+HHC_LOCATION           =
+
+# The GENERATE_CHI flag controls if a separate .chi index file is generated
+# (YES) or that it should be included in the master .chm file (NO).
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
+
+GENERATE_CHI           = NO
+
+# The CHM_INDEX_ENCODING is used to encode HtmlHelp index (hhk), content (hhc)
+# and project file content.
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
+
+CHM_INDEX_ENCODING     =
+
+# The BINARY_TOC flag controls whether a binary table of contents is generated
+# (YES) or a normal table of contents (NO) in the .chm file. Furthermore it
+# enables the Previous and Next buttons.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
+
+BINARY_TOC             = NO
+
+# The TOC_EXPAND flag can be set to YES to add extra items for group members to
+# the table of contents of the HTML help documentation and to the tree view.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
+
+TOC_EXPAND             = NO
+
+# If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and
+# QHP_VIRTUAL_FOLDER are set, an additional index file will be generated that
+# can be used as input for Qt's qhelpgenerator to generate a Qt Compressed Help
+# (.qch) of the generated HTML documentation.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+GENERATE_QHP           = NO
+
+# If the QHG_LOCATION tag is specified, the QCH_FILE tag can be used to specify
+# the file name of the resulting .qch file. The path specified is relative to
+# the HTML output folder.
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QCH_FILE               =
+
+# The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help
+# Project output. For more information please see Qt Help Project / Namespace
+# (see: https://doc.qt.io/archives/qt-4.8/qthelpproject.html#namespace).
+# The default value is: org.doxygen.Project.
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QHP_NAMESPACE          = org.doxygen.Project
+
+# The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating Qt
+# Help Project output. For more information please see Qt Help Project / Virtual
+# Folders (see:
+# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#virtual-folders).
+# The default value is: doc.
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QHP_VIRTUAL_FOLDER     = doc
+
+# If the QHP_CUST_FILTER_NAME tag is set, it specifies the name of a custom
+# filter to add. For more information please see Qt Help Project / Custom
+# Filters (see:
+# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#custom-filters).
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QHP_CUST_FILTER_NAME   =
+
+# The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the
+# custom filter to add. For more information please see Qt Help Project / Custom
+# Filters (see:
+# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#custom-filters).
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QHP_CUST_FILTER_ATTRS  =
+
+# The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this
+# project's filter section matches. Qt Help Project / Filter Attributes (see:
+# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#filter-attributes).
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QHP_SECT_FILTER_ATTRS  =
+
+# The QHG_LOCATION tag can be used to specify the location of Qt's
+# qhelpgenerator. If non-empty doxygen will try to run qhelpgenerator on the
+# generated .qhp file.
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QHG_LOCATION           =
+
+# If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files will be
+# generated, together with the HTML files, they form an Eclipse help plugin. To
+# install this plugin and make it available under the help contents menu in
+# Eclipse, the contents of the directory containing the HTML and XML files needs
+# to be copied into the plugins directory of eclipse. The name of the directory
+# within the plugins directory should be the same as the ECLIPSE_DOC_ID value.
+# After copying Eclipse needs to be restarted before the help appears.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+GENERATE_ECLIPSEHELP   = NO
+
+# A unique identifier for the Eclipse help plugin. When installing the plugin
+# the directory name containing the HTML and XML files should also have this
+# name. Each documentation set should have its own identifier.
+# The default value is: org.doxygen.Project.
+# This tag requires that the tag GENERATE_ECLIPSEHELP is set to YES.
+
+ECLIPSE_DOC_ID         = org.doxygen.Project
+
+# If you want full control over the layout of the generated HTML pages it might
+# be necessary to disable the index and replace it with your own. The
+# DISABLE_INDEX tag can be used to turn on/off the condensed index (tabs) at top
+# of each HTML page. A value of NO enables the index and the value YES disables
+# it. Since the tabs in the index contain the same information as the navigation
+# tree, you can set this option to YES if you also set GENERATE_TREEVIEW to YES.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+DISABLE_INDEX          = NO
+
+# The GENERATE_TREEVIEW tag is used to specify whether a tree-like index
+# structure should be generated to display hierarchical information. If the tag
+# value is set to YES, a side panel will be generated containing a tree-like
+# index structure (just like the one that is generated for HTML Help). For this
+# to work a browser that supports JavaScript, DHTML, CSS and frames is required
+# (i.e. any modern browser). Windows users are probably better off using the
+# HTML help feature. Via custom style sheets (see HTML_EXTRA_STYLESHEET) one can
+# further fine-tune the look of the index. As an example, the default style
+# sheet generated by doxygen has an example that shows how to put an image at
+# the root of the tree instead of the PROJECT_NAME. Since the tree basically has
+# the same information as the tab index, you could consider setting
+# DISABLE_INDEX to YES when enabling this option.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+GENERATE_TREEVIEW      = YES
+
+# The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values that
+# doxygen will group on one line in the generated HTML documentation.
+#
+# Note that a value of 0 will completely suppress the enum values from appearing
+# in the overview section.
+# Minimum value: 0, maximum value: 20, default value: 4.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+ENUM_VALUES_PER_LINE   = 1
+
+# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be used
+# to set the initial width (in pixels) of the frame in which the tree is shown.
+# Minimum value: 0, maximum value: 1500, default value: 250.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+TREEVIEW_WIDTH         = 250
+
+# If the EXT_LINKS_IN_WINDOW option is set to YES, doxygen will open links to
+# external symbols imported via tag files in a separate window.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+EXT_LINKS_IN_WINDOW    = NO
+
+# Use this tag to change the font size of LaTeX formulas included as images in
+# the HTML documentation. When you change the font size after a successful
+# doxygen run you need to manually remove any form_*.png images from the HTML
+# output directory to force them to be regenerated.
+# Minimum value: 8, maximum value: 50, default value: 10.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+FORMULA_FONTSIZE       = 10
+
+# Use the FORMULA_TRANSPARENT tag to determine whether or not the images
+# generated for formulas are transparent PNGs. Transparent PNGs are not
+# supported properly for IE 6.0, but are supported on all modern browsers.
+#
+# Note that when changing this option you need to delete any form_*.png files in
+# the HTML output directory before the changes have effect.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+FORMULA_TRANSPARENT    = YES
+
+# Enable the USE_MATHJAX option to render LaTeX formulas using MathJax (see
+# https://www.mathjax.org) which uses client side Javascript for the rendering
+# instead of using pre-rendered bitmaps. Use this if you do not have LaTeX
+# installed or if you want the formulas to look prettier in the HTML output. When
+# enabled you may also need to install MathJax separately and configure the path
+# to it using the MATHJAX_RELPATH option.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+USE_MATHJAX            = NO
+
+# When MathJax is enabled you can set the default output format to be used for
+# the MathJax output. See the MathJax site (see:
+# http://docs.mathjax.org/en/latest/output.html) for more details.
+# Possible values are: HTML-CSS (which is slower, but has the best
+# compatibility), NativeMML (i.e. MathML) and SVG.
+# The default value is: HTML-CSS.
+# This tag requires that the tag USE_MATHJAX is set to YES.
+
+MATHJAX_FORMAT         = HTML-CSS
+
+# When MathJax is enabled you need to specify the location relative to the HTML
+# output directory using the MATHJAX_RELPATH option. The destination directory
+# should contain the MathJax.js script. For instance, if the mathjax directory
+# is located at the same level as the HTML output directory, then
+# MATHJAX_RELPATH should be ../mathjax. The default value points to the MathJax
+# Content Delivery Network so you can quickly see the result without installing
+# MathJax. However, it is strongly recommended to install a local copy of
+# MathJax from https://www.mathjax.org before deployment.
+# The default value is: https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/.
+# This tag requires that the tag USE_MATHJAX is set to YES.
+
+MATHJAX_RELPATH        = https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/
+
+# The MATHJAX_EXTENSIONS tag can be used to specify one or more MathJax
+# extension names that should be enabled during MathJax rendering. For example
+# MATHJAX_EXTENSIONS = TeX/AMSmath TeX/AMSsymbols
+# This tag requires that the tag USE_MATHJAX is set to YES.
+
+MATHJAX_EXTENSIONS     =
+
+# The MATHJAX_CODEFILE tag can be used to specify a file with javascript pieces
+# of code that will be used on startup of the MathJax code. See the MathJax site
+# (see: http://docs.mathjax.org/en/latest/output.html) for more details. For an
+# example see the documentation.
+# This tag requires that the tag USE_MATHJAX is set to YES.
+
+MATHJAX_CODEFILE       =
+
+# When the SEARCHENGINE tag is enabled doxygen will generate a search box for
+# the HTML output. The underlying search engine uses javascript and DHTML and
+# should work on any modern browser. Note that when using HTML help
+# (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets (GENERATE_DOCSET)
+# there is already a search function so this one should typically be disabled.
+# For large projects the javascript based search engine can be slow, then
+# enabling SERVER_BASED_SEARCH may provide a better solution. It is possible to
+# search using the keyboard; to jump to the search box use <access key> + S
+# (what the <access key> is depends on the OS and browser, but it is typically
+# <CTRL>, <ALT>/<option>, or both). Inside the search box use the <cursor down
+# key> to jump into the search results window, the results can be navigated
+# using the <cursor keys>. Press <Enter> to select an item or <escape> to cancel
+# the search. The filter options can be selected when the cursor is inside the
+# search box by pressing <Shift>+<cursor down>. Also here use the <cursor keys>
+# to select a filter and <Enter> or <escape> to activate or cancel the filter
+# option.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+SEARCHENGINE           = YES
+
+# When the SERVER_BASED_SEARCH tag is enabled the search engine will be
+# implemented using a web server instead of a web client using Javascript. There
+# are two flavors of web server based searching depending on the EXTERNAL_SEARCH
+# setting. When disabled, doxygen will generate a PHP script for searching and
+# an index file used by the script. When EXTERNAL_SEARCH is enabled the indexing
+# and searching needs to be provided by external tools. See the section
+# "External Indexing and Searching" for details.
+# The default value is: NO.
+# This tag requires that the tag SEARCHENGINE is set to YES.
+
+SERVER_BASED_SEARCH    = NO
+
+# When EXTERNAL_SEARCH tag is enabled doxygen will no longer generate the PHP
+# script for searching. Instead the search results are written to an XML file
+# which needs to be processed by an external indexer. Doxygen will invoke an
+# external search engine pointed to by the SEARCHENGINE_URL option to obtain the
+# search results.
+#
+# Doxygen ships with an example indexer (doxyindexer) and search engine
+# (doxysearch.cgi) which are based on the open source search engine library
+# Xapian (see: https://xapian.org/).
+#
+# See the section "External Indexing and Searching" for details.
+# The default value is: NO.
+# This tag requires that the tag SEARCHENGINE is set to YES.
+
+EXTERNAL_SEARCH        = NO
+
+# The SEARCHENGINE_URL should point to a search engine hosted by a web server
+# which will return the search results when EXTERNAL_SEARCH is enabled.
+#
+# Doxygen ships with an example indexer (doxyindexer) and search engine
+# (doxysearch.cgi) which are based on the open source search engine library
+# Xapian (see: https://xapian.org/). See the section "External Indexing and
+# Searching" for details.
+# This tag requires that the tag SEARCHENGINE is set to YES.
+
+SEARCHENGINE_URL       =
+
+# When SERVER_BASED_SEARCH and EXTERNAL_SEARCH are both enabled the unindexed
+# search data is written to a file for indexing by an external tool. With the
+# SEARCHDATA_FILE tag the name of this file can be specified.
+# The default file is: searchdata.xml.
+# This tag requires that the tag SEARCHENGINE is set to YES.
+
+SEARCHDATA_FILE        = searchdata.xml
+
+# When SERVER_BASED_SEARCH and EXTERNAL_SEARCH are both enabled the
+# EXTERNAL_SEARCH_ID tag can be used as an identifier for the project. This is
+# useful in combination with EXTRA_SEARCH_MAPPINGS to search through multiple
+# projects and redirect the results back to the right project.
+# This tag requires that the tag SEARCHENGINE is set to YES.
+
+EXTERNAL_SEARCH_ID     =
+
+# The EXTRA_SEARCH_MAPPINGS tag can be used to enable searching through doxygen
+# projects other than the one defined by this configuration file, but that are
+# all added to the same external search index. Each project needs to have a
+# unique id set via EXTERNAL_SEARCH_ID. The search mapping then maps that id
+# to a relative location where the documentation can be found. The format is:
+# EXTRA_SEARCH_MAPPINGS = tagname1=loc1 tagname2=loc2 ...
+# This tag requires that the tag SEARCHENGINE is set to YES.
+
+EXTRA_SEARCH_MAPPINGS  =
+
+#---------------------------------------------------------------------------
+# Configuration options related to the LaTeX output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_LATEX tag is set to YES, doxygen will generate LaTeX output.
+# The default value is: YES.
+
+GENERATE_LATEX         = NO
+
+# The LATEX_OUTPUT tag is used to specify where the LaTeX docs will be put. If a
+# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
+# it.
+# The default directory is: latex.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_OUTPUT           = latex
+
+# The LATEX_CMD_NAME tag can be used to specify the LaTeX command name to be
+# invoked.
+#
+# Note that when not enabling USE_PDFLATEX the default is latex; when enabling
+# USE_PDFLATEX the default is pdflatex, and if in the latter case latex is
+# chosen it is overridden by pdflatex. For specific output languages the
+# default may have been set differently; this depends on the implementation of
+# the output language.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_CMD_NAME         = latex
+
+# The MAKEINDEX_CMD_NAME tag can be used to specify the command name to generate
+# index for LaTeX.
+# Note: This tag is used in the Makefile / make.bat.
+# See also: LATEX_MAKEINDEX_CMD for the part in the generated output file
+# (.tex).
+# The default file is: makeindex.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+MAKEINDEX_CMD_NAME     = makeindex
+
+# The LATEX_MAKEINDEX_CMD tag can be used to specify the command name to
+# generate index for LaTeX. In case there is no backslash (\) as first character
+# it will be automatically added in the LaTeX code.
+# Note: This tag is used in the generated output file (.tex).
+# See also: MAKEINDEX_CMD_NAME for the part in the Makefile / make.bat.
+# The default value is: makeindex.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_MAKEINDEX_CMD    = makeindex
+
+# If the COMPACT_LATEX tag is set to YES, doxygen generates more compact LaTeX
+# documents. This may be useful for small projects and may help to save some
+# trees in general.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+COMPACT_LATEX          = NO
+
+# The PAPER_TYPE tag can be used to set the paper type that is used by the
+# printer.
+# Possible values are: a4 (210 x 297 mm), letter (8.5 x 11 inches), legal (8.5 x
+# 14 inches) and executive (7.25 x 10.5 inches).
+# The default value is: a4.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+PAPER_TYPE             = a4
+
+# The EXTRA_PACKAGES tag can be used to specify one or more LaTeX package names
+# that should be included in the LaTeX output. The package can be specified just
+# by its name or with the correct syntax as to be used with the LaTeX
+# \usepackage command. To get the times font for instance you can specify:
+# EXTRA_PACKAGES=times or EXTRA_PACKAGES={times}
+# To use the option intlimits with the amsmath package you can specify:
+# EXTRA_PACKAGES=[intlimits]{amsmath}
+# If left blank no extra packages will be included.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+EXTRA_PACKAGES         =
+
+# The LATEX_HEADER tag can be used to specify a personal LaTeX header for the
+# generated LaTeX document. The header should contain everything until the first
+# chapter. If it is left blank doxygen will generate a standard header. See
+# section "Doxygen usage" for information on how to let doxygen write the
+# default header to a separate file.
+#
+# Note: Only use a user-defined header if you know what you are doing! The
+# following commands have a special meaning inside the header: $title,
+# $datetime, $date, $doxygenversion, $projectname, $projectnumber,
+# $projectbrief, $projectlogo. Doxygen will replace $title with the empty
+# string, for the replacement values of the other commands the user is referred
+# to HTML_HEADER.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_HEADER           =
+
+# The LATEX_FOOTER tag can be used to specify a personal LaTeX footer for the
+# generated LaTeX document. The footer should contain everything after the last
+# chapter. If it is left blank doxygen will generate a standard footer. See
+# LATEX_HEADER for more information on how to generate a default footer and what
+# special commands can be used inside the footer.
+#
+# Note: Only use a user-defined footer if you know what you are doing!
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_FOOTER           =
+
+# The LATEX_EXTRA_STYLESHEET tag can be used to specify additional user-defined
+# LaTeX style sheets that are included after the standard style sheets created
+# by doxygen. Using this option one can overrule certain style aspects. Doxygen
+# will copy the style sheet files to the output directory.
+# Note: The order of the extra style sheet files is of importance (e.g. the last
+# style sheet in the list overrules the setting of the previous ones in the
+# list).
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_EXTRA_STYLESHEET =
+
+# The LATEX_EXTRA_FILES tag can be used to specify one or more extra images or
+# other source files which should be copied to the LATEX_OUTPUT output
+# directory. Note that the files will be copied as-is; there are no commands or
+# markers available.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_EXTRA_FILES      =
+
+# If the PDF_HYPERLINKS tag is set to YES, the LaTeX that is generated is
+# prepared for conversion to PDF (using ps2pdf or pdflatex). The PDF file will
+# contain links (just like the HTML output) instead of page references. This
+# makes the output suitable for online browsing using a PDF viewer.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+PDF_HYPERLINKS         = YES
+
+# If the USE_PDFLATEX tag is set to YES, doxygen will use pdflatex to generate
+# the PDF file directly from the LaTeX files. Set this option to YES, to get a
+# higher quality PDF documentation.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+USE_PDFLATEX           = YES
+
+# If the LATEX_BATCHMODE tag is set to YES, doxygen will add the \batchmode
+# command to the generated LaTeX files. This will instruct LaTeX to keep running
+# if errors occur, instead of asking the user for help. This option is also used
+# when generating formulas in HTML.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_BATCHMODE        = NO
+
+# If the LATEX_HIDE_INDICES tag is set to YES then doxygen will not include the
+# index chapters (such as File Index, Compound Index, etc.) in the output.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_HIDE_INDICES     = NO
+
+# If the LATEX_SOURCE_CODE tag is set to YES then doxygen will include source
+# code with syntax highlighting in the LaTeX output.
+#
+# Note that which sources are shown also depends on other settings such as
+# SOURCE_BROWSER.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_SOURCE_CODE      = NO
+
+# The LATEX_BIB_STYLE tag can be used to specify the style to use for the
+# bibliography, e.g. plainnat, or ieeetr. See
+# https://en.wikipedia.org/wiki/BibTeX and \cite for more info.
+# The default value is: plain.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_BIB_STYLE        = plain
+
+# If the LATEX_TIMESTAMP tag is set to YES then the footer of each generated
+# page will contain the date and time when the page was generated. Setting this
+# to NO can help when comparing the output of multiple runs.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_TIMESTAMP        = NO
+
+# The LATEX_EMOJI_DIRECTORY tag is used to specify the (relative or absolute)
+# path from which the emoji images will be read. If a relative path is entered,
+# it will be relative to the LATEX_OUTPUT directory. If left blank the
+# LATEX_OUTPUT directory will be used.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_EMOJI_DIRECTORY  =
+
+#---------------------------------------------------------------------------
+# Configuration options related to the RTF output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_RTF tag is set to YES, doxygen will generate RTF output. The
+# RTF output is optimized for Word 97 and may not look too pretty with other RTF
+# readers/editors.
+# The default value is: NO.
+
+GENERATE_RTF           = NO
+
+# The RTF_OUTPUT tag is used to specify where the RTF docs will be put. If a
+# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
+# it.
+# The default directory is: rtf.
+# This tag requires that the tag GENERATE_RTF is set to YES.
+
+RTF_OUTPUT             = rtf
+
+# If the COMPACT_RTF tag is set to YES, doxygen generates more compact RTF
+# documents. This may be useful for small projects and may help to save some
+# trees in general.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_RTF is set to YES.
+
+COMPACT_RTF            = NO
+
+# If the RTF_HYPERLINKS tag is set to YES, the RTF that is generated will
+# contain hyperlink fields. The RTF file will contain links (just like the HTML
+# output) instead of page references. This makes the output suitable for online
+# browsing using Word or some other Word compatible readers that support those
+# fields.
+#
+# Note: WordPad (write) and others do not support links.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_RTF is set to YES.
+
+RTF_HYPERLINKS         = NO
+
+# Load stylesheet definitions from file. Syntax is similar to doxygen's
+# configuration file, i.e. a series of assignments. You only have to provide
+# replacements, missing definitions are set to their default value.
+#
+# See also section "Doxygen usage" for information on how to generate the
+# default style sheet that doxygen normally uses.
+# This tag requires that the tag GENERATE_RTF is set to YES.
+
+RTF_STYLESHEET_FILE    =
+
+# Set optional variables used in the generation of an RTF document. Syntax is
+# similar to doxygen's configuration file. A template extensions file can be
+# generated using doxygen -e rtf extensionFile.
+# This tag requires that the tag GENERATE_RTF is set to YES.
+
+RTF_EXTENSIONS_FILE    =
+
+# If the RTF_SOURCE_CODE tag is set to YES then doxygen will include source code
+# with syntax highlighting in the RTF output.
+#
+# Note that which sources are shown also depends on other settings such as
+# SOURCE_BROWSER.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_RTF is set to YES.
+
+RTF_SOURCE_CODE        = NO
+
+#---------------------------------------------------------------------------
+# Configuration options related to the man page output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_MAN tag is set to YES, doxygen will generate man pages for
+# classes and files.
+# The default value is: NO.
+
+GENERATE_MAN           = NO
+
+# The MAN_OUTPUT tag is used to specify where the man pages will be put. If a
+# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
+# it. A directory man3 will be created inside the directory specified by
+# MAN_OUTPUT.
+# The default directory is: man.
+# This tag requires that the tag GENERATE_MAN is set to YES.
+
+MAN_OUTPUT             = man
+
+# The MAN_EXTENSION tag determines the extension that is added to the generated
+# man pages. In case the manual section does not start with a number, the number
+# 3 is prepended. The dot (.) at the beginning of the MAN_EXTENSION tag is
+# optional.
+# The default value is: .3.
+# This tag requires that the tag GENERATE_MAN is set to YES.
+
+MAN_EXTENSION          = .3
+
+# The MAN_SUBDIR tag determines the name of the directory created within
+# MAN_OUTPUT in which the man pages are placed. It defaults to man followed by
+# MAN_EXTENSION with the initial . removed.
+# This tag requires that the tag GENERATE_MAN is set to YES.
+
+MAN_SUBDIR             =
+
+# If the MAN_LINKS tag is set to YES and doxygen generates man output, then it
+# will generate one additional man file for each entity documented in the real
+# man page(s). These additional files only source the real man page, but without
+# them the man command would be unable to find the correct page.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_MAN is set to YES.
+
+MAN_LINKS              = NO
+
+#---------------------------------------------------------------------------
+# Configuration options related to the XML output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_XML tag is set to YES, doxygen will generate an XML file that
+# captures the structure of the code including all documentation.
+# The default value is: NO.
+
+GENERATE_XML           = YES
+
+# The XML_OUTPUT tag is used to specify where the XML pages will be put. If a
+# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
+# it.
+# The default directory is: xml.
+# This tag requires that the tag GENERATE_XML is set to YES.
+
+XML_OUTPUT             = xml
+
+# If the XML_PROGRAMLISTING tag is set to YES, doxygen will dump the program
+# listings (including syntax highlighting and cross-referencing information) to
+# the XML output. Note that enabling this will significantly increase the size
+# of the XML output.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_XML is set to YES.
+
+XML_PROGRAMLISTING     = YES
+
+# If the XML_NS_MEMB_FILE_SCOPE tag is set to YES, doxygen will include
+# namespace members in file scope as well, matching the HTML output.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_XML is set to YES.
+
+XML_NS_MEMB_FILE_SCOPE = NO
+
+#---------------------------------------------------------------------------
+# Configuration options related to the DOCBOOK output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_DOCBOOK tag is set to YES, doxygen will generate Docbook files
+# that can be used to generate PDF.
+# The default value is: NO.
+
+GENERATE_DOCBOOK       = NO
+
+# The DOCBOOK_OUTPUT tag is used to specify where the Docbook pages will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be put in
+# front of it.
+# The default directory is: docbook.
+# This tag requires that the tag GENERATE_DOCBOOK is set to YES.
+
+DOCBOOK_OUTPUT         = docbook
+
+# If the DOCBOOK_PROGRAMLISTING tag is set to YES, doxygen will include the
+# program listings (including syntax highlighting and cross-referencing
+# information) in the DOCBOOK output. Note that enabling this will significantly
+# increase the size of the DOCBOOK output.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_DOCBOOK is set to YES.
+
+DOCBOOK_PROGRAMLISTING = NO
+
+#---------------------------------------------------------------------------
+# Configuration options for the AutoGen Definitions output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_AUTOGEN_DEF tag is set to YES, doxygen will generate an
+# AutoGen Definitions (see http://autogen.sourceforge.net/) file that captures
+# the structure of the code including all documentation. Note that this feature
+# is still experimental and incomplete at the moment.
+# The default value is: NO.
+
+GENERATE_AUTOGEN_DEF   = NO
+
+#---------------------------------------------------------------------------
+# Configuration options related to the Perl module output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_PERLMOD tag is set to YES, doxygen will generate a Perl module
+# file that captures the structure of the code including all documentation.
+#
+# Note that this feature is still experimental and incomplete at the moment.
+# The default value is: NO.
+
+GENERATE_PERLMOD       = NO
+
+# If the PERLMOD_LATEX tag is set to YES, doxygen will generate the necessary
+# Makefile rules, Perl scripts and LaTeX code to be able to generate PDF and DVI
+# output from the Perl module output.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_PERLMOD is set to YES.
+
+PERLMOD_LATEX          = NO
+
+# If the PERLMOD_PRETTY tag is set to YES, the Perl module output will be nicely
+# formatted so it can be parsed by a human reader. This is useful if you want to
+# understand what is going on. On the other hand, if this tag is set to NO, the
+# size of the Perl module output will be much smaller and Perl will parse it
+# just the same.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_PERLMOD is set to YES.
+
+PERLMOD_PRETTY         = YES
+
+# The names of the make variables in the generated doxyrules.make file are
+# prefixed with the string contained in PERLMOD_MAKEVAR_PREFIX. This is useful
+# so different doxyrules.make files included by the same Makefile don't
+# overwrite each other's variables.
+# This tag requires that the tag GENERATE_PERLMOD is set to YES.
+
+PERLMOD_MAKEVAR_PREFIX =
+
+#---------------------------------------------------------------------------
+# Configuration options related to the preprocessor
+#---------------------------------------------------------------------------
+
+# If the ENABLE_PREPROCESSING tag is set to YES, doxygen will evaluate all
+# C-preprocessor directives found in the sources and include files.
+# The default value is: YES.
+
+ENABLE_PREPROCESSING   = YES
+
+# If the MACRO_EXPANSION tag is set to YES, doxygen will expand all macro names
+# in the source code. If set to NO, only conditional compilation will be
+# performed. Macro expansion can be done in a controlled way by setting
+# EXPAND_ONLY_PREDEF to YES.
+# The default value is: NO.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
+
+MACRO_EXPANSION        = YES
+
+# If the EXPAND_ONLY_PREDEF and MACRO_EXPANSION tags are both set to YES then
+# the macro expansion is limited to the macros specified with the PREDEFINED and
+# EXPAND_AS_DEFINED tags.
+# The default value is: NO.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
+
+EXPAND_ONLY_PREDEF     = NO
+
+# If the SEARCH_INCLUDES tag is set to YES, the include files in the
+# INCLUDE_PATH will be searched if a #include is found.
+# The default value is: YES.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
+
+SEARCH_INCLUDES        = YES
+
+# The INCLUDE_PATH tag can be used to specify one or more directories that
+# contain include files that are not input files but should be processed by the
+# preprocessor.
+# This tag requires that the tag SEARCH_INCLUDES is set to YES.
+
+INCLUDE_PATH           =
+
+# You can use the INCLUDE_FILE_PATTERNS tag to specify one or more wildcard
+# patterns (like *.h and *.hpp) to filter out the header-files in the
+# directories. If left blank, the patterns specified with FILE_PATTERNS will be
+# used.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
+
+INCLUDE_FILE_PATTERNS  =
+
+# The PREDEFINED tag can be used to specify one or more macro names that are
+# defined before the preprocessor is started (similar to the -D option of e.g.
+# gcc). The argument of the tag is a list of macros of the form: name or
+# name=definition (no spaces). If the definition and the "=" are omitted, "=1"
+# is assumed. To prevent a macro definition from being undefined via #undef or
+# recursively expanded use the := operator instead of the = operator.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
+
+PREDEFINED             = ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED \
+                         ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED \
+                         ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLED \
+                         ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLED \
+                         ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLED \
+                         ALPAKA_ACC_ANY_BT_OMP5_ENABLED \
+                         ALPAKA_ACC_GPU_CUDA_ENABLED \
+                         __CUDACC__ \
+                         _OPENMP=201307
+
+# If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then this
+# tag can be used to specify a list of macro names that should be expanded. The
+# macro definition that is found in the sources will be used. Use the PREDEFINED
+# tag if you want to use a different macro definition that overrules the
+# definition found in the source code.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
+
+EXPAND_AS_DEFINED      =
+
+# If the SKIP_FUNCTION_MACROS tag is set to YES then doxygen's preprocessor will
+# remove all references to function-like macros that are alone on a line, have
+# an all uppercase name, and do not end with a semicolon. Such function macros
+# are typically used for boiler-plate code, and will confuse the parser if not
+# removed.
+# The default value is: YES.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
+
+SKIP_FUNCTION_MACROS   = YES
+
+#---------------------------------------------------------------------------
+# Configuration options related to external references
+#---------------------------------------------------------------------------
+
+# The TAGFILES tag can be used to specify one or more tag files. For each tag
+# file the location of the external documentation should be added. The format of
+# a tag file without this location is as follows:
+# TAGFILES = file1 file2 ...
+# Adding location for the tag files is done as follows:
+# TAGFILES = file1=loc1 "file2 = loc2" ...
+# where loc1 and loc2 can be relative or absolute paths or URLs. See the
+# section "Linking to external documentation" for more information about the use
+# of tag files.
+# Note: Each tag file must have a unique name (where the name does NOT include
+# the path). If a tag file is not located in the directory in which doxygen is
+# run, you must also specify the path to the tagfile here.
+
+TAGFILES               =
+
+# When a file name is specified after GENERATE_TAGFILE, doxygen will create a
+# tag file that is based on the input files it reads. See section "Linking to
+# external documentation" for more information about the usage of tag files.
+
+GENERATE_TAGFILE       =
+
+# If the ALLEXTERNALS tag is set to YES, all external classes will be listed in
+# the class index. If set to NO, only the inherited external classes will be
+# listed.
+# The default value is: NO.
+
+ALLEXTERNALS           = NO
+
+# If the EXTERNAL_GROUPS tag is set to YES, all external groups will be listed
+# in the modules index. If set to NO, only the current project's groups will be
+# listed.
+# The default value is: YES.
+
+EXTERNAL_GROUPS        = YES
+
+# If the EXTERNAL_PAGES tag is set to YES, all external pages will be listed in
+# the related pages index. If set to NO, only the current project's pages will
+# be listed.
+# The default value is: YES.
+
+EXTERNAL_PAGES         = YES
+
+#---------------------------------------------------------------------------
+# Configuration options related to the dot tool
+#---------------------------------------------------------------------------
+
+# If the CLASS_DIAGRAMS tag is set to YES, doxygen will generate a class diagram
+# (in HTML and LaTeX) for classes with base or super classes. Setting the tag to
+# NO turns the diagrams off. Note that this option also works with HAVE_DOT
+# disabled, but it is recommended to install and use dot, since it yields more
+# powerful graphs.
+# The default value is: YES.
+
+CLASS_DIAGRAMS         = YES
+
+# You can include diagrams made with dia in doxygen documentation. Doxygen will
+# then run dia to produce the diagram and insert it in the documentation. The
+# DIA_PATH tag allows you to specify the directory where the dia binary resides.
+# If left empty dia is assumed to be found in the default search path.
+
+DIA_PATH               =
+
+# If set to YES the inheritance and collaboration graphs will hide inheritance
+# and usage relations if the target is undocumented or is not a class.
+# The default value is: YES.
+
+HIDE_UNDOC_RELATIONS   = YES
+
+# If you set the HAVE_DOT tag to YES then doxygen will assume the dot tool is
+# available from the path. This tool is part of Graphviz (see:
+# http://www.graphviz.org/), a graph visualization toolkit from AT&T and Lucent
+# Bell Labs. The other options in this section have no effect if this option is
+# set to NO.
+# The default value is: NO.
+
+HAVE_DOT               = NO
+
+# The DOT_NUM_THREADS specifies the number of dot invocations doxygen is allowed
+# to run in parallel. When set to 0 doxygen will base this on the number of
+# processors available in the system. You can set it explicitly to a value
+# larger than 0 to get control over the balance between CPU load and processing
+# speed.
+# Minimum value: 0, maximum value: 32, default value: 0.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_NUM_THREADS        = 0
+
+# When you want a differently looking font in the dot files that doxygen
+# generates you can specify the font name using DOT_FONTNAME. You need to make
+# sure dot is able to find the font, which can be done by putting it in a
+# standard location or by setting the DOTFONTPATH environment variable or by
+# setting DOT_FONTPATH to the directory containing the font.
+# The default value is: Helvetica.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_FONTNAME           = Helvetica
+
+# The DOT_FONTSIZE tag can be used to set the size (in points) of the font of
+# dot graphs.
+# Minimum value: 4, maximum value: 24, default value: 10.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_FONTSIZE           = 10
+
+# By default doxygen will tell dot to use the default font as specified with
+# DOT_FONTNAME. If you specify a different font using DOT_FONTNAME you can set
+# the path where dot can find it using this tag.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_FONTPATH           =
+
+# If the CLASS_GRAPH tag is set to YES then doxygen will generate a graph for
+# each documented class showing the direct and indirect inheritance relations.
+# Setting this tag to YES will force the CLASS_DIAGRAMS tag to NO.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+CLASS_GRAPH            = YES
+
+# If the COLLABORATION_GRAPH tag is set to YES then doxygen will generate a
+# graph for each documented class showing the direct and indirect implementation
+# dependencies (inheritance, containment, and class references variables) of the
+# class with other documented classes.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+COLLABORATION_GRAPH    = YES
+
+# If the GROUP_GRAPHS tag is set to YES then doxygen will generate a graph for
+# groups, showing the direct groups dependencies.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+GROUP_GRAPHS           = YES
+
+# If the UML_LOOK tag is set to YES, doxygen will generate inheritance and
+# collaboration diagrams in a style similar to the OMG's Unified Modeling
+# Language.
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+UML_LOOK               = NO
+
+# If the UML_LOOK tag is enabled, the fields and methods are shown inside the
+# class node. If there are many fields or methods and many nodes the graph may
+# become too big to be useful. The UML_LIMIT_NUM_FIELDS threshold limits the
+# number of items for each type to make the size more manageable. Set this to 0
+# for no limit. Note that the threshold may be exceeded by 50% before the limit
+# is enforced. So when you set the threshold to 10, up to 15 fields may appear,
+# but if the number exceeds 15, the total amount of fields shown is limited to
+# 10.
+# Minimum value: 0, maximum value: 100, default value: 10.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+UML_LIMIT_NUM_FIELDS   = 10
+
+# If the TEMPLATE_RELATIONS tag is set to YES then the inheritance and
+# collaboration graphs will show the relations between templates and their
+# instances.
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+TEMPLATE_RELATIONS     = NO
+
+# If the INCLUDE_GRAPH, ENABLE_PREPROCESSING and SEARCH_INCLUDES tags are set to
+# YES then doxygen will generate a graph for each documented file showing the
+# direct and indirect include dependencies of the file with other documented
+# files.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+INCLUDE_GRAPH          = YES
+
+# If the INCLUDED_BY_GRAPH, ENABLE_PREPROCESSING and SEARCH_INCLUDES tags are
+# set to YES then doxygen will generate a graph for each documented file showing
+# the direct and indirect include dependencies of the file with other documented
+# files.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+INCLUDED_BY_GRAPH      = YES
+
+# If the CALL_GRAPH tag is set to YES then doxygen will generate a call
+# dependency graph for every global function or class method.
+#
+# Note that enabling this option will significantly increase the time of a run.
+# So in most cases it will be better to enable call graphs for selected
+# functions only using the \callgraph command. Disabling a call graph can be
+# accomplished by means of the command \hidecallgraph.
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+CALL_GRAPH             = NO
+
+# If the CALLER_GRAPH tag is set to YES then doxygen will generate a caller
+# dependency graph for every global function or class method.
+#
+# Note that enabling this option will significantly increase the time of a run.
+# So in most cases it will be better to enable caller graphs for selected
+# functions only using the \callergraph command. Disabling a caller graph can be
+# accomplished by means of the command \hidecallergraph.
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+CALLER_GRAPH           = NO
+
+# If the GRAPHICAL_HIERARCHY tag is set to YES then doxygen will generate a
+# graphical hierarchy of all classes instead of a textual one.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+GRAPHICAL_HIERARCHY    = YES
+
+# If the DIRECTORY_GRAPH tag is set to YES then doxygen will show the
+# dependencies a directory has on other directories in a graphical way. The
+# dependency relations are determined by the #include relations between the
+# files in the directories.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DIRECTORY_GRAPH        = YES
+
+# The DOT_IMAGE_FORMAT tag can be used to set the image format of the images
+# generated by dot. For an explanation of the image formats see the section
+# output formats in the documentation of the dot tool (Graphviz (see:
+# http://www.graphviz.org/)).
+# Note: If you choose svg you need to set HTML_FILE_EXTENSION to xhtml in order
+# to make the SVG files visible in IE 9+ (other browsers do not have this
+# requirement).
+# Possible values are: png, jpg, gif, svg, png:gd, png:gd:gd, png:cairo,
+# png:cairo:gd, png:cairo:cairo, png:cairo:gdiplus, png:gdiplus and
+# png:gdiplus:gdiplus.
+# The default value is: png.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_IMAGE_FORMAT       = png
+
+# If DOT_IMAGE_FORMAT is set to svg, then this option can be set to YES to
+# enable generation of interactive SVG images that allow zooming and panning.
+#
+# Note that this requires a modern browser other than Internet Explorer. Tested
+# and working are Firefox, Chrome, Safari, and Opera.
+# Note: For IE 9+ you need to set HTML_FILE_EXTENSION to xhtml in order to make
+# the SVG files visible. Older versions of IE do not have SVG support.
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+INTERACTIVE_SVG        = NO
+
+# The DOT_PATH tag can be used to specify the path where the dot tool can be
+# found. If left blank, it is assumed the dot tool can be found in the path.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_PATH               =
+
+# The DOTFILE_DIRS tag can be used to specify one or more directories that
+# contain dot files that are included in the documentation (see the \dotfile
+# command).
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOTFILE_DIRS           =
+
+# The MSCFILE_DIRS tag can be used to specify one or more directories that
+# contain msc files that are included in the documentation (see the \mscfile
+# command).
+
+MSCFILE_DIRS           =
+
+# The DIAFILE_DIRS tag can be used to specify one or more directories that
+# contain dia files that are included in the documentation (see the \diafile
+# command).
+
+DIAFILE_DIRS           =
+
+# When using plantuml, the PLANTUML_JAR_PATH tag should be used to specify the
+# path where java can find the plantuml.jar file. If left blank, it is assumed
+# PlantUML is not used or called during a preprocessing step. Doxygen will
+# generate a warning when it encounters a \startuml command in this case and
+# will not generate output for the diagram.
+
+PLANTUML_JAR_PATH      =
+
+# When using plantuml, the PLANTUML_CFG_FILE tag can be used to specify a
+# configuration file for plantuml.
+
+PLANTUML_CFG_FILE      =
+
+# When using plantuml, the specified paths are searched for files specified by
+# the !include statement in a plantuml block.
+
+PLANTUML_INCLUDE_PATH  =
+
+# The DOT_GRAPH_MAX_NODES tag can be used to set the maximum number of nodes
+# that will be shown in the graph. If the number of nodes in a graph becomes
+# larger than this value, doxygen will truncate the graph, which is visualized
+# by representing a node as a red box. Note that if the number of direct
+# children of the root node in a graph is already larger than
+# DOT_GRAPH_MAX_NODES then the graph will not be shown at all. Also note that
+# the size of a graph can be further restricted by MAX_DOT_GRAPH_DEPTH.
+# Minimum value: 0, maximum value: 10000, default value: 50.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_GRAPH_MAX_NODES    = 50
+
+# The MAX_DOT_GRAPH_DEPTH tag can be used to set the maximum depth of the graphs
+# generated by dot. A depth value of 3 means that only nodes reachable from the
+# root by following a path via at most 3 edges will be shown. Nodes that lie
+# further from the root node will be omitted. Note that setting this option to 1
+# or 2 may greatly reduce the computation time needed for large code bases. Also
+# note that the size of a graph can be further restricted by
+# DOT_GRAPH_MAX_NODES. Using a depth of 0 means no depth restriction.
+# Minimum value: 0, maximum value: 1000, default value: 0.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+MAX_DOT_GRAPH_DEPTH    = 0
+
+# Set the DOT_TRANSPARENT tag to YES to generate images with a transparent
+# background. This is disabled by default, because dot on Windows does not seem
+# to support this out of the box.
+#
+# Warning: Depending on the platform used, enabling this option may lead to
+# badly anti-aliased labels on the edges of a graph (i.e. they become hard to
+# read).
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_TRANSPARENT        = NO
+
+# Set the DOT_MULTI_TARGETS tag to YES to allow dot to generate multiple output
+# files in one run (i.e. multiple -o and -T options on the command line). This
+# makes dot run faster, but since only newer versions of dot (>1.8.10) support
+# this, this feature is disabled by default.
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_MULTI_TARGETS      = NO
+
+# If the GENERATE_LEGEND tag is set to YES doxygen will generate a legend page
+# explaining the meaning of the various boxes and arrows in the dot generated
+# graphs.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+GENERATE_LEGEND        = YES
+
+# If the DOT_CLEANUP tag is set to YES, doxygen will remove the intermediate dot
+# files that are used to generate the various graphs.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_CLEANUP            = YES
diff --git a/thirdParty/cupla/alpaka/docs/Makefile b/thirdParty/cupla/alpaka/docs/Makefile
new file mode 100644
index 0000000000..ad9de6edbf
--- /dev/null
+++ b/thirdParty/cupla/alpaka/docs/Makefile
@@ -0,0 +1,22 @@
+# You can set these variables from the command line, and also
+# from the environment for the first two.
+SPHINXOPTS    ?= --color
+SPHINXBUILD   ?= sphinx-build
+SOURCEDIR     = source
+BUILDDIR      = build
+
+# Put it first so that "make" without argument is like "make help".
+help:
+	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
+
+.PHONY: help Makefile
+
+# Catch-all target: route all unknown targets to Sphinx using the new
+# "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
+%: Makefile
+	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
+
+checklinks:
+	$(SPHINXBUILD) -b linkcheck $(SPHINXOPTS) "$(SOURCEDIR)" "$(BUILDDIR)"
+	@echo
+	@echo "Check finished. Report is in $(BUILDDIR)."
diff --git a/thirdParty/cupla/alpaka/docs/cheatsheet/README.md b/thirdParty/cupla/alpaka/docs/cheatsheet/README.md
new file mode 100644
index 0000000000..35ed849a85
--- /dev/null
+++ b/thirdParty/cupla/alpaka/docs/cheatsheet/README.md
@@ -0,0 +1,17 @@
+# About
+
+The style sheet (cheatsheet.style) was originally developed by Roberto Alsina.
+
+https://github.com/ralsina/rst-cheatsheet
+
+# Install
+
+``` bash
+pip install rst2pdf
+```
+
+# Build
+
+``` bash
+rst2pdf -s cheatsheet.style ../source/usage/cheatsheet.rst -o cheatsheet.pdf
+```
diff --git a/thirdParty/cupla/alpaka/docs/cheatsheet/cheatsheet.style b/thirdParty/cupla/alpaka/docs/cheatsheet/cheatsheet.style
new file mode 100644
index 0000000000..8e8162edc3
--- /dev/null
+++ b/thirdParty/cupla/alpaka/docs/cheatsheet/cheatsheet.style
@@ -0,0 +1,158 @@
+{
+    "pageSetup": {"margin-left": 8,
+                  "margin-right": 8,
+                  "margin-top": 8,
+                  "margin-bottom": 8,
+                  "spacing-header": 0,
+                  "spacing-footer": 10,
+                  "firstTemplate": "twoColumn",
+                  "width": "29.7cm",
+                  "height": "21cm"
+		 },
+    "pageTemplates" : {
+        "threeColumn": {
+            "frames": [
+                ["2%", "0cm", "29.333%", "100%"],
+                ["35.333%", "0cm", "29.333%", "100%"],
+                ["68.666%", "0cm", "29.333%", "100%"]
+            ]
+	}
+    },
+    "fontsAlias" : {
+        "stdMono": "CPMono_v07 Plain"
+    },
+    "styles" : [
+	[ "base", {
+	    "fontSize": 10
+	}
+	],
+	["code" , {
+	    "parent": "literal",
+	    "leftIndent": 0,
+	    "spaceBefore": 0,
+	    "spaceAfter": 4,
+	    "backColor": null,
+	    "borderColor": null,
+	    "borderWidth": 0,
+	    "leading":7,
+	    "borderPadding": [1,1,5,1],
+	    "fontSize": 8
+	}],
+	["bodytext" , {
+	    "spaceBefore":0
+	}],
+	["small" , {
+	    "parent": "base",
+	    "fontSize": 6
+	}],
+	["heading1", {
+            "backColor": "#00599dff",
+            "borderColor": "#00599dff",
+            "borderWidth": 0.2,
+            "textColor": "#FFFFFF",
+            "leading": 10,
+            "alignment": "TA_CENTER",
+            "spaceBefore": 4,
+            "borderPadding": [3,0,5,0],
+            "leftIndent": 0,
+            "fontSize": 12,
+            "fontName": "stdSansBold"
+	}],
+	["faketitle" , {
+	    "borderPadding": [3,0,1,0],
+	    "fontSize": 8,
+	    "spaceBefore": 4,
+	    "spaceAfter": 4,
+	    "fontName": "stdSansBold"
+	}],
+	["nota", { "parent": "heading",
+		   "fontSize": 6,
+		   "fontName": "stdSansBold",
+		   "textColor": "#FFFFFF",
+		   "alignment": "TA_RIGHT"
+		 }],
+	["table" , {
+	    "spaceBefore":0,
+	    "spaceAfter":3,
+	    "colWidths": ["50%","50%"],
+	    "commands": [
+		[ "VALIGN", [0, 0], [-1, -1], "TOP" ],
+		[ "BOTTOMPADDING", [0, 0], [-1, -1], 0 ],
+		[ "TOPPADDING", [0, 0], [-1, -1], 1 ],
+		[ "LINEBELOW", [0, 0], [-1, -2], 0.2, "#E1E6EA" ]
+	    ]
+	}],
+	["exampletable1" , {
+	    "spaceBefore":0,
+	    "spaceAfter":3,
+	    "colWidths": ["33.3%","33.3%","33.3%"],
+	    "commands": [
+		[ "VALIGN", [0, 0], [-1, -1], "TOP" ],
+		[ "BOTTOMPADDING", [0, 0], [-1, -1], -1 ],
+		[ "GRID", [0, 0], [-1, -1], 0.2, "#446885" ],
+		[ "BOX", [0, 0], [-1, -1], 0.2, "#446885" ]
+	    ]
+	}],
+	["faketrans" , {
+	    "spaceBefore":3,
+	    "spaceAfter":3,
+	    "colWidths": ["100%"],
+	    "commands": [
+		[ "LINEABOVE", [0, 0], [-1, -1], 0.8, "#446885" ]
+	    ]
+	}],
+	["tablapie" , {
+	    "spaceBefore":0,
+	    "spaceAfter":0,
+	    "colWidths": ["52%","19%","12%","17%"],
+	    "commands": [
+		[ "VALIGN", [0, 0], [-1, -1], "TOP" ],
+		[ "BOTTOMPADDING", [0, 0], [-1, -1], -3 ],
+		[ "LINEABOVE", [0, 0], [-1, -1], 0.4, "#446885" ]
+	    ]
+	}],
+	["izqfina" , {
+	    "spaceBefore":0,
+	    "spaceAfter":6,
+	    "colWidths": ["10%",null],
+	    "commands": [
+		[ "VALIGN", [0, 0], [-1, -1], "MIDDLE" ],
+		[ "BOTTOMPADDING", [0, 0], [-1, -1], -3 ],
+		[ "LINEBELOW", [0, 0], [-1, -2], 0.2, "#E1E6EA" ]
+	    ]
+	}],
+	["tablacreditos", {
+	    "parent": "bodytext",
+	    "spaceBefore":-1,
+	    "spaceAfter":0,
+	    "colWidths": ["50%","50%"],
+	    "commands": [
+		[ "VALIGN", [0, 0], [-1, -1], "MIDDLE" ],
+		[ "BOTTOMPADDING", [0, 0], [-1, -1], -1 ],
+		[ "TOPPADDING", [0, 0], [1, 0], 3 ]
+	    ]
+	}],
+	[ "endnote", {
+            "parent": "bodytext",
+            "colWidths": [52,null],
+            "spaceAfter": 4,
+            "commands": [
+		[ "VALIGN", [ 0, 0 ], [ -1, -1 ], "TOP" ],
+		[ "BOTTOMPADDING", [0, 0], [-1, -1], 0 ],
+		[ "TOPPADDING", [0, 0], [-1, -1], 1 ],
+		[ "LINEBEFORE", [0, 0], [0,-1], 1, "#E1E6EA" ]
+            ]
+	}],
+	["extranote" , {
+	    "spaceBefore":0,
+	    "spaceAfter":0,
+	    "colWidths": [27,null],
+	    "commands": [
+		[ "VALIGN", [0, 0], [-1, -1], "MIDDLE" ],
+		[ "BOTTOMPADDING", [0, 0], [-1, -1], -3 ],
+		[ "BOX", [0, 0], [-1, -1], 0.2, "#446885" ],
+		[ "COLBACKGROUNDS", [0,0], [-1,-1], ["#446885", "#FFFFFF"]]
+	    ]
+	}]
+    ]
+}
diff --git a/thirdParty/cupla/alpaka/docs/logo/alpaka.pdf b/thirdParty/cupla/alpaka/docs/logo/alpaka.pdf
new file mode 100644
index 0000000000..5cc2153bf4
Binary files /dev/null and b/thirdParty/cupla/alpaka/docs/logo/alpaka.pdf differ
diff --git a/thirdParty/alpaka/doc/images/alpaka.svg b/thirdParty/cupla/alpaka/docs/logo/alpaka.svg
similarity index 100%
rename from thirdParty/alpaka/doc/images/alpaka.svg
rename to thirdParty/cupla/alpaka/docs/logo/alpaka.svg
diff --git a/thirdParty/alpaka/doc/images/alpaka_401x135.png b/thirdParty/cupla/alpaka/docs/logo/alpaka_401x135.png
similarity index 100%
rename from thirdParty/alpaka/doc/images/alpaka_401x135.png
rename to thirdParty/cupla/alpaka/docs/logo/alpaka_401x135.png
diff --git a/thirdParty/alpaka/doc/doxygen/alpaka_doxygen.png b/thirdParty/cupla/alpaka/docs/logo/alpaka_doxygen.png
similarity index 100%
rename from thirdParty/alpaka/doc/doxygen/alpaka_doxygen.png
rename to thirdParty/cupla/alpaka/docs/logo/alpaka_doxygen.png
diff --git a/thirdParty/alpaka/doc/images/alpaka_inkscape.svg b/thirdParty/cupla/alpaka/docs/logo/alpaka_inkscape.svg
similarity index 100%
rename from thirdParty/alpaka/doc/images/alpaka_inkscape.svg
rename to thirdParty/cupla/alpaka/docs/logo/alpaka_inkscape.svg
diff --git a/thirdParty/cupla/alpaka/docs/requirements.txt b/thirdParty/cupla/alpaka/docs/requirements.txt
new file mode 100644
index 0000000000..431a616fe0
--- /dev/null
+++ b/thirdParty/cupla/alpaka/docs/requirements.txt
@@ -0,0 +1,12 @@
+sphinx_rtd_theme>=0.3.1
+#recommonmark
+sphinx==3.0.3
+breathe==4.16.0
+sphinxcontrib.programoutput
+#sphinxcontrib-napoleon>=0.7
+pygments
+# generate plots
+#matplotlib
+#scipy
+#numpy
+rst2pdf
diff --git a/thirdParty/cupla/alpaka/docs/source/_static/custom.css b/thirdParty/cupla/alpaka/docs/source/_static/custom.css
new file mode 100644
index 0000000000..db502b1bf4
--- /dev/null
+++ b/thirdParty/cupla/alpaka/docs/source/_static/custom.css
@@ -0,0 +1,3 @@
+.section {
+     text-align:justify;
+}
diff --git a/thirdParty/cupla/alpaka/docs/source/_static/general.css b/thirdParty/cupla/alpaka/docs/source/_static/general.css
new file mode 100644
index 0000000000..f0c574cb54
--- /dev/null
+++ b/thirdParty/cupla/alpaka/docs/source/_static/general.css
@@ -0,0 +1,4 @@
+/* justify the normal text blocks */
+.section {
+     text-align:justify;
+}
diff --git a/thirdParty/cupla/alpaka/docs/source/advanced/cmake.rst b/thirdParty/cupla/alpaka/docs/source/advanced/cmake.rst
new file mode 100644
index 0000000000..64b5e9f4da
--- /dev/null
+++ b/thirdParty/cupla/alpaka/docs/source/advanced/cmake.rst
@@ -0,0 +1,250 @@
+CMake Arguments
+===============
+
+Alpaka configures a lot of its functionality at compile time. Therefore a lot of compiler and link flags are needed, which are set by ``CMake`` arguments. The beginning of this section introduces the general alpaka flags. The last parts of the section describe back-end specific flags.
+
+.. hint::
+
+   To display the cmake variables with value and type in the build folder of your project, use ``cmake -LH <path-to-build>``.
+
+**Table of back-ends**
+
+   * :ref:`CPU Serial <cpu-serial>`
+   * :ref:`C++ Threads <cpp-threads>`
+   * :ref:`Boost Fiber <boost-fiber>`
+   * :ref:`Intel TBB <intel-tbb>`
+   * :ref:`OpenMP 2 Grid Block <openmp2-grid-block>`
+   * :ref:`OpenMP 2 Block Thread <openmp2-block-thread>`
+   * :ref:`OpenMP 5 <openmp5>`
+   * :ref:`CUDA <cuda>`
+   * :ref:`HIP <hip>`
+
+Common
+------
+
+ALPAKA_CXX_STANDARD
+  .. code-block::
+
+     Set the C++ standard version.
+
+alpaka_BUILD_EXAMPLES
+  .. code-block::
+
+     Build the examples.
+
+BUILD_TESTING
+  .. code-block::
+
+     Build the testing tree.
+
+ALPAKA_DEBUG
+  .. code-block::
+
+     Set Debug level:
+
+     0 - Is the default value. No additional logging.
+     1 - Enables some basic flow traces.
+     2 - Display as much information as possible. Especially pointers, sizes and other
+         parameters of copies, kernel invocations and other operations will be printed.
+
+ALPAKA_USE_INTERNAL_CATCH2
+  .. code-block::
+
+     Use internally shipped Catch2.
+
+
+ALPAKA_DEBUG_OFFLOAD_ASSUME_HOST
+  .. code-block::
+
+     Allow host-only constructs like assert in offload code in debug mode.
+
+.. _cpu-serial:
+
+CPU Serial
+----------
+
+ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLE
+  .. code-block::
+
+     Enable the serial CPU back-end.
+
+ALPAKA_BLOCK_SHARED_DYN_MEMBER_ALLOC_KIB
+  .. code-block::
+
+     Kibibytes (1024B) of memory to allocate for block shared memory for backends
+     requiring static allocation.
+
+.. _cpp-threads:
+
+C++ Threads
+-----------
+
+ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLE
+  .. code-block::
+
+     Enable the threads CPU block thread back-end.
+
+.. _boost-fiber:
+
+Boost Fiber
+-----------
+
+ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE
+  .. code-block::
+
+     Enable the fibers CPU block thread back-end.
+
+.. _intel-tbb:
+
+Intel TBB
+---------
+
+ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLE
+  .. code-block::
+
+     Enable the TBB CPU grid block back-end.
+
+ALPAKA_BLOCK_SHARED_DYN_MEMBER_ALLOC_KIB
+  .. code-block::
+
+     Kibibytes (1024B) of memory to allocate for block shared memory for backends
+     requiring static allocation.
+
+.. _openmp2-grid-block:
+
+OpenMP 2 Grid Block
+-------------------
+
+ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE
+  .. code-block::
+
+     Enable the OpenMP 2.0 CPU grid block back-end.
+
+ALPAKA_BLOCK_SHARED_DYN_MEMBER_ALLOC_KIB
+  .. code-block::
+
+     Kibibytes (1024B) of memory to allocate for block shared memory for backends
+     requiring static allocation.
+
+.. _openmp2-block-thread:
+
+OpenMP 2 Block Thread
+---------------------
+
+ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE
+  .. code-block::
+
+     Enable the OpenMP 2.0 CPU block thread back-end.
+
+.. _openmp5:
+
+OpenMP 5
+--------
+
+ALPAKA_ACC_ANY_BT_OMP5_ENABLE
+  .. code-block::
+
+     Enable the OpenMP 5.0 CPU block and block thread back-end.
+
+
+ALPAKA_OFFLOAD_MAX_BLOCK_SIZE
+  .. code-block::
+
+     Maximum number of threads per block to be suggested by any target offloading backends
+     ANY_BT_OMP5 and ANY_BT_OACC.
+
+.. _cuda:
+
+CUDA
+----
+
+ALPAKA_ACC_GPU_CUDA_ENABLE
+  .. code-block::
+
+     Enable the CUDA GPU back-end.
+
+ALPAKA_ACC_GPU_CUDA_ONLY_MODE
+  .. code-block::
+
+     Only back-ends using CUDA can be enabled in this mode (This allows to mix
+     alpaka code with native CUDA code).
+
+
+ALPAKA_CUDA_ARCH
+  .. code-block::
+
+     Set the GPU architecture: e.g. "35".
+
+ALPAKA_CUDA_COMPILER
+  .. code-block::
+
+     Set the CUDA compiler: "nvcc" or "clang".
+
+ALPAKA_CUDA_FAST_MATH
+  .. code-block::
+
+     Enable fast-math.
+
+ALPAKA_CUDA_FTZ
+  .. code-block::
+
+     Set flush to zero for GPU.
+
+ALPAKA_CUDA_KEEP_FILES
+  .. code-block::
+
+     Keep all intermediate files that are generated during internal compilation
+     steps 'CMakeFiles/<targetname>.dir'.
+
+ALPAKA_CUDA_NVCC_EXPT_EXTENDED_LAMBDA
+  .. code-block::
+
+     Enable experimental, extended host-device lambdas in NVCC.
+
+ALPAKA_CUDA_NVCC_SEPARABLE_COMPILATION
+  .. code-block::
+
+     Enable separable compilation in NVCC.
+
+https://developer.nvidia.com/blog/separate-compilation-linking-cuda-device-code/
+
+ALPAKA_CUDA_SHOW_CODELINES
+  .. code-block::
+
+     Show kernel lines in cuda-gdb and cuda-memcheck. If ALPAKA_CUDA_KEEP_FILES
+     is enabled source code will be inlined in ptx.
+     One of the added flags is: --generate-line-info
+
+ALPAKA_CUDA_SHOW_REGISTER
+  .. code-block::
+
+     Show the number of used kernel registers during compilation and create PTX.
+
+.. _hip:
+
+HIP
+---
+
+To enable the HIP backend please provide the path to the CMake find module ``FindHIP.cmake``.
+The path can be given via an environment variable ``CMAKE_MODULE_PATH`` or by providing the CMake flag ``-DCMAKE_MODULE_PATH=<path>``.
+
+ALPAKA_ACC_GPU_HIP_ENABLE
+  .. code-block::
+
+     Enable the HIP back-end (all other back-ends must be disabled).
+
+ALPAKA_ACC_GPU_HIP_ONLY_MODE
+  .. code-block::
+
+     Only back-ends using HIP can be enabled in this mode.
+
+ALPAKA_HIP_PLATFORM
+  .. code-block::
+
+     Specify HIP platform. Can be "clang" or "nvcc".
+
+ALPAKA_HIP_KEEP_FILES
+  .. code-block::
+
+     Keep all intermediate files that are generated during internal compilation
+     steps 'CMakeFiles/<targetname>.dir'.
diff --git a/thirdParty/cupla/alpaka/docs/source/advanced/mapping.rst b/thirdParty/cupla/alpaka/docs/source/advanced/mapping.rst
new file mode 100644
index 0000000000..55ecc86dd7
--- /dev/null
+++ b/thirdParty/cupla/alpaka/docs/source/advanced/mapping.rst
@@ -0,0 +1,129 @@
+Mapping onto Specific Hardware Architectures
+============================================
+
+By providing an accelerator independent interface for kernels, their execution and memory accesses at different hierarchy levels, *alpaka* allows the user to write accelerator independent code that does not neglect performance.
+
+The mapping of the decomposition to the execution environment is handled by the back-ends provided by the *alpaka* library as well as user defined back-ends.
+A computation that is described with a maximum of the parallelism available in the *redundant hierarchical parallelism* abstraction can not be mapped one to one to any existing hardware.
+GPUs do not have vector registers for ``float`` or ``double`` types.
+Therefore, the element level is often omitted on *CUDA* accelerators.
+CPUs in turn are not (currently) capable of running thousands of threads concurrently and do not have equivalently fast inter-thread synchronization and shared memory access as GPUs do.
+
+A major point of the *redundant hierarchical parallelism* abstraction is to ignore specific unsupported levels and utilize only the ones supported on a specific accelerator.
+This allows a mapping to various current and future accelerators in a variety of ways enabling optimal usage of the underlying compute and memory capabilities.
+
+The grid level is always mapped to the whole device being in consideration.
+The scheduler can always execute multiple kernel grids from multiple queues in parallel by statically or dynamically subdividing the available resources.
+However, this will only ever simplify the mapping due to fewer available processing units.
+Furthermore, being restricted to fewer resources automatically improves the locality of data due to spatial and temporal locality properties of the caching hierarchy.
+
+x86 CPUs
+````````
+
+There are multiple possible ways to map the *alpaka* abstraction to x86 CPUs.
+The following figure shows the compute and memory hierarchy of a dual-socket (package) node with dual-core CPUs and symmetric multithreading (Hyper-Threading).
+Through symmetric multithreading (Hyper-Threading) each core represents two processing units.
+
+.. image:: /images/x86_cpu.png
+
+Thread
+------
+
+Mapping the thread level directly to the processing units is the most trivial part of the assignment of hierarchy levels to hardware units.
+However, the block and warp levels could be mapped to hardware components in different ways with varying advantages and disadvantages.
+
+Warp
+----
+
+Even though a warp seems to be identical to a vector register, because both execute a single uniform instruction on multiple data elements, they are not the same.
+:doc:`Warps </basic/abstraction>` can handle branches with divergent control flows of multiple threads.
+There is no equivalent hardware unit in a CPU supporting this.
+Therefore, the warp level can not be utilized on CPUs leading to a one-to-one mapping of threads to warps which does not violate the rules of the abstraction.
+
+Block
+-----
+
+One Block Per Node
+++++++++++++++++++
+
+By combining all processing units (possibly Hyper-Threads) of all processors on a node into one block, the number of synchronizing and communicating threads can be enlarged.
+This high possible thread count would simplify the implementation of some types of algorithms but introduces performance issues on multi-core nodes.
+The shared memory between all cores on a node is the RAM.
+However, the RAM and the communication between the sockets is far too slow for fine-grained communication in the style of *CUDA* threads.
+
+One Block Per Socket
+++++++++++++++++++++
+
+If each processor on each socket would concurrently execute one block, the L3 cache would be used as the fast shared memory.
+Although this is much better than to use the RAM, there is still a problem.
+Regions of the global memory and especially from the shared memory that are accessed are automatically cached in the L1 and / or L2 caches of each core.
+Not only the elements which are directly accessed will be cached but always the whole cache line they lie in.
+Cache lines typically have a size of 64 Bytes on modern x86 architectures.
+This leads to, for example, eight double precision floating point numbers being cached at once even though only one value really is required.
+As long as these values are only read there is no problem.
+However, if one thread writes to a value that is also cached on other cores, all such cache copies have to be invalidated.
+This results in a lot of cache and bus traffic.
+Due to the hierarchical decomposition of the grid of threads reflecting the data elements, neighboring threads are always combined into a common block.
+By mapping a block to a socket, threads that are executed concurrently always have very close indices into the grid.
+Therefore, the elements that are read and written by the threads are always very close together within the memory and will most probably share a cache line.
+This property is exploited on *CUDA* GPUs, where memory accesses within a warp are combined into one large transaction.
+However, when multiple threads from multiple CPU cores write to different elements within a cache line, this advantage is reversed into its opposite.
+This pattern non-intuitively leads to heavy performance degradation and is called false-sharing.
+
+One Block Per Core
+++++++++++++++++++
+
+The best compromise between a high number of threads per block and a fast communication between the threads is to map a block directly to a CPU core.
+Each processing unit (possibly a Hyper-Thread) executes one or more threads of our hierarchical abstraction while executing multiple elements locally either by processing them sequentially or in a vectorized fashion.
+This possible mapping of blocks, threads and elements to the compute and memory hierarchy of a dual-socket node with dual-core CPUs and symmetric multithreading is illustrated in the following figure.
+
+.. image:: /images/x86_cpu_mapping.png
+
+One Block Per Thread
+++++++++++++++++++++
+
+If there is no symmetric multithreading or if it is desired, it is also possible to implement a mapping of one block with exactly one thread for each processing unit.
+This allows to completely remove the synchronization overhead for tasks where this is not required at all.
+
+Threading Mechanisms
+--------------------
+
+The mapping of threads to processing units is independent of the threading mechanism that is used.
+As long as the thread affinity to cores can be set correctly, *OpenMP*, *pthread*, *std::thread* or other libraries and APIs can be used interchangeably to implement various *alpaka* back-ends.
+They all have different advantages and disadvantages.
+Real operating system threads like *pthread*, *std::thread* and others have a high cost of thread creation and thread change because their default stack size amounts to multiple megabytes.
+*OpenMP* threads on the other hand are by default much more lightweight.
+However, they are arbitrarily limited by the runtime implementation in the maximum number of concurrent threads a machine supports.
+All of the previous methods have non-deterministic thread changes in common.
+Therefore it is not possible to decide the order in which threads within a block are processed, which could be a good optimization opportunity.
+
+To allow blocks to contain more threads than the number of processing units each core provides, it is possible to simply start more threads than processing units are available.
+This is called oversubscription.
+Those threads can be bound to the correct cores and by relying on the operating system thread scheduler, they are preemptively multitasked while sharing a single cache and thereby avoiding false-sharing.
+However, this is not always beneficial because the cost of thread changes by the kernel-mode scheduler should not be underestimated.
+
+Fibers
+++++++
+
+To remove the overhead of the kernel mode scheduler as well as to enable the usage of deterministic thread context-switches, fibers can be used.
+A fiber is a user-space thread with cooperative context-switches and extends the concept of coroutines.
+A coroutine is basically a function that can be suspended and resumed but does not necessarily have a stack.
+In contrast, functions within most programming languages represent subroutines and not coroutines because they can neither be suspended in the middle of execution nor resumed exactly at the place they were suspended without losing values on the function's local stack.
+
+Multiple fibers can be executed within one operating system thread, which allows to simulate multiple threads per block without kernel-mode multithreading.
+This was not possible without fibers because only coroutines allow the kernel functions to be suspended at synchronization points and resumed when all fibers reached it.
+Each time an operating system thread executing a function would wait for another thread or a resource, an equivalent fiber just switches to the next fiber within the executing host thread.
+Due to the context changes happening at user-level, the cost is much lower.
+Additionally, fiber context changes are deterministic and it is even possible to implement a user-level scheduler.
+An advantage of a user level scheduler over the operating system thread scheduler is the possibility to optimally utilize the caches by taking into account the memory access pattern of the algorithm.
+Furthermore, fibers reduce the number of locks and busy waits within a block because only one fiber is active per operating system thread at a time.
+
+There are multiple C++ Standards Committee Papers (N3858, N3985, N4134) discussing the inclusion of fibers, awaitable functions and similar concepts into C++.
+
+GPUs (CUDA/HIP)
+```````````````
+
+Mapping the abstraction to GPUs supporting *CUDA* and *HIP* is straightforward because the hierarchy levels are identical up to the element level.
+So blocks of warps of threads will be mapped directly to their *CUDA*/*HIP* equivalent.
+
+The element level is supported through an additional run-time variable containing the extent of elements per thread.
+This variable can be accessed by all threads and should optimally be placed in constant device memory for fast access.
diff --git a/thirdParty/cupla/alpaka/docs/source/advanced/rationale.rst b/thirdParty/cupla/alpaka/docs/source/advanced/rationale.rst
new file mode 100644
index 0000000000..ed43518418
--- /dev/null
+++ b/thirdParty/cupla/alpaka/docs/source/advanced/rationale.rst
@@ -0,0 +1,316 @@
+.. highlight:: cpp
+   :linenothreshold: 5
+
+Rationale
+=========
+
+Interface Distinction
+---------------------
+
+The *alpaka* library is different from other similar libraries (especially *CUDA*) in that it refrains from using implicit or hidden state.
+This and other interface design decisions will be explained in the following paragraphs.
+
+No Current Device:
+++++++++++++++++++
+
+The *CUDA* runtime API for example supplies a current device for each user code kernel-thread.
+Working with multiple devices requires to call ``cudaSetDevice`` to change the current device whenever an operation should be executed on a non-current device.
+Even the functions for creating a queue (``cudaStreamCreate``) or an event (``cudaEventCreate``) use the current device without any way to create them on a non-current device.
+In the case of an event this dependency is not obvious, since at the same time queues can wait for events from multiple devices allowing cross-device synchronization without any additional work.
+So conceptually an event could also have been implemented device independently.
+This can lead to hard to track down bugs due to the non-explicit dependencies, especially in multi-threaded code using multiple devices.
+
+No Default Device:
+++++++++++++++++++
+
+In contrast to the *CUDA* runtime API *alpaka* does not provide a device by default per kernel-thread.
+Especially in combination with *OpenMP* parallelized host code this keeps users from surprises.
+The following code snippet shows that it does not necessarily do what one would expect.
+
+.. code-block::
+
+   cudaSetDevice(1);
+
+   #pragma omp parallel for
+   for(int i = 0; i<10; ++i)
+   {
+       kernel<<<blocks,threads>>>(i);
+   }
+
+Depending on what the *CUDA* runtime API selects as default device for each of the *OpenMP* threads (due to each of them having its own current device), not all of the kernels will necessarily run on device one.
+
+In the *alpaka* library all such dependencies are made explicit.
+All functions depending on a device require it to be given as a parameter.
+The *alpaka* *CUDA* back-end checks before forwarding the calls to the *CUDA* runtime API whether the current device matches the given one and changes it if required.
+The *alpaka* *CUDA* back-end does not reset the current device to the one prior to the method invocation out of performance considerations.
+This has to be considered when native *CUDA* code is combined with *alpaka* code.
+
+No Default Queue:
++++++++++++++++++
+
+*CUDA* allows to execute commands without specifying a queue.
+The default queue that is used synchronizes implicitly with all other queues on the device.
+If a command is issued to the default queue, all other asynchronous queues have to wait before executing any new commands, even when they have been enqueued much earlier.
+This can introduce hard to track down performance issues.
+As of *CUDA* 7.0 the default queue can be converted to a non synchronizing queue with a compiler option.
+Because concurrency is crucial for performance and users should think about the dependencies between their commands from the beginning, *alpaka* does not provide such a default queue.
+All asynchronous operations (kernel launches, memory copies and memory sets) require a queue to be executed in.
+
+No Implicit Built-in Variables and Functions:
+---------------------------------------------
+
+Within *CUDA* device functions (functions annotated with `__global__` or `__device__`) built-in functions (`__syncthreads`, `__threadfence`, `atomicAdd`, ... ) and variables (`gridDim`, `blockIdx`, `blockDim`, `threadIdx`, `warpSize`, ...) are provided.
+
+It would have been possible to emulate those implicit definitions by forcing the kernel function object to inherit from a class providing these functions and members.
+However functions outside the kernel function object would then pose a problem.
+They do not have access to those functions and members, the function object has inherited.
+To circumvent this, the functions and members would have to be public, the inheritance would have to be public and a reference to the currently executing function object would have to be passed as parameter to external functions.
+This would have been too cumbersome and inconsistent.
+Therefore access to the accelerator is given to the user kernel function object via one special input parameter representing the accelerator.
+After that this accelerator object can simply be passed to other functions.
+The built-in variables can be accessed by the user via query functions on this accelerator.
+
+  *Abandoning all the implicit and default state makes it much easier for users of the library to reason about their code.*
+
+No Language Extensions:
+-----------------------
+
+Unlike *CUDA*, the *alpaka* library does not extend the C++ language with any additional variable qualifiers (`__shared__`, `__constant__`, `__device__`) defining the memory space.
+Instead of those qualifiers *alpaka* provides accelerator functions to allocate memory in the different memory spaces.
+
+No Dimensionality Restriction:
+------------------------------
+
+*CUDA* always uses three-dimensional indices and extents, even though the task may only be one or two dimensional.
+*OpenCL* on the other hand allows grid and block dimensions in the range [1,3] but does not provide corresponding n-dimensional indices, but rather provides functions like `get_global_id` or `get_local_id`, which require the dimension in which the one-dimensional ID is to be queried as a parameter.
+By itself this is no problem, but how can it be assured that a two-dimensional kernel is called with grid and block extents of the correct dimensionality at compile time?
+How can it be assured that a kernel which only uses `threadIdx.x` or equivalently calls `get_global_id(0)` will not get called with two dimensional grid and block extents?
+Because the result in such a case is undefined, and most of the time not wanted by the kernel author, this should be easy to check and reject at compile-time.
+In *alpaka* all accelerators are templatized on the dimensionality.
+This allows a two-dimensional image filter to assert that it is only called with a two dimensional accelerator.
+Thereby the algorithms can check for supported dimensionality of the accelerator at compile time instead of runtime.
+Furthermore with the dimension being a template parameter, the CPU back-end implementations are able to use only the number of nested loops really necessary instead of the 6 loops (2 x 3 loops for grid blocks and block threads), which are mandatory to emulate the *CUDA* threaded blocking scheme.
+
+*By hiding all the accelerator functionality inside of the accelerator object that is passed to the user kernel, the user of the alpaka library is not faced with any non-standard C++ extensions.
+Nevertheless the CUDA back-end internally uses those language extensions.*
+
+Integral Sizes of Arbitrary Type:
+---------------------------------
+
+The type of sizes such as extents, indices and related variables depends on a template parameter of the accelerator and connected classes.
+This allows the kernel to be executed with sizes of arbitrary ranges.
+Thereby it is possible to force the accelerator back-ends to perform all internal index, extent and other integral size depending computations with a given precision.
+This is especially useful on current *NVIDIA* GPUs.
+Even though they support 64-bit integral operations, they are emulated with multiple 32-bit operations.
+This can be a huge performance penalty when the sizes of buffers, offsets, indices and other integral variables holding sizes are known to be limited.
+
+No Synchronous (Blocking) and Asynchronous (Non-Blocking) Function Versions:
+----------------------------------------------------------------------------
+
+*CUDA* provides two versions of many of the runtime functions, for example, `cudaMemcpyAsync` and `cudaMemcpy`.
+The asynchronous version requires a queue while the synchronous version does not need a queue parameter.
+The asynchronous version immediately returns control back to the caller while the task is enqueued into the given queue and executed later in parallel to the host code.
+The synchronous version waits for the task to finish before the function call returns control to the caller.
+Inconsistently, all kernels in a *CUDA* program can only be started either asynchronously by default or synchronously if `CUDA_LAUNCH_BLOCKING` is defined.
+There is no way to specify this on a per kernel basis.
+To switch a whole application from asynchronous to synchronous calls, for example for debugging reasons, it is necessary to change the names of all the runtime functions being called as well as their parameters.
+In *alpaka* this is solved by always enqueuing all tasks into a queue and not defining a default queue.
+Non-blocking queues as well as blocking queues are provided for all devices.
+Changes to the synchronicity of multiple tasks can be made on a per queue basis by changing the queue type at the place of creation.
+There is no need to change any line of calling code.
+
+Memory Management
+-----------------
+
+Memory buffers cannot be identified solely by the pointer to their first byte.
+The C++ `new` and `malloc`, the *CUDA* `cudaMalloc` as well as the *OpenCL* `clCreateBuffer` functions all return a plain pointer.
+This is not enough when working with multiple accelerators and multiple devices.
+To know where a specific pointer was allocated, additional information has to be stored to uniquely identify a memory buffer on a specific device.
+Memory copies between multiple buffers additionally require the buffer extents and pitches to be known.
+Many APIs, for example *CUDA*, require the user to store this information externally.
+To unify the usage, *alpaka* stores all the necessary information in a memory buffer object.
+
+Acceleratable Functions
+-----------------------
+
+Many parallelization libraries / frameworks do not fully support the separation of the parallelization strategy from the algorithm itself.
+*OpenMP*, for example, fully mixes the per thread algorithm and the parallelization strategy.
+This can be seen in the source listing showing a simple AXPY computation with OpenMP.
+
+.. code-block::
+
+   template<
+       typename TIdx,
+       typename TElem>
+   void axpyOpenMP(
+       TIdx const n,
+       TElem const alpha,
+       TElem const * const X,
+       TElem * const Y)
+   {
+       #pragma omp parallel for
+       for (TIdx i=0; i<n; i++)
+       {
+           Y[i] = alpha * X[i] + Y[i];
+       }
+   }
+
+Only one line of the function body, line 13, is the algorithm itself, while all surrounding lines represent the parallelization strategy.
+In *OpenACC* the parallelization and the algorithm are similarly combined.
+
+*CUDA*, *OpenCL* and other libraries allow, at least to some degree, to separate the algorithm from the parallelization strategy.
+They define the concept of a kernel representing the algorithm itself which is then parallelized depending on the underlying hardware.
+The AXPY *CUDA* kernel source code shown in the following listing consists only of the code of one single iteration.
+
+.. code-block::
+
+   template<
+       typename TIdx,
+       typename TElem>
+   __global__ void axpyCUDA(
+       TIdx const n,
+       TElem const alpha,
+       TElem const * const X,
+       TElem * const Y)
+   {
+       TIdx const i(blockIdx.x*blockDim.x + threadIdx.x);
+       if(i < n)
+       {
+           Y[i] = alpha * X[i] + Y[i];
+       }
+   }
+
+On the other hand the *CUDA* implementation is bloated with code handling the inherent blocking scheme.
+Even if the algorithm does not utilize blocking, as is the case here, the algorithm writer has to calculate the global index of the current thread by hand (line 10).
+Furthermore, to support vectors larger than the predefined maximum number of threads per block (1024 for current *CUDA* devices), multiple blocks have to be used.
+When the number of blocks does not divide the number of vector elements, it has to be assured that the threads responsible for the vector elements behind the given length, do not access the memory to prevent a possible memory access error.
+
+By using the kernel concept, the parallelization strategy, whether all elements are executed in sequential order, in parallel or blocked is not hard coded into the algorithm itself.
+The possibly multidimensional nested loops do not have to be written by the user.
+For example, six loops would be required to emulate the *CUDA* execution pattern with a grid of blocks consisting of threads.
+
+Furthermore the kernel concept breaks the algorithm down to the per element level.
+Recombining multiple kernel iterations to loop over lines, columns, blocks or any other structure is always possible by changing the calling code and does not require a change of the kernel.
+In contrast, by using *OpenMP* this would not be possible.
+Therefore the *alpaka* interface builds on the kernel concept, being the body of the corresponding standard for loop executed in each thread.
+
+Execution Domain Specifications
+-------------------------------
+
+*CUDA* requires the user to annotate its functions with execution domain specifications.
+Functions that can only be executed on the GPU have to be annotated with ``__device__``, functions that can be executed on the host and on the GPU have to be annotated with ``__host__ __device__`` and host only functions can optionally be annotated with ``__host__``.
+The nvcc *CUDA* compiler uses these annotations to decide with which back-ends a function has to be compiled.
+Depending on the compiler in use, *alpaka* defines the macros  ``ALPAKA_FN_HOST``, ``ALPAKA_FN_ACC`` and ``ALPAKA_FN_HOST_ACC`` with the identical meaning which can be used in the same positions.
+When the *CUDA* compiler is used, they are defined to their *CUDA* equivalents, else they are empty.
+
+Kernel Function
+---------------
+
+Requirements
+++++++++++++
+
+- User kernels should be implemented independent of the accelerator.
+- A user kernel has to have access to accelerator methods (synchronization within blocks, index retrieval, ...).
+- For usage with CUDA, the kernel methods have to be attributed with ``__device__ __host__``.
+- The user kernel has to fulfill std::is_trivially_copyable because only such objects can be copied into CUDA device memory.
+  A trivially copyable class is a class that
+  #. Has no non-trivial copy constructors (this also requires no virtual functions or virtual bases)
+  #. Has no non-trivial move constructors
+  #. Has no non-trivial copy assignment operators
+  #. Has no non-trivial move assignment operators
+  #. Has a trivial destructor
+
+Implementation Variants
++++++++++++++++++++++++
+
+There are two possible ways to tell the kernel about the accelerator type:
+
+#. The kernel is templated on the accelerator type ...
+
+   * (+) This allows users to specialize them for different accelerators. (Is this really necessary or desired?)
+   * (-) The kernel has to be a class template. This does not allow C++ lambdas to be used as kernels because they are not templates themselves (but only their ``operator()`` can be templated).
+   * (-) This prevents the user from instantiating an accelerator independent kernel before executing it.
+     Because the memory layout in inheritance hierarchies is undefined a simple copy of the user kernel or its members to its specialized type is not possible platform independently.
+     This would require a copy from UserKernel<TDummyAcc> to UserKernel<TAcc> to be possible.
+     The only way to allow this would be to require the user to implement a templated copy constructor for every kernel.
+     This is not allowed for kernels that should be copyable to a CUDA device because std::is_trivially_copyable requires the kernel to have no non-trivial copy constructors.
+
+   a) ... and inherits from the accelerator.
+
+     * (-) The kernel itself has to inherit at least protected from the accelerator to allow the KernelExecutor to access the Accelerator.
+
+     * (-) How do accelerator functions called from the kernel (and not within the kernel class itself) access the accelerator methods?
+
+     Casting this to the accelerator type and giving it as parameter is too much to require from the user.
+   b) ... and the ``operator()`` has a reference to the accelerator as parameter.
+
+     * (+) This allows to use the accelerator in functions called from the kernel (and not within the kernel class itself) to access the accelerator methods in the same way the kernel entry point function can.
+     * (-) This would require an additional object (the accelerator) in device memory taking up valuable CUDA registers (opposed to the inheritance solution). At least on CUDA all the accelerator functions could be inlined nevertheless.
+
+#. The ``operator()`` is templated on the accelerator type and has a reference to the accelerator as parameter.
+
+  * (+) The kernel can be an arbitrary function object with ``ALPAKA_FN_HOST_ACC`` attributes.
+  * (+) This would allow to instantiate the accelerator independent kernel and set its members before execution.
+  * (+/-) usable with polymorphic lambdas.
+  * (-) The ``operator()`` could be overloaded on the accelerator type but there is no way to specialize the whole kernel class itself, so it always has the same members.
+  * (-) This would require an additional object (the accelerator) in device memory taking up valuable CUDA registers (opposed to the inheritance solution). At least on CUDA all the accelerator functions could be inlined nevertheless.
+
+Currently we implement version 2.
+
+
+Implementation Notes
+++++++++++++++++++++
+
+Unlike *CUDA*, the *alpaka* library does not differentiate between the kernel function that represents the entry point and other functions that can be executed on the accelerator.
+The entry point function that has to be annotated with ``__global__`` in *CUDA* is internal to the *alpaka* *CUDA* back-end and is not exposed to the user.
+It directly calls into the user supplied kernel function object whose invocation operator is declared with ``ALPAKA_FN_ACC``, which equals ``__device__`` in *CUDA*.
+In this respect there is no difference between the kernel entry point function and any other accelerator function in *alpaka*.
+
+The ``operator()`` of the kernel function object has to be ``const``.
+This is especially important for the *CUDA* back-end, as it could possibly use the constant memory of the GPU to store the function object.
+The constant memory is a fast, cached, read-only memory that is beneficial when all threads uniformly read from the same address at the same time.
+In this case it is as fast as a read from a register.
+
+
+Access to Accelerator-Dependent Functionality
++++++++++++++++++++++++++++++++++++++++++++++
+
+There are two possible ways to implement access to accelerator dependent functionality inside a kernel:
+
+* Making the functions/templates members of the accelerator (maybe by inheritance) and calling them like ``acc.syncThreads()`` or ``acc.template getIdx<Grid, Thread, Dim1>()``.
+  This would require the user to know and understand when to use the template keyword inside dependent type  object function calls.
+* The functions are only light wrappers around traits that can be specialized taking the accelerator as first value (it can not be the last value because of the potential use of variadic arguments).
+  The resulting code would look like ``sync(acc)`` or ``getIdx<Grid, Thread, Dim1>(acc)``.
+  Internally these wrappers would call trait templates that are specialized for the specific accelerator e.g. ``template<typename TAcc> Sync{...};``
+
+The second version is easier to understand and usually shorter to use in user code.
+
+
+Index and Work Division
+-----------------------
+
+*CUDA* requires the user to calculate the global index of the current thread within the grid by hand (already shown as ``axpyCUDA``).
+On the contrary, *OpenCL* provides the methods ``get_global_size``, ``get_global_id``, ``get_local_size`` and ``get_local_id``.
+Called with the required dimension, they return the corresponding local or global index or extent (size).
+In *alpaka* this idea is extended to all dimensions.
+To unify the method interface and to avoid confusion between the differing terms and meanings of the functions in *OpenCL* and *CUDA*, in *alpaka* these methods are template functions.
+
+
+Block Shared Memory
+-------------------
+
+Static Block Shared Memory
+++++++++++++++++++++++++++
+
+The size of block shared memory that is allocated inside the kernel is required to be given as compile time constant.
+This is due to CUDA not allowing to allocate block shared memory inside a kernel at runtime.
+
+Dynamic Block Shared Memory
++++++++++++++++++++++++++++
+
+The size of the external block shared memory is obtained from a trait that can be specialized for each kernel.
+The trait is called with the current kernel invocation parameters and the block-element extent prior to each kernel execution.
+Because the block shared memory size is only ever constant or dependent on the block-element extent or the parameters of the invocation this has multiple advantages:
+
+* It forces the separation of the kernel invocation from the calculation of the required block shared memory size.
+* It lets the user write this calculation once instead of multiple times spread across the code.
diff --git a/thirdParty/cupla/alpaka/docs/source/basic/abstraction.rst b/thirdParty/cupla/alpaka/docs/source/basic/abstraction.rst
new file mode 100644
index 0000000000..6ecd599a81
--- /dev/null
+++ b/thirdParty/cupla/alpaka/docs/source/basic/abstraction.rst
@@ -0,0 +1,275 @@
+Abstraction
+===========
+
+.. note::
+
+   Objective of the abstraction is to separate the parallelization strategy from the algorithm itself.
+   Algorithm code written by users should not depend on any parallelization library or specific strategy.
+   This would enable exchanging the parallelization back-end without any changes to the algorithm itself.
+   Besides allowing to test different parallelization strategies this also makes it possible to port algorithms to new, yet unsupported, platforms.
+
+Parallelism and memory hierarchies at all levels need to be exploited in order to achieve performance portability across various types of accelerators.
+Within this chapter an abstraction will be derived that tries to provide a maximum of parallelism while simultaneously considering implementability and applicability in hardware.
+
+Looking at the current HPC hardware landscape, we often see nodes with multiple sockets/processors extended by accelerators like GPUs or Intel Xeon Phi, each with their own processing units.
+Within a CPU or an Intel Xeon Phi there are cores with hyper-threads, vector units and a large caching infrastructure.
+Within a GPU there are many small cores and only few caches.
+Each entity in the hierarchy has access to different memories.
+For example, each socket / processor manages its RAM, while the cores additionally have non-explicit access to L3, L2 and L1 caches.
+On a GPU there are global, constant, shared and other memory types which all can be accessed explicitly.
+The interface has to abstract from these differences without sacrificing speed on any platform.
+
+A process running on a multi-socket node is the largest entity within *alpaka*.
+The abstraction is only about the task and data parallel execution on the process/node level and down.
+It does not provide any primitives for inter-node communication.
+However, such libraries can be combined with *alpaka*.
+
+An application process always has a main thread and is by definition running on the host.
+It can access the host memory and various accelerator devices.
+Such accelerators can be GPUs, Intel Xeon Phis, the host itself or other devices.
+Thus, the host does not necessarily have to be different from the accelerator device used for the computations.
+For instance, an Intel Xeon Phi simultaneously can be the host and the accelerator device.
+
+The *alpaka* library can be used to offload the parallel execution of task and data parallel work simultaneously onto different accelerator devices.
+
+Task Parallelism
+----------------
+
+One of the basic building blocks of modern applications is task parallelism.
+For example, the operating system scheduler, deciding which thread of which process gets how much processing time on which CPU core, enables task parallelism of applications.
+It controls the execution of different tasks on different processing units.
+Such task parallelism can be, for instance, the output of the progress in parallel to a download.
+This can be implemented via two threads executing two different tasks.
+
+The valid dependencies between tasks within an application can be defined as a DAG (directed acyclic graph) in all cases.
+The tasks are represented by nodes and the dependencies by edges.
+In this model, a task is ready to be executed if the number of incoming edges is zero.
+After a task has finished its work, it is removed from the graph together with all of its outgoing edges.
+This reduces the number of incoming edges of subsequent tasks.
+
+The problem with this model is the inherent overhead and the missing hardware and API support.
+When it is directly implemented as a graph, at least all depending tasks have to be updated and checked if they are ready to be executed after a task finished.
+Depending on the size of the graph and the number of edges this can be a huge overhead.
+
+*OpenCL* allows to define a task graph in a somewhat different way.
+Tasks can be enqueued into an out-of-order command queue combined with events that have to be finished before the newly enqueued task can be started.
+Tasks in the command queue with unmet dependencies are skipped and subsequent ones are executed.
+The ``CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE`` property of a command queue is an optional feature only supported by few vendors.
+Therefore, it can not be assumed to be available on all systems.
+
+*CUDA* on the other hand does currently (version 7.5) not support such out-of-order queues in any way.
+The user has to define dependencies explicitly through the order the tasks are enqueued into the queues (called streams in *CUDA*).
+Within a queue, tasks are always executed in sequential order, while multiple queues are executed in parallel.
+Queues can wait for events enqueued into other queues.
+
+In both APIs, *OpenCL* and *CUDA*, a task graph can be emulated by creating one queue per task and enqueuing a unique event after each task, which can be used to wait for the preceding task.
+However, this is not feasible due to the large queue and event creation costs as well as other overheads within this process.
+
+Therefore, to be compatible with a wide range of APIs, the interface for task parallelism has to be constrained.
+Instead of a general DAG, multiple queues of sequentially executed tasks will be used to describe task parallelism.
+Events that can be enqueued into the queues enhance the basic task parallelism by enabling synchronization between different queues, devices or the host threads.
+
+Data Parallelism
+----------------
+
+In contrast to task parallelism, data parallelism describes the execution of one and the same task on multiple, often related data elements.
+For example, an image color space conversion is a textbook example of a data parallel task.
+The same operation is executed independently on each pixel.
+Other data parallel algorithms additionally introduce dependencies between threads in the input-, intermediate-, or output-data.
+For example, the calculation of a brightness histogram has no input-data dependencies.
+However, all pixel brightness values finally have to be merged into a single result.
+Even these two simple examples show that it is necessary to think about the interaction of parallel entities to minimize the influence of data dependencies.
+
+Furthermore, it is necessary to respect the principles of spatial and temporal locality.
+Current hardware is built around these locality principles to reduce latency by using hierarchical memory as a trade-off between speed and hardware size.
+Multiple levels of caches, from small and very fast ones to very large and slower ones exploit temporal locality by keeping recently referenced data as close to the actual processing units as possible.
+Spatial locality in the main memory is also important for caches because they are usually divided into multiple lines that can only be exchanged one cache line at a time.
+If one data element is loaded and cached, it is highly likely that nearby elements are also cached.
+If the pixels of an image are stored row wise but are read out column wise, the spatial locality assumption of many CPUs is violated and the performance suffers.
+GPUs on the other hand do not have a large caching hierarchy but allow explicit access to a fast memory shared across multiple cores.
+Therefore, the best way to process individual data elements of a data parallel task is dependent on the data structure as well as the underlying hardware.
+
+The main part of the *alpaka* abstraction is the way it abstracts data parallelism and allows the algorithm writer to take into account the hierarchy of processing units, their data parallel features and corresponding memory regions.
+The abstraction developed is influenced by and based on the groundbreaking *CUDA* and *OpenCL* abstractions of a multidimensional grid of threads with additional hierarchy levels in between.
+Another level of parallelism is added to those abstractions to unify the data parallel capabilities of modern hardware architectures.
+The explicit access to all hierarchy levels enables the user to write code that runs performant on all current platforms.
+However, the abstraction does not try to automatically optimize memory accesses or data structures but gives the user full freedom to use data structures matching the underlying hardware preferences.
+
+Thread
+``````
+
+Theoretically, a basic data parallel task can be executed optimally by executing one thread per independent data element.
+In this context, the term thread does not correspond to a native kernel-thread, an *OpenMP* thread, a *CUDA* thread, a user-level thread or any other such threading variant.
+It only represents the execution of a sequence of commands forming the desired algorithm on a per data element level.
+This ideal one-to-one mapping of data elements to threads leads to the execution of a multidimensional grid of threads corresponding to the data structure of the underlying problem.
+The uniform function executed by each of the threads is called a kernel.
+Some algorithms such as reductions require the possibility to synchronize or communicate between threads to calculate a correct result in a time optimal manner.
+Therefore, our basic abstraction requires an n-dimensional grid of synchronizable threads each executing the same kernel.
+The following figure shows a hypothetical processing unit that could optimally execute this data parallel task.
+The threads are mapped one-to-one to the cores of the processor.
+For a time optimal execution, the cores have to have an all-to-all equal length connection for communication and synchronization.
+
+.. image:: /images/thread.png
+
+The only difference between the threads is their positional index into the grid which allows each thread to compute a different part of the solution.
+Threads can always access their private registers and the global memory.
+
+Registers
++++++++++
+
+All variables with default scope within a kernel are automatically saved in registers and are not shared automatically.
+This memory is local to each thread and can not be accessed by other threads.
+
+Global Memory
++++++++++++++
+
+The global memory can be accessed from every thread in the grid as well as from the host thread.
+This is typically the largest but also the slowest memory available.
+
+Individual threads within the grid are allowed to statically or dynamically allocate buffers in the global memory.
+
+Prior to the execution of a task, the host thread copies the input buffers and allocates the output buffers onto the accelerator device.
+Pointers to these buffers then can be given as arguments to the task invocation.
+By using the index of each thread within the grid, the offset into the global input and output buffers can be calculated.
+After the computation has finished, the output buffer can be used either as input to a subsequent task or can be copied back to the host.
+
+Block
+`````
+
+Building a processor with possibly thousands of cores where all cores have an equal length connection for fast communication and synchronization is not viable.
+Either the processor size would have to grow exponentially with the number of cores or the all-to-all communication speed would decrease so much that computations on the processor would be impractical.
+Therefore, the communication and synchronization of threads has to be limited to sizes manageable by real hardware.
+
+The following figure depicts the solution of introducing a new hierarchy level in the abstraction.
+A hypothetical processor is allowed to provide synchronization and fast communication within blocks of threads but is not required to provide synchronization across blocks.
+The whole grid is subdivided into equal sized blocks with a fast but small shared memory.
+Current accelerator abstractions (*CUDA* and *OpenCL*) only support equal sized blocks.
+This restriction could possibly be lifted to support future accelerators with heterogeneous block sizes.
+
+.. image:: /images/block.png
+
+There is another reason why independent blocks are necessary.
+Threads that can communicate and synchronize require either a one-to-one mapping of threads to cores, which is impossible because the number of data elements is theoretically unlimited, or at least a space to store the state of each thread.
+Even old single core CPUs were able to execute many communicating and synchronizing threads by using cooperative or preemptive multitasking.
+Therefore, one might think that a single core would be enough to execute all the data parallel threads.
+But the problem is that even storing the set of registers and local data of all the possible millions of threads of a task grid is not always viable.
+The blocking scheme solves this by enabling fast interaction of threads on a local scale but additionally removes the necessity to store the state of all threads in the grid at once because only threads within a block must be executed in parallel.
+Within a block of cores there still has to be enough memory to store all registers of all contained threads.
+The independence of blocks allows applications to scale well across diverse devices.
+As can be seen in the following figure, the accelerator can assign blocks of the task grid to blocks of cores in arbitrary order depending on availability and workload.
+
+.. image:: /images/block_scale.png
+
+Shared Memory
++++++++++++++
+
+Each block has its own shared memory.
+This memory can only be accessed explicitly by threads within the same block and gets discarded after the complete block finished its calculation.
+This memory is typically very fast but also very small.
+No variables are shared between kernels by default.
+
+Warp
+````
+
+With the current abstraction only independent parallelism via blocks and synchronizable parallelism via threads can be expressed.
+However, there are more variants of parallelism in real hardware.
+Because all threads in the grid are executing the same kernel and even the same instruction at the same time when ignoring divergent control flows, a lot of chip space can be saved.
+Multiple threads can be executed in perfect synchronicity, which is also called lock-step.
+A group of such threads executing the same instruction at the same time is called a warp.
+All threads within a warp share a single instruction pointer (IP), and all cores executing the threads share one instruction fetch (IF) and instruction decode (ID) unit.
+
+.. image:: /images/warp.png
+
+Even threads with divergent control flows can be executed within one warp.
+*CUDA*, for example, solves this by supporting predicated execution and warp voting.
+For long conditional branches the compiler inserts code which checks if all threads in the warp take the same branch.
+For small branches, where this is too expensive, all threads always execute all branches.
+Control flow statements result in a predicate and only in those threads where it is true, the predicated instructions will have an effect.
+
+Not only *CUDA* GPUs support the execution of multiple threads in a warp.
+Full blown vector processors with good compilers are capable of combining multiple loop iterations containing complex control flow statements in a similar manner as *CUDA*.
+
+Due to the synchronicity of threads within a warp, memory operations will always occur at the same time in all threads.
+This allows to coalesce memory accesses.
+Different *CUDA* devices support different levels of memory coalescing.
+Older ones only supported combining multiple memory accesses if they were aligned and sequential in the order of thread indices.
+Newer ones support unaligned scattered accesses as long as they target the same 128 byte segment.
+
+The ability of very fast context switches between warps and a queue of ready warps allows *CUDA* capable GPUs to hide the latency of global memory operations.
+
+Element
+```````
+
+To use the maximum available computing power of, for example, a modern x86 processor, the computation has to utilize the SIMD vector registers.
+Many current architectures support issuing a single instruction that can be applied to multiple data elements in parallel.
+
+The original x86 instruction set architecture did not support SIMD instructions but has been enhanced with MMX (64 bit width registers), SSE (128 bit width registers), AVX (256 bit width registers) and AVX-512 (512 bit width registers) extensions.
+In varying degree, they allow to process multiple 32 bit and 64 bit floating point numbers as well as 8, 16, 32 and 64 bit signed and unsigned integers.
+
+*CUDA* capable GPUs do not have vector registers where multiple values of type ``float`` or ``double`` can be manipulated by one instruction.
+Nevertheless, newer *CUDA* capable devices implement basic SIMD instructions on pairs of 16 bit values and quads of 8-bit values.
+They are described in the documentation of the `PTX instruction set architecture <https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-instructions>`_ chapter 9.7.13 but are only of any use in very special problem domains, for example for deep learning.
+
+It would be optimal if the compiler could automatically vectorize our kernels when they are called in a loop and vectorization is supported by the underlying accelerator.
+However, besides full blown vector processors, mainstream CPUs do not support predicated execution or similar complex things within vector registers.
+At most, there is support for masking operations which allow to emulate at least some conditional branching.
+Therefore, this missing hardware capability has to be circumvented by the compiler.
+There are scientific research projects such as the work done by Ralf Karrenberg et al [`1 <https://compilers.cs.uni-saarland.de/publications/theses/karrenberg_msc.pdf>`_, `2 <https://compilers.cs.uni-saarland.de/projects/wfv/wfv_cgo11_slides.pdf>`_, `3 <https://compilers.cs.uni-saarland.de/papers/karrenberg_opencl.pdf>`_ ] building on the *LLVM* compiler infrastructure supporting such whole-function vectorization.
+However, current mainstream compilers do not support automatic vectorization of basic, non trivial loops containing control flow statements (``if``, ``else``, ``for``, etc.) or other non-trivial memory operations.
+Therefore, it has to be made easier for the compiler to recognize the vectorization possibilities by making it more explicit.
+
+The opposite of automatic whole function vectorization is the fully explicit vectorization of expressions via compiler intrinsics directly resulting in the desired assembly instruction.
+A big problem when trying to utilize fully explicit vectorization is that there is no common foundation supported by all explicit vectorization methods.
+A wrapper unifying the x86 SIMD intrinsics found in the ``intrin.h`` or ``x86intrin.h`` headers with those supported on other platforms, for example ARM NEON (``arm_neon.h``), PowerPC Altivec (``altivec.h``) or *CUDA* is not available and to write one is a huge task in itself.
+However, if this would become available in the future, it could easily be integrated into *alpaka* kernels.
+
+Due to current compilers being unable to vectorize whole functions and the explicit vectorization intrinsics not being portable, one has to rely on the vectorization capabilities of current compilers for primitive loops only consisting of a few computations.
+By creating a grid of data elements, where multiple elements are processed per thread and threads are pooled in independent blocks, as it is shown in the figure below, the user is free to loop sequentially over the elements or to use vectorization for selected expressions within the kernel.
+Even the sequential processing of multiple elements per thread can be useful depending on the architecture.
+For example, the *NVIDIA cuBLAS* general matrix-matrix multiplication (GEMM) internally executes only one thread for each second matrix data element to better utilize the registers available per thread.
+
+.. image:: /images/element.png
+
+.. note::
+   The best solution to vectorization would be one, where the user does not have to do anything.
+   This is not possible because the smallest unit supplied by the user is a kernel which is executed in threads which can synchronize.
+
+   It is not possible to execute multiple kernels sequentially to hide the vectorization by starting a kernel-thread for e.g. each 4th thread in a block and then looping over the 4 entries.
+   This would prohibit the synchronization between these threads.
+   By executing 4 fibers inside such a vectorization kernel-thread we would allow synchronization again but prevent the loop vectorizer from working.
+   
+Summary
+-------
+
+This abstraction is called *Redundant Hierarchical Parallelism*.
+This term is inspired by the paper *The Future of Accelerator Programming: Abstraction, Performance or Can We Have Both?*
+`PDF <http://olab.is.s.u-tokyo.ac.jp/~kamil.rocki/rocki_burtscher_sac14.pdf>`_
+`DOI <https://dx.doi.org/10.1109/ICPADS.2013.76>`_
+It investigates a similar *concept of copious parallel programming* reaching 80%-90% of the native performance while comparing CPU and GPU centric versions of an *OpenCL* n-body simulation with a general version utilizing parallelism on multiple hierarchy levels.
+
+The *CUDA* or *OpenCL* abstractions themselves are very similar to the one designed in the previous sections and consist of all but the Element level.
+However, as has been shown, all five abstraction hierarchy levels are necessary to fully utilize current architectures.
+By emulating unsupported or ignoring redundant levels of parallelism, algorithms written with this abstraction can always be mapped optimally to all supported accelerators. The following table summarizes the characteristics of the proposed hierarchy levels.
+
+    +-----------------+-----------------------+----------------+
+    | Hierarchy Level | Parallelism           | Synchronizable |
+    +=================+=======================+================+
+    | grid            | sequential / parallel | -- / X         |
+    +-----------------+-----------------------+----------------+
+    | block           | parallel              | --             |
+    +-----------------+-----------------------+----------------+
+    | warp            | parallel              | X              |
+    +-----------------+-----------------------+----------------+
+    | thread          | parallel / lock-step  | X              |
+    +-----------------+-----------------------+----------------+
+    | element         | sequential            | --             |
+    +-----------------+-----------------------+----------------+
+
+Depending on the queue a task is enqueued into, grids will either run in sequential order within the same queue or in parallel in different queues.
+They can be synchronized by using events.
+Blocks can not be synchronized and therefore can use the whole spectrum of parallelism ranging from fully parallel up to fully sequential execution depending on the device.
+Warps combine the execution of multiple threads in lock-step and can be synchronized implicitly by synchronizing the threads they contain.
+Threads within a block are executed in parallel warps and each thread computes a number of data elements sequentially.
+
diff --git a/thirdParty/cupla/alpaka/docs/source/basic/cheatsheet.rst b/thirdParty/cupla/alpaka/docs/source/basic/cheatsheet.rst
new file mode 100644
index 0000000000..08e9d4cba5
--- /dev/null
+++ b/thirdParty/cupla/alpaka/docs/source/basic/cheatsheet.rst
@@ -0,0 +1,281 @@
+Cheatsheet
+==========
+
+.. only:: html
+
+   Download pdf version :download:`here <../../cheatsheet/cheatsheet.pdf>`
+
+General
+-------
+
+- Getting alpaka: https://github.com/alpaka-group/alpaka
+- Issue tracker, questions, support: https://github.com/alpaka-group/alpaka/issues
+- All alpaka names are in namespace ``alpaka`` and header file ``alpaka/alpaka.hpp``
+- This document assumes
+
+  .. code-block:: c++
+
+     #include <alpaka/alpaka.hpp>
+     using namespace alpaka;
+
+.. raw:: pdf
+
+   Spacer 0,5
+
+Accelerator and Device
+----------------------
+
+Define in-kernel thread indexing type
+  .. code-block:: c++
+
+    using Dim = DimInt<constant>;
+    using Idx = IntegerType;
+
+Define accelerator type (CUDA, OpenMP, etc.)
+  .. code-block:: c++
+
+    using Acc = AcceleratorType<Dim,Idx>;
+
+  AcceleratorType:
+     .. code-block:: c++
+
+	AccGpuCudaRt,
+	AccCpuOmp2Blocks,
+	AccCpuOmp2Threads,
+	AccCpuOmp4,
+	AccCpuTbbBlocks,
+	AccCpuThreads,
+	AccCpuFibers,
+	AccCpuSerial
+
+
+Select device for the given accelerator by index
+   .. code-block:: c++
+
+      auto const device = getDevByIdx<Acc>(index);
+
+
+Queue and Events
+----------------
+
+Create a queue for a device
+  .. code-block:: c++
+
+    using Queue = Queue<Acc, Property>;
+    auto queue = Queue{device};
+
+  Property:
+     .. code-block:: c++
+
+	Blocking
+	NonBlocking
+
+Put a task for execution
+  .. code-block:: c++
+
+    enqueue(queue, task);
+
+Wait for all operations in the queue
+  .. code-block:: c++
+
+    wait(queue);
+
+Create an event
+  .. code-block:: c++
+
+     Event<Queue> event{device};
+
+Put an event to the queue
+  .. code-block:: c++
+
+     enqueue(queue, event);
+
+Check if the event is completed
+  .. code-block:: c++
+
+     isComplete(event);
+
+Wait for the event (and all operations put to the same queue before it)
+  .. code-block:: c++
+
+     wait(event);
+
+Memory
+------
+
+Memory allocation and transfers are symmetric for host and devices, both done via alpaka API
+
+Create a CPU device for memory allocation on the host side
+  .. code-block:: c++
+
+     auto const devHost = getDevByIdx<DevCpu>(0u);
+
+Allocate a buffer in host memory
+  .. code-block:: c++
+
+     Vec<Dim, Idx> extent = value;
+     using BufHost = Buf<DevHost, DataType, Dim, Idx>;
+     BufHost bufHost = allocBuf<DataType, Idx>(devHost, extent);
+
+(Optional, affects CPU – GPU memory copies) Prepare it for asynchronous memory copies
+  .. code-block:: c++
+
+     prepareForAsyncCopy(bufHost);
+
+Get a raw pointer to a buffer for initialization, etc.
+  .. code-block:: c++
+
+     DataType * raw = view::getPtrNative(bufHost);
+
+Allocate a buffer in device memory
+  .. code-block:: c++
+
+     auto bufDevice = allocBuf<DataType, Idx>(device, extent);
+
+Enqueue a memory copy from host to device
+  .. code-block:: c++
+
+     memcpy(queue, bufDevice, bufHost, extent);
+
+Enqueue a memory copy from device to host
+  .. code-block:: c++
+
+     memcpy(queue, bufHost, bufDevice, extent);
+
+.. raw:: pdf
+
+   PageBreak
+
+Kernel Execution
+----------------
+
+Automatically select a valid kernel launch configuration
+  .. code-block:: c++
+
+     Vec<Dim, Idx> const globalThreadExtent = vectorValue;
+     Vec<Dim, Idx> const elementsPerThread = vectorValue;
+
+     auto autoWorkDiv = getValidWorkDiv<Acc>(
+       device,
+       globalThreadExtent, elementsPerThread,
+       false,
+       GridBlockExtentSubDivRestrictions::Unrestricted);
+
+Manually set a kernel launch configuration
+  .. code-block:: c++
+
+     Vec<Dim, Idx> const blocksPerGrid = vectorValue;
+     Vec<Dim, Idx> const threadsPerBlock = vectorValue;
+     Vec<Dim, Idx> const elementsPerThread = vectorValue;
+
+     using WorkDiv = WorkDivMembers<Dim, Idx>;
+     auto manualWorkDiv = WorkDiv{blocksPerGrid,
+                                  threadsPerBlock,
+				  elementsPerThread};
+
+Instantiate a kernel and create a task that will run it (does not launch it yet)
+  .. code-block:: c++
+
+     Kernel kernel{argumentsForConstructor};
+     auto taskRunKernel = createTaskKernel<Acc>(workDiv,
+                                                        kernel,
+							parameters);
+
+The ``acc`` parameter of the kernel is provided automatically and does not need to be specified here
+
+Put the kernel for execution
+  .. code-block:: c++
+
+     enqueue(queue, taskRunKernel);
+
+Kernel Implementation
+---------------------
+
+Define a kernel as a C++ functor
+  .. code-block:: c++
+
+     struct Kernel {
+        template<typename Acc>
+        ALPAKA_FN_ACC void operator()(Acc const & acc, parameters) const { ... }
+     };
+
+``ALPAKA_FN_ACC`` is required for kernels and functions called inside, ``acc`` is the mandatory first parameter, its type is the template parameter
+
+Access multi-dimensional indices and extents of blocks, threads, and elements
+  .. code-block:: c++
+
+     auto idx = getIdx<Origin, Unit>(acc);
+     auto extent = getWorkDiv<Origin, Unit>(acc);
+
+  Origin:
+     .. code-block:: c++
+
+	Grid, Block, Thread
+
+  Unit:
+     .. code-block:: c++
+
+	Blocks, Threads, Elems
+
+Access components of multi-dimensional indices and extents
+  .. code-block:: c++
+
+     auto idxX = idx[0];
+
+Linearize multi-dimensional vectors
+  .. code-block:: c++
+
+     auto linearIdx = mapIdx<1u>(idx, extent);
+
+.. raw:: pdf
+
+   Spacer 0,8
+
+Allocate static shared memory variable
+  .. code-block:: c++
+
+     Type & var = declareSharedVar<Type, __COUNTER__>(acc);
+
+Get dynamic shared memory pool, requires the kernel to specialize
+  .. code-block:: c++
+
+     traits::BlockSharedMemDynSizeBytes
+       Type * dynamicSharedMemoryPool = getDynSharedMem<Type>(acc);
+
+Synchronize threads of the same block
+  .. code-block:: c++
+
+     block::sync::syncBlockThreads(acc);
+
+Atomic operations
+  .. code-block:: c++
+
+     auto result = atomicOp<Operation>(acc,
+                                       arguments,
+                                       OperationHierarchy{});
+
+  Operation (all in `op`):
+     .. code-block:: c++
+
+	namespace op
+           Add, Sub, Min, Max, Exch, Inc, Dec, And, Or, Xor, Cas
+
+  OperationHierarchy (all in hierarchy):
+     .. code-block:: c++
+
+	namespace hierarchy
+	   Threads, Blocks, Grids
+
+Math functions take acc as additional first argument
+  .. code-block:: c++
+
+     math::sin(acc, argument);
+
+Similar for other math functions.
+
+Generate random numbers
+  .. code-block:: c++
+
+     auto distribution = rand::distribution::createNormalReal<double>(acc);
+     auto generator = rand::generator::createDefault(acc, seed, subsequence);
+     auto number = distribution(generator);
diff --git a/thirdParty/cupla/alpaka/docs/source/basic/install.rst b/thirdParty/cupla/alpaka/docs/source/basic/install.rst
new file mode 100644
index 0000000000..1d199c9571
--- /dev/null
+++ b/thirdParty/cupla/alpaka/docs/source/basic/install.rst
@@ -0,0 +1,46 @@
+.. highlight:: bash
+
+Installation
+============
+
+* Clone alpaka from github.com
+
+.. code-block::
+
+  git clone https://github.com/alpaka-group/alpaka
+  cd alpaka
+
+* Install alpaka
+
+.. code-block::
+
+  # git clone https://github.com/alpaka-group/alpaka
+  # cd alpaka
+  mkdir build && cd build
+  cmake -DCMAKE_INSTALL_PREFIX=/install/ ..
+  cmake --install .
+
+* Configure Accelerators
+
+.. code-block::
+
+  # ..
+  cmake -DALPAKA_ACC_GPU_CUDA_ENABLE=ON ..
+
+* Build an example
+
+.. code-block::
+
+  # ..
+  cmake -Dalpaka_BUILD_EXAMPLES=ON ..
+  make vectorAdd
+  ./example/vectorAdd/vectorAdd # execution
+
+* Build and run tests
+
+.. code-block::
+
+  # ..
+  cmake -DBUILD_TESTING=ON ..
+  make
+  ctest
diff --git a/thirdParty/cupla/alpaka/docs/source/basic/intro.rst b/thirdParty/cupla/alpaka/docs/source/basic/intro.rst
new file mode 100644
index 0000000000..fed72512bc
--- /dev/null
+++ b/thirdParty/cupla/alpaka/docs/source/basic/intro.rst
@@ -0,0 +1,90 @@
+Introduction
+============
+
+The *alpaka* library defines and implements an abstract interface for the *hierarchical redundant parallelism* model.
+This model exploits task- and data-parallelism as well as memory hierarchies at all levels of current multi-core architectures.
+This makes it possible to achieve performance portability across various types of accelerators by ignoring specific unsupported levels and utilizing only the ones supported on a specific accelerator.
+All hardware types (multi- and many-core CPUs, GPUs and other accelerators) are treated and can be programmed in the same way.
+The *alpaka* library provides back-ends for *CUDA*, *OpenMP*, *Boost.Fiber* and other methods.
+The policy-based C++ template interface provided allows for straightforward user-defined extension of the library to support other accelerators.
+
+The library name *alpaka* is an acronym standing for **A**\ bstraction **L**\ ibrary for **Pa**\ rallel **K**\ ernel **A**\ cceleration.
+
+Example
+-------
+
+.. literalinclude:: ../../../example/helloWorld/src/helloWorld.cpp
+   :language: C++
+   :caption: helloWorld.cpp
+
+.. code-block:: cmake
+   :caption: CMakeLists.txt
+
+   cmake_minimum_required(VERSION 3.15)
+
+   set(_TARGET_NAME helloWorld)
+   project(${_TARGET_NAME})
+
+   find_package(alpaka REQUIRED)
+
+   alpaka_add_executable(${_TARGET_NAME} helloWorld.cpp)
+   target_link_libraries(
+     ${_TARGET_NAME}
+     PUBLIC alpaka::alpaka)
+
+You can integrate alpaka into your project via ``find_package()`` in your ``CMakeLists.txt``.
+This requires that you :doc:`install </basic/install>` alpaka.
+If you do not install alpaka in a default path such as ``/usr/local/`` you have to set the ``CMake`` argument ``-Dalpaka_ROOT=/path/to/alpaka/install``.
+
+The cmake configuration decides which alpaka accelerators are available during compilation. For example, if you configure your ``cmake`` build with the CUDA back-end (``-DALPAKA_ACC_GPU_CUDA_ENABLE=ON``), ``cmake`` checks whether the CUDA SDK is available and, if it is found, the C++ template ``alpaka::acc::AccGpuCudaRt`` is available during compilation.
+
+About alpaka
+------------
+
+alpaka is ...
+~~~~~~~~~~~~~
+
+Abstract
+   It describes parallel execution on multiple hierarchy levels. It allows implementing a mapping to various hardware architectures but is not an optimal mapping itself.
+
+Sustainable
+   *alpaka* decouples the application from the availability of different accelerator frameworks in different versions, such as OpenMP, CUDA, HIP, etc. (50% on the way to reach full performance portability).
+
+Heterogeneous
+   An identical algorithm / kernel can be executed on heterogeneous parallel systems by selecting the target device. This allows the best performance for each algorithm and/or a good utilization of the system without major code changes.
+
+Maintainable
+   *alpaka* allows to provide a single version of the algorithm / kernel that can be used by all back-ends. There is no need for "copy and paste" kernels with different API calls for different accelerators. All the accelerator dependent implementation details are hidden within the *alpaka* library.
+
+Testable
+   Due to the easy back-end switch, no special hardware is required for testing the kernels. Even if the simulation itself always uses the *CUDA* back-end, the tests can completely run on a CPU. As long as the *alpaka* library is thoroughly tested for compatibility between the acceleration back-ends, the user simulation code is guaranteed to generate identical results (ignoring rounding errors / non-determinism) and is portable without any changes.
+
+Optimizable
+   Everything in *alpaka* can be replaced by user code to optimize for special use-cases.
+
+Extensible
+   Every concept described by the *alpaka* abstraction can be implemented by users. Therefore it is possible to non-intrusively define new devices, queues, buffer types or even whole accelerator back-ends.
+
+Data Structure Agnostic
+   The user can use and define arbitrary data structures.
+
+alpaka does not ...
+~~~~~~~~~~~~~~~~~~~
+
+Automatically provide an optimal mapping of kernels to various acceleration platforms
+   Except in trivial examples an optimal execution always depends on suitable selected data structures. An adaptive selection of data structures is a separate topic that has to be implemented in a distinct library.
+
+Automatically optimize concurrent data access
+   *alpaka* does not provide features to create optimized memory layouts.
+
+Handle differences in arithmetic operations
+   For example, due to **different rounding** or different implementations of floating point operations, results can differ slightly between accelerators.
+
+Guarantee determinism of results
+   Due to the freedom of the library to reorder or repartition the threads within the tasks it is not possible or even desired to preserve deterministic results. For example, the non-associativity of floating point operations gives non-deterministic results within and across accelerators.
+
+The *alpaka* library is aimed at parallelization on shared memory, i.e. within nodes of a cluster.
+It does not compete with libraries for distribution of processes across nodes and communication among those.
+For these purposes libraries like MPI (Message Passing Interface) or others should be used.
+MPI is situated one layer higher and can be combined with *alpaka* to facilitate the hardware of a whole heterogeneous cluster.
+The *alpaka* library can be used for parallelization within nodes, MPI for parallelization across nodes.
diff --git a/thirdParty/cupla/alpaka/docs/source/basic/library.rst b/thirdParty/cupla/alpaka/docs/source/basic/library.rst
new file mode 100644
index 0000000000..0f06ab1b20
--- /dev/null
+++ b/thirdParty/cupla/alpaka/docs/source/basic/library.rst
@@ -0,0 +1,162 @@
+Library Interface
+=================
+
+As described in the chapter about the :doc:`Abstraction </basic/abstraction>`, the general design of the library is very similar to *CUDA* and *OpenCL* but extends both by some points, while not requiring any language extensions.
+General interface design as well as interface implementation decisions differentiating *alpaka* from those libraries are described in the Rationale section.
+It uses C++ because it is one of the most performant languages available on nearly all systems.
+Furthermore, C++14 allows to describe the concepts in a very abstract way that is not possible with many other languages.
+The *alpaka* library extensively makes use of advanced functional C++ template meta-programming techniques.
+The Implementation Details section discusses the C++ library and the way it provides extensibility and optimizability.
+
+Structure
+---------
+
+The *alpaka* library allows offloading of computations from the host execution domain to the accelerator execution domain, whereby they are allowed to be identical.
+
+In the abstraction hierarchy the library code is interleaved with user supplied code as is depicted in the following figure.
+
+.. image:: /images/execution_domain.png
+   :alt: Execution Domains
+
+User code invokes library functions, which in turn execute the user provided thread function (kernel) in parallel on the accelerator.
+The kernel in turn calls library functions when accessing accelerator properties and methods.
+Additionally, the user can enhance or optimize the library implementations by extending or replacing specific parts.
+
+The *alpaka* abstraction itself only defines requirements a type has to fulfill to be usable with the template functions the library provides.
+These type constraints are called concepts in C++.
+
+*A concept is a set of requirements consisting of valid expressions, associated types, invariants, and complexity guarantees.
+A type that satisfies the requirements is said to model the concept.
+A concept can extend the requirements of another concept, which is called refinement.* `BoostConcepts <https://www.boost.org/community/generic_programming.html>`_
+
+Concepts allow to safely define polymorphic algorithms that work with objects of many different types.
+
+The *alpaka* library implements a stack of concepts and their interactions modeling the abstraction defined in the previous chapter.
+Furthermore, default implementations for various devices and accelerators modeling those are included in the library.
+The interaction of the main user facing concepts can be seen in the following figure.
+
+.. image:: /images/structure_assoc.png
+   :alt: user / alpaka code interaction
+
+
+For each type of ``Device`` there is a ``Platform`` for enumerating the available ``Device``\ s.
+A ``Device`` is the requirement for creating ``Queues`` and ``Events`` as it is for allocating ``Buffers`` on the respective ``Device``. ``Buffers`` can be copied, their memory be set and they can be pinned or mapped.
+Copying and setting a buffer requires the corresponding ``Copy`` and ``Set`` tasks to be enqueued into the ``Queue``.
+An ``Event`` can be enqueued into a ``Queue`` and its completion state can be queried by the user.
+It is possible to wait for (synchronize with) a single ``Event``, a ``Queue`` or a whole ``Device``.
+An ``Executor`` can be enqueued into a ``Queue`` and will execute the ``Kernel`` (after all previous tasks in the queue have been completed).
+The ``Kernel`` in turn has access to the ``Accelerator`` it is running on.
+The ``Accelerator`` provides the ``Kernel`` with its current index in the block or grid, their extents or other data as well as it allows to allocate shared memory, execute atomic operations and many more.
+
+
+Interface Usage
+---------------
+
+Accelerator Functions
+`````````````````````
+
+Functions that should be executable on an accelerator have to be annotated with the execution domain (one of ``ALPAKA_FN_HOST``, ``ALPAKA_FN_ACC`` and ``ALPAKA_FN_HOST_ACC``).
+They most probably also require access to the accelerator data and methods, such as indices and extents as well as functions to allocate shared memory and to synchronize all threads within a block.
+Therefore the accelerator has to be passed in as a templated constant reference parameter as can be seen in the following code snippet.
+
+.. code-block:: cpp
+
+   template<
+       typename TAcc>
+   ALPAKA_FN_ACC auto doSomethingOnAccelerator(
+       TAcc const & acc/*,
+       ...*/)                  // Arbitrary number of parameters
+   -> int                      // Arbitrary return type
+   {
+       //...
+   }
+
+
+Kernel Definition
+`````````````````
+
+A kernel is a special function object which has to conform to the following requirements:
+
+* it has to fulfill the ``std::is_trivially_copyable`` trait (has to be copyable via memcpy)
+* the ``operator()`` is the kernel entry point
+  * it has to be an accelerator executable function
+  * it has to return ``void``.
+  * its first argument has to be the accelerator (templated for arbitrary accelerator back-ends).
+
+The following code snippet shows a basic example of a kernel function object.
+
+.. code-block:: cpp
+
+   struct MyKernel
+   {
+       template<
+           typename TAcc>       // Templated on the accelerator type.
+       ALPAKA_FN_ACC            // Macro marking the function to be executable on all accelerators.
+       auto operator()(         // The function / kernel to execute.
+           TAcc const & acc/*,  // The specific accelerator implementation.
+           ...*/) const         // Must be 'const'.
+       -> void
+       {
+           //...
+       }
+                         // Class can have members but has to be std::is_trivially_copyable.
+                         // Classes must not have pointers or references to host memory!
+   };
+
+The kernel function object is shared across all threads in all blocks.
+Due to the block execution order being undefined, there is no safe and consistent way of altering state that is stored inside of the function object.
+Therefore, the ``operator()`` of the kernel function object has to be ``const`` and is not allowed to modify any of the object members.
+
+
+Index and Work Division
+```````````````````````
+
+The ``alpaka::getWorkDiv`` and the ``alpaka::getIdx`` functions both return a vector of the dimensionality the accelerator has been defined with.
+They are parametrized by the origin of the calculation as well as the unit in which the values are calculated.
+For example, ``alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc)`` returns a vector with the extents of the grid in units of threads.
+
+
+Memory Management
+`````````````````
+
+The memory allocation function of the *alpaka* library (``alpaka::allocBuf<TElem>(device, extents)``) is uniform for all devices, even for the host device.
+It does not return raw pointers but reference counted memory buffer objects that remove the necessity for manual freeing and the possibility of memory leaks.
+Additionally the memory buffer objects know their extents, their pitches as well as the device they reside on.
+This allows buffers that possibly reside on different devices with different pitches to be copied only by providing the buffer objects as well as the extents of the region to copy (``alpaka::memcpy(bufDevA, bufDevB, copyExtents)``).
+
+Kernel Execution
+````````````````
+
+The following source code listing shows the execution of a kernel by enqueuing the execution task into a queue.
+
+.. code-block:: cpp
+
+   // Define the dimensionality of the task.
+   using Dim = alpaka::DimInt<1u>;
+   // Define the type of the indexes.
+   using Idx = std::size_t;
+   // Define the accelerator to use.
+   using Acc = alpaka::AccCpuSerial<Dim, Idx>;
+   // Select the queue type.
+   using Queue = alpaka::QueueCpuNonBlocking;
+
+   // Select a device to execute on.
+   auto devAcc(alpaka::getDevByIdx<alpaka::PltfCpu>(0));
+   // Create a queue to enqueue the execution into.
+   Queue queue(devAcc);
+
+   // Create a 1-dimensional work division with 256 blocks of 16 threads each.
+   auto const workDiv(alpaka::WorkDivMembers<Dim, Idx>(256u, 16u));
+   // Create an instance of the kernel function object.
+   MyKernel kernel;
+   // Enqueue the execution task into the queue.
+   alpaka::exec<Acc>(queue, workDiv, kernel/*, arguments ...*/);
+
+The dimensionality of the task as well as the type for index and extent have to be defined explicitly.
+Following this, the type of accelerator to execute on, as well as the type of the queue have to be defined.
+For both of these types instances have to be created.
+For the accelerator this has to be done indirectly by enumerating the required device via the device manager, whereas the queue can be created directly.
+
+To execute the kernel, an instance of the kernel function object has to be constructed.
+Following this, an execution task combining the work division (grid and block sizes) with the kernel function object and the bound invocation arguments has to be created.
+After that this task can be enqueued into a queue for immediate or later execution (depending on the queue used).
diff --git a/thirdParty/cupla/alpaka/docs/source/conf.py b/thirdParty/cupla/alpaka/docs/source/conf.py
new file mode 100644
index 0000000000..7f7ec2fbe9
--- /dev/null
+++ b/thirdParty/cupla/alpaka/docs/source/conf.py
@@ -0,0 +1,173 @@
+# -*- coding: utf-8 -*-
+# Configuration file for the Sphinx documentation builder.
+
+import os
+import subprocess
+
+
+# -- Project information -----------------------------------------------------
+
+project = 'alpaka'
+copyright = 'Documentation under CC-BY 4.0, Benjamin Worpitz, René Widera, Axel Huebl, Michael Bussmann'
+author = 'Benjamin Worpitz, René Widera, Axel Huebl, Michael Bussmann'
+# The short version string (currently the same value as the full release below).
+version = u'0.6.0'
+# The full version, including alpha/beta/rc tags.
+release = u'0.6.0'
+
+# Name of the master toctree document (given without a file suffix).
+master_doc = 'index'
+
+# -- General configuration ---------------------------------------------------
+
+on_rtd = os.environ.get('READTHEDOCS', None) == 'True'  # True only if READTHEDOCS is exactly the string 'True'
+
+show_authors = True
+
+# Add any Sphinx extension module names here, as strings. They can be
+# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
+# ones. Commented-out entries are currently disabled.
+extensions = [
+    'sphinx.ext.mathjax',
+#    'sphinx.ext.napoleon',
+    'breathe',
+    'sphinxcontrib.programoutput',
+#    'matplotlib.sphinxext.plot_directive'
+]
+
+# Add any paths that contain templates here, relative to this directory.
+templates_path = ['_templates']
+
+# List of patterns, relative to source directory, that match files and
+# directories to ignore when looking for source files.
+# These patterns also affect html_static_path and html_extra_path.
+exclude_patterns = ["Thumbs.db", ".DS_Store"]
+
+source_suffix = ['.rst']
+master_doc = 'index'
+language = None
+
+# The name of the Pygments (syntax highlighting) style to use.
+pygments_style = 'sphinx'  # alternative: 'default'
+
+# If true, `todo` and `todoList` produce output, else they produce nothing.
+todo_include_todos = False  # NOTE(review): presumably only relevant if 'sphinx.ext.todo' is enabled - confirm
+
+# -- Options for HTML output -------------------------------------------------
+
+# The theme to use for HTML and HTML Help pages.  See the documentation for
+# a list of builtin themes.
+#
+html_theme = 'sphinx_rtd_theme'
+
+# Add any paths that contain custom static files (such as style sheets) here,
+# relative to this directory. They are copied after the builtin static files,
+# so a file named "default.css" will overwrite the builtin "default.css".
+html_static_path = ['_static']
+
+# Additional style sheet(s) used to modify the HTML layout of the Sphinx docs.
+html_css_files = ["custom.css"]
+
+html_logo = "../logo/alpaka.svg"  # logo image used for the HTML output
+html_theme_options = {
+    "logo_only"  : True  # option handed to the theme selected in html_theme
+}
+
+# -- Options for HTMLHelp output ---------------------------------------------
+
+htmlhelp_basename = 'alpakadoc'
+
+# -- Options for LaTeX output ------------------------------------------------
+
+latex_elements = {
+    # The paper size ('letterpaper' or 'a4paper').
+    #
+    # 'papersize': 'letterpaper',
+    'papersize': 'a4paper',
+
+    # The font size ('10pt', '11pt' or '12pt').
+    #
+    # 'pointsize': '10pt',
+
+    # Additional stuff for the LaTeX preamble.
+    #
+    # 'preamble': '',
+    'preamble': r'\setcounter{tocdepth}{2}',  # sets the LaTeX tocdepth counter to 2
+
+    # Latex figure (float) alignment
+    #
+    # 'figure_align': 'htbp',
+}
+latex_logo = "../logo/alpaka.pdf"
+
+# Grouping the document tree into LaTeX files. List of tuples
+# (source start file, target name, title,
+#  author, documentclass [howto, manual, or own class]).
+latex_documents = [
+    (master_doc, 'alpaka-doc.tex', u'alpaka Documentation',
+     u'The alpaka Community', 'manual'),
+]
+
+# -- Options for manual page output ------------------------------------------
+
+# One entry per manual page. List of tuples
+# (source start file, name, description, authors, manual section).
+man_pages = [
+    (master_doc, 'alpaka', u'alpaka Documentation',
+     [author], 1)  # authors as a one-element list, manual section 1
+]
+
+# -- Options for Texinfo output ----------------------------------------------
+
+# Grouping the document tree into Texinfo files. List of tuples
+# (source start file, target name, title, author,
+#  dir menu entry, description, category)
+texinfo_documents = [
+    (master_doc, 'alpaka', u'alpaka Documentation',
+     author, 'alpaka', 'Abstraction Library for Parallel Kernel Acceleration',
+     """
+     The alpaka library is a header-only C++14 abstraction library for
+     accelerator development. Its aim is to provide performance portability
+     across accelerators through the abstraction (not hiding!) of the underlying
+     levels of parallelism.
+     """),
+]
+
+# -- Options for Epub output -------------------------------------------------
+
+# A list of files that should not be packed into the epub file.
+epub_exclude_files = ['search.html']
+
+
+# -- Extension configuration -------------------------------------------------
+
+breathe_projects = { "alpaka": "../doxygen/xml" }
+breathe_default_project = "alpaka"
+
+breathe_domain_by_extension = {  # maps a file extension to the domain name used for it
+    "cpp":   "cpp",
+    "h":     "cpp",
+    "hpp":   "cpp",
+    "tpp":   "cpp"
+}
+
+# Declare the alpaka annotation macros as C++ id attributes,
+# because breathe has problems parsing C++ attributes.
+cpp_id_attributes = ["ALPAKA_FN_ACC",
+                     "ALPAKA_FN_HOST",
+                     "ALPAKA_FN_HOST_ACC",
+                     "ALPAKA_FN_INLINE",
+                     "ALPAKA_NO_HOST_ACC_WARNING",
+                     "ALPAKA_STATIC_ACC_MEM_CONSTANT",
+                     "ALPAKA_STATIC_ACC_MEM_GLOBAL",
+                     ]
+
+# -- processing --
+
+if on_rtd:
+    subprocess.call('cd ..; doxygen', shell=True)  # runs doxygen one directory above the working directory; exit status is ignored
+    subprocess.call('cd ../cheatsheet; rst2pdf -s cheatsheet.style ../source/basic/cheatsheet.rst -o cheatsheet.pdf', shell=True)  # builds cheatsheet.pdf; exit status is ignored
+else:
+    import sphinx_rtd_theme  # only imported when not on Read the Docs
+    html_theme = "sphinx_rtd_theme"
+    html_theme_path = [sphinx_rtd_theme.get_html_theme_path()]
diff --git a/thirdParty/cupla/alpaka/docs/source/dev/backends.rst b/thirdParty/cupla/alpaka/docs/source/dev/backends.rst
new file mode 100644
index 0000000000..36caa60792
--- /dev/null
+++ b/thirdParty/cupla/alpaka/docs/source/dev/backends.rst
@@ -0,0 +1,650 @@
+.. highlight:: bash
+
+Back-ends
+=========
+
+Accelerator Implementations
+```````````````````````````
+The table shows which native implementation or information is used to represent an alpaka functionality.
+
+.. table::
+
+    +---------------------------------------------------------------+-----------------------------------------------+---------------------------------------------------------------------------------+--------------------------------------------------------------------------------+-------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------+
+    | alpaka                                                        | Serial                                        | std::thread                                                                     | Boost.Fiber                                                                    | OpenMP 2.0                                                                          | OpenMP 4.0                                                                                                                            | CUDA 9.0+                                        |
+    +===============================================================+===============================================+=================================================================================+================================================================================+=====================================================================================+=======================================================================================================================================+==================================================+
+    | Devices                                                       | Host Core                                     | Host Cores                                                                      | Host Core                                                                      | Host Cores                                                                          | Host Cores                                                                                                                            | NVIDIA GPUs                                      |
+    +---------------------------------------------------------------+-----------------------------------------------+---------------------------------------------------------------------------------+--------------------------------------------------------------------------------+-------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------+
+    | Lib/API                                                       | standard C++                                  | std::thread                                                                     | boost::fibers::fiber                                                           | OpenMP 2.0                                                                          | OpenMP 4.0                                                                                                                            | CUDA 9.0+                                        |
+    +---------------------------------------------------------------+-----------------------------------------------+---------------------------------------------------------------------------------+--------------------------------------------------------------------------------+-------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------+
+    | Kernel execution                                              | sequential                                    | std::thread(kernel)                                                             | boost::fibers::fiber(kernel)                                                   | omp_set_dynamic(0), #pragma omp parallel num_threads(iNumKernelsInBlock)            | #pragma omp target, #pragma omp teams num_teams(...) thread_limit(...), #pragma omp distribute, #pragma omp parallel num_threads(...) | cudaConfigureCall, cudaSetupArgument, cudaLaunch |
+    +---------------------------------------------------------------+-----------------------------------------------+---------------------------------------------------------------------------------+--------------------------------------------------------------------------------+-------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------+
+    | Execution strategy grid-blocks                                | sequential                                    | sequential                                                                      | sequential                                                                     | sequential                                                                          | undefined                                                                                                                             | undefined                                        |
+    +---------------------------------------------------------------+-----------------------------------------------+---------------------------------------------------------------------------------+--------------------------------------------------------------------------------+-------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------+
+    | Execution strategy block-kernels                              | sequential                                    | preemptive multitasking                                                         | cooperative multithreading                                                     | preemptive multitasking                                                             | preemptive multitasking                                                                                                               | lock-step within warps                           |
+    +---------------------------------------------------------------+-----------------------------------------------+---------------------------------------------------------------------------------+--------------------------------------------------------------------------------+-------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------+
+    | getIdx                                                        | emulated                                      | block-kernel: mapping of std::this_thread::get_id() grid-block: member variable | block-kernel: mapping of std::this_fiber::get_id() grid-block: member variable | block-kernel: omp_get_thread_num() to 3D index mapping grid-block: member variable  | block-kernel: omp_get_thread_num() to 3D index mapping grid-block: member variable                                                    | threadIdx, blockIdx                              |
+    +---------------------------------------------------------------+-----------------------------------------------+---------------------------------------------------------------------------------+--------------------------------------------------------------------------------+-------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------+
+    | getExtent                                                     | member variables                              | member variables                                                                | member variables                                                               | member variables                                                                    | member variables                                                                                                                      | gridDim, blockDim                                |
+    +---------------------------------------------------------------+-----------------------------------------------+---------------------------------------------------------------------------------+--------------------------------------------------------------------------------+-------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------+
+    | getBlockSharedMemDynSizeBytes                                 | allocated in memory prior to kernel execution | allocated in memory prior to kernel execution                                   | allocated in memory prior to kernel execution                                  | allocated in memory prior to kernel execution                                       | allocated in memory prior to kernel execution                                                                                         | __shared__                                       |
+    +---------------------------------------------------------------+-----------------------------------------------+---------------------------------------------------------------------------------+--------------------------------------------------------------------------------+-------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------+
+    | allocBlockSharedMem                                           | master thread allocates                       | syncBlockKernels -> master thread allocates -> syncBlockKernels                 | syncBlockKernels -> master thread allocates -> syncBlockKernels                | syncBlockKernels -> master thread allocates -> syncBlockKernels                     | syncBlockKernels -> master thread allocates -> syncBlockKernels                                                                       | __shared__                                       |
+    +---------------------------------------------------------------+-----------------------------------------------+---------------------------------------------------------------------------------+--------------------------------------------------------------------------------+-------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------+
+    | syncBlockKernels                                              | not required                                  | barrier                                                                         | barrier                                                                        | #pragma omp barrier                                                                 | #pragma omp barrier                                                                                                                   | __syncthreads                                    |
+    +---------------------------------------------------------------+-----------------------------------------------+---------------------------------------------------------------------------------+--------------------------------------------------------------------------------+-------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------+
+    | atomicOp                                                      | hierarchy dependent                           | std::lock_guard< std::mutex >                                                   | n/a                                                                            | #pragma omp critical                                                                | #pragma omp critical                                                                                                                  | atomicXXX                                        |
+    +---------------------------------------------------------------+-----------------------------------------------+---------------------------------------------------------------------------------+--------------------------------------------------------------------------------+-------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------+
+    | ALPAKA_FN_HOST_ACC, ALPAKA_FN_ACC, ALPAKA_FN_HOST             | inline                                        | inline                                                                          | inline                                                                         | inline                                                                              | inline                                                                                                                                | __device__, __host__, __forceinline__            |
+    +---------------------------------------------------------------+-----------------------------------------------+---------------------------------------------------------------------------------+--------------------------------------------------------------------------------+-------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------+
+
+Serial
+``````
+
+The serial accelerator only allows blocks with exactly one thread.
+Therefore it does not implement real synchronization or atomic primitives.
+
+Threads
+```````
+
+Execution
++++++++++
+
+To prevent recreation of the threads between execution of different blocks in the grid, the threads are stored inside a thread pool.
+This thread pool is local to the invocation because making it local to the KernelExecutor could mean heavy memory usage and lots of idle kernel-threads when there are multiple KernelExecutors around.
+Because the default policy of the threads in the pool is to yield instead of waiting, this would also slow down the system immensely.
+
+Fibers
+``````
+
+Execution
++++++++++
+
+To prevent recreation of the fibers between execution of different blocks in the grid, the fibers are stored inside a fiber pool.
+This fiber pool is local to the invocation because making it local to the KernelExecutor could mean heavy memory usage when there are multiple KernelExecutors around.
+
+OpenMP
+``````
+
+Execution
++++++++++
+
+Parallel execution of the kernels in a block is required because when syncBlockThreads is called all of them have to be done with their work up to this line.
+So we have to spawn one real thread per kernel in a block.
+``omp for`` is not useful because it is meant for cases where multiple iterations are executed by one thread but in our case a 1:1 mapping is required.
+Therefore we use ``omp parallel`` with the specified number of threads in a block.
+Another reason for not using ``omp for`` like ``#pragma omp parallel for collapse(3) num_threads(blockDim.x*blockDim.y*blockDim.z)`` is that ``#pragma omp barrier`` used for intra block synchronization is not allowed inside ``omp for`` blocks.
+
+Because OpenMP is designed for a 1:1 abstraction of hardware to software threads, the block size is restricted by the number of OpenMP threads allowed by the runtime.
+This could be as few as 2 or 4 kernels but on a system with 4 cores and hyper-threading OpenMP can also allow 64 threads.
+
+Index
++++++
+
+OpenMP only provides a linear thread index. This index is converted to a 3-dimensional index at runtime.
+
+Atomic
+++++++
+
+We cannot use ``#pragma omp atomic`` because braces or calling other functions directly after ``#pragma omp atomic`` are not allowed.
+Because we are implementing the CUDA atomic operations which return the old value, this requires ``#pragma omp critical`` to be used.
+``omp_set_lock`` is an alternative but is usually slower.
+
+CUDA
+````
+
+Nearly all CUDA functionality can be directly mapped to alpaka function calls.
+A major difference is that CUDA requires the block and grid sizes to be given in (x, y, z) order. alpaka uses the mathematical C/C++ array indexing scheme [z][y][x]. In both cases x is the innermost / fast running index.
+
+Furthermore alpaka does not require the indices and extents to be 3-dimensional.
+The accelerators are templatized on and support arbitrary dimensionality.
+NOTE: Currently the CUDA implementation is restricted to a maximum of 3 dimensions!
+
+NOTE: You have to be careful when mixing alpaka and non-alpaka CUDA code. The CUDA-accelerator back-end can change the current CUDA device and will NOT set the device back to the one prior to the invocation of the alpaka function.
+
+
+Programming Interface
+---------------------
+
+*Function Attributes*
+
+.. table::
+
+   +-----------------------------------------------------+---------------------------------------------------------+
+   | CUDA                                                | alpaka                                                  |
+   +=====================================================+=========================================================+
+   | ``__host__``                                        | ``ALPAKA_FN_HOST``                                      |
+   +-----------------------------------------------------+---------------------------------------------------------+
+   | ``__device__``                                      | ``ALPAKA_FN_ACC``                                       |
+   +-----------------------------------------------------+---------------------------------------------------------+
+   | ``__global__``                                      | ``ALPAKA_FN_ACC``                                       |
+   +-----------------------------------------------------+---------------------------------------------------------+
+   | ``__host__ __device__``                             | ``ALPAKA_FN_HOST_ACC``                                  |
+   +-----------------------------------------------------+---------------------------------------------------------+
+
+.. note::
+
+   You cannot call CUDA-only methods, except when ``ALPAKA_ACC_GPU_CUDA_ONLY_MODE`` is enabled.
+
+*Memory*
+
+.. table::
+
+   +-----------------------------------------------------+----------------------------------------------------------------------------+
+   | CUDA                                                | alpaka                                                                     |
+   +=====================================================+============================================================================+
+   | ``__shared__``                                      | ``alpaka::declareSharedVar<std::uint32_t, __COUNTER__>(acc)``              |
+   +-----------------------------------------------------+----------------------------------------------------------------------------+
+   | ``__constant__``                                    | ``ALPAKA_STATIC_ACC_MEM_CONSTANT``                                         |
+   +-----------------------------------------------------+----------------------------------------------------------------------------+
+   | ``__device__``                                      | ``ALPAKA_STATIC_ACC_MEM_GLOBAL``                                           |
+   +-----------------------------------------------------+----------------------------------------------------------------------------+
+
+.. doxygenfunction:: alpaka::declareSharedVar
+   :project: alpaka
+
+.. doxygendefine:: ALPAKA_STATIC_ACC_MEM_CONSTANT
+   :project: alpaka
+
+.. doxygendefine:: ALPAKA_STATIC_ACC_MEM_GLOBAL
+   :project: alpaka
+
+*Index / Work Division*
+
+.. table::
+
+    +---------------------------------+----------------------------------------------------------------------------------+
+    | CUDA                            | alpaka                                                                           |
+    +=================================+==================================================================================+
+    | ``threadIdx``                   | ``alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)``                          |
+    +---------------------------------+----------------------------------------------------------------------------------+
+    | ``blockIdx``                    | ``alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)``                            |
+    +---------------------------------+----------------------------------------------------------------------------------+
+    | ``blockDim``                    | ``alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc)``                      |
+    +---------------------------------+----------------------------------------------------------------------------------+
+    | ``gridDim``                     | ``alpaka::getWorkDiv<alpaka::Grid, alpaka::Blocks>(acc)``                        |
+    +---------------------------------+----------------------------------------------------------------------------------+
+    | ``warpSize``                    | ``alpaka::warp::getSize(acc)``                                                   |
+    +---------------------------------+----------------------------------------------------------------------------------+
+
+*Types*
+
+.. table::
+
+    +----------+-------------------------------------+
+    | CUDA     | alpaka                              |
+    +==========+=====================================+
+    | ``dim3`` | ``alpaka::Vec< TDim, TVal >``       |
+    +----------+-------------------------------------+
+
+
+
+CUDA Runtime API
+++++++++++++++++
+
+The following tables list the functions available in the `CUDA Runtime API <https://docs.nvidia.com/cuda/cuda-runtime-api/modules.html#modules>`_ and their equivalent alpaka functions:
+
+*Device Management*
+
+.. table::
+
+    +---------------------------------+-----------------------------------------------------------------------+
+    | CUDA                            | alpaka                                                                |
+    +=================================+=======================================================================+
+    | cudaChooseDevice                | --                                                                    |
+    +---------------------------------+-----------------------------------------------------------------------+
+    | cudaDeviceGetAttribute          | --                                                                    |
+    +---------------------------------+-----------------------------------------------------------------------+
+    | cudaDeviceGetByPCIBusId         | --                                                                    |
+    +---------------------------------+-----------------------------------------------------------------------+
+    | cudaDeviceGetCacheConfig        | --                                                                    |
+    +---------------------------------+-----------------------------------------------------------------------+
+    | cudaDeviceGetLimit              | --                                                                    |
+    +---------------------------------+-----------------------------------------------------------------------+
+    | cudaDeviceGetP2PAttribute       | --                                                                    |
+    +---------------------------------+-----------------------------------------------------------------------+
+    | cudaDeviceGetPCIBusId           | --                                                                    |
+    +---------------------------------+-----------------------------------------------------------------------+
+    | cudaDeviceGetSharedMemConfig    | --                                                                    |
+    +---------------------------------+-----------------------------------------------------------------------+
+    | cudaDeviceGetQueuePriorityRange | --                                                                    |
+    +---------------------------------+-----------------------------------------------------------------------+
+    | cudaDeviceReset                 | alpaka::reset(device)                                                 |
+    +---------------------------------+-----------------------------------------------------------------------+
+    | cudaDeviceSetCacheConfig        | --                                                                    |
+    +---------------------------------+-----------------------------------------------------------------------+
+    | cudaDeviceSetLimit              | --                                                                    |
+    +---------------------------------+-----------------------------------------------------------------------+
+    | cudaDeviceSetSharedMemConfig    | --                                                                    |
+    +---------------------------------+-----------------------------------------------------------------------+
+    | cudaDeviceSynchronize           | void alpaka::wait(device)                                             |
+    +---------------------------------+-----------------------------------------------------------------------+
+    | cudaGetDevice                   | n/a (no current device)                                               |
+    +---------------------------------+-----------------------------------------------------------------------+
+    | cudaGetDeviceCount              | std::size_t alpaka::getDevCount< TPltf >()                            |
+    +---------------------------------+-----------------------------------------------------------------------+
+    | cudaGetDeviceFlags              | --                                                                    |
+    +---------------------------------+-----------------------------------------------------------------------+
+    | cudaGetDeviceProperties         | alpaka::getAccDevProps(dev) (Only some properties available)          |
+    +---------------------------------+-----------------------------------------------------------------------+
+    | cudaIpcCloseMemHandle           | --                                                                    |
+    +---------------------------------+-----------------------------------------------------------------------+
+    | cudaIpcGetEventHandle           | --                                                                    |
+    +---------------------------------+-----------------------------------------------------------------------+
+    | cudaIpcGetMemHandle             | --                                                                    |
+    +---------------------------------+-----------------------------------------------------------------------+
+    | cudaIpcOpenEventHandle          | --                                                                    |
+    +---------------------------------+-----------------------------------------------------------------------+
+    | cudaIpcOpenMemHandle            | --                                                                    |
+    +---------------------------------+-----------------------------------------------------------------------+
+    | cudaSetDevice                   | n/a (no current device)                                               |
+    +---------------------------------+-----------------------------------------------------------------------+
+    | cudaSetDeviceFlags              | --                                                                    |
+    +---------------------------------+-----------------------------------------------------------------------+
+    | cudaSetValidDevices             | --                                                                    |
+    +---------------------------------+-----------------------------------------------------------------------+
+
+
+*Error Handling*
+
+.. table::
+
+    +---------------------+----------------------------------------------------------+
+    | CUDA                | alpaka                                                   |
+    +=====================+==========================================================+
+    | cudaGetErrorName    | n/a (handled internally, available in exception message) |
+    +---------------------+----------------------------------------------------------+
+    | cudaGetErrorString  | n/a (handled internally, available in exception message) |
+    +---------------------+----------------------------------------------------------+
+    | cudaGetLastError    | n/a (handled internally)                                 |
+    +---------------------+----------------------------------------------------------+
+    | cudaPeekAtLastError | n/a (handled internally)                                 |
+    +---------------------+----------------------------------------------------------+
+
+
+*Queue Management*
+
+.. table::
+
+    +------------------------------+---------------------------------------------------------+
+    | CUDA                         | alpaka                                                  |
+    +==============================+=========================================================+
+    | cudaStreamAddCallback        | alpaka::enqueue(queue, [](){dosomething();})            |
+    +------------------------------+---------------------------------------------------------+
+    | cudaStreamAttachMemAsync     | --                                                      |
+    +------------------------------+---------------------------------------------------------+
+    | cudaStreamCreate             | - queue=alpaka::QueueCudaRtNonBlocking(device);         |
+    | \                            | - queue=alpaka::QueueCudaRtBlocking(device);            |
+    +------------------------------+---------------------------------------------------------+
+    | cudaStreamCreateWithFlags    | see cudaStreamCreate (cudaStreamNonBlocking hard coded) |
+    +------------------------------+---------------------------------------------------------+
+    | cudaStreamCreateWithPriority | --                                                      |
+    +------------------------------+---------------------------------------------------------+
+    | cudaStreamDestroy            | n/a (Destructor)                                        |
+    +------------------------------+---------------------------------------------------------+
+    | cudaStreamGetFlags           | --                                                      |
+    +------------------------------+---------------------------------------------------------+
+    | cudaStreamGetPriority        | --                                                      |
+    +------------------------------+---------------------------------------------------------+
+    | cudaStreamQuery              | bool alpaka::empty(queue)                               |
+    +------------------------------+---------------------------------------------------------+
+    | cudaStreamSynchronize        | void alpaka::wait(queue)                                |
+    +------------------------------+---------------------------------------------------------+
+    | cudaStreamWaitEvent          | void alpaka::wait(queue, event)                         |
+    +------------------------------+---------------------------------------------------------+
+
+*Event Management*
+
+.. table::
+
+    +--------------------------+--------------------------------------------+
+    | CUDA                     | alpaka                                     |
+    +==========================+============================================+
+    | cudaEventCreate          | alpaka::Event< TQueue > event(dev);        |
+    +--------------------------+--------------------------------------------+
+    | cudaEventCreateWithFlags | --                                         |
+    +--------------------------+--------------------------------------------+
+    | cudaEventDestroy         | n/a (Destructor)                           |
+    +--------------------------+--------------------------------------------+
+    | cudaEventElapsedTime     | --                                         |
+    +--------------------------+--------------------------------------------+
+    | cudaEventQuery           | bool alpaka::isComplete(event)             |
+    +--------------------------+--------------------------------------------+
+    | cudaEventRecord          | void alpaka::enqueue(queue, event)         |
+    +--------------------------+--------------------------------------------+
+    | cudaEventSynchronize     | void alpaka::wait(event)                   |
+    +--------------------------+--------------------------------------------+
+
+*Memory Management*
+
+.. table::
+
+    +----------------------------+--------------------------------------------------------------------------------------------+
+    | CUDA                       | alpaka                                                                                     |
+    +============================+============================================================================================+
+    | cudaArrayGetInfo           | --                                                                                         |
+    +----------------------------+--------------------------------------------------------------------------------------------+
+    | cudaFree                   | n/a (automatic memory management with reference counted memory handles)                    |
+    +----------------------------+--------------------------------------------------------------------------------------------+
+    | cudaFreeArray              | --                                                                                         |
+    +----------------------------+--------------------------------------------------------------------------------------------+
+    | cudaFreeHost               | n/a                                                                                        |
+    +----------------------------+--------------------------------------------------------------------------------------------+
+    | cudaFreeMipmappedArray     | --                                                                                         |
+    +----------------------------+--------------------------------------------------------------------------------------------+
+    | cudaGetMipmappedArrayLevel | --                                                                                         |
+    +----------------------------+--------------------------------------------------------------------------------------------+
+    | cudaGetSymbolAddress       | --                                                                                         |
+    +----------------------------+--------------------------------------------------------------------------------------------+
+    | cudaGetSymbolSize          | --                                                                                         |
+    +----------------------------+--------------------------------------------------------------------------------------------+
+    | cudaHostAlloc              | n/a, the existing buffer can be pinned using alpaka::prepareForAsyncCopy(memBuf)           |
+    +----------------------------+--------------------------------------------------------------------------------------------+
+    | cudaHostGetDevicePointer   | --                                                                                         |
+    +----------------------------+--------------------------------------------------------------------------------------------+
+    | cudaHostGetFlags           | --                                                                                         |
+    +----------------------------+--------------------------------------------------------------------------------------------+
+    | cudaHostRegister           | --                                                                                         |
+    +----------------------------+--------------------------------------------------------------------------------------------+
+    | cudaHostUnregister         | --                                                                                         |
+    +----------------------------+--------------------------------------------------------------------------------------------+
+    | cudaMalloc                 | alpaka::allocBuf<TElement>(device, extents1D)                                              |
+    +----------------------------+--------------------------------------------------------------------------------------------+
+    | cudaMalloc3D               | alpaka::allocBuf<TElement>(device, extents3D)                                              |
+    +----------------------------+--------------------------------------------------------------------------------------------+
+    | cudaMalloc3DArray          | --                                                                                         |
+    +----------------------------+--------------------------------------------------------------------------------------------+
+    | cudaMallocArray            | --                                                                                         |
+    +----------------------------+--------------------------------------------------------------------------------------------+
+    | cudaMallocHost             | alpaka::allocBuf<TElement>(device, extents) 1D, 2D, 3D supported!                          |
+    +----------------------------+--------------------------------------------------------------------------------------------+
+    | cudaMallocManaged          | --                                                                                         |
+    +----------------------------+--------------------------------------------------------------------------------------------+
+    | cudaMallocMipmappedArray   | --                                                                                         |
+    +----------------------------+--------------------------------------------------------------------------------------------+
+    | cudaMallocPitch            | alpaka::allocBuf<TElement>(device, extents2D)                                              |
+    +----------------------------+--------------------------------------------------------------------------------------------+
+    | cudaMemAdvise              | --                                                                                         |
+    +----------------------------+--------------------------------------------------------------------------------------------+
+    | cudaMemGetInfo             | - alpaka::getMemBytes                                                                      |
+    |                            | - alpaka::getFreeMemBytes                                                                  |
+    +----------------------------+--------------------------------------------------------------------------------------------+
+    | cudaMemPrefetchAsync       | --                                                                                         |
+    +----------------------------+--------------------------------------------------------------------------------------------+
+    | cudaMemRangeGetAttribute   | --                                                                                         |
+    +----------------------------+--------------------------------------------------------------------------------------------+
+    | cudaMemRangeGetAttributes  | --                                                                                         |
+    +----------------------------+--------------------------------------------------------------------------------------------+
+    | cudaMemcpy                 | alpaka::memcpy(memBufDst, memBufSrc, extents1D)                                            |
+    +----------------------------+--------------------------------------------------------------------------------------------+
+    | cudaMemcpy2D               | alpaka::memcpy(memBufDst, memBufSrc, extents2D)                                            |
+    +----------------------------+--------------------------------------------------------------------------------------------+
+    | cudaMemcpy2DArrayToArray   | --                                                                                         |
+    +----------------------------+--------------------------------------------------------------------------------------------+
+    | cudaMemcpy2DAsync          | alpaka::memcpy(memBufDst, memBufSrc, extents2D, queue)                                     |
+    +----------------------------+--------------------------------------------------------------------------------------------+
+    | cudaMemcpy2DFromArray      | --                                                                                         |
+    +----------------------------+--------------------------------------------------------------------------------------------+
+    | cudaMemcpy2DFromArrayAsync | --                                                                                         |
+    +----------------------------+--------------------------------------------------------------------------------------------+
+    | cudaMemcpy2DToArray        | --                                                                                         |
+    +----------------------------+--------------------------------------------------------------------------------------------+
+    | cudaMemcpy2DToArrayAsync   | --                                                                                         |
+    +----------------------------+--------------------------------------------------------------------------------------------+
+    | cudaMemcpy3D               | alpaka::memcpy(memBufDst, memBufSrc, extents3D)                                            |
+    +----------------------------+--------------------------------------------------------------------------------------------+
+    | cudaMemcpy3DAsync          | alpaka::memcpy(memBufDst, memBufSrc, extents3D, queue)                                     |
+    +----------------------------+--------------------------------------------------------------------------------------------+
+    | cudaMemcpy3DPeer           | alpaka::memcpy(memBufDst, memBufSrc, extents3D)                                            |
+    +----------------------------+--------------------------------------------------------------------------------------------+
+    | cudaMemcpy3DPeerAsync      | alpaka::memcpy(memBufDst, memBufSrc, extents3D, queue)                                     |
+    +----------------------------+--------------------------------------------------------------------------------------------+
+    | cudaMemcpyArrayToArray     | --                                                                                         |
+    +----------------------------+--------------------------------------------------------------------------------------------+
+    | cudaMemcpyAsync            | alpaka::memcpy(memBufDst, memBufSrc, extents1D, queue)                                     |
+    +----------------------------+--------------------------------------------------------------------------------------------+
+    | cudaMemcpyFromArray        | --                                                                                         |
+    +----------------------------+--------------------------------------------------------------------------------------------+
+    | cudaMemcpyFromArrayAsync   | --                                                                                         |
+    +----------------------------+--------------------------------------------------------------------------------------------+
+    | cudaMemcpyFromSymbol       | --                                                                                         |
+    +----------------------------+--------------------------------------------------------------------------------------------+
+    | cudaMemcpyFromSymbolAsync  | --                                                                                         |
+    +----------------------------+--------------------------------------------------------------------------------------------+
+    | cudaMemcpyPeer             | alpaka::memcpy(memBufDst, memBufSrc, extents1D)                                            |
+    +----------------------------+--------------------------------------------------------------------------------------------+
+    | cudaMemcpyPeerAsync        | alpaka::memcpy(memBufDst, memBufSrc, extents1D, queue)                                     |
+    +----------------------------+--------------------------------------------------------------------------------------------+
+    | cudaMemcpyToArray          | --                                                                                         |
+    +----------------------------+--------------------------------------------------------------------------------------------+
+    | cudaMemcpyToArrayAsync     | --                                                                                         |
+    +----------------------------+--------------------------------------------------------------------------------------------+
+    | cudaMemcpyToSymbol         | --                                                                                         |
+    +----------------------------+--------------------------------------------------------------------------------------------+
+    | cudaMemcpyToSymbolAsync    | --                                                                                         |
+    +----------------------------+--------------------------------------------------------------------------------------------+
+    | cudaMemset                 | alpaka::memset(memBufDst, byte, extents1D)                                                 |
+    +----------------------------+--------------------------------------------------------------------------------------------+
+    | cudaMemset2D               | alpaka::memset(memBufDst, byte, extents2D)                                                 |
+    +----------------------------+--------------------------------------------------------------------------------------------+
+    | cudaMemset2DAsync          | alpaka::memset(memBufDst, byte, extents2D, queue)                                          |
+    +----------------------------+--------------------------------------------------------------------------------------------+
+    | cudaMemset3D               | alpaka::memset(memBufDst, byte, extents3D)                                                 |
+    +----------------------------+--------------------------------------------------------------------------------------------+
+    | cudaMemset3DAsync          | alpaka::memset(memBufDst, byte, extents3D, queue)                                          |
+    +----------------------------+--------------------------------------------------------------------------------------------+
+    | cudaMemsetAsync            | alpaka::memset(memBufDst, byte, extents1D, queue)                                          |
+    +----------------------------+--------------------------------------------------------------------------------------------+
+    | makecudaExtent             | --                                                                                         |
+    +----------------------------+--------------------------------------------------------------------------------------------+
+    | makecudaPitchedPtr         | --                                                                                         |
+    +----------------------------+--------------------------------------------------------------------------------------------+
+    | makecudaPos                | --                                                                                         |
+    +----------------------------+--------------------------------------------------------------------------------------------+
+    | cudaMemcpyHostToDevice     | n/a (direction of copy is determined automatically)                                        |
+    +----------------------------+--------------------------------------------------------------------------------------------+
+    | cudaMemcpyDeviceToHost     | n/a (direction of copy is determined automatically)                                        |
+    +----------------------------+--------------------------------------------------------------------------------------------+
+
+
+*Execution Control*
+
+.. table::
+
+    +----------------------------+--------------------------------------------------------------------------------------------------------------+
+    | CUDA                       | alpaka                                                                                                       |
+    +============================+==============================================================================================================+
+    | cudaFuncGetAttributes      | --                                                                                                           |
+    +----------------------------+--------------------------------------------------------------------------------------------------------------+
+    | cudaFuncSetCacheConfig     | --                                                                                                           |
+    +----------------------------+--------------------------------------------------------------------------------------------------------------+
+    | cudaFuncSetSharedMemConfig | --                                                                                                           |
+    +----------------------------+--------------------------------------------------------------------------------------------------------------+
+    | cudaLaunchKernel           | - alpaka::exec<TAcc>(queue, workDiv, kernel, params...)                                                      |
+    | \                          | - auto byteDynSharedMem = alpaka::getBlockSharedMemDynSizeBytes(kernel, ...)                                 |
+    +----------------------------+--------------------------------------------------------------------------------------------------------------+
+    | cudaSetDoubleForDevice     | n/a (alpaka assumes double support)                                                                          |
+    +----------------------------+--------------------------------------------------------------------------------------------------------------+
+    | cudaSetDoubleForHost       | n/a (alpaka assumes double support)                                                                          |
+    +----------------------------+--------------------------------------------------------------------------------------------------------------+
+
+*Occupancy*
+
+.. table::
+
+    +--------------------------------------------------------+--------+
+    | CUDA                                                   | alpaka |
+    +========================================================+========+
+    | cudaOccupancyMaxActiveBlocksPerMultiprocessor          | --     |
+    +--------------------------------------------------------+--------+
+    | cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags | --     |
+    +--------------------------------------------------------+--------+
+
+
+*Unified Addressing*
+
+.. table::
+
+    +--------------------------+--------+
+    | CUDA                     | alpaka |
+    +==========================+========+
+    | cudaPointerGetAttributes | --     |
+    +--------------------------+--------+
+
+
+*Peer Device Memory Access*
+
+.. table::
+
+    +-----------------------------+----------------------------------+
+    | CUDA                        | alpaka                           |
+    +=============================+==================================+
+    | cudaDeviceCanAccessPeer     | --                               |
+    +-----------------------------+----------------------------------+
+    | cudaDeviceDisablePeerAccess | --                               |
+    +-----------------------------+----------------------------------+
+    | cudaDeviceEnablePeerAccess  | automatically done when required |
+    +-----------------------------+----------------------------------+
+
+**OpenGL, Direct3D, VDPAU, EGL, Graphics Interoperability**
+
+*not available*
+
+**Texture/Surface Reference/Object Management**
+
+*not available*
+
+**Version Management**
+
+*not available*
+
+
+
+HIP
+```
+
+.. warning::
+
+   The HIP documentation is outdated and needs to be reworked.
+
+Current Restrictions on HCC platform
+++++++++++++++++++++++++++++++++++++
+
+- Workaround for unsupported ``syncthreads_{count|and|or}``.
+
+  - Uses temporary shared value and atomics
+
+- Workaround for buggy ``hipStreamQuery``, ``hipStreamSynchronize``.
+
+  - Introduces own queue management
+  - ``hipStreamQuery`` and ``hipStreamSynchronize`` do not work in multithreaded environment
+
+- Workaround for missing ``cuStreamWaitValue32``.
+
+  - Polls the value every 10 ms
+
+- Device constant memory not supported yet
+- Note that ``printf`` in kernels is still not supported in HIP
+- Exclude ``hipMalloc3D`` and ``hipMallocPitch`` when size is zero, otherwise they throw an Unknown Error
+- ``TestAccs`` excludes 3D specialization of HIP back-end for now because ``verifyBytesSet`` fails in ``memView`` for 3D specialization
+- ``dim3`` structure is not available on device (use ``alpaka::Vec`` instead)
+- Constructors' attributes unified with destructors'.
+
+  - Host/device signature must match in HIP(HCC)
+
+- A chain of functions must also provide correct host-device signatures
+
+  - E.g. a host function cannot be called from a host-device function
+
+- Recompile your target when HCC linker returned the error:
+  "File format not recognized
+  clang-7: error: linker command failed with exit code 1"
+- If a compile error occurred, the linker may still link, but without the device code
+- AMD device architecture currently hardcoded in ``alpakaConfig.cmake``
+
+Compiling HIP from Source
++++++++++++++++++++++++++
+
+Follow `HIP Installation`_ guide for installing HIP.
+HIP requires either *nvcc* or *hcc* to be installed on your system (see guide for further details).
+
+.. _HIP Installation: https://github.com/ROCm-Developer-Tools/HIP/blob/master/INSTALL.md
+
+- If you want the HIP binaries to be located in a directory that does not require superuser access, be sure to change the install directory of HIP by modifying the ``CMAKE_INSTALL_PREFIX`` cmake variable.
+- Also, after the installation is complete, add the following line to the ``.profile`` file in your home directory, in order to add the path to the HIP binaries to PATH: ``PATH=$PATH:<path_to_binaries>``
+
+.. code-block::
+
+   git clone --recursive https://github.com/ROCm-Developer-Tools/HIP.git
+   cd HIP
+   mkdir -p build
+   cd build
+   cmake -DCMAKE_BUILD_TYPE="${CMAKE_BUILD_TYPE}" -DCMAKE_INSTALL_PREFIX=${YOUR_HIP_INSTALL_DIR} -DBUILD_TESTING=OFF ..
+   make
+   make install
+
+- Set the appropriate paths (edit ``${YOUR_**}`` variables)
+
+.. code-block::
+
+  # HIP_PATH required by HIP tools
+  export HIP_PATH=${YOUR_HIP_INSTALL_DIR}
+  # Paths required by HIP tools
+  export CUDA_PATH=${YOUR_CUDA_ROOT}
+  # - if required, path to HCC compiler. Default /opt/rocm/hcc.
+  export HCC_HOME=${YOUR_HCC_ROOT}
+  # - if required, path to HSA include, lib. Default /opt/rocm/hsa.
+  export HSA_PATH=${YOUR_HSA_PATH}
+  # HIP binaries and libraries
+  export PATH=${HIP_PATH}/bin:$PATH
+  export LD_LIBRARY_PATH=${HIP_PATH}/lib64:${LD_LIBRARY_PATH}
+
+- Test the HIP binaries
+
+.. code-block::
+
+  # calls nvcc or hcc
+  which hipcc
+  hipcc -V
+  which hipconfig
+  hipconfig -v
+
+
+Verifying HIP Installation
+++++++++++++++++++++++++++
+
+- If PATH points to the location of the HIP binaries, the following command should list several relevant environment variables, and also the selected compiler on your system: ``hipconfig -f``
+- Compile and run the `square sample`_, as pointed out in the original `HIP install guide`_.
+
+.. _square sample: https://github.com/ROCm-Developer-Tools/HIP/tree/master/samples/0_Intro/square
+.. _HIP install guide: https://github.com/ROCm-Developer-Tools/HIP/blob/master/INSTALL.md#user-content-verify-your-installation
+
+Compiling Examples with HIP Back End
+++++++++++++++++++++++++++++++++++++
+
+As of now, the back-end has only been tested on the NVIDIA platform.
+
+* NVIDIA Platform
+
+  * One issue in this branch of alpaka is that the host compiler flags don't propagate to the device compiler, as they do in CUDA. This is because a counterpart to the ``CUDA_PROPAGATE_HOST_FLAGS`` cmake variable has not been defined in the FindHIP.cmake file.
+    alpaka forwards the host compiler flags in cmake to the ``HIP_NVCC_FLAGS`` cmake variable, which also takes user-given flags. To add flags to this variable, toggle the advanced mode in ``ccmake``.
+
+
+Random Number Generator Library rocRAND for HIP Back End
+++++++++++++++++++++++++++++++++++++++++++++++++++++++++
+
+*rocRAND* provides an interface for HIP, where the cuRAND or rocRAND API is called depending on the chosen HIP platform (can be configured with cmake in alpaka).
+
+Clone the rocRAND repository, then build and install it
+
+.. code-block::
+
+  git clone https://github.com/ROCmSoftwarePlatform/rocRAND
+  cd rocRAND
+  mkdir -p build
+  cd build
+  cmake -DCMAKE_INSTALL_PREFIX=${HIP_PATH} -DBUILD_BENCHMARK=OFF -DBUILD_TEST=OFF -DCMAKE_MODULE_PATH=${HIP_PATH}/cmake ..
+  make
+
+
+The ``CMAKE_MODULE_PATH`` is a cmake variable for locating module finding scripts like *FindHIP.cmake*.
+The paths to the *rocRAND* library and include directories should be appended to the ``CMAKE_PREFIX_PATH`` variable.
diff --git a/thirdParty/cupla/alpaka/docs/source/dev/details.rst b/thirdParty/cupla/alpaka/docs/source/dev/details.rst
new file mode 100644
index 0000000000..f855b9c573
--- /dev/null
+++ b/thirdParty/cupla/alpaka/docs/source/dev/details.rst
@@ -0,0 +1,244 @@
+.. highlight:: cpp
+
+Details
+=======
+
+.. image:: /images/structure.png
+   :alt: Overview of the structure of the *alpaka* library with concepts and implementations.
+
+The full stack of concepts defined by the *alpaka* library and their inheritance hierarchy is shown in the third column of the preceding figure.
+Default implementations for those concepts can be seen in the blueish columns.
+The various accelerator implementations, shown in the lower half of the figure, only differ in some of their underlying concepts but can share most of the base implementations.
+The default implementations can, but do not have to be used at all.
+They can be replaced by user code in arbitrary granularity.
+By substituting, for instance, the atomic operation implementation of an accelerator, the execution can be fine-tuned, to better utilize the hardware instruction set of a specific processor.
+However, also complete accelerators, devices and all of the other concepts can be implemented by the user without the need to change any part of the *alpaka* library itself.
+The way this and other things are implemented is explained in the following paragraphs.
+
+Concept Implementations
+-----------------------
+
+The *alpaka* library has been implemented with extensibility in mind.
+This means that there are no predefined classes, modeling the concepts, the *alpaka* functions require as input parameters.
+They allow arbitrary types as parameters, as long as they model the required concept.
+
+C++ provides a language inherent object oriented abstraction allowing to check that parameters to a function comply with the concept they are required to model.
+By defining interface classes, which model the *alpaka* concepts, the user would be able to inherit his extension classes from the interfaces he wants to model and implement the abstract virtual methods the interfaces define.
+The *alpaka* functions in turn would use the corresponding interface types as their parameter types.
+For example, the ``Buffer`` concept requires methods for getting the pitch or changing the memory pinning state.
+With this intrusive object oriented design pattern the ``BufCpu`` or ``BufCudaRt`` classes would have to inherit from an ``IBuffer`` interface and implement the abstract methods it declares.
+An example of this basic pattern is shown in the following source snippet:
+
+.. code-block::
+
+   struct IBuffer
+   {
+     virtual std::size_t getPitch() const = 0;
+     virtual void pin() = 0;
+     virtual void unpin() = 0;
+     ...
+   };
+
+   struct BufCpu : public IBuffer
+   {
+     virtual std::size_t getPitch() const override { ... }
+     virtual void pin() override { ... }
+     virtual void unpin() override { ... }
+     ...
+   };
+
+   ALPAKA_FN_HOST auto copy(
+     IBuffer & dst,
+     IBuffer const & src)
+   -> void
+   {
+     ...
+   }
+
+The compiler can then check at compile time that the objects the user wants to use as function parameters can be implicitly cast to the interface type, which is the case for inherited base classes.
+The compiler returns an error message on a type mismatch.
+However, if the *alpaka* library were using those language inherent object oriented abstractions, the extensibility and optimizability it promises would not be possible.
+Classes and run-time polymorphism require the implementer of extensions to intrusively inherit from predefined interfaces and override special virtual functions.
+
+This is feasible for user defined classes or types where the source code is available and where it can be changed.
+The ``std::vector`` class template on the other hand would not be able to model the ``Buffer`` concept because we can not change its definition to inherit from the ``IBuffer`` interface class since it is part of the standard library.
+The standard inheritance based object orientation of C++ only works well when all the code it is to interoperate with can be changed to implement the interfaces.
+It does not enable interaction with unalterable or existing code that is too complex to change, which is the reality in the majority of software projects.
+
+Another option to implement an extensible library is to follow the way the C++ standard library uses.
+It allows to specialize function templates for user types to model concepts without altering the types themselves.
+For example, the ``std::begin`` and ``std::end`` free function templates can be specialized for user defined types.
+With those functions specialized, the C++11 range-based for loops (``for(auto & i : userContainer){...}``, see *C++ Standard 6.5.4/1*) can be used with user defined types.
+Equally, specializations of ``std::swap`` and other standard library function templates can be defined to extend those with support for user types.
+One problem with function specialization is that only full specializations are allowed.
+A partial function template specialization is not allowed by the standard.
+Another problem can emerge due to users carelessly overloading the template functions instead of specializing them.
+Mixing function overloading and function template specialization on the same base template function can result in unexpected results.
+The reasons and effects of this are described more closely in an article from H. Sutter (currently convener of the ISO C++ committee) called *Sutter's Mill: Why Not Specialize Function Templates?* in the *C/C++ Users Journal* in July 2001.
+
+.. seealso::
+   `different way <http://ericniebler.com/2014/10/21/customization-point-design-in-c11-and-beyond/>`_
+
+The solution given in the article is to provide *"a single function template that should never be specialized or overloaded"*.
+This function simply forwards its arguments *"to a class template containing a static function with the same signature"*.
+This template class can fully or partially be specialized without affecting overload resolution.
+
+The way the *alpaka* library implements this is by not using the C++ inherent object orientation but lifting those abstractions to a higher level.
+Instead of using a non-extensible ``class``/``struct`` and abstract virtual member functions for the interface, *alpaka* defines free functions.
+All those functions are templates allowing the user to call them with arbitrary self defined types and not only those inheriting from a special interface type.
+Unlike member functions, they have no implicit ``this`` pointer, so the object instance has to be explicitly given as a parameter.
+Overriding the abstract virtual interface methods is replaced by the specialization of a template type that is defined for each such function.
+
+A concept is completely implemented by specializing the predefined template types.
+This allows to extend and fine-tune the implementation non-intrusively.
+For example, the corresponding pitch and memory pinning template types can be specialized for ``std::vector``.
+After doing this, the ``std::vector`` can be used everywhere a buffer is accepted as argument throughout the whole *alpaka* library without ever touching its definition.
+
+A simple function allowing arbitrary tasks to be enqueued into a queue can be implemented in the way shown in the following code.
+The ``TSfinae`` template parameter will be explained in a `following section <#Template-Specialization-Selection-on-Arbitrary-Conditions>`_.
+
+.. code-block::
+
+   namespace alpaka
+   {
+     template<
+       typename TQueue,
+       typename TTask,
+       typename TSfinae = void>
+     struct Enqueue;
+
+     template<
+       typename TQueue,
+       typename TTask>
+     ALPAKA_FN_HOST auto enqueue(
+       TQueue & queue,
+       TTask & task)
+     -> void
+     {
+       Enqueue<
+         TQueue,
+         TTask>
+       ::enqueue(
+         queue,
+         task);
+     }
+   }
+
+A user who wants his queue type to be used with this ``enqueue`` function has to specialize the ``Enqueue`` template struct.
+This can be either done partially by only replacing the ``TQueue`` template parameter and accepting arbitrary tasks or by fully specializing and replacing both ``TQueue`` and ``TTask``. This gives the user complete freedom of choice.
+The example given in the following code shows this by specializing the ``Enqueue`` type for a user queue type ``UserQueue`` and arbitrary tasks.
+
+.. code-block::
+
+   struct UserQueue{};
+
+   namespace alpaka
+   {
+     // partial specialization
+     template<
+       typename TTask>
+     struct Enqueue<
+       UserQueue,
+       TTask>
+     {
+       ALPAKA_FN_HOST static auto enqueue(
+         UserQueue & queue,
+         TTask & task)
+       -> void
+       {
+         //...
+       }
+     };
+   }
+
+In addition the subsequent code shows a full specialization of the ``Enqueue`` type for a given ``UserQueue`` and a ``UserTask``.
+
+.. code-block::
+
+   struct UserQueue{};
+   struct UserTask{};
+
+   namespace alpaka
+   {
+     // full specialization
+     template<>
+     struct Enqueue<
+       UserQueue,
+       UserTask>
+     {
+       ALPAKA_FN_HOST static auto enqueue(
+         UserQueue & queue,
+         UserTask & task)
+       -> void
+       {
+         //...
+       }
+     };
+   }
+
+When the ``enqueue`` function template is called with an instance of ``UserQueue``, the most specialized version of the ``Enqueue`` template is selected depending on the type of the task ``TTask`` it is called with.
+
+A type can model the queue concept completely by defining specializations for ``alpaka::Enqueue`` and ``alpaka::Empty``.
+This functionality can be accessed by the corresponding ``alpaka::enqueue`` and ``alpaka::empty`` template functions.
+
+Currently there is no native language support for describing and checking concepts in C++ at compile time.
+A study group (SG8) is working on the ISO `specification for concepts <http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2015/n4377.pdf>`_ and compiler forks implementing them do exist.
+For usage in current C++ there are libraries like `Boost.ConceptCheck <https://www.boost.org/doc/libs/1_58_0/libs/concept_check/concept_check.htm>`_ which try to emulate requirement checking of concept types.
+Those libraries often exploit the preprocessor and require non-trivial changes to the function declaration syntax.
+Therefore the *alpaka* library does not currently make use of *Boost.ConceptCheck*.
+Neither does it make use of the proposed concept specification due to its dependency on non-standard compilers.
+
+The usage of concepts as described in the working draft would often dramatically enhance the compiler error messages in case of violation of concept requirements.
+Currently the error messages are pointing deeply inside the stack of library template invocations where the missing method or the like is called.
+Instead of this, with concept checking it would directly fail at the point of invocation of the outermost template function with an expressive error message about the parameter and its violation of the concept requirements.
+This would simplify especially the work with extendable template libraries like *Boost* or *alpaka*.
+However, in the way concept checking would be used in the *alpaka* library, omitting it does not change the semantic of the program, only the compile time error diagnostics.
+In the future when the standard incorporates concept checking and the major compilers support it, it will be added to the *alpaka* library.
+
+
+Template Specialization Selection on Arbitrary Conditions
+---------------------------------------------------------
+
+Basic template specialization only allows for a selection of the most specialized version where all explicitly stated types have to be matched identically.
+It is not possible to enable or disable a specialization based on arbitrary compile time expressions depending on the parameter types.
+To allow such conditions, *alpaka* adds a defaulted and unused ``TSfinae`` template parameter to all declarations of the implementation template structs.
+This was shown using the example of the ``Enqueue`` template type.
+The C++ technique called SFINAE, an acronym for *Substitution failure is not an error*, allows disabling arbitrary specializations depending on compile time conditions.
+Specializations where the substitution of the parameter types by the deduced types would result in invalid code will not result in a compile error, but will simply be omitted.
+An example in the context of the ``Enqueue`` template type is shown in the following code.
+
+.. code-block::
+
+   struct UserQueue{};
+
+   namespace alpaka
+   {
+     template<
+       typename TQueue,
+       typename TTask>
+     struct Enqueue<
+       TQueue,
+       TTask,
+       std::enable_if_t<
+         std::is_base_of<UserQueue, TQueue>::value
+         && (TTask::TaskId == 1u)
+       >>
+     {
+       ALPAKA_FN_HOST static auto enqueue(
+         TQueue & queue,
+         TTask & task)
+       -> void
+       {
+         //...
+       }
+     };
+   }
+
+The ``Enqueue`` specialization shown here does not require any direct type match for the ``TQueue`` or the ``TTask`` template parameter.
+It will be used in all contexts where ``TQueue`` has inherited from ``UserQueue`` and where the ``TTask`` has a static const integral member value ``TaskId`` that equals one.
+If the ``TTask`` type does not have a ``TaskId`` member, this code would be invalid and the substitution would fail.
+However, due to SFINAE, this would not result in a compiler error but rather only in omitting this specialization.
+The ``std::enable_if`` template results in a valid expression, if the condition it contains evaluates to true, and an invalid expression if it is false.
+Therefore it can be used to disable specializations depending on arbitrary boolean conditions.
+It is utilized in the case where the ``TaskId`` member is unequal to one or the ``TQueue`` does not inherit from ``UserQueue``.
+In these circumstances, the condition itself results in valid code but because it evaluates to false, the ``std::enable_if`` specialization results in invalid code and the whole ``Enqueue`` template specialization gets omitted.
diff --git a/thirdParty/cupla/alpaka/docs/source/dev/sphinx.rst b/thirdParty/cupla/alpaka/docs/source/dev/sphinx.rst
new file mode 100644
index 0000000000..139af45540
--- /dev/null
+++ b/thirdParty/cupla/alpaka/docs/source/dev/sphinx.rst
@@ -0,0 +1,105 @@
+Sphinx
+======
+
+.. sectionauthor:: Axel Huebl, alpaka-group
+
+In the following section we explain how to contribute to this documentation.
+
+If you are reading the `HTML version <https://alpaka.readthedocs.io>`_ and want to improve or correct existing pages, check the "*Edit on GitHub*" link on the right upper corner of each document.
+
+Alternatively, go to ``docs/source`` in our source code and follow the directory structure of `reStructuredText`_ (``.rst``) files there.
+For intrusive changes, like structural changes to chapters, please open an issue to discuss them beforehand.
+
+.. _reStructuredText: https://www.sphinx-doc.org/en/stable/rest.html
+
+Build Locally
+-------------
+
+This document is built using free open-source software, namely `Sphinx`_, `Doxygen`_ (C++ APIs as XML), `Breathe`_ (to include doxygen XML in Sphinx) and `rst2pdf`_ (render the cheat sheet).
+A web-version is hosted on `ReadTheDocs`_.
+
+.. _Sphinx: https://github.com/sphinx-doc/sphinx
+.. _Doxygen: http://doxygen.org
+.. _Breathe: https://github.com/michaeljones/breathe
+.. _rst2pdf: https://rst2pdf.org/
+.. _ReadTheDocs: https://readthedocs.org/
+
+The following requirements need to be installed (once) to build our documentation successfully:
+
+.. code-block:: bash
+
+    cd docs/
+
+    # doxygen is not shipped via pip, install it externally,
+    # from the homepage, your package manager, conda, etc.
+    # example:
+    sudo apt-get install doxygen
+    # sudo pacman -S doxygen
+
+    # python tools & style theme
+    pip install -r requirements.txt # --user
+
+
+With all documentation-related software successfully installed, just run the following commands to build your docs locally.
+Please check your documentation build is successful and renders as you expected before opening a pull request!
+
+.. code-block:: bash
+
+    # skip this if you are still in docs/
+    cd docs/
+
+    # parse the C++ API documentation (default: xml format)
+    doxygen Doxyfile
+
+    # render the cheatsheet.pdf
+    rst2pdf -s cheatsheet/cheatsheet.style source/basic/cheatsheet.rst -o cheatsheet/cheatsheet.pdf
+
+    # render the '.rst' files with sphinx
+    make html
+
+    # open it, e.g. with firefox :)
+    firefox build/html/index.html
+
+    # now again for the pdf :)
+    make latexpdf
+
+    # open it, e.g. with okular
+    okular build/latex/alpaka.pdf
+
+.. hint::
+
+   Run ``make clean`` to clean the build directory before executing the actual make. This is necessary to reflect changes outside the rst files.
+
+.. hint::
+
+   There is a checklinks target to check links in the rst files on availability:
+
+   .. code-block:: bash
+
+      # check existence of links
+      # cd docs/
+      make checklinks
+
+.. hint::
+
+   The Doxyfile for doxygen is configured to output in xml format per default.
+   Other targets can be configured in the Doxyfile. The final documentation is stored in ``docs/doxygen/``.
+
+   .. code-block:: bash
+
+      # run in docs/doxygen/
+      sed -i -E 's/(GENERATE_HTML\s*=\s*)NO/\1YES/g' Doxyfile
+
+readthedocs
+-----------
+
+To maintain or import a github project an account on `ReadTheDocs`_ is required.
+Further instructions can be found on `readthedocs on github <https://github.com/readthedocs/readthedocs.org>`_ and `readthedocs import guide <https://docs.readthedocs.io/en/stable/intro/import-guide.html>`_.
+
+Useful Links
+------------
+
+ * `A primer on writing reStructuredText files for sphinx <https://www.sphinx-doc.org/en/stable/rest.html>`_
+ * `Why You Shouldn't Use "Markdown" for Documentation <https://www.ericholscher.com/blog/2016/mar/15/dont-use-markdown-for-technical-docs/>`_
+ * `reStructuredText vs. Markdown <https://eli.thegreenplace.net/2017/restructuredtext-vs-markdown-for-technical-documentation/>`_
+ * `Markdown Limitations in Sphinx <https://docs.readthedocs.io/en/latest/intro/getting-started-with-sphinx.html#using-markdown-with-sphinx>`_
diff --git a/thirdParty/cupla/alpaka/docs/source/dev/style.rst b/thirdParty/cupla/alpaka/docs/source/dev/style.rst
new file mode 100644
index 0000000000..a19e641942
--- /dev/null
+++ b/thirdParty/cupla/alpaka/docs/source/dev/style.rst
@@ -0,0 +1,150 @@
+.. highlight:: cpp
+
+Coding Guidelines
+==================
+
+.. attention::
+   The Coding Guidelines are currently being revised.
+
+General
+-------
+
+* Use the ``.clang-format`` file supplied in alpaka's top-level directory to format your code. This will handle indentation,
+  whitespace and braces automatically. Usage:
+
+.. code-block:: bash
+
+  clang-format-11 -i <sourcefile>
+
+* If you want to format the entire code base execute the following command from alpaka's top-level directory:
+
+.. code-block:: bash
+
+  find example include test -name '*.hpp' -o -name '*.cpp' | xargs clang-format-11 -i
+
+Windows users should use `Visual Studio's native clang-format integration
+<https://devblogs.microsoft.com/cppblog/clangformat-support-in-visual-studio-2017-15-7-preview-1/>`_.
+
+Naming
+------
+
+* Types are always in PascalCase (KernelExecCuda, BufT, ...) and singular.
+* Variables are always in camelCase (memBufHost, ...) and plural for collections and singular else.
+* Namespaces are always in lowercase and singular is preferred.
+* There are no two consecutive upper case letters (AccOpenMp, HtmlRenderer, IoHandler, ...). This makes names more easily readable.
+
+
+Types
+-----
+
+* Always use integral types with known width (``int32_t``, ``uint64_t``, ...).
+  Never use ``int``, ``unsigned long``, etc.
+
+
+Type Qualifiers
+---------------
+
+The order of  type qualifiers should be:
+``Type const * const`` for a const pointer to a const Type.
+``Type const &`` for a reference to a const Type.
+
+The reason is that types can be read from right to left correctly without jumping back and forth.
+``const Type * const`` and ``const Type &`` would require jumping in either way to read them correctly.
+
+
+Variables
+---------
+
+* Variables should always be initialized on construction because uninitialized variables can produce hard to debug errors.
+  This can (nearly) always be done even in performance critical code without sacrificing speed by using a functional programming style.
+* Variables should (nearly) always be ``const`` to make the code more easy to understand.
+  This is equivalent to functional programming and the SSA (static single assignment) style used by LLVM.
+  This should have no speed implication as every half baked compiler analyses the usage of variables and reuses registers.
+* Variable definitions should be differentiated from assignments by using either ``(...)`` or ``{...}`` but never ``=`` for definitions.
+  Use ``uint32_t const iUsageOfThisVariable(42);`` instead of ``uint32_t const iUsageOfThisVariable = 42;``
+
+
+Comments
+--------
+
+* Always use C++-Style comments ``//``
+* For types use
+  ``//#############################################################################``
+  to start the comment block.
+* For functions use
+  ``//-----------------------------------------------------------------------------``
+  to start the comment block.
+
+
+Functions
+---------
+
+* Always use the trailing return type syntax with the return type on a new line even if the return type is void:
+
+.. code-block::
+
+   auto func()
+   -> bool
+
+* This makes it easier to see the return type because it is on its own line.
+* This leads to a consistent style for constructs where there is no alternative style (lambdas, functions templates with dependent return types) and standard functions.
+* Each function parameter is on a new indented line:
+
+.. code-block::
+
+   auto func(
+       float f1,
+       float f2)
+   -> bool
+   {
+       return true;
+   }
+
+.. code-block::
+
+   func(
+       1.0f,
+       2.0f);
+
+* Makes it easier to see how many parameters there are and which position they have.
+
+
+Templates
+---------
+
+* Template parameters are prefixed with ``T`` to differentiate them from class or function local typedefs.
+* Each template parameter is on a new indented line:
+
+.. code-block:: c++
+
+   template<
+       typename TParam,
+       typename... TArgs>
+   auto func()
+   -> bool
+
+* Makes it easier to see how many template parameters there are and which position they have.
+* Always use ``typename`` for template parameters. There is NO difference to class and typename matches the intent better.
+
+
+Traits
+------
+
+* Trait classes always have one more template parameter (with default parameter) than is required, for enabling SFINAE in the specialization:
+
+.. code-block::
+
+   template<
+       typename T,
+       typename TSfinae = void>
+   struct GetOffsets;
+
+* Template trait aliases always end with a ``T`` e.g. ``BufT`` while the corresponding trait ends with ``Type`` e.g. ``BufType``
+* Traits for implementations always have the same name as the accessor function but in PascalCase while the member function is camelCase again: ``sin(){...}`` and ``Sin{sin(){...}};``
+
+Includes
+--------
+
+* The order of includes is from the most specialized header to the most general one.
+  This order helps to find missing includes in more specialized headers because the general ones are always included afterwards.
+* A comment with the types or functions included by an include file makes it easier to find out why a special header is included.
diff --git a/thirdParty/alpaka/doc/markdown/user/abstraction/block/block.png b/thirdParty/cupla/alpaka/docs/source/images/block.png
similarity index 100%
rename from thirdParty/alpaka/doc/markdown/user/abstraction/block/block.png
rename to thirdParty/cupla/alpaka/docs/source/images/block.png
diff --git a/thirdParty/alpaka/doc/markdown/user/abstraction/block/block_scale.png b/thirdParty/cupla/alpaka/docs/source/images/block_scale.png
similarity index 100%
rename from thirdParty/alpaka/doc/markdown/user/abstraction/block/block_scale.png
rename to thirdParty/cupla/alpaka/docs/source/images/block_scale.png
diff --git a/thirdParty/alpaka/doc/markdown/user/abstraction/element/element.png b/thirdParty/cupla/alpaka/docs/source/images/element.png
similarity index 100%
rename from thirdParty/alpaka/doc/markdown/user/abstraction/element/element.png
rename to thirdParty/cupla/alpaka/docs/source/images/element.png
diff --git a/thirdParty/alpaka/doc/markdown/user/implementation/library/execution_domain.png b/thirdParty/cupla/alpaka/docs/source/images/execution_domain.png
similarity index 100%
rename from thirdParty/alpaka/doc/markdown/user/implementation/library/execution_domain.png
rename to thirdParty/cupla/alpaka/docs/source/images/execution_domain.png
diff --git a/thirdParty/alpaka/doc/markdown/user/implementation/library/execution_domain.svg b/thirdParty/cupla/alpaka/docs/source/images/execution_domain.svg
similarity index 100%
rename from thirdParty/alpaka/doc/markdown/user/implementation/library/execution_domain.svg
rename to thirdParty/cupla/alpaka/docs/source/images/execution_domain.svg
diff --git a/thirdParty/alpaka/doc/markdown/user/implementation/library/structure.png b/thirdParty/cupla/alpaka/docs/source/images/structure.png
similarity index 100%
rename from thirdParty/alpaka/doc/markdown/user/implementation/library/structure.png
rename to thirdParty/cupla/alpaka/docs/source/images/structure.png
diff --git a/thirdParty/alpaka/doc/markdown/user/implementation/library/structure.svg b/thirdParty/cupla/alpaka/docs/source/images/structure.svg
similarity index 100%
rename from thirdParty/alpaka/doc/markdown/user/implementation/library/structure.svg
rename to thirdParty/cupla/alpaka/docs/source/images/structure.svg
diff --git a/thirdParty/alpaka/doc/markdown/user/implementation/library/structure_assoc.png b/thirdParty/cupla/alpaka/docs/source/images/structure_assoc.png
similarity index 100%
rename from thirdParty/alpaka/doc/markdown/user/implementation/library/structure_assoc.png
rename to thirdParty/cupla/alpaka/docs/source/images/structure_assoc.png
diff --git a/thirdParty/alpaka/doc/markdown/user/implementation/library/structure_assoc.svg b/thirdParty/cupla/alpaka/docs/source/images/structure_assoc.svg
similarity index 100%
rename from thirdParty/alpaka/doc/markdown/user/implementation/library/structure_assoc.svg
rename to thirdParty/cupla/alpaka/docs/source/images/structure_assoc.svg
diff --git a/thirdParty/alpaka/doc/markdown/user/abstraction/thread/thread.png b/thirdParty/cupla/alpaka/docs/source/images/thread.png
similarity index 100%
rename from thirdParty/alpaka/doc/markdown/user/abstraction/thread/thread.png
rename to thirdParty/cupla/alpaka/docs/source/images/thread.png
diff --git a/thirdParty/alpaka/doc/markdown/user/abstraction/warp/warp.png b/thirdParty/cupla/alpaka/docs/source/images/warp.png
similarity index 100%
rename from thirdParty/alpaka/doc/markdown/user/abstraction/warp/warp.png
rename to thirdParty/cupla/alpaka/docs/source/images/warp.png
diff --git a/thirdParty/alpaka/doc/markdown/user/implementation/mapping/x86/x86_cpu.png b/thirdParty/cupla/alpaka/docs/source/images/x86_cpu.png
similarity index 100%
rename from thirdParty/alpaka/doc/markdown/user/implementation/mapping/x86/x86_cpu.png
rename to thirdParty/cupla/alpaka/docs/source/images/x86_cpu.png
diff --git a/thirdParty/alpaka/doc/markdown/user/implementation/mapping/x86/x86_cpu_mapping.png b/thirdParty/cupla/alpaka/docs/source/images/x86_cpu_mapping.png
similarity index 100%
rename from thirdParty/alpaka/doc/markdown/user/implementation/mapping/x86/x86_cpu_mapping.png
rename to thirdParty/cupla/alpaka/docs/source/images/x86_cpu_mapping.png
diff --git a/thirdParty/cupla/alpaka/docs/source/index.rst b/thirdParty/cupla/alpaka/docs/source/index.rst
new file mode 100644
index 0000000000..ced931fcd5
--- /dev/null
+++ b/thirdParty/cupla/alpaka/docs/source/index.rst
@@ -0,0 +1,74 @@
+:orphan:
+
+.. only:: html
+
+  .. image:: ../logo/alpaka.svg
+
+.. only:: latex
+
+  .. image:: ../logo/alpaka.pdf
+
+*alpaka - An Abstraction Library for Parallel Kernel Acceleration*
+
+The alpaka library is a header-only C++14 abstraction library for accelerator development. Its aim is to provide performance portability across accelerators through the abstraction (not hiding!) of the underlying levels of parallelism.
+
+.. CAUTION::
+   The readthedocs pages are work in progress and contain outdated sections.
+
+alpaka - How to Read This Document
+----------------------------------
+
+Generally, **follow the manual pages in-order** to get started.
+Individual chapters are based on the information of the chapters before.
+
+.. only:: html
+
+   The online version of this document is **versioned** and shows by default the manual of the last *stable* version of alpaka.
+   If you are looking for the latest *development* version, `click here <https://alpaka.readthedocs.io/en/latest/>`_.
+
+.. note::
+
+   Are you looking for our latest Doxygen docs for the API?
+
+   - See https://alpaka-group.github.io/alpaka/
+
+
+.. toctree::
+   :caption: Basic
+   :maxdepth: 1
+
+   basic/intro.rst
+   basic/install.rst
+   basic/abstraction.rst
+   basic/library.rst
+   basic/cheatsheet.rst
+
+.. toctree::
+   :caption: Advanced
+   :maxdepth: 1
+
+   advanced/rationale.rst
+   advanced/mapping.rst
+   advanced/cmake.rst
+
+.. toctree::
+   :caption: Extra Info
+   :maxdepth: 1
+
+   info/similar_projects.rst
+
+.. toctree::
+   :caption: Development
+   :maxdepth: 1
+
+   dev/backends.rst
+   dev/details.rst
+   dev/style
+   dev/sphinx
+   API Reference <https://alpaka-group.github.io/alpaka>
+
+Indices and Tables
+==================
+
+* :ref:`genindex`
+* :ref:`search`
diff --git a/thirdParty/cupla/alpaka/docs/source/info/similar_projects.rst b/thirdParty/cupla/alpaka/docs/source/info/similar_projects.rst
new file mode 100644
index 0000000000..4ba94d5708
--- /dev/null
+++ b/thirdParty/cupla/alpaka/docs/source/info/similar_projects.rst
@@ -0,0 +1,45 @@
+Similar Projects
+================
+
+`KOKKOS <https://github.com/kokkos>`_
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. seealso::
+   * https://www.xsede.org/documents/271087/586927/Edwards-2013-XSCALE13-Kokkos.pdf
+   * https://trilinos.org/oldsite/events/trilinos_user_group_2013/presentations/2013-11-TUG-Kokkos-Tutorial.pdf
+   * https://on-demand.gputechconf.com/supercomputing/2013/presentation/SC3103\_Towards-Performance-Portable-Applications-Kokkos.pdf
+   * https://dx.doi.org/10.3233/SPR-2012-0343
+
+Kokkos provides an abstract interface for portable, performant shared memory-programming.
+It is a C++ library that offers ``parallel_for``, ``parallel_reduce`` and similar functions
+for describing the pattern of the parallel tasks. The execution policy determines how the
+threads are executed. For example, this influences the sizes of blocks of threads or if
+static or dynamic scheduling should be used. The library abstracts the kernel as a function
+object that can not have any user defined parameters for its ``operator()``. Arguments have
+to be stored in members of the function object coupling algorithm and data together. *KOKKOS*
+provides both, abstractions for parallel execution of code and data management.
+Multidimensional arrays with a neutral indexing and an architecture dependent layout are
+available, which can be used, for example, to abstract the underlying hardware's preferred
+memory access scheme that could be row-major, column-major or even blocked.
+
+
+`Thrust <https://thrust.github.io/>`_
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Thrust is a parallel algorithms library resembling the C++ Standard Template Library (STL).
+It allows to select either the *CUDA*, *TBB* or *OpenMP* back-end at make-time. Because it is
+based on generic ``host_vector`` and ``device_vector`` container objects, it is tightly coupling
+the data structure and the parallelization strategy. There exist many similar libraries such
+as `ArrayFire <https://arrayfire.com/>`_ (*CUDA*, *OpenCL*, native C++),
+`VexCL <https://github.com/ddemidov/vexcl/>`_ (*OpenCL*, *CUDA*),
+`ViennaCL <http://viennacl.sourceforge.net/>`_ (*OpenCL*, *CUDA*, *OpenMP*) and
+`hemi <https://github.com/harrism/hemi/>`_ (*CUDA*, native C++).
+
+.. seealso::
+   * Phalanx
+     See `here <https://www.mgarland.org/files/papers/phalanx-sc12-preprint.pdf>`_
+     It is very similar to *alpaka* in the way it abstracts the accelerators.
+     C++ Interface provides CUDA, OpenMP, and GASNet back-ends
+   * Aura
+   * Intel TBB
+   * U\PC++
diff --git a/thirdParty/cupla/alpaka/example/CMakeLists.txt b/thirdParty/cupla/alpaka/example/CMakeLists.txt
index c9d1e3a68a..4929c3d5ba 100644
--- a/thirdParty/cupla/alpaka/example/CMakeLists.txt
+++ b/thirdParty/cupla/alpaka/example/CMakeLists.txt
@@ -1,7 +1,7 @@
 #
-# Copyright 2015-2019 Benjamin Worpitz
+# Copyright 2015-2020 Benjamin Worpitz, Jan Stephan
 #
-# This file exemplifies usage of Alpaka.
+# This file exemplifies usage of alpaka.
 #
 # Permission to use, copy, modify, and/or distribute this software for any
 # purpose with or without fee is hereby granted, provided that the above
@@ -20,16 +20,19 @@
 # Required CMake version.
 ################################################################################
 
-CMAKE_MINIMUM_REQUIRED(VERSION 3.11.0)
+cmake_minimum_required(VERSION 3.15)
 
-PROJECT("alpakaExamples")
+project("alpakaExamples")
 
 ################################################################################
 # Add subdirectories.
 ################################################################################
 
-ADD_SUBDIRECTORY("bufferCopy/")
-ADD_SUBDIRECTORY("helloWorld/")
-ADD_SUBDIRECTORY("helloWorldLambda/")
-ADD_SUBDIRECTORY("reduce/")
-ADD_SUBDIRECTORY("vectorAdd/")
+add_subdirectory("bufferCopy/")
+add_subdirectory("heatEquation/")
+add_subdirectory("helloWorld/")
+add_subdirectory("helloWorldLambda/")
+add_subdirectory("monteCarloIntegration/")
+add_subdirectory("openMPSchedule/")
+add_subdirectory("reduce/")
+add_subdirectory("vectorAdd/")
diff --git a/thirdParty/cupla/alpaka/example/bufferCopy/CMakeLists.txt b/thirdParty/cupla/alpaka/example/bufferCopy/CMakeLists.txt
index cbeebcbde5..b22eceaff2 100644
--- a/thirdParty/cupla/alpaka/example/bufferCopy/CMakeLists.txt
+++ b/thirdParty/cupla/alpaka/example/bufferCopy/CMakeLists.txt
@@ -1,7 +1,7 @@
 #
-# Copyright 2014-2019 Erik Zenker, Benjamin Worpitz
+# Copyright 2014-2020 Erik Zenker, Benjamin Worpitz, Jan Stephan
 #
-# This file exemplifies usage of Alpaka.
+# This file exemplifies usage of alpaka.
 #
 # Permission to use, copy, modify, and/or distribute this software for any
 # purpose with or without fee is hereby granted, provided that the above
@@ -19,44 +19,42 @@
 ################################################################################
 # Required CMake version.
 
-CMAKE_MINIMUM_REQUIRED(VERSION 3.11.4)
+cmake_minimum_required(VERSION 3.15)
 
-SET_PROPERTY(GLOBAL PROPERTY USE_FOLDERS ON)
+set_property(GLOBAL PROPERTY USE_FOLDERS ON)
 
 ################################################################################
 # Project.
 
-SET(_TARGET_NAME bufferCopy)
+set(_TARGET_NAME bufferCopy)
 
-PROJECT(${_TARGET_NAME})
-
-################################################################################
-# CMake policies
-#
-# Search in <PackageName>_ROOT:
-#   https://cmake.org/cmake/help/v3.12/policy/CMP0074.html
-
-if(POLICY CMP0074)
-    cmake_policy(SET CMP0074 NEW)
-endif()
+project(${_TARGET_NAME})
 
 #-------------------------------------------------------------------------------
 # Find alpaka.
 
-SET(ALPAKA_ROOT "${CMAKE_CURRENT_LIST_DIR}/../../" CACHE STRING "The location of the alpaka library")
-LIST(APPEND CMAKE_MODULE_PATH "${ALPAKA_ROOT}")
-FIND_PACKAGE(alpaka REQUIRED)
+if(NOT TARGET alpaka::alpaka)
+    option(USE_ALPAKA_SOURCE_TREE "Use alpaka's source tree instead of an alpaka installation" OFF)
+
+    if(USE_ALPAKA_SOURCE_TREE)
+        # Don't build the examples recursively
+        set(alpaka_BUILD_EXAMPLES OFF)
+        add_subdirectory("${CMAKE_CURRENT_LIST_DIR}/../.." "${CMAKE_BINARY_DIR}/alpaka")
+    else()
+        find_package(alpaka REQUIRED)
+    endif()
+endif()
 
 #-------------------------------------------------------------------------------
 # Add executable.
 
-ALPAKA_ADD_EXECUTABLE(
+alpaka_add_executable(
     ${_TARGET_NAME}
     src/bufferCopy.cpp)
-TARGET_LINK_LIBRARIES(
+target_link_libraries(
     ${_TARGET_NAME}
-    PUBLIC alpaka)
+    PUBLIC alpaka::alpaka)
 
-SET_TARGET_PROPERTIES(${_TARGET_NAME} PROPERTIES FOLDER example)
+set_target_properties(${_TARGET_NAME} PROPERTIES FOLDER example)
 
-ADD_TEST(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME})
+add_test(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME})
diff --git a/thirdParty/cupla/alpaka/example/bufferCopy/src/bufferCopy.cpp b/thirdParty/cupla/alpaka/example/bufferCopy/src/bufferCopy.cpp
index 585b46ae28..d424393b6e 100644
--- a/thirdParty/cupla/alpaka/example/bufferCopy/src/bufferCopy.cpp
+++ b/thirdParty/cupla/alpaka/example/bufferCopy/src/bufferCopy.cpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Alexander Matthes, Benjamin Worpitz, Erik Zenker, Matthias Werner
  *
- * This file exemplifies usage of Alpaka.
+ * This file exemplifies usage of alpaka.
  *
  * Permission to use, copy, modify, and/or distribute this software for any
  * purpose with or without fee is hereby granted, provided that the above
@@ -16,12 +16,13 @@
  */
 
 #include <alpaka/alpaka.hpp>
+#include <alpaka/example/ExampleDefaultAcc.hpp>
 
-#include <iostream>
 #include <cstdint>
+#include <iostream>
 
 //-----------------------------------------------------------------------------
-template <size_t width>
+template<size_t width>
 ALPAKA_FN_ACC size_t linIdxToPitchedIdx(size_t const globalIdx, size_t const pitch)
 {
     const size_t idx_x = globalIdx % width;
@@ -34,28 +35,22 @@ ALPAKA_FN_ACC size_t linIdxToPitchedIdx(size_t const globalIdx, size_t const pit
 struct PrintBufferKernel
 {
     //-----------------------------------------------------------------------------
-    template<
-        typename TAcc,
-        typename TData,
-        typename TExtent>
+    template<typename TAcc, typename TData, typename TExtent>
     ALPAKA_FN_ACC auto operator()(
-        TAcc const & acc,
-        TData const * const buffer,
-        TExtent const & extents,
-        size_t const pitch) const
-    -> void
+        TAcc const& acc,
+        TData const* const buffer,
+        TExtent const& extents,
+        size_t const pitch) const -> void
     {
-        auto const globalThreadIdx = alpaka::idx::getIdx<alpaka::Grid, alpaka::Threads>(acc);
-        auto const globalThreadExtent = alpaka::workdiv::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc);
+        auto const globalThreadIdx = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc);
+        auto const globalThreadExtent = alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc);
 
-        auto const linearizedGlobalThreadIdx = alpaka::idx::mapIdx<1u>(
-            globalThreadIdx,
-            globalThreadExtent);
+        auto const linearizedGlobalThreadIdx = alpaka::mapIdx<1u>(globalThreadIdx, globalThreadExtent);
 
         for(size_t i(linearizedGlobalThreadIdx[0]); i < extents.prod(); i += globalThreadExtent.prod())
         {
             // NOTE: hard-coded for unsigned int
-            printf("%u:%u ", static_cast<uint32_t>(i), static_cast<uint32_t>(buffer[linIdxToPitchedIdx<2>(i,pitch)]));
+            printf("%u:%u ", static_cast<uint32_t>(i), static_cast<uint32_t>(buffer[linIdxToPitchedIdx<2>(i, pitch)]));
         }
     }
 };
@@ -66,35 +61,29 @@ struct PrintBufferKernel
 struct TestBufferKernel
 {
     //-----------------------------------------------------------------------------
-    template<
-        typename TAcc,
-        typename TData,
-        typename TExtent>
+    template<typename TAcc, typename TData, typename TExtent>
     ALPAKA_FN_ACC auto operator()(
-        TAcc const & acc,
-        TData const * const
+        TAcc const& acc,
+        TData const* const
 #ifndef NDEBUG
-        data
+            data
 #endif
         ,
-        TExtent const & extents,
+        TExtent const& extents,
         size_t const
 #ifndef NDEBUG
-        pitch
+            pitch
 #endif
-        ) const
-    -> void
+    ) const -> void
     {
-        auto const globalThreadIdx = alpaka::idx::getIdx<alpaka::Grid, alpaka::Threads>(acc);
-        auto const globalThreadExtent = alpaka::workdiv::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc);
+        auto const globalThreadIdx = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc);
+        auto const globalThreadExtent = alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc);
 
-        auto const linearizedGlobalThreadIdx = alpaka::idx::mapIdx<1u>(
-            globalThreadIdx,
-            globalThreadExtent);
+        auto const linearizedGlobalThreadIdx = alpaka::mapIdx<1u>(globalThreadIdx, globalThreadExtent);
 
         for(size_t i(linearizedGlobalThreadIdx[0]); i < extents.prod(); i += globalThreadExtent.prod())
         {
-            ALPAKA_ASSERT(data[linIdxToPitchedIdx<2>(i,pitch)] == i);
+            ALPAKA_ASSERT(data[linIdxToPitchedIdx<2>(i, pitch)] == i);
         }
     }
 };
@@ -103,22 +92,13 @@ struct TestBufferKernel
 //! Fills values of buffer with increasing elements starting from 0
 struct FillBufferKernel
 {
-    template<
-        typename TAcc,
-        typename TData,
-        typename TExtent>
-    ALPAKA_FN_ACC auto operator()(
-        TAcc const & acc,
-        TData * const data,
-        TExtent const & extents) const
-    -> void
+    template<typename TAcc, typename TData, typename TExtent>
+    ALPAKA_FN_ACC auto operator()(TAcc const& acc, TData* const data, TExtent const& extents) const -> void
     {
-        auto const globalThreadIdx = alpaka::idx::getIdx<alpaka::Grid, alpaka::Threads>(acc);
-        auto const globalThreadExtent = alpaka::workdiv::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc);
+        auto const globalThreadIdx = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc);
+        auto const globalThreadExtent = alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc);
 
-        auto const linearizedGlobalThreadIdx = alpaka::idx::mapIdx<1u>(
-            globalThreadIdx,
-            globalThreadExtent);
+        auto const linearizedGlobalThreadIdx = alpaka::mapIdx<1u>(globalThreadIdx, globalThreadExtent);
 
         for(size_t i(linearizedGlobalThreadIdx[0]); i < extents.prod(); i += globalThreadExtent.prod())
         {
@@ -127,80 +107,78 @@ struct FillBufferKernel
     }
 };
 
-auto main()
--> int
+auto main() -> int
 {
 // Fallback for the CI with disabled sequential backend
 #if defined(ALPAKA_CI) && !defined(ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED)
     return EXIT_SUCCESS;
 #else
     // Define the index domain
-    using Dim = alpaka::dim::DimInt<3u>;
+    using Dim = alpaka::DimInt<3u>;
     using Idx = std::size_t;
 
     // Define the device accelerator
     //
-    // It is possible to choose from a set of accelerators
-    // that are defined in the alpaka::acc namespace e.g.:
+    // It is possible to choose from a set of accelerators:
     // - AccGpuCudaRt
+    // - AccGpuHipRt
     // - AccCpuThreads
     // - AccCpuFibers
     // - AccCpuOmp2Threads
     // - AccCpuOmp2Blocks
-    // - AccCpuOmp4
+    // - AccOmp5
+    // - AccCpuTbbBlocks
     // - AccCpuSerial
-    using Acc = alpaka::acc::AccCpuSerial<Dim, Idx>;
+    // using Acc = alpaka::AccCpuSerial<Dim, Idx>;
+    using Acc = alpaka::ExampleDefaultAcc<Dim, Idx>;
+    std::cout << "Using alpaka accelerator: " << alpaka::getAccName<Acc>() << std::endl;
     // Defines the synchronization behavior of a queue
     //
     // choose between Blocking and NonBlocking
-    using AccQueueProperty = alpaka::queue::Blocking;
-    using DevQueue = alpaka::queue::Queue<Acc, AccQueueProperty>;
-    using DevAcc = alpaka::dev::Dev<Acc>;
-    using PltfAcc = alpaka::pltf::Pltf<DevAcc>;
+    using AccQueueProperty = alpaka::Blocking;
+    using DevQueue = alpaka::Queue<Acc, AccQueueProperty>;
 
     // Define the device accelerator
     //
-    // It is possible to choose from a set of accelerators
-    // that are defined in the alpaka::acc namespace e.g.:
+    // It is possible to choose from a set of accelerators:
     // - AccCpuThreads
     // - AccCpuFibers
     // - AccCpuOmp2Threads
     // - AccCpuOmp2Blocks
-    // - AccCpuOmp4
+    // - AccOmp5
     // - AccCpuSerial
-    using Host = alpaka::acc::AccCpuSerial<Dim, Idx>;
+    using Host = alpaka::AccCpuSerial<Dim, Idx>;
     // Defines the synchronization behavior of a queue
     //
     // choose between Blocking and NonBlocking
-    using HostQueueProperty = alpaka::queue::Blocking;
-    using HostQueue = alpaka::queue::Queue<Host, HostQueueProperty>;
-    using DevHost = alpaka::dev::Dev<Host>;
-    using PltfHost = alpaka::pltf::Pltf<DevHost>;
+    using HostQueueProperty = alpaka::Blocking;
+    using HostQueue = alpaka::Queue<Host, HostQueueProperty>;
 
     // Select devices
-    DevAcc const devAcc(alpaka::pltf::getDevByIdx<PltfAcc>(0u));
-    DevHost const devHost(alpaka::pltf::getDevByIdx<PltfHost>(0u));
+    auto const devAcc = alpaka::getDevByIdx<Acc>(0u);
+    auto const devHost = alpaka::getDevByIdx<Host>(0u);
 
     // Create queues
     DevQueue devQueue(devAcc);
     HostQueue hostQueue(devHost);
 
-    // Define the work division
-    using Vec = alpaka::vec::Vec<Dim, Idx>;
+    // Define the work division for kernels to be run on devAcc and devHost
+    using Vec = alpaka::Vec<Dim, Idx>;
     Vec const elementsPerThread(Vec::all(static_cast<Idx>(1)));
-    Vec const threadsPerBlock(Vec::all(static_cast<Idx>(1)));
-
-    Vec const blocksPerGrid(
-        static_cast<Idx>(4),
-        static_cast<Idx>(8),
-        static_cast<Idx>(16));
-
-    using WorkDiv = alpaka::workdiv::WorkDivMembers<Dim, Idx>;
-    WorkDiv const workdiv(
-        blocksPerGrid,
-        threadsPerBlock,
-        elementsPerThread);
-
+    Vec const threadsPerGrid(Vec::all(static_cast<Idx>(10)));
+    using WorkDiv = alpaka::WorkDivMembers<Dim, Idx>;
+    WorkDiv const devWorkDiv = alpaka::getValidWorkDiv<Acc>(
+        devAcc,
+        threadsPerGrid,
+        elementsPerThread,
+        false,
+        alpaka::GridBlockExtentSubDivRestrictions::Unrestricted);
+    WorkDiv const hostWorkDiv = alpaka::getValidWorkDiv<Host>(
+        devHost,
+        threadsPerGrid,
+        elementsPerThread,
+        false,
+        alpaka::GridBlockExtentSubDivRestrictions::Unrestricted);
 
     // Create host and device buffers
     //
@@ -219,21 +197,21 @@ auto main()
     //
     // The `alloc` method returns a reference counted buffer handle.
     // When the last such handle is destroyed, the memory is freed automatically.
-    using BufHost = alpaka::mem::buf::Buf<DevHost, Data, Dim, Idx>;
-    BufHost hostBuffer(alpaka::mem::buf::alloc<Data, Idx>(devHost, extents));
+    using BufHost = alpaka::Buf<Host, Data, Dim, Idx>;
+    BufHost hostBuffer(alpaka::allocBuf<Data, Idx>(devHost, extents));
     // You can also use already allocated memory and wrap it within a view (irrespective of the device type).
     // The view does not own the underlying memory. So you have to make sure that
     // the view does not outlive its underlying memory.
     std::array<Data, nElementsPerDim * nElementsPerDim * nElementsPerDim> plainBuffer;
-    using ViewHost = alpaka::mem::view::ViewPlainPtr<DevHost, Data, Dim, Idx>;
+    using ViewHost = alpaka::ViewPlainPtr<Host, Data, Dim, Idx>;
     ViewHost hostViewPlainPtr(plainBuffer.data(), devHost, extents);
 
     // Allocate accelerator memory buffers
     //
     // The interface to allocate a buffer is the same on the host and on the device.
-    using BufAcc = alpaka::mem::buf::Buf<DevAcc, Data, Dim, Idx>;
-    BufAcc deviceBuffer1(alpaka::mem::buf::alloc<Data, Idx>(devAcc, extents));
-    BufAcc deviceBuffer2(alpaka::mem::buf::alloc<Data, Idx>(devAcc, extents));
+    using BufAcc = alpaka::Buf<Acc, Data, Dim, Idx>;
+    BufAcc deviceBuffer1(alpaka::allocBuf<Data, Idx>(devAcc, extents));
+    BufAcc deviceBuffer2(alpaka::allocBuf<Data, Idx>(devAcc, extents));
 
 
     // Init host buffer
@@ -242,7 +220,7 @@ auto main()
     // elements of a buffer directly, but
     // you can get the pointer to the memory
     // (getPtrNative).
-    Data * const pHostBuffer = alpaka::mem::view::getPtrNative(hostBuffer);
+    Data* const pHostBuffer = alpaka::getPtrNative(hostBuffer);
 
     // This pointer can be used to directly write
     // some values into the buffer memory.
@@ -256,16 +234,16 @@ auto main()
     // Memory views and buffers can also be initialized by executing a kernel.
     // To pass a buffer into a kernel, you can pass the
     // native pointer into the kernel invocation.
-    Data * const pHostViewPlainPtr = alpaka::mem::view::getPtrNative(hostViewPlainPtr);
+    Data* const pHostViewPlainPtr = alpaka::getPtrNative(hostViewPlainPtr);
 
     FillBufferKernel fillBufferKernel;
 
-    alpaka::kernel::exec<Host>(
+    alpaka::exec<Host>(
         hostQueue,
-        workdiv,
+        hostWorkDiv,
         fillBufferKernel,
         pHostViewPlainPtr, // 1st kernel argument
-        extents);          // 2nd kernel argument
+        extents); // 2nd kernel argument
 
 
     // Copy host to device Buffer
@@ -279,38 +257,42 @@ auto main()
     // not currently supported.
     // In this example both host buffers are copied
     // into device buffers.
-    alpaka::mem::view::copy(devQueue, deviceBuffer1, hostViewPlainPtr, extents);
-    alpaka::mem::view::copy(devQueue, deviceBuffer2, hostBuffer, extents);
-
-    Idx const deviceBuffer1Pitch(alpaka::mem::view::getPitchBytes<2u>(deviceBuffer1) / sizeof(Data));
-    Idx const deviceBuffer2Pitch(alpaka::mem::view::getPitchBytes<2u>(deviceBuffer2) / sizeof(Data));
-    Idx const hostBuffer1Pitch(alpaka::mem::view::getPitchBytes<2u>(hostBuffer) / sizeof(Data));
-    Idx const hostViewPlainPtrPitch(alpaka::mem::view::getPitchBytes<2u>(hostViewPlainPtr) / sizeof(Data));
+    alpaka::memcpy(devQueue, deviceBuffer1, hostViewPlainPtr, extents);
+    alpaka::memcpy(devQueue, deviceBuffer2, hostBuffer, extents);
+
+    // Depending on the accelerator, the allocation function may introduce
+    // padding between rows/planes of multidimensional memory allocations.
+    // Therefore the pitch (distance between consecutive rows/planes) may be
+    // greater than the space required for the data.
+    Idx const deviceBuffer1Pitch(alpaka::getPitchBytes<2u>(deviceBuffer1) / sizeof(Data));
+    Idx const deviceBuffer2Pitch(alpaka::getPitchBytes<2u>(deviceBuffer2) / sizeof(Data));
+    Idx const hostBuffer1Pitch(alpaka::getPitchBytes<2u>(hostBuffer) / sizeof(Data));
+    Idx const hostViewPlainPtrPitch(alpaka::getPitchBytes<2u>(hostViewPlainPtr) / sizeof(Data));
 
     // Test device Buffer
     //
     // This kernel tests if the copy operations
     // were successful. In the case something
     // went wrong an assert will fail.
-    Data const * const pDeviceBuffer1 = alpaka::mem::view::getPtrNative(deviceBuffer1);
-    Data const * const pDeviceBuffer2 = alpaka::mem::view::getPtrNative(deviceBuffer2);
+    Data const* const pDeviceBuffer1 = alpaka::getPtrNative(deviceBuffer1);
+    Data const* const pDeviceBuffer2 = alpaka::getPtrNative(deviceBuffer2);
 
     TestBufferKernel testBufferKernel;
-    alpaka::kernel::exec<Acc>(
+    alpaka::exec<Acc>(
         devQueue,
-        workdiv,
+        devWorkDiv,
         testBufferKernel,
-        pDeviceBuffer1,                                 // 1st kernel argument
-        extents,                                        // 2nd kernel argument
-        deviceBuffer1Pitch);                            // 3rd kernel argument
+        pDeviceBuffer1, // 1st kernel argument
+        extents, // 2nd kernel argument
+        deviceBuffer1Pitch); // 3rd kernel argument
 
-    alpaka::kernel::exec<Acc>(
+    alpaka::exec<Acc>(
         devQueue,
-        workdiv,
+        devWorkDiv,
         testBufferKernel,
-        pDeviceBuffer2,                                 // 1st kernel argument
-        extents,                                        // 2nd kernel argument
-        deviceBuffer2Pitch);                            // 3rd kernel argument
+        pDeviceBuffer2, // 1st kernel argument
+        extents, // 2nd kernel argument
+        deviceBuffer2Pitch); // 3rd kernel argument
 
 
     // Print device Buffer
@@ -324,44 +306,44 @@ auto main()
     // completely distorted.
 
     PrintBufferKernel printBufferKernel;
-    alpaka::kernel::exec<Acc>(
+    alpaka::exec<Acc>(
         devQueue,
-        workdiv,
+        devWorkDiv,
         printBufferKernel,
-        pDeviceBuffer1,                                 // 1st kernel argument
-        extents,                                        // 2nd kernel argument
-        deviceBuffer1Pitch);                            // 3rd kernel argument
-    alpaka::wait::wait(devQueue);
+        pDeviceBuffer1, // 1st kernel argument
+        extents, // 2nd kernel argument
+        deviceBuffer1Pitch); // 3rd kernel argument
+    alpaka::wait(devQueue);
     std::cout << std::endl;
 
-    alpaka::kernel::exec<Acc>(
+    alpaka::exec<Acc>(
         devQueue,
-        workdiv,
+        devWorkDiv,
         printBufferKernel,
-        pDeviceBuffer2,                                 // 1st kernel argument
-        extents,                                        // 2nd kernel argument
-        deviceBuffer2Pitch);                            // 3rd kernel argument
-    alpaka::wait::wait(devQueue);
+        pDeviceBuffer2, // 1st kernel argument
+        extents, // 2nd kernel argument
+        deviceBuffer2Pitch); // 3rd kernel argument
+    alpaka::wait(devQueue);
     std::cout << std::endl;
 
-    alpaka::kernel::exec<Host>(
+    alpaka::exec<Host>(
         hostQueue,
-        workdiv,
+        hostWorkDiv,
         printBufferKernel,
-        pHostBuffer,                                    // 1st kernel argument
-        extents,                                        // 2nd kernel argument
-        hostBuffer1Pitch);                              // 3rd kernel argument
-    alpaka::wait::wait(hostQueue);
+        pHostBuffer, // 1st kernel argument
+        extents, // 2nd kernel argument
+        hostBuffer1Pitch); // 3rd kernel argument
+    alpaka::wait(hostQueue);
     std::cout << std::endl;
 
-    alpaka::kernel::exec<Host>(
+    alpaka::exec<Host>(
         hostQueue,
-        workdiv,
+        hostWorkDiv,
         printBufferKernel,
-        pHostViewPlainPtr,                              // 1st kernel argument
-        extents,                                        // 2nd kernel argument
-        hostViewPlainPtrPitch);                         // 3rd kernel argument
-    alpaka::wait::wait(hostQueue);
+        pHostViewPlainPtr, // 1st kernel argument
+        extents, // 2nd kernel argument
+        hostViewPlainPtrPitch); // 3rd kernel argument
+    alpaka::wait(hostQueue);
     std::cout << std::endl;
 
     return EXIT_SUCCESS;
diff --git a/thirdParty/cupla/alpaka/example/heatEquation/CMakeLists.txt b/thirdParty/cupla/alpaka/example/heatEquation/CMakeLists.txt
new file mode 100644
index 0000000000..525bb3810a
--- /dev/null
+++ b/thirdParty/cupla/alpaka/example/heatEquation/CMakeLists.txt
@@ -0,0 +1,61 @@
+#
+# Copyright 2014-2020 Benjamin Worpitz, Jan Stephan
+#
+# This file exemplifies usage of alpaka.
+#
+# Permission to use, copy, modify, and/or distribute this software for any
+# purpose with or without fee is hereby granted, provided that the above
+# copyright notice and this permission notice appear in all copies.
+#
+# THE SOFTWARE IS PROVIDED “AS IS” AND ISC DISCLAIMS ALL WARRANTIES WITH
+# REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL ISC BE LIABLE FOR ANY
+# SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR
+# IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+#
+
+################################################################################
+# Required CMake version and global IDE folder support.
+
+cmake_minimum_required(VERSION 3.15)
+
+set_property(GLOBAL PROPERTY USE_FOLDERS ON)
+
+################################################################################
+# Project (named after the single example target).
+
+set(_TARGET_NAME heatEquation)
+
+project(${_TARGET_NAME})
+
+
+#-------------------------------------------------------------------------------
+# Find alpaka, unless the alpaka::alpaka target has already been defined.
+
+if(NOT TARGET alpaka::alpaka)
+    option(USE_ALPAKA_SOURCE_TREE "Use alpaka's source tree instead of an alpaka installation" OFF)
+
+    if(USE_ALPAKA_SOURCE_TREE)
+        # Don't build the examples recursively when adding alpaka from ../..
+        set(alpaka_BUILD_EXAMPLES OFF)
+        add_subdirectory("${CMAKE_CURRENT_LIST_DIR}/../.." "${CMAKE_BINARY_DIR}/alpaka")
+    else()
+        find_package(alpaka REQUIRED)
+    endif()
+endif()
+
+#-------------------------------------------------------------------------------
+# Add the executable, link it against alpaka and register it as a test.
+
+alpaka_add_executable(
+    ${_TARGET_NAME}
+    src/heatEquation.cpp)
+target_link_libraries(
+    ${_TARGET_NAME}
+    PUBLIC alpaka::alpaka)
+
+set_target_properties(${_TARGET_NAME} PROPERTIES FOLDER example)
+
+add_test(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME})
diff --git a/thirdParty/cupla/alpaka/example/heatEquation/src/heatEquation.cpp b/thirdParty/cupla/alpaka/example/heatEquation/src/heatEquation.cpp
new file mode 100644
index 0000000000..cc3ba3432a
--- /dev/null
+++ b/thirdParty/cupla/alpaka/example/heatEquation/src/heatEquation.cpp
@@ -0,0 +1,204 @@
+/* Copyright 2020 Benjamin Worpitz, Matthias Werner, Jakob Krude,
+ *                Sergei Bastrakov
+ *
+ * This file exemplifies usage of alpaka.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED “AS IS” AND ISC DISCLAIMS ALL WARRANTIES WITH
+ * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL ISC BE LIABLE FOR ANY
+ * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR
+ * IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <alpaka/alpaka.hpp>
+#include <alpaka/example/ExampleDefaultAcc.hpp>
+
+#include <algorithm>
+#include <cmath>
+#include <iostream>
+#include <utility>
+
+
+//#############################################################################
+//! alpaka version of explicit finite-difference 1d heat equation solver
+//!
+//! Solving equation u_t(x, t) = u_xx(x, t) using a simple explicit scheme with
+//! forward difference in t and second-order central difference in x.
+//! Only interior nodes are written; the boundary nodes keep their values.
+//!
+//! \param acc accelerator providing the global thread index
+//! \param uCurrBuf grid values of u for each x at t = t_current
+//! \param uNextBuf resulting grid values of u for each x at t = t_current + dt
+//! \param extent number of grid nodes in x (eq. to numNodesX)
+//! \param dx step in x
+//! \param dt step in t
+
+struct HeatEquationKernel
+{
+    template<typename TAcc>
+    ALPAKA_FN_ACC auto operator()(
+        TAcc const& acc,
+        double const* const uCurrBuf,
+        double* const uNextBuf,
+        uint32_t const extent,
+        double const dx,
+        double const dt) const -> void
+    {
+        // Each thread updates one interior node; idx keeps the accelerator's index type (no narrowing to int)
+        double const r = dt / (dx * dx);
+        auto const idx = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0];
+        if(idx > 0 && idx + 1u < extent) // not "idx < extent - 1u": that bound wraps around for extent == 0
+        {
+            uNextBuf[idx] = uCurrBuf[idx] * (1.0 - 2.0 * r) + uCurrBuf[idx - 1] * r + uCurrBuf[idx + 1] * r;
+        }
+    }
+};
+
+
+//! Exact solution to the test problem
+//! u_t(x, t) = u_xx(x, t), x in [0, 1], t in [0, T],
+//! u(0, t) = u(1, t) = 0, u(x, 0) = sin(pi * x)
+//!
+//! \param x value of x
+//! \param t value of t
+//! \return exp(-pi^2 * t) * sin(pi * x)
+double exactSolution(double const x, double const t)
+{
+    constexpr double pi = 3.14159265358979323846;
+    return std::exp(-pi * pi * t) * std::sin(pi * x);
+}
+
+
+//! Each kernel thread computes the next step for one point.
+//! Therefore the number of threads should be equal to numNodesX.
+//! The kernel is launched once per time step, i.e. numTimeSteps - 1 times.
+//! After every step the roles of the curr- and next-buffer are swapped, so
+//! the newest values are always the ones pCurrAcc points to.
+auto main() -> int
+{
+#if defined(ALPAKA_CI) && !defined(ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED)
+    return EXIT_SUCCESS;
+#else
+    // Parameters (a user is supposed to change numNodesX, numTimeSteps)
+    uint32_t const numNodesX = 1000;
+    uint32_t const numTimeSteps = 10000;
+    double const tMax = 0.001;
+    // x in [0, 1], t in [0, tMax]
+    double const dx = 1.0 / static_cast<double>(numNodesX - 1);
+    double const dt = tMax / static_cast<double>(numTimeSteps - 1);
+
+    // Check the stability condition
+    double const r = dt / (dx * dx);
+    if(r > 0.5)
+    {
+        std::cerr << "Stability condition check failed: dt/dx^2 = " << r << ", it is required to be <= 0.5\n";
+        return EXIT_FAILURE;
+    }
+
+    // Set Dim and Idx type
+    using Dim = alpaka::DimInt<1u>;
+    using Idx = uint32_t;
+
+    // Select accelerator-types for host and device
+    // using Acc = alpaka::AccCpuSerial<Dim, Idx>;
+    using Acc = alpaka::ExampleDefaultAcc<Dim, Idx>;
+    std::cout << "Using alpaka accelerator: " << alpaka::getAccName<Acc>() << std::endl;
+
+    using DevHost = alpaka::DevCpu;
+
+    // Select specific devices
+    auto const devAcc = alpaka::getDevByIdx<Acc>(0u);
+    auto const devHost = alpaka::getDevByIdx<DevHost>(0u);
+
+    // Get valid workdiv for the given problem
+    uint32_t elemPerThread = 1;
+    alpaka::Vec<Dim, Idx> const extent{numNodesX};
+    using WorkDiv = alpaka::WorkDivMembers<Dim, Idx>;
+    auto workdiv = WorkDiv{alpaka::getValidWorkDiv<Acc>(
+        devAcc,
+        extent,
+        elemPerThread,
+        false,
+        alpaka::GridBlockExtentSubDivRestrictions::Unrestricted)};
+
+    // Select queue
+    using QueueProperty = alpaka::Blocking;
+    using QueueAcc = alpaka::Queue<Acc, QueueProperty>;
+    QueueAcc queue{devAcc};
+
+    // Initialize host-buffer
+    using BufHost = alpaka::Buf<DevHost, double, Dim, Idx>;
+    // This buffer holds the calculated values
+    auto uNextBufHost = BufHost{alpaka::allocBuf<double, Idx>(devHost, extent)};
+    // This buffer will hold the current values (used for the next step)
+    auto uCurrBufHost = BufHost{alpaka::allocBuf<double, Idx>(devHost, extent)};
+
+    double* const pCurrHost = alpaka::getPtrNative(uCurrBufHost);
+    double* const pNextHost = alpaka::getPtrNative(uNextBufHost);
+
+    // Accelerator buffer
+    using BufAcc = alpaka::Buf<Acc, double, Dim, Idx>;
+    auto uNextBufAcc = BufAcc{alpaka::allocBuf<double, Idx>(devAcc, extent)};
+    auto uCurrBufAcc = BufAcc{alpaka::allocBuf<double, Idx>(devAcc, extent)};
+
+    double* pCurrAcc = alpaka::getPtrNative(uCurrBufAcc);
+    double* pNextAcc = alpaka::getPtrNative(uNextBufAcc);
+
+    // Apply initial conditions for the test problem
+    for(uint32_t i = 0; i < numNodesX; i++)
+    {
+        pCurrHost[i] = exactSolution(i * dx, 0.0);
+    }
+
+    HeatEquationKernel kernel;
+
+    // Copy host -> device
+    alpaka::memcpy(queue, uCurrBufAcc, uCurrBufHost, extent);
+    // Copy to the buffer for next as well to have boundary values set
+    alpaka::memcpy(queue, uNextBufAcc, uCurrBufAcc, extent);
+    alpaka::wait(queue);
+
+    // numTimeSteps time levels including t = 0 require numTimeSteps - 1 updates of size dt
+    for(uint32_t step = 1; step < numTimeSteps; step++)
+    {
+        // Compute next values
+        alpaka::exec<Acc>(queue, workdiv, kernel, pCurrAcc, pNextAcc, numNodesX, dx, dt);
+
+        // The boundary values are constant, so only the pointers are swapped (shallow copy)
+        std::swap(pCurrAcc, pNextAcc);
+    }
+
+    // Copy device -> host from the buffer pCurrAcc refers to after the last swap (newest values)
+    auto& uLatestBufAcc = (pCurrAcc == alpaka::getPtrNative(uCurrBufAcc)) ? uCurrBufAcc : uNextBufAcc;
+    alpaka::memcpy(queue, uNextBufHost, uLatestBufAcc, extent);
+    alpaka::wait(queue);
+
+    // Calculate error
+    double maxError = 0.0;
+    for(uint32_t i = 0; i < numNodesX; i++)
+    {
+        auto const error = std::abs(pNextHost[i] - exactSolution(i * dx, tMax));
+        maxError = std::max(maxError, error);
+    }
+
+    double const errorThreshold = 1e-5;
+    bool resultCorrect = (maxError < errorThreshold);
+    if(resultCorrect)
+    {
+        std::cout << "Execution results correct!" << std::endl;
+        return EXIT_SUCCESS;
+    }
+    else
+    {
+        std::cout << "Execution results incorrect: error = " << maxError << " (the grid resolution may be too low)"
+                  << std::endl;
+        return EXIT_FAILURE;
+    }
+#endif
+}
diff --git a/thirdParty/cupla/alpaka/example/helloWorld/CMakeLists.txt b/thirdParty/cupla/alpaka/example/helloWorld/CMakeLists.txt
index b18da1282f..1456701fce 100644
--- a/thirdParty/cupla/alpaka/example/helloWorld/CMakeLists.txt
+++ b/thirdParty/cupla/alpaka/example/helloWorld/CMakeLists.txt
@@ -1,7 +1,7 @@
 #
-# Copyright 2014-2019 Erik Zenker, Benjamin Worpitz
+# Copyright 2014-2020 Erik Zenker, Benjamin Worpitz, Jan Stephan
 #
-# This file exemplifies usage of Alpaka.
+# This file exemplifies usage of alpaka.
 #
 # Permission to use, copy, modify, and/or distribute this software for any
 # purpose with or without fee is hereby granted, provided that the above
@@ -19,44 +19,42 @@
 ################################################################################
 # Required CMake version.
 
-CMAKE_MINIMUM_REQUIRED(VERSION 3.11.4)
+cmake_minimum_required(VERSION 3.15)
 
-SET_PROPERTY(GLOBAL PROPERTY USE_FOLDERS ON)
+set_property(GLOBAL PROPERTY USE_FOLDERS ON)
 
 ################################################################################
 # Project.
 
-SET(_TARGET_NAME helloWorld)
+set(_TARGET_NAME helloWorld)
 
-PROJECT(${_TARGET_NAME})
-
-################################################################################
-# CMake policies
-#
-# Search in <PackageName>_ROOT:
-#   https://cmake.org/cmake/help/v3.12/policy/CMP0074.html
-
-if(POLICY CMP0074)
-    cmake_policy(SET CMP0074 NEW)
-endif()
+project(${_TARGET_NAME})
 
 #-------------------------------------------------------------------------------
 # Find alpaka.
 
-SET(ALPAKA_ROOT "${CMAKE_CURRENT_LIST_DIR}/../../" CACHE STRING "The location of the alpaka library")
-LIST(APPEND CMAKE_MODULE_PATH "${ALPAKA_ROOT}")
-FIND_PACKAGE(alpaka REQUIRED)
+if(NOT TARGET alpaka::alpaka)
+    option(USE_ALPAKA_SOURCE_TREE "Use alpaka's source tree instead of an alpaka installation" OFF)
+
+    if(USE_ALPAKA_SOURCE_TREE)
+        # Don't build the examples recursively
+        set(alpaka_BUILD_EXAMPLES OFF)
+        add_subdirectory("${CMAKE_CURRENT_LIST_DIR}/../.." "${CMAKE_BINARY_DIR}/alpaka")
+    else()
+        find_package(alpaka REQUIRED)
+    endif()
+endif()
 
 #-------------------------------------------------------------------------------
 # Add executable.
 
-ALPAKA_ADD_EXECUTABLE(
+alpaka_add_executable(
     ${_TARGET_NAME}
     src/helloWorld.cpp)
-TARGET_LINK_LIBRARIES(
+target_link_libraries(
     ${_TARGET_NAME}
-    PUBLIC alpaka)
+    PUBLIC alpaka::alpaka)
 
-SET_TARGET_PROPERTIES(${_TARGET_NAME} PROPERTIES FOLDER example)
+set_target_properties(${_TARGET_NAME} PROPERTIES FOLDER example)
 
-ADD_TEST(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME})
+add_test(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME})
diff --git a/thirdParty/cupla/alpaka/example/helloWorld/src/helloWorld.cpp b/thirdParty/cupla/alpaka/example/helloWorld/src/helloWorld.cpp
index 5d92a8f890..b36b7b31e7 100644
--- a/thirdParty/cupla/alpaka/example/helloWorld/src/helloWorld.cpp
+++ b/thirdParty/cupla/alpaka/example/helloWorld/src/helloWorld.cpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Benjamin Worpitz, Erik Zenker
  *
- * This file exemplifies usage of Alpaka.
+ * This file exemplifies usage of alpaka.
  *
  * Permission to use, copy, modify, and/or distribute this software for any
  * purpose with or without fee is hereby granted, provided that the above
@@ -16,6 +16,7 @@
  */
 
 #include <alpaka/alpaka.hpp>
+#include <alpaka/example/ExampleDefaultAcc.hpp>
 
 #include <iostream>
 
@@ -26,31 +27,26 @@
 struct HelloWorldKernel
 {
     //-----------------------------------------------------------------------------
-    template<
-        typename TAcc>
-    ALPAKA_FN_ACC auto operator()(
-        TAcc const & acc) const
-    -> void
+    template<typename TAcc>
+    ALPAKA_FN_ACC auto operator()(TAcc const& acc) const -> void
     {
-        using Dim = alpaka::dim::Dim<TAcc>;
-        using Idx = alpaka::idx::Idx<TAcc>;
-        using Vec = alpaka::vec::Vec<Dim, Idx>;
-        using Vec1 = alpaka::vec::Vec<alpaka::dim::DimInt<1u>, Idx>;
+        using Dim = alpaka::Dim<TAcc>;
+        using Idx = alpaka::Idx<TAcc>;
+        using Vec = alpaka::Vec<Dim, Idx>;
+        using Vec1 = alpaka::Vec<alpaka::DimInt<1u>, Idx>;
 
         // In the most cases the parallel work distibution depends
         // on the current index of a thread and how many threads
         // exist overall. These information can be obtained by
         // getIdx() and getWorkDiv(). In this example these
         // values are obtained for a global scope.
-        Vec const globalThreadIdx = alpaka::idx::getIdx<alpaka::Grid, alpaka::Threads>(acc);
-        Vec const globalThreadExtent = alpaka::workdiv::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc);
+        Vec const globalThreadIdx = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc);
+        Vec const globalThreadExtent = alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc);
 
         // Map the three dimensional thread index into a
         // one dimensional thread index space. We call it
         // linearize the thread index.
-        Vec1 const linearizedGlobalThreadIdx = alpaka::idx::mapIdx<1u>(
-            globalThreadIdx,
-            globalThreadExtent);
+        Vec1 const linearizedGlobalThreadIdx = alpaka::mapIdx<1u>(globalThreadIdx, globalThreadExtent);
 
         // Each thread prints a hello world to the terminal
         // together with the global index of the thread in
@@ -66,8 +62,7 @@ struct HelloWorldKernel
     }
 };
 
-auto main()
--> int
+auto main() -> int
 {
 // Fallback for the CI with disabled sequential backend
 #if defined(ALPAKA_CI) && !defined(ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED)
@@ -79,19 +74,20 @@ auto main()
     // the dimensionality as well as the type used for indices.
     // For small index domains 16 or 32 bit indices may be enough
     // and may be faster to calculate depending on the accelerator.
-    using Dim = alpaka::dim::DimInt<3>;
+    using Dim = alpaka::DimInt<3>;
     using Idx = std::size_t;
 
     // Define the accelerator
     //
-    // It is possible to choose from a set of accelerators
-    // that are defined in the alpaka::acc namespace e.g.:
+    // It is possible to choose from a set of accelerators:
     // - AccGpuCudaRt
+    // - AccGpuHipRt
     // - AccCpuThreads
     // - AccCpuFibers
     // - AccCpuOmp2Threads
     // - AccCpuOmp2Blocks
-    // - AccCpuOmp4
+    // - AccOmp5
+    // - AccCpuTbbBlocks
     // - AccCpuSerial
     //
     // Each accelerator has strengths and weaknesses. Therefore,
@@ -101,16 +97,15 @@ auto main()
     // automatically.
 
     // By exchanging the Acc and Queue types you can select where to execute the kernel.
-    using Acc = alpaka::acc::AccCpuSerial<Dim, Idx>;
+    // using Acc = alpaka::AccCpuSerial<Dim, Idx>;
+    using Acc = alpaka::ExampleDefaultAcc<Dim, Idx>;
+    std::cout << "Using alpaka accelerator: " << alpaka::getAccName<Acc>() << std::endl;
 
     // Defines the synchronization behavior of a queue
     //
     // choose between Blocking and NonBlocking
-    using QueueProperty = alpaka::queue::Blocking;
-    using Queue = alpaka::queue::Queue<Acc, QueueProperty>;
-    using Dev = alpaka::dev::Dev<Acc>;
-    using Pltf = alpaka::pltf::Pltf<Dev>;
-
+    using QueueProperty = alpaka::Blocking;
+    using Queue = alpaka::Queue<Acc, QueueProperty>;
 
     // Select a device
     //
@@ -120,7 +115,7 @@ auto main()
     // by id (0 to the number of devices minus 1) or you
     // can also retrieve all devices in a vector (getDevs()).
     // In this example the first devices is choosen.
-    Dev const devAcc(alpaka::pltf::getDevByIdx<Pltf>(0u));
+    auto const devAcc = alpaka::getDevByIdx<Acc>(0u);
 
     // Create a queue on the device
     //
@@ -162,20 +157,16 @@ auto main()
     // memory. Elements are supposed to be used for vectorization.
     // Thus, a thread can process data element size wise with its
     // vector processing unit.
-    using Vec = alpaka::vec::Vec<Dim, Idx>;
+    using Vec = alpaka::Vec<Dim, Idx>;
     Vec const elementsPerThread(Vec::all(static_cast<Idx>(1)));
-    Vec const threadsPerBlock(Vec::all(static_cast<Idx>(1)));
-    Vec const blocksPerGrid(
-        static_cast<Idx>(4),
-        static_cast<Idx>(8),
-        static_cast<Idx>(16));
-
-    using WorkDiv = alpaka::workdiv::WorkDivMembers<Dim, Idx>;
-    WorkDiv const workDiv(
-        blocksPerGrid,
-        threadsPerBlock,
-        elementsPerThread);
-
+    Vec const threadsPerGrid(Vec::all(static_cast<Idx>(8)));
+    using WorkDiv = alpaka::WorkDivMembers<Dim, Idx>;
+    WorkDiv const workDiv = alpaka::getValidWorkDiv<Acc>(
+        devAcc,
+        threadsPerGrid,
+        elementsPerThread,
+        false,
+        alpaka::GridBlockExtentSubDivRestrictions::Unrestricted);
 
     // Instantiate the kernel function object
     //
@@ -193,12 +184,12 @@ auto main()
     // The queue can be blocking or non-blocking
     // depending on the choosen queue type (see type definitions above).
     // Here it is synchronous which means that the kernel is directly executed.
-    alpaka::kernel::exec<Acc>(
+    alpaka::exec<Acc>(
         queue,
         workDiv,
         helloWorldKernel
         /* put kernel arguments here */);
-    alpaka::wait::wait(queue);
+    alpaka::wait(queue);
 
     return EXIT_SUCCESS;
 #endif
diff --git a/thirdParty/cupla/alpaka/example/helloWorldLambda/CMakeLists.txt b/thirdParty/cupla/alpaka/example/helloWorldLambda/CMakeLists.txt
index 9c5727f17f..e262e792e3 100644
--- a/thirdParty/cupla/alpaka/example/helloWorldLambda/CMakeLists.txt
+++ b/thirdParty/cupla/alpaka/example/helloWorldLambda/CMakeLists.txt
@@ -1,7 +1,7 @@
 #
-# Copyright 2014-2019 Erik Zenker, Benjamin Worpitz
+# Copyright 2014-2020 Erik Zenker, Benjamin Worpitz, Jan Stephan
 #
-# This file exemplifies usage of Alpaka.
+# This file exemplifies usage of alpaka.
 #
 # Permission to use, copy, modify, and/or distribute this software for any
 # purpose with or without fee is hereby granted, provided that the above
@@ -19,44 +19,42 @@
 ################################################################################
 # Required CMake version.
 
-CMAKE_MINIMUM_REQUIRED(VERSION 3.11.4)
+cmake_minimum_required(VERSION 3.15)
 
-SET_PROPERTY(GLOBAL PROPERTY USE_FOLDERS ON)
+set_property(GLOBAL PROPERTY USE_FOLDERS ON)
 
 ################################################################################
 # Project.
 
-SET(_TARGET_NAME helloWorldLambda)
+set(_TARGET_NAME helloWorldLambda)
 
-PROJECT(${_TARGET_NAME})
-
-################################################################################
-# CMake policies
-#
-# Search in <PackageName>_ROOT:
-#   https://cmake.org/cmake/help/v3.12/policy/CMP0074.html
-
-if(POLICY CMP0074)
-    cmake_policy(SET CMP0074 NEW)
-endif()
+project(${_TARGET_NAME})
 
 #-------------------------------------------------------------------------------
 # Find alpaka.
 
-SET(ALPAKA_ROOT "${CMAKE_CURRENT_LIST_DIR}/../../" CACHE STRING "The location of the alpaka library")
-LIST(APPEND CMAKE_MODULE_PATH "${ALPAKA_ROOT}")
-FIND_PACKAGE(alpaka REQUIRED)
+if(NOT TARGET alpaka::alpaka)
+    option(USE_ALPAKA_SOURCE_TREE "Use alpaka's source tree instead of an alpaka installation" OFF)
+
+    if(USE_ALPAKA_SOURCE_TREE)
+        # Don't build the examples recursively
+        set(alpaka_BUILD_EXAMPLES OFF)
+        add_subdirectory("${CMAKE_CURRENT_LIST_DIR}/../.." "${CMAKE_BINARY_DIR}/alpaka")
+    else()
+        find_package(alpaka REQUIRED)
+    endif()
+endif()
 
 #-------------------------------------------------------------------------------
 # Add executable.
 
-ALPAKA_ADD_EXECUTABLE(
+alpaka_add_executable(
     ${_TARGET_NAME}
     src/helloWorldLambda.cpp)
-TARGET_LINK_LIBRARIES(
+target_link_libraries(
     ${_TARGET_NAME}
-    PUBLIC alpaka)
+    PUBLIC alpaka::alpaka)
 
-SET_TARGET_PROPERTIES(${_TARGET_NAME} PROPERTIES FOLDER example)
+set_target_properties(${_TARGET_NAME} PROPERTIES FOLDER example)
 
-ADD_TEST(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME})
+add_test(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME})
diff --git a/thirdParty/cupla/alpaka/example/helloWorldLambda/src/helloWorldLambda.cpp b/thirdParty/cupla/alpaka/example/helloWorldLambda/src/helloWorldLambda.cpp
index 69a4d980fc..011143dbe5 100644
--- a/thirdParty/cupla/alpaka/example/helloWorldLambda/src/helloWorldLambda.cpp
+++ b/thirdParty/cupla/alpaka/example/helloWorldLambda/src/helloWorldLambda.cpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Benjamin Worpitz, Erik Zenker
  *
- * This file exemplifies usage of Alpaka.
+ * This file exemplifies usage of alpaka.
  *
  * Permission to use, copy, modify, and/or distribute this software for any
  * purpose with or without fee is hereby granted, provided that the above
@@ -16,6 +16,7 @@
  */
 
 #include <alpaka/alpaka.hpp>
+#include <alpaka/example/ExampleDefaultAcc.hpp>
 
 #include <functional>
 
@@ -27,96 +28,90 @@
 //! and might be useful when it is necessary
 //! to lift an existing function into a kernel
 //! function.
-template<
-    typename TAcc>
-void ALPAKA_FN_ACC hiWorldFunction(
-    TAcc const & acc,
-    size_t const nExclamationMarks)
+template<typename TAcc>
+void ALPAKA_FN_ACC hiWorldFunction(TAcc const& acc, size_t const nExclamationMarks)
 {
-    using Dim = alpaka::dim::Dim<TAcc>;
-    using Idx = alpaka::idx::Idx<TAcc>;
-    using Vec = alpaka::vec::Vec<Dim, Idx>;
-    using Vec1 = alpaka::vec::Vec<alpaka::dim::DimInt<1u>, Idx>;
-
-    Vec const globalThreadIdx    = alpaka::idx::getIdx<alpaka::Grid, alpaka::Threads>(acc);
-    Vec const globalThreadExtent = alpaka::workdiv::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc);
-    Vec1 const linearizedGlobalThreadIdx = alpaka::idx::mapIdx<1u>(globalThreadIdx,
-                                                              globalThreadExtent);
-
-    printf("[z:%u, y:%u, x:%u][linear:%u] Hi world from a function",
-           static_cast<unsigned>(globalThreadIdx[0]),
-           static_cast<unsigned>(globalThreadIdx[1]),
-           static_cast<unsigned>(globalThreadIdx[2]),
-           static_cast<unsigned>(linearizedGlobalThreadIdx[0]));
-
-    for(size_t i = 0; i < nExclamationMarks; ++i){
+    using Dim = alpaka::Dim<TAcc>;
+    using Idx = alpaka::Idx<TAcc>;
+    using Vec = alpaka::Vec<Dim, Idx>;
+    using Vec1 = alpaka::Vec<alpaka::DimInt<1u>, Idx>;
+
+    Vec const globalThreadIdx = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc);
+    Vec const globalThreadExtent = alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc);
+    Vec1 const linearizedGlobalThreadIdx = alpaka::mapIdx<1u>(globalThreadIdx, globalThreadExtent);
+
+    printf(
+        "[z:%u, y:%u, x:%u][linear:%u] Hi world from a function",
+        static_cast<unsigned>(globalThreadIdx[0]),
+        static_cast<unsigned>(globalThreadIdx[1]),
+        static_cast<unsigned>(globalThreadIdx[2]),
+        static_cast<unsigned>(linearizedGlobalThreadIdx[0]));
+
+    for(size_t i = 0; i < nExclamationMarks; ++i)
+    {
         printf("!");
     }
 
     printf("\n");
 }
 
-auto main()
--> int
+auto main() -> int
 {
 // It requires support for extended lambdas when using nvcc as CUDA compiler.
 // Requires sequential backend if CI is used
-#if (!defined(__NVCC__) || (defined(__NVCC__) && defined(__CUDACC_EXTENDED_LAMBDA__) )) && \
-    (!defined(ALPAKA_CI) || defined(ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED))
+#if(!defined(__NVCC__) || (defined(__NVCC__) && defined(__CUDACC_EXTENDED_LAMBDA__)))                                 \
+    && (!defined(ALPAKA_CI) || defined(ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED))
 
     // Define the index domain
-    using Dim = alpaka::dim::DimInt<3>;
+    using Dim = alpaka::DimInt<3>;
     using Idx = std::size_t;
 
     // Define the accelerator
     //
-    // It is possible to choose from a set of accelerators
-    // that are defined in the alpaka::acc namespace e.g.:
+    // It is possible to choose from a set of accelerators:
     // - AccGpuCudaRt
+    // - AccGpuHipRt
     // - AccCpuThreads
     // - AccCpuFibers
     // - AccCpuOmp2Threads
     // - AccCpuOmp2Blocks
-    // - AccCpuOmp4
+    // - AccOmp5
+    // - AccCpuTbbBlocks
     // - AccCpuSerial
-    using Acc = alpaka::acc::AccCpuSerial<Dim, Idx>;
+    // using Acc = alpaka::AccCpuSerial<Dim, Idx>;
+    using Acc = alpaka::ExampleDefaultAcc<Dim, Idx>;
+    std::cout << "Using alpaka accelerator: " << alpaka::getAccName<Acc>() << std::endl;
 
     // Defines the synchronization behavior of a queue
     //
     // choose between Blocking and NonBlocking
-    using QueueProperty = alpaka::queue::Blocking;
-    using Queue = alpaka::queue::Queue<Acc, QueueProperty>;
-    using Dev = alpaka::dev::Dev<Acc>;
-    using Pltf = alpaka::pltf::Pltf<Dev>;
+    using QueueProperty = alpaka::Blocking;
+    using Queue = alpaka::Queue<Acc, QueueProperty>;
 
     // Select a device
-    Dev const devAcc(alpaka::pltf::getDevByIdx<Pltf>(0u));
+    auto const devAcc = alpaka::getDevByIdx<Acc>(0u);
 
     // Create a queue on the device
     Queue queue(devAcc);
 
     // Define the work division
-    using Vec = alpaka::vec::Vec<Dim, Idx>;
+    using Vec = alpaka::Vec<Dim, Idx>;
     Vec const elementsPerThread(Vec::all(static_cast<Idx>(1)));
-    Vec const threadsPerBlock(Vec::all(static_cast<Idx>(1)));
-    Vec const blocksPerGrid(
-        static_cast<Idx>(1),
-        static_cast<Idx>(2),
-        static_cast<Idx>(4));
-
-    using WorkDiv = alpaka::workdiv::WorkDivMembers<Dim, Idx>;
-    WorkDiv const workDiv(
-        blocksPerGrid,
-        threadsPerBlock,
-        elementsPerThread);
+    Vec const threadsPerGrid(Vec::all(static_cast<Idx>(8)));
+    using WorkDiv = alpaka::WorkDivMembers<Dim, Idx>;
+    WorkDiv const workDiv = alpaka::getValidWorkDiv<Acc>(
+        devAcc,
+        threadsPerGrid,
+        elementsPerThread,
+        false,
+        alpaka::GridBlockExtentSubDivRestrictions::Unrestricted);
 
     const size_t nExclamationMarks = 10;
 
     // Run "Hello World" kernel with a lambda function
     //
-    // Alpaka is able to execute lambda functions (anonymous functions) which
-    // are available since the C++11 standard.
-    // Alpaka forces the lambda function to accept
+    // alpaka is able to execute lambda functions (anonymous functions).
+    // alpaka forces the lambda function to accept
     // the utilized accelerator as first argument.
     // All following arguments can be provided after
     // the lambda function declaration or be captured.
@@ -124,30 +119,30 @@ auto main()
     // This example passes the number exclamation marks, that should
     // be written after we greet the world, to the
     // lambda function.
-    alpaka::kernel::exec<Acc>(
+    alpaka::exec<Acc>(
         queue,
         workDiv,
-        [] ALPAKA_FN_ACC (Acc const & acc, size_t const nExclamationMarksAsArg) -> void {
-            auto globalThreadIdx    = alpaka::idx::getIdx<alpaka::Grid, alpaka::Threads>(acc);
-            auto globalThreadExtent = alpaka::workdiv::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc);
-            auto linearizedGlobalThreadIdx = alpaka::idx::mapIdx<1u>(globalThreadIdx, globalThreadExtent);
-
-            printf("[z:%u, y:%u, x:%u][linear:%u] Hello world from a lambda",
-               static_cast<unsigned>(globalThreadIdx[0]),
-               static_cast<unsigned>(globalThreadIdx[1]),
-               static_cast<unsigned>(globalThreadIdx[2]),
-               static_cast<unsigned>(linearizedGlobalThreadIdx[0]));
-
-            for(size_t i = 0; i < nExclamationMarksAsArg; ++i){
+        [] ALPAKA_FN_ACC(Acc const& acc, size_t const nExclamationMarksAsArg) -> void {
+            auto globalThreadIdx = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc);
+            auto globalThreadExtent = alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc);
+            auto linearizedGlobalThreadIdx = alpaka::mapIdx<1u>(globalThreadIdx, globalThreadExtent);
+
+            printf(
+                "[z:%u, y:%u, x:%u][linear:%u] Hello world from a lambda",
+                static_cast<unsigned>(globalThreadIdx[0]),
+                static_cast<unsigned>(globalThreadIdx[1]),
+                static_cast<unsigned>(globalThreadIdx[2]),
+                static_cast<unsigned>(linearizedGlobalThreadIdx[0]));
+
+            for(size_t i = 0; i < nExclamationMarksAsArg; ++i)
+            {
                 printf("!");
             }
 
             printf("\n");
-
         },
-        nExclamationMarks
-    );
-    alpaka::wait::wait(queue);
+        nExclamationMarks);
+    alpaka::wait(queue);
 
     return EXIT_SUCCESS;
 
diff --git a/thirdParty/cupla/alpaka/example/monteCarloIntegration/CMakeLists.txt b/thirdParty/cupla/alpaka/example/monteCarloIntegration/CMakeLists.txt
new file mode 100644
index 0000000000..ee8c9e0e2c
--- /dev/null
+++ b/thirdParty/cupla/alpaka/example/monteCarloIntegration/CMakeLists.txt
@@ -0,0 +1,61 @@
+#
+# Copyright 2014-2020 Benjamin Worpitz, Jan Stephan
+#
+# This file exemplifies usage of alpaka.
+#
+# Permission to use, copy, modify, and/or distribute this software for any
+# purpose with or without fee is hereby granted, provided that the above
+# copyright notice and this permission notice appear in all copies.
+#
+# THE SOFTWARE IS PROVIDED “AS IS” AND ISC DISCLAIMS ALL WARRANTIES WITH
+# REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL ISC BE LIABLE FOR ANY
+# SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR
+# IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+#
+
+################################################################################
+# Required CMake version; USE_FOLDERS lets targets be grouped by their FOLDER property.
+
+cmake_minimum_required(VERSION 3.15)
+
+set_property(GLOBAL PROPERTY USE_FOLDERS ON)
+
+################################################################################
+# Project, named after the example target.
+
+set(_TARGET_NAME monteCarloIntegration)
+
+project(${_TARGET_NAME})
+
+
+#-------------------------------------------------------------------------------
+# Find alpaka, unless the alpaka::alpaka target already exists.
+
+if(NOT TARGET alpaka::alpaka)
+    option(USE_ALPAKA_SOURCE_TREE "Use alpaka's source tree instead of an alpaka installation" OFF)
+
+    if(USE_ALPAKA_SOURCE_TREE)
+        # Don't build the examples recursively when adding alpaka's source tree
+        set(alpaka_BUILD_EXAMPLES OFF)
+        add_subdirectory("${CMAKE_CURRENT_LIST_DIR}/../.." "${CMAKE_BINARY_DIR}/alpaka")
+    else()
+        find_package(alpaka REQUIRED)
+    endif()
+endif()
+
+#-------------------------------------------------------------------------------
+# Add the executable, link it against alpaka and register it as a test.
+
+alpaka_add_executable(
+    ${_TARGET_NAME}
+    src/monteCarloIntegration.cpp)
+target_link_libraries(
+    ${_TARGET_NAME}
+    PUBLIC alpaka::alpaka)
+
+set_target_properties(${_TARGET_NAME} PROPERTIES FOLDER example)
+
+add_test(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME})
diff --git a/thirdParty/cupla/alpaka/example/monteCarloIntegration/src/monteCarloIntegration.cpp b/thirdParty/cupla/alpaka/example/monteCarloIntegration/src/monteCarloIntegration.cpp
new file mode 100644
index 0000000000..ba1ff8e439
--- /dev/null
+++ b/thirdParty/cupla/alpaka/example/monteCarloIntegration/src/monteCarloIntegration.cpp
@@ -0,0 +1,150 @@
+/* Copyright 2020 Benjamin Worpitz, Sergei Bastrakov, Jakob Krude
+ *
+ * This file exemplifies usage of alpaka.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED “AS IS” AND ISC DISCLAIMS ALL WARRANTIES WITH
+ * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL ISC BE LIABLE FOR ANY
+ * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR
+ * IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <alpaka/alpaka.hpp>
+#include <alpaka/example/ExampleDefaultAcc.hpp>
+
+#include <cstdint>
+#include <cstdlib>
+#include <iostream>
+
+//#############################################################################
+//! This functor defines the function to be integrated: f(x) = sqrt(1 - x * x).
+struct Function
+{
+    //-----------------------------------------------------------------------------
+    //! \tparam TAcc The accelerator environment to be executed on.
+    //! \param acc The accelerator to be executed on.
+    //! \param x The argument; the returned value is sqrt(1 - x * x).
+    template<typename TAcc>
+    ALPAKA_FN_ACC auto operator()(TAcc const& acc, float const x) const -> float
+    {
+        return alpaka::math::sqrt(acc, (1.0f - x * x));
+    }
+};
+
+//#############################################################################
+//! The kernel executing the parallel logic.
+//! Each thread draws pseudo random points and counts those lying below the given function.
+//! The per-thread count is then added to a global counter.
+struct Kernel
+{
+    //-----------------------------------------------------------------------------
+    //! The kernel entry point.
+    //! \tparam TAcc The accelerator environment to be executed on.
+    //! \tparam TFunctor A wrapper for a function.
+    //! \param acc The accelerator to be executed on.
+    //! \param numPoints The total number of points to be calculated.
+    //! \param globalCounter The sum of all local results.
+    //! \param functor The function for which the integral is to be computed.
+    template<typename TAcc, typename TFunctor>
+    ALPAKA_FN_ACC auto operator()(
+        TAcc const& acc,
+        size_t const numPoints,
+        uint32_t* const globalCounter,
+        TFunctor functor) const -> void
+    {
+        // Linear index of this thread in the grid and the total number of grid threads.
+        auto const gridThreadIdx = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc);
+        auto const gridThreadExtent = alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc);
+        auto const linearIdx = alpaka::mapIdx<1u>(gridThreadIdx, gridThreadExtent)[0];
+        auto const numGridThreads = gridThreadExtent.prod();
+
+        // Setup generator and distribution.
+        // Each thread passes its own linear index to the generator; 0: no specific subsequence start.
+        auto generator = alpaka::rand::generator::createDefault(acc, linearIdx, 0);
+        // For simplicity the interval is fixed to [0.0,1.0].
+        auto dist = alpaka::rand::distribution::createUniformReal<float>(acc);
+
+        // Every thread starts at its own index and advances by the number of grid threads.
+        uint32_t hits = 0;
+        for(size_t i = linearIdx; i < numPoints; i += numGridThreads)
+        {
+            // Generate a point in the 2D interval, x first and then y.
+            float const x = dist(generator);
+            float const y = dist(generator);
+            // Count every point that is "below" (or on) the given function.
+            if(y <= functor(acc, x))
+            {
+                ++hits;
+            }
+        }
+
+        // Add the count of this thread to the sum of the other results.
+        alpaka::atomicAdd(acc, globalCounter, hits, alpaka::hierarchy::Blocks{});
+    }
+};
+
+
+auto main() -> int
+{
+    // Type definitions, device selection and queue creation.
+    using Dim = alpaka::DimInt<1>;
+    using Idx = std::size_t;
+    using Vec = alpaka::Vec<Dim, Idx>;
+    using Acc = alpaka::ExampleDefaultAcc<Dim, Idx>;
+    using Host = alpaka::DevCpu;
+    auto const devAcc = alpaka::getDevByIdx<Acc>(0u);
+    auto const devHost = alpaka::getDevByIdx<Host>(0u);
+    using QueueProperty = alpaka::Blocking;
+    using QueueAcc = alpaka::Queue<Acc, QueueProperty>;
+    QueueAcc queue{devAcc};
+
+    using BufHost = alpaka::Buf<Host, uint32_t, Dim, Idx>;
+    using BufAcc = alpaka::Buf<Acc, uint32_t, Dim, Idx>;
+    using WorkDiv = alpaka::WorkDivMembers<Dim, Idx>;
+    // Problem parameters; extent is the number of elements in the counter buffers.
+    constexpr size_t numPoints = 100000000u;
+    constexpr size_t extent = 1u;
+    constexpr size_t numThreads = 100u; // Kernel will decide numCalcPerThread.
+    constexpr size_t numAlpakaElementsPerThread = 1;
+    WorkDiv workdiv{alpaka::getValidWorkDiv<Acc>(
+        devAcc,
+        Vec(numThreads),
+        Vec(numAlpakaElementsPerThread),
+        false,
+        alpaka::GridBlockExtentSubDivRestrictions::Unrestricted)};
+
+    // Setup one-element counter buffers on the host and on the accelerator.
+    BufHost bufHost{alpaka::allocBuf<uint32_t, Idx>(devHost, extent)};
+    uint32_t* const ptrBufHost{alpaka::getPtrNative(bufHost)};
+    BufAcc bufAcc{alpaka::allocBuf<uint32_t, Idx>(devAcc, extent)};
+    uint32_t* const ptrBufAcc{alpaka::getPtrNative(bufAcc)};
+
+    // Initialize the global count to 0 (unsigned literal, the counter is a uint32_t).
+    ptrBufHost[0] = 0u;
+    alpaka::memcpy(queue, bufAcc, bufHost, extent);
+
+    Kernel kernel;
+    alpaka::exec<Acc>(queue, workdiv, kernel, numPoints, ptrBufAcc, Function{});
+    alpaka::memcpy(queue, bufHost, bufAcc, extent);
+    alpaka::wait(queue);
+
+    // Read back the number of points counted by the kernel.
+    uint32_t globalCount = *ptrBufHost;
+
+    // Final result: the fraction of counted points, compared against pi / 4.
+    float finalResult = globalCount / static_cast<float>(numPoints);
+    constexpr double pi = 3.14159265358979323846;
+    constexpr double exactResult = pi / 4.0;
+    auto const error = std::abs(finalResult - exactResult);
+
+    std::cout << "exact result (pi / 4): " << pi / 4.0 << "\n";
+    std::cout << "final result: " << finalResult << "\n";
+    std::cout << "error: " << error << "\n";
+    return error > 0.001 ? EXIT_FAILURE : EXIT_SUCCESS;
+}
diff --git a/thirdParty/cupla/alpaka/example/openMPSchedule/CMakeLists.txt b/thirdParty/cupla/alpaka/example/openMPSchedule/CMakeLists.txt
new file mode 100644
index 0000000000..e75afac733
--- /dev/null
+++ b/thirdParty/cupla/alpaka/example/openMPSchedule/CMakeLists.txt
@@ -0,0 +1,60 @@
+#
+# Copyright 2014-2020 Erik Zenker, Benjamin Worpitz, Jan Stephan, Sergei Bastrakov
+#
+# This file exemplifies usage of alpaka.
+#
+# Permission to use, copy, modify, and/or distribute this software for any
+# purpose with or without fee is hereby granted, provided that the above
+# copyright notice and this permission notice appear in all copies.
+#
+# THE SOFTWARE IS PROVIDED “AS IS” AND ISC DISCLAIMS ALL WARRANTIES WITH
+# REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL ISC BE LIABLE FOR ANY
+# SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR
+# IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+#
+
+################################################################################
+# Required CMake version; USE_FOLDERS lets targets be grouped by their FOLDER property.
+
+cmake_minimum_required(VERSION 3.15)
+
+set_property(GLOBAL PROPERTY USE_FOLDERS ON)
+
+################################################################################
+# Project, named after the example target.
+
+set(_TARGET_NAME openMPSchedule)
+
+project(${_TARGET_NAME})
+
+#-------------------------------------------------------------------------------
+# Find alpaka, unless the alpaka::alpaka target already exists.
+
+if(NOT TARGET alpaka::alpaka)
+    option(USE_ALPAKA_SOURCE_TREE "Use alpaka's source tree instead of an alpaka installation" OFF)
+
+    if(USE_ALPAKA_SOURCE_TREE)
+        # Don't build the examples recursively when adding alpaka's source tree
+        set(alpaka_BUILD_EXAMPLES OFF)
+        add_subdirectory("${CMAKE_CURRENT_LIST_DIR}/../.." "${CMAKE_BINARY_DIR}/alpaka")
+    else()
+        find_package(alpaka REQUIRED)
+    endif()
+endif()
+
+#-------------------------------------------------------------------------------
+# Add the executable, link it against alpaka and register it as a test.
+
+alpaka_add_executable(
+    ${_TARGET_NAME}
+    src/openMPSchedule.cpp)
+target_link_libraries(
+    ${_TARGET_NAME}
+    PUBLIC alpaka::alpaka)
+
+set_target_properties(${_TARGET_NAME} PROPERTIES FOLDER example)
+
+add_test(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME})
diff --git a/thirdParty/cupla/alpaka/example/openMPSchedule/src/openMPSchedule.cpp b/thirdParty/cupla/alpaka/example/openMPSchedule/src/openMPSchedule.cpp
new file mode 100644
index 0000000000..19a9951e9d
--- /dev/null
+++ b/thirdParty/cupla/alpaka/example/openMPSchedule/src/openMPSchedule.cpp
@@ -0,0 +1,164 @@
+/* Copyright 2019-2020 Benjamin Worpitz, Erik Zenker, Sergei Bastrakov
+ *
+ * This file exemplifies usage of alpaka.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED “AS IS” AND ISC DISCLAIMS ALL WARRANTIES WITH
+ * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL ISC BE LIABLE FOR ANY
+ * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR
+ * IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <alpaka/alpaka.hpp>
+#include <alpaka/example/ExampleDefaultAcc.hpp>
+
+#include <cstdint>
+#include <iostream>
+
+// This example only makes sense with alpaka AccCpuOmp2Blocks backend enabled
+// and OpenMP runtime supporting at least 3.0. Disable it for other cases.
+#if defined _OPENMP && _OPENMP >= 200805 && ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLED
+
+//#############################################################################
+//! OpenMP schedule demonstration kernel
+//!
+//! Prints distribution of alpaka thread indices between OpenMP threads.
+//! Its operator() is reused in other kernels of this example.
+//! Sets no schedule explicitly, so the default is used, controlled by the OMP_SCHEDULE environment variable.
+struct OpenMPScheduleDefaultKernel
+{
+    //-----------------------------------------------------------------------------
+    template<typename TAcc>
+    ALPAKA_FN_ACC auto operator()(TAcc const& acc) const -> void
+    {
+        // For simplicity assume 1d index space throughout this example, so only component 0 is read
+        using Idx = alpaka::Idx<TAcc>;
+        Idx const globalThreadIdx = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0];
+
+        // Print work distribution between threads for illustration (index is cast to 32 bit to match %u)
+        printf(
+            "alpaka global thread index %u is processed by OpenMP thread %d\n",
+            static_cast<std::uint32_t>(globalThreadIdx),
+            omp_get_thread_num());
+    }
+};
+
+//#############################################################################
+//! Kernel that sets the schedule via a static member.
+//! We inherit OpenMPScheduleDefaultKernel just to reuse its operator().
+struct OpenMPScheduleMemberKernel : public OpenMPScheduleDefaultKernel
+{
+    //! Static member to set OpenMP schedule to be used by the AccCpuOmp2Blocks accelerator.
+    //! This member is only checked for when the OmpSchedule trait is not specialized for this kernel type.
+    //! Note that constexpr is not required; without it, however, there has to be an external definition.
+    static constexpr auto ompSchedule = alpaka::omp::Schedule{alpaka::omp::Schedule::Static, 1};
+};
+
+//#############################################################################
+//! Kernel that sets the schedule via trait specialization.
+//! We inherit OpenMPScheduleDefaultKernel just to reuse its operator().
+//! The schedule trait specialization is given underneath this struct.
+//! It has a higher priority than the internal static member.
+struct OpenMPScheduleTraitKernel : public OpenMPScheduleDefaultKernel
+{
+};
+
+namespace alpaka
+{
+    namespace traits
+    {
+        //! Schedule trait specialization for OpenMPScheduleTraitKernel.
+        //! This is the most general way to define a schedule.
+        //! In case neither the trait nor the member are provided, alpaka does not set any runtime schedule and the
+        //! schedule used is defined by omp_set_schedule() called on the user side, or otherwise by the OMP_SCHEDULE
+        //! environment variable.
+        template<typename TAcc>
+        struct OmpSchedule<OpenMPScheduleTraitKernel, TAcc>
+        {
+            template<typename TDim, typename... TArgs>
+            ALPAKA_FN_HOST static auto getOmpSchedule(
+                OpenMPScheduleTraitKernel const& kernelFnObj,
+                Vec<TDim, Idx<TAcc>> const& blockThreadExtent,
+                Vec<TDim, Idx<TAcc>> const& threadElemExtent,
+                TArgs const&... args) -> alpaka::omp::Schedule
+            {
+                // The schedule may depend on the kernel and run parameters; this example ignores them all.
+                // For this particular example kernel, TArgs is an empty pack and can be removed.
+                alpaka::ignore_unused(kernelFnObj);
+                alpaka::ignore_unused(blockThreadExtent);
+                alpaka::ignore_unused(threadElemExtent);
+                alpaka::ignore_unused(args...);
+
+                return alpaka::omp::Schedule{alpaka::omp::Schedule::Dynamic, 2};
+            }
+        };
+    } // namespace traits
+} // namespace alpaka
+
+auto main() -> int
+{
+// Fallback for the CI with disabled sequential backend
+#    if defined(ALPAKA_CI) && !defined(ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED)
+    return EXIT_SUCCESS;
+#    else
+    using Idx = std::size_t;
+
+    // The OpenMP schedule illustrated by this example only has an effect
+    // with the AccCpuOmp2Blocks accelerator.
+    // This example also assumes 1d for simplicity.
+    using Acc = alpaka::AccCpuOmp2Blocks<alpaka::DimInt<1>, Idx>;
+    std::cout << "Using alpaka accelerator: " << alpaka::getAccName<Acc>() << std::endl;
+
+    // Defines the synchronization behavior of a queue
+    using QueueProperty = alpaka::Blocking;
+    using Queue = alpaka::Queue<Acc, QueueProperty>;
+
+    // Select a device
+    auto const devAcc = alpaka::getDevByIdx<Acc>(0u);
+
+    // Create a queue on the device
+    Queue queue(devAcc);
+
+    // Define the work division
+    Idx const threadsPerGrid = 16u;
+    Idx const elementsPerThread = 1u;
+    auto const workDiv = alpaka::getValidWorkDiv<Acc>(
+        devAcc,
+        threadsPerGrid,
+        elementsPerThread,
+        false,
+        alpaka::GridBlockExtentSubDivRestrictions::Unrestricted);
+
+    // Run the kernel setting no schedule explicitly.
+    // In this case the schedule is controlled by the OMP_SCHEDULE environment variable.
+    std::cout << "OpenMPScheduleDefaultKernel setting no schedule explicitly:\n";
+    alpaka::exec<Acc>(queue, workDiv, OpenMPScheduleDefaultKernel{});
+    alpaka::wait(queue);
+
+    // Run the kernel setting the schedule via a static member
+    std::cout << "\n\nOpenMPScheduleMemberKernel setting the schedule via a static member:\n";
+    alpaka::exec<Acc>(queue, workDiv, OpenMPScheduleMemberKernel{});
+    alpaka::wait(queue);
+
+    // Run the kernel setting the schedule via a trait
+    std::cout << "\n\nOpenMPScheduleTraitKernel setting the schedule via trait:\n";
+    alpaka::exec<Acc>(queue, workDiv, OpenMPScheduleTraitKernel{});
+    alpaka::wait(queue);
+
+    return EXIT_SUCCESS;
+#    endif
+}
+#else
+auto main() -> int
+{
+    std::cout << "This example is disabled, as it requires OpenMP runtime version >= 3.0 and alpaka accelerator"
+              << " AccCpuOmp2Blocks\n";
+    return EXIT_SUCCESS;
+}
+#endif
diff --git a/thirdParty/cupla/alpaka/example/reduce/CMakeLists.txt b/thirdParty/cupla/alpaka/example/reduce/CMakeLists.txt
index beda0ef2f5..9e7e3f1f7d 100644
--- a/thirdParty/cupla/alpaka/example/reduce/CMakeLists.txt
+++ b/thirdParty/cupla/alpaka/example/reduce/CMakeLists.txt
@@ -1,7 +1,7 @@
 #
-# Copyright 2014-2019 Erik Zenker, Benjamin Worpitz
+# Copyright 2014-2020 Erik Zenker, Benjamin Worpitz, Jan Stephan
 #
-# This file exemplifies usage of Alpaka.
+# This file exemplifies usage of alpaka.
 #
 # Permission to use, copy, modify, and/or distribute this software for any
 # purpose with or without fee is hereby granted, provided that the above
@@ -19,47 +19,45 @@
 ################################################################################
 # Required CMake version.
 
-CMAKE_MINIMUM_REQUIRED(VERSION 3.11.4)
+cmake_minimum_required(VERSION 3.15)
 
-SET_PROPERTY(GLOBAL PROPERTY USE_FOLDERS ON)
+set_property(GLOBAL PROPERTY USE_FOLDERS ON)
 
 ################################################################################
 # Project.
 
-SET(_TARGET_NAME reduce)
+set(_TARGET_NAME reduce)
 
-PROJECT(${_TARGET_NAME})
-
-################################################################################
-# CMake policies
-#
-# Search in <PackageName>_ROOT:
-#   https://cmake.org/cmake/help/v3.12/policy/CMP0074.html
-
-if(POLICY CMP0074)
-    cmake_policy(SET CMP0074 NEW)
-endif()
+project(${_TARGET_NAME})
 
 #-------------------------------------------------------------------------------
 # Find alpaka.
 
-SET(ALPAKA_ROOT "${CMAKE_CURRENT_LIST_DIR}/../../" CACHE STRING "The location of the alpaka library")
-LIST(APPEND CMAKE_MODULE_PATH "${ALPAKA_ROOT}")
-FIND_PACKAGE(alpaka REQUIRED)
+if(NOT TARGET alpaka::alpaka)
+    option(USE_ALPAKA_SOURCE_TREE "Use alpaka's source tree instead of an alpaka installation" OFF)
+
+    if(USE_ALPAKA_SOURCE_TREE)
+        # Don't build the examples recursively
+        set(alpaka_BUILD_EXAMPLES OFF)
+        add_subdirectory("${CMAKE_CURRENT_LIST_DIR}/../.." "${CMAKE_BINARY_DIR}/alpaka")
+    else()
+        find_package(alpaka REQUIRED)
+    endif()
+endif()
 
 #-------------------------------------------------------------------------------
 # Add executable.
 
-ALPAKA_ADD_EXECUTABLE(
+alpaka_add_executable(
     ${_TARGET_NAME}
     src/alpakaConfig.hpp
     src/iterator.hpp
     src/kernel.hpp
     src/reduce.cpp)
-TARGET_LINK_LIBRARIES(
+target_link_libraries(
     ${_TARGET_NAME}
-    PUBLIC alpaka)
+    PUBLIC alpaka::alpaka)
 
-SET_TARGET_PROPERTIES(${_TARGET_NAME} PROPERTIES FOLDER example)
+set_target_properties(${_TARGET_NAME} PROPERTIES FOLDER example)
 
-ADD_TEST(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME})
+add_test(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME})
diff --git a/thirdParty/cupla/alpaka/example/reduce/src/alpakaConfig.hpp b/thirdParty/cupla/alpaka/example/reduce/src/alpakaConfig.hpp
index d3bd018270..b614b7a81e 100644
--- a/thirdParty/cupla/alpaka/example/reduce/src/alpakaConfig.hpp
+++ b/thirdParty/cupla/alpaka/example/reduce/src/alpakaConfig.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Jonas Schenke
  *
- * This file exemplifies usage of Alpaka.
+ * This file exemplifies usage of alpaka.
  *
  * Permission to use, copy, modify, and/or distribute this software for any
  * purpose with or without fee is hereby granted, provided that the above
@@ -18,13 +18,14 @@
 #pragma once
 
 #include "iterator.hpp"
+
 #include <alpaka/alpaka.hpp>
 
 // Defines for dimensions and types.
-using Dim = alpaka::dim::DimInt<1u>;
+using Dim = alpaka::DimInt<1u>;
 using Idx = uint64_t;
 using Extent = uint64_t;
-using WorkDiv = alpaka::workdiv::WorkDivMembers<Dim, Extent>;
+using WorkDiv = alpaka::WorkDivMembers<Dim, Extent>;
 
 //-----------------------------------------------------------------------------
 //! Returns the supplied number or the maxumim number of threads per block for a
@@ -32,11 +33,10 @@ using WorkDiv = alpaka::workdiv::WorkDivMembers<Dim, Extent>;
 //!
 //! \tparam TAcc The accelerator object.
 //! \tparam TSize The desired size.
-template <typename TAcc, uint64_t TSize>
+template<typename TAcc, uint64_t TSize>
 static constexpr uint64_t getMaxBlockSize()
 {
-    return (TAcc::MaxBlockSize::value > TSize) ? TSize
-                                               : TAcc::MaxBlockSize::value;
+    return (TAcc::MaxBlockSize::value > TSize) ? TSize : TAcc::MaxBlockSize::value;
 }
 
 //#############################################################################
@@ -47,7 +47,7 @@ static constexpr uint64_t getMaxBlockSize()
 //! \tparam TAcc The accelerator type.
 //!
 //! Defines the appropriate iterator for an accelerator.
-template <typename T, typename TBuf, typename TAcc>
+template<typename T, typename TBuf, typename TAcc>
 struct GetIterator
 {
     using Iterator = IteratorCpu<TAcc, T, TBuf>;
@@ -62,51 +62,44 @@ struct GetIterator
 //! Defines Host, Device, etc. for the OpenMP 2 Blocks accelerator.
 struct CpuOmp2Blocks
 {
-    using Host = alpaka::acc::AccCpuOmp2Blocks<Dim, Extent>;
-    using Acc = alpaka::acc::AccCpuOmp2Blocks<Dim, Extent>;
-    using DevHost = alpaka::dev::Dev<Host>;
-    using DevAcc = alpaka::dev::Dev<Acc>;
-    using PltfHost = alpaka::pltf::Pltf<DevHost>;
-    using PltfAcc = alpaka::pltf::Pltf<DevAcc>;
-    using Stream = alpaka::queue::QueueCpuBlocking;
-    using Event = alpaka::event::Event<Stream>;
-    using SmCount = alpaka::dim::DimInt<1u>;
-    using MaxBlockSize = alpaka::dim::DimInt<1u>;
+    using Host = alpaka::AccCpuOmp2Blocks<Dim, Extent>;
+    using Acc = alpaka::AccCpuOmp2Blocks<Dim, Extent>;
+    using SmCount = alpaka::DimInt<1u>;
+    using MaxBlockSize = alpaka::DimInt<1u>;
 };
 
-template <typename T, typename TBuf, typename... TArgs>
-struct GetIterator<T, TBuf, alpaka::acc::AccCpuOmp2Blocks<TArgs...>>
+template<typename T, typename TBuf, typename... TArgs>
+struct GetIterator<T, TBuf, alpaka::AccCpuOmp2Blocks<TArgs...>>
 {
-    using Iterator =
-        IteratorCpu<alpaka::acc::AccCpuOmp2Blocks<TArgs...>, T, TBuf>;
+    using Iterator = IteratorCpu<alpaka::AccCpuOmp2Blocks<TArgs...>, T, TBuf>;
 };
 #endif
 
-#ifdef ALPAKA_ACC_CPU_BT_OMP4_ENABLED
-#ifdef ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED
+#ifdef ALPAKA_ACC_ANY_BT_OMP5_ENABLED
+#    ifdef ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED
 //#############################################################################
-//! OpenMP 4 defines
+//! OpenMP 5 defines
 //!
-//! Defines Host, Device, etc. for the OpenMP 4 accelerator.
-struct CpuOmp4
+//! Defines Host, Device, etc. for the OpenMP 5 accelerator.
+struct Omp5
 {
-    using Host = alpaka::acc::AccCpuSerial<Dim, Extent>;
-    using Acc = alpaka::acc::AccCpuOmp4<Dim, Extent>;
-    using DevHost = alpaka::dev::Dev<Host>;
-    using DevAcc = alpaka::dev::Dev<Acc>;
-    using PltfHost = alpaka::pltf::Pltf<DevHost>;
-    using PltfAcc = alpaka::pltf::Pltf<DevAcc>;
-    using Stream = alpaka::queue::QueueCpuBlocking;
-    using Event = alpaka::event::Event<Stream>;
-    using MaxBlockSize = alpaka::dim::DimInt<1u>;
+    using Host = alpaka::AccCpuSerial<Dim, Extent>;
+    using Acc = alpaka::AccOmp5<Dim, Extent>;
+    using DevHost = alpaka::Dev<Host>;
+    using DevAcc = alpaka::Dev<Acc>;
+    using PltfHost = alpaka::Pltf<DevHost>;
+    using PltfAcc = alpaka::Pltf<DevAcc>;
+    using Stream = alpaka::QueueCpuBlocking;
+    using Event = alpaka::Event<Stream>;
+    using MaxBlockSize = alpaka::DimInt<1u>;
 };
 
-template <typename T, typename TBuf, typename... TArgs>
-struct GetIterator<T, TBuf, alpaka::acc::AccCpuOmp4<TArgs...>>
+template<typename T, typename TBuf, typename... TArgs>
+struct GetIterator<T, TBuf, alpaka::AccOmp5<TArgs...>>
 {
-    using Iterator = IteratorCpu<alpaka::acc::AccCpuOmp4<TArgs...>, T, TBuf>;
+    using Iterator = IteratorCpu<alpaka::AccOmp5<TArgs...>, T, TBuf>;
 };
-#endif
+#    endif
 #endif
 
 #ifdef ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED
@@ -116,21 +109,15 @@ struct GetIterator<T, TBuf, alpaka::acc::AccCpuOmp4<TArgs...>>
 //! Defines Host, Device, etc. for the serial CPU accelerator.
 struct CpuSerial
 {
-    using Host = alpaka::acc::AccCpuSerial<Dim, Extent>;
-    using Acc = alpaka::acc::AccCpuSerial<Dim, Extent>;
-    using DevHost = alpaka::dev::Dev<Host>;
-    using DevAcc = alpaka::dev::Dev<Acc>;
-    using PltfHost = alpaka::pltf::Pltf<DevHost>;
-    using PltfAcc = alpaka::pltf::Pltf<DevAcc>;
-    using Stream = alpaka::queue::QueueCpuBlocking;
-    using Event = alpaka::event::Event<Stream>;
-    using MaxBlockSize = alpaka::dim::DimInt<1u>;
+    using Host = alpaka::AccCpuSerial<Dim, Extent>;
+    using Acc = alpaka::AccCpuSerial<Dim, Extent>;
+    using MaxBlockSize = alpaka::DimInt<1u>;
 };
 
-template <typename T, typename TBuf, typename... TArgs>
-struct GetIterator<T, TBuf, alpaka::acc::AccCpuSerial<TArgs...>>
+template<typename T, typename TBuf, typename... TArgs>
+struct GetIterator<T, TBuf, alpaka::AccCpuSerial<TArgs...>>
 {
-    using Iterator = IteratorCpu<alpaka::acc::AccCpuSerial<TArgs...>, T, TBuf>;
+    using Iterator = IteratorCpu<alpaka::AccCpuSerial<TArgs...>, T, TBuf>;
 };
 #endif
 
@@ -141,47 +128,35 @@ struct GetIterator<T, TBuf, alpaka::acc::AccCpuSerial<TArgs...>>
 //! Defines Host, Device, etc. for the CPU Threads accelerator.
 struct CpuThreads
 {
-    using Host = alpaka::acc::AccCpuThreads<Dim, Extent>;
-    using Acc = alpaka::acc::AccCpuThreads<Dim, Extent>;
-    using DevHost = alpaka::dev::Dev<Host>;
-    using DevAcc = alpaka::dev::Dev<Acc>;
-    using PltfHost = alpaka::pltf::Pltf<DevHost>;
-    using PltfAcc = alpaka::pltf::Pltf<DevAcc>;
-    using Stream = alpaka::queue::QueueCpuBlocking;
-    using Event = alpaka::event::Event<Stream>;
-    using MaxBlockSize = alpaka::dim::DimInt<1u>;
+    using Host = alpaka::AccCpuThreads<Dim, Extent>;
+    using Acc = alpaka::AccCpuThreads<Dim, Extent>;
+    using MaxBlockSize = alpaka::DimInt<1u>;
 };
 
-template <typename T, typename TBuf, typename... TArgs>
-struct GetIterator<T, TBuf, alpaka::acc::AccCpuThreads<TArgs...>>
+template<typename T, typename TBuf, typename... TArgs>
+struct GetIterator<T, TBuf, alpaka::AccCpuThreads<TArgs...>>
 {
-    using Iterator = IteratorCpu<alpaka::acc::AccCpuThreads<TArgs...>, T, TBuf>;
+    using Iterator = IteratorCpu<alpaka::AccCpuThreads<TArgs...>, T, TBuf>;
 };
 #endif
 
 #ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
-#ifdef ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED
+#    ifdef ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED
 //#############################################################################
 //! CUDA defines
 //!
-//! Defines Host, Device, etc. for the CUDA accelerator.
+//! Defines Host, Device, etc. for the CUDA/HIP accelerator.
 struct GpuCudaRt
 {
-    using Host = alpaka::acc::AccCpuSerial<Dim, Extent>;
-    using Acc = alpaka::acc::AccGpuCudaRt<Dim, Extent>;
-    using DevHost = alpaka::dev::Dev<Host>;
-    using DevAcc = alpaka::dev::Dev<Acc>;
-    using PltfHost = alpaka::pltf::Pltf<DevHost>;
-    using PltfAcc = alpaka::pltf::Pltf<DevAcc>;
-    using Stream = alpaka::queue::QueueCudaRtNonBlocking;
-    using Event = alpaka::event::Event<Stream>;
-    using MaxBlockSize = alpaka::dim::DimInt<1024u>;
+    using Host = alpaka::AccCpuSerial<Dim, Extent>;
+    using Acc = alpaka::AccGpuCudaRt<Dim, Extent>;
+    using MaxBlockSize = alpaka::DimInt<1024u>;
 };
 
-template <typename T, typename TBuf, typename... TArgs>
-struct GetIterator<T, TBuf, alpaka::acc::AccGpuCudaRt<TArgs...>>
+template<typename T, typename TBuf, typename... TArgs>
+struct GetIterator<T, TBuf, alpaka::AccGpuUniformCudaHipRt<TArgs...>>
 {
-    using Iterator = IteratorGpu<alpaka::acc::AccGpuCudaRt<TArgs...>, T, TBuf>;
+    using Iterator = IteratorGpu<alpaka::AccGpuUniformCudaHipRt<TArgs...>, T, TBuf>;
 };
-#endif
+#    endif
 #endif
diff --git a/thirdParty/cupla/alpaka/example/reduce/src/iterator.hpp b/thirdParty/cupla/alpaka/example/reduce/src/iterator.hpp
index 15c3da3097..434b561277 100644
--- a/thirdParty/cupla/alpaka/example/reduce/src/iterator.hpp
+++ b/thirdParty/cupla/alpaka/example/reduce/src/iterator.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Jonas Schenke
  *
- * This file exemplifies usage of Alpaka.
+ * This file exemplifies usage of alpaka.
  *
  * Permission to use, copy, modify, and/or distribute this software for any
  * purpose with or without fee is hereby granted, provided that the above
@@ -24,11 +24,11 @@
 //!
 //! \tparam T The type.
 //! \tparam TBuf The buffer type (standard is T).
-template <typename T, typename TBuf = T>
+template<typename T, typename TBuf = T>
 class Iterator
 {
 protected:
-    const TBuf *mData;
+    const TBuf* mData;
     uint64_t mIndex;
     const uint64_t mMaximum;
 
@@ -39,10 +39,10 @@ class Iterator
     //! \param data A pointer to the data.
     //! \param index The index.
     //! \param maximum The first index outside of the iterator memory.
-    ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE Iterator(const TBuf *data,
-                                                 uint32_t index,
-                                                 uint64_t maximum)
-        : mData(data), mIndex(index), mMaximum(maximum)
+    ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE Iterator(const TBuf* data, uint32_t index, uint64_t maximum)
+        : mData(data)
+        , mIndex(index)
+        , mMaximum(maximum)
     {
     }
 
@@ -50,7 +50,7 @@ class Iterator
     //! Constructor.
     //!
     //! \param other The other iterator object.
-    ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE Iterator(const Iterator &other) = default;
+    ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE Iterator(const Iterator& other) = default;
 
     //-----------------------------------------------------------------------------
     //! Compare operator.
@@ -58,11 +58,9 @@ class Iterator
     //! \param other The other object.
     //!
     //! Returns true if objects are equal and false otherwise.
-    ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE auto
-    operator==(const Iterator &other) const -> bool
+    ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE auto operator==(const Iterator& other) const -> bool
     {
-        return (this->mData == other.mData) && (this->mIndex == other.mIndex) &&
-               (this->mMaximum == other.mMaximum);
+        return (this->mData == other.mData) && (this->mIndex == other.mIndex) && (this->mMaximum == other.mMaximum);
     }
 
     //-----------------------------------------------------------------------------
@@ -71,8 +69,7 @@ class Iterator
     //! \param other The other object.
     //!
     //! Returns false if objects are equal and true otherwise.
-    ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE auto
-    operator!=(const Iterator &other) const -> bool
+    ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE auto operator!=(const Iterator& other) const -> bool
     {
         return !operator==(other);
     }
@@ -84,8 +81,7 @@ class Iterator
     //!
     //! Returns false if the other object is equal or smaller and true
     //! otherwise.
-    ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE auto
-    operator<(const Iterator &other) const -> bool
+    ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE auto operator<(const Iterator& other) const -> bool
     {
         return mIndex < other.mIndex;
     }
@@ -96,8 +92,7 @@ class Iterator
     //! \param other The other object.
     //!
     //! Returns false if the other object is equal or bigger and true otherwise.
-    ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE auto
-    operator>(const Iterator &other) const -> bool
+    ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE auto operator>(const Iterator& other) const -> bool
     {
         return mIndex > other.mIndex;
     }
@@ -108,8 +103,7 @@ class Iterator
     //! \param other The other object.
     //!
     //! Returns true if the other object is equal or bigger and false otherwise.
-    ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE auto
-    operator<=(const Iterator &other) const -> bool
+    ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE auto operator<=(const Iterator& other) const -> bool
     {
         return mIndex <= other.mIndex;
     }
@@ -121,8 +115,7 @@ class Iterator
     //!
     //! Returns true if the other object is equal or smaller and false
     //! otherwise.
-    ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE auto
-    operator>=(const Iterator &other) const -> bool
+    ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE auto operator>=(const Iterator& other) const -> bool
     {
         return mIndex >= other.mIndex;
     }
@@ -131,7 +124,7 @@ class Iterator
     //! Returns the current element.
     //!
     //! Returns a reference to the current index.
-    ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE auto operator*() -> const T &
+    ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE auto operator*() -> const T&
     {
         return mData[mIndex];
     }
@@ -143,7 +136,7 @@ class Iterator
 //! \tparam TAcc The accelerator type.
 //! \tparam T The type.
 //! \tparam TBuf The buffer type (standard is T).
-template <typename TAcc, typename T, typename TBuf = T>
+template<typename TAcc, typename T, typename TBuf = T>
 class IteratorCpu : public Iterator<T, TBuf>
 {
 public:
@@ -155,17 +148,13 @@ class IteratorCpu : public Iterator<T, TBuf>
     //! \param linearizedIndex The linearized index.
     //! \param gridSize The grid size.
     //! \param n The problem size.
-    ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE IteratorCpu(const TAcc &acc,
-                                                    const TBuf *data,
-                                                    uint32_t linearizedIndex,
-                                                    uint32_t gridSize,
-                                                    uint64_t n)
+    ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE
+    IteratorCpu(const TAcc& acc, const TBuf* data, uint32_t linearizedIndex, uint32_t gridSize, uint64_t n)
         : Iterator<T, TBuf>(
-              data,
-              static_cast<uint32_t>((n * linearizedIndex) / 
-                                    alpaka::math::min(acc, static_cast<uint64_t>(gridSize), n)),
-              static_cast<uint32_t>((n * (linearizedIndex + 1)) / 
-                  alpaka::math::min(acc, static_cast<uint64_t>(gridSize), n)))
+            data,
+            static_cast<uint32_t>((n * linearizedIndex) / alpaka::math::min(acc, static_cast<uint64_t>(gridSize), n)),
+            static_cast<uint32_t>(
+                (n * (linearizedIndex + 1)) / alpaka::math::min(acc, static_cast<uint64_t>(gridSize), n)))
     {
     }
 
@@ -183,7 +172,7 @@ class IteratorCpu : public Iterator<T, TBuf>
     //! element.
     //!
     //! Returns a reference to the next index.
-    ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE auto operator++() -> IteratorCpu &
+    ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE auto operator++() -> IteratorCpu&
     {
         ++(this->mIndex);
         return *this;
@@ -206,7 +195,7 @@ class IteratorCpu : public Iterator<T, TBuf>
     //! element.
     //!
     //! Returns a reference to the previous index.
-    ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE auto operator--() -> IteratorCpu &
+    ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE auto operator--() -> IteratorCpu&
     {
         --(this->mIndex);
         return *this;
@@ -228,8 +217,7 @@ class IteratorCpu : public Iterator<T, TBuf>
     //! Returns the index + a supplied offset.
     //!
     //! \param n The offset.
-    ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE auto operator+(uint64_t n) const
-        -> IteratorCpu
+    ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE auto operator+(uint64_t n) const -> IteratorCpu
     {
         IteratorCpu ret(*this);
         ret.mIndex += n;
@@ -240,8 +228,7 @@ class IteratorCpu : public Iterator<T, TBuf>
     //! Returns the index - a supplied offset.
     //!
     //! \param n The offset.
-    ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE auto operator-(uint64_t n) const
-        -> IteratorCpu
+    ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE auto operator-(uint64_t n) const -> IteratorCpu
     {
         IteratorCpu ret(*this);
         ret.mIndex -= n;
@@ -254,8 +241,7 @@ class IteratorCpu : public Iterator<T, TBuf>
     //! \param offset The offset.
     //!
     //! Returns the current object offset by the offset.
-    ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE auto operator+=(uint64_t offset)
-        -> IteratorCpu &
+    ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE auto operator+=(uint64_t offset) -> IteratorCpu&
     {
         this->mIndex += offset;
         return *this;
@@ -267,8 +253,7 @@ class IteratorCpu : public Iterator<T, TBuf>
     //! \param offset The offset.
     //!
     //! Returns the current object offset by the offset.
-    ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE auto operator-=(uint64_t offset)
-        -> IteratorCpu &
+    ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE auto operator-=(uint64_t offset) -> IteratorCpu&
     {
         this->mIndex -= offset;
         return *this;
@@ -281,7 +266,7 @@ class IteratorCpu : public Iterator<T, TBuf>
 //! \tparam TAcc The accelerator type.
 //! \tparam T The type.
 //! \tparam TBuf The buffer type (standard is T).
-template <typename TAcc, typename T, typename TBuf = T>
+template<typename TAcc, typename T, typename TBuf = T>
 class IteratorGpu : public Iterator<T, TBuf>
 {
 private:
@@ -295,12 +280,10 @@ class IteratorGpu : public Iterator<T, TBuf>
     //! \param linearizedIndex The linearized index.
     //! \param gridSize The grid size.
     //! \param n The problem size.
-    ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE IteratorGpu(const TAcc &,
-                                                    const TBuf *data,
-                                                    uint32_t linearizedIndex,
-                                                    uint32_t gridSize,
-                                                    uint64_t n)
-        : Iterator<T, TBuf>(data, linearizedIndex, n), mGridSize(gridSize)
+    ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE
+    IteratorGpu(const TAcc&, const TBuf* data, uint32_t linearizedIndex, uint32_t gridSize, uint64_t n)
+        : Iterator<T, TBuf>(data, linearizedIndex, n)
+        , mGridSize(gridSize)
     {
     }
 
@@ -318,7 +301,7 @@ class IteratorGpu : public Iterator<T, TBuf>
     //! element.
     //!
     //! Returns a reference to the next index.
-    ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE auto operator++() -> IteratorGpu &
+    ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE auto operator++() -> IteratorGpu&
     {
         this->mIndex += this->mGridSize;
         return *this;
@@ -341,7 +324,7 @@ class IteratorGpu : public Iterator<T, TBuf>
     //! element.
     //!
     //! Returns a reference to the previous index.
-    ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE auto operator--() -> IteratorGpu &
+    ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE auto operator--() -> IteratorGpu&
     {
         this->mIndex -= this->mGridSize;
         return *this;
@@ -363,8 +346,7 @@ class IteratorGpu : public Iterator<T, TBuf>
     //! Returns the index + a supplied offset.
     //!
     //! \param n The offset.
-    ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE auto operator+(uint64_t n) const
-        -> IteratorGpu
+    ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE auto operator+(uint64_t n) const -> IteratorGpu
     {
         auto ret(*this);
         ret.mIndex += n * mGridSize;
@@ -375,8 +357,7 @@ class IteratorGpu : public Iterator<T, TBuf>
     //! Returns the index - a supplied offset.
     //!
     //! \param n The offset.
-    ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE auto operator-(uint64_t n) const
-        -> IteratorGpu
+    ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE auto operator-(uint64_t n) const -> IteratorGpu
     {
         auto ret(*this);
         ret.mIndex -= n * mGridSize;
@@ -389,8 +370,7 @@ class IteratorGpu : public Iterator<T, TBuf>
     //! \param offset The offset.
     //!
     //! Returns the current object offset by the offset.
-    ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE auto operator+=(uint64_t offset)
-        -> IteratorGpu &
+    ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE auto operator+=(uint64_t offset) -> IteratorGpu&
     {
         this->mIndex += offset * this->mGridSize;
         return *this;
@@ -402,8 +382,7 @@ class IteratorGpu : public Iterator<T, TBuf>
     //! \param offset The offset.
     //!
     //! Returns the current object offset by the offset.
-    ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE auto operator-=(uint64_t offset)
-        -> IteratorGpu &
+    ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE auto operator-=(uint64_t offset) -> IteratorGpu&
     {
         this->mIndex -= offset * this->mGridSize;
         return *this;
diff --git a/thirdParty/cupla/alpaka/example/reduce/src/kernel.hpp b/thirdParty/cupla/alpaka/example/reduce/src/kernel.hpp
index 63193a9976..59a04a60f4 100644
--- a/thirdParty/cupla/alpaka/example/reduce/src/kernel.hpp
+++ b/thirdParty/cupla/alpaka/example/reduce/src/kernel.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Jonas Schenke
  *
- * This file exemplifies usage of Alpaka.
+ * This file exemplifies usage of alpaka.
  *
  * Permission to use, copy, modify, and/or distribute this software for any
  * purpose with or without fee is hereby granted, provided that the above
@@ -21,7 +21,7 @@
 
 //#############################################################################
 //! A cheap wrapper around a C-style array in heap memory.
-template <typename T, uint64_t size>
+template<typename T, uint64_t size>
 struct cheapArray
 {
     T data[size];
@@ -32,7 +32,7 @@ struct cheapArray
     //! \param index The index of the element to be accessed.
     //!
     //! Returns the requested element per reference.
-    ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE T &operator[](uint64_t index)
+    ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE T& operator[](uint64_t index)
     {
         return data[index];
     }
@@ -43,7 +43,7 @@ struct cheapArray
     //! \param index The index of the element to be accessed.
     //!
     //! Returns the requested element per constant reference.
-    ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE const T &operator[](uint64_t index) const
+    ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE const T& operator[](uint64_t index) const
     {
         return data[index];
     }
@@ -55,7 +55,7 @@ struct cheapArray
 //! \tparam TBlockSize The block size.
 //! \tparam T The data type.
 //! \tparam TFunc The Functor type for the reduction function.
-template <uint32_t TBlockSize, typename T, typename TFunc>
+template<uint32_t TBlockSize, typename T, typename TFunc>
 struct ReduceKernel
 {
     ALPAKA_NO_HOST_ACC_WARNING
@@ -72,34 +72,28 @@ struct ReduceKernel
     //! \param destination The destination memory.
     //! \param n The problem size.
     //! \param func The reduction function.
-    template <typename TAcc, typename TElem, typename TIdx>
-    ALPAKA_FN_ACC auto operator()(TAcc const &acc,
-                                  TElem const *const source,
-                                  TElem *destination,
-                                  TIdx const &n,
-                                  TFunc func) const -> void
+    template<typename TAcc, typename TElem, typename TIdx>
+    ALPAKA_FN_ACC auto operator()(
+        TAcc const& acc,
+        TElem const* const source,
+        TElem* destination,
+        TIdx const& n,
+        TFunc func) const -> void
     {
-        auto &sdata(
-            alpaka::block::shared::st::allocVar<cheapArray<T, TBlockSize>,
-                                                __COUNTER__>(acc));
+        auto& sdata(alpaka::declareSharedVar<cheapArray<T, TBlockSize>, __COUNTER__>(acc));
 
-        const uint32_t blockIndex(static_cast<uint32_t>(
-            alpaka::idx::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[0]));
-        const uint32_t threadIndex(static_cast<uint32_t>(
-            alpaka::idx::getIdx<alpaka::Block, alpaka::Threads>(acc)[0]));
-        const uint32_t gridDimension(static_cast<uint32_t>(
-            alpaka::workdiv::getWorkDiv<alpaka::Grid, alpaka::Blocks>(acc)[0]));
+        const uint32_t blockIndex(static_cast<uint32_t>(alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[0]));
+        const uint32_t threadIndex(static_cast<uint32_t>(alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0]));
+        const uint32_t gridDimension(static_cast<uint32_t>(alpaka::getWorkDiv<alpaka::Grid, alpaka::Blocks>(acc)[0]));
 
         // equivalent to blockIndex * TBlockSize + threadIndex
-        const uint32_t linearizedIndex(static_cast<uint32_t>(
-            alpaka::idx::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0]));
+        const uint32_t linearizedIndex(static_cast<uint32_t>(alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0]));
 
-        typename GetIterator<T, TElem, TAcc>::Iterator it(
-            acc, source, linearizedIndex, gridDimension * TBlockSize, n);
+        typename GetIterator<T, TElem, TAcc>::Iterator it(acc, source, linearizedIndex, gridDimension * TBlockSize, n);
 
         T result = 0; // suppresses compiler warnings
 
-        if (threadIndex < n)
+        if(threadIndex < n)
             result = *(it++); // avoids using the
                               // neutral element of specific
 
@@ -111,55 +105,46 @@ struct ReduceKernel
         // the thread of our block reduces its 4 grid-neighbored threads and
         // advances by grid-striding loop (maybe 128bit load improve perf)
 
-        while (it + 3 < it.end())
+        while(it + 3 < it.end())
         {
-            result = func(
-                func(func(result, func(*it, *(it + 1))), *(it + 2)), *(it + 3));
+            result = func(func(func(result, func(*it, *(it + 1))), *(it + 2)), *(it + 3));
             it += 4;
         }
 
         // doing the remaining blocks
-        while (it < it.end())
+        while(it < it.end())
             result = func(result, *(it++));
 
-        if (threadIndex < n)
+        if(threadIndex < n)
             sdata[threadIndex] = result;
 
-        alpaka::block::sync::syncBlockThreads(acc);
+        alpaka::syncBlockThreads(acc);
 
         // --------
         // Level 2: block + warp reduce, reading from shared memory
         // --------
 
         ALPAKA_UNROLL()
-        for (uint32_t currentBlockSize = TBlockSize,
-                      currentBlockSizeUp =
-                          (TBlockSize + 1) / 2; // ceil(TBlockSize/2.0)
-             currentBlockSize > 1;
-             currentBlockSize = currentBlockSize / 2,
-                      currentBlockSizeUp = (currentBlockSize + 1) /
-                                           2) // ceil(currentBlockSize/2.0)
+        for(uint32_t currentBlockSize = TBlockSize,
+                     currentBlockSizeUp = (TBlockSize + 1) / 2; // ceil(TBlockSize/2.0)
+            currentBlockSize > 1;
+            currentBlockSize = currentBlockSize / 2,
+                     currentBlockSizeUp = (currentBlockSize + 1) / 2) // ceil(currentBlockSize/2.0)
         {
-            bool cond =
-                threadIndex < currentBlockSizeUp // only first half of block
-                                                 // is working
-                && (threadIndex + currentBlockSizeUp) <
-                       TBlockSize // index for second half must be in bounds
-                && (blockIndex * TBlockSize + threadIndex +
-                    currentBlockSizeUp) < n &&
-                threadIndex <
-                    n; // if elem in second half has been initialized before
-
-            if (cond)
-                sdata[threadIndex] =
-                    func(sdata[threadIndex],
-                         sdata[threadIndex + currentBlockSizeUp]);
-
-            alpaka::block::sync::syncBlockThreads(acc);
+            bool cond = threadIndex < currentBlockSizeUp // only first half of block
+                                                         // is working
+                && (threadIndex + currentBlockSizeUp) < TBlockSize // index for second half must be in bounds
+                && (blockIndex * TBlockSize + threadIndex + currentBlockSizeUp) < n
+                && threadIndex < n; // if elem in second half has been initialized before
+
+            if(cond)
+                sdata[threadIndex] = func(sdata[threadIndex], sdata[threadIndex + currentBlockSizeUp]);
+
+            alpaka::syncBlockThreads(acc);
         }
 
         // store block result to gmem
-        if (threadIndex == 0 && threadIndex < n)
+        if(threadIndex == 0 && threadIndex < n)
             destination[blockIndex] = sdata[0];
     }
 };
diff --git a/thirdParty/cupla/alpaka/example/reduce/src/reduce.cpp b/thirdParty/cupla/alpaka/example/reduce/src/reduce.cpp
index 724979790a..91c52ea5f3 100644
--- a/thirdParty/cupla/alpaka/example/reduce/src/reduce.cpp
+++ b/thirdParty/cupla/alpaka/example/reduce/src/reduce.cpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Benjamin Worpitz, Jonas Schenke, Matthias Werner
  *
- * This file exemplifies usage of Alpaka.
+ * This file exemplifies usage of alpaka.
  *
  * Permission to use, copy, modify, and/or distribute this software for any
  * purpose with or without fee is hereby granted, provided that the above
@@ -17,31 +17,31 @@
 
 #include "alpakaConfig.hpp"
 #include "kernel.hpp"
+
 #include <alpaka/alpaka.hpp>
+
 #include <cstdlib>
 #include <iostream>
 
 // It requires support for extended lambdas when using nvcc as CUDA compiler.
 // Requires sequential backend if CI is used
-#if (!defined(__NVCC__) || (defined(__NVCC__) && defined(__CUDACC_EXTENDED_LAMBDA__) )) && \
-    (!defined(ALPAKA_CI) || defined(ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED))
+#if(!defined(__NVCC__) || (defined(__NVCC__) && defined(__CUDACC_EXTENDED_LAMBDA__)))                                 \
+    && (!defined(ALPAKA_CI) || defined(ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED))
 
 // use defines of a specific accelerator from alpakaConfig.hpp
 // that are defined in alpakaConfig.hpp
 // - GpuCudaRt
 // - CpuThreads
 // - CpuOmp2Blocks
-// - CpuOmp4
+// - Omp5
 // - CpuSerial
 //
 using Accelerator = CpuSerial;
 
-using DevAcc = Accelerator::DevAcc;
-using DevHost = Accelerator::DevHost;
-using QueueAcc = Accelerator::Stream;
 using Acc = Accelerator::Acc;
-using PltfAcc = Accelerator::PltfAcc;
-using PltfHost = Accelerator::PltfHost;
+using Host = Accelerator::Host;
+using QueueProperty = alpaka::Blocking;
+using QueueAcc = alpaka::Queue<Acc, QueueProperty>;
 using MaxBlockSize = Accelerator::MaxBlockSize;
 
 //-----------------------------------------------------------------------------
@@ -58,70 +58,66 @@ using MaxBlockSize = Accelerator::MaxBlockSize;
 //! \param func The reduction function.
 //!
 //! Returns true if the reduction was correct and false otherwise.
-template<typename T, typename TFunc>
-T reduce(DevHost devHost, DevAcc devAcc, QueueAcc queue, uint64_t n, alpaka::mem::buf::Buf<DevHost, T, Dim, Idx> hostMemory, TFunc func)
+template<typename T, typename DevHost, typename DevAcc, typename TFunc>
+T reduce(
+    DevHost devHost,
+    DevAcc devAcc,
+    QueueAcc queue,
+    uint64_t n,
+    alpaka::Buf<DevHost, T, Dim, Idx> hostMemory,
+    TFunc func)
 {
     static constexpr uint64_t blockSize = getMaxBlockSize<Accelerator, 256>();
 
     // calculate optimal block size (8 times the MP count proved to be
     // relatively near to peak performance in benchmarks)
-    uint32_t blockCount = static_cast<uint32_t>(
-        alpaka::acc::getAccDevProps<Acc, DevAcc>(devAcc).m_multiProcessorCount *
-        8);
-    uint32_t maxBlockCount = static_cast<uint32_t>(
-        (((n + 1) / 2) - 1) / blockSize + 1); // ceil(ceil(n/2.0)/blockSize)
+    uint32_t blockCount = static_cast<uint32_t>(alpaka::getAccDevProps<Acc>(devAcc).m_multiProcessorCount * 8);
+    uint32_t maxBlockCount = static_cast<uint32_t>((((n + 1) / 2) - 1) / blockSize + 1); // ceil(ceil(n/2.0)/blockSize)
 
-    if (blockCount > maxBlockCount)
+    if(blockCount > maxBlockCount)
         blockCount = maxBlockCount;
 
-    alpaka::mem::buf::Buf<DevAcc, T, Dim, Extent> sourceDeviceMemory =
-        alpaka::mem::buf::alloc<T, Idx>(devAcc, n);
+    alpaka::Buf<DevAcc, T, Dim, Extent> sourceDeviceMemory = alpaka::allocBuf<T, Idx>(devAcc, n);
 
-    alpaka::mem::buf::Buf<DevAcc, T, Dim, Extent> destinationDeviceMemory =
-        alpaka::mem::buf::alloc<T, Idx>(
-            devAcc, static_cast<Extent>(blockCount));
+    alpaka::Buf<DevAcc, T, Dim, Extent> destinationDeviceMemory
+        = alpaka::allocBuf<T, Idx>(devAcc, static_cast<Extent>(blockCount));
 
     // copy the data to the GPU
-    alpaka::mem::view::copy(queue, sourceDeviceMemory, hostMemory, n);
+    alpaka::memcpy(queue, sourceDeviceMemory, hostMemory, n);
 
     // create kernels with their workdivs
     ReduceKernel<blockSize, T, TFunc> kernel1, kernel2;
-    WorkDiv workDiv1{ static_cast<Extent>(blockCount),
-                      static_cast<Extent>(blockSize),
-                      static_cast<Extent>(1) };
-    WorkDiv workDiv2{ static_cast<Extent>(1),
-                      static_cast<Extent>(blockSize),
-                      static_cast<Extent>(1) };
+    WorkDiv workDiv1{static_cast<Extent>(blockCount), static_cast<Extent>(blockSize), static_cast<Extent>(1)};
+    WorkDiv workDiv2{static_cast<Extent>(1), static_cast<Extent>(blockSize), static_cast<Extent>(1)};
 
     // create main reduction kernel execution task
-    auto const taskKernelReduceMain(alpaka::kernel::createTaskKernel<Acc>(
+    auto const taskKernelReduceMain(alpaka::createTaskKernel<Acc>(
         workDiv1,
         kernel1,
-        alpaka::mem::view::getPtrNative(sourceDeviceMemory),
-        alpaka::mem::view::getPtrNative(destinationDeviceMemory),
+        alpaka::getPtrNative(sourceDeviceMemory),
+        alpaka::getPtrNative(destinationDeviceMemory),
         n,
         func));
 
     // create last block reduction kernel execution task
-    auto const taskKernelReduceLastBlock(alpaka::kernel::createTaskKernel<Acc>(
+    auto const taskKernelReduceLastBlock(alpaka::createTaskKernel<Acc>(
         workDiv2,
         kernel2,
-        alpaka::mem::view::getPtrNative(destinationDeviceMemory),
-        alpaka::mem::view::getPtrNative(destinationDeviceMemory),
+        alpaka::getPtrNative(destinationDeviceMemory),
+        alpaka::getPtrNative(destinationDeviceMemory),
         blockCount,
         func));
 
     // enqueue both kernel execution tasks
-    alpaka::queue::enqueue(queue, taskKernelReduceMain);
-    alpaka::queue::enqueue(queue, taskKernelReduceLastBlock);
+    alpaka::enqueue(queue, taskKernelReduceMain);
+    alpaka::enqueue(queue, taskKernelReduceLastBlock);
 
     //  download result from GPU
     T resultGpuHost;
-    auto resultGpuDevice =
-        alpaka::mem::view::ViewPlainPtr<DevHost, T, Dim, Idx>(
-            &resultGpuHost, devHost, static_cast<Extent>(blockSize));
+    auto resultGpuDevice
+        = alpaka::ViewPlainPtr<DevHost, T, Dim, Idx>(&resultGpuHost, devHost, static_cast<Extent>(blockSize));
 
-    alpaka::mem::view::copy(queue, resultGpuDevice, destinationDeviceMemory, 1);
+    alpaka::memcpy(queue, resultGpuDevice, destinationDeviceMemory, 1);
 
     return resultGpuHost;
 }
@@ -135,28 +131,25 @@ int main()
     using T = uint32_t;
     static constexpr uint64_t blockSize = getMaxBlockSize<Accelerator, 256>();
 
-    DevAcc devAcc(alpaka::pltf::getDevByIdx<PltfAcc>(dev));
-    DevHost devHost(alpaka::pltf::getDevByIdx<PltfHost>(0u));
+    auto devAcc = alpaka::getDevByIdx<Acc>(dev);
+    auto devHost = alpaka::getDevByIdx<Host>(0u);
     QueueAcc queue(devAcc);
 
     // calculate optimal block size (8 times the MP count proved to be
     // relatively near to peak performance in benchmarks)
-    uint32_t blockCount = static_cast<uint32_t>(
-        alpaka::acc::getAccDevProps<Acc, DevAcc>(devAcc).m_multiProcessorCount *
-        8);
-    uint32_t maxBlockCount = static_cast<uint32_t>(
-        (((n + 1) / 2) - 1) / blockSize + 1); // ceil(ceil(n/2.0)/blockSize)
+    uint32_t blockCount = static_cast<uint32_t>(alpaka::getAccDevProps<Acc>(devAcc).m_multiProcessorCount * 8);
+    uint32_t maxBlockCount = static_cast<uint32_t>((((n + 1) / 2) - 1) / blockSize + 1); // ceil(ceil(n/2.0)/blockSize)
 
-    if (blockCount > maxBlockCount)
+    if(blockCount > maxBlockCount)
         blockCount = maxBlockCount;
 
     // allocate memory
-    auto hostMemory = alpaka::mem::buf::alloc<T, Idx>(devHost, n);
+    auto hostMemory = alpaka::allocBuf<T, Idx>(devHost, n);
 
-    T *nativeHostMemory = alpaka::mem::view::getPtrNative(hostMemory);
+    T* nativeHostMemory = alpaka::getPtrNative(hostMemory);
 
     // fill array with data
-    for (uint64_t i = 0; i < n; i++)
+    for(uint64_t i = 0; i < n; i++)
         nativeHostMemory[i] = static_cast<T>(i + 1);
 
     // define the reduction function
@@ -164,14 +157,13 @@ int main()
 
     // reduce
     T result = reduce<T>(devHost, devAcc, queue, n, hostMemory, addFn);
-    alpaka::wait::wait(queue);
+    alpaka::wait(queue);
 
     // check result
     T expectedResult = static_cast<T>(n / 2 * (n + 1));
-    if (result != expectedResult)
+    if(result != expectedResult)
     {
-        std::cerr << "Results don't match: " << result << " != " << expectedResult
-                  << "\n";
+        std::cerr << "Results don't match: " << result << " != " << expectedResult << "\n";
         return EXIT_FAILURE;
     }
 
@@ -182,7 +174,8 @@ int main()
 
 #else
 
-int main() {
+int main()
+{
     return EXIT_SUCCESS;
 }
 
diff --git a/thirdParty/cupla/alpaka/example/vectorAdd/CMakeLists.txt b/thirdParty/cupla/alpaka/example/vectorAdd/CMakeLists.txt
index e6d5ab8ad4..11e45e289e 100644
--- a/thirdParty/cupla/alpaka/example/vectorAdd/CMakeLists.txt
+++ b/thirdParty/cupla/alpaka/example/vectorAdd/CMakeLists.txt
@@ -1,7 +1,7 @@
 #
-# Copyright 2014-2019 Benjamin Worpitz
+# Copyright 2014-2020 Benjamin Worpitz, Jan Stephan
 #
-# This file exemplifies usage of Alpaka.
+# This file exemplifies usage of alpaka.
 #
 # Permission to use, copy, modify, and/or distribute this software for any
 # purpose with or without fee is hereby granted, provided that the above
@@ -19,44 +19,43 @@
 ################################################################################
 # Required CMake version.
 
-CMAKE_MINIMUM_REQUIRED(VERSION 3.11.4)
+cmake_minimum_required(VERSION 3.15)
 
-SET_PROPERTY(GLOBAL PROPERTY USE_FOLDERS ON)
+set_property(GLOBAL PROPERTY USE_FOLDERS ON)
 
 ################################################################################
 # Project.
 
-SET(_TARGET_NAME vectorAdd)
+set(_TARGET_NAME vectorAdd)
 
-PROJECT(${_TARGET_NAME})
+project(${_TARGET_NAME})
 
-################################################################################
-# CMake policies
-#
-# Search in <PackageName>_ROOT:
-#   https://cmake.org/cmake/help/v3.12/policy/CMP0074.html
-
-if(POLICY CMP0074)
-    cmake_policy(SET CMP0074 NEW)
-endif()
 
 #-------------------------------------------------------------------------------
 # Find alpaka.
 
-SET(ALPAKA_ROOT "${CMAKE_CURRENT_LIST_DIR}/../../" CACHE STRING "The location of the alpaka library")
-LIST(APPEND CMAKE_MODULE_PATH "${ALPAKA_ROOT}")
-FIND_PACKAGE(alpaka REQUIRED)
+if(NOT TARGET alpaka::alpaka)
+    option(USE_ALPAKA_SOURCE_TREE "Use alpaka's source tree instead of an alpaka installation" OFF)
+
+    if(USE_ALPAKA_SOURCE_TREE)
+        # Don't build the examples recursively
+        set(alpaka_BUILD_EXAMPLES OFF)
+        add_subdirectory("${CMAKE_CURRENT_LIST_DIR}/../.." "${CMAKE_BINARY_DIR}/alpaka")
+    else()
+        find_package(alpaka REQUIRED)
+    endif()
+endif()
 
 #-------------------------------------------------------------------------------
 # Add executable.
 
-ALPAKA_ADD_EXECUTABLE(
+alpaka_add_executable(
     ${_TARGET_NAME}
     src/vectorAdd.cpp)
-TARGET_LINK_LIBRARIES(
+target_link_libraries(
     ${_TARGET_NAME}
-    PUBLIC alpaka)
+    PUBLIC alpaka::alpaka)
 
-SET_TARGET_PROPERTIES(${_TARGET_NAME} PROPERTIES FOLDER example)
+set_target_properties(${_TARGET_NAME} PROPERTIES FOLDER example)
 
-ADD_TEST(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME})
+add_test(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME})
diff --git a/thirdParty/cupla/alpaka/example/vectorAdd/src/vectorAdd.cpp b/thirdParty/cupla/alpaka/example/vectorAdd/src/vectorAdd.cpp
index a7126c43e4..3bc1401ab7 100644
--- a/thirdParty/cupla/alpaka/example/vectorAdd/src/vectorAdd.cpp
+++ b/thirdParty/cupla/alpaka/example/vectorAdd/src/vectorAdd.cpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Benjamin Worpitz, Matthias Werner
  *
- * This file exemplifies usage of Alpaka.
+ * This file exemplifies usage of alpaka.
  *
  * Permission to use, copy, modify, and/or distribute this software for any
  * purpose with or without fee is hereby granted, provided that the above
@@ -16,9 +16,11 @@
  */
 
 #include <alpaka/alpaka.hpp>
+#include <alpaka/example/ExampleDefaultAcc.hpp>
 
-#include <random>
+#include <chrono>
 #include <iostream>
+#include <random>
 #include <typeinfo>
 
 //#############################################################################
@@ -37,34 +39,28 @@ class VectorAddKernel
     //! \param C The destination vector.
     //! \param numElements The number of elements.
     ALPAKA_NO_HOST_ACC_WARNING
-    template<
-        typename TAcc,
-        typename TElem,
-        typename TIdx>
+    template<typename TAcc, typename TElem, typename TIdx>
     ALPAKA_FN_ACC auto operator()(
-        TAcc const & acc,
-        TElem const * const A,
-        TElem const * const B,
-        TElem * const C,
-        TIdx const & numElements) const
-    -> void
+        TAcc const& acc,
+        TElem const* const A,
+        TElem const* const B,
+        TElem* const C,
+        TIdx const& numElements) const -> void
     {
-        static_assert(
-            alpaka::dim::Dim<TAcc>::value == 1,
-            "The VectorAddKernel expects 1-dimensional indices!");
+        static_assert(alpaka::Dim<TAcc>::value == 1, "The VectorAddKernel expects 1-dimensional indices!");
 
-        TIdx const gridThreadIdx(alpaka::idx::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0u]);
-        TIdx const threadElemExtent(alpaka::workdiv::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[0u]);
+        TIdx const gridThreadIdx(alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0u]);
+        TIdx const threadElemExtent(alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[0u]);
         TIdx const threadFirstElemIdx(gridThreadIdx * threadElemExtent);
 
         if(threadFirstElemIdx < numElements)
         {
             // Calculate the number of elements to compute in this thread.
             // The result is uniform for all but the last thread.
-            TIdx const threadLastElemIdx(threadFirstElemIdx+threadElemExtent);
+            TIdx const threadLastElemIdx(threadFirstElemIdx + threadElemExtent);
             TIdx const threadLastElemIdxClipped((numElements > threadLastElemIdx) ? threadLastElemIdx : numElements);
 
-            for(TIdx i(threadFirstElemIdx); i<threadLastElemIdxClipped; ++i)
+            for(TIdx i(threadFirstElemIdx); i < threadLastElemIdxClipped; ++i)
             {
                 C[i] = A[i] + B[i];
             }
@@ -72,83 +68,82 @@ class VectorAddKernel
     }
 };
 
-auto main()
--> int
+auto main() -> int
 {
 // Fallback for the CI with disabled sequential backend
 #if defined(ALPAKA_CI) && !defined(ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED)
     return EXIT_SUCCESS;
 #else
+
     // Define the index domain
-    using Dim = alpaka::dim::DimInt<1u>;
+    using Dim = alpaka::DimInt<1u>;
     using Idx = std::size_t;
 
     // Define the accelerator
     //
-    // It is possible to choose from a set of accelerators
-    // that are defined in the alpaka::acc namespace e.g.:
+    // It is possible to choose from a set of accelerators:
     // - AccGpuCudaRt
+    // - AccGpuHipRt
     // - AccCpuThreads
     // - AccCpuFibers
     // - AccCpuOmp2Threads
     // - AccCpuOmp2Blocks
-    // - AccCpuOmp4
+    // - AccOmp5
+    // - AccCpuTbbBlocks
     // - AccCpuSerial
-    using Acc = alpaka::acc::AccCpuSerial<Dim, Idx>;
-    using DevAcc = alpaka::dev::Dev<Acc>;
-    using PltfAcc = alpaka::pltf::Pltf<DevAcc>;
+    // using Acc = alpaka::AccCpuSerial<Dim, Idx>;
+    using Acc = alpaka::ExampleDefaultAcc<Dim, Idx>;
+    std::cout << "Using alpaka accelerator: " << alpaka::getAccName<Acc>() << std::endl;
 
     // Defines the synchronization behavior of a queue
     //
     // choose between Blocking and NonBlocking
-    using QueueProperty = alpaka::queue::Blocking;
-    using QueueAcc = alpaka::queue::Queue<Acc, QueueProperty>;
+    using QueueProperty = alpaka::Blocking;
+    using QueueAcc = alpaka::Queue<Acc, QueueProperty>;
 
     // Select a device
-    DevAcc const devAcc(alpaka::pltf::getDevByIdx<PltfAcc>(0u));
+    auto const devAcc = alpaka::getDevByIdx<Acc>(0u);
 
     // Create a queue on the device
     QueueAcc queue(devAcc);
 
     // Define the work division
     Idx const numElements(123456);
-    Idx const elementsPerThread(3u);
-    alpaka::vec::Vec<Dim, Idx> const extent(numElements);
+    Idx const elementsPerThread(8u);
+    alpaka::Vec<Dim, Idx> const extent(numElements);
 
     // Let alpaka calculate good block and grid sizes given our full problem extent
-    alpaka::workdiv::WorkDivMembers<Dim, Idx> const workDiv(
-        alpaka::workdiv::getValidWorkDiv<Acc>(
-            devAcc,
-            extent,
-            elementsPerThread,
-            false,
-            alpaka::workdiv::GridBlockExtentSubDivRestrictions::Unrestricted));
+    alpaka::WorkDivMembers<Dim, Idx> const workDiv(alpaka::getValidWorkDiv<Acc>(
+        devAcc,
+        extent,
+        elementsPerThread,
+        false,
+        alpaka::GridBlockExtentSubDivRestrictions::Unrestricted));
 
     // Define the buffer element type
     using Data = std::uint32_t;
 
     // Get the host device for allocating memory on the host.
-    using DevHost = alpaka::dev::DevCpu;
-    using PltfHost = alpaka::pltf::Pltf<DevHost>;
-    DevHost const devHost(alpaka::pltf::getDevByIdx<PltfHost>(0u));
+    using DevHost = alpaka::DevCpu;
+    auto const devHost = alpaka::getDevByIdx<DevHost>(0u);
 
     // Allocate 3 host memory buffers
-    using BufHost = alpaka::mem::buf::Buf<DevHost, Data, Dim, Idx>;
-    BufHost bufHostA(alpaka::mem::buf::alloc<Data, Idx>(devHost, extent));
-    BufHost bufHostB(alpaka::mem::buf::alloc<Data, Idx>(devHost, extent));
-    BufHost bufHostC(alpaka::mem::buf::alloc<Data, Idx>(devHost, extent));
+    using BufHost = alpaka::Buf<DevHost, Data, Dim, Idx>;
+    BufHost bufHostA(alpaka::allocBuf<Data, Idx>(devHost, extent));
+    BufHost bufHostB(alpaka::allocBuf<Data, Idx>(devHost, extent));
+    BufHost bufHostC(alpaka::allocBuf<Data, Idx>(devHost, extent));
 
     // Initialize the host input vectors A and B
-    Data * const pBufHostA(alpaka::mem::view::getPtrNative(bufHostA));
-    Data * const pBufHostB(alpaka::mem::view::getPtrNative(bufHostB));
-    Data * const pBufHostC(alpaka::mem::view::getPtrNative(bufHostC));
+    Data* const pBufHostA(alpaka::getPtrNative(bufHostA));
+    Data* const pBufHostB(alpaka::getPtrNative(bufHostB));
+    Data* const pBufHostC(alpaka::getPtrNative(bufHostC));
 
-    // C++11 random generator for uniformly distributed numbers in {1,..,42}
+    // C++14 random generator for uniformly distributed numbers in {1,..,42}
     std::random_device rd{};
-    std::default_random_engine eng{ rd() };
+    std::default_random_engine eng{rd()};
     std::uniform_int_distribution<Data> dist(1, 42);
 
-    for (Idx i(0); i < numElements; ++i)
+    for(Idx i(0); i < numElements; ++i)
     {
         pBufHostA[i] = dist(eng);
         pBufHostB[i] = dist(eng);
@@ -156,57 +151,72 @@ auto main()
     }
 
     // Allocate 3 buffers on the accelerator
-    using BufAcc = alpaka::mem::buf::Buf<DevAcc, Data, Dim, Idx>;
-    BufAcc bufAccA(alpaka::mem::buf::alloc<Data, Idx>(devAcc, extent));
-    BufAcc bufAccB(alpaka::mem::buf::alloc<Data, Idx>(devAcc, extent));
-    BufAcc bufAccC(alpaka::mem::buf::alloc<Data, Idx>(devAcc, extent));
+    using BufAcc = alpaka::Buf<Acc, Data, Dim, Idx>;
+    BufAcc bufAccA(alpaka::allocBuf<Data, Idx>(devAcc, extent));
+    BufAcc bufAccB(alpaka::allocBuf<Data, Idx>(devAcc, extent));
+    BufAcc bufAccC(alpaka::allocBuf<Data, Idx>(devAcc, extent));
 
     // Copy Host -> Acc
-    alpaka::mem::view::copy(queue, bufAccA, bufHostA, extent);
-    alpaka::mem::view::copy(queue, bufAccB, bufHostB, extent);
-    alpaka::mem::view::copy(queue, bufAccC, bufHostC, extent);
+    alpaka::memcpy(queue, bufAccA, bufHostA, extent);
+    alpaka::memcpy(queue, bufAccB, bufHostB, extent);
+    alpaka::memcpy(queue, bufAccC, bufHostC, extent);
 
     // Instantiate the kernel function object
     VectorAddKernel kernel;
 
     // Create the kernel execution task.
-    auto const taskKernel(alpaka::kernel::createTaskKernel<Acc>(
+    auto const taskKernel(alpaka::createTaskKernel<Acc>(
         workDiv,
         kernel,
-        alpaka::mem::view::getPtrNative(bufAccA),
-        alpaka::mem::view::getPtrNative(bufAccB),
-        alpaka::mem::view::getPtrNative(bufAccC),
+        alpaka::getPtrNative(bufAccA),
+        alpaka::getPtrNative(bufAccB),
+        alpaka::getPtrNative(bufAccC),
         numElements));
 
     // Enqueue the kernel execution task
-    alpaka::queue::enqueue(queue, taskKernel);
+    {
+        const auto beginT = std::chrono::high_resolution_clock::now();
+        alpaka::enqueue(queue, taskKernel);
+        alpaka::wait(queue); // wait in case we are using an asynchronous queue to time actual kernel runtime
+        const auto endT = std::chrono::high_resolution_clock::now();
+        std::cout << "Time for kernel execution: " << std::chrono::duration<double>(endT - beginT).count() << 's'
+                  << std::endl;
+    }
 
     // Copy back the result
-    alpaka::mem::view::copy(queue, bufHostC, bufAccC, extent);
-    alpaka::wait::wait(queue);
+    {
+        const auto beginT = std::chrono::high_resolution_clock::now();
+        alpaka::memcpy(queue, bufHostC, bufAccC, extent); // destination first: accelerator -> host
+        alpaka::wait(queue); // make sure the copy has finished before stopping the clock
+        const auto endT = std::chrono::high_resolution_clock::now();
+        std::cout << "Time for DtoH copy: " << std::chrono::duration<double>(endT - beginT).count() << 's'
+                  << std::endl;
+    }
 
-    bool resultCorrect(true);
-    for(Idx i(0u);
-        i < numElements;
-        ++i)
+    int falseResults = 0;
+    static constexpr int MAX_PRINT_FALSE_RESULTS = 20;
+    for(Idx i(0u); i < numElements; ++i)
     {
-        Data const & val(pBufHostC[i]);
+        Data const& val(pBufHostC[i]);
         Data const correctResult(pBufHostA[i] + pBufHostB[i]);
         if(val != correctResult)
         {
-            std::cerr << "C[" << i << "] == " << val << " != " << correctResult << std::endl;
-            resultCorrect = false;
+            if(falseResults < MAX_PRINT_FALSE_RESULTS)
+                std::cerr << "C[" << i << "] == " << val << " != " << correctResult << std::endl;
+            ++falseResults;
         }
     }
 
-    if(resultCorrect)
+    if(falseResults == 0)
     {
         std::cout << "Execution results correct!" << std::endl;
         return EXIT_SUCCESS;
     }
     else
     {
-        std::cout << "Execution results incorrect!" << std::endl;
+        std::cout << "Found " << falseResults << " false results, printed no more than " << MAX_PRINT_FALSE_RESULTS
+                  << "\n"
+                  << "Execution results incorrect!" << std::endl;
         return EXIT_FAILURE;
     }
 #endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/acc/AccCpuFibers.hpp b/thirdParty/cupla/alpaka/include/alpaka/acc/AccCpuFibers.hpp
index 3e2c2af0ff..95fbe11001 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/acc/AccCpuFibers.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/acc/AccCpuFibers.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Axel Huebl, Benjamin Worpitz, René Widera
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -12,311 +12,234 @@
 #ifdef ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLED
 
 // Base classes.
-#include <alpaka/workdiv/WorkDivMembers.hpp>
-#include <alpaka/idx/gb/IdxGbRef.hpp>
-#include <alpaka/idx/bt/IdxBtRefFiberIdMap.hpp>
-#include <alpaka/atomic/AtomicNoOp.hpp>
-#include <alpaka/atomic/AtomicStdLibLock.hpp>
-#include <alpaka/atomic/AtomicHierarchy.hpp>
-#include <alpaka/math/MathStdLib.hpp>
-#include <alpaka/block/shared/dyn/BlockSharedMemDynBoostAlignedAlloc.hpp>
-#include <alpaka/block/shared/st/BlockSharedMemStMasterSync.hpp>
-#include <alpaka/block/sync/BlockSyncBarrierFiber.hpp>
-#include <alpaka/rand/RandStdLib.hpp>
-#include <alpaka/time/TimeStdLib.hpp>
+#    include <alpaka/atomic/AtomicHierarchy.hpp>
+#    include <alpaka/atomic/AtomicNoOp.hpp>
+#    include <alpaka/atomic/AtomicStdLibLock.hpp>
+#    include <alpaka/block/shared/dyn/BlockSharedMemDynAlignedAlloc.hpp>
+#    include <alpaka/block/shared/st/BlockSharedMemStMasterSync.hpp>
+#    include <alpaka/block/sync/BlockSyncBarrierFiber.hpp>
+#    include <alpaka/idx/bt/IdxBtRefFiberIdMap.hpp>
+#    include <alpaka/idx/gb/IdxGbRef.hpp>
+#    include <alpaka/intrinsic/IntrinsicCpu.hpp>
+#    include <alpaka/math/MathStdLib.hpp>
+#    include <alpaka/rand/RandStdLib.hpp>
+#    include <alpaka/time/TimeStdLib.hpp>
+#    include <alpaka/warp/WarpSingleThread.hpp>
+#    include <alpaka/workdiv/WorkDivMembers.hpp>
 
 // Specialized traits.
-#include <alpaka/acc/Traits.hpp>
-#include <alpaka/dev/Traits.hpp>
-#include <alpaka/kernel/Traits.hpp>
-#include <alpaka/pltf/Traits.hpp>
-#include <alpaka/idx/Traits.hpp>
+#    include <alpaka/acc/Traits.hpp>
+#    include <alpaka/dev/Traits.hpp>
+#    include <alpaka/idx/Traits.hpp>
+#    include <alpaka/kernel/Traits.hpp>
+#    include <alpaka/pltf/Traits.hpp>
 
 // Implementation details.
-#include <alpaka/core/ClipCast.hpp>
-#include <alpaka/core/Concepts.hpp>
-#include <alpaka/core/Fibers.hpp>
-#include <alpaka/core/Unused.hpp>
-#include <alpaka/dev/DevCpu.hpp>
+#    include <alpaka/core/ClipCast.hpp>
+#    include <alpaka/core/Concepts.hpp>
+#    include <alpaka/core/Fibers.hpp>
+#    include <alpaka/core/Unused.hpp>
+#    include <alpaka/dev/DevCpu.hpp>
 
-#include <memory>
-#include <thread>
-#include <typeinfo>
+#    include <memory>
+#    include <thread>
+#    include <typeinfo>
 
 namespace alpaka
 {
-    namespace kernel
-    {
-        template<
-            typename TDim,
-            typename TIdx,
-            typename TKernelFnObj,
-            typename... TArgs>
-        class TaskKernelCpuFibers;
-    }
-    namespace acc
+    template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
+    class TaskKernelCpuFibers;
+
+    //#############################################################################
+    //! The CPU fibers accelerator.
+    //!
+    //! This accelerator allows parallel kernel execution on a CPU device.
+    //! It uses boost::fibers to implement the cooperative parallelism.
+    //! By using fibers the shared memory can reside in the closest memory/cache available.
+    //! Furthermore there is no false sharing between neighboring threads as is the case in real multi-threading.
+    template<
+        typename TDim,
+        typename TIdx>
+    class AccCpuFibers final :
+        public WorkDivMembers<TDim, TIdx>,
+        public gb::IdxGbRef<TDim, TIdx>,
+        public bt::IdxBtRefFiberIdMap<TDim, TIdx>,
+        public AtomicHierarchy<
+            AtomicStdLibLock<16>, // grid atomics
+            AtomicStdLibLock<16>, // block atomics
+            AtomicNoOp         // thread atomics
+        >,
+        public math::MathStdLib,
+        public BlockSharedMemDynAlignedAlloc,
+        public BlockSharedMemStMasterSync,
+        public BlockSyncBarrierFiber<TIdx>,
+        public IntrinsicCpu,
+        public rand::RandStdLib,
+        public TimeStdLib,
+        public warp::WarpSingleThread,
+        public concepts::Implements<ConceptAcc, AccCpuFibers<TDim, TIdx>>
     {
-        //#############################################################################
-        //! The CPU fibers accelerator.
-        //!
-        //! This accelerator allows parallel kernel execution on a CPU device.
-        //! It uses boost::fibers to implement the cooperative parallelism.
-        //! By using fibers the shared memory can reside in the closest memory/cache available.
-        //! Furthermore there is no false sharing between neighboring threads as it is the case in real multi-threading.
-        template<
-            typename TDim,
-            typename TIdx>
-        class AccCpuFibers final :
-            public workdiv::WorkDivMembers<TDim, TIdx>,
-            public idx::gb::IdxGbRef<TDim, TIdx>,
-            public idx::bt::IdxBtRefFiberIdMap<TDim, TIdx>,
-            public atomic::AtomicHierarchy<
-                atomic::AtomicStdLibLock<16>, // grid atomics
-                atomic::AtomicStdLibLock<16>, // block atomics
-                atomic::AtomicNoOp         // thread atomics
-            >,
-            public math::MathStdLib,
-            public block::shared::dyn::BlockSharedMemDynBoostAlignedAlloc,
-            public block::shared::st::BlockSharedMemStMasterSync,
-            public block::sync::BlockSyncBarrierFiber<TIdx>,
-            public rand::RandStdLib,
-            public time::TimeStdLib,
-            public concepts::Implements<ConceptAcc, AccCpuFibers<TDim, TIdx>>
-        {
-        public:
-            // Partial specialization with the correct TDim and TIdx is not allowed.
-            template<
-                typename TDim2,
-                typename TIdx2,
-                typename TKernelFnObj,
-                typename... TArgs>
-            friend class ::alpaka::kernel::TaskKernelCpuFibers;
+        static_assert(
+            sizeof(TIdx) >= sizeof(int),
+            "Index type is not supported, consider using int or a larger type.");
 
-        private:
-            //-----------------------------------------------------------------------------
-            template<
-                typename TWorkDiv>
-            ALPAKA_FN_HOST AccCpuFibers(
-                TWorkDiv const & workDiv,
-                TIdx const & blockSharedMemDynSizeBytes) :
-                    workdiv::WorkDivMembers<TDim, TIdx>(workDiv),
-                    idx::gb::IdxGbRef<TDim, TIdx>(m_gridBlockIdx),
-                    idx::bt::IdxBtRefFiberIdMap<TDim, TIdx>(m_fibersToIndices),
-                    atomic::AtomicHierarchy<
-                        atomic::AtomicStdLibLock<16>, // atomics between grids
-                        atomic::AtomicStdLibLock<16>, // atomics between blocks
-                        atomic::AtomicNoOp         // atomics between threads
-                    >(),
-                    math::MathStdLib(),
-                    block::shared::dyn::BlockSharedMemDynBoostAlignedAlloc(static_cast<std::size_t>(blockSharedMemDynSizeBytes)),
-                    block::shared::st::BlockSharedMemStMasterSync(
-                        [this](){block::sync::syncBlockThreads(*this);},
-                        [this](){return (m_masterFiberId == boost::this_fiber::get_id());}),
-                    block::sync::BlockSyncBarrierFiber<TIdx>(
-                        workdiv::getWorkDiv<Block, Threads>(workDiv).prod()),
-                    rand::RandStdLib(),
-                    time::TimeStdLib(),
-                    m_gridBlockIdx(vec::Vec<TDim, TIdx>::zeros())
-            {}
+    public:
+        // Partial specialization with the correct TDim and TIdx is not allowed.
+        template<typename TDim2, typename TIdx2, typename TKernelFnObj, typename... TArgs>
+        friend class ::alpaka::TaskKernelCpuFibers;
 
-        public:
-            //-----------------------------------------------------------------------------
-            ALPAKA_FN_HOST AccCpuFibers(AccCpuFibers const &) = delete;
-            //-----------------------------------------------------------------------------
-            ALPAKA_FN_HOST AccCpuFibers(AccCpuFibers &&) = delete;
-            //-----------------------------------------------------------------------------
-            ALPAKA_FN_HOST auto operator=(AccCpuFibers const &) -> AccCpuFibers & = delete;
-            //-----------------------------------------------------------------------------
-            ALPAKA_FN_HOST auto operator=(AccCpuFibers &&) -> AccCpuFibers & = delete;
-            //-----------------------------------------------------------------------------
-            /*virtual*/ ~AccCpuFibers() = default;
+    private:
+        //-----------------------------------------------------------------------------
+        template<typename TWorkDiv>
+        ALPAKA_FN_HOST AccCpuFibers(TWorkDiv const& workDiv, std::size_t const& blockSharedMemDynSizeBytes)
+            : WorkDivMembers<TDim, TIdx>(workDiv)
+            , gb::IdxGbRef<TDim, TIdx>(m_gridBlockIdx)
+            , bt::IdxBtRefFiberIdMap<TDim, TIdx>(m_fibersToIndices)
+            , AtomicHierarchy<
+                  AtomicStdLibLock<16>, // atomics between grids
+                  AtomicStdLibLock<16>, // atomics between blocks
+                  AtomicNoOp // atomics between threads
+                  >()
+            , math::MathStdLib()
+            , BlockSharedMemDynAlignedAlloc(blockSharedMemDynSizeBytes)
+            , BlockSharedMemStMasterSync(
+                  [this]() { syncBlockThreads(*this); },
+                  [this]() { return (m_masterFiberId == boost::this_fiber::get_id()); })
+            , BlockSyncBarrierFiber<TIdx>(getWorkDiv<Block, Threads>(workDiv).prod())
+            , rand::RandStdLib()
+            , TimeStdLib()
+            , m_gridBlockIdx(Vec<TDim, TIdx>::zeros())
+        {
+        }
 
-        private:
-            // getIdx
-            typename idx::bt::IdxBtRefFiberIdMap<TDim, TIdx>::FiberIdToIdxMap mutable m_fibersToIndices;  //!< The mapping of fibers id's to indices.
-            vec::Vec<TDim, TIdx> mutable m_gridBlockIdx;                    //!< The index of the currently executed block.
+    public:
+        //-----------------------------------------------------------------------------
+        ALPAKA_FN_HOST AccCpuFibers(AccCpuFibers const&) = delete;
+        //-----------------------------------------------------------------------------
+        ALPAKA_FN_HOST AccCpuFibers(AccCpuFibers&&) = delete;
+        //-----------------------------------------------------------------------------
+        ALPAKA_FN_HOST auto operator=(AccCpuFibers const&) -> AccCpuFibers& = delete;
+        //-----------------------------------------------------------------------------
+        ALPAKA_FN_HOST auto operator=(AccCpuFibers&&) -> AccCpuFibers& = delete;
+        //-----------------------------------------------------------------------------
+        /*virtual*/ ~AccCpuFibers() = default;
 
-            // allocBlockSharedArr
-            boost::fibers::fiber::id mutable m_masterFiberId;           //!< The id of the master fiber.
-        };
-    }
+    private:
+        // getIdx
+        typename bt::IdxBtRefFiberIdMap<TDim, TIdx>::
+            FiberIdToIdxMap mutable m_fibersToIndices; //!< The mapping of fiber ids to indices.
+        Vec<TDim, TIdx> mutable m_gridBlockIdx; //!< The index of the currently executed block.
+
+        // allocBlockSharedArr
+        boost::fibers::fiber::id mutable m_masterFiberId; //!< The id of the master fiber.
+    };
 
-    namespace acc
+    namespace traits
     {
-        namespace traits
+        //#############################################################################
+        //! The CPU fibers accelerator type trait specialization.
+        template<typename TDim, typename TIdx>
+        struct AccType<AccCpuFibers<TDim, TIdx>>
         {
-            //#############################################################################
-            //! The CPU fibers accelerator accelerator type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct AccType<
-                acc::AccCpuFibers<TDim, TIdx>>
-            {
-                using type = acc::AccCpuFibers<TDim, TIdx>;
-            };
-            //#############################################################################
-            //! The CPU fibers accelerator device properties get trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct GetAccDevProps<
-                acc::AccCpuFibers<TDim, TIdx>>
+            using type = AccCpuFibers<TDim, TIdx>;
+        };
+        //#############################################################################
+        //! The CPU fibers accelerator device properties get trait specialization.
+        template<typename TDim, typename TIdx>
+        struct GetAccDevProps<AccCpuFibers<TDim, TIdx>>
+        {
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto getAccDevProps(DevCpu const& dev) -> alpaka::AccDevProps<TDim, TIdx>
             {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto getAccDevProps(
-                    dev::DevCpu const & dev)
-                -> alpaka::acc::AccDevProps<TDim, TIdx>
-                {
-                    alpaka::ignore_unused(dev);
-
-#ifdef ALPAKA_CI
-                    auto const blockThreadCountMax(static_cast<TIdx>(3));
-#else
-                    auto const blockThreadCountMax(static_cast<TIdx>(4));  // \TODO: What is the maximum? Just set a reasonable value?
-#endif
-                    return {
-                        // m_multiProcessorCount
-                        std::max(static_cast<TIdx>(1), alpaka::core::clipCast<TIdx>(std::thread::hardware_concurrency())),   // \TODO: This may be inaccurate.
-                        // m_gridBlockExtentMax
-                        vec::Vec<TDim, TIdx>::all(std::numeric_limits<TIdx>::max()),
+#    ifdef ALPAKA_CI
+                auto const blockThreadCountMax(static_cast<TIdx>(3));
+#    else
+                auto const blockThreadCountMax(
+                    static_cast<TIdx>(4)); // \TODO: What is the maximum? Just set a reasonable value?
+#    endif
+                return {// m_multiProcessorCount
+                        std::max(
+                            static_cast<TIdx>(1),
+                            alpaka::core::clipCast<TIdx>(
+                                std::thread::hardware_concurrency())), // \TODO: This may be inaccurate.
+                                                                       // m_gridBlockExtentMax
+                        Vec<TDim, TIdx>::all(std::numeric_limits<TIdx>::max()),
                         // m_gridBlockCountMax
                         std::numeric_limits<TIdx>::max(),
                         // m_blockThreadExtentMax
-                        vec::Vec<TDim, TIdx>::all(blockThreadCountMax),
+                        Vec<TDim, TIdx>::all(blockThreadCountMax),
                         // m_blockThreadCountMax
                         blockThreadCountMax,
                         // m_threadElemExtentMax
-                        vec::Vec<TDim, TIdx>::all(std::numeric_limits<TIdx>::max()),
+                        Vec<TDim, TIdx>::all(std::numeric_limits<TIdx>::max()),
                         // m_threadElemCountMax
-                        std::numeric_limits<TIdx>::max()};
-                }
-            };
-            //#############################################################################
-            //! The CPU fibers accelerator name trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct GetAccName<
-                acc::AccCpuFibers<TDim, TIdx>>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto getAccName()
-                -> std::string
-                {
-                    return "AccCpuFibers<" + std::to_string(TDim::value) + "," + typeid(TIdx).name() + ">";
-                }
-            };
-        }
-    }
-    namespace dev
-    {
-        namespace traits
+                        std::numeric_limits<TIdx>::max(),
+                        // m_sharedMemSizeBytes
+                        getMemBytes(dev)};
+            }
+        };
+        //#############################################################################
+        //! The CPU fibers accelerator name trait specialization.
+        template<typename TDim, typename TIdx>
+        struct GetAccName<AccCpuFibers<TDim, TIdx>>
         {
-            //#############################################################################
-            //! The CPU fibers accelerator device type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct DevType<
-                acc::AccCpuFibers<TDim, TIdx>>
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto getAccName() -> std::string
             {
-                using type = dev::DevCpu;
-            };
-        }
-    }
-    namespace dim
-    {
-        namespace traits
+                return "AccCpuFibers<" + std::to_string(TDim::value) + "," + typeid(TIdx).name() + ">";
+            }
+        };
+
+        //#############################################################################
+        //! The CPU fibers accelerator device type trait specialization.
+        template<typename TDim, typename TIdx>
+        struct DevType<AccCpuFibers<TDim, TIdx>>
         {
-            //#############################################################################
-            //! The CPU fibers accelerator dimension getter trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct DimType<
-                acc::AccCpuFibers<TDim, TIdx>>
-            {
-                using type = TDim;
-            };
-        }
-    }
-    namespace kernel
-    {
-        namespace traits
+            using type = DevCpu;
+        };
+
+        //#############################################################################
+        //! The CPU fibers accelerator dimension getter trait specialization.
+        template<typename TDim, typename TIdx>
+        struct DimType<AccCpuFibers<TDim, TIdx>>
         {
-            //#############################################################################
-            //! The CPU fibers accelerator execution task type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx,
-                typename TWorkDiv,
-                typename TKernelFnObj,
-                typename... TArgs>
-            struct CreateTaskKernel<
-                acc::AccCpuFibers<TDim, TIdx>,
-                TWorkDiv,
-                TKernelFnObj,
-                TArgs...>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto createTaskKernel(
-                    TWorkDiv const & workDiv,
-                    TKernelFnObj const & kernelFnObj,
-                    TArgs && ... args)
-#ifdef BOOST_NO_CXX14_RETURN_TYPE_DEDUCTION
-                -> kernel::TaskKernelCpuFibers<
-                    TDim,
-                    TIdx,
-                    TKernelFnObj,
-                    TArgs...>
-#endif
-                {
-                    return
-                        kernel::TaskKernelCpuFibers<
-                            TDim,
-                            TIdx,
-                            TKernelFnObj,
-                            TArgs...>(
-                                workDiv,
-                                kernelFnObj,
-                                std::forward<TArgs>(args)...);
-                }
-            };
-        }
-    }
-    namespace pltf
-    {
-        namespace traits
+            using type = TDim;
+        };
+
+        //#############################################################################
+        //! The CPU fibers accelerator execution task type trait specialization.
+        template<typename TDim, typename TIdx, typename TWorkDiv, typename TKernelFnObj, typename... TArgs>
+        struct CreateTaskKernel<AccCpuFibers<TDim, TIdx>, TWorkDiv, TKernelFnObj, TArgs...>
         {
-            //#############################################################################
-            //! The CPU fibers execution task platform type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct PltfType<
-                acc::AccCpuFibers<TDim, TIdx>>
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto createTaskKernel(
+                TWorkDiv const& workDiv,
+                TKernelFnObj const& kernelFnObj,
+                TArgs&&... args)
             {
-                using type = pltf::PltfCpu;
-            };
-        }
-    }
-    namespace idx
-    {
-        namespace traits
+                return TaskKernelCpuFibers<TDim, TIdx, TKernelFnObj, TArgs...>(
+                    workDiv,
+                    kernelFnObj,
+                    std::forward<TArgs>(args)...);
+            }
+        };
+
+        //#############################################################################
+        //! The CPU fibers execution task platform type trait specialization.
+        template<typename TDim, typename TIdx>
+        struct PltfType<AccCpuFibers<TDim, TIdx>>
         {
-            //#############################################################################
-            //! The CPU fibers accelerator idx type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct IdxType<
-                acc::AccCpuFibers<TDim, TIdx>>
-            {
-                using type = TIdx;
-            };
-        }
-    }
-}
+            using type = PltfCpu;
+        };
+
+        //#############################################################################
+        //! The CPU fibers accelerator idx type trait specialization.
+        template<typename TDim, typename TIdx>
+        struct IdxType<AccCpuFibers<TDim, TIdx>>
+        {
+            using type = TIdx;
+        };
+    } // namespace traits
+} // namespace alpaka
 
 #endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/acc/AccCpuOmp2Blocks.hpp b/thirdParty/cupla/alpaka/include/alpaka/acc/AccCpuOmp2Blocks.hpp
index 55465346ae..2494312b62 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/acc/AccCpuOmp2Blocks.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/acc/AccCpuOmp2Blocks.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Axel Huebl, Benjamin Worpitz, René Widera
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -11,301 +11,222 @@
 
 #ifdef ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLED
 
-#if _OPENMP < 200203
-    #error If ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLED is set, the compiler has to support OpenMP 2.0 or higher!
-#endif
+#    if _OPENMP < 200203
+#        error If ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLED is set, the compiler has to support OpenMP 2.0 or higher!
+#    endif
 
 // Base classes.
-#include <alpaka/workdiv/WorkDivMembers.hpp>
-#include <alpaka/idx/gb/IdxGbRef.hpp>
-#include <alpaka/idx/bt/IdxBtZero.hpp>
-#include <alpaka/atomic/AtomicNoOp.hpp>
-#include <alpaka/atomic/AtomicStdLibLock.hpp>
-#include <alpaka/atomic/AtomicOmpBuiltIn.hpp>
-#include <alpaka/atomic/AtomicHierarchy.hpp>
-#include <alpaka/math/MathStdLib.hpp>
-#include <alpaka/block/shared/dyn/BlockSharedMemDynBoostAlignedAlloc.hpp>
-#include <alpaka/block/shared/st/BlockSharedMemStNoSync.hpp>
-#include <alpaka/block/sync/BlockSyncNoOp.hpp>
-#include <alpaka/rand/RandStdLib.hpp>
-#include <alpaka/time/TimeOmp.hpp>
+#    include <alpaka/atomic/AtomicHierarchy.hpp>
+#    include <alpaka/atomic/AtomicNoOp.hpp>
+#    include <alpaka/atomic/AtomicOmpBuiltIn.hpp>
+#    include <alpaka/atomic/AtomicStdLibLock.hpp>
+#    include <alpaka/block/shared/dyn/BlockSharedMemDynMember.hpp>
+#    include <alpaka/block/shared/st/BlockSharedMemStMember.hpp>
+#    include <alpaka/block/sync/BlockSyncNoOp.hpp>
+#    include <alpaka/idx/bt/IdxBtZero.hpp>
+#    include <alpaka/idx/gb/IdxGbRef.hpp>
+#    include <alpaka/intrinsic/IntrinsicCpu.hpp>
+#    include <alpaka/math/MathStdLib.hpp>
+#    include <alpaka/rand/RandStdLib.hpp>
+#    include <alpaka/time/TimeOmp.hpp>
+#    include <alpaka/warp/WarpSingleThread.hpp>
+#    include <alpaka/workdiv/WorkDivMembers.hpp>
 
 // Specialized traits.
-#include <alpaka/acc/Traits.hpp>
-#include <alpaka/dev/Traits.hpp>
-#include <alpaka/kernel/Traits.hpp>
-#include <alpaka/pltf/Traits.hpp>
-#include <alpaka/idx/Traits.hpp>
+#    include <alpaka/acc/Traits.hpp>
+#    include <alpaka/dev/Traits.hpp>
+#    include <alpaka/idx/Traits.hpp>
+#    include <alpaka/kernel/Traits.hpp>
+#    include <alpaka/pltf/Traits.hpp>
 
 // Implementation details.
-#include <alpaka/core/Concepts.hpp>
-#include <alpaka/core/Unused.hpp>
-#include <alpaka/dev/DevCpu.hpp>
+#    include <alpaka/core/Concepts.hpp>
+#    include <alpaka/core/Unused.hpp>
+#    include <alpaka/dev/DevCpu.hpp>
 
-#include <limits>
-#include <typeinfo>
+#    include <limits>
+#    include <typeinfo>
 
 namespace alpaka
 {
-    namespace kernel
-    {
-        template<
-            typename TDim,
-            typename TIdx,
-            typename TKernelFnObj,
-            typename... TArgs>
-        class TaskKernelCpuOmp2Blocks;
-    }
-    namespace acc
+    template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
+    class TaskKernelCpuOmp2Blocks;
+
+    //#############################################################################
+    //! The CPU OpenMP 2.0 block accelerator.
+    //!
+    //! This accelerator allows parallel kernel execution on a CPU device.
+    //! It uses OpenMP 2.0 to implement the grid block parallelism.
+    //! The block idx is restricted to 1x1x1.
+    template<
+        typename TDim,
+        typename TIdx>
+    class AccCpuOmp2Blocks final :
+        public WorkDivMembers<TDim, TIdx>,
+        public gb::IdxGbRef<TDim, TIdx>,
+        public bt::IdxBtZero<TDim, TIdx>,
+        public AtomicHierarchy<
+            AtomicStdLibLock<16>,   // grid atomics
+            AtomicOmpBuiltIn,    // block atomics
+            AtomicNoOp           // thread atomics
+        >,
+        public math::MathStdLib,
+        public BlockSharedMemDynMember<>,
+        public BlockSharedMemStMember<>,
+        public BlockSyncNoOp,
+        public IntrinsicCpu,
+        public rand::RandStdLib,
+        public TimeOmp,
+        public warp::WarpSingleThread,
+        public concepts::Implements<ConceptAcc, AccCpuOmp2Blocks<TDim, TIdx>>
     {
-        //#############################################################################
-        //! The CPU OpenMP 2.0 block accelerator.
-        //!
-        //! This accelerator allows parallel kernel execution on a CPU device.
-        //! It uses OpenMP 2.0 to implement the grid block parallelism.
-        //! The block idx is restricted to 1x1x1.
-        template<
-            typename TDim,
-            typename TIdx>
-        class AccCpuOmp2Blocks final :
-            public workdiv::WorkDivMembers<TDim, TIdx>,
-            public idx::gb::IdxGbRef<TDim, TIdx>,
-            public idx::bt::IdxBtZero<TDim, TIdx>,
-            public atomic::AtomicHierarchy<
-                atomic::AtomicStdLibLock<16>,   // grid atomics
-                atomic::AtomicOmpBuiltIn,    // block atomics
-                atomic::AtomicNoOp           // thread atomics
-            >,
-            public math::MathStdLib,
-            public block::shared::dyn::BlockSharedMemDynBoostAlignedAlloc,
-            public block::shared::st::BlockSharedMemStNoSync,
-            public block::sync::BlockSyncNoOp,
-            public rand::RandStdLib,
-            public time::TimeOmp,
-            public concepts::Implements<ConceptAcc, AccCpuOmp2Blocks<TDim, TIdx>>
-        {
-        public:
-            // Partial specialization with the correct TDim and TIdx is not allowed.
-            template<
-                typename TDim2,
-                typename TIdx2,
-                typename TKernelFnObj,
-                typename... TArgs>
-            friend class ::alpaka::kernel::TaskKernelCpuOmp2Blocks;
+        static_assert(
+            sizeof(TIdx) >= sizeof(int),
+            "Index type is not supported, consider using int or a larger type.");
 
-        private:
-            //-----------------------------------------------------------------------------
-            template<
-                typename TWorkDiv>
-            ALPAKA_FN_HOST AccCpuOmp2Blocks(
-                TWorkDiv const & workDiv,
-                TIdx const & blockSharedMemDynSizeBytes) :
-                    workdiv::WorkDivMembers<TDim, TIdx>(workDiv),
-                    idx::gb::IdxGbRef<TDim, TIdx>(m_gridBlockIdx),
-                    idx::bt::IdxBtZero<TDim, TIdx>(),
-                    atomic::AtomicHierarchy<
-                        atomic::AtomicStdLibLock<16>,// atomics between grids
-                        atomic::AtomicOmpBuiltIn, // atomics between blocks
-                        atomic::AtomicNoOp        // atomics between threads
-                    >(),
-                    math::MathStdLib(),
-                    block::shared::dyn::BlockSharedMemDynBoostAlignedAlloc(static_cast<std::size_t>(blockSharedMemDynSizeBytes)),
-                    block::shared::st::BlockSharedMemStNoSync(),
-                    block::sync::BlockSyncNoOp(),
-                    rand::RandStdLib(),
-                    time::TimeOmp(),
-                    m_gridBlockIdx(vec::Vec<TDim, TIdx>::zeros())
-            {}
+    public:
+        // Partial specialization with the correct TDim and TIdx is not allowed.
+        template<typename TDim2, typename TIdx2, typename TKernelFnObj, typename... TArgs>
+        friend class ::alpaka::TaskKernelCpuOmp2Blocks;
 
-        public:
-            //-----------------------------------------------------------------------------
-            ALPAKA_FN_HOST AccCpuOmp2Blocks(AccCpuOmp2Blocks const &) = delete;
-            //-----------------------------------------------------------------------------
-            ALPAKA_FN_HOST AccCpuOmp2Blocks(AccCpuOmp2Blocks &&) = delete;
-            //-----------------------------------------------------------------------------
-            ALPAKA_FN_HOST auto operator=(AccCpuOmp2Blocks const &) -> AccCpuOmp2Blocks & = delete;
-            //-----------------------------------------------------------------------------
-            ALPAKA_FN_HOST auto operator=(AccCpuOmp2Blocks &&) -> AccCpuOmp2Blocks & = delete;
-            //-----------------------------------------------------------------------------
-            /*virtual*/ ~AccCpuOmp2Blocks() = default;
+    private:
+        //! Constructs the accelerator from a work division and the dynamic block shared memory size in bytes.
+        template<typename TWorkDiv>
+        ALPAKA_FN_HOST AccCpuOmp2Blocks(TWorkDiv const& workDiv, std::size_t const& blockSharedMemDynSizeBytes)
+            : WorkDivMembers<TDim, TIdx>(workDiv)
+            , gb::IdxGbRef<TDim, TIdx>(m_gridBlockIdx) // member is initialized later; presumably kept by reference
+            , bt::IdxBtZero<TDim, TIdx>()
+            , AtomicHierarchy<
+                  AtomicStdLibLock<16>, // atomics between grids
+                  AtomicOmpBuiltIn, // atomics between blocks
+                  AtomicNoOp // atomics between threads
+                  >()
+            , math::MathStdLib()
+            , BlockSharedMemDynMember<>(blockSharedMemDynSizeBytes)
+            , BlockSharedMemStMember<>(staticMemBegin(), staticMemCapacity()) // NOTE(review): verify init order
+            , BlockSyncNoOp()
+            , rand::RandStdLib()
+            , TimeOmp()
+            , m_gridBlockIdx(Vec<TDim, TIdx>::zeros())
+        {
+        }
 
-        private:
-            // getIdx
-            vec::Vec<TDim, TIdx> mutable m_gridBlockIdx;   //!< The index of the currently executed block.
-        };
-    }
+    public:
+        //! Copy construction is disabled.
+        ALPAKA_FN_HOST AccCpuOmp2Blocks(AccCpuOmp2Blocks const&) = delete;
+        //! Move construction is disabled.
+        ALPAKA_FN_HOST AccCpuOmp2Blocks(AccCpuOmp2Blocks&&) = delete;
+        //! Copy assignment is disabled.
+        ALPAKA_FN_HOST auto operator=(AccCpuOmp2Blocks const&) -> AccCpuOmp2Blocks& = delete;
+        //! Move assignment is disabled.
+        ALPAKA_FN_HOST auto operator=(AccCpuOmp2Blocks&&) -> AccCpuOmp2Blocks& = delete;
+        //! Defaulted, non-virtual destructor.
+        /*virtual*/ ~AccCpuOmp2Blocks() = default;
+
+    private:
+        // getIdx
+        Vec<TDim, TIdx> mutable m_gridBlockIdx; //!< The index of the currently executed block.
+    };
 
-    namespace acc
+    namespace traits
     {
-        namespace traits
+        //#############################################################################
+        //! The accelerator type trait specialization for the CPU OpenMP 2.0 block accelerator.
+        template<typename TDim, typename TIdx>
+        struct AccType<AccCpuOmp2Blocks<TDim, TIdx>>
         {
-            //#############################################################################
-            //! The CPU OpenMP 2.0 block accelerator accelerator type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct AccType<
-                acc::AccCpuOmp2Blocks<TDim, TIdx>>
-            {
-                using type = acc::AccCpuOmp2Blocks<TDim, TIdx>;
-            };
-            //#############################################################################
-            //! The CPU OpenMP 2.0 block accelerator device properties get trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct GetAccDevProps<
-                acc::AccCpuOmp2Blocks<TDim, TIdx>>
+            using type = AccCpuOmp2Blocks<TDim, TIdx>;
+        };
+        //#############################################################################
+        //! The CPU OpenMP 2.0 block accelerator device properties get trait specialization.
+        template<typename TDim, typename TIdx>
+        struct GetAccDevProps<AccCpuOmp2Blocks<TDim, TIdx>>
+        {
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto getAccDevProps(DevCpu const& dev) -> alpaka::AccDevProps<TDim, TIdx>
             {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto getAccDevProps(
-                    dev::DevCpu const & dev)
-                -> alpaka::acc::AccDevProps<TDim, TIdx>
-                {
-                    alpaka::ignore_unused(dev);
+                alpaka::ignore_unused(dev);
 
-                    return {
-                        // m_multiProcessorCount
+                return {// m_multiProcessorCount
                         static_cast<TIdx>(1),
                         // m_gridBlockExtentMax
-                        vec::Vec<TDim, TIdx>::all(std::numeric_limits<TIdx>::max()),
+                        Vec<TDim, TIdx>::all(std::numeric_limits<TIdx>::max()),
                         // m_gridBlockCountMax
                         std::numeric_limits<TIdx>::max(),
                         // m_blockThreadExtentMax
-                        vec::Vec<TDim, TIdx>::ones(),
+                        Vec<TDim, TIdx>::ones(),
                         // m_blockThreadCountMax
                         static_cast<TIdx>(1),
                         // m_threadElemExtentMax
-                        vec::Vec<TDim, TIdx>::all(std::numeric_limits<TIdx>::max()),
+                        Vec<TDim, TIdx>::all(std::numeric_limits<TIdx>::max()),
                         // m_threadElemCountMax
-                        std::numeric_limits<TIdx>::max()};
-                }
-            };
-            //#############################################################################
-            //! The CPU OpenMP 2.0 block accelerator name trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct GetAccName<
-                acc::AccCpuOmp2Blocks<TDim, TIdx>>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto getAccName()
-                -> std::string
-                {
-                    return "AccCpuOmp2Blocks<" + std::to_string(TDim::value) + "," + typeid(TIdx).name() + ">";
-                }
-            };
-        }
-    }
-    namespace dev
-    {
-        namespace traits
+                        std::numeric_limits<TIdx>::max(),
+                        // m_sharedMemSizeBytes
+                        static_cast<size_t>(AccCpuOmp2Blocks<TDim, TIdx>::staticAllocBytes())};
+            }
+        };
+        //#############################################################################
+        //! The CPU OpenMP 2.0 block accelerator name trait specialization.
+        template<typename TDim, typename TIdx>
+        struct GetAccName<AccCpuOmp2Blocks<TDim, TIdx>>
         {
-            //#############################################################################
-            //! The CPU OpenMP 2.0 block accelerator device type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct DevType<
-                acc::AccCpuOmp2Blocks<TDim, TIdx>>
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto getAccName() -> std::string
             {
-                using type = dev::DevCpu;
-            };
-        }
-    }
-    namespace dim
-    {
-        namespace traits
+                return "AccCpuOmp2Blocks<" + std::to_string(TDim::value) + "," + typeid(TIdx).name() + ">";
+            }
+        };
+
+        //#############################################################################
+        //! The CPU OpenMP 2.0 block accelerator device type trait specialization.
+        template<typename TDim, typename TIdx>
+        struct DevType<AccCpuOmp2Blocks<TDim, TIdx>>
         {
-            //#############################################################################
-            //! The CPU OpenMP 2.0 block accelerator dimension getter trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct DimType<
-                acc::AccCpuOmp2Blocks<TDim, TIdx>>
-            {
-                using type = TDim;
-            };
-        }
-    }
-    namespace kernel
-    {
-        namespace traits
+            using type = DevCpu;
+        };
+
+        //#############################################################################
+        //! The CPU OpenMP 2.0 block accelerator dimension getter trait specialization.
+        template<typename TDim, typename TIdx>
+        struct DimType<AccCpuOmp2Blocks<TDim, TIdx>>
         {
-            //#############################################################################
-            //! The CPU OpenMP 2.0 block accelerator execution task type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx,
-                typename TWorkDiv,
-                typename TKernelFnObj,
-                typename... TArgs>
-            struct CreateTaskKernel<
-                acc::AccCpuOmp2Blocks<TDim, TIdx>,
-                TWorkDiv,
-                TKernelFnObj,
-                TArgs...>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto createTaskKernel(
-                    TWorkDiv const & workDiv,
-                    TKernelFnObj const & kernelFnObj,
-                    TArgs && ... args)
-#ifdef BOOST_NO_CXX14_RETURN_TYPE_DEDUCTION
-                -> kernel::TaskKernelCpuOmp2Blocks<
-                    TDim,
-                    TIdx,
-                    TKernelFnObj,
-                    TArgs...>
-#endif
-                {
-                    return
-                        kernel::TaskKernelCpuOmp2Blocks<
-                            TDim,
-                            TIdx,
-                            TKernelFnObj,
-                            TArgs...>(
-                                workDiv,
-                                kernelFnObj,
-                                std::forward<TArgs>(args)...);
-                }
-            };
-        }
-    }
-    namespace pltf
-    {
-        namespace traits
+            using type = TDim;
+        };
+
+        //#############################################################################
+        //! The CPU OpenMP 2.0 block accelerator execution task type trait specialization.
+        template<typename TDim, typename TIdx, typename TWorkDiv, typename TKernelFnObj, typename... TArgs>
+        struct CreateTaskKernel<AccCpuOmp2Blocks<TDim, TIdx>, TWorkDiv, TKernelFnObj, TArgs...>
         {
-            //#############################################################################
-            //! The CPU OpenMP 2.0 block execution task platform type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct PltfType<
-                acc::AccCpuOmp2Blocks<TDim, TIdx>>
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto createTaskKernel(
+                TWorkDiv const& workDiv,
+                TKernelFnObj const& kernelFnObj,
+                TArgs&&... args)
             {
-                using type = pltf::PltfCpu;
-            };
-        }
-    }
-    namespace idx
-    {
-        namespace traits
+                return TaskKernelCpuOmp2Blocks<TDim, TIdx, TKernelFnObj, TArgs...>(
+                    workDiv,
+                    kernelFnObj,
+                    std::forward<TArgs>(args)...);
+            }
+        };
+
+        //#############################################################################
+        //! The CPU OpenMP 2.0 block accelerator platform type trait specialization.
+        template<typename TDim, typename TIdx>
+        struct PltfType<AccCpuOmp2Blocks<TDim, TIdx>>
         {
-            //#############################################################################
-            //! The CPU OpenMP 2.0 block accelerator idx type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct IdxType<
-                acc::AccCpuOmp2Blocks<TDim, TIdx>>
-            {
-                using type = TIdx;
-            };
-        }
-    }
-}
+            using type = PltfCpu;
+        };
+
+        //#############################################################################
+        //! The CPU OpenMP 2.0 block accelerator idx type trait specialization.
+        template<typename TDim, typename TIdx>
+        struct IdxType<AccCpuOmp2Blocks<TDim, TIdx>>
+        {
+            using type = TIdx;
+        };
+    } // namespace traits
+} // namespace alpaka
 
 #endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/acc/AccCpuOmp2Threads.hpp b/thirdParty/cupla/alpaka/include/alpaka/acc/AccCpuOmp2Threads.hpp
index 3ef209d119..64a60293ff 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/acc/AccCpuOmp2Threads.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/acc/AccCpuOmp2Threads.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Axel Huebl, Benjamin Worpitz, René Widera
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -11,309 +11,228 @@
 
 #ifdef ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLED
 
-#if _OPENMP < 200203
-    #error If ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLED is set, the compiler has to support OpenMP 2.0 or higher!
-#endif
+#    if _OPENMP < 200203
+#        error If ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLED is set, the compiler has to support OpenMP 2.0 or higher!
+#    endif
 
 // Base classes.
-#include <alpaka/workdiv/WorkDivMembers.hpp>
-#include <alpaka/idx/gb/IdxGbRef.hpp>
-#include <alpaka/idx/bt/IdxBtOmp.hpp>
-#include <alpaka/atomic/AtomicStdLibLock.hpp>
-#include <alpaka/atomic/AtomicOmpBuiltIn.hpp>
-#include <alpaka/atomic/AtomicHierarchy.hpp>
-#include <alpaka/math/MathStdLib.hpp>
-#include <alpaka/block/shared/dyn/BlockSharedMemDynBoostAlignedAlloc.hpp>
-#include <alpaka/block/shared/st/BlockSharedMemStMasterSync.hpp>
-#include <alpaka/block/sync/BlockSyncBarrierOmp.hpp>
-#include <alpaka/rand/RandStdLib.hpp>
-#include <alpaka/time/TimeOmp.hpp>
+#    include <alpaka/atomic/AtomicHierarchy.hpp>
+#    include <alpaka/atomic/AtomicOmpBuiltIn.hpp>
+#    include <alpaka/atomic/AtomicStdLibLock.hpp>
+#    include <alpaka/block/shared/dyn/BlockSharedMemDynAlignedAlloc.hpp>
+#    include <alpaka/block/shared/st/BlockSharedMemStMasterSync.hpp>
+#    include <alpaka/block/sync/BlockSyncBarrierOmp.hpp>
+#    include <alpaka/idx/bt/IdxBtOmp.hpp>
+#    include <alpaka/idx/gb/IdxGbRef.hpp>
+#    include <alpaka/intrinsic/IntrinsicCpu.hpp>
+#    include <alpaka/math/MathStdLib.hpp>
+#    include <alpaka/rand/RandStdLib.hpp>
+#    include <alpaka/time/TimeOmp.hpp>
+#    include <alpaka/warp/WarpSingleThread.hpp>
+#    include <alpaka/workdiv/WorkDivMembers.hpp>
 
 // Specialized traits.
-#include <alpaka/acc/Traits.hpp>
-#include <alpaka/kernel/Traits.hpp>
-#include <alpaka/dev/Traits.hpp>
-#include <alpaka/pltf/Traits.hpp>
-#include <alpaka/idx/Traits.hpp>
+#    include <alpaka/acc/Traits.hpp>
+#    include <alpaka/dev/Traits.hpp>
+#    include <alpaka/idx/Traits.hpp>
+#    include <alpaka/kernel/Traits.hpp>
+#    include <alpaka/pltf/Traits.hpp>
 
 // Implementation details.
-#include <alpaka/core/ClipCast.hpp>
-#include <alpaka/core/Concepts.hpp>
-#include <alpaka/core/Unused.hpp>
-#include <alpaka/dev/DevCpu.hpp>
+#    include <alpaka/core/ClipCast.hpp>
+#    include <alpaka/core/Concepts.hpp>
+#    include <alpaka/core/Unused.hpp>
+#    include <alpaka/dev/DevCpu.hpp>
 
-#include <omp.h>
+#    include <omp.h>
 
-#include <limits>
-#include <typeinfo>
+#    include <limits>
+#    include <typeinfo>
 
 namespace alpaka
 {
-    namespace kernel
-    {
-        template<
-            typename TDim,
-            typename TIdx,
-            typename TKernelFnObj,
-            typename... TArgs>
-        class TaskKernelCpuOmp2Threads;
-    }
-    namespace acc
+    template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
+    class TaskKernelCpuOmp2Threads;
+
+    //#############################################################################
+    //! The CPU OpenMP 2.0 thread accelerator.
+    //!
+    //! This accelerator allows parallel kernel execution on a CPU device.
+    //! It uses OpenMP 2.0 to implement the block thread parallelism.
+    template<
+        typename TDim,
+        typename TIdx>
+    class AccCpuOmp2Threads final :
+        public WorkDivMembers<TDim, TIdx>,
+        public gb::IdxGbRef<TDim, TIdx>,
+        public bt::IdxBtOmp<TDim, TIdx>,
+        public AtomicHierarchy<
+            AtomicStdLibLock<16>,   // grid atomics
+            AtomicOmpBuiltIn,    // block atomics
+            AtomicOmpBuiltIn     // thread atomics
+        >,
+        public math::MathStdLib,
+        public BlockSharedMemDynAlignedAlloc,
+        public BlockSharedMemStMasterSync,
+        public BlockSyncBarrierOmp,
+        public IntrinsicCpu,
+        public rand::RandStdLib,
+        public TimeOmp,
+        public warp::WarpSingleThread,
+        public concepts::Implements<ConceptAcc, AccCpuOmp2Threads<TDim, TIdx>>
     {
-        //#############################################################################
-        //! The CPU OpenMP 2.0 thread accelerator.
-        //!
-        //! This accelerator allows parallel kernel execution on a CPU device.
-        //! It uses OpenMP 2.0 to implement the block thread parallelism.
-        template<
-            typename TDim,
-            typename TIdx>
-        class AccCpuOmp2Threads final :
-            public workdiv::WorkDivMembers<TDim, TIdx>,
-            public idx::gb::IdxGbRef<TDim, TIdx>,
-            public idx::bt::IdxBtOmp<TDim, TIdx>,
-            public atomic::AtomicHierarchy<
-                atomic::AtomicStdLibLock<16>,   // grid atomics
-                atomic::AtomicOmpBuiltIn,    // block atomics
-                atomic::AtomicOmpBuiltIn     // thread atomics
-            >,
-            public math::MathStdLib,
-            public block::shared::dyn::BlockSharedMemDynBoostAlignedAlloc,
-            public block::shared::st::BlockSharedMemStMasterSync,
-            public block::sync::BlockSyncBarrierOmp,
-            public rand::RandStdLib,
-            public time::TimeOmp,
-            public concepts::Implements<ConceptAcc, AccCpuOmp2Threads<TDim, TIdx>>
-        {
-        public:
-            // Partial specialization with the correct TDim and TIdx is not allowed.
-            template<
-                typename TDim2,
-                typename TIdx2,
-                typename TKernelFnObj,
-                typename... TArgs>
-            friend class ::alpaka::kernel::TaskKernelCpuOmp2Threads;
+        static_assert(
+            sizeof(TIdx) >= sizeof(int),
+            "Index type is not supported, consider using int or a larger type.");
 
-        private:
-            //-----------------------------------------------------------------------------
-            template<
-                typename TWorkDiv>
-            ALPAKA_FN_HOST AccCpuOmp2Threads(
-                TWorkDiv const & workDiv,
-                TIdx const & blockSharedMemDynSizeBytes) :
-                    workdiv::WorkDivMembers<TDim, TIdx>(workDiv),
-                    idx::gb::IdxGbRef<TDim, TIdx>(m_gridBlockIdx),
-                    idx::bt::IdxBtOmp<TDim, TIdx>(),
-                    atomic::AtomicHierarchy<
-                        atomic::AtomicStdLibLock<16>,// atomics between grids
-                        atomic::AtomicOmpBuiltIn, // atomics between blocks
-                        atomic::AtomicOmpBuiltIn  // atomics between threads
-                    >(),
-                    math::MathStdLib(),
-                    block::shared::dyn::BlockSharedMemDynBoostAlignedAlloc(static_cast<std::size_t>(blockSharedMemDynSizeBytes)),
-                    block::shared::st::BlockSharedMemStMasterSync(
-                        [this](){block::sync::syncBlockThreads(*this);},
-                        [](){return (::omp_get_thread_num() == 0);}),
-                    block::sync::BlockSyncBarrierOmp(),
-                    rand::RandStdLib(),
-                    time::TimeOmp(),
-                    m_gridBlockIdx(vec::Vec<TDim, TIdx>::zeros())
-            {}
+    public:
+        // All instantiations are befriended because a friend declaration cannot name a partial specialization.
+        template<typename TDim2, typename TIdx2, typename TKernelFnObj, typename... TArgs>
+        friend class ::alpaka::TaskKernelCpuOmp2Threads;
 
-        public:
-            //-----------------------------------------------------------------------------
-            ALPAKA_FN_HOST AccCpuOmp2Threads(AccCpuOmp2Threads const &) = delete;
-            //-----------------------------------------------------------------------------
-            ALPAKA_FN_HOST AccCpuOmp2Threads(AccCpuOmp2Threads &&) = delete;
-            //-----------------------------------------------------------------------------
-            ALPAKA_FN_HOST auto operator=(AccCpuOmp2Threads const &) -> AccCpuOmp2Threads & = delete;
-            //-----------------------------------------------------------------------------
-            ALPAKA_FN_HOST auto operator=(AccCpuOmp2Threads &&) -> AccCpuOmp2Threads & = delete;
-            //-----------------------------------------------------------------------------
-            /*virtual*/ ~AccCpuOmp2Threads() = default;
+    private:
+        //! Constructs the accelerator from a work division and the dynamic block shared memory size in bytes.
+        template<typename TWorkDiv>
+        ALPAKA_FN_HOST AccCpuOmp2Threads(TWorkDiv const& workDiv, std::size_t const& blockSharedMemDynSizeBytes)
+            : WorkDivMembers<TDim, TIdx>(workDiv)
+            , gb::IdxGbRef<TDim, TIdx>(m_gridBlockIdx) // member is initialized later; presumably kept by reference
+            , bt::IdxBtOmp<TDim, TIdx>()
+            , AtomicHierarchy<
+                  AtomicStdLibLock<16>, // atomics between grids
+                  AtomicOmpBuiltIn, // atomics between blocks
+                  AtomicOmpBuiltIn // atomics between threads
+                  >()
+            , math::MathStdLib()
+            , BlockSharedMemDynAlignedAlloc(blockSharedMemDynSizeBytes)
+            , BlockSharedMemStMasterSync(
+                  [this]() { syncBlockThreads(*this); }, // callback invoking syncBlockThreads on this accelerator
+                  []() { return (::omp_get_thread_num() == 0); }) // predicate: true only for OpenMP thread number 0
+            , BlockSyncBarrierOmp()
+            , rand::RandStdLib()
+            , TimeOmp()
+            , m_gridBlockIdx(Vec<TDim, TIdx>::zeros())
+        {
+        }
 
-        private:
-            // getIdx
-            vec::Vec<TDim, TIdx> mutable m_gridBlockIdx;  //!< The index of the currently executed block.
-        };
-    }
+    public:
+        //! Copy construction is disabled.
+        ALPAKA_FN_HOST AccCpuOmp2Threads(AccCpuOmp2Threads const&) = delete;
+        //! Move construction is disabled.
+        ALPAKA_FN_HOST AccCpuOmp2Threads(AccCpuOmp2Threads&&) = delete;
+        //! Copy assignment is disabled.
+        ALPAKA_FN_HOST auto operator=(AccCpuOmp2Threads const&) -> AccCpuOmp2Threads& = delete;
+        //! Move assignment is disabled.
+        ALPAKA_FN_HOST auto operator=(AccCpuOmp2Threads&&) -> AccCpuOmp2Threads& = delete;
+        //! Defaulted, non-virtual destructor (the class is declared final).
+        /*virtual*/ ~AccCpuOmp2Threads() = default;
 
-    namespace acc
+    private:
+        // getIdx
+        Vec<TDim, TIdx> mutable m_gridBlockIdx; //!< The index of the currently executed block.
+    };
+
+    namespace traits
     {
-        namespace traits
+        //#############################################################################
+        //! The accelerator type trait specialization for the CPU OpenMP 2.0 thread accelerator.
+        template<typename TDim, typename TIdx>
+        struct AccType<AccCpuOmp2Threads<TDim, TIdx>>
         {
-            //#############################################################################
-            //! The CPU OpenMP 2.0 thread accelerator accelerator type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct AccType<
-                acc::AccCpuOmp2Threads<TDim, TIdx>>
-            {
-                using type = acc::AccCpuOmp2Threads<TDim, TIdx>;
-            };
-            //#############################################################################
-            //! The CPU OpenMP 2.0 thread accelerator device properties get trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct GetAccDevProps<
-                acc::AccCpuOmp2Threads<TDim, TIdx>>
+            using type = AccCpuOmp2Threads<TDim, TIdx>;
+        };
+        //#############################################################################
+        //! The CPU OpenMP 2.0 thread accelerator device properties get trait specialization.
+        template<typename TDim, typename TIdx>
+        struct GetAccDevProps<AccCpuOmp2Threads<TDim, TIdx>>
+        {
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto getAccDevProps(DevCpu const& dev) -> alpaka::AccDevProps<TDim, TIdx>
             {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto getAccDevProps(
-                    dev::DevCpu const & dev)
-                -> alpaka::acc::AccDevProps<TDim, TIdx>
-                {
-                    alpaka::ignore_unused(dev);
-
-#ifdef ALPAKA_CI
-                    auto const blockThreadCountMax(alpaka::core::clipCast<TIdx>(std::min(4, ::omp_get_max_threads())));
-#else
-                    auto const blockThreadCountMax(alpaka::core::clipCast<TIdx>(::omp_get_max_threads()));
-#endif
-                    return {
-                        // m_multiProcessorCount
+#    ifdef ALPAKA_CI
+                auto const blockThreadCountMax(alpaka::core::clipCast<TIdx>(std::min(4, ::omp_get_max_threads())));
+#    else
+                auto const blockThreadCountMax(alpaka::core::clipCast<TIdx>(::omp_get_max_threads()));
+#    endif
+                return {// m_multiProcessorCount
                         static_cast<TIdx>(1),
                         // m_gridBlockExtentMax
-                        vec::Vec<TDim, TIdx>::all(std::numeric_limits<TIdx>::max()),
+                        Vec<TDim, TIdx>::all(std::numeric_limits<TIdx>::max()),
                         // m_gridBlockCountMax
                         std::numeric_limits<TIdx>::max(),
                         // m_blockThreadExtentMax
-                        vec::Vec<TDim, TIdx>::all(blockThreadCountMax),
+                        Vec<TDim, TIdx>::all(blockThreadCountMax),
                         // m_blockThreadCountMax
                         blockThreadCountMax,
                         // m_threadElemExtentMax
-                        vec::Vec<TDim, TIdx>::all(std::numeric_limits<TIdx>::max()),
+                        Vec<TDim, TIdx>::all(std::numeric_limits<TIdx>::max()),
                         // m_threadElemCountMax
-                        std::numeric_limits<TIdx>::max()};
-                }
-            };
-            //#############################################################################
-            //! The CPU OpenMP 2.0 thread accelerator name trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct GetAccName<
-                acc::AccCpuOmp2Threads<TDim, TIdx>>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto getAccName()
-                -> std::string
-                {
-                    return "AccCpuOmp2Threads<" + std::to_string(TDim::value) + "," + typeid(TIdx).name() + ">";
-                }
-            };
-        }
-    }
-    namespace dev
-    {
-        namespace traits
+                        std::numeric_limits<TIdx>::max(),
+                        // m_sharedMemSizeBytes
+                        getMemBytes(dev)};
+            }
+        };
+        //#############################################################################
+        //! The CPU OpenMP 2.0 thread accelerator name trait specialization.
+        template<typename TDim, typename TIdx>
+        struct GetAccName<AccCpuOmp2Threads<TDim, TIdx>>
         {
-            //#############################################################################
-            //! The CPU OpenMP 2.0 thread accelerator device type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct DevType<
-                acc::AccCpuOmp2Threads<TDim, TIdx>>
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto getAccName() -> std::string
             {
-                using type = dev::DevCpu;
-            };
-        }
-    }
-    namespace dim
-    {
-        namespace traits
+                return "AccCpuOmp2Threads<" + std::to_string(TDim::value) + "," + typeid(TIdx).name() + ">";
+            }
+        };
+
+        //#############################################################################
+        //! The CPU OpenMP 2.0 thread accelerator device type trait specialization.
+        template<typename TDim, typename TIdx>
+        struct DevType<AccCpuOmp2Threads<TDim, TIdx>>
         {
-            //#############################################################################
-            //! The CPU OpenMP 2.0 thread accelerator dimension getter trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct DimType<
-                acc::AccCpuOmp2Threads<TDim, TIdx>>
-            {
-                using type = TDim;
-            };
-        }
-    }
-    namespace kernel
-    {
-        namespace traits
+            using type = DevCpu;
+        };
+
+        //#############################################################################
+        //! The CPU OpenMP 2.0 thread accelerator dimension getter trait specialization.
+        template<typename TDim, typename TIdx>
+        struct DimType<AccCpuOmp2Threads<TDim, TIdx>>
         {
-            //#############################################################################
-            //! The CPU OpenMP 2.0 thread accelerator execution task type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx,
-                typename TWorkDiv,
-                typename TKernelFnObj,
-                typename... TArgs>
-            struct CreateTaskKernel<
-                acc::AccCpuOmp2Threads<TDim, TIdx>,
-                TWorkDiv,
-                TKernelFnObj,
-                TArgs...>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto createTaskKernel(
-                    TWorkDiv const & workDiv,
-                    TKernelFnObj const & kernelFnObj,
-                    TArgs && ... args)
-#ifdef BOOST_NO_CXX14_RETURN_TYPE_DEDUCTION
-                -> kernel::TaskKernelCpuOmp2Threads<
-                    TDim,
-                    TIdx,
-                    TKernelFnObj,
-                    TArgs...>
-#endif
-                {
-                    return
-                        kernel::TaskKernelCpuOmp2Threads<
-                            TDim,
-                            TIdx,
-                            TKernelFnObj,
-                            TArgs...>(
-                                workDiv,
-                                kernelFnObj,
-                                std::forward<TArgs>(args)...);
-                }
-            };
-        }
-    }
-    namespace pltf
-    {
-        namespace traits
+            using type = TDim;
+        };
+
+        //#############################################################################
+        //! The CPU OpenMP 2.0 thread accelerator execution task type trait specialization.
+        template<typename TDim, typename TIdx, typename TWorkDiv, typename TKernelFnObj, typename... TArgs>
+        struct CreateTaskKernel<AccCpuOmp2Threads<TDim, TIdx>, TWorkDiv, TKernelFnObj, TArgs...>
         {
-            //#############################################################################
-            //! The CPU OpenMP 2.0 thread execution task platform type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct PltfType<
-                acc::AccCpuOmp2Threads<TDim, TIdx>>
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto createTaskKernel(
+                TWorkDiv const& workDiv,
+                TKernelFnObj const& kernelFnObj,
+                TArgs&&... args)
             {
-                using type = pltf::PltfCpu;
-            };
-        }
-    }
-    namespace idx
-    {
-        namespace traits
+                return TaskKernelCpuOmp2Threads<TDim, TIdx, TKernelFnObj, TArgs...>(
+                    workDiv,
+                    kernelFnObj,
+                    std::forward<TArgs>(args)...);
+            }
+        };
+
+        //#############################################################################
+        //! The CPU OpenMP 2.0 thread accelerator platform type trait specialization.
+        template<typename TDim, typename TIdx>
+        struct PltfType<AccCpuOmp2Threads<TDim, TIdx>>
         {
-            //#############################################################################
-            //! The CPU OpenMP 2.0 thread accelerator idx type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct IdxType<
-                acc::AccCpuOmp2Threads<TDim, TIdx>>
-            {
-                using type = TIdx;
-            };
-        }
-    }
-}
+            using type = PltfCpu;
+        };
+
+        //#############################################################################
+        //! The CPU OpenMP 2.0 thread accelerator idx type trait specialization.
+        template<typename TDim, typename TIdx>
+        struct IdxType<AccCpuOmp2Threads<TDim, TIdx>>
+        {
+            using type = TIdx;
+        };
+    } // namespace traits
+} // namespace alpaka
 
 #endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/acc/AccCpuOmp4.hpp b/thirdParty/cupla/alpaka/include/alpaka/acc/AccCpuOmp4.hpp
deleted file mode 100644
index 2910fc032b..0000000000
--- a/thirdParty/cupla/alpaka/include/alpaka/acc/AccCpuOmp4.hpp
+++ /dev/null
@@ -1,319 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, René Widera
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_CPU_BT_OMP4_ENABLED
-
-#if _OPENMP < 201307
-    #error If ALPAKA_ACC_CPU_BT_OMP4_ENABLED is set, the compiler has to support OpenMP 4.0 or higher!
-#endif
-
-// Base classes.
-#include <alpaka/workdiv/WorkDivMembers.hpp>
-#include <alpaka/idx/gb/IdxGbRef.hpp>
-#include <alpaka/idx/bt/IdxBtOmp.hpp>
-#include <alpaka/atomic/AtomicStdLibLock.hpp>
-#include <alpaka/atomic/AtomicOmpBuiltIn.hpp>
-#include <alpaka/atomic/AtomicHierarchy.hpp>
-#include <alpaka/math/MathStdLib.hpp>
-#include <alpaka/block/shared/dyn/BlockSharedMemDynBoostAlignedAlloc.hpp>
-#include <alpaka/block/shared/st/BlockSharedMemStMasterSync.hpp>
-#include <alpaka/block/sync/BlockSyncBarrierOmp.hpp>
-#include <alpaka/rand/RandStdLib.hpp>
-#include <alpaka/time/TimeOmp.hpp>
-
-// Specialized traits.
-#include <alpaka/acc/Traits.hpp>
-#include <alpaka/kernel/Traits.hpp>
-#include <alpaka/dev/Traits.hpp>
-#include <alpaka/pltf/Traits.hpp>
-#include <alpaka/idx/Traits.hpp>
-
-// Implementation details.
-#include <alpaka/core/ClipCast.hpp>
-#include <alpaka/core/Concepts.hpp>
-#include <alpaka/core/Unused.hpp>
-#include <alpaka/dev/DevCpu.hpp>
-
-#include <omp.h>
-
-#include <limits>
-#include <typeinfo>
-
-namespace alpaka
-{
-    namespace kernel
-    {
-        template<
-            typename TDim,
-            typename TIdx,
-            typename TKernelFnObj,
-            typename... TArgs>
-        class TaskKernelCpuOmp4;
-    }
-    namespace acc
-    {
-        //#############################################################################
-        //! The CPU OpenMP 4.0 accelerator.
-        //!
-        //! This accelerator allows parallel kernel execution on a CPU device.
-        //! It uses CPU OpenMP4 to implement the parallelism.
-        template<
-            typename TDim,
-            typename TIdx>
-        class AccCpuOmp4 final :
-            public workdiv::WorkDivMembers<TDim, TIdx>,
-            public idx::gb::IdxGbRef<TDim, TIdx>,
-            public idx::bt::IdxBtOmp<TDim, TIdx>,
-            public atomic::AtomicHierarchy<
-                atomic::AtomicStdLibLock<16>,   // grid atomics
-                atomic::AtomicOmpBuiltIn,    // block atomics
-                atomic::AtomicOmpBuiltIn     // thread atomics
-            >,
-            public math::MathStdLib,
-            public block::shared::dyn::BlockSharedMemDynBoostAlignedAlloc,
-            public block::shared::st::BlockSharedMemStMasterSync,
-            public block::sync::BlockSyncBarrierOmp,
-            public rand::RandStdLib,
-            public time::TimeOmp,
-            public concepts::Implements<ConceptAcc, AccCpuOmp4<TDim, TIdx>>
-        {
-        public:
-            // Partial specialization with the correct TDim and TIdx is not allowed.
-            template<
-                typename TDim2,
-                typename TIdx2,
-                typename TKernelFnObj,
-                typename... TArgs>
-            friend class ::alpaka::kernel::TaskKernelCpuOmp4;
-
-        private:
-            //-----------------------------------------------------------------------------
-            template<
-                typename TWorkDiv>
-            ALPAKA_FN_HOST AccCpuOmp4(
-                TWorkDiv const & workDiv,
-                TIdx const & blockSharedMemDynSizeBytes) :
-                    workdiv::WorkDivMembers<TDim, TIdx>(workDiv),
-                    idx::gb::IdxGbRef<TDim, TIdx>(m_gridBlockIdx),
-                    idx::bt::IdxBtOmp<TDim, TIdx>(),
-                    atomic::AtomicHierarchy<
-                        atomic::AtomicStdLibLock<16>,// atomics between grids
-                        atomic::AtomicOmpBuiltIn, // atomics between blocks
-                        atomic::AtomicOmpBuiltIn  // atomics between threads
-                    >(),
-                    math::MathStdLib(),
-                    block::shared::dyn::BlockSharedMemDynBoostAlignedAlloc(static_cast<std::size_t>(blockSharedMemDynSizeBytes)),
-                    block::shared::st::BlockSharedMemStMasterSync(
-                        [this](){block::sync::syncBlockThreads(*this);},
-                        [](){return (::omp_get_thread_num() == 0);}),
-                    block::sync::BlockSyncBarrierOmp(),
-                    rand::RandStdLib(),
-                    time::TimeOmp(),
-                    m_gridBlockIdx(vec::Vec<TDim, TIdx>::zeros())
-            {}
-
-        public:
-            //-----------------------------------------------------------------------------
-            ALPAKA_FN_HOST AccCpuOmp4(AccCpuOmp4 const &) = delete;
-            //-----------------------------------------------------------------------------
-            ALPAKA_FN_HOST AccCpuOmp4(AccCpuOmp4 &&) = delete;
-            //-----------------------------------------------------------------------------
-            ALPAKA_FN_HOST auto operator=(AccCpuOmp4 const &) -> AccCpuOmp4 & = delete;
-            //-----------------------------------------------------------------------------
-            ALPAKA_FN_HOST auto operator=(AccCpuOmp4 &&) -> AccCpuOmp4 & = delete;
-            //-----------------------------------------------------------------------------
-            /*virtual*/ ~AccCpuOmp4() = default;
-
-        private:
-            // getIdx
-            vec::Vec<TDim, TIdx> mutable m_gridBlockIdx;    //!< The index of the currently executed block.
-        };
-    }
-
-    namespace acc
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CPU OpenMP 4.0 accelerator accelerator type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct AccType<
-                acc::AccCpuOmp4<TDim, TIdx>>
-            {
-                using type = acc::AccCpuOmp4<TDim, TIdx>;
-            };
-            //#############################################################################
-            //! The CPU OpenMP 4.0 accelerator device properties get trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct GetAccDevProps<
-                acc::AccCpuOmp4<TDim, TIdx>>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto getAccDevProps(
-                    dev::DevCpu const & dev)
-                -> acc::AccDevProps<TDim, TIdx>
-                {
-                    alpaka::ignore_unused(dev);
-
-#ifdef ALPAKA_CI
-                    auto const blockThreadCountMax(alpaka::core::clipCast<TIdx>(std::min(4, ::omp_get_max_threads())));
-#else
-                    auto const blockThreadCountMax(alpaka::core::clipCast<TIdx>(::omp_get_max_threads()));
-#endif
-                    return {
-                        // m_multiProcessorCount
-                        static_cast<TIdx>(1),
-                        // m_gridBlockExtentMax
-                        vec::Vec<TDim, TIdx>::all(std::numeric_limits<TIdx>::max()),
-                        // m_gridBlockCountMax
-                        std::numeric_limits<TIdx>::max(),
-                        // m_blockThreadExtentMax
-                        vec::Vec<TDim, TIdx>::all(blockThreadCountMax),
-                        // m_blockThreadCountMax
-                        blockThreadCountMax,
-                        // m_threadElemExtentMax
-                        vec::Vec<TDim, TIdx>::all(std::numeric_limits<TIdx>::max()),
-                        // m_threadElemCountMax
-                        std::numeric_limits<TIdx>::max()};
-                }
-            };
-            //#############################################################################
-            //! The CPU OpenMP 4.0 accelerator name trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct GetAccName<
-                acc::AccCpuOmp4<TDim, TIdx>>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto getAccName()
-                -> std::string
-                {
-                    return "AccCpuOmp4<" + std::to_string(TDim::value) + "," + typeid(TIdx).name() + ">";
-                }
-            };
-        }
-    }
-    namespace dev
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CPU OpenMP 4.0 accelerator device type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct DevType<
-                acc::AccCpuOmp4<TDim, TIdx>>
-            {
-                using type = dev::DevCpu;
-            };
-        }
-    }
-    namespace dim
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CPU OpenMP 4.0 accelerator dimension getter trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct DimType<
-                acc::AccCpuOmp4<TDim, TIdx>>
-            {
-                using type = TDim;
-            };
-        }
-    }
-    namespace kernel
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CPU OpenMP 4.0 accelerator execution task type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx,
-                typename TWorkDiv,
-                typename TKernelFnObj,
-                typename... TArgs>
-            struct CreateTaskKernel<
-                acc::AccCpuOmp4<TDim, TIdx>,
-                TWorkDiv,
-                TKernelFnObj,
-                TArgs...>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto createTaskKernel(
-                    TWorkDiv const & workDiv,
-                    TKernelFnObj const & kernelFnObj,
-                    TArgs && ... args)
-#ifdef BOOST_NO_CXX14_RETURN_TYPE_DEDUCTION
-                -> kernel::TaskKernelCpuOmp4<
-                    TDim,
-                    TIdx,
-                    TKernelFnObj,
-                    TArgs...>
-#endif
-                {
-                    return
-                        kernel::TaskKernelCpuOmp4<
-                            TDim,
-                            TIdx,
-                            TKernelFnObj,
-                            TArgs...>(
-                                workDiv,
-                                kernelFnObj,
-                                std::forward<TArgs>(args)...);
-                }
-            };
-        }
-    }
-    namespace pltf
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CPU OpenMP 4.0 execution task platform type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct PltfType<
-                acc::AccCpuOmp4<TDim, TIdx>>
-            {
-                using type = pltf::PltfCpu;
-            };
-        }
-    }
-    namespace idx
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CPU OpenMP 4.0 accelerator idx type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct IdxType<
-                acc::AccCpuOmp4<TDim, TIdx>>
-            {
-                using type = TIdx;
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/acc/AccCpuSerial.hpp b/thirdParty/cupla/alpaka/include/alpaka/acc/AccCpuSerial.hpp
index 43ff644c0e..3aec96a550 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/acc/AccCpuSerial.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/acc/AccCpuSerial.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Axel Huebl, Benjamin Worpitz, René Widera
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -12,294 +12,215 @@
 #ifdef ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED
 
 // Base classes.
-#include <alpaka/workdiv/WorkDivMembers.hpp>
-#include <alpaka/idx/gb/IdxGbRef.hpp>
-#include <alpaka/idx/bt/IdxBtZero.hpp>
-#include <alpaka/atomic/AtomicNoOp.hpp>
-#include <alpaka/atomic/AtomicStdLibLock.hpp>
-#include <alpaka/atomic/AtomicHierarchy.hpp>
-#include <alpaka/math/MathStdLib.hpp>
-#include <alpaka/block/shared/dyn/BlockSharedMemDynBoostAlignedAlloc.hpp>
-#include <alpaka/block/shared/st/BlockSharedMemStNoSync.hpp>
-#include <alpaka/block/sync/BlockSyncNoOp.hpp>
-#include <alpaka/rand/RandStdLib.hpp>
-#include <alpaka/time/TimeStdLib.hpp>
+#    include <alpaka/atomic/AtomicHierarchy.hpp>
+#    include <alpaka/atomic/AtomicNoOp.hpp>
+#    include <alpaka/atomic/AtomicStdLibLock.hpp>
+#    include <alpaka/block/shared/dyn/BlockSharedMemDynMember.hpp>
+#    include <alpaka/block/shared/st/BlockSharedMemStMember.hpp>
+#    include <alpaka/block/sync/BlockSyncNoOp.hpp>
+#    include <alpaka/idx/bt/IdxBtZero.hpp>
+#    include <alpaka/idx/gb/IdxGbRef.hpp>
+#    include <alpaka/intrinsic/IntrinsicCpu.hpp>
+#    include <alpaka/math/MathStdLib.hpp>
+#    include <alpaka/rand/RandStdLib.hpp>
+#    include <alpaka/time/TimeStdLib.hpp>
+#    include <alpaka/warp/WarpSingleThread.hpp>
+#    include <alpaka/workdiv/WorkDivMembers.hpp>
 
 // Specialized traits.
-#include <alpaka/acc/Traits.hpp>
-#include <alpaka/dev/Traits.hpp>
-#include <alpaka/kernel/Traits.hpp>
-#include <alpaka/pltf/Traits.hpp>
-#include <alpaka/idx/Traits.hpp>
+#    include <alpaka/acc/Traits.hpp>
+#    include <alpaka/dev/Traits.hpp>
+#    include <alpaka/idx/Traits.hpp>
+#    include <alpaka/kernel/Traits.hpp>
+#    include <alpaka/pltf/Traits.hpp>
 
 // Implementation details.
-#include <alpaka/core/Concepts.hpp>
-#include <alpaka/core/Unused.hpp>
-#include <alpaka/dev/DevCpu.hpp>
+#    include <alpaka/core/Concepts.hpp>
+#    include <alpaka/core/Unused.hpp>
+#    include <alpaka/dev/DevCpu.hpp>
 
-#include <memory>
-#include <typeinfo>
+#    include <memory>
+#    include <typeinfo>
 
 namespace alpaka
 {
-    namespace kernel
-    {
-        template<
-            typename TDim,
-            typename TIdx,
-            typename TKernelFnObj,
-            typename... TArgs>
-        class TaskKernelCpuSerial;
-    }
-    namespace acc
+    template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
+    class TaskKernelCpuSerial;
+
+    //#############################################################################
+    //! The CPU serial accelerator.
+    //!
+    //! This accelerator allows serial kernel execution on a CPU device.
+    //! The block thread extent is restricted to 1x1x1 and all blocks are executed serially, so there is no parallelism at all.
+    template<
+        typename TDim,
+        typename TIdx>
+    class AccCpuSerial final :
+        public WorkDivMembers<TDim, TIdx>,
+        public gb::IdxGbRef<TDim, TIdx>,
+        public bt::IdxBtZero<TDim, TIdx>,
+        public AtomicHierarchy<
+            AtomicStdLibLock<16>, // grid atomics
+            AtomicNoOp,        // block atomics
+            AtomicNoOp         // thread atomics
+        >,
+        public math::MathStdLib,
+        public BlockSharedMemDynMember<>,
+        public BlockSharedMemStMember<>,
+        public BlockSyncNoOp,
+        public IntrinsicCpu,
+        public rand::RandStdLib,
+        public TimeStdLib,
+        public warp::WarpSingleThread,
+        public concepts::Implements<ConceptAcc, AccCpuSerial<TDim, TIdx>>
     {
-        //#############################################################################
-        //! The CPU serial accelerator.
-        //!
-        //! This accelerator allows serial kernel execution on a CPU device.
-        //! The block idx is restricted to 1x1x1 and all blocks are executed serially so there is no parallelism at all.
-        template<
-            typename TDim,
-            typename TIdx>
-        class AccCpuSerial final :
-            public workdiv::WorkDivMembers<TDim, TIdx>,
-            public idx::gb::IdxGbRef<TDim, TIdx>,
-            public idx::bt::IdxBtZero<TDim, TIdx>,
-            public atomic::AtomicHierarchy<
-                atomic::AtomicStdLibLock<16>, // grid atomics
-                atomic::AtomicNoOp,        // block atomics
-                atomic::AtomicNoOp         // thread atomics
-            >,
-            public math::MathStdLib,
-            public block::shared::dyn::BlockSharedMemDynBoostAlignedAlloc,
-            public block::shared::st::BlockSharedMemStNoSync,
-            public block::sync::BlockSyncNoOp,
-            public rand::RandStdLib,
-            public time::TimeStdLib,
-            public concepts::Implements<ConceptAcc, AccCpuSerial<TDim, TIdx>>
-        {
-        public:
-            // Partial specialization with the correct TDim and TIdx is not allowed.
-            template<
-                typename TDim2,
-                typename TIdx2,
-                typename TKernelFnObj,
-                typename... TArgs>
-            friend class ::alpaka::kernel::TaskKernelCpuSerial;
+        static_assert(
+            sizeof(TIdx) >= sizeof(int),
+            "Index type is not supported, consider using int or a larger type.");
 
-        private:
-            //-----------------------------------------------------------------------------
-            template<
-                typename TWorkDiv>
-            ALPAKA_FN_HOST AccCpuSerial(
-                TWorkDiv const & workDiv,
-                TIdx const & blockSharedMemDynSizeBytes) :
-                    workdiv::WorkDivMembers<TDim, TIdx>(workDiv),
-                    idx::gb::IdxGbRef<TDim, TIdx>(m_gridBlockIdx),
-                    idx::bt::IdxBtZero<TDim, TIdx>(),
-                    atomic::AtomicHierarchy<
-                        atomic::AtomicStdLibLock<16>, // atomics between grids
-                        atomic::AtomicNoOp,        // atomics between blocks
-                        atomic::AtomicNoOp         // atomics between threads
-                    >(),
-                    math::MathStdLib(),
-                    block::shared::dyn::BlockSharedMemDynBoostAlignedAlloc(static_cast<std::size_t>(blockSharedMemDynSizeBytes)),
-                    block::shared::st::BlockSharedMemStNoSync(),
-                    block::sync::BlockSyncNoOp(),
-                    rand::RandStdLib(),
-                    time::TimeStdLib(),
-                    m_gridBlockIdx(vec::Vec<TDim, TIdx>::zeros())
-            {}
+    public:
+        // Partial specialization with the correct TDim and TIdx is not allowed.
+        template<typename TDim2, typename TIdx2, typename TKernelFnObj, typename... TArgs>
+        friend class ::alpaka::TaskKernelCpuSerial;
 
-        public:
-            //-----------------------------------------------------------------------------
-            ALPAKA_FN_HOST AccCpuSerial(AccCpuSerial const &) = delete;
-            //-----------------------------------------------------------------------------
-            ALPAKA_FN_HOST AccCpuSerial(AccCpuSerial &&) = delete;
-            //-----------------------------------------------------------------------------
-            ALPAKA_FN_HOST auto operator=(AccCpuSerial const &) -> AccCpuSerial & = delete;
-            //-----------------------------------------------------------------------------
-            ALPAKA_FN_HOST auto operator=(AccCpuSerial &&) -> AccCpuSerial & = delete;
-            //-----------------------------------------------------------------------------
-            /*virtual*/ ~AccCpuSerial() = default;
+    private:
+        //-----------------------------------------------------------------------------
+        template<typename TWorkDiv>
+        ALPAKA_FN_HOST AccCpuSerial(TWorkDiv const& workDiv, size_t const& blockSharedMemDynSizeBytes)
+            : WorkDivMembers<TDim, TIdx>(workDiv)
+            , gb::IdxGbRef<TDim, TIdx>(m_gridBlockIdx)
+            , bt::IdxBtZero<TDim, TIdx>()
+            , AtomicHierarchy<
+                  AtomicStdLibLock<16>, // atomics between grids
+                  AtomicNoOp, // atomics between blocks
+                  AtomicNoOp // atomics between threads
+                  >()
+            , math::MathStdLib()
+            , BlockSharedMemDynMember<>(blockSharedMemDynSizeBytes)
+            , BlockSharedMemStMember<>(staticMemBegin(), staticMemCapacity())
+            , BlockSyncNoOp()
+            , rand::RandStdLib()
+            , TimeStdLib()
+            , m_gridBlockIdx(Vec<TDim, TIdx>::zeros())
+        {
+        }
 
-        private:
-            // getIdx
-            vec::Vec<TDim, TIdx> mutable m_gridBlockIdx;    //!< The index of the currently executed block.
-        };
-    }
+    public:
+        //-----------------------------------------------------------------------------
+        ALPAKA_FN_HOST AccCpuSerial(AccCpuSerial const&) = delete;
+        //-----------------------------------------------------------------------------
+        ALPAKA_FN_HOST AccCpuSerial(AccCpuSerial&&) = delete;
+        //-----------------------------------------------------------------------------
+        ALPAKA_FN_HOST auto operator=(AccCpuSerial const&) -> AccCpuSerial& = delete;
+        //-----------------------------------------------------------------------------
+        ALPAKA_FN_HOST auto operator=(AccCpuSerial&&) -> AccCpuSerial& = delete;
+        //-----------------------------------------------------------------------------
+        /*virtual*/ ~AccCpuSerial() = default;
+
+    private:
+        // getIdx
+        Vec<TDim, TIdx> mutable m_gridBlockIdx; //!< The index of the currently executed block.
+    };
 
-    namespace acc
+    namespace traits
     {
-        namespace traits
+        //#############################################################################
+        //! The CPU serial accelerator type trait specialization.
+        template<typename TDim, typename TIdx>
+        struct AccType<AccCpuSerial<TDim, TIdx>>
         {
-            //#############################################################################
-            //! The CPU serial accelerator accelerator type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct AccType<
-                acc::AccCpuSerial<TDim, TIdx>>
-            {
-                using type = acc::AccCpuSerial<TDim, TIdx>;
-            };
-            //#############################################################################
-            //! The CPU serial accelerator device properties get trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct GetAccDevProps<
-                acc::AccCpuSerial<TDim, TIdx>>
+            using type = AccCpuSerial<TDim, TIdx>;
+        };
+        //#############################################################################
+        //! The CPU serial accelerator device properties get trait specialization.
+        template<typename TDim, typename TIdx>
+        struct GetAccDevProps<AccCpuSerial<TDim, TIdx>>
+        {
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto getAccDevProps(DevCpu const& dev) -> AccDevProps<TDim, TIdx>
             {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto getAccDevProps(
-                    dev::DevCpu const & dev)
-                -> acc::AccDevProps<TDim, TIdx>
-                {
-                    alpaka::ignore_unused(dev);
+                alpaka::ignore_unused(dev);
 
-                    return {
-                        // m_multiProcessorCount
+                return {// m_multiProcessorCount
                         static_cast<TIdx>(1),
                         // m_gridBlockExtentMax
-                        vec::Vec<TDim, TIdx>::all(std::numeric_limits<TIdx>::max()),
+                        Vec<TDim, TIdx>::all(std::numeric_limits<TIdx>::max()),
                         // m_gridBlockCountMax
                         std::numeric_limits<TIdx>::max(),
                         // m_blockThreadExtentMax
-                        vec::Vec<TDim, TIdx>::ones(),
+                        Vec<TDim, TIdx>::ones(),
                         // m_blockThreadCountMax
                         static_cast<TIdx>(1),
                         // m_threadElemExtentMax
-                        vec::Vec<TDim, TIdx>::all(std::numeric_limits<TIdx>::max()),
+                        Vec<TDim, TIdx>::all(std::numeric_limits<TIdx>::max()),
                         // m_threadElemCountMax
-                        std::numeric_limits<TIdx>::max()};
-                }
-            };
-            //#############################################################################
-            //! The CPU serial accelerator name trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct GetAccName<
-                acc::AccCpuSerial<TDim, TIdx>>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto getAccName()
-                -> std::string
-                {
-                    return "AccCpuSerial<" + std::to_string(TDim::value) + "," + typeid(TIdx).name() + ">";
-                }
-            };
-        }
-    }
-    namespace dev
-    {
-        namespace traits
+                        std::numeric_limits<TIdx>::max(),
+                        // m_sharedMemSizeBytes
+                        static_cast<size_t>(AccCpuSerial<TDim, TIdx>::staticAllocBytes())};
+            }
+        };
+        //#############################################################################
+        //! The CPU serial accelerator name trait specialization.
+        template<typename TDim, typename TIdx>
+        struct GetAccName<AccCpuSerial<TDim, TIdx>>
         {
-            //#############################################################################
-            //! The CPU serial accelerator device type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct DevType<
-                acc::AccCpuSerial<TDim, TIdx>>
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto getAccName() -> std::string
             {
-                using type = dev::DevCpu;
-            };
-        }
-    }
-    namespace dim
-    {
-        namespace traits
+                return "AccCpuSerial<" + std::to_string(TDim::value) + "," + typeid(TIdx).name() + ">";
+            }
+        };
+
+        //#############################################################################
+        //! The CPU serial accelerator device type trait specialization.
+        template<typename TDim, typename TIdx>
+        struct DevType<AccCpuSerial<TDim, TIdx>>
         {
-            //#############################################################################
-            //! The CPU serial accelerator dimension getter trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct DimType<
-                acc::AccCpuSerial<TDim, TIdx>>
-            {
-                using type = TDim;
-            };
-        }
-    }
-    namespace kernel
-    {
-        namespace traits
+            using type = DevCpu;
+        };
+
+        //#############################################################################
+        //! The CPU serial accelerator dimension getter trait specialization.
+        template<typename TDim, typename TIdx>
+        struct DimType<AccCpuSerial<TDim, TIdx>>
         {
-            //#############################################################################
-            //! The CPU serial accelerator execution task type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx,
-                typename TWorkDiv,
-                typename TKernelFnObj,
-                typename... TArgs>
-            struct CreateTaskKernel<
-                acc::AccCpuSerial<TDim, TIdx>,
-                TWorkDiv,
-                TKernelFnObj,
-                TArgs...>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto createTaskKernel(
-                    TWorkDiv const & workDiv,
-                    TKernelFnObj const & kernelFnObj,
-                    TArgs && ... args)
-#ifdef BOOST_NO_CXX14_RETURN_TYPE_DEDUCTION
-                -> kernel::TaskKernelCpuSerial<
-                    TDim,
-                    TIdx,
-                    TKernelFnObj,
-                    TArgs...>
-#endif
-                {
-                    return
-                        kernel::TaskKernelCpuSerial<
-                            TDim,
-                            TIdx,
-                            TKernelFnObj,
-                            TArgs...>(
-                                workDiv,
-                                kernelFnObj,
-                                std::forward<TArgs>(args)...);
-                }
-            };
-        }
-    }
-    namespace pltf
-    {
-        namespace traits
+            using type = TDim;
+        };
+
+        //#############################################################################
+        //! The CPU serial accelerator execution task type trait specialization.
+        template<typename TDim, typename TIdx, typename TWorkDiv, typename TKernelFnObj, typename... TArgs>
+        struct CreateTaskKernel<AccCpuSerial<TDim, TIdx>, TWorkDiv, TKernelFnObj, TArgs...>
         {
-            //#############################################################################
-            //! The CPU serial execution task platform type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct PltfType<
-                acc::AccCpuSerial<TDim, TIdx>>
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto createTaskKernel(
+                TWorkDiv const& workDiv,
+                TKernelFnObj const& kernelFnObj,
+                TArgs&&... args)
             {
-                using type = pltf::PltfCpu;
-            };
-        }
-    }
-    namespace idx
-    {
-        namespace traits
+                return TaskKernelCpuSerial<TDim, TIdx, TKernelFnObj, TArgs...>(
+                    workDiv,
+                    kernelFnObj,
+                    std::forward<TArgs>(args)...);
+            }
+        };
+
+        //#############################################################################
+        //! The CPU serial execution task platform type trait specialization.
+        template<typename TDim, typename TIdx>
+        struct PltfType<AccCpuSerial<TDim, TIdx>>
         {
-            //#############################################################################
-            //! The CPU serial accelerator idx type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct IdxType<
-                acc::AccCpuSerial<TDim, TIdx>>
-            {
-                using type = TIdx;
-            };
-        }
-    }
-}
+            using type = PltfCpu;
+        };
+
+        //#############################################################################
+        //! The CPU serial accelerator idx type trait specialization.
+        template<typename TDim, typename TIdx>
+        struct IdxType<AccCpuSerial<TDim, TIdx>>
+        {
+            using type = TIdx;
+        };
+    } // namespace traits
+} // namespace alpaka
 
 #endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/acc/AccCpuTbbBlocks.hpp b/thirdParty/cupla/alpaka/include/alpaka/acc/AccCpuTbbBlocks.hpp
index ea2bc156ee..ee2274b07c 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/acc/AccCpuTbbBlocks.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/acc/AccCpuTbbBlocks.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Axel Huebl, Benjamin Worpitz, Erik Zenker, René Widera
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -12,293 +12,212 @@
 #ifdef ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED
 
 // Base classes.
-#include <alpaka/workdiv/WorkDivMembers.hpp>
-#include <alpaka/idx/gb/IdxGbRef.hpp>
-#include <alpaka/idx/bt/IdxBtZero.hpp>
-#include <alpaka/atomic/AtomicNoOp.hpp>
-#include <alpaka/atomic/AtomicStdLibLock.hpp>
-#include <alpaka/atomic/AtomicHierarchy.hpp>
-#include <alpaka/math/MathStdLib.hpp>
-#include <alpaka/block/shared/dyn/BlockSharedMemDynBoostAlignedAlloc.hpp>
-#include <alpaka/block/shared/st/BlockSharedMemStNoSync.hpp>
-#include <alpaka/block/sync/BlockSyncNoOp.hpp>
-#include <alpaka/rand/RandStdLib.hpp>
-#include <alpaka/time/TimeStdLib.hpp>
+#    include <alpaka/atomic/AtomicHierarchy.hpp>
+#    include <alpaka/atomic/AtomicNoOp.hpp>
+#    include <alpaka/atomic/AtomicStdLibLock.hpp>
+#    include <alpaka/block/shared/dyn/BlockSharedMemDynMember.hpp>
+#    include <alpaka/block/shared/st/BlockSharedMemStMember.hpp>
+#    include <alpaka/block/sync/BlockSyncNoOp.hpp>
+#    include <alpaka/idx/bt/IdxBtZero.hpp>
+#    include <alpaka/idx/gb/IdxGbRef.hpp>
+#    include <alpaka/intrinsic/IntrinsicCpu.hpp>
+#    include <alpaka/math/MathStdLib.hpp>
+#    include <alpaka/rand/RandStdLib.hpp>
+#    include <alpaka/time/TimeStdLib.hpp>
+#    include <alpaka/warp/WarpSingleThread.hpp>
+#    include <alpaka/workdiv/WorkDivMembers.hpp>
 
 // Specialized traits.
-#include <alpaka/acc/Traits.hpp>
-#include <alpaka/kernel/Traits.hpp>
-#include <alpaka/dev/Traits.hpp>
-#include <alpaka/pltf/Traits.hpp>
-#include <alpaka/idx/Traits.hpp>
+#    include <alpaka/acc/Traits.hpp>
+#    include <alpaka/dev/Traits.hpp>
+#    include <alpaka/idx/Traits.hpp>
+#    include <alpaka/kernel/Traits.hpp>
+#    include <alpaka/pltf/Traits.hpp>
 
 // Implementation details.
-#include <alpaka/core/Concepts.hpp>
-#include <alpaka/core/Unused.hpp>
-#include <alpaka/dev/DevCpu.hpp>
+#    include <alpaka/core/Concepts.hpp>
+#    include <alpaka/core/Unused.hpp>
+#    include <alpaka/dev/DevCpu.hpp>
 
-#include <memory>
-#include <typeinfo>
+#    include <memory>
+#    include <typeinfo>
 
 namespace alpaka
 {
-    namespace kernel
-    {
-        template<
-            typename TDim,
-            typename TIdx,
-            typename TKernelFnObj,
-            typename... TArgs>
-        class TaskKernelCpuTbbBlocks;
-    }
-    namespace acc
+    template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
+    class TaskKernelCpuTbbBlocks; // Forward declaration; befriended by AccCpuTbbBlocks and returned by its CreateTaskKernel trait.
+
+    //#############################################################################
+    //! The CPU TBB block accelerator; non-copyable, non-movable and constructible only by TaskKernelCpuTbbBlocks.
+    template<
+        typename TDim,
+        typename TIdx>
+    class AccCpuTbbBlocks final :
+        public WorkDivMembers<TDim, TIdx>,
+        public gb::IdxGbRef<TDim, TIdx>,
+        public bt::IdxBtZero<TDim, TIdx>,
+        public AtomicHierarchy<
+            AtomicStdLibLock<16>, // grid atomics
+            AtomicStdLibLock<16>, // block atomics
+            AtomicNoOp         // thread atomics
+        >,
+        public math::MathStdLib,
+        public BlockSharedMemDynMember<>,
+        public BlockSharedMemStMember<>,
+        public BlockSyncNoOp,
+        public IntrinsicCpu,
+        public rand::RandStdLib,
+        public TimeStdLib,
+        public warp::WarpSingleThread,
+        public concepts::Implements<ConceptAcc, AccCpuTbbBlocks<TDim, TIdx>>
     {
+        static_assert(
+            sizeof(TIdx) >= sizeof(int),
+            "Index type is not supported, consider using int or a larger type.");
 
-        //#############################################################################
-        //! The CPU TBB block accelerator.
-        template<
-            typename TDim,
-            typename TIdx>
-        class AccCpuTbbBlocks final :
-            public workdiv::WorkDivMembers<TDim, TIdx>,
-            public idx::gb::IdxGbRef<TDim, TIdx>,
-            public idx::bt::IdxBtZero<TDim, TIdx>,
-            public atomic::AtomicHierarchy<
-                atomic::AtomicStdLibLock<16>, // grid atomics
-                atomic::AtomicStdLibLock<16>, // block atomics
-                atomic::AtomicNoOp         // thread atomics
-            >,
-            public math::MathStdLib,
-            public block::shared::dyn::BlockSharedMemDynBoostAlignedAlloc,
-            public block::shared::st::BlockSharedMemStNoSync,
-            public block::sync::BlockSyncNoOp,
-            public rand::RandStdLib,
-            public time::TimeStdLib,
-            public concepts::Implements<ConceptAcc, AccCpuTbbBlocks<TDim, TIdx>>
+    public:
+        // Befriends every TaskKernelCpuTbbBlocks: a partial specialization fixing TDim and TIdx is not allowed here.
+        template<typename TDim2, typename TIdx2, typename TKernelFnObj, typename... TArgs>
+        friend class ::alpaka::TaskKernelCpuTbbBlocks;
+
+    private:
+        //-----------------------------------------------------------------------------
+        template<typename TWorkDiv>
+        ALPAKA_FN_HOST AccCpuTbbBlocks(TWorkDiv const& workDiv, std::size_t const& blockSharedMemDynSizeBytes)
+            : WorkDivMembers<TDim, TIdx>(workDiv)
+            , gb::IdxGbRef<TDim, TIdx>(m_gridBlockIdx)
+            , bt::IdxBtZero<TDim, TIdx>()
+            , AtomicHierarchy<
+                  AtomicStdLibLock<16>, // atomics between grids
+                  AtomicStdLibLock<16>, // atomics between blocks
+                  AtomicNoOp // atomics between threads
+                  >()
+            , math::MathStdLib()
+            , BlockSharedMemDynMember<>(blockSharedMemDynSizeBytes)
+            , BlockSharedMemStMember<>(staticMemBegin(), staticMemCapacity())
+            , BlockSyncNoOp()
+            , rand::RandStdLib()
+            , TimeStdLib()
+            , m_gridBlockIdx(Vec<TDim, TIdx>::zeros())
         {
-        public:
-            // Partial specialization with the correct TDim and TIdx is not allowed.
-            template<
-                typename TDim2,
-                typename TIdx2,
-                typename TKernelFnObj,
-                typename... TArgs>
-            friend class ::alpaka::kernel::TaskKernelCpuTbbBlocks;
+        }
 
-        private:
-            //-----------------------------------------------------------------------------
-            template<
-                typename TWorkDiv>
-            ALPAKA_FN_HOST AccCpuTbbBlocks(
-                TWorkDiv const & workDiv,
-                TIdx const & blockSharedMemDynSizeBytes) :
-                    workdiv::WorkDivMembers<TDim, TIdx>(workDiv),
-                    idx::gb::IdxGbRef<TDim, TIdx>(m_gridBlockIdx),
-                    idx::bt::IdxBtZero<TDim, TIdx>(),
-                    atomic::AtomicHierarchy<
-                        atomic::AtomicStdLibLock<16>, // atomics between grids
-                        atomic::AtomicStdLibLock<16>, // atomics between blocks
-                        atomic::AtomicNoOp         // atomics between threads
-                    >(),
-                    math::MathStdLib(),
-                    block::shared::dyn::BlockSharedMemDynBoostAlignedAlloc(static_cast<std::size_t>(blockSharedMemDynSizeBytes)),
-                    block::shared::st::BlockSharedMemStNoSync(),
-                    block::sync::BlockSyncNoOp(),
-                    rand::RandStdLib(),
-                    time::TimeStdLib(),
-                    m_gridBlockIdx(vec::Vec<TDim, TIdx>::zeros())
-            {}
+    public:
+        //-----------------------------------------------------------------------------
+        ALPAKA_FN_HOST AccCpuTbbBlocks(AccCpuTbbBlocks const&) = delete;
+        //-----------------------------------------------------------------------------
+        ALPAKA_FN_HOST AccCpuTbbBlocks(AccCpuTbbBlocks&&) = delete;
+        //-----------------------------------------------------------------------------
+        ALPAKA_FN_HOST auto operator=(AccCpuTbbBlocks const&) -> AccCpuTbbBlocks& = delete;
+        //-----------------------------------------------------------------------------
+        ALPAKA_FN_HOST auto operator=(AccCpuTbbBlocks&&) -> AccCpuTbbBlocks& = delete;
+        //-----------------------------------------------------------------------------
+        /*virtual*/ ~AccCpuTbbBlocks() = default;
 
-        public:
-            //-----------------------------------------------------------------------------
-            ALPAKA_FN_HOST AccCpuTbbBlocks(AccCpuTbbBlocks const &) = delete;
-            //-----------------------------------------------------------------------------
-            ALPAKA_FN_HOST AccCpuTbbBlocks(AccCpuTbbBlocks &&) = delete;
-            //-----------------------------------------------------------------------------
-            ALPAKA_FN_HOST auto operator=(AccCpuTbbBlocks const &) -> AccCpuTbbBlocks & = delete;
-            //-----------------------------------------------------------------------------
-            ALPAKA_FN_HOST auto operator=(AccCpuTbbBlocks &&) -> AccCpuTbbBlocks & = delete;
-            //-----------------------------------------------------------------------------
-            /*virtual*/ ~AccCpuTbbBlocks() = default;
+    private:
+        // getIdx: m_gridBlockIdx is handed to the gb::IdxGbRef base constructor.
+        Vec<TDim, TIdx> mutable m_gridBlockIdx; //!< The index of the currently executed block.
+    };
 
-        private:
-            // getIdx
-            vec::Vec<TDim, TIdx> mutable m_gridBlockIdx;  //!< The index of the currently executed block.
-        };
-    }
-
-    namespace acc
+    namespace traits
     {
-        namespace traits
+        //#############################################################################
+        //! The CPU TBB block accelerator type trait specialization: the accelerator type is AccCpuTbbBlocks itself.
+        template<typename TDim, typename TIdx>
+        struct AccType<AccCpuTbbBlocks<TDim, TIdx>>
         {
-            //#############################################################################
-            //! The CPU TBB block accelerator type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct AccType<
-                acc::AccCpuTbbBlocks<TDim, TIdx>>
-            {
-                using type = acc::AccCpuTbbBlocks<TDim, TIdx>;
-            };
-            //#############################################################################
-            //! The CPU TBB block accelerator device properties get trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct GetAccDevProps<
-                acc::AccCpuTbbBlocks<TDim, TIdx>>
+            using type = AccCpuTbbBlocks<TDim, TIdx>;
+        };
+        //#############################################################################
+        //! The CPU TBB block accelerator device properties get trait specialization (a block holds exactly one thread).
+        template<typename TDim, typename TIdx>
+        struct GetAccDevProps<AccCpuTbbBlocks<TDim, TIdx>>
+        {
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto getAccDevProps(DevCpu const& dev) -> AccDevProps<TDim, TIdx>
             {
-                //-----------------------------------------------------------------------------
-                  ALPAKA_FN_HOST static auto getAccDevProps(
-                    dev::DevCpu const & dev)
-                -> acc::AccDevProps<TDim, TIdx>
-                {
-                    alpaka::ignore_unused(dev);
+                alpaka::ignore_unused(dev); // the returned limits do not depend on the device
 
-                    return {
-                        // m_multiProcessorCount
+                return {// m_multiProcessorCount
                         static_cast<TIdx>(1),
                         // m_gridBlockExtentMax
-                        vec::Vec<TDim, TIdx>::all(std::numeric_limits<TIdx>::max()),
+                        Vec<TDim, TIdx>::all(std::numeric_limits<TIdx>::max()),
                         // m_gridBlockCountMax
                         std::numeric_limits<TIdx>::max(),
                         // m_blockThreadExtentMax
-                        vec::Vec<TDim, TIdx>::ones(),
+                        Vec<TDim, TIdx>::ones(),
                         // m_blockThreadCountMax
                         static_cast<TIdx>(1),
                         // m_threadElemExtentMax
-                        vec::Vec<TDim, TIdx>::all(std::numeric_limits<TIdx>::max()),
+                        Vec<TDim, TIdx>::all(std::numeric_limits<TIdx>::max()),
                         // m_threadElemCountMax
-                        std::numeric_limits<TIdx>::max()};
-                }
-
-            };
-            //#############################################################################
-            //! The CPU TBB block accelerator name trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct GetAccName<
-                acc::AccCpuTbbBlocks<TDim, TIdx>>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto getAccName()
-                -> std::string
-                {
-                    return "AccCpuTbbBlocks<" + std::to_string(TDim::value) + "," + typeid(TIdx).name() + ">";
-                }
-            };
-        }
-    }
-    namespace dev
-    {
-        namespace traits
+                        std::numeric_limits<TIdx>::max(),
+                        // m_sharedMemSizeBytes
+                        static_cast<size_t>(AccCpuTbbBlocks<TDim, TIdx>::staticAllocBytes())};
+            }
+        };
+        //#############################################################################
+        //! The CPU TBB block accelerator name trait specialization; the TIdx part is implementation-defined (typeid).
+        template<typename TDim, typename TIdx>
+        struct GetAccName<AccCpuTbbBlocks<TDim, TIdx>>
         {
-            //#############################################################################
-            //! The CPU TBB block accelerator device type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct DevType<
-                acc::AccCpuTbbBlocks<TDim, TIdx>>
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto getAccName() -> std::string
             {
-                using type = dev::DevCpu;
-            };
-        }
-    }
-    namespace dim
-    {
-        namespace traits
+                return "AccCpuTbbBlocks<" + std::to_string(TDim::value) + "," + typeid(TIdx).name() + ">";
+            }
+        };
+
+        //#############################################################################
+        //! The CPU TBB block accelerator device type trait specialization: the device type is DevCpu.
+        template<typename TDim, typename TIdx>
+        struct DevType<AccCpuTbbBlocks<TDim, TIdx>>
         {
-            //#############################################################################
-            //! The CPU TBB block accelerator dimension getter trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct DimType<
-                acc::AccCpuTbbBlocks<TDim, TIdx>>
-            {
-                using type = TDim;
-            };
-        }
-    }
-    namespace kernel
-    {
-        namespace traits
+            using type = DevCpu;
+        };
+
+        //#############################################################################
+        //! The CPU TBB block accelerator dimension getter trait specialization: yields the TDim template argument.
+        template<typename TDim, typename TIdx>
+        struct DimType<AccCpuTbbBlocks<TDim, TIdx>>
         {
-            //#############################################################################
-            //! The CPU TBB block accelerator execution task type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx,
-                typename TWorkDiv,
-                typename TKernelFnObj,
-                typename... TArgs>
-            struct CreateTaskKernel<
-                acc::AccCpuTbbBlocks<TDim, TIdx>,
-                TWorkDiv,
-                TKernelFnObj,
-                TArgs...>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto createTaskKernel(
-                    TWorkDiv const & workDiv,
-                    TKernelFnObj const & kernelFnObj,
-                    TArgs && ... args)
-#ifdef BOOST_NO_CXX14_RETURN_TYPE_DEDUCTION
-                -> kernel::TaskKernelCpuTbbBlocks<
-                    TDim,
-                    TIdx,
-                    TKernelFnObj,
-                    TArgs...>
-#endif
-                {
-                    return
-                        kernel::TaskKernelCpuTbbBlocks<
-                            TDim,
-                            TIdx,
-                            TKernelFnObj,
-                            TArgs...>(
-                                workDiv,
-                                kernelFnObj,
-                                std::forward<TArgs>(args)...);
-                }
-            };
-        }
-    }
-    namespace pltf
-    {
-        namespace traits
+            using type = TDim;
+        };
+
+        //#############################################################################
+        //! The CPU TBB block accelerator execution task type trait specialization: creates a TaskKernelCpuTbbBlocks.
+        template<typename TDim, typename TIdx, typename TWorkDiv, typename TKernelFnObj, typename... TArgs>
+        struct CreateTaskKernel<AccCpuTbbBlocks<TDim, TIdx>, TWorkDiv, TKernelFnObj, TArgs...>
         {
-            //#############################################################################
-            //! The CPU TBB block execution task platform type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct PltfType<
-                acc::AccCpuTbbBlocks<TDim, TIdx>>
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto createTaskKernel(
+                TWorkDiv const& workDiv,
+                TKernelFnObj const& kernelFnObj,
+                TArgs&&... args)
             {
-                using type = pltf::PltfCpu;
-            };
-        }
-    }
-    namespace idx
-    {
-        namespace traits
+                return TaskKernelCpuTbbBlocks<TDim, TIdx, TKernelFnObj, TArgs...>(
+                    workDiv,
+                    kernelFnObj,
+                    std::forward<TArgs>(args)...); // std::forward keeps each argument's value category
+            }
+        };
+
+        //#############################################################################
+        //! The CPU TBB block accelerator platform type trait specialization: maps AccCpuTbbBlocks to PltfCpu.
+        template<typename TDim, typename TIdx>
+        struct PltfType<AccCpuTbbBlocks<TDim, TIdx>>
         {
-            //#############################################################################
-            //! The CPU TBB block accelerator idx type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct IdxType<
-                acc::AccCpuTbbBlocks<TDim, TIdx>>
-            {
-                using type = TIdx;
-            };
-        }
-    }
-}
+            using type = PltfCpu;
+        };
+
+        //#############################################################################
+        //! The CPU TBB block accelerator idx type trait specialization: the index type is the TIdx template argument.
+        template<typename TDim, typename TIdx>
+        struct IdxType<AccCpuTbbBlocks<TDim, TIdx>>
+        {
+            using type = TIdx;
+        };
+    } // namespace traits
+} // namespace alpaka
 
 #endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/acc/AccCpuThreads.hpp b/thirdParty/cupla/alpaka/include/alpaka/acc/AccCpuThreads.hpp
index 5ce32b990a..13f2055d6a 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/acc/AccCpuThreads.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/acc/AccCpuThreads.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Axel Huebl, Benjamin Worpitz, René Widera
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -12,311 +12,233 @@
 #ifdef ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED
 
 // Base classes.
-#include <alpaka/workdiv/WorkDivMembers.hpp>
-#include <alpaka/idx/gb/IdxGbRef.hpp>
-#include <alpaka/idx/bt/IdxBtRefThreadIdMap.hpp>
-#include <alpaka/atomic/AtomicStdLibLock.hpp>
-#include <alpaka/atomic/AtomicHierarchy.hpp>
-#include <alpaka/math/MathStdLib.hpp>
-#include <alpaka/block/shared/dyn/BlockSharedMemDynBoostAlignedAlloc.hpp>
-#include <alpaka/block/shared/st/BlockSharedMemStMasterSync.hpp>
-#include <alpaka/block/sync/BlockSyncBarrierThread.hpp>
-#include <alpaka/rand/RandStdLib.hpp>
-#include <alpaka/time/TimeStdLib.hpp>
+#    include <alpaka/atomic/AtomicHierarchy.hpp>
+#    include <alpaka/atomic/AtomicStdLibLock.hpp>
+#    include <alpaka/block/shared/dyn/BlockSharedMemDynAlignedAlloc.hpp>
+#    include <alpaka/block/shared/st/BlockSharedMemStMasterSync.hpp>
+#    include <alpaka/block/sync/BlockSyncBarrierThread.hpp>
+#    include <alpaka/idx/bt/IdxBtRefThreadIdMap.hpp>
+#    include <alpaka/idx/gb/IdxGbRef.hpp>
+#    include <alpaka/intrinsic/IntrinsicCpu.hpp>
+#    include <alpaka/math/MathStdLib.hpp>
+#    include <alpaka/rand/RandStdLib.hpp>
+#    include <alpaka/time/TimeStdLib.hpp>
+#    include <alpaka/warp/WarpSingleThread.hpp>
+#    include <alpaka/workdiv/WorkDivMembers.hpp>
 
 // Specialized traits.
-#include <alpaka/acc/Traits.hpp>
-#include <alpaka/dev/Traits.hpp>
-#include <alpaka/kernel/Traits.hpp>
-#include <alpaka/pltf/Traits.hpp>
-#include <alpaka/idx/Traits.hpp>
+#    include <alpaka/acc/Traits.hpp>
+#    include <alpaka/dev/Traits.hpp>
+#    include <alpaka/idx/Traits.hpp>
+#    include <alpaka/kernel/Traits.hpp>
+#    include <alpaka/pltf/Traits.hpp>
 
 // Implementation details.
-#include <alpaka/core/BoostPredef.hpp>
-#include <alpaka/core/ClipCast.hpp>
-#include <alpaka/core/Concepts.hpp>
-#include <alpaka/core/Unused.hpp>
-#include <alpaka/dev/DevCpu.hpp>
+#    include <alpaka/core/BoostPredef.hpp>
+#    include <alpaka/core/ClipCast.hpp>
+#    include <alpaka/core/Concepts.hpp>
+#    include <alpaka/core/Unused.hpp>
+#    include <alpaka/dev/DevCpu.hpp>
 
-#include <memory>
-#include <thread>
-#include <typeinfo>
+#    include <memory>
+#    include <thread>
+#    include <typeinfo>
 
 namespace alpaka
 {
-    namespace kernel
-    {
-        template<
-            typename TDim,
-            typename TIdx,
-            typename TKernelFnObj,
-            typename... TArgs>
-        class TaskKernelCpuThreads;
-    }
-    namespace acc
+    template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
+    class TaskKernelCpuThreads; // Forward declaration; named in the friend declaration of AccCpuThreads.
+
+    //#############################################################################
+    //! The CPU threads accelerator; non-copyable, non-movable and constructible only by TaskKernelCpuThreads.
+    //!
+    //! This accelerator allows parallel kernel execution on a CPU device.
+    //! It uses std::thread to implement the parallelism.
+    template<
+        typename TDim,
+        typename TIdx>
+    class AccCpuThreads final :
+        public WorkDivMembers<TDim, TIdx>,
+        public gb::IdxGbRef<TDim, TIdx>,
+        public bt::IdxBtRefThreadIdMap<TDim, TIdx>,
+        public AtomicHierarchy<
+            AtomicStdLibLock<16>, // grid atomics
+            AtomicStdLibLock<16>, // block atomics
+            AtomicStdLibLock<16>  // thread atomics
+        >,
+        public math::MathStdLib,
+        public BlockSharedMemDynAlignedAlloc,
+        public BlockSharedMemStMasterSync,
+        public BlockSyncBarrierThread<TIdx>,
+        public IntrinsicCpu,
+        public rand::RandStdLib,
+        public TimeStdLib,
+        public warp::WarpSingleThread,
+        public concepts::Implements<ConceptAcc, AccCpuThreads<TDim, TIdx>>
     {
-        //#############################################################################
-        //! The CPU threads accelerator.
-        //!
-        //! This accelerator allows parallel kernel execution on a CPU device.
-        //! It uses C++11 std::thread to implement the parallelism.
-        template<
-            typename TDim,
-            typename TIdx>
-        class AccCpuThreads final :
-            public workdiv::WorkDivMembers<TDim, TIdx>,
-            public idx::gb::IdxGbRef<TDim, TIdx>,
-            public idx::bt::IdxBtRefThreadIdMap<TDim, TIdx>,
-            public atomic::AtomicHierarchy<
-                atomic::AtomicStdLibLock<16>, // grid atomics
-                atomic::AtomicStdLibLock<16>, // block atomics
-                atomic::AtomicStdLibLock<16>  // thread atomics
-            >,
-            public math::MathStdLib,
-            public block::shared::dyn::BlockSharedMemDynBoostAlignedAlloc,
-            public block::shared::st::BlockSharedMemStMasterSync,
-            public block::sync::BlockSyncBarrierThread<TIdx>,
-            public rand::RandStdLib,
-            public time::TimeStdLib,
-            public concepts::Implements<ConceptAcc, AccCpuThreads<TDim, TIdx>>
-        {
-        public:
-            // Partial specialization with the correct TDim and TIdx is not allowed.
-            template<
-                typename TDim2,
-                typename TIdx2,
-                typename TKernelFnObj,
-                typename... TArgs>
-            friend class ::alpaka::kernel::TaskKernelCpuThreads;
+        static_assert(
+            sizeof(TIdx) >= sizeof(int),
+            "Index type is not supported, consider using int or a larger type.");
 
-        private:
-            //-----------------------------------------------------------------------------
-            template<
-                typename TWorkDiv>
-            ALPAKA_FN_HOST AccCpuThreads(
-                TWorkDiv const & workDiv,
-                TIdx const & blockSharedMemDynSizeBytes) :
-                    workdiv::WorkDivMembers<TDim, TIdx>(workDiv),
-                    idx::gb::IdxGbRef<TDim, TIdx>(m_gridBlockIdx),
-                    idx::bt::IdxBtRefThreadIdMap<TDim, TIdx>(m_threadToIndexMap),
-                    atomic::AtomicHierarchy<
-                        atomic::AtomicStdLibLock<16>, // atomics between grids
-                        atomic::AtomicStdLibLock<16>, // atomics between blocks
-                        atomic::AtomicStdLibLock<16>  // atomics between threads
-                    >(),
-                    math::MathStdLib(),
-                    block::shared::dyn::BlockSharedMemDynBoostAlignedAlloc(static_cast<std::size_t>(blockSharedMemDynSizeBytes)),
-                    block::shared::st::BlockSharedMemStMasterSync(
-                        [this](){block::sync::syncBlockThreads(*this);},
-                        [this](){return (m_idMasterThread == std::this_thread::get_id());}),
-                    block::sync::BlockSyncBarrierThread<TIdx>(
-                        workdiv::getWorkDiv<Block, Threads>(workDiv).prod()),
-                    rand::RandStdLib(),
-                    time::TimeStdLib(),
-                    m_gridBlockIdx(vec::Vec<TDim, TIdx>::zeros())
-            {}
+    public:
+        // Befriends every TaskKernelCpuThreads: a partial specialization fixing TDim and TIdx is not allowed here.
+        template<typename TDim2, typename TIdx2, typename TKernelFnObj, typename... TArgs>
+        friend class ::alpaka::TaskKernelCpuThreads;
 
-        public:
-            //-----------------------------------------------------------------------------
-            ALPAKA_FN_HOST AccCpuThreads(AccCpuThreads const &) = delete;
-            //-----------------------------------------------------------------------------
-            ALPAKA_FN_HOST AccCpuThreads(AccCpuThreads &&) = delete;
-            //-----------------------------------------------------------------------------
-            ALPAKA_FN_HOST auto operator=(AccCpuThreads const &) -> AccCpuThreads & = delete;
-            //-----------------------------------------------------------------------------
-            ALPAKA_FN_HOST auto operator=(AccCpuThreads &&) -> AccCpuThreads & = delete;
-            //-----------------------------------------------------------------------------
-            /*virtual*/ ~AccCpuThreads() = default;
+    private:
+        //-----------------------------------------------------------------------------
+        template<typename TWorkDiv>
+        ALPAKA_FN_HOST AccCpuThreads(TWorkDiv const& workDiv, std::size_t const& blockSharedMemDynSizeBytes)
+            : WorkDivMembers<TDim, TIdx>(workDiv)
+            , gb::IdxGbRef<TDim, TIdx>(m_gridBlockIdx)
+            , bt::IdxBtRefThreadIdMap<TDim, TIdx>(m_threadToIndexMap)
+            , AtomicHierarchy<
+                  AtomicStdLibLock<16>, // atomics between grids
+                  AtomicStdLibLock<16>, // atomics between blocks
+                  AtomicStdLibLock<16> // atomics between threads
+                  >()
+            , math::MathStdLib()
+            , BlockSharedMemDynAlignedAlloc(blockSharedMemDynSizeBytes)
+            , BlockSharedMemStMasterSync(
+                  [this]() { syncBlockThreads(*this); },
+                  [this]() { return (m_idMasterThread == std::this_thread::get_id()); })
+            , BlockSyncBarrierThread<TIdx>(getWorkDiv<Block, Threads>(workDiv).prod())
+            , rand::RandStdLib()
+            , TimeStdLib()
+            , m_gridBlockIdx(Vec<TDim, TIdx>::zeros())
+        {
+        }
 
-        private:
-            // getIdx
-            std::mutex mutable m_mtxMapInsert;                              //!< The mutex used to secure insertion into the ThreadIdToIdxMap.
-            typename idx::bt::IdxBtRefThreadIdMap<TDim, TIdx>::ThreadIdToIdxMap mutable m_threadToIndexMap;    //!< The mapping of thread id's to indices.
-            vec::Vec<TDim, TIdx> mutable m_gridBlockIdx;                   //!< The index of the currently executed block.
+    public:
+        //-----------------------------------------------------------------------------
+        ALPAKA_FN_HOST AccCpuThreads(AccCpuThreads const&) = delete;
+        //-----------------------------------------------------------------------------
+        ALPAKA_FN_HOST AccCpuThreads(AccCpuThreads&&) = delete;
+        //-----------------------------------------------------------------------------
+        ALPAKA_FN_HOST auto operator=(AccCpuThreads const&) -> AccCpuThreads& = delete;
+        //-----------------------------------------------------------------------------
+        ALPAKA_FN_HOST auto operator=(AccCpuThreads&&) -> AccCpuThreads& = delete;
+        //-----------------------------------------------------------------------------
+        /*virtual*/ ~AccCpuThreads() = default;
 
-            // allocBlockSharedArr
-            std::thread::id mutable m_idMasterThread;                       //!< The id of the master thread.
-        };
-    }
+    private:
+        // getIdx: the map and block index are handed to the IdxBtRefThreadIdMap / IdxGbRef base constructors.
+        std::mutex mutable m_mtxMapInsert; //!< The mutex used to secure insertion into the ThreadIdToIdxMap.
+        typename bt::IdxBtRefThreadIdMap<TDim, TIdx>::
+            ThreadIdToIdxMap mutable m_threadToIndexMap; //!< The mapping of thread ids to indices.
+        Vec<TDim, TIdx> mutable m_gridBlockIdx; //!< The index of the currently executed block.
+
+        // allocBlockSharedArr: the predicate given to BlockSharedMemStMasterSync compares against this id.
+        std::thread::id mutable m_idMasterThread; //!< The id of the master thread.
+    };
 
-    namespace acc
+    namespace traits
     {
-        namespace traits
+        //#############################################################################
+        //! The CPU threads accelerator accelerator type trait specialization.
+        template<typename TDim, typename TIdx>
+        struct AccType<AccCpuThreads<TDim, TIdx>>
         {
-            //#############################################################################
-            //! The CPU threads accelerator accelerator type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct AccType<
-                acc::AccCpuThreads<TDim, TIdx>>
-            {
-                using type = acc::AccCpuThreads<TDim, TIdx>;
-            };
-            //#############################################################################
-            //! The CPU threads accelerator device properties get trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct GetAccDevProps<
-                acc::AccCpuThreads<TDim, TIdx>>
+            using type = AccCpuThreads<TDim, TIdx>;
+        };
+        //#############################################################################
+        //! The CPU threads accelerator device properties get trait specialization.
+        template<typename TDim, typename TIdx>
+        struct GetAccDevProps<AccCpuThreads<TDim, TIdx>>
+        {
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto getAccDevProps(DevCpu const& dev) -> AccDevProps<TDim, TIdx>
             {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto getAccDevProps(
-                    dev::DevCpu const & dev)
-                -> acc::AccDevProps<TDim, TIdx>
-                {
-                    alpaka::ignore_unused(dev);
-
-#ifdef ALPAKA_CI
-                    auto const blockThreadCountMax(static_cast<TIdx>(8));
-#else
-                    // \TODO: Magic number. What is the maximum? Just set a reasonable value? There is a implementation defined maximum where the creation of a new thread crashes.
-                    // std::thread::hardware_concurrency can return 0, so 1 is the default case?
-                    auto const blockThreadCountMax(std::max(static_cast<TIdx>(1), alpaka::core::clipCast<TIdx>(std::thread::hardware_concurrency() * 8)));
-#endif
-                    return {
-                        // m_multiProcessorCount
+#    ifdef ALPAKA_CI
+                auto const blockThreadCountMax(static_cast<TIdx>(8));
+#    else
+                // \TODO: Magic number. What is the maximum? Just set a reasonable value? There is an implementation
+                // defined maximum where the creation of a new thread crashes. std::thread::hardware_concurrency can
+                // return 0, so 1 is the default case?
+                auto const blockThreadCountMax(std::max(
+                    static_cast<TIdx>(1),
+                    alpaka::core::clipCast<TIdx>(std::thread::hardware_concurrency() * 8)));
+#    endif
+                return {// m_multiProcessorCount
                         static_cast<TIdx>(1),
                         // m_gridBlockExtentMax
-                        vec::Vec<TDim, TIdx>::all(std::numeric_limits<TIdx>::max()),
+                        Vec<TDim, TIdx>::all(std::numeric_limits<TIdx>::max()),
                         // m_gridBlockCountMax
                         std::numeric_limits<TIdx>::max(),
                         // m_blockThreadExtentMax
-                        vec::Vec<TDim, TIdx>::all(blockThreadCountMax),
+                        Vec<TDim, TIdx>::all(blockThreadCountMax),
                         // m_blockThreadCountMax
                         blockThreadCountMax,
                         // m_threadElemExtentMax
-                        vec::Vec<TDim, TIdx>::all(std::numeric_limits<TIdx>::max()),
+                        Vec<TDim, TIdx>::all(std::numeric_limits<TIdx>::max()),
                         // m_threadElemCountMax
-                        std::numeric_limits<TIdx>::max()};
-                }
-            };
-            //#############################################################################
-            //! The CPU threads accelerator name trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct GetAccName<
-                acc::AccCpuThreads<TDim, TIdx>>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto getAccName()
-                -> std::string
-                {
-                    return "AccCpuThreads<" + std::to_string(TDim::value) + "," + typeid(TIdx).name() + ">";
-                }
-            };
-        }
-    }
-    namespace dev
-    {
-        namespace traits
+                        std::numeric_limits<TIdx>::max(),
+                        // m_sharedMemSizeBytes
+                        getMemBytes(dev)};
+            }
+        };
+        //#############################################################################
+        //! The CPU threads accelerator name trait specialization.
+        template<typename TDim, typename TIdx>
+        struct GetAccName<AccCpuThreads<TDim, TIdx>>
         {
-            //#############################################################################
-            //! The CPU threads accelerator device type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct DevType<
-                acc::AccCpuThreads<TDim, TIdx>>
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto getAccName() -> std::string
             {
-                using type = dev::DevCpu;
-            };
-        }
-    }
-    namespace dim
-    {
-        namespace traits
+                return "AccCpuThreads<" + std::to_string(TDim::value) + "," + typeid(TIdx).name() + ">";
+            }
+        };
+
+        //#############################################################################
+        //! The CPU threads accelerator device type trait specialization.
+        template<typename TDim, typename TIdx>
+        struct DevType<AccCpuThreads<TDim, TIdx>>
         {
-            //#############################################################################
-            //! The CPU threads accelerator dimension getter trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct DimType<
-                acc::AccCpuThreads<TDim, TIdx>>
-            {
-                using type = TDim;
-            };
-        }
-    }
-    namespace kernel
-    {
-        namespace traits
+            using type = DevCpu;
+        };
+
+        //#############################################################################
+        //! The CPU threads accelerator dimension getter trait specialization.
+        template<typename TDim, typename TIdx>
+        struct DimType<AccCpuThreads<TDim, TIdx>>
         {
-            //#############################################################################
-            //! The CPU threads accelerator execution task type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx,
-                typename TWorkDiv,
-                typename TKernelFnObj,
-                typename... TArgs>
-            struct CreateTaskKernel<
-                acc::AccCpuThreads<TDim, TIdx>,
-                TWorkDiv,
-                TKernelFnObj,
-                TArgs...>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto createTaskKernel(
-                    TWorkDiv const & workDiv,
-                    TKernelFnObj const & kernelFnObj,
-                    TArgs && ... args)
-#ifdef BOOST_NO_CXX14_RETURN_TYPE_DEDUCTION
-                -> kernel::TaskKernelCpuThreads<
-                    TDim,
-                    TIdx,
-                    TKernelFnObj,
-                    TArgs...>
-#endif
-                {
-                    return
-                        kernel::TaskKernelCpuThreads<
-                            TDim,
-                            TIdx,
-                            TKernelFnObj,
-                            TArgs...>(
-                                workDiv,
-                                kernelFnObj,
-                                std::forward<TArgs>(args)...);
-                }
-            };
-        }
-    }
-    namespace pltf
-    {
-        namespace traits
+            using type = TDim;
+        };
+
+        //#############################################################################
+        //! The CPU threads accelerator execution task type trait specialization.
+        template<typename TDim, typename TIdx, typename TWorkDiv, typename TKernelFnObj, typename... TArgs>
+        struct CreateTaskKernel<AccCpuThreads<TDim, TIdx>, TWorkDiv, TKernelFnObj, TArgs...>
         {
-            //#############################################################################
-            //! The CPU threads execution task platform type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct PltfType<
-                acc::AccCpuThreads<TDim, TIdx>>
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto createTaskKernel(
+                TWorkDiv const& workDiv,
+                TKernelFnObj const& kernelFnObj,
+                TArgs&&... args)
             {
-                using type = pltf::PltfCpu;
-            };
-        }
-    }
-    namespace idx
-    {
-        namespace traits
+                return TaskKernelCpuThreads<TDim, TIdx, TKernelFnObj, TArgs...>(
+                    workDiv,
+                    kernelFnObj,
+                    std::forward<TArgs>(args)...);
+            }
+        };
+
+        //#############################################################################
+        //! The CPU threads execution task platform type trait specialization.
+        template<typename TDim, typename TIdx>
+        struct PltfType<AccCpuThreads<TDim, TIdx>>
         {
-            //#############################################################################
-            //! The CPU threads accelerator idx type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct IdxType<
-                acc::AccCpuThreads<TDim, TIdx>>
-            {
-                using type = TIdx;
-            };
-        }
-    }
-}
+            using type = PltfCpu;
+        };
+
+        //#############################################################################
+        //! The CPU threads accelerator idx type trait specialization.
+        template<typename TDim, typename TIdx>
+        struct IdxType<AccCpuThreads<TDim, TIdx>>
+        {
+            using type = TIdx;
+        };
+    } // namespace traits
+} // namespace alpaka
 
 #endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/acc/AccDevProps.hpp b/thirdParty/cupla/alpaka/include/alpaka/acc/AccDevProps.hpp
index e561453dd8..120d03cd2f 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/acc/AccDevProps.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/acc/AccDevProps.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Benjamin Worpitz
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -9,55 +9,57 @@
 
 #pragma once
 
-#include <alpaka/vec/Vec.hpp>
 #include <alpaka/core/Common.hpp>
+#include <alpaka/vec/Vec.hpp>
 
-#include <vector>
 #include <string>
+#include <vector>
 
 namespace alpaka
 {
-    namespace acc
+    //#############################################################################
+    //! The acceleration properties on a device.
+    //
+    // \TODO:
+    //  TIdx m_maxClockFrequencyHz;            //!< Maximum clock frequency of the device in Hz.
+    template<typename TDim, typename TIdx>
+    struct AccDevProps
     {
-        //#############################################################################
-        //! The acceleration properties on a device.
-        //
-        // \TODO:
-        //  TIdx m_maxClockFrequencyHz;            //!< Maximum clock frequency of the device in Hz.
-        //  TIdx m_sharedMemSizeBytes;             //!< Idx of the available block shared memory in bytes.
-        template<
-            typename TDim,
-            typename TIdx>
-        struct AccDevProps
+        static_assert(
+            sizeof(TIdx) >= sizeof(int),
+            "Index type is not supported, consider using int or a larger type.");
+        //-----------------------------------------------------------------------------
+        ALPAKA_FN_HOST AccDevProps(
+            TIdx const& multiProcessorCount,
+            Vec<TDim, TIdx> const& gridBlockExtentMax,
+            TIdx const& gridBlockCountMax,
+            Vec<TDim, TIdx> const& blockThreadExtentMax,
+            TIdx const& blockThreadCountMax,
+            Vec<TDim, TIdx> const& threadElemExtentMax,
+            TIdx const& threadElemCountMax,
+            size_t const& sharedMemSizeBytes)
+            : m_gridBlockExtentMax(gridBlockExtentMax)
+            , m_blockThreadExtentMax(blockThreadExtentMax)
+            , m_threadElemExtentMax(threadElemExtentMax)
+            , m_gridBlockCountMax(gridBlockCountMax)
+            , m_blockThreadCountMax(blockThreadCountMax)
+            , m_threadElemCountMax(threadElemCountMax)
+            , m_multiProcessorCount(multiProcessorCount)
+            , m_sharedMemSizeBytes(sharedMemSizeBytes)
         {
-            //-----------------------------------------------------------------------------
-            ALPAKA_FN_HOST AccDevProps(
-                TIdx const & multiProcessorCount,
-                vec::Vec<TDim, TIdx> const & gridBlockExtentMax,
-                TIdx const & gridBlockCountMax,
-                vec::Vec<TDim, TIdx> const & blockThreadExtentMax,
-                TIdx const & blockThreadCountMax,
-                vec::Vec<TDim, TIdx> const & threadElemExtentMax,
-                TIdx const & threadElemCountMax) :
-                    m_gridBlockExtentMax(gridBlockExtentMax),
-                    m_blockThreadExtentMax(blockThreadExtentMax),
-                    m_threadElemExtentMax(threadElemExtentMax),
-                    m_gridBlockCountMax(gridBlockCountMax),
-                    m_blockThreadCountMax(blockThreadCountMax),
-                    m_threadElemCountMax(threadElemCountMax),
-                    m_multiProcessorCount(multiProcessorCount)
-            {}
+        }
 
-            // NOTE: The members have been reordered from the order in the constructor because gcc is buggy for some TDim and TIdx and generates invalid assembly.
-            vec::Vec<TDim, TIdx> m_gridBlockExtentMax;      //!< The maximum number of blocks in each dimension of the grid.
-            vec::Vec<TDim, TIdx> m_blockThreadExtentMax;    //!< The maximum number of threads in each dimension of a block.
-            vec::Vec<TDim, TIdx> m_threadElemExtentMax;     //!< The maximum number of elements in each dimension of a thread.
+        // NOTE: The members have been reordered from the order in the constructor because gcc is buggy for some TDim
+        // and TIdx and generates invalid assembly.
+        Vec<TDim, TIdx> m_gridBlockExtentMax; //!< The maximum number of blocks in each dimension of the grid.
+        Vec<TDim, TIdx> m_blockThreadExtentMax; //!< The maximum number of threads in each dimension of a block.
+        Vec<TDim, TIdx> m_threadElemExtentMax; //!< The maximum number of elements in each dimension of a thread.
 
-            TIdx m_gridBlockCountMax;                  //!< The maximum number of blocks in a grid.
-            TIdx m_blockThreadCountMax;                //!< The maximum number of threads in a block.
-            TIdx m_threadElemCountMax;                 //!< The maximum number of elements in a threads.
+        TIdx m_gridBlockCountMax; //!< The maximum number of blocks in a grid.
+        TIdx m_blockThreadCountMax; //!< The maximum number of threads in a block.
+        TIdx m_threadElemCountMax; //!< The maximum number of elements in a thread.
 
-            TIdx m_multiProcessorCount;                //!< The number of multiprocessors.
-        };
-    }
-}
+        TIdx m_multiProcessorCount; //!< The number of multiprocessors.
+        size_t m_sharedMemSizeBytes; //!< The size of shared memory per block in bytes.
+    };
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/include/alpaka/acc/AccGpuCudaRt.hpp b/thirdParty/cupla/alpaka/include/alpaka/acc/AccGpuCudaRt.hpp
index 93ea45a7b6..1fc870fb42 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/acc/AccGpuCudaRt.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/acc/AccGpuCudaRt.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Benjamin Worpitz, René Widera
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -11,358 +11,104 @@
 
 #ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
 
-#include <alpaka/core/BoostPredef.hpp>
+#    include <alpaka/core/BoostPredef.hpp>
 
-#if !BOOST_LANG_CUDA
-    #error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
-#endif
+#    if !BOOST_LANG_CUDA
+#        error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
+#    endif
 
 // Base classes.
-#include <alpaka/workdiv/WorkDivCudaBuiltIn.hpp>
-#include <alpaka/idx/gb/IdxGbCudaBuiltIn.hpp>
-#include <alpaka/idx/bt/IdxBtCudaBuiltIn.hpp>
-#include <alpaka/atomic/AtomicCudaBuiltIn.hpp>
-#include <alpaka/atomic/AtomicHierarchy.hpp>
-#include <alpaka/math/MathCudaBuiltIn.hpp>
-#include <alpaka/block/shared/dyn/BlockSharedMemDynCudaBuiltIn.hpp>
-#include <alpaka/block/shared/st/BlockSharedMemStCudaBuiltIn.hpp>
-#include <alpaka/block/sync/BlockSyncCudaBuiltIn.hpp>
-#include <alpaka/rand/RandCuRand.hpp>
-#include <alpaka/time/TimeCudaBuiltIn.hpp>
+#    include <alpaka/acc/AccGpuUniformCudaHipRt.hpp>
 
 // Specialized traits.
-#include <alpaka/acc/Traits.hpp>
-#include <alpaka/dev/Traits.hpp>
-#include <alpaka/kernel/Traits.hpp>
-#include <alpaka/pltf/Traits.hpp>
-#include <alpaka/idx/Traits.hpp>
+#    include <alpaka/acc/Traits.hpp>
 
 // Implementation details.
-#include <alpaka/core/ClipCast.hpp>
-#include <alpaka/core/Concepts.hpp>
-#include <alpaka/core/Cuda.hpp>
-#include <alpaka/dev/DevCudaRt.hpp>
+#    include <alpaka/core/ClipCast.hpp>
+#    include <alpaka/core/Concepts.hpp>
+#    include <alpaka/core/Cuda.hpp>
+#    include <alpaka/dev/DevUniformCudaHipRt.hpp>
 
-#include <typeinfo>
+#    include <typeinfo>
 
 namespace alpaka
 {
-    namespace kernel
-    {
-        template<
-            typename TDim,
-            typename TIdx,
-            typename TKernelFnObj,
-            typename... TArgs>
-        class TaskKernelGpuCudaRt;
-    }
-    namespace acc
+    template<typename TAcc, typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
+    class TaskKernelGpuUniformCudaHipRt;
+
+    //#############################################################################
+    //! The GPU CUDA accelerator.
+    //!
+    //! This accelerator allows parallel kernel execution on devices supporting CUDA.
+    template<typename TDim, typename TIdx>
+    class AccGpuCudaRt final
+        : public AccGpuUniformCudaHipRt<TDim, TIdx>
+        , public concepts::Implements<ConceptUniformCudaHip, AccGpuUniformCudaHipRt<TDim, TIdx>>
     {
-        //#############################################################################
-        //! The GPU CUDA accelerator.
-        //!
-        //! This accelerator allows parallel kernel execution on devices supporting CUDA.
-        template<
-            typename TDim,
-            typename TIdx>
-        class AccGpuCudaRt final :
-            public workdiv::WorkDivCudaBuiltIn<TDim, TIdx>,
-            public idx::gb::IdxGbCudaBuiltIn<TDim, TIdx>,
-            public idx::bt::IdxBtCudaBuiltIn<TDim, TIdx>,
-            public atomic::AtomicHierarchy<
-                atomic::AtomicCudaBuiltIn, // grid atomics
-                atomic::AtomicCudaBuiltIn, // block atomics
-                atomic::AtomicCudaBuiltIn  // thread atomics
-            >,
-            public math::MathCudaBuiltIn,
-            public block::shared::dyn::BlockSharedMemDynCudaBuiltIn,
-            public block::shared::st::BlockSharedMemStCudaBuiltIn,
-            public block::sync::BlockSyncCudaBuiltIn,
-            public rand::RandCuRand,
-            public time::TimeCudaBuiltIn,
-            public concepts::Implements<ConceptAcc, AccGpuCudaRt<TDim, TIdx>>
+        static_assert(
+            sizeof(TIdx) >= sizeof(int),
+            "Index type is not supported, consider using int or a larger type.");
+
+    public:
+        //-----------------------------------------------------------------------------
+        __device__ AccGpuCudaRt(Vec<TDim, TIdx> const& threadElemExtent)
+            : AccGpuUniformCudaHipRt<TDim, TIdx>(threadElemExtent)
         {
-        public:
-            //-----------------------------------------------------------------------------
-            __device__ AccGpuCudaRt(
-                vec::Vec<TDim, TIdx> const & threadElemExtent) :
-                    workdiv::WorkDivCudaBuiltIn<TDim, TIdx>(threadElemExtent),
-                    idx::gb::IdxGbCudaBuiltIn<TDim, TIdx>(),
-                    idx::bt::IdxBtCudaBuiltIn<TDim, TIdx>(),
-                    atomic::AtomicHierarchy<
-                        atomic::AtomicCudaBuiltIn, // atomics between grids
-                        atomic::AtomicCudaBuiltIn, // atomics between blocks
-                        atomic::AtomicCudaBuiltIn  // atomics between threads
-                    >(),
-                    math::MathCudaBuiltIn(),
-                    block::shared::dyn::BlockSharedMemDynCudaBuiltIn(),
-                    block::shared::st::BlockSharedMemStCudaBuiltIn(),
-                    block::sync::BlockSyncCudaBuiltIn(),
-                    rand::RandCuRand(),
-                    time::TimeCudaBuiltIn()
-            {}
-
-        public:
-            //-----------------------------------------------------------------------------
-            __device__ AccGpuCudaRt(AccGpuCudaRt const &) = delete;
-            //-----------------------------------------------------------------------------
-            __device__ AccGpuCudaRt(AccGpuCudaRt &&) = delete;
-            //-----------------------------------------------------------------------------
-            __device__ auto operator=(AccGpuCudaRt const &) -> AccGpuCudaRt & = delete;
-            //-----------------------------------------------------------------------------
-            __device__ auto operator=(AccGpuCudaRt &&) -> AccGpuCudaRt & = delete;
-            //-----------------------------------------------------------------------------
-            ~AccGpuCudaRt() = default;
-        };
-    }
+        }
 
-    namespace acc
+    public:
+        //-----------------------------------------------------------------------------
+        __device__ AccGpuCudaRt(AccGpuCudaRt const&) = delete;
+        //-----------------------------------------------------------------------------
+        __device__ AccGpuCudaRt(AccGpuCudaRt&&) = delete;
+        //-----------------------------------------------------------------------------
+        __device__ auto operator=(AccGpuCudaRt const&) -> AccGpuCudaRt& = delete;
+        //-----------------------------------------------------------------------------
+        __device__ auto operator=(AccGpuCudaRt&&) -> AccGpuCudaRt& = delete;
+        //-----------------------------------------------------------------------------
+        ~AccGpuCudaRt() = default;
+    };
+
+    namespace traits
     {
-        namespace traits
+        //#############################################################################
+        //! The GPU CUDA accelerator accelerator type trait specialization.
+        template<typename TDim, typename TIdx>
+        struct AccType<AccGpuCudaRt<TDim, TIdx>>
         {
-            //#############################################################################
-            //! The GPU CUDA accelerator accelerator type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct AccType<
-                acc::AccGpuCudaRt<TDim, TIdx>>
-            {
-                using type = acc::AccGpuCudaRt<TDim, TIdx>;
-            };
-            //#############################################################################
-            //! The GPU CUDA accelerator device properties get trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct GetAccDevProps<
-                acc::AccGpuCudaRt<TDim, TIdx>>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto getAccDevProps(
-                    dev::DevCudaRt const & dev)
-                -> acc::AccDevProps<TDim, TIdx>
-                {
-                    // Reading only the necessary attributes with cudaDeviceGetAttribute is faster than reading all with cudaGetDeviceProperties
-                    // https://devblogs.nvidia.com/cuda-pro-tip-the-fast-way-to-query-device-properties/
-                    int multiProcessorCount = {};
-                    ALPAKA_CUDA_RT_CHECK(cudaDeviceGetAttribute(
-                        &multiProcessorCount,
-                        cudaDevAttrMultiProcessorCount,
-                        dev.m_iDevice));
-
-                    int maxGridSize[3] = {};
-                    ALPAKA_CUDA_RT_CHECK(cudaDeviceGetAttribute(
-                        &maxGridSize[0],
-                        cudaDevAttrMaxGridDimX,
-                        dev.m_iDevice));
-                    ALPAKA_CUDA_RT_CHECK(cudaDeviceGetAttribute(
-                        &maxGridSize[1],
-                        cudaDevAttrMaxGridDimY,
-                        dev.m_iDevice));
-                    ALPAKA_CUDA_RT_CHECK(cudaDeviceGetAttribute(
-                        &maxGridSize[2],
-                        cudaDevAttrMaxGridDimZ,
-                        dev.m_iDevice));
-
-                    int maxBlockDim[3] = {};
-                    ALPAKA_CUDA_RT_CHECK(cudaDeviceGetAttribute(
-                        &maxBlockDim[0],
-                        cudaDevAttrMaxBlockDimX,
-                        dev.m_iDevice));
-                    ALPAKA_CUDA_RT_CHECK(cudaDeviceGetAttribute(
-                        &maxBlockDim[1],
-                        cudaDevAttrMaxBlockDimY,
-                        dev.m_iDevice));
-                    ALPAKA_CUDA_RT_CHECK(cudaDeviceGetAttribute(
-                        &maxBlockDim[2],
-                        cudaDevAttrMaxBlockDimZ,
-                        dev.m_iDevice));
-
-                    int maxThreadsPerBlock = {};
-                    ALPAKA_CUDA_RT_CHECK(cudaDeviceGetAttribute(
-                        &maxThreadsPerBlock,
-                        cudaDevAttrMaxThreadsPerBlock,
-                        dev.m_iDevice));
+            using type = AccGpuCudaRt<TDim, TIdx>;
+        };
 
-                    return {
-                        // m_multiProcessorCount
-                        alpaka::core::clipCast<TIdx>(multiProcessorCount),
-                        // m_gridBlockExtentMax
-                        extent::getExtentVecEnd<TDim>(
-                            vec::Vec<dim::DimInt<3u>, TIdx>(
-                                alpaka::core::clipCast<TIdx>(maxGridSize[2u]),
-                                alpaka::core::clipCast<TIdx>(maxGridSize[1u]),
-                                alpaka::core::clipCast<TIdx>(maxGridSize[0u]))),
-                        // m_gridBlockCountMax
-                        std::numeric_limits<TIdx>::max(),
-                        // m_blockThreadExtentMax
-                        extent::getExtentVecEnd<TDim>(
-                            vec::Vec<dim::DimInt<3u>, TIdx>(
-                                alpaka::core::clipCast<TIdx>(maxBlockDim[2u]),
-                                alpaka::core::clipCast<TIdx>(maxBlockDim[1u]),
-                                alpaka::core::clipCast<TIdx>(maxBlockDim[0u]))),
-                        // m_blockThreadCountMax
-                        alpaka::core::clipCast<TIdx>(maxThreadsPerBlock),
-                        // m_threadElemExtentMax
-                        vec::Vec<TDim, TIdx>::all(std::numeric_limits<TIdx>::max()),
-                        // m_threadElemCountMax
-                        std::numeric_limits<TIdx>::max()};
-                }
-            };
-            //#############################################################################
-            //! The GPU CUDA accelerator name trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct GetAccName<
-                acc::AccGpuCudaRt<TDim, TIdx>>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto getAccName()
-                -> std::string
-                {
-                    return "AccGpuCudaRt<" + std::to_string(TDim::value) + "," + typeid(TIdx).name() + ">";
-                }
-            };
-        }
-    }
-    namespace dev
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The GPU CUDA accelerator device type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct DevType<
-                acc::AccGpuCudaRt<TDim, TIdx>>
-            {
-                using type = dev::DevCudaRt;
-            };
-        }
-    }
-    namespace dim
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The GPU CUDA accelerator dimension getter trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct DimType<
-                acc::AccGpuCudaRt<TDim, TIdx>>
-            {
-                using type = TDim;
-            };
-        }
-    }
-    namespace kernel
-    {
-        namespace detail
+        //#############################################################################
+        //! The GPU CUDA accelerator name trait specialization.
+        template<typename TDim, typename TIdx>
+        struct GetAccName<AccGpuCudaRt<TDim, TIdx>>
         {
-            //#############################################################################
-            //! specialization of the TKernelFnObj return type evaluation
-            //
-            // It is not possible to determine the result type of a __device__ lambda for CUDA on the host side.
-            // https://github.com/ComputationalRadiationPhysics/alpaka/pull/695#issuecomment-446103194
-            // The execution task TaskKernelGpuCudaRt is therefore performing this check on device side.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct CheckFnReturnType<
-                acc::AccGpuCudaRt<
-                    TDim,
-                    TIdx>>
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto getAccName() -> std::string
             {
-                template<
-                    typename TKernelFnObj,
-                    typename... TArgs>
-                void operator()(
-                    TKernelFnObj const &,
-                    TArgs const & ...)
-                {
-
-                }
-            };
-        }
+                return "AccGpuCudaRt<" + std::to_string(TDim::value) + "," + typeid(TIdx).name() + ">";
+            }
+        };
 
-        namespace traits
-        {
-            //#############################################################################
-            //! The GPU CUDA accelerator execution task type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx,
-                typename TWorkDiv,
-                typename TKernelFnObj,
-                typename... TArgs>
-            struct CreateTaskKernel<
-                acc::AccGpuCudaRt<TDim, TIdx>,
-                TWorkDiv,
-                TKernelFnObj,
-                TArgs...>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto createTaskKernel(
-                    TWorkDiv const & workDiv,
-                    TKernelFnObj const & kernelFnObj,
-                    TArgs && ... args)
-#ifdef BOOST_NO_CXX14_RETURN_TYPE_DEDUCTION
-                -> kernel::TaskKernelGpuCudaRt<
-                    TDim,
-                    TIdx,
-                    TKernelFnObj,
-                    TArgs...>
-#endif
-                {
-                    return
-                        kernel::TaskKernelGpuCudaRt<
-                            TDim,
-                            TIdx,
-                            TKernelFnObj,
-                            TArgs...>(
-                                workDiv,
-                                kernelFnObj,
-                                std::forward<TArgs>(args)...);
-                }
-            };
-        }
-    }
-    namespace pltf
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CPU CUDA execution task platform type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct PltfType<
-                acc::AccGpuCudaRt<TDim, TIdx>>
-            {
-                using type = pltf::PltfCudaRt;
-            };
-        }
-    }
-    namespace idx
-    {
-        namespace traits
+        //#############################################################################
+        //! The GPU CUDA accelerator execution task type trait specialization.
+        template<typename TDim, typename TIdx, typename TWorkDiv, typename TKernelFnObj, typename... TArgs>
+        struct CreateTaskKernel<AccGpuCudaRt<TDim, TIdx>, TWorkDiv, TKernelFnObj, TArgs...>
         {
-            //#############################################################################
-            //! The GPU CUDA accelerator idx type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct IdxType<
-                acc::AccGpuCudaRt<TDim, TIdx>>
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto createTaskKernel(
+                TWorkDiv const& workDiv,
+                TKernelFnObj const& kernelFnObj,
+                TArgs&&... args)
             {
-                using type = TIdx;
-            };
-        }
-    }
-}
+                return TaskKernelGpuUniformCudaHipRt<AccGpuCudaRt<TDim, TIdx>, TDim, TIdx, TKernelFnObj, TArgs...>(
+                    workDiv,
+                    kernelFnObj,
+                    std::forward<TArgs>(args)...);
+            }
+        };
+    } // namespace traits
+} // namespace alpaka
 
 #endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/acc/AccGpuHipRt.hpp b/thirdParty/cupla/alpaka/include/alpaka/acc/AccGpuHipRt.hpp
index 101212e439..9930bf9a5e 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/acc/AccGpuHipRt.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/acc/AccGpuHipRt.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Benjamin Worpitz, Matthias Werner, René Widera
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -11,320 +11,104 @@
 
 #ifdef ALPAKA_ACC_GPU_HIP_ENABLED
 
-#include <alpaka/core/BoostPredef.hpp>
+#    include <alpaka/core/BoostPredef.hpp>
 
-#if !BOOST_LANG_HIP
-    #error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
-#endif
+#    if !BOOST_LANG_HIP
+#        error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
+#    endif
 
 // Base classes.
-#include <alpaka/workdiv/WorkDivHipBuiltIn.hpp>
-#include <alpaka/idx/gb/IdxGbHipBuiltIn.hpp>
-#include <alpaka/idx/bt/IdxBtHipBuiltIn.hpp>
-#include <alpaka/atomic/AtomicHipBuiltIn.hpp>
-#include <alpaka/atomic/AtomicHierarchy.hpp>
-#include <alpaka/math/MathHipBuiltIn.hpp>
-#include <alpaka/block/shared/dyn/BlockSharedMemDynHipBuiltIn.hpp>
-#include <alpaka/block/shared/st/BlockSharedMemStHipBuiltIn.hpp>
-#include <alpaka/block/sync/BlockSyncHipBuiltIn.hpp>
-#include <alpaka/rand/RandHipRand.hpp>
-#include <alpaka/time/TimeHipBuiltIn.hpp>
+#    include <alpaka/acc/AccGpuUniformCudaHipRt.hpp>
 
 // Specialized traits.
-#include <alpaka/acc/Traits.hpp>
-#include <alpaka/dev/Traits.hpp>
-#include <alpaka/kernel/Traits.hpp>
-#include <alpaka/pltf/Traits.hpp>
-#include <alpaka/idx/Traits.hpp>
+#    include <alpaka/acc/Traits.hpp>
 
 // Implementation details.
-#include <alpaka/core/ClipCast.hpp>
-#include <alpaka/core/Concepts.hpp>
-#include <alpaka/dev/DevHipRt.hpp>
-#include <alpaka/core/Hip.hpp>
+#    include <alpaka/core/ClipCast.hpp>
+#    include <alpaka/core/Concepts.hpp>
+#    include <alpaka/core/Hip.hpp>
+#    include <alpaka/dev/DevUniformCudaHipRt.hpp>
 
-#include <typeinfo>
+#    include <typeinfo>
 
 namespace alpaka
 {
-    namespace kernel
-    {
-        template<
-            typename TDim,
-            typename TIdx,
-            typename TKernelFnObj,
-            typename... TArgs>
-        class TaskKernelGpuHipRt;
-    }
-    namespace acc
-    {
-        //#############################################################################
-        //! The GPU HIP accelerator.
-        //!
-        //! This accelerator allows parallel kernel execution on devices supporting HIP or HCC
-        template<
-            typename TDim,
-            typename TIdx>
-        class AccGpuHipRt final :
-            public workdiv::WorkDivHipBuiltIn<TDim, TIdx>,
-            public idx::gb::IdxGbHipBuiltIn<TDim, TIdx>,
-            public idx::bt::IdxBtHipBuiltIn<TDim, TIdx>,
-            public atomic::AtomicHierarchy<
-                atomic::AtomicHipBuiltIn, // grid atomics
-                atomic::AtomicHipBuiltIn, // block atomics
-                atomic::AtomicHipBuiltIn  // thread atomics
-            >,
-            public math::MathHipBuiltIn,
-            public block::shared::dyn::BlockSharedMemDynHipBuiltIn,
-            public block::shared::st::BlockSharedMemStHipBuiltIn,
-            public block::sync::BlockSyncHipBuiltIn,
-            public rand::RandHipRand,
-            public time::TimeHipBuiltIn,
-            public concepts::Implements<ConceptAcc, AccGpuHipRt<TDim, TIdx>>
-        {
-        public:
-            //-----------------------------------------------------------------------------
-            __device__ AccGpuHipRt(
-                vec::Vec<TDim, TIdx> const & threadElemExtent) :
-                    workdiv::WorkDivHipBuiltIn<TDim, TIdx>(threadElemExtent),
-                    idx::gb::IdxGbHipBuiltIn<TDim, TIdx>(),
-                    idx::bt::IdxBtHipBuiltIn<TDim, TIdx>(),
-                    atomic::AtomicHierarchy<
-                        atomic::AtomicHipBuiltIn, // atomics between grids
-                        atomic::AtomicHipBuiltIn, // atomics between blocks
-                        atomic::AtomicHipBuiltIn  // atomics between threads
-                    >(),
-                    math::MathHipBuiltIn(),
-                    block::shared::dyn::BlockSharedMemDynHipBuiltIn(),
-                    block::shared::st::BlockSharedMemStHipBuiltIn(),
-                    block::sync::BlockSyncHipBuiltIn(),
-                    rand::RandHipRand(),
-                    time::TimeHipBuiltIn()
-            {}
+    template<typename TAcc, typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
+    class TaskKernelGpuUniformCudaHipRt;
 
-        public:
-            //-----------------------------------------------------------------------------
-            __device__ AccGpuHipRt(AccGpuHipRt const &) = delete;
-            //-----------------------------------------------------------------------------
-            __device__ AccGpuHipRt(AccGpuHipRt &&) = delete;
-            //-----------------------------------------------------------------------------
-            __device__ auto operator=(AccGpuHipRt const &) -> AccGpuHipRt & = delete;
-            //-----------------------------------------------------------------------------
-            __device__ auto operator=(AccGpuHipRt &&) -> AccGpuHipRt & = delete;
-            //-----------------------------------------------------------------------------
-            ALPAKA_FN_HOST_ACC ~AccGpuHipRt() = default;
-        };
-    }
-
-    namespace acc
+    //#############################################################################
+    //! The GPU HIP accelerator.
+    //!
+    //! This accelerator allows parallel kernel execution on devices supporting HIP
+    template<typename TDim, typename TIdx>
+    class AccGpuHipRt final
+        : public AccGpuUniformCudaHipRt<TDim, TIdx>
+        , public concepts::Implements<ConceptUniformCudaHip, AccGpuUniformCudaHipRt<TDim, TIdx>>
     {
-        namespace traits
-        {
-            //#############################################################################
-            //! The GPU HIP accelerator accelerator type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct AccType<
-                acc::AccGpuHipRt<TDim, TIdx>>
-            {
-                using type = acc::AccGpuHipRt<TDim, TIdx>;
-            };
-            //#############################################################################
-            //! The GPU HIP accelerator device properties get trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct GetAccDevProps<
-                acc::AccGpuHipRt<TDim, TIdx>>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto getAccDevProps(
-                    dev::DevHipRt const & dev)
-                -> acc::AccDevProps<TDim, TIdx>
-                {
-                    hipDeviceProp_t hipDevProp;
-                    ALPAKA_HIP_RT_CHECK(hipGetDeviceProperties(
-                        &hipDevProp,
-                        dev.m_iDevice));
+        static_assert(
+            sizeof(TIdx) >= sizeof(int),
+            "Index type is not supported, consider using int or a larger type.");
 
-                    return {
-                        // m_multiProcessorCount
-                        alpaka::core::clipCast<TIdx>(hipDevProp.multiProcessorCount),
-                        // m_gridBlockExtentMax
-                        extent::getExtentVecEnd<TDim>(
-                            vec::Vec<dim::DimInt<3u>, TIdx>(
-                                alpaka::core::clipCast<TIdx>(hipDevProp.maxGridSize[2u]),
-                                alpaka::core::clipCast<TIdx>(hipDevProp.maxGridSize[1u]),
-                                alpaka::core::clipCast<TIdx>(hipDevProp.maxGridSize[0u]))),
-                        // m_gridBlockCountMax
-                        std::numeric_limits<TIdx>::max(),
-                        // m_blockThreadExtentMax
-                        extent::getExtentVecEnd<TDim>(
-                            vec::Vec<dim::DimInt<3u>, TIdx>(
-                                alpaka::core::clipCast<TIdx>(hipDevProp.maxThreadsDim[2u]),
-                                alpaka::core::clipCast<TIdx>(hipDevProp.maxThreadsDim[1u]),
-                                alpaka::core::clipCast<TIdx>(hipDevProp.maxThreadsDim[0u]))),
-                        // m_blockThreadCountMax
-                        alpaka::core::clipCast<TIdx>(hipDevProp.maxThreadsPerBlock),
-                        // m_threadElemExtentMax
-                        vec::Vec<TDim, TIdx>::all(std::numeric_limits<TIdx>::max()),
-                        // m_threadElemCountMax
-                        std::numeric_limits<TIdx>::max()};
-                }
-            };
-            //#############################################################################
-            //! The GPU Hip accelerator name trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct GetAccName<
-                acc::AccGpuHipRt<TDim, TIdx>>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto getAccName()
-                -> std::string
-                {
-                    return "AccGpuHipRt<" + std::to_string(TDim::value) + "," + typeid(TIdx).name() + ">";
-                }
-            };
-        }
-    }
-    namespace dev
-    {
-        namespace traits
+    public:
+        //-----------------------------------------------------------------------------
+        __device__ AccGpuHipRt(Vec<TDim, TIdx> const& threadElemExtent)
+            : AccGpuUniformCudaHipRt<TDim, TIdx>(threadElemExtent)
         {
-            //#############################################################################
-            //! The GPU HIP accelerator device type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct DevType<
-                acc::AccGpuHipRt<TDim, TIdx>>
-            {
-                using type = dev::DevHipRt;
-            };
         }
-    }
-    namespace dim
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The GPU HIP accelerator dimension getter trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct DimType<
-                acc::AccGpuHipRt<TDim, TIdx>>
-            {
-                using type = TDim;
-            };
-        }
-    }
-    namespace kernel
+
+    public:
+        //-----------------------------------------------------------------------------
+        __device__ AccGpuHipRt(AccGpuHipRt const&) = delete;
+        //-----------------------------------------------------------------------------
+        __device__ AccGpuHipRt(AccGpuHipRt&&) = delete;
+        //-----------------------------------------------------------------------------
+        __device__ auto operator=(AccGpuHipRt const&) -> AccGpuHipRt& = delete;
+        //-----------------------------------------------------------------------------
+        __device__ auto operator=(AccGpuHipRt&&) -> AccGpuHipRt& = delete;
+        //-----------------------------------------------------------------------------
+        ~AccGpuHipRt() = default;
+    };
+
+    namespace traits
     {
-        namespace detail
+        //#############################################################################
+        //! The GPU HIP accelerator type trait specialization.
+        template<typename TDim, typename TIdx>
+        struct AccType<AccGpuHipRt<TDim, TIdx>>
         {
-            //#############################################################################
-            //! specialization of the TKernelFnObj return type evaluation
-            //
-            // It is not possible to determine the result type of a __device__ lambda for CUDA on the host side.
-            // https://github.com/ComputationalRadiationPhysics/alpaka/pull/695#issuecomment-446103194
-            // The execution task TaskKernelGpuHipRt is therefore performing this check on device side.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct CheckFnReturnType<
-                acc::AccGpuHipRt<
-                    TDim,
-                    TIdx>>
-            {
-                template<
-                    typename TKernelFnObj,
-                    typename... TArgs>
-                void operator()(
-                    TKernelFnObj const &,
-                    TArgs const & ...)
-                {
+            using type = AccGpuHipRt<TDim, TIdx>;
+        };
 
-                }
-            };
-        }
-        namespace traits
-        {
-            //#############################################################################
-            //! The GPU HIP accelerator execution task type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx,
-                typename TWorkDiv,
-                typename TKernelFnObj,
-                typename... TArgs>
-            struct CreateTaskKernel<
-                acc::AccGpuHipRt<TDim, TIdx>,
-                TWorkDiv,
-                TKernelFnObj,
-                TArgs...>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto createTaskKernel(
-                    TWorkDiv const & workDiv,
-                    TKernelFnObj const & kernelFnObj,
-                    TArgs && ... args)
-#ifdef BOOST_NO_CXX14_RETURN_TYPE_DEDUCTION
-                -> kernel::TaskKernelGpuHipRt<
-                    TDim,
-                    TIdx,
-                    TKernelFnObj,
-                    TArgs...>
-#endif
-                {
-                    return
-                        kernel::TaskKernelGpuHipRt<
-                            TDim,
-                            TIdx,
-                            TKernelFnObj,
-                            TArgs...>(
-                                workDiv,
-                                kernelFnObj,
-                                std::forward<TArgs>(args)...);
-                }
-            };
-        }
-    }
-    namespace pltf
-    {
-        namespace traits
+        //#############################################################################
+        //! The GPU HIP accelerator name trait specialization.
+        template<typename TDim, typename TIdx>
+        struct GetAccName<AccGpuHipRt<TDim, TIdx>>
         {
-            //#############################################################################
-            //! The CPU HIP execution task platform type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct PltfType<
-                acc::AccGpuHipRt<TDim, TIdx>>
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto getAccName() -> std::string
             {
-                using type = pltf::PltfHipRt;
-            };
-        }
-    }
-    namespace idx
-    {
-        namespace traits
+                return "AccGpuHipRt<" + std::to_string(TDim::value) + "," + typeid(TIdx).name() + ">";
+            }
+        };
+
+        //#############################################################################
+        //! The GPU HIP accelerator execution task type trait specialization.
+        template<typename TDim, typename TIdx, typename TWorkDiv, typename TKernelFnObj, typename... TArgs>
+        struct CreateTaskKernel<AccGpuHipRt<TDim, TIdx>, TWorkDiv, TKernelFnObj, TArgs...>
         {
-            //#############################################################################
-            //! The GPU HIP accelerator idx type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct IdxType<
-                acc::AccGpuHipRt<TDim, TIdx>>
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto createTaskKernel(
+                TWorkDiv const& workDiv,
+                TKernelFnObj const& kernelFnObj,
+                TArgs&&... args)
             {
-                using type = TIdx;
-            };
-        }
-    }
-}
+                return TaskKernelGpuUniformCudaHipRt<AccGpuHipRt<TDim, TIdx>, TDim, TIdx, TKernelFnObj, TArgs...>(
+                    workDiv,
+                    kernelFnObj,
+                    std::forward<TArgs>(args)...);
+            }
+        };
+    } // namespace traits
+} // namespace alpaka
 
 #endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/acc/AccGpuUniformCudaHipRt.hpp b/thirdParty/cupla/alpaka/include/alpaka/acc/AccGpuUniformCudaHipRt.hpp
new file mode 100644
index 0000000000..83e3091bfd
--- /dev/null
+++ b/thirdParty/cupla/alpaka/include/alpaka/acc/AccGpuUniformCudaHipRt.hpp
@@ -0,0 +1,308 @@
+/* Copyright 2019 Benjamin Worpitz, René Widera
+ *
+ * This file is part of alpaka.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#pragma once
+
+#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
+
+#    include <alpaka/core/BoostPredef.hpp>
+
+#    if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && !BOOST_LANG_CUDA
+#        error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
+#    endif
+
+#    if defined(ALPAKA_ACC_GPU_HIP_ENABLED) && !BOOST_LANG_HIP
+#        error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
+#    endif
+
+// Base classes.
+#    include <alpaka/atomic/AtomicHierarchy.hpp>
+#    include <alpaka/atomic/AtomicUniformCudaHipBuiltIn.hpp>
+#    include <alpaka/block/shared/dyn/BlockSharedMemDynUniformCudaHipBuiltIn.hpp>
+#    include <alpaka/block/shared/st/BlockSharedMemStUniformCudaHipBuiltIn.hpp>
+#    include <alpaka/block/sync/BlockSyncUniformCudaHipBuiltIn.hpp>
+#    include <alpaka/idx/bt/IdxBtUniformCudaHipBuiltIn.hpp>
+#    include <alpaka/idx/gb/IdxGbUniformCudaHipBuiltIn.hpp>
+#    include <alpaka/intrinsic/IntrinsicUniformCudaHipBuiltIn.hpp>
+#    include <alpaka/math/MathUniformCudaHipBuiltIn.hpp>
+#    include <alpaka/rand/RandUniformCudaHipRand.hpp>
+#    include <alpaka/time/TimeUniformCudaHipBuiltIn.hpp>
+#    include <alpaka/warp/WarpUniformCudaHipBuiltIn.hpp>
+#    include <alpaka/workdiv/WorkDivUniformCudaHipBuiltIn.hpp>
+
+// Specialized traits.
+#    include <alpaka/acc/Traits.hpp>
+#    include <alpaka/dev/Traits.hpp>
+#    include <alpaka/idx/Traits.hpp>
+#    include <alpaka/kernel/Traits.hpp>
+#    include <alpaka/pltf/Traits.hpp>
+
+// Implementation details.
+#    include <alpaka/core/ClipCast.hpp>
+#    include <alpaka/core/Cuda.hpp>
+#    include <alpaka/dev/DevUniformCudaHipRt.hpp>
+
+#    include <typeinfo>
+
+namespace alpaka
+{
+    template<typename TAcc, typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
+    class TaskKernelGpuUniformCudaHipRt;
+
+    //#############################################################################
+    //! The uniform GPU CUDA/HIP accelerator base class.
+    //!
+    //! This accelerator allows parallel kernel execution on devices supporting CUDA or HIP.
+    template<
+        typename TDim,
+        typename TIdx>
+    class AccGpuUniformCudaHipRt :
+        public WorkDivUniformCudaHipBuiltIn<TDim, TIdx>,
+        public gb::IdxGbUniformCudaHipBuiltIn<TDim, TIdx>,
+        public bt::IdxBtUniformCudaHipBuiltIn<TDim, TIdx>,
+        public AtomicHierarchy<
+            AtomicUniformCudaHipBuiltIn, // grid atomics
+            AtomicUniformCudaHipBuiltIn, // block atomics
+            AtomicUniformCudaHipBuiltIn  // thread atomics
+        >,
+        public math::MathUniformCudaHipBuiltIn,
+        public BlockSharedMemDynUniformCudaHipBuiltIn,
+        public BlockSharedMemStUniformCudaHipBuiltIn,
+        public BlockSyncUniformCudaHipBuiltIn,
+        public IntrinsicUniformCudaHipBuiltIn,
+        public rand::RandUniformCudaHipRand,
+        public TimeUniformCudaHipBuiltIn,
+        public warp::WarpUniformCudaHipBuiltIn,
+        public concepts::Implements<ConceptAcc, AccGpuUniformCudaHipRt<TDim, TIdx>>
+    {
+        static_assert(
+            sizeof(TIdx) >= sizeof(int),
+            "Index type is not supported, consider using int or a larger type.");
+
+    public:
+        //-----------------------------------------------------------------------------
+        __device__ AccGpuUniformCudaHipRt(Vec<TDim, TIdx> const& threadElemExtent)
+            : WorkDivUniformCudaHipBuiltIn<TDim, TIdx>(threadElemExtent)
+            , gb::IdxGbUniformCudaHipBuiltIn<TDim, TIdx>()
+            , bt::IdxBtUniformCudaHipBuiltIn<TDim, TIdx>()
+            , AtomicHierarchy<
+                  AtomicUniformCudaHipBuiltIn, // atomics between grids
+                  AtomicUniformCudaHipBuiltIn, // atomics between blocks
+                  AtomicUniformCudaHipBuiltIn // atomics between threads
+                  >()
+            , math::MathUniformCudaHipBuiltIn()
+            , BlockSharedMemDynUniformCudaHipBuiltIn()
+            , BlockSharedMemStUniformCudaHipBuiltIn()
+            , BlockSyncUniformCudaHipBuiltIn()
+            , rand::RandUniformCudaHipRand()
+            , TimeUniformCudaHipBuiltIn()
+        {
+        }
+
+    public:
+        // using baseType = AccUniformCudaHip<TDim,TIdx>;
+
+        //-----------------------------------------------------------------------------
+        __device__ AccGpuUniformCudaHipRt(AccGpuUniformCudaHipRt const&) = delete;
+        //-----------------------------------------------------------------------------
+        __device__ AccGpuUniformCudaHipRt(AccGpuUniformCudaHipRt&&) = delete;
+        //-----------------------------------------------------------------------------
+        __device__ auto operator=(AccGpuUniformCudaHipRt const&) -> AccGpuUniformCudaHipRt& = delete;
+        //-----------------------------------------------------------------------------
+        __device__ auto operator=(AccGpuUniformCudaHipRt&&) -> AccGpuUniformCudaHipRt& = delete;
+        //-----------------------------------------------------------------------------
+        ~AccGpuUniformCudaHipRt() = default;
+    };
+
+    namespace traits
+    {
+        //#############################################################################
+        //! The uniform GPU CUDA/HIP accelerator type trait specialization.
+        template<typename TDim, typename TIdx>
+        struct AccType<AccGpuUniformCudaHipRt<TDim, TIdx>>
+        {
+            using type = AccGpuUniformCudaHipRt<TDim, TIdx>;
+        };
+        //#############################################################################
+        //! The uniform GPU CUDA/HIP accelerator device properties get trait specialization.
+        template<typename TDim, typename TIdx>
+        struct GetAccDevProps<AccGpuUniformCudaHipRt<TDim, TIdx>>
+        {
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto getAccDevProps(DevUniformCudaHipRt const& dev) -> AccDevProps<TDim, TIdx>
+            {
+#    ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
+                // Querying single attributes via cudaDeviceGetAttribute is faster than cudaGetDeviceProperties, see
+                // https://devblogs.nvidia.com/cuda-pro-tip-the-fast-way-to-query-device-properties/
+                int multiProcessorCount = {};
+                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
+                    cudaDeviceGetAttribute(&multiProcessorCount, cudaDevAttrMultiProcessorCount, dev.m_iDevice));
+
+                int maxGridSize[3] = {};
+                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
+                    cudaDeviceGetAttribute(&maxGridSize[0], cudaDevAttrMaxGridDimX, dev.m_iDevice));
+                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
+                    cudaDeviceGetAttribute(&maxGridSize[1], cudaDevAttrMaxGridDimY, dev.m_iDevice));
+                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
+                    cudaDeviceGetAttribute(&maxGridSize[2], cudaDevAttrMaxGridDimZ, dev.m_iDevice));
+
+                int maxBlockDim[3] = {};
+                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
+                    cudaDeviceGetAttribute(&maxBlockDim[0], cudaDevAttrMaxBlockDimX, dev.m_iDevice));
+                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
+                    cudaDeviceGetAttribute(&maxBlockDim[1], cudaDevAttrMaxBlockDimY, dev.m_iDevice));
+                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
+                    cudaDeviceGetAttribute(&maxBlockDim[2], cudaDevAttrMaxBlockDimZ, dev.m_iDevice));
+
+                int maxThreadsPerBlock = {};
+                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
+                    cudaDeviceGetAttribute(&maxThreadsPerBlock, cudaDevAttrMaxThreadsPerBlock, dev.m_iDevice));
+
+                int sharedMemSizeBytes = {};
+                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
+                    cudaDeviceGetAttribute(&sharedMemSizeBytes, cudaDevAttrMaxSharedMemoryPerBlock, dev.m_iDevice));
+
+                return {// m_multiProcessorCount
+                        alpaka::core::clipCast<TIdx>(multiProcessorCount),
+                        // m_gridBlockExtentMax
+                        extent::getExtentVecEnd<TDim>(Vec<DimInt<3u>, TIdx>(
+                            alpaka::core::clipCast<TIdx>(maxGridSize[2u]),
+                            alpaka::core::clipCast<TIdx>(maxGridSize[1u]),
+                            alpaka::core::clipCast<TIdx>(maxGridSize[0u]))),
+                        // m_gridBlockCountMax
+                        std::numeric_limits<TIdx>::max(),
+                        // m_blockThreadExtentMax
+                        extent::getExtentVecEnd<TDim>(Vec<DimInt<3u>, TIdx>(
+                            alpaka::core::clipCast<TIdx>(maxBlockDim[2u]),
+                            alpaka::core::clipCast<TIdx>(maxBlockDim[1u]),
+                            alpaka::core::clipCast<TIdx>(maxBlockDim[0u]))),
+                        // m_blockThreadCountMax
+                        alpaka::core::clipCast<TIdx>(maxThreadsPerBlock),
+                        // m_threadElemExtentMax
+                        Vec<TDim, TIdx>::all(std::numeric_limits<TIdx>::max()),
+                        // m_threadElemCountMax
+                        std::numeric_limits<TIdx>::max(),
+                        // m_sharedMemSizeBytes
+                        static_cast<size_t>(sharedMemSizeBytes)};
+
+#    else
+                hipDeviceProp_t hipDevProp;
+                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(hipGetDeviceProperties(&hipDevProp, dev.m_iDevice));
+
+                return {// m_multiProcessorCount
+                        alpaka::core::clipCast<TIdx>(hipDevProp.multiProcessorCount),
+                        // m_gridBlockExtentMax
+                        extent::getExtentVecEnd<TDim>(Vec<DimInt<3u>, TIdx>(
+                            alpaka::core::clipCast<TIdx>(hipDevProp.maxGridSize[2u]),
+                            alpaka::core::clipCast<TIdx>(hipDevProp.maxGridSize[1u]),
+                            alpaka::core::clipCast<TIdx>(hipDevProp.maxGridSize[0u]))),
+                        // m_gridBlockCountMax
+                        std::numeric_limits<TIdx>::max(),
+                        // m_blockThreadExtentMax
+                        extent::getExtentVecEnd<TDim>(Vec<DimInt<3u>, TIdx>(
+                            alpaka::core::clipCast<TIdx>(hipDevProp.maxThreadsDim[2u]),
+                            alpaka::core::clipCast<TIdx>(hipDevProp.maxThreadsDim[1u]),
+                            alpaka::core::clipCast<TIdx>(hipDevProp.maxThreadsDim[0u]))),
+                        // m_blockThreadCountMax
+                        alpaka::core::clipCast<TIdx>(hipDevProp.maxThreadsPerBlock),
+                        // m_threadElemExtentMax
+                        Vec<TDim, TIdx>::all(std::numeric_limits<TIdx>::max()),
+                        // m_threadElemCountMax
+                        std::numeric_limits<TIdx>::max(),
+                        // m_sharedMemSizeBytes
+                        static_cast<size_t>(hipDevProp.sharedMemPerBlock)};
+#    endif
+            }
+        };
+        //#############################################################################
+        //! The GPU CUDA accelerator name trait specialization.
+        template<typename TDim, typename TIdx>
+        struct GetAccName<AccGpuUniformCudaHipRt<TDim, TIdx>>
+        {
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto getAccName() -> std::string
+            {
+                return "AccGpuUniformCudaHipRt<" + std::to_string(TDim::value) + "," + typeid(TIdx).name() + ">";
+            }
+        };
+
+        //#############################################################################
+        //! The GPU CUDA accelerator device type trait specialization.
+        template<typename TDim, typename TIdx>
+        struct DevType<AccGpuUniformCudaHipRt<TDim, TIdx>>
+        {
+            using type = DevUniformCudaHipRt;
+        };
+
+        //#############################################################################
+        //! The GPU CUDA accelerator dimension getter trait specialization.
+        template<typename TDim, typename TIdx>
+        struct DimType<AccGpuUniformCudaHipRt<TDim, TIdx>>
+        {
+            using type = TDim;
+        };
+    } // namespace traits
+    namespace detail
+    {
+        //#############################################################################
+        //! specialization of the TKernelFnObj return type evaluation
+        //
+        // It is not possible to determine the result type of a __device__ lambda for CUDA on the host side.
+        // https://github.com/alpaka-group/alpaka/pull/695#issuecomment-446103194
+        // The execution task TaskKernelGpuUniformCudaHipRt is therefore performing this check on device side.
+        template<typename TDim, typename TIdx>
+        struct CheckFnReturnType<AccGpuUniformCudaHipRt<TDim, TIdx>>
+        {
+            template<typename TKernelFnObj, typename... TArgs>
+            void operator()(TKernelFnObj const&, TArgs const&...)
+            {
+            }
+        };
+    } // namespace detail
+    namespace traits
+    {
+        //#############################################################################
+        //! The GPU CUDA accelerator execution task type trait specialization.
+        template<typename TDim, typename TIdx, typename TWorkDiv, typename TKernelFnObj, typename... TArgs>
+        struct CreateTaskKernel<AccGpuUniformCudaHipRt<TDim, TIdx>, TWorkDiv, TKernelFnObj, TArgs...>
+        {
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto createTaskKernel(
+                TWorkDiv const& workDiv,
+                TKernelFnObj const& kernelFnObj,
+                TArgs&&... args)
+            {
+                return TaskKernelGpuUniformCudaHipRt<
+                    AccGpuUniformCudaHipRt<TDim, TIdx>,
+                    TDim,
+                    TIdx,
+                    TKernelFnObj,
+                    TArgs...>(workDiv, kernelFnObj, std::forward<TArgs>(args)...);
+            }
+        };
+
+        //#############################################################################
+        //! The GPU CUDA execution task platform type trait specialization.
+        template<typename TDim, typename TIdx>
+        struct PltfType<AccGpuUniformCudaHipRt<TDim, TIdx>>
+        {
+            using type = PltfUniformCudaHipRt;
+        };
+
+        //#############################################################################
+        //! The GPU CUDA accelerator idx type trait specialization.
+        template<typename TDim, typename TIdx>
+        struct IdxType<AccGpuUniformCudaHipRt<TDim, TIdx>>
+        {
+            using type = TIdx;
+        };
+    } // namespace traits
+} // namespace alpaka
+
+#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/acc/AccOacc.hpp b/thirdParty/cupla/alpaka/include/alpaka/acc/AccOacc.hpp
new file mode 100644
index 0000000000..b0a7c111dd
--- /dev/null
+++ b/thirdParty/cupla/alpaka/include/alpaka/acc/AccOacc.hpp
@@ -0,0 +1,382 @@
+/* Copyright 2020 Jeffrey Kelling
+ *
+ * This file is part of Alpaka.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#pragma once
+
+#ifdef ALPAKA_ACC_ANY_BT_OACC_ENABLED
+
+#    if _OPENACC < 201306
+#        error If ALPAKA_ACC_ANY_BT_OACC_ENABLED is set, the compiler has to support OpenACC 2.0 or higher!
+#    endif
+
+// Base classes.
+#    include <alpaka/atomic/AtomicHierarchy.hpp>
+#    include <alpaka/atomic/AtomicOaccBuiltIn.hpp>
+#    include <alpaka/ctx/block/CtxBlockOacc.hpp>
+#    include <alpaka/idx/bt/IdxBtLinear.hpp>
+#    include <alpaka/intrinsic/IntrinsicFallback.hpp>
+#    include <alpaka/math/MathStdLib.hpp>
+#    include <alpaka/rand/RandStdLib.hpp>
+#    include <alpaka/time/TimeStdLib.hpp>
+#    include <alpaka/warp/WarpSingleThread.hpp>
+
+// Specialized traits.
+#    include <alpaka/acc/Traits.hpp>
+#    include <alpaka/dev/Traits.hpp>
+#    include <alpaka/idx/Traits.hpp>
+#    include <alpaka/kernel/Traits.hpp>
+#    include <alpaka/pltf/Traits.hpp>
+
+// Implementation details.
+#    include <alpaka/core/ClipCast.hpp>
+#    include <alpaka/core/Concepts.hpp>
+#    include <alpaka/core/Unused.hpp>
+#    include <alpaka/ctx/block/CtxBlockOacc.hpp>
+#    include <alpaka/dev/DevOacc.hpp>
+
+#    include <limits>
+#    include <typeinfo>
+
+namespace alpaka
+{
+    template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
+    class TaskKernelOacc;
+
+    // Define the max gang/worker numbers at compile time because there is no standard way in
+    // OpenACC to query this information.
+#    ifndef ALPAKA_OACC_MAX_GANG_NUM
+    constexpr size_t oaccMaxGangNum = std::numeric_limits<unsigned int>::max();
+#    else
+    constexpr size_t oaccMaxGangNum = ALPAKA_OACC_MAX_GANG_NUM;
+#    endif
+#    if defined(ALPAKA_OFFLOAD_MAX_BLOCK_SIZE) && ALPAKA_OFFLOAD_MAX_BLOCK_SIZE > 0
+    constexpr size_t oaccMaxWorkerNum = ALPAKA_OFFLOAD_MAX_BLOCK_SIZE;
+#    else
+    constexpr size_t oaccMaxWorkerNum = 1;
+#    endif
+
+    //#############################################################################
+    //! The OpenACC accelerator; bundles a block-thread index with a reference to the shared block context.
+    template<
+        typename TDim,
+        typename TIdx>
+    class AccOacc final :
+        public bt::IdxBtLinear<TDim, TIdx>,
+        public AtomicHierarchy<
+            AtomicOaccBuiltIn,    // grid atomics
+            AtomicOaccBuiltIn,    // block atomics
+            AtomicOaccBuiltIn     // thread atomics
+        >,
+        public math::MathStdLib,
+        public rand::RandStdLib,
+        public TimeStdLib,
+        public warp::WarpSingleThread,
+        // NVHPC calls a builtin in the STL implementation, which fails in OpenACC offload, using fallback
+        public IntrinsicFallback,
+        public concepts::Implements<ConceptAcc, AccOacc<TDim, TIdx>>,
+        public concepts::Implements<ConceptWorkDiv, AccOacc<TDim, TIdx>>,
+        public concepts::Implements<ConceptBlockSharedDyn, AccOacc<TDim, TIdx>>,
+        public concepts::Implements<ConceptBlockSharedSt, AccOacc<TDim, TIdx>>,
+        public concepts::Implements<ConceptBlockSync, AccOacc<TDim, TIdx>>,
+        public concepts::Implements<ConceptIdxGb, AccOacc<TDim, TIdx>>
+    {
+        template<typename TDim2, typename TIdx2, typename TKernelFnObj, typename... TArgs>
+        friend class ::alpaka::TaskKernelOacc; // grants the kernel task access to the protected constructor
+
+    protected:
+        //-----------------------------------------------------------------------------
+        AccOacc(TIdx const& blockThreadIdx, CtxBlockOacc<TDim, TIdx>& blockShared)
+            : bt::IdxBtLinear<TDim, TIdx>(blockThreadIdx)
+            , AtomicHierarchy<
+                  AtomicOaccBuiltIn, // grid atomics
+                  AtomicOaccBuiltIn, // block atomics
+                  AtomicOaccBuiltIn // thread atomics
+                  >()
+            , math::MathStdLib()
+            , rand::RandStdLib()
+            , TimeStdLib()
+            , m_blockShared(blockShared) // bound as a reference, the block context is not copied
+        {
+        }
+
+    public:
+        //-----------------------------------------------------------------------------
+        AccOacc(AccOacc const&) = delete;
+        //-----------------------------------------------------------------------------
+        AccOacc(AccOacc&&) = delete;
+        //-----------------------------------------------------------------------------
+        auto operator=(AccOacc const&) -> AccOacc& = delete;
+        //-----------------------------------------------------------------------------
+        auto operator=(AccOacc&&) -> AccOacc& = delete;
+        //-----------------------------------------------------------------------------
+        ~AccOacc() = default;
+
+        CtxBlockOacc<TDim, TIdx>& m_blockShared; //!< Block context the trait specializations delegate to.
+    };
+
+    namespace traits
+    {
+        //#############################################################################
+        //! The OpenACC accelerator accelerator type trait specialization.
+        template<typename TDim, typename TIdx>
+        struct AccType<AccOacc<TDim, TIdx>>
+        {
+            using type = AccOacc<TDim, TIdx>;
+        };
+        //#############################################################################
+        //! The OpenACC accelerator device properties get trait specialization.
+        template<typename TDim, typename TIdx>
+        struct GetAccDevProps<AccOacc<TDim, TIdx>>
+        {
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto getAccDevProps(DevOacc const& dev) -> AccDevProps<TDim, TIdx>
+            {
+                alpaka::ignore_unused(dev); // the device is not queried for any of the properties below
+
+#    ifdef ALPAKA_CI // CI builds: cap block threads (workers) and grid blocks (gangs) at 2
+                auto const blockThreadCountMax(
+                    alpaka::core::clipCast<TIdx>(std::min(static_cast<size_t>(2u), oaccMaxWorkerNum)));
+                auto const gridBlockCountMax(
+                    alpaka::core::clipCast<TIdx>(std::min(static_cast<size_t>(2u), oaccMaxGangNum)));
+#    else // otherwise use the compile-time limits oaccMaxWorkerNum / oaccMaxGangNum unchanged
+                auto const blockThreadCountMax(alpaka::core::clipCast<TIdx>(oaccMaxWorkerNum));
+                auto const gridBlockCountMax(alpaka::core::clipCast<TIdx>(oaccMaxGangNum));
+#    endif
+                return {// m_multiProcessorCount (reported as the maximum gang count)
+                        static_cast<TIdx>(gridBlockCountMax),
+                        // m_gridBlockExtentMax
+                        Vec<TDim, TIdx>::all(std::numeric_limits<TIdx>::max()),
+                        // m_gridBlockCountMax
+                        std::numeric_limits<TIdx>::max(),
+                        // m_blockThreadExtentMax
+                        Vec<TDim, TIdx>::all(blockThreadCountMax),
+                        // m_blockThreadCountMax
+                        blockThreadCountMax,
+                        // m_threadElemExtentMax
+                        Vec<TDim, TIdx>::all(std::numeric_limits<TIdx>::max()),
+                        // m_threadElemCountMax
+                        std::numeric_limits<TIdx>::max(),
+                        // m_sharedMemSizeBytes
+                        CtxBlockOacc<TDim, TIdx>::staticAllocBytes()};
+            }
+        };
+        //#############################################################################
+        //! The OpenACC accelerator name trait specialization.
+        template<typename TDim, typename TIdx>
+        struct GetAccName<AccOacc<TDim, TIdx>>
+        {
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto getAccName() -> std::string
+            {
+                return "AccOacc<" + std::to_string(TDim::value) + "," + typeid(TIdx).name() + ">";
+            }
+        };
+
+        //#############################################################################
+        //! The OpenACC accelerator device type trait specialization.
+        template<typename TDim, typename TIdx>
+        struct DevType<AccOacc<TDim, TIdx>>
+        {
+            using type = DevOacc;
+        };
+
+        //#############################################################################
+        //! The OpenACC accelerator dimension getter trait specialization.
+        template<typename TDim, typename TIdx>
+        struct DimType<AccOacc<TDim, TIdx>>
+        {
+            using type = TDim;
+        };
+
+        //#############################################################################
+        //! The OpenACC accelerator execution task type trait specialization.
+        template<typename TDim, typename TIdx, typename TWorkDiv, typename TKernelFnObj, typename... TArgs>
+        struct CreateTaskKernel<AccOacc<TDim, TIdx>, TWorkDiv, TKernelFnObj, TArgs...>
+        {
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto createTaskKernel(
+                TWorkDiv const& workDiv,
+                TKernelFnObj const& kernelFnObj,
+                TArgs&&... args)
+            {
+                return TaskKernelOacc<TDim, TIdx, TKernelFnObj, TArgs...>(
+                    workDiv,
+                    kernelFnObj,
+                    std::forward<TArgs>(args)...);
+            }
+        };
+
+        //#############################################################################
+        //! The OpenACC execution task platform type trait specialization.
+        template<typename TDim, typename TIdx>
+        struct PltfType<AccOacc<TDim, TIdx>>
+        {
+            using type = PltfOacc;
+        };
+
+        //#############################################################################
+        //! The OpenACC accelerator idx type trait specialization.
+        template<typename TDim, typename TIdx>
+        struct IdxType<AccOacc<TDim, TIdx>>
+        {
+            using type = TIdx;
+        };
+
+        //#############################################################################
+        //! The OpenACC accelerator grid block index get trait specialization.
+        template<typename TDim, typename TIdx>
+        struct GetIdx<AccOacc<TDim, TIdx>, origin::Grid, unit::Blocks>
+        {
+            //-----------------------------------------------------------------------------
+            //! \return The index of the current block in the grid.
+            template<typename TWorkDiv>
+            static auto getIdx(AccOacc<TDim, TIdx> const& idx, TWorkDiv const& workDiv) -> Vec<TDim, TIdx>
+            {
+                // \TODO: Would it be faster to precompute the index and cache it inside an array?
+                return mapIdx<TDim::value>(
+                    Vec<DimInt<1u>, TIdx>(idx.m_blockShared.m_gridBlockIdx),
+                    getWorkDiv<Grid, Blocks>(workDiv));
+            }
+        };
+
+        template<typename TIdx>
+        struct GetIdx<AccOacc<DimInt<1u>, TIdx>, origin::Grid, unit::Blocks>
+        {
+            //-----------------------------------------------------------------------------
+            //! \return The index of the current block in the grid.
+            template<typename TWorkDiv>
+            static auto getIdx(AccOacc<DimInt<1u>, TIdx> const& idx, TWorkDiv const&) -> Vec<DimInt<1u>, TIdx>
+            {
+                return idx.m_blockShared.m_gridBlockIdx;
+            }
+        };
+
+        //#############################################################################
+        template<typename T, typename TDim, typename TIdx>
+        struct GetDynSharedMem<T, AccOacc<TDim, TIdx>>
+        {
+#    if BOOST_COMP_GNUC
+#        pragma GCC diagnostic push
+#        pragma GCC diagnostic ignored "-Wcast-align" // "cast from 'unsigned char*' to 'unsigned int*' increases
+                                                      // required alignment of target type"
+#    endif
+            //-----------------------------------------------------------------------------
+            static auto getMem(AccOacc<TDim, TIdx> const& mem) -> T*
+            {
+                return reinterpret_cast<T*>(mem.m_blockShared.dynMemBegin());
+            }
+#    if BOOST_COMP_GNUC
+#        pragma GCC diagnostic pop
+#    endif
+        };
+
+        //#############################################################################
+        template<typename T, typename TDim, typename TIdx, std::size_t TuniqueId>
+        struct DeclareSharedVar<T, TuniqueId, AccOacc<TDim, TIdx>>
+        {
+            //-----------------------------------------------------------------------------
+            static auto declareVar(AccOacc<TDim, TIdx> const& smem) -> T&
+            {
+                return alpaka::declareSharedVar<T, TuniqueId>(smem.m_blockShared);
+            }
+        };
+
+        //#############################################################################
+        template<typename TDim, typename TIdx>
+        struct FreeSharedVars<AccOacc<TDim, TIdx>>
+        {
+            //-----------------------------------------------------------------------------
+            static auto freeVars(AccOacc<TDim, TIdx> const& smem) -> void
+            {
+                alpaka::freeSharedVars(smem.m_blockShared);
+            }
+        };
+
+        //#############################################################################
+        template<typename TDim, typename TIdx>
+        struct SyncBlockThreads<AccOacc<TDim, TIdx>>
+        {
+            //-----------------------------------------------------------------------------
+            //! Execute op with single thread (any idx, last thread to
+            //! arrive at barrier executes) syncing before and after
+            template<typename TOp>
+            ALPAKA_FN_HOST static auto masterOpBlockThreads(AccOacc<TDim, TIdx> const& acc, TOp&& op) -> void
+            {
+                SyncBlockThreads<CtxBlockOacc<TDim, TIdx>>::masterOpBlockThreads(acc.m_blockShared, op);
+            }
+
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto syncBlockThreads(AccOacc<TDim, TIdx> const& acc) -> void
+            {
+                SyncBlockThreads<CtxBlockOacc<TDim, TIdx>>::syncBlockThreads(acc.m_blockShared);
+            }
+        };
+
+        //#############################################################################
+        template<typename TOp, typename TDim, typename TIdx>
+        struct SyncBlockThreadsPredicate<TOp, AccOacc<TDim, TIdx>>
+        {
+            //-----------------------------------------------------------------------------
+            ALPAKA_NO_HOST_ACC_WARNING
+            ALPAKA_FN_ACC static auto syncBlockThreadsPredicate(AccOacc<TDim, TIdx> const& acc, int predicate) -> int
+            {
+                return SyncBlockThreadsPredicate<TOp, CtxBlockOacc<TDim, TIdx>>::syncBlockThreadsPredicate(
+                    acc.m_blockShared,
+                    predicate);
+            }
+        };
+
+        //#############################################################################
+        //! The OpenACC grid block extent trait specialization.
+        template<typename TDim, typename TIdx>
+        struct GetWorkDiv<AccOacc<TDim, TIdx>, origin::Grid, unit::Blocks>
+        {
+            //-----------------------------------------------------------------------------
+            //! \return The number of blocks in each dimension of the grid.
+            ALPAKA_NO_HOST_ACC_WARNING
+            ALPAKA_FN_HOST_ACC static auto getWorkDiv(AccOacc<TDim, TIdx> const& workDiv) -> Vec<TDim, TIdx>
+            {
+                return GetWorkDiv<WorkDivMembers<TDim, TIdx>, origin::Grid, unit::Blocks>::getWorkDiv(
+                    workDiv.m_blockShared);
+            }
+        };
+
+        //#############################################################################
+        //! The OpenACC block thread extent trait specialization.
+        template<typename TDim, typename TIdx>
+        struct GetWorkDiv<AccOacc<TDim, TIdx>, origin::Block, unit::Threads>
+        {
+            //-----------------------------------------------------------------------------
+            //! \return The number of threads in each dimension of a block.
+            ALPAKA_NO_HOST_ACC_WARNING
+            ALPAKA_FN_HOST_ACC static auto getWorkDiv(AccOacc<TDim, TIdx> const& workDiv) -> Vec<TDim, TIdx>
+            {
+                return GetWorkDiv<WorkDivMembers<TDim, TIdx>, origin::Block, unit::Threads>::getWorkDiv(
+                    workDiv.m_blockShared);
+            }
+        };
+
+        //#############################################################################
+        //! The OpenACC thread element extent trait specialization.
+        template<typename TDim, typename TIdx>
+        struct GetWorkDiv<AccOacc<TDim, TIdx>, origin::Thread, unit::Elems>
+        {
+            //-----------------------------------------------------------------------------
+            //! \return The number of elements in each dimension of a thread.
+            ALPAKA_NO_HOST_ACC_WARNING
+            ALPAKA_FN_HOST_ACC static auto getWorkDiv(AccOacc<TDim, TIdx> const& workDiv) -> Vec<TDim, TIdx>
+            {
+                return GetWorkDiv<WorkDivMembers<TDim, TIdx>, origin::Thread, unit::Elems>::getWorkDiv(
+                    workDiv.m_blockShared);
+            }
+        };
+    } // namespace traits
+} // namespace alpaka
+
+#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/acc/AccOmp5.hpp b/thirdParty/cupla/alpaka/include/alpaka/acc/AccOmp5.hpp
new file mode 100644
index 0000000000..209782de5b
--- /dev/null
+++ b/thirdParty/cupla/alpaka/include/alpaka/acc/AccOmp5.hpp
@@ -0,0 +1,257 @@
+/* Copyright 2019 Axel Huebl, Benjamin Worpitz, René Widera
+ *
+ * This file is part of alpaka.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#pragma once
+
+#ifdef ALPAKA_ACC_ANY_BT_OMP5_ENABLED
+
+#    if _OPENMP < 201307
+#        error If ALPAKA_ACC_ANY_BT_OMP5_ENABLED is set, the compiler has to support OpenMP 4.0 or higher!
+#    endif
+
+// Base classes.
+#    include <alpaka/atomic/AtomicHierarchy.hpp>
+#    include <alpaka/atomic/AtomicOmpBuiltIn.hpp>
+#    include <alpaka/block/shared/dyn/BlockSharedMemDynMember.hpp>
+#    include <alpaka/block/shared/st/BlockSharedMemStOmp5.hpp>
+#    include <alpaka/block/sync/BlockSyncBarrierOmp.hpp>
+#    include <alpaka/idx/bt/IdxBtOmp.hpp>
+#    include <alpaka/idx/gb/IdxGbLinear.hpp>
+#    include <alpaka/intrinsic/IntrinsicFallback.hpp>
+#    include <alpaka/math/MathStdLib.hpp>
+#    include <alpaka/rand/RandStdLib.hpp>
+#    include <alpaka/time/TimeOmp.hpp>
+#    include <alpaka/warp/WarpSingleThread.hpp>
+#    include <alpaka/workdiv/WorkDivMembers.hpp>
+
+// Specialized traits.
+#    include <alpaka/acc/Traits.hpp>
+#    include <alpaka/dev/Traits.hpp>
+#    include <alpaka/idx/Traits.hpp>
+#    include <alpaka/kernel/Traits.hpp>
+#    include <alpaka/pltf/Traits.hpp>
+
+// Implementation details.
+#    include <alpaka/core/ClipCast.hpp>
+#    include <alpaka/core/Concepts.hpp>
+#    include <alpaka/core/Unused.hpp>
+#    include <alpaka/dev/DevOmp5.hpp>
+
+#    include <limits>
+#    include <typeinfo>
+
+namespace alpaka
+{
+    template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
+    class TaskKernelOmp5;
+
+    //#############################################################################
+    //! The OpenMP 5.0 target offload accelerator.
+    //!
+    //! This accelerator allows parallel kernel execution on an OpenMP target device.
+    template<
+        typename TDim,
+        typename TIdx>
+    class AccOmp5 final :
+        public WorkDivMembers<TDim, TIdx>,
+        public gb::IdxGbLinear<TDim, TIdx>,
+        public bt::IdxBtOmp<TDim, TIdx>,
+        public AtomicHierarchy<
+            AtomicOmpBuiltIn,   // grid atomics
+            AtomicOmpBuiltIn,    // block atomics
+            AtomicOmpBuiltIn     // thread atomics
+        >,
+        public math::MathStdLib,
+        public BlockSharedMemDynMember<>,
+        public BlockSharedMemStOmp5,
+        public BlockSyncBarrierOmp,
+        // cannot determine which intrinsics are safe to use (depends on target), using fallback
+        public IntrinsicFallback,
+        public rand::RandStdLib,
+        public TimeOmp,
+        public warp::WarpSingleThread,
+        public concepts::Implements<ConceptAcc, AccOmp5<TDim, TIdx>>
+    {
+        static_assert(
+            sizeof(TIdx) >= sizeof(int),
+            "Index type is not supported, consider using int or a larger type.");
+
+    public:
+        // Partial specialization with the correct TDim and TIdx is not allowed.
+        template<typename TDim2, typename TIdx2, typename TKernelFnObj, typename... TArgs>
+        friend class ::alpaka::TaskKernelOmp5; // grants the kernel task access to the private constructor
+
+    private:
+        //-----------------------------------------------------------------------------
+        AccOmp5(
+            Vec<TDim, TIdx> const& gridBlockExtent,
+            Vec<TDim, TIdx> const& blockThreadExtent,
+            Vec<TDim, TIdx> const& threadElemExtent,
+            TIdx const& gridBlockIdx,
+            std::size_t const& blockSharedMemDynSizeBytes)
+            : WorkDivMembers<TDim, TIdx>(gridBlockExtent, blockThreadExtent, threadElemExtent)
+            , gb::IdxGbLinear<TDim, TIdx>(gridBlockIdx)
+            , bt::IdxBtOmp<TDim, TIdx>()
+            , AtomicHierarchy<
+                  AtomicOmpBuiltIn, // atomics between grids
+                  AtomicOmpBuiltIn, // atomics between blocks
+                  AtomicOmpBuiltIn // atomics between threads
+                  >()
+            , math::MathStdLib()
+            , BlockSharedMemDynMember<>(blockSharedMemDynSizeBytes)
+            ,
+            //! \TODO Can some TMP determine the amount of statically allocated smem from the kernelFuncObj?
+            BlockSharedMemStOmp5(staticMemBegin(), staticMemCapacity())
+            , BlockSyncBarrierOmp()
+            , rand::RandStdLib()
+            , TimeOmp()
+        {
+        }
+
+    public:
+        //-----------------------------------------------------------------------------
+        AccOmp5(AccOmp5 const&) = delete;
+        //-----------------------------------------------------------------------------
+        AccOmp5(AccOmp5&&) = delete;
+        //-----------------------------------------------------------------------------
+        auto operator=(AccOmp5 const&) -> AccOmp5& = delete;
+        //-----------------------------------------------------------------------------
+        auto operator=(AccOmp5&&) -> AccOmp5& = delete;
+        //-----------------------------------------------------------------------------
+        /*virtual*/ ~AccOmp5() = default;
+    };
+
+    namespace traits
+    {
+        //#############################################################################
+        //! The OpenMP 5.0 accelerator accelerator type trait specialization.
+        template<typename TDim, typename TIdx>
+        struct AccType<AccOmp5<TDim, TIdx>>
+        {
+            using type = AccOmp5<TDim, TIdx>;
+        };
+        //#############################################################################
+        //! The OpenMP 5.0 accelerator device properties get trait specialization.
+        template<typename TDim, typename TIdx>
+        struct GetAccDevProps<AccOmp5<TDim, TIdx>>
+        {
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto getAccDevProps(DevOmp5 const& dev) -> AccDevProps<TDim, TIdx>
+            {
+                alpaka::ignore_unused(dev); // dev is not queried; ::omp_get_max_threads() is used instead
+
+#    if defined(ALPAKA_OFFLOAD_MAX_BLOCK_SIZE) && ALPAKA_OFFLOAD_MAX_BLOCK_SIZE > 0 // optional block size cap
+                auto const blockThreadCount = std::min(::omp_get_max_threads(), ALPAKA_OFFLOAD_MAX_BLOCK_SIZE);
+#    else
+                auto const blockThreadCount = ::omp_get_max_threads();
+#    endif
+#    ifdef ALPAKA_CI // CI builds: cap block threads and grid blocks at 4
+                auto const blockThreadCountMax(alpaka::core::clipCast<TIdx>(std::min(4, blockThreadCount)));
+                auto const gridBlockCountMax(alpaka::core::clipCast<TIdx>(std::min(4, ::omp_get_max_threads())));
+#    else
+                auto const blockThreadCountMax(alpaka::core::clipCast<TIdx>(blockThreadCount));
+                //! \todo for a later OpenMP (or when compilers work with a GPU target): fix max block size for target
+                //!  On CPU we would want
+                //!  gridBlockCountMax = ::omp_get_max_threads() / blockThreadCountMax
+                //!  but this would lead to only one block running on GPU, or too small blocks (see
+                //!  ALPAKA_OFFLOAD_MAX_BLOCK_SIZE). OpenMP 5.0 may actually mandate that
+                //!  ::omp_get_max_threads() == max_teams * threads_per_team ,
+                //!  however with the maximum grid size (i.e. max_teams) being INT_MAX this may not work.
+                //!  We actually want to set
+                //!  gridBlockCountMax = ::omp_get_max_teams()
+                //!  but there is no function ::omp_get_max_teams().
+                //!  Instead we set ::omp_get_max_threads() again, to have a
+                //!  number which does not kill CPUs and is reasonable
+                //!  (::omp_get_max_threads() seems to return the block size)
+                //!  for GPUs.
+                auto const gridBlockCountMax(alpaka::core::clipCast<TIdx>(::omp_get_max_threads()));
+#    endif
+                return {// m_multiProcessorCount (reported as gridBlockCountMax)
+                        static_cast<TIdx>(gridBlockCountMax),
+                        // m_gridBlockExtentMax
+                        Vec<TDim, TIdx>::all(std::numeric_limits<TIdx>::max()),
+                        // m_gridBlockCountMax
+                        std::numeric_limits<TIdx>::max(),
+                        // m_blockThreadExtentMax
+                        Vec<TDim, TIdx>::all(blockThreadCountMax),
+                        // m_blockThreadCountMax
+                        blockThreadCountMax,
+                        // m_threadElemExtentMax
+                        Vec<TDim, TIdx>::all(std::numeric_limits<TIdx>::max()),
+                        // m_threadElemCountMax
+                        std::numeric_limits<TIdx>::max(),
+                        // m_sharedMemSizeBytes
+                        AccOmp5<TDim, TIdx>::staticAllocBytes()};
+            }
+        };
+        //#############################################################################
+        //! The OpenMP 5.0 accelerator name trait specialization.
+        template<typename TDim, typename TIdx>
+        struct GetAccName<AccOmp5<TDim, TIdx>>
+        {
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto getAccName() -> std::string
+            {
+                return "AccOmp5<" + std::to_string(TDim::value) + "," + typeid(TIdx).name() + ">";
+            }
+        };
+
+        //#############################################################################
+        //! The OpenMP 5.0 accelerator device type trait specialization.
+        template<typename TDim, typename TIdx>
+        struct DevType<AccOmp5<TDim, TIdx>>
+        {
+            using type = DevOmp5;
+        };
+
+        //#############################################################################
+        //! The OpenMP 5.0 accelerator dimension getter trait specialization.
+        template<typename TDim, typename TIdx>
+        struct DimType<AccOmp5<TDim, TIdx>>
+        {
+            using type = TDim;
+        };
+
+        //#############################################################################
+        //! The OpenMP 5.0 accelerator execution task type trait specialization.
+        template<typename TDim, typename TIdx, typename TWorkDiv, typename TKernelFnObj, typename... TArgs>
+        struct CreateTaskKernel<AccOmp5<TDim, TIdx>, TWorkDiv, TKernelFnObj, TArgs...>
+        {
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto createTaskKernel(
+                TWorkDiv const& workDiv,
+                TKernelFnObj const& kernelFnObj,
+                TArgs&&... args)
+            {
+                return TaskKernelOmp5<TDim, TIdx, TKernelFnObj, TArgs...>(
+                    workDiv,
+                    kernelFnObj,
+                    std::forward<TArgs>(args)...);
+            }
+        };
+
+        //#############################################################################
+        //! The OpenMP 5.0 execution task platform type trait specialization.
+        template<typename TDim, typename TIdx>
+        struct PltfType<AccOmp5<TDim, TIdx>>
+        {
+            using type = PltfOmp5;
+        };
+
+        //#############################################################################
+        //! The OpenMP 5.0 accelerator idx type trait specialization.
+        template<typename TDim, typename TIdx>
+        struct IdxType<AccOmp5<TDim, TIdx>>
+        {
+            using type = TIdx;
+        };
+    } // namespace traits
+} // namespace alpaka
+
+#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/acc/Traits.hpp b/thirdParty/cupla/alpaka/include/alpaka/acc/Traits.hpp
index 10d0f1570e..2a68616397 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/acc/Traits.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/acc/Traits.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Benjamin Worpitz
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -11,117 +11,162 @@
 
 #include <alpaka/acc/AccDevProps.hpp>
 #include <alpaka/core/Common.hpp>
-
 #include <alpaka/core/Concepts.hpp>
-#include <alpaka/queue/Traits.hpp>
+#include <alpaka/dev/Traits.hpp>
+#include <alpaka/dim/Traits.hpp>
+#include <alpaka/idx/Traits.hpp>
+#include <alpaka/kernel/Traits.hpp>
 #include <alpaka/pltf/Traits.hpp>
+#include <alpaka/queue/Traits.hpp>
 
 #include <string>
-#include <typeinfo>
 #include <type_traits>
+#include <typeinfo>
 
 namespace alpaka
 {
+    struct ConceptUniformCudaHip
+    {
+    };
+
+    struct ConceptAcc
+    {
+    };
     //-----------------------------------------------------------------------------
-    //! The accelerator specifics.
-    namespace acc
+    //! The accelerator traits.
+    namespace traits
     {
-        struct ConceptAcc;
+        //#############################################################################
+        //! The accelerator type trait.
+        template<typename T, typename TSfinae = void>
+        struct AccType;
+
+        //#############################################################################
+        //! The device properties get trait.
+        template<typename TAcc, typename TSfinae = void>
+        struct GetAccDevProps;
 
-        //-----------------------------------------------------------------------------
-        //! The accelerator traits.
-        namespace traits
+        //#############################################################################
+        //! The accelerator name trait.
+        //!
+        //! The default implementation returns the implementation-defined (possibly mangled) name from typeid.
+        template<typename TAcc, typename TSfinae = void>
+        struct GetAccName
         {
-            //#############################################################################
-            //! The accelerator type trait.
-            template<
-                typename T,
-                typename TSfinae = void>
-            struct AccType;
-
-            //#############################################################################
-            //! The device properties get trait.
-            template<
-                typename TAcc,
-                typename TSfinae = void>
-            struct GetAccDevProps;
-
-            //#############################################################################
-            //! The accelerator name trait.
-            //!
-            //! The default implementation returns the mangled class name.
-            template<
-                typename TAcc,
-                typename TSfinae = void>
-            struct GetAccName
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto getAccName() -> std::string
             {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto getAccName()
-                -> std::string
-                {
-                    return typeid(TAcc).name();
-                }
-            };
-        }
+                return typeid(TAcc).name();
+            }
+        };
 
         //#############################################################################
-        //! The accelerator type trait alias template to remove the ::type.
-        template<
-            typename T>
-        using Acc = typename traits::AccType<T>::type;
-
-        //-----------------------------------------------------------------------------
-        //! \return The acceleration properties on the given device.
-        template<
-            typename TAcc,
-            typename TDev>
-        ALPAKA_FN_HOST auto getAccDevProps(
-            TDev const & dev)
-        -> AccDevProps<dim::Dim<TAcc>, idx::Idx<TAcc>>
-        {
-            return
-                traits::GetAccDevProps<
-                    TAcc>
-                ::getAccDevProps(
-                    dev);
-        }
-
-        //-----------------------------------------------------------------------------
-        //! \return The accelerator name
-        //!
-        //! \tparam TAcc The accelerator type.
-        template<
-            typename TAcc>
-        ALPAKA_FN_HOST auto getAccName()
-        -> std::string
+        //! The GPU CUDA accelerator device properties get trait specialization.
+        template<typename TAcc>
+        struct GetAccDevProps<
+            TAcc,
+            typename std::enable_if<concepts::ImplementsConcept<ConceptUniformCudaHip, TAcc>::value>::type>
         {
-            return
-                traits::GetAccName<
-                    TAcc>
-                ::getAccName();
-        }
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto getAccDevProps(typename alpaka::traits::DevType<TAcc>::type const& dev)
+                -> AccDevProps<typename traits::DimType<TAcc>::type, typename traits::IdxType<TAcc>::type>
+            {
+                using ImplementationBase = typename concepts::ImplementationBase<ConceptUniformCudaHip, TAcc>;
+                return GetAccDevProps<ImplementationBase>::getAccDevProps(dev);
+            }
+        };
+    } // namespace traits
+
+    //#############################################################################
+    //! The accelerator type trait alias template to remove the ::type.
+    template<typename T>
+    using Acc = typename traits::AccType<T>::type;
+
+    //-----------------------------------------------------------------------------
+    //! \return The accelerator device properties of the given device.
+    template<typename TAcc, typename TDev>
+    ALPAKA_FN_HOST auto getAccDevProps(TDev const& dev) -> AccDevProps<Dim<TAcc>, Idx<TAcc>>
+    {
+        using ImplementationBase = concepts::ImplementationBase<ConceptAcc, TAcc>;
+        return traits::GetAccDevProps<ImplementationBase>::getAccDevProps(dev);
     }
 
-    namespace queue
+    //-----------------------------------------------------------------------------
+    //! \return The accelerator name
+    //!
+    //! \tparam TAcc The accelerator type.
+    template<typename TAcc>
+    ALPAKA_FN_HOST auto getAccName() -> std::string
     {
-        namespace traits
+        return traits::GetAccName<TAcc>::getAccName();
+    }
+
+    namespace detail
+    {
+        template<typename TAcc>
+        struct CheckFnReturnType<
+            TAcc,
+            typename std::enable_if<concepts::ImplementsConcept<ConceptUniformCudaHip, TAcc>::value>::type>
         {
-            template<
-                typename TAcc,
-                typename TProperty>
-            struct QueueType<
-                TAcc,
-                TProperty,
-                typename std::enable_if<
-                    concepts::ImplementsConcept<acc::ConceptAcc, TAcc>::value
-                >::type
-            >
+            template<typename TKernelFnObj, typename... TArgs>
+            void operator()(TKernelFnObj const& kernelFnObj, TArgs const&... args)
             {
-                using type = typename QueueType<
-                    typename pltf::traits::PltfType<TAcc>::type,
-                    TProperty
-                >::type;
-            };
-        }
-    }
-}
+                using ImplementationBase = typename concepts::ImplementationBase<ConceptUniformCudaHip, TAcc>;
+                CheckFnReturnType<ImplementationBase>{}(kernelFnObj, args...);
+            }
+        };
+    } // namespace detail
+
+    namespace traits
+    {
+        //#############################################################################
+        //! The GPU HIP accelerator device type trait specialization.
+        template<typename TAcc>
+        struct DevType<
+            TAcc,
+            typename std::enable_if<concepts::ImplementsConcept<ConceptUniformCudaHip, TAcc>::value>::type>
+        {
+            using ImplementationBase = typename concepts::ImplementationBase<ConceptUniformCudaHip, TAcc>;
+            using type = typename DevType<ImplementationBase>::type;
+        };
+
+        //#############################################################################
+        //! The GPU CUDA/HIP execution task platform type trait specialization.
+        template<typename TAcc>
+        struct PltfType<
+            TAcc,
+            typename std::enable_if<concepts::ImplementsConcept<ConceptUniformCudaHip, TAcc>::value>::type>
+        {
+            using ImplementationBase = typename concepts::ImplementationBase<ConceptUniformCudaHip, TAcc>;
+            using type = typename PltfType<ImplementationBase>::type;
+        };
+
+        //#############################################################################
+        //! The GPU HIP accelerator dimension getter trait specialization.
+        template<typename TAcc>
+        struct DimType<
+            TAcc,
+            typename std::enable_if<concepts::ImplementsConcept<ConceptUniformCudaHip, TAcc>::value>::type>
+        {
+            using ImplementationBase = typename concepts::ImplementationBase<ConceptUniformCudaHip, TAcc>;
+            using type = typename DimType<ImplementationBase>::type;
+        };
+
+        //#############################################################################
+        //! The GPU HIP accelerator idx type trait specialization.
+        template<typename TAcc>
+        struct IdxType<
+            TAcc,
+            typename std::enable_if<concepts::ImplementsConcept<ConceptUniformCudaHip, TAcc>::value>::type>
+        {
+            using ImplementationBase = typename concepts::ImplementationBase<ConceptUniformCudaHip, TAcc>;
+            using type = typename IdxType<ImplementationBase>::type;
+        };
+
+        template<typename TAcc, typename TProperty>
+        struct QueueType<TAcc, TProperty, std::enable_if_t<concepts::ImplementsConcept<ConceptAcc, TAcc>::value>>
+        {
+            using type = typename QueueType<typename alpaka::traits::PltfType<TAcc>::type, TProperty>::type;
+        };
+    } // namespace traits
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/include/alpaka/alpaka.hpp b/thirdParty/cupla/alpaka/include/alpaka/alpaka.hpp
index a268b6e0f4..7fad5bdb11 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/alpaka.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/alpaka.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Axel Huebl, Benjamin Worpitz, Erik Zenker, Matthias Werner, René Widera
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -18,56 +18,57 @@
 #include <alpaka/version.hpp>
 //-----------------------------------------------------------------------------
 // acc
-#include <alpaka/acc/AccCpuSerial.hpp>
-#include <alpaka/acc/AccCpuThreads.hpp>
 #include <alpaka/acc/AccCpuFibers.hpp>
-#include <alpaka/acc/AccCpuTbbBlocks.hpp>
 #include <alpaka/acc/AccCpuOmp2Blocks.hpp>
 #include <alpaka/acc/AccCpuOmp2Threads.hpp>
-#include <alpaka/acc/AccCpuOmp4.hpp>
+#include <alpaka/acc/AccCpuSerial.hpp>
+#include <alpaka/acc/AccCpuTbbBlocks.hpp>
+#include <alpaka/acc/AccCpuThreads.hpp>
+#include <alpaka/acc/AccDevProps.hpp>
 #include <alpaka/acc/AccGpuCudaRt.hpp>
 #include <alpaka/acc/AccGpuHipRt.hpp>
-#include <alpaka/acc/AccDevProps.hpp>
+#include <alpaka/acc/AccGpuUniformCudaHipRt.hpp>
+#include <alpaka/acc/AccOacc.hpp>
+#include <alpaka/acc/AccOmp5.hpp>
 #include <alpaka/acc/Traits.hpp>
 //-----------------------------------------------------------------------------
 // atomic
-#include <alpaka/atomic/AtomicCudaBuiltIn.hpp>
-#include <alpaka/atomic/AtomicHipBuiltIn.hpp>
 #include <alpaka/atomic/AtomicNoOp.hpp>
 #include <alpaka/atomic/AtomicOmpBuiltIn.hpp>
 #include <alpaka/atomic/AtomicStdLibLock.hpp>
+#include <alpaka/atomic/AtomicUniformCudaHipBuiltIn.hpp>
 #include <alpaka/atomic/Op.hpp>
 #include <alpaka/atomic/Traits.hpp>
 //-----------------------------------------------------------------------------
 // block
-    //-----------------------------------------------------------------------------
-    // shared
-        //-----------------------------------------------------------------------------
-        // dynamic
-        #include <alpaka/block/shared/dyn/BlockSharedMemDynBoostAlignedAlloc.hpp>
-        #include <alpaka/block/shared/dyn/BlockSharedMemDynCudaBuiltIn.hpp>
-        #include <alpaka/block/shared/dyn/BlockSharedMemDynHipBuiltIn.hpp>
-        #include <alpaka/block/shared/dyn/Traits.hpp>
-        //-----------------------------------------------------------------------------
-        // static
-        #include <alpaka/block/shared/st/BlockSharedMemStCudaBuiltIn.hpp>
-        #include <alpaka/block/shared/st/BlockSharedMemStHipBuiltIn.hpp>
-        #include <alpaka/block/shared/st/BlockSharedMemStMasterSync.hpp>
-        #include <alpaka/block/shared/st/BlockSharedMemStNoSync.hpp>
-        #include <alpaka/block/shared/st/Traits.hpp>
-    //-----------------------------------------------------------------------------
-    // sync
-    #include <alpaka/block/sync/BlockSyncBarrierFiber.hpp>
-    #include <alpaka/block/sync/BlockSyncBarrierOmp.hpp>
-    #include <alpaka/block/sync/BlockSyncBarrierThread.hpp>
-    #include <alpaka/block/sync/BlockSyncCudaBuiltIn.hpp>
-    #include <alpaka/block/sync/BlockSyncHipBuiltIn.hpp>
-    #include <alpaka/block/sync/BlockSyncNoOp.hpp>
-    #include <alpaka/block/sync/Traits.hpp>
+//-----------------------------------------------------------------------------
+// shared
+//-----------------------------------------------------------------------------
+// dynamic
+#include <alpaka/block/shared/dyn/BlockSharedMemDynAlignedAlloc.hpp>
+#include <alpaka/block/shared/dyn/BlockSharedMemDynMember.hpp>
+#include <alpaka/block/shared/dyn/BlockSharedMemDynUniformCudaHipBuiltIn.hpp>
+#include <alpaka/block/shared/dyn/Traits.hpp>
+//-----------------------------------------------------------------------------
+// static
+#include <alpaka/block/shared/st/BlockSharedMemStMasterSync.hpp>
+#include <alpaka/block/shared/st/BlockSharedMemStMember.hpp>
+#include <alpaka/block/shared/st/BlockSharedMemStNoSync.hpp>
+#include <alpaka/block/shared/st/BlockSharedMemStUniformCudaHipBuiltIn.hpp>
+#include <alpaka/block/shared/st/Traits.hpp>
+//-----------------------------------------------------------------------------
+// sync
+#include <alpaka/block/sync/BlockSyncBarrierFiber.hpp>
+#include <alpaka/block/sync/BlockSyncBarrierOmp.hpp>
+#include <alpaka/block/sync/BlockSyncBarrierThread.hpp>
+#include <alpaka/block/sync/BlockSyncNoOp.hpp>
+#include <alpaka/block/sync/BlockSyncUniformCudaHipBuiltIn.hpp>
+#include <alpaka/block/sync/Traits.hpp>
 //-----------------------------------------------------------------------------
 // core
-#include <alpaka/core/Assert.hpp>
 #include <alpaka/core/Align.hpp>
+#include <alpaka/core/AlignedAlloc.hpp>
+#include <alpaka/core/Assert.hpp>
 #include <alpaka/core/BarrierThread.hpp>
 #include <alpaka/core/BoostPredef.hpp>
 #include <alpaka/core/ClipCast.hpp>
@@ -78,6 +79,7 @@
 #include <alpaka/core/Debug.hpp>
 #include <alpaka/core/Fibers.hpp>
 #include <alpaka/core/Hip.hpp>
+#include <alpaka/core/OmpSchedule.hpp>
 #include <alpaka/core/Positioning.hpp>
 #include <alpaka/core/Unroll.hpp>
 #include <alpaka/core/Unused.hpp>
@@ -85,11 +87,10 @@
 #include <alpaka/core/Vectorize.hpp>
 //-----------------------------------------------------------------------------
 // dev
-#include <alpaka/dev/DevCudaRt.hpp>
 #include <alpaka/dev/DevCpu.hpp>
-#include <alpaka/dev/DevHipRt.hpp>
-#include <alpaka/dev/cpu/Wait.hpp>
+#include <alpaka/dev/DevUniformCudaHipRt.hpp>
 #include <alpaka/dev/Traits.hpp>
+#include <alpaka/dev/cpu/Wait.hpp>
 //-----------------------------------------------------------------------------
 // dim
 #include <alpaka/dim/DimArithmetic.hpp>
@@ -97,60 +98,58 @@
 #include <alpaka/dim/Traits.hpp>
 //-----------------------------------------------------------------------------
 // event
-#include <alpaka/event/EventCudaRt.hpp>
-#include <alpaka/event/EventHipRt.hpp>
 #include <alpaka/event/EventCpu.hpp>
+#include <alpaka/event/EventOacc.hpp>
+#include <alpaka/event/EventOmp5.hpp>
+#include <alpaka/event/EventUniformCudaHipRt.hpp>
 #include <alpaka/event/Traits.hpp>
 //-----------------------------------------------------------------------------
 // extent
 #include <alpaka/extent/Traits.hpp>
 //-----------------------------------------------------------------------------
 // idx
-#include <alpaka/idx/bt/IdxBtCudaBuiltIn.hpp>
-#include <alpaka/idx/bt/IdxBtHipBuiltIn.hpp>
+#include <alpaka/idx/Accessors.hpp>
+#include <alpaka/idx/MapIdx.hpp>
+#include <alpaka/idx/Traits.hpp>
 #include <alpaka/idx/bt/IdxBtOmp.hpp>
 #include <alpaka/idx/bt/IdxBtRefFiberIdMap.hpp>
 #include <alpaka/idx/bt/IdxBtRefThreadIdMap.hpp>
+#include <alpaka/idx/bt/IdxBtUniformCudaHipBuiltIn.hpp>
 #include <alpaka/idx/bt/IdxBtZero.hpp>
-#include <alpaka/idx/gb/IdxGbCudaBuiltIn.hpp>
 #include <alpaka/idx/gb/IdxGbRef.hpp>
-#include <alpaka/idx/Accessors.hpp>
-#include <alpaka/idx/Traits.hpp>
-#include <alpaka/idx/MapIdx.hpp>
+#include <alpaka/idx/gb/IdxGbUniformCudaHipBuiltIn.hpp>
 //-----------------------------------------------------------------------------
 // kernel
-#include <alpaka/kernel/TaskKernelCpuSerial.hpp>
-#include <alpaka/kernel/TaskKernelCpuThreads.hpp>
 #include <alpaka/kernel/TaskKernelCpuFibers.hpp>
-#include <alpaka/kernel/TaskKernelCpuTbbBlocks.hpp>
 #include <alpaka/kernel/TaskKernelCpuOmp2Blocks.hpp>
 #include <alpaka/kernel/TaskKernelCpuOmp2Threads.hpp>
-#include <alpaka/kernel/TaskKernelCpuOmp4.hpp>
-#include <alpaka/kernel/TaskKernelGpuCudaRt.hpp>
-#include <alpaka/kernel/TaskKernelGpuHipRt.hpp>
+#include <alpaka/kernel/TaskKernelCpuSerial.hpp>
+#include <alpaka/kernel/TaskKernelCpuTbbBlocks.hpp>
+#include <alpaka/kernel/TaskKernelCpuThreads.hpp>
+#include <alpaka/kernel/TaskKernelGpuUniformCudaHipRt.hpp>
+#include <alpaka/kernel/TaskKernelOacc.hpp>
+#include <alpaka/kernel/TaskKernelOmp5.hpp>
 #include <alpaka/kernel/Traits.hpp>
 //-----------------------------------------------------------------------------
 // math
-#include <alpaka/math/MathCudaBuiltIn.hpp>
-#include <alpaka/math/MathHipBuiltIn.hpp>
 #include <alpaka/math/MathStdLib.hpp>
+#include <alpaka/math/MathUniformCudaHipBuiltIn.hpp>
 //-----------------------------------------------------------------------------
 // mem
-#include <alpaka/mem/alloc/AllocCpuBoostAligned.hpp>
+#include <alpaka/mem/alloc/AllocCpuAligned.hpp>
 #include <alpaka/mem/alloc/AllocCpuNew.hpp>
 #include <alpaka/mem/alloc/Traits.hpp>
-
 #include <alpaka/mem/buf/BufCpu.hpp>
-#include <alpaka/mem/buf/BufCudaRt.hpp>
-#include <alpaka/mem/buf/BufHipRt.hpp>
+#include <alpaka/mem/buf/BufOacc.hpp>
+#include <alpaka/mem/buf/BufOmp5.hpp>
+#include <alpaka/mem/buf/BufUniformCudaHipRt.hpp>
 #include <alpaka/mem/buf/Traits.hpp>
-
+#include <alpaka/mem/view/Traits.hpp>
 #include <alpaka/mem/view/ViewCompileTimeArray.hpp>
 #include <alpaka/mem/view/ViewPlainPtr.hpp>
 #include <alpaka/mem/view/ViewStdArray.hpp>
 #include <alpaka/mem/view/ViewStdVector.hpp>
 #include <alpaka/mem/view/ViewSubView.hpp>
-#include <alpaka/mem/view/Traits.hpp>
 //-----------------------------------------------------------------------------
 // meta
 #include <alpaka/meta/Apply.hpp>
@@ -161,6 +160,7 @@
 #include <alpaka/meta/Filter.hpp>
 #include <alpaka/meta/Fold.hpp>
 #include <alpaka/meta/ForEachType.hpp>
+#include <alpaka/meta/Functional.hpp>
 #include <alpaka/meta/IntegerSequence.hpp>
 #include <alpaka/meta/Integral.hpp>
 #include <alpaka/meta/IsStrictBase.hpp>
@@ -168,32 +168,34 @@
 #include <alpaka/meta/NdLoop.hpp>
 #include <alpaka/meta/Set.hpp>
 #include <alpaka/meta/Transform.hpp>
+#include <alpaka/meta/Void.hpp>
 //-----------------------------------------------------------------------------
 // offset
 #include <alpaka/offset/Traits.hpp>
 //-----------------------------------------------------------------------------
 // platform
 #include <alpaka/pltf/PltfCpu.hpp>
-#include <alpaka/pltf/PltfCudaRt.hpp>
-#include <alpaka/pltf/PltfHipRt.hpp>
+#include <alpaka/pltf/PltfOacc.hpp>
+#include <alpaka/pltf/PltfOmp5.hpp>
+#include <alpaka/pltf/PltfUniformCudaHipRt.hpp>
 #include <alpaka/pltf/Traits.hpp>
 //-----------------------------------------------------------------------------
 // rand
-#include <alpaka/rand/RandCuRand.hpp>
-#include <alpaka/rand/RandHipRand.hpp>
-#include <alpaka/rand/RandStdLib.hpp>
+#include <alpaka/rand/RandUniformCudaHipRand.hpp>
 #include <alpaka/rand/Traits.hpp>
 //-----------------------------------------------------------------------------
 // idx
 #include <alpaka/idx/Traits.hpp>
 //-----------------------------------------------------------------------------
 // queue
-#include <alpaka/queue/QueueCudaRtNonBlocking.hpp>
-#include <alpaka/queue/QueueCudaRtBlocking.hpp>
-#include <alpaka/queue/QueueCpuNonBlocking.hpp>
+#include <alpaka/queue/Properties.hpp>
 #include <alpaka/queue/QueueCpuBlocking.hpp>
+#include <alpaka/queue/QueueCpuNonBlocking.hpp>
+#include <alpaka/queue/QueueOaccBlocking.hpp>
+#include <alpaka/queue/QueueOmp5Blocking.hpp>
+#include <alpaka/queue/QueueUniformCudaHipRtBlocking.hpp>
+#include <alpaka/queue/QueueUniformCudaHipRtNonBlocking.hpp>
 #include <alpaka/queue/Traits.hpp>
-#include <alpaka/queue/Properties.hpp>
 //-----------------------------------------------------------------------------
 // time
 #include <alpaka/time/Traits.hpp>
@@ -202,10 +204,10 @@
 #include <alpaka/wait/Traits.hpp>
 //-----------------------------------------------------------------------------
 // workdiv
-#include <alpaka/workdiv/WorkDivMembers.hpp>
 #include <alpaka/workdiv/Traits.hpp>
 #include <alpaka/workdiv/WorkDivHelpers.hpp>
+#include <alpaka/workdiv/WorkDivMembers.hpp>
 //-----------------------------------------------------------------------------
 // vec
-#include <alpaka/vec/Vec.hpp>
 #include <alpaka/vec/Traits.hpp>
+#include <alpaka/vec/Vec.hpp>
diff --git a/thirdParty/cupla/alpaka/include/alpaka/atomic/AtomicCudaBuiltIn.hpp b/thirdParty/cupla/alpaka/include/alpaka/atomic/AtomicCudaBuiltIn.hpp
deleted file mode 100644
index 410c8bae73..0000000000
--- a/thirdParty/cupla/alpaka/include/alpaka/atomic/AtomicCudaBuiltIn.hpp
+++ /dev/null
@@ -1,1205 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz, René Widera
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_CUDA
-    #error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
-#endif
-
-#include <alpaka/core/Unused.hpp>
-#include <alpaka/atomic/Op.hpp>
-#include <alpaka/atomic/Traits.hpp>
-#include <alpaka/meta/DependentFalseType.hpp>
-
-#include <climits>
-
-namespace alpaka
-{
-    namespace atomic
-    {
-        //#############################################################################
-        //! The GPU CUDA accelerator atomic ops.
-        //
-        //  Atomics can used in the hierarchy level grids, blocks and threads.
-        //  Atomics are not guaranteed to be save between devices
-        class AtomicCudaBuiltIn
-        {
-        public:
-            //-----------------------------------------------------------------------------
-            AtomicCudaBuiltIn() = default;
-            //-----------------------------------------------------------------------------
-            __device__ AtomicCudaBuiltIn(AtomicCudaBuiltIn const &) = delete;
-            //-----------------------------------------------------------------------------
-            __device__ AtomicCudaBuiltIn(AtomicCudaBuiltIn &&) = delete;
-            //-----------------------------------------------------------------------------
-            __device__ auto operator=(AtomicCudaBuiltIn const &) -> AtomicCudaBuiltIn & = delete;
-            //-----------------------------------------------------------------------------
-            __device__ auto operator=(AtomicCudaBuiltIn &&) -> AtomicCudaBuiltIn & = delete;
-            //-----------------------------------------------------------------------------
-            /*virtual*/ ~AtomicCudaBuiltIn() = default;
-        };
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The specializations to execute the requested atomic ops of the CUDA accelerator.
-            // See: http://docs.nvidia.com/cuda/cuda-c-programming-guide/#atomic-functions how to implement everything with CAS
-
-            //-----------------------------------------------------------------------------
-            // Add.
-
-            //-----------------------------------------------------------------------------
-            //! The GPU CUDA accelerator atomic operation.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::Add,
-                atomic::AtomicCudaBuiltIn,
-                int,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                __device__ static auto atomicOp(
-                    atomic::AtomicCudaBuiltIn const &,
-                    int * const addr,
-                    int const & value)
-                -> int
-                {
-                    return atomicAdd(addr, value);
-                }
-            };
-            //-----------------------------------------------------------------------------
-            //! The GPU CUDA accelerator atomic operation.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::Add,
-                atomic::AtomicCudaBuiltIn,
-                unsigned int,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                __device__ static auto atomicOp(
-                    atomic::AtomicCudaBuiltIn const &,
-                    unsigned int * const addr,
-                    unsigned int const & value)
-                -> unsigned int
-                {
-                    return atomicAdd(addr, value);
-                }
-            };
-            //-----------------------------------------------------------------------------
-            //! The GPU CUDA accelerator atomic operation.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::Add,
-                atomic::AtomicCudaBuiltIn,
-                unsigned long int,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                //
-                __device__ static auto atomicOp(
-                    atomic::AtomicCudaBuiltIn const &,
-                    unsigned long int * const addr,
-                    unsigned long int const & value)
-                -> unsigned long int
-                {
-#if UINT_MAX == ULONG_MAX // LLP64
-                    return atomicAdd(
-                        reinterpret_cast<unsigned int *>(addr),
-                        static_cast<unsigned int>(value));
-#else // ULONG_MAX == ULLONG_MAX LP64
-                    return atomicAdd(
-                        reinterpret_cast<unsigned long long int *>(addr),
-                        static_cast<unsigned long long int>(value));
-#endif
-                }
-            };
-            //-----------------------------------------------------------------------------
-            //! The GPU CUDA accelerator atomic operation.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::Add,
-                atomic::AtomicCudaBuiltIn,
-                unsigned long long int,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                //
-                __device__ static auto atomicOp(
-                    atomic::AtomicCudaBuiltIn const &,
-                    unsigned long long int * const addr,
-                    unsigned long long int const & value)
-                -> unsigned long long int
-                {
-                    return atomicAdd(addr, value);
-                }
-            };
-            //-----------------------------------------------------------------------------
-            //! The GPU CUDA accelerator atomic operation.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::Add,
-                atomic::AtomicCudaBuiltIn,
-                float,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                //
-                __device__ static auto atomicOp(
-                    atomic::AtomicCudaBuiltIn const &,
-                    float * const addr,
-                    float const & value)
-                -> float
-                {
-                    return atomicAdd(addr, value);
-                }
-            };
-            //-----------------------------------------------------------------------------
-            //! The GPU CUDA accelerator atomic operation.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::Add,
-                atomic::AtomicCudaBuiltIn,
-                double,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                __device__ static auto atomicOp(
-                    atomic::AtomicCudaBuiltIn const &,
-                    double * const addr,
-                    double const & value)
-                -> double
-                {
-#if BOOST_ARCH_PTX >= BOOST_VERSION_NUMBER(6, 0, 0)
-                    return atomicAdd(addr, value);
-#else
-                    // Code from: http://docs.nvidia.com/cuda/cuda-c-programming-guide/#atomic-functions
-
-                    unsigned long long int * address_as_ull(reinterpret_cast<unsigned long long int *>(addr));
-                    unsigned long long int old(*address_as_ull);
-                    unsigned long long int assumed;
-                    do
-                    {
-                        assumed = old;
-                        old = atomicCAS(
-                            address_as_ull,
-                            assumed,
-                            static_cast<unsigned long long>(__double_as_longlong(value + __longlong_as_double(static_cast<long long>(assumed)))));
-                        // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
-                    }
-                    while(assumed != old);
-                    return __longlong_as_double(static_cast<long long>(old));
-#endif
-                }
-            };
-
-            //-----------------------------------------------------------------------------
-            // Sub.
-
-            //-----------------------------------------------------------------------------
-            //! The GPU CUDA accelerator atomic operation.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::Sub,
-                atomic::AtomicCudaBuiltIn,
-                int,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                __device__ static auto atomicOp(
-                    atomic::AtomicCudaBuiltIn const &,
-                    int * const addr,
-                    int const & value)
-                -> int
-                {
-                    return atomicSub(addr, value);
-                }
-            };
-            //-----------------------------------------------------------------------------
-            //! The GPU CUDA accelerator atomic operation.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::Sub,
-                atomic::AtomicCudaBuiltIn,
-                unsigned int,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                __device__ static auto atomicOp(
-                    atomic::AtomicCudaBuiltIn const &,
-                    unsigned int * const addr,
-                    unsigned int const & value)
-                -> unsigned int
-                {
-                    return atomicSub(addr, value);
-                }
-            };
-            //-----------------------------------------------------------------------------
-            //! The GPU CUDA accelerator atomic operation.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::Sub,
-                atomic::AtomicCudaBuiltIn,
-                unsigned long int,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                //
-                __device__ static auto atomicOp(
-                    atomic::AtomicCudaBuiltIn const &,
-                    unsigned long int * const addr,
-                    unsigned long int const & value)
-                -> unsigned long int
-                {
-#if UINT_MAX == ULONG_MAX // LLP64
-                    return atomicSub(
-                        reinterpret_cast<unsigned int *>(addr),
-                        static_cast<unsigned int>(value));
-#else // ULONG_MAX == ULLONG_MAX LP64
-                    alpaka::ignore_unused(addr);
-                    alpaka::ignore_unused(value);
-                    static_assert(
-                        meta::DependentFalseType<THierarchy>::value,
-                        "atomicOp<op::Sub, atomic::AtomicCudaBuiltIn, unsigned long int> is only supported when sizeof(unsigned long int) == 4");
-#endif
-                }
-            };
-
-            //-----------------------------------------------------------------------------
-            // Min.
-
-            //-----------------------------------------------------------------------------
-            //! The GPU CUDA accelerator atomic operation.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::Min,
-                atomic::AtomicCudaBuiltIn,
-                int,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                __device__ static auto atomicOp(
-                    atomic::AtomicCudaBuiltIn const &,
-                    int * const addr,
-                    int const & value)
-                -> int
-                {
-                    return atomicMin(addr, value);
-                }
-            };
-            //-----------------------------------------------------------------------------
-            //! The GPU CUDA accelerator atomic operation.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::Min,
-                atomic::AtomicCudaBuiltIn,
-                unsigned int,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                __device__ static auto atomicOp(
-                    atomic::AtomicCudaBuiltIn const &,
-                    unsigned int * const addr,
-                    unsigned int const & value)
-                -> unsigned int
-                {
-                    return atomicMin(addr, value);
-                }
-            };
-            //-----------------------------------------------------------------------------
-            //! The GPU CUDA accelerator atomic operation.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::Min,
-                atomic::AtomicCudaBuiltIn,
-                unsigned long int,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                //
-                __device__ static auto atomicOp(
-                    atomic::AtomicCudaBuiltIn const &,
-                    unsigned long int * const addr,
-                    unsigned long int const & value)
-                -> unsigned long int
-                {
-#if UINT_MAX == ULONG_MAX // LLP64
-                    return atomicMin(
-                        reinterpret_cast<unsigned int *>(addr),
-                        static_cast<unsigned int>(value));
-#else // ULONG_MAX == ULLONG_MAX LP64
-#if BOOST_ARCH_PTX >= BOOST_VERSION_NUMBER(3, 5, 0)
-                    return atomicMin(
-                        reinterpret_cast<unsigned long long int *>(addr),
-                        static_cast<unsigned long long int>(value));
-#else
-                    alpaka::ignore_unused(addr);
-                    alpaka::ignore_unused(value);
-                    static_assert(
-                        meta::DependentFalseType<THierarchy>::value,
-                        "atomicOp<op::Min, atomic::AtomicCudaBuiltIn, unsigned long int> is only supported on sm >= 3.5");
-#endif
-#endif
-                }
-            };
-            //-----------------------------------------------------------------------------
-            //! The GPU CUDA accelerator atomic operation.
-            template<
-               typename THierarchy>
-            struct AtomicOp<
-                op::Min,
-                atomic::AtomicCudaBuiltIn,
-                unsigned long long int,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                __device__ static auto atomicOp(
-                    atomic::AtomicCudaBuiltIn const &,
-                    unsigned long long int * const addr,
-                    unsigned long long int const & value)
-                -> unsigned long long int
-                {
-#if BOOST_ARCH_PTX >= BOOST_VERSION_NUMBER(3, 5, 0)
-                    return atomicMin(addr, value);
-#else
-                    alpaka::ignore_unused(addr);
-                    alpaka::ignore_unused(value);
-                    static_assert(
-                        meta::DependentFalseType<THierarchy>::value,
-                        "atomicOp<op::Min, atomic::AtomicCudaBuiltIn, unsigned long long int> is only supported on sm >= 3.5");
-#endif
-                }
-            };
-
-            //-----------------------------------------------------------------------------
-            // Max.
-
-            //-----------------------------------------------------------------------------
-            //! The GPU CUDA accelerator atomic operation.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::Max,
-                atomic::AtomicCudaBuiltIn,
-                int,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                __device__ static auto atomicOp(
-                    atomic::AtomicCudaBuiltIn const &,
-                    int * const addr,
-                    int const & value)
-                -> int
-                {
-                    return atomicMax(addr, value);
-                }
-            };
-            //-----------------------------------------------------------------------------
-            //! The GPU CUDA accelerator atomic operation.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::Max,
-                atomic::AtomicCudaBuiltIn,
-                unsigned int,
-                THierarchy>
-            {
-                __device__ static auto atomicOp(
-                    atomic::AtomicCudaBuiltIn const &,
-                    unsigned int * const addr,
-                    unsigned int const & value)
-                -> unsigned int
-                {
-                    return atomicMax(addr, value);
-                }
-            };
-            //-----------------------------------------------------------------------------
-            //! The GPU CUDA accelerator atomic operation.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::Max,
-                atomic::AtomicCudaBuiltIn,
-                unsigned long int,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                //
-                __device__ static auto atomicOp(
-                    atomic::AtomicCudaBuiltIn const &,
-                    unsigned long int * const addr,
-                    unsigned long int const & value)
-                -> unsigned long int
-                {
-#if UINT_MAX == ULONG_MAX // LLP64
-                    return atomicMax(
-                        reinterpret_cast<unsigned int *>(addr),
-                        static_cast<unsigned int>(value));
-#else // ULONG_MAX == ULLONG_MAX LP64
-#if BOOST_ARCH_PTX >= BOOST_VERSION_NUMBER(3, 5, 0)
-                    return atomicMax(
-                        reinterpret_cast<unsigned long long int *>(addr),
-                        static_cast<unsigned long long int>(value));
-#else
-                    alpaka::ignore_unused(addr);
-                    alpaka::ignore_unused(value);
-                    static_assert(
-                        meta::DependentFalseType<THierarchy>::value,
-                        "atomicOp<op::Max, atomic::AtomicCudaBuiltIn, unsigned long int> is only supported on sm >= 3.5");
-#endif
-#endif
-                }
-            };
-            //-----------------------------------------------------------------------------
-            //! The GPU CUDA accelerator atomic operation.
-            template<
-               typename THierarchy>
-            struct AtomicOp<
-                op::Max,
-                atomic::AtomicCudaBuiltIn,
-                unsigned long long int,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                __device__ static auto atomicOp(
-                    atomic::AtomicCudaBuiltIn const &,
-                    unsigned long long int * const addr,
-                    unsigned long long int const & value)
-                -> unsigned long long int
-                {
-#if BOOST_ARCH_PTX >= BOOST_VERSION_NUMBER(3, 5, 0)
-                    return atomicMax(addr, value);
-#else
-                    alpaka::ignore_unused(addr);
-                    alpaka::ignore_unused(value);
-                    static_assert(
-                        meta::DependentFalseType<THierarchy>::value,
-                        "atomicOp<op::Max, atomic::AtomicCudaBuiltIn, unsigned long long int> is only supported on sm >= 3.5");
-#endif
-                }
-            };
-
-            //-----------------------------------------------------------------------------
-            // Exch.
-
-            //-----------------------------------------------------------------------------
-            //! The GPU CUDA accelerator atomic operation.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::Exch,
-                atomic::AtomicCudaBuiltIn,
-                int,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                __device__ static auto atomicOp(
-                    atomic::AtomicCudaBuiltIn const &,
-                    int * const addr,
-                    int const & value)
-                -> int
-                {
-                    return atomicExch(addr, value);
-                }
-            };
-            //-----------------------------------------------------------------------------
-            //! The GPU CUDA accelerator atomic operation.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::Exch,
-                atomic::AtomicCudaBuiltIn,
-                unsigned int,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                __device__ static auto atomicOp(
-                    atomic::AtomicCudaBuiltIn const &,
-                    unsigned int * const addr,
-                    unsigned int const & value)
-                -> unsigned int
-                {
-                    return atomicExch(addr, value);
-                }
-            };
-            //-----------------------------------------------------------------------------
-            //! The GPU CUDA accelerator atomic operation.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::Exch,
-                atomic::AtomicCudaBuiltIn,
-                unsigned long int,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                //
-                __device__ static auto atomicOp(
-                    atomic::AtomicCudaBuiltIn const &,
-                    unsigned long int * const addr,
-                    unsigned long int const & value)
-                -> unsigned long int
-                {
-#if UINT_MAX == ULONG_MAX // LLP64
-                    return atomicExch(
-                        reinterpret_cast<unsigned int *>(addr),
-                        static_cast<unsigned int>(value));
-#else // ULONG_MAX == ULLONG_MAX LP64
-                    return atomicExch(
-                        reinterpret_cast<unsigned long long int *>(addr),
-                        static_cast<unsigned long long int>(value));
-#endif
-                }
-            };
-            //-----------------------------------------------------------------------------
-            //! The GPU CUDA accelerator atomic operation.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::Exch,
-                atomic::AtomicCudaBuiltIn,
-                unsigned long long int,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                __device__ static auto atomicOp(
-                    atomic::AtomicCudaBuiltIn const &,
-                    unsigned long long int * const addr,
-                    unsigned long long int const & value)
-                -> unsigned long long int
-                {
-                    return atomicExch(addr, value);
-                }
-            };
-            //-----------------------------------------------------------------------------
-            //! The GPU CUDA accelerator atomic operation.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::Exch,
-                atomic::AtomicCudaBuiltIn,
-                float,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                __device__ static auto atomicOp(
-                    atomic::AtomicCudaBuiltIn const &,
-                    float * const addr,
-                    float const & value)
-                -> float
-                {
-                    return atomicExch(addr, value);
-                }
-            };
-
-            //-----------------------------------------------------------------------------
-            // Inc.
-
-            //-----------------------------------------------------------------------------
-            //! The GPU CUDA accelerator atomic operation.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::Inc,
-                atomic::AtomicCudaBuiltIn,
-                unsigned int,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                __device__ static auto atomicOp(
-                    atomic::AtomicCudaBuiltIn const &,
-                    unsigned int * const addr,
-                    unsigned int const & value)
-                -> unsigned int
-                {
-                    return atomicInc(addr, value);
-                }
-            };
-            //-----------------------------------------------------------------------------
-            //! The GPU CUDA accelerator atomic operation.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::Inc,
-                atomic::AtomicCudaBuiltIn,
-                unsigned long int,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                //
-                __device__ static auto atomicOp(
-                    atomic::AtomicCudaBuiltIn const &,
-                    unsigned long int * const addr,
-                    unsigned long int const & value)
-                -> unsigned long int
-                {
-#if UINT_MAX == ULONG_MAX // LLP64
-                    return atomicInc(
-                        reinterpret_cast<unsigned int *>(addr),
-                        static_cast<unsigned int>(value));
-#else // ULONG_MAX == ULLONG_MAX LP64
-                    alpaka::ignore_unused(addr);
-                    alpaka::ignore_unused(value);
-                    static_assert(
-                        meta::DependentFalseType<THierarchy>::value,
-                        "atomicOp<op::Inc, atomic::AtomicCudaBuiltIn, unsigned long int> is only supported when sizeof(unsigned long int) == 4");
-#endif
-                }
-            };
-
-            //-----------------------------------------------------------------------------
-            // Dec.
-
-            //-----------------------------------------------------------------------------
-            //! The GPU CUDA accelerator atomic operation.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::Dec,
-                atomic::AtomicCudaBuiltIn,
-                unsigned int,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                __device__ static auto atomicOp(
-                    atomic::AtomicCudaBuiltIn const &,
-                    unsigned int * const addr,
-                    unsigned int const & value)
-                -> unsigned int
-                {
-                    return atomicDec(addr, value);
-                }
-            };
-            //-----------------------------------------------------------------------------
-            //! The GPU CUDA accelerator atomic operation.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::Dec,
-                atomic::AtomicCudaBuiltIn,
-                unsigned long int,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                //
-                __device__ static auto atomicOp(
-                    atomic::AtomicCudaBuiltIn const &,
-                    unsigned long int * const addr,
-                    unsigned long int const & value)
-                -> unsigned long int
-                {
-#if UINT_MAX == ULONG_MAX // LLP64
-                    return atomicDec(
-                        reinterpret_cast<unsigned int *>(addr),
-                        static_cast<unsigned int>(value));
-#else // ULONG_MAX == ULLONG_MAX LP64
-                    alpaka::ignore_unused(addr);
-                    alpaka::ignore_unused(value);
-                    static_assert(
-                        meta::DependentFalseType<THierarchy>::value,
-                        "atomicOp<op::Dec, atomic::AtomicCudaBuiltIn, unsigned long int> is only supported when sizeof(unsigned long int) == 4");
-#endif
-                }
-            };
-
-            //-----------------------------------------------------------------------------
-            // And.
-
-            //-----------------------------------------------------------------------------
-            //! The GPU CUDA accelerator atomic operation.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::And,
-                atomic::AtomicCudaBuiltIn,
-                int,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                __device__ static auto atomicOp(
-                    atomic::AtomicCudaBuiltIn const &,
-                    int * const addr,
-                    int const & value)
-                -> int
-                {
-                    return atomicAnd(addr, value);
-                }
-            };
-            //-----------------------------------------------------------------------------
-            //! The GPU CUDA accelerator atomic operation.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::And,
-                atomic::AtomicCudaBuiltIn,
-                unsigned int,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                __device__ static auto atomicOp(
-                    atomic::AtomicCudaBuiltIn const &,
-                    unsigned int * const addr,
-                    unsigned int const & value)
-                -> unsigned int
-                {
-                    return atomicAnd(addr, value);
-                }
-            };
-            //-----------------------------------------------------------------------------
-            //! The GPU CUDA accelerator atomic operation.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::And,
-                atomic::AtomicCudaBuiltIn,
-                unsigned long int,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                //
-                __device__ static auto atomicOp(
-                    atomic::AtomicCudaBuiltIn const &,
-                    unsigned long int * const addr,
-                    unsigned long int const & value)
-                -> unsigned long int
-                {
-#if UINT_MAX == ULONG_MAX // LLP64
-                    return atomicAnd(
-                        reinterpret_cast<unsigned int *>(addr),
-                        static_cast<unsigned int>(value));
-#else // ULONG_MAX == ULLONG_MAX LP64
-#if BOOST_ARCH_PTX >= BOOST_VERSION_NUMBER(3, 5, 0)
-                    return atomicAnd(
-                        reinterpret_cast<unsigned long long int *>(addr),
-                        static_cast<unsigned long long int>(value));
-#else
-                    alpaka::ignore_unused(addr);
-                    alpaka::ignore_unused(value);
-                    static_assert(
-                        meta::DependentFalseType<THierarchy>::value,
-                        "atomicOp<op::And, atomic::AtomicCudaBuiltIn, unsigned long int> is only supported on sm >= 3.5");
-#endif
-#endif
-                }
-            };
-            //-----------------------------------------------------------------------------
-            //! The GPU CUDA accelerator atomic operation.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::And,
-                atomic::AtomicCudaBuiltIn,
-                unsigned long long int,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                __device__ static auto atomicOp(
-                    atomic::AtomicCudaBuiltIn const &,
-                    unsigned long long int * const addr,
-                    unsigned long long int const & value)
-                -> unsigned long long int
-                {
-#if BOOST_ARCH_PTX >= BOOST_VERSION_NUMBER(3, 5, 0)
-                    return atomicAnd(addr, value);
-#else
-                    alpaka::ignore_unused(addr);
-                    alpaka::ignore_unused(value);
-                    static_assert(
-                        meta::DependentFalseType<THierarchy>::value,
-                        "atomicOp<op::And, atomic::AtomicCudaBuiltIn, unsigned long long int> is only supported on sm >= 3.5");
-#endif
-                }
-            };
-
-            //-----------------------------------------------------------------------------
-            // Or.
-
-            //-----------------------------------------------------------------------------
-            //! The GPU CUDA accelerator atomic operation.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::Or,
-                atomic::AtomicCudaBuiltIn,
-                int,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                __device__ static auto atomicOp(
-                    atomic::AtomicCudaBuiltIn const &,
-                    int * const addr,
-                    int const & value)
-                -> int
-                {
-                    return atomicOr(addr, value);
-                }
-            };
-            //-----------------------------------------------------------------------------
-            //! The GPU CUDA accelerator atomic operation.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::Or,
-                atomic::AtomicCudaBuiltIn,
-                unsigned int,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                __device__ static auto atomicOp(
-                    atomic::AtomicCudaBuiltIn const &,
-                    unsigned int * const addr,
-                    unsigned int const & value)
-                -> unsigned int
-                {
-                    return atomicOr(addr, value);
-                }
-            };
-            //-----------------------------------------------------------------------------
-            //! The GPU CUDA accelerator atomic operation.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::Or,
-                atomic::AtomicCudaBuiltIn,
-                unsigned long int,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                //
-                __device__ static auto atomicOp(
-                    atomic::AtomicCudaBuiltIn const &,
-                    unsigned long int * const addr,
-                    unsigned long int const & value)
-                -> unsigned long int
-                {
-#if UINT_MAX == ULONG_MAX // LLP64
-                    return atomicOr(
-                        reinterpret_cast<unsigned int *>(addr),
-                        static_cast<unsigned int>(value));
-#else // ULONG_MAX == ULLONG_MAX LP64
-#if BOOST_ARCH_PTX >= BOOST_VERSION_NUMBER(3, 5, 0)
-                    return atomicOr(
-                        reinterpret_cast<unsigned long long int *>(addr),
-                        static_cast<unsigned long long int>(value));
-#else
-                    alpaka::ignore_unused(addr);
-                    alpaka::ignore_unused(value);
-                    static_assert(
-                        meta::DependentFalseType<THierarchy>::value,
-                        "atomicOp<op::Or, atomic::AtomicCudaBuiltIn, unsigned long int> is only supported on sm >= 3.5");
-#endif
-#endif
-                }
-            };
-            //-----------------------------------------------------------------------------
-            //! The GPU CUDA accelerator atomic operation.
-            template<
-               typename THierarchy>
-            struct AtomicOp<
-                op::Or,
-                atomic::AtomicCudaBuiltIn,
-                unsigned long long int,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                __device__ static auto atomicOp(
-                    atomic::AtomicCudaBuiltIn const &,
-                    unsigned long long int * const addr,
-                    unsigned long long int const & value)
-                -> unsigned long long int
-                {
-#if BOOST_ARCH_PTX >= BOOST_VERSION_NUMBER(3, 5, 0)
-                    return atomicOr(addr, value);
-#else
-                    alpaka::ignore_unused(addr);
-                    alpaka::ignore_unused(value);
-                    static_assert(
-                        meta::DependentFalseType<THierarchy>::value,
-                        "atomicOp<op::Or, atomic::AtomicCudaBuiltIn, unsigned long long int> is only supported on sm >= 3.5");
-#endif
-                }
-            };
-
-            //-----------------------------------------------------------------------------
-            // Xor.
-
-            //-----------------------------------------------------------------------------
-            //! The GPU CUDA accelerator atomic operation.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::Xor,
-                atomic::AtomicCudaBuiltIn,
-                int,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                __device__ static auto atomicOp(
-                    atomic::AtomicCudaBuiltIn const &,
-                    int * const addr,
-                    int const & value)
-                -> int
-                {
-                    return atomicXor(addr, value);
-                }
-            };
-            //-----------------------------------------------------------------------------
-            //! The GPU CUDA accelerator atomic operation.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::Xor,
-                atomic::AtomicCudaBuiltIn,
-                unsigned int,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                __device__ static auto atomicOp(
-                    atomic::AtomicCudaBuiltIn const &,
-                    unsigned int * const addr,
-                    unsigned int const & value)
-                -> unsigned int
-                {
-                    return atomicXor(addr, value);
-                }
-            };
-            //-----------------------------------------------------------------------------
-            //! The GPU CUDA accelerator atomic operation.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::Xor,
-                atomic::AtomicCudaBuiltIn,
-                unsigned long int,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                //
-                __device__ static auto atomicOp(
-                    atomic::AtomicCudaBuiltIn const &,
-                    unsigned long int * const addr,
-                    unsigned long int const & value)
-                -> unsigned long int
-                {
-#if UINT_MAX == ULONG_MAX // LLP64
-                    return atomicXor(
-                        reinterpret_cast<unsigned int *>(addr),
-                        static_cast<unsigned int>(value));
-#else // ULONG_MAX == ULLONG_MAX LP64
-#if BOOST_ARCH_PTX >= BOOST_VERSION_NUMBER(3, 5, 0)
-                    return atomicXor(
-                        reinterpret_cast<unsigned long long int *>(addr),
-                        static_cast<unsigned long long int>(value));
-#else
-                    alpaka::ignore_unused(addr);
-                    alpaka::ignore_unused(value);
-                    static_assert(
-                        meta::DependentFalseType<THierarchy>::value,
-                        "atomicOp<op::Xor, atomic::AtomicCudaBuiltIn, unsigned long int> is only supported on sm >= 3.5");
-#endif
-#endif
-                }
-            };
-            //-----------------------------------------------------------------------------
-            //! The GPU CUDA accelerator atomic operation.
-            template<
-               typename THierarchy>
-            struct AtomicOp<
-                op::Xor,
-                atomic::AtomicCudaBuiltIn,
-                unsigned long long int,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                __device__ static auto atomicOp(
-                    atomic::AtomicCudaBuiltIn const &,
-                    unsigned long long int * const addr,
-                    unsigned long long int const & value)
-                -> unsigned long long int
-                {
-#if BOOST_ARCH_PTX >= BOOST_VERSION_NUMBER(3, 5, 0)
-                    return atomicXor(addr, value);
-#else
-                    alpaka::ignore_unused(addr);
-                    alpaka::ignore_unused(value);
-                    static_assert(
-                        meta::DependentFalseType<THierarchy>::value,
-                        "atomicOp<op::Xor, atomic::AtomicCudaBuiltIn, unsigned long long int> is only supported on sm >= 3.5");
-#endif
-                }
-            };
-
-            //-----------------------------------------------------------------------------
-            // Cas.
-
-            //-----------------------------------------------------------------------------
-            //! The GPU CUDA accelerator atomic operation.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::Cas,
-                atomic::AtomicCudaBuiltIn,
-                int,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                __device__ static auto atomicOp(
-                    atomic::AtomicCudaBuiltIn const &,
-                    int * const addr,
-                    int const & compare,
-                    int const & value)
-                -> int
-                {
-                    return atomicCAS(addr, compare, value);
-                }
-            };
-            //-----------------------------------------------------------------------------
-            //! The GPU CUDA accelerator atomic operation.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::Cas,
-                atomic::AtomicCudaBuiltIn,
-                unsigned int,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                __device__ static auto atomicOp(
-                    atomic::AtomicCudaBuiltIn const &,
-                    unsigned int * const addr,
-                    unsigned int const & compare,
-                    unsigned int const & value)
-                -> unsigned int
-                {
-                    return atomicCAS(addr, compare, value);
-                }
-            };
-            //-----------------------------------------------------------------------------
-            //! The GPU CUDA accelerator atomic operation.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::Cas,
-                atomic::AtomicCudaBuiltIn,
-                unsigned long int,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                //
-                __device__ static auto atomicOp(
-                    atomic::AtomicCudaBuiltIn const &,
-                    unsigned long int * const addr,
-                    unsigned long int const & compare,
-                    unsigned long int const & value)
-                -> unsigned long int
-                {
-#if UINT_MAX == ULONG_MAX // LLP64
-                    return atomicCAS(
-                        reinterpret_cast<unsigned int *>(addr),
-                        static_cast<unsigned int>(compare),
-                        static_cast<unsigned int>(value));
-#else // ULONG_MAX == ULLONG_MAX LP64
-                    return atomicCAS(
-                        reinterpret_cast<unsigned long long int *>(addr),
-                        static_cast<unsigned long long int>(compare),
-                        static_cast<unsigned long long int>(value));
-#endif
-                }
-            };
-            //-----------------------------------------------------------------------------
-            //! The GPU CUDA accelerator atomic operation.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::Cas,
-                atomic::AtomicCudaBuiltIn,
-                unsigned long long int,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                __device__ static auto atomicOp(
-                    atomic::AtomicCudaBuiltIn const &,
-                    unsigned long long int * const addr,
-                    unsigned long long int const & compare,
-                    unsigned long long int const & value)
-                -> unsigned long long int
-                {
-                    return atomicCAS(addr, compare, value);
-                }
-            };
-
-            //#############################################################################
-            //! The GPU CUDA accelerator atomic operation.
-            template<
-                typename TOp,
-                typename T,
-                typename THierarchy>
-            struct AtomicOp<
-                TOp,
-                atomic::AtomicCudaBuiltIn,
-                T,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                __device__ static auto atomicOp(
-                    atomic::AtomicCudaBuiltIn const & atomic,
-                    T * const addr,
-                    T const & value)
-                -> T
-                {
-                    alpaka::ignore_unused(atomic);
-                    alpaka::ignore_unused(addr);
-                    alpaka::ignore_unused(value);
-                    static_assert(
-                        meta::DependentFalseType<THierarchy>::value,
-                        "atomicOp<TOp, atomic::AtomicCudaBuiltIn, T>(atomic, addr, value) is not supported!");
-
-                    return T();
-                }
-                //-----------------------------------------------------------------------------
-                __device__ static auto atomicOp(
-                    atomic::AtomicCudaBuiltIn const & atomic,
-                    T * const addr,
-                    T const & compare,
-                    T const & value)
-                -> T
-                {
-                    alpaka::ignore_unused(atomic);
-                    alpaka::ignore_unused(addr);
-                    alpaka::ignore_unused(compare);
-                    alpaka::ignore_unused(value);
-                    static_assert(
-                        meta::DependentFalseType<THierarchy>::value,
-                        "atomicOp<TOp, atomic::AtomicCudaBuiltIn, T>(atomic, addr, compare, value) is not supported!");
-
-                    return T();
-                }
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/atomic/AtomicHierarchy.hpp b/thirdParty/cupla/alpaka/include/alpaka/atomic/AtomicHierarchy.hpp
index 8f9141fb88..b1aab76954 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/atomic/AtomicHierarchy.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/atomic/AtomicHierarchy.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Benjamin Worpitz, René Widera
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -10,7 +10,6 @@
 #pragma once
 
 #include <alpaka/atomic/Traits.hpp>
-
 #include <alpaka/meta/InheritFromList.hpp>
 #include <alpaka/meta/Unique.hpp>
 
@@ -18,38 +17,24 @@
 
 namespace alpaka
 {
-    namespace atomic
-    {
-
-        //#############################################################################
-        //! build a single class to inherit from different atomic implementations
-        //
-        //  This implementation inherit from all three hierarchies.
-        //  The multiple usage of the same type for different levels is allowed.
-        //  The class provide the feature that each atomic operation can be focused
-        //  to a hierarchy level in Alpaka. A operation to a hierarchy is independent
-        //  to the memory hierarchy.
-        //
-        //  \tparam TGridAtomic atomic implementation for atomic operations between grids within a device
-        //  \tparam TBlockAtomic atomic implementation for atomic operations between blocks within a grid
-        //  \tparam TThreadAtomic atomic implementation for atomic operations between threads within a block
-        template<
-            typename TGridAtomic,
-            typename TBlockAtomic,
-            typename TThreadAtomic
-        >
-        using AtomicHierarchy
-            = alpaka::meta::InheritFromList<
-                alpaka::meta::Unique<
-                    std::tuple<
-                        TGridAtomic,
-                        TBlockAtomic,
-                        TThreadAtomic,
-                        concepts::Implements<ConceptAtomicGrids, TGridAtomic>,
-                        concepts::Implements<ConceptAtomicBlocks, TBlockAtomic>,
-                        concepts::Implements<ConceptAtomicThreads, TThreadAtomic>
-                    >
-                >
-            >;
-    }
-}
+    //#############################################################################
+    //! build a single class to inherit from different atomic implementations
+    //
+    //  This implementation inherits from all three hierarchies.
+    //  Using the same type for several levels is allowed.
+    //  The class provides the feature that each atomic operation can be focused
+    //  on a hierarchy level in alpaka. An operation on a hierarchy level is independent
+    //  of the memory hierarchy.
+    //
+    //  \tparam TGridAtomic atomic implementation for atomic operations between grids within a device
+    //  \tparam TBlockAtomic atomic implementation for atomic operations between blocks within a grid
+    //  \tparam TThreadAtomic atomic implementation for atomic operations between threads within a block
+    template<typename TGridAtomic, typename TBlockAtomic, typename TThreadAtomic>
+    using AtomicHierarchy = alpaka::meta::InheritFromList<alpaka::meta::Unique<std::tuple<
+        TGridAtomic,
+        TBlockAtomic,
+        TThreadAtomic,
+        concepts::Implements<ConceptAtomicGrids, TGridAtomic>,
+        concepts::Implements<ConceptAtomicBlocks, TBlockAtomic>,
+        concepts::Implements<ConceptAtomicThreads, TThreadAtomic>>>>;
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/include/alpaka/atomic/AtomicHipBuiltIn.hpp b/thirdParty/cupla/alpaka/include/alpaka/atomic/AtomicHipBuiltIn.hpp
deleted file mode 100644
index f2fc337b60..0000000000
--- a/thirdParty/cupla/alpaka/include/alpaka/atomic/AtomicHipBuiltIn.hpp
+++ /dev/null
@@ -1,1201 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz, Matthias Werner, René Widera
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_HIP
-    #error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
-#endif
-
-#include <alpaka/core/Unused.hpp>
-#include <alpaka/atomic/Op.hpp>
-#include <alpaka/atomic/Traits.hpp>
-#include <alpaka/meta/DependentFalseType.hpp>
-
-#include <climits>
-
-namespace alpaka
-{
-    namespace atomic
-    {
-        //#############################################################################
-        //! The GPU HIP accelerator atomic ops.
-        //
-        //  Atomics can used in the hierarchy level grids, blocks and threads.
-        //  Atomics are not guaranteed to be save between devices
-        class AtomicHipBuiltIn
-        {
-        public:
-            //-----------------------------------------------------------------------------
-            AtomicHipBuiltIn() = default;
-            //-----------------------------------------------------------------------------
-            __device__ AtomicHipBuiltIn(AtomicHipBuiltIn const &) = delete;
-            //-----------------------------------------------------------------------------
-            __device__ AtomicHipBuiltIn(AtomicHipBuiltIn &&) = delete;
-            //-----------------------------------------------------------------------------
-            __device__ auto operator=(AtomicHipBuiltIn const &) -> AtomicHipBuiltIn & = delete;
-            //-----------------------------------------------------------------------------
-            __device__ auto operator=(AtomicHipBuiltIn &&) -> AtomicHipBuiltIn & = delete;
-            //-----------------------------------------------------------------------------
-            /*virtual*/ ALPAKA_FN_HOST_ACC ~AtomicHipBuiltIn() = default;
-        };
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The specializations to execute the requested atomic ops of the HIP accelerator.
-            // See: http://docs.nvidia.com/cuda/cuda-c-programming-guide/#atomic-functions how to implement everything with CAS
-
-            //-----------------------------------------------------------------------------
-            // Add.
-
-            //-----------------------------------------------------------------------------
-            //! The GPU HIP accelerator atomic operation.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::Add,
-                atomic::AtomicHipBuiltIn,
-                int,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                __device__ static auto atomicOp(
-                    atomic::AtomicHipBuiltIn const &,
-                    int * const addr,
-                    int const & value)
-                -> int
-                {
-                    return atomicAdd(addr, value);
-                }
-            };
-            //-----------------------------------------------------------------------------
-            //! The GPU HIP accelerator atomic operation.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::Add,
-                atomic::AtomicHipBuiltIn,
-                unsigned int,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                __device__ static auto atomicOp(
-                    atomic::AtomicHipBuiltIn const &,
-                    unsigned int * const addr,
-                    unsigned int const & value)
-                -> unsigned int
-                {
-                    return atomicAdd(addr, value);
-                }
-            };
-            //-----------------------------------------------------------------------------
-            //! The GPU HIP accelerator atomic operation.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::Add,
-                atomic::AtomicHipBuiltIn,
-                unsigned long int,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                //
-                __device__ static auto atomicOp(
-                    atomic::AtomicHipBuiltIn const &,
-                    unsigned long int * const addr,
-                    unsigned long int const & value)
-                -> unsigned long int {
-#if UINT_MAX == ULONG_MAX // LLP64
-                    return atomicAdd(
-                        reinterpret_cast<unsigned int *>(addr),
-                        static_cast<unsigned int>(value));
-#else // ULONG_MAX == ULLONG_MAX LP64
-                    return atomicAdd(
-                        reinterpret_cast<unsigned long long int *>(addr),
-                        static_cast<unsigned long long int>(value));
-#endif
-                }
-            };
-            //-----------------------------------------------------------------------------
-            //! The GPU HIP accelerator atomic operation.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::Add,
-                atomic::AtomicHipBuiltIn,
-                unsigned long long int,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                __device__ static auto atomicOp(
-                    atomic::AtomicHipBuiltIn const &,
-                    unsigned long long int * const addr,
-                    unsigned long long int const & value)
-                -> unsigned long long int
-                {
-                    return atomicAdd(addr, value);
-                }
-            };
-            //-----------------------------------------------------------------------------
-            //! The GPU HIP accelerator atomic operation.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::Add,
-                atomic::AtomicHipBuiltIn,
-                float,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                __device__ static auto atomicOp(
-                    atomic::AtomicHipBuiltIn const &,
-                    float * const addr,
-                    float const & value)
-                -> float
-                {
-                    return atomicAdd(addr, value);
-                }
-            };
-            //-----------------------------------------------------------------------------
-            //! The GPU HIP accelerator atomic operation.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::Add,
-                atomic::AtomicHipBuiltIn,
-                double,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                __device__ static auto atomicOp(
-                    atomic::AtomicHipBuiltIn const &,
-                    double * const addr,
-                    double const & value)
-                -> double
-                {
-#if BOOST_ARCH_PTX >= BOOST_VERSION_NUMBER(6,0,0)
-                    return atomicAdd(addr, value);
-#else
-                    // Code from: http://docs.nvidia.com/cuda/cuda-c-programming-guide/#atomic-functions
-
-                    unsigned long long int * address_as_ull(reinterpret_cast<unsigned long long int *>(addr));
-                    unsigned long long int old(*address_as_ull);
-                    unsigned long long int assumed;
-                    do
-                    {
-                        assumed = old;
-                        old = atomicCAS(
-                            address_as_ull,
-                            assumed,
-                            static_cast<unsigned long long>(__double_as_longlong(value + __longlong_as_double(static_cast<long long>(assumed)))));
-                        // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
-                    }
-                    while(assumed != old);
-                    return __longlong_as_double(static_cast<long long>(old));
-#endif
-
-                }
-            };
-            //-----------------------------------------------------------------------------
-            // Sub.
-
-            //-----------------------------------------------------------------------------
-            //! The GPU HIP accelerator atomic operation.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::Sub,
-                atomic::AtomicHipBuiltIn,
-                int,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-
-                __device__ static auto atomicOp(
-                    atomic::AtomicHipBuiltIn const &,
-                    int * const addr,
-                    int const & value)
-                -> int
-                {
-                    return atomicSub(addr, value);
-                }
-            };
-            //-----------------------------------------------------------------------------
-            //! The GPU HIP accelerator atomic operation.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::Sub,
-                atomic::AtomicHipBuiltIn,
-                unsigned int,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                __device__ static auto atomicOp(
-                    atomic::AtomicHipBuiltIn const &,
-                    unsigned int * const addr,
-                    unsigned int const & value)
-                -> unsigned int
-                {
-                    return atomicSub(addr, value);
-                }
-            };
-
-            //-----------------------------------------------------------------------------
-            //! The GPU HIP accelerator atomic operation.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::Sub,
-                atomic::AtomicHipBuiltIn,
-                unsigned long int,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                //
-                __device__ static auto atomicOp(
-                    atomic::AtomicHipBuiltIn const &,
-                    unsigned long int * const addr,
-                    unsigned long int const & value)
-                -> unsigned long int {
-#if UINT_MAX == ULONG_MAX // LLP64
-                    return atomicSub(
-                        reinterpret_cast<unsigned int *>(addr),
-                        static_cast<unsigned int>(value));
-#else // ULONG_MAX == ULLONG_MAX LP64
-                    alpaka::ignore_unused(addr);
-                    alpaka::ignore_unused(value);
-                    static_assert(
-                        meta::DependentFalseType<THierarchy>::value,
-                        "atomicOp<op::Sub, atomic::AtomicHipBuiltIn, unsigned long int> is only supported when sizeof(unsigned long int) == 4");
-#endif
-                }
-            };
-
-            //-----------------------------------------------------------------------------
-            // Min.
-
-            //-----------------------------------------------------------------------------
-            //! The GPU HIP accelerator atomic operation.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::Min,
-                atomic::AtomicHipBuiltIn,
-                int,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                __device__ static auto atomicOp(
-                    atomic::AtomicHipBuiltIn const &,
-                    int * const addr,
-                    int const & value)
-                -> int
-                {
-                    return atomicMin(addr, value);
-                }
-            };
-            //-----------------------------------------------------------------------------
-            //! The GPU HIP accelerator atomic operation.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::Min,
-                atomic::AtomicHipBuiltIn,
-                unsigned int,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-
-                __device__ static auto atomicOp(
-                    atomic::AtomicHipBuiltIn const &,
-                    unsigned int * const addr,
-                    unsigned int const & value)
-                -> unsigned int
-                {
-                    return atomicMin(addr, value);
-                }
-            };
-            //-----------------------------------------------------------------------------
-            //! The GPU HIP accelerator atomic operation.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::Min,
-                atomic::AtomicHipBuiltIn,
-                unsigned long int,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                //
-                __device__ static auto atomicOp(
-                    atomic::AtomicHipBuiltIn const &,
-                    unsigned long int * const addr,
-                    unsigned long int const & value)
-                -> unsigned long int {
-#if UINT_MAX == ULONG_MAX // LLP64
-                    return atomicMin(
-                        reinterpret_cast<unsigned int *>(addr),
-                        static_cast<unsigned int>(value));
-#else // ULONG_MAX == ULLONG_MAX LP64
-# if BOOST_ARCH_PTX >= BOOST_VERSION_NUMBER(3, 5, 0)
-                    return atomicMin(
-                        reinterpret_cast<unsigned long long int *>(addr),
-                        static_cast<unsigned long long int>(value));
-# else
-                    alpaka::ignore_unused(addr);
-                    alpaka::ignore_unused(value);
-                    static_assert(
-                        meta::DependentFalseType<THierarchy>::value,
-                        "atomicOp<op::Min, atomic::AtomicHipBuiltIn, unsigned long int> is only supported on sm >= 3.5");
-# endif
-#endif
-                }
-            };
-            //-----------------------------------------------------------------------------
-            //! The GPU HIP accelerator atomic operation.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::Min,
-                atomic::AtomicHipBuiltIn,
-                unsigned long long int,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                __device__ static auto atomicOp(
-                    atomic::AtomicHipBuiltIn const &,
-                    unsigned long long int * const addr,
-                    unsigned long long int const & value)
-                -> unsigned long long int
-                {
-#if BOOST_ARCH_PTX >= BOOST_VERSION_NUMBER(3, 5, 0)
-                    return atomicMin(addr, value);
-#else
-                    alpaka::ignore_unused(addr);
-                    alpaka::ignore_unused(value);
-                    static_assert(
-                        meta::DependentFalseType<THierarchy>::value,
-                        "atomicOp<op::Min, atomic::AtomicHipBuiltIn, unsigned long long int> is only supported on sm >= 3.5");
-#endif
-                }
-            };
-            //-----------------------------------------------------------------------------
-            // Max.
-
-            //-----------------------------------------------------------------------------
-            //! The GPU HIP accelerator atomic operation.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::Max,
-                atomic::AtomicHipBuiltIn,
-                int,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                __device__ static auto atomicOp(
-                    atomic::AtomicHipBuiltIn const &,
-                    int * const addr,
-                    int const & value)
-                -> int
-                {
-                    return atomicMax(addr, value);
-                }
-            };
-            //-----------------------------------------------------------------------------
-            //! The GPU HIP accelerator atomic operation.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::Max,
-                atomic::AtomicHipBuiltIn,
-                unsigned int,
-                THierarchy>
-            {
-                __device__ static auto atomicOp(
-                    atomic::AtomicHipBuiltIn const &,
-                    unsigned int * const addr,
-                    unsigned int const & value)
-                -> unsigned int
-                {
-                    return atomicMax(addr, value);
-                }
-            };
-            //-----------------------------------------------------------------------------
-            //! The GPU HIP accelerator atomic operation.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::Max,
-                atomic::AtomicHipBuiltIn,
-                unsigned long int,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                //
-                __device__ static auto atomicOp(
-                    atomic::AtomicHipBuiltIn const &,
-                    unsigned long int * const addr,
-                    unsigned long int const & value)
-                -> unsigned long int
-                {
-#if UINT_MAX == ULONG_MAX // LLP64
-                    return atomicMax(
-                        reinterpret_cast<unsigned int *>(addr),
-                        static_cast<unsigned int>(value));
-#else // ULONG_MAX == ULLONG_MAX LP64
-# if BOOST_ARCH_PTX >= BOOST_VERSION_NUMBER(3, 5, 0)
-                    return atomicMax(
-                        reinterpret_cast<unsigned long long int *>(addr),
-                        static_cast<unsigned long long int>(value));
-# else
-                    alpaka::ignore_unused(addr);
-                    alpaka::ignore_unused(value);
-                    static_assert(
-                        meta::DependentFalseType<THierarchy>::value,
-                        "atomicOp<op::Max, atomic::AtomicHipBuiltIn, unsigned long int> is only supported on sm >= 3.5");
-# endif
-#endif
-                }
-            };
-          //-----------------------------------------------------------------------------
-          //! The GPU HIP accelerator atomic operation.
-          template<
-            typename THierarchy>
-          struct AtomicOp<
-            op::Max,
-            atomic::AtomicHipBuiltIn,
-            unsigned long long int,
-            THierarchy>
-          {
-            //-----------------------------------------------------------------------------
-            __device__ static auto atomicOp(
-              atomic::AtomicHipBuiltIn const &,
-              unsigned long long int * const addr,
-              unsigned long long int const & value)
-              -> unsigned long long int
-              {
-#if BOOST_ARCH_PTX >= BOOST_VERSION_NUMBER(3, 5, 0)
-                return atomicMax(addr, value);
-#else
-                alpaka::ignore_unused(addr);
-                alpaka::ignore_unused(value);
-                static_assert(
-                  meta::DependentFalseType<THierarchy>::value,
-                  "atomicOp<op::Max, atomic::AtomicHipBuiltIn, unsigned long long int> is only supported on sm >= 3.5");
-#endif
-                    }
-            };
-
-            //-----------------------------------------------------------------------------
-            // Exch.
-
-            //-----------------------------------------------------------------------------
-            //! The GPU HIP accelerator atomic operation.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::Exch,
-                atomic::AtomicHipBuiltIn,
-                int,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                __device__ static auto atomicOp(
-                    atomic::AtomicHipBuiltIn const &,
-                    int * const addr,
-                    int const & value)
-                -> int
-                {
-                    return atomicExch(addr, value);
-                }
-            };
-            //-----------------------------------------------------------------------------
-            //! The GPU HIP accelerator atomic operation.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::Exch,
-                atomic::AtomicHipBuiltIn,
-                unsigned int,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                __device__ static auto atomicOp(
-                    atomic::AtomicHipBuiltIn const &,
-                    unsigned int * const addr,
-                    unsigned int const & value)
-                -> unsigned int
-                {
-                    return atomicExch(addr, value);
-                }
-            };
-            //-----------------------------------------------------------------------------
-            //! The GPU HIP accelerator atomic operation.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::Exch,
-                atomic::AtomicHipBuiltIn,
-                unsigned long int,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                //
-                __device__ static auto atomicOp(
-                    atomic::AtomicHipBuiltIn const &,
-                    unsigned long int * const addr,
-                    unsigned long int const & value)
-                -> unsigned long int
-                {
-#if UINT_MAX == ULONG_MAX // LLP64
-                    return atomicExch(
-                        reinterpret_cast<unsigned int *>(addr),
-                        static_cast<unsigned int>(value));
-#else // ULONG_MAX == ULLONG_MAX LP64
-                    return atomicExch(
-                        reinterpret_cast<unsigned long long int *>(addr),
-                        static_cast<unsigned long long int>(value));
-#endif
-                }
-            };
-            //-----------------------------------------------------------------------------
-            //! The GPU HIP accelerator atomic operation.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::Exch,
-                atomic::AtomicHipBuiltIn,
-                unsigned long long int,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                __device__ static auto atomicOp(
-                    atomic::AtomicHipBuiltIn const &,
-                    unsigned long long int * const addr,
-                    unsigned long long int const & value)
-                -> unsigned long long int
-                {
-                    return atomicExch(addr, value);
-                }
-            };
-            //-----------------------------------------------------------------------------
-            //! The GPU HIP accelerator atomic operation.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::Exch,
-                atomic::AtomicHipBuiltIn,
-                float,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                __device__ static auto atomicOp(
-                    atomic::AtomicHipBuiltIn const &,
-                    float * const addr,
-                    float const & value)
-                -> float
-                {
-                    return atomicExch(addr, value);
-                }
-            };
-
-            //-----------------------------------------------------------------------------
-            // Inc.
-
-            //-----------------------------------------------------------------------------
-            //! The GPU HIP accelerator atomic operation.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::Inc,
-                atomic::AtomicHipBuiltIn,
-                unsigned int,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                __device__ static auto atomicOp(
-                    atomic::AtomicHipBuiltIn const &,
-                    unsigned int * const addr,
-                    unsigned int const & value)
-                -> unsigned int
-                {
-                    return atomicInc(addr, value);
-                }
-            };
-            //-----------------------------------------------------------------------------
-            //! The GPU HIP accelerator atomic operation.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::Inc,
-                atomic::AtomicHipBuiltIn,
-                unsigned long int,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                //
-                __device__ static auto atomicOp(
-                    atomic::AtomicHipBuiltIn const &,
-                    unsigned long int * const addr,
-                    unsigned long int const & value)
-                -> unsigned long int
-                {
-#if UINT_MAX == ULONG_MAX // LLP64
-                    return atomicInc(
-                        reinterpret_cast<unsigned int *>(addr),
-                        static_cast<unsigned int>(value));
-#else // ULONG_MAX == ULLONG_MAX LP64
-                    alpaka::ignore_unused(addr);
-                    alpaka::ignore_unused(value);
-                    static_assert(
-                        meta::DependentFalseType<THierarchy>::value,
-                        "atomicOp<op::Inc, atomic::AtomicHipBuiltIn, unsigned long int> is only supported when sizeof(unsigned long int) == 4");
-#endif
-                }
-            };
-
-            //-----------------------------------------------------------------------------
-            // Dec.
-
-            //-----------------------------------------------------------------------------
-            //! The GPU HIP accelerator atomic operation.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::Dec,
-                atomic::AtomicHipBuiltIn,
-                unsigned int,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                __device__ static auto atomicOp(
-                    atomic::AtomicHipBuiltIn const &,
-                    unsigned int * const addr,
-                    unsigned int const & value)
-                -> unsigned int
-                {
-                    return atomicDec(addr, value);
-                }
-            };
-            //-----------------------------------------------------------------------------
-            //! The GPU HIP accelerator atomic operation.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::Dec,
-                atomic::AtomicHipBuiltIn,
-                unsigned long int,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                //
-                __device__ static auto atomicOp(
-                    atomic::AtomicHipBuiltIn const &,
-                    unsigned long int * const addr,
-                    unsigned long int const & value)
-                -> unsigned long int
-                {
-#if UINT_MAX == ULONG_MAX // LLP64
-                    return atomicDec(
-                        reinterpret_cast<unsigned int *>(addr),
-                        static_cast<unsigned int>(value));
-#else // ULONG_MAX == ULLONG_MAX LP64
-                    alpaka::ignore_unused(addr);
-                    alpaka::ignore_unused(value);
-                    static_assert(
-                        meta::DependentFalseType<THierarchy>::value,
-                        "atomicOp<op::Dec, atomic::AtomicHipBuiltIn, unsigned long int> is only supported when sizeof(unsigned long int) == 4");
-#endif
-                }
-            };
-
-            //-----------------------------------------------------------------------------
-            // And.
-
-            //-----------------------------------------------------------------------------
-            //! The GPU HIP accelerator atomic operation.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::And,
-                atomic::AtomicHipBuiltIn,
-                int,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                __device__ static auto atomicOp(
-                    atomic::AtomicHipBuiltIn const &,
-                    int * const addr,
-                    int const & value)
-                -> int
-                {
-                    return atomicAnd(addr, value);
-                }
-            };
-            //-----------------------------------------------------------------------------
-            //! The GPU HIP accelerator atomic operation.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::And,
-                atomic::AtomicHipBuiltIn,
-                unsigned int,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                __device__ static auto atomicOp(
-                    atomic::AtomicHipBuiltIn const &,
-                    unsigned int * const addr,
-                    unsigned int const & value)
-                -> unsigned int
-                {
-                    return atomicAnd(addr, value);
-                }
-            };
-            //-----------------------------------------------------------------------------
-            //! The GPU HIP accelerator atomic operation.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::And,
-                atomic::AtomicHipBuiltIn,
-                unsigned long int,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                //
-                __device__ static auto atomicOp(
-                    atomic::AtomicHipBuiltIn const &,
-                    unsigned long int * const addr,
-                    unsigned long int const & value)
-                -> unsigned long int
-                {
-#if UINT_MAX == ULONG_MAX // LLP64
-                    return atomicAnd(
-                        reinterpret_cast<unsigned int *>(addr),
-                        static_cast<unsigned int>(value));
-#else // ULONG_MAX == ULLONG_MAX LP64
-# if BOOST_ARCH_PTX >= BOOST_VERSION_NUMBER(3, 5, 0)
-                    return atomicAnd(
-                        reinterpret_cast<unsigned long long int *>(addr),
-                        static_cast<unsigned long long int>(value));
-# else
-                    alpaka::ignore_unused(addr);
-                    alpaka::ignore_unused(value);
-                    static_assert(
-                        meta::DependentFalseType<THierarchy>::value,
-                        "atomicOp<op::And, atomic::AtomicHipBuiltIn, unsigned long int> is only supported on sm >= 3.5");
-# endif
-#endif
-                }
-            };
-            //-----------------------------------------------------------------------------
-            //! The GPU HIP accelerator atomic operation.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::And,
-                atomic::AtomicHipBuiltIn,
-                unsigned long long int,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                __device__ static auto atomicOp(
-                    atomic::AtomicHipBuiltIn const &,
-                    unsigned long long int * const addr,
-                    unsigned long long int const & value)
-                -> unsigned long long int
-                {
-#if BOOST_ARCH_PTX >= BOOST_VERSION_NUMBER(3, 5, 0)
-                    return atomicAnd(addr, value);
-#else
-                    alpaka::ignore_unused(addr);
-                    alpaka::ignore_unused(value);
-                    static_assert(
-                        meta::DependentFalseType<THierarchy>::value,
-                        "atomicOp<op::And, atomic::AtomicHipBuiltIn, unsigned long long int> is only supported on sm >= 3.5");
-#endif
-                }
-            };
-            //-----------------------------------------------------------------------------
-            // Or.
-
-            //-----------------------------------------------------------------------------
-            //! The GPU HIP accelerator atomic operation.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::Or,
-                atomic::AtomicHipBuiltIn,
-                int,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                __device__ static auto atomicOp(
-                    atomic::AtomicHipBuiltIn const &,
-                    int * const addr,
-                    int const & value)
-                -> int
-                {
-                    return atomicOr(addr, value);
-                }
-            };
-            //-----------------------------------------------------------------------------
-            //! The GPU HIP accelerator atomic operation.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::Or,
-                atomic::AtomicHipBuiltIn,
-                unsigned int,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                __device__ static auto atomicOp(
-                    atomic::AtomicHipBuiltIn const &,
-                    unsigned int * const addr,
-                    unsigned int const & value)
-                -> unsigned int
-                {
-                    return atomicOr(addr, value);
-                }
-            };
-            //-----------------------------------------------------------------------------
-            //! The GPU HIP accelerator atomic operation.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::Or,
-                atomic::AtomicHipBuiltIn,
-                unsigned long int,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                //
-                __device__ static auto atomicOp(
-                    atomic::AtomicHipBuiltIn const &,
-                    unsigned long int * const addr,
-                    unsigned long int const & value)
-                -> unsigned long int
-                {
-#if UINT_MAX == ULONG_MAX // LLP64
-                    return atomicOr(
-                        reinterpret_cast<unsigned int *>(addr),
-                        static_cast<unsigned int>(value));
-#else // ULONG_MAX == ULLONG_MAX LP64
-# if BOOST_ARCH_PTX >= BOOST_VERSION_NUMBER(3, 5, 0)
-                    return atomicOr(
-                        reinterpret_cast<unsigned long long int *>(addr),
-                        static_cast<unsigned long long int>(value));
-# else
-                    alpaka::ignore_unused(addr);
-                    alpaka::ignore_unused(value);
-                    static_assert(
-                        meta::DependentFalseType<THierarchy>::value,
-                        "atomicOp<op::Or, atomic::AtomicHipBuiltIn, unsigned long int> is only supported on sm >= 3.5");
-# endif
-#endif
-                }
-            };
-            //-----------------------------------------------------------------------------
-            //! The GPU HIP accelerator atomic operation.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::Or,
-                atomic::AtomicHipBuiltIn,
-                unsigned long long int,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                __device__ static auto atomicOp(
-                    atomic::AtomicHipBuiltIn const &,
-                    unsigned long long int * const addr,
-                    unsigned long long int const & value)
-                -> unsigned long long int
-                {
-#if BOOST_ARCH_PTX >= BOOST_VERSION_NUMBER(3, 5, 0)
-                    return atomicOr(addr, value);
-#else
-                    alpaka::ignore_unused(addr);
-                    alpaka::ignore_unused(value);
-                    static_assert(
-                        meta::DependentFalseType<THierarchy>::value,
-                        "atomicOp<op::Or, atomic::AtomicHipBuiltIn, unsigned long long int> is only supported on sm >= 3.5");
-#endif
-                }
-            };
-            //-----------------------------------------------------------------------------
-            // Xor.
-
-            //-----------------------------------------------------------------------------
-            //! The GPU HIP accelerator atomic operation.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::Xor,
-                atomic::AtomicHipBuiltIn,
-                int,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                __device__ static auto atomicOp(
-                    atomic::AtomicHipBuiltIn const &,
-                    int * const addr,
-                    int const & value)
-                -> int
-                {
-                    return atomicXor(addr, value);
-                }
-            };
-            //-----------------------------------------------------------------------------
-            //! The GPU HIP accelerator atomic operation.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::Xor,
-                atomic::AtomicHipBuiltIn,
-                unsigned int,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                __device__ static auto atomicOp(
-                    atomic::AtomicHipBuiltIn const &,
-                    unsigned int * const addr,
-                    unsigned int const & value)
-                -> unsigned int
-                {
-                    return atomicXor(addr, value);
-                }
-            };
-            //-----------------------------------------------------------------------------
-            //! The GPU HIP accelerator atomic operation.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::Xor,
-                atomic::AtomicHipBuiltIn,
-                unsigned long int,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                //
-                __device__ static auto atomicOp(
-                    atomic::AtomicHipBuiltIn const &,
-                    unsigned long int * const addr,
-                    unsigned long int const & value)
-                -> unsigned long int
-                {
-#if UINT_MAX == ULONG_MAX // LLP64
-                    return atomicXor(
-                        reinterpret_cast<unsigned int *>(addr),
-                        static_cast<unsigned int>(value));
-#else // ULONG_MAX == ULLONG_MAX LP64
-# if BOOST_ARCH_PTX >= BOOST_VERSION_NUMBER(3, 5, 0)
-                    return atomicXor(
-                        reinterpret_cast<unsigned long long int *>(addr),
-                        static_cast<unsigned long long int>(value));
-# else
-                    alpaka::ignore_unused(addr);
-                    alpaka::ignore_unused(value);
-                    static_assert(
-                        meta::DependentFalseType<THierarchy>::value,
-                        "atomicOp<op::Xor, atomic::AtomicHipBuiltIn, unsigned long int> is only supported on sm >= 3.5");
-# endif
-#endif
-                }
-            };
-            //-----------------------------------------------------------------------------
-            //! The GPU HIP accelerator atomic operation.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::Xor,
-                atomic::AtomicHipBuiltIn,
-                unsigned long long int,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                __device__ static auto atomicOp(
-                    atomic::AtomicHipBuiltIn const &,
-                    unsigned long long int * const addr,
-                    unsigned long long int const & value)
-                -> unsigned long long int
-                {
-#if BOOST_ARCH_PTX >= BOOST_VERSION_NUMBER(3, 5, 0)
-                    return atomicXor(addr, value);
-#else
-                    alpaka::ignore_unused(addr);
-                    alpaka::ignore_unused(value);
-                    static_assert(
-                        meta::DependentFalseType<THierarchy>::value,
-                        "atomicOp<op::Xor, atomic::AtomicHipBuiltIn, unsigned long long int> is only supported on sm >= 3.5");
-#endif
-                }
-            };
-
-            //-----------------------------------------------------------------------------
-            // Cas.
-
-            //-----------------------------------------------------------------------------
-            //! The GPU HIP accelerator atomic operation.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::Cas,
-                atomic::AtomicHipBuiltIn,
-                int,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                __device__ static auto atomicOp(
-                    atomic::AtomicHipBuiltIn const &,
-                    int * const addr,
-                    int const & compare,
-                    int const & value)
-                -> int
-                {
-                    return atomicCAS(addr, compare, value);
-                }
-            };
-            //-----------------------------------------------------------------------------
-            //! The GPU HIP accelerator atomic operation.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::Cas,
-                atomic::AtomicHipBuiltIn,
-                unsigned int,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                __device__ static auto atomicOp(
-                    atomic::AtomicHipBuiltIn const &,
-                    unsigned int * const addr,
-                    unsigned int const & compare,
-                    unsigned int const & value)
-                -> unsigned int
-                {
-                    return atomicCAS(addr, compare, value);
-                }
-            };
-            //-----------------------------------------------------------------------------
-            //! The GPU HIP accelerator atomic operation.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::Cas,
-                atomic::AtomicHipBuiltIn,
-                unsigned long int,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                //
-                __device__ static auto atomicOp(
-                    atomic::AtomicHipBuiltIn const &,
-                    unsigned long int * const addr,
-                    unsigned long int const & compare,
-                    unsigned long int const & value)
-                -> unsigned long int
-                {
-#if UINT_MAX == ULONG_MAX // LLP64
-                    return atomicCAS(
-                        reinterpret_cast<unsigned int *>(addr),
-                        static_cast<unsigned int>(compare),
-                        static_cast<unsigned int>(value));
-#else // ULONG_MAX == ULLONG_MAX LP64
-                    return atomicCAS(
-                        reinterpret_cast<unsigned long long int *>(addr),
-                        static_cast<unsigned long long int>(compare),
-                        static_cast<unsigned long long int>(value));
-#endif
-                }
-            };
-
-            //-----------------------------------------------------------------------------
-            //! The GPU HIP accelerator atomic operation.
-            template<
-                typename THierarchy>
-            struct AtomicOp<
-                op::Cas,
-                atomic::AtomicHipBuiltIn,
-                unsigned long long int,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                __device__ static auto atomicOp(
-                    atomic::AtomicHipBuiltIn const &,
-                    unsigned long long int * const addr,
-                    unsigned long long int const & compare,
-                    unsigned long long int const & value)
-                -> unsigned long long int
-                {
-                    return atomicCAS(addr, compare, value);
-                }
-            };
-
-            //#############################################################################
-            //! The GPU HIP accelerator atomic operation.
-            template<
-                typename TOp,
-                typename T,
-                typename THierarchy>
-            struct AtomicOp<
-                TOp,
-                atomic::AtomicHipBuiltIn,
-                T,
-                THierarchy>
-            {
-                //-----------------------------------------------------------------------------
-                __device__ static auto atomicOp(
-                    atomic::AtomicHipBuiltIn const & atomic,
-                    T * const addr,
-                    T const & value)
-                -> T
-                {
-                    alpaka::ignore_unused(atomic);
-                    alpaka::ignore_unused(addr);
-                    alpaka::ignore_unused(value);
-                    static_assert(
-                        meta::DependentFalseType<THierarchy>::value,
-                        "atomicOp<TOp, atomic::AtomicHipBuiltIn, T>(atomic, addr, value) is not supported!");
-
-                    return T();
-                }
-                //-----------------------------------------------------------------------------
-                __device__ static auto atomicOp(
-                    atomic::AtomicHipBuiltIn const & atomic,
-                    T * const addr,
-                    T const & compare,
-                    T const & value)
-                -> T
-                {
-                    alpaka::ignore_unused(atomic);
-                    alpaka::ignore_unused(addr);
-                    alpaka::ignore_unused(compare);
-                    alpaka::ignore_unused(value);
-                    static_assert(
-                        meta::DependentFalseType<THierarchy>::value,
-                        "atomicOp<TOp, atomic::AtomicHipBuiltIn, T>(atomic, addr, compare, value) is not supported!");
-
-                    return T();
-                }
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/atomic/AtomicNoOp.hpp b/thirdParty/cupla/alpaka/include/alpaka/atomic/AtomicNoOp.hpp
index 997b98229b..4be37bcbfc 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/atomic/AtomicNoOp.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/atomic/AtomicNoOp.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Axel Huebl, Benjamin Worpitz, Matthias Werner, René Widera
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -10,68 +10,52 @@
 #pragma once
 
 #include <alpaka/atomic/Traits.hpp>
-
 #include <alpaka/core/Unused.hpp>
 
 namespace alpaka
 {
-    namespace atomic
+    //#############################################################################
+    //! The CPU fibers accelerator atomic ops.
+    class AtomicNoOp
+    {
+    public:
+        //-----------------------------------------------------------------------------
+        AtomicNoOp() = default;
+        //-----------------------------------------------------------------------------
+        AtomicNoOp(AtomicNoOp const&) = delete;
+        //-----------------------------------------------------------------------------
+        AtomicNoOp(AtomicNoOp&&) = delete;
+        //-----------------------------------------------------------------------------
+        auto operator=(AtomicNoOp const&) -> AtomicNoOp& = delete;
+        //-----------------------------------------------------------------------------
+        auto operator=(AtomicNoOp&&) -> AtomicNoOp& = delete;
+        //-----------------------------------------------------------------------------
+        /*virtual*/ ~AtomicNoOp() = default;
+    };
+
+    namespace traits
     {
         //#############################################################################
-        //! The CPU fibers accelerator atomic ops.
-        class AtomicNoOp
+        //! The CPU fibers accelerator atomic operation.
+        template<typename TOp, typename T, typename THierarchy>
+        struct AtomicOp<TOp, AtomicNoOp, T, THierarchy>
         {
-        public:
-            //-----------------------------------------------------------------------------
-            AtomicNoOp() = default;
-            //-----------------------------------------------------------------------------
-            AtomicNoOp(AtomicNoOp const &) = delete;
-            //-----------------------------------------------------------------------------
-            AtomicNoOp(AtomicNoOp &&) = delete;
             //-----------------------------------------------------------------------------
-            auto operator=(AtomicNoOp const &) -> AtomicNoOp & = delete;
-            //-----------------------------------------------------------------------------
-            auto operator=(AtomicNoOp &&) -> AtomicNoOp & = delete;
+            ALPAKA_FN_HOST static auto atomicOp(AtomicNoOp const& atomic, T* const addr, T const& value) -> T
+            {
+                alpaka::ignore_unused(atomic);
+                return TOp()(addr, value);
+            }
             //-----------------------------------------------------------------------------
-            /*virtual*/ ~AtomicNoOp() = default;
-        };
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The CPU fibers accelerator atomic operation.
-            template<
-                typename TOp,
-                typename T,
-                typename THierarchy>
-            struct AtomicOp<
-                TOp,
-                atomic::AtomicNoOp,
-                T,
-                THierarchy>
+            ALPAKA_FN_HOST static auto atomicOp(
+                AtomicNoOp const& atomic,
+                T* const addr,
+                T const& compare,
+                T const& value) -> T
             {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto atomicOp(
-                    atomic::AtomicNoOp const & atomic,
-                    T * const addr,
-                    T const & value)
-                -> T
-                {
-                    alpaka::ignore_unused(atomic);
-                    return TOp()(addr, value);
-                }
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto atomicOp(
-                    atomic::AtomicNoOp const & atomic,
-                    T * const addr,
-                    T const & compare,
-                    T const & value)
-                -> T
-                {
-                    alpaka::ignore_unused(atomic);
-                    return TOp()(addr, compare, value);
-                }
-            };
-        }
-    }
-}
+                alpaka::ignore_unused(atomic);
+                return TOp()(addr, compare, value);
+            }
+        };
+    } // namespace traits
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/include/alpaka/atomic/AtomicOaccBuiltIn.hpp b/thirdParty/cupla/alpaka/include/alpaka/atomic/AtomicOaccBuiltIn.hpp
new file mode 100644
index 0000000000..2569bc2a56
--- /dev/null
+++ b/thirdParty/cupla/alpaka/include/alpaka/atomic/AtomicOaccBuiltIn.hpp
@@ -0,0 +1,282 @@
+/* Copyright 2020 Jeffrey Kelling
+ *
+ * This file is part of alpaka.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#pragma once
+
+#ifdef ALPAKA_ACC_ANY_BT_OACC_ENABLED
+
+#    if _OPENACC < 201306
+#        error If ALPAKA_ACC_ANY_BT_OACC_ENABLED is set, the compiler has to support OpenACC 2.0 or higher!
+#    endif
+
+#    include <alpaka/atomic/Op.hpp>
+#    include <alpaka/atomic/Traits.hpp>
+
+namespace alpaka
+{
+    //#############################################################################
+    //! The OpenACC accelerator's atomic ops.
+    //  Default-constructible only: copy/move construction and copy/move assignment are deleted.
+    //  Atomics can be used in the blocks and threads hierarchy levels.
+    //  Atomics are not guaranteed to be safe between devices or grids.
+    class AtomicOaccBuiltIn
+    {
+    public:
+        //-----------------------------------------------------------------------------
+        AtomicOaccBuiltIn() = default;
+        //-----------------------------------------------------------------------------
+        ALPAKA_FN_HOST_ACC AtomicOaccBuiltIn(AtomicOaccBuiltIn const&) = delete;
+        //-----------------------------------------------------------------------------
+        ALPAKA_FN_HOST_ACC AtomicOaccBuiltIn(AtomicOaccBuiltIn&&) = delete;
+        //-----------------------------------------------------------------------------
+        ALPAKA_FN_HOST_ACC auto operator=(AtomicOaccBuiltIn const&) -> AtomicOaccBuiltIn& = delete;
+        //-----------------------------------------------------------------------------
+        ALPAKA_FN_HOST_ACC auto operator=(AtomicOaccBuiltIn&&) -> AtomicOaccBuiltIn& = delete;
+        //-----------------------------------------------------------------------------
+        ~AtomicOaccBuiltIn() = default;
+    };
+
+    namespace traits
+    {
+        // "acc atomic update capture" is not supported before OpenACC 2.5 or by PGI.
+        // "acc atomic capture {}" works for PGI and GCC, so it is used here even though it is non-standard.
+        // #if _OPENACC >= 201510
+
+        //#############################################################################
+        //! The OpenACC accelerator's atomic operation: ADD (adds value to *addr, returns the previous *addr)
+        template<typename T, typename THierarchy>
+        struct AtomicOp<AtomicAdd, AtomicOaccBuiltIn, T, THierarchy>
+        {
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST_ACC static auto atomicOp(AtomicOaccBuiltIn const&, T* const addr, T const& value) -> T
+            {
+                T old;
+                auto& ref(*addr);
+// atomically update ref, but capture the original value in old
+#    pragma acc atomic capture
+                {
+                    old = ref;
+                    ref += value;
+                }
+                return old;
+            }
+        };
+
+        //#############################################################################
+        //! The OpenACC accelerator's atomic operation: SUB (subtracts value from *addr, returns the previous *addr)
+        template<typename T, typename THierarchy>
+        struct AtomicOp<AtomicSub, AtomicOaccBuiltIn, T, THierarchy>
+        {
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST_ACC static auto atomicOp(AtomicOaccBuiltIn const&, T* const addr, T const& value) -> T
+            {
+                T old;
+                auto& ref(*addr);
+// atomically update ref, but capture the original value in old
+#    pragma acc atomic capture
+                {
+                    old = ref;
+                    ref -= value;
+                }
+                return old;
+            }
+        };
+
+        //#############################################################################
+        //! The OpenACC accelerator's atomic operation: EXCH (stores value in *addr, returns the previous *addr)
+        template<typename T, typename THierarchy>
+        struct AtomicOp<AtomicExch, AtomicOaccBuiltIn, T, THierarchy>
+        {
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST_ACC static auto atomicOp(AtomicOaccBuiltIn const&, T* const addr, T const& value) -> T
+            {
+                T old;
+                auto& ref(*addr);
+                // NOTE(review): BoostPredef.hpp is not included here; confirm BOOST_COMP_PGI is defined
+#    if !BOOST_COMP_PGI || defined TPR28628 // triggers PGI TPR28628, not atomic until fixed
+#        pragma acc atomic capture
+#    else
+#        pragma message("Atomic exchange will not be atomic because of a compiler bug. Sorry :/")
+#    endif
+                {
+                    old = ref;
+                    ref = value;
+                }
+                return old;
+            }
+        };
+
+        //#############################################################################
+        //! The OpenACC accelerator's atomic operation: AND (stores *addr & value, returns the previous *addr)
+        template<typename T, typename THierarchy>
+        struct AtomicOp<AtomicAnd, AtomicOaccBuiltIn, T, THierarchy>
+        {
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST_ACC static auto atomicOp(AtomicOaccBuiltIn const&, T* const addr, T const& value) -> T
+            {
+                T old;
+                auto& ref(*addr);
+// atomically update ref, but capture the original value in old
+#    pragma acc atomic capture
+                {
+                    old = ref;
+                    ref &= value;
+                }
+                return old;
+            }
+        };
+
+        //#############################################################################
+        //! The OpenACC accelerator's atomic operation: OR (stores *addr | value, returns the previous *addr)
+        template<typename T, typename THierarchy>
+        struct AtomicOp<AtomicOr, AtomicOaccBuiltIn, T, THierarchy>
+        {
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST_ACC static auto atomicOp(AtomicOaccBuiltIn const&, T* const addr, T const& value) -> T
+            {
+                T old;
+                auto& ref(*addr);
+// atomically update ref, but capture the original value in old
+#    pragma acc atomic capture
+                {
+                    old = ref;
+                    ref |= value;
+                }
+                return old;
+            }
+        };
+
+        //#############################################################################
+        //! The OpenACC accelerator's atomic operation: XOR (stores *addr ^ value, returns the previous *addr)
+        template<typename T, typename THierarchy>
+        struct AtomicOp<AtomicXor, AtomicOaccBuiltIn, T, THierarchy>
+        {
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST_ACC static auto atomicOp(AtomicOaccBuiltIn const&, T* const addr, T const& value) -> T
+            {
+                T old;
+                auto& ref(*addr);
+// atomically update ref, but capture the original value in old
+#    pragma acc atomic capture
+                {
+                    old = ref;
+                    ref ^= value;
+                }
+                return old;
+            }
+        };
+
+        //#############################################################################
+        //! The OpenACC accelerator's atomic operation: Min (keeps the smaller of *addr and value, returns old *addr)
+        template<typename T, typename THierarchy>
+        struct AtomicOp<AtomicMin, AtomicOaccBuiltIn, T, THierarchy>
+        {
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST_ACC static auto atomicOp(AtomicOaccBuiltIn const&, T* const addr, T const& value) -> T
+            {
+                T old;
+                auto& ref(*addr);
+// atomically update ref, but capture the original value in old
+#    pragma acc atomic capture
+                {
+                    old = ref;
+                    ref = (ref <= value) ? ref : value;
+                }
+                return old;
+            }
+        };
+
+        //#############################################################################
+        //! The OpenACC accelerator's atomic operation: Max (keeps the larger of *addr and value, returns old *addr)
+        template<typename T, typename THierarchy>
+        struct AtomicOp<AtomicMax, AtomicOaccBuiltIn, T, THierarchy>
+        {
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST_ACC static auto atomicOp(AtomicOaccBuiltIn const&, T* const addr, T const& value) -> T
+            {
+                T old;
+                auto& ref(*addr);
+// atomically update ref, but capture the original value in old
+#    pragma acc atomic capture
+                {
+                    old = ref;
+                    ref = (ref >= value) ? ref : value;
+                }
+                return old;
+            }
+        };
+
+        //#############################################################################
+        //! The OpenACC accelerator's atomic operation: Inc (*addr becomes 0 if *addr >= value, else *addr + 1)
+        template<typename T, typename THierarchy>
+        struct AtomicOp<AtomicInc, AtomicOaccBuiltIn, T, THierarchy>
+        {
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST_ACC static auto atomicOp(AtomicOaccBuiltIn const&, T* const addr, T const& value) -> T
+            {
+                T old;
+                auto& ref(*addr);
+// atomically update ref, but capture the original value in old
+#    pragma acc atomic capture
+                {
+                    old = ref;
+                    ref = ((ref >= value) ? 0 : (ref + 1));
+                }
+                return old;
+            }
+        };
+
+        //#############################################################################
+        //! The OpenACC accelerator's atomic operation: Dec (*addr = value if *addr is 0 or > value, else *addr - 1)
+        template<typename T, typename THierarchy>
+        struct AtomicOp<AtomicDec, AtomicOaccBuiltIn, T, THierarchy>
+        {
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST_ACC static auto atomicOp(AtomicOaccBuiltIn const&, T* const addr, T const& value) -> T
+            {
+                T old;
+                auto& ref(*addr);
+// atomically update ref, but capture the original value in old
+#    pragma acc atomic capture
+                {
+                    old = ref;
+                    ref = ((ref == 0) || (ref > value)) ? value : (ref - 1);
+                }
+                return old;
+            }
+        };
+
+        //#############################################################################
+        //! The OpenACC accelerator's atomic operation: Cas (stores value if *addr == compare, returns old *addr)
+        template<typename T, typename THierarchy>
+        struct AtomicOp<AtomicCas, AtomicOaccBuiltIn, T, THierarchy>
+        {
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST_ACC static auto atomicOp(
+                AtomicOaccBuiltIn const&,
+                T* const addr,
+                T const& compare,
+                T const& value) -> T
+            {
+                T old;
+                auto& ref(*addr);
+// atomically update ref, but capture the original value in old
+#    pragma acc atomic capture
+                {
+                    old = ref;
+                    ref = (ref == compare ? value : ref);
+                }
+                return old;
+            }
+        };
+        // #endif
+    } // namespace traits
+} // namespace alpaka
+
+#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/atomic/AtomicOmpBuiltIn.hpp b/thirdParty/cupla/alpaka/include/alpaka/atomic/AtomicOmpBuiltIn.hpp
index 02cba6acfd..370f3e762c 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/atomic/AtomicOmpBuiltIn.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/atomic/AtomicOmpBuiltIn.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 René Widera
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -11,272 +11,198 @@
 
 #ifdef _OPENMP
 
-#include <alpaka/atomic/Traits.hpp>
-#include <alpaka/atomic/Op.hpp>
+#    include <alpaka/atomic/Op.hpp>
+#    include <alpaka/atomic/Traits.hpp>
 
 namespace alpaka
 {
-    namespace atomic
+    //#############################################################################
+    //! The OpenMP accelerators atomic ops.
+    //
+    //  Atomics can be used in the blocks and threads hierarchy levels.
+    //  Atomics are not guaranteed to be safe between devices or grids.
+    class AtomicOmpBuiltIn
     {
-        //#############################################################################
-        //! The OpenMP accelerators atomic ops.
-        //
-        //  Atomics can be used in the blocks and threads hierarchy levels.
-        //  Atomics are not guaranteed to be safe between devices or grids.
-        class AtomicOmpBuiltIn
-        {
-        public:
-            //-----------------------------------------------------------------------------
-            AtomicOmpBuiltIn() = default;
-            //-----------------------------------------------------------------------------
-            ALPAKA_FN_HOST AtomicOmpBuiltIn(AtomicOmpBuiltIn const &) = delete;
-            //-----------------------------------------------------------------------------
-            ALPAKA_FN_HOST AtomicOmpBuiltIn(AtomicOmpBuiltIn &&) = delete;
-            //-----------------------------------------------------------------------------
-            ALPAKA_FN_HOST auto operator=(AtomicOmpBuiltIn const &) -> AtomicOmpBuiltIn & = delete;
-            //-----------------------------------------------------------------------------
-            ALPAKA_FN_HOST auto operator=(AtomicOmpBuiltIn &&) -> AtomicOmpBuiltIn & = delete;
-            //-----------------------------------------------------------------------------
-            /*virtual*/ ~AtomicOmpBuiltIn() = default;
-        };
-
-        namespace traits
-        {
+    public:
+        //-----------------------------------------------------------------------------
+        AtomicOmpBuiltIn() = default;
+        //-----------------------------------------------------------------------------
+        ALPAKA_FN_HOST AtomicOmpBuiltIn(AtomicOmpBuiltIn const&) = delete;
+        //-----------------------------------------------------------------------------
+        ALPAKA_FN_HOST AtomicOmpBuiltIn(AtomicOmpBuiltIn&&) = delete;
+        //-----------------------------------------------------------------------------
+        ALPAKA_FN_HOST auto operator=(AtomicOmpBuiltIn const&) -> AtomicOmpBuiltIn& = delete;
+        //-----------------------------------------------------------------------------
+        ALPAKA_FN_HOST auto operator=(AtomicOmpBuiltIn&&) -> AtomicOmpBuiltIn& = delete;
+        //-----------------------------------------------------------------------------
+        /*virtual*/ ~AtomicOmpBuiltIn() = default;
+    };
 
+    namespace traits
+    {
 // check for OpenMP 3.1+
 // "omp atomic capture" is not supported before OpenMP 3.1
-#if _OPENMP >= 201107
+#    if _OPENMP >= 201107
 
-            //#############################################################################
-            //! The OpenMP accelerators atomic operation: ADD
-            template<
-                typename T,
-                typename THierarchy>
-            struct AtomicOp<
-                op::Add,
-                atomic::AtomicOmpBuiltIn,
-                T,
-                THierarchy>
+        //#############################################################################
+        //! The OpenMP accelerators atomic operation: ADD
+        template<typename T, typename THierarchy>
+        struct AtomicOp<AtomicAdd, AtomicOmpBuiltIn, T, THierarchy>
+        {
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto atomicOp(AtomicOmpBuiltIn const&, T* const addr, T const& value) -> T
             {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto atomicOp(
-                    atomic::AtomicOmpBuiltIn const &,
-                    T * const addr,
-                    T const & value)
-                -> T
+                T old;
+                auto& ref(*addr);
+// atomically update ref, but capture the original value in old
+#        pragma omp atomic capture
                 {
-                    T old;
-                    auto & ref(*addr);
-                    // atomically update ref, but capture the original value in old
-                    #pragma omp atomic capture
-                    {
-                        old = ref;
-                        ref += value;
-                    }
-                    return old;
+                    old = ref;
+                    ref += value;
                 }
-            };
+                return old;
+            }
+        };
 
-            //#############################################################################
-            //! The OpenMP accelerators atomic operation: SUB
-            template<
-                typename T,
-                typename THierarchy>
-            struct AtomicOp<
-                op::Sub,
-                atomic::AtomicOmpBuiltIn,
-                T,
-                THierarchy>
+        //#############################################################################
+        //! The OpenMP accelerators atomic operation: SUB
+        template<typename T, typename THierarchy>
+        struct AtomicOp<AtomicSub, AtomicOmpBuiltIn, T, THierarchy>
+        {
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto atomicOp(AtomicOmpBuiltIn const&, T* const addr, T const& value) -> T
             {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto atomicOp(
-                    atomic::AtomicOmpBuiltIn const &,
-                    T * const addr,
-                    T const & value)
-                -> T
+                T old;
+                auto& ref(*addr);
+// atomically update ref, but capture the original value in old
+#        pragma omp atomic capture
                 {
-                    T old;
-                    auto & ref(*addr);
-                    // atomically update ref, but capture the original value in old
-                    #pragma omp atomic capture
-                    {
-                        old = ref;
-                        ref -= value;
-                    }
-                    return old;
+                    old = ref;
+                    ref -= value;
                 }
-            };
+                return old;
+            }
+        };
 
-            //#############################################################################
-            //! The OpenMP accelerators atomic operation: EXCH
-            template<
-                typename T,
-                typename THierarchy>
-            struct AtomicOp<
-                op::Exch,
-                atomic::AtomicOmpBuiltIn,
-                T,
-                THierarchy>
+        //#############################################################################
+        //! The OpenMP accelerators atomic operation: EXCH
+        template<typename T, typename THierarchy>
+        struct AtomicOp<AtomicExch, AtomicOmpBuiltIn, T, THierarchy>
+        {
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto atomicOp(AtomicOmpBuiltIn const&, T* const addr, T const& value) -> T
             {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto atomicOp(
-                    atomic::AtomicOmpBuiltIn const &,
-                    T * const addr,
-                    T const & value)
-                -> T
+                T old;
+                auto& ref(*addr);
+// atomically update ref, but capture the original value in old
+#        pragma omp atomic capture
                 {
-                    T old;
-                    auto & ref(*addr);
-                    // atomically update ref, but capture the original value in old
-                    #pragma omp atomic capture
-                    {
-                        old = ref;
-                        ref = value;
-                    }
-                    return old;
+                    old = ref;
+                    ref = value;
                 }
-            };
+                return old;
+            }
+        };
 
-            //#############################################################################
-            //! The OpenMP accelerators atomic operation: AND
-            template<
-                typename T,
-                typename THierarchy>
-            struct AtomicOp<
-                op::And,
-                atomic::AtomicOmpBuiltIn,
-                T,
-                THierarchy>
+        //#############################################################################
+        //! The OpenMP accelerators atomic operation: AND
+        template<typename T, typename THierarchy>
+        struct AtomicOp<AtomicAnd, AtomicOmpBuiltIn, T, THierarchy>
+        {
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto atomicOp(AtomicOmpBuiltIn const&, T* const addr, T const& value) -> T
             {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto atomicOp(
-                    atomic::AtomicOmpBuiltIn const &,
-                    T * const addr,
-                    T const & value)
-                -> T
+                T old;
+                auto& ref(*addr);
+// atomically update ref, but capture the original value in old
+#        pragma omp atomic capture
                 {
-                    T old;
-                    auto & ref(*addr);
-                    // atomically update ref, but capture the original value in old
-                    #pragma omp atomic capture
-                    {
-                        old = ref;
-                        ref &= value;
-                    }
-                    return old;
+                    old = ref;
+                    ref &= value;
                 }
-            };
+                return old;
+            }
+        };
 
-            //#############################################################################
-            //! The OpenMP accelerators atomic operation: OR
-            template<
-                typename T,
-                typename THierarchy>
-            struct AtomicOp<
-                op::Or,
-                atomic::AtomicOmpBuiltIn,
-                T,
-                THierarchy>
+        //#############################################################################
+        //! The OpenMP accelerators atomic operation: OR
+        template<typename T, typename THierarchy>
+        struct AtomicOp<AtomicOr, AtomicOmpBuiltIn, T, THierarchy>
+        {
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto atomicOp(AtomicOmpBuiltIn const&, T* const addr, T const& value) -> T
             {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto atomicOp(
-                    atomic::AtomicOmpBuiltIn const &,
-                    T * const addr,
-                    T const & value)
-                -> T
+                T old;
+                auto& ref(*addr);
+// atomically update ref, but capture the original value in old
+#        pragma omp atomic capture
                 {
-                    T old;
-                    auto & ref(*addr);
-                    // atomically update ref, but capture the original value in old
-                    #pragma omp atomic capture
-                    {
-                        old = ref;
-                        ref |= value;
-                    }
-                    return old;
+                    old = ref;
+                    ref |= value;
                 }
-            };
+                return old;
+            }
+        };
 
-            //#############################################################################
-            //! The OpenMP accelerators atomic operation: XOR
-            template<
-                typename T,
-                typename THierarchy>
-            struct AtomicOp<
-                op::Xor,
-                atomic::AtomicOmpBuiltIn,
-                T,
-                THierarchy>
+        //#############################################################################
+        //! The OpenMP accelerators atomic operation: XOR
+        template<typename T, typename THierarchy>
+        struct AtomicOp<AtomicXor, AtomicOmpBuiltIn, T, THierarchy>
+        {
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto atomicOp(AtomicOmpBuiltIn const&, T* const addr, T const& value) -> T
             {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto atomicOp(
-                    atomic::AtomicOmpBuiltIn const &,
-                    T * const addr,
-                    T const & value)
-                -> T
+                T old;
+                auto& ref(*addr);
+// atomically update ref, but capture the original value in old
+#        pragma omp atomic capture
                 {
-                    T old;
-                    auto & ref(*addr);
-                    // atomically update ref, but capture the original value in old
-                    #pragma omp atomic capture
-                    {
-                        old = ref;
-                        ref ^= value;
-                    }
-                    return old;
+                    old = ref;
+                    ref ^= value;
                 }
-            };
+                return old;
+            }
+        };
 
-#endif // _OPENMP >= 201107
+#    endif // _OPENMP >= 201107
 
-            //#############################################################################
-            //! The OpenMP accelerators atomic operation
-            //
-            // generic implementations for operations where native atomics are not available
-            template<
-                typename TOp,
-                typename T,
-                typename THierarchy>
-            struct AtomicOp<
-                TOp,
-                atomic::AtomicOmpBuiltIn,
-                T,
-                THierarchy>
+        //#############################################################################
+        //! The OpenMP accelerators atomic operation
+        //
+        // generic implementations for operations where native atomics are not available
+        template<typename TOp, typename T, typename THierarchy>
+        struct AtomicOp<TOp, AtomicOmpBuiltIn, T, THierarchy>
+        {
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto atomicOp(AtomicOmpBuiltIn const&, T* const addr, T const& value) -> T
             {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto atomicOp(
-                    atomic::AtomicOmpBuiltIn const &,
-                    T * const addr,
-                    T const & value)
-                -> T
+                T old;
+// \TODO: Currently not only the access to the same memory location is protected by a mutex but all atomic ops on all
+// threads.
+#    pragma omp critical(AlpakaOmpAtomicOp)
                 {
-                    T old;
-                    // \TODO: Currently not only the access to the same memory location is protected by a mutex but all atomic ops on all threads.
-                    #pragma omp critical (AlpakaOmpAtomicOp)
-                    {
-                        old = TOp()(addr, value);
-                    }
-                    return old;
+                    old = TOp()(addr, value);
                 }
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto atomicOp(
-                    atomic::AtomicOmpBuiltIn const &,
-                    T * const addr,
-                    T const & compare,
-                    T const & value)
-                -> T
+                return old;
+            }
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto atomicOp(
+                AtomicOmpBuiltIn const&,
+                T* const addr,
+                T const& compare,
+                T const& value) -> T
+            {
+                T old;
+// \TODO: Currently not only the access to the same memory location is protected by a mutex but all atomic ops on all
+// threads.
+#    pragma omp critical(AlpakaOmpAtomicOp2)
                 {
-                    T old;
-                    // \TODO: Currently not only the access to the same memory location is protected by a mutex but all atomic ops on all threads.
-                    #pragma omp critical (AlpakaOmpAtomicOp2)
-                    {
-                        old = TOp()(addr, compare, value);
-                    }
-                    return old;
+                    old = TOp()(addr, compare, value);
                 }
-            };
-        }
-    }
-}
+                return old;
+            }
+        };
+    } // namespace traits
+} // namespace alpaka
 
 #endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/atomic/AtomicStdLibLock.hpp b/thirdParty/cupla/alpaka/include/alpaka/atomic/AtomicStdLibLock.hpp
index 816149f1b0..6a28f8d9ee 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/atomic/AtomicStdLibLock.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/atomic/AtomicStdLibLock.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Benjamin Worpitz, Matthias Werner, René Widera
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -10,131 +10,113 @@
 #pragma once
 
 #include <alpaka/atomic/Traits.hpp>
-
 #include <alpaka/core/BoostPredef.hpp>
 
-#include <mutex>
 #include <array>
+#include <mutex>
 
 namespace alpaka
 {
-    namespace atomic
+    //#############################################################################
+    //! The CPU threads accelerator atomic ops.
+    //
+    //  Atomics can be used in the grids, blocks and threads hierarchy levels.
+    //  Atomics are not guaranteed to be safe between devices.
+    //
+    // \tparam THashTableSize size of the hash table to allow concurrency between
+    //                        atomics to different addresses
+    template<size_t THashTableSize>
+    class AtomicStdLibLock
     {
-        //#############################################################################
-        //! The CPU threads accelerator atomic ops.
-        //
-        //  Atomics can be used in the grids, blocks and threads hierarchy levels.
-        //  Atomics are not guaranteed to be save between devices.
+    public:
+        template<typename TAtomic, typename TOp, typename T, typename THierarchy, typename TSfinae>
+        friend struct traits::AtomicOp;
+
+        static constexpr size_t nextPowerOf2(size_t const value, size_t const bit = 0u)
+        {
+            return value <= (static_cast<size_t>(1u) << bit) ? (static_cast<size_t>(1u) << bit)
+                                                             : nextPowerOf2(value, bit + 1u);
+        }
+
+        //-----------------------------------------------------------------------------
+        //! get a hash value of the pointer
         //
-        // \tparam THashTableSize size of the hash table to allow concurrency between
-        //                        atomics to different addresses
-        template<size_t THashTableSize>
-        class AtomicStdLibLock
+        // This is not a perfect hash: there will be collisions if the size of the
+        // pointed-to type is not a power of two.
+        template<typename TPtr>
+        static size_t hash(TPtr const* const ptr)
         {
-        public:
-            template<
-                typename TAtomic,
-                typename TOp,
-                typename T,
-                typename THierarchy,
-                typename TSfinae>
-            friend struct atomic::traits::AtomicOp;
+            size_t const ptrAddr = reinterpret_cast<size_t>(ptr);
+            // using power of two for the next division will increase the performance
+            constexpr size_t typeSizePowerOf2 = nextPowerOf2(sizeof(TPtr));
+            // division removes the stride between indices
+            return (ptrAddr / typeSizePowerOf2);
+        }
 
-            static constexpr size_t nextPowerOf2(size_t const value, size_t const bit = 0u)
-            {
-                return value <= (static_cast<size_t>(1u) << bit) ?
-                    (static_cast<size_t>(1u) << bit) : nextPowerOf2(value, bit + 1u);
-            }
+        //-----------------------------------------------------------------------------
+        AtomicStdLibLock() = default;
+        //-----------------------------------------------------------------------------
+        AtomicStdLibLock(AtomicStdLibLock const&) = delete;
+        //-----------------------------------------------------------------------------
+        AtomicStdLibLock(AtomicStdLibLock&&) = delete;
+        //-----------------------------------------------------------------------------
+        auto operator=(AtomicStdLibLock const&) -> AtomicStdLibLock& = delete;
+        //-----------------------------------------------------------------------------
+        auto operator=(AtomicStdLibLock&&) -> AtomicStdLibLock& = delete;
+        //-----------------------------------------------------------------------------
+        /*virtual*/ ~AtomicStdLibLock() = default;
 
+        template<typename TPtr>
+        std::mutex& getMutex(TPtr const* const ptr) const
+        {
             //-----------------------------------------------------------------------------
-            //! get a hash value of the pointer
+            //! get the size of the hash table
             //
-            // This is no perfect hash, there will be collisions if the size of pointer type
-            // is not a power of two.
-            template<typename TPtr>
-            static size_t hash(TPtr const * const ptr)
-            {
-                size_t const ptrAddr = reinterpret_cast< size_t >( ptr );
-                // using power of two for the next division will increase the performance
-                constexpr size_t typeSizePowerOf2 = nextPowerOf2(sizeof(TPtr));
-                // division removes the stride between indices
-                return (ptrAddr / typeSizePowerOf2);
-            }
-
-            //-----------------------------------------------------------------------------
-            AtomicStdLibLock() = default;
-            //-----------------------------------------------------------------------------
-            AtomicStdLibLock(AtomicStdLibLock const &) = delete;
-            //-----------------------------------------------------------------------------
-            AtomicStdLibLock(AtomicStdLibLock &&) = delete;
-            //-----------------------------------------------------------------------------
-            auto operator=(AtomicStdLibLock const &) -> AtomicStdLibLock & = delete;
-            //-----------------------------------------------------------------------------
-            auto operator=(AtomicStdLibLock &&) -> AtomicStdLibLock & = delete;
-            //-----------------------------------------------------------------------------
-            /*virtual*/ ~AtomicStdLibLock() = default;
+            // The size is at least 1 or THashTableSize rounded up to the next power of 2
+            constexpr size_t hashTableSize = THashTableSize == 0u ? 1u : nextPowerOf2(THashTableSize);
 
-            template<typename TPtr>
-            std::mutex & getMutex(TPtr const * const ptr) const
-            {
-                //-----------------------------------------------------------------------------
-                //! get the size of the hash table
-                //
-                // The size is at least 1 or THashTableSize rounded up to the next power of 2
-                constexpr size_t hashTableSize = THashTableSize == 0u ? 1u : nextPowerOf2(THashTableSize);
-
-                size_t const hashedAddr = hash(ptr) & (hashTableSize - 1u);
+            size_t const hashedAddr = hash(ptr) & (hashTableSize - 1u);
 #if BOOST_COMP_CLANG
-    #pragma clang diagnostic push
-    #pragma clang diagnostic ignored "-Wexit-time-destructors"
+#    pragma clang diagnostic push
+#    pragma clang diagnostic ignored "-Wexit-time-destructors"
 #endif
-                static std::array<
-                    std::mutex,
-                    hashTableSize> m_mtxAtomic; //!< The mutex protecting access for an atomic operation.
+            static std::array<
+                std::mutex,
+                hashTableSize>
+                m_mtxAtomic; //!< The mutex protecting access for an atomic operation.
 #if BOOST_COMP_CLANG
-    #pragma clang diagnostic pop
+#    pragma clang diagnostic pop
 #endif
-                return m_mtxAtomic[hashedAddr];
-            }
-        };
+            return m_mtxAtomic[hashedAddr];
+        }
+    };
 
-        namespace traits
+    namespace traits
+    {
+        //#############################################################################
+        //! The CPU threads accelerator atomic operation.
+        template<typename TOp, typename T, typename THierarchy, size_t THashTableSize>
+        struct AtomicOp<TOp, AtomicStdLibLock<THashTableSize>, T, THierarchy>
         {
-            //#############################################################################
-            //! The CPU threads accelerator atomic operation.
-            template<
-                typename TOp,
-                typename T,
-                typename THierarchy,
-                size_t THashTableSize>
-            struct AtomicOp<
-                TOp,
-                atomic::AtomicStdLibLock<THashTableSize>,
-                T,
-                THierarchy>
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto atomicOp(
+                AtomicStdLibLock<THashTableSize> const& atomic,
+                T* const addr,
+                T const& value) -> T
             {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto atomicOp(
-                    atomic::AtomicStdLibLock<THashTableSize> const & atomic,
-                    T * const addr,
-                    T const & value)
-                -> T
-                {
-                    std::lock_guard<std::mutex> lock(atomic.getMutex(addr));
-                    return TOp()(addr, value);
-                }
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto atomicOp(
-                    atomic::AtomicStdLibLock<THashTableSize> const & atomic,
-                    T * const addr,
-                    T const & compare,
-                    T const & value)
-                -> T
-                {
-                    std::lock_guard<std::mutex> lock(atomic.getMutex(addr));
-                    return TOp()(addr, compare, value);
-                }
-            };
-        }
-    }
-}
+                std::lock_guard<std::mutex> lock(atomic.getMutex(addr));
+                return TOp()(addr, value);
+            }
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto atomicOp(
+                AtomicStdLibLock<THashTableSize> const& atomic,
+                T* const addr,
+                T const& compare,
+                T const& value) -> T
+            {
+                std::lock_guard<std::mutex> lock(atomic.getMutex(addr));
+                return TOp()(addr, compare, value);
+            }
+        };
+    } // namespace traits
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/include/alpaka/atomic/AtomicUniformCudaHipBuiltIn.hpp b/thirdParty/cupla/alpaka/include/alpaka/atomic/AtomicUniformCudaHipBuiltIn.hpp
new file mode 100644
index 0000000000..546379deb3
--- /dev/null
+++ b/thirdParty/cupla/alpaka/include/alpaka/atomic/AtomicUniformCudaHipBuiltIn.hpp
@@ -0,0 +1,917 @@
+/* Copyright 2019 Benjamin Worpitz, René Widera
+ *
+ * This file is part of alpaka.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#pragma once
+
+#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
+
+#    include <alpaka/core/BoostPredef.hpp>
+
+#    if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && !BOOST_LANG_CUDA
+#        error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
+#    endif
+
+#    if defined(ALPAKA_ACC_GPU_HIP_ENABLED) && !BOOST_LANG_HIP
+#        error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
+#    endif
+
+#    include <alpaka/atomic/Op.hpp>
+#    include <alpaka/atomic/Traits.hpp>
+#    include <alpaka/core/Unused.hpp>
+#    include <alpaka/meta/DependentFalseType.hpp>
+
+#    include <climits>
+
+namespace alpaka
+{
+    //#############################################################################
+    //! The GPU CUDA/HIP accelerator atomic ops.
+    //
+    //  Atomics can be used in the hierarchy level grids, blocks and threads.
+    //  Atomics are not guaranteed to be safe between devices.
+    class AtomicUniformCudaHipBuiltIn
+    {
+    public:
+        //-----------------------------------------------------------------------------
+        AtomicUniformCudaHipBuiltIn() = default;
+        //-----------------------------------------------------------------------------
+        __device__ AtomicUniformCudaHipBuiltIn(AtomicUniformCudaHipBuiltIn const&) = delete;
+        //-----------------------------------------------------------------------------
+        __device__ AtomicUniformCudaHipBuiltIn(AtomicUniformCudaHipBuiltIn&&) = delete;
+        //-----------------------------------------------------------------------------
+        __device__ auto operator=(AtomicUniformCudaHipBuiltIn const&) -> AtomicUniformCudaHipBuiltIn& = delete;
+        //-----------------------------------------------------------------------------
+        __device__ auto operator=(AtomicUniformCudaHipBuiltIn&&) -> AtomicUniformCudaHipBuiltIn& = delete;
+        //-----------------------------------------------------------------------------
+        /*virtual*/ ~AtomicUniformCudaHipBuiltIn() = default;
+    };
+
+    namespace traits
+    {
+        //#############################################################################
+        //! The specializations to execute the requested atomic ops of the CUDA/HIP accelerator.
+        // See: http://docs.nvidia.com/cuda/cuda-c-programming-guide/#atomic-functions how to implement everything with
+        // CAS
+
+        //-----------------------------------------------------------------------------
+        // Add.
+
+        //-----------------------------------------------------------------------------
+        //! The GPU CUDA/HIP accelerator atomic operation.
+        template<typename THierarchy>
+        struct AtomicOp<AtomicAdd, AtomicUniformCudaHipBuiltIn, int, THierarchy>
+        {
+            //-----------------------------------------------------------------------------
+            __device__ static auto atomicOp(AtomicUniformCudaHipBuiltIn const&, int* const addr, int const& value)
+                -> int
+            {
+                return ::atomicAdd(addr, value);
+            }
+        };
+        //-----------------------------------------------------------------------------
+        //! The GPU CUDA/HIP accelerator atomic operation.
+        template<typename THierarchy>
+        struct AtomicOp<AtomicAdd, AtomicUniformCudaHipBuiltIn, unsigned int, THierarchy>
+        {
+            //-----------------------------------------------------------------------------
+            __device__ static auto atomicOp(
+                AtomicUniformCudaHipBuiltIn const&,
+                unsigned int* const addr,
+                unsigned int const& value) -> unsigned int
+            {
+                return ::atomicAdd(addr, value);
+            }
+        };
+        //-----------------------------------------------------------------------------
+        //! The GPU CUDA/HIP accelerator atomic add for unsigned long int.
+        template<typename THierarchy>
+        struct AtomicOp<AtomicAdd, AtomicUniformCudaHipBuiltIn, unsigned long int, THierarchy>
+        {
+            //-----------------------------------------------------------------------------
+            // Uses ::atomicAdd on unsigned int if UINT_MAX == ULONG_MAX, otherwise on unsigned long long int.
+            __device__ static auto atomicOp(
+                AtomicUniformCudaHipBuiltIn const&,
+                unsigned long int* const addr,
+                unsigned long int const& value) -> unsigned long int
+            {
+#    if UINT_MAX == ULONG_MAX // LLP64
+                return ::atomicAdd(reinterpret_cast<unsigned int*>(addr), static_cast<unsigned int>(value));
+#    else // ULONG_MAX == ULLONG_MAX LP64
+                return ::atomicAdd(
+                    reinterpret_cast<unsigned long long int*>(addr),
+                    static_cast<unsigned long long int>(value));
+#    endif
+            }
+        };
+        //-----------------------------------------------------------------------------
+        //! The GPU CUDA/HIP accelerator atomic operation.
+        template<typename THierarchy>
+        struct AtomicOp<AtomicAdd, AtomicUniformCudaHipBuiltIn, unsigned long long int, THierarchy>
+        {
+            //-----------------------------------------------------------------------------
+            //
+            __device__ static auto atomicOp(
+                AtomicUniformCudaHipBuiltIn const&,
+                unsigned long long int* const addr,
+                unsigned long long int const& value) -> unsigned long long int
+            {
+                return ::atomicAdd(addr, value);
+            }
+        };
+        //-----------------------------------------------------------------------------
+        //! The GPU CUDA/HIP accelerator atomic operation.
+        template<typename THierarchy>
+        struct AtomicOp<AtomicAdd, AtomicUniformCudaHipBuiltIn, float, THierarchy>
+        {
+            //-----------------------------------------------------------------------------
+            //
+            __device__ static auto atomicOp(AtomicUniformCudaHipBuiltIn const&, float* const addr, float const& value)
+                -> float
+            {
+                return ::atomicAdd(addr, value);
+            }
+        };
+        //-----------------------------------------------------------------------------
+        //! The GPU CUDA/HIP accelerator atomic operation.
+        template<typename THierarchy>
+        struct AtomicOp<AtomicAdd, AtomicUniformCudaHipBuiltIn, double, THierarchy>
+        {
+            //-----------------------------------------------------------------------------
+            __device__ static auto atomicOp(
+                AtomicUniformCudaHipBuiltIn const&,
+                double* const addr,
+                double const& value) -> double
+            {
+#    if BOOST_ARCH_PTX >= BOOST_VERSION_NUMBER(6, 0, 0)
+                return ::atomicAdd(addr, value);
+#    else
+                // Code from: http://docs.nvidia.com/cuda/cuda-c-programming-guide/#atomic-functions
+
+                unsigned long long int* address_as_ull(reinterpret_cast<unsigned long long int*>(addr));
+                unsigned long long int old(*address_as_ull);
+                unsigned long long int assumed;
+                do
+                {
+                    assumed = old;
+                    old = ::atomicCAS(
+                        address_as_ull,
+                        assumed,
+                        static_cast<unsigned long long>(
+                            __double_as_longlong(value + __longlong_as_double(static_cast<long long>(assumed)))));
+                    // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
+                } while(assumed != old);
+                return __longlong_as_double(static_cast<long long>(old));
+#    endif
+            }
+        };
+
+        //-----------------------------------------------------------------------------
+        // Sub.
+
+        //-----------------------------------------------------------------------------
+        //! The GPU CUDA/HIP accelerator atomic operation.
+        template<typename THierarchy>
+        struct AtomicOp<AtomicSub, AtomicUniformCudaHipBuiltIn, int, THierarchy>
+        {
+            //-----------------------------------------------------------------------------
+            __device__ static auto atomicOp(AtomicUniformCudaHipBuiltIn const&, int* const addr, int const& value)
+                -> int
+            {
+                return ::atomicSub(addr, value);
+            }
+        };
+        //-----------------------------------------------------------------------------
+        //! The GPU CUDA/HIP accelerator atomic operation.
+        template<typename THierarchy>
+        struct AtomicOp<AtomicSub, AtomicUniformCudaHipBuiltIn, unsigned int, THierarchy>
+        {
+            //-----------------------------------------------------------------------------
+            __device__ static auto atomicOp(
+                AtomicUniformCudaHipBuiltIn const&,
+                unsigned int* const addr,
+                unsigned int const& value) -> unsigned int
+            {
+                return ::atomicSub(addr, value);
+            }
+        };
+        //-----------------------------------------------------------------------------
+        //! The GPU CUDA/HIP accelerator atomic sub for unsigned long int.
+        template<typename THierarchy>
+        struct AtomicOp<AtomicSub, AtomicUniformCudaHipBuiltIn, unsigned long int, THierarchy>
+        {
+            //-----------------------------------------------------------------------------
+            // Uses ::atomicSub on unsigned int if UINT_MAX == ULONG_MAX; otherwise hits the static_assert below.
+            __device__ static auto atomicOp(
+                AtomicUniformCudaHipBuiltIn const&,
+                unsigned long int* const addr,
+                unsigned long int const& value) -> unsigned long int
+            {
+#    if UINT_MAX == ULONG_MAX // LLP64
+                return ::atomicSub(reinterpret_cast<unsigned int*>(addr), static_cast<unsigned int>(value));
+#    else // ULONG_MAX == ULLONG_MAX LP64
+                alpaka::ignore_unused(addr);
+                alpaka::ignore_unused(value);
+                static_assert(
+                    meta::DependentFalseType<THierarchy>::value,
+                    "atomicOp<AtomicSub, AtomicUniformCudaHipBuiltIn, unsigned long int> is only supported when "
+                    "sizeof(unsigned long int) == 4");
+#    endif
+            }
+        };
+
+        //-----------------------------------------------------------------------------
+        // Min.
+
+        //-----------------------------------------------------------------------------
+        //! The GPU CUDA/HIP accelerator atomic operation.
+        template<typename THierarchy>
+        struct AtomicOp<AtomicMin, AtomicUniformCudaHipBuiltIn, int, THierarchy>
+        {
+            //-----------------------------------------------------------------------------
+            __device__ static auto atomicOp(AtomicUniformCudaHipBuiltIn const&, int* const addr, int const& value)
+                -> int
+            {
+                return ::atomicMin(addr, value);
+            }
+        };
+        //-----------------------------------------------------------------------------
+        //! The GPU CUDA/HIP accelerator atomic operation.
+        template<typename THierarchy>
+        struct AtomicOp<AtomicMin, AtomicUniformCudaHipBuiltIn, unsigned int, THierarchy>
+        {
+            //-----------------------------------------------------------------------------
+            __device__ static auto atomicOp(
+                AtomicUniformCudaHipBuiltIn const&,
+                unsigned int* const addr,
+                unsigned int const& value) -> unsigned int
+            {
+                return ::atomicMin(addr, value);
+            }
+        };
+        //-----------------------------------------------------------------------------
+        //! The GPU CUDA/HIP accelerator atomic min for unsigned long int.
+        template<typename THierarchy>
+        struct AtomicOp<AtomicMin, AtomicUniformCudaHipBuiltIn, unsigned long int, THierarchy>
+        {
+            //-----------------------------------------------------------------------------
+            // Uses ::atomicMin on unsigned int if UINT_MAX == ULONG_MAX, else on unsigned long long int (PTX >= 3.5.0).
+            __device__ static auto atomicOp(
+                AtomicUniformCudaHipBuiltIn const&,
+                unsigned long int* const addr,
+                unsigned long int const& value) -> unsigned long int
+            {
+#    if UINT_MAX == ULONG_MAX // LLP64
+                return ::atomicMin(reinterpret_cast<unsigned int*>(addr), static_cast<unsigned int>(value));
+#    else // ULONG_MAX == ULLONG_MAX LP64
+#        if BOOST_ARCH_PTX >= BOOST_VERSION_NUMBER(3, 5, 0)
+                return ::atomicMin(
+                    reinterpret_cast<unsigned long long int*>(addr),
+                    static_cast<unsigned long long int>(value));
+#        else
+                alpaka::ignore_unused(addr);
+                alpaka::ignore_unused(value);
+                static_assert(
+                    meta::DependentFalseType<THierarchy>::value,
+                    "atomicOp<AtomicMin, AtomicUniformCudaHipBuiltIn, unsigned long int> is only supported on sm >= "
+                    "3.5");
+#        endif
+#    endif
+            }
+        };
+        //-----------------------------------------------------------------------------
+        //! The GPU CUDA/HIP accelerator atomic operation.
+        template<typename THierarchy>
+        struct AtomicOp<AtomicMin, AtomicUniformCudaHipBuiltIn, unsigned long long int, THierarchy>
+        {
+            //-----------------------------------------------------------------------------
+            __device__ static auto atomicOp(
+                AtomicUniformCudaHipBuiltIn const&,
+                unsigned long long int* const addr,
+                unsigned long long int const& value) -> unsigned long long int
+            {
+#    if BOOST_ARCH_PTX >= BOOST_VERSION_NUMBER(3, 5, 0)
+                return ::atomicMin(addr, value);
+#    else
+                alpaka::ignore_unused(addr);
+                alpaka::ignore_unused(value);
+                static_assert(
+                    meta::DependentFalseType<THierarchy>::value,
+                    "atomicOp<AtomicMin, AtomicUniformCudaHipBuiltIn, unsigned long long int> is only supported on sm "
+                    ">= 3.5");
+#    endif
+            }
+        };
+
+        //-----------------------------------------------------------------------------
+        // Max.
+
+        //-----------------------------------------------------------------------------
+        //! The GPU CUDA/HIP accelerator atomic operation.
+        template<typename THierarchy>
+        struct AtomicOp<AtomicMax, AtomicUniformCudaHipBuiltIn, int, THierarchy>
+        {
+            //-----------------------------------------------------------------------------
+            __device__ static auto atomicOp(AtomicUniformCudaHipBuiltIn const&, int* const addr, int const& value)
+                -> int
+            {
+                return ::atomicMax(addr, value);
+            }
+        };
+        //-----------------------------------------------------------------------------
+        //! The GPU CUDA/HIP accelerator atomic operation.
+        template<typename THierarchy>
+        struct AtomicOp<AtomicMax, AtomicUniformCudaHipBuiltIn, unsigned int, THierarchy>
+        {
+            __device__ static auto atomicOp(
+                AtomicUniformCudaHipBuiltIn const&,
+                unsigned int* const addr,
+                unsigned int const& value) -> unsigned int
+            {
+                return ::atomicMax(addr, value);
+            }
+        };
+        //-----------------------------------------------------------------------------
+        //! The GPU CUDA/HIP accelerator atomic max for unsigned long int.
+        template<typename THierarchy>
+        struct AtomicOp<AtomicMax, AtomicUniformCudaHipBuiltIn, unsigned long int, THierarchy>
+        {
+            //-----------------------------------------------------------------------------
+            // Uses ::atomicMax on unsigned int if UINT_MAX == ULONG_MAX, else on unsigned long long int (PTX >= 3.5.0).
+            __device__ static auto atomicOp(
+                AtomicUniformCudaHipBuiltIn const&,
+                unsigned long int* const addr,
+                unsigned long int const& value) -> unsigned long int
+            {
+#    if UINT_MAX == ULONG_MAX // LLP64
+                return ::atomicMax(reinterpret_cast<unsigned int*>(addr), static_cast<unsigned int>(value));
+#    else // ULONG_MAX == ULLONG_MAX LP64
+#        if BOOST_ARCH_PTX >= BOOST_VERSION_NUMBER(3, 5, 0)
+                return ::atomicMax(
+                    reinterpret_cast<unsigned long long int*>(addr),
+                    static_cast<unsigned long long int>(value));
+#        else
+                alpaka::ignore_unused(addr);
+                alpaka::ignore_unused(value);
+                static_assert(
+                    meta::DependentFalseType<THierarchy>::value,
+                    "atomicOp<AtomicMax, AtomicUniformCudaHipBuiltIn, unsigned long int> is only supported on sm >= "
+                    "3.5");
+#        endif
+#    endif
+            }
+        };
+        //-----------------------------------------------------------------------------
+        //! The GPU CUDA/HIP accelerator atomic operation.
+        template<typename THierarchy>
+        struct AtomicOp<AtomicMax, AtomicUniformCudaHipBuiltIn, unsigned long long int, THierarchy>
+        {
+            //-----------------------------------------------------------------------------
+            __device__ static auto atomicOp(
+                AtomicUniformCudaHipBuiltIn const&,
+                unsigned long long int* const addr,
+                unsigned long long int const& value) -> unsigned long long int
+            {
+#    if BOOST_ARCH_PTX >= BOOST_VERSION_NUMBER(3, 5, 0)
+                return ::atomicMax(addr, value);
+#    else
+                alpaka::ignore_unused(addr);
+                alpaka::ignore_unused(value);
+                static_assert(
+                    meta::DependentFalseType<THierarchy>::value,
+                    "atomicOp<AtomicMax, AtomicUniformCudaHipBuiltIn, unsigned long long int> is only supported on sm "
+                    ">= 3.5");
+#    endif
+            }
+        };
+
+        //-----------------------------------------------------------------------------
+        // Exch.
+
+        //-----------------------------------------------------------------------------
+        //! The GPU CUDA/HIP accelerator atomic operation.
+        template<typename THierarchy>
+        struct AtomicOp<AtomicExch, AtomicUniformCudaHipBuiltIn, int, THierarchy>
+        {
+            //-----------------------------------------------------------------------------
+            __device__ static auto atomicOp(AtomicUniformCudaHipBuiltIn const&, int* const addr, int const& value)
+                -> int
+            {
+                return ::atomicExch(addr, value);
+            }
+        };
+        //-----------------------------------------------------------------------------
+        //! The GPU CUDA/HIP accelerator atomic operation.
+        template<typename THierarchy>
+        struct AtomicOp<AtomicExch, AtomicUniformCudaHipBuiltIn, unsigned int, THierarchy>
+        {
+            //-----------------------------------------------------------------------------
+            __device__ static auto atomicOp(
+                AtomicUniformCudaHipBuiltIn const&,
+                unsigned int* const addr,
+                unsigned int const& value) -> unsigned int
+            {
+                return ::atomicExch(addr, value);
+            }
+        };
+        //-----------------------------------------------------------------------------
+        //! The GPU CUDA/HIP accelerator atomic exchange for unsigned long int.
+        template<typename THierarchy>
+        struct AtomicOp<AtomicExch, AtomicUniformCudaHipBuiltIn, unsigned long int, THierarchy>
+        {
+            //-----------------------------------------------------------------------------
+            // Uses ::atomicExch on unsigned int if UINT_MAX == ULONG_MAX, otherwise on unsigned long long int.
+            __device__ static auto atomicOp(
+                AtomicUniformCudaHipBuiltIn const&,
+                unsigned long int* const addr,
+                unsigned long int const& value) -> unsigned long int
+            {
+#    if UINT_MAX == ULONG_MAX // LLP64
+                return ::atomicExch(reinterpret_cast<unsigned int*>(addr), static_cast<unsigned int>(value));
+#    else // ULONG_MAX == ULLONG_MAX LP64
+                return ::atomicExch(
+                    reinterpret_cast<unsigned long long int*>(addr),
+                    static_cast<unsigned long long int>(value));
+#    endif
+            }
+        };
+        //-----------------------------------------------------------------------------
+        //! The CUDA/HIP built-in atomic exchange for unsigned long long int: forwards to ::atomicExch.
+        template<typename THierarchy>
+        struct AtomicOp<AtomicExch, AtomicUniformCudaHipBuiltIn, unsigned long long int, THierarchy>
+        {
+            //-----------------------------------------------------------------------------
+            __device__ static auto atomicOp(
+                AtomicUniformCudaHipBuiltIn const&,
+                unsigned long long int* const addr,
+                unsigned long long int const& value) -> unsigned long long int
+            {
+                return ::atomicExch(addr, value);
+            }
+        };
+        //-----------------------------------------------------------------------------
+        //! The CUDA/HIP built-in atomic exchange for float: forwards to ::atomicExch.
+        template<typename THierarchy>
+        struct AtomicOp<AtomicExch, AtomicUniformCudaHipBuiltIn, float, THierarchy>
+        {
+            //-----------------------------------------------------------------------------
+            __device__ static auto atomicOp(AtomicUniformCudaHipBuiltIn const&, float* const addr, float const& value)
+                -> float
+            {
+                return ::atomicExch(addr, value);
+            }
+        };
+
+        //-----------------------------------------------------------------------------
+        // Inc.
+
+        //-----------------------------------------------------------------------------
+        //! The CUDA/HIP built-in atomic increment for unsigned int: forwards to ::atomicInc.
+        template<typename THierarchy>
+        struct AtomicOp<AtomicInc, AtomicUniformCudaHipBuiltIn, unsigned int, THierarchy>
+        {
+            //-----------------------------------------------------------------------------
+            __device__ static auto atomicOp(
+                AtomicUniformCudaHipBuiltIn const&,
+                unsigned int* const addr,
+                unsigned int const& value) -> unsigned int
+            {
+                return ::atomicInc(addr, value);
+            }
+        };
+        //-----------------------------------------------------------------------------
+        //! The CUDA/HIP built-in atomic increment for unsigned long int: ::atomicInc on LLP64 only.
+        template<typename THierarchy>
+        struct AtomicOp<AtomicInc, AtomicUniformCudaHipBuiltIn, unsigned long int, THierarchy>
+        {
+            //-----------------------------------------------------------------------------
+            // LLP64 casts to the unsigned int built-in; the LP64 branch only holds a static_assert (unsupported).
+            __device__ static auto atomicOp(
+                AtomicUniformCudaHipBuiltIn const&,
+                unsigned long int* const addr,
+                unsigned long int const& value) -> unsigned long int
+            {
+#    if UINT_MAX == ULONG_MAX // LLP64
+                return ::atomicInc(reinterpret_cast<unsigned int*>(addr), static_cast<unsigned int>(value));
+#    else // ULONG_MAX == ULLONG_MAX LP64
+                alpaka::ignore_unused(addr);
+                alpaka::ignore_unused(value);
+                static_assert(
+                    meta::DependentFalseType<THierarchy>::value,
+                    "atomicOp<AtomicInc, AtomicUniformCudaHipBuiltIn, unsigned long int> is only supported when "
+                    "sizeof(unsigned long int) == 4");
+#    endif
+            }
+        };
+
+        //-----------------------------------------------------------------------------
+        // Dec.
+
+        //-----------------------------------------------------------------------------
+        //! The CUDA/HIP built-in atomic decrement for unsigned int: forwards to ::atomicDec.
+        template<typename THierarchy>
+        struct AtomicOp<AtomicDec, AtomicUniformCudaHipBuiltIn, unsigned int, THierarchy>
+        {
+            //-----------------------------------------------------------------------------
+            __device__ static auto atomicOp(
+                AtomicUniformCudaHipBuiltIn const&,
+                unsigned int* const addr,
+                unsigned int const& value) -> unsigned int
+            {
+                return ::atomicDec(addr, value);
+            }
+        };
+        //-----------------------------------------------------------------------------
+        //! The CUDA/HIP built-in atomic decrement for unsigned long int: ::atomicDec on LLP64 only.
+        template<typename THierarchy>
+        struct AtomicOp<AtomicDec, AtomicUniformCudaHipBuiltIn, unsigned long int, THierarchy>
+        {
+            //-----------------------------------------------------------------------------
+            // LLP64 casts to the unsigned int built-in; the LP64 branch only holds a static_assert (unsupported).
+            __device__ static auto atomicOp(
+                AtomicUniformCudaHipBuiltIn const&,
+                unsigned long int* const addr,
+                unsigned long int const& value) -> unsigned long int
+            {
+#    if UINT_MAX == ULONG_MAX // LLP64
+                return ::atomicDec(reinterpret_cast<unsigned int*>(addr), static_cast<unsigned int>(value));
+#    else // ULONG_MAX == ULLONG_MAX LP64
+                alpaka::ignore_unused(addr);
+                alpaka::ignore_unused(value);
+                static_assert(
+                    meta::DependentFalseType<THierarchy>::value,
+                    "atomicOp<AtomicDec, AtomicUniformCudaHipBuiltIn, unsigned long int> is only supported when "
+                    "sizeof(unsigned long int) == 4");
+#    endif
+            }
+        };
+
+        //-----------------------------------------------------------------------------
+        // And.
+
+        //-----------------------------------------------------------------------------
+        //! The CUDA/HIP built-in atomic bitwise AND for int: forwards to ::atomicAnd.
+        template<typename THierarchy>
+        struct AtomicOp<AtomicAnd, AtomicUniformCudaHipBuiltIn, int, THierarchy>
+        {
+            //-----------------------------------------------------------------------------
+            __device__ static auto atomicOp(AtomicUniformCudaHipBuiltIn const&, int* const addr, int const& value)
+                -> int
+            {
+                return ::atomicAnd(addr, value);
+            }
+        };
+        //-----------------------------------------------------------------------------
+        //! The CUDA/HIP built-in atomic bitwise AND for unsigned int: forwards to ::atomicAnd.
+        template<typename THierarchy>
+        struct AtomicOp<AtomicAnd, AtomicUniformCudaHipBuiltIn, unsigned int, THierarchy>
+        {
+            //-----------------------------------------------------------------------------
+            __device__ static auto atomicOp(
+                AtomicUniformCudaHipBuiltIn const&,
+                unsigned int* const addr,
+                unsigned int const& value) -> unsigned int
+            {
+                return ::atomicAnd(addr, value);
+            }
+        };
+        //-----------------------------------------------------------------------------
+        //! The CUDA/HIP built-in atomic bitwise AND for unsigned long int: forwards to ::atomicAnd.
+        template<typename THierarchy>
+        struct AtomicOp<AtomicAnd, AtomicUniformCudaHipBuiltIn, unsigned long int, THierarchy>
+        {
+            //-----------------------------------------------------------------------------
+            // Uses the same-width built-in; the 64 bit (LP64) call is compiled only if BOOST_ARCH_PTX >= 3.5.0.
+            __device__ static auto atomicOp(
+                AtomicUniformCudaHipBuiltIn const&,
+                unsigned long int* const addr,
+                unsigned long int const& value) -> unsigned long int
+            {
+#    if UINT_MAX == ULONG_MAX // LLP64
+                return ::atomicAnd(reinterpret_cast<unsigned int*>(addr), static_cast<unsigned int>(value));
+#    else // ULONG_MAX == ULLONG_MAX LP64
+#        if BOOST_ARCH_PTX >= BOOST_VERSION_NUMBER(3, 5, 0)
+                return ::atomicAnd(
+                    reinterpret_cast<unsigned long long int*>(addr),
+                    static_cast<unsigned long long int>(value));
+#        else
+                alpaka::ignore_unused(addr);
+                alpaka::ignore_unused(value);
+                static_assert(
+                    meta::DependentFalseType<THierarchy>::value,
+                    "atomicOp<AtomicAnd, AtomicUniformCudaHipBuiltIn, unsigned long int> is only supported on sm >= "
+                    "3.5");
+#        endif
+#    endif
+            }
+        };
+        //-----------------------------------------------------------------------------
+        //! CUDA/HIP built-in atomic AND for unsigned long long int: ::atomicAnd if BOOST_ARCH_PTX >= 3.5.0.
+        template<typename THierarchy>
+        struct AtomicOp<AtomicAnd, AtomicUniformCudaHipBuiltIn, unsigned long long int, THierarchy>
+        {
+            //-----------------------------------------------------------------------------
+            __device__ static auto atomicOp(
+                AtomicUniformCudaHipBuiltIn const&,
+                unsigned long long int* const addr,
+                unsigned long long int const& value) -> unsigned long long int
+            {
+#    if BOOST_ARCH_PTX >= BOOST_VERSION_NUMBER(3, 5, 0)
+                return ::atomicAnd(addr, value);
+#    else
+                alpaka::ignore_unused(addr);
+                alpaka::ignore_unused(value);
+                static_assert(
+                    meta::DependentFalseType<THierarchy>::value,
+                    "atomicOp<AtomicAnd, AtomicUniformCudaHipBuiltIn, unsigned long long int> is only supported on sm "
+                    ">= 3.5");
+#    endif
+            }
+        };
+
+        //-----------------------------------------------------------------------------
+        // Or.
+
+        //-----------------------------------------------------------------------------
+        //! The CUDA/HIP built-in atomic bitwise OR for int: forwards to ::atomicOr.
+        template<typename THierarchy>
+        struct AtomicOp<AtomicOr, AtomicUniformCudaHipBuiltIn, int, THierarchy>
+        {
+            //-----------------------------------------------------------------------------
+            __device__ static auto atomicOp(AtomicUniformCudaHipBuiltIn const&, int* const addr, int const& value)
+                -> int
+            {
+                return ::atomicOr(addr, value);
+            }
+        };
+        //-----------------------------------------------------------------------------
+        //! The CUDA/HIP built-in atomic bitwise OR for unsigned int: forwards to ::atomicOr.
+        template<typename THierarchy>
+        struct AtomicOp<AtomicOr, AtomicUniformCudaHipBuiltIn, unsigned int, THierarchy>
+        {
+            //-----------------------------------------------------------------------------
+            __device__ static auto atomicOp(
+                AtomicUniformCudaHipBuiltIn const&,
+                unsigned int* const addr,
+                unsigned int const& value) -> unsigned int
+            {
+                return ::atomicOr(addr, value);
+            }
+        };
+        //-----------------------------------------------------------------------------
+        //! The CUDA/HIP built-in atomic bitwise OR for unsigned long int: forwards to ::atomicOr.
+        template<typename THierarchy>
+        struct AtomicOp<AtomicOr, AtomicUniformCudaHipBuiltIn, unsigned long int, THierarchy>
+        {
+            //-----------------------------------------------------------------------------
+            // Uses the same-width built-in; the 64 bit (LP64) call is compiled only if BOOST_ARCH_PTX >= 3.5.0.
+            __device__ static auto atomicOp(
+                AtomicUniformCudaHipBuiltIn const&,
+                unsigned long int* const addr,
+                unsigned long int const& value) -> unsigned long int
+            {
+#    if UINT_MAX == ULONG_MAX // LLP64
+                return ::atomicOr(reinterpret_cast<unsigned int*>(addr), static_cast<unsigned int>(value));
+#    else // ULONG_MAX == ULLONG_MAX LP64
+#        if BOOST_ARCH_PTX >= BOOST_VERSION_NUMBER(3, 5, 0)
+                return ::atomicOr(
+                    reinterpret_cast<unsigned long long int*>(addr),
+                    static_cast<unsigned long long int>(value));
+#        else
+                alpaka::ignore_unused(addr);
+                alpaka::ignore_unused(value);
+                static_assert(
+                    meta::DependentFalseType<THierarchy>::value,
+                    "atomicOp<AtomicOr, AtomicUniformCudaHipBuiltIn, unsigned long int> is only supported on sm >= "
+                    "3.5");
+#        endif
+#    endif
+            }
+        };
+        //-----------------------------------------------------------------------------
+        //! CUDA/HIP built-in atomic OR for unsigned long long int: ::atomicOr if BOOST_ARCH_PTX >= 3.5.0.
+        template<typename THierarchy>
+        struct AtomicOp<AtomicOr, AtomicUniformCudaHipBuiltIn, unsigned long long int, THierarchy>
+        {
+            //-----------------------------------------------------------------------------
+            __device__ static auto atomicOp(
+                AtomicUniformCudaHipBuiltIn const&,
+                unsigned long long int* const addr,
+                unsigned long long int const& value) -> unsigned long long int
+            {
+#    if BOOST_ARCH_PTX >= BOOST_VERSION_NUMBER(3, 5, 0)
+                return ::atomicOr(addr, value);
+#    else
+                alpaka::ignore_unused(addr);
+                alpaka::ignore_unused(value);
+                static_assert(
+                    meta::DependentFalseType<THierarchy>::value,
+                    "atomicOp<AtomicOr, AtomicUniformCudaHipBuiltIn, unsigned long long int> is only supported on sm "
+                    ">= 3.5");
+#    endif
+            }
+        };
+
+        //-----------------------------------------------------------------------------
+        // Xor.
+
+        //-----------------------------------------------------------------------------
+        //! The CUDA/HIP built-in atomic bitwise XOR for int: forwards to ::atomicXor.
+        template<typename THierarchy>
+        struct AtomicOp<AtomicXor, AtomicUniformCudaHipBuiltIn, int, THierarchy>
+        {
+            //-----------------------------------------------------------------------------
+            __device__ static auto atomicOp(AtomicUniformCudaHipBuiltIn const&, int* const addr, int const& value)
+                -> int
+            {
+                return ::atomicXor(addr, value);
+            }
+        };
+        //-----------------------------------------------------------------------------
+        //! The CUDA/HIP built-in atomic bitwise XOR for unsigned int: forwards to ::atomicXor.
+        template<typename THierarchy>
+        struct AtomicOp<AtomicXor, AtomicUniformCudaHipBuiltIn, unsigned int, THierarchy>
+        {
+            //-----------------------------------------------------------------------------
+            __device__ static auto atomicOp(
+                AtomicUniformCudaHipBuiltIn const&,
+                unsigned int* const addr,
+                unsigned int const& value) -> unsigned int
+            {
+                return ::atomicXor(addr, value);
+            }
+        };
+        //-----------------------------------------------------------------------------
+        //! The CUDA/HIP built-in atomic bitwise XOR for unsigned long int: forwards to ::atomicXor.
+        template<typename THierarchy>
+        struct AtomicOp<AtomicXor, AtomicUniformCudaHipBuiltIn, unsigned long int, THierarchy>
+        {
+            //-----------------------------------------------------------------------------
+            // Uses the same-width built-in; the 64 bit (LP64) call is compiled only if BOOST_ARCH_PTX >= 3.5.0.
+            __device__ static auto atomicOp(
+                AtomicUniformCudaHipBuiltIn const&,
+                unsigned long int* const addr,
+                unsigned long int const& value) -> unsigned long int
+            {
+#    if UINT_MAX == ULONG_MAX // LLP64
+                return ::atomicXor(reinterpret_cast<unsigned int*>(addr), static_cast<unsigned int>(value));
+#    else // ULONG_MAX == ULLONG_MAX LP64
+#        if BOOST_ARCH_PTX >= BOOST_VERSION_NUMBER(3, 5, 0)
+                return ::atomicXor(
+                    reinterpret_cast<unsigned long long int*>(addr),
+                    static_cast<unsigned long long int>(value));
+#        else
+                alpaka::ignore_unused(addr);
+                alpaka::ignore_unused(value);
+                static_assert(
+                    meta::DependentFalseType<THierarchy>::value,
+                    "atomicOp<AtomicXor, AtomicUniformCudaHipBuiltIn, unsigned long int> is only supported on sm >= "
+                    "3.5");
+#        endif
+#    endif
+            }
+        };
+        //-----------------------------------------------------------------------------
+        //! CUDA/HIP built-in atomic XOR for unsigned long long int: ::atomicXor if BOOST_ARCH_PTX >= 3.5.0.
+        template<typename THierarchy>
+        struct AtomicOp<AtomicXor, AtomicUniformCudaHipBuiltIn, unsigned long long int, THierarchy>
+        {
+            //-----------------------------------------------------------------------------
+            __device__ static auto atomicOp(
+                AtomicUniformCudaHipBuiltIn const&,
+                unsigned long long int* const addr,
+                unsigned long long int const& value) -> unsigned long long int
+            {
+#    if BOOST_ARCH_PTX >= BOOST_VERSION_NUMBER(3, 5, 0)
+                return ::atomicXor(addr, value);
+#    else
+                alpaka::ignore_unused(addr);
+                alpaka::ignore_unused(value);
+                static_assert(
+                    meta::DependentFalseType<THierarchy>::value,
+                    "atomicOp<AtomicXor, AtomicUniformCudaHipBuiltIn, unsigned long long int> is only supported on sm "
+                    ">= 3.5");
+#    endif
+            }
+        };
+
+        //-----------------------------------------------------------------------------
+        // Cas.
+
+        //-----------------------------------------------------------------------------
+        //! The CUDA/HIP built-in atomic compare-and-swap for int: forwards to ::atomicCAS.
+        template<typename THierarchy>
+        struct AtomicOp<AtomicCas, AtomicUniformCudaHipBuiltIn, int, THierarchy>
+        {
+            //-----------------------------------------------------------------------------
+            __device__ static auto atomicOp(
+                AtomicUniformCudaHipBuiltIn const&,
+                int* const addr,
+                int const& compare,
+                int const& value) -> int
+            {
+                return ::atomicCAS(addr, compare, value);
+            }
+        };
+        //-----------------------------------------------------------------------------
+        //! The CUDA/HIP built-in atomic compare-and-swap for unsigned int: forwards to ::atomicCAS.
+        template<typename THierarchy>
+        struct AtomicOp<AtomicCas, AtomicUniformCudaHipBuiltIn, unsigned int, THierarchy>
+        {
+            //-----------------------------------------------------------------------------
+            __device__ static auto atomicOp(
+                AtomicUniformCudaHipBuiltIn const&,
+                unsigned int* const addr,
+                unsigned int const& compare,
+                unsigned int const& value) -> unsigned int
+            {
+                return ::atomicCAS(addr, compare, value);
+            }
+        };
+        //-----------------------------------------------------------------------------
+        //! The CUDA/HIP built-in atomic compare-and-swap for unsigned long int: forwards to ::atomicCAS.
+        template<typename THierarchy>
+        struct AtomicOp<AtomicCas, AtomicUniformCudaHipBuiltIn, unsigned long int, THierarchy>
+        {
+            //-----------------------------------------------------------------------------
+            // Casts addr, compare and value to the built-in overload matching the width of unsigned long int.
+            __device__ static auto atomicOp(
+                AtomicUniformCudaHipBuiltIn const&,
+                unsigned long int* const addr,
+                unsigned long int const& compare,
+                unsigned long int const& value) -> unsigned long int
+            {
+#    if UINT_MAX == ULONG_MAX // LLP64
+                return ::atomicCAS(
+                    reinterpret_cast<unsigned int*>(addr),
+                    static_cast<unsigned int>(compare),
+                    static_cast<unsigned int>(value));
+#    else // ULONG_MAX == ULLONG_MAX LP64
+                return ::atomicCAS(
+                    reinterpret_cast<unsigned long long int*>(addr),
+                    static_cast<unsigned long long int>(compare),
+                    static_cast<unsigned long long int>(value));
+#    endif
+            }
+        };
+        //-----------------------------------------------------------------------------
+        //! The CUDA/HIP built-in atomic compare-and-swap for unsigned long long int: forwards to ::atomicCAS.
+        template<typename THierarchy>
+        struct AtomicOp<AtomicCas, AtomicUniformCudaHipBuiltIn, unsigned long long int, THierarchy>
+        {
+            //-----------------------------------------------------------------------------
+            __device__ static auto atomicOp(
+                AtomicUniformCudaHipBuiltIn const&,
+                unsigned long long int* const addr,
+                unsigned long long int const& compare,
+                unsigned long long int const& value) -> unsigned long long int
+            {
+                return ::atomicCAS(addr, compare, value);
+            }
+        };
+
+        //#############################################################################
+        //! Generic CUDA/HIP built-in atomic fallback: both overloads static_assert that the operation is not supported.
+        template<typename TOp, typename T, typename THierarchy>
+        struct AtomicOp<TOp, AtomicUniformCudaHipBuiltIn, T, THierarchy>
+        {
+            //-----------------------------------------------------------------------------
+            __device__ static auto atomicOp(AtomicUniformCudaHipBuiltIn const& atomic, T* const addr, T const& value)
+                -> T
+            {
+                alpaka::ignore_unused(atomic);
+                alpaka::ignore_unused(addr);
+                alpaka::ignore_unused(value);
+                static_assert(
+                    meta::DependentFalseType<THierarchy>::value,
+                    "atomicOp<TOp, AtomicUniformCudaHipBuiltIn, T>(atomic, addr, value) is not supported!");
+
+                return T();
+            }
+            //-----------------------------------------------------------------------------
+            __device__ static auto atomicOp(
+                AtomicUniformCudaHipBuiltIn const& atomic,
+                T* const addr,
+                T const& compare,
+                T const& value) -> T
+            {
+                alpaka::ignore_unused(atomic);
+                alpaka::ignore_unused(addr);
+                alpaka::ignore_unused(compare);
+                alpaka::ignore_unused(value);
+                static_assert(
+                    meta::DependentFalseType<THierarchy>::value,
+                    "atomicOp<TOp, AtomicUniformCudaHipBuiltIn, T>(atomic, addr, compare, value) is not supported!");
+
+                return T();
+            }
+        };
+    } // namespace traits
+} // namespace alpaka
+
+#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/atomic/Op.hpp b/thirdParty/cupla/alpaka/include/alpaka/atomic/Op.hpp
index b377cc55b8..dc57c09f0c 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/atomic/Op.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/atomic/Op.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Benjamin Worpitz
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -9,253 +9,215 @@
 
 #pragma once
 
-#include <alpaka/vec/Vec.hpp>
+#include <alpaka/core/BoostPredef.hpp>
+#include <alpaka/core/Common.hpp>
 
 #include <algorithm>
 
 namespace alpaka
 {
-    namespace atomic
+    //#############################################################################
+    //! The addition function object.
+    struct AtomicAdd
     {
         //-----------------------------------------------------------------------------
-        //! Defines operation functors.
-        namespace op
+        //! \return The old value of addr.
+        ALPAKA_NO_HOST_ACC_WARNING
+        template<typename T>
+        ALPAKA_FN_HOST_ACC auto operator()(T* const addr, T const& value) const -> T
         {
-            //#############################################################################
-            //! The addition function object.
-            struct Add
-            {
-                //-----------------------------------------------------------------------------
-                //! \return The old value of addr.
-                ALPAKA_NO_HOST_ACC_WARNING
-                template<
-                    typename T>
-                ALPAKA_FN_HOST_ACC auto operator()(
-                    T * const addr,
-                    T const & value) const
-                -> T
-                {
-                    auto const old(*addr);
-                    auto & ref(*addr);
-                    ref += value;
-                    return old;
-                }
-            };
-            //#############################################################################
-            //! The subtraction function object.
-            struct Sub
-            {
-                //-----------------------------------------------------------------------------
-                //! \return The old value of addr.
-                ALPAKA_NO_HOST_ACC_WARNING
-                template<
-                    typename T>
-                ALPAKA_FN_HOST_ACC auto operator()(
-                    T * const addr,
-                    T const & value) const
-                -> T
-                {
-                    auto const old(*addr);
-                    auto & ref(*addr);
-                    ref -= value;
-                    return old;
-                }
-            };
-            //#############################################################################
-            //! The minimum function object.
-            struct Min
-            {
-                //-----------------------------------------------------------------------------
-                //! \return The old value of addr.
-                ALPAKA_NO_HOST_ACC_WARNING
-                template<
-                    typename T>
-                ALPAKA_FN_HOST_ACC auto operator()(
-                    T * const addr,
-                    T const & value) const
-                -> T
-                {
-                    auto const old(*addr);
-                    auto & ref(*addr);
-                    ref = std::min(ref, value);
-                    return old;
-                }
-            };
-            //#############################################################################
-            //! The maximum function object.
-            struct Max
-            {
-                //-----------------------------------------------------------------------------
-                //! \return The old value of addr.
-                ALPAKA_NO_HOST_ACC_WARNING
-                template<
-                    typename T>
-                ALPAKA_FN_HOST_ACC auto operator()(
-                    T * const addr,
-                    T const & value) const
-                -> T
-                {
-                    auto const old(*addr);
-                    auto & ref(*addr);
-                    ref = std::max(ref, value);
-                    return old;
-                }
-            };
-            //#############################################################################
-            //! The exchange function object.
-            struct Exch
-            {
-                //-----------------------------------------------------------------------------
-                //! \return The old value of addr.
-                ALPAKA_NO_HOST_ACC_WARNING
-                template<
-                    typename T>
-                ALPAKA_FN_HOST_ACC auto operator()(
-                    T * const addr,
-                    T const & value) const
-                -> T
-                {
-                    auto const old(*addr);
-                    auto & ref(*addr);
-                    ref = value;
-                    return old;
-                }
-            };
-            //#############################################################################
-            //! The increment function object.
-            struct Inc
-            {
-                //-----------------------------------------------------------------------------
-                //! Increments up to value, then reset to 0.
-                //!
-                //! \return The old value of addr.
-                ALPAKA_NO_HOST_ACC_WARNING
-                template<
-                    typename T>
-                ALPAKA_FN_HOST_ACC auto operator()(
-                    T * const addr,
-                    T const & value) const
-                -> T
-                {
-                    auto const old(*addr);
-                    auto & ref(*addr);
-                    ref = ((old >= value) ? 0 : old + 1);
-                    return old;
-                }
-            };
-            //#############################################################################
-            //! The decrement function object.
-            struct Dec
-            {
-                //-----------------------------------------------------------------------------
-                //! Decrement down to 0, then reset to value.
-                //!
-                //! \return The old value of addr.
-                ALPAKA_NO_HOST_ACC_WARNING
-                template<
-                    typename T>
-                ALPAKA_FN_HOST_ACC auto operator()(
-                    T * const addr,
-                    T const & value) const
-                -> T
-                {
-                    auto const old(*addr);
-                    auto & ref(*addr);
-                    ref = (((old == 0) || (old > value)) ? value : (old - 1));
-                    return old;
-                }
-            };
-            //#############################################################################
-            //! The and function object.
-            struct And
-            {
-                //-----------------------------------------------------------------------------
-                //! \return The old value of addr.
-                ALPAKA_NO_HOST_ACC_WARNING
-                template<
-                    typename T>
-                ALPAKA_FN_HOST_ACC auto operator()(
-                    T * const addr,
-                    T const & value) const
-                -> T
-                {
-                    auto const old(*addr);
-                    auto & ref(*addr);
-                    ref &= value;
-                    return old;
-                }
-            };
-            //#############################################################################
-            //! The or function object.
-            struct Or
-            {
-                //-----------------------------------------------------------------------------
-                //! \return The old value of addr.
-                ALPAKA_NO_HOST_ACC_WARNING
-                template<
-                    typename T>
-                ALPAKA_FN_HOST_ACC auto operator()(
-                    T * const addr,
-                    T const & value) const
-                -> T
-                {
-                    auto const old(*addr);
-                    auto & ref(*addr);
-                    ref |= value;
-                    return old;
-                }
-            };
-            //#############################################################################
-            //! The exclusive or function object.
-            struct Xor
-            {
-                //-----------------------------------------------------------------------------
-                //! \return The old value of addr.
-                ALPAKA_NO_HOST_ACC_WARNING
-                template<
-                    typename T>
-                ALPAKA_FN_HOST_ACC auto operator()(
-                    T * const addr,
-                    T const & value) const
-                -> T
-                {
-                    auto const old(*addr);
-                    auto & ref(*addr);
-                    ref ^= value;
-                    return old;
-                }
-            };
-            //#############################################################################
-            //! The compare and swap function object.
-            struct Cas
-            {
-                //-----------------------------------------------------------------------------
-                //! \return The old value of addr.
-                ALPAKA_NO_HOST_ACC_WARNING
-                template<
-                    typename T>
-                ALPAKA_FN_HOST_ACC auto operator()(
-                    T * addr,
-                    T const & compare,
-                    T const & value) const
-                -> T
-                {
-                    auto const old(*addr);
-                    auto & ref(*addr);
+            auto const old(*addr);
+            auto& ref(*addr);
+#if BOOST_COMP_GNUC
+#    pragma GCC diagnostic push
+#    pragma GCC diagnostic ignored "-Wconversion"
+#endif
+            ref += value;
+            return old;
+#if BOOST_COMP_GNUC
+#    pragma GCC diagnostic pop
+#endif
+        }
+    };
+    //#############################################################################
+    //! The subtraction function object.
+    struct AtomicSub
+    {
+        //-----------------------------------------------------------------------------
+        //! \return The old value of addr.
+        ALPAKA_NO_HOST_ACC_WARNING
+        template<typename T>
+        ALPAKA_FN_HOST_ACC auto operator()(T* const addr, T const& value) const -> T
+        {
+            auto const old(*addr);
+            auto& ref(*addr);
+#if BOOST_COMP_GNUC
+#    pragma GCC diagnostic push
+#    pragma GCC diagnostic ignored "-Wconversion"
+#endif
+            ref -= value;
+#if BOOST_COMP_GNUC
+#    pragma GCC diagnostic pop
+#endif
+            return old;
+        }
+    };
+    //#############################################################################
+    //! The minimum function object.
+    struct AtomicMin
+    {
+        //-----------------------------------------------------------------------------
+        //! \return The old value of addr.
+        ALPAKA_NO_HOST_ACC_WARNING
+        template<typename T>
+        ALPAKA_FN_HOST_ACC auto operator()(T* const addr, T const& value) const -> T
+        {
+            auto const old(*addr);
+            auto& ref(*addr);
+            ref = std::min(ref, value);
+            return old;
+        }
+    };
+    //#############################################################################
+    //! The maximum function object.
+    struct AtomicMax
+    {
+        //-----------------------------------------------------------------------------
+        //! \return The old value of addr.
+        ALPAKA_NO_HOST_ACC_WARNING
+        template<typename T>
+        ALPAKA_FN_HOST_ACC auto operator()(T* const addr, T const& value) const -> T
+        {
+            auto const old(*addr);
+            auto& ref(*addr);
+            ref = std::max(ref, value);
+            return old;
+        }
+    };
+    //#############################################################################
+    //! The exchange function object.
+    struct AtomicExch
+    {
+        //-----------------------------------------------------------------------------
+        //! \return The old value of addr.
+        ALPAKA_NO_HOST_ACC_WARNING
+        template<typename T>
+        ALPAKA_FN_HOST_ACC auto operator()(T* const addr, T const& value) const -> T
+        {
+            auto const old(*addr);
+            auto& ref(*addr);
+            ref = value;
+            return old;
+        }
+    };
+    //#############################################################################
+    //! The increment function object.
+    struct AtomicInc
+    {
+        //-----------------------------------------------------------------------------
+        //! Increments the stored value; once it has reached value (old >= value) it is reset to 0.
+        //!
+        //! \return The old value of addr.
+        ALPAKA_NO_HOST_ACC_WARNING
+        template<typename T>
+        ALPAKA_FN_HOST_ACC auto operator()(T* const addr, T const& value) const -> T
+        {
+            auto const old(*addr);
+            auto& ref(*addr);
+            ref = ((old >= value) ? 0 : static_cast<T>(old + 1));
+            return old;
+        }
+    };
+    //#############################################################################
+    //! The decrement function object.
+    struct AtomicDec
+    {
+        //-----------------------------------------------------------------------------
+        //! Decrements down to 0, then resets to value (also when the old value is greater than value).
+        //!
+        //! \return The old value of addr.
+        ALPAKA_NO_HOST_ACC_WARNING
+        template<typename T>
+        ALPAKA_FN_HOST_ACC auto operator()(T* const addr, T const& value) const -> T
+        {
+            auto const old(*addr);
+            auto& ref(*addr);
+            ref = (((old == 0) || (old > value)) ? value : static_cast<T>(old - 1));
+            return old;
+        }
+    };
+    //#############################################################################
+    //! The and function object.
+    struct AtomicAnd
+    {
+        //-----------------------------------------------------------------------------
+        //! \return The old value of addr.
+        ALPAKA_NO_HOST_ACC_WARNING
+        template<typename T>
+        ALPAKA_FN_HOST_ACC auto operator()(T* const addr, T const& value) const -> T
+        {
+            auto const old(*addr);
+            auto& ref(*addr);
+            ref &= value;
+            return old;
+        }
+    };
+    //#############################################################################
+    //! The or function object.
+    struct AtomicOr
+    {
+        //-----------------------------------------------------------------------------
+        //! \return The old value of addr.
+        ALPAKA_NO_HOST_ACC_WARNING
+        template<typename T>
+        ALPAKA_FN_HOST_ACC auto operator()(T* const addr, T const& value) const -> T
+        {
+            auto const old(*addr);
+            auto& ref(*addr);
+            ref |= value;
+            return old;
+        }
+    };
+    //#############################################################################
+    //! The exclusive or function object.
+    struct AtomicXor
+    {
+        //-----------------------------------------------------------------------------
+        //! \return The old value of addr.
+        ALPAKA_NO_HOST_ACC_WARNING
+        template<typename T>
+        ALPAKA_FN_HOST_ACC auto operator()(T* const addr, T const& value) const -> T
+        {
+            auto const old(*addr);
+            auto& ref(*addr);
+            ref ^= value;
+            return old;
+        }
+    };
+    //#############################################################################
+    //! The compare and swap function object.
+    struct AtomicCas
+    {
+        //-----------------------------------------------------------------------------
+        //! \return The old value of addr.
+        ALPAKA_NO_HOST_ACC_WARNING
+        template<typename T>
+        ALPAKA_FN_HOST_ACC auto operator()(T* addr, T const& compare, T const& value) const -> T
+        {
+            auto const old(*addr);
+            auto& ref(*addr);
 
 // gcc-7.4.0 assumes for an optimization that a signed overflow does not occur here.
 // That's fine, so ignore that warning.
 #if BOOST_COMP_GNUC && (BOOST_COMP_GNUC == BOOST_VERSION_NUMBER(7, 4, 0))
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wstrict-overflow"
+#    pragma GCC diagnostic push
+#    pragma GCC diagnostic ignored "-Wstrict-overflow"
 #endif
-                    ref = ((old == compare) ? value : old);
+            ref = ((old == compare) ? value : old);
 #if BOOST_COMP_GNUC && (BOOST_COMP_GNUC == BOOST_VERSION_NUMBER(7, 4, 0))
-#pragma GCC diagnostic pop
+#    pragma GCC diagnostic pop
 #endif
-                    return old;
-                }
-            };
+            return old;
         }
-    }
-}
+    };
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/include/alpaka/atomic/Traits.hpp b/thirdParty/cupla/alpaka/include/alpaka/atomic/Traits.hpp
index ef5a198eab..6d0c847317 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/atomic/Traits.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/atomic/Traits.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Benjamin Worpitz, René Widera
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -9,142 +9,314 @@
 
 #pragma once
 
-#include <alpaka/core/Positioning.hpp>
+#include <alpaka/atomic/Op.hpp>
 #include <alpaka/core/Common.hpp>
 #include <alpaka/core/Concepts.hpp>
+#include <alpaka/core/Positioning.hpp>
 
 #include <type_traits>
 
 namespace alpaka
 {
-    //-----------------------------------------------------------------------------
-    //! The atomic operation traits specifics.
-    namespace atomic
+    struct ConceptAtomicGrids
+    {
+    };
+    struct ConceptAtomicBlocks
+    {
+    };
+    struct ConceptAtomicThreads
     {
-        struct ConceptAtomicGrids;
-        struct ConceptAtomicBlocks;
-        struct ConceptAtomicThreads;
+    };
 
-        namespace detail
-        {
-            template<
-                typename THierarchy
-            >
-            struct AtomicHierarchyConceptType;
-
-            template<>
-            struct AtomicHierarchyConceptType<
-                hierarchy::Threads>
-            {
-                using type = ConceptAtomicThreads;
-            };
-
-            template<>
-            struct AtomicHierarchyConceptType<
-                hierarchy::Blocks>
-            {
-                using type = ConceptAtomicBlocks;
-            };
-
-            template<>
-            struct AtomicHierarchyConceptType<
-                hierarchy::Grids>
-            {
-                using type = ConceptAtomicGrids;
-            };
-        }
-
-        template<
-            typename THierarchy
-        >
-        using AtomicHierarchyConcept = typename detail::AtomicHierarchyConceptType<THierarchy>::type;
-
-        //-----------------------------------------------------------------------------
-        //! The atomic operation traits.
-        namespace traits
+    namespace detail
+    {
+        template<typename THierarchy>
+        struct AtomicHierarchyConceptType;
+
+        template<>
+        struct AtomicHierarchyConceptType<hierarchy::Threads>
         {
-            //#############################################################################
-            //! The atomic operation trait.
-            template<
-                typename TOp,
-                typename TAtomic,
-                typename T,
-                typename THierarchy,
-                typename TSfinae = void>
-            struct AtomicOp;
-        }
-
-        //-----------------------------------------------------------------------------
-        //! Executes the given operation atomically.
-        //!
-        //! \tparam TOp The operation type.
-        //! \tparam T The value type.
-        //! \tparam TAtomic The atomic implementation type.
-        //! \param addr The value to change atomically.
-        //! \param value The value used in the atomic operation.
-        //! \param atomic The atomic implementation.
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            typename TOp,
-            typename TAtomic,
-            typename T,
-            typename THierarchy = hierarchy::Grids>
-        ALPAKA_FN_HOST_ACC auto atomicOp(
-            TAtomic const & atomic,
-            T * const addr,
-            T const & value,
-            THierarchy const & = THierarchy())
-        -> T
+            using type = ConceptAtomicThreads;
+        };
+
+        template<>
+        struct AtomicHierarchyConceptType<hierarchy::Blocks>
         {
-            using ImplementationBase = typename concepts::ImplementationBase<AtomicHierarchyConcept<THierarchy>, TAtomic>;
-            return
-                traits::AtomicOp<
-                    TOp,
-                    ImplementationBase,
-                    T,
-                    THierarchy>
-                ::atomicOp(
-                    atomic,
-                    addr,
-                    value);
-        }
-
-        //-----------------------------------------------------------------------------
-        //! Executes the given operation atomically.
-        //!
-        //! \tparam TOp The operation type.
-        //! \tparam TAtomic The atomic implementation type.
-        //! \tparam T The value type.
-        //! \param atomic The atomic implementation.
-        //! \param addr The value to change atomically.
-        //! \param compare The comparison value used in the atomic operation.
-        //! \param value The value used in the atomic operation.
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            typename TOp,
-            typename TAtomic,
-            typename T,
-            typename THierarchy = hierarchy::Grids>
-        ALPAKA_FN_HOST_ACC auto atomicOp(
-            TAtomic const & atomic,
-            T * const addr,
-            T const & compare,
-            T const & value,
-            THierarchy const & = THierarchy())
-        -> T
+            using type = ConceptAtomicBlocks;
+        };
+
+        template<>
+        struct AtomicHierarchyConceptType<hierarchy::Grids>
         {
-            using ImplementationBase = typename concepts::ImplementationBase<AtomicHierarchyConcept<THierarchy>, TAtomic>;
-            return
-                traits::AtomicOp<
-                    TOp,
-                    ImplementationBase,
-                    T,
-                    THierarchy>
-                ::atomicOp(
-                    atomic,
-                    addr,
-                    compare,
-                    value);
-        }
+            using type = ConceptAtomicGrids;
+        };
+    } // namespace detail
+
+    template<typename THierarchy>
+    using AtomicHierarchyConcept = typename detail::AtomicHierarchyConceptType<THierarchy>::type;
+
+    //-----------------------------------------------------------------------------
+    //! The atomic operation traits.
+    namespace traits
+    {
+        //#############################################################################
+        //! The atomic operation trait.
+        template<typename TOp, typename TAtomic, typename T, typename THierarchy, typename TSfinae = void>
+        struct AtomicOp;
+    } // namespace traits
+
+    //-----------------------------------------------------------------------------
+    //! Executes the given operation atomically.
+    //!
+    //! \tparam TOp The operation type.
+    //! \tparam TAtomic The atomic implementation type.
+    //! \tparam T The value type.
+    //! \param atomic The atomic implementation.
+    //! \param addr The value to change atomically.
+    //! \param value The value used in the atomic operation.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename TOp, typename TAtomic, typename T, typename THierarchy = hierarchy::Grids>
+    ALPAKA_FN_HOST_ACC auto atomicOp(
+        TAtomic const& atomic,
+        T* const addr,
+        T const& value,
+        THierarchy const& = THierarchy()) -> T
+    {
+        using ImplementationBase = typename concepts::ImplementationBase<AtomicHierarchyConcept<THierarchy>, TAtomic>;
+        return traits::AtomicOp<TOp, ImplementationBase, T, THierarchy>::atomicOp(atomic, addr, value);
+    }
+
+    //-----------------------------------------------------------------------------
+    //! Executes the given operation atomically.
+    //!
+    //! \tparam TOp The operation type.
+    //! \tparam TAtomic The atomic implementation type.
+    //! \tparam T The value type.
+    //! \param atomic The atomic implementation.
+    //! \param addr The value to change atomically.
+    //! \param compare The comparison value used in the atomic operation.
+    //! \param value The value used in the atomic operation.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename TOp, typename TAtomic, typename T, typename THierarchy = hierarchy::Grids>
+    ALPAKA_FN_HOST_ACC auto atomicOp(
+        TAtomic const& atomic,
+        T* const addr,
+        T const& compare,
+        T const& value,
+        THierarchy const& = THierarchy()) -> T
+    {
+        using ImplementationBase = typename concepts::ImplementationBase<AtomicHierarchyConcept<THierarchy>, TAtomic>;
+        return traits::AtomicOp<TOp, ImplementationBase, T, THierarchy>::atomicOp(atomic, addr, compare, value);
+    }
+
+    //-----------------------------------------------------------------------------
+    //! Executes an atomic add operation.
+    //!
+    //! \tparam TAtomic The atomic implementation type.
+    //! \tparam T The value type.
+    //! \param atomic The atomic implementation.
+    //! \param addr The value to change atomically.
+    //! \param value The value used in the atomic operation.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename TAtomic, typename T, typename THierarchy = hierarchy::Grids>
+    ALPAKA_FN_HOST_ACC auto atomicAdd(
+        TAtomic const& atomic,
+        T* const addr,
+        T const& value,
+        THierarchy const& hier = THierarchy()) -> T
+    {
+        return atomicOp<AtomicAdd>(atomic, addr, value, hier);
+    }
+
+    //-----------------------------------------------------------------------------
+    //! Executes an atomic sub operation.
+    //!
+    //! \tparam TAtomic The atomic implementation type.
+    //! \tparam T The value type.
+    //! \param atomic The atomic implementation.
+    //! \param addr The value to change atomically.
+    //! \param value The value used in the atomic operation.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename TAtomic, typename T, typename THierarchy = hierarchy::Grids>
+    ALPAKA_FN_HOST_ACC auto atomicSub(
+        TAtomic const& atomic,
+        T* const addr,
+        T const& value,
+        THierarchy const& hier = THierarchy()) -> T
+    {
+        return atomicOp<AtomicSub>(atomic, addr, value, hier);
+    }
+
+    //-----------------------------------------------------------------------------
+    //! Executes an atomic min operation.
+    //!
+    //! \tparam TAtomic The atomic implementation type.
+    //! \tparam T The value type.
+    //! \param atomic The atomic implementation.
+    //! \param addr The value to change atomically.
+    //! \param value The value used in the atomic operation.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename TAtomic, typename T, typename THierarchy = hierarchy::Grids>
+    ALPAKA_FN_HOST_ACC auto atomicMin(
+        TAtomic const& atomic,
+        T* const addr,
+        T const& value,
+        THierarchy const& hier = THierarchy()) -> T
+    {
+        return atomicOp<AtomicMin>(atomic, addr, value, hier);
+    }
+
+    //-----------------------------------------------------------------------------
+    //! Executes an atomic max operation.
+    //!
+    //! \tparam TAtomic The atomic implementation type.
+    //! \tparam T The value type.
+    //! \param atomic The atomic implementation.
+    //! \param addr The value to change atomically.
+    //! \param value The value used in the atomic operation.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename TAtomic, typename T, typename THierarchy = hierarchy::Grids>
+    ALPAKA_FN_HOST_ACC auto atomicMax(
+        TAtomic const& atomic,
+        T* const addr,
+        T const& value,
+        THierarchy const& hier = THierarchy()) -> T
+    {
+        return atomicOp<AtomicMax>(atomic, addr, value, hier);
+    }
+
+    //-----------------------------------------------------------------------------
+    //! Executes an atomic exchange operation.
+    //!
+    //! \tparam TAtomic The atomic implementation type.
+    //! \tparam T The value type.
+    //! \param atomic The atomic implementation.
+    //! \param addr The value to change atomically.
+    //! \param value The value used in the atomic operation.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename TAtomic, typename T, typename THierarchy = hierarchy::Grids>
+    ALPAKA_FN_HOST_ACC auto atomicExch(
+        TAtomic const& atomic,
+        T* const addr,
+        T const& value,
+        THierarchy const& hier = THierarchy()) -> T
+    {
+        return atomicOp<AtomicExch>(atomic, addr, value, hier);
+    }
+
+    //-----------------------------------------------------------------------------
+    //! Executes an atomic increment operation.
+    //!
+    //! \tparam TAtomic The atomic implementation type.
+    //! \tparam T The value type.
+    //! \param atomic The atomic implementation.
+    //! \param addr The value to change atomically.
+    //! \param value The value used in the atomic operation.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename TAtomic, typename T, typename THierarchy = hierarchy::Grids>
+    ALPAKA_FN_HOST_ACC auto atomicInc(
+        TAtomic const& atomic,
+        T* const addr,
+        T const& value,
+        THierarchy const& hier = THierarchy()) -> T
+    {
+        return atomicOp<AtomicInc>(atomic, addr, value, hier);
+    }
+
+    //-----------------------------------------------------------------------------
+    //! Executes an atomic decrement operation.
+    //!
+    //! \tparam TAtomic The atomic implementation type.
+    //! \tparam T The value type.
+    //! \param atomic The atomic implementation.
+    //! \param addr The value to change atomically.
+    //! \param value The value used in the atomic operation.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename TAtomic, typename T, typename THierarchy = hierarchy::Grids>
+    ALPAKA_FN_HOST_ACC auto atomicDec(
+        TAtomic const& atomic,
+        T* const addr,
+        T const& value,
+        THierarchy const& hier = THierarchy()) -> T
+    {
+        return atomicOp<AtomicDec>(atomic, addr, value, hier);
+    }
+
+    //-----------------------------------------------------------------------------
+    //! Executes an atomic and operation.
+    //!
+    //! \tparam TAtomic The atomic implementation type.
+    //! \tparam T The value type.
+    //! \param atomic The atomic implementation.
+    //! \param addr The value to change atomically.
+    //! \param value The value used in the atomic operation.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename TAtomic, typename T, typename THierarchy = hierarchy::Grids>
+    ALPAKA_FN_HOST_ACC auto atomicAnd(
+        TAtomic const& atomic,
+        T* const addr,
+        T const& value,
+        THierarchy const& hier = THierarchy()) -> T
+    {
+        return atomicOp<AtomicAnd>(atomic, addr, value, hier);
+    }
+
+    //-----------------------------------------------------------------------------
+    //! Executes an atomic or operation.
+    //!
+    //! \tparam TAtomic The atomic implementation type.
+    //! \tparam T The value type.
+    //! \param atomic The atomic implementation.
+    //! \param addr The value to change atomically.
+    //! \param value The value used in the atomic operation.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename TAtomic, typename T, typename THierarchy = hierarchy::Grids>
+    ALPAKA_FN_HOST_ACC auto atomicOr(
+        TAtomic const& atomic,
+        T* const addr,
+        T const& value,
+        THierarchy const& hier = THierarchy()) -> T
+    {
+        return atomicOp<AtomicOr>(atomic, addr, value, hier);
+    }
+
+    //-----------------------------------------------------------------------------
+    //! Executes an atomic xor operation.
+    //!
+    //! \tparam TAtomic The atomic implementation type.
+    //! \tparam T The value type.
+    //! \param atomic The atomic implementation.
+    //! \param addr The value to change atomically.
+    //! \param value The value used in the atomic operation.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename TAtomic, typename T, typename THierarchy = hierarchy::Grids>
+    ALPAKA_FN_HOST_ACC auto atomicXor(
+        TAtomic const& atomic,
+        T* const addr,
+        T const& value,
+        THierarchy const& hier = THierarchy()) -> T
+    {
+        return atomicOp<AtomicXor>(atomic, addr, value, hier);
+    }
+
+    //-----------------------------------------------------------------------------
+    //! Executes an atomic compare-and-swap operation.
+    //!
+    //! \tparam TAtomic The atomic implementation type.
+    //! \tparam T The value type.
+    //! \param atomic The atomic implementation.
+    //! \param addr The value to change atomically.
+    //! \param compare The comparison value used in the atomic operation.
+    //! \param value The value used in the atomic operation.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename TAtomic, typename T, typename THierarchy = hierarchy::Grids>
+    ALPAKA_FN_HOST_ACC auto atomicCas(
+        TAtomic const& atomic,
+        T* const addr,
+        T const& compare,
+        T const& value,
+        THierarchy const& hier = THierarchy()) -> T
+    {
+        return atomicOp<AtomicCas>(atomic, addr, compare, value, hier);
     }
-}
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/include/alpaka/block/shared/dyn/BlockSharedMemDynAlignedAlloc.hpp b/thirdParty/cupla/alpaka/include/alpaka/block/shared/dyn/BlockSharedMemDynAlignedAlloc.hpp
new file mode 100644
index 0000000000..dc080db846
--- /dev/null
+++ b/thirdParty/cupla/alpaka/include/alpaka/block/shared/dyn/BlockSharedMemDynAlignedAlloc.hpp
@@ -0,0 +1,79 @@
+/* Copyright 2019 Benjamin Worpitz, Matthias Werner, René Widera
+ *
+ * This file is part of alpaka.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#pragma once
+
+#include <alpaka/block/shared/dyn/Traits.hpp>
+#include <alpaka/core/AlignedAlloc.hpp>
+#include <alpaka/core/Common.hpp>
+#include <alpaka/core/Vectorize.hpp>
+
+#include <memory>
+#include <vector>
+
+namespace alpaka
+{
+    //#############################################################################
+    //! The block shared dynamic memory allocator without synchronization.
+    class BlockSharedMemDynAlignedAlloc
+        : public concepts::Implements<ConceptBlockSharedDyn, BlockSharedMemDynAlignedAlloc>
+    {
+    public:
+        //-----------------------------------------------------------------------------
+        BlockSharedMemDynAlignedAlloc(std::size_t const& blockSharedMemDynSizeBytes)
+        {
+            if(blockSharedMemDynSizeBytes > 0u)
+            {
+                m_blockSharedMemDyn.reset(reinterpret_cast<uint8_t*>(
+                    core::alignedAlloc(core::vectorization::defaultAlignment, blockSharedMemDynSizeBytes)));
+            }
+        }
+        //-----------------------------------------------------------------------------
+        BlockSharedMemDynAlignedAlloc(BlockSharedMemDynAlignedAlloc const&) = delete;
+        //-----------------------------------------------------------------------------
+        BlockSharedMemDynAlignedAlloc(BlockSharedMemDynAlignedAlloc&&) = delete;
+        //-----------------------------------------------------------------------------
+        auto operator=(BlockSharedMemDynAlignedAlloc const&) -> BlockSharedMemDynAlignedAlloc& = delete;
+        //-----------------------------------------------------------------------------
+        auto operator=(BlockSharedMemDynAlignedAlloc&&) -> BlockSharedMemDynAlignedAlloc& = delete;
+        //-----------------------------------------------------------------------------
+        /*virtual*/ ~BlockSharedMemDynAlignedAlloc() = default;
+
+    public:
+        std::unique_ptr<uint8_t,
+                        core::AlignedDelete> mutable m_blockSharedMemDyn; //!< Block shared dynamic memory.
+    };
+
+    namespace traits
+    {
+#if BOOST_COMP_GNUC
+#    pragma GCC diagnostic push
+#    pragma GCC diagnostic ignored                                                                                    \
+        "-Wcast-align" // "cast from 'unsigned char*' to 'unsigned int*' increases required alignment of target type"
+#endif
+        //#############################################################################
+        template<typename T>
+        struct GetDynSharedMem<T, BlockSharedMemDynAlignedAlloc>
+        {
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto getMem(BlockSharedMemDynAlignedAlloc const& blockSharedMemDyn) -> T*
+            {
+                static_assert(
+                    core::vectorization::defaultAlignment >= alignof(T),
+                    "Unable to get block shared dynamic memory for types with alignment higher than "
+                    "defaultAlignment!");
+
+                return reinterpret_cast<T*>(blockSharedMemDyn.m_blockSharedMemDyn.get());
+            }
+        };
+#if BOOST_COMP_GNUC
+#    pragma GCC diagnostic pop
+#endif
+    } // namespace traits
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/include/alpaka/block/shared/dyn/BlockSharedMemDynBoostAlignedAlloc.hpp b/thirdParty/cupla/alpaka/include/alpaka/block/shared/dyn/BlockSharedMemDynBoostAlignedAlloc.hpp
deleted file mode 100644
index 9206f1752c..0000000000
--- a/thirdParty/cupla/alpaka/include/alpaka/block/shared/dyn/BlockSharedMemDynBoostAlignedAlloc.hpp
+++ /dev/null
@@ -1,96 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz, Matthias Werner, René Widera
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#include <alpaka/core/Vectorize.hpp>
-#include <alpaka/block/shared/dyn/Traits.hpp>
-
-#include <alpaka/core/Common.hpp>
-
-#include <boost/align.hpp>
-
-#include <vector>
-#include <memory>
-
-namespace alpaka
-{
-    namespace block
-    {
-        namespace shared
-        {
-            namespace dyn
-            {
-                //#############################################################################
-                //! The block shared dynamic memory allocator without synchronization.
-                class BlockSharedMemDynBoostAlignedAlloc : public concepts::Implements<ConceptBlockSharedDyn, BlockSharedMemDynBoostAlignedAlloc>
-                {
-                public:
-                    //-----------------------------------------------------------------------------
-                    BlockSharedMemDynBoostAlignedAlloc(
-                        std::size_t const & blockSharedMemDynSizeBytes)
-                    {
-                        if(blockSharedMemDynSizeBytes > 0u)
-                        {
-                            m_blockSharedMemDyn.reset(
-                                reinterpret_cast<uint8_t *>(
-                                    boost::alignment::aligned_alloc(core::vectorization::defaultAlignment, blockSharedMemDynSizeBytes)));
-                        }
-                    }
-                    //-----------------------------------------------------------------------------
-                    BlockSharedMemDynBoostAlignedAlloc(BlockSharedMemDynBoostAlignedAlloc const &) = delete;
-                    //-----------------------------------------------------------------------------
-                    BlockSharedMemDynBoostAlignedAlloc(BlockSharedMemDynBoostAlignedAlloc &&) = delete;
-                    //-----------------------------------------------------------------------------
-                    auto operator=(BlockSharedMemDynBoostAlignedAlloc const &) -> BlockSharedMemDynBoostAlignedAlloc & = delete;
-                    //-----------------------------------------------------------------------------
-                    auto operator=(BlockSharedMemDynBoostAlignedAlloc &&) -> BlockSharedMemDynBoostAlignedAlloc & = delete;
-                    //-----------------------------------------------------------------------------
-                    /*virtual*/ ~BlockSharedMemDynBoostAlignedAlloc() = default;
-
-                public:
-                    std::unique_ptr<
-                        uint8_t,
-                        boost::alignment::aligned_delete> mutable
-                            m_blockSharedMemDyn;  //!< Block shared dynamic memory.
-                };
-
-                namespace traits
-                {
-#if BOOST_COMP_GNUC
-    #pragma GCC diagnostic push
-    #pragma GCC diagnostic ignored "-Wcast-align" // "cast from 'unsigned char*' to 'unsigned int*' increases required alignment of target type"
-#endif
-                    //#############################################################################
-                    template<
-                        typename T>
-                    struct GetMem<
-                        T,
-                        BlockSharedMemDynBoostAlignedAlloc>
-                    {
-                        //-----------------------------------------------------------------------------
-                        ALPAKA_FN_HOST static auto getMem(
-                            block::shared::dyn::BlockSharedMemDynBoostAlignedAlloc const & blockSharedMemDyn)
-                        -> T *
-                        {
-                            static_assert(
-                                core::vectorization::defaultAlignment >= alignof(T),
-                                "Unable to get block shared dynamic memory for types with alignment higher than defaultAlignment!");
-
-                            return reinterpret_cast<T*>(blockSharedMemDyn.m_blockSharedMemDyn.get());
-                        }
-                    };
-#if BOOST_COMP_GNUC
-    #pragma GCC diagnostic pop
-#endif
-                }
-            }
-        }
-    }
-}
diff --git a/thirdParty/cupla/alpaka/include/alpaka/block/shared/dyn/BlockSharedMemDynCudaBuiltIn.hpp b/thirdParty/cupla/alpaka/include/alpaka/block/shared/dyn/BlockSharedMemDynCudaBuiltIn.hpp
deleted file mode 100644
index 2900fc27dd..0000000000
--- a/thirdParty/cupla/alpaka/include/alpaka/block/shared/dyn/BlockSharedMemDynCudaBuiltIn.hpp
+++ /dev/null
@@ -1,79 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz, René Widera
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_CUDA
-    #error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
-#endif
-
-#include <alpaka/block/shared/dyn/Traits.hpp>
-
-#include <type_traits>
-
-namespace alpaka
-{
-    namespace block
-    {
-        namespace shared
-        {
-            namespace dyn
-            {
-                //#############################################################################
-                //! The GPU CUDA block shared memory allocator.
-                class BlockSharedMemDynCudaBuiltIn : public concepts::Implements<ConceptBlockSharedDyn, BlockSharedMemDynCudaBuiltIn>
-                {
-                public:
-                    //-----------------------------------------------------------------------------
-                    BlockSharedMemDynCudaBuiltIn() = default;
-                    //-----------------------------------------------------------------------------
-                    __device__ BlockSharedMemDynCudaBuiltIn(BlockSharedMemDynCudaBuiltIn const &) = delete;
-                    //-----------------------------------------------------------------------------
-                    __device__ BlockSharedMemDynCudaBuiltIn(BlockSharedMemDynCudaBuiltIn &&) = delete;
-                    //-----------------------------------------------------------------------------
-                    __device__ auto operator=(BlockSharedMemDynCudaBuiltIn const &) -> BlockSharedMemDynCudaBuiltIn & = delete;
-                    //-----------------------------------------------------------------------------
-                    __device__ auto operator=(BlockSharedMemDynCudaBuiltIn &&) -> BlockSharedMemDynCudaBuiltIn & = delete;
-                    //-----------------------------------------------------------------------------
-                    /*virtual*/ ~BlockSharedMemDynCudaBuiltIn() = default;
-                };
-
-                namespace traits
-                {
-                    //#############################################################################
-                    template<
-                        typename T>
-                    struct GetMem<
-                        T,
-                        BlockSharedMemDynCudaBuiltIn>
-                    {
-                        //-----------------------------------------------------------------------------
-                        __device__ static auto getMem(
-                            block::shared::dyn::BlockSharedMemDynCudaBuiltIn const &)
-                        -> T *
-                        {
-                            // Because unaligned access to variables is not allowed in device code,
-                            // we have to use the widest possible type to have all types aligned correctly.
-                            // See: http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared
-                            // http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#vector-types
-                            extern __shared__ float4 shMem[];
-                            return reinterpret_cast<T *>(shMem);
-                        }
-                    };
-                }
-            }
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/block/shared/dyn/BlockSharedMemDynHipBuiltIn.hpp b/thirdParty/cupla/alpaka/include/alpaka/block/shared/dyn/BlockSharedMemDynHipBuiltIn.hpp
deleted file mode 100644
index 5d63bf7bf6..0000000000
--- a/thirdParty/cupla/alpaka/include/alpaka/block/shared/dyn/BlockSharedMemDynHipBuiltIn.hpp
+++ /dev/null
@@ -1,85 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz, Matthias Werner, René Widera
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_HIP
-    #error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
-#endif
-
-#include <alpaka/block/shared/dyn/Traits.hpp>
-
-#include <type_traits>
-
-namespace alpaka
-{
-    namespace block
-    {
-        namespace shared
-        {
-            namespace dyn
-            {
-                //#############################################################################
-                //! The GPU HIP block shared memory allocator.
-                class BlockSharedMemDynHipBuiltIn : public concepts::Implements<ConceptBlockSharedDyn, BlockSharedMemDynHipBuiltIn>
-                {
-                public:
-                    //-----------------------------------------------------------------------------
-                    //! Default constructor.
-                    ALPAKA_FN_HOST_ACC BlockSharedMemDynHipBuiltIn() = default;
-                    //-----------------------------------------------------------------------------
-                    //! Copy constructor.
-                    __device__ BlockSharedMemDynHipBuiltIn(BlockSharedMemDynHipBuiltIn const &) = delete;
-                    //-----------------------------------------------------------------------------
-                    //! Move constructor.
-                    __device__ BlockSharedMemDynHipBuiltIn(BlockSharedMemDynHipBuiltIn &&) = delete;
-                    //-----------------------------------------------------------------------------
-                    //! Copy assignment operator.
-                    __device__ auto operator=(BlockSharedMemDynHipBuiltIn const &) -> BlockSharedMemDynHipBuiltIn & = delete;
-                    //-----------------------------------------------------------------------------
-                    //! Move assignment operator.
-                    __device__ auto operator=(BlockSharedMemDynHipBuiltIn &&) -> BlockSharedMemDynHipBuiltIn & = delete;
-                    //-----------------------------------------------------------------------------
-                    //! Destructor.
-                    /*virtual*/ ALPAKA_FN_HOST_ACC ~BlockSharedMemDynHipBuiltIn() = default;
-                };
-
-                namespace traits
-                {
-                    //#############################################################################
-                    //!
-                    template<
-                        typename T>
-                    struct GetMem<
-                        T,
-                        BlockSharedMemDynHipBuiltIn>
-                    {
-                        //-----------------------------------------------------------------------------
-
-                        __device__ static auto getMem(
-                            block::shared::dyn::BlockSharedMemDynHipBuiltIn const &)
-                        -> T *
-                        {
-                            // Because unaligned access to variables is not allowed in device code,
-                            // we have to use the widest possible type to have all types aligned correctly.
-                            extern __shared__ float4 shMem[];
-                            return reinterpret_cast<T *>(shMem);
-                        }
-                    };
-                }
-            }
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/block/shared/dyn/BlockSharedMemDynMember.hpp b/thirdParty/cupla/alpaka/include/alpaka/block/shared/dyn/BlockSharedMemDynMember.hpp
new file mode 100644
index 0000000000..a878117a65
--- /dev/null
+++ b/thirdParty/cupla/alpaka/include/alpaka/block/shared/dyn/BlockSharedMemDynMember.hpp
@@ -0,0 +1,138 @@
+/* Copyright 2020 Jeffrey Kelling
+ *
+ * This file is part of alpaka.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#pragma once
+
+#include <alpaka/block/shared/dyn/Traits.hpp>
+#include <alpaka/core/Assert.hpp>
+#include <alpaka/core/Vectorize.hpp>
+
+#include <array>
+#include <cstdint>
+#include <type_traits>
+
+#ifndef ALPAKA_BLOCK_SHARED_DYN_MEMBER_ALLOC_KIB
+#    define ALPAKA_BLOCK_SHARED_DYN_MEMBER_ALLOC_KIB 30
+#endif
+
+namespace alpaka
+{
+    namespace detail
+    {
+        //#############################################################################
+        //! Holder for the static constexpr constants that logically belong to BlockSharedMemDynMember.
+        //! They live in this separate struct because a static const member inside the class itself
+        //! breaks GCC 10 OpenMP target and OpenACC: type not mappable.
+        template<std::size_t TStaticAllocKiB>
+        struct BlockSharedMemDynMemberStatic
+        {
+            //! Storage size in bytes: TStaticAllocKiB KiB (1 KiB = 1024 bytes), narrowed to 32 bit.
+            static constexpr std::uint32_t staticAllocBytes = static_cast<std::uint32_t>(TStaticAllocKiB * 1024u);
+        };
+    } // namespace detail
+
+#if BOOST_COMP_MSVC || defined(BOOST_COMP_MSVC_EMULATED)
+#    pragma warning(push)
+#    pragma warning(disable : 4324) // warning C4324: structure was padded due to alignment specifier
+#endif
+    //#############################################################################
+    //! Dynamic block shared memory provider using fixed-size
+    //! member array to allocate memory on the stack or in shared
+    //! memory.
+    template<std::size_t TStaticAllocKiB = ALPAKA_BLOCK_SHARED_DYN_MEMBER_ALLOC_KIB>
+    class alignas(core::vectorization::defaultAlignment) BlockSharedMemDynMember
+        : public concepts::Implements<ConceptBlockSharedDyn, BlockSharedMemDynMember<TStaticAllocKiB>>
+    {
+    public:
+        //-----------------------------------------------------------------------------
+        //! \param sizeBytes Requested dynamic shared memory size in bytes, rounded up to the default alignment.
+        BlockSharedMemDynMember(std::size_t sizeBytes) : m_dynPitch(getPitch(sizeBytes))
+        {
+#if(defined ALPAKA_DEBUG_OFFLOAD_ASSUME_HOST) && (!defined NDEBUG)
+            // Compare as std::size_t: narrowing sizeBytes first would let oversized requests wrap and pass.
+            ALPAKA_ASSERT(sizeBytes <= static_cast<std::size_t>(staticAllocBytes()));
+#endif
+        }
+        //-----------------------------------------------------------------------------
+        BlockSharedMemDynMember(BlockSharedMemDynMember const&) = delete;
+        //-----------------------------------------------------------------------------
+        BlockSharedMemDynMember(BlockSharedMemDynMember&&) = delete;
+        //-----------------------------------------------------------------------------
+        auto operator=(BlockSharedMemDynMember const&) -> BlockSharedMemDynMember& = delete;
+        //-----------------------------------------------------------------------------
+        auto operator=(BlockSharedMemDynMember&&) -> BlockSharedMemDynMember& = delete;
+        //-----------------------------------------------------------------------------
+        /*virtual*/ ~BlockSharedMemDynMember() = default;
+
+        //! \return the pointer to the begin of the member array, where the dynamic shared memory portion starts.
+        uint8_t* dynMemBegin() const
+        {
+            return m_mem.data();
+        }
+
+        //! \return the pointer to the begin of data after the portion allocated as dynamical shared memory.
+        uint8_t* staticMemBegin() const
+        {
+            return m_mem.data() + m_dynPitch;
+        }
+
+        //! \return remaining capacity for static block shared memory; 32-bit type for register efficiency on GPUs.
+        std::uint32_t staticMemCapacity() const
+        {
+            return staticAllocBytes() - m_dynPitch;
+        }
+
+        //! \return size of statically allocated memory available for both dynamic and static shared memory.
+        //!         Value is of a 32-bit type for register efficiency on GPUs.
+        static constexpr std::uint32_t staticAllocBytes()
+        {
+            return detail::BlockSharedMemDynMemberStatic<TStaticAllocKiB>::staticAllocBytes;
+        }
+
+    private:
+        //! \return sizeBytes rounded up to a multiple of the default alignment, narrowed to 32 bit.
+        static std::uint32_t getPitch(std::size_t sizeBytes)
+        {
+            constexpr auto alignment = core::vectorization::defaultAlignment;
+            return static_cast<std::uint32_t>((sizeBytes / alignment + (sizeBytes % alignment > 0u)) * alignment);
+        }
+
+        mutable std::array<uint8_t, detail::BlockSharedMemDynMemberStatic<TStaticAllocKiB>::staticAllocBytes> m_mem;
+        std::uint32_t m_dynPitch; //!< Size in bytes of the dynamic portion at the start of m_mem (aligned pitch).
+    };
+#if BOOST_COMP_MSVC || defined(BOOST_COMP_MSVC_EMULATED)
+#    pragma warning(pop)
+#endif
+
+    namespace traits
+    {
+        //#############################################################################
+        template<typename T, std::size_t TStaticAllocKiB>
+        struct GetDynSharedMem<T, BlockSharedMemDynMember<TStaticAllocKiB>>
+        {
+#if BOOST_COMP_GNUC
+#    pragma GCC diagnostic push
+#    pragma GCC diagnostic ignored                                                                                    \
+        "-Wcast-align" // "cast from 'unsigned char*' to 'unsigned int*' increases required alignment of target type"
+#endif
+            //! \return mem.dynMemBegin() reinterpreted as T*, i.e. the start of the dynamic shared memory portion.
+            static auto getMem(BlockSharedMemDynMember<TStaticAllocKiB> const& mem) -> T*
+            {
+                static_assert(
+                    core::vectorization::defaultAlignment >= alignof(T),
+                    "Unable to get block shared dynamic memory for types with alignment higher than "
+                    "defaultAlignment!");
+                return reinterpret_cast<T*>(mem.dynMemBegin());
+            }
+#if BOOST_COMP_GNUC
+#    pragma GCC diagnostic pop
+#endif
+        };
+    } // namespace traits
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/include/alpaka/block/shared/dyn/BlockSharedMemDynUniformCudaHipBuiltIn.hpp b/thirdParty/cupla/alpaka/include/alpaka/block/shared/dyn/BlockSharedMemDynUniformCudaHipBuiltIn.hpp
new file mode 100644
index 0000000000..4654c53aeb
--- /dev/null
+++ b/thirdParty/cupla/alpaka/include/alpaka/block/shared/dyn/BlockSharedMemDynUniformCudaHipBuiltIn.hpp
@@ -0,0 +1,72 @@
+/* Copyright 2019 Benjamin Worpitz, René Widera
+ *
+ * This file is part of alpaka.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#pragma once
+
+#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
+
+#    include <alpaka/core/BoostPredef.hpp>
+
+#    if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && !BOOST_LANG_CUDA
+#        error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
+#    endif
+
+#    if defined(ALPAKA_ACC_GPU_HIP_ENABLED) && !BOOST_LANG_HIP
+#        error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
+#    endif
+
+#    include <alpaka/block/shared/dyn/Traits.hpp>
+
+#    include <type_traits>
+
+namespace alpaka
+{
+    //#############################################################################
+    //! The GPU CUDA/HIP block shared dynamic memory allocator; declares no data members, not copyable/movable.
+    class BlockSharedMemDynUniformCudaHipBuiltIn
+        : public concepts::Implements<ConceptBlockSharedDyn, BlockSharedMemDynUniformCudaHipBuiltIn>
+    {
+    public:
+        //-----------------------------------------------------------------------------
+        BlockSharedMemDynUniformCudaHipBuiltIn() = default;
+        //-----------------------------------------------------------------------------
+        __device__ BlockSharedMemDynUniformCudaHipBuiltIn(BlockSharedMemDynUniformCudaHipBuiltIn const&) = delete;
+        //-----------------------------------------------------------------------------
+        __device__ BlockSharedMemDynUniformCudaHipBuiltIn(BlockSharedMemDynUniformCudaHipBuiltIn&&) = delete;
+        //-----------------------------------------------------------------------------
+        __device__ auto operator=(BlockSharedMemDynUniformCudaHipBuiltIn const&)
+            -> BlockSharedMemDynUniformCudaHipBuiltIn& = delete;
+        //-----------------------------------------------------------------------------
+        __device__ auto operator=(BlockSharedMemDynUniformCudaHipBuiltIn&&)
+            -> BlockSharedMemDynUniformCudaHipBuiltIn& = delete;
+        //-----------------------------------------------------------------------------
+        /*virtual*/ ~BlockSharedMemDynUniformCudaHipBuiltIn() = default;
+    };
+
+    namespace traits
+    {
+        //#############################################################################
+        template<typename T>
+        struct GetDynSharedMem<T, BlockSharedMemDynUniformCudaHipBuiltIn>
+        {
+            //! \return the extern __shared__ array shMem reinterpreted as T*; the allocator argument is unused.
+            __device__ static auto getMem(BlockSharedMemDynUniformCudaHipBuiltIn const&) -> T*
+            {
+                // Because unaligned access to variables is not allowed in device code,
+                // we have to use the widest possible type to have all types aligned correctly.
+                // See: http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared
+                // http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#vector-types
+                extern __shared__ float4 shMem[];
+                return reinterpret_cast<T*>(shMem);
+            }
+        };
+    } // namespace traits
+} // namespace alpaka
+
+#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/block/shared/dyn/Traits.hpp b/thirdParty/cupla/alpaka/include/alpaka/block/shared/dyn/Traits.hpp
index 9dff622d08..bd55682b37 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/block/shared/dyn/Traits.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/block/shared/dyn/Traits.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Benjamin Worpitz
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -16,56 +16,37 @@
 
 namespace alpaka
 {
-    //-----------------------------------------------------------------------------
-    //! The grid block specifics
-    namespace block
+    struct ConceptBlockSharedDyn
     {
-        //-----------------------------------------------------------------------------
-        //! The block shared memory operation specifics.
-        namespace shared
-        {
-            //-----------------------------------------------------------------------------
-            //! The block shared dynamic memory operation specifics.
-            namespace dyn
-            {
-                struct ConceptBlockSharedDyn;
+    };
 
-                //-----------------------------------------------------------------------------
-                //! The block shared dynamic memory operation traits.
-                namespace traits
-                {
-                    //#############################################################################
-                    //! The block shared dynamic memory get trait.
-                    template<
-                        typename T,
-                        typename TBlockSharedMemDyn,
-                        typename TSfinae = void>
-                    struct GetMem;
-                }
+    //-----------------------------------------------------------------------------
+    //! The block shared dynamic memory operation traits.
+    namespace traits
+    {
+        //#############################################################################
+        //! The block shared dynamic memory get trait. Only declared here; specializations provide a static getMem().
+        template<typename T, typename TBlockSharedMemDyn, typename TSfinae = void>
+        struct GetDynSharedMem;
+    } // namespace traits
 
-                //-----------------------------------------------------------------------------
-                //! Returns the pointr to the block shared dynamic memory.
-                //!
-                //! \tparam T The element type.
-                //! \tparam TBlockSharedMemDyn The block shared dynamic memory implementation type.
-                //! \param blockSharedMemDyn The block shared dynamic memory implementation.
-                ALPAKA_NO_HOST_ACC_WARNING
-                template<
-                    typename T,
-                    typename TBlockSharedMemDyn>
-                ALPAKA_FN_ACC auto getMem(
-                    TBlockSharedMemDyn const & blockSharedMemDyn)
-                -> T *
-                {
-                    using ImplementationBase = concepts::ImplementationBase<ConceptBlockSharedDyn, TBlockSharedMemDyn>;
-                    return
-                        traits::GetMem<
-                            T,
-                            ImplementationBase>
-                        ::getMem(
-                            blockSharedMemDyn);
-                }
-            }
-        }
+    //-----------------------------------------------------------------------------
+    //! Get block shared dynamic memory.
+    //!
+    //! The available size of the memory can be defined by specializing the trait BlockSharedMemDynSizeBytes
+    //! for a kernel. The memory can be accessed by all threads within a block.
+    //! Access to the memory is not thread safe.
+    //!
+    //! \tparam T The element type.
+    //! \tparam TBlockSharedMemDyn The block shared dynamic memory implementation type.
+    //! \param blockSharedMemDyn The block shared dynamic memory implementation.
+    //! \return Pointer to pre-allocated contiguous memory.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename T, typename TBlockSharedMemDyn>
+    ALPAKA_FN_ACC auto getDynSharedMem(TBlockSharedMemDyn const& blockSharedMemDyn) -> T*
+    {
+        // The trait specialization is selected via concepts::ImplementationBase, not via TBlockSharedMemDyn directly.
+        using ImplementationBase = concepts::ImplementationBase<ConceptBlockSharedDyn, TBlockSharedMemDyn>;
+        return traits::GetDynSharedMem<T, ImplementationBase>::getMem(blockSharedMemDyn);
     }
-}
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/include/alpaka/block/shared/st/BlockSharedMemStCudaBuiltIn.hpp b/thirdParty/cupla/alpaka/include/alpaka/block/shared/st/BlockSharedMemStCudaBuiltIn.hpp
deleted file mode 100644
index 9bfc0852ee..0000000000
--- a/thirdParty/cupla/alpaka/include/alpaka/block/shared/st/BlockSharedMemStCudaBuiltIn.hpp
+++ /dev/null
@@ -1,92 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz, Erik Zenker, René Widera
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_CUDA
-    #error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
-#endif
-
-#include <alpaka/block/shared/st/Traits.hpp>
-
-#include <type_traits>
-#include <cstdint>
-
-namespace alpaka
-{
-    namespace block
-    {
-        namespace shared
-        {
-            namespace st
-            {
-                //#############################################################################
-                //! The GPU CUDA block shared memory allocator.
-                class BlockSharedMemStCudaBuiltIn : public concepts::Implements<ConceptBlockSharedSt, BlockSharedMemStCudaBuiltIn>
-                {
-                public:
-                    //-----------------------------------------------------------------------------
-                    BlockSharedMemStCudaBuiltIn() = default;
-                    //-----------------------------------------------------------------------------
-                    __device__ BlockSharedMemStCudaBuiltIn(BlockSharedMemStCudaBuiltIn const &) = delete;
-                    //-----------------------------------------------------------------------------
-                    __device__ BlockSharedMemStCudaBuiltIn(BlockSharedMemStCudaBuiltIn &&) = delete;
-                    //-----------------------------------------------------------------------------
-                    __device__ auto operator=(BlockSharedMemStCudaBuiltIn const &) -> BlockSharedMemStCudaBuiltIn & = delete;
-                    //-----------------------------------------------------------------------------
-                    __device__ auto operator=(BlockSharedMemStCudaBuiltIn &&) -> BlockSharedMemStCudaBuiltIn & = delete;
-                    //-----------------------------------------------------------------------------
-                    /*virtual*/ ~BlockSharedMemStCudaBuiltIn() = default;
-                };
-
-                namespace traits
-                {
-                    //#############################################################################
-                    template<
-                        typename T,
-                        std::size_t TuniqueId>
-                    struct AllocVar<
-                        T,
-                        TuniqueId,
-                        BlockSharedMemStCudaBuiltIn>
-                    {
-                        //-----------------------------------------------------------------------------
-                        __device__ static auto allocVar(
-                            block::shared::st::BlockSharedMemStCudaBuiltIn const &)
-                        -> T &
-                        {
-                            __shared__ uint8_t shMem alignas(alignof(T)) [sizeof(T)];
-                            return *(
-                                reinterpret_cast<T*>( shMem ));
-                        }
-                    };
-                    //#############################################################################
-                    template<>
-                    struct FreeMem<
-                        BlockSharedMemStCudaBuiltIn>
-                    {
-                        //-----------------------------------------------------------------------------
-                        __device__ static auto freeMem(
-                            block::shared::st::BlockSharedMemStCudaBuiltIn const &)
-                        -> void
-                        {
-                            // Nothing to do. CUDA block shared memory is automatically freed when all threads left the block.
-                        }
-                    };
-                }
-            }
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/block/shared/st/BlockSharedMemStHipBuiltIn.hpp b/thirdParty/cupla/alpaka/include/alpaka/block/shared/st/BlockSharedMemStHipBuiltIn.hpp
deleted file mode 100644
index cf05f8c28d..0000000000
--- a/thirdParty/cupla/alpaka/include/alpaka/block/shared/st/BlockSharedMemStHipBuiltIn.hpp
+++ /dev/null
@@ -1,102 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz, Erik Zenker, Matthias Werner, René Widera
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_HIP
-    #error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
-#endif
-
-#include <alpaka/block/shared/st/Traits.hpp>
-
-#include <type_traits>
-#include <cstdint>
-
-namespace alpaka
-{
-    namespace block
-    {
-        namespace shared
-        {
-            namespace st
-            {
-                //#############################################################################
-                //! The GPU HIP block shared memory allocator.
-                class BlockSharedMemStHipBuiltIn : public concepts::Implements<ConceptBlockSharedSt, BlockSharedMemStHipBuiltIn>
-                {
-                public:
-                    //-----------------------------------------------------------------------------
-                    //! Default constructor.
-                    ALPAKA_FN_HOST_ACC BlockSharedMemStHipBuiltIn() = default;
-                    //-----------------------------------------------------------------------------
-                    //! Copy constructor.
-                    __device__ BlockSharedMemStHipBuiltIn(BlockSharedMemStHipBuiltIn const &) = delete;
-                    //-----------------------------------------------------------------------------
-                    //! Move constructor.
-                    __device__ BlockSharedMemStHipBuiltIn(BlockSharedMemStHipBuiltIn &&) = delete;
-                    //-----------------------------------------------------------------------------
-                    //! Copy assignment operator.
-                    __device__ auto operator=(BlockSharedMemStHipBuiltIn const &) -> BlockSharedMemStHipBuiltIn & = delete;
-                    //-----------------------------------------------------------------------------
-                    //! Move assignment operator.
-                    __device__ auto operator=(BlockSharedMemStHipBuiltIn &&) -> BlockSharedMemStHipBuiltIn & = delete;
-                    //-----------------------------------------------------------------------------
-                    //! Destructor.
-                    /*virtual*/ ALPAKA_FN_HOST_ACC ~BlockSharedMemStHipBuiltIn() = default;
-                };
-
-                namespace traits
-                {
-                    //#############################################################################
-                    //!
-                    template<
-                        typename T,
-                        std::size_t TuniqueId>
-                    struct AllocVar<
-                        T,
-                        TuniqueId,
-                        BlockSharedMemStHipBuiltIn>
-                    {
-                        //-----------------------------------------------------------------------------
-
-                        __device__ static auto allocVar(
-                            block::shared::st::BlockSharedMemStHipBuiltIn const &)
-                        -> T &
-                        {
-                            __shared__ uint8_t shMem alignas(alignof(T)) [sizeof(T)];
-                            return *(
-                                reinterpret_cast<T*>( shMem ));
-                        }
-                    };
-                    //#############################################################################
-                    //!
-                    template<>
-                    struct FreeMem<
-                        BlockSharedMemStHipBuiltIn>
-                    {
-                        //-----------------------------------------------------------------------------
-
-                        __device__ static auto freeMem(
-                            block::shared::st::BlockSharedMemStHipBuiltIn const &)
-                        -> void
-                        {
-                            // Nothing to do. HIP block shared memory is automatically freed when all threads left the block.
-                        }
-                    };
-                }
-            }
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/block/shared/st/BlockSharedMemStMasterSync.hpp b/thirdParty/cupla/alpaka/include/alpaka/block/shared/st/BlockSharedMemStMasterSync.hpp
index ea52c7ea8c..915b328d54 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/block/shared/st/BlockSharedMemStMasterSync.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/block/shared/st/BlockSharedMemStMasterSync.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Benjamin Worpitz, Erik Zenker, Matthias Werner, René Widera
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -9,119 +9,90 @@
 
 #pragma once
 
-#include <alpaka/core/Vectorize.hpp>
 #include <alpaka/block/shared/st/Traits.hpp>
-
+#include <alpaka/core/AlignedAlloc.hpp>
 #include <alpaka/core/Common.hpp>
+#include <alpaka/core/Vectorize.hpp>
 
-#include <boost/align.hpp>
-
-#include <vector>
-#include <memory>
 #include <functional>
+#include <memory>
+#include <vector>
 
 namespace alpaka
 {
-    namespace block
+    //#############################################################################
+    //! The block shared memory allocator allocating memory with synchronization on the master thread.
+    class BlockSharedMemStMasterSync : public concepts::Implements<ConceptBlockSharedSt, BlockSharedMemStMasterSync>
     {
-        namespace shared
+    public:
+        //-----------------------------------------------------------------------------
+        BlockSharedMemStMasterSync(std::function<void()> fnSync, std::function<bool()> fnIsMasterThread)
+            : m_syncFn(fnSync)
+            , m_isMasterThreadFn(fnIsMasterThread)
         {
-            namespace st
-            {
-                //#############################################################################
-                //! The block shared memory allocator allocating memory with synchronization on the master thread.
-                class BlockSharedMemStMasterSync : public concepts::Implements<ConceptBlockSharedSt, BlockSharedMemStMasterSync>
-                {
-                public:
-                    //-----------------------------------------------------------------------------
-                    BlockSharedMemStMasterSync(
-                        std::function<void()> fnSync,
-                        std::function<bool()> fnIsMasterThread) :
-                            m_syncFn(fnSync),
-                            m_isMasterThreadFn(fnIsMasterThread)
-                    {}
-                    //-----------------------------------------------------------------------------
-                    BlockSharedMemStMasterSync(BlockSharedMemStMasterSync const &) = delete;
-                    //-----------------------------------------------------------------------------
-                    BlockSharedMemStMasterSync(BlockSharedMemStMasterSync &&) = delete;
-                    //-----------------------------------------------------------------------------
-                    auto operator=(BlockSharedMemStMasterSync const &) -> BlockSharedMemStMasterSync & = delete;
-                    //-----------------------------------------------------------------------------
-                    auto operator=(BlockSharedMemStMasterSync &&) -> BlockSharedMemStMasterSync & = delete;
-                    //-----------------------------------------------------------------------------
-                    /*virtual*/ ~BlockSharedMemStMasterSync() = default;
+        }
+        //-----------------------------------------------------------------------------
+        BlockSharedMemStMasterSync(BlockSharedMemStMasterSync const&) = delete;
+        //-----------------------------------------------------------------------------
+        BlockSharedMemStMasterSync(BlockSharedMemStMasterSync&&) = delete;
+        //-----------------------------------------------------------------------------
+        auto operator=(BlockSharedMemStMasterSync const&) -> BlockSharedMemStMasterSync& = delete;
+        //-----------------------------------------------------------------------------
+        auto operator=(BlockSharedMemStMasterSync&&) -> BlockSharedMemStMasterSync& = delete;
+        //-----------------------------------------------------------------------------
+        /*virtual*/ ~BlockSharedMemStMasterSync() = default;
 
-                public:
-                    // TODO: We should add the size of the (current) allocation.
-                    // This would allow to assert that all parallel function calls request to allocate the same size.
-                    std::vector<
-                        std::unique_ptr<
-                            uint8_t,
-                            boost::alignment::aligned_delete>> mutable
-                        m_sharedAllocs;
+    public:
+        // TODO: We should add the size of the (current) allocation.
+        // This would allow asserting that all parallel function calls request an allocation of the same size.
+        std::vector<std::unique_ptr<uint8_t, core::AlignedDelete>> mutable m_sharedAllocs;
 
-                    std::function<void()> m_syncFn;
-                    std::function<bool()> m_isMasterThreadFn;
-                };
+        std::function<void()> m_syncFn;
+        std::function<bool()> m_isMasterThreadFn;
+    };
 
-                namespace traits
-                {
+    namespace traits
+    {
 #if BOOST_COMP_GNUC
-    #pragma GCC diagnostic push
-    #pragma GCC diagnostic ignored "-Wcast-align" // "cast from 'unsigned char*' to 'unsigned int*' increases required alignment of target type"
+#    pragma GCC diagnostic push
+#    pragma GCC diagnostic ignored                                                                                    \
+        "-Wcast-align" // "cast from 'unsigned char*' to 'unsigned int*' increases required alignment of target type"
 #endif
-                    //#############################################################################
-                    template<
-                        typename T,
-                        std::size_t TuniqueId>
-                    struct AllocVar<
-                        T,
-                        TuniqueId,
-                        BlockSharedMemStMasterSync>
-                    {
-                        //-----------------------------------------------------------------------------
-                        ALPAKA_FN_HOST static auto allocVar(
-                            block::shared::st::BlockSharedMemStMasterSync const & blockSharedMemSt)
-                        -> T &
-                        {
-                            // TODO: replace with constexpr std::max in C++14
-                            constexpr std::size_t alignmentInBytes = (core::vectorization::defaultAlignment < alignof(T)) ? alignof(T) : core::vectorization::defaultAlignment;
+        //#############################################################################
+        template<typename T, std::size_t TuniqueId>
+        struct DeclareSharedVar<T, TuniqueId, BlockSharedMemStMasterSync>
+        {
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto declareVar(BlockSharedMemStMasterSync const& blockSharedMemSt) -> T&
+            {
+                constexpr std::size_t alignmentInBytes = std::max(core::vectorization::defaultAlignment, alignof(T));
 
-                            // Assure that all threads have executed the return of the last allocBlockSharedArr function (if there was one before).
-                            blockSharedMemSt.m_syncFn();
+                // Ensure that all threads have executed the return of the previous declareVar call (if there
+                // was one before).
+                blockSharedMemSt.m_syncFn();
 
-                            if(blockSharedMemSt.m_isMasterThreadFn())
-                            {
-                                blockSharedMemSt.m_sharedAllocs.emplace_back(
-                                    reinterpret_cast<uint8_t *>(
-                                        boost::alignment::aligned_alloc(alignmentInBytes, sizeof(T))));
-                            }
-                            blockSharedMemSt.m_syncFn();
+                if(blockSharedMemSt.m_isMasterThreadFn())
+                {
+                    blockSharedMemSt.m_sharedAllocs.emplace_back(
+                        reinterpret_cast<uint8_t*>(core::alignedAlloc(alignmentInBytes, sizeof(T))));
+                }
+                blockSharedMemSt.m_syncFn();
 
-                            return
-                                std::ref(
-                                    *reinterpret_cast<T*>(
-                                        blockSharedMemSt.m_sharedAllocs.back().get()));
-                        }
-                    };
+                return std::ref(*reinterpret_cast<T*>(blockSharedMemSt.m_sharedAllocs.back().get()));
+            }
+        };
 #if BOOST_COMP_GNUC
-    #pragma GCC diagnostic pop
+#    pragma GCC diagnostic pop
 #endif
-                    //#############################################################################
-                    template<>
-                    struct FreeMem<
-                        BlockSharedMemStMasterSync>
-                    {
-                        //-----------------------------------------------------------------------------
-                        ALPAKA_FN_HOST static auto freeMem(
-                            block::shared::st::BlockSharedMemStMasterSync const & blockSharedMemSt)
-                        -> void
-                        {
-                            blockSharedMemSt.m_sharedAllocs.clear();
-                        }
-                    };
-                }
+        //#############################################################################
+        template<>
+        struct FreeSharedVars<BlockSharedMemStMasterSync>
+        {
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto freeVars(BlockSharedMemStMasterSync const& blockSharedMemSt) -> void
+            {
+                blockSharedMemSt.m_sharedAllocs.clear();
             }
-        }
-    }
-}
+        };
+    } // namespace traits
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/include/alpaka/block/shared/st/BlockSharedMemStMember.hpp b/thirdParty/cupla/alpaka/include/alpaka/block/shared/st/BlockSharedMemStMember.hpp
new file mode 100644
index 0000000000..050cc44822
--- /dev/null
+++ b/thirdParty/cupla/alpaka/include/alpaka/block/shared/st/BlockSharedMemStMember.hpp
@@ -0,0 +1,140 @@
+/* Copyright 2020 Jeffrey Kelling
+ *
+ * This file is part of alpaka.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#pragma once
+
+#include <alpaka/block/shared/st/Traits.hpp>
+#include <alpaka/core/Assert.hpp>
+#include <alpaka/core/Vectorize.hpp>
+
+#include <algorithm>
+#include <cstdint>
+#include <type_traits>
+
+namespace alpaka
+{
+    namespace detail
+    {
+        //#############################################################################
+        //! Implementation of static block shared memory provider.
+        template<std::size_t TDataAlignBytes = core::vectorization::defaultAlignment>
+        class BlockSharedMemStMemberImpl
+        {
+        public:
+            //-----------------------------------------------------------------------------
+#ifndef NDEBUG
+            BlockSharedMemStMemberImpl(uint8_t* mem, std::size_t capacity)
+                : m_mem(mem)
+                , m_capacity(static_cast<std::uint32_t>(capacity))
+            {
+#    ifdef ALPAKA_DEBUG_OFFLOAD_ASSUME_HOST
+                ALPAKA_ASSERT((m_mem == nullptr) == (m_capacity == 0u));
+#    endif
+            }
+#else
+            BlockSharedMemStMemberImpl(uint8_t* mem, std::size_t) : m_mem(mem)
+            {
+            }
+#endif
+            //-----------------------------------------------------------------------------
+            BlockSharedMemStMemberImpl(BlockSharedMemStMemberImpl const&) = delete;
+            //-----------------------------------------------------------------------------
+            BlockSharedMemStMemberImpl(BlockSharedMemStMemberImpl&&) = delete;
+            //-----------------------------------------------------------------------------
+            auto operator=(BlockSharedMemStMemberImpl const&) -> BlockSharedMemStMemberImpl& = delete;
+            //-----------------------------------------------------------------------------
+            auto operator=(BlockSharedMemStMemberImpl&&) -> BlockSharedMemStMemberImpl& = delete;
+            //-----------------------------------------------------------------------------
+            /*virtual*/ ~BlockSharedMemStMemberImpl() = default;
+
+            template<typename T>
+            void alloc() const
+            {
+                m_allocdBytes = allocPitch<T>();
+                m_allocdBytes += static_cast<std::uint32_t>(sizeof(T));
+#if(defined ALPAKA_DEBUG_OFFLOAD_ASSUME_HOST) && (!defined NDEBUG)
+                ALPAKA_ASSERT(m_allocdBytes <= m_capacity);
+#endif
+            }
+
+#if BOOST_COMP_GNUC
+#    pragma GCC diagnostic push
+#    pragma GCC diagnostic ignored                                                                                    \
+        "-Wcast-align" // "cast from 'unsigned char*' to 'unsigned int*' increases required alignment of target type"
+#endif
+            template<typename T>
+            T& getLatestVar() const
+            {
+                return *reinterpret_cast<T*>(&m_mem[m_allocdBytes - sizeof(T)]);
+            }
+#if BOOST_COMP_GNUC
+#    pragma GCC diagnostic pop
+#endif
+
+            void free() const
+            {
+                m_allocdBytes = 0u;
+            }
+
+        private:
+            mutable std::uint32_t m_allocdBytes = 0;
+            mutable uint8_t* m_mem;
+#ifndef NDEBUG
+            const std::uint32_t m_capacity;
+#endif
+
+            template<typename T>
+            std::uint32_t allocPitch() const
+            {
+                static_assert(
+                    core::vectorization::defaultAlignment >= alignof(T),
+                    "Unable to get block shared static memory for types with alignment higher than defaultAlignment!");
+                constexpr std::uint32_t align = static_cast<std::uint32_t>(std::max(TDataAlignBytes, alignof(T)));
+                return (m_allocdBytes / align + (m_allocdBytes % align > 0u)) * align;
+            }
+        };
+    } // namespace detail
+    //#############################################################################
+    //! Static block shared memory provider using a pointer to
+    //! externally allocated fixed-size memory, likely provided by
+    //! BlockSharedMemDynMember.
+    template<std::size_t TDataAlignBytes = core::vectorization::defaultAlignment>
+    class BlockSharedMemStMember
+        : public detail::BlockSharedMemStMemberImpl<TDataAlignBytes>
+        , public concepts::Implements<ConceptBlockSharedSt, BlockSharedMemStMember<TDataAlignBytes>>
+    {
+    public:
+        using detail::BlockSharedMemStMemberImpl<TDataAlignBytes>::BlockSharedMemStMemberImpl;
+    };
+
+    namespace traits
+    {
+        //#############################################################################
+        template<typename T, std::size_t TDataAlignBytes, std::size_t TuniqueId>
+        struct DeclareSharedVar<T, TuniqueId, BlockSharedMemStMember<TDataAlignBytes>>
+        {
+            //-----------------------------------------------------------------------------
+            static auto declareVar(BlockSharedMemStMember<TDataAlignBytes> const& smem) -> T&
+            {
+                smem.template alloc<T>();
+                return smem.template getLatestVar<T>();
+            }
+        };
+        //#############################################################################
+        template<std::size_t TDataAlignBytes>
+        struct FreeSharedVars<BlockSharedMemStMember<TDataAlignBytes>>
+        {
+            //-----------------------------------------------------------------------------
+            static auto freeVars(BlockSharedMemStMember<TDataAlignBytes> const& mem) -> void
+            {
+                mem.free();
+            }
+        };
+    } // namespace traits
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/include/alpaka/block/shared/st/BlockSharedMemStNoSync.hpp b/thirdParty/cupla/alpaka/include/alpaka/block/shared/st/BlockSharedMemStNoSync.hpp
index ae414d16f5..c28c8288f1 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/block/shared/st/BlockSharedMemStNoSync.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/block/shared/st/BlockSharedMemStNoSync.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Benjamin Worpitz, Erik Zenker, Matthias Werner, René Widera
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -9,102 +9,73 @@
 
 #pragma once
 
-#include <alpaka/core/Vectorize.hpp>
 #include <alpaka/block/shared/st/Traits.hpp>
-
+#include <alpaka/core/AlignedAlloc.hpp>
 #include <alpaka/core/Common.hpp>
+#include <alpaka/core/Vectorize.hpp>
 
-#include <boost/align.hpp>
-
-#include <vector>
 #include <memory>
+#include <vector>
 
 namespace alpaka
 {
-    namespace block
+    //#############################################################################
+    //! The block shared memory allocator without synchronization.
+    class BlockSharedMemStNoSync : public concepts::Implements<ConceptBlockSharedSt, BlockSharedMemStNoSync>
     {
-        namespace shared
-        {
-            namespace st
-            {
-                //#############################################################################
-                //! The block shared memory allocator without synchronization.
-                class BlockSharedMemStNoSync : public concepts::Implements<ConceptBlockSharedSt, BlockSharedMemStNoSync>
-                {
-                public:
-                    //-----------------------------------------------------------------------------
-                    BlockSharedMemStNoSync() = default;
-                    //-----------------------------------------------------------------------------
-                    BlockSharedMemStNoSync(BlockSharedMemStNoSync const &) = delete;
-                    //-----------------------------------------------------------------------------
-                    BlockSharedMemStNoSync(BlockSharedMemStNoSync &&) = delete;
-                    //-----------------------------------------------------------------------------
-                    auto operator=(BlockSharedMemStNoSync const &) -> BlockSharedMemStNoSync & = delete;
-                    //-----------------------------------------------------------------------------
-                    auto operator=(BlockSharedMemStNoSync &&) -> BlockSharedMemStNoSync & = delete;
-                    //-----------------------------------------------------------------------------
-                    /*virtual*/ ~BlockSharedMemStNoSync() = default;
+    public:
+        //-----------------------------------------------------------------------------
+        BlockSharedMemStNoSync() = default;
+        //-----------------------------------------------------------------------------
+        BlockSharedMemStNoSync(BlockSharedMemStNoSync const&) = delete;
+        //-----------------------------------------------------------------------------
+        BlockSharedMemStNoSync(BlockSharedMemStNoSync&&) = delete;
+        //-----------------------------------------------------------------------------
+        auto operator=(BlockSharedMemStNoSync const&) -> BlockSharedMemStNoSync& = delete;
+        //-----------------------------------------------------------------------------
+        auto operator=(BlockSharedMemStNoSync&&) -> BlockSharedMemStNoSync& = delete;
+        //-----------------------------------------------------------------------------
+        /*virtual*/ ~BlockSharedMemStNoSync() = default;
 
-                public:
-                    // TODO: We should add the size of the (current) allocation.
-                    // This would allow to assert that all parallel function calls request to allocate the same size.
-                    std::vector<
-                        std::unique_ptr<
-                            uint8_t,
-                            boost::alignment::aligned_delete>> mutable
-                        m_sharedAllocs;
-                };
+    public:
+        // TODO: We should add the size of the (current) allocation.
+        // This would allow asserting that all parallel function calls request an allocation of the same size.
+        std::vector<std::unique_ptr<uint8_t, core::AlignedDelete>> mutable m_sharedAllocs;
+    };
 
-                namespace traits
-                {
+    namespace traits
+    {
 #if BOOST_COMP_GNUC
-    #pragma GCC diagnostic push
-    #pragma GCC diagnostic ignored "-Wcast-align" // "cast from 'unsigned char*' to 'unsigned int*' increases required alignment of target type"
+#    pragma GCC diagnostic push
+#    pragma GCC diagnostic ignored                                                                                    \
+        "-Wcast-align" // "cast from 'unsigned char*' to 'unsigned int*' increases required alignment of target type"
 #endif
-                    //#############################################################################
-                    template<
-                        typename T,
-                        std::size_t TuniqueId>
-                    struct AllocVar<
-                        T,
-                        TuniqueId,
-                        BlockSharedMemStNoSync>
-                    {
-                        //-----------------------------------------------------------------------------
-                        ALPAKA_FN_HOST static auto allocVar(
-                            block::shared::st::BlockSharedMemStNoSync const & blockSharedMemSt)
-                        -> T &
-                        {
-                            // TODO: replace with constexpr std::max in C++14
-                            constexpr std::size_t alignmentInBytes = (core::vectorization::defaultAlignment < alignof(T)) ? alignof(T) : core::vectorization::defaultAlignment;
+        //#############################################################################
+        template<typename T, std::size_t TuniqueId>
+        struct DeclareSharedVar<T, TuniqueId, BlockSharedMemStNoSync>
+        {
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto declareVar(BlockSharedMemStNoSync const& blockSharedMemSt) -> T&
+            {
+                constexpr std::size_t alignmentInBytes = std::max(core::vectorization::defaultAlignment, alignof(T));
 
-                            blockSharedMemSt.m_sharedAllocs.emplace_back(
-                                reinterpret_cast<uint8_t *>(
-                                    boost::alignment::aligned_alloc(alignmentInBytes, sizeof(T))));
-                            return
-                                std::ref(
-                                    *reinterpret_cast<T*>(
-                                        blockSharedMemSt.m_sharedAllocs.back().get()));
-                        }
-                    };
+                blockSharedMemSt.m_sharedAllocs.emplace_back(
+                    reinterpret_cast<uint8_t*>(core::alignedAlloc(alignmentInBytes, sizeof(T))));
+                return std::ref(*reinterpret_cast<T*>(blockSharedMemSt.m_sharedAllocs.back().get()));
+            }
+        };
 #if BOOST_COMP_GNUC
-    #pragma GCC diagnostic pop
+#    pragma GCC diagnostic pop
 #endif
-                    //#############################################################################
-                    template<>
-                    struct FreeMem<
-                        BlockSharedMemStNoSync>
-                    {
-                        //-----------------------------------------------------------------------------
-                        ALPAKA_FN_HOST static auto freeMem(
-                            block::shared::st::BlockSharedMemStNoSync const & blockSharedMemSt)
-                        -> void
-                        {
-                            blockSharedMemSt.m_sharedAllocs.clear();
-                        }
-                    };
-                }
+        //#############################################################################
+        template<>
+        struct FreeSharedVars<BlockSharedMemStNoSync>
+        {
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto freeVars(BlockSharedMemStNoSync const& blockSharedMemSt) -> void
+            {
+                blockSharedMemSt.m_sharedAllocs.clear();
             }
-        }
-    }
-}
+        };
+    } // namespace traits
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/include/alpaka/block/shared/st/BlockSharedMemStOmp5.hpp b/thirdParty/cupla/alpaka/include/alpaka/block/shared/st/BlockSharedMemStOmp5.hpp
new file mode 100644
index 0000000000..589b4013b7
--- /dev/null
+++ b/thirdParty/cupla/alpaka/include/alpaka/block/shared/st/BlockSharedMemStOmp5.hpp
@@ -0,0 +1,66 @@
+/* Copyright 2019 Benjamin Worpitz, Erik Zenker, René Widera
+ *
+ * This file is part of alpaka.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#pragma once
+
+#ifdef ALPAKA_ACC_ANY_BT_OMP5_ENABLED
+
+#    if _OPENMP < 201307
+#        error If ALPAKA_ACC_ANY_BT_OMP5_ENABLED is set, the compiler has to support OpenMP 4.0 or higher!
+#    endif
+
+#    include <alpaka/block/shared/st/BlockSharedMemStMember.hpp>
+#    include <alpaka/block/shared/st/Traits.hpp>
+
+#    include <omp.h>
+
+#    include <cstdint>
+#    include <type_traits>
+
+namespace alpaka
+{
+    //#############################################################################
+    //! The OpenMP 5 block shared memory allocator.
+    class BlockSharedMemStOmp5
+        : public detail::BlockSharedMemStMemberImpl<4>
+        , public concepts::Implements<ConceptBlockSharedSt, BlockSharedMemStOmp5>
+    {
+    public:
+        using BlockSharedMemStMemberImpl<4>::BlockSharedMemStMemberImpl;
+    };
+
+    namespace traits
+    {
+        //#############################################################################
+        template<typename T, std::size_t TuniqueId>
+        struct DeclareSharedVar<T, TuniqueId, BlockSharedMemStOmp5>
+        {
+            //-----------------------------------------------------------------------------
+            static auto declareVar(BlockSharedMemStOmp5 const& smem) -> T&
+            {
+#    pragma omp barrier
+                smem.alloc<T>();
+#    pragma omp barrier
+                return smem.getLatestVar<T>();
+            }
+        };
+        //#############################################################################
+        template<>
+        struct FreeSharedVars<BlockSharedMemStOmp5>
+        {
+            //-----------------------------------------------------------------------------
+            static auto freeVars(BlockSharedMemStOmp5 const& mem) -> void
+            {
+                mem.free();
+            }
+        };
+    } // namespace traits
+} // namespace alpaka
+
+#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/block/shared/st/BlockSharedMemStUniformCudaHipBuiltIn.hpp b/thirdParty/cupla/alpaka/include/alpaka/block/shared/st/BlockSharedMemStUniformCudaHipBuiltIn.hpp
new file mode 100644
index 0000000000..c8bff9bb24
--- /dev/null
+++ b/thirdParty/cupla/alpaka/include/alpaka/block/shared/st/BlockSharedMemStUniformCudaHipBuiltIn.hpp
@@ -0,0 +1,85 @@
+/* Copyright 2019 Benjamin Worpitz, Erik Zenker, René Widera, Matthias Werner
+ *
+ * This file is part of alpaka.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#pragma once
+
+#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
+
+#    include <alpaka/core/BoostPredef.hpp>
+
+#    if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && !BOOST_LANG_CUDA
+#        error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
+#    endif
+
+#    if defined(ALPAKA_ACC_GPU_HIP_ENABLED) && !BOOST_LANG_HIP
+#        error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
+#    endif
+
+#    include <alpaka/block/shared/st/Traits.hpp>
+
+#    include <cstdint>
+#    include <type_traits>
+
+namespace alpaka
+{
+    //#############################################################################
+    //! The GPU CUDA/HIP block shared memory allocator.
+    class BlockSharedMemStUniformCudaHipBuiltIn
+        : public concepts::Implements<ConceptBlockSharedSt, BlockSharedMemStUniformCudaHipBuiltIn>
+    {
+    public:
+        //-----------------------------------------------------------------------------
+        //! Default constructor.
+        BlockSharedMemStUniformCudaHipBuiltIn() = default;
+        //-----------------------------------------------------------------------------
+        //! Copy constructor.
+        __device__ BlockSharedMemStUniformCudaHipBuiltIn(BlockSharedMemStUniformCudaHipBuiltIn const&) = delete;
+        //-----------------------------------------------------------------------------
+        //! Move constructor.
+        __device__ BlockSharedMemStUniformCudaHipBuiltIn(BlockSharedMemStUniformCudaHipBuiltIn&&) = delete;
+        //-----------------------------------------------------------------------------
+        //! Copy assignment operator.
+        __device__ auto operator=(BlockSharedMemStUniformCudaHipBuiltIn const&)
+            -> BlockSharedMemStUniformCudaHipBuiltIn& = delete;
+        //-----------------------------------------------------------------------------
+        //! Move assignment operator.
+        __device__ auto operator=(BlockSharedMemStUniformCudaHipBuiltIn&&)
+            -> BlockSharedMemStUniformCudaHipBuiltIn& = delete;
+        //-----------------------------------------------------------------------------
+        //! Destructor.
+        /*virtual*/ ~BlockSharedMemStUniformCudaHipBuiltIn() = default;
+    };
+
+    namespace traits
+    {
+        //#############################################################################
+        template<typename T, std::size_t TuniqueId>
+        struct DeclareSharedVar<T, TuniqueId, BlockSharedMemStUniformCudaHipBuiltIn>
+        {
+            //-----------------------------------------------------------------------------
+            __device__ static auto declareVar(BlockSharedMemStUniformCudaHipBuiltIn const&) -> T&
+            {
+                __shared__ uint8_t shMem alignas(alignof(T))[sizeof(T)];
+                return *(reinterpret_cast<T*>(shMem));
+            }
+        };
+        //#############################################################################
+        template<>
+        struct FreeSharedVars<BlockSharedMemStUniformCudaHipBuiltIn>
+        {
+            //-----------------------------------------------------------------------------
+            __device__ static auto freeVars(BlockSharedMemStUniformCudaHipBuiltIn const&) -> void
+            {
+                // Nothing to do. CUDA/HIP block shared memory is automatically freed when all threads left the block.
+            }
+        };
+    } // namespace traits
+} // namespace alpaka
+
+#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/block/shared/st/Traits.hpp b/thirdParty/cupla/alpaka/include/alpaka/block/shared/st/Traits.hpp
index 9118860a7e..2b544c9396 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/block/shared/st/Traits.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/block/shared/st/Traits.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Benjamin Worpitz, Erik Zenker, Matthias Werner, René Widera
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -16,87 +16,54 @@
 
 namespace alpaka
 {
-    //-----------------------------------------------------------------------------
-    //! The grid block specifics
-    namespace block
+    struct ConceptBlockSharedSt
     {
-        //-----------------------------------------------------------------------------
-        //! The block shared memory operation specifics.
-        namespace shared
-        {
-            //-----------------------------------------------------------------------------
-            //! The block shared static memory operation specifics.
-            namespace st
-            {
-                struct ConceptBlockSharedSt;
+    };
 
-                //-----------------------------------------------------------------------------
-                //! The block shared static memory operation traits.
-                namespace traits
-                {
-                    //#############################################################################
-                    //! The block shared static memory variable allocation operation trait.
-                    template<
-                        typename T,
-                        std::size_t TuniqueId,
-                        typename TBlockSharedMemSt,
-                        typename TSfinae = void>
-                    struct AllocVar;
-                    //#############################################################################
-                    //! The block shared static memory free operation trait.
-                    template<
-                        typename TBlockSharedMemSt,
-                        typename TSfinae = void>
-                    struct FreeMem;
-                }
+    //-----------------------------------------------------------------------------
+    //! The block shared static memory operation traits.
+    namespace traits
+    {
+        //#############################################################################
+        //! The block shared static memory variable allocation operation trait.
+        template<typename T, std::size_t TuniqueId, typename TBlockSharedMemSt, typename TSfinae = void>
+        struct DeclareSharedVar;
+        //#############################################################################
+        //! The block shared static memory free operation trait.
+        template<typename TBlockSharedMemSt, typename TSfinae = void>
+        struct FreeSharedVars;
+    } // namespace traits
 
-                //-----------------------------------------------------------------------------
-                //! Allocates a variable in block shared static memory.
-                //!
-                //! The allocated variable is uninitialized and not default constructed!
-                //!
-                //! \tparam T The element type.
-                //! \tparam TuniqueId id those is unique inside a kernel
-                //! \tparam TBlockSharedMemSt The block shared allocator implementation type.
-                //! \param blockSharedMemSt The block shared allocator implementation.
-                ALPAKA_NO_HOST_ACC_WARNING
-                template<
-                    typename T,
-                    std::size_t TuniqueId,
-                    typename TBlockSharedMemSt>
-                ALPAKA_FN_ACC auto allocVar(
-                    TBlockSharedMemSt const & blockSharedMemSt)
-                -> T &
-                {
-                    using ImplementationBase = concepts::ImplementationBase<ConceptBlockSharedSt, TBlockSharedMemSt>;
-                    return
-                        traits::AllocVar<
-                            T,
-                            TuniqueId,
-                            ImplementationBase>
-                        ::allocVar(
-                            blockSharedMemSt);
-                }
+    //-----------------------------------------------------------------------------
+    //! Declare a block shared variable.
+    //!
+    //! The variable is uninitialized and not default constructed!
+    //! The variable can be accessed by all threads within a block.
+    //! Access to the variable is not thread safe.
+    //!
+    //! \tparam T The element type.
+    //! \tparam TuniqueId An id that is unique inside a kernel.
+    //! \tparam TBlockSharedMemSt The block shared allocator implementation type.
+    //! \param blockSharedMemSt The block shared allocator implementation.
+    //! \return Uninitialized variable stored in shared memory.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename T, std::size_t TuniqueId, typename TBlockSharedMemSt>
+    ALPAKA_FN_ACC auto declareSharedVar(TBlockSharedMemSt const& blockSharedMemSt) -> T&
+    {
+        using ImplementationBase = concepts::ImplementationBase<ConceptBlockSharedSt, TBlockSharedMemSt>;
+        return traits::DeclareSharedVar<T, TuniqueId, ImplementationBase>::declareVar(blockSharedMemSt);
+    }
 
-                //-----------------------------------------------------------------------------
-                //! Frees all block shared static memory.
-                //!
-                //! \tparam TBlockSharedMemSt The block shared allocator implementation type.
-                //! \param blockSharedMemSt The block shared allocator implementation.
-                ALPAKA_NO_HOST_ACC_WARNING
-                template<
-                    typename TBlockSharedMemSt>
-                ALPAKA_FN_ACC auto freeMem(
-                    TBlockSharedMemSt & blockSharedMemSt)
-                -> void
-                {
-                    using ImplementationBase = concepts::ImplementationBase<ConceptBlockSharedSt, TBlockSharedMemSt>;
-                    traits::FreeMem<
-                        ImplementationBase>
-                    ::freeMem(
-                        blockSharedMemSt);
-                }
-            }
-        }
+    //-----------------------------------------------------------------------------
+    //! Frees all memory used by block shared variables.
+    //!
+    //! \tparam TBlockSharedMemSt The block shared allocator implementation type.
+    //! \param blockSharedMemSt The block shared allocator implementation.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename TBlockSharedMemSt>
+    ALPAKA_FN_ACC auto freeSharedVars(TBlockSharedMemSt& blockSharedMemSt) -> void
+    {
+        using ImplementationBase = concepts::ImplementationBase<ConceptBlockSharedSt, TBlockSharedMemSt>;
+        traits::FreeSharedVars<ImplementationBase>::freeVars(blockSharedMemSt);
     }
-}
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/include/alpaka/block/sync/BlockSyncBarrierFiber.hpp b/thirdParty/cupla/alpaka/include/alpaka/block/sync/BlockSyncBarrierFiber.hpp
index 0d8e2d6b2f..ad3dfde8b6 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/block/sync/BlockSyncBarrierFiber.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/block/sync/BlockSyncBarrierFiber.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Benjamin Worpitz
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -11,116 +11,98 @@
 
 #ifdef ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLED
 
-#include <alpaka/block/sync/Traits.hpp>
+#    include <alpaka/block/sync/Traits.hpp>
+#    include <alpaka/core/Common.hpp>
+#    include <alpaka/core/Fibers.hpp>
 
-#include <alpaka/core/Fibers.hpp>
-
-#include <alpaka/core/Common.hpp>
-
-#include <mutex>
-#include <map>
+#    include <map>
+#    include <mutex>
 
 namespace alpaka
 {
-    namespace block
+    //#############################################################################
+    //! The fiber barrier block synchronization.
+    template<typename TIdx>
+    class BlockSyncBarrierFiber : public concepts::Implements<ConceptBlockSync, BlockSyncBarrierFiber<TIdx>>
+    {
+    public:
+        //-----------------------------------------------------------------------------
+        ALPAKA_FN_HOST BlockSyncBarrierFiber(TIdx const& blockThreadCount)
+            : m_barrier(static_cast<std::size_t>(blockThreadCount))
+            , m_threadCount(blockThreadCount)
+            , m_curThreadCount(static_cast<TIdx>(0u))
+            , m_generation(static_cast<TIdx>(0u))
+        {
+        }
+        //-----------------------------------------------------------------------------
+        ALPAKA_FN_HOST BlockSyncBarrierFiber(BlockSyncBarrierFiber const&) = delete;
+        //-----------------------------------------------------------------------------
+        ALPAKA_FN_HOST BlockSyncBarrierFiber(BlockSyncBarrierFiber&&) = delete;
+        //-----------------------------------------------------------------------------
+        ALPAKA_FN_HOST auto operator=(BlockSyncBarrierFiber const&) -> BlockSyncBarrierFiber& = delete;
+        //-----------------------------------------------------------------------------
+        ALPAKA_FN_HOST auto operator=(BlockSyncBarrierFiber&&) -> BlockSyncBarrierFiber& = delete;
+        //-----------------------------------------------------------------------------
+        /*virtual*/ ~BlockSyncBarrierFiber() = default;
+
+        boost::fibers::barrier mutable m_barrier;
+
+        TIdx mutable m_threadCount;
+        TIdx mutable m_curThreadCount;
+        TIdx mutable m_generation;
+        int mutable m_result[2u];
+    };
+
+    namespace traits
     {
-        namespace sync
+        //#############################################################################
+        template<typename TIdx>
+        struct SyncBlockThreads<BlockSyncBarrierFiber<TIdx>>
         {
-            //#############################################################################
-            //! The thread id map barrier block synchronization.
-            template<
-                typename TIdx>
-            class BlockSyncBarrierFiber : public concepts::Implements<ConceptBlockSync, BlockSyncBarrierFiber<TIdx>>
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto syncBlockThreads(BlockSyncBarrierFiber<TIdx> const& blockSync) -> void
             {
-            public:
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST BlockSyncBarrierFiber(
-                    TIdx const & blockThreadCount) :
-                        m_barrier(static_cast<std::size_t>(blockThreadCount)),
-                        m_threadCount(blockThreadCount),
-                        m_curThreadCount(static_cast<TIdx>(0u)),
-                        m_generation(static_cast<TIdx>(0u))
-                {}
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST BlockSyncBarrierFiber(BlockSyncBarrierFiber const &) = delete;
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST BlockSyncBarrierFiber(BlockSyncBarrierFiber &&) = delete;
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST auto operator=(BlockSyncBarrierFiber const &) -> BlockSyncBarrierFiber & = delete;
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST auto operator=(BlockSyncBarrierFiber &&) -> BlockSyncBarrierFiber & = delete;
-                //-----------------------------------------------------------------------------
-                /*virtual*/ ~BlockSyncBarrierFiber() = default;
-
-                boost::fibers::barrier mutable m_barrier;
-
-                TIdx mutable m_threadCount;
-                TIdx mutable m_curThreadCount;
-                TIdx mutable m_generation;
-                int mutable m_result[2u];
-            };
-
-            namespace traits
+                blockSync.m_barrier.wait();
+            }
+        };
+
+        //#############################################################################
+        template<typename TOp, typename TIdx>
+        struct SyncBlockThreadsPredicate<TOp, BlockSyncBarrierFiber<TIdx>>
+        {
+            //-----------------------------------------------------------------------------
+            ALPAKA_NO_HOST_ACC_WARNING
+            ALPAKA_FN_ACC static auto syncBlockThreadsPredicate(
+                BlockSyncBarrierFiber<TIdx> const& blockSync,
+                int predicate) -> int
             {
-                //#############################################################################
-                template<
-                    typename TIdx>
-                struct SyncBlockThreads<
-                    BlockSyncBarrierFiber<TIdx>>
+                if(blockSync.m_curThreadCount == blockSync.m_threadCount)
                 {
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST static auto syncBlockThreads(
-                        block::sync::BlockSyncBarrierFiber<TIdx> const & blockSync)
-                    -> void
-                    {
-                        blockSync.m_barrier.wait();
-                    }
-                };
-
-                //#############################################################################
-                template<
-                    typename TOp,
-                    typename TIdx>
-                struct SyncBlockThreadsPredicate<
-                    TOp,
-                    BlockSyncBarrierFiber<TIdx>>
+                    blockSync.m_curThreadCount = static_cast<TIdx>(0u);
+                    ++blockSync.m_generation;
+                }
+
+                auto const generationMod2(blockSync.m_generation % static_cast<TIdx>(2u));
+
+                // The first fiber will reset the value to the initial value.
+                if(blockSync.m_curThreadCount == static_cast<TIdx>(0u))
                 {
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_NO_HOST_ACC_WARNING
-                    ALPAKA_FN_ACC static auto syncBlockThreadsPredicate(
-                        block::sync::BlockSyncBarrierFiber<TIdx> const & blockSync,
-                        int predicate)
-                    -> int
-                    {
-                        if(blockSync.m_curThreadCount == blockSync.m_threadCount)
-                        {
-                            blockSync.m_curThreadCount = static_cast<TIdx>(0u);
-                            ++blockSync.m_generation;
-                        }
-
-                        auto const generationMod2(blockSync.m_generation % static_cast<TIdx>(2u));
-
-                        // The first fiber will reset the value to the initial value.
-                        if(blockSync.m_curThreadCount == static_cast<TIdx>(0u))
-                        {
-                            blockSync.m_result[generationMod2] = TOp::InitialValue;
-                        }
-
-                        ++blockSync.m_curThreadCount;
-
-                        // We do not have to lock because there is only ever one fiber active per block.
-                        blockSync.m_result[generationMod2] = TOp()(blockSync.m_result[generationMod2], predicate);
-
-                        // After all block threads have combined their values ...
-                        blockSync.m_barrier.wait();
-
-                        // ... the result can be returned.
-                        return blockSync.m_result[generationMod2];
-                    }
-                };
+                    blockSync.m_result[generationMod2] = TOp::InitialValue;
+                }
+
+                ++blockSync.m_curThreadCount;
+
+                // We do not have to lock because there is only ever one fiber active per block.
+                blockSync.m_result[generationMod2] = TOp()(blockSync.m_result[generationMod2], predicate);
+
+                // After all block threads have combined their values ...
+                blockSync.m_barrier.wait();
+
+                // ... the result can be returned.
+                return blockSync.m_result[generationMod2];
             }
-        }
-    }
-}
+        };
+    } // namespace traits
+} // namespace alpaka
 
 #endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/block/sync/BlockSyncBarrierOacc.hpp b/thirdParty/cupla/alpaka/include/alpaka/block/sync/BlockSyncBarrierOacc.hpp
new file mode 100644
index 0000000000..3b1e92efe5
--- /dev/null
+++ b/thirdParty/cupla/alpaka/include/alpaka/block/sync/BlockSyncBarrierOacc.hpp
@@ -0,0 +1,49 @@
+/* Copyright 2020 Jeffrey Kelling
+ *
+ * This file is part of alpaka.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#pragma once
+
+#ifdef ALPAKA_ACC_ANY_BT_OACC_ENABLED
+
+#    if _OPENACC < 201306
+#        error If ALPAKA_ACC_ANY_BT_OACC_ENABLED is set, the compiler has to support OpenACC 2.0 or higher!
+#    endif
+
+#    include <alpaka/block/sync/Traits.hpp>
+
+namespace alpaka
+{
+    //#############################################################################
+    //! The OpenACC barrier block synchronization.
+    class BlockSyncBarrierOacc
+    {
+    public:
+        //-----------------------------------------------------------------------------
+        ALPAKA_FN_HOST BlockSyncBarrierOacc() = default;
+        //-----------------------------------------------------------------------------
+        ALPAKA_FN_HOST BlockSyncBarrierOacc(BlockSyncBarrierOacc const&) = delete;
+        //-----------------------------------------------------------------------------
+        ALPAKA_FN_HOST BlockSyncBarrierOacc(BlockSyncBarrierOacc&&) = delete;
+        //-----------------------------------------------------------------------------
+        ALPAKA_FN_HOST auto operator=(BlockSyncBarrierOacc const&) -> BlockSyncBarrierOacc& = delete;
+        //-----------------------------------------------------------------------------
+        ALPAKA_FN_HOST auto operator=(BlockSyncBarrierOacc&&) -> BlockSyncBarrierOacc& = delete;
+        //-----------------------------------------------------------------------------
+        ~BlockSyncBarrierOacc() = default;
+
+        std::uint8_t mutable m_generation = 0u;
+        // NVHPC 20.7: initializer causes warning:
+        // NVC++-W-0155-External and Static variables are not supported in acc routine - _T139951818207704_37530
+        //! m_syncCounter[2 generations * 2 counters per generation]
+        int mutable m_syncCounter[2 * 2]{0, 0, 0, 0};
+        int mutable m_result;
+    };
+} // namespace alpaka
+
+#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/block/sync/BlockSyncBarrierOmp.hpp b/thirdParty/cupla/alpaka/include/alpaka/block/sync/BlockSyncBarrierOmp.hpp
index 1676b4b51d..dfbe4b6fd7 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/block/sync/BlockSyncBarrierOmp.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/block/sync/BlockSyncBarrierOmp.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Axel Huebl, Benjamin Worpitz
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -11,144 +11,125 @@
 
 #ifdef _OPENMP
 
-#include <alpaka/block/sync/Traits.hpp>
-
-#include <alpaka/core/Common.hpp>
-#include <alpaka/core/Unused.hpp>
+#    include <alpaka/block/sync/Traits.hpp>
+#    include <alpaka/core/Common.hpp>
+#    include <alpaka/core/Unused.hpp>
 
 namespace alpaka
 {
-    namespace block
+    //#############################################################################
+    //! The OpenMP barrier block synchronization.
+    class BlockSyncBarrierOmp : public concepts::Implements<ConceptBlockSync, BlockSyncBarrierOmp>
     {
-        namespace sync
+    public:
+        //-----------------------------------------------------------------------------
+        ALPAKA_FN_HOST BlockSyncBarrierOmp() : m_generation(0u)
         {
-            //#############################################################################
-            //! The OpenMP barrier block synchronization.
-            class BlockSyncBarrierOmp : public concepts::Implements<ConceptBlockSync, BlockSyncBarrierOmp>
-            {
-            public:
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST BlockSyncBarrierOmp() :
-                    m_generation(0u)
-                {}
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST BlockSyncBarrierOmp(BlockSyncBarrierOmp const &) = delete;
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST BlockSyncBarrierOmp(BlockSyncBarrierOmp &&) = delete;
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST auto operator=(BlockSyncBarrierOmp const &) -> BlockSyncBarrierOmp & = delete;
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST auto operator=(BlockSyncBarrierOmp &&) -> BlockSyncBarrierOmp & = delete;
-                //-----------------------------------------------------------------------------
-                /*virtual*/ ~BlockSyncBarrierOmp() = default;
+        }
+        //-----------------------------------------------------------------------------
+        ALPAKA_FN_HOST BlockSyncBarrierOmp(BlockSyncBarrierOmp const&) = delete;
+        //-----------------------------------------------------------------------------
+        ALPAKA_FN_HOST BlockSyncBarrierOmp(BlockSyncBarrierOmp&&) = delete;
+        //-----------------------------------------------------------------------------
+        ALPAKA_FN_HOST auto operator=(BlockSyncBarrierOmp const&) -> BlockSyncBarrierOmp& = delete;
+        //-----------------------------------------------------------------------------
+        ALPAKA_FN_HOST auto operator=(BlockSyncBarrierOmp&&) -> BlockSyncBarrierOmp& = delete;
+        //-----------------------------------------------------------------------------
+        /*virtual*/ ~BlockSyncBarrierOmp() = default;
 
-                std::uint8_t mutable m_generation;
-                int mutable m_result[2];
-            };
+        std::uint8_t mutable m_generation;
+        int mutable m_result[2];
+    };
 
-            namespace traits
+    namespace traits
+    {
+        //#############################################################################
+        template<>
+        struct SyncBlockThreads<BlockSyncBarrierOmp>
+        {
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto syncBlockThreads(BlockSyncBarrierOmp const& blockSync) -> void
             {
-                //#############################################################################
-                template<>
-                struct SyncBlockThreads<
-                    BlockSyncBarrierOmp>
-                {
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST static auto syncBlockThreads(
-                        block::sync::BlockSyncBarrierOmp const & blockSync)
-                    -> void
-                    {
-                        alpaka::ignore_unused(blockSync);
+                alpaka::ignore_unused(blockSync);
 
-                        // NOTE: This waits for all threads in all blocks.
-                        // If multiple blocks are executed in parallel this is not optimal.
-                        #pragma omp barrier
-                    }
-                };
+// NOTE: This waits for all threads in all blocks.
+// If multiple blocks are executed in parallel this is not optimal.
+#    pragma omp barrier
+            }
+        };
 
-                namespace detail
+        namespace detail
+        {
+            //#############################################################################
+            template<typename TOp>
+            struct AtomicOp;
+            //#############################################################################
+            template<>
+            struct AtomicOp<BlockCount>
+            {
+                void operator()(int& result, bool value)
+                {
+#    pragma omp atomic
+                    result += static_cast<int>(value);
+                }
+            };
+            //#############################################################################
+            template<>
+            struct AtomicOp<BlockAnd>
+            {
+                void operator()(int& result, bool value)
+                {
+#    pragma omp atomic
+                    result &= static_cast<int>(value);
+                }
+            };
+            //#############################################################################
+            template<>
+            struct AtomicOp<BlockOr>
+            {
+                void operator()(int& result, bool value)
                 {
-                    //#############################################################################
-                    template<
-                        typename TOp>
-                    struct AtomicOp;
-                    //#############################################################################
-                    template<>
-                    struct AtomicOp<
-                        block::sync::op::Count>
-                    {
-                        void operator()(int& result, bool value)
-                        {
-                            #pragma omp atomic
-                            result += static_cast<int>(value);
-                        }
-                    };
-                    //#############################################################################
-                    template<>
-                    struct AtomicOp<
-                        block::sync::op::LogicalAnd>
-                    {
-                        void operator()(int& result, bool value)
-                        {
-                            #pragma omp atomic
-                            result &= static_cast<int>(value);
-                        }
-                    };
-                    //#############################################################################
-                    template<>
-                    struct AtomicOp<
-                        block::sync::op::LogicalOr>
-                    {
-                        void operator()(int& result, bool value)
-                        {
-                            #pragma omp atomic
-                            result |= static_cast<int>(value);
-                        }
-                    };
+#    pragma omp atomic
+                    result |= static_cast<int>(value);
                 }
+            };
+        } // namespace detail
 
-                //#############################################################################
-                template<
-                    typename TOp>
-                struct SyncBlockThreadsPredicate<
-                    TOp,
-                    BlockSyncBarrierOmp>
+        //#############################################################################
+        template<typename TOp>
+        struct SyncBlockThreadsPredicate<TOp, BlockSyncBarrierOmp>
+        {
+            //! Folds the caller's predicate into blockSync.m_result via TOp; returns the result after a barrier.
+            ALPAKA_NO_HOST_ACC_WARNING
+            ALPAKA_FN_ACC static auto syncBlockThreadsPredicate(BlockSyncBarrierOmp const& blockSync, int predicate)
+                -> int
+            {
+// One thread advances the generation and initializes the result slot selected by it.
+// There is an implicit barrier at the end of omp single.
+// NOTE: This code is executed only once for all OpenMP threads.
+// If multiple blocks with multiple threads are executed in parallel
+// this reduction is executed only for one block!
+#    pragma omp single
                 {
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_NO_HOST_ACC_WARNING
-                    ALPAKA_FN_ACC static auto syncBlockThreadsPredicate(
-                        block::sync::BlockSyncBarrierOmp const & blockSync,
-                        int predicate)
-                    -> int
-                    {
-                        // The first thread initializes the value.
-                        // There is an implicit barrier at the end of omp single.
-                        // NOTE: This code is executed only once for all OpenMP threads.
-                        // If multiple blocks with multiple threads are executed in parallel
-                        // this reduction is executed only for one block!
-                        #pragma omp single
-                        {
-                            ++blockSync.m_generation;
-                            blockSync.m_result[blockSync.m_generation % 2u] = TOp::InitialValue;
-                        }
+                    ++blockSync.m_generation;
+                    blockSync.m_result[blockSync.m_generation % 2u] = TOp::InitialValue;
+                }
 
-                        auto const generationMod2(blockSync.m_generation % 2u);
-                        int& result(blockSync.m_result[generationMod2]);
-                        bool const predicateBool(predicate != 0);
+                auto const generationMod2(blockSync.m_generation % 2u);
+                int& result(blockSync.m_result[generationMod2]);
+                bool const predicateBool(predicate != 0);
 
-                        detail::AtomicOp<TOp>()(result, predicateBool);
+                detail::AtomicOp<TOp>()(result, predicateBool);
 
-                        // Wait for all threads to write their predicate into the vector.
-                        // NOTE: This waits for all threads in all blocks.
-                        // If multiple blocks are executed in parallel this is not optimal.
-                        #pragma omp barrier
+// Wait for all threads to apply their predicate to the result.
+// NOTE: This waits for all threads in all blocks.
+// If multiple blocks are executed in parallel this is not optimal.
+#    pragma omp barrier
 
-                        return blockSync.m_result[generationMod2];
-                    }
-                };
+                return blockSync.m_result[generationMod2];
             }
-        }
-    }
-}
+        };
+    } // namespace traits
+} // namespace alpaka
 
 #endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/block/sync/BlockSyncBarrierThread.hpp b/thirdParty/cupla/alpaka/include/alpaka/block/sync/BlockSyncBarrierThread.hpp
index 8f8755a1d9..50b677ebe0 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/block/sync/BlockSyncBarrierThread.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/block/sync/BlockSyncBarrierThread.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Benjamin Worpitz
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -11,91 +11,73 @@
 
 #ifdef ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED
 
-#include <alpaka/block/sync/Traits.hpp>
+#    include <alpaka/block/sync/Traits.hpp>
+#    include <alpaka/core/BarrierThread.hpp>
+#    include <alpaka/core/Common.hpp>
 
-#include <alpaka/core/BarrierThread.hpp>
-
-#include <alpaka/core/Common.hpp>
-
-#include <thread>
-#include <mutex>
-#include <map>
+#    include <map>
+#    include <mutex>
+#    include <thread>
 
 namespace alpaka
 {
-    namespace block
+    //#############################################################################
+    //! The thread id map barrier block synchronization.
+    template<typename TIdx>
+    class BlockSyncBarrierThread : public concepts::Implements<ConceptBlockSync, BlockSyncBarrierThread<TIdx>>
     {
-        namespace sync
-        {
-            //#############################################################################
-            //! The thread id map barrier block synchronization.
-            template<
-                typename TIdx>
-            class BlockSyncBarrierThread : public concepts::Implements<ConceptBlockSync, BlockSyncBarrierThread<TIdx>>
-            {
-            public:
-                using Barrier = core::threads::BarrierThread<TIdx>;
-                using BarrierWithPredicate = core::threads::BarrierThreadWithPredicate<TIdx>;
+    public:
+        using Barrier = core::threads::BarrierThread<TIdx>;
+        using BarrierWithPredicate = core::threads::BarrierThreadWithPredicate<TIdx>;
 
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST BlockSyncBarrierThread(
-                    TIdx const & blockThreadCount) :
-                        m_barrier(blockThreadCount),
-                        m_barrierWithPredicate(blockThreadCount)
-                {}
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST BlockSyncBarrierThread(BlockSyncBarrierThread const &) = delete;
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST BlockSyncBarrierThread(BlockSyncBarrierThread &&) = delete;
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST auto operator=(BlockSyncBarrierThread const &) -> BlockSyncBarrierThread & = delete;
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST auto operator=(BlockSyncBarrierThread &&) -> BlockSyncBarrierThread & = delete;
-                //-----------------------------------------------------------------------------
-                /*virtual*/ ~BlockSyncBarrierThread() = default;
+        //! Constructor: initializes both barriers with blockThreadCount.
+        ALPAKA_FN_HOST BlockSyncBarrierThread(TIdx const& blockThreadCount)
+            : m_barrier(blockThreadCount)
+            , m_barrierWithPredicate(blockThreadCount)
+        {
+        }
+        //! Copy constructor (deleted).
+        ALPAKA_FN_HOST BlockSyncBarrierThread(BlockSyncBarrierThread const&) = delete;
+        //! Move constructor (deleted).
+        ALPAKA_FN_HOST BlockSyncBarrierThread(BlockSyncBarrierThread&&) = delete;
+        //! Copy assignment operator (deleted).
+        ALPAKA_FN_HOST auto operator=(BlockSyncBarrierThread const&) -> BlockSyncBarrierThread& = delete;
+        //! Move assignment operator (deleted).
+        ALPAKA_FN_HOST auto operator=(BlockSyncBarrierThread&&) -> BlockSyncBarrierThread& = delete;
+        //! Destructor (defaulted).
+        /*virtual*/ ~BlockSyncBarrierThread() = default;
 
-                Barrier mutable m_barrier;
-                BarrierWithPredicate mutable m_barrierWithPredicate;
-            };
+        Barrier mutable m_barrier;
+        BarrierWithPredicate mutable m_barrierWithPredicate;
+    };
 
-            namespace traits
+    namespace traits
+    {
+        //#############################################################################
+        template<typename TIdx>
+        struct SyncBlockThreads<BlockSyncBarrierThread<TIdx>>
+        {
+            //! Synchronizes by calling wait() on blockSync.m_barrier.
+            ALPAKA_FN_HOST static auto syncBlockThreads(BlockSyncBarrierThread<TIdx> const& blockSync) -> void
             {
-                //#############################################################################
-                template<
-                    typename TIdx>
-                struct SyncBlockThreads<
-                    BlockSyncBarrierThread<TIdx>>
-                {
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST static auto syncBlockThreads(
-                        block::sync::BlockSyncBarrierThread<TIdx> const & blockSync)
-                    -> void
-                    {
-                        blockSync.m_barrier.wait();
-                    }
-                };
+                blockSync.m_barrier.wait();
+            }
+        };
 
-                //#############################################################################
-                template<
-                    typename TOp,
-                    typename TIdx>
-                struct SyncBlockThreadsPredicate<
-                    TOp,
-                    BlockSyncBarrierThread<TIdx>>
-                {
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_NO_HOST_ACC_WARNING
-                    ALPAKA_FN_ACC static auto syncBlockThreadsPredicate(
-                        block::sync::BlockSyncBarrierThread<TIdx> const & blockSync,
-                        int predicate)
-                    -> int
-                    {
-                        return blockSync.m_barrierWithPredicate.template wait<TOp>(predicate);
-                    }
-                };
+        //#############################################################################
+        template<typename TOp, typename TIdx>
+        struct SyncBlockThreadsPredicate<TOp, BlockSyncBarrierThread<TIdx>>
+        {
+            //! Returns the value of blockSync.m_barrierWithPredicate.wait<TOp>(predicate).
+            ALPAKA_NO_HOST_ACC_WARNING
+            ALPAKA_FN_ACC static auto syncBlockThreadsPredicate(
+                BlockSyncBarrierThread<TIdx> const& blockSync,
+                int predicate) -> int
+            {
+                return blockSync.m_barrierWithPredicate.template wait<TOp>(predicate);
             }
-        }
-    }
-}
+        };
+    } // namespace traits
+} // namespace alpaka
 
 #endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/block/sync/BlockSyncCudaBuiltIn.hpp b/thirdParty/cupla/alpaka/include/alpaka/block/sync/BlockSyncCudaBuiltIn.hpp
deleted file mode 100644
index cf2532e37f..0000000000
--- a/thirdParty/cupla/alpaka/include/alpaka/block/sync/BlockSyncCudaBuiltIn.hpp
+++ /dev/null
@@ -1,115 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_CUDA
-    #error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
-#endif
-
-#include <alpaka/block/sync/Traits.hpp>
-
-namespace alpaka
-{
-    namespace block
-    {
-        namespace sync
-        {
-            //#############################################################################
-            //! The GPU CUDA block synchronization.
-            class BlockSyncCudaBuiltIn : public concepts::Implements<ConceptBlockSync, BlockSyncCudaBuiltIn>
-            {
-            public:
-                //-----------------------------------------------------------------------------
-                BlockSyncCudaBuiltIn() = default;
-                //-----------------------------------------------------------------------------
-                __device__ BlockSyncCudaBuiltIn(BlockSyncCudaBuiltIn const &) = delete;
-                //-----------------------------------------------------------------------------
-                __device__ BlockSyncCudaBuiltIn(BlockSyncCudaBuiltIn &&) = delete;
-                //-----------------------------------------------------------------------------
-                __device__ auto operator=(BlockSyncCudaBuiltIn const &) -> BlockSyncCudaBuiltIn & = delete;
-                //-----------------------------------------------------------------------------
-                __device__ auto operator=(BlockSyncCudaBuiltIn &&) -> BlockSyncCudaBuiltIn & = delete;
-                //-----------------------------------------------------------------------------
-                /*virtual*/ ~BlockSyncCudaBuiltIn() = default;
-            };
-
-            namespace traits
-            {
-                //#############################################################################
-                template<>
-                struct SyncBlockThreads<
-                    BlockSyncCudaBuiltIn>
-                {
-                    //-----------------------------------------------------------------------------
-                    __device__ static auto syncBlockThreads(
-                        block::sync::BlockSyncCudaBuiltIn const & /*blockSync*/)
-                    -> void
-                    {
-                        __syncthreads();
-                    }
-                };
-
-                //#############################################################################
-                template<>
-                struct SyncBlockThreadsPredicate<
-                    block::sync::op::Count,
-                    BlockSyncCudaBuiltIn>
-                {
-                    //-----------------------------------------------------------------------------
-                    __device__ static auto syncBlockThreadsPredicate(
-                        block::sync::BlockSyncCudaBuiltIn const & /*blockSync*/,
-                        int predicate)
-                    -> int
-                    {
-                        return __syncthreads_count(predicate);
-                    }
-                };
-
-                //#############################################################################
-                template<>
-                struct SyncBlockThreadsPredicate<
-                    block::sync::op::LogicalAnd,
-                    BlockSyncCudaBuiltIn>
-                {
-                    //-----------------------------------------------------------------------------
-                    __device__ static auto syncBlockThreadsPredicate(
-                        block::sync::BlockSyncCudaBuiltIn const & /*blockSync*/,
-                        int predicate)
-                    -> int
-                    {
-                        return __syncthreads_and(predicate);
-                    }
-                };
-
-                //#############################################################################
-                template<>
-                struct SyncBlockThreadsPredicate<
-                    block::sync::op::LogicalOr,
-                    BlockSyncCudaBuiltIn>
-                {
-                    //-----------------------------------------------------------------------------
-                    __device__ static auto syncBlockThreadsPredicate(
-                        block::sync::BlockSyncCudaBuiltIn const & /*blockSync*/,
-                        int predicate)
-                    -> int
-                    {
-                        return __syncthreads_or(predicate);
-                    }
-                };
-            }
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/block/sync/BlockSyncHipBuiltIn.hpp b/thirdParty/cupla/alpaka/include/alpaka/block/sync/BlockSyncHipBuiltIn.hpp
deleted file mode 100644
index 6d1736d38a..0000000000
--- a/thirdParty/cupla/alpaka/include/alpaka/block/sync/BlockSyncHipBuiltIn.hpp
+++ /dev/null
@@ -1,171 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz, Matthias Werner
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_HIP
-    #error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
-#endif
-
-#include <alpaka/block/sync/Traits.hpp>
-
-namespace alpaka
-{
-    namespace block
-    {
-        namespace sync
-        {
-            //#############################################################################
-            //! The GPU HIP block synchronization.
-            class BlockSyncHipBuiltIn : public concepts::Implements<ConceptBlockSync, BlockSyncHipBuiltIn>
-            {
-            public:
-                //-----------------------------------------------------------------------------
-                //! Default constructor.
-                ALPAKA_FN_HOST_ACC BlockSyncHipBuiltIn() = default;
-                //-----------------------------------------------------------------------------
-                //! Copy constructor.
-                __device__ BlockSyncHipBuiltIn(BlockSyncHipBuiltIn const &) = delete;
-                //-----------------------------------------------------------------------------
-                //! Move constructor.
-                __device__ BlockSyncHipBuiltIn(BlockSyncHipBuiltIn &&) = delete;
-                //-----------------------------------------------------------------------------
-                //! Copy assignment operator.
-                __device__ auto operator=(BlockSyncHipBuiltIn const &) -> BlockSyncHipBuiltIn & = delete;
-                //-----------------------------------------------------------------------------
-                //! Move assignment operator.
-                __device__ auto operator=(BlockSyncHipBuiltIn &&) -> BlockSyncHipBuiltIn & = delete;
-                //-----------------------------------------------------------------------------
-                //! Destructor.
-                /*virtual*/ ALPAKA_FN_HOST_ACC ~BlockSyncHipBuiltIn() = default;
-            };
-
-            namespace traits
-            {
-                //#############################################################################
-                //!
-                template<>
-                struct SyncBlockThreads<
-                    BlockSyncHipBuiltIn>
-                {
-                    //-----------------------------------------------------------------------------
-
-                    __device__ static auto syncBlockThreads(
-                        block::sync::BlockSyncHipBuiltIn const & /*blockSync*/)
-                    -> void
-                    {
-                        __syncthreads();
-                    }
-                };
-
-                //#############################################################################
-                //!
-                template<>
-                struct SyncBlockThreadsPredicate<
-                    block::sync::op::Count,
-                    BlockSyncHipBuiltIn>
-                {
-                    //-----------------------------------------------------------------------------
-
-                    __device__ static auto syncBlockThreadsPredicate(
-                        block::sync::BlockSyncHipBuiltIn const & /*blockSync*/,
-                        int predicate)
-                    -> int
-                    {
-#if defined(__HIP_ARCH_HAS_SYNC_THREAD_EXT__) && __HIP_ARCH_HAS_SYNC_THREAD_EXT__==0 && (BOOST_COMP_HCC || BOOST_COMP_HIP)
-                        // workaround for unsupported syncthreads_* operation on HIP(HCC)
-                        __shared__ int tmp;
-                        __syncthreads();
-                        if(threadIdx.x==0)
-                            tmp=0;
-                        __syncthreads();
-                        if(predicate)
-                            atomicAdd(&tmp, 1);
-                        __syncthreads();
-
-                        return tmp;
-#else
-                        return __syncthreads_count(predicate);
-#endif
-                    }
-                };
-
-                //#############################################################################
-                //!
-                template<>
-                struct SyncBlockThreadsPredicate<
-                    block::sync::op::LogicalAnd,
-                    BlockSyncHipBuiltIn>
-                {
-                    //-----------------------------------------------------------------------------
-
-                    __device__ static auto syncBlockThreadsPredicate(
-                        block::sync::BlockSyncHipBuiltIn const & /*blockSync*/,
-                        int predicate)
-                    -> int
-                    {
-#if defined(__HIP_ARCH_HAS_SYNC_THREAD_EXT__) && __HIP_ARCH_HAS_SYNC_THREAD_EXT__==0 && (BOOST_COMP_HCC || BOOST_COMP_HIP)
-                        // workaround for unsupported syncthreads_* operation on HIP(HCC)
-                        __shared__ int tmp;
-                        __syncthreads();
-                        if(threadIdx.x==0)
-                            tmp=1;
-                        __syncthreads();
-                        if(!predicate)
-                            atomicAnd(&tmp, 0);
-                        __syncthreads();
-
-                        return tmp;
-#else
-                        return __syncthreads_and(predicate);
-#endif
-                    }
-                };
-
-                //#############################################################################
-                //!
-                template<>
-                struct SyncBlockThreadsPredicate<
-                    block::sync::op::LogicalOr,
-                    BlockSyncHipBuiltIn>
-                {
-                    //-----------------------------------------------------------------------------
-
-                    __device__ static auto syncBlockThreadsPredicate(
-                        block::sync::BlockSyncHipBuiltIn const & /*blockSync*/,
-                        int predicate)
-                    -> int
-                    {
-#if defined(__HIP_ARCH_HAS_SYNC_THREAD_EXT__) && __HIP_ARCH_HAS_SYNC_THREAD_EXT__==0 && (BOOST_COMP_HCC || BOOST_COMP_HIP)
-                        // workaround for unsupported syncthreads_* operation on HIP(HCC)
-                        __shared__ int tmp;
-                        __syncthreads();
-                        if(threadIdx.x==0)
-                            tmp=0;
-                        __syncthreads();
-                        if(predicate)
-                            atomicOr(&tmp, 1);
-                        __syncthreads();
-
-                        return tmp;
-#else
-                        return __syncthreads_or(predicate);
-#endif
-                    }
-                };
-            }
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/block/sync/BlockSyncNoOp.hpp b/thirdParty/cupla/alpaka/include/alpaka/block/sync/BlockSyncNoOp.hpp
index d93c9acf90..9d42818c56 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/block/sync/BlockSyncNoOp.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/block/sync/BlockSyncNoOp.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Axel Huebl, Benjamin Worpitz, Matthias Werner, René Widera
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -10,72 +10,56 @@
 #pragma once
 
 #include <alpaka/block/sync/Traits.hpp>
-
 #include <alpaka/core/Common.hpp>
 #include <alpaka/core/Unused.hpp>
 
 namespace alpaka
 {
-    namespace block
+    //#############################################################################
+    //! The no op block synchronization.
+    class BlockSyncNoOp : public concepts::Implements<ConceptBlockSync, BlockSyncNoOp>
+    {
+    public:
+        //! Default constructor.
+        ALPAKA_FN_ACC BlockSyncNoOp() = default;
+        //! Copy constructor (deleted).
+        ALPAKA_FN_ACC BlockSyncNoOp(BlockSyncNoOp const&) = delete;
+        //! Move constructor (deleted).
+        ALPAKA_FN_ACC BlockSyncNoOp(BlockSyncNoOp&&) = delete;
+        //! Copy assignment operator (deleted).
+        ALPAKA_FN_ACC auto operator=(BlockSyncNoOp const&) -> BlockSyncNoOp& = delete;
+        //! Move assignment operator (deleted).
+        ALPAKA_FN_ACC auto operator=(BlockSyncNoOp&&) -> BlockSyncNoOp& = delete;
+        //! Destructor (defaulted).
+        /*virtual*/ ALPAKA_FN_ACC ~BlockSyncNoOp() = default;
+    };
+
+    namespace traits
     {
-        namespace sync
+        //#############################################################################
+        template<>
+        struct SyncBlockThreads<BlockSyncNoOp>
         {
-            //#############################################################################
-            //! The no op block synchronization.
-            class BlockSyncNoOp : public concepts::Implements<ConceptBlockSync, BlockSyncNoOp>
+            //! Intentionally does nothing; blockSync is only marked as unused.
+            ALPAKA_NO_HOST_ACC_WARNING
+            ALPAKA_FN_ACC static auto syncBlockThreads(BlockSyncNoOp const& blockSync) -> void
             {
-            public:
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_ACC BlockSyncNoOp() = default;
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_ACC BlockSyncNoOp(BlockSyncNoOp const &) = delete;
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_ACC BlockSyncNoOp(BlockSyncNoOp &&) = delete;
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_ACC auto operator=(BlockSyncNoOp const &) -> BlockSyncNoOp & = delete;
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_ACC auto operator=(BlockSyncNoOp &&) -> BlockSyncNoOp & = delete;
-                //-----------------------------------------------------------------------------
-                /*virtual*/ ALPAKA_FN_ACC ~BlockSyncNoOp() = default;
-            };
+                alpaka::ignore_unused(blockSync);
+                // Nothing to do.
+            }
+        };
 
-            namespace traits
+        //#############################################################################
+        template<typename TOp>
+        struct SyncBlockThreadsPredicate<TOp, BlockSyncNoOp>
+        {
+            //! Returns predicate unchanged; blockSync is only marked as unused.
+            ALPAKA_NO_HOST_ACC_WARNING
+            ALPAKA_FN_ACC static auto syncBlockThreadsPredicate(BlockSyncNoOp const& blockSync, int predicate) -> int
             {
-                //#############################################################################
-                template<>
-                struct SyncBlockThreads<
-                    BlockSyncNoOp>
-                {
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_NO_HOST_ACC_WARNING
-                    ALPAKA_FN_ACC static auto syncBlockThreads(
-                        block::sync::BlockSyncNoOp const & blockSync)
-                    -> void
-                    {
-                        alpaka::ignore_unused(blockSync);
-                        // Nothing to do.
-                    }
-                };
-
-                //#############################################################################
-                template<
-                    typename TOp>
-                struct SyncBlockThreadsPredicate<
-                    TOp,
-                    BlockSyncNoOp>
-                {
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_NO_HOST_ACC_WARNING
-                    ALPAKA_FN_ACC static auto syncBlockThreadsPredicate(
-                        block::sync::BlockSyncNoOp const & blockSync,
-                        int predicate)
-                    -> int
-                    {
-                        alpaka::ignore_unused(blockSync);
-                        return predicate;
-                    }
-                };
+                alpaka::ignore_unused(blockSync);
+                return predicate;
             }
-        }
-    }
-}
+        };
+    } // namespace traits
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/include/alpaka/block/sync/BlockSyncUniformCudaHipBuiltIn.hpp b/thirdParty/cupla/alpaka/include/alpaka/block/sync/BlockSyncUniformCudaHipBuiltIn.hpp
new file mode 100644
index 0000000000..5782fd6856
--- /dev/null
+++ b/thirdParty/cupla/alpaka/include/alpaka/block/sync/BlockSyncUniformCudaHipBuiltIn.hpp
@@ -0,0 +1,144 @@
+/* Copyright 2019 Benjamin Worpitz, Matthias Werner
+ *
+ * This file is part of alpaka.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#pragma once
+
+#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
+
+#    include <alpaka/core/BoostPredef.hpp>
+
+#    if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && !BOOST_LANG_CUDA
+#        error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
+#    endif
+
+#    if defined(ALPAKA_ACC_GPU_HIP_ENABLED) && !BOOST_LANG_HIP
+#        error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
+#    endif
+
+#    include <alpaka/block/sync/Traits.hpp>
+
+namespace alpaka
+{
+    //#############################################################################
+    //! The GPU CUDA/HIP block synchronization, implemented with the __syncthreads* built-ins.
+    class BlockSyncUniformCudaHipBuiltIn
+        : public concepts::Implements<ConceptBlockSync, BlockSyncUniformCudaHipBuiltIn>
+    {
+    public:
+        //-----------------------------------------------------------------------------
+        BlockSyncUniformCudaHipBuiltIn() = default;
+        //-----------------------------------------------------------------------------
+        __device__ BlockSyncUniformCudaHipBuiltIn(BlockSyncUniformCudaHipBuiltIn const&) = delete;
+        //-----------------------------------------------------------------------------
+        __device__ BlockSyncUniformCudaHipBuiltIn(BlockSyncUniformCudaHipBuiltIn&&) = delete;
+        //-----------------------------------------------------------------------------
+        __device__ auto operator=(BlockSyncUniformCudaHipBuiltIn const&) -> BlockSyncUniformCudaHipBuiltIn& = delete;
+        //-----------------------------------------------------------------------------
+        __device__ auto operator=(BlockSyncUniformCudaHipBuiltIn&&) -> BlockSyncUniformCudaHipBuiltIn& = delete;
+        //-----------------------------------------------------------------------------
+        /*virtual*/ ~BlockSyncUniformCudaHipBuiltIn() = default;
+    };
+
+    namespace traits
+    {
+        //#############################################################################
+        template<>
+        struct SyncBlockThreads<BlockSyncUniformCudaHipBuiltIn>
+        {
+            //-----------------------------------------------------------------------------
+            __device__ static auto syncBlockThreads(BlockSyncUniformCudaHipBuiltIn const& /*blockSync*/) -> void
+            {
+                __syncthreads(); // the sync object itself is unused; the built-in barrier does all the work
+            }
+        };
+
+        //#############################################################################
+        template<>
+        struct SyncBlockThreadsPredicate<BlockCount, BlockSyncUniformCudaHipBuiltIn>
+        {
+            //-----------------------------------------------------------------------------
+            __device__ static auto syncBlockThreadsPredicate(
+                BlockSyncUniformCudaHipBuiltIn const& /*blockSync*/,
+                int predicate) -> int
+            {
+#    if defined(__HIP_ARCH_HAS_SYNC_THREAD_EXT__) && __HIP_ARCH_HAS_SYNC_THREAD_EXT__ == 0 && BOOST_COMP_HIP
+                // No HIP sync-thread extension: emulate __syncthreads_count() with a shared counter.
+                __shared__ int tmp;
+                __syncthreads(); // barrier before the counter is (re)initialized
+                if(threadIdx.x == 0)
+                    tmp = 0;
+                __syncthreads(); // barrier between the initialization and the atomic increments
+                if(predicate) // only threads with a non-zero predicate contribute to the count
+                    ::atomicAdd(&tmp, 1);
+                __syncthreads(); // barrier between the atomic increments and the read of the result
+
+                return tmp;
+#    else
+                return __syncthreads_count(predicate);
+#    endif
+            }
+        };
+
+        //#############################################################################
+        template<>
+        struct SyncBlockThreadsPredicate<BlockAnd, BlockSyncUniformCudaHipBuiltIn>
+        {
+            //-----------------------------------------------------------------------------
+            __device__ static auto syncBlockThreadsPredicate(
+                BlockSyncUniformCudaHipBuiltIn const& /*blockSync*/,
+                int predicate) -> int
+            {
+#    if defined(__HIP_ARCH_HAS_SYNC_THREAD_EXT__) && __HIP_ARCH_HAS_SYNC_THREAD_EXT__ == 0 && BOOST_COMP_HIP
+                // No HIP sync-thread extension: emulate __syncthreads_and() with a shared flag.
+                __shared__ int tmp;
+                __syncthreads(); // barrier before the flag is (re)initialized
+                if(threadIdx.x == 0)
+                    tmp = 1;
+                __syncthreads(); // barrier between the initialization and the atomic updates
+                if(!predicate) // a single zero predicate clears the flag for the whole block
+                    ::atomicAnd(&tmp, 0);
+                __syncthreads(); // barrier between the atomic updates and the read of the result
+
+                return tmp;
+#    else
+                return __syncthreads_and(predicate);
+#    endif
+            }
+        };
+
+        //#############################################################################
+        template<>
+        struct SyncBlockThreadsPredicate<BlockOr, BlockSyncUniformCudaHipBuiltIn>
+        {
+            //-----------------------------------------------------------------------------
+            __device__ static auto syncBlockThreadsPredicate(
+                BlockSyncUniformCudaHipBuiltIn const& /*blockSync*/,
+                int predicate) -> int
+            {
+#    if defined(__HIP_ARCH_HAS_SYNC_THREAD_EXT__) && __HIP_ARCH_HAS_SYNC_THREAD_EXT__ == 0 && BOOST_COMP_HIP
+                // No HIP sync-thread extension: emulate __syncthreads_or() with a shared flag.
+                __shared__ int tmp;
+                __syncthreads(); // barrier before the flag is (re)initialized
+                if(threadIdx.x == 0)
+                    tmp = 0;
+                __syncthreads(); // barrier between the initialization and the atomic updates
+                if(predicate) // a single non-zero predicate sets the flag for the whole block
+                    ::atomicOr(&tmp, 1);
+                __syncthreads(); // barrier between the atomic updates and the read of the result
+
+                return tmp;
+#    else
+                return __syncthreads_or(predicate);
+#    endif
+            }
+        };
+    } // namespace traits
+} // namespace alpaka
+
+#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/block/sync/Traits.hpp b/thirdParty/cupla/alpaka/include/alpaka/block/sync/Traits.hpp
index 57e769a24a..06e3924d6b 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/block/sync/Traits.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/block/sync/Traits.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Benjamin Worpitz
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -16,139 +16,103 @@
 
 namespace alpaka
 {
-    //-----------------------------------------------------------------------------
-    //! The grid block specifics
-    namespace block
+    struct ConceptBlockSync
     {
-        //-----------------------------------------------------------------------------
-        //! The block synchronization specifics.
-        namespace sync
-        {
-            struct ConceptBlockSync;
-
-            //-----------------------------------------------------------------------------
-            //! The block synchronization traits.
-            namespace traits
-            {
-                //#############################################################################
-                //! The block synchronization operation trait.
-                template<
-                    typename TBlockSync,
-                    typename TSfinae = void>
-                struct SyncBlockThreads;
+    };
 
-                //#############################################################################
-                //! The block synchronization and predicate operation trait.
-                template<
-                    typename TOp,
-                    typename TBlockSync,
-                    typename TSfinae = void>
-                struct SyncBlockThreadsPredicate;
-            }
+    //-----------------------------------------------------------------------------
+    //! The block synchronization traits.
+    namespace traits
+    {
+        //#############################################################################
+        //! The block synchronization operation trait.
+        template<typename TBlockSync, typename TSfinae = void>
+        struct SyncBlockThreads;
 
-            //-----------------------------------------------------------------------------
-            //! Synchronizes all threads within the current block (independently for all blocks).
-            //!
-            //! \tparam TBlockSync The block synchronization implementation type.
-            //! \param blockSync The block synchronization implementation.
-            ALPAKA_NO_HOST_ACC_WARNING
-            template<
-                typename TBlockSync>
-            ALPAKA_FN_ACC auto syncBlockThreads(
-                TBlockSync const & blockSync)
-            -> void
-            {
-                using ImplementationBase = concepts::ImplementationBase<ConceptBlockSync, TBlockSync>;
-                traits::SyncBlockThreads<
-                    ImplementationBase>
-                ::syncBlockThreads(
-                    blockSync);
-            }
+        //#############################################################################
+        //! The block synchronization and predicate operation trait.
+        template<typename TOp, typename TBlockSync, typename TSfinae = void>
+        struct SyncBlockThreadsPredicate;
+    } // namespace traits
 
-            //-----------------------------------------------------------------------------
-            //! Defines operation functors.
-            namespace op
-            {
-                //#############################################################################
-                //! The addition function object.
-                struct Count
-                {
-                    enum { InitialValue = 0u};
+    //-----------------------------------------------------------------------------
+    //! Synchronizes all threads within the current block (independently for all blocks).
+    //!
+    //! \tparam TBlockSync The block synchronization implementation type.
+    //! \param blockSync The block synchronization implementation.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename TBlockSync>
+    ALPAKA_FN_ACC auto syncBlockThreads(TBlockSync const& blockSync) -> void
+    {
+        using ImplementationBase = concepts::ImplementationBase<ConceptBlockSync, TBlockSync>;
+        traits::SyncBlockThreads<ImplementationBase>::syncBlockThreads(blockSync);
+    }
 
-                    ALPAKA_NO_HOST_ACC_WARNING
-                    template<
-                        typename T>
-                    ALPAKA_FN_HOST_ACC auto operator()(
-                        T const & currentResult,
-                        T const & value) const
-                    -> T
-                    {
-                        return currentResult + static_cast<T>(value != static_cast<T>(0));
-                    }
-                };
-                //#############################################################################
-                //! The logical and function object.
-                struct LogicalAnd
-                {
-                    enum { InitialValue = 1u};
+    //#############################################################################
+    //! The counting function object.
+    struct BlockCount
+    {
+        enum
+        {
+            InitialValue = 0u
+        };
 
-                    ALPAKA_NO_HOST_ACC_WARNING
-                    template<
-                        typename T>
-                    ALPAKA_FN_HOST_ACC auto operator()(
-                        T const & currentResult,
-                        T const & value) const
-                    -> T
-                    {
-                        return static_cast<T>(currentResult && (value != static_cast<T>(0)));
-                    }
-                };
-                //#############################################################################
-                //! The logical or function object.
-                struct LogicalOr
-                {
-                    enum { InitialValue = 0u};
+        ALPAKA_NO_HOST_ACC_WARNING
+        template<typename T>
+        ALPAKA_FN_HOST_ACC auto operator()(T const& currentResult, T const& value) const -> T
+        {
+            return currentResult + static_cast<T>(value != static_cast<T>(0));
+        }
+    };
+    //#############################################################################
+    //! The logical and function object.
+    struct BlockAnd
+    {
+        enum
+        {
+            InitialValue = 1u
+        };
 
-                    ALPAKA_NO_HOST_ACC_WARNING
-                    template<
-                        typename T>
-                    ALPAKA_FN_HOST_ACC auto operator()(
-                        T const & currentResult,
-                        T const & value) const
-                    -> T
-                    {
-                        return static_cast<T>(currentResult || (value != static_cast<T>(0)));
-                    }
-                };
-            }
+        ALPAKA_NO_HOST_ACC_WARNING
+        template<typename T>
+        ALPAKA_FN_HOST_ACC auto operator()(T const& currentResult, T const& value) const -> T
+        {
+            return static_cast<T>(currentResult && (value != static_cast<T>(0)));
+        }
+    };
+    //#############################################################################
+    //! The logical or function object.
+    struct BlockOr
+    {
+        enum
+        {
+            InitialValue = 0u
+        };
 
-            //-----------------------------------------------------------------------------
-            //! Synchronizes all threads within the current block (independently for all blocks),
-            //! evaluates the predicate for all threads and returns the combination of all the results
-            //! computed via TOp.
-            //!
-            //! \tparam TOp The operation used to combine the predicate values of all threads.
-            //! \tparam TBlockSync The block synchronization implementation type.
-            //! \param blockSync The block synchronization implementation.
-            //! \param predicate The predicate value of the current thread.
-            ALPAKA_NO_HOST_ACC_WARNING
-            template<
-                typename TOp,
-                typename TBlockSync>
-            ALPAKA_FN_ACC auto syncBlockThreadsPredicate(
-                TBlockSync const & blockSync,
-                int predicate)
-            -> int
-            {
-                using ImplementationBase = concepts::ImplementationBase<ConceptBlockSync, TBlockSync>;
-                return
-                    traits::SyncBlockThreadsPredicate<
-                        TOp,
-                        ImplementationBase>
-                    ::syncBlockThreadsPredicate(
-                        blockSync,
-                        predicate);
-            }
+        ALPAKA_NO_HOST_ACC_WARNING
+        template<typename T>
+        ALPAKA_FN_HOST_ACC auto operator()(T const& currentResult, T const& value) const -> T
+        {
+            return static_cast<T>(currentResult || (value != static_cast<T>(0)));
         }
+    };
+
+    //-----------------------------------------------------------------------------
+    //! Synchronizes all threads within the current block (independently for all blocks),
+    //! evaluates the predicate for all threads and returns the combination of all the results
+    //! computed via TOp.
+    //!
+    //! \tparam TOp The operation used to combine the predicate values of all threads.
+    //! \tparam TBlockSync The block synchronization implementation type.
+    //! \param blockSync The block synchronization implementation.
+    //! \param predicate The predicate value of the current thread.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename TOp, typename TBlockSync>
+    ALPAKA_FN_ACC auto syncBlockThreadsPredicate(TBlockSync const& blockSync, int predicate) -> int
+    {
+        using ImplementationBase = concepts::ImplementationBase<ConceptBlockSync, TBlockSync>;
+        return traits::SyncBlockThreadsPredicate<TOp, ImplementationBase>::syncBlockThreadsPredicate(
+            blockSync,
+            predicate);
     }
-}
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/include/alpaka/core/Align.hpp b/thirdParty/cupla/alpaka/include/alpaka/core/Align.hpp
index c0b05a31c6..ada55eadb3 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/core/Align.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/core/Align.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Benjamin Worpitz, René Widera
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -21,8 +21,7 @@ namespace alpaka
         //-----------------------------------------------------------------------------
         //! Rounds to the next higher power of two (if not already power of two).
         // Adapted from llvm/ADT/SmallPtrSet.h
-        template<
-            std::size_t N>
+        template<std::size_t N>
         struct RoundUpToPowerOfTwo;
 
         //-----------------------------------------------------------------------------
@@ -31,38 +30,26 @@ namespace alpaka
         {
             //-----------------------------------------------------------------------------
             //! Base case for N being a power of two.
-            template<
-                std::size_t N,
-                bool TisPowerTwo>
-            struct RoundUpToPowerOfTwoHelper :
-                std::integral_constant<
-                    std::size_t,
-                    N>
-            {};
+            template<std::size_t N, bool TisPowerTwo>
+            struct RoundUpToPowerOfTwoHelper : std::integral_constant<std::size_t, N>
+            {
+            };
             //-----------------------------------------------------------------------------
             //! Case for N not being a power of two.
             // We could just use NextVal = N+1, but this converges faster.  N|(N-1) sets
             // the right-most zero bits to one all at once, e.g. 0b0011000 -> 0b0011111.
-            template<
-                std::size_t N>
-            struct RoundUpToPowerOfTwoHelper<
-                N,
-                false> :
-                    std::integral_constant<
-                        std::size_t,
-                        RoundUpToPowerOfTwo<(N | (N - 1)) + 1>::value>
-            {};
-        }
+            template<std::size_t N>
+            struct RoundUpToPowerOfTwoHelper<N, false>
+                : std::integral_constant<std::size_t, RoundUpToPowerOfTwo<(N | (N - 1)) + 1>::value>
+            {
+            };
+        } // namespace detail
         //-----------------------------------------------------------------------------
-        template<
-            std::size_t N>
-        struct RoundUpToPowerOfTwo :
-            std::integral_constant<
-                std::size_t,
-                detail::RoundUpToPowerOfTwoHelper<
-                    N,
-                    (N&(N - 1)) == 0>::value>
-        {};
+        template<std::size_t N>
+        struct RoundUpToPowerOfTwo
+            : std::integral_constant<std::size_t, detail::RoundUpToPowerOfTwoHelper<N, (N & (N - 1)) == 0>::value>
+        {
+        };
 
         //-----------------------------------------------------------------------------
         //! The alignment specifics.
@@ -70,38 +57,39 @@ namespace alpaka
         {
             //-----------------------------------------------------------------------------
             //! Calculates the optimal alignment for data of the given size.
-            template<
-                std::size_t TsizeBytes>
-            struct OptimalAlignment :
-                std::integral_constant<
-                    std::size_t,
+            template<std::size_t TsizeBytes>
+            struct OptimalAlignment
+                : std::integral_constant<
+                      std::size_t,
 #if BOOST_COMP_GNUC
-                    // GCC does not support alignments larger then 128: "warning: requested alignment 256 is larger than 128[-Wattributes]".
-                    (TsizeBytes > 64)
-                        ? 128
-                        :
+                      // GCC does not support alignments larger than 128: "warning: requested alignment 256 is larger
+                      // than 128[-Wattributes]".
+                      (TsizeBytes > 64) ? 128 :
 #endif
-                            (RoundUpToPowerOfTwo<TsizeBytes>::value)>
-            {};
-        }
-    }
-}
+                                        (RoundUpToPowerOfTwo<TsizeBytes>::value)>
+            {
+            };
+        } // namespace align
+    } // namespace core
+} // namespace alpaka
 
 // ICC does not support constant expressions as parameters to alignas
 // The optimal alignment for a type is the next higher or equal power of two.
 #if BOOST_COMP_INTEL
-    #define ALPAKA_OPTIMAL_ALIGNMENT_SIZE(...)\
-            ((__VA_ARGS__)==1?1:\
-            ((__VA_ARGS__)<=2?2:\
-            ((__VA_ARGS__)<=4?4:\
-            ((__VA_ARGS__)<=8?8:\
-            ((__VA_ARGS__)<=16?16:\
-            ((__VA_ARGS__)<=32?32:\
-            ((__VA_ARGS__)<=64?64:128\
-            )))))))
-    #define ALPAKA_OPTIMAL_ALIGNMENT(...)\
-            ALPAKA_OPTIMAL_ALIGNMENT_SIZE(sizeof(typename std::remove_cv<__VA_ARGS__>::type))
+#    define ALPAKA_OPTIMAL_ALIGNMENT_SIZE(...)                                                                        \
+        ((__VA_ARGS__) == 1                                                                                           \
+             ? 1                                                                                                      \
+             : ((__VA_ARGS__) <= 2                                                                                    \
+                    ? 2                                                                                               \
+                    : ((__VA_ARGS__) <= 4                                                                             \
+                           ? 4                                                                                        \
+                           : ((__VA_ARGS__) <= 8                                                                      \
+                                  ? 8                                                                                 \
+                                  : ((__VA_ARGS__) <= 16                                                              \
+                                         ? 16                                                                         \
+                                         : ((__VA_ARGS__) <= 32 ? 32 : ((__VA_ARGS__) <= 64 ? 64 : 128)))))))
+#    define ALPAKA_OPTIMAL_ALIGNMENT(...) ALPAKA_OPTIMAL_ALIGNMENT_SIZE(sizeof(std::remove_cv_t<__VA_ARGS__>))
 #else
-    #define ALPAKA_OPTIMAL_ALIGNMENT(...)\
-            ::alpaka::core::align::OptimalAlignment<sizeof(typename std::remove_cv<__VA_ARGS__>::type)>::value
+#    define ALPAKA_OPTIMAL_ALIGNMENT(...)                                                                             \
+        ::alpaka::core::align::OptimalAlignment<sizeof(std::remove_cv_t<__VA_ARGS__>)>::value
 #endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/core/AlignedAlloc.hpp b/thirdParty/cupla/alpaka/include/alpaka/core/AlignedAlloc.hpp
new file mode 100644
index 0000000000..36423ab237
--- /dev/null
+++ b/thirdParty/cupla/alpaka/include/alpaka/core/AlignedAlloc.hpp
@@ -0,0 +1,70 @@
+/* Copyright 2020 René Widera
+ *
+ * This file is part of alpaka.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#pragma once
+
+#include <alpaka/core/BoostPredef.hpp>
+#include <alpaka/core/Common.hpp>
+
+#if BOOST_COMP_MSVC
+#    include <malloc.h>
+#else
+#    include <cstdlib>
+#endif
+
+namespace alpaka
+{
+    namespace core
+    {
+        //-----------------------------------------------------------------------------
+        //! Allocates memory for size bytes aligned to alignment with the platform's aligned allocation function.
+        //! The Windows branch pairs with _aligned_free and the other branches with ::free, as done in alignedFree.
+        ALPAKA_FN_INLINE ALPAKA_FN_HOST void* alignedAlloc(size_t alignment, size_t size)
+        {
+#if BOOST_OS_WINDOWS
+            return _aligned_malloc(size, alignment);
+#elif BOOST_OS_MACOS
+            void* ptr = nullptr;
+            posix_memalign(&ptr, alignment, size); // NOTE(review): the returned error code is discarded
+            return ptr;
+#else
+            // the amount of bytes to allocate must be a multiple of the alignment
+            size_t sizeToAllocate = ((size + alignment - 1u) / alignment) * alignment;
+            return ::aligned_alloc(alignment, sizeToAllocate);
+#endif
+        }
+
+        ALPAKA_FN_INLINE ALPAKA_FN_HOST void alignedFree(void* ptr)
+        {
+#if BOOST_OS_WINDOWS
+            _aligned_free(ptr);
+#else
+            // all other platforms (e.g. Linux and macOS) release the memory with ::free
+            ::free(ptr);
+#endif
+        }
+
+        //#############################################################################
+        //! Deleter functor that destroys an object and releases its memory via alignedFree.
+        struct AlignedDelete
+        {
+            constexpr AlignedDelete() = default;
+
+            //-----------------------------------------------------------------------------
+            //! Calls ~T() on ptr if it is non-null, then passes ptr to alignedFree to release the memory.
+            template<typename T>
+            void operator()(T* ptr) const
+            {
+                if(ptr)
+                    ptr->~T();
+                alignedFree(reinterpret_cast<void*>(ptr));
+            }
+        };
+    } // namespace core
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/include/alpaka/core/Assert.hpp b/thirdParty/cupla/alpaka/include/alpaka/core/Assert.hpp
index b10f9f9156..a2b1deae1c 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/core/Assert.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/core/Assert.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Axel Huebl, Benjamin Worpitz, Matthias Werner
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -16,39 +16,7 @@
 #include <type_traits>
 
 
-#if !(defined(BOOST_LANG_HIP) && BOOST_LANG_HIP && BOOST_COMP_HCC)
-  #define ALPAKA_ASSERT(EXPRESSION) assert(EXPRESSION)
-#else
-
-  // Including assert.h would interfere with HIP's host-device implementation
-  // see: https://github.com/ROCm-Developer-Tools/HIP/issues/599
-  // However, cassert is still in some header, so we have to do a workaround for HIP.
-  #ifdef NDEBUG
-    #define ALPAKA_ASSERT(EXPRESSION) static_cast<void>(0)
-  #else
-    #define ALPAKA_ASSERT(EXPRESSION) assert_workaround(EXPRESSION)
-
-    #pragma push_macro("__DEVICE__")
-    #define __DEVICE__ extern "C" __device__ __attribute__((always_inline)) \
-            __attribute__((weak))
-
-     __DEVICE__ void __device_trap() __asm("llvm.trap");
-
-     __host__ __device__
-     __attribute__((always_inline))             \
-     __attribute__((weak))
-     void assert_workaround(bool expr) {
-       if(!expr) {
-         printf("assert failed.\n");
-         #if __HIP_DEVICE_COMPILE__==1
-           __device_trap();
-         #else
-           exit(1);
-         #endif
-       }
-     }
-  #endif //NDEBUG
-#endif
+#define ALPAKA_ASSERT(EXPRESSION) assert(EXPRESSION)
 
 namespace alpaka
 {
@@ -57,21 +25,14 @@ namespace alpaka
         namespace detail
         {
             //#############################################################################
-            template<
-                typename TArg,
-                typename TSfinae = void>
+            template<typename TArg, typename TSfinae = void>
             struct AssertValueUnsigned;
             //#############################################################################
-            template<
-                typename TArg>
-            struct AssertValueUnsigned<
-                TArg,
-                typename std::enable_if<!std::is_unsigned<TArg>::value>::type>
+            template<typename TArg>
+            struct AssertValueUnsigned<TArg, std::enable_if_t<!std::is_unsigned<TArg>::value>>
             {
                 ALPAKA_NO_HOST_ACC_WARNING
-                ALPAKA_FN_HOST_ACC static auto assertValueUnsigned(
-                    TArg const & arg)
-                -> void
+                ALPAKA_FN_HOST_ACC static auto assertValueUnsigned(TArg const& arg) -> void
                 {
 #ifdef NDEBUG
                     alpaka::ignore_unused(arg);
@@ -81,59 +42,41 @@ namespace alpaka
                 }
             };
             //#############################################################################
-            template<
-                typename TArg>
-            struct AssertValueUnsigned<
-                TArg,
-                typename std::enable_if<std::is_unsigned<TArg>::value>::type>
+            template<typename TArg>
+            struct AssertValueUnsigned<TArg, std::enable_if_t<std::is_unsigned<TArg>::value>>
             {
                 ALPAKA_NO_HOST_ACC_WARNING
-                ALPAKA_FN_HOST_ACC static auto assertValueUnsigned(
-                    TArg const & arg)
-                -> void
+                ALPAKA_FN_HOST_ACC static auto assertValueUnsigned(TArg const& arg) -> void
                 {
                     alpaka::ignore_unused(arg);
                     // Nothing to do for unsigned types.
                 }
             };
-        }
+        } // namespace detail
         //-----------------------------------------------------------------------------
         //! This method checks integral values if they are greater or equal zero.
         //! The implementation prevents warnings for checking this for unsigned types.
         ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            typename TArg>
-        ALPAKA_FN_HOST_ACC auto assertValueUnsigned(
-            TArg const & arg)
-        -> void
+        template<typename TArg>
+        ALPAKA_FN_HOST_ACC auto assertValueUnsigned(TArg const& arg) -> void
         {
-            detail::AssertValueUnsigned<
-                TArg>
-            ::assertValueUnsigned(
-                arg);
+            detail::AssertValueUnsigned<TArg>::assertValueUnsigned(arg);
         }
 
         namespace detail
         {
             //#############################################################################
-            template<
-                typename TLhs,
-                typename TRhs,
-                typename TSfinae = void>
+            template<typename TLhs, typename TRhs, typename TSfinae = void>
             struct AssertGreaterThan;
             //#############################################################################
-            template<
-                typename TLhs,
-                typename TRhs>
+            template<typename TLhs, typename TRhs>
             struct AssertGreaterThan<
                 TLhs,
                 TRhs,
-                typename std::enable_if<!std::is_unsigned<TRhs>::value || (TLhs::value != 0u)>::type>
+                std::enable_if_t<!std::is_unsigned<TRhs>::value || (TLhs::value != 0u)>>
             {
                 ALPAKA_NO_HOST_ACC_WARNING
-                ALPAKA_FN_HOST_ACC static auto assertGreaterThan(
-                    TRhs const & lhs)
-                -> void
+                ALPAKA_FN_HOST_ACC static auto assertGreaterThan(TRhs const& lhs) -> void
                 {
 #ifdef NDEBUG
                     alpaka::ignore_unused(lhs);
@@ -143,39 +86,27 @@ namespace alpaka
                 }
             };
             //#############################################################################
-            template<
-                typename TLhs,
-                typename TRhs>
+            template<typename TLhs, typename TRhs>
             struct AssertGreaterThan<
                 TLhs,
                 TRhs,
-                typename std::enable_if<std::is_unsigned<TRhs>::value && (TLhs::value == 0u)>::type>
+                std::enable_if_t<std::is_unsigned<TRhs>::value && (TLhs::value == 0u)>>
             {
                 ALPAKA_NO_HOST_ACC_WARNING
-                ALPAKA_FN_HOST_ACC static auto assertGreaterThan(
-                    TRhs const & lhs)
-                -> void
+                ALPAKA_FN_HOST_ACC static auto assertGreaterThan(TRhs const& lhs) -> void
                 {
                     alpaka::ignore_unused(lhs);
                     // Nothing to do for unsigned types camparing to zero.
                 }
             };
-        }
+        } // namespace detail
         //-----------------------------------------------------------------------------
         //! This method asserts that the integral value TArg is less than Tidx.
         ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            typename TLhs,
-            typename TRhs>
-        ALPAKA_FN_HOST_ACC auto assertGreaterThan(
-            TRhs const & lhs)
-        -> void
+        template<typename TLhs, typename TRhs>
+        ALPAKA_FN_HOST_ACC auto assertGreaterThan(TRhs const& lhs) -> void
         {
-            detail::AssertGreaterThan<
-                TLhs,
-                TRhs>
-            ::assertGreaterThan(
-                lhs);
+            detail::AssertGreaterThan<TLhs, TRhs>::assertGreaterThan(lhs);
         }
-    }
-}
+    } // namespace core
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/include/alpaka/core/BarrierThread.hpp b/thirdParty/cupla/alpaka/include/alpaka/core/BarrierThread.hpp
index 8853b0970f..7a6f10077b 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/core/BarrierThread.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/core/BarrierThread.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Benjamin Worpitz, Matthias Werner
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -12,14 +12,14 @@
 // Uncomment this to disable the standard spinlock behaviour of the threads
 //#define ALPAKA_THREAD_BARRIER_DISABLE_SPINLOCK
 
-#include <alpaka/core/Common.hpp>
 #include <alpaka/block/sync/Traits.hpp>
+#include <alpaka/core/Common.hpp>
 
-#include <mutex>
 #include <condition_variable>
+#include <mutex>
 #ifndef ALPAKA_THREAD_BARRIER_DISABLE_SPINLOCK
-    #include <atomic>
-    #include <thread>
+#    include <atomic>
+#    include <thread>
 #endif
 
 namespace alpaka
@@ -30,33 +30,31 @@ namespace alpaka
         {
             //#############################################################################
             //! A self-resetting barrier.
-            template<
-                typename TIdx>
+            template<typename TIdx>
             class BarrierThread final
             {
             public:
                 //-----------------------------------------------------------------------------
-                explicit BarrierThread(
-                    TIdx const & threadCount) :
-                    m_threadCount(threadCount),
-                    m_curThreadCount(threadCount),
-                    m_generation(0)
-                {}
+                explicit BarrierThread(TIdx const& threadCount)
+                    : m_threadCount(threadCount)
+                    , m_curThreadCount(threadCount)
+                    , m_generation(0)
+                {
+                }
                 //-----------------------------------------------------------------------------
-                BarrierThread(BarrierThread const &) = delete;
+                BarrierThread(BarrierThread const&) = delete;
                 //-----------------------------------------------------------------------------
-                BarrierThread(BarrierThread &&) = delete;
+                BarrierThread(BarrierThread&&) = delete;
                 //-----------------------------------------------------------------------------
-                auto operator=(BarrierThread const &) -> BarrierThread & = delete;
+                auto operator=(BarrierThread const&) -> BarrierThread& = delete;
                 //-----------------------------------------------------------------------------
-                auto operator=(BarrierThread &&) -> BarrierThread & = delete;
+                auto operator=(BarrierThread&&) -> BarrierThread& = delete;
                 //-----------------------------------------------------------------------------
                 ~BarrierThread() = default;
 
                 //-----------------------------------------------------------------------------
                 //! Waits for all the other threads to reach the barrier.
-                auto wait()
-                -> void
+                auto wait() -> void
                 {
                     TIdx const generationWhenEnteredTheWait = m_generation;
 #ifdef ALPAKA_THREAD_BARRIER_DISABLE_SPINLOCK
@@ -73,7 +71,9 @@ namespace alpaka
                     else
                     {
 #ifdef ALPAKA_THREAD_BARRIER_DISABLE_SPINLOCK
-                        m_cvAllThreadsReachedBarrier.wait(lock, [this, generationWhenEnteredTheWait] { return generationWhenEnteredTheWait != m_generation; });
+                        m_cvAllThreadsReachedBarrier.wait(lock, [this, generationWhenEnteredTheWait] {
+                            return generationWhenEnteredTheWait != m_generation;
+                        });
 #else
                         while(generationWhenEnteredTheWait == m_generation)
                         {
@@ -101,13 +101,11 @@ namespace alpaka
             namespace detail
             {
                 //#############################################################################
-                template<
-                    typename TOp>
+                template<typename TOp>
                 struct AtomicOp;
                 //#############################################################################
                 template<>
-                struct AtomicOp<
-                    block::sync::op::Count>
+                struct AtomicOp<BlockCount>
                 {
                     void operator()(std::atomic<int>& result, bool value)
                     {
@@ -116,8 +114,7 @@ namespace alpaka
                 };
                 //#############################################################################
                 template<>
-                struct AtomicOp<
-                    block::sync::op::LogicalAnd>
+                struct AtomicOp<BlockAnd>
                 {
                     void operator()(std::atomic<int>& result, bool value)
                     {
@@ -126,47 +123,43 @@ namespace alpaka
                 };
                 //#############################################################################
                 template<>
-                struct AtomicOp<
-                    block::sync::op::LogicalOr>
+                struct AtomicOp<BlockOr>
                 {
                     void operator()(std::atomic<int>& result, bool value)
                     {
                         result |= static_cast<int>(value);
                     }
                 };
-            }
+            } // namespace detail
 
             //#############################################################################
             //! A self-resetting barrier with barrier.
-            template<
-                typename TIdx>
+            template<typename TIdx>
             class BarrierThreadWithPredicate final
             {
             public:
                 //-----------------------------------------------------------------------------
-                explicit BarrierThreadWithPredicate(
-                    TIdx const & threadCount) :
-                    m_threadCount(threadCount),
-                    m_curThreadCount(threadCount),
-                    m_generation(0)
-                {}
+                explicit BarrierThreadWithPredicate(TIdx const& threadCount)
+                    : m_threadCount(threadCount)
+                    , m_curThreadCount(threadCount)
+                    , m_generation(0)
+                {
+                }
                 //-----------------------------------------------------------------------------
-                BarrierThreadWithPredicate(BarrierThreadWithPredicate const & other) = delete;
+                BarrierThreadWithPredicate(BarrierThreadWithPredicate const& other) = delete;
                 //-----------------------------------------------------------------------------
-                BarrierThreadWithPredicate(BarrierThreadWithPredicate &&) = delete;
+                BarrierThreadWithPredicate(BarrierThreadWithPredicate&&) = delete;
                 //-----------------------------------------------------------------------------
-                auto operator=(BarrierThreadWithPredicate const &) -> BarrierThreadWithPredicate & = delete;
+                auto operator=(BarrierThreadWithPredicate const&) -> BarrierThreadWithPredicate& = delete;
                 //-----------------------------------------------------------------------------
-                auto operator=(BarrierThreadWithPredicate &&) -> BarrierThreadWithPredicate & = delete;
+                auto operator=(BarrierThreadWithPredicate&&) -> BarrierThreadWithPredicate& = delete;
                 //-----------------------------------------------------------------------------
                 ~BarrierThreadWithPredicate() = default;
 
                 //-----------------------------------------------------------------------------
                 //! Waits for all the other threads to reach the barrier.
-                template<
-                    typename TOp>
-                ALPAKA_FN_HOST auto wait(int predicate)
-                -> int
+                template<typename TOp>
+                ALPAKA_FN_HOST auto wait(int predicate) -> int
                 {
                     TIdx const generationWhenEnteredTheWait = m_generation;
                     std::unique_lock<std::mutex> lock(m_mtxBarrier);
@@ -190,7 +183,9 @@ namespace alpaka
                     }
                     else
                     {
-                        m_cvAllThreadsReachedBarrier.wait(lock, [this, generationWhenEnteredTheWait] { return generationWhenEnteredTheWait != m_generation; });
+                        m_cvAllThreadsReachedBarrier.wait(lock, [this, generationWhenEnteredTheWait] {
+                            return generationWhenEnteredTheWait != m_generation;
+                        });
                     }
                     return m_result[generationMod2];
                 }
@@ -203,6 +198,6 @@ namespace alpaka
                 TIdx m_generation;
                 std::atomic<int> m_result[2];
             };
-        }
-    }
-}
+        } // namespace threads
+    } // namespace core
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/include/alpaka/core/BoostPredef.hpp b/thirdParty/cupla/alpaka/include/alpaka/core/BoostPredef.hpp
index 14790d6804..9778bb454f 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/core/BoostPredef.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/core/BoostPredef.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Benjamin Worpitz, Matthias Werner
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -15,58 +15,48 @@
 // In boost since 1.68.0
 // BOOST_PREDEF_MAKE_10_VVRRP(V)
 #if !defined(BOOST_PREDEF_MAKE_10_VVRRP)
-    #define BOOST_PREDEF_MAKE_10_VVRRP(V) BOOST_VERSION_NUMBER(((V)/1000)%100,((V)/10)%100,(V)%10)
+#    define BOOST_PREDEF_MAKE_10_VVRRP(V) BOOST_VERSION_NUMBER(((V) / 1000) % 100, ((V) / 10) % 100, (V) % 10)
 #endif
 
 //---------------------------------------HIP-----------------------------------
-// __HIPCC__ is defined by hipcc (if either __HCC__ or __CUDACC__ is defined)
+// __HIPCC__ is defined by hipcc (if either __CUDACC__ or __HIP__ is defined)
+// https://github.com/ROCm-Developer-Tools/HIP/blob/master/docs/markdown/hip_porting_guide.md#compiler-defines-summary
 #if !defined(BOOST_LANG_HIP)
-  #if defined(__HIPCC__) && ( defined(__CUDACC__) || defined(__HCC__) || defined(__HIP__))
-    #include <hip/hip_runtime.h>
-    //HIP defines "abort()" as "{asm("trap;");}", which breaks some kernels
-    #undef abort
-    // there is no HIP_VERSION macro
-    #define BOOST_LANG_HIP BOOST_VERSION_NUMBER_AVAILABLE
-    #if defined(BOOST_LANG_CUDA) && BOOST_LANG_CUDA
-        #undef BOOST_LANG_CUDA
-        #define BOOST_LANG_CUDA BOOST_VERSION_NUMBER_NOT_AVAILABLE
-    #endif
-  #else
-    #define BOOST_LANG_HIP BOOST_VERSION_NUMBER_NOT_AVAILABLE
-  #endif
+#    if defined(__HIPCC__) && (defined(__CUDACC__) || defined(__HIP__))
+#        include <hip/hip_runtime.h>
+// HIP defines "abort()" as "{asm("trap;");}", which breaks some kernels
+#        undef abort
+// there is no HIP_VERSION macro
+#        define BOOST_LANG_HIP BOOST_VERSION_NUMBER_AVAILABLE
+#        if defined(BOOST_LANG_CUDA) && BOOST_LANG_CUDA
+#            undef BOOST_LANG_CUDA
+#            define BOOST_LANG_CUDA BOOST_VERSION_NUMBER_NOT_AVAILABLE
+#        endif
+#    else
+#        define BOOST_LANG_HIP BOOST_VERSION_NUMBER_NOT_AVAILABLE
+#    endif
 #endif
 
 //-----------------------------------------------------------------------------
-// HSA device architecture detection (HSA generated via HIP(HCC) or HCC directly)
+// HSA device architecture detection (HSA generated via HIP(clang))
 #if !defined(BOOST_ARCH_HSA)
-    #if defined(__HIP_DEVICE_COMPILE__) && __HIP_DEVICE_COMPILE__==1 && defined(__HCC__) \
-        || (defined(__HCC_ACCELERATOR__) && __HCC_ACCELERATOR__!=0)
-        // __HIP_DEVICE_COMPILE__ does not represent feature capability of target device like CUDA_ARCH.
-        // For feature detection there are special macros, see ROCm's HIP porting guide.
-        #define BOOST_ARCH_HSA BOOST_VERSION_NUMBER_AVAILABLE
-    #else
-        #define BOOST_ARCH_HSA BOOST_VERSION_NUMBER_NOT_AVAILABLE
-    #endif
-#endif
-
-//-----------------------------------------------------------------------------
-// hcc HSA compiler detection
-#if !defined(BOOST_COMP_HCC)
-    #if defined(__HCC__)
-        #define BOOST_COMP_HCC BOOST_VERSION_NUMBER_AVAILABLE
-    #else
-        #define BOOST_COMP_HCC BOOST_VERSION_NUMBER_NOT_AVAILABLE
-    #endif
+#    if defined(__HIP_DEVICE_COMPILE__) && __HIP_DEVICE_COMPILE__ == 1 && defined(__HIP__)
+// __HIP_DEVICE_COMPILE__ does not represent feature capability of target device like CUDA_ARCH.
+// For feature detection there are special macros, see ROCm's HIP porting guide.
+#        define BOOST_ARCH_HSA BOOST_VERSION_NUMBER_AVAILABLE
+#    else
+#        define BOOST_ARCH_HSA BOOST_VERSION_NUMBER_NOT_AVAILABLE
+#    endif
 #endif
 
 //-----------------------------------------------------------------------------
 // hip compiler detection
 #if !defined(BOOST_COMP_HIP)
-    #if defined(__HIP__)
-        #define BOOST_COMP_HIP BOOST_VERSION_NUMBER_AVAILABLE
-    #else
-        #define BOOST_COMP_HIP BOOST_VERSION_NUMBER_NOT_AVAILABLE
-    #endif
+#    if defined(__HIP__)
+#        define BOOST_COMP_HIP BOOST_VERSION_NUMBER_AVAILABLE
+#    else
+#        define BOOST_COMP_HIP BOOST_VERSION_NUMBER_NOT_AVAILABLE
+#    endif
 #endif
 
 //-----------------------------------------------------------------------------
@@ -75,23 +65,23 @@
 // - clang defines __CUDA__ and __CUDACC__ when compiling CUDA code ('-x cuda')
 // - nvcc defines __CUDACC__ when compiling CUDA code
 #if !defined(BOOST_LANG_CUDA)
-    #if defined(__CUDA__) || defined(__CUDACC__)
-        #include <cuda.h>
-        #define BOOST_LANG_CUDA BOOST_PREDEF_MAKE_10_VVRRP(CUDA_VERSION)
-    #else
-        #define BOOST_LANG_CUDA BOOST_VERSION_NUMBER_NOT_AVAILABLE
-    #endif
+#    if defined(__CUDA__) || defined(__CUDACC__)
+#        include <cuda.h>
+#        define BOOST_LANG_CUDA BOOST_PREDEF_MAKE_10_VVRRP(CUDA_VERSION)
+#    else
+#        define BOOST_LANG_CUDA BOOST_VERSION_NUMBER_NOT_AVAILABLE
+#    endif
 #endif
 
 //-----------------------------------------------------------------------------
 // In boost since 1.68.0
 // CUDA device architecture detection
 #if !defined(BOOST_ARCH_PTX)
-    #if defined(__CUDA_ARCH__)
-        #define BOOST_ARCH_PTX BOOST_PREDEF_MAKE_10_VRP(__CUDA_ARCH__)
-    #else
-        #define BOOST_ARCH_PTX BOOST_VERSION_NUMBER_NOT_AVAILABLE
-    #endif
+#    if defined(__CUDA_ARCH__)
+#        define BOOST_ARCH_PTX BOOST_PREDEF_MAKE_10_VRP(__CUDA_ARCH__)
+#    else
+#        define BOOST_ARCH_PTX BOOST_VERSION_NUMBER_NOT_AVAILABLE
+#    endif
 #endif
 
 //-----------------------------------------------------------------------------
@@ -100,48 +90,49 @@
 
 #include <boost/version.hpp>
 #if BOOST_VERSION >= 106800
-    // BOOST_COMP_NVCC_EMULATED is defined by boost instead of BOOST_COMP_NVCC
-    #if defined(BOOST_COMP_NVCC) && defined(BOOST_COMP_NVCC_EMULATED)
-        #undef BOOST_COMP_NVCC
-        #define BOOST_COMP_NVCC BOOST_COMP_NVCC_EMULATED
-    #endif
+// BOOST_COMP_NVCC_EMULATED is defined by boost instead of BOOST_COMP_NVCC
+#    if defined(BOOST_COMP_NVCC) && defined(BOOST_COMP_NVCC_EMULATED)
+#        undef BOOST_COMP_NVCC
+#        define BOOST_COMP_NVCC BOOST_COMP_NVCC_EMULATED
+#    endif
 #endif
 
 #if !defined(BOOST_COMP_NVCC)
-    #if defined(__NVCC__)
-        // The __CUDACC_VER_MAJOR__, __CUDACC_VER_MINOR__ and __CUDACC_VER_BUILD__
-        // have been added with nvcc 7.5 and have not been available before.
-        #if !defined(__CUDACC_VER_MAJOR__) || !defined(__CUDACC_VER_MINOR__) || !defined(__CUDACC_VER_BUILD__)
-            #define BOOST_COMP_NVCC BOOST_VERSION_NUMBER_AVAILABLE
-        #else
-            #define BOOST_COMP_NVCC BOOST_VERSION_NUMBER(__CUDACC_VER_MAJOR__, __CUDACC_VER_MINOR__, __CUDACC_VER_BUILD__)
-        #endif
-    #else
-        #define BOOST_COMP_NVCC BOOST_VERSION_NUMBER_NOT_AVAILABLE
-    #endif
-#endif
-
-//-----------------------------------------------------------------------------
-// In boost since 1.64.0
-// Work around for broken intel detection
-#if BOOST_COMP_INTEL == 0
-    #if defined(__INTEL_COMPILER)
-        #ifdef BOOST_COMP_INTEL_DETECTION
-            #undef BOOST_COMP_INTEL_DETECTION
-        #endif
-        #define BOOST_COMP_INTEL_DETECTION BOOST_PREDEF_MAKE_10_VVRR(__INTEL_COMPILER)
-        #if defined(BOOST_COMP_INTEL)
-            #undef BOOST_COMP_INTEL
-        #endif
-        #define BOOST_COMP_INTEL BOOST_COMP_INTEL_DETECTION
-    #endif
+#    if defined(__NVCC__)
+// The __CUDACC_VER_MAJOR__, __CUDACC_VER_MINOR__ and __CUDACC_VER_BUILD__
+// have been added with nvcc 7.5 and have not been available before.
+#        if !defined(__CUDACC_VER_MAJOR__) || !defined(__CUDACC_VER_MINOR__) || !defined(__CUDACC_VER_BUILD__)
+#            define BOOST_COMP_NVCC BOOST_VERSION_NUMBER_AVAILABLE
+#        else
+#            define BOOST_COMP_NVCC                                                                                   \
+                BOOST_VERSION_NUMBER(__CUDACC_VER_MAJOR__, __CUDACC_VER_MINOR__, __CUDACC_VER_BUILD__)
+#        endif
+#    else
+#        define BOOST_COMP_NVCC BOOST_VERSION_NUMBER_NOT_AVAILABLE
+#    endif
 #endif
 
 //-----------------------------------------------------------------------------
 // clang CUDA compiler detection
 // Currently __CUDA__ is only defined by clang when compiling CUDA code.
 #if defined(__clang__) && defined(__CUDA__)
-    #define BOOST_COMP_CLANG_CUDA BOOST_COMP_CLANG
+#    define BOOST_COMP_CLANG_CUDA BOOST_COMP_CLANG
 #else
-    #define BOOST_COMP_CLANG_CUDA BOOST_VERSION_NUMBER_NOT_AVAILABLE
+#    define BOOST_COMP_CLANG_CUDA BOOST_VERSION_NUMBER_NOT_AVAILABLE
+#endif
+
+//-----------------------------------------------------------------------------
+// Intel compiler detection
+// BOOST_COMP_INTEL_EMULATED is defined by boost instead of BOOST_COMP_INTEL
+#if defined(BOOST_COMP_INTEL) && defined(BOOST_COMP_INTEL_EMULATED)
+#    undef BOOST_COMP_INTEL
+#    define BOOST_COMP_INTEL BOOST_COMP_INTEL_EMULATED
+#endif
+
+//-----------------------------------------------------------------------------
+// PGI and NV HPC SDK compiler detection
+// BOOST_COMP_PGI_EMULATED is defined by boost instead of BOOST_COMP_PGI
+#if defined(BOOST_COMP_PGI) && defined(BOOST_COMP_PGI_EMULATED)
+#    undef BOOST_COMP_PGI
+#    define BOOST_COMP_PGI BOOST_COMP_PGI_EMULATED
 #endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/core/ClipCast.hpp b/thirdParty/cupla/alpaka/include/alpaka/core/ClipCast.hpp
index 1202c5c397..fc2909d710 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/core/ClipCast.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/core/ClipCast.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Benjamin Worpitz
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -20,19 +20,17 @@ namespace alpaka
     {
         //-----------------------------------------------------------------------------
         //! \return The input casted and clipped to T.
-        template<
-            typename T,
-            typename V>
-        auto clipCast(
-            V const & val)
-        -> T
+        template<typename T, typename V>
+        auto clipCast(V const& val) -> T
         {
-            static_assert(std::is_integral<T>::value && std::is_integral<V>::value, "clipCast can not be called with non-integral types!");
+            static_assert(
+                std::is_integral<T>::value && std::is_integral<V>::value,
+                "clipCast can not be called with non-integral types!");
 
             auto constexpr max = static_cast<V>(std::numeric_limits<alpaka::meta::LowerMax<T, V>>::max());
             auto constexpr min = static_cast<V>(std::numeric_limits<alpaka::meta::HigherMin<T, V>>::min());
 
             return static_cast<T>(std::max(min, std::min(max, val)));
         }
-    }
-}
+    } // namespace core
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/include/alpaka/core/Common.hpp b/thirdParty/cupla/alpaka/include/alpaka/core/Common.hpp
index e71d5296f5..c5ffceeff4 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/core/Common.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/core/Common.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Axel Huebl, Benjamin Worpitz, Matthias Werner
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -14,24 +14,7 @@
 
 // Boost.Uuid errors with VS2017 when intrin.h is not included
 #if defined(_MSC_VER) && _MSC_VER >= 1910
-    #include <intrin.h>
-#endif
-
-//-----------------------------------------------------------------------------
-// Boost does not yet correctly identify clang when compiling CUDA code.
-// After explicitly including <boost/config.hpp> we can safely undefine some of the wrong settings.
-#if BOOST_COMP_CLANG_CUDA
-    #include <boost/config.hpp>
-    #undef BOOST_NO_CXX11_VARIADIC_TEMPLATES
-#endif
-
-//-----------------------------------------------------------------------------
-// Boost disables variadic templates for nvcc (in some cases because it was buggy).
-// However, we rely on it being enabled.
-// After explicitly including <boost/config.hpp> we can safely undefine the wrong setting.
-#if BOOST_COMP_NVCC
-    #include <boost/config.hpp>
-    #undef BOOST_NO_CXX11_VARIADIC_TEMPLATES
+#    include <intrin.h>
 #endif
 
 //-----------------------------------------------------------------------------
@@ -42,17 +25,17 @@
 //! auto add(std::int32_t a, std::int32_t b)
 //! -> std::int32_t;
 #if BOOST_LANG_CUDA || BOOST_LANG_HIP
-    #if defined(ALPAKA_ACC_GPU_CUDA_ONLY_MODE) || defined(ALPAKA_ACC_GPU_HIP_ONLY_MODE)
-        #define ALPAKA_FN_ACC __device__
-    #else
-        #define ALPAKA_FN_ACC __device__ __host__
-    #endif
-    #define ALPAKA_FN_HOST_ACC __device__ __host__
-    #define ALPAKA_FN_HOST __host__
+#    if defined(ALPAKA_ACC_GPU_CUDA_ONLY_MODE) || defined(ALPAKA_ACC_GPU_HIP_ONLY_MODE)
+#        define ALPAKA_FN_ACC __device__
+#    else
+#        define ALPAKA_FN_ACC __device__ __host__
+#    endif
+#    define ALPAKA_FN_HOST_ACC __device__ __host__
+#    define ALPAKA_FN_HOST __host__
 #else
-    #define ALPAKA_FN_ACC
-    #define ALPAKA_FN_HOST_ACC
-    #define ALPAKA_FN_HOST
+#    define ALPAKA_FN_ACC
+#    define ALPAKA_FN_HOST_ACC
+#    define ALPAKA_FN_HOST
 #endif
 
 //-----------------------------------------------------------------------------
@@ -65,22 +48,22 @@
 //!
 //! WARNING: Only use this method if there is no other way.
 //! Most cases can be solved by #if BOOST_ARCH_PTX or #if BOOST_LANG_CUDA.
-#if (BOOST_LANG_CUDA && !BOOST_COMP_CLANG_CUDA) || BOOST_LANG_HIP
-    #if BOOST_COMP_MSVC
-        #define ALPAKA_NO_HOST_ACC_WARNING __pragma(hd_warning_disable)
-    #else
-        #define ALPAKA_NO_HOST_ACC_WARNING _Pragma("hd_warning_disable")
-    #endif
+#if(BOOST_LANG_CUDA && !BOOST_COMP_CLANG_CUDA) || BOOST_LANG_HIP
+#    if BOOST_COMP_MSVC || defined(BOOST_COMP_MSVC_EMULATED)
+#        define ALPAKA_NO_HOST_ACC_WARNING __pragma(hd_warning_disable)
+#    else
+#        define ALPAKA_NO_HOST_ACC_WARNING _Pragma("hd_warning_disable")
+#    endif
 #else
-    #define ALPAKA_NO_HOST_ACC_WARNING
+#    define ALPAKA_NO_HOST_ACC_WARNING
 #endif
 
 //-----------------------------------------------------------------------------
 //! Macro defining the inline function attribute.
 #if BOOST_LANG_CUDA || BOOST_LANG_HIP
-    #define ALPAKA_FN_INLINE __forceinline__
+#    define ALPAKA_FN_INLINE __forceinline__
 #else
-    #define ALPAKA_FN_INLINE inline
+#    define ALPAKA_FN_INLINE inline
 #endif
 
 //-----------------------------------------------------------------------------
@@ -99,10 +82,10 @@
 //! In contrast to ordinary variables, you can not define such variables
 //! as static compilation unit local variables with internal linkage
 //! because this is forbidden by CUDA.
-#if (BOOST_LANG_CUDA && BOOST_ARCH_PTX) || (BOOST_LANG_HIP && (BOOST_ARCH_HSA || BOOST_ARCH_PTX))
-    #define ALPAKA_STATIC_ACC_MEM_GLOBAL __device__
+#if(BOOST_LANG_CUDA && BOOST_ARCH_PTX) || (BOOST_LANG_HIP && (BOOST_ARCH_HSA || BOOST_ARCH_PTX))
+#    define ALPAKA_STATIC_ACC_MEM_GLOBAL __device__
 #else
-    #define ALPAKA_STATIC_ACC_MEM_GLOBAL
+#    define ALPAKA_STATIC_ACC_MEM_GLOBAL
 #endif
 
 //-----------------------------------------------------------------------------
@@ -121,8 +104,8 @@
 //! In contrast to ordinary variables, you can not define such variables
 //! as static compilation unit local variables with internal linkage
 //! because this is forbidden by CUDA.
-#if (BOOST_LANG_CUDA && BOOST_ARCH_PTX) || (BOOST_LANG_HIP && (BOOST_ARCH_HSA || BOOST_ARCH_PTX))
-    #define ALPAKA_STATIC_ACC_MEM_CONSTANT __constant__
+#if(BOOST_LANG_CUDA && BOOST_ARCH_PTX) || (BOOST_LANG_HIP && (BOOST_ARCH_HSA || BOOST_ARCH_PTX))
+#    define ALPAKA_STATIC_ACC_MEM_CONSTANT __constant__
 #else
-    #define ALPAKA_STATIC_ACC_MEM_CONSTANT
+#    define ALPAKA_STATIC_ACC_MEM_CONSTANT
 #endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/core/Concepts.hpp b/thirdParty/cupla/alpaka/include/alpaka/core/Concepts.hpp
index af3bc4251a..80fb6c665a 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/core/Concepts.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/core/Concepts.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Benjamin Worpitz
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -18,21 +18,17 @@ namespace alpaka
         //#############################################################################
         //! Tag used in class inheritance hierarchies that describes that a specific concept (TConcept)
         //! is implemented by the given base class (TBase).
-        template<
-            typename TConcept,
-            typename TBase>
+        template<typename TConcept, typename TBase>
         struct Implements
         {
         };
 
         //#############################################################################
         //! Checks whether the concept is implemented by the given class
-        template<
-            typename TConcept,
-            typename TDerived>
-        struct ImplementsConcept {
-            template<
-                typename TBase>
+        template<typename TConcept, typename TDerived>
+        struct ImplementsConcept
+        {
+            template<typename TBase>
             static auto implements(Implements<TConcept, TBase>&) -> std::true_type;
             static auto implements(...) -> std::false_type;
 
@@ -43,50 +39,43 @@ namespace alpaka
         {
             //#############################################################################
             //! Returns the type that implements the given concept in the inheritance hierarchy.
-            template<
-                typename TConcept,
-                typename TDerived,
-                typename Sfinae = void>
+            template<typename TConcept, typename TDerived, typename Sfinae = void>
             struct ImplementationBaseType;
 
             //#############################################################################
             //! Base case for types that do not inherit from "Implements<TConcept, ...>" is the type itself.
-            template<
-                typename TConcept,
-                typename TDerived>
+            template<typename TConcept, typename TDerived>
             struct ImplementationBaseType<
                 TConcept,
                 TDerived,
-                typename std::enable_if<!ImplementsConcept<TConcept, TDerived>::value>::type>
+                std::enable_if_t<!ImplementsConcept<TConcept, TDerived>::value>>
             {
                 using type = TDerived;
             };
 
             //#############################################################################
-            //! For types that inherit from "Implements<TConcept, ...>" it finds the base class (TBase) which implements the concept.
-            template<
-                typename TConcept,
-                typename TDerived>
+            //! For types that inherit from "Implements<TConcept, ...>" it finds the base class (TBase) which
+            //! implements the concept.
+            template<typename TConcept, typename TDerived>
             struct ImplementationBaseType<
                 TConcept,
                 TDerived,
-                typename std::enable_if<ImplementsConcept<TConcept, TDerived>::value>::type>
+                std::enable_if_t<ImplementsConcept<TConcept, TDerived>::value>>
             {
-                template<
-                    typename TBase>
+                template<typename TBase>
                 static auto implementer(Implements<TConcept, TBase>&) -> TBase;
 
                 using type = decltype(implementer(std::declval<TDerived&>()));
 
-                static_assert(std::is_base_of<type, TDerived>::value, "The type implementing the concept has to be a publicly accessible base class!");
+                static_assert(
+                    std::is_base_of<type, TDerived>::value,
+                    "The type implementing the concept has to be a publicly accessible base class!");
             };
-        }
+        } // namespace detail
 
         //#############################################################################
         //! Returns the type that implements the given concept in the inheritance hierarchy.
-        template<
-            typename TConcept,
-            typename TDerived>
+        template<typename TConcept, typename TDerived>
         using ImplementationBase = typename detail::ImplementationBaseType<TConcept, TDerived>::type;
-    }
-}
+    } // namespace concepts
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/include/alpaka/core/ConcurrentExecPool.hpp b/thirdParty/cupla/alpaka/include/alpaka/core/ConcurrentExecPool.hpp
index 303a512e8e..6ef47e0b58 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/core/ConcurrentExecPool.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/core/ConcurrentExecPool.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Benjamin Worpitz, René Widera
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -15,19 +15,19 @@
 // std::current_exception, std::make_exception_ptr, etc. which are not declared in device code.
 // Therefore, we can not even parse those parts when compiling device code.
 //-----------------------------------------------------------------------------
+#include <alpaka/core/BoostPredef.hpp>
 #include <alpaka/core/Common.hpp>
 
-#include <boost/config.hpp>
-
-#include <queue>
-#include <mutex>
-#include <stdexcept>
-#include <vector>
-#include <exception>
-#include <utility>
 #include <atomic>
+#include <exception>
 #include <functional>
 #include <memory>
+#include <mutex>
+#include <queue>
+#include <stdexcept>
+#include <type_traits>
+#include <utility>
+#include <vector>
 
 namespace alpaka
 {
@@ -36,27 +36,23 @@ namespace alpaka
         namespace detail
         {
             //#############################################################################
-            template<
-                typename T>
-            class ThreadSafeQueue :
-                private std::queue<T>
+            template<typename T>
+            class ThreadSafeQueue : private std::queue<T>
             {
             public:
                 //-----------------------------------------------------------------------------
                 ThreadSafeQueue()
-                {}
+                {
+                }
                 //-----------------------------------------------------------------------------
                 //! \return If the queue is empty.
-                auto empty() const
-                -> bool
+                auto empty() const -> bool
                 {
                     return std::queue<T>::empty();
                 }
                 //-----------------------------------------------------------------------------
                 //! Pushes the given value onto the back of the queue.
-                auto push(
-                    T && t)
-                -> void
+                auto push(T&& t) -> void
                 {
                     std::lock_guard<std::mutex> lk(m_Mutex);
 
@@ -64,9 +60,7 @@ namespace alpaka
                 }
                 //-----------------------------------------------------------------------------
                 //! Pops the given value from the front of the queue.
-                auto pop(
-                    T & t)
-                -> bool
+                auto pop(T& t) -> bool
                 {
                     std::lock_guard<std::mutex> lk(m_Mutex);
 
@@ -88,11 +82,11 @@ namespace alpaka
 
             //#############################################################################
             //! ITaskPkg.
-            // \NOTE: We can not use C++11 std::packaged_task as it forces the use of std::future
+            // \NOTE: We can not use std::packaged_task as it forces the use of std::future
             // but we additionally support boost::fibers::promise.
 #if BOOST_COMP_CLANG
-    #pragma clang diagnostic push
-    #pragma clang diagnostic ignored "-Wweak-vtables"
+#    pragma clang diagnostic push
+#    pragma clang diagnostic ignored "-Wweak-vtables"
 #endif
             class ITaskPkg
             {
@@ -102,8 +96,7 @@ namespace alpaka
 
                 //-----------------------------------------------------------------------------
                 //! Runs this task.
-                auto runTask() noexcept
-                -> void
+                auto runTask() noexcept -> void
                 {
                     try
                     {
@@ -128,18 +121,17 @@ namespace alpaka
 #if !(BOOST_COMP_CLANG_CUDA && BOOST_ARCH_PTX)
                 //-----------------------------------------------------------------------------
                 //! Sets an exception.
-                virtual auto setException(
-                    std::exception_ptr const & exceptPtr)
-                -> void = 0;
+                virtual auto setException(std::exception_ptr const& exceptPtr) -> void = 0;
 #endif
             };
 #if BOOST_COMP_CLANG
-    #pragma clang diagnostic pop
+#    pragma clang diagnostic pop
 #endif
 
             //#############################################################################
             template<
-                template<typename TFnObjReturn> class TPromise,
+                template<typename TFnObjReturn>
+                class TPromise,
                 typename TFnObj,
                 typename TFnObjReturn = decltype(std::declval<TFnObj>()())>
             class TaskPkg;
@@ -150,46 +142,39 @@ namespace alpaka
             //! \tparam TPromise The promise type returned by the task.
             //! \tparam TFnObj The type of the function to execute.
             //! \tparam TFnObjReturn The return type of the TFnObj. Used for class specialization.
-            template<
-                template<typename TFnObjReturn> class TPromise,
-                typename TFnObj,
-                typename TFnObjReturn>
-            class TaskPkg final :
-                public ITaskPkg
+            template<template<typename TFnObjReturn> class TPromise, typename TFnObj, typename TFnObjReturn>
+            class TaskPkg final : public ITaskPkg
             {
             public:
                 //-----------------------------------------------------------------------------
-                TaskPkg(
-                    TFnObj && func) :
-                        m_Promise(),
-                        m_FnObj(std::move(func))
-                {}
+                TaskPkg(TFnObj&& func) : m_Promise(), m_FnObj(std::move(func))
+                {
+                }
 
             private:
                 //-----------------------------------------------------------------------------
                 //! The execution function.
-                virtual auto run()
-                -> void final
+                virtual auto run() -> void final
                 {
                     m_Promise.set_value(this->m_FnObj());
                 }
+
             public:
 // Workaround: Clang can not support this when natively compiling device code. See ConcurrentExecPool.hpp.
 #if !(BOOST_COMP_CLANG_CUDA && BOOST_ARCH_PTX)
                 //-----------------------------------------------------------------------------
                 //! Sets an exception.
-                virtual auto setException(
-                    std::exception_ptr const & exceptPtr)
-                -> void final
+                virtual auto setException(std::exception_ptr const& exceptPtr) -> void final
                 {
                     m_Promise.set_exception(exceptPtr);
                 }
 #endif
                 TPromise<TFnObjReturn> m_Promise;
+
             private:
                 // NOTE: To avoid invalid memory accesses to memory of a different thread
                 // `std::remove_reference` enforces the function object to be copied.
-                typename std::remove_reference<TFnObj>::type m_FnObj;
+                std::remove_reference_t<TFnObj> m_FnObj;
             };
 
             //#############################################################################
@@ -197,62 +182,48 @@ namespace alpaka
             //!
             //! \tparam TPromise The promise type returned by the task.
             //! \tparam TFnObj The type of the function to execute.
-            template<
-                template<typename TFnObjReturn> class TPromise,
-                typename TFnObj>
-            class TaskPkg<
-                TPromise,
-                TFnObj,
-                void> final :
-                public ITaskPkg
+            template<template<typename TFnObjReturn> class TPromise, typename TFnObj>
+            class TaskPkg<TPromise, TFnObj, void> final : public ITaskPkg
             {
             public:
                 //-----------------------------------------------------------------------------
-                TaskPkg(
-                    TFnObj && func) :
-                        m_Promise(),
-                        m_FnObj(std::move(func))
-                {}
+                TaskPkg(TFnObj&& func) : m_Promise(), m_FnObj(std::move(func))
+                {
+                }
 
             private:
                 //-----------------------------------------------------------------------------
                 //! The execution function.
-                virtual auto run()
-                -> void final
+                virtual auto run() -> void final
                 {
                     this->m_FnObj();
                     m_Promise.set_value();
                 }
+
             public:
 // Workaround: Clang can not support this when natively compiling device code. See ConcurrentExecPool.hpp.
 #if !(BOOST_COMP_CLANG_CUDA && BOOST_ARCH_PTX)
                 //-----------------------------------------------------------------------------
                 //! Sets an exception.
-                virtual auto setException(
-                    std::exception_ptr const & exceptPtr)
-                -> void final
+                virtual auto setException(std::exception_ptr const& exceptPtr) -> void final
                 {
                     m_Promise.set_exception(exceptPtr);
                 }
 #endif
                 TPromise<void> m_Promise;
+
             private:
                 // NOTE: To avoid invalid memory accesses to memory of a different thread
                 // `std::remove_reference` enforces the function object to be copied.
-                typename std::remove_reference<TFnObj>::type m_FnObj;
+                std::remove_reference_t<TFnObj> m_FnObj;
             };
 
             //-----------------------------------------------------------------------------
             template<
                 typename TFnObj0,
                 typename TFnObj1,
-                typename = typename std::enable_if<!std::is_same<void, decltype(std::declval<TFnObj0>()())>::value>::type>
-            auto invokeBothReturnFirst(
-                    TFnObj0 && fn0,
-                    TFnObj1 && fn1)
-#ifdef BOOST_NO_CXX14_RETURN_TYPE_DEDUCTION
-             -> decltype(std::declval<TFnObj0>()())
-#endif
+                typename = std::enable_if_t<!std::is_same<void, decltype(std::declval<TFnObj0>()())>::value>>
+            auto invokeBothReturnFirst(TFnObj0&& fn0, TFnObj1&& fn1)
             {
                 auto ret = fn0();
                 fn1();
@@ -263,11 +234,8 @@ namespace alpaka
             template<
                 typename TFnObj0,
                 typename TFnObj1,
-                typename = typename std::enable_if<std::is_same<void, decltype(std::declval<TFnObj0>()())>::value>::type>
-            auto invokeBothReturnFirst(
-                    TFnObj0 && fn0,
-                    TFnObj1 && fn1)
-            -> void
+                typename = std::enable_if_t<std::is_same<void, decltype(std::declval<TFnObj0>()())>::value>>
+            auto invokeBothReturnFirst(TFnObj0&& fn0, TFnObj1&& fn1) -> void
             {
                 fn0();
                 fn1();
@@ -278,14 +246,15 @@ namespace alpaka
             //!
             //! \tparam TConcurrentExec The type of concurrent executor (for example std::thread).
             //! \tparam TPromise The promise type returned by the task.
-            //! \tparam TYield The type is required to have a static method "void yield()" to yield the current thread if there is no work.
-            //! \tparam TMutex Unused. The mutex type used for locking threads.
-            //! \tparam TCondVar Unused. The condition variable type used to make the threads wait if there is no work.
-            //! \tparam TisYielding Boolean value if the threads should yield instead of wait for a condition variable.
+            //! \tparam TYield Needs a static method "void yield()" to yield the current thread if there is no work.
+            //! \tparam TMutex Unused. The mutex type used for locking threads.
+            //! \tparam TCondVar Unused. The condition variable type used to make the threads wait if there is no work.
+            //! \tparam TisYielding Boolean value if the threads should yield instead of wait for a condition variable.
             template<
                 typename TIdx,
                 typename TConcurrentExec,
-                template<typename TFnObjReturn> class TPromise,
+                template<typename TFnObjReturn>
+                class TPromise,
                 typename TYield,
                 typename TMutex = void,
                 typename TCondVar = void,
@@ -294,21 +263,22 @@ namespace alpaka
             {
             public:
                 //-----------------------------------------------------------------------------
-                //! Creates a concurrent executor pool with a specific number of concurrent executors and a maximum number of queued tasks.
+                //! Creates a concurrent executor pool with a specific number of concurrent executors and a maximum
+                //! number of queued tasks.
                 //!
                 //! \param concurrentExecutionCount
                 //!    The guaranteed number of concurrent executors used in the pool.
                 //!    This is also the maximum number of tasks worked on concurrently.
-                ConcurrentExecPool(
-                    TIdx concurrentExecutionCount) :
-                    m_vConcurrentExecs(),
-                    m_qTasks(),
-                    m_numActiveTasks(0u),
-                    m_bShutdownFlag(false)
+                ConcurrentExecPool(TIdx concurrentExecutionCount)
+                    : m_vConcurrentExecs()
+                    , m_qTasks()
+                    , m_numActiveTasks(0u)
+                    , m_bShutdownFlag(false)
                 {
                     if(concurrentExecutionCount < 1)
                     {
-                        throw std::invalid_argument("The argument 'concurrentExecutionCount' has to be greate or equal to one!");
+                        throw std::invalid_argument(
+                            "The argument 'concurrentExecutionCount' has to be greate or equal to one!");
                     }
 
                     m_vConcurrentExecs.reserve(static_cast<std::size_t>(concurrentExecutionCount));
@@ -316,17 +286,17 @@ namespace alpaka
                     // Create all concurrent executors.
                     for(TIdx concurrentExec(0u); concurrentExec < concurrentExecutionCount; ++concurrentExec)
                     {
-                        m_vConcurrentExecs.emplace_back([this](){concurrentExecFn();});
+                        m_vConcurrentExecs.emplace_back([this]() { concurrentExecFn(); });
                     }
                 }
                 //-----------------------------------------------------------------------------
-                ConcurrentExecPool(ConcurrentExecPool const &) = delete;
+                ConcurrentExecPool(ConcurrentExecPool const&) = delete;
                 //-----------------------------------------------------------------------------
-                ConcurrentExecPool(ConcurrentExecPool &&) = delete;
+                ConcurrentExecPool(ConcurrentExecPool&&) = delete;
                 //-----------------------------------------------------------------------------
-                auto operator=(ConcurrentExecPool const &) -> ConcurrentExecPool & = delete;
+                auto operator=(ConcurrentExecPool const&) -> ConcurrentExecPool& = delete;
                 //-----------------------------------------------------------------------------
-                auto operator=(ConcurrentExecPool &&) -> ConcurrentExecPool & = delete;
+                auto operator=(ConcurrentExecPool&&) -> ConcurrentExecPool& = delete;
 
                 //-----------------------------------------------------------------------------
                 //! Completes any currently running task normally.
@@ -343,7 +313,8 @@ namespace alpaka
                     // Signal to each incomplete task that it will not complete due to pool destruction.
                     while(popTask(currentTaskPackage))
                     {
-                        auto const except(std::runtime_error("Could not perform task before ConcurrentExecPool destruction"));
+                        auto const except(
+                            std::runtime_error("Could not perform task before ConcurrentExecPool destruction"));
 // Workaround: Clang can not support this when natively compiling device code. See ConcurrentExecPool.hpp.
 #if !(BOOST_COMP_CLANG_CUDA && BOOST_ARCH_PTX)
                         currentTaskPackage->setException(std::make_exception_ptr(except));
@@ -359,37 +330,20 @@ namespace alpaka
                 //!                 Takes an arbitrary number of arguments and arbitrary return type.
                 //! \tparam TArgs   The argument types pack.
                 //! \param args     Arguments for task, cannot be moved.
-                //!                 If such parameters must be used, use a lambda and capture via move then move the lambda.
+                //!                 If such parameters must be used, use a lambda and capture via move then move the
+                //!                 lambda.
                 //!
                 //! \return Signals when the task has completed with either success or an exception.
                 //!         Also results in an exception if the pool is destroyed before execution has begun.
-                template<
-                    typename TFnObj,
-                    typename ... TArgs>
-                auto enqueueTask(
-                    TFnObj && task,
-                    TArgs && ... args)
-#ifdef BOOST_NO_CXX14_RETURN_TYPE_DEDUCTION
-#if BOOST_COMP_GNUC && (BOOST_COMP_GNUC < BOOST_VERSION_NUMBER(5, 0, 0))
-                // FIXME: gcc 4.9 does not support the syntax below. Restricting the return type to void works because we never use something else within alpaka.
-                -> decltype(std::declval<TPromise<void>>().get_future())
-#else
-                -> decltype(std::declval<TPromise<decltype(task(args...))>>().get_future())
-#endif
-#endif
+                template<typename TFnObj, typename... TArgs>
+                auto enqueueTask(TFnObj&& task, TArgs&&... args)
                 {
-                    auto boundTask([=](){return task(args...);});
-                    auto decrementNumActiveTasks([this](){--m_numActiveTasks;});
+                    auto boundTask([=]() { return task(args...); });
+                    auto decrementNumActiveTasks([this]() { --m_numActiveTasks; });
 
-                    auto extendedTask(
-                        [boundTask, decrementNumActiveTasks]()
-                        {
-                            return
-                                invokeBothReturnFirst(
-                                    std::move(boundTask),
-                                    std::move(decrementNumActiveTasks)
-                                );
-                        });
+                    auto extendedTask([boundTask, decrementNumActiveTasks]() {
+                        return invokeBothReturnFirst(std::move(boundTask), std::move(decrementNumActiveTasks));
+                    });
 
                     using TaskPackage = TaskPkg<TPromise, decltype(extendedTask)>;
                     auto pTaskPackage(new TaskPackage(std::move(extendedTask)));
@@ -404,15 +358,13 @@ namespace alpaka
                 }
                 //-----------------------------------------------------------------------------
                 //! \return The number of concurrent executors available.
-                auto getConcurrentExecutionCount() const
-                -> TIdx
+                auto getConcurrentExecutionCount() const -> TIdx
                 {
                     return m_vConcurrentExecs.size();
                 }
                 //-----------------------------------------------------------------------------
                 //! \return If the thread pool is idle.
-                auto isIdle() const
-                -> bool
+                auto isIdle() const -> bool
                 {
                     return m_numActiveTasks == 0u;
                 }
@@ -443,16 +395,14 @@ namespace alpaka
                 //! Joins all concurrent executors.
                 void joinAllConcurrentExecs()
                 {
-                    for(auto && concurrentExec : m_vConcurrentExecs)
+                    for(auto&& concurrentExec : m_vConcurrentExecs)
                     {
                         concurrentExec.join();
                     }
                 }
                 //-----------------------------------------------------------------------------
                 //! Pops a task from the queue.
-                auto popTask(
-                    std::shared_ptr<ITaskPkg> & out)
-                -> bool
+                auto popTask(std::shared_ptr<ITaskPkg>& out) -> bool
                 {
                     if(m_qTasks.pop(out))
                     {
@@ -473,44 +423,39 @@ namespace alpaka
             //!
             //! \tparam TConcurrentExec The type of concurrent executor (for example std::thread).
             //! \tparam TPromise The promise type returned by the task.
-            //! \tparam TYield Unused. The type is required to have a static method "void yield()" to yield the current thread if there is no work.
-            //! \tparam TMutex The mutex type used for locking threads.
-            //! \tparam TCondVar The condition variable type used to make the threads wait if there is no work.
+            //! \tparam TYield Unused. Needs a static method "void yield()" to yield the thread if there is no work.
+            //! \tparam TMutex The mutex type used for locking threads.
+            //! \tparam TCondVar The condition variable type used to make the threads wait if there is no work.
             template<
                 typename TIdx,
                 typename TConcurrentExec,
-                template<typename TFnObjReturn> class TPromise,
+                template<typename TFnObjReturn>
+                class TPromise,
                 typename TYield,
                 typename TMutex,
                 typename TCondVar>
-            class ConcurrentExecPool<
-                TIdx,
-                TConcurrentExec,
-                TPromise,
-                TYield,
-                TMutex,
-                TCondVar,
-                false> final
+            class ConcurrentExecPool<TIdx, TConcurrentExec, TPromise, TYield, TMutex, TCondVar, false> final
             {
             public:
                 //-----------------------------------------------------------------------------
-                //! Creates a concurrent executors pool with a specific number of concurrent executors and a maximum number of queued tasks.
+                //! Creates a concurrent executors pool with a specific number of concurrent executors and a maximum
+                //! number of queued tasks.
                 //!
                 //! \param concurrentExecutionCount
                 //!    The guaranteed number of concurrent executors used in the pool.
                 //!    This is also the maximum number of tasks worked on concurrently.
-                ConcurrentExecPool(
-                    TIdx concurrentExecutionCount) :
-                    m_vConcurrentExecs(),
-                    m_qTasks(),
-                    m_numActiveTasks(0u),
-                    m_mtxWakeup(),
-                    m_cvWakeup(),
-                    m_bShutdownFlag(false)
+                ConcurrentExecPool(TIdx concurrentExecutionCount)
+                    : m_vConcurrentExecs()
+                    , m_qTasks()
+                    , m_numActiveTasks(0u)
+                    , m_mtxWakeup()
+                    , m_cvWakeup()
+                    , m_bShutdownFlag(false)
                 {
                     if(concurrentExecutionCount < 1)
                     {
-                        throw std::invalid_argument("The argument 'concurrentExecutionCount' has to be greate or equal to one!");
+                        throw std::invalid_argument(
+                            "The argument 'concurrentExecutionCount' has to be greate or equal to one!");
                     }
 
                     m_vConcurrentExecs.reserve(static_cast<std::size_t>(concurrentExecutionCount));
@@ -518,17 +463,17 @@ namespace alpaka
                     // Create all concurrent executors.
                     for(TIdx concurrentExec(0u); concurrentExec < concurrentExecutionCount; ++concurrentExec)
                     {
-                        m_vConcurrentExecs.emplace_back([this](){concurrentExecFn();});
+                        m_vConcurrentExecs.emplace_back([this]() { concurrentExecFn(); });
                     }
                 }
                 //-----------------------------------------------------------------------------
-                ConcurrentExecPool(ConcurrentExecPool const &) = delete;
+                ConcurrentExecPool(ConcurrentExecPool const&) = delete;
                 //-----------------------------------------------------------------------------
-                ConcurrentExecPool(ConcurrentExecPool &&) = delete;
+                ConcurrentExecPool(ConcurrentExecPool&&) = delete;
                 //-----------------------------------------------------------------------------
-                auto operator=(ConcurrentExecPool const &) -> ConcurrentExecPool & = delete;
+                auto operator=(ConcurrentExecPool const&) -> ConcurrentExecPool& = delete;
                 //-----------------------------------------------------------------------------
-                auto operator=(ConcurrentExecPool &&) -> ConcurrentExecPool & = delete;
+                auto operator=(ConcurrentExecPool&&) -> ConcurrentExecPool& = delete;
 
                 //-----------------------------------------------------------------------------
                 //! Completes any currently running task normally.
@@ -551,7 +496,8 @@ namespace alpaka
                     // Signal to each incomplete task that it will not complete due to pool destruction.
                     while(popTask(currentTaskPackage))
                     {
-                        auto const except(std::runtime_error("Could not perform task before ConcurrentExecPool destruction"));
+                        auto const except(
+                            std::runtime_error("Could not perform task before ConcurrentExecPool destruction"));
 // Workaround: Clang can not support this when natively compiling device code. See ConcurrentExecPool.hpp.
 #if !(BOOST_COMP_CLANG_CUDA && BOOST_ARCH_PTX)
                         currentTaskPackage->setException(std::make_exception_ptr(except));
@@ -567,37 +513,20 @@ namespace alpaka
                 //!                 Takes an arbitrary number of arguments and arbitrary return type.
                 //! \tparam TArgs   The argument types pack.
                 //! \param args     Arguments for task, cannot be moved.
-                //!                 If such parameters must be used, use a lambda and capture via move then move the lambda.
+                //!                 If such parameters must be used, use a lambda and capture via move then move the
+                //!                 lambda.
                 //!
                 //! \return Signals when the task has completed with either success or an exception.
                 //!         Also results in an exception if the pool is destroyed before execution has begun.
-                template<
-                    typename TFnObj,
-                    typename ... TArgs>
-                auto enqueueTask(
-                    TFnObj && task,
-                    TArgs && ... args)
-#ifdef BOOST_NO_CXX14_RETURN_TYPE_DEDUCTION
-#if BOOST_COMP_GNUC && (BOOST_COMP_GNUC < BOOST_VERSION_NUMBER(5, 0, 0))
-                // FIXME: gcc 4.9 does not support the syntax below. Restricting the return type to void works because we never use something else within alpaka.
-                -> decltype(std::declval<TPromise<void>>().get_future())
-#else
-                -> decltype(std::declval<TPromise<decltype(task(args...))>>().get_future())
-#endif
-#endif
+                template<typename TFnObj, typename... TArgs>
+                auto enqueueTask(TFnObj&& task, TArgs&&... args)
                 {
-                    auto boundTask([=](){return task(args...);});
-                    auto decrementNumActiveTasks([this](){--m_numActiveTasks;});
+                    auto boundTask([=]() { return task(args...); });
+                    auto decrementNumActiveTasks([this]() { --m_numActiveTasks; });
 
-                    auto extendedTask(
-                        [boundTask, decrementNumActiveTasks]()
-                        {
-                            return
-                                invokeBothReturnFirst(
-                                    std::move(boundTask),
-                                    std::move(decrementNumActiveTasks)
-                                );
-                        });
+                    auto extendedTask([boundTask, decrementNumActiveTasks]() {
+                        return invokeBothReturnFirst(std::move(boundTask), std::move(decrementNumActiveTasks));
+                    });
 
                     using TaskPackage = TaskPkg<TPromise, decltype(extendedTask)>;
                     auto pTaskPackage(new TaskPackage(std::move(extendedTask)));
@@ -617,15 +546,13 @@ namespace alpaka
                 }
                 //-----------------------------------------------------------------------------
                 //! \return The number of concurrent executors available.
-                auto getConcurrentExecutionCount() const
-                -> TIdx
+                auto getConcurrentExecutionCount() const -> TIdx
                 {
                     return m_vConcurrentExecs.size();
                 }
                 //-----------------------------------------------------------------------------
                 //! \return If the thread pool is idle.
-                auto isIdle() const
-                -> bool
+                auto isIdle() const -> bool
                 {
                     return m_numActiveTasks == 0u;
                 }
@@ -665,16 +592,14 @@ namespace alpaka
                 //! Joins all concurrent executors.
                 void joinAllConcurrentExecs()
                 {
-                    for(auto && concurrentExec : m_vConcurrentExecs)
+                    for(auto&& concurrentExec : m_vConcurrentExecs)
                     {
                         concurrentExec.join();
                     }
                 }
                 //-----------------------------------------------------------------------------
                 //! Pops a task from the queue.
-                auto popTask(
-                    std::shared_ptr<ITaskPkg> & out)
-                -> bool
+                auto popTask(std::shared_ptr<ITaskPkg>& out) -> bool
                 {
                     if(m_qTasks.pop(out))
                     {
@@ -692,6 +617,6 @@ namespace alpaka
                 TCondVar m_cvWakeup;
                 std::atomic<bool> m_bShutdownFlag;
             };
-        }
-    }
-}
+        } // namespace detail
+    } // namespace core
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/include/alpaka/core/Cuda.hpp b/thirdParty/cupla/alpaka/include/alpaka/core/Cuda.hpp
index 4ecb81d5e1..4a86472d23 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/core/Cuda.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/core/Cuda.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Axel Huebl, Benjamin Worpitz, Matthias Werner, René Widera
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -11,146 +11,48 @@
 
 #ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
 
-#include <alpaka/core/BoostPredef.hpp>
+#    include <alpaka/core/BoostPredef.hpp>
 
-#if !BOOST_LANG_CUDA
-    #error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
-#endif
+#    if !BOOST_LANG_CUDA
+#        error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
+#    endif
 
-#include <alpaka/elem/Traits.hpp>
-#include <alpaka/offset/Traits.hpp>
-#include <alpaka/extent/Traits.hpp>
-#include <alpaka/idx/Traits.hpp>
-#include <alpaka/vec/Vec.hpp>
-#include <alpaka/meta/IntegerSequence.hpp>
-#include <alpaka/meta/Metafunctions.hpp>
+#    include <alpaka/elem/Traits.hpp>
+#    include <alpaka/extent/Traits.hpp>
+#    include <alpaka/idx/Traits.hpp>
+#    include <alpaka/meta/Metafunctions.hpp>
+#    include <alpaka/offset/Traits.hpp>
+#    include <alpaka/vec/Vec.hpp>
 
 // cuda_runtime_api.h: CUDA Runtime API C-style interface that does not require compiling with nvcc.
 // cuda_runtime.h: CUDA Runtime API  C++-style interface built on top of the C API.
 //  It wraps some of the C API routines, using overloading, references and default arguments.
 //  These wrappers can be used from C++ code and can be compiled with any C++ compiler.
-//  The C++ API also has some CUDA-specific wrappers that wrap C API routines that deal with symbols, textures, and device functions.
-//  These wrappers require the use of \p nvcc because they depend on code being generated by the compiler.
-//  For example, the execution configuration syntax to invoke kernels is only available in source code compiled with nvcc.
-#include <cuda_runtime.h>
-#include <cuda.h>
-
-#include <array>
-#include <type_traits>
-#include <utility>
-#include <iostream>
-#include <string>
-#include <stdexcept>
-#include <cstddef>
-
-#if (!defined(CUDART_VERSION) || (CUDART_VERSION < 8000))
-    #error "CUDA version 8.0 or greater required!"
-#endif
+//  The C++ API also has some CUDA-specific wrappers that wrap C API routines that deal with symbols, textures, and
+//  device functions. These wrappers require the use of \p nvcc because they depend on code being generated by the
+//  compiler. For example, the execution configuration syntax to invoke kernels is only available in source code
+//  compiled with nvcc.
+#    include <cuda.h>
+#    include <cuda_runtime.h>
 
-#if (!defined(CUDA_VERSION) || (CUDA_VERSION < 8000))
-    #error "CUDA version 8.0 or greater required!"
-#endif
-
-namespace alpaka
-{
-    namespace cuda
-    {
-        namespace detail
-        {
-            //-----------------------------------------------------------------------------
-            //! CUDA runtime API error checking with log and exception, ignoring specific error values
-            ALPAKA_FN_HOST inline auto cudaRtCheck(
-                cudaError_t const & error,
-                char const * desc,
-                char const * file,
-                int const & line)
-            -> void
-            {
-                if(error != cudaSuccess)
-                {
-                    std::string const sError(std::string(file) + "(" + std::to_string(line) + ") " + std::string(desc) + " : '" + cudaGetErrorName(error) +  "': '" + std::string(cudaGetErrorString(error)) + "'!");
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
-                    std::cerr << sError << std::endl;
-#endif
-                    ALPAKA_DEBUG_BREAK;
-                    // reset the last error to allow user side error handling
-                    cudaGetLastError();
-                    throw std::runtime_error(sError);
-                }
-            }
-            //-----------------------------------------------------------------------------
-            //! CUDA runtime API error checking with log and exception, ignoring specific error values
-            // NOTE: All ignored errors have to be convertible to cudaError_t.
-            template<
-                typename... TErrors/*,
-                typename = typename std::enable_if<
-                    meta::Conjunction<
-                        std::true_type,
-                        std::is_convertible<
-                            TErrors,
-                            cudaError_t
-                        >...
-                    >::value>::type*/>
-            ALPAKA_FN_HOST auto cudaRtCheckIgnore(
-                cudaError_t const & error,
-                char const * cmd,
-                char const * file,
-                int const & line,
-                TErrors && ... ignoredErrorCodes)
-            -> void
-            {
-                if(error != cudaSuccess)
-                {
-                    // https://stackoverflow.com/questions/18792731/can-we-omit-the-double-braces-for-stdarray-in-c14/18792782#18792782
-                    std::array<cudaError_t, sizeof...(ignoredErrorCodes)> const aIgnoredErrorCodes{{ignoredErrorCodes...}};
+#    include <cstddef>
+#    include <stdexcept>
+#    include <string>
+#    include <type_traits>
+#    include <utility>
 
-                    // If the error code is not one of the ignored ones.
-                    if(std::find(aIgnoredErrorCodes.cbegin(), aIgnoredErrorCodes.cend(), error) == aIgnoredErrorCodes.cend())
-                    {
-                        cudaRtCheck(error, ("'" + std::string(cmd) + "' returned error ").c_str(), file, line);
-                    }
-                }
-            }
-            //-----------------------------------------------------------------------------
-            //! CUDA runtime API last error checking with log and exception.
-            ALPAKA_FN_HOST inline auto cudaRtCheckLastError(
-                char const * desc,
-                char const * file,
-                int const & line)
-            -> void
-            {
-                cudaError_t const error(cudaGetLastError());
-                cudaRtCheck(error, desc, file, line);
-            }
-        }
-    }
-}
+#    if(!defined(CUDART_VERSION) || (CUDART_VERSION < 9000))
+#        error "CUDA version 9.0 or greater required!"
+#    endif
 
-#if BOOST_COMP_MSVC
-    //-----------------------------------------------------------------------------
-    //! CUDA runtime error checking with log and exception, ignoring specific error values
-    #define ALPAKA_CUDA_RT_CHECK_IGNORE(cmd, ...)\
-        ::alpaka::cuda::detail::cudaRtCheckLastError("'" #cmd "' A previous CUDA call (not this one) set the error ", __FILE__, __LINE__);\
-        ::alpaka::cuda::detail::cudaRtCheckIgnore(cmd, #cmd, __FILE__, __LINE__, __VA_ARGS__)
-#else
-    #if BOOST_COMP_CLANG
-        #pragma clang diagnostic push
-        #pragma clang diagnostic ignored "-Wgnu-zero-variadic-macro-arguments"
-    #endif
-    //-----------------------------------------------------------------------------
-    //! CUDA runtime error checking with log and exception, ignoring specific error values
-    #define ALPAKA_CUDA_RT_CHECK_IGNORE(cmd, ...)\
-        ::alpaka::cuda::detail::cudaRtCheckLastError("'" #cmd "' A previous CUDA call (not this one) set the error ", __FILE__, __LINE__);\
-        ::alpaka::cuda::detail::cudaRtCheckIgnore(cmd, #cmd, __FILE__, __LINE__, ##__VA_ARGS__)
-    #if BOOST_COMP_CLANG
-        #pragma clang diagnostic pop
-    #endif
-#endif
+#    if(!defined(CUDA_VERSION) || (CUDA_VERSION < 9000))
+#        error "CUDA version 9.0 or greater required!"
+#    endif
 
-//-----------------------------------------------------------------------------
-//! CUDA runtime error checking with log and exception.
-#define ALPAKA_CUDA_RT_CHECK(cmd)\
-    ALPAKA_CUDA_RT_CHECK_IGNORE(cmd)
+#    define ALPAKA_PP_CONCAT_DO(X, Y) X##Y
+#    define ALPAKA_PP_CONCAT(X, Y) ALPAKA_PP_CONCAT_DO(X, Y)
+//! prefix a name with `cuda`
+#    define ALPAKA_API_PREFIX(name) ALPAKA_PP_CONCAT_DO(cuda, name)
 
 namespace alpaka
 {
@@ -161,49 +63,49 @@ namespace alpaka
             //-----------------------------------------------------------------------------
             //! CUDA driver API error checking with log and exception, ignoring specific error values
             ALPAKA_FN_HOST inline auto cudaDrvCheck(
-                CUresult const & error,
-                char const * desc,
-                char const * file,
-                int const & line)
-            -> void
+                CUresult const& error,
+                char const* desc,
+                char const* file,
+                int const& line) -> void
             {
                 if(error == CUDA_SUCCESS)
                     return;
 
-                char const * cu_err_name = nullptr;
-                char const * cu_err_string = nullptr;
+                char const* cu_err_name = nullptr;
+                char const* cu_err_string = nullptr;
                 CUresult cu_result_name = cuGetErrorName(error, &cu_err_name);
                 CUresult cu_result_string = cuGetErrorString(error, &cu_err_string);
-                std::string sError = std::string(file)
-                                   + "(" + std::to_string(line) + ") "
-                                   + std::string(desc) + " : '";
-                if( cu_result_name == CUDA_SUCCESS && cu_result_string == CUDA_SUCCESS )
+                std::string sError
+                    = std::string(file) + "(" + std::to_string(line) + ") " + std::string(desc) + " : '";
+                if(cu_result_name == CUDA_SUCCESS && cu_result_string == CUDA_SUCCESS)
+                {
+                    sError += std::string(cu_err_name) + "': '" + std::string(cu_err_string) + "'!";
+                }
+                else
                 {
-                    sError += std::string(cu_err_name) +  "': '"
-                            + std::string(cu_err_string) + "'!";
-                } else {
                     // cuGetError*() failed, so append corresponding error message
-                    if( cu_result_name == CUDA_ERROR_INVALID_VALUE ) {
+                    if(cu_result_name == CUDA_ERROR_INVALID_VALUE)
+                    {
                         sError += " cuGetErrorName: 'Invalid Value'!";
                     }
-                    if( cu_result_string == CUDA_ERROR_INVALID_VALUE ) {
+                    if(cu_result_string == CUDA_ERROR_INVALID_VALUE)
+                    {
                         sError += " cuGetErrorString: 'Invalid Value'!";
                     }
                 }
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
+#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
                 std::cerr << sError << std::endl;
-#endif
+#    endif
                 ALPAKA_DEBUG_BREAK;
                 throw std::runtime_error(sError);
             }
-        }
-    }
-}
+        } // namespace detail
+    } // namespace cuda
+} // namespace alpaka
 
 //-----------------------------------------------------------------------------
 //! CUDA driver error checking with log and exception.
-#define ALPAKA_CUDA_DRV_CHECK(cmd)\
-    ::alpaka::cuda::detail::cudaDrvCheck(cmd, #cmd, __FILE__, __LINE__)
+#    define ALPAKA_CUDA_DRV_CHECK(cmd) ::alpaka::cuda::detail::cudaDrvCheck(cmd, #    cmd, __FILE__, __LINE__)
 
 
 //-----------------------------------------------------------------------------
@@ -218,551 +120,373 @@ namespace alpaka
         {
             //#############################################################################
             //! The CUDA vectors 1D dimension get trait specialization.
-            template<
-                typename T>
-            struct IsCudaBuiltInType :
-                std::integral_constant<
-                    bool,
-                    std::is_same<T, char1>::value
-                    || std::is_same<T, double1>::value
-                    || std::is_same<T, float1>::value
-                    || std::is_same<T, int1>::value
-                    || std::is_same<T, long1>::value
-                    || std::is_same<T, longlong1>::value
-                    || std::is_same<T, short1>::value
-                    || std::is_same<T, uchar1>::value
-                    || std::is_same<T, uint1>::value
-                    || std::is_same<T, ulong1>::value
-                    || std::is_same<T, ulonglong1>::value
-                    || std::is_same<T, ushort1>::value
-                    || std::is_same<T, char2>::value
-                    || std::is_same<T, double2>::value
-                    || std::is_same<T, float2>::value
-                    || std::is_same<T, int2>::value
-                    || std::is_same<T, long2>::value
-                    || std::is_same<T, longlong2>::value
-                    || std::is_same<T, short2>::value
-                    || std::is_same<T, uchar2>::value
-                    || std::is_same<T, uint2>::value
-                    || std::is_same<T, ulong2>::value
-                    || std::is_same<T, ulonglong2>::value
-                    || std::is_same<T, ushort2>::value
-                    || std::is_same<T, char3>::value
-                    || std::is_same<T, dim3>::value
-                    || std::is_same<T, double3>::value
-                    || std::is_same<T, float3>::value
-                    || std::is_same<T, int3>::value
-                    || std::is_same<T, long3>::value
-                    || std::is_same<T, longlong3>::value
-                    || std::is_same<T, short3>::value
-                    || std::is_same<T, uchar3>::value
-                    || std::is_same<T, uint3>::value
-                    || std::is_same<T, ulong3>::value
-                    || std::is_same<T, ulonglong3>::value
-                    || std::is_same<T, ushort3>::value
-                    || std::is_same<T, char4>::value
-                    || std::is_same<T, double4>::value
-                    || std::is_same<T, float4>::value
-                    || std::is_same<T, int4>::value
-                    || std::is_same<T, long4>::value
-                    || std::is_same<T, longlong4>::value
-                    || std::is_same<T, short4>::value
-                    || std::is_same<T, uchar4>::value
-                    || std::is_same<T, uint4>::value
-                    || std::is_same<T, ulong4>::value
-                    || std::is_same<T, ulonglong4>::value
-                    || std::is_same<T, ushort4>::value
+            template<typename T>
+            struct IsCudaBuiltInType
+                : std::integral_constant<
+                      bool,
+                      std::is_same<T, char1>::value || std::is_same<T, double1>::value
+                          || std::is_same<T, float1>::value || std::is_same<T, int1>::value
+                          || std::is_same<T, long1>::value || std::is_same<T, longlong1>::value
+                          || std::is_same<T, short1>::value || std::is_same<T, uchar1>::value
+                          || std::is_same<T, uint1>::value || std::is_same<T, ulong1>::value
+                          || std::is_same<T, ulonglong1>::value || std::is_same<T, ushort1>::value
+                          || std::is_same<T, char2>::value || std::is_same<T, double2>::value
+                          || std::is_same<T, float2>::value || std::is_same<T, int2>::value
+                          || std::is_same<T, long2>::value || std::is_same<T, longlong2>::value
+                          || std::is_same<T, short2>::value || std::is_same<T, uchar2>::value
+                          || std::is_same<T, uint2>::value || std::is_same<T, ulong2>::value
+                          || std::is_same<T, ulonglong2>::value || std::is_same<T, ushort2>::value
+                          || std::is_same<T, char3>::value || std::is_same<T, dim3>::value
+                          || std::is_same<T, double3>::value || std::is_same<T, float3>::value
+                          || std::is_same<T, int3>::value || std::is_same<T, long3>::value
+                          || std::is_same<T, longlong3>::value || std::is_same<T, short3>::value
+                          || std::is_same<T, uchar3>::value || std::is_same<T, uint3>::value
+                          || std::is_same<T, ulong3>::value || std::is_same<T, ulonglong3>::value
+                          || std::is_same<T, ushort3>::value || std::is_same<T, char4>::value
+                          || std::is_same<T, double4>::value || std::is_same<T, float4>::value
+                          || std::is_same<T, int4>::value || std::is_same<T, long4>::value
+                          || std::is_same<T, longlong4>::value || std::is_same<T, short4>::value
+                          || std::is_same<T, uchar4>::value || std::is_same<T, uint4>::value
+                          || std::is_same<T, ulong4>::value || std::is_same<T, ulonglong4>::value
+                          || std::is_same<T, ushort4>::value
 // CUDA built-in variables have special types in clang native CUDA compilation
 // defined in cuda_builtin_vars.h
-#if BOOST_COMP_CLANG_CUDA
-                    || std::is_same<T, __cuda_builtin_threadIdx_t>::value
-                    || std::is_same<T, __cuda_builtin_blockIdx_t>::value
-                    || std::is_same<T, __cuda_builtin_blockDim_t>::value
-                    || std::is_same<T, __cuda_builtin_gridDim_t>::value
-#endif
-                >
-            {};
-        }
-    }
-    namespace dim
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CUDA vectors 1D dimension get trait specialization.
-            template<
-                typename T>
-            struct DimType<
-                T,
-                typename std::enable_if<
-                    std::is_same<T, char1>::value
-                    || std::is_same<T, double1>::value
-                    || std::is_same<T, float1>::value
-                    || std::is_same<T, int1>::value
-                    || std::is_same<T, long1>::value
-                    || std::is_same<T, longlong1>::value
-                    || std::is_same<T, short1>::value
-                    || std::is_same<T, uchar1>::value
-                    || std::is_same<T, uint1>::value
-                    || std::is_same<T, ulong1>::value
-                    || std::is_same<T, ulonglong1>::value
-                    || std::is_same<T, ushort1>::value
-                >::type>
-            {
-                using type = dim::DimInt<1u>;
-            };
-            //#############################################################################
-            //! The CUDA vectors 2D dimension get trait specialization.
-            template<
-                typename T>
-            struct DimType<
-                T,
-                typename std::enable_if<
-                    std::is_same<T, char2>::value
-                    || std::is_same<T, double2>::value
-                    || std::is_same<T, float2>::value
-                    || std::is_same<T, int2>::value
-                    || std::is_same<T, long2>::value
-                    || std::is_same<T, longlong2>::value
-                    || std::is_same<T, short2>::value
-                    || std::is_same<T, uchar2>::value
-                    || std::is_same<T, uint2>::value
-                    || std::is_same<T, ulong2>::value
-                    || std::is_same<T, ulonglong2>::value
-                    || std::is_same<T, ushort2>::value
-                >::type>
+#    if BOOST_COMP_CLANG_CUDA
+                          || std::is_same<T, __cuda_builtin_threadIdx_t>::value
+                          || std::is_same<T, __cuda_builtin_blockIdx_t>::value
+                          || std::is_same<T, __cuda_builtin_blockDim_t>::value
+                          || std::is_same<T, __cuda_builtin_gridDim_t>::value
+#    endif
+                      >
             {
-                using type = dim::DimInt<2u>;
             };
-            //#############################################################################
-            //! The CUDA vectors 3D dimension get trait specialization.
-            template<
-                typename T>
-            struct DimType<
-                T,
-                typename std::enable_if<
-                    std::is_same<T, char3>::value
-                    || std::is_same<T, dim3>::value
-                    || std::is_same<T, double3>::value
-                    || std::is_same<T, float3>::value
-                    || std::is_same<T, int3>::value
-                    || std::is_same<T, long3>::value
-                    || std::is_same<T, longlong3>::value
-                    || std::is_same<T, short3>::value
-                    || std::is_same<T, uchar3>::value
-                    || std::is_same<T, uint3>::value
-                    || std::is_same<T, ulong3>::value
-                    || std::is_same<T, ulonglong3>::value
-                    || std::is_same<T, ushort3>::value
-#if BOOST_COMP_CLANG_CUDA
-                    || std::is_same<T, __cuda_builtin_threadIdx_t>::value
-                    || std::is_same<T, __cuda_builtin_blockIdx_t>::value
-                    || std::is_same<T, __cuda_builtin_blockDim_t>::value
-                    || std::is_same<T, __cuda_builtin_gridDim_t>::value
-#endif
-                >::type>
-            {
-                using type = dim::DimInt<3u>;
-            };
-            //#############################################################################
-            //! The CUDA vectors 4D dimension get trait specialization.
-            template<
-                typename T>
-            struct DimType<
-                T,
-                typename std::enable_if<
-                    std::is_same<T, char4>::value
-                    || std::is_same<T, double4>::value
-                    || std::is_same<T, float4>::value
-                    || std::is_same<T, int4>::value
-                    || std::is_same<T, long4>::value
-                    || std::is_same<T, longlong4>::value
-                    || std::is_same<T, short4>::value
-                    || std::is_same<T, uchar4>::value
-                    || std::is_same<T, uint4>::value
-                    || std::is_same<T, ulong4>::value
-                    || std::is_same<T, ulonglong4>::value
-                    || std::is_same<T, ushort4>::value
-                >::type>
-            {
-                using type = dim::DimInt<4u>;
-            };
-        }
-    }
-    namespace elem
+        } // namespace traits
+    } // namespace cuda
+    namespace traits
     {
-        namespace traits
+        //#############################################################################
+        //! The CUDA vectors 1D dimension get trait specialization.
+        template<typename T>
+        struct DimType<
+            T,
+            std::enable_if_t<
+                std::is_same<T, char1>::value || std::is_same<T, double1>::value || std::is_same<T, float1>::value
+                || std::is_same<T, int1>::value || std::is_same<T, long1>::value || std::is_same<T, longlong1>::value
+                || std::is_same<T, short1>::value || std::is_same<T, uchar1>::value || std::is_same<T, uint1>::value
+                || std::is_same<T, ulong1>::value || std::is_same<T, ulonglong1>::value
+                || std::is_same<T, ushort1>::value>>
         {
-            //#############################################################################
-            //! The CUDA vectors elem type trait specialization.
-            template<
-                typename T>
-            struct ElemType<
-                T,
-                typename std::enable_if<
-                    cuda::traits::IsCudaBuiltInType<T>::value>::type>
-            {
-                using type = decltype(std::declval<T>().x);
-            };
-        }
-    }
+            using type = DimInt<1u>;
+        };
+        //#############################################################################
+        //! The CUDA vectors 2D dimension get trait specialization.
+        template<typename T>
+        struct DimType<
+            T,
+            std::enable_if_t<
+                std::is_same<T, char2>::value || std::is_same<T, double2>::value || std::is_same<T, float2>::value
+                || std::is_same<T, int2>::value || std::is_same<T, long2>::value || std::is_same<T, longlong2>::value
+                || std::is_same<T, short2>::value || std::is_same<T, uchar2>::value || std::is_same<T, uint2>::value
+                || std::is_same<T, ulong2>::value || std::is_same<T, ulonglong2>::value
+                || std::is_same<T, ushort2>::value>>
+        {
+            using type = DimInt<2u>;
+        };
+        //#############################################################################
+        //! The CUDA vectors 3D dimension get trait specialization.
+        template<typename T>
+        struct DimType<
+            T,
+            std::enable_if_t<
+                std::is_same<T, char3>::value || std::is_same<T, dim3>::value || std::is_same<T, double3>::value
+                || std::is_same<T, float3>::value || std::is_same<T, int3>::value || std::is_same<T, long3>::value
+                || std::is_same<T, longlong3>::value || std::is_same<T, short3>::value
+                || std::is_same<T, uchar3>::value || std::is_same<T, uint3>::value || std::is_same<T, ulong3>::value
+                || std::is_same<T, ulonglong3>::value || std::is_same<T, ushort3>::value
+#    if BOOST_COMP_CLANG_CUDA
+                || std::is_same<T, __cuda_builtin_threadIdx_t>::value
+                || std::is_same<T, __cuda_builtin_blockIdx_t>::value
+                || std::is_same<T, __cuda_builtin_blockDim_t>::value
+                || std::is_same<T, __cuda_builtin_gridDim_t>::value
+#    endif
+                >>
+        {
+            using type = DimInt<3u>;
+        };
+        //#############################################################################
+        //! The CUDA vectors 4D dimension get trait specialization.
+        template<typename T>
+        struct DimType<
+            T,
+            std::enable_if_t<
+                std::is_same<T, char4>::value || std::is_same<T, double4>::value || std::is_same<T, float4>::value
+                || std::is_same<T, int4>::value || std::is_same<T, long4>::value || std::is_same<T, longlong4>::value
+                || std::is_same<T, short4>::value || std::is_same<T, uchar4>::value || std::is_same<T, uint4>::value
+                || std::is_same<T, ulong4>::value || std::is_same<T, ulonglong4>::value
+                || std::is_same<T, ushort4>::value>>
+        {
+            using type = DimInt<4u>;
+        };
+
+        //#############################################################################
+        //! The CUDA vectors elem type trait specialization.
+        template<typename T>
+        struct ElemType<T, std::enable_if_t<cuda::traits::IsCudaBuiltInType<T>::value>>
+        {
+            using type = decltype(std::declval<T>().x);
+        };
+    } // namespace traits
     namespace extent
     {
         namespace traits
         {
             //#############################################################################
             //! The CUDA vectors extent get trait specialization.
-            template<
-                typename TExtent>
+            template<typename TExtent>
             struct GetExtent<
-                dim::DimInt<dim::Dim<TExtent>::value - 1u>,
+                DimInt<Dim<TExtent>::value - 1u>,
                 TExtent,
-                typename std::enable_if<
-                    cuda::traits::IsCudaBuiltInType<TExtent>::value
-                    && (dim::Dim<TExtent>::value >= 1)>::type>
+                std::enable_if_t<cuda::traits::IsCudaBuiltInType<TExtent>::value && (Dim<TExtent>::value >= 1)>>
             {
                 ALPAKA_NO_HOST_ACC_WARNING
-                ALPAKA_FN_HOST_ACC static auto getExtent(
-                    TExtent const & extent)
-                -> decltype(extent.x)
+                ALPAKA_FN_HOST_ACC static auto getExtent(TExtent const& extent)
                 {
                     return extent.x;
                 }
             };
             //#############################################################################
             //! The CUDA vectors extent get trait specialization.
-            template<
-                typename TExtent>
+            template<typename TExtent>
             struct GetExtent<
-                dim::DimInt<dim::Dim<TExtent>::value - 2u>,
+                DimInt<Dim<TExtent>::value - 2u>,
                 TExtent,
-                typename std::enable_if<
-                    cuda::traits::IsCudaBuiltInType<TExtent>::value
-                    && (dim::Dim<TExtent>::value >= 2)>::type>
+                std::enable_if_t<cuda::traits::IsCudaBuiltInType<TExtent>::value && (Dim<TExtent>::value >= 2)>>
             {
                 ALPAKA_NO_HOST_ACC_WARNING
-                ALPAKA_FN_HOST_ACC static auto getExtent(
-                    TExtent const & extent)
-                -> decltype(extent.y)
+                ALPAKA_FN_HOST_ACC static auto getExtent(TExtent const& extent)
                 {
                     return extent.y;
                 }
             };
             //#############################################################################
             //! The CUDA vectors extent get trait specialization.
-            template<
-                typename TExtent>
+            template<typename TExtent>
             struct GetExtent<
-                dim::DimInt<dim::Dim<TExtent>::value - 3u>,
+                DimInt<Dim<TExtent>::value - 3u>,
                 TExtent,
-                typename std::enable_if<
-                    cuda::traits::IsCudaBuiltInType<TExtent>::value
-                    && (dim::Dim<TExtent>::value >= 3)>::type>
+                std::enable_if_t<cuda::traits::IsCudaBuiltInType<TExtent>::value && (Dim<TExtent>::value >= 3)>>
             {
                 ALPAKA_NO_HOST_ACC_WARNING
-                ALPAKA_FN_HOST_ACC static auto getExtent(
-                    TExtent const & extent)
-                -> decltype(extent.z)
+                ALPAKA_FN_HOST_ACC static auto getExtent(TExtent const& extent)
                 {
                     return extent.z;
                 }
             };
             //#############################################################################
             //! The CUDA vectors extent get trait specialization.
-            template<
-                typename TExtent>
+            template<typename TExtent>
             struct GetExtent<
-                dim::DimInt<dim::Dim<TExtent>::value - 4u>,
+                DimInt<Dim<TExtent>::value - 4u>,
                 TExtent,
-                typename std::enable_if<
-                    cuda::traits::IsCudaBuiltInType<TExtent>::value
-                    && (dim::Dim<TExtent>::value >= 4)>::type>
+                std::enable_if_t<cuda::traits::IsCudaBuiltInType<TExtent>::value && (Dim<TExtent>::value >= 4)>>
             {
                 ALPAKA_NO_HOST_ACC_WARNING
-                ALPAKA_FN_HOST_ACC static auto getExtent(
-                    TExtent const & extent)
-                -> decltype(extent.w)
+                ALPAKA_FN_HOST_ACC static auto getExtent(TExtent const& extent)
                 {
                     return extent.w;
                 }
             };
             //#############################################################################
             //! The CUDA vectors extent set trait specialization.
-            template<
-                typename TExtent,
-                typename TExtentVal>
+            template<typename TExtent, typename TExtentVal>
             struct SetExtent<
-                dim::DimInt<dim::Dim<TExtent>::value - 1u>,
+                DimInt<Dim<TExtent>::value - 1u>,
                 TExtent,
                 TExtentVal,
-                typename std::enable_if<
-                    cuda::traits::IsCudaBuiltInType<TExtent>::value
-                    && (dim::Dim<TExtent>::value >= 1)>::type>
+                std::enable_if_t<cuda::traits::IsCudaBuiltInType<TExtent>::value && (Dim<TExtent>::value >= 1)>>
             {
                 ALPAKA_NO_HOST_ACC_WARNING
-                ALPAKA_FN_HOST_ACC static auto setExtent(
-                    TExtent const & extent,
-                    TExtentVal const & extentVal)
-                -> void
+                ALPAKA_FN_HOST_ACC static auto setExtent(TExtent const& extent, TExtentVal const& extentVal) -> void
                 {
                     extent.x = extentVal;
                 }
             };
             //#############################################################################
             //! The CUDA vectors extent set trait specialization.
-            template<
-                typename TExtent,
-                typename TExtentVal>
+            template<typename TExtent, typename TExtentVal>
             struct SetExtent<
-                dim::DimInt<dim::Dim<TExtent>::value - 2u>,
+                DimInt<Dim<TExtent>::value - 2u>,
                 TExtent,
                 TExtentVal,
-                typename std::enable_if<
-                    cuda::traits::IsCudaBuiltInType<TExtent>::value
-                    && (dim::Dim<TExtent>::value >= 2)>::type>
+                std::enable_if_t<cuda::traits::IsCudaBuiltInType<TExtent>::value && (Dim<TExtent>::value >= 2)>>
             {
                 ALPAKA_NO_HOST_ACC_WARNING
-                ALPAKA_FN_HOST_ACC static auto setExtent(
-                    TExtent const & extent,
-                    TExtentVal const & extentVal)
-                -> void
+                ALPAKA_FN_HOST_ACC static auto setExtent(TExtent const& extent, TExtentVal const& extentVal) -> void
                 {
                     extent.y = extentVal;
                 }
             };
             //#############################################################################
             //! The CUDA vectors extent set trait specialization.
-            template<
-                typename TExtent,
-                typename TExtentVal>
+            template<typename TExtent, typename TExtentVal>
             struct SetExtent<
-                dim::DimInt<dim::Dim<TExtent>::value - 3u>,
+                DimInt<Dim<TExtent>::value - 3u>,
                 TExtent,
                 TExtentVal,
-                typename std::enable_if<
-                    cuda::traits::IsCudaBuiltInType<TExtent>::value
-                    && (dim::Dim<TExtent>::value >= 3)>::type>
+                std::enable_if_t<cuda::traits::IsCudaBuiltInType<TExtent>::value && (Dim<TExtent>::value >= 3)>>
             {
                 ALPAKA_NO_HOST_ACC_WARNING
-                ALPAKA_FN_HOST_ACC static auto setExtent(
-                    TExtent const & extent,
-                    TExtentVal const & extentVal)
-                -> void
+                ALPAKA_FN_HOST_ACC static auto setExtent(TExtent const& extent, TExtentVal const& extentVal) -> void
                 {
                     extent.z = extentVal;
                 }
             };
             //#############################################################################
             //! The CUDA vectors extent set trait specialization.
-            template<
-                typename TExtent,
-                typename TExtentVal>
+            template<typename TExtent, typename TExtentVal>
             struct SetExtent<
-                dim::DimInt<dim::Dim<TExtent>::value - 4u>,
+                DimInt<Dim<TExtent>::value - 4u>,
                 TExtent,
                 TExtentVal,
-                typename std::enable_if<
-                    cuda::traits::IsCudaBuiltInType<TExtent>::value
-                    && (dim::Dim<TExtent>::value >= 4)>::type>
+                std::enable_if_t<cuda::traits::IsCudaBuiltInType<TExtent>::value && (Dim<TExtent>::value >= 4)>>
             {
                 ALPAKA_NO_HOST_ACC_WARNING
-                ALPAKA_FN_HOST_ACC static auto setExtent(
-                    TExtent const & extent,
-                    TExtentVal const & extentVal)
-                -> void
+                ALPAKA_FN_HOST_ACC static auto setExtent(TExtent const& extent, TExtentVal const& extentVal) -> void
                 {
                     extent.w = extentVal;
                 }
             };
-        }
-    }
-    namespace offset
+        } // namespace traits
+    } // namespace extent
+    namespace traits
     {
-        namespace traits
+        //#############################################################################
+        //! The CUDA vectors offset get trait specialization.
+        template<typename TOffsets>
+        struct GetOffset<
+            DimInt<Dim<TOffsets>::value - 1u>,
+            TOffsets,
+            std::enable_if_t<cuda::traits::IsCudaBuiltInType<TOffsets>::value && (Dim<TOffsets>::value >= 1)>>
         {
-            //#############################################################################
-            //! The CUDA vectors offset get trait specialization.
-            template<
-                typename TOffsets>
-            struct GetOffset<
-                dim::DimInt<dim::Dim<TOffsets>::value - 1u>,
-                TOffsets,
-                typename std::enable_if<
-                    cuda::traits::IsCudaBuiltInType<TOffsets>::value
-                    && (dim::Dim<TOffsets>::value >= 1)>::type>
-            {
-                ALPAKA_NO_HOST_ACC_WARNING
-                ALPAKA_FN_HOST_ACC static auto getOffset(
-                    TOffsets const & offsets)
-                -> decltype(offsets.x)
-                {
-                    return offsets.x;
-                }
-            };
-            //#############################################################################
-            //! The CUDA vectors offset get trait specialization.
-            template<
-                typename TOffsets>
-            struct GetOffset<
-                dim::DimInt<dim::Dim<TOffsets>::value - 2u>,
-                TOffsets,
-                typename std::enable_if<
-                    cuda::traits::IsCudaBuiltInType<TOffsets>::value
-                    && (dim::Dim<TOffsets>::value >= 2)>::type>
+            ALPAKA_NO_HOST_ACC_WARNING
+            ALPAKA_FN_HOST_ACC static auto getOffset(TOffsets const& offsets)
             {
-                ALPAKA_NO_HOST_ACC_WARNING
-                ALPAKA_FN_HOST_ACC static auto getOffset(
-                    TOffsets const & offsets)
-                -> decltype(offsets.y)
-                {
-                    return offsets.y;
-                }
-            };
-            //#############################################################################
-            //! The CUDA vectors offset get trait specialization.
-            template<
-                typename TOffsets>
-            struct GetOffset<
-                dim::DimInt<dim::Dim<TOffsets>::value - 3u>,
-                TOffsets,
-                typename std::enable_if<
-                    cuda::traits::IsCudaBuiltInType<TOffsets>::value
-                    && (dim::Dim<TOffsets>::value >= 3)>::type>
+                return offsets.x;
+            }
+        };
+        //#############################################################################
+        //! The CUDA vectors offset get trait specialization.
+        template<typename TOffsets>
+        struct GetOffset<
+            DimInt<Dim<TOffsets>::value - 2u>,
+            TOffsets,
+            std::enable_if_t<cuda::traits::IsCudaBuiltInType<TOffsets>::value && (Dim<TOffsets>::value >= 2)>>
+        {
+            ALPAKA_NO_HOST_ACC_WARNING
+            ALPAKA_FN_HOST_ACC static auto getOffset(TOffsets const& offsets)
             {
-                ALPAKA_NO_HOST_ACC_WARNING
-                ALPAKA_FN_HOST_ACC static auto getOffset(
-                    TOffsets const & offsets)
-                -> decltype(offsets.z)
-                {
-                    return offsets.z;
-                }
-            };
-            //#############################################################################
-            //! The CUDA vectors offset get trait specialization.
-            template<
-                typename TOffsets>
-            struct GetOffset<
-                dim::DimInt<dim::Dim<TOffsets>::value - 4u>,
-                TOffsets,
-                typename std::enable_if<
-                    cuda::traits::IsCudaBuiltInType<TOffsets>::value
-                    && (dim::Dim<TOffsets>::value >= 4)>::type>
+                return offsets.y;
+            }
+        };
+        //#############################################################################
+        //! The CUDA vectors offset get trait specialization.
+        template<typename TOffsets>
+        struct GetOffset<
+            DimInt<Dim<TOffsets>::value - 3u>,
+            TOffsets,
+            std::enable_if_t<cuda::traits::IsCudaBuiltInType<TOffsets>::value && (Dim<TOffsets>::value >= 3)>>
+        {
+            ALPAKA_NO_HOST_ACC_WARNING
+            ALPAKA_FN_HOST_ACC static auto getOffset(TOffsets const& offsets)
             {
-                ALPAKA_NO_HOST_ACC_WARNING
-                ALPAKA_FN_HOST_ACC static auto getOffset(
-                    TOffsets const & offsets)
-                -> decltype(offsets.w)
-                {
-                    return offsets.w;
-                }
-            };
-            //#############################################################################
-            //! The CUDA vectors offset set trait specialization.
-            template<
-                typename TOffsets,
-                typename TOffset>
-            struct SetOffset<
-                dim::DimInt<dim::Dim<TOffsets>::value - 1u>,
-                TOffsets,
-                TOffset,
-                typename std::enable_if<
-                    cuda::traits::IsCudaBuiltInType<TOffsets>::value
-                    && (dim::Dim<TOffsets>::value >= 1)>::type>
+                return offsets.z;
+            }
+        };
+        //#############################################################################
+        //! The CUDA vectors offset get trait specialization.
+        template<typename TOffsets>
+        struct GetOffset<
+            DimInt<Dim<TOffsets>::value - 4u>,
+            TOffsets,
+            std::enable_if_t<cuda::traits::IsCudaBuiltInType<TOffsets>::value && (Dim<TOffsets>::value >= 4)>>
+        {
+            ALPAKA_NO_HOST_ACC_WARNING
+            ALPAKA_FN_HOST_ACC static auto getOffset(TOffsets const& offsets)
             {
-                ALPAKA_NO_HOST_ACC_WARNING
-                ALPAKA_FN_HOST_ACC static auto setOffset(
-                    TOffsets const & offsets,
-                    TOffset const & offset)
-                -> void
-                {
-                    offsets.x = offset;
-                }
-            };
-            //#############################################################################
-            //! The CUDA vectors offset set trait specialization.
-            template<
-                typename TOffsets,
-                typename TOffset>
-            struct SetOffset<
-                dim::DimInt<dim::Dim<TOffsets>::value - 2u>,
-                TOffsets,
-                TOffset,
-                typename std::enable_if<
-                    cuda::traits::IsCudaBuiltInType<TOffsets>::value
-                    && (dim::Dim<TOffsets>::value >= 2)>::type>
+                return offsets.w;
+            }
+        };
+        //#############################################################################
+        //! The CUDA vectors offset set trait specialization.
+        template<typename TOffsets, typename TOffset>
+        struct SetOffset<
+            DimInt<Dim<TOffsets>::value - 1u>,
+            TOffsets,
+            TOffset,
+            std::enable_if_t<cuda::traits::IsCudaBuiltInType<TOffsets>::value && (Dim<TOffsets>::value >= 1)>>
+        {
+            ALPAKA_NO_HOST_ACC_WARNING
+            ALPAKA_FN_HOST_ACC static auto setOffset(TOffsets const& offsets, TOffset const& offset) -> void
             {
-                ALPAKA_NO_HOST_ACC_WARNING
-                ALPAKA_FN_HOST_ACC static auto setOffset(
-                    TOffsets const & offsets,
-                    TOffset const & offset)
-                -> void
-                {
-                    offsets.y = offset;
-                }
-            };
-            //#############################################################################
-            //! The CUDA vectors offset set trait specialization.
-            template<
-                typename TOffsets,
-                typename TOffset>
-            struct SetOffset<
-                dim::DimInt<dim::Dim<TOffsets>::value - 3u>,
-                TOffsets,
-                TOffset,
-                typename std::enable_if<
-                    cuda::traits::IsCudaBuiltInType<TOffsets>::value
-                    && (dim::Dim<TOffsets>::value >= 3)>::type>
+                offsets.x = offset;
+            }
+        };
+        //#############################################################################
+        //! The CUDA vectors offset set trait specialization.
+        template<typename TOffsets, typename TOffset>
+        struct SetOffset<
+            DimInt<Dim<TOffsets>::value - 2u>,
+            TOffsets,
+            TOffset,
+            std::enable_if_t<cuda::traits::IsCudaBuiltInType<TOffsets>::value && (Dim<TOffsets>::value >= 2)>>
+        {
+            ALPAKA_NO_HOST_ACC_WARNING
+            ALPAKA_FN_HOST_ACC static auto setOffset(TOffsets const& offsets, TOffset const& offset) -> void
             {
-                ALPAKA_NO_HOST_ACC_WARNING
-                ALPAKA_FN_HOST_ACC static auto setOffset(
-                    TOffsets const & offsets,
-                    TOffset const & offset)
-                -> void
-                {
-                    offsets.z = offset;
-                }
-            };
-            //#############################################################################
-            //! The CUDA vectors offset set trait specialization.
-            template<
-                typename TOffsets,
-                typename TOffset>
-            struct SetOffset<
-                dim::DimInt<dim::Dim<TOffsets>::value - 4u>,
-                TOffsets,
-                TOffset,
-                typename std::enable_if<
-                    cuda::traits::IsCudaBuiltInType<TOffsets>::value
-                    && (dim::Dim<TOffsets>::value >= 4)>::type>
+                offsets.y = offset;
+            }
+        };
+        //#############################################################################
+        //! The CUDA vectors offset set trait specialization.
+        template<typename TOffsets, typename TOffset>
+        struct SetOffset<
+            DimInt<Dim<TOffsets>::value - 3u>,
+            TOffsets,
+            TOffset,
+            std::enable_if_t<cuda::traits::IsCudaBuiltInType<TOffsets>::value && (Dim<TOffsets>::value >= 3)>>
+        {
+            ALPAKA_NO_HOST_ACC_WARNING
+            ALPAKA_FN_HOST_ACC static auto setOffset(TOffsets const& offsets, TOffset const& offset) -> void
             {
-                ALPAKA_NO_HOST_ACC_WARNING
-                ALPAKA_FN_HOST_ACC static auto setOffset(
-                    TOffsets const & offsets,
-                    TOffset const & offset)
-                -> void
-                {
-                    offsets.w = offset;
-                }
-            };
-        }
-    }
-    namespace idx
-    {
-        namespace traits
+                offsets.z = offset;
+            }
+        };
+        //#############################################################################
+        //! The CUDA vectors offset set trait specialization.
+        template<typename TOffsets, typename TOffset>
+        struct SetOffset<
+            DimInt<Dim<TOffsets>::value - 4u>,
+            TOffsets,
+            TOffset,
+            std::enable_if_t<cuda::traits::IsCudaBuiltInType<TOffsets>::value && (Dim<TOffsets>::value >= 4)>>
         {
-            //#############################################################################
-            //! The CUDA vectors idx type trait specialization.
-            template<
-                typename TIdx>
-            struct IdxType<
-                TIdx,
-                typename std::enable_if<
-                    cuda::traits::IsCudaBuiltInType<TIdx>::value>::type>
+            ALPAKA_NO_HOST_ACC_WARNING
+            ALPAKA_FN_HOST_ACC static auto setOffset(TOffsets const& offsets, TOffset const& offset) -> void
             {
-                using type = std::size_t;
-            };
-        }
-    }
-}
+                offsets.w = offset;
+            }
+        };
+
+        //#############################################################################
+        //! The CUDA vectors idx type trait specialization.
+        template<typename TIdx>
+        struct IdxType<TIdx, std::enable_if_t<cuda::traits::IsCudaBuiltInType<TIdx>::value>>
+        {
+            using type = std::size_t;
+        };
+    } // namespace traits
+} // namespace alpaka
+
+#    include <alpaka/core/UniformCudaHip.hpp>
 
 #endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/core/Debug.hpp b/thirdParty/cupla/alpaka/include/alpaka/core/Debug.hpp
index def86d5e56..04c40c4a14 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/core/Debug.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/core/Debug.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Alexander Matthes, Benjamin Worpitz
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -11,8 +11,8 @@
 
 #include <alpaka/core/BoostPredef.hpp>
 
-#include <string>
 #include <iostream>
+#include <string>
 
 //-----------------------------------------------------------------------------
 //! The no debug level.
@@ -25,9 +25,9 @@
 #define ALPAKA_DEBUG_FULL 2
 
 #ifndef ALPAKA_DEBUG
-    //-----------------------------------------------------------------------------
-    //! Set the minimum log level if it is not defined.
-    #define ALPAKA_DEBUG ALPAKA_DEBUG_DISABLED
+//-----------------------------------------------------------------------------
+//! Set the minimum log level if it is not defined.
+#    define ALPAKA_DEBUG ALPAKA_DEBUG_DISABLED
 #endif
 
 namespace alpaka
@@ -42,20 +42,18 @@ namespace alpaka
             {
             public:
                 //-----------------------------------------------------------------------------
-                ScopeLogStdOut(
-                    std::string const & sScope) :
-                        m_sScope(sScope)
+                explicit ScopeLogStdOut(std::string const& sScope) : m_sScope(sScope)
                 {
                     std::cout << "[+] " << m_sScope << std::endl;
                 }
                 //-----------------------------------------------------------------------------
-                ScopeLogStdOut(ScopeLogStdOut const &) = delete;
+                ScopeLogStdOut(ScopeLogStdOut const&) = delete;
                 //-----------------------------------------------------------------------------
-                ScopeLogStdOut(ScopeLogStdOut &&) = delete;
+                ScopeLogStdOut(ScopeLogStdOut&&) = delete;
                 //-----------------------------------------------------------------------------
-                auto operator=(ScopeLogStdOut const &) -> ScopeLogStdOut & = delete;
+                auto operator=(ScopeLogStdOut const&) -> ScopeLogStdOut& = delete;
                 //-----------------------------------------------------------------------------
-                auto operator=(ScopeLogStdOut &&) -> ScopeLogStdOut & = delete;
+                auto operator=(ScopeLogStdOut&&) -> ScopeLogStdOut& = delete;
                 //-----------------------------------------------------------------------------
                 ~ScopeLogStdOut()
                 {
@@ -65,41 +63,39 @@ namespace alpaka
             private:
                 std::string const m_sScope;
             };
-        }
-    }
-}
+        } // namespace detail
+    } // namespace core
+} // namespace alpaka
 
 //-----------------------------------------------------------------------------
 // Define ALPAKA_DEBUG_MINIMAL_LOG_SCOPE.
 #if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
-    #define ALPAKA_DEBUG_MINIMAL_LOG_SCOPE\
-        ::alpaka::core::detail::ScopeLogStdOut const scopeLogStdOut(__func__)
+#    define ALPAKA_DEBUG_MINIMAL_LOG_SCOPE ::alpaka::core::detail::ScopeLogStdOut const scopeLogStdOut(__func__)
 #else
-    #define ALPAKA_DEBUG_MINIMAL_LOG_SCOPE
+#    define ALPAKA_DEBUG_MINIMAL_LOG_SCOPE
 #endif
 
 //-----------------------------------------------------------------------------
 // Define ALPAKA_DEBUG_FULL_LOG_SCOPE.
 #if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-    #define ALPAKA_DEBUG_FULL_LOG_SCOPE\
-        ::alpaka::core::detail::ScopeLogStdOut const scopeLogStdOut(__func__)
+#    define ALPAKA_DEBUG_FULL_LOG_SCOPE ::alpaka::core::detail::ScopeLogStdOut const scopeLogStdOut(__func__)
 #else
-    #define ALPAKA_DEBUG_FULL_LOG_SCOPE
+#    define ALPAKA_DEBUG_FULL_LOG_SCOPE
 #endif
 
 //-----------------------------------------------------------------------------
 // Define ALPAKA_DEBUG_BREAK.
 #if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
-    #if BOOST_COMP_GNUC
-        #define ALPAKA_DEBUG_BREAK ::__builtin_trap()
-    #elif BOOST_COMP_INTEL
-        #define ALPAKA_DEBUG_BREAK ::__debugbreak()
-    #elif BOOST_COMP_MSVC
-        #define ALPAKA_DEBUG_BREAK ::__debugbreak()
-    #else
-        #define ALPAKA_DEBUG_BREAK
-        //#error debug-break for current compiler not implemented!
-    #endif
+#    if BOOST_COMP_GNUC
+#        define ALPAKA_DEBUG_BREAK ::__builtin_trap()
+#    elif BOOST_COMP_INTEL
+#        define ALPAKA_DEBUG_BREAK ::__debugbreak()
+#    elif BOOST_COMP_MSVC
+#        define ALPAKA_DEBUG_BREAK ::__debugbreak()
+#    else
+#        define ALPAKA_DEBUG_BREAK
+  //#error debug-break for current compiler not implemented!
+#    endif
 #else
-    #define ALPAKA_DEBUG_BREAK
+#    define ALPAKA_DEBUG_BREAK
 #endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/core/Decay.hpp b/thirdParty/cupla/alpaka/include/alpaka/core/Decay.hpp
new file mode 100644
index 0000000000..7a04c91d66
--- /dev/null
+++ b/thirdParty/cupla/alpaka/include/alpaka/core/Decay.hpp
@@ -0,0 +1,33 @@
+/* Copyright 2020 Sergei Bastrakov
+ *
+ * This file is part of alpaka.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#pragma once
+
+#include <alpaka/core/BoostPredef.hpp>
+
+#include <type_traits>
+
+//-----------------------------------------------------------------------------
+//! Wrapper around std::decay_t for parameter pack expansion expressions
+//
+// Works around Intel compiler internal error when used in empty template pack
+// expansion as discussed in #995. It seems not possible to make a workaround
+// with pure C++ tools, like an alias template, so macro it is. Note that
+// there is no known issue outside of empty parameter pack expansions,
+// so the normal std::decay_t can and should be used there.
+//
+// The choice of macro over writing typename std::decay<Type>::type explicitly
+// in parameter pack expansion expressions is to avoid warnings from diagnostic
+// tools, and also for brevity.
+//-----------------------------------------------------------------------------
+#if BOOST_COMP_INTEL || BOOST_COMP_PGI
+#    define ALPAKA_DECAY_T(Type) typename std::decay<Type>::type
+#else
+#    define ALPAKA_DECAY_T(Type) std::decay_t<Type>
+#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/core/Fibers.hpp b/thirdParty/cupla/alpaka/include/alpaka/core/Fibers.hpp
index a9a156b19c..66b45c1ff7 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/core/Fibers.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/core/Fibers.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Benjamin Worpitz
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -11,33 +11,38 @@
 
 #ifdef ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLED
 
-#include <alpaka/core/BoostPredef.hpp>
-
-#if BOOST_COMP_MSVC
-    #pragma warning(push)
-
-    #pragma warning(disable: 4100)  // boost/context/detail/apply.hpp(31): warning C4100: "tpl": unreferenced formal parameter
-    #pragma warning(disable: 4245)  // boost/fiber/detail/futex.hpp(52): warning C4245: 'argument': conversion from 'int' to 'DWORD', signed/unsigned mismatch
-    #pragma warning(disable: 4324)  // boost/fiber/detail/context_mpsc_queue.hpp(41): warning C4324: 'boost::fibers::detail::context_mpsc_queue': structure was padded due to alignment specifier
-    #pragma warning(disable: 4456)  // boost/context/execution_context_v2.hpp(301): warning C4456: declaration of 'p' hides previous local declaration
-    #pragma warning(disable: 4702)  // boost/context/execution_context_v2.hpp(49): warning C4702: unreachable code
-    // Boost.Fiber indirectly includes windows.h for which we need to define some things.
-    #define NOMINMAX
-#endif
+#    include <alpaka/core/BoostPredef.hpp>
+
+#    if BOOST_COMP_MSVC
+#        pragma warning(push)
+
+#        pragma warning(disable : 4100) // boost/context/detail/apply.hpp(31): warning C4100: "tpl": unreferenced
+                                        // formal parameter
+#        pragma warning(disable : 4245) // boost/fiber/detail/futex.hpp(52): warning C4245: 'argument': conversion from
+                                        // 'int' to 'DWORD', signed/unsigned mismatch
+#        pragma warning(disable : 4324) // boost/fiber/detail/context_mpsc_queue.hpp(41): warning C4324:
+                                        // 'boost::fibers::detail::context_mpsc_queue': structure was padded due to
+                                        // alignment specifier
+#        pragma warning(disable : 4456) // boost/context/execution_context_v2.hpp(301): warning C4456: declaration of
+                                        // 'p' hides previous local declaration
+#        pragma warning(disable : 4702) // boost/context/execution_context_v2.hpp(49): warning C4702: unreachable code
+// Boost.Fiber indirectly includes windows.h for which we need to define some things.
+#        define NOMINMAX
+#    endif
 
 // Boost fiber:
 // http://www.boost.org/doc/libs/develop/libs/fiber/doc/html/index.html
 // https://github.com/boostorg/fiber
-#include <boost/fiber/fiber.hpp>
-#include <boost/fiber/operations.hpp>
-#include <boost/fiber/condition_variable.hpp>
-#include <boost/fiber/mutex.hpp>
-#include <boost/fiber/future.hpp>
-#include <boost/fiber/barrier.hpp>
-
-#if BOOST_COMP_MSVC
-    #undef NOMINMAX
-    #pragma warning(pop)
-#endif
+#    include <boost/fiber/barrier.hpp>
+#    include <boost/fiber/condition_variable.hpp>
+#    include <boost/fiber/fiber.hpp>
+#    include <boost/fiber/future.hpp>
+#    include <boost/fiber/mutex.hpp>
+#    include <boost/fiber/operations.hpp>
+
+#    if BOOST_COMP_MSVC
+#        undef NOMINMAX
+#        pragma warning(pop)
+#    endif
 
 #endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/core/Hip.hpp b/thirdParty/cupla/alpaka/include/alpaka/core/Hip.hpp
index 056f802c7d..b059c94bc5 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/core/Hip.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/core/Hip.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Axel Huebl, Benjamin Worpitz, Matthias Werner, René Widera
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -11,132 +11,33 @@
 
 #ifdef ALPAKA_ACC_GPU_HIP_ENABLED
 
-#include <alpaka/core/BoostPredef.hpp>
+#    include <alpaka/core/BoostPredef.hpp>
 
-#if !BOOST_LANG_HIP
-    #error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
-#endif
+#    if !BOOST_LANG_HIP
+#        error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
+#    endif
 
-#include <alpaka/elem/Traits.hpp>
-#include <alpaka/offset/Traits.hpp>
-#include <alpaka/extent/Traits.hpp>
-#include <alpaka/idx/Traits.hpp>
-#include <alpaka/vec/Vec.hpp>
-#include <alpaka/meta/IntegerSequence.hpp>
-#include <alpaka/meta/Metafunctions.hpp>
+#    include <alpaka/elem/Traits.hpp>
+#    include <alpaka/extent/Traits.hpp>
+#    include <alpaka/idx/Traits.hpp>
+#    include <alpaka/meta/Metafunctions.hpp>
+#    include <alpaka/offset/Traits.hpp>
+#    include <alpaka/vec/Vec.hpp>
 
-#include <hip/hip_runtime.h>
+#    include <hip/hip_runtime.h>
 
-#include <array>
-#include <type_traits>
-#include <utility>
-#include <iostream>
-#include <string>
-#include <stdexcept>
-#include <cstddef>
+#    include <cstddef>
+#    include <type_traits>
+#    include <utility>
 
-#ifdef __HIP_PLATFORM_HCC__
-  #define HIPRT_CB
-#endif
+#    if BOOST_COMP_HIP
+#        define HIPRT_CB
+#    endif
 
-
-namespace alpaka
-{
-    namespace hip
-    {
-        namespace detail
-        {
-            //-----------------------------------------------------------------------------
-            //! HIP runtime API error checking with log and exception, ignoring specific error values
-            ALPAKA_FN_HOST inline auto hipRtCheck(
-                hipError_t const & error,
-                char const * desc,
-                char const * file,
-                int const & line)
-            -> void
-            {
-                if(error != hipSuccess)
-                {
-                    std::string const sError(std::string(file) + "(" + std::to_string(line) + ") " + std::string(desc) + " : '" + hipGetErrorName(error) +  "': '" + std::string(hipGetErrorString(error)) + "'!");
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
-                    std::cerr << sError << std::endl;
-#endif
-                    ALPAKA_DEBUG_BREAK;
-                    throw std::runtime_error(sError);
-                }
-            }
-            //-----------------------------------------------------------------------------
-            //! HIP runtime API error checking with log and exception, ignoring specific error values
-            // NOTE: All ignored errors have to be convertible to hipError_t.
-            template<
-                typename... TErrors/*,
-                typename = typename std::enable_if<
-                    meta::Conjunction<
-                        std::true_type,
-                        std::is_convertible<
-                            TErrors,
-                            hipError_t
-                        >...
-                    >::value>::type*/>
-            ALPAKA_FN_HOST auto hipRtCheckIgnore(
-                hipError_t const & error,
-                char const * cmd,
-                char const * file,
-                int const & line,
-                TErrors && ... ignoredErrorCodes)
-            -> void
-            {
-                if(error != hipSuccess)
-                {
-                    // https://stackoverflow.com/questions/18792731/can-we-omit-the-double-braces-for-stdarray-in-c14/18792782#18792782
-                    std::array<hipError_t, sizeof...(ignoredErrorCodes)> const aIgnoredErrorCodes{{ignoredErrorCodes...}};
-                    // If the error code is not one of the ignored ones.
-                    if(std::find(aIgnoredErrorCodes.cbegin(), aIgnoredErrorCodes.cend(), error) == aIgnoredErrorCodes.cend())
-                    {
-                        hipRtCheck(error, ("'" + std::string(cmd) + "' returned error ").c_str(), file, line);
-                    }
-                }
-            }
-            //-----------------------------------------------------------------------------
-            //! HIP runtime API last error checking with log and exception.
-            ALPAKA_FN_HOST inline auto hipRtCheckLastError(
-                char const * desc,
-                char const * file,
-                int const & line)
-            -> void
-            {
-                hipError_t const error(hipGetLastError());
-                hipRtCheck(error, desc, file, line);
-            }
-        }
-    }
-}
-
-#if BOOST_COMP_MSVC
-    //-----------------------------------------------------------------------------
-    //! HIP runtime error checking with log and exception, ignoring specific error values
-    #define ALPAKA_HIP_RT_CHECK_IGNORE(cmd, ...)\
-        ::alpaka::hip::detail::hipRtCheckLastError("'" #cmd "' A previous HIP call (not this one) set the error ", __FILE__, __LINE__);\
-        ::alpaka::hip::detail::hipRtCheckIgnore(cmd, #cmd, __FILE__, __LINE__, __VA_ARGS__)
-#else
-    #if BOOST_COMP_CLANG
-        #pragma clang diagnostic push
-        #pragma clang diagnostic ignored "-Wgnu-zero-variadic-macro-arguments"
-    #endif
-    //-----------------------------------------------------------------------------
-    //! HIP runtime error checking with log and exception, ignoring specific error values
-    #define ALPAKA_HIP_RT_CHECK_IGNORE(cmd, ...)\
-        ::alpaka::hip::detail::hipRtCheckLastError("'" #cmd "' A previous HIP call (not this one) set the error ", __FILE__, __LINE__);\
-        ::alpaka::hip::detail::hipRtCheckIgnore(cmd, #cmd, __FILE__, __LINE__, ##__VA_ARGS__)
-    #if BOOST_COMP_CLANG
-        #pragma clang diagnostic pop
-    #endif
-#endif
-
-//-----------------------------------------------------------------------------
-//! HIP runtime error checking with log and exception.
-#define ALPAKA_HIP_RT_CHECK(cmd)\
-    ALPAKA_HIP_RT_CHECK_IGNORE(cmd)
+#    define ALPAKA_PP_CONCAT_DO(X, Y) X##Y
+#    define ALPAKA_PP_CONCAT(X, Y) ALPAKA_PP_CONCAT_DO(X, Y)
+//! prefix a name with `hip`
+#    define ALPAKA_API_PREFIX(name) ALPAKA_PP_CONCAT_DO(hip, name)
 
 //-----------------------------------------------------------------------------
 // HIP vector_types.h trait specializations.
@@ -150,576 +51,388 @@ namespace alpaka
         {
             //#############################################################################
             //! The HIP vectors 1D dimension get trait specialization.
-            template<
-                typename T>
-            struct IsHipBuiltInType :
-                std::integral_constant<
-                    bool,
-                    std::is_same<T, char1>::value
-                    || std::is_same<T, double1>::value
-                    || std::is_same<T, float1>::value
-                    || std::is_same<T, int1>::value
-                    || std::is_same<T, long1>::value
-                    || std::is_same<T, longlong1>::value
-                    || std::is_same<T, short1>::value
-                    || std::is_same<T, uchar1>::value
-                    || std::is_same<T, uint1>::value
-                    || std::is_same<T, ulong1>::value
-                    || std::is_same<T, ulonglong1>::value
-                    || std::is_same<T, ushort1>::value
-                    || std::is_same<T, char2>::value
-                    || std::is_same<T, double2>::value
-                    || std::is_same<T, float2>::value
-                    || std::is_same<T, int2>::value
-                    || std::is_same<T, long2>::value
-                    || std::is_same<T, longlong2>::value
-                    || std::is_same<T, short2>::value
-                    || std::is_same<T, uchar2>::value
-                    || std::is_same<T, uint2>::value
-                    || std::is_same<T, ulong2>::value
-                    || std::is_same<T, ulonglong2>::value
-                    || std::is_same<T, ushort2>::value
-                    || std::is_same<T, char3>::value
-                    || std::is_same<T, dim3>::value
-                    || std::is_same<T, double3>::value
-                    || std::is_same<T, float3>::value
-                    || std::is_same<T, int3>::value
-                    || std::is_same<T, long3>::value
-                    || std::is_same<T, longlong3>::value
-                    || std::is_same<T, short3>::value
-                    || std::is_same<T, uchar3>::value
-                    || std::is_same<T, uint3>::value
-                    || std::is_same<T, ulong3>::value
-                    || std::is_same<T, ulonglong3>::value
-                    || std::is_same<T, ushort3>::value
-                    || std::is_same<T, char4>::value
-                    || std::is_same<T, double4>::value
-                    || std::is_same<T, float4>::value
-                    || std::is_same<T, int4>::value
-                    || std::is_same<T, long4>::value
-                    || std::is_same<T, longlong4>::value
-                    || std::is_same<T, short4>::value
-                    || std::is_same<T, uchar4>::value
-                    || std::is_same<T, uint4>::value
-                    || std::is_same<T, ulong4>::value
-                    || std::is_same<T, ulonglong4>::value
-                    || std::is_same<T, ushort4>::value
-                >
-            {};
-        }
-    }
-    namespace dim
-    {
-        namespace traits
-        {
-          // If you receive '"alpaka::dim::traits::DimType" has already been defined'
-          // then too many operators in the enable_if are used. Split them in two or more structs.
-          // (compiler: gcc 5.3.0)
-            //#############################################################################
-            //! The HIP vectors 1D dimension get trait specialization.
-            template<
-                typename T>
-            struct DimType<
-                T,
-                typename std::enable_if<
-                    std::is_same<T, char1>::value
-                    || std::is_same<T, double1>::value
-                    || std::is_same<T, float1>::value
-                    || std::is_same<T, int1>::value
-                    || std::is_same<T, long1>::value
-                    || std::is_same<T, longlong1>::value
-                    || std::is_same<T, short1>::value
-                >::type>
-            {
-                using type = dim::DimInt<1u>;
-            };
-            template<
-                typename T>
-            struct DimType<
-                T,
-                typename std::enable_if<
-                    std::is_same<T, uchar1>::value
-                    || std::is_same<T, uint1>::value
-                    || std::is_same<T, ulong1>::value
-                    || std::is_same<T, ulonglong1>::value
-                    || std::is_same<T, ushort1>::value
-                >::type>
-            {
-                using type = dim::DimInt<1u>;
-            };
-            //#############################################################################
-            //! The HIP vectors 2D dimension get trait specialization.
-            template<
-                typename T>
-            struct DimType<
-                T,
-                typename std::enable_if<
-                    std::is_same<T, char2>::value
-                    || std::is_same<T, double2>::value
-                    || std::is_same<T, float2>::value
-                    || std::is_same<T, int2>::value
-                    || std::is_same<T, long2>::value
-                    || std::is_same<T, longlong2>::value
-                    || std::is_same<T, short2>::value
-                >::type>
-            {
-                using type = dim::DimInt<2u>;
-            };
-            template<
-                typename T>
-            struct DimType<
-                T,
-                typename std::enable_if<
-                    std::is_same<T, uchar2>::value
-                    || std::is_same<T, uint2>::value
-                    || std::is_same<T, ulong2>::value
-                    || std::is_same<T, ulonglong2>::value
-                    || std::is_same<T, ushort2>::value
-                >::type>
-            {
-                using type = dim::DimInt<2u>;
-            };
-            //#############################################################################
-            //! The HIP vectors 3D dimension get trait specialization.
-            template<
-                typename T>
-            struct DimType<
-                T,
-                typename std::enable_if<
-                    std::is_same<T, char3>::value
-                    || std::is_same<T, dim3>::value
-                    || std::is_same<T, double3>::value
-                    || std::is_same<T, float3>::value
-                    || std::is_same<T, int3>::value
-                    || std::is_same<T, long3>::value
-                    || std::is_same<T, longlong3>::value
-                    || std::is_same<T, short3>::value
-                >::type>
-            {
-                using type = dim::DimInt<3u>;
-            };
-            template<
-                typename T>
-            struct DimType<
-                T,
-                typename std::enable_if<
-                    std::is_same<T, uchar3>::value
-                    || std::is_same<T, uint3>::value
-                    || std::is_same<T, ulong3>::value
-                    || std::is_same<T, ulonglong3>::value
-                    || std::is_same<T, ushort3>::value
-                >::type>
-            {
-                using type = dim::DimInt<3u>;
-            };
-            //#############################################################################
-            //! The HIP vectors 4D dimension get trait specialization.
-            template<
-                typename T>
-            struct DimType<
-                T,
-                typename std::enable_if<
-                    std::is_same<T, char4>::value
-                    || std::is_same<T, double4>::value
-                    || std::is_same<T, float4>::value
-                    || std::is_same<T, int4>::value
-                    || std::is_same<T, long4>::value
-                    || std::is_same<T, longlong4>::value
-                    || std::is_same<T, short4>::value
-                >::type>
-            {
-                using type = dim::DimInt<4u>;
-            };
-            template<
-                typename T>
-            struct DimType<
-                T,
-                typename std::enable_if<
-                    std::is_same<T, uchar4>::value
-                    || std::is_same<T, uint4>::value
-                    || std::is_same<T, ulong4>::value
-                    || std::is_same<T, ulonglong4>::value
-                    || std::is_same<T, ushort4>::value
-                >::type>
+            template<typename T>
+            struct IsHipBuiltInType
+                : std::integral_constant<
+                      bool,
+                      std::is_same<T, char1>::value || std::is_same<T, double1>::value
+                          || std::is_same<T, float1>::value || std::is_same<T, int1>::value
+                          || std::is_same<T, long1>::value || std::is_same<T, longlong1>::value
+                          || std::is_same<T, short1>::value || std::is_same<T, uchar1>::value
+                          || std::is_same<T, uint1>::value || std::is_same<T, ulong1>::value
+                          || std::is_same<T, ulonglong1>::value || std::is_same<T, ushort1>::value
+                          || std::is_same<T, char2>::value || std::is_same<T, double2>::value
+                          || std::is_same<T, float2>::value || std::is_same<T, int2>::value
+                          || std::is_same<T, long2>::value || std::is_same<T, longlong2>::value
+                          || std::is_same<T, short2>::value || std::is_same<T, uchar2>::value
+                          || std::is_same<T, uint2>::value || std::is_same<T, ulong2>::value
+                          || std::is_same<T, ulonglong2>::value || std::is_same<T, ushort2>::value
+                          || std::is_same<T, char3>::value || std::is_same<T, dim3>::value
+                          || std::is_same<T, double3>::value || std::is_same<T, float3>::value
+                          || std::is_same<T, int3>::value || std::is_same<T, long3>::value
+                          || std::is_same<T, longlong3>::value || std::is_same<T, short3>::value
+                          || std::is_same<T, uchar3>::value || std::is_same<T, uint3>::value
+                          || std::is_same<T, ulong3>::value || std::is_same<T, ulonglong3>::value
+                          || std::is_same<T, ushort3>::value || std::is_same<T, char4>::value
+                          || std::is_same<T, double4>::value || std::is_same<T, float4>::value
+                          || std::is_same<T, int4>::value || std::is_same<T, long4>::value
+                          || std::is_same<T, longlong4>::value || std::is_same<T, short4>::value
+                          || std::is_same<T, uchar4>::value || std::is_same<T, uint4>::value
+                          || std::is_same<T, ulong4>::value || std::is_same<T, ulonglong4>::value
+                          || std::is_same<T, ushort4>::value>
             {
-                using type = dim::DimInt<4u>;
             };
-        }
-    }
-    namespace elem
+        } // namespace traits
+    } // namespace hip
+    namespace traits
     {
-        namespace traits
+        // If the compiler reports '"alpaka::traits::DimType" has already been defined',
+        // the enable_if condition has too many operators; split it across two or more structs.
+        // (observed with gcc 5.3.0)
+        //#############################################################################
+        //! The HIP vectors 1D dimension get trait specialization.
+        template<typename T>
+        struct DimType<
+            T,
+            std::enable_if_t<
+                std::is_same<T, char1>::value || std::is_same<T, double1>::value || std::is_same<T, float1>::value
+                || std::is_same<T, int1>::value || std::is_same<T, long1>::value || std::is_same<T, longlong1>::value
+                || std::is_same<T, short1>::value>>
         {
-            //#############################################################################
-            //! The HIP vectors elem type trait specialization.
-            template<
-                typename T>
-            struct ElemType<
-                T,
-                typename std::enable_if<
-                    hip::traits::IsHipBuiltInType<T>::value>::type>
-            {
-                using type = decltype(std::declval<T>().x);
-            };
-        }
-    }
+            using type = DimInt<1u>;
+        };
+        template<typename T>
+        struct DimType<
+            T,
+            std::enable_if_t<
+                std::is_same<T, uchar1>::value || std::is_same<T, uint1>::value || std::is_same<T, ulong1>::value
+                || std::is_same<T, ulonglong1>::value || std::is_same<T, ushort1>::value>>
+        {
+            using type = DimInt<1u>;
+        };
+        //#############################################################################
+        //! The HIP vectors 2D dimension get trait specialization.
+        template<typename T>
+        struct DimType<
+            T,
+            std::enable_if_t<
+                std::is_same<T, char2>::value || std::is_same<T, double2>::value || std::is_same<T, float2>::value
+                || std::is_same<T, int2>::value || std::is_same<T, long2>::value || std::is_same<T, longlong2>::value
+                || std::is_same<T, short2>::value>>
+        {
+            using type = DimInt<2u>;
+        };
+        template<typename T>
+        struct DimType<
+            T,
+            std::enable_if_t<
+                std::is_same<T, uchar2>::value || std::is_same<T, uint2>::value || std::is_same<T, ulong2>::value
+                || std::is_same<T, ulonglong2>::value || std::is_same<T, ushort2>::value>>
+        {
+            using type = DimInt<2u>;
+        };
+        //#############################################################################
+        //! The HIP vectors 3D dimension get trait specialization.
+        template<typename T>
+        struct DimType<
+            T,
+            std::enable_if_t<
+                std::is_same<T, char3>::value || std::is_same<T, dim3>::value || std::is_same<T, double3>::value
+                || std::is_same<T, float3>::value || std::is_same<T, int3>::value || std::is_same<T, long3>::value
+                || std::is_same<T, longlong3>::value || std::is_same<T, short3>::value>>
+        {
+            using type = DimInt<3u>;
+        };
+        template<typename T>
+        struct DimType<
+            T,
+            std::enable_if_t<
+                std::is_same<T, uchar3>::value || std::is_same<T, uint3>::value || std::is_same<T, ulong3>::value
+                || std::is_same<T, ulonglong3>::value || std::is_same<T, ushort3>::value>>
+        {
+            using type = DimInt<3u>;
+        };
+        //#############################################################################
+        //! The HIP vectors 4D dimension get trait specialization.
+        template<typename T>
+        struct DimType<
+            T,
+            std::enable_if_t<
+                std::is_same<T, char4>::value || std::is_same<T, double4>::value || std::is_same<T, float4>::value
+                || std::is_same<T, int4>::value || std::is_same<T, long4>::value || std::is_same<T, longlong4>::value
+                || std::is_same<T, short4>::value>>
+        {
+            using type = DimInt<4u>;
+        };
+        template<typename T>
+        struct DimType<
+            T,
+            std::enable_if_t<
+                std::is_same<T, uchar4>::value || std::is_same<T, uint4>::value || std::is_same<T, ulong4>::value
+                || std::is_same<T, ulonglong4>::value || std::is_same<T, ushort4>::value>>
+        {
+            using type = DimInt<4u>;
+        };
+
+        //#############################################################################
+        //! The HIP vectors elem type trait specialization.
+        template<typename T>
+        struct ElemType<T, std::enable_if_t<hip::traits::IsHipBuiltInType<T>::value>>
+        {
+            using type = decltype(std::declval<T>().x);
+        };
+    } // namespace traits
     namespace extent
     {
         namespace traits
         {
             //#############################################################################
             //! The HIP vectors extent get trait specialization.
-            template<
-                typename TExtent>
+            template<typename TExtent>
             struct GetExtent<
-                dim::DimInt<dim::Dim<TExtent>::value - 1u>,
+                DimInt<Dim<TExtent>::value - 1u>,
                 TExtent,
-                typename std::enable_if<
-                    hip::traits::IsHipBuiltInType<TExtent>::value
-                    && (dim::Dim<TExtent>::value >= 1)>::type>
+                std::enable_if_t<hip::traits::IsHipBuiltInType<TExtent>::value && (Dim<TExtent>::value >= 1)>>
             {
                 ALPAKA_NO_HOST_ACC_WARNING
-                ALPAKA_FN_HOST_ACC static auto getExtent(
-                    TExtent const & extent)
-                -> decltype(extent.x)
+                ALPAKA_FN_HOST_ACC static auto getExtent(TExtent const& extent)
                 {
                     return extent.x;
                 }
             };
             //#############################################################################
             //! The HIP vectors extent get trait specialization.
-            template<
-                typename TExtent>
+            template<typename TExtent>
             struct GetExtent<
-                dim::DimInt<dim::Dim<TExtent>::value - 2u>,
+                DimInt<Dim<TExtent>::value - 2u>,
                 TExtent,
-                typename std::enable_if<
-                    hip::traits::IsHipBuiltInType<TExtent>::value
-                    && (dim::Dim<TExtent>::value >= 2)>::type>
+                std::enable_if_t<hip::traits::IsHipBuiltInType<TExtent>::value && (Dim<TExtent>::value >= 2)>>
             {
                 ALPAKA_NO_HOST_ACC_WARNING
-                ALPAKA_FN_HOST_ACC static auto getExtent(
-                    TExtent const & extent)
-                -> decltype(extent.y)
+                ALPAKA_FN_HOST_ACC static auto getExtent(TExtent const& extent)
                 {
                     return extent.y;
                 }
             };
             //#############################################################################
             //! The HIP vectors extent get trait specialization.
-            template<
-                typename TExtent>
+            template<typename TExtent>
             struct GetExtent<
-                dim::DimInt<dim::Dim<TExtent>::value - 3u>,
+                DimInt<Dim<TExtent>::value - 3u>,
                 TExtent,
-                typename std::enable_if<
-                    hip::traits::IsHipBuiltInType<TExtent>::value
-                    && (dim::Dim<TExtent>::value >= 3)>::type>
+                std::enable_if_t<hip::traits::IsHipBuiltInType<TExtent>::value && (Dim<TExtent>::value >= 3)>>
             {
                 ALPAKA_NO_HOST_ACC_WARNING
-                ALPAKA_FN_HOST_ACC static auto getExtent(
-                    TExtent const & extent)
-                -> decltype(extent.z)
+                ALPAKA_FN_HOST_ACC static auto getExtent(TExtent const& extent)
                 {
                     return extent.z;
                 }
             };
             //#############################################################################
             //! The HIP vectors extent get trait specialization.
-            template<
-                typename TExtent>
+            template<typename TExtent>
             struct GetExtent<
-                dim::DimInt<dim::Dim<TExtent>::value - 4u>,
+                DimInt<Dim<TExtent>::value - 4u>,
                 TExtent,
-                typename std::enable_if<
-                    hip::traits::IsHipBuiltInType<TExtent>::value
-                    && (dim::Dim<TExtent>::value >= 4)>::type>
+                std::enable_if_t<hip::traits::IsHipBuiltInType<TExtent>::value && (Dim<TExtent>::value >= 4)>>
             {
                 ALPAKA_NO_HOST_ACC_WARNING
-                ALPAKA_FN_HOST_ACC static auto getExtent(
-                    TExtent const & extent)
-                -> decltype(extent.w)
+                ALPAKA_FN_HOST_ACC static auto getExtent(TExtent const& extent)
                 {
                     return extent.w;
                 }
             };
             //#############################################################################
             //! The HIP vectors extent set trait specialization.
-            template<
-                typename TExtent,
-                typename TExtentVal>
+            template<typename TExtent, typename TExtentVal>
             struct SetExtent<
-                dim::DimInt<dim::Dim<TExtent>::value - 1u>,
+                DimInt<Dim<TExtent>::value - 1u>,
                 TExtent,
                 TExtentVal,
-                typename std::enable_if<
-                    hip::traits::IsHipBuiltInType<TExtent>::value
-                    && (dim::Dim<TExtent>::value >= 1)>::type>
+                std::enable_if_t<hip::traits::IsHipBuiltInType<TExtent>::value && (Dim<TExtent>::value >= 1)>>
             {
                 ALPAKA_NO_HOST_ACC_WARNING
-                ALPAKA_FN_HOST_ACC static auto setExtent(
-                    TExtent const & extent,
-                    TExtentVal const & extentVal)
-                -> void
+                ALPAKA_FN_HOST_ACC static auto setExtent(TExtent& extent, TExtentVal const& extentVal) -> void
                 {
                     extent.x = extentVal;
                 }
             };
             //#############################################################################
             //! The HIP vectors extent set trait specialization.
-            template<
-                typename TExtent,
-                typename TExtentVal>
+            template<typename TExtent, typename TExtentVal>
             struct SetExtent<
-                dim::DimInt<dim::Dim<TExtent>::value - 2u>,
+                DimInt<Dim<TExtent>::value - 2u>,
                 TExtent,
                 TExtentVal,
-                typename std::enable_if<
-                    hip::traits::IsHipBuiltInType<TExtent>::value
-                    && (dim::Dim<TExtent>::value >= 2)>::type>
+                std::enable_if_t<hip::traits::IsHipBuiltInType<TExtent>::value && (Dim<TExtent>::value >= 2)>>
             {
                 ALPAKA_NO_HOST_ACC_WARNING
-                ALPAKA_FN_HOST_ACC static auto setExtent(
-                    TExtent const & extent,
-                    TExtentVal const & extentVal)
-                -> void
+                ALPAKA_FN_HOST_ACC static auto setExtent(TExtent& extent, TExtentVal const& extentVal) -> void
                 {
                     extent.y = extentVal;
                 }
             };
             //#############################################################################
             //! The HIP vectors extent set trait specialization.
-            template<
-                typename TExtent,
-                typename TExtentVal>
+            template<typename TExtent, typename TExtentVal>
             struct SetExtent<
-                dim::DimInt<dim::Dim<TExtent>::value - 3u>,
+                DimInt<Dim<TExtent>::value - 3u>,
                 TExtent,
                 TExtentVal,
-                typename std::enable_if<
-                    hip::traits::IsHipBuiltInType<TExtent>::value
-                    && (dim::Dim<TExtent>::value >= 3)>::type>
+                std::enable_if_t<hip::traits::IsHipBuiltInType<TExtent>::value && (Dim<TExtent>::value >= 3)>>
             {
                 ALPAKA_NO_HOST_ACC_WARNING
-                ALPAKA_FN_HOST_ACC static auto setExtent(
-                    TExtent const & extent,
-                    TExtentVal const & extentVal)
-                -> void
+                ALPAKA_FN_HOST_ACC static auto setExtent(TExtent& extent, TExtentVal const& extentVal) -> void
                 {
                     extent.z = extentVal;
                 }
             };
             //#############################################################################
             //! The HIP vectors extent set trait specialization.
-            template<
-                typename TExtent,
-                typename TExtentVal>
+            template<typename TExtent, typename TExtentVal>
             struct SetExtent<
-                dim::DimInt<dim::Dim<TExtent>::value - 4u>,
+                DimInt<Dim<TExtent>::value - 4u>,
                 TExtent,
                 TExtentVal,
-                typename std::enable_if<
-                    hip::traits::IsHipBuiltInType<TExtent>::value
-                    && (dim::Dim<TExtent>::value >= 4)>::type>
+                std::enable_if_t<hip::traits::IsHipBuiltInType<TExtent>::value && (Dim<TExtent>::value >= 4)>>
             {
                 ALPAKA_NO_HOST_ACC_WARNING
-                ALPAKA_FN_HOST_ACC static auto setExtent(
-                    TExtent const & extent,
-                    TExtentVal const & extentVal)
-                -> void
+                ALPAKA_FN_HOST_ACC static auto setExtent(TExtent& extent, TExtentVal const& extentVal) -> void
                 {
                     extent.w = extentVal;
                 }
             };
-        }
-    }
-    namespace offset
+        } // namespace traits
+    } // namespace extent
+    namespace traits
     {
-        namespace traits
+        //#############################################################################
+        //! The HIP vectors offset get trait specialization.
+        template<typename TOffsets>
+        struct GetOffset<
+            DimInt<Dim<TOffsets>::value - 1u>,
+            TOffsets,
+            std::enable_if_t<hip::traits::IsHipBuiltInType<TOffsets>::value && (Dim<TOffsets>::value >= 1)>>
         {
-            //#############################################################################
-            //! The HIP vectors offset get trait specialization.
-            template<
-                typename TOffsets>
-            struct GetOffset<
-                dim::DimInt<dim::Dim<TOffsets>::value - 1u>,
-                TOffsets,
-                typename std::enable_if<
-                    hip::traits::IsHipBuiltInType<TOffsets>::value
-                    && (dim::Dim<TOffsets>::value >= 1)>::type>
+            ALPAKA_NO_HOST_ACC_WARNING
+            ALPAKA_FN_HOST_ACC static auto getOffset(TOffsets const& offsets)
             {
-                ALPAKA_NO_HOST_ACC_WARNING
-                ALPAKA_FN_HOST_ACC static auto getOffset(
-                    TOffsets const & offsets)
-                -> decltype(offsets.x)
-                {
-                    return offsets.x;
-                }
-            };
-            //#############################################################################
-            //! The HIP vectors offset get trait specialization.
-            template<
-                typename TOffsets>
-            struct GetOffset<
-                dim::DimInt<dim::Dim<TOffsets>::value - 2u>,
-                TOffsets,
-                typename std::enable_if<
-                    hip::traits::IsHipBuiltInType<TOffsets>::value
-                    && (dim::Dim<TOffsets>::value >= 2)>::type>
-            {
-                ALPAKA_NO_HOST_ACC_WARNING
-                ALPAKA_FN_HOST_ACC static auto getOffset(
-                    TOffsets const & offsets)
-                -> decltype(offsets.y)
-                {
-                    return offsets.y;
-                }
-            };
-            //#############################################################################
-            //! The HIP vectors offset get trait specialization.
-            template<
-                typename TOffsets>
-            struct GetOffset<
-                dim::DimInt<dim::Dim<TOffsets>::value - 3u>,
-                TOffsets,
-                typename std::enable_if<
-                    hip::traits::IsHipBuiltInType<TOffsets>::value
-                    && (dim::Dim<TOffsets>::value >= 3)>::type>
+                return offsets.x;
+            }
+        };
+        //#############################################################################
+        //! The HIP vectors offset get trait specialization.
+        template<typename TOffsets>
+        struct GetOffset<
+            DimInt<Dim<TOffsets>::value - 2u>,
+            TOffsets,
+            std::enable_if_t<hip::traits::IsHipBuiltInType<TOffsets>::value && (Dim<TOffsets>::value >= 2)>>
+        {
+            ALPAKA_NO_HOST_ACC_WARNING
+            ALPAKA_FN_HOST_ACC static auto getOffset(TOffsets const& offsets)
             {
-                ALPAKA_NO_HOST_ACC_WARNING
-                ALPAKA_FN_HOST_ACC static auto getOffset(
-                    TOffsets const & offsets)
-                -> decltype(offsets.z)
-                {
-                    return offsets.z;
-                }
-            };
-            //#############################################################################
-            //! The HIP vectors offset get trait specialization.
-            template<
-                typename TOffsets>
-            struct GetOffset<
-                dim::DimInt<dim::Dim<TOffsets>::value - 4u>,
-                TOffsets,
-                typename std::enable_if<
-                    hip::traits::IsHipBuiltInType<TOffsets>::value
-                    && (dim::Dim<TOffsets>::value >= 4)>::type>
+                return offsets.y;
+            }
+        };
+        //#############################################################################
+        //! The HIP vectors offset get trait specialization.
+        template<typename TOffsets>
+        struct GetOffset<
+            DimInt<Dim<TOffsets>::value - 3u>,
+            TOffsets,
+            std::enable_if_t<hip::traits::IsHipBuiltInType<TOffsets>::value && (Dim<TOffsets>::value >= 3)>>
+        {
+            ALPAKA_NO_HOST_ACC_WARNING
+            ALPAKA_FN_HOST_ACC static auto getOffset(TOffsets const& offsets)
             {
-                ALPAKA_NO_HOST_ACC_WARNING
-                ALPAKA_FN_HOST_ACC static auto getOffset(
-                    TOffsets const & offsets)
-                -> decltype(offsets.w)
-                {
-                    return offsets.w;
-                }
-            };
-            //#############################################################################
-            //! The HIP vectors offset set trait specialization.
-            template<
-                typename TOffsets,
-                typename TOffset>
-            struct SetOffset<
-                dim::DimInt<dim::Dim<TOffsets>::value - 1u>,
-                TOffsets,
-                TOffset,
-                typename std::enable_if<
-                    hip::traits::IsHipBuiltInType<TOffsets>::value
-                    && (dim::Dim<TOffsets>::value >= 1)>::type>
+                return offsets.z;
+            }
+        };
+        //#############################################################################
+        //! The HIP vectors offset get trait specialization.
+        template<typename TOffsets>
+        struct GetOffset<
+            DimInt<Dim<TOffsets>::value - 4u>,
+            TOffsets,
+            std::enable_if_t<hip::traits::IsHipBuiltInType<TOffsets>::value && (Dim<TOffsets>::value >= 4)>>
+        {
+            ALPAKA_NO_HOST_ACC_WARNING
+            ALPAKA_FN_HOST_ACC static auto getOffset(TOffsets const& offsets)
             {
-                ALPAKA_NO_HOST_ACC_WARNING
-                ALPAKA_FN_HOST_ACC static auto setOffset(
-                    TOffsets const & offsets,
-                    TOffset const & offset)
-                -> void
-                {
-                    offsets.x = offset;
-                }
-            };
-            //#############################################################################
-            //! The HIP vectors offset set trait specialization.
-            template<
-                typename TOffsets,
-                typename TOffset>
-            struct SetOffset<
-                dim::DimInt<dim::Dim<TOffsets>::value - 2u>,
-                TOffsets,
-                TOffset,
-                typename std::enable_if<
-                    hip::traits::IsHipBuiltInType<TOffsets>::value
-                    && (dim::Dim<TOffsets>::value >= 2)>::type>
+                return offsets.w;
+            }
+        };
+        //#############################################################################
+        //! The HIP vectors offset set trait specialization.
+        template<typename TOffsets, typename TOffset>
+        struct SetOffset<
+            DimInt<Dim<TOffsets>::value - 1u>,
+            TOffsets,
+            TOffset,
+            std::enable_if_t<hip::traits::IsHipBuiltInType<TOffsets>::value && (Dim<TOffsets>::value >= 1)>>
+        {
+            ALPAKA_NO_HOST_ACC_WARNING
+            ALPAKA_FN_HOST_ACC static auto setOffset(TOffsets const& offsets, TOffset const& offset) -> void
             {
-                ALPAKA_NO_HOST_ACC_WARNING
-                ALPAKA_FN_HOST_ACC static auto setOffset(
-                    TOffsets const & offsets,
-                    TOffset const & offset)
-                -> void
-                {
-                    offsets.y = offset;
-                }
-            };
-            //#############################################################################
-            //! The HIP vectors offset set trait specialization.
-            template<
-                typename TOffsets,
-                typename TOffset>
-            struct SetOffset<
-                dim::DimInt<dim::Dim<TOffsets>::value - 3u>,
-                TOffsets,
-                TOffset,
-                typename std::enable_if<
-                    hip::traits::IsHipBuiltInType<TOffsets>::value
-                    && (dim::Dim<TOffsets>::value >= 3)>::type>
+                offsets.x = offset;
+            }
+        };
+        //#############################################################################
+        //! The HIP vectors offset set trait specialization.
+        template<typename TOffsets, typename TOffset>
+        struct SetOffset<
+            DimInt<Dim<TOffsets>::value - 2u>,
+            TOffsets,
+            TOffset,
+            std::enable_if_t<hip::traits::IsHipBuiltInType<TOffsets>::value && (Dim<TOffsets>::value >= 2)>>
+        {
+            ALPAKA_NO_HOST_ACC_WARNING
+            ALPAKA_FN_HOST_ACC static auto setOffset(TOffsets const& offsets, TOffset const& offset) -> void
             {
-                ALPAKA_NO_HOST_ACC_WARNING
-                ALPAKA_FN_HOST_ACC static auto setOffset(
-                    TOffsets const & offsets,
-                    TOffset const & offset)
-                -> void
-                {
-                    offsets.z = offset;
-                }
-            };
-            //#############################################################################
-            //! The HIP vectors offset set trait specialization.
-            template<
-                typename TOffsets,
-                typename TOffset>
-            struct SetOffset<
-                dim::DimInt<dim::Dim<TOffsets>::value - 4u>,
-                TOffsets,
-                TOffset,
-                typename std::enable_if<
-                    hip::traits::IsHipBuiltInType<TOffsets>::value
-                    && (dim::Dim<TOffsets>::value >= 4)>::type>
+                offsets.y = offset;
+            }
+        };
+        //#############################################################################
+        //! The HIP vectors offset set trait specialization.
+        template<typename TOffsets, typename TOffset>
+        struct SetOffset<
+            DimInt<Dim<TOffsets>::value - 3u>,
+            TOffsets,
+            TOffset,
+            std::enable_if_t<hip::traits::IsHipBuiltInType<TOffsets>::value && (Dim<TOffsets>::value >= 3)>>
+        {
+            ALPAKA_NO_HOST_ACC_WARNING
+            ALPAKA_FN_HOST_ACC static auto setOffset(TOffsets const& offsets, TOffset const& offset) -> void
             {
-                ALPAKA_NO_HOST_ACC_WARNING
-                ALPAKA_FN_HOST_ACC static auto setOffset(
-                    TOffsets const & offsets,
-                    TOffset const & offset)
-                -> void
-                {
-                    offsets.w = offset;
-                }
-            };
-        }
-    }
-    namespace idx
-    {
-        namespace traits
+                offsets.z = offset;
+            }
+        };
+        //#############################################################################
+        //! The HIP vectors offset set trait specialization.
+        template<typename TOffsets, typename TOffset>
+        struct SetOffset<
+            DimInt<Dim<TOffsets>::value - 4u>,
+            TOffsets,
+            TOffset,
+            std::enable_if_t<hip::traits::IsHipBuiltInType<TOffsets>::value && (Dim<TOffsets>::value >= 4)>>
         {
-            //#############################################################################
-            //! The HIP vectors idx type trait specialization.
-            template<
-                typename TIdx>
-            struct IdxType<
-                TIdx,
-                typename std::enable_if<
-                    hip::traits::IsHipBuiltInType<TIdx>::value>::type>
+            ALPAKA_NO_HOST_ACC_WARNING
+            ALPAKA_FN_HOST_ACC static auto setOffset(TOffsets const& offsets, TOffset const& offset) -> void
             {
-                using type = std::size_t;
-            };
-        }
-    }
-}
+                offsets.w = offset;
+            }
+        };
+
+        //#############################################################################
+        //! The HIP vectors idx type trait specialization.
+        template<typename TIdx>
+        struct IdxType<TIdx, std::enable_if_t<hip::traits::IsHipBuiltInType<TIdx>::value>>
+        {
+            using type = std::size_t;
+        };
+    } // namespace traits
+} // namespace alpaka
+
+#    include <alpaka/core/UniformCudaHip.hpp>
 
 #endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/core/Omp5.hpp b/thirdParty/cupla/alpaka/include/alpaka/core/Omp5.hpp
new file mode 100644
index 0000000000..8b0354a259
--- /dev/null
+++ b/thirdParty/cupla/alpaka/include/alpaka/core/Omp5.hpp
@@ -0,0 +1,63 @@
+/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Matthias Werner, René Widera
+ *
+ * This file is part of Alpaka.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#pragma once
+
+#ifdef ALPAKA_ACC_ANY_BT_OMP5_ENABLED
+
+#    if _OPENMP < 201307
+#        error If ALPAKA_ACC_ANY_BT_OMP5_ENABLED is set, the compiler has to support OpenMP 4.0 or higher!
+#    endif
+
+#    include <alpaka/core/Common.hpp>
+
+#    include <omp.h>
+
+#    include <cstddef>
+#    include <iostream>
+#    include <sstream>
+#    include <stdexcept>
+#    include <string>
+#    include <type_traits>
+#    include <utility>
+
+namespace alpaka
+{
+    namespace omp5
+    {
+        namespace detail
+        {
+            //-----------------------------------------------------------------------------
+            //! OMP5 runtime API error checking with log and exception: throws std::runtime_error if error != 0.
+            ALPAKA_FN_HOST inline auto omp5Check(int const& error, char const* desc, char const* file, int const& line)
+                -> void
+            {
+                if(error != 0)
+                {
+                    std::ostringstream os;
+                    os << std::string(file) << "(" << std::to_string(line) << ") " << std::string(desc) << " : '"
+                       << error << "': '"
+                       << "TODO"
+                       << "'!";
+#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
+                    std::cerr << os.str() << std::endl;
+#    endif
+                    ALPAKA_DEBUG_BREAK;
+                    throw std::runtime_error(os.str());
+                }
+            }
+        } // namespace detail
+    } // namespace omp5
+} // namespace alpaka
+
+//-----------------------------------------------------------------------------
+//! OMP5 runtime error checking with log and exception.
+#    define ALPAKA_OMP5_CHECK(cmd) ::alpaka::omp5::detail::omp5Check(cmd, #    cmd, __FILE__, __LINE__)
+
+#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/core/OmpSchedule.hpp b/thirdParty/cupla/alpaka/include/alpaka/core/OmpSchedule.hpp
new file mode 100644
index 0000000000..07828d2a6e
--- /dev/null
+++ b/thirdParty/cupla/alpaka/include/alpaka/core/OmpSchedule.hpp
@@ -0,0 +1,101 @@
+/* Copyright 2020 Sergei Bastrakov
+ *
+ * This file is part of Alpaka.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#pragma once
+
+#include <alpaka/core/Common.hpp>
+
+#ifdef _OPENMP
+#    include <omp.h>
+#endif
+
+#include <cstdint>
+
+
+namespace alpaka
+{
+    namespace omp
+    {
+        //#############################################################################
+        //! Representation of OpenMP schedule information: kind and chunk size. This class can be used regardless of
+        //! whether OpenMP is enabled.
+        struct Schedule
+        {
+            //! Special schedule setting which does not change the internal control variables of OpenMP
+            constexpr static std::uint32_t NoSchedule = 0u;
+            //! Integers corresponding to the mandatory OpenMP omp_sched_t enum values as of version 5.0.
+            constexpr static std::uint32_t Static = 1u;
+            constexpr static std::uint32_t Dynamic = 2u;
+            constexpr static std::uint32_t Guided = 3u;
+            constexpr static std::uint32_t Auto = 4u;
+            //! Each schedule value can be combined with monotonic using + or |.
+            constexpr static std::uint32_t Monotonic = 0x80000000u;
+
+            //! Schedule kind.
+            //!
+            //! We cannot simply use type omp_sched_t since this struct is agnostic of OpenMP. We also cannot create an
+            //! own mirror enum, since OpenMP implementations are allowed to extend the range of values beyond the
+            //! standard ones defined above. So we have to accept and store any uint32_t value, and for non-standard
+            //! values a user has to ensure the underlying implementation supports it.
+            std::uint32_t kind;
+
+            //! Chunk size. Same as in OpenMP, value 0 corresponds to default chunk size. Using int and not a
+            //! fixed-width type to match OpenMP API.
+            int chunkSize;
+
+            //! The provided value myKind has to be supported by the underlying OpenMP implementation.
+            //! It does not have to be one of the constants defined above.
+            //! A default-constructed schedule does not affect internal control variables of OpenMP.
+            //! The constructor is constexpr to simplify creation of static constexpr ompSchedule in user code.
+            ALPAKA_FN_HOST constexpr Schedule(std::uint32_t myKind = NoSchedule, int myChunkSize = 0)
+                : kind(myKind)
+                , chunkSize(myChunkSize)
+            {
+            }
+        };
+
+        //-----------------------------------------------------------------------------
+        //! Get the OpenMP schedule that is applied when the runtime schedule is used.
+        //!
+        //! For OpenMP >= 3.0 returns the value of the internal control variable run-sched-var.
+        //! Without OpenMP or with OpenMP < 3.0, returns the default schedule.
+        //!
+        //! \return Schedule object.
+        ALPAKA_FN_HOST inline auto getSchedule()
+        {
+            // Getting a runtime schedule requires OpenMP 3.0 or newer
+#if defined _OPENMP && _OPENMP >= 200805
+            omp_sched_t ompKind;
+            int chunkSize = 0;
+            omp_get_schedule(&ompKind, &chunkSize);
+            return Schedule{static_cast<std::uint32_t>(ompKind), chunkSize};
+#else
+            return Schedule{};
+#endif
+        }
+
+        //-----------------------------------------------------------------------------
+        //! Set the OpenMP schedule that is applied when the runtime schedule is used for future parallel regions.
+        //!
+        //! For OpenMP >= 3.0 sets the value of the internal control variable run-sched-var according to the given
+        //! schedule. Without OpenMP or with OpenMP < 3.0, does nothing.
+        //!
+        //! Note that calling from inside a parallel region does not have an immediate effect.
+        ALPAKA_FN_HOST inline void setSchedule(Schedule schedule)
+        {
+            if(schedule.kind != Schedule::NoSchedule)
+            {
+#if defined _OPENMP && _OPENMP >= 200805
+                omp_set_schedule(static_cast<omp_sched_t>(schedule.kind), schedule.chunkSize);
+#endif
+            }
+        }
+
+    } // namespace omp
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/include/alpaka/core/Positioning.hpp b/thirdParty/cupla/alpaka/include/alpaka/core/Positioning.hpp
index 4147f8ccb1..aaad6ec886 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/core/Positioning.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/core/Positioning.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Benjamin Worpitz, René Widera
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -12,15 +12,21 @@
 namespace alpaka
 {
     //#############################################################################
-    //! Defines the parallelism hierarchy levels of Alpaka
+    //! Defines the parallelism hierarchy levels of alpaka
     namespace hierarchy
     {
-        struct Grids{};
+        struct Grids
+        {
+        };
 
-        struct Blocks{};
+        struct Blocks
+        {
+        };
 
-        struct Threads{};
-    }
+        struct Threads
+        {
+        };
+    } // namespace hierarchy
     //-----------------------------------------------------------------------------
     //! Defines the origins available for getting extent and indices of kernel executions.
     namespace origin
@@ -34,7 +40,7 @@ namespace alpaka
         //#############################################################################
         //! This type is used to get the extents relative to the thread.
         struct Thread;
-    }
+    } // namespace origin
     //-----------------------------------------------------------------------------
     //! Defines the units available for getting extent and indices of kernel executions.
     namespace unit
@@ -48,8 +54,8 @@ namespace alpaka
         //#############################################################################
         //! This type is used to get the extents/indices in units of elements.
         struct Elems;
-    }
+    } // namespace unit
 
     using namespace origin;
     using namespace unit;
-}
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/include/alpaka/core/UniformCudaHip.hpp b/thirdParty/cupla/alpaka/include/alpaka/core/UniformCudaHip.hpp
new file mode 100644
index 0000000000..33efadbc0f
--- /dev/null
+++ b/thirdParty/cupla/alpaka/include/alpaka/core/UniformCudaHip.hpp
@@ -0,0 +1,141 @@
+/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Matthias Werner, René Widera
+ *
+ * This file is part of alpaka.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#pragma once
+
+#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
+
+#    include <alpaka/core/BoostPredef.hpp>
+
+
+#    if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && !BOOST_LANG_CUDA
+#        error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
+#    endif
+
+#    if defined(ALPAKA_ACC_GPU_HIP_ENABLED) && !BOOST_LANG_HIP
+#        error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
+#    endif
+
+// Backend specific includes.
+#    if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
+#        include <alpaka/core/Cuda.hpp>
+#    else
+#        include <alpaka/core/Hip.hpp>
+#    endif
+
+#    include <array>
+#    include <stdexcept>
+#    include <string>
+#    include <type_traits>
+
+namespace alpaka
+{
+    namespace uniform_cuda_hip
+    {
+        namespace detail
+        {
+#    if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
+            using Error_t = cudaError;
+#    else
+            using Error_t = hipError_t;
+#    endif
+            //-----------------------------------------------------------------------------
+            //! CUDA/HIP runtime API error checking with log and exception.
+            ALPAKA_FN_HOST inline auto rtCheck(
+                Error_t const& error,
+                char const* desc,
+                char const* file,
+                int const& line) -> void
+            {
+                if(error != ALPAKA_API_PREFIX(Success))
+                {
+                    std::string const sError(
+                        std::string(file) + "(" + std::to_string(line) + ") " + std::string(desc) + " : '"
+                        + ALPAKA_API_PREFIX(GetErrorName)(error) + "': '"
+                        + std::string(ALPAKA_API_PREFIX(GetErrorString)(error)) + "'!");
+#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
+                    std::cerr << sError << std::endl;
+#    endif
+                    ALPAKA_DEBUG_BREAK;
+                    // reset the last error to allow user side error handling
+                    ALPAKA_API_PREFIX(GetLastError)();
+                    throw std::runtime_error(sError);
+                }
+            }
+            //-----------------------------------------------------------------------------
+            //! CUDA/HIP runtime API error checking with log and exception, ignoring specific error values
+            // NOTE: All ignored errors have to be convertible to Error_t.
+            template<typename... TErrors>
+            ALPAKA_FN_HOST auto rtCheckIgnore(
+                Error_t const& error,
+                char const* cmd,
+                char const* file,
+                int const& line,
+                TErrors&&... ignoredErrorCodes) -> void
+            {
+                if(error != ALPAKA_API_PREFIX(Success))
+                {
+                    std::array<Error_t, sizeof...(ignoredErrorCodes)> const aIgnoredErrorCodes{ignoredErrorCodes...};
+
+                    // If the error code is not one of the ignored ones.
+                    if(std::find(aIgnoredErrorCodes.cbegin(), aIgnoredErrorCodes.cend(), error)
+                       == aIgnoredErrorCodes.cend())
+                    {
+                        rtCheck(error, ("'" + std::string(cmd) + "' returned error ").c_str(), file, line);
+                    }
+                    else
+                    {
+                        // reset the last error to avoid propagation to the next CUDA/HIP API call
+                        ALPAKA_API_PREFIX(GetLastError)();
+                    }
+                }
+            }
+            //-----------------------------------------------------------------------------
+            //! CUDA/HIP runtime API last error checking with log and exception.
+            ALPAKA_FN_HOST inline auto rtCheckLastError(char const* desc, char const* file, int const& line) -> void
+            {
+                Error_t const error(ALPAKA_API_PREFIX(GetLastError)());
+                rtCheck(error, desc, file, line);
+            }
+        } // namespace detail
+    } // namespace uniform_cuda_hip
+} // namespace alpaka
+
+#    if BOOST_COMP_MSVC || defined(BOOST_COMP_MSVC_EMULATED)
+  //-----------------------------------------------------------------------------
+//! CUDA/HIP runtime error checking with log and exception, ignoring specific error values (expands to two statements)
+#        define ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK_IGNORE(cmd, ...)                                                     \
+            ::alpaka::uniform_cuda_hip::detail::rtCheckLastError(                                                     \
+                "'" #cmd "' A previous API call (not this one) set the error ",                                       \
+                __FILE__,                                                                                             \
+                __LINE__);                                                                                            \
+            ::alpaka::uniform_cuda_hip::detail::rtCheckIgnore(cmd, #cmd, __FILE__, __LINE__, __VA_ARGS__)
+#    else
+#        if BOOST_COMP_CLANG
+#            pragma clang diagnostic push
+#            pragma clang diagnostic ignored "-Wgnu-zero-variadic-macro-arguments"
+#        endif
+  //-----------------------------------------------------------------------------
+//! CUDA/HIP runtime error checking with log and exception, ignoring specific error values (expands to two statements)
+#        define ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK_IGNORE(cmd, ...)                                                     \
+            ::alpaka::uniform_cuda_hip::detail::rtCheckLastError(                                                     \
+                "'" #cmd "' A previous API call (not this one) set the error ",                                       \
+                __FILE__,                                                                                             \
+                __LINE__);                                                                                            \
+            ::alpaka::uniform_cuda_hip::detail::rtCheckIgnore(cmd, #cmd, __FILE__, __LINE__, ##__VA_ARGS__)
+#        if BOOST_COMP_CLANG
+#            pragma clang diagnostic pop
+#        endif
+#    endif
+
+//-----------------------------------------------------------------------------
+//! CUDA/HIP runtime error checking with log and exception.
+#    define ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(cmd) ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK_IGNORE(cmd)
+
+#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/core/Unroll.hpp b/thirdParty/cupla/alpaka/include/alpaka/core/Unroll.hpp
index 829be5ca7f..aa8dfcadd7 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/core/Unroll.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/core/Unroll.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Benjamin Worpitz
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -19,19 +19,19 @@
 //!  for(...){...}`
 // \TODO: Implement for other compilers.
 #if BOOST_ARCH_PTX
-    #if BOOST_COMP_MSVC
-        #define ALPAKA_UNROLL(...) __pragma(unroll __VA_ARGS__)
-    #else
-        #define ALPAKA_UNROLL_STRINGIFY(x) #x
-        #define ALPAKA_UNROLL(...)  _Pragma(ALPAKA_UNROLL_STRINGIFY(unroll __VA_ARGS__))
-    #endif
+#    if BOOST_COMP_MSVC || defined(BOOST_COMP_MSVC_EMULATED)
+#        define ALPAKA_UNROLL(...) __pragma(unroll __VA_ARGS__)
+#    else
+#        define ALPAKA_UNROLL_STRINGIFY(x) #        x
+#        define ALPAKA_UNROLL(...) _Pragma(ALPAKA_UNROLL_STRINGIFY(unroll __VA_ARGS__))
+#    endif
 #else
-    #if BOOST_COMP_INTEL || BOOST_COMP_IBM || BOOST_COMP_SUNPRO || BOOST_COMP_HPACC
-        #define ALPAKA_UNROLL_STRINGIFY(x) #x
-        #define ALPAKA_UNROLL(...)  _Pragma(ALPAKA_UNROLL_STRINGIFY(unroll(__VA_ARGS__)))
-    #elif BOOST_COMP_PGI
-        #define ALPAKA_UNROLL(...)  _Pragma("unroll")
-    #else
-        #define ALPAKA_UNROLL(...)
-    #endif
+#    if BOOST_COMP_INTEL || BOOST_COMP_IBM || BOOST_COMP_SUNPRO || BOOST_COMP_HPACC
+#        define ALPAKA_UNROLL_STRINGIFY(x) #        x
+#        define ALPAKA_UNROLL(...) _Pragma(ALPAKA_UNROLL_STRINGIFY(unroll(__VA_ARGS__)))
+#    elif BOOST_COMP_PGI
+#        define ALPAKA_UNROLL(...) _Pragma("unroll")
+#    else
+#        define ALPAKA_UNROLL(...)
+#    endif
 #endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/core/Unused.hpp b/thirdParty/cupla/alpaka/include/alpaka/core/Unused.hpp
index 22fb5495d6..db8fd6fefa 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/core/Unused.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/core/Unused.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Axel Huebl, Benjamin Worpitz
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -11,27 +11,18 @@
 
 #include <alpaka/core/Common.hpp>
 
-#include <boost/config.hpp>
-
 namespace alpaka
 {
     ALPAKA_NO_HOST_ACC_WARNING
-    template< typename... Ts >
-    BOOST_FORCEINLINE
-    BOOST_CXX14_CONSTEXPR
-    ALPAKA_FN_HOST_ACC
-    void
-    ignore_unused( Ts const& ... )
-    {}
+    template<typename... Ts>
+    ALPAKA_FN_INLINE constexpr ALPAKA_FN_HOST_ACC void ignore_unused(Ts const&...)
+    {
+    }
 
     ALPAKA_NO_HOST_ACC_WARNING
-    template< typename... Ts >
-    BOOST_FORCEINLINE
-    BOOST_CXX14_CONSTEXPR
-    ALPAKA_FN_HOST_ACC
-    void
-    ignore_unused()
-    {}
+    template<typename... Ts>
+    ALPAKA_FN_INLINE constexpr ALPAKA_FN_HOST_ACC void ignore_unused()
+    {
+    }
 
 } // namespace alpaka
-
diff --git a/thirdParty/cupla/alpaka/include/alpaka/core/Utility.hpp b/thirdParty/cupla/alpaka/include/alpaka/core/Utility.hpp
index e312fd9163..b717a0263d 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/core/Utility.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/core/Utility.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Benjamin Worpitz, René Widera
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -11,9 +11,9 @@
 #include <alpaka/core/Common.hpp>
 
 #if BOOST_LANG_CUDA && BOOST_COMP_CLANG_CUDA || BOOST_COMP_HIP
-#   include <type_traits>
+#    include <type_traits>
 #else
-#   include <utility>
+#    include <utility>
 #endif
 
 namespace alpaka
@@ -28,12 +28,10 @@ namespace alpaka
         // This function can be used only within std::decltype().
         //-----------------------------------------------------------------------------
 #if BOOST_LANG_CUDA && BOOST_COMP_CLANG_CUDA || BOOST_COMP_HIP
-        template< class T >
-        ALPAKA_FN_HOST_ACC
-        typename std::add_rvalue_reference<T>::type
-        declval();
+        template<class T>
+        ALPAKA_FN_HOST_ACC std::add_rvalue_reference_t<T> declval();
 #else
         using std::declval;
 #endif
-    }
-}
+    } // namespace core
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/include/alpaka/core/Vectorize.hpp b/thirdParty/cupla/alpaka/include/alpaka/core/Vectorize.hpp
index a90ef0a4f5..b4d06c6a04 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/core/Vectorize.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/core/Vectorize.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Benjamin Worpitz
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -50,13 +50,12 @@ namespace alpaka
 #else
                 16u
 #endif
-            ;
+                ;
 
             //-----------------------------------------------------------------------------
             // Number of elements of the given type that can be processed in parallel in a vector register.
             // By default there is no vectorization.
-            template<
-                typename TElem>
+            template<typename TElem>
             struct GetVectorizationSizeElems
             {
                 static constexpr std::size_t value = 1u;
@@ -65,320 +64,310 @@ namespace alpaka
             //-----------------------------------------------------------------------------
             // Number of elements of the given type that can be processed in parallel in a vector register.
             template<>
-            struct GetVectorizationSizeElems<
-                double>
+            struct GetVectorizationSizeElems<double>
             {
                 static constexpr std::size_t value =
 #if defined(__AVX512F__) || defined(__MIC__)
-                // addition (AVX512F,KNC): vaddpd / _mm512_add_pd
-                // subtraction (AVX512F,KNC): vsubpd / _mm512_sub_pd
-                // multiplication (AVX512F,KNC): vmulpd / _mm512_mul_pd
-                8u;
+                    // addition (AVX512F,KNC): vaddpd / _mm512_add_pd
+                    // subtraction (AVX512F,KNC): vsubpd / _mm512_sub_pd
+                    // multiplication (AVX512F,KNC): vmulpd / _mm512_mul_pd
+                    8u;
 #elif defined(__AVX__)
-                // addition (AVX): vaddpd / _mm256_add_pd
-                // subtraction (AVX): vsubpd / _mm256_sub_pd
-                // multiplication (AVX): vmulpd / _mm256_mul_pd
-                4u;
+                    // addition (AVX): vaddpd / _mm256_add_pd
+                    // subtraction (AVX): vsubpd / _mm256_sub_pd
+                    // multiplication (AVX): vmulpd / _mm256_mul_pd
+                    4u;
 #elif defined(__SSE2__)
-                // addition (SSE2): addpd / _mm_add_pd
-                // subtraction (SSE2): subpd / _mm_sub_pd
-                // multiplication (SSE2): mulpd / _mm_mul_pd
-                2u;
+                    // addition (SSE2): addpd / _mm_add_pd
+                    // subtraction (SSE2): subpd / _mm_sub_pd
+                    // multiplication (SSE2): mulpd / _mm_mul_pd
+                    2u;
 #elif defined(__ARM_NEON__)
-                // No support for double precision vectorization!
-                1u;
+                    // No support for double precision vectorization!
+                    1u;
 #elif defined(__ALTIVEC__)
-                2u;
+                    2u;
 #else
-                1u;
+                    1u;
 #endif
             };
             //-----------------------------------------------------------------------------
             // Number of elements of the given type that can be processed in parallel in a vector register.
             template<>
-            struct GetVectorizationSizeElems<
-                float>
+            struct GetVectorizationSizeElems<float>
             {
                 static constexpr std::size_t value =
 #if defined(__AVX512F__) || defined(__MIC__)
-                // addition (AVX512F,KNC): vaddps / _mm512_add_ps
-                // subtraction (AVX512F,KNC): vsubps / _mm512_sub_ps
-                // multiplication (AVX512F,KNC): vmulps / _mm512_mul_ps
-                16u;
+                    // addition (AVX512F,KNC): vaddps / _mm512_add_ps
+                    // subtraction (AVX512F,KNC): vsubps / _mm512_sub_ps
+                    // multiplication (AVX512F,KNC): vmulps / _mm512_mul_ps
+                    16u;
 #elif defined(__AVX__)
-                // addition (AVX): vaddps / _mm256_add_ps
-                // subtraction (AVX): vsubps / _mm256_sub_ps
-                // multiplication (AVX): vmulps / _mm256_mul_ps
-                8u;
+                    // addition (AVX): vaddps / _mm256_add_ps
+                    // subtraction (AVX): vsubps / _mm256_sub_ps
+                    // multiplication (AVX): vmulps / _mm256_mul_ps
+                    8u;
 #elif defined(__SSE__)
-                // addition (SSE): addps / _mm_add_ps
-                // subtraction (SSE): subps / _mm_sub_ps
-                // multiplication (SSE): mulps / _mm_mul_ps
-                4u;
+                    // addition (SSE): addps / _mm_add_ps
+                    // subtraction (SSE): subps / _mm_sub_ps
+                    // multiplication (SSE): mulps / _mm_mul_ps
+                    4u;
 #elif defined(__ARM_NEON__)
-                4u;
+                    4u;
 #elif defined(__ALTIVEC__)
-                4u;
+                    4u;
 #else
-                1u;
+                    1u;
 #endif
             };
             //-----------------------------------------------------------------------------
             // Number of elements of the given type that can be processed in parallel in a vector register.
             template<>
-            struct GetVectorizationSizeElems<
-                std::int8_t>
+            struct GetVectorizationSizeElems<std::int8_t>
             {
                 static constexpr std::size_t value =
 #if defined(__AVX512BW__)
-                // addition (AVX512BW): vpaddb / _mm512_mask_add_epi8
-                // subtraction (AVX512BW): vpsubb / _mm512_sub_epi8
-                // multiplication: -
-                64u;
+                    // addition (AVX512BW): vpaddb / _mm512_mask_add_epi8
+                    // subtraction (AVX512BW): vpsubb / _mm512_sub_epi8
+                    // multiplication: -
+                    64u;
 #elif defined(__AVX2__)
-                // addition (AVX2): vpaddb / _mm256_add_epi8
-                // subtraction (AVX2): vpsubb / _mm256_sub_epi8
-                // multiplication: -
-                32u;
+                    // addition (AVX2): vpaddb / _mm256_add_epi8
+                    // subtraction (AVX2): vpsubb / _mm256_sub_epi8
+                    // multiplication: -
+                    32u;
 #elif defined(__SSE2__)
-                // addition (SSE2): paddb / _mm_add_epi8
-                // subtraction (SSE2): psubb / _mm_sub_epi8
-                // multiplication: -
-                16u;
+                    // addition (SSE2): paddb / _mm_add_epi8
+                    // subtraction (SSE2): psubb / _mm_sub_epi8
+                    // multiplication: -
+                    16u;
 #elif defined(__ARM_NEON__)
-                16u;
+                    16u;
 #elif defined(__ALTIVEC__)
-                16u;
+                    16u;
 #elif defined(__CUDA_ARCH__)
-                // addition: __vadd4
-                // subtraction: __vsub4
-                // multiplication: -
-                4u;
+                    // addition: __vadd4
+                    // subtraction: __vsub4
+                    // multiplication: -
+                    4u;
 #else
-                1u;
+                    1u;
 #endif
             };
             //-----------------------------------------------------------------------------
             // Number of elements of the given type that can be processed in parallel in a vector register.
             template<>
-            struct GetVectorizationSizeElems<
-                std::uint8_t>
+            struct GetVectorizationSizeElems<std::uint8_t>
             {
                 static constexpr std::size_t value =
 #if defined(__AVX512BW__)
-                // addition (AVX512BW): vpaddb / _mm512_mask_add_epi8
-                // subtraction (AVX512BW): vpsubb / _mm512_sub_epi8
-                // multiplication: -
-                64u;
+                    // addition (AVX512BW): vpaddb / _mm512_mask_add_epi8
+                    // subtraction (AVX512BW): vpsubb / _mm512_sub_epi8
+                    // multiplication: -
+                    64u;
 #elif defined(__AVX2__)
-                // addition (AVX2): vpaddb / _mm256_add_epi8
-                // subtraction (AVX2): vpsubb / _mm256_sub_epi8
-                // multiplication: -
-                32u;
+                    // addition (AVX2): vpaddb / _mm256_add_epi8
+                    // subtraction (AVX2): vpsubb / _mm256_sub_epi8
+                    // multiplication: -
+                    32u;
 #elif defined(__SSE2__)
-                // addition (SSE2): paddb / _mm_add_epi8
-                // subtraction (SSE2): psubb / _mm_sub_epi8
-                // multiplication: -
-                16u;
+                    // addition (SSE2): paddb / _mm_add_epi8
+                    // subtraction (SSE2): psubb / _mm_sub_epi8
+                    // multiplication: -
+                    16u;
 #elif defined(__ARM_NEON__)
-                16u;
+                    16u;
 #elif defined(__ALTIVEC__)
-                16u;
+                    16u;
 #elif defined(__CUDA_ARCH__)
-                // addition: __vadd4
-                // subtraction: __vsub4
-                // multiplication: -
-                4u;
+                    // addition: __vadd4
+                    // subtraction: __vsub4
+                    // multiplication: -
+                    4u;
 #else
-                1u;
+                    1u;
 #endif
             };
             //-----------------------------------------------------------------------------
             // Number of elements of the given type that can be processed in parallel in a vector register.
             template<>
-            struct GetVectorizationSizeElems<
-                std::int16_t>
+            struct GetVectorizationSizeElems<std::int16_t>
             {
                 static constexpr std::size_t value =
 #if defined(__AVX512BW__)
-                // addition (AVX512BW): vpaddw / _mm512_mask_add_epi16
-                // subtraction (AVX512BW): vpsubw / _mm512_mask_sub_epi16
-                // multiplication (AVX512BW): vpmullw / _mm512_mask_mullo_epi16
-                32u;
+                    // addition (AVX512BW): vpaddw / _mm512_mask_add_epi16
+                    // subtraction (AVX512BW): vpsubw / _mm512_mask_sub_epi16
+                    // multiplication (AVX512BW): vpmullw / _mm512_mask_mullo_epi16
+                    32u;
 #elif defined(__AVX2__)
-                // addition (AVX2): vpaddw / _mm256_add_epi16
-                // subtraction (AVX2): vpsubw / _mm256_sub_epi16
-                // multiplication (AVX2): vpmullw / _mm256_mullo_epi16
-                16u;
+                    // addition (AVX2): vpaddw / _mm256_add_epi16
+                    // subtraction (AVX2): vpsubw / _mm256_sub_epi16
+                    // multiplication (AVX2): vpmullw / _mm256_mullo_epi16
+                    16u;
 #elif defined(__SSE2__)
-                // addition (SSE2): paddw / _mm_add_epi16
-                // subtraction (SSE2): psubw / _mm_sub_epi16
-                // multiplication (SSE2): pmullw / _mm_mullo_epi16
-                8u;
+                    // addition (SSE2): paddw / _mm_add_epi16
+                    // subtraction (SSE2): psubw / _mm_sub_epi16
+                    // multiplication (SSE2): pmullw / _mm_mullo_epi16
+                    8u;
 #elif defined(__ARM_NEON__)
-                8u;
+                    8u;
 #elif defined(__ALTIVEC__)
-                8u;
+                    8u;
 #elif defined(__CUDA_ARCH__)
-                // addition: __vadd2
-                // subtraction: __vsub2
-                // multiplication: -
-                2u;
+                    // addition: __vadd2
+                    // subtraction: __vsub2
+                    // multiplication: -
+                    2u;
 #else
-                1u;
+                    1u;
 #endif
             };
             //-----------------------------------------------------------------------------
             // Number of elements of the given type that can be processed in parallel in a vector register.
             template<>
-            struct GetVectorizationSizeElems<
-                std::uint16_t>
+            struct GetVectorizationSizeElems<std::uint16_t>
             {
                 static constexpr std::size_t value =
 #if defined(__AVX512BW__)
-                // addition (AVX512BW): vpaddusw / _mm512_mask_adds_epu16
-                // subtraction (AVX512BW): vpsubw / _mm512_subs_epu16
-                // multiplication: ?
-                32u;
+                    // addition (AVX512BW): vpaddusw / _mm512_mask_adds_epu16
+                    // subtraction (AVX512BW): vpsubusw / _mm512_subs_epu16
+                    // multiplication: ?
+                    32u;
 #elif defined(__AVX2__)
-                // addition (AVX2): vpaddusw / _mm256_adds_epu16
-                // subtraction (AVX2): vpsubusw / _mm256_subs_epu16
-                // multiplication: ?
-                16u;
+                    // addition (AVX2): vpaddusw / _mm256_adds_epu16
+                    // subtraction (AVX2): vpsubusw / _mm256_subs_epu16
+                    // multiplication: ?
+                    16u;
 #elif defined(__SSE2__)
-                // addition (SSE2): paddusw / _mm_adds_epu16
-                // subtraction (SSE2): psubusw / _mm_subs_epu16
-                // multiplication: ?
-                8u;
+                    // addition (SSE2): paddusw / _mm_adds_epu16
+                    // subtraction (SSE2): psubusw / _mm_subs_epu16
+                    // multiplication: ?
+                    8u;
 #elif defined(__ARM_NEON__)
-                8u;
+                    8u;
 #elif defined(__ALTIVEC__)
-                8u;
+                    8u;
 #elif defined(__CUDA_ARCH__)
-                // addition: __vadd2
-                // subtraction: __vsub2
-                // multiplication: -
-                2u;
+                    // addition: __vadd2
+                    // subtraction: __vsub2
+                    // multiplication: -
+                    2u;
 #else
-                1u;
+                    1u;
 #endif
             };
             //-----------------------------------------------------------------------------
             // Number of elements of the given type that can be processed in parallel in a vector register.
             template<>
-            struct GetVectorizationSizeElems<
-                std::int32_t>
+            struct GetVectorizationSizeElems<std::int32_t>
             {
                 static constexpr std::size_t value =
 #if defined(__AVX512F__) || defined(__MIC__)
-                // addition (AVX512F,KNC): vpaddd / _mm512_mask_add_epi32
-                // subtraction (AVX512F,KNC): vpsubd / _mm512_mask_sub_epi32
-                // multiplication (AVX512F,KNC): vpmulld / _mm512_mask_mullo_epi32
-                16u;
+                    // addition (AVX512F,KNC): vpaddd / _mm512_mask_add_epi32
+                    // subtraction (AVX512F,KNC): vpsubd / _mm512_mask_sub_epi32
+                    // multiplication (AVX512F,KNC): vpmulld / _mm512_mask_mullo_epi32
+                    16u;
 #elif defined(__AVX2__)
-                // addition (AVX2): vpaddd / _mm256_add_epi32
-                // subtraction (AVX2): vpsubd / _mm256_sub_epi32
-                // multiplication (AVX2): vpmulld / _mm256_mullo_epi32
-                8u;
+                    // addition (AVX2): vpaddd / _mm256_add_epi32
+                    // subtraction (AVX2): vpsubd / _mm256_sub_epi32
+                    // multiplication (AVX2): vpmulld / _mm256_mullo_epi32
+                    8u;
 #elif defined(__SSE2__)
-                // addition (SSE2): paddd / _mm_add_epi32
-                // subtraction (SSE2): psubd / _mm_sub_epi32
-                // multiplication (SSE4.1): pmulld / _mm_mullo_epi32
-                4u;
+                    // addition (SSE2): paddd / _mm_add_epi32
+                    // subtraction (SSE2): psubd / _mm_sub_epi32
+                    // multiplication (SSE4.1): pmulld / _mm_mullo_epi32
+                    4u;
 #elif defined(__ARM_NEON__)
-                4u;
+                    4u;
 #elif defined(__ALTIVEC__)
-                4u;
+                    4u;
 #else
-                1u;
+                    1u;
 #endif
             };
             //-----------------------------------------------------------------------------
             // Number of elements of the given type that can be processed in parallel in a vector register.
             template<>
-            struct GetVectorizationSizeElems<
-                std::uint32_t>
+            struct GetVectorizationSizeElems<std::uint32_t>
             {
                 static constexpr std::size_t value =
 #if defined(__AVX512F__) || defined(__MIC__)
-                // addition (AVX512F,KNC): vpaddd / _mm512_mask_add_epi32
-                // subtraction (AVX512F,KNC): vpsubd / _mm512_mask_sub_epi32
-                // multiplication: ?
-                16u;
+                    // addition (AVX512F,KNC): vpaddd / _mm512_mask_add_epi32
+                    // subtraction (AVX512F,KNC): vpsubd / _mm512_mask_sub_epi32
+                    // multiplication: ?
+                    16u;
 #elif defined(__AVX2__)
-                // addition (AVX2): vpaddd / _mm256_add_epi32
-                // subtraction (AVX2): vpsubd / _mm256_sub_epi32
-                // multiplication: ?
-                8u;
+                    // addition (AVX2): vpaddd / _mm256_add_epi32
+                    // subtraction (AVX2): vpsubd / _mm256_sub_epi32
+                    // multiplication: ?
+                    8u;
 #elif defined(__SSE2__)
-                // addition (SSE2): paddd / _mm_add_epi32
-                // subtraction (SSE2): psubd / _mm_sub_epi32
-                // multiplication: ?
-                4u;
+                    // addition (SSE2): paddd / _mm_add_epi32
+                    // subtraction (SSE2): psubd / _mm_sub_epi32
+                    // multiplication: ?
+                    4u;
 #elif defined(__ARM_NEON__)
-                4u;
+                    4u;
 #elif defined(__ALTIVEC__)
-                4u;
+                    4u;
 #else
-                1u;
+                    1u;
 #endif
             };
             //-----------------------------------------------------------------------------
             // Number of elements of the given type that can be processed in parallel in a vector register.
             template<>
-            struct GetVectorizationSizeElems<
-                std::int64_t>
+            struct GetVectorizationSizeElems<std::int64_t>
             {
                 static constexpr std::size_t value =
 #if defined(__AVX512F__)
-                // addition (AVX512F): vpaddq / _mm512_mask_add_epi64
-                // subtraction (AVX512F): vpsubq / _mm512_mask_sub_epi64
-                // multiplication (AVX512DQ): vpmullq / _mm512_mask_mullo_epi64
-                8u;
+                    // addition (AVX512F): vpaddq / _mm512_mask_add_epi64
+                    // subtraction (AVX512F): vpsubq / _mm512_mask_sub_epi64
+                    // multiplication (AVX512DQ): vpmullq / _mm512_mask_mullo_epi64
+                    8u;
 #elif defined(__AVX2__)
-                // addition (AVX2): vpaddq / _mm256_add_epi64
-                // subtraction (AVX2): vpsubq / _mm256_sub_epi64
-                // multiplication: -
-                4u;
+                    // addition (AVX2): vpaddq / _mm256_add_epi64
+                    // subtraction (AVX2): vpsubq / _mm256_sub_epi64
+                    // multiplication: -
+                    4u;
 #elif defined(__SSE2__)
-                // addition (SSE2): paddq / _mm_add_epi64
-                // subtraction (SSE2): psubq / _mm_sub_epi64
-                // multiplication: -
-                2u;
+                    // addition (SSE2): paddq / _mm_add_epi64
+                    // subtraction (SSE2): psubq / _mm_sub_epi64
+                    // multiplication: -
+                    2u;
 #elif defined(__ARM_NEON__)
-                2u;
+                    2u;
 #else
-                1u;
+                    1u;
 #endif
             };
             //-----------------------------------------------------------------------------
             // Number of elements of the given type that can be processed in parallel in a vector register.
             template<>
-            struct GetVectorizationSizeElems<
-                std::uint64_t>
+            struct GetVectorizationSizeElems<std::uint64_t>
             {
                 static constexpr std::size_t value =
 #if defined(__AVX512F__)
-                // addition (AVX512F): vpaddq / _mm512_mask_add_epi64
-                // subtraction (AVX512F): vpsubq / _mm512_mask_sub_epi64
-                // multiplication: ?
-                8u;
+                    // addition (AVX512F): vpaddq / _mm512_mask_add_epi64
+                    // subtraction (AVX512F): vpsubq / _mm512_mask_sub_epi64
+                    // multiplication: ?
+                    8u;
 #elif defined(__AVX2__)
-                // addition (AVX2): vpaddq / _mm256_add_epi64
-                // subtraction (AVX2): vpsubq / _mm256_sub_epi64
-                // multiplication: ?
-                4u;
+                    // addition (AVX2): vpaddq / _mm256_add_epi64
+                    // subtraction (AVX2): vpsubq / _mm256_sub_epi64
+                    // multiplication: ?
+                    4u;
 #elif defined(__SSE2__)
-                // addition (SSE2): paddq / _mm_add_epi64
-                // subtraction (SSE2): psubq / _mm_sub_epi64
-                // multiplication: ?
-                2u;
+                    // addition (SSE2): paddq / _mm_add_epi64
+                    // subtraction (SSE2): psubq / _mm_sub_epi64
+                    // multiplication: ?
+                    2u;
 #elif defined(__ARM_NEON__)
-                2u;
+                    2u;
 #else
-                1u;
+                    1u;
 #endif
             };
-        }
-    }
-}
+        } // namespace vectorization
+    } // namespace core
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/include/alpaka/ctx/block/CtxBlockOacc.hpp b/thirdParty/cupla/alpaka/include/alpaka/ctx/block/CtxBlockOacc.hpp
new file mode 100644
index 0000000000..1e057a0340
--- /dev/null
+++ b/thirdParty/cupla/alpaka/include/alpaka/ctx/block/CtxBlockOacc.hpp
@@ -0,0 +1,251 @@
+/* Copyright 2020 Jeffrey Kelling
+ *
+ * This file is part of alpaka.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#pragma once
+
+#ifdef ALPAKA_ACC_ANY_BT_OACC_ENABLED
+
+#    if _OPENACC < 201306
+#        error If ALPAKA_ACC_ANY_BT_OACC_ENABLED is set, the compiler has to support OpenACC 2.0 or higher!
+#    endif
+
+// Base classes.
+#    include <alpaka/block/shared/dyn/BlockSharedMemDynMember.hpp>
+#    include <alpaka/block/shared/st/BlockSharedMemStMember.hpp>
+#    include <alpaka/block/sync/BlockSyncBarrierOacc.hpp>
+#    include <alpaka/idx/gb/IdxGbLinear.hpp>
+#    include <alpaka/workdiv/WorkDivMembers.hpp>
+
+// Specialized traits.
+#    include <alpaka/idx/Traits.hpp>
+
+#    include <limits>
+#    include <typeinfo>
+
+namespace alpaka
+{
+    template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
+    class TaskKernelOacc;
+
+    //#############################################################################
+    //! The OpenACC block context.
+    template<typename TDim, typename TIdx>
+    class CtxBlockOacc final
+        : public WorkDivMembers<TDim, TIdx>
+        , public gb::IdxGbLinear<TDim, TIdx>
+        , public BlockSharedMemDynMember<>
+        , public detail::BlockSharedMemStMemberImpl<4>
+        , public BlockSyncBarrierOacc
+        , public concepts::Implements<ConceptBlockSharedSt, CtxBlockOacc<TDim, TIdx>>
+    {
+    public:
+        // Partial specialization with the correct TDim and TIdx is not allowed.
+        template<typename TDim2, typename TIdx2, typename TKernelFnObj, typename... TArgs>
+        friend class ::alpaka::TaskKernelOacc;
+
+    protected:
+        //-----------------------------------------------------------------------------
+        CtxBlockOacc(
+            Vec<TDim, TIdx> const& gridBlockExtent,
+            Vec<TDim, TIdx> const& blockThreadExtent,
+            Vec<TDim, TIdx> const& threadElemExtent,
+            TIdx const& gridBlockIdx,
+            std::size_t const& blockSharedMemDynSizeBytes)
+            : WorkDivMembers<TDim, TIdx>(gridBlockExtent, blockThreadExtent, threadElemExtent)
+            , gb::IdxGbLinear<TDim, TIdx>(gridBlockIdx)
+            , BlockSharedMemDynMember<>(blockSharedMemDynSizeBytes)
+            ,
+            //! \TODO Can some TMP determine the amount of statically allocated smem from the kernelFuncObj?
+            detail::BlockSharedMemStMemberImpl<4>(staticMemBegin(), staticMemCapacity())
+            , BlockSyncBarrierOacc()
+        {
+        }
+
+    public:
+        //-----------------------------------------------------------------------------
+        CtxBlockOacc(CtxBlockOacc const&) = delete;
+        //-----------------------------------------------------------------------------
+        CtxBlockOacc(CtxBlockOacc&&) = delete;
+        //-----------------------------------------------------------------------------
+        auto operator=(CtxBlockOacc const&) -> CtxBlockOacc& = delete;
+        //-----------------------------------------------------------------------------
+        auto operator=(CtxBlockOacc&&) -> CtxBlockOacc& = delete;
+        //-----------------------------------------------------------------------------
+        ~CtxBlockOacc() = default;
+    };
+
+    namespace traits
+    {
+        //#############################################################################
+        template<typename TDim, typename TIdx>
+        struct SyncBlockThreads<CtxBlockOacc<TDim, TIdx>>
+        {
+            //-----------------------------------------------------------------------------
+            //! Execute op with a single thread (any idx; the last thread to arrive at the barrier executes it),
+            //! syncing all block threads before and after: [slot] counts the first barrier, [slot + 1] the second.
+            template<typename TOp>
+            ALPAKA_FN_HOST static auto masterOpBlockThreads(CtxBlockOacc<TDim, TIdx> const& acc, TOp&& op) -> void
+            {
+                const auto slot = (acc.m_generation & 1) << 1;
+                const int workerNum = static_cast<int>(getWorkDiv<Block, Threads>(acc).prod());
+                int sum;
+                // Workaround: the atomic captures use a local pointer instead of the m_syncCounter data member.
+                // Semantically equivalent; the member should work per the OpenACC standard, but an apparent compiler
+                // issue caused a runtime error with NVIDIA HPC Compiler 20.7 (later releases may fix it).
+                int* m_syncCounter = acc.m_syncCounter;
+                // First barrier: register this thread's arrival on [slot].
+#    pragma acc atomic capture
+                {
+                    ++m_syncCounter[slot];
+                    sum = m_syncCounter[slot];
+                }
+                if(sum == workerNum)
+                {
+                    // Last arrival: advance the generation, zero the next generation's counter pair, then run op.
+                    ++acc.m_generation;
+                    const int nextSlot = (acc.m_generation & 1) << 1;
+                    m_syncCounter[nextSlot] = 0;
+                    m_syncCounter[nextSlot + 1] = 0;
+                    op();
+                }
+                while(sum < workerNum)
+                {
+#    pragma acc atomic read
+                    sum = m_syncCounter[slot];
+                }
+                // Second barrier on [slot + 1]; [slot] already equals workerNum here and could not block anyone.
+#    pragma acc atomic capture
+                {
+                    ++m_syncCounter[slot + 1];
+                    sum = m_syncCounter[slot + 1];
+                }
+                while(sum < workerNum)
+                {
+#    pragma acc atomic read
+                    sum = m_syncCounter[slot + 1];
+                }
+            }
+
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto syncBlockThreads(CtxBlockOacc<TDim, TIdx> const& acc) -> void
+            {
+                masterOpBlockThreads<>(acc, []() {});
+            }
+        };
+
+        namespace oacc
+        {
+            namespace detail
+            {
+                //#############################################################################
+                template<typename TOp>
+                struct AtomicOp;
+                //#############################################################################
+                template<>
+                struct AtomicOp<BlockCount>
+                {
+                    void operator()(int& result, bool value)
+                    {
+#    pragma acc atomic update
+                        result += static_cast<int>(value);
+                    }
+                };
+                //#############################################################################
+                template<>
+                struct AtomicOp<BlockAnd>
+                {
+                    void operator()(int& result, bool value)
+                    {
+#    pragma acc atomic update
+                        result &= static_cast<int>(value);
+                    }
+                };
+                //#############################################################################
+                template<>
+                struct AtomicOp<BlockOr>
+                {
+                    void operator()(int& result, bool value)
+                    {
+#    pragma acc atomic update
+                        result |= static_cast<int>(value);
+                    }
+                };
+            } // namespace detail
+        } // namespace oacc
+
+        //#############################################################################
+        template<typename TOp, typename TDim, typename TIdx>
+        struct SyncBlockThreadsPredicate<TOp, CtxBlockOacc<TDim, TIdx>>
+        {
+            //-----------------------------------------------------------------------------
+            ALPAKA_NO_HOST_ACC_WARNING
+            ALPAKA_FN_ACC static auto syncBlockThreadsPredicate(
+                CtxBlockOacc<TDim, TIdx> const& blockSync,
+                int predicate) -> int
+            {
+                // implicit sync
+                SyncBlockThreads<CtxBlockOacc<TDim, TIdx>>::masterOpBlockThreads(blockSync, [&blockSync]() {
+                    blockSync.m_result = TOp::InitialValue;
+                });
+
+                int& result(blockSync.m_result);
+                bool const predicateBool(predicate != 0);
+
+                oacc::detail::AtomicOp<TOp>()(result, predicateBool);
+
+                SyncBlockThreads<CtxBlockOacc<TDim, TIdx>>::syncBlockThreads(blockSync);
+
+                return blockSync.m_result;
+            }
+        };
+
+        //#############################################################################
+        //! The OpenACC accelerator dimension getter trait specialization.
+        template<typename TDim, typename TIdx>
+        struct DimType<CtxBlockOacc<TDim, TIdx>>
+        {
+            using type = TDim;
+        };
+
+        //#############################################################################
+        //! The OpenACC accelerator idx type trait specialization.
+        template<typename TDim, typename TIdx>
+        struct IdxType<CtxBlockOacc<TDim, TIdx>>
+        {
+            using type = TIdx;
+        };
+
+        //#############################################################################
+        template<typename T, typename TDim, typename TIdx, std::size_t TuniqueId>
+        struct DeclareSharedVar<T, TuniqueId, CtxBlockOacc<TDim, TIdx>>
+        {
+            //-----------------------------------------------------------------------------
+            static auto declareVar(CtxBlockOacc<TDim, TIdx> const& smem) -> T&
+            {
+                traits::SyncBlockThreads<CtxBlockOacc<TDim, TIdx>>::masterOpBlockThreads(smem, [&smem]() {
+                    smem.template alloc<T>();
+                });
+                return smem.template getLatestVar<T>();
+            }
+        };
+
+        //#############################################################################
+        template<typename TDim, typename TIdx>
+        struct FreeSharedVars<CtxBlockOacc<TDim, TIdx>>
+        {
+            //-----------------------------------------------------------------------------
+            static auto freeVars(CtxBlockOacc<TDim, TIdx> const&) -> void
+            {
+                // Nothing to do. Block shared memory is automatically freed when all threads left the block.
+            }
+        };
+    } // namespace traits
+} // namespace alpaka
+
+#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/dev/DevCpu.hpp b/thirdParty/cupla/alpaka/include/alpaka/dev/DevCpu.hpp
index 9fa0b6650e..6c5754eb2d 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/dev/DevCpu.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/dev/DevCpu.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Axel Huebl, Benjamin Worpitz, Matthias Werner
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -9,307 +9,268 @@
 
 #pragma once
 
+#include <alpaka/core/Unused.hpp>
 #include <alpaka/dev/Traits.hpp>
+#include <alpaka/dev/cpu/SysInfo.hpp>
 #include <alpaka/mem/buf/Traits.hpp>
 #include <alpaka/pltf/Traits.hpp>
-#include <alpaka/wait/Traits.hpp>
-
-#include <alpaka/queue/cpu/ICpuQueue.hpp>
-#include <alpaka/core/Unused.hpp>
-#include <alpaka/dev/cpu/SysInfo.hpp>
-
-#include <alpaka/queue/Traits.hpp>
 #include <alpaka/queue/Properties.hpp>
+#include <alpaka/queue/QueueGenericThreadsBlocking.hpp>
+#include <alpaka/queue/QueueGenericThreadsNonBlocking.hpp>
+#include <alpaka/queue/Traits.hpp>
+#include <alpaka/queue/cpu/IGenericThreadsQueue.hpp>
+#include <alpaka/wait/Traits.hpp>
 
+#include <algorithm>
 #include <map>
-#include <mutex>
 #include <memory>
+#include <mutex>
 #include <vector>
-#include <algorithm>
 
 namespace alpaka
 {
-    namespace queue
+    class DevCpu;
+    namespace cpu
     {
-        class QueueCpuNonBlocking;
-        class QueueCpuBlocking;
-
-        namespace cpu
-        {
-            namespace detail
-            {
-                class QueueCpuNonBlockingImpl;
-                class QueueCpuBlockingImpl;
-            }
-        }
+        using ICpuQueue = IGenericThreadsQueue<DevCpu>;
     }
-    namespace pltf
+    namespace traits
     {
-        namespace traits
-        {
-            template<
-                typename TPltf,
-                typename TSfinae>
-            struct GetDevByIdx;
-        }
-        class PltfCpu;
+        template<typename TPltf, typename TSfinae>
+        struct GetDevByIdx;
     }
-    namespace dev
+    class PltfCpu;
+
+    //-----------------------------------------------------------------------------
+    //! The CPU device.
+    namespace cpu
     {
-        //-----------------------------------------------------------------------------
-        //! The CPU device.
-        namespace cpu
+        namespace detail
         {
-            namespace detail
+            //#############################################################################
+            //! The CPU device implementation.
+            class DevCpuImpl
             {
-                //#############################################################################
-                //! The CPU device implementation.
-                class DevCpuImpl
-                {
-                private:
+            public:
+                //-----------------------------------------------------------------------------
+                DevCpuImpl() = default;
+                //-----------------------------------------------------------------------------
+                DevCpuImpl(DevCpuImpl const&) = delete;
+                //-----------------------------------------------------------------------------
+                DevCpuImpl(DevCpuImpl&&) = delete;
+                //-----------------------------------------------------------------------------
+                auto operator=(DevCpuImpl const&) -> DevCpuImpl& = delete;
+                //-----------------------------------------------------------------------------
+                auto operator=(DevCpuImpl&&) -> DevCpuImpl& = delete;
+                //-----------------------------------------------------------------------------
+                ~DevCpuImpl() = default;
 
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST auto GetAllQueueImpls(
-                        std::vector<std::weak_ptr<queue::cpu::ICpuQueue>> & queues) const
-                    -> std::vector<std::shared_ptr<queue::cpu::ICpuQueue>>
-                    {
-                        std::vector<std::shared_ptr<queue::cpu::ICpuQueue>> vspQueues;
+                //! Returns shared_ptrs to all registered queues that still exist; expired entries are erased.
+                ALPAKA_FN_HOST auto getAllExistingQueues() const -> std::vector<std::shared_ptr<cpu::ICpuQueue>>
+                {
+                    std::vector<std::shared_ptr<cpu::ICpuQueue>> vspQueues;
 
-                        std::lock_guard<std::mutex> lk(m_Mutex);
+                    std::lock_guard<std::mutex> lk(m_Mutex);
+                    vspQueues.reserve(m_queues.size()); // Upper bound: expired entries are skipped below.
 
-                        for(auto it = queues.begin(); it != queues.end();)
+                    for(auto it = m_queues.begin(); it != m_queues.end();)
+                    {
+                        auto spQueue(it->lock()); // Empty if the queue has already been destroyed.
+                        if(spQueue)
+                        {
+                            vspQueues.emplace_back(std::move(spQueue));
+                            ++it;
+                        }
+                        else
                         {
-                            auto spQueue(it->lock());
-                            if(spQueue)
-                            {
-                                vspQueues.emplace_back(std::move(spQueue));
-                                ++it;
-                            }
-                            else
-                            {
-                                it = queues.erase(it);
-                            }
+                            it = m_queues.erase(it); // erase() yields the iterator following the removed entry.
                         }
-                        return vspQueues;
                     }
+                    return vspQueues;
+                }
 
-                public:
-                    //-----------------------------------------------------------------------------
-                    DevCpuImpl() = default;
-                    //-----------------------------------------------------------------------------
-                    DevCpuImpl(DevCpuImpl const &) = delete;
-                    //-----------------------------------------------------------------------------
-                    DevCpuImpl(DevCpuImpl &&) = delete;
-                    //-----------------------------------------------------------------------------
-                    auto operator=(DevCpuImpl const &) -> DevCpuImpl & = delete;
-                    //-----------------------------------------------------------------------------
-                    auto operator=(DevCpuImpl &&) -> DevCpuImpl & = delete;
-                    //-----------------------------------------------------------------------------
-                    ~DevCpuImpl() = default;
-
-                    ALPAKA_FN_HOST auto GetAllQueues() const
-                    -> std::vector<std::shared_ptr<queue::cpu::ICpuQueue>>
-                    {
-                        return GetAllQueueImpls(m_queues);
-                    }
+                //-----------------------------------------------------------------------------
+                //! Registers the given queue on this device.
+                //! NOTE: Every queue has to be registered for correct functionality of device wait operations!
+                ALPAKA_FN_HOST auto registerQueue(std::shared_ptr<cpu::ICpuQueue> spQueue) const -> void
+                {
+                    std::lock_guard<std::mutex> lk(m_Mutex);
 
-                    //-----------------------------------------------------------------------------
-                    //! Registers the given queue on this device.
-                    //! NOTE: Every queue has to be registered for correct functionality of device wait operations!
-                    ALPAKA_FN_HOST auto RegisterQueue(std::shared_ptr<queue::cpu::ICpuQueue> spQueue)
-                    -> void
-                    {
-                        std::lock_guard<std::mutex> lk(m_Mutex);
+                    // Register this queue on the device.
+                    m_queues.push_back(spQueue); // Held as weak_ptr, so the device does not own the queue.
+                }
 
-                        // Register this queue on the device.
-                        m_queues.push_back(spQueue);
-                    }
+            private:
+                std::mutex mutable m_Mutex; // mutable: locked inside const member functions.
+                std::vector<std::weak_ptr<cpu::ICpuQueue>> mutable m_queues; // Accessed under m_Mutex only.
+            };
+        } // namespace detail
+    } // namespace cpu
 
-                private:
-                    std::mutex mutable m_Mutex;
-                    std::vector<std::weak_ptr<queue::cpu::ICpuQueue>> mutable m_queues;
-                };
-            }
+    //#############################################################################
+    //! The CPU device handle. Copies of a handle share one cpu::detail::DevCpuImpl through a std::shared_ptr.
+    class DevCpu
+        : public concepts::Implements<ConceptCurrentThreadWaitFor, DevCpu>
+        , public concepts::Implements<ConceptDev, DevCpu>
+    {
+        friend struct traits::GetDevByIdx<PltfCpu>;
+
+    protected:
+        //! Protected: handles are created by the befriended traits::GetDevByIdx<PltfCpu> (or derived classes).
+        DevCpu() : m_spDevCpuImpl(std::make_shared<cpu::detail::DevCpuImpl>())
+        {
         }
 
+    public:
+        //-----------------------------------------------------------------------------
+        DevCpu(DevCpu const&) = default;
+        //-----------------------------------------------------------------------------
+        DevCpu(DevCpu&&) = default;
+        //-----------------------------------------------------------------------------
+        auto operator=(DevCpu const&) -> DevCpu& = default;
+        //-----------------------------------------------------------------------------
+        auto operator=(DevCpu&&) -> DevCpu& = default;
+        //! Always true: the comparison does not look at the state of either handle.
+        auto operator==(DevCpu const&) const -> bool
+        {
+            return true;
+        }
+        //-----------------------------------------------------------------------------
+        auto operator!=(DevCpu const& rhs) const -> bool
+        {
+            return !((*this) == rhs);
+        }
+        //-----------------------------------------------------------------------------
+        ~DevCpu() = default;
+
+        ALPAKA_FN_HOST auto getAllQueues() const -> std::vector<std::shared_ptr<cpu::ICpuQueue>>
+        {
+            return m_spDevCpuImpl->getAllExistingQueues(); // Forwarded to the shared implementation object.
+        }
+
+        //-----------------------------------------------------------------------------
+        //! Registers the given queue on this device.
+        //! NOTE: Every queue has to be registered for correct functionality of device wait operations!
+        ALPAKA_FN_HOST auto registerQueue(std::shared_ptr<cpu::ICpuQueue> spQueue) const -> void
+        {
+            m_spDevCpuImpl->registerQueue(std::move(spQueue)); // By-value parameter: hand over, do not copy.
+        }
+
+    public:
+        std::shared_ptr<cpu::detail::DevCpuImpl> m_spDevCpuImpl;
+    };
+
+    namespace traits
+    {
         //#############################################################################
-        //! The CPU device handle.
-        class DevCpu : public concepts::Implements<wait::ConceptCurrentThreadWaitFor, DevCpu>
+        //! The CPU device name get trait specialization.
+        template<>
+        struct GetName<DevCpu>
         {
-            friend struct pltf::traits::GetDevByIdx<pltf::PltfCpu>;
-        protected:
-            //-----------------------------------------------------------------------------
-            DevCpu() :
-                m_spDevCpuImpl(std::make_shared<cpu::detail::DevCpuImpl>())
-            {}
-        public:
-            //-----------------------------------------------------------------------------
-            DevCpu(DevCpu const &) = default;
-            //-----------------------------------------------------------------------------
-            DevCpu(DevCpu &&) = default;
             //-----------------------------------------------------------------------------
-            auto operator=(DevCpu const &) -> DevCpu & = default;
-            //-----------------------------------------------------------------------------
-            auto operator=(DevCpu &&) -> DevCpu & = default;
-            //-----------------------------------------------------------------------------
-            auto operator==(DevCpu const &) const
-            -> bool
+            ALPAKA_FN_HOST static auto getName(DevCpu const& dev) -> std::string
             {
-                return true;
+                alpaka::ignore_unused(dev);
+
+                return cpu::detail::getCpuName();
             }
+        };
+
+        //#############################################################################
+        //! The CPU device available memory get trait specialization.
+        template<>
+        struct GetMemBytes<DevCpu>
+        {
             //-----------------------------------------------------------------------------
-            auto operator!=(DevCpu const & rhs) const
-            -> bool
+            ALPAKA_FN_HOST static auto getMemBytes(DevCpu const& dev) -> std::size_t
             {
-                return !((*this) == rhs);
-            }
-            //-----------------------------------------------------------------------------
-            ~DevCpu() = default;
+                alpaka::ignore_unused(dev);
 
-        public:
-            std::shared_ptr<cpu::detail::DevCpuImpl> m_spDevCpuImpl;
+                return cpu::detail::getTotalGlobalMemSizeBytes();
+            }
         };
-    }
 
-    namespace dev
-    {
-        namespace traits
+        //#############################################################################
+        //! The CPU device free memory get trait specialization.
+        template<>
+        struct GetFreeMemBytes<DevCpu>
         {
-            //#############################################################################
-            //! The CPU device name get trait specialization.
-            template<>
-            struct GetName<
-                dev::DevCpu>
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto getFreeMemBytes(DevCpu const& dev) -> std::size_t
             {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto getName(
-                    dev::DevCpu const & dev)
-                -> std::string
-                {
-                    alpaka::ignore_unused(dev);
+                alpaka::ignore_unused(dev);
 
-                    return dev::cpu::detail::getCpuName();
-                }
-            };
+                return cpu::detail::getFreeGlobalMemSizeBytes();
+            }
+        };
 
-            //#############################################################################
-            //! The CPU device available memory get trait specialization.
-            template<>
-            struct GetMemBytes<
-                dev::DevCpu>
+        //#############################################################################
+        //! The CPU device warp size get trait specialization.
+        template<>
+        struct GetWarpSize<DevCpu>
+        {
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto getWarpSize(DevCpu const& dev) -> std::size_t
             {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto getMemBytes(
-                    dev::DevCpu const & dev)
-                -> std::size_t
-                {
-                    alpaka::ignore_unused(dev);
+                alpaka::ignore_unused(dev);
 
-                    return dev::cpu::detail::getTotalGlobalMemSizeBytes();
-                }
-            };
+                return 1u;
+            }
+        };
 
-            //#############################################################################
-            //! The CPU device free memory get trait specialization.
-            template<>
-            struct GetFreeMemBytes<
-                dev::DevCpu>
+        //#############################################################################
+        //! The CPU device reset trait specialization.
+        template<>
+        struct Reset<DevCpu>
+        {
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto reset(DevCpu const& dev) -> void
             {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto getFreeMemBytes(
-                    dev::DevCpu const & dev)
-                -> std::size_t
-                {
-                    alpaka::ignore_unused(dev);
+                ALPAKA_DEBUG_FULL_LOG_SCOPE;
 
-                    return dev::cpu::detail::getFreeGlobalMemSizeBytes();
-                }
-            };
+                alpaka::ignore_unused(dev);
 
-            //#############################################################################
-            //! The CPU device reset trait specialization.
-            template<>
-            struct Reset<
-                dev::DevCpu>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto reset(
-                    dev::DevCpu const & dev)
-                -> void
-                {
-                    ALPAKA_DEBUG_FULL_LOG_SCOPE;
+                // The CPU does nothing on reset.
+            }
+        };
+    } // namespace traits
 
-                    alpaka::ignore_unused(dev);
+    template<typename TElem, typename TDim, typename TIdx>
+    class BufCpu;
 
-                    // The CPU does nothing on reset.
-                }
-            };
-        }
-    }
-    namespace mem
+    namespace traits
     {
-        namespace buf
+        //#############################################################################
+        //! The CPU device memory buffer type trait specialization.
+        template<typename TElem, typename TDim, typename TIdx>
+        struct BufType<DevCpu, TElem, TDim, TIdx>
         {
-            template<
-                typename TElem,
-                typename TDim,
-                typename TIdx>
-            class BufCpu;
+            using type = BufCpu<TElem, TDim, TIdx>;
+        };
 
-            namespace traits
-            {
-                //#############################################################################
-                //! The CPU device memory buffer type trait specialization.
-                template<
-                    typename TElem,
-                    typename TDim,
-                    typename TIdx>
-                struct BufType<
-                    dev::DevCpu,
-                    TElem,
-                    TDim,
-                    TIdx>
-                {
-                    using type = mem::buf::BufCpu<TElem, TDim, TIdx>;
-                };
-            }
-        }
-    }
-    namespace pltf
-    {
-        namespace traits
+        //#############################################################################
+        //! The CPU device platform type trait specialization.
+        template<>
+        struct PltfType<DevCpu>
         {
-            //#############################################################################
-            //! The CPU device platform type trait specialization.
-            template<>
-            struct PltfType<
-                dev::DevCpu>
-            {
-                using type = pltf::PltfCpu;
-            };
-        }
-    }
-    namespace queue
+            using type = PltfCpu;
+        };
+    } // namespace traits
+    using QueueCpuNonBlocking = QueueGenericThreadsNonBlocking<DevCpu>;
+    using QueueCpuBlocking = QueueGenericThreadsBlocking<DevCpu>;
+
+    namespace traits
     {
-        namespace traits
+        template<>
+        struct QueueType<DevCpu, Blocking>
         {
-            template<>
-            struct QueueType<
-                dev::DevCpu,
-                queue::Blocking
-            >
-            {
-                using type = queue::QueueCpuBlocking;
-            };
+            using type = QueueCpuBlocking;
+        };
 
-            template<>
-            struct QueueType<
-                dev::DevCpu,
-                queue::NonBlocking
-            >
-            {
-                using type = queue::QueueCpuNonBlocking;
-            };
-        }
-    }
-}
+        template<>
+        struct QueueType<DevCpu, NonBlocking>
+        {
+            using type = QueueCpuNonBlocking;
+        };
+    } // namespace traits
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/include/alpaka/dev/DevCudaRt.hpp b/thirdParty/cupla/alpaka/include/alpaka/dev/DevCudaRt.hpp
deleted file mode 100644
index 8fd7322845..0000000000
--- a/thirdParty/cupla/alpaka/include/alpaka/dev/DevCudaRt.hpp
+++ /dev/null
@@ -1,291 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_CUDA
-    #error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
-#endif
-
-#include <alpaka/dev/Traits.hpp>
-#include <alpaka/mem/buf/Traits.hpp>
-#include <alpaka/pltf/Traits.hpp>
-#include <alpaka/wait/Traits.hpp>
-
-#include <alpaka/queue/Traits.hpp>
-#include <alpaka/queue/Properties.hpp>
-
-#include <alpaka/core/Cuda.hpp>
-
-namespace alpaka
-{
-    namespace pltf
-    {
-        namespace traits
-        {
-            template<
-                typename TPltf,
-                typename TSfinae>
-            struct GetDevByIdx;
-        }
-        class PltfCudaRt;
-    }
-
-    namespace queue
-    {
-        class QueueCudaRtBlocking;
-        class QueueCudaRtNonBlocking;
-    }
-
-    namespace dev
-    {
-        //#############################################################################
-        //! The CUDA RT device handle.
-        class DevCudaRt : public concepts::Implements<wait::ConceptCurrentThreadWaitFor, DevCudaRt>
-        {
-            friend struct pltf::traits::GetDevByIdx<pltf::PltfCudaRt>;
-
-        protected:
-            //-----------------------------------------------------------------------------
-            DevCudaRt() = default;
-        public:
-            //-----------------------------------------------------------------------------
-            DevCudaRt(DevCudaRt const &) = default;
-            //-----------------------------------------------------------------------------
-            DevCudaRt(DevCudaRt &&) = default;
-            //-----------------------------------------------------------------------------
-            auto operator=(DevCudaRt const &) -> DevCudaRt & = default;
-            //-----------------------------------------------------------------------------
-            auto operator=(DevCudaRt &&) -> DevCudaRt & = default;
-            //-----------------------------------------------------------------------------
-            ALPAKA_FN_HOST auto operator==(DevCudaRt const & rhs) const
-            -> bool
-            {
-                return m_iDevice == rhs.m_iDevice;
-            }
-            //-----------------------------------------------------------------------------
-            ALPAKA_FN_HOST auto operator!=(DevCudaRt const & rhs) const
-            -> bool
-            {
-                return !((*this) == rhs);
-            }
-            //-----------------------------------------------------------------------------
-            ~DevCudaRt() = default;
-
-        public:
-            int m_iDevice;
-        };
-    }
-
-    namespace dev
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CUDA RT device name get trait specialization.
-            template<>
-            struct GetName<
-                dev::DevCudaRt>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto getName(
-                    dev::DevCudaRt const & dev)
-                -> std::string
-                {
-                    // There is cudaDeviceGetAttribute as faster alternative to cudaGetDeviceProperties to get a single device property but it has no option to get the name
-                    cudaDeviceProp cudaDevProp;
-                    ALPAKA_CUDA_RT_CHECK(
-                        cudaGetDeviceProperties(
-                            &cudaDevProp,
-                            dev.m_iDevice));
-
-                    return std::string(cudaDevProp.name);
-                }
-            };
-
-            //#############################################################################
-            //! The CUDA RT device available memory get trait specialization.
-            template<>
-            struct GetMemBytes<
-                dev::DevCudaRt>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto getMemBytes(
-                    dev::DevCudaRt const & dev)
-                -> std::size_t
-                {
-                    // Set the current device to wait for.
-                    ALPAKA_CUDA_RT_CHECK(
-                        cudaSetDevice(
-                            dev.m_iDevice));
-
-                    std::size_t freeInternal(0u);
-                    std::size_t totalInternal(0u);
-
-                    ALPAKA_CUDA_RT_CHECK(
-                        cudaMemGetInfo(
-                            &freeInternal,
-                            &totalInternal));
-
-                    return totalInternal;
-                }
-            };
-
-            //#############################################################################
-            //! The CUDA RT device free memory get trait specialization.
-            template<>
-            struct GetFreeMemBytes<
-                dev::DevCudaRt>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto getFreeMemBytes(
-                    dev::DevCudaRt const & dev)
-                -> std::size_t
-                {
-                    // Set the current device to wait for.
-                    ALPAKA_CUDA_RT_CHECK(
-                        cudaSetDevice(
-                            dev.m_iDevice));
-
-                    std::size_t freeInternal(0u);
-                    std::size_t totalInternal(0u);
-
-                    ALPAKA_CUDA_RT_CHECK(
-                        cudaMemGetInfo(
-                            &freeInternal,
-                            &totalInternal));
-
-                    return freeInternal;
-                }
-            };
-
-            //#############################################################################
-            //! The CUDA RT device reset trait specialization.
-            template<>
-            struct Reset<
-                dev::DevCudaRt>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto reset(
-                    dev::DevCudaRt const & dev)
-                -> void
-                {
-                    ALPAKA_DEBUG_FULL_LOG_SCOPE;
-
-                    // Set the current device to wait for.
-                    ALPAKA_CUDA_RT_CHECK(
-                        cudaSetDevice(
-                            dev.m_iDevice));
-                    ALPAKA_CUDA_RT_CHECK(
-                        cudaDeviceReset());
-                }
-            };
-        }
-    }
-    namespace mem
-    {
-        namespace buf
-        {
-            template<
-                typename TElem,
-                typename TDim,
-                typename TIdx>
-            class BufCudaRt;
-
-            namespace traits
-            {
-                //#############################################################################
-                //! The CUDA RT device memory buffer type trait specialization.
-                template<
-                    typename TElem,
-                    typename TDim,
-                    typename TIdx>
-                struct BufType<
-                    dev::DevCudaRt,
-                    TElem,
-                    TDim,
-                    TIdx>
-                {
-                    using type = mem::buf::BufCudaRt<TElem, TDim, TIdx>;
-                };
-            }
-        }
-    }
-    namespace pltf
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CUDA RT device platform type trait specialization.
-            template<>
-            struct PltfType<
-                dev::DevCudaRt>
-            {
-                using type = pltf::PltfCudaRt;
-            };
-        }
-    }
-    namespace wait
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The thread CUDA device wait specialization.
-            //!
-            //! Blocks until the device has completed all preceding requested tasks.
-            //! Tasks that are enqueued or queues that are created after this call is made are not waited for.
-            template<>
-            struct CurrentThreadWaitFor<
-                dev::DevCudaRt>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto currentThreadWaitFor(
-                    dev::DevCudaRt const & dev)
-                -> void
-                {
-                    ALPAKA_DEBUG_FULL_LOG_SCOPE;
-
-                    // Set the current device to wait for.
-                    ALPAKA_CUDA_RT_CHECK(cudaSetDevice(
-                        dev.m_iDevice));
-                    ALPAKA_CUDA_RT_CHECK(cudaDeviceSynchronize());
-                }
-            };
-        }
-    }
-    namespace queue
-    {
-        namespace traits
-        {
-            template<>
-            struct QueueType<
-                dev::DevCudaRt,
-                queue::Blocking
-            >
-            {
-                using type = queue::QueueCudaRtBlocking;
-            };
-
-            template<>
-            struct QueueType<
-                dev::DevCudaRt,
-                queue::NonBlocking
-            >
-            {
-                using type = queue::QueueCudaRtNonBlocking;
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/dev/DevHipRt.hpp b/thirdParty/cupla/alpaka/include/alpaka/dev/DevHipRt.hpp
deleted file mode 100644
index d922626654..0000000000
--- a/thirdParty/cupla/alpaka/include/alpaka/dev/DevHipRt.hpp
+++ /dev/null
@@ -1,291 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz, Matthias Werner
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_HIP
-    #error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
-#endif
-
-#include <alpaka/dev/Traits.hpp>
-#include <alpaka/mem/buf/Traits.hpp>
-#include <alpaka/pltf/Traits.hpp>
-#include <alpaka/wait/Traits.hpp>
-
-#include <alpaka/queue/Traits.hpp>
-#include <alpaka/queue/Properties.hpp>
-
-#include <alpaka/core/Hip.hpp>
-
-namespace alpaka
-{
-    namespace pltf
-    {
-        namespace traits
-        {
-            template<
-                typename TPltf,
-                typename TSfinae>
-            struct GetDevByIdx;
-        }
-        class PltfHipRt;
-    }
-
-    namespace queue
-    {
-        class QueueHipRtBlocking;
-        class QueueHipRtNonBlocking;
-    }
-
-    namespace dev
-    {
-        //#############################################################################
-        //! The HIP RT device handle.
-        class DevHipRt : public concepts::Implements<wait::ConceptCurrentThreadWaitFor, DevHipRt>
-        {
-            friend struct pltf::traits::GetDevByIdx<pltf::PltfHipRt>;
-
-        protected:
-            //-----------------------------------------------------------------------------
-            DevHipRt() = default;
-        public:
-            //-----------------------------------------------------------------------------
-            DevHipRt(DevHipRt const &) = default;
-            //-----------------------------------------------------------------------------
-            DevHipRt(DevHipRt &&) = default;
-            //-----------------------------------------------------------------------------
-            auto operator=(DevHipRt const &) -> DevHipRt & = default;
-            //-----------------------------------------------------------------------------
-            auto operator=(DevHipRt &&) -> DevHipRt & = default;
-            //-----------------------------------------------------------------------------
-            ALPAKA_FN_HOST auto operator==(DevHipRt const & rhs) const
-            -> bool
-            {
-                return m_iDevice == rhs.m_iDevice;
-            }
-            //-----------------------------------------------------------------------------
-            ALPAKA_FN_HOST auto operator!=(DevHipRt const & rhs) const
-            -> bool
-            {
-                return !((*this) == rhs);
-            }
-            //-----------------------------------------------------------------------------
-            ALPAKA_FN_HOST_ACC ~DevHipRt() = default;
-
-        public:
-            int m_iDevice;
-        };
-    }
-
-    namespace dev
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The HIP RT device name get trait specialization.
-            template<>
-            struct GetName<
-                dev::DevHipRt>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto getName(
-                    dev::DevHipRt const & dev)
-                -> std::string
-                {
-                    hipDeviceProp_t hipDevProp;
-                    ALPAKA_HIP_RT_CHECK(
-                        hipGetDeviceProperties(
-                            &hipDevProp,
-                            dev.m_iDevice));
-
-                    return std::string(hipDevProp.name);
-                }
-            };
-
-            //#############################################################################
-            //! The HIP RT device available memory get trait specialization.
-            template<>
-            struct GetMemBytes<
-                dev::DevHipRt>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto getMemBytes(
-                    dev::DevHipRt const & dev)
-                -> std::size_t
-                {
-                    // Set the current device to wait for.
-                    ALPAKA_HIP_RT_CHECK(
-                        hipSetDevice(
-                            dev.m_iDevice));
-
-                    std::size_t freeInternal(0u);
-                    std::size_t totalInternal(0u);
-
-                    // \TODO: Check which is faster: hipMemGetInfo().totalInternal vs hipGetDeviceProperties().totalGlobalMem
-                    ALPAKA_HIP_RT_CHECK(
-                        hipMemGetInfo(
-                            &freeInternal,
-                            &totalInternal));
-
-                    return totalInternal;
-                }
-            };
-
-            //#############################################################################
-            //! The HIP RT device free memory get trait specialization.
-            template<>
-            struct GetFreeMemBytes<
-                dev::DevHipRt>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto getFreeMemBytes(
-                    dev::DevHipRt const & dev)
-                -> std::size_t
-                {
-                    // Set the current device to wait for.
-                    ALPAKA_HIP_RT_CHECK(
-                        hipSetDevice(
-                            dev.m_iDevice));
-
-                    std::size_t freeInternal(0u);
-                    std::size_t totalInternal(0u);
-
-                    ALPAKA_HIP_RT_CHECK(
-                        hipMemGetInfo(
-                            &freeInternal,
-                            &totalInternal));
-
-                    return freeInternal;
-                }
-            };
-
-            //#############################################################################
-            //! The HIP RT device reset trait specialization.
-            template<>
-            struct Reset<
-                dev::DevHipRt>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto reset(
-                    dev::DevHipRt const & dev)
-                -> void
-                {
-                    ALPAKA_DEBUG_FULL_LOG_SCOPE;
-
-                    // Set the current device to wait for.
-                    ALPAKA_HIP_RT_CHECK(
-                        hipSetDevice(
-                            dev.m_iDevice));
-                    ALPAKA_HIP_RT_CHECK(
-                        hipDeviceReset());
-                }
-            };
-        }
-    }
-    namespace mem
-    {
-        namespace buf
-        {
-            template<
-                typename TElem,
-                typename TDim,
-                typename TIdx>
-            class BufHipRt;
-
-            namespace traits
-            {
-                //#############################################################################
-                //! The HIP RT device memory buffer type trait specialization.
-                template<
-                    typename TElem,
-                    typename TDim,
-                    typename TIdx>
-                struct BufType<
-                    dev::DevHipRt,
-                    TElem,
-                    TDim,
-                    TIdx>
-                {
-                    using type = mem::buf::BufHipRt<TElem, TDim, TIdx>;
-                };
-            }
-        }
-    }
-    namespace pltf
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The HIP RT device platform type trait specialization.
-            template<>
-            struct PltfType<
-                dev::DevHipRt>
-            {
-                using type = pltf::PltfHipRt;
-            };
-        }
-    }
-    namespace wait
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The thread HIP device wait specialization.
-            //!
-            //! Blocks until the device has completed all preceding requested tasks.
-            //! Tasks that are enqueued or queues that are created after this call is made are not waited for.
-            template<>
-            struct CurrentThreadWaitFor<
-                dev::DevHipRt>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto currentThreadWaitFor(
-                    dev::DevHipRt const & dev)
-                -> void
-                {
-                    ALPAKA_DEBUG_FULL_LOG_SCOPE;
-
-                    // Set the current device to wait for.
-                    ALPAKA_HIP_RT_CHECK(hipSetDevice(
-                        dev.m_iDevice));
-                    ALPAKA_HIP_RT_CHECK(hipDeviceSynchronize());
-                }
-            };
-        }
-    }
-    namespace queue
-    {
-        namespace traits
-        {
-            template<>
-            struct QueueType<
-                dev::DevHipRt,
-                queue::Blocking
-            >
-            {
-                using type = queue::QueueHipRtBlocking;
-            };
-
-            template<>
-            struct QueueType<
-                dev::DevHipRt,
-                queue::NonBlocking
-            >
-            {
-                using type = queue::QueueHipRtNonBlocking;
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/dev/DevOacc.hpp b/thirdParty/cupla/alpaka/include/alpaka/dev/DevOacc.hpp
new file mode 100644
index 0000000000..6cefd928c2
--- /dev/null
+++ b/thirdParty/cupla/alpaka/include/alpaka/dev/DevOacc.hpp
@@ -0,0 +1,309 @@
+/* Copyright 2019 Benjamin Worpitz
+ *
+ * This file is part of Alpaka.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#pragma once
+
+#ifdef ALPAKA_ACC_ANY_BT_OACC_ENABLED
+
+#    if _OPENACC < 201306
+#        error If ALPAKA_ACC_ANY_BT_OACC_ENABLED is set, the compiler has to support OpenACC 2.0 or higher!
+#    endif
+
+#    include <alpaka/dev/Traits.hpp>
+#    include <alpaka/mem/buf/Traits.hpp>
+#    include <alpaka/pltf/Traits.hpp>
+#    include <alpaka/queue/Properties.hpp>
+#    include <alpaka/queue/QueueGenericThreadsBlocking.hpp>
+#    include <alpaka/queue/QueueGenericThreadsNonBlocking.hpp>
+#    include <alpaka/queue/Traits.hpp>
+#    include <alpaka/queue/cpu/IGenericThreadsQueue.hpp>
+#    include <alpaka/wait/Traits.hpp>
+
+#    include <openacc.h>
+
+namespace alpaka
+{
+    class DevOacc;
+
+    namespace traits
+    {
+        template<typename TPltf, typename TSfinae>
+        struct GetDevByIdx;
+    }
+    class PltfOacc;
+
+    namespace oacc
+    {
+        namespace detail
+        {
+            //#############################################################################
+            //! The OpenACC device implementation.
+            class DevOaccImpl
+            {
+            public:
+                //! Binds the device index and the OpenACC device type current at construction. Explicit: no int conversion.
+                explicit DevOaccImpl(int iDevice) noexcept : m_deviceType(::acc_get_device_type()), m_iDevice(iDevice)
+                {
+                }
+                //-----------------------------------------------------------------------------
+                DevOaccImpl(DevOaccImpl const&) = delete;
+                //-----------------------------------------------------------------------------
+                DevOaccImpl(DevOaccImpl&&) = delete;
+                //-----------------------------------------------------------------------------
+                auto operator=(DevOaccImpl const&) -> DevOaccImpl& = delete;
+                //-----------------------------------------------------------------------------
+                auto operator=(DevOaccImpl&&) -> DevOaccImpl& = delete;
+                //-----------------------------------------------------------------------------
+                ~DevOaccImpl() = default;
+
+                //-----------------------------------------------------------------------------
+                ALPAKA_FN_HOST auto getAllExistingQueues() const
+                    -> std::vector<std::shared_ptr<IGenericThreadsQueue<DevOacc>>>
+                {
+                    std::vector<std::shared_ptr<IGenericThreadsQueue<DevOacc>>> liveQueues;
+                    // The lock covers the whole scan because expired entries are erased from m_queues on the way.
+                    std::lock_guard<std::mutex> guard(m_Mutex);
+                    liveQueues.reserve(m_queues.size());
+
+                    auto pos = m_queues.begin();
+                    while(pos != m_queues.end())
+                    {
+                        if(auto queue = pos->lock())
+                        {
+                            liveQueues.emplace_back(std::move(queue));
+                            ++pos;
+                        }
+                        else
+                        {
+                            pos = m_queues.erase(pos); // expired weak reference: prune it
+                        }
+                    }
+                    return liveQueues;
+                }
+
+                //-----------------------------------------------------------------------------
+                //! Registers the given queue on this device.
+                //! NOTE: Every queue has to be registered for correct functionality of device wait operations!
+                ALPAKA_FN_HOST auto registerQueue(std::shared_ptr<IGenericThreadsQueue<DevOacc>> spQueue) -> void
+                {
+                    std::lock_guard<std::mutex> lk(m_Mutex);
+
+                    // Stored as a weak_ptr (see m_queues), so registering does not extend the queue's lifetime.
+                    m_queues.push_back(std::move(spQueue));
+                }
+
+                int iDevice() const
+                {
+                    return m_iDevice;
+                }
+                acc_device_t deviceType() const
+                {
+                    return m_deviceType;
+                }
+
+            private:
+                std::mutex mutable m_Mutex;
+                std::vector<std::weak_ptr<IGenericThreadsQueue<DevOacc>>> mutable m_queues;
+                acc_device_t m_deviceType;
+                int m_iDevice;
+            };
+        } // namespace detail
+    } // namespace oacc
+    //#############################################################################
+    //! The OpenACC device handle.
+    class DevOacc
+        : public concepts::Implements<ConceptCurrentThreadWaitFor, DevOacc>
+        , public concepts::Implements<ConceptDev, DevOacc>
+    {
+        friend struct traits::GetDevByIdx<PltfOacc>;
+
+    protected:
+        //-----------------------------------------------------------------------------
+        DevOacc(int iDevice) : m_spDevOaccImpl(std::make_shared<oacc::detail::DevOaccImpl>(iDevice))
+        {
+        }
+
+    public:
+        //-----------------------------------------------------------------------------
+        DevOacc(DevOacc const&) = default;
+        //-----------------------------------------------------------------------------
+        DevOacc(DevOacc&&) = default;
+        //-----------------------------------------------------------------------------
+        auto operator=(DevOacc const&) -> DevOacc& = default;
+        //-----------------------------------------------------------------------------
+        auto operator=(DevOacc&&) -> DevOacc& = default;
+        //-----------------------------------------------------------------------------
+        ALPAKA_FN_HOST auto operator==(DevOacc const& rhs) const -> bool
+        {
+            return m_spDevOaccImpl->iDevice() == rhs.m_spDevOaccImpl->iDevice();
+        }
+        //-----------------------------------------------------------------------------
+        ALPAKA_FN_HOST auto operator!=(DevOacc const& rhs) const -> bool
+        {
+            return !((*this) == rhs);
+        }
+        //-----------------------------------------------------------------------------
+        ~DevOacc() = default;
+        int iDevice() const
+        {
+            return m_spDevOaccImpl->iDevice();
+        }
+        acc_device_t deviceType() const
+        {
+            return m_spDevOaccImpl->deviceType();
+        }
+        void makeCurrent() const
+        {
+#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
+            std::cout << "acc_set_device_num( " << m_spDevOaccImpl->iDevice() << ", [type] )" << std::endl;
+#    endif
+            acc_set_device_num(m_spDevOaccImpl->iDevice(), m_spDevOaccImpl->deviceType());
+        }
+
+        ALPAKA_FN_HOST auto getAllQueues() const -> std::vector<std::shared_ptr<IGenericThreadsQueue<DevOacc>>>
+        {
+            return m_spDevOaccImpl->getAllExistingQueues();
+        }
+
+        //-----------------------------------------------------------------------------
+        //! Registers the given queue on this device.
+        //! NOTE: Every queue has to be registered for correct functionality of device wait operations!
+        ALPAKA_FN_HOST auto registerQueue(std::shared_ptr<IGenericThreadsQueue<DevOacc>> spQueue) const -> void
+        {
+            m_spDevOaccImpl->registerQueue(std::move(spQueue)); // sink argument: move it on, do not re-copy
+        }
+
+    public:
+        std::shared_ptr<oacc::detail::DevOaccImpl> m_spDevOaccImpl;
+    };
+
+    namespace traits
+    {
+        //#############################################################################
+        //! The OpenACC device name get trait specialization.
+        template<>
+        struct GetName<DevOacc>
+        {
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto getName(DevOacc const&) -> std::string
+            {
+                return std::string("OpenACC target");
+            }
+        };
+
+        //#############################################################################
+        //! The OpenACC device available memory get trait specialization.
+        template<>
+        struct GetMemBytes<DevOacc>
+        {
+            //! NOTE(review): acc_get_property seems to need OpenACC 2.6; this header only checks for 2.0 - confirm.
+            ALPAKA_FN_HOST static auto getMemBytes(DevOacc const& dev) -> std::size_t
+            {
+                return acc_get_property(dev.iDevice(), dev.deviceType(), acc_property_memory);
+            }
+        };
+
+        //#############################################################################
+        //! The OpenACC device free memory get trait specialization.
+        template<>
+        struct GetFreeMemBytes<DevOacc>
+        {
+            //! NOTE(review): acc_get_property seems to need OpenACC 2.6; this header only checks for 2.0 - confirm.
+            ALPAKA_FN_HOST static auto getFreeMemBytes(DevOacc const& dev) -> std::size_t
+            {
+                return acc_get_property(dev.iDevice(), dev.deviceType(), acc_property_free_memory);
+            }
+        };
+
+        //#############################################################################
+        //! The OpenACC device warp size get trait specialization.
+        template<>
+        struct GetWarpSize<DevOacc>
+        {
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto getWarpSize(DevOacc const& dev) -> std::size_t
+            {
+                alpaka::ignore_unused(dev);
+
+                return 1u;
+            }
+        };
+
+        //#############################################################################
+        //! The OpenACC device reset trait specialization.
+        template<>
+        struct Reset<DevOacc>
+        {
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto reset(DevOacc const& dev) -> void
+            {
+                alpaka::ignore_unused(dev); //! \TODO
+            }
+        };
+    } // namespace traits
+
+    template<typename TElem, typename TDim, typename TIdx>
+    class BufOacc;
+
+    namespace traits
+    {
+        //#############################################################################
+        //! The OpenACC device memory buffer type trait specialization.
+        template<typename TElem, typename TDim, typename TIdx>
+        struct BufType<DevOacc, TElem, TDim, TIdx>
+        {
+            using type = BufOacc<TElem, TDim, TIdx>;
+        };
+
+        //#############################################################################
+        //! The OpenACC device platform type trait specialization.
+        template<>
+        struct PltfType<DevOacc>
+        {
+            using type = PltfOacc;
+        };
+    } // namespace traits
+
+    using QueueOaccNonBlocking = QueueGenericThreadsNonBlocking<DevOacc>;
+    using QueueOaccBlocking = QueueGenericThreadsBlocking<DevOacc>;
+
+    namespace traits
+    {
+        template<>
+        struct QueueType<DevOacc, Blocking>
+        {
+            using type = QueueOaccBlocking;
+        };
+
+        template<>
+        struct QueueType<DevOacc, NonBlocking>
+        {
+            using type = QueueOaccNonBlocking;
+        };
+
+        //#############################################################################
+        //! The thread OpenACC device wait specialization.
+        //!
+        //! Blocks until the device has completed all preceding requested tasks.
+        //! Tasks that are enqueued or queues that are created after this call is made are not waited for.
+        template<>
+        struct CurrentThreadWaitFor<DevOacc>
+        {
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto currentThreadWaitFor(DevOacc const& dev) -> void
+            {
+                ALPAKA_DEBUG_FULL_LOG_SCOPE;
+
+                generic::currentThreadWaitForDevice(dev);
+            }
+        };
+    } // namespace traits
+} // namespace alpaka
+
+#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/dev/DevOmp5.hpp b/thirdParty/cupla/alpaka/include/alpaka/dev/DevOmp5.hpp
new file mode 100644
index 0000000000..65529ddd2c
--- /dev/null
+++ b/thirdParty/cupla/alpaka/include/alpaka/dev/DevOmp5.hpp
@@ -0,0 +1,297 @@
+/* Copyright 2019 Benjamin Worpitz
+ *
+ * This file is part of Alpaka.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#pragma once
+
+#ifdef ALPAKA_ACC_ANY_BT_OMP5_ENABLED
+
+#    if _OPENMP < 201307
+#        error If ALPAKA_ACC_ANY_BT_OMP5_ENABLED is set, the compiler has to support OpenMP 4.0 or higher!
+#    endif
+
+#    include <alpaka/core/Omp5.hpp>
+#    include <alpaka/dev/Traits.hpp>
+#    include <alpaka/mem/buf/Traits.hpp>
+#    include <alpaka/pltf/Traits.hpp>
+#    include <alpaka/queue/Properties.hpp>
+#    include <alpaka/queue/QueueGenericThreadsBlocking.hpp>
+#    include <alpaka/queue/QueueGenericThreadsNonBlocking.hpp>
+#    include <alpaka/queue/Traits.hpp>
+#    include <alpaka/queue/cpu/IGenericThreadsQueue.hpp>
+#    include <alpaka/wait/Traits.hpp>
+
+namespace alpaka
+{
+    class DevOmp5;
+    namespace traits
+    {
+        template<typename TPltf, typename TSfinae>
+        struct GetDevByIdx;
+    }
+    class PltfOmp5;
+
+    namespace omp5
+    {
+        namespace detail
+        {
+            //#############################################################################
+            //! The Omp5 device implementation.
+            class DevOmp5Impl
+            {
+            public:
+                //! Stores the device index. Explicit, so a plain int never converts to DevOmp5Impl implicitly.
+                explicit DevOmp5Impl(int iDevice) noexcept : m_iDevice(iDevice)
+                {
+                }
+                //-----------------------------------------------------------------------------
+                DevOmp5Impl(DevOmp5Impl const&) = delete;
+                //-----------------------------------------------------------------------------
+                DevOmp5Impl(DevOmp5Impl&&) = delete;
+                //-----------------------------------------------------------------------------
+                auto operator=(DevOmp5Impl const&) -> DevOmp5Impl& = delete;
+                //-----------------------------------------------------------------------------
+                auto operator=(DevOmp5Impl&&) -> DevOmp5Impl& = delete;
+                //-----------------------------------------------------------------------------
+                ~DevOmp5Impl() = default;
+
+                //-----------------------------------------------------------------------------
+                ALPAKA_FN_HOST auto getAllExistingQueues() const
+                    -> std::vector<std::shared_ptr<IGenericThreadsQueue<DevOmp5>>>
+                {
+                    std::vector<std::shared_ptr<IGenericThreadsQueue<DevOmp5>>> liveQueues;
+                    // The lock covers the whole scan because expired entries are erased from m_queues on the way.
+                    std::lock_guard<std::mutex> guard(m_Mutex);
+                    liveQueues.reserve(m_queues.size());
+
+                    auto pos = m_queues.begin();
+                    while(pos != m_queues.end())
+                    {
+                        if(auto queue = pos->lock())
+                        {
+                            liveQueues.emplace_back(std::move(queue));
+                            ++pos;
+                        }
+                        else
+                        {
+                            pos = m_queues.erase(pos); // expired weak reference: prune it
+                        }
+                    }
+                    return liveQueues;
+                }
+
+                //-----------------------------------------------------------------------------
+                //! Registers the given queue on this device.
+                //! NOTE: Every queue has to be registered for correct functionality of device wait operations!
+                ALPAKA_FN_HOST auto registerQueue(std::shared_ptr<IGenericThreadsQueue<DevOmp5>> spQueue) -> void
+                {
+                    std::lock_guard<std::mutex> lk(m_Mutex);
+
+                    // Stored as a weak_ptr (see m_queues), so registering does not extend the queue's lifetime.
+                    m_queues.push_back(spQueue);
+                }
+
+                int iDevice() const
+                {
+                    return m_iDevice;
+                }
+
+            private:
+                std::mutex mutable m_Mutex;
+                std::vector<std::weak_ptr<IGenericThreadsQueue<DevOmp5>>> mutable m_queues;
+                const int m_iDevice;
+            };
+        } // namespace detail
+    } // namespace omp5
+    //#############################################################################
+    //! The Omp5 device handle.
+    class DevOmp5
+        : public concepts::Implements<ConceptCurrentThreadWaitFor, DevOmp5>
+        , public concepts::Implements<ConceptDev, DevOmp5>
+    {
+        friend struct traits::GetDevByIdx<PltfOmp5>;
+
+        //-----------------------------------------------------------------------------
+        DevOmp5(int iDevice) : m_spDevOmp5Impl(std::make_shared<omp5::detail::DevOmp5Impl>(iDevice))
+        {
+        }
+
+    public:
+        //-----------------------------------------------------------------------------
+        DevOmp5(DevOmp5 const&) = default;
+        //-----------------------------------------------------------------------------
+        DevOmp5(DevOmp5&&) = default;
+        //-----------------------------------------------------------------------------
+        auto operator=(DevOmp5 const&) -> DevOmp5& = default;
+        //-----------------------------------------------------------------------------
+        auto operator=(DevOmp5&&) -> DevOmp5& = default;
+        //-----------------------------------------------------------------------------
+        ALPAKA_FN_HOST auto operator==(DevOmp5 const& rhs) const -> bool
+        {
+            return m_spDevOmp5Impl->iDevice() == rhs.m_spDevOmp5Impl->iDevice();
+        }
+        //-----------------------------------------------------------------------------
+        ALPAKA_FN_HOST auto operator!=(DevOmp5 const& rhs) const -> bool
+        {
+            return !((*this) == rhs);
+        }
+        //-----------------------------------------------------------------------------
+        ~DevOmp5() = default;
+        int iDevice() const
+        {
+            return m_spDevOmp5Impl->iDevice();
+        }
+
+        ALPAKA_FN_HOST auto getAllQueues() const -> std::vector<std::shared_ptr<IGenericThreadsQueue<DevOmp5>>>
+        {
+            return m_spDevOmp5Impl->getAllExistingQueues();
+        }
+
+        //-----------------------------------------------------------------------------
+        //! Registers the given queue on this device.
+        //! NOTE: Every queue has to be registered for correct functionality of device wait operations!
+        ALPAKA_FN_HOST auto registerQueue(std::shared_ptr<IGenericThreadsQueue<DevOmp5>> spQueue) const -> void
+        {
+            m_spDevOmp5Impl->registerQueue(std::move(spQueue)); // sink argument: move it on, do not re-copy
+        }
+
+    public:
+        std::shared_ptr<omp5::detail::DevOmp5Impl> m_spDevOmp5Impl;
+    };
+
+    namespace traits
+    {
+        //#############################################################################
+        //! The OpenMP 5.0 device name get trait specialization.
+        template<>
+        struct GetName<DevOmp5>
+        {
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto getName(DevOmp5 const&) -> std::string
+            {
+                return std::string("OMP5 target");
+            }
+        };
+
+        //#############################################################################
+        //! The OpenMP 5.0 device available memory get trait specialization.
+        //!
+        //! Returns 0, because querying target mem is not supported by OpenMP
+        template<>
+        struct GetMemBytes<DevOmp5>
+        {
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto getMemBytes(DevOmp5 const& dev) -> std::size_t
+            {
+                alpaka::ignore_unused(dev); //! \todo query device .. somehow
+
+                return 0u;
+            }
+        };
+
+        //#############################################################################
+        //! The OpenMP 5.0 device free memory get trait specialization.
+        //!
+        //! Returns 0, because querying free target mem is not supported by OpenMP
+        template<>
+        struct GetFreeMemBytes<DevOmp5>
+        {
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto getFreeMemBytes(DevOmp5 const& dev) -> std::size_t
+            {
+                alpaka::ignore_unused(dev);
+
+                return 0u;
+            }
+        };
+
+        //#############################################################################
+        //! The OpenMP 5.0 device warp size get trait specialization.
+        template<>
+        struct GetWarpSize<DevOmp5>
+        {
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto getWarpSize(DevOmp5 const& dev) -> std::size_t
+            {
+                alpaka::ignore_unused(dev);
+
+                return 1u;
+            }
+        };
+
+        //#############################################################################
+        //! The OpenMP 5.0 device reset trait specialization.
+        template<>
+        struct Reset<DevOmp5>
+        {
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto reset(DevOmp5 const& dev) -> void
+            {
+                alpaka::ignore_unused(dev); //! \TODO
+            }
+        };
+    } // namespace traits
+
+    template<typename TElem, typename TDim, typename TIdx>
+    class BufOmp5;
+
+    namespace traits
+    {
+        //#############################################################################
+        //! The OpenMP 5.0 device memory buffer type trait specialization.
+        template<typename TElem, typename TDim, typename TIdx>
+        struct BufType<DevOmp5, TElem, TDim, TIdx>
+        {
+            using type = BufOmp5<TElem, TDim, TIdx>;
+        };
+
+        //#############################################################################
+        //! The OpenMP 5.0 device platform type trait specialization.
+        template<>
+        struct PltfType<DevOmp5>
+        {
+            using type = PltfOmp5;
+        };
+    } // namespace traits
+    using QueueOmp5NonBlocking = QueueGenericThreadsNonBlocking<DevOmp5>;
+    using QueueOmp5Blocking = QueueGenericThreadsBlocking<DevOmp5>;
+
+    namespace traits
+    {
+        template<>
+        struct QueueType<DevOmp5, Blocking>
+        {
+            using type = QueueOmp5Blocking;
+        };
+
+        template<>
+        struct QueueType<DevOmp5, NonBlocking>
+        {
+            using type = QueueOmp5NonBlocking;
+        };
+
+        //#############################################################################
+        //! The thread Omp5 device wait specialization.
+        //!
+        //! Blocks until the device has completed all preceding requested tasks.
+        //! Tasks that are enqueued or queues that are created after this call is made are not waited for.
+        template<>
+        struct CurrentThreadWaitFor<DevOmp5>
+        {
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto currentThreadWaitFor(DevOmp5 const& dev) -> void
+            {
+                ALPAKA_DEBUG_FULL_LOG_SCOPE;
+
+                generic::currentThreadWaitForDevice(dev);
+            }
+        };
+    } // namespace traits
+} // namespace alpaka
+
+#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/dev/DevUniformCudaHipRt.hpp b/thirdParty/cupla/alpaka/include/alpaka/dev/DevUniformCudaHipRt.hpp
new file mode 100644
index 0000000000..6dd4b8f126
--- /dev/null
+++ b/thirdParty/cupla/alpaka/include/alpaka/dev/DevUniformCudaHipRt.hpp
@@ -0,0 +1,247 @@
+/* Copyright 2019 Benjamin Worpitz
+ *
+ * This file is part of alpaka.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#pragma once
+
+#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
+
+#    include <alpaka/core/BoostPredef.hpp>
+
+#    if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && !BOOST_LANG_CUDA
+#        error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
+#    endif
+
+#    if defined(ALPAKA_ACC_GPU_HIP_ENABLED) && !BOOST_LANG_HIP
+#        error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
+#    endif
+
+#    include <alpaka/dev/Traits.hpp>
+#    include <alpaka/mem/buf/Traits.hpp>
+#    include <alpaka/pltf/Traits.hpp>
+#    include <alpaka/queue/Properties.hpp>
+#    include <alpaka/queue/Traits.hpp>
+#    include <alpaka/wait/Traits.hpp>
+
+// Backend specific includes.
+#    if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
+#        include <alpaka/core/Cuda.hpp>
+#    else
+#        include <alpaka/core/Hip.hpp>
+#    endif
+
+namespace alpaka
+{
+    namespace traits
+    {
+        template<typename TPltf, typename TSfinae>
+        struct GetDevByIdx;
+    }
+    class PltfUniformCudaHipRt;
+    class QueueUniformCudaHipRtBlocking;
+    class QueueUniformCudaHipRtNonBlocking;
+
+    //#############################################################################
+    //! The CUDA/HIP RT device handle: a copyable wrapper around an integer device index.
+    class DevUniformCudaHipRt
+        : public concepts::Implements<ConceptCurrentThreadWaitFor, DevUniformCudaHipRt>
+        , public concepts::Implements<ConceptDev, DevUniformCudaHipRt>
+    {
+        friend struct traits::GetDevByIdx<PltfUniformCudaHipRt>; // grants access to the protected constructor
+
+    protected:
+        //-----------------------------------------------------------------------------
+        DevUniformCudaHipRt() = default; // NOTE: m_iDevice has no default member initializer
+
+    public:
+        //-----------------------------------------------------------------------------
+        DevUniformCudaHipRt(DevUniformCudaHipRt const&) = default;
+        //-----------------------------------------------------------------------------
+        DevUniformCudaHipRt(DevUniformCudaHipRt&&) = default;
+        //-----------------------------------------------------------------------------
+        auto operator=(DevUniformCudaHipRt const&) -> DevUniformCudaHipRt& = default;
+        //-----------------------------------------------------------------------------
+        auto operator=(DevUniformCudaHipRt&&) -> DevUniformCudaHipRt& = default;
+        //-----------------------------------------------------------------------------
+        ALPAKA_FN_HOST auto operator==(DevUniformCudaHipRt const& rhs) const -> bool
+        {
+            return m_iDevice == rhs.m_iDevice; // equality is decided by the device index alone
+        }
+        //-----------------------------------------------------------------------------
+        ALPAKA_FN_HOST auto operator!=(DevUniformCudaHipRt const& rhs) const -> bool
+        {
+            return !((*this) == rhs);
+        }
+        //-----------------------------------------------------------------------------
+        ~DevUniformCudaHipRt() = default;
+
+    public:
+        int m_iDevice; //!< The device index handed to the runtime API (SetDevice, GetDeviceProperties).
+    };
+
+#    if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
+    using DevCudaRt = DevUniformCudaHipRt;
+#    else
+    using DevHipRt = DevUniformCudaHipRt;
+#    endif
+
+
+    namespace traits
+    {
+        //#############################################################################
+        //! The CUDA/HIP RT device name get trait specialization.
+        template<>
+        struct GetName<DevUniformCudaHipRt>
+        {
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto getName(DevUniformCudaHipRt const& dev) -> std::string
+            {
+                // The whole property struct is fetched because the faster single-attribute alternative
+                // (cuda/hip-DeviceGetAttribute) has no option to get the name.
+#    ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
+                cudaDeviceProp devProp;
+#    else
+                hipDeviceProp_t devProp;
+#    endif
+                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(ALPAKA_API_PREFIX(GetDeviceProperties)(&devProp, dev.m_iDevice));
+
+                return std::string(devProp.name); // copy the name out of the local property struct
+            }
+        };
+
+        //#############################################################################
+        //! The CUDA/HIP RT device total memory get trait specialization.
+        template<>
+        struct GetMemBytes<DevUniformCudaHipRt>
+        {
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto getMemBytes(DevUniformCudaHipRt const& dev) -> std::size_t
+            {
+                // Make dev the current device (not restored afterwards): MemGetInfo takes no device argument.
+                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(ALPAKA_API_PREFIX(SetDevice)(dev.m_iDevice));
+
+                std::size_t freeInternal(0u);
+                std::size_t totalInternal(0u);
+
+                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(ALPAKA_API_PREFIX(MemGetInfo)(&freeInternal, &totalInternal));
+
+                return totalInternal; // the total amount; freeInternal is queried but not used here
+            }
+        };
+
+        //#############################################################################
+        //! The CUDA/HIP RT device free memory get trait specialization.
+        template<>
+        struct GetFreeMemBytes<DevUniformCudaHipRt>
+        {
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto getFreeMemBytes(DevUniformCudaHipRt const& dev) -> std::size_t
+            {
+                // Make dev the current device (not restored afterwards): MemGetInfo takes no device argument.
+                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(ALPAKA_API_PREFIX(SetDevice)(dev.m_iDevice));
+
+                std::size_t freeInternal(0u);
+                std::size_t totalInternal(0u);
+
+                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(ALPAKA_API_PREFIX(MemGetInfo)(&freeInternal, &totalInternal));
+
+                return freeInternal; // the free amount; totalInternal is queried but not used here
+            }
+        };
+
+        //#############################################################################
+        //! The CUDA/HIP RT device warp size get trait specialization.
+        template<>
+        struct GetWarpSize<DevUniformCudaHipRt>
+        {
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto getWarpSize(DevUniformCudaHipRt const& dev) -> std::size_t
+            {
+#    ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
+                cudaDeviceProp devProp;
+#    else
+                hipDeviceProp_t devProp;
+#    endif
+                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(ALPAKA_API_PREFIX(GetDeviceProperties)(&devProp, dev.m_iDevice));
+
+                return static_cast<std::size_t>(devProp.warpSize); // converted to the trait's return type
+            }
+        };
+
+        //#############################################################################
+        //! The CUDA/HIP RT device reset trait specialization.
+        template<>
+        struct Reset<DevUniformCudaHipRt>
+        {
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto reset(DevUniformCudaHipRt const& dev) -> void
+            {
+                ALPAKA_DEBUG_FULL_LOG_SCOPE;
+
+                // Make dev the current device first: DeviceReset takes no device argument.
+                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(ALPAKA_API_PREFIX(SetDevice)(dev.m_iDevice));
+                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(ALPAKA_API_PREFIX(DeviceReset)());
+            }
+        };
+    } // namespace traits
+
+    template<typename TElem, typename TDim, typename TIdx>
+    class BufUniformCudaHipRt;
+
+    namespace traits
+    {
+        //#############################################################################
+        //! The CUDA/HIP RT device memory buffer type trait specialization.
+        template<typename TElem, typename TDim, typename TIdx>
+        struct BufType<DevUniformCudaHipRt, TElem, TDim, TIdx>
+        {
+            using type = BufUniformCudaHipRt<TElem, TDim, TIdx>;
+        };
+
+        //#############################################################################
+        //! The CUDA/HIP RT device platform type trait specialization.
+        template<>
+        struct PltfType<DevUniformCudaHipRt>
+        {
+            using type = PltfUniformCudaHipRt;
+        };
+
+        //#############################################################################
+        //! The current-thread wait trait specialization for the CUDA/HIP RT device.
+        //!
+        //! Blocks until the device has completed all preceding requested tasks.
+        //! Tasks that are enqueued or queues that are created after this call is made are not waited for.
+        template<>
+        struct CurrentThreadWaitFor<DevUniformCudaHipRt>
+        {
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto currentThreadWaitFor(DevUniformCudaHipRt const& dev) -> void
+            {
+                ALPAKA_DEBUG_FULL_LOG_SCOPE;
+
+                // Set the current device to wait for: DeviceSynchronize takes no device argument.
+                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(ALPAKA_API_PREFIX(SetDevice)(dev.m_iDevice));
+                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(ALPAKA_API_PREFIX(DeviceSynchronize)());
+            }
+        };
+
+        template<>
+        struct QueueType<DevUniformCudaHipRt, Blocking>
+        {
+            using type = QueueUniformCudaHipRtBlocking; // only forward-declared in this header
+        };
+
+        template<>
+        struct QueueType<DevUniformCudaHipRt, NonBlocking>
+        {
+            using type = QueueUniformCudaHipRtNonBlocking; // only forward-declared in this header
+        };
+    } // namespace traits
+} // namespace alpaka
+
+#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/dev/Traits.hpp b/thirdParty/cupla/alpaka/include/alpaka/dev/Traits.hpp
index 53930a909a..c7e77a19cb 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/dev/Traits.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/dev/Traits.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Benjamin Worpitz
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -10,143 +10,120 @@
 #pragma once
 
 #include <alpaka/core/Common.hpp>
-
-#include <boost/config.hpp>
+#include <alpaka/core/Concepts.hpp>
 
 namespace alpaka
 {
     //-----------------------------------------------------------------------------
-    //! The device specifics.
-    namespace dev
+    //! The device traits.
+    namespace traits
     {
-        //-----------------------------------------------------------------------------
-        //! The device traits.
-        namespace traits
-        {
-            //#############################################################################
-            //! The device type trait.
-            template<
-                typename T,
-                typename TSfinae = void>
-            struct DevType;
-
-            //#############################################################################
-            //! The device get trait.
-            template<
-                typename T,
-                typename TSfinae = void>
-            struct GetDev;
-
-            //#############################################################################
-            //! The device name get trait.
-            template<
-                typename TDev,
-                typename TSfinae = void>
-            struct GetName;
-
-            //#############################################################################
-            //! The device memory size get trait.
-            template<
-                typename TDev,
-                typename TSfinae = void>
-            struct GetMemBytes;
-
-            //#############################################################################
-            //! The device free memory size get trait.
-            template<
-                typename T,
-                typename TSfinae = void>
-            struct GetFreeMemBytes;
-
-            //#############################################################################
-            //! The device reset trait.
-            template<
-                typename T,
-                typename TSfinae = void>
-            struct Reset;
-        }
+        //#############################################################################
+        //! The device type trait.
+        template<typename T, typename TSfinae = void>
+        struct DevType;
 
         //#############################################################################
-        //! The device type trait alias template to remove the ::type.
-        template<
-            typename T>
-        using Dev = typename traits::DevType<T>::type;
-
-        //-----------------------------------------------------------------------------
-        //! \return The device this object is bound to.
-        template<
-            typename T>
-        ALPAKA_FN_HOST auto getDev(
-            T const & t)
-#ifdef BOOST_NO_CXX14_RETURN_TYPE_DEDUCTION
-        -> decltype(traits::GetDev<T>::getDev(t))
-#endif
-        {
-            return
-                traits::GetDev<
-                    T>
-                ::getDev(
-                    t);
-        }
-
-        //-----------------------------------------------------------------------------
-        //! \return The device name.
-        template<
-            typename TDev>
-        ALPAKA_FN_HOST auto getName(
-            TDev const & dev)
-        -> std::string
-        {
-            return
-                traits::GetName<
-                    TDev>
-                ::getName(
-                    dev);
-        }
-
-        //-----------------------------------------------------------------------------
-        //! \return The memory on the device in Bytes.
-        template<
-            typename TDev>
-        ALPAKA_FN_HOST auto getMemBytes(
-            TDev const & dev)
-        -> std::size_t
-        {
-            return
-                traits::GetMemBytes<
-                    TDev>
-                ::getMemBytes(
-                    dev);
-        }
-
-        //-----------------------------------------------------------------------------
-        //! \return The free memory on the device in Bytes.
-        template<
-            typename TDev>
-        ALPAKA_FN_HOST auto getFreeMemBytes(
-            TDev const & dev)
-        -> std::size_t
-        {
-            return
-                traits::GetFreeMemBytes<
-                    TDev>
-                ::getFreeMemBytes(
-                    dev);
-        }
-
-        //-----------------------------------------------------------------------------
-        //! Resets the device.
-        //! What this method does is dependent on the accelerator.
-        template<
-            typename TDev>
-        ALPAKA_FN_HOST auto reset(
-            TDev const & dev)
-        -> void
-        {
-            traits::Reset<
-                TDev>
-            ::reset(
-                dev);
-        }
+        //! The device get trait.
+        template<typename T, typename TSfinae = void>
+        struct GetDev;
+
+        //#############################################################################
+        //! The device name get trait.
+        template<typename TDev, typename TSfinae = void>
+        struct GetName;
+
+        //#############################################################################
+        //! The device memory size get trait.
+        template<typename TDev, typename TSfinae = void>
+        struct GetMemBytes;
+
+        //#############################################################################
+        //! The device free memory size get trait.
+        template<typename T, typename TSfinae = void>
+        struct GetFreeMemBytes;
+
+        //#############################################################################
+        //! The device warp size get trait.
+        template<typename T, typename TSfinae = void>
+        struct GetWarpSize;
+
+        //#############################################################################
+        //! The device reset trait.
+        template<typename T, typename TSfinae = void>
+        struct Reset;
+    } // namespace traits
+
+    //#############################################################################
+    //! The device type trait alias template to remove the ::type.
+    template<typename T>
+    using Dev = typename traits::DevType<T>::type;
+
+    struct ConceptGetDev;
+
+    struct ConceptDev;
+
+    //-----------------------------------------------------------------------------
+    //! \return The device this object is bound to, looked up via T's ConceptGetDev implementation base.
+    template<typename T>
+    ALPAKA_FN_HOST auto getDev(T const& t)
+    {
+        using ImplementationBase = concepts::ImplementationBase<ConceptGetDev, T>;
+        return traits::GetDev<ImplementationBase>::getDev(t);
     }
-}
+
+    //-----------------------------------------------------------------------------
+    //! \return The name of the given device, as provided by traits::GetName<TDev>.
+    template<typename TDev>
+    ALPAKA_FN_HOST auto getName(TDev const& dev) -> std::string
+    {
+        return traits::GetName<TDev>::getName(dev);
+    }
+
+    //-----------------------------------------------------------------------------
+    //! \return The memory on the device in Bytes. Returns 0 if querying memory
+    //! is not supported.
+    template<typename TDev>
+    ALPAKA_FN_HOST auto getMemBytes(TDev const& dev) -> std::size_t
+    {
+        return traits::GetMemBytes<TDev>::getMemBytes(dev);
+    }
+
+    //-----------------------------------------------------------------------------
+    //! \return The free memory on the device in Bytes.
+    //!
+    //! \note Do not use this query if getMemBytes returned 0.
+    template<typename TDev>
+    ALPAKA_FN_HOST auto getFreeMemBytes(TDev const& dev) -> std::size_t
+    {
+        return traits::GetFreeMemBytes<TDev>::getFreeMemBytes(dev);
+    }
+
+    //-----------------------------------------------------------------------------
+    //! \return The warp size of the device in number of threads, as provided by traits::GetWarpSize<TDev>.
+    template<typename TDev>
+    ALPAKA_FN_HOST auto getWarpSize(TDev const& dev) -> std::size_t
+    {
+        return traits::GetWarpSize<TDev>::getWarpSize(dev);
+    }
+
+    //-----------------------------------------------------------------------------
+    //! Resets the device.
+    //! What this function does depends on the accelerator; it forwards to traits::Reset<TDev>.
+    template<typename TDev>
+    ALPAKA_FN_HOST auto reset(TDev const& dev) -> void
+    {
+        traits::Reset<TDev>::reset(dev);
+    }
+
+    namespace traits
+    {
+        //#############################################################################
+        //! The device type trait specialization for types implementing ConceptDev: their implementation base.
+        template<typename TDev>
+        struct DevType<TDev, typename std::enable_if<concepts::ImplementsConcept<ConceptDev, TDev>::value>::type>
+        {
+            using type = typename concepts::ImplementationBase<ConceptDev, TDev>;
+        };
+    } // namespace traits
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/include/alpaka/dev/cpu/SysInfo.hpp b/thirdParty/cupla/alpaka/include/alpaka/dev/cpu/SysInfo.hpp
index 879edaa0c8..c283953f38 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/dev/cpu/SysInfo.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/dev/cpu/SysInfo.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Benjamin Worpitz, Daniel Vollmer, Erik Zenker, René Widera
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -12,228 +12,249 @@
 #include <alpaka/core/BoostPredef.hpp>
 
 #if BOOST_OS_WINDOWS || BOOST_OS_CYGWIN
-    #ifndef NOMINMAX
-        #define NOMINMAX
-    #endif
-    #ifndef WIN32_LEAN_AND_MEAN
-        #define WIN32_LEAN_AND_MEAN
-    #endif
-    // We could use some more macros to reduce the number of sub-headers included, but this would restrict user code.
-    #include <windows.h>
+#    ifndef NOMINMAX
+#        define NOMINMAX
+#    endif
+#    ifndef WIN32_LEAN_AND_MEAN
+#        define WIN32_LEAN_AND_MEAN
+#    endif
+// We could use some more macros to reduce the number of sub-headers included, but this would restrict user code.
+#    include <windows.h>
 #elif BOOST_OS_UNIX || BOOST_OS_MACOS
-    #include <cstdint>
-    #include <unistd.h>
-    #include <sys/types.h>
-    #include <sys/param.h>
-    #if BOOST_OS_BSD || BOOST_OS_MACOS
-        #include <sys/sysctl.h>
-    #endif
+#    include <sys/param.h>
+#    include <sys/types.h>
+#    include <unistd.h>
+
+#    include <cstdint>
+#    if BOOST_OS_BSD || BOOST_OS_MACOS
+#        include <sys/sysctl.h>
+#    endif
 #endif
 
 #if BOOST_OS_LINUX
-    #include <fstream>
+#    include <fstream>
 #endif
 
-#include <stdexcept>
 #include <cstring>
+#include <stdexcept>
 #include <string>
 
 namespace alpaka
 {
-    namespace dev
+    namespace cpu
     {
-        namespace cpu
+        namespace detail
         {
-            namespace detail
-            {
+            constexpr int NO_CPUID = 0; // stored in ex[0], ex[2] and ex[3] by the cpuid stubs
+            constexpr int UNKNOWN_CPU = 0; // ex[1] marker written by the non-x86 cpuid stub
+            constexpr int UNKNOWN_COMPILER = 1; // ex[1] marker written by the x86 stub for other compilers
 #if BOOST_ARCH_X86
-    #if BOOST_COMP_GNUC || BOOST_COMP_CLANG || (!BOOST_COMP_MSVC_EMULATED && defined(__INTEL_COMPILER))
-        #include <cpuid.h>
-                //-----------------------------------------------------------------------------
-                inline auto cpuid(std::uint32_t const level, std::uint32_t const subfunction, std::uint32_t ex[4])
-                -> void
-                {
-                    __cpuid_count(level, subfunction, ex[0], ex[1], ex[2], ex[3]);
-                }
+#    if BOOST_COMP_GNUC || BOOST_COMP_CLANG || (!BOOST_COMP_MSVC_EMULATED && defined(__INTEL_COMPILER))               \
+        || BOOST_COMP_PGI
+#        include <cpuid.h>
+            //-----------------------------------------------------------------------------
+            inline auto cpuid(std::uint32_t const level, std::uint32_t const subfunction, std::uint32_t ex[4]) -> void
+            {
+                __cpuid_count(level, subfunction, ex[0], ex[1], ex[2], ex[3]); // from <cpuid.h>, fills ex[0..3]
+            }
 
-    #elif BOOST_COMP_MSVC || defined(__INTEL_COMPILER)
-        #include <intrin.h>
-                //-----------------------------------------------------------------------------
-                inline auto cpuid(std::uint32_t const level, std::uint32_t const subfunction, std::uint32_t ex[4])
-                -> void
-                {
-                    __cpuidex(reinterpret_cast<int*>(ex), level, subfunction);
-                }
-    #endif
+#    elif BOOST_COMP_MSVC || defined(BOOST_COMP_MSVC_EMULATED) || defined(__INTEL_COMPILER)
+#        include <intrin.h>
+            //-----------------------------------------------------------------------------
+            inline auto cpuid(std::uint32_t const level, std::uint32_t const subfunction, std::uint32_t ex[4]) -> void
+            {
+                __cpuidex(reinterpret_cast<int*>(ex), level, subfunction); // from <intrin.h>; hence the int* cast
+            }
+#    else
+            //! Stub for x86 compilers without a known cpuid intrinsic: both inputs are ignored, no ids are reported.
+            inline auto cpuid(std::uint32_t const, std::uint32_t const, std::uint32_t ex[4]) -> void
+            {
+                ex[0] = ex[2] = ex[3] = NO_CPUID; // ex[0] == 0 makes getCpuName() return an "<unknown...>" string
+                ex[1] = UNKNOWN_COMPILER; // tells getCpuName() that the compiler was the reason
+            }
+#    endif
+#else
+            inline auto cpuid(std::uint32_t const, std::uint32_t const, std::uint32_t ex[4]) -> void
+            {
+                ex[0] = ex[2] = ex[3] = NO_CPUID; // non-x86 stub: both inputs are ignored, no ids are reported
+                ex[1] = UNKNOWN_CPU; // tells getCpuName() that the architecture was the reason
+            }
 #endif
-                //-----------------------------------------------------------------------------
-                //! \return The name of the CPU the code is running on.
-                inline auto getCpuName()
-                -> std::string
+            //-----------------------------------------------------------------------------
+            //! \return The name of the CPU the code is running on.
+            inline auto getCpuName() -> std::string
+            {
+                // Query the highest extended id; it is returned in ex[0] and bounds the loop below.
+                std::uint32_t ex[4] = {0};
+                cpuid(0x80000000, 0, ex);
+                std::uint32_t const nExIds(ex[0]);
+
+                if(!nExIds) // no extended ids: the cpuid stubs tag the reason in ex[1]
                 {
-#if BOOST_ARCH_X86
-                    // Get extended ids.
-                    std::uint32_t ex[4] = {0};
-                    cpuid(0x80000000, 0, ex);
-                    std::uint32_t const nExIds(ex[0]);
-
-                    // Get the information associated with each extended ID.
-                    char cpuBrandString[0x40] = {0};
-                    for(std::uint32_t i(0x80000000); i<=nExIds; ++i)
+                    switch(ex[1])
                     {
-                        cpuid(i, 0, ex);
+                    case UNKNOWN_COMPILER:
+                        return "<unknown: compiler>";
+                    case UNKNOWN_CPU:
+                        return "<unknown: CPU>";
+                    default:
+                        return "<unknown>";
+                    }
+                }
+#if BOOST_ARCH_X86
+                // Get the information associated with each extended ID.
+                char cpuBrandString[0x40] = {0};
+                for(std::uint32_t i(0x80000000); i <= nExIds; ++i)
+                {
+                    cpuid(i, 0, ex);
 
-                        // Interpret CPU brand string and cache information.
-                        if(i == 0x80000002)
-                        {
-                            std::memcpy(cpuBrandString, ex, sizeof(ex));
-                        }
-                        else if(i == 0x80000003)
-                        {
-                            std::memcpy(cpuBrandString + 16, ex, sizeof(ex));
-                        }
-                        else if(i == 0x80000004)
-                        {
-                            std::memcpy(cpuBrandString + 32, ex, sizeof(ex));
-                        }
+                    // Ids 0x80000002..0x80000004 each supply one 16-byte chunk of the CPU brand string.
+                    if(i == 0x80000002)
+                    {
+                        std::memcpy(cpuBrandString, ex, sizeof(ex));
+                    }
+                    else if(i == 0x80000003)
+                    {
+                        std::memcpy(cpuBrandString + 16, ex, sizeof(ex));
+                    }
+                    else if(i == 0x80000004)
+                    {
+                        std::memcpy(cpuBrandString + 32, ex, sizeof(ex));
                     }
-                    return std::string(cpuBrandString);
+                }
+                return std::string(cpuBrandString);
 #else
-                    return "<unknown>";
+                return std::string("unknown"); // not reached: the non-x86 cpuid stub zeroes ex[0], see above
 #endif
-                }
-                //-----------------------------------------------------------------------------
-                //! \return The frequency of the CPU the code is running on.
-                // TODO: implement!
-                /*inline auto getCpuFrequency()
-                -> std::size_t
-                {
-                    return 0;
-                }*/
-                //-----------------------------------------------------------------------------
-                //! \return The total number of bytes of global memory.
-                //! Adapted from David Robert Nadeau: http://nadeausoftware.com/articles/2012/09/c_c_tip_how_get_physical_memory_size_system
-                inline auto getTotalGlobalMemSizeBytes()
-                -> std::size_t
-                {
+            }
+            //-----------------------------------------------------------------------------
+            //! \return The total number of bytes of global memory.
+            //! Adapted from David Robert Nadeau:
+            //! http://nadeausoftware.com/articles/2012/09/c_c_tip_how_get_physical_memory_size_system
+            inline auto getTotalGlobalMemSizeBytes() -> std::size_t
+            {
 #if BOOST_OS_WINDOWS
-                    MEMORYSTATUSEX status;
-                    status.dwLength = sizeof(status);
-                    GlobalMemoryStatusEx(&status);
-                    return static_cast<std::size_t>(status.ullTotalPhys);
+                MEMORYSTATUSEX status;
+                status.dwLength = sizeof(status);
+                GlobalMemoryStatusEx(&status);
+                return static_cast<std::size_t>(status.ullTotalPhys);
 
 #elif BOOST_OS_CYGWIN
-                    // New 64-bit MEMORYSTATUSEX isn't available.
-                    MEMORYSTATUS status;
-                    status.dwLength = sizeof(status);
-                    GlobalMemoryStatus(&status);
-                    return static_cast<std::size_t>(status.dwTotalPhys);
+                // New 64-bit MEMORYSTATUSEX isn't available.
+                MEMORYSTATUS status;
+                status.dwLength = sizeof(status);
+                GlobalMemoryStatus(&status);
+                return static_cast<std::size_t>(status.dwTotalPhys);
 
 #elif BOOST_OS_UNIX || BOOST_OS_MACOS
-                    // Unix : Prefer sysctl() over sysconf() except sysctl() with HW_REALMEM and HW_PHYSMEM which are not always reliable
-    #if defined(CTL_HW) && (defined(HW_MEMSIZE) || defined(HW_PHYSMEM64))
-                    int mib[2] = {CTL_HW,
-        #if defined(HW_MEMSIZE)                                                 // OSX
+                // Unix : Prefer sysctl() over sysconf() except sysctl() with HW_REALMEM and HW_PHYSMEM which are not
+                // always reliable
+#    if defined(CTL_HW) && (defined(HW_MEMSIZE) || defined(HW_PHYSMEM64))
+                int mib[2]
+                    = { CTL_HW,
+#        if defined(HW_MEMSIZE) // OSX
                         HW_MEMSIZE
-        #elif defined(HW_PHYSMEM64)                                             // NetBSD, OpenBSD.
+#        elif defined(HW_PHYSMEM64) // NetBSD, OpenBSD.
                         HW_PHYSMEM64
-        #endif
-                    };
-                    std::uint64_t size(0);
-                    std::size_t sizeLen{sizeof(size)};
-                    if(sysctl(mib, 2, &size, &sizeLen, nullptr, 0) < 0)
-                    {
-                        throw std::logic_error("getTotalGlobalMemSizeBytes failed calling sysctl!");
-                    }
-                    return static_cast<std::size_t>(size);
+#        endif
+                      };
+                std::uint64_t size(0);
+                std::size_t sizeLen{sizeof(size)};
+                if(sysctl(mib, 2, &size, &sizeLen, nullptr, 0) < 0)
+                {
+                    throw std::logic_error("getTotalGlobalMemSizeBytes failed calling sysctl!");
+                }
+                return static_cast<std::size_t>(size);
 
-    #elif defined(_SC_AIX_REALMEM)                                          // AIX.
-                    return static_cast<std::size_t>(sysconf(_SC_AIX_REALMEM)) * static_cast<std::size_t>(1024);
+#    elif defined(_SC_AIX_REALMEM) // AIX.
+                return static_cast<std::size_t>(sysconf(_SC_AIX_REALMEM)) * static_cast<std::size_t>(1024);
 
-    #elif defined(_SC_PHYS_PAGES) && defined(_SC_PAGESIZE)                  // Linux, FreeBSD, OpenBSD, Solaris.
-                    return static_cast<std::size_t>(sysconf(_SC_PHYS_PAGES)) * static_cast<std::size_t>(sysconf(_SC_PAGESIZE));
+#    elif defined(_SC_PHYS_PAGES) && defined(_SC_PAGESIZE) // Linux, FreeBSD, OpenBSD, Solaris.
+                return static_cast<std::size_t>(sysconf(_SC_PHYS_PAGES))
+                    * static_cast<std::size_t>(sysconf(_SC_PAGESIZE));
 
-    #elif defined(_SC_PHYS_PAGES) && defined(_SC_PAGE_SIZE)                 // Legacy.
-                    return static_cast<std::size_t>(sysconf(_SC_PHYS_PAGES)) * static_cast<std::size_t>(sysconf(_SC_PAGE_SIZE));
+#    elif defined(_SC_PHYS_PAGES) && defined(_SC_PAGE_SIZE) // Legacy.
+                return static_cast<std::size_t>(sysconf(_SC_PHYS_PAGES))
+                    * static_cast<std::size_t>(sysconf(_SC_PAGE_SIZE));
 
-    #elif defined(CTL_HW) && (defined(HW_PHYSMEM) || defined(HW_REALMEM))   // FreeBSD, DragonFly BSD, NetBSD, OpenBSD, and OSX.
-                    int mib[2] = {CTL_HW,
-        #if defined(HW_REALMEM)                                                 // FreeBSD.
+#    elif defined(CTL_HW)                                                                                             \
+        && (defined(HW_PHYSMEM) || defined(HW_REALMEM)) // FreeBSD, DragonFly BSD, NetBSD, OpenBSD, and OSX.
+                int mib[2]
+                    = { CTL_HW,
+#        if defined(HW_REALMEM) // FreeBSD.
                         HW_REALMEM;
-        #elif defined(HW_PYSMEM)                                                // Others.
+#        elif defined(HW_PYSMEM) // Others.
                         HW_PHYSMEM;
-        #endif
-                    };
-                    std::uint32_t size(0);
-                    std::size_t const sizeLen{sizeof(size)};
-                    if(sysctl(mib, 2, &size, &sizeLen, nullptr, 0) < 0)
-                    {
-                        throw std::logic_error("getTotalGlobalMemSizeBytes failed calling sysctl!");
-                    }
-                    return static_cast<std::size_t>(size);
-    #endif
+#        endif
+            };
+            std::uint32_t size(0);
+            std::size_t const sizeLen{sizeof(size)};
+            if(sysctl(mib, 2, &size, &sizeLen, nullptr, 0) < 0)
+            {
+                throw std::logic_error("getTotalGlobalMemSizeBytes failed calling sysctl!");
+            }
+            return static_cast<std::size_t>(size);
+#    endif
 
 #else
-    #error "getTotalGlobalMemSizeBytes not implemented for this system!"
+#    error "getTotalGlobalMemSizeBytes not implemented for this system!"
 #endif
-                }
-                //-----------------------------------------------------------------------------
-                //! \return The free number of bytes of global memory.
-                //! \throws std::logic_error if not implemented on the system and std::runtime_error on other errors.
-                inline auto getFreeGlobalMemSizeBytes()
-                -> std::size_t
-                {
+            }
+            //-----------------------------------------------------------------------------
+            //! \return The number of bytes of global memory that are currently free.
+            //! \throws std::runtime_error if reading '/proc/meminfo' fails, std::logic_error if sysctlbyname fails.
+            inline auto getFreeGlobalMemSizeBytes() -> std::size_t
+            {
 #if BOOST_OS_WINDOWS
-                    MEMORYSTATUSEX status;
-                    status.dwLength = sizeof(status);
-                    GlobalMemoryStatusEx(&status);
-                    return static_cast<std::size_t>(status.ullAvailPhys);
+                MEMORYSTATUSEX status;
+                status.dwLength = sizeof(status);
+                GlobalMemoryStatusEx(&status);
+                return static_cast<std::size_t>(status.ullAvailPhys);
 
 #elif BOOST_OS_LINUX
-                    std::string token;
-                    std::ifstream file("/proc/meminfo");
-                    if(file)
+                std::string token;
+                std::ifstream file("/proc/meminfo");
+                if(file)
+                {
+                    while(file >> token)
                     {
-                        while(file >> token)
+                        if(token == "MemFree:")
                         {
-                            if(token == "MemFree:")
+                            std::size_t freeGlobalMemSizeBytes(0);
+                            if(file >> freeGlobalMemSizeBytes)
+                            {
+                                return freeGlobalMemSizeBytes * size_t(1024);
+                            }
+                            else
                             {
-                                std::size_t freeGlobalMemSizeBytes(0);
-                                if(file >> freeGlobalMemSizeBytes)
-                                {
-                                    return freeGlobalMemSizeBytes * size_t(1024);
-                                }
-                                else
-                                {
-                                    throw std::runtime_error("Unable to read MemFree value!");
-                                }
+                                throw std::runtime_error("Unable to read MemFree value!");
                             }
                         }
-                        throw std::runtime_error("Unable to find MemFree in '/proc/meminfo'!");
-                    }
-                    else
-                    {
-                        throw std::runtime_error("Unable to open '/proc/meminfo'!");
                     }
+                    throw std::runtime_error("Unable to find MemFree in '/proc/meminfo'!");
+                }
+                else
+                {
+                    throw std::runtime_error("Unable to open '/proc/meminfo'!");
+                }
 #elif BOOST_OS_MACOS
-                    int free_pages = 0;
-                    std::size_t len = sizeof(free_pages);
-                    if(sysctlbyname("vm.page_free_count", &free_pages, &len, nullptr, 0) < 0)
-                    {
-                        throw std::logic_error("getFreeGlobalMemSizeBytes failed calling sysctl(vm.page_free_count)!");
-                    }
-                    int page_size = 0;
-                    len = sizeof(page_size);
-                    if(sysctlbyname("vm.pagesize", &page_size, &len, nullptr, 0) < 0)
-                    {
-                        throw std::logic_error("getFreeGlobalMemSizeBytes failed calling sysctl(vm.pagesize)!");
-                    }
-                    return static_cast<std::size_t>(free_pages) * static_cast<std::size_t>(page_size);
+                int free_pages = 0;
+                std::size_t len = sizeof(free_pages);
+                if(sysctlbyname("vm.page_free_count", &free_pages, &len, nullptr, 0) < 0)
+                {
+                    throw std::logic_error("getFreeGlobalMemSizeBytes failed calling sysctl(vm.page_free_count)!");
+                }
+                int page_size = 0;
+                len = sizeof(page_size);
+                if(sysctlbyname("vm.pagesize", &page_size, &len, nullptr, 0) < 0)
+                {
+                    throw std::logic_error("getFreeGlobalMemSizeBytes failed calling sysctl(vm.pagesize)!");
+                }
+                return static_cast<std::size_t>(free_pages) * static_cast<std::size_t>(page_size);
 #else
-    #error "getFreeGlobalMemSizeBytes not implemented for this system!"
+#    error "getFreeGlobalMemSizeBytes not implemented for this system!"
 #endif
-                }
             }
-        }
-    }
-}
+        } // namespace detail
+    } // namespace cpu
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/include/alpaka/dev/cpu/Wait.hpp b/thirdParty/cupla/alpaka/include/alpaka/dev/cpu/Wait.hpp
index 4d95fc30c1..b9c0dc22c1 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/dev/cpu/Wait.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/dev/cpu/Wait.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Benjamin Worpitz, Rene Widera
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -11,62 +11,27 @@
 
 #include <alpaka/dev/DevCpu.hpp>
 #include <alpaka/event/EventCpu.hpp>
-
 #include <alpaka/wait/Traits.hpp>
 
 namespace alpaka
 {
-    namespace wait
+    namespace traits
     {
-        namespace traits
+        //#############################################################################
+        //! The CPU device thread wait specialization.
+        //!
+        //! Blocks until the device has completed all preceding requested tasks.
+        //! Tasks that are enqueued or queues that are created after this call is made are not waited for.
+        template<>
+        struct CurrentThreadWaitFor<DevCpu>
         {
-            namespace detail
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto currentThreadWaitFor(DevCpu const& dev) -> void
             {
-                template<typename TDevice, typename TQueueVector>
-                ALPAKA_FN_HOST auto currentThreadWaitForDevice(
-                    TDevice const & dev, TQueueVector & vQueues
-                )
-                ->void
-                {
-                    // Furthermore there should not even be a chance to enqueue something between getting the queues and adding our wait events!
-                    std::vector<event::EventCpu> vEvents;
-                    for(auto && spQueue : vQueues)
-                    {
-                        vEvents.emplace_back(dev);
-                        spQueue->enqueue(vEvents.back());
-                    }
+                ALPAKA_DEBUG_FULL_LOG_SCOPE;
 
-                    // Now wait for all the events.
-                    for(auto && event : vEvents)
-                    {
-                        wait::wait(event);
-                    }
-                }
+                generic::currentThreadWaitForDevice(dev);
             }
-            //#############################################################################
-            //! The CPU device thread wait specialization.
-            //!
-            //! Blocks until the device has completed all preceding requested tasks.
-            //! Tasks that are enqueued or queues that are created after this call is made are not waited for.
-            template<>
-            struct CurrentThreadWaitFor<
-                dev::DevCpu>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto currentThreadWaitFor(
-                    dev::DevCpu const & dev)
-                -> void
-                {
-                    ALPAKA_DEBUG_FULL_LOG_SCOPE;
-
-                    // Get all the queues on the device at the time of invocation.
-                    // All queues added afterwards are ignored.
-                    auto vspQueues(
-                        dev.m_spDevCpuImpl->GetAllQueues());
-
-                    detail::currentThreadWaitForDevice(dev, vspQueues);
-                }
-            };
-        }
-    }
-}
+        };
+    } // namespace traits
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/include/alpaka/dim/DimArithmetic.hpp b/thirdParty/cupla/alpaka/include/alpaka/dim/DimArithmetic.hpp
index e5ccd4a795..ee0ef2b403 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/dim/DimArithmetic.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/dim/DimArithmetic.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Benjamin Worpitz
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -15,22 +15,16 @@
 
 namespace alpaka
 {
-    namespace dim
+    //-----------------------------------------------------------------------------
+    // Trait specializations for arithmetic types.
+    namespace traits
     {
-        //-----------------------------------------------------------------------------
-        // Trait specializations for unsigned integral types.
-        namespace traits
+        //#############################################################################
+        //! The arithmetic type dimension getter trait specialization: every arithmetic type has dimension 1.
+        template<typename T>
+        struct DimType<T, std::enable_if_t<std::is_arithmetic<T>::value>>
         {
-            //#############################################################################
-            //! The arithmetic type dimension getter trait specialization.
-            template<
-                typename T>
-            struct DimType<
-                T,
-                typename std::enable_if<std::is_arithmetic<T>::value>::type>
-            {
-                using type = dim::DimInt<1u>;
-            };
-        }
-    }
-}
+            using type = DimInt<1u>;
+        };
+    } // namespace traits
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/include/alpaka/dim/DimIntegralConst.hpp b/thirdParty/cupla/alpaka/include/alpaka/dim/DimIntegralConst.hpp
index 9f05d07255..b912537cf2 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/dim/DimIntegralConst.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/dim/DimIntegralConst.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Benjamin Worpitz
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -15,27 +15,8 @@
 
 namespace alpaka
 {
-    namespace dim
-    {
-        //-----------------------------------------------------------------------------
-        // N(th) dimension(s).
-        template<
-            std::size_t N>
-        using DimInt = std::integral_constant<std::size_t, N>;
-
-        //-----------------------------------------------------------------------------
-        // Trait specializations for integral_constant types.
-        /*namespace traits
-        {
-            //#############################################################################
-            //! The arithmetic type dimension getter trait specialization.
-            template<
-                std::size_t N>
-            struct DimType<
-                std::integral_constant<std::size_t, N>
-            {
-                using type = DimInt<N>;
-            };
-        }*/
-    }
-}
+    //-----------------------------------------------------------------------------
+    // The dimensionality N as a compile-time constant (a std::integral_constant over std::size_t).
+    template<std::size_t N>
+    using DimInt = std::integral_constant<std::size_t, N>;
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/include/alpaka/dim/Traits.hpp b/thirdParty/cupla/alpaka/include/alpaka/dim/Traits.hpp
index 39736a61ec..fa5cb483a4 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/dim/Traits.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/dim/Traits.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Benjamin Worpitz
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -12,25 +12,17 @@
 namespace alpaka
 {
     //-----------------------------------------------------------------------------
-    //! The dimension specifics.
-    namespace dim
+    //! The dimension traits.
+    namespace traits
     {
-        //-----------------------------------------------------------------------------
-        //! The dimension traits.
-        namespace traits
-        {
-            //#############################################################################
-            //! The dimension getter type trait.
-            template<
-                typename T,
-                typename TSfinae = void>
-            struct DimType;
-        }
-
         //#############################################################################
-        //! The dimension type trait alias template to remove the ::type.
-        template<
-            typename T>
-        using Dim = typename traits::DimType<T>::type;
-    }
-}
+        //! The dimension getter type trait. Only declared here; specializations supply the nested ::type.
+        template<typename T, typename TSfinae = void>
+        struct DimType;
+    } // namespace traits
+
+    //#############################################################################
+    //! Alias template for traits::DimType<T>::type, sparing callers the 'typename ...::type' boilerplate.
+    template<typename T>
+    using Dim = typename traits::DimType<T>::type;
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/include/alpaka/elem/Traits.hpp b/thirdParty/cupla/alpaka/include/alpaka/elem/Traits.hpp
index 26aa24e16f..021934aef6 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/elem/Traits.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/elem/Traits.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Benjamin Worpitz
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -14,41 +14,30 @@
 namespace alpaka
 {
     //-----------------------------------------------------------------------------
-    //! The element specifics.
-    namespace elem
+    //! The element traits.
+    namespace traits
     {
-        //-----------------------------------------------------------------------------
-        //! The element traits.
-        namespace traits
-        {
-            //#############################################################################
-            //! The element type trait.
-            template<
-                typename TView,
-                typename TSfinae = void>
-            struct ElemType;
-        }
-
         //#############################################################################
-        //! The element type trait alias template to remove the ::type.
-        template<
-            typename TView>
-        using Elem = typename std::remove_volatile<typename traits::ElemType<TView>::type>::type;
+        //! The element type trait.
+        template<typename TView, typename TSfinae = void>
+        struct ElemType;
+    } // namespace traits
+
+    //#############################################################################
+    //! The element type trait alias template to remove the ::type; a top-level volatile qualifier is stripped too.
+    template<typename TView>
+    using Elem = std::remove_volatile_t<typename traits::ElemType<TView>::type>;
 
-        //-----------------------------------------------------------------------------
-        // Trait specializations for unsigned integral types.
-        namespace traits
+    //-----------------------------------------------------------------------------
+    // Trait specializations for fundamental types.
+    namespace traits
+    {
+        //#############################################################################
+        //! The fundamental type elem type trait specialization: a fundamental type is its own element type.
+        template<typename T>
+        struct ElemType<T, std::enable_if_t<std::is_fundamental<T>::value>>
         {
-            //#############################################################################
-            //! The fundamental type elem type trait specialization.
-            template<
-                typename T>
-            struct ElemType<
-                T,
-                typename std::enable_if<std::is_fundamental<T>::value>::type>
-            {
-                using type = T;
-            };
-        }
-    }
-}
+            using type = T;
+        };
+    } // namespace traits
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/include/alpaka/event/EventCpu.hpp b/thirdParty/cupla/alpaka/include/alpaka/event/EventCpu.hpp
index 9df9c1400a..af5bd541be 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/event/EventCpu.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/event/EventCpu.hpp
@@ -1,6 +1,6 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Matthias Werner, René Widera
+/* Copyright 2020 Jeffrey Kelling
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -9,487 +9,10 @@
 
 #pragma once
 
-#include <alpaka/core/Assert.hpp>
-#include <alpaka/core/Unused.hpp>
 #include <alpaka/dev/DevCpu.hpp>
-#include <alpaka/queue/QueueCpuNonBlocking.hpp>
-#include <alpaka/queue/QueueCpuBlocking.hpp>
-
-#include <alpaka/dev/Traits.hpp>
-#include <alpaka/event/Traits.hpp>
-#include <alpaka/wait/Traits.hpp>
-#include <alpaka/dev/Traits.hpp>
-
-#include <mutex>
-#include <condition_variable>
-#include <future>
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
-    #include <iostream>
-#endif
+#include <alpaka/event/EventGenericThreads.hpp>
 
 namespace alpaka
 {
-    namespace event
-    {
-        namespace cpu
-        {
-            namespace detail
-            {
-                //#############################################################################
-                //! The CPU device event implementation.
-                class EventCpuImpl final : public concepts::Implements<wait::ConceptCurrentThreadWaitFor, EventCpuImpl>
-                {
-                public:
-                    //-----------------------------------------------------------------------------
-                    EventCpuImpl(
-                        dev::DevCpu const & dev) noexcept :
-                            m_dev(dev),
-                            m_mutex(),
-                            m_enqueueCount(0u),
-                            m_LastReadyEnqueueCount(0u)
-                    {}
-                    //-----------------------------------------------------------------------------
-                    EventCpuImpl(EventCpuImpl const &) = delete;
-                    //-----------------------------------------------------------------------------
-                    EventCpuImpl(EventCpuImpl &&) = delete;
-                    //-----------------------------------------------------------------------------
-                    auto operator=(EventCpuImpl const &) -> EventCpuImpl & = delete;
-                    //-----------------------------------------------------------------------------
-                    auto operator=(EventCpuImpl &&) -> EventCpuImpl & = delete;
-                    //-----------------------------------------------------------------------------
-                    ~EventCpuImpl() noexcept = default;
-
-                    //-----------------------------------------------------------------------------
-                    auto isReady() noexcept -> bool
-                    {
-                        return (m_LastReadyEnqueueCount == m_enqueueCount);
-                    }
-
-                    //-----------------------------------------------------------------------------
-                    auto wait(std::size_t const & enqueueCount, std::unique_lock<std::mutex>& lk) const noexcept -> void
-                    {
-                        ALPAKA_ASSERT(enqueueCount <= m_enqueueCount);
-
-                        while(enqueueCount > m_LastReadyEnqueueCount)
-                        {
-                            auto future = m_future;
-                            lk.unlock();
-                            future.get();
-                            lk.lock();
-                        }
-                    }
-
-                public:
-                    dev::DevCpu const m_dev;                                //!< The device this event is bound to.
-
-                    std::mutex mutable m_mutex;                             //!< The mutex used to synchronize access to the event.
-                    std::shared_future<void> m_future;                      //!< The future signaling the event completion.
-                    std::size_t m_enqueueCount;                             //!< The number of times this event has been enqueued.
-                    std::size_t m_LastReadyEnqueueCount;                    //!< The time this event has been ready the last time.
-                                                                            //!< Ready means that the event was not waiting within a queue (not enqueued or already completed).
-                                                                            //!< If m_enqueueCount == m_LastReadyEnqueueCount, the event is currently not enqueued
-                };
-            }
-        }
-
-        //#############################################################################
-        //! The CPU device event.
-        class EventCpu final : public concepts::Implements<wait::ConceptCurrentThreadWaitFor, EventCpu>
-        {
-        public:
-            //-----------------------------------------------------------------------------
-            //! \param bBusyWaiting Unused. EventCpu never does busy waiting.
-            EventCpu(
-                dev::DevCpu const & dev,
-                bool bBusyWaiting = true) :
-                    m_spEventImpl(std::make_shared<cpu::detail::EventCpuImpl>(dev))
-            { 
-                alpaka::ignore_unused(bBusyWaiting);
-            }
-            //-----------------------------------------------------------------------------
-            EventCpu(EventCpu const &) = default;
-            //-----------------------------------------------------------------------------
-            EventCpu(EventCpu &&) = default;
-            //-----------------------------------------------------------------------------
-            auto operator=(EventCpu const &) -> EventCpu & = default;
-            //-----------------------------------------------------------------------------
-            auto operator=(EventCpu &&) -> EventCpu & = default;
-            //-----------------------------------------------------------------------------
-            auto operator==(EventCpu const & rhs) const
-            -> bool
-            {
-                return (m_spEventImpl == rhs.m_spEventImpl);
-            }
-            //-----------------------------------------------------------------------------
-            auto operator!=(EventCpu const & rhs) const
-            -> bool
-            {
-                return !((*this) == rhs);
-            }
-            //-----------------------------------------------------------------------------
-            ~EventCpu() = default;
-
-        public:
-            std::shared_ptr<cpu::detail::EventCpuImpl> m_spEventImpl;
-        };
-    }
-
-    namespace dev
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CPU device event device get trait specialization.
-            template<>
-            struct GetDev<
-                event::EventCpu>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto getDev(
-                    event::EventCpu const & event)
-                -> dev::DevCpu
-                {
-                    return event.m_spEventImpl->m_dev;
-                }
-            };
-        }
-    }
-    namespace event
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CPU device event test trait specialization.
-            template<>
-            struct Test<
-                event::EventCpu>
-            {
-                //-----------------------------------------------------------------------------
-                //! \return If the event is not waiting within a queue (not enqueued or already handled).
-                ALPAKA_FN_HOST static auto test(
-                    event::EventCpu const & event)
-                -> bool
-                {
-                    std::lock_guard<std::mutex> lk(event.m_spEventImpl->m_mutex);
-
-                    return event.m_spEventImpl->isReady();
-                }
-            };
-        }
-    }
-    namespace queue
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CPU non-blocking device queue enqueue trait specialization.
-            template<>
-            struct Enqueue<
-                queue::cpu::detail::QueueCpuNonBlockingImpl,
-                event::EventCpu>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto enqueue(
-#if !(BOOST_COMP_CLANG_CUDA && BOOST_ARCH_PTX)
-                    queue::cpu::detail::QueueCpuNonBlockingImpl & queueImpl,
-#else
-                    queue::cpu::detail::QueueCpuNonBlockingImpl &,
-#endif
-                    event::EventCpu & event)
-                -> void
-                {
-                    ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                    // Copy the shared pointer of the event implementation.
-                    // This is forwarded to the lambda that is enqueued into the queue to ensure that the event implementation is alive as long as it is enqueued.
-                    auto spEventImpl(event.m_spEventImpl);
-
-                    // Setting the event state and enqueuing it has to be atomic.
-                    std::lock_guard<std::mutex> lk(spEventImpl->m_mutex);
-
-                    ++spEventImpl->m_enqueueCount;
-
-// Workaround: Clang can not support this when natively compiling device code. See ConcurrentExecPool.hpp.
-#if !(BOOST_COMP_CLANG_CUDA && BOOST_ARCH_PTX)
-                    auto const enqueueCount = spEventImpl->m_enqueueCount;
-
-                    // Enqueue a task that only resets the events flag if it is completed.
-                    spEventImpl->m_future = queueImpl.m_workerThread.enqueueTask(
-                        [spEventImpl, enqueueCount]()
-                        {
-                            std::unique_lock<std::mutex> lk2(spEventImpl->m_mutex);
-
-                            // Nothing to do if it has been re-enqueued to a later position in the queue.
-                            if(enqueueCount == spEventImpl->m_enqueueCount)
-                            {
-                                spEventImpl->m_LastReadyEnqueueCount = spEventImpl->m_enqueueCount;
-                            }
-                        });
-#endif
-                }
-            };
-            //#############################################################################
-            //! The CPU non-blocking device queue enqueue trait specialization.
-            template<>
-            struct Enqueue<
-                queue::QueueCpuNonBlocking,
-                event::EventCpu>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto enqueue(
-                    queue::QueueCpuNonBlocking & queue,
-                    event::EventCpu & event)
-                -> void
-                {
-                    ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                    queue::enqueue(*queue.m_spQueueImpl, event);
-                }
-            };
-            //#############################################################################
-            //! The CPU blocking device queue enqueue trait specialization.
-            template<>
-            struct Enqueue<
-                queue::cpu::detail::QueueCpuBlockingImpl,
-                event::EventCpu>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto enqueue(
-                    queue::cpu::detail::QueueCpuBlockingImpl & queueImpl,
-                    event::EventCpu & event)
-                -> void
-                {
-                    ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                    std::promise<void> promise;
-                    {
-                        std::lock_guard<std::mutex> lk(queueImpl.m_mutex);
-
-                        queueImpl.m_bCurrentlyExecutingTask = true;
-
-                        auto & eventImpl(*event.m_spEventImpl);
-
-                        {
-                            // Setting the event state and enqueuing it has to be atomic.
-                            std::lock_guard<std::mutex> evLk(eventImpl.m_mutex);
-
-                            ++eventImpl.m_enqueueCount;
-                            // NOTE: Difference to non-blocking version: directly set the event state instead of enqueuing.
-                            eventImpl.m_LastReadyEnqueueCount = eventImpl.m_enqueueCount;
-
-                            eventImpl.m_future = promise.get_future();
-                        }
-
-                        queueImpl.m_bCurrentlyExecutingTask = false;
-                    }
-                    promise.set_value();
-                }
-            };
-            //#############################################################################
-            //! The CPU blocking device queue enqueue trait specialization.
-            template<>
-            struct Enqueue<
-                queue::QueueCpuBlocking,
-                event::EventCpu>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto enqueue(
-                    queue::QueueCpuBlocking & queue,
-                    event::EventCpu & event)
-                -> void
-                {
-                    ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                    queue::enqueue(*queue.m_spQueueImpl, event);
-                }
-            };
-        }
-    }
-    namespace wait
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CPU device event thread wait trait specialization.
-            //!
-            //! Waits until the event itself and therefore all tasks preceding it in the queue it is enqueued to have been completed.
-            //! If the event is not enqueued to a queue the method returns immediately.
-            template<>
-            struct CurrentThreadWaitFor<
-                event::EventCpu>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto currentThreadWaitFor(
-                    event::EventCpu const & event)
-                -> void
-                {
-                    wait::wait(*event.m_spEventImpl);
-                }
-            };
-            //#############################################################################
-            //! The CPU device event implementation thread wait trait specialization.
-            //!
-            //! Waits until the event itself and therefore all tasks preceding it in the queue it is enqueued to have been completed.
-            //! If the event is not enqueued to a queue the method returns immediately.
-            //!
-            //! NOTE: This method is for internal usage only.
-            template<>
-            struct CurrentThreadWaitFor<
-                event::cpu::detail::EventCpuImpl>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto currentThreadWaitFor(
-                    event::cpu::detail::EventCpuImpl const & eventImpl)
-                -> void
-                {
-                    std::unique_lock<std::mutex> lk(eventImpl.m_mutex);
-
-                    auto const enqueueCount = eventImpl.m_enqueueCount;
-                    eventImpl.wait(enqueueCount, lk);
-                }
-            };
-            //#############################################################################
-            //! The CPU non-blocking device queue event wait trait specialization.
-            template<>
-            struct WaiterWaitFor<
-                queue::cpu::detail::QueueCpuNonBlockingImpl,
-                event::EventCpu>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto waiterWaitFor(
-#if !(BOOST_COMP_CLANG_CUDA && BOOST_ARCH_PTX)
-                    queue::cpu::detail::QueueCpuNonBlockingImpl & queueImpl,
-#else
-                    queue::cpu::detail::QueueCpuNonBlockingImpl &,
-#endif
-                    event::EventCpu const & event)
-                -> void
-                {
-                    // Copy the shared pointer of the event implementation.
-                    // This is forwarded to the lambda that is enqueued into the queue to ensure that the event implementation is alive as long as it is enqueued.
-                    auto spEventImpl(event.m_spEventImpl);
-
-                    std::lock_guard<std::mutex> lk(spEventImpl->m_mutex);
-
-                    if(!spEventImpl->isReady())
-                    {
-// Workaround: Clang can not support this when natively compiling device code. See ConcurrentExecPool.hpp.
-#if !(BOOST_COMP_CLANG_CUDA && BOOST_ARCH_PTX)
-                        auto const enqueueCount = spEventImpl->m_enqueueCount;
-
-                        // Enqueue a task that waits for the given event.
-                        queueImpl.m_workerThread.enqueueTask(
-                            [spEventImpl, enqueueCount]()
-                            {
-                                std::unique_lock<std::mutex> lk2(spEventImpl->m_mutex);
-                                spEventImpl->wait(enqueueCount, lk2);
-                            });
-#endif
-                    }
-                }
-            };
-            //#############################################################################
-            //! The CPU non-blocking device queue event wait trait specialization.
-            template<>
-            struct WaiterWaitFor<
-                queue::QueueCpuNonBlocking,
-                event::EventCpu>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto waiterWaitFor(
-                    queue::QueueCpuNonBlocking & queue,
-                    event::EventCpu const & event)
-                -> void
-                {
-                    wait::wait(*queue.m_spQueueImpl, event);
-                }
-            };
-            //#############################################################################
-            //! The CPU blocking device queue event wait trait specialization.
-            template<>
-            struct WaiterWaitFor<
-                queue::cpu::detail::QueueCpuBlockingImpl,
-                event::EventCpu>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto waiterWaitFor(
-                    queue::cpu::detail::QueueCpuBlockingImpl & queueImpl,
-                    event::EventCpu const & event)
-                -> void
-                {
-                    alpaka::ignore_unused(queueImpl);
-
-                    // NOTE: Difference to non-blocking version: directly wait for event.
-                    wait::wait(*event.m_spEventImpl);
-                }
-            };
-            //#############################################################################
-            //! The CPU blocking device queue event wait trait specialization.
-            template<>
-            struct WaiterWaitFor<
-                queue::QueueCpuBlocking,
-                event::EventCpu>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto waiterWaitFor(
-                    queue::QueueCpuBlocking & queue,
-                    event::EventCpu const & event)
-                -> void
-                {
-                    wait::wait(*queue.m_spQueueImpl, event);
-                }
-            };
-            //#############################################################################
-            //! The CPU non-blocking device event wait trait specialization.
-            //!
-            //! Any future work submitted in any queue of this device will wait for event to complete before beginning execution.
-            template<>
-            struct WaiterWaitFor<
-                dev::DevCpu,
-                event::EventCpu>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto waiterWaitFor(
-                    dev::DevCpu & dev,
-                    event::EventCpu const & event)
-                -> void
-                {
-                    // Get all the queues on the device at the time of invocation.
-                    // All queues added afterwards are ignored.
-                    auto vspQueues(
-                        dev.m_spDevCpuImpl->GetAllQueues());
-
-                    // Let all the queues wait for this event.
-                    // Furthermore there should not even be a chance to enqueue something between getting the queues and adding our wait events!
-                    for(auto && spQueue : vspQueues)
-                    {
-                        spQueue->wait(event);
-                    }
-                }
-            };
-
-            //#############################################################################
-            //! The CPU non-blocking device queue thread wait trait specialization.
-            //!
-            //! Blocks execution of the calling thread until the queue has finished processing all previously requested tasks (kernels, data copies, ...)
-            template<>
-            struct CurrentThreadWaitFor<
-                queue::QueueCpuNonBlocking>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto currentThreadWaitFor(
-                    queue::QueueCpuNonBlocking const & queue)
-                -> void
-                {
-                    event::EventCpu event(
-                        dev::getDev(queue));
-                    queue::enqueue(
-                        const_cast<queue::QueueCpuNonBlocking &>(queue),
-                        event);
-                    wait::wait(
-                        event);
-                }
-            };
-        }
-    }
+    using EventCpu = EventGenericThreads<DevCpu>;
 }
diff --git a/thirdParty/cupla/alpaka/include/alpaka/event/EventCudaRt.hpp b/thirdParty/cupla/alpaka/include/alpaka/event/EventCudaRt.hpp
deleted file mode 100644
index abcd94a725..0000000000
--- a/thirdParty/cupla/alpaka/include/alpaka/event/EventCudaRt.hpp
+++ /dev/null
@@ -1,334 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_CUDA
-    #error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
-#endif
-
-#include <alpaka/dev/DevCudaRt.hpp>
-#include <alpaka/dev/Traits.hpp>
-#include <alpaka/event/Traits.hpp>
-#include <alpaka/wait/Traits.hpp>
-
-#include <alpaka/queue/QueueCudaRtNonBlocking.hpp>
-#include <alpaka/queue/QueueCudaRtBlocking.hpp>
-#include <alpaka/core/Cuda.hpp>
-
-#include <stdexcept>
-#include <memory>
-#include <functional>
-
-namespace alpaka
-{
-    namespace event
-    {
-        namespace cuda
-        {
-            namespace detail
-            {
-                //#############################################################################
-                //! The CUDA RT device event implementation.
-                class EventCudaImpl final
-                {
-                public:
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST EventCudaImpl(
-                        dev::DevCudaRt const & dev,
-                        bool bBusyWait) :
-                            m_dev(dev),
-                            m_CudaEvent()
-                    {
-                        ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                        // Set the current device.
-                        ALPAKA_CUDA_RT_CHECK(
-                            cudaSetDevice(
-                                m_dev.m_iDevice));
-                        // Create the event on the current device with the specified flags. Valid flags include:
-                        // - cudaEventDefault: Default event creation flag.
-                        // - cudaEventBlockingSync : Specifies that event should use blocking synchronization.
-                        //   A host thread that uses cudaEventSynchronize() to wait on an event created with this flag will block until the event actually completes.
-                        // - cudaEventDisableTiming : Specifies that the created event does not need to record timing data.
-                        //   Events created with this flag specified and the cudaEventBlockingSync flag not specified will provide the best performance when used with cudaStreamWaitEvent() and cudaEventQuery().
-                        ALPAKA_CUDA_RT_CHECK(
-                            cudaEventCreateWithFlags(
-                                &m_CudaEvent,
-                                (bBusyWait ? cudaEventDefault : cudaEventBlockingSync) | cudaEventDisableTiming));
-                    }
-                    //-----------------------------------------------------------------------------
-                    EventCudaImpl(EventCudaImpl const &) = delete;
-                    //-----------------------------------------------------------------------------
-                    EventCudaImpl(EventCudaImpl &&) = default;
-                    //-----------------------------------------------------------------------------
-                    auto operator=(EventCudaImpl const &) -> EventCudaImpl & = delete;
-                    //-----------------------------------------------------------------------------
-                    auto operator=(EventCudaImpl &&) -> EventCudaImpl & = delete;
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST ~EventCudaImpl()
-                    {
-                        ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                        // Set the current device. \TODO: Is setting the current device before cudaEventDestroy required?
-                        ALPAKA_CUDA_RT_CHECK(cudaSetDevice(
-                            m_dev.m_iDevice));
-                        // In case event has been recorded but has not yet been completed when cudaEventDestroy() is called, the function will return immediately
-                        // and the resources associated with event will be released automatically once the device has completed event.
-                        // -> No need to synchronize here.
-                        ALPAKA_CUDA_RT_CHECK(cudaEventDestroy(
-                            m_CudaEvent));
-                    }
-
-                public:
-                    dev::DevCudaRt const m_dev;   //!< The device this event is bound to.
-                    cudaEvent_t m_CudaEvent;
-                };
-            }
-        }
-
-        //#############################################################################
-        //! The CUDA RT device event.
-        class EventCudaRt final : public concepts::Implements<wait::ConceptCurrentThreadWaitFor, EventCudaRt>
-        {
-        public:
-            //-----------------------------------------------------------------------------
-            ALPAKA_FN_HOST EventCudaRt(
-                dev::DevCudaRt const & dev,
-                bool bBusyWait = true) :
-                    m_spEventImpl(std::make_shared<cuda::detail::EventCudaImpl>(dev, bBusyWait))
-            {
-                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-            }
-            //-----------------------------------------------------------------------------
-            EventCudaRt(EventCudaRt const &) = default;
-            //-----------------------------------------------------------------------------
-            EventCudaRt(EventCudaRt &&) = default;
-            //-----------------------------------------------------------------------------
-            auto operator=(EventCudaRt const &) -> EventCudaRt & = default;
-            //-----------------------------------------------------------------------------
-            auto operator=(EventCudaRt &&) -> EventCudaRt & = default;
-            //-----------------------------------------------------------------------------
-            ALPAKA_FN_HOST auto operator==(EventCudaRt const & rhs) const
-            -> bool
-            {
-                return (m_spEventImpl == rhs.m_spEventImpl);
-            }
-            //-----------------------------------------------------------------------------
-            ALPAKA_FN_HOST auto operator!=(EventCudaRt const & rhs) const
-            -> bool
-            {
-                return !((*this) == rhs);
-            }
-            //-----------------------------------------------------------------------------
-            ~EventCudaRt() = default;
-
-        public:
-            std::shared_ptr<cuda::detail::EventCudaImpl> m_spEventImpl;
-        };
-    }
-
-    namespace dev
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CUDA RT device event device get trait specialization.
-            template<>
-            struct GetDev<
-                event::EventCudaRt>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto getDev(
-                    event::EventCudaRt const & event)
-                -> dev::DevCudaRt
-                {
-                    return event.m_spEventImpl->m_dev;
-                }
-            };
-        }
-    }
-    namespace event
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CUDA RT device event test trait specialization.
-            template<>
-            struct Test<
-                event::EventCudaRt>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto test(
-                    event::EventCudaRt const & event)
-                -> bool
-                {
-                    ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                    // Query is allowed even for events on non current device.
-                    cudaError_t ret = cudaSuccess;
-                    ALPAKA_CUDA_RT_CHECK_IGNORE(
-                        ret = cudaEventQuery(
-                            event.m_spEventImpl->m_CudaEvent),
-                        cudaErrorNotReady);
-                    return (ret == cudaSuccess);
-                }
-            };
-        }
-    }
-    namespace queue
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CUDA RT queue enqueue trait specialization.
-            template<>
-            struct Enqueue<
-                queue::QueueCudaRtNonBlocking,
-                event::EventCudaRt>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto enqueue(
-                    queue::QueueCudaRtNonBlocking & queue,
-                    event::EventCudaRt & event)
-                -> void
-                {
-                    ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                    ALPAKA_CUDA_RT_CHECK(cudaEventRecord(
-                        event.m_spEventImpl->m_CudaEvent,
-                        queue.m_spQueueImpl->m_CudaQueue));
-                }
-            };
-            //#############################################################################
-            //! The CUDA RT queue enqueue trait specialization.
-            template<>
-            struct Enqueue<
-                queue::QueueCudaRtBlocking,
-                event::EventCudaRt>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto enqueue(
-                    queue::QueueCudaRtBlocking & queue,
-                    event::EventCudaRt & event)
-                -> void
-                {
-                    ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                    ALPAKA_CUDA_RT_CHECK(cudaEventRecord(
-                        event.m_spEventImpl->m_CudaEvent,
-                        queue.m_spQueueImpl->m_CudaQueue));
-                }
-            };
-        }
-    }
-    namespace wait
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CUDA RT device event thread wait trait specialization.
-            //!
-            //! Waits until the event itself and therefore all tasks preceding it in the queue it is enqueued to have been completed.
-            //! If the event is not enqueued to a queue the method returns immediately.
-            template<>
-            struct CurrentThreadWaitFor<
-                event::EventCudaRt>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto currentThreadWaitFor(
-                    event::EventCudaRt const & event)
-                -> void
-                {
-                    ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                    // Sync is allowed even for events on non current device.
-                    ALPAKA_CUDA_RT_CHECK(cudaEventSynchronize(
-                        event.m_spEventImpl->m_CudaEvent));
-                }
-            };
-            //#############################################################################
-            //! The CUDA RT queue event wait trait specialization.
-            template<>
-            struct WaiterWaitFor<
-                queue::QueueCudaRtNonBlocking,
-                event::EventCudaRt>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto waiterWaitFor(
-                    queue::QueueCudaRtNonBlocking & queue,
-                    event::EventCudaRt const & event)
-                -> void
-                {
-                    ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                    ALPAKA_CUDA_RT_CHECK(cudaStreamWaitEvent(
-                        queue.m_spQueueImpl->m_CudaQueue,
-                        event.m_spEventImpl->m_CudaEvent,
-                        0));
-                }
-            };
-            //#############################################################################
-            //! The CUDA RT queue event wait trait specialization.
-            template<>
-            struct WaiterWaitFor<
-                queue::QueueCudaRtBlocking,
-                event::EventCudaRt>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto waiterWaitFor(
-                    queue::QueueCudaRtBlocking & queue,
-                    event::EventCudaRt const & event)
-                -> void
-                {
-                    ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                    ALPAKA_CUDA_RT_CHECK(cudaStreamWaitEvent(
-                        queue.m_spQueueImpl->m_CudaQueue,
-                        event.m_spEventImpl->m_CudaEvent,
-                        0));
-                }
-            };
-            //#############################################################################
-            //! The CUDA RT device event wait trait specialization.
-            //!
-            //! Any future work submitted in any queue of this device will wait for event to complete before beginning execution.
-            template<>
-            struct WaiterWaitFor<
-                dev::DevCudaRt,
-                event::EventCudaRt>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto waiterWaitFor(
-                    dev::DevCudaRt & dev,
-                    event::EventCudaRt const & event)
-                -> void
-                {
-                    ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                    // Set the current device.
-                    ALPAKA_CUDA_RT_CHECK(
-                        cudaSetDevice(
-                            dev.m_iDevice));
-
-                    ALPAKA_CUDA_RT_CHECK(cudaStreamWaitEvent(
-                        nullptr,
-                        event.m_spEventImpl->m_CudaEvent,
-                        0));
-                }
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/event/EventGenericThreads.hpp b/thirdParty/cupla/alpaka/include/alpaka/event/EventGenericThreads.hpp
new file mode 100644
index 0000000000..dd869c5ec6
--- /dev/null
+++ b/thirdParty/cupla/alpaka/include/alpaka/event/EventGenericThreads.hpp
@@ -0,0 +1,454 @@
+/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Matthias Werner, René Widera
+ *
+ * This file is part of alpaka.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#pragma once
+
+#include <alpaka/core/Assert.hpp>
+#include <alpaka/core/Unused.hpp>
+#include <alpaka/core/Utility.hpp>
+#include <alpaka/dev/Traits.hpp>
+#include <alpaka/event/Traits.hpp>
+#include <alpaka/queue/QueueGenericThreadsBlocking.hpp>
+#include <alpaka/queue/QueueGenericThreadsNonBlocking.hpp>
+#include <alpaka/wait/Traits.hpp>
+
+#include <condition_variable>
+#include <future>
+#include <mutex>
+#if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
+#    include <iostream>
+#endif
+
+namespace alpaka
+{
+    namespace generic
+    {
+        namespace detail
+        {
+            //#############################################################################
+            //! The shared state behind an EventGenericThreads handle, bound to a device of type TDev.
+            template<typename TDev>
+            class EventGenericThreadsImpl final
+                : public concepts::Implements<ConceptCurrentThreadWaitFor, EventGenericThreadsImpl<TDev>>
+            {
+            public:
+                //! Binds the event to \p dev. Both counters start at zero, so a new event reports ready.
+                EventGenericThreadsImpl(TDev const& dev) noexcept
+                    : m_dev(dev)
+                    , m_mutex()
+                    , m_enqueueCount(0u)
+                    , m_LastReadyEnqueueCount(0u)
+                {
+                }
+                //-----------------------------------------------------------------------------
+                EventGenericThreadsImpl(EventGenericThreadsImpl<TDev> const&) = delete;
+                //-----------------------------------------------------------------------------
+                EventGenericThreadsImpl(EventGenericThreadsImpl<TDev>&&) = delete;
+                //-----------------------------------------------------------------------------
+                auto operator=(EventGenericThreadsImpl<TDev> const&) -> EventGenericThreadsImpl<TDev>& = delete;
+                //-----------------------------------------------------------------------------
+                auto operator=(EventGenericThreadsImpl<TDev>&&) -> EventGenericThreadsImpl<TDev>& = delete;
+                //-----------------------------------------------------------------------------
+                ~EventGenericThreadsImpl() noexcept = default;
+
+                //! \return true if no enqueue is pending. Callers in this file take m_mutex before calling.
+                auto isReady() noexcept -> bool
+                {
+                    return (m_LastReadyEnqueueCount == m_enqueueCount);
+                }
+
+                //! Blocks until enqueue \p enqueueCount is ready; \p lk is unlocked while waiting on the future.
+                auto wait(std::size_t const& enqueueCount, std::unique_lock<std::mutex>& lk) const noexcept -> void
+                {
+                    ALPAKA_ASSERT(enqueueCount <= m_enqueueCount);
+
+                    while(enqueueCount > m_LastReadyEnqueueCount)
+                    {
+                        auto future = m_future; // Copy while locked; get() below runs without the lock.
+                        lk.unlock();
+                        future.get();
+                        lk.lock();
+                    }
+                }
+
+            public:
+                TDev const m_dev; //!< The device this event is bound to.
+
+                std::mutex mutable m_mutex; //!< The mutex used to synchronize access to the event.
+                std::shared_future<void> m_future; //!< The future signaling the event completion.
+                std::size_t m_enqueueCount; //!< The number of times this event has been enqueued.
+                std::size_t m_LastReadyEnqueueCount; //!< The enqueue count at which this event was last ready.
+                                                     //!< Ready means that the event was not waiting within a queue
+                                                     //!< (not enqueued or already completed). If m_enqueueCount ==
+                                                     //!< m_LastReadyEnqueueCount, the event is currently not enqueued.
+            };
+        } // namespace detail
+    } // namespace generic
+
+    //#############################################################################
+    //! The event handle. Copies share one implementation object through m_spEventImpl.
+    template<typename TDev>
+    class EventGenericThreads final
+        : public concepts::Implements<ConceptCurrentThreadWaitFor, EventGenericThreads<TDev>>
+        , public concepts::Implements<ConceptGetDev, EventGenericThreads<TDev>>
+    {
+    public:
+        //! \param dev The device the event is bound to.
+        //! \param bBusyWaiting Unused. EventGenericThreads never does busy waiting.
+        EventGenericThreads(TDev const& dev, bool bBusyWaiting = true)
+            : m_spEventImpl(std::make_shared<generic::detail::EventGenericThreadsImpl<TDev>>(dev))
+        {
+            alpaka::ignore_unused(bBusyWaiting);
+        }
+        //-----------------------------------------------------------------------------
+        EventGenericThreads(EventGenericThreads<TDev> const&) = default;
+        //-----------------------------------------------------------------------------
+        EventGenericThreads(EventGenericThreads<TDev>&&) = default;
+        //-----------------------------------------------------------------------------
+        auto operator=(EventGenericThreads<TDev> const&) -> EventGenericThreads<TDev>& = default;
+        //-----------------------------------------------------------------------------
+        auto operator=(EventGenericThreads<TDev>&&) -> EventGenericThreads<TDev>& = default;
+        //! \return true if both handles refer to the same implementation object.
+        auto operator==(EventGenericThreads<TDev> const& rhs) const -> bool
+        {
+            return (m_spEventImpl == rhs.m_spEventImpl);
+        }
+        //-----------------------------------------------------------------------------
+        auto operator!=(EventGenericThreads<TDev> const& rhs) const -> bool
+        {
+            return !((*this) == rhs);
+        }
+        //-----------------------------------------------------------------------------
+        ~EventGenericThreads() = default;
+
+    public:
+        std::shared_ptr<generic::detail::EventGenericThreadsImpl<TDev>> m_spEventImpl;
+    };
+    namespace traits
+    {
+        //#############################################################################
+        //! The device get trait specialization for EventGenericThreads.
+        template<typename TDev>
+        struct GetDev<EventGenericThreads<TDev>>
+        {
+            //! \return A copy of the device stored in the event implementation.
+            ALPAKA_FN_HOST static auto getDev(EventGenericThreads<TDev> const& event) -> TDev
+            {
+                return event.m_spEventImpl->m_dev;
+            }
+        };
+
+        //#############################################################################
+        //! The event completion test trait specialization for EventGenericThreads.
+        template<typename TDev>
+        struct IsComplete<EventGenericThreads<TDev>>
+        {
+            //! Queries the ready state while holding the event mutex.
+            //! \return true if the event is not waiting within a queue (not enqueued or already handled).
+            ALPAKA_FN_HOST static auto isComplete(EventGenericThreads<TDev> const& event) -> bool
+            {
+                std::lock_guard<std::mutex> lk(event.m_spEventImpl->m_mutex);
+
+                return event.m_spEventImpl->isReady();
+            }
+        };
+
+        //#############################################################################
+        //! The non-blocking queue implementation enqueue trait specialization for EventGenericThreads.
+        template<typename TDev>
+        struct Enqueue<alpaka::generic::detail::QueueGenericThreadsNonBlockingImpl<TDev>, EventGenericThreads<TDev>>
+        {
+            //! Increments the enqueue count of \p event and enqueues a completion task into the worker thread.
+            ALPAKA_FN_HOST static auto enqueue(
+                alpaka::generic::detail::QueueGenericThreadsNonBlockingImpl<TDev>& queueImpl,
+                EventGenericThreads<TDev>& event) -> void
+            {
+#if(BOOST_COMP_CLANG_CUDA && BOOST_ARCH_PTX)
+                alpaka::ignore_unused(queueImpl);
+#endif
+                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+                // Copy the shared pointer of the event implementation.
+                // This is forwarded to the lambda that is enqueued into the queue to ensure that the event
+                // implementation is alive as long as it is enqueued.
+                auto spEventImpl(event.m_spEventImpl);
+
+                // Setting the event state and enqueuing it has to be atomic.
+                std::lock_guard<std::mutex> lk(spEventImpl->m_mutex);
+
+                ++spEventImpl->m_enqueueCount;
+
+// Workaround: Clang can not support this when natively compiling device code. See ConcurrentExecPool.hpp.
+#if !(BOOST_COMP_CLANG_CUDA && BOOST_ARCH_PTX)
+                auto const enqueueCount = spEventImpl->m_enqueueCount;
+
+                // Enqueue a task that marks the event as ready unless it has been re-enqueued in the meantime.
+                spEventImpl->m_future = queueImpl.m_workerThread.enqueueTask([spEventImpl, enqueueCount]() {
+                    std::unique_lock<std::mutex> lk2(spEventImpl->m_mutex);
+
+                    // Nothing to do if it has been re-enqueued to a later position in the queue.
+                    if(enqueueCount == spEventImpl->m_enqueueCount)
+                    {
+                        spEventImpl->m_LastReadyEnqueueCount = spEventImpl->m_enqueueCount;
+                    }
+                });
+#endif
+            }
+        };
+        //#############################################################################
+        //! The non-blocking queue enqueue trait specialization for EventGenericThreads.
+        template<typename TDev>
+        struct Enqueue<QueueGenericThreadsNonBlocking<TDev>, EventGenericThreads<TDev>>
+        {
+            //! Forwards to the enqueue on the queue implementation.
+            ALPAKA_FN_HOST static auto enqueue(
+                QueueGenericThreadsNonBlocking<TDev>& queue,
+                EventGenericThreads<TDev>& event) -> void
+            {
+                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+                alpaka::enqueue(*queue.m_spQueueImpl, event);
+            }
+        };
+        //#############################################################################
+        //! The blocking queue implementation enqueue trait specialization for EventGenericThreads.
+        template<typename TDev>
+        struct Enqueue<alpaka::generic::detail::QueueGenericThreadsBlockingImpl<TDev>, EventGenericThreads<TDev>>
+        {
+            //! Marks \p event as ready right away (no task is enqueued) and then fulfills its future.
+            ALPAKA_FN_HOST static auto enqueue(
+                alpaka::generic::detail::QueueGenericThreadsBlockingImpl<TDev>& queueImpl,
+                EventGenericThreads<TDev>& event) -> void
+            {
+                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+                std::promise<void> promise;
+                {
+                    std::lock_guard<std::mutex> lk(queueImpl.m_mutex);
+
+                    queueImpl.m_bCurrentlyExecutingTask = true;
+
+                    auto& eventImpl(*event.m_spEventImpl);
+
+                    {
+                        // Setting the event state and enqueuing it has to be atomic.
+                        std::lock_guard<std::mutex> evLk(eventImpl.m_mutex);
+
+                        ++eventImpl.m_enqueueCount;
+                        // NOTE: Difference to non-blocking version: directly set the event state instead of enqueuing.
+                        eventImpl.m_LastReadyEnqueueCount = eventImpl.m_enqueueCount;
+
+                        eventImpl.m_future = promise.get_future();
+                    }
+
+                    queueImpl.m_bCurrentlyExecutingTask = false;
+                }
+                promise.set_value();
+            }
+        };
+        //#############################################################################
+        //! The blocking queue enqueue trait specialization for EventGenericThreads.
+        template<typename TDev>
+        struct Enqueue<QueueGenericThreadsBlocking<TDev>, EventGenericThreads<TDev>>
+        {
+            //! Forwards to the enqueue on the queue implementation.
+            ALPAKA_FN_HOST static auto enqueue(
+                QueueGenericThreadsBlocking<TDev>& queue,
+                EventGenericThreads<TDev>& event) -> void
+            {
+                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+                alpaka::enqueue(*queue.m_spQueueImpl, event);
+            }
+        };
+    } // namespace traits
+    namespace traits
+    {
+        namespace generic
+        {
+            template<typename TDev>
+            ALPAKA_FN_HOST auto currentThreadWaitForDevice(TDev const& dev) -> void
+            {
+                // Take a snapshot of all the queues on the device at the time of invocation.
+                // All queues added afterwards are ignored.
+                auto vQueues(dev.getAllQueues());
+                // Enqueue a new event into every queue. There should not even be a chance to enqueue something
+                // between getting the queues and adding these wait events!
+                std::vector<EventGenericThreads<TDev>> vEvents;
+                for(auto&& spQueue : vQueues)
+                {
+                    vEvents.emplace_back(dev);
+                    spQueue->enqueue(vEvents.back());
+                }
+
+                // Now wait for all the events, one after the other.
+                for(auto&& event : vEvents)
+                {
+                    wait(event);
+                }
+            }
+        } // namespace generic
+
+        //#############################################################################
+        //! The thread wait trait specialization for EventGenericThreads.
+        //!
+        //! Waits until the event itself and therefore all tasks preceding it in the queue it is enqueued to have been
+        //! completed. If the event is not enqueued to a queue the method returns immediately.
+        template<typename TDev>
+        struct CurrentThreadWaitFor<EventGenericThreads<TDev>>
+        {
+            //! Forwards to the wait on the shared event implementation.
+            ALPAKA_FN_HOST static auto currentThreadWaitFor(EventGenericThreads<TDev> const& event) -> void
+            {
+                wait(*event.m_spEventImpl);
+            }
+        };
+        //#############################################################################
+        //! The thread wait trait specialization for the event implementation.
+        //!
+        //! Waits until the event itself and therefore all tasks preceding it in the queue it is enqueued to have been
+        //! completed. If the event is not enqueued to a queue the method returns immediately.
+        //!
+        //! NOTE: This method is for internal usage only.
+        template<typename TDev>
+        struct CurrentThreadWaitFor<alpaka::generic::detail::EventGenericThreadsImpl<TDev>>
+        {
+            //! Waits for the enqueue that is current at the time of the call; later re-enqueues are not awaited.
+            ALPAKA_FN_HOST static auto currentThreadWaitFor(
+                alpaka::generic::detail::EventGenericThreadsImpl<TDev> const& eventImpl) -> void
+            {
+                std::unique_lock<std::mutex> lk(eventImpl.m_mutex);
+
+                auto const enqueueCount = eventImpl.m_enqueueCount;
+                eventImpl.wait(enqueueCount, lk);
+            }
+        };
+        //#############################################################################
+        //! The non-blocking queue implementation event wait trait specialization for EventGenericThreads.
+        template<typename TDev>
+        struct WaiterWaitFor<
+            alpaka::generic::detail::QueueGenericThreadsNonBlockingImpl<TDev>,
+            EventGenericThreads<TDev>>
+        {
+            //! Enqueues a task that waits for \p event, unless the event is already ready.
+            ALPAKA_FN_HOST static auto waiterWaitFor(
+#if !(BOOST_COMP_CLANG_CUDA && BOOST_ARCH_PTX)
+                alpaka::generic::detail::QueueGenericThreadsNonBlockingImpl<TDev>& queueImpl,
+#else
+                alpaka::generic::detail::QueueGenericThreadsNonBlockingImpl<TDev>&,
+#endif
+                EventGenericThreads<TDev> const& event) -> void
+            {
+                // Copy the shared pointer of the event implementation.
+                // This is forwarded to the lambda that is enqueued into the queue to ensure that the event
+                // implementation is alive as long as it is enqueued.
+                auto spEventImpl(event.m_spEventImpl);
+
+                std::lock_guard<std::mutex> lk(spEventImpl->m_mutex);
+
+                if(!spEventImpl->isReady())
+                {
+// Workaround: Clang can not support this when natively compiling device code. See ConcurrentExecPool.hpp.
+#if !(BOOST_COMP_CLANG_CUDA && BOOST_ARCH_PTX)
+                    auto const enqueueCount = spEventImpl->m_enqueueCount;
+
+                    // Enqueue a task that waits for the enqueue of the event that is pending right now.
+                    queueImpl.m_workerThread.enqueueTask([spEventImpl, enqueueCount]() {
+                        std::unique_lock<std::mutex> lk2(spEventImpl->m_mutex);
+                        spEventImpl->wait(enqueueCount, lk2);
+                    });
+#endif
+                }
+            }
+        };
+        //#############################################################################
+        //! The non-blocking queue event wait trait specialization for EventGenericThreads.
+        template<typename TDev>
+        struct WaiterWaitFor<QueueGenericThreadsNonBlocking<TDev>, EventGenericThreads<TDev>>
+        {
+            //! Forwards to the wait on the queue implementation.
+            ALPAKA_FN_HOST static auto waiterWaitFor(
+                QueueGenericThreadsNonBlocking<TDev>& queue,
+                EventGenericThreads<TDev> const& event) -> void
+            {
+                wait(*queue.m_spQueueImpl, event);
+            }
+        };
+        //#############################################################################
+        //! The blocking queue implementation event wait trait specialization for EventGenericThreads.
+        template<typename TDev>
+        struct WaiterWaitFor<alpaka::generic::detail::QueueGenericThreadsBlockingImpl<TDev>, EventGenericThreads<TDev>>
+        {
+            //! Waits on the event implementation directly instead of enqueuing a task; \p queueImpl is not used.
+            ALPAKA_FN_HOST static auto waiterWaitFor(
+                alpaka::generic::detail::QueueGenericThreadsBlockingImpl<TDev>& queueImpl,
+                EventGenericThreads<TDev> const& event) -> void
+            {
+                alpaka::ignore_unused(queueImpl);
+
+                // NOTE: Difference to non-blocking version: directly wait for event.
+                wait(*event.m_spEventImpl);
+            }
+        };
+        //#############################################################################
+        //! The blocking queue event wait trait specialization for EventGenericThreads.
+        template<typename TDev>
+        struct WaiterWaitFor<QueueGenericThreadsBlocking<TDev>, EventGenericThreads<TDev>>
+        {
+            //! Forwards to the wait on the queue implementation.
+            ALPAKA_FN_HOST static auto waiterWaitFor(
+                QueueGenericThreadsBlocking<TDev>& queue,
+                EventGenericThreads<TDev> const& event) -> void
+            {
+                wait(*queue.m_spQueueImpl, event);
+            }
+        };
+        //#############################################################################
+        //! The device event wait trait specialization for EventGenericThreads.
+        //!
+        //! Any future work submitted in any queue of this device will wait for event to complete before beginning
+        //! execution.
+        template<typename TDev>
+        struct WaiterWaitFor<TDev, EventGenericThreads<TDev>>
+        {
+            //! Calls wait(event) on every queue that \p dev returns from getAllQueues().
+            ALPAKA_FN_HOST static auto waiterWaitFor(TDev& dev, EventGenericThreads<TDev> const& event) -> void
+            {
+                // Get all the queues on the device at the time of invocation.
+                // All queues added afterwards are ignored.
+                auto vspQueues(dev.getAllQueues());
+
+                // Let all the queues wait for this event.
+                // Furthermore there should not even be a chance to enqueue something between getting the queues and
+                // adding our wait events!
+                for(auto&& spQueue : vspQueues)
+                {
+                    spQueue->wait(event);
+                }
+            }
+        };
+
+        //#############################################################################
+        //! The non-blocking queue thread wait trait specialization.
+        //!
+        //! Blocks execution of the calling thread until the queue has finished processing all previously requested
+        //! tasks (kernels, data copies, ...)
+        template<typename TDev>
+        struct CurrentThreadWaitFor<QueueGenericThreadsNonBlocking<TDev>>
+        {
+            //! Enqueues a temporary event and waits for it. enqueue() needs a mutable queue, hence the const_cast.
+            ALPAKA_FN_HOST static auto currentThreadWaitFor(QueueGenericThreadsNonBlocking<TDev> const& queue) -> void
+            {
+                EventGenericThreads<TDev> event(getDev(queue));
+                alpaka::enqueue(const_cast<QueueGenericThreadsNonBlocking<TDev>&>(queue), event);
+                wait(event);
+            }
+        };
+    } // namespace traits
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/include/alpaka/event/EventHipRt.hpp b/thirdParty/cupla/alpaka/include/alpaka/event/EventHipRt.hpp
deleted file mode 100644
index 295e780c16..0000000000
--- a/thirdParty/cupla/alpaka/include/alpaka/event/EventHipRt.hpp
+++ /dev/null
@@ -1,352 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz, Matthias Werner
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_HIP
-    #error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
-#endif
-
-#include <alpaka/dev/DevHipRt.hpp>
-#include <alpaka/dev/Traits.hpp>
-#include <alpaka/event/Traits.hpp>
-#include <alpaka/wait/Traits.hpp>
-
-#include <alpaka/queue/QueueHipRtNonBlocking.hpp>
-#include <alpaka/queue/QueueHipRtBlocking.hpp>
-#include <alpaka/core/Hip.hpp>
-
-#include <stdexcept>
-#include <memory>
-#include <functional>
-
-namespace alpaka
-{
-    namespace event
-    {
-        namespace hip
-        {
-            namespace detail
-            {
-                //#############################################################################
-                //! The HIP RT device event implementation.
-                class EventHipImpl final
-                {
-                public:
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST EventHipImpl(
-                        dev::DevHipRt const & dev,
-                        bool bBusyWait) :
-                            m_dev(dev),
-                            m_HipEvent()
-                    {
-                        ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                        // Set the current device.
-                        ALPAKA_HIP_RT_CHECK(
-                            hipSetDevice(
-                                m_dev.m_iDevice));
-                        // Create the event on the current device with the specified flags. Valid flags include:
-                        // - hipEventDefault: Default event creation flag.
-                        // - hipEventBlockingSync : Specifies that event should use blocking synchronization.
-                        //   A host thread that uses hipEventSynchronize() to wait on an event created with this flag will block until the event actually completes.
-                        // - hipEventDisableTiming : Specifies that the created event does not need to record timing data.
-                        //   Events created with this flag specified and the hipEventBlockingSync flag not specified will provide the best performance when used with hipQueueWaitEvent() and hipEventQuery().
-                        ALPAKA_HIP_RT_CHECK(
-                            hipEventCreateWithFlags(
-                                &m_HipEvent,
-                                (bBusyWait ? hipEventDefault : hipEventBlockingSync) | hipEventDisableTiming));
-                    }
-                    //-----------------------------------------------------------------------------
-                    EventHipImpl(EventHipImpl const &) = delete;
-                    //-----------------------------------------------------------------------------
-                    EventHipImpl(EventHipImpl &&) = default;
-                    //-----------------------------------------------------------------------------
-                    auto operator=(EventHipImpl const &) -> EventHipImpl & = delete;
-                    //-----------------------------------------------------------------------------
-                    auto operator=(EventHipImpl &&) -> EventHipImpl & = delete;
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST ~EventHipImpl()
-                    {
-                        ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                        // Set the current device. \TODO: Is setting the current device before hipEventDestroy required?
-                        ALPAKA_HIP_RT_CHECK(hipSetDevice(
-                            m_dev.m_iDevice));
-                        // In case event has been recorded but has not yet been completed when hipEventDestroy() is called, the function will return immediately
-                        // and the resources associated with event will be released automatically once the device has completed event.
-                        // -> No need to synchronize here.
-                        ALPAKA_HIP_RT_CHECK(hipEventDestroy(
-                            m_HipEvent));
-                    }
-
-                public:
-                    dev::DevHipRt const m_dev;   //!< The device this event is bound to.
-                    hipEvent_t m_HipEvent;
-                };
-            }
-        }
-
-        //#############################################################################
-        //! The HIP RT device event.
-        class EventHipRt final : public concepts::Implements<wait::ConceptCurrentThreadWaitFor, EventHipRt>
-        {
-        public:
-            //-----------------------------------------------------------------------------
-            //! Constructor.
-            ALPAKA_FN_HOST EventHipRt(
-                dev::DevHipRt const & dev,
-                bool bBusyWait = true) :
-                    m_spEventImpl(std::make_shared<hip::detail::EventHipImpl>(dev, bBusyWait))
-            {
-                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-            }
-            //-----------------------------------------------------------------------------
-            //! Copy constructor.
-            EventHipRt(EventHipRt const &) = default;
-            //-----------------------------------------------------------------------------
-            //! Move constructor.
-            EventHipRt(EventHipRt &&) = default;
-            //-----------------------------------------------------------------------------
-            //! Copy assignment operator.
-            auto operator=(EventHipRt const &) -> EventHipRt & = default;
-            //-----------------------------------------------------------------------------
-            //! Move assignment operator.
-            auto operator=(EventHipRt &&) -> EventHipRt & = default;
-            //-----------------------------------------------------------------------------
-            //! Equality comparison operator.
-            auto operator==(EventHipRt const & rhs) const
-            -> bool
-            {
-                return (m_spEventImpl->m_HipEvent == rhs.m_spEventImpl->m_HipEvent);
-            }
-            //-----------------------------------------------------------------------------
-            //! Equality comparison operator.
-            auto operator!=(EventHipRt const & rhs) const
-            -> bool
-            {
-                return !((*this) == rhs);
-            }
-            //-----------------------------------------------------------------------------
-            //! Destructor.
-            ALPAKA_FN_HOST_ACC ~EventHipRt() = default;
-
-        public:
-            std::shared_ptr<hip::detail::EventHipImpl> m_spEventImpl;
-        };
-    }
-
-    namespace dev
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The HIP RT device event device get trait specialization.
-            template<>
-            struct GetDev<
-                event::EventHipRt>
-            {
-                //-----------------------------------------------------------------------------
-
-                ALPAKA_FN_HOST static auto getDev(
-                    event::EventHipRt const & event)
-                -> dev::DevHipRt
-                {
-                    return event.m_spEventImpl->m_dev;
-                }
-            };
-        }
-    }
-    namespace event
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The HIP RT device event test trait specialization.
-            template<>
-            struct Test<
-                event::EventHipRt>
-            {
-                //-----------------------------------------------------------------------------
-
-                ALPAKA_FN_HOST static auto test(
-                    event::EventHipRt const & event)
-                -> bool
-                {
-                    ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                    // Query is allowed even for events on non current device.
-                    hipError_t ret = hipSuccess;
-                    ALPAKA_HIP_RT_CHECK_IGNORE(
-                        ret = hipEventQuery(
-                            event.m_spEventImpl->m_HipEvent),
-                        hipErrorNotReady);
-                    return (ret == hipSuccess);
-                }
-            };
-        }
-    }
-    namespace queue
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The HIP RT queue enqueue trait specialization.
-            template<>
-            struct Enqueue<
-                queue::QueueHipRtNonBlocking,
-                event::EventHipRt>
-            {
-                //-----------------------------------------------------------------------------
-
-                ALPAKA_FN_HOST static auto enqueue(
-                    queue::QueueHipRtNonBlocking & queue,
-                    event::EventHipRt & event)
-                -> void
-                {
-                    ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                    ALPAKA_HIP_RT_CHECK(hipEventRecord(
-                        event.m_spEventImpl->m_HipEvent,
-                        queue.m_spQueueImpl->m_HipQueue));
-                }
-            };
-            //#############################################################################
-            //! The HIP RT queue enqueue trait specialization.
-            template<>
-            struct Enqueue<
-                queue::QueueHipRtBlocking,
-                event::EventHipRt>
-            {
-                //-----------------------------------------------------------------------------
-
-                ALPAKA_FN_HOST static auto enqueue(
-                    queue::QueueHipRtBlocking & queue,
-                    event::EventHipRt & event)
-                -> void
-                {
-                    ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                    ALPAKA_HIP_RT_CHECK(hipEventRecord(
-                        event.m_spEventImpl->m_HipEvent,
-                        queue.m_spQueueImpl->m_HipQueue));
-                }
-            };
-        }
-    }
-    namespace wait
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The HIP RT device event thread wait trait specialization.
-            //!
-            //! Waits until the event itself and therefore all tasks preceding it in the queue it is enqueued to have been completed.
-            //! If the event is not enqueued to a queue the method returns immediately.
-            //#############################################################################
-            template<>
-            struct CurrentThreadWaitFor<
-                event::EventHipRt>
-            {
-                //-----------------------------------------------------------------------------
-
-                ALPAKA_FN_HOST static auto currentThreadWaitFor(
-                    event::EventHipRt const & event)
-                -> void
-                {
-                    ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                    // Sync is allowed even for events on non current device.
-                    ALPAKA_HIP_RT_CHECK(hipEventSynchronize(
-                        event.m_spEventImpl->m_HipEvent));
-                }
-            };
-            //#############################################################################
-            //! The HIP RT queue event wait trait specialization.
-            template<>
-            struct WaiterWaitFor<
-                queue::QueueHipRtNonBlocking,
-                event::EventHipRt>
-            {
-                //-----------------------------------------------------------------------------
-
-                ALPAKA_FN_HOST static auto waiterWaitFor(
-                    queue::QueueHipRtNonBlocking & queue,
-                    event::EventHipRt const & event)
-                -> void
-                {
-                    ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                    ALPAKA_HIP_RT_CHECK(hipStreamWaitEvent(
-                        queue.m_spQueueImpl->m_HipQueue,
-                        event.m_spEventImpl->m_HipEvent,
-                        0));
-                }
-            };
-            //#############################################################################
-            //! The HIP RT queue event wait trait specialization.
-            template<>
-            struct WaiterWaitFor<
-                queue::QueueHipRtBlocking,
-                event::EventHipRt>
-            {
-                //-----------------------------------------------------------------------------
-
-                ALPAKA_FN_HOST static auto waiterWaitFor(
-                    queue::QueueHipRtBlocking & queue,
-                    event::EventHipRt const & event)
-                -> void
-                {
-                    ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                    ALPAKA_HIP_RT_CHECK(hipStreamWaitEvent(
-                        queue.m_spQueueImpl->m_HipQueue,
-                        event.m_spEventImpl->m_HipEvent,
-                        0));
-                }
-            };
-            //#############################################################################
-            //! The HIP RT device event wait trait specialization.
-            //!
-            //! Any future work submitted in any queue of this device will wait for event to complete before beginning execution.
-            //#############################################################################
-            template<>
-            struct WaiterWaitFor<
-                dev::DevHipRt,
-                event::EventHipRt>
-            {
-                //-----------------------------------------------------------------------------
-
-                ALPAKA_FN_HOST static auto waiterWaitFor(
-                    dev::DevHipRt & dev,
-                    event::EventHipRt const & event)
-                -> void
-                {
-                    ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                    // Set the current device.
-                    ALPAKA_HIP_RT_CHECK(
-                        hipSetDevice(
-                            dev.m_iDevice));
-
-                    ALPAKA_HIP_RT_CHECK(hipStreamWaitEvent(
-                        nullptr,
-                        event.m_spEventImpl->m_HipEvent,
-                        0));
-                }
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/event/EventOacc.hpp b/thirdParty/cupla/alpaka/include/alpaka/event/EventOacc.hpp
new file mode 100644
index 0000000000..e96b3e464c
--- /dev/null
+++ b/thirdParty/cupla/alpaka/include/alpaka/event/EventOacc.hpp
@@ -0,0 +1,26 @@
+/* Copyright 2019 Jeffrey Kelling
+ *
+ * This file is part of alpaka.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#pragma once
+
+#ifdef ALPAKA_ACC_ANY_BT_OACC_ENABLED
+
+#    if _OPENACC < 201306
+#        error If ALPAKA_ACC_ANY_BT_OACC_ENABLED is set, the compiler has to support OpenACC 2.0 or higher!
+#    endif
+
+#    include <alpaka/dev/DevOacc.hpp>
+#    include <alpaka/event/EventGenericThreads.hpp>
+
+namespace alpaka
+{
+    using EventOacc = EventGenericThreads<DevOacc>; //!< Event type for DevOacc: the generic threads event.
+} // namespace alpaka
+
+#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/event/EventOmp5.hpp b/thirdParty/cupla/alpaka/include/alpaka/event/EventOmp5.hpp
new file mode 100644
index 0000000000..c1a006885b
--- /dev/null
+++ b/thirdParty/cupla/alpaka/include/alpaka/event/EventOmp5.hpp
@@ -0,0 +1,26 @@
+/* Copyright 2019 Jeffrey Kelling
+ *
+ * This file is part of alpaka.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#pragma once
+
+#ifdef ALPAKA_ACC_ANY_BT_OMP5_ENABLED
+
+#    if _OPENMP < 201307
+#        error If ALPAKA_ACC_ANY_BT_OMP5_ENABLED is set, the compiler has to support OpenMP 4.0 or higher!
+#    endif
+
+#    include <alpaka/dev/DevOmp5.hpp>
+#    include <alpaka/event/EventGenericThreads.hpp>
+
+namespace alpaka
+{
+    using EventOmp5 = EventGenericThreads<DevOmp5>; //!< Event type for DevOmp5: the generic threads event.
+} // namespace alpaka
+
+#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/event/EventUniformCudaHipRt.hpp b/thirdParty/cupla/alpaka/include/alpaka/event/EventUniformCudaHipRt.hpp
new file mode 100644
index 0000000000..57d6e79583
--- /dev/null
+++ b/thirdParty/cupla/alpaka/include/alpaka/event/EventUniformCudaHipRt.hpp
@@ -0,0 +1,291 @@
+/* Copyright 2019 Benjamin Worpitz
+ *
+ * This file is part of alpaka.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#pragma once
+
+#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
+
+#    include <alpaka/core/BoostPredef.hpp>
+
+#    if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && !BOOST_LANG_CUDA
+#        error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
+#    endif
+
+#    if defined(ALPAKA_ACC_GPU_HIP_ENABLED) && !BOOST_LANG_HIP
+#        error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
+#    endif
+
+#    include <alpaka/dev/DevUniformCudaHipRt.hpp>
+#    include <alpaka/dev/Traits.hpp>
+#    include <alpaka/event/Traits.hpp>
+#    include <alpaka/wait/Traits.hpp>
+
+// Backend specific includes.
+#    if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
+#        include <alpaka/core/Cuda.hpp>
+#    else
+#        include <alpaka/core/Hip.hpp>
+#    endif
+
+#    include <alpaka/queue/QueueUniformCudaHipRtBlocking.hpp>
+#    include <alpaka/queue/QueueUniformCudaHipRtNonBlocking.hpp>
+
+#    include <functional>
+#    include <memory>
+#    include <stdexcept>
+
+namespace alpaka
+{
+    namespace uniform_cuda_hip
+    {
+        namespace detail
+        {
+            //#############################################################################
+            //! The CUDA/HIP RT device event implementation; sole owner of the native event handle.
+            class EventUniformCudaHipImpl final
+            {
+            public:
+                //! Creates the native event on \p dev; \p bBusyWait selects EventDefault over EventBlockingSync.
+                ALPAKA_FN_HOST EventUniformCudaHipImpl(DevUniformCudaHipRt const& dev, bool bBusyWait)
+                    : m_dev(dev)
+                    , m_UniformCudaHipEvent()
+                {
+                    ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+                    // Set the current device.
+                    ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(ALPAKA_API_PREFIX(SetDevice)(m_dev.m_iDevice));
+
+                    // Create the event on the current device with the specified flags. Valid flags include:
+                    // - cuda/hip-EventDefault: Default event creation flag.
+                    // - cuda/hip-EventBlockingSync : Specifies that event should use blocking synchronization.
+                    //   A host thread that uses cuda/hip-EventSynchronize() to wait on an event created with this flag
+                    //   will block until the event actually completes.
+                    // - cuda/hip-EventDisableTiming : Specifies that the created event does not need to record timing
+                    //   data.
+                    //   Events created with this flag specified and the cuda/hip-EventBlockingSync flag not specified
+                    //   will provide the best performance when used with cudaStreamWaitEvent() and cudaEventQuery().
+                    ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(ALPAKA_API_PREFIX(EventCreateWithFlags)(
+                        &m_UniformCudaHipEvent,
+                        (bBusyWait ? ALPAKA_API_PREFIX(EventDefault) : ALPAKA_API_PREFIX(EventBlockingSync))
+                            | ALPAKA_API_PREFIX(EventDisableTiming)));
+                }
+                //-----------------------------------------------------------------------------
+                EventUniformCudaHipImpl(EventUniformCudaHipImpl const&) = delete;
+                // Not movable: a defaulted move would copy the raw handle and both objects would destroy it.
+                EventUniformCudaHipImpl(EventUniformCudaHipImpl&&) = delete;
+                //-----------------------------------------------------------------------------
+                auto operator=(EventUniformCudaHipImpl const&) -> EventUniformCudaHipImpl& = delete;
+                //-----------------------------------------------------------------------------
+                auto operator=(EventUniformCudaHipImpl&&) -> EventUniformCudaHipImpl& = delete;
+                // NOTE(review): if the check macro throws, a failed call in this destructor terminates - confirm.
+                ALPAKA_FN_HOST ~EventUniformCudaHipImpl()
+                {
+                    ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+                    // Set the current device. \TODO: Is setting the current device before cuda/hip-EventDestroy
+                    // required?
+
+                    ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(ALPAKA_API_PREFIX(SetDevice)(m_dev.m_iDevice));
+                    // In case event has been recorded but has not yet been completed when cuda/hip-EventDestroy() is
+                    // called, the function will return immediately and the resources associated with event will be
+                    // released automatically once the device has completed event.
+                    // -> No need to synchronize here.
+                    ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(ALPAKA_API_PREFIX(EventDestroy)(m_UniformCudaHipEvent));
+                }
+
+            public:
+                DevUniformCudaHipRt const m_dev; //!< The device this event is bound to.
+
+                ALPAKA_API_PREFIX(Event_t) m_UniformCudaHipEvent; //!< The native event handle owned by this object.
+            };
+        } // namespace detail
+    } // namespace uniform_cuda_hip
+
+    //#############################################################################
+    //! The CUDA/HIP RT device event. Copies share one implementation object through a std::shared_ptr.
+    class EventUniformCudaHipRt final
+        : public concepts::Implements<ConceptCurrentThreadWaitFor, EventUniformCudaHipRt>
+        , public concepts::Implements<ConceptGetDev, EventUniformCudaHipRt>
+    {
+    public:
+        //! Creates a new implementation object for \p dev; \p bBusyWait is forwarded to it unchanged.
+        ALPAKA_FN_HOST EventUniformCudaHipRt(DevUniformCudaHipRt const& dev, bool bBusyWait = true)
+            : m_spEventImpl(std::make_shared<uniform_cuda_hip::detail::EventUniformCudaHipImpl>(dev, bBusyWait))
+        {
+            ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+        }
+        //-----------------------------------------------------------------------------
+        EventUniformCudaHipRt(EventUniformCudaHipRt const&) = default;
+        //-----------------------------------------------------------------------------
+        EventUniformCudaHipRt(EventUniformCudaHipRt&&) = default;
+        //-----------------------------------------------------------------------------
+        auto operator=(EventUniformCudaHipRt const&) -> EventUniformCudaHipRt& = default;
+        //-----------------------------------------------------------------------------
+        auto operator=(EventUniformCudaHipRt&&) -> EventUniformCudaHipRt& = default;
+        //! \return true if both events refer to the same implementation object (pointer comparison).
+        ALPAKA_FN_HOST auto operator==(EventUniformCudaHipRt const& rhs) const -> bool
+        {
+            return (m_spEventImpl == rhs.m_spEventImpl);
+        }
+        //! \return The negation of operator==.
+        ALPAKA_FN_HOST auto operator!=(EventUniformCudaHipRt const& rhs) const -> bool
+        {
+            return !((*this) == rhs);
+        }
+        //-----------------------------------------------------------------------------
+        ~EventUniformCudaHipRt() = default;
+
+    public:
+        std::shared_ptr<uniform_cuda_hip::detail::EventUniformCudaHipImpl> m_spEventImpl;
+    };
+    namespace traits
+    {
+        //#############################################################################
+        //! The CUDA/HIP RT device event device get trait specialization.
+        template<>
+        struct GetDev<EventUniformCudaHipRt>
+        {
+            //! \return A copy of the device stored in the event's implementation object.
+            ALPAKA_FN_HOST static auto getDev(EventUniformCudaHipRt const& event) -> DevUniformCudaHipRt
+            {
+                return event.m_spEventImpl->m_dev;
+            }
+        };
+
+        //#############################################################################
+        //! The CUDA/HIP RT device event completion test trait specialization.
+        template<>
+        struct IsComplete<EventUniformCudaHipRt>
+        {
+            //! \return true only if the native event query returned the Success code.
+            ALPAKA_FN_HOST static auto isComplete(EventUniformCudaHipRt const& event) -> bool
+            {
+                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+                // Query is allowed even for events on non current device.
+                ALPAKA_API_PREFIX(Error_t) ret = ALPAKA_API_PREFIX(Success);
+                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK_IGNORE(
+                    ret = ALPAKA_API_PREFIX(EventQuery)(event.m_spEventImpl->m_UniformCudaHipEvent),
+                    ALPAKA_API_PREFIX(ErrorNotReady));
+                return (ret == ALPAKA_API_PREFIX(Success));
+            }
+        };
+
+        //#############################################################################
+        //! The CUDA/HIP RT non-blocking queue event enqueue trait specialization.
+        template<>
+        struct Enqueue<QueueUniformCudaHipRtNonBlocking, EventUniformCudaHipRt>
+        {
+            //! Records the native event of \p event into the native queue held by \p queue.
+            ALPAKA_FN_HOST static auto enqueue(QueueUniformCudaHipRtNonBlocking& queue, EventUniformCudaHipRt& event)
+                -> void
+            {
+                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(ALPAKA_API_PREFIX(EventRecord)(
+                    event.m_spEventImpl->m_UniformCudaHipEvent,
+                    queue.m_spQueueImpl->m_UniformCudaHipQueue));
+            }
+        };
+        //#############################################################################
+        //! The CUDA/HIP RT blocking queue event enqueue trait specialization.
+        template<>
+        struct Enqueue<QueueUniformCudaHipRtBlocking, EventUniformCudaHipRt>
+        {
+            //! Records the native event of \p event into the native queue held by \p queue.
+            ALPAKA_FN_HOST static auto enqueue(QueueUniformCudaHipRtBlocking& queue, EventUniformCudaHipRt& event)
+                -> void
+            {
+                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(ALPAKA_API_PREFIX(EventRecord)(
+                    event.m_spEventImpl->m_UniformCudaHipEvent,
+                    queue.m_spQueueImpl->m_UniformCudaHipQueue));
+            }
+        };
+
+        //#############################################################################
+        //! The CUDA/HIP RT device event thread wait trait specialization.
+        //!
+        //! Waits until the event itself and therefore all tasks preceding it in the queue it is enqueued to have been
+        //! completed. If the event is not enqueued to a queue the method returns immediately.
+        template<>
+        struct CurrentThreadWaitFor<EventUniformCudaHipRt>
+        {
+            //! Waits in the calling thread by invoking the native EventSynchronize call on \p event.
+            ALPAKA_FN_HOST static auto currentThreadWaitFor(EventUniformCudaHipRt const& event) -> void
+            {
+                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+                // Sync is allowed even for events on non current device.
+                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
+                    ALPAKA_API_PREFIX(EventSynchronize)(event.m_spEventImpl->m_UniformCudaHipEvent));
+            }
+        };
+        //#############################################################################
+        //! The CUDA/HIP RT non-blocking queue event wait trait specialization.
+        template<>
+        struct WaiterWaitFor<QueueUniformCudaHipRtNonBlocking, EventUniformCudaHipRt>
+        {
+            //! Issues a native StreamWaitEvent on the queue's native queue for \p event (flags argument 0).
+            ALPAKA_FN_HOST static auto waiterWaitFor(
+                QueueUniformCudaHipRtNonBlocking& queue,
+                EventUniformCudaHipRt const& event) -> void
+            {
+                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(ALPAKA_API_PREFIX(StreamWaitEvent)(
+                    queue.m_spQueueImpl->m_UniformCudaHipQueue,
+                    event.m_spEventImpl->m_UniformCudaHipEvent,
+                    0));
+            }
+        };
+        //#############################################################################
+        //! The CUDA/HIP RT blocking queue event wait trait specialization.
+        template<>
+        struct WaiterWaitFor<QueueUniformCudaHipRtBlocking, EventUniformCudaHipRt>
+        {
+            //! Issues a native StreamWaitEvent on the queue's native queue for \p event (flags argument 0).
+            ALPAKA_FN_HOST static auto waiterWaitFor(
+                QueueUniformCudaHipRtBlocking& queue,
+                EventUniformCudaHipRt const& event) -> void
+            {
+                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(ALPAKA_API_PREFIX(StreamWaitEvent)(
+                    queue.m_spQueueImpl->m_UniformCudaHipQueue,
+                    event.m_spEventImpl->m_UniformCudaHipEvent,
+                    0));
+            }
+        };
+        //#############################################################################
+        //! The CUDA/HIP RT device event wait trait specialization.
+        //!
+        //! Any future work submitted in any queue of this device will wait for event to complete before beginning
+        //! execution.
+        template<>
+        struct WaiterWaitFor<DevUniformCudaHipRt, EventUniformCudaHipRt>
+        {
+            //! Makes \p dev the current device, then issues StreamWaitEvent with a nullptr stream for \p event.
+            ALPAKA_FN_HOST static auto waiterWaitFor(DevUniformCudaHipRt& dev, EventUniformCudaHipRt const& event)
+                -> void
+            {
+                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+                // Set the current device.
+                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(ALPAKA_API_PREFIX(SetDevice)(dev.m_iDevice));
+
+                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
+                    ALPAKA_API_PREFIX(StreamWaitEvent)(nullptr, event.m_spEventImpl->m_UniformCudaHipEvent, 0));
+            }
+        };
+    } // namespace traits
+} // namespace alpaka
+
+#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/event/Traits.hpp b/thirdParty/cupla/alpaka/include/alpaka/event/Traits.hpp
index a828b4094c..3adb06c9bf 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/event/Traits.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/event/Traits.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Benjamin Worpitz
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -10,53 +10,35 @@
 #pragma once
 
 #include <alpaka/core/Common.hpp>
-
 #include <alpaka/dev/Traits.hpp>
 
 namespace alpaka
 {
     //-----------------------------------------------------------------------------
-    //! The event management specifics.
-    namespace event
+    //! The event management traits.
+    namespace traits
     {
-        //-----------------------------------------------------------------------------
-        //! The event management traits.
-        namespace traits
-        {
-            //#############################################################################
-            //! The event type trait.
-            template<
-                typename T,
-                typename TSfinae = void>
-            struct EventType;
-
-            //#############################################################################
-            //! The event tester trait.
-            template<
-                typename TEvent,
-                typename TSfinae = void>
-            struct Test;
-        }
+        //#############################################################################
+        //! The event type trait.
+        template<typename T, typename TSfinae = void>
+        struct EventType;
 
         //#############################################################################
-        //! The event type trait alias template to remove the ::type.
-        template<
-            typename T>
-        using Event = typename traits::EventType<T>::type;
+        //! The event tester trait.
+        template<typename TEvent, typename TSfinae = void>
+        struct IsComplete;
+    } // namespace traits
 
-        //-----------------------------------------------------------------------------
-        //! Tests if the given event has already been completed.
-        template<
-            typename TEvent>
-        ALPAKA_FN_HOST auto test(
-            TEvent const & event)
-        -> bool
-        {
-            return
-                traits::Test<
-                    TEvent>
-                ::test(
-                    event);
-        }
+    //#############################################################################
+    //! The event type trait alias template to remove the ::type.
+    template<typename T>
+    using Event = typename traits::EventType<T>::type;
+
+    //-----------------------------------------------------------------------------
+    //! Tests if the given event has already been completed.
+    template<typename TEvent>
+    ALPAKA_FN_HOST auto isComplete(TEvent const& event) -> bool
+    {
+        return traits::IsComplete<TEvent>::isComplete(event);
     }
-}
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/include/alpaka/example/ExampleDefaultAcc.hpp b/thirdParty/cupla/alpaka/include/alpaka/example/ExampleDefaultAcc.hpp
new file mode 100644
index 0000000000..1021d3fb4d
--- /dev/null
+++ b/thirdParty/cupla/alpaka/include/alpaka/example/ExampleDefaultAcc.hpp
@@ -0,0 +1,52 @@
+/* Copyright 2020 Jeffrey Kelling
+ *
+ * This file exemplifies usage of alpaka.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED “AS IS” AND ISC DISCLAIMS ALL WARRANTIES WITH
+ * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL ISC BE LIABLE FOR ANY
+ * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR
+ * IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#pragma once
+
+#include <alpaka/alpaka.hpp>
+
+namespace alpaka
+{
+    //! Alias for the default accelerator used by examples. From a list of
+    //! all accelerators the first one which is enabled is chosen.
+    //! AccCpuSerial is selected last.
+    template<class TDim, class TIdx>
+#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
+    using ExampleDefaultAcc = alpaka::AccGpuCudaRt<TDim, TIdx>;
+#elif defined(ALPAKA_ACC_GPU_HIP_ENABLED)
+    using ExampleDefaultAcc = alpaka::AccGpuHipRt<TDim, TIdx>;
+#elif defined(ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLED)
+    using ExampleDefaultAcc = alpaka::AccCpuOmp2Blocks<TDim, TIdx>;
+#elif defined(ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED)
+    using ExampleDefaultAcc = alpaka::AccCpuTbbBlocks<TDim, TIdx>;
+#elif defined(ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLED)
+    using ExampleDefaultAcc = alpaka::AccCpuFibers<TDim, TIdx>;
+#elif defined(ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLED)
+    using ExampleDefaultAcc = alpaka::AccCpuOmp2Threads<TDim, TIdx>;
+#elif defined(ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED)
+    using ExampleDefaultAcc = alpaka::AccCpuThreads<TDim, TIdx>;
+#elif defined(ALPAKA_ACC_ANY_BT_OMP5_ENABLED)
+    using ExampleDefaultAcc = alpaka::AccOmp5<TDim, TIdx>;
+#elif defined(ALPAKA_ACC_ANY_BT_OACC_ENABLED)
+    using ExampleDefaultAcc = alpaka::AccOacc<TDim, TIdx>;
+#elif defined(ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED)
+    using ExampleDefaultAcc = alpaka::AccCpuSerial<TDim, TIdx>;
+#else // No back-end macro is defined: only declare the template (no definition) and emit a warning.
+    class ExampleDefaultAcc;
+#    warning "No supported backend selected."
+#endif
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/include/alpaka/extent/Traits.hpp b/thirdParty/cupla/alpaka/include/alpaka/extent/Traits.hpp
index 48b05ff3ed..1954c0a803 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/extent/Traits.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/extent/Traits.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Axel Huebl, Benjamin Worpitz
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -12,12 +12,12 @@
 #include <alpaka/core/Common.hpp>
 #include <alpaka/core/Unused.hpp>
 #include <alpaka/dim/DimIntegralConst.hpp>
-#include <alpaka/meta/Fold.hpp>
 #include <alpaka/idx/Traits.hpp>
-#include <alpaka/meta/IntegerSequence.hpp>
+#include <alpaka/meta/Fold.hpp>
 
-#include <type_traits>
 #include <functional>
+#include <type_traits>
+#include <utility>
 
 namespace alpaka
 {
@@ -33,177 +33,111 @@ namespace alpaka
             //! The extent get trait.
             //!
             //! If not specialized explicitly it returns 1.
-            template<
-                typename TIdxIntegralConst,
-                typename TExtent,
-                typename TSfinae = void>
+            template<typename TIdxIntegralConst, typename TExtent, typename TSfinae = void>
             struct GetExtent
             {
                 ALPAKA_NO_HOST_ACC_WARNING
-                ALPAKA_FN_HOST_ACC static auto getExtent(
-                    TExtent const &)
-                -> idx::Idx<TExtent>
+                ALPAKA_FN_HOST_ACC static auto getExtent(TExtent const&) -> Idx<TExtent>
                 {
-                    return static_cast<idx::Idx<TExtent>>(1);
+                    return static_cast<Idx<TExtent>>(1);
                 }
             };
 
             //#############################################################################
             //! The extent set trait.
-            template<
-                typename TIdxIntegralConst,
-                typename TExtent,
-                typename TExtentVal,
-                typename TSfinae = void>
+            template<typename TIdxIntegralConst, typename TExtent, typename TExtentVal, typename TSfinae = void>
             struct SetExtent;
-        }
+        } // namespace traits
 
         //-----------------------------------------------------------------------------
         //! \return The extent in the given dimension.
         ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            std::size_t Tidx,
-            typename TExtent>
-        ALPAKA_FN_HOST_ACC auto getExtent(
-            TExtent const & extent = TExtent())
-        -> idx::Idx<TExtent>
+        template<std::size_t Tidx, typename TExtent>
+        ALPAKA_FN_HOST_ACC auto getExtent(TExtent const& extent = TExtent()) -> Idx<TExtent>
         {
-            return
-                traits::GetExtent<
-                    dim::DimInt<Tidx>,
-                    TExtent>
-                ::getExtent(
-                    extent);
+            return traits::GetExtent<DimInt<Tidx>, TExtent>::getExtent(extent);
         }
         //-----------------------------------------------------------------------------
         //! \return The width.
         ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            typename TExtent>
-        ALPAKA_FN_HOST_ACC auto getWidth(
-            TExtent const & extent = TExtent())
-        -> idx::Idx<TExtent>
+        template<typename TExtent>
+        ALPAKA_FN_HOST_ACC auto getWidth(TExtent const& extent = TExtent()) -> Idx<TExtent>
         {
-            return getExtent<dim::Dim<TExtent>::value - 1u>(extent);
+            return getExtent<Dim<TExtent>::value - 1u>(extent);
         }
         //-----------------------------------------------------------------------------
         //! \return The height.
         ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            typename TExtent>
-        ALPAKA_FN_HOST_ACC auto getHeight(
-            TExtent const & extent = TExtent())
-        -> idx::Idx<TExtent>
+        template<typename TExtent>
+        ALPAKA_FN_HOST_ACC auto getHeight(TExtent const& extent = TExtent()) -> Idx<TExtent>
         {
-            return getExtent<dim::Dim<TExtent>::value - 2u>(extent);
+            return getExtent<Dim<TExtent>::value - 2u>(extent);
         }
         //-----------------------------------------------------------------------------
         //! \return The depth.
         ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            typename TExtent>
-        ALPAKA_FN_HOST_ACC auto getDepth(
-            TExtent const & extent = TExtent())
-        -> idx::Idx<TExtent>
+        template<typename TExtent>
+        ALPAKA_FN_HOST_ACC auto getDepth(TExtent const& extent = TExtent()) -> Idx<TExtent>
         {
-            return getExtent<dim::Dim<TExtent>::value - 3u>(extent);
+            return getExtent<Dim<TExtent>::value - 3u>(extent);
         }
 
         namespace detail
         {
             //-----------------------------------------------------------------------------
             ALPAKA_NO_HOST_ACC_WARNING
-            template<
-                typename TExtent,
-                size_t... TIndices>
+            template<typename TExtent, std::size_t... TIndices>
             ALPAKA_FN_HOST_ACC auto getExtentProductInternal(
-                TExtent const & extent,
-                alpaka::meta::IndexSequence<TIndices...> const & indices)
-            -> idx::Idx<TExtent>
+                TExtent const& extent,
+                std::index_sequence<TIndices...> const& indices) -> Idx<TExtent>
             {
                 alpaka::ignore_unused(indices);
 
-                return
-                    meta::foldr(
-                        std::multiplies<idx::Idx<TExtent>>(),
-                        getExtent<TIndices>(extent)...);
+                return meta::foldr(std::multiplies<Idx<TExtent>>(), getExtent<TIndices>(extent)...);
             }
-        }
+        } // namespace detail
 
         //-----------------------------------------------------------------------------
         //! \return The product of the extent.
         ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            typename TExtent>
-        ALPAKA_FN_HOST_ACC auto getExtentProduct(
-            TExtent const & extent = TExtent())
-        -> idx::Idx<TExtent>
+        template<typename TExtent>
+        ALPAKA_FN_HOST_ACC auto getExtentProduct(TExtent const& extent = TExtent()) -> Idx<TExtent>
         {
-            using IdxSequence = alpaka::meta::MakeIndexSequence<dim::Dim<TExtent>::value>;
-            return
-                detail::getExtentProductInternal(
-                    extent,
-                    IdxSequence());
+            using IdxSequence = std::make_index_sequence<Dim<TExtent>::value>;
+            return detail::getExtentProductInternal(extent, IdxSequence());
         }
 
         //-----------------------------------------------------------------------------
         //! Sets the extent in the given dimension.
         ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            std::size_t Tidx,
-            typename TExtent,
-            typename TExtentVal>
-        ALPAKA_FN_HOST_ACC auto setExtent(
-            TExtent & extent,
-            TExtentVal const & extentVal)
-        -> void
+        template<std::size_t Tidx, typename TExtent, typename TExtentVal>
+        ALPAKA_FN_HOST_ACC auto setExtent(TExtent& extent, TExtentVal const& extentVal) -> void
         {
-            traits::SetExtent<
-                dim::DimInt<Tidx>,
-                TExtent,
-                TExtentVal>
-            ::setExtent(
-                extent,
-                extentVal);
+            traits::SetExtent<DimInt<Tidx>, TExtent, TExtentVal>::setExtent(extent, extentVal);
         }
         //-----------------------------------------------------------------------------
         //! Sets the width.
         ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            typename TExtent,
-            typename TWidth>
-        ALPAKA_FN_HOST_ACC auto setWidth(
-            TExtent & extent,
-            TWidth const & width)
-        -> void
+        template<typename TExtent, typename TWidth>
+        ALPAKA_FN_HOST_ACC auto setWidth(TExtent& extent, TWidth const& width) -> void
         {
-            setExtent<dim::Dim<TExtent>::value - 1u>(extent, width);
+            setExtent<Dim<TExtent>::value - 1u>(extent, width);
         }
         //-----------------------------------------------------------------------------
         //! Sets the height.
         ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            typename TExtent,
-            typename THeight>
-        ALPAKA_FN_HOST_ACC auto setHeight(
-            TExtent & extent,
-            THeight const & height)
-        -> void
+        template<typename TExtent, typename THeight>
+        ALPAKA_FN_HOST_ACC auto setHeight(TExtent& extent, THeight const& height) -> void
         {
-            setExtent<dim::Dim<TExtent>::value - 2u>(extent, height);
+            setExtent<Dim<TExtent>::value - 2u>(extent, height);
         }
         //-----------------------------------------------------------------------------
         //! Sets the depth.
         ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            typename TExtent,
-            typename TDepth>
-        ALPAKA_FN_HOST_ACC auto setDepth(
-            TExtent & extent,
-            TDepth const & depth)
-        -> void
+        template<typename TExtent, typename TDepth>
+        ALPAKA_FN_HOST_ACC auto setDepth(TExtent& extent, TDepth const& depth) -> void
         {
-            setExtent<dim::Dim<TExtent>::value - 3u>(extent, depth);
+            setExtent<Dim<TExtent>::value - 3u>(extent, depth);
         }
 
         //-----------------------------------------------------------------------------
@@ -212,43 +146,26 @@ namespace alpaka
         {
             //#############################################################################
             //! The unsigned integral width get trait specialization.
-            template<
-                typename TExtent>
-            struct GetExtent<
-                dim::DimInt<0u>,
-                TExtent,
-                typename std::enable_if<
-                    std::is_integral<TExtent>::value>::type>
+            template<typename TExtent>
+            struct GetExtent<DimInt<0u>, TExtent, std::enable_if_t<std::is_integral<TExtent>::value>>
             {
                 ALPAKA_NO_HOST_ACC_WARNING
-                ALPAKA_FN_HOST_ACC static auto getExtent(
-                    TExtent const & extent)
-                -> idx::Idx<TExtent>
+                ALPAKA_FN_HOST_ACC static auto getExtent(TExtent const& extent) -> Idx<TExtent>
                 {
                     return extent;
                 }
             };
             //#############################################################################
             //! The unsigned integral width set trait specialization.
-            template<
-                typename TExtent,
-                typename TExtentVal>
-            struct SetExtent<
-                dim::DimInt<0u>,
-                TExtent,
-                TExtentVal,
-                typename std::enable_if<
-                    std::is_integral<TExtent>::value>::type>
+            template<typename TExtent, typename TExtentVal>
+            struct SetExtent<DimInt<0u>, TExtent, TExtentVal, std::enable_if_t<std::is_integral<TExtent>::value>>
             {
                 ALPAKA_NO_HOST_ACC_WARNING
-                ALPAKA_FN_HOST_ACC static auto setExtent(
-                    TExtent const & extent,
-                    TExtentVal const & extentVal)
-                -> void
+                ALPAKA_FN_HOST_ACC static auto setExtent(TExtent& extent, TExtentVal const& extentVal) -> void
                 {
                     extent = extentVal;
                 }
             };
-        }
-    }
-}
+        } // namespace traits
+    } // namespace extent
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/include/alpaka/idx/Accessors.hpp b/thirdParty/cupla/alpaka/include/alpaka/idx/Accessors.hpp
index 86bef954d2..c1d39c1773 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/idx/Accessors.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/idx/Accessors.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Axel Huebl, Benjamin Worpitz
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -9,207 +9,121 @@
 
 #pragma once
 
-#include <alpaka/core/Positioning.hpp>
 #include <alpaka/core/Common.hpp>
 #include <alpaka/core/Concepts.hpp>
-
-#include <alpaka/vec/Vec.hpp>
-
-#include <alpaka/idx/Traits.hpp>
-#include <alpaka/dim/Traits.hpp>
+#include <alpaka/core/Positioning.hpp>
+#include <alpaka/core/Unused.hpp>
 #include <alpaka/dim/DimIntegralConst.hpp>
+#include <alpaka/dim/Traits.hpp>
+#include <alpaka/idx/Traits.hpp>
+#include <alpaka/vec/Vec.hpp>
 #include <alpaka/workdiv/Traits.hpp>
 
-#include <alpaka/core/Unused.hpp>
-
-#include <boost/config.hpp>
-
 #include <utility>
 
 namespace alpaka
 {
-    namespace idx
+    //-----------------------------------------------------------------------------
+    //! Get the indices for the given TOrigin and TUnit by forwarding idx and workDiv to traits::GetIdx.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename TOrigin, typename TUnit, typename TIdx, typename TWorkDiv>
+    ALPAKA_FN_HOST_ACC auto getIdx(TIdx const& idx, TWorkDiv const& workDiv) -> Vec<Dim<TWorkDiv>, Idx<TIdx>>
     {
-        //-----------------------------------------------------------------------------
-        //! Get the indices requested.
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            typename TOrigin,
-            typename TUnit,
-            typename TIdx,
-            typename TWorkDiv>
-        ALPAKA_FN_HOST_ACC auto getIdx(
-            TIdx const & idx,
-            TWorkDiv const & workDiv)
-        -> vec::Vec<dim::Dim<TWorkDiv>, idx::Idx<TIdx>>
-        {
-            return
-                traits::GetIdx<
-                    TIdx,
-                    TOrigin,
-                    TUnit>
-                ::getIdx(
-                    idx,
-                    workDiv);
-        }
-        //-----------------------------------------------------------------------------
-        //! Get the indices requested.
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            typename TOrigin,
-            typename TUnit,
-            typename TIdxWorkDiv>
-        ALPAKA_FN_HOST_ACC auto getIdx(
-            TIdxWorkDiv const & idxWorkDiv)
-        -> vec::Vec<dim::Dim<TIdxWorkDiv>, idx::Idx<TIdxWorkDiv>>
-        {
-            return
-                traits::GetIdx<
-                    TIdxWorkDiv,
-                    TOrigin,
-                    TUnit>
-                ::getIdx(
-                    idxWorkDiv,
-                    idxWorkDiv);
-        }
+        return traits::GetIdx<TIdx, TOrigin, TUnit>::getIdx(idx, workDiv);
+    }
+    //-----------------------------------------------------------------------------
+    //! Get the indices for the given TOrigin and TUnit; idxWorkDiv is passed as both index and work division.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename TOrigin, typename TUnit, typename TIdxWorkDiv>
+    ALPAKA_FN_HOST_ACC auto getIdx(TIdxWorkDiv const& idxWorkDiv) -> Vec<Dim<TIdxWorkDiv>, Idx<TIdxWorkDiv>>
+    {
+        return traits::GetIdx<TIdxWorkDiv, TOrigin, TUnit>::getIdx(idxWorkDiv, idxWorkDiv);
+    }
 
-        namespace traits
+    namespace traits
+    {
+        //#############################################################################
+        //! The grid block index get trait specialization for classes with IdxGbBase member type.
+        template<typename TIdxGb>
+        struct GetIdx<TIdxGb, origin::Grid, unit::Blocks>
         {
-            //#############################################################################
-            //! The grid block index get trait specialization for classes with IdxGbBase member type.
-            template<
-                typename TIdxGb>
-            struct GetIdx<
-                TIdxGb,
-                origin::Grid,
-                unit::Blocks>
+            using ImplementationBase = concepts::ImplementationBase<ConceptIdxGb, TIdxGb>;
+            //-----------------------------------------------------------------------------
+            //! \return The index of the current block in the grid.
+            ALPAKA_NO_HOST_ACC_WARNING
+            template<typename TWorkDiv>
+            ALPAKA_FN_HOST_ACC static auto getIdx(TIdxGb const& idx, TWorkDiv const& workDiv)
+                -> Vec<Dim<ImplementationBase>, Idx<ImplementationBase>>
             {
-                using ImplementationBase = concepts::ImplementationBase<ConceptIdxGb, TIdxGb>;
-                //-----------------------------------------------------------------------------
-                //! \return The index of the current thread in the grid.
-                ALPAKA_NO_HOST_ACC_WARNING
-                template<
-                    typename TWorkDiv>
-                ALPAKA_FN_HOST_ACC static auto getIdx(
-                    TIdxGb const & idx,
-                    TWorkDiv const & workDiv)
-                -> vec::Vec<dim::Dim<ImplementationBase>, idx::Idx<ImplementationBase>>
-                {
-                    return
-                        traits::GetIdx<
-                            ImplementationBase,
-                            origin::Grid,
-                            unit::Blocks>
-                        ::getIdx(
-                            idx,
-                            workDiv);
-                }
-            };
+                return traits::GetIdx<ImplementationBase, origin::Grid, unit::Blocks>::getIdx(idx, workDiv);
+            }
+        };
 
-            //#############################################################################
-            //! The block thread index get trait specialization for classes with IdxBtBase member type.
-            template<
-                typename TIdxBt>
-            struct GetIdx<
-                TIdxBt,
-                origin::Block,
-                unit::Threads>
+        //#############################################################################
+        //! The block thread index get trait specialization for classes with IdxBtBase member type.
+        template<typename TIdxBt>
+        struct GetIdx<TIdxBt, origin::Block, unit::Threads>
+        {
+            using ImplementationBase = concepts::ImplementationBase<ConceptIdxBt, TIdxBt>;
+            //-----------------------------------------------------------------------------
+            //! \return The index of the current thread in the block.
+            ALPAKA_NO_HOST_ACC_WARNING
+            template<typename TWorkDiv>
+            ALPAKA_FN_HOST_ACC static auto getIdx(TIdxBt const& idx, TWorkDiv const& workDiv)
+                -> Vec<Dim<ImplementationBase>, Idx<ImplementationBase>>
             {
-                using ImplementationBase = concepts::ImplementationBase<ConceptIdxBt, TIdxBt>;
-                //-----------------------------------------------------------------------------
-                //! \return The index of the current thread in the grid.
-                ALPAKA_NO_HOST_ACC_WARNING
-                template<
-                    typename TWorkDiv>
-                ALPAKA_FN_HOST_ACC static auto getIdx(
-                    TIdxBt const & idx,
-                    TWorkDiv const & workDiv)
-                -> vec::Vec<dim::Dim<ImplementationBase>, idx::Idx<ImplementationBase>>
-                {
-                    return
-                        traits::GetIdx<
-                            ImplementationBase,
-                            origin::Block,
-                            unit::Threads>
-                        ::getIdx(
-                            idx,
-                            workDiv);
-                }
-            };
+                return traits::GetIdx<ImplementationBase, origin::Block, unit::Threads>::getIdx(idx, workDiv);
+            }
+        };
 
-            //#############################################################################
-            //! The grid thread index get trait specialization.
-            template<
-                typename TIdx>
-            struct GetIdx<
-                TIdx,
-                origin::Grid,
-                unit::Threads>
-            {
-                //-----------------------------------------------------------------------------
-                //! \return The index of the current thread in the grid.
-                ALPAKA_NO_HOST_ACC_WARNING
-                template<
-                    typename TWorkDiv>
-                ALPAKA_FN_HOST_ACC static auto getIdx(
-                    TIdx const & idx,
-                    TWorkDiv const & workDiv)
-#ifdef BOOST_NO_CXX14_RETURN_TYPE_DEDUCTION
-                -> decltype(
-                    idx::getIdx<origin::Grid, unit::Blocks>(idx, workDiv)
-                    * workdiv::getWorkDiv<origin::Block, unit::Threads>(workDiv)
-                    + idx::getIdx<origin::Block, unit::Threads>(idx, workDiv))
-#endif
-                {
-                    return
-                        idx::getIdx<origin::Grid, unit::Blocks>(idx, workDiv)
-                        * workdiv::getWorkDiv<origin::Block, unit::Threads>(workDiv)
-                        + idx::getIdx<origin::Block, unit::Threads>(idx, workDiv);
-                }
-            };
-        }
-        //-----------------------------------------------------------------------------
-        //! Get the index of the first element this thread computes.
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            typename TIdxWorkDiv,
-            typename TGridThreadIdx,
-            typename TThreadElemExtent>
-        ALPAKA_FN_HOST_ACC auto getIdxThreadFirstElem(
-            TIdxWorkDiv const & idxWorkDiv,
-            TGridThreadIdx const & gridThreadIdx,
-            TThreadElemExtent const & threadElemExtent)
-        -> vec::Vec<dim::Dim<TIdxWorkDiv>, idx::Idx<TIdxWorkDiv>>
+        //#############################################################################
+        //! The grid thread index get trait specialization.
+        template<typename TIdx>
+        struct GetIdx<TIdx, origin::Grid, unit::Threads>
         {
-            alpaka::ignore_unused(idxWorkDiv);
+            //-----------------------------------------------------------------------------
+            //! \return The index of the current thread in the grid.
+            ALPAKA_NO_HOST_ACC_WARNING
+            template<typename TWorkDiv>
+            ALPAKA_FN_HOST_ACC static auto getIdx(TIdx const& idx, TWorkDiv const& workDiv)
+            {
+                return alpaka::getIdx<origin::Grid, unit::Blocks>(idx, workDiv)
+                    * getWorkDiv<origin::Block, unit::Threads>(workDiv)
+                    + alpaka::getIdx<origin::Block, unit::Threads>(idx, workDiv);
+            }
+        };
+    } // namespace traits
+    //-----------------------------------------------------------------------------
+    //! Get the index of the first element this thread computes.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename TIdxWorkDiv, typename TGridThreadIdx, typename TThreadElemExtent>
+    ALPAKA_FN_HOST_ACC auto getIdxThreadFirstElem(
+        TIdxWorkDiv const& idxWorkDiv,
+        TGridThreadIdx const& gridThreadIdx,
+        TThreadElemExtent const& threadElemExtent) -> Vec<Dim<TIdxWorkDiv>, Idx<TIdxWorkDiv>>
+    {
+        alpaka::ignore_unused(idxWorkDiv);
 
-            return gridThreadIdx * threadElemExtent;
-        }
-        //-----------------------------------------------------------------------------
-        //! Get the index of the first element this thread computes.
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            typename TIdxWorkDiv,
-            typename TGridThreadIdx>
-        ALPAKA_FN_HOST_ACC auto getIdxThreadFirstElem(
-            TIdxWorkDiv const & idxWorkDiv,
-            TGridThreadIdx const & gridThreadIdx)
-        -> vec::Vec<dim::Dim<TIdxWorkDiv>, idx::Idx<TIdxWorkDiv>>
-        {
-            auto const threadElemExtent(alpaka::workdiv::getWorkDiv<alpaka::Thread, alpaka::Elems>(idxWorkDiv));
-            return getIdxThreadFirstElem(idxWorkDiv, gridThreadIdx, threadElemExtent);
-        }
-        //-----------------------------------------------------------------------------
-        //! Get the index of the first element this thread computes.
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            typename TIdxWorkDiv>
-        ALPAKA_FN_HOST_ACC auto getIdxThreadFirstElem(
-            TIdxWorkDiv const & idxWorkDiv)
-        -> vec::Vec<dim::Dim<TIdxWorkDiv>, idx::Idx<TIdxWorkDiv>>
-        {
-            auto const gridThreadIdx(alpaka::idx::getIdx<alpaka::Grid, alpaka::Threads>(idxWorkDiv));
-            return getIdxThreadFirstElem(idxWorkDiv, gridThreadIdx);
-        }
+        return gridThreadIdx * threadElemExtent;
+    }
+    //-----------------------------------------------------------------------------
+    //! Get the index of the first element this thread computes.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename TIdxWorkDiv, typename TGridThreadIdx>
+    ALPAKA_FN_HOST_ACC auto getIdxThreadFirstElem(TIdxWorkDiv const& idxWorkDiv, TGridThreadIdx const& gridThreadIdx)
+        -> Vec<Dim<TIdxWorkDiv>, Idx<TIdxWorkDiv>>
+    {
+        auto const threadElemExtent(alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(idxWorkDiv));
+        return getIdxThreadFirstElem(idxWorkDiv, gridThreadIdx, threadElemExtent);
+    }
+    //-----------------------------------------------------------------------------
+    //! Get the index of the first element this thread computes.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename TIdxWorkDiv>
+    ALPAKA_FN_HOST_ACC auto getIdxThreadFirstElem(TIdxWorkDiv const& idxWorkDiv)
+        -> Vec<Dim<TIdxWorkDiv>, Idx<TIdxWorkDiv>>
+    {
+        auto const gridThreadIdx(alpaka::getIdx<alpaka::Grid, alpaka::Threads>(idxWorkDiv));
+        return getIdxThreadFirstElem(idxWorkDiv, gridThreadIdx);
     }
-}
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/include/alpaka/idx/MapIdx.hpp b/thirdParty/cupla/alpaka/include/alpaka/idx/MapIdx.hpp
index 362480afa4..5602f2f8aa 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/idx/MapIdx.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/idx/MapIdx.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Axel Huebl, Benjamin Worpitz, Erik Zenker
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -9,151 +9,226 @@
 
 #pragma once
 
-#include <alpaka/vec/Vec.hpp>
 #include <alpaka/core/Common.hpp>
 #include <alpaka/core/Unused.hpp>
+#include <alpaka/vec/Vec.hpp>
+
+#include <type_traits>
 
 namespace alpaka
 {
-    namespace idx
+    namespace detail
     {
-        namespace detail
+        //#############################################################################
+        //! Maps a TidxDimIn dimensional index to a TidxDimOut dimensional index (declaration only, see specializations).
+        template<std::size_t TidxDimOut, std::size_t TidxDimIn, typename TSfinae = void>
+        struct MapIdx;
+        //#############################################################################
+        //! Maps a N dimensional index to the same N dimensional index.
+        template<std::size_t TidxDim>
+        struct MapIdx<TidxDim, TidxDim>
         {
-            //#############################################################################
-            //! Maps a linear index to a N dimensional index.
-            template<
-                std::size_t TidxDimOut,
-                std::size_t TidxDimIn,
-                typename TSfinae = void>
-            struct MapIdx;
-            //#############################################################################
-            //! Maps a N dimensional index to the same N dimensional index.
-            template<
-                std::size_t TidxDim>
-            struct MapIdx<
-                TidxDim,
-                TidxDim>
+            //-----------------------------------------------------------------------------
+            // \tparam TElem Type of the index values.
+            // \param idx Idx to be mapped.
+            // \param extent Spatial size to map the index to.
+            // \return A N dimensional vector.
+            ALPAKA_NO_HOST_ACC_WARNING
+            template<typename TElem>
+            ALPAKA_FN_HOST_ACC static auto mapIdx(
+                Vec<DimInt<TidxDim>, TElem> const& idx,
+                Vec<DimInt<TidxDim>, TElem> const& extent) -> Vec<DimInt<TidxDim>, TElem>
             {
-                //-----------------------------------------------------------------------------
-                // \tparam TElem Type of the index values.
-                // \param idx Idx to be mapped.
-                // \param extent Spatial size to map the index to.
-                // \return A N dimensional vector.
-                ALPAKA_NO_HOST_ACC_WARNING
-                template<
-                    typename TElem>
-                ALPAKA_FN_HOST_ACC static auto mapIdx(
-                    vec::Vec<dim::DimInt<TidxDim>, TElem> const & idx,
-                    vec::Vec<dim::DimInt<TidxDim>, TElem> const & extent)
-                -> vec::Vec<dim::DimInt<TidxDim>, TElem>
-                {
-                    alpaka::ignore_unused(extent);
+                alpaka::ignore_unused(extent);
 
-                    return idx;
-                }
-            };
-            //#############################################################################
-            //! Maps a 1 dimensional index to a N dimensional index.
-            template<
-                std::size_t TidxDimOut>
-            struct MapIdx<
-                TidxDimOut,
-                1u,
-                typename std::enable_if<TidxDimOut != 1u>::type>
+                return idx;
+            }
+        };
+        //#############################################################################
+        //! Maps a 1 dimensional index to a N dimensional index.
+        template<std::size_t TidxDimOut>
+        struct MapIdx<TidxDimOut, 1u, std::enable_if_t<TidxDimOut != 1u>>
+        {
+            //-----------------------------------------------------------------------------
+            // \tparam TElem Type of the index values.
+            // \param idx Idx to be mapped.
+            // \param extent Spatial size to map the index to
+            // \return A N dimensional vector.
+            ALPAKA_NO_HOST_ACC_WARNING
+            template<typename TElem>
+            ALPAKA_FN_HOST_ACC static auto mapIdx(
+                Vec<DimInt<1u>, TElem> const& idx,
+                Vec<DimInt<TidxDimOut>, TElem> const& extent) -> Vec<DimInt<TidxDimOut>, TElem>
             {
-                //-----------------------------------------------------------------------------
-                // \tparam TElem Type of the index values.
-                // \param idx Idx to be mapped.
-                // \param extent Spatial size to map the index to
-                // \return A N dimensional vector.
-                ALPAKA_NO_HOST_ACC_WARNING
-                template<
-                    typename TElem>
-                ALPAKA_FN_HOST_ACC static auto mapIdx(
-                    vec::Vec<dim::DimInt<1u>, TElem> const & idx,
-                    vec::Vec<dim::DimInt<TidxDimOut>, TElem> const & extent)
-                -> vec::Vec<dim::DimInt<TidxDimOut>, TElem>
-                {
-                    auto idxNd(vec::Vec<dim::DimInt<TidxDimOut>, TElem>::all(0u));
+                auto idxNd(Vec<DimInt<TidxDimOut>, TElem>::all(0u));
 
-                    constexpr std::size_t lastIdx(TidxDimOut - 1u);
+                constexpr std::size_t lastIdx(TidxDimOut - 1u);
 
-                    // fast-dim
-                    idxNd[lastIdx] = static_cast<TElem>(idx[0u] % extent[lastIdx]);
+                // fast-dim
+                idxNd[lastIdx] = static_cast<TElem>(idx[0u] % extent[lastIdx]);
 
-                    // in-between
-                    TElem hyperPlanesBefore = extent[lastIdx];
-                    for(std::size_t r(1u); r < lastIdx; ++r)
-                    {
-                        std::size_t const d = lastIdx - r;
-                        idxNd[d] = static_cast<TElem>(idx[0u] / hyperPlanesBefore % extent[d]);
-                        hyperPlanesBefore *= extent[d];
-                    }
+                // in-between
+                TElem hyperPlanesBefore = extent[lastIdx];
+                for(std::size_t r(1u); r < lastIdx; ++r)
+                {
+                    std::size_t const d = lastIdx - r;
+                    idxNd[d] = static_cast<TElem>(idx[0u] / hyperPlanesBefore % extent[d]);
+                    hyperPlanesBefore *= extent[d];
+                }
 
-                    // slow-dim
-                    idxNd[0u] = static_cast<TElem>(idx[0u] / hyperPlanesBefore);
+                // slow-dim
+                idxNd[0u] = static_cast<TElem>(idx[0u] / hyperPlanesBefore);
 
-                    return idxNd;
+                return idxNd;
+            }
+        };
+        //#############################################################################
+        //! Maps a N dimensional index to a 1 dimensional index.
+        template<std::size_t TidxDimIn>
+        struct MapIdx<1u, TidxDimIn, std::enable_if_t<TidxDimIn != 1u>>
+        {
+            //-----------------------------------------------------------------------------
+            // \tparam TElem Type of the index values.
+            // \param idx Idx to be mapped.
+            // \param extent Spatial size to map the index to.
+            // \return A 1 dimensional vector.
+            ALPAKA_NO_HOST_ACC_WARNING
+            template<typename TElem>
+            ALPAKA_FN_HOST_ACC static auto mapIdx(
+                Vec<DimInt<TidxDimIn>, TElem> const& idx,
+                Vec<DimInt<TidxDimIn>, TElem> const& extent) -> Vec<DimInt<1u>, TElem>
+            {
+                TElem idx1d(idx[0u]);
+                for(std::size_t d(1u); d < TidxDimIn; ++d)
+                {
+                    idx1d = static_cast<TElem>(idx1d * extent[d] + idx[d]);
                 }
-            };
-            //#############################################################################
-            //! Maps a N dimensional index to a 1 dimensional index.
-            template<
-                std::size_t TidxDimIn>
-            struct MapIdx<
-                1u,
-                TidxDimIn,
-                typename std::enable_if<TidxDimIn != 1u>::type>
+                return {idx1d};
+            }
+        };
+    } // namespace detail
+
+    //#############################################################################
+    //! Maps a TidxDimIn dimensional index to a TidxDimOut dimensional index.
+    //!
+    //! \tparam TidxDimOut Dimension of the index vector to map to.
+    //! \tparam TidxDimIn Dimension of the index vector to map from.
+    //! \tparam TElem Type of the elements of the index vector to map from.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<std::size_t TidxDimOut, std::size_t TidxDimIn, typename TElem>
+    ALPAKA_FN_HOST_ACC auto mapIdx(
+        Vec<DimInt<TidxDimIn>, TElem> const& idx,
+        Vec<DimInt<(TidxDimOut < TidxDimIn) ? TidxDimIn : TidxDimOut>, TElem> const& extent) // larger of the two dims
+        -> Vec<DimInt<TidxDimOut>, TElem>
+    {
+        static_assert(TidxDimOut > 0u, "The dimension of the output vector has to be greater than zero!");
+        static_assert(TidxDimIn > 0u, "The dimension of the input vector has to be greater than zero!");
+
+        return detail::MapIdx<TidxDimOut, TidxDimIn>::mapIdx(idx, extent); // dispatch to the matching specialization
+    }
+
+    namespace detail
+    {
+        //#############################################################################
+        //! Maps a TidxDimIn dimensional index to a TidxDimOut dimensional index assuming a buffer without padding.
+        template<std::size_t TidxDimOut, std::size_t TidxDimIn, typename TSfinae = void>
+        struct MapIdxPitchBytes;
+        //#############################################################################
+        //! Maps an N dimensional index to the same N dimensional index assuming a buffer without padding.
+        template<std::size_t TidxDim>
+        struct MapIdxPitchBytes<TidxDim, TidxDim>
+        {
+            //-----------------------------------------------------------------------------
+            // \tparam TElem Type of the index values.
+            // \param idx Idx to be mapped.
+            // \param pitch Spatial pitch (in elems); unused here because input and output dimensions are equal.
+            // \return The input index, unchanged.
+            ALPAKA_NO_HOST_ACC_WARNING
+            template<typename TElem>
+            ALPAKA_FN_HOST_ACC static auto mapIdxPitchBytes(
+                Vec<DimInt<TidxDim>, TElem> const& idx,
+                Vec<DimInt<TidxDim>, TElem> const& pitch) -> Vec<DimInt<TidxDim>, TElem>
+            {
+                alpaka::ignore_unused(pitch);
+
+                return idx;
+            }
+        };
+        //#############################################################################
+        //! Maps a 1 dimensional index to a N dimensional index assuming a buffer without padding.
+        template<std::size_t TidxDimOut>
+        struct MapIdxPitchBytes<TidxDimOut, 1u, typename std::enable_if<TidxDimOut != 1u>::type>
+        {
+            //-----------------------------------------------------------------------------
+            // \tparam TElem Type of the index values.
+            // \param idx Idx to be mapped.
+            // \param pitch Spatial pitch (in elems) to map the index to
+            // \return N dimensional vector.
+            ALPAKA_NO_HOST_ACC_WARNING
+            template<typename TElem>
+            ALPAKA_FN_HOST_ACC static auto mapIdxPitchBytes(
+                Vec<DimInt<1u>, TElem> const& idx,
+                Vec<DimInt<TidxDimOut>, TElem> const& pitch) -> Vec<DimInt<TidxDimOut>, TElem>
             {
-                //-----------------------------------------------------------------------------
-                // \tparam TElem Type of the index values.
-                // \param idx Idx to be mapped.
-                // \param extent Spatial size to map the index to.
-                // \return A 1 dimensional vector.
-                ALPAKA_NO_HOST_ACC_WARNING
-                template<
-                    typename TElem>
-                ALPAKA_FN_HOST_ACC static auto mapIdx(
-                    vec::Vec<dim::DimInt<TidxDimIn>, TElem> const & idx,
-                    vec::Vec<dim::DimInt<TidxDimIn>, TElem> const & extent)
-                -> vec::Vec<dim::DimInt<1u>, TElem>
+                auto idxNd(Vec<DimInt<TidxDimOut>, TElem>::all(0u));
+
+                constexpr std::size_t lastIdx(TidxDimOut - 1u);
+
+                TElem tmp = idx[0u];
+                for(std::size_t d(0u); d < lastIdx; ++d)
                 {
-                    TElem idx1d(idx[0u]);
-                    for(std::size_t d(1u); d < TidxDimIn; ++d)
-                    {
-                        idx1d = static_cast<TElem>(idx1d * extent[d] + idx[d]);
-                    }
-                    return {idx1d};
+                    idxNd[d] = static_cast<TElem>(tmp / pitch[d + 1]);
+                    tmp %= pitch[d + 1];
                 }
-            };
-        }
+                idxNd[lastIdx] = tmp;
 
+                return idxNd;
+            }
+        };
         //#############################################################################
-        //! Maps a N dimensional index to a N dimensional position.
-        //!
-        //! \tparam TidxDimOut Dimension of the index vector to map to.
-        //! \tparam TidxDimIn Dimension of the index vector to map from.
-        //! \tparam TElem Type of the elements of the index vector to map from.
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            std::size_t TidxDimOut,
-            std::size_t TidxDimIn,
-            typename TElem>
-        ALPAKA_FN_HOST_ACC auto mapIdx(
-            vec::Vec<dim::DimInt<TidxDimIn>, TElem> const & idx,
-            vec::Vec<dim::DimInt<(TidxDimOut < TidxDimIn) ? TidxDimIn : TidxDimOut>, TElem> const & extent)
-        -> vec::Vec<dim::DimInt<TidxDimOut>, TElem>
+        //! Maps a N dimensional index to a 1 dimensional index assuming a buffer without padding.
+        template<std::size_t TidxDimIn>
+        struct MapIdxPitchBytes<1u, TidxDimIn, typename std::enable_if<TidxDimIn != 1u>::type>
         {
-            static_assert(TidxDimOut > 0u, "The dimension of the output vector has to be greater than zero!");
-            static_assert(TidxDimIn > 0u, "The dimension of the input vector has to be greater than zero!");
-
-            return
-                detail::MapIdx<
-                    TidxDimOut,
-                    TidxDimIn>
-                ::mapIdx(
-                    idx,
-                    extent);
-        }
+            //-----------------------------------------------------------------------------
+            // \tparam TElem Type of the index values.
+            // \param idx Idx to be mapped.
+            // \param pitch Spatial pitch (in elems) to map the index to
+            // \return A 1 dimensional vector.
+            ALPAKA_NO_HOST_ACC_WARNING
+            template<typename TElem>
+            ALPAKA_FN_HOST_ACC static auto mapIdxPitchBytes(
+                Vec<DimInt<TidxDimIn>, TElem> const& idx,
+                Vec<DimInt<TidxDimIn>, TElem> const& pitch) -> Vec<DimInt<1u>, TElem>
+            {
+                constexpr auto lastDim = TidxDimIn - 1;
+                TElem idx1d(idx[lastDim]);
+                for(std::size_t d(0u); d < lastDim; ++d)
+                {
+                    idx1d = static_cast<TElem>(idx1d + pitch[d + 1] * idx[d]);
+                }
+                return {idx1d};
+            }
+        };
+    } // namespace detail
+
+    //#############################################################################
+    //! Maps a TidxDimIn dimensional index to a TidxDimOut dimensional index based on
+    //! pitch in a buffer without padding or a byte buffer.
+    //!
+    //! \tparam TidxDimOut Dimension of the index vector to map to.
+    //! \tparam TidxDimIn Dimension of the index vector to map from.
+    //! \tparam TElem Type of the elements of the index vector to map from.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<std::size_t TidxDimOut, std::size_t TidxDimIn, typename TElem>
+    ALPAKA_FN_HOST_ACC auto mapIdxPitchBytes(
+        Vec<DimInt<TidxDimIn>, TElem> const& idx,
+        Vec<DimInt<(TidxDimOut < TidxDimIn) ? TidxDimIn : TidxDimOut>, TElem> const& pitch) // larger of the two dims
+        -> Vec<DimInt<TidxDimOut>, TElem>
+    {
+        static_assert(TidxDimOut > 0u, "The dimension of the output vector has to be greater than zero!");
+        static_assert(TidxDimIn > 0u, "The dimension of the input vector has to be greater than zero!");
+
+        return detail::MapIdxPitchBytes<TidxDimOut, TidxDimIn>::mapIdxPitchBytes(idx, pitch);
     }
-}
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/include/alpaka/idx/Traits.hpp b/thirdParty/cupla/alpaka/include/alpaka/idx/Traits.hpp
index 3af4fdc78b..3beddb9e5b 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/idx/Traits.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/idx/Traits.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Benjamin Worpitz
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -9,61 +9,44 @@
 
 #pragma once
 
+#include <type_traits>
 #include <utility>
 
 namespace alpaka
 {
-    //-----------------------------------------------------------------------------
-    //! The index specifics.
-    namespace idx
+    struct ConceptIdxBt
+    {
+    };
+    struct ConceptIdxGb
     {
-        struct ConceptIdxBt;
-        struct ConceptIdxGb;
+    };
 
-        //-----------------------------------------------------------------------------
-        //! The idx traits.
-        namespace traits
-        {
-            //#############################################################################
-            //! The idx type trait.
-            template<
-                typename T,
-                typename TSfinae = void>
-            struct IdxType;
-        }
+    //-----------------------------------------------------------------------------
+    //! The idx traits.
+    namespace traits
+    {
+        //#############################################################################
+        //! The idx type trait.
+        template<typename T, typename TSfinae = void>
+        struct IdxType;
+    } // namespace traits
 
-        template<
-            typename T>
-        using Idx = typename traits::IdxType<T>::type;
+    template<typename T>
+    using Idx = typename traits::IdxType<T>::type;
 
-        //-----------------------------------------------------------------------------
-        // Trait specializations for unsigned integral types.
-        namespace traits
+    namespace traits
+    {
+        //#############################################################################
+        //! The arithmetic idx type trait specialization.
+        template<typename T>
+        struct IdxType<T, std::enable_if_t<std::is_arithmetic<T>::value>>
         {
-            //#############################################################################
-            //! The arithmetic idx type trait specialization.
-            template<
-                typename T>
-            struct IdxType<
-                T,
-                typename std::enable_if<std::is_arithmetic<T>::value>::type>
-            {
-                using type = typename std::decay<T>::type;
-            };
-        }
+            using type = std::decay_t<T>;
+        };
 
-        //-----------------------------------------------------------------------------
-        //! The index traits.
-        namespace traits
-        {
-            //#############################################################################
-            //! The index get trait.
-            template<
-                typename TIdx,
-                typename TOrigin,
-                typename TUnit,
-                typename TSfinae = void>
-            struct GetIdx;
-        }
-    }
-}
+        //#############################################################################
+        //! The index get trait.
+        template<typename TIdx, typename TOrigin, typename TUnit, typename TSfinae = void>
+        struct GetIdx;
+    } // namespace traits
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/include/alpaka/idx/bt/IdxBtCudaBuiltIn.hpp b/thirdParty/cupla/alpaka/include/alpaka/idx/bt/IdxBtCudaBuiltIn.hpp
deleted file mode 100644
index 8876308996..0000000000
--- a/thirdParty/cupla/alpaka/include/alpaka/idx/bt/IdxBtCudaBuiltIn.hpp
+++ /dev/null
@@ -1,121 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, René Widera
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_CUDA
-    #error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
-#endif
-
-#include <alpaka/idx/Traits.hpp>
-
-#include <alpaka/vec/Vec.hpp>
-#include <alpaka/core/Concepts.hpp>
-#include <alpaka/core/Cuda.hpp>
-#include <alpaka/core/Positioning.hpp>
-#include <alpaka/core/Unused.hpp>
-
-namespace alpaka
-{
-    namespace idx
-    {
-        namespace bt
-        {
-            //#############################################################################
-            //! The CUDA accelerator ND index provider.
-            template<
-                typename TDim,
-                typename TIdx>
-            class IdxBtCudaBuiltIn : public concepts::Implements<ConceptIdxBt, IdxBtCudaBuiltIn<TDim, TIdx>>
-            {
-            public:
-                //-----------------------------------------------------------------------------
-                IdxBtCudaBuiltIn() = default;
-                //-----------------------------------------------------------------------------
-                __device__ IdxBtCudaBuiltIn(IdxBtCudaBuiltIn const &) = delete;
-                //-----------------------------------------------------------------------------
-                __device__ IdxBtCudaBuiltIn(IdxBtCudaBuiltIn &&) = delete;
-                //-----------------------------------------------------------------------------
-                __device__ auto operator=(IdxBtCudaBuiltIn const & ) -> IdxBtCudaBuiltIn & = delete;
-                //-----------------------------------------------------------------------------
-                __device__ auto operator=(IdxBtCudaBuiltIn &&) -> IdxBtCudaBuiltIn & = delete;
-                //-----------------------------------------------------------------------------
-                /*virtual*/ ~IdxBtCudaBuiltIn() = default;
-            };
-        }
-    }
-
-    namespace dim
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The GPU CUDA accelerator index dimension get trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct DimType<
-                idx::bt::IdxBtCudaBuiltIn<TDim, TIdx>>
-            {
-                using type = TDim;
-            };
-        }
-    }
-    namespace idx
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The GPU CUDA accelerator block thread index get trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct GetIdx<
-                idx::bt::IdxBtCudaBuiltIn<TDim, TIdx>,
-                origin::Block,
-                unit::Threads>
-            {
-                //-----------------------------------------------------------------------------
-                //! \return The index of the current thread in the block.
-                template<
-                    typename TWorkDiv>
-                __device__ static auto getIdx(
-                    idx::bt::IdxBtCudaBuiltIn<TDim, TIdx> const & idx,
-                    TWorkDiv const &)
-                -> vec::Vec<TDim, TIdx>
-                {
-                    alpaka::ignore_unused(idx);
-                    return vec::cast<TIdx>(offset::getOffsetVecEnd<TDim>(threadIdx));
-                }
-            };
-        }
-    }
-    namespace idx
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The GPU CUDA accelerator block thread index idx type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct IdxType<
-                idx::bt::IdxBtCudaBuiltIn<TDim, TIdx>>
-            {
-                using type = TIdx;
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/idx/bt/IdxBtHipBuiltIn.hpp b/thirdParty/cupla/alpaka/include/alpaka/idx/bt/IdxBtHipBuiltIn.hpp
deleted file mode 100644
index e5dd535c8c..0000000000
--- a/thirdParty/cupla/alpaka/include/alpaka/idx/bt/IdxBtHipBuiltIn.hpp
+++ /dev/null
@@ -1,125 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Matthias Werner, René Widera
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_HIP
-    #error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
-#endif
-
-#include <alpaka/idx/Traits.hpp>
-
-#include <alpaka/vec/Vec.hpp>
-#include <alpaka/core/Concepts.hpp>
-#include <alpaka/core/Hip.hpp>
-#include <alpaka/core/Positioning.hpp>
-#include <alpaka/core/Unused.hpp>
-
-namespace alpaka
-{
-    namespace idx
-    {
-        namespace bt
-        {
-            //#############################################################################
-            //! The HIP accelerator ND index provider.
-            template<
-                typename TDim,
-                typename TIdx>
-            class IdxBtHipBuiltIn : public concepts::Implements<ConceptIdxBt, IdxBtHipBuiltIn<TDim, TIdx>>
-            {
-            public:
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST_ACC IdxBtHipBuiltIn() = default;
-                //-----------------------------------------------------------------------------
-                IdxBtHipBuiltIn(IdxBtHipBuiltIn const &) = delete;
-                //-----------------------------------------------------------------------------
-                IdxBtHipBuiltIn(IdxBtHipBuiltIn &&) = delete;
-                //-----------------------------------------------------------------------------
-                auto operator=(IdxBtHipBuiltIn const & ) -> IdxBtHipBuiltIn & = delete;
-                //-----------------------------------------------------------------------------
-                auto operator=(IdxBtHipBuiltIn &&) -> IdxBtHipBuiltIn & = delete;
-                //-----------------------------------------------------------------------------
-                /*virtual*/ ALPAKA_FN_HOST_ACC ~IdxBtHipBuiltIn() = default;
-            };
-        }
-    }
-
-    namespace dim
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The GPU HIP accelerator index dimension get trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct DimType<
-                idx::bt::IdxBtHipBuiltIn<TDim, TIdx>>
-            {
-                using type = TDim;
-            };
-        }
-    }
-    namespace idx
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The GPU HIP accelerator block thread index get trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct GetIdx<
-                idx::bt::IdxBtHipBuiltIn<TDim, TIdx>,
-                origin::Block,
-                unit::Threads>
-            {
-                //-----------------------------------------------------------------------------
-                //! \return The index of the current thread in the block.
-                template<
-                    typename TWorkDiv>
-                ALPAKA_FN_HOST_ACC static auto getIdx(
-                    idx::bt::IdxBtHipBuiltIn<TDim, TIdx> const & idx,
-                    TWorkDiv const &)
-                -> vec::Vec<TDim, TIdx>
-                {
-                    alpaka::ignore_unused(idx);
-                    return offset::getOffsetVecEnd<TDim>(
-                        vec::Vec<std::integral_constant<typename TDim::value_type, 3>, TIdx>(
-                            static_cast<TIdx>(hipThreadIdx_z),
-                            static_cast<TIdx>(hipThreadIdx_y),
-                            static_cast<TIdx>(hipThreadIdx_x)));
-                }
-            };
-        }
-    }
-    namespace idx
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The GPU HIP accelerator block thread index idx type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct IdxType<
-                idx::bt::IdxBtHipBuiltIn<TDim, TIdx>>
-            {
-                using type = TIdx;
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/idx/bt/IdxBtLinear.hpp b/thirdParty/cupla/alpaka/include/alpaka/idx/bt/IdxBtLinear.hpp
new file mode 100644
index 0000000000..a26920d808
--- /dev/null
+++ b/thirdParty/cupla/alpaka/include/alpaka/idx/bt/IdxBtLinear.hpp
@@ -0,0 +1,95 @@
+/* Copyright 2020 Axel Huebl, Jeffrey Kelling, Benjamin Worpitz, René Widera
+ *
+ * This file is part of alpaka.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#pragma once
+
+#include <alpaka/core/Concepts.hpp>
+#include <alpaka/core/Positioning.hpp>
+#include <alpaka/core/Unused.hpp>
+#include <alpaka/idx/MapIdx.hpp>
+#include <alpaka/idx/Traits.hpp>
+#include <alpaka/vec/Vec.hpp>
+#include <alpaka/workdiv/Traits.hpp>
+
+namespace alpaka
+{
+    namespace bt
+    {
+        //#############################################################################
+        //! General N-dimensional block thread index provider that stores the thread's linear index in its block.
+        template<typename TDim, typename TIdx>
+        class IdxBtLinear : public concepts::Implements<ConceptIdxBt, IdxBtLinear<TDim, TIdx>>
+        {
+        public:
+            //-----------------------------------------------------------------------------
+            IdxBtLinear(TIdx blockThreadIdx) : m_blockThreadIdx(blockThreadIdx)
+            {
+            }
+            //-----------------------------------------------------------------------------
+            IdxBtLinear(IdxBtLinear const&) = delete;
+            //-----------------------------------------------------------------------------
+            IdxBtLinear(IdxBtLinear&&) = delete;
+            //-----------------------------------------------------------------------------
+            auto operator=(IdxBtLinear const&) -> IdxBtLinear& = delete;
+            //-----------------------------------------------------------------------------
+            auto operator=(IdxBtLinear&&) -> IdxBtLinear& = delete;
+            //-----------------------------------------------------------------------------
+            ~IdxBtLinear() = default;
+
+            const TIdx m_blockThreadIdx; //!< The linear index of the thread within its block, fixed at construction.
+        };
+    } // namespace bt
+
+    namespace traits
+    {
+        //#############################################################################
+        //! The IdxBtLinear index dimension get trait specialization: the dimensionality is the TDim parameter.
+        template<typename TDim, typename TIdx>
+        struct DimType<bt::IdxBtLinear<TDim, TIdx>>
+        {
+            using type = TDim;
+        };
+
+        //#############################################################################
+        //! The IdxBtLinear block thread index get trait specialization for an arbitrary dimensionality TDim.
+        template<typename TDim, typename TIdx>
+        struct GetIdx<bt::IdxBtLinear<TDim, TIdx>, origin::Block, unit::Threads>
+        {
+            //-----------------------------------------------------------------------------
+            //! \return The thread's index in the block: the stored linear index mapped onto the block thread extent.
+            template<typename TWorkDiv>
+            static auto getIdx(bt::IdxBtLinear<TDim, TIdx> const& idx, TWorkDiv const& workDiv) -> Vec<TDim, TIdx>
+            {
+                return mapIdx<TDim::value>(
+                    Vec<DimInt<1u>, TIdx>(idx.m_blockThreadIdx),
+                    getWorkDiv<Block, Threads>(workDiv));
+            }
+        };
+
+        template<typename TIdx>
+        struct GetIdx<bt::IdxBtLinear<DimInt<1u>, TIdx>, origin::Block, unit::Threads>
+        {
+            //-----------------------------------------------------------------------------
+            //! \return The stored linear thread index as a 1D vector; the work division is not consulted in 1D.
+            template<typename TWorkDiv>
+            static auto getIdx(bt::IdxBtLinear<DimInt<1u>, TIdx> const& idx, TWorkDiv const&) -> Vec<DimInt<1u>, TIdx>
+            {
+                return idx.m_blockThreadIdx;
+            }
+        };
+
+        //#############################################################################
+        //! The IdxBtLinear block thread index idx type trait specialization: indices have the type TIdx.
+        template<typename TDim, typename TIdx>
+        struct IdxType<bt::IdxBtLinear<TDim, TIdx>>
+        {
+            using type = TIdx;
+        };
+    } // namespace traits
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/include/alpaka/idx/bt/IdxBtOmp.hpp b/thirdParty/cupla/alpaka/include/alpaka/idx/bt/IdxBtOmp.hpp
index ba14dd111e..79b33cf55c 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/idx/bt/IdxBtOmp.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/idx/bt/IdxBtOmp.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Axel Huebl, Benjamin Worpitz, Matthias Werner
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -11,114 +11,93 @@
 
 #ifdef _OPENMP
 
-#include <alpaka/idx/Traits.hpp>
-#include <alpaka/workdiv/Traits.hpp>
-
-#include <alpaka/core/Assert.hpp>
-#include <alpaka/core/Concepts.hpp>
-#include <alpaka/core/Positioning.hpp>
-#include <alpaka/core/Unused.hpp>
-#include <alpaka/idx/MapIdx.hpp>
-
-#include <omp.h>
+#    include <alpaka/core/Assert.hpp>
+#    include <alpaka/core/Concepts.hpp>
+#    include <alpaka/core/Positioning.hpp>
+#    include <alpaka/core/Unused.hpp>
+#    include <alpaka/idx/MapIdx.hpp>
+#    include <alpaka/idx/Traits.hpp>
+#    include <alpaka/vec/Vec.hpp>
+#    include <alpaka/workdiv/Traits.hpp>
 
+#    include <omp.h>
 
 namespace alpaka
 {
-    namespace idx
+    namespace bt
     {
-        namespace bt
+        //#############################################################################
+        //! The OpenMP accelerator index provider. It has no data members and is neither copyable nor movable.
+        template<typename TDim, typename TIdx>
+        class IdxBtOmp : public concepts::Implements<ConceptIdxBt, IdxBtOmp<TDim, TIdx>>
         {
-            //#############################################################################
-            //! The OpenMP accelerator index provider.
-            template<
-                typename TDim,
-                typename TIdx>
-            class IdxBtOmp : public concepts::Implements<ConceptIdxBt, IdxBtOmp<TDim, TIdx>>
-            {
-            public:
-                //-----------------------------------------------------------------------------
-                IdxBtOmp() = default;
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST IdxBtOmp(IdxBtOmp const &) = delete;
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST IdxBtOmp(IdxBtOmp &&) = delete;
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST auto operator=(IdxBtOmp const &) -> IdxBtOmp & = delete;
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST auto operator=(IdxBtOmp &&) -> IdxBtOmp & = delete;
-                //-----------------------------------------------------------------------------
-                /*virtual*/ ~IdxBtOmp() = default;
-            };
-        }
-    }
+        public:
+            //-----------------------------------------------------------------------------
+            IdxBtOmp() = default;
+            //-----------------------------------------------------------------------------
+            IdxBtOmp(IdxBtOmp const&) = delete;
+            //-----------------------------------------------------------------------------
+            IdxBtOmp(IdxBtOmp&&) = delete;
+            //-----------------------------------------------------------------------------
+            auto operator=(IdxBtOmp const&) -> IdxBtOmp& = delete;
+            //-----------------------------------------------------------------------------
+            auto operator=(IdxBtOmp&&) -> IdxBtOmp& = delete;
+            //-----------------------------------------------------------------------------
+            /*virtual*/ ~IdxBtOmp() = default;
+        };
+    } // namespace bt
 
-    namespace dim
+    namespace traits
     {
-        namespace traits
+        //#############################################################################
+        //! The OpenMP accelerator index dimension get trait specialization: the dimensionality is TDim.
+        template<typename TDim, typename TIdx>
+        struct DimType<bt::IdxBtOmp<TDim, TIdx>>
         {
-            //#############################################################################
-            //! The OpenMP accelerator index dimension get trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct DimType<
-                idx::bt::IdxBtOmp<TDim, TIdx>>
-            {
-                using type = TDim;
-            };
-        }
-    }
-    namespace idx
-    {
-        namespace traits
+            using type = TDim;
+        };
+
+        //#############################################################################
+        //! The OpenMP accelerator block thread index get trait specialization for an arbitrary dimensionality TDim.
+        template<typename TDim, typename TIdx>
+        struct GetIdx<bt::IdxBtOmp<TDim, TIdx>, origin::Block, unit::Threads>
         {
-            //#############################################################################
-            //! The OpenMP accelerator block thread index get trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct GetIdx<
-                idx::bt::IdxBtOmp<TDim, TIdx>,
-                origin::Block,
-                unit::Threads>
+            //-----------------------------------------------------------------------------
+            //! \return The index of the current thread in the block, mapped from the linear OpenMP thread number.
+            template<typename TWorkDiv>
+            static auto getIdx(bt::IdxBtOmp<TDim, TIdx> const& idx, TWorkDiv const& workDiv) -> Vec<TDim, TIdx>
             {
-                //-----------------------------------------------------------------------------
-                //! \return The index of the current thread in the block.
-                template<
-                    typename TWorkDiv>
-                ALPAKA_FN_HOST static auto getIdx(
-                    idx::bt::IdxBtOmp<TDim, TIdx> const & idx,
-                    TWorkDiv const & workDiv)
-                -> vec::Vec<TDim, TIdx>
-                {
-                    alpaka::ignore_unused(idx);
-                    // We assume that the thread id is positive.
-                    ALPAKA_ASSERT(::omp_get_thread_num()>=0);
-                    // \TODO: Would it be faster to precompute the index and cache it inside an array?
-                    return idx::mapIdx<TDim::value>(
-                        vec::Vec<dim::DimInt<1u>, TIdx>(static_cast<TIdx>(::omp_get_thread_num())),
-                        workdiv::getWorkDiv<Block, Threads>(workDiv));
-                }
-            };
-        }
-    }
-    namespace idx
-    {
-        namespace traits
+                alpaka::ignore_unused(idx);
+                // We assume that the thread id is non-negative (it is cast to TIdx below).
+                ALPAKA_ASSERT(::omp_get_thread_num() >= 0);
+                // \TODO: Would it be faster to precompute the index and cache it inside an array?
+                return mapIdx<TDim::value>(
+                    Vec<DimInt<1u>, TIdx>(static_cast<TIdx>(::omp_get_thread_num())),
+                    getWorkDiv<Block, Threads>(workDiv));
+            }
+        };
+
+        template<typename TIdx>
+        struct GetIdx<bt::IdxBtOmp<DimInt<1u>, TIdx>, origin::Block, unit::Threads>
         {
-            //#############################################################################
-            //! The OpenMP accelerator block thread index idx type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct IdxType<
-                idx::bt::IdxBtOmp<TDim, TIdx>>
+            //-----------------------------------------------------------------------------
+            //! \return The index of the current thread in the block; in 1D this is the OpenMP thread number itself.
+            template<typename TWorkDiv>
+            static auto getIdx(bt::IdxBtOmp<DimInt<1u>, TIdx> const& idx, TWorkDiv const&) -> Vec<DimInt<1u>, TIdx>
             {
-                using type = TIdx;
-            };
-        }
-    }
-}
+                alpaka::ignore_unused(idx); // NOTE(review): no ALPAKA_ASSERT(thread num >= 0) here, unlike the ND case.
+                return Vec<DimInt<1u>, TIdx>(static_cast<TIdx>(::omp_get_thread_num()));
+            }
+        };
+
+        //#############################################################################
+        //! The OpenMP accelerator block thread index idx type trait specialization: indices have the type TIdx.
+        template<typename TDim, typename TIdx>
+        struct IdxType<bt::IdxBtOmp<TDim, TIdx>>
+        {
+            using type = TIdx;
+        };
+    } // namespace traits
+} // namespace alpaka
 
 #endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/idx/bt/IdxBtRefFiberIdMap.hpp b/thirdParty/cupla/alpaka/include/alpaka/idx/bt/IdxBtRefFiberIdMap.hpp
index c61fbcc3c0..1a769762e3 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/idx/bt/IdxBtRefFiberIdMap.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/idx/bt/IdxBtRefFiberIdMap.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Axel Huebl, Benjamin Worpitz, Matthias Werner
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -11,119 +11,86 @@
 
 #ifdef ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLED
 
-#include <alpaka/idx/Traits.hpp>
+#    include <alpaka/core/Assert.hpp>
+#    include <alpaka/core/Concepts.hpp>
+#    include <alpaka/core/Fibers.hpp>
+#    include <alpaka/core/Positioning.hpp>
+#    include <alpaka/core/Unused.hpp>
+#    include <alpaka/idx/Traits.hpp>
+#    include <alpaka/vec/Vec.hpp>
 
-#include <alpaka/core/Assert.hpp>
-#include <alpaka/core/Concepts.hpp>
-#include <alpaka/core/Fibers.hpp>
-#include <alpaka/core/Positioning.hpp>
-#include <alpaka/core/Unused.hpp>
-#include <alpaka/vec/Vec.hpp>
-
-#include <map>
+#    include <map>
 
 namespace alpaka
 {
-    namespace idx
+    namespace bt
     {
-        namespace bt
+        //#############################################################################
+        //! The fibers accelerator index provider. It refers to an externally held map from fiber ids to indices.
+        template<typename TDim, typename TIdx>
+        class IdxBtRefFiberIdMap : public concepts::Implements<ConceptIdxBt, IdxBtRefFiberIdMap<TDim, TIdx>>
         {
-            //#############################################################################
-            //! The fibers accelerator index provider.
-            template<
-                typename TDim,
-                typename TIdx>
-            class IdxBtRefFiberIdMap : public concepts::Implements<ConceptIdxBt, IdxBtRefFiberIdMap<TDim, TIdx>>
-            {
-            public:
-                using FiberIdToIdxMap = std::map<boost::fibers::fiber::id, vec::Vec<TDim, TIdx>>;
+        public:
+            using FiberIdToIdxMap = std::map<boost::fibers::fiber::id, Vec<TDim, TIdx>>;
 
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST IdxBtRefFiberIdMap(
-                    FiberIdToIdxMap const & mFibersToIndices) :
-                    m_fibersToIndices(mFibersToIndices)
-                {}
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST IdxBtRefFiberIdMap(IdxBtRefFiberIdMap const &) = delete;
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST IdxBtRefFiberIdMap(IdxBtRefFiberIdMap &&) = delete;
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST auto operator=(IdxBtRefFiberIdMap const &) -> IdxBtRefFiberIdMap & = delete;
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST auto operator=(IdxBtRefFiberIdMap &&) -> IdxBtRefFiberIdMap & = delete;
-                //-----------------------------------------------------------------------------
-                /*virtual*/ ~IdxBtRefFiberIdMap() = default;
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST IdxBtRefFiberIdMap(FiberIdToIdxMap const& mFibersToIndices)
+                : m_fibersToIndices(mFibersToIndices)
+            {
+            }
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST IdxBtRefFiberIdMap(IdxBtRefFiberIdMap const&) = delete;
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST IdxBtRefFiberIdMap(IdxBtRefFiberIdMap&&) = delete;
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST auto operator=(IdxBtRefFiberIdMap const&) -> IdxBtRefFiberIdMap& = delete;
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST auto operator=(IdxBtRefFiberIdMap&&) -> IdxBtRefFiberIdMap& = delete;
+            //-----------------------------------------------------------------------------
+            /*virtual*/ ~IdxBtRefFiberIdMap() = default;
 
-            public:
-                FiberIdToIdxMap const & m_fibersToIndices; //!< The mapping of fiber id's to fiber indices.
-            };
-        }
-    }
+        public:
+            FiberIdToIdxMap const& m_fibersToIndices; //!< Non-owning reference to the mapping of fiber ids to indices.
+        };
+    } // namespace bt
 
-    namespace dim
+    namespace traits
     {
-        namespace traits
+        //#############################################################################
+        //! The CPU fibers accelerator index dimension get trait specialization: the dimensionality is TDim.
+        template<typename TDim, typename TIdx>
+        struct DimType<bt::IdxBtRefFiberIdMap<TDim, TIdx>>
         {
-            //#############################################################################
-            //! The CPU fibers accelerator index dimension get trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct DimType<
-                idx::bt::IdxBtRefFiberIdMap<TDim, TIdx>>
-            {
-                using type = TDim;
-            };
-        }
-    }
-    namespace idx
-    {
-        namespace traits
+            using type = TDim;
+        };
+
+        //#############################################################################
+        //! The CPU fibers accelerator block thread index get trait specialization.
+        template<typename TDim, typename TIdx>
+        struct GetIdx<bt::IdxBtRefFiberIdMap<TDim, TIdx>, origin::Block, unit::Threads>
         {
-            //#############################################################################
-            //! The CPU fibers accelerator block thread index get trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct GetIdx<
-                idx::bt::IdxBtRefFiberIdMap<TDim, TIdx>,
-                origin::Block,
-                unit::Threads>
+            //-----------------------------------------------------------------------------
+            //! \return The index of the current thread in the block, looked up in the map by the calling fiber's id.
+            template<typename TWorkDiv>
+            ALPAKA_FN_HOST static auto getIdx(bt::IdxBtRefFiberIdMap<TDim, TIdx> const& idx, TWorkDiv const& workDiv)
+                -> Vec<TDim, TIdx>
             {
-                //-----------------------------------------------------------------------------
-                //! \return The index of the current thread in the block.
-                template<
-                    typename TWorkDiv>
-                ALPAKA_FN_HOST static auto getIdx(
-                    idx::bt::IdxBtRefFiberIdMap<TDim, TIdx> const & idx,
-                    TWorkDiv const & workDiv)
-                -> vec::Vec<TDim, TIdx>
-                {
-                    alpaka::ignore_unused(workDiv);
-                    auto const fiberId(boost::this_fiber::get_id());
-                    auto const fiberEntry(idx.m_fibersToIndices.find(fiberId));
-                    ALPAKA_ASSERT(fiberEntry != idx.m_fibersToIndices.end());
-                    return fiberEntry->second;
-                }
-            };
-        }
-    }
-    namespace idx
-    {
-        namespace traits
+                alpaka::ignore_unused(workDiv);
+                auto const fiberId(boost::this_fiber::get_id());
+                auto const fiberEntry(idx.m_fibersToIndices.find(fiberId));
+                ALPAKA_ASSERT(fiberEntry != idx.m_fibersToIndices.end()); // The calling fiber must have a map entry.
+                return fiberEntry->second;
+            }
+        };
+
+        //#############################################################################
+        //! The CPU fibers accelerator block thread index idx type trait specialization: indices have the type TIdx.
+        template<typename TDim, typename TIdx>
+        struct IdxType<bt::IdxBtRefFiberIdMap<TDim, TIdx>>
         {
-            //#############################################################################
-            //! The CPU fibers accelerator block thread index idx type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct IdxType<
-                idx::bt::IdxBtRefFiberIdMap<TDim, TIdx>>
-            {
-                using type = TIdx;
-            };
-        }
-    }
-}
+            using type = TIdx;
+        };
+    } // namespace traits
+} // namespace alpaka
 
 #endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/idx/bt/IdxBtRefThreadIdMap.hpp b/thirdParty/cupla/alpaka/include/alpaka/idx/bt/IdxBtRefThreadIdMap.hpp
index e43d83a672..c5c67d875e 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/idx/bt/IdxBtRefThreadIdMap.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/idx/bt/IdxBtRefThreadIdMap.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Axel Huebl, Benjamin Worpitz, Matthias Werner
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -11,119 +11,86 @@
 
 #ifdef ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED
 
-#include <alpaka/idx/Traits.hpp>
+#    include <alpaka/core/Assert.hpp>
+#    include <alpaka/core/Concepts.hpp>
+#    include <alpaka/core/Positioning.hpp>
+#    include <alpaka/core/Unused.hpp>
+#    include <alpaka/idx/Traits.hpp>
+#    include <alpaka/vec/Vec.hpp>
 
-#include <alpaka/core/Assert.hpp>
-#include <alpaka/core/Concepts.hpp>
-#include <alpaka/core/Positioning.hpp>
-#include <alpaka/core/Unused.hpp>
-#include <alpaka/vec/Vec.hpp>
-
-#include <thread>
-#include <map>
+#    include <map>
+#    include <thread>
 
 namespace alpaka
 {
-    namespace idx
+    namespace bt
     {
-        namespace bt
+        //#############################################################################
+        //! The threads accelerator index provider. It refers to an externally held map from thread ids to indices.
+        template<typename TDim, typename TIdx>
+        class IdxBtRefThreadIdMap : public concepts::Implements<ConceptIdxBt, IdxBtRefThreadIdMap<TDim, TIdx>>
         {
-            //#############################################################################
-            //! The threads accelerator index provider.
-            template<
-                typename TDim,
-                typename TIdx>
-            class IdxBtRefThreadIdMap : public concepts::Implements<ConceptIdxBt, IdxBtRefThreadIdMap<TDim, TIdx>>
-            {
-            public:
-                using ThreadIdToIdxMap = std::map<std::thread::id, vec::Vec<TDim, TIdx>>;
+        public:
+            using ThreadIdToIdxMap = std::map<std::thread::id, Vec<TDim, TIdx>>;
 
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST IdxBtRefThreadIdMap(
-                    ThreadIdToIdxMap const & mThreadToIndices) :
-                    m_threadToIndexMap(mThreadToIndices)
-                {}
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST IdxBtRefThreadIdMap(IdxBtRefThreadIdMap const &) = delete;
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST IdxBtRefThreadIdMap(IdxBtRefThreadIdMap &&) = delete;
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST auto operator=(IdxBtRefThreadIdMap const &) -> IdxBtRefThreadIdMap & = delete;
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST auto operator=(IdxBtRefThreadIdMap &&) -> IdxBtRefThreadIdMap & = delete;
-                //-----------------------------------------------------------------------------
-                /*virtual*/ ~IdxBtRefThreadIdMap() = default;
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST IdxBtRefThreadIdMap(ThreadIdToIdxMap const& mThreadToIndices)
+                : m_threadToIndexMap(mThreadToIndices)
+            {
+            }
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST IdxBtRefThreadIdMap(IdxBtRefThreadIdMap const&) = delete;
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST IdxBtRefThreadIdMap(IdxBtRefThreadIdMap&&) = delete;
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST auto operator=(IdxBtRefThreadIdMap const&) -> IdxBtRefThreadIdMap& = delete;
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST auto operator=(IdxBtRefThreadIdMap&&) -> IdxBtRefThreadIdMap& = delete;
+            //-----------------------------------------------------------------------------
+            /*virtual*/ ~IdxBtRefThreadIdMap() = default;
 
-            public:
-                ThreadIdToIdxMap const & m_threadToIndexMap;   //!< The mapping of thread id's to thread indices.
-            };
-        }
-    }
+        public:
+            ThreadIdToIdxMap const& m_threadToIndexMap; //!< Non-owning reference to the mapping of thread ids to indices.
+        };
+    } // namespace bt
 
-    namespace dim
+    namespace traits
     {
-        namespace traits
+        //#############################################################################
+        //! The CPU threads accelerator index dimension get trait specialization: the dimensionality is TDim.
+        template<typename TDim, typename TIdx>
+        struct DimType<bt::IdxBtRefThreadIdMap<TDim, TIdx>>
         {
-            //#############################################################################
-            //! The CPU threads accelerator index dimension get trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct DimType<
-                idx::bt::IdxBtRefThreadIdMap<TDim, TIdx>>
-            {
-                using type = TDim;
-            };
-        }
-    }
-    namespace idx
-    {
-        namespace traits
+            using type = TDim;
+        };
+
+        //#############################################################################
+        //! The CPU threads accelerator block thread index get trait specialization.
+        template<typename TDim, typename TIdx>
+        struct GetIdx<bt::IdxBtRefThreadIdMap<TDim, TIdx>, origin::Block, unit::Threads>
         {
-            //#############################################################################
-            //! The CPU threads accelerator block thread index get trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct GetIdx<
-                idx::bt::IdxBtRefThreadIdMap<TDim, TIdx>,
-                origin::Block,
-                unit::Threads>
+            //-----------------------------------------------------------------------------
+            //! \return The index of the current thread in the block, looked up in the map by the calling thread's id.
+            template<typename TWorkDiv>
+            ALPAKA_FN_HOST static auto getIdx(bt::IdxBtRefThreadIdMap<TDim, TIdx> const& idx, TWorkDiv const& workDiv)
+                -> Vec<TDim, TIdx>
             {
-                //-----------------------------------------------------------------------------
-                //! \return The index of the current thread in the block.
-                template<
-                    typename TWorkDiv>
-                ALPAKA_FN_HOST static auto getIdx(
-                    idx::bt::IdxBtRefThreadIdMap<TDim, TIdx> const & idx,
-                    TWorkDiv const & workDiv)
-                -> vec::Vec<TDim, TIdx>
-                {
-                    alpaka::ignore_unused(workDiv);
-                    auto const threadId(std::this_thread::get_id());
-                    auto const threadEntry(idx.m_threadToIndexMap.find(threadId));
-                    ALPAKA_ASSERT(threadEntry != idx.m_threadToIndexMap.end());
-                    return threadEntry->second;
-                }
-            };
-        }
-    }
-    namespace idx
-    {
-        namespace traits
+                alpaka::ignore_unused(workDiv);
+                auto const threadId(std::this_thread::get_id());
+                auto const threadEntry(idx.m_threadToIndexMap.find(threadId));
+                ALPAKA_ASSERT(threadEntry != idx.m_threadToIndexMap.end()); // The calling thread must have a map entry.
+                return threadEntry->second;
+            }
+        };
+
+        //#############################################################################
+        //! The CPU threads accelerator block thread index idx type trait specialization: indices have the type TIdx.
+        template<typename TDim, typename TIdx>
+        struct IdxType<bt::IdxBtRefThreadIdMap<TDim, TIdx>>
         {
-            //#############################################################################
-            //! The CPU threads accelerator block thread index idx type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct IdxType<
-                idx::bt::IdxBtRefThreadIdMap<TDim, TIdx>>
-            {
-                using type = TIdx;
-            };
-        }
-    }
-}
+            using type = TIdx;
+        };
+    } // namespace traits
+} // namespace alpaka
 
 #endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/idx/bt/IdxBtUniformCudaHipBuiltIn.hpp b/thirdParty/cupla/alpaka/include/alpaka/idx/bt/IdxBtUniformCudaHipBuiltIn.hpp
new file mode 100644
index 0000000000..209e1fa4d8
--- /dev/null
+++ b/thirdParty/cupla/alpaka/include/alpaka/idx/bt/IdxBtUniformCudaHipBuiltIn.hpp
@@ -0,0 +1,106 @@
+/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Matthias Werner, René Widera
+ *
+ * This file is part of alpaka.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#pragma once
+
+#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
+
+#    include <alpaka/core/BoostPredef.hpp>
+
+#    if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && !BOOST_LANG_CUDA
+#        error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
+#    endif
+
+#    if defined(ALPAKA_ACC_GPU_HIP_ENABLED) && !BOOST_LANG_HIP
+#        error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
+#    endif
+
+#    include <alpaka/core/Concepts.hpp>
+#    include <alpaka/core/Positioning.hpp>
+#    include <alpaka/core/Unused.hpp>
+#    include <alpaka/idx/Traits.hpp>
+#    include <alpaka/vec/Vec.hpp>
+
+// Backend specific includes.
+#    if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
+#        include <alpaka/core/Cuda.hpp>
+#    else
+#        include <alpaka/core/Hip.hpp>
+#    endif
+
+namespace alpaka
+{
+    namespace bt
+    {
+        //#############################################################################
+        //! The CUDA/HIP accelerator ND block thread index provider.
+        template<typename TDim, typename TIdx>
+        class IdxBtUniformCudaHipBuiltIn
+            : public concepts::Implements<ConceptIdxBt, IdxBtUniformCudaHipBuiltIn<TDim, TIdx>>
+        {
+        public:
+            //-----------------------------------------------------------------------------
+            IdxBtUniformCudaHipBuiltIn() = default;
+            //-----------------------------------------------------------------------------
+            __device__ IdxBtUniformCudaHipBuiltIn(IdxBtUniformCudaHipBuiltIn const&) = delete;
+            //-----------------------------------------------------------------------------
+            __device__ IdxBtUniformCudaHipBuiltIn(IdxBtUniformCudaHipBuiltIn&&) = delete;
+            //-----------------------------------------------------------------------------
+            __device__ auto operator=(IdxBtUniformCudaHipBuiltIn const&) -> IdxBtUniformCudaHipBuiltIn& = delete;
+            //-----------------------------------------------------------------------------
+            __device__ auto operator=(IdxBtUniformCudaHipBuiltIn&&) -> IdxBtUniformCudaHipBuiltIn& = delete;
+            //-----------------------------------------------------------------------------
+            /*virtual*/ ~IdxBtUniformCudaHipBuiltIn() = default;
+        };
+    } // namespace bt
+
+    namespace traits
+    {
+        //#############################################################################
+        //! The GPU CUDA/HIP accelerator index dimension get trait specialization.
+        template<typename TDim, typename TIdx>
+        struct DimType<bt::IdxBtUniformCudaHipBuiltIn<TDim, TIdx>>
+        {
+            using type = TDim;
+        };
+
+        //#############################################################################
+        //! The GPU CUDA/HIP accelerator block thread index get trait specialization.
+        template<typename TDim, typename TIdx>
+        struct GetIdx<bt::IdxBtUniformCudaHipBuiltIn<TDim, TIdx>, origin::Block, unit::Threads>
+        {
+            //-----------------------------------------------------------------------------
+            //! \return The index of the current thread in the block.
+            template<typename TWorkDiv>
+            __device__ static auto getIdx(bt::IdxBtUniformCudaHipBuiltIn<TDim, TIdx> const& idx, TWorkDiv const&)
+                -> Vec<TDim, TIdx>
+            {
+                alpaka::ignore_unused(idx);
+#    if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
+                return castVec<TIdx>(getOffsetVecEnd<TDim>(threadIdx));
+#    else
+                return getOffsetVecEnd<TDim>(Vec<std::integral_constant<typename TDim::value_type, 3>, TIdx>(
+                    static_cast<TIdx>(hipThreadIdx_z),
+                    static_cast<TIdx>(hipThreadIdx_y),
+                    static_cast<TIdx>(hipThreadIdx_x)));
+#    endif
+            }
+        };
+
+        //#############################################################################
+        //! The GPU CUDA/HIP accelerator block thread index idx type trait specialization.
+        template<typename TDim, typename TIdx>
+        struct IdxType<bt::IdxBtUniformCudaHipBuiltIn<TDim, TIdx>>
+        {
+            using type = TIdx;
+        };
+    } // namespace traits
+} // namespace alpaka
+
+#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/idx/bt/IdxBtZero.hpp b/thirdParty/cupla/alpaka/include/alpaka/idx/bt/IdxBtZero.hpp
index 7ae7c4fee5..2c297d7f1a 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/idx/bt/IdxBtZero.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/idx/bt/IdxBtZero.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Axel Huebl, Benjamin Worpitz, Matthias Werner
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -9,103 +9,70 @@
 
 #pragma once
 
-#include <alpaka/idx/Traits.hpp>
-
 #include <alpaka/core/Concepts.hpp>
 #include <alpaka/core/Positioning.hpp>
 #include <alpaka/core/Unused.hpp>
+#include <alpaka/idx/Traits.hpp>
 #include <alpaka/vec/Vec.hpp>
 
 namespace alpaka
 {
-    namespace idx
+    namespace bt
     {
-        namespace bt
+        //#############################################################################
+        //! A zero block thread index provider.
+        template<typename TDim, typename TIdx>
+        class IdxBtZero : public concepts::Implements<ConceptIdxBt, IdxBtZero<TDim, TIdx>>
         {
-            //#############################################################################
-            //! A zero block thread index provider.
-            template<
-                typename TDim,
-                typename TIdx>
-            class IdxBtZero : public concepts::Implements<ConceptIdxBt, IdxBtZero<TDim, TIdx>>
-            {
-            public:
-                //-----------------------------------------------------------------------------
-                IdxBtZero() = default;
-                //-----------------------------------------------------------------------------
-                IdxBtZero(IdxBtZero const &) = delete;
-                //-----------------------------------------------------------------------------
-                IdxBtZero(IdxBtZero &&) = delete;
-                //-----------------------------------------------------------------------------
-                auto operator=(IdxBtZero const &) -> IdxBtZero & = delete;
-                //-----------------------------------------------------------------------------
-                auto operator=(IdxBtZero &&) -> IdxBtZero & = delete;
-                //-----------------------------------------------------------------------------
-                /*virtual*/ ~IdxBtZero() = default;
-            };
-        }
-    }
+        public:
+            //-----------------------------------------------------------------------------
+            IdxBtZero() = default;
+            //-----------------------------------------------------------------------------
+            IdxBtZero(IdxBtZero const&) = delete;
+            //-----------------------------------------------------------------------------
+            IdxBtZero(IdxBtZero&&) = delete;
+            //-----------------------------------------------------------------------------
+            auto operator=(IdxBtZero const&) -> IdxBtZero& = delete;
+            //-----------------------------------------------------------------------------
+            auto operator=(IdxBtZero&&) -> IdxBtZero& = delete;
+            //-----------------------------------------------------------------------------
+            /*virtual*/ ~IdxBtZero() = default;
+        };
+    } // namespace bt
 
-    namespace dim
+    namespace traits
     {
-        namespace traits
+        //#############################################################################
+        //! The zero block thread index provider dimension get trait specialization.
+        template<typename TDim, typename TIdx>
+        struct DimType<bt::IdxBtZero<TDim, TIdx>>
         {
-            //#############################################################################
-            //! The zero block thread index provider dimension get trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct DimType<
-                idx::bt::IdxBtZero<TDim, TIdx>>
-            {
-                using type = TDim;
-            };
-        }
-    }
-    namespace idx
-    {
-        namespace traits
+            using type = TDim;
+        };
+
+        //#############################################################################
+        //! The zero block thread index provider block thread index get trait specialization.
+        template<typename TDim, typename TIdx>
+        struct GetIdx<bt::IdxBtZero<TDim, TIdx>, origin::Block, unit::Threads>
         {
-            //#############################################################################
-            //! The zero block thread index provider block thread index get trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct GetIdx<
-                idx::bt::IdxBtZero<TDim, TIdx>,
-                origin::Block,
-                unit::Threads>
+            //-----------------------------------------------------------------------------
+            //! \return The index of the current thread in the block.
+            template<typename TWorkDiv>
+            ALPAKA_FN_HOST static auto getIdx(bt::IdxBtZero<TDim, TIdx> const& idx, TWorkDiv const& workDiv)
+                -> Vec<TDim, TIdx>
             {
-                //-----------------------------------------------------------------------------
-                //! \return The index of the current thread in the block.
-                template<
-                    typename TWorkDiv>
-                ALPAKA_FN_HOST static auto getIdx(
-                    idx::bt::IdxBtZero<TDim, TIdx> const & idx,
-                    TWorkDiv const & workDiv)
-                -> vec::Vec<TDim, TIdx>
-                {
-                    alpaka::ignore_unused(idx);
-                    alpaka::ignore_unused(workDiv);
-                    return vec::Vec<TDim, TIdx>::zeros();
-                }
-            };
-        }
-    }
-    namespace idx
-    {
-        namespace traits
+                alpaka::ignore_unused(idx);
+                alpaka::ignore_unused(workDiv);
+                return Vec<TDim, TIdx>::zeros();
+            }
+        };
+
+        //#############################################################################
+        //! The zero block thread index idx type trait specialization.
+        template<typename TDim, typename TIdx>
+        struct IdxType<bt::IdxBtZero<TDim, TIdx>>
         {
-            //#############################################################################
-            //! The zero block thread index idx type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct IdxType<
-                idx::bt::IdxBtZero<TDim, TIdx>>
-            {
-                using type = TIdx;
-            };
-        }
-    }
-}
+            using type = TIdx;
+        };
+    } // namespace traits
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/include/alpaka/idx/gb/IdxGbCudaBuiltIn.hpp b/thirdParty/cupla/alpaka/include/alpaka/idx/gb/IdxGbCudaBuiltIn.hpp
deleted file mode 100644
index 47b57e1c4d..0000000000
--- a/thirdParty/cupla/alpaka/include/alpaka/idx/gb/IdxGbCudaBuiltIn.hpp
+++ /dev/null
@@ -1,121 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, René Widera
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_CUDA
-    #error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
-#endif
-
-#include <alpaka/idx/Traits.hpp>
-
-#include <alpaka/vec/Vec.hpp>
-#include <alpaka/core/Concepts.hpp>
-#include <alpaka/core/Cuda.hpp>
-#include <alpaka/core/Positioning.hpp>
-#include <alpaka/core/Unused.hpp>
-
-namespace alpaka
-{
-    namespace idx
-    {
-        namespace gb
-        {
-            //#############################################################################
-            //! The CUDA accelerator ND index provider.
-            template<
-                typename TDim,
-                typename TIdx>
-            class IdxGbCudaBuiltIn : public concepts::Implements<ConceptIdxGb, IdxGbCudaBuiltIn<TDim, TIdx>>
-            {
-            public:
-                //-----------------------------------------------------------------------------
-                IdxGbCudaBuiltIn() = default;
-                //-----------------------------------------------------------------------------
-                __device__ IdxGbCudaBuiltIn(IdxGbCudaBuiltIn const &) = delete;
-                //-----------------------------------------------------------------------------
-                __device__ IdxGbCudaBuiltIn(IdxGbCudaBuiltIn &&) = delete;
-                //-----------------------------------------------------------------------------
-                __device__ auto operator=(IdxGbCudaBuiltIn const & ) -> IdxGbCudaBuiltIn & = delete;
-                //-----------------------------------------------------------------------------
-                __device__ auto operator=(IdxGbCudaBuiltIn &&) -> IdxGbCudaBuiltIn & = delete;
-                //-----------------------------------------------------------------------------
-                /*virtual*/ ~IdxGbCudaBuiltIn() = default;
-            };
-        }
-    }
-
-    namespace dim
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The GPU CUDA accelerator index dimension get trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct DimType<
-                idx::gb::IdxGbCudaBuiltIn<TDim, TIdx>>
-            {
-                using type = TDim;
-            };
-        }
-    }
-    namespace idx
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The GPU CUDA accelerator grid block index get trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct GetIdx<
-                idx::gb::IdxGbCudaBuiltIn<TDim, TIdx>,
-                origin::Grid,
-                unit::Blocks>
-            {
-                //-----------------------------------------------------------------------------
-                //! \return The index of the current block in the grid.
-                template<
-                    typename TWorkDiv>
-                __device__ static auto getIdx(
-                    idx::gb::IdxGbCudaBuiltIn<TDim, TIdx> const & idx,
-                    TWorkDiv const &)
-                -> vec::Vec<TDim, TIdx>
-                {
-                    alpaka::ignore_unused(idx);
-                    return vec::cast<TIdx>(offset::getOffsetVecEnd<TDim>(blockIdx));
-                }
-            };
-        }
-    }
-    namespace idx
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The GPU CUDA accelerator grid block index idx type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct IdxType<
-                idx::gb::IdxGbCudaBuiltIn<TDim, TIdx>>
-            {
-                using type = TIdx;
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/idx/gb/IdxGbHipBuiltIn.hpp b/thirdParty/cupla/alpaka/include/alpaka/idx/gb/IdxGbHipBuiltIn.hpp
deleted file mode 100644
index 1dab7ffc2a..0000000000
--- a/thirdParty/cupla/alpaka/include/alpaka/idx/gb/IdxGbHipBuiltIn.hpp
+++ /dev/null
@@ -1,122 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Matthias Werner, René Widera
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_HIP
-    #error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
-#endif
-
-#include <alpaka/idx/Traits.hpp>
-
-#include <alpaka/vec/Vec.hpp>
-#include <alpaka/core/Concepts.hpp>
-#include <alpaka/core/Hip.hpp>
-#include <alpaka/core/Positioning.hpp>
-#include <alpaka/core/Unused.hpp>
-
-#include <type_traits>
-
-namespace alpaka
-{
-    namespace idx
-    {
-        namespace gb
-        {
-            //#############################################################################
-            //! The HIP accelerator ND index provider.
-            template<
-                typename TDim,
-                typename TIdx>
-            class IdxGbHipBuiltIn : public concepts::Implements<ConceptIdxGb, IdxGbHipBuiltIn<TDim, TIdx>>
-            {
-            public:
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST_ACC IdxGbHipBuiltIn() = default;
-                //-----------------------------------------------------------------------------
-                IdxGbHipBuiltIn(IdxGbHipBuiltIn const &) = delete;
-                //-----------------------------------------------------------------------------
-                IdxGbHipBuiltIn(IdxGbHipBuiltIn &&) = delete;
-                //-----------------------------------------------------------------------------
-                auto operator=(IdxGbHipBuiltIn const & ) -> IdxGbHipBuiltIn & = delete;
-                //-----------------------------------------------------------------------------
-                auto operator=(IdxGbHipBuiltIn &&) -> IdxGbHipBuiltIn & = delete;
-                //-----------------------------------------------------------------------------
-                /*virtual*/ ALPAKA_FN_HOST_ACC ~IdxGbHipBuiltIn() = default;
-            };
-        }
-    }
-
-    namespace dim
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The GPU HIP accelerator index dimension get trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct DimType<
-                idx::gb::IdxGbHipBuiltIn<TDim, TIdx>>
-            {
-                using type = TDim;
-            };
-        }
-    }
-    namespace idx
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The GPU HIP accelerator grid block index get trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct GetIdx<
-                idx::gb::IdxGbHipBuiltIn<TDim, TIdx>,
-                origin::Grid,
-                unit::Blocks>
-            {
-                //-----------------------------------------------------------------------------
-                //! \return The index of the current block in the grid.
-                template<
-                    typename TWorkDiv>
-                ALPAKA_FN_HOST_ACC static auto getIdx(
-                    idx::gb::IdxGbHipBuiltIn<TDim, TIdx> const & idx,
-                    TWorkDiv const &)
-                -> vec::Vec<TDim, TIdx>
-                {
-                    alpaka::ignore_unused(idx);
-                    return offset::getOffsetVecEnd<TDim>(
-                        vec::Vec<std::integral_constant<typename TDim::value_type, 3>, TIdx>(
-                            static_cast<TIdx>(hipBlockIdx_z),
-                            static_cast<TIdx>(hipBlockIdx_y),
-                            static_cast<TIdx>(hipBlockIdx_x)));
-                }
-            };
-
-            //#############################################################################
-            //! The GPU HIP accelerator grid block index idx type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct IdxType<
-                idx::gb::IdxGbHipBuiltIn<TDim, TIdx>>
-            {
-                using type = TIdx;
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/idx/gb/IdxGbLinear.hpp b/thirdParty/cupla/alpaka/include/alpaka/idx/gb/IdxGbLinear.hpp
new file mode 100644
index 0000000000..cb20356c43
--- /dev/null
+++ b/thirdParty/cupla/alpaka/include/alpaka/idx/gb/IdxGbLinear.hpp
@@ -0,0 +1,96 @@
+/* Copyright 2020 Jeffrey Kelling
+ *
+ * This file is part of alpaka.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#pragma once
+
+#include <alpaka/core/Concepts.hpp>
+#include <alpaka/core/Positioning.hpp>
+#include <alpaka/core/Unused.hpp>
+#include <alpaka/idx/MapIdx.hpp>
+#include <alpaka/idx/Traits.hpp>
+#include <alpaka/vec/Vec.hpp>
+#include <alpaka/workdiv/Traits.hpp>
+
+namespace alpaka
+{
+    namespace gb
+    {
+        //#############################################################################
+        //! General ND index provider based on a linear index.
+        template<typename TDim, typename TIdx>
+        class IdxGbLinear : public concepts::Implements<ConceptIdxGb, IdxGbLinear<TDim, TIdx>>
+        {
+        public:
+            //-----------------------------------------------------------------------------
+            IdxGbLinear(const TIdx& teamOffset = static_cast<TIdx>(0u)) : m_gridBlockIdx(teamOffset)
+            {
+            }
+            //-----------------------------------------------------------------------------
+            IdxGbLinear(IdxGbLinear const&) = delete;
+            //-----------------------------------------------------------------------------
+            IdxGbLinear(IdxGbLinear&&) = delete;
+            //-----------------------------------------------------------------------------
+            auto operator=(IdxGbLinear const&) -> IdxGbLinear& = delete;
+            //-----------------------------------------------------------------------------
+            auto operator=(IdxGbLinear&&) -> IdxGbLinear& = delete;
+            //-----------------------------------------------------------------------------
+            /*virtual*/ ~IdxGbLinear() = default;
+
+            TIdx const m_gridBlockIdx;
+        };
+    } // namespace gb
+
+    namespace traits
+    {
+        //#############################################################################
+        //! The IdxGbLinear index dimension get trait specialization.
+        template<typename TDim, typename TIdx>
+        struct DimType<gb::IdxGbLinear<TDim, TIdx>>
+        {
+            using type = TDim;
+        };
+
+        //#############################################################################
+        //! The IdxGbLinear grid block index get trait specialization.
+        template<typename TDim, typename TIdx>
+        struct GetIdx<gb::IdxGbLinear<TDim, TIdx>, origin::Grid, unit::Blocks>
+        {
+            //-----------------------------------------------------------------------------
+            //! \return The index of the current block in the grid.
+            template<typename TWorkDiv>
+            static auto getIdx(gb::IdxGbLinear<TDim, TIdx> const& idx, TWorkDiv const& workDiv) -> Vec<TDim, TIdx>
+            {
+                // \TODO: Would it be faster to precompute the index and cache it inside an array?
+                return mapIdx<TDim::value>(
+                    Vec<DimInt<1u>, TIdx>(idx.m_gridBlockIdx),
+                    getWorkDiv<Grid, Blocks>(workDiv));
+            }
+        };
+
+        template<typename TIdx>
+        struct GetIdx<gb::IdxGbLinear<DimInt<1u>, TIdx>, origin::Grid, unit::Blocks>
+        {
+            //-----------------------------------------------------------------------------
+            //! \return The index of the current block in the grid.
+            template<typename TWorkDiv>
+            static auto getIdx(gb::IdxGbLinear<DimInt<1u>, TIdx> const& idx, TWorkDiv const&) -> Vec<DimInt<1u>, TIdx>
+            {
+                return idx.m_gridBlockIdx;
+            }
+        };
+
+        //#############################################################################
+        //! The IdxGbLinear grid block index idx type trait specialization.
+        template<typename TDim, typename TIdx>
+        struct IdxType<gb::IdxGbLinear<TDim, TIdx>>
+        {
+            using type = TIdx;
+        };
+    } // namespace traits
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/include/alpaka/idx/gb/IdxGbRef.hpp b/thirdParty/cupla/alpaka/include/alpaka/idx/gb/IdxGbRef.hpp
index 1bf7d9426f..95ff1adf4f 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/idx/gb/IdxGbRef.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/idx/gb/IdxGbRef.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Axel Huebl, Benjamin Worpitz, Matthias Werner
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -9,109 +9,75 @@
 
 #pragma once
 
-#include <alpaka/idx/Traits.hpp>
-#include <alpaka/dim/Traits.hpp>
-
 #include <alpaka/core/Concepts.hpp>
 #include <alpaka/core/Positioning.hpp>
 #include <alpaka/core/Unused.hpp>
+#include <alpaka/dim/Traits.hpp>
+#include <alpaka/idx/Traits.hpp>
 #include <alpaka/vec/Vec.hpp>
 
 namespace alpaka
 {
-    namespace idx
+    namespace gb
     {
-        namespace gb
+        //#############################################################################
+        //! An IdxGbRef grid block index.
+        template<typename TDim, typename TIdx>
+        class IdxGbRef : public concepts::Implements<ConceptIdxGb, IdxGbRef<TDim, TIdx>>
         {
-            //#############################################################################
-            //! A IdxGbRef grid block index.
-            template<
-                typename TDim,
-                typename TIdx>
-            class IdxGbRef : public concepts::Implements<ConceptIdxGb, IdxGbRef<TDim, TIdx>>
+        public:
+            //-----------------------------------------------------------------------------
+            IdxGbRef(Vec<TDim, TIdx> const& gridBlockIdx) : m_gridBlockIdx(gridBlockIdx)
             {
-            public:
-                //-----------------------------------------------------------------------------
-                IdxGbRef(
-                    vec::Vec<TDim, TIdx> const & gridBlockIdx) :
-                        m_gridBlockIdx(gridBlockIdx)
-                {}
-                //-----------------------------------------------------------------------------
-                IdxGbRef(IdxGbRef const &) = delete;
-                //-----------------------------------------------------------------------------
-                IdxGbRef(IdxGbRef &&) = delete;
-                //-----------------------------------------------------------------------------
-                auto operator=(IdxGbRef const &) -> IdxGbRef & = delete;
-                //-----------------------------------------------------------------------------
-                auto operator=(IdxGbRef &&) -> IdxGbRef & = delete;
-                //-----------------------------------------------------------------------------
-                /*virtual*/ ~IdxGbRef() = default;
+            }
+            //-----------------------------------------------------------------------------
+            IdxGbRef(IdxGbRef const&) = delete;
+            //-----------------------------------------------------------------------------
+            IdxGbRef(IdxGbRef&&) = delete;
+            //-----------------------------------------------------------------------------
+            auto operator=(IdxGbRef const&) -> IdxGbRef& = delete;
+            //-----------------------------------------------------------------------------
+            auto operator=(IdxGbRef&&) -> IdxGbRef& = delete;
+            //-----------------------------------------------------------------------------
+            /*virtual*/ ~IdxGbRef() = default;
 
-            public:
-                vec::Vec<TDim, TIdx> const & m_gridBlockIdx;
-            };
-        }
-    }
+        public:
+            Vec<TDim, TIdx> const& m_gridBlockIdx;
+        };
+    } // namespace gb
 
-    namespace dim
+    namespace traits
     {
-        namespace traits
+        //#############################################################################
+        //! The IdxGbRef grid block index dimension get trait specialization.
+        template<typename TDim, typename TIdx>
+        struct DimType<gb::IdxGbRef<TDim, TIdx>>
         {
-            //#############################################################################
-            //! The IdxGbRef grid block index dimension get trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct DimType<
-                idx::gb::IdxGbRef<TDim, TIdx>>
-            {
-                using type = TDim;
-            };
-        }
-    }
-    namespace idx
-    {
-        namespace traits
+            using type = TDim;
+        };
+
+        //#############################################################################
+        //! The IdxGbRef grid block index get trait specialization.
+        template<typename TDim, typename TIdx>
+        struct GetIdx<gb::IdxGbRef<TDim, TIdx>, origin::Grid, unit::Blocks>
         {
-            //#############################################################################
-            //! The IdxGbRef grid block index grid block index get trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct GetIdx<
-                idx::gb::IdxGbRef<TDim, TIdx>,
-                origin::Grid,
-                unit::Blocks>
+            //-----------------------------------------------------------------------------
+            //! \return The index of the current block in the grid.
+            template<typename TWorkDiv>
+            ALPAKA_FN_HOST static auto getIdx(gb::IdxGbRef<TDim, TIdx> const& idx, TWorkDiv const& workDiv)
+                -> Vec<TDim, TIdx>
             {
-                //-----------------------------------------------------------------------------
-                //! \return The index of the current block in the grid.
-                template<
-                    typename TWorkDiv>
-                ALPAKA_FN_HOST static auto getIdx(
-                    idx::gb::IdxGbRef<TDim, TIdx> const & idx,
-                    TWorkDiv const & workDiv)
-                -> vec::Vec<TDim, TIdx>
-                {
-                    alpaka::ignore_unused(workDiv);
-                    return idx.m_gridBlockIdx;
-                }
-            };
-        }
-    }
-    namespace idx
-    {
-        namespace traits
+                alpaka::ignore_unused(workDiv);
+                return idx.m_gridBlockIdx;
+            }
+        };
+
+        //#############################################################################
+        //! The IdxGbRef grid block index idx type trait specialization.
+        template<typename TDim, typename TIdx>
+        struct IdxType<gb::IdxGbRef<TDim, TIdx>>
         {
-            //#############################################################################
-            //! The IdxGbRef grid block index idx type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct IdxType<
-                idx::gb::IdxGbRef<TDim, TIdx>>
-            {
-                using type = TIdx;
-            };
-        }
-    }
-}
+            using type = TIdx;
+        };
+    } // namespace traits
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/include/alpaka/idx/gb/IdxGbUniformCudaHipBuiltIn.hpp b/thirdParty/cupla/alpaka/include/alpaka/idx/gb/IdxGbUniformCudaHipBuiltIn.hpp
new file mode 100644
index 0000000000..de9fd25eae
--- /dev/null
+++ b/thirdParty/cupla/alpaka/include/alpaka/idx/gb/IdxGbUniformCudaHipBuiltIn.hpp
@@ -0,0 +1,106 @@
+/* Copyright 2019 Axel Huebl, Benjamin Worpitz, René Widera, Matthias Werner
+ *
+ * This file is part of alpaka.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#pragma once
+
+#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
+
+#    include <alpaka/core/BoostPredef.hpp>
+
+#    if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && !BOOST_LANG_CUDA
+#        error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
+#    endif
+
+#    if defined(ALPAKA_ACC_GPU_HIP_ENABLED) && !BOOST_LANG_HIP
+#        error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
+#    endif
+
+#    include <alpaka/core/Concepts.hpp>
+#    include <alpaka/core/Positioning.hpp>
+#    include <alpaka/core/Unused.hpp>
+#    include <alpaka/idx/Traits.hpp>
+#    include <alpaka/vec/Vec.hpp>
+
+// Backend specific includes.
+#    if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
+#        include <alpaka/core/Cuda.hpp>
+#    else
+#        include <alpaka/core/Hip.hpp>
+#    endif
+
+namespace alpaka
+{
+    namespace gb
+    {
+        //#############################################################################
+        //! The CUDA/HIP accelerator ND grid block index provider (no data members; copy and move are deleted).
+        template<typename TDim, typename TIdx>
+        class IdxGbUniformCudaHipBuiltIn
+            : public concepts::Implements<ConceptIdxGb, IdxGbUniformCudaHipBuiltIn<TDim, TIdx>>
+        {
+        public:
+            //-----------------------------------------------------------------------------
+            IdxGbUniformCudaHipBuiltIn() = default;
+            //-----------------------------------------------------------------------------
+            __device__ IdxGbUniformCudaHipBuiltIn(IdxGbUniformCudaHipBuiltIn const&) = delete;
+            //-----------------------------------------------------------------------------
+            __device__ IdxGbUniformCudaHipBuiltIn(IdxGbUniformCudaHipBuiltIn&&) = delete;
+            //-----------------------------------------------------------------------------
+            __device__ auto operator=(IdxGbUniformCudaHipBuiltIn const&) -> IdxGbUniformCudaHipBuiltIn& = delete;
+            //-----------------------------------------------------------------------------
+            __device__ auto operator=(IdxGbUniformCudaHipBuiltIn&&) -> IdxGbUniformCudaHipBuiltIn& = delete;
+            //-----------------------------------------------------------------------------
+            /*virtual*/ ~IdxGbUniformCudaHipBuiltIn() = default;
+        };
+    } // namespace gb
+
+    namespace traits
+    {
+        //#############################################################################
+        //! The GPU CUDA/HIP accelerator index dimension get trait specialization.
+        template<typename TDim, typename TIdx>
+        struct DimType<gb::IdxGbUniformCudaHipBuiltIn<TDim, TIdx>>
+        {
+            using type = TDim;
+        };
+
+        //#############################################################################
+        //! The GPU CUDA/HIP accelerator grid block index get trait specialization.
+        template<typename TDim, typename TIdx>
+        struct GetIdx<gb::IdxGbUniformCudaHipBuiltIn<TDim, TIdx>, origin::Grid, unit::Blocks>
+        {
+            //-----------------------------------------------------------------------------
+            //! \return The index of the current block in the grid, taken from the built-in block index.
+            template<typename TWorkDiv>
+            __device__ static auto getIdx(gb::IdxGbUniformCudaHipBuiltIn<TDim, TIdx> const& idx, TWorkDiv const&)
+                -> Vec<TDim, TIdx>
+            {
+                alpaka::ignore_unused(idx);
+#    if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
+                return castVec<TIdx>(getOffsetVecEnd<TDim>(blockIdx));
+#    else
+                return getOffsetVecEnd<TDim>(Vec<std::integral_constant<typename TDim::value_type, 3>, TIdx>(
+                    static_cast<TIdx>(hipBlockIdx_z),
+                    static_cast<TIdx>(hipBlockIdx_y),
+                    static_cast<TIdx>(hipBlockIdx_x)));
+#    endif
+            }
+        };
+
+        //#############################################################################
+        //! The GPU CUDA/HIP accelerator grid block index idx type trait specialization.
+        template<typename TDim, typename TIdx>
+        struct IdxType<gb::IdxGbUniformCudaHipBuiltIn<TDim, TIdx>>
+        {
+            using type = TIdx;
+        };
+    } // namespace traits
+} // namespace alpaka
+
+#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/intrinsic/IntrinsicCpu.hpp b/thirdParty/cupla/alpaka/include/alpaka/intrinsic/IntrinsicCpu.hpp
new file mode 100644
index 0000000000..edc365b277
--- /dev/null
+++ b/thirdParty/cupla/alpaka/include/alpaka/intrinsic/IntrinsicCpu.hpp
@@ -0,0 +1,117 @@
+/* Copyright 2020 Sergei Bastrakov
+ *
+ * This file is part of alpaka.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#pragma once
+
+#include <alpaka/core/BoostPredef.hpp>
+#include <alpaka/intrinsic/IntrinsicFallback.hpp>
+#include <alpaka/intrinsic/Traits.hpp>
+
+#include <bitset>
+
+#if BOOST_COMP_MSVC
+#    include <intrin.h>
+#endif
+
+namespace alpaka
+{
+    //#############################################################################
+    //! The CPU intrinsic implementation type (popcount/ffs via compiler builtins with a portable fallback).
+    class IntrinsicCpu : public concepts::Implements<ConceptIntrinsic, IntrinsicCpu>
+    {
+    public:
+        //-----------------------------------------------------------------------------
+        IntrinsicCpu() = default;
+        //-----------------------------------------------------------------------------
+        IntrinsicCpu(IntrinsicCpu const&) = delete;
+        //-----------------------------------------------------------------------------
+        IntrinsicCpu(IntrinsicCpu&&) = delete;
+        //-----------------------------------------------------------------------------
+        auto operator=(IntrinsicCpu const&) -> IntrinsicCpu& = delete;
+        //-----------------------------------------------------------------------------
+        auto operator=(IntrinsicCpu&&) -> IntrinsicCpu& = delete;
+        //-----------------------------------------------------------------------------
+        ~IntrinsicCpu() = default;
+    };
+
+    namespace traits
+    {
+        //#############################################################################
+        template<>
+        struct Popcount<IntrinsicCpu>
+        {
+            //-----------------------------------------------------------------------------
+            static auto popcount(IntrinsicCpu const& /*intrinsic*/, std::uint32_t value) -> std::int32_t
+            {
+#if BOOST_COMP_GNUC || BOOST_COMP_CLANG || BOOST_COMP_INTEL
+                return __builtin_popcount(value);
+#elif BOOST_COMP_MSVC
+                return static_cast<std::int32_t>(__popcnt(value)); // __popcnt returns unsigned int
+#else
+                // Fallback to the standard library: std::bitset::count() yields the number of set bits.
+                return static_cast<std::int32_t>(std::bitset<32>(value).count());
+#endif
+            }
+
+            //-----------------------------------------------------------------------------
+            static auto popcount(IntrinsicCpu const& /*intrinsic*/, std::uint64_t value) -> std::int32_t
+            {
+#if BOOST_COMP_GNUC || BOOST_COMP_CLANG || BOOST_COMP_INTEL
+                return __builtin_popcountll(value);
+#elif BOOST_COMP_MSVC
+                return static_cast<std::int32_t>(__popcnt64(value));
+#else
+                // Fallback to the standard library: std::bitset::count() yields the number of set bits.
+                return static_cast<std::int32_t>(std::bitset<64>(value).count());
+#endif
+            }
+        };
+
+        //#############################################################################
+        template<>
+        struct Ffs<IntrinsicCpu>
+        {
+            //-----------------------------------------------------------------------------
+            static auto ffs(IntrinsicCpu const& /*intrinsic*/, std::int32_t value) -> std::int32_t
+            {
+#if BOOST_COMP_GNUC || BOOST_COMP_CLANG || BOOST_COMP_INTEL
+                return __builtin_ffs(value);
+#elif BOOST_COMP_MSVC
+                // _BitScanForward stores the 0-based bit index and returns 0 for a zero mask. Based on
+                // https://gitlab.freedesktop.org/cairo/cairo/commit/f5167dc2e1a13d8c4e5d66d7178a24b9b5e7ac7a
+                unsigned long index = 0u;
+                if(_BitScanForward(&index, value) != 0)
+                    return static_cast<std::int32_t>(index + 1u);
+                else
+                    return 0;
+#else
+                return alpaka::detail::ffsFallback(value);
+#endif
+            }
+
+            //-----------------------------------------------------------------------------
+            static auto ffs(IntrinsicCpu const& /*intrinsic*/, std::int64_t value) -> std::int32_t
+            {
+#if BOOST_COMP_GNUC || BOOST_COMP_CLANG || BOOST_COMP_INTEL
+                return __builtin_ffsll(value);
+#elif BOOST_COMP_MSVC
+                // _BitScanForward64 stores the 0-based bit index and returns 0 for a zero mask. Based on
+                // https://gitlab.freedesktop.org/cairo/cairo/commit/f5167dc2e1a13d8c4e5d66d7178a24b9b5e7ac7a
+                unsigned long index = 0u;
+                if(_BitScanForward64(&index, value) != 0)
+                    return static_cast<std::int32_t>(index + 1u);
+                else
+                    return 0;
+#else
+                return alpaka::detail::ffsFallback(value);
+#endif
+            }
+        };
+    } // namespace traits
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/include/alpaka/intrinsic/IntrinsicFallback.hpp b/thirdParty/cupla/alpaka/include/alpaka/intrinsic/IntrinsicFallback.hpp
new file mode 100644
index 0000000000..f357ea1688
--- /dev/null
+++ b/thirdParty/cupla/alpaka/include/alpaka/intrinsic/IntrinsicFallback.hpp
@@ -0,0 +1,104 @@
+/* Copyright 2020 Sergei Bastrakov, Jeffrey Kelling
+ *
+ * This file is part of alpaka.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#pragma once
+
+#include <alpaka/intrinsic/Traits.hpp>
+
+namespace alpaka
+{
+    namespace detail
+    {
+        //#############################################################################
+        //! Fallback implementation of popcount: counts the set bits by testing and shifting out the lowest bit.
+        template<typename TValue>
+        static auto popcountFallback(TValue value) -> std::int32_t
+        {
+            TValue count = 0;
+            while(value != 0)
+            {
+                count += value & 1u;
+                value >>= 1u;
+            }
+            return static_cast<std::int32_t>(count);
+        }
+
+        //#############################################################################
+        //! Fallback implementation of ffs: 1-based position of the lowest set bit, or 0 if value is 0.
+        template<typename TValue>
+        static auto ffsFallback(TValue value) -> std::int32_t
+        {
+            if(value == 0)
+                return 0;
+            std::int32_t result = 1;
+            while((value & 1) == 0)
+            {
+                value >>= 1;
+                result++;
+            }
+            return result;
+        }
+    } // namespace detail
+
+    //#############################################################################
+    //! The fallback intrinsic implementation type (popcount/ffs via the portable loops in alpaka::detail).
+    class IntrinsicFallback : public concepts::Implements<ConceptIntrinsic, IntrinsicFallback>
+    {
+    public:
+        //-----------------------------------------------------------------------------
+        IntrinsicFallback() = default;
+        //-----------------------------------------------------------------------------
+        IntrinsicFallback(IntrinsicFallback const&) = delete;
+        //-----------------------------------------------------------------------------
+        IntrinsicFallback(IntrinsicFallback&&) = delete;
+        //-----------------------------------------------------------------------------
+        auto operator=(IntrinsicFallback const&) -> IntrinsicFallback& = delete;
+        //-----------------------------------------------------------------------------
+        auto operator=(IntrinsicFallback&&) -> IntrinsicFallback& = delete;
+        //-----------------------------------------------------------------------------
+        ~IntrinsicFallback() = default;
+    };
+
+    namespace traits
+    {
+        //#############################################################################
+        template<>
+        struct Popcount<IntrinsicFallback>
+        {
+            //-----------------------------------------------------------------------------
+            static auto popcount(IntrinsicFallback const& /*intrinsic*/, std::uint32_t value) -> std::int32_t
+            {
+                return alpaka::detail::popcountFallback(value);
+            }
+
+            //-----------------------------------------------------------------------------
+            static auto popcount(IntrinsicFallback const& /*intrinsic*/, std::uint64_t value) -> std::int32_t
+            {
+                return alpaka::detail::popcountFallback(value);
+            }
+        };
+
+        //#############################################################################
+        template<>
+        struct Ffs<IntrinsicFallback>
+        {
+            //-----------------------------------------------------------------------------
+            static auto ffs(IntrinsicFallback const& /*intrinsic*/, std::int32_t value) -> std::int32_t
+            {
+                return alpaka::detail::ffsFallback(value);
+            }
+
+            //-----------------------------------------------------------------------------
+            static auto ffs(IntrinsicFallback const& /*intrinsic*/, std::int64_t value) -> std::int32_t
+            {
+                return alpaka::detail::ffsFallback(value);
+            }
+        };
+    } // namespace traits
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/include/alpaka/intrinsic/IntrinsicUniformCudaHipBuiltIn.hpp b/thirdParty/cupla/alpaka/include/alpaka/intrinsic/IntrinsicUniformCudaHipBuiltIn.hpp
new file mode 100644
index 0000000000..d5b78b8e06
--- /dev/null
+++ b/thirdParty/cupla/alpaka/include/alpaka/intrinsic/IntrinsicUniformCudaHipBuiltIn.hpp
@@ -0,0 +1,98 @@
+/* Copyright 2020 Sergei Bastrakov
+ *
+ * This file is part of alpaka.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#pragma once
+
+#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
+
+#    include <alpaka/core/BoostPredef.hpp>
+
+#    if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && !BOOST_LANG_CUDA
+#        error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
+#    endif
+
+#    if defined(ALPAKA_ACC_GPU_HIP_ENABLED) && !BOOST_LANG_HIP
+#        error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
+#    endif
+
+#    include <alpaka/intrinsic/Traits.hpp>
+
+namespace alpaka
+{
+    //#############################################################################
+    //! The GPU CUDA/HIP intrinsic implementation type (popcount/ffs via __popc, __popcll, __ffs and __ffsll).
+    class IntrinsicUniformCudaHipBuiltIn
+        : public concepts::Implements<ConceptIntrinsic, IntrinsicUniformCudaHipBuiltIn>
+    {
+    public:
+        //-----------------------------------------------------------------------------
+        IntrinsicUniformCudaHipBuiltIn() = default;
+        //-----------------------------------------------------------------------------
+        __device__ IntrinsicUniformCudaHipBuiltIn(IntrinsicUniformCudaHipBuiltIn const&) = delete;
+        //-----------------------------------------------------------------------------
+        __device__ IntrinsicUniformCudaHipBuiltIn(IntrinsicUniformCudaHipBuiltIn&&) = delete;
+        //-----------------------------------------------------------------------------
+        __device__ auto operator=(IntrinsicUniformCudaHipBuiltIn const&) -> IntrinsicUniformCudaHipBuiltIn& = delete;
+        //-----------------------------------------------------------------------------
+        __device__ auto operator=(IntrinsicUniformCudaHipBuiltIn&&) -> IntrinsicUniformCudaHipBuiltIn& = delete;
+        //-----------------------------------------------------------------------------
+        ~IntrinsicUniformCudaHipBuiltIn() = default;
+    };
+
+    namespace traits
+    {
+        //#############################################################################
+        template<>
+        struct Popcount<IntrinsicUniformCudaHipBuiltIn>
+        {
+            //-----------------------------------------------------------------------------
+            __device__ static auto popcount(IntrinsicUniformCudaHipBuiltIn const& /*intrinsic*/, std::uint32_t value)
+                -> std::int32_t
+            {
+#    if BOOST_COMP_CLANG && BOOST_LANG_CUDA
+                return __popc(static_cast<int>(value));
+#    else
+                return __popc(static_cast<unsigned int>(value));
+#    endif
+            }
+
+            //-----------------------------------------------------------------------------
+            __device__ static auto popcount(IntrinsicUniformCudaHipBuiltIn const& /*intrinsic*/, std::uint64_t value)
+                -> std::int32_t
+            {
+#    if BOOST_COMP_CLANG && BOOST_LANG_CUDA
+                return __popcll(static_cast<long long>(value));
+#    else
+                return __popcll(static_cast<unsigned long long>(value));
+#    endif
+            }
+        };
+
+        //#############################################################################
+        template<>
+        struct Ffs<IntrinsicUniformCudaHipBuiltIn>
+        {
+            //-----------------------------------------------------------------------------
+            __device__ static auto ffs(IntrinsicUniformCudaHipBuiltIn const& /*intrinsic*/, std::int32_t value)
+                -> std::int32_t
+            {
+                return __ffs(static_cast<int>(value));
+            }
+
+            //-----------------------------------------------------------------------------
+            __device__ static auto ffs(IntrinsicUniformCudaHipBuiltIn const& /*intrinsic*/, std::int64_t value)
+                -> std::int32_t
+            {
+                return __ffsll(static_cast<long long>(value));
+            }
+        };
+    } // namespace traits
+} // namespace alpaka
+
+#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/intrinsic/Traits.hpp b/thirdParty/cupla/alpaka/include/alpaka/intrinsic/Traits.hpp
new file mode 100644
index 0000000000..d2dd01297b
--- /dev/null
+++ b/thirdParty/cupla/alpaka/include/alpaka/intrinsic/Traits.hpp
@@ -0,0 +1,96 @@
+/* Copyright 2020 Sergei Bastrakov
+ *
+ * This file is part of alpaka.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#pragma once
+
+#include <alpaka/core/Common.hpp>
+#include <alpaka/core/Concepts.hpp>
+
+#include <cstdint>
+#include <type_traits>
+
+namespace alpaka
+{
+    struct ConceptIntrinsic
+    {
+    };
+
+    //-----------------------------------------------------------------------------
+    //! The intrinsics traits, specialized per intrinsic implementation type.
+    namespace traits
+    {
+        //#############################################################################
+        //! The popcount trait; specializations provide static popcount(intrinsic, value) overloads.
+        template<typename TIntrinsic, typename TSfinae = void>
+        struct Popcount;
+
+        //#############################################################################
+        //! The ffs trait; specializations provide static ffs(intrinsic, value) overloads.
+        template<typename TIntrinsic, typename TSfinae = void>
+        struct Ffs;
+    } // namespace traits
+
+    //-----------------------------------------------------------------------------
+    //! Returns the number of 1 bits in the given 32-bit value.
+    //!
+    //! \tparam TIntrinsic The intrinsic implementation type.
+    //! \param intrinsic The intrinsic implementation.
+    //! \param value The input value.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename TIntrinsic>
+    ALPAKA_FN_ACC auto popcount(TIntrinsic const& intrinsic, std::uint32_t value) -> std::int32_t
+    {
+        using ImplementationBase = concepts::ImplementationBase<ConceptIntrinsic, TIntrinsic>;
+        return traits::Popcount<ImplementationBase>::popcount(intrinsic, value);
+    }
+
+    //-----------------------------------------------------------------------------
+    //! Returns the number of 1 bits in the given 64-bit value.
+    //!
+    //! \tparam TIntrinsic The intrinsic implementation type.
+    //! \param intrinsic The intrinsic implementation.
+    //! \param value The input value.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename TIntrinsic>
+    ALPAKA_FN_ACC auto popcount(TIntrinsic const& intrinsic, std::uint64_t value) -> std::int32_t
+    {
+        using ImplementationBase = concepts::ImplementationBase<ConceptIntrinsic, TIntrinsic>;
+        return traits::Popcount<ImplementationBase>::popcount(intrinsic, value);
+    }
+
+    //-----------------------------------------------------------------------------
+    //! Returns the 1-based position of the least significant bit set to 1
+    //! in the given 32-bit value. Returns 0 for input value 0.
+    //!
+    //! \tparam TIntrinsic The intrinsic implementation type.
+    //! \param intrinsic The intrinsic implementation.
+    //! \param value The input value.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename TIntrinsic>
+    ALPAKA_FN_ACC auto ffs(TIntrinsic const& intrinsic, std::int32_t value) -> std::int32_t
+    {
+        using ImplementationBase = concepts::ImplementationBase<ConceptIntrinsic, TIntrinsic>;
+        return traits::Ffs<ImplementationBase>::ffs(intrinsic, value);
+    }
+
+    //-----------------------------------------------------------------------------
+    //! Returns the 1-based position of the least significant bit set to 1
+    //! in the given 64-bit value. Returns 0 for input value 0.
+    //!
+    //! \tparam TIntrinsic The intrinsic implementation type.
+    //! \param intrinsic The intrinsic implementation.
+    //! \param value The input value.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename TIntrinsic>
+    ALPAKA_FN_ACC auto ffs(TIntrinsic const& intrinsic, std::int64_t value) -> std::int32_t
+    {
+        using ImplementationBase = concepts::ImplementationBase<ConceptIntrinsic, TIntrinsic>;
+        return traits::Ffs<ImplementationBase>::ffs(intrinsic, value);
+    }
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/include/alpaka/kernel/TaskKernelCpuFibers.hpp b/thirdParty/cupla/alpaka/include/alpaka/kernel/TaskKernelCpuFibers.hpp
index 3989b5a63d..5d8273305d 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/kernel/TaskKernelCpuFibers.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/kernel/TaskKernelCpuFibers.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Benjamin Worpitz, René Widera
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -12,381 +12,293 @@
 #ifdef ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLED
 
 // Specialized traits.
-#include <alpaka/acc/Traits.hpp>
-#include <alpaka/dev/Traits.hpp>
-#include <alpaka/dim/Traits.hpp>
-#include <alpaka/pltf/Traits.hpp>
-#include <alpaka/idx/Traits.hpp>
+#    include <alpaka/acc/Traits.hpp>
+#    include <alpaka/dev/Traits.hpp>
+#    include <alpaka/dim/Traits.hpp>
+#    include <alpaka/idx/Traits.hpp>
+#    include <alpaka/pltf/Traits.hpp>
 
 // Implementation details.
-#include <alpaka/acc/AccCpuFibers.hpp>
-#include <alpaka/dev/DevCpu.hpp>
-#include <alpaka/kernel/Traits.hpp>
-#include <alpaka/workdiv/WorkDivMembers.hpp>
-
-#include <alpaka/core/BoostPredef.hpp>
-#include <alpaka/core/Fibers.hpp>
-#include <alpaka/core/ConcurrentExecPool.hpp>
-#include <alpaka/meta/NdLoop.hpp>
-#include <alpaka/meta/ApplyTuple.hpp>
-
-#include <algorithm>
-#include <vector>
-#include <tuple>
-#include <type_traits>
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
-    #include <iostream>
-#endif
+#    include <alpaka/acc/AccCpuFibers.hpp>
+#    include <alpaka/core/BoostPredef.hpp>
+#    include <alpaka/core/ConcurrentExecPool.hpp>
+#    include <alpaka/core/Decay.hpp>
+#    include <alpaka/core/Fibers.hpp>
+#    include <alpaka/dev/DevCpu.hpp>
+#    include <alpaka/kernel/Traits.hpp>
+#    include <alpaka/meta/ApplyTuple.hpp>
+#    include <alpaka/meta/NdLoop.hpp>
+#    include <alpaka/workdiv/WorkDivMembers.hpp>
+
+#    include <algorithm>
+#    include <functional>
+#    include <tuple>
+#    include <type_traits>
+#    include <vector>
+#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
+#        include <iostream>
+#    endif
 
 namespace alpaka
 {
-    namespace kernel
+    //#############################################################################
+    //! The CPU fibers accelerator execution task.
+    template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
+    class TaskKernelCpuFibers final : public WorkDivMembers<TDim, TIdx>
     {
+    private:
         //#############################################################################
-        //! The CPU fibers accelerator execution task.
-        template<
-            typename TDim,
-            typename TIdx,
-            typename TKernelFnObj,
-            typename... TArgs>
-        class TaskKernelCpuFibers final :
-            public workdiv::WorkDivMembers<TDim, TIdx>
+        //! The type given to the ConcurrentExecPool for yielding the current fiber.
+        struct FiberPoolYield
         {
-        private:
-            //#############################################################################
-            //! The type given to the ConcurrentExecPool for yielding the current fiber.
-            struct FiberPoolYield
-            {
-                //-----------------------------------------------------------------------------
-                //! Yields the current fiber.
-                ALPAKA_FN_HOST static auto yield()
-                -> void
-                {
-                    boost::this_fiber::yield();
-                }
-            };
-            //#############################################################################
-            // Yielding is not faster for fibers. Therefore we use condition variables.
-            // It is better to wake them up when the conditions are fulfilled because this does not cost as much as for real threads.
-            using FiberPool = alpaka::core::detail::ConcurrentExecPool<
-                TIdx,
-                boost::fibers::fiber,               // The concurrent execution type.
-                boost::fibers::promise,             // The promise type.
-                FiberPoolYield,                     // The type yielding the current concurrent execution.
-                boost::fibers::mutex,               // The mutex type to use. Only required if TisYielding is true.
-                boost::fibers::condition_variable,  // The condition variable type to use. Only required if TisYielding is true.
-                false>;                             // If the threads should yield.
-
-        public:
             //-----------------------------------------------------------------------------
-            template<
-                typename TWorkDiv>
-            ALPAKA_FN_HOST TaskKernelCpuFibers(
-                TWorkDiv && workDiv,
-                TKernelFnObj const & kernelFnObj,
-                TArgs && ... args) :
-                    workdiv::WorkDivMembers<TDim, TIdx>(std::forward<TWorkDiv>(workDiv)),
-                    m_kernelFnObj(kernelFnObj),
-                    m_args(std::forward<TArgs>(args)...)
+            //! Yields the current fiber.
+            ALPAKA_FN_HOST static auto yield() -> void
             {
-                static_assert(
-                    dim::Dim<typename std::decay<TWorkDiv>::type>::value == TDim::value,
-                    "The work division and the execution task have to be of the same dimensionality!");
+                boost::this_fiber::yield();
             }
-            //-----------------------------------------------------------------------------
-            TaskKernelCpuFibers(TaskKernelCpuFibers const &) = default;
-            //-----------------------------------------------------------------------------
-            TaskKernelCpuFibers(TaskKernelCpuFibers &&) = default;
-            //-----------------------------------------------------------------------------
-            auto operator=(TaskKernelCpuFibers const &) -> TaskKernelCpuFibers & = default;
-            //-----------------------------------------------------------------------------
-            auto operator=(TaskKernelCpuFibers &&) -> TaskKernelCpuFibers & = default;
-            //-----------------------------------------------------------------------------
-            ~TaskKernelCpuFibers() = default;
+        };
+        //#############################################################################
+        // Yielding is not faster for fibers. Therefore we use condition variables.
+        // It is better to wake them up when the conditions are fulfilled because this does not cost as much as for
+        // real threads.
+        using FiberPool = alpaka::core::detail::ConcurrentExecPool<
+            TIdx,
+            boost::fibers::fiber, // The concurrent execution type.
+            boost::fibers::promise, // The promise type.
+            FiberPoolYield, // The type yielding the current concurrent execution.
+            boost::fibers::mutex, // The mutex type to use. Only required if TisYielding is true.
+            boost::fibers::condition_variable, // The condition variable type to use. Only required if TisYielding is
+                                               // true.
+            false>; // If the threads should yield.
 
-            //-----------------------------------------------------------------------------
-            //! Executes the kernel function object.
-            ALPAKA_FN_HOST auto operator()() const
-            -> void
-            {
-                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                auto const gridBlockExtent(
-                    workdiv::getWorkDiv<Grid, Blocks>(*this));
-                auto const blockThreadExtent(
-                    workdiv::getWorkDiv<Block, Threads>(*this));
-                auto const threadElemExtent(
-                    workdiv::getWorkDiv<Thread, Elems>(*this));
-
-                // Get the size of the block shared dynamic memory.
-                auto const blockSharedMemDynSizeBytes(
-                    meta::apply(
-                        [&](typename std::decay<TArgs>::type const & ... args)
-                        {
-                            return
-                                kernel::getBlockSharedMemDynSizeBytes<
-                                    acc::AccCpuFibers<TDim, TIdx>>(
-                                        m_kernelFnObj,
-                                        blockThreadExtent,
-                                        threadElemExtent,
-                                        args...);
-                        },
-                        m_args));
-
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                std::cout << __func__
-                    << " blockSharedMemDynSizeBytes: " << blockSharedMemDynSizeBytes << " B" << std::endl;
-#endif
-                acc::AccCpuFibers<TDim, TIdx> acc(
-                    *static_cast<workdiv::WorkDivMembers<TDim, TIdx> const *>(this),
-                    blockSharedMemDynSizeBytes);
+    public:
+        //-----------------------------------------------------------------------------
+        template<typename TWorkDiv>
+        ALPAKA_FN_HOST TaskKernelCpuFibers(TWorkDiv&& workDiv, TKernelFnObj const& kernelFnObj, TArgs&&... args)
+            : WorkDivMembers<TDim, TIdx>(std::forward<TWorkDiv>(workDiv))
+            , m_kernelFnObj(kernelFnObj)
+            , m_args(std::forward<TArgs>(args)...)
+        {
+            static_assert(
+                Dim<std::decay_t<TWorkDiv>>::value == TDim::value,
+                "The work division and the execution task have to be of the same dimensionality!");
+        }
+        //-----------------------------------------------------------------------------
+        TaskKernelCpuFibers(TaskKernelCpuFibers const&) = default;
+        //-----------------------------------------------------------------------------
+        TaskKernelCpuFibers(TaskKernelCpuFibers&&) = default;
+        //-----------------------------------------------------------------------------
+        auto operator=(TaskKernelCpuFibers const&) -> TaskKernelCpuFibers& = default;
+        //-----------------------------------------------------------------------------
+        auto operator=(TaskKernelCpuFibers&&) -> TaskKernelCpuFibers& = default;
+        //-----------------------------------------------------------------------------
+        ~TaskKernelCpuFibers() = default;
 
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                std::cout << __func__
-                    << " Fiber stack idx: " << boost::fibers::fixedsize_stack::traits_type::default_size() << " B" << std::endl;
-#endif
+        //-----------------------------------------------------------------------------
+        //! Executes the kernel function object.
+        ALPAKA_FN_HOST auto operator()() const -> void
+        {
+            ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
 
-                auto const blockThreadCount(blockThreadExtent.prod());
-                FiberPool fiberPool(blockThreadCount);
-
-                auto const boundGridBlockExecHost(
-                    meta::apply(
-                        [this, &acc, &blockThreadExtent, &fiberPool](typename std::decay<TArgs>::type const & ... args)
-                        {
-                            // Bind the kernel and its arguments to the grid block function.
-                            return
-                                std::bind(
-                                    &TaskKernelCpuFibers::gridBlockExecHost,
-                                    std::ref(acc),
-                                    std::placeholders::_1,
-                                    std::ref(blockThreadExtent),
-                                    std::ref(fiberPool),
-                                    std::ref(m_kernelFnObj),
-                                    std::ref(args)...);
-                        },
-                        m_args));
-
-                // Execute the blocks serially.
-                meta::ndLoopIncIdx(
-                    gridBlockExtent,
-                    boundGridBlockExecHost);
-            }
+            auto const gridBlockExtent(getWorkDiv<Grid, Blocks>(*this));
+            auto const blockThreadExtent(getWorkDiv<Block, Threads>(*this));
+            auto const threadElemExtent(getWorkDiv<Thread, Elems>(*this));
 
-        private:
-            //-----------------------------------------------------------------------------
-            //! The function executed for each grid block.
-            ALPAKA_FN_HOST static auto gridBlockExecHost(
-                acc::AccCpuFibers<TDim, TIdx> & acc,
-                vec::Vec<TDim, TIdx> const & gridBlockIdx,
-                vec::Vec<TDim, TIdx> const & blockThreadExtent,
-                FiberPool & fiberPool,
-                TKernelFnObj const & kernelFnObj,
-                typename std::decay<TArgs>::type const & ... args)
-            -> void
-            {
-                    // The futures of the threads in the current block.
-                std::vector<boost::fibers::future<void>> futuresInBlock;
-
-                // Set the index of the current block
-                acc.m_gridBlockIdx = gridBlockIdx;
-
-                // Bind the kernel and its arguments to the host block thread execution function.
-                auto boundBlockThreadExecHost(std::bind(
-                    &TaskKernelCpuFibers::blockThreadExecHost,
-                    std::ref(acc),
-                    std::ref(futuresInBlock),
-                    std::placeholders::_1,
-                    std::ref(fiberPool),
-                    std::ref(kernelFnObj),
-                    std::ref(args)...));
-                // Execute the block threads in parallel.
-                meta::ndLoopIncIdx(
-                    blockThreadExtent,
-                    boundBlockThreadExecHost);
-
-                // Wait for the completion of the block thread kernels.
-                std::for_each(
-                    futuresInBlock.begin(),
-                    futuresInBlock.end(),
-                    [](boost::fibers::future<void> & t)
-                    {
-                        t.wait();
-                    }
-                );
-                // Clean up.
-                futuresInBlock.clear();
-
-                acc.m_fibersToIndices.clear();
-
-                // After a block has been processed, the shared memory has to be deleted.
-                block::shared::st::freeMem(acc);
-            }
-            //-----------------------------------------------------------------------------
-            //! The function executed for each block thread.
-            ALPAKA_FN_HOST static auto blockThreadExecHost(
-                acc::AccCpuFibers<TDim, TIdx> & acc,
-#if !(BOOST_COMP_CLANG_CUDA && BOOST_ARCH_PTX)
-                std::vector<boost::fibers::future<void>> & futuresInBlock,
-                vec::Vec<TDim, TIdx> const & blockThreadIdx,
-                FiberPool & fiberPool,
-#else
-                std::vector<boost::fibers::future<void>> &,
-                vec::Vec<TDim, TIdx> const & blockThreadIdx,
-                FiberPool &,
-#endif
-                TKernelFnObj const & kernelFnObj,
-                typename std::decay<TArgs>::type const & ... args)
-            -> void
-            {
-                // Bind the arguments to the accelerator block thread execution function.
-                // The blockThreadIdx is required to be copied in because the variable will get changed for the next iteration/thread.
-                auto boundBlockThreadExecAcc(
-                    [&, blockThreadIdx]()
-                    {
-                        blockThreadFiberFn(
-                            acc,
-                            blockThreadIdx,
-                            kernelFnObj,
-                            args...);
-                    });
-                // Add the bound function to the block thread pool.
-// Workaround: Clang can not support this when natively compiling device code. See ConcurrentExecPool.hpp.
-#if !(BOOST_COMP_CLANG_CUDA && BOOST_ARCH_PTX)
-                futuresInBlock.emplace_back(
-                    fiberPool.enqueueTask(
-                        boundBlockThreadExecAcc));
-#else
-                (void)boundBlockThreadExecAcc;
-#endif
-            }
-            //-----------------------------------------------------------------------------
-            //! The fiber entry point.
-            ALPAKA_FN_HOST static auto blockThreadFiberFn(
-                acc::AccCpuFibers<TDim, TIdx> & acc,
-                vec::Vec<TDim, TIdx> const & blockThreadIdx,
-                TKernelFnObj const & kernelFnObj,
-                typename std::decay<TArgs>::type const & ... args)
-            -> void
-            {
-                // We have to store the fiber data before the kernel is calling any of the methods of this class depending on them.
-                auto const fiberId(boost::this_fiber::get_id());
-
-                // Set the master thread id.
-                if(blockThreadIdx.sum() == 0)
-                {
-                    acc.m_masterFiberId = fiberId;
-                }
+            // Get the size of the block shared dynamic memory.
+            auto const blockSharedMemDynSizeBytes(meta::apply(
+                [&](ALPAKA_DECAY_T(TArgs) const&... args) {
+                    return getBlockSharedMemDynSizeBytes<AccCpuFibers<TDim, TIdx>>(
+                        m_kernelFnObj,
+                        blockThreadExtent,
+                        threadElemExtent,
+                        args...);
+                },
+                m_args));
 
-                // Save the fiber id, and index.
-                acc.m_fibersToIndices.emplace(fiberId, blockThreadIdx);
+#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
+            std::cout << __func__ << " blockSharedMemDynSizeBytes: " << blockSharedMemDynSizeBytes << " B"
+                      << std::endl;
+#    endif
+            AccCpuFibers<TDim, TIdx> acc(
+                *static_cast<WorkDivMembers<TDim, TIdx> const*>(this),
+                blockSharedMemDynSizeBytes);
 
-                // Sync all threads so that the maps with thread id's are complete and not changed after here.
-                syncBlockThreads(acc);
+#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
+            std::cout << __func__
+                      << " Fiber stack idx: " << boost::fibers::fixedsize_stack::traits_type::default_size() << " B"
+                      << std::endl;
+#    endif
 
-                // Execute the kernel itself.
-                kernelFnObj(
-                    const_cast<acc::AccCpuFibers<TDim, TIdx> const &>(acc),
-                    args...);
+            auto const blockThreadCount(blockThreadExtent.prod());
+            FiberPool fiberPool(blockThreadCount);
 
-                // We have to sync all fibers here because if a fiber would finish before all fibers have been started, the new fiber could get a recycled (then duplicate) fiber id!
-                syncBlockThreads(acc);
-            }
+            auto const boundGridBlockExecHost(meta::apply(
+                [this, &acc, &blockThreadExtent, &fiberPool](ALPAKA_DECAY_T(TArgs) const&... args) {
+                    // Bind the kernel and its arguments to the grid block function.
+                    return std::bind(
+                        &TaskKernelCpuFibers::gridBlockExecHost,
+                        std::ref(acc),
+                        std::placeholders::_1,
+                        std::ref(blockThreadExtent),
+                        std::ref(fiberPool),
+                        std::ref(m_kernelFnObj),
+                        std::ref(args)...);
+                },
+                m_args));
 
-            TKernelFnObj m_kernelFnObj;
-            std::tuple<typename std::decay<TArgs>::type...> m_args;
-        };
-    }
+            // Execute the blocks serially.
+            meta::ndLoopIncIdx(gridBlockExtent, boundGridBlockExecHost);
+        }
 
-    namespace acc
-    {
-        namespace traits
+    private:
+        //-----------------------------------------------------------------------------
+        //! The function executed for each grid block.
+        ALPAKA_FN_HOST static auto gridBlockExecHost(
+            AccCpuFibers<TDim, TIdx>& acc,
+            Vec<TDim, TIdx> const& gridBlockIdx,
+            Vec<TDim, TIdx> const& blockThreadExtent,
+            FiberPool& fiberPool,
+            TKernelFnObj const& kernelFnObj,
+            std::decay_t<TArgs> const&... args) -> void
         {
-            //#############################################################################
-            //! The CPU fibers execution task accelerator type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx,
-                typename TKernelFnObj,
-                typename... TArgs>
-            struct AccType<
-                kernel::TaskKernelCpuFibers<TDim, TIdx, TKernelFnObj, TArgs...>>
-            {
-                using type = acc::AccCpuFibers<TDim, TIdx>;
-            };
+            // The futures of the threads in the current block.
+            std::vector<boost::fibers::future<void>> futuresInBlock;
+
+            // Set the index of the current block
+            acc.m_gridBlockIdx = gridBlockIdx;
+
+            // Bind the kernel and its arguments to the host block thread execution function.
+            auto boundBlockThreadExecHost(std::bind(
+                &TaskKernelCpuFibers::blockThreadExecHost,
+                std::ref(acc),
+                std::ref(futuresInBlock),
+                std::placeholders::_1,
+                std::ref(fiberPool),
+                std::ref(kernelFnObj),
+                std::ref(args)...));
+            // Execute the block threads in parallel.
+            meta::ndLoopIncIdx(blockThreadExtent, boundBlockThreadExecHost);
+
+            // Wait for the completion of the block thread kernels.
+            std::for_each(futuresInBlock.begin(), futuresInBlock.end(), [](boost::fibers::future<void>& t) {
+                t.wait();
+            });
+            // Clean up.
+            futuresInBlock.clear();
+
+            acc.m_fibersToIndices.clear();
+
+            // After a block has been processed, the shared memory has to be deleted.
+            freeSharedVars(acc);
         }
-    }
-    namespace dev
-    {
-        namespace traits
+        //-----------------------------------------------------------------------------
+        //! The function executed for each block thread.
+        ALPAKA_FN_HOST static auto blockThreadExecHost(
+            AccCpuFibers<TDim, TIdx>& acc,
+#    if !(BOOST_COMP_CLANG_CUDA && BOOST_ARCH_PTX)
+            std::vector<boost::fibers::future<void>>& futuresInBlock,
+            Vec<TDim, TIdx> const& blockThreadIdx,
+            FiberPool& fiberPool,
+#    else
+            std::vector<boost::fibers::future<void>>&,
+            Vec<TDim, TIdx> const& blockThreadIdx,
+            FiberPool&,
+#    endif
+            TKernelFnObj const& kernelFnObj,
+            std::decay_t<TArgs> const&... args) -> void
         {
-            //#############################################################################
-            //! The CPU fibers execution task device type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx,
-                typename TKernelFnObj,
-                typename... TArgs>
-            struct DevType<
-                kernel::TaskKernelCpuFibers<TDim, TIdx, TKernelFnObj, TArgs...>>
-            {
-                using type = dev::DevCpu;
-            };
+            // Bind the arguments to the accelerator block thread execution function.
+            // The blockThreadIdx is required to be copied in because the variable will get changed for the next
+            // iteration/thread.
+            auto boundBlockThreadExecAcc(
+                [&, blockThreadIdx]() { blockThreadFiberFn(acc, blockThreadIdx, kernelFnObj, args...); });
+            // Add the bound function to the block thread pool.
+// Workaround: Clang can not support this when natively compiling device code. See ConcurrentExecPool.hpp.
+#    if !(BOOST_COMP_CLANG_CUDA && BOOST_ARCH_PTX)
+            futuresInBlock.emplace_back(fiberPool.enqueueTask(boundBlockThreadExecAcc));
+#    else
+            (void) boundBlockThreadExecAcc;
+#    endif
         }
-    }
-    namespace dim
-    {
-        namespace traits
+        //-----------------------------------------------------------------------------
+        //! The fiber entry point.
+        ALPAKA_FN_HOST static auto blockThreadFiberFn(
+            AccCpuFibers<TDim, TIdx>& acc,
+            Vec<TDim, TIdx> const& blockThreadIdx,
+            TKernelFnObj const& kernelFnObj,
+            std::decay_t<TArgs> const&... args) -> void
         {
-            //#############################################################################
-            //! The CPU fibers execution task dimension getter trait specialization.
-            template<
-                typename TDim,
-                typename TIdx,
-                typename TKernelFnObj,
-                typename... TArgs>
-            struct DimType<
-                kernel::TaskKernelCpuFibers<TDim, TIdx, TKernelFnObj, TArgs...>>
+            // We have to store the fiber data before the kernel is calling any of the methods of this class depending
+            // on them.
+            auto const fiberId(boost::this_fiber::get_id());
+
+            // Set the master thread id.
+            if(blockThreadIdx.sum() == 0)
             {
-                using type = TDim;
-            };
+                acc.m_masterFiberId = fiberId;
+            }
+
+            // Save the fiber id, and index.
+            acc.m_fibersToIndices.emplace(fiberId, blockThreadIdx);
+
+            // Sync all threads so that the maps with thread id's are complete and not changed after here.
+            syncBlockThreads(acc);
+
+            // Execute the kernel itself.
+            kernelFnObj(const_cast<AccCpuFibers<TDim, TIdx> const&>(acc), args...);
+
+            // We have to sync all fibers here because if a fiber would finish before all fibers have been started, the
+            // new fiber could get a recycled (then duplicate) fiber id!
+            syncBlockThreads(acc);
         }
-    }
-    namespace pltf
+
+        TKernelFnObj m_kernelFnObj;
+        std::tuple<std::decay_t<TArgs>...> m_args;
+    };
+
+    namespace traits
     {
-        namespace traits
+        //#############################################################################
+        //! The CPU fibers execution task accelerator type trait specialization.
+        template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
+        struct AccType<TaskKernelCpuFibers<TDim, TIdx, TKernelFnObj, TArgs...>>
         {
-            //#############################################################################
-            //! The CPU fibers execution task platform type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx,
-                typename TKernelFnObj,
-                typename... TArgs>
-            struct PltfType<
-                kernel::TaskKernelCpuFibers<TDim, TIdx, TKernelFnObj, TArgs...>>
-            {
-                using type = pltf::PltfCpu;
-            };
-        }
-    }
-    namespace idx
-    {
-        namespace traits
+            using type = AccCpuFibers<TDim, TIdx>;
+        };
+
+        //#############################################################################
+        //! The CPU fibers execution task device type trait specialization.
+        template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
+        struct DevType<TaskKernelCpuFibers<TDim, TIdx, TKernelFnObj, TArgs...>>
         {
-            //#############################################################################
-            //! The CPU fibers execution task idx type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx,
-                typename TKernelFnObj,
-                typename... TArgs>
-            struct IdxType<
-                kernel::TaskKernelCpuFibers<TDim, TIdx, TKernelFnObj, TArgs...>>
-            {
-                using type = TIdx;
-            };
-        }
-    }
-}
+            using type = DevCpu;
+        };
+
+        //#############################################################################
+        //! The CPU fibers execution task dimension getter trait specialization.
+        template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
+        struct DimType<TaskKernelCpuFibers<TDim, TIdx, TKernelFnObj, TArgs...>>
+        {
+            using type = TDim;
+        };
+
+        //#############################################################################
+        //! The CPU fibers execution task platform type trait specialization.
+        template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
+        struct PltfType<TaskKernelCpuFibers<TDim, TIdx, TKernelFnObj, TArgs...>>
+        {
+            using type = PltfCpu;
+        };
+
+        //#############################################################################
+        //! The CPU fibers execution task idx type trait specialization.
+        template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
+        struct IdxType<TaskKernelCpuFibers<TDim, TIdx, TKernelFnObj, TArgs...>>
+        {
+            using type = TIdx;
+        };
+    } // namespace traits
+} // namespace alpaka
 
 #endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/kernel/TaskKernelCpuOmp2Blocks.hpp b/thirdParty/cupla/alpaka/include/alpaka/kernel/TaskKernelCpuOmp2Blocks.hpp
index 39b6d7d29b..4b3e8d4191 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/kernel/TaskKernelCpuOmp2Blocks.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/kernel/TaskKernelCpuOmp2Blocks.hpp
@@ -1,6 +1,6 @@
-/* Copyright 2019 Benjamin Worpitz, Bert Wesarg, René Widera
+/* Copyright 2019-2020 Benjamin Worpitz, Bert Wesarg, René Widera, Sergei Bastrakov
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -11,307 +11,262 @@
 
 #ifdef ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLED
 
-#if _OPENMP < 200203
-    #error If ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLED is set, the compiler has to support OpenMP 2.0 or higher!
-#endif
+#    if _OPENMP < 200203
+#        error If ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLED is set, the compiler has to support OpenMP 2.0 or higher!
+#    endif
 
 // Specialized traits.
-#include <alpaka/acc/Traits.hpp>
-#include <alpaka/dev/Traits.hpp>
-#include <alpaka/dim/Traits.hpp>
-#include <alpaka/pltf/Traits.hpp>
-#include <alpaka/idx/Traits.hpp>
+#    include <alpaka/acc/Traits.hpp>
+#    include <alpaka/dev/Traits.hpp>
+#    include <alpaka/dim/Traits.hpp>
+#    include <alpaka/idx/Traits.hpp>
+#    include <alpaka/pltf/Traits.hpp>
 
 // Implementation details.
-#include <alpaka/acc/AccCpuOmp2Blocks.hpp>
-#include <alpaka/dev/DevCpu.hpp>
-#include <alpaka/idx/MapIdx.hpp>
-#include <alpaka/kernel/Traits.hpp>
-#include <alpaka/workdiv/WorkDivMembers.hpp>
-
-#include <alpaka/meta/ApplyTuple.hpp>
+#    include <alpaka/acc/AccCpuOmp2Blocks.hpp>
+#    include <alpaka/core/Decay.hpp>
+#    include <alpaka/core/OmpSchedule.hpp>
+#    include <alpaka/dev/DevCpu.hpp>
+#    include <alpaka/idx/MapIdx.hpp>
+#    include <alpaka/kernel/Traits.hpp>
+#    include <alpaka/meta/ApplyTuple.hpp>
+#    include <alpaka/workdiv/WorkDivMembers.hpp>
 
-#include <omp.h>
+#    include <omp.h>
 
-#include <stdexcept>
-#include <tuple>
-#include <type_traits>
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
-    #include <iostream>
-#endif
+#    include <functional>
+#    include <stdexcept>
+#    include <tuple>
+#    include <type_traits>
+#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
+#        include <iostream>
+#    endif
 
 namespace alpaka
 {
-    namespace kernel
+    //#############################################################################
+    //! The CPU OpenMP 2.0 block accelerator execution task.
+    template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
+    class TaskKernelCpuOmp2Blocks final : public WorkDivMembers<TDim, TIdx>
     {
-        //#############################################################################
-        //! The CPU OpenMP 2.0 block accelerator execution task.
-        template<
-            typename TDim,
-            typename TIdx,
-            typename TKernelFnObj,
-            typename... TArgs>
-        class TaskKernelCpuOmp2Blocks final :
-            public workdiv::WorkDivMembers<TDim, TIdx>
+    public:
+        //-----------------------------------------------------------------------------
+        template<typename TWorkDiv>
+        ALPAKA_FN_HOST TaskKernelCpuOmp2Blocks(TWorkDiv&& workDiv, TKernelFnObj const& kernelFnObj, TArgs&&... args)
+            : WorkDivMembers<TDim, TIdx>(std::forward<TWorkDiv>(workDiv))
+            , m_kernelFnObj(kernelFnObj)
+            , m_args(std::forward<TArgs>(args)...)
         {
-        public:
-            //-----------------------------------------------------------------------------
-            template<
-                typename TWorkDiv>
-            ALPAKA_FN_HOST TaskKernelCpuOmp2Blocks(
-                TWorkDiv && workDiv,
-                TKernelFnObj const & kernelFnObj,
-                TArgs && ... args) :
-                    workdiv::WorkDivMembers<TDim, TIdx>(std::forward<TWorkDiv>(workDiv)),
-                    m_kernelFnObj(kernelFnObj),
-                    m_args(std::forward<TArgs>(args)...)
+            static_assert(
+                Dim<std::decay_t<TWorkDiv>>::value == TDim::value,
+                "The work division and the execution task have to be of the same dimensionality!");
+        }
+        //-----------------------------------------------------------------------------
+        TaskKernelCpuOmp2Blocks(TaskKernelCpuOmp2Blocks const&) = default;
+        //-----------------------------------------------------------------------------
+        TaskKernelCpuOmp2Blocks(TaskKernelCpuOmp2Blocks&&) = default;
+        //-----------------------------------------------------------------------------
+        auto operator=(TaskKernelCpuOmp2Blocks const&) -> TaskKernelCpuOmp2Blocks& = default;
+        //-----------------------------------------------------------------------------
+        auto operator=(TaskKernelCpuOmp2Blocks&&) -> TaskKernelCpuOmp2Blocks& = default;
+        //-----------------------------------------------------------------------------
+        ~TaskKernelCpuOmp2Blocks() = default;
+
+        //-----------------------------------------------------------------------------
+        //! Executes the kernel function object.
+        ALPAKA_FN_HOST auto operator()() const -> void
+        {
+            ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+            auto const gridBlockExtent(getWorkDiv<Grid, Blocks>(*this));
+            auto const blockThreadExtent(getWorkDiv<Block, Threads>(*this));
+            auto const threadElemExtent(getWorkDiv<Thread, Elems>(*this));
+
+            // Get the size of the block shared dynamic memory.
+            auto const blockSharedMemDynSizeBytes(meta::apply(
+                [&](ALPAKA_DECAY_T(TArgs) const&... args) {
+                    return getBlockSharedMemDynSizeBytes<AccCpuOmp2Blocks<TDim, TIdx>>(
+                        m_kernelFnObj,
+                        blockThreadExtent,
+                        threadElemExtent,
+                        args...);
+                },
+                m_args));
+
+#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
+            std::cout << __func__ << " blockSharedMemDynSizeBytes: " << blockSharedMemDynSizeBytes << " B"
+                      << std::endl;
+#    endif
+            // Bind all arguments except the accelerator.
+            // TODO: With C++14 we could create a perfectly argument forwarding function object within the constructor.
+            auto const boundKernelFnObj(meta::apply(
+                [this](ALPAKA_DECAY_T(TArgs) const&... args) {
+                    return std::bind(std::ref(m_kernelFnObj), std::placeholders::_1, std::ref(args)...);
+                },
+                m_args));
+
+            // The number of blocks in the grid.
+            TIdx const numBlocksInGrid(gridBlockExtent.prod());
+            if(blockThreadExtent.prod() != static_cast<TIdx>(1u))
             {
+                throw std::runtime_error("Only one thread per block allowed in the OpenMP 2.0 block accelerator!");
+            }
 
-                static_assert(
-                    dim::Dim<typename std::decay<TWorkDiv>::type>::value == TDim::value,
-                    "The work division and the execution task have to be of the same dimensionality!");
+            if(::omp_in_parallel() != 0)
+            {
+#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
+                std::cout << __func__ << " already within a parallel region." << std::endl;
+#    endif
+                parallelFn(boundKernelFnObj, blockSharedMemDynSizeBytes, numBlocksInGrid, gridBlockExtent);
             }
-            //-----------------------------------------------------------------------------
-            TaskKernelCpuOmp2Blocks(TaskKernelCpuOmp2Blocks const &) = default;
-            //-----------------------------------------------------------------------------
-            TaskKernelCpuOmp2Blocks(TaskKernelCpuOmp2Blocks &&) = default;
-            //-----------------------------------------------------------------------------
-            auto operator=(TaskKernelCpuOmp2Blocks const &) -> TaskKernelCpuOmp2Blocks & = default;
-            //-----------------------------------------------------------------------------
-            auto operator=(TaskKernelCpuOmp2Blocks &&) -> TaskKernelCpuOmp2Blocks & = default;
-            //-----------------------------------------------------------------------------
-            ~TaskKernelCpuOmp2Blocks() = default;
-
-            //-----------------------------------------------------------------------------
-            //! Executes the kernel function object.
-            ALPAKA_FN_HOST auto operator()() const
-            -> void
+            else
             {
-                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                auto const gridBlockExtent(
-                    workdiv::getWorkDiv<Grid, Blocks>(*this));
-                auto const blockThreadExtent(
-                    workdiv::getWorkDiv<Block, Threads>(*this));
-                auto const threadElemExtent(
-                    workdiv::getWorkDiv<Thread, Elems>(*this));
-
-                // Get the size of the block shared dynamic memory.
-                auto const blockSharedMemDynSizeBytes(
-                    meta::apply(
-                        [&](typename std::decay<TArgs>::type const & ... args)
-                        {
-                            return
-                                kernel::getBlockSharedMemDynSizeBytes<
-                                    acc::AccCpuOmp2Blocks<TDim, TIdx>>(
-                                        m_kernelFnObj,
-                                        blockThreadExtent,
-                                        threadElemExtent,
-                                        args...);
-                        },
-                        m_args));
-
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                std::cout << __func__
-                    << " blockSharedMemDynSizeBytes: " << blockSharedMemDynSizeBytes << " B" << std::endl;
-#endif
-                // Bind all arguments except the accelerator.
-                // TODO: With C++14 we could create a perfectly argument forwarding function object within the constructor.
-                auto const boundKernelFnObj(
-                    meta::apply(
-                        [this](typename std::decay<TArgs>::type const & ... args)
-                        {
-                            return
-                                std::bind(
-                                    std::ref(m_kernelFnObj),
-                                    std::placeholders::_1,
-                                    std::ref(args)...);
-                        },
-                        m_args));
-
-                // The number of blocks in the grid.
-                TIdx const numBlocksInGrid(gridBlockExtent.prod());
-                if(blockThreadExtent.prod() != static_cast<TIdx>(1u))
+                //! Set the given OpenMP schedule while this object is alive.
+                //! Restore the old schedule afterwards.
+                class ScheduleGuard
                 {
-                    throw std::runtime_error("Only one thread per block allowed in the OpenMP 2.0 block accelerator!");
-                }
+                public:
+                    ScheduleGuard(omp::Schedule const schedule) : oldSchedule(omp::getSchedule())
+                    {
+                        omp::setSchedule(schedule);
+                    }
 
-                if(::omp_in_parallel() != 0)
-                {
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                    std::cout << __func__ << " already within a parallel region." << std::endl;
-#endif
-                    parallelFn(
-                        boundKernelFnObj,
-                        blockSharedMemDynSizeBytes,
-                        numBlocksInGrid,
-                        gridBlockExtent);
-                }
-                else
-                {
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                    std::cout << __func__ << " opening new parallel region." << std::endl;
-#endif
-                    #pragma omp parallel
-                    parallelFn(
-                        boundKernelFnObj,
-                        blockSharedMemDynSizeBytes,
-                        numBlocksInGrid,
-                        gridBlockExtent);
-                }
-            }
+                    ScheduleGuard(ScheduleGuard const&) = default;
 
-        private:
-            template<
-                typename FnObj>
-            ALPAKA_FN_HOST auto parallelFn(
-                FnObj const & boundKernelFnObj,
-                TIdx const & blockSharedMemDynSizeBytes,
-                TIdx const & numBlocksInGrid,
-                vec::Vec<TDim, TIdx> const & gridBlockExtent) const
-            -> void
-            {
-                #pragma omp single nowait
-                {
-                    // The OpenMP runtime does not create a parallel region when only one thread is required in the num_threads clause.
-                    // In all other cases we expect to be in a parallel region now.
-                    if((numBlocksInGrid > 1) && (::omp_in_parallel() == 0))
+                    ~ScheduleGuard()
                     {
-                        throw std::runtime_error("The OpenMP 2.0 runtime did not create a parallel region!");
+                        omp::setSchedule(oldSchedule);
                     }
 
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
-                    std::cout << __func__ << " omp_get_num_threads: " << ::omp_get_num_threads() << std::endl;
-#endif
-                }
+                private:
+                    omp::Schedule const oldSchedule;
+                };
 
-                acc::AccCpuOmp2Blocks<TDim, TIdx> acc(
-                    *static_cast<workdiv::WorkDivMembers<TDim, TIdx> const *>(this),
-                    blockSharedMemDynSizeBytes);
-
-                // NOTE: schedule(static) does not improve performance.
-#if _OPENMP < 200805    // For OpenMP < 3.0 you have to declare the loop index (a signed integer) outside of the loop header.
-                std::intmax_t iNumBlocksInGrid(static_cast<std::intmax_t>(numBlocksInGrid));
-                std::intmax_t i;
-                #pragma omp for nowait schedule(guided)
-                for(i = 0; i < iNumBlocksInGrid; ++i)
-#else
-                #pragma omp for nowait schedule(guided)
-                for(TIdx i = 0; i < numBlocksInGrid; ++i)
-#endif
-                {
-#if _OPENMP < 200805
-                    auto const i_tidx  = static_cast<TIdx>(i); // for issue #840
-                    auto const index   = vec::Vec<dim::DimInt<1u>, TIdx>( i_tidx ); // for issue #840
-#else
-                    auto const index   = vec::Vec<dim::DimInt<1u>, TIdx>( i ); // for issue #840
-#endif
-                    acc.m_gridBlockIdx = idx::mapIdx<TDim::value>(index,
-                                                                  gridBlockExtent);
+                // Get the OpenMP schedule.
+                // We only do it when outside of a parallel region, since
+                // otherwise the change of schedule would have no effect.
+                auto const schedule(meta::apply(
+                    [&](ALPAKA_DECAY_T(TArgs) const&... args) {
+                        return getOmpSchedule<AccCpuOmp2Blocks<TDim, TIdx>>(
+                            m_kernelFnObj,
+                            blockThreadExtent,
+                            threadElemExtent,
+                            args...);
+                    },
+                    m_args));
 
-                    boundKernelFnObj(
-                        acc);
+                // Schedule change is a scoped object, so that the old schedule is
+                // also restored in case of exception.
+                auto const scheduleGuard = ScheduleGuard{schedule};
 
-                    // After a block has been processed, the shared memory has to be deleted.
-                    block::shared::st::freeMem(acc);
+#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
+                std::cout << __func__ << " opening new parallel region." << std::endl;
+#    endif
+#    pragma omp parallel
+                parallelFn(boundKernelFnObj, blockSharedMemDynSizeBytes, numBlocksInGrid, gridBlockExtent);
+            }
+        }
+
+    private:
+        template<typename FnObj>
+        ALPAKA_FN_HOST auto parallelFn(
+            FnObj const& boundKernelFnObj,
+            std::size_t const& blockSharedMemDynSizeBytes,
+            TIdx const& numBlocksInGrid,
+            Vec<TDim, TIdx> const& gridBlockExtent) const -> void
+        {
+#    pragma omp single nowait
+            {
+                // The OpenMP runtime does not create a parallel region when either:
+                // * only one thread is required in the num_threads clause
+                // * or only one thread is available
+                // In all other cases we expect to be in a parallel region now.
+                if((numBlocksInGrid > 1) && (::omp_get_max_threads() > 1) && (::omp_in_parallel() == 0))
+                {
+                    throw std::runtime_error("The OpenMP 2.0 runtime did not create a parallel region!");
                 }
+
+#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
+                std::cout << __func__ << " omp_get_num_threads: " << ::omp_get_num_threads() << std::endl;
+#    endif
             }
 
-            TKernelFnObj m_kernelFnObj;
-            std::tuple<typename std::decay<TArgs>::type...> m_args;
-        };
-    }
+            AccCpuOmp2Blocks<TDim, TIdx> acc(
+                *static_cast<WorkDivMembers<TDim, TIdx> const*>(this),
+                blockSharedMemDynSizeBytes);
 
-    namespace acc
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CPU OpenMP 2.0 grid block execution task accelerator type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx,
-                typename TKernelFnObj,
-                typename... TArgs>
-            struct AccType<
-                kernel::TaskKernelCpuOmp2Blocks<TDim, TIdx, TKernelFnObj, TArgs...>>
+#    if _OPENMP < 200805 // For OpenMP < 3.0 you have to declare the loop index (a signed integer) outside of the loop
+                         // header.
+            std::intmax_t iNumBlocksInGrid(static_cast<std::intmax_t>(numBlocksInGrid));
+            std::intmax_t i;
+#        pragma omp for nowait schedule(runtime)
+            for(i = 0; i < iNumBlocksInGrid; ++i)
+#    else
+#        pragma omp for nowait schedule(runtime)
+            for(TIdx i = 0; i < numBlocksInGrid; ++i)
+#    endif
             {
-                using type = acc::AccCpuOmp2Blocks<TDim, TIdx>;
-            };
+#    if _OPENMP < 200805
+                auto const i_tidx = static_cast<TIdx>(i); // for issue #840
+                auto const index = Vec<DimInt<1u>, TIdx>(i_tidx); // for issue #840
+#    else
+                auto const index = Vec<DimInt<1u>, TIdx>(i); // for issue #840
+#    endif
+                acc.m_gridBlockIdx = mapIdx<TDim::value>(index, gridBlockExtent);
+
+                boundKernelFnObj(acc);
+
+                // After a block has been processed, the shared memory has to be deleted.
+                freeSharedVars(acc);
+            }
         }
-    }
-    namespace dev
+
+        TKernelFnObj m_kernelFnObj;
+        std::tuple<std::decay_t<TArgs>...> m_args;
+    };
+
+    namespace traits
     {
-        namespace traits
+        //#############################################################################
+        //! The CPU OpenMP 2.0 grid block execution task accelerator type trait specialization.
+        template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
+        struct AccType<TaskKernelCpuOmp2Blocks<TDim, TIdx, TKernelFnObj, TArgs...>>
         {
-            //#############################################################################
-            //! The CPU OpenMP 2.0 grid block execution task device type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx,
-                typename TKernelFnObj,
-                typename... TArgs>
-            struct DevType<
-                kernel::TaskKernelCpuOmp2Blocks<TDim, TIdx, TKernelFnObj, TArgs...>>
-            {
-                using type = dev::DevCpu;
-            };
-        }
-    }
-    namespace dim
-    {
-        namespace traits
+            using type = AccCpuOmp2Blocks<TDim, TIdx>;
+        };
+
+        //#############################################################################
+        //! The CPU OpenMP 2.0 grid block execution task device type trait specialization.
+        template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
+        struct DevType<TaskKernelCpuOmp2Blocks<TDim, TIdx, TKernelFnObj, TArgs...>>
         {
-            //#############################################################################
-            //! The CPU OpenMP 2.0 grid block execution task dimension getter trait specialization.
-            template<
-                typename TDim,
-                typename TIdx,
-                typename TKernelFnObj,
-                typename... TArgs>
-            struct DimType<
-                kernel::TaskKernelCpuOmp2Blocks<TDim, TIdx, TKernelFnObj, TArgs...>>
-            {
-                using type = TDim;
-            };
-        }
-    }
-    namespace pltf
-    {
-        namespace traits
+            using type = DevCpu;
+        };
+
+        //#############################################################################
+        //! The CPU OpenMP 2.0 grid block execution task dimension getter trait specialization.
+        template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
+        struct DimType<TaskKernelCpuOmp2Blocks<TDim, TIdx, TKernelFnObj, TArgs...>>
         {
-            //#############################################################################
-            //! The CPU OpenMP 2.0 grid block execution task platform type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx,
-                typename TKernelFnObj,
-                typename... TArgs>
-            struct PltfType<
-                kernel::TaskKernelCpuOmp2Blocks<TDim, TIdx, TKernelFnObj, TArgs...>>
-            {
-                using type = pltf::PltfCpu;
-            };
-        }
-    }
-    namespace idx
-    {
-        namespace traits
+            using type = TDim;
+        };
+
+        //#############################################################################
+        //! The CPU OpenMP 2.0 grid block execution task platform type trait specialization.
+        template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
+        struct PltfType<TaskKernelCpuOmp2Blocks<TDim, TIdx, TKernelFnObj, TArgs...>>
         {
-            //#############################################################################
-            //! The CPU OpenMP 2.0 block execution task idx type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx,
-                typename TKernelFnObj,
-                typename... TArgs>
-            struct IdxType<
-                kernel::TaskKernelCpuOmp2Blocks<TDim, TIdx, TKernelFnObj, TArgs...>>
-            {
-                using type = TIdx;
-            };
-        }
-    }
-}
+            using type = PltfCpu;
+        };
+
+        //#############################################################################
+        //! The CPU OpenMP 2.0 block execution task idx type trait specialization.
+        template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
+        struct IdxType<TaskKernelCpuOmp2Blocks<TDim, TIdx, TKernelFnObj, TArgs...>>
+        {
+            using type = TIdx;
+        };
+    } // namespace traits
+} // namespace alpaka
 
 #endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/kernel/TaskKernelCpuOmp2Threads.hpp b/thirdParty/cupla/alpaka/include/alpaka/kernel/TaskKernelCpuOmp2Threads.hpp
index 7911a50ed9..4c9a2dc16e 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/kernel/TaskKernelCpuOmp2Threads.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/kernel/TaskKernelCpuOmp2Threads.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Axel Huebl, Benjamin Worpitz, Bert Wesarg, René Widera
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -11,292 +11,212 @@
 
 #ifdef ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLED
 
-#if _OPENMP < 200203
-    #error If ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLED is set, the compiler has to support OpenMP 2.0 or higher!
-#endif
+#    if _OPENMP < 200203
+#        error If ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLED is set, the compiler has to support OpenMP 2.0 or higher!
+#    endif
 
 // Specialized traits.
-#include <alpaka/acc/Traits.hpp>
-#include <alpaka/dev/Traits.hpp>
-#include <alpaka/dim/Traits.hpp>
-#include <alpaka/pltf/Traits.hpp>
-#include <alpaka/idx/Traits.hpp>
+#    include <alpaka/acc/Traits.hpp>
+#    include <alpaka/dev/Traits.hpp>
+#    include <alpaka/dim/Traits.hpp>
+#    include <alpaka/idx/Traits.hpp>
+#    include <alpaka/pltf/Traits.hpp>
 
 // Implementation details.
-#include <alpaka/acc/AccCpuOmp2Threads.hpp>
-#include <alpaka/core/Unused.hpp>
-#include <alpaka/dev/DevCpu.hpp>
-#include <alpaka/kernel/Traits.hpp>
-#include <alpaka/meta/NdLoop.hpp>
-#include <alpaka/meta/ApplyTuple.hpp>
-#include <alpaka/workdiv/WorkDivMembers.hpp>
-
-#include <omp.h>
-
-#include <stdexcept>
-#include <tuple>
-#include <type_traits>
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
-    #include <iostream>
-#endif
+#    include <alpaka/acc/AccCpuOmp2Threads.hpp>
+#    include <alpaka/core/Decay.hpp>
+#    include <alpaka/core/Unused.hpp>
+#    include <alpaka/dev/DevCpu.hpp>
+#    include <alpaka/kernel/Traits.hpp>
+#    include <alpaka/meta/ApplyTuple.hpp>
+#    include <alpaka/meta/NdLoop.hpp>
+#    include <alpaka/workdiv/WorkDivMembers.hpp>
+
+#    include <omp.h>
+
+#    include <functional>
+#    include <stdexcept>
+#    include <tuple>
+#    include <type_traits>
+#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
+#        include <iostream>
+#    endif
 
 namespace alpaka
 {
-    namespace kernel
+    //#############################################################################
+    //! The CPU OpenMP 2.0 thread accelerator execution task.
+    template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
+    class TaskKernelCpuOmp2Threads final : public WorkDivMembers<TDim, TIdx>
     {
-        //#############################################################################
-        //! The CPU OpenMP 2.0 thread accelerator execution task.
-        template<
-            typename TDim,
-            typename TIdx,
-            typename TKernelFnObj,
-            typename... TArgs>
-        class TaskKernelCpuOmp2Threads final :
-            public workdiv::WorkDivMembers<TDim, TIdx>
+    public:
+        //-----------------------------------------------------------------------------
+        template<typename TWorkDiv>
+        ALPAKA_FN_HOST TaskKernelCpuOmp2Threads(TWorkDiv&& workDiv, TKernelFnObj const& kernelFnObj, TArgs&&... args)
+            : WorkDivMembers<TDim, TIdx>(std::forward<TWorkDiv>(workDiv))
+            , m_kernelFnObj(kernelFnObj)
+            , m_args(std::forward<TArgs>(args)...)
+        {
+            static_assert(
+                Dim<std::decay_t<TWorkDiv>>::value == TDim::value,
+                "The work division and the execution task have to be of the same dimensionality!");
+        }
+        //-----------------------------------------------------------------------------
+        TaskKernelCpuOmp2Threads(TaskKernelCpuOmp2Threads const&) = default;
+        //-----------------------------------------------------------------------------
+        TaskKernelCpuOmp2Threads(TaskKernelCpuOmp2Threads&&) = default;
+        //-----------------------------------------------------------------------------
+        auto operator=(TaskKernelCpuOmp2Threads const&) -> TaskKernelCpuOmp2Threads& = default;
+        //-----------------------------------------------------------------------------
+        auto operator=(TaskKernelCpuOmp2Threads&&) -> TaskKernelCpuOmp2Threads& = default;
+        //-----------------------------------------------------------------------------
+        ~TaskKernelCpuOmp2Threads() = default;
+
+        //-----------------------------------------------------------------------------
+        //! Executes the kernel function object.
+        ALPAKA_FN_HOST auto operator()() const -> void
         {
-        public:
-            //-----------------------------------------------------------------------------
-            template<
-                typename TWorkDiv>
-            ALPAKA_FN_HOST TaskKernelCpuOmp2Threads(
-                TWorkDiv && workDiv,
-                TKernelFnObj const & kernelFnObj,
-                TArgs && ... args) :
-                    workdiv::WorkDivMembers<TDim, TIdx>(std::forward<TWorkDiv>(workDiv)),
-                    m_kernelFnObj(kernelFnObj),
-                    m_args(std::forward<TArgs>(args)...)
+            ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+            auto const gridBlockExtent(getWorkDiv<Grid, Blocks>(*this));
+            auto const blockThreadExtent(getWorkDiv<Block, Threads>(*this));
+            auto const threadElemExtent(getWorkDiv<Thread, Elems>(*this));
+
+            // Get the size of the block shared dynamic memory.
+            auto const blockSharedMemDynSizeBytes(meta::apply(
+                [&](ALPAKA_DECAY_T(TArgs) const&... args) {
+                    return getBlockSharedMemDynSizeBytes<AccCpuOmp2Threads<TDim, TIdx>>(
+                        m_kernelFnObj,
+                        blockThreadExtent,
+                        threadElemExtent,
+                        args...);
+                },
+                m_args));
+
+#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
+            std::cout << __func__ << " blockSharedMemDynSizeBytes: " << blockSharedMemDynSizeBytes << " B"
+                      << std::endl;
+#    endif
+            // Bind all arguments except the accelerator.
+            // TODO: With C++14 we could create a perfectly argument forwarding function object within the constructor.
+            auto const boundKernelFnObj(meta::apply(
+                [this](ALPAKA_DECAY_T(TArgs) const&... args) {
+                    return std::bind(std::ref(m_kernelFnObj), std::placeholders::_1, std::ref(args)...);
+                },
+                m_args));
+
+            AccCpuOmp2Threads<TDim, TIdx> acc(
+                *static_cast<WorkDivMembers<TDim, TIdx> const*>(this),
+                blockSharedMemDynSizeBytes);
+
+            // The number of threads in this block.
+            TIdx const blockThreadCount(blockThreadExtent.prod());
+            int const iBlockThreadCount(static_cast<int>(blockThreadCount));
+            alpaka::ignore_unused(iBlockThreadCount);
+
+            if(::omp_in_parallel() != 0)
             {
-                static_assert(
-                    dim::Dim<typename std::decay<TWorkDiv>::type>::value == TDim::value,
-                    "The work division and the execution task have to be of the same dimensionality!");
+                throw std::runtime_error(
+                    "The OpenMP 2.0 thread backend can not be used within an existing parallel region!");
             }
-            //-----------------------------------------------------------------------------
-            TaskKernelCpuOmp2Threads(TaskKernelCpuOmp2Threads const &) = default;
-            //-----------------------------------------------------------------------------
-            TaskKernelCpuOmp2Threads(TaskKernelCpuOmp2Threads &&) = default;
-            //-----------------------------------------------------------------------------
-            auto operator=(TaskKernelCpuOmp2Threads const &) -> TaskKernelCpuOmp2Threads & = default;
-            //-----------------------------------------------------------------------------
-            auto operator=(TaskKernelCpuOmp2Threads &&) -> TaskKernelCpuOmp2Threads & = default;
-            //-----------------------------------------------------------------------------
-            ~TaskKernelCpuOmp2Threads() = default;
 
-            //-----------------------------------------------------------------------------
-            //! Executes the kernel function object.
-            ALPAKA_FN_HOST auto operator()() const
-            -> void
-            {
-                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                auto const gridBlockExtent(
-                    workdiv::getWorkDiv<Grid, Blocks>(*this));
-                auto const blockThreadExtent(
-                    workdiv::getWorkDiv<Block, Threads>(*this));
-                auto const threadElemExtent(
-                    workdiv::getWorkDiv<Thread, Elems>(*this));
-
-                // Get the size of the block shared dynamic memory.
-                auto const blockSharedMemDynSizeBytes(
-                    meta::apply(
-                        [&](typename std::decay<TArgs>::type const & ... args)
-                        {
-                            return
-                                kernel::getBlockSharedMemDynSizeBytes<
-                                    acc::AccCpuOmp2Threads<TDim, TIdx>>(
-                                        m_kernelFnObj,
-                                        blockThreadExtent,
-                                        threadElemExtent,
-                                        args...);
-                        },
-                        m_args));
-
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                std::cout << __func__
-                    << " blockSharedMemDynSizeBytes: " << blockSharedMemDynSizeBytes << " B" << std::endl;
-#endif
-                // Bind all arguments except the accelerator.
-                // TODO: With C++14 we could create a perfectly argument forwarding function object within the constructor.
-                auto const boundKernelFnObj(
-                    meta::apply(
-                        [this](typename std::decay<TArgs>::type const & ... args)
-                        {
-                            return
-                                std::bind(
-                                    std::ref(m_kernelFnObj),
-                                    std::placeholders::_1,
-                                    std::ref(args)...);
-                        },
-                        m_args));
+            // Force the environment to use the given number of threads.
+            int const ompIsDynamic(::omp_get_dynamic());
+            ::omp_set_dynamic(0);
 
-                acc::AccCpuOmp2Threads<TDim, TIdx> acc(
-                    *static_cast<workdiv::WorkDivMembers<TDim, TIdx> const *>(this),
-                    blockSharedMemDynSizeBytes);
+            // Execute the blocks serially.
+            meta::ndLoopIncIdx(gridBlockExtent, [&](Vec<TDim, TIdx> const& gridBlockIdx) {
+                acc.m_gridBlockIdx = gridBlockIdx;
 
-                // The number of threads in this block.
-                TIdx const blockThreadCount(blockThreadExtent.prod());
-                int const iBlockThreadCount(static_cast<int>(blockThreadCount));
-                alpaka::ignore_unused(iBlockThreadCount);
+// Execute the threads in parallel.
 
-                if(::omp_in_parallel() != 0)
+// Parallel execution of the threads in a block is required because when syncBlockThreads is called all of them have to
+// be done with their work up to this line. So we have to spawn one OS thread per thread in a block. 'omp for' is not
+// useful because it is meant for cases where multiple iterations are executed by one thread but in our case a 1:1
+// mapping is required. Therefore we use 'omp parallel' with the specified number of threads in a block.
+#    pragma omp parallel num_threads(iBlockThreadCount)
                 {
-                    throw std::runtime_error("The OpenMP 2.0 thread backend can not be used within an existing parallel region!");
-                }
-
-                // Force the environment to use the given number of threads.
-                int const ompIsDynamic(::omp_get_dynamic());
-                ::omp_set_dynamic(0);
-
-                // Execute the blocks serially.
-                meta::ndLoopIncIdx(
-                    gridBlockExtent,
-                    [&](vec::Vec<TDim, TIdx> const & gridBlockIdx)
+                    // The guard is for gcc internal compiler error, as discussed in #735
+#    if(!BOOST_COMP_GNUC) || (BOOST_COMP_GNUC >= BOOST_VERSION_NUMBER(8, 1, 0))
+#        pragma omp single nowait
                     {
-                        acc.m_gridBlockIdx = gridBlockIdx;
-
-                        // Execute the threads in parallel.
-
-                        // Parallel execution of the threads in a block is required because when syncBlockThreads is called all of them have to be done with their work up to this line.
-                        // So we have to spawn one OS thread per thread in a block.
-                        // 'omp for' is not useful because it is meant for cases where multiple iterations are executed by one thread but in our case a 1:1 mapping is required.
-                        // Therefore we use 'omp parallel' with the specified number of threads in a block.
-                        #pragma omp parallel num_threads(iBlockThreadCount)
+                        // The OpenMP runtime does not create a parallel region when only one thread is required in the
+                        // num_threads clause. In all other cases we expect to be in a parallel region now.
+                        if((iBlockThreadCount > 1) && (::omp_in_parallel() == 0))
                         {
-                            #pragma omp single nowait
-                            {
-                                // The OpenMP runtime does not create a parallel region when only one thread is required in the num_threads clause.
-                                // In all other cases we expect to be in a parallel region now.
-                                if((iBlockThreadCount > 1) && (::omp_in_parallel() == 0))
-                                {
-                                    throw std::runtime_error("The OpenMP 2.0 runtime did not create a parallel region!");
-                                }
-
-                                // GCC 5.1 fails with:
-                                // error: redeclaration of const int& iBlockThreadCount
-                                // if(numThreads != iBlockThreadCount)
-                                //                  ^
-                                // note: const int& iBlockThreadCount previously declared here
-                                // #pragma omp parallel num_threads(iBlockThreadCount)
-                                //         ^
-#if (!BOOST_COMP_GNUC) || (BOOST_COMP_GNUC < BOOST_VERSION_NUMBER(5, 0, 0)) || (BOOST_COMP_GNUC >= BOOST_VERSION_NUMBER(6, 0, 0))
-                                int const numThreads(::omp_get_num_threads());
-                                if(numThreads != iBlockThreadCount)
-                                {
-                                    throw std::runtime_error("The OpenMP 2.0 runtime did not use the number of threads that had been required!");
-                                }
-#endif
-                            }
-                            boundKernelFnObj(
-                                acc);
+                            throw std::runtime_error("The OpenMP 2.0 runtime did not create a parallel region!");
+                        }
 
-                            // Wait for all threads to finish before deleting the shared memory.
-                            // This is done by default if the omp 'nowait' clause is missing on the omp parallel directive
-                            //block::sync::syncBlockThreads(acc);
+                        int const numThreads(::omp_get_num_threads());
+                        if(numThreads != iBlockThreadCount)
+                        {
+                            throw std::runtime_error(
+                                "The OpenMP 2.0 runtime did not use the number of threads that had been required!");
                         }
+                    }
+#    endif
+                    boundKernelFnObj(acc);
 
-                        // After a block has been processed, the shared memory has to be deleted.
-                        block::shared::st::freeMem(acc);
-                    });
+                    // Wait for all threads to finish before deleting the shared memory.
+                    // This is done by default if the omp 'nowait' clause is missing on the omp parallel directive
+                    // syncBlockThreads(acc);
+                }
 
-                // Reset the dynamic thread number setting.
-                ::omp_set_dynamic(ompIsDynamic);
-            }
+                // After a block has been processed, the shared memory has to be deleted.
+                freeSharedVars(acc);
+            });
 
-        private:
-            TKernelFnObj m_kernelFnObj;
-            std::tuple<typename std::decay<TArgs>::type...> m_args;
-        };
-    }
+            // Reset the dynamic thread number setting.
+            ::omp_set_dynamic(ompIsDynamic);
+        }
+
+    private:
+        TKernelFnObj m_kernelFnObj;
+        std::tuple<std::decay_t<TArgs>...> m_args;
+    };
 
-    namespace acc
+    namespace traits
     {
-        namespace traits
+        //#############################################################################
+        //! The CPU OpenMP 2.0 block thread execution task accelerator type trait specialization.
+        template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
+        struct AccType<TaskKernelCpuOmp2Threads<TDim, TIdx, TKernelFnObj, TArgs...>>
         {
-            //#############################################################################
-            //! The CPU OpenMP 2.0 block thread execution task accelerator type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx,
-                typename TKernelFnObj,
-                typename... TArgs>
-            struct AccType<
-                kernel::TaskKernelCpuOmp2Threads<TDim, TIdx, TKernelFnObj, TArgs...>>
-            {
-                using type = acc::AccCpuOmp2Threads<TDim, TIdx>;
-            };
-        }
-    }
-    namespace dev
-    {
-        namespace traits
+            using type = AccCpuOmp2Threads<TDim, TIdx>;
+        };
+
+        //#############################################################################
+        //! The CPU OpenMP 2.0 block thread execution task device type trait specialization.
+        template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
+        struct DevType<TaskKernelCpuOmp2Threads<TDim, TIdx, TKernelFnObj, TArgs...>>
         {
-            //#############################################################################
-            //! The CPU OpenMP 2.0 block thread execution task device type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx,
-                typename TKernelFnObj,
-                typename... TArgs>
-            struct DevType<
-                kernel::TaskKernelCpuOmp2Threads<TDim, TIdx, TKernelFnObj, TArgs...>>
-            {
-                using type = dev::DevCpu;
-            };
-        }
-    }
-    namespace dim
-    {
-        namespace traits
+            using type = DevCpu;
+        };
+
+        //#############################################################################
+        //! The CPU OpenMP 2.0 block thread execution task dimension getter trait specialization.
+        template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
+        struct DimType<TaskKernelCpuOmp2Threads<TDim, TIdx, TKernelFnObj, TArgs...>>
         {
-            //#############################################################################
-            //! The CPU OpenMP 2.0 block thread execution task dimension getter trait specialization.
-            template<
-                typename TDim,
-                typename TIdx,
-                typename TKernelFnObj,
-                typename... TArgs>
-            struct DimType<
-                kernel::TaskKernelCpuOmp2Threads<TDim, TIdx, TKernelFnObj, TArgs...>>
-            {
-                using type = TDim;
-            };
-        }
-    }
-    namespace pltf
-    {
-        namespace traits
+            using type = TDim;
+        };
+
+        //#############################################################################
+        //! The CPU OpenMP 2.0 block thread execution task platform type trait specialization.
+        template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
+        struct PltfType<TaskKernelCpuOmp2Threads<TDim, TIdx, TKernelFnObj, TArgs...>>
         {
-            //#############################################################################
-            //! The CPU OpenMP 2.0 block thread execution task platform type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx,
-                typename TKernelFnObj,
-                typename... TArgs>
-            struct PltfType<
-                kernel::TaskKernelCpuOmp2Threads<TDim, TIdx, TKernelFnObj, TArgs...>>
-            {
-                using type = pltf::PltfCpu;
-            };
-        }
-    }
-    namespace idx
-    {
-        namespace traits
+            using type = PltfCpu;
+        };
+
+        //#############################################################################
+        //! The CPU OpenMP 2.0 block thread execution task idx type trait specialization.
+        template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
+        struct IdxType<TaskKernelCpuOmp2Threads<TDim, TIdx, TKernelFnObj, TArgs...>>
         {
-            //#############################################################################
-            //! The CPU OpenMP 2.0 block thread execution task idx type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx,
-                typename TKernelFnObj,
-                typename... TArgs>
-            struct IdxType<
-                kernel::TaskKernelCpuOmp2Threads<TDim, TIdx, TKernelFnObj, TArgs...>>
-            {
-                using type = TIdx;
-            };
-        }
-    }
-}
+            using type = TIdx;
+        };
+    } // namespace traits
+} // namespace alpaka
 
 #endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/kernel/TaskKernelCpuOmp4.hpp b/thirdParty/cupla/alpaka/include/alpaka/kernel/TaskKernelCpuOmp4.hpp
deleted file mode 100644
index 83081d99df..0000000000
--- a/thirdParty/cupla/alpaka/include/alpaka/kernel/TaskKernelCpuOmp4.hpp
+++ /dev/null
@@ -1,315 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz, René Widera
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_CPU_BT_OMP4_ENABLED
-
-#if _OPENMP < 201307
-    #error If ALPAKA_ACC_CPU_BT_OMP4_ENABLED is set, the compiler has to support OpenMP 4.0 or higher!
-#endif
-
-// Specialized traits.
-#include <alpaka/acc/Traits.hpp>
-#include <alpaka/dev/Traits.hpp>
-#include <alpaka/dim/Traits.hpp>
-#include <alpaka/pltf/Traits.hpp>
-#include <alpaka/idx/Traits.hpp>
-
-// Implementation details.
-#include <alpaka/acc/AccCpuOmp4.hpp>
-#include <alpaka/dev/DevCpu.hpp>
-#include <alpaka/idx/MapIdx.hpp>
-#include <alpaka/kernel/Traits.hpp>
-#include <alpaka/workdiv/WorkDivMembers.hpp>
-
-#include <alpaka/meta/ApplyTuple.hpp>
-
-#include <omp.h>
-
-#include <stdexcept>
-#include <tuple>
-#include <type_traits>
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
-    #include <iostream>
-#endif
-
-namespace alpaka
-{
-    namespace kernel
-    {
-        //#############################################################################
-        //! The CPU OpenMP 4.0 accelerator execution task.
-        template<
-            typename TDim,
-            typename TIdx,
-            typename TKernelFnObj,
-            typename... TArgs>
-        class TaskKernelCpuOmp4 final :
-            public workdiv::WorkDivMembers<TDim, TIdx>
-        {
-        public:
-            //-----------------------------------------------------------------------------
-            template<
-                typename TWorkDiv>
-            ALPAKA_FN_HOST TaskKernelCpuOmp4(
-                TWorkDiv && workDiv,
-                TKernelFnObj const & kernelFnObj,
-                TArgs && ... args) :
-                    workdiv::WorkDivMembers<TDim, TIdx>(std::forward<TWorkDiv>(workDiv)),
-                    m_kernelFnObj(kernelFnObj),
-                    m_args(std::forward<TArgs>(args)...)
-            {
-                static_assert(
-                    dim::Dim<typename std::decay<TWorkDiv>::type>::value == TDim::value,
-                    "The work division and the execution task have to be of the same dimensionality!");
-            }
-            //-----------------------------------------------------------------------------
-            TaskKernelCpuOmp4(TaskKernelCpuOmp4 const & other) = default;
-            //-----------------------------------------------------------------------------
-            TaskKernelCpuOmp4(TaskKernelCpuOmp4 && other) = default;
-            //-----------------------------------------------------------------------------
-            auto operator=(TaskKernelCpuOmp4 const &) -> TaskKernelCpuOmp4 & = default;
-            //-----------------------------------------------------------------------------
-            auto operator=(TaskKernelCpuOmp4 &&) -> TaskKernelCpuOmp4 & = default;
-            //-----------------------------------------------------------------------------
-            ~TaskKernelCpuOmp4() = default;
-
-            //-----------------------------------------------------------------------------
-            //! Executes the kernel function object.
-            ALPAKA_FN_HOST auto operator()() const
-            -> void
-            {
-                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                auto const gridBlockExtent(
-                    workdiv::getWorkDiv<Grid, Blocks>(*this));
-                auto const blockThreadExtent(
-                    workdiv::getWorkDiv<Block, Threads>(*this));
-                auto const threadElemExtent(
-                    workdiv::getWorkDiv<Thread, Elems>(*this));
-
-                // Get the size of the block shared dynamic memory.
-                auto const blockSharedMemDynSizeBytes(
-                    meta::apply(
-                        [&](typename std::decay<TArgs>::type const & ... args)
-                        {
-                            return
-                                kernel::getBlockSharedMemDynSizeBytes<
-                                    acc::AccCpuOmp4<TDim, TIdx>>(
-                                        m_kernelFnObj,
-                                        blockThreadExtent,
-                                        threadElemExtent,
-                                        args...);
-                        },
-                        m_args));
-
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                std::cout << __func__
-                    << " blockSharedMemDynSizeBytes: " << blockSharedMemDynSizeBytes << " B" << std::endl;
-#endif
-                // Bind all arguments except the accelerator.
-                // TODO: With C++14 we could create a perfectly argument forwarding function object within the constructor.
-                auto const boundKernelFnObj(
-                    meta::apply(
-                        [this](typename std::decay<TArgs>::type const & ... args)
-                        {
-                            return
-                                std::bind(
-                                    std::ref(m_kernelFnObj),
-                                    std::placeholders::_1,
-                                    std::ref(args)...);
-                        },
-                        m_args));
-
-                // The number of blocks in the grid.
-                TIdx const gridBlockCount(gridBlockExtent.prod());
-                // The number of threads in a block.
-                TIdx const blockThreadCount(blockThreadExtent.prod());
-
-                // We have to make sure, that the OpenMP runtime keeps enough threads for executing a block in parallel.
-                auto const maxOmpThreadCount(::omp_get_max_threads());
-                auto const maxTeamCount(maxOmpThreadCount/static_cast<int>(blockThreadCount));
-                auto const teamCount(std::min(maxTeamCount, static_cast<int>(gridBlockCount)));
-
-                if(::omp_in_parallel() != 0)
-                {
-                    throw std::runtime_error("The OpenMP 4.0 backend can not be used within an existing parallel region!");
-                }
-
-                // Force the environment to use the given number of threads.
-                int const ompIsDynamic(::omp_get_dynamic());
-                ::omp_set_dynamic(0);
-
-                // `When an if(scalar-expression) evaluates to false, the structured block is executed on the host.`
-                #pragma omp target if(0)
-                {
-                    #pragma omp teams num_teams(teamCount) thread_limit(blockThreadCount)
-                    {
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
-                        // The first team does some checks ...
-                        if((::omp_get_team_num() == 0))
-                        {
-                            int const iNumTeams(::omp_get_num_teams());
-                            printf("%s omp_get_num_teams: %d\n", __func__, iNumTeams);
-                        }
-#endif
-                        acc::AccCpuOmp4<TDim, TIdx> acc(
-                            *static_cast<workdiv::WorkDivMembers<TDim, TIdx> const *>(this),
-                            blockSharedMemDynSizeBytes);
-
-                        #pragma omp distribute
-                        for(TIdx b = 0u; b<gridBlockCount; ++b)
-                        {
-                            vec::Vec<dim::DimInt<1u>, TIdx> const gridBlockIdx(b);
-                            // When this is not repeated here:
-                            // error: gridBlockExtent referenced in target region does not have a mappable type
-                            auto const gridBlockExtent2(
-                                workdiv::getWorkDiv<Grid, Blocks>(*static_cast<workdiv::WorkDivMembers<TDim, TIdx> const *>(this)));
-                            acc.m_gridBlockIdx = idx::mapIdx<TDim::value>(
-                                gridBlockIdx,
-                                gridBlockExtent2);
-
-                            // Execute the threads in parallel.
-
-                            // Parallel execution of the threads in a block is required because when syncBlockThreads is called all of them have to be done with their work up to this line.
-                            // So we have to spawn one OS thread per thread in a block.
-                            // 'omp for' is not useful because it is meant for cases where multiple iterations are executed by one thread but in our case a 1:1 mapping is required.
-                            // Therefore we use 'omp parallel' with the specified number of threads in a block.
-                            #pragma omp parallel num_threads(blockThreadCount)
-                            {
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
-                                // The first thread does some checks in the first block executed.
-                                if((::omp_get_thread_num() == 0) && (b == 0))
-                                {
-                                    int const numThreads(::omp_get_num_threads());
-                                    printf("%s omp_get_num_threads: %d\n", __func__, numThreads);
-                                    if(numThreads != static_cast<int>(blockThreadCount))
-                                    {
-                                        throw std::runtime_error("ERROR: The OpenMP runtime did not use the number of threads that had been required!");
-                                    }
-                                }
-#endif
-                                boundKernelFnObj(
-                                    acc);
-
-                                // Wait for all threads to finish before deleting the shared memory.
-                                // This is done by default if the omp 'nowait' clause is missing
-                                //block::sync::syncBlockThreads(acc);
-                            }
-
-                            // After a block has been processed, the shared memory has to be deleted.
-                            block::shared::st::freeMem(acc);
-                        }
-                    }
-                }
-
-                // Reset the dynamic thread number setting.
-                ::omp_set_dynamic(ompIsDynamic);
-            }
-
-        private:
-            TKernelFnObj m_kernelFnObj;
-            std::tuple<typename std::decay<TArgs>::type...> m_args;
-        };
-    }
-
-    namespace acc
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CPU OpenMP 4.0 execution task accelerator type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx,
-                typename TKernelFnObj,
-                typename... TArgs>
-            struct AccType<
-                kernel::TaskKernelCpuOmp4<TDim, TIdx, TKernelFnObj, TArgs...>>
-            {
-                using type = acc::AccCpuOmp4<TDim, TIdx>;
-            };
-        }
-    }
-    namespace dev
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CPU OpenMP 4.0 execution task device type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx,
-                typename TKernelFnObj,
-                typename... TArgs>
-            struct DevType<
-                kernel::TaskKernelCpuOmp4<TDim, TIdx, TKernelFnObj, TArgs...>>
-            {
-                using type = dev::DevCpu;
-            };
-        }
-    }
-    namespace dim
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CPU OpenMP 4.0 execution task dimension getter trait specialization.
-            template<
-                typename TDim,
-                typename TIdx,
-                typename TKernelFnObj,
-                typename... TArgs>
-            struct DimType<
-                kernel::TaskKernelCpuOmp4<TDim, TIdx, TKernelFnObj, TArgs...>>
-            {
-                using type = TDim;
-            };
-        }
-    }
-    namespace pltf
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CPU OpenMP 4.0 execution task platform type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx,
-                typename TKernelFnObj,
-                typename... TArgs>
-            struct PltfType<
-                kernel::TaskKernelCpuOmp4<TDim, TIdx, TKernelFnObj, TArgs...>>
-            {
-                using type = pltf::PltfCpu;
-            };
-        }
-    }
-    namespace idx
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CPU OpenMP 4.0 execution task idx type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx,
-                typename TKernelFnObj,
-                typename... TArgs>
-            struct IdxType<
-                kernel::TaskKernelCpuOmp4<TDim, TIdx, TKernelFnObj, TArgs...>>
-            {
-                using type = TIdx;
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/kernel/TaskKernelCpuSerial.hpp b/thirdParty/cupla/alpaka/include/alpaka/kernel/TaskKernelCpuSerial.hpp
index 62dc8ab731..52b7d349f6 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/kernel/TaskKernelCpuSerial.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/kernel/TaskKernelCpuSerial.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Axel Huebl, Benjamin Worpitz, René Widera
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -12,235 +12,159 @@
 #ifdef ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED
 
 // Specialized traits.
-#include <alpaka/acc/Traits.hpp>
-#include <alpaka/dev/Traits.hpp>
-#include <alpaka/dim/Traits.hpp>
-#include <alpaka/pltf/Traits.hpp>
-#include <alpaka/idx/Traits.hpp>
+#    include <alpaka/acc/Traits.hpp>
+#    include <alpaka/dev/Traits.hpp>
+#    include <alpaka/dim/Traits.hpp>
+#    include <alpaka/idx/Traits.hpp>
+#    include <alpaka/pltf/Traits.hpp>
 
 // Implementation details.
-#include <alpaka/acc/AccCpuSerial.hpp>
-#include <alpaka/core/Unused.hpp>
-#include <alpaka/dev/DevCpu.hpp>
-#include <alpaka/kernel/Traits.hpp>
-#include <alpaka/meta/NdLoop.hpp>
-#include <alpaka/meta/ApplyTuple.hpp>
-#include <alpaka/workdiv/WorkDivMembers.hpp>
-
-#include <tuple>
-#include <type_traits>
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
-    #include <iostream>
-#endif
+#    include <alpaka/acc/AccCpuSerial.hpp>
+#    include <alpaka/core/Decay.hpp>
+#    include <alpaka/core/Unused.hpp>
+#    include <alpaka/dev/DevCpu.hpp>
+#    include <alpaka/kernel/Traits.hpp>
+#    include <alpaka/meta/ApplyTuple.hpp>
+#    include <alpaka/meta/NdLoop.hpp>
+#    include <alpaka/workdiv/WorkDivMembers.hpp>
+
+#    include <functional>
+#    include <tuple>
+#    include <type_traits>
+#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
+#        include <iostream>
+#    endif
 
 namespace alpaka
 {
-    namespace kernel
+    //#############################################################################
+    //! The CPU serial execution task implementation.
+    template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
+    class TaskKernelCpuSerial final : public WorkDivMembers<TDim, TIdx>
     {
-        //#############################################################################
-        //! The CPU serial execution task implementation.
-        template<
-            typename TDim,
-            typename TIdx,
-            typename TKernelFnObj,
-            typename... TArgs>
-        class TaskKernelCpuSerial final :
-            public workdiv::WorkDivMembers<TDim, TIdx>
+    public:
+        //-----------------------------------------------------------------------------
+        template<typename TWorkDiv>
+        ALPAKA_FN_HOST TaskKernelCpuSerial(TWorkDiv&& workDiv, TKernelFnObj const& kernelFnObj, TArgs&&... args)
+            : WorkDivMembers<TDim, TIdx>(std::forward<TWorkDiv>(workDiv))
+            , m_kernelFnObj(kernelFnObj)
+            , m_args(std::forward<TArgs>(args)...)
         {
-        public:
-            //-----------------------------------------------------------------------------
-            template<
-                typename TWorkDiv>
-            ALPAKA_FN_HOST TaskKernelCpuSerial(
-                TWorkDiv && workDiv,
-                TKernelFnObj const & kernelFnObj,
-                TArgs && ... args) :
-                    workdiv::WorkDivMembers<TDim, TIdx>(std::forward<TWorkDiv>(workDiv)),
-                    m_kernelFnObj(kernelFnObj),
-                    m_args(std::forward<TArgs>(args)...)
-            {
-                static_assert(
-                    dim::Dim<typename std::decay<TWorkDiv>::type>::value == TDim::value,
-                    "The work division and the execution task have to be of the same dimensionality!");
-            }
-            //-----------------------------------------------------------------------------
-            TaskKernelCpuSerial(TaskKernelCpuSerial const &) = default;
-            //-----------------------------------------------------------------------------
-            TaskKernelCpuSerial(TaskKernelCpuSerial &&) = default;
-            //-----------------------------------------------------------------------------
-            auto operator=(TaskKernelCpuSerial const &) -> TaskKernelCpuSerial & = default;
-            //-----------------------------------------------------------------------------
-            auto operator=(TaskKernelCpuSerial &&) -> TaskKernelCpuSerial & = default;
-            //-----------------------------------------------------------------------------
-            ~TaskKernelCpuSerial() = default;
-
-            //-----------------------------------------------------------------------------
-            //! Executes the kernel function object.
-            ALPAKA_FN_HOST auto operator()() const
-            -> void
+            static_assert(
+                Dim<std::decay_t<TWorkDiv>>::value == TDim::value,
+                "The work division and the execution task have to be of the same dimensionality!");
+        }
+        //-----------------------------------------------------------------------------
+        TaskKernelCpuSerial(TaskKernelCpuSerial const&) = default;
+        //-----------------------------------------------------------------------------
+        TaskKernelCpuSerial(TaskKernelCpuSerial&&) = default;
+        //-----------------------------------------------------------------------------
+        auto operator=(TaskKernelCpuSerial const&) -> TaskKernelCpuSerial& = default;
+        //-----------------------------------------------------------------------------
+        auto operator=(TaskKernelCpuSerial&&) -> TaskKernelCpuSerial& = default;
+        //-----------------------------------------------------------------------------
+        ~TaskKernelCpuSerial() = default;
+
+        //-----------------------------------------------------------------------------
+        //! Executes the kernel function object.
+        ALPAKA_FN_HOST auto operator()() const -> void
+        {
+            ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+            auto const gridBlockExtent(getWorkDiv<Grid, Blocks>(*this));
+            auto const blockThreadExtent(getWorkDiv<Block, Threads>(*this));
+            auto const threadElemExtent(getWorkDiv<Thread, Elems>(*this));
+
+            // Get the size of the block shared dynamic memory.
+            auto const blockSharedMemDynSizeBytes(meta::apply(
+                [&](ALPAKA_DECAY_T(TArgs) const&... args) {
+                    return getBlockSharedMemDynSizeBytes<AccCpuSerial<TDim, TIdx>>(
+                        m_kernelFnObj,
+                        blockThreadExtent,
+                        threadElemExtent,
+                        args...);
+                },
+                m_args));
+
+#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
+            std::cout << __func__ << " blockSharedMemDynSizeBytes: " << blockSharedMemDynSizeBytes << " B"
+                      << std::endl;
+#    endif
+            // Bind all arguments except the accelerator.
+            // TODO: With C++14 we could create a perfectly argument forwarding function object within the constructor.
+            auto const boundKernelFnObj(meta::apply(
+                [this](ALPAKA_DECAY_T(TArgs) const&... args) {
+                    return std::bind(std::ref(m_kernelFnObj), std::placeholders::_1, std::ref(args)...);
+                },
+                m_args));
+
+            AccCpuSerial<TDim, TIdx> acc(
+                *static_cast<WorkDivMembers<TDim, TIdx> const*>(this),
+                blockSharedMemDynSizeBytes);
+
+            if(blockThreadExtent.prod() != static_cast<TIdx>(1u))
             {
-                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                auto const gridBlockExtent(
-                    workdiv::getWorkDiv<Grid, Blocks>(*this));
-                auto const blockThreadExtent(
-                    workdiv::getWorkDiv<Block, Threads>(*this));
-                auto const threadElemExtent(
-                    workdiv::getWorkDiv<Thread, Elems>(*this));
-
-                // Get the size of the block shared dynamic memory.
-                auto const blockSharedMemDynSizeBytes(
-                    meta::apply(
-                        [&](typename std::decay<TArgs>::type const & ... args)
-                        {
-                            return
-                                kernel::getBlockSharedMemDynSizeBytes<
-                                    acc::AccCpuSerial<TDim, TIdx>>(
-                                        m_kernelFnObj,
-                                        blockThreadExtent,
-                                        threadElemExtent,
-                                        args...);
-                        },
-                        m_args));
-
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                std::cout << __func__
-                    << " blockSharedMemDynSizeBytes: " << blockSharedMemDynSizeBytes << " B" << std::endl;
-#endif
-                // Bind all arguments except the accelerator.
-                // TODO: With C++14 we could create a perfectly argument forwarding function object within the constructor.
-                auto const boundKernelFnObj(
-                    meta::apply(
-                        [this](typename std::decay<TArgs>::type const & ... args)
-                        {
-                            return
-                                std::bind(
-                                    std::ref(m_kernelFnObj),
-                                    std::placeholders::_1,
-                                    std::ref(args)...);
-                        },
-                        m_args));
-
-                acc::AccCpuSerial<TDim, TIdx> acc(
-                    *static_cast<workdiv::WorkDivMembers<TDim, TIdx> const *>(this),
-                    blockSharedMemDynSizeBytes);
-
-                if(blockThreadExtent.prod() != static_cast<TIdx>(1u))
-                {
-                    throw std::runtime_error("A block for the serial accelerator can only ever have one single thread!");
-                }
-
-                // Execute the blocks serially.
-                meta::ndLoopIncIdx(
-                    gridBlockExtent,
-                    [&](vec::Vec<TDim, TIdx> const & blockThreadIdx)
-                    {
-                        acc.m_gridBlockIdx = blockThreadIdx;
-
-                        boundKernelFnObj(
-                            acc);
-
-                        // After a block has been processed, the shared memory has to be deleted.
-                        block::shared::st::freeMem(acc);
-                    });
+                throw std::runtime_error("A block for the serial accelerator can only ever have one single thread!");
             }
 
-        private:
-            TKernelFnObj m_kernelFnObj;
-            std::tuple<typename std::decay<TArgs>::type...> m_args;
-        };
-    }
+            // Execute the blocks serially.
+            meta::ndLoopIncIdx(gridBlockExtent, [&](Vec<TDim, TIdx> const& blockThreadIdx) {
+                acc.m_gridBlockIdx = blockThreadIdx;
 
-    namespace acc
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CPU serial execution task accelerator type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx,
-                typename TKernelFnObj,
-                typename... TArgs>
-            struct AccType<
-                kernel::TaskKernelCpuSerial<TDim, TIdx, TKernelFnObj, TArgs...>>
-            {
-                using type = acc::AccCpuSerial<TDim, TIdx>;
-            };
+                boundKernelFnObj(acc);
+
+                // After a block has been processed, the shared memory has to be deleted.
+                freeSharedVars(acc);
+            });
         }
-    }
-    namespace dev
+
+    private:
+        TKernelFnObj m_kernelFnObj;
+        std::tuple<std::decay_t<TArgs>...> m_args;
+    };
+
+    namespace traits
     {
-        namespace traits
+        //#############################################################################
+        //! The CPU serial execution task accelerator type trait specialization.
+        template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
+        struct AccType<TaskKernelCpuSerial<TDim, TIdx, TKernelFnObj, TArgs...>>
         {
-            //#############################################################################
-            //! The CPU serial execution task device type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx,
-                typename TKernelFnObj,
-                typename... TArgs>
-            struct DevType<
-                kernel::TaskKernelCpuSerial<TDim, TIdx, TKernelFnObj, TArgs...>>
-            {
-                using type = dev::DevCpu;
-            };
-        }
-    }
-    namespace dim
-    {
-        namespace traits
+            using type = AccCpuSerial<TDim, TIdx>;
+        };
+
+        //#############################################################################
+        //! The CPU serial execution task device type trait specialization.
+        template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
+        struct DevType<TaskKernelCpuSerial<TDim, TIdx, TKernelFnObj, TArgs...>>
         {
-            //#############################################################################
-            //! The CPU serial execution task dimension getter trait specialization.
-            template<
-                typename TDim,
-                typename TIdx,
-                typename TKernelFnObj,
-                typename... TArgs>
-            struct DimType<
-                kernel::TaskKernelCpuSerial<TDim, TIdx, TKernelFnObj, TArgs...>>
-            {
-                using type = TDim;
-            };
-        }
-    }
-    namespace pltf
-    {
-        namespace traits
+            using type = DevCpu;
+        };
+
+        //#############################################################################
+        //! The CPU serial execution task dimension getter trait specialization.
+        template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
+        struct DimType<TaskKernelCpuSerial<TDim, TIdx, TKernelFnObj, TArgs...>>
         {
-            //#############################################################################
-            //! The CPU serial execution task platform type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx,
-                typename TKernelFnObj,
-                typename... TArgs>
-            struct PltfType<
-                kernel::TaskKernelCpuSerial<TDim, TIdx, TKernelFnObj, TArgs...>>
-            {
-                using type = pltf::PltfCpu;
-            };
-        }
-    }
-    namespace idx
-    {
-        namespace traits
+            using type = TDim;
+        };
+
+        //#############################################################################
+        //! The CPU serial execution task platform type trait specialization.
+        template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
+        struct PltfType<TaskKernelCpuSerial<TDim, TIdx, TKernelFnObj, TArgs...>>
         {
-            //#############################################################################
-            //! The CPU serial execution task idx type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx,
-                typename TKernelFnObj,
-                typename... TArgs>
-            struct IdxType<
-                kernel::TaskKernelCpuSerial<TDim, TIdx, TKernelFnObj, TArgs...>>
-            {
-                using type = TIdx;
-            };
-        }
-    }
-}
+            using type = PltfCpu;
+        };
+
+        //#############################################################################
+        //! The CPU serial execution task idx type trait specialization.
+        template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
+        struct IdxType<TaskKernelCpuSerial<TDim, TIdx, TKernelFnObj, TArgs...>>
+        {
+            using type = TIdx;
+        };
+    } // namespace traits
+} // namespace alpaka
 
 #endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/kernel/TaskKernelCpuTbbBlocks.hpp b/thirdParty/cupla/alpaka/include/alpaka/kernel/TaskKernelCpuTbbBlocks.hpp
index 2efbd29af9..f98296f3b0 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/kernel/TaskKernelCpuTbbBlocks.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/kernel/TaskKernelCpuTbbBlocks.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Benjamin Worpitz, Erik Zenker, René Widera
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -12,248 +12,165 @@
 #ifdef ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED
 
 // Specialized traits.
-#include <alpaka/acc/Traits.hpp>
-#include <alpaka/dev/Traits.hpp>
-#include <alpaka/dim/Traits.hpp>
-#include <alpaka/pltf/Traits.hpp>
-#include <alpaka/idx/Traits.hpp>
+#    include <alpaka/acc/Traits.hpp>
+#    include <alpaka/dev/Traits.hpp>
+#    include <alpaka/dim/Traits.hpp>
+#    include <alpaka/idx/Traits.hpp>
+#    include <alpaka/pltf/Traits.hpp>
 
 // Implementation details.
-#include <alpaka/acc/AccCpuTbbBlocks.hpp>
-#include <alpaka/dev/DevCpu.hpp>
-#include <alpaka/idx/MapIdx.hpp>
-#include <alpaka/kernel/Traits.hpp>
-#include <alpaka/workdiv/WorkDivMembers.hpp>
-
-#include <alpaka/meta/NdLoop.hpp>
-#include <alpaka/meta/ApplyTuple.hpp>
-
-#include <stdexcept>
-#include <tuple>
-#include <type_traits>
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
-    #include <iostream>
-#endif
-
-#include <tbb/parallel_for.h>
-#include <tbb/blocked_range.h>
-#include <tbb/task_group.h>
+#    include <alpaka/acc/AccCpuTbbBlocks.hpp>
+#    include <alpaka/core/Decay.hpp>
+#    include <alpaka/dev/DevCpu.hpp>
+#    include <alpaka/idx/MapIdx.hpp>
+#    include <alpaka/kernel/Traits.hpp>
+#    include <alpaka/meta/ApplyTuple.hpp>
+#    include <alpaka/meta/NdLoop.hpp>
+#    include <alpaka/workdiv/WorkDivMembers.hpp>
+
+#    include <functional>
+#    include <stdexcept>
+#    include <tuple>
+#    include <type_traits>
+#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
+#        include <iostream>
+#    endif
+
+#    include <tbb/blocked_range.h>
+#    include <tbb/parallel_for.h>
+#    include <tbb/task_group.h>
 
 namespace alpaka
 {
-    namespace kernel
+    //#############################################################################
+    //! The CPU TBB block accelerator execution task.
+    template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
+    class TaskKernelCpuTbbBlocks final : public WorkDivMembers<TDim, TIdx>
     {
-        //#############################################################################
-        //! The CPU TBB block accelerator execution task.
-        template<
-            typename TDim,
-            typename TIdx,
-            typename TKernelFnObj,
-            typename... TArgs>
-        class TaskKernelCpuTbbBlocks final :
-            public workdiv::WorkDivMembers<TDim, TIdx>
+    public:
+        //-----------------------------------------------------------------------------
+        template<typename TWorkDiv>
+        ALPAKA_FN_HOST TaskKernelCpuTbbBlocks(TWorkDiv&& workDiv, TKernelFnObj const& kernelFnObj, TArgs&&... args)
+            : WorkDivMembers<TDim, TIdx>(std::forward<TWorkDiv>(workDiv))
+            , m_kernelFnObj(kernelFnObj)
+            , m_args(std::forward<TArgs>(args)...)
+        {
+            static_assert(
+                Dim<std::decay_t<TWorkDiv>>::value == TDim::value,
+                "The work division and the execution task have to be of the same dimensionality!");
+        }
+        //-----------------------------------------------------------------------------
+        TaskKernelCpuTbbBlocks(TaskKernelCpuTbbBlocks const&) = default;
+        //-----------------------------------------------------------------------------
+        TaskKernelCpuTbbBlocks(TaskKernelCpuTbbBlocks&&) = default;
+        //-----------------------------------------------------------------------------
+        auto operator=(TaskKernelCpuTbbBlocks const&) -> TaskKernelCpuTbbBlocks& = default;
+        //-----------------------------------------------------------------------------
+        auto operator=(TaskKernelCpuTbbBlocks&&) -> TaskKernelCpuTbbBlocks& = default;
+        //-----------------------------------------------------------------------------
+        ~TaskKernelCpuTbbBlocks() = default;
+
+        //-----------------------------------------------------------------------------
+        //! Executes the kernel function object.
+        ALPAKA_FN_HOST auto operator()() const -> void
         {
-        public:
-            //-----------------------------------------------------------------------------
-            template<
-                typename TWorkDiv>
-            ALPAKA_FN_HOST TaskKernelCpuTbbBlocks(
-                TWorkDiv && workDiv,
-                TKernelFnObj const & kernelFnObj,
-                TArgs && ... args) :
-                    workdiv::WorkDivMembers<TDim, TIdx>(std::forward<TWorkDiv>(workDiv)),
-                    m_kernelFnObj(kernelFnObj),
-                    m_args(std::forward<TArgs>(args)...)
+            ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+            auto const gridBlockExtent(getWorkDiv<Grid, Blocks>(*this));
+            auto const blockThreadExtent(getWorkDiv<Block, Threads>(*this));
+            auto const threadElemExtent(getWorkDiv<Thread, Elems>(*this));
+
+            // Get the size of the block shared dynamic memory.
+            auto const blockSharedMemDynSizeBytes(meta::apply(
+                [&](ALPAKA_DECAY_T(TArgs) const&... args) {
+                    return getBlockSharedMemDynSizeBytes<AccCpuTbbBlocks<TDim, TIdx>>(
+                        m_kernelFnObj,
+                        blockThreadExtent,
+                        threadElemExtent,
+                        args...);
+                },
+                m_args));
+
+#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
+            std::cout << __func__ << " blockSharedMemDynSizeBytes: " << blockSharedMemDynSizeBytes << " B"
+                      << std::endl;
+#    endif
+            // Bind all arguments except the accelerator.
+            // TODO: C++14 is available now: create a perfectly forwarding function object within the constructor.
+            auto const boundKernelFnObj(meta::apply(
+                [this](ALPAKA_DECAY_T(TArgs) const&... args) {
+                    return std::bind(std::ref(m_kernelFnObj), std::placeholders::_1, std::ref(args)...);
+                },
+                m_args));
+
+            // The number of blocks in the grid.
+            TIdx const numBlocksInGrid(gridBlockExtent.prod());
+
+            if(blockThreadExtent.prod() != static_cast<TIdx>(1u))
             {
-                static_assert(
-                    dim::Dim<typename std::decay<TWorkDiv>::type>::value == TDim::value,
-                    "The work division and the execution task have to be of the same dimensionality!");
+                throw std::runtime_error("A block for the TBB accelerator can only ever have one single thread!");
             }
-            //-----------------------------------------------------------------------------
-            TaskKernelCpuTbbBlocks(TaskKernelCpuTbbBlocks const &) = default;
-            //-----------------------------------------------------------------------------
-            TaskKernelCpuTbbBlocks(TaskKernelCpuTbbBlocks &&) = default;
-            //-----------------------------------------------------------------------------
-            auto operator=(TaskKernelCpuTbbBlocks const &) -> TaskKernelCpuTbbBlocks & = default;
-            //-----------------------------------------------------------------------------
-            auto operator=(TaskKernelCpuTbbBlocks &&) -> TaskKernelCpuTbbBlocks & = default;
-            //-----------------------------------------------------------------------------
-            ~TaskKernelCpuTbbBlocks() = default;
-
-            //-----------------------------------------------------------------------------
-            //! Executes the kernel function object.
-            ALPAKA_FN_HOST auto operator()() const
-            -> void
-            {
-                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                auto const gridBlockExtent(
-                    workdiv::getWorkDiv<Grid, Blocks>(*this));
-                auto const blockThreadExtent(
-                    workdiv::getWorkDiv<Block, Threads>(*this));
-                auto const threadElemExtent(
-                    workdiv::getWorkDiv<Thread, Elems>(*this));
-
-                // Get the size of the block shared dynamic memory.
-                auto const blockSharedMemDynSizeBytes(
-                    meta::apply(
-                        [&](typename std::decay<TArgs>::type const & ... args)
-                        {
-                            return
-                                kernel::getBlockSharedMemDynSizeBytes<
-                                    acc::AccCpuTbbBlocks<TDim, TIdx>>(
-                                        m_kernelFnObj,
-                                        blockThreadExtent,
-                                        threadElemExtent,
-                                        args...);
-                        },
-                        m_args));
-
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                std::cout << __func__
-                    << " blockSharedMemDynSizeBytes: " << blockSharedMemDynSizeBytes << " B" << std::endl;
-#endif
-                // Bind all arguments except the accelerator.
-                // TODO: With C++14 we could create a perfectly argument forwarding function object within the constructor.
-                auto const boundKernelFnObj(
-                    meta::apply(
-                        [this](typename std::decay<TArgs>::type const & ... args)
-                        {
-                            return
-                                std::bind(
-                                    std::ref(m_kernelFnObj),
-                                    std::placeholders::_1,
-                                    std::ref(args)...);
-                        },
-                        m_args));
-
-                // The number of blocks in the grid.
-                TIdx const numBlocksInGrid(gridBlockExtent.prod());
-
-                if(blockThreadExtent.prod() != static_cast<TIdx>(1u))
-                {
-                    throw std::runtime_error("A block for the TBB accelerator can only ever have one single thread!");
-                }
 
-                tbb::parallel_for(
-                    static_cast<TIdx>(0),
-                    static_cast<TIdx>(numBlocksInGrid),
-                    [&](TIdx i){
-                         acc::AccCpuTbbBlocks<TDim, TIdx> acc(
-                             *static_cast<workdiv::WorkDivMembers<TDim, TIdx> const *>(this),
-                             blockSharedMemDynSizeBytes);
+            tbb::parallel_for(static_cast<TIdx>(0), static_cast<TIdx>(numBlocksInGrid), [&](TIdx i) {
+                AccCpuTbbBlocks<TDim, TIdx> acc(
+                    *static_cast<WorkDivMembers<TDim, TIdx> const*>(this),
+                    blockSharedMemDynSizeBytes);
 
-                         acc.m_gridBlockIdx =
-                             idx::mapIdx<TDim::value>(
-                                 vec::Vec<dim::DimInt<1u>, TIdx>(
-                                     static_cast<TIdx>(i)
-                                  ),
-                                  gridBlockExtent
-                             );
+                acc.m_gridBlockIdx = mapIdx<TDim::value>(Vec<DimInt<1u>, TIdx>(static_cast<TIdx>(i)), gridBlockExtent);
 
-                         boundKernelFnObj(acc);
+                boundKernelFnObj(acc);
 
-                         block::shared::st::freeMem(acc);
-                });
-
-            }
+                freeSharedVars(acc);
+            });
+        }
 
-        private:
-            TKernelFnObj m_kernelFnObj;
-            std::tuple<typename std::decay<TArgs>::type...> m_args;
-        };
-    }
+    private:
+        TKernelFnObj m_kernelFnObj;
+        std::tuple<std::decay_t<TArgs>...> m_args;
+    };
 
-    namespace acc
+    namespace traits
     {
-        namespace traits
+        //#############################################################################
+        //! The CPU TBB block execution task accelerator type trait specialization.
+        template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
+        struct AccType<TaskKernelCpuTbbBlocks<TDim, TIdx, TKernelFnObj, TArgs...>>
         {
-            //#############################################################################
-            //! The CPU TBB block execution task accelerator type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx,
-                typename TKernelFnObj,
-                typename... TArgs>
-            struct AccType<
-                kernel::TaskKernelCpuTbbBlocks<TDim, TIdx, TKernelFnObj, TArgs...>>
-            {
-                using type = acc::AccCpuTbbBlocks<TDim, TIdx>;
-            };
-        }
-    }
-    namespace dev
-    {
-        namespace traits
+            using type = AccCpuTbbBlocks<TDim, TIdx>;
+        };
+
+        //#############################################################################
+        //! The CPU TBB block execution task device type trait specialization.
+        template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
+        struct DevType<TaskKernelCpuTbbBlocks<TDim, TIdx, TKernelFnObj, TArgs...>>
         {
-            //#############################################################################
-            //! The CPU TBB block execution task device type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx,
-                typename TKernelFnObj,
-                typename... TArgs>
-            struct DevType<
-                kernel::TaskKernelCpuTbbBlocks<TDim, TIdx, TKernelFnObj, TArgs...>>
-            {
-                using type = dev::DevCpu;
-            };
-        }
-    }
-    namespace dim
-    {
-        namespace traits
+            using type = DevCpu;
+        };
+
+        //#############################################################################
+        //! The CPU TBB block execution task dimension getter trait specialization.
+        template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
+        struct DimType<TaskKernelCpuTbbBlocks<TDim, TIdx, TKernelFnObj, TArgs...>>
         {
-            //#############################################################################
-            //! The CPU TBB block execution task dimension getter trait specialization.
-            template<
-                typename TDim,
-                typename TIdx,
-                typename TKernelFnObj,
-                typename... TArgs>
-            struct DimType<
-                kernel::TaskKernelCpuTbbBlocks<TDim, TIdx, TKernelFnObj, TArgs...>>
-            {
-                using type = TDim;
-            };
-        }
-    }
-    namespace pltf
-    {
-        namespace traits
+            using type = TDim;
+        };
+
+        //#############################################################################
+        //! The CPU TBB block execution task platform type trait specialization.
+        template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
+        struct PltfType<TaskKernelCpuTbbBlocks<TDim, TIdx, TKernelFnObj, TArgs...>>
         {
-            //#############################################################################
-            //! The CPU TBB block execution task platform type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx,
-                typename TKernelFnObj,
-                typename... TArgs>
-            struct PltfType<
-                kernel::TaskKernelCpuTbbBlocks<TDim, TIdx, TKernelFnObj, TArgs...>>
-            {
-                using type = pltf::PltfCpu;
-            };
-        }
-    }
-    namespace idx
-    {
-        namespace traits
+            using type = PltfCpu;
+        };
+
+        //#############################################################################
+        //! The CPU TBB block execution task idx type trait specialization.
+        template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
+        struct IdxType<TaskKernelCpuTbbBlocks<TDim, TIdx, TKernelFnObj, TArgs...>>
         {
-            //#############################################################################
-            //! The CPU TBB block execution task idx type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx,
-                typename TKernelFnObj,
-                typename... TArgs>
-            struct IdxType<
-                kernel::TaskKernelCpuTbbBlocks<TDim, TIdx, TKernelFnObj, TArgs...>>
-            {
-                using type = TIdx;
-            };
-        }
-    }
-}
+            using type = TIdx;
+        };
+    } // namespace traits
+} // namespace alpaka
 
 #endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/kernel/TaskKernelCpuThreads.hpp b/thirdParty/cupla/alpaka/include/alpaka/kernel/TaskKernelCpuThreads.hpp
index 0b9cb85aec..0d23c39b38 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/kernel/TaskKernelCpuThreads.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/kernel/TaskKernelCpuThreads.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Benjamin Worpitz, René Widera
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -12,383 +12,289 @@
 #ifdef ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED
 
 // Specialized traits.
-#include <alpaka/acc/Traits.hpp>
-#include <alpaka/dev/Traits.hpp>
-#include <alpaka/dim/Traits.hpp>
-#include <alpaka/pltf/Traits.hpp>
-#include <alpaka/idx/Traits.hpp>
+#    include <alpaka/acc/Traits.hpp>
+#    include <alpaka/dev/Traits.hpp>
+#    include <alpaka/dim/Traits.hpp>
+#    include <alpaka/idx/Traits.hpp>
+#    include <alpaka/pltf/Traits.hpp>
 
 // Implementation details.
-#include <alpaka/acc/AccCpuThreads.hpp>
-#include <alpaka/dev/DevCpu.hpp>
-#include <alpaka/kernel/Traits.hpp>
-#include <alpaka/workdiv/WorkDivMembers.hpp>
-
-#include <alpaka/core/BoostPredef.hpp>
-#include <alpaka/core/ConcurrentExecPool.hpp>
-#include <alpaka/meta/NdLoop.hpp>
-#include <alpaka/meta/ApplyTuple.hpp>
-
-#include <algorithm>
-#include <thread>
-#include <vector>
-#include <tuple>
-#include <type_traits>
-#include <future>
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
-    #include <iostream>
-#endif
+#    include <alpaka/acc/AccCpuThreads.hpp>
+#    include <alpaka/core/BoostPredef.hpp>
+#    include <alpaka/core/ConcurrentExecPool.hpp>
+#    include <alpaka/core/Decay.hpp>
+#    include <alpaka/dev/DevCpu.hpp>
+#    include <alpaka/kernel/Traits.hpp>
+#    include <alpaka/meta/ApplyTuple.hpp>
+#    include <alpaka/meta/NdLoop.hpp>
+#    include <alpaka/workdiv/WorkDivMembers.hpp>
+
+#    include <algorithm>
+#    include <functional>
+#    include <future>
+#    include <thread>
+#    include <tuple>
+#    include <type_traits>
+#    include <vector>
+#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
+#        include <iostream>
+#    endif
 
 namespace alpaka
 {
-    namespace kernel
+    //#############################################################################
+    //! The CPU threads execution task.
+    template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
+    class TaskKernelCpuThreads final : public WorkDivMembers<TDim, TIdx>
     {
+    private:
         //#############################################################################
-        //! The CPU threads execution task.
-        template<
-            typename TDim,
-            typename TIdx,
-            typename TKernelFnObj,
-            typename... TArgs>
-        class TaskKernelCpuThreads final :
-            public workdiv::WorkDivMembers<TDim, TIdx>
+        //! The type given to the ConcurrentExecPool for yielding the current thread.
+        struct ThreadPoolYield
         {
-        private:
-            //#############################################################################
-            //! The type given to the ConcurrentExecPool for yielding the current thread.
-            struct ThreadPoolYield
-            {
-                //-----------------------------------------------------------------------------
-                //! Yields the current thread.
-                ALPAKA_FN_HOST static auto yield()
-                -> void
-                {
-                    std::this_thread::yield();
-                }
-            };
-            //#############################################################################
-            // When using the thread pool the threads are yielding because this is faster.
-            // Using condition variables and going to sleep is very costly for real threads.
-            // Especially when the time to wait is really short (syncBlockThreads) yielding is much faster.
-            using ThreadPool = alpaka::core::detail::ConcurrentExecPool<
-                TIdx,
-                std::thread,        // The concurrent execution type.
-                std::promise,       // The promise type.
-                ThreadPoolYield>;   // The type yielding the current concurrent execution.
-
-        public:
             //-----------------------------------------------------------------------------
-            template<
-                typename TWorkDiv>
-            ALPAKA_FN_HOST TaskKernelCpuThreads(
-                TWorkDiv && workDiv,
-                TKernelFnObj const & kernelFnObj,
-                TArgs && ... args) :
-                    workdiv::WorkDivMembers<TDim, TIdx>(std::forward<TWorkDiv>(workDiv)),
-                    m_kernelFnObj(kernelFnObj),
-                    m_args(std::forward<TArgs>(args)...)
+            //! Yields the current thread.
+            ALPAKA_FN_HOST static auto yield() -> void
             {
-                static_assert(
-                    dim::Dim<typename std::decay<TWorkDiv>::type>::value == TDim::value,
-                    "The work division and the execution task have to be of the same dimensionality!");
+                std::this_thread::yield();
             }
-            //-----------------------------------------------------------------------------
-            TaskKernelCpuThreads(TaskKernelCpuThreads const &) = default;
-            //-----------------------------------------------------------------------------
-            TaskKernelCpuThreads(TaskKernelCpuThreads &&) = default;
-            //-----------------------------------------------------------------------------
-            auto operator=(TaskKernelCpuThreads const &) -> TaskKernelCpuThreads & = default;
-            //-----------------------------------------------------------------------------
-            auto operator=(TaskKernelCpuThreads &&) -> TaskKernelCpuThreads & = default;
-            //-----------------------------------------------------------------------------
-            ~TaskKernelCpuThreads() = default;
+        };
+        //#############################################################################
+        // When using the thread pool the threads are yielding because this is faster.
+        // Using condition variables and going to sleep is very costly for real threads.
+        // Especially when the time to wait is really short (syncBlockThreads) yielding is much faster.
+        using ThreadPool = alpaka::core::detail::ConcurrentExecPool<
+            TIdx,
+            std::thread, // The concurrent execution type.
+            std::promise, // The promise type.
+            ThreadPoolYield>; // The type yielding the current concurrent execution.
 
-            //-----------------------------------------------------------------------------
-            //! Executes the kernel function object.
-            ALPAKA_FN_HOST auto operator()() const
-            -> void
-            {
-                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                auto const gridBlockExtent(
-                    workdiv::getWorkDiv<Grid, Blocks>(*this));
-                auto const blockThreadExtent(
-                    workdiv::getWorkDiv<Block, Threads>(*this));
-                auto const threadElemExtent(
-                    workdiv::getWorkDiv<Thread, Elems>(*this));
-
-                // Get the size of the block shared dynamic memory.
-                auto const blockSharedMemDynSizeBytes(
-                    meta::apply(
-                        [&](typename std::decay<TArgs>::type const & ... args)
-                        {
-                            return
-                                kernel::getBlockSharedMemDynSizeBytes<
-                                    acc::AccCpuThreads<TDim, TIdx>>(
-                                        m_kernelFnObj,
-                                        blockThreadExtent,
-                                        threadElemExtent,
-                                        args...);
-                        },
-                        m_args));
-
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                std::cout << __func__
-                    << " blockSharedMemDynSizeBytes: " << blockSharedMemDynSizeBytes << " B" << std::endl;
-#endif
-                acc::AccCpuThreads<TDim, TIdx> acc(
-                    *static_cast<workdiv::WorkDivMembers<TDim, TIdx> const *>(this),
-                    blockSharedMemDynSizeBytes);
-
-                auto const blockThreadCount(blockThreadExtent.prod());
-                ThreadPool threadPool(blockThreadCount);
-
-                // Bind the kernel and its arguments to the grid block function.
-                auto const boundGridBlockExecHost(
-                    meta::apply(
-                        [this, &acc, &blockThreadExtent, &threadPool](typename std::decay<TArgs>::type const & ... args)
-                        {
-                            return
-                                std::bind(
-                                    &TaskKernelCpuThreads::gridBlockExecHost,
-                                    std::ref(acc),
-                                    std::placeholders::_1,
-                                    std::ref(blockThreadExtent),
-                                    std::ref(threadPool),
-                                    std::ref(m_kernelFnObj),
-                                    std::ref(args)...);
-                        },
-                        m_args));
-
-                // Execute the blocks serially.
-                meta::ndLoopIncIdx(
-                    gridBlockExtent,
-                    boundGridBlockExecHost);
-            }
+    public:
+        //-----------------------------------------------------------------------------
+        template<typename TWorkDiv>
+        ALPAKA_FN_HOST TaskKernelCpuThreads(TWorkDiv&& workDiv, TKernelFnObj const& kernelFnObj, TArgs&&... args)
+            : WorkDivMembers<TDim, TIdx>(std::forward<TWorkDiv>(workDiv))
+            , m_kernelFnObj(kernelFnObj)
+            , m_args(std::forward<TArgs>(args)...)
+        {
+            static_assert(
+                Dim<std::decay_t<TWorkDiv>>::value == TDim::value,
+                "The work division and the execution task have to be of the same dimensionality!");
+        }
+        //-----------------------------------------------------------------------------
+        TaskKernelCpuThreads(TaskKernelCpuThreads const&) = default;
+        //-----------------------------------------------------------------------------
+        TaskKernelCpuThreads(TaskKernelCpuThreads&&) = default;
+        //-----------------------------------------------------------------------------
+        auto operator=(TaskKernelCpuThreads const&) -> TaskKernelCpuThreads& = default;
+        //-----------------------------------------------------------------------------
+        auto operator=(TaskKernelCpuThreads&&) -> TaskKernelCpuThreads& = default;
+        //-----------------------------------------------------------------------------
+        ~TaskKernelCpuThreads() = default;
 
-        private:
-            //-----------------------------------------------------------------------------
-            //! The function executed for each grid block.
-            ALPAKA_FN_HOST static auto gridBlockExecHost(
-                acc::AccCpuThreads<TDim, TIdx> & acc,
-                vec::Vec<TDim, TIdx> const & gridBlockIdx,
-                vec::Vec<TDim, TIdx> const & blockThreadExtent,
-                ThreadPool & threadPool,
-                TKernelFnObj const & kernelFnObj,
-                typename std::decay<TArgs>::type const & ... args)
-            -> void
-            {
-                    // The futures of the threads in the current block.
-                std::vector<std::future<void>> futuresInBlock;
-
-                // Set the index of the current block
-                acc.m_gridBlockIdx = gridBlockIdx;
-
-                // Bind the kernel and its arguments to the host block thread execution function.
-                auto boundBlockThreadExecHost(std::bind(
-                    &TaskKernelCpuThreads::blockThreadExecHost,
-                    std::ref(acc),
-                    std::ref(futuresInBlock),
-                    std::placeholders::_1,
-                    std::ref(threadPool),
-                    std::ref(kernelFnObj),
-                    std::ref(args)...));
-                // Execute the block threads in parallel.
-                meta::ndLoopIncIdx(
-                    blockThreadExtent,
-                    boundBlockThreadExecHost);
+        //-----------------------------------------------------------------------------
+        //! Executes the kernel function object: the grid blocks are processed one after the other on the host.
+        ALPAKA_FN_HOST auto operator()() const -> void
+        {
+            ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+            auto const gridBlockExtent(getWorkDiv<Grid, Blocks>(*this));
+            auto const blockThreadExtent(getWorkDiv<Block, Threads>(*this));
+            auto const threadElemExtent(getWorkDiv<Thread, Elems>(*this));
+
+            // Query the size in bytes of the dynamic block shared memory for this kernel and its arguments.
+            auto const blockSharedMemDynSizeBytes(meta::apply(
+                [&](ALPAKA_DECAY_T(TArgs) const&... args) {
+                    return getBlockSharedMemDynSizeBytes<AccCpuThreads<TDim, TIdx>>(
+                        m_kernelFnObj,
+                        blockThreadExtent,
+                        threadElemExtent,
+                        args...);
+                },
+                m_args));
+
+#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
+            std::cout << __func__ << " blockSharedMemDynSizeBytes: " << blockSharedMemDynSizeBytes << " B"
+                      << std::endl;
+#    endif
+            AccCpuThreads<TDim, TIdx> acc(
+                *static_cast<WorkDivMembers<TDim, TIdx> const*>(this),
+                blockSharedMemDynSizeBytes);
+
+            auto const blockThreadCount(blockThreadExtent.prod());
+            ThreadPool threadPool(blockThreadCount);
+
+            // Bind everything except the grid block index (placeholder _1) to the grid block function.
+            auto const boundGridBlockExecHost(meta::apply(
+                [this, &acc, &blockThreadExtent, &threadPool](ALPAKA_DECAY_T(TArgs) const&... args) {
+                    return std::bind(
+                        &TaskKernelCpuThreads::gridBlockExecHost,
+                        std::ref(acc),
+                        std::placeholders::_1,
+                        std::ref(blockThreadExtent),
+                        std::ref(threadPool),
+                        std::ref(m_kernelFnObj),
+                        std::ref(args)...);
+                },
+                m_args));
+
+            // Execute the blocks serially.
+            meta::ndLoopIncIdx(gridBlockExtent, boundGridBlockExecHost);
+        }
+
+    private:
+        //-----------------------------------------------------------------------------
+        //! The function executed for each grid block; it dispatches the block threads and cleans up afterwards.
+        ALPAKA_FN_HOST static auto gridBlockExecHost(
+            AccCpuThreads<TDim, TIdx>& acc,
+            Vec<TDim, TIdx> const& gridBlockIdx,
+            Vec<TDim, TIdx> const& blockThreadExtent,
+            ThreadPool& threadPool,
+            TKernelFnObj const& kernelFnObj,
+            std::decay_t<TArgs> const&... args) -> void
+        {
+            // The futures of the threads in the current block.
+            std::vector<std::future<void>> futuresInBlock;
+
+            // Set the index of the current block.
+            acc.m_gridBlockIdx = gridBlockIdx;
+
+            // Bind all but the block thread index (placeholder _1) to the host block thread execution function.
+            auto boundBlockThreadExecHost(std::bind(
+                &TaskKernelCpuThreads::blockThreadExecHost,
+                std::ref(acc),
+                std::ref(futuresInBlock),
+                std::placeholders::_1,
+                std::ref(threadPool),
+                std::ref(kernelFnObj),
+                std::ref(args)...));
+            // Execute the block threads in parallel.
+            meta::ndLoopIncIdx(blockThreadExtent, boundBlockThreadExecHost);
 // Workaround: Clang can not support this when natively compiling device code. See ConcurrentExecPool.hpp.
-#if !(BOOST_COMP_CLANG_CUDA && BOOST_ARCH_PTX)
-                // Wait for the completion of the block thread kernels.
-                std::for_each(
-                    futuresInBlock.begin(),
-                    futuresInBlock.end(),
-                    [](std::future<void> & t)
-                    {
-                        t.wait();
-                    }
-                );
-#endif
-                // Clean up.
-                futuresInBlock.clear();
+#    if !(BOOST_COMP_CLANG_CUDA && BOOST_ARCH_PTX)
+            // Wait for the completion of the block thread kernels.
+            std::for_each(futuresInBlock.begin(), futuresInBlock.end(), [](std::future<void>& t) { t.wait(); });
+#    endif
+            // Clean up the per-block state.
+            futuresInBlock.clear();
 
-                acc.m_threadToIndexMap.clear();
+            acc.m_threadToIndexMap.clear();
 
-                // After a block has been processed, the shared memory has to be deleted.
-                block::shared::st::freeMem(acc);
-            }
-            //-----------------------------------------------------------------------------
-            //! The function executed for each block thread on the host.
-            ALPAKA_FN_HOST static auto blockThreadExecHost(
-                acc::AccCpuThreads<TDim, TIdx> & acc,
-#if !(BOOST_COMP_CLANG_CUDA && BOOST_ARCH_PTX)
-                std::vector<std::future<void>> & futuresInBlock,
-                vec::Vec<TDim, TIdx> const & blockThreadIdx,
-                ThreadPool & threadPool,
-#else
-                std::vector<std::future<void>> &,
-                vec::Vec<TDim, TIdx> const & blockThreadIdx,
-                ThreadPool &,
-#endif
-                TKernelFnObj const & kernelFnObj,
-                typename std::decay<TArgs>::type const & ... args)
-            -> void
-            {
-                // Bind the arguments to the accelerator block thread execution function.
-                // The blockThreadIdx is required to be copied in because the variable will get changed for the next iteration/thread.
-                auto boundBlockThreadExecAcc(
-                    [&, blockThreadIdx]()
-                    {
-                        blockThreadExecAcc(
-                            acc,
-                            blockThreadIdx,
-                            kernelFnObj,
-                            args...);
-                    });
-                // Add the bound function to the block thread pool.
+            // After a block has been processed, its shared memory has to be freed.
+            freeSharedVars(acc);
+        }
+        //-----------------------------------------------------------------------------
+        //! The function executed for each block thread on the host.
+        ALPAKA_FN_HOST static auto blockThreadExecHost(
+            AccCpuThreads<TDim, TIdx>& acc,
+#    if !(BOOST_COMP_CLANG_CUDA && BOOST_ARCH_PTX)
+            std::vector<std::future<void>>& futuresInBlock,
+            Vec<TDim, TIdx> const& blockThreadIdx,
+            ThreadPool& threadPool,
+#    else
+            std::vector<std::future<void>>&,
+            Vec<TDim, TIdx> const& blockThreadIdx,
+            ThreadPool&,
+#    endif
+            TKernelFnObj const& kernelFnObj,
+            std::decay_t<TArgs> const&... args) -> void
+        {
+            // Bind the arguments to the accelerator block thread execution function.
+            // The blockThreadIdx has to be captured by value because the referenced variable is changed for the next
+            // iteration/thread.
+            auto boundBlockThreadExecAcc(
+                [&, blockThreadIdx]() { blockThreadExecAcc(acc, blockThreadIdx, kernelFnObj, args...); });
+            // Add the bound function to the block thread pool and keep its future for the later wait.
 // Workaround: Clang can not support this when natively compiling device code. See ConcurrentExecPool.hpp.
-#if !(BOOST_COMP_CLANG_CUDA && BOOST_ARCH_PTX)
-                futuresInBlock.emplace_back(
-                    threadPool.enqueueTask(
-                        boundBlockThreadExecAcc));
-#else
-                (void)boundBlockThreadExecAcc;
-#endif
+#    if !(BOOST_COMP_CLANG_CUDA && BOOST_ARCH_PTX)
+            futuresInBlock.emplace_back(threadPool.enqueueTask(boundBlockThreadExecAcc));
+#    else
+            (void) boundBlockThreadExecAcc;
+#    endif
+        }
+        //-----------------------------------------------------------------------------
+        //! The thread entry point on the accelerator.
+        ALPAKA_FN_HOST static auto blockThreadExecAcc(
+            AccCpuThreads<TDim, TIdx>& acc,
+            Vec<TDim, TIdx> const& blockThreadIdx,
+            TKernelFnObj const& kernelFnObj,
+            std::decay_t<TArgs> const&... args) -> void
+        {
+            // We have to store the thread data before the kernel calls any of the methods of this class that depend
+            // on it.
+            auto const threadId(std::this_thread::get_id());
+
+            // Set the master thread id (the thread whose block thread index sums to zero).
+            if(blockThreadIdx.sum() == 0)
+            {
+                acc.m_idMasterThread = threadId;
             }
-            //-----------------------------------------------------------------------------
-            //! The thread entry point on the accelerator.
-            ALPAKA_FN_HOST static auto blockThreadExecAcc(
-                acc::AccCpuThreads<TDim, TIdx> & acc,
-                vec::Vec<TDim, TIdx> const & blockThreadIdx,
-                TKernelFnObj const & kernelFnObj,
-                typename std::decay<TArgs>::type const & ... args)
-            -> void
+
             {
-                // We have to store the thread data before the kernel is calling any of the methods of this class depending on them.
-                auto const threadId(std::this_thread::get_id());
-
-                // Set the master thread id.
-                if(blockThreadIdx.sum() == 0)
-                {
-                    acc.m_idMasterThread = threadId;
-                }
-
-                {
-                    // The insertion of elements has to be done one thread at a time.
-                    std::lock_guard<std::mutex> lock(acc.m_mtxMapInsert);
-
-                    // Save the thread id, and index.
-                    acc.m_threadToIndexMap.emplace(threadId, blockThreadIdx);
-                }
-
-                // Sync all threads so that the maps with thread id's are complete and not changed after here.
-                syncBlockThreads(acc);
-
-                // Execute the kernel itself.
-                kernelFnObj(
-                    const_cast<acc::AccCpuThreads<TDim, TIdx> const &>(acc),
-                    args...);
-
-                // We have to sync all threads here because if a thread would finish before all threads have been started,
-                // a new thread could get the recycled (then duplicate) thread id!
-                syncBlockThreads(acc);
+                // The insertion of elements has to be done one thread at a time.
+                std::lock_guard<std::mutex> lock(acc.m_mtxMapInsert);
+
+                // Save the mapping from the thread id to the block thread index.
+                acc.m_threadToIndexMap.emplace(threadId, blockThreadIdx);
             }
 
-            TKernelFnObj m_kernelFnObj;
-            std::tuple<typename std::decay<TArgs>::type...> m_args;
-        };
-    }
+            // Sync all threads so that the map of thread ids is complete and no longer changes from here on.
+            syncBlockThreads(acc);
 
-    namespace acc
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CPU threads execution task accelerator type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx,
-                typename TKernelFnObj,
-                typename... TArgs>
-            struct AccType<
-                kernel::TaskKernelCpuThreads<TDim, TIdx, TKernelFnObj, TArgs...>>
-            {
-                using type = acc::AccCpuThreads<TDim, TIdx>;
-            };
+            // Execute the kernel itself.
+            kernelFnObj(const_cast<AccCpuThreads<TDim, TIdx> const&>(acc), args...);
+
+            // We have to sync all threads here because if a thread finished before all threads have been started,
+            // a new thread could get the recycled (then duplicate) thread id!
+            syncBlockThreads(acc);
         }
-    }
-    namespace dev
+
+        TKernelFnObj m_kernelFnObj;
+        std::tuple<std::decay_t<TArgs>...> m_args;
+    };
+
+    namespace traits
     {
-        namespace traits
+        //#############################################################################
+        //! The CPU threads execution task accelerator type trait specialization.
+        template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
+        struct AccType<TaskKernelCpuThreads<TDim, TIdx, TKernelFnObj, TArgs...>>
         {
-            //#############################################################################
-            //! The CPU threads execution task device type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx,
-                typename TKernelFnObj,
-                typename... TArgs>
-            struct DevType<
-                kernel::TaskKernelCpuThreads<TDim, TIdx, TKernelFnObj, TArgs...>>
-            {
-                using type = dev::DevCpu;
-            };
-        }
-    }
-    namespace dim
-    {
-        namespace traits
+            using type = AccCpuThreads<TDim, TIdx>;
+        };
+
+        //#############################################################################
+        //! The CPU threads execution task device type trait specialization.
+        template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
+        struct DevType<TaskKernelCpuThreads<TDim, TIdx, TKernelFnObj, TArgs...>>
         {
-            //#############################################################################
-            //! The CPU threads execution task dimension getter trait specialization.
-            template<
-                typename TDim,
-                typename TIdx,
-                typename TKernelFnObj,
-                typename... TArgs>
-            struct DimType<
-                kernel::TaskKernelCpuThreads<TDim, TIdx, TKernelFnObj, TArgs...>>
-            {
-                using type = TDim;
-            };
-        }
-    }
-    namespace pltf
-    {
-        namespace traits
+            using type = DevCpu;
+        };
+
+        //#############################################################################
+        //! The CPU threads execution task dimension type trait specialization.
+        template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
+        struct DimType<TaskKernelCpuThreads<TDim, TIdx, TKernelFnObj, TArgs...>>
         {
-            //#############################################################################
-            //! The CPU threads execution task platform type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx,
-                typename TKernelFnObj,
-                typename... TArgs>
-            struct PltfType<
-                kernel::TaskKernelCpuThreads<TDim, TIdx, TKernelFnObj, TArgs...>>
-            {
-                using type = pltf::PltfCpu;
-            };
-        }
-    }
-    namespace idx
-    {
-        namespace traits
+            using type = TDim;
+        };
+
+        //#############################################################################
+        //! The CPU threads execution task platform type trait specialization.
+        template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
+        struct PltfType<TaskKernelCpuThreads<TDim, TIdx, TKernelFnObj, TArgs...>>
         {
-            //#############################################################################
-            //! The CPU threads execution task idx type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx,
-                typename TKernelFnObj,
-                typename... TArgs>
-            struct IdxType<
-                kernel::TaskKernelCpuThreads<TDim, TIdx, TKernelFnObj, TArgs...>>
-            {
-                using type = TIdx;
-            };
-        }
-    }
-}
+            using type = PltfCpu;
+        };
+
+        //#############################################################################
+        //! The CPU threads execution task index type trait specialization.
+        template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
+        struct IdxType<TaskKernelCpuThreads<TDim, TIdx, TKernelFnObj, TArgs...>>
+        {
+            using type = TIdx;
+        };
+    } // namespace traits
+} // namespace alpaka
 
 #endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/kernel/TaskKernelGpuCudaRt.hpp b/thirdParty/cupla/alpaka/include/alpaka/kernel/TaskKernelGpuCudaRt.hpp
deleted file mode 100644
index a50b4d00bd..0000000000
--- a/thirdParty/cupla/alpaka/include/alpaka/kernel/TaskKernelGpuCudaRt.hpp
+++ /dev/null
@@ -1,524 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz, Erik Zenker, Matthias Werner, René Widera
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_CUDA
-    #error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
-#endif
-
-// Specialized traits.
-#include <alpaka/acc/Traits.hpp>
-#include <alpaka/dev/Traits.hpp>
-#include <alpaka/dim/Traits.hpp>
-#include <alpaka/pltf/Traits.hpp>
-#include <alpaka/idx/Traits.hpp>
-#include <alpaka/queue/Traits.hpp>
-
-// Implementation details.
-#include <alpaka/acc/AccGpuCudaRt.hpp>
-#include <alpaka/dev/DevCudaRt.hpp>
-#include <alpaka/kernel/Traits.hpp>
-#include <alpaka/queue/QueueCudaRtNonBlocking.hpp>
-#include <alpaka/queue/QueueCudaRtBlocking.hpp>
-#include <alpaka/workdiv/WorkDivMembers.hpp>
-
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
-    #include <alpaka/acc/Traits.hpp>
-    #include <alpaka/dev/Traits.hpp>
-    #include <alpaka/workdiv/WorkDivHelpers.hpp>
-#endif
-
-#include <alpaka/core/BoostPredef.hpp>
-#include <alpaka/core/Cuda.hpp>
-#include <alpaka/meta/ApplyTuple.hpp>
-#include <alpaka/meta/Metafunctions.hpp>
-
-#include <stdexcept>
-#include <tuple>
-#include <type_traits>
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
-    #include <iostream>
-#endif
-
-namespace alpaka
-{
-    namespace kernel
-    {
-        namespace cuda
-        {
-            namespace detail
-            {
-                //-----------------------------------------------------------------------------
-                //! The GPU CUDA kernel entry point.
-                // \NOTE: 'A __global__ function or function template cannot have a trailing return type.'
-                template<
-                    typename TDim,
-                    typename TIdx,
-                    typename TKernelFnObj,
-                    typename... TArgs>
-                __global__ void cudaKernel(
-                    vec::Vec<TDim, TIdx> const threadElemExtent,
-                    TKernelFnObj const kernelFnObj,
-                    TArgs ... args)
-                {
-#if BOOST_ARCH_PTX && (BOOST_ARCH_PTX < BOOST_VERSION_NUMBER(2, 0, 0))
-    #error "Cuda device capability >= 2.0 is required!"
-#endif
-
-// with clang it is not possible to query std::result_of for a pure device lambda created on the host side
-#if !(BOOST_COMP_CLANG_CUDA && BOOST_COMP_CLANG)
-                    static_assert(
-                        std::is_same<typename std::result_of<
-                            TKernelFnObj(acc::AccGpuCudaRt<TDim, TIdx> const &, TArgs const & ...)>::type, void>::value,
-                        "The TKernelFnObj is required to return void!");
-#endif
-                    acc::AccGpuCudaRt<TDim, TIdx> acc(threadElemExtent);
-
-                    kernelFnObj(
-                        const_cast<acc::AccGpuCudaRt<TDim, TIdx> const &>(acc),
-                        args...);
-                }
-
-                //-----------------------------------------------------------------------------
-                template<
-                    typename TDim,
-                    typename TIdx
-                >
-                ALPAKA_FN_HOST auto checkVecOnly3Dim(
-                    vec::Vec<TDim, TIdx> const & vec)
-                -> void
-                {
-                    for(auto i(std::min(static_cast<typename TDim::value_type>(3), TDim::value)); i<TDim::value; ++i)
-                    {
-                        if(vec[TDim::value-1u-i] != 1)
-                        {
-                            throw std::runtime_error("The CUDA accelerator supports a maximum of 3 dimensions. All work division extents of the dimensions higher 3 have to be 1!");
-                        }
-                    }
-                }
-
-                //-----------------------------------------------------------------------------
-                template<
-                    typename TDim,
-                    typename TIdx
-                >
-                ALPAKA_FN_HOST auto convertVecToCudaDim(
-                    vec::Vec<TDim, TIdx> const & vec)
-                -> dim3
-                {
-                    dim3 dim(1, 1, 1);
-                    for(auto i(static_cast<typename TDim::value_type>(0)); i<std::min(static_cast<typename TDim::value_type>(3), TDim::value); ++i)
-                    {
-                        reinterpret_cast<unsigned int *>(&dim)[i] = static_cast<unsigned int>(vec[TDim::value-1u-i]);
-                    }
-                    checkVecOnly3Dim(vec);
-                    return dim;
-                }
-            }
-        }
-
-        //#############################################################################
-        //! The GPU CUDA accelerator execution task.
-        template<
-            typename TDim,
-            typename TIdx,
-            typename TKernelFnObj,
-            typename... TArgs>
-        class TaskKernelGpuCudaRt final :
-            public workdiv::WorkDivMembers<TDim, TIdx>
-        {
-        public:
-// gcc-4.9 libstdc++ does not support std::is_trivially_copyable.
-// MSVC std::is_trivially_copyable seems to be buggy (last tested at 15.7).
-// libc++ in combination with CUDA does not seem to work.
-#if (!BOOST_COMP_MSVC) && !(defined(__GLIBCXX__) && (__GLIBCXX__)) && !(defined(_LIBCPP_VERSION) && BOOST_LANG_CUDA)
-            static_assert(
-                meta::Conjunction<
-                    std::is_trivially_copyable<
-                        TKernelFnObj>,
-                    std::is_trivially_copyable<
-                        TArgs>...
-                    >::value,
-                "The given kernel function object and its arguments have to fulfill is_trivially_copyable!");
-#endif
-
-            //-----------------------------------------------------------------------------
-            template<
-                typename TWorkDiv>
-            ALPAKA_FN_HOST TaskKernelGpuCudaRt(
-                TWorkDiv && workDiv,
-                TKernelFnObj const & kernelFnObj,
-                TArgs && ... args) :
-                    workdiv::WorkDivMembers<TDim, TIdx>(std::forward<TWorkDiv>(workDiv)),
-                    m_kernelFnObj(kernelFnObj),
-                    m_args(std::forward<TArgs>(args)...)
-            {
-                static_assert(
-                    dim::Dim<typename std::decay<TWorkDiv>::type>::value == TDim::value,
-                    "The work division and the execution task have to be of the same dimensionality!");
-            }
-            //-----------------------------------------------------------------------------
-            TaskKernelGpuCudaRt(TaskKernelGpuCudaRt const &) = default;
-            //-----------------------------------------------------------------------------
-            TaskKernelGpuCudaRt(TaskKernelGpuCudaRt &&) = default;
-            //-----------------------------------------------------------------------------
-            auto operator=(TaskKernelGpuCudaRt const &) -> TaskKernelGpuCudaRt & = default;
-            //-----------------------------------------------------------------------------
-            auto operator=(TaskKernelGpuCudaRt &&) -> TaskKernelGpuCudaRt & = default;
-            //-----------------------------------------------------------------------------
-            ~TaskKernelGpuCudaRt() = default;
-
-            TKernelFnObj m_kernelFnObj;
-            std::tuple<typename std::decay<TArgs>::type...> m_args;
-        };
-    }
-
-    namespace acc
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The GPU CUDA execution task accelerator type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx,
-                typename TKernelFnObj,
-                typename... TArgs>
-            struct AccType<
-                kernel::TaskKernelGpuCudaRt<TDim, TIdx, TKernelFnObj, TArgs...>>
-            {
-                using type = acc::AccGpuCudaRt<TDim, TIdx>;
-            };
-        }
-    }
-    namespace dev
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The GPU CUDA execution task device type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx,
-                typename TKernelFnObj,
-                typename... TArgs>
-            struct DevType<
-                kernel::TaskKernelGpuCudaRt<TDim, TIdx, TKernelFnObj, TArgs...>>
-            {
-                using type = dev::DevCudaRt;
-            };
-        }
-    }
-    namespace dim
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The GPU CUDA execution task dimension getter trait specialization.
-            template<
-                typename TDim,
-                typename TIdx,
-                typename TKernelFnObj,
-                typename... TArgs>
-            struct DimType<
-                kernel::TaskKernelGpuCudaRt<TDim, TIdx, TKernelFnObj, TArgs...>>
-            {
-                using type = TDim;
-            };
-        }
-    }
-    namespace pltf
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CPU CUDA execution task platform type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx,
-                typename TKernelFnObj,
-                typename... TArgs>
-            struct PltfType<
-                kernel::TaskKernelGpuCudaRt<TDim, TIdx, TKernelFnObj, TArgs...>>
-            {
-                using type = pltf::PltfCudaRt;
-            };
-        }
-    }
-    namespace idx
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The GPU CUDA execution task idx type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx,
-                typename TKernelFnObj,
-                typename... TArgs>
-            struct IdxType<
-                kernel::TaskKernelGpuCudaRt<TDim, TIdx, TKernelFnObj, TArgs...>>
-            {
-                using type = TIdx;
-            };
-        }
-    }
-    namespace queue
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CUDA non-blocking kernel enqueue trait specialization.
-            template<
-                typename TDim,
-                typename TIdx,
-                typename TKernelFnObj,
-                typename... TArgs>
-            struct Enqueue<
-                queue::QueueCudaRtNonBlocking,
-                kernel::TaskKernelGpuCudaRt<TDim, TIdx, TKernelFnObj, TArgs...>>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto enqueue(
-                    queue::QueueCudaRtNonBlocking & queue,
-                    kernel::TaskKernelGpuCudaRt<TDim, TIdx, TKernelFnObj, TArgs...> const & task)
-                -> void
-                {
-                    ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-                    // TODO: Check that (sizeof(TKernelFnObj) * m_3uiBlockThreadExtent.prod()) < available memory idx
-
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                    //std::size_t printfFifoSize;
-                    //cudaDeviceGetLimit(&printfFifoSize, cudaLimitPrintfFifoSize);
-                    //std::cout << __func__ << "INFO: printfFifoSize: " << printfFifoSize << std::endl;
-                    //cudaDeviceSetLimit(cudaLimitPrintfFifoSize, printfFifoSize*10);
-                    //cudaDeviceGetLimit(&printfFifoSize, cudaLimitPrintfFifoSize);
-                    //std::cout << __func__ << "INFO: printfFifoSize: " <<  printfFifoSize << std::endl;
-#endif
-                    auto const gridBlockExtent(
-                        workdiv::getWorkDiv<Grid, Blocks>(task));
-                    auto const blockThreadExtent(
-                        workdiv::getWorkDiv<Block, Threads>(task));
-                    auto const threadElemExtent(
-                        workdiv::getWorkDiv<Thread, Elems>(task));
-
-                    dim3 const gridDim(kernel::cuda::detail::convertVecToCudaDim(gridBlockExtent));
-                    dim3 const blockDim(kernel::cuda::detail::convertVecToCudaDim(blockThreadExtent));
-                    kernel::cuda::detail::checkVecOnly3Dim(threadElemExtent);
-
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                    std::cout << __func__
-                        << " gridDim: " <<  gridDim.z << " " <<  gridDim.y << " " <<  gridDim.x
-                        << " blockDim: " <<  blockDim.z << " " <<  blockDim.y << " " <<  blockDim.x
-                        << std::endl;
-#endif
-
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
-                    // This checks for a valid work division that is also compliant with the maxima of the accelerator.
-                    if(!workdiv::isValidWorkDiv<acc::AccGpuCudaRt<TDim, TIdx>>(dev::getDev(queue), task))
-                    {
-                        throw std::runtime_error("The given work division is not valid or not supported by the device of type " + acc::getAccName<acc::AccGpuCudaRt<TDim, TIdx>>() + "!");
-                    }
-#endif
-
-                    // Get the size of the block shared dynamic memory.
-                    auto const blockSharedMemDynSizeBytes(
-                        meta::apply(
-                            [&](typename std::decay<TArgs>::type const & ... args)
-                            {
-                                return
-                                    kernel::getBlockSharedMemDynSizeBytes<
-                                        acc::AccGpuCudaRt<TDim, TIdx>>(
-                                            task.m_kernelFnObj,
-                                            blockThreadExtent,
-                                            threadElemExtent,
-                                            args...);
-                            },
-                            task.m_args));
-
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                    // Log the block shared memory idx.
-                    std::cout << __func__
-                        << " BlockSharedMemDynSizeBytes: " << blockSharedMemDynSizeBytes << " B" << std::endl;
-#endif
-
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                    // Log the function attributes.
-                    cudaFuncAttributes funcAttrs;
-                    cudaFuncGetAttributes(&funcAttrs, kernel::cuda::detail::cudaKernel<TDim, TIdx, TKernelFnObj, TArgs...>);
-                    std::cout << __func__
-                        << " binaryVersion: " << funcAttrs.binaryVersion
-                        << " constSizeBytes: " << funcAttrs.constSizeBytes << " B"
-                        << " localSizeBytes: " << funcAttrs.localSizeBytes << " B"
-                        << " maxThreadsPerBlock: " << funcAttrs.maxThreadsPerBlock
-                        << " numRegs: " << funcAttrs.numRegs
-                        << " ptxVersion: " << funcAttrs.ptxVersion
-                        << " sharedSizeBytes: " << funcAttrs.sharedSizeBytes << " B"
-                        << std::endl;
-#endif
-
-                    // Set the current device.
-                    ALPAKA_CUDA_RT_CHECK(
-                        cudaSetDevice(
-                            queue.m_spQueueImpl->m_dev.m_iDevice));
-                    // Enqueue the kernel execution.
-                    // \NOTE: No const reference (const &) is allowed as the parameter type because the kernel launch language extension expects the arguments by value.
-                    // This forces the type of a float argument given with std::forward to this function to be of type float instead of e.g. "float const & __ptr64" (MSVC).
-                    // If not given by value, the kernel launch code does not copy the value but the pointer to the value location.
-                    meta::apply(
-                        [&](typename std::decay<TArgs>::type const & ... args)
-                        {
-                            kernel::cuda::detail::cudaKernel<TDim, TIdx, TKernelFnObj, typename std::decay<TArgs>::type...><<<
-                                gridDim,
-                                blockDim,
-                                static_cast<std::size_t>(blockSharedMemDynSizeBytes),
-                                queue.m_spQueueImpl->m_CudaQueue>>>(
-                                    threadElemExtent,
-                                    task.m_kernelFnObj,
-                                    args...);
-                        },
-                        task.m_args);
-
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
-                    // Wait for the kernel execution to finish but do not check error return of this call.
-                    // Do not use the alpaka::wait method because it checks the error itself but we want to give a custom error message.
-                    cudaStreamSynchronize(
-                        queue.m_spQueueImpl->m_CudaQueue);
-                    std::string const kernelName("'execution of kernel: '" + std::string(typeid(TKernelFnObj).name()) + "' failed with");
-                    ::alpaka::cuda::detail::cudaRtCheckLastError(kernelName.c_str(), __FILE__, __LINE__);
-#endif
-                }
-            };
-            //#############################################################################
-            //! The CUDA synchronous kernel enqueue trait specialization.
-            template<
-                typename TDim,
-                typename TIdx,
-                typename TKernelFnObj,
-                typename... TArgs>
-            struct Enqueue<
-                queue::QueueCudaRtBlocking,
-                kernel::TaskKernelGpuCudaRt<TDim, TIdx, TKernelFnObj, TArgs...>>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto enqueue(
-                    queue::QueueCudaRtBlocking & queue,
-                    kernel::TaskKernelGpuCudaRt<TDim, TIdx, TKernelFnObj, TArgs...> const & task)
-                -> void
-                {
-                    ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-                    // TODO: Check that (sizeof(TKernelFnObj) * m_3uiBlockThreadExtent.prod()) < available memory idx
-
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                    //std::size_t printfFifoSize;
-                    //cudaDeviceGetLimit(&printfFifoSize, cudaLimitPrintfFifoSize);
-                    //std::cout << __func__ << "INFO: printfFifoSize: " << printfFifoSize << std::endl;
-                    //cudaDeviceSetLimit(cudaLimitPrintfFifoSize, printfFifoSize*10);
-                    //cudaDeviceGetLimit(&printfFifoSize, cudaLimitPrintfFifoSize);
-                    //std::cout << __func__ << "INFO: printfFifoSize: " <<  printfFifoSize << std::endl;
-#endif
-                    auto const gridBlockExtent(
-                        workdiv::getWorkDiv<Grid, Blocks>(task));
-                    auto const blockThreadExtent(
-                        workdiv::getWorkDiv<Block, Threads>(task));
-                    auto const threadElemExtent(
-                        workdiv::getWorkDiv<Thread, Elems>(task));
-
-                    dim3 const gridDim(kernel::cuda::detail::convertVecToCudaDim(gridBlockExtent));
-                    dim3 const blockDim(kernel::cuda::detail::convertVecToCudaDim(blockThreadExtent));
-                    kernel::cuda::detail::checkVecOnly3Dim(threadElemExtent);
-
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                    std::cout << __func__ << "gridDim: " <<  gridDim.z << " " <<  gridDim.y << " " <<  gridDim.x << std::endl;
-                    std::cout << __func__ << "blockDim: " <<  blockDim.z << " " <<  blockDim.y << " " <<  blockDim.x << std::endl;
-#endif
-
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
-                    // This checks for a valid work division that is also compliant with the maxima of the accelerator.
-                    if(!workdiv::isValidWorkDiv<acc::AccGpuCudaRt<TDim, TIdx>>(dev::getDev(queue), task))
-                    {
-                        throw std::runtime_error("The given work division is not valid or not supported by the device of type " + acc::getAccName<acc::AccGpuCudaRt<TDim, TIdx>>() + "!");
-                    }
-#endif
-
-                    // Get the size of the block shared dynamic memory.
-                    auto const blockSharedMemDynSizeBytes(
-                        meta::apply(
-                            [&](typename std::decay<TArgs>::type const & ... args)
-                            {
-                                return
-                                    kernel::getBlockSharedMemDynSizeBytes<
-                                        acc::AccGpuCudaRt<TDim, TIdx>>(
-                                            task.m_kernelFnObj,
-                                            blockThreadExtent,
-                                            threadElemExtent,
-                                            args...);
-                            },
-                            task.m_args));
-
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                    // Log the block shared memory idx.
-                    std::cout << __func__
-                        << " BlockSharedMemDynSizeBytes: " << blockSharedMemDynSizeBytes << " B" << std::endl;
-#endif
-
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                    // Log the function attributes.
-                    cudaFuncAttributes funcAttrs;
-                    cudaFuncGetAttributes(&funcAttrs, kernel::cuda::detail::cudaKernel<TDim, TIdx, TKernelFnObj, typename std::decay<TArgs>::type...>);
-                    std::cout << __func__
-                        << " binaryVersion: " << funcAttrs.binaryVersion
-                        << " constSizeBytes: " << funcAttrs.constSizeBytes << " B"
-                        << " localSizeBytes: " << funcAttrs.localSizeBytes << " B"
-                        << " maxThreadsPerBlock: " << funcAttrs.maxThreadsPerBlock
-                        << " numRegs: " << funcAttrs.numRegs
-                        << " ptxVersion: " << funcAttrs.ptxVersion
-                        << " sharedSizeBytes: " << funcAttrs.sharedSizeBytes << " B"
-                        << std::endl;
-#endif
-
-                    // Set the current device.
-                    ALPAKA_CUDA_RT_CHECK(
-                        cudaSetDevice(
-                            queue.m_spQueueImpl->m_dev.m_iDevice));
-                    // Enqueue the kernel execution.
-                    meta::apply(
-                        [&](typename std::decay<TArgs>::type const & ... args)
-                        {
-                            kernel::cuda::detail::cudaKernel<TDim, TIdx, TKernelFnObj, typename std::decay<TArgs>::type...><<<
-                                gridDim,
-                                blockDim,
-                                static_cast<std::size_t>(blockSharedMemDynSizeBytes),
-                                queue.m_spQueueImpl->m_CudaQueue>>>(
-                                    threadElemExtent,
-                                    task.m_kernelFnObj,
-                                    args...);
-                        },
-                        task.m_args);
-
-                    // Wait for the kernel execution to finish but do not check error return of this call.
-                    // Do not use the alpaka::wait method because it checks the error itself but we want to give a custom error message.
-                    cudaStreamSynchronize(
-                        queue.m_spQueueImpl->m_CudaQueue);
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
-                    std::string const kernelName("'execution of kernel: '" + std::string(typeid(TKernelFnObj).name()) + "' failed with");
-                    ::alpaka::cuda::detail::cudaRtCheckLastError(kernelName.c_str(), __FILE__, __LINE__);
-#endif
-                }
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/kernel/TaskKernelGpuHipRt.hpp b/thirdParty/cupla/alpaka/include/alpaka/kernel/TaskKernelGpuHipRt.hpp
deleted file mode 100644
index bf8cb5de77..0000000000
--- a/thirdParty/cupla/alpaka/include/alpaka/kernel/TaskKernelGpuHipRt.hpp
+++ /dev/null
@@ -1,548 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz, Erik Zenker, Matthias Werner, René Widera
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_HIP
-    #error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
-#endif
-
-// Specialized traits.
-#include <alpaka/acc/Traits.hpp>
-#include <alpaka/dev/Traits.hpp>
-#include <alpaka/dim/Traits.hpp>
-#include <alpaka/pltf/Traits.hpp>
-#include <alpaka/idx/Traits.hpp>
-#include <alpaka/queue/Traits.hpp>
-
-// Implementation details.
-#include <alpaka/acc/AccGpuHipRt.hpp>
-#include <alpaka/dev/DevHipRt.hpp>
-#include <alpaka/kernel/Traits.hpp>
-#include <alpaka/queue/QueueHipRtBlocking.hpp>
-#include <alpaka/queue/QueueHipRtNonBlocking.hpp>
-#include <alpaka/workdiv/WorkDivMembers.hpp>
-
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
-    #include <alpaka/acc/Traits.hpp>
-    #include <alpaka/dev/Traits.hpp>
-    #include <alpaka/workdiv/WorkDivHelpers.hpp>
-#endif
-
-#include <alpaka/core/BoostPredef.hpp>
-#include <alpaka/core/Hip.hpp>
-#include <alpaka/core/Utility.hpp>
-#include <alpaka/meta/ApplyTuple.hpp>
-#include <alpaka/meta/Metafunctions.hpp>
-
-#include <stdexcept>
-#include <tuple>
-#include <type_traits>
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
-    #include <iostream>
-#endif
-
-namespace alpaka
-{
-    namespace kernel
-    {
-        namespace hip
-        {
-            namespace detail
-            {
-                //-----------------------------------------------------------------------------
-                //! The GPU HIP kernel entry point.
-                // \NOTE: 'A __global__ function or function template cannot have a trailing return type.'
-                template<
-                    typename TDim,
-                    typename TIdx,
-                    typename TKernelFnObj,
-                    typename... TArgs>
-                __global__ void hipKernel(
-                    hipLaunchParm lp,
-                    vec::Vec<TDim, TIdx> const threadElemExtent,
-                    TKernelFnObj const kernelFnObj,
-                    TArgs ... args)
-                {
-#if BOOST_ARCH_PTX && (BOOST_ARCH_PTX < BOOST_VERSION_NUMBER(2, 0, 0))
-    #error "Cuda device capability >= 2.0 is required!"
-#endif
-
-#pragma clang diagnostic push
-#pragma clang diagnostic ignored "-Wignored-attributes"
-                    static_assert(
-                        std::is_same<
-                            decltype(kernelFnObj(
-                                alpaka::core::declval<acc::AccGpuHipRt<TDim, TIdx> const>(),
-                                args...)),
-                        void>::value,
-                        "The TKernelFnObj is required to return void!");
-#pragma clang diagnostic pop
-
-                    acc::AccGpuHipRt<TDim, TIdx> acc(threadElemExtent);
-
-                    kernelFnObj(
-                        const_cast<acc::AccGpuHipRt<TDim, TIdx> const &>(acc),
-                        args...);
-                }
-
-                //-----------------------------------------------------------------------------
-                template<
-                    typename TDim,
-                    typename TIdx
-                    >
-                ALPAKA_FN_HOST auto checkVecOnly3Dim(
-                    vec::Vec<TDim, TIdx> const & vec)
-                    -> void
-                {
-                    for(auto i(std::min(static_cast<typename TDim::value_type>(3), TDim::value)); i<TDim::value; ++i)
-                    {
-                        if(vec[TDim::value-1u-i] != 1)
-                        {
-                            throw std::runtime_error("The CUDA accelerator supports a maximum of 3 dimensions. All work division extents of the dimensions higher 3 have to be 1!");
-                        }
-                    }
-                }
-
-                //-----------------------------------------------------------------------------
-                template<
-                    typename TDim,
-                    typename TIdx
-                    >
-                ALPAKA_FN_HOST auto convertVecToHipDim(
-                    vec::Vec<TDim, TIdx> const & vec)
-                    -> dim3
-                {
-                    dim3 dim(1, 1, 1);
-                    for(auto i(static_cast<typename TDim::value_type>(0)); i<std::min(static_cast<typename TDim::value_type>(3), TDim::value); ++i)
-                    {
-                        reinterpret_cast<unsigned int *>(&dim)[i] = static_cast<unsigned int>(vec[TDim::value-1u-i]);
-                    }
-                    checkVecOnly3Dim(vec);
-                    return dim;
-                }
-
-            }
-        }
-        //#############################################################################
-        //! The GPU HIP accelerator execution task.
-        template<
-            typename TDim,
-            typename TIdx,
-            typename TKernelFnObj,
-            typename... TArgs>
-        class TaskKernelGpuHipRt final :
-            public workdiv::WorkDivMembers<TDim, TIdx>
-        {
-        public:
-// gcc-4.9 libstdc++ does not support std::is_trivially_copyable.
-// MSVC std::is_trivially_copyable seems to be buggy (last tested at 15.7).
-#if (!__GLIBCXX__) && (!BOOST_COMP_MSVC)
-            static_assert(
-                meta::Conjunction<
-                    std::is_trivially_copyable<
-                        TKernelFnObj>,
-                    std::is_trivially_copyable<
-                        TArgs>...
-                    >::value,
-                "The given kernel function object and its arguments have to fulfill is_trivially_copyable!");
-#endif
-
-            //-----------------------------------------------------------------------------
-            //! Constructor.
-            template<
-                typename TWorkDiv>
-            ALPAKA_FN_HOST TaskKernelGpuHipRt(
-                TWorkDiv && workDiv,
-                TKernelFnObj const & kernelFnObj,
-                TArgs && ... args) :
-                    workdiv::WorkDivMembers<TDim, TIdx>(std::forward<TWorkDiv>(workDiv)),
-                    m_kernelFnObj(kernelFnObj),
-                    m_args(std::forward<TArgs>(args)...)
-            {
-                static_assert(
-                    dim::Dim<typename std::decay<TWorkDiv>::type>::value == TDim::value,
-                    "The work division and the execution task have to be of the same dimensionality!");
-            }
-            //-----------------------------------------------------------------------------
-            //! Copy constructor.
-            TaskKernelGpuHipRt(TaskKernelGpuHipRt const &) = default;
-            //-----------------------------------------------------------------------------
-            //! Move constructor.
-            TaskKernelGpuHipRt(TaskKernelGpuHipRt &&) = default;
-            //-----------------------------------------------------------------------------
-            //! Copy assignment operator.
-            auto operator=(TaskKernelGpuHipRt const &) -> TaskKernelGpuHipRt & = default;
-            //-----------------------------------------------------------------------------
-            //! Move assignment operator.
-            auto operator=(TaskKernelGpuHipRt &&) -> TaskKernelGpuHipRt & = default;
-            //-----------------------------------------------------------------------------
-            //! Destructor.
-            ALPAKA_FN_HOST_ACC ~TaskKernelGpuHipRt() = default;
-
-            TKernelFnObj m_kernelFnObj;
-            std::tuple<typename std::decay<TArgs>::type...> m_args;
-        };
-    }
-
-    namespace acc
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The GPU HIP execution task accelerator type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx,
-                typename TKernelFnObj,
-                typename... TArgs>
-            struct AccType<
-                kernel::TaskKernelGpuHipRt<TDim, TIdx, TKernelFnObj, TArgs...>>
-            {
-                using type = acc::AccGpuHipRt<TDim, TIdx>;
-            };
-        }
-    }
-    namespace dev
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The GPU HIP execution task device type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx,
-                typename TKernelFnObj,
-                typename... TArgs>
-            struct DevType<
-                kernel::TaskKernelGpuHipRt<TDim, TIdx, TKernelFnObj, TArgs...>>
-            {
-                using type = dev::DevHipRt;
-            };
-        }
-    }
-    namespace dim
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The GPU HIP execution task dimension getter trait specialization.
-            template<
-                typename TDim,
-                typename TIdx,
-                typename TKernelFnObj,
-                typename... TArgs>
-            struct DimType<
-                kernel::TaskKernelGpuHipRt<TDim, TIdx, TKernelFnObj, TArgs...>>
-            {
-                using type = TDim;
-            };
-        }
-    }
-    namespace pltf
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CPU HIP execution task platform type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx,
-                typename TKernelFnObj,
-                typename... TArgs>
-            struct PltfType<
-                kernel::TaskKernelGpuHipRt<TDim, TIdx, TKernelFnObj, TArgs...>>
-            {
-                using type = pltf::PltfHipRt;
-            };
-        }
-    }
-    namespace idx
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The GPU HIP execution task idx type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx,
-                typename TKernelFnObj,
-                typename... TArgs>
-            struct IdxType<
-                kernel::TaskKernelGpuHipRt<TDim, TIdx, TKernelFnObj, TArgs...>>
-            {
-                using type = TIdx;
-            };
-        }
-    }
-    namespace queue
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The HIP non-blocking kernel enqueue trait specialization.
-            template<
-                typename TDim,
-                typename TIdx,
-                typename TKernelFnObj,
-                typename... TArgs>
-            struct Enqueue<
-                queue::QueueHipRtNonBlocking,
-                kernel::TaskKernelGpuHipRt<TDim, TIdx, TKernelFnObj, TArgs...>>
-            {
-                //-----------------------------------------------------------------------------
-
-                ALPAKA_FN_HOST static auto enqueue(
-                    queue::QueueHipRtNonBlocking & queue,
-                    kernel::TaskKernelGpuHipRt<TDim, TIdx, TKernelFnObj, TArgs...> const & task)
-                -> void
-                {
-                    ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-                    // TODO: Check that (sizeof(TKernelFnObj) * m_3uiBlockThreadExtent.prod()) < available memory size
-
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                    //std::size_t printfFifoSize;
-                    //hipDeviceGetLimit(&printfFifoSize, hipLimitPrintfFifoSize);
-                    //std::cout << __func__ << "INFO: printfFifoSize: " << printfFifoSize << std::endl;
-                    //hipDeviceSetLimit(hipLimitPrintfFifoSize, printfFifoSize*10);
-                    //hipDeviceGetLimit(&printfFifoSize, hipLimitPrintfFifoSize);
-                    //std::cout << __func__ << "INFO: printfFifoSize: " <<  printfFifoSize << std::endl;
-#endif
-                    auto const gridBlockExtent(
-                        workdiv::getWorkDiv<Grid, Blocks>(task));
-                    auto const blockThreadExtent(
-                        workdiv::getWorkDiv<Block, Threads>(task));
-                    auto const threadElemExtent(
-                        workdiv::getWorkDiv<Thread, Elems>(task));
-
-                    dim3 const gridDim(kernel::hip::detail::convertVecToHipDim(gridBlockExtent));
-                    dim3 const blockDim(kernel::hip::detail::convertVecToHipDim(blockThreadExtent));
-                    kernel::hip::detail::checkVecOnly3Dim(threadElemExtent);
-
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                    std::cout << __func__
-                        << " gridDim: " <<  gridDim.z << " " <<  gridDim.y << " " <<  gridDim.x
-                        << " blockDim: " <<  blockDim.z << " " <<  blockDim.y << " " <<  blockDim.x
-                        << std::endl;
-#endif
-
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
-                    // This checks for a valid work division that is also compliant with the maxima of the accelerator.
-                    if(!workdiv::isValidWorkDiv<acc::AccGpuHipRt<TDim, TIdx>>(dev::getDev(queue), task))
-                    {
-                        throw std::runtime_error("The given work division is not valid or not supported by the device of type " + acc::getAccName<acc::AccGpuHipRt<TDim, TIdx>>() + "!");
-                    }
-#endif
-
-                    // Get the size of the block shared dynamic memory.
-                    auto const blockSharedMemDynSizeBytes(
-                        meta::apply(
-                            // workaround for HIP(HCC) to
-                            // avoid forbidden host-call
-                            // within host-device functions
-                            #if defined(BOOST_COMP_HCC) && BOOST_COMP_HCC
-                            ALPAKA_FN_HOST_ACC
-                            #endif
-                            [&](typename std::decay<TArgs>::type const & ... args)
-                            {
-                                return
-                                    kernel::getBlockSharedMemDynSizeBytes<
-                                        acc::AccGpuHipRt<TDim, TIdx>>(
-                                            task.m_kernelFnObj,
-                                            blockThreadExtent,
-                                            threadElemExtent,
-                                            args...);
-                            },
-                            task.m_args));
-
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                    // Log the block shared memory size.
-                    std::cout << __func__
-                        << " BlockSharedMemDynSizeBytes: " << blockSharedMemDynSizeBytes << " B" << std::endl;
-#endif
-
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                    // Log the function attributes.
-                    /*hipFuncAttributes funcAttrs;
-                    hipFuncGetAttributes(&funcAttrs, kernel::hip::detail::hipKernel<TDim, TIdx, TKernelFnObj, typename std::decay<TArgs>::type...>);
-                    std::cout << __func__
-                        << " binaryVersion: " << funcAttrs.binaryVersion
-                        << " constSizeBytes: " << funcAttrs.constSizeBytes << " B"
-                        << " localSizeBytes: " << funcAttrs.localSizeBytes << " B"
-                        << " maxThreadsPerBlock: " << funcAttrs.maxThreadsPerBlock
-                        << " numRegs: " << funcAttrs.numRegs
-                        << " ptxVersion: " << funcAttrs.ptxVersion
-                        << " sharedSizeBytes: " << funcAttrs.sharedSizeBytes << " B"
-                        << std::endl; */
-#endif
-
-                    // Set the current device.
-                    ALPAKA_HIP_RT_CHECK(
-                        hipSetDevice(
-                            queue.m_spQueueImpl->m_dev.m_iDevice));
-
-                    meta::apply(
-                        [&](typename std::decay<TArgs>::type const & ... args)
-                        {
-                            hipLaunchKernelGGL(
-                                HIP_KERNEL_NAME(kernel::hip::detail::hipKernel< TDim, TIdx, TKernelFnObj, typename std::decay<TArgs>::type... >),
-                                gridDim,
-                                blockDim,
-                                static_cast<std::uint32_t>(blockSharedMemDynSizeBytes),
-                                queue.m_spQueueImpl->m_HipQueue,
-                                hipLaunchParm{},
-                                threadElemExtent,
-                                task.m_kernelFnObj,
-                                args...
-                            );
-
-                        },
-                        task.m_args);
-
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
-                    // Wait for the kernel execution to finish but do not check error return of this call.
-                    // Do not use the alpaka::wait method because it checks the error itself but we want to give a custom error message.
-                    hipStreamSynchronize(
-                        queue.m_spQueueImpl->m_HipQueue);
-                    std::string const kernelName("'execution of kernel: '" + std::string(typeid(TKernelFnObj).name()) + "' failed with");
-                    ::alpaka::hip::detail::hipRtCheckLastError(kernelName.c_str(), __FILE__, __LINE__);
-#endif
-                }
-            };
-            //#############################################################################
-            //! The HIP synchronous kernel enqueue trait specialization.
-            template<
-                typename TDim,
-                typename TIdx,
-                typename TKernelFnObj,
-                typename... TArgs>
-            struct Enqueue<
-                queue::QueueHipRtBlocking,
-                kernel::TaskKernelGpuHipRt<TDim, TIdx, TKernelFnObj, TArgs...>>
-            {
-                //-----------------------------------------------------------------------------
-
-                ALPAKA_FN_HOST static auto enqueue(
-                    queue::QueueHipRtBlocking & queue,
-                    kernel::TaskKernelGpuHipRt<TDim, TIdx, TKernelFnObj, TArgs...> const & task)
-                -> void
-                {
-                    ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-                    // TODO: Check that (sizeof(TKernelFnObj) * m_3uiBlockThreadExtent.prod()) < available memory size
-
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                    //std::size_t printfFifoSize;
-                    //hipDeviceGetLimit(&printfFifoSize, hipLimitPrintfFifoSize);
-                    //std::cout << __func__ << "INFO: printfFifoSize: " << printfFifoSize << std::endl;
-                    //hipDeviceSetLimit(hipLimitPrintfFifoSize, printfFifoSize*10);
-                    //hipDeviceGetLimit(&printfFifoSize, hipLimitPrintfFifoSize);
-                    //std::cout << __func__ << "INFO: printfFifoSize: " <<  printfFifoSize << std::endl;
-#endif
-                    auto const gridBlockExtent(
-                        workdiv::getWorkDiv<Grid, Blocks>(task));
-                    auto const blockThreadExtent(
-                        workdiv::getWorkDiv<Block, Threads>(task));
-                    auto const threadElemExtent(
-                        workdiv::getWorkDiv<Thread, Elems>(task));
-
-                    dim3 gridDim(kernel::hip::detail::convertVecToHipDim(gridBlockExtent));
-                    dim3 blockDim(kernel::hip::detail::convertVecToHipDim(blockThreadExtent));
-                    kernel::hip::detail::checkVecOnly3Dim(threadElemExtent);
-
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                    std::cout << __func__ << "gridDim: " <<  gridDim.z << " " <<  gridDim.y << " " <<  gridDim.x << std::endl;
-                    std::cout << __func__ << "blockDim: " <<  blockDim.z << " " <<  blockDim.y << " " <<  blockDim.x << std::endl;
-#endif
-
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
-                    // This checks for a valid work division that is also compliant with the maxima of the accelerator.
-                    if(!workdiv::isValidWorkDiv<acc::AccGpuHipRt<TDim, TIdx>>(dev::getDev(queue), task))
-                    {
-                        throw std::runtime_error("The given work division is not valid or not supported by the device of type " + acc::getAccName<acc::AccGpuHipRt<TDim, TIdx>>() + "!");
-                    }
-#endif
-
-                    // Get the size of the block shared dynamic memory.
-                    auto const blockSharedMemDynSizeBytes(
-                        meta::apply(
-                            [&](typename std::decay<TArgs>::type const & ... args)
-                            {
-                                return
-                                    kernel::getBlockSharedMemDynSizeBytes<
-                                        acc::AccGpuHipRt<TDim, TIdx>>(
-                                            task.m_kernelFnObj,
-                                            blockThreadExtent,
-                                            threadElemExtent,
-                                            args...);
-                            },
-                            task.m_args));
-
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                    // Log the block shared memory size.
-                    std::cout << __func__
-                        << " BlockSharedMemDynSizeBytes: " << blockSharedMemDynSizeBytes << " B" << std::endl;
-#endif
-
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                    // hipFuncAttributes not ported from HIP to HIP.
-                    // Log the function attributes.
-                    /*hipFuncAttributes funcAttrs;
-                    hipFuncGetAttributes(&funcAttrs, kernel::hip::detail::hipKernel<TDim, TIdx, TKernelFnObj, typename std::decay<TArgs>::type....>);
-                    std::cout << __func__
-                        << " binaryVersion: " << funcAttrs.binaryVersion
-                        << " constSizeBytes: " << funcAttrs.constSizeBytes << " B"
-                        << " localSizeBytes: " << funcAttrs.localSizeBytes << " B"
-                        << " maxThreadsPerBlock: " << funcAttrs.maxThreadsPerBlock
-                        << " numRegs: " << funcAttrs.numRegs
-                        << " ptxVersion: " << funcAttrs.ptxVersion
-                        << " sharedSizeBytes: " << funcAttrs.sharedSizeBytes << " B"
-                        << std::endl;*/
-#endif
-
-                    // Set the current device.
-                    ALPAKA_HIP_RT_CHECK(
-                        hipSetDevice(
-                            queue.m_spQueueImpl->m_dev.m_iDevice));
-
-                    meta::apply(
-                        [&](typename std::decay<TArgs>::type const & ... args)
-                        {
-                            hipLaunchKernelGGL(
-                                HIP_KERNEL_NAME(kernel::hip::detail::hipKernel< TDim, TIdx, TKernelFnObj, typename std::decay<TArgs>::type... >),
-                                gridDim,
-                                blockDim,
-                                static_cast<std::uint32_t>(blockSharedMemDynSizeBytes),
-                                queue.m_spQueueImpl->m_HipQueue,
-                                hipLaunchParm{},
-                                threadElemExtent,
-                                task.m_kernelFnObj,
-                                args...
-                            );
-                        },
-                        task.m_args);
-
-                    // Wait for the kernel execution to finish but do not check error return of this call.
-                    // Do not use the alpaka::wait method because it checks the error itself but we want to give a custom error message.
-                    hipStreamSynchronize(
-                        queue.m_spQueueImpl->m_HipQueue);
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
-                    std::string const kernelName("'execution of kernel: '" + std::string(typeid(TKernelFnObj).name()) + "' failed with");
-                    ::alpaka::hip::detail::hipRtCheckLastError(kernelName.c_str(), __FILE__, __LINE__);
-#endif
-                }
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/kernel/TaskKernelGpuUniformCudaHipRt.hpp b/thirdParty/cupla/alpaka/include/alpaka/kernel/TaskKernelGpuUniformCudaHipRt.hpp
new file mode 100644
index 0000000000..9cbf83e101
--- /dev/null
+++ b/thirdParty/cupla/alpaka/include/alpaka/kernel/TaskKernelGpuUniformCudaHipRt.hpp
@@ -0,0 +1,460 @@
+/* Copyright 2019 Benjamin Worpitz, Erik Zenker, Matthias Werner, René Widera
+ *
+ * This file is part of alpaka.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#pragma once
+
+#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
+
+#    include <alpaka/core/BoostPredef.hpp>
+
+#    if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && !BOOST_LANG_CUDA
+#        error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
+#    endif
+
+#    if defined(ALPAKA_ACC_GPU_HIP_ENABLED) && !BOOST_LANG_HIP
+#        error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
+#    endif
+
+// Specialized traits.
+#    include <alpaka/acc/Traits.hpp>
+#    include <alpaka/dev/Traits.hpp>
+#    include <alpaka/dim/Traits.hpp>
+#    include <alpaka/idx/Traits.hpp>
+#    include <alpaka/pltf/Traits.hpp>
+#    include <alpaka/queue/Traits.hpp>
+
+// Backend specific includes.
+#    if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
+#        include <alpaka/core/Cuda.hpp>
+#    else
+#        include <alpaka/core/Hip.hpp>
+#    endif
+
+// Implementation details.
+#    include <alpaka/acc/AccGpuUniformCudaHipRt.hpp>
+#    include <alpaka/core/Decay.hpp>
+#    include <alpaka/dev/DevUniformCudaHipRt.hpp>
+#    include <alpaka/kernel/Traits.hpp>
+#    include <alpaka/queue/QueueUniformCudaHipRtBlocking.hpp>
+#    include <alpaka/queue/QueueUniformCudaHipRtNonBlocking.hpp>
+#    include <alpaka/workdiv/WorkDivMembers.hpp>
+
+#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
+#        include <alpaka/acc/Traits.hpp>
+#        include <alpaka/dev/Traits.hpp>
+#        include <alpaka/workdiv/WorkDivHelpers.hpp>
+#    endif
+
+#    include <alpaka/core/BoostPredef.hpp>
+#    include <alpaka/meta/ApplyTuple.hpp>
+#    include <alpaka/meta/Metafunctions.hpp>
+
+#    include <stdexcept>
+#    include <tuple>
+#    include <type_traits>
+#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
+#        include <iostream>
+#    endif
+
+namespace alpaka
+{
+    namespace uniform_cuda_hip
+    {
+        namespace detail
+        {
+            //-----------------------------------------------------------------------------
+            //! The GPU CUDA/HIP kernel entry point: creates the accelerator and calls the kernel function object.
+            // \NOTE: 'A __global__ function or function template cannot have a trailing return type.'
+            template<typename TAcc, typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
+            __global__ void uniformCudaHipKernel(
+                Vec<TDim, TIdx> const threadElemExtent, //!< Passed to the TAcc constructor.
+                TKernelFnObj const kernelFnObj, //!< Called with the accelerator followed by args.
+                TArgs... args)
+            {
+#    if BOOST_ARCH_PTX && (BOOST_ARCH_PTX < BOOST_VERSION_NUMBER(2, 0, 0))
+#        error "Device capability >= 2.0 is required!"
+#    endif
+
+                const TAcc acc(threadElemExtent);
+
+// Skipped for clang CUDA, which cannot query std::result_of for a pure device lambda created on the host side.
+#    if !(BOOST_COMP_CLANG_CUDA && BOOST_COMP_CLANG)
+                static_assert(
+                    std::is_same<decltype(kernelFnObj(const_cast<TAcc const&>(acc), args...)), void>::value,
+                    "The TKernelFnObj is required to return void!");
+#    endif
+                kernelFnObj(const_cast<TAcc const&>(acc), args...);
+            }
+
+            //! Throws std::runtime_error unless every component of vec other than the last three equals 1.
+            template<typename TDim, typename TIdx>
+            ALPAKA_FN_HOST auto checkVecOnly3Dim(Vec<TDim, TIdx> const& vec) -> void
+            {
+                for(auto i(std::min(static_cast<typename TDim::value_type>(3), TDim::value)); i < TDim::value; ++i)
+                { // Only entered for i >= 3, i.e. for the components in front of the last three.
+                    if(vec[TDim::value - 1u - i] != 1)
+                    {
+                        throw std::runtime_error("The CUDA/HIP accelerator supports a maximum of 3 dimensions. All "
+                                                 "work division extents of the dimensions higher 3 have to be 1!");
+                    }
+                }
+            }
+
+            //! Builds a dim3 from vec: x, y and z take the last three components in reverse order (default 1).
+            template<typename TDim, typename TIdx>
+            ALPAKA_FN_HOST auto convertVecToUniformCudaHipDim(Vec<TDim, TIdx> const& vec) -> dim3
+            {
+                dim3 dim(1, 1, 1);
+                for(auto i(static_cast<typename TDim::value_type>(0));
+                    i < std::min(static_cast<typename TDim::value_type>(3), TDim::value);
+                    ++i)
+                { // Writes dim by index; this assumes dim3 stores x, y, z as consecutive unsigned ints.
+                    reinterpret_cast<unsigned int*>(&dim)[i] = static_cast<unsigned int>(vec[TDim::value - 1u - i]);
+                }
+                checkVecOnly3Dim(vec); // Throws if a component that was not copied into dim is not 1.
+                return dim;
+            }
+        } // namespace detail
+    } // namespace uniform_cuda_hip
+
+    //#############################################################################
+    //! The GPU CUDA/HIP accelerator execution task: a work division plus a kernel function object and arguments.
+    template<typename TAcc, typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
+    class TaskKernelGpuUniformCudaHipRt final : public WorkDivMembers<TDim, TIdx>
+    {
+    public:
+        //! Forwards workDiv to the base class, copies kernelFnObj and stores the arguments as decayed values.
+        template<typename TWorkDiv>
+        ALPAKA_FN_HOST TaskKernelGpuUniformCudaHipRt(
+            TWorkDiv&& workDiv,
+            TKernelFnObj const& kernelFnObj,
+            TArgs&&... args)
+            : WorkDivMembers<TDim, TIdx>(std::forward<TWorkDiv>(workDiv))
+            , m_kernelFnObj(kernelFnObj)
+            , m_args(std::forward<TArgs>(args)...)
+        {
+            static_assert(
+                Dim<std::decay_t<TWorkDiv>>::value == TDim::value,
+                "The work division and the execution task have to be of the same dimensionality!");
+        }
+        //-----------------------------------------------------------------------------
+        TaskKernelGpuUniformCudaHipRt(TaskKernelGpuUniformCudaHipRt const&) = default;
+        //-----------------------------------------------------------------------------
+        TaskKernelGpuUniformCudaHipRt(TaskKernelGpuUniformCudaHipRt&&) = default;
+        //-----------------------------------------------------------------------------
+        auto operator=(TaskKernelGpuUniformCudaHipRt const&) -> TaskKernelGpuUniformCudaHipRt& = default;
+        //-----------------------------------------------------------------------------
+        auto operator=(TaskKernelGpuUniformCudaHipRt&&) -> TaskKernelGpuUniformCudaHipRt& = default;
+        //-----------------------------------------------------------------------------
+        ~TaskKernelGpuUniformCudaHipRt() = default;
+
+        TKernelFnObj m_kernelFnObj; //!< Copy of the kernel function object.
+        std::tuple<std::decay_t<TArgs>...> m_args; //!< The kernel arguments, stored by value.
+    };
+
+    namespace traits
+    {
+        //#############################################################################
+        //! The GPU CUDA/HIP execution task accelerator type trait specialization.
+        template<typename TAcc, typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
+        struct AccType<TaskKernelGpuUniformCudaHipRt<TAcc, TDim, TIdx, TKernelFnObj, TArgs...>>
+        {
+            using type = AccGpuUniformCudaHipRt<TDim, TIdx>;
+        };
+
+        //#############################################################################
+        //! The GPU CUDA/HIP execution task device type trait specialization.
+        template<typename TAcc, typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
+        struct DevType<TaskKernelGpuUniformCudaHipRt<TAcc, TDim, TIdx, TKernelFnObj, TArgs...>>
+        {
+            using type = DevUniformCudaHipRt;
+        };
+
+        //#############################################################################
+        //! The GPU CUDA/HIP execution task dimension getter trait specialization.
+        template<typename TAcc, typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
+        struct DimType<TaskKernelGpuUniformCudaHipRt<TAcc, TDim, TIdx, TKernelFnObj, TArgs...>>
+        {
+            using type = TDim;
+        };
+
+        //#############################################################################
+        //! The GPU CUDA/HIP execution task platform type trait specialization.
+        template<typename TAcc, typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
+        struct PltfType<TaskKernelGpuUniformCudaHipRt<TAcc, TDim, TIdx, TKernelFnObj, TArgs...>>
+        {
+            using type = PltfUniformCudaHipRt;
+        };
+
+        //#############################################################################
+        //! The GPU CUDA/HIP execution task index type trait specialization.
+        template<typename TAcc, typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
+        struct IdxType<TaskKernelGpuUniformCudaHipRt<TAcc, TDim, TIdx, TKernelFnObj, TArgs...>>
+        {
+            using type = TIdx;
+        };
+
+        //#############################################################################
+        //! The CUDA/HIP non-blocking kernel enqueue trait specialization.
+        template<typename TAcc, typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
+        struct Enqueue<
+            QueueUniformCudaHipRtNonBlocking,
+            TaskKernelGpuUniformCudaHipRt<TAcc, TDim, TIdx, TKernelFnObj, TArgs...>>
+        {
+            //! Launches the task's kernel on the queue; only debug builds wait for it and check for errors.
+            ALPAKA_FN_HOST static auto enqueue(
+                QueueUniformCudaHipRtNonBlocking& queue,
+                TaskKernelGpuUniformCudaHipRt<TAcc, TDim, TIdx, TKernelFnObj, TArgs...> const& task) -> void
+            {
+                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+                // TODO: Check that (sizeof(TKernelFnObj) * m_3uiBlockThreadExtent.prod()) < available memory idx
+
+#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
+                // std::size_t printfFifoSize;
+                // cudaDeviceGetLimit(&printfFifoSize, cudaLimitPrintfFifoSize);
+                // std::cout << __func__ << "INFO: printfFifoSize: " << printfFifoSize << std::endl;
+                // cudaDeviceSetLimit(cudaLimitPrintfFifoSize, printfFifoSize*10);
+                // cudaDeviceGetLimit(&printfFifoSize, cudaLimitPrintfFifoSize);
+                // std::cout << __func__ << "INFO: printfFifoSize: " <<  printfFifoSize << std::endl;
+#    endif
+                auto const gridBlockExtent(getWorkDiv<Grid, Blocks>(task));
+                auto const blockThreadExtent(getWorkDiv<Block, Threads>(task));
+                auto const threadElemExtent(getWorkDiv<Thread, Elems>(task));
+
+                dim3 const gridDim(uniform_cuda_hip::detail::convertVecToUniformCudaHipDim(gridBlockExtent));
+                dim3 const blockDim(uniform_cuda_hip::detail::convertVecToUniformCudaHipDim(blockThreadExtent));
+                uniform_cuda_hip::detail::checkVecOnly3Dim(threadElemExtent);
+
+#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
+                std::cout << __func__ << " gridDim: " << gridDim.z << " " << gridDim.y << " " << gridDim.x
+                          << " blockDim: " << blockDim.z << " " << blockDim.y << " " << blockDim.x << std::endl;
+#    endif
+
+#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
+                // This checks for a valid work division that is also compliant with the maxima of the accelerator.
+                if(!isValidWorkDiv<TAcc>(getDev(queue), task))
+                {
+                    throw std::runtime_error(
+                        "The given work division is not valid or not supported by the device of type "
+                        + getAccName<AccGpuUniformCudaHipRt<TDim, TIdx>>() + "!");
+                }
+#    endif
+
+                // Get the size of the block shared dynamic memory.
+                auto const blockSharedMemDynSizeBytes(meta::apply(
+                    [&](ALPAKA_DECAY_T(TArgs) const&... args) {
+                        return getBlockSharedMemDynSizeBytes<TAcc>(
+                            task.m_kernelFnObj,
+                            blockThreadExtent,
+                            threadElemExtent,
+                            args...);
+                    },
+                    task.m_args));
+
+#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
+                // Log the size of the block shared dynamic memory.
+                std::cout << __func__ << " BlockSharedMemDynSizeBytes: " << blockSharedMemDynSizeBytes << " B"
+                          << std::endl;
+#    endif
+                auto kernelName = uniform_cuda_hip::detail::
+                    uniformCudaHipKernel<TAcc, TDim, TIdx, TKernelFnObj, std::decay_t<TArgs>...>;
+
+#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
+#        if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
+
+                // Log the function attributes.
+                cudaFuncAttributes funcAttrs;
+                cudaFuncGetAttributes(&funcAttrs, kernelName);
+                std::cout << __func__ << " binaryVersion: " << funcAttrs.binaryVersion
+                          << " constSizeBytes: " << funcAttrs.constSizeBytes << " B"
+                          << " localSizeBytes: " << funcAttrs.localSizeBytes << " B"
+                          << " maxThreadsPerBlock: " << funcAttrs.maxThreadsPerBlock
+                          << " numRegs: " << funcAttrs.numRegs << " ptxVersion: " << funcAttrs.ptxVersion
+                          << " sharedSizeBytes: " << funcAttrs.sharedSizeBytes << " B" << std::endl;
+#        endif
+#    endif
+
+                // Set the current device.
+                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(ALPAKA_API_PREFIX(SetDevice)(queue.m_spQueueImpl->m_dev.m_iDevice));
+                // Enqueue the kernel execution.
+                // \NOTE: No const reference (const &) is allowed as the parameter type because the kernel launch
+                // language extension expects the arguments by value. This forces the type of a float argument given
+                // with std::forward to this function to be of type float instead of e.g. "float const & __ptr64"
+                // (MSVC). If not given by value, the kernel launch code does not copy the value but the pointer to the
+                // value location.
+                meta::apply(
+                    [&](ALPAKA_DECAY_T(TArgs) const&... args) {
+#    if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
+                        kernelName<<<
+                            gridDim,
+                            blockDim,
+                            static_cast<std::size_t>(blockSharedMemDynSizeBytes),
+                            queue.m_spQueueImpl->m_UniformCudaHipQueue>>>(
+                            threadElemExtent,
+                            task.m_kernelFnObj,
+                            args...);
+#    else
+                        hipLaunchKernelGGL(
+                            HIP_KERNEL_NAME(kernelName),
+                            gridDim,
+                            blockDim,
+                            static_cast<std::uint32_t>(blockSharedMemDynSizeBytes),
+                            queue.m_spQueueImpl->m_UniformCudaHipQueue,
+                            threadElemExtent,
+                            task.m_kernelFnObj,
+                            args...);
+#    endif
+                    },
+                    task.m_args);
+
+#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
+                // Wait for the kernel execution to finish but do not check error return of this call.
+                // Do not use the alpaka::wait method because it checks the error itself but we want to give a custom
+                // error message.
+                ALPAKA_API_PREFIX(StreamSynchronize)(queue.m_spQueueImpl->m_UniformCudaHipQueue);
+                std::string const msg(
+                    "'execution of kernel: '" + std::string(typeid(TKernelFnObj).name()) + "' failed with");
+                ::alpaka::uniform_cuda_hip::detail::rtCheckLastError(msg.c_str(), __FILE__, __LINE__);
+#    endif
+            }
+        };
+        //#############################################################################
+        //! The CUDA/HIP synchronous kernel enqueue trait specialization.
+        template<typename TAcc, typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
+        struct Enqueue<
+            QueueUniformCudaHipRtBlocking,
+            TaskKernelGpuUniformCudaHipRt<TAcc, TDim, TIdx, TKernelFnObj, TArgs...>>
+        {
+            //! Launches the task's kernel on the queue and waits for it to finish before returning.
+            ALPAKA_FN_HOST static auto enqueue(
+                QueueUniformCudaHipRtBlocking& queue,
+                TaskKernelGpuUniformCudaHipRt<TAcc, TDim, TIdx, TKernelFnObj, TArgs...> const& task) -> void
+            {
+                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+                // TODO: Check that (sizeof(TKernelFnObj) * m_3uiBlockThreadExtent.prod()) < available memory idx
+
+#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
+                // std::size_t printfFifoSize;
+                // cudaDeviceGetLimit(&printfFifoSize, cudaLimitPrintfFifoSize);
+                // std::cout << __func__ << "INFO: printfFifoSize: " << printfFifoSize << std::endl;
+                // cudaDeviceSetLimit(cudaLimitPrintfFifoSize, printfFifoSize*10);
+                // cudaDeviceGetLimit(&printfFifoSize, cudaLimitPrintfFifoSize);
+                // std::cout << __func__ << "INFO: printfFifoSize: " <<  printfFifoSize << std::endl;
+#    endif
+                auto const gridBlockExtent(getWorkDiv<Grid, Blocks>(task));
+                auto const blockThreadExtent(getWorkDiv<Block, Threads>(task));
+                auto const threadElemExtent(getWorkDiv<Thread, Elems>(task));
+
+#    if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
+                dim3 const gridDim(uniform_cuda_hip::detail::convertVecToUniformCudaHipDim(gridBlockExtent));
+                dim3 const blockDim(uniform_cuda_hip::detail::convertVecToUniformCudaHipDim(blockThreadExtent));
+                uniform_cuda_hip::detail::checkVecOnly3Dim(threadElemExtent);
+#    else
+                dim3 gridDim(uniform_cuda_hip::detail::convertVecToUniformCudaHipDim(gridBlockExtent));
+                dim3 blockDim(uniform_cuda_hip::detail::convertVecToUniformCudaHipDim(blockThreadExtent));
+                uniform_cuda_hip::detail::checkVecOnly3Dim(threadElemExtent);
+#    endif
+                // NOTE: the two branches above differ only in the constness of gridDim and blockDim.
+
+#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
+                std::cout << __func__ << "gridDim: " << gridDim.z << " " << gridDim.y << " " << gridDim.x << std::endl;
+                std::cout << __func__ << "blockDim: " << blockDim.z << " " << blockDim.y << " " << blockDim.x
+                          << std::endl;
+#    endif
+
+#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
+                // This checks for a valid work division that is also compliant with the maxima of the accelerator.
+                if(!isValidWorkDiv<TAcc>(getDev(queue), task))
+                {
+                    throw std::runtime_error(
+                        "The given work division is not valid or not supported by the device of type "
+                        + getAccName<AccGpuUniformCudaHipRt<TDim, TIdx>>() + "!");
+                }
+#    endif
+
+                // Get the size of the block shared dynamic memory.
+                auto const blockSharedMemDynSizeBytes(meta::apply(
+                    [&](ALPAKA_DECAY_T(TArgs) const&... args) {
+                        return getBlockSharedMemDynSizeBytes<TAcc>(
+                            task.m_kernelFnObj,
+                            blockThreadExtent,
+                            threadElemExtent,
+                            args...);
+                    },
+                    task.m_args));
+
+#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
+                // Log the size of the block shared dynamic memory.
+                std::cout << __func__ << " BlockSharedMemDynSizeBytes: " << blockSharedMemDynSizeBytes << " B"
+                          << std::endl;
+#    endif
+
+                auto kernelName = uniform_cuda_hip::detail::
+                    uniformCudaHipKernel<TAcc, TDim, TIdx, TKernelFnObj, std::decay_t<TArgs>...>;
+#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
+                // The function attributes query below is only compiled for CUDA, not for HIP.
+                // TODO: document why this is currently not possible with HIP.
+                //
+                // Log the function attributes.
+#        if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
+                ALPAKA_API_PREFIX(FuncAttributes) funcAttrs;
+                ALPAKA_API_PREFIX(FuncGetAttributes)(&funcAttrs, kernelName);
+                std::cout << __func__ << " binaryVersion: " << funcAttrs.binaryVersion
+                          << " constSizeBytes: " << funcAttrs.constSizeBytes << " B"
+                          << " localSizeBytes: " << funcAttrs.localSizeBytes << " B"
+                          << " maxThreadsPerBlock: " << funcAttrs.maxThreadsPerBlock
+                          << " numRegs: " << funcAttrs.numRegs << " ptxVersion: " << funcAttrs.ptxVersion
+                          << " sharedSizeBytes: " << funcAttrs.sharedSizeBytes << " B" << std::endl;
+#        endif
+#    endif
+
+                // Set the current device.
+                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(ALPAKA_API_PREFIX(SetDevice)(queue.m_spQueueImpl->m_dev.m_iDevice));
+
+                // Enqueue the kernel execution.
+                meta::apply(
+                    [&](ALPAKA_DECAY_T(TArgs) const&... args) {
+#    if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
+                        kernelName<<<
+                            gridDim,
+                            blockDim,
+                            static_cast<std::size_t>(blockSharedMemDynSizeBytes),
+                            queue.m_spQueueImpl->m_UniformCudaHipQueue>>>(
+                            threadElemExtent,
+                            task.m_kernelFnObj,
+                            args...);
+#    else
+                        hipLaunchKernelGGL(
+                            HIP_KERNEL_NAME(kernelName),
+                            gridDim,
+                            blockDim,
+                            static_cast<std::uint32_t>(blockSharedMemDynSizeBytes),
+                            queue.m_spQueueImpl->m_UniformCudaHipQueue,
+                            threadElemExtent,
+                            task.m_kernelFnObj,
+                            args...);
+#    endif
+                    },
+                    task.m_args);
+
+                // Wait for the kernel execution to finish but do not check error return of this call.
+                // Do not use the alpaka::wait method because it checks the error itself but we want to give a custom
+                // error message.
+                ALPAKA_API_PREFIX(StreamSynchronize)(queue.m_spQueueImpl->m_UniformCudaHipQueue);
+#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
+                std::string const msg(
+                    "'execution of kernel: '" + std::string(typeid(TKernelFnObj).name()) + "' failed with");
+                ::alpaka::uniform_cuda_hip::detail::rtCheckLastError(msg.c_str(), __FILE__, __LINE__);
+#    endif
+            }
+        };
+    } // namespace traits
+} // namespace alpaka
+
+#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/kernel/TaskKernelOacc.hpp b/thirdParty/cupla/alpaka/include/alpaka/kernel/TaskKernelOacc.hpp
new file mode 100644
index 0000000000..e706831f04
--- /dev/null
+++ b/thirdParty/cupla/alpaka/include/alpaka/kernel/TaskKernelOacc.hpp
@@ -0,0 +1,244 @@
+/* Copyright 2019 Benjamin Worpitz, René Widera
+ *
+ * This file is part of Alpaka.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#pragma once
+
+#ifdef ALPAKA_ACC_ANY_BT_OACC_ENABLED
+
+#    if _OPENACC < 201306
+#        error If ALPAKA_ACC_ANY_BT_OACC_ENABLED is set, the compiler has to support OpenACC 2.0 or higher!
+#    endif
+
+// Specialized traits.
+#    include <alpaka/acc/Traits.hpp>
+#    include <alpaka/dev/Traits.hpp>
+#    include <alpaka/dim/Traits.hpp>
+#    include <alpaka/idx/Traits.hpp>
+#    include <alpaka/pltf/Traits.hpp>
+
+// Implementation details.
+#    include <alpaka/acc/AccOacc.hpp>
+#    include <alpaka/core/Decay.hpp>
+#    include <alpaka/ctx/block/CtxBlockOacc.hpp>
+#    include <alpaka/dev/DevOacc.hpp>
+#    include <alpaka/idx/MapIdx.hpp>
+#    include <alpaka/kernel/Traits.hpp>
+#    include <alpaka/meta/ApplyTuple.hpp>
+#    include <alpaka/workdiv/WorkDivMembers.hpp>
+
+#    include <algorithm>
+#    include <functional>
+#    include <stdexcept>
+#    include <tuple>
+#    include <type_traits>
+#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
+#        include <iostream>
+#    endif
+
+namespace alpaka
+{
+    //#############################################################################
+    //! The OpenACC accelerator execution task: a work division plus a kernel function object and arguments.
+    template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
+    class TaskKernelOacc final : public WorkDivMembers<TDim, TIdx>
+    {
+    public:
+        //! Forwards workDiv to the base class, copies kernelFnObj and stores the arguments as decayed values.
+        template<typename TWorkDiv>
+        ALPAKA_FN_HOST TaskKernelOacc(TWorkDiv&& workDiv, TKernelFnObj const& kernelFnObj, TArgs&&... args)
+            : WorkDivMembers<TDim, TIdx>(std::forward<TWorkDiv>(workDiv))
+            , m_kernelFnObj(kernelFnObj)
+            , m_args(std::forward<TArgs>(args)...)
+        {
+            static_assert(
+                Dim<std::decay_t<TWorkDiv>>::value == TDim::value,
+                "The work division and the execution task have to be of the same dimensionality!");
+        }
+        //-----------------------------------------------------------------------------
+        TaskKernelOacc(TaskKernelOacc const& other) = default;
+        //-----------------------------------------------------------------------------
+        TaskKernelOacc(TaskKernelOacc&& other) = default;
+        //-----------------------------------------------------------------------------
+        auto operator=(TaskKernelOacc const&) -> TaskKernelOacc& = default;
+        //-----------------------------------------------------------------------------
+        auto operator=(TaskKernelOacc&&) -> TaskKernelOacc& = default;
+        //-----------------------------------------------------------------------------
+        ~TaskKernelOacc() = default;
+
+        //-----------------------------------------------------------------------------
+        //! Executes the kernel function object once per block thread of every grid block after making dev current.
+        ALPAKA_FN_HOST auto operator()(const DevOacc& dev) const -> void
+        {
+            ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+            auto const gridBlockExtent(getWorkDiv<Grid, Blocks>(*this));
+            auto const blockThreadExtent(getWorkDiv<Block, Threads>(*this));
+            auto const threadElemExtent(getWorkDiv<Thread, Elems>(*this));
+
+#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
+            std::cout << "m_gridBlockExtent=" << this->m_gridBlockExtent << "\tgridBlockExtent=" << gridBlockExtent
+                      << std::endl;
+            std::cout << "m_blockThreadExtent=" << this->m_blockThreadExtent
+                      << "\tblockThreadExtent=" << blockThreadExtent << std::endl;
+            std::cout << "m_threadElemExtent=" << this->m_threadElemExtent << "\tthreadElemExtent=" << threadElemExtent
+                      << std::endl;
+#    endif
+
+            // Get the size of the block shared dynamic memory.
+            auto const blockSharedMemDynSizeBytes(meta::apply(
+                [&](ALPAKA_DECAY_T(TArgs) const&... args) {
+                    return getBlockSharedMemDynSizeBytes<AccOacc<TDim, TIdx>>(
+                        m_kernelFnObj,
+                        blockThreadExtent,
+                        threadElemExtent,
+                        args...);
+                },
+                m_args));
+
+#    if ALPAKA_DEBUG > ALPAKA_DEBUG_MINIMAL
+            std::cout << __func__ << " blockSharedMemDynSizeBytes: " << blockSharedMemDynSizeBytes << " B"
+                      << std::endl;
+#    endif
+            // The number of blocks in the grid.
+            TIdx const gridBlockCount(gridBlockExtent.prod());
+            // The number of threads in a block.
+            TIdx const blockThreadCount(blockThreadExtent.prod());
+
+            if(gridBlockCount == 0 || blockThreadCount == 0)
+            { // An empty grid or an empty block is a no-op.
+                return;
+            }
+
+#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
+            std::cout << "threadElemCount=" << threadElemExtent[0u] << "\tgridBlockCount=" << gridBlockCount
+                      << std::endl;
+#    endif
+            auto argsD = m_args; // Local copy named in the copyin clause below.
+            auto kernelFnObj = m_kernelFnObj; // Local copy captured by value in the worker loop's lambda.
+            dev.makeCurrent();
+#    pragma acc parallel num_workers(blockThreadCount)                                                                \
+        copyin(threadElemExtent, blockThreadExtent, argsD, gridBlockExtent) default(present)
+            {
+                {
+#    pragma acc loop gang
+                    for(TIdx b = 0u; b < gridBlockCount; ++b)
+                    {
+                        CtxBlockOacc<TDim, TIdx> blockShared(
+                            gridBlockExtent,
+                            blockThreadExtent,
+                            threadElemExtent,
+                            b,
+                            blockSharedMemDynSizeBytes);
+
+// Execute the threads of the block in parallel.
+
+// Parallel execution of the threads in a block is required because when
+// syncBlockThreads is called all of them have to be done with their work up
+// to this line. Each block thread is one iteration of the worker loop below.
+//! \warning The OpenACC runtime is technically allowed to ignore the value in the num_workers clause
+//! and could run fewer threads. The standard provides no way to check how many worker threads are running.
+//! If fewer threads are run, syncBlockThreads will dead-lock. It is up to the developer/user
+//! to choose a blockThreadCount which the runtime will respect.
+#    pragma acc loop worker
+                        for(TIdx w = 0; w < blockThreadCount; ++w)
+                        {
+                            AccOacc<TDim, TIdx> acc(w, blockShared);
+
+                            meta::apply(
+                                [kernelFnObj, &acc](typename std::decay<TArgs>::type const&... args) {
+                                    kernelFnObj(acc, args...);
+                                },
+                                argsD);
+                        }
+                        freeSharedVars(blockShared);
+                    }
+                }
+            }
+        }
+
+    private:
+        TKernelFnObj m_kernelFnObj; //!< Copy of the kernel function object.
+        std::tuple<std::decay_t<TArgs>...> m_args; //!< The kernel arguments, stored by value.
+    };
+
+    namespace traits
+    {
+        //#############################################################################
+        //! The OpenACC execution task accelerator type trait specialization.
+        template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
+        struct AccType<TaskKernelOacc<TDim, TIdx, TKernelFnObj, TArgs...>>
+        {
+            using type = AccOacc<TDim, TIdx>;
+        };
+
+        //#############################################################################
+        //! The OpenACC execution task device type trait specialization.
+        template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
+        struct DevType<TaskKernelOacc<TDim, TIdx, TKernelFnObj, TArgs...>>
+        {
+            using type = DevOacc;
+        };
+
+        //#############################################################################
+        //! The OpenACC execution task dimension getter trait specialization.
+        template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
+        struct DimType<TaskKernelOacc<TDim, TIdx, TKernelFnObj, TArgs...>>
+        {
+            using type = TDim;
+        };
+
+        //#############################################################################
+        //! The OpenACC execution task platform type trait specialization.
+        template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
+        struct PltfType<TaskKernelOacc<TDim, TIdx, TKernelFnObj, TArgs...>>
+        {
+            using type = PltfOacc;
+        };
+
+        //#############################################################################
+        //! The OpenACC execution task index type trait specialization.
+        template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
+        struct IdxType<TaskKernelOacc<TDim, TIdx, TKernelFnObj, TArgs...>>
+        {
+            using type = TIdx;
+        };
+
+        template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
+        struct Enqueue<QueueOaccBlocking, TaskKernelOacc<TDim, TIdx, TKernelFnObj, TArgs...>>
+        {
+            ALPAKA_FN_HOST static auto enqueue(
+                QueueOaccBlocking& queue,
+                TaskKernelOacc<TDim, TIdx, TKernelFnObj, TArgs...> const& task) -> void
+            {
+                // Blocking enqueue: runs the task on the calling thread while holding the queue mutex.
+                std::lock_guard<std::mutex> lk(queue.m_spQueueImpl->m_mutex);
+                queue.m_spQueueImpl->m_bCurrentlyExecutingTask = true;
+
+                task(queue.m_spQueueImpl->m_dev);
+                // NOTE(review): skipped if the task throws, leaving the flag set - confirm this is acceptable.
+                queue.m_spQueueImpl->m_bCurrentlyExecutingTask = false;
+            }
+        };
+
+        template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
+        struct Enqueue<QueueOaccNonBlocking, TaskKernelOacc<TDim, TIdx, TKernelFnObj, TArgs...>>
+        {
+            //! Hands a copy of the task and of the queue's device to the worker thread for deferred execution.
+            ALPAKA_FN_HOST static auto enqueue(
+                QueueOaccNonBlocking& queue,
+                TaskKernelOacc<TDim, TIdx, TKernelFnObj, TArgs...> const& task) -> void
+            { // The lambda may run after the caller's queue handle is gone, so it must not keep a reference to it.
+                queue.m_spQueueImpl->m_workerThread.enqueueTask(
+                    [dev = queue.m_spQueueImpl->m_dev, task]() { task(dev); });
+            }
+        };
+    } // namespace traits
+} // namespace alpaka
+
+#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/kernel/TaskKernelOmp5.hpp b/thirdParty/cupla/alpaka/include/alpaka/kernel/TaskKernelOmp5.hpp
new file mode 100644
index 0000000000..7e7ce1c505
--- /dev/null
+++ b/thirdParty/cupla/alpaka/include/alpaka/kernel/TaskKernelOmp5.hpp
@@ -0,0 +1,290 @@
+/* Copyright 2019 Benjamin Worpitz, René Widera
+ *
+ * This file is part of alpaka.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#pragma once
+
+#ifdef ALPAKA_ACC_ANY_BT_OMP5_ENABLED
+
+#    if _OPENMP < 201307
+#        error If ALPAKA_ACC_ANY_BT_OMP5_ENABLED is set, the compiler has to support OpenMP 4.0 or higher!
+#    endif
+
+// Specialized traits.
+#    include <alpaka/acc/Traits.hpp>
+#    include <alpaka/dev/Traits.hpp>
+#    include <alpaka/dim/Traits.hpp>
+#    include <alpaka/idx/Traits.hpp>
+#    include <alpaka/pltf/Traits.hpp>
+
+// Implementation details.
+#    include <alpaka/acc/AccOmp5.hpp>
+#    include <alpaka/core/Decay.hpp>
+#    include <alpaka/dev/DevOmp5.hpp>
+#    include <alpaka/idx/MapIdx.hpp>
+#    include <alpaka/kernel/Traits.hpp>
+#    include <alpaka/meta/ApplyTuple.hpp>
+#    include <alpaka/workdiv/WorkDivMembers.hpp>
+
+#    include <omp.h>
+
+#    include <algorithm>
+#    include <functional>
+#    include <stdexcept>
+#    include <tuple>
+#    include <type_traits>
+#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
+#        include <iostream>
+#    endif
+
+namespace alpaka
+{
+    //#############################################################################
+    //! The OpenMP 5.0 accelerator execution task.
+    template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
+    class TaskKernelOmp5 final : public WorkDivMembers<TDim, TIdx>
+    {
+    public:
+        //-----------------------------------------------------------------------------
+        template<typename TWorkDiv>
+        ALPAKA_FN_HOST TaskKernelOmp5(TWorkDiv&& workDiv, TKernelFnObj const& kernelFnObj, TArgs&&... args)
+            : WorkDivMembers<TDim, TIdx>(std::forward<TWorkDiv>(workDiv))
+            , m_kernelFnObj(kernelFnObj)
+            , m_args(std::forward<TArgs>(args)...)
+        {
+            static_assert(
+                Dim<std::decay_t<TWorkDiv>>::value == TDim::value,
+                "The work division and the execution task have to be of the same dimensionality!");
+        }
+        //-----------------------------------------------------------------------------
+        TaskKernelOmp5(TaskKernelOmp5 const& other) = default;
+        //-----------------------------------------------------------------------------
+        TaskKernelOmp5(TaskKernelOmp5&& other) = default;
+        //-----------------------------------------------------------------------------
+        auto operator=(TaskKernelOmp5 const&) -> TaskKernelOmp5& = default;
+        //-----------------------------------------------------------------------------
+        auto operator=(TaskKernelOmp5&&) -> TaskKernelOmp5& = default;
+        //-----------------------------------------------------------------------------
+        ~TaskKernelOmp5() = default;
+
+        //-----------------------------------------------------------------------------
+        //! Executes the kernel function object inside an OpenMP target region on the given device.
+        ALPAKA_FN_HOST auto operator()(const DevOmp5& dev) const -> void
+        {
+            ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+            auto const gridBlockExtent(getWorkDiv<Grid, Blocks>(*this));
+            auto const blockThreadExtent(getWorkDiv<Block, Threads>(*this));
+            auto const threadElemExtent(getWorkDiv<Thread, Elems>(*this));
+
+#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
+            std::cout << "m_gridBlockExtent=" << this->m_gridBlockExtent << "\tgridBlockExtent=" << gridBlockExtent
+                      << std::endl;
+            std::cout << "m_blockThreadExtent=" << this->m_blockThreadExtent
+                      << "\tblockThreadExtent=" << blockThreadExtent << std::endl;
+            std::cout << "m_threadElemExtent=" << this->m_threadElemExtent << "\tthreadElemExtent=" << threadElemExtent
+                      << std::endl;
+#    endif
+
+            // Get the size of the block shared dynamic memory.
+            auto const blockSharedMemDynSizeBytes(meta::apply(
+                [&](ALPAKA_DECAY_T(TArgs) const&... args) {
+                    return getBlockSharedMemDynSizeBytes<AccOmp5<TDim, TIdx>>(
+                        m_kernelFnObj,
+                        blockThreadExtent,
+                        threadElemExtent,
+                        args...);
+                },
+                m_args));
+
+#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
+            std::cout << __func__ << " blockSharedMemDynSizeBytes: " << blockSharedMemDynSizeBytes << " B"
+                      << std::endl;
+#    endif
+            // We have to make sure that the OpenMP runtime keeps enough threads for executing a block in parallel.
+            TIdx const maxOmpThreadCount(static_cast<TIdx>(::omp_get_max_threads()));
+            // The number of blocks in the grid.
+            TIdx const gridBlockCount(gridBlockExtent.prod());
+            // The number of threads in a block.
+            TIdx const blockThreadCount(blockThreadExtent.prod());
+
+            if(gridBlockCount == 0 || blockThreadCount == 0)
+            { //! empty grid is a NOP
+                return;
+            }
+
+#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
+            if(maxOmpThreadCount < blockThreadExtent.prod())
+            {
+                std::cout
+                    << "Warning: TaskKernelOmp5: maxOmpThreadCount smaller than blockThreadCount requested by caller:"
+                    << maxOmpThreadCount << " < " << blockThreadExtent.prod() << std::endl;
+            }
+#    endif
+            // Make sure there is at least one team.
+            TIdx const teamCount(std::max(
+                std::min(static_cast<TIdx>(maxOmpThreadCount / blockThreadCount), gridBlockCount),
+                static_cast<TIdx>(1u)));
+#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
+            std::cout << "threadElemCount=" << threadElemExtent[0u] << std::endl;
+            std::cout << "teamCount=" << teamCount << "\tgridBlockCount=" << gridBlockCount << std::endl;
+#    endif
+
+            if(::omp_in_parallel() != 0)
+            {
+                throw std::runtime_error("The OpenMP 5.0 backend can not be used within an existing parallel region!");
+            }
+
+            // Force the environment to use the given number of threads.
+            int const ompIsDynamic(::omp_get_dynamic());
+            ::omp_set_dynamic(0);
+
+            // `When an if(scalar-expression) evaluates to false, the structured block is executed on the host.`
+            auto argsD = m_args;
+            auto kernelFnObj = m_kernelFnObj;
+            const auto iDevice = dev.iDevice();
+#    pragma omp target device(iDevice)
+            {
+#    pragma omp teams distribute num_teams(teamCount) // thread_limit(blockThreadCount)
+                for(TIdx t = 0u; t < gridBlockCount; ++t)
+                {
+#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL || defined ALPAKA_CI
+                    // The first block iteration (t == 0) reports the number of teams.
+                    if(t == 0)
+                    {
+                        int const iNumTeams(::omp_get_num_teams());
+                        printf("%s omp_get_num_teams: %d\n", __func__, iNumTeams);
+                    }
+                    printf("threadElemCount_dev %d\n", int(threadElemExtent[0u]));
+#    endif
+                    AccOmp5<TDim, TIdx>
+                        acc(gridBlockExtent, blockThreadExtent, threadElemExtent, t, blockSharedMemDynSizeBytes);
+
+                    // Execute the threads in parallel.
+
+                    // Parallel execution of the threads in a block is required because when syncBlockThreads is called
+                    // all of them have to be done with their work up to this line. So we have to spawn one OS thread
+                    // per thread in a block. 'omp for' is not useful because it is meant for cases where multiple
+                    // iterations are executed by one thread but in our case a 1:1 mapping is required. Therefore we
+                    // use 'omp parallel' with the specified number of threads in a block.
+#    ifndef __ibmxl_vrm__
+// setting num_threads to any value leads XL to run only one thread per team
+#        pragma omp parallel num_threads(blockThreadCount)
+#    else
+#        pragma omp parallel
+#    endif
+                    {
+#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL || defined ALPAKA_CI
+                        // The first thread does some checks in the first block executed.
+                        if((::omp_get_thread_num() == 0) && (t == 0))
+                        {
+                            int const numThreads(::omp_get_num_threads());
+                            printf("%s omp_get_num_threads: %d\n", __func__, numThreads);
+                            if(numThreads != static_cast<int>(blockThreadCount))
+                            {
+                                printf("ERROR: The OpenMP runtime did not use the number of threads that had been "
+                                       "requested!\n");
+                            }
+                        }
+#    endif
+                        meta::apply(
+                            [kernelFnObj, &acc](typename std::decay<TArgs>::type const&... args) {
+                                kernelFnObj(acc, args...);
+                            },
+                            argsD);
+
+                        // Wait for all threads to finish before deleting the shared memory.
+                        // This is done by default if the omp 'nowait' clause is missing
+                        // syncBlockThreads(acc);
+                    }
+                }
+            }
+
+            // Reset the dynamic thread number setting.
+            ::omp_set_dynamic(ompIsDynamic);
+        }
+
+    private:
+        TKernelFnObj m_kernelFnObj;
+        std::tuple<std::decay_t<TArgs>...> m_args;
+    };
+    namespace traits
+    {
+        //#############################################################################
+        //! The OpenMP 5.0 execution task accelerator type trait specialization.
+        template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
+        struct AccType<TaskKernelOmp5<TDim, TIdx, TKernelFnObj, TArgs...>>
+        {
+            using type = AccOmp5<TDim, TIdx>;
+        };
+
+        //#############################################################################
+        //! The OpenMP 5.0 execution task device type trait specialization.
+        template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
+        struct DevType<TaskKernelOmp5<TDim, TIdx, TKernelFnObj, TArgs...>>
+        {
+            using type = DevOmp5;
+        };
+
+        //#############################################################################
+        //! The OpenMP 5.0 execution task dimension getter trait specialization.
+        template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
+        struct DimType<TaskKernelOmp5<TDim, TIdx, TKernelFnObj, TArgs...>>
+        {
+            using type = TDim;
+        };
+
+        //#############################################################################
+        //! The OpenMP 5.0 execution task platform type trait specialization.
+        template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
+        struct PltfType<TaskKernelOmp5<TDim, TIdx, TKernelFnObj, TArgs...>>
+        {
+            using type = PltfOmp5;
+        };
+
+        //#############################################################################
+        //! The OpenMP 5.0 execution task idx type trait specialization.
+        template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
+        struct IdxType<TaskKernelOmp5<TDim, TIdx, TKernelFnObj, TArgs...>>
+        {
+            using type = TIdx;
+        };
+
+        template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
+        struct Enqueue<QueueOmp5Blocking, TaskKernelOmp5<TDim, TIdx, TKernelFnObj, TArgs...>>
+        {
+            ALPAKA_FN_HOST static auto enqueue(
+                QueueOmp5Blocking& queue,
+                TaskKernelOmp5<TDim, TIdx, TKernelFnObj, TArgs...> const& task) -> void
+            {
+                std::lock_guard<std::mutex> lk(queue.m_spQueueImpl->m_mutex);
+
+                queue.m_spQueueImpl->m_bCurrentlyExecutingTask = true;
+
+                task(queue.m_spQueueImpl->m_dev);
+
+                queue.m_spQueueImpl->m_bCurrentlyExecutingTask = false;
+            }
+        };
+
+        template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
+        struct Enqueue<QueueOmp5NonBlocking, TaskKernelOmp5<TDim, TIdx, TKernelFnObj, TArgs...>>
+        {
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto enqueue(
+                QueueOmp5NonBlocking& queue,
+                TaskKernelOmp5<TDim, TIdx, TKernelFnObj, TArgs...> const& task) -> void
+            {
+                queue.m_spQueueImpl->m_workerThread.enqueueTask(
+                    [&queue, task]() { task(queue.m_spQueueImpl->m_dev); });
+            }
+        };
+    } // namespace traits
+} // namespace alpaka
+
+#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/kernel/Traits.hpp b/thirdParty/cupla/alpaka/include/alpaka/kernel/Traits.hpp
index d851fd00c3..caba1043c4 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/kernel/Traits.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/kernel/Traits.hpp
@@ -1,6 +1,6 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, René Widera
+/* Copyright 2019-2020 Axel Huebl, Benjamin Worpitz, René Widera, Sergei Bastrakov
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -9,18 +9,18 @@
 
 #pragma once
 
-#include <alpaka/vec/Vec.hpp>
+#include <alpaka/core/BoostPredef.hpp>
 #include <alpaka/core/Common.hpp>
+#include <alpaka/core/Debug.hpp>
+#include <alpaka/core/OmpSchedule.hpp>
 #include <alpaka/core/Unused.hpp>
-
 #include <alpaka/dim/Traits.hpp>
 #include <alpaka/idx/Traits.hpp>
+#include <alpaka/meta/Void.hpp>
 #include <alpaka/queue/Traits.hpp>
-
-#include <alpaka/core/BoostPredef.hpp>
-#include <alpaka/core/Debug.hpp>
+#include <alpaka/vec/Vec.hpp>
 #if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-    #include <alpaka/workdiv/Traits.hpp>
+#    include <alpaka/workdiv/Traits.hpp>
 #endif
 
 #include <type_traits>
@@ -30,231 +30,284 @@
 namespace alpaka
 {
     //-----------------------------------------------------------------------------
-    //! The kernel specifics.
-    namespace kernel
+    //! The kernel traits.
+    namespace traits
     {
-        //-----------------------------------------------------------------------------
-        //! The kernel traits.
-        namespace traits
-        {
-            //#############################################################################
-            //! The kernel execution task creation trait.
-            template<
-                typename TAcc,
-                typename TWorkDiv,
-                typename TKernelFnObj,
-                typename... TArgs/*,
-                typename TSfinae = void*/>
-            struct CreateTaskKernel;
+        //#############################################################################
+        //! The kernel execution task creation trait.
+        template<
+            typename TAcc,
+            typename TWorkDiv,
+            typename TKernelFnObj,
+            typename... TArgs/*,
+            typename TSfinae = void*/>
+        struct CreateTaskKernel;
 
-            //#############################################################################
-            //! The trait for getting the size of the block shared dynamic memory of a kernel.
-            //!
-            //! \tparam TKernelFnObj The kernel function object.
-            //! \tparam TAcc The accelerator.
-            //!
-            //! The default implementation returns 0.
-            template<
-                typename TKernelFnObj,
-                typename TAcc,
-                typename TSfinae = void>
-            struct BlockSharedMemDynSizeBytes
-            {
+        //#############################################################################
+        //! The trait for getting the size of the block shared dynamic memory of a kernel.
+        //!
+        //! \tparam TKernelFnObj The kernel function object.
+        //! \tparam TAcc The accelerator.
+        //!
+        //! The default implementation returns 0.
+        template<typename TKernelFnObj, typename TAcc, typename TSfinae = void>
+        struct BlockSharedMemDynSizeBytes
+        {
 #if BOOST_COMP_CLANG
-    #pragma clang diagnostic push
-    #pragma clang diagnostic ignored "-Wdocumentation"  // clang does not support the syntax for variadic template arguments "args,..."
+#    pragma clang diagnostic push
+#    pragma clang diagnostic ignored                                                                                  \
+        "-Wdocumentation" // clang does not support the syntax for variadic template arguments "args,..."
 #endif
-                //-----------------------------------------------------------------------------
-                //! \param kernelFnObj The kernel object for which the block shared memory size should be calculated.
-                //! \param blockThreadExtent The block thread extent.
-                //! \param threadElemExtent The thread element extent.
-                //! \tparam TArgs The kernel invocation argument types pack.
-                //! \param args,... The kernel invocation arguments.
-                //! \return The size of the shared memory allocated for a block in bytes.
-                //! The default version always returns zero.
+            //-----------------------------------------------------------------------------
+            //! \param kernelFnObj The kernel object for which the block shared memory size should be calculated.
+            //! \param blockThreadExtent The block thread extent.
+            //! \param threadElemExtent The thread element extent.
+            //! \tparam TArgs The kernel invocation argument types pack.
+            //! \param args,... The kernel invocation arguments.
+            //! \return The size of the shared memory allocated for a block in bytes.
+            //! The default version always returns zero.
 #if BOOST_COMP_CLANG
-    #pragma clang diagnostic pop
+#    pragma clang diagnostic pop
 #endif
-                ALPAKA_NO_HOST_ACC_WARNING
-                template<
-                    typename TDim,
-                    typename... TArgs>
-                ALPAKA_FN_HOST_ACC static auto getBlockSharedMemDynSizeBytes(
-                    TKernelFnObj const & kernelFnObj,
-                    vec::Vec<TDim, idx::Idx<TAcc>> const & blockThreadExtent,
-                    vec::Vec<TDim, idx::Idx<TAcc>> const & threadElemExtent,
-                    TArgs const & ... args)
-                -> idx::Idx<TAcc>
+            ALPAKA_NO_HOST_ACC_WARNING
+            template<typename TDim, typename... TArgs>
+            ALPAKA_FN_HOST_ACC static auto getBlockSharedMemDynSizeBytes(
+                TKernelFnObj const& kernelFnObj,
+                Vec<TDim, Idx<TAcc>> const& blockThreadExtent,
+                Vec<TDim, Idx<TAcc>> const& threadElemExtent,
+                TArgs const&... args) -> std::size_t
+            {
+                alpaka::ignore_unused(kernelFnObj);
+                alpaka::ignore_unused(blockThreadExtent);
+                alpaka::ignore_unused(threadElemExtent);
+                alpaka::ignore_unused(args...);
+
+                return 0u;
+            }
+        };
+
+        namespace detail
+        {
+            //#############################################################################
+            //! Functor to get OpenMP schedule defined by kernel class.
+            //! When no schedule is defined, return a default one.
+            template<class TKernel, class = void>
+            struct GetOmpSchedule
+            {
+                ALPAKA_FN_HOST static auto get()
                 {
-                    alpaka::ignore_unused(kernelFnObj);
-                    alpaka::ignore_unused(blockThreadExtent);
-                    alpaka::ignore_unused(threadElemExtent);
-                    alpaka::ignore_unused(args...);
+                    return alpaka::omp::Schedule{};
+                }
+            };
 
-                    return 0;
+            //! Functor to get OpenMP schedule for kernel classes with
+            //! ompSchedule static member.
+            //! That member is never odr-used by alpaka.
+            template<class TKernel>
+            struct GetOmpSchedule<TKernel, meta::Void<decltype(TKernel::ompSchedule)>>
+            {
+                ALPAKA_FN_HOST static auto get()
+                {
+                    // Just having return TKernel::ompSchedule here would be
+                    // an odr-use of that variable, since it would be an
+                    // argument of the copy constructor. So have to manually
+                    // create a new identical object and then return it.
+                    return alpaka::omp::Schedule{TKernel::ompSchedule.kind, TKernel::ompSchedule.chunkSize};
                 }
             };
-        }
+        } // namespace detail
 
+        //#############################################################################
+        //! The trait for getting the schedule to use when a kernel is run using
+        //! the CpuOmp2Blocks accelerator.
+        //!
+        //! Has no effect on other accelerators or when run using OpenMP
+        //! implementation not supporting at least version 3.0.
+        //!
+        //! A user could either specialize this trait for their kernel, or define
+        //! a public static member ompSchedule of type alpaka::omp::Schedule
+        //! inside it, which would be picked up by this implementation.
+        //! In the latter case, alpaka never odr-uses that member.
+        //!
+        //! \tparam TKernelFnObj The kernel function object.
+        //! \tparam TAcc The accelerator.
+        //!
+        //! The default implementation returns the schedule obtained from detail::GetOmpSchedule<TKernelFnObj>.
+        template<typename TKernelFnObj, typename TAcc, typename TSfinae = void>
+        struct OmpSchedule
+        {
 #if BOOST_COMP_CLANG
-    #pragma clang diagnostic push
-    #pragma clang diagnostic ignored "-Wdocumentation"  // clang does not support the syntax for variadic template arguments "args,..."
+#    pragma clang diagnostic push
+#    pragma clang diagnostic ignored                                                                                  \
+        "-Wdocumentation" // clang does not support the syntax for variadic template arguments "args,..."
 #endif
-        //-----------------------------------------------------------------------------
-        //! \tparam TAcc The accelerator type.
-        //! \param kernelFnObj The kernel object for which the block shared memory size should be calculated.
-        //! \param blockThreadExtent The block thread extent.
-        //! \param threadElemExtent The thread element extent.
-        //! \param args,... The kernel invocation arguments.
-        //! \return The size of the shared memory allocated for a block in bytes.
-        //! The default implementation always returns zero.
+            //-----------------------------------------------------------------------------
+            //! \param kernelFnObj The kernel object for which the schedule should be returned.
+            //! \param blockThreadExtent The block thread extent.
+            //! \param threadElemExtent The thread element extent.
+            //! \tparam TArgs The kernel invocation argument types pack.
+            //! \param args,... The kernel invocation arguments.
+            //! \return The OpenMP schedule information.
 #if BOOST_COMP_CLANG
-    #pragma clang diagnostic pop
+#    pragma clang diagnostic pop
 #endif
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            typename TAcc,
-            typename TKernelFnObj,
-            typename TDim,
-            typename... TArgs>
-        ALPAKA_FN_HOST_ACC auto getBlockSharedMemDynSizeBytes(
-            TKernelFnObj const & kernelFnObj,
-            vec::Vec<TDim, idx::Idx<TAcc>> const & blockThreadExtent,
-            vec::Vec<TDim, idx::Idx<TAcc>> const & threadElemExtent,
-            TArgs const & ... args)
-        -> idx::Idx<TAcc>
-        {
-            return
-                traits::BlockSharedMemDynSizeBytes<
-                    TKernelFnObj,
-                    TAcc>
-                ::getBlockSharedMemDynSizeBytes(
-                    kernelFnObj,
-                    blockThreadExtent,
-                    threadElemExtent,
-                    args...);
-        }
+            ALPAKA_NO_HOST_ACC_WARNING
+            template<typename TDim, typename... TArgs>
+            ALPAKA_FN_HOST static auto getOmpSchedule(
+                TKernelFnObj const& kernelFnObj,
+                Vec<TDim, Idx<TAcc>> const& blockThreadExtent,
+                Vec<TDim, Idx<TAcc>> const& threadElemExtent,
+                TArgs const&... args) -> alpaka::omp::Schedule
+            {
+                alpaka::ignore_unused(kernelFnObj);
+                alpaka::ignore_unused(blockThreadExtent);
+                alpaka::ignore_unused(threadElemExtent);
+                alpaka::ignore_unused(args...);
+
+                return detail::GetOmpSchedule<TKernelFnObj>::get();
+            }
+        };
+    } // namespace traits
 
 #if BOOST_COMP_CLANG
-    #pragma clang diagnostic push
-    #pragma clang diagnostic ignored "-Wdocumentation"  // clang does not support the syntax for variadic template arguments "args,..."
+#    pragma clang diagnostic push
+#    pragma clang diagnostic ignored                                                                                  \
+        "-Wdocumentation" // clang does not support the syntax for variadic template arguments "args,..."
 #endif
+    //-----------------------------------------------------------------------------
+    //! \tparam TAcc The accelerator type.
+    //! \param kernelFnObj The kernel object for which the block shared memory size should be calculated.
+    //! \param blockThreadExtent The block thread extent.
+    //! \param threadElemExtent The thread element extent.
+    //! \param args,... The kernel invocation arguments.
+    //! \return The size of the shared memory allocated for a block in bytes.
+    //! The default implementation always returns zero.
+#if BOOST_COMP_CLANG
+#    pragma clang diagnostic pop
+#endif
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename TAcc, typename TKernelFnObj, typename TDim, typename... TArgs>
+    ALPAKA_FN_HOST_ACC auto getBlockSharedMemDynSizeBytes(
+        TKernelFnObj const& kernelFnObj,
+        Vec<TDim, Idx<TAcc>> const& blockThreadExtent,
+        Vec<TDim, Idx<TAcc>> const& threadElemExtent,
+        TArgs const&... args) -> std::size_t
+    {
+        return traits::BlockSharedMemDynSizeBytes<TKernelFnObj, TAcc>::getBlockSharedMemDynSizeBytes(
+            kernelFnObj,
+            blockThreadExtent,
+            threadElemExtent,
+            args...);
+    }
 
-        namespace detail
-        {
-            //#############################################################################
-            //! Check that the return of TKernelFnObj is void
-            template<typename TAcc>
-            struct CheckFnReturnType
-            {
-                template<
-                    typename TKernelFnObj,
-                    typename... TArgs>
-                void operator()(
-                    TKernelFnObj const &,
-                    TArgs const & ...)
-                {
-                    static_assert(
-                        std::is_same<typename std::result_of<TKernelFnObj(TAcc const &, TArgs const & ...)>::type, void>::value,
-                        "The TKernelFnObj is required to return void!");
-                }
-            };
-        }
-        //-----------------------------------------------------------------------------
-        //! Creates a kernel execution task.
-        //!
-        //! \tparam TAcc The accelerator type.
-        //! \param workDiv The index domain work division.
-        //! \param kernelFnObj The kernel function object which should be executed.
-        //! \param args,... The kernel invocation arguments.
-        //! \return The kernel execution task.
 #if BOOST_COMP_CLANG
-    #pragma clang diagnostic pop
+#    pragma clang diagnostic push
+#    pragma clang diagnostic ignored                                                                                  \
+        "-Wdocumentation" // clang does not support the syntax for variadic template arguments "args,..."
 #endif
-        template<
-            typename TAcc,
-            typename TWorkDiv,
-            typename TKernelFnObj,
-            typename... TArgs>
-        ALPAKA_FN_HOST auto createTaskKernel(
-            TWorkDiv const & workDiv,
-            TKernelFnObj const & kernelFnObj,
-            TArgs && ... args)
-#ifdef BOOST_NO_CXX14_RETURN_TYPE_DEDUCTION
-        -> decltype(
-            traits::CreateTaskKernel<
-                TAcc,
-                TWorkDiv,
-                TKernelFnObj,
-                TArgs...>
-            ::createTaskKernel(
-                workDiv,
-                kernelFnObj,
-                std::forward<TArgs>(args)...))
+    //-----------------------------------------------------------------------------
+    //! \tparam TAcc The accelerator type.
+    //! \param kernelFnObj The kernel object for which the schedule should be returned.
+    //! \param blockThreadExtent The block thread extent.
+    //! \param threadElemExtent The thread element extent.
+    //! \param args,... The kernel invocation arguments.
+    //! \return The OpenMP schedule information.
+#if BOOST_COMP_CLANG
+#    pragma clang diagnostic pop
 #endif
+    template<typename TAcc, typename TKernelFnObj, typename TDim, typename... TArgs>
+    ALPAKA_FN_HOST auto getOmpSchedule(
+        TKernelFnObj const& kernelFnObj,
+        Vec<TDim, Idx<TAcc>> const& blockThreadExtent,
+        Vec<TDim, Idx<TAcc>> const& threadElemExtent,
+        TArgs const&... args) -> omp::Schedule
+    {
+        return traits::OmpSchedule<TKernelFnObj, TAcc>::getOmpSchedule(
+            kernelFnObj,
+            blockThreadExtent,
+            threadElemExtent,
+            args...);
+    }
+
+#if BOOST_COMP_CLANG
+#    pragma clang diagnostic push
+#    pragma clang diagnostic ignored                                                                                  \
+        "-Wdocumentation" // clang does not support the syntax for variadic template arguments "args,..."
+#endif
+
+    namespace detail
+    {
+        //#############################################################################
+        //! Check that the return of TKernelFnObj is void
+        template<typename TAcc, typename TSfinae = void>
+        struct CheckFnReturnType
         {
-            // check for void return type
-            detail::CheckFnReturnType<TAcc>{}(kernelFnObj, args...);
+            template<typename TKernelFnObj, typename... TArgs>
+            void operator()(TKernelFnObj const&, TArgs const&...)
+            {
+#if defined(__cpp_lib_is_invocable) && __cpp_lib_is_invocable >= 201703
+                using Result = std::invoke_result_t<TKernelFnObj, TAcc const&, TArgs const&...>;
+#else
+                using Result = std::result_of_t<TKernelFnObj(TAcc const&, TArgs const&...)>;
+#endif
+                static_assert(std::is_same<Result, void>::value, "The TKernelFnObj is required to return void!");
+            }
+        };
+    } // namespace detail
+    //-----------------------------------------------------------------------------
+    //! Creates a kernel execution task.
+    //!
+    //! \tparam TAcc The accelerator type.
+    //! \param workDiv The index domain work division.
+    //! \param kernelFnObj The kernel function object which should be executed.
+    //! \param args,... The kernel invocation arguments.
+    //! \return The kernel execution task.
+#if BOOST_COMP_CLANG
+#    pragma clang diagnostic pop
+#endif
+    template<typename TAcc, typename TWorkDiv, typename TKernelFnObj, typename... TArgs>
+    ALPAKA_FN_HOST auto createTaskKernel(TWorkDiv const& workDiv, TKernelFnObj const& kernelFnObj, TArgs&&... args)
+    {
+        // check for void return type
+        detail::CheckFnReturnType<TAcc>{}(kernelFnObj, args...);
 
-            static_assert(
-                dim::Dim<typename std::decay<TWorkDiv>::type>::value == dim::Dim<TAcc>::value,
-                "The dimensions of TAcc and TWorkDiv have to be identical!");
-            static_assert(
-                std::is_same<idx::Idx<typename std::decay<TWorkDiv>::type>, idx::Idx<TAcc>>::value,
-                "The idx type of TAcc and the idx type of TWorkDiv have to be identical!");
+        static_assert(
+            Dim<std::decay_t<TWorkDiv>>::value == Dim<TAcc>::value,
+            "The dimensions of TAcc and TWorkDiv have to be identical!");
+        static_assert(
+            std::is_same<Idx<std::decay_t<TWorkDiv>>, Idx<TAcc>>::value,
+            "The idx type of TAcc and the idx type of TWorkDiv have to be identical!");
 
 #if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-            std::cout << __func__
-                << " workDiv: " << workDiv
-                << ", kernelFnObj: " << typeid(kernelFnObj).name()
-                << std::endl;
+        std::cout << __func__ << " workDiv: " << workDiv << ", kernelFnObj: " << typeid(kernelFnObj).name()
+                  << std::endl;
 #endif
-            return
-                traits::CreateTaskKernel<
-                    TAcc,
-                    TWorkDiv,
-                    TKernelFnObj,
-                    TArgs...>::createTaskKernel(
-                        workDiv,
-                        kernelFnObj,
-                        std::forward<TArgs>(args)...);
-        }
+        return traits::CreateTaskKernel<TAcc, TWorkDiv, TKernelFnObj, TArgs...>::createTaskKernel(
+            workDiv,
+            kernelFnObj,
+            std::forward<TArgs>(args)...);
+    }
 
 #if BOOST_COMP_CLANG
-    #pragma clang diagnostic push
-    #pragma clang diagnostic ignored "-Wdocumentation"  // clang does not support the syntax for variadic template arguments "args,..."
+#    pragma clang diagnostic push
+#    pragma clang diagnostic ignored                                                                                  \
+        "-Wdocumentation" // clang does not support the syntax for variadic template arguments "args,..."
 #endif
-        //-----------------------------------------------------------------------------
-        //! Executes the given kernel in the given queue.
-        //!
-        //! \tparam TAcc The accelerator type.
-        //! \param queue The queue to enqueue the view copy task into.
-        //! \param workDiv The index domain work division.
-        //! \param kernelFnObj The kernel function object which should be executed.
-        //! \param args,... The kernel invocation arguments.
+    //-----------------------------------------------------------------------------
+    //! Executes the given kernel in the given queue.
+    //!
+    //! \tparam TAcc The accelerator type.
+    //! \param queue The queue to enqueue the kernel execution task into.
+    //! \param workDiv The index domain work division.
+    //! \param kernelFnObj The kernel function object which should be executed.
+    //! \param args,... The kernel invocation arguments.
 #if BOOST_COMP_CLANG
-    #pragma clang diagnostic pop
+#    pragma clang diagnostic pop
 #endif
-        template<
-            typename TAcc,
-            typename TQueue,
-            typename TWorkDiv,
-            typename TKernelFnObj,
-            typename... TArgs>
-        ALPAKA_FN_HOST auto exec(
-            TQueue & queue,
-            TWorkDiv const & workDiv,
-            TKernelFnObj const & kernelFnObj,
-            TArgs && ... args)
+    template<typename TAcc, typename TQueue, typename TWorkDiv, typename TKernelFnObj, typename... TArgs>
+    ALPAKA_FN_HOST auto exec(TQueue& queue, TWorkDiv const& workDiv, TKernelFnObj const& kernelFnObj, TArgs&&... args)
         -> void
-        {
-            queue::enqueue(
-                queue,
-                kernel::createTaskKernel<
-                    TAcc>(
-                    workDiv,
-                    kernelFnObj,
-                    std::forward<TArgs>(args)...));
-        }
+    {
+        enqueue(queue, createTaskKernel<TAcc>(workDiv, kernelFnObj, std::forward<TArgs>(args)...));
     }
-}
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/MathCudaBuiltIn.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/MathCudaBuiltIn.hpp
deleted file mode 100644
index 34b8af0058..0000000000
--- a/thirdParty/cupla/alpaka/include/alpaka/math/MathCudaBuiltIn.hpp
+++ /dev/null
@@ -1,82 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_CUDA
-    #error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
-#endif
-
-#include <alpaka/math/abs/AbsCudaBuiltIn.hpp>
-#include <alpaka/math/acos/AcosCudaBuiltIn.hpp>
-#include <alpaka/math/asin/AsinCudaBuiltIn.hpp>
-#include <alpaka/math/atan/AtanCudaBuiltIn.hpp>
-#include <alpaka/math/atan2/Atan2CudaBuiltIn.hpp>
-#include <alpaka/math/cbrt/CbrtCudaBuiltIn.hpp>
-#include <alpaka/math/ceil/CeilCudaBuiltIn.hpp>
-#include <alpaka/math/cos/CosCudaBuiltIn.hpp>
-#include <alpaka/math/erf/ErfCudaBuiltIn.hpp>
-#include <alpaka/math/exp/ExpCudaBuiltIn.hpp>
-#include <alpaka/math/floor/FloorCudaBuiltIn.hpp>
-#include <alpaka/math/fmod/FmodCudaBuiltIn.hpp>
-#include <alpaka/math/log/LogCudaBuiltIn.hpp>
-#include <alpaka/math/max/MaxCudaBuiltIn.hpp>
-#include <alpaka/math/min/MinCudaBuiltIn.hpp>
-#include <alpaka/math/pow/PowCudaBuiltIn.hpp>
-#include <alpaka/math/remainder/RemainderCudaBuiltIn.hpp>
-#include <alpaka/math/round/RoundCudaBuiltIn.hpp>
-#include <alpaka/math/rsqrt/RsqrtCudaBuiltIn.hpp>
-#include <alpaka/math/sin/SinCudaBuiltIn.hpp>
-#include <alpaka/math/sincos/SinCosCudaBuiltIn.hpp>
-#include <alpaka/math/sqrt/SqrtCudaBuiltIn.hpp>
-#include <alpaka/math/tan/TanCudaBuiltIn.hpp>
-#include <alpaka/math/trunc/TruncCudaBuiltIn.hpp>
-
-namespace alpaka
-{
-    //-----------------------------------------------------------------------------
-    //! The mathematical operation specifics.
-    namespace math
-    {
-        //#############################################################################
-        //! The standard library math trait specializations.
-        class MathCudaBuiltIn :
-            public AbsCudaBuiltIn,
-            public AcosCudaBuiltIn,
-            public AsinCudaBuiltIn,
-            public AtanCudaBuiltIn,
-            public Atan2CudaBuiltIn,
-            public CbrtCudaBuiltIn,
-            public CeilCudaBuiltIn,
-            public CosCudaBuiltIn,
-            public ErfCudaBuiltIn,
-            public ExpCudaBuiltIn,
-            public FloorCudaBuiltIn,
-            public FmodCudaBuiltIn,
-            public LogCudaBuiltIn,
-            public MaxCudaBuiltIn,
-            public MinCudaBuiltIn,
-            public PowCudaBuiltIn,
-            public RemainderCudaBuiltIn,
-            public RoundCudaBuiltIn,
-            public RsqrtCudaBuiltIn,
-            public SinCudaBuiltIn,
-            public SinCosCudaBuiltIn,
-            public SqrtCudaBuiltIn,
-            public TanCudaBuiltIn,
-            public TruncCudaBuiltIn
-        {};
-    }
-}
-
-#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/MathHipBuiltIn.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/MathHipBuiltIn.hpp
deleted file mode 100644
index e362dfd35f..0000000000
--- a/thirdParty/cupla/alpaka/include/alpaka/math/MathHipBuiltIn.hpp
+++ /dev/null
@@ -1,82 +0,0 @@
-/* Copyright 2019 Matthias Werner
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_HIP
-    #error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
-#endif
-
-#include <alpaka/math/abs/AbsHipBuiltIn.hpp>
-#include <alpaka/math/acos/AcosHipBuiltIn.hpp>
-#include <alpaka/math/asin/AsinHipBuiltIn.hpp>
-#include <alpaka/math/atan/AtanHipBuiltIn.hpp>
-#include <alpaka/math/atan2/Atan2HipBuiltIn.hpp>
-#include <alpaka/math/cbrt/CbrtHipBuiltIn.hpp>
-#include <alpaka/math/ceil/CeilHipBuiltIn.hpp>
-#include <alpaka/math/cos/CosHipBuiltIn.hpp>
-#include <alpaka/math/erf/ErfHipBuiltIn.hpp>
-#include <alpaka/math/exp/ExpHipBuiltIn.hpp>
-#include <alpaka/math/floor/FloorHipBuiltIn.hpp>
-#include <alpaka/math/fmod/FmodHipBuiltIn.hpp>
-#include <alpaka/math/log/LogHipBuiltIn.hpp>
-#include <alpaka/math/max/MaxHipBuiltIn.hpp>
-#include <alpaka/math/min/MinHipBuiltIn.hpp>
-#include <alpaka/math/pow/PowHipBuiltIn.hpp>
-#include <alpaka/math/remainder/RemainderHipBuiltIn.hpp>
-#include <alpaka/math/round/RoundHipBuiltIn.hpp>
-#include <alpaka/math/rsqrt/RsqrtHipBuiltIn.hpp>
-#include <alpaka/math/sin/SinHipBuiltIn.hpp>
-#include <alpaka/math/sincos/SinCosHipBuiltIn.hpp>
-#include <alpaka/math/sqrt/SqrtHipBuiltIn.hpp>
-#include <alpaka/math/tan/TanHipBuiltIn.hpp>
-#include <alpaka/math/trunc/TruncHipBuiltIn.hpp>
-
-namespace alpaka
-{
-    //-----------------------------------------------------------------------------
-    //! The mathematical operation specifics.
-    namespace math
-    {
-        //#############################################################################
-        //! The standard library math trait specializations.
-        class MathHipBuiltIn :
-            public AbsHipBuiltIn,
-            public AcosHipBuiltIn,
-            public AsinHipBuiltIn,
-            public AtanHipBuiltIn,
-            public Atan2HipBuiltIn,
-            public CbrtHipBuiltIn,
-            public CeilHipBuiltIn,
-            public CosHipBuiltIn,
-            public ErfHipBuiltIn,
-            public ExpHipBuiltIn,
-            public FloorHipBuiltIn,
-            public FmodHipBuiltIn,
-            public LogHipBuiltIn,
-            public MaxHipBuiltIn,
-            public MinHipBuiltIn,
-            public PowHipBuiltIn,
-            public RemainderHipBuiltIn,
-            public RoundHipBuiltIn,
-            public RsqrtHipBuiltIn,
-            public SinCosHipBuiltIn,
-            public SinHipBuiltIn,
-            public SqrtHipBuiltIn,
-            public TanHipBuiltIn,
-            public TruncHipBuiltIn
-        {};
-    }
-}
-
-#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/MathStdLib.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/MathStdLib.hpp
index eea098bcda..389eaa913e 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/math/MathStdLib.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/math/MathStdLib.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Benjamin Worpitz
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -42,31 +42,32 @@ namespace alpaka
     {
         //#############################################################################
         //! The standard library math trait specializations.
-        class MathStdLib :
-            public AbsStdLib,
-            public AcosStdLib,
-            public AsinStdLib,
-            public AtanStdLib,
-            public Atan2StdLib,
-            public CbrtStdLib,
-            public CeilStdLib,
-            public CosStdLib,
-            public ErfStdLib,
-            public ExpStdLib,
-            public FloorStdLib,
-            public FmodStdLib,
-            public LogStdLib,
-            public MaxStdLib,
-            public MinStdLib,
-            public PowStdLib,
-            public RemainderStdLib,
-            public RoundStdLib,
-            public RsqrtStdLib,
-            public SinStdLib,
-            public SinCosStdLib,
-            public SqrtStdLib,
-            public TanStdLib,
-            public TruncStdLib
-        {};
-    }
-}
+        class MathStdLib
+            : public AbsStdLib
+            , public AcosStdLib
+            , public AsinStdLib
+            , public AtanStdLib
+            , public Atan2StdLib
+            , public CbrtStdLib
+            , public CeilStdLib
+            , public CosStdLib
+            , public ErfStdLib
+            , public ExpStdLib
+            , public FloorStdLib
+            , public FmodStdLib
+            , public LogStdLib
+            , public MaxStdLib
+            , public MinStdLib
+            , public PowStdLib
+            , public RemainderStdLib
+            , public RoundStdLib
+            , public RsqrtStdLib
+            , public SinStdLib
+            , public SinCosStdLib
+            , public SqrtStdLib
+            , public TanStdLib
+            , public TruncStdLib
+        {
+        };
+    } // namespace math
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/MathUniformCudaHipBuiltIn.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/MathUniformCudaHipBuiltIn.hpp
new file mode 100644
index 0000000000..627c671352
--- /dev/null
+++ b/thirdParty/cupla/alpaka/include/alpaka/math/MathUniformCudaHipBuiltIn.hpp
@@ -0,0 +1,102 @@
+/* Copyright 2019 Benjamin Worpitz
+ *
+ * This file is part of alpaka.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#pragma once
+
+#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
+
+#    include <alpaka/core/BoostPredef.hpp>
+
+#    if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
+#        include <cuda_runtime.h>
+#        if !BOOST_LANG_CUDA
+#            error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
+#        endif
+#    endif
+
+#    if defined(ALPAKA_ACC_GPU_HIP_ENABLED)
+
+#        if BOOST_COMP_NVCC >= BOOST_VERSION_NUMBER(9, 0, 0)
+#            include <cuda_runtime_api.h>
+#        else
+#            if BOOST_COMP_HIP
+#                include <hip/math_functions.h>
+#            else
+#                include <math_functions.hpp>
+#            endif
+#        endif
+#        if !BOOST_LANG_HIP
+#            error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
+#        endif
+#    endif
+
+#    include <alpaka/math/abs/AbsUniformCudaHipBuiltIn.hpp>
+#    include <alpaka/math/acos/AcosUniformCudaHipBuiltIn.hpp>
+#    include <alpaka/math/asin/AsinUniformCudaHipBuiltIn.hpp>
+#    include <alpaka/math/atan/AtanUniformCudaHipBuiltIn.hpp>
+#    include <alpaka/math/atan2/Atan2UniformCudaHipBuiltIn.hpp>
+#    include <alpaka/math/cbrt/CbrtUniformCudaHipBuiltIn.hpp>
+#    include <alpaka/math/ceil/CeilUniformCudaHipBuiltIn.hpp>
+#    include <alpaka/math/cos/CosUniformCudaHipBuiltIn.hpp>
+#    include <alpaka/math/erf/ErfUniformCudaHipBuiltIn.hpp>
+#    include <alpaka/math/exp/ExpUniformCudaHipBuiltIn.hpp>
+#    include <alpaka/math/floor/FloorUniformCudaHipBuiltIn.hpp>
+#    include <alpaka/math/fmod/FmodUniformCudaHipBuiltIn.hpp>
+#    include <alpaka/math/log/LogUniformCudaHipBuiltIn.hpp>
+#    include <alpaka/math/max/MaxUniformCudaHipBuiltIn.hpp>
+#    include <alpaka/math/min/MinUniformCudaHipBuiltIn.hpp>
+#    include <alpaka/math/pow/PowUniformCudaHipBuiltIn.hpp>
+#    include <alpaka/math/remainder/RemainderUniformCudaHipBuiltIn.hpp>
+#    include <alpaka/math/round/RoundUniformCudaHipBuiltIn.hpp>
+#    include <alpaka/math/rsqrt/RsqrtUniformCudaHipBuiltIn.hpp>
+#    include <alpaka/math/sin/SinUniformCudaHipBuiltIn.hpp>
+#    include <alpaka/math/sincos/SinCosUniformCudaHipBuiltIn.hpp>
+#    include <alpaka/math/sqrt/SqrtUniformCudaHipBuiltIn.hpp>
+#    include <alpaka/math/tan/TanUniformCudaHipBuiltIn.hpp>
+#    include <alpaka/math/trunc/TruncUniformCudaHipBuiltIn.hpp>
+
+namespace alpaka
+{
+    //-----------------------------------------------------------------------------
+    //! The mathematical operation specifics.
+    namespace math
+    {
+        //#############################################################################
+        //! The CUDA/HIP built in math trait specializations.
+        class MathUniformCudaHipBuiltIn
+            : public AbsUniformCudaHipBuiltIn
+            , public AcosUniformCudaHipBuiltIn
+            , public AsinUniformCudaHipBuiltIn
+            , public AtanUniformCudaHipBuiltIn
+            , public Atan2UniformCudaHipBuiltIn
+            , public CbrtUniformCudaHipBuiltIn
+            , public CeilUniformCudaHipBuiltIn
+            , public CosUniformCudaHipBuiltIn
+            , public ErfUniformCudaHipBuiltIn
+            , public ExpUniformCudaHipBuiltIn
+            , public FloorUniformCudaHipBuiltIn
+            , public FmodUniformCudaHipBuiltIn
+            , public LogUniformCudaHipBuiltIn
+            , public MaxUniformCudaHipBuiltIn
+            , public MinUniformCudaHipBuiltIn
+            , public PowUniformCudaHipBuiltIn
+            , public RemainderUniformCudaHipBuiltIn
+            , public RoundUniformCudaHipBuiltIn
+            , public RsqrtUniformCudaHipBuiltIn
+            , public SinUniformCudaHipBuiltIn
+            , public SinCosUniformCudaHipBuiltIn
+            , public SqrtUniformCudaHipBuiltIn
+            , public TanUniformCudaHipBuiltIn
+            , public TruncUniformCudaHipBuiltIn
+        {
+        };
+    } // namespace math
+} // namespace alpaka
+
+#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/abs/AbsCudaBuiltIn.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/abs/AbsCudaBuiltIn.hpp
deleted file mode 100644
index f75de34d8e..0000000000
--- a/thirdParty/cupla/alpaka/include/alpaka/math/abs/AbsCudaBuiltIn.hpp
+++ /dev/null
@@ -1,93 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Bert Wesarg
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_CUDA
-    #error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
-#endif
-
-#include <alpaka/math/abs/Traits.hpp>
-
-#include <alpaka/core/Unused.hpp>
-
-#include <cuda_runtime.h>
-
-#include <type_traits>
-
-namespace alpaka
-{
-    namespace math
-    {
-        //#############################################################################
-        //! The CUDA built in abs.
-        class AbsCudaBuiltIn : public concepts::Implements<ConceptMathAbs, AbsCudaBuiltIn>
-        {
-        };
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The CUDA built in abs trait specialization.
-            template<
-                typename TArg>
-            struct Abs<
-                AbsCudaBuiltIn,
-                TArg,
-                typename std::enable_if<
-                    std::is_floating_point<TArg>::value>::type>
-            {
-                __device__ static auto abs(
-                    AbsCudaBuiltIn const & abs_ctx,
-                    TArg const & arg)
-                -> decltype(::abs(arg))
-                {
-                    alpaka::ignore_unused(abs_ctx);
-                    return ::abs(arg);
-                }
-            };
-            //! The CUDA built in abs double specialization.
-            template<>
-            struct Abs<
-                AbsCudaBuiltIn,
-                double>
-            {
-                __device__ static auto abs(
-                    AbsCudaBuiltIn const & abs_ctx,
-                    double const & arg)
-                -> decltype(::fabs(arg))
-                {
-                    alpaka::ignore_unused(abs_ctx);
-                    return ::fabs(arg);
-                }
-            };
-            //! The CUDA built in abs float specialization.
-            template<>
-            struct Abs<
-                AbsCudaBuiltIn,
-                float>
-            {
-                __device__ static auto abs(
-                    AbsCudaBuiltIn const & abs_ctx,
-                    float const & arg)
-                -> float
-                {
-                    alpaka::ignore_unused(abs_ctx);
-                    return ::fabsf(arg);
-                }
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/abs/AbsHipBuiltIn.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/abs/AbsHipBuiltIn.hpp
deleted file mode 100644
index e1dba07f61..0000000000
--- a/thirdParty/cupla/alpaka/include/alpaka/math/abs/AbsHipBuiltIn.hpp
+++ /dev/null
@@ -1,101 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Bert Wesarg, Matthias Werner
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_HIP
-    #error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
-#endif
-
-#include <alpaka/math/abs/Traits.hpp>
-
-#include <alpaka/core/Unused.hpp>
-
-#if BOOST_COMP_NVCC >= BOOST_VERSION_NUMBER(9, 0, 0)
-    #include <cuda_runtime_api.h>
-#else
-    #if BOOST_COMP_HCC || BOOST_COMP_HIP
-        #include <math_functions.h>
-    #else
-        #include <math_functions.hpp>
-    #endif
-#endif
-
-#include <type_traits>
-
-namespace alpaka
-{
-    namespace math
-    {
-        //#############################################################################
-        //! The HIP built in abs.
-        class AbsHipBuiltIn : public concepts::Implements<ConceptMathAbs, AbsHipBuiltIn>
-        {
-        };
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The HIP built in abs trait specialization.
-            template<
-                typename TArg>
-            struct Abs<
-                AbsHipBuiltIn,
-                TArg,
-                typename std::enable_if<
-                    std::is_floating_point<TArg>::value>::type>
-            {
-                __device__ static auto abs(
-                    AbsHipBuiltIn const & abs_ctx,
-                    TArg const & arg)
-                -> decltype(::abs(arg))
-                {
-                    alpaka::ignore_unused(abs_ctx);
-                    return ::abs(arg);
-                }
-            };
-            //! The HIP built in abs double specialization.
-            template<>
-            struct Abs<
-                AbsHipBuiltIn,
-                double>
-            {
-                __device__ static auto abs(
-                    AbsHipBuiltIn const & abs_ctx,
-                    double const & arg)
-                -> decltype(::fabs(arg))
-                {
-                    alpaka::ignore_unused(abs_ctx);
-                    return ::fabs(arg);
-                }
-            };
-            //! The HIP built in abs float specialization.
-            template<>
-            struct Abs<
-                AbsHipBuiltIn,
-                float>
-            {
-                __device__ static auto abs(
-                    AbsHipBuiltIn const & abs_ctx,
-                    float const & arg)
-                -> float
-                {
-                    alpaka::ignore_unused(abs_ctx);
-                    return ::fabsf(arg);
-                }
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/abs/AbsStdLib.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/abs/AbsStdLib.hpp
index 2a32da3f16..8e64fa24c9 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/math/abs/AbsStdLib.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/math/abs/AbsStdLib.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Axel Huebl, Benjamin Worpitz
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -9,13 +9,12 @@
 
 #pragma once
 
-#include <alpaka/math/abs/Traits.hpp>
-
 #include <alpaka/core/Unused.hpp>
+#include <alpaka/math/abs/Traits.hpp>
 
-#include <type_traits>
-#include <cstdlib>
 #include <cmath>
+#include <cstdlib>
+#include <type_traits>
 
 namespace alpaka
 {
@@ -31,24 +30,18 @@ namespace alpaka
         {
             //#############################################################################
             //! The standard library abs trait specialization.
-            template<
-                typename TArg>
+            template<typename TArg>
             struct Abs<
                 AbsStdLib,
                 TArg,
-                typename std::enable_if<
-                    std::is_arithmetic<TArg>::value
-                    && std::is_signed<TArg>::value>::type>
+                std::enable_if_t<std::is_arithmetic<TArg>::value && std::is_signed<TArg>::value>>
             {
-                ALPAKA_FN_HOST static auto abs(
-                    AbsStdLib const & abs_ctx,
-                    TArg const & arg)
-                -> decltype(std::abs(arg))
+                ALPAKA_FN_HOST static auto abs(AbsStdLib const& abs_ctx, TArg const& arg)
                 {
                     alpaka::ignore_unused(abs_ctx);
                     return std::abs(arg);
                 }
             };
-        }
-    }
-}
+        } // namespace traits
+    } // namespace math
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/abs/AbsUniformCudaHipBuiltIn.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/abs/AbsUniformCudaHipBuiltIn.hpp
new file mode 100644
index 0000000000..ae9d7741be
--- /dev/null
+++ b/thirdParty/cupla/alpaka/include/alpaka/math/abs/AbsUniformCudaHipBuiltIn.hpp
@@ -0,0 +1,92 @@
+/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Bert Wesarg
+ *
+ * This file is part of alpaka.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#pragma once
+
+#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
+
+#    include <alpaka/core/BoostPredef.hpp>
+
+#    if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
+#        include <cuda_runtime.h>
+#        if !BOOST_LANG_CUDA
+#            error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
+#        endif
+#    endif
+
+#    if defined(ALPAKA_ACC_GPU_HIP_ENABLED)
+
+#        if BOOST_COMP_NVCC >= BOOST_VERSION_NUMBER(9, 0, 0)
+#            include <cuda_runtime_api.h>
+#        else
+#            if BOOST_COMP_HIP
+#                include <hip/math_functions.h>
+#            else
+#                include <math_functions.hpp>
+#            endif
+#        endif
+
+#        if !BOOST_LANG_HIP
+#            error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
+#        endif
+#    endif
+
+#    include <alpaka/core/Unused.hpp>
+#    include <alpaka/math/abs/Traits.hpp>
+
+#    include <type_traits>
+
+namespace alpaka
+{
+    namespace math
+    {
+        //#############################################################################
+        //! The CUDA/HIP built in abs.
+        class AbsUniformCudaHipBuiltIn : public concepts::Implements<ConceptMathAbs, AbsUniformCudaHipBuiltIn>
+        {
+        };
+
+        namespace traits
+        {
+            //#############################################################################
+            //! The CUDA/HIP built in abs trait specialization.
+            template<typename TArg>
+            struct Abs<AbsUniformCudaHipBuiltIn, TArg, std::enable_if_t<std::is_floating_point<TArg>::value>>
+            {
+                __device__ static auto abs(AbsUniformCudaHipBuiltIn const& abs_ctx, TArg const& arg)
+                {
+                    alpaka::ignore_unused(abs_ctx);
+                    return ::abs(arg);
+                }
+            };
+            //! The CUDA/HIP built in abs double specialization.
+            template<>
+            struct Abs<AbsUniformCudaHipBuiltIn, double>
+            {
+                __device__ static auto abs(AbsUniformCudaHipBuiltIn const& abs_ctx, double const& arg)
+                {
+                    alpaka::ignore_unused(abs_ctx);
+                    return ::fabs(arg);
+                }
+            };
+            //! The CUDA/HIP built in abs float specialization.
+            template<>
+            struct Abs<AbsUniformCudaHipBuiltIn, float>
+            {
+                __device__ static auto abs(AbsUniformCudaHipBuiltIn const& abs_ctx, float const& arg) -> float
+                {
+                    alpaka::ignore_unused(abs_ctx);
+                    return ::fabsf(arg);
+                }
+            };
+        } // namespace traits
+    } // namespace math
+} // namespace alpaka
+
+#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/abs/Traits.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/abs/Traits.hpp
index 460ca80817..6c7708ab53 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/math/abs/Traits.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/math/abs/Traits.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Benjamin Worpitz
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -12,15 +12,15 @@
 #include <alpaka/core/Common.hpp>
 #include <alpaka/core/Concepts.hpp>
 
-#include <boost/config.hpp>
-
 #include <type_traits>
 
 namespace alpaka
 {
     namespace math
     {
-        struct ConceptMathAbs;
+        struct ConceptMathAbs
+        {
+        };
 
         //-----------------------------------------------------------------------------
         //! The math traits.
@@ -28,12 +28,9 @@ namespace alpaka
         {
             //#############################################################################
             //! The abs trait.
-            template<
-                typename T,
-                typename TArg,
-                typename TSfinae = void>
+            template<typename T, typename TArg, typename TSfinae = void>
             struct Abs;
-        }
+        } // namespace traits
 
         //-----------------------------------------------------------------------------
         //! Computes the absolute value.
@@ -43,30 +40,11 @@ namespace alpaka
         //! \param abs_ctx The object specializing Abs.
         //! \param arg The arg.
         ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            typename T,
-            typename TArg>
-        ALPAKA_FN_HOST_ACC auto abs(
-            T const & abs_ctx,
-            TArg const & arg)
-#ifdef BOOST_NO_CXX14_RETURN_TYPE_DEDUCTION
-        -> decltype(
-            traits::Abs<
-                concepts::ImplementationBase<ConceptMathAbs, T>,
-                TArg>
-            ::abs(
-                abs_ctx,
-                arg))
-#endif
+        template<typename T, typename TArg>
+        ALPAKA_FN_HOST_ACC auto abs(T const& abs_ctx, TArg const& arg)
         {
             using ImplementationBase = concepts::ImplementationBase<ConceptMathAbs, T>;
-            return
-                traits::Abs<
-                    ImplementationBase,
-                    TArg>
-                ::abs(
-                    abs_ctx,
-                    arg);
+            return traits::Abs<ImplementationBase, TArg>::abs(abs_ctx, arg);
         }
-    }
-}
+    } // namespace math
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/acos/AcosCudaBuiltIn.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/acos/AcosCudaBuiltIn.hpp
deleted file mode 100644
index 67140c7ee0..0000000000
--- a/thirdParty/cupla/alpaka/include/alpaka/math/acos/AcosCudaBuiltIn.hpp
+++ /dev/null
@@ -1,78 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Bert Wesarg
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_CUDA
-    #error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
-#endif
-
-#include <alpaka/math/acos/Traits.hpp>
-
-#include <alpaka/core/Unused.hpp>
-
-#include <cuda_runtime.h>
-
-#include <type_traits>
-
-namespace alpaka
-{
-    namespace math
-    {
-        //#############################################################################
-        //! The CUDA built in acos.
-        class AcosCudaBuiltIn : public concepts::Implements<ConceptMathAcos, AcosCudaBuiltIn>
-        {
-        };
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The CUDA acos trait specialization.
-            template<
-                typename TArg>
-            struct Acos<
-                AcosCudaBuiltIn,
-                TArg,
-                typename std::enable_if<
-                    std::is_floating_point<TArg>::value>::type>
-            {
-                __device__ static auto acos(
-                    AcosCudaBuiltIn const & acos_ctx,
-                    TArg const & arg)
-                -> decltype(::acos(arg))
-                {
-                    alpaka::ignore_unused(acos_ctx);
-                    return ::acos(arg);
-                }
-            };
-
-            template<>
-            struct Acos<
-                AcosCudaBuiltIn,
-                float>
-            {
-                __device__ static auto acos(
-                    AcosCudaBuiltIn const & acos_ctx,
-                    float const & arg)
-                -> float
-                {
-                    alpaka::ignore_unused(acos_ctx);
-                    return ::acosf(arg);
-                }
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/acos/AcosHipBuiltIn.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/acos/AcosHipBuiltIn.hpp
deleted file mode 100644
index 442b9ce865..0000000000
--- a/thirdParty/cupla/alpaka/include/alpaka/math/acos/AcosHipBuiltIn.hpp
+++ /dev/null
@@ -1,88 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Bert Wesarg, Matthias Werner
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_HIP
-    #error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
-#endif
-
-#include <alpaka/math/acos/Traits.hpp>
-
-#include <alpaka/core/Unused.hpp>
-
-#if BOOST_COMP_NVCC >= BOOST_VERSION_NUMBER(9, 0, 0)
-    #include <cuda_runtime_api.h>
-#else
-    #if BOOST_COMP_HCC || BOOST_COMP_HIP
-        #include <math_functions.h>
-    #else
-        #include <math_functions.hpp>
-    #endif
-#endif
-
-#include <type_traits>
-
-namespace alpaka
-{
-    namespace math
-    {
-        //#############################################################################
-        //! The HIP acos.
-        class AcosHipBuiltIn : public concepts::Implements<ConceptMathAcos, AcosHipBuiltIn>
-        {
-        };
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The HIP acos trait specialization.
-            template<
-                typename TArg>
-            struct Acos<
-                AcosHipBuiltIn,
-                TArg,
-                typename std::enable_if<
-                    std::is_floating_point<TArg>::value>::type>
-            {
-                ALPAKA_NO_HOST_ACC_WARNING
-                __device__ static auto acos(
-                    AcosHipBuiltIn const & acos_ctx,
-                    TArg const & arg)
-                -> decltype(::acos(arg))
-                {
-                    alpaka::ignore_unused(acos_ctx);
-                    return ::acos(arg);
-                }
-            };
-            //! The HIP acos float specialization.
-            template<>
-            struct Acos<
-                AcosHipBuiltIn,
-                float>
-            {
-                ALPAKA_NO_HOST_ACC_WARNING
-                __device__ static auto acos(
-                    AcosHipBuiltIn const & acos_ctx,
-                    float const & arg)
-                -> float
-                {
-                    alpaka::ignore_unused(acos_ctx);
-                    return ::acosf(arg);
-                }
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/acos/AcosStdLib.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/acos/AcosStdLib.hpp
index aacf9d1bbe..38992daf44 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/math/acos/AcosStdLib.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/math/acos/AcosStdLib.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Axel Huebl, Benjamin Worpitz
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -9,12 +9,11 @@
 
 #pragma once
 
-#include <alpaka/math/acos/Traits.hpp>
-
 #include <alpaka/core/Unused.hpp>
+#include <alpaka/math/acos/Traits.hpp>
 
-#include <type_traits>
 #include <cmath>
+#include <type_traits>
 
 namespace alpaka
 {
@@ -30,23 +29,15 @@ namespace alpaka
         {
             //#############################################################################
             //! The standard library acos trait specialization.
-            template<
-                typename TArg>
-            struct Acos<
-                AcosStdLib,
-                TArg,
-                typename std::enable_if<
-                    std::is_arithmetic<TArg>::value>::type>
+            template<typename TArg>
+            struct Acos<AcosStdLib, TArg, std::enable_if_t<std::is_arithmetic<TArg>::value>>
             {
-                ALPAKA_FN_HOST static auto acos(
-                    AcosStdLib const & acos_ctx,
-                    TArg const & arg)
-                -> decltype(std::acos(arg))
+                ALPAKA_FN_HOST static auto acos(AcosStdLib const& acos_ctx, TArg const& arg)
                 {
                     alpaka::ignore_unused(acos_ctx);
                     return std::acos(arg);
                 }
             };
-        }
-    }
-}
+        } // namespace traits
+    } // namespace math
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/acos/AcosUniformCudaHipBuiltIn.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/acos/AcosUniformCudaHipBuiltIn.hpp
new file mode 100644
index 0000000000..da056c659d
--- /dev/null
+++ b/thirdParty/cupla/alpaka/include/alpaka/math/acos/AcosUniformCudaHipBuiltIn.hpp
@@ -0,0 +1,82 @@
+/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Bert Wesarg
+ *
+ * This file is part of alpaka.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#pragma once
+
+#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
+
+#    include <alpaka/core/BoostPredef.hpp>
+
+#    if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
+#        include <cuda_runtime.h>
+#        if !BOOST_LANG_CUDA
+#            error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
+#        endif
+#    endif
+
+#    if defined(ALPAKA_ACC_GPU_HIP_ENABLED)
+
+#        if BOOST_COMP_NVCC >= BOOST_VERSION_NUMBER(9, 0, 0)
+#            include <cuda_runtime_api.h>
+#        else
+#            if BOOST_COMP_HIP
+#                include <hip/math_functions.h>
+#            else
+#                include <math_functions.hpp>
+#            endif
+#        endif
+
+#        if !BOOST_LANG_HIP
+#            error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
+#        endif
+#    endif
+
+#    include <alpaka/core/Unused.hpp>
+#    include <alpaka/math/acos/Traits.hpp>
+
+#    include <type_traits>
+
+namespace alpaka
+{
+    namespace math
+    {
+        //#############################################################################
+        //! The built-in acos implementation for the CUDA and HIP backends (an empty, stateless class).
+        class AcosUniformCudaHipBuiltIn : public concepts::Implements<ConceptMathAcos, AcosUniformCudaHipBuiltIn>
+        {
+        };
+
+        namespace traits
+        {
+            //#############################################################################
+            //! The CUDA/HIP acos trait specialization, enabled for floating-point argument types.
+            template<typename TArg>
+            struct Acos<AcosUniformCudaHipBuiltIn, TArg, std::enable_if_t<std::is_floating_point<TArg>::value>>
+            {
+                __device__ static auto acos(AcosUniformCudaHipBuiltIn const& acos_ctx, TArg const& arg)
+                {
+                    alpaka::ignore_unused(acos_ctx); // acos_ctx is otherwise unused here.
+                    return ::acos(arg); // The return type is deduced from this call.
+                }
+            };
+            //! The CUDA/HIP acos trait specialization for float; calls the single-precision ::acosf.
+            template<>
+            struct Acos<AcosUniformCudaHipBuiltIn, float>
+            {
+                __device__ static auto acos(AcosUniformCudaHipBuiltIn const& acos_ctx, float const& arg) -> float
+                {
+                    alpaka::ignore_unused(acos_ctx); // acos_ctx is otherwise unused here.
+                    return ::acosf(arg); // Single-precision variant; matches the float return type.
+                }
+            };
+        } // namespace traits
+    } // namespace math
+} // namespace alpaka
+
+#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/acos/Traits.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/acos/Traits.hpp
index 7d5b853554..aa244bcad3 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/math/acos/Traits.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/math/acos/Traits.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Benjamin Worpitz
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -12,58 +12,40 @@
 #include <alpaka/core/Common.hpp>
 #include <alpaka/core/Concepts.hpp>
 
-#include <boost/config.hpp>
-
 #include <type_traits>
 
 namespace alpaka
 {
     namespace math
     {
-        struct ConceptMathAcos;
+        struct ConceptMathAcos
+        {
+        };
 
         namespace traits
         {
             //#############################################################################
             //! The acos trait.
-            template<
-                typename T,
-                typename TArg,
-                typename TSfinae = void>
+            template<typename T, typename TArg, typename TSfinae = void>
             struct Acos;
-        }
+        } // namespace traits
 
         //-----------------------------------------------------------------------------
         //! Computes the principal value of the arc cosine.
         //!
+        //! The valid real argument range is [-1.0, 1.0]. For other values
+        //! the result may depend on the backend and compilation options, but will
+        //! likely be NaN.
+        //!
         //! \tparam TArg The arg type.
         //! \param acos_ctx The object specializing Acos.
         //! \param arg The arg.
         ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            typename T,
-            typename TArg>
-        ALPAKA_FN_HOST_ACC auto acos(
-            T const & acos_ctx,
-            TArg const & arg)
-#ifdef BOOST_NO_CXX14_RETURN_TYPE_DEDUCTION
-        -> decltype(
-            traits::Acos<
-                concepts::ImplementationBase<ConceptMathAcos, T>,
-                TArg>
-            ::acos(
-                acos_ctx,
-                arg))
-#endif
+        template<typename T, typename TArg>
+        ALPAKA_FN_HOST_ACC auto acos(T const& acos_ctx, TArg const& arg)
         {
             using ImplementationBase = concepts::ImplementationBase<ConceptMathAcos, T>;
-            return
-                traits::Acos<
-                    ImplementationBase,
-                    TArg>
-                ::acos(
-                    acos_ctx,
-                    arg);
+            return traits::Acos<ImplementationBase, TArg>::acos(acos_ctx, arg);
         }
-    }
-}
+    } // namespace math
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/asin/AsinCudaBuiltIn.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/asin/AsinCudaBuiltIn.hpp
deleted file mode 100644
index 751bdc2eeb..0000000000
--- a/thirdParty/cupla/alpaka/include/alpaka/math/asin/AsinCudaBuiltIn.hpp
+++ /dev/null
@@ -1,78 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Bert Wesarg
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_CUDA
-    #error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
-#endif
-
-#include <alpaka/math/asin/Traits.hpp>
-
-#include <alpaka/core/Unused.hpp>
-
-#include <cuda_runtime.h>
-
-#include <type_traits>
-
-namespace alpaka
-{
-    namespace math
-    {
-        //#############################################################################
-        //! The CUDA built in asin.
-        class AsinCudaBuiltIn : public concepts::Implements<ConceptMathAsin, AsinCudaBuiltIn>
-        {
-        };
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The CUDA asin trait specialization.
-            template<
-                typename TArg>
-            struct Asin<
-                AsinCudaBuiltIn,
-                TArg,
-                typename std::enable_if<
-                    std::is_floating_point<TArg>::value>::type>
-            {
-                __device__ static auto asin(
-                    AsinCudaBuiltIn const & asin_ctx,
-                    TArg const & arg)
-                -> decltype(::asin(arg))
-                {
-                    alpaka::ignore_unused(asin_ctx);
-                    return ::asin(arg);
-                }
-            };
-
-            template<>
-            struct Asin<
-                AsinCudaBuiltIn,
-                float>
-            {
-                __device__ static auto asin(
-                    AsinCudaBuiltIn const & asin_ctx,
-                    float const & arg)
-                -> float
-                {
-                    alpaka::ignore_unused(asin_ctx);
-                    return ::asinf(arg);
-                }
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/asin/AsinHipBuiltIn.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/asin/AsinHipBuiltIn.hpp
deleted file mode 100644
index 9111b4fd08..0000000000
--- a/thirdParty/cupla/alpaka/include/alpaka/math/asin/AsinHipBuiltIn.hpp
+++ /dev/null
@@ -1,86 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Bert Wesarg, Matthias Werner
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_HIP
-    #error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
-#endif
-
-#include <alpaka/math/asin/Traits.hpp>
-
-#include <alpaka/core/Unused.hpp>
-
-#if BOOST_COMP_NVCC >= BOOST_VERSION_NUMBER(9, 0, 0)
-    #include <cuda_runtime_api.h>
-#else
-    #if BOOST_COMP_HCC || BOOST_COMP_HIP
-        #include <math_functions.h>
-    #else
-        #include <math_functions.hpp>
-    #endif
-#endif
-
-#include <type_traits>
-
-namespace alpaka
-{
-    namespace math
-    {
-        //#############################################################################
-        //! The HIP asin.
-        class AsinHipBuiltIn : public concepts::Implements<ConceptMathAsin, AsinHipBuiltIn>
-        {
-        };
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The HIP asin trait specialization.
-            template<
-                typename TArg>
-            struct Asin<
-                AsinHipBuiltIn,
-                TArg,
-                typename std::enable_if<
-                    std::is_floating_point<TArg>::value>::type>
-            {
-                __device__ static auto asin(
-                    AsinHipBuiltIn const & asin_ctx,
-                    TArg const & arg)
-                -> decltype(::asin(arg))
-                {
-                    alpaka::ignore_unused(asin_ctx);
-                    return ::asin(arg);
-                }
-            };
-            //! The HIP asin float specialization.
-            template<>
-            struct Asin<
-                AsinHipBuiltIn,
-                float>
-            {
-                __device__ static auto asin(
-                    AsinHipBuiltIn const & asin_ctx,
-                    float const & arg)
-                -> float
-                {
-                    alpaka::ignore_unused(asin_ctx);
-                    return ::asinf(arg);
-                }
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/asin/AsinStdLib.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/asin/AsinStdLib.hpp
index 248f0d2bfa..af53099044 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/math/asin/AsinStdLib.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/math/asin/AsinStdLib.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Axel Huebl, Benjamin Worpitz
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -9,12 +9,11 @@
 
 #pragma once
 
-#include <alpaka/math/asin/Traits.hpp>
-
 #include <alpaka/core/Unused.hpp>
+#include <alpaka/math/asin/Traits.hpp>
 
-#include <type_traits>
 #include <cmath>
+#include <type_traits>
 
 namespace alpaka
 {
@@ -30,23 +29,15 @@ namespace alpaka
         {
             //#############################################################################
             //! The standard library asin trait specialization.
-            template<
-                typename TArg>
-            struct Asin<
-                AsinStdLib,
-                TArg,
-                typename std::enable_if<
-                    std::is_arithmetic<TArg>::value>::type>
+            template<typename TArg>
+            struct Asin<AsinStdLib, TArg, std::enable_if_t<std::is_arithmetic<TArg>::value>>
             {
-                ALPAKA_FN_HOST static auto asin(
-                    AsinStdLib const & asin_ctx,
-                    TArg const & arg)
-                -> decltype(std::asin(arg))
+                ALPAKA_FN_HOST static auto asin(AsinStdLib const& asin_ctx, TArg const& arg)
                 {
                     alpaka::ignore_unused(asin_ctx);
                     return std::asin(arg);
                 }
             };
-        }
-    }
-}
+        } // namespace traits
+    } // namespace math
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/asin/AsinUniformCudaHipBuiltIn.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/asin/AsinUniformCudaHipBuiltIn.hpp
new file mode 100644
index 0000000000..70aad5e33b
--- /dev/null
+++ b/thirdParty/cupla/alpaka/include/alpaka/math/asin/AsinUniformCudaHipBuiltIn.hpp
@@ -0,0 +1,82 @@
+/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Bert Wesarg
+ *
+ * This file is part of alpaka.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#pragma once
+
+#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
+
+#    include <alpaka/core/BoostPredef.hpp>
+
+#    if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
+#        include <cuda_runtime.h>
+#        if !BOOST_LANG_CUDA
+#            error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
+#        endif
+#    endif
+
+#    if defined(ALPAKA_ACC_GPU_HIP_ENABLED)
+
+#        if BOOST_COMP_NVCC >= BOOST_VERSION_NUMBER(9, 0, 0)
+#            include <cuda_runtime_api.h>
+#        else
+#            if BOOST_COMP_HIP
+#                include <hip/math_functions.h>
+#            else
+#                include <math_functions.hpp>
+#            endif
+#        endif
+
+#        if !BOOST_LANG_HIP
+#            error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
+#        endif
+#    endif
+
+#    include <alpaka/core/Unused.hpp>
+#    include <alpaka/math/asin/Traits.hpp>
+
+#    include <type_traits>
+
+namespace alpaka
+{
+    namespace math
+    {
+        //#############################################################################
+        //! The built-in asin implementation for the CUDA and HIP backends (an empty, stateless class).
+        class AsinUniformCudaHipBuiltIn : public concepts::Implements<ConceptMathAsin, AsinUniformCudaHipBuiltIn>
+        {
+        };
+
+        namespace traits
+        {
+            //#############################################################################
+            //! The CUDA/HIP asin trait specialization, enabled for floating-point argument types.
+            template<typename TArg>
+            struct Asin<AsinUniformCudaHipBuiltIn, TArg, std::enable_if_t<std::is_floating_point<TArg>::value>>
+            {
+                __device__ static auto asin(AsinUniformCudaHipBuiltIn const& asin_ctx, TArg const& arg)
+                {
+                    alpaka::ignore_unused(asin_ctx); // asin_ctx is otherwise unused here.
+                    return ::asin(arg); // The return type is deduced from this call.
+                }
+            };
+            //! The CUDA/HIP asin trait specialization for float; calls the single-precision ::asinf.
+            template<>
+            struct Asin<AsinUniformCudaHipBuiltIn, float>
+            {
+                __device__ static auto asin(AsinUniformCudaHipBuiltIn const& asin_ctx, float const& arg) -> float
+                {
+                    alpaka::ignore_unused(asin_ctx); // asin_ctx is otherwise unused here.
+                    return ::asinf(arg); // Single-precision variant; matches the float return type.
+                }
+            };
+        } // namespace traits
+    } // namespace math
+} // namespace alpaka
+
+#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/asin/Traits.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/asin/Traits.hpp
index 903b5da7fe..b5c975c2fe 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/math/asin/Traits.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/math/asin/Traits.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Benjamin Worpitz
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -12,58 +12,40 @@
 #include <alpaka/core/Common.hpp>
 #include <alpaka/core/Concepts.hpp>
 
-#include <boost/config.hpp>
-
 #include <type_traits>
 
 namespace alpaka
 {
     namespace math
     {
-        struct ConceptMathAsin;
+        struct ConceptMathAsin
+        {
+        };
 
         namespace traits
         {
             //#############################################################################
             //! The asin trait.
-            template<
-                typename T,
-                typename TArg,
-                typename TSfinae = void>
+            template<typename T, typename TArg, typename TSfinae = void>
             struct Asin;
-        }
+        } // namespace traits
 
         //-----------------------------------------------------------------------------
         //! Computes the principal value of the arc sine.
         //!
+        //! The valid real argument range is [-1.0, 1.0]. For other values
+        //! the result may depend on the backend and compilation options, but will
+        //! likely be NaN.
+        //!
         //! \tparam TArg The arg type.
         //! \param asin_ctx The object specializing Asin.
         //! \param arg The arg.
         ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            typename T,
-            typename TArg>
-        ALPAKA_FN_HOST_ACC auto asin(
-            T const & asin_ctx,
-            TArg const & arg)
-#ifdef BOOST_NO_CXX14_RETURN_TYPE_DEDUCTION
-        -> decltype(
-            traits::Asin<
-                concepts::ImplementationBase<ConceptMathAsin, T>,
-                TArg>
-            ::asin(
-                asin_ctx,
-                arg))
-#endif
+        template<typename T, typename TArg>
+        ALPAKA_FN_HOST_ACC auto asin(T const& asin_ctx, TArg const& arg)
         {
             using ImplementationBase = concepts::ImplementationBase<ConceptMathAsin, T>;
-            return
-                traits::Asin<
-                    ImplementationBase,
-                    TArg>
-                ::asin(
-                    asin_ctx,
-                    arg);
+            return traits::Asin<ImplementationBase, TArg>::asin(asin_ctx, arg);
         }
-    }
-}
+    } // namespace math
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/atan/AtanCudaBuiltIn.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/atan/AtanCudaBuiltIn.hpp
deleted file mode 100644
index 3d96c3711a..0000000000
--- a/thirdParty/cupla/alpaka/include/alpaka/math/atan/AtanCudaBuiltIn.hpp
+++ /dev/null
@@ -1,78 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Bert Wesarg
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_CUDA
-    #error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
-#endif
-
-#include <alpaka/math/atan/Traits.hpp>
-
-#include <alpaka/core/Unused.hpp>
-
-#include <cuda_runtime.h>
-
-#include <type_traits>
-
-namespace alpaka
-{
-    namespace math
-    {
-        //#############################################################################
-        //! The CUDA built in atan.
-        class AtanCudaBuiltIn : public concepts::Implements<ConceptMathAtan, AtanCudaBuiltIn>
-        {
-        };
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The CUDA atan trait specialization.
-            template<
-                typename TArg>
-            struct Atan<
-                AtanCudaBuiltIn,
-                TArg,
-                typename std::enable_if<
-                    std::is_floating_point<TArg>::value>::type>
-            {
-                __device__ static auto atan(
-                    AtanCudaBuiltIn const & atan_ctx,
-                    TArg const & arg)
-                -> decltype(::atan(arg))
-                {
-                    alpaka::ignore_unused(atan_ctx);
-                    return ::atan(arg);
-                }
-            };
-
-            template<>
-            struct Atan<
-                AtanCudaBuiltIn,
-                float>
-            {
-                __device__ static auto atan(
-                    AtanCudaBuiltIn const & atan_ctx,
-                    float const & arg)
-                -> float
-                {
-                    alpaka::ignore_unused(atan_ctx);
-                    return ::atanf(arg);
-                }
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/atan/AtanHipBuiltIn.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/atan/AtanHipBuiltIn.hpp
deleted file mode 100644
index bc792c38a0..0000000000
--- a/thirdParty/cupla/alpaka/include/alpaka/math/atan/AtanHipBuiltIn.hpp
+++ /dev/null
@@ -1,86 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Bert Wesarg, Matthias Werner
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_HIP
-    #error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
-#endif
-
-#include <alpaka/math/atan/Traits.hpp>
-
-#include <alpaka/core/Unused.hpp>
-
-#if BOOST_COMP_NVCC >= BOOST_VERSION_NUMBER(9, 0, 0)
-    #include <cuda_runtime_api.h>
-#else
-    #if BOOST_COMP_HCC || BOOST_COMP_HIP
-        #include <math_functions.h>
-    #else
-        #include <math_functions.hpp>
-    #endif
-#endif
-
-#include <type_traits>
-
-namespace alpaka
-{
-    namespace math
-    {
-        //#############################################################################
-        //! The HIP atan.
-        class AtanHipBuiltIn : public concepts::Implements<ConceptMathAtan, AtanHipBuiltIn>
-        {
-        };
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The HIP atan trait specialization.
-            template<
-                typename TArg>
-            struct Atan<
-                AtanHipBuiltIn,
-                TArg,
-                typename std::enable_if<
-                    std::is_floating_point<TArg>::value>::type>
-            {
-                __device__ static auto atan(
-                    AtanHipBuiltIn const & atan_ctx,
-                    TArg const & arg)
-                -> decltype(::atan(arg))
-                {
-                    alpaka::ignore_unused(atan_ctx);
-                    return ::atan(arg);
-                }
-            };
-            //! The HIP atan float specialization.
-            template<>
-            struct Atan<
-                AtanHipBuiltIn,
-                float>
-            {
-                __device__ static auto atan(
-                    AtanHipBuiltIn const & atan_ctx,
-                    float const & arg)
-                -> float
-                {
-                    alpaka::ignore_unused(atan_ctx);
-                    return ::atanf(arg);
-                }
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/atan/AtanStdLib.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/atan/AtanStdLib.hpp
index 91b95358f7..88c7d22af8 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/math/atan/AtanStdLib.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/math/atan/AtanStdLib.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Axel Huebl, Benjamin Worpitz
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -9,12 +9,11 @@
 
 #pragma once
 
-#include <alpaka/math/atan/Traits.hpp>
-
 #include <alpaka/core/Unused.hpp>
+#include <alpaka/math/atan/Traits.hpp>
 
-#include <type_traits>
 #include <cmath>
+#include <type_traits>
 
 namespace alpaka
 {
@@ -30,23 +29,15 @@ namespace alpaka
         {
             //#############################################################################
             //! The standard library atan trait specialization.
-            template<
-                typename TArg>
-            struct Atan<
-                AtanStdLib,
-                TArg,
-                typename std::enable_if<
-                    std::is_arithmetic<TArg>::value>::type>
+            template<typename TArg>
+            struct Atan<AtanStdLib, TArg, std::enable_if_t<std::is_arithmetic<TArg>::value>>
             {
-                ALPAKA_FN_HOST static auto atan(
-                    AtanStdLib const & atan_ctx,
-                    TArg const & arg)
-                -> decltype(std::atan(arg))
+                ALPAKA_FN_HOST static auto atan(AtanStdLib const& atan_ctx, TArg const& arg)
                 {
                     alpaka::ignore_unused(atan_ctx);
                     return std::atan(arg);
                 }
             };
-        }
-    }
-}
+        } // namespace traits
+    } // namespace math
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/atan/AtanUniformCudaHipBuiltIn.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/atan/AtanUniformCudaHipBuiltIn.hpp
new file mode 100644
index 0000000000..30894e4275
--- /dev/null
+++ b/thirdParty/cupla/alpaka/include/alpaka/math/atan/AtanUniformCudaHipBuiltIn.hpp
@@ -0,0 +1,82 @@
+/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Bert Wesarg
+ *
+ * This file is part of alpaka.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#pragma once
+
+#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
+
+#    include <alpaka/core/BoostPredef.hpp>
+
+#    if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
+#        include <cuda_runtime.h>
+#        if !BOOST_LANG_CUDA
+#            error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
+#        endif
+#    endif
+
+#    if defined(ALPAKA_ACC_GPU_HIP_ENABLED)
+
+#        if BOOST_COMP_NVCC >= BOOST_VERSION_NUMBER(9, 0, 0)
+#            include <cuda_runtime_api.h>
+#        else
+#            if BOOST_COMP_HIP
+#                include <hip/math_functions.h>
+#            else
+#                include <math_functions.hpp>
+#            endif
+#        endif
+
+#        if !BOOST_LANG_HIP
+#            error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
+#        endif
+#    endif
+
+#    include <alpaka/core/Unused.hpp>
+#    include <alpaka/math/atan/Traits.hpp>
+
+#    include <type_traits>
+
+namespace alpaka
+{
+    namespace math
+    {
+        //#############################################################################
+        //! The CUDA/HIP built-in atan.
+        class AtanUniformCudaHipBuiltIn : public concepts::Implements<ConceptMathAtan, AtanUniformCudaHipBuiltIn>
+        {
+        };
+
+        namespace traits
+        {
+            //#############################################################################
+            //! The CUDA/HIP atan trait specialization.
+            template<typename TArg>
+            struct Atan<AtanUniformCudaHipBuiltIn, TArg, std::enable_if_t<std::is_floating_point<TArg>::value>>
+            {
+                __device__ static auto atan(AtanUniformCudaHipBuiltIn const& atan_ctx, TArg const& arg)
+                {
+                    alpaka::ignore_unused(atan_ctx);
+                    return ::atan(arg);
+                }
+            };
+
+            template<>
+            struct Atan<AtanUniformCudaHipBuiltIn, float>
+            {
+                __device__ static auto atan(AtanUniformCudaHipBuiltIn const& atan_ctx, float const& arg) -> float
+                {
+                    alpaka::ignore_unused(atan_ctx);
+                    return ::atanf(arg);
+                }
+            };
+        } // namespace traits
+    } // namespace math
+} // namespace alpaka
+
+#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/atan/Traits.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/atan/Traits.hpp
index 2303c82dd1..2131ab3776 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/math/atan/Traits.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/math/atan/Traits.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Benjamin Worpitz
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -12,26 +12,23 @@
 #include <alpaka/core/Common.hpp>
 #include <alpaka/core/Concepts.hpp>
 
-#include <boost/config.hpp>
-
 #include <type_traits>
 
 namespace alpaka
 {
     namespace math
     {
-        struct ConceptMathAtan;
+        struct ConceptMathAtan
+        {
+        };
 
         namespace traits
         {
             //#############################################################################
             //! The atan trait.
-            template<
-                typename T,
-                typename TArg,
-                typename TSfinae = void>
+            template<typename T, typename TArg, typename TSfinae = void>
             struct Atan;
-        }
+        } // namespace traits
 
         //-----------------------------------------------------------------------------
         //! Computes the principal value of the arc tangent.
@@ -40,30 +37,11 @@ namespace alpaka
         //! \param atan_ctx The object specializing Atan.
         //! \param arg The arg.
         ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            typename T,
-            typename TArg>
-        ALPAKA_FN_HOST_ACC auto atan(
-            T const & atan_ctx,
-            TArg const & arg)
-#ifdef BOOST_NO_CXX14_RETURN_TYPE_DEDUCTION
-        -> decltype(
-            traits::Atan<
-                concepts::ImplementationBase<ConceptMathAtan, T>,
-                TArg>
-            ::atan(
-                atan_ctx,
-                arg))
-#endif
+        template<typename T, typename TArg>
+        ALPAKA_FN_HOST_ACC auto atan(T const& atan_ctx, TArg const& arg)
         {
             using ImplementationBase = concepts::ImplementationBase<ConceptMathAtan, T>;
-            return
-                traits::Atan<
-                    ImplementationBase,
-                    TArg>
-                ::atan(
-                    atan_ctx,
-                    arg);
+            return traits::Atan<ImplementationBase, TArg>::atan(atan_ctx, arg);
         }
-    }
-}
+    } // namespace math
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/atan2/Atan2CudaBuiltIn.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/atan2/Atan2CudaBuiltIn.hpp
deleted file mode 100644
index fb7cd0cc36..0000000000
--- a/thirdParty/cupla/alpaka/include/alpaka/math/atan2/Atan2CudaBuiltIn.hpp
+++ /dev/null
@@ -1,84 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Bert Wesarg
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_CUDA
-    #error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
-#endif
-
-#include <alpaka/math/atan2/Traits.hpp>
-
-#include <alpaka/core/Unused.hpp>
-
-#include <cuda_runtime.h>
-
-#include <type_traits>
-
-namespace alpaka
-{
-    namespace math
-    {
-        //#############################################################################
-        //! The CUDA built in atan2.
-        class Atan2CudaBuiltIn : public concepts::Implements<ConceptMathAtan2, Atan2CudaBuiltIn>
-        {
-        };
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The CUDA atan2 trait specialization.
-            template<
-                typename Ty,
-                typename Tx>
-            struct Atan2<
-                Atan2CudaBuiltIn,
-                Ty,
-                Tx,
-                typename std::enable_if<
-                    std::is_floating_point<Ty>::value
-                    && std::is_floating_point<Tx>::value>::type>
-            {
-                __device__ static auto atan2(
-                    Atan2CudaBuiltIn const & atan2_ctx,
-                    Ty const & y,
-                    Tx const & x)
-                -> decltype(::atan2(y, x))
-                {
-                    alpaka::ignore_unused(atan2_ctx);
-                    return ::atan2(y, x);
-                }
-            };
-
-            template<>
-            struct Atan2<
-                Atan2CudaBuiltIn,
-                float,
-                float>
-            {
-                __device__ static auto atan2(
-                    Atan2CudaBuiltIn const & atan2_ctx,
-                    float const & y,
-                    float const & x)
-                -> float
-                {
-                    alpaka::ignore_unused(atan2_ctx);
-                    return ::atan2f(y, x);
-                }
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/atan2/Atan2HipBuiltIn.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/atan2/Atan2HipBuiltIn.hpp
deleted file mode 100644
index 51f13b52e6..0000000000
--- a/thirdParty/cupla/alpaka/include/alpaka/math/atan2/Atan2HipBuiltIn.hpp
+++ /dev/null
@@ -1,92 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Bert Wesarg, Matthias Werner
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_HIP
-    #error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
-#endif
-
-#include <alpaka/math/atan2/Traits.hpp>
-
-#include <alpaka/core/Unused.hpp>
-
-#if BOOST_COMP_NVCC >= BOOST_VERSION_NUMBER(9, 0, 0)
-    #include <cuda_runtime_api.h>
-#else
-    #if BOOST_COMP_HCC || BOOST_COMP_HIP
-        #include <math_functions.h>
-    #else
-        #include <math_functions.hpp>
-    #endif
-#endif
-
-#include <type_traits>
-
-namespace alpaka
-{
-    namespace math
-    {
-        //#############################################################################
-        //! The HIP atan2.
-        class Atan2HipBuiltIn : public concepts::Implements<ConceptMathAtan2, Atan2HipBuiltIn>
-        {
-        };
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The HIP atan2 trait specialization.
-            template<
-                typename Ty,
-                typename Tx>
-            struct Atan2<
-                Atan2HipBuiltIn,
-                Ty,
-                Tx,
-                typename std::enable_if<
-                    std::is_floating_point<Ty>::value
-                    && std::is_floating_point<Tx>::value>::type>
-            {
-                __device__ static auto atan2(
-                    Atan2HipBuiltIn const & atan2_ctx,
-                    Ty const & y,
-                    Tx const & x)
-                -> decltype(::atan2(y, x))
-                {
-                    alpaka::ignore_unused(atan2_ctx);
-                    return ::atan2(y, x);
-                }
-            };
-            //! The HIP sin float specialization.
-            template<>
-            struct Atan2<
-                Atan2HipBuiltIn,
-                float,
-                float>
-            {
-                __device__ static auto atan2(
-                    Atan2HipBuiltIn const & atan2_ctx,
-                    float const & y,
-                    float const & x)
-                -> float
-                {
-                    alpaka::ignore_unused(atan2_ctx);
-                    return ::atan2f(y, x);
-                }
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/atan2/Atan2StdLib.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/atan2/Atan2StdLib.hpp
index f5810060b7..864e004804 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/math/atan2/Atan2StdLib.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/math/atan2/Atan2StdLib.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Axel Huebl, Benjamin Worpitz
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -9,12 +9,11 @@
 
 #pragma once
 
-#include <alpaka/math/atan2/Traits.hpp>
-
 #include <alpaka/core/Unused.hpp>
+#include <alpaka/math/atan2/Traits.hpp>
 
-#include <type_traits>
 #include <cmath>
+#include <type_traits>
 
 namespace alpaka
 {
@@ -30,27 +29,19 @@ namespace alpaka
         {
             //#############################################################################
             //! The standard library atan2 trait specialization.
-            template<
-                typename Ty,
-                typename Tx>
+            template<typename Ty, typename Tx>
             struct Atan2<
                 Atan2StdLib,
                 Ty,
                 Tx,
-                typename std::enable_if<
-                    std::is_arithmetic<Ty>::value
-                    && std::is_arithmetic<Tx>::value>::type>
+                std::enable_if_t<std::is_arithmetic<Ty>::value && std::is_arithmetic<Tx>::value>>
             {
-                ALPAKA_FN_HOST static auto atan2(
-                    Atan2StdLib const & abs,
-                    Ty const & y,
-                    Tx const & x)
-                -> decltype(std::atan2(y, x))
+                ALPAKA_FN_HOST static auto atan2(Atan2StdLib const& abs, Ty const& y, Tx const& x)
                 {
                     alpaka::ignore_unused(abs);
                     return std::atan2(y, x);
                 }
             };
-        }
-    }
-}
+        } // namespace traits
+    } // namespace math
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/atan2/Atan2UniformCudaHipBuiltIn.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/atan2/Atan2UniformCudaHipBuiltIn.hpp
new file mode 100644
index 0000000000..b5b9ce9653
--- /dev/null
+++ b/thirdParty/cupla/alpaka/include/alpaka/math/atan2/Atan2UniformCudaHipBuiltIn.hpp
@@ -0,0 +1,89 @@
+/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Bert Wesarg
+ *
+ * This file is part of alpaka.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#pragma once
+
+#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
+
+#    include <alpaka/core/BoostPredef.hpp>
+
+#    if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
+#        include <cuda_runtime.h>
+#        if !BOOST_LANG_CUDA
+#            error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
+#        endif
+#    endif
+
+#    if defined(ALPAKA_ACC_GPU_HIP_ENABLED)
+
+#        if BOOST_COMP_NVCC >= BOOST_VERSION_NUMBER(9, 0, 0)
+#            include <cuda_runtime_api.h>
+#        else
+#            if BOOST_COMP_HIP
+#                include <hip/math_functions.h>
+#            else
+#                include <math_functions.hpp>
+#            endif
+#        endif
+
+#        if !BOOST_LANG_HIP
+#            error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
+#        endif
+#    endif
+
+#    include <alpaka/core/Unused.hpp>
+#    include <alpaka/math/atan2/Traits.hpp>
+
+#    include <type_traits>
+
+namespace alpaka
+{
+    namespace math
+    {
+        //#############################################################################
+        //! The CUDA/HIP built-in atan2.
+        class Atan2UniformCudaHipBuiltIn : public concepts::Implements<ConceptMathAtan2, Atan2UniformCudaHipBuiltIn>
+        {
+        };
+
+        namespace traits
+        {
+            //#############################################################################
+            //! The CUDA/HIP atan2 trait specialization.
+            template<typename Ty, typename Tx>
+            struct Atan2<
+                Atan2UniformCudaHipBuiltIn,
+                Ty,
+                Tx,
+                std::enable_if_t<std::is_floating_point<Ty>::value && std::is_floating_point<Tx>::value>>
+            {
+                __device__ static auto atan2(Atan2UniformCudaHipBuiltIn const& atan2_ctx, Ty const& y, Tx const& x)
+                {
+                    alpaka::ignore_unused(atan2_ctx);
+                    return ::atan2(y, x);
+                }
+            };
+
+            template<>
+            struct Atan2<Atan2UniformCudaHipBuiltIn, float, float>
+            {
+                __device__ static auto atan2(
+                    Atan2UniformCudaHipBuiltIn const& atan2_ctx,
+                    float const& y,
+                    float const& x) -> float
+                {
+                    alpaka::ignore_unused(atan2_ctx);
+                    return ::atan2f(y, x);
+                }
+            };
+        } // namespace traits
+    } // namespace math
+} // namespace alpaka
+
+#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/atan2/Traits.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/atan2/Traits.hpp
index 32d4bda83d..aa3435b658 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/math/atan2/Traits.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/math/atan2/Traits.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Benjamin Worpitz
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -12,27 +12,23 @@
 #include <alpaka/core/Common.hpp>
 #include <alpaka/core/Concepts.hpp>
 
-#include <boost/config.hpp>
-
 #include <type_traits>
 
 namespace alpaka
 {
     namespace math
     {
-        struct ConceptMathAtan2;
+        struct ConceptMathAtan2
+        {
+        };
 
         namespace traits
         {
             //#############################################################################
             //! The atan2 trait.
-            template<
-                typename T,
-                typename Ty,
-                typename Tx,
-                typename TSfinae = void>
+            template<typename T, typename Ty, typename Tx, typename TSfinae = void>
             struct Atan2;
-        }
+        } // namespace traits
 
         //-----------------------------------------------------------------------------
         //! Computes the arc tangent of y/x using the signs of arguments to determine the correct quadrant.
@@ -44,36 +40,11 @@ namespace alpaka
         //! \param y The y arg.
         //! \param x The x arg.
         ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            typename T,
-            typename Ty,
-            typename Tx>
-        ALPAKA_FN_HOST_ACC auto atan2(
-            T const & atan2_ctx,
-            Ty const & y,
-            Tx const & x)
-#ifdef BOOST_NO_CXX14_RETURN_TYPE_DEDUCTION
-        -> decltype(
-            traits::Atan2<
-                concepts::ImplementationBase<ConceptMathAtan2, T>,
-                Ty,
-                Tx>
-            ::atan2(
-                atan2_ctx,
-                y,
-                x))
-#endif
+        template<typename T, typename Ty, typename Tx>
+        ALPAKA_FN_HOST_ACC auto atan2(T const& atan2_ctx, Ty const& y, Tx const& x)
         {
             using ImplementationBase = concepts::ImplementationBase<ConceptMathAtan2, T>;
-            return
-                traits::Atan2<
-                    ImplementationBase,
-                    Ty,
-                    Tx>
-                ::atan2(
-                    atan2_ctx,
-                    y,
-                    x);
+            return traits::Atan2<ImplementationBase, Ty, Tx>::atan2(atan2_ctx, y, x);
         }
-    }
-}
+    } // namespace math
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/cbrt/CbrtCudaBuiltIn.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/cbrt/CbrtCudaBuiltIn.hpp
deleted file mode 100644
index 48dd9525be..0000000000
--- a/thirdParty/cupla/alpaka/include/alpaka/math/cbrt/CbrtCudaBuiltIn.hpp
+++ /dev/null
@@ -1,78 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_CUDA
-    #error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
-#endif
-
-#include <alpaka/math/cbrt/Traits.hpp>
-
-#include <alpaka/core/Unused.hpp>
-
-#include <cuda_runtime.h>
-
-#include <type_traits>
-
-namespace alpaka
-{
-    namespace math
-    {
-        //#############################################################################
-        //! The CUDA built in cbrt.
-        class CbrtCudaBuiltIn : public concepts::Implements<ConceptMathCbrt, CbrtCudaBuiltIn>
-        {
-        };
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The CUDA cbrt trait specialization.
-            template<
-                typename TArg>
-            struct Cbrt<
-                CbrtCudaBuiltIn,
-                TArg,
-                typename std::enable_if<
-                    std::is_arithmetic<TArg>::value>::type>
-            {
-                __device__ static auto cbrt(
-                    CbrtCudaBuiltIn const & cbrt_ctx,
-                    TArg const & arg)
-                -> decltype(::cbrt(arg))
-                {
-                    alpaka::ignore_unused(cbrt_ctx);
-                    return ::cbrt(arg);
-                }
-            };
-
-            template<>
-            struct Cbrt<
-                CbrtCudaBuiltIn,
-                float>
-            {
-                __device__ static auto cbrt(
-                    CbrtCudaBuiltIn const & cbrt_ctx,
-                    float const & arg)
-                -> float
-                {
-                    alpaka::ignore_unused(cbrt_ctx);
-                    return ::cbrtf(arg);
-                }
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/cbrt/CbrtHipBuiltIn.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/cbrt/CbrtHipBuiltIn.hpp
deleted file mode 100644
index 2f86fe390b..0000000000
--- a/thirdParty/cupla/alpaka/include/alpaka/math/cbrt/CbrtHipBuiltIn.hpp
+++ /dev/null
@@ -1,86 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Matthias Werner
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_HIP
-    #error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
-#endif
-
-#include <alpaka/math/cbrt/Traits.hpp>
-
-#include <alpaka/core/Unused.hpp>
-
-#if BOOST_COMP_NVCC >= BOOST_VERSION_NUMBER(9, 0, 0)
-    #include <cuda_runtime_api.h>
-#else
-    #if BOOST_COMP_HCC || BOOST_COMP_HIP
-        #include <math_functions.h>
-    #else
-        #include <math_functions.hpp>
-    #endif
-#endif
-
-#include <type_traits>
-
-namespace alpaka
-{
-    namespace math
-    {
-        //#############################################################################
-        //! The HIP cbrt.
-        class CbrtHipBuiltIn : public concepts::Implements<ConceptMathCbrt, CbrtHipBuiltIn>
-        {
-        };
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The HIP cbrt trait specialization.
-            template<
-                typename TArg>
-            struct Cbrt<
-                CbrtHipBuiltIn,
-                TArg,
-                typename std::enable_if<
-                    std::is_arithmetic<TArg>::value>::type>
-            {
-                __device__ static auto cbrt(
-                    CbrtHipBuiltIn const & cbrt_ctx,
-                    TArg const & arg)
-                -> decltype(::cbrt(arg))
-                {
-                    alpaka::ignore_unused(cbrt_ctx);
-                    return ::cbrt(arg);
-                }
-            };
-            //! The HIP cbrt float specialization.
-            template<>
-            struct Cbrt<
-                CbrtHipBuiltIn,
-                float>
-            {
-                __device__ static auto cbrt(
-                    CbrtHipBuiltIn const & cbrt_ctx,
-                    float const & arg)
-                -> float
-                {
-                    alpaka::ignore_unused(cbrt_ctx);
-                    return ::cbrtf(arg);
-                }
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/cbrt/CbrtStdLib.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/cbrt/CbrtStdLib.hpp
index a2da289bea..723e769046 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/math/cbrt/CbrtStdLib.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/math/cbrt/CbrtStdLib.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Axel Huebl, Benjamin Worpitz
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -9,12 +9,11 @@
 
 #pragma once
 
-#include <alpaka/math/cbrt/Traits.hpp>
-
 #include <alpaka/core/Unused.hpp>
+#include <alpaka/math/cbrt/Traits.hpp>
 
-#include <type_traits>
 #include <cmath>
+#include <type_traits>
 
 namespace alpaka
 {
@@ -30,23 +29,15 @@ namespace alpaka
         {
             //#############################################################################
             //! The standard library cbrt trait specialization.
-            template<
-                typename TArg>
-            struct Cbrt<
-                CbrtStdLib,
-                TArg,
-                typename std::enable_if<
-                    std::is_arithmetic<TArg>::value>::type>
+            template<typename TArg>
+            struct Cbrt<CbrtStdLib, TArg, std::enable_if_t<std::is_arithmetic<TArg>::value>>
             {
-                ALPAKA_FN_HOST static auto cbrt(
-                    CbrtStdLib const & cbrt_ctx,
-                    TArg const & arg)
-                -> decltype(std::cbrt(arg))
+                ALPAKA_FN_HOST static auto cbrt(CbrtStdLib const& cbrt_ctx, TArg const& arg)
                 {
                     alpaka::ignore_unused(cbrt_ctx);
                     return std::cbrt(arg);
                 }
             };
-        }
-    }
-}
+        } // namespace traits
+    } // namespace math
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/cbrt/CbrtUniformCudaHipBuiltIn.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/cbrt/CbrtUniformCudaHipBuiltIn.hpp
new file mode 100644
index 0000000000..edf5e6ae51
--- /dev/null
+++ b/thirdParty/cupla/alpaka/include/alpaka/math/cbrt/CbrtUniformCudaHipBuiltIn.hpp
@@ -0,0 +1,82 @@
+/* Copyright 2019 Axel Huebl, Benjamin Worpitz
+ *
+ * This file is part of alpaka.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#pragma once
+
+#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
+
+#    include <alpaka/core/BoostPredef.hpp>
+
+#    if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
+#        include <cuda_runtime.h>
+#        if !BOOST_LANG_CUDA
+#            error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
+#        endif
+#    endif
+
+#    if defined(ALPAKA_ACC_GPU_HIP_ENABLED)
+
+#        if BOOST_COMP_NVCC >= BOOST_VERSION_NUMBER(9, 0, 0)
+#            include <cuda_runtime_api.h>
+#        else
+#            if BOOST_COMP_HIP
+#                include <hip/math_functions.h>
+#            else
+#                include <math_functions.hpp>
+#            endif
+#        endif
+
+#        if !BOOST_LANG_HIP
+#            error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
+#        endif
+#    endif
+
+#    include <alpaka/core/Unused.hpp>
+#    include <alpaka/math/cbrt/Traits.hpp>
+
+#    include <type_traits>
+
+namespace alpaka
+{
+    namespace math
+    {
+        //#############################################################################
+        //! The CUDA/HIP built-in cbrt.
+        class CbrtUniformCudaHipBuiltIn : public concepts::Implements<ConceptMathCbrt, CbrtUniformCudaHipBuiltIn>
+        {
+        };
+
+        namespace traits
+        {
+            //#############################################################################
+            //! The CUDA/HIP cbrt trait specialization.
+            template<typename TArg>
+            struct Cbrt<CbrtUniformCudaHipBuiltIn, TArg, std::enable_if_t<std::is_arithmetic<TArg>::value>>
+            {
+                __device__ static auto cbrt(CbrtUniformCudaHipBuiltIn const& cbrt_ctx, TArg const& arg)
+                {
+                    alpaka::ignore_unused(cbrt_ctx);
+                    return ::cbrt(arg);
+                }
+            };
+
+            template<>
+            struct Cbrt<CbrtUniformCudaHipBuiltIn, float>
+            {
+                __device__ static auto cbrt(CbrtUniformCudaHipBuiltIn const& cbrt_ctx, float const& arg) -> float
+                {
+                    alpaka::ignore_unused(cbrt_ctx);
+                    return ::cbrtf(arg);
+                }
+            };
+        } // namespace traits
+    } // namespace math
+} // namespace alpaka
+
+#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/cbrt/Traits.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/cbrt/Traits.hpp
index 92a6650ede..09fd2f55ba 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/math/cbrt/Traits.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/math/cbrt/Traits.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Benjamin Worpitz
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -12,26 +12,23 @@
 #include <alpaka/core/Common.hpp>
 #include <alpaka/core/Concepts.hpp>
 
-#include <boost/config.hpp>
-
 #include <type_traits>
 
 namespace alpaka
 {
     namespace math
     {
-        struct ConceptMathCbrt;
+        struct ConceptMathCbrt
+        {
+        };
 
         namespace traits
         {
             //#############################################################################
             //! The cbrt trait.
-            template<
-                typename T,
-                typename TArg,
-                typename TSfinae = void>
+            template<typename T, typename TArg, typename TSfinae = void>
             struct Cbrt;
-        }
+        } // namespace traits
 
         //-----------------------------------------------------------------------------
         //! Computes the cbrt.
@@ -41,30 +38,11 @@ namespace alpaka
         //! \param cbrt_ctx The object specializing Cbrt.
         //! \param arg The arg.
         ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            typename T,
-            typename TArg>
-        ALPAKA_FN_HOST_ACC auto cbrt(
-            T const & cbrt_ctx,
-            TArg const & arg)
-#ifdef BOOST_NO_CXX14_RETURN_TYPE_DEDUCTION
-        -> decltype(
-            traits::Cbrt<
-                concepts::ImplementationBase<ConceptMathCbrt, T>,
-                TArg>
-            ::cbrt(
-                cbrt_ctx,
-                arg))
-#endif
+        template<typename T, typename TArg>
+        ALPAKA_FN_HOST_ACC auto cbrt(T const& cbrt_ctx, TArg const& arg)
         {
             using ImplementationBase = concepts::ImplementationBase<ConceptMathCbrt, T>;
-            return
-                traits::Cbrt<
-                    ImplementationBase,
-                    TArg>
-                ::cbrt(
-                    cbrt_ctx,
-                    arg);
+            return traits::Cbrt<ImplementationBase, TArg>::cbrt(cbrt_ctx, arg);
         }
-    }
-}
+    } // namespace math
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/ceil/CeilCudaBuiltIn.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/ceil/CeilCudaBuiltIn.hpp
deleted file mode 100644
index e21b5ff0b5..0000000000
--- a/thirdParty/cupla/alpaka/include/alpaka/math/ceil/CeilCudaBuiltIn.hpp
+++ /dev/null
@@ -1,78 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Bert Wesarg
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_CUDA
-    #error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
-#endif
-
-#include <alpaka/math/ceil/Traits.hpp>
-
-#include <alpaka/core/Unused.hpp>
-
-#include <cuda_runtime.h>
-
-#include <type_traits>
-
-namespace alpaka
-{
-    namespace math
-    {
-        //#############################################################################
-        //! The CUDA built in ceil.
-        class CeilCudaBuiltIn : public concepts::Implements<ConceptMathCeil, CeilCudaBuiltIn>
-        {
-        };
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The CUDA ceil trait specialization.
-            template<
-                typename TArg>
-            struct Ceil<
-                CeilCudaBuiltIn,
-                TArg,
-                typename std::enable_if<
-                    std::is_floating_point<TArg>::value>::type>
-            {
-                __device__ static auto ceil(
-                    CeilCudaBuiltIn const & ceil_ctx,
-                    TArg const & arg)
-                -> decltype(::ceil(arg))
-                {
-                    alpaka::ignore_unused(ceil_ctx);
-                    return ::ceil(arg);
-                }
-            };
-            //
-            template<>
-            struct Ceil<
-                CeilCudaBuiltIn,
-                float>
-            {
-                __device__ static auto ceil(
-                    CeilCudaBuiltIn const & ceil_ctx,
-                    float const & arg)
-                ->float
-                {
-                    alpaka::ignore_unused(ceil_ctx);
-                    return ::ceilf(arg);
-                }
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/ceil/CeilHipBuiltIn.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/ceil/CeilHipBuiltIn.hpp
deleted file mode 100644
index cd65ab6db5..0000000000
--- a/thirdParty/cupla/alpaka/include/alpaka/math/ceil/CeilHipBuiltIn.hpp
+++ /dev/null
@@ -1,86 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Bert Wesarg, Matthias Werner
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_HIP
-    #error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
-#endif
-
-#include <alpaka/math/ceil/Traits.hpp>
-
-#include <alpaka/core/Unused.hpp>
-
-#if BOOST_COMP_NVCC >= BOOST_VERSION_NUMBER(9, 0, 0)
-    #include <cuda_runtime_api.h>
-#else
-    #if BOOST_COMP_HCC || BOOST_COMP_HIP
-        #include <math_functions.h>
-    #else
-        #include <math_functions.hpp>
-    #endif
-#endif
-
-#include <type_traits>
-
-namespace alpaka
-{
-    namespace math
-    {
-        //#############################################################################
-        //! The HIP ceil.
-        class CeilHipBuiltIn : public concepts::Implements<ConceptMathCeil, CeilHipBuiltIn>
-        {
-        };
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The HIP ceil trait specialization.
-            template<
-                typename TArg>
-            struct Ceil<
-                CeilHipBuiltIn,
-                TArg,
-                typename std::enable_if<
-                    std::is_floating_point<TArg>::value>::type>
-            {
-                __device__ static auto ceil(
-                    CeilHipBuiltIn const & ceil_ctx,
-                    TArg const & arg)
-                -> decltype(::ceil(arg))
-                {
-                    alpaka::ignore_unused(ceil_ctx);
-                    return ::ceil(arg);
-                }
-            };
-            //! The HIP cos float specialization.
-            template<>
-            struct Ceil<
-                CeilHipBuiltIn,
-                float>
-            {
-                __device__ static auto ceil(
-                    CeilHipBuiltIn const & ceil_ctx,
-                    float const & arg)
-                -> float
-                {
-                    alpaka::ignore_unused(ceil_ctx);
-                    return ::ceilf(arg);
-                }
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/ceil/CeilStdLib.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/ceil/CeilStdLib.hpp
index f1895c40b0..faeff99442 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/math/ceil/CeilStdLib.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/math/ceil/CeilStdLib.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Axel Huebl, Benjamin Worpitz
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -9,12 +9,11 @@
 
 #pragma once
 
-#include <alpaka/math/ceil/Traits.hpp>
-
 #include <alpaka/core/Unused.hpp>
+#include <alpaka/math/ceil/Traits.hpp>
 
-#include <type_traits>
 #include <cmath>
+#include <type_traits>
 
 namespace alpaka
 {
@@ -30,23 +29,15 @@ namespace alpaka
         {
             //#############################################################################
             //! The standard library ceil trait specialization.
-            template<
-                typename TArg>
-            struct Ceil<
-                CeilStdLib,
-                TArg,
-                typename std::enable_if<
-                    std::is_arithmetic<TArg>::value>::type>
+            template<typename TArg>
+            struct Ceil<CeilStdLib, TArg, std::enable_if_t<std::is_arithmetic<TArg>::value>>
             {
-                ALPAKA_FN_HOST static auto ceil(
-                    CeilStdLib const & ceil_ctx,
-                    TArg const & arg)
-                -> decltype(std::ceil(arg))
+                ALPAKA_FN_HOST static auto ceil(CeilStdLib const& ceil_ctx, TArg const& arg)
                 {
                     alpaka::ignore_unused(ceil_ctx);
                     return std::ceil(arg);
                 }
             };
-        }
-    }
-}
+        } // namespace traits
+    } // namespace math
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/ceil/CeilUniformCudaHipBuiltIn.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/ceil/CeilUniformCudaHipBuiltIn.hpp
new file mode 100644
index 0000000000..6e88ae5b21
--- /dev/null
+++ b/thirdParty/cupla/alpaka/include/alpaka/math/ceil/CeilUniformCudaHipBuiltIn.hpp
@@ -0,0 +1,82 @@
+/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Bert Wesarg
+ *
+ * This file is part of alpaka.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#pragma once
+
+#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
+
+#    include <alpaka/core/BoostPredef.hpp>
+
+#    if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
+#        include <cuda_runtime.h>
+#        if !BOOST_LANG_CUDA
+#            error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
+#        endif
+#    endif
+
+#    if defined(ALPAKA_ACC_GPU_HIP_ENABLED)
+
+#        if BOOST_COMP_NVCC >= BOOST_VERSION_NUMBER(9, 0, 0)
+#            include <cuda_runtime_api.h>
+#        else
+#            if BOOST_COMP_HIP
+#                include <hip/math_functions.h>
+#            else
+#                include <math_functions.hpp>
+#            endif
+#        endif
+
+#        if !BOOST_LANG_HIP
+#            error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
+#        endif
+#    endif
+
+#    include <alpaka/core/Unused.hpp>
+#    include <alpaka/math/ceil/Traits.hpp>
+
+#    include <type_traits>
+
+namespace alpaka
+{
+    namespace math
+    {
+        //#############################################################################
+        //! The CUDA/HIP built-in ceil: an empty tag class implementing ConceptMathCeil.
+        class CeilUniformCudaHipBuiltIn : public concepts::Implements<ConceptMathCeil, CeilUniformCudaHipBuiltIn>
+        {
+        };
+
+        namespace traits
+        {
+            //#############################################################################
+            //! The CUDA/HIP ceil trait specialization for floating-point types (forwards to ::ceil).
+            template<typename TArg>
+            struct Ceil<CeilUniformCudaHipBuiltIn, TArg, std::enable_if_t<std::is_floating_point<TArg>::value>>
+            {
+                __device__ static auto ceil(CeilUniformCudaHipBuiltIn const& ceil_ctx, TArg const& arg)
+                {
+                    alpaka::ignore_unused(ceil_ctx);
+                    return ::ceil(arg);
+                }
+            };
+            //! The CUDA/HIP ceil float specialization (forwards to ::ceilf and returns float).
+            template<>
+            struct Ceil<CeilUniformCudaHipBuiltIn, float>
+            {
+                __device__ static auto ceil(CeilUniformCudaHipBuiltIn const& ceil_ctx, float const& arg) -> float
+                {
+                    alpaka::ignore_unused(ceil_ctx);
+                    return ::ceilf(arg);
+                }
+            };
+        } // namespace traits
+    } // namespace math
+} // namespace alpaka
+
+#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/ceil/Traits.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/ceil/Traits.hpp
index c9de6a6512..bc6111a205 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/math/ceil/Traits.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/math/ceil/Traits.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Benjamin Worpitz
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -12,26 +12,23 @@
 #include <alpaka/core/Common.hpp>
 #include <alpaka/core/Concepts.hpp>
 
-#include <boost/config.hpp>
-
 #include <type_traits>
 
 namespace alpaka
 {
     namespace math
     {
-        struct ConceptMathCeil;
+        struct ConceptMathCeil
+        {
+        };
 
         namespace traits
         {
             //#############################################################################
             //! The ceil trait.
-            template<
-                typename T,
-                typename TArg,
-                typename TSfinae = void>
+            template<typename T, typename TArg, typename TSfinae = void>
             struct Ceil;
-        }
+        } // namespace traits
 
         //-----------------------------------------------------------------------------
         //! Computes the smallest integer value not less than arg.
@@ -41,30 +38,11 @@ namespace alpaka
         //! \param ceil_ctx The object specializing Ceil.
         //! \param arg The arg.
         ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            typename T,
-            typename TArg>
-        ALPAKA_FN_HOST_ACC auto ceil(
-            T const & ceil_ctx,
-            TArg const & arg)
-#ifdef BOOST_NO_CXX14_RETURN_TYPE_DEDUCTION
-        -> decltype(
-            traits::Ceil<
-                concepts::ImplementationBase<ConceptMathCeil, T>,
-                TArg>
-            ::ceil(
-                ceil_ctx,
-                arg))
-#endif
+        template<typename T, typename TArg>
+        ALPAKA_FN_HOST_ACC auto ceil(T const& ceil_ctx, TArg const& arg)
         {
             using ImplementationBase = concepts::ImplementationBase<ConceptMathCeil, T>;
-            return
-                traits::Ceil<
-                    ImplementationBase,
-                    TArg>
-                ::ceil(
-                    ceil_ctx,
-                    arg);
+            return traits::Ceil<ImplementationBase, TArg>::ceil(ceil_ctx, arg);
         }
-    }
-}
+    } // namespace math
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/cos/CosCudaBuiltIn.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/cos/CosCudaBuiltIn.hpp
deleted file mode 100644
index 701f795aba..0000000000
--- a/thirdParty/cupla/alpaka/include/alpaka/math/cos/CosCudaBuiltIn.hpp
+++ /dev/null
@@ -1,78 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Bert Wesarg
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_CUDA
-    #error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
-#endif
-
-#include <alpaka/math/cos/Traits.hpp>
-
-#include <alpaka/core/Unused.hpp>
-
-#include <cuda_runtime.h>
-
-#include <type_traits>
-
-namespace alpaka
-{
-    namespace math
-    {
-        //#############################################################################
-        //! The CUDA built in cos.
-        class CosCudaBuiltIn : public concepts::Implements<ConceptMathCos, CosCudaBuiltIn>
-        {
-        };
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The CUDA cos trait specialization.
-            template<
-                typename TArg>
-            struct Cos<
-                CosCudaBuiltIn,
-                TArg,
-                typename std::enable_if<
-                    std::is_floating_point<TArg>::value>::type>
-            {
-                __device__ static auto cos(
-                    CosCudaBuiltIn const & cos_ctx,
-                    TArg const & arg)
-                -> decltype(::cos(arg))
-                {
-                    alpaka::ignore_unused(cos_ctx);
-                    return ::cos(arg);
-                }
-            };
-
-            template<>
-            struct Cos<
-                CosCudaBuiltIn,
-                float>
-            {
-                __device__ static auto cos(
-                    CosCudaBuiltIn const & cos_ctx,
-                    float const & arg)
-                -> float
-                {
-                    alpaka::ignore_unused(cos_ctx);
-                    return ::cosf(arg);
-                }
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/cos/CosHipBuiltIn.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/cos/CosHipBuiltIn.hpp
deleted file mode 100644
index 4e8ab79100..0000000000
--- a/thirdParty/cupla/alpaka/include/alpaka/math/cos/CosHipBuiltIn.hpp
+++ /dev/null
@@ -1,86 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Bert Wesarg, Matthias Werner
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_HIP
-    #error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
-#endif
-
-#include <alpaka/math/cos/Traits.hpp>
-
-#include <alpaka/core/Unused.hpp>
-
-#if BOOST_COMP_NVCC >= BOOST_VERSION_NUMBER(9, 0, 0)
-    #include <cuda_runtime_api.h>
-#else
-    #if BOOST_COMP_HCC || BOOST_COMP_HIP
-        #include <math_functions.h>
-    #else
-        #include <math_functions.hpp>
-    #endif
-#endif
-
-#include <type_traits>
-
-namespace alpaka
-{
-    namespace math
-    {
-        //#############################################################################
-        //! The HIP cos.
-        class CosHipBuiltIn : public concepts::Implements<ConceptMathCos, CosHipBuiltIn>
-        {
-        };
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The HIP cos trait specialization.
-            template<
-                typename TArg>
-            struct Cos<
-                CosHipBuiltIn,
-                TArg,
-                typename std::enable_if<
-                    std::is_floating_point<TArg>::value>::type>
-            {
-                __device__ static auto cos(
-                    CosHipBuiltIn const & cos_ctx,
-                    TArg const & arg)
-                -> decltype(::cos(arg))
-                {
-                    alpaka::ignore_unused(cos_ctx);
-                    return ::cos(arg);
-                }
-            };
-            //! The HIP cos float specialization.
-            template<>
-            struct Cos<
-                CosHipBuiltIn,
-                float>
-            {
-                __device__ static auto cos(
-                    CosHipBuiltIn const & cos_ctx,
-                    float const & arg)
-                -> float
-                {
-                    alpaka::ignore_unused(cos_ctx);
-                    return ::cosf(arg);
-                }
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/cos/CosStdLib.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/cos/CosStdLib.hpp
index 6acff5d064..50808212c6 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/math/cos/CosStdLib.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/math/cos/CosStdLib.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Axel Huebl, Benjamin Worpitz
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -9,12 +9,11 @@
 
 #pragma once
 
-#include <alpaka/math/cos/Traits.hpp>
-
 #include <alpaka/core/Unused.hpp>
+#include <alpaka/math/cos/Traits.hpp>
 
-#include <type_traits>
 #include <cmath>
+#include <type_traits>
 
 namespace alpaka
 {
@@ -30,23 +29,15 @@ namespace alpaka
         {
             //#############################################################################
             //! The standard library cos trait specialization.
-            template<
-                typename TArg>
-            struct Cos<
-                CosStdLib,
-                TArg,
-                typename std::enable_if<
-                    std::is_arithmetic<TArg>::value>::type>
+            template<typename TArg>
+            struct Cos<CosStdLib, TArg, std::enable_if_t<std::is_arithmetic<TArg>::value>>
             {
-                ALPAKA_FN_HOST static auto cos(
-                    CosStdLib const & cos_ctx,
-                    TArg const & arg)
-                -> decltype(std::cos(arg))
+                ALPAKA_FN_HOST static auto cos(CosStdLib const& cos_ctx, TArg const& arg)
                 {
                     alpaka::ignore_unused(cos_ctx);
                     return std::cos(arg);
                 }
             };
-        }
-    }
-}
+        } // namespace traits
+    } // namespace math
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/cos/CosUniformCudaHipBuiltIn.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/cos/CosUniformCudaHipBuiltIn.hpp
new file mode 100644
index 0000000000..c22a43e4e9
--- /dev/null
+++ b/thirdParty/cupla/alpaka/include/alpaka/math/cos/CosUniformCudaHipBuiltIn.hpp
@@ -0,0 +1,82 @@
+/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Bert Wesarg
+ *
+ * This file is part of alpaka.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#pragma once
+
+#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
+
+#    include <alpaka/core/BoostPredef.hpp>
+
+#    if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
+#        include <cuda_runtime.h>
+#        if !BOOST_LANG_CUDA
+#            error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
+#        endif
+#    endif
+
+#    if defined(ALPAKA_ACC_GPU_HIP_ENABLED)
+
+#        if BOOST_COMP_NVCC >= BOOST_VERSION_NUMBER(9, 0, 0)
+#            include <cuda_runtime_api.h>
+#        else
+#            if BOOST_COMP_HIP
+#                include <hip/math_functions.h>
+#            else
+#                include <math_functions.hpp>
+#            endif
+#        endif
+
+#        if !BOOST_LANG_HIP
+#            error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
+#        endif
+#    endif
+
+#    include <alpaka/core/Unused.hpp>
+#    include <alpaka/math/cos/Traits.hpp>
+
+#    include <type_traits>
+
+namespace alpaka
+{
+    namespace math
+    {
+        //#############################################################################
+        //! The CUDA/HIP built-in cos: an empty tag class implementing ConceptMathCos.
+        class CosUniformCudaHipBuiltIn : public concepts::Implements<ConceptMathCos, CosUniformCudaHipBuiltIn>
+        {
+        };
+
+        namespace traits
+        {
+            //#############################################################################
+            //! The CUDA/HIP cos trait specialization for floating-point types (forwards to ::cos).
+            template<typename TArg>
+            struct Cos<CosUniformCudaHipBuiltIn, TArg, std::enable_if_t<std::is_floating_point<TArg>::value>>
+            {
+                __device__ static auto cos(CosUniformCudaHipBuiltIn const& cos_ctx, TArg const& arg)
+                {
+                    alpaka::ignore_unused(cos_ctx);
+                    return ::cos(arg);
+                }
+            };
+            //! The CUDA/HIP cos float specialization (forwards to ::cosf and returns float).
+            template<>
+            struct Cos<CosUniformCudaHipBuiltIn, float>
+            {
+                __device__ static auto cos(CosUniformCudaHipBuiltIn const& cos_ctx, float const& arg) -> float
+                {
+                    alpaka::ignore_unused(cos_ctx);
+                    return ::cosf(arg);
+                }
+            };
+        } // namespace traits
+    } // namespace math
+} // namespace alpaka
+
+#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/cos/Traits.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/cos/Traits.hpp
index 59302bf6c0..21a228bbc8 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/math/cos/Traits.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/math/cos/Traits.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Benjamin Worpitz
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -12,26 +12,23 @@
 #include <alpaka/core/Common.hpp>
 #include <alpaka/core/Concepts.hpp>
 
-#include <boost/config.hpp>
-
 #include <type_traits>
 
 namespace alpaka
 {
     namespace math
     {
-        struct ConceptMathCos;
+        struct ConceptMathCos
+        {
+        };
 
         namespace traits
         {
             //#############################################################################
             //! The cos trait.
-            template<
-                typename T,
-                typename TArg,
-                typename TSfinae = void>
+            template<typename T, typename TArg, typename TSfinae = void>
             struct Cos;
-        }
+        } // namespace traits
 
         //-----------------------------------------------------------------------------
         //! Computes the cosine (measured in radians).
@@ -41,30 +38,11 @@ namespace alpaka
         //! \param cos_ctx The object specializing Cos.
         //! \param arg The arg.
         ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            typename T,
-            typename TArg>
-        ALPAKA_FN_HOST_ACC auto cos(
-            T const & cos_ctx,
-            TArg const & arg)
-#ifdef BOOST_NO_CXX14_RETURN_TYPE_DEDUCTION
-        -> decltype(
-            traits::Cos<
-                concepts::ImplementationBase<ConceptMathCos, T>,
-                TArg>
-            ::cos(
-                cos_ctx,
-                arg))
-#endif
+        template<typename T, typename TArg>
+        ALPAKA_FN_HOST_ACC auto cos(T const& cos_ctx, TArg const& arg)
         {
             using ImplementationBase = concepts::ImplementationBase<ConceptMathCos, T>;
-            return
-                traits::Cos<
-                    ImplementationBase,
-                    TArg>
-                ::cos(
-                    cos_ctx,
-                    arg);
+            return traits::Cos<ImplementationBase, TArg>::cos(cos_ctx, arg);
         }
-    }
-}
+    } // namespace math
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/erf/ErfCudaBuiltIn.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/erf/ErfCudaBuiltIn.hpp
deleted file mode 100644
index 28dab47b3b..0000000000
--- a/thirdParty/cupla/alpaka/include/alpaka/math/erf/ErfCudaBuiltIn.hpp
+++ /dev/null
@@ -1,78 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Bert Wesarg
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_CUDA
-    #error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
-#endif
-
-#include <alpaka/math/erf/Traits.hpp>
-
-#include <alpaka/core/Unused.hpp>
-
-#include <cuda_runtime.h>
-
-#include <type_traits>
-
-namespace alpaka
-{
-    namespace math
-    {
-        //#############################################################################
-        //! The CUDA built in erf.
-        class ErfCudaBuiltIn : public concepts::Implements<ConceptMathErf, ErfCudaBuiltIn>
-        {
-        };
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The CUDA erf trait specialization.
-            template<
-                typename TArg>
-            struct Erf<
-                ErfCudaBuiltIn,
-                TArg,
-                typename std::enable_if<
-                    std::is_floating_point<TArg>::value>::type>
-            {
-                __device__ static auto erf(
-                    ErfCudaBuiltIn const & erf_ctx,
-                    TArg const & arg)
-                -> decltype(::erf(arg))
-                {
-                    alpaka::ignore_unused(erf_ctx);
-                    return ::erf(arg);
-                }
-            };
-
-            template<>
-            struct Erf<
-                ErfCudaBuiltIn,
-                float>
-            {
-                __device__ static auto erf(
-                    ErfCudaBuiltIn const & erf_ctx,
-                    float const & arg)
-                -> float
-                {
-                    alpaka::ignore_unused(erf_ctx);
-                    return ::erff(arg);
-                }
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/erf/ErfHipBuiltIn.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/erf/ErfHipBuiltIn.hpp
deleted file mode 100644
index e6fa0d529b..0000000000
--- a/thirdParty/cupla/alpaka/include/alpaka/math/erf/ErfHipBuiltIn.hpp
+++ /dev/null
@@ -1,86 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Bert Wesarg, Matthias Werner
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_HIP
-    #error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
-#endif
-
-#include <alpaka/math/erf/Traits.hpp>
-
-#include <alpaka/core/Unused.hpp>
-
-#if BOOST_COMP_NVCC >= BOOST_VERSION_NUMBER(9, 0, 0)
-    #include <cuda_runtime_api.h>
-#else
-    #if BOOST_COMP_HCC || BOOST_COMP_HIP
-        #include <math_functions.h>
-    #else
-        #include <math_functions.hpp>
-    #endif
-#endif
-
-#include <type_traits>
-
-namespace alpaka
-{
-    namespace math
-    {
-        //#############################################################################
-        //! The HIP erf.
-        class ErfHipBuiltIn : public concepts::Implements<ConceptMathErf, ErfHipBuiltIn>
-        {
-        };
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The HIP erf trait specialization.
-            template<
-                typename TArg>
-            struct Erf<
-                ErfHipBuiltIn,
-                TArg,
-                typename std::enable_if<
-                    std::is_floating_point<TArg>::value>::type>
-            {
-                __device__ static auto erf(
-                    ErfHipBuiltIn const & erf_ctx,
-                    TArg const & arg)
-                -> decltype(::erf(arg))
-                {
-                    alpaka::ignore_unused(erf_ctx);
-                    return ::erf(arg);
-                }
-            };
-            //! The HIP erf float specialization.
-            template<>
-            struct Erf<
-                ErfHipBuiltIn,
-                float>
-            {
-                __device__ static auto erf(
-                    ErfHipBuiltIn const & erf_ctx,
-                    float const & arg)
-                -> float
-                {
-                    alpaka::ignore_unused(erf_ctx);
-                    return ::erff(arg);
-                }
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/erf/ErfStdLib.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/erf/ErfStdLib.hpp
index 6028cceb9a..f6493c3fe3 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/math/erf/ErfStdLib.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/math/erf/ErfStdLib.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Axel Huebl, Benjamin Worpitz
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -9,12 +9,11 @@
 
 #pragma once
 
-#include <alpaka/math/erf/Traits.hpp>
-
 #include <alpaka/core/Unused.hpp>
+#include <alpaka/math/erf/Traits.hpp>
 
-#include <type_traits>
 #include <cmath>
+#include <type_traits>
 
 namespace alpaka
 {
@@ -30,23 +29,15 @@ namespace alpaka
         {
             //#############################################################################
             //! The standard library erf trait specialization.
-            template<
-                typename TArg>
-            struct Erf<
-                ErfStdLib,
-                TArg,
-                typename std::enable_if<
-                    std::is_arithmetic<TArg>::value>::type>
+            template<typename TArg>
+            struct Erf<ErfStdLib, TArg, std::enable_if_t<std::is_arithmetic<TArg>::value>>
             {
-                ALPAKA_FN_HOST static auto erf(
-                    ErfStdLib const & erf_ctx,
-                    TArg const & arg)
-                -> decltype(std::erf(arg))
+                ALPAKA_FN_HOST static auto erf(ErfStdLib const& erf_ctx, TArg const& arg)
                 {
                     alpaka::ignore_unused(erf_ctx);
                     return std::erf(arg);
                 }
             };
-        }
-    }
-}
+        } // namespace traits
+    } // namespace math
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/erf/ErfUniformCudaHipBuiltIn.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/erf/ErfUniformCudaHipBuiltIn.hpp
new file mode 100644
index 0000000000..38c20ad20e
--- /dev/null
+++ b/thirdParty/cupla/alpaka/include/alpaka/math/erf/ErfUniformCudaHipBuiltIn.hpp
@@ -0,0 +1,82 @@
+/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Bert Wesarg
+ *
+ * This file is part of alpaka.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#pragma once
+
+#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
+
+#    include <alpaka/core/BoostPredef.hpp>
+
+#    if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
+#        include <cuda_runtime.h>
+#        if !BOOST_LANG_CUDA
+#            error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
+#        endif
+#    endif
+
+#    if defined(ALPAKA_ACC_GPU_HIP_ENABLED)
+
+#        if BOOST_COMP_NVCC >= BOOST_VERSION_NUMBER(9, 0, 0)
+#            include <cuda_runtime_api.h>
+#        else
+#            if BOOST_COMP_HIP
+#                include <hip/math_functions.h>
+#            else
+#                include <math_functions.hpp>
+#            endif
+#        endif
+
+#        if !BOOST_LANG_HIP
+#            error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
+#        endif
+#    endif
+
+#    include <alpaka/core/Unused.hpp>
+#    include <alpaka/math/erf/Traits.hpp>
+
+#    include <type_traits>
+
+namespace alpaka
+{
+    namespace math
+    {
+        //#############################################################################
+        //! The CUDA/HIP built in erf (one implementation tag shared by both GPU back-ends).
+        class ErfUniformCudaHipBuiltIn : public concepts::Implements<ConceptMathErf, ErfUniformCudaHipBuiltIn>
+        {
+        };
+
+        namespace traits
+        {
+            //#############################################################################
+            //! The CUDA/HIP erf trait specialization for floating point arguments (device-only, calls ::erf).
+            template<typename TArg>
+            struct Erf<ErfUniformCudaHipBuiltIn, TArg, std::enable_if_t<std::is_floating_point<TArg>::value>>
+            {
+                __device__ static auto erf(ErfUniformCudaHipBuiltIn const& erf_ctx, TArg const& arg)
+                {
+                    alpaka::ignore_unused(erf_ctx);
+                    return ::erf(arg);
+                }
+            };
+            //! The CUDA/HIP erf float specialization (device-only, calls the single precision ::erff).
+            template<>
+            struct Erf<ErfUniformCudaHipBuiltIn, float>
+            {
+                __device__ static auto erf(ErfUniformCudaHipBuiltIn const& erf_ctx, float const& arg) -> float
+                {
+                    alpaka::ignore_unused(erf_ctx);
+                    return ::erff(arg);
+                }
+            };
+        } // namespace traits
+    } // namespace math
+} // namespace alpaka
+
+#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/erf/Traits.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/erf/Traits.hpp
index 188d907688..9fd103494c 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/math/erf/Traits.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/math/erf/Traits.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Benjamin Worpitz
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -12,26 +12,23 @@
 #include <alpaka/core/Common.hpp>
 #include <alpaka/core/Concepts.hpp>
 
-#include <boost/config.hpp>
-
 #include <type_traits>
 
 namespace alpaka
 {
     namespace math
     {
-        struct ConceptMathErf;
+        struct ConceptMathErf
+        {
+        };
 
         namespace traits
         {
             //#############################################################################
             //! The erf trait.
-            template<
-                typename T,
-                typename TArg,
-                typename TSfinae = void>
+            template<typename T, typename TArg, typename TSfinae = void>
             struct Erf;
-        }
+        } // namespace traits
 
         //-----------------------------------------------------------------------------
         //! Computes the error function of arg.
@@ -41,30 +38,11 @@ namespace alpaka
         //! \param erf_ctx The object specializing Erf.
         //! \param arg The arg.
         ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            typename T,
-            typename TArg>
-        ALPAKA_FN_HOST_ACC auto erf(
-            T const & erf_ctx,
-            TArg const & arg)
-#ifdef BOOST_NO_CXX14_RETURN_TYPE_DEDUCTION
-        -> decltype(
-            traits::Erf<
-                concepts::ImplementationBase<ConceptMathErf, T>,
-                TArg>
-            ::erf(
-                erf_ctx,
-                arg))
-#endif
+        template<typename T, typename TArg>
+        ALPAKA_FN_HOST_ACC auto erf(T const& erf_ctx, TArg const& arg)
         {
             using ImplementationBase = concepts::ImplementationBase<ConceptMathErf, T>;
-            return
-                traits::Erf<
-                    ImplementationBase,
-                    TArg>
-                ::erf(
-                    erf_ctx,
-                    arg);
+            return traits::Erf<ImplementationBase, TArg>::erf(erf_ctx, arg);
         }
-    }
-}
+    } // namespace math
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/exp/ExpCudaBuiltIn.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/exp/ExpCudaBuiltIn.hpp
deleted file mode 100644
index c52d51696b..0000000000
--- a/thirdParty/cupla/alpaka/include/alpaka/math/exp/ExpCudaBuiltIn.hpp
+++ /dev/null
@@ -1,78 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Bert Wesarg
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_CUDA
-    #error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
-#endif
-
-#include <alpaka/math/exp/Traits.hpp>
-
-#include <alpaka/core/Unused.hpp>
-
-#include <cuda_runtime.h>
-
-#include <type_traits>
-
-namespace alpaka
-{
-    namespace math
-    {
-        //#############################################################################
-        //! The CUDA built in exp.
-        class ExpCudaBuiltIn : public concepts::Implements<ConceptMathExp, ExpCudaBuiltIn>
-        {
-        };
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The CUDA exp trait specialization.
-            template<
-                typename TArg>
-            struct Exp<
-                ExpCudaBuiltIn,
-                TArg,
-                typename std::enable_if<
-                    std::is_floating_point<TArg>::value>::type>
-            {
-                __device__ static auto exp(
-                    ExpCudaBuiltIn const & exp_ctx,
-                    TArg const & arg)
-                -> decltype(::exp(arg))
-                {
-                    alpaka::ignore_unused(exp_ctx);
-                    return ::exp(arg);
-                }
-            };
-            //! The CUDA exp float specialization.
-            template<>
-            struct Exp<
-                ExpCudaBuiltIn,
-                float>
-            {
-                __device__ static auto exp(
-                    ExpCudaBuiltIn const & exp_ctx,
-                    float const & arg)
-                -> float
-                {
-                    alpaka::ignore_unused(exp_ctx);
-                    return ::expf(arg);
-                }
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/exp/ExpHipBuiltIn.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/exp/ExpHipBuiltIn.hpp
deleted file mode 100644
index 325582b330..0000000000
--- a/thirdParty/cupla/alpaka/include/alpaka/math/exp/ExpHipBuiltIn.hpp
+++ /dev/null
@@ -1,86 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Bert Wesarg, Matthias Werner
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_HIP
-    #error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
-#endif
-
-#include <alpaka/math/exp/Traits.hpp>
-
-#include <alpaka/core/Unused.hpp>
-
-#if BOOST_COMP_NVCC >= BOOST_VERSION_NUMBER(9, 0, 0)
-    #include <cuda_runtime_api.h>
-#else
-    #if BOOST_COMP_HCC || BOOST_COMP_HIP
-        #include <math_functions.h>
-    #else
-        #include <math_functions.hpp>
-    #endif
-#endif
-
-#include <type_traits>
-
-namespace alpaka
-{
-    namespace math
-    {
-        //#############################################################################
-        //! The HIP exp.
-        class ExpHipBuiltIn : public concepts::Implements<ConceptMathExp, ExpHipBuiltIn>
-        {
-        };
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The HIP exp trait specialization.
-            template<
-                typename TArg>
-            struct Exp<
-                ExpHipBuiltIn,
-                TArg,
-                typename std::enable_if<
-                    std::is_floating_point<TArg>::value>::type>
-            {
-                __device__ static auto exp(
-                    ExpHipBuiltIn const & exp_ctx,
-                    TArg const & arg)
-                -> decltype(::exp(arg))
-                {
-                    alpaka::ignore_unused(exp_ctx);
-                    return ::exp(arg);
-                }
-            };
-            //! The HIP exp float specialization.
-            template<>
-            struct Exp<
-                ExpHipBuiltIn,
-                float>
-            {
-                __device__ static auto exp(
-                    ExpHipBuiltIn const & exp_ctx,
-                    float const & arg)
-                -> float
-                {
-                    alpaka::ignore_unused(exp_ctx);
-                    return ::expf(arg);
-                }
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/exp/ExpStdLib.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/exp/ExpStdLib.hpp
index 70cbc53b79..a2a5b33515 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/math/exp/ExpStdLib.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/math/exp/ExpStdLib.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Axel Huebl, Benjamin Worpitz
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -9,12 +9,11 @@
 
 #pragma once
 
-#include <alpaka/math/exp/Traits.hpp>
-
 #include <alpaka/core/Unused.hpp>
+#include <alpaka/math/exp/Traits.hpp>
 
-#include <type_traits>
 #include <cmath>
+#include <type_traits>
 
 namespace alpaka
 {
@@ -30,23 +29,15 @@ namespace alpaka
         {
             //#############################################################################
             //! The standard library exp trait specialization.
-            template<
-                typename TArg>
-            struct Exp<
-                ExpStdLib,
-                TArg,
-                typename std::enable_if<
-                    std::is_arithmetic<TArg>::value>::type>
+            template<typename TArg>
+            struct Exp<ExpStdLib, TArg, std::enable_if_t<std::is_arithmetic<TArg>::value>>
             {
-                ALPAKA_FN_HOST static auto exp(
-                    ExpStdLib const & exp_ctx,
-                    TArg const & arg)
-                -> decltype(std::exp(arg))
+                ALPAKA_FN_HOST static auto exp(ExpStdLib const& exp_ctx, TArg const& arg)
                 {
                     alpaka::ignore_unused(exp_ctx);
                     return std::exp(arg);
                 }
             };
-        }
-    }
-}
+        } // namespace traits
+    } // namespace math
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/exp/ExpUniformCudaHipBuiltIn.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/exp/ExpUniformCudaHipBuiltIn.hpp
new file mode 100644
index 0000000000..9ae58362b6
--- /dev/null
+++ b/thirdParty/cupla/alpaka/include/alpaka/math/exp/ExpUniformCudaHipBuiltIn.hpp
@@ -0,0 +1,82 @@
+/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Bert Wesarg
+ *
+ * This file is part of alpaka.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#pragma once
+
+#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
+
+#    include <alpaka/core/BoostPredef.hpp>
+
+#    if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
+#        include <cuda_runtime.h>
+#        if !BOOST_LANG_CUDA
+#            error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
+#        endif
+#    endif
+
+#    if defined(ALPAKA_ACC_GPU_HIP_ENABLED)
+
+#        if BOOST_COMP_NVCC >= BOOST_VERSION_NUMBER(9, 0, 0)
+#            include <cuda_runtime_api.h>
+#        else
+#            if BOOST_COMP_HIP
+#                include <hip/math_functions.h>
+#            else
+#                include <math_functions.hpp>
+#            endif
+#        endif
+
+#        if !BOOST_LANG_HIP
+#            error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
+#        endif
+#    endif
+
+#    include <alpaka/core/Unused.hpp>
+#    include <alpaka/math/exp/Traits.hpp>
+
+#    include <type_traits>
+
+namespace alpaka
+{
+    namespace math
+    {
+        //#############################################################################
+        //! The CUDA/HIP built in exp (one implementation tag shared by both GPU back-ends).
+        class ExpUniformCudaHipBuiltIn : public concepts::Implements<ConceptMathExp, ExpUniformCudaHipBuiltIn>
+        {
+        };
+
+        namespace traits
+        {
+            //#############################################################################
+            //! The CUDA/HIP exp trait specialization for floating point arguments (device-only, calls ::exp).
+            template<typename TArg>
+            struct Exp<ExpUniformCudaHipBuiltIn, TArg, std::enable_if_t<std::is_floating_point<TArg>::value>>
+            {
+                __device__ static auto exp(ExpUniformCudaHipBuiltIn const& exp_ctx, TArg const& arg)
+                {
+                    alpaka::ignore_unused(exp_ctx);
+                    return ::exp(arg);
+                }
+            };
+            //! The CUDA/HIP exp float specialization (device-only, calls the single precision ::expf).
+            template<>
+            struct Exp<ExpUniformCudaHipBuiltIn, float>
+            {
+                __device__ static auto exp(ExpUniformCudaHipBuiltIn const& exp_ctx, float const& arg) -> float
+                {
+                    alpaka::ignore_unused(exp_ctx);
+                    return ::expf(arg);
+                }
+            };
+        } // namespace traits
+    } // namespace math
+} // namespace alpaka
+
+#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/exp/Traits.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/exp/Traits.hpp
index 0d077c7928..617a63735b 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/math/exp/Traits.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/math/exp/Traits.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Benjamin Worpitz
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -12,26 +12,23 @@
 #include <alpaka/core/Common.hpp>
 #include <alpaka/core/Concepts.hpp>
 
-#include <boost/config.hpp>
-
 #include <type_traits>
 
 namespace alpaka
 {
     namespace math
     {
-        struct ConceptMathExp;
+        struct ConceptMathExp
+        {
+        };
 
         namespace traits
         {
             //#############################################################################
             //! The exp trait.
-            template<
-                typename T,
-                typename TArg,
-                typename TSfinae = void>
+            template<typename T, typename TArg, typename TSfinae = void>
             struct Exp;
-        }
+        } // namespace traits
 
         //-----------------------------------------------------------------------------
         //! Computes the e (Euler's number, 2.7182818) raised to the given power arg.
@@ -41,30 +38,11 @@ namespace alpaka
         //! \param exp_ctx The object specializing Exp.
         //! \param arg The arg.
         ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            typename T,
-            typename TArg>
-        ALPAKA_FN_HOST_ACC auto exp(
-            T const & exp_ctx,
-            TArg const & arg)
-#ifdef BOOST_NO_CXX14_RETURN_TYPE_DEDUCTION
-        -> decltype(
-            traits::Exp<
-                concepts::ImplementationBase<ConceptMathExp, T>,
-                TArg>
-            ::exp(
-                exp_ctx,
-                arg))
-#endif
+        template<typename T, typename TArg>
+        ALPAKA_FN_HOST_ACC auto exp(T const& exp_ctx, TArg const& arg)
         {
             using ImplementationBase = concepts::ImplementationBase<ConceptMathExp, T>;
-            return
-                traits::Exp<
-                    ImplementationBase,
-                    TArg>
-                ::exp(
-                    exp_ctx,
-                    arg);
+            return traits::Exp<ImplementationBase, TArg>::exp(exp_ctx, arg);
         }
-    }
-}
+    } // namespace math
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/floor/FloorCudaBuiltIn.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/floor/FloorCudaBuiltIn.hpp
deleted file mode 100644
index 1ae4713d8c..0000000000
--- a/thirdParty/cupla/alpaka/include/alpaka/math/floor/FloorCudaBuiltIn.hpp
+++ /dev/null
@@ -1,78 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Bert Wesarg
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_CUDA
-    #error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
-#endif
-
-#include <alpaka/math/floor/Traits.hpp>
-
-#include <alpaka/core/Unused.hpp>
-
-#include <cuda_runtime.h>
-
-#include <type_traits>
-
-namespace alpaka
-{
-    namespace math
-    {
-        //#############################################################################
-        //! The CUDA built in floor.
-        class FloorCudaBuiltIn : public concepts::Implements<ConceptMathFloor, FloorCudaBuiltIn>
-        {
-        };
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The CUDA floor trait specialization.
-            template<
-                typename TArg>
-            struct Floor<
-                FloorCudaBuiltIn,
-                TArg,
-                typename std::enable_if<
-                    std::is_floating_point<TArg>::value>::type>
-            {
-                __device__ static auto floor(
-                    FloorCudaBuiltIn const & floor_ctx,
-                    TArg const & arg)
-                -> decltype(::floor(arg))
-                {
-                    alpaka::ignore_unused(floor_ctx);
-                    return ::floor(arg);
-                }
-            };
-            //! The CUDA floor float specialization.
-            template<>
-            struct Floor<
-                FloorCudaBuiltIn,
-                float>
-            {
-                __device__ static auto floor(
-                    FloorCudaBuiltIn const & floor_ctx,
-                    float const & arg)
-                -> float
-                {
-                    alpaka::ignore_unused(floor_ctx);
-                    return ::floorf(arg);
-                }
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/floor/FloorHipBuiltIn.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/floor/FloorHipBuiltIn.hpp
deleted file mode 100644
index b045746988..0000000000
--- a/thirdParty/cupla/alpaka/include/alpaka/math/floor/FloorHipBuiltIn.hpp
+++ /dev/null
@@ -1,86 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Bert Wesarg, Matthias Werner
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_HIP
-    #error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
-#endif
-
-#include <alpaka/math/floor/Traits.hpp>
-
-#include <alpaka/core/Unused.hpp>
-
-#if BOOST_COMP_NVCC >= BOOST_VERSION_NUMBER(9, 0, 0)
-    #include <cuda_runtime_api.h>
-#else
-    #if BOOST_COMP_HCC || BOOST_COMP_HIP
-        #include <math_functions.h>
-    #else
-        #include <math_functions.hpp>
-    #endif
-#endif
-
-#include <type_traits>
-
-namespace alpaka
-{
-    namespace math
-    {
-        //#############################################################################
-        //! The HIP floor.
-        class FloorHipBuiltIn : public concepts::Implements<ConceptMathFloor, FloorHipBuiltIn>
-        {
-        };
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The HIP floor trait specialization.
-            template<
-                typename TArg>
-            struct Floor<
-                FloorHipBuiltIn,
-                TArg,
-                typename std::enable_if<
-                    std::is_floating_point<TArg>::value>::type>
-            {
-                __device__ static auto floor(
-                    FloorHipBuiltIn const & floor_ctx,
-                    TArg const & arg)
-                -> decltype(::floor(arg))
-                {
-                    alpaka::ignore_unused(floor_ctx);
-                    return ::floor(arg);
-                }
-            };
-            //! The HIP floor float specialization.
-            template<>
-            struct Floor<
-                FloorHipBuiltIn,
-                float>
-            {
-                __device__ static auto floor(
-                    FloorHipBuiltIn const & floor_ctx,
-                    float const & arg)
-                -> float
-                {
-                    alpaka::ignore_unused(floor_ctx);
-                    return ::floorf(arg);
-                }
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/floor/FloorStdLib.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/floor/FloorStdLib.hpp
index 3f6f670aa6..499577fbb3 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/math/floor/FloorStdLib.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/math/floor/FloorStdLib.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Axel Huebl, Benjamin Worpitz
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -9,12 +9,11 @@
 
 #pragma once
 
-#include <alpaka/math/floor/Traits.hpp>
-
 #include <alpaka/core/Unused.hpp>
+#include <alpaka/math/floor/Traits.hpp>
 
-#include <type_traits>
 #include <cmath>
+#include <type_traits>
 
 namespace alpaka
 {
@@ -30,23 +29,15 @@ namespace alpaka
         {
             //#############################################################################
             //! The standard library floor trait specialization.
-            template<
-                typename TArg>
-            struct Floor<
-                FloorStdLib,
-                TArg,
-                typename std::enable_if<
-                    std::is_arithmetic<TArg>::value>::type>
+            template<typename TArg>
+            struct Floor<FloorStdLib, TArg, std::enable_if_t<std::is_arithmetic<TArg>::value>>
             {
-                ALPAKA_FN_HOST static auto floor(
-                    FloorStdLib const & floor_ctx,
-                    TArg const & arg)
-                -> decltype(std::floor(arg))
+                ALPAKA_FN_HOST static auto floor(FloorStdLib const& floor_ctx, TArg const& arg)
                 {
                     alpaka::ignore_unused(floor_ctx);
                     return std::floor(arg);
                 }
             };
-        }
-    }
-}
+        } // namespace traits
+    } // namespace math
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/floor/FloorUniformCudaHipBuiltIn.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/floor/FloorUniformCudaHipBuiltIn.hpp
new file mode 100644
index 0000000000..b049f10d0b
--- /dev/null
+++ b/thirdParty/cupla/alpaka/include/alpaka/math/floor/FloorUniformCudaHipBuiltIn.hpp
@@ -0,0 +1,82 @@
+/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Bert Wesarg
+ *
+ * This file is part of alpaka.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#pragma once
+
+#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
+
+#    include <alpaka/core/BoostPredef.hpp>
+
+#    if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
+#        include <cuda_runtime.h>
+#        if !BOOST_LANG_CUDA
+#            error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
+#        endif
+#    endif
+
+#    if defined(ALPAKA_ACC_GPU_HIP_ENABLED)
+
+#        if BOOST_COMP_NVCC >= BOOST_VERSION_NUMBER(9, 0, 0)
+#            include <cuda_runtime_api.h>
+#        else
+#            if BOOST_COMP_HIP
+#                include <hip/math_functions.h>
+#            else
+#                include <math_functions.hpp>
+#            endif
+#        endif
+
+#        if !BOOST_LANG_HIP
+#            error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
+#        endif
+#    endif
+
+#    include <alpaka/core/Unused.hpp>
+#    include <alpaka/math/floor/Traits.hpp>
+
+#    include <type_traits>
+
+namespace alpaka
+{
+    namespace math
+    {
+        //#############################################################################
+        //! The CUDA/HIP built in floor (one implementation tag shared by both GPU back-ends).
+        class FloorUniformCudaHipBuiltIn : public concepts::Implements<ConceptMathFloor, FloorUniformCudaHipBuiltIn>
+        {
+        };
+
+        namespace traits
+        {
+            //#############################################################################
+            //! The CUDA/HIP floor trait specialization for floating point arguments (device-only, calls ::floor).
+            template<typename TArg>
+            struct Floor<FloorUniformCudaHipBuiltIn, TArg, std::enable_if_t<std::is_floating_point<TArg>::value>>
+            {
+                __device__ static auto floor(FloorUniformCudaHipBuiltIn const& floor_ctx, TArg const& arg)
+                {
+                    alpaka::ignore_unused(floor_ctx);
+                    return ::floor(arg);
+                }
+            };
+            //! The CUDA/HIP floor float specialization (device-only, calls the single precision ::floorf).
+            template<>
+            struct Floor<FloorUniformCudaHipBuiltIn, float>
+            {
+                __device__ static auto floor(FloorUniformCudaHipBuiltIn const& floor_ctx, float const& arg) -> float
+                {
+                    alpaka::ignore_unused(floor_ctx);
+                    return ::floorf(arg);
+                }
+            };
+        } // namespace traits
+    } // namespace math
+} // namespace alpaka
+
+#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/floor/Traits.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/floor/Traits.hpp
index a4f2e87c05..2a0978a17e 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/math/floor/Traits.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/math/floor/Traits.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Benjamin Worpitz
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -12,26 +12,23 @@
 #include <alpaka/core/Common.hpp>
 #include <alpaka/core/Concepts.hpp>
 
-#include <boost/config.hpp>
-
 #include <type_traits>
 
 namespace alpaka
 {
     namespace math
     {
-        struct ConceptMathFloor;
+        struct ConceptMathFloor
+        {
+        };
 
         namespace traits
         {
             //#############################################################################
             //! The floor trait.
-            template<
-                typename T,
-                typename TArg,
-                typename TSfinae = void>
+            template<typename T, typename TArg, typename TSfinae = void>
             struct Floor;
-        }
+        } // namespace traits
 
         //-----------------------------------------------------------------------------
         //! Computes the largest integer value not greater than arg.
@@ -41,30 +38,11 @@ namespace alpaka
         //! \param floor_ctx The object specializing Floor.
         //! \param arg The arg.
         ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            typename T,
-            typename TArg>
-        ALPAKA_FN_HOST_ACC auto floor(
-            T const & floor_ctx,
-            TArg const & arg)
-#ifdef BOOST_NO_CXX14_RETURN_TYPE_DEDUCTION
-        -> decltype(
-            traits::Floor<
-                concepts::ImplementationBase<ConceptMathFloor, T>,
-                TArg>
-            ::floor(
-                floor_ctx,
-                arg))
-#endif
+        template<typename T, typename TArg>
+        ALPAKA_FN_HOST_ACC auto floor(T const& floor_ctx, TArg const& arg)
         {
             using ImplementationBase = concepts::ImplementationBase<ConceptMathFloor, T>;
-            return
-                traits::Floor<
-                    ImplementationBase,
-                    TArg>
-                ::floor(
-                    floor_ctx,
-                    arg);
+            return traits::Floor<ImplementationBase, TArg>::floor(floor_ctx, arg);
         }
-    }
-}
+    } // namespace math
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/fmod/FmodCudaBuiltIn.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/fmod/FmodCudaBuiltIn.hpp
deleted file mode 100644
index f927e7322b..0000000000
--- a/thirdParty/cupla/alpaka/include/alpaka/math/fmod/FmodCudaBuiltIn.hpp
+++ /dev/null
@@ -1,88 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Bert Wesarg
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_CUDA
-    #error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
-#endif
-
-#include <alpaka/math/fmod/Traits.hpp>
-
-#include <alpaka/core/Unused.hpp>
-
-#include <cuda_runtime.h>
-
-#include <type_traits>
-
-namespace alpaka
-{
-    namespace math
-    {
-        //#############################################################################
-        //! The CUDA built in fmod.
-        class FmodCudaBuiltIn : public concepts::Implements<ConceptMathFmod, FmodCudaBuiltIn>
-        {
-        };
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The CUDA fmod trait specialization.
-            template<
-                typename Tx,
-                typename Ty>
-            struct Fmod<
-                FmodCudaBuiltIn,
-                Tx,
-                Ty,
-                typename std::enable_if<
-                    std::is_floating_point<Tx>::value
-                    && std::is_floating_point<Ty>::value>::type>
-            {
-                __device__ static auto fmod(
-                    FmodCudaBuiltIn const & fmod_ctx,
-                    Tx const & x,
-                    Ty const & y)
-                -> decltype(::fmod(x, y))
-                {
-                    alpaka::ignore_unused(fmod_ctx);
-                    return ::fmod(
-                        x,
-                        y);
-                }
-            };
-            //! The CUDA fmod float specialization.
-            template<>
-            struct Fmod<
-                FmodCudaBuiltIn,
-                float,
-                float>
-            {
-                __device__ static auto fmod(
-                    FmodCudaBuiltIn const & fmod_ctx,
-                    float const & x,
-                    float const & y)
-                -> float
-                {
-                    alpaka::ignore_unused(fmod_ctx);
-                    return ::fmodf(
-                        x,
-                        y);
-                }
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/fmod/FmodHipBuiltIn.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/fmod/FmodHipBuiltIn.hpp
deleted file mode 100644
index 83e230cb9c..0000000000
--- a/thirdParty/cupla/alpaka/include/alpaka/math/fmod/FmodHipBuiltIn.hpp
+++ /dev/null
@@ -1,92 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Bert Wesarg, Matthias Werner
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_HIP
-    #error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
-#endif
-
-#include <alpaka/math/fmod/Traits.hpp>
-
-#include <alpaka/core/Unused.hpp>
-
-#if BOOST_COMP_NVCC >= BOOST_VERSION_NUMBER(9, 0, 0)
-    #include <cuda_runtime_api.h>
-#else
-    #if BOOST_COMP_HCC || BOOST_COMP_HIP
-        #include <math_functions.h>
-    #else
-        #include <math_functions.hpp>
-    #endif
-#endif
-
-#include <type_traits>
-
-namespace alpaka
-{
-    namespace math
-    {
-        //#############################################################################
-        //! The HIP fmod.
-        class FmodHipBuiltIn : public concepts::Implements<ConceptMathFmod, FmodHipBuiltIn>
-        {
-        };
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The HIP fmod trait specialization.
-            template<
-                typename Tx,
-                typename Ty>
-            struct Fmod<
-                FmodHipBuiltIn,
-                Tx,
-                Ty,
-                typename std::enable_if<
-                    std::is_floating_point<Tx>::value
-                    && std::is_floating_point<Ty>::value>::type>
-            {
-                __device__ static auto fmod(
-                    FmodHipBuiltIn const & fmod_ctx,
-                    Tx const & x,
-                    Ty const & y)
-                -> decltype(::fmod(x, y))
-                {
-                    alpaka::ignore_unused(fmod_ctx);
-                    return ::fmod(x, y);
-                }
-            };
-            //! The HIP fmod float specialization.
-            template<>
-            struct Fmod<
-                FmodHipBuiltIn,
-                float,
-                float>
-            {
-                __device__ static auto fmod(
-                    FmodHipBuiltIn const & fmod_ctx,
-                    float const & x,
-                    float const & y)
-                -> float
-                {
-                    alpaka::ignore_unused(fmod_ctx);
-                    return ::fmodf(x, y);
-                }
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/fmod/FmodStdLib.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/fmod/FmodStdLib.hpp
index c25ebf4adc..d400dc4e71 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/math/fmod/FmodStdLib.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/math/fmod/FmodStdLib.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Axel Huebl, Benjamin Worpitz
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -9,12 +9,11 @@
 
 #pragma once
 
-#include <alpaka/math/fmod/Traits.hpp>
-
 #include <alpaka/core/Unused.hpp>
+#include <alpaka/math/fmod/Traits.hpp>
 
-#include <type_traits>
 #include <cmath>
+#include <type_traits>
 
 namespace alpaka
 {
@@ -30,27 +29,19 @@ namespace alpaka
         {
             //#############################################################################
             //! The standard library fmod trait specialization.
-            template<
-                typename Tx,
-                typename Ty>
+            template<typename Tx, typename Ty>
             struct Fmod<
                 FmodStdLib,
                 Tx,
                 Ty,
-                typename std::enable_if<
-                    std::is_arithmetic<Tx>::value
-                    && std::is_arithmetic<Ty>::value>::type>
+                std::enable_if_t<std::is_arithmetic<Tx>::value && std::is_arithmetic<Ty>::value>>
             {
-                ALPAKA_FN_HOST static auto fmod(
-                    FmodStdLib const & fmod_ctx,
-                    Tx const & x,
-                    Ty const & y)
-                -> decltype(std::fmod(x, y))
+                ALPAKA_FN_HOST static auto fmod(FmodStdLib const& fmod_ctx, Tx const& x, Ty const& y)
                 {
                     alpaka::ignore_unused(fmod_ctx);
                     return std::fmod(x, y);
                 }
             };
-        }
-    }
-}
+        } // namespace traits
+    } // namespace math
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/fmod/FmodUniformCudaHipBuiltIn.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/fmod/FmodUniformCudaHipBuiltIn.hpp
new file mode 100644
index 0000000000..79fb4049a2
--- /dev/null
+++ b/thirdParty/cupla/alpaka/include/alpaka/math/fmod/FmodUniformCudaHipBuiltIn.hpp
@@ -0,0 +1,87 @@
+/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Bert Wesarg
+ *
+ * This file is part of alpaka.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#pragma once
+
+#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
+
+#    include <alpaka/core/BoostPredef.hpp>
+
+#    if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
+#        include <cuda_runtime.h>
+#        if !BOOST_LANG_CUDA
+#            error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
+#        endif
+#    endif
+
+#    if defined(ALPAKA_ACC_GPU_HIP_ENABLED)
+
+#        if BOOST_COMP_NVCC >= BOOST_VERSION_NUMBER(9, 0, 0)
+#            include <cuda_runtime_api.h>
+#        else
+#            if BOOST_COMP_HIP
+#                include <hip/math_functions.h>
+#            else
+#                include <math_functions.hpp>
+#            endif
+#        endif
+
+#        if !BOOST_LANG_HIP
+#            error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
+#        endif
+#    endif
+
+#    include <alpaka/core/Unused.hpp>
+#    include <alpaka/math/fmod/Traits.hpp>
+
+#    include <type_traits>
+
+namespace alpaka
+{
+    namespace math
+    {
+        //#############################################################################
+        //! The CUDA/HIP built in fmod.
+        class FmodUniformCudaHipBuiltIn : public concepts::Implements<ConceptMathFmod, FmodUniformCudaHipBuiltIn>
+        {
+        };
+
+        namespace traits
+        {
+            //#############################################################################
+            //! The CUDA/HIP fmod trait specialization.
+            template<typename Tx, typename Ty>
+            struct Fmod<
+                FmodUniformCudaHipBuiltIn,
+                Tx,
+                Ty,
+                std::enable_if_t<std::is_floating_point<Tx>::value && std::is_floating_point<Ty>::value>>
+            {
+                __device__ static auto fmod(FmodUniformCudaHipBuiltIn const& fmod_ctx, Tx const& x, Ty const& y)
+                {
+                    alpaka::ignore_unused(fmod_ctx);
+                    return ::fmod(x, y);
+                }
+            };
+            //! The CUDA/HIP fmod float specialization.
+            template<>
+            struct Fmod<FmodUniformCudaHipBuiltIn, float, float>
+            {
+                __device__ static auto fmod(FmodUniformCudaHipBuiltIn const& fmod_ctx, float const& x, float const& y)
+                    -> float
+                {
+                    alpaka::ignore_unused(fmod_ctx);
+                    return ::fmodf(x, y);
+                }
+            };
+        } // namespace traits
+    } // namespace math
+} // namespace alpaka
+
+#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/fmod/Traits.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/fmod/Traits.hpp
index faa821505f..dfb06ce2c1 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/math/fmod/Traits.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/math/fmod/Traits.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Benjamin Worpitz
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -12,27 +12,23 @@
 #include <alpaka/core/Common.hpp>
 #include <alpaka/core/Concepts.hpp>
 
-#include <boost/config.hpp>
-
 #include <type_traits>
 
 namespace alpaka
 {
     namespace math
     {
-        struct ConceptMathFmod;
+        struct ConceptMathFmod
+        {
+        };
 
         namespace traits
         {
             //#############################################################################
             //! The fmod trait.
-            template<
-                typename T,
-                typename Tx,
-                typename Ty,
-                typename TSfinae = void>
+            template<typename T, typename Tx, typename Ty, typename TSfinae = void>
             struct Fmod;
-        }
+        } // namespace traits
 
         //-----------------------------------------------------------------------------
         //! Computes the floating-point remainder of the division operation x/y.
@@ -44,36 +40,11 @@ namespace alpaka
         //! \param x The first argument.
         //! \param y The second argument.
         ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            typename T,
-            typename Tx,
-            typename Ty>
-        ALPAKA_FN_HOST_ACC auto fmod(
-            T const & fmod_ctx,
-            Tx const & x,
-            Ty const & y)
-#ifdef BOOST_NO_CXX14_RETURN_TYPE_DEDUCTION
-        -> decltype(
-            traits::Fmod<
-                concepts::ImplementationBase<ConceptMathFmod, T>,
-                Tx,
-                Ty>
-            ::fmod(
-                fmod_ctx,
-                x,
-                y))
-#endif
+        template<typename T, typename Tx, typename Ty>
+        ALPAKA_FN_HOST_ACC auto fmod(T const& fmod_ctx, Tx const& x, Ty const& y)
         {
             using ImplementationBase = concepts::ImplementationBase<ConceptMathFmod, T>;
-            return
-                traits::Fmod<
-                    ImplementationBase,
-                    Tx,
-                    Ty>
-                ::fmod(
-                    fmod_ctx,
-                    x,
-                    y);
+            return traits::Fmod<ImplementationBase, Tx, Ty>::fmod(fmod_ctx, x, y);
         }
-    }
-}
+    } // namespace math
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/log/LogCudaBuiltIn.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/log/LogCudaBuiltIn.hpp
deleted file mode 100644
index 3f28b81ac6..0000000000
--- a/thirdParty/cupla/alpaka/include/alpaka/math/log/LogCudaBuiltIn.hpp
+++ /dev/null
@@ -1,79 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Bert Wesarg
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_CUDA
-    #error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
-#endif
-
-#include <alpaka/math/log/Traits.hpp>
-
-#include <alpaka/core/Unused.hpp>
-
-#include <cuda_runtime.h>
-
-#include <type_traits>
-
-namespace alpaka
-{
-    namespace math
-    {
-        //#############################################################################
-        // ! The CUDA built in log.
-        class LogCudaBuiltIn : public concepts::Implements<ConceptMathLog, LogCudaBuiltIn>
-        {
-        };
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The CUDA log trait specialization.
-            template<
-                typename TArg>
-            struct Log<
-                LogCudaBuiltIn,
-                TArg,
-                typename std::enable_if<
-                    std::is_floating_point<TArg>::value>::type>
-            {
-                __device__ static auto log(
-                    LogCudaBuiltIn const & log_ctx,
-                    TArg const & arg)
-                -> decltype(::log(arg))
-                {
-                    alpaka::ignore_unused(log_ctx);
-                    return ::log(arg);
-                }
-            };
-            //! The CUDA log float specialization.
-            template<>
-            struct Log<
-                LogCudaBuiltIn,
-                float>
-            {
-                __device__ static auto log(
-                    LogCudaBuiltIn const & log_ctx,
-                    float const & arg)
-                -> float
-                {
-                    alpaka::ignore_unused(log_ctx);
-                    return ::logf(arg);
-                }
-            };
-
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/log/LogHipBuiltIn.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/log/LogHipBuiltIn.hpp
deleted file mode 100644
index 80ee8193fb..0000000000
--- a/thirdParty/cupla/alpaka/include/alpaka/math/log/LogHipBuiltIn.hpp
+++ /dev/null
@@ -1,86 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Bert Wesarg, Matthias Werner
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_HIP
-    #error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
-#endif
-
-#include <alpaka/math/log/Traits.hpp>
-
-#include <alpaka/core/Unused.hpp>
-
-#if BOOST_COMP_NVCC >= BOOST_VERSION_NUMBER(9, 0, 0)
-    #include <cuda_runtime_api.h>
-#else
-    #if BOOST_COMP_HCC || BOOST_COMP_HIP
-        #include <math_functions.h>
-    #else
-        #include <math_functions.hpp>
-    #endif
-#endif
-
-#include <type_traits>
-
-namespace alpaka
-{
-    namespace math
-    {
-        //#############################################################################
-        //! The HIP log.
-        class LogHipBuiltIn : public concepts::Implements<ConceptMathLog, LogHipBuiltIn>
-        {
-        };
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The HIP log trait specialization.
-            template<
-                typename TArg>
-            struct Log<
-                LogHipBuiltIn,
-                TArg,
-                typename std::enable_if<
-                    std::is_floating_point<TArg>::value>::type>
-            {
-                __device__ static auto log(
-                    LogHipBuiltIn const & log_ctx,
-                    TArg const & arg)
-                -> decltype(::log(arg))
-                {
-                    alpaka::ignore_unused(log_ctx);
-                    return ::log(arg);
-                }
-            };
-            //! The HIP log float specialization.
-            template<>
-            struct Log<
-                LogHipBuiltIn,
-                float>
-            {
-                __device__ static auto log(
-                    LogHipBuiltIn const & log_ctx,
-                    float const & arg)
-                -> float
-                {
-                    alpaka::ignore_unused(log_ctx);
-                    return ::logf(arg);
-                }
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/log/LogStdLib.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/log/LogStdLib.hpp
index 116007feed..db2a961a25 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/math/log/LogStdLib.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/math/log/LogStdLib.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Axel Huebl, Benjamin Worpitz
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -9,12 +9,11 @@
 
 #pragma once
 
-#include <alpaka/math/log/Traits.hpp>
-
 #include <alpaka/core/Unused.hpp>
+#include <alpaka/math/log/Traits.hpp>
 
-#include <type_traits>
 #include <cmath>
+#include <type_traits>
 
 namespace alpaka
 {
@@ -30,23 +29,15 @@ namespace alpaka
         {
             //#############################################################################
             //! The standard library log trait specialization.
-            template<
-                typename TArg>
-            struct Log<
-                LogStdLib,
-                TArg,
-                typename std::enable_if<
-                    std::is_arithmetic<TArg>::value>::type>
+            template<typename TArg>
+            struct Log<LogStdLib, TArg, std::enable_if_t<std::is_arithmetic<TArg>::value>>
             {
-                ALPAKA_FN_HOST static auto log(
-                    LogStdLib const & log_ctx,
-                    TArg const & arg)
-                -> decltype(std::log(arg))
+                ALPAKA_FN_HOST static auto log(LogStdLib const& log_ctx, TArg const& arg)
                 {
                     alpaka::ignore_unused(log_ctx);
                     return std::log(arg);
                 }
             };
-        }
-    }
-}
+        } // namespace traits
+    } // namespace math
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/log/LogUniformCudaHipBuiltIn.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/log/LogUniformCudaHipBuiltIn.hpp
new file mode 100644
index 0000000000..727b05be9a
--- /dev/null
+++ b/thirdParty/cupla/alpaka/include/alpaka/math/log/LogUniformCudaHipBuiltIn.hpp
@@ -0,0 +1,83 @@
+/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Bert Wesarg
+ *
+ * This file is part of alpaka.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#pragma once
+
+#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
+
+#    include <alpaka/core/BoostPredef.hpp>
+
+#    if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
+#        include <cuda_runtime.h>
+#        if !BOOST_LANG_CUDA
+#            error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
+#        endif
+#    endif
+
+#    if defined(ALPAKA_ACC_GPU_HIP_ENABLED)
+
+#        if BOOST_COMP_NVCC >= BOOST_VERSION_NUMBER(9, 0, 0)
+#            include <cuda_runtime_api.h>
+#        else
+#            if BOOST_COMP_HIP
+#                include <hip/math_functions.h>
+#            else
+#                include <math_functions.hpp>
+#            endif
+#        endif
+
+#        if !BOOST_LANG_HIP
+#            error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
+#        endif
+#    endif
+
+#    include <alpaka/core/Unused.hpp>
+#    include <alpaka/math/log/Traits.hpp>
+
+#    include <type_traits>
+
+namespace alpaka
+{
+    namespace math
+    {
+        //#############################################################################
+        //! The CUDA/HIP built in log.
+        class LogUniformCudaHipBuiltIn : public concepts::Implements<ConceptMathLog, LogUniformCudaHipBuiltIn>
+        {
+        };
+
+        namespace traits
+        {
+            //#############################################################################
+            //! The CUDA/HIP log trait specialization.
+            template<typename TArg>
+            struct Log<LogUniformCudaHipBuiltIn, TArg, std::enable_if_t<std::is_floating_point<TArg>::value>>
+            {
+                __device__ static auto log(LogUniformCudaHipBuiltIn const& log_ctx, TArg const& arg)
+                {
+                    alpaka::ignore_unused(log_ctx);
+                    return ::log(arg);
+                }
+            };
+            //! The CUDA/HIP log float specialization.
+            template<>
+            struct Log<LogUniformCudaHipBuiltIn, float>
+            {
+                __device__ static auto log(LogUniformCudaHipBuiltIn const& log_ctx, float const& arg) -> float
+                {
+                    alpaka::ignore_unused(log_ctx);
+                    return ::logf(arg);
+                }
+            };
+
+        } // namespace traits
+    } // namespace math
+} // namespace alpaka
+
+#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/log/Traits.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/log/Traits.hpp
index 0ba09e8249..4bb4ba492e 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/math/log/Traits.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/math/log/Traits.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Benjamin Worpitz
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -12,59 +12,41 @@
 #include <alpaka/core/Common.hpp>
 #include <alpaka/core/Concepts.hpp>
 
-#include <boost/config.hpp>
-
 #include <type_traits>
 
 namespace alpaka
 {
     namespace math
     {
-        struct ConceptMathLog;
+        struct ConceptMathLog
+        {
+        };
 
         namespace traits
         {
             //#############################################################################
             //! The log trait.
-            template<
-                typename T,
-                typename TArg,
-                typename TSfinae = void>
+            template<typename T, typename TArg, typename TSfinae = void>
             struct Log;
-        }
+        } // namespace traits
 
         //-----------------------------------------------------------------------------
         //! Computes the the natural (base e) logarithm of arg.
         //!
+        //! Valid real arguments are non-negative. For other values the result
+        //! may depend on the backend and compilation options, but will likely
+        //! be NaN.
+        //!
         //! \tparam T The type of the object specializing Log.
         //! \tparam TArg The arg type.
         //! \param log_ctx The object specializing Log.
         //! \param arg The arg.
         ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            typename T,
-            typename TArg>
-        ALPAKA_FN_HOST_ACC auto log(
-            T const & log_ctx,
-            TArg const & arg)
-#ifdef BOOST_NO_CXX14_RETURN_TYPE_DEDUCTION
-        -> decltype(
-            traits::Log<
-                concepts::ImplementationBase<ConceptMathLog, T>,
-                TArg>
-            ::log(
-                log_ctx,
-                arg))
-#endif
+        template<typename T, typename TArg>
+        ALPAKA_FN_HOST_ACC auto log(T const& log_ctx, TArg const& arg)
         {
             using ImplementationBase = concepts::ImplementationBase<ConceptMathLog, T>;
-            return
-                traits::Log<
-                    ImplementationBase,
-                    TArg>
-                ::log(
-                    log_ctx,
-                    arg);
+            return traits::Log<ImplementationBase, TArg>::log(log_ctx, arg);
         }
-    }
-}
+    } // namespace math
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/max/MaxCudaBuiltIn.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/max/MaxCudaBuiltIn.hpp
deleted file mode 100644
index 1c9a9806b3..0000000000
--- a/thirdParty/cupla/alpaka/include/alpaka/math/max/MaxCudaBuiltIn.hpp
+++ /dev/null
@@ -1,91 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Bert Wesarg
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_CUDA
-    #error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
-#endif
-
-#include <alpaka/math/max/Traits.hpp>
-
-#include <alpaka/core/Unused.hpp>
-
-#include <cuda_runtime.h>
-
-#include <type_traits>
-
-namespace alpaka
-{
-    namespace math
-    {
-        //#############################################################################
-        //! The CUDA built in max.
-        class MaxCudaBuiltIn : public concepts::Implements<ConceptMathMax, MaxCudaBuiltIn>
-        {
-        };
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The standard library integral max trait specialization.
-            template<
-                typename Tx,
-                typename Ty>
-            struct Max<
-                MaxCudaBuiltIn,
-                Tx,
-                Ty,
-                typename std::enable_if<
-                    std::is_integral<Tx>::value
-                    && std::is_integral<Ty>::value>::type>
-            {
-                __device__ static auto max(
-                    MaxCudaBuiltIn const & max_ctx,
-                    Tx const & x,
-                    Ty const & y)
-                -> decltype(::max(x, y))
-                {
-                    alpaka::ignore_unused(max_ctx);
-                    return ::max(x, y);
-                }
-            };
-            //#############################################################################
-            //! The CUDA mixed integral floating point max trait specialization.
-            template<
-                typename Tx,
-                typename Ty>
-            struct Max<
-                MaxCudaBuiltIn,
-                Tx,
-                Ty,
-                typename std::enable_if<
-                    std::is_arithmetic<Tx>::value
-                    && std::is_arithmetic<Ty>::value
-                    && !(std::is_integral<Tx>::value
-                        && std::is_integral<Ty>::value)>::type>
-            {
-                __device__ static auto max(
-                    MaxCudaBuiltIn const & max_ctx,
-                    Tx const & x,
-                    Ty const & y)
-                -> decltype(::fmax(x, y))
-                {
-                    alpaka::ignore_unused(max_ctx);
-                    return ::fmax(x, y);
-                }
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/max/MaxHipBuiltIn.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/max/MaxHipBuiltIn.hpp
deleted file mode 100644
index f16cc8e0e2..0000000000
--- a/thirdParty/cupla/alpaka/include/alpaka/math/max/MaxHipBuiltIn.hpp
+++ /dev/null
@@ -1,100 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Bert Wesarg, Matthias Werner
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_HIP
-    #error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
-#endif
-
-#include <alpaka/math/max/Traits.hpp>
-
-#include <alpaka/core/Unused.hpp>
-
-#if BOOST_COMP_NVCC >= BOOST_VERSION_NUMBER(9, 0, 0)
-    #include <cuda_runtime_api.h>
-#else
-    #if BOOST_COMP_HCC || BOOST_COMP_HIP
-        #include <math_functions.h>
-    #else
-        #include <math_functions.hpp>
-    #endif
-#endif
-
-#include <type_traits>
-
-namespace alpaka
-{
-    namespace math
-    {
-        //#############################################################################
-        //! The HIP max.
-        class MaxHipBuiltIn : public concepts::Implements<ConceptMathMax, MaxHipBuiltIn>
-        {
-        };
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The HIP integral max trait specialization.
-            template<
-                typename Tx,
-                typename Ty>
-            struct Max<
-                MaxHipBuiltIn,
-                Tx,
-                Ty,
-                typename std::enable_if<
-                    std::is_integral<Tx>::value
-                    && std::is_integral<Ty>::value>::type>
-            {
-                __device__ static auto max(
-                    MaxHipBuiltIn const & max_ctx,
-                    Tx const & x,
-                    Ty const & y)
-                -> decltype(::max(x, y))
-                {
-                    alpaka::ignore_unused(max_ctx);
-                    return ::max(x, y);
-                }
-            };
-            //#############################################################################
-            //! The HIP mixed integral floating point max trait specialization.
-            template<
-                typename Tx,
-                typename Ty>
-            struct Max<
-                MaxHipBuiltIn,
-                Tx,
-                Ty,
-                typename std::enable_if<
-                    std::is_arithmetic<Tx>::value
-                    && std::is_arithmetic<Ty>::value
-                    && !(std::is_integral<Tx>::value
-                        && std::is_integral<Ty>::value)>::type>
-            {
-                __device__ static auto max(
-                    MaxHipBuiltIn const & max_ctx,
-                    Tx const & x,
-                    Ty const & y)
-                -> decltype(::fmax(x, y))
-                {
-                    alpaka::ignore_unused(max_ctx);
-                    return ::fmax(x, y);
-                }
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/max/MaxStdLib.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/max/MaxStdLib.hpp
index 4a433c2483..cce6798408 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/math/max/MaxStdLib.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/math/max/MaxStdLib.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Axel Huebl, Benjamin Worpitz
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -9,13 +9,12 @@
 
 #pragma once
 
-#include <alpaka/math/max/Traits.hpp>
-
 #include <alpaka/core/Unused.hpp>
+#include <alpaka/math/max/Traits.hpp>
 
-#include <type_traits>
-#include <cmath>
 #include <algorithm>
+#include <cmath>
+#include <type_traits>
 
 namespace alpaka
 {
@@ -31,22 +30,10 @@ namespace alpaka
         {
             //#############################################################################
             //! The standard library integral max trait specialization.
-            template<
-                typename Tx,
-                typename Ty>
-            struct Max<
-                MaxStdLib,
-                Tx,
-                Ty,
-                typename std::enable_if<
-                    std::is_integral<Tx>::value
-                    && std::is_integral<Ty>::value>::type>
+            template<typename Tx, typename Ty>
+            struct Max<MaxStdLib, Tx, Ty, std::enable_if_t<std::is_integral<Tx>::value && std::is_integral<Ty>::value>>
             {
-                ALPAKA_FN_HOST static auto max(
-                    MaxStdLib const & max_ctx,
-                    Tx const & x,
-                    Ty const & y)
-                -> decltype(std::max(x, y))
+                ALPAKA_FN_HOST static auto max(MaxStdLib const& max_ctx, Tx const& x, Ty const& y)
                 {
                     alpaka::ignore_unused(max_ctx);
                     return std::max(x, y);
@@ -54,29 +41,21 @@ namespace alpaka
             };
             //#############################################################################
             //! The standard library mixed integral floating point max trait specialization.
-            template<
-                typename Tx,
-                typename Ty>
+            template<typename Tx, typename Ty>
             struct Max<
                 MaxStdLib,
                 Tx,
                 Ty,
-                typename std::enable_if<
-                    std::is_arithmetic<Tx>::value
-                    && std::is_arithmetic<Ty>::value
-                    && !(std::is_integral<Tx>::value
-                        && std::is_integral<Ty>::value)>::type>
+                std::enable_if_t<
+                    std::is_arithmetic<Tx>::value && std::is_arithmetic<Ty>::value
+                    && !(std::is_integral<Tx>::value && std::is_integral<Ty>::value)>>
             {
-                ALPAKA_FN_HOST static auto max(
-                    MaxStdLib const & max_ctx,
-                    Tx const & x,
-                    Ty const & y)
-                -> decltype(std::fmax(x, y))
+                ALPAKA_FN_HOST static auto max(MaxStdLib const& max_ctx, Tx const& x, Ty const& y)
                 {
                     alpaka::ignore_unused(max_ctx);
                     return std::fmax(x, y);
                 }
             };
-        }
-    }
-}
+        } // namespace traits
+    } // namespace math
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/max/MaxUniformCudaHipBuiltIn.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/max/MaxUniformCudaHipBuiltIn.hpp
new file mode 100644
index 0000000000..af1b48d87c
--- /dev/null
+++ b/thirdParty/cupla/alpaka/include/alpaka/math/max/MaxUniformCudaHipBuiltIn.hpp
@@ -0,0 +1,94 @@
+/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Bert Wesarg
+ *
+ * This file is part of alpaka.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+#pragma once
+
+#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
+
+#    include <alpaka/core/BoostPredef.hpp>
+
+#    if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
+#        include <cuda_runtime.h>
+#        if !BOOST_LANG_CUDA
+#            error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
+#        endif
+#    endif
+
+#    if defined(ALPAKA_ACC_GPU_HIP_ENABLED)
+
+#        if BOOST_COMP_NVCC >= BOOST_VERSION_NUMBER(9, 0, 0)
+#            include <cuda_runtime_api.h>
+#        else
+#            if BOOST_COMP_HIP
+#                include <hip/math_functions.h>
+#            else
+#                include <math_functions.hpp>
+#            endif
+#        endif
+
+#        if !BOOST_LANG_HIP
+#            error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
+#        endif
+#    endif
+
+#    include <alpaka/core/Unused.hpp>
+#    include <alpaka/math/max/Traits.hpp>
+
+#    include <type_traits>
+
+namespace alpaka
+{
+    namespace math
+    {
+        //#############################################################################
+        //! The CUDA/HIP built in max.
+        class MaxUniformCudaHipBuiltIn : public concepts::Implements<ConceptMathMax, MaxUniformCudaHipBuiltIn>
+        {
+        };
+
+        namespace traits
+        {
+            //#############################################################################
+            //! The CUDA/HIP integral max trait specialization.
+            template<typename Tx, typename Ty>
+            struct Max<
+                MaxUniformCudaHipBuiltIn,
+                Tx,
+                Ty,
+                std::enable_if_t<std::is_integral<Tx>::value && std::is_integral<Ty>::value>>
+            {
+                __device__ static auto max(MaxUniformCudaHipBuiltIn const& max_ctx, Tx const& x, Ty const& y)
+                    -> decltype(::max(x, y))
+                {
+                    alpaka::ignore_unused(max_ctx);
+                    return ::max(x, y);
+                }
+            };
+            //#############################################################################
+            //! The CUDA/HIP mixed integral floating point max trait specialization.
+            template<typename Tx, typename Ty>
+            struct Max<
+                MaxUniformCudaHipBuiltIn,
+                Tx,
+                Ty,
+                std::enable_if_t<
+                    std::is_arithmetic<Tx>::value && std::is_arithmetic<Ty>::value
+                    && !(std::is_integral<Tx>::value && std::is_integral<Ty>::value)>>
+            {
+                __device__ static auto max(MaxUniformCudaHipBuiltIn const& max_ctx, Tx const& x, Ty const& y)
+                    -> decltype(::fmax(x, y))
+                {
+                    alpaka::ignore_unused(max_ctx);
+                    return ::fmax(x, y);
+                }
+            };
+        } // namespace traits
+    } // namespace math
+} // namespace alpaka
+
+#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/max/Traits.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/max/Traits.hpp
index 083c5e8e13..fc99ecf225 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/math/max/Traits.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/math/max/Traits.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Benjamin Worpitz
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -12,27 +12,23 @@
 #include <alpaka/core/Common.hpp>
 #include <alpaka/core/Concepts.hpp>
 
-#include <boost/config.hpp>
-
 #include <type_traits>
 
 namespace alpaka
 {
     namespace math
     {
-        struct ConceptMathMax;
+        struct ConceptMathMax
+        {
+        };
 
         namespace traits
         {
             //#############################################################################
             //! The max trait.
-            template<
-                typename T,
-                typename Tx,
-                typename Ty,
-                typename TSfinae = void>
+            template<typename T, typename Tx, typename Ty, typename TSfinae = void>
             struct Max;
-        }
+        } // namespace traits
 
         //-----------------------------------------------------------------------------
         //! Returns the larger of two arguments.
@@ -45,36 +41,11 @@ namespace alpaka
         //! \param x The first argument.
         //! \param y The second argument.
         ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            typename T,
-            typename Tx,
-            typename Ty>
-        ALPAKA_FN_HOST_ACC auto max(
-            T const & max_ctx,
-            Tx const & x,
-            Ty const & y)
-#ifdef BOOST_NO_CXX14_RETURN_TYPE_DEDUCTION
-        -> decltype(
-            traits::Max<
-                concepts::ImplementationBase<ConceptMathMax, T>,
-                Tx,
-                Ty>
-            ::max(
-                max_ctx,
-                x,
-                y))
-#endif
+        template<typename T, typename Tx, typename Ty>
+        ALPAKA_FN_HOST_ACC auto max(T const& max_ctx, Tx const& x, Ty const& y)
         {
             using ImplementationBase = concepts::ImplementationBase<ConceptMathMax, T>;
-            return
-                traits::Max<
-                    ImplementationBase,
-                    Tx,
-                    Ty>
-                ::max(
-                    max_ctx,
-                    x,
-                    y);
+            return traits::Max<ImplementationBase, Tx, Ty>::max(max_ctx, x, y);
         }
-    }
-}
+    } // namespace math
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/min/MinCudaBuiltIn.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/min/MinCudaBuiltIn.hpp
deleted file mode 100644
index e922f173e5..0000000000
--- a/thirdParty/cupla/alpaka/include/alpaka/math/min/MinCudaBuiltIn.hpp
+++ /dev/null
@@ -1,92 +0,0 @@
-/* Copyright 2019 Alexander Matthes, Axel Huebl, Benjamin Worpitz, Bert Wesarg
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_CUDA
-    #error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
-#endif
-
-#include <alpaka/math/min/Traits.hpp>
-
-#include <alpaka/core/Unused.hpp>
-
-#include <cuda_runtime.h>
-
-#include <type_traits>
-
-namespace alpaka
-{
-    namespace math
-    {
-        //#############################################################################
-        //! The CUDA built in min.
-        class MinCudaBuiltIn : public concepts::Implements<ConceptMathMin, MinCudaBuiltIn>
-        {
-        };
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The CUDA integral min trait specialization.
-            template<
-                typename Tx,
-                typename Ty>
-            struct Min<
-                MinCudaBuiltIn,
-                Tx,
-                Ty,
-                typename std::enable_if<
-                    std::is_integral<Tx>::value
-                    && std::is_integral<Ty>::value>::type>
-            {
-                __device__ static auto min(
-                    MinCudaBuiltIn const & min_ctx,
-                    Tx const & x,
-                    Ty const & y)
-                -> decltype(::min(x, y))
-                {
-                    alpaka::ignore_unused(min_ctx);
-                    return ::min(x, y);
-                }
-            };
-            //#############################################################################
-            //! The standard library mixed integral floating point min trait specialization.
-            template<
-                typename Tx,
-                typename Ty>
-            struct Min<
-                MinCudaBuiltIn,
-                Tx,
-                Ty,
-                typename std::enable_if<
-                    std::is_arithmetic<Tx>::value
-                    && std::is_arithmetic<Ty>::value
-                    && !(std::is_integral<Tx>::value
-                        && std::is_integral<Ty>::value)>::type>
-            {
-                __device__ static auto min(
-                    MinCudaBuiltIn const & min_ctx,
-                    Tx const & x,
-                    Ty const & y)
-                -> decltype(::fmin(x, y))
-                {
-                    alpaka::ignore_unused(min_ctx);
-                    return ::fmin(x, y);
-                }
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/min/MinHipBuiltIn.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/min/MinHipBuiltIn.hpp
deleted file mode 100644
index 36ca06feee..0000000000
--- a/thirdParty/cupla/alpaka/include/alpaka/math/min/MinHipBuiltIn.hpp
+++ /dev/null
@@ -1,102 +0,0 @@
-/* Copyright 2019 Alexander Matthes, Axel Huebl, Benjamin Worpitz, Bert Wesarg, Matthias Werner
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_HIP
-    #error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
-#endif
-
-#include <alpaka/math/min/Traits.hpp>
-
-#include <alpaka/core/Unused.hpp>
-
-#if BOOST_COMP_NVCC >= BOOST_VERSION_NUMBER(9, 0, 0)
-    #include <cuda_runtime_api.h>
-#else
-    #if BOOST_COMP_HCC || BOOST_COMP_HIP
-        #include <math_functions.h>
-    #else
-        #include <math_functions.hpp>
-    #endif
-#endif
-
-#include <type_traits>
-
-#include <algorithm>
-
-namespace alpaka
-{
-    namespace math
-    {
-        //#############################################################################
-        //! The HIP min.
-        class MinHipBuiltIn : public concepts::Implements<ConceptMathMin, MinHipBuiltIn>
-        {
-        };
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The HIP integral min trait specialization.
-            template<
-                typename Tx,
-                typename Ty>
-            struct Min<
-                MinHipBuiltIn,
-                Tx,
-                Ty,
-                typename std::enable_if<
-                    std::is_integral<Tx>::value
-                    && std::is_integral<Ty>::value>::type>
-            {
-                __device__ static auto min(
-                    MinHipBuiltIn const & min_ctx,
-                    Tx const & x,
-                    Ty const & y)
-                -> decltype(::min(x, y))
-                {
-                    alpaka::ignore_unused(min_ctx);
-                    return ::min(x, y);
-                }
-            };
-            //#############################################################################
-            //! The HIP mixed integral floating point min trait specialization.
-            template<
-                typename Tx,
-                typename Ty>
-            struct Min<
-                MinHipBuiltIn,
-                Tx,
-                Ty,
-                typename std::enable_if<
-                    std::is_arithmetic<Tx>::value
-                    && std::is_arithmetic<Ty>::value
-                    && !(std::is_integral<Tx>::value
-                        && std::is_integral<Ty>::value)>::type>
-            {
-                __device__ static auto min(
-                    MinHipBuiltIn const & min_ctx,
-                    Tx const & x,
-                    Ty const & y)
-                -> decltype(::fmin(x, y))
-                {
-                    alpaka::ignore_unused(min_ctx);
-                    return ::fmin(x, y);
-                }
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/min/MinStdLib.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/min/MinStdLib.hpp
index ec0c40fc74..0d2e3b3a3c 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/math/min/MinStdLib.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/math/min/MinStdLib.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Alexander Matthes, Axel Huebl, Benjamin Worpitz
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -9,13 +9,12 @@
 
 #pragma once
 
-#include <alpaka/math/min/Traits.hpp>
-
 #include <alpaka/core/Unused.hpp>
+#include <alpaka/math/min/Traits.hpp>
 
-#include <type_traits>
-#include <cmath>
 #include <algorithm>
+#include <cmath>
+#include <type_traits>
 
 namespace alpaka
 {
@@ -31,22 +30,10 @@ namespace alpaka
         {
             //#############################################################################
             //! The standard library integral min trait specialization.
-            template<
-                typename Tx,
-                typename Ty>
-            struct Min<
-                MinStdLib,
-                Tx,
-                Ty,
-                typename std::enable_if<
-                    std::is_integral<Tx>::value
-                    && std::is_integral<Ty>::value>::type>
+            template<typename Tx, typename Ty>
+            struct Min<MinStdLib, Tx, Ty, std::enable_if_t<std::is_integral<Tx>::value && std::is_integral<Ty>::value>>
             {
-                ALPAKA_FN_HOST static auto min(
-                    MinStdLib const & min_ctx,
-                    Tx const & x,
-                    Ty const & y)
-                -> decltype(std::min(x, y))
+                ALPAKA_FN_HOST static auto min(MinStdLib const& min_ctx, Tx const& x, Ty const& y)
                 {
                     alpaka::ignore_unused(min_ctx);
                     return std::min(x, y);
@@ -54,29 +41,21 @@ namespace alpaka
             };
             //#############################################################################
             //! The standard library mixed integral floating point min trait specialization.
-            template<
-                typename Tx,
-                typename Ty>
+            template<typename Tx, typename Ty>
             struct Min<
                 MinStdLib,
                 Tx,
                 Ty,
-                typename std::enable_if<
-                    std::is_arithmetic<Tx>::value
-                    && std::is_arithmetic<Ty>::value
-                    && !(std::is_integral<Tx>::value
-                        && std::is_integral<Ty>::value)>::type>
+                std::enable_if_t<
+                    std::is_arithmetic<Tx>::value && std::is_arithmetic<Ty>::value
+                    && !(std::is_integral<Tx>::value && std::is_integral<Ty>::value)>>
             {
-                ALPAKA_FN_HOST static auto min(
-                    MinStdLib const & min_ctx,
-                    Tx const & x,
-                    Ty const & y)
-                -> decltype(std::fmin(x, y))
+                ALPAKA_FN_HOST static auto min(MinStdLib const& min_ctx, Tx const& x, Ty const& y)
                 {
                     alpaka::ignore_unused(min_ctx);
                     return std::fmin(x, y);
                 }
             };
-        }
-    }
-}
+        } // namespace traits
+    } // namespace math
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/min/MinUniformCudaHipBuiltIn.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/min/MinUniformCudaHipBuiltIn.hpp
new file mode 100644
index 0000000000..5d0efb1c44
--- /dev/null
+++ b/thirdParty/cupla/alpaka/include/alpaka/math/min/MinUniformCudaHipBuiltIn.hpp
@@ -0,0 +1,95 @@
+/* Copyright 2019 Alexander Matthes, Axel Huebl, Benjamin Worpitz, Bert Wesarg
+ *
+ * This file is part of alpaka.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#pragma once
+
+#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
+
+#    include <alpaka/core/BoostPredef.hpp>
+
+#    if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
+#        include <cuda_runtime.h>
+#        if !BOOST_LANG_CUDA
+#            error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
+#        endif
+#    endif
+
+#    if defined(ALPAKA_ACC_GPU_HIP_ENABLED)
+
+#        if BOOST_COMP_NVCC >= BOOST_VERSION_NUMBER(9, 0, 0)
+#            include <cuda_runtime_api.h>
+#        else
+#            if BOOST_COMP_HIP
+#                include <hip/math_functions.h>
+#            else
+#                include <math_functions.hpp>
+#            endif
+#        endif
+
+#        if !BOOST_LANG_HIP
+#            error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
+#        endif
+#    endif
+
+#    include <alpaka/core/Unused.hpp>
+#    include <alpaka/math/min/Traits.hpp>
+
+#    include <type_traits>
+
+namespace alpaka
+{
+    namespace math
+    {
+        //#############################################################################
+        //! The CUDA/HIP built in min.
+        class MinUniformCudaHipBuiltIn : public concepts::Implements<ConceptMathMin, MinUniformCudaHipBuiltIn>
+        {
+        };
+
+        namespace traits
+        {
+            //#############################################################################
+            //! The CUDA/HIP integral min trait specialization.
+            template<typename Tx, typename Ty>
+            struct Min<
+                MinUniformCudaHipBuiltIn,
+                Tx,
+                Ty,
+                std::enable_if_t<std::is_integral<Tx>::value && std::is_integral<Ty>::value>>
+            {
+                __device__ static auto min(MinUniformCudaHipBuiltIn const& min_ctx, Tx const& x, Ty const& y)
+                    -> decltype(::min(x, y))
+                {
+                    alpaka::ignore_unused(min_ctx);
+                    return ::min(x, y);
+                }
+            };
+            //#############################################################################
+            //! The CUDA/HIP mixed integral floating point min trait specialization.
+            template<typename Tx, typename Ty>
+            struct Min<
+                MinUniformCudaHipBuiltIn,
+                Tx,
+                Ty,
+                std::enable_if_t<
+                    std::is_arithmetic<Tx>::value && std::is_arithmetic<Ty>::value
+                    && !(std::is_integral<Tx>::value && std::is_integral<Ty>::value)>>
+            {
+                __device__ static auto min(MinUniformCudaHipBuiltIn const& min_ctx, Tx const& x, Ty const& y)
+                    -> decltype(::fmin(x, y))
+                {
+                    alpaka::ignore_unused(min_ctx);
+                    return ::fmin(x, y);
+                }
+            };
+        } // namespace traits
+    } // namespace math
+} // namespace alpaka
+
+#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/min/Traits.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/min/Traits.hpp
index 913e0d93c7..86a7aeee5c 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/math/min/Traits.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/math/min/Traits.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Benjamin Worpitz
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -12,27 +12,23 @@
 #include <alpaka/core/Common.hpp>
 #include <alpaka/core/Concepts.hpp>
 
-#include <boost/config.hpp>
-
 #include <type_traits>
 
 namespace alpaka
 {
     namespace math
     {
-        struct ConceptMathMin;
+        struct ConceptMathMin
+        {
+        };
 
         namespace traits
         {
             //#############################################################################
             //! The min trait.
-            template<
-                typename T,
-                typename Tx,
-                typename Ty,
-                typename TSfinae = void>
+            template<typename T, typename Tx, typename Ty, typename TSfinae = void>
             struct Min;
-        }
+        } // namespace traits
 
         //-----------------------------------------------------------------------------
         //! Returns the smaller of two arguments.
@@ -45,36 +41,11 @@ namespace alpaka
         //! \param x The first argument.
         //! \param y The second argument.
         ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            typename T,
-            typename Tx,
-            typename Ty>
-        ALPAKA_FN_HOST_ACC auto min(
-            T const & min_ctx,
-            Tx const & x,
-            Ty const & y)
-#ifdef BOOST_NO_CXX14_RETURN_TYPE_DEDUCTION
-        -> decltype(
-            traits::Min<
-                concepts::ImplementationBase<ConceptMathMin, T>,
-                Tx,
-                Ty>
-            ::min(
-                min_ctx,
-                x,
-                y))
-#endif
+        template<typename T, typename Tx, typename Ty>
+        ALPAKA_FN_HOST_ACC auto min(T const& min_ctx, Tx const& x, Ty const& y)
         {
             using ImplementationBase = concepts::ImplementationBase<ConceptMathMin, T>;
-            return
-                traits::Min<
-                    ImplementationBase,
-                    Tx,
-                    Ty>
-                ::min(
-                    min_ctx,
-                    x,
-                    y);
+            return traits::Min<ImplementationBase, Tx, Ty>::min(min_ctx, x, y);
         }
-    }
-}
+    } // namespace math
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/pow/PowCudaBuiltIn.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/pow/PowCudaBuiltIn.hpp
deleted file mode 100644
index bb1e41a48b..0000000000
--- a/thirdParty/cupla/alpaka/include/alpaka/math/pow/PowCudaBuiltIn.hpp
+++ /dev/null
@@ -1,84 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Bert Wesarg
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_CUDA
-    #error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
-#endif
-
-#include <alpaka/math/pow/Traits.hpp>
-
-#include <alpaka/core/Unused.hpp>
-
-#include <cuda_runtime.h>
-
-#include <type_traits>
-
-namespace alpaka
-{
-    namespace math
-    {
-        //#############################################################################
-        //! The CUDA built in pow.
-        class PowCudaBuiltIn : public concepts::Implements<ConceptMathPow, PowCudaBuiltIn>
-        {
-        };
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The CUDA pow trait specialization.
-            template<
-                typename TBase,
-                typename TExp>
-            struct Pow<
-                PowCudaBuiltIn,
-                TBase,
-                TExp,
-                typename std::enable_if<
-                    std::is_floating_point<TBase>::value
-                    && std::is_floating_point<TExp>::value>::type>
-            {
-                __device__ static auto pow(
-                    PowCudaBuiltIn const & pow_ctx,
-                    TBase const & base,
-                    TExp const & exp)
-                -> decltype(::pow(base, exp))
-                {
-                    alpaka::ignore_unused(pow_ctx);
-                    return ::pow(base, exp);
-                }
-            };
-            //! The CUDA pow float specialization.
-            template<>
-            struct Pow<
-                PowCudaBuiltIn,
-                float,
-                float>
-            {
-                __device__ static auto pow(
-                    PowCudaBuiltIn const & pow_ctx,
-                    float const & base,
-                    float const & exp)
-                -> float
-                {
-                    alpaka::ignore_unused(pow_ctx);
-                    return ::powf(base, exp);
-                }
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/pow/PowHipBuiltIn.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/pow/PowHipBuiltIn.hpp
deleted file mode 100644
index de6642f5be..0000000000
--- a/thirdParty/cupla/alpaka/include/alpaka/math/pow/PowHipBuiltIn.hpp
+++ /dev/null
@@ -1,92 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Bert Wesarg, Matthias Werner
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_HIP
-    #error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
-#endif
-
-#include <alpaka/math/pow/Traits.hpp>
-
-#include <alpaka/core/Unused.hpp>
-
-#if BOOST_COMP_NVCC >= BOOST_VERSION_NUMBER(9, 0, 0)
-    #include <cuda_runtime_api.h>
-#else
-    #if BOOST_COMP_HCC || BOOST_COMP_HIP
-        #include <math_functions.h>
-    #else
-        #include <math_functions.hpp>
-    #endif
-#endif
-
-#include <type_traits>
-
-namespace alpaka
-{
-    namespace math
-    {
-        //#############################################################################
-        //! The HIP pow.
-        class PowHipBuiltIn : public concepts::Implements<ConceptMathPow, PowHipBuiltIn>
-        {
-        };
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The HIP pow trait specialization.
-            template<
-                typename TBase,
-                typename TExp>
-            struct Pow<
-                PowHipBuiltIn,
-                TBase,
-                TExp,
-                typename std::enable_if<
-                    std::is_floating_point<TBase>::value
-                    && std::is_floating_point<TExp>::value>::type>
-            {
-                __device__ static auto pow(
-                    PowHipBuiltIn const & pow_ctx,
-                    TBase const & base,
-                    TExp const & exp)
-                -> decltype(::pow(base, exp))
-                {
-                    alpaka::ignore_unused(pow_ctx);
-                    return ::pow(base, exp);
-                }
-            };
-            //! The HIP pow float specialization.
-            template<>
-            struct Pow<
-                PowHipBuiltIn,
-                float,
-                float>
-            {
-                __device__ static auto pow(
-                    PowHipBuiltIn const & pow_ctx,
-                    float const & base,
-                    float const & exp)
-                -> float
-                {
-                    alpaka::ignore_unused(pow_ctx);
-                    return ::powf(base, exp);
-                }
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/pow/PowStdLib.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/pow/PowStdLib.hpp
index 219432ab8b..a684d6bdb7 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/math/pow/PowStdLib.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/math/pow/PowStdLib.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Axel Huebl, Benjamin Worpitz
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -9,12 +9,11 @@
 
 #pragma once
 
-#include <alpaka/math/pow/Traits.hpp>
-
 #include <alpaka/core/Unused.hpp>
+#include <alpaka/math/pow/Traits.hpp>
 
-#include <type_traits>
 #include <cmath>
+#include <type_traits>
 
 namespace alpaka
 {
@@ -30,27 +29,19 @@ namespace alpaka
         {
             //#############################################################################
             //! The standard library pow trait specialization.
-            template<
-                typename TBase,
-                typename TExp>
+            template<typename TBase, typename TExp>
             struct Pow<
                 PowStdLib,
                 TBase,
                 TExp,
-                typename std::enable_if<
-                    std::is_arithmetic<TBase>::value
-                    && std::is_arithmetic<TExp>::value>::type>
+                std::enable_if_t<std::is_arithmetic<TBase>::value && std::is_arithmetic<TExp>::value>>
             {
-                ALPAKA_FN_HOST static auto pow(
-                    PowStdLib const & pow_ctx,
-                    TBase const & base,
-                    TExp const & exp)
-                -> decltype(std::pow(base, exp))
+                ALPAKA_FN_HOST static auto pow(PowStdLib const& pow_ctx, TBase const& base, TExp const& exp)
                 {
                     alpaka::ignore_unused(pow_ctx);
                     return std::pow(base, exp);
                 }
             };
-        }
-    }
-}
+        } // namespace traits
+    } // namespace math
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/pow/PowUniformCudaHipBuiltIn.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/pow/PowUniformCudaHipBuiltIn.hpp
new file mode 100644
index 0000000000..3b0300cbfd
--- /dev/null
+++ b/thirdParty/cupla/alpaka/include/alpaka/math/pow/PowUniformCudaHipBuiltIn.hpp
@@ -0,0 +1,89 @@
+/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Bert Wesarg
+ *
+ * This file is part of alpaka.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#pragma once
+
+#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
+
+#    include <alpaka/core/BoostPredef.hpp>
+
+#    if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
+#        include <cuda_runtime.h>
+#        if !BOOST_LANG_CUDA
+#            error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
+#        endif
+#    endif
+
+#    if defined(ALPAKA_ACC_GPU_HIP_ENABLED)
+
+#        if BOOST_COMP_NVCC >= BOOST_VERSION_NUMBER(9, 0, 0)
+#            include <cuda_runtime_api.h>
+#        else
+#            if BOOST_COMP_HIP
+#                include <hip/math_functions.h>
+#            else
+#                include <math_functions.hpp>
+#            endif
+#        endif
+
+#        if !BOOST_LANG_HIP
+#            error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
+#        endif
+#    endif
+
+#    include <alpaka/core/Unused.hpp>
+#    include <alpaka/math/pow/Traits.hpp>
+
+#    include <type_traits>
+
+namespace alpaka
+{
+    namespace math
+    {
+        //#############################################################################
+        //! The CUDA/HIP built in pow.
+        class PowUniformCudaHipBuiltIn : public concepts::Implements<ConceptMathPow, PowUniformCudaHipBuiltIn>
+        {
+        };
+
+        namespace traits
+        {
+            //#############################################################################
+            //! The CUDA/HIP pow trait specialization for floating point arguments (forwards to ::pow).
+            template<typename TBase, typename TExp>
+            struct Pow<
+                PowUniformCudaHipBuiltIn,
+                TBase,
+                TExp,
+                std::enable_if_t<std::is_floating_point<TBase>::value && std::is_floating_point<TExp>::value>>
+            {
+                __device__ static auto pow(PowUniformCudaHipBuiltIn const& pow_ctx, TBase const& base, TExp const& exp)
+                {
+                    alpaka::ignore_unused(pow_ctx);
+                    return ::pow(base, exp);
+                }
+            };
+            //! The CUDA/HIP pow float specialization (forwards to ::powf).
+            template<>
+            struct Pow<PowUniformCudaHipBuiltIn, float, float>
+            {
+                __device__ static auto pow(
+                    PowUniformCudaHipBuiltIn const& pow_ctx,
+                    float const& base,
+                    float const& exp) -> float
+                {
+                    alpaka::ignore_unused(pow_ctx);
+                    return ::powf(base, exp);
+                }
+            };
+        } // namespace traits
+    } // namespace math
+} // namespace alpaka
+
+#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/pow/Traits.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/pow/Traits.hpp
index f45629cf03..f5940ec8e6 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/math/pow/Traits.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/math/pow/Traits.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Benjamin Worpitz
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -12,31 +12,31 @@
 #include <alpaka/core/Common.hpp>
 #include <alpaka/core/Concepts.hpp>
 
-#include <boost/config.hpp>
-
 #include <type_traits>
 
 namespace alpaka
 {
     namespace math
     {
-        struct ConceptMathPow;
+        struct ConceptMathPow
+        {
+        };
 
         namespace traits
         {
             //#############################################################################
             //! The pow trait.
-            template<
-                typename T,
-                typename TBase,
-                typename TExp,
-                typename TSfinae = void>
+            template<typename T, typename TBase, typename TExp, typename TSfinae = void>
             struct Pow;
-        }
+        } // namespace traits
 
         //-----------------------------------------------------------------------------
         //! Computes the value of base raised to the power exp.
         //!
+        //! Valid real arguments for base are non-negative. For other values
+        //! the result may depend on the backend and compilation options and
+        //! will likely be NaN.
+        //!
         //! \tparam T The type of the object specializing Pow.
         //! \tparam TBase The base type.
         //! \tparam TExp The exponent type.
@@ -44,36 +44,11 @@ namespace alpaka
         //! \param base The base.
         //! \param exp The exponent.
         ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            typename T,
-            typename TBase,
-            typename TExp>
-        ALPAKA_FN_HOST_ACC auto pow(
-            T const & pow_ctx,
-            TBase const & base,
-            TExp const & exp)
-#ifdef BOOST_NO_CXX14_RETURN_TYPE_DEDUCTION
-        -> decltype(
-            traits::Pow<
-                concepts::ImplementationBase<ConceptMathPow, T>,
-                TBase,
-                TExp>
-            ::pow(
-                pow_ctx,
-                base,
-                exp))
-#endif
+        template<typename T, typename TBase, typename TExp>
+        ALPAKA_FN_HOST_ACC auto pow(T const& pow_ctx, TBase const& base, TExp const& exp)
         {
             using ImplementationBase = concepts::ImplementationBase<ConceptMathPow, T>;
-            return
-                traits::Pow<
-                    ImplementationBase,
-                    TBase,
-                    TExp>
-                ::pow(
-                    pow_ctx,
-                    base,
-                    exp);
+            return traits::Pow<ImplementationBase, TBase, TExp>::pow(pow_ctx, base, exp);
         }
-    }
-}
+    } // namespace math
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/remainder/RemainderCudaBuiltIn.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/remainder/RemainderCudaBuiltIn.hpp
deleted file mode 100644
index ae7db890ae..0000000000
--- a/thirdParty/cupla/alpaka/include/alpaka/math/remainder/RemainderCudaBuiltIn.hpp
+++ /dev/null
@@ -1,90 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Bert Wesarg
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_CUDA
-    #error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
-#endif
-
-#include <alpaka/math/remainder/Traits.hpp>
-
-#include <alpaka/core/Unused.hpp>
-
-#include <cuda_runtime.h>
-
-#include <type_traits>
-
-namespace alpaka
-{
-    namespace math
-    {
-        //#############################################################################
-        //! The CUDA built in remainder.
-        class RemainderCudaBuiltIn : public concepts::Implements<ConceptMathRemainder, RemainderCudaBuiltIn>
-        {
-        };
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The CUDA remainder trait specialization.
-            template<
-                typename Tx,
-                typename Ty>
-            struct Remainder<
-                RemainderCudaBuiltIn,
-                Tx,
-                Ty,
-                typename std::enable_if<
-                    std::is_floating_point<Tx>::value
-                    && std::is_floating_point<Ty>::value>::type>
-            {
-                __device__ static auto remainder(
-                    RemainderCudaBuiltIn const & remainder_ctx,
-                    Tx const & x,
-                    Ty const & y)
-                -> decltype(::remainder(
-                    x,
-                    y))
-                {
-                    alpaka::ignore_unused(remainder_ctx);
-                    return ::remainder(
-                        x,
-                        y);
-                }
-            };
-            //! The CUDA remainder float specialization.
-            template<>
-            struct Remainder<
-                RemainderCudaBuiltIn,
-                float,
-                float>
-            {
-                __device__ static auto remainder(
-                    RemainderCudaBuiltIn const & remainder_ctx,
-                    float const & x,
-                    float const & y)
-                -> float
-                {
-                    alpaka::ignore_unused(remainder_ctx);
-                    return ::remainderf(
-                        x,
-                        y);
-                }
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/remainder/RemainderHipBuiltIn.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/remainder/RemainderHipBuiltIn.hpp
deleted file mode 100644
index 2e3fe8ac8b..0000000000
--- a/thirdParty/cupla/alpaka/include/alpaka/math/remainder/RemainderHipBuiltIn.hpp
+++ /dev/null
@@ -1,94 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Bert Wesarg, Matthias Werner
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_HIP
-    #error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
-#endif
-
-#include <alpaka/math/remainder/Traits.hpp>
-
-#include <alpaka/core/Unused.hpp>
-
-#if BOOST_COMP_NVCC >= BOOST_VERSION_NUMBER(9, 0, 0)
-    #include <cuda_runtime_api.h>
-#else
-    #if BOOST_COMP_HCC || BOOST_COMP_HIP
-        #include <math_functions.h>
-    #else
-        #include <math_functions.hpp>
-    #endif
-#endif
-
-#include <type_traits>
-
-namespace alpaka
-{
-    namespace math
-    {
-        //#############################################################################
-        //! The HIP remainder.
-        class RemainderHipBuiltIn : public concepts::Implements<ConceptMathRemainder, RemainderHipBuiltIn>
-        {
-        };
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The HIP remainder trait specialization.
-            template<
-                typename Tx,
-                typename Ty>
-            struct Remainder<
-                RemainderHipBuiltIn,
-                Tx,
-                Ty,
-                typename std::enable_if<
-                    std::is_floating_point<Tx>::value
-                    && std::is_floating_point<Ty>::value>::type>
-            {
-                __device__ static auto remainder(
-                    RemainderHipBuiltIn const & remainder_ctx,
-                    Tx const & x,
-                    Ty const & y)
-                -> decltype(::remainder(x, y))
-                {
-                    alpaka::ignore_unused(remainder_ctx);
-                    return ::remainder(x, y);
-                }
-            };
-            //! The HIP remainder float specialization.
-            template<>
-            struct Remainder<
-                RemainderHipBuiltIn,
-                float,
-                float>
-            {
-                __device__ static auto remainder(
-                    RemainderHipBuiltIn const & remainder_ctx,
-                    float const & x,
-                    float const & y)
-                -> float
-                {
-                    alpaka::ignore_unused(remainder_ctx);
-                    return ::remainderf(
-                        x,
-                        y);
-                }
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/remainder/RemainderStdLib.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/remainder/RemainderStdLib.hpp
index f7e21cbbc5..22c4c701cc 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/math/remainder/RemainderStdLib.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/math/remainder/RemainderStdLib.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Axel Huebl, Benjamin Worpitz
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -9,12 +9,11 @@
 
 #pragma once
 
-#include <alpaka/math/remainder/Traits.hpp>
-
 #include <alpaka/core/Unused.hpp>
+#include <alpaka/math/remainder/Traits.hpp>
 
-#include <type_traits>
 #include <cmath>
+#include <type_traits>
 
 namespace alpaka
 {
@@ -30,27 +29,19 @@ namespace alpaka
         {
             //#############################################################################
             //! The standard library remainder trait specialization.
-            template<
-                typename Tx,
-                typename Ty>
+            template<typename Tx, typename Ty>
             struct Remainder<
                 RemainderStdLib,
                 Tx,
                 Ty,
-                typename std::enable_if<
-                    std::is_floating_point<Tx>::value
-                    && std::is_floating_point<Ty>::value>::type>
+                std::enable_if_t<std::is_floating_point<Tx>::value && std::is_floating_point<Ty>::value>>
             {
-                ALPAKA_FN_HOST static auto remainder(
-                    RemainderStdLib const & remainder_ctx,
-                    Tx const & x,
-                    Ty const & y)
-                -> decltype(std::remainder(x, y))
+                ALPAKA_FN_HOST static auto remainder(RemainderStdLib const& remainder_ctx, Tx const& x, Ty const& y)
                 {
                     alpaka::ignore_unused(remainder_ctx);
                     return std::remainder(x, y);
                 }
             };
-        }
-    }
-}
+        } // namespace traits
+    } // namespace math
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/remainder/RemainderUniformCudaHipBuiltIn.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/remainder/RemainderUniformCudaHipBuiltIn.hpp
new file mode 100644
index 0000000000..af038989c0
--- /dev/null
+++ b/thirdParty/cupla/alpaka/include/alpaka/math/remainder/RemainderUniformCudaHipBuiltIn.hpp
@@ -0,0 +1,93 @@
+/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Bert Wesarg
+ *
+ * This file is part of alpaka.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#pragma once
+
+#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
+
+#    include <alpaka/core/BoostPredef.hpp>
+
+#    if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
+#        include <cuda_runtime.h>
+#        if !BOOST_LANG_CUDA
+#            error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
+#        endif
+#    endif
+
+#    if defined(ALPAKA_ACC_GPU_HIP_ENABLED)
+
+#        if BOOST_COMP_NVCC >= BOOST_VERSION_NUMBER(9, 0, 0)
+#            include <cuda_runtime_api.h>
+#        else
+#            if BOOST_COMP_HIP
+#                include <hip/math_functions.h>
+#            else
+#                include <math_functions.hpp>
+#            endif
+#        endif
+
+#        if !BOOST_LANG_HIP
+#            error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
+#        endif
+#    endif
+
+#    include <alpaka/core/Unused.hpp>
+#    include <alpaka/math/remainder/Traits.hpp>
+
+#    include <type_traits>
+
+namespace alpaka
+{
+    namespace math
+    {
+        //#############################################################################
+        //! The CUDA/HIP built in remainder.
+        class RemainderUniformCudaHipBuiltIn
+            : public concepts::Implements<ConceptMathRemainder, RemainderUniformCudaHipBuiltIn>
+        {
+        };
+
+        namespace traits
+        {
+            //#############################################################################
+            //! The CUDA/HIP remainder trait specialization for floating point arguments (forwards to ::remainder).
+            template<typename Tx, typename Ty>
+            struct Remainder<
+                RemainderUniformCudaHipBuiltIn,
+                Tx,
+                Ty,
+                std::enable_if_t<std::is_floating_point<Tx>::value && std::is_floating_point<Ty>::value>>
+            {
+                __device__ static auto remainder(
+                    RemainderUniformCudaHipBuiltIn const& remainder_ctx,
+                    Tx const& x,
+                    Ty const& y)
+                {
+                    alpaka::ignore_unused(remainder_ctx);
+                    return ::remainder(x, y);
+                }
+            };
+            //! The CUDA/HIP remainder float specialization (forwards to ::remainderf).
+            template<>
+            struct Remainder<RemainderUniformCudaHipBuiltIn, float, float>
+            {
+                __device__ static auto remainder(
+                    RemainderUniformCudaHipBuiltIn const& remainder_ctx,
+                    float const& x,
+                    float const& y) -> float
+                {
+                    alpaka::ignore_unused(remainder_ctx);
+                    return ::remainderf(x, y);
+                }
+            };
+        } // namespace traits
+    } // namespace math
+} // namespace alpaka
+
+#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/remainder/Traits.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/remainder/Traits.hpp
index 9300bf6fdc..2e82a045d5 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/math/remainder/Traits.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/math/remainder/Traits.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Benjamin Worpitz
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -12,27 +12,23 @@
 #include <alpaka/core/Common.hpp>
 #include <alpaka/core/Concepts.hpp>
 
-#include <boost/config.hpp>
-
 #include <type_traits>
 
 namespace alpaka
 {
     namespace math
     {
-        struct ConceptMathRemainder;
+        struct ConceptMathRemainder
+        {
+        };
 
         namespace traits
         {
             //#############################################################################
             //! The remainder trait.
-            template<
-                typename T,
-                typename Tx,
-                typename Ty,
-                typename TSfinae = void>
+            template<typename T, typename Tx, typename Ty, typename TSfinae = void>
             struct Remainder;
-        }
+        } // namespace traits
 
         //-----------------------------------------------------------------------------
         //! Computes the IEEE remainder of the floating point division operation x/y.
@@ -44,36 +40,11 @@ namespace alpaka
         //! \param x The first argument.
         //! \param y The second argument.
         ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            typename T,
-            typename Tx,
-            typename Ty>
-        ALPAKA_FN_HOST_ACC auto remainder(
-            T const & remainder_ctx,
-            Tx const & x,
-            Ty const & y)
-#ifdef BOOST_NO_CXX14_RETURN_TYPE_DEDUCTION
-        -> decltype(
-            traits::Remainder<
-                concepts::ImplementationBase<ConceptMathRemainder, T>,
-                Tx,
-                Ty>
-            ::remainder(
-                remainder_ctx,
-                x,
-                y))
-#endif
+        template<typename T, typename Tx, typename Ty>
+        ALPAKA_FN_HOST_ACC auto remainder(T const& remainder_ctx, Tx const& x, Ty const& y)
         {
             using ImplementationBase = concepts::ImplementationBase<ConceptMathRemainder, T>;
-            return
-                traits::Remainder<
-                    ImplementationBase,
-                    Tx,
-                    Ty>
-                ::remainder(
-                    remainder_ctx,
-                    x,
-                    y);
+            return traits::Remainder<ImplementationBase, Tx, Ty>::remainder(remainder_ctx, x, y);
         }
-    }
-}
+    } // namespace math
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/round/RoundCudaBuiltIn.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/round/RoundCudaBuiltIn.hpp
deleted file mode 100644
index 883f39584c..0000000000
--- a/thirdParty/cupla/alpaka/include/alpaka/math/round/RoundCudaBuiltIn.hpp
+++ /dev/null
@@ -1,116 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Bert Wesarg
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_CUDA
-    #error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
-#endif
-
-#include <alpaka/math/round/Traits.hpp>
-
-#include <alpaka/core/Unused.hpp>
-
-#include <cuda_runtime.h>
-
-#include <type_traits>
-
-namespace alpaka
-{
-    namespace math
-    {
-        //#############################################################################
-        //! The CUDA round.
-        class RoundCudaBuiltIn : public concepts::Implements<ConceptMathRound, RoundCudaBuiltIn>
-        {
-        };
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The CUDA round trait specialization.
-            template<
-                typename TArg>
-            struct Round<
-                RoundCudaBuiltIn,
-                TArg,
-                typename std::enable_if<
-                    std::is_floating_point<TArg>::value>::type>
-            {
-                __device__ static auto round(
-                    RoundCudaBuiltIn const & round_ctx,
-                    TArg const & arg)
-                -> decltype(::round(arg))
-                {
-                    alpaka::ignore_unused(round_ctx);
-                    return ::round(arg);
-                }
-            };
-            //#############################################################################
-            //! The CUDA lround trait specialization.
-            template<
-                typename TArg>
-            struct Lround<
-                RoundCudaBuiltIn,
-                TArg,
-                typename std::enable_if<
-                    std::is_floating_point<TArg>::value>::type>
-            {
-                __device__ static auto lround(
-                    RoundCudaBuiltIn const & lround_ctx,
-                    TArg const & arg)
-                -> long int
-                {
-                    alpaka::ignore_unused(lround_ctx);
-                    return ::lround(arg);
-                }
-            };
-            //#############################################################################
-            //! The CUDA llround trait specialization.
-            template<
-                typename TArg>
-            struct Llround<
-                RoundCudaBuiltIn,
-                TArg,
-                typename std::enable_if<
-                    std::is_floating_point<TArg>::value>::type>
-            {
-                __device__ static auto llround(
-                    RoundCudaBuiltIn const & llround_ctx,
-                    TArg const & arg)
-                -> long int
-                {
-                    alpaka::ignore_unused(llround_ctx);
-                    return ::llround(arg);
-                }
-            };
-            //! The CUDA round float specialization.
-            template<>
-            struct Round<
-                RoundCudaBuiltIn,
-                float>
-            {
-                __device__ static auto round(
-                    RoundCudaBuiltIn const & round_ctx,
-                    float const & arg)
-                -> float
-                {
-                    alpaka::ignore_unused(round_ctx);
-                    return ::roundf(arg);
-                }
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/round/RoundHipBuiltIn.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/round/RoundHipBuiltIn.hpp
deleted file mode 100644
index 7e1aeb798b..0000000000
--- a/thirdParty/cupla/alpaka/include/alpaka/math/round/RoundHipBuiltIn.hpp
+++ /dev/null
@@ -1,124 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Bert Wesarg, Matthias Werner
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_HIP
-    #error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
-#endif
-
-#include <alpaka/math/round/Traits.hpp>
-
-#include <alpaka/core/Unused.hpp>
-
-#if BOOST_COMP_NVCC >= BOOST_VERSION_NUMBER(9, 0, 0)
-    #include <cuda_runtime_api.h>
-#else
-    #if BOOST_COMP_HCC || BOOST_COMP_HIP
-        #include <math_functions.h>
-    #else
-        #include <math_functions.hpp>
-    #endif
-#endif
-
-#include <type_traits>
-
-namespace alpaka
-{
-    namespace math
-    {
-        //#############################################################################
-        //! The HIP round.
-        class RoundHipBuiltIn : public concepts::Implements<ConceptMathRound, RoundHipBuiltIn>
-        {
-        };
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The HIP round trait specialization.
-            template<
-                typename TArg>
-            struct Round<
-                RoundHipBuiltIn,
-                TArg,
-                typename std::enable_if<
-                    std::is_floating_point<TArg>::value>::type>
-            {
-                __device__ static auto round(
-                    RoundHipBuiltIn const & round_ctx,
-                    TArg const & arg)
-                -> decltype(::round(arg))
-                {
-                    alpaka::ignore_unused(round_ctx);
-                    return ::round(arg);
-                }
-            };
-            //#############################################################################
-            //! The HIP round trait specialization.
-            template<
-                typename TArg>
-            struct Lround<
-                RoundHipBuiltIn,
-                TArg,
-                typename std::enable_if<
-                    std::is_floating_point<TArg>::value>::type>
-            {
-                __device__ static auto lround(
-                    RoundHipBuiltIn const & lround_ctx,
-                    TArg const & arg)
-                -> long int
-                {
-                    alpaka::ignore_unused(lround_ctx);
-                    return ::lround(arg);
-                }
-            };
-            //#############################################################################
-            //! The standard library round trait specialization.
-            template<
-                typename TArg>
-            struct Llround<
-                RoundHipBuiltIn,
-                TArg,
-                typename std::enable_if<
-                    std::is_floating_point<TArg>::value>::type>
-            {
-                __device__ static auto llround(
-                    RoundHipBuiltIn const & llround_ctx,
-                    TArg const & arg)
-                -> long int
-                {
-                    alpaka::ignore_unused(llround_ctx);
-                    return ::llround(arg);
-                }
-            };
-
-            template<>
-            struct Round<
-                RoundHipBuiltIn,
-                float>
-            {
-                __device__ static auto round(
-                    RoundHipBuiltIn const & round_ctx,
-                    float const & arg)
-                -> float
-                {
-                    alpaka::ignore_unused(round_ctx);
-                    return ::roundf(arg);
-                }
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/round/RoundStdLib.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/round/RoundStdLib.hpp
index b16ae63e03..a8ca736e8b 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/math/round/RoundStdLib.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/math/round/RoundStdLib.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Axel Huebl, Benjamin Worpitz
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -9,12 +9,11 @@
 
 #pragma once
 
-#include <alpaka/math/round/Traits.hpp>
-
 #include <alpaka/core/Unused.hpp>
+#include <alpaka/math/round/Traits.hpp>
 
-#include <type_traits>
 #include <cmath>
+#include <type_traits>
 
 namespace alpaka
 {
@@ -30,18 +29,10 @@ namespace alpaka
         {
             //#############################################################################
             //! The standard library round trait specialization.
-            template<
-                typename TArg>
-            struct Round<
-                RoundStdLib,
-                TArg,
-                typename std::enable_if<
-                    std::is_arithmetic<TArg>::value>::type>
+            template<typename TArg>
+            struct Round<RoundStdLib, TArg, std::enable_if_t<std::is_arithmetic<TArg>::value>>
             {
-                ALPAKA_FN_HOST static auto round(
-                    RoundStdLib const & round_ctx,
-                    TArg const & arg)
-                -> decltype(std::round(arg))
+                ALPAKA_FN_HOST static auto round(RoundStdLib const& round_ctx, TArg const& arg)
                 {
                     alpaka::ignore_unused(round_ctx);
                     return std::round(arg);
@@ -49,18 +40,10 @@ namespace alpaka
             };
             //#############################################################################
             //! The standard library round trait specialization.
-            template<
-                typename TArg>
-            struct Lround<
-                RoundStdLib,
-                TArg,
-                typename std::enable_if<
-                    std::is_arithmetic<TArg>::value>::type>
+            template<typename TArg>
+            struct Lround<RoundStdLib, TArg, std::enable_if_t<std::is_arithmetic<TArg>::value>>
             {
-                ALPAKA_FN_HOST static auto lround(
-                    RoundStdLib const & lround_ctx,
-                    TArg const & arg)
-                -> long int
+                ALPAKA_FN_HOST static auto lround(RoundStdLib const& lround_ctx, TArg const& arg) -> long int
                 {
                     alpaka::ignore_unused(lround_ctx);
                     return std::lround(arg);
@@ -68,23 +51,15 @@ namespace alpaka
             };
             //#############################################################################
             //! The standard library round trait specialization.
-            template<
-                typename TArg>
-            struct Llround<
-                RoundStdLib,
-                TArg,
-                typename std::enable_if<
-                    std::is_arithmetic<TArg>::value>::type>
+            template<typename TArg>
+            struct Llround<RoundStdLib, TArg, std::enable_if_t<std::is_arithmetic<TArg>::value>>
             {
-                ALPAKA_FN_HOST static auto llround(
-                    RoundStdLib const & llround_ctx,
-                    TArg const & arg)
-                -> long int
+                ALPAKA_FN_HOST static auto llround(RoundStdLib const& llround_ctx, TArg const& arg) -> long long int
                 {
                     alpaka::ignore_unused(llround_ctx);
                     return std::llround(arg);
                 }
             };
-        }
-    }
-}
+        } // namespace traits
+    } // namespace math
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/round/RoundUniformCudaHipBuiltIn.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/round/RoundUniformCudaHipBuiltIn.hpp
new file mode 100644
index 0000000000..10c3144220
--- /dev/null
+++ b/thirdParty/cupla/alpaka/include/alpaka/math/round/RoundUniformCudaHipBuiltIn.hpp
@@ -0,0 +1,106 @@
+/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Bert Wesarg
+ *
+ * This file is part of alpaka.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#pragma once
+
+#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
+
+#    include <alpaka/core/BoostPredef.hpp>
+
+#    if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
+#        include <cuda_runtime.h>
+#        if !BOOST_LANG_CUDA
+#            error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
+#        endif
+#    endif
+
+#    if defined(ALPAKA_ACC_GPU_HIP_ENABLED)
+
+#        if BOOST_COMP_NVCC >= BOOST_VERSION_NUMBER(9, 0, 0)
+#            include <cuda_runtime_api.h>
+#        else
+#            if BOOST_COMP_HIP
+#                include <hip/math_functions.h>
+#            else
+#                include <math_functions.hpp>
+#            endif
+#        endif
+
+#        if !BOOST_LANG_HIP
+#            error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
+#        endif
+#    endif
+
+#    include <alpaka/core/Unused.hpp>
+#    include <alpaka/math/round/Traits.hpp>
+
+#    include <type_traits>
+
+namespace alpaka
+{
+    namespace math
+    {
+        //#############################################################################
+        //! The round implementation shared by the CUDA and HIP back-ends.
+        class RoundUniformCudaHipBuiltIn : public concepts::Implements<ConceptMathRound, RoundUniformCudaHipBuiltIn>
+        {
+        };
+
+        namespace traits
+        {
+            //#############################################################################
+            //! The CUDA/HIP round trait specialization for floating-point arguments (forwards to ::round).
+            template<typename TArg>
+            struct Round<RoundUniformCudaHipBuiltIn, TArg, std::enable_if_t<std::is_floating_point<TArg>::value>>
+            {
+                __device__ static auto round(RoundUniformCudaHipBuiltIn const& round_ctx, TArg const& arg)
+                {
+                    alpaka::ignore_unused(round_ctx);
+                    return ::round(arg);
+                }
+            };
+            //#############################################################################
+            //! The CUDA/HIP lround trait specialization for floating-point arguments (forwards to ::lround).
+            template<typename TArg>
+            struct Lround<RoundUniformCudaHipBuiltIn, TArg, std::enable_if_t<std::is_floating_point<TArg>::value>>
+            {
+                __device__ static auto lround(RoundUniformCudaHipBuiltIn const& lround_ctx, TArg const& arg)
+                    -> long int
+                {
+                    alpaka::ignore_unused(lround_ctx);
+                    return ::lround(arg);
+                }
+            };
+            //#############################################################################
+            //! The CUDA/HIP llround trait specialization for floating-point arguments (forwards to ::llround).
+            template<typename TArg>
+            struct Llround<RoundUniformCudaHipBuiltIn, TArg, std::enable_if_t<std::is_floating_point<TArg>::value>>
+            {
+                __device__ static auto llround(RoundUniformCudaHipBuiltIn const& llround_ctx, TArg const& arg)
+                    -> long long int
+                {
+                    alpaka::ignore_unused(llround_ctx);
+                    return ::llround(arg);
+                }
+            };
+            //! The CUDA/HIP round specialization for float (forwards to ::roundf).
+            template<>
+            struct Round<RoundUniformCudaHipBuiltIn, float>
+            {
+                __device__ static auto round(RoundUniformCudaHipBuiltIn const& round_ctx, float const& arg) -> float
+                {
+                    alpaka::ignore_unused(round_ctx);
+                    return ::roundf(arg);
+                }
+            };
+        } // namespace traits
+    } // namespace math
+} // namespace alpaka
+
+#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/round/Traits.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/round/Traits.hpp
index 1029281383..5973d88e78 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/math/round/Traits.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/math/round/Traits.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Benjamin Worpitz
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -11,9 +11,6 @@
 
 #include <alpaka/core/Common.hpp>
 #include <alpaka/core/Concepts.hpp>
-#include <alpaka/core/Unused.hpp>
-
-#include <boost/config.hpp>
 
 #include <type_traits>
 
@@ -21,117 +18,72 @@ namespace alpaka
 {
     namespace math
     {
-        struct ConceptMathRound;
+        struct ConceptMathRound
+        {
+        };
 
         namespace traits
         {
             //#############################################################################
             //! The round trait.
-            template<
-                typename T,
-                typename TArg,
-                typename TSfinae = void>
+            template<typename T, typename TArg, typename TSfinae = void>
             struct Round;
 
             //#############################################################################
             //! The round trait.
-            template<
-                typename T,
-                typename TArg,
-                typename TSfinae = void>
+            template<typename T, typename TArg, typename TSfinae = void>
             struct Lround;
 
             //#############################################################################
             //! The round trait.
-            template<
-                typename T,
-                typename TArg,
-                typename TSfinae = void>
+            template<typename T, typename TArg, typename TSfinae = void>
             struct Llround;
-        }
+        } // namespace traits
 
         //-----------------------------------------------------------------------------
-        //! Computes the nearest integer value to arg (in floating-point format), rounding halfway cases away from zero, regardless of the current rounding mode.
+        //! Computes the nearest integer value to arg (in floating-point format), rounding halfway cases away from
+        //! zero, regardless of the current rounding mode.
         //!
         //! \tparam T The type of the object specializing Round.
         //! \tparam TArg The arg type.
         //! \param round_ctx The object specializing Round.
         //! \param arg The arg.
         ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            typename T,
-            typename TArg>
-        ALPAKA_FN_HOST_ACC auto round(
-            T const & round_ctx,
-            TArg const & arg)
-#ifdef BOOST_NO_CXX14_RETURN_TYPE_DEDUCTION
-        -> decltype(
-            traits::Round<
-                concepts::ImplementationBase<ConceptMathRound, T>,
-                TArg>
-            ::round(
-                round_ctx,
-                arg))
-#endif
+        template<typename T, typename TArg>
+        ALPAKA_FN_HOST_ACC auto round(T const& round_ctx, TArg const& arg)
         {
             using ImplementationBase = concepts::ImplementationBase<ConceptMathRound, T>;
-            return
-                traits::Round<
-                    ImplementationBase,
-                    TArg>
-                ::round(
-                    round_ctx,
-                    arg);
+            return traits::Round<ImplementationBase, TArg>::round(round_ctx, arg);
         }
         //-----------------------------------------------------------------------------
-        //! Computes the nearest integer value to arg (in integer format), rounding halfway cases away from zero, regardless of the current rounding mode.
+        //! Computes the nearest integer value to arg (in integer format), rounding halfway cases away from zero,
+        //! regardless of the current rounding mode.
         //!
         //! \tparam T The type of the object specializing Round.
         //! \tparam TArg The arg type.
         //! \param lround_ctx The object specializing Round.
         //! \param arg The arg.
         ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            typename T,
-            typename TArg>
-        ALPAKA_FN_HOST_ACC auto lround(
-            T const & lround_ctx,
-            TArg const & arg)
-        -> long int
+        template<typename T, typename TArg>
+        ALPAKA_FN_HOST_ACC auto lround(T const& lround_ctx, TArg const& arg) -> long int
         {
             using ImplementationBase = concepts::ImplementationBase<ConceptMathRound, T>;
-            return
-                traits::Lround<
-                    ImplementationBase,
-                    TArg>
-                ::lround(
-                    lround_ctx,
-                    arg);
+            return traits::Lround<ImplementationBase, TArg>::lround(lround_ctx, arg);
         }
         //-----------------------------------------------------------------------------
-        //! Computes the nearest integer value to arg (in integer format), rounding halfway cases away from zero, regardless of the current rounding mode.
+        //! Computes the nearest integer value to arg (in integer format), rounding halfway cases away from zero,
+        //! regardless of the current rounding mode.
         //!
         //! \tparam T The type of the object specializing Round.
         //! \tparam TArg The arg type.
         //! \param llround_ctx The object specializing Round.
         //! \param arg The arg.
         ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            typename T,
-            typename TArg>
-        ALPAKA_FN_HOST_ACC auto llround(
-            T const & llround_ctx,
-            TArg const & arg)
-        -> long long int
+        template<typename T, typename TArg>
+        ALPAKA_FN_HOST_ACC auto llround(T const& llround_ctx, TArg const& arg) -> long long int
         {
             using ImplementationBase = concepts::ImplementationBase<ConceptMathRound, T>;
-            return
-                traits::Llround<
-                    ImplementationBase,
-                    TArg>
-                ::llround(
-                    llround_ctx,
-                    arg);
+            return traits::Llround<ImplementationBase, TArg>::llround(llround_ctx, arg);
         }
-    }
-}
+    } // namespace math
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/rsqrt/RsqrtCudaBuiltIn.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/rsqrt/RsqrtCudaBuiltIn.hpp
deleted file mode 100644
index eb80f4b2fc..0000000000
--- a/thirdParty/cupla/alpaka/include/alpaka/math/rsqrt/RsqrtCudaBuiltIn.hpp
+++ /dev/null
@@ -1,78 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Bert Wesarg, Valentin Gehrke
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_CUDA
-    #error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
-#endif
-
-#include <alpaka/math/rsqrt/Traits.hpp>
-
-#include <alpaka/core/Unused.hpp>
-
-#include <cuda_runtime.h>
-
-#include <type_traits>
-
-namespace alpaka
-{
-    namespace math
-    {
-        //#############################################################################
-        //! The CUDA rsqrt.
-        class RsqrtCudaBuiltIn : public concepts::Implements<ConceptMathRsqrt, RsqrtCudaBuiltIn>
-        {
-        };
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The CUDA rsqrt trait specialization.
-            template<
-                typename TArg>
-            struct Rsqrt<
-                RsqrtCudaBuiltIn,
-                TArg,
-                typename std::enable_if<
-                    std::is_arithmetic<TArg>::value>::type>
-            {
-                __device__ static auto rsqrt(
-                    RsqrtCudaBuiltIn const & rsqrt_ctx,
-                    TArg const & arg)
-                -> decltype(::rsqrt(arg))
-                {
-                    alpaka::ignore_unused(rsqrt_ctx);
-                    return ::rsqrt(arg);
-                }
-            };
-            //! The CUDA rsqrt float specialization.
-            template<>
-            struct Rsqrt<
-                RsqrtCudaBuiltIn,
-                float>
-            {
-                __device__ static auto rsqrt(
-                    RsqrtCudaBuiltIn const & rsqrt_ctx,
-                    float const & arg)
-                -> float
-                {
-                    alpaka::ignore_unused(rsqrt_ctx);
-                    return ::rsqrtf(arg);
-                }
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/rsqrt/RsqrtHipBuiltIn.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/rsqrt/RsqrtHipBuiltIn.hpp
deleted file mode 100644
index a6f989136c..0000000000
--- a/thirdParty/cupla/alpaka/include/alpaka/math/rsqrt/RsqrtHipBuiltIn.hpp
+++ /dev/null
@@ -1,86 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Bert Wesarg, Matthias Werner, Valentin Gehrke
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_HIP
-    #error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
-#endif
-
-#include <alpaka/math/rsqrt/Traits.hpp>
-
-#include <alpaka/core/Unused.hpp>
-
-#if BOOST_COMP_NVCC >= BOOST_VERSION_NUMBER(9, 0, 0)
-    #include <cuda_runtime_api.h>
-#else
-    #if BOOST_COMP_HCC || BOOST_COMP_HIP
-        #include <math_functions.h>
-    #else
-        #include <math_functions.hpp>
-    #endif
-#endif
-
-#include <type_traits>
-
-namespace alpaka
-{
-    namespace math
-    {
-        //#############################################################################
-        //! The HIP rsqrt.
-        class RsqrtHipBuiltIn : public concepts::Implements<ConceptMathRsqrt, RsqrtHipBuiltIn>
-        {
-        };
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The HIP rsqrt trait specialization.
-            template<
-                typename TArg>
-            struct Rsqrt<
-                RsqrtHipBuiltIn,
-                TArg,
-                typename std::enable_if<
-                    std::is_arithmetic<TArg>::value>::type>
-            {
-                __device__ static auto rsqrt(
-                    RsqrtHipBuiltIn const & rsqrt_ctx,
-                    TArg const & arg)
-                -> decltype(::rsqrt(arg))
-                {
-                    alpaka::ignore_unused(rsqrt_ctx);
-                    return ::rsqrt(arg);
-                }
-            };
-            //! The HIP rsqrt float specialization.
-            template<>
-            struct Rsqrt<
-                RsqrtHipBuiltIn,
-                float>
-            {
-                __device__ static auto rsqrt(
-                    RsqrtHipBuiltIn const & rsqrt_ctx,
-                    float const & arg)
-                -> float
-                {
-                    alpaka::ignore_unused(rsqrt_ctx);
-                    return ::rsqrtf(arg);
-                }
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/rsqrt/RsqrtStdLib.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/rsqrt/RsqrtStdLib.hpp
index 3d5bd68048..b7978362a9 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/math/rsqrt/RsqrtStdLib.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/math/rsqrt/RsqrtStdLib.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Axel Huebl, Benjamin Worpitz
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -9,12 +9,11 @@
 
 #pragma once
 
-#include <alpaka/math/rsqrt/Traits.hpp>
-
 #include <alpaka/core/Unused.hpp>
+#include <alpaka/math/rsqrt/Traits.hpp>
 
-#include <type_traits>
 #include <cmath>
+#include <type_traits>
 
 namespace alpaka
 {
@@ -30,23 +29,15 @@ namespace alpaka
         {
             //#############################################################################
             //! The standard library rsqrt trait specialization.
-            template<
-                typename TArg>
-            struct Rsqrt<
-                RsqrtStdLib,
-                TArg,
-                typename std::enable_if<
-                    std::is_arithmetic<TArg>::value>::type>
+            template<typename TArg>
+            struct Rsqrt<RsqrtStdLib, TArg, std::enable_if_t<std::is_arithmetic<TArg>::value>>
             {
-                ALPAKA_FN_HOST static auto rsqrt(
-                    RsqrtStdLib const & rsqrt_ctx,
-                    TArg const & arg)
-                -> decltype(std::sqrt(arg))
+                ALPAKA_FN_HOST static auto rsqrt(RsqrtStdLib const& rsqrt_ctx, TArg const& arg)
                 {
                     alpaka::ignore_unused(rsqrt_ctx);
-                    return static_cast<TArg>(1)/std::sqrt(arg);
+                    return static_cast<TArg>(1) / std::sqrt(arg);
                 }
             };
-        }
-    }
-}
+        } // namespace traits
+    } // namespace math
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/rsqrt/RsqrtUniformCudaHipBuiltIn.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/rsqrt/RsqrtUniformCudaHipBuiltIn.hpp
new file mode 100644
index 0000000000..058b907511
--- /dev/null
+++ b/thirdParty/cupla/alpaka/include/alpaka/math/rsqrt/RsqrtUniformCudaHipBuiltIn.hpp
@@ -0,0 +1,82 @@
+/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Bert Wesarg, Valentin Gehrke
+ *
+ * This file is part of alpaka.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#pragma once
+
+#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
+
+#    include <alpaka/core/BoostPredef.hpp>
+
+#    if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
+#        include <cuda_runtime.h>
+#        if !BOOST_LANG_CUDA
+#            error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
+#        endif
+#    endif
+
+#    if defined(ALPAKA_ACC_GPU_HIP_ENABLED)
+
+#        if BOOST_COMP_NVCC >= BOOST_VERSION_NUMBER(9, 0, 0)
+#            include <cuda_runtime_api.h>
+#        else
+#            if BOOST_COMP_HIP
+#                include <hip/math_functions.h>
+#            else
+#                include <math_functions.hpp>
+#            endif
+#        endif
+
+#        if !BOOST_LANG_HIP
+#            error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
+#        endif
+#    endif
+
+#    include <alpaka/core/Unused.hpp>
+#    include <alpaka/math/rsqrt/Traits.hpp>
+
+#    include <type_traits>
+
+namespace alpaka
+{
+    namespace math
+    {
+        //#############################################################################
+        //! The rsqrt implementation shared by the CUDA and HIP back-ends.
+        class RsqrtUniformCudaHipBuiltIn : public concepts::Implements<ConceptMathRsqrt, RsqrtUniformCudaHipBuiltIn>
+        {
+        };
+
+        namespace traits
+        {
+            //#############################################################################
+            //! The CUDA/HIP rsqrt trait specialization for arithmetic arguments (forwards to ::rsqrt).
+            template<typename TArg>
+            struct Rsqrt<RsqrtUniformCudaHipBuiltIn, TArg, std::enable_if_t<std::is_arithmetic<TArg>::value>>
+            {
+                __device__ static auto rsqrt(RsqrtUniformCudaHipBuiltIn const& rsqrt_ctx, TArg const& arg)
+                {
+                    alpaka::ignore_unused(rsqrt_ctx);
+                    return ::rsqrt(arg);
+                }
+            };
+            //! The CUDA/HIP rsqrt specialization for float (forwards to ::rsqrtf).
+            template<>
+            struct Rsqrt<RsqrtUniformCudaHipBuiltIn, float>
+            {
+                __device__ static auto rsqrt(RsqrtUniformCudaHipBuiltIn const& rsqrt_ctx, float const& arg) -> float
+                {
+                    alpaka::ignore_unused(rsqrt_ctx);
+                    return ::rsqrtf(arg);
+                }
+            };
+        } // namespace traits
+    } // namespace math
+} // namespace alpaka
+
+#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/rsqrt/Traits.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/rsqrt/Traits.hpp
index 8e33c29f19..c1116d549c 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/math/rsqrt/Traits.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/math/rsqrt/Traits.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Benjamin Worpitz
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -12,59 +12,41 @@
 #include <alpaka/core/Common.hpp>
 #include <alpaka/core/Concepts.hpp>
 
-#include <boost/config.hpp>
-
 #include <type_traits>
 
 namespace alpaka
 {
     namespace math
     {
-        struct ConceptMathRsqrt;
+        struct ConceptMathRsqrt
+        {
+        };
 
         namespace traits
         {
             //#############################################################################
             //! The rsqrt trait.
-            template<
-                typename T,
-                typename TArg,
-                typename TSfinae = void>
+            template<typename T, typename TArg, typename TSfinae = void>
             struct Rsqrt;
-        }
+        } // namespace traits
 
         //-----------------------------------------------------------------------------
         //! Computes the rsqrt.
         //!
+        //! Valid real arguments are positive. For other values the result
+        //! may depend on the backend and compilation options, will likely
+        //! be NaN.
+        //!
         //! \tparam T The type of the object specializing Rsqrt.
         //! \tparam TArg The arg type.
         //! \param rsqrt_ctx The object specializing Rsqrt.
         //! \param arg The arg.
         ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            typename T,
-            typename TArg>
-        ALPAKA_FN_HOST_ACC auto rsqrt(
-            T const & rsqrt_ctx,
-            TArg const & arg)
-#ifdef BOOST_NO_CXX14_RETURN_TYPE_DEDUCTION
-        -> decltype(
-            traits::Rsqrt<
-                concepts::ImplementationBase<ConceptMathRsqrt, T>,
-                TArg>
-            ::rsqrt(
-                rsqrt_ctx,
-                arg))
-#endif
+        template<typename T, typename TArg>
+        ALPAKA_FN_HOST_ACC auto rsqrt(T const& rsqrt_ctx, TArg const& arg)
         {
             using ImplementationBase = concepts::ImplementationBase<ConceptMathRsqrt, T>;
-            return
-                traits::Rsqrt<
-                    ImplementationBase,
-                    TArg>
-                ::rsqrt(
-                    rsqrt_ctx,
-                    arg);
+            return traits::Rsqrt<ImplementationBase, TArg>::rsqrt(rsqrt_ctx, arg);
         }
-    }
-}
+    } // namespace math
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/sin/SinCudaBuiltIn.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/sin/SinCudaBuiltIn.hpp
deleted file mode 100644
index e52ec751af..0000000000
--- a/thirdParty/cupla/alpaka/include/alpaka/math/sin/SinCudaBuiltIn.hpp
+++ /dev/null
@@ -1,78 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Bert Wesarg, Valentin Gehrke
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_CUDA
-    #error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
-#endif
-
-#include <alpaka/math/sin/Traits.hpp>
-
-#include <alpaka/core/Unused.hpp>
-
-#include <cuda_runtime.h>
-
-#include <type_traits>
-
-namespace alpaka
-{
-    namespace math
-    {
-        //#############################################################################
-        //! The CUDA sin.
-        class SinCudaBuiltIn : public concepts::Implements<ConceptMathSin, SinCudaBuiltIn>
-        {
-        };
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The CUDA sin trait specialization.
-            template<
-                typename TArg>
-            struct Sin<
-                SinCudaBuiltIn,
-                TArg,
-                typename std::enable_if<
-                    std::is_floating_point<TArg>::value>::type>
-            {
-                __device__ static auto sin(
-                    SinCudaBuiltIn const & sin_ctx,
-                    TArg const & arg)
-                -> decltype(::sin(arg))
-                {
-                    alpaka::ignore_unused(sin_ctx);
-                    return ::sin(arg);
-                }
-            };
-            //! The CUDA sin float specialization.
-            template<>
-            struct Sin<
-                SinCudaBuiltIn,
-                float>
-            {
-                __device__ static auto sin(
-                    SinCudaBuiltIn const & sin_ctx,
-                    float const & arg)
-                -> float
-                {
-                    alpaka::ignore_unused(sin_ctx);
-                    return ::sinf(arg);
-                }
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/sin/SinHipBuiltIn.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/sin/SinHipBuiltIn.hpp
deleted file mode 100644
index 86faad5c55..0000000000
--- a/thirdParty/cupla/alpaka/include/alpaka/math/sin/SinHipBuiltIn.hpp
+++ /dev/null
@@ -1,86 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Bert Wesarg, Matthias Werner, Valentin Gehrke
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_HIP
-    #error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
-#endif
-
-#include <alpaka/math/sin/Traits.hpp>
-
-#include <alpaka/core/Unused.hpp>
-
-#if BOOST_COMP_NVCC >= BOOST_VERSION_NUMBER(9, 0, 0)
-    #include <cuda_runtime_api.h>
-#else
-    #if BOOST_COMP_HCC || BOOST_COMP_HIP
-        #include <math_functions.h>
-    #else
-        #include <math_functions.hpp>
-    #endif
-#endif
-
-#include <type_traits>
-
-namespace alpaka
-{
-    namespace math
-    {
-        //#############################################################################
-        //! The HIP sin.
-        class SinHipBuiltIn : public concepts::Implements<ConceptMathSin, SinHipBuiltIn>
-        {
-        };
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The HIP sin trait specialization.
-            template<
-                typename TArg>
-            struct Sin<
-                SinHipBuiltIn,
-                TArg,
-                typename std::enable_if<
-                    std::is_floating_point<TArg>::value>::type>
-            {
-                __device__ static auto sin(
-                    SinHipBuiltIn const & sin_ctx,
-                    TArg const & arg)
-                -> decltype(::sin(arg))
-                {
-                    alpaka::ignore_unused(sin_ctx);
-                    return ::sin(arg);
-                }
-            };
-            //! The HIP sin float specialization.
-            template<>
-            struct Sin<
-                SinHipBuiltIn,
-                float>
-            {
-                __device__ static auto sin(
-                    SinHipBuiltIn const & sin_ctx,
-                    float const & arg)
-                -> float
-                {
-                    alpaka::ignore_unused(sin_ctx);
-                    return ::sinf(arg);
-                }
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/sin/SinStdLib.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/sin/SinStdLib.hpp
index a9b56c2ed7..816128f61c 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/math/sin/SinStdLib.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/math/sin/SinStdLib.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Axel Huebl, Benjamin Worpitz
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -9,12 +9,11 @@
 
 #pragma once
 
-#include <alpaka/math/sin/Traits.hpp>
-
 #include <alpaka/core/Unused.hpp>
+#include <alpaka/math/sin/Traits.hpp>
 
-#include <type_traits>
 #include <cmath>
+#include <type_traits>
 
 namespace alpaka
 {
@@ -30,23 +29,15 @@ namespace alpaka
         {
             //#############################################################################
             //! The standard library sin trait specialization.
-            template<
-                typename TArg>
-            struct Sin<
-                SinStdLib,
-                TArg,
-                typename std::enable_if<
-                    std::is_arithmetic<TArg>::value>::type>
+            template<typename TArg>
+            struct Sin<SinStdLib, TArg, std::enable_if_t<std::is_arithmetic<TArg>::value>>
             {
-                ALPAKA_FN_HOST static auto sin(
-                    SinStdLib const & sin_ctx,
-                    TArg const & arg)
-                -> decltype(std::sin(arg))
+                ALPAKA_FN_HOST static auto sin(SinStdLib const& sin_ctx, TArg const& arg)
                 {
                     alpaka::ignore_unused(sin_ctx);
                     return std::sin(arg);
                 }
             };
-        }
-    }
-}
+        } // namespace traits
+    } // namespace math
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/sin/SinUniformCudaHipBuiltIn.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/sin/SinUniformCudaHipBuiltIn.hpp
new file mode 100644
index 0000000000..44a07ecf62
--- /dev/null
+++ b/thirdParty/cupla/alpaka/include/alpaka/math/sin/SinUniformCudaHipBuiltIn.hpp
@@ -0,0 +1,82 @@
+/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Bert Wesarg, Valentin Gehrke
+ *
+ * This file is part of alpaka.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#pragma once
+
+#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
+
+#    include <alpaka/core/BoostPredef.hpp>
+
+#    if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
+#        include <cuda_runtime.h>
+#        if !BOOST_LANG_CUDA
+#            error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
+#        endif
+#    endif
+
+#    if defined(ALPAKA_ACC_GPU_HIP_ENABLED)
+
+#        if BOOST_COMP_NVCC >= BOOST_VERSION_NUMBER(9, 0, 0)
+#            include <cuda_runtime_api.h>
+#        else
+#            if BOOST_COMP_HIP
+#                include <hip/math_functions.h>
+#            else
+#                include <math_functions.hpp>
+#            endif
+#        endif
+
+#        if !BOOST_LANG_HIP
+#            error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
+#        endif
+#    endif
+
+#    include <alpaka/core/Unused.hpp>
+#    include <alpaka/math/sin/Traits.hpp>
+
+#    include <type_traits>
+
+namespace alpaka
+{
+    namespace math
+    {
+        //#############################################################################
+        //! The CUDA/HIP sin.
+        class SinUniformCudaHipBuiltIn : public concepts::Implements<ConceptMathSin, SinUniformCudaHipBuiltIn>
+        {
+        };
+
+        namespace traits
+        {
+            //#############################################################################
+            //! The CUDA/HIP sin trait specialization.
+            template<typename TArg>
+            struct Sin<SinUniformCudaHipBuiltIn, TArg, std::enable_if_t<std::is_floating_point<TArg>::value>>
+            {
+                __device__ static auto sin(SinUniformCudaHipBuiltIn const& sin_ctx, TArg const& arg)
+                {
+                    alpaka::ignore_unused(sin_ctx);
+                    return ::sin(arg);
+                }
+            };
+            //! The CUDA/HIP sin float specialization.
+            template<>
+            struct Sin<SinUniformCudaHipBuiltIn, float>
+            {
+                __device__ static auto sin(SinUniformCudaHipBuiltIn const& sin_ctx, float const& arg) -> float
+                {
+                    alpaka::ignore_unused(sin_ctx);
+                    return ::sinf(arg);
+                }
+            };
+        } // namespace traits
+    } // namespace math
+} // namespace alpaka
+
+#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/sin/Traits.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/sin/Traits.hpp
index 8c93297905..41a54abd23 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/math/sin/Traits.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/math/sin/Traits.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Benjamin Worpitz
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -12,26 +12,23 @@
 #include <alpaka/core/Common.hpp>
 #include <alpaka/core/Concepts.hpp>
 
-#include <boost/config.hpp>
-
 #include <type_traits>
 
 namespace alpaka
 {
     namespace math
     {
-        struct ConceptMathSin;
+        struct ConceptMathSin
+        {
+        };
 
         namespace traits
         {
             //#############################################################################
             //! The sin trait.
-            template<
-                typename T,
-                typename TArg,
-                typename TSfinae = void>
+            template<typename T, typename TArg, typename TSfinae = void>
             struct Sin;
-        }
+        } // namespace traits
 
         //-----------------------------------------------------------------------------
         //! Computes the sine (measured in radians).
@@ -41,30 +38,11 @@ namespace alpaka
         //! \param sin_ctx The object specializing Sin.
         //! \param arg The arg.
         ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            typename T,
-            typename TArg>
-        ALPAKA_FN_HOST_ACC auto sin(
-            T const & sin_ctx,
-            TArg const & arg)
-#ifdef BOOST_NO_CXX14_RETURN_TYPE_DEDUCTION
-        -> decltype(
-            traits::Sin<
-                concepts::ImplementationBase<ConceptMathSin, T>,
-                TArg>
-            ::sin(
-                sin_ctx,
-                arg))
-#endif
+        template<typename T, typename TArg>
+        ALPAKA_FN_HOST_ACC auto sin(T const& sin_ctx, TArg const& arg)
         {
             using ImplementationBase = concepts::ImplementationBase<ConceptMathSin, T>;
-            return
-                traits::Sin<
-                    ImplementationBase,
-                    TArg>
-                ::sin(
-                    sin_ctx,
-                    arg);
+            return traits::Sin<ImplementationBase, TArg>::sin(sin_ctx, arg);
         }
-    }
-}
+    } // namespace math
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/sincos/SinCosCudaBuiltIn.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/sincos/SinCosCudaBuiltIn.hpp
deleted file mode 100644
index ad54c19e99..0000000000
--- a/thirdParty/cupla/alpaka/include/alpaka/math/sincos/SinCosCudaBuiltIn.hpp
+++ /dev/null
@@ -1,82 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz, Matthias Werner
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_CUDA
-    #error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
-#endif
-
-#include <alpaka/math/sincos/Traits.hpp>
-
-#include <alpaka/core/Unused.hpp>
-
-#include <cuda_runtime.h>
-
-#include <type_traits>
-
-namespace alpaka
-{
-    namespace math
-    {
-        //#############################################################################
-        //! The CUDA sincos.
-        class SinCosCudaBuiltIn : public concepts::Implements<ConceptMathSinCos, SinCosCudaBuiltIn>
-        {
-        };
-
-        namespace traits
-        {
-            //#############################################################################
-
-            //! sincos trait specialization.
-            template<>
-            struct SinCos<
-                SinCosCudaBuiltIn,
-                double>
-            {
-                __device__ static auto sincos(
-                    SinCosCudaBuiltIn const & sincos_ctx,
-                    double const & arg,
-                    double & result_sin,
-                    double & result_cos)
-                -> void
-                {
-                    alpaka::ignore_unused(sincos_ctx);
-                    ::sincos(arg, &result_sin, &result_cos);
-                }
-            };
-
-            //! The CUDA sin float specialization.
-            template<>
-            struct SinCos<
-                SinCosCudaBuiltIn,
-                float>
-            {
-                __device__ static auto sincos(
-                    SinCosCudaBuiltIn const & sincos_ctx,
-                    float const & arg,
-                    float & result_sin,
-                    float & result_cos)
-                -> void
-                {
-                    alpaka::ignore_unused(sincos_ctx);
-                    ::sincosf(arg, &result_sin, &result_cos);
-                }
-            };
-
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/sincos/SinCosHipBuiltIn.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/sincos/SinCosHipBuiltIn.hpp
deleted file mode 100644
index 3033cc0fd0..0000000000
--- a/thirdParty/cupla/alpaka/include/alpaka/math/sincos/SinCosHipBuiltIn.hpp
+++ /dev/null
@@ -1,84 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz, Matthias Werner
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_HIP
-    #error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
-#endif
-
-#include <alpaka/math/sincos/Traits.hpp>
-
-#include <alpaka/core/Unused.hpp>
-
-#if BOOST_COMP_NVCC >= BOOST_VERSION_NUMBER(9, 0, 0)
-    #include <cuda_runtime_api.h>
-#else
-    #if BOOST_COMP_HCC || BOOST_COMP_HIP
-        #include <math_functions.h>
-    #else
-        #include <math_functions.hpp>
-    #endif
-#endif
-
-#include <type_traits>
-
-namespace alpaka
-{
-    namespace math
-    {
-        //#############################################################################
-        //! sincos.
-        class SinCosHipBuiltIn : public concepts::Implements<ConceptMathSinCos, SinCosHipBuiltIn>
-        {
-        };
-
-        namespace traits
-        {
-            //#############################################################################
-            //! sincos trait specialization.
-            template<>
-            struct SinCos<SinCosHipBuiltIn, double>
-            {
-                __device__ static auto sincos(
-                    SinCosHipBuiltIn const & sincos_ctx,
-                    double const & arg,
-                    double & result_sin,
-                    double & result_cos)
-                -> void
-                {
-                    alpaka::ignore_unused(sincos_ctx);
-                    ::sincos(arg, &result_sin, &result_cos);
-                }
-            };
-
-            //! The sincos float specialization.
-            template<>
-            struct SinCos<SinCosHipBuiltIn, float>
-            {
-                __device__ static auto sincos(
-                    SinCosHipBuiltIn const & sincos_ctx,
-                    float const & arg,
-                    float & result_sin,
-                    float & result_cos)
-                -> void
-                {
-                    alpaka::ignore_unused(sincos_ctx);
-                    ::sincosf(arg, &result_sin, &result_cos);
-                }
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/sincos/SinCosStdLib.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/sincos/SinCosStdLib.hpp
index e39c4d2f65..2a2e8e80e4 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/math/sincos/SinCosStdLib.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/math/sincos/SinCosStdLib.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Benjamin Worpitz, Matthias Werner
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -9,12 +9,11 @@
 
 #pragma once
 
-#include <alpaka/math/sincos/Traits.hpp>
-
 #include <alpaka/core/Unused.hpp>
+#include <alpaka/math/sincos/Traits.hpp>
 
-#include <type_traits>
 #include <cmath>
+#include <type_traits>
 
 namespace alpaka
 {
@@ -30,26 +29,20 @@ namespace alpaka
         {
             //#############################################################################
             //! The standard library sincos trait specialization.
-            template<
-                typename TArg>
-            struct SinCos<
-                SinCosStdLib,
-                TArg,
-                typename std::enable_if<
-                    std::is_floating_point<TArg>::value>::type>
+            template<typename TArg>
+            struct SinCos<SinCosStdLib, TArg, std::enable_if_t<std::is_floating_point<TArg>::value>>
             {
                 ALPAKA_FN_HOST static auto sincos(
-                    SinCosStdLib const & sincos_ctx,
-                    TArg const & arg,
-                    TArg & result_sin,
-                    TArg & result_cos )
-                -> void
+                    SinCosStdLib const& sincos_ctx,
+                    TArg const& arg,
+                    TArg& result_sin,
+                    TArg& result_cos) -> void
                 {
                     alpaka::ignore_unused(sincos_ctx);
                     result_sin = std::sin(arg);
                     result_cos = std::cos(arg);
                 }
             };
-        }
-    }
-}
+        } // namespace traits
+    } // namespace math
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/sincos/SinCosUniformCudaHipBuiltIn.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/sincos/SinCosUniformCudaHipBuiltIn.hpp
new file mode 100644
index 0000000000..e3cfec2adb
--- /dev/null
+++ b/thirdParty/cupla/alpaka/include/alpaka/math/sincos/SinCosUniformCudaHipBuiltIn.hpp
@@ -0,0 +1,93 @@
+/* Copyright 2019 Benjamin Worpitz, Matthias Werner
+ *
+ * This file is part of alpaka.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#pragma once
+
+#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
+
+#    include <alpaka/core/BoostPredef.hpp>
+
+#    if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
+#        include <cuda_runtime.h>
+#        if !BOOST_LANG_CUDA
+#            error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
+#        endif
+#    endif
+
+#    if defined(ALPAKA_ACC_GPU_HIP_ENABLED)
+
+#        if BOOST_COMP_NVCC >= BOOST_VERSION_NUMBER(9, 0, 0)
+#            include <cuda_runtime_api.h>
+#        else
+#            if BOOST_COMP_HIP
+#                include <hip/math_functions.h>
+#            else
+#                include <math_functions.hpp>
+#            endif
+#        endif
+
+#        if !BOOST_LANG_HIP
+#            error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
+#        endif
+#    endif
+
+#    include <alpaka/core/Unused.hpp>
+#    include <alpaka/math/sincos/Traits.hpp>
+
+#    include <type_traits>
+
+namespace alpaka
+{
+    namespace math
+    {
+        //#############################################################################
+        //! The CUDA/HIP sincos.
+        class SinCosUniformCudaHipBuiltIn : public concepts::Implements<ConceptMathSinCos, SinCosUniformCudaHipBuiltIn>
+        {
+        };
+
+        namespace traits
+        {
+            //#############################################################################
+
+            //! The CUDA/HIP sincos double specialization.
+            template<>
+            struct SinCos<SinCosUniformCudaHipBuiltIn, double>
+            {
+                __device__ static auto sincos(
+                    SinCosUniformCudaHipBuiltIn const& sincos_ctx,
+                    double const& arg,
+                    double& result_sin,
+                    double& result_cos) -> void
+                {
+                    alpaka::ignore_unused(sincos_ctx);
+                    ::sincos(arg, &result_sin, &result_cos);
+                }
+            };
+
+            //! The CUDA/HIP sincos float specialization.
+            template<>
+            struct SinCos<SinCosUniformCudaHipBuiltIn, float>
+            {
+                __device__ static auto sincos(
+                    SinCosUniformCudaHipBuiltIn const& sincos_ctx,
+                    float const& arg,
+                    float& result_sin,
+                    float& result_cos) -> void
+                {
+                    alpaka::ignore_unused(sincos_ctx);
+                    ::sincosf(arg, &result_sin, &result_cos);
+                }
+            };
+
+        } // namespace traits
+    } // namespace math
+} // namespace alpaka
+
+#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/sincos/Traits.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/sincos/Traits.hpp
index 355134cf86..4e3dbbf4cf 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/math/sincos/Traits.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/math/sincos/Traits.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Benjamin Worpitz, Matthias Werner
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -12,26 +12,23 @@
 #include <alpaka/core/Common.hpp>
 #include <alpaka/core/Concepts.hpp>
 
-#include <boost/config.hpp>
-
 #include <type_traits>
 
 namespace alpaka
 {
     namespace math
     {
-        struct ConceptMathSinCos;
+        struct ConceptMathSinCos
+        {
+        };
 
         namespace traits
         {
             //#############################################################################
             //! The sincos trait.
-            template<
-                typename T,
-                typename TArg,
-                typename TSfinae = void>
+            template<typename T, typename TArg, typename TSfinae = void>
             struct SinCos;
-        }
+        } // namespace traits
 
         //-----------------------------------------------------------------------------
         //! Computes the sine and cosine (measured in radians).
@@ -43,26 +40,12 @@ namespace alpaka
         //! \param result_sin result of sine
         //! \param result_cos result of cosine
         ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            typename T,
-            typename TArg>
-        ALPAKA_FN_HOST_ACC auto sincos(
-            T const & sincos_ctx,
-            TArg const & arg,
-            TArg & result_sin,
-            TArg & result_cos)
-        -> void
+        template<typename T, typename TArg>
+        ALPAKA_FN_HOST_ACC auto sincos(T const& sincos_ctx, TArg const& arg, TArg& result_sin, TArg& result_cos)
+            -> void
         {
             using ImplementationBase = concepts::ImplementationBase<ConceptMathSinCos, T>;
-            traits::SinCos<
-                ImplementationBase,
-                TArg>
-                ::sincos(
-                    sincos_ctx,
-                    arg,
-                    result_sin,
-                    result_cos
-                    );
+            traits::SinCos<ImplementationBase, TArg>::sincos(sincos_ctx, arg, result_sin, result_cos);
         }
-    }
-}
+    } // namespace math
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/sqrt/SqrtCudaBuiltIn.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/sqrt/SqrtCudaBuiltIn.hpp
deleted file mode 100644
index 2e597e893d..0000000000
--- a/thirdParty/cupla/alpaka/include/alpaka/math/sqrt/SqrtCudaBuiltIn.hpp
+++ /dev/null
@@ -1,79 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Bert Wesarg, Valentin Gehrke
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_CUDA
-    #error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
-#endif
-
-#include <alpaka/math/sqrt/Traits.hpp>
-
-#include <alpaka/core/Unused.hpp>
-
-#include <cuda_runtime.h>
-
-#include <type_traits>
-
-namespace alpaka
-{
-    namespace math
-    {
-        //#############################################################################
-        //! The CUDA sqrt.
-        class SqrtCudaBuiltIn : public concepts::Implements<ConceptMathSqrt, SqrtCudaBuiltIn>
-        {
-        };
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The CUDA sqrt trait specialization.
-            template<
-                typename TArg>
-            struct Sqrt<
-                SqrtCudaBuiltIn,
-                TArg,
-                typename std::enable_if<
-                    std::is_floating_point<TArg>::value>::type>
-            {
-                __device__ static auto sqrt(
-                    SqrtCudaBuiltIn const & sqrt_ctx,
-                    TArg const & arg)
-                -> decltype(::sqrt(arg))
-                {
-                    alpaka::ignore_unused(sqrt_ctx);
-                    return ::sqrt(arg);
-                }
-            };
-            //! The CUDA sqrt float specialization.
-            template<>
-            struct Sqrt<
-                SqrtCudaBuiltIn,
-                float>
-            {
-                __device__ static auto sqrt(
-                    SqrtCudaBuiltIn const & sqrt_ctx,
-                    float const & arg)
-                -> float
-                {
-                    alpaka::ignore_unused(sqrt_ctx);
-                    return ::sqrtf(arg);
-                }
-            };
-
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/sqrt/SqrtHipBuiltIn.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/sqrt/SqrtHipBuiltIn.hpp
deleted file mode 100644
index fdf9b8ec01..0000000000
--- a/thirdParty/cupla/alpaka/include/alpaka/math/sqrt/SqrtHipBuiltIn.hpp
+++ /dev/null
@@ -1,86 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Bert Wesarg, Matthias Werner, Valentin Gehrke
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_HIP
-    #error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
-#endif
-
-#include <alpaka/math/sqrt/Traits.hpp>
-
-#include <alpaka/core/Unused.hpp>
-
-#if BOOST_COMP_NVCC >= BOOST_VERSION_NUMBER(9, 0, 0)
-    #include <cuda_runtime_api.h>
-#else
-    #if BOOST_COMP_HCC || BOOST_COMP_HIP
-        #include <math_functions.h>
-    #else
-        #include <math_functions.hpp>
-    #endif
-#endif
-
-#include <type_traits>
-
-namespace alpaka
-{
-    namespace math
-    {
-        //#############################################################################
-        //! The HIP sqrt.
-        class SqrtHipBuiltIn : public concepts::Implements<ConceptMathSqrt, SqrtHipBuiltIn>
-        {
-        };
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The HIP sqrt trait specialization.
-            template<
-                typename TArg>
-            struct Sqrt<
-                SqrtHipBuiltIn,
-                TArg,
-                typename std::enable_if<
-                    std::is_floating_point<TArg>::value>::type>
-            {
-                __device__ static auto sqrt(
-                    SqrtHipBuiltIn const & sqrt_ctx,
-                    TArg const & arg)
-                -> decltype(::sqrt(arg))
-                {
-                    alpaka::ignore_unused(sqrt_ctx);
-                    return ::sqrt(arg);
-                }
-            };
-            //! The HIP sqrt float specialization.
-            template<>
-            struct Sqrt<
-                SqrtHipBuiltIn,
-                float>
-            {
-                __device__ static auto sqrt(
-                    SqrtHipBuiltIn const & sqrt_ctx,
-                    float const & arg)
-                -> float
-                {
-                    alpaka::ignore_unused(sqrt_ctx);
-                    return ::sqrtf(arg);
-                }
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/sqrt/SqrtStdLib.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/sqrt/SqrtStdLib.hpp
index 4fb1f3eae6..7ccb5932d6 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/math/sqrt/SqrtStdLib.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/math/sqrt/SqrtStdLib.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Axel Huebl, Benjamin Worpitz
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -9,12 +9,11 @@
 
 #pragma once
 
-#include <alpaka/math/sqrt/Traits.hpp>
-
 #include <alpaka/core/Unused.hpp>
+#include <alpaka/math/sqrt/Traits.hpp>
 
-#include <type_traits>
 #include <cmath>
+#include <type_traits>
 
 namespace alpaka
 {
@@ -30,23 +29,15 @@ namespace alpaka
         {
             //#############################################################################
             //! The standard library sqrt trait specialization.
-            template<
-                typename TArg>
-            struct Sqrt<
-                SqrtStdLib,
-                TArg,
-                typename std::enable_if<
-                    std::is_arithmetic<TArg>::value>::type>
+            template<typename TArg>
+            struct Sqrt<SqrtStdLib, TArg, std::enable_if_t<std::is_arithmetic<TArg>::value>>
             {
-                ALPAKA_FN_HOST static auto sqrt(
-                    SqrtStdLib const & sqrt_ctx,
-                    TArg const & arg)
-                -> decltype(std::sqrt(arg))
+                ALPAKA_FN_HOST static auto sqrt(SqrtStdLib const& sqrt_ctx, TArg const& arg)
                 {
                     alpaka::ignore_unused(sqrt_ctx);
                     return std::sqrt(arg);
                 }
             };
-        }
-    }
-}
+        } // namespace traits
+    } // namespace math
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/sqrt/SqrtUniformCudaHipBuiltIn.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/sqrt/SqrtUniformCudaHipBuiltIn.hpp
new file mode 100644
index 0000000000..efe757e293
--- /dev/null
+++ b/thirdParty/cupla/alpaka/include/alpaka/math/sqrt/SqrtUniformCudaHipBuiltIn.hpp
@@ -0,0 +1,83 @@
+/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Bert Wesarg, Valentin Gehrke
+ *
+ * This file is part of alpaka.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#pragma once
+
+#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
+
+#    include <alpaka/core/BoostPredef.hpp>
+
+#    if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
+#        include <cuda_runtime.h>
+#        if !BOOST_LANG_CUDA
+#            error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
+#        endif
+#    endif
+
+#    if defined(ALPAKA_ACC_GPU_HIP_ENABLED)
+
+#        if BOOST_COMP_NVCC >= BOOST_VERSION_NUMBER(9, 0, 0)
+#            include <cuda_runtime_api.h>
+#        else
+#            if BOOST_COMP_HIP
+#                include <hip/math_functions.h>
+#            else
+#                include <math_functions.hpp>
+#            endif
+#        endif
+
+#        if !BOOST_LANG_HIP
+#            error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
+#        endif
+#    endif
+
+#    include <alpaka/core/Unused.hpp>
+#    include <alpaka/math/sqrt/Traits.hpp>
+
+#    include <type_traits>
+
+namespace alpaka
+{
+    namespace math
+    {
+        //#############################################################################
+        //! The CUDA/HIP built-in sqrt (this header is compiled for either the CUDA or the HIP back-end).
+        class SqrtUniformCudaHipBuiltIn : public concepts::Implements<ConceptMathSqrt, SqrtUniformCudaHipBuiltIn>
+        {
+        };
+
+        namespace traits
+        {
+            //#############################################################################
+            //! The CUDA/HIP sqrt trait specialization for floating-point TArg; device-only, forwards to ::sqrt.
+            template<typename TArg>
+            struct Sqrt<SqrtUniformCudaHipBuiltIn, TArg, std::enable_if_t<std::is_floating_point<TArg>::value>>
+            {
+                __device__ static auto sqrt(SqrtUniformCudaHipBuiltIn const& sqrt_ctx, TArg const& arg)
+                {
+                    alpaka::ignore_unused(sqrt_ctx);
+                    return ::sqrt(arg);
+                }
+            };
+            //! The CUDA/HIP sqrt float specialization; calls the single-precision ::sqrtf instead of ::sqrt.
+            template<>
+            struct Sqrt<SqrtUniformCudaHipBuiltIn, float>
+            {
+                __device__ static auto sqrt(SqrtUniformCudaHipBuiltIn const& sqrt_ctx, float const& arg) -> float
+                {
+                    alpaka::ignore_unused(sqrt_ctx);
+                    return ::sqrtf(arg);
+                }
+            };
+
+        } // namespace traits
+    } // namespace math
+} // namespace alpaka
+
+#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/sqrt/Traits.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/sqrt/Traits.hpp
index e83124fbd0..6203b51d39 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/math/sqrt/Traits.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/math/sqrt/Traits.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Benjamin Worpitz
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -12,59 +12,41 @@
 #include <alpaka/core/Common.hpp>
 #include <alpaka/core/Concepts.hpp>
 
-#include <boost/config.hpp>
-
 #include <type_traits>
 
 namespace alpaka
 {
     namespace math
     {
-        struct ConceptMathSqrt;
+        struct ConceptMathSqrt
+        {
+        };
 
         namespace traits
         {
             //#############################################################################
             //! The sqrt trait.
-            template<
-                typename T,
-                typename TArg,
-                typename TSfinae = void>
+            template<typename T, typename TArg, typename TSfinae = void>
             struct Sqrt;
-        }
+        } // namespace traits
 
         //-----------------------------------------------------------------------------
         //! Computes the square root of arg.
         //!
+        //! Valid real arguments are non-negative. For other values the result
+        //! may depend on the backend and compilation options, but will likely
+        //! be NaN.
+        //!
         //! \tparam T The type of the object specializing Sqrt.
         //! \tparam TArg The arg type.
         //! \param sqrt_ctx The object specializing Sqrt.
         //! \param arg The arg.
         ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            typename T,
-            typename TArg>
-        ALPAKA_FN_HOST_ACC auto sqrt(
-            T const & sqrt_ctx,
-            TArg const & arg)
-#ifdef BOOST_NO_CXX14_RETURN_TYPE_DEDUCTION
-        -> decltype(
-            traits::Sqrt<
-                concepts::ImplementationBase<ConceptMathSqrt, T>,
-                TArg>
-            ::sqrt(
-                sqrt_ctx,
-                arg))
-#endif
+        template<typename T, typename TArg>
+        ALPAKA_FN_HOST_ACC auto sqrt(T const& sqrt_ctx, TArg const& arg)
         {
             using ImplementationBase = concepts::ImplementationBase<ConceptMathSqrt, T>;
-            return
-                traits::Sqrt<
-                    ImplementationBase,
-                    TArg>
-                ::sqrt(
-                    sqrt_ctx,
-                    arg);
+            return traits::Sqrt<ImplementationBase, TArg>::sqrt(sqrt_ctx, arg);
         }
-    }
-}
+    } // namespace math
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/tan/TanCudaBuiltIn.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/tan/TanCudaBuiltIn.hpp
deleted file mode 100644
index 96691b2410..0000000000
--- a/thirdParty/cupla/alpaka/include/alpaka/math/tan/TanCudaBuiltIn.hpp
+++ /dev/null
@@ -1,78 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Bert Wesarg
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_CUDA
-    #error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
-#endif
-
-#include <alpaka/math/tan/Traits.hpp>
-
-#include <alpaka/core/Unused.hpp>
-
-#include <cuda_runtime.h>
-
-#include <type_traits>
-
-namespace alpaka
-{
-    namespace math
-    {
-        //#############################################################################
-        //! The CUDA tan.
-        class TanCudaBuiltIn : public concepts::Implements<ConceptMathTan, TanCudaBuiltIn>
-        {
-        };
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The CUDA tan trait specialization.
-            template<
-                typename TArg>
-            struct Tan<
-                TanCudaBuiltIn,
-                TArg,
-                typename std::enable_if<
-                    std::is_floating_point<TArg>::value>::type>
-            {
-                __device__ static auto tan(
-                    TanCudaBuiltIn const & tan_ctx,
-                    TArg const & arg)
-                -> decltype(::tan(arg))
-                {
-                    alpaka::ignore_unused(tan_ctx);
-                    return ::tan(arg);
-                }
-            };
-            //! The CUDA tan float specialization.
-            template<>
-            struct Tan<
-                TanCudaBuiltIn,
-                float>
-            {
-                __device__ static auto tan(
-                    TanCudaBuiltIn const & tan_ctx,
-                    float const & arg)
-                -> float
-                {
-                    alpaka::ignore_unused(tan_ctx);
-                    return ::tanf(arg);
-                }
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/tan/TanHipBuiltIn.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/tan/TanHipBuiltIn.hpp
deleted file mode 100644
index d6e6d1deb6..0000000000
--- a/thirdParty/cupla/alpaka/include/alpaka/math/tan/TanHipBuiltIn.hpp
+++ /dev/null
@@ -1,86 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Bert Wesarg, Matthias Werner
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_HIP
-    #error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
-#endif
-
-#include <alpaka/math/tan/Traits.hpp>
-
-#include <alpaka/core/Unused.hpp>
-
-#if BOOST_COMP_NVCC >= BOOST_VERSION_NUMBER(9, 0, 0)
-    #include <cuda_runtime_api.h>
-#else
-    #if BOOST_COMP_HCC || BOOST_COMP_HIP
-        #include <math_functions.h>
-    #else
-        #include <math_functions.hpp>
-    #endif
-#endif
-
-#include <type_traits>
-
-namespace alpaka
-{
-    namespace math
-    {
-        //#############################################################################
-        //! The HIP tan.
-        class TanHipBuiltIn : public concepts::Implements<ConceptMathTan, TanHipBuiltIn>
-        {
-        };
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The HIP tan trait specialization.
-            template<
-                typename TArg>
-            struct Tan<
-                TanHipBuiltIn,
-                TArg,
-                typename std::enable_if<
-                    std::is_floating_point<TArg>::value>::type>
-            {
-                __device__ static auto tan(
-                    TanHipBuiltIn const & tan_ctx,
-                    TArg const & arg)
-                -> decltype(::tan(arg))
-                {
-                    alpaka::ignore_unused(tan_ctx);
-                    return ::tan(arg);
-                }
-            };
-            //! The HIP tan float specialization.
-            template<>
-            struct Tan<
-                TanHipBuiltIn,
-                float>
-            {
-                __device__ static auto tan(
-                    TanHipBuiltIn const & tan_ctx,
-                    float const & arg)
-                -> float
-                {
-                    alpaka::ignore_unused(tan_ctx);
-                    return ::tanf(arg);
-                }
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/tan/TanStdLib.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/tan/TanStdLib.hpp
index f7453360bb..8b06f2eaac 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/math/tan/TanStdLib.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/math/tan/TanStdLib.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Axel Huebl, Benjamin Worpitz
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -9,12 +9,11 @@
 
 #pragma once
 
-#include <alpaka/math/tan/Traits.hpp>
-
 #include <alpaka/core/Unused.hpp>
+#include <alpaka/math/tan/Traits.hpp>
 
-#include <type_traits>
 #include <cmath>
+#include <type_traits>
 
 namespace alpaka
 {
@@ -30,23 +29,15 @@ namespace alpaka
         {
             //#############################################################################
             //! The standard library tan trait specialization.
-            template<
-                typename TArg>
-            struct Tan<
-                TanStdLib,
-                TArg,
-                typename std::enable_if<
-                    std::is_arithmetic<TArg>::value>::type>
+            template<typename TArg>
+            struct Tan<TanStdLib, TArg, std::enable_if_t<std::is_arithmetic<TArg>::value>>
             {
-                ALPAKA_FN_HOST static auto tan(
-                    TanStdLib const & tan_ctx,
-                    TArg const & arg)
-                -> decltype(std::tan(arg))
+                ALPAKA_FN_HOST static auto tan(TanStdLib const& tan_ctx, TArg const& arg)
                 {
                     alpaka::ignore_unused(tan_ctx);
                     return std::tan(arg);
                 }
             };
-        }
-    }
-}
+        } // namespace traits
+    } // namespace math
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/tan/TanUniformCudaHipBuiltIn.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/tan/TanUniformCudaHipBuiltIn.hpp
new file mode 100644
index 0000000000..00811fefab
--- /dev/null
+++ b/thirdParty/cupla/alpaka/include/alpaka/math/tan/TanUniformCudaHipBuiltIn.hpp
@@ -0,0 +1,82 @@
+/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Bert Wesarg
+ *
+ * This file is part of alpaka.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#pragma once
+
+#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
+
+#    include <alpaka/core/BoostPredef.hpp>
+
+#    if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
+#        include <cuda_runtime.h>
+#        if !BOOST_LANG_CUDA
+#            error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
+#        endif
+#    endif
+
+#    if defined(ALPAKA_ACC_GPU_HIP_ENABLED)
+
+#        if BOOST_COMP_NVCC >= BOOST_VERSION_NUMBER(9, 0, 0)
+#            include <cuda_runtime_api.h>
+#        else
+#            if BOOST_COMP_HIP
+#                include <hip/math_functions.h>
+#            else
+#                include <math_functions.hpp>
+#            endif
+#        endif
+
+#        if !BOOST_LANG_HIP
+#            error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
+#        endif
+#    endif
+
+#    include <alpaka/core/Unused.hpp>
+#    include <alpaka/math/tan/Traits.hpp>
+
+#    include <type_traits>
+
+namespace alpaka
+{
+    namespace math
+    {
+        //#############################################################################
+        //! The CUDA/HIP built-in tan (this header is compiled for either the CUDA or the HIP back-end).
+        class TanUniformCudaHipBuiltIn : public concepts::Implements<ConceptMathTan, TanUniformCudaHipBuiltIn>
+        {
+        };
+
+        namespace traits
+        {
+            //#############################################################################
+            //! The CUDA/HIP tan trait specialization for floating-point TArg; device-only, forwards to ::tan.
+            template<typename TArg>
+            struct Tan<TanUniformCudaHipBuiltIn, TArg, std::enable_if_t<std::is_floating_point<TArg>::value>>
+            {
+                __device__ static auto tan(TanUniformCudaHipBuiltIn const& tan_ctx, TArg const& arg)
+                {
+                    alpaka::ignore_unused(tan_ctx);
+                    return ::tan(arg);
+                }
+            };
+            //! The CUDA/HIP tan float specialization; calls the single-precision ::tanf instead of ::tan.
+            template<>
+            struct Tan<TanUniformCudaHipBuiltIn, float>
+            {
+                __device__ static auto tan(TanUniformCudaHipBuiltIn const& tan_ctx, float const& arg) -> float
+                {
+                    alpaka::ignore_unused(tan_ctx);
+                    return ::tanf(arg);
+                }
+            };
+        } // namespace traits
+    } // namespace math
+} // namespace alpaka
+
+#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/tan/Traits.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/tan/Traits.hpp
index d366d5f336..fa56116055 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/math/tan/Traits.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/math/tan/Traits.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Benjamin Worpitz
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -12,26 +12,23 @@
 #include <alpaka/core/Common.hpp>
 #include <alpaka/core/Concepts.hpp>
 
-#include <boost/config.hpp>
-
 #include <type_traits>
 
 namespace alpaka
 {
     namespace math
     {
-        struct ConceptMathTan;
+        struct ConceptMathTan
+        {
+        };
 
         namespace traits
         {
             //#############################################################################
             //! The tan trait.
-            template<
-                typename T,
-                typename TArg,
-                typename TSfinae = void>
+            template<typename T, typename TArg, typename TSfinae = void>
             struct Tan;
-        }
+        } // namespace traits
 
         //-----------------------------------------------------------------------------
         //! Computes the tangent (measured in radians).
@@ -41,30 +38,11 @@ namespace alpaka
         //! \param tan_ctx The object specializing Tan.
         //! \param arg The arg.
         ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            typename T,
-            typename TArg>
-        ALPAKA_FN_HOST_ACC auto tan(
-            T const & tan_ctx,
-            TArg const & arg)
-#ifdef BOOST_NO_CXX14_RETURN_TYPE_DEDUCTION
-        -> decltype(
-            traits::Tan<
-                concepts::ImplementationBase<ConceptMathTan, T>,
-                TArg>
-            ::tan(
-                tan_ctx,
-                arg))
-#endif
+        template<typename T, typename TArg>
+        ALPAKA_FN_HOST_ACC auto tan(T const& tan_ctx, TArg const& arg)
         {
             using ImplementationBase = concepts::ImplementationBase<ConceptMathTan, T>;
-            return
-                traits::Tan<
-                    ImplementationBase,
-                    TArg>
-                ::tan(
-                    tan_ctx,
-                    arg);
+            return traits::Tan<ImplementationBase, TArg>::tan(tan_ctx, arg);
         }
-    }
-}
+    } // namespace math
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/trunc/Traits.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/trunc/Traits.hpp
index 2444acb810..111f495ad1 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/math/trunc/Traits.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/math/trunc/Traits.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Benjamin Worpitz
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -12,26 +12,23 @@
 #include <alpaka/core/Common.hpp>
 #include <alpaka/core/Concepts.hpp>
 
-#include <boost/config.hpp>
-
 #include <type_traits>
 
 namespace alpaka
 {
     namespace math
     {
-        struct ConceptMathTrunc;
+        struct ConceptMathTrunc
+        {
+        };
 
         namespace traits
         {
             //#############################################################################
             //! The trunc trait.
-            template<
-                typename T,
-                typename TArg,
-                typename TSfinae = void>
+            template<typename T, typename TArg, typename TSfinae = void>
             struct Trunc;
-        }
+        } // namespace traits
 
         //-----------------------------------------------------------------------------
         //! Computes the nearest integer not greater in magnitude than arg.
@@ -41,30 +38,11 @@ namespace alpaka
         //! \param trunc_ctx The object specializing Trunc.
         //! \param arg The arg.
         ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            typename T,
-            typename TArg>
-        ALPAKA_FN_HOST_ACC auto trunc(
-            T const & trunc_ctx,
-            TArg const & arg)
-#ifdef BOOST_NO_CXX14_RETURN_TYPE_DEDUCTION
-        -> decltype(
-            traits::Trunc<
-                concepts::ImplementationBase<ConceptMathTrunc, T>,
-                TArg>
-            ::trunc(
-                trunc_ctx,
-                arg))
-#endif
+        template<typename T, typename TArg>
+        ALPAKA_FN_HOST_ACC auto trunc(T const& trunc_ctx, TArg const& arg)
         {
             using ImplementationBase = concepts::ImplementationBase<ConceptMathTrunc, T>;
-            return
-                traits::Trunc<
-                    ImplementationBase,
-                    TArg>
-                ::trunc(
-                    trunc_ctx,
-                    arg);
+            return traits::Trunc<ImplementationBase, TArg>::trunc(trunc_ctx, arg);
         }
-    }
-}
+    } // namespace math
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/trunc/TruncCudaBuiltIn.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/trunc/TruncCudaBuiltIn.hpp
deleted file mode 100644
index b0febf6e16..0000000000
--- a/thirdParty/cupla/alpaka/include/alpaka/math/trunc/TruncCudaBuiltIn.hpp
+++ /dev/null
@@ -1,78 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Bert Wesarg, René Widera
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_CUDA
-    #error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
-#endif
-
-#include <alpaka/math/trunc/Traits.hpp>
-
-#include <alpaka/core/Unused.hpp>
-
-#include <cuda_runtime.h>
-
-#include <type_traits>
-
-namespace alpaka
-{
-    namespace math
-    {
-        //#############################################################################
-        //! The CUDA trunc.
-        class TruncCudaBuiltIn : public concepts::Implements<ConceptMathTrunc, TruncCudaBuiltIn>
-        {
-        };
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The CUDA trunc trait specialization.
-            template<
-                typename TArg>
-            struct Trunc<
-                TruncCudaBuiltIn,
-                TArg,
-                typename std::enable_if<
-                    std::is_floating_point<TArg>::value>::type>
-            {
-                __device__ static auto trunc(
-                    TruncCudaBuiltIn const & trunc_ctx,
-                    TArg const & arg)
-                -> decltype(::trunc(arg))
-                {
-                    alpaka::ignore_unused(trunc_ctx);
-                    return ::trunc(arg);
-                }
-            };
-            //! The CUDA trunc float specialization.
-            template<>
-            struct Trunc<
-                TruncCudaBuiltIn,
-                float>
-            {
-                __device__ static auto trunc(
-                    TruncCudaBuiltIn const & trunc_ctx,
-                    float const & arg)
-                -> float
-                {
-                    alpaka::ignore_unused(trunc_ctx);
-                    return ::truncf(arg);
-                }
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/trunc/TruncHipBuiltIn.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/trunc/TruncHipBuiltIn.hpp
deleted file mode 100644
index 2618199613..0000000000
--- a/thirdParty/cupla/alpaka/include/alpaka/math/trunc/TruncHipBuiltIn.hpp
+++ /dev/null
@@ -1,86 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Bert Wesarg, Matthias Werner, René Widera
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_HIP
-    #error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
-#endif
-
-#include <alpaka/math/trunc/Traits.hpp>
-
-#include <alpaka/core/Unused.hpp>
-
-#if BOOST_COMP_NVCC >= BOOST_VERSION_NUMBER(9, 0, 0)
-    #include <cuda_runtime_api.h>
-#else
-    #if BOOST_COMP_HCC || BOOST_COMP_HIP
-        #include <math_functions.h>
-    #else
-        #include <math_functions.hpp>
-    #endif
-#endif
-
-#include <type_traits>
-
-namespace alpaka
-{
-    namespace math
-    {
-        //#############################################################################
-        //! The HIP trunc.
-        class TruncHipBuiltIn : public concepts::Implements<ConceptMathTrunc, TruncHipBuiltIn>
-        {
-        };
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The HIP trunc trait specialization.
-            template<
-                typename TArg>
-            struct Trunc<
-                TruncHipBuiltIn,
-                TArg,
-                typename std::enable_if<
-                    std::is_floating_point<TArg>::value>::type>
-            {
-                __device__ static auto trunc(
-                    TruncHipBuiltIn const & trunc_ctx,
-                    TArg const & arg)
-                -> decltype(::trunc(arg))
-                {
-                    alpaka::ignore_unused(trunc_ctx);
-                    return ::trunc(arg);
-                }
-            };
-            //! The HIP trunc float specialization.
-            template<>
-            struct Trunc<
-                TruncHipBuiltIn,
-                float>
-            {
-                __device__ static auto trunc(
-                    TruncHipBuiltIn const & trunc_ctx,
-                    float const & arg)
-                -> float
-                {
-                    alpaka::ignore_unused(trunc_ctx);
-                    return ::truncf(arg);
-                }
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/trunc/TruncStdLib.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/trunc/TruncStdLib.hpp
index 81059a0b47..615fea514e 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/math/trunc/TruncStdLib.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/math/trunc/TruncStdLib.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Axel Huebl, Benjamin Worpitz
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -9,12 +9,11 @@
 
 #pragma once
 
-#include <alpaka/math/trunc/Traits.hpp>
-
 #include <alpaka/core/Unused.hpp>
+#include <alpaka/math/trunc/Traits.hpp>
 
-#include <type_traits>
 #include <cmath>
+#include <type_traits>
 
 namespace alpaka
 {
@@ -30,23 +29,15 @@ namespace alpaka
         {
             //#############################################################################
             //! The standard library trunc trait specialization.
-            template<
-                typename TArg>
-            struct Trunc<
-                TruncStdLib,
-                TArg,
-                typename std::enable_if<
-                    std::is_arithmetic<TArg>::value>::type>
+            template<typename TArg>
+            struct Trunc<TruncStdLib, TArg, std::enable_if_t<std::is_arithmetic<TArg>::value>>
             {
-                ALPAKA_FN_HOST static auto trunc(
-                    TruncStdLib const & trunc_ctx,
-                    TArg const & arg)
-                -> decltype(std::trunc(arg))
+                ALPAKA_FN_HOST static auto trunc(TruncStdLib const& trunc_ctx, TArg const& arg)
                 {
                     alpaka::ignore_unused(trunc_ctx);
                     return std::trunc(arg);
                 }
             };
-        }
-    }
-}
+        } // namespace traits
+    } // namespace math
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/include/alpaka/math/trunc/TruncUniformCudaHipBuiltIn.hpp b/thirdParty/cupla/alpaka/include/alpaka/math/trunc/TruncUniformCudaHipBuiltIn.hpp
new file mode 100644
index 0000000000..1a37d10d5d
--- /dev/null
+++ b/thirdParty/cupla/alpaka/include/alpaka/math/trunc/TruncUniformCudaHipBuiltIn.hpp
@@ -0,0 +1,82 @@
+/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Bert Wesarg, René Widera
+ *
+ * This file is part of alpaka.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#pragma once
+
+#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
+
+#    include <alpaka/core/BoostPredef.hpp>
+
+#    if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
+#        include <cuda_runtime.h>
+#        if !BOOST_LANG_CUDA
+#            error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
+#        endif
+#    endif
+
+#    if defined(ALPAKA_ACC_GPU_HIP_ENABLED)
+
+#        if BOOST_COMP_NVCC >= BOOST_VERSION_NUMBER(9, 0, 0)
+#            include <cuda_runtime_api.h>
+#        else
+#            if BOOST_COMP_HIP
+#                include <hip/math_functions.h>
+#            else
+#                include <math_functions.hpp>
+#            endif
+#        endif
+
+#        if !BOOST_LANG_HIP
+#            error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
+#        endif
+#    endif
+
+#    include <alpaka/core/Unused.hpp>
+#    include <alpaka/math/trunc/Traits.hpp>
+
+#    include <type_traits>
+
+namespace alpaka
+{
+    namespace math
+    {
+        //#############################################################################
+        //! The CUDA/HIP built-in trunc (this header is compiled for either the CUDA or the HIP back-end).
+        class TruncUniformCudaHipBuiltIn : public concepts::Implements<ConceptMathTrunc, TruncUniformCudaHipBuiltIn>
+        {
+        };
+
+        namespace traits
+        {
+            //#############################################################################
+            //! The CUDA/HIP trunc trait specialization for floating-point TArg; device-only, forwards to ::trunc.
+            template<typename TArg>
+            struct Trunc<TruncUniformCudaHipBuiltIn, TArg, std::enable_if_t<std::is_floating_point<TArg>::value>>
+            {
+                __device__ static auto trunc(TruncUniformCudaHipBuiltIn const& trunc_ctx, TArg const& arg)
+                {
+                    alpaka::ignore_unused(trunc_ctx);
+                    return ::trunc(arg);
+                }
+            };
+            //! The CUDA/HIP trunc float specialization; calls the single-precision ::truncf instead of ::trunc.
+            template<>
+            struct Trunc<TruncUniformCudaHipBuiltIn, float>
+            {
+                __device__ static auto trunc(TruncUniformCudaHipBuiltIn const& trunc_ctx, float const& arg) -> float
+                {
+                    alpaka::ignore_unused(trunc_ctx);
+                    return ::truncf(arg);
+                }
+            };
+        } // namespace traits
+    } // namespace math
+} // namespace alpaka
+
+#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/mem/alloc/AllocCpuAligned.hpp b/thirdParty/cupla/alpaka/include/alpaka/mem/alloc/AllocCpuAligned.hpp
new file mode 100644
index 0000000000..d9ab2b0559
--- /dev/null
+++ b/thirdParty/cupla/alpaka/include/alpaka/mem/alloc/AllocCpuAligned.hpp
@@ -0,0 +1,74 @@
+/* Copyright 2019 Axel Huebl, Benjamin Worpitz
+ *
+ * This file is part of alpaka.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#pragma once
+
+#include <alpaka/core/AlignedAlloc.hpp>
+#include <alpaka/core/Common.hpp>
+#include <alpaka/core/Unused.hpp>
+#include <alpaka/mem/alloc/Traits.hpp>
+
+#include <algorithm>
+
+namespace alpaka
+{
+    //#############################################################################
+    //! The CPU aligned allocator.
+    //!
+    //! \tparam TAlignment An integral constant containing the alignment.
+    template<typename TAlignment>
+    class AllocCpuAligned : public concepts::Implements<ConceptMemAlloc, AllocCpuAligned<TAlignment>>
+    {
+    };
+
+    namespace traits
+    {
+        //#############################################################################
+        //! The CPU aligned allocator memory allocation trait specialization.
+        template<typename T, typename TAlignment>
+        struct Malloc<T, AllocCpuAligned<TAlignment>>
+        {
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto malloc(AllocCpuAligned<TAlignment> const& alloc, std::size_t const& sizeElems)
+                -> T*
+            {
+#if(defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && BOOST_LANG_CUDA) || (defined(ALPAKA_ACC_GPU_HIP_ENABLED) && BOOST_LANG_HIP)
+                // For CUDA, host memory must be aligned to 4 KiB to be pinned with `cudaHostRegister`;
+                // this was described in older programming guides but was removed later.
+                // From testing with PIConGPU and cuda-memcheck we found out that the alignment is still required.
+                //
+                // For HIP the required alignment is the size of a cache line.
+                // https://rocm-developer-tools.github.io/HIP/group__Memory.html#gab8258f051e1a1f7385f794a15300e674
+                // To avoid issues with HIP(cuda), the alignment is also set to 4 KiB
+                // for HIP(clang).
+                // @todo evaluate requirements when the HIP ecosystem is more stable
+                constexpr size_t minAlignement = 4096;
+#else
+                constexpr size_t minAlignement = TAlignment::value; // no additional minimum alignment
+#endif
+                alpaka::ignore_unused(alloc);
+                return reinterpret_cast<T*>(
+                    core::alignedAlloc(std::max(TAlignment::value, minAlignement), sizeElems * sizeof(T)));
+            }
+        };
+
+        //#############################################################################
+        //! The CPU aligned allocator memory free trait specialization.
+        template<typename T, typename TAlignment>
+        struct Free<T, AllocCpuAligned<TAlignment>>
+        {
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto free(AllocCpuAligned<TAlignment> const& alloc, T const* const ptr) -> void
+            {
+                alpaka::ignore_unused(alloc);
+                core::alignedFree(const_cast<void*>(reinterpret_cast<void const*>(ptr)));
+            }
+        };
+    } // namespace traits
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/include/alpaka/mem/alloc/AllocCpuBoostAligned.hpp b/thirdParty/cupla/alpaka/include/alpaka/mem/alloc/AllocCpuBoostAligned.hpp
deleted file mode 100644
index a89ef98eba..0000000000
--- a/thirdParty/cupla/alpaka/include/alpaka/mem/alloc/AllocCpuBoostAligned.hpp
+++ /dev/null
@@ -1,101 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#include <alpaka/mem/alloc/Traits.hpp>
-
-#include <alpaka/core/Common.hpp>
-#include <alpaka/core/Unused.hpp>
-
-#include <boost/align.hpp>
-
-#include <algorithm>
-
-namespace alpaka
-{
-    namespace mem
-    {
-        //-----------------------------------------------------------------------------
-        //! The allocator specifics.
-        namespace alloc
-        {
-            //#############################################################################
-            //! The CPU boost aligned allocator.
-            //!
-            //! \tparam TAlignment An integral constant containing the alignment.
-            template<
-                typename TAlignment>
-            class AllocCpuBoostAligned : public concepts::Implements<ConceptMemAlloc, AllocCpuBoostAligned<TAlignment>>
-            {
-            };
-
-            namespace traits
-            {
-                //#############################################################################
-                //! The CPU boost aligned allocator memory allocation trait specialization.
-                template<
-                    typename T,
-                    typename TAlignment>
-                struct Alloc<
-                    T,
-                    AllocCpuBoostAligned<TAlignment>>
-                {
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST static auto alloc(
-                        AllocCpuBoostAligned<TAlignment> const & alloc,
-                        std::size_t const & sizeElems)
-                    -> T *
-                    {
-#if (defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && BOOST_LANG_CUDA) || (defined(ALPAKA_ACC_GPU_HIP_ENABLED) && BOOST_LANG_HIP)
-                        // For CUDA host memory must be aligned to 4 kib to pin it with `cudaHostRegister`,
-                        // this was described in older programming guides but was removed later.
-                        // From testing with PIConGPU and cuda-memcheck we found out that the alignment is still required.
-                        //
-                        // For HIP the required alignment is the size of a cache line.
-                        // https://rocm-developer-tools.github.io/HIP/group__Memory.html#gab8258f051e1a1f7385f794a15300e674
-                        // To avoid issues with HIP(cuda) the alignment will be set also for HIP(clang/hcc)
-                        // to 4kib.
-                        // @todo evaluate requirements when the HIP ecosystem is more stable
-                        constexpr size_t minAlignement = 4096;
-#else
-                        constexpr size_t minAlignement = TAlignment::value;
-#endif
-                        alpaka::ignore_unused(alloc);
-                        return
-                            reinterpret_cast<T *>(
-                                boost::alignment::aligned_alloc(std::max(TAlignment::value, minAlignement), sizeElems * sizeof(T)));
-                    }
-                };
-
-                //#############################################################################
-                //! The CPU boost aligned allocator memory free trait specialization.
-                template<
-                    typename T,
-                    typename TAlignment>
-                struct Free<
-                    T,
-                    AllocCpuBoostAligned<TAlignment>>
-                {
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST static auto free(
-                        AllocCpuBoostAligned<TAlignment> const & alloc,
-                        T const * const ptr)
-                    -> void
-                    {
-                        alpaka::ignore_unused(alloc);
-                            boost::alignment::aligned_free(
-                                const_cast<void *>(
-                                    reinterpret_cast<void const *>(ptr)));
-                    }
-                };
-            }
-        }
-    }
-}
diff --git a/thirdParty/cupla/alpaka/include/alpaka/mem/alloc/AllocCpuNew.hpp b/thirdParty/cupla/alpaka/include/alpaka/mem/alloc/AllocCpuNew.hpp
index badd8f5608..9afce8e4c7 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/mem/alloc/AllocCpuNew.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/mem/alloc/AllocCpuNew.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Axel Huebl, Benjamin Worpitz
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -9,65 +9,44 @@
 
 #pragma once
 
-#include <alpaka/mem/alloc/Traits.hpp>
-
 #include <alpaka/core/Common.hpp>
 #include <alpaka/core/Unused.hpp>
+#include <alpaka/mem/alloc/Traits.hpp>
 
 namespace alpaka
 {
-    namespace mem
+    //#############################################################################
+    //! The CPU new allocator.
+    class AllocCpuNew : public concepts::Implements<ConceptMemAlloc, AllocCpuNew>
+    {
+    };
+
+    namespace traits
     {
-        //-----------------------------------------------------------------------------
-        //! The allocator specifics.
-        namespace alloc
+        //#############################################################################
+        //! The CPU new allocator memory allocation trait specialization.
+        template<typename T>
+        struct Malloc<T, AllocCpuNew>
         {
-            //#############################################################################
-            //! The CPU new allocator.
-            class AllocCpuNew : public concepts::Implements<ConceptMemAlloc, AllocCpuNew>
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto malloc(AllocCpuNew const& alloc, std::size_t const& sizeElems) -> T*
             {
-            };
+                alpaka::ignore_unused(alloc);
+                return new T[sizeElems];
+            }
+        };
 
-            namespace traits
+        //#############################################################################
+        //! The CPU new allocator memory free trait specialization.
+        template<typename T>
+        struct Free<T, AllocCpuNew>
+        {
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto free(AllocCpuNew const& alloc, T const* const ptr) -> void
             {
-                //#############################################################################
-                //! The CPU new allocator memory allocation trait specialization.
-                template<
-                    typename T>
-                struct Alloc<
-                    T,
-                    AllocCpuNew>
-                {
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST static auto alloc(
-                        AllocCpuNew const & alloc,
-                        std::size_t const & sizeElems)
-                    -> T *
-                    {
-                        alpaka::ignore_unused(alloc);
-                        return new T[sizeElems];
-                    }
-                };
-
-                //#############################################################################
-                //! The CPU new allocator memory free trait specialization.
-                template<
-                    typename T>
-                struct Free<
-                    T,
-                    AllocCpuNew>
-                {
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST static auto free(
-                        AllocCpuNew const & alloc,
-                        T const * const ptr)
-                    -> void
-                    {
-                        alpaka::ignore_unused(alloc);
-                        return delete[] ptr;
-                    }
-                };
+                alpaka::ignore_unused(alloc);
+                return delete[] ptr;
             }
-        }
-    }
-}
+        };
+    } // namespace traits
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/include/alpaka/mem/alloc/Traits.hpp b/thirdParty/cupla/alpaka/include/alpaka/mem/alloc/Traits.hpp
index 90ab18dcaf..cf0f32701d 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/mem/alloc/Traits.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/mem/alloc/Traits.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Benjamin Worpitz, Matthias Werner
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -9,82 +9,48 @@
 
 #pragma once
 
+#include <alpaka/core/Common.hpp>
+#include <alpaka/core/Concepts.hpp>
 #include <alpaka/dev/Traits.hpp>
 #include <alpaka/dim/Traits.hpp>
 #include <alpaka/extent/Traits.hpp>
 
-#include <alpaka/core/Common.hpp>
-#include <alpaka/core/Concepts.hpp>
-
 namespace alpaka
 {
-    namespace mem
+    struct ConceptMemAlloc
     {
-        //-----------------------------------------------------------------------------
-        //! The allocator specifics.
-        namespace alloc
-        {
-            struct ConceptMemAlloc;
-
-            //-----------------------------------------------------------------------------
-            //! The allocator traits.
-            namespace traits
-            {
-                //#############################################################################
-                //! The memory allocation trait.
-                template<
-                    typename T,
-                    typename TAlloc,
-                    typename TSfinae = void>
-                struct Alloc;
+    };
 
-                //#############################################################################
-                //! The memory free trait.
-                template<
-                    typename T,
-                    typename TAlloc,
-                    typename TSfinae = void>
-                struct Free;
-            }
-
-            //-----------------------------------------------------------------------------
-            //! \return The pointer to the allocated memory.
-            template<
-                typename T,
-                typename TAlloc>
-            ALPAKA_FN_HOST auto alloc(
-                TAlloc const & alloc,
-                std::size_t const & sizeElems)
-            -> T *
-            {
-                using ImplementationBase = concepts::ImplementationBase<ConceptMemAlloc, TAlloc>;
-                return
-                    traits::Alloc<
-                        T,
-                        ImplementationBase>
-                    ::alloc(
-                        alloc,
-                        sizeElems);
-            }
+    //-----------------------------------------------------------------------------
+    //! The allocator traits.
+    namespace traits
+    {
+        //#############################################################################
+        //! The memory allocation trait.
+        template<typename T, typename TAlloc, typename TSfinae = void>
+        struct Malloc;
+
+        //#############################################################################
+        //! The memory free trait.
+        template<typename T, typename TAlloc, typename TSfinae = void>
+        struct Free;
+    } // namespace traits
+
+    //-----------------------------------------------------------------------------
+    //! \return The pointer to the allocated memory, as produced by traits::Malloc for the allocator.
+    template<typename T, typename TAlloc>
+    ALPAKA_FN_HOST auto malloc(TAlloc const& alloc, std::size_t const& sizeElems) -> T*
+    {
+        using ImplementationBase = concepts::ImplementationBase<ConceptMemAlloc, TAlloc>;
+        return traits::Malloc<T, ImplementationBase>::malloc(alloc, sizeElems);
+    }
 
-            //-----------------------------------------------------------------------------
-            //! Frees the memory identified by the given pointer.
-            template<
-                typename TAlloc,
-                typename T>
-            ALPAKA_FN_HOST auto free(
-                TAlloc const & alloc,
-                T const * const ptr)
-            -> void
-            {
-                using ImplementationBase = concepts::ImplementationBase<ConceptMemAlloc, TAlloc>;
-                traits::Free<
-                    T,
-                    ImplementationBase>
-                ::free(
-                    alloc,
-                    ptr);
-            }
-        }
+    //-----------------------------------------------------------------------------
+    //! Frees the memory identified by the given pointer by dispatching to traits::Free for the allocator.
+    template<typename TAlloc, typename T>
+    ALPAKA_FN_HOST auto free(TAlloc const& alloc, T const* const ptr) -> void
+    {
+        using ImplementationBase = concepts::ImplementationBase<ConceptMemAlloc, TAlloc>;
+        traits::Free<T, ImplementationBase>::free(alloc, ptr);
     }
-}
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/include/alpaka/mem/buf/BufCpu.hpp b/thirdParty/cupla/alpaka/include/alpaka/mem/buf/BufCpu.hpp
index d373f77736..20b304c3ee 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/mem/buf/BufCpu.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/mem/buf/BufCpu.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Alexander Matthes, Axel Huebl, Benjamin Worpitz
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -9,654 +9,449 @@
 
 #pragma once
 
-#include <alpaka/core/Vectorize.hpp>
 #include <alpaka/core/Unused.hpp>
+#include <alpaka/core/Vectorize.hpp>
 #include <alpaka/dev/DevCpu.hpp>
-
 #include <alpaka/dev/Traits.hpp>
 #include <alpaka/mem/buf/Traits.hpp>
-
 #include <alpaka/vec/Vec.hpp>
 
-// \TODO: Remove CUDA inclusion for BufCpu by replacing pinning with non CUDA code!
-#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && BOOST_LANG_CUDA
-    #include <alpaka/core/Cuda.hpp>
+// Backend specific includes.
+#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
+#    include <alpaka/core/Cuda.hpp>
+#elif defined(ALPAKA_ACC_GPU_HIP_ENABLED)
+#    include <alpaka/core/Hip.hpp>
 #endif
 
-#include <alpaka/mem/alloc/AllocCpuBoostAligned.hpp>
-
+#include <alpaka/mem/alloc/AllocCpuAligned.hpp>
 #include <alpaka/meta/DependentFalseType.hpp>
 
 #include <memory>
+#include <type_traits>
 
 namespace alpaka
 {
-    namespace mem
+    namespace detail
     {
-        namespace buf
+        //#############################################################################
+        //! The CPU memory buffer.
+        template<typename TElem, typename TDim, typename TIdx>
+        class BufCpuImpl final
+            : public AllocCpuAligned<std::integral_constant<std::size_t, core::vectorization::defaultAlignment>>
         {
-            namespace cpu
-            {
-                namespace detail
-                {
-                    //#############################################################################
-                    //! The CPU memory buffer.
-                    template<
-                        typename TElem,
-                        typename TDim,
-                        typename TIdx>
-                    class BufCpuImpl final :
-                        public mem::alloc::AllocCpuBoostAligned<std::integral_constant<std::size_t, core::vectorization::defaultAlignment>>
-                    {
-                        static_assert(
-                            !std::is_const<TElem>::value,
-                            "The elem type of the buffer can not be const because the C++ Standard forbids containers of const elements!");
-                        static_assert(
-                            !std::is_const<TIdx>::value,
-                            "The idx type of the buffer can not be const!");
-                    public:
-                        //-----------------------------------------------------------------------------
-                        template<
-                            typename TExtent>
-                        ALPAKA_FN_HOST BufCpuImpl(
-                            dev::DevCpu const & dev,
-                            TExtent const & extent) :
-                                mem::alloc::AllocCpuBoostAligned<std::integral_constant<std::size_t, core::vectorization::defaultAlignment>>(),
-                                m_dev(dev),
-                                m_extentElements(extent::getExtentVecEnd<TDim>(extent)),
-                                m_pMem(mem::alloc::alloc<TElem>(*this, static_cast<std::size_t>(computeElementCount(extent)))),
-                                m_pitchBytes(static_cast<TIdx>(extent::getWidth(extent) * static_cast<TIdx>(sizeof(TElem))))
-#if (defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && BOOST_LANG_CUDA) || (defined(ALPAKA_ACC_GPU_HIP_ENABLED) && BOOST_LANG_HIP)
-                                ,m_bPinned(false)
+            static_assert(
+                !std::is_const<TElem>::value,
+                "The elem type of the buffer can not be const because the C++ Standard forbids containers of const "
+                "elements!");
+            static_assert(!std::is_const<TIdx>::value, "The idx type of the buffer can not be const!");
+
+        public:
+            //-----------------------------------------------------------------------------
+            template<typename TExtent>
+            ALPAKA_FN_HOST BufCpuImpl(DevCpu const& dev, TExtent const& extent)
+                : AllocCpuAligned<std::integral_constant<std::size_t, core::vectorization::defaultAlignment>>()
+                , m_dev(dev)
+                , m_extentElements(extent::getExtentVecEnd<TDim>(extent))
+                , m_pMem(alpaka::malloc<TElem>(*this, static_cast<std::size_t>(computeElementCount(extent))))
+                , m_pitchBytes(static_cast<TIdx>(extent::getWidth(extent) * static_cast<TIdx>(sizeof(TElem))))
+#if(defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && BOOST_LANG_CUDA) || (defined(ALPAKA_ACC_GPU_HIP_ENABLED) && BOOST_LANG_HIP)
+                , m_bPinned(false)
 #endif
-                        {
-                            ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+            {
+                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
 
-                            static_assert(
-                                TDim::value == dim::Dim<TExtent>::value,
-                                "The dimensionality of TExtent and the dimensionality of the TDim template parameter have to be identical!");
-                            static_assert(
-                                std::is_same<TIdx, idx::Idx<TExtent>>::value,
-                                "The idx type of TExtent and the TIdx template parameter have to be identical!");
+                static_assert(
+                    TDim::value == Dim<TExtent>::value,
+                    "The dimensionality of TExtent and the dimensionality of the TDim template parameter have to be "
+                    "identical!");
+                static_assert(
+                    std::is_same<TIdx, Idx<TExtent>>::value,
+                    "The idx type of TExtent and the TIdx template parameter have to be identical!");
 
 #if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                            std::cout << __func__
-                                << " e: " << m_extentElements
-                                << " ptr: " << static_cast<void *>(m_pMem)
-                                << " pitch: " << m_pitchBytes
-                                << std::endl;
+                std::cout << __func__ << " e: " << m_extentElements << " ptr: " << static_cast<void*>(m_pMem)
+                          << " pitch: " << m_pitchBytes << std::endl;
 #endif
-                        }
-                        //-----------------------------------------------------------------------------
-                        BufCpuImpl(BufCpuImpl const &) = delete;
-                        //-----------------------------------------------------------------------------
-                        BufCpuImpl(BufCpuImpl &&) = default;
-                        //-----------------------------------------------------------------------------
-                        auto operator=(BufCpuImpl const &) -> BufCpuImpl & = delete;
-                        //-----------------------------------------------------------------------------
-                        auto operator=(BufCpuImpl &&) -> BufCpuImpl & = default;
-                        //-----------------------------------------------------------------------------
-                        ALPAKA_FN_HOST ~BufCpuImpl()
-                        {
-                            ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+            }
+            //-----------------------------------------------------------------------------
+            BufCpuImpl(BufCpuImpl const&) = delete;
+            //-----------------------------------------------------------------------------
+            BufCpuImpl(BufCpuImpl&&) = default;
+            //-----------------------------------------------------------------------------
+            auto operator=(BufCpuImpl const&) -> BufCpuImpl& = delete;
+            //-----------------------------------------------------------------------------
+            auto operator=(BufCpuImpl&&) -> BufCpuImpl& = default;
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST ~BufCpuImpl()
+            {
+                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
 
-#if (defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && BOOST_LANG_CUDA) || (defined(ALPAKA_ACC_GPU_HIP_ENABLED) && BOOST_LANG_HIP)
-                            // Unpin this memory if it is currently pinned.
-                            mem::buf::unpin(*this);
+#if(defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && BOOST_LANG_CUDA) || (defined(ALPAKA_ACC_GPU_HIP_ENABLED) && BOOST_LANG_HIP)
+                // Unpin this memory if it is currently pinned.
+                unpin(*this);
 #endif
-                            // NOTE: m_pMem is allowed to be a nullptr here.
-                            mem::alloc::free(*this, m_pMem);
-                        }
-
-                    private:
-                        //-----------------------------------------------------------------------------
-                        //! \return The number of elements to allocate.
-                        template<
-                            typename TExtent>
-                        ALPAKA_FN_HOST static auto computeElementCount(
-                            TExtent const & extent)
-                        -> TIdx
-                        {
-                            auto const extentElementCount(extent::getExtentProduct(extent));
+                // NOTE: m_pMem is allowed to be a nullptr here.
+                alpaka::free(*this, m_pMem);
+            }
 
-                            return extentElementCount;
-                        }
+        private:
+            //-----------------------------------------------------------------------------
+            //! \return The number of elements to allocate.
+            template<typename TExtent>
+            ALPAKA_FN_HOST static auto computeElementCount(TExtent const& extent) -> TIdx
+            {
+                auto const extentElementCount(extent::getExtentProduct(extent));
 
-                    public:
-                        dev::DevCpu const m_dev;
-                        vec::Vec<TDim, TIdx> const m_extentElements;
-                        TElem * const m_pMem;
-                        TIdx const m_pitchBytes;
-#if (defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && BOOST_LANG_CUDA) || (defined(ALPAKA_ACC_GPU_HIP_ENABLED) && BOOST_LANG_HIP)
-                        bool m_bPinned;
-#endif
-                    };
-                }
+                return extentElementCount;
             }
-            //#############################################################################
-            //! The CPU memory buffer.
-            template<
-                typename TElem,
-                typename TDim,
-                typename TIdx>
-            class BufCpu
-            {
-            public:
-                //-----------------------------------------------------------------------------
-                template<
-                    typename TExtent>
-                ALPAKA_FN_HOST BufCpu(
-                    dev::DevCpu const & dev,
-                    TExtent const & extent) :
-                        m_spBufCpuImpl(std::make_shared<cpu::detail::BufCpuImpl<TElem, TDim, TIdx>>(dev, extent))
-                {}
-                //-----------------------------------------------------------------------------
-                BufCpu(BufCpu const &) = default;
-                //-----------------------------------------------------------------------------
-                BufCpu(BufCpu &&) = default;
-                //-----------------------------------------------------------------------------
-                auto operator=(BufCpu const &) -> BufCpu & = default;
-                //-----------------------------------------------------------------------------
-                auto operator=(BufCpu &&) -> BufCpu & = default;
-                //-----------------------------------------------------------------------------
-                ~BufCpu() = default;
 
-            public:
-                std::shared_ptr<cpu::detail::BufCpuImpl<TElem, TDim, TIdx>> m_spBufCpuImpl;
-            };
-        }
-    }
+        public:
+            DevCpu const m_dev;
+            Vec<TDim, TIdx> const m_extentElements;
+            TElem* const m_pMem;
+            TIdx const m_pitchBytes;
+#if(defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && BOOST_LANG_CUDA) || (defined(ALPAKA_ACC_GPU_HIP_ENABLED) && BOOST_LANG_HIP)
+            bool m_bPinned;
+#endif
+        };
+    } // namespace detail
 
-    namespace dev
+    //#############################################################################
+    //! The CPU memory buffer.
+    template<typename TElem, typename TDim, typename TIdx>
+    class BufCpu
     {
-        namespace traits
+    public:
+        //-----------------------------------------------------------------------------
+        template<typename TExtent>
+        ALPAKA_FN_HOST BufCpu(DevCpu const& dev, TExtent const& extent)
+            : m_spBufCpuImpl(std::make_shared<detail::BufCpuImpl<TElem, TDim, TIdx>>(dev, extent))
         {
-            //#############################################################################
-            //! The BufCpu device type trait specialization.
-            template<
-                typename TElem,
-                typename TDim,
-                typename TIdx>
-            struct DevType<
-                mem::buf::BufCpu<TElem, TDim, TIdx>>
-            {
-                using type = dev::DevCpu;
-            };
-            //#############################################################################
-            //! The BufCpu device get trait specialization.
-            template<
-                typename TElem,
-                typename TDim,
-                typename TIdx>
-            struct GetDev<
-                mem::buf::BufCpu<TElem, TDim, TIdx>>
-            {
-                ALPAKA_FN_HOST static auto getDev(
-                    mem::buf::BufCpu<TElem, TDim, TIdx> const & buf)
-                -> dev::DevCpu
-                {
-                    return buf.m_spBufCpuImpl->m_dev;
-                }
-            };
         }
-    }
-    namespace dim
+        //-----------------------------------------------------------------------------
+        BufCpu(BufCpu const&) = default;
+        //-----------------------------------------------------------------------------
+        BufCpu(BufCpu&&) = default;
+        //-----------------------------------------------------------------------------
+        auto operator=(BufCpu const&) -> BufCpu& = default;
+        //-----------------------------------------------------------------------------
+        auto operator=(BufCpu&&) -> BufCpu& = default;
+        //-----------------------------------------------------------------------------
+        ~BufCpu() = default;
+
+    public:
+        std::shared_ptr<detail::BufCpuImpl<TElem, TDim, TIdx>> m_spBufCpuImpl;
+    };
+
+    namespace traits
     {
-        namespace traits
+        //#############################################################################
+        //! The BufCpu device type trait specialization.
+        template<typename TElem, typename TDim, typename TIdx>
+        struct DevType<BufCpu<TElem, TDim, TIdx>>
         {
-            //#############################################################################
-            //! The BufCpu dimension getter trait.
-            template<
-                typename TElem,
-                typename TDim,
-                typename TIdx>
-            struct DimType<
-                mem::buf::BufCpu<TElem, TDim, TIdx>>
-            {
-                using type = TDim;
-            };
-        }
-    }
-    namespace elem
-    {
-        namespace traits
+            using type = DevCpu;
+        };
+        //#############################################################################
+        //! The BufCpu device get trait specialization.
+        template<typename TElem, typename TDim, typename TIdx>
+        struct GetDev<BufCpu<TElem, TDim, TIdx>>
         {
-            //#############################################################################
-            //! The BufCpu memory element type get trait specialization.
-            template<
-                typename TElem,
-                typename TDim,
-                typename TIdx>
-            struct ElemType<
-                mem::buf::BufCpu<TElem, TDim, TIdx>>
+            ALPAKA_FN_HOST static auto getDev(BufCpu<TElem, TDim, TIdx> const& buf) -> DevCpu
             {
-                using type = TElem;
-            };
-        }
-    }
+                return buf.m_spBufCpuImpl->m_dev;
+            }
+        };
+
+        //#############################################################################
+        //! The BufCpu dimension getter trait.
+        template<typename TElem, typename TDim, typename TIdx>
+        struct DimType<BufCpu<TElem, TDim, TIdx>>
+        {
+            using type = TDim;
+        };
+
+        //#############################################################################
+        //! The BufCpu memory element type get trait specialization.
+        template<typename TElem, typename TDim, typename TIdx>
+        struct ElemType<BufCpu<TElem, TDim, TIdx>>
+        {
+            using type = TElem;
+        };
+    } // namespace traits
     namespace extent
     {
         namespace traits
         {
             //#############################################################################
             //! The BufCpu width get trait specialization.
-            template<
-                typename TIdxIntegralConst,
-                typename TElem,
-                typename TDim,
-                typename TIdx>
+            template<typename TIdxIntegralConst, typename TElem, typename TDim, typename TIdx>
             struct GetExtent<
                 TIdxIntegralConst,
-                mem::buf::BufCpu<TElem, TDim, TIdx>,
-                typename std::enable_if<(TDim::value > TIdxIntegralConst::value)>::type>
+                BufCpu<TElem, TDim, TIdx>,
+                std::enable_if_t<(TDim::value > TIdxIntegralConst::value)>>
             {
                 //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto getExtent(
-                    mem::buf::BufCpu<TElem, TDim, TIdx> const & extent)
-                -> TIdx
+                ALPAKA_FN_HOST static auto getExtent(BufCpu<TElem, TDim, TIdx> const& extent) -> TIdx
                 {
                     return extent.m_spBufCpuImpl->m_extentElements[TIdxIntegralConst::value];
                 }
             };
-        }
-    }
-    namespace mem
+        } // namespace traits
+    } // namespace extent
+    namespace traits
     {
-        namespace view
+        //#############################################################################
+        //! The BufCpu native pointer get trait specialization.
+        template<typename TElem, typename TDim, typename TIdx>
+        struct GetPtrNative<BufCpu<TElem, TDim, TIdx>>
+        {
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto getPtrNative(BufCpu<TElem, TDim, TIdx> const& buf) -> TElem const*
+            {
+                return buf.m_spBufCpuImpl->m_pMem;
+            }
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto getPtrNative(BufCpu<TElem, TDim, TIdx>& buf) -> TElem*
+            {
+                return buf.m_spBufCpuImpl->m_pMem;
+            }
+        };
+        //#############################################################################
+        //! The BufCpu pointer on device get trait specialization.
+        template<typename TElem, typename TDim, typename TIdx>
+        struct GetPtrDev<BufCpu<TElem, TDim, TIdx>, DevCpu>
         {
-            namespace traits
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto getPtrDev(BufCpu<TElem, TDim, TIdx> const& buf, DevCpu const& dev)
+                -> TElem const*
             {
-                //#############################################################################
-                //! The BufCpu native pointer get trait specialization.
-                template<
-                    typename TElem,
-                    typename TDim,
-                    typename TIdx>
-                struct GetPtrNative<
-                    mem::buf::BufCpu<TElem, TDim, TIdx>>
+                if(dev == getDev(buf))
                 {
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST static auto getPtrNative(
-                        mem::buf::BufCpu<TElem, TDim, TIdx> const & buf)
-                    -> TElem const *
-                    {
-                        return buf.m_spBufCpuImpl->m_pMem;
-                    }
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST static auto getPtrNative(
-                        mem::buf::BufCpu<TElem, TDim, TIdx> & buf)
-                    -> TElem *
-                    {
-                        return buf.m_spBufCpuImpl->m_pMem;
-                    }
-                };
-                //#############################################################################
-                //! The BufCpu pointer on device get trait specialization.
-                template<
-                    typename TElem,
-                    typename TDim,
-                    typename TIdx>
-                struct GetPtrDev<
-                    mem::buf::BufCpu<TElem, TDim, TIdx>,
-                    dev::DevCpu>
+                    return buf.m_spBufCpuImpl->m_pMem;
+                }
+                else
                 {
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST static auto getPtrDev(
-                        mem::buf::BufCpu<TElem, TDim, TIdx> const & buf,
-                        dev::DevCpu const & dev)
-                    -> TElem const *
-                    {
-                        if(dev == dev::getDev(buf))
-                        {
-                            return buf.m_spBufCpuImpl->m_pMem;
-                        }
-                        else
-                        {
-                            throw std::runtime_error("The buffer is not accessible from the given device!");
-                        }
-                    }
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST static auto getPtrDev(
-                        mem::buf::BufCpu<TElem, TDim, TIdx> & buf,
-                        dev::DevCpu const & dev)
-                    -> TElem *
-                    {
-                        if(dev == dev::getDev(buf))
-                        {
-                            return buf.m_spBufCpuImpl->m_pMem;
-                        }
-                        else
-                        {
-                            throw std::runtime_error("The buffer is not accessible from the given device!");
-                        }
-                    }
-                };
-                //#############################################################################
-                //! The BufCpu pitch get trait specialization.
-                template<
-                    typename TElem,
-                    typename TDim,
-                    typename TIdx>
-                struct GetPitchBytes<
-                    dim::DimInt<TDim::value - 1u>,
-                    mem::buf::BufCpu<TElem, TDim, TIdx>>
+                    throw std::runtime_error("The buffer is not accessible from the given device!");
+                }
+            }
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto getPtrDev(BufCpu<TElem, TDim, TIdx>& buf, DevCpu const& dev) -> TElem*
+            {
+                if(dev == getDev(buf))
                 {
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST static auto getPitchBytes(
-                        mem::buf::BufCpu<TElem, TDim, TIdx> const & pitch)
-                    -> TIdx
-                    {
-                        return pitch.m_spBufCpuImpl->m_pitchBytes;
-                    }
-                };
+                    return buf.m_spBufCpuImpl->m_pMem;
+                }
+                else
+                {
+                    throw std::runtime_error("The buffer is not accessible from the given device!");
+                }
             }
-        }
-        namespace buf
+        };
+        //#############################################################################
+        //! The BufCpu pitch get trait specialization.
+        template<typename TElem, typename TDim, typename TIdx>
+        struct GetPitchBytes<DimInt<TDim::value - 1u>, BufCpu<TElem, TDim, TIdx>>
         {
-            namespace traits
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto getPitchBytes(BufCpu<TElem, TDim, TIdx> const& pitch) -> TIdx
             {
-                //#############################################################################
-                //! The BufCpu memory allocation trait specialization.
-                template<
-                    typename TElem,
-                    typename TDim,
-                    typename TIdx>
-                struct Alloc<
-                    TElem,
-                    TDim,
-                    TIdx,
-                    dev::DevCpu>
-                {
-                    //-----------------------------------------------------------------------------
-                    template<
-                        typename TExtent>
-                    ALPAKA_FN_HOST static auto alloc(
-                        dev::DevCpu const & dev,
-                        TExtent const & extent)
-                    -> mem::buf::BufCpu<TElem, TDim, TIdx>
-                    {
-                        ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+                return pitch.m_spBufCpuImpl->m_pitchBytes;
+            }
+        };
 
-                        return mem::buf::BufCpu<
-                            TElem,
-                            TDim,
-                            TIdx>(
-                                dev,
-                                extent);
-                    }
-                };
-                //#############################################################################
-                //! The BufCpu memory mapping trait specialization.
-                template<
-                    typename TElem,
-                    typename TDim,
-                    typename TIdx>
-                struct Map<
-                    mem::buf::BufCpu<TElem, TDim, TIdx>,
-                    dev::DevCpu>
+        //#############################################################################
+        //! The BufCpu memory allocation trait specialization.
+        template<typename TElem, typename TDim, typename TIdx>
+        struct BufAlloc<TElem, TDim, TIdx, DevCpu>
+        {
+            //-----------------------------------------------------------------------------
+            template<typename TExtent>
+            ALPAKA_FN_HOST static auto allocBuf(DevCpu const& dev, TExtent const& extent) -> BufCpu<TElem, TDim, TIdx>
+            {
+                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+                return BufCpu<TElem, TDim, TIdx>(dev, extent);
+            }
+        };
+        //#############################################################################
+        //! The BufCpu memory mapping trait specialization.
+        template<typename TElem, typename TDim, typename TIdx>
+        struct Map<BufCpu<TElem, TDim, TIdx>, DevCpu>
+        {
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto map(BufCpu<TElem, TDim, TIdx>& buf, DevCpu const& dev) -> void
+            {
+                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+                if(getDev(buf) != dev)
                 {
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST static auto map(
-                        mem::buf::BufCpu<TElem, TDim, TIdx> & buf,
-                        dev::DevCpu const & dev)
-                    -> void
-                    {
-                        ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+                    throw std::runtime_error("Memory mapping of BufCpu between two devices is not implemented!");
+                }
+                // If it is the same device, nothing has to be mapped.
+            }
+        };
+        //#############################################################################
+        //! The BufCpu memory unmapping trait specialization.
+        template<typename TElem, typename TDim, typename TIdx>
+        struct Unmap<BufCpu<TElem, TDim, TIdx>, DevCpu>
+        {
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto unmap(BufCpu<TElem, TDim, TIdx>& buf, DevCpu const& dev) -> void
+            {
+                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
 
-                        if(dev::getDev(buf) != dev)
-                        {
-                            throw std::runtime_error("Memory mapping of BufCpu between two devices is not implemented!");
-                        }
-                        // If it is the same device, nothing has to be mapped.
-                    }
-                };
-                //#############################################################################
-                //! The BufCpu memory unmapping trait specialization.
-                template<
-                    typename TElem,
-                    typename TDim,
-                    typename TIdx>
-                struct Unmap<
-                    mem::buf::BufCpu<TElem, TDim, TIdx>,
-                    dev::DevCpu>
+                if(getDev(buf) != dev)
                 {
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST static auto unmap(
-                        mem::buf::BufCpu<TElem, TDim, TIdx> & buf,
-                        dev::DevCpu const & dev)
-                    -> void
-                    {
-                        ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+                    throw std::runtime_error("Memory unmapping of BufCpu between two devices is not implemented!");
+                }
+                // If it is the same device, nothing has to be unmapped.
+            }
+        };
+        //#############################################################################
+        //! The BufCpu memory pinning trait specialization.
+        template<typename TElem, typename TDim, typename TIdx>
+        struct Pin<BufCpu<TElem, TDim, TIdx>>
+        {
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto pin(BufCpu<TElem, TDim, TIdx>& buf) -> void
+            {
+                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
 
-                        if(dev::getDev(buf) != dev)
-                        {
-                            throw std::runtime_error("Memory unmapping of BufCpu between two devices is not implemented!");
-                        }
-                        // If it is the same device, nothing has to be mapped.
-                    }
-                };
-                //#############################################################################
-                //! The BufCpu memory pinning trait specialization.
-                template<
-                    typename TElem,
-                    typename TDim,
-                    typename TIdx>
-                struct Pin<
-                    mem::buf::BufCpu<TElem, TDim, TIdx>>
+                if(!isPinned(buf))
                 {
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST static auto pin(
-                        mem::buf::BufCpu<TElem, TDim, TIdx> & buf)
-                    -> void
+#if(defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && BOOST_LANG_CUDA) || (defined(ALPAKA_ACC_GPU_HIP_ENABLED) && BOOST_LANG_HIP)
+                    if(buf.m_spBufCpuImpl->m_extentElements.prod() != 0)
                     {
-                        ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                        if(!mem::buf::isPinned(buf))
-                        {
-#if (defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && BOOST_LANG_CUDA)
-                            if(buf.m_spBufCpuImpl->m_extentElements.prod() != 0)
-                            {
-                                // - cudaHostRegisterDefault:
-                                //   See http://cgi.cs.indiana.edu/~nhusted/dokuwiki/doku.php?id=programming:cudaperformance1
-                                // - cudaHostRegisterPortable:
-                                //   The memory returned by this call will be considered as pinned memory by all CUDA contexts, not just the one that performed the allocation.
-                                ALPAKA_CUDA_RT_CHECK_IGNORE(
-                                    cudaHostRegister(
-                                        const_cast<void *>(reinterpret_cast<void const *>(mem::view::getPtrNative(buf))),
-                                        extent::getExtentProduct(buf) * sizeof(elem::Elem<buf::BufCpu<TElem, TDim, TIdx>>),
-                                        cudaHostRegisterDefault),
-                                    cudaErrorHostMemoryAlreadyRegistered);
+                        // - cudaHostRegisterDefault:
+                        //   See http://cgi.cs.indiana.edu/~nhusted/dokuwiki/doku.php?id=programming:cudaperformance1
+                        // - cudaHostRegisterPortable:
+                        //   The memory returned by this call will be considered as pinned memory by all CUDA contexts,
+                        //   not just the one that performed the allocation.
+                        ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK_IGNORE(
+                            ALPAKA_API_PREFIX(HostRegister)(
+                                const_cast<void*>(reinterpret_cast<void const*>(getPtrNative(buf))),
+                                extent::getExtentProduct(buf) * sizeof(Elem<BufCpu<TElem, TDim, TIdx>>),
+                                ALPAKA_API_PREFIX(HostRegisterDefault)),
+                            ALPAKA_API_PREFIX(ErrorHostMemoryAlreadyRegistered));
 
-                                buf.m_spBufCpuImpl->m_bPinned = true;
-                            }
-#elif (defined(ALPAKA_ACC_GPU_HIP_ENABLED) && BOOST_LANG_HIP)
-                            if(buf.m_spBufCpuImpl->m_extentElements.prod() != 0)
-                            {
-                                ALPAKA_HIP_RT_CHECK_IGNORE(
-                                    hipHostRegister(
-                                        const_cast<void *>(reinterpret_cast<void const *>(mem::view::getPtrNative(buf))),
-                                        extent::getExtentProduct(buf) * sizeof(elem::Elem<buf::BufCpu<TElem, TDim, TIdx>>),
-                                        hipHostRegisterDefault),
-                                    hipErrorHostMemoryAlreadyRegistered);
-
-                                buf.m_spBufCpuImpl->m_bPinned = true;
-                            }
+                        buf.m_spBufCpuImpl->m_bPinned = true;
+                    }
 #else
-                            static_assert(
-                                meta::DependentFalseType<TElem>::value,
-                                "Memory pinning of BufCpu is not implemented when CUDA or HIP is not enabled!");
+                    static_assert(
+                        meta::DependentFalseType<TElem>::value,
+                        "Memory pinning of BufCpu is not implemented when CUDA or HIP is not enabled!");
 #endif
-                        }
-                    }
-                };
-                //#############################################################################
-                //! The BufCpu memory unpinning trait specialization.
-                template<
-                    typename TElem,
-                    typename TDim,
-                    typename TIdx>
-                struct Unpin<
-                    mem::buf::BufCpu<TElem, TDim, TIdx>>
-                {
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST static auto unpin(
-                        mem::buf::BufCpu<TElem, TDim, TIdx> & buf)
-                    -> void
-                    {
-                        mem::buf::unpin(*buf.m_spBufCpuImpl.get());
-                    }
-                };
-                //#############################################################################
-                //! The BufCpuImpl memory unpinning trait specialization.
-                template<
-                    typename TElem,
-                    typename TDim,
-                    typename TIdx>
-                struct Unpin<
-                    mem::buf::cpu::detail::BufCpuImpl<TElem, TDim, TIdx>>
-                {
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST static auto unpin(
-                        mem::buf::cpu::detail::BufCpuImpl<TElem, TDim, TIdx> & bufImpl)
-                    -> void
-                    {
-                        ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                        if(mem::buf::isPinned(bufImpl))
-                        {
-#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && BOOST_LANG_CUDA
-                            ALPAKA_CUDA_RT_CHECK_IGNORE(
-                                cudaHostUnregister(
-                                    const_cast<void *>(reinterpret_cast<void const *>(bufImpl.m_pMem))),
-                                cudaErrorHostMemoryNotRegistered);
+                }
+            }
+        };
+        //#############################################################################
+        //! The BufCpu memory unpinning trait specialization.
+        template<typename TElem, typename TDim, typename TIdx>
+        struct Unpin<BufCpu<TElem, TDim, TIdx>>
+        {
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto unpin(BufCpu<TElem, TDim, TIdx>& buf) -> void
+            {
+                alpaka::unpin(*buf.m_spBufCpuImpl.get());
+            }
+        };
+        //#############################################################################
+        //! The BufCpuImpl memory unpinning trait specialization.
+        template<typename TElem, typename TDim, typename TIdx>
+        struct Unpin<alpaka::detail::BufCpuImpl<TElem, TDim, TIdx>>
+        {
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto unpin(alpaka::detail::BufCpuImpl<TElem, TDim, TIdx>& bufImpl) -> void
+            {
+                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
 
-                            bufImpl.m_bPinned = false;
-#elif (defined(ALPAKA_ACC_GPU_HIP_ENABLED) && BOOST_LANG_HIP)
-                            ALPAKA_HIP_RT_CHECK_IGNORE(
-                                hipHostUnregister(
-                                    const_cast<void *>(reinterpret_cast<void const *>(bufImpl.m_pMem))),
-                                hipErrorHostMemoryNotRegistered);
+                if(isPinned(bufImpl))
+                {
+#if(defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && BOOST_LANG_CUDA) || (defined(ALPAKA_ACC_GPU_HIP_ENABLED) && BOOST_LANG_HIP)
+                    ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK_IGNORE(
+                        ALPAKA_API_PREFIX(HostUnregister)(
+                            const_cast<void*>(reinterpret_cast<void const*>(bufImpl.m_pMem))),
+                        ALPAKA_API_PREFIX(ErrorHostMemoryNotRegistered));
 
-                            bufImpl.m_bPinned = false;
+                    bufImpl.m_bPinned = false;
 #else
-                            static_assert(
-                                meta::DependentFalseType<TElem>::value,
-                                "Memory unpinning of BufCpu is not implemented when CUDA or HIP is not enabled!");
+                    static_assert(
+                        meta::DependentFalseType<TElem>::value,
+                        "Memory unpinning of BufCpu is not implemented when CUDA or HIP is not enabled!");
 #endif
-                        }
-                    }
-                };
-                //#############################################################################
-                //! The BufCpu memory pin state trait specialization.
-                template<
-                    typename TElem,
-                    typename TDim,
-                    typename TIdx>
-                struct IsPinned<
-                    mem::buf::BufCpu<TElem, TDim, TIdx>>
-                {
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST static auto isPinned(
-                        mem::buf::BufCpu<TElem, TDim, TIdx> const & buf)
-                    -> bool
-                    {
-                        return mem::buf::isPinned(*buf.m_spBufCpuImpl.get());
-                    }
-                };
-                //#############################################################################
-                //! The BufCpuImpl memory pin state trait specialization.
-                template<
-                    typename TElem,
-                    typename TDim,
-                    typename TIdx>
-                struct IsPinned<
-                    mem::buf::cpu::detail::BufCpuImpl<TElem, TDim, TIdx>>
-                {
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST static auto isPinned(
-                        mem::buf::cpu::detail::BufCpuImpl<TElem, TDim, TIdx> const & bufImpl)
-                    -> bool
-                    {
-                        ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+                }
+            }
+        };
+        //#############################################################################
+        //! The BufCpu memory pin state trait specialization.
+        template<typename TElem, typename TDim, typename TIdx>
+        struct IsPinned<BufCpu<TElem, TDim, TIdx>>
+        {
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto isPinned(BufCpu<TElem, TDim, TIdx> const& buf) -> bool
+            {
+                return alpaka::isPinned(*buf.m_spBufCpuImpl.get());
+            }
+        };
+        //#############################################################################
+        //! The BufCpuImpl memory pin state trait specialization.
+        template<typename TElem, typename TDim, typename TIdx>
+        struct IsPinned<alpaka::detail::BufCpuImpl<TElem, TDim, TIdx>>
+        {
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto isPinned(alpaka::detail::BufCpuImpl<TElem, TDim, TIdx> const& bufImpl) -> bool
+            {
+                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
 
-#if (defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && BOOST_LANG_CUDA) || (defined(ALPAKA_ACC_GPU_HIP_ENABLED) && BOOST_LANG_HIP)
-                        return bufImpl.m_bPinned;
+#if(defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && BOOST_LANG_CUDA) || (defined(ALPAKA_ACC_GPU_HIP_ENABLED) && BOOST_LANG_HIP)
+                return bufImpl.m_bPinned;
 #else
-                        alpaka::ignore_unused(bufImpl);
-                        return false;
+                alpaka::ignore_unused(bufImpl);
+                return false;
 #endif
-                    }
-                };
-                //#############################################################################
-                //! The BufCpu memory prepareForAsyncCopy trait specialization.
-                template<
-                    typename TElem,
-                    typename TDim,
-                    typename TIdx>
-                struct PrepareForAsyncCopy<
-                    mem::buf::BufCpu<TElem, TDim, TIdx>>
-                {
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST static auto prepareForAsyncCopy(
-                        mem::buf::BufCpu<TElem, TDim, TIdx> & buf)
-                    -> void
-                    {
-                        ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-                        // to optimize the data transfer performance between a cuda/hip device the cpu buffer has to be pinned,
-                        // for exclusive cpu use, no preparing is needed
-#if (defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && BOOST_LANG_CUDA) || (defined(ALPAKA_ACC_GPU_HIP_ENABLED) && BOOST_LANG_HIP)
-                        pin( buf );
+            }
+        };
+        //#############################################################################
+        //! The BufCpu memory prepareForAsyncCopy trait specialization.
+        template<typename TElem, typename TDim, typename TIdx>
+        struct PrepareForAsyncCopy<BufCpu<TElem, TDim, TIdx>>
+        {
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto prepareForAsyncCopy(BufCpu<TElem, TDim, TIdx>& buf) -> void
+            {
+                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+                // To optimize the data transfer performance to/from a CUDA/HIP device, the CPU buffer has to be
+                // pinned; for exclusive CPU use, no preparation is needed.
+#if(defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && BOOST_LANG_CUDA) || (defined(ALPAKA_ACC_GPU_HIP_ENABLED) && BOOST_LANG_HIP)
+                pin(buf);
 #else
-                        alpaka::ignore_unused( buf );
+                alpaka::ignore_unused(buf);
 #endif
-                    }
-                };
             }
-        }
-    }
-    namespace offset
-    {
-        namespace traits
+        };
+
+        //#############################################################################
+        //! The BufCpu offset get trait specialization.
+        template<typename TIdxIntegralConst, typename TElem, typename TDim, typename TIdx>
+        struct GetOffset<TIdxIntegralConst, BufCpu<TElem, TDim, TIdx>>
         {
-            //#############################################################################
-            //! The BufCpu offset get trait specialization.
-            template<
-                typename TIdxIntegralConst,
-                typename TElem,
-                typename TDim,
-                typename TIdx>
-            struct GetOffset<
-                TIdxIntegralConst,
-                mem::buf::BufCpu<TElem, TDim, TIdx>>
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto getOffset(BufCpu<TElem, TDim, TIdx> const&) -> TIdx
             {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto getOffset(
-                    mem::buf::BufCpu<TElem, TDim, TIdx> const &)
-                -> TIdx
-                {
-                    return 0u;
-                }
-            };
-        }
-    }
-    namespace idx
-    {
-        namespace traits
+                return 0u;
+            }
+        };
+
+        //#############################################################################
+        //! The BufCpu idx type trait specialization.
+        template<typename TElem, typename TDim, typename TIdx>
+        struct IdxType<BufCpu<TElem, TDim, TIdx>>
         {
-            //#############################################################################
-            //! The BufCpu idx type trait specialization.
-            template<
-                typename TElem,
-                typename TDim,
-                typename TIdx>
-            struct IdxType<
-                mem::buf::BufCpu<TElem, TDim, TIdx>>
-            {
-                using type = TIdx;
-            };
-        }
-    }
-}
+            using type = TIdx;
+        };
+    } // namespace traits
+} // namespace alpaka
 
 #include <alpaka/mem/buf/cpu/Copy.hpp>
 #include <alpaka/mem/buf/cpu/Set.hpp>
diff --git a/thirdParty/cupla/alpaka/include/alpaka/mem/buf/BufCudaRt.hpp b/thirdParty/cupla/alpaka/include/alpaka/mem/buf/BufCudaRt.hpp
deleted file mode 100644
index 8f14dc6e5c..0000000000
--- a/thirdParty/cupla/alpaka/include/alpaka/mem/buf/BufCudaRt.hpp
+++ /dev/null
@@ -1,778 +0,0 @@
-/* Copyright 2019 Alexander Matthes, Benjamin Worpitz, Matthias Werner, René Widera
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_CUDA
-    #error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
-#endif
-
-#include <alpaka/core/Assert.hpp>
-#include <alpaka/core/Cuda.hpp>
-#include <alpaka/dev/DevCudaRt.hpp>
-#include <alpaka/vec/Vec.hpp>
-
-#include <alpaka/dev/Traits.hpp>
-#include <alpaka/dim/DimIntegralConst.hpp>
-#include <alpaka/mem/buf/Traits.hpp>
-
-#include <memory>
-
-namespace alpaka
-{
-    namespace dev
-    {
-        class DevCudaRt;
-    }
-    namespace mem
-    {
-        namespace buf
-        {
-            template<
-                typename TElem,
-                typename TDim,
-                typename TIdx>
-            class BufCpu;
-        }
-    }
-    namespace mem
-    {
-        namespace buf
-        {
-            //#############################################################################
-            //! The CUDA memory buffer.
-            template<
-                typename TElem,
-                typename TDim,
-                typename TIdx>
-            class BufCudaRt
-            {
-                static_assert(
-                    !std::is_const<TElem>::value,
-                    "The elem type of the buffer can not be const because the C++ Standard forbids containers of const elements!");
-                static_assert(
-                    !std::is_const<TIdx>::value,
-                    "The idx type of the buffer can not be const!");
-            private:
-                using Elem = TElem;
-                using Dim = TDim;
-
-            public:
-                //-----------------------------------------------------------------------------
-                //! Constructor
-                template<
-                    typename TExtent>
-                ALPAKA_FN_HOST BufCudaRt(
-                    dev::DevCudaRt const & dev,
-                    TElem * const pMem,
-                    TIdx const & pitchBytes,
-                    TExtent const & extent) :
-                        m_dev(dev),
-                        m_extentElements(extent::getExtentVecEnd<TDim>(extent)),
-                        m_spMem(
-                            pMem,
-                            // NOTE: Because the BufCudaRt object can be copied and the original object could have been destroyed,
-                            // a std::ref(m_dev) or a this pointer can not be bound to the callback because they are not always valid at time of destruction.
-                            std::bind(&BufCudaRt::freeBuffer, std::placeholders::_1, m_dev)),
-                        m_pitchBytes(pitchBytes)
-                {
-                    ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                    static_assert(
-                        TDim::value == dim::Dim<TExtent>::value,
-                        "The dimensionality of TExtent and the dimensionality of the TDim template parameter have to be identical!");
-                    static_assert(
-                        std::is_same<TIdx, idx::Idx<TExtent>>::value,
-                        "The idx type of TExtent and the TIdx template parameter have to be identical!");
-                }
-
-            private:
-                //-----------------------------------------------------------------------------
-                //! Frees the shared buffer.
-                ALPAKA_FN_HOST static auto freeBuffer(
-                    TElem * const memPtr,
-                    dev::DevCudaRt const & dev)
-                -> void
-                {
-                    ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                    // Set the current device.
-                    ALPAKA_CUDA_RT_CHECK(
-                        cudaSetDevice(
-                            dev.m_iDevice));
-                    // Free the buffer.
-                    ALPAKA_CUDA_RT_CHECK(
-                      cudaFree(reinterpret_cast<void *>(memPtr)));
-                }
-
-            public:
-                dev::DevCudaRt m_dev;               // NOTE: The device has to be destructed after the memory pointer because it is required for destruction.
-                vec::Vec<TDim, TIdx> m_extentElements;
-                std::shared_ptr<TElem> m_spMem;
-                TIdx m_pitchBytes;
-            };
-        }
-    }
-
-    namespace dev
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The BufCudaRt device type trait specialization.
-            template<
-                typename TElem,
-                typename TDim,
-                typename TIdx>
-            struct DevType<
-                mem::buf::BufCudaRt<TElem, TDim, TIdx>>
-            {
-                using type = dev::DevCudaRt;
-            };
-            //#############################################################################
-            //! The BufCudaRt device get trait specialization.
-            template<
-                typename TElem,
-                typename TDim,
-                typename TIdx>
-            struct GetDev<
-                mem::buf::BufCudaRt<TElem, TDim, TIdx>>
-            {
-                ALPAKA_FN_HOST static auto getDev(
-                    mem::buf::BufCudaRt<TElem, TDim, TIdx> const & buf)
-                -> dev::DevCudaRt
-                {
-                    return buf.m_dev;
-                }
-            };
-        }
-    }
-    namespace dim
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The BufCudaRt dimension getter trait specialization.
-            template<
-                typename TElem,
-                typename TDim,
-                typename TIdx>
-            struct DimType<
-                mem::buf::BufCudaRt<TElem, TDim, TIdx>>
-            {
-                using type = TDim;
-            };
-        }
-    }
-    namespace elem
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The BufCudaRt memory element type get trait specialization.
-            template<
-                typename TElem,
-                typename TDim,
-                typename TIdx>
-            struct ElemType<
-                mem::buf::BufCudaRt<TElem, TDim, TIdx>>
-            {
-                using type = TElem;
-            };
-        }
-    }
-    namespace extent
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The BufCudaRt extent get trait specialization.
-            template<
-                typename TIdxIntegralConst,
-                typename TElem,
-                typename TDim,
-                typename TIdx>
-            struct GetExtent<
-                TIdxIntegralConst,
-                mem::buf::BufCudaRt<TElem, TDim, TIdx>,
-                typename std::enable_if<(TDim::value > TIdxIntegralConst::value)>::type>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto getExtent(
-                    mem::buf::BufCudaRt<TElem, TDim, TIdx> const & extent)
-                -> TIdx
-                {
-                    return extent.m_extentElements[TIdxIntegralConst::value];
-                }
-            };
-        }
-    }
-    namespace mem
-    {
-        namespace view
-        {
-            namespace traits
-            {
-                //#############################################################################
-                //! The BufCudaRt native pointer get trait specialization.
-                template<
-                    typename TElem,
-                    typename TDim,
-                    typename TIdx>
-                struct GetPtrNative<
-                    mem::buf::BufCudaRt<TElem, TDim, TIdx>>
-                {
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST static auto getPtrNative(
-                        mem::buf::BufCudaRt<TElem, TDim, TIdx> const & buf)
-                    -> TElem const *
-                    {
-                        return buf.m_spMem.get();
-                    }
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST static auto getPtrNative(
-                        mem::buf::BufCudaRt<TElem, TDim, TIdx> & buf)
-                    -> TElem *
-                    {
-                        return buf.m_spMem.get();
-                    }
-                };
-                //#############################################################################
-                //! The BufCudaRt pointer on device get trait specialization.
-                template<
-                    typename TElem,
-                    typename TDim,
-                    typename TIdx>
-                struct GetPtrDev<
-                    mem::buf::BufCudaRt<TElem, TDim, TIdx>,
-                    dev::DevCudaRt>
-                {
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST static auto getPtrDev(
-                        mem::buf::BufCudaRt<TElem, TDim, TIdx> const & buf,
-                        dev::DevCudaRt const & dev)
-                    -> TElem const *
-                    {
-                        if(dev == dev::getDev(buf))
-                        {
-                            return buf.m_spMem.get();
-                        }
-                        else
-                        {
-                            throw std::runtime_error("The buffer is not accessible from the given device!");
-                        }
-                    }
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST static auto getPtrDev(
-                        mem::buf::BufCudaRt<TElem, TDim, TIdx> & buf,
-                        dev::DevCudaRt const & dev)
-                    -> TElem *
-                    {
-                        if(dev == dev::getDev(buf))
-                        {
-                            return buf.m_spMem.get();
-                        }
-                        else
-                        {
-                            throw std::runtime_error("The buffer is not accessible from the given device!");
-                        }
-                    }
-                };
-                //#############################################################################
-                //! The BufCudaRt pitch get trait specialization.
-                template<
-                    typename TElem,
-                    typename TDim,
-                    typename TIdx>
-                struct GetPitchBytes<
-                    dim::DimInt<TDim::value - 1u>,
-                    mem::buf::BufCudaRt<TElem, TDim, TIdx>>
-                {
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST static auto getPitchBytes(
-                        mem::buf::BufCudaRt<TElem, TDim, TIdx> const & buf)
-                    -> TIdx
-                    {
-                        return buf.m_pitchBytes;
-                    }
-                };
-            }
-        }
-        namespace buf
-        {
-            namespace traits
-            {
-                //#############################################################################
-                //! The CUDA 1D memory allocation trait specialization.
-                template<
-                    typename TElem,
-                    typename TIdx>
-                struct Alloc<
-                    TElem,
-                    dim::DimInt<1u>,
-                    TIdx,
-                    dev::DevCudaRt>
-                {
-                    //-----------------------------------------------------------------------------
-                    template<
-                        typename TExtent>
-                    ALPAKA_FN_HOST static auto alloc(
-                        dev::DevCudaRt const & dev,
-                        TExtent const & extent)
-                    -> mem::buf::BufCudaRt<TElem, dim::DimInt<1u>, TIdx>
-                    {
-                        ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                        auto const width(extent::getWidth(extent));
-                        auto const widthBytes(width * static_cast<TIdx>(sizeof(TElem)));
-
-                        // Set the current device.
-                        ALPAKA_CUDA_RT_CHECK(
-                            cudaSetDevice(
-                                dev.m_iDevice));
-                        // Allocate the buffer on this device.
-                        void * memPtr;
-                        ALPAKA_CUDA_RT_CHECK(
-                            cudaMalloc(
-                                &memPtr,
-                                static_cast<std::size_t>(widthBytes)));
-
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                        std::cout << __func__
-                            << " ew: " << width
-                            << " ewb: " << widthBytes
-                            << " ptr: " << memPtr
-                            << std::endl;
-#endif
-                        return
-                            mem::buf::BufCudaRt<TElem, dim::DimInt<1u>, TIdx>(
-                                dev,
-                                reinterpret_cast<TElem *>(memPtr),
-                                static_cast<TIdx>(widthBytes),
-                                extent);
-                    }
-                };
-                //#############################################################################
-                //! The CUDA 2D memory allocation trait specialization.
-                template<
-                    typename TElem,
-                    typename TIdx>
-                struct Alloc<
-                    TElem,
-                    dim::DimInt<2u>,
-                    TIdx,
-                    dev::DevCudaRt>
-                {
-                    //-----------------------------------------------------------------------------
-                    template<
-                        typename TExtent>
-                    ALPAKA_FN_HOST static auto alloc(
-                        dev::DevCudaRt const & dev,
-                        TExtent const & extent)
-                    -> mem::buf::BufCudaRt<TElem, dim::DimInt<2u>, TIdx>
-                    {
-                        ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                        auto const width(extent::getWidth(extent));
-                        auto const widthBytes(width * static_cast<TIdx>(sizeof(TElem)));
-                        auto const height(extent::getHeight(extent));
-
-                        // Set the current device.
-                        ALPAKA_CUDA_RT_CHECK(
-                            cudaSetDevice(
-                                dev.m_iDevice));
-                        // Allocate the buffer on this device.
-                        void * memPtr;
-                        std::size_t pitchBytes;
-                        ALPAKA_CUDA_RT_CHECK(
-                            cudaMallocPitch(
-                                &memPtr,
-                                &pitchBytes,
-                                static_cast<std::size_t>(widthBytes),
-                                static_cast<std::size_t>(height)));
-                        ALPAKA_ASSERT(pitchBytes >= static_cast<std::size_t>(widthBytes) || (width * height) == 0);
-
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                        std::cout << __func__
-                            << " ew: " << width
-                            << " eh: " << height
-                            << " ewb: " << widthBytes
-                            << " ptr: " << memPtr
-                            << " pitch: " << pitchBytes
-                            << std::endl;
-#endif
-                        return
-                            mem::buf::BufCudaRt<TElem, dim::DimInt<2u>, TIdx>(
-                                dev,
-                                reinterpret_cast<TElem *>(memPtr),
-                                static_cast<TIdx>(pitchBytes),
-                                extent);
-                    }
-                };
-                //#############################################################################
-                //! The CUDA 3D memory allocation trait specialization.
-                template<
-                    typename TElem,
-                    typename TIdx>
-                struct Alloc<
-                    TElem,
-                    dim::DimInt<3u>,
-                    TIdx,
-                    dev::DevCudaRt>
-                {
-                    //-----------------------------------------------------------------------------
-                    template<
-                        typename TExtent>
-                    ALPAKA_FN_HOST static auto alloc(
-                        dev::DevCudaRt const & dev,
-                        TExtent const & extent)
-                    -> mem::buf::BufCudaRt<TElem, dim::DimInt<3u>, TIdx>
-                    {
-                        ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                        cudaExtent const cudaExtentVal(
-                            make_cudaExtent(
-                                static_cast<std::size_t>(extent::getWidth(extent) * static_cast<TIdx>(sizeof(TElem))),
-                                static_cast<std::size_t>(extent::getHeight(extent)),
-                                static_cast<std::size_t>(extent::getDepth(extent))));
-
-                        // Set the current device.
-                        ALPAKA_CUDA_RT_CHECK(
-                            cudaSetDevice(
-                                dev.m_iDevice));
-                        // Allocate the buffer on this device.
-                        cudaPitchedPtr cudaPitchedPtrVal;
-                        ALPAKA_CUDA_RT_CHECK(
-                            cudaMalloc3D(
-                                &cudaPitchedPtrVal,
-                                cudaExtentVal));
-
-
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                        std::cout << __func__
-                            << " ew: " << extent::getWidth(extent)
-                            << " eh: " << cudaExtentVal.height
-                            << " ed: " << cudaExtentVal.depth
-                            << " ewb: " << cudaExtentVal.width
-                            << " ptr: " << cudaPitchedPtrVal.ptr
-                            << " pitch: " << cudaPitchedPtrVal.pitch
-                            << " wb: " << cudaPitchedPtrVal.xsize
-                            << " h: " << cudaPitchedPtrVal.ysize
-                            << std::endl;
-#endif
-                        return
-                            mem::buf::BufCudaRt<TElem, dim::DimInt<3u>, TIdx>(
-                                dev,
-                                reinterpret_cast<TElem *>(cudaPitchedPtrVal.ptr),
-                                static_cast<TIdx>(cudaPitchedPtrVal.pitch),
-                                extent);
-                    }
-                };
-                //#############################################################################
-                //! The BufCudaRt CUDA device memory mapping trait specialization.
-                template<
-                    typename TElem,
-                    typename TDim,
-                    typename TIdx>
-                struct Map<
-                    mem::buf::BufCudaRt<TElem, TDim, TIdx>,
-                    dev::DevCudaRt>
-                {
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST static auto map(
-                        mem::buf::BufCudaRt<TElem, TDim, TIdx> const & buf,
-                        dev::DevCudaRt const & dev)
-                    -> void
-                    {
-                        ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                        if(dev::getDev(buf) != dev)
-                        {
-                            throw std::runtime_error("Mapping memory from one CUDA device into an other CUDA device not implemented!");
-                        }
-                        // If it is already the same device, nothing has to be mapped.
-                    }
-                };
-                //#############################################################################
-                //! The BufCudaRt CUDA device memory unmapping trait specialization.
-                template<
-                    typename TElem,
-                    typename TDim,
-                    typename TIdx>
-                struct Unmap<
-                    mem::buf::BufCudaRt<TElem, TDim, TIdx>,
-                    dev::DevCudaRt>
-                {
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST static auto unmap(
-                        mem::buf::BufCudaRt<TElem, TDim, TIdx> const & buf,
-                        dev::DevCudaRt const & dev)
-                    -> void
-                    {
-                        ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                        if(dev::getDev(buf) != dev)
-                        {
-                            throw std::runtime_error("Unmapping memory mapped from one CUDA device into an other CUDA device not implemented!");
-                        }
-                        // If it is already the same device, nothing has to be unmapped.
-                    }
-                };
-                //#############################################################################
-                //! The BufCudaRt memory pinning trait specialization.
-                template<
-                    typename TElem,
-                    typename TDim,
-                    typename TIdx>
-                struct Pin<
-                    mem::buf::BufCudaRt<TElem, TDim, TIdx>>
-                {
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST static auto pin(
-                        mem::buf::BufCudaRt<TElem, TDim, TIdx> &)
-                    -> void
-                    {
-                        ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                        // CUDA device memory is always pinned, it can not be swapped out.
-                    }
-                };
-                //#############################################################################
-                //! The BufCudaRt memory unpinning trait specialization.
-                template<
-                    typename TElem,
-                    typename TDim,
-                    typename TIdx>
-                struct Unpin<
-                    mem::buf::BufCudaRt<TElem, TDim, TIdx>>
-                {
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST static auto unpin(
-                        mem::buf::BufCudaRt<TElem, TDim, TIdx> &)
-                    -> void
-                    {
-                        ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                        // CUDA device memory is always pinned, it can not be swapped out.
-                    }
-                };
-                //#############################################################################
-                //! The BufCudaRt memory pin state trait specialization.
-                template<
-                    typename TElem,
-                    typename TDim,
-                    typename TIdx>
-                struct IsPinned<
-                    mem::buf::BufCudaRt<TElem, TDim, TIdx>>
-                {
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST static auto isPinned(
-                        mem::buf::BufCudaRt<TElem, TDim, TIdx> const &)
-                    -> bool
-                    {
-                        ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                        // CUDA device memory is always pinned, it can not be swapped out.
-                        return true;
-                    }
-                };
-                //#############################################################################
-                //! The BufCudaRt memory prepareForAsyncCopy trait specialization.
-                template<
-                    typename TElem,
-                    typename TDim,
-                    typename TIdx>
-                struct PrepareForAsyncCopy<
-                    mem::buf::BufCudaRt<TElem, TDim, TIdx>>
-                {
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST static auto prepareForAsyncCopy(
-                        mem::buf::BufCudaRt<TElem, TDim, TIdx> &)
-                    -> void
-                    {
-                        ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                        // CUDA device memory is always ready for async copy
-                    }
-                };
-            }
-        }
-    }
-    namespace offset
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The BufCudaRt offset get trait specialization.
-            template<
-                typename TIdxIntegralConst,
-                typename TElem,
-                typename TDim,
-                typename TIdx>
-            struct GetOffset<
-                TIdxIntegralConst,
-                mem::buf::BufCudaRt<TElem, TDim, TIdx>>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto getOffset(
-                   mem::buf::BufCudaRt<TElem, TDim, TIdx> const &)
-                -> TIdx
-                {
-                    return 0u;
-                }
-            };
-        }
-    }
-    namespace idx
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The BufCudaRt idx type trait specialization.
-            template<
-                typename TElem,
-                typename TDim,
-                typename TIdx>
-            struct IdxType<
-                mem::buf::BufCudaRt<TElem, TDim, TIdx>>
-            {
-                using type = TIdx;
-            };
-        }
-    }
-
-    //-----------------------------------------------------------------------------
-    // Trait specializations for BufCpu.
-    namespace mem
-    {
-        namespace buf
-        {
-            namespace traits
-            {
-                //#############################################################################
-                //! The BufCpu CUDA device memory mapping trait specialization.
-                template<
-                    typename TElem,
-                    typename TDim,
-                    typename TIdx>
-                struct Map<
-                    mem::buf::BufCpu<TElem, TDim, TIdx>,
-                    dev::DevCudaRt>
-                {
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST static auto map(
-                        mem::buf::BufCpu<TElem, TDim, TIdx> & buf,
-                        dev::DevCudaRt const & dev)
-                    -> void
-                    {
-                        ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                        if(dev::getDev(buf) != dev)
-                        {
-                            // cudaHostRegisterMapped:
-                            //   Maps the allocation into the CUDA address space.The device pointer to the memory may be obtained by calling cudaHostGetDevicePointer().
-                            //   This feature is available only on GPUs with compute capability greater than or equal to 1.1.
-                            ALPAKA_CUDA_RT_CHECK(
-                                cudaHostRegister(
-                                    const_cast<void *>(reinterpret_cast<void const *>(mem::view::getPtrNative(buf))),
-                                    extent::getExtentProduct(buf) * sizeof(elem::Elem<BufCpu<TElem, TDim, TIdx>>),
-                                    cudaHostRegisterMapped));
-                        }
-                        // If it is already the same device, nothing has to be mapped.
-                    }
-                };
-                //#############################################################################
-                //! The BufCpu CUDA device memory unmapping trait specialization.
-                template<
-                    typename TElem,
-                    typename TDim,
-                    typename TIdx>
-                struct Unmap<
-                    mem::buf::BufCpu<TElem, TDim, TIdx>,
-                    dev::DevCudaRt>
-                {
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST static auto unmap(
-                        mem::buf::BufCpu<TElem, TDim, TIdx> & buf,
-                        dev::DevCudaRt const & dev)
-                    -> void
-                    {
-                        ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                        if(dev::getDev(buf) != dev)
-                        {
-                            // Unmaps the memory range whose base address is specified by ptr, and makes it pageable again.
-                            // \FIXME: If the memory has separately been pinned before we destroy the pinning state.
-                            ALPAKA_CUDA_RT_CHECK(
-                                cudaHostUnregister(
-                                    const_cast<void *>(reinterpret_cast<void const *>(mem::view::getPtrNative(buf)))));
-                        }
-                        // If it is already the same device, nothing has to be unmapped.
-                    }
-                };
-            }
-        }
-        namespace view
-        {
-            namespace traits
-            {
-                //#############################################################################
-                //! The BufCpu pointer on CUDA device get trait specialization.
-                template<
-                    typename TElem,
-                    typename TDim,
-                    typename TIdx>
-                struct GetPtrDev<
-                    mem::buf::BufCpu<TElem, TDim, TIdx>,
-                    dev::DevCudaRt>
-                {
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST static auto getPtrDev(
-                        mem::buf::BufCpu<TElem, TDim, TIdx> const & buf,
-                        dev::DevCudaRt const &)
-                    -> TElem const *
-                    {
-                        // TODO: Check if the memory is mapped at all!
-                        TElem * pDev(nullptr);
-                        ALPAKA_CUDA_RT_CHECK(
-                            cudaHostGetDevicePointer(
-                                &pDev,
-                                const_cast<void *>(reinterpret_cast<void const *>(mem::view::getPtrNative(buf))),
-                                0));
-                        return pDev;
-                    }
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST static auto getPtrDev(
-                        mem::buf::BufCpu<TElem, TDim, TIdx> & buf,
-                        dev::DevCudaRt const &)
-                    -> TElem *
-                    {
-                        // TODO: Check if the memory is mapped at all!
-                        TElem * pDev(nullptr);
-                        ALPAKA_CUDA_RT_CHECK(
-                            cudaHostGetDevicePointer(
-                                &pDev,
-                                mem::view::getPtrNative(buf),
-                                0));
-                        return pDev;
-                    }
-                };
-            }
-        }
-    }
-}
-
-#include <alpaka/mem/buf/cuda/Copy.hpp>
-#include <alpaka/mem/buf/cuda/Set.hpp>
-
-#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/mem/buf/BufHipRt.hpp b/thirdParty/cupla/alpaka/include/alpaka/mem/buf/BufHipRt.hpp
deleted file mode 100644
index 0b043117c7..0000000000
--- a/thirdParty/cupla/alpaka/include/alpaka/mem/buf/BufHipRt.hpp
+++ /dev/null
@@ -1,791 +0,0 @@
-/* Copyright 2019 Alexander Matthes, Benjamin Worpitz, Matthias Werner, René Widera
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_HIP
-    #error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
-#endif
-
-#include <alpaka/core/Assert.hpp>
-#include <alpaka/core/Hip.hpp>
-#include <alpaka/dev/DevHipRt.hpp>
-#include <alpaka/vec/Vec.hpp>
-
-#include <alpaka/dev/Traits.hpp>
-#include <alpaka/dim/DimIntegralConst.hpp>
-#include <alpaka/mem/buf/Traits.hpp>
-
-#include <memory>
-
-namespace alpaka
-{
-    namespace dev
-    {
-        class DevHipRt;
-    }
-    namespace mem
-    {
-        namespace buf
-        {
-            template<
-                typename TElem,
-                typename TDim,
-                typename TIdx>
-            class BufCpu;
-        }
-    }
-    namespace mem
-    {
-        namespace buf
-        {
-            //#############################################################################
-            //! The HIP memory buffer.
-            template<
-                typename TElem,
-                typename TDim,
-                typename TIdx>
-            class BufHipRt
-            {
-                static_assert(
-                    !std::is_const<TElem>::value,
-                    "The elem type of the buffer can not be const because the C++ Standard forbids containers of const elements!");
-                static_assert(
-                    !std::is_const<TIdx>::value,
-                    "The idx type of the buffer can not be const!");
-            private:
-                using Elem = TElem;
-                using Dim = TDim;
-
-            public:
-                //-----------------------------------------------------------------------------
-                //! Constructor
-                template<
-                    typename TExtent>
-                ALPAKA_FN_HOST BufHipRt(
-                    dev::DevHipRt const & dev,
-                    TElem * const pMem,
-                    TIdx const & pitchBytes,
-                    TExtent const & extent) :
-                        m_dev(dev),
-                        m_extentElements(extent::getExtentVecEnd<TDim>(extent)),
-                        m_spMem(
-                            pMem,
-                            // NOTE: Because the BufHipRt object can be copied and the original object could have been destroyed,
-                            // a std::ref(m_dev) or a this pointer can not be bound to the callback because they are not always valid at time of destruction.
-                            std::bind(&BufHipRt::freeBuffer, std::placeholders::_1, m_dev)),
-                        m_pitchBytes(pitchBytes)
-                {
-                    ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                    static_assert(
-                        TDim::value == dim::Dim<TExtent>::value,
-                        "The dimensionality of TExtent and the dimensionality of the TDim template parameter have to be identical!");
-                    static_assert(
-                        std::is_same<TIdx, idx::Idx<TExtent>>::value,
-                        "The idx type of TExtent and the TIdx template parameter have to be identical!");
-                }
-
-            private:
-                //-----------------------------------------------------------------------------
-                //! Frees the shared buffer.
-                ALPAKA_FN_HOST static auto freeBuffer(
-                    TElem * const memPtr,
-                    dev::DevHipRt const & dev)
-                -> void
-                {
-                    ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                    // Set the current device.
-                    ALPAKA_HIP_RT_CHECK(
-                        hipSetDevice(
-                            dev.m_iDevice));
-                    // Free the buffer.
-                    ALPAKA_HIP_RT_CHECK(
-                        hipFree(reinterpret_cast<void *>(memPtr)));
-                }
-
-            public:
-                dev::DevHipRt m_dev;               // NOTE: The device has to be destructed after the memory pointer because it is required for destruction.
-                vec::Vec<TDim, TIdx> m_extentElements;
-                std::shared_ptr<TElem> m_spMem;
-                TIdx m_pitchBytes;
-            };
-        }
-    }
-
-    namespace dev
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The BufHipRt device type trait specialization.
-            template<
-                typename TElem,
-                typename TDim,
-                typename TIdx>
-            struct DevType<
-                mem::buf::BufHipRt<TElem, TDim, TIdx>>
-            {
-                using type = dev::DevHipRt;
-            };
-            //#############################################################################
-            //! The BufHipRt device get trait specialization.
-            template<
-                typename TElem,
-                typename TDim,
-                typename TIdx>
-            struct GetDev<
-                mem::buf::BufHipRt<TElem, TDim, TIdx>>
-            {
-                ALPAKA_FN_HOST static auto getDev(
-                    mem::buf::BufHipRt<TElem, TDim, TIdx> const & buf)
-                -> dev::DevHipRt
-                {
-                    return buf.m_dev;
-                }
-            };
-        }
-    }
-    namespace dim
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The BufHipRt dimension getter trait specialization.
-            template<
-                typename TElem,
-                typename TDim,
-                typename TIdx>
-            struct DimType<
-                mem::buf::BufHipRt<TElem, TDim, TIdx>>
-            {
-                using type = TDim;
-            };
-        }
-    }
-    namespace elem
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The BufHipRt memory element type get trait specialization.
-            template<
-                typename TElem,
-                typename TDim,
-                typename TIdx>
-            struct ElemType<
-                mem::buf::BufHipRt<TElem, TDim, TIdx>>
-            {
-                using type = TElem;
-            };
-        }
-    }
-    namespace extent
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The BufHipRt extent get trait specialization.
-            template<
-                typename TIdxIntegralConst,
-                typename TElem,
-                typename TDim,
-                typename TIdx>
-            struct GetExtent<
-                TIdxIntegralConst,
-                mem::buf::BufHipRt<TElem, TDim, TIdx>,
-                typename std::enable_if<(TDim::value > TIdxIntegralConst::value)>::type>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto getExtent(
-                    mem::buf::BufHipRt<TElem, TDim, TIdx> const & extent)
-                -> TIdx
-                {
-                    return extent.m_extentElements[TIdxIntegralConst::value];
-                }
-            };
-        }
-    }
-    namespace mem
-    {
-        namespace view
-        {
-            namespace traits
-            {
-                //#############################################################################
-                //! The BufHipRt native pointer get trait specialization.
-                template<
-                    typename TElem,
-                    typename TDim,
-                    typename TIdx>
-                struct GetPtrNative<
-                    mem::buf::BufHipRt<TElem, TDim, TIdx>>
-                {
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST static auto getPtrNative(
-                        mem::buf::BufHipRt<TElem, TDim, TIdx> const & buf)
-                    -> TElem const *
-                    {
-                        return buf.m_spMem.get();
-                    }
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST static auto getPtrNative(
-                        mem::buf::BufHipRt<TElem, TDim, TIdx> & buf)
-                    -> TElem *
-                    {
-                        return buf.m_spMem.get();
-                    }
-                };
-                //#############################################################################
-                //! The BufHipRt pointer on device get trait specialization.
-                template<
-                    typename TElem,
-                    typename TDim,
-                    typename TIdx>
-                struct GetPtrDev<
-                    mem::buf::BufHipRt<TElem, TDim, TIdx>,
-                    dev::DevHipRt>
-                {
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST static auto getPtrDev(
-                        mem::buf::BufHipRt<TElem, TDim, TIdx> const & buf,
-                        dev::DevHipRt const & dev)
-                    -> TElem const *
-                    {
-                        if(dev == dev::getDev(buf))
-                        {
-                            return buf.m_spMem.get();
-                        }
-                        else
-                        {
-                            throw std::runtime_error("The buffer is not accessible from the given device!");
-                        }
-                    }
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST static auto getPtrDev(
-                        mem::buf::BufHipRt<TElem, TDim, TIdx> & buf,
-                        dev::DevHipRt const & dev)
-                    -> TElem *
-                    {
-                        if(dev == dev::getDev(buf))
-                        {
-                            return buf.m_spMem.get();
-                        }
-                        else
-                        {
-                            throw std::runtime_error("The buffer is not accessible from the given device!");
-                        }
-                    }
-                };
-                //#############################################################################
-                //! The BufHipRt pitch get trait specialization.
-                template<
-                    typename TElem,
-                    typename TDim,
-                    typename TIdx>
-                struct GetPitchBytes<
-                    dim::DimInt<TDim::value - 1u>,
-                    mem::buf::BufHipRt<TElem, TDim, TIdx>>
-                {
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST static auto getPitchBytes(
-                        mem::buf::BufHipRt<TElem, TDim, TIdx> const & buf)
-                    -> TIdx
-                    {
-                        return buf.m_pitchBytes;
-                    }
-                };
-            }
-        }
-        namespace buf
-        {
-            namespace traits
-            {
-                //#############################################################################
-                //! The HIP 1D memory allocation trait specialization.
-                template<
-                    typename TElem,
-                    typename TIdx>
-                struct Alloc<
-                    TElem,
-                    dim::DimInt<1u>,
-                    TIdx,
-                    dev::DevHipRt>
-                {
-                    //-----------------------------------------------------------------------------
-                    template<
-                        typename TExtent>
-                    ALPAKA_FN_HOST static auto alloc(
-                        dev::DevHipRt const & dev,
-                        TExtent const & extent)
-                    -> mem::buf::BufHipRt<TElem, dim::DimInt<1u>, TIdx>
-                    {
-                        ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                        auto const width(extent::getWidth(extent));
-                        auto const widthBytes(width * static_cast<TIdx>(sizeof(TElem)));
-
-                        // Set the current device.
-                        ALPAKA_HIP_RT_CHECK(
-                            hipSetDevice(
-                                dev.m_iDevice));
-                        // Allocate the buffer on this device.
-                        void * memPtr;
-                        ALPAKA_HIP_RT_CHECK(
-                            hipMalloc(
-                                &memPtr,
-                                static_cast<std::size_t>(widthBytes)));
-
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                        std::cout << __func__
-                            << " ew: " << width
-                            << " ewb: " << widthBytes
-                            << " ptr: " << memPtr
-                            << std::endl;
-#endif
-                        return
-                            mem::buf::BufHipRt<TElem, dim::DimInt<1u>, TIdx>(
-                                dev,
-                                reinterpret_cast<TElem *>(memPtr),
-                                static_cast<TIdx>(widthBytes),
-                                extent);
-                    }
-                };
-                //#############################################################################
-                //! The HIP 2D memory allocation trait specialization.
-                template<
-                    typename TElem,
-                    typename TIdx>
-                struct Alloc<
-                    TElem,
-                    dim::DimInt<2u>,
-                    TIdx,
-                    dev::DevHipRt>
-                {
-                    //-----------------------------------------------------------------------------
-                    template<
-                        typename TExtent>
-                    ALPAKA_FN_HOST static auto alloc(
-                        dev::DevHipRt const & dev,
-                        TExtent const & extent)
-                    -> mem::buf::BufHipRt<TElem, dim::DimInt<2u>, TIdx>
-                    {
-                        ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                        auto const width(extent::getWidth(extent));
-                        auto const widthBytes(width * static_cast<TIdx>(sizeof(TElem)));
-                        auto const height(extent::getHeight(extent));
-
-                        void * memPtr = nullptr;
-                        std::size_t pitchBytes = widthBytes;
-
-                        //FIXME: hcc cannot handle zero-size input (throws Unknown Error)
-                        if(width!=0 && height!=0) {
-
-                            // Set the current device.
-                            ALPAKA_HIP_RT_CHECK(
-                                hipSetDevice(
-                                    dev.m_iDevice));
-
-
-                            // Allocate the buffer on this device.
-                            ALPAKA_HIP_RT_CHECK(
-                                hipMallocPitch(
-                                    &memPtr,
-                                    &pitchBytes,
-                                    static_cast<std::size_t>(widthBytes),
-                                    static_cast<std::size_t>(height)));
-                            ALPAKA_ASSERT(pitchBytes >= static_cast<std::size_t>(widthBytes) || (width * height) == 0);
-                        }
-
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                        std::cout << __func__
-                            << " ew: " << width
-                            << " eh: " << height
-                            << " ewb: " << widthBytes
-                            << " ptr: " << memPtr
-                            << " pitch: " << pitchBytes
-                            << std::endl;
-#endif
-                        return
-                            mem::buf::BufHipRt<TElem, dim::DimInt<2u>, TIdx>(
-                                dev,
-                                reinterpret_cast<TElem *>(memPtr),
-                                static_cast<TIdx>(pitchBytes),
-                                extent);
-                    }
-                };
-                //#############################################################################
-                //! The HIP 3D memory allocation trait specialization.
-                template<
-                    typename TElem,
-                    typename TIdx>
-                struct Alloc<
-                    TElem,
-                    dim::DimInt<3u>,
-                    TIdx,
-                    dev::DevHipRt>
-                {
-                    //-----------------------------------------------------------------------------
-                    template<
-                        typename TExtent>
-                    ALPAKA_FN_HOST static auto alloc(
-                        dev::DevHipRt const & dev,
-                        TExtent const & extent)
-                    -> mem::buf::BufHipRt<TElem, dim::DimInt<3u>, TIdx>
-                    {
-                        ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                        hipExtent const hipExtentVal(
-                            make_hipExtent(
-                                static_cast<std::size_t>(extent::getWidth(extent) * static_cast<TIdx>(sizeof(TElem))),
-                                static_cast<std::size_t>(extent::getHeight(extent)),
-                                static_cast<std::size_t>(extent::getDepth(extent))));
-
-                        hipPitchedPtr hipPitchedPtrVal = {0};
-
-                        //FIXME: hcc cannot handle zero-size input
-                        if(hipExtentVal.width!=0
-                           && hipExtentVal.height!=0
-                           && hipExtentVal.depth!=0) {
-
-                            // Set the current device.
-                            ALPAKA_HIP_RT_CHECK(
-                                hipSetDevice(
-                                    dev.m_iDevice));
-                            // Allocate the buffer on this device.
-                            ALPAKA_HIP_RT_CHECK(
-                                hipMalloc3D(
-                                    &hipPitchedPtrVal,
-                                    hipExtentVal));
-                        }
-
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                        std::cout << __func__
-                            << " ew: " << extent::getWidth(extent)
-                            << " eh: " << hipExtentVal.height
-                            << " ed: " << hipExtentVal.depth
-                            << " ewb: " << hipExtentVal.width
-                            << " ptr: " << hipPitchedPtrVal.ptr
-                            << " pitch: " << hipPitchedPtrVal.pitch
-                            << " wb: " << hipPitchedPtrVal.xsize
-                            << " h: " << hipPitchedPtrVal.ysize
-                            << std::endl;
-#endif
-                        return
-                            mem::buf::BufHipRt<TElem, dim::DimInt<3u>, TIdx>(
-                                dev,
-                                reinterpret_cast<TElem *>(hipPitchedPtrVal.ptr),
-                                static_cast<TIdx>(hipPitchedPtrVal.pitch),
-                                extent);
-                    }
-                };
-                //#############################################################################
-                //! The BufHipRt HIP device memory mapping trait specialization.
-                template<
-                    typename TElem,
-                    typename TDim,
-                    typename TIdx>
-                struct Map<
-                    mem::buf::BufHipRt<TElem, TDim, TIdx>,
-                    dev::DevHipRt>
-                {
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST static auto map(
-                        mem::buf::BufHipRt<TElem, TDim, TIdx> const & buf,
-                        dev::DevHipRt const & dev)
-                    -> void
-                    {
-                        ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                        if(dev::getDev(buf) != dev)
-                        {
-                            throw std::runtime_error("Mapping memory from one HIP device into an other HIP device not implemented!");
-                        }
-                        // If it is already the same device, nothing has to be mapped.
-                    }
-                };
-                //#############################################################################
-                //! The BufHipRt HIP device memory unmapping trait specialization.
-                template<
-                    typename TElem,
-                    typename TDim,
-                    typename TIdx>
-                struct Unmap<
-                    mem::buf::BufHipRt<TElem, TDim, TIdx>,
-                    dev::DevHipRt>
-                {
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST static auto unmap(
-                        mem::buf::BufHipRt<TElem, TDim, TIdx> const & buf,
-                        dev::DevHipRt const & dev)
-                    -> void
-                    {
-                        ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                        if(dev::getDev(buf) != dev)
-                        {
-                            throw std::runtime_error("Unmapping memory mapped from one HIP device into an other HIP device not implemented!");
-                        }
-                        // If it is already the same device, nothing has to be unmapped.
-                    }
-                };
-                //#############################################################################
-                //! The BufHipRt memory pinning trait specialization.
-                template<
-                    typename TElem,
-                    typename TDim,
-                    typename TIdx>
-                struct Pin<
-                    mem::buf::BufHipRt<TElem, TDim, TIdx>>
-                {
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST static auto pin(
-                        mem::buf::BufHipRt<TElem, TDim, TIdx> &)
-                    -> void
-                    {
-                        ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                        // HIP device memory is always pinned, it can not be swapped out.
-                    }
-                };
-                //#############################################################################
-                //! The BufHipRt memory unpinning trait specialization.
-                template<
-                    typename TElem,
-                    typename TDim,
-                    typename TIdx>
-                struct Unpin<
-                    mem::buf::BufHipRt<TElem, TDim, TIdx>>
-                {
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST static auto unpin(
-                        mem::buf::BufHipRt<TElem, TDim, TIdx> &)
-                    -> void
-                    {
-                        ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                        // HIP device memory is always pinned, it can not be swapped out.
-                    }
-                };
-                //#############################################################################
-                //! The BufHipRt memory pin state trait specialization.
-                template<
-                    typename TElem,
-                    typename TDim,
-                    typename TIdx>
-                struct IsPinned<
-                    mem::buf::BufHipRt<TElem, TDim, TIdx>>
-                {
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST static auto isPinned(
-                        mem::buf::BufHipRt<TElem, TDim, TIdx> const &)
-                    -> bool
-                    {
-                        ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                        // HIP device memory is always pinned, it can not be swapped out.
-                        return true;
-                    }
-                };
-                //#############################################################################
-                //! The BufHipRt memory prepareForAsyncCopy trait specialization.
-                template<
-                    typename TElem,
-                    typename TDim,
-                    typename TIdx>
-                struct PrepareForAsyncCopy<
-                    mem::buf::BufHipRt<TElem, TDim, TIdx>>
-                {
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST static auto prepareForAsyncCopy(
-                        mem::buf::BufHipRt<TElem, TDim, TIdx> &)
-                    -> void
-                    {
-                        ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                        // HIP device memory is always ready for async copy
-                    }
-                };
-            }
-        }
-    }
-    namespace offset
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The BufHipRt offset get trait specialization.
-            template<
-                typename TIdxIntegralConst,
-                typename TElem,
-                typename TDim,
-                typename TIdx>
-            struct GetOffset<
-                TIdxIntegralConst,
-                mem::buf::BufHipRt<TElem, TDim, TIdx>>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto getOffset(
-                   mem::buf::BufHipRt<TElem, TDim, TIdx> const &)
-                -> TIdx
-                {
-                    return 0u;
-                }
-            };
-        }
-    }
-    namespace idx
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The BufHipRt idx type trait specialization.
-            template<
-                typename TElem,
-                typename TDim,
-                typename TIdx>
-            struct IdxType<
-                mem::buf::BufHipRt<TElem, TDim, TIdx>>
-            {
-                using type = TIdx;
-            };
-        }
-    }
-
-    //-----------------------------------------------------------------------------
-    // Trait specializations for BufCpu.
-    namespace mem
-    {
-        namespace buf
-        {
-            namespace traits
-            {
-                //#############################################################################
-                //! The BufCpu HIP device memory mapping trait specialization.
-                template<
-                    typename TElem,
-                    typename TDim,
-                    typename TIdx>
-                struct Map<
-                    mem::buf::BufCpu<TElem, TDim, TIdx>,
-                    dev::DevHipRt>
-                {
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST static auto map(
-                        mem::buf::BufCpu<TElem, TDim, TIdx> & buf,
-                        dev::DevHipRt const & dev)
-                    -> void
-                    {
-                        ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                        if(dev::getDev(buf) != dev)
-                        {
-                            // hipHostRegisterMapped:
-                            //   Maps the allocation into the HIP address space.The device pointer to the memory may be obtained by calling hipHostGetDevicePointer().
-                            //   This feature is available only on GPUs with compute capability greater than or equal to 1.1.
-                            ALPAKA_HIP_RT_CHECK(
-                                hipHostRegister(
-                                    const_cast<void *>(reinterpret_cast<void const *>(mem::view::getPtrNative(buf))),
-                                    extent::getExtentProduct(buf) * sizeof(elem::Elem<BufCpu<TElem, TDim, TIdx>>),
-                                    hipHostRegisterMapped));
-                        }
-                        // If it is already the same device, nothing has to be mapped.
-                    }
-                };
-                //#############################################################################
-                //! The BufCpu HIP device memory unmapping trait specialization.
-                template<
-                    typename TElem,
-                    typename TDim,
-                    typename TIdx>
-                struct Unmap<
-                    mem::buf::BufCpu<TElem, TDim, TIdx>,
-                    dev::DevHipRt>
-                {
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST static auto unmap(
-                        mem::buf::BufCpu<TElem, TDim, TIdx> & buf,
-                        dev::DevHipRt const & dev)
-                    -> void
-                    {
-                        ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                        if(dev::getDev(buf) != dev)
-                        {
-                            // Unmaps the memory range whose base address is specified by ptr, and makes it pageable again.
-                            // \FIXME: If the memory has separately been pinned before we destroy the pinning state.
-                            ALPAKA_HIP_RT_CHECK(
-                                hipHostUnregister(
-                                    const_cast<void *>(reinterpret_cast<void const *>(mem::view::getPtrNative(buf)))));
-                        }
-                        // If it is already the same device, nothing has to be unmapped.
-                    }
-                };
-            }
-        }
-        namespace view
-        {
-            namespace traits
-            {
-                //#############################################################################
-                //! The BufCpu pointer on HIP device get trait specialization.
-                template<
-                    typename TElem,
-                    typename TDim,
-                    typename TIdx>
-                struct GetPtrDev<
-                    mem::buf::BufCpu<TElem, TDim, TIdx>,
-                    dev::DevHipRt>
-                {
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST static auto getPtrDev(
-                        mem::buf::BufCpu<TElem, TDim, TIdx> const & buf,
-                        dev::DevHipRt const &)
-                    -> TElem const *
-                    {
-                        // TODO: Check if the memory is mapped at all!
-                        TElem * pDev(nullptr);
-                        ALPAKA_HIP_RT_CHECK(
-                            hipHostGetDevicePointer(
-                                &pDev,
-                                const_cast<void *>(reinterpret_cast<void const *>(mem::view::getPtrNative(buf))),
-                                0));
-                        return pDev;
-                    }
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST static auto getPtrDev(
-                        mem::buf::BufCpu<TElem, TDim, TIdx> & buf,
-                        dev::DevHipRt const &)
-                    -> TElem *
-                    {
-                        // TODO: Check if the memory is mapped at all!
-                        TElem * pDev(nullptr);
-                        ALPAKA_HIP_RT_CHECK(
-                            hipHostGetDevicePointer(
-                                &pDev,
-                                mem::view::getPtrNative(buf),
-                                0));
-                        return pDev;
-                    }
-                };
-            }
-        }
-    }
-}
-
-#include <alpaka/mem/buf/hip/Copy.hpp>
-#include <alpaka/mem/buf/hip/Set.hpp>
-
-#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/mem/buf/BufOacc.hpp b/thirdParty/cupla/alpaka/include/alpaka/mem/buf/BufOacc.hpp
new file mode 100644
index 0000000000..f184630476
--- /dev/null
+++ b/thirdParty/cupla/alpaka/include/alpaka/mem/buf/BufOacc.hpp
@@ -0,0 +1,482 @@
+/* Copyright 2020 Jeffrey Kelling, Alexander Matthes, Benjamin Worpitz, Matthias Werner, René Widera
+ *
+ * This file is part of Alpaka.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#pragma once
+
+#ifdef ALPAKA_ACC_ANY_BT_OACC_ENABLED
+
+#    if _OPENACC < 201306
+#        error If ALPAKA_ACC_ANY_BT_OACC_ENABLED is set, the compiler has to support OpenACC 2.0 or higher!
+#    endif
+
+#    include <alpaka/core/Assert.hpp>
+#    include <alpaka/dev/DevOacc.hpp>
+#    include <alpaka/dev/Traits.hpp>
+#    include <alpaka/dim/DimIntegralConst.hpp>
+#    include <alpaka/mem/buf/Traits.hpp>
+#    include <alpaka/queue/QueueOaccBlocking.hpp>
+#    include <alpaka/vec/Vec.hpp>
+
+#    include <openacc.h>
+
+#    include <memory>
+
+namespace alpaka
+{
+    class DevOacc;
+
+    template<typename TElem, typename TDim, typename TIdx>
+    class BufCpu;
+
+    namespace oacc
+    {
+        namespace detail
+        {
+            //#############################################################################
+            //! Owning state behind BufOacc: device, extents, pitches and the pointer released with acc_free.
+            template<typename TElem, typename TDim, typename TIdx>
+            class BufOaccImpl
+            {
+                static_assert(
+                    !std::is_const<TElem>::value,
+                    "The elem type of the buffer can not be const because the C++ Standard forbids containers of "
+                    "const elements!");
+                static_assert(!std::is_const<TIdx>::value, "The idx type of the buffer can not be const!");
+
+            private:
+                using Elem = TElem;
+                using Dim = TDim;
+                //! Calculate the pitches purely from the extents, i.e. for a layout without padding:
+                //! pitch[last] = extent[last] * sizeof(TElem), pitch[i - 1] = extent[i - 1] * pitch[i].
+                template<typename TExtent>
+                ALPAKA_FN_HOST static auto calculatePitchesFromExtents(TExtent const& extent) -> Vec<TDim, TIdx>
+                {
+                    Vec<TDim, TIdx> pitchBytes(Vec<TDim, TIdx>::all(0));
+                    pitchBytes[TDim::value - 1u] = extent[TDim::value - 1u] * static_cast<TIdx>(sizeof(TElem));
+                    for(TIdx i = TDim::value - 1u; i > static_cast<TIdx>(0u); --i) // last dimension towards first
+                    {
+                        pitchBytes[i - 1] = extent[i - 1] * pitchBytes[i];
+                    }
+                    return pitchBytes;
+                }
+
+            public:
+                //! Stores `dev` and the device pointer `pMem`; extents and pitches are derived from `extent`.
+                //! The destructor hands `pMem` to acc_free, so this object is the owner of the allocation.
+                template<typename TExtent>
+                ALPAKA_FN_HOST BufOaccImpl(DevOacc const& dev, TElem* const pMem, TExtent const& extent)
+                    : m_dev(dev)
+                    , m_extentElements(extent::getExtentVecEnd<TDim>(extent))
+                    , m_pitchBytes(calculatePitchesFromExtents(m_extentElements)) // relies on member order
+                    , m_pMem(pMem)
+                {
+                    ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+                    static_assert(
+                        TDim::value == alpaka::Dim<TExtent>::value,
+                        "The dimensionality of TExtent and the dimensionality of the TDim template parameter have to "
+                        "be identical!");
+                    static_assert(
+                        std::is_same<TIdx, Idx<TExtent>>::value,
+                        "The idx type of TExtent and the TIdx template parameter have to be identical!");
+                }
+
+            public:
+                DevOacc m_dev; // device that is made current before the memory is freed
+                Vec<TDim, TIdx> m_extentElements; // declared before m_pitchBytes, which is computed from it
+                Vec<TDim, TIdx> m_pitchBytes;
+                TElem* m_pMem; // pointer passed to acc_free in the destructor
+
+                BufOaccImpl(const BufOaccImpl&) = delete;
+                BufOaccImpl(BufOaccImpl&&) = default; // NOTE(review): copies m_pMem; the source still frees it
+                BufOaccImpl& operator=(const BufOaccImpl&) = delete;
+                BufOaccImpl& operator=(BufOaccImpl&&) = default; // NOTE(review): same concern as the move ctor
+                ~BufOaccImpl()
+                {
+                    m_dev.makeCurrent(); // presumably selects m_dev for the acc_free below - confirm
+                    acc_free(m_pMem);
+                }
+            };
+        } // namespace detail
+    } // namespace oacc
+
+    template<typename TElem, typename TDim, typename TIdx>
+    class BufOacc
+    {
+        using Impl = oacc::detail::BufOaccImpl<TElem, TDim, TIdx>; //!< Shared state that owns the memory.
+    public:
+        //! Wraps the device pointer `pMem` in reference-counted state; copies of the buffer share that state.
+        template<typename TExtent>
+        ALPAKA_FN_HOST BufOacc(DevOacc const& dev, TElem* const pMem, TExtent const& extent)
+            : m_spBufImpl(std::make_shared<Impl>(dev, pMem, extent))
+        {
+        }
+
+        BufOacc(BufOacc const&) = default;
+        BufOacc& operator=(BufOacc const&) = default;
+        BufOacc(BufOacc&&) = default;
+        BufOacc& operator=(BufOacc&&) = default;
+
+        Impl& operator*() // mutable access to the shared implementation
+        {
+            return *m_spBufImpl.get();
+        }
+        Impl const& operator*() const // read-only access to the shared implementation
+        {
+            return *m_spBufImpl.get();
+        }
+
+        Vec<TDim, TIdx> const& extentElements() const // extent in elements, one entry per dimension
+        {
+            return (*m_spBufImpl).m_extentElements;
+        }
+
+    private:
+        std::shared_ptr<Impl> m_spBufImpl;
+    };
+
+    namespace traits
+    {
+        //#############################################################################
+        //! The BufOacc device type trait specialization: a BufOacc always belongs to a DevOacc.
+        template<typename TElem, typename TDim, typename TIdx>
+        struct DevType<BufOacc<TElem, TDim, TIdx>>
+        {
+            using type = DevOacc;
+        };
+        //#############################################################################
+        //! The BufOacc device get trait specialization.
+        template<typename TElem, typename TDim, typename TIdx>
+        struct GetDev<BufOacc<TElem, TDim, TIdx>>
+        {
+            ALPAKA_FN_HOST static auto getDev(BufOacc<TElem, TDim, TIdx> const& buf) -> DevOacc
+            {
+                return (*buf).m_dev; // returned by value: a copy of the device stored in the implementation
+            }
+        };
+
+        //#############################################################################
+        //! The BufOacc dimension getter trait specialization: forwards the TDim template parameter.
+        template<typename TElem, typename TDim, typename TIdx>
+        struct DimType<BufOacc<TElem, TDim, TIdx>>
+        {
+            using type = TDim;
+        };
+
+        //#############################################################################
+        //! The BufOacc memory element type get trait specialization: forwards the TElem template parameter.
+        template<typename TElem, typename TDim, typename TIdx>
+        struct ElemType<BufOacc<TElem, TDim, TIdx>>
+        {
+            using type = TElem;
+        };
+    } // namespace traits
+
+    namespace extent
+    {
+        namespace traits
+        {
+            //#############################################################################
+            //! The BufOacc extent get trait specialization, enabled only for indices below TDim::value.
+            template<typename TIdxIntegralConst, typename TElem, typename TDim, typename TIdx>
+            struct GetExtent<
+                TIdxIntegralConst,
+                BufOacc<TElem, TDim, TIdx>,
+                typename std::enable_if<(TDim::value > TIdxIntegralConst::value)>::type>
+            {
+                //! \return Number of elements along dimension TIdxIntegralConst::value of the buffer `extent`.
+                ALPAKA_FN_HOST static auto getExtent(BufOacc<TElem, TDim, TIdx> const& extent) -> TIdx
+                {
+                    return extent.extentElements()[TIdxIntegralConst::value];
+                }
+            };
+        } // namespace traits
+    } // namespace extent
+
+    namespace traits
+    {
+        //#############################################################################
+        //! The BufOacc native pointer get trait specialization.
+        template<typename TElem, typename TDim, typename TIdx>
+        struct GetPtrNative<BufOacc<TElem, TDim, TIdx>>
+        {
+            //! \return The pointer stored in the shared implementation, as pointer to const elements.
+            ALPAKA_FN_HOST static auto getPtrNative(BufOacc<TElem, TDim, TIdx> const& buf) -> TElem const*
+            {
+                return (*buf).m_pMem;
+            }
+            //! \return The pointer stored in the shared implementation, as pointer to mutable elements.
+            ALPAKA_FN_HOST static auto getPtrNative(BufOacc<TElem, TDim, TIdx>& buf) -> TElem*
+            {
+                return (*buf).m_pMem;
+            }
+        };
+        //#############################################################################
+        //! The BufOacc pointer on device get trait specialization.
+        template<typename TElem, typename TDim, typename TIdx>
+        struct GetPtrDev<BufOacc<TElem, TDim, TIdx>, DevOacc>
+        {
+            //! \return The buffer pointer (const) if `dev` is the device of `buf`; throws std::runtime_error else.
+            ALPAKA_FN_HOST static auto getPtrDev(BufOacc<TElem, TDim, TIdx> const& buf, DevOacc const& dev)
+                -> TElem const*
+            {
+                if(dev == getDev(buf))
+                {
+                    return (*buf).m_pMem; // m_pMem is a member of the implementation behind operator*
+                }
+                else
+                {
+                    throw std::runtime_error("The buffer is not accessible from the given device!");
+                }
+            }
+            //! \return The buffer pointer (mutable) if `dev` is the device of `buf`; throws std::runtime_error else.
+            ALPAKA_FN_HOST static auto getPtrDev(BufOacc<TElem, TDim, TIdx>& buf, DevOacc const& dev) -> TElem*
+            {
+                if(dev == getDev(buf))
+                {
+                    return (*buf).m_pMem; // m_pMem is a member of the implementation behind operator*
+                }
+                else
+                {
+                    throw std::runtime_error("The buffer is not accessible from the given device!");
+                }
+            }
+        };
+        //#############################################################################
+        //! The BufOacc pitch get trait specialization.
+        template<typename TIdxIntegralConst, typename TElem, typename TDim, typename TIdx>
+        struct GetPitchBytes<TIdxIntegralConst, BufOacc<TElem, TDim, TIdx>>
+        {
+            //! \return The pitch in bytes stored for dimension TIdxIntegralConst::value of the buffer `pitch`.
+            ALPAKA_FN_HOST static auto getPitchBytes(BufOacc<TElem, TDim, TIdx> const& pitch) -> TIdx
+            {
+                return (*pitch).m_pitchBytes[TIdxIntegralConst::value];
+            }
+        };
+
+        //#############################################################################
+        //! The BufOacc 1D memory allocation trait specialization.
+        template<typename TElem, typename TIdx>
+        struct BufAlloc<TElem, DimInt<1u>, TIdx, DevOacc>
+        {
+            //! Allocates getWidth(extent) elements with acc_malloc after making `dev` current.
+            template<typename TExtent>
+            ALPAKA_FN_HOST static auto allocBuf(DevOacc const& dev, TExtent const& extent)
+                -> BufOacc<TElem, DimInt<1u>, TIdx>
+            {
+                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+                auto const numElems = extent::getWidth(extent);
+                auto const numBytes = numElems * static_cast<TIdx>(sizeof(TElem));
+
+                dev.makeCurrent();
+                void* const pRaw = acc_malloc(static_cast<std::size_t>(numBytes));
+
+#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
+                std::cout << __func__ << " ew: " << numElems << " ewb: " << numBytes << " ptr: " << pRaw
+                          << " device: " << dev.m_spDevOaccImpl->iDevice() << std::endl;
+#    endif
+                return BufOacc<TElem, DimInt<1u>, TIdx>(dev, static_cast<TElem*>(pRaw), extent);
+            }
+        };
+
+        //#############################################################################
+        //! The BufOacc nD memory allocation trait specialization. \todo Add pitch
+        template<typename TElem, typename TDim, typename TIdx>
+        struct BufAlloc<TElem, TDim, TIdx, DevOacc>
+        {
+            //! Allocates one block of prod(extent) * sizeof(TElem) bytes with acc_malloc on `dev`.
+            template<typename TExtent>
+            ALPAKA_FN_HOST static auto allocBuf(DevOacc const& dev, TExtent const& extent)
+                -> BufOacc<TElem, TDim, TIdx>
+            {
+                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+                auto const numBytes = sizeof(TElem) * static_cast<std::size_t>(extent::getExtentVec(extent).prod());
+
+                dev.makeCurrent();
+                void* const pRaw = acc_malloc(numBytes);
+#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
+                std::cout << __func__ << "alloc'd " << TDim::value << "D device ptr: " << pRaw << " on device "
+                          << dev.m_spDevOaccImpl->iDevice() << " size: " << numBytes << std::endl;
+#    endif
+                return BufOacc<TElem, TDim, TIdx>(dev, static_cast<TElem*>(pRaw), extent);
+            }
+        };
+
+        //#############################################################################
+        //! The BufOacc device memory mapping trait specialization.
+        template<typename TElem, typename TDim, typename TIdx>
+        struct Map<BufOacc<TElem, TDim, TIdx>, DevOacc>
+        {
+            //! Does nothing if `dev` is the device of `buf`; throws std::runtime_error for any other device.
+            ALPAKA_FN_HOST static auto map(BufOacc<TElem, TDim, TIdx> const& buf, DevOacc const& dev) -> void
+            {
+                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+                if(getDev(buf) != dev)
+                {
+                    throw std::runtime_error(
+                        "Mapping memory from one OpenACC device into an other OpenACC device not implemented!");
+                }
+                // The buffer already lives on `dev`, so there is nothing to map.
+            }
+        };
+        //#############################################################################
+        //! The BufOacc device memory unmapping trait specialization.
+        template<typename TElem, typename TDim, typename TIdx>
+        struct Unmap<BufOacc<TElem, TDim, TIdx>, DevOacc>
+        {
+            //! Does nothing if `dev` is the device of `buf`; throws std::runtime_error for any other device.
+            ALPAKA_FN_HOST static auto unmap(BufOacc<TElem, TDim, TIdx> const& buf, DevOacc const& dev) -> void
+            {
+                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+                if(getDev(buf) != dev)
+                {
+                    throw std::runtime_error("Unmapping memory mapped from one OpenACC device into an other OpenACC "
+                                             "device not implemented!");
+                }
+                // The buffer lives on `dev` itself, so there is nothing to unmap.
+            }
+        };
+        //#############################################################################
+        //! The BufOacc memory pinning trait specialization.
+        template<typename TElem, typename TDim, typename TIdx>
+        struct Pin<BufOacc<TElem, TDim, TIdx>>
+        {
+            //! Intentionally a no-op apart from the debug log scope; the buffer is not touched.
+            ALPAKA_FN_HOST static auto pin(BufOacc<TElem, TDim, TIdx>&) -> void
+            {
+                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+                // No explicit pinning in OpenACC? GPU memory would be pinned anyway.
+            }
+        };
+        //#############################################################################
+        //! The BufOacc memory unpinning trait specialization.
+        template<typename TElem, typename TDim, typename TIdx>
+        struct Unpin<BufOacc<TElem, TDim, TIdx>>
+        {
+            //! Intentionally a no-op apart from the debug log scope; the buffer is not touched.
+            ALPAKA_FN_HOST static auto unpin(BufOacc<TElem, TDim, TIdx>&) -> void
+            {
+                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+                // No explicit pinning in OpenACC? GPU memory would be pinned anyway, so nothing to undo.
+            }
+        };
+        //#############################################################################
+        //! The BufOacc memory pin state trait specialization.
+        template<typename TElem, typename TDim, typename TIdx>
+        struct IsPinned<BufOacc<TElem, TDim, TIdx>>
+        {
+            //! \return Always true, independent of the buffer passed in.
+            ALPAKA_FN_HOST static auto isPinned(BufOacc<TElem, TDim, TIdx> const&) -> bool
+            {
+                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+                // No explicit pinning in OpenACC? GPU memory would be pinned anyway.
+                return true;
+            }
+        };
+        //#############################################################################
+        //! The BufOacc memory prepareForAsyncCopy trait specialization.
+        template<typename TElem, typename TDim, typename TIdx>
+        struct PrepareForAsyncCopy<BufOacc<TElem, TDim, TIdx>>
+        {
+            //! Intentionally a no-op apart from the debug log scope; the buffer is not touched.
+            ALPAKA_FN_HOST static auto prepareForAsyncCopy(BufOacc<TElem, TDim, TIdx>&) -> void
+            {
+                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+                // OpenACC device memory is always ready for async copy, so no preparation is required.
+            }
+        };
+
+        //#############################################################################
+        //! The BufOacc offset get trait specialization.
+        template<typename TIdxIntegralConst, typename TElem, typename TDim, typename TIdx>
+        struct GetOffset<TIdxIntegralConst, BufOacc<TElem, TDim, TIdx>>
+        {
+            //! \return Always zero, for every dimension index and every buffer.
+            ALPAKA_FN_HOST static auto getOffset(BufOacc<TElem, TDim, TIdx> const&) -> TIdx
+            {
+                return 0u; // unsigned literal, implicitly converted to TIdx
+            }
+        };
+
+        //#############################################################################
+        //! The BufOacc idx type trait specialization: forwards the TIdx template parameter.
+        template<typename TElem, typename TDim, typename TIdx>
+        struct IdxType<BufOacc<TElem, TDim, TIdx>>
+        {
+            using type = TIdx;
+        };
+
+        //#############################################################################
+        //! The BufCpu OpenACC device memory mapping trait specialization.
+        template<typename TElem, typename TDim, typename TIdx>
+        struct Map<BufCpu<TElem, TDim, TIdx>, DevOacc>
+        {
+            //! Throws std::runtime_error if the device of `buf` differs from `dev`; otherwise does nothing.
+            ALPAKA_FN_HOST static auto map(BufCpu<TElem, TDim, TIdx>& buf, DevOacc const& dev) -> void
+            {
+                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+                if(getDev(buf) != dev) //! \todo Compares the device of a host buffer with a DevOacc; confirm intent.
+                {
+                    //   Mapping host memory into the address space of an OpenACC device is not implemented,
+                    //   so the request is rejected instead of being silently ignored.
+                    throw std::runtime_error("Mapping host memory to OpenACC device not implemented!");
+                }
+                // If it is already the same device, nothing has to be mapped.
+            }
+        };
+        //#############################################################################
+        //! The BufCpu OpenACC device memory unmapping trait specialization.
+        template<typename TElem, typename TDim, typename TIdx>
+        struct Unmap<BufCpu<TElem, TDim, TIdx>, DevOacc>
+        {
+            //! Throws std::runtime_error if the device of `buf` differs from `dev`; otherwise does nothing.
+            ALPAKA_FN_HOST static auto unmap(BufCpu<TElem, TDim, TIdx>& buf, DevOacc const& dev) -> void
+            {
+                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+                if(getDev(buf) != dev) //! \todo Compares the device of a host buffer with a DevOacc; confirm intent.
+                {
+                    throw std::runtime_error("Unmapping host memory from OpenACC device not implemented!");
+                }
+                // If it is already the same device, nothing has to be unmapped.
+            }
+        };
+
+        //#############################################################################
+        //! The BufCpu pointer on OpenACC device get trait specialization.
+        template<typename TElem, typename TDim, typename TIdx>
+        struct GetPtrDev<BufCpu<TElem, TDim, TIdx>, DevOacc>
+        {
+            //! Always throws std::runtime_error: no device pointer is available for a host buffer (const overload).
+            ALPAKA_FN_HOST static auto getPtrDev(BufCpu<TElem, TDim, TIdx> const&, DevOacc const&) -> TElem const*
+            {
+                throw std::runtime_error("Mapping host memory to OpenACC device not implemented!");
+            }
+            //! Always throws std::runtime_error: no device pointer is available for a host buffer (mutable overload).
+            ALPAKA_FN_HOST static auto getPtrDev(BufCpu<TElem, TDim, TIdx>&, DevOacc const&) -> TElem*
+            {
+                throw std::runtime_error("Mapping host memory to OpenACC device not implemented!");
+            }
+        };
+    } // namespace traits
+} // namespace alpaka
+
+#    include <alpaka/mem/buf/oacc/Copy.hpp>
+#    include <alpaka/mem/buf/oacc/Set.hpp>
+
+#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/mem/buf/BufOmp5.hpp b/thirdParty/cupla/alpaka/include/alpaka/mem/buf/BufOmp5.hpp
new file mode 100644
index 0000000000..a5316d5e24
--- /dev/null
+++ b/thirdParty/cupla/alpaka/include/alpaka/mem/buf/BufOmp5.hpp
@@ -0,0 +1,475 @@
+/* Copyright 2019 Alexander Matthes, Benjamin Worpitz, Matthias Werner, René Widera
+ *
+ * This file is part of Alpaka.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#pragma once
+
+#ifdef ALPAKA_ACC_ANY_BT_OMP5_ENABLED
+
+#    if _OPENMP < 201307
+#        error If ALPAKA_ACC_ANY_BT_OMP5_ENABLED is set, the compiler has to support OpenMP 4.0 or higher!
+#    endif
+
+#    include <alpaka/core/Assert.hpp>
+#    include <alpaka/dev/DevOmp5.hpp>
+#    include <alpaka/dev/Traits.hpp>
+#    include <alpaka/dim/DimIntegralConst.hpp>
+#    include <alpaka/mem/buf/Traits.hpp>
+#    include <alpaka/queue/QueueOmp5Blocking.hpp>
+#    include <alpaka/vec/Vec.hpp>
+
+#    include <omp.h>
+
+#    include <memory>
+
+namespace alpaka
+{
+    class DevOmp5;
+
+    template<typename TElem, typename TDim, typename TIdx>
+    class BufCpu;
+
+    namespace detail
+    {
+        //#############################################################################
+        //! Owning state behind BufOmp5: device, extents, pitches and the pointer released with omp_target_free.
+        template<typename TElem, typename TDim, typename TIdx>
+        class BufOmp5Impl
+        {
+            static_assert(
+                !std::is_const<TElem>::value,
+                "The elem type of the buffer can not be const because the C++ Standard forbids containers of const "
+                "elements!");
+            static_assert(!std::is_const<TIdx>::value, "The idx type of the buffer can not be const!");
+
+        private:
+            using Elem = TElem;
+            using Dim = TDim;
+            //! Calculate the pitches purely from the extents, i.e. for a layout without padding:
+            //! pitch[last] = extent[last] * sizeof(TElem), pitch[i - 1] = extent[i - 1] * pitch[i].
+            template<typename TExtent>
+            ALPAKA_FN_HOST static auto calculatePitchesFromExtents(TExtent const& extent) -> Vec<TDim, TIdx>
+            {
+                Vec<TDim, TIdx> pitchBytes(Vec<TDim, TIdx>::all(0));
+                pitchBytes[TDim::value - 1u] = extent[TDim::value - 1u] * static_cast<TIdx>(sizeof(TElem));
+                for(TIdx i = TDim::value - 1u; i > static_cast<TIdx>(0u); --i) // last dimension towards first
+                {
+                    pitchBytes[i - 1] = extent[i - 1] * pitchBytes[i];
+                }
+                return pitchBytes;
+            }
+
+        public:
+            //! Stores `dev` and the device pointer `pMem`; extents and pitches are derived from `extent`.
+            //! The destructor hands `pMem` to omp_target_free, so this object is the owner of the allocation.
+            template<typename TExtent>
+            ALPAKA_FN_HOST BufOmp5Impl(DevOmp5 const& dev, TElem* const pMem, TExtent const& extent)
+                : m_dev(dev)
+                , m_extentElements(extent::getExtentVecEnd<TDim>(extent))
+                , m_pitchBytes(calculatePitchesFromExtents(m_extentElements)) // relies on member order
+                , m_pMem(pMem)
+            {
+                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+                static_assert(
+                    TDim::value == alpaka::Dim<TExtent>::value,
+                    "The dimensionality of TExtent and the dimensionality of the TDim template parameter have to be "
+                    "identical!");
+                static_assert(
+                    std::is_same<TIdx, Idx<TExtent>>::value,
+                    "The idx type of TExtent and the TIdx template parameter have to be identical!");
+            }
+
+        public:
+            DevOmp5 m_dev; // its device number is passed to omp_target_free
+            Vec<TDim, TIdx> m_extentElements; // declared before m_pitchBytes, which is computed from it
+            Vec<TDim, TIdx> m_pitchBytes;
+            TElem* m_pMem; // pointer passed to omp_target_free in the destructor
+
+            BufOmp5Impl(const BufOmp5Impl&) = delete;
+            BufOmp5Impl(BufOmp5Impl&&) = default; // NOTE(review): copies m_pMem; the source still frees it
+            BufOmp5Impl& operator=(const BufOmp5Impl&) = delete;
+            BufOmp5Impl& operator=(BufOmp5Impl&&) = default; // NOTE(review): same concern as the move ctor
+            ~BufOmp5Impl()
+            {
+                omp_target_free(m_pMem, m_dev.m_spDevOmp5Impl->iDevice());
+            }
+        };
+    } // namespace detail
+
+    template<typename TElem, typename TDim, typename TIdx>
+    class BufOmp5
+    {
+        using Impl = detail::BufOmp5Impl<TElem, TDim, TIdx>; //!< Shared state that owns the memory.
+    public:
+        //! Wraps the device pointer `pMem` in reference-counted state; copies of the buffer share that state.
+        template<typename TExtent>
+        ALPAKA_FN_HOST BufOmp5(DevOmp5 const& dev, TElem* const pMem, TExtent const& extent)
+            : m_spBufImpl(std::make_shared<Impl>(dev, pMem, extent))
+        {
+        }
+
+        BufOmp5(BufOmp5 const&) = default;
+        BufOmp5& operator=(BufOmp5 const&) = default;
+        BufOmp5(BufOmp5&&) = default;
+        BufOmp5& operator=(BufOmp5&&) = default;
+
+        Impl& operator*() // mutable access to the shared implementation
+        {
+            return *m_spBufImpl.get();
+        }
+        Impl const& operator*() const // read-only access to the shared implementation
+        {
+            return *m_spBufImpl.get();
+        }
+
+        Vec<TDim, TIdx> const& extentElements() const // extent in elements, one entry per dimension
+        {
+            return (*m_spBufImpl).m_extentElements;
+        }
+
+    private:
+        std::shared_ptr<Impl> m_spBufImpl;
+    };
+
+    namespace traits
+    {
+        //#############################################################################
+        //! The BufOmp5 device type trait specialization: a BufOmp5 always belongs to a DevOmp5.
+        template<typename TElem, typename TDim, typename TIdx>
+        struct DevType<BufOmp5<TElem, TDim, TIdx>>
+        {
+            using type = DevOmp5;
+        };
+        //#############################################################################
+        //! The BufOmp5 device get trait specialization.
+        template<typename TElem, typename TDim, typename TIdx>
+        struct GetDev<BufOmp5<TElem, TDim, TIdx>>
+        {
+            ALPAKA_FN_HOST static auto getDev(BufOmp5<TElem, TDim, TIdx> const& buf) -> DevOmp5
+            {
+                return (*buf).m_dev; // returned by value: a copy of the device stored in the implementation
+            }
+        };
+
+        //#############################################################################
+        //! The BufOmp5 dimension getter trait specialization: forwards the TDim template parameter.
+        template<typename TElem, typename TDim, typename TIdx>
+        struct DimType<BufOmp5<TElem, TDim, TIdx>>
+        {
+            using type = TDim;
+        };
+
+        //#############################################################################
+        //! The BufOmp5 memory element type get trait specialization: forwards the TElem template parameter.
+        template<typename TElem, typename TDim, typename TIdx>
+        struct ElemType<BufOmp5<TElem, TDim, TIdx>>
+        {
+            using type = TElem;
+        };
+    } // namespace traits
+    namespace extent
+    {
+        namespace traits
+        {
+            //#############################################################################
+            //! The BufOmp5 extent get trait specialization, enabled only for indices below TDim::value.
+            template<typename TIdxIntegralConst, typename TElem, typename TDim, typename TIdx>
+            struct GetExtent<
+                TIdxIntegralConst,
+                BufOmp5<TElem, TDim, TIdx>,
+                typename std::enable_if<(TDim::value > TIdxIntegralConst::value)>::type>
+            {
+                //! \return Number of elements along dimension TIdxIntegralConst::value of the buffer `extent`.
+                ALPAKA_FN_HOST static auto getExtent(BufOmp5<TElem, TDim, TIdx> const& extent) -> TIdx
+                {
+                    return extent.extentElements()[TIdxIntegralConst::value];
+                }
+            };
+        } // namespace traits
+    } // namespace extent
+    namespace traits
+    {
+        //#############################################################################
+        //! The BufOmp5 native pointer get trait specialization.
+        template<typename TElem, typename TDim, typename TIdx>
+        struct GetPtrNative<BufOmp5<TElem, TDim, TIdx>>
+        {
+            //! \return The pointer stored in the shared implementation, as pointer to const elements.
+            ALPAKA_FN_HOST static auto getPtrNative(BufOmp5<TElem, TDim, TIdx> const& buf) -> TElem const*
+            {
+                return (*buf).m_pMem;
+            }
+            //! \return The pointer stored in the shared implementation, as pointer to mutable elements.
+            ALPAKA_FN_HOST static auto getPtrNative(BufOmp5<TElem, TDim, TIdx>& buf) -> TElem*
+            {
+                return (*buf).m_pMem;
+            }
+        };
+        //#############################################################################
+        //! The BufOmp5 pointer on device get trait specialization.
+        template<typename TElem, typename TDim, typename TIdx>
+        struct GetPtrDev<BufOmp5<TElem, TDim, TIdx>, DevOmp5>
+        {
+            //! \return The buffer pointer (const) if `dev` is the device of `buf`; throws std::runtime_error else.
+            ALPAKA_FN_HOST static auto getPtrDev(BufOmp5<TElem, TDim, TIdx> const& buf, DevOmp5 const& dev)
+                -> TElem const*
+            {
+                if(dev == getDev(buf))
+                {
+                    return (*buf).m_pMem; // m_pMem is a member of the implementation behind operator*
+                }
+                else
+                {
+                    throw std::runtime_error("The buffer is not accessible from the given device!");
+                }
+            }
+            //! \return The buffer pointer (mutable) if `dev` is the device of `buf`; throws std::runtime_error else.
+            ALPAKA_FN_HOST static auto getPtrDev(BufOmp5<TElem, TDim, TIdx>& buf, DevOmp5 const& dev) -> TElem*
+            {
+                if(dev == getDev(buf))
+                {
+                    return (*buf).m_pMem; // m_pMem is a member of the implementation behind operator*
+                }
+                else
+                {
+                    throw std::runtime_error("The buffer is not accessible from the given device!");
+                }
+            }
+        };
+        //#############################################################################
+        //! The BufOmp5 pitch get trait specialization.
+        template<typename TIdxIntegralConst, typename TElem, typename TDim, typename TIdx>
+        struct GetPitchBytes<TIdxIntegralConst, BufOmp5<TElem, TDim, TIdx>>
+        {
+            //! \return The pitch in bytes stored for dimension TIdxIntegralConst::value of the buffer `pitch`.
+            ALPAKA_FN_HOST static auto getPitchBytes(BufOmp5<TElem, TDim, TIdx> const& pitch) -> TIdx
+            {
+                return (*pitch).m_pitchBytes[TIdxIntegralConst::value];
+            }
+        };
+
+        //#############################################################################
+        //! The BufOmp5 1D memory allocation trait specialization.
+        template<typename TElem, typename TIdx>
+        struct BufAlloc<TElem, DimInt<1u>, TIdx, DevOmp5>
+        {
+            //-----------------------------------------------------------------------------
+            //! Allocates width * sizeof(TElem) bytes via omp_target_alloc and wraps them in a BufOmp5.
+            //!
+            //! \param dev The device to allocate on; its iDevice() is passed to omp_target_alloc.
+            //! \param extent The extent; only its width is read here.
+            //! \return The buffer constructed from the device, the allocated pointer and the extent.
+            template<typename TExtent>
+            ALPAKA_FN_HOST static auto allocBuf(DevOmp5 const& dev, TExtent const& extent)
+                -> BufOmp5<TElem, DimInt<1u>, TIdx>
+            {
+                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+                using ResultBuf = BufOmp5<TElem, DimInt<1u>, TIdx>;
+
+                auto const numElems = extent::getWidth(extent);
+                auto const numBytes = numElems * static_cast<TIdx>(sizeof(TElem));
+                auto const deviceId = dev.m_spDevOmp5Impl->iDevice();
+
+                // The result of omp_target_alloc is passed on unchecked.
+                void* const rawMem = omp_target_alloc(static_cast<std::size_t>(numBytes), deviceId);
+
+#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
+                std::cout << __func__ << " ew: " << numElems << " ewb: " << numBytes << " ptr: " << rawMem
+                          << " device: " << deviceId << std::endl;
+#    endif
+                return ResultBuf(dev, reinterpret_cast<TElem*>(rawMem), extent);
+            }
+        };
+
+        //#############################################################################
+        //! The BufOmp5 nD memory allocation trait specialization. \todo Add pitch
+        template<typename TElem, typename TDim, typename TIdx>
+        struct BufAlloc<TElem, TDim, TIdx, DevOmp5>
+        {
+            //-----------------------------------------------------------------------------
+            //! Allocates prod(extent) * sizeof(TElem) bytes via omp_target_alloc and wraps them in a BufOmp5.
+            //!
+            //! \param dev The device to allocate on; its iDevice() is passed to omp_target_alloc.
+            //! \param extent The extent; the product of all its components gives the element count.
+            //! \return The buffer constructed from the device, the allocated pointer and the extent.
+            template<typename TExtent>
+            ALPAKA_FN_HOST static auto allocBuf(DevOmp5 const& dev, TExtent const& extent)
+                -> BufOmp5<TElem, TDim, TIdx>
+            {
+                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+                using ResultBuf = BufOmp5<TElem, TDim, TIdx>;
+
+                auto const numElems = extent::getExtentVec(extent).prod();
+                std::size_t const numBytes = static_cast<std::size_t>(numElems) * sizeof(TElem);
+                auto const deviceId = dev.m_spDevOmp5Impl->iDevice();
+
+                // The result of omp_target_alloc is passed on unchecked.
+                void* const rawMem = omp_target_alloc(numBytes, deviceId);
+#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
+                std::cout << __func__ << " dim: " << TDim::value << " extent: " << extent::getExtentVec(extent)
+                          << " ewb: " << numBytes << " ptr: " << rawMem << " device: " << deviceId
+                          << std::endl;
+#    endif
+                return ResultBuf(dev, reinterpret_cast<TElem*>(rawMem), extent);
+            }
+        };
+
+        //#############################################################################
+        //! The BufOmp5 device memory mapping trait specialization.
+        template<typename TElem, typename TDim, typename TIdx>
+        struct Map<BufOmp5<TElem, TDim, TIdx>, DevOmp5>
+        {
+            //-----------------------------------------------------------------------------
+            //! Does nothing when the buffer already belongs to dev.
+            //!
+            //! \param buf The buffer to map.
+            //! \param dev The device to map the buffer into.
+            //! \throws std::runtime_error if getDev(buf) differs from dev.
+            ALPAKA_FN_HOST static auto map(BufOmp5<TElem, TDim, TIdx> const& buf, DevOmp5 const& dev) -> void
+            {
+                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+                bool const isForeignDevice = (getDev(buf) != dev);
+                if(!isForeignDevice)
+                {
+                    // Same device: nothing has to be mapped.
+                    return;
+                }
+                throw std::runtime_error(
+                    "Mapping memory from one OMP5 device into an other OMP5 device not implemented!");
+            }
+        };
+        //#############################################################################
+        //! The BufOmp5 device memory unmapping trait specialization.
+        template<typename TElem, typename TDim, typename TIdx>
+        struct Unmap<BufOmp5<TElem, TDim, TIdx>, DevOmp5>
+        {
+            //-----------------------------------------------------------------------------
+            //! Does nothing when the buffer already belongs to dev.
+            //!
+            //! \param buf The buffer to unmap.
+            //! \param dev The device to unmap the buffer from.
+            //! \throws std::runtime_error if getDev(buf) differs from dev.
+            ALPAKA_FN_HOST static auto unmap(BufOmp5<TElem, TDim, TIdx> const& buf, DevOmp5 const& dev) -> void
+            {
+                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+                bool const isForeignDevice = (getDev(buf) != dev);
+                if(!isForeignDevice)
+                {
+                    // Same device: nothing has to be unmapped.
+                    return;
+                }
+                throw std::runtime_error(
+                    "Unmapping memory mapped from one OMP5 device into an other OMP5 device not implemented!");
+            }
+        };
+        //#############################################################################
+        //! The BufOmp5 memory pinning trait specialization.
+        template<typename TElem, typename TDim, typename TIdx>
+        struct Pin<BufOmp5<TElem, TDim, TIdx>>
+        {
+            //-----------------------------------------------------------------------------
+            //! Performs no pinning work; the buffer argument is unused.
+            ALPAKA_FN_HOST static auto pin(BufOmp5<TElem, TDim, TIdx>&) -> void
+            {
+                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+                // No explicit pinning in OMP5? GPU would be pinned anyway.
+            }
+        };
+        //#############################################################################
+        //! The BufOmp5 memory unpinning trait specialization.
+        template<typename TElem, typename TDim, typename TIdx>
+        struct Unpin<BufOmp5<TElem, TDim, TIdx>>
+        {
+            //-----------------------------------------------------------------------------
+            //! Performs no unpinning work; the buffer argument is unused.
+            ALPAKA_FN_HOST static auto unpin(BufOmp5<TElem, TDim, TIdx>&) -> void
+            {
+                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+                // No explicit pinning in OMP5? GPU would be pinned anyway.
+            }
+        };
+        //#############################################################################
+        //! The BufOmp5 memory pin state trait specialization.
+        template<typename TElem, typename TDim, typename TIdx>
+        struct IsPinned<BufOmp5<TElem, TDim, TIdx>>
+        {
+            //-----------------------------------------------------------------------------
+            //! \return Always true; the buffer argument is not inspected.
+            ALPAKA_FN_HOST static auto isPinned(BufOmp5<TElem, TDim, TIdx> const&) -> bool
+            {
+                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+                // No explicit pinning in OMP5? GPU would be pinned anyway.
+                return true;
+            }
+        };
+        //#############################################################################
+        //! The BufOmp5 memory prepareForAsyncCopy trait specialization.
+        template<typename TElem, typename TDim, typename TIdx>
+        struct PrepareForAsyncCopy<BufOmp5<TElem, TDim, TIdx>>
+        {
+            //-----------------------------------------------------------------------------
+            //! Performs no preparation work; the buffer argument is unused.
+            ALPAKA_FN_HOST static auto prepareForAsyncCopy(BufOmp5<TElem, TDim, TIdx>&) -> void
+            {
+                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+                // OMP5 device memory is always ready for async copy
+            }
+        };
+
+        //#############################################################################
+        //! The BufOmp5 offset get trait specialization.
+        template<typename TIdxIntegralConst, typename TElem, typename TDim, typename TIdx>
+        struct GetOffset<TIdxIntegralConst, BufOmp5<TElem, TDim, TIdx>>
+        {
+            //-----------------------------------------------------------------------------
+            //! \return Always zero, for every dimension index; the buffer argument is not inspected.
+            ALPAKA_FN_HOST static auto getOffset(BufOmp5<TElem, TDim, TIdx> const&) -> TIdx
+            {
+                return 0u;
+            }
+        };
+
+        //#############################################################################
+        //! The BufOmp5 idx type trait specialization.
+        template<typename TElem, typename TDim, typename TIdx>
+        struct IdxType<BufOmp5<TElem, TDim, TIdx>>
+        {
+            // The index type of the buffer is its TIdx template parameter.
+            using type = TIdx;
+        };
+
+        //#############################################################################
+        //! The BufCpu OMP5 device memory mapping trait specialization.
+        template<typename TElem, typename TDim, typename TIdx>
+        struct Map<BufCpu<TElem, TDim, TIdx>, DevOmp5>
+        {
+            //-----------------------------------------------------------------------------
+            //! Mapping host memory into an OMP5 device is not implemented.
+            //!
+            //! \param buf The host buffer to map.
+            //! \param dev The OMP5 device to map the buffer into.
+            //! \throws std::runtime_error if getDev(buf) differs from dev.
+            ALPAKA_FN_HOST static auto map(BufCpu<TElem, TDim, TIdx>& buf, DevOmp5 const& dev) -> void
+            {
+                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+                //! \todo Clarify this check: it compares the device of a host buffer with an OMP5 device.
+                bool const isForeignDevice = (getDev(buf) != dev);
+                if(!isForeignDevice)
+                {
+                    // Same device: nothing has to be mapped.
+                    return;
+                }
+                // A real implementation would have to make the host allocation accessible from the device.
+                throw std::runtime_error("Mapping host memory to OMP5 device not implemented!");
+            }
+        };
+        //#############################################################################
+        //! The BufCpu OMP5 device memory unmapping trait specialization.
+        template<typename TElem, typename TDim, typename TIdx>
+        struct Unmap<BufCpu<TElem, TDim, TIdx>, DevOmp5>
+        {
+            //-----------------------------------------------------------------------------
+            //! Unmapping host memory from an OMP5 device is not implemented.
+            //!
+            //! \param buf The host buffer to unmap.
+            //! \param dev The OMP5 device to unmap the buffer from.
+            //! \throws std::runtime_error if getDev(buf) differs from dev.
+            ALPAKA_FN_HOST static auto unmap(BufCpu<TElem, TDim, TIdx>& buf, DevOmp5 const& dev) -> void
+            {
+                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+                //! \todo Clarify this check: it compares the device of a host buffer with an OMP5 device.
+                bool const isForeignDevice = (getDev(buf) != dev);
+                if(!isForeignDevice)
+                {
+                    // Same device: nothing has to be unmapped.
+                    return;
+                }
+                throw std::runtime_error("Mapping host memory to OMP5 device not implemented!");
+            }
+        };
+
+        //#############################################################################
+        //! The BufCpu pointer on OMP5 device get trait specialization.
+        template<typename TElem, typename TDim, typename TIdx>
+        struct GetPtrDev<BufCpu<TElem, TDim, TIdx>, DevOmp5>
+        {
+            //-----------------------------------------------------------------------------
+            //! Not implemented: always throws, both arguments are unused.
+            //! \throws std::runtime_error unconditionally.
+            ALPAKA_FN_HOST static auto getPtrDev(BufCpu<TElem, TDim, TIdx> const&, DevOmp5 const&) -> TElem const*
+            {
+                throw std::runtime_error("Mapping host memory to OMP5 device not implemented!");
+            }
+            //-----------------------------------------------------------------------------
+            //! Not implemented: always throws, both arguments are unused.
+            //! \throws std::runtime_error unconditionally.
+            ALPAKA_FN_HOST static auto getPtrDev(BufCpu<TElem, TDim, TIdx>&, DevOmp5 const&) -> TElem*
+            {
+                throw std::runtime_error("Mapping host memory to OMP5 device not implemented!");
+            }
+        };
+    } // namespace traits
+} // namespace alpaka
+
+#    include <alpaka/mem/buf/omp5/Copy.hpp>
+#    include <alpaka/mem/buf/omp5/Set.hpp>
+
+#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/mem/buf/BufUniformCudaHipRt.hpp b/thirdParty/cupla/alpaka/include/alpaka/mem/buf/BufUniformCudaHipRt.hpp
new file mode 100644
index 0000000000..8696c44eb9
--- /dev/null
+++ b/thirdParty/cupla/alpaka/include/alpaka/mem/buf/BufUniformCudaHipRt.hpp
@@ -0,0 +1,556 @@
+/* Copyright 2019 Alexander Matthes, Benjamin Worpitz, Matthias Werner, René Widera
+ *
+ * This file is part of alpaka.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#pragma once
+
+#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
+
+#    include <alpaka/core/BoostPredef.hpp>
+
+#    if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && !BOOST_LANG_CUDA
+#        error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
+#    endif
+
+#    if defined(ALPAKA_ACC_GPU_HIP_ENABLED) && !BOOST_LANG_HIP
+#        error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
+#    endif
+
+// Backend specific includes.
+#    if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
+#        include <alpaka/core/Cuda.hpp>
+#    else
+#        include <alpaka/core/Hip.hpp>
+#    endif
+
+#    include <alpaka/core/Assert.hpp>
+#    include <alpaka/dev/DevUniformCudaHipRt.hpp>
+#    include <alpaka/dev/Traits.hpp>
+#    include <alpaka/dim/DimIntegralConst.hpp>
+#    include <alpaka/mem/buf/Traits.hpp>
+#    include <alpaka/vec/Vec.hpp>
+
+#    include <functional>
+#    include <memory>
+#    include <type_traits>
+
+namespace alpaka
+{
+    class DevUniformCudaHipRt;
+
+    template<typename TElem, typename TDim, typename TIdx>
+    class BufCpu;
+
+    //#############################################################################
+    //! The CUDA/HIP memory buffer.
+    template<typename TElem, typename TDim, typename TIdx>
+    class BufUniformCudaHipRt
+    {
+        static_assert(
+            !std::is_const<TElem>::value,
+            "The elem type of the buffer can not be const because the C++ Standard forbids containers of const "
+            "elements!");
+        static_assert(!std::is_const<TIdx>::value, "The idx type of the buffer can not be const!");
+
+    private:
+        using Elem = TElem;
+        using Dim = TDim;
+
+    public:
+        //-----------------------------------------------------------------------------
+        //! Wraps already allocated memory; it is released through freeBuffer by the shared pointer's deleter.
+        //!
+        //! \param dev The device the buffer belongs to; a copy is stored in the buffer and in the deleter.
+        //! \param pMem The pointer handed to the shared pointer.
+        //! \param pitchBytes The pitch stored in m_pitchBytes.
+        //! \param extent The extent; its dimensionality and idx type have to match TDim and TIdx.
+        template<typename TExtent>
+        ALPAKA_FN_HOST BufUniformCudaHipRt(
+            DevUniformCudaHipRt const& dev,
+            TElem* const pMem,
+            TIdx const& pitchBytes,
+            TExtent const& extent)
+            : m_dev(dev)
+            , m_extentElements(extent::getExtentVecEnd<TDim>(extent))
+            , m_spMem(
+                  pMem,
+                  // NOTE: The deleter keeps its own copy of the device. The buffer object can be copied and the
+                  // original destroyed, so neither a reference to m_dev nor the this pointer may be captured: they
+                  // are not always valid at the time of destruction.
+                  [ownerDev = m_dev](TElem* const ptr) { BufUniformCudaHipRt::freeBuffer(ptr, ownerDev); })
+            , m_pitchBytes(pitchBytes)
+        {
+            ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+            static_assert(
+                std::is_same<TIdx, Idx<TExtent>>::value,
+                "The idx type of TExtent and the TIdx template parameter have to be identical!");
+            static_assert(
+                TDim::value == alpaka::Dim<TExtent>::value,
+                "The dimensionality of TExtent and the dimensionality of the TDim template parameter have to be "
+                "identical!");
+        }
+
+    private:
+        //-----------------------------------------------------------------------------
+        //! Deleter of the shared memory pointer: selects the device, then frees the memory.
+        //!
+        //! \param memPtr The pointer to free.
+        //! \param dev The device that is made current before freeing.
+        ALPAKA_FN_HOST static auto freeBuffer(TElem* const memPtr, DevUniformCudaHipRt const& dev) -> void
+        {
+            ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+            // Make the owning device current.
+            ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(ALPAKA_API_PREFIX(SetDevice)(dev.m_iDevice));
+            // Release the memory.
+            ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(ALPAKA_API_PREFIX(Free)(reinterpret_cast<void*>(memPtr)));
+        }
+
+    public:
+        // NOTE: m_dev is declared before m_spMem on purpose: members are destroyed in reverse declaration order, so
+        // the device outlives the memory pointer.
+        DevUniformCudaHipRt m_dev;
+        Vec<TDim, TIdx> m_extentElements;
+        std::shared_ptr<TElem> m_spMem;
+        TIdx m_pitchBytes;
+    };
+
+    namespace traits
+    {
+        //#############################################################################
+        //! The BufUniformCudaHipRt device type trait specialization.
+        //! The device type of the buffer is DevUniformCudaHipRt.
+        template<typename TElem, typename TDim, typename TIdx>
+        struct DevType<BufUniformCudaHipRt<TElem, TDim, TIdx>>
+        {
+            using type = DevUniformCudaHipRt;
+        };
+        //#############################################################################
+        //! The BufUniformCudaHipRt device get trait specialization.
+        template<typename TElem, typename TDim, typename TIdx>
+        struct GetDev<BufUniformCudaHipRt<TElem, TDim, TIdx>>
+        {
+            //! \param buf The buffer to query.
+            //! \return A copy of the device stored in the buffer (m_dev).
+            ALPAKA_FN_HOST static auto getDev(BufUniformCudaHipRt<TElem, TDim, TIdx> const& buf) -> DevUniformCudaHipRt
+            {
+                return buf.m_dev;
+            }
+        };
+
+        //#############################################################################
+        //! The BufUniformCudaHipRt dimension getter trait specialization.
+        //! The dimensionality of the buffer is its TDim template parameter.
+        template<typename TElem, typename TDim, typename TIdx>
+        struct DimType<BufUniformCudaHipRt<TElem, TDim, TIdx>>
+        {
+            using type = TDim;
+        };
+
+        //#############################################################################
+        //! The BufUniformCudaHipRt memory element type get trait specialization.
+        //! The element type of the buffer is its TElem template parameter.
+        template<typename TElem, typename TDim, typename TIdx>
+        struct ElemType<BufUniformCudaHipRt<TElem, TDim, TIdx>>
+        {
+            using type = TElem;
+        };
+    } // namespace traits
+    namespace extent
+    {
+        namespace traits
+        {
+            //#############################################################################
+            //! The BufUniformCudaHipRt extent get trait specialization.
+            //! Only enabled for dimension indices smaller than the dimensionality of the buffer.
+            template<typename TIdxIntegralConst, typename TElem, typename TDim, typename TIdx>
+            struct GetExtent<
+                TIdxIntegralConst,
+                BufUniformCudaHipRt<TElem, TDim, TIdx>,
+                std::enable_if_t<(TDim::value > TIdxIntegralConst::value)>>
+            {
+                //-----------------------------------------------------------------------------
+                //! \param buf The buffer to query.
+                //! \return The m_extentElements entry at index TIdxIntegralConst::value.
+                ALPAKA_FN_HOST static auto getExtent(BufUniformCudaHipRt<TElem, TDim, TIdx> const& buf) -> TIdx
+                {
+                    auto const& extentElements = buf.m_extentElements;
+                    return extentElements[TIdxIntegralConst::value];
+                }
+            };
+        } // namespace traits
+    } // namespace extent
+    namespace traits
+    {
+        //#############################################################################
+        //! The BufUniformCudaHipRt native pointer get trait specialization.
+        template<typename TElem, typename TDim, typename TIdx>
+        struct GetPtrNative<BufUniformCudaHipRt<TElem, TDim, TIdx>>
+        {
+            //-----------------------------------------------------------------------------
+            //! \param buf The buffer to query.
+            //! \return The raw pointer held by the buffer's shared pointer, as pointer to const.
+            ALPAKA_FN_HOST static auto getPtrNative(BufUniformCudaHipRt<TElem, TDim, TIdx> const& buf) -> TElem const*
+            {
+                return buf.m_spMem.get();
+            }
+            //-----------------------------------------------------------------------------
+            //! \param buf The buffer to query.
+            //! \return The raw pointer held by the buffer's shared pointer.
+            ALPAKA_FN_HOST static auto getPtrNative(BufUniformCudaHipRt<TElem, TDim, TIdx>& buf) -> TElem*
+            {
+                return buf.m_spMem.get();
+            }
+        };
+        //#############################################################################
+        //! The BufUniformCudaHipRt pointer on device get trait specialization.
+        template<typename TElem, typename TDim, typename TIdx>
+        struct GetPtrDev<BufUniformCudaHipRt<TElem, TDim, TIdx>, DevUniformCudaHipRt>
+        {
+            //-----------------------------------------------------------------------------
+            //! Returns the read-only memory pointer of the buffer for the given device.
+            //!
+            //! \param buf The buffer to query.
+            //! \param dev The device the pointer is requested for.
+            //! \return The raw pointer held by the buffer's shared pointer.
+            //! \throws std::runtime_error if dev does not compare equal to getDev(buf).
+            ALPAKA_FN_HOST static auto getPtrDev(
+                BufUniformCudaHipRt<TElem, TDim, TIdx> const& buf,
+                DevUniformCudaHipRt const& dev) -> TElem const*
+            {
+                if(!(dev == getDev(buf)))
+                {
+                    throw std::runtime_error("The buffer is not accessible from the given device!");
+                }
+                return buf.m_spMem.get();
+            }
+            //-----------------------------------------------------------------------------
+            //! Returns the writable memory pointer of the buffer for the given device.
+            //!
+            //! \param buf The buffer to query.
+            //! \param dev The device the pointer is requested for.
+            //! \return The raw pointer held by the buffer's shared pointer.
+            //! \throws std::runtime_error if dev does not compare equal to getDev(buf).
+            ALPAKA_FN_HOST static auto getPtrDev(
+                BufUniformCudaHipRt<TElem, TDim, TIdx>& buf,
+                DevUniformCudaHipRt const& dev) -> TElem*
+            {
+                if(!(dev == getDev(buf)))
+                {
+                    throw std::runtime_error("The buffer is not accessible from the given device!");
+                }
+                return buf.m_spMem.get();
+            }
+        };
+        //#############################################################################
+        //! The BufUniformCudaHipRt pitch get trait specialization.
+        // Specialized only for the dimension index TDim::value - 1.
+        template<typename TElem, typename TDim, typename TIdx>
+        struct GetPitchBytes<DimInt<TDim::value - 1u>, BufUniformCudaHipRt<TElem, TDim, TIdx>>
+        {
+            //-----------------------------------------------------------------------------
+            //! \param buf The buffer to query.
+            //! \return The pitch stored in the buffer (m_pitchBytes).
+            ALPAKA_FN_HOST static auto getPitchBytes(BufUniformCudaHipRt<TElem, TDim, TIdx> const& buf) -> TIdx
+            {
+                return buf.m_pitchBytes;
+            }
+        };
+
+        //#############################################################################
+        //! The CUDA/HIP 1D memory allocation trait specialization.
+        template<typename TElem, typename TIdx>
+        struct BufAlloc<TElem, DimInt<1u>, TIdx, DevUniformCudaHipRt>
+        {
+            //-----------------------------------------------------------------------------
+            //! Allocates width * sizeof(TElem) bytes of linear memory on the device and wraps them in a buffer.
+            //!
+            //! \param dev The device to allocate on; it is made the current device first.
+            //! \param extent The extent; only its width is read here.
+            //! \return The buffer constructed from the device, the allocated pointer, the row size in bytes as pitch
+            //!         and the extent.
+            template<typename TExtent>
+            ALPAKA_FN_HOST static auto allocBuf(DevUniformCudaHipRt const& dev, TExtent const& extent)
+                -> BufUniformCudaHipRt<TElem, DimInt<1u>, TIdx>
+            {
+                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+                auto const width(extent::getWidth(extent));
+                auto const widthBytes(width * static_cast<TIdx>(sizeof(TElem)));
+
+                // Set the current device.
+                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(ALPAKA_API_PREFIX(SetDevice)(dev.m_iDevice));
+                // Allocate the buffer on this device.
+                // The pointer is initialized like in the 2D and 3D allocators, so it never holds an indeterminate
+                // value.
+                void* memPtr = nullptr;
+                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
+                    ALPAKA_API_PREFIX(Malloc)(&memPtr, static_cast<std::size_t>(widthBytes)));
+
+#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
+                std::cout << __func__ << " ew: " << width << " ewb: " << widthBytes << " ptr: " << memPtr << std::endl;
+#    endif
+                return BufUniformCudaHipRt<TElem, DimInt<1u>, TIdx>(
+                    dev,
+                    reinterpret_cast<TElem*>(memPtr),
+                    static_cast<TIdx>(widthBytes),
+                    extent);
+            }
+        };
+        //#############################################################################
+        //! The CUDA/HIP 2D memory allocation trait specialization.
+        template<typename TElem, typename TIdx>
+        struct BufAlloc<TElem, DimInt<2u>, TIdx, DevUniformCudaHipRt>
+        {
+            //-----------------------------------------------------------------------------
+            //! Allocates pitched 2D memory on the device and wraps it in a buffer.
+            //!
+            //! \param dev The device to allocate on; it is made the current device first.
+            //! \param extent The extent; its width and height are read here.
+            //! \return The buffer constructed from the device, the allocated pointer, the pitch reported by the
+            //!         allocation and the extent. With HIP enabled and a zero width or height nothing is allocated
+            //!         and the pointer and pitch stay nullptr and 0.
+            template<typename TExtent>
+            ALPAKA_FN_HOST static auto allocBuf(DevUniformCudaHipRt const& dev, TExtent const& extent)
+                -> BufUniformCudaHipRt<TElem, DimInt<2u>, TIdx>
+            {
+                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+                using ResultBuf = BufUniformCudaHipRt<TElem, DimInt<2u>, TIdx>;
+
+                auto const width = extent::getWidth(extent);
+                auto const height = extent::getHeight(extent);
+                auto const widthBytes = width * static_cast<TIdx>(sizeof(TElem));
+
+                std::size_t pitchBytes = 0u;
+                void* memPtr = nullptr;
+#    ifdef ALPAKA_ACC_GPU_HIP_ENABLED
+                // FIXME: HIP cannot handle zero-size input (throws Unknown Error)
+                if(width != 0 && height != 0)
+#    endif
+                {
+                    // Make dev the current device.
+                    ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(ALPAKA_API_PREFIX(SetDevice)(dev.m_iDevice));
+
+                    // Allocate the pitched memory on this device.
+                    ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(ALPAKA_API_PREFIX(MallocPitch)(
+                        &memPtr,
+                        &pitchBytes,
+                        static_cast<std::size_t>(widthBytes),
+                        static_cast<std::size_t>(height)));
+                    // The pitch has to cover at least one row unless the buffer is empty.
+                    ALPAKA_ASSERT(pitchBytes >= static_cast<std::size_t>(widthBytes) || (width * height) == 0);
+                }
+
+#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
+                std::cout << __func__ << " ew: " << width << " eh: " << height << " ewb: " << widthBytes
+                          << " ptr: " << memPtr << " pitch: " << pitchBytes << std::endl;
+#    endif
+                return ResultBuf(dev, reinterpret_cast<TElem*>(memPtr), static_cast<TIdx>(pitchBytes), extent);
+            }
+        };
+        //#############################################################################
+        //! The CUDA/HIP 3D memory allocation trait specialization.
+        template<typename TElem, typename TIdx>
+        struct BufAlloc<TElem, DimInt<3u>, TIdx, DevUniformCudaHipRt>
+        {
+            //-----------------------------------------------------------------------------
+            //! Allocates pitched 3D memory on the device and wraps it in a buffer.
+            //!
+            //! \param dev The device to allocate on; it is made the current device first.
+            //! \param extent The extent; its width, height and depth are read here.
+            //! \return The buffer constructed from the device, the allocated pointer, the pitch reported by the
+            //!         allocation and the extent. With HIP enabled and any zero extent component nothing is
+            //!         allocated and the pointer and pitch stay nullptr and 0.
+            template<typename TExtent>
+            ALPAKA_FN_HOST static auto allocBuf(DevUniformCudaHipRt const& dev, TExtent const& extent)
+                -> BufUniformCudaHipRt<TElem, DimInt<3u>, TIdx>
+            {
+                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+                using ResultBuf = BufUniformCudaHipRt<TElem, DimInt<3u>, TIdx>;
+
+                // The runtime extent stores the width in bytes, the height and depth in elements.
+                auto const widthBytes = extent::getWidth(extent) * static_cast<TIdx>(sizeof(TElem));
+                ALPAKA_API_PREFIX(Extent) const extentVal = ALPAKA_PP_CONCAT(make_, ALPAKA_API_PREFIX(Extent))(
+                    static_cast<std::size_t>(widthBytes),
+                    static_cast<std::size_t>(extent::getHeight(extent)),
+                    static_cast<std::size_t>(extent::getDepth(extent)));
+
+                ALPAKA_API_PREFIX(PitchedPtr) pitchedPtrVal;
+                pitchedPtrVal.ptr = nullptr;
+#    ifdef ALPAKA_ACC_GPU_HIP_ENABLED
+                pitchedPtrVal.pitch = 0u;
+                // FIXME: HIP cannot handle zero-size input
+                if(extentVal.width != 0 && extentVal.height != 0 && extentVal.depth != 0)
+#    endif
+                {
+                    // Make dev the current device.
+                    ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(ALPAKA_API_PREFIX(SetDevice)(dev.m_iDevice));
+                    // Allocate the pitched memory on this device.
+                    ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(ALPAKA_API_PREFIX(Malloc3D)(&pitchedPtrVal, extentVal));
+                }
+
+#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
+                std::cout << __func__ << " ew: " << extent::getWidth(extent) << " eh: " << extentVal.height
+                          << " ed: " << extentVal.depth << " ewb: " << extentVal.width << " ptr: " << pitchedPtrVal.ptr
+                          << " pitch: " << pitchedPtrVal.pitch << " wb: " << pitchedPtrVal.xsize
+                          << " h: " << pitchedPtrVal.ysize << std::endl;
+#    endif
+
+                return ResultBuf(
+                    dev,
+                    reinterpret_cast<TElem*>(pitchedPtrVal.ptr),
+                    static_cast<TIdx>(pitchedPtrVal.pitch),
+                    extent);
+            }
+        };
+        //#############################################################################
+        //! The BufUniformCudaHipRt CUDA/HIP device memory mapping trait specialization.
+        template<typename TElem, typename TDim, typename TIdx>
+        struct Map<BufUniformCudaHipRt<TElem, TDim, TIdx>, DevUniformCudaHipRt>
+        {
+            //-----------------------------------------------------------------------------
+            //! Does nothing when the buffer already belongs to dev.
+            //!
+            //! \param buf The buffer to map.
+            //! \param dev The device to map the buffer into.
+            //! \throws std::runtime_error if getDev(buf) differs from dev.
+            ALPAKA_FN_HOST static auto map(
+                BufUniformCudaHipRt<TElem, TDim, TIdx> const& buf,
+                DevUniformCudaHipRt const& dev) -> void
+            {
+                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+                bool const isForeignDevice = (getDev(buf) != dev);
+                if(!isForeignDevice)
+                {
+                    // Same device: nothing has to be mapped.
+                    return;
+                }
+                throw std::runtime_error(
+                    "Mapping memory from one CUDA/HIP device into an other CUDA/HIP device not implemented!");
+            }
+        };
+        //#############################################################################
+        //! The BufUniformCudaHipRt CUDA/HIP device memory unmapping trait specialization.
+        template<typename TElem, typename TDim, typename TIdx>
+        struct Unmap<BufUniformCudaHipRt<TElem, TDim, TIdx>, DevUniformCudaHipRt>
+        {
+            //-----------------------------------------------------------------------------
+            //! Does nothing when the buffer already belongs to dev.
+            //!
+            //! \param buf The buffer to unmap.
+            //! \param dev The device to unmap the buffer from.
+            //! \throws std::runtime_error if getDev(buf) differs from dev.
+            ALPAKA_FN_HOST static auto unmap(
+                BufUniformCudaHipRt<TElem, TDim, TIdx> const& buf,
+                DevUniformCudaHipRt const& dev) -> void
+            {
+                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+                bool const isForeignDevice = (getDev(buf) != dev);
+                if(!isForeignDevice)
+                {
+                    // Same device: nothing has to be unmapped.
+                    return;
+                }
+                throw std::runtime_error("Unmapping memory mapped from one CUDA/HIP device into an other CUDA/HIP "
+                                         "device not implemented!");
+            }
+        };
+        //#############################################################################
+        //! The BufUniformCudaHipRt memory pinning trait specialization.
+        template<typename TElem, typename TDim, typename TIdx>
+        struct Pin<BufUniformCudaHipRt<TElem, TDim, TIdx>>
+        {
+            //-----------------------------------------------------------------------------
+            //! Performs no pinning work; the buffer argument is unused.
+            ALPAKA_FN_HOST static auto pin(BufUniformCudaHipRt<TElem, TDim, TIdx>&) -> void
+            {
+                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+                // CUDA/HIP device memory is always pinned, it can not be swapped out.
+            }
+        };
+        //#############################################################################
+        //! The BufUniformCudaHipRt memory unpinning trait specialization.
+        template<typename TElem, typename TDim, typename TIdx>
+        struct Unpin<BufUniformCudaHipRt<TElem, TDim, TIdx>>
+        {
+            //-----------------------------------------------------------------------------
+            //! Performs no unpinning work; the buffer argument is unused.
+            ALPAKA_FN_HOST static auto unpin(BufUniformCudaHipRt<TElem, TDim, TIdx>&) -> void
+            {
+                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+                // CUDA/HIP device memory is always pinned, it can not be swapped out.
+            }
+        };
+        //#############################################################################
+        //! The BufUniformCudaHipRt memory pin state trait specialization.
+        template<typename TElem, typename TDim, typename TIdx>
+        struct IsPinned<BufUniformCudaHipRt<TElem, TDim, TIdx>>
+        {
+            //-----------------------------------------------------------------------------
+            //! \return Always true; the buffer argument is not inspected.
+            ALPAKA_FN_HOST static auto isPinned(BufUniformCudaHipRt<TElem, TDim, TIdx> const&) -> bool
+            {
+                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+                // CUDA/HIP device memory is always pinned, it can not be swapped out.
+                return true;
+            }
+        };
+        //#############################################################################
+        //! The BufUniformCudaHipRt memory prepareForAsyncCopy trait specialization. No preparation is performed.
+        template<typename TElem, typename TDim, typename TIdx>
+        struct PrepareForAsyncCopy<BufUniformCudaHipRt<TElem, TDim, TIdx>>
+        {
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto prepareForAsyncCopy(BufUniformCudaHipRt<TElem, TDim, TIdx>&) -> void
+            {
+                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+                // Intentionally empty: CUDA/HIP device memory is always ready for async copy.
+            }
+        };
+
+        //#############################################################################
+        //! The BufUniformCudaHipRt offset get trait specialization. The offset is zero for every dimension index.
+        template<typename TIdxIntegralConst, typename TElem, typename TDim, typename TIdx>
+        struct GetOffset<TIdxIntegralConst, BufUniformCudaHipRt<TElem, TDim, TIdx>>
+        {
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto getOffset(BufUniformCudaHipRt<TElem, TDim, TIdx> const&) -> TIdx
+            {
+                return 0u;
+            }
+        };
+
+        //#############################################################################
+        //! The BufUniformCudaHipRt idx type trait specialization. The index type is the TIdx template argument.
+        template<typename TElem, typename TDim, typename TIdx>
+        struct IdxType<BufUniformCudaHipRt<TElem, TDim, TIdx>>
+        {
+            using type = TIdx;
+        };
+
+        //#############################################################################
+        //! The BufCpu CUDA/HIP device memory mapping trait specialization (uses HostRegister with HostRegisterMapped).
+        template<typename TElem, typename TDim, typename TIdx>
+        struct Map<BufCpu<TElem, TDim, TIdx>, DevUniformCudaHipRt>
+        {
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto map(BufCpu<TElem, TDim, TIdx>& buf, DevUniformCudaHipRt const& dev) -> void
+            {
+                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+                // If it is already the same device, nothing has to be mapped.
+                if(getDev(buf) != dev)
+                {
+                    // cuda/hip-HostRegisterMapped:
+                    //   Maps the allocation into the CUDA/HIP address space. The device pointer to the memory may be
+                    //   obtained by calling cudaHostGetDevicePointer(). This feature is available only on GPUs with
+                    //   compute capability greater than or equal to 1.1.
+                    ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(ALPAKA_API_PREFIX(HostRegister)(
+                        const_cast<void*>(reinterpret_cast<void const*>(getPtrNative(buf))),
+                        extent::getExtentProduct(buf) * sizeof(Elem<BufCpu<TElem, TDim, TIdx>>),
+                        ALPAKA_API_PREFIX(HostRegisterMapped)));
+                }
+            }
+        };
+        //#############################################################################
+        //! The BufCpu CUDA/HIP device memory unmapping trait specialization (uses HostUnregister).
+        template<typename TElem, typename TDim, typename TIdx>
+        struct Unmap<BufCpu<TElem, TDim, TIdx>, DevUniformCudaHipRt>
+        {
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto unmap(BufCpu<TElem, TDim, TIdx>& buf, DevUniformCudaHipRt const& dev) -> void
+            {
+                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+                if(getDev(buf) != dev)
+                {
+                    // Unregisters the range starting at the buffer's native pointer and makes it pageable again.
+                    // \FIXME: If the memory has been pinned separately before, this also destroys that pinning state.
+                    ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(ALPAKA_API_PREFIX(HostUnregister)(
+                        const_cast<void*>(reinterpret_cast<void const*>(getPtrNative(buf)))));
+                }
+                // If it is already the same device, nothing has to be unmapped.
+            }
+        };
+
+        //#############################################################################
+        //! The BufCpu pointer on CUDA/HIP device get trait specialization (queried via HostGetDevicePointer).
+        template<typename TElem, typename TDim, typename TIdx>
+        struct GetPtrDev<BufCpu<TElem, TDim, TIdx>, DevUniformCudaHipRt>
+        {
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto getPtrDev(BufCpu<TElem, TDim, TIdx> const& buf, DevUniformCudaHipRt const&)
+                -> TElem const*
+            {
+                // TODO: Check if the memory is mapped at all! No such check precedes the runtime call below.
+                TElem* pDev(nullptr);
+
+                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(ALPAKA_API_PREFIX(HostGetDevicePointer)(
+                    &pDev,
+                    const_cast<void*>(reinterpret_cast<void const*>(getPtrNative(buf))),
+                    0));
+
+                return pDev;
+            }
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto getPtrDev(BufCpu<TElem, TDim, TIdx>& buf, DevUniformCudaHipRt const&) -> TElem*
+            {
+                // TODO: Check if the memory is mapped at all! No such check precedes the runtime call below.
+                TElem* pDev(nullptr);
+
+                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(ALPAKA_API_PREFIX(HostGetDevicePointer)(&pDev, getPtrNative(buf), 0));
+
+                return pDev;
+            }
+        };
+    } // namespace traits
+} // namespace alpaka
+
+#    include <alpaka/mem/buf/uniformCudaHip/Copy.hpp>
+#    include <alpaka/mem/buf/uniformCudaHip/Set.hpp>
+
+#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/mem/buf/SetKernel.hpp b/thirdParty/cupla/alpaka/include/alpaka/mem/buf/SetKernel.hpp
new file mode 100644
index 0000000000..085851784b
--- /dev/null
+++ b/thirdParty/cupla/alpaka/include/alpaka/mem/buf/SetKernel.hpp
@@ -0,0 +1,63 @@
+/* Copyright 2020 Jeffrey Kelling
+ *
+ * This file is part of Alpaka.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#pragma once
+
+#include <alpaka/idx/Accessors.hpp>
+#include <alpaka/idx/MapIdx.hpp>
+#include <alpaka/idx/Traits.hpp>
+#include <alpaka/mem/buf/Traits.hpp>
+#include <alpaka/meta/Fold.hpp>
+
+namespace alpaka
+{
+    //#############################################################################
+    //! The device-independent N-dimensional memory set kernel, writing single bytes.
+    class MemSetKernel
+    {
+    public:
+        //-----------------------------------------------------------------------------
+        //! The kernel entry point. All but the last element of threadElemExtent must be one.
+        //!
+        //! \tparam TAcc The accelerator environment to be executed on.
+        //! \tparam TExtent The extent type.
+        //! \tparam TPitch The pitch type.
+        //! \param acc The accelerator to be executed on.
+        //! \param val The byte value to set.
+        //! \param dst The target memory pointer.
+        //! \param extent The area to set; its last dimension is consumed as a count of bytes.
+        //! \param pitch The pitches used by mapIdxPitchBytes to linearize the thread's first element index.
+        ALPAKA_NO_HOST_ACC_WARNING
+        template<typename TAcc, typename TExtent, typename TPitch>
+        ALPAKA_FN_ACC auto operator()(
+            TAcc const& acc,
+            std::uint8_t const val,
+            std::uint8_t* dst,
+            TExtent extent,
+            TPitch pitch) const -> void
+        {
+            using Idx = typename alpaka::traits::IdxType<TExtent>::type;
+            auto const gridThreadIdx(alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc));
+            auto const threadElemExtent(alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc));
+            auto const idxThreadFirstElem = getIdxThreadFirstElem(acc, gridThreadIdx, threadElemExtent);
+            auto idx = mapIdxPitchBytes<1u, Dim<TAcc>::value>(idxThreadFirstElem, pitch)[0];
+            constexpr auto lastDim = Dim<TAcc>::value - 1;
+            const auto lastIdx = idx
+                + std::min(threadElemExtent[lastDim], static_cast<Idx>(extent[lastDim] - idxThreadFirstElem[lastDim]));
+            // Write only if the thread's first element lies inside the extent in every dimension.
+            if((idxThreadFirstElem < extent).foldrAll(std::logical_and<bool>()))
+            {
+                for(; idx < lastIdx; ++idx)
+                {
+                    *(dst + idx) = val;
+                }
+            }
+        }
+    };
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/include/alpaka/mem/buf/Traits.hpp b/thirdParty/cupla/alpaka/include/alpaka/mem/buf/Traits.hpp
index d4baea9ceb..dfe1bf56cc 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/mem/buf/Traits.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/mem/buf/Traits.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Alexander Matthes, Benjamin Worpitz
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -9,254 +9,139 @@
 
 #pragma once
 
-#include <alpaka/mem/view/Traits.hpp>
-
 #include <alpaka/core/Common.hpp>
-
-#include <boost/config.hpp>
+#include <alpaka/mem/view/Traits.hpp>
 
 namespace alpaka
 {
     //-----------------------------------------------------------------------------
-    //! The memory specifics.
-    namespace mem
+    //! The buffer traits.
+    namespace traits
     {
-        //-----------------------------------------------------------------------------
-        //! The buffer specifics.
-        namespace buf
-        {
-            //-----------------------------------------------------------------------------
-            //! The buffer traits.
-            namespace traits
-            {
-                //#############################################################################
-                //! The memory buffer type trait.
-                template<
-                    typename TDev,
-                    typename TElem,
-                    typename TDim,
-                    typename TIdx,
-                    typename TSfinae = void>
-                struct BufType;
+        //#############################################################################
+        //! The memory buffer type trait.
+        template<typename TDev, typename TElem, typename TDim, typename TIdx, typename TSfinae = void>
+        struct BufType;
 
-                //#############################################################################
-                //! The memory allocator trait.
-                template<
-                    typename TElem,
-                    typename TDim,
-                    typename TIdx,
-                    typename TDev,
-                    typename TSfinae = void>
-                struct Alloc;
+        //#############################################################################
+        //! The memory allocator trait.
+        template<typename TElem, typename TDim, typename TIdx, typename TDev, typename TSfinae = void>
+        struct BufAlloc;
 
-                //#############################################################################
-                //! The memory mapping trait.
-                template<
-                    typename TBuf,
-                    typename TDev,
-                    typename TSfinae = void>
-                struct Map;
+        //#############################################################################
+        //! The memory mapping trait.
+        template<typename TBuf, typename TDev, typename TSfinae = void>
+        struct Map;
 
-                //#############################################################################
-                //! The memory unmapping trait.
-                template<
-                    typename TBuf,
-                    typename TDev,
-                    typename TSfinae = void>
-                struct Unmap;
+        //#############################################################################
+        //! The memory unmapping trait.
+        template<typename TBuf, typename TDev, typename TSfinae = void>
+        struct Unmap;
 
-                //#############################################################################
-                //! The memory pinning trait.
-                template<
-                    typename TBuf,
-                    typename TSfinae = void>
-                struct Pin;
+        //#############################################################################
+        //! The memory pinning trait.
+        template<typename TBuf, typename TSfinae = void>
+        struct Pin;
 
-                //#############################################################################
-                //! The memory unpinning trait.
-                template<
-                    typename TBuf,
-                    typename TSfinae = void>
-                struct Unpin;
+        //#############################################################################
+        //! The memory unpinning trait.
+        template<typename TBuf, typename TSfinae = void>
+        struct Unpin;
 
-                //#############################################################################
-                //! The memory pin state trait.
-                template<
-                    typename TBuf,
-                    typename TSfinae = void>
-                struct IsPinned;
+        //#############################################################################
+        //! The memory pin state trait.
+        template<typename TBuf, typename TSfinae = void>
+        struct IsPinned;
 
-                //#############################################################################
-                //! The memory prepareForAsyncCopy trait.
-                template<
-                    typename TBuf,
-                    typename TSfinae = void>
-                struct PrepareForAsyncCopy;
-            }
+        //#############################################################################
+        //! The memory prepareForAsyncCopy trait.
+        template<typename TBuf, typename TSfinae = void>
+        struct PrepareForAsyncCopy;
+    } // namespace traits
 
-            //#############################################################################
-            //! The memory buffer type trait alias template to remove the ::type.
-            template<
-                typename TDev,
-                typename TElem,
-                typename TDim,
-                typename TIdx>
-            using Buf = typename traits::BufType<TDev, TElem, TDim, TIdx>::type;
+    //#############################################################################
+    //! The memory buffer type trait alias template to remove the ::type.
+    template<typename TDev, typename TElem, typename TDim, typename TIdx>
+    using Buf = typename traits::BufType<alpaka::Dev<TDev>, TElem, TDim, TIdx>::type;
 
-            //-----------------------------------------------------------------------------
-            //! Allocates memory on the given device.
-            //!
-            //! \tparam TElem The element type of the returned buffer.
-            //! \tparam TExtent The extent of the buffer.
-            //! \tparam TDev The type of device the buffer is allocated on.
-            //! \param dev The device to allocate the buffer on.
-            //! \param extent The extent of the buffer.
-            //! \return The newly allocated buffer.
-            template<
-                typename TElem,
-                typename TIdx,
-                typename TExtent,
-                typename TDev>
-            ALPAKA_FN_HOST auto alloc(
-                TDev const & dev,
-                TExtent const & extent = TExtent())
-#ifdef BOOST_NO_CXX14_RETURN_TYPE_DEDUCTION
-            -> decltype(
-                traits::Alloc<
-                    TElem,
-                    dim::Dim<TExtent>,
-                    TIdx,
-                    TDev>
-                ::alloc(
-                    dev,
-                    extent))
-#endif
-            {
-                return
-                    traits::Alloc<
-                        TElem,
-                        dim::Dim<TExtent>,
-                        TIdx,
-                        TDev>
-                    ::alloc(
-                        dev,
-                        extent);
-            }
-            //-----------------------------------------------------------------------------
-            //! Maps the buffer into the memory of the given device.
-            //!
-            //! \tparam TBuf The buffer type.
-            //! \tparam TDev The device type.
-            //! \param buf The buffer to map into the device memory.
-            //! \param dev The device to map the buffer into.
-            template<
-                typename TBuf,
-                typename TDev>
-            ALPAKA_FN_HOST auto map(
-                TBuf & buf,
-                TDev const & dev)
-            -> void
-            {
-                return
-                    traits::Map<
-                        TBuf,
-                        TDev>
-                    ::map(
-                        buf,
-                        dev);
-            }
-            //-----------------------------------------------------------------------------
-            //! Unmaps the buffer from the memory of the given device.
-            //!
-            //! \tparam TBuf The buffer type.
-            //! \tparam TDev The device type.
-            //! \param buf The buffer to unmap from the device memory.
-            //! \param dev The device to unmap the buffer from.
-            template<
-                typename TBuf,
-                typename TDev>
-            ALPAKA_FN_HOST auto unmap(
-                TBuf & buf,
-                TDev const & dev)
-            -> void
-            {
-                return
-                    traits::Unmap<
-                        TBuf,
-                        TDev>
-                    ::unmap(
-                        buf,
-                        dev);
-            }
-            //-----------------------------------------------------------------------------
-            //! Pins the buffer.
-            //!
-            //! \tparam TBuf The buffer type.
-            //! \param buf The buffer to pin in the device memory.
-            template<
-                typename TBuf>
-            ALPAKA_FN_HOST auto pin(
-                TBuf & buf)
-            -> void
-            {
-                return
-                    traits::Pin<
-                        TBuf>
-                    ::pin(
-                        buf);
-            }
-            //-----------------------------------------------------------------------------
-            //! Unpins the buffer.
-            //!
-            //! \tparam TBuf The buffer type.
-            //! \param buf The buffer to unpin from the device memory.
-            template<
-                typename TBuf>
-            ALPAKA_FN_HOST auto unpin(
-                TBuf & buf)
-            -> void
-            {
-                return
-                    traits::Unpin<
-                        TBuf>
-                    ::unpin(
-                        buf);
-            }
-            //-----------------------------------------------------------------------------
-            //! The pin state of the buffer.
-            //!
-            //! \tparam TBuf The buffer type.
-            //! \param buf The buffer to get the pin state of.
-            template<
-                typename TBuf>
-            ALPAKA_FN_HOST auto isPinned(
-                TBuf const & buf)
-            -> bool
-            {
-                return
-                    traits::IsPinned<
-                        TBuf>
-                    ::isPinned(
-                        buf);
-            }
-            //-----------------------------------------------------------------------------
-            //! Prepares the buffer for non-blocking copy operations, e.g. pinning if
-            //! non-blocking copy between a cpu and a cuda device is wanted
-            //!
-            //! \tparam TBuf The buffer type.
-            //! \param buf The buffer to prepare in the device memory.
-            template<
-                typename TBuf>
-            ALPAKA_FN_HOST auto prepareForAsyncCopy(
-                TBuf & buf)
-            -> void
-            {
-                return
-                    traits::PrepareForAsyncCopy<
-                        TBuf>
-                    ::prepareForAsyncCopy(
-                        buf);
-            }
-        }
+    //-----------------------------------------------------------------------------
+    //! Allocates memory on the given device.
+    //!
+    //! \tparam TElem The element type of the returned buffer.
+    //! \tparam TIdx The linear index type of the buffer.
+    //! \tparam TExtent The extent type of the buffer; Dim<TExtent> is passed on as the buffer dimensionality.
+    //! \tparam TDev The type of device the buffer is allocated on.
+    //! \param dev The device to allocate the buffer on.
+    //! \param extent The extent of the buffer; defaults to TExtent().
+    //! \return The newly allocated buffer, as produced by the traits::BufAlloc specialization.
+    template<typename TElem, typename TIdx, typename TExtent, typename TDev>
+    ALPAKA_FN_HOST auto allocBuf(TDev const& dev, TExtent const& extent = TExtent())
+    {
+        return traits::BufAlloc<TElem, Dim<TExtent>, TIdx, TDev>::allocBuf(dev, extent);
+    }
+    //-----------------------------------------------------------------------------
+    //! Maps the buffer into the memory of the given device by forwarding to traits::Map<TBuf, TDev>.
+    //!
+    //! \tparam TBuf The buffer type.
+    //! \tparam TDev The device type.
+    //! \param buf The buffer to map into the device memory.
+    //! \param dev The device to map the buffer into.
+    template<typename TBuf, typename TDev>
+    ALPAKA_FN_HOST auto map(TBuf& buf, TDev const& dev) -> void
+    {
+        return traits::Map<TBuf, TDev>::map(buf, dev);
+    }
+    //-----------------------------------------------------------------------------
+    //! Unmaps the buffer from the memory of the given device by forwarding to traits::Unmap<TBuf, TDev>.
+    //!
+    //! \tparam TBuf The buffer type.
+    //! \tparam TDev The device type.
+    //! \param buf The buffer to unmap from the device memory.
+    //! \param dev The device to unmap the buffer from.
+    template<typename TBuf, typename TDev>
+    ALPAKA_FN_HOST auto unmap(TBuf& buf, TDev const& dev) -> void
+    {
+        return traits::Unmap<TBuf, TDev>::unmap(buf, dev);
+    }
+    //-----------------------------------------------------------------------------
+    //! Pins the buffer by forwarding to the traits::Pin<TBuf> specialization.
+    //!
+    //! \tparam TBuf The buffer type.
+    //! \param buf The buffer to pin in the device memory.
+    template<typename TBuf>
+    ALPAKA_FN_HOST auto pin(TBuf& buf) -> void
+    {
+        return traits::Pin<TBuf>::pin(buf);
+    }
+    //-----------------------------------------------------------------------------
+    //! Unpins the buffer by forwarding to the traits::Unpin<TBuf> specialization.
+    //!
+    //! \tparam TBuf The buffer type.
+    //! \param buf The buffer to unpin from the device memory.
+    template<typename TBuf>
+    ALPAKA_FN_HOST auto unpin(TBuf& buf) -> void
+    {
+        return traits::Unpin<TBuf>::unpin(buf);
+    }
+    //-----------------------------------------------------------------------------
+    //! Queries the pin state of the buffer.
+    //! \tparam TBuf The buffer type.
+    //! \param buf The buffer to get the pin state of.
+    //! \return The result of traits::IsPinned<TBuf>::isPinned(buf).
+    template<typename TBuf>
+    ALPAKA_FN_HOST auto isPinned(TBuf const& buf) -> bool
+    {
+        return traits::IsPinned<TBuf>::isPinned(buf);
+    }
+    //-----------------------------------------------------------------------------
+    //! Prepares the buffer for non-blocking copy operations, e.g. by pinning it if a
+    //! non-blocking copy between a CPU and a CUDA device is wanted.
+    //!
+    //! \tparam TBuf The buffer type.
+    //! \param buf The buffer to prepare in the device memory.
+    template<typename TBuf>
+    ALPAKA_FN_HOST auto prepareForAsyncCopy(TBuf& buf) -> void
+    {
+        return traits::PrepareForAsyncCopy<TBuf>::prepareForAsyncCopy(buf);
     }
-}
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/include/alpaka/mem/buf/cpu/Copy.hpp b/thirdParty/cupla/alpaka/include/alpaka/mem/buf/cpu/Copy.hpp
index 4ce4d28b26..5cc04de434 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/mem/buf/cpu/Copy.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/mem/buf/cpu/Copy.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Benjamin Worpitz, Erik Zenker, Matthias Werner, Rene Widera
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -13,251 +13,196 @@
 #include <alpaka/dim/DimIntegralConst.hpp>
 #include <alpaka/extent/Traits.hpp>
 #include <alpaka/mem/view/Traits.hpp>
-#include <alpaka/meta/NdLoop.hpp>
 #include <alpaka/meta/Integral.hpp>
+#include <alpaka/meta/NdLoop.hpp>
 
 #include <cstring>
 
 namespace alpaka
 {
-    namespace dev
-    {
-        class DevCpu;
-    }
+    class DevCpu;
 }
 
 namespace alpaka
 {
-    namespace mem
+    namespace detail
     {
-        namespace view
+        //#############################################################################
+        //! The CPU device memory copy task base.
+        //!
+        //! Copies from CPU memory into CPU memory.
+        template<typename TDim, typename TViewDst, typename TViewSrc, typename TExtent>
+        struct TaskCopyCpuBase
         {
-            namespace cpu
-            {
-                namespace detail
-                {
-                    //#############################################################################
-                    //! The CPU device memory copy task base.
-                    //!
-                    //! Copies from CPU memory into CPU memory.
-                    template<
-                        typename TDim,
-                        typename TViewDst,
-                        typename TViewSrc,
-                        typename TExtent>
-                    struct TaskCopyCpuBase
-                    {
-                        using ExtentSize = idx::Idx<TExtent>;
-                        using DstSize = idx::Idx<TViewDst>;
-                        using SrcSize = idx::Idx<TViewSrc>;
-                        using Elem = elem::Elem<TViewSrc>;
-
-                        static_assert(
-                            !std::is_const<TViewDst>::value,
-                            "The destination view can not be const!");
-
-                        static_assert(
-                            dim::Dim<TViewDst>::value == dim::Dim<TViewSrc>::value,
-                            "The source and the destination view are required to have the same dimensionality!");
-                        static_assert(
-                            dim::Dim<TViewDst>::value == dim::Dim<TExtent>::value,
-                            "The views and the extent are required to have the same dimensionality!");
-                        static_assert(
-                            dim::Dim<TViewDst>::value == TDim::value,
-                            "The destination view and the input TDim are required to have the same dimensionality!");
-
-                        static_assert(
-                            meta::IsIntegralSuperset<DstSize, ExtentSize>::value,
-                            "The destination view and the extent are required to have compatible idx type!");
-                        static_assert(
-                            meta::IsIntegralSuperset<SrcSize, ExtentSize>::value,
-                            "The source view and the extent are required to have compatible idx type!");
-
-                        static_assert(
-                            std::is_same<elem::Elem<TViewDst>, typename std::remove_const<elem::Elem<TViewSrc>>::type>::value,
-                            "The source and the destination view are required to have the same element type!");
-
-                        //-----------------------------------------------------------------------------
-                        TaskCopyCpuBase(
-                            TViewDst & viewDst,
-                            TViewSrc const & viewSrc,
-                            TExtent const & extent) :
-                                m_extent(extent::getExtentVec(extent)),
-                                m_extentWidthBytes(m_extent[TDim::value - 1u] * static_cast<ExtentSize>(sizeof(Elem))),
-#if (!defined(NDEBUG)) || (ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL)
-                                m_dstExtent(extent::getExtentVec(viewDst)),
-                                m_srcExtent(extent::getExtentVec(viewSrc)),
+            using ExtentSize = Idx<TExtent>;
+            using DstSize = Idx<TViewDst>;
+            using SrcSize = Idx<TViewSrc>;
+            using Elem = alpaka::Elem<TViewSrc>;
+
+            static_assert(!std::is_const<TViewDst>::value, "The destination view can not be const!");
+
+            static_assert(
+                Dim<TViewDst>::value == Dim<TViewSrc>::value,
+                "The source and the destination view are required to have the same dimensionality!");
+            static_assert(
+                Dim<TViewDst>::value == Dim<TExtent>::value,
+                "The views and the extent are required to have the same dimensionality!");
+            static_assert(
+                Dim<TViewDst>::value == TDim::value,
+                "The destination view and the input TDim are required to have the same dimensionality!");
+
+            static_assert(
+                meta::IsIntegralSuperset<DstSize, ExtentSize>::value,
+                "The destination view and the extent are required to have compatible idx type!");
+            static_assert(
+                meta::IsIntegralSuperset<SrcSize, ExtentSize>::value,
+                "The source view and the extent are required to have compatible idx type!");
+
+            static_assert(
+                std::is_same<alpaka::Elem<TViewDst>, std::remove_const_t<alpaka::Elem<TViewSrc>>>::value,
+                "The source and the destination view are required to have the same element type!");
+
+            //! Captures extents, pitches and native pointers; asserts that the extent fits into both views.
+            TaskCopyCpuBase(TViewDst& viewDst, TViewSrc const& viewSrc, TExtent const& extent)
+                : m_extent(extent::getExtentVec(extent))
+                , m_extentWidthBytes(m_extent[TDim::value - 1u] * static_cast<ExtentSize>(sizeof(Elem)))
+                ,
+#if(!defined(NDEBUG)) || (ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL)
+                m_dstExtent(extent::getExtentVec(viewDst))
+                , m_srcExtent(extent::getExtentVec(viewSrc))
+                ,
 #endif
-                                m_dstPitchBytes(mem::view::getPitchBytesVec(viewDst)),
-                                m_srcPitchBytes(mem::view::getPitchBytesVec(viewSrc)),
+                m_dstPitchBytes(getPitchBytesVec(viewDst))
+                , m_srcPitchBytes(getPitchBytesVec(viewSrc))
+                ,
 
-                                m_dstMemNative(reinterpret_cast<std::uint8_t *>(mem::view::getPtrNative(viewDst))),
-                                m_srcMemNative(reinterpret_cast<std::uint8_t const *>(mem::view::getPtrNative(viewSrc)))
-                        {
-                            ALPAKA_ASSERT((vec::cast<DstSize>(m_extent) <= m_dstExtent).foldrAll(std::logical_or<bool>()));
-                            ALPAKA_ASSERT((vec::cast<SrcSize>(m_extent) <= m_srcExtent).foldrAll(std::logical_or<bool>()));
-                            ALPAKA_ASSERT(static_cast<DstSize>(m_extentWidthBytes) <= m_dstPitchBytes[TDim::value - 1u]);
-                            ALPAKA_ASSERT(static_cast<SrcSize>(m_extentWidthBytes) <= m_srcPitchBytes[TDim::value - 1u]);
-                        }
+                m_dstMemNative(reinterpret_cast<std::uint8_t*>(getPtrNative(viewDst)))
+                , m_srcMemNative(reinterpret_cast<std::uint8_t const*>(getPtrNative(viewSrc)))
+            {
+                ALPAKA_ASSERT((castVec<DstSize>(m_extent) <= m_dstExtent).foldrAll(std::logical_and<bool>()));
+                ALPAKA_ASSERT((castVec<SrcSize>(m_extent) <= m_srcExtent).foldrAll(std::logical_and<bool>()));
+                ALPAKA_ASSERT(static_cast<DstSize>(m_extentWidthBytes) <= m_dstPitchBytes[TDim::value - 1u]);
+                ALPAKA_ASSERT(static_cast<SrcSize>(m_extentWidthBytes) <= m_srcPitchBytes[TDim::value - 1u]);
+            }
 
 #if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                        //-----------------------------------------------------------------------------
-                        ALPAKA_FN_HOST auto printDebug() const
-                        -> void
-                        {
-                            std::cout << __func__
-                                << " e: " << m_extent
-                                << " ewb: " << this->m_extentWidthBytes
-                                << " de: " << m_dstExtent
-                                << " dptr: " << reinterpret_cast<void *>(m_dstMemNative)
-                                << " dpitchb: " << m_dstPitchBytes
-                                << " se: " << m_srcExtent
-                                << " sptr: " << reinterpret_cast<void const *>(m_srcMemNative)
-                                << " spitchb: " << m_srcPitchBytes
-                                << std::endl;
-                        }
+            //! Streams the extents, pitch vectors and native pointers of this copy task to std::cout.
+            ALPAKA_FN_HOST auto printDebug() const -> void
+            {
+                std::cout << __func__ << " e: " << m_extent << " ewb: " << this->m_extentWidthBytes
+                          << " de: " << m_dstExtent << " dptr: " << reinterpret_cast<void*>(m_dstMemNative)
+                          << " dpitchb: " << m_dstPitchBytes << " se: " << m_srcExtent
+                          << " sptr: " << reinterpret_cast<void const*>(m_srcMemNative)
+                          << " spitchb: " << m_srcPitchBytes << std::endl;
+            }
 #endif
 
-                        vec::Vec<TDim, ExtentSize> const m_extent;
-                        ExtentSize const m_extentWidthBytes;
-#if (!defined(NDEBUG)) || (ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL)
-                        vec::Vec<TDim, DstSize> const m_dstExtent;
-                        vec::Vec<TDim, SrcSize> const m_srcExtent;
+            Vec<TDim, ExtentSize> const m_extent;
+            ExtentSize const m_extentWidthBytes;
+#if(!defined(NDEBUG)) || (ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL)
+            Vec<TDim, DstSize> const m_dstExtent;
+            Vec<TDim, SrcSize> const m_srcExtent;
 #endif
-                        vec::Vec<TDim, DstSize> const m_dstPitchBytes;
-                        vec::Vec<TDim, SrcSize> const m_srcPitchBytes;
-
-                        std::uint8_t * const m_dstMemNative;
-                        std::uint8_t const * const m_srcMemNative;
-                    };
+            Vec<TDim, DstSize> const m_dstPitchBytes;
+            Vec<TDim, SrcSize> const m_srcPitchBytes;
 
+            std::uint8_t* const m_dstMemNative;
+            std::uint8_t const* const m_srcMemNative;
+        };
 
 
-                    //#############################################################################
-                    //! The CPU device ND memory copy task.
-                    template<
-                        typename TDim,
-                        typename TViewDst,
-                        typename TViewSrc,
-                        typename TExtent>
-                    struct TaskCopyCpu : public TaskCopyCpuBase<TDim, TViewDst, TViewSrc, TExtent>
-                    {
-                        using DimMin1 = dim::DimInt<TDim::value - 1u>;
-                        using typename TaskCopyCpuBase<TDim, TViewDst, TViewSrc, TExtent>::ExtentSize;
-                        using typename TaskCopyCpuBase<TDim, TViewDst, TViewSrc, TExtent>::DstSize;
-                        using typename TaskCopyCpuBase<TDim, TViewDst, TViewSrc, TExtent>::SrcSize;
+        //#############################################################################
+        //! The CPU device ND memory copy task: issues one std::memcpy per row of the innermost dimension.
+        template<typename TDim, typename TViewDst, typename TViewSrc, typename TExtent>
+        struct TaskCopyCpu : public TaskCopyCpuBase<TDim, TViewDst, TViewSrc, TExtent>
+        {
+            using DimMin1 = DimInt<TDim::value - 1u>;
+            using typename TaskCopyCpuBase<TDim, TViewDst, TViewSrc, TExtent>::ExtentSize;
+            using typename TaskCopyCpuBase<TDim, TViewDst, TViewSrc, TExtent>::DstSize;
+            using typename TaskCopyCpuBase<TDim, TViewDst, TViewSrc, TExtent>::SrcSize;
 
-                        //-----------------------------------------------------------------------------
-                        using TaskCopyCpuBase<TDim, TViewDst, TViewSrc, TExtent>::TaskCopyCpuBase;
+            //-----------------------------------------------------------------------------
+            using TaskCopyCpuBase<TDim, TViewDst, TViewSrc, TExtent>::TaskCopyCpuBase;
 
-                        //-----------------------------------------------------------------------------
-                        ALPAKA_FN_HOST auto operator()() const
-                        -> void
-                        {
-#if defined(BOOST_COMP_HCC) && !defined(__HIP_DEVICE_COMPILE__)
-                            ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+            //! Executes the copy; does nothing when the product of the extent is zero.
+            ALPAKA_FN_HOST auto operator()() const -> void
+            {
+                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
 
 #if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                            this->printDebug();
-#endif
-                            // [z, y, x] -> [z, y] because all elements with the innermost x dimension are handled within one iteration.
-                            vec::Vec<DimMin1, ExtentSize> const extentWithoutInnermost(vec::subVecBegin<DimMin1>(this->m_extent));
-                            // [z, y, x] -> [y, x] because the z pitch (the full size of the buffer) is not required.
-                            vec::Vec<DimMin1, DstSize> const dstPitchBytesWithoutOutmost(vec::subVecEnd<DimMin1>(this->m_dstPitchBytes));
-                            vec::Vec<DimMin1, SrcSize> const srcPitchBytesWithoutOutmost(vec::subVecEnd<DimMin1>(this->m_srcPitchBytes));
-
-                            if(static_cast<std::size_t>(this->m_extent.prod()) != 0u)
-                            {
-                                meta::ndLoopIncIdx(
-                                    extentWithoutInnermost,
-                                    [&](vec::Vec<DimMin1, ExtentSize> const & idx)
-                                    {
-                                        std::memcpy(
-                                            reinterpret_cast<void *>(this->m_dstMemNative + (vec::cast<DstSize>(idx) * dstPitchBytesWithoutOutmost).foldrAll(std::plus<DstSize>())),
-                                            reinterpret_cast<void const *>(this->m_srcMemNative + (vec::cast<SrcSize>(idx) * srcPitchBytesWithoutOutmost).foldrAll(std::plus<SrcSize>())),
-                                            static_cast<std::size_t>(this->m_extentWidthBytes));
-                                    });
-                            }
+                this->printDebug();
 #endif
-                        }
-                    };
+                // [z, y, x] -> [z, y] because all elements along the innermost x dimension are copied by one
+                // memcpy call per loop iteration.
+                Vec<DimMin1, ExtentSize> const extentWithoutInnermost(subVecBegin<DimMin1>(this->m_extent));
+                // [z, y, x] -> [y, x] because the z pitch (the full size of the buffer) is not required.
+                Vec<DimMin1, DstSize> const dstPitchBytesWithoutOutmost(subVecEnd<DimMin1>(this->m_dstPitchBytes));
+                Vec<DimMin1, SrcSize> const srcPitchBytesWithoutOutmost(subVecEnd<DimMin1>(this->m_srcPitchBytes));
+
+                if(static_cast<std::size_t>(this->m_extent.prod()) != 0u)
+                {
+                    meta::ndLoopIncIdx(extentWithoutInnermost, [&](Vec<DimMin1, ExtentSize> const& idx) {
+                        std::memcpy(
+                            reinterpret_cast<void*>(
+                                this->m_dstMemNative
+                                + (castVec<DstSize>(idx) * dstPitchBytesWithoutOutmost)
+                                      .foldrAll(std::plus<DstSize>())),
+                            reinterpret_cast<void const*>(
+                                this->m_srcMemNative
+                                + (castVec<SrcSize>(idx) * srcPitchBytesWithoutOutmost)
+                                      .foldrAll(std::plus<SrcSize>())),
+                            static_cast<std::size_t>(this->m_extentWidthBytes));
+                    });
+                }
+            }
+        };
 
-                    //#############################################################################
-                    //! The CPU device 1D memory copy task.
-                    template<
-                        typename TViewDst,
-                        typename TViewSrc,
-                        typename TExtent>
-                    struct TaskCopyCpu<
-                        dim::DimInt<1u>,
-                        TViewDst,
-                        TViewSrc,
-                        TExtent> : public TaskCopyCpuBase<dim::DimInt<1u>, TViewDst, TViewSrc, TExtent>
-                    {
-                        //-----------------------------------------------------------------------------
-                        using TaskCopyCpuBase<dim::DimInt<1u>, TViewDst, TViewSrc, TExtent>::TaskCopyCpuBase;
+        //#############################################################################
+        //! The CPU device 1D memory copy task: a single std::memcpy of m_extentWidthBytes bytes.
+        template<typename TViewDst, typename TViewSrc, typename TExtent>
+        struct TaskCopyCpu<DimInt<1u>, TViewDst, TViewSrc, TExtent>
+            : public TaskCopyCpuBase<DimInt<1u>, TViewDst, TViewSrc, TExtent>
+        {
+            //-----------------------------------------------------------------------------
+            using TaskCopyCpuBase<DimInt<1u>, TViewDst, TViewSrc, TExtent>::TaskCopyCpuBase;
 
-                        //-----------------------------------------------------------------------------
-                        ALPAKA_FN_HOST auto operator()() const
-                        -> void
-                        {
-                            ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+            //! Executes the copy; does nothing when the product of the extent is zero.
+            ALPAKA_FN_HOST auto operator()() const -> void
+            {
+                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
 
 #if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                            this->printDebug();
+                this->printDebug();
 #endif
-                            if(static_cast<std::size_t>(this->m_extent.prod()) != 0u)
-                            {
-                                std::memcpy(
-                                    reinterpret_cast<void *>(this->m_dstMemNative),
-                                    reinterpret_cast<void const *>(this->m_srcMemNative),
-                                    static_cast<std::size_t>(this->m_extentWidthBytes));
-                            }
-                        }
-                    };
+                if(static_cast<std::size_t>(this->m_extent.prod()) != 0u)
+                {
+                    std::memcpy(
+                        reinterpret_cast<void*>(this->m_dstMemNative),
+                        reinterpret_cast<void const*>(this->m_srcMemNative),
+                        static_cast<std::size_t>(this->m_extentWidthBytes));
                 }
             }
+        };
+    } // namespace detail
 
-            namespace traits
+    namespace traits
+    {
+        //#############################################################################
+        //! The CPU device memory copy trait specialization (source and destination device are both DevCpu).
+        //!
+        //! Copies from CPU memory into CPU memory by returning a detail::TaskCopyCpu for the given views and extent.
+        template<typename TDim>
+        struct CreateTaskMemcpy<TDim, DevCpu, DevCpu>
+        {
+            //-----------------------------------------------------------------------------
+            template<typename TExtent, typename TViewSrc, typename TViewDst>
+            ALPAKA_FN_HOST static auto createTaskMemcpy(
+                TViewDst& viewDst,
+                TViewSrc const& viewSrc,
+                TExtent const& extent) -> alpaka::detail::TaskCopyCpu<TDim, TViewDst, TViewSrc, TExtent>
             {
-                //#############################################################################
-                //! The CPU device memory copy trait specialization.
-                //!
-                //! Copies from CPU memory into CPU memory.
-                template<
-                    typename TDim>
-                struct CreateTaskCopy<
-                    TDim,
-                    dev::DevCpu,
-                    dev::DevCpu>
-                {
-                    //-----------------------------------------------------------------------------
-                    template<
-                        typename TExtent,
-                        typename TViewSrc,
-                        typename TViewDst>
-                    ALPAKA_FN_HOST static auto createTaskCopy(
-                        TViewDst & viewDst,
-                        TViewSrc const & viewSrc,
-                        TExtent const & extent)
-                    -> cpu::detail::TaskCopyCpu<
-                        TDim,
-                        TViewDst,
-                        TViewSrc,
-                        TExtent>
-                    {
-                        return
-                            cpu::detail::TaskCopyCpu<
-                                TDim,
-                                TViewDst,
-                                TViewSrc,
-                                TExtent>(
-                                    viewDst,
-                                    viewSrc,
-                                    extent);
-                    }
-                };
+                return alpaka::detail::TaskCopyCpu<TDim, TViewDst, TViewSrc, TExtent>(viewDst, viewSrc, extent);
             }
-        }
-    }
-}
+        };
+    } // namespace traits
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/include/alpaka/mem/buf/cpu/Set.hpp b/thirdParty/cupla/alpaka/include/alpaka/mem/buf/cpu/Set.hpp
index 6a06c6e61f..015e006656 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/mem/buf/cpu/Set.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/mem/buf/cpu/Set.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Benjamin Worpitz, Erik Zenker, Matthias Werner
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -13,219 +13,158 @@
 #include <alpaka/dim/DimIntegralConst.hpp>
 #include <alpaka/extent/Traits.hpp>
 #include <alpaka/mem/view/Traits.hpp>
-#include <alpaka/meta/NdLoop.hpp>
 #include <alpaka/meta/Integral.hpp>
+#include <alpaka/meta/NdLoop.hpp>
 
 #include <cstring>
 
 namespace alpaka
 {
-    namespace dev
-    {
-        class DevCpu;
-    }
-}
+    class DevCpu;
 
-namespace alpaka
-{
-    namespace mem
+    namespace detail
     {
-        namespace view
+        //#############################################################################
+        //! The CPU device ND memory set task base.
+        template<typename TDim, typename TView, typename TExtent>
+        struct TaskSetCpuBase
         {
-            namespace cpu
-            {
-                namespace detail
-                {
-                    //#############################################################################
-                    //! The CPU device ND memory set task base.
-                    template<
-                        typename TDim,
-                        typename TView,
-                        typename TExtent>
-                    struct TaskSetCpuBase
-                    {
-                        using ExtentSize = idx::Idx<TExtent>;
-                        using DstSize = idx::Idx<TView>;
-                        using Elem = elem::Elem<TView>;
-
-                        static_assert(
-                            !std::is_const<TView>::value,
-                            "The destination view can not be const!");
-
-                        static_assert(
-                            dim::Dim<TView>::value == dim::Dim<TExtent>::value,
-                            "The destination view and the extent are required to have the same dimensionality!");
-                        static_assert(
-                            dim::Dim<TView>::value == TDim::value,
-                            "The destination view and the input TDim are required to have the same dimensionality!");
-
-                        static_assert(
-                            meta::IsIntegralSuperset<DstSize, ExtentSize>::value,
-                            "The view and the extent are required to have compatible idx type!");
-
-                        //-----------------------------------------------------------------------------
-                        TaskSetCpuBase(
-                            TView & view,
-                            std::uint8_t const & byte,
-                            TExtent const & extent) :
-                                m_byte(byte),
-                                m_extent(extent::getExtentVec(extent)),
-                                m_extentWidthBytes(m_extent[TDim::value - 1u] * static_cast<ExtentSize>(sizeof(Elem))),
-#if (!defined(NDEBUG)) || (ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL)
-                                m_dstExtent(extent::getExtentVec(view)),
+            using ExtentSize = Idx<TExtent>;
+            using DstSize = Idx<TView>;
+            using Elem = alpaka::Elem<TView>;
+
+            static_assert(!std::is_const<TView>::value, "The destination view can not be const!");
+
+            static_assert(
+                Dim<TView>::value == Dim<TExtent>::value,
+                "The destination view and the extent are required to have the same dimensionality!");
+            static_assert(
+                Dim<TView>::value == TDim::value,
+                "The destination view and the input TDim are required to have the same dimensionality!");
+
+            static_assert(
+                meta::IsIntegralSuperset<DstSize, ExtentSize>::value,
+                "The view and the extent are required to have compatible idx type!");
+
+            //! Stores the fill byte, extent, pitch and native pointer; asserts the extent fits into the view.
+            TaskSetCpuBase(TView& view, std::uint8_t const& byte, TExtent const& extent)
+                : m_byte(byte)
+                , m_extent(extent::getExtentVec(extent))
+                , m_extentWidthBytes(m_extent[TDim::value - 1u] * static_cast<ExtentSize>(sizeof(Elem)))
+                ,
+#if(!defined(NDEBUG)) || (ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL)
+                m_dstExtent(extent::getExtentVec(view))
+                ,
 #endif
-                                m_dstPitchBytes(mem::view::getPitchBytesVec(view)),
-                                m_dstMemNative(reinterpret_cast<std::uint8_t *>(mem::view::getPtrNative(view)))
-                        {
-                            ALPAKA_ASSERT((vec::cast<DstSize>(m_extent) <= m_dstExtent).foldrAll(std::logical_or<bool>()));
-                            ALPAKA_ASSERT(m_extentWidthBytes <= m_dstPitchBytes[TDim::value - 1u]);
-                        }
+                m_dstPitchBytes(getPitchBytesVec(view))
+                , m_dstMemNative(reinterpret_cast<std::uint8_t*>(getPtrNative(view)))
+            {
+                ALPAKA_ASSERT((castVec<DstSize>(m_extent) <= m_dstExtent).foldrAll(std::logical_and<bool>()));
+                ALPAKA_ASSERT(static_cast<DstSize>(m_extentWidthBytes) <= m_dstPitchBytes[TDim::value - 1u]);
+            }
 
 #if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                        //-----------------------------------------------------------------------------
-                        ALPAKA_FN_HOST auto printDebug() const
-                        -> void
-                        {
-                            std::cout << __func__
-                                << " e: " << this->m_extent
-                                << " ewb: " << this->m_extentWidthBytes
-                                << " de: " << this->m_dstExtent
-                                << " dptr: " << reinterpret_cast<void *>(this->m_dstMemNative)
-                                << " dpitchb: " << this->m_dstPitchBytes
-                                << std::endl;
-                        }
+            //! Streams the extent, destination extent, pitch vector and native pointer of this set task to std::cout.
+            ALPAKA_FN_HOST auto printDebug() const -> void
+            {
+                std::cout << __func__ << " e: " << this->m_extent << " ewb: " << this->m_extentWidthBytes
+                          << " de: " << this->m_dstExtent << " dptr: " << reinterpret_cast<void*>(this->m_dstMemNative)
+                          << " dpitchb: " << this->m_dstPitchBytes << std::endl;
+            }
 #endif
 
-                        std::uint8_t const m_byte;
-                        vec::Vec<TDim, ExtentSize> const m_extent;
-                        ExtentSize const m_extentWidthBytes;
-#if (!defined(NDEBUG)) || (ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL)
-                        vec::Vec<TDim, DstSize> const m_dstExtent;
+            std::uint8_t const m_byte;
+            Vec<TDim, ExtentSize> const m_extent;
+            ExtentSize const m_extentWidthBytes;
+#if(!defined(NDEBUG)) || (ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL)
+            Vec<TDim, DstSize> const m_dstExtent;
 #endif
-                        vec::Vec<TDim, DstSize> const m_dstPitchBytes;
-                        std::uint8_t * const m_dstMemNative;
-                    };
-
-                    //#############################################################################
-                    //! The CPU device ND memory set task.
-                    template<
-                        typename TDim,
-                        typename TView,
-                        typename TExtent>
-                    struct TaskSetCpu : public TaskSetCpuBase<TDim, TView, TExtent>
-                    {
-                        using DimMin1 = dim::DimInt<TDim::value - 1u>;
-                        using typename TaskSetCpuBase<TDim, TView, TExtent>::ExtentSize;
-                        using typename TaskSetCpuBase<TDim, TView, TExtent>::DstSize;
-
-                        //-----------------------------------------------------------------------------
-                        using TaskSetCpuBase<TDim, TView, TExtent>::TaskSetCpuBase;
-
-                        //-----------------------------------------------------------------------------
-                        ALPAKA_FN_HOST auto operator()() const
-                        -> void
-                        {
-                            ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+            Vec<TDim, DstSize> const m_dstPitchBytes;
+            std::uint8_t* const m_dstMemNative;
+        };
+
+        //#############################################################################
+        //! The CPU device ND memory set task.
+        template<typename TDim, typename TView, typename TExtent>
+        struct TaskSetCpu : public TaskSetCpuBase<TDim, TView, TExtent>
+        {
+            using DimMin1 = DimInt<TDim::value - 1u>;
+            using typename TaskSetCpuBase<TDim, TView, TExtent>::ExtentSize;
+            using typename TaskSetCpuBase<TDim, TView, TExtent>::DstSize;
+
+            //-----------------------------------------------------------------------------
+            using TaskSetCpuBase<TDim, TView, TExtent>::TaskSetCpuBase;
+
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST auto operator()() const -> void
+            {
+                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
 
 #if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                            this->printDebug();
+                this->printDebug();
 #endif
-                            // [z, y, x] -> [z, y] because all elements with the innermost x dimension are handled within one iteration.
-                            vec::Vec<DimMin1, ExtentSize> const extentWithoutInnermost(vec::subVecBegin<DimMin1>(this->m_extent));
-                            // [z, y, x] -> [y, x] because the z pitch (the full idx of the buffer) is not required.
-                            vec::Vec<DimMin1, DstSize> const dstPitchBytesWithoutOutmost(vec::subVecEnd<DimMin1>(this->m_dstPitchBytes));
-
-                            if(static_cast<std::size_t>(this->m_extent.prod()) != 0u)
-                            {
-                                meta::ndLoopIncIdx(
-                                    extentWithoutInnermost,
-
-                                    // workaround for HIP(HCC) to
-                                    // avoid forbidden host-call
-                                    // within host-device functions
-                                    #if defined(BOOST_COMP_HCC) && BOOST_COMP_HCC
-                                    ALPAKA_FN_HOST_ACC
-                                    #endif
-                                    [&](vec::Vec<DimMin1, ExtentSize> const & idx)
-                                    {
-
-                                        memset(
-                                            reinterpret_cast<void *>(this->m_dstMemNative + (vec::cast<DstSize>(idx) * dstPitchBytesWithoutOutmost).foldrAll(std::plus<DstSize>())),
-                                            this->m_byte,
-                                            static_cast<std::size_t>(this->m_extentWidthBytes));
-                                    });
-                            }
-                        }
-                    };
-
-                    //#############################################################################
-                    //! The CPU device 1D memory set task.
-                    template<
-                        typename TView,
-                        typename TExtent>
-                    struct TaskSetCpu<
-                        dim::DimInt<1u>,
-                        TView,
-                        TExtent> : public TaskSetCpuBase<dim::DimInt<1u>, TView, TExtent>
-                    {
-                        //-----------------------------------------------------------------------------
-                        using TaskSetCpuBase<dim::DimInt<1u>, TView, TExtent>::TaskSetCpuBase;
-
-                        //-----------------------------------------------------------------------------
-                        ALPAKA_FN_HOST auto operator()() const
-                        -> void
-                        {
-                            ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+                // [z, y, x] -> [z, y] because all elements with the innermost x dimension are handled within one
+                // iteration.
+                Vec<DimMin1, ExtentSize> const extentWithoutInnermost(subVecBegin<DimMin1>(this->m_extent));
+                // [z, y, x] -> [y, x] because the z pitch (the full idx of the buffer) is not required.
+                Vec<DimMin1, DstSize> const dstPitchBytesWithoutOutmost(subVecEnd<DimMin1>(this->m_dstPitchBytes));
+
+                if(static_cast<std::size_t>(this->m_extent.prod()) != 0u)
+                {
+                    meta::ndLoopIncIdx(extentWithoutInnermost, [&](Vec<DimMin1, ExtentSize> const& idx) {
+                        std::memset(
+                            reinterpret_cast<void*>(
+                                this->m_dstMemNative
+                                + (castVec<DstSize>(idx) * dstPitchBytesWithoutOutmost)
+                                      .foldrAll(std::plus<DstSize>())),
+                            this->m_byte,
+                            static_cast<std::size_t>(this->m_extentWidthBytes));
+                    });
+                }
+            }
+        };
+
+        //#############################################################################
+        //! The CPU device 1D memory set task.
+        template<typename TView, typename TExtent>
+        struct TaskSetCpu<DimInt<1u>, TView, TExtent> : public TaskSetCpuBase<DimInt<1u>, TView, TExtent>
+        {
+            //-----------------------------------------------------------------------------
+            using TaskSetCpuBase<DimInt<1u>, TView, TExtent>::TaskSetCpuBase;
+
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST auto operator()() const -> void
+            {
+                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
 
 #if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                            this->printDebug();
+                this->printDebug();
 #endif
-                            if(static_cast<std::size_t>(this->m_extent.prod()) != 0u)
-                            {
-                                std::memset(
-                                    reinterpret_cast<void *>(this->m_dstMemNative),
-                                    this->m_byte,
-                                    static_cast<std::size_t>(this->m_extentWidthBytes));
-                            }
-                        }
-                    };
+                if(static_cast<std::size_t>(this->m_extent.prod()) != 0u)
+                {
+                    std::memset(
+                        reinterpret_cast<void*>(this->m_dstMemNative),
+                        this->m_byte,
+                        static_cast<std::size_t>(this->m_extentWidthBytes));
                 }
             }
+        };
+    } // namespace detail
 
-            namespace traits
+    namespace traits
+    {
+        //#############################################################################
+        //! The CPU device memory set trait specialization.
+        template<typename TDim>
+        struct CreateTaskMemset<TDim, DevCpu>
+        {
+            //-----------------------------------------------------------------------------
+            template<typename TExtent, typename TView>
+            ALPAKA_FN_HOST static auto createTaskMemset(TView& view, std::uint8_t const& byte, TExtent const& extent)
+                -> alpaka::detail::TaskSetCpu<TDim, TView, TExtent>
             {
-                //#############################################################################
-                //! The CPU device memory set trait specialization.
-                template<
-                    typename TDim>
-                struct CreateTaskSet<
-                    TDim,
-                    dev::DevCpu>
-                {
-                    //-----------------------------------------------------------------------------
-                    template<
-                        typename TExtent,
-                        typename TView>
-                    ALPAKA_FN_HOST static auto createTaskSet(
-                        TView & view,
-                        std::uint8_t const & byte,
-                        TExtent const & extent)
-                    -> cpu::detail::TaskSetCpu<
-                        TDim,
-                        TView,
-                        TExtent>
-                    {
-                        return
-                            cpu::detail::TaskSetCpu<
-                                TDim,
-                                TView,
-                                TExtent>(
-                                    view,
-                                    byte,
-                                    extent);
-                    }
-                };
+                return alpaka::detail::TaskSetCpu<TDim, TView, TExtent>(view, byte, extent);
             }
-        }
-    }
-}
+        };
+    } // namespace traits
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/include/alpaka/mem/buf/cuda/Copy.hpp b/thirdParty/cupla/alpaka/include/alpaka/mem/buf/cuda/Copy.hpp
deleted file mode 100644
index 5e62ff9a0d..0000000000
--- a/thirdParty/cupla/alpaka/include/alpaka/mem/buf/cuda/Copy.hpp
+++ /dev/null
@@ -1,1173 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Erik Zenker, Matthias Werner
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_CUDA
-    #error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
-#endif
-
-#include <alpaka/queue/QueueCudaRtBlocking.hpp>
-#include <alpaka/queue/QueueCudaRtNonBlocking.hpp>
-
-#include <alpaka/dev/DevCpu.hpp>
-#include <alpaka/dev/DevCudaRt.hpp>
-#include <alpaka/dim/DimIntegralConst.hpp>
-#include <alpaka/extent/Traits.hpp>
-#include <alpaka/mem/view/Traits.hpp>
-
-#include <alpaka/core/Assert.hpp>
-#include <alpaka/core/Cuda.hpp>
-
-#include <set>
-#include <tuple>
-
-
-namespace alpaka
-{
-    namespace mem
-    {
-        namespace view
-        {
-            namespace cuda
-            {
-                namespace detail
-                {
-                    //#############################################################################
-                    //! The CUDA memory copy trait.
-                    template<
-                        typename TDim,
-                        typename TViewDst,
-                        typename TViewSrc,
-                        typename TExtent>
-                    struct TaskCopyCuda;
-
-                    //#############################################################################
-                    //! The 1D CUDA memory copy trait.
-                    template<
-                        typename TViewDst,
-                        typename TViewSrc,
-                        typename TExtent>
-                    struct TaskCopyCuda<
-                        dim::DimInt<1>,
-                        TViewDst,
-                        TViewSrc,
-                        TExtent>
-                    {
-                        static_assert(
-                            !std::is_const<TViewDst>::value,
-                            "The destination view can not be const!");
-
-                        static_assert(
-                            dim::Dim<TViewDst>::value == dim::Dim<TViewSrc>::value,
-                            "The source and the destination view are required to have the same dimensionality!");
-                        static_assert(
-                            dim::Dim<TViewDst>::value == dim::Dim<TExtent>::value,
-                            "The views and the extent are required to have the same dimensionality!");
-                        // TODO: Maybe check for Idx of TViewDst and TViewSrc to have greater or equal range than TExtent.
-                        static_assert(
-                            std::is_same<elem::Elem<TViewDst>, typename std::remove_const<elem::Elem<TViewSrc>>::type>::value,
-                            "The source and the destination view are required to have the same element type!");
-
-                        using Idx = idx::Idx<TExtent>;
-
-                        //-----------------------------------------------------------------------------
-                        ALPAKA_FN_HOST TaskCopyCuda(
-                            TViewDst & viewDst,
-                            TViewSrc const & viewSrc,
-                            TExtent const & extent,
-                            cudaMemcpyKind const & cudaMemCpyKind,
-                            int const & iDstDevice,
-                            int const & iSrcDevice) :
-                                m_cudaMemCpyKind(cudaMemCpyKind),
-                                m_iDstDevice(iDstDevice),
-                                m_iSrcDevice(iSrcDevice),
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                                m_extentWidth(extent::getWidth(extent)),
-                                m_dstWidth(static_cast<Idx>(extent::getWidth(viewDst))),
-                                m_srcWidth(static_cast<Idx>(extent::getWidth(viewSrc))),
-#endif
-                                m_extentWidthBytes(extent::getWidth(extent) * static_cast<Idx>(sizeof(elem::Elem<TViewDst>))),
-                                m_dstMemNative(reinterpret_cast<void *>(mem::view::getPtrNative(viewDst))),
-                                m_srcMemNative(reinterpret_cast<void const *>(mem::view::getPtrNative(viewSrc)))
-                        {
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                            ALPAKA_ASSERT(m_extentWidth <= m_dstWidth);
-                            ALPAKA_ASSERT(m_extentWidth <= m_srcWidth);
-#endif
-                        }
-
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                        //-----------------------------------------------------------------------------
-                        ALPAKA_FN_HOST auto printDebug() const
-                        -> void
-                        {
-                            std::cout << __func__
-                                << " ddev: " << m_iDstDevice
-                                << " ew: " << m_extentWidth
-                                << " ewb: " << m_extentWidthBytes
-                                << " dw: " << m_dstWidth
-                                << " dptr: " << m_dstMemNative
-                                << " sdev: " << m_iSrcDevice
-                                << " sw: " << m_srcWidth
-                                << " sptr: " << m_srcMemNative
-                                << std::endl;
-                        }
-#endif
-                        cudaMemcpyKind m_cudaMemCpyKind;
-                        int m_iDstDevice;
-                        int m_iSrcDevice;
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                        Idx m_extentWidth;
-                        Idx m_dstWidth;
-                        Idx m_srcWidth;
-#endif
-                        Idx m_extentWidthBytes;
-                        void * m_dstMemNative;
-                        void const * m_srcMemNative;
-                    };
-                    //#############################################################################
-                    //! The 2D CUDA memory copy trait.
-                    template<
-                        typename TViewDst,
-                        typename TViewSrc,
-                        typename TExtent>
-                    struct TaskCopyCuda<
-                        dim::DimInt<2>,
-                        TViewDst,
-                        TViewSrc,
-                        TExtent>
-                    {
-                        static_assert(
-                            !std::is_const<TViewDst>::value,
-                            "The destination view can not be const!");
-
-                        static_assert(
-                            dim::Dim<TViewDst>::value == dim::Dim<TViewSrc>::value,
-                            "The source and the destination view are required to have the same dimensionality!");
-                        static_assert(
-                            dim::Dim<TViewDst>::value == dim::Dim<TExtent>::value,
-                            "The views and the extent are required to have the same dimensionality!");
-                        // TODO: Maybe check for Idx of TViewDst and TViewSrc to have greater or equal range than TExtent.
-                        static_assert(
-                            std::is_same<elem::Elem<TViewDst>, typename std::remove_const<elem::Elem<TViewSrc>>::type>::value,
-                            "The source and the destination view are required to have the same element type!");
-
-                        using Idx = idx::Idx<TExtent>;
-
-                        //-----------------------------------------------------------------------------
-                        ALPAKA_FN_HOST TaskCopyCuda(
-                            TViewDst & viewDst,
-                            TViewSrc const & viewSrc,
-                            TExtent const & extent,
-                            cudaMemcpyKind const & cudaMemCpyKind,
-                            int const & iDstDevice,
-                            int const & iSrcDevice) :
-                                m_cudaMemCpyKind(cudaMemCpyKind),
-                                m_iDstDevice(iDstDevice),
-                                m_iSrcDevice(iSrcDevice),
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                                m_extentWidth(extent::getWidth(extent)),
-#endif
-                                m_extentWidthBytes(extent::getWidth(extent) * static_cast<Idx>(sizeof(elem::Elem<TViewDst>))),
-                                m_dstWidth(static_cast<Idx>(extent::getWidth(viewDst))),      // required for 3D peer copy
-                                m_srcWidth(static_cast<Idx>(extent::getWidth(viewSrc))),      // required for 3D peer copy
-
-                                m_extentHeight(extent::getHeight(extent)),
-                                m_dstHeight(static_cast<Idx>(extent::getHeight(viewDst))),    // required for 3D peer copy
-                                m_srcHeight(static_cast<Idx>(extent::getHeight(viewSrc))),    // required for 3D peer copy
-
-                                m_dstpitchBytesX(static_cast<Idx>(mem::view::getPitchBytes<dim::Dim<TViewDst>::value - 1u>(viewDst))),
-                                m_srcpitchBytesX(static_cast<Idx>(mem::view::getPitchBytes<dim::Dim<TViewSrc>::value - 1u>(viewSrc))),
-                                m_dstPitchBytesY(static_cast<Idx>(mem::view::getPitchBytes<dim::Dim<TViewDst>::value - (2u % dim::Dim<TViewDst>::value)>(viewDst))),
-                                m_srcPitchBytesY(static_cast<Idx>(mem::view::getPitchBytes<dim::Dim<TViewSrc>::value - (2u % dim::Dim<TViewDst>::value)>(viewSrc))),
-
-                                m_dstMemNative(reinterpret_cast<void *>(mem::view::getPtrNative(viewDst))),
-                                m_srcMemNative(reinterpret_cast<void const *>(mem::view::getPtrNative(viewSrc)))
-                        {
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                            ALPAKA_ASSERT(m_extentWidth <= m_dstWidth);
-                            ALPAKA_ASSERT(m_extentHeight <= m_dstHeight);
-                            ALPAKA_ASSERT(m_extentWidth <= m_srcWidth);
-                            ALPAKA_ASSERT(m_extentHeight <= m_srcHeight);
-                            ALPAKA_ASSERT(m_extentWidthBytes <= m_dstpitchBytesX);
-                            ALPAKA_ASSERT(m_extentWidthBytes <= m_srcpitchBytesX);
-#endif
-                        }
-
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                        //-----------------------------------------------------------------------------
-                        ALPAKA_FN_HOST auto printDebug() const
-                        -> void
-                        {
-                            std::cout << __func__
-                                << " ew: " << m_extentWidth
-                                << " eh: " << m_extentHeight
-                                << " ewb: " << m_extentWidthBytes
-                                << " ddev: " << m_iDstDevice
-                                << " dw: " << m_dstWidth
-                                << " dh: " << m_dstHeight
-                                << " dptr: " << m_dstMemNative
-                                << " dpitchb: " << m_dstpitchBytesX
-                                << " sdev: " << m_iSrcDevice
-                                << " sw: " << m_srcWidth
-                                << " sh: " << m_srcHeight
-                                << " sptr: " << m_srcMemNative
-                                << " spitchb: " << m_srcpitchBytesX
-                                << std::endl;
-                        }
-#endif
-                        cudaMemcpyKind m_cudaMemCpyKind;
-                        int m_iDstDevice;
-                        int m_iSrcDevice;
-
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                        Idx m_extentWidth;
-#endif
-                        Idx m_extentWidthBytes;
-                        Idx m_dstWidth;          // required for 3D peer copy
-                        Idx m_srcWidth;          // required for 3D peer copy
-
-                        Idx m_extentHeight;
-                        Idx m_dstHeight;         // required for 3D peer copy
-                        Idx m_srcHeight;         // required for 3D peer copy
-
-                        Idx m_dstpitchBytesX;
-                        Idx m_srcpitchBytesX;
-                        Idx m_dstPitchBytesY;
-                        Idx m_srcPitchBytesY;
-
-
-                        void * m_dstMemNative;
-                        void const * m_srcMemNative;
-                    };
-                    //#############################################################################
-                    //! The 3D CUDA memory copy trait.
-                    template<
-                        typename TViewDst,
-                        typename TViewSrc,
-                        typename TExtent>
-                    struct TaskCopyCuda<
-                        dim::DimInt<3>,
-                        TViewDst,
-                        TViewSrc,
-                        TExtent>
-                    {
-                        static_assert(
-                            !std::is_const<TViewDst>::value,
-                            "The destination view can not be const!");
-
-                        static_assert(
-                            dim::Dim<TViewDst>::value == dim::Dim<TViewSrc>::value,
-                            "The source and the destination view are required to have the same dimensionality!");
-                        static_assert(
-                            dim::Dim<TViewDst>::value == dim::Dim<TExtent>::value,
-                            "The views and the extent are required to have the same dimensionality!");
-                        // TODO: Maybe check for Idx of TViewDst and TViewSrc to have greater or equal range than TExtent.
-                        static_assert(
-                            std::is_same<elem::Elem<TViewDst>, typename std::remove_const<elem::Elem<TViewSrc>>::type>::value,
-                            "The source and the destination view are required to have the same element type!");
-
-                        using Idx = idx::Idx<TExtent>;
-
-                        //-----------------------------------------------------------------------------
-                        ALPAKA_FN_HOST TaskCopyCuda(
-                            TViewDst & viewDst,
-                            TViewSrc const & viewSrc,
-                            TExtent const & extent,
-                            cudaMemcpyKind const & cudaMemCpyKind,
-                            int const & iDstDevice,
-                            int const & iSrcDevice) :
-                                m_cudaMemCpyKind(cudaMemCpyKind),
-
-                                m_iDstDevice(iDstDevice),
-                                m_iSrcDevice(iSrcDevice),
-
-                                m_extentWidth(extent::getWidth(extent)),
-                                m_extentWidthBytes(m_extentWidth * static_cast<Idx>(sizeof(elem::Elem<TViewDst>))),
-                                m_dstWidth(static_cast<Idx>(extent::getWidth(viewDst))),
-                                m_srcWidth(static_cast<Idx>(extent::getWidth(viewSrc))),
-
-                                m_extentHeight(extent::getHeight(extent)),
-                                m_dstHeight(static_cast<Idx>(extent::getHeight(viewDst))),
-                                m_srcHeight(static_cast<Idx>(extent::getHeight(viewSrc))),
-
-                                m_extentDepth(extent::getDepth(extent)),
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                                m_dstDepth(static_cast<Idx>(extent::getDepth(viewDst))),
-                                m_srcDepth(static_cast<Idx>(extent::getDepth(viewSrc))),
-#endif
-                                m_dstpitchBytesX(static_cast<Idx>(mem::view::getPitchBytes<dim::Dim<TViewDst>::value - 1u>(viewDst))),
-                                m_srcpitchBytesX(static_cast<Idx>(mem::view::getPitchBytes<dim::Dim<TViewSrc>::value - 1u>(viewSrc))),
-                                m_dstPitchBytesY(static_cast<Idx>(mem::view::getPitchBytes<dim::Dim<TViewDst>::value - (2u % dim::Dim<TViewDst>::value)>(viewDst))),
-                                m_srcPitchBytesY(static_cast<Idx>(mem::view::getPitchBytes<dim::Dim<TViewSrc>::value - (2u % dim::Dim<TViewDst>::value)>(viewSrc))),
-
-
-                                m_dstMemNative(reinterpret_cast<void *>(mem::view::getPtrNative(viewDst))),
-                                m_srcMemNative(reinterpret_cast<void const *>(mem::view::getPtrNative(viewSrc)))
-                        {
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                            ALPAKA_ASSERT(m_extentWidth <= m_dstWidth);
-                            ALPAKA_ASSERT(m_extentHeight <= m_dstHeight);
-                            ALPAKA_ASSERT(m_extentDepth <= m_dstDepth);
-                            ALPAKA_ASSERT(m_extentWidth <= m_srcWidth);
-                            ALPAKA_ASSERT(m_extentHeight <= m_srcHeight);
-                            ALPAKA_ASSERT(m_extentDepth <= m_srcDepth);
-                            ALPAKA_ASSERT(m_extentWidthBytes <= m_dstpitchBytesX);
-                            ALPAKA_ASSERT(m_extentWidthBytes <= m_srcpitchBytesX);
-#endif
-                        }
-
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                        //-----------------------------------------------------------------------------
-                        ALPAKA_FN_HOST auto printDebug() const
-                        -> void
-                        {
-                            std::cout << __func__
-                                << " ew: " << m_extentWidth
-                                << " eh: " << m_extentHeight
-                                << " ed: " << m_extentDepth
-                                << " ewb: " << m_extentWidthBytes
-                                << " ddev: " << m_iDstDevice
-                                << " dw: " << m_dstWidth
-                                << " dh: " << m_dstHeight
-                                << " dd: " << m_dstDepth
-                                << " dptr: " << m_dstMemNative
-                                << " dpitchb: " << m_dstpitchBytesX
-                                << " sdev: " << m_iSrcDevice
-                                << " sw: " << m_srcWidth
-                                << " sh: " << m_srcHeight
-                                << " sd: " << m_srcDepth
-                                << " sptr: " << m_srcMemNative
-                                << " spitchb: " << m_srcpitchBytesX
-                                << std::endl;
-                        }
-#endif
-                        cudaMemcpyKind m_cudaMemCpyKind;
-
-                        int m_iDstDevice;
-                        int m_iSrcDevice;
-
-                        Idx m_extentWidth;
-                        Idx m_extentWidthBytes;
-                        Idx m_dstWidth;
-                        Idx m_srcWidth;
-
-                        Idx m_extentHeight;
-                        Idx m_dstHeight;
-                        Idx m_srcHeight;
-
-                        Idx m_extentDepth;
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                        Idx m_dstDepth;
-                        Idx m_srcDepth;
-#endif
-                        Idx m_dstpitchBytesX;
-                        Idx m_srcpitchBytesX;
-                        Idx m_dstPitchBytesY;
-                        Idx m_srcPitchBytesY;
-
-                        void * m_dstMemNative;
-                        void const * m_srcMemNative;
-                    };
-
-                    //-----------------------------------------------------------------------------
-                    //! Not being able to enable peer access does not prevent such device to device memory copies.
-                    //! However, those copies may be slower because the memory is copied via the CPU.
-                    inline auto enablePeerAccessIfPossible(
-                        const int & devSrc,
-                        const int & devDst)
-                    -> void
-                    {
-                        ALPAKA_ASSERT(devSrc != devDst);
-
-#if BOOST_COMP_CLANG
-    #pragma clang diagnostic push
-    #pragma clang diagnostic ignored "-Wexit-time-destructors"
-#endif
-                        static std::set<std::pair<int, int>> alreadyCheckedPeerAccessDevices;
-#if BOOST_COMP_CLANG
-    #pragma clang diagnostic pop
-#endif
-                        auto const devicePair = std::make_pair(devSrc, devDst);
-
-                        if(alreadyCheckedPeerAccessDevices.find(devicePair) == alreadyCheckedPeerAccessDevices.end())
-                        {
-                            alreadyCheckedPeerAccessDevices.insert(devicePair);
-
-                            int canAccessPeer = 0;
-                            ALPAKA_CUDA_RT_CHECK(cudaDeviceCanAccessPeer(&canAccessPeer, devSrc, devDst));
-                            if(!canAccessPeer) {
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                            std::cout << __func__
-                                << " Direct peer access between given GPUs is not possible!"
-                                << " src=" << devSrc
-                                << " dst=" << devDst
-                                << std::endl;
-#endif
-                                return;
-                            }
-
-                            ALPAKA_CUDA_RT_CHECK(cudaSetDevice(devSrc));
-                            // NOTE: "until access is explicitly disabled using cudaDeviceDisablePeerAccess() or either device is reset using cudaDeviceReset()."
-                            // We do not remove a device from the enabled device pairs on cudaDeviceReset.
-                            // Note that access granted by this call is unidirectional and that in order to access memory on the current device from peerDevice, a separate symmetric call to cudaDeviceEnablePeerAccess() is required.
-                            ALPAKA_CUDA_RT_CHECK(cudaDeviceEnablePeerAccess(devDst, 0));
-                        }
-                    }
-                }
-            }
-
-            //-----------------------------------------------------------------------------
-            // Trait specializations for CreateTaskCopy.
-            namespace traits
-            {
-                //#############################################################################
-                //! The CUDA to CPU memory copy trait specialization.
-                template<
-                    typename TDim>
-                struct CreateTaskCopy<
-                    TDim,
-                    dev::DevCpu,
-                    dev::DevCudaRt>
-                {
-                    //-----------------------------------------------------------------------------
-                    template<
-                        typename TExtent,
-                        typename TViewSrc,
-                        typename TViewDst>
-                    ALPAKA_FN_HOST static auto createTaskCopy(
-                        TViewDst & viewDst,
-                        TViewSrc const & viewSrc,
-                        TExtent const & extent)
-                    -> mem::view::cuda::detail::TaskCopyCuda<
-                        TDim,
-                        TViewDst,
-                        TViewSrc,
-                        TExtent>
-                    {
-                        ALPAKA_DEBUG_FULL_LOG_SCOPE;
-
-                        auto const iDevice(
-                            dev::getDev(viewSrc).m_iDevice);
-
-                        return
-                            mem::view::cuda::detail::TaskCopyCuda<
-                                TDim,
-                                TViewDst,
-                                TViewSrc,
-                                TExtent>(
-                                    viewDst,
-                                    viewSrc,
-                                    extent,
-                                    cudaMemcpyDeviceToHost,
-                                    iDevice,
-                                    iDevice);
-                    }
-                };
-                //#############################################################################
-                //! The CPU to CUDA memory copy trait specialization.
-                template<
-                    typename TDim>
-                struct CreateTaskCopy<
-                    TDim,
-                    dev::DevCudaRt,
-                    dev::DevCpu>
-                {
-                    //-----------------------------------------------------------------------------
-                    template<
-                        typename TExtent,
-                        typename TViewSrc,
-                        typename TViewDst>
-                    ALPAKA_FN_HOST static auto createTaskCopy(
-                        TViewDst & viewDst,
-                        TViewSrc const & viewSrc,
-                        TExtent const & extent)
-                    -> mem::view::cuda::detail::TaskCopyCuda<
-                        TDim,
-                        TViewDst,
-                        TViewSrc,
-                        TExtent>
-                    {
-                        ALPAKA_DEBUG_FULL_LOG_SCOPE;
-
-                        auto const iDevice(
-                            dev::getDev(viewDst).m_iDevice);
-
-                        return
-                            mem::view::cuda::detail::TaskCopyCuda<
-                                TDim,
-                                TViewDst,
-                                TViewSrc,
-                                TExtent>(
-                                    viewDst,
-                                    viewSrc,
-                                    extent,
-                                    cudaMemcpyHostToDevice,
-                                    iDevice,
-                                    iDevice);
-                    }
-                };
-                //#############################################################################
-                //! The CUDA to CUDA memory copy trait specialization.
-                template<
-                    typename TDim>
-                struct CreateTaskCopy<
-                    TDim,
-                    dev::DevCudaRt,
-                    dev::DevCudaRt>
-                {
-                    //-----------------------------------------------------------------------------
-                    template<
-                        typename TExtent,
-                        typename TViewSrc,
-                        typename TViewDst>
-                    ALPAKA_FN_HOST static auto createTaskCopy(
-                        TViewDst & viewDst,
-                        TViewSrc const & viewSrc,
-                        TExtent const & extent)
-                    -> mem::view::cuda::detail::TaskCopyCuda<
-                        TDim,
-                        TViewDst,
-                        TViewSrc,
-                        TExtent>
-                    {
-                        ALPAKA_DEBUG_FULL_LOG_SCOPE;
-
-                        return
-                            mem::view::cuda::detail::TaskCopyCuda<
-                                TDim,
-                                TViewDst,
-                                TViewSrc,
-                                TExtent>(
-                                    viewDst,
-                                    viewSrc,
-                                    extent,
-                                    cudaMemcpyDeviceToDevice,
-                                    dev::getDev(viewDst).m_iDevice,
-                                    dev::getDev(viewSrc).m_iDevice);
-                    }
-                };
-            }
-            namespace cuda
-            {
-                namespace detail
-                {
-                    //-----------------------------------------------------------------------------
-                    template<
-                        typename TExtent,
-                        typename TViewSrc,
-                        typename TViewDst>
-                    ALPAKA_FN_HOST auto buildCudaMemcpy3DParms(
-                        mem::view::cuda::detail::TaskCopyCuda<dim::DimInt<3>, TViewDst, TViewSrc, TExtent> const & task)
-                    -> cudaMemcpy3DParms
-                    {
-                        ALPAKA_DEBUG_FULL_LOG_SCOPE;
-
-                        auto const & extentWidthBytes(task.m_extentWidthBytes);
-                        auto const & dstWidth(task.m_dstWidth);
-                        auto const & srcWidth(task.m_srcWidth);
-
-                        auto const & extentHeight(task.m_extentHeight);
-                        //auto const & dstHeight(task.m_dstHeight);
-                        //auto const & srcHeight(task.m_srcHeight);
-
-                        auto const & extentDepth(task.m_extentDepth);
-
-                        auto const & dstPitchBytesX(task.m_dstpitchBytesX);
-                        auto const & srcPitchBytesX(task.m_srcpitchBytesX);
-                        auto const & dstPitchBytesY(task.m_dstPitchBytesY);
-                        auto const & srcPitchBytesY(task.m_srcPitchBytesY);
-
-                        auto const & dstNativePtr(task.m_dstMemNative);
-                        auto const & srcNativePtr(task.m_srcMemNative);
-
-                        // Fill CUDA parameter structure.
-                        cudaMemcpy3DParms cudaMemCpy3DParms;
-                        cudaMemCpy3DParms.srcArray = nullptr;  // Either srcArray or srcPtr.
-                        cudaMemCpy3DParms.srcPos = make_cudaPos(0, 0, 0);  // Optional. Offset in bytes.
-                        cudaMemCpy3DParms.srcPtr =
-                            make_cudaPitchedPtr(
-                                const_cast<void *>(srcNativePtr),
-                                static_cast<std::size_t>(srcPitchBytesX),
-                                static_cast<std::size_t>(srcWidth),
-                                static_cast<std::size_t>(srcPitchBytesY/srcPitchBytesX));
-                        cudaMemCpy3DParms.dstArray = nullptr;  // Either dstArray or dstPtr.
-                        cudaMemCpy3DParms.dstPos = make_cudaPos(0, 0, 0);  // Optional. Offset in bytes.
-                        cudaMemCpy3DParms.dstPtr =
-                            make_cudaPitchedPtr(
-                                dstNativePtr,
-                                static_cast<std::size_t>(dstPitchBytesX),
-                                static_cast<std::size_t>(dstWidth),
-                                static_cast<std::size_t>(dstPitchBytesY / dstPitchBytesX));
-                        cudaMemCpy3DParms.extent =
-                            make_cudaExtent(
-                                static_cast<std::size_t>(extentWidthBytes),
-                                static_cast<std::size_t>(extentHeight),
-                                static_cast<std::size_t>(extentDepth));
-                        cudaMemCpy3DParms.kind = task.m_cudaMemCpyKind;
-
-                        return cudaMemCpy3DParms;
-                    }
-                    //-----------------------------------------------------------------------------
-                    template<
-                        typename TViewDst,
-                        typename TViewSrc,
-                        typename TExtent>
-                    ALPAKA_FN_HOST auto buildCudaMemcpy3DPeerParms(
-                        mem::view::cuda::detail::TaskCopyCuda<dim::DimInt<2>, TViewDst, TViewSrc, TExtent> const & task)
-                    -> cudaMemcpy3DPeerParms
-                    {
-                        ALPAKA_DEBUG_FULL_LOG_SCOPE;
-
-                        auto const & iDstDev(task.m_iDstDevice);
-                        auto const & iSrcDev(task.m_iSrcDevice);
-
-                        auto const & extentWidthBytes(task.m_extentWidthBytes);
-                        auto const & dstWidth(task.m_dstWidth);
-                        auto const & srcWidth(task.m_srcWidth);
-
-                        auto const & extentHeight(task.m_extentHeight);
-                        //auto const & dstHeight(task.m_dstHeight);
-                        //auto const & srcHeight(task.m_srcHeight);
-
-                        auto const extentDepth(1u);
-
-                        auto const & dstPitchBytesX(task.m_dstpitchBytesX);
-                        auto const & srcPitchBytesX(task.m_srcpitchBytesX);
-                        auto const & dstPitchBytesY(task.m_dstPitchBytesY);
-                        auto const & srcPitchBytesY(task.m_srcPitchBytesY);
-
-                        auto const & dstNativePtr(task.m_dstMemNative);
-                        auto const & srcNativePtr(task.m_srcMemNative);
-
-                        // Fill CUDA parameter structure.
-                        cudaMemcpy3DPeerParms cudaMemCpy3DPeerParms;
-                        cudaMemCpy3DPeerParms.dstArray = nullptr;  // Either dstArray or dstPtr.
-                        cudaMemCpy3DPeerParms.dstDevice = iDstDev;
-                        cudaMemCpy3DPeerParms.dstPos = make_cudaPos(0, 0, 0);  // Optional. Offset in bytes.
-                        cudaMemCpy3DPeerParms.dstPtr =
-                            make_cudaPitchedPtr(
-                                dstNativePtr,
-                                static_cast<std::size_t>(dstPitchBytesX),
-                                static_cast<std::size_t>(dstWidth),
-                                static_cast<std::size_t>(dstPitchBytesY / dstPitchBytesX));
-                        cudaMemCpy3DPeerParms.extent =
-                            make_cudaExtent(
-                                static_cast<std::size_t>(extentWidthBytes),
-                                static_cast<std::size_t>(extentHeight),
-                                static_cast<std::size_t>(extentDepth));
-                        cudaMemCpy3DPeerParms.srcArray = nullptr;  // Either srcArray or srcPtr.
-                        cudaMemCpy3DPeerParms.srcDevice = iSrcDev;
-                        cudaMemCpy3DPeerParms.srcPos = make_cudaPos(0, 0, 0);  // Optional. Offset in bytes.
-                        cudaMemCpy3DPeerParms.srcPtr =
-                            make_cudaPitchedPtr(
-                                const_cast<void *>(srcNativePtr),
-                                static_cast<std::size_t>(srcPitchBytesX),
-                                static_cast<std::size_t>(srcWidth),
-                                static_cast<std::size_t>(srcPitchBytesY / srcPitchBytesX));
-
-                        return cudaMemCpy3DPeerParms;
-                    }
-                    //-----------------------------------------------------------------------------
-                    template<
-                        typename TViewDst,
-                        typename TViewSrc,
-                        typename TExtent>
-                    ALPAKA_FN_HOST auto buildCudaMemcpy3DPeerParms(
-                        mem::view::cuda::detail::TaskCopyCuda<dim::DimInt<3>, TViewDst, TViewSrc, TExtent> const & task)
-                    -> cudaMemcpy3DPeerParms
-                    {
-                        ALPAKA_DEBUG_FULL_LOG_SCOPE;
-
-                        auto const & iDstDev(task.m_iDstDevice);
-                        auto const & iSrcDev(task.m_iSrcDevice);
-
-                        auto const & extentWidthBytes(task.m_extentWidthBytes);
-                        auto const & dstWidth(task.m_dstWidth);
-                        auto const & srcWidth(task.m_srcWidth);
-
-                        auto const & extentHeight(task.m_extentHeight);
-                        //auto const & dstHeight(task.m_dstHeight);
-                        //auto const & srcHeight(task.m_srcHeight);
-
-                        auto const & extentDepth(task.m_extentDepth);
-
-                        auto const & dstPitchBytesX(task.m_dstpitchBytesX);
-                        auto const & srcPitchBytesX(task.m_srcpitchBytesX);
-                        auto const & dstPitchBytesY(task.m_dstPitchBytesY);
-                        auto const & srcPitchBytesY(task.m_srcPitchBytesY);
-
-                        auto const & dstNativePtr(task.m_dstMemNative);
-                        auto const & srcNativePtr(task.m_srcMemNative);
-
-                        // Fill CUDA parameter structure.
-                        cudaMemcpy3DPeerParms cudaMemCpy3DPeerParms;
-                        cudaMemCpy3DPeerParms.dstArray = nullptr;  // Either dstArray or dstPtr.
-                        cudaMemCpy3DPeerParms.dstDevice = iDstDev;
-                        cudaMemCpy3DPeerParms.dstPos = make_cudaPos(0, 0, 0);  // Optional. Offset in bytes.
-                        cudaMemCpy3DPeerParms.dstPtr =
-                            make_cudaPitchedPtr(
-                                dstNativePtr,
-                                static_cast<std::size_t>(dstPitchBytesX),
-                                static_cast<std::size_t>(dstWidth),
-                                static_cast<std::size_t>(dstPitchBytesY/dstPitchBytesX));
-                        cudaMemCpy3DPeerParms.extent =
-                            make_cudaExtent(
-                                static_cast<std::size_t>(extentWidthBytes),
-                                static_cast<std::size_t>(extentHeight),
-                                static_cast<std::size_t>(extentDepth));
-                        cudaMemCpy3DPeerParms.srcArray = nullptr;  // Either srcArray or srcPtr.
-                        cudaMemCpy3DPeerParms.srcDevice = iSrcDev;
-                        cudaMemCpy3DPeerParms.srcPos = make_cudaPos(0, 0, 0);  // Optional. Offset in bytes.
-                        cudaMemCpy3DPeerParms.srcPtr =
-                            make_cudaPitchedPtr(
-                                const_cast<void *>(srcNativePtr),
-                                static_cast<std::size_t>(srcPitchBytesX),
-                                static_cast<std::size_t>(srcWidth),
-                                static_cast<std::size_t>(srcPitchBytesY / srcPitchBytesX));
-
-                        return cudaMemCpy3DPeerParms;
-                    }
-                }
-            }
-        }
-    }
-    namespace queue
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CUDA non-blocking device queue 1D copy enqueue trait specialization.
-            template<
-                typename TExtent,
-                typename TViewSrc,
-                typename TViewDst>
-            struct Enqueue<
-                queue::QueueCudaRtNonBlocking,
-                mem::view::cuda::detail::TaskCopyCuda<dim::DimInt<1u>, TViewDst, TViewSrc, TExtent>>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto enqueue(
-                    queue::QueueCudaRtNonBlocking & queue,
-                    mem::view::cuda::detail::TaskCopyCuda<dim::DimInt<1u>, TViewDst, TViewSrc, TExtent> const & task)
-                -> void
-                {
-                    ALPAKA_DEBUG_FULL_LOG_SCOPE;
-
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                    task.printDebug();
-#endif
-                    if(task.m_extentWidthBytes == 0)
-                    {
-                        return;
-                    }
-
-                    auto const & iDstDev(task.m_iDstDevice);
-                    auto const & iSrcDev(task.m_iSrcDevice);
-
-                    auto const & extentWidthBytes(task.m_extentWidthBytes);
-
-                    auto const & dstNativePtr(task.m_dstMemNative);
-                    auto const & srcNativePtr(task.m_srcMemNative);
-
-                    if(iDstDev == iSrcDev)
-                    {
-                        auto const & cudaMemCpyKind(task.m_cudaMemCpyKind);
-
-                        // Set the current device.
-                        ALPAKA_CUDA_RT_CHECK(
-                            cudaSetDevice(
-                                iDstDev));
-                        // Initiate the memory copy.
-                        ALPAKA_CUDA_RT_CHECK(
-                            cudaMemcpyAsync(
-                                dstNativePtr,
-                                srcNativePtr,
-                                static_cast<std::size_t>(extentWidthBytes),
-                                cudaMemCpyKind,
-                                queue.m_spQueueImpl->m_CudaQueue));
-                    }
-                    else
-                    {
-                        alpaka::mem::view::cuda::detail::enablePeerAccessIfPossible(iSrcDev, iDstDev);
-
-                        // Initiate the memory copy.
-                        ALPAKA_CUDA_RT_CHECK(
-                            cudaMemcpyPeerAsync(
-                                dstNativePtr,
-                                iDstDev,
-                                srcNativePtr,
-                                iSrcDev,
-                                static_cast<std::size_t>(extentWidthBytes),
-                                queue.m_spQueueImpl->m_CudaQueue));
-                    }
-                }
-            };
-            //#############################################################################
-            //! The CUDA blocking device queue 1D copy enqueue trait specialization.
-            template<
-                typename TExtent,
-                typename TViewSrc,
-                typename TViewDst>
-            struct Enqueue<
-                queue::QueueCudaRtBlocking,
-                mem::view::cuda::detail::TaskCopyCuda<dim::DimInt<1u>, TViewDst, TViewSrc, TExtent>>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto enqueue(
-                    queue::QueueCudaRtBlocking & queue,
-                    mem::view::cuda::detail::TaskCopyCuda<dim::DimInt<1u>, TViewDst, TViewSrc, TExtent> const & task)
-                -> void
-                {
-                    ALPAKA_DEBUG_FULL_LOG_SCOPE;
-
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                    task.printDebug();
-#endif
-                    if(task.m_extentWidthBytes == 0)
-                    {
-                        return;
-                    }
-
-                    auto const & iDstDev(task.m_iDstDevice);
-                    auto const & iSrcDev(task.m_iSrcDevice);
-
-                    auto const & extentWidthBytes(task.m_extentWidthBytes);
-
-                    auto const & dstNativePtr(task.m_dstMemNative);
-                    auto const & srcNativePtr(task.m_srcMemNative);
-
-                    if(iDstDev == iSrcDev)
-                    {
-                        auto const & cudaMemCpyKind(task.m_cudaMemCpyKind);
-
-                        // Set the current device.
-                        ALPAKA_CUDA_RT_CHECK(
-                            cudaSetDevice(
-                                iDstDev));
-                        // Initiate the memory copy.
-                        ALPAKA_CUDA_RT_CHECK(
-                            cudaMemcpyAsync(
-                                dstNativePtr,
-                                srcNativePtr,
-                                static_cast<std::size_t>(extentWidthBytes),
-                                cudaMemCpyKind,
-                                queue.m_spQueueImpl->m_CudaQueue));
-                    }
-                    else
-                    {
-                        alpaka::mem::view::cuda::detail::enablePeerAccessIfPossible(iSrcDev, iDstDev);
-
-                        // Initiate the memory copy.
-                        ALPAKA_CUDA_RT_CHECK(
-                            cudaMemcpyPeerAsync(
-                                dstNativePtr,
-                                iDstDev,
-                                srcNativePtr,
-                                iSrcDev,
-                                static_cast<std::size_t>(extentWidthBytes),
-                                queue.m_spQueueImpl->m_CudaQueue));
-                    }
-                    ALPAKA_CUDA_RT_CHECK(
-                        cudaStreamSynchronize(
-                            queue.m_spQueueImpl->m_CudaQueue));
-                }
-            };
-            //#############################################################################
-            //! The CUDA non-blocking device queue 2D copy enqueue trait specialization.
-            template<
-                typename TExtent,
-                typename TViewSrc,
-                typename TViewDst>
-            struct Enqueue<
-                queue::QueueCudaRtNonBlocking,
-                mem::view::cuda::detail::TaskCopyCuda<dim::DimInt<2u>, TViewDst, TViewSrc, TExtent>>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto enqueue(
-                    queue::QueueCudaRtNonBlocking & queue,
-                    mem::view::cuda::detail::TaskCopyCuda<dim::DimInt<2u>, TViewDst, TViewSrc, TExtent> const & task)
-                -> void
-                {
-                    ALPAKA_DEBUG_FULL_LOG_SCOPE;
-
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                    task.printDebug();
-#endif
-                    // This is not only an optimization but also prevents a division by zero.
-                    if(task.m_extentWidthBytes == 0 || task.m_extentHeight == 0)
-                    {
-                        return;
-                    }
-
-                    auto const & iDstDev(task.m_iDstDevice);
-                    auto const & iSrcDev(task.m_iSrcDevice);
-
-                    if(iDstDev == iSrcDev)
-                    {
-                        auto const & extentWidthBytes(task.m_extentWidthBytes);
-                        auto const & extentHeight(task.m_extentHeight);
-
-                        auto const & dstPitchBytesX(task.m_dstpitchBytesX);
-                        auto const & srcPitchBytesX(task.m_srcpitchBytesX);
-
-                        auto const & dstNativePtr(task.m_dstMemNative);
-                        auto const & srcNativePtr(task.m_srcMemNative);
-
-                        auto const & cudaMemCpyKind(task.m_cudaMemCpyKind);
-
-                        // Set the current device.
-                        ALPAKA_CUDA_RT_CHECK(
-                            cudaSetDevice(
-                                iDstDev));
-                        // Initiate the memory copy.
-                        ALPAKA_CUDA_RT_CHECK(
-                            cudaMemcpy2DAsync(
-                                dstNativePtr,
-                                static_cast<std::size_t>(dstPitchBytesX),
-                                srcNativePtr,
-                                static_cast<std::size_t>(srcPitchBytesX),
-                                static_cast<std::size_t>(extentWidthBytes),
-                                static_cast<std::size_t>(extentHeight),
-                                cudaMemCpyKind,
-                                queue.m_spQueueImpl->m_CudaQueue));
-                    }
-                    else
-                    {
-                        alpaka::mem::view::cuda::detail::enablePeerAccessIfPossible(iSrcDev, iDstDev);
-
-                        // There is no cudaMemcpy2DPeerAsync, therefore we use cudaMemcpy3DPeerAsync.
-                        // Create the struct describing the copy.
-                        cudaMemcpy3DPeerParms const cudaMemCpy3DPeerParms(
-                            mem::view::cuda::detail::buildCudaMemcpy3DPeerParms(
-                                task));
-                        // Initiate the memory copy.
-                        ALPAKA_CUDA_RT_CHECK(
-                            cudaMemcpy3DPeerAsync(
-                                &cudaMemCpy3DPeerParms,
-                                queue.m_spQueueImpl->m_CudaQueue));
-                    }
-                }
-            };
-            //#############################################################################
-            //! The CUDA blocking device queue 2D copy enqueue trait specialization.
-            template<
-                typename TExtent,
-                typename TViewSrc,
-                typename TViewDst>
-            struct Enqueue<
-                queue::QueueCudaRtBlocking,
-                mem::view::cuda::detail::TaskCopyCuda<dim::DimInt<2u>, TViewDst, TViewSrc, TExtent>>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto enqueue(
-                    queue::QueueCudaRtBlocking & queue,
-                    mem::view::cuda::detail::TaskCopyCuda<dim::DimInt<2u>, TViewDst, TViewSrc, TExtent> const & task)
-                -> void
-                {
-                    ALPAKA_DEBUG_FULL_LOG_SCOPE;
-
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                    task.printDebug();
-#endif
-                    // This is not only an optimization but also prevents a division by zero.
-                    if(task.m_extentWidthBytes == 0 || task.m_extentHeight == 0)
-                    {
-                        return;
-                    }
-
-                    auto const & iDstDev(task.m_iDstDevice);
-                    auto const & iSrcDev(task.m_iSrcDevice);
-
-                    if(iDstDev == iSrcDev)
-                    {
-                        auto const & extentWidthBytes(task.m_extentWidthBytes);
-                        auto const & extentHeight(task.m_extentHeight);
-
-                        auto const & dstPitchBytesX(task.m_dstpitchBytesX);
-                        auto const & srcPitchBytesX(task.m_srcpitchBytesX);
-
-                        auto const & dstNativePtr(task.m_dstMemNative);
-                        auto const & srcNativePtr(task.m_srcMemNative);
-
-                        auto const & cudaMemCpyKind(task.m_cudaMemCpyKind);
-
-                        // Set the current device.
-                        ALPAKA_CUDA_RT_CHECK(
-                            cudaSetDevice(
-                                iDstDev));
-                        // Initiate the memory copy.
-                        ALPAKA_CUDA_RT_CHECK(
-                            cudaMemcpy2DAsync(
-                                dstNativePtr,
-                                static_cast<std::size_t>(dstPitchBytesX),
-                                srcNativePtr,
-                                static_cast<std::size_t>(srcPitchBytesX),
-                                static_cast<std::size_t>(extentWidthBytes),
-                                static_cast<std::size_t>(extentHeight),
-                                cudaMemCpyKind,
-                                queue.m_spQueueImpl->m_CudaQueue));
-                    }
-                    else
-                    {
-                        alpaka::mem::view::cuda::detail::enablePeerAccessIfPossible(iSrcDev, iDstDev);
-
-                        // There is no cudaMemcpy2DPeerAsync, therefore we use cudaMemcpy3DPeerAsync.
-                        // Create the struct describing the copy.
-                        cudaMemcpy3DPeerParms const cudaMemCpy3DPeerParms(
-                            mem::view::cuda::detail::buildCudaMemcpy3DPeerParms(
-                                task));
-                        // Initiate the memory copy.
-                        ALPAKA_CUDA_RT_CHECK(
-                            cudaMemcpy3DPeerAsync(
-                                &cudaMemCpy3DPeerParms,
-                                queue.m_spQueueImpl->m_CudaQueue));
-                    }
-                    ALPAKA_CUDA_RT_CHECK(
-                        cudaStreamSynchronize(
-                            queue.m_spQueueImpl->m_CudaQueue));
-                }
-            };
-            //#############################################################################
-            //! The CUDA non-blocking device queue 3D copy enqueue trait specialization.
-            template<
-                typename TExtent,
-                typename TViewSrc,
-                typename TViewDst>
-            struct Enqueue<
-                queue::QueueCudaRtNonBlocking,
-                mem::view::cuda::detail::TaskCopyCuda<dim::DimInt<3u>, TViewDst, TViewSrc, TExtent>>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto enqueue(
-                    queue::QueueCudaRtNonBlocking & queue,
-                    mem::view::cuda::detail::TaskCopyCuda<dim::DimInt<3u>, TViewDst, TViewSrc, TExtent> const & task)
-                -> void
-                {
-                    ALPAKA_DEBUG_FULL_LOG_SCOPE;
-
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                    task.printDebug();
-#endif
-                    // This is not only an optimization but also prevents a division by zero.
-                    if(task.m_extentWidthBytes == 0 || task.m_extentHeight == 0 || task.m_extentDepth == 0)
-                    {
-                        return;
-                    }
-
-                    auto const & iDstDev(task.m_iDstDevice);
-                    auto const & iSrcDev(task.m_iSrcDevice);
-
-                    if(iDstDev == iSrcDev)
-                    {
-                        // Create the struct describing the copy.
-                        cudaMemcpy3DParms const cudaMemCpy3DParms(
-                            mem::view::cuda::detail::buildCudaMemcpy3DParms(
-                                task));
-                        // Set the current device.
-                        ALPAKA_CUDA_RT_CHECK(
-                            cudaSetDevice(
-                                iDstDev));
-                        // Initiate the memory copy.
-                        ALPAKA_CUDA_RT_CHECK(
-                            cudaMemcpy3DAsync(
-                                &cudaMemCpy3DParms,
-                                queue.m_spQueueImpl->m_CudaQueue));
-                    }
-                    else
-                    {
-                        alpaka::mem::view::cuda::detail::enablePeerAccessIfPossible(iSrcDev, iDstDev);
-
-                        // Create the struct describing the copy.
-                        cudaMemcpy3DPeerParms const cudaMemCpy3DPeerParms(
-                            mem::view::cuda::detail::buildCudaMemcpy3DPeerParms(
-                                task));
-                        // Initiate the memory copy.
-                        ALPAKA_CUDA_RT_CHECK(
-                            cudaMemcpy3DPeerAsync(
-                                &cudaMemCpy3DPeerParms,
-                                queue.m_spQueueImpl->m_CudaQueue));
-                    }
-                }
-            };
-            //#############################################################################
-            //! The CUDA blocking device queue 3D copy enqueue trait specialization.
-            template<
-                typename TExtent,
-                typename TViewSrc,
-                typename TViewDst>
-            struct Enqueue<
-                queue::QueueCudaRtBlocking,
-                mem::view::cuda::detail::TaskCopyCuda<dim::DimInt<3u>, TViewDst, TViewSrc, TExtent>>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto enqueue(
-                    queue::QueueCudaRtBlocking & queue,
-                    mem::view::cuda::detail::TaskCopyCuda<dim::DimInt<3u>, TViewDst, TViewSrc, TExtent> const & task)
-                -> void
-                {
-                    ALPAKA_DEBUG_FULL_LOG_SCOPE;
-
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                    task.printDebug();
-#endif
-                    // This is not only an optimization but also prevents a division by zero.
-                    if(task.m_extentWidthBytes == 0 || task.m_extentHeight == 0 || task.m_extentDepth == 0)
-                    {
-                        return;
-                    }
-
-                    auto const & iDstDev(task.m_iDstDevice);
-                    auto const & iSrcDev(task.m_iSrcDevice);
-
-                    if(iDstDev == iSrcDev)
-                    {
-                        // Create the struct describing the copy.
-                        cudaMemcpy3DParms const cudaMemCpy3DParms(
-                            mem::view::cuda::detail::buildCudaMemcpy3DParms(
-                                task));
-                        // Set the current device.
-                        ALPAKA_CUDA_RT_CHECK(
-                            cudaSetDevice(
-                                iDstDev));
-                        // Initiate the memory copy.
-                        ALPAKA_CUDA_RT_CHECK(
-                            cudaMemcpy3DAsync(
-                                &cudaMemCpy3DParms,
-                                queue.m_spQueueImpl->m_CudaQueue));
-                    }
-                    else
-                    {
-                        alpaka::mem::view::cuda::detail::enablePeerAccessIfPossible(iSrcDev, iDstDev);
-
-                        // Create the struct describing the copy.
-                        cudaMemcpy3DPeerParms const cudaMemCpy3DPeerParms(
-                            mem::view::cuda::detail::buildCudaMemcpy3DPeerParms(
-                                task));
-                        // Initiate the memory copy.
-                        ALPAKA_CUDA_RT_CHECK(
-                            cudaMemcpy3DPeerAsync(
-                                &cudaMemCpy3DPeerParms,
-                                queue.m_spQueueImpl->m_CudaQueue));
-                    }
-                    ALPAKA_CUDA_RT_CHECK(
-                        cudaStreamSynchronize(
-                            queue.m_spQueueImpl->m_CudaQueue));
-                }
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/mem/buf/cuda/Set.hpp b/thirdParty/cupla/alpaka/include/alpaka/mem/buf/cuda/Set.hpp
deleted file mode 100644
index b1a80bb5c1..0000000000
--- a/thirdParty/cupla/alpaka/include/alpaka/mem/buf/cuda/Set.hpp
+++ /dev/null
@@ -1,550 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz, Erik Zenker, Matthias Werner, René Widera
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_CUDA
-    #error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
-#endif
-
-#include <alpaka/queue/QueueCudaRtBlocking.hpp>
-#include <alpaka/queue/QueueCudaRtNonBlocking.hpp>
-
-#include <alpaka/dev/Traits.hpp>
-#include <alpaka/dim/DimIntegralConst.hpp>
-#include <alpaka/extent/Traits.hpp>
-#include <alpaka/mem/view/Traits.hpp>
-#include <alpaka/queue/Traits.hpp>
-
-#include <alpaka/core/Assert.hpp>
-#include <alpaka/core/Cuda.hpp>
-
-
-namespace alpaka
-{
-    namespace dev
-    {
-        class DevCudaRt;
-    }
-}
-
-namespace alpaka
-{
-    namespace mem
-    {
-        namespace view
-        {
-            namespace cuda
-            {
-                namespace detail
-                {
-                    //#############################################################################
-                    //! The CUDA memory set trait.
-                    template<
-                        typename TDim,
-                        typename TView,
-                        typename TExtent>
-                    struct TaskSetCuda
-                    {
-                        //-----------------------------------------------------------------------------
-                        TaskSetCuda(
-                            TView & view,
-                            std::uint8_t const & byte,
-                            TExtent const & extent) :
-                                m_view(view),
-                                m_byte(byte),
-                                m_extent(extent),
-                                m_iDevice(dev::getDev(view).m_iDevice)
-                        {
-                            static_assert(
-                                !std::is_const<TView>::value,
-                                "The destination view can not be const!");
-
-                            static_assert(
-                                dim::Dim<TView>::value == dim::Dim<TExtent>::value,
-                                "The destination view and the extent are required to have the same dimensionality!");
-                        }
-
-                        TView & m_view;
-                        std::uint8_t const m_byte;
-                        TExtent const m_extent;
-                        std::int32_t const m_iDevice;
-                    };
-                }
-            }
-            namespace traits
-            {
-                //#############################################################################
-                //! The CUDA device memory set trait specialization.
-                template<
-                    typename TDim>
-                struct CreateTaskSet<
-                    TDim,
-                    dev::DevCudaRt>
-                {
-                    //-----------------------------------------------------------------------------
-                    template<
-                        typename TExtent,
-                        typename TView>
-                    ALPAKA_FN_HOST static auto createTaskSet(
-                        TView & view,
-                        std::uint8_t const & byte,
-                        TExtent const & extent)
-                    -> mem::view::cuda::detail::TaskSetCuda<
-                        TDim,
-                        TView,
-                        TExtent>
-                    {
-                        return
-                            mem::view::cuda::detail::TaskSetCuda<
-                                TDim,
-                                TView,
-                                TExtent>(
-                                    view,
-                                    byte,
-                                    extent);
-                    }
-                };
-            }
-        }
-    }
-    namespace queue
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CUDA non-blocking device queue 1D set enqueue trait specialization.
-            template<
-                typename TView,
-                typename TExtent>
-            struct Enqueue<
-                queue::QueueCudaRtNonBlocking,
-                mem::view::cuda::detail::TaskSetCuda<dim::DimInt<1u>, TView, TExtent>>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto enqueue(
-                    queue::QueueCudaRtNonBlocking & queue,
-                    mem::view::cuda::detail::TaskSetCuda<dim::DimInt<1u>, TView, TExtent> const & task)
-                -> void
-                {
-                    ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                    static_assert(
-                        dim::Dim<TView>::value == 1u,
-                        "The destination buffer is required to be 1-dimensional for this specialization!");
-                    static_assert(
-                        dim::Dim<TView>::value == dim::Dim<TExtent>::value,
-                        "The destination buffer and the extent are required to have the same dimensionality!");
-
-                    using Idx = idx::Idx<TExtent>;
-
-                    auto & view(task.m_view);
-                    auto const & byte(task.m_byte);
-                    auto const & extent(task.m_extent);
-                    auto const & iDevice(task.m_iDevice);
-
-                    auto const extentWidth(extent::getWidth(extent));
-
-                    if(extentWidth == 0)
-                    {
-                        return;
-                    }
-
-                    auto const extentWidthBytes(extentWidth * static_cast<Idx>(sizeof(elem::Elem<TView>)));
-#if !defined(NDEBUG)
-                    auto const dstWidth(extent::getWidth(view));
-#endif
-                    auto const dstNativePtr(reinterpret_cast<void *>(mem::view::getPtrNative(view)));
-                    ALPAKA_ASSERT(extentWidth <= dstWidth);
-
-                    // Set the current device.
-                    ALPAKA_CUDA_RT_CHECK(
-                        cudaSetDevice(
-                            iDevice));
-                    // Initiate the memory set.
-                    ALPAKA_CUDA_RT_CHECK(
-                        cudaMemsetAsync(
-                            dstNativePtr,
-                            static_cast<int>(byte),
-                            static_cast<size_t>(extentWidthBytes),
-                            queue.m_spQueueImpl->m_CudaQueue));
-                }
-            };
-            //#############################################################################
-            //! The CUDA blocking device queue 1D set enqueue trait specialization.
-            template<
-                typename TView,
-                typename TExtent>
-            struct Enqueue<
-                queue::QueueCudaRtBlocking,
-                mem::view::cuda::detail::TaskSetCuda<dim::DimInt<1u>, TView, TExtent>>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto enqueue(
-                    queue::QueueCudaRtBlocking & queue,
-                    mem::view::cuda::detail::TaskSetCuda<dim::DimInt<1u>, TView, TExtent> const & task)
-                -> void
-                {
-                    ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                    static_assert(
-                        dim::Dim<TView>::value == 1u,
-                        "The destination buffer is required to be 1-dimensional for this specialization!");
-                    static_assert(
-                        dim::Dim<TView>::value == dim::Dim<TExtent>::value,
-                        "The destination buffer and the extent are required to have the same dimensionality!");
-
-                    using Idx = idx::Idx<TExtent>;
-
-                    auto & view(task.m_view);
-                    auto const & byte(task.m_byte);
-                    auto const & extent(task.m_extent);
-                    auto const & iDevice(task.m_iDevice);
-
-                    auto const extentWidth(extent::getWidth(extent));
-
-                    if(extentWidth == 0)
-                    {
-                        return;
-                    }
-
-                    auto const extentWidthBytes(extentWidth * static_cast<Idx>(sizeof(elem::Elem<TView>)));
-#if !defined(NDEBUG)
-                    auto const dstWidth(extent::getWidth(view));
-#endif
-                    auto const dstNativePtr(reinterpret_cast<void *>(mem::view::getPtrNative(view)));
-                    ALPAKA_ASSERT(extentWidth <= dstWidth);
-
-                    // Set the current device.
-                    ALPAKA_CUDA_RT_CHECK(
-                        cudaSetDevice(
-                            iDevice));
-                    // Initiate the memory set.
-                    ALPAKA_CUDA_RT_CHECK(
-                        cudaMemsetAsync(
-                            dstNativePtr,
-                            static_cast<int>(byte),
-                            static_cast<size_t>(extentWidthBytes),
-                            queue.m_spQueueImpl->m_CudaQueue));
-
-                    ALPAKA_CUDA_RT_CHECK(
-                        cudaStreamSynchronize(
-                            queue.m_spQueueImpl->m_CudaQueue));
-                }
-            };
-            //#############################################################################
-            //! The CUDA non-blocking device queue 2D set enqueue trait specialization.
-            template<
-                typename TView,
-                typename TExtent>
-            struct Enqueue<
-                queue::QueueCudaRtNonBlocking,
-                mem::view::cuda::detail::TaskSetCuda<dim::DimInt<2u>, TView, TExtent>>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto enqueue(
-                    queue::QueueCudaRtNonBlocking & queue,
-                    mem::view::cuda::detail::TaskSetCuda<dim::DimInt<2u>, TView, TExtent> const & task)
-                -> void
-                {
-                    ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                    static_assert(
-                        dim::Dim<TView>::value == 2u,
-                        "The destination buffer is required to be 2-dimensional for this specialization!");
-                    static_assert(
-                        dim::Dim<TView>::value == dim::Dim<TExtent>::value,
-                        "The destination buffer and the extent are required to have the same dimensionality!");
-
-                    using Idx = idx::Idx<TExtent>;
-
-                    auto & view(task.m_view);
-                    auto const & byte(task.m_byte);
-                    auto const & extent(task.m_extent);
-                    auto const & iDevice(task.m_iDevice);
-
-                    auto const extentWidth(extent::getWidth(extent));
-                    auto const extentHeight(extent::getHeight(extent));
-
-                    if(extentWidth == 0 || extentHeight == 0)
-                    {
-                        return;
-                    }
-
-                    auto const extentWidthBytes(extentWidth * static_cast<Idx>(sizeof(elem::Elem<TView>)));
-
-#if !defined(NDEBUG)
-                    auto const dstWidth(extent::getWidth(view));
-                    auto const dstHeight(extent::getHeight(view));
-#endif
-                    auto const dstPitchBytesX(mem::view::getPitchBytes<dim::Dim<TView>::value - 1u>(view));
-                    auto const dstNativePtr(reinterpret_cast<void *>(mem::view::getPtrNative(view)));
-                    ALPAKA_ASSERT(extentWidth <= dstWidth);
-                    ALPAKA_ASSERT(extentHeight <= dstHeight);
-
-                    // Set the current device.
-                    ALPAKA_CUDA_RT_CHECK(
-                        cudaSetDevice(
-                            iDevice));
-                    // Initiate the memory set.
-                    ALPAKA_CUDA_RT_CHECK(
-                        cudaMemset2DAsync(
-                            dstNativePtr,
-                            static_cast<size_t>(dstPitchBytesX),
-                            static_cast<int>(byte),
-                            static_cast<size_t>(extentWidthBytes),
-                            static_cast<size_t>(extentHeight),
-                            queue.m_spQueueImpl->m_CudaQueue));
-                }
-            };
-            //#############################################################################
-            //! The CUDA blocking device queue 2D set enqueue trait specialization.
-            template<
-                typename TView,
-                typename TExtent>
-            struct Enqueue<
-                queue::QueueCudaRtBlocking,
-                mem::view::cuda::detail::TaskSetCuda<dim::DimInt<2u>, TView, TExtent>>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto enqueue(
-                    queue::QueueCudaRtBlocking & queue,
-                    mem::view::cuda::detail::TaskSetCuda<dim::DimInt<2u>, TView, TExtent> const & task)
-                -> void
-                {
-                    ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                    static_assert(
-                        dim::Dim<TView>::value == 2u,
-                        "The destination buffer is required to be 2-dimensional for this specialization!");
-                    static_assert(
-                        dim::Dim<TView>::value == dim::Dim<TExtent>::value,
-                        "The destination buffer and the extent are required to have the same dimensionality!");
-
-                    using Idx = idx::Idx<TExtent>;
-
-                    auto & view(task.m_view);
-                    auto const & byte(task.m_byte);
-                    auto const & extent(task.m_extent);
-                    auto const & iDevice(task.m_iDevice);
-
-                    auto const extentWidth(extent::getWidth(extent));
-                    auto const extentHeight(extent::getHeight(extent));
-
-                    if(extentWidth == 0 || extentHeight == 0)
-                    {
-                        return;
-                    }
-
-                    auto const extentWidthBytes(extentWidth * static_cast<Idx>(sizeof(elem::Elem<TView>)));
-
-#if !defined(NDEBUG)
-                    auto const dstWidth(extent::getWidth(view));
-                    auto const dstHeight(extent::getHeight(view));
-#endif
-                    auto const dstPitchBytesX(mem::view::getPitchBytes<dim::Dim<TView>::value - 1u>(view));
-                    auto const dstNativePtr(reinterpret_cast<void *>(mem::view::getPtrNative(view)));
-                    ALPAKA_ASSERT(extentWidth <= dstWidth);
-                    ALPAKA_ASSERT(extentHeight <= dstHeight);
-
-                    // Set the current device.
-                    ALPAKA_CUDA_RT_CHECK(
-                        cudaSetDevice(
-                            iDevice));
-
-                    // Initiate the memory set.
-                    ALPAKA_CUDA_RT_CHECK(
-                        cudaMemset2DAsync(
-                            dstNativePtr,
-                            static_cast<size_t>(dstPitchBytesX),
-                            static_cast<int>(byte),
-                            static_cast<size_t>(extentWidthBytes),
-                            static_cast<size_t>(extentHeight),
-                            queue.m_spQueueImpl->m_CudaQueue));
-
-                    ALPAKA_CUDA_RT_CHECK(
-                        cudaStreamSynchronize(
-                            queue.m_spQueueImpl->m_CudaQueue));
-                }
-            };
-            //#############################################################################
-            //! The CUDA non-blocking device queue 3D set enqueue trait specialization.
-            template<
-                typename TView,
-                typename TExtent>
-            struct Enqueue<
-                queue::QueueCudaRtNonBlocking,
-                mem::view::cuda::detail::TaskSetCuda<dim::DimInt<3u>, TView, TExtent>>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto enqueue(
-                    queue::QueueCudaRtNonBlocking & queue,
-                    mem::view::cuda::detail::TaskSetCuda<dim::DimInt<3u>, TView, TExtent> const & task)
-                -> void
-                {
-                    ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                    static_assert(
-                        dim::Dim<TView>::value == 3u,
-                        "The destination buffer is required to be 3-dimensional for this specialization!");
-                    static_assert(
-                        dim::Dim<TView>::value == dim::Dim<TExtent>::value,
-                        "The destination buffer and the extent are required to have the same dimensionality!");
-
-                    using Elem = alpaka::elem::Elem<TView>;
-                    using Idx = idx::Idx<TExtent>;
-
-                    auto & view(task.m_view);
-                    auto const & byte(task.m_byte);
-                    auto const & extent(task.m_extent);
-                    auto const & iDevice(task.m_iDevice);
-
-                    auto const extentWidth(extent::getWidth(extent));
-                    auto const extentHeight(extent::getHeight(extent));
-                    auto const extentDepth(extent::getDepth(extent));
-
-                    // This is not only an optimization but also prevents a division by zero.
-                    if(extentWidth == 0 || extentHeight == 0 || extentDepth == 0)
-                    {
-                        return;
-                    }
-
-                    auto const dstWidth(extent::getWidth(view));
-#if !defined(NDEBUG)
-                    auto const dstHeight(extent::getHeight(view));
-                    auto const dstDepth(extent::getDepth(view));
-#endif
-                    auto const dstPitchBytesX(mem::view::getPitchBytes<dim::Dim<TView>::value - 1u>(view));
-                    auto const dstPitchBytesY(mem::view::getPitchBytes<dim::Dim<TView>::value - (2u % dim::Dim<TView>::value)>(view));
-                    auto const dstNativePtr(reinterpret_cast<void *>(mem::view::getPtrNative(view)));
-                    ALPAKA_ASSERT(extentWidth <= dstWidth);
-                    ALPAKA_ASSERT(extentHeight <= dstHeight);
-                    ALPAKA_ASSERT(extentDepth <= dstDepth);
-
-                    // Fill CUDA parameter structures.
-                    cudaPitchedPtr const cudaPitchedPtrVal(
-                        make_cudaPitchedPtr(
-                            dstNativePtr,
-                            static_cast<size_t>(dstPitchBytesX),
-                            static_cast<size_t>(dstWidth * static_cast<Idx>(sizeof(Elem))),
-                            static_cast<size_t>(dstPitchBytesY / dstPitchBytesX)));
-
-                    cudaExtent const cudaExtentVal(
-                        make_cudaExtent(
-                            static_cast<size_t>(extentWidth * static_cast<Idx>(sizeof(Elem))),
-                            static_cast<size_t>(extentHeight),
-                            static_cast<size_t>(extentDepth)));
-
-                    // Set the current device.
-                    ALPAKA_CUDA_RT_CHECK(
-                        cudaSetDevice(
-                            iDevice));
-                    // Initiate the memory set.
-                    ALPAKA_CUDA_RT_CHECK(
-                        cudaMemset3DAsync(
-                            cudaPitchedPtrVal,
-                            static_cast<int>(byte),
-                            cudaExtentVal,
-                            queue.m_spQueueImpl->m_CudaQueue));
-                }
-            };
-            //#############################################################################
-            //! The CUDA blocking device queue 3D set enqueue trait specialization.
-            template<
-                typename TView,
-                typename TExtent>
-            struct Enqueue<
-                queue::QueueCudaRtBlocking,
-                mem::view::cuda::detail::TaskSetCuda<dim::DimInt<3u>, TView, TExtent>>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto enqueue(
-                    queue::QueueCudaRtBlocking & queue,
-                    mem::view::cuda::detail::TaskSetCuda<dim::DimInt<3u>, TView, TExtent> const & task)
-                -> void
-                {
-                    ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                    static_assert(
-                        dim::Dim<TView>::value == 3u,
-                        "The destination buffer is required to be 3-dimensional for this specialization!");
-                    static_assert(
-                        dim::Dim<TView>::value == dim::Dim<TExtent>::value,
-                        "The destination buffer and the extent are required to have the same dimensionality!");
-
-                    using Elem = alpaka::elem::Elem<TView>;
-                    using Idx = idx::Idx<TExtent>;
-
-                    auto & view(task.m_view);
-                    auto const & byte(task.m_byte);
-                    auto const & extent(task.m_extent);
-                    auto const & iDevice(task.m_iDevice);
-
-                    auto const extentWidth(extent::getWidth(extent));
-                    auto const extentHeight(extent::getHeight(extent));
-                    auto const extentDepth(extent::getDepth(extent));
-
-                    // This is not only an optimization but also prevents a division by zero.
-                    if(extentWidth == 0 || extentHeight == 0 || extentDepth == 0)
-                    {
-                        return;
-                    }
-
-                    auto const dstWidth(extent::getWidth(view));
-#if !defined(NDEBUG)
-                    auto const dstHeight(extent::getHeight(view));
-                    auto const dstDepth(extent::getDepth(view));
-#endif
-                    auto const dstPitchBytesX(mem::view::getPitchBytes<dim::Dim<TView>::value - 1u>(view));
-                    auto const dstPitchBytesY(mem::view::getPitchBytes<dim::Dim<TView>::value - (2u % dim::Dim<TView>::value)>(view));
-                    auto const dstNativePtr(reinterpret_cast<void *>(mem::view::getPtrNative(view)));
-                    ALPAKA_ASSERT(extentWidth <= dstWidth);
-                    ALPAKA_ASSERT(extentHeight <= dstHeight);
-                    ALPAKA_ASSERT(extentDepth <= dstDepth);
-
-                    // Fill CUDA parameter structures.
-                    cudaPitchedPtr const cudaPitchedPtrVal(
-                        make_cudaPitchedPtr(
-                            dstNativePtr,
-                            static_cast<size_t>(dstPitchBytesX),
-                            static_cast<size_t>(dstWidth * static_cast<Idx>(sizeof(Elem))),
-                            static_cast<size_t>(dstPitchBytesY / dstPitchBytesX)));
-
-                    cudaExtent const cudaExtentVal(
-                        make_cudaExtent(
-                            static_cast<size_t>(extentWidth * static_cast<Idx>(sizeof(Elem))),
-                            static_cast<size_t>(extentHeight),
-                            static_cast<size_t>(extentDepth)));
-
-                    // Set the current device.
-                    ALPAKA_CUDA_RT_CHECK(
-                        cudaSetDevice(
-                            iDevice));
-                    // Initiate the memory set.
-                    ALPAKA_CUDA_RT_CHECK(
-                        cudaMemset3DAsync(
-                            cudaPitchedPtrVal,
-                            static_cast<int>(byte),
-                            cudaExtentVal,
-                            queue.m_spQueueImpl->m_CudaQueue));
-
-                    ALPAKA_CUDA_RT_CHECK(
-                        cudaStreamSynchronize(
-                            queue.m_spQueueImpl->m_CudaQueue));
-                }
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/mem/buf/hip/Copy.hpp b/thirdParty/cupla/alpaka/include/alpaka/mem/buf/hip/Copy.hpp
deleted file mode 100644
index a77369b404..0000000000
--- a/thirdParty/cupla/alpaka/include/alpaka/mem/buf/hip/Copy.hpp
+++ /dev/null
@@ -1,1015 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Erik Zenker, Matthias Werner
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_HIP
-    #error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
-#endif
-
-#include <alpaka/queue/QueueHipRtBlocking.hpp>
-#include <alpaka/queue/QueueHipRtNonBlocking.hpp>
-
-#include <alpaka/dev/DevCpu.hpp>
-#include <alpaka/dev/DevHipRt.hpp>
-#include <alpaka/dim/DimIntegralConst.hpp>
-#include <alpaka/extent/Traits.hpp>
-#include <alpaka/mem/view/Traits.hpp>
-
-#include <alpaka/core/Assert.hpp>
-#include <alpaka/core/Hip.hpp>
-
-#include <set>
-#include <tuple>
-
-
-namespace alpaka
-{
-    namespace mem
-    {
-        namespace view
-        {
-            namespace hip
-            {
-                namespace detail
-                {
-                    //#############################################################################
-                    //! The HIP memory copy trait.
-                    template<
-                        typename TDim,
-                        typename TViewDst,
-                        typename TViewSrc,
-                        typename TExtent>
-                    struct TaskCopyHip;
-
-                    //#############################################################################
-                    //! The 1D HIP memory copy trait.
-                    template<
-                        typename TViewDst,
-                        typename TViewSrc,
-                        typename TExtent>
-                    struct TaskCopyHip<
-                        dim::DimInt<1>,
-                        TViewDst,
-                        TViewSrc,
-                        TExtent>
-                    {
-                        static_assert(
-                            !std::is_const<TViewDst>::value,
-                            "The destination view can not be const!");
-                        static_assert(
-                            dim::Dim<TViewDst>::value == dim::Dim<TViewSrc>::value,
-                            "The source and the destination view are required to have the same dimensionality!");
-                        static_assert(
-                            dim::Dim<TViewDst>::value == dim::Dim<TExtent>::value,
-                            "The views and the extent are required to have the same dimensionality!");
-                        // TODO: Maybe check for Size of TViewDst and TViewSrc to have greater or equal range than TExtent.
-                        static_assert(
-                            std::is_same<elem::Elem<TViewDst>, typename std::remove_const<elem::Elem<TViewSrc>>::type>::value,
-                            "The source and the destination view are required to have the same element type!");
-
-                        using Idx = idx::Idx<TExtent>;
-
-                        //-----------------------------------------------------------------------------
-                        ALPAKA_FN_HOST TaskCopyHip(
-                            TViewDst & viewDst,
-                            TViewSrc const & viewSrc,
-                            TExtent const & extent,
-                            hipMemcpyKind const & hipMemCpyKind,
-                            int const & iDstDevice,
-                            int const & iSrcDevice) :
-                                m_hipMemCpyKind(hipMemCpyKind),
-                                m_iDstDevice(iDstDevice),
-                                m_iSrcDevice(iSrcDevice),
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                                m_extentWidth(extent::getWidth(extent)),
-                                m_dstWidth(static_cast<Idx>(extent::getWidth(viewDst))),
-                                m_srcWidth(static_cast<Idx>(extent::getWidth(viewSrc))),
-#endif
-                                m_extentWidthBytes(extent::getWidth(extent) * static_cast<Idx>(sizeof(elem::Elem<TViewDst>))),
-                                m_dstMemNative(reinterpret_cast<void *>(mem::view::getPtrNative(viewDst))),
-                                m_srcMemNative(reinterpret_cast<void const *>(mem::view::getPtrNative(viewSrc)))
-                        {
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                            ALPAKA_ASSERT(m_extentWidth <= m_dstWidth);
-                            ALPAKA_ASSERT(m_extentWidth <= m_srcWidth);
-#endif
-                        }
-
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                        //-----------------------------------------------------------------------------
-                        ALPAKA_FN_HOST auto printDebug() const
-                        -> void
-                        {
-                            std::cout << __func__
-                                << " ddev: " << m_iDstDevice
-                                << " ew: " << m_extentWidth
-                                << " ewb: " << m_extentWidthBytes
-                                << " dw: " << m_dstWidth
-                                << " dptr: " << m_dstMemNative
-                                << " sdev: " << m_iSrcDevice
-                                << " sw: " << m_srcWidth
-                                << " sptr: " << m_srcMemNative
-                                << std::endl;
-                        }
-#endif
-                        hipMemcpyKind m_hipMemCpyKind;
-                        int m_iDstDevice;
-                        int m_iSrcDevice;
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                        Idx m_extentWidth;
-                        Idx m_dstWidth;
-                        Idx m_srcWidth;
-#endif
-                        Idx m_extentWidthBytes;
-                        void * m_dstMemNative;
-                        void const * m_srcMemNative;
-                    };
-                    //#############################################################################
-                    //! The 2D HIP memory copy trait.
-                    template<
-                        typename TViewDst,
-                        typename TViewSrc,
-                        typename TExtent>
-                    struct TaskCopyHip<
-                        dim::DimInt<2>,
-                        TViewDst,
-                        TViewSrc,
-                        TExtent>
-                    {
-                        static_assert(
-                            !std::is_const<TViewDst>::value,
-                            "The destination view can not be const!");
-                        static_assert(
-                            dim::Dim<TViewDst>::value == dim::Dim<TViewSrc>::value,
-                            "The source and the destination view are required to have the same dimensionality!");
-                        static_assert(
-                            dim::Dim<TViewDst>::value == dim::Dim<TExtent>::value,
-                            "The views and the extent are required to have the same dimensionality!");
-                        // TODO: Maybe check for Size of TViewDst and TViewSrc to have greater or equal range than TExtent.
-                        static_assert(
-                            std::is_same<elem::Elem<TViewDst>, typename std::remove_const<elem::Elem<TViewSrc>>::type>::value,
-                            "The source and the destination view are required to have the same element type!");
-
-                        using Idx = idx::Idx<TExtent>;
-
-                        //-----------------------------------------------------------------------------
-                        ALPAKA_FN_HOST TaskCopyHip(
-                            TViewDst & viewDst,
-                            TViewSrc const & viewSrc,
-                            TExtent const & extent,
-                            hipMemcpyKind const & hipMemCpyKind,
-                            int const & iDstDevice,
-                            int const & iSrcDevice) :
-                                m_hipMemCpyKind(hipMemCpyKind),
-                                m_iDstDevice(iDstDevice),
-                                m_iSrcDevice(iSrcDevice),
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                                m_extentWidth(extent::getWidth(extent)),
-#endif
-                                m_extentWidthBytes(extent::getWidth(extent) * static_cast<Idx>(sizeof(elem::Elem<TViewDst>))),
-                                m_dstWidth(static_cast<Idx>(extent::getWidth(viewDst))),      // required for 3D peer copy
-                                m_srcWidth(static_cast<Idx>(extent::getWidth(viewSrc))),      // required for 3D peer copy
-
-                                m_extentHeight(extent::getHeight(extent)),
-                                m_dstHeight(static_cast<Idx>(extent::getHeight(viewDst))),    // required for 3D peer copy
-                                m_srcHeight(static_cast<Idx>(extent::getHeight(viewSrc))),    // required for 3D peer copy
-
-                                m_dstpitchBytesX(static_cast<Idx>(mem::view::getPitchBytes<dim::Dim<TViewDst>::value - 1u>(viewDst))),
-                                m_srcpitchBytesX(static_cast<Idx>(mem::view::getPitchBytes<dim::Dim<TViewSrc>::value - 1u>(viewSrc))),
-                                m_dstPitchBytesY(static_cast<Idx>(mem::view::getPitchBytes<dim::Dim<TViewDst>::value - (2u % dim::Dim<TViewDst>::value)>(viewDst))),
-                                m_srcPitchBytesY(static_cast<Idx>(mem::view::getPitchBytes<dim::Dim<TViewSrc>::value - (2u % dim::Dim<TViewDst>::value)>(viewSrc))),
-
-                                m_dstMemNative(reinterpret_cast<void *>(mem::view::getPtrNative(viewDst))),
-                                m_srcMemNative(reinterpret_cast<void const *>(mem::view::getPtrNative(viewSrc)))
-                        {
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                            ALPAKA_ASSERT(m_extentWidth <= m_dstWidth);
-                            ALPAKA_ASSERT(m_extentHeight <= m_dstHeight);
-                            ALPAKA_ASSERT(m_extentWidth <= m_srcWidth);
-                            ALPAKA_ASSERT(m_extentHeight <= m_srcHeight);
-                            ALPAKA_ASSERT(m_extentWidthBytes <= m_dstpitchBytesX);
-                            ALPAKA_ASSERT(m_extentWidthBytes <= m_srcpitchBytesX);
-#endif
-                        }
-
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                        //-----------------------------------------------------------------------------
-                        //!
-                        //-----------------------------------------------------------------------------
-                        ALPAKA_FN_HOST auto printDebug() const
-                        -> void
-                        {
-                            std::cout << __func__
-                                << " ew: " << m_extentWidth
-                                << " eh: " << m_extentHeight
-                                << " ewb: " << m_extentWidthBytes
-                                << " ddev: " << m_iDstDevice
-                                << " dw: " << m_dstWidth
-                                << " dh: " << m_dstHeight
-                                << " dptr: " << m_dstMemNative
-                                << " dpitchb: " << m_dstpitchBytesX
-                                << " sdev: " << m_iSrcDevice
-                                << " sw: " << m_srcWidth
-                                << " sh: " << m_srcHeight
-                                << " sptr: " << m_srcMemNative
-                                << " spitchb: " << m_srcpitchBytesX
-                                << std::endl;
-                        }
-#endif
-                        hipMemcpyKind m_hipMemCpyKind;
-                        int m_iDstDevice;
-                        int m_iSrcDevice;
-
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                        Idx m_extentWidth;
-#endif
-                        Idx m_extentWidthBytes;
-                        Idx m_dstWidth;          // required for 3D peer copy
-                        Idx m_srcWidth;          // required for 3D peer copy
-
-                        Idx m_extentHeight;
-                        Idx m_dstHeight;         // required for 3D peer copy
-                        Idx m_srcHeight;         // required for 3D peer copy
-
-                        Idx m_dstpitchBytesX;
-                        Idx m_srcpitchBytesX;
-                        Idx m_dstPitchBytesY;
-                        Idx m_srcPitchBytesY;
-
-
-                        void * m_dstMemNative;
-                        void const * m_srcMemNative;
-                    };
-                    //#############################################################################
-                    //! The 3D HIP memory copy trait.
-                    template<
-                        typename TViewDst,
-                        typename TViewSrc,
-                        typename TExtent>
-                    struct TaskCopyHip<
-                        dim::DimInt<3>,
-                        TViewDst,
-                        TViewSrc,
-                        TExtent>
-                    {
-                        static_assert(
-                            !std::is_const<TViewDst>::value,
-                            "The destination view can not be const!");
-                        static_assert(
-                            dim::Dim<TViewDst>::value == dim::Dim<TViewSrc>::value,
-                            "The source and the destination view are required to have the same dimensionality!");
-                        static_assert(
-                            dim::Dim<TViewDst>::value == dim::Dim<TExtent>::value,
-                            "The views and the extent are required to have the same dimensionality!");
-                        // TODO: Maybe check for Size of TViewDst and TViewSrc to have greater or equal range than TExtent.
-                        static_assert(
-                            std::is_same<elem::Elem<TViewDst>, typename std::remove_const<elem::Elem<TViewSrc>>::type>::value,
-                            "The source and the destination view are required to have the same element type!");
-
-                        using Idx = idx::Idx<TExtent>;
-
-                        //-----------------------------------------------------------------------------
-                        ALPAKA_FN_HOST TaskCopyHip(
-                            TViewDst & viewDst,
-                            TViewSrc const & viewSrc,
-                            TExtent const & extent,
-                            hipMemcpyKind const & hipMemCpyKind,
-                            int const & iDstDevice,
-                            int const & iSrcDevice) :
-                                m_hipMemCpyKind(hipMemCpyKind),
-
-                                m_iDstDevice(iDstDevice),
-                                m_iSrcDevice(iSrcDevice),
-
-                                m_extentWidth(extent::getWidth(extent)),
-                                m_extentWidthBytes(m_extentWidth * static_cast<Idx>(sizeof(elem::Elem<TViewDst>))),
-                                m_dstWidth(static_cast<Idx>(extent::getWidth(viewDst))),
-                                m_srcWidth(static_cast<Idx>(extent::getWidth(viewSrc))),
-
-                                m_extentHeight(extent::getHeight(extent)),
-                                m_dstHeight(static_cast<Idx>(extent::getHeight(viewDst))),
-                                m_srcHeight(static_cast<Idx>(extent::getHeight(viewSrc))),
-
-                                m_extentDepth(extent::getDepth(extent)),
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                                m_dstDepth(static_cast<Idx>(extent::getDepth(viewDst))),
-                                m_srcDepth(static_cast<Idx>(extent::getDepth(viewSrc))),
-#endif
-                                m_dstpitchBytesX(static_cast<Idx>(mem::view::getPitchBytes<dim::Dim<TViewDst>::value - 1u>(viewDst))),
-                                m_srcpitchBytesX(static_cast<Idx>(mem::view::getPitchBytes<dim::Dim<TViewSrc>::value - 1u>(viewSrc))),
-                                m_dstPitchBytesY(static_cast<Idx>(mem::view::getPitchBytes<dim::Dim<TViewDst>::value - (2u % dim::Dim<TViewDst>::value)>(viewDst))),
-                                m_srcPitchBytesY(static_cast<Idx>(mem::view::getPitchBytes<dim::Dim<TViewSrc>::value - (2u % dim::Dim<TViewDst>::value)>(viewSrc))),
-
-
-                                m_dstMemNative(reinterpret_cast<void *>(mem::view::getPtrNative(viewDst))),
-                                m_srcMemNative(reinterpret_cast<void const *>(mem::view::getPtrNative(viewSrc)))
-                        {
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                            ALPAKA_ASSERT(m_extentWidth <= m_dstWidth);
-                            ALPAKA_ASSERT(m_extentHeight <= m_dstHeight);
-                            ALPAKA_ASSERT(m_extentDepth <= m_dstDepth);
-                            ALPAKA_ASSERT(m_extentWidth <= m_srcWidth);
-                            ALPAKA_ASSERT(m_extentHeight <= m_srcHeight);
-                            ALPAKA_ASSERT(m_extentDepth <= m_srcDepth);
-                            ALPAKA_ASSERT(m_extentWidthBytes <= m_dstpitchBytesX);
-                            ALPAKA_ASSERT(m_extentWidthBytes <= m_srcpitchBytesX);
-#endif
-                        }
-
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                        //-----------------------------------------------------------------------------
-                        ALPAKA_FN_HOST auto printDebug() const
-                        -> void
-                        {
-                            std::cout << __func__
-                                << " ew: " << m_extentWidth
-                                << " eh: " << m_extentHeight
-                                << " ed: " << m_extentDepth
-                                << " ewb: " << m_extentWidthBytes
-                                << " ddev: " << m_iDstDevice
-                                << " dw: " << m_dstWidth
-                                << " dh: " << m_dstHeight
-                                << " dd: " << m_dstDepth
-                                << " dptr: " << m_dstMemNative
-                                << " dpitchb: " << m_dstpitchBytesX
-                                << " sdev: " << m_iSrcDevice
-                                << " sw: " << m_srcWidth
-                                << " sh: " << m_srcHeight
-                                << " sd: " << m_srcDepth
-                                << " sptr: " << m_srcMemNative
-                                << " spitchb: " << m_srcpitchBytesX
-                                << std::endl;
-                        }
-#endif
-                        hipMemcpyKind m_hipMemCpyKind;
-
-                        int m_iDstDevice;
-                        int m_iSrcDevice;
-
-                        Idx m_extentWidth;
-                        Idx m_extentWidthBytes;
-                        Idx m_dstWidth;
-                        Idx m_srcWidth;
-
-                        Idx m_extentHeight;
-                        Idx m_dstHeight;
-                        Idx m_srcHeight;
-
-                        Idx m_extentDepth;
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                        Idx m_dstDepth;
-                        Idx m_srcDepth;
-#endif
-                        Idx m_dstpitchBytesX;
-                        Idx m_srcpitchBytesX;
-                        Idx m_dstPitchBytesY;
-                        Idx m_srcPitchBytesY;
-
-                        void * m_dstMemNative;
-                        void const * m_srcMemNative;
-                    };
-
-                    //-----------------------------------------------------------------------------
-                    //! Not being able to enable peer access does not prevent such device to device memory copies.
-                    //! However, those copies may be slower because the memory is copied via the CPU.
-                    inline auto enablePeerAccessIfPossible(
-                        const int & devSrc,
-                        const int & devDst)
-                    -> void
-                    {
-                        ALPAKA_ASSERT(devSrc != devDst);
-
-#if BOOST_COMP_CLANG
-    #pragma clang diagnostic push
-    #pragma clang diagnostic ignored "-Wexit-time-destructors"
-#endif
-                        static std::set<std::pair<int, int>> alreadyCheckedPeerAccessDevices;
-#if BOOST_COMP_CLANG
-    #pragma clang diagnostic pop
-#endif
-                        auto const devicePair = std::make_pair(devSrc, devDst);
-
-                        if(alreadyCheckedPeerAccessDevices.find(devicePair) == alreadyCheckedPeerAccessDevices.end())
-                        {
-                            alreadyCheckedPeerAccessDevices.insert(devicePair);
-
-                            int canAccessPeer = 0;
-                            ALPAKA_HIP_RT_CHECK(hipDeviceCanAccessPeer(&canAccessPeer, devSrc, devDst));
-                            if(!canAccessPeer) {
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                            std::cout << __func__
-                                << " Direct peer access between given GPUs is not possible!"
-                                << " src=" << devSrc
-                                << " dst=" << devDst
-                                << std::endl;
-#endif
-                                return;
-                            }
-
-                            ALPAKA_HIP_RT_CHECK(hipSetDevice(devSrc));
-                            // NOTE: "until access is explicitly disabled using hipDeviceDisablePeerAccess() or either device is reset using hipDeviceReset()."
-                            // We do not remove a device from the enabled device pairs on hipDeviceReset.
-                            // Note that access granted by this call is unidirectional and that in order to access memory on the current device from peerDevice, a separate symmetric call to hipDeviceEnablePeerAccess() is required.
-                            ALPAKA_HIP_RT_CHECK(hipDeviceEnablePeerAccess(devDst, 0));
-                        }
-                    }
-                }
-            }
-
-            //-----------------------------------------------------------------------------
-            // Trait specializations for CreateTaskCopy.
-            namespace traits
-            {
-                //#############################################################################
-                //! The HIP to CPU memory copy trait specialization.
-                template<
-                    typename TDim>
-                struct CreateTaskCopy<
-                    TDim,
-                    dev::DevCpu,
-                    dev::DevHipRt>
-                {
-                    //-----------------------------------------------------------------------------
-                    template<
-                        typename TExtent,
-                        typename TViewSrc,
-                        typename TViewDst>
-                    ALPAKA_FN_HOST static auto createTaskCopy(
-                        TViewDst & viewDst,
-                        TViewSrc const & viewSrc,
-                        TExtent const & extent)
-                    -> mem::view::hip::detail::TaskCopyHip<
-                        TDim,
-                        TViewDst,
-                        TViewSrc,
-                        TExtent>
-                    {
-                        ALPAKA_DEBUG_FULL_LOG_SCOPE;
-
-                        auto const iDevice(
-                            dev::getDev(viewSrc).m_iDevice);
-
-                        return
-                            mem::view::hip::detail::TaskCopyHip<
-                                TDim,
-                                TViewDst,
-                                TViewSrc,
-                                TExtent>(
-                                    viewDst,
-                                    viewSrc,
-                                    extent,
-                                    hipMemcpyDeviceToHost,
-                                    iDevice,
-                                    iDevice);
-                    }
-                };
-                //#############################################################################
-                //! The CPU to HIP memory copy trait specialization.
-                //#############################################################################
-                template<
-                    typename TDim>
-                struct CreateTaskCopy<
-                    TDim,
-                    dev::DevHipRt,
-                    dev::DevCpu>
-                {
-                    //-----------------------------------------------------------------------------
-                    template<
-                        typename TExtent,
-                        typename TViewSrc,
-                        typename TViewDst>
-                    ALPAKA_FN_HOST static auto createTaskCopy(
-                        TViewDst & viewDst,
-                        TViewSrc const & viewSrc,
-                        TExtent const & extent)
-                    -> mem::view::hip::detail::TaskCopyHip<
-                        TDim,
-                        TViewDst,
-                        TViewSrc,
-                        TExtent>
-                    {
-                        ALPAKA_DEBUG_FULL_LOG_SCOPE;
-
-                        auto const iDevice(
-                            dev::getDev(viewDst).m_iDevice);
-
-                        return
-                            mem::view::hip::detail::TaskCopyHip<
-                                TDim,
-                                TViewDst,
-                                TViewSrc,
-                                TExtent>(
-                                    viewDst,
-                                    viewSrc,
-                                    extent,
-                                    hipMemcpyHostToDevice,
-                                    iDevice,
-                                    iDevice);
-                    }
-                };
-                //#############################################################################
-                //! The HIP to HIP memory copy trait specialization.
-                //#############################################################################
-                template<
-                    typename TDim>
-                struct CreateTaskCopy<
-                    TDim,
-                    dev::DevHipRt,
-                    dev::DevHipRt>
-                {
-                    //-----------------------------------------------------------------------------
-                    template<
-                        typename TExtent,
-                        typename TViewSrc,
-                        typename TViewDst>
-                    ALPAKA_FN_HOST static auto createTaskCopy(
-                        TViewDst & viewDst,
-                        TViewSrc const & viewSrc,
-                        TExtent const & extent)
-                    -> mem::view::hip::detail::TaskCopyHip<
-                        TDim,
-                        TViewDst,
-                        TViewSrc,
-                        TExtent>
-                    {
-                        ALPAKA_DEBUG_FULL_LOG_SCOPE;
-
-                        return
-                            mem::view::hip::detail::TaskCopyHip<
-                                TDim,
-                                TViewDst,
-                                TViewSrc,
-                                TExtent>(
-                                    viewDst,
-                                    viewSrc,
-                                    extent,
-                                    hipMemcpyDeviceToDevice,
-                                    dev::getDev(viewDst).m_iDevice,
-                                    dev::getDev(viewSrc).m_iDevice);
-                    }
-                };
-            }
-            namespace hip
-            {
-                namespace detail
-                {
-                    //-----------------------------------------------------------------------------
-                    template<
-                        typename TExtent,
-                        typename TViewSrc,
-                        typename TViewDst>
-                    ALPAKA_FN_HOST auto buildHipMemcpy3DParms(
-                        mem::view::hip::detail::TaskCopyHip<dim::DimInt<3>, TViewDst, TViewSrc, TExtent> const & task)
-                    -> hipMemcpy3DParms
-                    {
-                        ALPAKA_DEBUG_FULL_LOG_SCOPE;
-
-                        auto const & extentWidthBytes(task.m_extentWidthBytes);
-                        auto const & dstWidth(task.m_dstWidth);
-                        auto const & srcWidth(task.m_srcWidth);
-
-                        auto const & extentHeight(task.m_extentHeight);
-                        //auto const & dstHeight(task.m_dstHeight);
-                        //auto const & srcHeight(task.m_srcHeight);
-
-                        auto const & extentDepth(task.m_extentDepth);
-
-                        auto const & dstPitchBytesX(task.m_dstpitchBytesX);
-                        auto const & srcPitchBytesX(task.m_srcpitchBytesX);
-                        auto const & dstPitchBytesY(task.m_dstPitchBytesY);
-                        auto const & srcPitchBytesY(task.m_srcPitchBytesY);
-
-                        auto const & dstNativePtr(task.m_dstMemNative);
-                        auto const & srcNativePtr(task.m_srcMemNative);
-
-                        // Fill HIP parameter structure.
-                        hipMemcpy3DParms hipMemCpy3DParms;
-                        hipMemCpy3DParms.srcArray = nullptr;  // Either srcArray or srcPtr.
-                        hipMemCpy3DParms.srcPos = make_hipPos(0, 0, 0);  // Optional. Offset in bytes.
-                        hipMemCpy3DParms.srcPtr =
-                            make_hipPitchedPtr(
-                                const_cast<void *>(srcNativePtr),
-                                static_cast<std::size_t>(srcPitchBytesX),
-                                static_cast<std::size_t>(srcWidth),
-                                static_cast<std::size_t>(srcPitchBytesY/srcPitchBytesX));
-                        hipMemCpy3DParms.dstArray = nullptr;  // Either dstArray or dstPtr.
-                        hipMemCpy3DParms.dstPos = make_hipPos(0, 0, 0);  // Optional. Offset in bytes.
-                        hipMemCpy3DParms.dstPtr =
-                            make_hipPitchedPtr(
-                                dstNativePtr,
-                                static_cast<std::size_t>(dstPitchBytesX),
-                                static_cast<std::size_t>(dstWidth),
-                                static_cast<std::size_t>(dstPitchBytesY/dstPitchBytesX));
-                        hipMemCpy3DParms.extent =
-                            make_hipExtent(
-                                static_cast<std::size_t>(extentWidthBytes),
-                                static_cast<std::size_t>(extentHeight),
-                                static_cast<std::size_t>(extentDepth));
-#ifdef __HIP_PLATFORM_NVCC__
-                        hipMemCpy3DParms.kind = hipMemcpyKindToCudaMemcpyKind(task.m_hipMemCpyKind);
-#else
-                        hipMemCpy3DParms.kind = task.m_hipMemCpyKind;
-#endif
-
-                        return hipMemCpy3DParms;
-                    }
-                }
-            }
-        }
-    }
-
-    namespace queue
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The HIP non-blocking device queue 1D copy enqueue trait specialization.
-            template<
-                typename TExtent,
-                typename TViewSrc,
-                typename TViewDst>
-            struct Enqueue<
-                queue::QueueHipRtNonBlocking,
-                mem::view::hip::detail::TaskCopyHip<dim::DimInt<1u>, TViewDst, TViewSrc, TExtent>>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto enqueue(
-                    queue::QueueHipRtNonBlocking & queue,
-                    mem::view::hip::detail::TaskCopyHip<dim::DimInt<1u>, TViewDst, TViewSrc, TExtent> const & task)
-                -> void
-                {
-                    ALPAKA_DEBUG_FULL_LOG_SCOPE;
-
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                    task.printDebug();
-#endif
-                    if(task.m_extentWidthBytes == 0)
-                    {
-                        return;
-                    }
-
-                    auto const & iDstDev(task.m_iDstDevice);
-                    auto const & iSrcDev(task.m_iSrcDevice);
-
-                    auto const & extentWidthBytes(task.m_extentWidthBytes);
-
-                    auto const & dstNativePtr(task.m_dstMemNative);
-                    auto const & srcNativePtr(task.m_srcMemNative);
-
-                    if(iDstDev == iSrcDev)
-                    {
-                        auto const & hipMemCpyKind(task.m_hipMemCpyKind);
-
-                        // Set the current device.
-                        ALPAKA_HIP_RT_CHECK(
-                            hipSetDevice(
-                                iDstDev));
-                        // Initiate the memory copy.
-                        ALPAKA_HIP_RT_CHECK(
-                            hipMemcpyAsync(
-                                dstNativePtr,
-                                srcNativePtr,
-                                static_cast<std::size_t>(extentWidthBytes),
-                                hipMemCpyKind,
-                                queue.m_spQueueImpl->m_HipQueue));
-                    }
-                    else
-                    {
-                        // Initiate the memory copy.
-                        ALPAKA_HIP_RT_CHECK(
-                            hipMemcpyPeerAsync(
-                                dstNativePtr,
-                                iDstDev,
-                                srcNativePtr,
-                                iSrcDev,
-                                static_cast<std::size_t>(extentWidthBytes),
-                                queue.m_spQueueImpl->m_HipQueue));
-                    }
-                }
-            };
-            //#############################################################################
-            //! The HIP blocking device queue 1D copy enqueue trait specialization.
-            template<
-                typename TExtent,
-                typename TViewSrc,
-                typename TViewDst>
-            struct Enqueue<
-                queue::QueueHipRtBlocking,
-                mem::view::hip::detail::TaskCopyHip<dim::DimInt<1u>, TViewDst, TViewSrc, TExtent>>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto enqueue(
-                    queue::QueueHipRtBlocking & queue,
-                    mem::view::hip::detail::TaskCopyHip<dim::DimInt<1u>, TViewDst, TViewSrc, TExtent> const & task)
-                -> void
-                {
-                    ALPAKA_DEBUG_FULL_LOG_SCOPE;
-
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                    task.printDebug();
-#endif
-                    if(task.m_extentWidthBytes == 0)
-                    {
-                        return;
-                    }
-                    auto const & iDstDev(task.m_iDstDevice);
-                    auto const & iSrcDev(task.m_iSrcDevice);
-
-                    auto const & extentWidthBytes(task.m_extentWidthBytes);
-
-                    auto const & dstNativePtr(task.m_dstMemNative);
-                    auto const & srcNativePtr(task.m_srcMemNative);
-
-                    if(iDstDev == iSrcDev)
-                    {
-                        auto const & hipMemCpyKind(task.m_hipMemCpyKind);
-
-                        // Set the current device.
-                        ALPAKA_HIP_RT_CHECK(
-                            hipSetDevice(
-                                iDstDev));
-                        // Initiate the memory copy.
-                        ALPAKA_HIP_RT_CHECK(
-                            hipMemcpyAsync(
-                                dstNativePtr,
-                                srcNativePtr,
-                                static_cast<std::size_t>(extentWidthBytes),
-                                hipMemCpyKind,
-                                queue.m_spQueueImpl->m_HipQueue));
-                    }
-                    else
-                    {
-                        // Initiate the memory copy.
-                        ALPAKA_HIP_RT_CHECK(
-                            hipMemcpyPeerAsync(
-                                dstNativePtr,
-                                iDstDev,
-                                srcNativePtr,
-                                iSrcDev,
-                                static_cast<std::size_t>(extentWidthBytes),
-                                queue.m_spQueueImpl->m_HipQueue));
-                    }
-
-                    ALPAKA_HIP_RT_CHECK( hipStreamSynchronize(
-                        queue.m_spQueueImpl->m_HipQueue));
-                }
-            };
-            //#############################################################################
-            //! The HIP non-blocking device queue 2D copy enqueue trait specialization.
-            template<
-                typename TExtent,
-                typename TViewSrc,
-                typename TViewDst>
-            struct Enqueue<
-                queue::QueueHipRtNonBlocking,
-                mem::view::hip::detail::TaskCopyHip<dim::DimInt<2u>, TViewDst, TViewSrc, TExtent>>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto enqueue(
-                    queue::QueueHipRtNonBlocking & queue,
-                    mem::view::hip::detail::TaskCopyHip<dim::DimInt<2u>, TViewDst, TViewSrc, TExtent> const & task)
-                -> void
-                {
-                    ALPAKA_DEBUG_FULL_LOG_SCOPE;
-
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                    task.printDebug();
-#endif
-                    // This is not only an optimization but also prevents a division by zero.
-                    if(task.m_extentWidthBytes == 0 || task.m_extentHeight == 0)
-                    {
-                        return;
-                    }
-                    auto const & iDstDev(task.m_iDstDevice);
-                    auto const & iSrcDev(task.m_iSrcDevice);
-
-                    auto const & extentWidthBytes(task.m_extentWidthBytes);
-                    auto const & extentHeight(task.m_extentHeight);
-
-                    auto const & dstPitchBytesX(task.m_dstpitchBytesX);
-                    auto const & srcPitchBytesX(task.m_srcpitchBytesX);
-
-                    auto const & dstNativePtr(task.m_dstMemNative);
-                    auto const & srcNativePtr(task.m_srcMemNative);
-
-                    auto const & hipMemCpyKind(task.m_hipMemCpyKind);
-
-                    // Set the current device.
-                    ALPAKA_HIP_RT_CHECK(
-                        hipSetDevice(
-                            iDstDev));
-
-                    if(iDstDev != iSrcDev)
-                    {
-                        // HIP relies on unified memory, so memcpy commands automatically do device-to-device transfers.
-                        // P2P access has to be enabled to avoid host transfer.
-                        // Checks if devices are connected via PCIe switch and enable P2P access then.
-                        alpaka::mem::view::hip::detail::enablePeerAccessIfPossible(iSrcDev, iDstDev);
-                    }
-
-                    // Initiate the memory copy.
-                    ALPAKA_HIP_RT_CHECK(
-                        hipMemcpy2DAsync(
-                            dstNativePtr,
-                            static_cast<std::size_t>(dstPitchBytesX),
-                            srcNativePtr,
-                            static_cast<std::size_t>(srcPitchBytesX),
-                            static_cast<std::size_t>(extentWidthBytes),
-                            static_cast<std::size_t>(extentHeight),
-                            hipMemCpyKind,
-                            queue.m_spQueueImpl->m_HipQueue));
-                }
-            };
-            //#############################################################################
-            //! The HIP blocking device queue 2D copy enqueue trait specialization.
-            template<
-                typename TExtent,
-                typename TViewSrc,
-                typename TViewDst>
-            struct Enqueue<
-                queue::QueueHipRtBlocking,
-                mem::view::hip::detail::TaskCopyHip<dim::DimInt<2u>, TViewDst, TViewSrc, TExtent>>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto enqueue(
-                    queue::QueueHipRtBlocking & queue,
-                    mem::view::hip::detail::TaskCopyHip<dim::DimInt<2u>, TViewDst, TViewSrc, TExtent> const & task)
-                -> void
-                {
-                    ALPAKA_DEBUG_FULL_LOG_SCOPE;
-
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                    task.printDebug();
-#endif
-                    // This is not only an optimization but also prevents a division by zero.
-                    if(task.m_extentWidthBytes == 0 || task.m_extentHeight == 0)
-                    {
-                        return;
-                    }
-                    auto const & iDstDev(task.m_iDstDevice);
-                    auto const & iSrcDev(task.m_iSrcDevice);
-
-                    auto const & extentWidthBytes(task.m_extentWidthBytes);
-                    auto const & extentHeight(task.m_extentHeight);
-
-                    auto const & dstPitchBytesX(task.m_dstpitchBytesX);
-                    auto const & srcPitchBytesX(task.m_srcpitchBytesX);
-
-                    auto const & dstNativePtr(task.m_dstMemNative);
-                    auto const & srcNativePtr(task.m_srcMemNative);
-
-                    auto const & hipMemCpyKind(task.m_hipMemCpyKind);
-
-                    // Set the current device.
-                    ALPAKA_HIP_RT_CHECK(
-                        hipSetDevice(
-                            iDstDev));
-
-                    if(iDstDev != iSrcDev)
-                    {
-                        // HIP relies on unified memory, so memcpy commands automatically do device-to-device transfers.
-                        // P2P access has to be enabled to avoid host transfer.
-                        // Checks if devices are connected via PCIe switch and enable P2P access then.
-                        alpaka::mem::view::hip::detail::enablePeerAccessIfPossible(iSrcDev, iDstDev);
-                    }
-
-                    ALPAKA_HIP_RT_CHECK(
-                        hipMemcpy2DAsync(
-                            dstNativePtr,
-                            static_cast<std::size_t>(dstPitchBytesX),
-                            srcNativePtr,
-                            static_cast<std::size_t>(srcPitchBytesX),
-                            static_cast<std::size_t>(extentWidthBytes),
-                            static_cast<std::size_t>(extentHeight),
-                            hipMemCpyKind,
-                            queue.m_spQueueImpl->m_HipQueue));
-
-                    ALPAKA_HIP_RT_CHECK( hipStreamSynchronize(
-                        queue.m_spQueueImpl->m_HipQueue));
-
-                }
-            };
-            //#############################################################################
-            //! The HIP non-blocking device queue 3D copy enqueue trait specialization.
-            template<
-                typename TExtent,
-                typename TViewSrc,
-                typename TViewDst>
-            struct Enqueue<
-                queue::QueueHipRtNonBlocking,
-                mem::view::hip::detail::TaskCopyHip<dim::DimInt<3u>, TViewDst, TViewSrc, TExtent>>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto enqueue(
-                    queue::QueueHipRtNonBlocking & queue,
-                    mem::view::hip::detail::TaskCopyHip<dim::DimInt<3u>, TViewDst, TViewSrc, TExtent> const & task)
-                -> void
-                {
-                    ALPAKA_DEBUG_FULL_LOG_SCOPE;
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                    task.printDebug();
-#endif
-                    // This is not only an optimization but also prevents a division by zero.
-                    if(task.m_extentWidthBytes == 0 || task.m_extentHeight == 0 || task.m_extentDepth == 0)
-                    {
-                        return;
-                    }
-                    auto const & iDstDev(task.m_iDstDevice);
-                    auto const & iSrcDev(task.m_iSrcDevice);
-
-                    // Create the struct describing the copy.
-                    hipMemcpy3DParms const hipMemCpy3DParms(
-                        mem::view::hip::detail::buildHipMemcpy3DParms(
-                            task));
-                    // Set the current device.
-                    ALPAKA_HIP_RT_CHECK(
-                        hipSetDevice(
-                            iDstDev));
-
-                    if(iDstDev != iSrcDev)
-                    {
-                        // HIP relies on unified memory, so memcpy commands automatically do device-to-device transfers.
-                        // P2P access has to be enabled to avoid host transfer.
-                        // Checks if devices are connected via PCIe switch and enable P2P access then.
-                        alpaka::mem::view::hip::detail::enablePeerAccessIfPossible(iSrcDev, iDstDev);
-                    }
-
-                    // Initiate the memory copy.
-                    ALPAKA_HIP_RT_CHECK(
-                        hipMemcpy3DAsync(
-                            &hipMemCpy3DParms,
-                            queue.m_spQueueImpl->m_HipQueue));
-                }
-            };
-            //#############################################################################
-            //! The HIP blocking device queue 3D copy enqueue trait specialization.
-            template<
-                typename TExtent,
-                typename TViewSrc,
-                typename TViewDst>
-            struct Enqueue<
-                queue::QueueHipRtBlocking,
-                mem::view::hip::detail::TaskCopyHip<dim::DimInt<3u>, TViewDst, TViewSrc, TExtent>>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto enqueue(
-                    queue::QueueHipRtBlocking & queue,
-                    mem::view::hip::detail::TaskCopyHip<dim::DimInt<3u>, TViewDst, TViewSrc, TExtent> const & task)
-                -> void
-                {
-                    ALPAKA_DEBUG_FULL_LOG_SCOPE;
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                    task.printDebug();
-#endif
-                    // This is not only an optimization but also prevents a division by zero.
-                    if(task.m_extentWidthBytes == 0 || task.m_extentHeight == 0 || task.m_extentDepth == 0)
-                    {
-                        return;
-                    }
-                    auto const & iDstDev(task.m_iDstDevice);
-                    auto const & iSrcDev(task.m_iSrcDevice);
-
-                    // Create the struct describing the copy.
-                    hipMemcpy3DParms const hipMemCpy3DParms(
-                        mem::view::hip::detail::buildHipMemcpy3DParms(
-                            task));
-                    // Set the current device.
-                    ALPAKA_HIP_RT_CHECK(
-                        hipSetDevice(
-                            iDstDev));
-
-                    if(iDstDev != iSrcDev)
-                    {
-                        // HIP relies on unified memory, so memcpy commands automatically do device-to-device transfers.
-                        // P2P access has to be enabled to avoid host transfer.
-                        // Checks if devices are connected via PCIe switch and enable P2P access then.
-                        alpaka::mem::view::hip::detail::enablePeerAccessIfPossible(iSrcDev, iDstDev);
-                    }
-
-                    // Initiate the memory copy.
-                    ALPAKA_HIP_RT_CHECK(
-                        hipMemcpy3DAsync(
-                            &hipMemCpy3DParms,
-                            queue.m_spQueueImpl->m_HipQueue));
-
-                    ALPAKA_HIP_RT_CHECK( hipStreamSynchronize(
-                        queue.m_spQueueImpl->m_HipQueue));
-                }
-            };
-        }
-    }
-}
-
-#endif
-
diff --git a/thirdParty/cupla/alpaka/include/alpaka/mem/buf/hip/Set.hpp b/thirdParty/cupla/alpaka/include/alpaka/mem/buf/hip/Set.hpp
deleted file mode 100644
index e28f17c4ec..0000000000
--- a/thirdParty/cupla/alpaka/include/alpaka/mem/buf/hip/Set.hpp
+++ /dev/null
@@ -1,541 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz, Erik Zenker, Matthias Werner, René Widera
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_HIP
-    #error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
-#endif
-
-#include <alpaka/queue/QueueHipRtBlocking.hpp>
-#include <alpaka/queue/QueueHipRtNonBlocking.hpp>
-
-#include <alpaka/dev/Traits.hpp>
-#include <alpaka/dim/DimIntegralConst.hpp>
-#include <alpaka/extent/Traits.hpp>
-#include <alpaka/mem/view/Traits.hpp>
-#include <alpaka/queue/Traits.hpp>
-
-#include <alpaka/core/Assert.hpp>
-#include <alpaka/core/Hip.hpp>
-
-
-namespace alpaka
-{
-    namespace dev
-    {
-        class DevHipRt;
-    }
-}
-
-namespace alpaka
-{
-    //-----------------------------------------------------------------------------
-    // Trait specializations for Set.
-    //-----------------------------------------------------------------------------
-    namespace mem
-    {
-        namespace view
-        {
-            namespace hip
-            {
-                namespace detail
-                {
-                    //#############################################################################
-                    //! The HIP memory set trait.
-                    template<
-                        typename TDim,
-                        typename TView,
-                        typename TExtent>
-                    struct TaskSetHip
-                    {
-                        //-----------------------------------------------------------------------------
-                        TaskSetHip(
-                            TView & view,
-                            std::uint8_t const & byte,
-                            TExtent const & extent) :
-                                m_view(view),
-                                m_byte(byte),
-                                m_extent(extent),
-                                m_iDevice(dev::getDev(view).m_iDevice)
-                        {
-                            static_assert(
-                                !std::is_const<TView>::value,
-                                "The destination view can not be const!");
-                            static_assert(
-                                dim::Dim<TView>::value == dim::Dim<TExtent>::value,
-                                "The destination view and the extent are required to have the same dimensionality!");
-                        }
-
-                        TView & m_view;
-                        std::uint8_t const m_byte;
-                        TExtent const m_extent;
-                        std::int32_t const m_iDevice;
-                    };
-                }
-            }
-            namespace traits
-            {
-                //#############################################################################
-                //! The HIP device memory set trait specialization.
-                template<
-                    typename TDim>
-                struct CreateTaskSet<
-                    TDim,
-                    dev::DevHipRt>
-                {
-                    //-----------------------------------------------------------------------------
-                    //!
-                    //-----------------------------------------------------------------------------
-                    template<
-                        typename TExtent,
-                        typename TView>
-                    ALPAKA_FN_HOST static auto createTaskSet(
-                        TView & view,
-                        std::uint8_t const & byte,
-                        TExtent const & extent)
-                    -> mem::view::hip::detail::TaskSetHip<
-                        TDim,
-                        TView,
-                        TExtent>
-                    {
-                        return
-                            mem::view::hip::detail::TaskSetHip<
-                                TDim,
-                                TView,
-                                TExtent>(
-                                    view,
-                                    byte,
-                                    extent);
-                    }
-                };
-            }
-        }
-    }
-    namespace queue
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The HIP non-blocking device queue 1D set enqueue trait specialization.
-            template<
-                typename TView,
-                typename TExtent>
-            struct Enqueue<
-                queue::QueueHipRtNonBlocking,
-                mem::view::hip::detail::TaskSetHip<dim::DimInt<1u>, TView, TExtent>>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto enqueue(
-                    queue::QueueHipRtNonBlocking & queue,
-                    mem::view::hip::detail::TaskSetHip<dim::DimInt<1u>, TView, TExtent> const & task)
-                -> void
-                {
-                    ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                    static_assert(
-                        dim::Dim<TView>::value == 1u,
-                        "The destination buffer is required to be 1-dimensional for this specialization!");
-                    static_assert(
-                        dim::Dim<TView>::value == dim::Dim<TExtent>::value,
-                        "The destination buffer and the extent are required to have the same dimensionality!");
-
-                    using Idx = idx::Idx<TExtent>;
-
-                    auto & view(task.m_view);
-                    auto const & byte(task.m_byte);
-                    auto const & extent(task.m_extent);
-                    auto const & iDevice(task.m_iDevice);
-
-                    auto const extentWidth(extent::getWidth(extent));
-                    if(extentWidth == 0)
-                    {
-                        return;
-                    }
-                    auto const extentWidthBytes(extentWidth * static_cast<Idx>(sizeof(elem::Elem<TView>)));
-#if !defined(NDEBUG)
-                    auto const dstWidth(extent::getWidth(view));
-#endif
-                    auto const dstNativePtr(reinterpret_cast<void *>(mem::view::getPtrNative(view)));
-                    ALPAKA_ASSERT(extentWidth <= dstWidth);
-
-                    // Set the current device.
-                    ALPAKA_HIP_RT_CHECK(
-                        hipSetDevice(
-                            iDevice));
-                    // Initiate the memory set.
-                    ALPAKA_HIP_RT_CHECK(
-                        hipMemsetAsync(
-                            dstNativePtr,
-                            static_cast<int>(byte),
-                            static_cast<size_t>(extentWidthBytes),
-                            queue.m_spQueueImpl->m_HipQueue));
-                }
-            };
-            //#############################################################################
-            //! The HIP blocking device queue 1D set enqueue trait specialization.
-            template<
-                typename TView,
-                typename TExtent>
-            struct Enqueue<
-                queue::QueueHipRtBlocking,
-                mem::view::hip::detail::TaskSetHip<dim::DimInt<1u>, TView, TExtent>>
-            {
-                //-----------------------------------------------------------------------------
-                //
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto enqueue(
-                    queue::QueueHipRtBlocking & queue,
-                    mem::view::hip::detail::TaskSetHip<dim::DimInt<1u>, TView, TExtent> const & task)
-                -> void
-                {
-                    ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                    static_assert(
-                        dim::Dim<TView>::value == 1u,
-                        "The destination buffer is required to be 1-dimensional for this specialization!");
-                    static_assert(
-                        dim::Dim<TView>::value == dim::Dim<TExtent>::value,
-                        "The destination buffer and the extent are required to have the same dimensionality!");
-
-                    using Idx = idx::Idx<TExtent>;
-
-                    auto & view(task.m_view);
-                    auto const & byte(task.m_byte);
-                    auto const & extent(task.m_extent);
-                    auto const & iDevice(task.m_iDevice);
-
-                    auto const extentWidth(extent::getWidth(extent));
-                    if(extentWidth == 0)
-                    {
-                        return;
-                    }
-                    auto const extentWidthBytes(extentWidth * static_cast<Idx>(sizeof(elem::Elem<TView>)));
-#if !defined(NDEBUG)
-                    auto const dstWidth(extent::getWidth(view));
-#endif
-                    auto const dstNativePtr(reinterpret_cast<void *>(mem::view::getPtrNative(view)));
-                    ALPAKA_ASSERT(extentWidth <= dstWidth);
-
-                    // Set the current device.
-                    ALPAKA_HIP_RT_CHECK(
-                        hipSetDevice(
-                            iDevice));
-                    // Initiate the memory set.
-                    ALPAKA_HIP_RT_CHECK(
-                        hipMemsetAsync(
-                            dstNativePtr,
-                            static_cast<int>(byte),
-                            static_cast<size_t>(extentWidthBytes),
-                            queue.m_spQueueImpl->m_HipQueue));
-
-                    ALPAKA_HIP_RT_CHECK( hipStreamSynchronize(
-                        queue.m_spQueueImpl->m_HipQueue));
-                }
-            };
-            //#############################################################################
-            //! The HIP non-blocking device queue 2D set enqueue trait specialization.
-            template<
-                typename TView,
-                typename TExtent>
-            struct Enqueue<
-                queue::QueueHipRtNonBlocking,
-                mem::view::hip::detail::TaskSetHip<dim::DimInt<2u>, TView, TExtent>>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto enqueue(
-                    queue::QueueHipRtNonBlocking & queue,
-                    mem::view::hip::detail::TaskSetHip<dim::DimInt<2u>, TView, TExtent> const & task)
-                -> void
-                {
-                    ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                    static_assert(
-                        dim::Dim<TView>::value == 2u,
-                        "The destination buffer is required to be 2-dimensional for this specialization!");
-                    static_assert(
-                        dim::Dim<TView>::value == dim::Dim<TExtent>::value,
-                        "The destination buffer and the extent are required to have the same dimensionality!");
-
-                    using Idx = idx::Idx<TExtent>;
-
-                    auto & view(task.m_view);
-                    auto const & byte(task.m_byte);
-                    auto const & extent(task.m_extent);
-                    auto const & iDevice(task.m_iDevice);
-
-                    auto const extentWidth(extent::getWidth(extent));
-                    auto const extentHeight(extent::getHeight(extent));
-                    if(extentWidth == 0 || extentHeight == 0)
-                    {
-                        return;
-                    }
-                    auto const extentWidthBytes(extentWidth * static_cast<Idx>(sizeof(elem::Elem<TView>)));
-#if !defined(NDEBUG)
-                    auto const dstWidth(extent::getWidth(view));
-                    auto const dstHeight(extent::getHeight(view));
-#endif
-                    auto const dstPitchBytesX(mem::view::getPitchBytes<dim::Dim<TView>::value - 1u>(view));
-                    auto const dstNativePtr(reinterpret_cast<void *>(mem::view::getPtrNative(view)));
-                    ALPAKA_ASSERT(extentWidth <= dstWidth);
-                    ALPAKA_ASSERT(extentHeight <= dstHeight);
-
-                    // Set the current device.
-                    ALPAKA_HIP_RT_CHECK(
-                        hipSetDevice(
-                            iDevice));
-                    // Initiate the memory set.
-                    ALPAKA_HIP_RT_CHECK(
-                        hipMemset2DAsync(
-                            dstNativePtr,
-                            static_cast<size_t>(dstPitchBytesX),
-                            static_cast<int>(byte),
-                            static_cast<size_t>(extentWidthBytes),
-                            static_cast<size_t>(extentHeight),
-                            queue.m_spQueueImpl->m_HipQueue));
-                }
-            };
-            //#############################################################################
-            //! The HIP blocking device queue 2D set enqueue trait specialization.
-            //!
-            //! Issues a hipMemset2DAsync on the HIP stream of the queue and then waits for that
-            //! stream with hipStreamSynchronize before returning.
-            template<
-                typename TView,
-                typename TExtent>
-            struct Enqueue<
-                queue::QueueHipRtBlocking,
-                mem::view::hip::detail::TaskSetHip<dim::DimInt<2u>, TView, TExtent>>
-            {
-                //-----------------------------------------------------------------------------
-                //! \param queue The blocking HIP queue whose stream executes the memset.
-                //! \param task The set task providing the view, the byte value, the extent and the device index.
-                ALPAKA_FN_HOST static auto enqueue(
-                    queue::QueueHipRtBlocking & queue,
-                    mem::view::hip::detail::TaskSetHip<dim::DimInt<2u>, TView, TExtent> const & task)
-                -> void
-                {
-                    ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                    static_assert(
-                        dim::Dim<TView>::value == 2u,
-                        "The destination buffer is required to be 2-dimensional for this specialization!");
-                    static_assert(
-                        dim::Dim<TView>::value == dim::Dim<TExtent>::value,
-                        "The destination buffer and the extent are required to have the same dimensionality!");
-
-                    using Idx = idx::Idx<TExtent>;
-
-                    auto & view(task.m_view);
-                    auto const & byte(task.m_byte);
-                    auto const & extent(task.m_extent);
-                    auto const & iDevice(task.m_iDevice);
-
-                    auto const extentWidth(extent::getWidth(extent));
-                    auto const extentHeight(extent::getHeight(extent));
-                    // Nothing has to be set for an empty extent.
-                    if(extentWidth == 0 || extentHeight == 0)
-                    {
-                        return;
-                    }
-                    // hipMemset2DAsync takes the row width in bytes, not in elements.
-                    auto const extentWidthBytes(extentWidth * static_cast<Idx>(sizeof(elem::Elem<TView>)));
-                    // The destination extents are only read by the assertions below, so they are only
-                    // defined when NDEBUG is not set (presumably ALPAKA_ASSERT expands to nothing otherwise).
-#if !defined(NDEBUG)
-                    auto const dstWidth(extent::getWidth(view));
-                    auto const dstHeight(extent::getHeight(view));
-#endif
-                    // Dim<TView> is 2 here (asserted above), so this is getPitchBytes<1>.
-                    auto const dstPitchBytesX(mem::view::getPitchBytes<dim::Dim<TView>::value - 1u>(view));
-                    auto const dstNativePtr(reinterpret_cast<void *>(mem::view::getPtrNative(view)));
-                    // The region to set has to fit into the destination view.
-                    ALPAKA_ASSERT(extentWidth <= dstWidth);
-                    ALPAKA_ASSERT(extentHeight <= dstHeight);
-
-                    // Set the current device.
-                    ALPAKA_HIP_RT_CHECK(
-                        hipSetDevice(
-                            iDevice));
-                    // Initiate the memory set.
-                    ALPAKA_HIP_RT_CHECK(
-                        hipMemset2DAsync(
-                            dstNativePtr,
-                            static_cast<size_t>(dstPitchBytesX),
-                            static_cast<int>(byte),
-                            static_cast<size_t>(extentWidthBytes),
-                            static_cast<size_t>(extentHeight),
-                            queue.m_spQueueImpl->m_HipQueue));
-
-                    // Blocking queue: wait for the stream so the memset has finished on return.
-                    ALPAKA_HIP_RT_CHECK( hipStreamSynchronize(
-                        queue.m_spQueueImpl->m_HipQueue));
-                }
-            };
-            //#############################################################################
-            //! The HIP non-blocking device queue 3D set enqueue trait specialization.
-            //!
-            //! Issues a hipMemset3DAsync on the HIP stream of the queue and returns without
-            //! synchronizing with the stream.
-            template<
-                typename TView,
-                typename TExtent>
-            struct Enqueue<
-                queue::QueueHipRtNonBlocking,
-                mem::view::hip::detail::TaskSetHip<dim::DimInt<3u>, TView, TExtent>>
-            {
-                //-----------------------------------------------------------------------------
-                //! \param queue The non-blocking HIP queue whose stream executes the memset.
-                //! \param task The set task providing the view, the byte value, the extent and the device index.
-                ALPAKA_FN_HOST static auto enqueue(
-                    queue::QueueHipRtNonBlocking & queue,
-                    mem::view::hip::detail::TaskSetHip<dim::DimInt<3u>, TView, TExtent> const & task)
-                -> void
-                {
-                    ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-                    static_assert(
-                        dim::Dim<TView>::value == 3u,
-                        "The destination buffer is required to be 3-dimensional for this specialization!");
-                    static_assert(
-                        dim::Dim<TView>::value == dim::Dim<TExtent>::value,
-                        "The destination buffer and the extent are required to have the same dimensionality!");
-
-                    using Elem = alpaka::elem::Elem<TView>;
-                    using Idx = idx::Idx<TExtent>;
-
-                    auto & view(task.m_view);
-                    auto const & byte(task.m_byte);
-                    auto const & extent(task.m_extent);
-                    auto const & iDevice(task.m_iDevice);
-
-                    auto const extentWidth(extent::getWidth(extent));
-                    auto const extentHeight(extent::getHeight(extent));
-                    auto const extentDepth(extent::getDepth(extent));
-
-                    // This is not only an optimization but also prevents a division by zero.
-                    if(extentWidth == 0 || extentHeight == 0 || extentDepth == 0)
-                    {
-                        return;
-                    }
-
-                    // dstWidth is needed unconditionally: besides the assertion it provides the row width
-                    // of the pitched pointer below.
-                    auto const dstWidth(extent::getWidth(view));
-                    // dstHeight and dstDepth are only read by the assertions below.
-#if !defined(NDEBUG)
-                    auto const dstHeight(extent::getHeight(view));
-                    auto const dstDepth(extent::getDepth(view));
-#endif
-                    // Dim<TView> is 3 here (asserted above), so these are getPitchBytes<2> and getPitchBytes<1>.
-                    auto const dstPitchBytesX(mem::view::getPitchBytes<dim::Dim<TView>::value - 1u>(view));
-                    auto const dstPitchBytesY(mem::view::getPitchBytes<dim::Dim<TView>::value - (2u % dim::Dim<TView>::value)>(view));
-                    auto const dstNativePtr(reinterpret_cast<void *>(mem::view::getPtrNative(view)));
-                    // The region to set has to fit into the destination view.
-                    ALPAKA_ASSERT(extentWidth <= dstWidth);
-                    ALPAKA_ASSERT(extentHeight <= dstHeight);
-                    ALPAKA_ASSERT(extentDepth <= dstDepth);
-
-                    // Fill HIP parameter structures.
-                    // Pitched pointer: pointer, pitch in bytes, row width in bytes and the number of rows,
-                    // the latter computed as dstPitchBytesY / dstPitchBytesX.
-                    hipPitchedPtr const hipPitchedPtrVal(
-                        make_hipPitchedPtr(
-                            dstNativePtr,
-                            static_cast<size_t>(dstPitchBytesX),
-                            static_cast<size_t>(dstWidth * static_cast<Idx>(sizeof(Elem))),
-                            static_cast<size_t>(dstPitchBytesY/dstPitchBytesX)));
-
-                    // Extent: the width is given in bytes, height and depth are passed unchanged.
-                    hipExtent const hipExtentVal(
-                        make_hipExtent(
-                            static_cast<size_t>(extentWidth * static_cast<Idx>(sizeof(Elem))),
-                            static_cast<size_t>(extentHeight),
-                            static_cast<size_t>(extentDepth)));
-
-                    // Set the current device.
-                    ALPAKA_HIP_RT_CHECK(
-                        hipSetDevice(
-                            iDevice));
-                    // Initiate the memory set.
-                    ALPAKA_HIP_RT_CHECK(
-                        hipMemset3DAsync(
-                            hipPitchedPtrVal,
-                            static_cast<int>(byte),
-                            hipExtentVal,
-                            queue.m_spQueueImpl->m_HipQueue));
-                }
-            };
-            //#############################################################################
-            //! The HIP blocking device queue 3D set enqueue trait specialization.
-            //!
-            //! Issues a hipMemset3DAsync on the HIP stream of the queue and then waits for that
-            //! stream with hipStreamSynchronize before returning.
-            template<
-                typename TView,
-                typename TExtent>
-            struct Enqueue<
-                queue::QueueHipRtBlocking,
-                mem::view::hip::detail::TaskSetHip<dim::DimInt<3u>, TView, TExtent>>
-            {
-                //-----------------------------------------------------------------------------
-                //! \param queue The blocking HIP queue whose stream executes the memset.
-                //! \param task The set task providing the view, the byte value, the extent and the device index.
-                ALPAKA_FN_HOST static auto enqueue(
-                    queue::QueueHipRtBlocking & queue,
-                    mem::view::hip::detail::TaskSetHip<dim::DimInt<3u>, TView, TExtent> const & task)
-                -> void
-                {
-                    ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-                    static_assert(
-                        dim::Dim<TView>::value == 3u,
-                        "The destination buffer is required to be 3-dimensional for this specialization!");
-                    static_assert(
-                        dim::Dim<TView>::value == dim::Dim<TExtent>::value,
-                        "The destination buffer and the extent are required to have the same dimensionality!");
-
-                    using Elem = alpaka::elem::Elem<TView>;
-                    using Idx = idx::Idx<TExtent>;
-
-                    auto & view(task.m_view);
-                    auto const & byte(task.m_byte);
-                    auto const & extent(task.m_extent);
-                    auto const & iDevice(task.m_iDevice);
-
-                    auto const extentWidth(extent::getWidth(extent));
-                    auto const extentHeight(extent::getHeight(extent));
-                    auto const extentDepth(extent::getDepth(extent));
-
-                    // This is not only an optimization but also prevents a division by zero.
-                    if(extentWidth == 0 || extentHeight == 0 || extentDepth == 0)
-                    {
-                        return;
-                    }
-
-                    // dstWidth is needed unconditionally: besides the assertion it provides the row width
-                    // of the pitched pointer below.
-                    auto const dstWidth(extent::getWidth(view));
-                    // dstHeight and dstDepth are only read by the assertions below.
-#if !defined(NDEBUG)
-                    auto const dstHeight(extent::getHeight(view));
-                    auto const dstDepth(extent::getDepth(view));
-#endif
-                    // Dim<TView> is 3 here (asserted above), so these are getPitchBytes<2> and getPitchBytes<1>.
-                    auto const dstPitchBytesX(mem::view::getPitchBytes<dim::Dim<TView>::value - 1u>(view));
-                    auto const dstPitchBytesY(mem::view::getPitchBytes<dim::Dim<TView>::value - (2u % dim::Dim<TView>::value)>(view));
-                    auto const dstNativePtr(reinterpret_cast<void *>(mem::view::getPtrNative(view)));
-                    // The region to set has to fit into the destination view.
-                    ALPAKA_ASSERT(extentWidth <= dstWidth);
-                    ALPAKA_ASSERT(extentHeight <= dstHeight);
-                    ALPAKA_ASSERT(extentDepth <= dstDepth);
-
-                    // Fill HIP parameter structures.
-                    // Pitched pointer: pointer, pitch in bytes, row width in bytes and the number of rows,
-                    // the latter computed as dstPitchBytesY / dstPitchBytesX.
-                    hipPitchedPtr const hipPitchedPtrVal(
-                        make_hipPitchedPtr(
-                            dstNativePtr,
-                            static_cast<size_t>(dstPitchBytesX),
-                            static_cast<size_t>(dstWidth * static_cast<Idx>(sizeof(Elem))),
-                            static_cast<size_t>(dstPitchBytesY/dstPitchBytesX)));
-
-                    // Extent: the width is given in bytes, height and depth are passed unchanged.
-                    hipExtent const hipExtentVal(
-                        make_hipExtent(
-                            static_cast<size_t>(extentWidth * static_cast<Idx>(sizeof(Elem))),
-                            static_cast<size_t>(extentHeight),
-                            static_cast<size_t>(extentDepth)));
-
-                    // Set the current device.
-                    ALPAKA_HIP_RT_CHECK(
-                        hipSetDevice(
-                            iDevice));
-                    // Initiate the memory set.
-                    ALPAKA_HIP_RT_CHECK(
-                        hipMemset3DAsync(
-                            hipPitchedPtrVal,
-                            static_cast<int>(byte),
-                            hipExtentVal,
-                            queue.m_spQueueImpl->m_HipQueue));
-
-                    // Blocking queue: wait for the stream so the memset has finished on return.
-                    ALPAKA_HIP_RT_CHECK( hipStreamSynchronize(
-                        queue.m_spQueueImpl->m_HipQueue));
-                }
-            };
-
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/mem/buf/oacc/Copy.hpp b/thirdParty/cupla/alpaka/include/alpaka/mem/buf/oacc/Copy.hpp
new file mode 100644
index 0000000000..f2cf11b258
--- /dev/null
+++ b/thirdParty/cupla/alpaka/include/alpaka/mem/buf/oacc/Copy.hpp
@@ -0,0 +1,314 @@
+/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Erik Zenker, Matthias Werner
+ *
+ * This file is part of Alpaka.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#pragma once
+
+#ifdef ALPAKA_ACC_ANY_BT_OACC_ENABLED
+
+#    if _OPENACC < 201306
+#        error If ALPAKA_ACC_ANY_BT_OACC_ENABLED is set, the compiler has to support OpenACC 2.0 or higher!
+#    endif
+
+#    include <alpaka/core/Assert.hpp>
+#    include <alpaka/dev/DevCpu.hpp>
+#    include <alpaka/dev/DevOacc.hpp>
+#    include <alpaka/dim/DimIntegralConst.hpp>
+#    include <alpaka/extent/Traits.hpp>
+#    include <alpaka/mem/view/Traits.hpp>
+#    include <alpaka/meta/Integral.hpp>
+#    include <alpaka/meta/NdLoop.hpp>
+#    include <alpaka/queue/QueueOaccBlocking.hpp>
+#    include <alpaka/vec/Vec.hpp>
+
+#    include <set>
+#    include <tuple>
+#    include <utility>
+
+#    if _OPENACC < 201510
+#        include <vector>
+#    endif
+
+namespace alpaka
+{
+    namespace oacc
+    {
+        namespace detail
+        {
+            //! Creates a copy task of type TTask<TDim, TViewDst, TViewSrc, TExtent, TCopyPred>.
+            //!
+            //! The helper exists so that TCopyPred can be deduced from the copyPred argument while the
+            //! remaining template arguments are given explicitly by the caller.
+            //!
+            //! \param viewDst The destination view.
+            //! \param viewSrc The source view.
+            //! \param extent The extent to copy.
+            //! \param dev The OpenACC device handed to the task.
+            //! \param copyPred The callable performing the actual copy.
+            //! \return The task constructed from the given arguments.
+            template<
+                template<typename, typename, typename, typename, typename>
+                class TTask,
+                typename TDim,
+                typename TViewDst,
+                typename TViewSrc,
+                typename TExtent,
+                typename TCopyPred>
+            auto makeTaskCopyOacc(
+                TViewDst& viewDst,
+                TViewSrc const& viewSrc,
+                TExtent const& extent,
+                DevOacc const& dev,
+                TCopyPred copyPred)
+            {
+                using Task = TTask<TDim, TViewDst, TViewSrc, TExtent, TCopyPred>;
+                return Task(viewDst, viewSrc, extent, dev, copyPred);
+            }
+
+            //#############################################################################
+            //! The OpenACC device memory copy task base.
+            //!
+            //! Stores the device, the extent, the pitches and native pointers of both views and the
+            //! copy predicate at construction time. The derived tasks use these members to perform
+            //! the copy by calling the predicate as copyPred(dst, src, bytes).
+            //!
+            //! \tparam TDim The dimensionality of the copy.
+            //! \tparam TViewDst The destination view type (must not be const).
+            //! \tparam TViewSrc The source view type.
+            //! \tparam TExtent The extent type.
+            //! \tparam TCopyPred The type of the callable performing the copy of one byte range.
+            template<typename TDim, typename TViewDst, typename TViewSrc, typename TExtent, typename TCopyPred>
+            struct TaskCopyOaccBase
+            {
+                using ExtentSize = alpaka::Idx<TExtent>;
+                using DstSize = alpaka::Idx<TViewDst>;
+                using SrcSize = alpaka::Idx<TViewSrc>;
+                using Elem = alpaka::Elem<TViewSrc>;
+
+                static_assert(!std::is_const<TViewDst>::value, "The destination view can not be const!");
+
+                static_assert(
+                    Dim<TViewSrc>::value == TDim::value,
+                    "The source view is required to have dimensionality TDim!");
+                static_assert(
+                    Dim<TViewDst>::value == Dim<TViewSrc>::value,
+                    "The source and the destination view are required to have the same dimensionality!");
+                static_assert(
+                    Dim<TViewDst>::value == Dim<TExtent>::value,
+                    "The views and the extent are required to have the same dimensionality!");
+                // TODO: Maybe check for Idx of TViewDst and TViewSrc to have greater or equal range than TExtent.
+                static_assert(
+                    std::is_same<alpaka::Elem<TViewDst>, typename std::remove_const<alpaka::Elem<TViewSrc>>::type>::
+                        value,
+                    "The source and the destination view are required to have the same element type!");
+
+                using Idx = alpaka::Idx<TExtent>;
+
+                //-----------------------------------------------------------------------------
+                //! Captures the copy parameters from the views and the extent.
+                //!
+                //! \param viewDst The destination view; its pitches, extent and native pointer are read.
+                //! \param viewSrc The source view; its pitches, extent and native pointer are read.
+                //! \param extent The extent to copy.
+                //! \param dev The OpenACC device stored in the task.
+                //! \param copyPred The callable performing the copy of one byte range.
+                ALPAKA_FN_HOST TaskCopyOaccBase(
+                    TViewDst& viewDst,
+                    TViewSrc const& viewSrc,
+                    TExtent const& extent,
+                    DevOacc const& dev,
+                    TCopyPred copyPred)
+                    : m_dev(dev)
+                    , m_extent(extent::getExtentVec(extent))
+                    , m_extentWidthBytes(m_extent[TDim::value - 1u] * static_cast<ExtentSize>(sizeof(Elem)))
+                    , m_dstPitchBytes(getPitchBytesVec(viewDst))
+                    , m_srcPitchBytes(getPitchBytesVec(viewSrc))
+                    ,
+#    if(!defined(NDEBUG)) || (ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL)
+                    m_dstExtent(extent::getExtentVec(viewDst))
+                    , m_srcExtent(extent::getExtentVec(viewSrc))
+                    ,
+#    endif
+                    m_dstMemNative(reinterpret_cast<std::uint8_t*>(getPtrNative(viewDst)))
+                    , m_srcMemNative(reinterpret_cast<std::uint8_t const*>(getPtrNative(viewSrc)))
+                    , m_copyPred(copyPred)
+                {
+                    // The requested extent has to fit into both views in EVERY dimension, so the
+                    // element-wise comparison is folded with a logical AND. A logical OR would already
+                    // be satisfied when a single dimension fits and would let oversized copies pass.
+                    ALPAKA_ASSERT((castVec<DstSize>(m_extent) <= m_dstExtent).foldrAll(std::logical_and<bool>()));
+                    ALPAKA_ASSERT((castVec<SrcSize>(m_extent) <= m_srcExtent).foldrAll(std::logical_and<bool>()));
+                    // One row of the extent must not be wider than the innermost pitch of either view.
+                    ALPAKA_ASSERT(static_cast<DstSize>(m_extentWidthBytes) <= m_dstPitchBytes[TDim::value - 1u]);
+                    ALPAKA_ASSERT(static_cast<SrcSize>(m_extentWidthBytes) <= m_srcPitchBytes[TDim::value - 1u]);
+                }
+
+#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
+                //-----------------------------------------------------------------------------
+                //! Prints the device index, the extents and the native pointers to std::cout.
+                ALPAKA_FN_HOST auto printDebug() const -> void
+                {
+                    std::cout << __func__ << " dev: " << m_dev.iDevice() << " ew: " << m_extent
+                              << " dw: " << m_dstExtent << " dptr: " << static_cast<const void*>(m_dstMemNative)
+                              << " sw: " << m_srcExtent << " sptr: " << static_cast<const void*>(m_srcMemNative)
+                              << std::endl;
+                }
+#    endif
+                // The device the task was created for.
+                const DevOacc m_dev;
+                // The extent to copy, in elements.
+                Vec<TDim, ExtentSize> m_extent;
+                // The innermost extent multiplied by sizeof(Elem).
+                ExtentSize const m_extentWidthBytes;
+                // The pitches of the destination and the source view, in bytes.
+                Vec<TDim, DstSize> m_dstPitchBytes;
+                Vec<TDim, SrcSize> m_srcPitchBytes;
+                // The view extents are only stored when they are read, i.e. by the assertions in the
+                // constructor or by printDebug().
+#    if(!defined(NDEBUG)) || (ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL)
+                Vec<TDim, DstSize> const m_dstExtent;
+                Vec<TDim, SrcSize> const m_srcExtent;
+#    endif
+                // The native pointers of both views as byte pointers, so offsets can be added in bytes.
+                std::uint8_t* const m_dstMemNative;
+                std::uint8_t const* const m_srcMemNative;
+                // The callable invoked by the derived tasks to copy one byte range.
+                TCopyPred m_copyPred;
+            };
+
+            //#############################################################################
+            //! The OpenACC Nd device memory copy task.
+            //!
+            //! Walks over all index combinations of the extent without its innermost dimension and
+            //! calls the copy predicate once per index with m_extentWidthBytes bytes.
+            template<typename TDim, typename TViewDst, typename TViewSrc, typename TExtent, typename TCopyPred>
+            struct TaskCopyOacc : public TaskCopyOaccBase<TDim, TViewDst, TViewSrc, TExtent, TCopyPred>
+            {
+                using DimMin1 = DimInt<TDim::value - 1u>;
+                using typename TaskCopyOaccBase<TDim, TViewDst, TViewSrc, TExtent, TCopyPred>::ExtentSize;
+                using typename TaskCopyOaccBase<TDim, TViewDst, TViewSrc, TExtent, TCopyPred>::DstSize;
+                using typename TaskCopyOaccBase<TDim, TViewDst, TViewSrc, TExtent, TCopyPred>::SrcSize;
+
+                //-----------------------------------------------------------------------------
+                using TaskCopyOaccBase<TDim, TViewDst, TViewSrc, TExtent, TCopyPred>::TaskCopyOaccBase;
+
+                //-----------------------------------------------------------------------------
+                //! Executes the copy. Does nothing when the extent contains no elements.
+                ALPAKA_FN_HOST auto operator()() const -> void
+                {
+                    ALPAKA_DEBUG_FULL_LOG_SCOPE;
+
+#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
+                    this->printDebug();
+#    endif
+                    // An extent with a zero in any dimension leaves nothing to copy.
+                    if(static_cast<std::size_t>(this->m_extent.prod()) == 0u)
+                    {
+                        return;
+                    }
+
+                    // The loop runs over every dimension except the innermost one.
+                    Vec<DimMin1, ExtentSize> const rowExtent(subVecBegin<DimMin1>(this->m_extent));
+                    // [z, y, x] -> [y, x] because the z pitch (the full size of the buffer) is not required.
+                    Vec<DimMin1, DstSize> const dstRowPitches(subVecEnd<DimMin1>(this->m_dstPitchBytes));
+                    Vec<DimMin1, SrcSize> const srcRowPitches(subVecEnd<DimMin1>(this->m_srcPitchBytes));
+
+                    // Copies the byte range addressed by one index of the loop.
+                    auto const copyRow = [&](Vec<DimMin1, ExtentSize> const& rowIdx)
+                    {
+                        auto const dstOffsetBytes = (castVec<DstSize>(rowIdx) * dstRowPitches).sum();
+                        auto const srcOffsetBytes = (castVec<SrcSize>(rowIdx) * srcRowPitches).sum();
+
+                        void* const dstRow = reinterpret_cast<void*>(this->m_dstMemNative + dstOffsetBytes);
+                        // The predicate takes a non-const source pointer, hence the const_cast.
+                        void* const srcRow = const_cast<void*>(
+                            reinterpret_cast<const void*>(this->m_srcMemNative + srcOffsetBytes));
+
+                        this->m_copyPred(dstRow, srcRow, static_cast<std::size_t>(this->m_extentWidthBytes));
+                    };
+
+                    this->m_dev.makeCurrent();
+                    meta::ndLoopIncIdx(rowExtent, copyRow);
+                }
+            };
+
+            //#############################################################################
+            //! The 1d OpenACC memory copy task.
+            //!
+            //! The whole extent is handled by a single call of the copy predicate covering
+            //! m_extentWidthBytes bytes starting at the native pointers of the views.
+            template<typename TViewDst, typename TViewSrc, typename TExtent, typename TCopyPred>
+            struct TaskCopyOacc<DimInt<1>, TViewDst, TViewSrc, TExtent, TCopyPred>
+                : public TaskCopyOaccBase<DimInt<1>, TViewDst, TViewSrc, TExtent, TCopyPred>
+            {
+                //-----------------------------------------------------------------------------
+                using TaskCopyOaccBase<DimInt<1u>, TViewDst, TViewSrc, TExtent, TCopyPred>::TaskCopyOaccBase;
+
+                //-----------------------------------------------------------------------------
+                //! Executes the copy. Does nothing when the extent contains no elements.
+                ALPAKA_FN_HOST auto operator()() const -> void
+                {
+                    ALPAKA_DEBUG_FULL_LOG_SCOPE;
+
+#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
+                    this->printDebug();
+#    endif
+                    // An empty extent leaves nothing to copy.
+                    if(this->m_extent.prod() == 0)
+                    {
+                        return;
+                    }
+
+                    void* const dstBegin = reinterpret_cast<void*>(this->m_dstMemNative);
+                    // The predicate takes a non-const source pointer, hence the const_cast.
+                    void* const srcBegin = const_cast<void*>(reinterpret_cast<void const*>(this->m_srcMemNative));
+
+                    this->m_dev.makeCurrent();
+                    this->m_copyPred(dstBegin, srcBegin, static_cast<std::size_t>(this->m_extentWidthBytes));
+                }
+            };
+        } // namespace detail
+    } // namespace oacc
+
+    //-----------------------------------------------------------------------------
+    // Trait specializations for CreateTaskMemcpy.
+    namespace traits
+    {
+        //#############################################################################
+        //! The CPU to OpenACC memory copy trait specialization.
+        //!
+        //! The created task is bound to the device of the destination view and copies with
+        //! acc_memcpy_to_device.
+        template<typename TDim>
+        struct CreateTaskMemcpy<TDim, DevOacc, DevCpu>
+        {
+            //-----------------------------------------------------------------------------
+            //! \param viewDst The destination view on the OpenACC device.
+            //! \param viewSrc The source view on the CPU.
+            //! \param extent The extent to copy.
+            //! \return The copy task.
+            template<typename TExtent, typename TViewSrc, typename TViewDst>
+            ALPAKA_FN_HOST static auto createTaskMemcpy(
+                TViewDst& viewDst,
+                TViewSrc const& viewSrc,
+                TExtent const& extent)
+            {
+                ALPAKA_DEBUG_FULL_LOG_SCOPE;
+
+                namespace oaccDetail = alpaka::oacc::detail;
+
+                return oaccDetail::makeTaskCopyOacc<oaccDetail::TaskCopyOacc, TDim, TViewDst, TViewSrc, TExtent>(
+                    viewDst,
+                    viewSrc,
+                    extent,
+                    getDev(viewDst),
+                    acc_memcpy_to_device);
+            }
+        };
+
+        //#############################################################################
+        //! The OpenACC to CPU memory copy trait specialization.
+        //!
+        //! The created task is bound to the device of the source view and copies with
+        //! acc_memcpy_from_device.
+        template<typename TDim>
+        struct CreateTaskMemcpy<TDim, DevCpu, DevOacc>
+        {
+            //-----------------------------------------------------------------------------
+            //! \param viewDst The destination view on the CPU.
+            //! \param viewSrc The source view on the OpenACC device.
+            //! \param extent The extent to copy.
+            //! \return The copy task.
+            template<typename TExtent, typename TViewSrc, typename TViewDst>
+            ALPAKA_FN_HOST static auto createTaskMemcpy(
+                TViewDst& viewDst,
+                TViewSrc const& viewSrc,
+                TExtent const& extent)
+            {
+                ALPAKA_DEBUG_FULL_LOG_SCOPE;
+
+                namespace oaccDetail = alpaka::oacc::detail;
+
+                return oaccDetail::makeTaskCopyOacc<oaccDetail::TaskCopyOacc, TDim, TViewDst, TViewSrc, TExtent>(
+                    viewDst,
+                    viewSrc,
+                    extent,
+                    getDev(viewSrc),
+                    acc_memcpy_from_device);
+            }
+        };
+
+        //#############################################################################
+        //! The OpenACC to OpenACC memory copy trait specialization.
+        //!
+        //! Both views are asserted to live on the same device. The created task is bound to the
+        //! device of the destination view. It copies with acc_memcpy_device where that is selected
+        //! by the preprocessor condition below, otherwise through a temporary host buffer.
+        template<typename TDim>
+        struct CreateTaskMemcpy<TDim, DevOacc, DevOacc>
+        {
+            //-----------------------------------------------------------------------------
+            //! \param viewDst The destination view on the OpenACC device.
+            //! \param viewSrc The source view on the OpenACC device.
+            //! \param extent The extent to copy.
+            //! \return The copy task.
+            template<typename TExtent, typename TViewSrc, typename TViewDst>
+            ALPAKA_FN_HOST static auto createTaskMemcpy(
+                TViewDst& viewDst,
+                TViewSrc const& viewSrc,
+                TExtent const& extent)
+            {
+                ALPAKA_DEBUG_FULL_LOG_SCOPE;
+
+                // Copies between different devices are not supported by this specialization.
+                ALPAKA_ASSERT(
+                    getDev(viewDst).m_spDevOaccImpl->iDevice() == getDev(viewSrc).m_spDevOaccImpl->iDevice());
+
+                return alpaka::oacc::detail::
+                    makeTaskCopyOacc<alpaka::oacc::detail::TaskCopyOacc, TDim, TViewDst, TViewSrc, TExtent>(
+                        viewDst,
+                        viewSrc,
+                        extent,
+                        getDev(viewDst),
+#    if _OPENACC >= 201510 && (!defined __GNUC__)
+                        acc_memcpy_device
+#    else
+                        // acc_memcpy_device is only available since OpenACC 2.5, but we want the tests
+                        // to compile anyway, so the copy is staged through host memory.
+                        // NOTE(review): this branch is also taken for GNU compilers with OpenACC >= 2.5,
+                        // while <vector> is only included at the top of this file for
+                        // _OPENACC < 201510 - confirm that <vector> is available in that configuration.
+                        [](void* dst, void* src, std::size_t size) {
+                            // The staging buffer holds exactly `size` bytes. Sizing it in units of
+                            // std::size_t with a truncating division would make it too small whenever
+                            // `size` is not a multiple of sizeof(std::size_t), causing an overrun.
+                            std::vector<unsigned char> staging(size);
+                            acc_memcpy_from_device(static_cast<void*>(staging.data()), src, size);
+                            acc_memcpy_to_device(dst, static_cast<void*>(staging.data()), size);
+                        }
+#    endif
+                    );
+            }
+        };
+    } // namespace traits
+} // namespace alpaka
+
+#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/mem/buf/oacc/Set.hpp b/thirdParty/cupla/alpaka/include/alpaka/mem/buf/oacc/Set.hpp
new file mode 100644
index 0000000000..f08f1f1543
--- /dev/null
+++ b/thirdParty/cupla/alpaka/include/alpaka/mem/buf/oacc/Set.hpp
@@ -0,0 +1,95 @@
+/* Copyright 2019 Benjamin Worpitz, Erik Zenker, Matthias Werner
+ *
+ * This file is part of Alpaka.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#pragma once
+
+#ifdef ALPAKA_ACC_ANY_BT_OACC_ENABLED
+
+#    if _OPENACC < 201306
+#        error If ALPAKA_ACC_ANY_BT_OACC_ENABLED is set, the compiler has to support OpenACC 2.0 or higher!
+#    endif
+
+#    include <alpaka/core/Assert.hpp>
+#    include <alpaka/core/Utility.hpp>
+#    include <alpaka/dev/DevOacc.hpp>
+#    include <alpaka/dim/DimIntegralConst.hpp>
+#    include <alpaka/extent/Traits.hpp>
+#    include <alpaka/idx/Accessors.hpp>
+#    include <alpaka/kernel/TaskKernelOacc.hpp>
+#    include <alpaka/mem/buf/SetKernel.hpp>
+#    include <alpaka/mem/view/Traits.hpp>
+#    include <alpaka/meta/Integral.hpp>
+#    include <alpaka/queue/QueueOaccBlocking.hpp>
+#    include <alpaka/vec/Vec.hpp>
+#    include <alpaka/workdiv/WorkDivHelpers.hpp>
+
+#    include <cstring>
+
+// Forward declaration of the OpenACC device class named by the trait specialization below.
+// NOTE(review): <alpaka/dev/DevOacc.hpp> is already included above, so this declaration
+// looks redundant - confirm before removing.
+namespace alpaka
+{
+    class DevOacc;
+}
+
+namespace alpaka
+{
+    namespace traits
+    {
+        //#############################################################################
+        //! The OpenACC device memory set trait specialization.
+        //!
+        //! The set is performed by a kernel task running MemSetKernel over the extent expressed in bytes.
+        template<typename TDim>
+        struct CreateTaskMemset<TDim, DevOacc>
+        {
+            //-----------------------------------------------------------------------------
+            //! \param view The view whose memory is set.
+            //! \param byte The byte value handed to the kernel.
+            //! \param extent The extent to set, in elements.
+            //! \return The kernel task; it is created with an all-zero work division when pitch[0] <= 0.
+            template<typename TExtent, typename TView>
+            ALPAKA_FN_HOST static auto createTaskMemset(TView& view, std::uint8_t const& byte, TExtent const& extent)
+            {
+                using Idx = typename traits::IdxType<TExtent>::type;
+                constexpr auto lastDim = TDim::value - 1;
+
+                auto pitchBytes = getPitchBytesVec(view);
+
+                // The kernel works on bytes, so the innermost extent is scaled by the element size.
+                auto extentBytes = extent::getExtentVec(extent);
+                extentBytes[lastDim] *= static_cast<Idx>(sizeof(Elem<TView>));
+
+                if(pitchBytes[0] <= 0)
+                {
+                    // NOP if size is zero
+                    return createTaskKernel<AccOacc<TDim, Idx>>(
+                        WorkDivMembers<TDim, Idx>(
+                            Vec<TDim, Idx>::zeros(),
+                            Vec<TDim, Idx>::zeros(),
+                            Vec<TDim, Idx>::zeros()),
+                        MemSetKernel(),
+                        byte,
+                        reinterpret_cast<std::uint8_t*>(alpaka::getPtrNative(view)),
+                        extentBytes,
+                        pitchBytes);
+                }
+
+#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
+                std::cout << "Set TDim=" << TDim::value << " pitch=" << pitchBytes << " byteExtent=" << extentBytes
+                          << std::endl;
+#    endif
+                // Each thread handles 4 entries along the innermost dimension and 1 along all others.
+                auto perThread = Vec<TDim, Idx>::all(static_cast<Idx>(1u));
+                perThread[lastDim] = 4;
+
+                // Let alpaka calculate good block and grid sizes given our full problem extent
+                WorkDivMembers<TDim, Idx> const workDiv(getValidWorkDiv<AccOacc<TDim, Idx>>(
+                    getDev(view),
+                    extentBytes,
+                    perThread,
+                    false,
+                    alpaka::GridBlockExtentSubDivRestrictions::Unrestricted));
+
+                return createTaskKernel<AccOacc<TDim, Idx>>(
+                    workDiv,
+                    MemSetKernel(),
+                    byte,
+                    reinterpret_cast<std::uint8_t*>(alpaka::getPtrNative(view)),
+                    extentBytes,
+                    pitchBytes);
+            }
+        };
+    } // namespace traits
+} // namespace alpaka
+
+#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/mem/buf/omp5/Copy.hpp b/thirdParty/cupla/alpaka/include/alpaka/mem/buf/omp5/Copy.hpp
new file mode 100644
index 0000000000..f3e5131ccd
--- /dev/null
+++ b/thirdParty/cupla/alpaka/include/alpaka/mem/buf/omp5/Copy.hpp
@@ -0,0 +1,346 @@
+/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Erik Zenker, Matthias Werner
+ *
+ * This file is part of Alpaka.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#pragma once
+
+#ifdef ALPAKA_ACC_ANY_BT_OMP5_ENABLED
+
+#    if _OPENMP < 201307
+#        error If ALPAKA_ACC_ANY_BT_OMP5_ENABLED is set, the compiler has to support OpenMP 4.0 or higher!
+#    endif
+
+#    include <alpaka/core/Assert.hpp>
+#    include <alpaka/core/Omp5.hpp>
+#    include <alpaka/dev/DevCpu.hpp>
+#    include <alpaka/dev/DevOmp5.hpp>
+#    include <alpaka/dim/DimIntegralConst.hpp>
+#    include <alpaka/extent/Traits.hpp>
+#    include <alpaka/mem/view/Traits.hpp>
+#    include <alpaka/queue/QueueOmp5Blocking.hpp>
+#    include <alpaka/vec/Vec.hpp>
+
+#    include <set>
+#    include <tuple>
+#    include <utility>
+
+namespace alpaka
+{
+    namespace detail
+    {
+        //#############################################################################
+        //! The Omp5 N-dimensional memory copy task.
+        //!
+        //! The constructor captures the native pointers, the pitches and the extent to copy.
+        //! operator() performs the copy with a single call to omp_target_memcpy_rect.
+        template<typename TDim, typename TViewDst, typename TViewSrc, typename TExtent>
+        struct TaskCopyOmp5
+        {
+            static_assert(!std::is_const<TViewDst>::value, "The destination view can not be const!");
+
+            static_assert(
+                Dim<TViewSrc>::value == TDim::value,
+                "The source view is required to have dimensionality TDim!");
+            static_assert(
+                Dim<TViewDst>::value == Dim<TViewSrc>::value,
+                "The source and the destination view are required to have the same dimensionality!");
+            static_assert(
+                Dim<TViewDst>::value == Dim<TExtent>::value,
+                "The views and the extent are required to have the same dimensionality!");
+            // TODO: Maybe check for Idx of TViewDst and TViewSrc to have greater or equal range than TExtent.
+            static_assert(
+                std::is_same<Elem<TViewDst>, typename std::remove_const<Elem<TViewSrc>>::type>::value,
+                "The source and the destination view are required to have the same element type!");
+
+            using Idx = alpaka::Idx<TExtent>;
+
+            //-----------------------------------------------------------------------------
+            //! \param viewDst The destination view.
+            //! \param viewSrc The source view.
+            //! \param extent The extent (in elements) to copy.
+            //! \param iDstDevice The OpenMP device number passed as destination device to the copy call.
+            //! \param iSrcDevice The OpenMP device number passed as source device to the copy call.
+            ALPAKA_FN_HOST TaskCopyOmp5(
+                TViewDst& viewDst,
+                TViewSrc const& viewSrc,
+                TExtent const& extent,
+                int const& iDstDevice,
+                int const& iSrcDevice)
+                : m_iDstDevice(iDstDevice)
+                , m_iSrcDevice(iSrcDevice)
+                , m_extent(castVec<size_t>(extent::getExtentVec(extent)))
+                , m_dstPitchBytes(castVec<size_t>(getPitchBytesVec(viewDst)))
+                , m_srcPitchBytes(castVec<size_t>(getPitchBytesVec(viewSrc)))
+                , m_dstMemNative(reinterpret_cast<void*>(getPtrNative(viewDst)))
+                , m_srcMemNative(reinterpret_cast<void const*>(getPtrNative(viewSrc)))
+            {
+#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
+                // The extent to copy has to fit into both views in every dimension.
+                const auto dstExtent(castVec<size_t>(extent::getExtentVec(viewDst)));
+                const auto srcExtent(castVec<size_t>(extent::getExtentVec(viewSrc)));
+                for(auto i = static_cast<decltype(TDim::value)>(0u); i < TDim::value; ++i)
+                {
+                    ALPAKA_ASSERT(m_extent[i] <= dstExtent[i]);
+                    ALPAKA_ASSERT(m_extent[i] <= srcExtent[i]);
+                }
+                std::cout << "TaskCopyOmp5<" << TDim::value << ",...>::ctor\tsrcExtent=" << srcExtent
+                          << ", dstExtent=" << dstExtent << std::endl;
+#    endif
+            }
+
+#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
+            //-----------------------------------------------------------------------------
+            //! Prints the devices, the extent and the native pointers of this task.
+            ALPAKA_FN_HOST auto printDebug() const -> void
+            {
+                std::cout << __func__ << " ddev: " << m_iDstDevice << " ew: " << m_extent
+                          << " dptr: " << m_dstMemNative << " sdev: " << m_iSrcDevice << " sptr: " << m_srcMemNative
+                          << std::endl;
+            }
+#    endif
+            int m_iDstDevice;
+            int m_iSrcDevice;
+            Vec<TDim, size_t> m_extent; //!< Extent to copy, in elements.
+            Vec<TDim, size_t> m_dstPitchBytes;
+            Vec<TDim, size_t> m_srcPitchBytes;
+            void* m_dstMemNative;
+            void const* m_srcMemNative;
+
+            //-----------------------------------------------------------------------------
+            //! Executes the copy. Does nothing if the extent to copy is empty.
+            ALPAKA_FN_HOST auto operator()() const -> void
+            {
+                ALPAKA_DEBUG_FULL_LOG_SCOPE;
+
+#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
+                printDebug();
+#    endif
+                constexpr auto lastDim = TDim::value - 1;
+
+                if(m_extent.prod() > 0)
+                {
+                    // offsets == 0 by ptr shift (?)
+                    auto dstOffset(Vec<TDim, size_t>::zeros());
+                    auto srcOffset(Vec<TDim, size_t>::zeros());
+
+                    auto dstExtentFull(Vec<TDim, size_t>::zeros());
+                    auto srcExtentFull(Vec<TDim, size_t>::zeros());
+
+                    // omp_target_memcpy_rect measures the volume and the full array dimensions in units of the
+                    // element size it is given. The innermost pitches are divided by that unit below, so they are
+                    // the values that have to be divisible. Otherwise the copy is expressed in bytes instead.
+                    constexpr size_t elemBytes = sizeof(Elem<TViewDst>);
+                    const size_t elementSize
+                        = (m_dstPitchBytes[lastDim] % elemBytes || m_srcPitchBytes[lastDim] % elemBytes) ? 1
+                                                                                                         : elemBytes;
+
+                    dstExtentFull[lastDim] = m_dstPitchBytes[lastDim] / elementSize;
+                    srcExtentFull[lastDim] = m_srcPitchBytes[lastDim] / elementSize;
+                    for(int i = static_cast<int>(lastDim) - 1; i >= 0; --i)
+                    {
+                        dstExtentFull[i] = m_dstPitchBytes[i] / m_dstPitchBytes[i + 1];
+                        srcExtentFull[i] = m_srcPitchBytes[i] / m_srcPitchBytes[i + 1];
+                    }
+
+                    // The volume has to use the same unit as the full dimensions: scale the innermost extent from
+                    // elements to multiples of elementSize (factor 1 in the regular case).
+                    auto volume(m_extent);
+                    volume[lastDim] *= elemBytes / elementSize;
+
+                    ALPAKA_OMP5_CHECK(omp_target_memcpy_rect(
+                        m_dstMemNative,
+                        const_cast<void*>(m_srcMemNative),
+                        elementSize,
+                        TDim::value,
+                        reinterpret_cast<size_t const*>(&volume),
+                        reinterpret_cast<size_t const*>(&dstOffset),
+                        reinterpret_cast<size_t const*>(&srcOffset),
+                        reinterpret_cast<size_t const*>(&dstExtentFull),
+                        reinterpret_cast<size_t const*>(&srcExtentFull),
+                        m_iDstDevice,
+                        m_iSrcDevice));
+                }
+            }
+        };
+
+        //#############################################################################
+        //! The 1D Omp5 memory copy task.
+        //!
+        //! The constructor captures the native pointers and the number of bytes to copy.
+        //! operator() performs the copy with a single call to omp_target_memcpy.
+        template<typename TViewDst, typename TViewSrc, typename TExtent>
+        struct TaskCopyOmp5<DimInt<1>, TViewDst, TViewSrc, TExtent>
+        {
+            static_assert(!std::is_const<TViewDst>::value, "The destination view can not be const!");
+
+            static_assert(Dim<TViewSrc>::value == 1, "The source view is required to have dimensionality 1!");
+            static_assert(Dim<TViewDst>::value == 1, "The destination view is required to have dimensionality 1!");
+            static_assert(Dim<TExtent>::value == 1, "The extent is required to have dimensionality 1!");
+            // TODO: Maybe check for Idx of TViewDst and TViewSrc to have greater or equal range than TExtent.
+            static_assert(
+                std::is_same<Elem<TViewDst>, typename std::remove_const<Elem<TViewSrc>>::type>::value,
+                "The source and the destination view are required to have the same element type!");
+
+            using Idx = alpaka::Idx<TExtent>;
+
+            //-----------------------------------------------------------------------------
+            //! \param viewDst The destination view.
+            //! \param viewSrc The source view.
+            //! \param extent The extent (in elements) to copy.
+            //! \param iDstDevice The OpenMP device number passed as destination device to the copy call.
+            //! \param iSrcDevice The OpenMP device number passed as source device to the copy call.
+            ALPAKA_FN_HOST TaskCopyOmp5(
+                TViewDst& viewDst,
+                TViewSrc const& viewSrc,
+                TExtent const& extent,
+                int const& iDstDevice,
+                int const& iSrcDevice)
+                : m_iDstDevice(iDstDevice)
+                , m_iSrcDevice(iSrcDevice)
+                ,
+#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
+                m_extentWidth(extent::getWidth(extent))
+                , m_dstWidth(static_cast<Idx>(extent::getWidth(viewDst)))
+                , m_srcWidth(static_cast<Idx>(extent::getWidth(viewSrc)))
+                ,
+#    endif
+                m_extentWidthBytes(extent::getWidth(extent) * static_cast<Idx>(sizeof(Elem<TViewDst>)))
+                , m_dstMemNative(reinterpret_cast<void*>(getPtrNative(viewDst)))
+                , m_srcMemNative(reinterpret_cast<void const*>(getPtrNative(viewSrc)))
+            {
+#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
+                // The extent to copy has to fit into both views.
+                ALPAKA_ASSERT(m_extentWidth <= m_dstWidth);
+                ALPAKA_ASSERT(m_extentWidth <= m_srcWidth);
+#    endif
+            }
+
+#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
+            //-----------------------------------------------------------------------------
+            //! Prints the devices, the widths and the native pointers of this task.
+            ALPAKA_FN_HOST auto printDebug() const -> void
+            {
+                std::cout << __func__ << " ddev: " << m_iDstDevice << " ew: " << m_extentWidth
+                          << " ewb: " << m_extentWidthBytes << " dw: " << m_dstWidth << " dptr: " << m_dstMemNative
+                          << " sdev: " << m_iSrcDevice << " sw: " << m_srcWidth << " sptr: " << m_srcMemNative
+                          << std::endl;
+            }
+#    endif
+            int m_iDstDevice;
+            int m_iSrcDevice;
+#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
+            // The widths in elements are only kept for the debug assertions and the debug output.
+            Idx m_extentWidth;
+            Idx m_dstWidth;
+            Idx m_srcWidth;
+#    endif
+            Idx m_extentWidthBytes; //!< Number of bytes to copy.
+            void* m_dstMemNative;
+            void const* m_srcMemNative;
+
+            //-----------------------------------------------------------------------------
+            //! Executes the copy. Does nothing if the number of bytes to copy is zero.
+            ALPAKA_FN_HOST auto operator()() const -> void
+            {
+                ALPAKA_DEBUG_FULL_LOG_SCOPE;
+
+#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
+                printDebug();
+#    endif
+                if(m_extentWidthBytes == 0)
+                {
+                    return;
+                }
+
+                // Both offsets are zero: the native pointers are used as the start of the copy.
+                ALPAKA_OMP5_CHECK(omp_target_memcpy(
+                    m_dstMemNative,
+                    const_cast<void*>(m_srcMemNative),
+                    static_cast<std::size_t>(m_extentWidthBytes),
+                    0,
+                    0,
+                    m_iDstDevice,
+                    m_iSrcDevice));
+            }
+        };
+    } // namespace detail
+
+    //-----------------------------------------------------------------------------
+    // Trait specializations for CreateTaskMemcpy.
+    namespace traits
+    {
+        namespace detail
+        {
+            //#############################################################################
+            //! The Omp5 memory copy task creation trait detail.
+            template<typename TDim, typename TDevDst, typename TDevSrc>
+            struct CreateTaskCopyImpl
+            {
+                //-----------------------------------------------------------------------------
+                //! Creates the Omp5 copy task for the given views, extent and device numbers.
+                //! Both device numbers default to 0.
+                template<typename TExtent, typename TViewSrc, typename TViewDst>
+                ALPAKA_FN_HOST static auto createTaskMemcpy(
+                    TViewDst& viewDst,
+                    TViewSrc const& viewSrc,
+                    TExtent const& extent,
+                    int iDeviceDst = 0,
+                    int iDeviceSrc = 0) -> alpaka::detail::TaskCopyOmp5<TDim, TViewDst, TViewSrc, TExtent>
+                {
+                    ALPAKA_DEBUG_FULL_LOG_SCOPE;
+
+                    using Task = alpaka::detail::TaskCopyOmp5<TDim, TViewDst, TViewSrc, TExtent>;
+                    return Task(viewDst, viewSrc, extent, iDeviceDst, iDeviceSrc);
+                }
+            };
+        } // namespace detail
+
+        //#############################################################################
+        //! The CPU to Omp5 memory copy trait specialization.
+        template<typename TDim>
+        struct CreateTaskMemcpy<TDim, DevOmp5, DevCpu>
+        {
+            //-----------------------------------------------------------------------------
+            //! Creates the task copying from a CPU view (source) into an Omp5 device view (destination).
+            template<typename TExtent, typename TViewSrc, typename TViewDst>
+            ALPAKA_FN_HOST static auto createTaskMemcpy(
+                TViewDst& viewDst,
+                TViewSrc const& viewSrc,
+                TExtent const& extent) -> alpaka::detail::TaskCopyOmp5<TDim, TViewDst, TViewSrc, TExtent>
+            {
+                ALPAKA_DEBUG_FULL_LOG_SCOPE;
+
+                using Task = alpaka::detail::TaskCopyOmp5<TDim, TViewDst, TViewSrc, TExtent>;
+
+                // Destination: the device of viewDst. Source: the OpenMP initial device.
+                auto const dstDeviceNum = getDev(viewDst).m_spDevOmp5Impl->iDevice();
+                auto const srcDeviceNum = omp_get_initial_device();
+                return Task(viewDst, viewSrc, extent, dstDeviceNum, srcDeviceNum);
+            }
+        };
+
+        //#############################################################################
+        //! The Omp5 to CPU memory copy trait specialization.
+        template<typename TDim>
+        struct CreateTaskMemcpy<TDim, DevCpu, DevOmp5>
+        {
+            //-----------------------------------------------------------------------------
+            //! Creates the task copying from an Omp5 device view (source) into a CPU view (destination).
+            template<typename TExtent, typename TViewSrc, typename TViewDst>
+            ALPAKA_FN_HOST static auto createTaskMemcpy(
+                TViewDst& viewDst,
+                TViewSrc const& viewSrc,
+                TExtent const& extent) -> alpaka::detail::TaskCopyOmp5<TDim, TViewDst, TViewSrc, TExtent>
+            {
+                ALPAKA_DEBUG_FULL_LOG_SCOPE;
+
+                using Task = alpaka::detail::TaskCopyOmp5<TDim, TViewDst, TViewSrc, TExtent>;
+
+                // Destination: the OpenMP initial device. Source: the device of viewSrc.
+                auto const dstDeviceNum = omp_get_initial_device();
+                auto const srcDeviceNum = getDev(viewSrc).m_spDevOmp5Impl->iDevice();
+                return Task(viewDst, viewSrc, extent, dstDeviceNum, srcDeviceNum);
+            }
+        };
+
+        //#############################################################################
+        //! The Omp5 to Omp5 memory copy trait specialization.
+        template<typename TDim>
+        struct CreateTaskMemcpy<TDim, DevOmp5, DevOmp5>
+        {
+            //-----------------------------------------------------------------------------
+            //! Creates the task copying from one Omp5 device view (source) into another (destination).
+            template<typename TExtent, typename TViewSrc, typename TViewDst>
+            ALPAKA_FN_HOST static auto createTaskMemcpy(
+                TViewDst& viewDst,
+                TViewSrc const& viewSrc,
+                TExtent const& extent) -> alpaka::detail::TaskCopyOmp5<TDim, TViewDst, TViewSrc, TExtent>
+            {
+                ALPAKA_DEBUG_FULL_LOG_SCOPE;
+
+                using Task = alpaka::detail::TaskCopyOmp5<TDim, TViewDst, TViewSrc, TExtent>;
+
+                // Each side uses the device number of the device its view belongs to.
+                auto const dstDeviceNum = getDev(viewDst).m_spDevOmp5Impl->iDevice();
+                auto const srcDeviceNum = getDev(viewSrc).m_spDevOmp5Impl->iDevice();
+                return Task(viewDst, viewSrc, extent, dstDeviceNum, srcDeviceNum);
+            }
+        };
+    } // namespace traits
+} // namespace alpaka
+
+#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/mem/buf/omp5/Set.hpp b/thirdParty/cupla/alpaka/include/alpaka/mem/buf/omp5/Set.hpp
new file mode 100644
index 0000000000..ee988406c2
--- /dev/null
+++ b/thirdParty/cupla/alpaka/include/alpaka/mem/buf/omp5/Set.hpp
@@ -0,0 +1,94 @@
+/* Copyright 2019 Benjamin Worpitz, Erik Zenker, Matthias Werner
+ *
+ * This file is part of Alpaka.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#pragma once
+
+#ifdef ALPAKA_ACC_ANY_BT_OMP5_ENABLED
+
+#    if _OPENMP < 201307
+#        error If ALPAKA_ACC_ANY_BT_OMP5_ENABLED is set, the compiler has to support OpenMP 4.0 or higher!
+#    endif
+
+#    include <alpaka/core/Assert.hpp>
+#    include <alpaka/core/Utility.hpp>
+#    include <alpaka/dev/DevOmp5.hpp>
+#    include <alpaka/dim/DimIntegralConst.hpp>
+#    include <alpaka/extent/Traits.hpp>
+#    include <alpaka/idx/Accessors.hpp>
+#    include <alpaka/kernel/TaskKernelOmp5.hpp>
+#    include <alpaka/mem/buf/SetKernel.hpp>
+#    include <alpaka/mem/view/Traits.hpp>
+#    include <alpaka/meta/Integral.hpp>
+#    include <alpaka/queue/QueueOmp5Blocking.hpp>
+#    include <alpaka/vec/Vec.hpp>
+#    include <alpaka/workdiv/WorkDivHelpers.hpp>
+
+#    include <cstring>
+
+namespace alpaka
+{
+    class DevOmp5;
+
+    namespace traits
+    {
+        //#############################################################################
+        //! The OMP5 device memory set trait specialization.
+        template<typename TDim>
+        struct CreateTaskMemset<TDim, DevOmp5>
+        {
+            //-----------------------------------------------------------------------------
+            //! Creates a kernel task running MemSetKernel on the memory of the given view.
+            //!
+            //! \param view The view whose memory is set.
+            //! \param byte The byte value handed to the kernel.
+            //! \param extent The extent (in elements) to set.
+            //! \return The kernel task. It gets an all-zero work division if the first pitch of the view is zero.
+            template<typename TExtent, typename TView>
+            ALPAKA_FN_HOST static auto createTaskMemset(TView& view, std::uint8_t const& byte, TExtent const& extent)
+            {
+                using Idx = typename alpaka::traits::IdxType<TExtent>::type;
+                using Acc = AccOmp5<TDim, Idx>;
+                using IdxVec = Vec<TDim, Idx>;
+                using WorkDiv = WorkDivMembers<TDim, Idx>;
+                constexpr auto lastDim = TDim::value - 1;
+
+                auto pitch = getPitchBytesVec(view);
+                // Convert the extent from elements to bytes along the last dimension.
+                auto byteExtent = extent::getExtentVec(extent);
+                byteExtent[lastDim] *= static_cast<Idx>(sizeof(Elem<TView>));
+
+                if(pitch[0] != 0)
+                {
+#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
+                    std::cout << "Set TDim=" << TDim::value << " pitch=" << pitch << " byteExtent=" << byteExtent
+                              << std::endl;
+#    endif
+                    auto elementsPerThread = IdxVec::all(static_cast<Idx>(1u));
+                    elementsPerThread[lastDim] = 4;
+                    // Let alpaka calculate good block and grid sizes given our full problem extent
+                    WorkDiv const workDiv(getValidWorkDiv<Acc>(
+                        getDev(view),
+                        byteExtent,
+                        elementsPerThread,
+                        false,
+                        alpaka::GridBlockExtentSubDivRestrictions::Unrestricted));
+                    return createTaskKernel<Acc>(
+                        workDiv,
+                        MemSetKernel(),
+                        byte,
+                        reinterpret_cast<std::uint8_t*>(alpaka::getPtrNative(view)),
+                        byteExtent,
+                        pitch);
+                }
+
+                // NOP if size is zero
+                return createTaskKernel<Acc>(
+                    WorkDiv(IdxVec::zeros(), IdxVec::zeros(), IdxVec::zeros()),
+                    MemSetKernel(),
+                    byte,
+                    reinterpret_cast<std::uint8_t*>(alpaka::getPtrNative(view)),
+                    byteExtent,
+                    pitch);
+            }
+        };
+    } // namespace traits
+} // namespace alpaka
+
+#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/mem/buf/uniformCudaHip/Copy.hpp b/thirdParty/cupla/alpaka/include/alpaka/mem/buf/uniformCudaHip/Copy.hpp
new file mode 100644
index 0000000000..162efbcf00
--- /dev/null
+++ b/thirdParty/cupla/alpaka/include/alpaka/mem/buf/uniformCudaHip/Copy.hpp
@@ -0,0 +1,809 @@
+/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Erik Zenker, Matthias Werner,
+ *                Rene Widera
+ *
+ * This file is part of alpaka.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#pragma once
+#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
+
+#    include <alpaka/core/BoostPredef.hpp>
+
+#    if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && !BOOST_LANG_CUDA
+#        error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
+#    endif
+
+#    if defined(ALPAKA_ACC_GPU_HIP_ENABLED) && !BOOST_LANG_HIP
+#        error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
+#    endif
+
+#    include <alpaka/core/Assert.hpp>
+#    include <alpaka/dev/DevCpu.hpp>
+#    include <alpaka/dev/DevUniformCudaHipRt.hpp>
+#    include <alpaka/dim/DimIntegralConst.hpp>
+#    include <alpaka/extent/Traits.hpp>
+#    include <alpaka/mem/view/Traits.hpp>
+#    include <alpaka/queue/QueueUniformCudaHipRtBlocking.hpp>
+#    include <alpaka/queue/QueueUniformCudaHipRtNonBlocking.hpp>
+// Backend specific includes.
+#    if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
+#        include <alpaka/core/Cuda.hpp>
+#    else
+#        include <alpaka/core/Hip.hpp>
+#    endif
+
+#    include <cstdint>
+#    include <set>
+#    include <tuple>
+#    include <type_traits>
+
+namespace alpaka
+{
+    namespace detail
+    {
+        //-----------------------------------------------------------------------------
+        //! Tries to enable direct peer access from device devSrc to device devDst.
+        //!
+        //! Every ordered (devSrc, devDst) pair is handled at most once; later calls with the same pair return
+        //! immediately.
+        //! Not being able to enable peer access does not prevent such device to device memory copies.
+        //! However, those copies may be slower because the memory is copied via the CPU.
+        inline auto enablePeerAccessIfPossible(const int& devSrc, const int& devDst) -> void
+        {
+            ALPAKA_ASSERT(devSrc != devDst);
+
+#    if BOOST_COMP_CLANG
+#        pragma clang diagnostic push
+#        pragma clang diagnostic ignored "-Wexit-time-destructors"
+#    endif
+            static std::set<std::pair<int, int>> alreadyCheckedPeerAccessDevices;
+#    if BOOST_COMP_CLANG
+#        pragma clang diagnostic pop
+#    endif
+            // Remember the pair before querying the runtime; insert() tells us whether it was already known.
+            auto const devicePair = std::make_pair(devSrc, devDst);
+            bool const isFirstCheck = alreadyCheckedPeerAccessDevices.insert(devicePair).second;
+            if(!isFirstCheck)
+            {
+                return;
+            }
+
+            int peerAccessible = 0;
+            ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
+                ALPAKA_API_PREFIX(DeviceCanAccessPeer)(&peerAccessible, devSrc, devDst));
+
+            if(peerAccessible == 0)
+            {
+#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
+                std::cout << __func__ << " Direct peer access between given GPUs is not possible!"
+                          << " src=" << devSrc << " dst=" << devDst << std::endl;
+#    endif
+                return;
+            }
+
+            // Peer access is enabled from the source device, so make it the current device first.
+            ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(ALPAKA_API_PREFIX(SetDevice)(devSrc));
+
+            // NOTE: "until access is explicitly disabled using cudaDeviceDisablePeerAccess() or either device is
+            // reset using cudaDeviceReset()." We do not remove a device from the enabled device pairs on
+            // cudaDeviceReset. Note that access granted by this call is unidirectional and that in order to access
+            // memory on the current device from peerDevice, a separate symmetric call to
+            // cudaDeviceEnablePeerAccess() is required.
+            ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(ALPAKA_API_PREFIX(DeviceEnablePeerAccess)(devDst, 0));
+        }
+
+        //#############################################################################
+        //! The CUDA/HIP memory copy task.
+        //!
+        //! Only declared here; the implementations are partial specializations selected by the dimensionality
+        //! TDim (e.g. DimInt<1>, DimInt<2>).
+        template<typename TDim, typename TViewDst, typename TViewSrc, typename TExtent>
+        struct TaskCopyUniformCudaHip;
+
+        //#############################################################################
+        //! The 1D CUDA/HIP memory copy task.
+        //!
+        //! The constructor captures the native pointers, the device numbers and the number of bytes to copy.
+        //! enqueue() issues the copy on the native queue of the given alpaka queue.
+        template<typename TViewDst, typename TViewSrc, typename TExtent>
+        struct TaskCopyUniformCudaHip<DimInt<1>, TViewDst, TViewSrc, TExtent>
+        {
+            using MemcpyKind = ALPAKA_API_PREFIX(MemcpyKind);
+
+            static_assert(!std::is_const<TViewDst>::value, "The destination view can not be const!");
+
+            static_assert(
+                Dim<TViewDst>::value == Dim<TViewSrc>::value,
+                "The source and the destination view are required to have the same dimensionality!");
+            static_assert(
+                Dim<TViewDst>::value == Dim<TExtent>::value,
+                "The views and the extent are required to have the same dimensionality!");
+            // TODO: Maybe check for Idx of TViewDst and TViewSrc to have greater or equal range than TExtent.
+            static_assert(
+                std::is_same<Elem<TViewDst>, std::remove_const_t<Elem<TViewSrc>>>::value,
+                "The source and the destination view are required to have the same element type!");
+
+            // Qualified on purpose: an unqualified Idx<TExtent> would change the meaning of the name Idx within
+            // this class once the alias is declared.
+            using Idx = alpaka::Idx<TExtent>;
+
+            //-----------------------------------------------------------------------------
+            //! \param viewDst The destination view.
+            //! \param viewSrc The source view.
+            //! \param extent The extent (in elements) to copy.
+            //! \param uniformMemCpyKind The copy kind passed to the runtime for copies within one device.
+            //! \param iDstDevice The native device number of the destination.
+            //! \param iSrcDevice The native device number of the source.
+            ALPAKA_FN_HOST TaskCopyUniformCudaHip(
+                TViewDst& viewDst,
+                TViewSrc const& viewSrc,
+                TExtent const& extent,
+                MemcpyKind const& uniformMemCpyKind,
+                int const& iDstDevice,
+                int const& iSrcDevice)
+                : m_uniformMemCpyKind(uniformMemCpyKind)
+                , m_iDstDevice(iDstDevice)
+                , m_iSrcDevice(iSrcDevice)
+                ,
+#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
+                m_extentWidth(extent::getWidth(extent))
+                , m_dstWidth(static_cast<Idx>(extent::getWidth(viewDst)))
+                , m_srcWidth(static_cast<Idx>(extent::getWidth(viewSrc)))
+                ,
+#    endif
+                m_extentWidthBytes(extent::getWidth(extent) * static_cast<Idx>(sizeof(Elem<TViewDst>)))
+                , m_dstMemNative(reinterpret_cast<void*>(getPtrNative(viewDst)))
+                , m_srcMemNative(reinterpret_cast<void const*>(getPtrNative(viewSrc)))
+            {
+#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
+                // The extent to copy has to fit into both views.
+                ALPAKA_ASSERT(m_extentWidth <= m_dstWidth);
+                ALPAKA_ASSERT(m_extentWidth <= m_srcWidth);
+#    endif
+            }
+
+            //-----------------------------------------------------------------------------
+            //! Issues the copy on the native queue of the given queue.
+            //! Does nothing if the number of bytes to copy is zero.
+            template<typename TQueue>
+            auto enqueue(TQueue& queue) const -> void
+            {
+#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
+                printDebug();
+#    endif
+                if(m_extentWidthBytes == 0)
+                {
+                    return;
+                }
+
+                auto const& uniformCudaHipMemCpyKind(m_uniformMemCpyKind);
+
+                if(m_iDstDevice == m_iSrcDevice)
+                {
+                    // Set the current device.
+                    ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(ALPAKA_API_PREFIX(SetDevice)(m_iDstDevice));
+                    // Initiate the memory copy.
+                    ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(ALPAKA_API_PREFIX(MemcpyAsync)(
+                        m_dstMemNative,
+                        m_srcMemNative,
+                        static_cast<std::size_t>(m_extentWidthBytes),
+                        uniformCudaHipMemCpyKind,
+                        queue.m_spQueueImpl->m_UniformCudaHipQueue));
+                }
+                else
+                {
+                    // Different devices: try to enable direct peer access before the peer copy.
+                    alpaka::detail::enablePeerAccessIfPossible(m_iSrcDevice, m_iDstDevice);
+
+                    // Initiate the memory copy.
+                    ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(ALPAKA_API_PREFIX(MemcpyPeerAsync)(
+                        m_dstMemNative,
+                        m_iDstDevice,
+                        m_srcMemNative,
+                        m_iSrcDevice,
+                        static_cast<std::size_t>(m_extentWidthBytes),
+                        queue.m_spQueueImpl->m_UniformCudaHipQueue));
+                }
+            }
+
+        private:
+#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
+            //-----------------------------------------------------------------------------
+            //! Prints the devices, the widths and the native pointers of this task.
+            ALPAKA_FN_HOST auto printDebug() const -> void
+            {
+                std::cout << __func__ << " ddev: " << m_iDstDevice << " ew: " << m_extentWidth
+                          << " ewb: " << m_extentWidthBytes << " dw: " << m_dstWidth << " dptr: " << m_dstMemNative
+                          << " sdev: " << m_iSrcDevice << " sw: " << m_srcWidth << " sptr: " << m_srcMemNative
+                          << std::endl;
+            }
+#    endif
+
+            MemcpyKind m_uniformMemCpyKind;
+            int m_iDstDevice;
+            int m_iSrcDevice;
+#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
+            // The widths in elements are only kept for the debug assertions and the debug output.
+            Idx m_extentWidth;
+            Idx m_dstWidth;
+            Idx m_srcWidth;
+#    endif
+            Idx m_extentWidthBytes; //!< Number of bytes to copy.
+            void* m_dstMemNative;
+            void const* m_srcMemNative;
+        };
+        //#############################################################################
+        //! The 2D CUDA/HIP memory copy trait.
+        template<typename TViewDst, typename TViewSrc, typename TExtent>
+        struct TaskCopyUniformCudaHip<DimInt<2>, TViewDst, TViewSrc, TExtent>
+        {
+            using MemcpyKind = ALPAKA_API_PREFIX(MemcpyKind);
+
+            static_assert(!std::is_const<TViewDst>::value, "The destination view can not be const!");
+
+            static_assert(
+                Dim<TViewDst>::value == Dim<TViewSrc>::value,
+                "The source and the destination view are required to have the same dimensionality!");
+            static_assert(
+                Dim<TViewDst>::value == Dim<TExtent>::value,
+                "The views and the extent are required to have the same dimensionality!");
+            // TODO: Maybe check for Idx of TViewDst and TViewSrc to have greater or equal range than TExtent.
+            static_assert(
+                std::is_same<Elem<TViewDst>, std::remove_const_t<Elem<TViewSrc>>>::value,
+                "The source and the destination view are required to have the same element type!");
+
+            using Idx = Idx<TExtent>;
+
+            //-----------------------------------------------------------------------------
+            //! Captures the native pointers, extents and pitches needed to issue the 2D copy from enqueue().
+            //!
+            //! \param viewDst The destination view.
+            //! \param viewSrc The source view.
+            //! \param extent The extent to copy; its width and height are read.
+            //! \param uniformMemcpyKind The CUDA/HIP copy kind handed to the runtime copy call.
+            //! \param iDstDevice The index of the destination device.
+            //! \param iSrcDevice The index of the source device.
+            ALPAKA_FN_HOST TaskCopyUniformCudaHip(
+                TViewDst& viewDst,
+                TViewSrc const& viewSrc,
+                TExtent const& extent,
+                MemcpyKind const& uniformMemcpyKind,
+                int const& iDstDevice,
+                int const& iSrcDevice)
+                : m_uniformMemCpyKind(uniformMemcpyKind)
+                , m_iDstDevice(iDstDevice)
+                , m_iSrcDevice(iSrcDevice)
+#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
+                // The width in elements is only kept for the debug assertions and printDebug().
+                , m_extentWidth(extent::getWidth(extent))
+#    endif
+                , m_extentWidthBytes(extent::getWidth(extent) * static_cast<Idx>(sizeof(Elem<TViewDst>)))
+                // The widths of the views are required for the 3D peer copy.
+                , m_dstWidth(static_cast<Idx>(extent::getWidth(viewDst)))
+                , m_srcWidth(static_cast<Idx>(extent::getWidth(viewSrc)))
+                , m_extentHeight(extent::getHeight(extent))
+                , m_dstHeight(static_cast<Idx>(extent::getHeight(viewDst)))
+                , m_srcHeight(static_cast<Idx>(extent::getHeight(viewSrc)))
+                , m_dstpitchBytesX(static_cast<Idx>(getPitchBytes<Dim<TViewDst>::value - 1u>(viewDst)))
+                , m_srcpitchBytesX(static_cast<Idx>(getPitchBytes<Dim<TViewSrc>::value - 1u>(viewSrc)))
+                // Each pitch index is derived from the dimensionality of the view it is applied to.
+                , m_dstPitchBytesY(
+                      static_cast<Idx>(getPitchBytes<Dim<TViewDst>::value - (2u % Dim<TViewDst>::value)>(viewDst)))
+                , m_srcPitchBytesY(
+                      static_cast<Idx>(getPitchBytes<Dim<TViewSrc>::value - (2u % Dim<TViewSrc>::value)>(viewSrc)))
+                , m_dstMemNative(reinterpret_cast<void*>(getPtrNative(viewDst)))
+                , m_srcMemNative(reinterpret_cast<void const*>(getPtrNative(viewSrc)))
+            {
+#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
+                // The extent to copy has to fit into both views ...
+                ALPAKA_ASSERT(m_extentWidth <= m_dstWidth);
+                ALPAKA_ASSERT(m_extentHeight <= m_dstHeight);
+                ALPAKA_ASSERT(m_extentWidth <= m_srcWidth);
+                ALPAKA_ASSERT(m_extentHeight <= m_srcHeight);
+                // ... and a copied row must not be longer than the row pitch of either view.
+                ALPAKA_ASSERT(m_extentWidthBytes <= m_dstpitchBytesX);
+                ALPAKA_ASSERT(m_extentWidthBytes <= m_srcpitchBytesX);
+#    endif
+            }
+
+            //-----------------------------------------------------------------------------
+            //! Issues the captured 2D copy on the native stream of the given queue.
+            //!
+            //! Nothing is issued when the extent to copy has a width or a height of zero.
+            //! \tparam TQueue The queue type; it has to provide m_spQueueImpl->m_UniformCudaHipQueue.
+            //! \param queue The queue whose native stream receives the copy.
+            template<typename TQueue>
+            auto enqueue(TQueue& queue) const -> void
+            {
+#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
+                printDebug();
+#    endif
+                // This is not only an optimization but also prevents a division by zero.
+                bool const nothingToCopy = (m_extentWidthBytes == 0) || (m_extentHeight == 0);
+                if(nothingToCopy)
+                {
+                    return;
+                }
+
+                if(m_iDstDevice != m_iSrcDevice)
+                {
+                    // Source and destination are on different devices.
+                    alpaka::detail::enablePeerAccessIfPossible(m_iSrcDevice, m_iDstDevice);
+#    if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
+                    // There is no cudaMemcpy2DPeerAsync, therefore the copy is described as a 3D peer copy.
+                    auto const memCpy3DPeerParms = buildCudaMemcpy3DPeerParms();
+                    ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
+                        cudaMemcpy3DPeerAsync(&memCpy3DPeerParms, queue.m_spQueueImpl->m_UniformCudaHipQueue));
+#    endif
+                    // NOTE: Without ALPAKA_ACC_GPU_CUDA_ENABLED no copy is issued for different devices.
+                    return;
+                }
+
+                // Source and destination are on the same device: make it current, then initiate the copy.
+                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(ALPAKA_API_PREFIX(SetDevice)(m_iDstDevice));
+                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(ALPAKA_API_PREFIX(Memcpy2DAsync)(
+                    m_dstMemNative,
+                    static_cast<std::size_t>(m_dstpitchBytesX),
+                    m_srcMemNative,
+                    static_cast<std::size_t>(m_srcpitchBytesX),
+                    static_cast<std::size_t>(m_extentWidthBytes),
+                    static_cast<std::size_t>(m_extentHeight),
+                    m_uniformMemCpyKind,
+                    queue.m_spQueueImpl->m_UniformCudaHipQueue));
+            }
+
+        private:
+#    if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
+            //-----------------------------------------------------------------------------
+            //! Describes the captured 2D copy as a CUDA 3D peer copy with a depth of one.
+            //!
+            //! \return The filled cudaMemcpy3DPeerParms structure.
+            ALPAKA_FN_HOST auto buildCudaMemcpy3DPeerParms() const -> cudaMemcpy3DPeerParms
+            {
+                ALPAKA_DEBUG_FULL_LOG_SCOPE;
+
+                // The ysize of each pitched pointer is the ratio of the two pitches of its view.
+                auto const dstYSize = static_cast<std::size_t>(m_dstPitchBytesY / m_dstpitchBytesX);
+                auto const srcYSize = static_cast<std::size_t>(m_srcPitchBytesY / m_srcpitchBytesX);
+
+                cudaMemcpy3DPeerParms parms;
+
+                // Source side: either srcArray or srcPtr is used, here it is srcPtr.
+                parms.srcDevice = m_iSrcDevice;
+                parms.srcArray = nullptr;
+                parms.srcPos = make_cudaPos(0, 0, 0); // Optional. Offset in bytes.
+                parms.srcPtr = make_cudaPitchedPtr(
+                    const_cast<void*>(m_srcMemNative),
+                    static_cast<std::size_t>(m_srcpitchBytesX),
+                    static_cast<std::size_t>(m_srcWidth),
+                    srcYSize);
+
+                // Destination side: either dstArray or dstPtr is used, here it is dstPtr.
+                parms.dstDevice = m_iDstDevice;
+                parms.dstArray = nullptr;
+                parms.dstPos = make_cudaPos(0, 0, 0); // Optional. Offset in bytes.
+                parms.dstPtr = make_cudaPitchedPtr(
+                    m_dstMemNative,
+                    static_cast<std::size_t>(m_dstpitchBytesX),
+                    static_cast<std::size_t>(m_dstWidth),
+                    dstYSize);
+
+                // The copied region: width in bytes, height in rows and a single slice.
+                parms.extent = make_cudaExtent(
+                    static_cast<std::size_t>(m_extentWidthBytes),
+                    static_cast<std::size_t>(m_extentHeight),
+                    static_cast<std::size_t>(1u));
+
+                return parms;
+            }
+#    endif
+#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
+            //-----------------------------------------------------------------------------
+            //! Writes the captured copy parameters as a single line to std::cout.
+            ALPAKA_FN_HOST auto printDebug() const -> void
+            {
+                auto& out = std::cout;
+
+                // Extent to copy.
+                out << __func__ << " ew: " << m_extentWidth << " eh: " << m_extentHeight;
+                out << " ewb: " << m_extentWidthBytes;
+                // Destination.
+                out << " ddev: " << m_iDstDevice << " dw: " << m_dstWidth << " dh: " << m_dstHeight;
+                out << " dptr: " << m_dstMemNative << " dpitchb: " << m_dstpitchBytesX;
+                // Source.
+                out << " sdev: " << m_iSrcDevice << " sw: " << m_srcWidth << " sh: " << m_srcHeight;
+                out << " sptr: " << m_srcMemNative << " spitchb: " << m_srcpitchBytesX;
+                out << std::endl;
+            }
+#    endif
+
+            MemcpyKind m_uniformMemCpyKind;
+            int m_iDstDevice;
+            int m_iSrcDevice;
+
+#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
+            Idx m_extentWidth;
+#    endif
+            Idx m_extentWidthBytes;
+            Idx m_dstWidth; // required for 3D peer copy
+            Idx m_srcWidth; // required for 3D peer copy
+
+            Idx m_extentHeight;
+            Idx m_dstHeight; // required for 3D peer copy
+            Idx m_srcHeight; // required for 3D peer copy
+
+            Idx m_dstpitchBytesX;
+            Idx m_srcpitchBytesX;
+            Idx m_dstPitchBytesY;
+            Idx m_srcPitchBytesY;
+
+
+            void* m_dstMemNative;
+            void const* m_srcMemNative;
+        };
+        //#############################################################################
+        //! The 3D CUDA/HIP memory copy task.
+        //!
+        //! The constructor captures the native pointers, extents and pitches of the views;
+        //! enqueue() issues the copy on the native stream of a queue.
+        //! \tparam TViewDst The destination view type. It must not be const.
+        //! \tparam TViewSrc The source view type.
+        //! \tparam TExtent The type of the extent to copy.
+        template<typename TViewDst, typename TViewSrc, typename TExtent>
+        struct TaskCopyUniformCudaHip<DimInt<3>, TViewDst, TViewSrc, TExtent>
+        {
+            using MemcpyKind = ALPAKA_API_PREFIX(MemcpyKind);
+
+            static_assert(!std::is_const<TViewDst>::value, "The destination view can not be const!");
+
+            static_assert(
+                Dim<TViewDst>::value == Dim<TViewSrc>::value,
+                "The source and the destination view are required to have the same dimensionality!");
+            static_assert(
+                Dim<TViewDst>::value == Dim<TExtent>::value,
+                "The views and the extent are required to have the same dimensionality!");
+            // TODO: Maybe check for Idx of TViewDst and TViewSrc to have greater or equal range than TExtent.
+            static_assert(
+                std::is_same<Elem<TViewDst>, std::remove_const_t<Elem<TViewSrc>>>::value,
+                "The source and the destination view are required to have the same element type!");
+
+            using Idx = Idx<TExtent>;
+
+            //-----------------------------------------------------------------------------
+            //! Captures the native pointers, extents and pitches needed to issue the 3D copy from enqueue().
+            //!
+            //! \param viewDst The destination view.
+            //! \param viewSrc The source view.
+            //! \param extent The extent to copy; its width, height and depth are read.
+            //! \param uniformMemcpyKind The CUDA/HIP copy kind stored in the copy parameters.
+            //! \param iDstDevice The index of the destination device.
+            //! \param iSrcDevice The index of the source device.
+            ALPAKA_FN_HOST TaskCopyUniformCudaHip(
+                TViewDst& viewDst,
+                TViewSrc const& viewSrc,
+                TExtent const& extent,
+                MemcpyKind const& uniformMemcpyKind,
+                int const& iDstDevice,
+                int const& iSrcDevice)
+                : m_uniformMemCpyKind(uniformMemcpyKind)
+                , m_iDstDevice(iDstDevice)
+                , m_iSrcDevice(iSrcDevice)
+                , m_extentWidth(extent::getWidth(extent))
+                // m_extentWidth is declared before m_extentWidthBytes, so it is already initialized here.
+                , m_extentWidthBytes(m_extentWidth * static_cast<Idx>(sizeof(Elem<TViewDst>)))
+                , m_dstWidth(static_cast<Idx>(extent::getWidth(viewDst)))
+                , m_srcWidth(static_cast<Idx>(extent::getWidth(viewSrc)))
+                , m_extentHeight(extent::getHeight(extent))
+                , m_dstHeight(static_cast<Idx>(extent::getHeight(viewDst)))
+                , m_srcHeight(static_cast<Idx>(extent::getHeight(viewSrc)))
+                , m_extentDepth(extent::getDepth(extent))
+#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
+                // The depths of the views are only kept for the debug assertions and printDebug().
+                , m_dstDepth(static_cast<Idx>(extent::getDepth(viewDst)))
+                , m_srcDepth(static_cast<Idx>(extent::getDepth(viewSrc)))
+#    endif
+                , m_dstpitchBytesX(static_cast<Idx>(getPitchBytes<Dim<TViewDst>::value - 1u>(viewDst)))
+                , m_srcpitchBytesX(static_cast<Idx>(getPitchBytes<Dim<TViewSrc>::value - 1u>(viewSrc)))
+                // Each pitch index is derived from the dimensionality of the view it is applied to.
+                , m_dstPitchBytesY(
+                      static_cast<Idx>(getPitchBytes<Dim<TViewDst>::value - (2u % Dim<TViewDst>::value)>(viewDst)))
+                , m_srcPitchBytesY(
+                      static_cast<Idx>(getPitchBytes<Dim<TViewSrc>::value - (2u % Dim<TViewSrc>::value)>(viewSrc)))
+                , m_dstMemNative(reinterpret_cast<void*>(getPtrNative(viewDst)))
+                , m_srcMemNative(reinterpret_cast<void const*>(getPtrNative(viewSrc)))
+            {
+#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
+                // The extent to copy has to fit into both views ...
+                ALPAKA_ASSERT(m_extentWidth <= m_dstWidth);
+                ALPAKA_ASSERT(m_extentHeight <= m_dstHeight);
+                ALPAKA_ASSERT(m_extentDepth <= m_dstDepth);
+                ALPAKA_ASSERT(m_extentWidth <= m_srcWidth);
+                ALPAKA_ASSERT(m_extentHeight <= m_srcHeight);
+                ALPAKA_ASSERT(m_extentDepth <= m_srcDepth);
+                // ... and a copied row must not be longer than the row pitch of either view.
+                ALPAKA_ASSERT(m_extentWidthBytes <= m_dstpitchBytesX);
+                ALPAKA_ASSERT(m_extentWidthBytes <= m_srcpitchBytesX);
+#    endif
+            }
+
+            //-----------------------------------------------------------------------------
+            //! Issues the captured 3D copy on the native stream of the given queue.
+            //!
+            //! Nothing is issued when the extent to copy has a width, height or depth of zero.
+            //! \tparam TQueue The queue type; it has to provide m_spQueueImpl->m_UniformCudaHipQueue.
+            //! \param queue The queue whose native stream receives the copy.
+            template<typename TQueue>
+            auto enqueue(TQueue& queue) const -> void
+            {
+#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
+                printDebug();
+#    endif
+                // This is not only an optimization but also prevents a division by zero.
+                if(m_extentWidthBytes == 0 || m_extentHeight == 0 || m_extentDepth == 0)
+                {
+                    return;
+                }
+
+                if(m_iDstDevice == m_iSrcDevice)
+                {
+                    // Create the struct describing the copy.
+                    ALPAKA_API_PREFIX(Memcpy3DParms)
+                    const uniformCudaHipMemCpy3DParms(buildUniformCudaHipMemcpy3DParms());
+                    // Set the current device.
+                    ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(ALPAKA_API_PREFIX(SetDevice)(m_iDstDevice));
+                    // Initiate the memory copy.
+                    ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(ALPAKA_API_PREFIX(
+                        Memcpy3DAsync)(&uniformCudaHipMemCpy3DParms, queue.m_spQueueImpl->m_UniformCudaHipQueue));
+                }
+                else
+                {
+                    alpaka::detail::enablePeerAccessIfPossible(m_iSrcDevice, m_iDstDevice);
+#    if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
+                    // Create the struct describing the copy.
+                    cudaMemcpy3DPeerParms const cudaMemCpy3DPeerParms(buildCudaMemcpy3DPeerParms());
+                    // Initiate the memory copy.
+                    ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
+                        cudaMemcpy3DPeerAsync(&cudaMemCpy3DPeerParms, queue.m_spQueueImpl->m_UniformCudaHipQueue));
+#    endif
+                    // NOTE: Without ALPAKA_ACC_GPU_CUDA_ENABLED no copy is issued for different devices.
+                }
+            }
+
+        private:
+            //-----------------------------------------------------------------------------
+            //! Builds the parameter structure for a copy within one device.
+            //!
+            //! \return The filled CUDA/HIP Memcpy3DParms structure.
+            ALPAKA_FN_HOST auto buildUniformCudaHipMemcpy3DParms() const -> ALPAKA_API_PREFIX(Memcpy3DParms)
+            {
+                ALPAKA_DEBUG_FULL_LOG_SCOPE;
+
+                // Fill CUDA/HIP parameter structure.
+                // It is value-initialized so that no field is left indeterminate, even one not assigned below.
+                ALPAKA_API_PREFIX(Memcpy3DParms) memCpy3DParms{};
+                memCpy3DParms.srcArray = nullptr; // Either srcArray or srcPtr.
+                memCpy3DParms.srcPos
+                    = ALPAKA_PP_CONCAT(make_, ALPAKA_API_PREFIX(Pos))(0, 0, 0); // Optional. Offset in bytes.
+                memCpy3DParms.srcPtr = ALPAKA_PP_CONCAT(make_, ALPAKA_API_PREFIX(PitchedPtr))(
+                    const_cast<void*>(m_srcMemNative),
+                    static_cast<std::size_t>(m_srcpitchBytesX),
+                    static_cast<std::size_t>(m_srcWidth),
+                    static_cast<std::size_t>(m_srcPitchBytesY / m_srcpitchBytesX));
+                memCpy3DParms.dstArray = nullptr; // Either dstArray or dstPtr.
+                memCpy3DParms.dstPos
+                    = ALPAKA_PP_CONCAT(make_, ALPAKA_API_PREFIX(Pos))(0, 0, 0); // Optional. Offset in bytes.
+                memCpy3DParms.dstPtr = ALPAKA_PP_CONCAT(make_, ALPAKA_API_PREFIX(PitchedPtr))(
+                    m_dstMemNative,
+                    static_cast<std::size_t>(m_dstpitchBytesX),
+                    static_cast<std::size_t>(m_dstWidth),
+                    static_cast<std::size_t>(m_dstPitchBytesY / m_dstpitchBytesX));
+                memCpy3DParms.extent = ALPAKA_PP_CONCAT(make_, ALPAKA_API_PREFIX(Extent))(
+                    static_cast<std::size_t>(m_extentWidthBytes),
+                    static_cast<std::size_t>(m_extentHeight),
+                    static_cast<std::size_t>(m_extentDepth));
+#    if defined(ALPAKA_ACC_GPU_HIP_ENABLED) && defined(__HIP_PLATFORM_NVCC__)
+                // With HIP on the NVCC platform the copy kind is converted to the CUDA enumeration.
+                memCpy3DParms.kind = hipMemcpyKindToCudaMemcpyKind(m_uniformMemCpyKind);
+#    else
+                memCpy3DParms.kind = m_uniformMemCpyKind;
+#    endif
+                return memCpy3DParms;
+            }
+
+#    if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
+            //-----------------------------------------------------------------------------
+            //! Builds the parameter structure for a copy between two different CUDA devices.
+            //!
+            //! \return The filled cudaMemcpy3DPeerParms structure.
+            ALPAKA_FN_HOST auto buildCudaMemcpy3DPeerParms() const -> cudaMemcpy3DPeerParms
+            {
+                ALPAKA_DEBUG_FULL_LOG_SCOPE;
+
+                // Fill CUDA parameter structure.
+                // It is value-initialized so that no field is left indeterminate, even one not assigned below.
+                cudaMemcpy3DPeerParms cudaMemCpy3DPeerParms{};
+                cudaMemCpy3DPeerParms.dstArray = nullptr; // Either dstArray or dstPtr.
+                cudaMemCpy3DPeerParms.dstDevice = m_iDstDevice;
+                cudaMemCpy3DPeerParms.dstPos = make_cudaPos(0, 0, 0); // Optional. Offset in bytes.
+                cudaMemCpy3DPeerParms.dstPtr = make_cudaPitchedPtr(
+                    m_dstMemNative,
+                    static_cast<std::size_t>(m_dstpitchBytesX),
+                    static_cast<std::size_t>(m_dstWidth),
+                    static_cast<std::size_t>(m_dstPitchBytesY / m_dstpitchBytesX));
+                cudaMemCpy3DPeerParms.extent = make_cudaExtent(
+                    static_cast<std::size_t>(m_extentWidthBytes),
+                    static_cast<std::size_t>(m_extentHeight),
+                    static_cast<std::size_t>(m_extentDepth));
+                cudaMemCpy3DPeerParms.srcArray = nullptr; // Either srcArray or srcPtr.
+                cudaMemCpy3DPeerParms.srcDevice = m_iSrcDevice;
+                cudaMemCpy3DPeerParms.srcPos = make_cudaPos(0, 0, 0); // Optional. Offset in bytes.
+                cudaMemCpy3DPeerParms.srcPtr = make_cudaPitchedPtr(
+                    const_cast<void*>(m_srcMemNative),
+                    static_cast<std::size_t>(m_srcpitchBytesX),
+                    static_cast<std::size_t>(m_srcWidth),
+                    static_cast<std::size_t>(m_srcPitchBytesY / m_srcpitchBytesX));
+
+                return cudaMemCpy3DPeerParms;
+            }
+#    endif
+#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
+            //-----------------------------------------------------------------------------
+            //! Writes the captured copy parameters as a single line to std::cout.
+            ALPAKA_FN_HOST auto printDebug() const -> void
+            {
+                std::cout << __func__ << " ew: " << m_extentWidth << " eh: " << m_extentHeight
+                          << " ed: " << m_extentDepth << " ewb: " << m_extentWidthBytes << " ddev: " << m_iDstDevice
+                          << " dw: " << m_dstWidth << " dh: " << m_dstHeight << " dd: " << m_dstDepth
+                          << " dptr: " << m_dstMemNative << " dpitchb: " << m_dstpitchBytesX
+                          << " sdev: " << m_iSrcDevice << " sw: " << m_srcWidth << " sh: " << m_srcHeight
+                          << " sd: " << m_srcDepth << " sptr: " << m_srcMemNative << " spitchb: " << m_srcpitchBytesX
+                          << std::endl;
+            }
+#    endif
+            // NOTE: Members are initialized in declaration order; the constructor relies on
+            // m_extentWidth preceding m_extentWidthBytes.
+            MemcpyKind m_uniformMemCpyKind;
+            int m_iDstDevice;
+            int m_iSrcDevice;
+
+            Idx m_extentWidth;
+            Idx m_extentWidthBytes;
+            Idx m_dstWidth;
+            Idx m_srcWidth;
+
+            Idx m_extentHeight;
+            Idx m_dstHeight;
+            Idx m_srcHeight;
+
+            Idx m_extentDepth;
+#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
+            Idx m_dstDepth;
+            Idx m_srcDepth;
+#    endif
+            Idx m_dstpitchBytesX;
+            Idx m_srcpitchBytesX;
+            Idx m_dstPitchBytesY;
+            Idx m_srcPitchBytesY;
+
+            void* m_dstMemNative;
+            void const* m_srcMemNative;
+        };
+    } // namespace detail
+
+    //-----------------------------------------------------------------------------
+    // Trait specializations for CreateTaskMemcpy.
+    namespace traits
+    {
+        //#############################################################################
+        //! Creates the copy task for a CUDA/HIP device source and a CPU destination.
+        template<typename TDim>
+        struct CreateTaskMemcpy<TDim, DevCpu, DevUniformCudaHipRt>
+        {
+            //-----------------------------------------------------------------------------
+            //! \param viewDst The destination view.
+            //! \param viewSrc The source view; its device supplies the device index of the task.
+            //! \param extent The extent to copy.
+            //! \return The copy task configured with the device-to-host copy kind.
+            template<typename TExtent, typename TViewSrc, typename TViewDst>
+            ALPAKA_FN_HOST static auto createTaskMemcpy(
+                TViewDst& viewDst,
+                TViewSrc const& viewSrc,
+                TExtent const& extent) -> alpaka::detail::TaskCopyUniformCudaHip<TDim, TViewDst, TViewSrc, TExtent>
+            {
+                ALPAKA_DEBUG_FULL_LOG_SCOPE;
+
+                using Task = alpaka::detail::TaskCopyUniformCudaHip<TDim, TViewDst, TViewSrc, TExtent>;
+
+                // The index of the source device is used for both device arguments of the task.
+                auto const srcDevIdx = getDev(viewSrc).m_iDevice;
+
+                return Task(viewDst, viewSrc, extent, ALPAKA_API_PREFIX(MemcpyDeviceToHost), srcDevIdx, srcDevIdx);
+            }
+        };
+        //#############################################################################
+        //! Creates the copy task for a CPU source and a CUDA/HIP device destination.
+        template<typename TDim>
+        struct CreateTaskMemcpy<TDim, DevUniformCudaHipRt, DevCpu>
+        {
+            //-----------------------------------------------------------------------------
+            //! \param viewDst The destination view; its device supplies the device index of the task.
+            //! \param viewSrc The source view.
+            //! \param extent The extent to copy.
+            //! \return The copy task configured with the host-to-device copy kind.
+            template<typename TExtent, typename TViewSrc, typename TViewDst>
+            ALPAKA_FN_HOST static auto createTaskMemcpy(
+                TViewDst& viewDst,
+                TViewSrc const& viewSrc,
+                TExtent const& extent) -> alpaka::detail::TaskCopyUniformCudaHip<TDim, TViewDst, TViewSrc, TExtent>
+            {
+                ALPAKA_DEBUG_FULL_LOG_SCOPE;
+
+                using Task = alpaka::detail::TaskCopyUniformCudaHip<TDim, TViewDst, TViewSrc, TExtent>;
+
+                // The index of the destination device is used for both device arguments of the task.
+                auto const dstDevIdx = getDev(viewDst).m_iDevice;
+
+                return Task(viewDst, viewSrc, extent, ALPAKA_API_PREFIX(MemcpyHostToDevice), dstDevIdx, dstDevIdx);
+            }
+        };
+        //#############################################################################
+        //! Creates the copy task for a CUDA/HIP device source and a CUDA/HIP device destination.
+        template<typename TDim>
+        struct CreateTaskMemcpy<TDim, DevUniformCudaHipRt, DevUniformCudaHipRt>
+        {
+            //-----------------------------------------------------------------------------
+            //! \param viewDst The destination view; its device supplies the destination device index.
+            //! \param viewSrc The source view; its device supplies the source device index.
+            //! \param extent The extent to copy.
+            //! \return The copy task configured with the device-to-device copy kind.
+            template<typename TExtent, typename TViewSrc, typename TViewDst>
+            ALPAKA_FN_HOST static auto createTaskMemcpy(
+                TViewDst& viewDst,
+                TViewSrc const& viewSrc,
+                TExtent const& extent) -> alpaka::detail::TaskCopyUniformCudaHip<TDim, TViewDst, TViewSrc, TExtent>
+            {
+                ALPAKA_DEBUG_FULL_LOG_SCOPE;
+
+                using Task = alpaka::detail::TaskCopyUniformCudaHip<TDim, TViewDst, TViewSrc, TExtent>;
+
+                // The two devices may differ, so each view contributes its own device index.
+                auto const dstDevIdx = getDev(viewDst).m_iDevice;
+                auto const srcDevIdx = getDev(viewSrc).m_iDevice;
+
+                return Task(viewDst, viewSrc, extent, ALPAKA_API_PREFIX(MemcpyDeviceToDevice), dstDevIdx, srcDevIdx);
+            }
+        };
+
+        //#############################################################################
+        //! The CUDA/HIP non-blocking device queue 1D copy enqueue trait specialization.
+        //!
+        //! Matches a QueueUniformCudaHipRtNonBlocking together with a 1D TaskCopyUniformCudaHip.
+        template<typename TExtent, typename TViewSrc, typename TViewDst>
+        struct Enqueue<
+            QueueUniformCudaHipRtNonBlocking,
+            alpaka::detail::TaskCopyUniformCudaHip<DimInt<1u>, TViewDst, TViewSrc, TExtent>>
+        {
+            //-----------------------------------------------------------------------------
+            //! Forwards to task.enqueue(queue); no synchronization is performed here.
+            //!
+            //! \param queue The queue the copy is enqueued into.
+            //! \param task The 1D copy task to enqueue.
+            ALPAKA_FN_HOST static auto enqueue(
+                QueueUniformCudaHipRtNonBlocking& queue,
+                alpaka::detail::TaskCopyUniformCudaHip<DimInt<1u>, TViewDst, TViewSrc, TExtent> const& task) -> void
+            {
+                ALPAKA_DEBUG_FULL_LOG_SCOPE;
+
+                // The task issues the copy itself, using the queue it is given.
+                task.enqueue(queue);
+            }
+        };
+        //#############################################################################
+        //! The CUDA/HIP blocking device queue 1D copy enqueue trait specialization.
+        //!
+        //! Matches a QueueUniformCudaHipRtBlocking together with a 1D TaskCopyUniformCudaHip.
+        template<typename TExtent, typename TViewSrc, typename TViewDst>
+        struct Enqueue<
+            QueueUniformCudaHipRtBlocking,
+            alpaka::detail::TaskCopyUniformCudaHip<DimInt<1u>, TViewDst, TViewSrc, TExtent>>
+        {
+            //-----------------------------------------------------------------------------
+            //! Forwards to task.enqueue(queue) and then synchronizes the native stream of the queue.
+            //!
+            //! \param queue The queue the copy is enqueued into.
+            //! \param task The 1D copy task to enqueue.
+            ALPAKA_FN_HOST static auto enqueue(
+                QueueUniformCudaHipRtBlocking& queue,
+                alpaka::detail::TaskCopyUniformCudaHip<DimInt<1u>, TViewDst, TViewSrc, TExtent> const& task) -> void
+            {
+                ALPAKA_DEBUG_FULL_LOG_SCOPE;
+
+                // The task issues the copy itself, using the queue it is given.
+                task.enqueue(queue);
+
+                // Synchronize the stream before returning; this is what makes the enqueue blocking.
+                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
+                    ALPAKA_API_PREFIX(StreamSynchronize)(queue.m_spQueueImpl->m_UniformCudaHipQueue));
+            }
+        };
+        //#############################################################################
+        //! The CUDA/HIP non-blocking device queue 2D copy enqueue trait specialization.
+        //!
+        //! Matches a QueueUniformCudaHipRtNonBlocking together with a 2D TaskCopyUniformCudaHip.
+        template<typename TExtent, typename TViewSrc, typename TViewDst>
+        struct Enqueue<
+            QueueUniformCudaHipRtNonBlocking,
+            alpaka::detail::TaskCopyUniformCudaHip<DimInt<2u>, TViewDst, TViewSrc, TExtent>>
+        {
+            //-----------------------------------------------------------------------------
+            //! Forwards to task.enqueue(queue); no synchronization is performed here.
+            //!
+            //! \param queue The queue the copy is enqueued into.
+            //! \param task The 2D copy task to enqueue.
+            ALPAKA_FN_HOST static auto enqueue(
+                QueueUniformCudaHipRtNonBlocking& queue,
+                alpaka::detail::TaskCopyUniformCudaHip<DimInt<2u>, TViewDst, TViewSrc, TExtent> const& task) -> void
+            {
+                ALPAKA_DEBUG_FULL_LOG_SCOPE;
+
+                // The task issues the copy itself, using the queue it is given.
+                task.enqueue(queue);
+            }
+        };
+        //#############################################################################
+        //! The CUDA/HIP blocking device queue 2D copy enqueue trait specialization.
+        //!
+        //! Matches a QueueUniformCudaHipRtBlocking together with a 2D TaskCopyUniformCudaHip.
+        template<typename TExtent, typename TViewSrc, typename TViewDst>
+        struct Enqueue<
+            QueueUniformCudaHipRtBlocking,
+            alpaka::detail::TaskCopyUniformCudaHip<DimInt<2u>, TViewDst, TViewSrc, TExtent>>
+        {
+            //-----------------------------------------------------------------------------
+            //! Forwards to task.enqueue(queue) and then synchronizes the native stream of the queue.
+            //!
+            //! \param queue The queue the copy is enqueued into.
+            //! \param task The 2D copy task to enqueue.
+            ALPAKA_FN_HOST static auto enqueue(
+                QueueUniformCudaHipRtBlocking& queue,
+                alpaka::detail::TaskCopyUniformCudaHip<DimInt<2u>, TViewDst, TViewSrc, TExtent> const& task) -> void
+            {
+                ALPAKA_DEBUG_FULL_LOG_SCOPE;
+
+                // The task issues the copy itself, using the queue it is given.
+                task.enqueue(queue);
+
+                // Synchronize the stream before returning; this is what makes the enqueue blocking.
+                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
+                    ALPAKA_API_PREFIX(StreamSynchronize)(queue.m_spQueueImpl->m_UniformCudaHipQueue));
+            }
+        };
+        //#############################################################################
+        //! The CUDA/HIP non-blocking device queue 3D copy enqueue trait specialization.
+        //!
+        //! Matches a QueueUniformCudaHipRtNonBlocking together with a 3D TaskCopyUniformCudaHip.
+        template<typename TExtent, typename TViewSrc, typename TViewDst>
+        struct Enqueue<
+            QueueUniformCudaHipRtNonBlocking,
+            alpaka::detail::TaskCopyUniformCudaHip<DimInt<3u>, TViewDst, TViewSrc, TExtent>>
+        {
+            //-----------------------------------------------------------------------------
+            //! Forwards to task.enqueue(queue); no synchronization is performed here.
+            //!
+            //! \param queue The queue the copy is enqueued into.
+            //! \param task The 3D copy task to enqueue.
+            ALPAKA_FN_HOST static auto enqueue(
+                QueueUniformCudaHipRtNonBlocking& queue,
+                alpaka::detail::TaskCopyUniformCudaHip<DimInt<3u>, TViewDst, TViewSrc, TExtent> const& task) -> void
+            {
+                ALPAKA_DEBUG_FULL_LOG_SCOPE;
+
+                // The task issues the copy itself, using the queue it is given.
+                task.enqueue(queue);
+            }
+        };
+        //#############################################################################
+        //! The CUDA/HIP blocking device queue 3D copy enqueue trait specialization.
+        //!
+        //! Matches a QueueUniformCudaHipRtBlocking together with a 3D TaskCopyUniformCudaHip.
+        template<typename TExtent, typename TViewSrc, typename TViewDst>
+        struct Enqueue<
+            QueueUniformCudaHipRtBlocking,
+            alpaka::detail::TaskCopyUniformCudaHip<DimInt<3u>, TViewDst, TViewSrc, TExtent>>
+        {
+            //-----------------------------------------------------------------------------
+            //! Forwards to task.enqueue(queue) and then synchronizes the native stream of the queue.
+            //!
+            //! \param queue The queue the copy is enqueued into.
+            //! \param task The 3D copy task to enqueue.
+            ALPAKA_FN_HOST static auto enqueue(
+                QueueUniformCudaHipRtBlocking& queue,
+                alpaka::detail::TaskCopyUniformCudaHip<DimInt<3u>, TViewDst, TViewSrc, TExtent> const& task) -> void
+            {
+                ALPAKA_DEBUG_FULL_LOG_SCOPE;
+
+                // The task issues the copy itself, using the queue it is given.
+                task.enqueue(queue);
+
+                // Synchronize the stream before returning; this is what makes the enqueue blocking.
+                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
+                    ALPAKA_API_PREFIX(StreamSynchronize)(queue.m_spQueueImpl->m_UniformCudaHipQueue));
+            }
+        };
+    } // namespace traits
+} // namespace alpaka
+
+#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/mem/buf/uniformCudaHip/Set.hpp b/thirdParty/cupla/alpaka/include/alpaka/mem/buf/uniformCudaHip/Set.hpp
new file mode 100644
index 0000000000..0c8681aef4
--- /dev/null
+++ b/thirdParty/cupla/alpaka/include/alpaka/mem/buf/uniformCudaHip/Set.hpp
@@ -0,0 +1,390 @@
+/* Copyright 2019 Benjamin Worpitz, Erik Zenker, Matthias Werner, René Widera
+ *
+ * This file is part of alpaka.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#pragma once
+
+#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
+
+#    include <alpaka/core/BoostPredef.hpp>
+
+#    if !BOOST_LANG_CUDA && !BOOST_LANG_HIP
+#        error Compiler has to support CUDA/HIP!
+#    endif
+
+#    include <alpaka/core/Assert.hpp>
+#    include <alpaka/dev/Traits.hpp>
+#    include <alpaka/dim/DimIntegralConst.hpp>
+#    include <alpaka/extent/Traits.hpp>
+#    include <alpaka/mem/view/Traits.hpp>
+#    include <alpaka/queue/QueueUniformCudaHipRtBlocking.hpp>
+#    include <alpaka/queue/QueueUniformCudaHipRtNonBlocking.hpp>
+#    include <alpaka/queue/Traits.hpp>
+#    include <alpaka/wait/Traits.hpp>
+
+// Backend specific includes.
+#    if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
+#        include <alpaka/core/Cuda.hpp>
+#    else
+#        include <alpaka/core/Hip.hpp>
+#    endif
+
+
+namespace alpaka
+{
+    class DevUniformCudaHipRt;
+
+    namespace detail
+    {
+        //#############################################################################
+        //! The CUDA/HIP memory set task base holding the state shared by the 1D, 2D and 3D set tasks.
+        template<typename TDim, typename TView, typename TExtent>
+        struct TaskSetUniformCudaHipBase
+        {
+            //-----------------------------------------------------------------------------
+            TaskSetUniformCudaHipBase(TView& view, std::uint8_t const& byte, TExtent const& extent)
+                : m_view(view)
+                , m_byte(byte)
+                , m_extent(extent)
+                , m_iDevice(getDev(view).m_iDevice)
+            {
+                static_assert(!std::is_const<TView>::value, "The destination view can not be const!");
+
+                static_assert(
+                    Dim<TView>::value == Dim<TExtent>::value,
+                    "The destination view and the extent are required to have the same dimensionality!");
+            }
+
+        protected:
+            TView& m_view; //!< The destination view; held by reference, so it has to outlive the task.
+            std::uint8_t const m_byte; //!< The byte value the destination is filled with.
+            TExtent const m_extent; //!< Copy of the extent to set.
+            std::int32_t const m_iDevice; //!< The device index read from getDev(view) at construction.
+        };
+
+        //#############################################################################
+        //! The CUDA/HIP memory set task. Specialized below for 1, 2 and 3 dimensions.
+        template<typename TDim, typename TView, typename TExtent>
+        struct TaskSetUniformCudaHip;
+
+        //#############################################################################
+        //! The 1D CUDA/HIP memory set task; enqueue() issues a MemsetAsync for the extent on the queue's stream.
+        template<typename TView, typename TExtent>
+        struct TaskSetUniformCudaHip<DimInt<1>, TView, TExtent>
+            : public TaskSetUniformCudaHipBase<DimInt<1>, TView, TExtent>
+        {
+            //-----------------------------------------------------------------------------
+            TaskSetUniformCudaHip(TView& view, std::uint8_t const& byte, TExtent const& extent)
+                : TaskSetUniformCudaHipBase<DimInt<1>, TView, TExtent>(view, byte, extent)
+            {
+            }
+
+            //-----------------------------------------------------------------------------
+            template<typename TQueue>
+            auto enqueue(TQueue& queue) const -> void
+            {
+                static_assert(
+                    Dim<TView>::value == 1u,
+                    "The destination buffer is required to be 1-dimensional for this specialization!");
+                static_assert(
+                    Dim<TView>::value == Dim<TExtent>::value,
+                    "The destination buffer and the extent are required to have the same dimensionality!");
+
+                using Idx = Idx<TExtent>;
+
+                auto& view(this->m_view);
+                auto const& extent(this->m_extent);
+
+                auto const extentWidth(extent::getWidth(extent));
+                // An empty extent means there is nothing to set.
+                if(extentWidth == 0)
+                {
+                    return;
+                }
+                // The extent is counted in elements, the runtime call takes a size in bytes.
+                auto const extentWidthBytes(extentWidth * static_cast<Idx>(sizeof(Elem<TView>)));
+#    if !defined(NDEBUG) // dstWidth is only read by the assertion below
+                auto const dstWidth(extent::getWidth(view));
+#    endif
+                auto const dstNativePtr(reinterpret_cast<void*>(getPtrNative(view)));
+                ALPAKA_ASSERT(extentWidth <= dstWidth);
+
+                // Set the current device to the one the destination view resides on.
+                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(ALPAKA_API_PREFIX(SetDevice)(this->m_iDevice));
+                // Initiate the memory set on the queue's stream.
+                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(ALPAKA_API_PREFIX(MemsetAsync)(
+                    dstNativePtr,
+                    static_cast<int>(this->m_byte),
+                    static_cast<size_t>(extentWidthBytes),
+                    queue.m_spQueueImpl->m_UniformCudaHipQueue));
+            }
+        };
+        //#############################################################################
+        //! The 2D CUDA/HIP memory set task; enqueue() issues a Memset2DAsync for the extent on the queue's stream.
+        template<typename TView, typename TExtent>
+        struct TaskSetUniformCudaHip<DimInt<2>, TView, TExtent>
+            : public TaskSetUniformCudaHipBase<DimInt<2>, TView, TExtent>
+        {
+            //-----------------------------------------------------------------------------
+            TaskSetUniformCudaHip(TView& view, std::uint8_t const& byte, TExtent const& extent)
+                : TaskSetUniformCudaHipBase<DimInt<2>, TView, TExtent>(view, byte, extent)
+            {
+            }
+
+            //-----------------------------------------------------------------------------
+            template<typename TQueue>
+            auto enqueue(TQueue& queue) const -> void
+            {
+                static_assert(
+                    Dim<TView>::value == 2u,
+                    "The destination buffer is required to be 2-dimensional for this specialization!");
+                static_assert(
+                    Dim<TView>::value == Dim<TExtent>::value,
+                    "The destination buffer and the extent are required to have the same dimensionality!");
+
+                using Idx = Idx<TExtent>;
+
+                auto& view(this->m_view);
+                auto const& extent(this->m_extent);
+
+                auto const extentWidth(extent::getWidth(extent));
+                auto const extentHeight(extent::getHeight(extent));
+                // An empty extent means there is nothing to set.
+                if(extentWidth == 0 || extentHeight == 0)
+                {
+                    return;
+                }
+                // The width is passed to the runtime in bytes, the height as a number of rows.
+                auto const extentWidthBytes(extentWidth * static_cast<Idx>(sizeof(Elem<TView>)));
+
+#    if !defined(NDEBUG) // dstWidth and dstHeight are only read by the assertions below
+                auto const dstWidth(extent::getWidth(view));
+                auto const dstHeight(extent::getHeight(view));
+#    endif
+                auto const dstPitchBytesX(getPitchBytes<Dim<TView>::value - 1u>(view));
+                auto const dstNativePtr(reinterpret_cast<void*>(getPtrNative(view)));
+                ALPAKA_ASSERT(extentWidth <= dstWidth);
+                ALPAKA_ASSERT(extentHeight <= dstHeight);
+
+                // Set the current device to the one the destination view resides on.
+                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(ALPAKA_API_PREFIX(SetDevice)(this->m_iDevice));
+                // Initiate the memory set; rows are dstPitchBytesX bytes apart, of which extentWidthBytes are set.
+                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(ALPAKA_API_PREFIX(Memset2DAsync)(
+                    dstNativePtr,
+                    static_cast<size_t>(dstPitchBytesX),
+                    static_cast<int>(this->m_byte),
+                    static_cast<size_t>(extentWidthBytes),
+                    static_cast<size_t>(extentHeight),
+                    queue.m_spQueueImpl->m_UniformCudaHipQueue));
+            }
+        };
+        //#############################################################################
+        //! The 3D CUDA/HIP memory set task; enqueue() issues a Memset3DAsync for the extent on the queue's stream.
+        template<typename TView, typename TExtent>
+        struct TaskSetUniformCudaHip<DimInt<3>, TView, TExtent>
+            : public TaskSetUniformCudaHipBase<DimInt<3>, TView, TExtent>
+        {
+            //-----------------------------------------------------------------------------
+            TaskSetUniformCudaHip(TView& view, std::uint8_t const& byte, TExtent const& extent)
+                : TaskSetUniformCudaHipBase<DimInt<3>, TView, TExtent>(view, byte, extent)
+            {
+            }
+
+            //-----------------------------------------------------------------------------
+            template<typename TQueue>
+            auto enqueue(TQueue& queue) const -> void
+            {
+                static_assert(
+                    Dim<TView>::value == 3u,
+                    "The destination buffer is required to be 3-dimensional for this specialization!");
+                static_assert(
+                    Dim<TView>::value == Dim<TExtent>::value,
+                    "The destination buffer and the extent are required to have the same dimensionality!");
+
+                using Elem = alpaka::Elem<TView>;
+                using Idx = Idx<TExtent>;
+
+                auto& view(this->m_view);
+                auto const& extent(this->m_extent);
+
+                auto const extentWidth(extent::getWidth(extent));
+                auto const extentHeight(extent::getHeight(extent));
+                auto const extentDepth(extent::getDepth(extent));
+                // Nothing to set for an empty extent; this is not only an optimization but also meant to prevent a
+                // division by zero. NOTE(review): the division below uses the view's row pitch, not the extent.
+                if(extentWidth == 0 || extentHeight == 0 || extentDepth == 0)
+                {
+                    return;
+                }
+
+                auto const dstWidth(extent::getWidth(view));
+#    if !defined(NDEBUG) // dstHeight and dstDepth are only read by the assertions below
+                auto const dstHeight(extent::getHeight(view));
+                auto const dstDepth(extent::getDepth(view));
+#    endif
+                auto const dstPitchBytesX(getPitchBytes<Dim<TView>::value - 1u>(view));
+                auto const dstPitchBytesY(getPitchBytes<Dim<TView>::value - (2u % Dim<TView>::value)>(view));
+                auto const dstNativePtr(reinterpret_cast<void*>(getPtrNative(view)));
+                ALPAKA_ASSERT(extentWidth <= dstWidth);
+                ALPAKA_ASSERT(extentHeight <= dstHeight);
+                ALPAKA_ASSERT(extentDepth <= dstDepth);
+
+                // Fill the CUDA/HIP parameter structures. The pitched pointer describes the destination view.
+                ALPAKA_API_PREFIX(PitchedPtr)
+                const pitchedPtrVal(ALPAKA_PP_CONCAT(make_, ALPAKA_API_PREFIX(PitchedPtr))(
+                    dstNativePtr,
+                    static_cast<size_t>(dstPitchBytesX),
+                    static_cast<size_t>(dstWidth * static_cast<Idx>(sizeof(Elem))),
+                    static_cast<size_t>(dstPitchBytesY / dstPitchBytesX)));
+                // The extent describes the region to set: width in bytes, height and depth in elements.
+                ALPAKA_API_PREFIX(Extent)
+                const extentVal(ALPAKA_PP_CONCAT(make_, ALPAKA_API_PREFIX(Extent))(
+                    static_cast<size_t>(extentWidth * static_cast<Idx>(sizeof(Elem))),
+                    static_cast<size_t>(extentHeight),
+                    static_cast<size_t>(extentDepth)));
+
+                // Set the current device to the one the destination view resides on.
+                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(ALPAKA_API_PREFIX(SetDevice)(this->m_iDevice));
+                // Initiate the memory set on the queue's stream.
+                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(ALPAKA_API_PREFIX(Memset3DAsync)(
+                    pitchedPtrVal,
+                    static_cast<int>(this->m_byte),
+                    extentVal,
+                    queue.m_spQueueImpl->m_UniformCudaHipQueue));
+            }
+        };
+    } // namespace detail
+
+    namespace traits
+    {
+        //#############################################################################
+        //! The CUDA/HIP device memory set task creation trait specialization.
+        template<typename TDim>
+        struct CreateTaskMemset<TDim, DevUniformCudaHipRt>
+        {
+            //-----------------------------------------------------------------------------
+            template<typename TExtent, typename TView>
+            ALPAKA_FN_HOST static auto createTaskMemset(TView& view, std::uint8_t const& byte, TExtent const& extent)
+                -> alpaka::detail::TaskSetUniformCudaHip<TDim, TView, TExtent>
+            {
+                return alpaka::detail::TaskSetUniformCudaHip<TDim, TView, TExtent>(view, byte, extent);
+            }
+        };
+
+        //#############################################################################
+        //! The CUDA/HIP non-blocking device queue 1D set enqueue trait specialization.
+        template<typename TView, typename TExtent>
+        struct Enqueue<
+            QueueUniformCudaHipRtNonBlocking,
+            alpaka::detail::TaskSetUniformCudaHip<DimInt<1u>, TView, TExtent>>
+        {
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto enqueue(
+                QueueUniformCudaHipRtNonBlocking& queue,
+                alpaka::detail::TaskSetUniformCudaHip<DimInt<1u>, TView, TExtent> const& task) -> void
+            {
+                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+                // Non-blocking queue: return right after enqueuing, without waiting.
+                task.enqueue(queue);
+            }
+        };
+        //#############################################################################
+        //! The CUDA/HIP blocking device queue 1D set enqueue trait specialization.
+        template<typename TView, typename TExtent>
+        struct Enqueue<
+            QueueUniformCudaHipRtBlocking,
+            alpaka::detail::TaskSetUniformCudaHip<DimInt<1u>, TView, TExtent>>
+        {
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto enqueue(
+                QueueUniformCudaHipRtBlocking& queue,
+                alpaka::detail::TaskSetUniformCudaHip<DimInt<1u>, TView, TExtent> const& task) -> void
+            {
+                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+                task.enqueue(queue);
+                // Blocking queue: wait on the queue before returning to the caller.
+                wait(queue);
+            }
+        };
+        //#############################################################################
+        //! The CUDA/HIP non-blocking device queue 2D set enqueue trait specialization.
+        template<typename TView, typename TExtent>
+        struct Enqueue<
+            QueueUniformCudaHipRtNonBlocking,
+            alpaka::detail::TaskSetUniformCudaHip<DimInt<2u>, TView, TExtent>>
+        {
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto enqueue(
+                QueueUniformCudaHipRtNonBlocking& queue,
+                alpaka::detail::TaskSetUniformCudaHip<DimInt<2u>, TView, TExtent> const& task) -> void
+            {
+                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+                // Non-blocking queue: return right after enqueuing, without waiting.
+                task.enqueue(queue);
+            }
+        };
+        //#############################################################################
+        //! The CUDA/HIP blocking device queue 2D set enqueue trait specialization.
+        template<typename TView, typename TExtent>
+        struct Enqueue<
+            QueueUniformCudaHipRtBlocking,
+            alpaka::detail::TaskSetUniformCudaHip<DimInt<2u>, TView, TExtent>>
+        {
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto enqueue(
+                QueueUniformCudaHipRtBlocking& queue,
+                alpaka::detail::TaskSetUniformCudaHip<DimInt<2u>, TView, TExtent> const& task) -> void
+            {
+                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+                task.enqueue(queue);
+                // Blocking queue: wait on the queue before returning to the caller.
+                wait(queue);
+            }
+        };
+        //#############################################################################
+        //! The CUDA/HIP non-blocking device queue 3D set enqueue trait specialization.
+        template<typename TView, typename TExtent>
+        struct Enqueue<
+            QueueUniformCudaHipRtNonBlocking,
+            alpaka::detail::TaskSetUniformCudaHip<DimInt<3u>, TView, TExtent>>
+        {
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto enqueue(
+                QueueUniformCudaHipRtNonBlocking& queue,
+                alpaka::detail::TaskSetUniformCudaHip<DimInt<3u>, TView, TExtent> const& task) -> void
+            {
+                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+                // Non-blocking queue: return right after enqueuing, without waiting.
+                task.enqueue(queue);
+            }
+        };
+        //#############################################################################
+        //! The CUDA/HIP blocking device queue 3D set enqueue trait specialization.
+        template<typename TView, typename TExtent>
+        struct Enqueue<
+            QueueUniformCudaHipRtBlocking,
+            alpaka::detail::TaskSetUniformCudaHip<DimInt<3u>, TView, TExtent>>
+        {
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto enqueue(
+                QueueUniformCudaHipRtBlocking& queue,
+                alpaka::detail::TaskSetUniformCudaHip<DimInt<3u>, TView, TExtent> const& task) -> void
+            {
+                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+                task.enqueue(queue);
+                // Blocking queue: wait on the queue before returning to the caller.
+                wait(queue);
+            }
+        };
+    } // namespace traits
+} // namespace alpaka
+
+#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/mem/view/Traits.hpp b/thirdParty/cupla/alpaka/include/alpaka/mem/view/Traits.hpp
index d8a3753d00..cb5d1fbbe8 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/mem/view/Traits.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/mem/view/Traits.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Axel Huebl, Benjamin Worpitz, Matthias Werner
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -9,601 +9,378 @@
 
 #pragma once
 
+#include <alpaka/core/Common.hpp>
+#include <alpaka/core/Unused.hpp>
 #include <alpaka/dev/Traits.hpp>
 #include <alpaka/dim/Traits.hpp>
 #include <alpaka/elem/Traits.hpp>
 #include <alpaka/extent/Traits.hpp>
+#include <alpaka/meta/Fold.hpp>
 #include <alpaka/offset/Traits.hpp>
 #include <alpaka/queue/Traits.hpp>
-
-#include <alpaka/core/Common.hpp>
-#include <alpaka/core/Unused.hpp>
-#include <alpaka/meta/Fold.hpp>
 #include <alpaka/vec/Vec.hpp>
 
-#include <boost/config.hpp>
-
 #include <iosfwd>
+#include <type_traits>
 
 namespace alpaka
 {
-    namespace mem
+    //-----------------------------------------------------------------------------
+    //! The view traits.
+    namespace traits
     {
-        //-----------------------------------------------------------------------------
-        //! The view specifics.
-        namespace view
+        //#############################################################################
+        //! The native pointer get trait.
+        template<typename TView, typename TSfinae = void>
+        struct GetPtrNative;
+
+        //#############################################################################
+        //! The pointer on device get trait.
+        template<typename TView, typename TDev, typename TSfinae = void>
+        struct GetPtrDev;
+
+        namespace detail
+        {
+            //#############################################################################
+            template<typename TIdx, typename TView, typename TSfinae = void>
+            struct GetPitchBytesDefault;
+        } // namespace detail
+
+        //#############################################################################
+        //! The pitch in bytes.
+        //! This is the distance in bytes in the linear memory between two consecutive elements in the next higher
+        //! dimension (TIdx-1).
+        //!
+        //! The default implementation uses the extent to calculate the pitch.
+        template<typename TIdx, typename TView, typename TSfinae = void>
+        struct GetPitchBytes
         {
             //-----------------------------------------------------------------------------
-            //! The view traits.
-            namespace traits
+            ALPAKA_FN_HOST static auto getPitchBytes(TView const& view) -> Idx<TView>
             {
-                //#############################################################################
-                //! The native pointer get trait.
-                template<
-                    typename TView,
-                    typename TSfinae = void>
-                struct GetPtrNative;
-
-                //#############################################################################
-                //! The pointer on device get trait.
-                template<
-                    typename TView,
-                    typename TDev,
-                    typename TSfinae = void>
-                struct GetPtrDev;
+                return detail::GetPitchBytesDefault<TIdx, TView>::getPitchBytesDefault(view);
+            }
+        };
 
-                namespace detail
+        namespace detail
+        {
+            //#############################################################################
+            template<typename TIdx, typename TView>
+                struct GetPitchBytesDefault < TIdx,
+                TView, std::enable_if_t<TIdx::value<(Dim<TView>::value - 1)>>
+            {
+                //-----------------------------------------------------------------------------
+                ALPAKA_FN_HOST static auto getPitchBytesDefault(TView const& view) -> Idx<TView>
                 {
-                    //#############################################################################
-                    template<
-                        typename TIdx,
-                        typename TView,
-                        typename TSfinae = void>
-                    struct GetPitchBytesDefault;
+                    return extent::getExtent<TIdx::value>(view)
+                        * GetPitchBytes<DimInt<TIdx::value + 1>, TView>::getPitchBytes(view);
                 }
-
-                //#############################################################################
-                //! The pitch in bytes.
-                //! This is the distance in bytes in the linear memory between two consecutive elements in the next higher dimension (TIdx-1).
-                //!
-                //! The default implementation uses the extent to calculate the pitch.
-                template<
-                    typename TIdx,
-                    typename TView,
-                    typename TSfinae = void>
-                struct GetPitchBytes
-                {
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST static auto getPitchBytes(
-                        TView const & view)
-                    -> idx::Idx<TView>
-                    {
-                        return detail::GetPitchBytesDefault<TIdx, TView>::getPitchBytesDefault(view);
-                    }
-                };
-
-                namespace detail
+            };
+            //#############################################################################
+            template<typename TView>
+            struct GetPitchBytesDefault<DimInt<Dim<TView>::value - 1u>, TView>
+            {
+                //-----------------------------------------------------------------------------
+                ALPAKA_FN_HOST static auto getPitchBytesDefault(TView const& view) -> Idx<TView>
                 {
-                    //#############################################################################
-                    template<
-                        typename TIdx,
-                        typename TView>
-                    struct GetPitchBytesDefault<
-                        TIdx,
-                        TView,
-                        typename std::enable_if<TIdx::value < (dim::Dim<TView>::value - 1)>::type>
-                    {
-                        //-----------------------------------------------------------------------------
-                        ALPAKA_FN_HOST static auto getPitchBytesDefault(
-                            TView const & view)
-                        -> idx::Idx<TView>
-                        {
-                            return
-                                extent::getExtent<TIdx::value>(view)
-                                * GetPitchBytes<dim::DimInt<TIdx::value+1>, TView>::getPitchBytes(view);
-                        }
-                    };
-                    //#############################################################################
-                    template<
-                        typename TView>
-                    struct GetPitchBytesDefault<
-                        dim::DimInt<dim::Dim<TView>::value - 1u>,
-                        TView>
-                    {
-                        //-----------------------------------------------------------------------------
-                        ALPAKA_FN_HOST static auto getPitchBytesDefault(
-                            TView const & view)
-                        -> idx::Idx<TView>
-                        {
-                            return
-                                extent::getExtent<dim::Dim<TView>::value - 1u>(view)
-                                * sizeof(elem::Elem<TView>);
-                        }
-                    };
-                    //#############################################################################
-                    template<
-                        typename TView>
-                    struct GetPitchBytesDefault<
-                        dim::DimInt<dim::Dim<TView>::value>,
-                        TView>
-                    {
-                        //-----------------------------------------------------------------------------
-                        ALPAKA_FN_HOST static auto getPitchBytesDefault(
-                            TView const &)
-                        -> idx::Idx<TView>
-                        {
-                            return
-                                sizeof(elem::Elem<TView>);
-                        }
-                    };
+                    return extent::getExtent<Dim<TView>::value - 1u>(view) * sizeof(Elem<TView>);
                 }
-
-                //#############################################################################
-                //! The memory set task trait.
-                //!
-                //! Fills the view with data.
-                template<
-                    typename TDim,
-                    typename TDev,
-                    typename TSfinae = void>
-                struct CreateTaskSet;
-
-                //#############################################################################
-                //! The memory copy task trait.
-                //!
-                //! Copies memory from one view into another view possibly on a different device.
-                template<
-                    typename TDim,
-                    typename TDevDst,
-                    typename TDevSrc,
-                    typename TSfinae = void>
-                struct CreateTaskCopy;
-
-                //#############################################################################
-                //! The static device memory view creation trait.
-                template<
-                    typename TDev,
-                    typename TSfinae = void>
-                struct CreateStaticDevMemView;
-            }
-
-            //-----------------------------------------------------------------------------
-            //! Gets the native pointer of the memory view.
-            //!
-            //! \param view The memory view.
-            //! \return The native pointer.
-            template<
-                typename TView>
-            ALPAKA_FN_HOST auto getPtrNative(
-                TView const & view)
-            -> elem::Elem<TView> const *
-            {
-                return
-                    traits::GetPtrNative<
-                        TView>
-                    ::getPtrNative(
-                        view);
-            }
-            //-----------------------------------------------------------------------------
-            //! Gets the native pointer of the memory view.
-            //!
-            //! \param view The memory view.
-            //! \return The native pointer.
-            template<
-                typename TView>
-            ALPAKA_FN_HOST auto getPtrNative(
-                TView & view)
-            -> elem::Elem<TView> *
+            };
+            //#############################################################################
+            template<typename TView>
+            struct GetPitchBytesDefault<DimInt<Dim<TView>::value>, TView>
             {
-                return
-                    traits::GetPtrNative<
-                        TView>
-                    ::getPtrNative(
-                        view);
-            }
+                //-----------------------------------------------------------------------------
+                ALPAKA_FN_HOST static auto getPitchBytesDefault(TView const&) -> Idx<TView>
+                {
+                    return sizeof(Elem<TView>);
+                }
+            };
+        } // namespace detail
+
+        //#############################################################################
+        //! The memory set task trait.
+        //!
+        //! Fills the view with data.
+        template<typename TDim, typename TDev, typename TSfinae = void>
+        struct CreateTaskMemset;
+
+        //#############################################################################
+        //! The memory copy task trait.
+        //!
+        //! Copies memory from one view into another view possibly on a different device.
+        template<typename TDim, typename TDevDst, typename TDevSrc, typename TSfinae = void>
+        struct CreateTaskMemcpy;
+
+        //#############################################################################
+        //! The static device memory view creation trait.
+        template<typename TDev, typename TSfinae = void>
+        struct CreateStaticDevMemView;
+    } // namespace traits
+
+    //-----------------------------------------------------------------------------
+    //! Gets the native pointer of the memory view.
+    //!
+    //! \param view The memory view.
+    //! \return The native pointer.
+    template<typename TView>
+    ALPAKA_FN_HOST auto getPtrNative(TView const& view) -> Elem<TView> const*
+    {
+        return traits::GetPtrNative<TView>::getPtrNative(view);
+    }
+    //-----------------------------------------------------------------------------
+    //! Gets the native pointer of the memory view.
+    //!
+    //! \param view The memory view.
+    //! \return The native pointer.
+    template<typename TView>
+    ALPAKA_FN_HOST auto getPtrNative(TView& view) -> Elem<TView>*
+    {
+        return traits::GetPtrNative<TView>::getPtrNative(view);
+    }
 
-            //-----------------------------------------------------------------------------
-            //! Gets the pointer to the view on the given device.
-            //!
-            //! \param view The memory view.
-            //! \param dev The device.
-            //! \return The pointer on the device.
-            template<
-                typename TView,
-                typename TDev>
-            ALPAKA_FN_HOST auto getPtrDev(
-                TView const & view,
-                TDev const & dev)
-            -> elem::Elem<TView> const *
-            {
-                return
-                    traits::GetPtrDev<
-                        TView,
-                        TDev>
-                    ::getPtrDev(
-                        view,
-                        dev);
-            }
-            //-----------------------------------------------------------------------------
-            //! Gets the pointer to the view on the given device.
-            //!
-            //! \param view The memory view.
-            //! \param dev The device.
-            //! \return The pointer on the device.
-            template<
-                typename TView,
-                typename TDev>
-            ALPAKA_FN_HOST auto getPtrDev(
-                TView & view,
-                TDev const & dev)
-            -> elem::Elem<TView> *
-            {
-                return
-                    traits::GetPtrDev<
-                        TView,
-                        TDev>
-                    ::getPtrDev(
-                        view,
-                        dev);
-            }
+    //-----------------------------------------------------------------------------
+    //! Gets the pointer to the view on the given device (overload for constant views).
+    //!
+    //! \param view The memory view.
+    //! \param dev The device.
+    //! \return The pointer on the device to constant elements, as returned by traits::GetPtrDev<TView, TDev>.
+    template<typename TView, typename TDev>
+    ALPAKA_FN_HOST auto getPtrDev(TView const& view, TDev const& dev) -> Elem<TView> const*
+    {
+        return traits::GetPtrDev<TView, TDev>::getPtrDev(view, dev);
+    }
+    //-----------------------------------------------------------------------------
+    //! Gets the pointer to the view on the given device (overload for non-constant views).
+    //!
+    //! \param view The memory view.
+    //! \param dev The device.
+    //! \return The pointer on the device to mutable elements, as returned by traits::GetPtrDev<TView, TDev>.
+    template<typename TView, typename TDev>
+    ALPAKA_FN_HOST auto getPtrDev(TView& view, TDev const& dev) -> Elem<TView>*
+    {
+        return traits::GetPtrDev<TView, TDev>::getPtrDev(view, dev);
+    }
 
-            //-----------------------------------------------------------------------------
-            //! \return The pitch in bytes. This is the distance in bytes between two consecutive elements in the given dimension.
-            ALPAKA_NO_HOST_ACC_WARNING
-            template<
-                std::size_t Tidx,
-                typename TView>
-            ALPAKA_FN_HOST_ACC
-            auto getPitchBytes(
-                TView const & view)
-            -> idx::Idx<TView>
-            {
-                return
-                    traits::GetPitchBytes<
-                        dim::DimInt<Tidx>,
-                        TView>
-                    ::getPitchBytes(
-                        view);
-            }
+    //-----------------------------------------------------------------------------
+    //! \return The pitch in bytes of the dimension selected by Tidx. This is the distance in bytes between two
+    //! consecutive elements in that dimension.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<std::size_t Tidx, typename TView>
+    ALPAKA_FN_HOST_ACC auto getPitchBytes(TView const& view) -> Idx<TView>
+    {
+        return traits::GetPitchBytes<DimInt<Tidx>, TView>::getPitchBytes(view);
+    }
 
-            //-----------------------------------------------------------------------------
-            //! Create a memory set task.
-            //!
-            //! \param view The memory view to fill.
-            //! \param byte Value to set for each element of the specified view.
-            //! \param extent The extent of the view to fill.
-            template<
-                typename TExtent,
-                typename TView>
-            ALPAKA_FN_HOST auto createTaskSet(
-                TView & view,
-                std::uint8_t const & byte,
-                TExtent const & extent)
-#ifdef BOOST_NO_CXX14_RETURN_TYPE_DEDUCTION
-            -> decltype(
-                traits::CreateTaskSet<
-                    dim::Dim<TView>,
-                    dev::Dev<TView>>
-                ::createTaskSet(
-                    view,
-                    byte,
-                    extent))
-#endif
-            {
-                static_assert(
-                    dim::Dim<TView>::value == dim::Dim<TExtent>::value,
-                    "The view and the extent are required to have the same dimensionality!");
+    //-----------------------------------------------------------------------------
+    //! Creates a memory set task for a view and an extent of equal dimensionality.
+    //!
+    //! \param view The memory view to fill.
+    //! \param byte Value to set for each element of the specified view.
+    //! \param extent The extent of the view to fill.
+    template<typename TExtent, typename TView>
+    ALPAKA_FN_HOST auto createTaskMemset(TView& view, std::uint8_t const& byte, TExtent const& extent)
+    {
+        static_assert(
+            Dim<TView>::value == Dim<TExtent>::value,
+            "The view and the extent are required to have the same dimensionality!");
 
-                return
-                    traits::CreateTaskSet<
-                        dim::Dim<TView>,
-                        dev::Dev<TView>>
-                    ::createTaskSet(
-                        view,
-                        byte,
-                        extent);
-            }
+        return traits::CreateTaskMemset<Dim<TView>, Dev<TView>>::createTaskMemset(view, byte, extent);
+    }
 
-            //-----------------------------------------------------------------------------
-            //! Sets the memory to the given value.
-            //!
-            //! \param queue The queue to enqueue the view fill task into.
-            //! \param view The memory view to fill.
-            //! \param byte Value to set for each element of the specified view.
-            //! \param extent The extent of the view to fill.
-            template<
-                typename TExtent,
-                typename TView,
-                typename TQueue>
-            ALPAKA_FN_HOST auto set(
-                TQueue & queue,
-                TView & view,
-                std::uint8_t const & byte,
-                TExtent const & extent)
-            -> void
-            {
-                queue::enqueue(
-                    queue,
-                    mem::view::createTaskSet(
-                        view,
-                        byte,
-                        extent));
-            }
+    //-----------------------------------------------------------------------------
+    //! Sets the memory to the given value by enqueuing a memory set task into the given queue.
+    //!
+    //! \param queue The queue to enqueue the view fill task into.
+    //! \param view The memory view to fill.
+    //! \param byte Value to set for each element of the specified view.
+    //! \param extent The extent of the view to fill.
+    template<typename TExtent, typename TView, typename TQueue>
+    ALPAKA_FN_HOST auto memset(TQueue& queue, TView& view, std::uint8_t const& byte, TExtent const& extent) -> void
+    {
+        enqueue(queue, createTaskMemset(view, byte, extent));
+    }
 
-            //-----------------------------------------------------------------------------
-            //! Creates a memory copy task.
-            //!
-            //! \param viewDst The destination memory view.
-            //! \param viewSrc The source memory view.
-            //! \param extent The extent of the view to copy.
-            template<
-                typename TExtent,
-                typename TViewSrc,
-                typename TViewDst>
-            ALPAKA_FN_HOST auto createTaskCopy(
-                TViewDst & viewDst,
-                TViewSrc const & viewSrc,
-                TExtent const & extent)
-#ifdef BOOST_NO_CXX14_RETURN_TYPE_DEDUCTION
-            -> decltype(
-                traits::CreateTaskCopy<
-                    dim::Dim<TViewDst>,
-                    dev::Dev<TViewDst>,
-                    dev::Dev<TViewSrc>>
-                ::createTaskCopy(
-                    viewDst,
-                    viewSrc,
-                    extent))
-#endif
-            {
-                static_assert(
-                    dim::Dim<TViewDst>::value == dim::Dim<TViewSrc>::value,
-                    "The source and the destination view are required to have the same dimensionality!");
-                static_assert(
-                    dim::Dim<TViewDst>::value == dim::Dim<TExtent>::value,
-                    "The destination view and the extent are required to have the same dimensionality!");
-                static_assert(
-                    std::is_same<elem::Elem<TViewDst>, typename std::remove_const<elem::Elem<TViewSrc>>::type>::value,
-                    "The source and the destination view are required to have the same element type!");
+    //-----------------------------------------------------------------------------
+    //! Creates a memory copy task for views of equal dimensionality and (const-stripped) element type.
+    //!
+    //! \param viewDst The destination memory view.
+    //! \param viewSrc The source memory view.
+    //! \param extent The extent of the view to copy. It must have the same dimensionality as the views.
+    template<typename TExtent, typename TViewSrc, typename TViewDst>
+    ALPAKA_FN_HOST auto createTaskMemcpy(TViewDst& viewDst, TViewSrc const& viewSrc, TExtent const& extent)
+    {
+        static_assert(
+            Dim<TViewDst>::value == Dim<TViewSrc>::value,
+            "The source and the destination view are required to have the same dimensionality!");
+        static_assert(
+            Dim<TViewDst>::value == Dim<TExtent>::value,
+            "The destination view and the extent are required to have the same dimensionality!");
+        static_assert(
+            std::is_same<Elem<TViewDst>, std::remove_const_t<Elem<TViewSrc>>>::value,
+            "The source and the destination view are required to have the same element type!");
+
+        return traits::CreateTaskMemcpy<Dim<TViewDst>, Dev<TViewDst>, Dev<TViewSrc>>::createTaskMemcpy(
+            viewDst,
+            viewSrc,
+            extent);
+    }
 
-                return
-                    traits::CreateTaskCopy<
-                        dim::Dim<TViewDst>,
-                        dev::Dev<TViewDst>,
-                        dev::Dev<TViewSrc>>
-                    ::createTaskCopy(
-                        viewDst,
-                        viewSrc,
-                        extent);
-            }
+    //-----------------------------------------------------------------------------
+    //! Copies memory possibly between different memory spaces by enqueuing a copy task into the given queue.
+    //!
+    //! \param queue The queue to enqueue the view copy task into.
+    //! \param viewDst The destination memory view.
+    //! \param viewSrc The source memory view.
+    //! \param extent The extent of the view to copy.
+    template<typename TExtent, typename TViewSrc, typename TViewDst, typename TQueue>
+    ALPAKA_FN_HOST auto memcpy(TQueue& queue, TViewDst& viewDst, TViewSrc const& viewSrc, TExtent const& extent)
+        -> void
+    {
+        enqueue(queue, createTaskMemcpy(viewDst, viewSrc, extent));
+    }
 
-            //-----------------------------------------------------------------------------
-            //! Copies memory possibly between different memory spaces.
-            //!
-            //! \param queue The queue to enqueue the view copy task into.
-            //! \param viewDst The destination memory view.
-            //! \param viewSrc The source memory view.
-            //! \param extent The extent of the view to copy.
-            template<
-                typename TExtent,
-                typename TViewSrc,
-                typename TViewDst,
-                typename TQueue>
-            ALPAKA_FN_HOST auto copy(
-                TQueue & queue,
-                TViewDst & viewDst,
-                TViewSrc const & viewSrc,
-                TExtent const & extent)
-            -> void
+    namespace detail
+    {
+        //-----------------------------------------------------------------------------
+        template<typename TDim, typename TView>
+        struct Print // Prints dimension TDim, recursing into Print<TDim + 1> for each index.
+        {
+            ALPAKA_FN_HOST static auto print(
+                TView const& view,
+                Elem<TView> const* const ptr,
+                Vec<Dim<TView>, Idx<TView>> const& extent,
+                std::ostream& os,
+                std::string const& elementSeparator,
+                std::string const& rowSeparator,
+                std::string const& rowPrefix,
+                std::string const& rowSuffix) -> void
             {
-                queue::enqueue(
-                    queue,
-                    mem::view::createTaskCopy(
-                        viewDst,
-                        viewSrc,
-                        extent));
-            }
+                os << rowPrefix;
 
-            namespace detail
-            {
-                //-----------------------------------------------------------------------------
-                template<
-                    typename TDim,
-                    typename TView>
-                struct Print
+                auto const pitch(getPitchBytes<TDim::value + 1u>(view));
+                auto const numElems(extent[TDim::value]); // Zero extent: prefix and suffix only.
+                for(auto i(decltype(numElems)(0)); i < numElems; ++i)
                 {
-                    ALPAKA_FN_HOST static auto print(
-                        TView const & view,
-                        elem::Elem<TView> const * const ptr,
-                        vec::Vec<dim::Dim<TView>, idx::Idx<TView>> const & extent,
-                        std::ostream & os,
-                        std::string const & elementSeparator,
-                        std::string const & rowSeparator,
-                        std::string const & rowPrefix,
-                        std::string const & rowSuffix)
-                    -> void
+                    Print<DimInt<TDim::value + 1u>, TView>::print(
+                        view,
+                        reinterpret_cast<Elem<TView> const*>(reinterpret_cast<std::uint8_t const*>(ptr) + i * pitch),
+                        extent,
+                        os,
+                        elementSeparator,
+                        rowSeparator,
+                        rowPrefix,
+                        rowSuffix);
+
+                    // While we are not at the last sub-row, add the row separator.
+                    if(i + 1 != numElems)
                     {
-                        os << rowPrefix;
-
-                        auto const pitch(view::getPitchBytes<TDim::value+1u>(view));
-                        auto const lastIdx(extent[TDim::value]-1u);
-                        for(auto i(decltype(lastIdx)(0)); i<=lastIdx ;++i)
-                        {
-                            Print<
-                                dim::DimInt<TDim::value+1u>,
-                                TView>
-                            ::print(
-                                view,
-                                reinterpret_cast<elem::Elem<TView> const *>(reinterpret_cast<std::uint8_t const *>(ptr)+i*pitch),
-                                extent,
-                                os,
-                                elementSeparator,
-                                rowSeparator,
-                                rowPrefix,
-                                rowSuffix);
-
-                            // While we are not at the end of a row, add the row separator.
-                            if(i != lastIdx)
-                            {
-                                os << rowSeparator;
-                            }
-                        }
-
-                        os << rowSuffix;
+                        os << rowSeparator;
                     }
-                };
-                //-----------------------------------------------------------------------------
-                template<
-                    typename TView>
-                struct Print<
-                    dim::DimInt<dim::Dim<TView>::value-1u>,
-                    TView>
-                {
-                    ALPAKA_FN_HOST static auto print(
-                        TView const & view,
-                        elem::Elem<TView> const * const ptr,
-                        vec::Vec<dim::Dim<TView>, idx::Idx<TView>> const & extent,
-                        std::ostream & os,
-                        std::string const & elementSeparator,
-                        std::string const & rowSeparator,
-                        std::string const & rowPrefix,
-                        std::string const & rowSuffix)
-                    -> void
-                    {
-                        alpaka::ignore_unused(view);
-                        alpaka::ignore_unused(rowSeparator);
-
-                        os << rowPrefix;
-
-                        auto const lastIdx(extent[dim::Dim<TView>::value-1u]-1u);
-                        for(auto i(decltype(lastIdx)(0)); i<=lastIdx ;++i)
-                        {
-                            // Add the current element.
-                            os << *(ptr+i);
-
-                            // While we are not at the end of a line, add the element separator.
-                            if(i != lastIdx)
-                            {
-                                os << elementSeparator;
-                            }
-                        }
+                }
 
-                        os << rowSuffix;
-                    }
-                };
+                os << rowSuffix;
             }
-            //-----------------------------------------------------------------------------
-            //! Prints the content of the view to the given queue.
-            // \TODO: Add precision flag.
-            // \TODO: Add column alignment flag.
-            template<
-                typename TView>
-            ALPAKA_FN_HOST auto print(
-                TView const & view,
-                std::ostream & os,
-                std::string const & elementSeparator = ", ",
-                std::string const & rowSeparator = "\n",
-                std::string const & rowPrefix = "[",
-                std::string const & rowSuffix = "]")
-            -> void
+        };
+        //-----------------------------------------------------------------------------
+        template<typename TView>
+        struct Print<DimInt<Dim<TView>::value - 1u>, TView> // Innermost dimension: streams the elements themselves.
+        {
+            ALPAKA_FN_HOST static auto print(
+                TView const& view,
+                Elem<TView> const* const ptr,
+                Vec<Dim<TView>, Idx<TView>> const& extent,
+                std::ostream& os,
+                std::string const& elementSeparator,
+                std::string const& rowSeparator,
+                std::string const& rowPrefix,
+                std::string const& rowSuffix) -> void
             {
-                detail::Print<
-                    dim::DimInt<0u>,
-                    TView>
-                ::print(
-                    view,
-                    mem::view::getPtrNative(view),
-                    extent::getExtentVec(view),
-                    os,
-                    elementSeparator,
-                    rowSeparator,
-                    rowPrefix,
-                    rowSuffix);
-            }
+                alpaka::ignore_unused(view);
+                alpaka::ignore_unused(rowSeparator);
 
-            namespace detail
-            {
-                //#############################################################################
-                //! A class with a create method that returns the pitch for each index.
-                template<
-                    std::size_t Tidx>
-                struct CreatePitchBytes
+                os << rowPrefix;
+
+                auto const numElems(extent[Dim<TView>::value - 1u]); // Zero extent: prefix and suffix only.
+                for(auto i(decltype(numElems)(0)); i < numElems; ++i)
                 {
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_NO_HOST_ACC_WARNING
-                    template<
-                        typename TPitch>
-                    ALPAKA_FN_HOST_ACC
-                    static auto create(
-                        TPitch const & pitch)
-                    -> idx::Idx<TPitch>
+                    // Add the current element.
+                    os << *(ptr + i);
+
+                    // While we are not at the end of a line, add the element separator.
+                    if(i + 1 != numElems)
                     {
-                        return mem::view::getPitchBytes<Tidx>(pitch);
+                        os << elementSeparator;
                     }
-                };
-            }
-            //-----------------------------------------------------------------------------
-            //! \return The pitch vector.
-            template<
-                typename TPitch>
-            auto getPitchBytesVec(
-                TPitch const & pitch = TPitch())
-            -> vec::Vec<dim::Dim<TPitch>, idx::Idx<TPitch>>
-            {
-                return
-                    vec::createVecFromIndexedFnWorkaround<
-                        dim::Dim<TPitch>,
-                        idx::Idx<TPitch>,
-                        detail::CreatePitchBytes>(
-                            pitch);
-            }
-            //-----------------------------------------------------------------------------
-            //! \return The pitch but only the last N elements.
-            template<
-                typename TDim,
-                typename TPitch>
-            ALPAKA_FN_HOST auto getPitchBytesVecEnd(
-                TPitch const & pitch = TPitch())
-            -> vec::Vec<TDim, idx::Idx<TPitch>>
-            {
-                using IdxOffset = std::integral_constant<std::intmax_t, static_cast<std::intmax_t>(dim::Dim<TPitch>::value) - static_cast<std::intmax_t>(TDim::value)>;
-                return
-                    vec::createVecFromIndexedFnOffsetWorkaround<
-                        TDim,
-                        idx::Idx<TPitch>,
-                        detail::CreatePitchBytes,
-                        IdxOffset>(
-                            pitch);
+                }
+
+                os << rowSuffix;
             }
+        };
+    } // namespace detail
+    //-----------------------------------------------------------------------------
+    //! Prints the content of the view to the given output stream.
+    // \TODO: Add precision flag.
+    // \TODO: Add column alignment flag.
+    template<typename TView>
+    ALPAKA_FN_HOST auto print(
+        TView const& view,
+        std::ostream& os,
+        std::string const& elementSeparator = ", ",
+        std::string const& rowSeparator = "\n",
+        std::string const& rowPrefix = "[",
+        std::string const& rowSuffix = "]") -> void
+    {
+        detail::Print<DimInt<0u>, TView>::print(
+            view,
+            getPtrNative(view),
+            extent::getExtentVec(view),
+            os,
+            elementSeparator,
+            rowSeparator,
+            rowPrefix,
+            rowSuffix);
+    }
 
+    namespace detail
+    {
+        //#############################################################################
+        //! A class with a static create method that returns the pitch in bytes of the dimension Tidx.
+        template<std::size_t Tidx>
+        struct CreatePitchBytes
+        {
             //-----------------------------------------------------------------------------
-            //! \return A view to static device memory.
-            template<
-                typename TElem,
-                typename TDev,
-                typename TExtent>
-            auto createStaticDevMemView(
-                TElem * pMem,
-                TDev const & dev,
-                TExtent const & extent)
-#ifdef BOOST_NO_CXX14_RETURN_TYPE_DEDUCTION
-            -> decltype(
-                traits::CreateStaticDevMemView<
-                        TDev>
-                    ::createStaticDevMemView(
-                        pMem,
-                        dev,
-                        extent))
-#endif
+            ALPAKA_NO_HOST_ACC_WARNING
+            template<typename TPitch>
+            ALPAKA_FN_HOST_ACC static auto create(TPitch const& pitch) -> Idx<TPitch>
             {
-                return
-                    traits::CreateStaticDevMemView<
-                        TDev>
-                    ::createStaticDevMemView(
-                        pMem,
-                        dev,
-                        extent);
+                return getPitchBytes<Tidx>(pitch);
             }
-        }
+        };
+    } // namespace detail
+    //-----------------------------------------------------------------------------
+    //! \return The vector of pitches in bytes, built per index via detail::CreatePitchBytes.
+    template<typename TPitch>
+    auto getPitchBytesVec(TPitch const& pitch = TPitch()) -> Vec<Dim<TPitch>, Idx<TPitch>>
+    {
+        return createVecFromIndexedFn<Dim<TPitch>, detail::CreatePitchBytes>(pitch);
+    }
+    //-----------------------------------------------------------------------------
+    //! \return The pitch vector but only its last TDim::value elements (index offset Dim<TPitch> - TDim).
+    template<typename TDim, typename TPitch>
+    ALPAKA_FN_HOST auto getPitchBytesVecEnd(TPitch const& pitch = TPitch()) -> Vec<TDim, Idx<TPitch>>
+    {
+        using IdxOffset = std::integral_constant<
+            std::intmax_t,
+            static_cast<std::intmax_t>(Dim<TPitch>::value) - static_cast<std::intmax_t>(TDim::value)>;
+        return createVecFromIndexedFnOffset<TDim, detail::CreatePitchBytes, IdxOffset>(pitch);
+    }
+
+    //-----------------------------------------------------------------------------
+    //! \return A view to static device memory, as created by traits::CreateStaticDevMemView<TDev>.
+    template<typename TElem, typename TDev, typename TExtent>
+    auto createStaticDevMemView(TElem* pMem, TDev const& dev, TExtent const& extent)
+    {
+        return traits::CreateStaticDevMemView<TDev>::createStaticDevMemView(pMem, dev, extent);
     }
-}
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/include/alpaka/mem/view/ViewCompileTimeArray.hpp b/thirdParty/cupla/alpaka/include/alpaka/mem/view/ViewCompileTimeArray.hpp
index 3cf6615797..9a8ca1ab09 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/mem/view/ViewCompileTimeArray.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/mem/view/ViewCompileTimeArray.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Axel Huebl, Benjamin Worpitz
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -23,72 +23,59 @@ namespace alpaka
     // Trait specializations for fixed idx arrays.
     //
     // This allows the usage of multidimensional compile time arrays e.g. int[4][3] as argument to memory ops.
-    /*namespace dev
+    /*namespace traits
     {
-        namespace traits
+        //#############################################################################
+        //! The fixed idx array device type trait specialization.
+        template<
+            typename TFixedSizeArray>
+        struct DevType<
+            TFixedSizeArray,
+            std::enable_if_t<std::is_array<TFixedSizeArray>::value>>
         {
-            //#############################################################################
-            //! The fixed idx array device type trait specialization.
-            template<
-                typename TFixedSizeArray>
-            struct DevType<
-                TFixedSizeArray,
-                typename std::enable_if<std::is_array<TFixedSizeArray>::value>::type>
-            {
-                using type = dev::DevCpu;
-            };
+            using type = DevCpu;
+        };
 
-            //#############################################################################
-            //! The fixed idx array device get trait specialization.
-            template<
-                typename TFixedSizeArray>
-            struct GetDev<
-                TFixedSizeArray,
-                typename std::enable_if<std::is_array<TFixedSizeArray>::value>::type>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto getDev(
-                    TFixedSizeArray const & view)
-                -> dev::DevCpu
-                {
-                    // \FIXME: CUDA device?
-                    return pltf::getDevByIdx<pltf::PltfCpu>(0u);
-                }
-            };
-        }
-    }
-    namespace dim
-    {
-        namespace traits
+        //#############################################################################
+        //! The fixed idx array device get trait specialization.
+        template<
+            typename TFixedSizeArray>
+        struct GetDev<
+            TFixedSizeArray,
+            std::enable_if_t<std::is_array<TFixedSizeArray>::value>>
         {
-            //#############################################################################
-            //! The fixed idx array dimension getter trait specialization.
-            template<
-                typename TFixedSizeArray>
-            struct DimType<
-                TFixedSizeArray,
-                typename std::enable_if<std::is_array<TFixedSizeArray>::value>::type>
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto getDev(
+                TFixedSizeArray const & view)
+            -> DevCpu
             {
-                using type = dim::DimInt<std::rank<TFixedSizeArray>::value>;
-            };
-        }
-    }
-    namespace elem
-    {
-        namespace traits
+                // \FIXME: CUDA device?
+                return getDevByIdx<PltfCpu>(0u);
+            }
+        };
+
+        //#############################################################################
+        //! The fixed idx array dimension getter trait specialization.
+        template<
+            typename TFixedSizeArray>
+        struct DimType<
+            TFixedSizeArray,
+            std::enable_if_t<std::is_array<TFixedSizeArray>::value>>
         {
-            //#############################################################################
-            //! The fixed idx array memory element type get trait specialization.
-            template<
-                typename TFixedSizeArray>
-            struct ElemType<
-                TFixedSizeArray,
-                typename std::enable_if<
-                    std::is_array<TFixedSizeArray>::value>::type>
-            {
-                using type = typename std::remove_all_extent<TFixedSizeArray>::type;
-            };
-        }
+            using type = DimInt<std::rank<TFixedSizeArray>::value>;
+        };
+
+        //#############################################################################
+        //! The fixed idx array memory element type get trait specialization.
+        template<
+            typename TFixedSizeArray>
+        struct ElemType<
+            TFixedSizeArray,
+            std::enable_if_t<
+                std::is_array<TFixedSizeArray>::value>>
+        {
+            using type = std::remove_all_extents_t<TFixedSizeArray>;
+        };
     }
     namespace extent
     {
@@ -102,119 +89,102 @@ namespace alpaka
             struct GetExtent<
                 TIdxIntegralConst,
                 TFixedSizeArray,
-                typename std::enable_if<
+                std::enable_if_t<
                     std::is_array<TFixedSizeArray>::value
                     && (std::rank<TFixedSizeArray>::value > TIdxIntegralConst::value)
-                    && (std::extent<TFixedSizeArray, TIdxIntegralConst::value>::value > 0u)>::type>
+                    && (std::extent<TFixedSizeArray, TIdxIntegralConst::value>::value > 0u)>>
             {
                 //-----------------------------------------------------------------------------
                 static constexpr auto getExtent(
-                    TFixedSizeArray const & //extent
+                    TFixedSizeArray const & extent
                 )
-                -> idx::Idx<TFixedSizeArray>
+                -> Idx<TFixedSizeArray>
                 {
-                    // C++14 constexpr with void return
-                    //alpaka::ignore_unused(extent);
+                    alpaka::ignore_unused(extent);
                     return std::extent<TFixedSizeArray, TIdxIntegralConst::value>::value;
                 }
             };
         }
     }
-    namespace mem
+    namespace traits
     {
-        namespace view
+        //#############################################################################
+        //! The fixed idx array native pointer get trait specialization.
+        template<
+            typename TFixedSizeArray>
+        struct GetPtrNative<
+            TFixedSizeArray,
+            std::enable_if_t<
+                std::is_array<TFixedSizeArray>::value>>
         {
-            namespace traits
-            {
-                //#############################################################################
-                //! The fixed idx array native pointer get trait specialization.
-                template<
-                    typename TFixedSizeArray>
-                struct GetPtrNative<
-                    TFixedSizeArray,
-                    typename std::enable_if<
-                        std::is_array<TFixedSizeArray>::value>::type>
-                {
-                    using TElem = typename std::remove_all_extent<TFixedSizeArray>::type;
+            using TElem = std::remove_all_extent_t<TFixedSizeArray>;
 
-                    //-----------------------------------------------------------------------------
-                    static auto getPtrNative(
-                        TFixedSizeArray const & view)
-                    -> TElem const *
-                    {
-                        return view;
-                    }
-                    //-----------------------------------------------------------------------------
-                    static auto getPtrNative(
-                        TFixedSizeArray & view)
-                    -> TElem *
-                    {
-                        return view;
-                    }
-                };
+            //-----------------------------------------------------------------------------
+            static auto getPtrNative(
+                TFixedSizeArray const & view)
+            -> TElem const *
+            {
+                return view;
+            }
+            //-----------------------------------------------------------------------------
+            static auto getPtrNative(
+                TFixedSizeArray & view)
+            -> TElem *
+            {
+                return view;
+            }
+        };
 
-                //#############################################################################
-                //! The fixed idx array pitch get trait specialization.
-                template<
-                    typename TFixedSizeArray>
-                struct GetPitchBytes<
-                    dim::DimInt<std::rank<TFixedSizeArray>::value - 1u>,
-                    TFixedSizeArray,
-                    typename std::enable_if<
-                        std::is_array<TFixedSizeArray>::value
-                        && (std::extent<TFixedSizeArray, std::rank<TFixedSizeArray>::value - 1u>::value > 0u)>::type>
-                {
-                    using TElem = typename std::remove_all_extent<TFixedSizeArray>::type;
+        //#############################################################################
+        //! The fixed idx array pitch get trait specialization.
+        template<
+            typename TFixedSizeArray>
+        struct GetPitchBytes<
+            DimInt<std::rank<TFixedSizeArray>::value - 1u>,
+            TFixedSizeArray,
+            std::enable_if_t<
+                std::is_array<TFixedSizeArray>::value
+                && (std::extent<TFixedSizeArray, std::rank<TFixedSizeArray>::value - 1u>::value > 0u)>>
+        {
+            using TElem = std::remove_all_extent_t<TFixedSizeArray>;
 
-                    //-----------------------------------------------------------------------------
-                    static constexpr auto getPitchBytes(
-                        TFixedSizeArray const &)
-                    -> idx::Idx<TFixedSizeArray>
-                    {
-                        return sizeof(TElem) * std::extent<TFixedSizeArray, std::rank<TFixedSizeArray>::value - 1u>::value;
-                    }
-                };
+            //-----------------------------------------------------------------------------
+            static constexpr auto getPitchBytes(
+                TFixedSizeArray const &)
+            -> Idx<TFixedSizeArray>
+            {
+                return sizeof(TElem) * std::extent<TFixedSizeArray, std::rank<TFixedSizeArray>::value - 1u>::value;
             }
-        }
-    }
-    namespace offset
-    {
-        namespace traits
+        };
+
+        //#############################################################################
+        //! The fixed idx array offset get trait specialization.
+        template<
+            typename TIdx,
+            typename TFixedSizeArray>
+        struct GetOffset<
+            TIdx,
+            TFixedSizeArray,
+            std::enable_if_t<std::is_array<TFixedSizeArray>::value>>
         {
-            //#############################################################################
-            //! The fixed idx array offset get trait specialization.
-            template<
-                typename TIdx,
-                typename TFixedSizeArray>
-            struct GetOffset<
-                TIdx,
-                TFixedSizeArray,
-                typename std::enable_if<std::is_array<TFixedSizeArray>::value>::type>
+            //-----------------------------------------------------------------------------
+            static auto getOffset(
+                TFixedSizeArray const &)
+            -> Idx<TFixedSizeArray>
             {
-                //-----------------------------------------------------------------------------
-                static auto getOffset(
-                    TFixedSizeArray const &)
-                -> idx::Idx<TFixedSizeArray>
-                {
-                    return 0u;
-                }
-            };
-        }
-    }
-    namespace idx
-    {
-        namespace traits
+                return 0u;
+            }
+        };
+
+        //#############################################################################
+        //! The fixed idx array idx type trait specialization.
+        template<
+            typename TFixedSizeArray>
+        struct IdxType<
+            TFixedSizeArray,
+            std::enable_if_t<std::is_array<TFixedSizeArray>::value>>
         {
-            //#############################################################################
-            //! The std::vector idx type trait specialization.
-            template<
-                typename TFixedSizeArray>
-            struct IdxType<
-                TFixedSizeArray,
-                typename std::enable_if<std::is_array<TFixedSizeArray>::value>::type>
-            {
-                using type = std::size_t;
-            };
-        }
+            using type = std::size_t;
+        };
     }*/
-}
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/include/alpaka/mem/view/ViewPlainPtr.hpp b/thirdParty/cupla/alpaka/include/alpaka/mem/view/ViewPlainPtr.hpp
index 19ddb1cab3..44f486a9a0 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/mem/view/ViewPlainPtr.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/mem/view/ViewPlainPtr.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Benjamin Worpitz, Matthias Werner, René Widera
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -9,427 +9,285 @@
 
 #pragma once
 
+#include <alpaka/dev/DevCpu.hpp>
+#include <alpaka/dev/DevOacc.hpp>
+#include <alpaka/dev/DevOmp5.hpp>
+#include <alpaka/dev/DevUniformCudaHipRt.hpp>
 #include <alpaka/mem/view/Traits.hpp>
-
 #include <alpaka/vec/Vec.hpp>
-#include <alpaka/dev/DevCpu.hpp>
-#include <alpaka/dev/DevCudaRt.hpp>
-#include <alpaka/dev/DevHipRt.hpp>
+
+#include <type_traits>
 
 namespace alpaka
 {
-    namespace mem
+    //#############################################################################
+    //! The memory view to wrap plain pointers.
+    template<typename TDev, typename TElem, typename TDim, typename TIdx>
+    class ViewPlainPtr final
     {
-        namespace view
-        {
-            //#############################################################################
-            //! The memory view to wrap plain pointers.
-            template<
-                typename TDev,
-                typename TElem,
-                typename TDim,
-                typename TIdx>
-            class ViewPlainPtr final
-            {
-                static_assert(
-                    !std::is_const<TIdx>::value,
-                    "The idx type of the view can not be const!");
-            public:
-                //-----------------------------------------------------------------------------
-                template<
-                    typename TExtent>
-                ALPAKA_FN_HOST ViewPlainPtr(
-                    TElem * pMem,
-                    TDev const & dev,
-                    TExtent const & extent = TExtent()) :
-                        m_pMem(pMem),
-                        m_dev(dev),
-                        m_extentElements(extent::getExtentVecEnd<TDim>(extent)),
-                        m_pitchBytes(calculatePitchesFromExtents(m_extentElements))
-                {}
+        static_assert(!std::is_const<TIdx>::value, "The idx type of the view can not be const!");
 
-                //-----------------------------------------------------------------------------
-                template<
-                    typename TExtent,
-                    typename TPitch>
-                ALPAKA_FN_HOST ViewPlainPtr(
-                    TElem * pMem,
-                    TDev const dev,
-                    TExtent const & extent,
-                    TPitch const & pitchBytes) :
-                        m_pMem(pMem),
-                        m_dev(dev),
-                        m_extentElements(extent::getExtentVecEnd<TDim>(extent)),
-                        m_pitchBytes(
-                            vec::subVecEnd<TDim>(
-                               static_cast<
-                                    vec::Vec<TDim, TIdx> >(pitchBytes)
-                            )
-                        )
-                {}
+        using Dev = alpaka::Dev<TDev>;
 
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST
-                ViewPlainPtr(ViewPlainPtr const &) = default;
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST
-                ViewPlainPtr(ViewPlainPtr && other) :
-                        m_pMem(other.m_pMem),
-                        m_dev(other.m_dev),
-                        m_extentElements(other.m_extentElements),
-                        m_pitchBytes(other.m_pitchBytes)
-                {
-                }
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST
-                auto operator=(ViewPlainPtr const &) -> ViewPlainPtr & = delete;
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST
-                auto operator=(ViewPlainPtr &&) -> ViewPlainPtr & = delete;
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST ~ViewPlainPtr() = default;
+    public:
+        //-----------------------------------------------------------------------------
+        template<typename TExtent>
+        ALPAKA_FN_HOST ViewPlainPtr(TElem* pMem, Dev const& dev, TExtent const& extent = TExtent())
+            : m_pMem(pMem)
+            , m_dev(dev)
+            , m_extentElements(extent::getExtentVecEnd<TDim>(extent))
+            , m_pitchBytes(calculatePitchesFromExtents(m_extentElements))
+        {
+        }
 
-            private:
-                //-----------------------------------------------------------------------------
-                //! Calculate the pitches purely from the extents.
-                template<
-                    typename TExtent>
-                ALPAKA_FN_HOST static auto calculatePitchesFromExtents(
-                    TExtent const & extent)
-                -> vec::Vec<TDim, TIdx>
-                {
-                    vec::Vec<TDim, TIdx> pitchBytes(vec::Vec<TDim, TIdx>::all(0));
-                    pitchBytes[TDim::value - 1u] = extent[TDim::value - 1u] * static_cast<TIdx>(sizeof(TElem));
-                    for(TIdx i = TDim::value - 1u; i > static_cast<TIdx>(0u); --i)
-                    {
-                        pitchBytes[i-1] = extent[i-1] * pitchBytes[i];
-                    }
-                    return pitchBytes;
-                }
+        //-----------------------------------------------------------------------------
+        template<typename TExtent, typename TPitch>
+        ALPAKA_FN_HOST ViewPlainPtr(TElem* pMem, Dev const dev, TExtent const& extent, TPitch const& pitchBytes)
+            : m_pMem(pMem)
+            , m_dev(dev)
+            , m_extentElements(extent::getExtentVecEnd<TDim>(extent))
+            , m_pitchBytes(subVecEnd<TDim>(static_cast<Vec<TDim, TIdx>>(pitchBytes)))
+        {
+        }
 
-            public:
-                TElem * const m_pMem;
-                TDev const m_dev;
-                vec::Vec<TDim, TIdx> const m_extentElements;
-                vec::Vec<TDim, TIdx> const m_pitchBytes;
-            };
+        //-----------------------------------------------------------------------------
+        ALPAKA_FN_HOST
+        ViewPlainPtr(ViewPlainPtr const&) = default;
+        //-----------------------------------------------------------------------------
+        ALPAKA_FN_HOST
+        ViewPlainPtr(ViewPlainPtr&& other) noexcept
+            : m_pMem(other.m_pMem)
+            , m_dev(other.m_dev)
+            , m_extentElements(other.m_extentElements)
+            , m_pitchBytes(other.m_pitchBytes)
+        {
+        }
+        //-----------------------------------------------------------------------------
+        ALPAKA_FN_HOST
+        auto operator=(ViewPlainPtr const&) -> ViewPlainPtr& = delete;
+        //-----------------------------------------------------------------------------
+        ALPAKA_FN_HOST
+        auto operator=(ViewPlainPtr&&) -> ViewPlainPtr& = delete;
+        //-----------------------------------------------------------------------------
+        ALPAKA_FN_HOST ~ViewPlainPtr() = default;
+
+    private:
+        //-----------------------------------------------------------------------------
+        //! Calculate the pitches purely from the extents.
+        template<typename TExtent>
+        ALPAKA_FN_HOST static auto calculatePitchesFromExtents(TExtent const& extent) -> Vec<TDim, TIdx>
+        {
+            Vec<TDim, TIdx> pitchBytes(Vec<TDim, TIdx>::all(0));
+            pitchBytes[TDim::value - 1u] = extent[TDim::value - 1u] * static_cast<TIdx>(sizeof(TElem));
+            for(TIdx i = TDim::value - 1u; i > static_cast<TIdx>(0u); --i)
+            {
+                pitchBytes[i - 1] = extent[i - 1] * pitchBytes[i];
+            }
+            return pitchBytes;
         }
-    }
+
+    public:
+        TElem* const m_pMem;
+        Dev const m_dev;
+        Vec<TDim, TIdx> const m_extentElements;
+        Vec<TDim, TIdx> const m_pitchBytes;
+    };
 
     //-----------------------------------------------------------------------------
     // Trait specializations for ViewPlainPtr.
-    namespace dev
+    namespace traits
     {
-        namespace traits
+        //#############################################################################
+        //! The ViewPlainPtr device type trait specialization.
+        template<typename TDev, typename TElem, typename TDim, typename TIdx>
+        struct DevType<ViewPlainPtr<TDev, TElem, TDim, TIdx>>
         {
-            //#############################################################################
-            //! The ViewPlainPtr device type trait specialization.
-            template<
-                typename TDev,
-                typename TElem,
-                typename TDim,
-                typename TIdx>
-            struct DevType<
-                mem::view::ViewPlainPtr<TDev, TElem, TDim, TIdx>>
-            {
-                using type = TDev;
-            };
+            using type = alpaka::Dev<TDev>;
+        };
 
-            //#############################################################################
-            //! The ViewPlainPtr device get trait specialization.
-            template<
-                typename TDev,
-                typename TElem,
-                typename TDim,
-                typename TIdx>
-            struct GetDev<
-                mem::view::ViewPlainPtr<TDev, TElem, TDim, TIdx>>
-            {
-                static auto getDev(
-                    mem::view::ViewPlainPtr<TDev, TElem, TDim, TIdx> const & view)
-                    -> TDev
-                {
-                    return view.m_dev;
-                }
-            };
-        }
-    }
-    namespace dim
-    {
-        namespace traits
+        //#############################################################################
+        //! The ViewPlainPtr device get trait specialization.
+        template<typename TDev, typename TElem, typename TDim, typename TIdx>
+        struct GetDev<ViewPlainPtr<TDev, TElem, TDim, TIdx>>
         {
-            //#############################################################################
-            //! The ViewPlainPtr dimension getter trait.
-            template<
-                typename TDev,
-                typename TElem,
-                typename TDim,
-                typename TIdx>
-            struct DimType<
-                mem::view::ViewPlainPtr<TDev, TElem, TDim, TIdx>>
+            static auto getDev(ViewPlainPtr<TDev, TElem, TDim, TIdx> const& view) -> alpaka::Dev<TDev>
             {
-                using type = TDim;
-            };
-        }
-    }
-    namespace elem
-    {
-        namespace traits
+                return view.m_dev;
+            }
+        };
+
+        //#############################################################################
+        //! The ViewPlainPtr dimension getter trait.
+        template<typename TDev, typename TElem, typename TDim, typename TIdx>
+        struct DimType<ViewPlainPtr<TDev, TElem, TDim, TIdx>>
         {
-            //#############################################################################
-            //! The ViewPlainPtr memory element type get trait specialization.
-            template<
-                typename TDev,
-                typename TElem,
-                typename TDim,
-                typename TIdx>
-            struct ElemType<
-                mem::view::ViewPlainPtr<TDev, TElem, TDim, TIdx>>
-            {
-                using type = TElem;
-            };
-        }
-    }
+            using type = TDim;
+        };
+
+        //#############################################################################
+        //! The ViewPlainPtr memory element type get trait specialization.
+        template<typename TDev, typename TElem, typename TDim, typename TIdx>
+        struct ElemType<ViewPlainPtr<TDev, TElem, TDim, TIdx>>
+        {
+            using type = TElem;
+        };
+    } // namespace traits
     namespace extent
     {
         namespace traits
         {
             //#############################################################################
             //! The ViewPlainPtr width get trait specialization.
-            template<
-                typename TIdxIntegralConst,
-                typename TDev,
-                typename TElem,
-                typename TDim,
-                typename TIdx>
+            template<typename TIdxIntegralConst, typename TDev, typename TElem, typename TDim, typename TIdx>
             struct GetExtent<
                 TIdxIntegralConst,
-                mem::view::ViewPlainPtr<TDev, TElem, TDim, TIdx>,
-                typename std::enable_if<(TDim::value > TIdxIntegralConst::value)>::type>
+                ViewPlainPtr<TDev, TElem, TDim, TIdx>,
+                std::enable_if_t<(TDim::value > TIdxIntegralConst::value)>>
             {
                 ALPAKA_NO_HOST_ACC_WARNING
                 ALPAKA_FN_HOST_ACC
-                static auto getExtent(
-                    mem::view::ViewPlainPtr<TDev, TElem, TDim, TIdx> const & extent)
-                -> TIdx
+                static auto getExtent(ViewPlainPtr<TDev, TElem, TDim, TIdx> const& extent) -> TIdx
                 {
                     return extent.m_extentElements[TIdxIntegralConst::value];
                 }
             };
-        }
-    }
-    namespace mem
+        } // namespace traits
+    } // namespace extent
+
+    namespace traits
     {
-        namespace view
+        //#############################################################################
+        //! The ViewPlainPtr native pointer get trait specialization.
+        template<typename TDev, typename TElem, typename TDim, typename TIdx>
+        struct GetPtrNative<ViewPlainPtr<TDev, TElem, TDim, TIdx>>
         {
-            namespace traits
+            static auto getPtrNative(ViewPlainPtr<TDev, TElem, TDim, TIdx> const& view) -> TElem const*
             {
-                //#############################################################################
-                //! The ViewPlainPtr native pointer get trait specialization.
-                template<
-                    typename TDev,
-                    typename TElem,
-                    typename TDim,
-                    typename TIdx>
-                struct GetPtrNative<
-                    mem::view::ViewPlainPtr<TDev, TElem, TDim, TIdx>>
-                {
-                    static auto getPtrNative(
-                        mem::view::ViewPlainPtr<TDev, TElem, TDim, TIdx> const & view)
-                    -> TElem const *
-                    {
-                        return view.m_pMem;
-                    }
-                    static auto getPtrNative(
-                        mem::view::ViewPlainPtr<TDev, TElem, TDim, TIdx> & view)
-                    -> TElem *
-                    {
-                        return view.m_pMem;
-                    }
-                };
+                return view.m_pMem;
+            }
+            static auto getPtrNative(ViewPlainPtr<TDev, TElem, TDim, TIdx>& view) -> TElem*
+            {
+                return view.m_pMem;
+            }
+        };
 
-                //#############################################################################
-                //! The ViewPlainPtr memory pitch get trait specialization.
-                template<
-                    typename TIdxIntegralConst,
-                    typename TDev,
-                    typename TElem,
-                    typename TDim,
-                    typename TIdx>
-                struct GetPitchBytes<
-                    TIdxIntegralConst,
-                    mem::view::ViewPlainPtr<TDev, TElem, TDim, TIdx>,
-                    typename std::enable_if<TIdxIntegralConst::value < TDim::value>::type>
-                {
-                    ALPAKA_FN_HOST static auto getPitchBytes(
-                        mem::view::ViewPlainPtr<TDev, TElem, TDim, TIdx> const & view)
-                    -> TIdx
-                    {
-                        return view.m_pitchBytes[TIdxIntegralConst::value];
-                    }
-                };
+        //#############################################################################
+        //! The ViewPlainPtr memory pitch get trait specialization.
+        template<typename TIdxIntegralConst, typename TDev, typename TElem, typename TDim, typename TIdx>
+            struct GetPitchBytes < TIdxIntegralConst,
+            ViewPlainPtr<TDev, TElem, TDim, TIdx>, std::enable_if_t<TIdxIntegralConst::value<TDim::value>>
+        {
+            ALPAKA_FN_HOST static auto getPitchBytes(ViewPlainPtr<TDev, TElem, TDim, TIdx> const& view) -> TIdx
+            {
+                return view.m_pitchBytes[TIdxIntegralConst::value];
+            }
+        };
 
-                //#############################################################################
-                //! The CPU device CreateStaticDevMemView trait specialization.
-                template<>
-                struct CreateStaticDevMemView<
-                    dev::DevCpu>
-                {
-                    //-----------------------------------------------------------------------------
-                    template<
-                        typename TElem,
-                        typename TExtent>
-                    static auto createStaticDevMemView(
-                        TElem * pMem,
-                        dev::DevCpu const & dev,
-                        TExtent const & extent)
-#ifdef BOOST_NO_CXX14_RETURN_TYPE_DEDUCTION
-                    -> alpaka::mem::view::ViewPlainPtr<dev::DevCpu, TElem, alpaka::dim::Dim<TExtent>, alpaka::idx::Idx<TExtent>>
-#endif
-                    {
-                        return
-                            alpaka::mem::view::ViewPlainPtr<
-                                dev::DevCpu,
-                                TElem,
-                                alpaka::dim::Dim<TExtent>,
-                                alpaka::idx::Idx<TExtent>>(
-                                    pMem,
-                                    dev,
-                                    extent);
-                    }
-                };
+        //#############################################################################
+        //! The CPU device CreateStaticDevMemView trait specialization.
+        template<>
+        struct CreateStaticDevMemView<DevCpu>
+        {
+            //-----------------------------------------------------------------------------
+            template<typename TElem, typename TExtent>
+            static auto createStaticDevMemView(TElem* pMem, DevCpu const& dev, TExtent const& extent)
+            {
+                return alpaka::ViewPlainPtr<DevCpu, TElem, alpaka::Dim<TExtent>, alpaka::Idx<TExtent>>(
+                    pMem,
+                    dev,
+                    extent);
+            }
+        };
 
-#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
-                //#############################################################################
-                //! The CUDA RT device CreateStaticDevMemView trait specialization.
-                template<>
-                struct CreateStaticDevMemView<
-                    dev::DevCudaRt>
-                {
-                    //-----------------------------------------------------------------------------
-                    template<
-                        typename TElem,
-                        typename TExtent>
-                    static auto createStaticDevMemView(
-                        TElem * pMem,
-                        dev::DevCudaRt const & dev,
-                        TExtent const & extent)
-#ifdef BOOST_NO_CXX14_RETURN_TYPE_DEDUCTION
-                    -> alpaka::mem::view::ViewPlainPtr<dev::DevCudaRt, TElem, alpaka::dim::Dim<TExtent>, alpaka::idx::Idx<TExtent>>
-#endif
-                    {
-                        TElem* pMemAcc(nullptr);
-                        ALPAKA_CUDA_RT_CHECK(
-                            cudaGetSymbolAddress(
-                                reinterpret_cast<void **>(&pMemAcc),
-                                *pMem));
-                        return
-                            alpaka::mem::view::ViewPlainPtr<
-                                dev::DevCudaRt,
-                                TElem,
-                                alpaka::dim::Dim<TExtent>,
-                                alpaka::idx::Idx<TExtent>>(
-                                    pMemAcc,
-                                    dev,
-                                    extent);
-                    }
-                };
-#endif
+#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
+        //#############################################################################
+        //! The CUDA/HIP RT device CreateStaticDevMemView trait specialization.
+        template<>
+        struct CreateStaticDevMemView<DevUniformCudaHipRt>
+        {
+            //-----------------------------------------------------------------------------
+            template<typename TElem, typename TExtent>
+            static auto createStaticDevMemView(TElem* pMem, DevUniformCudaHipRt const& dev, TExtent const& extent)
+            {
+                TElem* pMemAcc(nullptr);
 
-#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
-                //#############################################################################
-                //! The HIP RT device CreateStaticDevMemView trait specialization.
-                template<>
-                struct CreateStaticDevMemView<
-                    dev::DevHipRt>
-                {
-                    //-----------------------------------------------------------------------------
-                    template<
-                        typename TElem,
-                        typename TExtent>
-                    static auto createStaticDevMemView(
-                        TElem * pMem,
-                        dev::DevHipRt const & dev,
-                        TExtent const & extent)
-#ifdef BOOST_NO_CXX14_RETURN_TYPE_DEDUCTION
-                    -> alpaka::mem::view::ViewPlainPtr<dev::DevHipRt, TElem, alpaka::dim::Dim<TExtent>, alpaka::idx::Idx<TExtent>>
-#endif
-                    {
-                        TElem* pMemAcc(nullptr);
-#ifdef __HIP_PLATFORM_NVCC__
-                        ALPAKA_HIP_RT_CHECK(hipCUDAErrorTohipError(
-                            cudaGetSymbolAddress(
-                                reinterpret_cast<void **>(&pMemAcc),
-                                *pMem)));
-#else
-                        // FIXME: still does not work in HIP(HCC) (results in hipErrorNotFound)
-                        // HIP_SYMBOL(X) not useful because it only does #X on HIP(HCC), while &X on HIP(NVCC)
-                        ALPAKA_HIP_RT_CHECK(
-                            hipGetSymbolAddress(
-                                reinterpret_cast<void **>(&pMemAcc),
-                                pMem));
+#    if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
+                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(cudaGetSymbolAddress(reinterpret_cast<void**>(&pMemAcc), *pMem));
+#    else
+#        ifdef __HIP_PLATFORM_NVCC__
+                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
+                    hipCUDAErrorTohipError(cudaGetSymbolAddress(reinterpret_cast<void**>(&pMemAcc), *pMem)));
+#        else
+                // FIXME: still does not work in HIP(clang) (results in hipErrorNotFound)
+                // HIP_SYMBOL(X) not useful because it only does #X on HIP(clang), while &X on HIP(NVCC)
+                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(hipGetSymbolAddress(reinterpret_cast<void**>(&pMemAcc), pMem));
+#        endif
+#    endif
+                return alpaka::ViewPlainPtr<DevUniformCudaHipRt, TElem, alpaka::Dim<TExtent>, alpaka::Idx<TExtent>>(
+                    pMemAcc,
+                    dev,
+                    extent);
+            }
+        };
 #endif
 
-                        return
-                            alpaka::mem::view::ViewPlainPtr<
-                                dev::DevHipRt,
-                                TElem,
-                                alpaka::dim::Dim<TExtent>,
-                                alpaka::idx::Idx<TExtent>>(
-                                    pMemAcc,
-                                    dev,
-                                    extent);
-                    }
-                };
+#ifdef ALPAKA_ACC_ANY_BT_OMP5_ENABLED
+        //#############################################################################
+        //! The Omp5 device CreateStaticDevMemView trait specialization.
+        //! \todo What is this for? Does this exist in OMP5?
+        template<>
+        struct CreateStaticDevMemView<DevOmp5>
+        {
+            //-----------------------------------------------------------------------------
+            template<typename TElem, typename TExtent>
+            static auto createStaticDevMemView(TElem* pMem, DevOmp5 const& dev, TExtent const& extent)
+            {
+                return alpaka::ViewPlainPtr<DevOmp5, TElem, alpaka::Dim<TExtent>, alpaka::Idx<TExtent>>(
+                    pMem,
+                    dev,
+                    extent);
+            }
+        };
 #endif
 
-            }
-        }
-    }
-    namespace offset
-    {
-        namespace traits
+#ifdef ALPAKA_ACC_ANY_BT_OACC_ENABLED
+        //#############################################################################
+        //! The Oacc device CreateStaticDevMemView trait specialization.
+        template<>
+        struct CreateStaticDevMemView<DevOacc>
         {
-            //#############################################################################
-            //! The ViewPlainPtr offset get trait specialization.
-            template<
-                typename TIdxIntegralConst,
-                typename TDev,
-                typename TElem,
-                typename TDim,
-                typename TIdx>
-            struct GetOffset<
-                TIdxIntegralConst,
-                mem::view::ViewPlainPtr<TDev, TElem, TDim, TIdx>>
+            //-----------------------------------------------------------------------------
+            template<typename TElem, typename TExtent>
+            static auto createStaticDevMemView(TElem* pMem, DevOacc const& dev, TExtent const& extent)
             {
-                //-----------------------------------------------------------------------------
-                ALPAKA_NO_HOST_ACC_WARNING
-                ALPAKA_FN_HOST_ACC
-                static auto getOffset(
-                    mem::view::ViewPlainPtr<TDev, TElem, TDim, TIdx> const &)
-                -> TIdx
-                {
-                    return 0u;
-                }
-            };
-        }
-    }
-    namespace idx
-    {
-        namespace traits
+                return alpaka::ViewPlainPtr<DevOacc, TElem, alpaka::Dim<TExtent>, alpaka::Idx<TExtent>>(
+                    pMem,
+                    dev,
+                    extent);
+            }
+        };
+#endif
+
+        //#############################################################################
+        //! The ViewPlainPtr offset get trait specialization.
+        template<typename TIdxIntegralConst, typename TDev, typename TElem, typename TDim, typename TIdx>
+        struct GetOffset<TIdxIntegralConst, ViewPlainPtr<TDev, TElem, TDim, TIdx>>
         {
-            //#############################################################################
-            //! The ViewPlainPtr idx type trait specialization.
-            template<
-                typename TDev,
-                typename TElem,
-                typename TDim,
-                typename TIdx>
-            struct IdxType<
-                mem::view::ViewPlainPtr<TDev, TElem, TDim, TIdx>>
+            //-----------------------------------------------------------------------------
+            ALPAKA_NO_HOST_ACC_WARNING
+            ALPAKA_FN_HOST_ACC
+            static auto getOffset(ViewPlainPtr<TDev, TElem, TDim, TIdx> const&) -> TIdx
             {
-                using type = TIdx;
-            };
-        }
-    }
-}
+                return 0u;
+            }
+        };
+
+        //#############################################################################
+        //! The ViewPlainPtr idx type trait specialization.
+        template<typename TDev, typename TElem, typename TDim, typename TIdx>
+        struct IdxType<ViewPlainPtr<TDev, TElem, TDim, TIdx>>
+        {
+            using type = TIdx;
+        };
+    } // namespace traits
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/include/alpaka/mem/view/ViewStdArray.hpp b/thirdParty/cupla/alpaka/include/alpaka/mem/view/ViewStdArray.hpp
index 561f479b86..2c970f31dd 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/mem/view/ViewStdArray.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/mem/view/ViewStdArray.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Axel Huebl, Benjamin Worpitz
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -19,185 +19,115 @@
 
 namespace alpaka
 {
-    namespace dev
+    namespace traits
     {
-        namespace traits
+        //#############################################################################
+        //! The std::array device type trait specialization.
+        template<typename TElem, std::size_t Tsize>
+        struct DevType<std::array<TElem, Tsize>>
         {
-            //#############################################################################
-            //! The std::array device type trait specialization.
-            template<
-                typename TElem,
-                std::size_t Tsize>
-            struct DevType<
-                std::array<TElem, Tsize>>
-            {
-                using type = dev::DevCpu;
-            };
+            using type = DevCpu;
+        };
 
-            //#############################################################################
-            //! The std::array device get trait specialization.
-            template<
-                typename TElem,
-                std::size_t Tsize>
-            struct GetDev<
-                std::array<TElem, Tsize>>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto getDev(
-                    std::array<TElem, Tsize> const & view)
-                -> dev::DevCpu
-                {
-                    alpaka::ignore_unused(view);
-                    return pltf::getDevByIdx<pltf::PltfCpu>(0u);
-                }
-            };
-        }
-    }
-    namespace dim
-    {
-        namespace traits
+        //#############################################################################
+        //! The std::array device get trait specialization.
+        template<typename TElem, std::size_t Tsize>
+        struct GetDev<std::array<TElem, Tsize>>
         {
-            //#############################################################################
-            //! The std::array dimension getter trait specialization.
-            template<
-                typename TElem,
-                std::size_t Tsize>
-            struct DimType<
-                std::array<TElem, Tsize>>
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto getDev(std::array<TElem, Tsize> const& view) -> DevCpu
             {
-                using type = dim::DimInt<1u>;
-            };
-        }
-    }
-    namespace elem
-    {
-        namespace traits
+                alpaka::ignore_unused(view);
+                return getDevByIdx<PltfCpu>(0u);
+            }
+        };
+
+        //#############################################################################
+        //! The std::array dimension getter trait specialization.
+        template<typename TElem, std::size_t Tsize>
+        struct DimType<std::array<TElem, Tsize>>
         {
-            //#############################################################################
-            //! The std::array memory element type get trait specialization.
-            template<
-                typename TElem,
-                std::size_t Tsize>
-            struct ElemType<
-                std::array<TElem, Tsize>>
-            {
-                using type = TElem;
-            };
-        }
-    }
+            using type = DimInt<1u>;
+        };
+
+        //#############################################################################
+        //! The std::array memory element type get trait specialization.
+        template<typename TElem, std::size_t Tsize>
+        struct ElemType<std::array<TElem, Tsize>>
+        {
+            using type = TElem;
+        };
+    } // namespace traits
     namespace extent
     {
         namespace traits
         {
             //#############################################################################
             //! The std::array width get trait specialization.
-            template<
-                typename TElem,
-                std::size_t Tsize>
-            struct GetExtent<
-                dim::DimInt<0u>,
-                std::array<TElem, Tsize>>
+            template<typename TElem, std::size_t Tsize>
+            struct GetExtent<DimInt<0u>, std::array<TElem, Tsize>>
             {
                 //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static constexpr auto getExtent(
-                    std::array<TElem, Tsize> const & /*extent*/)
-                -> idx::Idx<std::array<TElem, Tsize>>
+                ALPAKA_FN_HOST static constexpr auto getExtent(std::array<TElem, Tsize> const& extent)
+                    -> Idx<std::array<TElem, Tsize>>
                 {
-                    // C++14 constexpr with void return
-                    /*alpaka::ignore_unused(extent);*/
+                    alpaka::ignore_unused(extent);
                     return Tsize;
                 }
             };
-        }
-    }
-    namespace mem
+        } // namespace traits
+    } // namespace extent
+
+    namespace traits
     {
-        namespace view
+        //#############################################################################
+        //! The std::array native pointer get trait specialization.
+        template<typename TElem, std::size_t Tsize>
+        struct GetPtrNative<std::array<TElem, Tsize>>
         {
-            namespace traits
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto getPtrNative(std::array<TElem, Tsize> const& view) -> TElem const*
             {
-                //#############################################################################
-                //! The std::array native pointer get trait specialization.
-                template<
-                    typename TElem,
-                    std::size_t Tsize>
-                struct GetPtrNative<
-                    std::array<TElem, Tsize>>
-                {
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST static auto getPtrNative(
-                        std::array<TElem, Tsize> const & view)
-                    -> TElem const *
-                    {
-                        return view.data();
-                    }
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST static auto getPtrNative(
-                        std::array<TElem, Tsize> & view)
-                    -> TElem *
-                    {
-                        return view.data();
-                    }
-                };
-
-                //#############################################################################
-                //! The std::array pitch get trait specialization.
-                template<
-                    typename TElem,
-                    std::size_t Tsize>
-                struct GetPitchBytes<
-                    dim::DimInt<0u>,
-                    std::array<TElem, Tsize>>
-                {
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST static auto getPitchBytes(
-                        std::array<TElem, Tsize> const & pitch)
-                    -> idx::Idx<std::array<TElem, Tsize>>
-                    {
-                        return sizeof(TElem) * pitch.size();
-                    }
-                };
+                return view.data();
             }
-        }
-    }
-    namespace offset
-    {
-        namespace traits
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto getPtrNative(std::array<TElem, Tsize>& view) -> TElem*
+            {
+                return view.data();
+            }
+        };
+
+        //#############################################################################
+        //! The std::array pitch get trait specialization.
+        template<typename TElem, std::size_t Tsize>
+        struct GetPitchBytes<DimInt<0u>, std::array<TElem, Tsize>>
         {
-            //#############################################################################
-            //! The std::array offset get trait specialization.
-            template<
-                typename TIdx,
-                typename TElem,
-                std::size_t Tsize>
-            struct GetOffset<
-                TIdx,
-                std::array<TElem, Tsize>>
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto getPitchBytes(std::array<TElem, Tsize> const& pitch)
+                -> Idx<std::array<TElem, Tsize>>
             {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto getOffset(
-                    std::array<TElem, Tsize> const &)
-                -> idx::Idx<std::array<TElem, Tsize>>
-                {
-                    return 0u;
-                }
-            };
-        }
-    }
-    namespace idx
-    {
-        namespace traits
+                return sizeof(TElem) * pitch.size();
+            }
+        };
+
+        //#############################################################################
+        //! The std::array offset get trait specialization.
+        template<typename TIdx, typename TElem, std::size_t Tsize>
+        struct GetOffset<TIdx, std::array<TElem, Tsize>>
         {
-            //#############################################################################
-            //! The std::vector idx type trait specialization.
-            template<
-                typename TElem,
-                std::size_t Tsize>
-            struct IdxType<
-                std::array<TElem, Tsize>>
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto getOffset(std::array<TElem, Tsize> const&) -> Idx<std::array<TElem, Tsize>>
             {
-                using type = std::size_t;
-            };
-        }
-    }
-}
+                return 0u;
+            }
+        };
+
+        //#############################################################################
+        //! The std::array idx type trait specialization.
+        template<typename TElem, std::size_t Tsize>
+        struct IdxType<std::array<TElem, Tsize>>
+        {
+            using type = std::size_t;
+        };
+    } // namespace traits
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/include/alpaka/mem/view/ViewStdVector.hpp b/thirdParty/cupla/alpaka/include/alpaka/mem/view/ViewStdVector.hpp
index 860108261f..b02ef2a921 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/mem/view/ViewStdVector.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/mem/view/ViewStdVector.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Axel Huebl, Benjamin Worpitz
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -19,183 +19,114 @@
 
 namespace alpaka
 {
-    namespace dev
+    namespace traits
     {
-        namespace traits
+        //#############################################################################
+        //! The std::vector device type trait specialization.
+        template<typename TElem, typename TAllocator>
+        struct DevType<std::vector<TElem, TAllocator>>
         {
-            //#############################################################################
-            //! The std::vector device type trait specialization.
-            template<
-                typename TElem,
-                typename TAllocator>
-            struct DevType<
-                std::vector<TElem, TAllocator>>
-            {
-                using type = dev::DevCpu;
-            };
+            using type = DevCpu;
+        };
 
-            //#############################################################################
-            //! The std::vector device get trait specialization.
-            template<
-                typename TElem,
-                typename TAllocator>
-            struct GetDev<
-                std::vector<TElem, TAllocator>>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto getDev(
-                    std::vector<TElem, TAllocator> const & view)
-                -> dev::DevCpu
-                {
-                    alpaka::ignore_unused(view);
-                    return pltf::getDevByIdx<pltf::PltfCpu>(0u);
-                }
-            };
-        }
-    }
-    namespace dim
-    {
-        namespace traits
+        //#############################################################################
+        //! The std::vector device get trait specialization.
+        template<typename TElem, typename TAllocator>
+        struct GetDev<std::vector<TElem, TAllocator>>
         {
-            //#############################################################################
-            //! The std::vector dimension getter trait specialization.
-            template<
-                typename TElem,
-                typename TAllocator>
-            struct DimType<
-                std::vector<TElem, TAllocator>>
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto getDev(std::vector<TElem, TAllocator> const& view) -> DevCpu
             {
-                using type = dim::DimInt<1u>;
-            };
-        }
-    }
-    namespace elem
-    {
-        namespace traits
+                alpaka::ignore_unused(view);
+                return getDevByIdx<PltfCpu>(0u);
+            }
+        };
+
+        //#############################################################################
+        //! The std::vector dimension getter trait specialization.
+        template<typename TElem, typename TAllocator>
+        struct DimType<std::vector<TElem, TAllocator>>
         {
-            //#############################################################################
-            //! The std::vector memory element type get trait specialization.
-            template<
-                typename TElem,
-                typename TAllocator>
-            struct ElemType<
-                std::vector<TElem, TAllocator>>
-            {
-                using type = TElem;
-            };
-        }
-    }
+            using type = DimInt<1u>;
+        };
+
+        //#############################################################################
+        //! The std::vector memory element type get trait specialization.
+        template<typename TElem, typename TAllocator>
+        struct ElemType<std::vector<TElem, TAllocator>>
+        {
+            using type = TElem;
+        };
+    } // namespace traits
     namespace extent
     {
         namespace traits
         {
             //#############################################################################
             //! The std::vector width get trait specialization.
-            template<
-                typename TElem,
-                typename TAllocator>
-            struct GetExtent<
-                dim::DimInt<0u>,
-                std::vector<TElem, TAllocator>>
+            template<typename TElem, typename TAllocator>
+            struct GetExtent<DimInt<0u>, std::vector<TElem, TAllocator>>
             {
                 //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto getExtent(
-                    std::vector<TElem, TAllocator> const & extent)
-                -> idx::Idx<std::vector<TElem, TAllocator>>
+                ALPAKA_FN_HOST static auto getExtent(std::vector<TElem, TAllocator> const& extent)
+                    -> Idx<std::vector<TElem, TAllocator>>
                 {
                     return extent.size();
                 }
             };
-        }
-    }
-    namespace mem
+        } // namespace traits
+    } // namespace extent
+    namespace traits
     {
-        namespace view
+        //#############################################################################
+        //! The std::vector native pointer get trait specialization.
+        template<typename TElem, typename TAllocator>
+        struct GetPtrNative<std::vector<TElem, TAllocator>>
         {
-            namespace traits
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto getPtrNative(std::vector<TElem, TAllocator> const& view) -> TElem const*
             {
-                //#############################################################################
-                //! The std::vector native pointer get trait specialization.
-                template<
-                    typename TElem,
-                    typename TAllocator>
-                struct GetPtrNative<
-                    std::vector<TElem, TAllocator>>
-                {
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST static auto getPtrNative(
-                        std::vector<TElem, TAllocator> const & view)
-                    -> TElem const *
-                    {
-                        return view.data();
-                    }
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST static auto getPtrNative(
-                        std::vector<TElem, TAllocator> & view)
-                    -> TElem *
-                    {
-                        return view.data();
-                    }
-                };
-
-                //#############################################################################
-                //! The std::vector pitch get trait specialization.
-                template<
-                    typename TElem,
-                    typename TAllocator>
-                struct GetPitchBytes<
-                    dim::DimInt<0u>,
-                    std::vector<TElem, TAllocator>>
-                {
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST static auto getPitchBytes(
-                        std::vector<TElem, TAllocator> const & pitch)
-                    -> idx::Idx<std::vector<TElem, TAllocator>>
-                    {
-                        return sizeof(TElem) * pitch.size();
-                    }
-                };
+                return view.data();
             }
-        }
-    }
-    namespace offset
-    {
-        namespace traits
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto getPtrNative(std::vector<TElem, TAllocator>& view) -> TElem*
+            {
+                return view.data();
+            }
+        };
+
+        //#############################################################################
+        //! The std::vector pitch get trait specialization.
+        template<typename TElem, typename TAllocator>
+        struct GetPitchBytes<DimInt<0u>, std::vector<TElem, TAllocator>>
         {
-            //#############################################################################
-            //! The std::vector offset get trait specialization.
-            template<
-                typename TIdx,
-                typename TElem,
-                typename TAllocator>
-            struct GetOffset<
-                TIdx,
-                std::vector<TElem, TAllocator>>
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto getPitchBytes(std::vector<TElem, TAllocator> const& pitch)
+                -> Idx<std::vector<TElem, TAllocator>>
             {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto getOffset(
-                    std::vector<TElem, TAllocator> const &)
-                -> idx::Idx<std::vector<TElem, TAllocator>>
-                {
-                    return 0u;
-                }
-            };
-        }
-    }
-    namespace idx
-    {
-        namespace traits
+                return sizeof(TElem) * pitch.size();
+            }
+        };
+
+        //#############################################################################
+        //! The std::vector offset get trait specialization.
+        template<typename TIdx, typename TElem, typename TAllocator>
+        struct GetOffset<TIdx, std::vector<TElem, TAllocator>>
         {
-            //#############################################################################
-            //! The std::vector idx type trait specialization.
-            template<
-                typename TElem,
-                typename TAllocator>
-            struct IdxType<
-                std::vector<TElem, TAllocator>>
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto getOffset(std::vector<TElem, TAllocator> const&)
+                -> Idx<std::vector<TElem, TAllocator>>
             {
-                using type = std::size_t;
-            };
-        }
-    }
-}
+                return 0u;
+            }
+        };
+
+        //#############################################################################
+        //! The std::vector idx type trait specialization.
+        template<typename TElem, typename TAllocator>
+        struct IdxType<std::vector<TElem, TAllocator>>
+        {
+            using type = std::size_t;
+        };
+    } // namespace traits
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/include/alpaka/mem/view/ViewSubView.hpp b/thirdParty/cupla/alpaka/include/alpaka/mem/view/ViewSubView.hpp
index 57159cadf1..190c606034 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/mem/view/ViewSubView.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/mem/view/ViewSubView.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Benjamin Worpitz, Matthias Werner
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -9,431 +9,289 @@
 
 #pragma once
 
-#include <alpaka/dim/Traits.hpp>
+#include <alpaka/core/Assert.hpp>
+#include <alpaka/core/Common.hpp>
 #include <alpaka/dev/Traits.hpp>
+#include <alpaka/dim/Traits.hpp>
 #include <alpaka/extent/Traits.hpp>
-#include <alpaka/mem/view/Traits.hpp>
-#include <alpaka/offset/Traits.hpp>
 #include <alpaka/idx/Traits.hpp>
-
+#include <alpaka/mem/view/Traits.hpp>
 #include <alpaka/mem/view/ViewPlainPtr.hpp>
+#include <alpaka/offset/Traits.hpp>
 #include <alpaka/vec/Vec.hpp>
 
-#include <alpaka/core/Assert.hpp>
-#include <alpaka/core/Common.hpp>
-
 #include <type_traits>
+#include <utility>
 
 namespace alpaka
 {
-    namespace mem
+    //#############################################################################
+    //! A sub-view to a view.
+    template<typename TDev, typename TElem, typename TDim, typename TIdx>
+    class ViewSubView
     {
-        namespace view
-        {
-            //#############################################################################
-            //! A sub-view to a view.
-            template<
-                typename TDev,
-                typename TElem,
-                typename TDim,
-                typename TIdx>
-            class ViewSubView
-            {
-                static_assert(
-                    !std::is_const<TIdx>::value,
-                    "The idx type of the view can not be const!");
-            public:
-                //-----------------------------------------------------------------------------
-                //! Constructor.
-                //! \param view The view this view is a sub-view of.
-                //! \param extentElements The extent in elements.
-                //! \param relativeOffsetsElements The offsets in elements.
-                template<
-                    typename TView,
-                    typename TOffsets,
-                    typename TExtent>
-                ViewSubView(
-                    TView const & view,
-                    TExtent const & extentElements,
-                    TOffsets const & relativeOffsetsElements = TOffsets()) :
-                        m_viewParentView(
-                            mem::view::getPtrNative(view),
-                            dev::getDev(view),
-                            extent::getExtentVec(view),
-                            mem::view::getPitchBytesVec(view)),
-                        m_extentElements(extent::getExtentVec(extentElements)),
-                        m_offsetsElements(offset::getOffsetVec(relativeOffsetsElements))
-                {
-                    ALPAKA_DEBUG_FULL_LOG_SCOPE;
+        static_assert(!std::is_const<TIdx>::value, "The idx type of the view can not be const!");
 
-                    static_assert(
-                        std::is_same<TDev, dev::Dev<TView>>::value,
-                        "The dev type of TView and the TDev template parameter have to be identical!");
+        using Dev = alpaka::Dev<TDev>;
 
-                    static_assert(
-                        std::is_same<TIdx, idx::Idx<TView>>::value,
-                        "The idx type of TView and the TIdx template parameter have to be identical!");
-                    static_assert(
-                        std::is_same<TIdx, idx::Idx<TExtent>>::value,
-                        "The idx type of TExtent and the TIdx template parameter have to be identical!");
-                    static_assert(
-                        std::is_same<TIdx, idx::Idx<TOffsets>>::value,
-                        "The idx type of TOffsets and the TIdx template parameter have to be identical!");
+    public:
+        //-----------------------------------------------------------------------------
+        //! Constructor wrapping a const parent view.
+        //! \param view The view this view is a sub-view of.
+        //! \param extentElements The extent in elements.
+        //! \param relativeOffsetsElements The offsets in elements.
+        template<typename TView, typename TOffsets, typename TExtent>
+        ViewSubView(
+            TView const& view,
+            TExtent const& extentElements,
+            TOffsets const& relativeOffsetsElements = TOffsets())
+            : m_viewParentView(getPtrNative(view), getDev(view), extent::getExtentVec(view), getPitchBytesVec(view))
+            , m_extentElements(extent::getExtentVec(extentElements))
+            , m_offsetsElements(getOffsetVec(relativeOffsetsElements))
+        {
+            ALPAKA_DEBUG_FULL_LOG_SCOPE;
 
-                    static_assert(
-                        std::is_same<TDim, dim::Dim<TView>>::value,
-                        "The dim type of TView and the TDim template parameter have to be identical!");
-                    static_assert(
-                        std::is_same<TDim, dim::Dim<TExtent>>::value,
-                        "The dim type of TExtent and the TDim template parameter have to be identical!");
-                    static_assert(
-                        std::is_same<TDim, dim::Dim<TOffsets>>::value,
-                        "The dim type of TOffsets and the TDim template parameter have to be identical!");
+            static_assert(
+                std::is_same<Dev, alpaka::Dev<TView>>::value,
+                "The dev type of TView and the Dev template parameter have to be identical!");
 
-                    ALPAKA_ASSERT(((m_offsetsElements + m_extentElements) <= extent::getExtentVec(view)).foldrAll(std::logical_and<bool>()));
-                }
-                //-----------------------------------------------------------------------------
-                //! Constructor.
-                //! \param view The view this view is a sub-view of.
-                //! \param extentElements The extent in elements.
-                //! \param relativeOffsetsElements The offsets in elements.
-                template<
-                    typename TView,
-                    typename TOffsets,
-                    typename TExtent>
-                ViewSubView(
-                    TView & view,
-                    TExtent const & extentElements,
-                    TOffsets const & relativeOffsetsElements = TOffsets()) :
-                        m_viewParentView(
-                            mem::view::getPtrNative(view),
-                            dev::getDev(view),
-                            extent::getExtentVec(view),
-                            mem::view::getPitchBytesVec(view)),
-                        m_extentElements(extent::getExtentVec(extentElements)),
-                        m_offsetsElements(offset::getOffsetVec(relativeOffsetsElements))
-                {
-                    ALPAKA_DEBUG_FULL_LOG_SCOPE;
+            static_assert(
+                std::is_same<TIdx, Idx<TView>>::value,
+                "The idx type of TView and the TIdx template parameter have to be identical!");
+            static_assert(
+                std::is_same<TIdx, Idx<TExtent>>::value,
+                "The idx type of TExtent and the TIdx template parameter have to be identical!");
+            static_assert(
+                std::is_same<TIdx, Idx<TOffsets>>::value,
+                "The idx type of TOffsets and the TIdx template parameter have to be identical!");
 
-                    static_assert(
-                        std::is_same<TDev, dev::Dev<TView>>::value,
-                        "The dev type of TView and the TDev template parameter have to be identical!");
+            static_assert(
+                std::is_same<TDim, Dim<TView>>::value,
+                "The dim type of TView and the TDim template parameter have to be identical!");
+            static_assert(
+                std::is_same<TDim, Dim<TExtent>>::value,
+                "The dim type of TExtent and the TDim template parameter have to be identical!");
+            static_assert(
+                std::is_same<TDim, Dim<TOffsets>>::value,
+                "The dim type of TOffsets and the TDim template parameter have to be identical!");
 
-                    static_assert(
-                        std::is_same<TIdx, idx::Idx<TView>>::value,
-                        "The idx type of TView and the TIdx template parameter have to be identical!");
-                    static_assert(
-                        std::is_same<TIdx, idx::Idx<TExtent>>::value,
-                        "The idx type of TExtent and the TIdx template parameter have to be identical!");
-                    static_assert(
-                        std::is_same<TIdx, idx::Idx<TOffsets>>::value,
-                        "The idx type of TOffsets and the TIdx template parameter have to be identical!");
+            ALPAKA_ASSERT(((m_offsetsElements + m_extentElements) <= extent::getExtentVec(view))
+                              .foldrAll(std::logical_and<bool>()));
+        }
+        //-----------------------------------------------------------------------------
+        //! Constructor wrapping a non-const parent view.
+        //! \param view The view this view is a sub-view of.
+        //! \param extentElements The extent in elements.
+        //! \param relativeOffsetsElements The offsets in elements.
+        template<typename TView, typename TOffsets, typename TExtent>
+        ViewSubView(TView& view, TExtent const& extentElements, TOffsets const& relativeOffsetsElements = TOffsets())
+            : m_viewParentView(getPtrNative(view), getDev(view), extent::getExtentVec(view), getPitchBytesVec(view))
+            , m_extentElements(extent::getExtentVec(extentElements))
+            , m_offsetsElements(getOffsetVec(relativeOffsetsElements))
+        {
+            ALPAKA_DEBUG_FULL_LOG_SCOPE;
 
-                    static_assert(
-                        std::is_same<TDim, dim::Dim<TView>>::value,
-                        "The dim type of TView and the TDim template parameter have to be identical!");
-                    static_assert(
-                        std::is_same<TDim, dim::Dim<TExtent>>::value,
-                        "The dim type of TExtent and the TDim template parameter have to be identical!");
-                    static_assert(
-                        std::is_same<TDim, dim::Dim<TOffsets>>::value,
-                        "The dim type of TOffsets and the TDim template parameter have to be identical!");
+            static_assert(
+                std::is_same<Dev, alpaka::Dev<TView>>::value,
+                "The dev type of TView and the Dev template parameter have to be identical!");
 
-                    ALPAKA_ASSERT(((m_offsetsElements + m_extentElements) <= extent::getExtentVec(view)).foldrAll(std::logical_and<bool>()));
-                }
+            static_assert(
+                std::is_same<TIdx, Idx<TView>>::value,
+                "The idx type of TView and the TIdx template parameter have to be identical!");
+            static_assert(
+                std::is_same<TIdx, Idx<TExtent>>::value,
+                "The idx type of TExtent and the TIdx template parameter have to be identical!");
+            static_assert(
+                std::is_same<TIdx, Idx<TOffsets>>::value,
+                "The idx type of TOffsets and the TIdx template parameter have to be identical!");
 
-                //-----------------------------------------------------------------------------
-                //! \param view The view this view is a sub-view of.
-                template<
-                    typename TView>
-                ViewSubView(
-                    TView const & view) :
-                        ViewSubView(
-                            view,
-                            view,
-                            vec::Vec<TDim, TIdx>::all(0))
-                {
-                    ALPAKA_DEBUG_FULL_LOG_SCOPE;
-                }
+            static_assert(
+                std::is_same<TDim, Dim<TView>>::value,
+                "The dim type of TView and the TDim template parameter have to be identical!");
+            static_assert(
+                std::is_same<TDim, Dim<TExtent>>::value,
+                "The dim type of TExtent and the TDim template parameter have to be identical!");
+            static_assert(
+                std::is_same<TDim, Dim<TOffsets>>::value,
+                "The dim type of TOffsets and the TDim template parameter have to be identical!");
 
-                //-----------------------------------------------------------------------------
-                //! \param view The view this view is a sub-view of.
-                template<
-                    typename TView>
-                ViewSubView(
-                    TView & view) :
-                        ViewSubView(
-                            view,
-                            view,
-                            vec::Vec<TDim, TIdx>::all(0))
-                {
-                    ALPAKA_DEBUG_FULL_LOG_SCOPE;
-                }
+            ALPAKA_ASSERT(((m_offsetsElements + m_extentElements) <= extent::getExtentVec(view))
+                              .foldrAll(std::logical_and<bool>()));
+        }
 
-            public:
-                mem::view::ViewPlainPtr<TDev, TElem, TDim, TIdx> m_viewParentView; // This wraps the parent view.
-                vec::Vec<TDim, TIdx> m_extentElements;     // The extent of this view.
-                vec::Vec<TDim, TIdx> m_offsetsElements;    // The offset relative to the parent view.
-            };
+        //-----------------------------------------------------------------------------
+        //! \param view The const view that this sub-view covers entirely (full extent, zero offsets).
+        template<typename TView>
+        explicit ViewSubView(TView const& view) : ViewSubView(view, view, Vec<TDim, TIdx>::all(0))
+        {
+            ALPAKA_DEBUG_FULL_LOG_SCOPE;
         }
-    }
+
+        //-----------------------------------------------------------------------------
+        //! \param view The non-const view that this sub-view covers entirely (full extent, zero offsets).
+        template<typename TView>
+        explicit ViewSubView(TView& view) : ViewSubView(view, view, Vec<TDim, TIdx>::all(0))
+        {
+            ALPAKA_DEBUG_FULL_LOG_SCOPE;
+        }
+
+    public:
+        ViewPlainPtr<Dev, TElem, TDim, TIdx> m_viewParentView; // This wraps the parent view.
+        Vec<TDim, TIdx> m_extentElements; // The extent of this view.
+        Vec<TDim, TIdx> m_offsetsElements; // The offset relative to the parent view.
+    };
 
     //-----------------------------------------------------------------------------
     // Trait specializations for ViewSubView.
-    namespace dev
+    namespace traits
     {
-        namespace traits
+        //#############################################################################
+        //! The ViewSubView device type trait specialization.
+        template<typename TElem, typename TDim, typename TDev, typename TIdx>
+        struct DevType<ViewSubView<TDev, TElem, TDim, TIdx>>
         {
-            //#############################################################################
-            //! The ViewSubView device type trait specialization.
-            template<
-                typename TElem,
-                typename TDim,
-                typename TDev,
-                typename TIdx>
-            struct DevType<
-                mem::view::ViewSubView<TDev, TElem, TDim, TIdx>>
-            {
-                using type = TDev;
-            };
+            using type = alpaka::Dev<TDev>;
+        };
 
-            //#############################################################################
-            //! The ViewSubView device get trait specialization.
-            template<
-                typename TElem,
-                typename TDim,
-                typename TDev,
-                typename TIdx>
-            struct GetDev<
-                mem::view::ViewSubView<TDev, TElem, TDim, TIdx>>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto getDev(
-                    mem::view::ViewSubView<TDev, TElem, TDim, TIdx> const & view)
-                -> TDev
-                {
-                    return
-                        dev::getDev(
-                            view.m_viewParentView);
-                }
-            };
-        }
-    }
-    namespace dim
-    {
-        namespace traits
+        //#############################################################################
+        //! The ViewSubView device get trait specialization.
+        template<typename TElem, typename TDim, typename TDev, typename TIdx>
+        struct GetDev<ViewSubView<TDev, TElem, TDim, TIdx>>
         {
-            //#############################################################################
-            //! The ViewSubView dimension getter trait specialization.
-            template<
-                typename TElem,
-                typename TDim,
-                typename TDev,
-                typename TIdx>
-            struct DimType<
-                mem::view::ViewSubView<TDev, TElem, TDim, TIdx>>
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto getDev(ViewSubView<TDev, TElem, TDim, TIdx> const& view) -> alpaka::Dev<TDev>
             {
-                using type = TDim;
-            };
-        }
-    }
-    namespace elem
-    {
-        namespace traits
+                return alpaka::getDev(view.m_viewParentView);
+            }
+        };
+
+        //#############################################################################
+        //! The ViewSubView dimension getter trait specialization.
+        template<typename TElem, typename TDim, typename TDev, typename TIdx>
+        struct DimType<ViewSubView<TDev, TElem, TDim, TIdx>>
         {
-            //#############################################################################
-            //! The ViewSubView memory element type get trait specialization.
-            template<
-                typename TElem,
-                typename TDim,
-                typename TDev,
-                typename TIdx>
-            struct ElemType<
-                mem::view::ViewSubView<TDev, TElem, TDim, TIdx>>
-            {
-                using type = TElem;
-            };
-        }
-    }
+            using type = TDim;
+        };
+
+        //#############################################################################
+        //! The ViewSubView memory element type get trait specialization.
+        template<typename TElem, typename TDim, typename TDev, typename TIdx>
+        struct ElemType<ViewSubView<TDev, TElem, TDim, TIdx>>
+        {
+            using type = TElem;
+        };
+    } // namespace traits
     namespace extent
     {
         namespace traits
         {
             //#############################################################################
             //! The ViewSubView width get trait specialization.
-            template<
-                typename TIdxIntegralConst,
-                typename TElem,
-                typename TDim,
-                typename TDev,
-                typename TIdx>
+            template<typename TIdxIntegralConst, typename TElem, typename TDim, typename TDev, typename TIdx>
             struct GetExtent<
                 TIdxIntegralConst,
-                mem::view::ViewSubView<TDev, TElem, TDim, TIdx>,
-                typename std::enable_if<(TDim::value > TIdxIntegralConst::value)>::type>
+                ViewSubView<TDev, TElem, TDim, TIdx>,
+                std::enable_if_t<(TDim::value > TIdxIntegralConst::value)>>
             {
                 //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto getExtent(
-                    mem::view::ViewSubView<TDev, TElem, TDim, TIdx> const & extent)
-                -> TIdx
+                ALPAKA_FN_HOST static auto getExtent(ViewSubView<TDev, TElem, TDim, TIdx> const& extent) -> TIdx
                 {
                     return extent.m_extentElements[TIdxIntegralConst::value];
                 }
             };
-        }
-    }
-    namespace mem
+        } // namespace traits
+    } // namespace extent
+    namespace traits
     {
-        namespace view
-        {
-            namespace traits
-            {
 #if BOOST_COMP_GNUC
-    #pragma GCC diagnostic push
-    #pragma GCC diagnostic ignored "-Wcast-align" // "cast from 'std::uint8_t*' to 'TElem*' increases required alignment of target type"
+#    pragma GCC diagnostic push
+#    pragma GCC diagnostic ignored                                                                                    \
+        "-Wcast-align" // "cast from 'std::uint8_t*' to 'TElem*' increases required alignment of target type"
 #endif
-                //#############################################################################
-                //! The ViewSubView native pointer get trait specialization.
-                template<
-                    typename TElem,
-                    typename TDim,
-                    typename TDev,
-                    typename TIdx>
-                struct GetPtrNative<
-                    mem::view::ViewSubView<TDev, TElem, TDim, TIdx>>
-                {
-                private:
-                    using IdxSequence = meta::MakeIntegerSequence<std::size_t, TDim::value>;
-                public:
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST static auto getPtrNative(
-                        mem::view::ViewSubView<TDev, TElem, TDim, TIdx> const & view)
-                    -> TElem const *
-                    {
-                        // \TODO: pre-calculate this pointer for faster execution.
-                        return
-                            reinterpret_cast<TElem const *>(
-                                reinterpret_cast<std::uint8_t const *>(mem::view::getPtrNative(view.m_viewParentView))
-                                + pitchedOffsetBytes(view, IdxSequence()));
-                    }
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST static auto getPtrNative(
-                        mem::view::ViewSubView<TDev, TElem, TDim, TIdx> & view)
-                    -> TElem *
-                    {
-                        // \TODO: pre-calculate this pointer for faster execution.
-                        return
-                            reinterpret_cast<TElem *>(
-                                reinterpret_cast<std::uint8_t *>(mem::view::getPtrNative(view.m_viewParentView))
-                                + pitchedOffsetBytes(view, IdxSequence()));
-                    }
+        //#############################################################################
+        //! The ViewSubView native pointer get trait specialization.
+        template<typename TElem, typename TDim, typename TDev, typename TIdx>
+        struct GetPtrNative<ViewSubView<TDev, TElem, TDim, TIdx>>
+        {
+        private:
+            using IdxSequence = std::make_index_sequence<TDim::value>;
 
-                private:
-                    //-----------------------------------------------------------------------------
-                    //! For a 3D vector this calculates:
-                    //!
-                    //! offset::getOffset<0u>(view) * mem::view::getPitchBytes<1u>(view)
-                    //! + offset::getOffset<1u>(view) * mem::view::getPitchBytes<2u>(view)
-                    //! + offset::getOffset<2u>(view) * mem::view::getPitchBytes<3u>(view)
-                    //! while mem::view::getPitchBytes<3u>(view) is equivalent to sizeof(TElem)
-                    template<
-                        typename TView,
-                        std::size_t... TIndices>
-                    ALPAKA_FN_HOST static auto pitchedOffsetBytes(
-                        TView const & view,
-                        meta::IntegerSequence<std::size_t, TIndices...> const &)
-                    -> TIdx
-                    {
-                        return
-                            meta::foldr(
-                                std::plus<TIdx>(),
-                                pitchedOffsetBytesDim<TIndices>(view)...);
-                    }
-                    //-----------------------------------------------------------------------------
-                    template<
-                        std::size_t Tidx,
-                        typename TView>
-                    ALPAKA_FN_HOST static auto pitchedOffsetBytesDim(
-                        TView const & view)
-                    -> TIdx
-                    {
-                        return
-                            offset::getOffset<Tidx>(view)
-                            * mem::view::getPitchBytes<Tidx + 1u>(view);
-                    }
-                };
+        public:
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto getPtrNative(ViewSubView<TDev, TElem, TDim, TIdx> const& view) -> TElem const*
+            {
+                // \TODO: pre-calculate this pointer for faster execution.
+                return reinterpret_cast<TElem const*>(
+                    reinterpret_cast<std::uint8_t const*>(alpaka::getPtrNative(view.m_viewParentView))
+                    + pitchedOffsetBytes(view, IdxSequence()));
+            }
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto getPtrNative(ViewSubView<TDev, TElem, TDim, TIdx>& view) -> TElem*
+            {
+                // \TODO: pre-calculate this pointer for faster execution.
+                return reinterpret_cast<TElem*>(
+                    reinterpret_cast<std::uint8_t*>(alpaka::getPtrNative(view.m_viewParentView))
+                    + pitchedOffsetBytes(view, IdxSequence()));
+            }
+
+        private:
+            //-----------------------------------------------------------------------------
+            //! For a 3D vector this calculates:
+            //!
+            //! getOffset<0u>(view) * getPitchBytes<1u>(view)
+            //! + getOffset<1u>(view) * getPitchBytes<2u>(view)
+            //! + getOffset<2u>(view) * getPitchBytes<3u>(view)
+            //! while getPitchBytes<3u>(view) is equivalent to sizeof(TElem)
+            template<typename TView, std::size_t... TIndices>
+            ALPAKA_FN_HOST static auto pitchedOffsetBytes(TView const& view, std::index_sequence<TIndices...> const&)
+                -> TIdx
+            {
+                return meta::foldr(std::plus<TIdx>(), pitchedOffsetBytesDim<TIndices>(view)...);
+            }
+            //-----------------------------------------------------------------------------
+            template<std::size_t Tidx, typename TView>
+            ALPAKA_FN_HOST static auto pitchedOffsetBytesDim(TView const& view) -> TIdx
+            {
+                return getOffset<Tidx>(view) * getPitchBytes<Tidx + 1u>(view);
+            }
+        };
 #if BOOST_COMP_GNUC
-    #pragma GCC diagnostic pop
+#    pragma GCC diagnostic pop
 #endif
 
-                //#############################################################################
-                //! The ViewSubView pitch get trait specialization.
-                template<
-                    typename TIdxIntegralConst,
-                    typename TDev,
-                    typename TElem,
-                    typename TDim,
-                    typename TIdx>
-                struct GetPitchBytes<
-                    TIdxIntegralConst,
-                    mem::view::ViewSubView<TDev, TElem, TDim, TIdx>>
-                {
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST static auto getPitchBytes(
-                        mem::view::ViewSubView<TDev, TElem, TDim, TIdx> const & view)
-                    -> TIdx
-                    {
-                        return
-                            mem::view::getPitchBytes<TIdxIntegralConst::value>(
-                                view.m_viewParentView);
-                    }
-                };
-            }
-        }
-    }
-    namespace offset
-    {
-        namespace traits
+        //#############################################################################
+        //! The ViewSubView pitch get trait specialization.
+        template<typename TIdxIntegralConst, typename TDev, typename TElem, typename TDim, typename TIdx>
+        struct GetPitchBytes<TIdxIntegralConst, ViewSubView<TDev, TElem, TDim, TIdx>>
         {
-            //#############################################################################
-            //! The ViewSubView x offset get trait specialization.
-            template<
-                typename TIdxIntegralConst,
-                typename TElem,
-                typename TDim,
-                typename TDev,
-                typename TIdx>
-            struct GetOffset<
-                TIdxIntegralConst,
-                mem::view::ViewSubView<TDev, TElem, TDim, TIdx>,
-                typename std::enable_if<(TDim::value > TIdxIntegralConst::value)>::type>
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto getPitchBytes(ViewSubView<TDev, TElem, TDim, TIdx> const& view) -> TIdx
             {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto getOffset(
-                    mem::view::ViewSubView<TDev, TElem, TDim, TIdx> const & offset)
-                -> TIdx
-                {
-                    return offset.m_offsetsElements[TIdxIntegralConst::value];
-                }
-            };
-        }
-    }
-    namespace idx
-    {
-        namespace traits
+                return alpaka::getPitchBytes<TIdxIntegralConst::value>(view.m_viewParentView);
+            }
+        };
+
+        //#############################################################################
+        //! The ViewSubView offset get trait specialization for dimension indices below TDim.
+        template<typename TIdxIntegralConst, typename TElem, typename TDim, typename TDev, typename TIdx>
+        struct GetOffset<
+            TIdxIntegralConst,
+            ViewSubView<TDev, TElem, TDim, TIdx>,
+            std::enable_if_t<(TDim::value > TIdxIntegralConst::value)>>
         {
-            //#############################################################################
-            //! The ViewSubView idx type trait specialization.
-            template<
-                typename TElem,
-                typename TDim,
-                typename TDev,
-                typename TIdx>
-            struct IdxType<
-                mem::view::ViewSubView<TDev, TElem, TDim, TIdx>>
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto getOffset(ViewSubView<TDev, TElem, TDim, TIdx> const& offset) -> TIdx
             {
-                using type = TIdx;
-            };
-        }
-    }
-}
+                return offset.m_offsetsElements[TIdxIntegralConst::value];
+            }
+        };
+
+        //#############################################################################
+        //! The ViewSubView idx type trait specialization.
+        template<typename TElem, typename TDim, typename TDev, typename TIdx>
+        struct IdxType<ViewSubView<TDev, TElem, TDim, TIdx>>
+        {
+            using type = TIdx;
+        };
+    } // namespace traits
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/include/alpaka/meta/Apply.hpp b/thirdParty/cupla/alpaka/include/alpaka/meta/Apply.hpp
index c7d4e7908a..85e624c348 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/meta/Apply.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/meta/Apply.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Benjamin Worpitz
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -16,27 +16,17 @@ namespace alpaka
         namespace detail
         {
             //#############################################################################
-            template<
-                typename TList,
-                template<typename...> class TApplicant>
+            template<typename TList, template<typename...> class TApplicant>
             struct ApplyImpl;
             //#############################################################################
-            template<
-                template<typename...> class TList,
-                template<typename...> class TApplicant,
-                typename... T>
-            struct ApplyImpl<
-                TList<T...>,
-                TApplicant>
+            template<template<typename...> class TList, template<typename...> class TApplicant, typename... T>
+            struct ApplyImpl<TList<T...>, TApplicant>
             {
-                using type =
-                    TApplicant<T...>;
+                using type = TApplicant<T...>;
             };
-        }
+        } // namespace detail
         //#############################################################################
-        template<
-            typename TList,
-            template<typename...> class TApplicant>
+        template<typename TList, template<typename...> class TApplicant>
         using Apply = typename detail::ApplyImpl<TList, TApplicant>::type;
-    }
-}
+    } // namespace meta
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/include/alpaka/meta/ApplyTuple.hpp b/thirdParty/cupla/alpaka/include/alpaka/meta/ApplyTuple.hpp
index 223a30909d..ba82ae2404 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/meta/ApplyTuple.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/meta/ApplyTuple.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Axel Huebl, Benjamin Worpitz
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -12,13 +12,9 @@
 #include <alpaka/core/Common.hpp>
 #include <alpaka/core/Unused.hpp>
 
-#include <alpaka/meta/IntegerSequence.hpp>
-
-#include <boost/config.hpp>
-
-#include <utility>
 #include <tuple>
 #include <type_traits>
+#include <utility>
 
 namespace alpaka
 {
@@ -29,46 +25,40 @@ namespace alpaka
         namespace detail
         {
             template<class F, class... Args>
-            inline auto invoke_impl(F && f, Args &&... args)
-            -> decltype(std::forward<F>(f)(std::forward<Args>(args)...))
+            inline auto invoke_impl(F&& f, Args&&... args) -> decltype(std::forward<F>(f)(std::forward<Args>(args)...))
             {
                 return std::forward<F>(f)(std::forward<Args>(args)...);
             }
 
             template<class Base, class T, class Derived>
-            inline auto invoke_impl(T Base::*pmd, Derived && ref)
-            -> decltype(std::forward<Derived>(ref).*pmd)
+            inline auto invoke_impl(T Base::*pmd, Derived&& ref) -> decltype(std::forward<Derived>(ref).*pmd)
             {
                 return std::forward<Derived>(ref).*pmd;
             }
 
             template<class PMD, class Pointer>
-            inline auto invoke_impl(PMD pmd, Pointer && ptr)
-            -> decltype((*std::forward<Pointer>(ptr)).*pmd)
+            inline auto invoke_impl(PMD pmd, Pointer&& ptr) -> decltype((*std::forward<Pointer>(ptr)).*pmd)
             {
                 return (*std::forward<Pointer>(ptr)).*pmd;
             }
 
             template<class Base, class T, class Derived, class... Args>
-            inline auto invoke_impl(T Base::*pmf, Derived && ref, Args &&... args)
-            -> decltype((std::forward<Derived>(ref).*pmf)(std::forward<Args>(args)...))
+            inline auto invoke_impl(T Base::*pmf, Derived&& ref, Args&&... args)
+                -> decltype((std::forward<Derived>(ref).*pmf)(std::forward<Args>(args)...))
             {
                 return (std::forward<Derived>(ref).*pmf)(std::forward<Args>(args)...);
             }
 
             template<class PMF, class Pointer, class... Args>
-            inline auto invoke_impl(PMF pmf, Pointer && ptr, Args &&... args)
-            -> decltype(((*std::forward<Pointer>(ptr)).*pmf)(std::forward<Args>(args)...))
+            inline auto invoke_impl(PMF pmf, Pointer&& ptr, Args&&... args)
+                -> decltype(((*std::forward<Pointer>(ptr)).*pmf)(std::forward<Args>(args)...))
             {
                 return ((*std::forward<Pointer>(ptr)).*pmf)(std::forward<Args>(args)...);
             }
-        }
+        } // namespace detail
 
-        template< class F, class... ArgTypes>
-        auto invoke(F && f, ArgTypes &&... args)
-#ifdef BOOST_NO_CXX14_RETURN_TYPE_DEDUCTION
-        -> decltype(detail::invoke_impl(std::forward<F>(f), std::forward<ArgTypes>(args)...))
-#endif
+        template<class F, class... ArgTypes>
+        auto invoke(F&& f, ArgTypes&&... args)
         {
             return detail::invoke_impl(std::forward<F>(f), std::forward<ArgTypes>(args)...);
         }
@@ -78,39 +68,22 @@ namespace alpaka
         namespace detail
         {
             template<class F, class Tuple, std::size_t... I>
-            auto apply_impl( F && f, Tuple && t, meta::IndexSequence<I...> )
-#ifdef BOOST_NO_CXX14_RETURN_TYPE_DEDUCTION
-            -> decltype(
-                meta::invoke(
-                    std::forward<F>(f),
-                    std::get<I>(std::forward<Tuple>(t))...))
-#endif
+            auto apply_impl(F&& f, Tuple&& t, std::index_sequence<I...>)
             {
-                // If the the index sequence is empty, t will not be used at all.
+                // If the index sequence is empty, t will not be used at all.
                 alpaka::ignore_unused(t);
 
-                return
-                    meta::invoke(
-                        std::forward<F>(f),
-                        std::get<I>(std::forward<Tuple>(t))...);
+                return meta::invoke(std::forward<F>(f), std::get<I>(std::forward<Tuple>(t))...);
             }
-        }
+        } // namespace detail
 
         template<class F, class Tuple>
-        auto apply(F && f, Tuple && t)
-#ifdef BOOST_NO_CXX14_RETURN_TYPE_DEDUCTION
-        -> decltype(
-            detail::apply_impl(
+        auto apply(F&& f, Tuple&& t)
+        {
+            return detail::apply_impl(
                 std::forward<F>(f),
                 std::forward<Tuple>(t),
-                meta::MakeIndexSequence<std::tuple_size<typename std::decay<Tuple>::type>::value>{}))
-#endif
-        {
-            return
-                detail::apply_impl(
-                    std::forward<F>(f),
-                    std::forward<Tuple>(t),
-                    meta::MakeIndexSequence<std::tuple_size<typename std::decay<Tuple>::type>::value>{});
+                std::make_index_sequence<std::tuple_size<std::decay_t<Tuple>>::value>{});
         }
-    }
-}
+    } // namespace meta
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/include/alpaka/meta/CartesianProduct.hpp b/thirdParty/cupla/alpaka/include/alpaka/meta/CartesianProduct.hpp
index c9ae52bb86..a52103efb5 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/meta/CartesianProduct.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/meta/CartesianProduct.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Benjamin Worpitz
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -17,73 +17,52 @@ namespace alpaka
     {
         //-----------------------------------------------------------------------------
         // This is based on code by Patrick Fromberg.
-        // See http://stackoverflow.com/questions/9122028/how-to-create-the-cartesian-product-of-a-type-list/19611856#19611856
+        // See
+        // http://stackoverflow.com/questions/9122028/how-to-create-the-cartesian-product-of-a-type-list/19611856#19611856
         namespace detail
         {
             //#############################################################################
-            template<
-                typename... Ts>
+            template<typename... Ts>
             struct CartesianProductImplHelper;
             //#############################################################################
             // Stop condition.
-            template<
-                template<typename...> class TList,
-                typename... Ts>
-            struct CartesianProductImplHelper<
-                TList<Ts...>>
+            template<template<typename...> class TList, typename... Ts>
+            struct CartesianProductImplHelper<TList<Ts...>>
             {
                 using type = TList<Ts...>;
             };
             //#############################################################################
             // Catches first empty tuple.
-            template<
-                template<typename...> class TList,
-                typename... Ts>
-            struct CartesianProductImplHelper<
-                TList<TList<>>,
-                Ts...>
+            template<template<typename...> class TList, typename... Ts>
+            struct CartesianProductImplHelper<TList<TList<>>, Ts...>
             {
                 using type = TList<>;
             };
             //#############################################################################
             // Catches any empty tuple except first.
-            template<
-                template<typename...> class TList,
-                typename... Ts,
-                typename... Rests>
-            struct CartesianProductImplHelper<
-                TList<Ts...>,
-                TList<>,
-                Rests...>
+            template<template<typename...> class TList, typename... Ts, typename... Rests>
+            struct CartesianProductImplHelper<TList<Ts...>, TList<>, Rests...>
             {
                 using type = TList<>;
             };
             //#############################################################################
-            template<
-                template<typename...> class TList,
-                typename... X,
-                typename H,
-                typename... Rests>
-            struct CartesianProductImplHelper<
-                TList<X...>,
-                TList<H>,
-                Rests...>
+            template<template<typename...> class TList, typename... X, typename H, typename... Rests>
+            struct CartesianProductImplHelper<TList<X...>, TList<H>, Rests...>
             {
                 using type1 = TList<Concatenate<X, TList<H>>...>;
                 using type = typename CartesianProductImplHelper<type1, Rests...>::type;
             };
             //#############################################################################
             template<
-                template<typename...> class TList,
+                template<typename...>
+                class TList,
                 typename... X,
-                template<typename...> class Head,
+                template<typename...>
+                class Head,
                 typename T,
                 typename... Ts,
                 typename... Rests>
-            struct CartesianProductImplHelper<
-                TList<X...>,
-                Head<T, Ts...>,
-                Rests...>
+            struct CartesianProductImplHelper<TList<X...>, Head<T, Ts...>, Rests...>
             {
                 using type1 = TList<Concatenate<X, TList<T>>...>;
                 using type2 = typename CartesianProductImplHelper<TList<X...>, TList<Ts...>>::type;
@@ -92,43 +71,32 @@ namespace alpaka
             };
 
             //#############################################################################
-            template<
-                template<typename...> class TList,
-                typename... Ts>
+            template<template<typename...> class TList, typename... Ts>
             struct CartesianProductImpl;
             //#############################################################################
             // The base case for no input returns an empty sequence.
-            template<
-                template<typename...> class TList>
-            struct CartesianProductImpl<
-                TList>
+            template<template<typename...> class TList>
+            struct CartesianProductImpl<TList>
             {
                 using type = TList<>;
             };
             //#############################################################################
             // R is the return type, Head<A...> is the first input list
             template<
-                template<typename...> class TList,
-                template<typename...> class Head,
+                template<typename...>
+                class TList,
+                template<typename...>
+                class Head,
                 typename... Ts,
                 typename... Tail>
-            struct CartesianProductImpl<
-                TList,
-                Head<Ts...>,
-                Tail...>
+            struct CartesianProductImpl<TList, Head<Ts...>, Tail...>
             {
-                using type =
-                    typename detail::CartesianProductImplHelper<
-                        TList<TList<Ts>...>,
-                        Tail...
-                    >::type;
+                using type = typename detail::CartesianProductImplHelper<TList<TList<Ts>...>, Tail...>::type;
             };
-        }
+        } // namespace detail
 
         //#############################################################################
-        template<
-            template<typename...> class TList,
-            typename... Ts>
+        template<template<typename...> class TList, typename... Ts>
         using CartesianProduct = typename detail::CartesianProductImpl<TList, Ts...>::type;
-    }
-}
+    } // namespace meta
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/include/alpaka/meta/Concatenate.hpp b/thirdParty/cupla/alpaka/include/alpaka/meta/Concatenate.hpp
index 2a10a52fef..ad6889202d 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/meta/Concatenate.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/meta/Concatenate.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Benjamin Worpitz
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -16,38 +16,23 @@ namespace alpaka
         namespace detail
         {
             //#############################################################################
-            template<
-                typename... T>
+            template<typename... T>
             struct ConcatenateImpl;
             //#############################################################################
-            template<
-                typename T>
-            struct ConcatenateImpl<
-                T>
+            template<typename T>
+            struct ConcatenateImpl<T>
             {
                 using type = T;
             };
             //#############################################################################
-            template<
-                template<typename...> class TList,
-                typename... As,
-                typename... Bs,
-                typename... TRest>
-            struct ConcatenateImpl<
-                TList<As...>,
-                TList<Bs...>,
-                TRest...>
+            template<template<typename...> class TList, typename... As, typename... Bs, typename... TRest>
+            struct ConcatenateImpl<TList<As...>, TList<Bs...>, TRest...>
             {
-                using type =
-                    typename ConcatenateImpl<
-                        TList<As..., Bs...>,
-                        TRest...
-                    >::type;
+                using type = typename ConcatenateImpl<TList<As..., Bs...>, TRest...>::type;
             };
-        }
+        } // namespace detail
         //#############################################################################
-        template<
-            typename... T>
+        template<typename... T>
         using Concatenate = typename detail::ConcatenateImpl<T...>::type;
-    }
-}
+    } // namespace meta
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/include/alpaka/meta/DependentFalseType.hpp b/thirdParty/cupla/alpaka/include/alpaka/meta/DependentFalseType.hpp
index 4b0d15c5b6..7c040325e4 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/meta/DependentFalseType.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/meta/DependentFalseType.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Benjamin Worpitz
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -18,10 +18,9 @@ namespace alpaka
         //#############################################################################
         //! A false_type being dependent on a ignored template parameter.
         //! This allows to use static_assert in uninstantiated template specializations without triggering.
-        template<
-            typename T>
-        struct DependentFalseType :
-            std::false_type
-        {};
-    }
-}
+        template<typename T>
+        struct DependentFalseType : std::false_type
+        {
+        };
+    } // namespace meta
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/include/alpaka/meta/Filter.hpp b/thirdParty/cupla/alpaka/include/alpaka/meta/Filter.hpp
index d876bd5e89..d1264ad363 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/meta/Filter.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/meta/Filter.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Benjamin Worpitz
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -20,66 +20,36 @@ namespace alpaka
         namespace detail
         {
             //#############################################################################
-            template<
-                template<typename...> class TList,
-                template<typename> class TPred,
-                typename... Ts>
+            template<template<typename...> class TList, template<typename> class TPred, typename... Ts>
             struct FilterImplHelper;
             //#############################################################################
-            template<
-                template<typename...> class TList,
-                template<typename> class TPred>
-            struct FilterImplHelper<
-                TList,
-                TPred>
+            template<template<typename...> class TList, template<typename> class TPred>
+            struct FilterImplHelper<TList, TPred>
             {
                 using type = TList<>;
             };
             //#############################################################################
-            template<
-                template<typename...> class TList,
-                template<typename> class TPred,
-                typename T,
-                typename... Ts>
-            struct FilterImplHelper<
-                TList,
-                TPred,
-                T,
-                Ts...>
+            template<template<typename...> class TList, template<typename> class TPred, typename T, typename... Ts>
+            struct FilterImplHelper<TList, TPred, T, Ts...>
             {
-                using type =
-                    typename std::conditional<
-                        TPred<T>::value,    // TODO: Remove '::value' when C++14 variable templates are supported.
-                        Concatenate<TList<T>, typename FilterImplHelper<TList, TPred, Ts...>::type>,
-                        typename FilterImplHelper<TList, TPred, Ts...>::type >::type;
+                using type = std::conditional_t<
+                    TPred<T>::value,
+                    Concatenate<TList<T>, typename FilterImplHelper<TList, TPred, Ts...>::type>,
+                    typename FilterImplHelper<TList, TPred, Ts...>::type>;
             };
 
             //#############################################################################
-            template<
-                typename TList,
-                template<typename> class TPred>
+            template<typename TList, template<typename> class TPred>
             struct FilterImpl;
             //#############################################################################
-            template<
-                template<typename...> class TList,
-                template<typename> class TPred,
-                typename... Ts>
-            struct FilterImpl<
-                TList<Ts...>,
-                TPred>
+            template<template<typename...> class TList, template<typename> class TPred, typename... Ts>
+            struct FilterImpl<TList<Ts...>, TPred>
             {
-                using type =
-                    typename detail::FilterImplHelper<
-                        TList,
-                        TPred,
-                        Ts...
-                    >::type;
+                using type = typename detail::FilterImplHelper<TList, TPred, Ts...>::type;
             };
-        }
+        } // namespace detail
         //#############################################################################
-        template<
-            typename TList,
-            template<typename> class TPred>
+        template<typename TList, template<typename> class TPred>
         using Filter = typename detail::FilterImpl<TList, TPred>::type;
-    }
-}
+    } // namespace meta
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/include/alpaka/meta/Fold.hpp b/thirdParty/cupla/alpaka/include/alpaka/meta/Fold.hpp
index aa3e8efe90..b031b44b81 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/meta/Fold.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/meta/Fold.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Axel Huebl, Benjamin Worpitz
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -12,102 +12,25 @@
 #include <alpaka/core/Common.hpp>
 #include <alpaka/core/Unused.hpp>
 
-#include <boost/config.hpp>
-
-#ifdef BOOST_NO_CXX14_RETURN_TYPE_DEDUCTION
-    #include <type_traits>
-#endif
-
 namespace alpaka
 {
     namespace meta
     {
         //-----------------------------------------------------------------------------
         ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            typename TFnObj,
-            typename T>
-        ALPAKA_FN_HOST_ACC auto foldr(
-            TFnObj const & f,
-            T const & t)
-        -> T
+        template<typename TFnObj, typename T>
+        ALPAKA_FN_HOST_ACC auto foldr(TFnObj const& f, T const& t) -> T
         {
             alpaka::ignore_unused(f);
 
             return t;
         }
-#ifdef BOOST_NO_CXX14_RETURN_TYPE_DEDUCTION
-        namespace detail
-        {
-            //#############################################################################
-            template<
-                typename TFnObj,
-                typename... T>
-            struct TypeOfFold;
-            //#############################################################################
-            template<
-                typename TFnObj,
-                typename T>
-            struct TypeOfFold<
-                TFnObj,
-                T>
-            {
-                using type = T;
-            };
-            //#############################################################################
-            template<
-                typename TFnObj,
-                typename T,
-                typename... P>
-            struct TypeOfFold<
-                TFnObj,
-                T,
-                P...>
-            {
-                using type =
-                    typename std::result_of<
-                        TFnObj(T, typename TypeOfFold<TFnObj, P...>::type)>::type;
-            };
-        }
-
-        //-----------------------------------------------------------------------------
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            typename TFnObj,
-            typename T0,
-            typename T1,
-            typename... Ts>
-        ALPAKA_FN_HOST_ACC auto foldr(
-            TFnObj const & f,
-            T0 const & t0,
-            T1 const & t1,
-            Ts const & ... ts)
-        // NOTE: The following line is not allowed because the point of function declaration is after the trailing return type.
-        // Thus the function itself is not available inside its return type declaration.
-        // http://www.open-std.org/jtc1/sc22/wg21/docs/cwg_closed.html#1433
-        // http://stackoverflow.com/questions/3744400/trailing-return-type-using-decltype-with-a-variadic-template-function
-        // http://stackoverflow.com/questions/11596898/variadic-template-and-inferred-return-type-in-concat/11597196#11597196
-        //-> decltype(f(t0, foldr(f, t1, ts...)))
-        -> typename detail::TypeOfFold<TFnObj, T0, T1, Ts...>::type
-        {
-            return f(t0, foldr(f, t1, ts...));
-        }
-#else
         //-----------------------------------------------------------------------------
         ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            typename TFnObj,
-            typename T0,
-            typename T1,
-            typename... Ts>
-        ALPAKA_FN_HOST_ACC auto foldr(
-            TFnObj const & f,
-            T0 const & t0,
-            T1 const & t1,
-            Ts const & ... ts)
+        template<typename TFnObj, typename T0, typename T1, typename... Ts>
+        ALPAKA_FN_HOST_ACC auto foldr(TFnObj const& f, T0 const& t0, T1 const& t1, Ts const&... ts)
         {
             return f(t0, foldr(f, t1, ts...));
         }
-#endif
-    }
-}
+    } // namespace meta
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/include/alpaka/meta/ForEachType.hpp b/thirdParty/cupla/alpaka/include/alpaka/meta/ForEachType.hpp
index 8a344f161b..2f3baa8cec 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/meta/ForEachType.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/meta/ForEachType.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Axel Huebl, Benjamin Worpitz
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -9,7 +9,6 @@
 
 #pragma once
 
-#include <alpaka/core/BoostPredef.hpp>
 #include <alpaka/core/Common.hpp>
 #include <alpaka/core/Unused.hpp>
 
@@ -22,82 +21,47 @@ namespace alpaka
         namespace detail
         {
             //#############################################################################
-            template<
-                typename TList>
+            template<typename TList>
             struct ForEachTypeHelper;
             //#############################################################################
-            template<
-                template<typename...> class TList>
-            struct ForEachTypeHelper<
-                TList<>>
+            template<template<typename...> class TList>
+            struct ForEachTypeHelper<TList<>>
             {
                 //-----------------------------------------------------------------------------
                 ALPAKA_NO_HOST_ACC_WARNING
-                template<
-                    typename TFnObj,
-                    typename... TArgs>
-                ALPAKA_FN_HOST_ACC static auto forEachTypeHelper(
-                    TFnObj && f,
-                    TArgs && ... args)
-                -> void
+                template<typename TFnObj, typename... TArgs>
+                ALPAKA_FN_HOST_ACC static auto forEachTypeHelper(TFnObj&& f, TArgs&&... args) -> void
                 {
                     alpaka::ignore_unused(f);
                     alpaka::ignore_unused(args...);
                 }
             };
             //#############################################################################
-            template<
-                template<typename...> class TList,
-                typename T,
-                typename... Ts>
-            struct ForEachTypeHelper<
-                TList<T, Ts...>>
+            template<template<typename...> class TList, typename T, typename... Ts>
+            struct ForEachTypeHelper<TList<T, Ts...>>
             {
                 //-----------------------------------------------------------------------------
                 ALPAKA_NO_HOST_ACC_WARNING
-                template<
-                    typename TFnObj,
-                    typename... TArgs>
-                ALPAKA_FN_HOST_ACC static auto forEachTypeHelper(
-                    TFnObj && f,
-                    TArgs && ... args)
-                -> void
+                template<typename TFnObj, typename... TArgs>
+                ALPAKA_FN_HOST_ACC static auto forEachTypeHelper(TFnObj&& f, TArgs&&... args) -> void
                 {
-                    // Call the function object template call operator.
-#if BOOST_COMP_MSVC && !BOOST_COMP_NVCC
-                    f.operator()<T>(
-                        std::forward<TArgs>(args)...);
-#else
-                    f.template operator()<T>(
-                        std::forward<TArgs>(args)...);
-#endif
-                    ForEachTypeHelper<
-                        TList<Ts...>>
-                    ::forEachTypeHelper(
+                    f.template operator()<T>(std::forward<TArgs>(args)...);
+                    ForEachTypeHelper<TList<Ts...>>::forEachTypeHelper(
                         std::forward<TFnObj>(f),
                         std::forward<TArgs>(args)...);
                 }
             };
-        }
+        } // namespace detail
 
         //-----------------------------------------------------------------------------
-        //! Equivalent to boost::mpl::for_each but does not require the types of the sequence to be default constructible.
-        //! This function does not create instances of the types instead it passes the types as template parameter.
+        //! Equivalent to boost::mpl::for_each but does not require the types of the sequence to be default
+        //! constructible. This function does not create instances of the types; instead, it passes the types as
+        //! template parameters.
         ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            typename TList,
-            typename TFnObj,
-            typename... TArgs>
-        ALPAKA_FN_HOST_ACC auto forEachType(
-            TFnObj && f,
-            TArgs && ... args)
-        -> void
+        template<typename TList, typename TFnObj, typename... TArgs>
+        ALPAKA_FN_HOST_ACC auto forEachType(TFnObj&& f, TArgs&&... args) -> void
         {
-            detail::ForEachTypeHelper<
-                TList>
-            ::forEachTypeHelper(
-                std::forward<TFnObj>(f),
-                std::forward<TArgs>(args)...);
+            detail::ForEachTypeHelper<TList>::forEachTypeHelper(std::forward<TFnObj>(f), std::forward<TArgs>(args)...);
         }
-    }
-}
+    } // namespace meta
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/include/alpaka/meta/Functional.hpp b/thirdParty/cupla/alpaka/include/alpaka/meta/Functional.hpp
new file mode 100644
index 0000000000..ae253e33f1
--- /dev/null
+++ b/thirdParty/cupla/alpaka/include/alpaka/meta/Functional.hpp
@@ -0,0 +1,40 @@
+/* Copyright 2019 Benjamin Worpitz
+ *
+ * This file is part of alpaka.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#pragma once
+
+#include <alpaka/core/Common.hpp>
+
+namespace alpaka
+{
+    namespace meta
+    {
+        template<typename T>
+        struct min // Function object that returns the smaller of two values of type T.
+        {
+            ALPAKA_NO_HOST_ACC_WARNING
+            ALPAKA_FN_HOST_ACC
+            constexpr auto operator()(const T& lhs, const T& rhs) const // `auto`: result is returned by value.
+            {
+                return (lhs < rhs) ? lhs : rhs; // rhs whenever !(lhs < rhs), i.e. also when both compare equal.
+            }
+        };
+
+        template<typename T>
+        struct max // Function object that returns the larger of two values of type T.
+        {
+            ALPAKA_NO_HOST_ACC_WARNING
+            ALPAKA_FN_HOST_ACC
+            constexpr auto operator()(const T& lhs, const T& rhs) const // `auto`: result is returned by value.
+            {
+                return (lhs > rhs) ? lhs : rhs; // rhs whenever !(lhs > rhs), i.e. also when both compare equal.
+            }
+        };
+    } // namespace meta
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/include/alpaka/meta/InheritFromList.hpp b/thirdParty/cupla/alpaka/include/alpaka/meta/InheritFromList.hpp
index 196c2fc510..cb7acd5185 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/meta/InheritFromList.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/meta/InheritFromList.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Benjamin Worpitz
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -13,20 +13,12 @@ namespace alpaka
 {
     namespace meta
     {
-        template<
-            typename TBaseList
-        >
+        template<typename TBaseList> // Primary template: declared only, usable solely through the specialization.
         class InheritFromList;
 
-        template<
-            template<typename...> class TList,
-            typename... TBases
-        >
-        class InheritFromList<
-            TList<TBases...>
-        >
-            : public TBases...
+        template<template<typename...> class TList, typename... TBases> // Matches any type-list template TList.
+        class InheritFromList<TList<TBases...>> : public TBases... // Derives publicly from every list element.
         {
         };
-    }
-}
+    } // namespace meta
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/include/alpaka/meta/IntegerSequence.hpp b/thirdParty/cupla/alpaka/include/alpaka/meta/IntegerSequence.hpp
index 4abdfb830d..13a5e54de8 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/meta/IntegerSequence.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/meta/IntegerSequence.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Axel Huebl, Benjamin Worpitz
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -9,95 +9,32 @@
 
 #pragma once
 
-#include <alpaka/core/BoostPredef.hpp>
-#include <alpaka/core/Common.hpp>
 #include <alpaka/meta/Set.hpp>
 
-#include <type_traits>
 #include <cstddef>
+#include <type_traits>
+#include <utility>
 
 namespace alpaka
 {
     namespace meta
     {
-        //#############################################################################
-        // This could be replaced with c++14 std::IntegerSequence if we raise the minimum.
-        template<
-            typename T,
-            T... Tvals>
-        struct IntegerSequence
-        {
-            static_assert(std::is_integral<T>::value, "IntegerSequence<T, I...> requires T to be an integral type.");
-
-            using type = IntegerSequence<T, Tvals...>;
-            using value_type = T;
-
-            ALPAKA_FN_HOST_ACC static auto size() noexcept
-            -> std::size_t
-            {
-                return (sizeof...(Tvals));
-            }
-        };
-
         namespace detail
         {
             //#############################################################################
-            template<
-                typename TDstType,
-                typename TIntegerSequence>
+            template<typename TDstType, typename TIntegerSequence> // Primary template, declaration only.
             struct ConvertIntegerSequence;
             //#############################################################################
-            template<
-                typename TDstType,
-                typename T,
-                T... Tvals>
-            struct ConvertIntegerSequence<
-                TDstType,
-                IntegerSequence<T, Tvals...>>
+            template<typename TDstType, typename T, T... Tvals> // T and Tvals are deduced from the sequence.
+            struct ConvertIntegerSequence<TDstType, std::integer_sequence<T, Tvals...>>
             {
-                using type = IntegerSequence<TDstType, static_cast<TDstType>(Tvals)...>;
+                using type = std::integer_sequence<TDstType, static_cast<TDstType>(Tvals)...>; // Per-value cast.
             };
-        }
+        } // namespace detail
         //#############################################################################
-        template<
-            typename TDstType,
-            typename TIntegerSequence>
+        template<typename TDstType, typename TIntegerSequence> // Sequence of the same values with type TDstType.
         using ConvertIntegerSequence = typename detail::ConvertIntegerSequence<TDstType, TIntegerSequence>::type;
 
-        namespace detail
-        {
-            //#############################################################################
-            template<
-                template<typename...> class TList,
-                typename T,
-                template<T> class TOp,
-                typename TIntegerSequence>
-            struct TransformIntegerSequence;
-            //#############################################################################
-            template<
-                template<typename...> class TList,
-                typename T,
-                template<T> class TOp,
-                T... Tvals>
-            struct TransformIntegerSequence<
-                TList,
-                T,
-                TOp,
-                IntegerSequence<T, Tvals...>>
-            {
-                using type =
-                    TList<
-                        TOp<Tvals>...>;
-            };
-        }
-        //#############################################################################
-        template<
-            template<typename...> class TList,
-            typename T,
-            template<T> class TOp,
-            typename TIntegerSequence>
-        using TransformIntegerSequence = typename detail::TransformIntegerSequence<TList, T, TOp, TIntegerSequence>::type;
-
         namespace detail
         {
             //#############################################################################
@@ -108,53 +45,49 @@ namespace alpaka
             };
             //#############################################################################
             template<typename T, T Tbegin, T... Tvals>
-            struct MakeIntegerSequenceHelper<false, true, T, Tbegin, std::integral_constant<T, Tbegin>, IntegerSequence<T, Tvals...> > :
-                IntegerSequence<T, Tvals...>
-            {};
+            struct MakeIntegerSequenceHelper<
+                false,
+                true, // Callers pass true here exactly when the index argument equals Tbegin.
+                T,
+                Tbegin,
+                std::integral_constant<T, Tbegin>, // Terminal case: the index has counted down to Tbegin.
+                std::integer_sequence<T, Tvals...>>
+            {
+                using type = std::integer_sequence<T, Tvals...>; // The accumulated values are the result.
+            };
             //#############################################################################
             template<typename T, T Tbegin, T TIdx, T... Tvals>
-            struct MakeIntegerSequenceHelper<false, false, T, Tbegin, std::integral_constant<T, TIdx>, IntegerSequence<T, Tvals...> > :
-                MakeIntegerSequenceHelper<false, TIdx == (Tbegin+1), T, Tbegin, std::integral_constant<T, TIdx - 1>, IntegerSequence<T, TIdx - 1, Tvals...> >
-            {};
-        }
+            struct MakeIntegerSequenceHelper<
+                false,
+                false, // Recursive case: the index TIdx has not reached Tbegin yet.
+                T,
+                Tbegin,
+                std::integral_constant<T, TIdx>,
+                std::integer_sequence<T, Tvals...>>
+            {
+                using type = typename MakeIntegerSequenceHelper<
+                    false,
+                    TIdx == (Tbegin + 1), // True when the decremented index equals Tbegin (terminal case next).
+                    T,
+                    Tbegin,
+                    std::integral_constant<T, TIdx - 1>, // Count down by one.
+                    std::integer_sequence<T, TIdx - 1, Tvals...>>::type; // Prepend TIdx - 1 to the values.
+            };
+        } // namespace detail
 
         //#############################################################################
         template<typename T, T Tbegin, T Tsize>
-        using MakeIntegerSequenceOffset = typename detail::MakeIntegerSequenceHelper<(Tsize < 0), (Tsize == 0), T, Tbegin, std::integral_constant<T, Tbegin+Tsize>, IntegerSequence<T> >::type;
-
-        //#############################################################################
-        template<typename T, T Tsize>
-        using MakeIntegerSequence = MakeIntegerSequenceOffset<T, 0u, Tsize>;
-
-
-        //#############################################################################
-        template<
-            std::size_t... Tvals>
-        using IndexSequence = IntegerSequence<std::size_t, Tvals...>;
-
-        //#############################################################################
-        template<
-            typename T,
-            T Tbegin,
-            T Tsize>
-        using MakeIndexSequenceOffset = MakeIntegerSequenceOffset<std::size_t, Tbegin, Tsize>;
-
-        //#############################################################################
-        template<
-            std::size_t Tsize>
-        using MakeIndexSequence = MakeIntegerSequence<std::size_t, Tsize>;
-
-        //#############################################################################
-        template<
-            typename... Ts>
-        using IndexSequenceFor = MakeIndexSequence<sizeof...(Ts)>;
-
+        using MakeIntegerSequenceOffset = typename detail::MakeIntegerSequenceHelper<
+            (Tsize < 0), // true matches no specialization shown here; presumably rejected by the primary - confirm.
+            (Tsize == 0), // A zero size selects the terminal specialization directly (empty result).
+            T,
+            Tbegin,
+            std::integral_constant<T, Tbegin + Tsize>, // Start index: one past the last generated value.
+            std::integer_sequence<T>>::type; // Result: std::integer_sequence<T, Tbegin, ..., Tbegin + Tsize - 1>.
 
         //#############################################################################
         //! Checks if the integral values are unique.
-        template<
-            typename T,
-            T... Tvals>
+        template<typename T, T... Tvals>
         struct IntegralValuesUnique
         {
             static constexpr bool value = meta::IsParameterPackSet<std::integral_constant<T, Tvals>...>::value;
@@ -162,81 +95,46 @@ namespace alpaka
 
         //#############################################################################
         //! Checks if the values in the index sequence are unique.
-        template<
-            typename TIntegerSequence>
+        template<typename TIntegerSequence> // Primary template, declaration only.
         struct IntegerSequenceValuesUnique;
         //#############################################################################
         //! Checks if the values in the index sequence are unique.
-        template<
-            typename T,
-            T... Tvals>
-        struct IntegerSequenceValuesUnique<
-            IntegerSequence<T, Tvals...>>
+        template<typename T, T... Tvals> // T and Tvals are deduced from the std::integer_sequence argument.
+        struct IntegerSequenceValuesUnique<std::integer_sequence<T, Tvals...>>
         {
             static constexpr bool value = IntegralValuesUnique<T, Tvals...>::value;
         };
 
         //#############################################################################
         //! Checks if the integral values are within the given range.
-        template<
-            typename T,
-            T Tmin,
-            T Tmax,
-            T... Tvals>
+        template<typename T, T Tmin, T Tmax, T... Tvals> // Primary template, declaration only.
         struct IntegralValuesInRange;
         //#############################################################################
         //! Checks if the integral values are within the given range.
-        template<
-            typename T,
-            T Tmin,
-            T Tmax>
-        struct IntegralValuesInRange<
-            T,
-            Tmin,
-            Tmax>
+        template<typename T, T Tmin, T Tmax>
+        struct IntegralValuesInRange<T, Tmin, Tmax> // Recursion end: an empty value list is always in range.
         {
             static constexpr bool value = true;
         };
         //#############################################################################
         //! Checks if the integral values are within the given range.
-        template<
-            typename T,
-            T Tmin,
-            T Tmax,
-            T I,
-            T... Tvals>
-        struct IntegralValuesInRange<
-            T,
-            Tmin,
-            Tmax,
-            I,
-            Tvals...>
+        template<typename T, T Tmin, T Tmax, T I, T... Tvals>
+        struct IntegralValuesInRange<T, Tmin, Tmax, I, Tvals...> // Peels off the first value I per step.
         {
-            static constexpr bool value = (I >= Tmin) && (I <=Tmax) && IntegralValuesInRange<T, Tmin, Tmax, Tvals...>::value;
+            static constexpr bool value // I lies in the inclusive range [Tmin, Tmax] and so do all remaining values.
+                = (I >= Tmin) && (I <= Tmax) && IntegralValuesInRange<T, Tmin, Tmax, Tvals...>::value;
         };
 
         //#############################################################################
         //! Checks if the values in the index sequence are within the given range.
-        template<
-            typename TIntegerSequence,
-            typename T,
-            T Tmin,
-            T Tmax>
+        template<typename TIntegerSequence, typename T, T Tmin, T Tmax> // Primary template, declaration only.
         struct IntegerSequenceValuesInRange;
         //#############################################################################
         //! Checks if the values in the index sequence are within the given range.
-        template<
-            typename T,
-            T... Tvals,
-            T Tmin,
-            T Tmax>
-        struct IntegerSequenceValuesInRange<
-            IntegerSequence<T, Tvals...>,
-            T,
-            Tmin,
-            Tmax>
+        template<typename T, T... Tvals, T Tmin, T Tmax> // Tvals is deduced from the sequence argument.
+        struct IntegerSequenceValuesInRange<std::integer_sequence<T, Tvals...>, T, Tmin, Tmax>
         {
             static constexpr bool value = IntegralValuesInRange<T, Tmin, Tmax, Tvals...>::value;
         };
-    }
-}
+    } // namespace meta
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/include/alpaka/meta/Integral.hpp b/thirdParty/cupla/alpaka/include/alpaka/meta/Integral.hpp
index 222be7d9c3..5de91b5a44 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/meta/Integral.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/meta/Integral.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Benjamin Worpitz
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -17,86 +17,59 @@ namespace alpaka
     {
         //#############################################################################
         //! The trait is true if all values of TSubset are contained in TSuperset.
-        template<
-            typename TSuperset,
-            typename TSubset>
-        using IsIntegralSuperset =
-            std::integral_constant<
-                bool,
-                std::is_integral<TSuperset>::value && std::is_integral<TSubset>::value
+        template<typename TSuperset, typename TSubset>
+        using IsIntegralSuperset = std::integral_constant<
+            bool, // NOTE(review): the mixed-sign branch also accepts an unsigned TSuperset for a signed TSubset.
+            std::is_integral<TSuperset>::value && std::is_integral<TSubset>::value // Integral types only.
                 && (
                     // If the signdness is equal, the sizes have to be greater or equal to be a superset.
-                    ((std::is_unsigned<TSuperset>::value == std::is_unsigned<TSubset>::value) && (sizeof(TSuperset) >= sizeof(TSubset)))
+                    ((std::is_unsigned<TSuperset>::value == std::is_unsigned<TSubset>::value)
+                     && (sizeof(TSuperset) >= sizeof(TSubset)))
                     // If the signdness is non-equal, the superset has to have at least one bit more.
-                    || ((std::is_unsigned<TSuperset>::value != std::is_unsigned<TSubset>::value) && (sizeof(TSuperset) > sizeof(TSubset)))
-                )>;
+                    || ((std::is_unsigned<TSuperset>::value != std::is_unsigned<TSubset>::value)
+                        && (sizeof(TSuperset) > sizeof(TSubset))))>; // Strictly larger storage size required.
 
         //#############################################################################
         //! The type that has the higher max value.
-        template<
-            typename T0,
-            typename T1>
-        using HigherMax =
-            typename std::conditional<
-                (sizeof(T0) > sizeof(T1)),
+        template<typename T0, typename T1>
+        using HigherMax = std::conditional_t<
+            (sizeof(T0) > sizeof(T1)),
+            T0, // T0 has the larger storage size.
+            std::conditional_t<
+                ((sizeof(T0) == sizeof(T1)) && std::is_unsigned<T0>::value && std::is_signed<T1>::value),
                 T0,
-                typename std::conditional<
-                    ((sizeof(T0) == sizeof(T1)) && std::is_unsigned<T0>::value && std::is_signed<T1>::value),
-                        T0,
-                        T1>::type>::type;
+                T1>>; // All remaining cases, including equal size with equal signedness.
 
         //#############################################################################
         //! The type that has the lower max value.
-        template<
-            typename T0,
-            typename T1>
-        using LowerMax =
-            typename std::conditional<
-                (sizeof(T0) < sizeof(T1)),
+        template<typename T0, typename T1>
+        using LowerMax = std::conditional_t<
+            (sizeof(T0) < sizeof(T1)),
+            T0, // T0 has the smaller storage size.
+            std::conditional_t<
+                ((sizeof(T0) == sizeof(T1)) && std::is_signed<T0>::value && std::is_unsigned<T1>::value),
                 T0,
-                typename std::conditional<
-                    ((sizeof(T0) == sizeof(T1)) && std::is_signed<T0>::value && std::is_unsigned<T1>::value),
-                        T0,
-                        T1>::type>::type;
+                T1>>; // All remaining cases, including equal size with equal signedness.
 
         //#############################################################################
-        //! The type that has the higher min value. If both types have the same min value, the type with the wider range is chosen.
-        template<
-            typename T0,
-            typename T1>
-        using HigherMin =
-            typename std::conditional<
-                (std::is_unsigned<T0>::value == std::is_unsigned<T1>::value),
-                typename std::conditional<
-                    std::is_unsigned<T0>::value,
-                        typename std::conditional<
-                        (sizeof(T0) < sizeof(T1)),
-                            T1,
-                            T0>::type,
-                        typename std::conditional<
-                        (sizeof(T0) < sizeof(T1)),
-                            T0,
-                            T1>::type>::type,
-                typename std::conditional<
-                    std::is_unsigned<T0>::value,
-                        T0,
-                        T1>::type>::type;
+        //! The type that has the higher min value. If both types have the same min value, the type with the wider
+        //! range is chosen.
+        template<typename T0, typename T1>
+        using HigherMin = std::conditional_t<
+            (std::is_unsigned<T0>::value == std::is_unsigned<T1>::value),
+            std::conditional_t<
+                std::is_unsigned<T0>::value,
+                std::conditional_t<(sizeof(T0) < sizeof(T1)), T1, T0>, // Both unsigned: the larger type.
+                std::conditional_t<(sizeof(T0) < sizeof(T1)), T0, T1>>, // Neither unsigned: the smaller type.
+            std::conditional_t<std::is_unsigned<T0>::value, T0, T1>>; // Mixed signedness: the unsigned type.
 
         //#############################################################################
-        //! The type that has the lower min value. If both types have the same min value, the type with the wider range is chosen.
-        template<
-            typename T0,
-            typename T1>
-        using LowerMin =
-            typename std::conditional<
-                (std::is_unsigned<T0>::value == std::is_unsigned<T1>::value),
-                typename std::conditional<
-                    (sizeof(T0) > sizeof(T1)),
-                        T0,
-                        T1>::type,
-                typename std::conditional<
-                    std::is_signed<T0>::value,
-                        T0,
-                        T1>::type>::type;
+        //! The type that has the lower min value. If both types have the same min value, the type with the wider range
+        //! is chosen.
+        template<typename T0, typename T1>
+        using LowerMin = std::conditional_t<
+            (std::is_unsigned<T0>::value == std::is_unsigned<T1>::value),
+            std::conditional_t<(sizeof(T0) > sizeof(T1)), T0, T1>, // Same signedness: larger type (T1 on a tie).
+            std::conditional_t<std::is_signed<T0>::value, T0, T1>>; // Mixed signedness: T0 if signed, else T1.
+    } // namespace meta
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/include/alpaka/meta/IsStrictBase.hpp b/thirdParty/cupla/alpaka/include/alpaka/meta/IsStrictBase.hpp
index 720fbb2866..7c8fd581a6 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/meta/IsStrictBase.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/meta/IsStrictBase.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Benjamin Worpitz
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -17,13 +17,9 @@ namespace alpaka
     {
         //#############################################################################
         //! The trait is true if TDerived is derived from TBase but is not TBase itself.
-        template<
-            typename TBase,
-            typename TDerived>
-        using IsStrictBase =
-            std::integral_constant<
-                bool,
-                std::is_base_of<TBase, TDerived>::value
-                && !std::is_same<TBase, typename std::decay<TDerived>::type>::value>;
-    }
-}
+        template<typename TBase, typename TDerived>
+        using IsStrictBase = std::integral_constant<
+            bool, // TDerived is decayed only for the is_same test; is_base_of receives it unchanged.
+            std::is_base_of<TBase, TDerived>::value && !std::is_same<TBase, std::decay_t<TDerived>>::value>;
+    } // namespace meta
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/include/alpaka/meta/Metafunctions.hpp b/thirdParty/cupla/alpaka/include/alpaka/meta/Metafunctions.hpp
index 8bc184c517..43acc7eae4 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/meta/Metafunctions.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/meta/Metafunctions.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Benjamin Worpitz
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -19,71 +19,55 @@ namespace alpaka
         {
             //#############################################################################
             // TODO: Replace with C++17 std::conjunction
-            template<
-                typename...>
-            struct ConjunctionImpl :
-                std::true_type
-            {};
+            template<typename...>
+            struct ConjunctionImpl : std::true_type // Primary template: true (used for the empty pack).
+            {
+            };
             //#############################################################################
             // TODO: Replace with C++17 std::conjunction
-            template<
-                typename B1>
-            struct ConjunctionImpl<B1> :
-                B1
-            {};
+            template<typename B1>
+            struct ConjunctionImpl<B1> : B1 // Single trait: inherits it directly.
+            {
+            };
             //#############################################################################
             // TODO: Replace with C++17 std::conjunction
-            template<
-                typename B1,
-                typename... Bn>
-            struct ConjunctionImpl<
-                B1,
-                Bn...> :
-                    std::conditional<B1::value != false, ConjunctionImpl<Bn...>, B1>::type
-            {};
-        }
+            template<typename B1, typename... Bn> // Inherits B1 if B1::value is false, else recurses into Bn...
+            struct ConjunctionImpl<B1, Bn...> : std::conditional<B1::value != false, ConjunctionImpl<Bn...>, B1>::type
+            {
+            };
+        } // namespace detail
         //#############################################################################
-        template<
-            typename... B>
+        template<typename... B> // Names the ::type member that ConjunctionImpl<B...> inherits from its base.
         using Conjunction = typename detail::ConjunctionImpl<B...>::type;
 
         namespace detail
         {
             //#############################################################################
             // TODO: Replace with C++17 std::disjunction
-            template<
-                typename...>
-            struct DisjunctionImpl :
-                std::false_type
-            {};
+            template<typename...>
+            struct DisjunctionImpl : std::false_type // Primary template: false (used for the empty pack).
+            {
+            };
             //#############################################################################
             // TODO: Replace with C++17 std::disjunction
-            template<
-                typename B1>
-            struct DisjunctionImpl<B1> :
-                B1
-            {};
+            template<typename B1>
+            struct DisjunctionImpl<B1> : B1 // Single trait: inherits it directly.
+            {
+            };
             //#############################################################################
             // TODO: Replace with C++17 std::disjunction
-            template<
-                typename B1,
-                typename... Bn>
-            struct DisjunctionImpl<
-                B1,
-                Bn...> :
-                    std::conditional<B1::value != false, B1, DisjunctionImpl<Bn...>>::type
-            {};
-        }
+            template<typename B1, typename... Bn> // Inherits B1 if B1::value is true, else recurses into Bn...
+            struct DisjunctionImpl<B1, Bn...> : std::conditional<B1::value != false, B1, DisjunctionImpl<Bn...>>::type
+            {
+            };
+        } // namespace detail
         //#############################################################################
-        template<
-            typename... B>
+        template<typename... B> // NOTE(review): unlike Conjunction, this aliases the Impl itself, not its ::type.
         using Disjunction = typename detail::DisjunctionImpl<B...>;
 
         //#############################################################################
         // TODO: Replace with C++17 std::negation
-        template<
-            typename B>
+        template<typename B> // Wraps the logical NOT of B::value in a std::integral_constant<bool, ...>.
         using Negation = std::integral_constant<bool, !B::value>;
-    }
-}
-
+    } // namespace meta
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/include/alpaka/meta/NdLoop.hpp b/thirdParty/cupla/alpaka/include/alpaka/meta/NdLoop.hpp
index 9f6d92432c..e4dfce71cb 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/meta/NdLoop.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/meta/NdLoop.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Axel Huebl, Benjamin Worpitz
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -12,9 +12,10 @@
 #include <alpaka/core/Common.hpp>
 #include <alpaka/core/Unused.hpp>
 #include <alpaka/dim/Traits.hpp>
-#include <alpaka/meta/IntegerSequence.hpp>
 #include <alpaka/vec/Vec.hpp>
 
+#include <utility>
+
 namespace alpaka
 {
     namespace meta
@@ -23,26 +24,17 @@ namespace alpaka
         {
             //#############################################################################
             //! N-dimensional loop iteration template.
-            template<
-                typename TIndexSequence>
+            template<typename TIndexSequence>
             struct NdLoop;
             //#############################################################################
             //! N-dimensional loop iteration template.
             template<>
-            struct NdLoop<
-                meta::IndexSequence<>>
+            struct NdLoop<std::index_sequence<>>
             {
                 //-----------------------------------------------------------------------------
                 ALPAKA_NO_HOST_ACC_WARNING
-                template<
-                    typename TIndex,
-                    typename TExtentVec,
-                    typename TFnObj>
-                ALPAKA_FN_HOST_ACC static auto ndLoop(
-                    TIndex & idx,
-                    TExtentVec const & extent,
-                    TFnObj const & f)
-                -> void
+                template<typename TIndex, typename TExtentVec, typename TFnObj>
+                ALPAKA_FN_HOST_ACC static auto ndLoop(TIndex& idx, TExtentVec const& extent, TFnObj const& f) -> void
                 {
                     alpaka::ignore_unused(idx);
                     alpaka::ignore_unused(extent);
@@ -51,31 +43,22 @@ namespace alpaka
             };
             //#############################################################################
             //! N-dimensional loop iteration template.
-            template<
-                std::size_t Tdim>
-            struct NdLoop<
-                meta::IndexSequence<Tdim>>
+            template<std::size_t Tdim>
+            struct NdLoop<std::index_sequence<Tdim>>
             {
                 //-----------------------------------------------------------------------------
                 ALPAKA_NO_HOST_ACC_WARNING
-                template<
-                    typename TIndex,
-                    typename TExtentVec,
-                    typename TFnObj>
-                ALPAKA_FN_HOST_ACC static auto ndLoop(
-                    TIndex & idx,
-                    TExtentVec const & extent,
-                    TFnObj const & f)
-                -> void
+                template<typename TIndex, typename TExtentVec, typename TFnObj>
+                ALPAKA_FN_HOST_ACC static auto ndLoop(TIndex& idx, TExtentVec const& extent, TFnObj const& f) -> void
                 {
                     static_assert(
-                        dim::Dim<TIndex>::value > 0u,
+                        Dim<TIndex>::value > 0u,
                         "The dimension given to ndLoop has to be larger than zero!");
                     static_assert(
-                        dim::Dim<TIndex>::value == dim::Dim<TExtentVec>::value,
+                        Dim<TIndex>::value == Dim<TExtentVec>::value,
                         "The dimensions of the iteration vector and the extent vector have to be identical!");
                     static_assert(
-                        dim::Dim<TIndex>::value > Tdim,
+                        Dim<TIndex>::value > Tdim,
                         "The current dimension has to be in the range [0,dim-1]!");
 
                     for(idx[Tdim] = 0u; idx[Tdim] < extent[Tdim]; ++idx[Tdim])
@@ -86,86 +69,65 @@ namespace alpaka
             };
             //#############################################################################
             //! N-dimensional loop iteration template.
-            template<
-                std::size_t Tdim0,
-                std::size_t Tdim1,
-                std::size_t... Tdims>
-            struct NdLoop<
-                meta::IndexSequence<Tdim0, Tdim1, Tdims...>>
+            template<std::size_t Tdim0, std::size_t Tdim1, std::size_t... Tdims>
+            struct NdLoop<std::index_sequence<Tdim0, Tdim1, Tdims...>>
             {
                 //-----------------------------------------------------------------------------
                 ALPAKA_NO_HOST_ACC_WARNING
-                template<
-                    typename TIndex,
-                    typename TExtentVec,
-                    typename TFnObj>
-                ALPAKA_FN_HOST_ACC static auto ndLoop(
-                    TIndex & idx,
-                    TExtentVec const & extent,
-                    TFnObj const & f)
-                -> void
+                template<typename TIndex, typename TExtentVec, typename TFnObj>
+                ALPAKA_FN_HOST_ACC static auto ndLoop(TIndex& idx, TExtentVec const& extent, TFnObj const& f) -> void
                 {
                     static_assert(
-                        dim::Dim<TIndex>::value > 0u,
+                        Dim<TIndex>::value > 0u,
                         "The dimension given to ndLoop has to be larger than zero!");
                     static_assert(
-                        dim::Dim<TIndex>::value == dim::Dim<TExtentVec>::value,
+                        Dim<TIndex>::value == Dim<TExtentVec>::value,
                         "The dimensions of the iteration vector and the extent vector have to be identical!");
                     static_assert(
-                        dim::Dim<TIndex>::value > Tdim0,
+                        Dim<TIndex>::value > Tdim0,
                         "The current dimension has to be in the range [0,dim-1]!");
 
                     for(idx[Tdim0] = 0u; idx[Tdim0] < extent[Tdim0]; ++idx[Tdim0])
                     {
-                        detail::NdLoop<
-                            meta::IndexSequence<Tdim1, Tdims...>>
-                        ::template ndLoop(
-                                idx,
-                                extent,
-                                f);
+                        detail::NdLoop<std::index_sequence<Tdim1, Tdims...>>::ndLoop(idx, extent, f);
                     }
                 }
             };
-        }
+        } // namespace detail
         //-----------------------------------------------------------------------------
         //! Loops over an n-dimensional iteration index variable calling f(idx, args...) for each iteration.
-        //! The loops are nested in the order given by the IndexSequence with the first element being the outermost and the last index the innermost loop.
+        //! The loops are nested in the order given by the index_sequence with the first element being the outermost
+        //! and the last index the innermost loop.
         //!
         //! \param indexSequence A sequence of indices being a permutation of the values [0, dim-1].
         //! \param extent N-dimensional loop extent.
         //! \param f The function called at each iteration.
         ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            typename TExtentVec,
-            typename TFnObj,
-            std::size_t... Tdims>
+        template<typename TExtentVec, typename TFnObj, std::size_t... Tdims>
         ALPAKA_FN_HOST_ACC auto ndLoop(
-            meta::IndexSequence<Tdims...> const & indexSequence,
-            TExtentVec const & extent,
-            TFnObj const & f)
-        -> void
+            std::index_sequence<Tdims...> const& indexSequence,
+            TExtentVec const& extent,
+            TFnObj const& f) -> void
         {
             alpaka::ignore_unused(indexSequence);
 
             static_assert(
-                dim::Dim<TExtentVec>::value > 0u,
+                Dim<TExtentVec>::value > 0u,
                 "The dimension of the extent given to ndLoop has to be larger than zero!");
             static_assert(
-                meta::IntegerSequenceValuesInRange<meta::IndexSequence<Tdims...>, std::size_t, 0, dim::Dim<TExtentVec>::value>::value,
-                "The values in the IndexSequence have to be in the range [0,dim-1]!");
+                meta::IntegerSequenceValuesInRange<
+                    std::index_sequence<Tdims...>,
+                    std::size_t,
+                    0,
+                    Dim<TExtentVec>::value>::value,
+                "The values in the index_sequence have to be in the range [0,dim-1]!");
             static_assert(
-                meta::IntegerSequenceValuesUnique<meta::IndexSequence<Tdims...>>::value,
-                "The values in the IndexSequence have to be unique!");
+                meta::IntegerSequenceValuesUnique<std::index_sequence<Tdims...>>::value,
+                "The values in the index_sequence have to be unique!");
 
-            auto idx(
-                vec::Vec<dim::Dim<TExtentVec>, idx::Idx<TExtentVec>>::zeros());
+            auto idx(Vec<Dim<TExtentVec>, Idx<TExtentVec>>::zeros());
 
-            detail::NdLoop<
-                meta::IndexSequence<Tdims...>>
-            ::template ndLoop(
-                    idx,
-                    extent,
-                    f);
+            detail::NdLoop<std::index_sequence<Tdims...>>::ndLoop(idx, extent, f);
         }
         //-----------------------------------------------------------------------------
         //! Loops over an n-dimensional iteration index variable calling f(idx, args...) for each iteration.
@@ -174,18 +136,10 @@ namespace alpaka
         //! \param extent N-dimensional loop extent.
         //! \param f The function called at each iteration.
         ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            typename TExtentVec,
-            typename TFnObj>
-        ALPAKA_FN_HOST_ACC auto ndLoopIncIdx(
-            TExtentVec const & extent,
-            TFnObj const & f)
-        -> void
+        template<typename TExtentVec, typename TFnObj>
+        ALPAKA_FN_HOST_ACC auto ndLoopIncIdx(TExtentVec const& extent, TFnObj const& f) -> void
         {
-            ndLoop(
-                meta::MakeIndexSequence<dim::Dim<TExtentVec>::value>(),
-                extent,
-                f);
+            ndLoop(std::make_index_sequence<Dim<TExtentVec>::value>(), extent, f);
         }
-    }
-}
+    } // namespace meta
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/include/alpaka/meta/Set.hpp b/thirdParty/cupla/alpaka/include/alpaka/meta/Set.hpp
index aeb2aea6ac..129e19d9b3 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/meta/Set.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/meta/Set.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Benjamin Worpitz
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -19,14 +19,13 @@ namespace alpaka
         {
             //#############################################################################
             //! Empty dependent type.
-            template<
-                typename T>
+            template<typename T>
             struct Empty
-            {};
+            {
+            };
 
             //#############################################################################
-            template<
-                typename... Ts>
+            template<typename... Ts>
             struct IsParameterPackSetImpl;
             //#############################################################################
             template<>
@@ -37,44 +36,36 @@ namespace alpaka
             //#############################################################################
             // Based on code by Roland Bock: https://gist.github.com/rbock/ad8eedde80c060132a18
             // Linearly inherits from empty<T> and checks if it has already inherited from this type.
-            template<
-                typename T,
-                typename... Ts>
-            struct IsParameterPackSetImpl<T, Ts...> :
-                public IsParameterPackSetImpl<Ts...>,
-                public virtual Empty<T>
+            template<typename T, typename... Ts>
+            struct IsParameterPackSetImpl<T, Ts...>
+                : public IsParameterPackSetImpl<Ts...>
+                , public virtual Empty<T>
             {
                 using Base = IsParameterPackSetImpl<Ts...>;
 
                 static constexpr bool value = Base::value && !std::is_base_of<Empty<T>, Base>::value;
             };
-        }
+        } // namespace detail
         //#############################################################################
         //! Trait that tells if the parameter pack contains only unique (no equal) types.
-        template<
-            typename... Ts>
+        template<typename... Ts>
         using IsParameterPackSet = detail::IsParameterPackSetImpl<Ts...>;
 
         namespace detail
         {
             //#############################################################################
-            template<
-                typename TList>
+            template<typename TList>
             struct IsSetImpl;
             //#############################################################################
-            template<
-                template<typename...> class TList,
-                typename... Ts>
-            struct IsSetImpl<
-                TList<Ts...>>
+            template<template<typename...> class TList, typename... Ts>
+            struct IsSetImpl<TList<Ts...>>
             {
                 static constexpr bool value = IsParameterPackSet<Ts...>::value;
             };
-        }
+        } // namespace detail
         //#############################################################################
         //! Trait that tells if the template contains only unique (no equal) types.
-        template<
-            typename TList>
+        template<typename TList>
         using IsSet = detail::IsSetImpl<TList>;
-    }
-}
+    } // namespace meta
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/include/alpaka/meta/Transform.hpp b/thirdParty/cupla/alpaka/include/alpaka/meta/Transform.hpp
index 22c2805fe9..b0491b1fcc 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/meta/Transform.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/meta/Transform.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Benjamin Worpitz
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -16,28 +16,17 @@ namespace alpaka
         namespace detail
         {
             //#############################################################################
-            template<
-                typename Ts,
-                template<typename...> class TOp>
+            template<typename Ts, template<typename...> class TOp>
             struct TransformImpl;
             //#############################################################################
-            template<
-                template<typename...> class TList,
-                typename... Ts,
-                template<typename...> class TOp>
-            struct TransformImpl<
-                TList<Ts...>,
-                TOp>
+            template<template<typename...> class TList, typename... Ts, template<typename...> class TOp>
+            struct TransformImpl<TList<Ts...>, TOp>
             {
-                using type =
-                    TList<
-                        TOp<Ts>...>;
+                using type = TList<TOp<Ts>...>;
             };
-        }
+        } // namespace detail
         //#############################################################################
-        template<
-            typename Ts,
-            template<typename...> class TOp>
+        template<typename Ts, template<typename...> class TOp>
         using Transform = typename detail::TransformImpl<Ts, TOp>::type;
-    }
-}
+    } // namespace meta
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/include/alpaka/meta/Unique.hpp b/thirdParty/cupla/alpaka/include/alpaka/meta/Unique.hpp
index dbb6adc43a..32ddd1b154 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/meta/Unique.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/meta/Unique.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Benjamin Worpitz
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -9,52 +9,44 @@
 
 #pragma once
 
-#include <type_traits>
-
 #include <alpaka/meta/Metafunctions.hpp>
 
+#include <type_traits>
+
 namespace alpaka
 {
     namespace meta
     {
         namespace detail
         {
-            template<
-                typename T,
-                typename... Ts>
+            template<typename T, typename... Ts>
             struct UniqueHelper
             {
                 using type = T;
             };
 
-            template<
-                template<typename...> class TList,
-                typename... Ts,
-                typename U,
-                typename... Us>
+            template<template<typename...> class TList, typename... Ts, typename U, typename... Us>
             struct UniqueHelper<TList<Ts...>, U, Us...>
-                : std::conditional<(Disjunction<std::is_same<U, Ts>...>::value)
-                    , UniqueHelper<TList<Ts...>, Us...>
-                    , UniqueHelper<TList<Ts..., U>, Us...>>::type
-            {};
+                : std::conditional<
+                      (Disjunction<std::is_same<U, Ts>...>::value),
+                      UniqueHelper<TList<Ts...>, Us...>,
+                      UniqueHelper<TList<Ts..., U>, Us...>>::type
+            {
+            };
 
-            template<
-                typename T>
+            template<typename T>
             struct UniqueImpl;
 
-            template<
-                template<typename...> class TList,
-                typename... Ts>
+            template<template<typename...> class TList, typename... Ts>
             struct UniqueImpl<TList<Ts...>>
             {
                 using type = typename UniqueHelper<TList<>, Ts...>::type;
             };
-        }
+        } // namespace detail
 
         //#############################################################################
         //! Trait that returns a list with only unique (no equal) types (a set). Duplicates will be filtered out.
-        template<
-            typename TList>
+        template<typename TList>
         using Unique = typename detail::UniqueImpl<TList>::type;
-    }
-}
+    } // namespace meta
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/include/alpaka/meta/Void.hpp b/thirdParty/cupla/alpaka/include/alpaka/meta/Void.hpp
new file mode 100644
index 0000000000..a88057dbbd
--- /dev/null
+++ b/thirdParty/cupla/alpaka/include/alpaka/meta/Void.hpp
@@ -0,0 +1,21 @@
+/* Copyright 2020 Sergei Bastrakov
+ *
+ * This file is part of alpaka.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#pragma once
+
+namespace alpaka
+{
+    namespace meta
+    {
+        //#############################################################################
+        //! Mirror of C++17 std::void_t, maps a sequence of any types to type void
+        template<class...>
+        using Void = void;
+    } // namespace meta
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/include/alpaka/offset/Traits.hpp b/thirdParty/cupla/alpaka/include/alpaka/offset/Traits.hpp
index 996a37407b..1f440b32d4 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/offset/Traits.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/offset/Traits.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Benjamin Worpitz
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -9,205 +9,129 @@
 
 #pragma once
 
+#include <alpaka/core/Common.hpp>
 #include <alpaka/dim/DimIntegralConst.hpp>
 #include <alpaka/idx/Traits.hpp>
-#include <alpaka/core/Common.hpp>
 
 #include <type_traits>
 
 namespace alpaka
 {
     //-----------------------------------------------------------------------------
-    //! The offset specifics.
-    namespace offset
+    //! The offset traits.
+    namespace traits
     {
-        //-----------------------------------------------------------------------------
-        //! The offset traits.
-        namespace traits
+        //#############################################################################
+        //! The x offset get trait.
+        //!
+        //! If not specialized explicitly it returns 0.
+        template<typename TIdx, typename TOffsets, typename TSfinae = void>
+        struct GetOffset
         {
-            //#############################################################################
-            //! The x offset get trait.
-            //!
-            //! If not specialized explicitly it returns 0.
-            template<
-                typename TIdx,
-                typename TOffsets,
-                typename TSfinae = void>
-            struct GetOffset
+            ALPAKA_NO_HOST_ACC_WARNING
+            ALPAKA_FN_HOST_ACC static auto getOffset(TOffsets const&) -> Idx<TOffsets>
             {
-                ALPAKA_NO_HOST_ACC_WARNING
-                ALPAKA_FN_HOST_ACC static auto getOffset(
-                    TOffsets const &)
-                -> idx::Idx<TOffsets>
-                {
-                    return static_cast<idx::Idx<TOffsets>>(0);
-                }
-            };
+                return static_cast<Idx<TOffsets>>(0);
+            }
+        };
 
-            //#############################################################################
-            //! The x offset set trait.
-            template<
-                typename TIdx,
-                typename TOffsets,
-                typename TOffset,
-                typename TSfinae = void>
-            struct SetOffset;
-        }
+        //#############################################################################
+        //! The x offset set trait.
+        template<typename TIdx, typename TOffsets, typename TOffset, typename TSfinae = void>
+        struct SetOffset;
+    } // namespace traits
 
-        //-----------------------------------------------------------------------------
-        //! \return The offset in the given dimension.
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            std::size_t Tidx,
-            typename TOffsets>
-        ALPAKA_FN_HOST_ACC auto getOffset(
-            TOffsets const & offsets)
-        -> idx::Idx<TOffsets>
-        {
-            return
-                traits::GetOffset<
-                    dim::DimInt<Tidx>,
-                    TOffsets>
-                ::getOffset(
-                    offsets);
-        }
-        //-----------------------------------------------------------------------------
-        //! \return The offset in x dimension.
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            typename TOffsets>
-        ALPAKA_FN_HOST_ACC auto getOffsetX(
-            TOffsets const & offsets = TOffsets())
-        -> idx::Idx<TOffsets>
-        {
-            return getOffset<dim::Dim<TOffsets>::value - 1u>(offsets);
-        }
-        //-----------------------------------------------------------------------------
-        //! \return The offset in y dimension.
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            typename TOffsets>
-        ALPAKA_FN_HOST_ACC auto getOffsetY(
-            TOffsets const & offsets = TOffsets())
-        -> idx::Idx<TOffsets>
-        {
-            return getOffset<dim::Dim<TOffsets>::value - 2u>(offsets);
-        }
-        //-----------------------------------------------------------------------------
-        //! \return The offset in z dimension.
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            typename TOffsets>
-        ALPAKA_FN_HOST_ACC auto getOffsetZ(
-            TOffsets const & offsets = TOffsets())
-        -> idx::Idx<TOffsets>
-        {
-            return getOffset<dim::Dim<TOffsets>::value - 3u>(offsets);
-        }
+    //-----------------------------------------------------------------------------
+    //! \return The offset in the given dimension.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<std::size_t Tidx, typename TOffsets>
+    ALPAKA_FN_HOST_ACC auto getOffset(TOffsets const& offsets) -> Idx<TOffsets>
+    {
+        return traits::GetOffset<DimInt<Tidx>, TOffsets>::getOffset(offsets);
+    }
+    //-----------------------------------------------------------------------------
+    //! \return The offset in x dimension.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename TOffsets>
+    ALPAKA_FN_HOST_ACC auto getOffsetX(TOffsets const& offsets = TOffsets()) -> Idx<TOffsets>
+    {
+        return getOffset<Dim<TOffsets>::value - 1u>(offsets);
+    }
+    //-----------------------------------------------------------------------------
+    //! \return The offset in y dimension.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename TOffsets>
+    ALPAKA_FN_HOST_ACC auto getOffsetY(TOffsets const& offsets = TOffsets()) -> Idx<TOffsets>
+    {
+        return getOffset<Dim<TOffsets>::value - 2u>(offsets);
+    }
+    //-----------------------------------------------------------------------------
+    //! \return The offset in z dimension.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename TOffsets>
+    ALPAKA_FN_HOST_ACC auto getOffsetZ(TOffsets const& offsets = TOffsets()) -> Idx<TOffsets>
+    {
+        return getOffset<Dim<TOffsets>::value - 3u>(offsets);
+    }
 
-        //-----------------------------------------------------------------------------
-        //! Sets the offset in the given dimension.
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            std::size_t Tidx,
-            typename TOffsets,
-            typename TOffset>
-        ALPAKA_FN_HOST_ACC auto setOffset(
-            TOffsets const & offsets,
-            TOffset const & offset)
-        -> void
-        {
-            traits::SetOffset<
-                dim::DimInt<Tidx>,
-                TOffsets,
-                TOffset>
-            ::setOffset(
-                offsets,
-                offset);
-        }
-        //-----------------------------------------------------------------------------
-        //! Sets the offset in x dimension.
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            typename TOffsets,
-            typename TOffset>
-        ALPAKA_FN_HOST_ACC auto setOffsetX(
-            TOffsets const & offsets,
-            TOffset const & offset)
-        -> void
-        {
-            setOffset<dim::Dim<TOffsets>::value - 1u>(offsets, offset);
-        }
-        //-----------------------------------------------------------------------------
-        //! Sets the offset in y dimension.
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            typename TOffsets,
-            typename TOffset>
-        ALPAKA_FN_HOST_ACC auto setOffsetY(
-            TOffsets const & offsets,
-            TOffset const & offset)
-        -> void
-        {
-            setOffset<dim::Dim<TOffsets>::value - 2u>(offsets, offset);
-        }
-        //-----------------------------------------------------------------------------
-        //! Sets the offset in z dimension.
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            typename TOffsets,
-            typename TOffset>
-        ALPAKA_FN_HOST_ACC auto setOffsetZ(
-            TOffsets const & offsets,
-            TOffset const & offset)
-        -> void
-        {
-            setOffset<dim::Dim<TOffsets>::value - 3u>(offsets, offset);
-        }
+    //-----------------------------------------------------------------------------
+    //! Sets the offset in the given dimension.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<std::size_t Tidx, typename TOffsets, typename TOffset>
+    ALPAKA_FN_HOST_ACC auto setOffset(TOffsets const& offsets, TOffset const& offset) -> void
+    {
+        traits::SetOffset<DimInt<Tidx>, TOffsets, TOffset>::setOffset(offsets, offset);
+    }
+    //-----------------------------------------------------------------------------
+    //! Sets the offset in x dimension.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename TOffsets, typename TOffset>
+    ALPAKA_FN_HOST_ACC auto setOffsetX(TOffsets const& offsets, TOffset const& offset) -> void
+    {
+        setOffset<Dim<TOffsets>::value - 1u>(offsets, offset);
+    }
+    //-----------------------------------------------------------------------------
+    //! Sets the offset in y dimension.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename TOffsets, typename TOffset>
+    ALPAKA_FN_HOST_ACC auto setOffsetY(TOffsets const& offsets, TOffset const& offset) -> void
+    {
+        setOffset<Dim<TOffsets>::value - 2u>(offsets, offset);
+    }
+    //-----------------------------------------------------------------------------
+    //! Sets the offset in z dimension.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename TOffsets, typename TOffset>
+    ALPAKA_FN_HOST_ACC auto setOffsetZ(TOffsets const& offsets, TOffset const& offset) -> void
+    {
+        setOffset<Dim<TOffsets>::value - 3u>(offsets, offset);
+    }
 
-        //-----------------------------------------------------------------------------
-        // Trait specializations for unsigned integral types.
-        namespace traits
+    //-----------------------------------------------------------------------------
+    // Trait specializations for integral types.
+    namespace traits
+    {
+        //#############################################################################
+        //! The integral x offset get trait specialization.
+        template<typename TOffsets>
+        struct GetOffset<DimInt<0u>, TOffsets, std::enable_if_t<std::is_integral<TOffsets>::value>>
         {
-            //#############################################################################
-            //! The unsigned integral x offset get trait specialization.
-            template<
-                typename TOffsets>
-            struct GetOffset<
-                dim::DimInt<0u>,
-                TOffsets,
-                typename std::enable_if<
-                    std::is_integral<TOffsets>::value>::type>
+            ALPAKA_NO_HOST_ACC_WARNING
+            ALPAKA_FN_HOST_ACC static auto getOffset(TOffsets const& offset) -> Idx<TOffsets>
             {
-                ALPAKA_NO_HOST_ACC_WARNING
-                ALPAKA_FN_HOST_ACC static auto getOffset(
-                    TOffsets const & offset)
-                -> idx::Idx<TOffsets>
-                {
-                    return offset;
-                }
-            };
-            //#############################################################################
-            //! The unsigned integral x offset set trait specialization.
-            template<
-                typename TOffsets,
-                typename TOffset>
-            struct SetOffset<
-                dim::DimInt<0u>,
-                TOffsets,
-                TOffset,
-                typename std::enable_if<
-                    std::is_integral<TOffsets>::value>::type>
+                return offset;
+            }
+        };
+        //#############################################################################
+        //! The integral x offset set trait specialization.
+        template<typename TOffsets, typename TOffset>
+        struct SetOffset<DimInt<0u>, TOffsets, TOffset, std::enable_if_t<std::is_integral<TOffsets>::value>>
+        {
+            ALPAKA_NO_HOST_ACC_WARNING
+            ALPAKA_FN_HOST_ACC static auto setOffset(TOffsets const& offsets, TOffset const& offset) -> void
             {
-                ALPAKA_NO_HOST_ACC_WARNING
-                ALPAKA_FN_HOST_ACC static auto setOffset(
-                    TOffsets const & offsets,
-                    TOffset const & offset)
-                -> void
-                {
-                    offsets = offset;
-                }
-            };
-        }
-    }
-}
+                offsets = offset; // NOTE(review): assigns through 'TOffsets const&' - ill-formed once instantiated
+            }
+        };
+    } // namespace traits
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/include/alpaka/pltf/PltfCpu.hpp b/thirdParty/cupla/alpaka/include/alpaka/pltf/PltfCpu.hpp
index 61c2328762..45df3d941e 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/pltf/PltfCpu.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/pltf/PltfCpu.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Benjamin Worpitz
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -9,86 +9,69 @@
 
 #pragma once
 
-#include <alpaka/pltf/Traits.hpp>
-#include <alpaka/dev/DevCpu.hpp>
 #include <alpaka/core/Concepts.hpp>
+#include <alpaka/dev/DevCpu.hpp>
+#include <alpaka/pltf/Traits.hpp>
 
 #include <sstream>
 #include <vector>
 
 namespace alpaka
 {
-    namespace pltf
+    //#############################################################################
+    //! The CPU device platform.
+    class PltfCpu : public concepts::Implements<ConceptPltf, PltfCpu>
+    {
+    public:
+        //-----------------------------------------------------------------------------
+        ALPAKA_FN_HOST PltfCpu() = delete;
+    };
+
+    namespace traits
     {
         //#############################################################################
-        //! The CPU device platform.
-        class PltfCpu :
-            public concepts::Implements<ConceptPltf, PltfCpu>
+        //! The CPU platform device type trait specialization.
+        template<>
+        struct DevType<PltfCpu>
         {
-        public:
-            //-----------------------------------------------------------------------------
-            ALPAKA_FN_HOST PltfCpu() = delete;
+            using type = DevCpu;
         };
-    }
 
-    namespace dev
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CPU device device type trait specialization.
-            template<>
-            struct DevType<
-                pltf::PltfCpu>
-            {
-                using type = dev::DevCpu;
-            };
-        }
-    }
-    namespace pltf
-    {
-        namespace traits
+        //#############################################################################
+        //! The CPU platform device count get trait specialization.
+        template<>
+        struct GetDevCount<PltfCpu>
         {
-            //#############################################################################
-            //! The CPU platform device count get trait specialization.
-            template<>
-            struct GetDevCount<
-                pltf::PltfCpu>
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto getDevCount() -> std::size_t
             {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto getDevCount()
-                -> std::size_t
-                {
-                    ALPAKA_DEBUG_FULL_LOG_SCOPE;
+                ALPAKA_DEBUG_FULL_LOG_SCOPE;
 
-                    return 1;
-                }
-            };
+                return 1;
+            }
+        };
 
-            //#############################################################################
-            //! The CPU platform device get trait specialization.
-            template<>
-            struct GetDevByIdx<
-                pltf::PltfCpu>
+        //#############################################################################
+        //! The CPU platform device get trait specialization.
+        template<>
+        struct GetDevByIdx<PltfCpu>
+        {
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto getDevByIdx(std::size_t const& devIdx) -> DevCpu
             {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto getDevByIdx(
-                    std::size_t const & devIdx)
-                -> dev::DevCpu
-                {
-                    ALPAKA_DEBUG_FULL_LOG_SCOPE;
-
-                    std::size_t const devCount(pltf::getDevCount<pltf::PltfCpu>());
-                    if(devIdx >= devCount)
-                    {
-                        std::stringstream ssErr;
-                        ssErr << "Unable to return device handle for CPU device with index " << devIdx << " because there are only " << devCount << " devices!";
-                        throw std::runtime_error(ssErr.str());
-                    }
+                ALPAKA_DEBUG_FULL_LOG_SCOPE;
 
-                    return {};
+                std::size_t const devCount(getDevCount<PltfCpu>());
+                if(devIdx >= devCount)
+                {
+                    std::stringstream ssErr;
+                    ssErr << "Unable to return device handle for CPU device with index " << devIdx
+                          << " because there are only " << devCount << " devices!";
+                    throw std::runtime_error(ssErr.str());
                 }
-            };
-        }
-    }
-}
+
+                return {};
+            }
+        };
+    } // namespace traits
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/include/alpaka/pltf/PltfCudaRt.hpp b/thirdParty/cupla/alpaka/include/alpaka/pltf/PltfCudaRt.hpp
deleted file mode 100644
index 3fa715bf6f..0000000000
--- a/thirdParty/cupla/alpaka/include/alpaka/pltf/PltfCudaRt.hpp
+++ /dev/null
@@ -1,254 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz, René Widera
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-#include <alpaka/core/Concepts.hpp>
-
-#if !BOOST_LANG_CUDA
-    #error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
-#endif
-
-#include <alpaka/dev/Traits.hpp>
-#include <alpaka/dev/DevCudaRt.hpp>
-
-#include <alpaka/core/Cuda.hpp>
-
-#include <iostream>
-#include <sstream>
-#include <stdexcept>
-
-namespace alpaka
-{
-    namespace pltf
-    {
-        //#############################################################################
-        //! The CUDA RT device manager.
-        class PltfCudaRt :
-            public concepts::Implements<ConceptPltf, PltfCudaRt>
-        {
-        public:
-            //-----------------------------------------------------------------------------
-            ALPAKA_FN_HOST PltfCudaRt() = delete;
-        };
-    }
-
-    namespace dev
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CUDA RT device manager device type trait specialization.
-            template<>
-            struct DevType<
-                pltf::PltfCudaRt>
-            {
-                using type = dev::DevCudaRt;
-            };
-        }
-    }
-    namespace pltf
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CPU platform device count get trait specialization.
-            template<>
-            struct GetDevCount<
-                pltf::PltfCudaRt>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto getDevCount()
-                -> std::size_t
-                {
-                    ALPAKA_DEBUG_FULL_LOG_SCOPE;
-
-                    int iNumDevices(0);
-                    cudaError_t error = cudaGetDeviceCount(&iNumDevices);
-                    if(error != cudaSuccess)
-                        iNumDevices = 0;
-
-                    return static_cast<std::size_t>(iNumDevices);
-                }
-            };
-
-            //#############################################################################
-            //! The CPU platform device get trait specialization.
-            template<>
-            struct GetDevByIdx<
-                pltf::PltfCudaRt>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto getDevByIdx(
-                    std::size_t const & devIdx)
-                -> dev::DevCudaRt
-                {
-                    ALPAKA_DEBUG_FULL_LOG_SCOPE;
-
-                    dev::DevCudaRt dev;
-
-                    std::size_t const devCount(pltf::getDevCount<pltf::PltfCudaRt>());
-                    if(devIdx >= devCount)
-                    {
-                        std::stringstream ssErr;
-                        ssErr << "Unable to return device handle for device " << devIdx << ". There are only " << devCount << " CUDA devices!";
-                        throw std::runtime_error(ssErr.str());
-                    }
-
-                    if(isDevUsable(devIdx))
-                    {
-                        dev.m_iDevice = static_cast<int>(devIdx);
-
-                        // Log this device.
-    #if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
-                        cudaDeviceProp devProp;
-                        ALPAKA_CUDA_RT_CHECK(cudaGetDeviceProperties(&devProp, dev.m_iDevice));
-    #endif
-    #if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                        printDeviceProperties(devProp);
-    #elif ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
-                        std::cout << __func__ << devProp.name << std::endl;
-    #endif
-                    }
-                    else
-                    {
-                        std::stringstream ssErr;
-                        ssErr << "Unable to return device handle for device " << devIdx << ". It is not accessible!";
-                        throw std::runtime_error(ssErr.str());
-                    }
-
-                    return dev;
-                }
-
-            private:
-                //-----------------------------------------------------------------------------
-                //! \return If the device is usable.
-                ALPAKA_FN_HOST static auto isDevUsable(
-                    std::size_t iDevice)
-                -> bool
-                {
-                    cudaError rc(cudaSetDevice(static_cast<int>(iDevice)));
-
-                    cudaStream_t queue = {};
-                    // Create a dummy queue to check if the device is already used by an other process.
-                    // cudaSetDevice never returns an error if another process already uses the selected device and gpu compute mode is set "process exclusive".
-                    // \TODO: Check if this workaround is needed!
-                    if(rc == cudaSuccess)
-                    {
-                        rc = cudaStreamCreate(&queue);
-                    }
-
-                    if(rc == cudaSuccess)
-                    {
-                        // Destroy the dummy queue.
-                        ALPAKA_CUDA_RT_CHECK(
-                            cudaStreamDestroy(
-                                queue));
-                        return true;
-                    }
-                    else
-                    {
-                        // Return the previous error from cudaStreamCreate.
-                        ALPAKA_CUDA_RT_CHECK(
-                            rc);
-                        // Reset the Error state.
-                        cudaGetLastError();
-
-                        return false;
-                    }
-                }
-
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                //-----------------------------------------------------------------------------
-                //! Prints all the device properties to std::cout.
-                ALPAKA_FN_HOST static auto printDeviceProperties(
-                    cudaDeviceProp const & devProp)
-                -> void
-                {
-                    ALPAKA_DEBUG_FULL_LOG_SCOPE;
-
-                    std::size_t const kiB(1024);
-                    std::size_t const miB(kiB * kiB);
-                    std::cout << "name: " << devProp.name << std::endl;
-                    std::cout << "totalGlobalMem: " << devProp.totalGlobalMem/miB << " MiB" << std::endl;
-                    std::cout << "sharedMemPerBlock: " << devProp.sharedMemPerBlock/kiB << " KiB" << std::endl;
-                    std::cout << "regsPerBlock: " << devProp.regsPerBlock << std::endl;
-                    std::cout << "warpSize: " << devProp.warpSize << std::endl;
-                    std::cout << "memPitch: " << devProp.memPitch << " B" << std::endl;
-                    std::cout << "maxThreadsPerBlock: " << devProp.maxThreadsPerBlock << std::endl;
-                    std::cout << "maxThreadsDim[3]: (" << devProp.maxThreadsDim[0] << ", " << devProp.maxThreadsDim[1] << ", " << devProp.maxThreadsDim[2] << ")" << std::endl;
-                    std::cout << "maxGridSize[3]: (" << devProp.maxGridSize[0] << ", " << devProp.maxGridSize[1] << ", " << devProp.maxGridSize[2] << ")" << std::endl;
-                    std::cout << "clockRate: " << devProp.clockRate << " kHz" << std::endl;
-                    std::cout << "totalConstMem: " << devProp.totalConstMem/kiB << " KiB" << std::endl;
-                    std::cout << "major: " << devProp.major << std::endl;
-                    std::cout << "minor: " << devProp.minor << std::endl;
-                    std::cout << "textureAlignment: " << devProp.textureAlignment << std::endl;
-                    std::cout << "texturePitchAlignment: " << devProp.texturePitchAlignment << std::endl;
-                    //std::cout << "deviceOverlap: " << devProp.deviceOverlap << std::endl;    // Deprecated
-                    std::cout << "multiProcessorCount: " << devProp.multiProcessorCount << std::endl;
-                    std::cout << "kernelExecTimeoutEnabled: " << devProp.kernelExecTimeoutEnabled << std::endl;
-                    std::cout << "integrated: " << devProp.integrated << std::endl;
-                    std::cout << "canMapHostMemory: " << devProp.canMapHostMemory << std::endl;
-                    std::cout << "computeMode: " << devProp.computeMode << std::endl;
-                    std::cout << "maxTexture1D: " << devProp.maxTexture1D << std::endl;
-                    std::cout << "maxTexture1DLinear: " << devProp.maxTexture1DLinear << std::endl;
-                    std::cout << "maxTexture2D[2]: " << devProp.maxTexture2D[0] << "x" << devProp.maxTexture2D[1] << std::endl;
-                    std::cout << "maxTexture2DLinear[3]: " << devProp.maxTexture2DLinear[0] << "x" << devProp.maxTexture2DLinear[1] << "x" << devProp.maxTexture2DLinear[2] << std::endl;
-                    std::cout << "maxTexture2DGather[2]: " << devProp.maxTexture2DGather[0] << "x" << devProp.maxTexture2DGather[1] << std::endl;
-                    std::cout << "maxTexture3D[3]: " << devProp.maxTexture3D[0] << "x" << devProp.maxTexture3D[1] << "x" << devProp.maxTexture3D[2] << std::endl;
-                    std::cout << "maxTextureCubemap: " << devProp.maxTextureCubemap << std::endl;
-                    std::cout << "maxTexture1DLayered[2]: " << devProp.maxTexture1DLayered[0] << "x" << devProp.maxTexture1DLayered[1] << std::endl;
-                    std::cout << "maxTexture2DLayered[3]: " << devProp.maxTexture2DLayered[0] << "x" << devProp.maxTexture2DLayered[1] << "x" << devProp.maxTexture2DLayered[2] << std::endl;
-                    std::cout << "maxTextureCubemapLayered[2]: " << devProp.maxTextureCubemapLayered[0] << "x" << devProp.maxTextureCubemapLayered[1] << std::endl;
-                    std::cout << "maxSurface1D: " << devProp.maxSurface1D << std::endl;
-                    std::cout << "maxSurface2D[2]: " << devProp.maxSurface2D[0] << "x" << devProp.maxSurface2D[1] << std::endl;
-                    std::cout << "maxSurface3D[3]: " << devProp.maxSurface3D[0] << "x" << devProp.maxSurface3D[1] << "x" << devProp.maxSurface3D[2] << std::endl;
-                    std::cout << "maxSurface1DLayered[2]: " << devProp.maxSurface1DLayered[0] << "x" << devProp.maxSurface1DLayered[1] << std::endl;
-                    std::cout << "maxSurface2DLayered[3]: " << devProp.maxSurface2DLayered[0] << "x" << devProp.maxSurface2DLayered[1] << "x" << devProp.maxSurface2DLayered[2] << std::endl;
-                    std::cout << "maxSurfaceCubemap: " << devProp.maxSurfaceCubemap << std::endl;
-                    std::cout << "maxSurfaceCubemapLayered[2]: " << devProp.maxSurfaceCubemapLayered[0] << "x" << devProp.maxSurfaceCubemapLayered[1] << std::endl;
-                    std::cout << "surfaceAlignment: " << devProp.surfaceAlignment << std::endl;
-                    std::cout << "concurrentKernels: " << devProp.concurrentKernels << std::endl;
-                    std::cout << "ECCEnabled: " << devProp.ECCEnabled << std::endl;
-                    std::cout << "pciBusID: " << devProp.pciBusID << std::endl;
-                    std::cout << "pciDeviceID: " << devProp.pciDeviceID << std::endl;
-                    std::cout << "pciDomainID: " << devProp.pciDomainID << std::endl;
-                    std::cout << "tccDriver: " << devProp.tccDriver << std::endl;
-                    std::cout << "asyncEngineCount: " << devProp.asyncEngineCount << std::endl;
-                    std::cout << "unifiedAddressing: " << devProp.unifiedAddressing << std::endl;
-                    std::cout << "memoryClockRate: " << devProp.memoryClockRate << " kHz" << std::endl;
-                    std::cout << "memoryBusWidth: " << devProp.memoryBusWidth << " b" << std::endl;
-                    std::cout << "l2CacheSize: " << devProp.l2CacheSize << " B" << std::endl;
-                    std::cout << "maxThreadsPerMultiProcessor: " << devProp.maxThreadsPerMultiProcessor << std::endl;
-                    std::cout << "streamPrioritiesSupported: " << devProp.streamPrioritiesSupported << std::endl;
-                    std::cout << "globalL1CacheSupported: " << devProp.globalL1CacheSupported << std::endl;
-                    std::cout << "localL1CacheSupported: " << devProp.localL1CacheSupported << std::endl;
-                    std::cout << "sharedMemPerMultiprocessor: " << devProp.sharedMemPerMultiprocessor << std::endl;
-                    std::cout << "regsPerMultiprocessor: " << devProp.regsPerMultiprocessor << std::endl;
-                    std::cout << "managedMemory: " << devProp.managedMemory << std::endl;
-                    std::cout << "isMultiGpuBoard: " << devProp.isMultiGpuBoard << std::endl;
-                    std::cout << "multiGpuBoardGroupID: " << devProp.multiGpuBoardGroupID << std::endl;
-                    std::cout << "singleToDoublePrecisionPerfRatio: " << devProp.singleToDoublePrecisionPerfRatio << std::endl;
-                    std::cout << "pageableMemoryAccess: " << devProp.pageableMemoryAccess << std::endl;
-                    std::cout << "concurrentManagedAccess: " << devProp.concurrentManagedAccess << std::endl;
-                    std::cout << "computePreemptionSupported: " << devProp.computePreemptionSupported << std::endl;
-                    std::cout << "canUseHostPointerForRegisteredMem: " << devProp.canUseHostPointerForRegisteredMem << std::endl;
-                    std::cout << "cooperativeLaunch: " << devProp.cooperativeLaunch << std::endl;
-                    std::cout << "cooperativeMultiDeviceLaunch: " << devProp.cooperativeMultiDeviceLaunch << std::endl;
-                }
-#endif
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/pltf/PltfHipRt.hpp b/thirdParty/cupla/alpaka/include/alpaka/pltf/PltfHipRt.hpp
deleted file mode 100644
index ec9abf1393..0000000000
--- a/thirdParty/cupla/alpaka/include/alpaka/pltf/PltfHipRt.hpp
+++ /dev/null
@@ -1,236 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz, Matthias Werner, René Widera
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-#include <alpaka/core/Concepts.hpp>
-
-#if !BOOST_LANG_HIP
-    #error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
-#endif
-
-#include <alpaka/dev/Traits.hpp>
-#include <alpaka/dev/DevHipRt.hpp>
-
-#include <alpaka/core/Hip.hpp>
-
-#include <iostream>
-#include <sstream>
-#include <stdexcept>
-
-namespace alpaka
-{
-    namespace pltf
-    {
-        //#############################################################################
-        //! The HIP RT device manager.
-        class PltfHipRt :
-            public concepts::Implements<ConceptPltf, PltfHipRt>
-        {
-        public:
-            //-----------------------------------------------------------------------------
-            //! Constructor.
-            ALPAKA_FN_HOST PltfHipRt() = delete;
-        };
-    }
-
-    namespace dev
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The HIP RT device manager device type trait specialization.
-            template<>
-            struct DevType<
-                pltf::PltfHipRt>
-            {
-                using type = dev::DevHipRt;
-            };
-        }
-    }
-    namespace pltf
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CPU platform device count get trait specialization.
-            template<>
-            struct GetDevCount<
-                pltf::PltfHipRt>
-            {
-                //-----------------------------------------------------------------------------
-
-                ALPAKA_FN_HOST static auto getDevCount()
-                -> std::size_t
-                {
-                    ALPAKA_DEBUG_FULL_LOG_SCOPE;
-
-                    int iNumDevices(0);
-                    hipError_t error = hipGetDeviceCount(&iNumDevices);
-                    if(error != hipSuccess)
-                        iNumDevices = 0;
-                    return static_cast<std::size_t>(iNumDevices);
-                }
-            };
-
-            //#############################################################################
-            //! The CPU platform device get trait specialization.
-            template<>
-            struct GetDevByIdx<
-                pltf::PltfHipRt>
-            {
-                //-----------------------------------------------------------------------------
-
-                ALPAKA_FN_HOST static auto getDevByIdx(
-                    std::size_t const & devIdx)
-                -> dev::DevHipRt
-                {
-                    ALPAKA_DEBUG_FULL_LOG_SCOPE;
-
-                    dev::DevHipRt dev;
-
-                    std::size_t const devCount(pltf::getDevCount<pltf::PltfHipRt>());
-                    if(devIdx >= devCount)
-                    {
-                        std::stringstream ssErr;
-                        ssErr << "Unable to return device handle for device " << devIdx << ". There are only " << devCount << " HIP devices!";
-                        throw std::runtime_error(ssErr.str());
-                    }
-
-                    if(isDevUsable(devIdx))
-                    {
-                        dev.m_iDevice = static_cast<int>(devIdx);
-
-                        // Log this device.
-    #if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
-                        hipDeviceProp_t devProp;
-                        ALPAKA_HIP_RT_CHECK(hipGetDeviceProperties(&devProp, dev.m_iDevice));
-    #endif
-    #if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                        printDeviceProperties(devProp);
-    #elif ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
-                        std::cout << __func__ << devProp.name << std::endl;
-    #endif
-                    }
-                    else
-                    {
-                        std::stringstream ssErr;
-                        ssErr << "Unable to return device handle for device " << devIdx << ". It is not accessible!";
-                        throw std::runtime_error(ssErr.str());
-                    }
-
-                    return dev;
-                }
-
-            private:
-                //-----------------------------------------------------------------------------
-                //! \return If the device is usable.
-                ALPAKA_FN_HOST static auto isDevUsable(
-                    std::size_t iDevice)
-                -> bool
-                {
-                    hipError_t rc(hipSetDevice(static_cast<int>(iDevice)));
-
-                    hipStream_t queue = {};
-                    // Create a dummy queue to check if the device is already used by an other process.
-                    // hipSetDevice never returns an error if another process already uses the selected device and gpu compute mode is set "process exclusive".
-                    // \TODO: Check if this workaround is needed!
-                    if(rc == hipSuccess)
-                    {
-                        rc = hipStreamCreate(&queue);
-                    }
-
-                    if(rc == hipSuccess)
-                    {
-                        // Destroy the dummy queue.
-                        ALPAKA_HIP_RT_CHECK(
-                            hipStreamDestroy(
-                                queue));
-                        return true;
-                    }
-                    else
-                    {
-                        // Return the previous error from hipStreamCreate.
-                        ALPAKA_HIP_RT_CHECK(
-                            rc);
-                        // Reset the Error state.
-                        hipGetLastError();
-
-                        return false;
-                    }
-                }
-
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                //-----------------------------------------------------------------------------
-                //! Prints all the device properties to std::cout.
-                ALPAKA_FN_HOST static auto printDeviceProperties(
-                    hipDeviceProp_t const & devProp)
-                -> void
-                {
-                    ALPAKA_DEBUG_FULL_LOG_SCOPE;
-
-                    std::size_t const kiB(1024);
-                    std::size_t const miB(kiB * kiB);
-                    std::cout << "name: " << devProp.name << std::endl;
-                    std::cout << "totalGlobalMem: " << devProp.totalGlobalMem/miB << " MiB" << std::endl;
-                    std::cout << "sharedMemPerBlock: " << devProp.sharedMemPerBlock/kiB << " KiB" << std::endl;
-                    std::cout << "regsPerBlock: " << devProp.regsPerBlock << std::endl;
-                    std::cout << "warpSize: " << devProp.warpSize << std::endl;
-                    std::cout << "maxThreadsPerBlock: " << devProp.maxThreadsPerBlock << std::endl;
-                    std::cout << "maxThreadsDim[3]: (" << devProp.maxThreadsDim[0] << ", " << devProp.maxThreadsDim[1] << ", " << devProp.maxThreadsDim[2] << ")" << std::endl;
-                    std::cout << "maxGridSize[3]: (" << devProp.maxGridSize[0] << ", " << devProp.maxGridSize[1] << ", " << devProp.maxGridSize[2] << ")" << std::endl;
-                    std::cout << "clockRate: " << devProp.clockRate << " kHz" << std::endl;
-                    std::cout << "memoryClockRate: " << devProp.memoryClockRate << " kHz" << std::endl;
-                    std::cout << "memoryBusWidth: " << devProp.memoryBusWidth << " b" << std::endl;
-                    std::cout << "totalConstMem: " << devProp.totalConstMem/kiB << " KiB" << std::endl;
-                    std::cout << "major: " << devProp.major << std::endl;
-                    std::cout << "minor: " << devProp.minor << std::endl;
-                    std::cout << "multiProcessorCount: " << devProp.multiProcessorCount << std::endl;
-                    std::cout << "l2CacheSize: " << devProp.l2CacheSize << " B" << std::endl;
-                    std::cout << "maxThreadsPerMultiProcessor: " << devProp.maxThreadsPerMultiProcessor << std::endl;
-                    std::cout << "computeMode: " << devProp.computeMode << std::endl;
-                    std::cout << "clockInstructionRate: " << devProp.clockInstructionRate << "kHz" << std::endl;
-                    std::cout << "arch: " << std::endl;
-                    std::cout << "    hasGlobalInt32Atomics: " << devProp.arch.hasGlobalInt32Atomics << std::endl;
-                    std::cout << "    hasGlobalFloatAtomicExch: " << devProp.arch.hasGlobalFloatAtomicExch << std::endl;
-                    std::cout << "    hasSharedInt32Atomics: " << devProp.arch.hasSharedInt32Atomics << std::endl;
-                    std::cout << "    hasSharedFloatAtomicExch: " << devProp.arch.hasSharedFloatAtomicExch << std::endl;
-                    std::cout << "    hasFloatAtomicAdd: " << devProp.arch.hasFloatAtomicAdd << std::endl;
-                    std::cout << "    hasGlobalInt64Atomics: " << devProp.arch.hasGlobalInt64Atomics << std::endl;
-                    std::cout << "    hasSharedInt64Atomics: " << devProp.arch.hasSharedInt64Atomics << std::endl;
-                    std::cout << "    hasDoubles: " << devProp.arch.hasDoubles << std::endl;
-                    std::cout << "    hasWarpVote: " << devProp.arch.hasWarpVote << std::endl;
-                    std::cout << "    hasWarpBallot: " << devProp.arch.hasWarpBallot << std::endl;
-                    std::cout << "    hasWarpShuffle: " << devProp.arch.hasWarpShuffle << std::endl;
-                    std::cout << "    hasFunnelShift: " << devProp.arch.hasFunnelShift << std::endl;
-                    std::cout << "    hasThreadFenceSystem: " << devProp.arch.hasThreadFenceSystem << std::endl;
-                    std::cout << "    hasSyncThreadsExt: " << devProp.arch.hasSyncThreadsExt << std::endl;
-                    std::cout << "    hasSurfaceFuncs: " << devProp.arch.hasSurfaceFuncs << std::endl;
-                    std::cout << "    has3dGrid: " << devProp.arch.has3dGrid << std::endl;
-                    std::cout << "    hasDynamicParallelism: " << devProp.arch.hasDynamicParallelism << std::endl;
-                    std::cout << "concurrentKernels: " << devProp.concurrentKernels << std::endl;
-                    std::cout << "pciDomainID: " << devProp.pciDomainID << std::endl;
-                    std::cout << "pciBusID: " << devProp.pciBusID << std::endl;
-                    std::cout << "pciDeviceID: " << devProp.pciDeviceID << std::endl;
-                    std::cout << "maxSharedMemoryPerMultiProcessor: " << devProp.maxSharedMemoryPerMultiProcessor/kiB << " KiB" << std::endl;
-                    std::cout << "isMultiGpuBoard: " << devProp.isMultiGpuBoard << std::endl;
-                    std::cout << "canMapHostMemory: " << devProp.canMapHostMemory << std::endl;
-                    std::cout << "gcnArch: " << devProp.gcnArch << std::endl;
-                    std::cout << "integrated: " << devProp.integrated << std::endl;
-                }
-#endif
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/pltf/PltfOacc.hpp b/thirdParty/cupla/alpaka/include/alpaka/pltf/PltfOacc.hpp
new file mode 100644
index 0000000000..fc8f59bac8
--- /dev/null
+++ b/thirdParty/cupla/alpaka/include/alpaka/pltf/PltfOacc.hpp
@@ -0,0 +1,86 @@
+/* Copyright 2019 Benjamin Worpitz
+ *
+ * This file is part of Alpaka.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#pragma once
+
+#ifdef ALPAKA_ACC_ANY_BT_OACC_ENABLED
+
+#    if _OPENACC < 201306
+#        error If ALPAKA_ACC_ANY_BT_OACC_ENABLED is set, the compiler has to support OpenACC 2.0 or higher!
+#    endif
+
+#    include <alpaka/core/Concepts.hpp>
+#    include <alpaka/dev/DevOacc.hpp>
+#    include <alpaka/pltf/Traits.hpp>
+
+#    include <sstream>
+#    include <vector>
+
+namespace alpaka
+{
+    //#############################################################################
+    //! The OpenACC device platform; it implements ConceptPltf and cannot be default-constructed.
+    class PltfOacc : public concepts::Implements<ConceptPltf, PltfOacc>
+    {
+    public:
+        //-----------------------------------------------------------------------------
+        ALPAKA_FN_HOST PltfOacc() = delete;
+    };
+
+    namespace traits
+    {
+        //#############################################################################
+        //! The OpenACC platform device type trait specialization.
+        template<>
+        struct DevType<PltfOacc>
+        {
+            using type = DevOacc;
+        };
+
+        //#############################################################################
+        //! The OpenACC platform device count trait: counts the devices of the current OpenACC device type.
+        template<>
+        struct GetDevCount<PltfOacc>
+        {
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto getDevCount() -> std::size_t
+            {
+                ALPAKA_DEBUG_FULL_LOG_SCOPE;
+                auto const currentDevType = ::acc_get_device_type();
+                return static_cast<std::size_t>(::acc_get_num_devices(currentDevType));
+            }
+        };
+
+        //#############################################################################
+        //! Trait specialization that turns a device index into an OpenACC device handle.
+        template<>
+        struct GetDevByIdx<PltfOacc>
+        {
+            //-----------------------------------------------------------------------------
+            //! \param devIdx device index; std::runtime_error is thrown unless it is less than the device count
+            ALPAKA_FN_HOST static auto getDevByIdx(std::size_t devIdx) -> DevOacc
+            {
+                ALPAKA_DEBUG_FULL_LOG_SCOPE;
+
+                std::size_t const numDevs = getDevCount<PltfOacc>();
+                if(numDevs <= devIdx)
+                {
+                    std::stringstream msg;
+                    msg << "Unable to return device handle for OpenACC device with index " << devIdx;
+                    msg << " because there are only " << numDevs << " devices!";
+                    throw std::runtime_error(msg.str());
+                }
+                // The index passed the range check; the handle is brace-initialised from it narrowed to int.
+                return {static_cast<int>(devIdx)};
+            }
+        };
+    } // namespace traits
+} // namespace alpaka
+
+#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/pltf/PltfOmp5.hpp b/thirdParty/cupla/alpaka/include/alpaka/pltf/PltfOmp5.hpp
new file mode 100644
index 0000000000..f9963aac5f
--- /dev/null
+++ b/thirdParty/cupla/alpaka/include/alpaka/pltf/PltfOmp5.hpp
@@ -0,0 +1,96 @@
+/* Copyright 2019 Benjamin Worpitz
+ *
+ * This file is part of Alpaka.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#pragma once
+
+#ifdef ALPAKA_ACC_ANY_BT_OMP5_ENABLED
+
+#    if _OPENMP < 201307
+#        error If ALPAKA_ACC_ANY_BT_OMP5_ENABLED is set, the compiler has to support OpenMP 4.0 or higher!
+#    endif
+
+#    include <alpaka/core/Concepts.hpp>
+#    include <alpaka/dev/DevOmp5.hpp>
+#    include <alpaka/pltf/Traits.hpp>
+
+#    include <limits>
+#    include <sstream>
+#    include <vector>
+
+namespace alpaka
+{
+    //#############################################################################
+    //! The OpenMP 5 device platform; it implements ConceptPltf and cannot be default-constructed.
+    class PltfOmp5 : public concepts::Implements<ConceptPltf, PltfOmp5>
+    {
+    public:
+        //-----------------------------------------------------------------------------
+        ALPAKA_FN_HOST PltfOmp5() = delete;
+    };
+
+    namespace traits
+    {
+        //#############################################################################
+        //! The OpenMP 5 platform device type trait specialization.
+        template<>
+        struct DevType<PltfOmp5>
+        {
+            using type = DevOmp5;
+        };
+
+        //#############################################################################
+        //! The OpenMP 5 platform device count get trait specialization; never returns less than 1.
+        template<>
+        struct GetDevCount<PltfOmp5>
+        {
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto getDevCount() -> std::size_t
+            {
+                ALPAKA_DEBUG_FULL_LOG_SCOPE;
+
+                // The runtime reports zero devices if no target device is available or if offloading is disabled.
+                auto const numTargetDevs = static_cast<std::size_t>(::omp_get_num_devices());
+                return numTargetDevs == 0 ? std::size_t{1} : numTargetDevs;
+            }
+        };
+
+        //#############################################################################
+        //! The OpenMP 5 platform device get trait specialization.
+        template<>
+        struct GetDevByIdx<PltfOmp5>
+        {
+            //-----------------------------------------------------------------------------
+            //! \param devIdx device id, less than GetDevCount or equal, yielding omp_get_initial_device()
+            //! \throws std::runtime_error if devIdx exceeds the target device count (index 1 is allowed for count 0)
+            ALPAKA_FN_HOST static auto getDevByIdx(std::size_t devIdx) -> DevOmp5
+            {
+                ALPAKA_DEBUG_FULL_LOG_SCOPE;
+
+                std::size_t const devCount(static_cast<std::size_t>(::omp_get_num_devices()));
+                int devIdxOmp5 = static_cast<int>(devIdx);
+                // An index equal to the target device count, or index 1 when that count is 0, selects the initial device.
+                if(devIdx == devCount || (devCount == 0 && devIdx == 1 /* getDevCount */))
+                { // take this case to use the initial device
+                    devIdxOmp5 = ::omp_get_initial_device();
+                }
+                else if(devIdx > devCount)
+                {
+                    std::stringstream ssErr;
+                    ssErr << "Unable to return device handle for device " << devIdx << ". There are only " << devCount
+                          << " target devices and the initial device with index " << devCount;
+                    throw std::runtime_error(ssErr.str());
+                }
+
+                return {devIdxOmp5};
+            }
+        };
+    } // namespace traits
+} // namespace alpaka
+
+#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/pltf/PltfUniformCudaHipRt.hpp b/thirdParty/cupla/alpaka/include/alpaka/pltf/PltfUniformCudaHipRt.hpp
new file mode 100644
index 0000000000..a1ffb00118
--- /dev/null
+++ b/thirdParty/cupla/alpaka/include/alpaka/pltf/PltfUniformCudaHipRt.hpp
@@ -0,0 +1,293 @@
+/* Copyright 2019 Benjamin Worpitz, René Widera
+ *
+ * This file is part of alpaka.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#pragma once
+
+#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
+
+#    include <alpaka/core/BoostPredef.hpp>
+
+#    if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && !BOOST_LANG_CUDA
+#        error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
+#    endif
+
+#    if defined(ALPAKA_ACC_GPU_HIP_ENABLED) && !BOOST_LANG_HIP
+#        error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
+#    endif
+
+#    include <alpaka/dev/DevUniformCudaHipRt.hpp>
+#    include <alpaka/dev/Traits.hpp>
+
+// Backend specific includes.
+#    if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
+#        include <alpaka/core/Cuda.hpp>
+#    else
+#        include <alpaka/core/Hip.hpp>
+#    endif
+
+#    include <iostream>
+#    include <sstream>
+#    include <stdexcept>
+
+namespace alpaka
+{
+    //#############################################################################
+    //! The CUDA/HIP RT platform; it implements ConceptPltf and cannot be default-constructed.
+    class PltfUniformCudaHipRt : public concepts::Implements<ConceptPltf, PltfUniformCudaHipRt>
+    {
+    public:
+        //-----------------------------------------------------------------------------
+        ALPAKA_FN_HOST PltfUniformCudaHipRt() = delete;
+    };
+
+    namespace traits
+    {
+        //#############################################################################
+        //! The CUDA/HIP RT platform device type trait specialization: the device type is DevUniformCudaHipRt.
+        template<>
+        struct DevType<PltfUniformCudaHipRt>
+        {
+            using type = DevUniformCudaHipRt;
+        };
+
+        //#############################################################################
+        //! The CUDA/HIP RT platform device count get trait specialization.
+        template<>
+        struct GetDevCount<PltfUniformCudaHipRt>
+        {
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto getDevCount() -> std::size_t
+            {
+                ALPAKA_DEBUG_FULL_LOG_SCOPE;
+
+                // A failing device count query is reported as "no devices" instead of as an error.
+                int deviceCount(0);
+                if(ALPAKA_API_PREFIX(GetDeviceCount)(&deviceCount) != ALPAKA_API_PREFIX(Success))
+                    deviceCount = 0;
+
+                return static_cast<std::size_t>(deviceCount);
+            }
+        };
+
+        //#############################################################################
+        //! The CUDA/HIP RT platform device get trait specialization.
+        template<>
+        struct GetDevByIdx<PltfUniformCudaHipRt>
+        {
+            //-----------------------------------------------------------------------------
+            //! \param devIdx The index of the requested device; it has to be smaller than the device count.
+            //! \return The device handle; throws std::runtime_error for an out-of-range or unusable device.
+            ALPAKA_FN_HOST static auto getDevByIdx(std::size_t const& devIdx) -> DevUniformCudaHipRt
+            {
+                ALPAKA_DEBUG_FULL_LOG_SCOPE;
+
+                DevUniformCudaHipRt dev;
+
+                std::size_t const devCount(getDevCount<PltfUniformCudaHipRt>());
+                if(devIdx >= devCount)
+                {
+                    std::stringstream ssErr;
+                    ssErr << "Unable to return device handle for device " << devIdx << ". There are only " << devCount
+                          << " devices!";
+                    throw std::runtime_error(ssErr.str());
+                }
+
+                if(!isDevUsable(devIdx))
+                {
+                    std::stringstream ssErr;
+                    ssErr << "Unable to return device handle for device " << devIdx << ". It is not accessible!";
+                    throw std::runtime_error(ssErr.str());
+                }
+
+                dev.m_iDevice = static_cast<int>(devIdx);
+
+                // Log this device.
+#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
+#        if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
+                cudaDeviceProp devProp;
+#        else
+                hipDeviceProp_t devProp;
+#        endif
+                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(ALPAKA_API_PREFIX(GetDeviceProperties)(&devProp, dev.m_iDevice));
+#    endif
+#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
+                printDeviceProperties(devProp);
+#    elif ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
+                std::cout << __func__ << ": " << devProp.name << std::endl;
+#    endif
+
+                return dev;
+            }
+
+        private:
+            //-----------------------------------------------------------------------------
+            //! \return Whether the device could be selected and a dummy queue could be created on it.
+            ALPAKA_FN_HOST static auto isDevUsable(std::size_t iDevice) -> bool
+            {
+#    if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
+                cudaError rc(cudaSetDevice(static_cast<int>(iDevice)));
+#    else
+                hipError_t rc(hipSetDevice(static_cast<int>(iDevice)));
+#    endif
+
+                ALPAKA_API_PREFIX(Stream_t) queue = {};
+                // Create a dummy queue to check if the device is already used by another process.
+                // cuda/hip-SetDevice never returns an error if another process already uses the selected device and
+                // gpu compute mode is set "process exclusive". \TODO: Check if this workaround is needed!
+                if(rc == ALPAKA_API_PREFIX(Success))
+                {
+                    rc = ALPAKA_API_PREFIX(StreamCreate)(&queue);
+                }
+
+                if(rc == ALPAKA_API_PREFIX(Success))
+                {
+                    // Destroy the dummy queue.
+                    ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(ALPAKA_API_PREFIX(StreamDestroy)(queue));
+                    return true;
+                }
+                else
+                {
+                    // Report the error of the failed SetDevice or StreamCreate call.
+                    ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(rc);
+                    // Reset the error state. NOTE(review): unreachable if the check above throws - confirm.
+                    ALPAKA_API_PREFIX(GetLastError)();
+                    return false;
+                }
+            }
+
+#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
+            //-----------------------------------------------------------------------------
+            //! Prints the device properties to std::cout: the common ones first, then the CUDA- or HIP-only ones.
+            ALPAKA_FN_HOST static auto printDeviceProperties(
+#        if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
+                cudaDeviceProp const& devProp
+#        else
+                hipDeviceProp_t const& devProp
+#        endif
+                ) -> void
+            {
+                ALPAKA_DEBUG_FULL_LOG_SCOPE;
+
+                std::size_t const kiB(1024); // divisor used for the values printed in KiB
+                std::size_t const miB(kiB * kiB); // divisor used for the values printed in MiB
+                std::cout << "name: " << devProp.name << std::endl;
+                std::cout << "totalGlobalMem: " << devProp.totalGlobalMem / miB << " MiB" << std::endl;
+                std::cout << "sharedMemPerBlock: " << devProp.sharedMemPerBlock / kiB << " KiB" << std::endl;
+                std::cout << "regsPerBlock: " << devProp.regsPerBlock << std::endl;
+                std::cout << "warpSize: " << devProp.warpSize << std::endl;
+                std::cout << "maxThreadsPerBlock: " << devProp.maxThreadsPerBlock << std::endl;
+                std::cout << "maxThreadsDim[3]: (" << devProp.maxThreadsDim[0] << ", " << devProp.maxThreadsDim[1]
+                          << ", " << devProp.maxThreadsDim[2] << ")" << std::endl;
+                std::cout << "maxGridSize[3]: (" << devProp.maxGridSize[0] << ", " << devProp.maxGridSize[1] << ", "
+                          << devProp.maxGridSize[2] << ")" << std::endl;
+                std::cout << "clockRate: " << devProp.clockRate << " kHz" << std::endl;
+                std::cout << "totalConstMem: " << devProp.totalConstMem / kiB << " KiB" << std::endl;
+                std::cout << "major: " << devProp.major << std::endl;
+                std::cout << "minor: " << devProp.minor << std::endl;
+
+                // std::cout << "deviceOverlap: " << devProp.deviceOverlap << std::endl;    // Deprecated
+                std::cout << "multiProcessorCount: " << devProp.multiProcessorCount << std::endl;
+                std::cout << "integrated: " << devProp.integrated << std::endl;
+                std::cout << "canMapHostMemory: " << devProp.canMapHostMemory << std::endl;
+                std::cout << "computeMode: " << devProp.computeMode << std::endl;
+                std::cout << "concurrentKernels: " << devProp.concurrentKernels << std::endl;
+                std::cout << "pciBusID: " << devProp.pciBusID << std::endl;
+                std::cout << "pciDeviceID: " << devProp.pciDeviceID << std::endl;
+                std::cout << "pciDomainID: " << devProp.pciDomainID << std::endl;
+                std::cout << "memoryClockRate: " << devProp.memoryClockRate << " kHz" << std::endl;
+                std::cout << "memoryBusWidth: " << devProp.memoryBusWidth << " b" << std::endl;
+                std::cout << "l2CacheSize: " << devProp.l2CacheSize << " B" << std::endl;
+                std::cout << "maxThreadsPerMultiProcessor: " << devProp.maxThreadsPerMultiProcessor << std::endl;
+                std::cout << "isMultiGpuBoard: " << devProp.isMultiGpuBoard << std::endl;
+#        if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
+                std::cout << "memPitch: " << devProp.memPitch << " B" << std::endl;
+                std::cout << "textureAlignment: " << devProp.textureAlignment << std::endl;
+                std::cout << "texturePitchAlignment: " << devProp.texturePitchAlignment << std::endl;
+                std::cout << "kernelExecTimeoutEnabled: " << devProp.kernelExecTimeoutEnabled << std::endl;
+                std::cout << "unifiedAddressing: " << devProp.unifiedAddressing << std::endl;
+                std::cout << "multiGpuBoardGroupID: " << devProp.multiGpuBoardGroupID << std::endl;
+                std::cout << "singleToDoublePrecisionPerfRatio: " << devProp.singleToDoublePrecisionPerfRatio
+                          << std::endl;
+                std::cout << "pageableMemoryAccess: " << devProp.pageableMemoryAccess << std::endl;
+                std::cout << "concurrentManagedAccess: " << devProp.concurrentManagedAccess << std::endl;
+                std::cout << "computePreemptionSupported: " << devProp.computePreemptionSupported << std::endl;
+                std::cout << "canUseHostPointerForRegisteredMem: " << devProp.canUseHostPointerForRegisteredMem
+                          << std::endl;
+                std::cout << "cooperativeLaunch: " << devProp.cooperativeLaunch << std::endl;
+                std::cout << "cooperativeMultiDeviceLaunch: " << devProp.cooperativeMultiDeviceLaunch << std::endl;
+                std::cout << "maxTexture1D: " << devProp.maxTexture1D << std::endl;
+                std::cout << "maxTexture1DLinear: " << devProp.maxTexture1DLinear << std::endl;
+                std::cout << "maxTexture2D[2]: " << devProp.maxTexture2D[0] << "x" << devProp.maxTexture2D[1]
+                          << std::endl;
+                std::cout << "maxTexture2DLinear[3]: " << devProp.maxTexture2DLinear[0] << "x"
+                          << devProp.maxTexture2DLinear[1] << "x" << devProp.maxTexture2DLinear[2] << std::endl;
+                std::cout << "maxTexture2DGather[2]: " << devProp.maxTexture2DGather[0] << "x"
+                          << devProp.maxTexture2DGather[1] << std::endl;
+                std::cout << "maxTexture3D[3]: " << devProp.maxTexture3D[0] << "x" << devProp.maxTexture3D[1] << "x"
+                          << devProp.maxTexture3D[2] << std::endl;
+                std::cout << "maxTextureCubemap: " << devProp.maxTextureCubemap << std::endl;
+                std::cout << "maxTexture1DLayered[2]: " << devProp.maxTexture1DLayered[0] << "x"
+                          << devProp.maxTexture1DLayered[1] << std::endl;
+                std::cout << "maxTexture2DLayered[3]: " << devProp.maxTexture2DLayered[0] << "x"
+                          << devProp.maxTexture2DLayered[1] << "x" << devProp.maxTexture2DLayered[2] << std::endl;
+                std::cout << "maxTextureCubemapLayered[2]: " << devProp.maxTextureCubemapLayered[0] << "x"
+                          << devProp.maxTextureCubemapLayered[1] << std::endl;
+                std::cout << "maxSurface1D: " << devProp.maxSurface1D << std::endl;
+                std::cout << "maxSurface2D[2]: " << devProp.maxSurface2D[0] << "x" << devProp.maxSurface2D[1]
+                          << std::endl;
+                std::cout << "maxSurface3D[3]: " << devProp.maxSurface3D[0] << "x" << devProp.maxSurface3D[1] << "x"
+                          << devProp.maxSurface3D[2] << std::endl;
+                std::cout << "maxSurface1DLayered[2]: " << devProp.maxSurface1DLayered[0] << "x"
+                          << devProp.maxSurface1DLayered[1] << std::endl;
+                std::cout << "maxSurface2DLayered[3]: " << devProp.maxSurface2DLayered[0] << "x"
+                          << devProp.maxSurface2DLayered[1] << "x" << devProp.maxSurface2DLayered[2] << std::endl;
+                std::cout << "maxSurfaceCubemap: " << devProp.maxSurfaceCubemap << std::endl;
+                std::cout << "maxSurfaceCubemapLayered[2]: " << devProp.maxSurfaceCubemapLayered[0] << "x"
+                          << devProp.maxSurfaceCubemapLayered[1] << std::endl;
+                std::cout << "surfaceAlignment: " << devProp.surfaceAlignment << std::endl;
+                std::cout << "ECCEnabled: " << devProp.ECCEnabled << std::endl;
+                std::cout << "tccDriver: " << devProp.tccDriver << std::endl;
+                std::cout << "asyncEngineCount: " << devProp.asyncEngineCount << std::endl;
+                std::cout << "streamPrioritiesSupported: " << devProp.streamPrioritiesSupported << std::endl;
+                std::cout << "globalL1CacheSupported: " << devProp.globalL1CacheSupported << std::endl;
+                std::cout << "localL1CacheSupported: " << devProp.localL1CacheSupported << std::endl;
+                std::cout << "sharedMemPerMultiprocessor: " << devProp.sharedMemPerMultiprocessor << std::endl;
+                std::cout << "regsPerMultiprocessor: " << devProp.regsPerMultiprocessor << std::endl;
+                std::cout << "managedMemory: " << devProp.managedMemory << std::endl;
+#        else
+                std::cout << "clockInstructionRate: " << devProp.clockInstructionRate << " kHz" << std::endl;
+                std::cout << "maxSharedMemoryPerMultiProcessor: " << devProp.maxSharedMemoryPerMultiProcessor / kiB
+                          << " KiB" << std::endl;
+                std::cout << "gcnArch: " << devProp.gcnArch << std::endl;
+                std::cout << "arch: " << std::endl;
+                std::cout << "    hasGlobalInt32Atomics: " << devProp.arch.hasGlobalInt32Atomics << std::endl;
+                std::cout << "    hasGlobalFloatAtomicExch: " << devProp.arch.hasGlobalFloatAtomicExch << std::endl;
+                std::cout << "    hasSharedInt32Atomics: " << devProp.arch.hasSharedInt32Atomics << std::endl;
+                std::cout << "    hasSharedFloatAtomicExch: " << devProp.arch.hasSharedFloatAtomicExch << std::endl;
+                std::cout << "    hasFloatAtomicAdd: " << devProp.arch.hasFloatAtomicAdd << std::endl;
+                std::cout << "    hasGlobalInt64Atomics: " << devProp.arch.hasGlobalInt64Atomics << std::endl;
+                std::cout << "    hasSharedInt64Atomics: " << devProp.arch.hasSharedInt64Atomics << std::endl;
+                std::cout << "    hasDoubles: " << devProp.arch.hasDoubles << std::endl;
+                std::cout << "    hasWarpVote: " << devProp.arch.hasWarpVote << std::endl;
+                std::cout << "    hasWarpBallot: " << devProp.arch.hasWarpBallot << std::endl;
+                std::cout << "    hasWarpShuffle: " << devProp.arch.hasWarpShuffle << std::endl;
+                std::cout << "    hasFunnelShift: " << devProp.arch.hasFunnelShift << std::endl;
+                std::cout << "    hasThreadFenceSystem: " << devProp.arch.hasThreadFenceSystem << std::endl;
+                std::cout << "    hasSyncThreadsExt: " << devProp.arch.hasSyncThreadsExt << std::endl;
+                std::cout << "    hasSurfaceFuncs: " << devProp.arch.hasSurfaceFuncs << std::endl;
+                std::cout << "    has3dGrid: " << devProp.arch.has3dGrid << std::endl;
+                std::cout << "    hasDynamicParallelism: " << devProp.arch.hasDynamicParallelism << std::endl;
+#        endif
+            }
+#    endif
+        };
+    } // namespace traits
+} // namespace alpaka
+
+#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/pltf/Traits.hpp b/thirdParty/cupla/alpaka/include/alpaka/pltf/Traits.hpp
index 3c8a8a8207..254d996605 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/pltf/Traits.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/pltf/Traits.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Benjamin Worpitz
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -10,124 +10,88 @@
 #pragma once
 
 #include <alpaka/core/Common.hpp>
-#include <alpaka/dev/Traits.hpp>
-
 #include <alpaka/core/Concepts.hpp>
-#include <alpaka/queue/Traits.hpp>
 #include <alpaka/dev/Traits.hpp>
+#include <alpaka/queue/Traits.hpp>
 
-#include <boost/config.hpp>
-
-#include <vector>
 #include <type_traits>
+#include <vector>
 
 namespace alpaka
 {
+    struct ConceptPltf
+    {
+    };
+
     //-----------------------------------------------------------------------------
-    //! The platform specifics.
-    namespace pltf
+    //! The platform traits.
+    namespace traits
     {
-        struct ConceptPltf;
+        //#############################################################################
+        //! The platform type trait.
+        template<typename T, typename TSfinae = void>
+        struct PltfType;
 
-        //-----------------------------------------------------------------------------
-        //! The platform traits.
-        namespace traits
+        template<typename TPltf>
+        struct PltfType<TPltf, typename std::enable_if<concepts::ImplementsConcept<ConceptPltf, TPltf>::value>::type>
         {
-            //#############################################################################
-            //! The platform type trait.
-            template<
-                typename T,
-                typename TSfinae = void>
-            struct PltfType;
+            using type = typename concepts::ImplementationBase<ConceptPltf, TPltf>;
+        };
 
-            //#############################################################################
-            //! The device count get trait.
-            template<
-                typename T,
-                typename TSfinae = void>
-            struct GetDevCount;
-
-            //#############################################################################
-            //! The device get trait.
-            template<
-                typename T,
-                typename TSfinae = void>
-            struct GetDevByIdx;
-        }
+        //#############################################################################
+        //! The device count get trait.
+        template<typename T, typename TSfinae = void>
+        struct GetDevCount;
 
         //#############################################################################
-        //! The platform type trait alias template to remove the ::type.
-        template<
-            typename T>
-        using Pltf = typename traits::PltfType<T>::type;
+        //! The device get trait.
+        template<typename T, typename TSfinae = void>
+        struct GetDevByIdx;
+    } // namespace traits
 
-        //-----------------------------------------------------------------------------
-        //! \return The device identified by its index.
-        template<
-            typename TPltf>
-        ALPAKA_FN_HOST auto getDevCount()
-#ifdef BOOST_NO_CXX14_RETURN_TYPE_DEDUCTION
-        -> decltype(traits::GetDevCount<TPltf>::getDevCount())
-#endif
-        {
-            return
-                traits::GetDevCount<
-                    TPltf>
-                ::getDevCount();
-        }
+    //#############################################################################
+    //! The platform type trait alias template to remove the ::type.
+    template<typename T>
+    using Pltf = typename traits::PltfType<T>::type;
 
-        //-----------------------------------------------------------------------------
-        //! \return The device identified by its index.
-        template<
-            typename TPltf>
-        ALPAKA_FN_HOST auto getDevByIdx(
-            std::size_t const & devIdx)
-#ifdef BOOST_NO_CXX14_RETURN_TYPE_DEDUCTION
-        -> decltype(traits::GetDevByIdx<TPltf>::getDevByIdx(devIdx))
-#endif
-        {
-            return
-                traits::GetDevByIdx<
-                    TPltf>
-                ::getDevByIdx(
-                    devIdx);
-        }
+    //-----------------------------------------------------------------------------
+    //! \return The number of devices reported by the GetDevCount trait of the platform.
+    template<typename TPltf>
+    ALPAKA_FN_HOST auto getDevCount()
+    {
+        return traits::GetDevCount<Pltf<TPltf>>::getDevCount();
+    }
 
-        //-----------------------------------------------------------------------------
-        //! \return All the devices available on this accelerator.
-        template<
-            typename TPltf>
-        ALPAKA_FN_HOST auto getDevs()
-        -> std::vector<dev::Dev<TPltf>>
-        {
-            std::vector<dev::Dev<TPltf>> devs;
+    //-----------------------------------------------------------------------------
+    //! \return The device identified by its index.
+    template<typename TPltf>
+    ALPAKA_FN_HOST auto getDevByIdx(std::size_t const& devIdx)
+    {
+        return traits::GetDevByIdx<Pltf<TPltf>>::getDevByIdx(devIdx);
+    }
 
-            std::size_t const devCount(getDevCount<TPltf>());
-            for(std::size_t devIdx(0); devIdx < devCount; ++devIdx)
-            {
-                devs.push_back(getDevByIdx<TPltf>(devIdx));
-            }
+    //-----------------------------------------------------------------------------
+    //! \return All the devices available on this accelerator.
+    template<typename TPltf>
+    ALPAKA_FN_HOST auto getDevs() -> std::vector<Dev<Pltf<TPltf>>>
+    {
+        std::vector<Dev<Pltf<TPltf>>> devs;
 
-            return devs;
+        std::size_t const devCount(getDevCount<Pltf<TPltf>>());
+        for(std::size_t devIdx(0); devIdx < devCount; ++devIdx)
+        {
+            devs.push_back(getDevByIdx<Pltf<TPltf>>(devIdx));
         }
+
+        return devs;
     }
-    namespace queue
+
+    namespace traits
     {
-        namespace traits
+        template<typename TPltf, typename TProperty>
+        struct QueueType<TPltf, TProperty, std::enable_if_t<concepts::ImplementsConcept<ConceptPltf, TPltf>::value>>
         {
-            template<
-                typename TPltf,
-                typename TProperty>
-            struct QueueType<
-                TPltf,
-                TProperty,
-                typename std::enable_if<concepts::ImplementsConcept<pltf::ConceptPltf, TPltf>::value>::type
-            >
-            {
-                using type = typename QueueType<
-                    typename dev::traits::DevType<TPltf>::type,
-                    TProperty>::type;
-            };
-        }
-    }
-}
+            using type = typename QueueType<typename alpaka::traits::DevType<TPltf>::type, TProperty>::type;
+        };
+    } // namespace traits
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/include/alpaka/queue/Properties.hpp b/thirdParty/cupla/alpaka/include/alpaka/queue/Properties.hpp
index 67da1cac30..4de139e6da 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/queue/Properties.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/queue/Properties.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Rene Widera
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -11,21 +11,17 @@
 
 namespace alpaka
 {
-    namespace queue
+    //-----------------------------------------------------------------------------
+    //! Properties to define queue behavior
+    namespace property
     {
-        //-----------------------------------------------------------------------------
-        //! Properties to define queue behavior
-        namespace property
-        {
-            //#############################################################################
-            //! The caller is waiting until the enqueued task is finished
-            struct Blocking{};
+        //#############################################################################
+        //! The caller is waiting until the enqueued task is finished
+        struct Blocking;
 
-            //#############################################################################
-            //! The caller is NOT waiting until the enqueued task is finished
-            struct NonBlocking{};
-        }
-
-        using namespace property;
-    }
-}
+        //#############################################################################
+        //! The caller is NOT waiting until the enqueued task is finished
+        struct NonBlocking;
+    } // namespace property
+    using namespace property;
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/include/alpaka/queue/QueueCpuBlocking.hpp b/thirdParty/cupla/alpaka/include/alpaka/queue/QueueCpuBlocking.hpp
index fe02591d7f..44b61a4881 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/queue/QueueCpuBlocking.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/queue/QueueCpuBlocking.hpp
@@ -1,6 +1,6 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Matthias Werner
+/* Copyright 2020 Jeffrey Kelling
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -9,232 +9,10 @@
 
 #pragma once
 
-#include <alpaka/core/Unused.hpp>
-#include <alpaka/dev/DevCpu.hpp>
-#include <alpaka/queue/cpu/ICpuQueue.hpp>
-
-#include <alpaka/dev/Traits.hpp>
-#include <alpaka/event/Traits.hpp>
-#include <alpaka/queue/Traits.hpp>
-#include <alpaka/wait/Traits.hpp>
-
-#include <atomic>
-#include <mutex>
+#include <alpaka/event/EventCpu.hpp>
+#include <alpaka/queue/QueueGenericThreadsBlocking.hpp>
 
 namespace alpaka
 {
-    namespace event
-    {
-        class EventCpu;
-    }
+    using QueueCpuBlocking = QueueGenericThreadsBlocking<DevCpu>;
 }
-
-namespace alpaka
-{
-    namespace queue
-    {
-        namespace cpu
-        {
-            namespace detail
-            {
-#if BOOST_COMP_CLANG
-    // avoid diagnostic warning: "has no out-of-line virtual method definitions; its vtable will be emitted in every translation unit [-Werror,-Wweak-vtables]"
-    // https://stackoverflow.com/a/29288300
-    #pragma clang diagnostic push
-    #pragma clang diagnostic ignored "-Wweak-vtables"
-#endif
-                //#############################################################################
-                //! The CPU device queue implementation.
-                class QueueCpuBlockingImpl final : public cpu::ICpuQueue
-#if BOOST_COMP_CLANG
-    #pragma clang diagnostic pop
-#endif
-                {
-                public:
-                    //-----------------------------------------------------------------------------
-                    QueueCpuBlockingImpl(
-                        dev::DevCpu const & dev) noexcept :
-                            m_dev(dev),
-                            m_bCurrentlyExecutingTask(false)
-                    {}
-                    //-----------------------------------------------------------------------------
-                    QueueCpuBlockingImpl(QueueCpuBlockingImpl const &) = delete;
-                    //-----------------------------------------------------------------------------
-                    QueueCpuBlockingImpl(QueueCpuBlockingImpl &&) = delete;
-                    //-----------------------------------------------------------------------------
-                    auto operator=(QueueCpuBlockingImpl const &) -> QueueCpuBlockingImpl & = delete;
-                    //-----------------------------------------------------------------------------
-                    auto operator=(QueueCpuBlockingImpl &&) -> QueueCpuBlockingImpl & = delete;
-
-                    //-----------------------------------------------------------------------------
-                    void enqueue(event::EventCpu & ev) final
-                    {
-                        queue::enqueue(*this, ev);
-                    }
-
-                    //-----------------------------------------------------------------------------
-                    void wait(event::EventCpu const & ev) final
-                    {
-                        wait::wait(*this, ev);
-                    }
-
-                public:
-                    dev::DevCpu const m_dev;            //!< The device this queue is bound to.
-                    std::mutex mutable m_mutex;
-                    std::atomic<bool> m_bCurrentlyExecutingTask;
-                };
-            }
-        }
-
-        //#############################################################################
-        //! The CPU device queue.
-        class QueueCpuBlocking final : public concepts::Implements<wait::ConceptCurrentThreadWaitFor, QueueCpuBlocking>
-        {
-        public:
-            //-----------------------------------------------------------------------------
-            QueueCpuBlocking(
-                dev::DevCpu const & dev) :
-                    m_spQueueImpl(std::make_shared<cpu::detail::QueueCpuBlockingImpl>(dev))
-            {
-                dev.m_spDevCpuImpl->RegisterQueue(m_spQueueImpl);
-            }
-            //-----------------------------------------------------------------------------
-            QueueCpuBlocking(QueueCpuBlocking const &) = default;
-            //-----------------------------------------------------------------------------
-            QueueCpuBlocking(QueueCpuBlocking &&) = default;
-            //-----------------------------------------------------------------------------
-            auto operator=(QueueCpuBlocking const &) -> QueueCpuBlocking & = default;
-            //-----------------------------------------------------------------------------
-            auto operator=(QueueCpuBlocking &&) -> QueueCpuBlocking & = default;
-            //-----------------------------------------------------------------------------
-            auto operator==(QueueCpuBlocking const & rhs) const
-            -> bool
-            {
-                return (m_spQueueImpl == rhs.m_spQueueImpl);
-            }
-            //-----------------------------------------------------------------------------
-            auto operator!=(QueueCpuBlocking const & rhs) const
-            -> bool
-            {
-                return !((*this) == rhs);
-            }
-            //-----------------------------------------------------------------------------
-            ~QueueCpuBlocking() = default;
-
-        public:
-            std::shared_ptr<cpu::detail::QueueCpuBlockingImpl> m_spQueueImpl;
-        };
-    }
-
-    namespace dev
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CPU blocking device queue device type trait specialization.
-            template<>
-            struct DevType<
-                queue::QueueCpuBlocking>
-            {
-                using type = dev::DevCpu;
-            };
-            //#############################################################################
-            //! The CPU blocking device queue device get trait specialization.
-            template<>
-            struct GetDev<
-                queue::QueueCpuBlocking>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto getDev(
-                    queue::QueueCpuBlocking const & queue)
-                -> dev::DevCpu
-                {
-                    return queue.m_spQueueImpl->m_dev;
-                }
-            };
-        }
-    }
-    namespace event
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CPU blocking device queue event type trait specialization.
-            template<>
-            struct EventType<
-                queue::QueueCpuBlocking>
-            {
-                using type = event::EventCpu;
-            };
-        }
-    }
-    namespace queue
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CPU blocking device queue enqueue trait specialization.
-            //! This default implementation for all tasks directly invokes the function call operator of the task.
-            template<
-                typename TTask>
-            struct Enqueue<
-                queue::QueueCpuBlocking,
-                TTask>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto enqueue(
-                    queue::QueueCpuBlocking & queue,
-                    TTask const & task)
-                -> void
-                {
-                    std::lock_guard<std::mutex> lk(queue.m_spQueueImpl->m_mutex);
-
-                    queue.m_spQueueImpl->m_bCurrentlyExecutingTask = true;
-
-                    task();
-
-                    queue.m_spQueueImpl->m_bCurrentlyExecutingTask = false;
-                }
-            };
-            //#############################################################################
-            //! The CPU blocking device queue test trait specialization.
-            template<>
-            struct Empty<
-                queue::QueueCpuBlocking>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto empty(
-                    queue::QueueCpuBlocking const & queue)
-                -> bool
-                {
-                    return !queue.m_spQueueImpl->m_bCurrentlyExecutingTask;
-                }
-            };
-        }
-    }
-
-    namespace wait
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CPU blocking device queue thread wait trait specialization.
-            //!
-            //! Blocks execution of the calling thread until the queue has finished processing all previously requested tasks (kernels, data copies, ...)
-            template<>
-            struct CurrentThreadWaitFor<
-                queue::QueueCpuBlocking>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto currentThreadWaitFor(
-                    queue::QueueCpuBlocking const & queue)
-                -> void
-                {
-                    std::lock_guard<std::mutex> lk(queue.m_spQueueImpl->m_mutex);
-                }
-            };
-        }
-    }
-}
-
-#include <alpaka/event/EventCpu.hpp>
diff --git a/thirdParty/cupla/alpaka/include/alpaka/queue/QueueCpuNonBlocking.hpp b/thirdParty/cupla/alpaka/include/alpaka/queue/QueueCpuNonBlocking.hpp
index 94030569d1..1cba1f86c3 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/queue/QueueCpuNonBlocking.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/queue/QueueCpuNonBlocking.hpp
@@ -1,6 +1,6 @@
-/* Copyright 2019 Benjamin Worpitz, Matthias Werner
+/* Copyright 2020 Jeffrey Kelling
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -9,226 +9,10 @@
 
 #pragma once
 
-#include <alpaka/dev/DevCpu.hpp>
-#include <alpaka/queue/cpu/ICpuQueue.hpp>
-
-#include <alpaka/dev/Traits.hpp>
-#include <alpaka/event/Traits.hpp>
-#include <alpaka/queue/Traits.hpp>
-#include <alpaka/wait/Traits.hpp>
-
-#include <alpaka/core/ConcurrentExecPool.hpp>
-
-#include <type_traits>
-#include <thread>
-#include <mutex>
-#include <future>
-
-namespace alpaka
-{
-    namespace event
-    {
-        class EventCpu;
-    }
-}
+#include <alpaka/event/EventCpu.hpp>
+#include <alpaka/queue/QueueGenericThreadsNonBlocking.hpp>
 
 namespace alpaka
 {
-    namespace queue
-    {
-        namespace cpu
-        {
-            namespace detail
-            {
-#if BOOST_COMP_CLANG
-    // avoid diagnostic warning: "has no out-of-line virtual method definitions; its vtable will be emitted in every translation unit [-Werror,-Wweak-vtables]"
-    // https://stackoverflow.com/a/29288300
-    #pragma clang diagnostic push
-    #pragma clang diagnostic ignored "-Wweak-vtables"
-#endif
-                //#############################################################################
-                //! The CPU device queue implementation.
-                class QueueCpuNonBlockingImpl final : public cpu::ICpuQueue
-#if BOOST_COMP_CLANG
-    #pragma clang diagnostic pop
-#endif
-                {
-                private:
-                    //#############################################################################
-                    using ThreadPool = alpaka::core::detail::ConcurrentExecPool<
-                        std::size_t,
-                        std::thread,                // The concurrent execution type.
-                        std::promise,               // The promise type.
-                        void,                       // The type yielding the current concurrent execution.
-                        std::mutex,                 // The mutex type to use. Only required if TisYielding is true.
-                        std::condition_variable,    // The condition variable type to use. Only required if TisYielding is true.
-                        false>;                     // If the threads should yield.
-
-                public:
-                    //-----------------------------------------------------------------------------
-                    QueueCpuNonBlockingImpl(
-                        dev::DevCpu const & dev) :
-                            m_dev(dev),
-                            m_workerThread(1u)
-                    {}
-                    //-----------------------------------------------------------------------------
-                    QueueCpuNonBlockingImpl(QueueCpuNonBlockingImpl const &) = delete;
-                    //-----------------------------------------------------------------------------
-                    QueueCpuNonBlockingImpl(QueueCpuNonBlockingImpl &&) = delete;
-                    //-----------------------------------------------------------------------------
-                    auto operator=(QueueCpuNonBlockingImpl const &) -> QueueCpuNonBlockingImpl & = delete;
-                    //-----------------------------------------------------------------------------
-                    auto operator=(QueueCpuNonBlockingImpl &&) -> QueueCpuNonBlockingImpl & = delete;
-
-                    //-----------------------------------------------------------------------------
-                    void enqueue(event::EventCpu & ev) final
-                    {
-                        queue::enqueue(*this, ev);
-                    }
-
-                    //-----------------------------------------------------------------------------
-                    void wait(event::EventCpu const & ev) final
-                    {
-                        wait::wait(*this, ev);
-                    }
-
-                public:
-                    dev::DevCpu const m_dev;            //!< The device this queue is bound to.
-
-                    ThreadPool m_workerThread;
-                };
-            }
-        }
-
-        //#############################################################################
-        //! The CPU device queue.
-        class QueueCpuNonBlocking final : public concepts::Implements<wait::ConceptCurrentThreadWaitFor, QueueCpuNonBlocking>
-        {
-        public:
-            //-----------------------------------------------------------------------------
-            QueueCpuNonBlocking(
-                dev::DevCpu const & dev) :
-                    m_spQueueImpl(std::make_shared<cpu::detail::QueueCpuNonBlockingImpl>(dev))
-            {
-                dev.m_spDevCpuImpl->RegisterQueue(m_spQueueImpl);
-            }
-            //-----------------------------------------------------------------------------
-            QueueCpuNonBlocking(QueueCpuNonBlocking const &) = default;
-            //-----------------------------------------------------------------------------
-            QueueCpuNonBlocking(QueueCpuNonBlocking &&) = default;
-            //-----------------------------------------------------------------------------
-            auto operator=(QueueCpuNonBlocking const &) -> QueueCpuNonBlocking & = default;
-            //-----------------------------------------------------------------------------
-            auto operator=(QueueCpuNonBlocking &&) -> QueueCpuNonBlocking & = default;
-            //-----------------------------------------------------------------------------
-            auto operator==(QueueCpuNonBlocking const & rhs) const
-            -> bool
-            {
-                return (m_spQueueImpl == rhs.m_spQueueImpl);
-            }
-            //-----------------------------------------------------------------------------
-            auto operator!=(QueueCpuNonBlocking const & rhs) const
-            -> bool
-            {
-                return !((*this) == rhs);
-            }
-            //-----------------------------------------------------------------------------
-            ~QueueCpuNonBlocking() = default;
-
-        public:
-            std::shared_ptr<cpu::detail::QueueCpuNonBlockingImpl> m_spQueueImpl;
-        };
-    }
-
-    namespace dev
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CPU non-blocking device queue device type trait specialization.
-            template<>
-            struct DevType<
-                queue::QueueCpuNonBlocking>
-            {
-                using type = dev::DevCpu;
-            };
-            //#############################################################################
-            //! The CPU non-blocking device queue device get trait specialization.
-            template<>
-            struct GetDev<
-                queue::QueueCpuNonBlocking>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto getDev(
-                    queue::QueueCpuNonBlocking const & queue)
-                -> dev::DevCpu
-                {
-                    return queue.m_spQueueImpl->m_dev;
-                }
-            };
-        }
-    }
-    namespace event
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CPU non-blocking device queue event type trait specialization.
-            template<>
-            struct EventType<
-                queue::QueueCpuNonBlocking>
-            {
-                using type = event::EventCpu;
-            };
-        }
-    }
-    namespace queue
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CPU non-blocking device queue enqueue trait specialization.
-            //! This default implementation for all tasks directly invokes the function call operator of the task.
-            template<
-                typename TTask>
-            struct Enqueue<
-                queue::QueueCpuNonBlocking,
-                TTask>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto enqueue(
-#if !(BOOST_COMP_CLANG_CUDA && BOOST_ARCH_PTX)
-                    queue::QueueCpuNonBlocking & queue,
-                    TTask const & task)
-#else
-                    queue::QueueCpuNonBlocking &,
-                    TTask const &)
-#endif
-                -> void
-                {
-// Workaround: Clang can not support this when natively compiling device code. See ConcurrentExecPool.hpp.
-#if !(BOOST_COMP_CLANG_CUDA && BOOST_ARCH_PTX)
-                    queue.m_spQueueImpl->m_workerThread.enqueueTask(
-                        task);
-#endif
-                }
-            };
-            //#############################################################################
-            //! The CPU non-blocking device queue test trait specialization.
-            template<>
-            struct Empty<
-                queue::QueueCpuNonBlocking>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto empty(
-                    queue::QueueCpuNonBlocking const & queue)
-                -> bool
-                {
-                    return queue.m_spQueueImpl->m_workerThread.isIdle();
-                }
-            };
-        }
-    }
+    using QueueCpuNonBlocking = QueueGenericThreadsNonBlocking<DevCpu>;
 }
-
-#include <alpaka/event/EventCpu.hpp>
diff --git a/thirdParty/cupla/alpaka/include/alpaka/queue/QueueCudaRtBlocking.hpp b/thirdParty/cupla/alpaka/include/alpaka/queue/QueueCudaRtBlocking.hpp
deleted file mode 100644
index bd9640bc40..0000000000
--- a/thirdParty/cupla/alpaka/include/alpaka/queue/QueueCudaRtBlocking.hpp
+++ /dev/null
@@ -1,345 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz, René Widera
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_CUDA
-    #error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
-#endif
-
-#include <alpaka/dev/DevCudaRt.hpp>
-
-#include <alpaka/dev/Traits.hpp>
-#include <alpaka/event/Traits.hpp>
-#include <alpaka/queue/Traits.hpp>
-#include <alpaka/wait/Traits.hpp>
-
-#include <alpaka/core/Cuda.hpp>
-
-#include <stdexcept>
-#include <memory>
-#include <functional>
-#include <mutex>
-#include <condition_variable>
-#include <thread>
-
-namespace alpaka
-{
-    namespace event
-    {
-        class EventCudaRt;
-    }
-}
-
-namespace alpaka
-{
-    namespace queue
-    {
-        namespace cuda
-        {
-            namespace detail
-            {
-                //#############################################################################
-                //! The CUDA RT blocking queue implementation.
-                class QueueCudaRtBlockingImpl final
-                {
-                public:
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST QueueCudaRtBlockingImpl(
-                        dev::DevCudaRt const & dev) :
-                            m_dev(dev),
-                            m_CudaQueue()
-                    {
-                        ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                        // Set the current device.
-                        ALPAKA_CUDA_RT_CHECK(
-                            cudaSetDevice(
-                                m_dev.m_iDevice));
-                        // - cudaStreamDefault: Default queue creation flag.
-                        // - cudaStreamNonBlocking: Specifies that work running in the created queue may run concurrently with work in queue 0 (the NULL queue),
-                        //   and that the created queue should perform no implicit synchronization with queue 0.
-                        // Create the queue on the current device.
-                        // NOTE: cudaStreamNonBlocking is required to match the semantic implemented in the alpaka CPU queue.
-                        // It would be too much work to implement implicit default queue synchronization on CPU.
-                        ALPAKA_CUDA_RT_CHECK(
-                            cudaStreamCreateWithFlags(
-                                &m_CudaQueue,
-                                cudaStreamNonBlocking));
-                    }
-                    //-----------------------------------------------------------------------------
-                    QueueCudaRtBlockingImpl(QueueCudaRtBlockingImpl const &) = delete;
-                    //-----------------------------------------------------------------------------
-                    QueueCudaRtBlockingImpl(QueueCudaRtBlockingImpl &&) = default;
-                    //-----------------------------------------------------------------------------
-                    auto operator=(QueueCudaRtBlockingImpl const &) -> QueueCudaRtBlockingImpl & = delete;
-                    //-----------------------------------------------------------------------------
-                    auto operator=(QueueCudaRtBlockingImpl &&) -> QueueCudaRtBlockingImpl & = delete;
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST ~QueueCudaRtBlockingImpl()
-                    {
-                        ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                        // Set the current device. \TODO: Is setting the current device before cudaStreamDestroy required?
-                        ALPAKA_CUDA_RT_CHECK(
-                            cudaSetDevice(
-                                m_dev.m_iDevice));
-                        // In case the device is still doing work in the queue when cudaStreamDestroy() is called, the function will return immediately
-                        // and the resources associated with queue will be released automatically once the device has completed all work in queue.
-                        // -> No need to synchronize here.
-                        ALPAKA_CUDA_RT_CHECK(
-                            cudaStreamDestroy(
-                                m_CudaQueue));
-                    }
-
-                public:
-                    dev::DevCudaRt const m_dev;   //!< The device this queue is bound to.
-                    cudaStream_t m_CudaQueue;
-                };
-            }
-        }
-
-        //#############################################################################
-        //! The CUDA RT blocking queue.
-        class QueueCudaRtBlocking final : public concepts::Implements<wait::ConceptCurrentThreadWaitFor, QueueCudaRtBlocking>
-        {
-        public:
-            //-----------------------------------------------------------------------------
-            ALPAKA_FN_HOST QueueCudaRtBlocking(
-                dev::DevCudaRt const & dev) :
-                m_spQueueImpl(std::make_shared<cuda::detail::QueueCudaRtBlockingImpl>(dev))
-            {}
-            //-----------------------------------------------------------------------------
-            QueueCudaRtBlocking(QueueCudaRtBlocking const &) = default;
-            //-----------------------------------------------------------------------------
-            QueueCudaRtBlocking(QueueCudaRtBlocking &&) = default;
-            //-----------------------------------------------------------------------------
-            auto operator=(QueueCudaRtBlocking const &) -> QueueCudaRtBlocking & = default;
-            //-----------------------------------------------------------------------------
-            auto operator=(QueueCudaRtBlocking &&) -> QueueCudaRtBlocking & = default;
-            //-----------------------------------------------------------------------------
-            ALPAKA_FN_HOST auto operator==(QueueCudaRtBlocking const & rhs) const
-            -> bool
-            {
-                return (m_spQueueImpl == rhs.m_spQueueImpl);
-            }
-            //-----------------------------------------------------------------------------
-            ALPAKA_FN_HOST auto operator!=(QueueCudaRtBlocking const & rhs) const
-            -> bool
-            {
-                return !((*this) == rhs);
-            }
-            //-----------------------------------------------------------------------------
-            ~QueueCudaRtBlocking() = default;
-
-        public:
-            std::shared_ptr<cuda::detail::QueueCudaRtBlockingImpl> m_spQueueImpl;
-        };
-    }
-
-    namespace dev
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CUDA RT blocking queue device type trait specialization.
-            template<>
-            struct DevType<
-                queue::QueueCudaRtBlocking>
-            {
-                using type = dev::DevCudaRt;
-            };
-            //#############################################################################
-            //! The CUDA RT blocking queue device get trait specialization.
-            template<>
-            struct GetDev<
-                queue::QueueCudaRtBlocking>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto getDev(
-                    queue::QueueCudaRtBlocking const & queue)
-                -> dev::DevCudaRt
-                {
-                    return queue.m_spQueueImpl->m_dev;
-                }
-            };
-        }
-    }
-    namespace event
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CUDA RT blocking queue event type trait specialization.
-            template<>
-            struct EventType<
-                queue::QueueCudaRtBlocking>
-            {
-                using type = event::EventCudaRt;
-            };
-        }
-    }
-    namespace queue
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CUDA RT blocking queue enqueue trait specialization.
-            template<
-                typename TTask>
-            struct Enqueue<
-                queue::QueueCudaRtBlocking,
-                TTask>
-            {
-                //#############################################################################
-                enum class CallbackState
-                {
-                    enqueued,
-                    notified,
-                    finished,
-                };
-
-                //#############################################################################
-                struct CallbackSynchronizationData : public std::enable_shared_from_this<CallbackSynchronizationData>
-                {
-                    std::mutex m_mutex;
-                    std::condition_variable m_event;
-                    CallbackState state = CallbackState::enqueued;
-                };
-
-                //-----------------------------------------------------------------------------
-                static void CUDART_CB cudaRtCallback(cudaStream_t /*queue*/, cudaError_t /*status*/, void *arg)
-                {
-                    // explicitly copy the shared_ptr so that this method holds the state even when the executing thread has already finished.
-                    const auto pCallbackSynchronizationData = reinterpret_cast<CallbackSynchronizationData*>(arg)->shared_from_this();
-
-                    // Notify the executing thread.
-                    {
-                        std::unique_lock<std::mutex> lock(pCallbackSynchronizationData->m_mutex);
-                        pCallbackSynchronizationData->state = CallbackState::notified;
-                    }
-                    pCallbackSynchronizationData->m_event.notify_one();
-
-                    // Wait for the executing thread to finish the task if it has not already finished.
-                    std::unique_lock<std::mutex> lock(pCallbackSynchronizationData->m_mutex);
-                    if(pCallbackSynchronizationData->state != CallbackState::finished)
-                    {
-                        pCallbackSynchronizationData->m_event.wait(
-                            lock,
-                            [pCallbackSynchronizationData](){
-                                return pCallbackSynchronizationData->state == CallbackState::finished;
-                            }
-                        );
-                    }
-                }
-
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto enqueue(
-                    queue::QueueCudaRtBlocking & queue,
-                    TTask const & task)
-                -> void
-                {
-                    auto pCallbackSynchronizationData = std::make_shared<CallbackSynchronizationData>();
-
-                    ALPAKA_CUDA_RT_CHECK(cudaStreamAddCallback(
-                        queue.m_spQueueImpl->m_CudaQueue,
-                        cudaRtCallback,
-                        pCallbackSynchronizationData.get(),
-                        0u));
-
-                    // We start a new std::thread which stores the task to be executed.
-                    // This circumvents the limitation that it is not possible to call CUDA methods within the CUDA callback thread.
-                    // The CUDA thread signals the std::thread when it is ready to execute the task.
-                    // The CUDA thread is waiting for the std::thread to signal that it is finished executing the task
-                    // before it executes the next task in the queue (CUDA stream).
-                    std::thread t(
-                        [pCallbackSynchronizationData, task](){
-
-                            // If the callback has not yet been called, we wait for it.
-                            {
-                                std::unique_lock<std::mutex> lock(pCallbackSynchronizationData->m_mutex);
-                                if(pCallbackSynchronizationData->state != CallbackState::notified)
-                                {
-                                    pCallbackSynchronizationData->m_event.wait(
-                                        lock,
-                                        [pCallbackSynchronizationData](){
-                                            return pCallbackSynchronizationData->state == CallbackState::notified;
-                                        }
-                                    );
-                                }
-
-                                task();
-
-                                // Notify the waiting CUDA thread.
-                                pCallbackSynchronizationData->state = CallbackState::finished;
-                            }
-                            pCallbackSynchronizationData->m_event.notify_one();
-                        }
-                    );
-
-                    t.join();
-                }
-            };
-            //#############################################################################
-            //! The CUDA RT blocking queue test trait specialization.
-            template<>
-            struct Empty<
-                queue::QueueCudaRtBlocking>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto empty(
-                    queue::QueueCudaRtBlocking const & queue)
-                -> bool
-                {
-                    ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                    // Query is allowed even for queues on non current device.
-                    cudaError_t ret = cudaSuccess;
-                    ALPAKA_CUDA_RT_CHECK_IGNORE(
-                        ret = cudaStreamQuery(
-                            queue.m_spQueueImpl->m_CudaQueue),
-                        cudaErrorNotReady);
-                    return (ret == cudaSuccess);
-                }
-            };
-        }
-    }
-    namespace wait
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CUDA RT blocking queue thread wait trait specialization.
-            //!
-            //! Blocks execution of the calling thread until the queue has finished processing all previously requested tasks (kernels, data copies, ...)
-            template<>
-            struct CurrentThreadWaitFor<
-                queue::QueueCudaRtBlocking>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto currentThreadWaitFor(
-                    queue::QueueCudaRtBlocking const & queue)
-                -> void
-                {
-                    ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                    // Sync is allowed even for queues on non current device.
-                    ALPAKA_CUDA_RT_CHECK(cudaStreamSynchronize(
-                        queue.m_spQueueImpl->m_CudaQueue));
-                }
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/queue/QueueCudaRtNonBlocking.hpp b/thirdParty/cupla/alpaka/include/alpaka/queue/QueueCudaRtNonBlocking.hpp
deleted file mode 100644
index d89f860131..0000000000
--- a/thirdParty/cupla/alpaka/include/alpaka/queue/QueueCudaRtNonBlocking.hpp
+++ /dev/null
@@ -1,346 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz, René Widera
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_CUDA
-    #error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
-#endif
-
-#include <alpaka/dev/DevCudaRt.hpp>
-
-#include <alpaka/dev/Traits.hpp>
-#include <alpaka/event/Traits.hpp>
-#include <alpaka/queue/Traits.hpp>
-#include <alpaka/wait/Traits.hpp>
-
-#include <alpaka/core/Cuda.hpp>
-
-#include <stdexcept>
-#include <memory>
-#include <functional>
-#include <mutex>
-#include <condition_variable>
-#include <thread>
-
-namespace alpaka
-{
-    namespace event
-    {
-        class EventCudaRt;
-    }
-}
-
-namespace alpaka
-{
-    namespace queue
-    {
-        namespace cuda
-        {
-            namespace detail
-            {
-                //#############################################################################
-                //! The CUDA RT non-blocking queue implementation.
-                class QueueCudaRtNonBlockingImpl final
-                {
-                public:
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST QueueCudaRtNonBlockingImpl(
-                        dev::DevCudaRt const & dev) :
-                            m_dev(dev),
-                            m_CudaQueue()
-                    {
-                        ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                        // Set the current device.
-                        ALPAKA_CUDA_RT_CHECK(
-                            cudaSetDevice(
-                                m_dev.m_iDevice));
-                        // - cudaStreamDefault: Default queue creation flag.
-                        // - cudaStreamNonBlocking: Specifies that work running in the created queue may run concurrently with work in queue 0 (the NULL queue),
-                        //   and that the created queue should perform no implicit synchronization with queue 0.
-                        // Create the queue on the current device.
-                        // NOTE: cudaStreamNonBlocking is required to match the semantic implemented in the alpaka CPU queue.
-                        // It would be too much work to implement implicit default queue synchronization on CPU.
-                        ALPAKA_CUDA_RT_CHECK(
-                            cudaStreamCreateWithFlags(
-                                &m_CudaQueue,
-                                cudaStreamNonBlocking));
-                    }
-                    //-----------------------------------------------------------------------------
-                    QueueCudaRtNonBlockingImpl(QueueCudaRtNonBlockingImpl const &) = delete;
-                    //-----------------------------------------------------------------------------
-                    QueueCudaRtNonBlockingImpl(QueueCudaRtNonBlockingImpl &&) = default;
-                    //-----------------------------------------------------------------------------
-                    auto operator=(QueueCudaRtNonBlockingImpl const &) -> QueueCudaRtNonBlockingImpl & = delete;
-                    //-----------------------------------------------------------------------------
-                    auto operator=(QueueCudaRtNonBlockingImpl &&) -> QueueCudaRtNonBlockingImpl & = delete;
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST ~QueueCudaRtNonBlockingImpl()
-                    {
-                        ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                        // Set the current device. \TODO: Is setting the current device before cudaStreamDestroy required?
-                        ALPAKA_CUDA_RT_CHECK(
-                            cudaSetDevice(
-                                m_dev.m_iDevice));
-                        // In case the device is still doing work in the queue when cudaStreamDestroy() is called, the function will return immediately
-                        // and the resources associated with queue will be released automatically once the device has completed all work in queue.
-                        // -> No need to synchronize here.
-                        ALPAKA_CUDA_RT_CHECK(
-                            cudaStreamDestroy(
-                                m_CudaQueue));
-                    }
-
-                public:
-                    dev::DevCudaRt const m_dev;   //!< The device this queue is bound to.
-                    cudaStream_t m_CudaQueue;
-                };
-            }
-        }
-
-        //#############################################################################
-        //! The CUDA RT non-blocking queue.
-        class QueueCudaRtNonBlocking final : public concepts::Implements<wait::ConceptCurrentThreadWaitFor, QueueCudaRtNonBlocking>
-        {
-        public:
-            //-----------------------------------------------------------------------------
-            ALPAKA_FN_HOST QueueCudaRtNonBlocking(
-                dev::DevCudaRt const & dev) :
-                m_spQueueImpl(std::make_shared<cuda::detail::QueueCudaRtNonBlockingImpl>(dev))
-            {}
-            //-----------------------------------------------------------------------------
-            QueueCudaRtNonBlocking(QueueCudaRtNonBlocking const &) = default;
-            //-----------------------------------------------------------------------------
-            QueueCudaRtNonBlocking(QueueCudaRtNonBlocking &&) = default;
-            //-----------------------------------------------------------------------------
-            auto operator=(QueueCudaRtNonBlocking const &) -> QueueCudaRtNonBlocking & = default;
-            //-----------------------------------------------------------------------------
-            auto operator=(QueueCudaRtNonBlocking &&) -> QueueCudaRtNonBlocking & = default;
-            //-----------------------------------------------------------------------------
-            ALPAKA_FN_HOST auto operator==(QueueCudaRtNonBlocking const & rhs) const
-            -> bool
-            {
-                return (m_spQueueImpl == rhs.m_spQueueImpl);
-            }
-            //-----------------------------------------------------------------------------
-            ALPAKA_FN_HOST auto operator!=(QueueCudaRtNonBlocking const & rhs) const
-            -> bool
-            {
-                return !((*this) == rhs);
-            }
-            //-----------------------------------------------------------------------------
-            ~QueueCudaRtNonBlocking() = default;
-
-        public:
-            std::shared_ptr<cuda::detail::QueueCudaRtNonBlockingImpl> m_spQueueImpl;
-        };
-    }
-
-    namespace dev
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CUDA RT non-blocking queue device type trait specialization.
-            template<>
-            struct DevType<
-                queue::QueueCudaRtNonBlocking>
-            {
-                using type = dev::DevCudaRt;
-            };
-            //#############################################################################
-            //! The CUDA RT non-blocking queue device get trait specialization.
-            template<>
-            struct GetDev<
-                queue::QueueCudaRtNonBlocking>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto getDev(
-                    queue::QueueCudaRtNonBlocking const & queue)
-                -> dev::DevCudaRt
-                {
-                    return queue.m_spQueueImpl->m_dev;
-                }
-            };
-        }
-    }
-    namespace event
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CUDA RT non-blocking queue event type trait specialization.
-            template<>
-            struct EventType<
-                queue::QueueCudaRtNonBlocking>
-            {
-                using type = event::EventCudaRt;
-            };
-        }
-    }
-    namespace queue
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CUDA RT sync queue enqueue trait specialization.
-            template<
-                typename TTask>
-            struct Enqueue<
-                queue::QueueCudaRtNonBlocking,
-                TTask>
-            {
-                //#############################################################################
-                enum class CallbackState
-                {
-                    enqueued,
-                    notified,
-                    finished,
-                };
-
-                //#############################################################################
-                struct CallbackSynchronizationData : public std::enable_shared_from_this<CallbackSynchronizationData>
-                {
-                    std::mutex m_mutex;
-                    std::condition_variable m_event;
-                    CallbackState state = CallbackState::enqueued;
-                };
-
-                //-----------------------------------------------------------------------------
-                static void CUDART_CB cudaRtCallback(cudaStream_t /*queue*/, cudaError_t /*status*/, void *arg)
-                {
-                    // explicitly copy the shared_ptr so that this method holds the state even when the executing thread has already finished.
-                    const auto pCallbackSynchronizationData = reinterpret_cast<CallbackSynchronizationData*>(arg)->shared_from_this();
-
-                    // Notify the executing thread.
-                    {
-                        std::unique_lock<std::mutex> lock(pCallbackSynchronizationData->m_mutex);
-                        pCallbackSynchronizationData->state = CallbackState::notified;
-                    }
-                    pCallbackSynchronizationData->m_event.notify_one();
-
-                    // Wait for the executing thread to finish the task if it has not already finished.
-                    std::unique_lock<std::mutex> lock(pCallbackSynchronizationData->m_mutex);
-                    if(pCallbackSynchronizationData->state != CallbackState::finished)
-                    {
-                        pCallbackSynchronizationData->m_event.wait(
-                            lock,
-                            [pCallbackSynchronizationData](){
-                                return pCallbackSynchronizationData->state == CallbackState::finished;
-                            }
-                        );
-                    }
-                }
-
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto enqueue(
-                    queue::QueueCudaRtNonBlocking & queue,
-                    TTask const & task)
-                -> void
-                {
-                    auto pCallbackSynchronizationData = std::make_shared<CallbackSynchronizationData>();
-
-                    ALPAKA_CUDA_RT_CHECK(cudaStreamAddCallback(
-                        queue.m_spQueueImpl->m_CudaQueue,
-                        cudaRtCallback,
-                        pCallbackSynchronizationData.get(),
-                        0u));
-
-                    // We start a new std::thread which stores the task to be executed.
-                    // This circumvents the limitation that it is not possible to call CUDA methods within the CUDA callback thread.
-                    // The CUDA thread signals the std::thread when it is ready to execute the task.
-                    // The CUDA thread is waiting for the std::thread to signal that it is finished executing the task
-                    // before it executes the next task in the queue (CUDA stream).
-                    std::thread t(
-                        [pCallbackSynchronizationData, task](){
-
-                            // If the callback has not yet been called, we wait for it.
-                            {
-                                std::unique_lock<std::mutex> lock(pCallbackSynchronizationData->m_mutex);
-                                if(pCallbackSynchronizationData->state != CallbackState::notified)
-                                {
-                                    pCallbackSynchronizationData->m_event.wait(
-                                        lock,
-                                        [pCallbackSynchronizationData](){
-                                            return pCallbackSynchronizationData->state == CallbackState::notified;
-                                        }
-                                    );
-                                }
-
-                                task();
-
-                                // Notify the waiting CUDA thread.
-                                pCallbackSynchronizationData->state = CallbackState::finished;
-                            }
-                            pCallbackSynchronizationData->m_event.notify_one();
-                        }
-                    );
-
-                    t.detach();
-                }
-            };
-            //#############################################################################
-            //! The CUDA RT non-blocking queue test trait specialization.
-            template<>
-            struct Empty<
-                queue::QueueCudaRtNonBlocking>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto empty(
-                    queue::QueueCudaRtNonBlocking const & queue)
-                -> bool
-                {
-                    ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                    // Query is allowed even for queues on non current device.
-                    cudaError_t ret = cudaSuccess;
-                    ALPAKA_CUDA_RT_CHECK_IGNORE(
-                        ret = cudaStreamQuery(
-                            queue.m_spQueueImpl->m_CudaQueue),
-                        cudaErrorNotReady);
-                    return (ret == cudaSuccess);
-                }
-            };
-        }
-    }
-    namespace wait
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CUDA RT non-blocking queue thread wait trait specialization.
-            //!
-            //! Blocks execution of the calling thread until the queue has finished processing all previously requested tasks (kernels, data copies, ...)
-            template<>
-            struct CurrentThreadWaitFor<
-                queue::QueueCudaRtNonBlocking>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto currentThreadWaitFor(
-                    queue::QueueCudaRtNonBlocking const & queue)
-                -> void
-                {
-                    ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                    // Sync is allowed even for queues on non current device.
-                    ALPAKA_CUDA_RT_CHECK(
-                        cudaStreamSynchronize(
-                            queue.m_spQueueImpl->m_CudaQueue));
-                }
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/queue/QueueGenericThreadsBlocking.hpp b/thirdParty/cupla/alpaka/include/alpaka/queue/QueueGenericThreadsBlocking.hpp
new file mode 100644
index 0000000000..bd1b4d8539
--- /dev/null
+++ b/thirdParty/cupla/alpaka/include/alpaka/queue/QueueGenericThreadsBlocking.hpp
@@ -0,0 +1,202 @@
+/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Matthias Werner
+ *
+ * This file is part of alpaka.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#pragma once
+
+#include <alpaka/core/Unused.hpp>
+#include <alpaka/dev/Traits.hpp>
+#include <alpaka/event/Traits.hpp>
+#include <alpaka/queue/Traits.hpp>
+#include <alpaka/queue/cpu/IGenericThreadsQueue.hpp>
+#include <alpaka/wait/Traits.hpp>
+
+#include <atomic>
+#include <memory>
+#include <mutex>
+
+namespace alpaka
+{
+    template<typename TDev>
+    class EventGenericThreads;
+
+    namespace generic
+    {
+        namespace detail
+        {
+#if BOOST_COMP_CLANG
+// avoid diagnostic warning: "has no out-of-line virtual method definitions; its vtable will be emitted in every
+// translation unit [-Werror,-Wweak-vtables]" https://stackoverflow.com/a/29288300
+#    pragma clang diagnostic push
+#    pragma clang diagnostic ignored "-Wweak-vtables"
+#endif
+            //#############################################################################
+            //! The blocking generic-threads queue implementation: the state shared by all copies of a queue handle.
+            template<typename TDev>
+            class QueueGenericThreadsBlockingImpl final : public IGenericThreadsQueue<TDev>
+#if BOOST_COMP_CLANG
+#    pragma clang diagnostic pop
+#endif
+            {
+            public:
+                //-----------------------------------------------------------------------------
+                explicit QueueGenericThreadsBlockingImpl(TDev const& dev) noexcept
+                    : m_dev(dev)
+                    , m_bCurrentlyExecutingTask(false) // a freshly created queue is not running any task
+                {
+                }
+                //-----------------------------------------------------------------------------
+                QueueGenericThreadsBlockingImpl(QueueGenericThreadsBlockingImpl<TDev> const&) = delete;
+                //-----------------------------------------------------------------------------
+                QueueGenericThreadsBlockingImpl(QueueGenericThreadsBlockingImpl<TDev>&&) = delete;
+                //-----------------------------------------------------------------------------
+                auto operator=(QueueGenericThreadsBlockingImpl<TDev> const&)
+                    -> QueueGenericThreadsBlockingImpl<TDev>& = delete;
+                //-----------------------------------------------------------------------------
+                auto operator=(QueueGenericThreadsBlockingImpl<TDev>&&)
+                    -> QueueGenericThreadsBlockingImpl<TDev>& = delete;
+
+                //-----------------------------------------------------------------------------
+                void enqueue(EventGenericThreads<TDev>& ev) final // overrides the IGenericThreadsQueue<TDev> virtual
+                {
+                    alpaka::enqueue(*this, ev);
+                }
+
+                //-----------------------------------------------------------------------------
+                void wait(EventGenericThreads<TDev> const& ev) final // overrides the IGenericThreadsQueue<TDev> virtual
+                {
+                    alpaka::wait(*this, ev);
+                }
+
+            public:
+                TDev const m_dev; //!< The device this queue is bound to.
+                std::mutex mutable m_mutex; //!< Held while a task runs; also locked by currentThreadWaitFor().
+                std::atomic<bool> m_bCurrentlyExecutingTask; //!< True only while enqueue() is running a task.
+            };
+        } // namespace detail
+    } // namespace generic
+
+    //#############################################################################
+    //! The blocking generic-threads device queue. Copies share one implementation object and compare equal.
+    template<typename TDev>
+    class QueueGenericThreadsBlocking final
+        : public concepts::Implements<ConceptCurrentThreadWaitFor, QueueGenericThreadsBlocking<TDev>>
+        , public concepts::Implements<ConceptQueue, QueueGenericThreadsBlocking<TDev>>
+        , public concepts::Implements<ConceptGetDev, QueueGenericThreadsBlocking<TDev>>
+    {
+    public:
+        //-----------------------------------------------------------------------------
+        explicit QueueGenericThreadsBlocking(TDev const& dev)
+            : m_spQueueImpl(std::make_shared<generic::detail::QueueGenericThreadsBlockingImpl<TDev>>(dev))
+        {
+            ALPAKA_DEBUG_FULL_LOG_SCOPE;
+
+            dev.registerQueue(m_spQueueImpl); // hand the shared implementation object to the device
+        }
+        //-----------------------------------------------------------------------------
+        QueueGenericThreadsBlocking(QueueGenericThreadsBlocking<TDev> const&) = default;
+        //-----------------------------------------------------------------------------
+        QueueGenericThreadsBlocking(QueueGenericThreadsBlocking<TDev>&&) = default;
+        //-----------------------------------------------------------------------------
+        auto operator=(QueueGenericThreadsBlocking<TDev> const&) -> QueueGenericThreadsBlocking<TDev>& = default;
+        //-----------------------------------------------------------------------------
+        auto operator=(QueueGenericThreadsBlocking<TDev>&&) -> QueueGenericThreadsBlocking<TDev>& = default;
+        //-----------------------------------------------------------------------------
+        auto operator==(QueueGenericThreadsBlocking<TDev> const& rhs) const -> bool
+        {
+            return (m_spQueueImpl == rhs.m_spQueueImpl); // identity: both handles own the same implementation
+        }
+        //-----------------------------------------------------------------------------
+        auto operator!=(QueueGenericThreadsBlocking<TDev> const& rhs) const -> bool
+        {
+            return !((*this) == rhs);
+        }
+        //-----------------------------------------------------------------------------
+        ~QueueGenericThreadsBlocking() = default;
+
+    public:
+        std::shared_ptr<generic::detail::QueueGenericThreadsBlockingImpl<TDev>> m_spQueueImpl; //!< Shared state.
+    };
+
+    namespace traits
+    {
+        //#############################################################################
+        //! The blocking generic-threads queue device type trait specialization: the device type is TDev.
+        template<typename TDev>
+        struct DevType<QueueGenericThreadsBlocking<TDev>>
+        {
+            using type = TDev;
+        };
+        //#############################################################################
+        //! The blocking generic-threads queue device get trait specialization.
+        template<typename TDev>
+        struct GetDev<QueueGenericThreadsBlocking<TDev>>
+        {
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto getDev(QueueGenericThreadsBlocking<TDev> const& queue) -> TDev
+            {
+                return queue.m_spQueueImpl->m_dev; // returns a copy of the device the queue was constructed with
+            }
+        };
+
+        //#############################################################################
+        //! The blocking generic-threads queue event type trait specialization: events are EventGenericThreads.
+        template<typename TDev>
+        struct EventType<QueueGenericThreadsBlocking<TDev>>
+        {
+            using type = EventGenericThreads<TDev>;
+        };
+
+        //#############################################################################
+        //! The blocking generic-threads queue enqueue trait specialization.
+        //! Runs the task on the calling thread under the queue mutex; the busy flag is reset even if the task throws.
+        template<typename TDev, typename TTask>
+        struct Enqueue<QueueGenericThreadsBlocking<TDev>, TTask>
+        {
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto enqueue(QueueGenericThreadsBlocking<TDev>& queue, TTask const& task) -> void
+            {
+                std::lock_guard<std::mutex> lk(queue.m_spQueueImpl->m_mutex);
+                // Scope guard: without it a throwing task would leave the flag set and empty() false forever.
+                struct ResetBusyFlag { std::atomic<bool>& flag; ~ResetBusyFlag() { flag = false; } };
+                ResetBusyFlag const resetOnExit{queue.m_spQueueImpl->m_bCurrentlyExecutingTask};
+                queue.m_spQueueImpl->m_bCurrentlyExecutingTask = true;
+
+                task();
+            }
+        };
+        //#############################################################################
+        //! The blocking generic-threads queue test trait specialization.
+        template<typename TDev>
+        struct Empty<QueueGenericThreadsBlocking<TDev>>
+        {
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto empty(QueueGenericThreadsBlocking<TDev> const& queue) -> bool
+            {
+                return !queue.m_spQueueImpl->m_bCurrentlyExecutingTask; // empty iff no task is running right now
+            }
+        };
+
+        //#############################################################################
+        //! The blocking generic-threads queue thread wait trait specialization.
+        //!
+        //! Blocks execution of the calling thread until the queue has finished processing all previously requested
+        //! tasks (kernels, data copies, ...). Implemented by acquiring and releasing the queue mutex.
+        template<typename TDev>
+        struct CurrentThreadWaitFor<QueueGenericThreadsBlocking<TDev>>
+        {
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto currentThreadWaitFor(QueueGenericThreadsBlocking<TDev> const& queue) -> void
+            {
+                std::lock_guard<std::mutex> lk(queue.m_spQueueImpl->m_mutex); // waits while a task holds the mutex
+            }
+        };
+    } // namespace traits
+} // namespace alpaka
+
+#include <alpaka/event/EventGenericThreads.hpp>
diff --git a/thirdParty/cupla/alpaka/include/alpaka/queue/QueueGenericThreadsNonBlocking.hpp b/thirdParty/cupla/alpaka/include/alpaka/queue/QueueGenericThreadsNonBlocking.hpp
new file mode 100644
index 0000000000..f0be17fff1
--- /dev/null
+++ b/thirdParty/cupla/alpaka/include/alpaka/queue/QueueGenericThreadsNonBlocking.hpp
@@ -0,0 +1,200 @@
+/* Copyright 2019 Benjamin Worpitz, Matthias Werner
+ *
+ * This file is part of alpaka.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#pragma once
+
+#include <alpaka/core/ConcurrentExecPool.hpp>
+#include <alpaka/core/Unused.hpp>
+#include <alpaka/dev/Traits.hpp>
+#include <alpaka/event/Traits.hpp>
+#include <alpaka/queue/Traits.hpp>
+#include <alpaka/queue/cpu/IGenericThreadsQueue.hpp>
+#include <alpaka/wait/Traits.hpp>
+
+#include <future>
+#include <memory>
+#include <mutex>
+#include <thread>
+#include <type_traits>
+
+namespace alpaka
+{
+    template<typename TDev>
+    class EventGenericThreads;
+
+    namespace generic
+    {
+        namespace detail
+        {
+#if BOOST_COMP_CLANG
+// avoid diagnostic warning: "has no out-of-line virtual method definitions; its vtable will be emitted in every
+// translation unit [-Werror,-Wweak-vtables]" https://stackoverflow.com/a/29288300
+#    pragma clang diagnostic push
+#    pragma clang diagnostic ignored "-Wweak-vtables"
+#endif
+            //#############################################################################
+            //! The CPU non-blocking device queue implementation.
+            template<typename TDev>
+            class QueueGenericThreadsNonBlockingImpl final : public IGenericThreadsQueue<TDev>
+#if BOOST_COMP_CLANG
+#    pragma clang diagnostic pop
+#endif
+            {
+            private:
+                //#############################################################################
+                using ThreadPool = alpaka::core::detail::ConcurrentExecPool<
+                    std::size_t,
+                    std::thread, // The concurrent execution type.
+                    std::promise, // The promise type.
+                    void, // The type yielding the current concurrent execution.
+                    std::mutex, // The mutex type to use. Only required if TisYielding is true.
+                    std::condition_variable, // The condition variable type to use. Only required if TisYielding is
+                                             // true.
+                    false>; // If the threads should yield.
+
+            public:
+                //-----------------------------------------------------------------------------
+                explicit QueueGenericThreadsNonBlockingImpl(TDev const& dev) : m_dev(dev), m_workerThread(1u)
+                {
+                }
+                //-----------------------------------------------------------------------------
+                QueueGenericThreadsNonBlockingImpl(QueueGenericThreadsNonBlockingImpl<TDev> const&) = delete;
+                //-----------------------------------------------------------------------------
+                QueueGenericThreadsNonBlockingImpl(QueueGenericThreadsNonBlockingImpl<TDev>&&) = delete;
+                //-----------------------------------------------------------------------------
+                auto operator=(QueueGenericThreadsNonBlockingImpl<TDev> const&)
+                    -> QueueGenericThreadsNonBlockingImpl<TDev>& = delete;
+                //-----------------------------------------------------------------------------
+                auto operator=(QueueGenericThreadsNonBlockingImpl<TDev>&&)
+                    -> QueueGenericThreadsNonBlockingImpl<TDev>& = delete;
+
+                //-----------------------------------------------------------------------------
+                void enqueue(EventGenericThreads<TDev>& ev) final
+                {
+                    alpaka::enqueue(*this, ev);
+                }
+
+                //-----------------------------------------------------------------------------
+                void wait(EventGenericThreads<TDev> const& ev) final
+                {
+                    alpaka::wait(*this, ev);
+                }
+
+            public:
+                TDev const m_dev; //!< The device this queue is bound to.
+
+                ThreadPool m_workerThread; //!< The execution pool the enqueued tasks are handed to.
+            };
+        } // namespace detail
+    } // namespace generic
+
+    //#############################################################################
+    //! The CPU non-blocking device queue. Copies share one implementation object and therefore compare equal.
+    template<typename TDev>
+    class QueueGenericThreadsNonBlocking final
+        : public concepts::Implements<ConceptCurrentThreadWaitFor, QueueGenericThreadsNonBlocking<TDev>>
+        , public concepts::Implements<ConceptQueue, QueueGenericThreadsNonBlocking<TDev>>
+        , public concepts::Implements<ConceptGetDev, QueueGenericThreadsNonBlocking<TDev>>
+    {
+    public:
+        //-----------------------------------------------------------------------------
+        explicit QueueGenericThreadsNonBlocking(TDev const& dev)
+            : m_spQueueImpl(std::make_shared<generic::detail::QueueGenericThreadsNonBlockingImpl<TDev>>(dev))
+        {
+            ALPAKA_DEBUG_FULL_LOG_SCOPE;
+
+            dev.registerQueue(m_spQueueImpl);
+        }
+        //-----------------------------------------------------------------------------
+        QueueGenericThreadsNonBlocking(QueueGenericThreadsNonBlocking<TDev> const&) = default;
+        //-----------------------------------------------------------------------------
+        QueueGenericThreadsNonBlocking(QueueGenericThreadsNonBlocking<TDev>&&) = default;
+        //-----------------------------------------------------------------------------
+        auto operator=(QueueGenericThreadsNonBlocking<TDev> const&) -> QueueGenericThreadsNonBlocking<TDev>& = default;
+        //-----------------------------------------------------------------------------
+        auto operator=(QueueGenericThreadsNonBlocking<TDev>&&) -> QueueGenericThreadsNonBlocking<TDev>& = default;
+        //-----------------------------------------------------------------------------
+        auto operator==(QueueGenericThreadsNonBlocking<TDev> const& rhs) const -> bool
+        {
+            return (m_spQueueImpl == rhs.m_spQueueImpl);
+        }
+        //-----------------------------------------------------------------------------
+        auto operator!=(QueueGenericThreadsNonBlocking<TDev> const& rhs) const -> bool
+        {
+            return !((*this) == rhs);
+        }
+        //-----------------------------------------------------------------------------
+        ~QueueGenericThreadsNonBlocking() = default;
+
+    public:
+        std::shared_ptr<generic::detail::QueueGenericThreadsNonBlockingImpl<TDev>> m_spQueueImpl;
+    };
+
+    namespace traits
+    {
+        //#############################################################################
+        //! The CPU non-blocking device queue device type trait specialization.
+        template<typename TDev>
+        struct DevType<QueueGenericThreadsNonBlocking<TDev>>
+        {
+            using type = TDev;
+        };
+        //#############################################################################
+        //! The CPU non-blocking device queue device get trait specialization.
+        template<typename TDev>
+        struct GetDev<QueueGenericThreadsNonBlocking<TDev>>
+        {
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto getDev(QueueGenericThreadsNonBlocking<TDev> const& queue) -> TDev
+            {
+                return queue.m_spQueueImpl->m_dev;
+            }
+        };
+
+        //#############################################################################
+        //! The CPU non-blocking device queue event type trait specialization.
+        template<typename TDev>
+        struct EventType<QueueGenericThreadsNonBlocking<TDev>>
+        {
+            using type = EventGenericThreads<TDev>;
+        };
+
+        //#############################################################################
+        //! The CPU non-blocking device queue enqueue trait specialization.
+        //! This default implementation for all tasks hands the task over to the worker thread pool of the queue.
+        template<typename TDev, typename TTask>
+        struct Enqueue<QueueGenericThreadsNonBlocking<TDev>, TTask>
+        {
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto enqueue(QueueGenericThreadsNonBlocking<TDev>& queue, TTask const& task) -> void
+            {
+// Workaround: Clang can not support this when natively compiling device code. See ConcurrentExecPool.hpp.
+#if !(BOOST_COMP_CLANG_CUDA && BOOST_ARCH_PTX)
+                queue.m_spQueueImpl->m_workerThread.enqueueTask(task);
+#else
+                alpaka::ignore_unused(queue);
+                alpaka::ignore_unused(task);
+#endif
+            }
+        };
+        //#############################################################################
+        //! The CPU non-blocking device queue test trait specialization: empty means the worker pool is idle.
+        template<typename TDev>
+        struct Empty<QueueGenericThreadsNonBlocking<TDev>>
+        {
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto empty(QueueGenericThreadsNonBlocking<TDev> const& queue) -> bool
+            {
+                return queue.m_spQueueImpl->m_workerThread.isIdle();
+            }
+        };
+    } // namespace traits
+} // namespace alpaka
+
+#include <alpaka/event/EventGenericThreads.hpp>
diff --git a/thirdParty/cupla/alpaka/include/alpaka/queue/QueueHipRtBlocking.hpp b/thirdParty/cupla/alpaka/include/alpaka/queue/QueueHipRtBlocking.hpp
deleted file mode 100644
index cab8b7db33..0000000000
--- a/thirdParty/cupla/alpaka/include/alpaka/queue/QueueHipRtBlocking.hpp
+++ /dev/null
@@ -1,385 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz, Matthias Werner, René Widera
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_HIP
-    #error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
-#endif
-
-#include <alpaka/dev/DevHipRt.hpp>
-
-#include <alpaka/dev/Traits.hpp>
-#include <alpaka/event/Traits.hpp>
-#include <alpaka/queue/Traits.hpp>
-#include <alpaka/wait/Traits.hpp>
-
-#include <alpaka/core/Hip.hpp>
-
-#include <stdexcept>
-#include <memory>
-#include <functional>
-#include <mutex>
-#include <condition_variable>
-#include <thread>
-
-namespace alpaka
-{
-    namespace event
-    {
-        class EventHipRt;
-    }
-}
-
-namespace alpaka
-{
-    namespace queue
-    {
-        namespace hip
-        {
-            namespace detail
-            {
-                //#############################################################################
-                //! The HIP RT blocking queue implementation.
-                class QueueHipRtBlockingImpl final
-                {
-                public:
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST QueueHipRtBlockingImpl(
-                        dev::DevHipRt const & dev) :
-                            m_dev(dev),
-                            m_HipQueue()
-                    {
-                        ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                        // Set the current device.
-                        ALPAKA_HIP_RT_CHECK(
-                            hipSetDevice(
-                                m_dev.m_iDevice));
-                        // - hipStreamDefault: Default queue creation flag.
-                        // - hipStreamNonBlocking: Specifies that work running in the created queue may run concurrently with work in queue 0 (the NULL queue),
-                        //   and that the created queue should perform no implicit synchronization with queue 0.
-                        // Create the queue on the current device.
-                        // NOTE: hipStreamNonBlocking is required to match the semantic implemented in the alpaka CPU queue.
-                        // It would be too much work to implement implicit default queue synchronization on CPU.
-                        ALPAKA_HIP_RT_CHECK(
-                            hipStreamCreateWithFlags(
-                                &m_HipQueue,
-                                hipStreamNonBlocking));
-                    }
-                    //-----------------------------------------------------------------------------
-                    QueueHipRtBlockingImpl(QueueHipRtBlockingImpl const &) = delete;
-                    //-----------------------------------------------------------------------------
-                    QueueHipRtBlockingImpl(QueueHipRtBlockingImpl &&) = delete;
-                    //-----------------------------------------------------------------------------
-                    auto operator=(QueueHipRtBlockingImpl const &) -> QueueHipRtBlockingImpl & = delete;
-                    //-----------------------------------------------------------------------------
-                    auto operator=(QueueHipRtBlockingImpl &&) -> QueueHipRtBlockingImpl & = delete;
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST ~QueueHipRtBlockingImpl()
-                    {
-                        ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                        // Set the current device. \TODO: Is setting the current device before hipStreamDestroy required?
-                        ALPAKA_HIP_RT_CHECK(
-                            hipSetDevice(
-                                m_dev.m_iDevice));
-                        // In case the device is still doing work in the queue when hipStreamDestroy() is called, the function will return immediately
-                        // and the resources associated with queue will be released automatically once the device has completed all work in queue.
-                        // -> No need to synchronize here.
-                        ALPAKA_HIP_RT_CHECK(
-                            hipStreamDestroy(
-                                m_HipQueue));
-                    }
-
-                public:
-                    dev::DevHipRt const m_dev;   //!< The device this queue is bound to.
-                    hipStream_t m_HipQueue;
-
-#if BOOST_COMP_HCC  // NOTE: workaround for unwanted nonblocking hip streams for HCC (NVCC streams are blocking)
-                    int m_callees = 0;
-                    std::mutex m_mutex;
-#endif
-                };
-            } // detail
-        } // hip
-
-        //#############################################################################
-        //! The HIP RT blocking queue.
-        class QueueHipRtBlocking final : public concepts::Implements<wait::ConceptCurrentThreadWaitFor, QueueHipRtBlocking>
-        {
-        public:
-            //-----------------------------------------------------------------------------
-            ALPAKA_FN_HOST QueueHipRtBlocking(
-                dev::DevHipRt const & dev) :
-                m_spQueueImpl(std::make_shared<hip::detail::QueueHipRtBlockingImpl>(dev))
-            {}
-            //-----------------------------------------------------------------------------
-            QueueHipRtBlocking(QueueHipRtBlocking const &) = default;
-            //-----------------------------------------------------------------------------
-            QueueHipRtBlocking(QueueHipRtBlocking &&) = default;
-            //-----------------------------------------------------------------------------
-            auto operator=(QueueHipRtBlocking const &) -> QueueHipRtBlocking & = default;
-            //-----------------------------------------------------------------------------
-            auto operator=(QueueHipRtBlocking &&) -> QueueHipRtBlocking & = default;
-            //-----------------------------------------------------------------------------
-            ALPAKA_FN_HOST auto operator==(QueueHipRtBlocking const & rhs) const
-            -> bool
-            {
-                return (m_spQueueImpl == rhs.m_spQueueImpl);
-            }
-            //-----------------------------------------------------------------------------
-            ALPAKA_FN_HOST auto operator!=(QueueHipRtBlocking const & rhs) const
-            -> bool
-            {
-                return !((*this) == rhs);
-            }
-            //-----------------------------------------------------------------------------
-            // NOTE: for HCC streams workaround: no need to sync with spawned tasks as this queue is already syncing in enqueue
-            ALPAKA_FN_HOST ~QueueHipRtBlocking() = default;
-
-        public:
-            std::shared_ptr<hip::detail::QueueHipRtBlockingImpl> m_spQueueImpl;
-        };
-    }
-
-    namespace dev
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The HIP RT blocking queue device type trait specialization.
-            template<>
-            struct DevType<
-                queue::QueueHipRtBlocking>
-            {
-                using type = dev::DevHipRt;
-            };
-            //#############################################################################
-            //! The HIP RT blocking queue device get trait specialization.
-            template<>
-            struct GetDev<
-                queue::QueueHipRtBlocking>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto getDev(
-                    queue::QueueHipRtBlocking const & queue)
-                -> dev::DevHipRt
-                {
-                    return queue.m_spQueueImpl->m_dev;
-                }
-            };
-        }
-    }
-    namespace event
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The HIP RT blocking queue event type trait specialization.
-            template<>
-            struct EventType<
-                queue::QueueHipRtBlocking>
-            {
-                using type = event::EventHipRt;
-            };
-        }
-    }
-    namespace queue
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The HIP RT blocking queue enqueue trait specialization.
-            template<
-                typename TTask>
-            struct Enqueue<
-                queue::QueueHipRtBlocking,
-                TTask>
-            {
-                //#############################################################################
-                enum class CallbackState
-                {
-                    enqueued,
-                    notified,
-                    finished,
-                };
-
-                //#############################################################################
-                struct CallbackSynchronizationData : public std::enable_shared_from_this<CallbackSynchronizationData>
-                {
-                    std::mutex m_mutex;
-                    std::condition_variable m_event;
-                    CallbackState state = CallbackState::enqueued;
-                };
-
-                //-----------------------------------------------------------------------------
-                static void HIPRT_CB hipRtCallback(hipStream_t /*queue*/, hipError_t /*status*/, void *arg)
-                {
-                    // explicitly copy the shared_ptr so that this method holds the state even when the executing thread has already finished.
-                    const auto pCallbackSynchronizationData = reinterpret_cast<CallbackSynchronizationData*>(arg)->shared_from_this();
-
-                    // Notify the executing thread.
-                    {
-                        std::unique_lock<std::mutex> lock(pCallbackSynchronizationData->m_mutex);
-                        pCallbackSynchronizationData->state = CallbackState::notified;
-                    }
-                    pCallbackSynchronizationData->m_event.notify_one();
-
-                    // Wait for the executing thread to finish the task if it has not already finished.
-                    std::unique_lock<std::mutex> lock(pCallbackSynchronizationData->m_mutex);
-                    if(pCallbackSynchronizationData->state != CallbackState::finished)
-                    {
-                        pCallbackSynchronizationData->m_event.wait(
-                            lock,
-                            [pCallbackSynchronizationData](){
-                                return pCallbackSynchronizationData->state == CallbackState::finished;
-                            }
-                        );
-                    }
-                }
-
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto enqueue(
-                    queue::QueueHipRtBlocking & queue,
-                    TTask const & task)
-                -> void
-                {
-#if BOOST_COMP_HCC  // NOTE: workaround for unwanted nonblocking hip streams for HCC (NVCC streams are blocking)
-                    {
-                        // thread-safe callee incrementing
-                        std::lock_guard<std::mutex> guard(queue.m_spQueueImpl->m_mutex);
-                        queue.m_spQueueImpl->m_callees += 1;
-                    }
-#endif
-
-                    auto pCallbackSynchronizationData = std::make_shared<CallbackSynchronizationData>();
-
-                    ALPAKA_HIP_RT_CHECK(hipStreamAddCallback(
-                        queue.m_spQueueImpl->m_HipQueue,
-                        hipRtCallback,
-                        pCallbackSynchronizationData.get(),
-                        0u));
-
-                    // We start a new std::thread which stores the task to be executed.
-                    // This circumvents the limitation that it is not possible to call HIP methods within the HIP callback thread.
-                    // The HIP thread signals the std::thread when it is ready to execute the task.
-                    // The HIP thread is waiting for the std::thread to signal that it is finished executing the task
-                    // before it executes the next task in the queue (HIP stream).
-                    std::thread t(
-                        [pCallbackSynchronizationData,
-                         task
-#if BOOST_COMP_HCC // NOTE: workaround for unwanted nonblocking hip streams for HCC (NVCC streams are blocking)
-                         ,&queue
-#endif
-                        ](){
-
-#if BOOST_COMP_HCC // NOTE: workaround for unwanted nonblocking hip streams for HCC (NVCC streams are blocking)
-                            // thread-safe task execution and callee decrementing
-                            std::lock_guard<std::mutex> guard(queue.m_spQueueImpl->m_mutex);
-#endif
-
-                            // If the callback has not yet been called, we wait for it.
-                            {
-                                std::unique_lock<std::mutex> lock(pCallbackSynchronizationData->m_mutex);
-                                if(pCallbackSynchronizationData->state != CallbackState::notified)
-                                {
-                                    pCallbackSynchronizationData->m_event.wait(
-                                        lock,
-                                        [pCallbackSynchronizationData](){
-                                            return pCallbackSynchronizationData->state == CallbackState::notified;
-                                        }
-                                    );
-                                }
-
-                                task();
-
-                                // Notify the waiting HIP thread.
-                                pCallbackSynchronizationData->state = CallbackState::finished;
-                            }
-                            pCallbackSynchronizationData->m_event.notify_one();
-
-#if BOOST_COMP_HCC  // NOTE: workaround for unwanted nonblocking hip streams for HCC (NVCC streams are blocking)
-                            queue.m_spQueueImpl->m_callees -= 1;
-#endif
-                        }
-                    );
-
-                    t.join();
-                }
-            };
-            //#############################################################################
-            //! The HIP RT blocking queue test trait specialization.
-            template<>
-            struct Empty<
-                queue::QueueHipRtBlocking>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto empty(
-                    queue::QueueHipRtBlocking const & queue)
-                -> bool
-                {
-                    ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                    // see: https://github.com/ROCm-Developer-Tools/HIP/blob/roc-1.9.x/tests/src/runtimeApi/stream/hipStreamWaitEvent.cpp
-
-#if BOOST_COMP_HCC  // NOTE: workaround for unwanted nonblocking hip streams for HCC (NVCC streams are blocking)
-                    return (queue.m_spQueueImpl->m_callees==0);
-#else
-                    // Query is allowed even for queues on non current device.
-                    hipError_t ret = hipSuccess;
-                    ALPAKA_HIP_RT_CHECK_IGNORE(
-                        ret = hipStreamQuery(
-                            queue.m_spQueueImpl->m_HipQueue),
-                        hipErrorNotReady);
-                    return (ret == hipSuccess);
-#endif
-                }
-            };
-        }
-    }
-    namespace wait
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The HIP RT blocking queue thread wait trait specialization.
-            //!
-            //! Blocks execution of the calling thread until the queue has finished processing all previously requested tasks (kernels, data copies, ...)
-            template<>
-            struct CurrentThreadWaitFor<
-                queue::QueueHipRtBlocking>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto currentThreadWaitFor(
-                    queue::QueueHipRtBlocking const & queue)
-                -> void
-                {
-                    ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-#if BOOST_COMP_HCC  // NOTE: workaround for unwanted nonblocking hip streams for HCC (NVCC streams are blocking)
-                    while(queue.m_spQueueImpl->m_callees>0) {
-                        std::this_thread::sleep_for(std::chrono::milliseconds(10u));
-                    }
-#else
-                    // Sync is allowed even for queues on non current device.
-                    ALPAKA_HIP_RT_CHECK( hipStreamSynchronize(
-                        queue.m_spQueueImpl->m_HipQueue));
-#endif
-                }
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/queue/QueueHipRtNonBlocking.hpp b/thirdParty/cupla/alpaka/include/alpaka/queue/QueueHipRtNonBlocking.hpp
deleted file mode 100644
index 6d0f25760c..0000000000
--- a/thirdParty/cupla/alpaka/include/alpaka/queue/QueueHipRtNonBlocking.hpp
+++ /dev/null
@@ -1,397 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz, Matthias Werner, René Widera
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_HIP
-    #error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
-#endif
-
-#include <alpaka/dev/DevHipRt.hpp>
-
-#include <alpaka/dev/Traits.hpp>
-#include <alpaka/event/Traits.hpp>
-#include <alpaka/queue/Traits.hpp>
-#include <alpaka/wait/Traits.hpp>
-#include <alpaka/meta/DependentFalseType.hpp>
-
-#include <alpaka/core/Hip.hpp>
-
-#include <stdexcept>
-#include <memory>
-#include <functional>
-#include <mutex>
-#include <condition_variable>
-#include <thread>
-
-namespace alpaka
-{
-    namespace event
-    {
-        class EventHipRt;
-    }
-}
-
-namespace alpaka
-{
-    namespace queue
-    {
-        namespace hip
-        {
-            namespace detail
-            {
-                //#############################################################################
-                //! The HIP RT non-blocking queue implementation.
-                class QueueHipRtNonBlockingImpl final
-                {
-                public:
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST QueueHipRtNonBlockingImpl(
-                        dev::DevHipRt const & dev) :
-                            m_dev(dev),
-                            m_HipQueue()
-                    {
-                        ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                        // Set the current device.
-                        ALPAKA_HIP_RT_CHECK(
-                            hipSetDevice(
-                                m_dev.m_iDevice));
-                        // - hipStreamDefault: Default queue creation flag.
-                        // - hipStreamNonBlocking: Specifies that work running in the created queue may run concurrently with work in queue 0 (the NULL queue),
-                        //   and that the created queue should perform no implicit synchronization with queue 0.
-                        // Create the queue on the current device.
-                        // NOTE: hipStreamNonBlocking is required to match the semantic implemented in the alpaka CPU queue.
-                        // It would be too much work to implement implicit default queue synchronization on CPU.
-                        ALPAKA_HIP_RT_CHECK(
-                            hipStreamCreateWithFlags(
-                                &m_HipQueue,
-                                hipStreamNonBlocking));
-                    }
-                    //-----------------------------------------------------------------------------
-                    QueueHipRtNonBlockingImpl(QueueHipRtNonBlockingImpl const &) = delete;
-                    //-----------------------------------------------------------------------------
-                    QueueHipRtNonBlockingImpl(QueueHipRtNonBlockingImpl &&) = delete;
-                    //-----------------------------------------------------------------------------
-                    auto operator=(QueueHipRtNonBlockingImpl const &) -> QueueHipRtNonBlockingImpl & = delete;
-                    //-----------------------------------------------------------------------------
-                    auto operator=(QueueHipRtNonBlockingImpl &&) -> QueueHipRtNonBlockingImpl & = delete;
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST ~QueueHipRtNonBlockingImpl()
-                    {
-                        ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                        // Set the current device.
-                        ALPAKA_HIP_RT_CHECK(
-                            hipSetDevice(
-                                m_dev.m_iDevice));
-                        // In case the device is still doing work in the queue when hipStreamDestroy() is called, the function will return immediately
-                        // and the resources associated with queue will be released automatically once the device has completed all work in queue.
-                        // -> No need to synchronize here.
-                        ALPAKA_HIP_RT_CHECK(
-                            hipStreamDestroy(
-                                m_HipQueue));
-                    }
-
-                public:
-                    dev::DevHipRt const m_dev;   //!< The device this queue is bound to.
-                    hipStream_t m_HipQueue;
-
-#if BOOST_COMP_HCC  // NOTE: workaround for unwanted nonblocking hip streams for HCC (NVCC streams are blocking)
-                    int m_callees = 0;
-                    std::mutex m_mutex;
-#endif
-                };
-            }
-        }
-
-        //#############################################################################
-        //! The HIP RT non-blocking queue.
-        class QueueHipRtNonBlocking final : public concepts::Implements<wait::ConceptCurrentThreadWaitFor, QueueHipRtNonBlocking>
-        {
-        public:
-            //-----------------------------------------------------------------------------
-            ALPAKA_FN_HOST QueueHipRtNonBlocking(
-                dev::DevHipRt const & dev) :
-                m_spQueueImpl(std::make_shared<hip::detail::QueueHipRtNonBlockingImpl>(dev))
-            {}
-            //-----------------------------------------------------------------------------
-            QueueHipRtNonBlocking(QueueHipRtNonBlocking const &) = default;
-            //-----------------------------------------------------------------------------
-            QueueHipRtNonBlocking(QueueHipRtNonBlocking &&) = default;
-            //-----------------------------------------------------------------------------
-            auto operator=(QueueHipRtNonBlocking const &) -> QueueHipRtNonBlocking & = default;
-            //-----------------------------------------------------------------------------
-            auto operator=(QueueHipRtNonBlocking &&) -> QueueHipRtNonBlocking & = default;
-            //-----------------------------------------------------------------------------
-            ALPAKA_FN_HOST auto operator==(QueueHipRtNonBlocking const & rhs) const
-            -> bool
-            {
-                return (m_spQueueImpl == rhs.m_spQueueImpl);
-            }
-            //-----------------------------------------------------------------------------
-            ALPAKA_FN_HOST auto operator!=(QueueHipRtNonBlocking const & rhs) const
-            -> bool
-            {
-                return !((*this) == rhs);
-            }
-            //-----------------------------------------------------------------------------
-            ALPAKA_FN_HOST ~QueueHipRtNonBlocking() {
-#if BOOST_COMP_HCC  // NOTE: workaround for unwanted nonblocking hip streams for HCC (NVCC streams are blocking)
-                // we are a non-blocking queue, so we have to wait here with its destruction until all spawned tasks have been processed
-                alpaka::wait::wait(*this);
-#endif
-            }
-
-        public:
-            std::shared_ptr<hip::detail::QueueHipRtNonBlockingImpl> m_spQueueImpl;
-        };
-    }
-
-    namespace dev
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The HIP RT non-blocking queue device type trait specialization.
-            template<>
-            struct DevType<
-                queue::QueueHipRtNonBlocking>
-            {
-                using type = dev::DevHipRt;
-            };
-            //#############################################################################
-            //! The HIP RT non-blocking queue device get trait specialization.
-            template<>
-            struct GetDev<
-                queue::QueueHipRtNonBlocking>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto getDev(
-                    queue::QueueHipRtNonBlocking const & queue)
-                -> dev::DevHipRt
-                {
-                    return queue.m_spQueueImpl->m_dev;
-                }
-            };
-        }
-    }
-    namespace event
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The HIP RT non-blocking queue event type trait specialization.
-            template<>
-            struct EventType<
-                queue::QueueHipRtNonBlocking>
-            {
-                using type = event::EventHipRt;
-            };
-        }
-    }
-    namespace queue
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The HIP RT blocking queue enqueue trait specialization.
-            template<
-                typename TTask>
-            struct Enqueue<
-                queue::QueueHipRtNonBlocking,
-                TTask>
-            {
-                //#############################################################################
-                enum class CallbackState
-                {
-                    enqueued,
-                    notified,
-                    finished,
-                };
-
-                //#############################################################################
-                struct CallbackSynchronizationData : public std::enable_shared_from_this<CallbackSynchronizationData>
-                {
-                    std::mutex m_mutex;
-                    std::condition_variable m_event;
-                    CallbackState state = CallbackState::enqueued;
-                };
-
-                //-----------------------------------------------------------------------------
-                static void HIPRT_CB hipRtCallback(hipStream_t /*queue*/, hipError_t /*status*/, void *arg)
-                {
-                    // explicitly copy the shared_ptr so that this method holds the state even when the executing thread has already finished.
-                    const auto pCallbackSynchronizationData = reinterpret_cast<CallbackSynchronizationData*>(arg)->shared_from_this();
-
-                    // Notify the executing thread.
-                    {
-                        std::unique_lock<std::mutex> lock(pCallbackSynchronizationData->m_mutex);
-                        pCallbackSynchronizationData->state = CallbackState::notified;
-                    }
-                    pCallbackSynchronizationData->m_event.notify_one();
-
-                    // Wait for the executing thread to finish the task if it has not already finished.
-                    std::unique_lock<std::mutex> lock(pCallbackSynchronizationData->m_mutex);
-                    if(pCallbackSynchronizationData->state != CallbackState::finished)
-                    {
-                        pCallbackSynchronizationData->m_event.wait(
-                            lock,
-                            [pCallbackSynchronizationData](){
-                                return pCallbackSynchronizationData->state == CallbackState::finished;
-                            }
-                        );
-                    }
-                }
-
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto enqueue(
-                    queue::QueueHipRtNonBlocking & queue,
-                    TTask const & task)
-                -> void
-                {
-#if BOOST_COMP_HIP
-                    // NOTE: hip callbacks are not blocking the stream.
-                    // The workaround used for HIP(hcc) would avoid the usage in a workflow with
-                    // many stream/event synchronizations (e.g. PIConGPU).
-                    // @todo remove this assert when hipStreamAddCallback is fixed
-                    static_assert(
-                                meta::DependentFalseType<TTask>::value,
-                                "Callbacks are not supported for HIP-clang");
-#endif
-
-#if BOOST_COMP_HCC  // NOTE: workaround for unwanted nonblocking hip streams for HCC (NVCC streams are blocking)
-                    {
-                        // thread-safe callee incrementing
-                        std::lock_guard<std::mutex> guard(queue.m_spQueueImpl->m_mutex);
-                        queue.m_spQueueImpl->m_callees += 1;
-                    }
-#endif
-                    auto pCallbackSynchronizationData = std::make_shared<CallbackSynchronizationData>();
-                    // test example: https://github.com/ROCm-Developer-Tools/HIP/blob/roc-1.9.x/tests/src/runtimeApi/stream/hipStreamAddCallback.cpp
-                    ALPAKA_HIP_RT_CHECK(hipStreamAddCallback(
-                        queue.m_spQueueImpl->m_HipQueue,
-                        hipRtCallback,
-                        pCallbackSynchronizationData.get(),
-                        0u));
-
-                    // We start a new std::thread which stores the task to be executed.
-                    // This circumvents the limitation that it is not possible to call HIP methods within the HIP callback thread.
-                    // The HIP thread signals the std::thread when it is ready to execute the task.
-                    // The HIP thread is waiting for the std::thread to signal that it is finished executing the task
-                    // before it executes the next task in the queue (HIP stream).
-                    std::thread t(
-                        [pCallbackSynchronizationData,
-                         task
-#if BOOST_COMP_HCC // NOTE: workaround for unwanted nonblocking hip streams for HCC (NVCC streams are blocking)
-                         ,&queue // requires queue's destructor to wait for all tasks
-#endif
-                        ](){
-
-#if BOOST_COMP_HCC // NOTE: workaround for unwanted nonblocking hip streams for HCC (NVCC streams are blocking)
-                            // thread-safe task execution and callee decrementing
-                            std::lock_guard<std::mutex> guard(queue.m_spQueueImpl->m_mutex);
-#endif
-
-                            // If the callback has not yet been called, we wait for it.
-                            {
-                                std::unique_lock<std::mutex> lock(pCallbackSynchronizationData->m_mutex);
-                                if(pCallbackSynchronizationData->state != CallbackState::notified)
-                                {
-                                    pCallbackSynchronizationData->m_event.wait(
-                                        lock,
-                                        [pCallbackSynchronizationData](){
-                                            return pCallbackSynchronizationData->state == CallbackState::notified;
-                                        }
-                                    );
-                                }
-
-                                task();
-
-                                // Notify the waiting HIP thread.
-                                pCallbackSynchronizationData->state = CallbackState::finished;
-                            }
-                            pCallbackSynchronizationData->m_event.notify_one();
-#if BOOST_COMP_HCC // NOTE: workaround for unwanted nonblocking hip streams for HCC (NVCC streams are blocking)
-                            queue.m_spQueueImpl->m_callees -= 1;
-#endif
-                        }
-                    );
-
-                    t.detach();
-                }
-            };
-            //#############################################################################
-            //! The HIP RT non-blocking queue test trait specialization.
-            template<>
-            struct Empty<
-                queue::QueueHipRtNonBlocking>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto empty(
-                    queue::QueueHipRtNonBlocking const & queue)
-                -> bool
-                {
-                    ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-#if BOOST_COMP_HCC  // NOTE: workaround for unwanted nonblocking hip streams for HCC (NVCC streams are blocking)
-                    return (queue.m_spQueueImpl->m_callees==0);
-#else
-
-                    // Query is allowed even for queues on non current device.
-                    hipError_t ret = hipSuccess;
-                    ALPAKA_HIP_RT_CHECK_IGNORE(
-                        ret = hipStreamQuery(
-                            queue.m_spQueueImpl->m_HipQueue),
-                        hipErrorNotReady);
-                    return (ret == hipSuccess);
-#endif
-                }
-            };
-        }
-    }
-    namespace wait
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The HIP RT non-blocking queue thread wait trait specialization.
-            //!
-            //! Blocks execution of the calling thread until the queue has finished processing all previously requested tasks (kernels, data copies, ...)
-            template<>
-            struct CurrentThreadWaitFor<
-                queue::QueueHipRtNonBlocking>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto currentThreadWaitFor(
-                    queue::QueueHipRtNonBlocking const & queue)
-                -> void
-                {
-                    ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-#if BOOST_COMP_HCC  // NOTE: workaround for unwanted nonblocking hip streams for HCC (NVCC streams are blocking)
-                    while(queue.m_spQueueImpl->m_callees>0) {
-                        std::this_thread::sleep_for(std::chrono::milliseconds(10u));
-                    }
-#else
-                    // Sync is allowed even for queues on non current device.
-                    ALPAKA_HIP_RT_CHECK( hipStreamSynchronize(
-                            queue.m_spQueueImpl->m_HipQueue));
-#endif
-                }
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/queue/QueueOaccBlocking.hpp b/thirdParty/cupla/alpaka/include/alpaka/queue/QueueOaccBlocking.hpp
new file mode 100644
index 0000000000..3bf635608b
--- /dev/null
+++ b/thirdParty/cupla/alpaka/include/alpaka/queue/QueueOaccBlocking.hpp
@@ -0,0 +1,26 @@
+/* Copyright 2019 Benjamin Worpitz, René Widera
+ *
+ * This file is part of Alpaka.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#pragma once
+
+#ifdef ALPAKA_ACC_ANY_BT_OACC_ENABLED
+
+#    if _OPENACC < 201306
+#        error If ALPAKA_ACC_ANY_BT_OACC_ENABLED is set, the compiler has to support OpenACC 2.0 or higher!
+#    endif
+
+#    include <alpaka/dev/DevOacc.hpp>
+#    include <alpaka/queue/QueueGenericThreadsBlocking.hpp>
+
+namespace alpaka
+{
+    using QueueOaccBlocking = QueueGenericThreadsBlocking<DevOacc>; //!< Blocking queue type for DevOacc.
+} // namespace alpaka
+
+#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/queue/QueueOaccNonBlocking.hpp b/thirdParty/cupla/alpaka/include/alpaka/queue/QueueOaccNonBlocking.hpp
new file mode 100644
index 0000000000..0ad9301019
--- /dev/null
+++ b/thirdParty/cupla/alpaka/include/alpaka/queue/QueueOaccNonBlocking.hpp
@@ -0,0 +1,26 @@
+/* Copyright 2019 Benjamin Worpitz, René Widera
+ *
+ * This file is part of Alpaka.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#pragma once
+
+#ifdef ALPAKA_ACC_ANY_BT_OACC_ENABLED
+
+#    if _OPENACC < 201306
+#        error If ALPAKA_ACC_ANY_BT_OACC_ENABLED is set, the compiler has to support OpenACC 2.0 or higher!
+#    endif
+
+#    include <alpaka/dev/DevOacc.hpp>
+#    include <alpaka/queue/QueueGenericThreadsNonBlocking.hpp>
+
+namespace alpaka
+{
+    using QueueOaccNonBlocking = QueueGenericThreadsNonBlocking<DevOacc>; //!< Non-blocking queue type for DevOacc.
+} // namespace alpaka
+
+#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/queue/QueueOmp5Blocking.hpp b/thirdParty/cupla/alpaka/include/alpaka/queue/QueueOmp5Blocking.hpp
new file mode 100644
index 0000000000..9d41746b8f
--- /dev/null
+++ b/thirdParty/cupla/alpaka/include/alpaka/queue/QueueOmp5Blocking.hpp
@@ -0,0 +1,26 @@
+/* Copyright 2019 Benjamin Worpitz, René Widera
+ *
+ * This file is part of Alpaka.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#pragma once
+
+#ifdef ALPAKA_ACC_ANY_BT_OMP5_ENABLED
+
+#    if _OPENMP < 201307
+#        error If ALPAKA_ACC_ANY_BT_OMP5_ENABLED is set, the compiler has to support OpenMP 4.0 or higher!
+#    endif
+
+#    include <alpaka/dev/DevOmp5.hpp>
+#    include <alpaka/queue/QueueGenericThreadsBlocking.hpp>
+
+namespace alpaka
+{
+    using QueueOmp5Blocking = QueueGenericThreadsBlocking<DevOmp5>; //!< Blocking queue type for DevOmp5.
+} // namespace alpaka
+
+#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/queue/QueueOmp5NonBlocking.hpp b/thirdParty/cupla/alpaka/include/alpaka/queue/QueueOmp5NonBlocking.hpp
new file mode 100644
index 0000000000..100fdcb623
--- /dev/null
+++ b/thirdParty/cupla/alpaka/include/alpaka/queue/QueueOmp5NonBlocking.hpp
@@ -0,0 +1,26 @@
+/* Copyright 2019 Benjamin Worpitz, René Widera
+ *
+ * This file is part of Alpaka.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#pragma once
+
+#ifdef ALPAKA_ACC_ANY_BT_OMP5_ENABLED
+
+#    if _OPENMP < 201307
+#        error If ALPAKA_ACC_ANY_BT_OMP5_ENABLED is set, the compiler has to support OpenMP 4.0 or higher!
+#    endif
+
+#    include <alpaka/dev/DevOmp5.hpp>
+#    include <alpaka/queue/QueueGenericThreadsNonBlocking.hpp>
+
+namespace alpaka
+{
+    using QueueOmp5NonBlocking = QueueGenericThreadsNonBlocking<DevOmp5>; //!< Non-blocking queue type for DevOmp5.
+} // namespace alpaka
+
+#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/queue/QueueUniformCudaHipRtBlocking.hpp b/thirdParty/cupla/alpaka/include/alpaka/queue/QueueUniformCudaHipRtBlocking.hpp
new file mode 100644
index 0000000000..6c4d50ef95
--- /dev/null
+++ b/thirdParty/cupla/alpaka/include/alpaka/queue/QueueUniformCudaHipRtBlocking.hpp
@@ -0,0 +1,201 @@
+/* Copyright 2019 Benjamin Worpitz, René Widera
+ *
+ * This file is part of alpaka.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#pragma once
+
+#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
+
+#    include <alpaka/core/BoostPredef.hpp>
+
+#    if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && !BOOST_LANG_CUDA
+#        error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
+#    endif
+
+#    if defined(ALPAKA_ACC_GPU_HIP_ENABLED) && !BOOST_LANG_HIP
+#        error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
+#    endif
+
+#    include <alpaka/core/Concepts.hpp>
+#    include <alpaka/dev/DevUniformCudaHipRt.hpp>
+#    include <alpaka/dev/Traits.hpp>
+#    include <alpaka/event/Traits.hpp>
+#    include <alpaka/meta/DependentFalseType.hpp>
+#    include <alpaka/queue/Traits.hpp>
+#    include <alpaka/queue/cuda_hip/QueueUniformCudaHipRtBase.hpp>
+
+// Backend specific includes.
+#    if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
+#        include <alpaka/core/Cuda.hpp>
+#    else
+#        include <alpaka/core/Hip.hpp>
+#    endif
+
+#    include <condition_variable>
+#    include <functional>
+#    include <memory>
+#    include <mutex>
+#    include <stdexcept>
+#    include <thread>
+
+namespace alpaka
+{
+    class EventUniformCudaHipRt;
+
+    //#############################################################################
+    //! The CUDA/HIP RT blocking queue.
+    class QueueUniformCudaHipRtBlocking final : public uniform_cuda_hip::detail::QueueUniformCudaHipRtBase
+    {
+    public:
+        //! Constructs the queue by forwarding dev to the QueueUniformCudaHipRtBase constructor.
+        ALPAKA_FN_HOST QueueUniformCudaHipRtBlocking(DevUniformCudaHipRt const& dev)
+            : uniform_cuda_hip::detail::QueueUniformCudaHipRtBase(dev)
+        {
+        }
+        //! Copy constructor (defaulted).
+        QueueUniformCudaHipRtBlocking(QueueUniformCudaHipRtBlocking const&) = default;
+        //! Move constructor (defaulted).
+        QueueUniformCudaHipRtBlocking(QueueUniformCudaHipRtBlocking&&) = default;
+        //! Copy assignment (defaulted).
+        auto operator=(QueueUniformCudaHipRtBlocking const&) -> QueueUniformCudaHipRtBlocking& = default;
+        //! Move assignment (defaulted).
+        auto operator=(QueueUniformCudaHipRtBlocking&&) -> QueueUniformCudaHipRtBlocking& = default;
+        //! Two queues compare equal when their m_spQueueImpl members compare equal.
+        ALPAKA_FN_HOST auto operator==(QueueUniformCudaHipRtBlocking const& rhs) const -> bool
+        {
+            return (m_spQueueImpl == rhs.m_spQueueImpl);
+        }
+        //! Negation of operator==.
+        ALPAKA_FN_HOST auto operator!=(QueueUniformCudaHipRtBlocking const& rhs) const -> bool
+        {
+            return !((*this) == rhs);
+        }
+        //! Destructor (defaulted).
+        ~QueueUniformCudaHipRtBlocking() = default;
+    };
+
+#    if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
+    using QueueCudaRtBlocking = QueueUniformCudaHipRtBlocking; //!< Back-end specific name when CUDA is enabled.
+#    else
+    using QueueHipRtBlocking = QueueUniformCudaHipRtBlocking; //!< Back-end specific name when CUDA is not enabled.
+#    endif
+
+    namespace traits
+    {
+        //#############################################################################
+        //! The CUDA/HIP RT blocking queue device type trait specialization.
+        template<>
+        struct DevType<QueueUniformCudaHipRtBlocking>
+        {
+            using type = DevUniformCudaHipRt; //!< Device type reported for the blocking queue.
+        };
+
+        //#############################################################################
+        //! The CUDA/HIP RT blocking queue event type trait specialization.
+        template<>
+        struct EventType<QueueUniformCudaHipRtBlocking>
+        {
+            using type = EventUniformCudaHipRt; //!< Event type reported for the blocking queue.
+        };
+
+        //#############################################################################
+        //! The CUDA/HIP RT blocking queue enqueue trait specialization.
+        template<typename TTask>
+        struct Enqueue<QueueUniformCudaHipRtBlocking, TTask>
+        {
+            //! Progress of the handshake between the runtime stream callback and the worker thread.
+            enum class CallbackState
+            {
+                enqueued, //!< Initial state; the stream callback has not run yet.
+                notified, //!< Set by the stream callback; the worker thread may run the task.
+                finished, //!< Set by the worker thread after the task has returned.
+            };
+
+            //! Mutex, condition variable and state shared by the stream callback and the worker thread.
+            struct CallbackSynchronizationData : public std::enable_shared_from_this<CallbackSynchronizationData>
+            {
+                std::mutex m_mutex;
+                std::condition_variable m_event;
+                CallbackState state = CallbackState::enqueued;
+            };
+
+            //! Stream callback: signals `notified`, then blocks until the worker thread reports `finished`.
+#    if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
+            static void CUDART_CB
+#    else
+            static void HIPRT_CB
+#    endif
+            uniformCudaHipRtCallback(
+                ALPAKA_API_PREFIX(Stream_t) /*queue*/,
+                ALPAKA_API_PREFIX(Error_t) /*status*/,
+                void* arg)
+            {
+                // explicitly copy the shared_ptr so that this method holds the state even when the executing thread
+                // has already finished.
+                const auto pCallbackSynchronizationData
+                    = reinterpret_cast<CallbackSynchronizationData*>(arg)->shared_from_this();
+
+                // Notify the executing thread.
+                {
+                    std::unique_lock<std::mutex> lock(pCallbackSynchronizationData->m_mutex);
+                    pCallbackSynchronizationData->state = CallbackState::notified;
+                }
+                pCallbackSynchronizationData->m_event.notify_one();
+
+                // Wait for the executing thread to finish the task if it has not already finished.
+                std::unique_lock<std::mutex> lock(pCallbackSynchronizationData->m_mutex);
+                if(pCallbackSynchronizationData->state != CallbackState::finished)
+                {
+                    pCallbackSynchronizationData->m_event.wait(lock, [pCallbackSynchronizationData]() {
+                        return pCallbackSynchronizationData->state == CallbackState::finished;
+                    });
+                }
+            }
+
+            //! Adds the stream callback, runs task on a new std::thread and joins it before returning.
+            ALPAKA_FN_HOST static auto enqueue(QueueUniformCudaHipRtBlocking& queue, TTask const& task) -> void
+            {
+                auto pCallbackSynchronizationData = std::make_shared<CallbackSynchronizationData>();
+
+                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(ALPAKA_API_PREFIX(StreamAddCallback)(
+                    queue.m_spQueueImpl->m_UniformCudaHipQueue,
+                    uniformCudaHipRtCallback,
+                    pCallbackSynchronizationData.get(), // callback re-owns it via shared_from_this()
+                    0u));
+
+                // We start a new std::thread which stores the task to be executed.
+                // This circumvents the limitation that it is not possible to call CUDA/HIP methods within the CUDA/HIP
+                // callback thread. The CUDA/HIP thread signals the std::thread when it is ready to execute the task.
+                // The CUDA/HIP thread is waiting for the std::thread to signal that it is finished executing the task
+                // before it executes the next task in the queue (CUDA/HIP stream).
+                std::thread t([pCallbackSynchronizationData, task]() {
+                    // If the callback has not yet been called, we wait for it.
+                    {
+                        std::unique_lock<std::mutex> lock(pCallbackSynchronizationData->m_mutex);
+                        if(pCallbackSynchronizationData->state != CallbackState::notified)
+                        {
+                            pCallbackSynchronizationData->m_event.wait(lock, [pCallbackSynchronizationData]() {
+                                return pCallbackSynchronizationData->state == CallbackState::notified;
+                            });
+                        }
+
+                        task(); // runs while m_mutex is still held by lock
+
+                        // Notify the waiting CUDA/HIP thread.
+                        pCallbackSynchronizationData->state = CallbackState::finished;
+                    }
+                    pCallbackSynchronizationData->m_event.notify_one();
+                });
+
+                t.join(); // blocking queue: enqueue returns only after the task has run
+            }
+        };
+    } // namespace traits
+} // namespace alpaka
+
+#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/queue/QueueUniformCudaHipRtNonBlocking.hpp b/thirdParty/cupla/alpaka/include/alpaka/queue/QueueUniformCudaHipRtNonBlocking.hpp
new file mode 100644
index 0000000000..1d11ac119d
--- /dev/null
+++ b/thirdParty/cupla/alpaka/include/alpaka/queue/QueueUniformCudaHipRtNonBlocking.hpp
@@ -0,0 +1,206 @@
+/* Copyright 2019 Benjamin Worpitz, Matthias Werner, René Widera
+ *
+ * This file is part of alpaka.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#pragma once
+
+#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
+
+#    include <alpaka/core/BoostPredef.hpp>
+
+#    if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && !BOOST_LANG_CUDA
+#        error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
+#    endif
+
+#    if defined(ALPAKA_ACC_GPU_HIP_ENABLED) && !BOOST_LANG_HIP
+#        error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
+#    endif
+
+#    include <alpaka/core/Concepts.hpp>
+#    include <alpaka/dev/DevUniformCudaHipRt.hpp>
+#    include <alpaka/dev/Traits.hpp>
+#    include <alpaka/event/Traits.hpp>
+#    include <alpaka/meta/DependentFalseType.hpp>
+#    include <alpaka/queue/Traits.hpp>
+#    include <alpaka/queue/cuda_hip/QueueUniformCudaHipRtBase.hpp>
+
+// Backend specific includes.
+#    if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
+#        include <alpaka/core/Cuda.hpp>
+#    else
+#        include <alpaka/core/Hip.hpp>
+#    endif
+
+#    include <condition_variable>
+#    include <functional>
+#    include <memory>
+#    include <mutex>
+#    include <stdexcept>
+#    include <thread>
+
+namespace alpaka
+{
+    class EventUniformCudaHipRt;
+
+    //#############################################################################
+    //! The CUDA/HIP RT non-blocking queue: a copyable handle; equality compares the shared queue implementation.
+    class QueueUniformCudaHipRtNonBlocking final : public uniform_cuda_hip::detail::QueueUniformCudaHipRtBase
+    {
+    public:
+        //-----------------------------------------------------------------------------
+        ALPAKA_FN_HOST QueueUniformCudaHipRtNonBlocking(DevUniformCudaHipRt const& dev)
+            : uniform_cuda_hip::detail::QueueUniformCudaHipRtBase(dev)
+        {
+        }
+        //-----------------------------------------------------------------------------
+        QueueUniformCudaHipRtNonBlocking(QueueUniformCudaHipRtNonBlocking const&) = default;
+        //-----------------------------------------------------------------------------
+        QueueUniformCudaHipRtNonBlocking(QueueUniformCudaHipRtNonBlocking&&) = default;
+        //-----------------------------------------------------------------------------
+        auto operator=(QueueUniformCudaHipRtNonBlocking const&) -> QueueUniformCudaHipRtNonBlocking& = default;
+        //-----------------------------------------------------------------------------
+        auto operator=(QueueUniformCudaHipRtNonBlocking&&) -> QueueUniformCudaHipRtNonBlocking& = default;
+        //-----------------------------------------------------------------------------
+        ALPAKA_FN_HOST auto operator==(QueueUniformCudaHipRtNonBlocking const& rhs) const -> bool
+        {
+            return (m_spQueueImpl == rhs.m_spQueueImpl);
+        }
+        //-----------------------------------------------------------------------------
+        ALPAKA_FN_HOST auto operator!=(QueueUniformCudaHipRtNonBlocking const& rhs) const -> bool
+        {
+            return !((*this) == rhs);
+        }
+        //-----------------------------------------------------------------------------
+        ~QueueUniformCudaHipRtNonBlocking() = default;
+    };
+
+#    if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
+    using QueueCudaRtNonBlocking = QueueUniformCudaHipRtNonBlocking;
+#    else
+    using QueueHipRtNonBlocking = QueueUniformCudaHipRtNonBlocking;
+#    endif
+
+    namespace traits
+    {
+        //#############################################################################
+        //! The CUDA/HIP RT non-blocking queue device type trait specialization.
+        template<>
+        struct DevType<QueueUniformCudaHipRtNonBlocking>
+        {
+            using type = DevUniformCudaHipRt;
+        };
+
+        //#############################################################################
+        //! The CUDA/HIP RT non-blocking queue event type trait specialization.
+        template<>
+        struct EventType<QueueUniformCudaHipRtNonBlocking>
+        {
+            using type = EventUniformCudaHipRt;
+        };
+
+        //#############################################################################
+        //! The CUDA/HIP RT non-blocking queue enqueue trait specialization.
+        template<typename TTask>
+        struct Enqueue<QueueUniformCudaHipRtNonBlocking, TTask>
+        {
+            //#############################################################################
+            enum class CallbackState
+            {
+                enqueued, //!< Initial state: the callback has not signalled yet.
+                notified, //!< Set by the callback: the task thread may run the task.
+                finished, //!< Set by the task thread: the task has been executed.
+            };
+
+            //#############################################################################
+            struct CallbackSynchronizationData : public std::enable_shared_from_this<CallbackSynchronizationData>
+            {
+                std::mutex m_mutex; //!< Held whenever state is read or written.
+                std::condition_variable m_event; //!< Signals changes of state to the waiting side.
+                CallbackState state = CallbackState::enqueued;
+            };
+
+            //-----------------------------------------------------------------------------
+#    if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
+            static void CUDART_CB
+#    else
+            static void HIPRT_CB
+#    endif
+            uniformCudaHipRtCallback(
+                ALPAKA_API_PREFIX(Stream_t) /*queue*/,
+                ALPAKA_API_PREFIX(Error_t) /*status*/,
+                void* arg)
+            {
+                // Explicitly copy the shared_ptr so that this method holds the state even when the executing thread
+                // has already finished.
+                const auto pCallbackSynchronizationData
+                    = reinterpret_cast<CallbackSynchronizationData*>(arg)->shared_from_this();
+
+                // Notify the executing thread.
+                {
+                    std::unique_lock<std::mutex> lock(pCallbackSynchronizationData->m_mutex);
+                    pCallbackSynchronizationData->state = CallbackState::notified;
+                }
+                pCallbackSynchronizationData->m_event.notify_one();
+
+                // Wait for the executing thread to finish the task if it has not already finished.
+                std::unique_lock<std::mutex> lock(pCallbackSynchronizationData->m_mutex);
+                if(pCallbackSynchronizationData->state != CallbackState::finished)
+                {
+                    pCallbackSynchronizationData->m_event.wait(lock, [pCallbackSynchronizationData]() {
+                        return pCallbackSynchronizationData->state == CallbackState::finished;
+                    });
+                }
+            }
+
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto enqueue(QueueUniformCudaHipRtNonBlocking& queue, TTask const& task) -> void
+            {
+#    if BOOST_COMP_HIP
+                // NOTE: hip callbacks are not blocking the stream.
+                // @todo remove this assert when hipStreamAddCallback is fixed
+                static_assert(meta::DependentFalseType<TTask>::value, "Callbacks are not supported for HIP-clang");
+#    endif
+
+                auto pCallbackSynchronizationData = std::make_shared<CallbackSynchronizationData>();
+                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(ALPAKA_API_PREFIX(StreamAddCallback)(
+                    queue.m_spQueueImpl->m_UniformCudaHipQueue,
+                    uniformCudaHipRtCallback,
+                    pCallbackSynchronizationData.get(),
+                    0u));
+
+                // We start a new std::thread which stores the task to be executed.
+                // This circumvents the limitation that it is not possible to call CUDA/HIP methods within the CUDA/HIP
+                // callback thread. The CUDA/HIP thread signals the std::thread when it is ready to execute the task.
+                // The CUDA/HIP thread is waiting for the std::thread to signal that it is finished executing the task
+                // before it executes the next task in the queue (CUDA/HIP stream).
+                std::thread t([pCallbackSynchronizationData, task]() {
+                    // If the callback has not yet been called, we wait for it.
+                    {
+                        std::unique_lock<std::mutex> lock(pCallbackSynchronizationData->m_mutex);
+                        if(pCallbackSynchronizationData->state != CallbackState::notified)
+                        {
+                            pCallbackSynchronizationData->m_event.wait(lock, [pCallbackSynchronizationData]() {
+                                return pCallbackSynchronizationData->state == CallbackState::notified;
+                            });
+                        }
+                        // The task runs with the mutex held; the lock is released when this scope ends.
+                        task();
+
+                        // Notify the waiting CUDA/HIP thread.
+                        pCallbackSynchronizationData->state = CallbackState::finished;
+                    }
+                    pCallbackSynchronizationData->m_event.notify_one();
+                });
+                // Detach so that enqueue returns without waiting; the lambda's shared_ptr copy keeps the data alive.
+                t.detach();
+            }
+        };
+    } // namespace traits
+} // namespace alpaka
+
+#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/queue/Traits.hpp b/thirdParty/cupla/alpaka/include/alpaka/queue/Traits.hpp
index 47f6be0828..75312546b6 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/queue/Traits.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/queue/Traits.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Benjamin Worpitz
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -9,93 +9,65 @@
 
 #pragma once
 
-#include <alpaka/wait/Traits.hpp>
-
 #include <alpaka/core/Common.hpp>
+#include <alpaka/core/Concepts.hpp>
+#include <alpaka/wait/Traits.hpp>
 
 #include <type_traits>
 #include <utility>
 
 namespace alpaka
 {
+    struct ConceptQueue;
+
     //-----------------------------------------------------------------------------
-    //! The queue specifics.
-    namespace queue
+    //! The queue traits.
+    namespace traits
     {
-        //-----------------------------------------------------------------------------
-        //! The queue traits.
-        namespace traits
-        {
-            //#############################################################################
-            //! The queue enqueue trait.
-            template<
-                typename TQueue,
-                typename TTask,
-                typename TSfinae = void>
-            struct Enqueue;
-
-            //#############################################################################
-            //! The queue empty trait.
-            template<
-                typename TQueue,
-                typename TSfinae = void>
-            struct Empty;
+        //#############################################################################
+        //! The queue enqueue trait.
+        template<typename TQueue, typename TTask, typename TSfinae = void>
+        struct Enqueue;
 
-            //#############################################################################
-            //! Queue for an accelerator
-            template<
-                typename TAcc,
-                typename TProperty,
-                typename TSfinae = void>
-            struct QueueType;
-        }
+        //#############################################################################
+        //! The queue empty trait.
+        template<typename TQueue, typename TSfinae = void>
+        struct Empty;
 
-        //-----------------------------------------------------------------------------
-        //! Queues the given task in the given queue.
-        //!
-        //! Special Handling for events:
-        //!   If the event has previously been queued, then this call will overwrite any existing state of the event.
-        //!   Any subsequent calls which examine the status of event will only examine the completion of this most recent call to enqueue.
-        template<
-            typename TQueue,
-            typename TTask>
-        ALPAKA_FN_HOST auto enqueue(
-            TQueue & queue,
-            TTask && task)
-        -> void
-        {
-            traits::Enqueue<
-                TQueue,
-                typename std::decay<TTask>::type>
-            ::enqueue(
-                queue,
-                std::forward<TTask>(task));
-        }
+        //#############################################################################
+        //! Queue for an accelerator
+        template<typename TAcc, typename TProperty, typename TSfinae = void>
+        struct QueueType;
+    } // namespace traits
 
-        //-----------------------------------------------------------------------------
-        //! Tests if the queue is empty (all ops in the given queue have been completed).
-        template<
-            typename TQueue>
-        ALPAKA_FN_HOST auto empty(
-            TQueue const & queue)
-        -> bool
-        {
-            return
-                traits::Empty<
-                    TQueue>
-                ::empty(
-                    queue);
-        }
+    //-----------------------------------------------------------------------------
+    //! Queues the given task in the given queue via the traits::Enqueue specialization for the decayed task type.
+    //!
+    //! Special Handling for events:
+    //!   If the event has previously been queued, then this call will overwrite any existing state of the event.
+    //!   Any subsequent calls which examine the status of event will only examine the completion of this most recent
+    //!   call to enqueue.
+    template<typename TQueue, typename TTask>
+    ALPAKA_FN_HOST auto enqueue(TQueue& queue, TTask&& task) -> void
+    {
+        traits::Enqueue<TQueue, std::decay_t<TTask>>::enqueue(queue, std::forward<TTask>(task));
+    }
 
-        //-----------------------------------------------------------------------------
-        //! Queue based on the environment and a property
-        //
-        // \tparam TEnv Environment type, e.g.  accelerator, device or a platform.
-        //              queue::traits::QueueType must be specialized for TEnv
-        // \tparam TProperty Property to define the behavior of TEnv.
-        template<
-            typename TEnv,
-            typename TProperty>
-        using Queue = typename traits::QueueType<TEnv, TProperty>::type;
+    //-----------------------------------------------------------------------------
+    //! Tests if the queue is empty (all its ops have completed), using traits::Empty of its implementation base.
+    template<typename TQueue>
+    ALPAKA_FN_HOST auto empty(TQueue const& queue) -> bool
+    {
+        using ImplementationBase = concepts::ImplementationBase<ConceptQueue, TQueue>;
+        return traits::Empty<ImplementationBase>::empty(queue);
     }
-}
+
+    //-----------------------------------------------------------------------------
+    //! Queue based on the environment and a property
+    //!
+    //! \tparam TEnv Environment type, e.g.  accelerator, device or a platform.
+    //!              traits::QueueType must be specialized for TEnv
+    //! \tparam TProperty Property to define the behavior of TEnv.
+    template<typename TEnv, typename TProperty>
+    using Queue = typename traits::QueueType<TEnv, TProperty>::type;
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/include/alpaka/queue/cpu/ICpuQueue.hpp b/thirdParty/cupla/alpaka/include/alpaka/queue/cpu/ICpuQueue.hpp
index d13cc0f87e..c2af32b4ea 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/queue/cpu/ICpuQueue.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/queue/cpu/ICpuQueue.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Axel Huebl, Benjamin Worpitz, Matthias Werner
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -9,48 +9,15 @@
 
 #pragma once
 
-#include <alpaka/core/BoostPredef.hpp>
+#include <alpaka/dev/DevCpu.hpp>
+#include <alpaka/queue/cpu/IGenericThreadsQueue.hpp>
 
 namespace alpaka
 {
-    namespace event
+    namespace cpu
     {
-        class EventCpu;
-    }
-}
-
-namespace alpaka
-{
-    namespace queue
-    {
-        namespace cpu
-        {
-
-
-#if BOOST_COMP_CLANG
-    // avoid diagnostic warning: "has no out-of-line virtual method definitions; its vtable will be emitted in every translation unit [-Werror,-Wweak-vtables]"
-    // https://stackoverflow.com/a/29288300
-    #pragma clang diagnostic push
-    #pragma clang diagnostic ignored "-Wweak-vtables"
-#endif
-
-            //#############################################################################
-            //! The CPU queue interface
-            class ICpuQueue
-            {
-            public:
-                //-----------------------------------------------------------------------------
-                //! enqueue the event
-                virtual void enqueue(event::EventCpu &) = 0;
-                //-----------------------------------------------------------------------------
-                //! waiting for the event
-                virtual void wait(event::EventCpu const &) = 0;
-                //-----------------------------------------------------------------------------
-                virtual ~ICpuQueue() = default;
-            };
-#if BOOST_COMP_CLANG
-    #pragma clang diagnostic pop
-#endif
-        }
-    }
-}
+        //#############################################################################
+        //! The CPU queue interface: the generic threads queue interface instantiated for DevCpu.
+        using ICpuQueue = IGenericThreadsQueue<DevCpu>;
+    } // namespace cpu
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/include/alpaka/queue/cpu/IGenericThreadsQueue.hpp b/thirdParty/cupla/alpaka/include/alpaka/queue/cpu/IGenericThreadsQueue.hpp
new file mode 100644
index 0000000000..536e10fd6f
--- /dev/null
+++ b/thirdParty/cupla/alpaka/include/alpaka/queue/cpu/IGenericThreadsQueue.hpp
@@ -0,0 +1,44 @@
+/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Matthias Werner
+ *
+ * This file is part of alpaka.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#pragma once
+
+#include <alpaka/core/BoostPredef.hpp>
+
+namespace alpaka
+{
+    template<typename TDev>
+    class EventGenericThreads;
+
+#if BOOST_COMP_CLANG
+// avoid diagnostic warning: "has no out-of-line virtual method definitions; its vtable will be emitted in every
+// translation unit [-Werror,-Wweak-vtables]" https://stackoverflow.com/a/29288300
+#    pragma clang diagnostic push
+#    pragma clang diagnostic ignored "-Wweak-vtables"
+#endif
+
+    //#############################################################################
+    //! The generic threads queue interface, parameterized on the device type.
+    template<typename TDev>
+    class IGenericThreadsQueue
+    {
+    public:
+        //-----------------------------------------------------------------------------
+        //! Enqueues the given event.
+        virtual void enqueue(EventGenericThreads<TDev>&) = 0;
+        //-----------------------------------------------------------------------------
+        //! Waits for the given event.
+        virtual void wait(EventGenericThreads<TDev> const&) = 0;
+        //-----------------------------------------------------------------------------
+        virtual ~IGenericThreadsQueue() = default;
+    };
+#if BOOST_COMP_CLANG
+#    pragma clang diagnostic pop
+#endif
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/include/alpaka/queue/cuda_hip/QueueUniformCudaHipRtBase.hpp b/thirdParty/cupla/alpaka/include/alpaka/queue/cuda_hip/QueueUniformCudaHipRtBase.hpp
new file mode 100644
index 0000000000..e46442e04a
--- /dev/null
+++ b/thirdParty/cupla/alpaka/include/alpaka/queue/cuda_hip/QueueUniformCudaHipRtBase.hpp
@@ -0,0 +1,201 @@
+/* Copyright 2020 Benjamin Worpitz, Matthias Werner, René Widera
+ *
+ * This file is part of alpaka.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#pragma once
+
+
+#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
+
+#    include <alpaka/core/BoostPredef.hpp>
+
+#    if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && !BOOST_LANG_CUDA
+#        error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
+#    endif
+
+#    if defined(ALPAKA_ACC_GPU_HIP_ENABLED) && !BOOST_LANG_HIP
+#        error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
+#    endif
+
+#    include <alpaka/core/Concepts.hpp>
+#    include <alpaka/dev/DevUniformCudaHipRt.hpp>
+#    include <alpaka/queue/Traits.hpp>
+#    include <alpaka/wait/Traits.hpp>
+
+// Backend specific includes.
+#    if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
+#        include <alpaka/core/Cuda.hpp>
+#    else
+#        include <alpaka/core/Hip.hpp>
+#    endif
+
+#    include <memory>
+
+namespace alpaka
+{
+    namespace uniform_cuda_hip
+    {
+        namespace detail
+        {
+            //#############################################################################
+            //! The CUDA/HIP RT queue implementation which owns the native stream of the queue.
+            class QueueUniformCudaHipRtImpl final
+            {
+            public:
+                //-----------------------------------------------------------------------------
+                ALPAKA_FN_HOST QueueUniformCudaHipRtImpl(DevUniformCudaHipRt const& dev)
+                    : m_dev(dev)
+                    , m_UniformCudaHipQueue()
+                {
+                    ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+                    // Set the current device.
+                    ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(ALPAKA_API_PREFIX(SetDevice)(m_dev.m_iDevice));
+
+                    // Create the queue on the current device.
+                    // - [cuda/hip]StreamDefault: Default queue creation flag.
+                    // - [cuda/hip]StreamNonBlocking: Specifies that work running in the created queue may run
+                    //   concurrently with work in queue 0 (the NULL queue), and that the created queue should perform
+                    //   no implicit synchronization with queue 0.
+                    // NOTE: [cuda/hip]StreamNonBlocking is required to match the semantic implemented in the alpaka
+                    // CPU queue. It would be too much work to implement implicit default queue synchronization on CPU.
+
+                    ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(ALPAKA_API_PREFIX(
+                        StreamCreateWithFlags)(&m_UniformCudaHipQueue, ALPAKA_API_PREFIX(StreamNonBlocking)));
+                }
+                //-----------------------------------------------------------------------------
+                QueueUniformCudaHipRtImpl(QueueUniformCudaHipRtImpl const&) = delete;
+                //-----------------------------------------------------------------------------
+                QueueUniformCudaHipRtImpl(QueueUniformCudaHipRtImpl&&) = default;
+                //-----------------------------------------------------------------------------
+                auto operator=(QueueUniformCudaHipRtImpl const&) -> QueueUniformCudaHipRtImpl& = delete;
+                //-----------------------------------------------------------------------------
+                auto operator=(QueueUniformCudaHipRtImpl&&) -> QueueUniformCudaHipRtImpl& = delete;
+                //-----------------------------------------------------------------------------
+                ALPAKA_FN_HOST ~QueueUniformCudaHipRtImpl()
+                {
+                    ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+                    // NOTE(review): the defaulted move constructor presumably just copies the raw stream handle, so
+                    // a moved-from object would destroy the same stream a second time here. Confirm that instances
+                    // are never moved (the queue base only creates them via std::make_shared).
+
+                    // Set the current device. \TODO: Is setting the current device before [cuda/hip]StreamDestroy
+                    // required?
+                    ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(ALPAKA_API_PREFIX(SetDevice)(m_dev.m_iDevice));
+
+                    // In case the device is still doing work in the queue when [cuda/hip]StreamDestroy() is called,
+                    // the function will return immediately and the resources associated with queue will be released
+                    // automatically once the device has completed all work in queue.
+                    // -> No need to synchronize here.
+                    ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(ALPAKA_API_PREFIX(StreamDestroy)(m_UniformCudaHipQueue));
+                }
+
+            public:
+                DevUniformCudaHipRt const m_dev; //!< The device this queue is bound to.
+                ALPAKA_API_PREFIX(Stream_t) m_UniformCudaHipQueue; //!< The native stream handle of this queue.
+            };
+
+            //#############################################################################
+            //! The base class of the CUDA/HIP RT queues; copies share one queue implementation.
+            class QueueUniformCudaHipRtBase
+                : public concepts::Implements<ConceptCurrentThreadWaitFor, QueueUniformCudaHipRtBase>
+                , public concepts::Implements<ConceptQueue, QueueUniformCudaHipRtBase>
+                , public concepts::Implements<ConceptGetDev, QueueUniformCudaHipRtBase>
+            {
+            public:
+                //-----------------------------------------------------------------------------
+                ALPAKA_FN_HOST QueueUniformCudaHipRtBase(DevUniformCudaHipRt const& dev)
+                    : m_spQueueImpl(std::make_shared<QueueUniformCudaHipRtImpl>(dev))
+                {
+                }
+                //-----------------------------------------------------------------------------
+                QueueUniformCudaHipRtBase(QueueUniformCudaHipRtBase const&) = default;
+                //-----------------------------------------------------------------------------
+                QueueUniformCudaHipRtBase(QueueUniformCudaHipRtBase&&) = default;
+                //-----------------------------------------------------------------------------
+                auto operator=(QueueUniformCudaHipRtBase const&) -> QueueUniformCudaHipRtBase& = default;
+                //-----------------------------------------------------------------------------
+                auto operator=(QueueUniformCudaHipRtBase&&) -> QueueUniformCudaHipRtBase& = default;
+                //-----------------------------------------------------------------------------
+                ALPAKA_FN_HOST auto operator==(QueueUniformCudaHipRtBase const& rhs) const -> bool
+                {
+                    return (m_spQueueImpl == rhs.m_spQueueImpl);
+                }
+                //-----------------------------------------------------------------------------
+                ALPAKA_FN_HOST auto operator!=(QueueUniformCudaHipRtBase const& rhs) const -> bool
+                {
+                    return !((*this) == rhs);
+                }
+                //-----------------------------------------------------------------------------
+                ~QueueUniformCudaHipRtBase() = default;
+
+            public:
+                std::shared_ptr<QueueUniformCudaHipRtImpl> m_spQueueImpl; //!< The implementation shared by all copies.
+            };
+        } // namespace detail
+    } // namespace uniform_cuda_hip
+
+    namespace traits
+    {
+        //#############################################################################
+        //! The CUDA/HIP RT queue base device get trait specialization.
+        template<>
+        struct GetDev<uniform_cuda_hip::detail::QueueUniformCudaHipRtBase>
+        {
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto getDev(uniform_cuda_hip::detail::QueueUniformCudaHipRtBase const& queue)
+                -> DevUniformCudaHipRt
+            {
+                return queue.m_spQueueImpl->m_dev;
+            }
+        };
+
+        //#############################################################################
+        //! The CUDA/HIP RT queue base empty test trait specialization.
+        template<>
+        struct Empty<uniform_cuda_hip::detail::QueueUniformCudaHipRtBase>
+        {
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto empty(uniform_cuda_hip::detail::QueueUniformCudaHipRtBase const& queue) -> bool
+            {
+                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+                // Query is allowed even for queues on non current device.
+                ALPAKA_API_PREFIX(Error_t) ret = ALPAKA_API_PREFIX(Success);
+                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK_IGNORE(
+                    ret = ALPAKA_API_PREFIX(StreamQuery)(queue.m_spQueueImpl->m_UniformCudaHipQueue),
+                    ALPAKA_API_PREFIX(ErrorNotReady));
+                return (ret == ALPAKA_API_PREFIX(Success));
+            }
+        };
+
+        //#############################################################################
+        //! The CUDA/HIP RT queue base thread wait trait specialization.
+        //!
+        //! Blocks execution of the calling thread until the queue has finished processing all previously requested
+        //! tasks (kernels, data copies, ...)
+        template<>
+        struct CurrentThreadWaitFor<uniform_cuda_hip::detail::QueueUniformCudaHipRtBase>
+        {
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto currentThreadWaitFor(
+                uniform_cuda_hip::detail::QueueUniformCudaHipRtBase const& queue) -> void
+            {
+                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+                // Sync is allowed even for queues on non current device.
+                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
+                    ALPAKA_API_PREFIX(StreamSynchronize)(queue.m_spQueueImpl->m_UniformCudaHipQueue));
+            }
+        };
+    } // namespace traits
+} // namespace alpaka
+
+#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/rand/RandCuRand.hpp b/thirdParty/cupla/alpaka/include/alpaka/rand/RandCuRand.hpp
deleted file mode 100644
index fb7b82144b..0000000000
--- a/thirdParty/cupla/alpaka/include/alpaka/rand/RandCuRand.hpp
+++ /dev/null
@@ -1,298 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz, René Widera
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_CUDA
-    #error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
-#endif
-
-#include <alpaka/rand/Traits.hpp>
-
-#include <alpaka/dev/DevCudaRt.hpp>
-
-#include <alpaka/core/Cuda.hpp>
-
-#include <curand_kernel.h>
-
-#include <type_traits>
-
-namespace alpaka
-{
-    namespace rand
-    {
-        //#############################################################################
-        //! The CUDA rand implementation.
-        class RandCuRand : public concepts::Implements<ConceptRand, RandCuRand>
-        {
-        };
-
-        namespace generator
-        {
-            namespace cuda
-            {
-                //#############################################################################
-                //! The CUDA Xor random number generator.
-                class Xor
-                {
-                public:
-
-                    //-----------------------------------------------------------------------------
-                    // After calling this constructor the instance is not valid initialized and
-                    // need to be overwritten with a valid object
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST_ACC Xor() : m_State(curandStateXORWOW_t{})
-                    {
-                    }
-
-                    //-----------------------------------------------------------------------------
-                    __device__ Xor(
-                        std::uint32_t const & seed,
-                        std::uint32_t const & subsequence = 0,
-                        std::uint32_t const & offset = 0)
-                    {
-                        curand_init(
-                            seed,
-                            subsequence,
-                            offset,
-                            &m_State);
-                    }
-
-                public:
-                    curandStateXORWOW_t m_State;
-                };
-            }
-        }
-        namespace distribution
-        {
-            namespace cuda
-            {
-                //#############################################################################
-                //! The CUDA random number floating point normal distribution.
-                template<
-                    typename T>
-                class NormalReal;
-
-                //#############################################################################
-                //! The CUDA random number float normal distribution.
-                template<>
-                class NormalReal<
-                    float>
-                {
-                public:
-                    //-----------------------------------------------------------------------------
-                    NormalReal() = default;
-
-                    //-----------------------------------------------------------------------------
-                    template<
-                        typename TGenerator>
-                    __device__ auto operator()(
-                        TGenerator & generator)
-                    -> float
-                    {
-                        return curand_normal(&generator.m_State);
-                    }
-                };
-                //#############################################################################
-                //! The CUDA random number float normal distribution.
-                template<>
-                class NormalReal<
-                    double>
-                {
-                public:
-                    //-----------------------------------------------------------------------------
-                    NormalReal() = default;
-
-                    //-----------------------------------------------------------------------------
-                    template<
-                        typename TGenerator>
-                    __device__ auto operator()(
-                        TGenerator & generator)
-                    -> double
-                    {
-                        return curand_normal_double(&generator.m_State);
-                    }
-                };
-
-                //#############################################################################
-                //! The CUDA random number floating point uniform distribution.
-                template<
-                    typename T>
-                class UniformReal;
-
-                //#############################################################################
-                //! The CUDA random number float uniform distribution.
-                template<>
-                class UniformReal<
-                    float>
-                {
-                public:
-                    //-----------------------------------------------------------------------------
-                    UniformReal() = default;
-
-                    //-----------------------------------------------------------------------------
-                    template<
-                        typename TGenerator>
-                    __device__ auto operator()(
-                        TGenerator & generator)
-                    -> float
-                    {
-                        // (0.f, 1.0f]
-                        float const fUniformRand(curand_uniform(&generator.m_State));
-                        // NOTE: (1.0f - curand_uniform) does not work, because curand_uniform seems to return denormalized floats around 0.f.
-                        // [0.f, 1.0f)
-                        return fUniformRand * static_cast<float>( fUniformRand != 1.0f );
-                    }
-                };
-                //#############################################################################
-                //! The CUDA random number float uniform distribution.
-                template<>
-                class UniformReal<
-                    double>
-                {
-                public:
-                    //-----------------------------------------------------------------------------
-                    UniformReal() = default;
-
-                    //-----------------------------------------------------------------------------
-                    template<
-                        typename TGenerator>
-                    __device__ auto operator()(
-                        TGenerator & generator)
-                    -> double
-                    {
-                        // (0.f, 1.0f]
-                        double const fUniformRand(curand_uniform_double(&generator.m_State));
-                        // NOTE: (1.0f - curand_uniform_double) does not work, because curand_uniform_double seems to return denormalized floats around 0.f.
-                        // [0.f, 1.0f)
-                        return fUniformRand * static_cast<double>( fUniformRand != 1.0 );
-                    }
-                };
-
-                //#############################################################################
-                //! The CUDA random number integer uniform distribution.
-                template<
-                    typename T>
-                class UniformUint;
-
-                //#############################################################################
-                //! The CUDA random number unsigned integer uniform distribution.
-                template<>
-                class UniformUint<
-                    unsigned int>
-                {
-                public:
-                    //-----------------------------------------------------------------------------
-                    UniformUint() = default;
-
-                    //-----------------------------------------------------------------------------
-                    template<
-                        typename TGenerator>
-                    __device__ auto operator()(
-                        TGenerator & generator)
-                    -> unsigned int
-                    {
-                        return curand(&generator.m_State);
-                    }
-                };
-            }
-        }
-
-        namespace distribution
-        {
-            namespace traits
-            {
-                //#############################################################################
-                //! The CUDA random number float normal distribution get trait specialization.
-                template<
-                    typename T>
-                struct CreateNormalReal<
-                    RandCuRand,
-                    T,
-                    typename std::enable_if<
-                        std::is_floating_point<T>::value>::type>
-                {
-                    //-----------------------------------------------------------------------------
-                    __device__ static auto createNormalReal(
-                        RandCuRand const & /*rand*/)
-                    -> rand::distribution::cuda::NormalReal<T>
-                    {
-                        return rand::distribution::cuda::NormalReal<T>();
-                    }
-                };
-                //#############################################################################
-                //! The CUDA random number float uniform distribution get trait specialization.
-                template<
-                    typename T>
-                struct CreateUniformReal<
-                    RandCuRand,
-                    T,
-                    typename std::enable_if<
-                        std::is_floating_point<T>::value>::type>
-                {
-                    //-----------------------------------------------------------------------------
-                    __device__ static auto createUniformReal(
-                        RandCuRand const & /*rand*/)
-                    -> rand::distribution::cuda::UniformReal<T>
-                    {
-                        return rand::distribution::cuda::UniformReal<T>();
-                    }
-                };
-                //#############################################################################
-                //! The CUDA random number integer uniform distribution get trait specialization.
-                template<
-                    typename T>
-                struct CreateUniformUint<
-                    RandCuRand,
-                    T,
-                    typename std::enable_if<
-                        std::is_integral<T>::value>::type>
-                {
-                    //-----------------------------------------------------------------------------
-                    __device__ static auto createUniformUint(
-                        RandCuRand const & /*rand*/)
-                    -> rand::distribution::cuda::UniformUint<T>
-                    {
-                        return rand::distribution::cuda::UniformUint<T>();
-                    }
-                };
-            }
-        }
-        namespace generator
-        {
-            namespace traits
-            {
-                //#############################################################################
-                //! The CUDA random number default generator get trait specialization.
-                template<>
-                struct CreateDefault<
-                    RandCuRand>
-                {
-                    //-----------------------------------------------------------------------------
-                    __device__ static auto createDefault(
-                        RandCuRand const & /*rand*/,
-                        std::uint32_t const & seed,
-                        std::uint32_t const & subsequence)
-                    -> rand::generator::cuda::Xor
-                    {
-                        return rand::generator::cuda::Xor(
-                            seed,
-                            subsequence);
-                    }
-                };
-            }
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/rand/RandHipRand.hpp b/thirdParty/cupla/alpaka/include/alpaka/rand/RandHipRand.hpp
deleted file mode 100644
index d4fcbb87ec..0000000000
--- a/thirdParty/cupla/alpaka/include/alpaka/rand/RandHipRand.hpp
+++ /dev/null
@@ -1,318 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz, Matthias Werner, René Widera
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_HIP
-    #error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
-#endif
-
-#include <alpaka/rand/Traits.hpp>
-
-#include <alpaka/dev/DevHipRt.hpp>
-
-#include <alpaka/core/Hip.hpp>
-
-#pragma clang diagnostic push
-#pragma clang diagnostic ignored "-Wduplicate-decl-specifier"
-
-#include <hiprand_kernel.h>
-
-#pragma clang diagnostic pop
-
-#include <type_traits>
-
-namespace alpaka
-{
-    namespace rand
-    {
-        //#############################################################################
-        //! The HIP rand implementation.
-        class RandHipRand : public concepts::Implements<ConceptRand, RandHipRand>
-        {
-        };
-
-        namespace generator
-        {
-            namespace hip
-            {
-                //#############################################################################
-                //! The HIP Xor random number generator.
-                class Xor
-                {
-                public:
-
-                    //-----------------------------------------------------------------------------
-                    // After calling this constructor the instance is not valid initialized and
-                    // need to be overwritten with a valid object
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST_ACC Xor() : m_State(hiprandStateXORWOW_t{})
-                    {
-                    }
-
-                    //-----------------------------------------------------------------------------
-                    //! Constructor.
-                    __device__ Xor(
-                        std::uint32_t const & seed,
-                        std::uint32_t const & subsequence = 0,
-                        std::uint32_t const & offset = 0)
-                    {
-                        hiprand_init(
-                            seed,
-                            subsequence,
-                            offset,
-                            &m_State);
-                    }
-
-                public:
-                    hiprandStateXORWOW_t m_State;
-                };
-            }
-        }
-        namespace distribution
-        {
-            namespace hip
-            {
-                //#############################################################################
-                //! The HIP random number floating point normal distribution.
-                template<
-                    typename T>
-                class NormalReal;
-
-                //#############################################################################
-                //! The HIP random number float normal distribution.
-                template<>
-                class NormalReal<
-                    float>
-                {
-                public:
-                    //-----------------------------------------------------------------------------
-                    //! Constructor.
-                    NormalReal() = default;
-
-                    //-----------------------------------------------------------------------------
-                    //! Call operator.
-                    template<
-                        typename TGenerator>
-                    __device__ auto operator()(
-                        TGenerator & generator)
-                    -> float
-                    {
-                        return hiprand_normal(&generator.m_State);
-                    }
-                };
-                //#############################################################################
-                //! The HIP random number float normal distribution.
-                template<>
-                class NormalReal<
-                    double>
-                {
-                public:
-                    //-----------------------------------------------------------------------------
-                    //! Constructor.
-                    NormalReal() = default;
-
-                    //-----------------------------------------------------------------------------
-                    //! Call operator.
-                    template<
-                        typename TGenerator>
-                    __device__ auto operator()(
-                        TGenerator & generator)
-                    -> double
-                    {
-                        return hiprand_normal_double(&generator.m_State);
-                    }
-                };
-
-                //#############################################################################
-                //! The HIP random number floating point uniform distribution.
-                template<
-                    typename T>
-                class UniformReal;
-
-                //#############################################################################
-                //! The HIP random number float uniform distribution.
-                template<>
-                class UniformReal<
-                    float>
-                {
-                public:
-                    //-----------------------------------------------------------------------------
-                    //! Constructor.
-                    UniformReal() = default;
-
-                    //-----------------------------------------------------------------------------
-                    //! Call operator.
-                    template<
-                        typename TGenerator>
-                    __device__ auto operator()(
-                        TGenerator & generator)
-                    -> float
-                    {
-                        // (0.f, 1.0f]
-                        float const fUniformRand(hiprand_uniform(&generator.m_State));
-                        // NOTE: (1.0f - hiprand_uniform) does not work, because hiprand_uniform seems to return denormalized floats around 0.f.
-                        // [0.f, 1.0f)
-                        return fUniformRand * static_cast<float>( fUniformRand != 1.0f );
-                    }
-                };
-                //#############################################################################
-                //! The HIP random number float uniform distribution.
-                template<>
-                class UniformReal<
-                    double>
-                {
-                public:
-                    //-----------------------------------------------------------------------------
-                    //! Constructor.
-                    UniformReal() = default;
-
-                    //-----------------------------------------------------------------------------
-                    //! Call operator.
-                    template<
-                        typename TGenerator>
-                    __device__ auto operator()(
-                        TGenerator & generator)
-                    -> double
-                    {
-                        // (0.f, 1.0f]
-                        double const fUniformRand(hiprand_uniform_double(&generator.m_State));
-                        // NOTE: (1.0f - hiprand_uniform_double) does not work, because hiprand_uniform_double seems to return denormalized floats around 0.f.
-                        // [0.f, 1.0f)
-                        return fUniformRand * static_cast<double>( fUniformRand != 1.0f );
-                    }
-                };
-
-                //#############################################################################
-                //! The HIP random number integer uniform distribution.
-                template<
-                    typename T>
-                class UniformUint;
-
-                //#############################################################################
-                //! The HIP random number unsigned integer uniform distribution.
-                template<>
-                class UniformUint<
-                    unsigned int>
-                {
-                public:
-                    //-----------------------------------------------------------------------------
-                    //! Constructor.
-                    UniformUint() = default;
-
-                    //-----------------------------------------------------------------------------
-                    //! Call operator.
-                    template<
-                        typename TGenerator>
-                    __device__ auto operator()(
-                        TGenerator & generator)
-                    -> unsigned int
-                    {
-                        return hiprand(&generator.m_State);
-                    }
-                };
-            }
-        }
-
-        namespace distribution
-        {
-            namespace traits
-            {
-                //#############################################################################
-                //! The HIP random number float normal distribution get trait specialization.
-                template<
-                    typename T>
-                struct CreateNormalReal<
-                    RandHipRand,
-                    T,
-                    typename std::enable_if<
-                        std::is_floating_point<T>::value>::type>
-                {
-                    //-----------------------------------------------------------------------------
-
-                    ALPAKA_FN_HOST_ACC static auto createNormalReal(
-                        RandHipRand const & /*rand*/)
-                    -> rand::distribution::hip::NormalReal<T>
-                    {
-                        return rand::distribution::hip::NormalReal<T>();
-                    }
-                };
-                //#############################################################################
-                //! The HIP random number float uniform distribution get trait specialization.
-                template<
-                    typename T>
-                struct CreateUniformReal<
-                    RandHipRand,
-                    T,
-                    typename std::enable_if<
-                        std::is_floating_point<T>::value>::type>
-                {
-                    //-----------------------------------------------------------------------------
-
-                    ALPAKA_FN_HOST_ACC static auto createUniformReal(
-                        RandHipRand const & /*rand*/)
-                    -> rand::distribution::hip::UniformReal<T>
-                    {
-                        return rand::distribution::hip::UniformReal<T>();
-                    }
-                };
-                //#############################################################################
-                //! The HIP random number integer uniform distribution get trait specialization.
-                template<
-                    typename T>
-                struct CreateUniformUint<
-                    RandHipRand,
-                    T,
-                    typename std::enable_if<
-                        std::is_integral<T>::value>::type>
-                {
-                    //-----------------------------------------------------------------------------
-
-                    ALPAKA_FN_HOST_ACC static auto createUniformUint(
-                        RandHipRand const & /*rand*/)
-                    -> rand::distribution::hip::UniformUint<T>
-                    {
-                        return rand::distribution::hip::UniformUint<T>();
-                    }
-                };
-            }
-        }
-        namespace generator
-        {
-            namespace traits
-            {
-                //#############################################################################
-                //! The HIP random number default generator get trait specialization.
-                template<>
-                struct CreateDefault<
-                    RandHipRand>
-                {
-                    //-----------------------------------------------------------------------------
-
-                    __device__ static auto createDefault(
-                        RandHipRand const & /*rand*/,
-                        std::uint32_t const & seed,
-                        std::uint32_t const & subsequence)
-                    -> rand::generator::hip::Xor
-                    {
-                        return rand::generator::hip::Xor(
-                            seed,
-                            subsequence);
-                    }
-                };
-            }
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/rand/RandStdLib.hpp b/thirdParty/cupla/alpaka/include/alpaka/rand/RandStdLib.hpp
index c93cc352fd..47e3560764 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/rand/RandStdLib.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/rand/RandStdLib.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Axel Huebl, Benjamin Worpitz, René Widera
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -9,11 +9,10 @@
 
 #pragma once
 
-#include <alpaka/rand/Traits.hpp>
-#include <alpaka/rand/TinyMT/Engine.hpp>
-
 #include <alpaka/core/Common.hpp>
 #include <alpaka/core/Unused.hpp>
+#include <alpaka/rand/TinyMT/Engine.hpp>
+#include <alpaka/rand/Traits.hpp>
 
 #include <cstdint>
 #include <random>
@@ -53,16 +52,15 @@ namespace alpaka
                 class MersenneTwister
                 {
                 public:
-
                     //-----------------------------------------------------------------------------
                     MersenneTwister() = default;
 
                     //-----------------------------------------------------------------------------
                     ALPAKA_FN_HOST MersenneTwister(
-                        std::uint32_t const & seed,
-                        std::uint32_t const & subsequence = 0,
-                        std::uint32_t const & offset = 0) :
-                        // NOTE: XOR the seed and the subsequence to generate a unique seed.
+                        std::uint32_t const& seed,
+                        std::uint32_t const& subsequence = 0,
+                        std::uint32_t const& offset = 0)
+                        : // NOTE: XOR the seed and the subsequence to generate a unique seed.
                         m_State((seed ^ subsequence) + offset)
                     {
                     }
@@ -90,10 +88,10 @@ namespace alpaka
 
                     //-----------------------------------------------------------------------------
                     ALPAKA_FN_HOST TinyMersenneTwister(
-                        std::uint32_t const & seed,
-                        std::uint32_t const & subsequence = 0,
-                        std::uint32_t const & offset = 0) :
-                        // NOTE: XOR the seed and the subsequence to generate a unique seed.
+                        std::uint32_t const& seed,
+                        std::uint32_t const& subsequence = 0,
+                        std::uint32_t const& offset = 0)
+                        : // NOTE: XOR the seed and the subsequence to generate a unique seed.
                         m_State((seed ^ subsequence) + offset)
                     {
                     }
@@ -114,25 +112,24 @@ namespace alpaka
                 public:
                     //-----------------------------------------------------------------------------
                     RandomDevice() = default;
-                    RandomDevice(RandomDevice&&) :
-                        m_State{}
+                    RandomDevice(RandomDevice&&) : m_State{}
                     {
                     }
 
                     //-----------------------------------------------------------------------------
                     ALPAKA_FN_HOST RandomDevice(
-                        std::uint32_t const &,
-                        std::uint32_t const & = 0,
-                        std::uint32_t const & = 0) :
-                        m_State{}
+                        std::uint32_t const&,
+                        std::uint32_t const& = 0,
+                        std::uint32_t const& = 0)
+                        : m_State{}
                     {
                     }
 
                 public:
                     std::random_device m_State;
                 };
-            }
-        }
+            } // namespace cpu
+        } // namespace generator
 
         namespace distribution
         {
@@ -140,8 +137,7 @@ namespace alpaka
             {
                 //#############################################################################
                 //! The CPU random number normal distribution.
-                template<
-                    typename T>
+                template<typename T>
                 class NormalReal
                 {
                 public:
@@ -149,11 +145,8 @@ namespace alpaka
                     NormalReal() = default;
 
                     //-----------------------------------------------------------------------------
-                    template<
-                        typename TGenerator>
-                    ALPAKA_FN_HOST auto operator()(
-                        TGenerator & generator)
-                    -> T
+                    template<typename TGenerator>
+                    ALPAKA_FN_HOST auto operator()(TGenerator& generator) -> T
                     {
                         return m_dist(generator.m_State);
                     }
@@ -162,8 +155,7 @@ namespace alpaka
 
                 //#############################################################################
                 //! The CPU random number uniform distribution.
-                template<
-                    typename T>
+                template<typename T>
                 class UniformReal
                 {
                 public:
@@ -171,11 +163,8 @@ namespace alpaka
                     UniformReal() = default;
 
                     //-----------------------------------------------------------------------------
-                    template<
-                        typename TGenerator>
-                    ALPAKA_FN_HOST auto operator()(
-                        TGenerator & generator)
-                    -> T
+                    template<typename TGenerator>
+                    ALPAKA_FN_HOST auto operator()(TGenerator& generator) -> T
                     {
                         return m_dist(generator.m_State);
                     }
@@ -184,31 +173,28 @@ namespace alpaka
 
                 //#############################################################################
                 //! The CPU random number normal distribution.
-                template<
-                    typename T>
+                template<typename T>
                 class UniformUint
                 {
                 public:
                     //-----------------------------------------------------------------------------
-                    UniformUint() :
-                        m_dist(
-                            0,  // For signed integer: std::numeric_limits<T>::lowest()
+                    UniformUint()
+                        : m_dist(
+                            0, // For signed integer: std::numeric_limits<T>::lowest()
                             std::numeric_limits<T>::max())
-                    {}
+                    {
+                    }
 
                     //-----------------------------------------------------------------------------
-                    template<
-                        typename TGenerator>
-                    ALPAKA_FN_HOST auto operator()(
-                        TGenerator & generator)
-                    -> T
+                    template<typename TGenerator>
+                    ALPAKA_FN_HOST auto operator()(TGenerator& generator) -> T
                     {
                         return m_dist(generator.m_State);
                     }
                     std::uniform_int_distribution<T> m_dist;
                 };
-            }
-        }
+            } // namespace cpu
+        } // namespace distribution
 
         namespace distribution
         {
@@ -216,18 +202,12 @@ namespace alpaka
             {
                 //#############################################################################
                 //! The CPU device random number float normal distribution get trait specialization.
-                template<
-                    typename T>
-                struct CreateNormalReal<
-                    RandStdLib,
-                    T,
-                    typename std::enable_if<
-                        std::is_floating_point<T>::value>::type>
+                template<typename T>
+                struct CreateNormalReal<RandStdLib, T, std::enable_if_t<std::is_floating_point<T>::value>>
                 {
                     //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST static auto createNormalReal(
-                        RandStdLib const & rand)
-                    -> rand::distribution::cpu::NormalReal<T>
+                    ALPAKA_FN_HOST static auto createNormalReal(RandStdLib const& rand)
+                        -> rand::distribution::cpu::NormalReal<T>
                     {
                         alpaka::ignore_unused(rand);
                         return rand::distribution::cpu::NormalReal<T>();
@@ -235,18 +215,12 @@ namespace alpaka
                 };
                 //#############################################################################
                 //! The CPU device random number float uniform distribution get trait specialization.
-                template<
-                    typename T>
-                struct CreateUniformReal<
-                    RandStdLib,
-                    T,
-                    typename std::enable_if<
-                        std::is_floating_point<T>::value>::type>
+                template<typename T>
+                struct CreateUniformReal<RandStdLib, T, std::enable_if_t<std::is_floating_point<T>::value>>
                 {
                     //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST static auto createUniformReal(
-                        RandStdLib const & rand)
-                    -> rand::distribution::cpu::UniformReal<T>
+                    ALPAKA_FN_HOST static auto createUniformReal(RandStdLib const& rand)
+                        -> rand::distribution::cpu::UniformReal<T>
                     {
                         alpaka::ignore_unused(rand);
                         return rand::distribution::cpu::UniformReal<T>();
@@ -254,25 +228,19 @@ namespace alpaka
                 };
                 //#############################################################################
                 //! The CPU device random number integer uniform distribution get trait specialization.
-                template<
-                    typename T>
-                struct CreateUniformUint<
-                    RandStdLib,
-                    T,
-                    typename std::enable_if<
-                        std::is_integral<T>::value>::type>
+                template<typename T>
+                struct CreateUniformUint<RandStdLib, T, std::enable_if_t<std::is_integral<T>::value>>
                 {
                     //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST static auto createUniformUint(
-                        RandStdLib const & rand)
-                    -> rand::distribution::cpu::UniformUint<T>
+                    ALPAKA_FN_HOST static auto createUniformUint(RandStdLib const& rand)
+                        -> rand::distribution::cpu::UniformUint<T>
                     {
                         alpaka::ignore_unused(rand);
                         return rand::distribution::cpu::UniformUint<T>();
                     }
                 };
-            }
-        }
+            } // namespace traits
+        } // namespace distribution
         namespace generator
         {
             namespace traits
@@ -280,59 +248,47 @@ namespace alpaka
                 //#############################################################################
                 //! The CPU device random number default generator get trait specialization.
                 template<>
-                struct CreateDefault<
-                    TinyMersenneTwister>
+                struct CreateDefault<TinyMersenneTwister>
                 {
                     //-----------------------------------------------------------------------------
                     ALPAKA_FN_HOST static auto createDefault(
-                        TinyMersenneTwister const & rand,
-                        std::uint32_t const & seed,
-                        std::uint32_t const & subsequence)
-                    -> rand::generator::cpu::TinyMersenneTwister
+                        TinyMersenneTwister const& rand,
+                        std::uint32_t const& seed,
+                        std::uint32_t const& subsequence) -> rand::generator::cpu::TinyMersenneTwister
                     {
                         alpaka::ignore_unused(rand);
-                        return rand::generator::cpu::TinyMersenneTwister(
-                            seed,
-                            subsequence);
+                        return rand::generator::cpu::TinyMersenneTwister(seed, subsequence);
                     }
                 };
 
                 template<>
-                struct CreateDefault<
-                    MersenneTwister>
+                struct CreateDefault<MersenneTwister>
                 {
                     //-----------------------------------------------------------------------------
                     ALPAKA_FN_HOST static auto createDefault(
-                        MersenneTwister const & rand,
-                        std::uint32_t const & seed,
-                        std::uint32_t const & subsequence)
-                    -> rand::generator::cpu::MersenneTwister
+                        MersenneTwister const& rand,
+                        std::uint32_t const& seed,
+                        std::uint32_t const& subsequence) -> rand::generator::cpu::MersenneTwister
                     {
                         alpaka::ignore_unused(rand);
-                        return rand::generator::cpu::MersenneTwister(
-                            seed,
-                            subsequence);
+                        return rand::generator::cpu::MersenneTwister(seed, subsequence);
                     }
                 };
 
                 template<>
-                struct CreateDefault<
-                    RandomDevice>
+                struct CreateDefault<RandomDevice>
                 {
                     //-----------------------------------------------------------------------------
                     ALPAKA_FN_HOST static auto createDefault(
-                        RandomDevice const & rand,
-                        std::uint32_t const & seed,
-                        std::uint32_t const & subsequence)
-                    -> rand::generator::cpu::RandomDevice
+                        RandomDevice const& rand,
+                        std::uint32_t const& seed,
+                        std::uint32_t const& subsequence) -> rand::generator::cpu::RandomDevice
                     {
                         alpaka::ignore_unused(rand);
-                        return rand::generator::cpu::RandomDevice(
-                            seed,
-                            subsequence);
+                        return rand::generator::cpu::RandomDevice(seed, subsequence);
                     }
                 };
-            }
-        }
-    }
-}
+            } // namespace traits
+        } // namespace generator
+    } // namespace rand
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/include/alpaka/rand/RandUniformCudaHipRand.hpp b/thirdParty/cupla/alpaka/include/alpaka/rand/RandUniformCudaHipRand.hpp
new file mode 100644
index 0000000000..1574b9ec65
--- /dev/null
+++ b/thirdParty/cupla/alpaka/include/alpaka/rand/RandUniformCudaHipRand.hpp
@@ -0,0 +1,302 @@
+/* Copyright 2019 Benjamin Worpitz, René Widera
+ *
+ * This file is part of alpaka.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#pragma once
+
+#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
+
+#    include <alpaka/core/BoostPredef.hpp>
+
+#    if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && !BOOST_LANG_CUDA
+#        error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
+#    endif
+
+#    if defined(ALPAKA_ACC_GPU_HIP_ENABLED) && !BOOST_LANG_HIP
+#        error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
+#    endif
+
+#    include <alpaka/dev/DevUniformCudaHipRt.hpp>
+#    include <alpaka/rand/Traits.hpp>
+
+// Backend specific imports.
+#    if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
+#        include <alpaka/core/Cuda.hpp>
+
+#        include <curand_kernel.h>
+
+#    elif defined(ALPAKA_ACC_GPU_HIP_ENABLED)
+#        include <alpaka/core/Hip.hpp>
+
+#        pragma clang diagnostic push
+#        pragma clang diagnostic ignored "-Wduplicate-decl-specifier"
+
+#        include <hiprand_kernel.h>
+
+#        pragma clang diagnostic pop
+#    endif
+
+
+#    include <type_traits>
+
+namespace alpaka
+{
+    namespace rand
+    {
+        //#############################################################################
+        //! The CUDA/HIP rand implementation.
+        class RandUniformCudaHipRand : public concepts::Implements<ConceptRand, RandUniformCudaHipRand>
+        {
+        };
+
+        namespace generator
+        {
+            namespace uniform_cuda_hip
+            {
+                //#############################################################################
+                //! The CUDA/HIP Xor random number generator.
+                class Xor
+                {
+                public:
+                    //-----------------------------------------------------------------------------
+                    // After calling this constructor the instance is not validly initialized and
+                    // needs to be overwritten with a valid object.
+                    //-----------------------------------------------------------------------------
+#    if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
+                    ALPAKA_FN_HOST_ACC Xor() : m_State(curandStateXORWOW_t{})
+#    else
+                    ALPAKA_FN_HOST_ACC Xor() : m_State(hiprandStateXORWOW_t{})
+#    endif
+                    {
+                    }
+
+#    if BOOST_COMP_HIP
+                    //-----------------------------------------------------------------------------
+                    ALPAKA_FN_HOST_ACC ~Xor() = default;
+#    endif
+
+                    //-----------------------------------------------------------------------------
+                    __device__ Xor(
+                        std::uint32_t const& seed,
+                        std::uint32_t const& subsequence = 0,
+                        std::uint32_t const& offset = 0)
+                    {
+#    ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
+                        curand_init(seed, subsequence, offset, &m_State);
+#    else
+                        hiprand_init(seed, subsequence, offset, &m_State);
+#    endif
+                    }
+
+                public:
+#    ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
+                    curandStateXORWOW_t m_State;
+#    else
+                    hiprandStateXORWOW_t m_State;
+#    endif
+                };
+            } // namespace uniform_cuda_hip
+        } // namespace generator
+        namespace distribution
+        {
+            namespace uniform_cuda_hip
+            {
+                //#############################################################################
+                //! The CUDA/HIP random number floating point normal distribution.
+                template<typename T>
+                class NormalReal;
+
+                //#############################################################################
+                //! The CUDA/HIP random number float normal distribution.
+                template<>
+                class NormalReal<float>
+                {
+                public:
+                    //-----------------------------------------------------------------------------
+                    NormalReal() = default;
+
+                    //-----------------------------------------------------------------------------
+                    template<typename TGenerator>
+                    __device__ auto operator()(TGenerator& generator) -> float
+                    {
+#    ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
+                        return curand_normal(&generator.m_State);
+#    else
+                        return hiprand_normal(&generator.m_State);
+#    endif
+                    }
+                };
+                //#############################################################################
+                //! The CUDA/HIP random number double normal distribution.
+                template<>
+                class NormalReal<double>
+                {
+                public:
+                    //-----------------------------------------------------------------------------
+                    NormalReal() = default;
+
+                    //-----------------------------------------------------------------------------
+                    template<typename TGenerator>
+                    __device__ auto operator()(TGenerator& generator) -> double
+                    {
+#    ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
+                        return curand_normal_double(&generator.m_State);
+#    else
+                        return hiprand_normal_double(&generator.m_State);
+#    endif
+                    }
+                };
+
+                //#############################################################################
+                //! The CUDA/HIP random number floating point uniform distribution.
+                template<typename T>
+                class UniformReal;
+
+                //#############################################################################
+                //! The CUDA/HIP random number float uniform distribution.
+                template<>
+                class UniformReal<float>
+                {
+                public:
+                    //-----------------------------------------------------------------------------
+                    UniformReal() = default;
+
+                    //-----------------------------------------------------------------------------
+                    template<typename TGenerator>
+                    __device__ auto operator()(TGenerator& generator) -> float
+                    {
+                        // (0.f, 1.0f]
+#    ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
+                        float const fUniformRand(curand_uniform(&generator.m_State));
+#    else
+                        float const fUniformRand(hiprand_uniform(&generator.m_State));
+#    endif
+                        // NOTE: (1.0f - curand_uniform) does not work, because curand_uniform seems to return
+                        // denormalized floats around 0.f. Mapping 1.0f to 0.f instead yields [0.f, 1.0f).
+                        return fUniformRand * static_cast<float>(fUniformRand != 1.0f);
+                    }
+                };
+                //#############################################################################
+                //! The CUDA/HIP random number double uniform distribution.
+                template<>
+                class UniformReal<double>
+                {
+                public:
+                    //-----------------------------------------------------------------------------
+                    UniformReal() = default;
+
+                    //-----------------------------------------------------------------------------
+                    template<typename TGenerator>
+                    __device__ auto operator()(TGenerator& generator) -> double
+                    {
+                        // (0.0, 1.0]
+#    ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
+                        double const fUniformRand(curand_uniform_double(&generator.m_State));
+#    else
+                        double const fUniformRand(hiprand_uniform_double(&generator.m_State));
+#    endif
+                        // NOTE: (1.0 - curand_uniform_double) does not work, because curand_uniform_double seems to
+                        // return denormalized doubles around 0.0. Mapping 1.0 to 0.0 instead yields [0.0, 1.0).
+                        return fUniformRand * static_cast<double>(fUniformRand != 1.0);
+                    }
+                };
+
+                //#############################################################################
+                //! The CUDA/HIP random number integer uniform distribution.
+                template<typename T>
+                class UniformUint;
+
+                //#############################################################################
+                //! The CUDA/HIP random number unsigned integer uniform distribution.
+                template<>
+                class UniformUint<unsigned int>
+                {
+                public:
+                    //-----------------------------------------------------------------------------
+                    UniformUint() = default;
+
+                    //-----------------------------------------------------------------------------
+                    template<typename TGenerator>
+                    __device__ auto operator()(TGenerator& generator) -> unsigned int
+                    {
+#    ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
+                        return curand(&generator.m_State);
+#    else
+                        return hiprand(&generator.m_State);
+#    endif
+                    }
+                };
+            } // namespace uniform_cuda_hip
+        } // namespace distribution
+
+        namespace distribution
+        {
+            namespace traits
+            {
+                //#############################################################################
+                //! The CUDA/HIP random number float normal distribution get trait specialization.
+                template<typename T>
+                struct CreateNormalReal<RandUniformCudaHipRand, T, std::enable_if_t<std::is_floating_point<T>::value>>
+                {
+                    //-----------------------------------------------------------------------------
+                    __device__ static auto createNormalReal(RandUniformCudaHipRand const& /*rand*/)
+                        -> rand::distribution::uniform_cuda_hip::NormalReal<T>
+                    {
+                        return rand::distribution::uniform_cuda_hip::NormalReal<T>();
+                    }
+                };
+                //#############################################################################
+                //! The CUDA/HIP random number float uniform distribution get trait specialization.
+                template<typename T>
+                struct CreateUniformReal<RandUniformCudaHipRand, T, std::enable_if_t<std::is_floating_point<T>::value>>
+                {
+                    //-----------------------------------------------------------------------------
+                    __device__ static auto createUniformReal(RandUniformCudaHipRand const& /*rand*/)
+                        -> rand::distribution::uniform_cuda_hip::UniformReal<T>
+                    {
+                        return rand::distribution::uniform_cuda_hip::UniformReal<T>();
+                    }
+                };
+                //#############################################################################
+                //! The CUDA/HIP random number integer uniform distribution get trait specialization.
+                template<typename T>
+                struct CreateUniformUint<RandUniformCudaHipRand, T, std::enable_if_t<std::is_integral<T>::value>>
+                {
+                    //-----------------------------------------------------------------------------
+                    __device__ static auto createUniformUint(RandUniformCudaHipRand const& /*rand*/)
+                        -> rand::distribution::uniform_cuda_hip::UniformUint<T>
+                    {
+                        return rand::distribution::uniform_cuda_hip::UniformUint<T>();
+                    }
+                };
+            } // namespace traits
+        } // namespace distribution
+        namespace generator
+        {
+            namespace traits
+            {
+                //#############################################################################
+                //! The CUDA/HIP random number default generator get trait specialization.
+                template<>
+                struct CreateDefault<RandUniformCudaHipRand>
+                {
+                    //-----------------------------------------------------------------------------
+                    __device__ static auto createDefault(
+                        RandUniformCudaHipRand const& /*rand*/,
+                        std::uint32_t const& seed,
+                        std::uint32_t const& subsequence) -> rand::generator::uniform_cuda_hip::Xor
+                    {
+                        return rand::generator::uniform_cuda_hip::Xor(seed, subsequence);
+                    }
+                };
+            } // namespace traits
+        } // namespace generator
+    } // namespace rand
+} // namespace alpaka
+
+#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/rand/TinyMT/Engine.hpp b/thirdParty/cupla/alpaka/include/alpaka/rand/TinyMT/Engine.hpp
index fb2582992d..9d09d25557 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/rand/TinyMT/Engine.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/rand/TinyMT/Engine.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Axel Huebl, Benjamin Worpitz
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -16,68 +16,68 @@
 
 namespace alpaka
 {
-namespace rand
-{
-namespace generator
-{
-namespace cpu
-{
-    //! Implementation of std::UniformRandomBitGenerator for TinyMT32
-    struct TinyMTengine
+    namespace rand
     {
-        using result_type = std::uint32_t;
-
-        static constexpr result_type default_seed()
-        {
-            return 42u;
-        }
-
-        void seed( result_type value = default_seed() )
-        {
-            // parameters from TinyMT/jump/sample.c
-            prng.mat1 = 0x8f7011ee;
-            prng.mat2 = 0xfc78ff1f;
-            prng.tmat = 0x3793fdff;
-
-            tinymt32_init( &prng, value );
-        }
-
-        TinyMTengine( std::uint32_t const & seedValue )
-        {
-            seed( seedValue );
-        }
-
-        TinyMTengine()
+        namespace generator
         {
-            std::uint32_t const magicSeed = 42u;
-            seed( magicSeed );
-        }
-
-        result_type operator()()
-        {
-            return tinymt32_generate_uint32( &prng );
-        }
-
-        static constexpr result_type min()
-        {
-            return 0u;
-        }
-
-        static constexpr result_type max()
-        {
-            return UINT32_MAX;
-        }
-
-        void discard( unsigned long long ) // z
-        {
-            // not implemented
-            // tinymt32_jump( &prng, z, z );
-        }
-
-        tinymt32_t prng;
-    };
-
-} // namespace cpu
-} // namespace generator
-} // namespace rand
+            namespace cpu
+            {
+                //! Implementation of std::UniformRandomBitGenerator for TinyMT32
+                struct TinyMTengine
+                {
+                    using result_type = std::uint32_t;
+
+                    static constexpr result_type default_seed()
+                    {
+                        return 42u;
+                    }
+
+                    void seed(result_type value = default_seed())
+                    {
+                        // parameters from TinyMT/jump/sample.c
+                        prng.mat1 = 0x8f7011ee;
+                        prng.mat2 = 0xfc78ff1f;
+                        prng.tmat = 0x3793fdff;
+
+                        tinymt32_init(&prng, value);
+                    }
+
+                    TinyMTengine(std::uint32_t const& seedValue)
+                    {
+                        seed(seedValue);
+                    }
+
+                    TinyMTengine()
+                    {
+                        std::uint32_t const magicSeed = 42u;
+                        seed(magicSeed);
+                    }
+
+                    result_type operator()()
+                    {
+                        return tinymt32_generate_uint32(&prng);
+                    }
+
+                    static constexpr result_type min()
+                    {
+                        return 0u;
+                    }
+
+                    static constexpr result_type max()
+                    {
+                        return UINT32_MAX;
+                    }
+
+                    void discard(unsigned long long) // z
+                    {
+                        // not implemented
+                        // tinymt32_jump( &prng, z, z );
+                    }
+
+                    tinymt32_t prng;
+                };
+
+            } // namespace cpu
+        } // namespace generator
+    } // namespace rand
 } // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/include/alpaka/rand/TinyMT/LICENSE.txt b/thirdParty/cupla/alpaka/include/alpaka/rand/TinyMT/LICENSE.txt
index 7496ebe318..88bd89660f 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/rand/TinyMT/LICENSE.txt
+++ b/thirdParty/cupla/alpaka/include/alpaka/rand/TinyMT/LICENSE.txt
@@ -1,6 +1,6 @@
 /* Copyright 2019 Mutsuo Saito
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
diff --git a/thirdParty/cupla/alpaka/include/alpaka/rand/TinyMT/tinymt32.h b/thirdParty/cupla/alpaka/include/alpaka/rand/TinyMT/tinymt32.h
index 52ada12142..c47d5ccc1b 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/rand/TinyMT/tinymt32.h
+++ b/thirdParty/cupla/alpaka/include/alpaka/rand/TinyMT/tinymt32.h
@@ -1,11 +1,12 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Mutsuo Saito
+/* Copyright 2019-2020 Axel Huebl, Benjamin Worpitz, Mutsuo Saito
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
  */
+// clang-format off
 #ifndef TINYMT32_H
 #define TINYMT32_H
 /**
@@ -41,13 +42,12 @@
 #if BOOST_COMP_CLANG
 #   pragma clang diagnostic push
 #   pragma clang diagnostic ignored "-Wold-style-cast"
-#   pragma clang diagnostic ignored "-Wsign-conversion"
 #endif
 #if BOOST_COMP_GNUC
 #   pragma GCC diagnostic push
-#   pragma GCC diagnostic ignored "-Wsign-conversion"
+#   pragma GCC diagnostic ignored "-Wold-style-cast"
 #endif
-#if BOOST_COMP_MSVC
+#if BOOST_COMP_MSVC || defined(BOOST_COMP_MSVC_EMULATED)
     #pragma warning(push)
     #pragma warning(disable: 4100)  // tinymt32.h(60): warning C4100: 'random': unreferenced formal parameter
 #endif
@@ -114,8 +114,10 @@ inline static void tinymt32_next_state(tinymt32_t * random) {
     random->status[1] = random->status[2];
     random->status[2] = x ^ (y << TINYMT32_SH1);
     random->status[3] = y;
-    random->status[1] ^= -((int32_t)(y & 1)) & random->mat1;
-    random->status[2] ^= -((int32_t)(y & 1)) & random->mat2;
+    int32_t const a = -((int32_t)(y & 1)) & (int32_t)random->mat1;
+    int32_t const b = -((int32_t)(y & 1)) & (int32_t)random->mat2;
+    random->status[1] ^= (uint32_t)a;
+    random->status[2] ^= (uint32_t)b;
 }
 
 /**
@@ -135,7 +137,9 @@ inline static uint32_t tinymt32_temper(tinymt32_t * random) {
         + (random->status[2] >> TINYMT32_SH8);
 #endif
     t0 ^= t1;
-    t0 ^= -((int32_t)(t1 & 1)) & random->tmat;
+    if ((t1 & 1) != 0) {
+        t0 ^= random->tmat;
+    }
     return t0;
 }
 
@@ -161,8 +165,11 @@ inline static float tinymt32_temper_conv(tinymt32_t * random) {
         + (random->status[2] >> TINYMT32_SH8);
 #endif
     t0 ^= t1;
-    conv.u = ((t0 ^ (-((int32_t)(t1 & 1)) & random->tmat)) >> 9)
-              | UINT32_C(0x3f800000);
+    if ((t1 & 1) != 0) {
+        conv.u  = ((t0 ^ random->tmat) >> 9) | UINT32_C(0x3f800000);
+    } else {
+        conv.u  = (t0 >> 9) | UINT32_C(0x3f800000);
+    }
     return conv.f;
 }
 
@@ -188,8 +195,11 @@ inline static float tinymt32_temper_conv_open(tinymt32_t * random) {
         + (random->status[2] >> TINYMT32_SH8);
 #endif
     t0 ^= t1;
-    conv.u = ((t0 ^ (-((int32_t)(t1 & 1)) & random->tmat)) >> 9)
-              | UINT32_C(0x3f800001);
+    if ((t1 & 1) != 0) {
+        conv.u  = ((t0 ^ random->tmat) >> 9) | UINT32_C(0x3f800001);
+    } else {
+        conv.u  = (t0 >> 9) | UINT32_C(0x3f800001);
+    }
     return conv.f;
 }
 
@@ -213,7 +223,7 @@ inline static uint32_t tinymt32_generate_uint32(tinymt32_t * random) {
  */
 inline static float tinymt32_generate_float(tinymt32_t * random) {
     tinymt32_next_state(random);
-    return (tinymt32_temper(random) >> 8) * TINYMT32_MUL;
+    return (float)(tinymt32_temper(random) >> 8) * TINYMT32_MUL;
 }
 
 /**
@@ -306,13 +316,13 @@ static uint32_t ini_func2(uint32_t x) {
  */
 static void period_certification(tinymt32_t * random) {
     if ((random->status[0] & TINYMT32_MASK) == 0 &&
-	random->status[1] == 0 &&
-	random->status[2] == 0 &&
-	random->status[3] == 0) {
-	random->status[0] = 'T';
-	random->status[1] = 'I';
-	random->status[2] = 'N';
-	random->status[3] = 'Y';
+        random->status[1] == 0 &&
+        random->status[2] == 0 &&
+        random->status[3] == 0) {
+        random->status[0] = 'T';
+        random->status[1] = 'I';
+        random->status[2] = 'N';
+        random->status[3] = 'Y';
     }
 }
 
@@ -322,19 +332,19 @@ static void period_certification(tinymt32_t * random) {
  * @param random tinymt state vector.
  * @param seed a 32-bit unsigned integer used as a seed.
  */
-inline void tinymt32_init(tinymt32_t * random, uint32_t seed) {
+void tinymt32_init(tinymt32_t * random, uint32_t seed) {
     random->status[0] = seed;
     random->status[1] = random->mat1;
     random->status[2] = random->mat2;
     random->status[3] = random->tmat;
-    for (uint32_t i = 1; i < MIN_LOOP; i++) {
-	random->status[i & 3] ^= i + UINT32_C(1812433253)
-	    * (random->status[(i - 1) & 3]
-	       ^ (random->status[(i - 1) & 3] >> 30));
+    for (unsigned int i = 1; i < MIN_LOOP; i++) {
+        random->status[i & 3] ^= i + UINT32_C(1812433253)
+            * (random->status[(i - 1) & 3]
+               ^ (random->status[(i - 1) & 3] >> 30));
     }
     period_certification(random);
-    for (int i = 0; i < PRE_LOOP; i++) {
-	tinymt32_next_state(random);
+    for (unsigned int i = 0; i < PRE_LOOP; i++) {
+        tinymt32_next_state(random);
     }
 }
 
@@ -345,14 +355,13 @@ inline void tinymt32_init(tinymt32_t * random, uint32_t seed) {
  * @param init_key the array of 32-bit integers, used as a seed.
  * @param key_length the length of init_key.
  */
-inline void tinymt32_init_by_array(tinymt32_t * random, uint32_t init_key[],
-			    int key_length) {
-    const int lag = 1;
-    const int mid = 1;
-    const int size = 4;
-    uint32_t i;
-    int j;
-    int count;
+void tinymt32_init_by_array(tinymt32_t * random, uint32_t init_key[],
+                            int key_length) {
+    const unsigned int lag = 1;
+    const unsigned int mid = 1;
+    const unsigned int size = 4;
+    unsigned int i, j;
+    unsigned int count;
     uint32_t r;
     uint32_t * st = &random->status[0];
 
@@ -361,50 +370,50 @@ inline void tinymt32_init_by_array(tinymt32_t * random, uint32_t init_key[],
     st[2] = random->mat2;
     st[3] = random->tmat;
     if (key_length + 1 > MIN_LOOP) {
-	count = key_length + 1;
+        count = (unsigned int)key_length + 1;
     } else {
-	count = MIN_LOOP;
+        count = MIN_LOOP;
     }
     r = ini_func1(st[0] ^ st[mid % size]
-		  ^ st[(size - 1) % size]);
+                  ^ st[(size - 1) % size]);
     st[mid % size] += r;
-    r += uint32_t(key_length);
+    r += (unsigned int)key_length;
     st[(mid + lag) % size] += r;
     st[0] = r;
     count--;
-    for (i = 1, j = 0; (j < count) && (j < key_length); j++) {
-	r = ini_func1(st[i % size]
-		      ^ st[(i + mid) % size]
-		      ^ st[(i + size - 1) % size]);
-	st[(i + mid) % size] += r;
-	r += init_key[j] + i;
-	st[(i + mid + lag) % size] += r;
-	st[i % size] = r;
-	i = (i + 1) % size;
+    for (i = 1, j = 0; (j < count) && (j < (unsigned int)key_length); j++) {
+        r = ini_func1(st[i % size]
+                      ^ st[(i + mid) % size]
+                      ^ st[(i + size - 1) % size]);
+        st[(i + mid) % size] += r;
+        r += init_key[j] + i;
+        st[(i + mid + lag) % size] += r;
+        st[i % size] = r;
+        i = (i + 1) % size;
     }
     for (; j < count; j++) {
-	r = ini_func1(st[i % size]
-		      ^ st[(i + mid) % size]
-		      ^ st[(i + size - 1) % size]);
-	st[(i + mid) % size] += r;
-	r += i;
-	st[(i + mid + lag) % size] += r;
-	st[i % size] = r;
-	i = (i + 1) % size;
+        r = ini_func1(st[i % size]
+                      ^ st[(i + mid) % size]
+                      ^ st[(i + size - 1) % size]);
+        st[(i + mid) % size] += r;
+        r += i;
+        st[(i + mid + lag) % size] += r;
+        st[i % size] = r;
+        i = (i + 1) % size;
     }
     for (j = 0; j < size; j++) {
-	r = ini_func2(st[i % size]
-		      + st[(i + mid) % size]
-		      + st[(i + size - 1) % size]);
-	st[(i + mid) % size] ^= r;
-	r -= i;
-	st[(i + mid + lag) % size] ^= r;
-	st[i % size] = r;
-	i = (i + 1) % size;
+        r = ini_func2(st[i % size]
+                      + st[(i + mid) % size]
+                      + st[(i + size - 1) % size]);
+        st[(i + mid) % size] ^= r;
+        r -= i;
+        st[(i + mid + lag) % size] ^= r;
+        st[i % size] = r;
+        i = (i + 1) % size;
     }
     period_certification(random);
     for (i = 0; i < PRE_LOOP; i++) {
-	tinymt32_next_state(random);
+        tinymt32_next_state(random);
     }
 }
 
@@ -417,7 +426,7 @@ inline void tinymt32_init_by_array(tinymt32_t * random, uint32_t init_key[],
 #if BOOST_COMP_GNUC
 #   pragma GCC diagnostic pop
 #endif
-#if BOOST_COMP_MSVC
+#if BOOST_COMP_MSVC || defined(BOOST_COMP_MSVC_EMULATED)
 #   pragma warning(pop)
 #endif
 
diff --git a/thirdParty/cupla/alpaka/include/alpaka/rand/Traits.hpp b/thirdParty/cupla/alpaka/include/alpaka/rand/Traits.hpp
index 579509377c..45af391939 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/rand/Traits.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/rand/Traits.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Benjamin Worpitz
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -12,8 +12,6 @@
 #include <alpaka/core/Common.hpp>
 #include <alpaka/core/Concepts.hpp>
 
-#include <boost/config.hpp>
-
 #include <type_traits>
 
 namespace alpaka
@@ -22,7 +20,9 @@ namespace alpaka
     //! The random number generation specifics.
     namespace rand
     {
-        struct ConceptRand;
+        struct ConceptRand
+        {
+        };
 
         //-----------------------------------------------------------------------------
         //! The random number generator distribution specifics.
@@ -34,117 +34,56 @@ namespace alpaka
             {
                 //#############################################################################
                 //! The random number float normal distribution get trait.
-                template<
-                    typename TRand,
-                    typename T,
-                    typename TSfinae = void>
+                template<typename TRand, typename T, typename TSfinae = void>
                 struct CreateNormalReal;
 
                 //#############################################################################
                 //! The random number float uniform distribution get trait.
-                template<
-                    typename TRand,
-                    typename T,
-                    typename TSfinae = void>
+                template<typename TRand, typename T, typename TSfinae = void>
                 struct CreateUniformReal;
 
                 //#############################################################################
                 //! The random number integer uniform distribution get trait.
-                template<
-                    typename TRand,
-                    typename T,
-                    typename TSfinae = void>
+                template<typename TRand, typename T, typename TSfinae = void>
                 struct CreateUniformUint;
-            }
+            } // namespace traits
 
             //-----------------------------------------------------------------------------
             //! \return A normal float distribution with mean 0.0f and standard deviation 1.0f.
             ALPAKA_NO_HOST_ACC_WARNING
-            template<
-                typename T,
-                typename TRand>
-            ALPAKA_FN_HOST_ACC auto createNormalReal(
-                TRand const & rand)
-#ifdef BOOST_NO_CXX14_RETURN_TYPE_DEDUCTION
-            -> decltype(
-                traits::CreateNormalReal<
-                    concepts::ImplementationBase<ConceptRand, TRand>,
-                    T>
-                ::createNormalReal(
-                    rand))
-#endif
+            template<typename T, typename TRand>
+            ALPAKA_FN_HOST_ACC auto createNormalReal(TRand const& rand)
             {
-                static_assert(
-                    std::is_floating_point<T>::value,
-                    "The value type T has to be a floating point type!");
+                static_assert(std::is_floating_point<T>::value, "The value type T has to be a floating point type!");
 
                 using ImplementationBase = concepts::ImplementationBase<ConceptRand, TRand>;
-                return
-                    traits::CreateNormalReal<
-                        ImplementationBase,
-                        T>
-                    ::createNormalReal(
-                        rand);
+                return traits::CreateNormalReal<ImplementationBase, T>::createNormalReal(rand);
             }
             //-----------------------------------------------------------------------------
             //! \return A uniform floating point distribution [0.0, 1.0).
             ALPAKA_NO_HOST_ACC_WARNING
-            template<
-                typename T,
-                typename TRand>
-            ALPAKA_FN_HOST_ACC auto createUniformReal(
-                TRand const & rand)
-#ifdef BOOST_NO_CXX14_RETURN_TYPE_DEDUCTION
-            -> decltype(
-                traits::CreateUniformReal<
-                    concepts::ImplementationBase<ConceptRand, TRand>,
-                    T>
-                ::createUniformReal(
-                    rand))
-#endif
+            template<typename T, typename TRand>
+            ALPAKA_FN_HOST_ACC auto createUniformReal(TRand const& rand)
             {
-                static_assert(
-                    std::is_floating_point<T>::value,
-                    "The value type T has to be a floating point type!");
+                static_assert(std::is_floating_point<T>::value, "The value type T has to be a floating point type!");
 
                 using ImplementationBase = concepts::ImplementationBase<ConceptRand, TRand>;
-                return
-                    traits::CreateUniformReal<
-                        ImplementationBase,
-                        T>
-                    ::createUniformReal(
-                        rand);
+                return traits::CreateUniformReal<ImplementationBase, T>::createUniformReal(rand);
             }
             //-----------------------------------------------------------------------------
             //! \return A uniform integer distribution [0, UINT_MAX].
             ALPAKA_NO_HOST_ACC_WARNING
-            template<
-                typename T,
-                typename TRand>
-            ALPAKA_FN_HOST_ACC auto createUniformUint(
-                TRand const & rand)
-#ifdef BOOST_NO_CXX14_RETURN_TYPE_DEDUCTION
-            -> decltype(
-                traits::CreateUniformUint<
-                    concepts::ImplementationBase<ConceptRand, TRand>,
-                    T>
-                ::createUniformUint(
-                    rand))
-#endif
+            template<typename T, typename TRand>
+            ALPAKA_FN_HOST_ACC auto createUniformUint(TRand const& rand)
             {
                 static_assert(
                     std::is_integral<T>::value && std::is_unsigned<T>::value,
                     "The value type T has to be a unsigned integral type!");
 
                 using ImplementationBase = concepts::ImplementationBase<ConceptRand, TRand>;
-                return
-                    traits::CreateUniformUint<
-                        ImplementationBase,
-                        T>
-                    ::createUniformUint(
-                        rand);
+                return traits::CreateUniformUint<ImplementationBase, T>::createUniformUint(rand);
             }
-        }
+        } // namespace distribution
 
         //-----------------------------------------------------------------------------
         //! The random number generator specifics.
@@ -156,39 +95,21 @@ namespace alpaka
             {
                 //#############################################################################
                 //! The random number default generator get trait.
-                template<
-                    typename TRand,
-                    typename TSfinae = void>
+                template<typename TRand, typename TSfinae = void>
                 struct CreateDefault;
-            }
+            } // namespace traits
             //-----------------------------------------------------------------------------
             //! \return A default random number generator.
             ALPAKA_NO_HOST_ACC_WARNING
-            template<
-                typename TRand>
+            template<typename TRand>
             ALPAKA_FN_HOST_ACC auto createDefault(
-                TRand const & rand,
-                std::uint32_t const & seed,
-                std::uint32_t const & subsequence)
-#ifdef BOOST_NO_CXX14_RETURN_TYPE_DEDUCTION
-            -> decltype(
-                traits::CreateDefault<
-                    concepts::ImplementationBase<ConceptRand, TRand>>
-                ::createDefault(
-                    rand,
-                    seed,
-                    subsequence))
-#endif
+                TRand const& rand,
+                std::uint32_t const& seed,
+                std::uint32_t const& subsequence)
             {
                 using ImplementationBase = concepts::ImplementationBase<ConceptRand, TRand>;
-                return
-                    traits::CreateDefault<
-                        ImplementationBase>
-                    ::createDefault(
-                        rand,
-                        seed,
-                        subsequence);
+                return traits::CreateDefault<ImplementationBase>::createDefault(rand, seed, subsequence);
             }
-        }
-    }
-}
+        } // namespace generator
+    } // namespace rand
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/include/alpaka/standalone/AnyOacc.hpp b/thirdParty/cupla/alpaka/include/alpaka/standalone/AnyOacc.hpp
new file mode 100644
index 0000000000..7d535c1b01
--- /dev/null
+++ b/thirdParty/cupla/alpaka/include/alpaka/standalone/AnyOacc.hpp
@@ -0,0 +1,14 @@
+/* Copyright 2020 Jeffrey Kelling
+ *
+ * This file is part of Alpaka.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#pragma once
+
+#ifndef ALPAKA_ACC_ANY_BT_OACC_ENABLED
+#    define ALPAKA_ACC_ANY_BT_OACC_ENABLED
+#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/standalone/AnyOmp5.hpp b/thirdParty/cupla/alpaka/include/alpaka/standalone/AnyOmp5.hpp
new file mode 100644
index 0000000000..a14eaa5070
--- /dev/null
+++ b/thirdParty/cupla/alpaka/include/alpaka/standalone/AnyOmp5.hpp
@@ -0,0 +1,14 @@
+/* Copyright 2019 Benjamin Worpitz
+ *
+ * This file is part of alpaka.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#pragma once
+
+#ifndef ALPAKA_ACC_ANY_BT_OMP5_ENABLED
+#    define ALPAKA_ACC_ANY_BT_OMP5_ENABLED
+#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/standalone/CpuFibers.hpp b/thirdParty/cupla/alpaka/include/alpaka/standalone/CpuFibers.hpp
index 2a8180f98e..5647d80017 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/standalone/CpuFibers.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/standalone/CpuFibers.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Benjamin Worpitz
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -10,5 +10,5 @@
 #pragma once
 
 #ifndef ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLED
-    #define ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLED
+#    define ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLED
 #endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/standalone/CpuOmp2Blocks.hpp b/thirdParty/cupla/alpaka/include/alpaka/standalone/CpuOmp2Blocks.hpp
index 1cecf21bf9..8b65ff6549 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/standalone/CpuOmp2Blocks.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/standalone/CpuOmp2Blocks.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Benjamin Worpitz
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -10,5 +10,5 @@
 #pragma once
 
 #ifndef ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLED
-    #define ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLED
+#    define ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLED
 #endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/standalone/CpuOmp2Threads.hpp b/thirdParty/cupla/alpaka/include/alpaka/standalone/CpuOmp2Threads.hpp
index c3cf763e37..eec6afcc57 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/standalone/CpuOmp2Threads.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/standalone/CpuOmp2Threads.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Benjamin Worpitz
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -10,5 +10,5 @@
 #pragma once
 
 #ifndef ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLED
-    #define ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLED
+#    define ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLED
 #endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/standalone/CpuOmp4.hpp b/thirdParty/cupla/alpaka/include/alpaka/standalone/CpuOmp4.hpp
deleted file mode 100644
index f93665dc57..0000000000
--- a/thirdParty/cupla/alpaka/include/alpaka/standalone/CpuOmp4.hpp
+++ /dev/null
@@ -1,14 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifndef ALPAKA_ACC_CPU_BT_OMP4_ENABLED
-    #define ALPAKA_ACC_CPU_BT_OMP4_ENABLED
-#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/standalone/CpuSerial.hpp b/thirdParty/cupla/alpaka/include/alpaka/standalone/CpuSerial.hpp
index 7a4ab7013e..59191a97b2 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/standalone/CpuSerial.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/standalone/CpuSerial.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Benjamin Worpitz
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -10,5 +10,5 @@
 #pragma once
 
 #ifndef ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED
-    #define ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED
+#    define ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED
 #endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/standalone/CpuTbbBlocks.hpp b/thirdParty/cupla/alpaka/include/alpaka/standalone/CpuTbbBlocks.hpp
index 4c0f7ae0db..bccbf12599 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/standalone/CpuTbbBlocks.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/standalone/CpuTbbBlocks.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Benjamin Worpitz
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -10,5 +10,5 @@
 #pragma once
 
 #ifndef ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED
-    #define ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED
+#    define ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED
 #endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/standalone/CpuThreads.hpp b/thirdParty/cupla/alpaka/include/alpaka/standalone/CpuThreads.hpp
index 791ef4b4cf..74ddc67ad4 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/standalone/CpuThreads.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/standalone/CpuThreads.hpp
@@ -1,14 +1,14 @@
 /* Copyright 2019 Benjamin Worpitz
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
  */
- 
+
 #pragma once
 
 #ifndef ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED
-    #define ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED
+#    define ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED
 #endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/standalone/GpuCudaRt.hpp b/thirdParty/cupla/alpaka/include/alpaka/standalone/GpuCudaRt.hpp
index 2648e0c222..b835e19d13 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/standalone/GpuCudaRt.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/standalone/GpuCudaRt.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Benjamin Worpitz
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -10,5 +10,5 @@
 #pragma once
 
 #ifndef ALPAKA_ACC_GPU_CUDA_ENABLED
-    #define ALPAKA_ACC_GPU_CUDA_ENABLED
+#    define ALPAKA_ACC_GPU_CUDA_ENABLED
 #endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/standalone/GpuHipRt.hpp b/thirdParty/cupla/alpaka/include/alpaka/standalone/GpuHipRt.hpp
index f322b14c89..c50074f328 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/standalone/GpuHipRt.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/standalone/GpuHipRt.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Benjamin Worpitz
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -10,5 +10,5 @@
 #pragma once
 
 #ifndef ALPAKA_ACC_GPU_HIP_ENABLED
-    #define ALPAKA_ACC_GPU_HIP_ENABLED
+#    define ALPAKA_ACC_GPU_HIP_ENABLED
 #endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/time/TimeCudaBuiltIn.hpp b/thirdParty/cupla/alpaka/include/alpaka/time/TimeCudaBuiltIn.hpp
deleted file mode 100644
index f4ddbcc1f7..0000000000
--- a/thirdParty/cupla/alpaka/include/alpaka/time/TimeCudaBuiltIn.hpp
+++ /dev/null
@@ -1,69 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_CUDA
-    #error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
-#endif
-
-#include <alpaka/time/Traits.hpp>
-
-namespace alpaka
-{
-    namespace time
-    {
-        //#############################################################################
-        //! The GPU CUDA accelerator time implementation.
-        class TimeCudaBuiltIn : public concepts::Implements<ConceptTime, TimeCudaBuiltIn>
-        {
-        public:
-            //-----------------------------------------------------------------------------
-            TimeCudaBuiltIn() = default;
-            //-----------------------------------------------------------------------------
-            __device__ TimeCudaBuiltIn(TimeCudaBuiltIn const &) = delete;
-            //-----------------------------------------------------------------------------
-            __device__ TimeCudaBuiltIn(TimeCudaBuiltIn &&) = delete;
-            //-----------------------------------------------------------------------------
-            __device__ auto operator=(TimeCudaBuiltIn const &) -> TimeCudaBuiltIn & = delete;
-            //-----------------------------------------------------------------------------
-            __device__ auto operator=(TimeCudaBuiltIn &&) -> TimeCudaBuiltIn & = delete;
-            //-----------------------------------------------------------------------------
-            /*virtual*/ ~TimeCudaBuiltIn() = default;
-        };
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The CUDA built-in clock operation.
-            template<>
-            struct Clock<
-                time::TimeCudaBuiltIn>
-            {
-                //-----------------------------------------------------------------------------
-                __device__ static auto clock(
-                    time::TimeCudaBuiltIn const &)
-                -> std::uint64_t
-                {
-                    // This can be converted to a wall-clock time in seconds by dividing through the shader clock rate given by cudaDeviceProp::clockRate.
-                    // This clock rate is double the main clock rate on Fermi and older cards. 
-                    return
-                        static_cast<std::uint64_t>(
-                            clock64());
-                }
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/time/TimeHipBuiltIn.hpp b/thirdParty/cupla/alpaka/include/alpaka/time/TimeHipBuiltIn.hpp
deleted file mode 100644
index 1c1d314933..0000000000
--- a/thirdParty/cupla/alpaka/include/alpaka/time/TimeHipBuiltIn.hpp
+++ /dev/null
@@ -1,76 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz, Matthias Werner
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_HIP
-    #error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
-#endif
-
-#include <alpaka/time/Traits.hpp>
-
-namespace alpaka
-{
-    namespace time
-    {
-        //#############################################################################
-        //! The GPU HIP accelerator time implementation.
-        class TimeHipBuiltIn : public concepts::Implements<ConceptTime, TimeHipBuiltIn>
-        {
-        public:
-            //-----------------------------------------------------------------------------
-            //! Default constructor.
-            ALPAKA_FN_HOST_ACC TimeHipBuiltIn() = default;
-            //-----------------------------------------------------------------------------
-            //! Copy constructor.
-            __device__ TimeHipBuiltIn(TimeHipBuiltIn const &) = delete;
-            //-----------------------------------------------------------------------------
-            //! Move constructor.
-            __device__ TimeHipBuiltIn(TimeHipBuiltIn &&) = delete;
-            //-----------------------------------------------------------------------------
-            //! Copy assignment operator.
-            __device__ auto operator=(TimeHipBuiltIn const &) -> TimeHipBuiltIn & = delete;
-            //-----------------------------------------------------------------------------
-            //! Move assignment operator.
-            __device__ auto operator=(TimeHipBuiltIn &&) -> TimeHipBuiltIn & = delete;
-            //-----------------------------------------------------------------------------
-            //! Destructor.
-            /*virtual*/ ALPAKA_FN_HOST_ACC ~TimeHipBuiltIn() = default;
-        };
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The HIP built-in clock operation.
-            template<>
-            struct Clock<
-                time::TimeHipBuiltIn>
-            {
-                //-----------------------------------------------------------------------------
-
-                __device__ static auto clock(
-                    time::TimeHipBuiltIn const &)
-                -> std::uint64_t
-                {
-                    // This can be converted to a wall-clock time in seconds by dividing through the shader clock rate given by hipDeviceProp::clockRate.
-                    // This clock rate is double the main clock rate on Fermi and older cards.
-                    return
-                        static_cast<std::uint64_t>(
-                            clock64());
-                }
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/time/TimeOmp.hpp b/thirdParty/cupla/alpaka/include/alpaka/time/TimeOmp.hpp
index a4bad6edfe..3ce3623a40 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/time/TimeOmp.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/time/TimeOmp.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Axel Huebl, Benjamin Worpitz
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -11,60 +11,52 @@
 
 #ifdef _OPENMP
 
-#include <alpaka/time/Traits.hpp>
+#    include <alpaka/core/Common.hpp>
+#    include <alpaka/core/Unused.hpp>
+#    include <alpaka/time/Traits.hpp>
 
-#include <alpaka/core/Common.hpp>
-#include <alpaka/core/Unused.hpp>
-
-#include <omp.h>
+#    include <omp.h>
 
 namespace alpaka
 {
-    namespace time
+    //#############################################################################
+    //! The OpenMP accelerator time implementation.
+    class TimeOmp : public concepts::Implements<ConceptTime, TimeOmp>
+    {
+    public:
+        //-----------------------------------------------------------------------------
+        TimeOmp() = default;
+        //-----------------------------------------------------------------------------
+        ALPAKA_FN_HOST TimeOmp(TimeOmp const&) = delete;
+        //-----------------------------------------------------------------------------
+        ALPAKA_FN_HOST TimeOmp(TimeOmp&&) = delete;
+        //-----------------------------------------------------------------------------
+        ALPAKA_FN_HOST auto operator=(TimeOmp const&) -> TimeOmp& = delete;
+        //-----------------------------------------------------------------------------
+        ALPAKA_FN_HOST auto operator=(TimeOmp&&) -> TimeOmp& = delete;
+        //-----------------------------------------------------------------------------
+        /*virtual*/ ~TimeOmp() = default;
+    };
+
+    namespace traits
     {
         //#############################################################################
-        //! The OpenMP accelerator time implementation.
-        class TimeOmp : public concepts::Implements<ConceptTime, TimeOmp>
+        //! The OpenMP accelerator clock operation.
+        template<>
+        struct Clock<TimeOmp>
         {
-        public:
-            //-----------------------------------------------------------------------------
-            TimeOmp() = default;
-            //-----------------------------------------------------------------------------
-            ALPAKA_FN_HOST TimeOmp(TimeOmp const &) = delete;
-            //-----------------------------------------------------------------------------
-            ALPAKA_FN_HOST TimeOmp(TimeOmp &&) = delete;
-            //-----------------------------------------------------------------------------
-            ALPAKA_FN_HOST auto operator=(TimeOmp const &) -> TimeOmp & = delete;
             //-----------------------------------------------------------------------------
-            ALPAKA_FN_HOST auto operator=(TimeOmp &&) -> TimeOmp & = delete;
-            //-----------------------------------------------------------------------------
-            /*virtual*/ ~TimeOmp() = default;
-        };
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The OpenMP accelerator clock operation.
-            template<>
-            struct Clock<
-                time::TimeOmp>
+            ALPAKA_FN_HOST static auto clock(TimeOmp const& time) -> std::uint64_t
             {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto clock(
-                    time::TimeOmp const & time)
-                -> std::uint64_t
-                {
-                    alpaka::ignore_unused(time);
-                    // NOTE: We compute the number of clock ticks by dividing the following durations:
-                    // - omp_get_wtime returns the elapsed wall clock time in seconds.
-                    // - omp_get_wtick gets the timer precision, i.e., the number of seconds between two successive clock ticks. 
-                    return
-                        static_cast<std::uint64_t>(
-                            omp_get_wtime() / omp_get_wtick());
-                }
-            };
-        }
-    }
-}
+                alpaka::ignore_unused(time);
+                // NOTE: We compute the number of clock ticks by dividing the following durations:
+                // - omp_get_wtime returns the elapsed wall clock time in seconds.
+                // - omp_get_wtick gets the timer precision, i.e., the number of seconds between two successive clock
+                // ticks.
+                return static_cast<std::uint64_t>(omp_get_wtime() / omp_get_wtick());
+            }
+        };
+    } // namespace traits
+} // namespace alpaka
 
 #endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/time/TimeStdLib.hpp b/thirdParty/cupla/alpaka/include/alpaka/time/TimeStdLib.hpp
index 4cacf5584d..42537a2725 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/time/TimeStdLib.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/time/TimeStdLib.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Axel Huebl, Benjamin Worpitz, Matthias Werner
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -9,60 +9,50 @@
 
 #pragma once
 
-#include <alpaka/time/Traits.hpp>
-
 #include <alpaka/core/Common.hpp>
 #include <alpaka/core/Unused.hpp>
+#include <alpaka/time/Traits.hpp>
 
 #include <chrono>
 
 namespace alpaka
 {
-    namespace time
+    //#############################################################################
+    //! The standard library (std::chrono) time implementation. It is neither copyable nor movable.
+    class TimeStdLib : public concepts::Implements<ConceptTime, TimeStdLib>
+    {
+    public:
+        //-----------------------------------------------------------------------------
+        TimeStdLib() = default;
+        //-----------------------------------------------------------------------------
+        TimeStdLib(TimeStdLib const&) = delete;
+        //-----------------------------------------------------------------------------
+        TimeStdLib(TimeStdLib&&) = delete;
+        //-----------------------------------------------------------------------------
+        auto operator=(TimeStdLib const&) -> TimeStdLib& = delete;
+        //-----------------------------------------------------------------------------
+        auto operator=(TimeStdLib&&) -> TimeStdLib& = delete;
+        //-----------------------------------------------------------------------------
+        /*virtual*/ ~TimeStdLib() = default;
+    };
+
+    namespace traits
     {
         //#############################################################################
-        //! The CPU fibers accelerator time implementation.
-        class TimeStdLib : public concepts::Implements<ConceptTime, TimeStdLib>
+        //! The standard library (std::chrono::high_resolution_clock) clock operation.
+        template<>
+        struct Clock<TimeStdLib>
         {
-        public:
-            //-----------------------------------------------------------------------------
-            TimeStdLib() = default;
-            //-----------------------------------------------------------------------------
-            TimeStdLib(TimeStdLib const &) = delete;
             //-----------------------------------------------------------------------------
-            TimeStdLib(TimeStdLib &&) = delete;
-            //-----------------------------------------------------------------------------
-            auto operator=(TimeStdLib const &) -> TimeStdLib & = delete;
-            //-----------------------------------------------------------------------------
-            auto operator=(TimeStdLib &&) -> TimeStdLib & = delete;
-            //-----------------------------------------------------------------------------
-            /*virtual*/ ~TimeStdLib() = default;
-        };
-
-        namespace traits
-        {
-            //#############################################################################
-            //! The CPU fibers accelerator clock operation.
-            template<>
-            struct Clock<
-                TimeStdLib>
+            ALPAKA_FN_HOST static auto clock(TimeStdLib const& time) -> std::uint64_t
             {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto clock(
-                    time::TimeStdLib const & time)
-                -> std::uint64_t
-                {
-                    alpaka::ignore_unused(time);
+                alpaka::ignore_unused(time);
 
-                    // NOTE: high_resolution_clock returns a non-steady wall-clock time!
-                    // This means that it is not ensured that the values will always increase monotonically.
-                    return
-                        static_cast<std::uint64_t>(
-                            std::chrono::high_resolution_clock::now()
-                                .time_since_epoch()
-                                    .count());
-                }
-            };
-        }
-    }
-}
+                // NOTE: high_resolution_clock returns a non-steady wall-clock time!
+                // This means that it is not ensured that the values will always increase monotonically.
+                return static_cast<std::uint64_t>(
+                    std::chrono::high_resolution_clock::now().time_since_epoch().count());
+            }
+        };
+    } // namespace traits
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/include/alpaka/time/TimeUniformCudaHipBuiltIn.hpp b/thirdParty/cupla/alpaka/include/alpaka/time/TimeUniformCudaHipBuiltIn.hpp
new file mode 100644
index 0000000000..a6da9ff315
--- /dev/null
+++ b/thirdParty/cupla/alpaka/include/alpaka/time/TimeUniformCudaHipBuiltIn.hpp
@@ -0,0 +1,66 @@
+/* Copyright 2019 Benjamin Worpitz
+ *
+ * This file is part of alpaka.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#pragma once
+
+#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
+
+#    include <alpaka/core/BoostPredef.hpp>
+
+#    if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && !BOOST_LANG_CUDA
+#        error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
+#    endif
+
+#    if defined(ALPAKA_ACC_GPU_HIP_ENABLED) && !BOOST_LANG_HIP
+#        error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
+#    endif
+
+#    include <alpaka/time/Traits.hpp>
+
+namespace alpaka
+{
+    //#############################################################################
+    //! The GPU CUDA/HIP accelerator time implementation. It is neither copyable nor movable.
+    class TimeUniformCudaHipBuiltIn : public concepts::Implements<ConceptTime, TimeUniformCudaHipBuiltIn>
+    {
+    public:
+        //-----------------------------------------------------------------------------
+        TimeUniformCudaHipBuiltIn() = default;
+        //-----------------------------------------------------------------------------
+        __device__ TimeUniformCudaHipBuiltIn(TimeUniformCudaHipBuiltIn const&) = delete;
+        //-----------------------------------------------------------------------------
+        __device__ TimeUniformCudaHipBuiltIn(TimeUniformCudaHipBuiltIn&&) = delete;
+        //-----------------------------------------------------------------------------
+        __device__ auto operator=(TimeUniformCudaHipBuiltIn const&) -> TimeUniformCudaHipBuiltIn& = delete;
+        //-----------------------------------------------------------------------------
+        __device__ auto operator=(TimeUniformCudaHipBuiltIn&&) -> TimeUniformCudaHipBuiltIn& = delete;
+        //-----------------------------------------------------------------------------
+        /*virtual*/ ~TimeUniformCudaHipBuiltIn() = default;
+    };
+
+    namespace traits
+    {
+        //#############################################################################
+        //! The CUDA/HIP built-in clock operation.
+        template<>
+        struct Clock<TimeUniformCudaHipBuiltIn>
+        {
+            //-----------------------------------------------------------------------------
+            __device__ static auto clock(TimeUniformCudaHipBuiltIn const&) -> std::uint64_t
+            {
+                // This can be converted to a wall-clock time in seconds by dividing by the shader clock rate
+                // given by uniformCudaHipDeviceProp::clockRate. This clock rate is double the main clock rate on Fermi
+                // and older cards.
+                return static_cast<std::uint64_t>(clock64());
+            }
+        };
+    } // namespace traits
+} // namespace alpaka
+
+#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/time/Traits.hpp b/thirdParty/cupla/alpaka/include/alpaka/time/Traits.hpp
index 97f408453c..92665fd372 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/time/Traits.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/time/Traits.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Benjamin Worpitz
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -16,42 +16,30 @@
 
 namespace alpaka
 {
-    //-----------------------------------------------------------------------------
-    //! The time traits specifics.
-    namespace time
+    struct ConceptTime
     {
-        struct ConceptTime;
+    };
 
-        //-----------------------------------------------------------------------------
-        //! The time traits.
-        namespace traits
-        {
-            //#############################################################################
-            //! The clock trait.
-            template<
-                typename TTime,
-                typename TSfinae = void>
-            struct Clock;
-        }
+    //-----------------------------------------------------------------------------
+    //! The time traits.
+    namespace traits
+    {
+        //#############################################################################
+        //! The clock trait.
+        template<typename TTime, typename TSfinae = void>
+        struct Clock;
+    } // namespace traits
 
-        //-----------------------------------------------------------------------------
-        //! \return A counter that is increasing every clock cycle.
-        //!
-        //! \tparam TTime The time implementation type.
-        //! \param time The time implementation.
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            typename TTime>
-        ALPAKA_FN_HOST_ACC auto clock(
-            TTime const & time)
-        -> std::uint64_t
-        {
-            using ImplementationBase = concepts::ImplementationBase<ConceptTime, TTime>;
-            return
-                traits::Clock<
-                    ImplementationBase>
-                ::clock(
-                    time);
-        }
+    //-----------------------------------------------------------------------------
+    //! \return A counter that is increasing every clock cycle.
+    //!
+    //! \tparam TTime The time implementation type.
+    //! \param time The time implementation.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename TTime>
+    ALPAKA_FN_HOST_ACC auto clock(TTime const& time) -> std::uint64_t
+    {
+        using ImplementationBase = concepts::ImplementationBase<ConceptTime, TTime>;
+        return traits::Clock<ImplementationBase>::clock(time);
     }
-}
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/include/alpaka/vec/Traits.hpp b/thirdParty/cupla/alpaka/include/alpaka/vec/Traits.hpp
index 5f4b0890ed..2e113def93 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/vec/Traits.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/vec/Traits.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Benjamin Worpitz
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -12,225 +12,106 @@
 
 #include <alpaka/dim/Traits.hpp>
 #include <alpaka/extent/Traits.hpp>
-#include <alpaka/offset/Traits.hpp>
 #include <alpaka/idx/Traits.hpp>
+#include <alpaka/meta/IntegerSequence.hpp>
+#include <alpaka/offset/Traits.hpp>
 
-#include <boost/config.hpp>
+#include <utility>
 
 namespace alpaka
 {
     //-----------------------------------------------------------------------------
-    //! The vec specifics.
-    namespace vec
+    //! The vec traits.
+    namespace traits
     {
-        //-----------------------------------------------------------------------------
-        //! The vec traits.
-        namespace traits
-        {
-            //#############################################################################
-            //! Trait for selecting a sub-vector.
-            template<
-                typename TVec,
-                typename TIndexSequence,
-                typename TSfinae = void>
-            struct SubVecFromIndices;
+        //#############################################################################
+        //! Trait for selecting a sub-vector.
+        template<typename TVec, typename TIndexSequence, typename TSfinae = void>
+        struct SubVecFromIndices;
 
-            //#############################################################################
-            //! Trait for casting a vector.
-            template<
-                typename TVal,
-                typename TVec,
-                typename TSfinae = void>
-            struct Cast;
+        //#############################################################################
+        //! Trait for casting a vector.
+        template<typename TVal, typename TVec, typename TSfinae = void>
+        struct CastVec;
 
-            //#############################################################################
-            //! Trait for reversing a vector.
-            template<
-                typename TVec,
-                typename TSfinae = void>
-            struct Reverse;
+        //#############################################################################
+        //! Trait for reversing a vector.
+        template<typename TVec, typename TSfinae = void>
+        struct ReverseVec;
 
-            //#############################################################################
-            //! Trait for concatenating two vectors.
-            template<
-                typename TVecL,
-                typename TVecR,
-                typename TSfinae = void>
-            struct Concat;
-        }
+        //#############################################################################
+        //! Trait for concatenating two vectors.
+        template<typename TVecL, typename TVecR, typename TSfinae = void>
+        struct ConcatVec;
+    } // namespace traits
 
-        //-----------------------------------------------------------------------------
-        //! Builds a new vector by selecting the elements of the source vector in the given order.
-        //! Repeating and swizzling elements is allowed.
-        //! \return The sub-vector consisting of the elements specified by the indices.
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            typename TIndexSequence,
-            typename TVec>
-        ALPAKA_FN_HOST_ACC auto subVecFromIndices(
-            TVec const & vec)
-#ifdef BOOST_NO_CXX14_RETURN_TYPE_DEDUCTION
-        -> decltype(
-            traits::SubVecFromIndices<
-                TVec,
-                TIndexSequence>
-            ::subVecFromIndices(
-                vec))
-#endif
-        {
-            return
-                traits::SubVecFromIndices<
-                    TVec,
-                    TIndexSequence>
-                ::subVecFromIndices(
-                    vec);
-        }
-        //-----------------------------------------------------------------------------
-        //! \tparam TVec has to specialize SubVecFromIndices.
-        //! \return The sub-vector consisting of the first N elements of the source vector.
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            typename TSubDim,
-            typename TVec>
-        ALPAKA_FN_HOST_ACC auto subVecBegin(
-            TVec const & vec)
-#ifdef BOOST_NO_CXX14_RETURN_TYPE_DEDUCTION
-        -> decltype(
-            subVecFromIndices<
-                meta::MakeIntegerSequence<
-                    std::size_t,
-                    TSubDim::value
-                >
-            >(
-                vec))
-#endif
-        {
-            static_assert(
-                TSubDim::value <= dim::Dim<TVec>::value,
-                "The sub-Vec has to be smaller (or same size) then the original Vec.");
+    //-----------------------------------------------------------------------------
+    //! Builds a new vector by selecting the elements of the source vector in the given order.
+    //! Repeating and swizzling elements is allowed.
+    //! \return The sub-vector consisting of the elements specified by the indices.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename TIndexSequence, typename TVec>
+    ALPAKA_FN_HOST_ACC auto subVecFromIndices(TVec const& vec)
+    {
+        return traits::SubVecFromIndices<TVec, TIndexSequence>::subVecFromIndices(vec);
+    }
+    //-----------------------------------------------------------------------------
+    //! \tparam TVec has to specialize SubVecFromIndices.
+    //! \return The sub-vector consisting of the first N elements of the source vector.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename TSubDim, typename TVec>
+    ALPAKA_FN_HOST_ACC auto subVecBegin(TVec const& vec)
+    {
+        static_assert(
+            TSubDim::value <= Dim<TVec>::value,
+            "The sub-Vec has to be smaller (or same size) then the original Vec.");
 
-            //! A sequence of integers from 0 to dim-1.
-            using IdxSubSequence =
-                meta::MakeIntegerSequence<
-                    std::size_t,
-                    TSubDim::value>;
-            return
-                subVecFromIndices<
-                    IdxSubSequence>(
-                        vec);
-        }
-        //-----------------------------------------------------------------------------
-        //! \tparam TVec has to specialize SubVecFromIndices.
-        //! \return The sub-vector consisting of the last N elements of the source vector.
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            typename TSubDim,
-            typename TVec>
-        ALPAKA_FN_HOST_ACC auto subVecEnd(
-            TVec const & vec)
-#ifdef BOOST_NO_CXX14_RETURN_TYPE_DEDUCTION
-        -> decltype(
-            subVecFromIndices<
-                meta::MakeIntegerSequenceOffset<
-                    std::size_t,
-                    dim::Dim<TVec>::value - TSubDim::value,
-                    TSubDim::value
-                >
-            >(
-                vec))
-#endif
-        {
-            static_assert(
-                TSubDim::value <= dim::Dim<TVec>::value,
-                "The sub-Vec has to be smaller (or same size) then the original Vec.");
+        //! A sequence of integers from 0 to TSubDim-1.
+        using IdxSubSequence = std::make_integer_sequence<std::size_t, TSubDim::value>;
+        return subVecFromIndices<IdxSubSequence>(vec);
+    }
+    //-----------------------------------------------------------------------------
+    //! \tparam TVec has to specialize SubVecFromIndices.
+    //! \return The sub-vector consisting of the last N elements of the source vector.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename TSubDim, typename TVec>
+    ALPAKA_FN_HOST_ACC auto subVecEnd(TVec const& vec)
+    {
+        static_assert(
+            TSubDim::value <= Dim<TVec>::value,
+            "The sub-Vec has to be smaller (or same size) then the original Vec.");
 
-            constexpr std::size_t idxOffset = dim::Dim<TVec>::value - TSubDim::value;
+        constexpr std::size_t idxOffset = Dim<TVec>::value - TSubDim::value;
 
-            //! A sequence of integers from 0 to dim-1.
-            using IdxSubSequence =
-                meta::MakeIntegerSequenceOffset<
-                    std::size_t,
-                    idxOffset,
-                    TSubDim::value>;
-            return
-                subVecFromIndices<
-                    IdxSubSequence>(
-                        vec);
-        }
+        //! A sequence of TSubDim integers starting at idxOffset, i.e. the indices of the last TSubDim elements.
+        using IdxSubSequence = meta::MakeIntegerSequenceOffset<std::size_t, idxOffset, TSubDim::value>;
+        return subVecFromIndices<IdxSubSequence>(vec);
+    }
 
-        //-----------------------------------------------------------------------------
-        //! \return The casted vector.
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            typename TVal,
-            typename TVec>
-        ALPAKA_FN_HOST_ACC auto cast(
-            TVec const & vec)
-#ifdef BOOST_NO_CXX14_RETURN_TYPE_DEDUCTION
-        -> decltype(
-            traits::Cast<
-                TVal,
-                TVec>
-            ::cast(
-                vec))
-#endif
-        {
-            return
-                traits::Cast<
-                    TVal,
-                    TVec>
-                ::cast(
-                    vec);
-        }
+    //-----------------------------------------------------------------------------
+    //! \return The casted vector.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename TVal, typename TVec>
+    ALPAKA_FN_HOST_ACC auto castVec(TVec const& vec)
+    {
+        return traits::CastVec<TVal, TVec>::castVec(vec);
+    }
 
-        //-----------------------------------------------------------------------------
-        //! \return The reverse vector.
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            typename TVec>
-        ALPAKA_FN_HOST_ACC auto reverse(
-            TVec const & vec)
-#ifdef BOOST_NO_CXX14_RETURN_TYPE_DEDUCTION
-        -> decltype(
-            traits::Reverse<
-                TVec>
-            ::reverse(
-                vec))
-#endif
-        {
-            return
-                traits::Reverse<
-                    TVec>
-                ::reverse(
-                    vec);
-        }
+    //-----------------------------------------------------------------------------
+    //! \return The reversed vector.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename TVec>
+    ALPAKA_FN_HOST_ACC auto reverseVec(TVec const& vec)
+    {
+        return traits::ReverseVec<TVec>::reverseVec(vec);
+    }
 
-        //-----------------------------------------------------------------------------
-        //! \return The concatenated vector.
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            typename TVecL,
-            typename TVecR>
-        ALPAKA_FN_HOST_ACC auto concat(
-            TVecL const & vecL,
-            TVecR const & vecR)
-#ifdef BOOST_NO_CXX14_RETURN_TYPE_DEDUCTION
-        -> decltype(
-            traits::Concat<
-                TVecL,
-                TVecR>
-            ::concat(
-                vecL,
-                vecR))
-#endif
-        {
-            return
-                traits::Concat<
-                    TVecL,
-                    TVecR>
-                ::concat(
-                    vecL,
-                    vecR);
-        }
+    //-----------------------------------------------------------------------------
+    //! \return The concatenated vector.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename TVecL, typename TVecR>
+    ALPAKA_FN_HOST_ACC auto concatVec(TVecL const& vecL, TVecR const& vecR)
+    {
+        return traits::ConcatVec<TVecL, TVecR>::concatVec(vecL, vecR);
     }
-}
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/include/alpaka/vec/Vec.hpp b/thirdParty/cupla/alpaka/include/alpaka/vec/Vec.hpp
index 52fcb7238e..33f9a5cc45 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/vec/Vec.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/vec/Vec.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Axel Huebl, Benjamin Worpitz, Erik Zenker, Matthias Werner, René Widera
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -9,1152 +9,619 @@
 
 #pragma once
 
-#include <alpaka/vec/Traits.hpp>
-#include <alpaka/dim/Traits.hpp>
-#include <alpaka/dim/DimIntegralConst.hpp>
-#include <alpaka/extent/Traits.hpp>
-#include <alpaka/offset/Traits.hpp>
-#include <alpaka/idx/Traits.hpp>
-
 #include <alpaka/core/Align.hpp>
 #include <alpaka/core/Assert.hpp>
 #include <alpaka/core/BoostPredef.hpp>
 #include <alpaka/core/Common.hpp>
 #include <alpaka/core/Unused.hpp>
-#include <alpaka/meta/IntegerSequence.hpp>
+#include <alpaka/dim/DimIntegralConst.hpp>
+#include <alpaka/dim/Traits.hpp>
+#include <alpaka/extent/Traits.hpp>
+#include <alpaka/idx/Traits.hpp>
 #include <alpaka/meta/Fold.hpp>
+#include <alpaka/meta/Functional.hpp>
+#include <alpaka/meta/IntegerSequence.hpp>
+#include <alpaka/offset/Traits.hpp>
+#include <alpaka/vec/Traits.hpp>
 
-#include <boost/config.hpp>
-
+#include <algorithm>
 #include <cstdint>
 #include <ostream>
 #include <type_traits>
-#include <algorithm>
-
-// Some compilers do not support the out of class versions:
-// - the nvcc CUDA compiler (at least 8.0)
-// - the intel compiler
-#if BOOST_COMP_HCC || BOOST_COMP_HIP || BOOST_COMP_NVCC || BOOST_COMP_INTEL || (BOOST_COMP_CLANG_CUDA >= BOOST_VERSION_NUMBER(4, 0, 0)) || (BOOST_COMP_GNUC >= BOOST_VERSION_NUMBER(8, 0, 0))
-    #define ALPAKA_CREATE_VEC_IN_CLASS
-#endif
+#include <utility>
 
 namespace alpaka
 {
-    namespace vec
+    template<typename TDim, typename TVal>
+    class Vec;
+
+    //-----------------------------------------------------------------------------
+    //! Creates a vector by calling TTFnObj<idx>::create(args...) for each index of the given sequence.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<
+        typename TDim,
+        template<std::size_t>
+        class TTFnObj,
+        typename... TArgs,
+        typename TIdxSize,
+        TIdxSize... TIndices>
+    ALPAKA_FN_HOST_ACC auto createVecFromIndexedFnArbitrary(
+        std::integer_sequence<TIdxSize, TIndices...> const& indices,
+        TArgs&&... args)
     {
-        template<
-            typename TDim,
-            typename TVal>
-        class Vec;
+        alpaka::ignore_unused(indices);
 
-#ifndef ALPAKA_CREATE_VEC_IN_CLASS
+        return Vec<TDim, decltype(TTFnObj<0>::create(std::forward<TArgs>(args)...))>(
+            (TTFnObj<TIndices>::create(std::forward<TArgs>(args)...))...);
+    }
+    //-----------------------------------------------------------------------------
+    //! Creator using func<idx>(args...) to initialize all values of the vector.
+    //! The idx is in the half-open range [0, TDim).
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename TDim, template<std::size_t> class TTFnObj, typename... TArgs>
+    ALPAKA_FN_HOST_ACC auto createVecFromIndexedFn(TArgs&&... args)
+    {
+        using IdxSequence = std::make_integer_sequence<typename TDim::value_type, TDim::value>;
+        return createVecFromIndexedFnArbitrary<TDim, TTFnObj>(IdxSequence(), std::forward<TArgs>(args)...);
+    }
+    //-----------------------------------------------------------------------------
+    //! Creator using func<idx>(args...) to initialize all values of the vector.
+    //! The idx is in the half-open range [TIdxOffset, TIdxOffset + TDim).
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename TDim, template<std::size_t> class TTFnObj, typename TIdxOffset, typename... TArgs>
+    ALPAKA_FN_HOST_ACC auto createVecFromIndexedFnOffset(TArgs&&... args)
+    {
+        using IdxSubSequenceSigned = meta::MakeIntegerSequenceOffset<std::intmax_t, TIdxOffset::value, TDim::value>;
+        using IdxSubSequence = meta::ConvertIntegerSequence<typename TIdxOffset::value_type, IdxSubSequenceSigned>;
+        return createVecFromIndexedFnArbitrary<TDim, TTFnObj>(IdxSubSequence(), std::forward<TArgs>(args)...);
+    }
+
+    //#############################################################################
+    //! A n-dimensional vector.
+    template<typename TDim, typename TVal>
+    class Vec final
+    {
+    public:
+        static_assert(TDim::value >= 0u, "Invalid dimensionality");
+
+        using Dim = TDim;
+        using Val = TVal;
+
+    private:
+        //! A sequence of integers from 0 to dim-1.
+        //! This can be used to write compile time indexing algorithms.
+        using IdxSequence = std::make_integer_sequence<std::size_t, TDim::value>;
+
+    public:
         //-----------------------------------------------------------------------------
-        //! Single value constructor helper.
+        // The default constructor is only available when the vector is zero-dimensional.
         ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            typename TDim,
-            template<std::size_t> class TTFnObj,
-            typename... TArgs,
-            typename TIdxSize,
-            TIdxSize... TIndices>
-        ALPAKA_FN_HOST_ACC auto createVecFromIndexedFnArbitrary(
-            meta::IntegerSequence<TIdxSize, TIndices...> const & indices,
-            TArgs && ... args)
-#ifdef BOOST_NO_CXX14_RETURN_TYPE_DEDUCTION
-        -> Vec<TDim, decltype(TTFnObj<0>::create(std::forward<TArgs>(args)...))>
-#endif
+        template<bool B = (TDim::value == 0u), typename = std::enable_if_t<B>>
+        ALPAKA_FN_HOST_ACC Vec() : m_data{static_cast<TVal>(0u)}
         {
-            alpaka::ignore_unused(indices);
-
-            return Vec<TDim, decltype(TTFnObj<0>::create(std::forward<TArgs>(args)...))>(
-                (TTFnObj<TIndices>::create(std::forward<TArgs>(args)...))...);
         }
+
+
         //-----------------------------------------------------------------------------
-        //! Creator using func<idx>(args...) to initialize all values of the vector.
-        //! The idx is in the range [0, TDim].
+        //! Value constructor.
+        //! This constructor is only available if the number of parameters matches the vector dimension.
         ALPAKA_NO_HOST_ACC_WARNING
         template<
-            typename TDim,
-            template<std::size_t> class TTFnObj,
-            typename... TArgs>
-        ALPAKA_FN_HOST_ACC auto createVecFromIndexedFn(
-            TArgs && ... args)
-#ifdef BOOST_NO_CXX14_RETURN_TYPE_DEDUCTION
-        -> decltype(
-            createVecFromIndexedFnArbitrary<
-                TDim,
-                TTFnObj>(
-                    meta::MakeIntegerSequence<typename TDim::value_type, TDim::value>(),
-                    std::forward<TArgs>(args)...))
-#endif
+            typename TArg0,
+            typename... TArgs,
+            typename = std::enable_if_t<
+                // There have to be dim arguments.
+                (sizeof...(TArgs) + 1 == TDim::value) && (std::is_same<TVal, std::decay_t<TArg0>>::value)>>
+        ALPAKA_FN_HOST_ACC Vec(TArg0&& arg0, TArgs&&... args)
+            : m_data{std::forward<TArg0>(arg0), std::forward<TArgs>(args)...}
         {
-            using IdxSequence = meta::MakeIntegerSequence<typename TDim::value_type, TDim::value>;
-            return
-                createVecFromIndexedFnArbitrary<
-                    TDim,
-                    TTFnObj>(
-                        IdxSequence(),
-                        std::forward<TArgs>(args)...);
         }
+
         //-----------------------------------------------------------------------------
-        //! Creator using func<idx>(args...) to initialize all values of the vector.
-        //! The idx is in the range [TIdxOffset, TIdxOffset + TDim].
         ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            typename TDim,
-            template<std::size_t> class TTFnObj,
-            typename TIdxOffset,
-            typename... TArgs>
-        ALPAKA_FN_HOST_ACC auto createVecFromIndexedFnOffset(
-            TArgs && ... args)
-#ifdef BOOST_NO_CXX14_RETURN_TYPE_DEDUCTION
-        -> decltype(
-            createVecFromIndexedFnArbitrary<
-                TDim,
-                TTFnObj>(
-                    meta::ConvertIntegerSequence<typename TIdxOffset::value_type, meta::MakeIntegerSequenceOffset<std::intmax_t, TIdxOffset::value, TDim::value>>(),
-                    std::forward<TArgs>(args)...))
-#endif
-        {
-            using IdxSubSequenceSigned = meta::MakeIntegerSequenceOffset<std::intmax_t, TIdxOffset::value, TDim::value>;
-            using IdxSubSequence = meta::ConvertIntegerSequence<typename TIdxOffset::value_type, IdxSubSequenceSigned>;
-            return
-                createVecFromIndexedFnArbitrary<
-                    TDim,
-                    TTFnObj>(
-                        IdxSubSequence(),
-                        std::forward<TArgs>(args)...);
-        }
-#endif
+        ALPAKA_FN_HOST_ACC
+        Vec(Vec const&) = default;
+        //-----------------------------------------------------------------------------
+        ALPAKA_NO_HOST_ACC_WARNING
+        ALPAKA_FN_HOST_ACC
+        Vec(Vec&&) noexcept = default;
+        //-----------------------------------------------------------------------------
+        ALPAKA_NO_HOST_ACC_WARNING
+        ALPAKA_FN_HOST_ACC
+        auto operator=(Vec const&) -> Vec& = default;
+        //-----------------------------------------------------------------------------
+        ALPAKA_NO_HOST_ACC_WARNING
+        ALPAKA_FN_HOST_ACC
+        auto operator=(Vec&&) noexcept -> Vec& = default;
+        //-----------------------------------------------------------------------------
+        ALPAKA_NO_HOST_ACC_WARNING
+        ALPAKA_FN_HOST_ACC ~Vec() = default;
 
+    private:
         //#############################################################################
-        //! A n-dimensional vector.
-        template<
-            typename TDim,
-            typename TVal>
-        class Vec final
+        //! A function object that returns the given value for each index.
+        template<std::size_t Tidx>
+        struct CreateSingleVal
         {
-        public:
-            static_assert(TDim::value >= 0u, "Invalid dimensionality");
-
-            using Dim = TDim;
-            static constexpr auto s_uiDim = TDim::value;
-            using Val = TVal;
-
-        private:
-            //! A sequence of integers from 0 to dim-1.
-            //! This can be used to write compile time indexing algorithms.
-            using IdxSequence = meta::MakeIntegerSequence<std::size_t, TDim::value>;
-
-        public:
-            //-----------------------------------------------------------------------------
-            // The default constructor is only available when the vector is zero-dimensional.
-            ALPAKA_NO_HOST_ACC_WARNING
-            template<
-                bool B = (TDim::value == 0u),
-                typename = typename std::enable_if<B>::type>
-            ALPAKA_FN_HOST_ACC Vec() :
-                m_data{static_cast<TVal>(0u)}
-            {}
-
-
-            //-----------------------------------------------------------------------------
-            //! Value constructor.
-            //! This constructor is only available if the number of parameters matches the vector idx.
-            ALPAKA_NO_HOST_ACC_WARNING
-            template<
-                typename TArg0,
-                typename... TArgs,
-                typename = typename std::enable_if<
-                    // There have to be dim arguments.
-                    (sizeof...(TArgs)+1 == TDim::value)
-                    &&
-                    (std::is_same<TVal, typename std::decay<TArg0>::type>::value)
-                    >::type>
-            ALPAKA_FN_HOST_ACC Vec(
-                TArg0 && arg0,
-                TArgs && ... args) :
-                    m_data{std::forward<TArg0>(arg0), std::forward<TArgs>(args)...}
-            {}
-
-#ifdef ALPAKA_CREATE_VEC_IN_CLASS
             //-----------------------------------------------------------------------------
-            //! Creator using func<idx>(args...) to initialize all values of the vector.
             ALPAKA_NO_HOST_ACC_WARNING
-            template<
-                template<std::size_t> class TTFnObj,
-                typename... TArgs,
-                typename TIdxSize,
-                TIdxSize... TIndices>
-            ALPAKA_FN_HOST_ACC static auto createVecFromIndexedFnArbitrary(
-                meta::IntegerSequence<TIdxSize, TIndices...> const & indices,
-                TArgs && ... args)
-            -> Vec<TDim, TVal>
+            ALPAKA_FN_HOST_ACC static auto create(TVal const& val) -> TVal
             {
-                alpaka::ignore_unused(indices);
-
-                return Vec<TDim, TVal>(
-                    (TTFnObj<TIndices>::create(std::forward<TArgs>(args)...))...);
+                return val;
             }
-            //-----------------------------------------------------------------------------
-            //! Creator using func<idx>(args...) to initialize all values of the vector.
-            //! The idx is in the range [0, TDim].
-            ALPAKA_NO_HOST_ACC_WARNING
-            template<
-                template<std::size_t> class TTFnObj,
-                typename... TArgs>
-            ALPAKA_FN_HOST_ACC static auto createVecFromIndexedFn(
-                TArgs && ... args)
-            -> Vec<TDim, TVal>
-            {
-                return
-                    createVecFromIndexedFnArbitrary<
-                        TTFnObj>(
-                            IdxSequence(),
-                            std::forward<TArgs>(args)...);
-            }
-            //-----------------------------------------------------------------------------
-            //! Creator using func<idx>(args...) to initialize all values of the vector.
-            //! The idx is in the range [TIdxOffset, TIdxOffset + TDim].
-            ALPAKA_NO_HOST_ACC_WARNING
-            template<
-                template<std::size_t> class TTFnObj,
-                typename TIdxOffset,
-                typename... TArgs>
-            ALPAKA_FN_HOST_ACC static auto createVecFromIndexedFnOffset(
-                TArgs && ... args)
-            -> Vec<TDim, TVal>
-            {
-                using IdxSubSequenceSigned = meta::MakeIntegerSequenceOffset<std::intmax_t, TIdxOffset::value, TDim::value>;
-                using IdxSubSequence = meta::ConvertIntegerSequence<typename TDim::value_type, IdxSubSequenceSigned>;
-                return
-                    createVecFromIndexedFnArbitrary<
-                        TTFnObj>(
-                            IdxSubSequence(),
-                            std::forward<TArgs>(args)...);
-            }
-#endif
-
-            //-----------------------------------------------------------------------------
-            ALPAKA_NO_HOST_ACC_WARNING
-            ALPAKA_FN_HOST_ACC
-            Vec(Vec const &) = default;
-            //-----------------------------------------------------------------------------
-            ALPAKA_NO_HOST_ACC_WARNING
-            ALPAKA_FN_HOST_ACC
-            Vec(Vec &&) = default;
-            //-----------------------------------------------------------------------------
-            ALPAKA_NO_HOST_ACC_WARNING
-            ALPAKA_FN_HOST_ACC
-            auto operator=(Vec const &) -> Vec & = default;
-            //-----------------------------------------------------------------------------
-            ALPAKA_NO_HOST_ACC_WARNING
-            ALPAKA_FN_HOST_ACC
-            auto operator=(Vec &&) -> Vec & = default;
-            //-----------------------------------------------------------------------------
-            ALPAKA_NO_HOST_ACC_WARNING
-            ALPAKA_FN_HOST_ACC ~Vec() = default;
-
-        private:
-            //#############################################################################
-            //! A function object that returns the given value for each index.
-            template<
-                std::size_t Tidx>
-            struct CreateSingleVal
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_NO_HOST_ACC_WARNING
-                ALPAKA_FN_HOST_ACC static auto create(
-                    TVal const & val)
-                -> TVal
-                {
-                    return val;
-                }
-            };
-        public:
-            //-----------------------------------------------------------------------------
-            //! \brief Single value constructor.
-            //!
-            //! Creates a vector with all values set to val.
-            //! \param val The initial value.
-            ALPAKA_NO_HOST_ACC_WARNING
-            ALPAKA_FN_HOST_ACC static auto all(
-                TVal const & val)
-            -> Vec<TDim, TVal>
-            {
-                return
-                    createVecFromIndexedFn<
-#ifndef ALPAKA_CREATE_VEC_IN_CLASS
-                        TDim,
-#endif
-                        CreateSingleVal>(
-                            val);
-            }
-            //-----------------------------------------------------------------------------
-            //! Zero value constructor.
-            ALPAKA_NO_HOST_ACC_WARNING
-            ALPAKA_FN_HOST_ACC static auto zeros()
-            -> Vec<TDim, TVal>
-            {
-                return all(static_cast<TVal>(0));
-            }
-            //-----------------------------------------------------------------------------
-            //! One value constructor.
-            ALPAKA_NO_HOST_ACC_WARNING
-            ALPAKA_FN_HOST_ACC static auto ones()
-            -> Vec<TDim, TVal>
-            {
-                return all(static_cast<TVal>(1));
-            }
-
-            //-----------------------------------------------------------------------------
-            //! Value reference accessor at the given non-unsigned integer index.
-            //! \return A reference to the value at the given index.
-            ALPAKA_NO_HOST_ACC_WARNING
-            template<
-                typename TIdx,
-                typename = typename std::enable_if<
-                    std::is_integral<TIdx>::value>::type>
-            ALPAKA_FN_HOST_ACC auto operator[](
-                TIdx const iIdx)
-            -> TVal &
-            {
-                core::assertValueUnsigned(iIdx);
-                auto const idx(static_cast<typename TDim::value_type>(iIdx));
-                core::assertGreaterThan<TDim>(idx);
-                return m_data[idx];
-            }
-
-            //-----------------------------------------------------------------------------
-            //! Value accessor at the given non-unsigned integer index.
-            //! \return The value at the given index.
-            ALPAKA_NO_HOST_ACC_WARNING
-            template<
-                typename TIdx,
-                typename = typename std::enable_if<
-                    std::is_integral<TIdx>::value>::type>
-            ALPAKA_FN_HOST_ACC auto operator[](
-                TIdx const iIdx) const
-            -> TVal
-            {
-                core::assertValueUnsigned(iIdx);
-                auto const idx(static_cast<typename TDim::value_type>(iIdx));
-                core::assertGreaterThan<TDim>(idx);
-                return m_data[idx];
-            }
-
-            //-----------------------------------------------------------------------------
-            ALPAKA_NO_HOST_ACC_WARNING
-            ALPAKA_FN_HOST_ACC auto operator==(
-                Vec const & rhs) const
-            -> bool
-            {
-                for(typename TDim::value_type i(0); i < TDim::value; ++i)
-                {
-                    if((*this)[i] != rhs[i])
-                    {
-                        return false;
-                    }
-                }
-                return true;
-            }
-            //-----------------------------------------------------------------------------
-            ALPAKA_NO_HOST_ACC_WARNING
-            ALPAKA_FN_HOST_ACC auto operator!=(
-                Vec const & rhs) const
-            -> bool
-            {
-                return !((*this) == rhs);
-            }
-            //-----------------------------------------------------------------------------
-            ALPAKA_NO_HOST_ACC_WARNING
-            template<
-                typename TFnObj,
-                std::size_t... TIndices>
-            ALPAKA_FN_HOST_ACC auto foldrByIndices(
-                TFnObj const & f,
-                meta::IntegerSequence<std::size_t, TIndices...> const & indices) const
-#ifdef BOOST_NO_CXX14_RETURN_TYPE_DEDUCTION
-            -> decltype(
-                meta::foldr(
-                    f,
-                    ((*this)[TIndices])...))
-#endif
-            {
-                alpaka::ignore_unused(indices);
-
-                return
-                    meta::foldr(
-                        f,
-                        ((*this)[TIndices])...);
-            }
-            //-----------------------------------------------------------------------------
-            ALPAKA_NO_HOST_ACC_WARNING
-            template<
-                typename TFnObj>
-            ALPAKA_FN_HOST_ACC auto foldrAll(
-                TFnObj const & f) const
-#ifdef BOOST_NO_CXX14_RETURN_TYPE_DEDUCTION
-            -> decltype(
-#if (BOOST_COMP_GNUC && (BOOST_COMP_GNUC < BOOST_VERSION_NUMBER(5, 0, 0))) || BOOST_COMP_INTEL || BOOST_COMP_NVCC
-                this->foldrByIndices(
-#else
-                foldrByIndices(
-#endif
-                    f,
-                    IdxSequence()))
-#endif
-            {
-                return
-                    foldrByIndices(
-                        f,
-                        IdxSequence());
-            }
-// suppress strange warning produced by nvcc+MSVC in release mode
-#if BOOST_COMP_MSVC
-    #pragma warning(push)
-    #pragma warning(disable: 4702)  // unreachable code
-#endif
-            //-----------------------------------------------------------------------------
-            //! \return The product of all values.
-            ALPAKA_NO_HOST_ACC_WARNING
-            ALPAKA_FN_HOST_ACC auto prod() const
-            -> TVal
-            {
-                return foldrAll(
-                    [](TVal a, TVal b)
-                    {
-                        return static_cast<TVal>(a * b);
-                    });
-            }
-#if BOOST_COMP_MSVC
-    #pragma warning(pop)
-#endif
-            //-----------------------------------------------------------------------------
-            //! \return The sum of all values.
-            ALPAKA_NO_HOST_ACC_WARNING
-            ALPAKA_FN_HOST_ACC auto sum() const
-            -> TVal
-            {
-                return foldrAll(
-                    [](TVal a, TVal b)
-                    {
-                        return static_cast<TVal>(a + b);
-                    });
-            }
-            //-----------------------------------------------------------------------------
-            //! \return The min of all values.
-            ALPAKA_NO_HOST_ACC_WARNING
-            ALPAKA_FN_HOST_ACC auto min() const
-            -> TVal
-            {
-                return foldrAll(
-                    [](TVal a, TVal b)
-                    {
-                        return (b < a) ? b : a;
-                    });
-            }
-            //-----------------------------------------------------------------------------
-            //! \return The max of all values.
-            ALPAKA_NO_HOST_ACC_WARNING
-            ALPAKA_FN_HOST_ACC auto max() const
-            -> TVal
-            {
-                return foldrAll(
-                    [](TVal a, TVal b)
-                    {
-                        return (b > a) ? b : a;
-                    });
-            }
-            //-----------------------------------------------------------------------------
-            //! \return The index of the minimal element.
-            ALPAKA_FN_HOST auto minElem() const
-            -> typename TDim::value_type
-            {
-                return
-                    static_cast<typename TDim::value_type>(
-                        std::distance(
-                            std::begin(m_data),
-                            std::min_element(
-                                std::begin(m_data),
-                                std::end(m_data))));
-            }
-            //-----------------------------------------------------------------------------
-            //! \return The index of the maximal element.
-            ALPAKA_FN_HOST auto maxElem() const
-            -> typename TDim::value_type
-            {
-                return
-                    static_cast<typename TDim::value_type>(
-                        std::distance(
-                            std::begin(m_data),
-                            std::max_element(
-                                std::begin(m_data),
-                                std::end(m_data))));
-            }
-
-        private:
-            // Zero sized arrays are not allowed, therefore zero-dimensional vectors have one member.
-            TVal m_data[TDim::value == 0u ? 1u : TDim::value];
         };
 
+    public:
         //-----------------------------------------------------------------------------
-        //! This is a conveniance method to have a out-of-class factory method even though the out-of-class version is not supported by all compilers.
-        //! Depending of the compiler conformance, the internal or external factory function is called.
-        //! This has the draw-back, that it requires the TVal parameter even though it should not be necessary.
+        //! \brief Single value constructor.
+        //!
+        //! Creates a vector with all values set to val.
+        //! \param val The initial value.
         ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            typename TDim,
-            typename TVal,
-            template<std::size_t> class TTFnObj,
-            typename... TArgs>
-        ALPAKA_FN_HOST_ACC auto createVecFromIndexedFnWorkaround(
-            TArgs && ... args)
-#ifdef BOOST_NO_CXX14_RETURN_TYPE_DEDUCTION
-        -> alpaka::vec::Vec<TDim, TVal>
-#endif
+        ALPAKA_FN_HOST_ACC static auto all(TVal const& val) -> Vec<TDim, TVal>
         {
-            return
-                alpaka::vec::
-#ifdef ALPAKA_CREATE_VEC_IN_CLASS
-                Vec<TDim, TVal>::template
-#endif
-                createVecFromIndexedFn<
-#ifndef ALPAKA_CREATE_VEC_IN_CLASS
-                    TDim,
-#endif
-                    TTFnObj>(
-                        std::forward<TArgs>(args)...);
+            return createVecFromIndexedFn<TDim, CreateSingleVal>(val);
         }
-
         //-----------------------------------------------------------------------------
-        //! This is a conveniance method to have a out-of-class factory method even though the out-of-class version is not supported by all compilers.
-        //! Depending of the compiler conformance, the internal or external factory function is called.
-        //! This has the draw-back, that it requires the TVal parameter even though it should not be necessary.
+        //! Zero value constructor.
         ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            typename TDim,
-            typename TVal,
-            template<std::size_t> class TTFnObj,
-            typename TIdxOffset,
-            typename... TArgs>
-        ALPAKA_FN_HOST_ACC auto createVecFromIndexedFnOffsetWorkaround(
-            TArgs && ... args)
-#ifdef BOOST_NO_CXX14_RETURN_TYPE_DEDUCTION
-        -> alpaka::vec::Vec<TDim, TVal>
-#endif
+        ALPAKA_FN_HOST_ACC static auto zeros() -> Vec<TDim, TVal>
         {
-            return
-                alpaka::vec::
-#ifdef ALPAKA_CREATE_VEC_IN_CLASS
-                Vec<TDim, TVal>::template
-#endif
-                createVecFromIndexedFnOffset<
-#ifndef ALPAKA_CREATE_VEC_IN_CLASS
-                    TDim,
-#endif
-                    TTFnObj,
-                    TIdxOffset>(
-                        std::forward<TArgs>(args)...);
-        }
-
-        namespace detail
-        {
-            //#############################################################################
-            //! A function object that returns the sum of the two input vectors elements.
-            template<
-                std::size_t Tidx>
-            struct CreateAdd
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_NO_HOST_ACC_WARNING
-                template<
-                    typename TDim,
-                    typename TVal>
-                ALPAKA_FN_HOST_ACC static auto create(
-                    Vec<TDim, TVal> const & p,
-                    Vec<TDim, TVal> const & q)
-                -> TVal
-                {
-                    return p[Tidx] + q[Tidx];
-                }
-            };
+            return all(static_cast<TVal>(0));
         }
         //-----------------------------------------------------------------------------
-        //! \return The element-wise sum of two vectors.
+        //! One value constructor.
         ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            typename TDim,
-            typename TVal>
-        ALPAKA_FN_HOST_ACC auto operator+(
-            Vec<TDim, TVal> const & p,
-            Vec<TDim, TVal> const & q)
-        -> Vec<TDim, TVal>
+        ALPAKA_FN_HOST_ACC static auto ones() -> Vec<TDim, TVal>
         {
-            return
-                createVecFromIndexedFnWorkaround<
-                    TDim,
-                    TVal,
-                    detail::CreateAdd>(
-                        p,
-                        q);
+            return all(static_cast<TVal>(1));
         }
 
-        namespace detail
+        //-----------------------------------------------------------------------------
+        //! Value reference accessor at the given integral index (asserted to be non-negative).
+        //! \return A reference to the value at the given index.
+        ALPAKA_NO_HOST_ACC_WARNING
+        template<typename TIdx, typename = std::enable_if_t<std::is_integral<TIdx>::value>>
+        ALPAKA_FN_HOST_ACC auto operator[](TIdx const iIdx) -> TVal&
         {
-            //##################################################################################
-            //! A function object that returns the difference of the two input vectors elements.
-            template<
-                std::size_t Tidx>
-            struct CreateSub
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_NO_HOST_ACC_WARNING
-                template<
-                    typename TDim,
-                    typename TVal>
-                ALPAKA_FN_HOST_ACC static auto create(
-                    Vec<TDim, TVal> const & p,
-                    Vec<TDim, TVal> const & q)
-                -> TVal
-                {
-                    return p[Tidx] - q[Tidx];
-                }
-            };
+            core::assertValueUnsigned(iIdx);
+            auto const idx(static_cast<typename TDim::value_type>(iIdx));
+            core::assertGreaterThan<TDim>(idx);
+            return m_data[idx];
         }
 
         //-----------------------------------------------------------------------------
-        //! \return The element-wise difference of two vectors.
+        //! Value accessor at the given integral index (asserted to be non-negative).
+        //! \return The value at the given index.
         ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            typename TDim,
-            typename TVal>
-        ALPAKA_FN_HOST_ACC auto operator-(
-            Vec<TDim, TVal> const & p,
-            Vec<TDim, TVal> const & q)
-        -> Vec<TDim, TVal>
+        template<typename TIdx, typename = std::enable_if_t<std::is_integral<TIdx>::value>>
+        ALPAKA_FN_HOST_ACC auto operator[](TIdx const iIdx) const -> TVal
         {
-            return
-                createVecFromIndexedFnWorkaround<
-                    TDim,
-                    TVal,
-                    detail::CreateSub>(
-                        p,
-                        q);
+            core::assertValueUnsigned(iIdx);
+            auto const idx(static_cast<typename TDim::value_type>(iIdx));
+            core::assertGreaterThan<TDim>(idx);
+            return m_data[idx];
         }
 
-        namespace detail
+        //-----------------------------------------------------------------------------
+        ALPAKA_NO_HOST_ACC_WARNING
+        ALPAKA_FN_HOST_ACC auto operator==(Vec const& rhs) const -> bool
         {
-            //#############################################################################
-            //! A function object that returns the product of the two input vectors elements.
-            template<
-                std::size_t Tidx>
-            struct CreateMul
+            for(typename TDim::value_type i(0); i < TDim::value; ++i)
             {
-                //-----------------------------------------------------------------------------
-                ALPAKA_NO_HOST_ACC_WARNING
-                template<
-                    typename TDim,
-                    typename TVal>
-                ALPAKA_FN_HOST_ACC static auto create(
-                    Vec<TDim, TVal> const & p,
-                    Vec<TDim, TVal> const & q)
-                -> TVal
+                if((*this)[i] != rhs[i])
                 {
-                    return p[Tidx] * q[Tidx];
+                    return false;
                 }
-            };
+            }
+            return true;
         }
-
         //-----------------------------------------------------------------------------
-        //! \return The element-wise product of two vectors.
         ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            typename TDim,
-            typename TVal>
-        ALPAKA_FN_HOST_ACC auto operator*(
-            Vec<TDim, TVal> const & p,
-            Vec<TDim, TVal> const & q)
-        -> Vec<TDim, TVal>
+        ALPAKA_FN_HOST_ACC auto operator!=(Vec const& rhs) const -> bool
         {
-            return
-                createVecFromIndexedFnWorkaround<
-                    TDim,
-                    TVal,
-                    detail::CreateMul>(
-                        p,
-                        q);
+            return !((*this) == rhs);
         }
-
-        namespace detail
+        //-----------------------------------------------------------------------------
+        ALPAKA_NO_HOST_ACC_WARNING
+        template<typename TFnObj, std::size_t... TIndices>
+        ALPAKA_FN_HOST_ACC auto foldrByIndices(
+            TFnObj const& f,
+            std::integer_sequence<std::size_t, TIndices...> const& indices) const
         {
-            //#############################################################################
-            //! A function object that returns the element-wise less than relation of two vectors.
-            template<
-                std::size_t Tidx>
-            struct CreateLess
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_NO_HOST_ACC_WARNING
-                template<
-                    typename TDim,
-                    typename TVal>
-                ALPAKA_FN_HOST_ACC static auto create(
-                    Vec<TDim, TVal> const & p,
-                    Vec<TDim, TVal> const & q)
-                -> bool
-                {
-                    return p[Tidx] < q[Tidx];
-                }
-            };
-        }
+            alpaka::ignore_unused(indices);
 
+            return meta::foldr(f, ((*this)[TIndices])...);
+        }
         //-----------------------------------------------------------------------------
-        //! \return The element-wise less than relation of two vectors.
         ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            typename TDim,
-            typename TVal>
-        ALPAKA_FN_HOST_ACC auto operator<(
-            Vec<TDim, TVal> const & p,
-            Vec<TDim, TVal> const & q)
-        -> Vec<TDim, bool>
+        template<typename TFnObj>
+        ALPAKA_FN_HOST_ACC auto foldrAll(TFnObj const& f) const
         {
-            return
-                createVecFromIndexedFnWorkaround<
-                    TDim,
-                    bool,
-                    detail::CreateLess>(
-                        p,
-                        q);
+            return foldrByIndices(f, IdxSequence());
         }
-
-        namespace detail
+// suppress strange warning produced by nvcc+MSVC in release mode
+#if BOOST_COMP_MSVC || defined(BOOST_COMP_MSVC_EMULATED)
+#    pragma warning(push)
+#    pragma warning(disable : 4702) // unreachable code
+#endif
+        //-----------------------------------------------------------------------------
+        //! \return The product of all values.
+        ALPAKA_NO_HOST_ACC_WARNING
+        ALPAKA_FN_HOST_ACC auto prod() const -> TVal
         {
-            //#############################################################################
-            //! A function object that returns the element-wise less than or equal relation of two vectors.
-            template<
-                std::size_t Tidx>
-            struct CreateLessEqual
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_NO_HOST_ACC_WARNING
-                template<
-                    typename TDim,
-                    typename TVal>
-                ALPAKA_FN_HOST_ACC static auto create(
-                    Vec<TDim, TVal> const & p,
-                    Vec<TDim, TVal> const & q)
-                -> bool
-                {
-                    return p[Tidx] <= q[Tidx];
-                }
-            };
+            return foldrAll(std::multiplies<TVal>());
         }
-
+#if BOOST_COMP_MSVC || defined(BOOST_COMP_MSVC_EMULATED)
+#    pragma warning(pop)
+#endif
         //-----------------------------------------------------------------------------
-        //! \return The element-wise less than or equal relation of two vectors.
+        //! \return The sum of all values.
         ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            typename TDim,
-            typename TVal>
-        ALPAKA_FN_HOST_ACC auto operator<=(
-            Vec<TDim, TVal> const & p,
-            Vec<TDim, TVal> const & q)
-        -> Vec<TDim, bool>
+        ALPAKA_FN_HOST_ACC auto sum() const -> TVal
         {
-            return
-                createVecFromIndexedFnWorkaround<
-                    TDim,
-                    bool,
-                    detail::CreateLessEqual>(
-                        p,
-                        q);
+            return foldrAll(std::plus<TVal>());
         }
-
-        namespace detail
+        //-----------------------------------------------------------------------------
+        //! \return The min of all values.
+        ALPAKA_NO_HOST_ACC_WARNING
+        ALPAKA_FN_HOST_ACC auto min() const -> TVal
         {
-            //#############################################################################
-            //! A function object that returns the element-wise greater than or equal relation of two vectors.
-            template<
-                std::size_t Tidx>
-            struct CreateGreaterEqual
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_NO_HOST_ACC_WARNING
-                template<
-                    typename TDim,
-                    typename TVal>
-                ALPAKA_FN_HOST_ACC static auto create(
-                    Vec<TDim, TVal> const & p,
-                    Vec<TDim, TVal> const & q)
-                -> bool
-                {
-                    return p[Tidx] >= q[Tidx];
-                }
-            };
+            return foldrAll(meta::min<TVal>());
         }
-
         //-----------------------------------------------------------------------------
-        //! \return The element-wise greater than or equal relation of two vectors.
+        //! \return The max of all values.
         ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            typename TDim,
-            typename TVal>
-        ALPAKA_FN_HOST_ACC auto operator>=(
-            Vec<TDim, TVal> const & p,
-            Vec<TDim, TVal> const & q)
-        -> Vec<TDim, bool>
+        ALPAKA_FN_HOST_ACC auto max() const -> TVal
         {
-            return
-                createVecFromIndexedFnWorkaround<
-                    TDim,
-                    bool,
-                    detail::CreateGreaterEqual>(
-                        p,
-                        q);
+            return foldrAll(meta::max<TVal>());
         }
-
-        namespace detail
+        //-----------------------------------------------------------------------------
+        //! \return The index of the minimal element.
+        ALPAKA_FN_HOST auto minElem() const -> typename TDim::value_type
         {
-            //#############################################################################
-            //! A function object that returns the element-wise greater than relation of two vectors.
-            template<
-                std::size_t Tidx>
-            struct CreateGreater
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_NO_HOST_ACC_WARNING
-                template<
-                    typename TDim,
-                    typename TVal>
-                ALPAKA_FN_HOST_ACC static auto create(
-                    Vec<TDim, TVal> const & p,
-                    Vec<TDim, TVal> const & q)
-                -> bool
-                {
-                    return p[Tidx] > q[Tidx];
-                }
-            };
+            return static_cast<typename TDim::value_type>(
+                std::distance(std::begin(m_data), std::min_element(std::begin(m_data), std::end(m_data))));
         }
-
-        //-----------------------------------------------------------------------------
-        //! \return The element-wise greater than relation of two vectors.
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            typename TDim,
-            typename TVal>
-        ALPAKA_FN_HOST_ACC auto operator>(
-            Vec<TDim, TVal> const & p,
-            Vec<TDim, TVal> const & q)
-        -> Vec<TDim, bool>
+        //-----------------------------------------------------------------------------
+        //! \return The index of the maximal element.
+        ALPAKA_FN_HOST auto maxElem() const -> typename TDim::value_type
         {
-            return
-                createVecFromIndexedFnWorkaround<
-                    TDim,
-                    bool,
-                    detail::CreateGreater>(
-                        p,
-                        q);
+            return static_cast<typename TDim::value_type>(
+                std::distance(std::begin(m_data), std::max_element(std::begin(m_data), std::end(m_data))));
         }
 
-        //-----------------------------------------------------------------------------
-        //! Stream out operator.
-        template<
-            typename TDim,
-            typename TVal>
-        ALPAKA_FN_HOST auto operator<<(
-            std::ostream & os,
-            Vec<TDim, TVal> const & v)
-        -> std::ostream &
+    private:
+        // Zero sized arrays are not allowed, therefore zero-dimensional vectors have one member.
+        TVal m_data[TDim::value == 0u ? 1u : TDim::value];
+    };
+
+    namespace detail
+    {
+        //#############################################################################
+        //! This is used to create a Vec by applying a binary operation onto the corresponding elements of two input
+        //! vectors.
+        template<template<typename> class TFnObj, std::size_t Tidx>
+        struct CreateVecByApplyingBinaryFnToTwoIndexedVecs
         {
-            os << "(";
-            for(typename TDim::value_type i(0); i<TDim::value; ++i)
+            //-----------------------------------------------------------------------------
+            ALPAKA_NO_HOST_ACC_WARNING
+            template<typename TDim, typename TVal>
+            ALPAKA_FN_HOST_ACC static auto create(Vec<TDim, TVal> const& p, Vec<TDim, TVal> const& q)
             {
-                os << v[i];
-                if(i != TDim::value-1)
-                {
-                    os << ", ";
-                }
+                return TFnObj<TVal>()(p[Tidx], q[Tidx]);
             }
-            os << ")";
+        };
+    } // namespace detail
 
-            return os;
-        }
+    namespace detail
+    {
+        template<std::size_t Tidx>
+        using CreateVecFromTwoIndexedVecsPlus = CreateVecByApplyingBinaryFnToTwoIndexedVecs<std::plus, Tidx>;
     }
 
-    namespace dim
+    //-----------------------------------------------------------------------------
+    //! \return The element-wise sum of two vectors.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename TDim, typename TVal>
+    ALPAKA_FN_HOST_ACC auto operator+(Vec<TDim, TVal> const& p, Vec<TDim, TVal> const& q) -> Vec<TDim, TVal>
     {
-        namespace traits
-        {
-            //#############################################################################
-            //! The Vec dimension get trait specialization.
-            template<
-                typename TDim,
-                typename TVal>
-            struct DimType<
-                vec::Vec<TDim, TVal>>
-            {
-                using type = TDim;
-            };
-        }
+        return createVecFromIndexedFn<TDim, detail::CreateVecFromTwoIndexedVecsPlus>(p, q);
+    }
+
+    namespace detail
+    {
+        template<std::size_t Tidx>
+        using CreateVecFromTwoIndexedVecsMinus = CreateVecByApplyingBinaryFnToTwoIndexedVecs<std::minus, Tidx>;
     }
-    namespace idx
+
+    //-----------------------------------------------------------------------------
+    //! \return The element-wise difference of two vectors.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename TDim, typename TVal>
+    ALPAKA_FN_HOST_ACC auto operator-(Vec<TDim, TVal> const& p, Vec<TDim, TVal> const& q) -> Vec<TDim, TVal>
     {
-        namespace traits
+        return createVecFromIndexedFn<TDim, detail::CreateVecFromTwoIndexedVecsMinus>(p, q);
+    }
+
+    namespace detail
+    {
+        template<std::size_t Tidx>
+        using CreateVecFromTwoIndexedVecsMul = CreateVecByApplyingBinaryFnToTwoIndexedVecs<std::multiplies, Tidx>;
+    }
+
+    //-----------------------------------------------------------------------------
+    //! \return The element-wise product of two vectors.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename TDim, typename TVal>
+    ALPAKA_FN_HOST_ACC auto operator*(Vec<TDim, TVal> const& p, Vec<TDim, TVal> const& q) -> Vec<TDim, TVal>
+    {
+        return createVecFromIndexedFn<TDim, detail::CreateVecFromTwoIndexedVecsMul>(p, q);
+    }
+
+    namespace detail
+    {
+        template<std::size_t Tidx>
+        using CreateVecFromTwoIndexedVecsLess = CreateVecByApplyingBinaryFnToTwoIndexedVecs<std::less, Tidx>;
+    }
+
+    //-----------------------------------------------------------------------------
+    //! \return The element-wise less than relation of two vectors.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename TDim, typename TVal>
+    ALPAKA_FN_HOST_ACC auto operator<(Vec<TDim, TVal> const& p, Vec<TDim, TVal> const& q) -> Vec<TDim, bool>
+    {
+        return createVecFromIndexedFn<TDim, detail::CreateVecFromTwoIndexedVecsLess>(p, q);
+    }
+
+    namespace detail
+    {
+        template<std::size_t Tidx>
+        using CreateVecFromTwoIndexedVecsLessEqual
+            = CreateVecByApplyingBinaryFnToTwoIndexedVecs<std::less_equal, Tidx>;
+    }
+
+    //-----------------------------------------------------------------------------
+    //! \return The element-wise less than or equal relation of two vectors.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename TDim, typename TVal>
+    ALPAKA_FN_HOST_ACC auto operator<=(Vec<TDim, TVal> const& p, Vec<TDim, TVal> const& q) -> Vec<TDim, bool>
+    {
+        return createVecFromIndexedFn<TDim, detail::CreateVecFromTwoIndexedVecsLessEqual>(p, q);
+    }
+
+    namespace detail
+    {
+        template<std::size_t Tidx>
+        using CreateVecFromTwoIndexedVecsGreaterEqual
+            = CreateVecByApplyingBinaryFnToTwoIndexedVecs<std::greater_equal, Tidx>;
+    }
+
+    //-----------------------------------------------------------------------------
+    //! \return The element-wise greater than or equal relation of two vectors.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename TDim, typename TVal>
+    ALPAKA_FN_HOST_ACC auto operator>=(Vec<TDim, TVal> const& p, Vec<TDim, TVal> const& q) -> Vec<TDim, bool>
+    {
+        return createVecFromIndexedFn<TDim, detail::CreateVecFromTwoIndexedVecsGreaterEqual>(p, q);
+    }
+
+    namespace detail
+    {
+        template<std::size_t Tidx>
+        using CreateVecFromTwoIndexedVecsGreater = CreateVecByApplyingBinaryFnToTwoIndexedVecs<std::greater, Tidx>;
+    }
+
+    //-----------------------------------------------------------------------------
+    //! \return The element-wise greater than relation of two vectors.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename TDim, typename TVal>
+    ALPAKA_FN_HOST_ACC auto operator>(Vec<TDim, TVal> const& p, Vec<TDim, TVal> const& q) -> Vec<TDim, bool>
+    {
+        return createVecFromIndexedFn<TDim, detail::CreateVecFromTwoIndexedVecsGreater>(p, q);
+    }
+
+    //-----------------------------------------------------------------------------
+    //! Stream out operator.
+    template<typename TDim, typename TVal>
+    ALPAKA_FN_HOST auto operator<<(std::ostream& os, Vec<TDim, TVal> const& v) -> std::ostream&
+    {
+        os << "(";
+        for(typename TDim::value_type i(0); i < TDim::value; ++i)
         {
-            //#############################################################################
-            //! The Vec idx type trait specialization.
-            template<
-                typename TDim,
-                typename TVal>
-            struct IdxType<
-                vec::Vec<TDim, TVal>>
+            os << v[i];
+            if(i != TDim::value - 1)
             {
-                using type = TVal;
-            };
+                os << ", ";
+            }
         }
+        os << ")";
+
+        return os;
     }
-    namespace vec
+
+    namespace traits
     {
-        namespace traits
+        //#############################################################################
+        //! The Vec dimension get trait specialization.
+        template<typename TDim, typename TVal>
+        struct DimType<Vec<TDim, TVal>>
         {
-            //#############################################################################
-            //! Specialization for selecting a sub-vector.
-            template<
-                typename TDim,
-                typename TVal,
-                std::size_t... TIndices>
-            struct SubVecFromIndices<
-                Vec<TDim, TVal>,
-                meta::IntegerSequence<std::size_t, TIndices...>,
-                typename std::enable_if<
-                    !std::is_same<
-                        meta::IntegerSequence<std::size_t, TIndices...>,
-                        meta::MakeIntegerSequence<std::size_t, TDim::value>
-                    >::value
-                >::type>
+            using type = TDim;
+        };
+
+        //#############################################################################
+        //! The Vec idx type trait specialization.
+        template<typename TDim, typename TVal>
+        struct IdxType<Vec<TDim, TVal>>
+        {
+            using type = TVal;
+        };
+
+        //#############################################################################
+        //! Specialization for selecting a sub-vector.
+        template<typename TDim, typename TVal, std::size_t... TIndices>
+        struct SubVecFromIndices<
+            Vec<TDim, TVal>,
+            std::integer_sequence<std::size_t, TIndices...>,
+            std::enable_if_t<!std::is_same<
+                std::integer_sequence<std::size_t, TIndices...>,
+                std::make_integer_sequence<std::size_t, TDim::value>>::value>>
+        {
+            ALPAKA_NO_HOST_ACC_WARNING
+            ALPAKA_FN_HOST_ACC static auto subVecFromIndices(Vec<TDim, TVal> const& vec)
+                -> Vec<DimInt<sizeof...(TIndices)>, TVal>
             {
-                ALPAKA_NO_HOST_ACC_WARNING
-                ALPAKA_FN_HOST_ACC static auto subVecFromIndices(
-                    Vec<TDim, TVal> const & vec)
-                -> Vec<dim::DimInt<sizeof...(TIndices)>, TVal>
-                {
-                    // In the case of a zero dimensional vector, vec is unused.
-                    alpaka::ignore_unused(vec);
+                // In the case of a zero dimensional vector, vec is unused.
+                alpaka::ignore_unused(vec);
 
-                    static_assert(sizeof...(TIndices) <= TDim::value, "The sub-vector has to be smaller (or same idx) then the origin vector.");
+                static_assert(
+                    sizeof...(TIndices) <= TDim::value,
+                    "The sub-vector has to be smaller (or same size) than the origin vector.");
 
-                    return Vec<dim::DimInt<sizeof...(TIndices)>, TVal>(vec[TIndices]...);
-                }
-            };
-            //#############################################################################
-            //! Specialization for selecting the whole vector.
-            template<
-                typename TDim,
-                typename TVal>
-            struct SubVecFromIndices<
-                Vec<TDim, TVal>,
-                meta::MakeIntegerSequence<std::size_t, TDim::value>>
+                return Vec<DimInt<sizeof...(TIndices)>, TVal>(vec[TIndices]...);
+            }
+        };
+        //#############################################################################
+        //! Specialization for selecting the whole vector.
+        template<typename TDim, typename TVal, std::size_t... TIndices>
+        struct SubVecFromIndices<
+            Vec<TDim, TVal>,
+            std::integer_sequence<std::size_t, TIndices...>,
+            std::enable_if_t<std::is_same<
+                std::integer_sequence<std::size_t, TIndices...>,
+                std::make_integer_sequence<std::size_t, TDim::value>>::value>>
+        {
+            ALPAKA_NO_HOST_ACC_WARNING
+            ALPAKA_FN_HOST_ACC static auto subVecFromIndices(Vec<TDim, TVal> const& vec) -> Vec<TDim, TVal>
             {
-                ALPAKA_NO_HOST_ACC_WARNING
-                ALPAKA_FN_HOST_ACC static auto subVecFromIndices(
-                    Vec<TDim, TVal> const & vec)
-                -> Vec<TDim, TVal>
-                {
-                    return vec;
-                }
-            };
-        }
+                return vec;
+            }
+        };
+    } // namespace traits
 
-        namespace detail
+    namespace detail
+    {
+        //#############################################################################
+        //! A function object that returns the vector element at the index, cast to the new value type.
+        template<std::size_t Tidx>
+        struct CreateCast
         {
-            //#############################################################################
-            //! A function object that returns the given value for each index.
-            template<
-                std::size_t Tidx>
-            struct CreateCast
+            //-----------------------------------------------------------------------------
+            ALPAKA_NO_HOST_ACC_WARNING
+            template<typename TSizeNew, typename TDim, typename TVal>
+            ALPAKA_FN_HOST_ACC static auto create(TSizeNew const& /* valNew*/, Vec<TDim, TVal> const& vec) -> TSizeNew
             {
-                //-----------------------------------------------------------------------------
-                ALPAKA_NO_HOST_ACC_WARNING
-                template<
-                    typename TSizeNew,
-                    typename TDim,
-                    typename TVal>
-                ALPAKA_FN_HOST_ACC static auto create(
-                    TSizeNew const &/* valNew*/,
-                    Vec<TDim, TVal> const & vec)
-                -> TSizeNew
-                {
-                    return
-                        static_cast<TSizeNew>(
-                            vec[Tidx]);
-                }
-            };
-        }
-        namespace traits
+                return static_cast<TSizeNew>(vec[Tidx]);
+            }
+        };
+    } // namespace detail
+    namespace traits
+    {
+        //#############################################################################
+        //! CastVec specialization for Vec.
+        template<typename TSizeNew, typename TDim, typename TVal>
+        struct CastVec<TSizeNew, Vec<TDim, TVal>>
         {
-            //#############################################################################
-            //! Cast specialization for Vec.
-            template<
-                typename TSizeNew,
-                typename TDim,
-                typename TVal>
-            struct Cast<
-                TSizeNew,
-                Vec<TDim, TVal>>
+            //-----------------------------------------------------------------------------
+            ALPAKA_NO_HOST_ACC_WARNING
+            ALPAKA_FN_HOST_ACC static auto castVec(Vec<TDim, TVal> const& vec) -> Vec<TDim, TSizeNew>
             {
-                //-----------------------------------------------------------------------------
-                ALPAKA_NO_HOST_ACC_WARNING
-                ALPAKA_FN_HOST_ACC static auto cast(
-                    Vec<TDim, TVal> const & vec)
-                -> Vec<TDim, TSizeNew>
-                {
-                    return
-                        createVecFromIndexedFnWorkaround<
-                            TDim,
-                            TSizeNew,
-                            vec::detail::CreateCast>(
-                                TSizeNew(),
-                                vec);
-                }
-            };
+                return createVecFromIndexedFn<TDim, alpaka::detail::CreateCast>(TSizeNew(), vec);
+            }
+        };
 
-            //#############################################################################
-            //! (Non-)Cast specialization for Vec when src and dst types are identical.
-            //#############################################################################
-            template<
-                typename TDim,
-                typename TVal>
-            struct Cast<
-                TVal,
-                Vec<TDim, TVal>>
+        //#############################################################################
+        //! (Non-)CastVec specialization for Vec when src and dst types are identical.
+        //#############################################################################
+        template<typename TDim, typename TVal>
+        struct CastVec<TVal, Vec<TDim, TVal>>
+        {
+            //-----------------------------------------------------------------------------
+            ALPAKA_NO_HOST_ACC_WARNING
+            ALPAKA_FN_HOST_ACC static auto castVec(Vec<TDim, TVal> const& vec) -> Vec<TDim, TVal>
             {
-                //-----------------------------------------------------------------------------
-                ALPAKA_NO_HOST_ACC_WARNING
-                ALPAKA_FN_HOST_ACC static auto cast(
-                    Vec<TDim, TVal> const & vec)
-                -> Vec<TDim, TVal>
-                {
-                    return vec;
-                }
-            };
-        }
+                return vec;
+            }
+        };
+    } // namespace traits
 
-        namespace detail
+    namespace detail
+    {
+        //#############################################################################
+        //! A function object that returns the value at the index from the back of the vector.
+        template<std::size_t Tidx>
+        struct CreateReverse
         {
-            //#############################################################################
-            //! A function object that returns the value at the index from the back of the vector.
-            template<
-                std::size_t Tidx>
-            struct CreateReverse
+            //-----------------------------------------------------------------------------
+            ALPAKA_NO_HOST_ACC_WARNING
+            template<typename TDim, typename TVal>
+            ALPAKA_FN_HOST_ACC static auto create(Vec<TDim, TVal> const& vec) -> TVal
             {
-                //-----------------------------------------------------------------------------
-                ALPAKA_NO_HOST_ACC_WARNING
-                template<
-                    typename TDim,
-                    typename TVal>
-                ALPAKA_FN_HOST_ACC static auto create(
-                    Vec<TDim, TVal> const & vec)
-                -> TVal
-                {
-                    return vec[TDim::value - 1u - Tidx];
-                }
-            };
-        }
-        namespace traits
+                return vec[TDim::value - 1u - Tidx];
+            }
+        };
+    } // namespace detail
+    namespace traits
+    {
+        //#############################################################################
+        //! ReverseVec specialization for Vec.
+        template<typename TDim, typename TVal>
+        struct ReverseVec<Vec<TDim, TVal>>
         {
-            //#############################################################################
-            //! Reverse specialization for Vec.
-            template<
-                typename TDim,
-                typename TVal>
-            struct Reverse<
-                Vec<TDim, TVal>>
+            ALPAKA_NO_HOST_ACC_WARNING
+            ALPAKA_FN_HOST_ACC static auto reverseVec(Vec<TDim, TVal> const& vec) -> Vec<TDim, TVal>
             {
-                ALPAKA_NO_HOST_ACC_WARNING
-                ALPAKA_FN_HOST_ACC static auto reverse(
-                    Vec<TDim, TVal> const & vec)
-                -> Vec<TDim, TVal>
-                {
-                    return
-                        createVecFromIndexedFnWorkaround<
-                            TDim,
-                            TVal,
-                            vec::detail::CreateReverse>(
-                                vec);
-                }
-            };
+                return createVecFromIndexedFn<TDim, alpaka::detail::CreateReverse>(vec);
+            }
+        };
 
-            //#############################################################################
-            //! (Non-)Reverse specialization for 1D Vec.
-            template<
-                typename TVal>
-            struct Reverse<
-                Vec<dim::DimInt<1u>, TVal>>
+        //#############################################################################
+        //! (Non-)ReverseVec specialization for 1D Vec.
+        template<typename TVal>
+        struct ReverseVec<Vec<DimInt<1u>, TVal>>
+        {
+            ALPAKA_NO_HOST_ACC_WARNING
+            ALPAKA_FN_HOST_ACC static auto reverseVec(Vec<DimInt<1u>, TVal> const& vec) -> Vec<DimInt<1u>, TVal>
             {
-                ALPAKA_NO_HOST_ACC_WARNING
-                ALPAKA_FN_HOST_ACC static auto reverse(
-                    Vec<dim::DimInt<1u>, TVal> const & vec)
-                -> Vec<dim::DimInt<1u>, TVal>
-                {
-                    return vec;
-                }
-            };
-        }
+                return vec;
+            }
+        };
+    } // namespace traits
 
-        namespace detail
+    namespace detail
+    {
+        //#############################################################################
+        //! A function object that returns the element at the index of the concatenation of two vectors.
+        template<std::size_t Tidx>
+        struct CreateConcat
         {
-            //#############################################################################
-            //! A function object that returns the value at the index from the back of the vector.
-            template<
-                std::size_t Tidx>
-            struct CreateConcat
+            //-----------------------------------------------------------------------------
+            ALPAKA_NO_HOST_ACC_WARNING
+            template<typename TDimL, typename TDimR, typename TVal>
+            ALPAKA_FN_HOST_ACC static auto create(Vec<TDimL, TVal> const& vecL, Vec<TDimR, TVal> const& vecR) -> TVal
             {
-                //-----------------------------------------------------------------------------
-                ALPAKA_NO_HOST_ACC_WARNING
-                template<
-                    typename TDimL,
-                    typename TDimR,
-                    typename TVal>
-                ALPAKA_FN_HOST_ACC static auto create(
-                    Vec<TDimL, TVal> const & vecL,
-                    Vec<TDimR, TVal> const & vecR)
-                -> TVal
-                {
-                    return Tidx < TDimL::value ? vecL[Tidx] : vecR[Tidx - TDimL::value];
-                }
-            };
-        }
-        namespace traits
+                return Tidx < TDimL::value ? vecL[Tidx] : vecR[Tidx - TDimL::value];
+            }
+        };
+    } // namespace detail
+    namespace traits
+    {
+        //#############################################################################
+        //! Concatenation specialization for Vec.
+        template<typename TDimL, typename TDimR, typename TVal>
+        struct ConcatVec<Vec<TDimL, TVal>, Vec<TDimR, TVal>>
         {
-            //#############################################################################
-            //! Concatenation specialization for Vec.
-            template<
-                typename TDimL,
-                typename TDimR,
-                typename TVal>
-            struct Concat<
-                Vec<TDimL, TVal>,
-                Vec<TDimR, TVal>>
+            ALPAKA_NO_HOST_ACC_WARNING
+            ALPAKA_FN_HOST_ACC static auto concatVec(Vec<TDimL, TVal> const& vecL, Vec<TDimR, TVal> const& vecR)
+                -> Vec<DimInt<TDimL::value + TDimR::value>, TVal>
             {
-                ALPAKA_NO_HOST_ACC_WARNING
-                ALPAKA_FN_HOST_ACC static auto concat(
-                    Vec<TDimL, TVal> const & vecL,
-                    Vec<TDimR, TVal> const & vecR)
-                -> Vec<dim::DimInt<TDimL::value + TDimR::value>, TVal>
-                {
-                    return
-                        createVecFromIndexedFnWorkaround<
-                            dim::DimInt<TDimL::value + TDimR::value>,
-                            TVal,
-                            vec::detail::CreateConcat>(
-                                vecL,
-                                vecR);
-                }
-            };
-        }
-    }
+                return createVecFromIndexedFn<DimInt<TDimL::value + TDimR::value>, alpaka::detail::CreateConcat>(
+                    vecL,
+                    vecR);
+            }
+        };
+    } // namespace traits
 
     namespace extent
     {
@@ -1162,120 +629,78 @@ namespace alpaka
         {
             //#############################################################################
             //! A function object that returns the extent for each index.
-            template<
-                std::size_t Tidx>
+            template<std::size_t Tidx>
             struct CreateExtent
             {
                 //-----------------------------------------------------------------------------
                 ALPAKA_NO_HOST_ACC_WARNING
-                template<
-                    typename TExtent>
-                ALPAKA_FN_HOST_ACC static auto create(
-                    TExtent const & extent)
-                -> idx::Idx<TExtent>
+                template<typename TExtent>
+                ALPAKA_FN_HOST_ACC static auto create(TExtent const& extent) -> Idx<TExtent>
                 {
                     return extent::getExtent<Tidx>(extent);
                 }
             };
-        }
+        } // namespace detail
         //-----------------------------------------------------------------------------
         //! \tparam TExtent has to specialize extent::GetExtent.
         //! \return The extent vector.
         ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            typename TExtent>
-        ALPAKA_FN_HOST_ACC auto getExtentVec(
-            TExtent const & extent = TExtent())
-        -> vec::Vec<dim::Dim<TExtent>, idx::Idx<TExtent>>
+        template<typename TExtent>
+        ALPAKA_FN_HOST_ACC auto getExtentVec(TExtent const& extent = TExtent()) -> Vec<Dim<TExtent>, Idx<TExtent>>
         {
-            return
-                vec::createVecFromIndexedFnWorkaround<
-                    dim::Dim<TExtent>,
-                    idx::Idx<TExtent>,
-                    detail::CreateExtent>(
-                        extent);
+            return createVecFromIndexedFn<Dim<TExtent>, detail::CreateExtent>(extent);
         }
         //-----------------------------------------------------------------------------
         //! \tparam TExtent has to specialize extent::GetExtent.
         //! \return The extent but only the last N elements.
         ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            typename TDim,
-            typename TExtent>
-        ALPAKA_FN_HOST_ACC auto getExtentVecEnd(
-            TExtent const & extent = TExtent())
-        -> vec::Vec<TDim, idx::Idx<TExtent>>
+        template<typename TDim, typename TExtent>
+        ALPAKA_FN_HOST_ACC auto getExtentVecEnd(TExtent const& extent = TExtent()) -> Vec<TDim, Idx<TExtent>>
         {
-            using IdxOffset = std::integral_constant<std::intmax_t, static_cast<std::intmax_t>(dim::Dim<TExtent>::value) - static_cast<std::intmax_t>(TDim::value)>;
-            return
-                vec::createVecFromIndexedFnOffsetWorkaround<
-                    TDim,
-                    idx::Idx<TExtent>,
-                    detail::CreateExtent,
-                    IdxOffset>(
-                        extent);
+            using IdxOffset = std::integral_constant<
+                std::intmax_t,
+                static_cast<std::intmax_t>(Dim<TExtent>::value) - static_cast<std::intmax_t>(TDim::value)>;
+            return createVecFromIndexedFnOffset<TDim, detail::CreateExtent, IdxOffset>(extent);
         }
-    }
+    } // namespace extent
 
-    namespace offset
+    namespace detail
     {
-        namespace detail
+        //#############################################################################
+        //! A function object that returns the offsets for each index.
+        template<std::size_t Tidx>
+        struct CreateOffset
         {
-            //#############################################################################
-            //! A function object that returns the offsets for each index.
-            template<
-                std::size_t Tidx>
-            struct CreateOffset
+            //-----------------------------------------------------------------------------
+            ALPAKA_NO_HOST_ACC_WARNING
+            template<typename TOffsets>
+            ALPAKA_FN_HOST_ACC static auto create(TOffsets const& offsets) -> Idx<TOffsets>
             {
-                //-----------------------------------------------------------------------------
-                ALPAKA_NO_HOST_ACC_WARNING
-                template<
-                    typename TOffsets>
-                ALPAKA_FN_HOST_ACC static auto create(
-                    TOffsets const & offsets)
-                -> idx::Idx<TOffsets>
-                {
-                    return offset::getOffset<Tidx>(offsets);
-                }
-            };
-        }
-        //-----------------------------------------------------------------------------
-        //! \tparam TOffsets has to specialize offset::GetOffset.
-        //! \return The offset vector.
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            typename TOffsets>
-        ALPAKA_FN_HOST_ACC auto getOffsetVec(
-            TOffsets const & offsets = TOffsets())
-        -> vec::Vec<dim::Dim<TOffsets>, idx::Idx<TOffsets>>
-        {
-            return
-                vec::createVecFromIndexedFnWorkaround<
-                    dim::Dim<TOffsets>,
-                    idx::Idx<TOffsets>,
-                    detail::CreateOffset>(
-                        offsets);
-        }
-        //-----------------------------------------------------------------------------
-        //! \tparam TOffsets has to specialize offset::GetOffset.
-        //! \return The offset vector but only the last N elements.
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            typename TDim,
-            typename TOffsets>
-        ALPAKA_FN_HOST_ACC auto getOffsetVecEnd(
-            TOffsets const & offsets = TOffsets())
-        -> vec::Vec<TDim, idx::Idx<TOffsets>>
-        {
-            using IdxOffset = std::integral_constant<std::size_t, static_cast<std::size_t>(static_cast<std::intmax_t>(dim::Dim<TOffsets>::value) - static_cast<std::intmax_t>(TDim::value))>;
-            return
-                vec::createVecFromIndexedFnOffsetWorkaround<
-                    TDim,
-                    idx::Idx<TOffsets>,
-                    detail::CreateOffset,
-                    IdxOffset>(
-                        offsets);
-        }
+                return getOffset<Tidx>(offsets);
+            }
+        };
+    } // namespace detail
+    //-----------------------------------------------------------------------------
+    //! \tparam TOffsets has to specialize GetOffset.
+    //! \return The offset vector.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename TOffsets>
+    ALPAKA_FN_HOST_ACC auto getOffsetVec(TOffsets const& offsets = TOffsets()) -> Vec<Dim<TOffsets>, Idx<TOffsets>>
+    {
+        return createVecFromIndexedFn<Dim<TOffsets>, detail::CreateOffset>(offsets);
+    }
+    //-----------------------------------------------------------------------------
+    //! \tparam TOffsets has to specialize GetOffset.
+    //! \return The offset vector but only the last N elements.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename TDim, typename TOffsets>
+    ALPAKA_FN_HOST_ACC auto getOffsetVecEnd(TOffsets const& offsets = TOffsets()) -> Vec<TDim, Idx<TOffsets>>
+    {
+        using IdxOffset = std::integral_constant<
+            std::size_t,
+            static_cast<std::size_t>(
+                static_cast<std::intmax_t>(Dim<TOffsets>::value) - static_cast<std::intmax_t>(TDim::value))>;
+        return createVecFromIndexedFnOffset<TDim, detail::CreateOffset, IdxOffset>(offsets);
     }
     namespace extent
     {
@@ -1283,92 +708,65 @@ namespace alpaka
         {
             //#############################################################################
             //! The Vec extent get trait specialization.
-            template<
-                typename TIdxIntegralConst,
-                typename TDim,
-                typename TVal>
+            template<typename TIdxIntegralConst, typename TDim, typename TVal>
             struct GetExtent<
                 TIdxIntegralConst,
-                vec::Vec<TDim, TVal>,
-                typename std::enable_if<(TDim::value > TIdxIntegralConst::value)>::type>
+                Vec<TDim, TVal>,
+                std::enable_if_t<(TDim::value > TIdxIntegralConst::value)>>
             {
                 ALPAKA_NO_HOST_ACC_WARNING
-                ALPAKA_FN_HOST_ACC static auto getExtent(
-                    vec::Vec<TDim, TVal> const & extent)
-                -> TVal
+                ALPAKA_FN_HOST_ACC static auto getExtent(Vec<TDim, TVal> const& extent) -> TVal
                 {
                     return extent[TIdxIntegralConst::value];
                 }
             };
             //#############################################################################
             //! The Vec extent set trait specialization.
-            template<
-                typename TIdxIntegralConst,
-                typename TDim,
-                typename TVal,
-                typename TExtentVal>
+            template<typename TIdxIntegralConst, typename TDim, typename TVal, typename TExtentVal>
             struct SetExtent<
                 TIdxIntegralConst,
-                vec::Vec<TDim, TVal>,
+                Vec<TDim, TVal>,
                 TExtentVal,
-                typename std::enable_if<(TDim::value > TIdxIntegralConst::value)>::type>
+                std::enable_if_t<(TDim::value > TIdxIntegralConst::value)>>
             {
                 ALPAKA_NO_HOST_ACC_WARNING
-                ALPAKA_FN_HOST_ACC static auto setExtent(
-                    vec::Vec<TDim, TVal> & extent,
-                    TExtentVal const & extentVal)
-                -> void
+                ALPAKA_FN_HOST_ACC static auto setExtent(Vec<TDim, TVal>& extent, TExtentVal const& extentVal) -> void
                 {
                     extent[TIdxIntegralConst::value] = extentVal;
                 }
             };
-        }
-    }
-    namespace offset
+        } // namespace traits
+    } // namespace extent
+    namespace traits
     {
-        namespace traits
+        //#############################################################################
+        //! The Vec offset get trait specialization.
+        template<typename TIdxIntegralConst, typename TDim, typename TVal>
+        struct GetOffset<
+            TIdxIntegralConst,
+            Vec<TDim, TVal>,
+            std::enable_if_t<(TDim::value > TIdxIntegralConst::value)>>
         {
-            //#############################################################################
-            //! The Vec offset get trait specialization.
-            template<
-                typename TIdxIntegralConst,
-                typename TDim,
-                typename TVal>
-            struct GetOffset<
-                TIdxIntegralConst,
-                vec::Vec<TDim, TVal>,
-                typename std::enable_if<(TDim::value > TIdxIntegralConst::value)>::type>
+            ALPAKA_NO_HOST_ACC_WARNING
+            ALPAKA_FN_HOST_ACC static auto getOffset(Vec<TDim, TVal> const& offsets) -> TVal
             {
-                ALPAKA_NO_HOST_ACC_WARNING
-                ALPAKA_FN_HOST_ACC static auto getOffset(
-                    vec::Vec<TDim, TVal> const & offsets)
-                -> TVal
-                {
-                    return offsets[TIdxIntegralConst::value];
-                }
-            };
-            //#############################################################################
-            //! The Vec offset set trait specialization.
-            template<
-                typename TIdxIntegralConst,
-                typename TDim,
-                typename TVal,
-                typename TOffset>
-            struct SetOffset<
-                TIdxIntegralConst,
-                vec::Vec<TDim, TVal>,
-                TOffset,
-                typename std::enable_if<(TDim::value > TIdxIntegralConst::value)>::type>
+                return offsets[TIdxIntegralConst::value];
+            }
+        };
+        //#############################################################################
+        //! The Vec offset set trait specialization.
+        template<typename TIdxIntegralConst, typename TDim, typename TVal, typename TOffset>
+        struct SetOffset<
+            TIdxIntegralConst,
+            Vec<TDim, TVal>,
+            TOffset,
+            std::enable_if_t<(TDim::value > TIdxIntegralConst::value)>>
+        {
+            ALPAKA_NO_HOST_ACC_WARNING
+            ALPAKA_FN_HOST_ACC static auto setOffset(Vec<TDim, TVal>& offsets, TOffset const& offset) -> void
             {
-                ALPAKA_NO_HOST_ACC_WARNING
-                ALPAKA_FN_HOST_ACC static auto setOffset(
-                    vec::Vec<TDim, TVal> & offsets,
-                    TOffset const & offset)
-                -> void
-                {
-                    offsets[TIdxIntegralConst::value] = offset;
-                }
-            };
-        }
-    }
-}
+                offsets[TIdxIntegralConst::value] = offset;
+            }
+        };
+    } // namespace traits
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/include/alpaka/version.hpp b/thirdParty/cupla/alpaka/include/alpaka/version.hpp
index ba7830a5a7..2129df5063 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/version.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/version.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Benjamin Worpitz, Erik Zenker
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -12,7 +12,7 @@
 #include <boost/predef/version_number.h>
 
 #define ALPAKA_VERSION_MAJOR 0
-#define ALPAKA_VERSION_MINOR 4
+#define ALPAKA_VERSION_MINOR 6
 #define ALPAKA_VERSION_PATCH 0
 
 //! The alpaka library version number
diff --git a/thirdParty/cupla/alpaka/include/alpaka/wait/Traits.hpp b/thirdParty/cupla/alpaka/include/alpaka/wait/Traits.hpp
index a37c1c32cc..ca941f6ea7 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/wait/Traits.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/wait/Traits.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Benjamin Worpitz
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -14,63 +14,39 @@
 
 namespace alpaka
 {
-    //-----------------------------------------------------------------------------
-    //! The wait specifics.
-    namespace wait
+    struct ConceptCurrentThreadWaitFor
     {
-        struct ConceptCurrentThreadWaitFor;
+    };
 
-        //-----------------------------------------------------------------------------
-        //! The wait traits.
-        namespace traits
-        {
-            //#############################################################################
-            //! The thread wait trait.
-            template<
-                typename TAwaited,
-                typename TSfinae = void>
-            struct CurrentThreadWaitFor;
+    //-----------------------------------------------------------------------------
+    //! The wait traits.
+    namespace traits
+    {
+        //#############################################################################
+        //! The thread wait trait.
+        template<typename TAwaited, typename TSfinae = void>
+        struct CurrentThreadWaitFor;
 
-            //#############################################################################
-            //! The waiter wait trait.
-            template<
-                typename TWaiter,
-                typename TAwaited,
-                typename TSfinae = void>
-            struct WaiterWaitFor;
-        }
+        //#############################################################################
+        //! The waiter wait trait.
+        template<typename TWaiter, typename TAwaited, typename TSfinae = void>
+        struct WaiterWaitFor;
+    } // namespace traits
 
-        //-----------------------------------------------------------------------------
-        //! Waits the thread for the completion of the given awaited action to complete.
-        template<
-            typename TAwaited>
-        ALPAKA_FN_HOST auto wait(
-            TAwaited const & awaited)
-        -> void
-        {
-            using ImplementationBase = concepts::ImplementationBase<ConceptCurrentThreadWaitFor, TAwaited>;
-            traits::CurrentThreadWaitFor<
-                ImplementationBase>
-            ::currentThreadWaitFor(
-                awaited);
-        }
+    //-----------------------------------------------------------------------------
+    //! Waits the thread for the completion of the given awaited action to complete.
+    template<typename TAwaited>
+    ALPAKA_FN_HOST auto wait(TAwaited const& awaited) -> void
+    {
+        using ImplementationBase = concepts::ImplementationBase<ConceptCurrentThreadWaitFor, TAwaited>;
+        traits::CurrentThreadWaitFor<ImplementationBase>::currentThreadWaitFor(awaited);
+    }
 
-        //-----------------------------------------------------------------------------
-        //! The waiter waits for the given awaited action to complete.
-        template<
-            typename TWaiter,
-            typename TAwaited>
-        ALPAKA_FN_HOST auto wait(
-            TWaiter & waiter,
-            TAwaited const & awaited)
-        -> void
-        {
-            traits::WaiterWaitFor<
-                TWaiter,
-                TAwaited>
-            ::waiterWaitFor(
-                waiter,
-                awaited);
-        }
+    //-----------------------------------------------------------------------------
+    //! The waiter waits for the given awaited action to complete.
+    template<typename TWaiter, typename TAwaited>
+    ALPAKA_FN_HOST auto wait(TWaiter& waiter, TAwaited const& awaited) -> void
+    {
+        traits::WaiterWaitFor<TWaiter, TAwaited>::waiterWaitFor(waiter, awaited);
     }
-}
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/include/alpaka/warp/Traits.hpp b/thirdParty/cupla/alpaka/include/alpaka/warp/Traits.hpp
new file mode 100644
index 0000000000..a7b8354f2f
--- /dev/null
+++ b/thirdParty/cupla/alpaka/include/alpaka/warp/Traits.hpp
@@ -0,0 +1,154 @@
+/* Copyright 2020 Sergei Bastrakov
+ *
+ * This file is part of alpaka.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#pragma once
+
+#include <alpaka/core/Common.hpp>
+#include <alpaka/core/Concepts.hpp>
+
+#include <cstdint>
+#include <type_traits>
+
+namespace alpaka
+{
+    //-----------------------------------------------------------------------------
+    //! The thread warp specifics
+    namespace warp
+    {
+        struct ConceptWarp
+        {
+        };
+
+        //-----------------------------------------------------------------------------
+        //! The warp traits.
+        namespace traits
+        {
+            //#############################################################################
+            //! The warp size trait.
+            template<typename TWarp, typename TSfinae = void>
+            struct GetSize;
+
+            //#############################################################################
+            //! The all warp vote trait.
+            template<typename TWarp, typename TSfinae = void>
+            struct All;
+
+            //#############################################################################
+            //! The any warp vote trait.
+            template<typename TWarp, typename TSfinae = void>
+            struct Any;
+
+            //#############################################################################
+            //! The ballot warp vote trait.
+            template<typename TWarp, typename TSfinae = void>
+            struct Ballot;
+
+            //#############################################################################
+            //! The active mask trait.
+            template<typename TWarp, typename TSfinae = void>
+            struct Activemask;
+        } // namespace traits
+
+        //-----------------------------------------------------------------------------
+        //! Returns the warp size as reported by the GetSize trait of the warp implementation.
+        //!
+        //! \tparam TWarp The warp implementation type.
+        //! \param warp The warp implementation.
+        ALPAKA_NO_HOST_ACC_WARNING
+        template<typename TWarp>
+        ALPAKA_FN_ACC auto getSize(TWarp const& warp) -> std::int32_t
+        {
+            using ImplementationBase = concepts::ImplementationBase<ConceptWarp, TWarp>;
+            return traits::GetSize<ImplementationBase>::getSize(warp);
+        }
+
+        //-----------------------------------------------------------------------------
+        //! Returns a 32- or 64-bit unsigned integer (depending on the
+        //! accelerator) whose Nth bit is set if and only if the Nth thread
+        //! of the warp is active.
+        //!
+        //! Note: the explicit decltype return type is required here, otherwise
+        //! compilation with a CPU and a GPU accelerator enabled fails as it
+        //! tries to call a device function from a host-device one. The reason
+        //! is unclear, but likely related to deducing the return type.
+        //!
+        //! \tparam TWarp The warp implementation type.
+        //! \param warp The warp implementation.
+        //! \return 32-bit or 64-bit unsigned type depending on the accelerator.
+        ALPAKA_NO_HOST_ACC_WARNING
+        template<typename TWarp>
+        ALPAKA_FN_ACC auto activemask(TWarp const& warp)
+            -> decltype(traits::Activemask<concepts::ImplementationBase<ConceptWarp, TWarp>>::activemask(warp))
+        {
+            using ImplementationBase = concepts::ImplementationBase<ConceptWarp, TWarp>;
+            return traits::Activemask<ImplementationBase>::activemask(warp);
+        }
+
+        //-----------------------------------------------------------------------------
+        //! Evaluates the predicate for all active threads of the warp and returns
+        //! non-zero if and only if the predicate evaluates to non-zero for all of them.
+        //!
+        //! It follows the logic of __all(predicate) in CUDA before version 9.0 and HIP,
+        //! the operation is applied for all active threads.
+        //! The modern CUDA counterpart would be __all_sync(__activemask(), predicate).
+        //!
+        //! \tparam TWarp The warp implementation type.
+        //! \param warp The warp implementation.
+        //! \param predicate The predicate value for the current thread.
+        ALPAKA_NO_HOST_ACC_WARNING
+        template<typename TWarp>
+        ALPAKA_FN_ACC auto all(TWarp const& warp, std::int32_t predicate) -> std::int32_t
+        {
+            using ImplementationBase = concepts::ImplementationBase<ConceptWarp, TWarp>;
+            return traits::All<ImplementationBase>::all(warp, predicate);
+        }
+
+        //-----------------------------------------------------------------------------
+        //! Evaluates the predicate for all active threads of the warp and returns
+        //! non-zero if and only if the predicate evaluates to non-zero for any of them.
+        //!
+        //! It follows the logic of __any(predicate) in CUDA before version 9.0 and HIP,
+        //! the operation is applied for all active threads.
+        //! The modern CUDA counterpart would be __any_sync(__activemask(), predicate).
+        //!
+        //! \tparam TWarp The warp implementation type.
+        //! \param warp The warp implementation.
+        //! \param predicate The predicate value for the current thread.
+        ALPAKA_NO_HOST_ACC_WARNING
+        template<typename TWarp>
+        ALPAKA_FN_ACC auto any(TWarp const& warp, std::int32_t predicate) -> std::int32_t
+        {
+            using ImplementationBase = concepts::ImplementationBase<ConceptWarp, TWarp>;
+            return traits::Any<ImplementationBase>::any(warp, predicate);
+        }
+
+        //-----------------------------------------------------------------------------
+        //! Evaluates predicate for all non-exited threads in a warp and returns
+        //! a 32- or 64-bit unsigned integer (depending on the accelerator)
+        //! whose Nth bit is set if and only if predicate evaluates to non-zero
+        //! for the Nth thread of the warp and the Nth thread is active.
+        //!
+        //! It follows the logic of __ballot(predicate) in CUDA before version 9.0 and HIP,
+        //! the operation is applied for all active threads.
+        //! The modern CUDA counterpart would be __ballot_sync(__activemask(), predicate).
+        //! The return type is deduced from the Ballot trait of the warp implementation.
+        //!
+        //! \tparam TWarp The warp implementation type.
+        //! \param warp The warp implementation.
+        //! \param predicate The predicate value for the current thread.
+        //! \return 32-bit or 64-bit unsigned type depending on the accelerator.
+        ALPAKA_NO_HOST_ACC_WARNING
+        template<typename TWarp>
+        ALPAKA_FN_ACC auto ballot(TWarp const& warp, std::int32_t predicate)
+        {
+            using ImplementationBase = concepts::ImplementationBase<ConceptWarp, TWarp>;
+            return traits::Ballot<ImplementationBase>::ballot(warp, predicate);
+        }
+    } // namespace warp
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/include/alpaka/warp/WarpSingleThread.hpp b/thirdParty/cupla/alpaka/include/alpaka/warp/WarpSingleThread.hpp
new file mode 100644
index 0000000000..1451aa0de7
--- /dev/null
+++ b/thirdParty/cupla/alpaka/include/alpaka/warp/WarpSingleThread.hpp
@@ -0,0 +1,97 @@
+/* Copyright 2020 Sergei Bastrakov
+ *
+ * This file is part of alpaka.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#pragma once
+
+#include <alpaka/warp/Traits.hpp>
+
+#include <cstdint>
+
+namespace alpaka
+{
+    namespace warp
+    {
+        //#############################################################################
+        //! A warp consisting of a single thread, used to emulate warps on CPUs.
+        class WarpSingleThread : public concepts::Implements<ConceptWarp, WarpSingleThread>
+        {
+        public:
+            //-----------------------------------------------------------------------------
+            WarpSingleThread() = default;
+            //-----------------------------------------------------------------------------
+            WarpSingleThread(WarpSingleThread const&) = delete;
+            //-----------------------------------------------------------------------------
+            WarpSingleThread(WarpSingleThread&&) = delete;
+            //-----------------------------------------------------------------------------
+            auto operator=(WarpSingleThread const&) -> WarpSingleThread& = delete;
+            //-----------------------------------------------------------------------------
+            auto operator=(WarpSingleThread&&) -> WarpSingleThread& = delete;
+            //-----------------------------------------------------------------------------
+            ~WarpSingleThread() = default;
+        };
+
+        namespace traits
+        {
+            //#############################################################################
+            template<>
+            struct GetSize<WarpSingleThread>
+            {
+                //! \return The warp size, which is always 1 for the single-threaded warp.
+                static auto getSize(warp::WarpSingleThread const& /*warp*/)
+                {
+                    return 1;
+                }
+            };
+
+            //#############################################################################
+            template<>
+            struct Activemask<WarpSingleThread>
+            {
+                //! \return Always 1u, i.e. a mask with only bit 0 set.
+                static auto activemask(warp::WarpSingleThread const& /*warp*/)
+                {
+                    return 1u;
+                }
+            };
+
+            //#############################################################################
+            template<>
+            struct All<WarpSingleThread>
+            {
+                //! \return The predicate of the calling thread, passed through unchanged.
+                static auto all(warp::WarpSingleThread const& /*warp*/, std::int32_t predicate)
+                {
+                    return predicate;
+                }
+            };
+
+            //#############################################################################
+            template<>
+            struct Any<WarpSingleThread>
+            {
+                //! \return The predicate of the calling thread, passed through unchanged.
+                static auto any(warp::WarpSingleThread const& /*warp*/, std::int32_t predicate)
+                {
+                    return predicate;
+                }
+            };
+
+            //#############################################################################
+            template<>
+            struct Ballot<WarpSingleThread>
+            {
+                //! \return 1u if the predicate is non-zero, 0u otherwise.
+                static auto ballot(warp::WarpSingleThread const& /*warp*/, std::int32_t predicate)
+                {
+                    return predicate ? 1u : 0u;
+                }
+            };
+        } // namespace traits
+    } // namespace warp
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/include/alpaka/warp/WarpUniformCudaHipBuiltIn.hpp b/thirdParty/cupla/alpaka/include/alpaka/warp/WarpUniformCudaHipBuiltIn.hpp
new file mode 100644
index 0000000000..09d9584444
--- /dev/null
+++ b/thirdParty/cupla/alpaka/include/alpaka/warp/WarpUniformCudaHipBuiltIn.hpp
@@ -0,0 +1,152 @@
+/* Copyright 2020 Sergei Bastrakov
+ *
+ * This file is part of alpaka.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#pragma once
+
+#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
+
+#    include <alpaka/core/BoostPredef.hpp>
+
+#    if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && !BOOST_LANG_CUDA
+#        error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
+#    endif
+
+#    if defined(ALPAKA_ACC_GPU_HIP_ENABLED) && !BOOST_LANG_HIP
+#        error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
+#    endif
+
+#    include <alpaka/core/Unused.hpp>
+#    include <alpaka/warp/Traits.hpp>
+
+#    include <cstdint>
+
+namespace alpaka
+{
+    namespace warp
+    {
+        //#############################################################################
+        //! The warp implementation for the CUDA and HIP GPU back-ends, based on device built-ins.
+        class WarpUniformCudaHipBuiltIn : public concepts::Implements<ConceptWarp, WarpUniformCudaHipBuiltIn>
+        {
+        public:
+            //-----------------------------------------------------------------------------
+            WarpUniformCudaHipBuiltIn() = default;
+            //-----------------------------------------------------------------------------
+            __device__ WarpUniformCudaHipBuiltIn(WarpUniformCudaHipBuiltIn const&) = delete;
+            //-----------------------------------------------------------------------------
+            __device__ WarpUniformCudaHipBuiltIn(WarpUniformCudaHipBuiltIn&&) = delete;
+            //-----------------------------------------------------------------------------
+            __device__ auto operator=(WarpUniformCudaHipBuiltIn const&) -> WarpUniformCudaHipBuiltIn& = delete;
+            //-----------------------------------------------------------------------------
+            __device__ auto operator=(WarpUniformCudaHipBuiltIn&&) -> WarpUniformCudaHipBuiltIn& = delete;
+            //-----------------------------------------------------------------------------
+            ~WarpUniformCudaHipBuiltIn() = default;
+        };
+
+        namespace traits
+        {
+            //#############################################################################
+            template<>
+            struct GetSize<WarpUniformCudaHipBuiltIn>
+            {
+                //! \return The value of the warpSize device variable, converted to std::int32_t.
+                __device__ static auto getSize(warp::WarpUniformCudaHipBuiltIn const& /*warp*/) -> std::int32_t
+                {
+                    return warpSize;
+                }
+            };
+
+            //#############################################################################
+            template<>
+            struct Activemask<WarpUniformCudaHipBuiltIn>
+            {
+                //! \return The mask of active threads: std::uint32_t on CUDA, std::uint64_t otherwise (HIP).
+                __device__ static auto activemask(warp::WarpUniformCudaHipBuiltIn const& /*warp*/)
+#    if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
+                    -> std::uint32_t
+#    else
+                    -> std::uint64_t
+#    endif
+                {
+#    if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
+                    // Workaround for clang + CUDA 9.2 which uses the wrong PTX ISA,
+                    // discussion in https://github.com/alpaka-group/alpaka/pull/1003
+                    // Can't use __activemask(), so emulate with __ballot_sync()
+#        if BOOST_COMP_CLANG_CUDA && BOOST_LANG_CUDA == BOOST_VERSION_NUMBER(9, 2, 0)
+                    return __ballot_sync(0xffffffff, 1);
+#        else
+                    return __activemask();
+#        endif
+#    else
+                    // No HIP intrinsic for it, emulate via ballot
+                    return __ballot(1);
+#    endif
+                }
+            };
+
+            //#############################################################################
+            template<>
+            struct All<WarpUniformCudaHipBuiltIn>
+            {
+                //! \return The result of __all_sync() over activemask(warp) on CUDA, or of __all() on HIP.
+                __device__ static auto all(warp::WarpUniformCudaHipBuiltIn const& warp, std::int32_t predicate)
+                    -> std::int32_t
+                {
+#    if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
+                    return __all_sync(activemask(warp), predicate);
+#    else
+                    ignore_unused(warp);
+                    return __all(predicate);
+#    endif
+                }
+            };
+
+            //#############################################################################
+            template<>
+            struct Any<WarpUniformCudaHipBuiltIn>
+            {
+                //! \return The result of __any_sync() over activemask(warp) on CUDA, or of __any() on HIP.
+                __device__ static auto any(warp::WarpUniformCudaHipBuiltIn const& warp, std::int32_t predicate)
+                    -> std::int32_t
+                {
+#    if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
+                    return __any_sync(activemask(warp), predicate);
+#    else
+                    ignore_unused(warp);
+                    return __any(predicate);
+#    endif
+                }
+            };
+
+            //#############################################################################
+            template<>
+            struct Ballot<WarpUniformCudaHipBuiltIn>
+            {
+                //! \return The result of __ballot_sync() over activemask(warp) on CUDA, or of __ballot() on HIP.
+                __device__ static auto ballot(warp::WarpUniformCudaHipBuiltIn const& warp, std::int32_t predicate)
+                // return type is required by the compiler
+#    if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
+                    -> std::uint32_t
+#    else
+                    -> std::uint64_t
+#    endif
+                {
+#    if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
+                    return __ballot_sync(activemask(warp), predicate);
+#    else
+                    ignore_unused(warp);
+                    return __ballot(predicate);
+#    endif
+                }
+            };
+        } // namespace traits
+    } // namespace warp
+} // namespace alpaka
+
+#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/workdiv/Traits.hpp b/thirdParty/cupla/alpaka/include/alpaka/workdiv/Traits.hpp
index c965649be2..fe8b6d5610 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/workdiv/Traits.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/workdiv/Traits.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Benjamin Worpitz
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -9,135 +9,81 @@
 
 #pragma once
 
-#include <alpaka/idx/Traits.hpp>
+#include <alpaka/core/Common.hpp>
 #include <alpaka/core/Concepts.hpp>
-
-#include <alpaka/vec/Vec.hpp>
 #include <alpaka/core/Positioning.hpp>
-#include <alpaka/core/Common.hpp>
-
-#include <boost/config.hpp>
+#include <alpaka/idx/Traits.hpp>
+#include <alpaka/vec/Vec.hpp>
 
 #include <type_traits>
 #include <utility>
 
 namespace alpaka
 {
-    //-----------------------------------------------------------------------------
-    //! The work division traits specifics.
-    namespace workdiv
+    struct ConceptWorkDiv
     {
-        struct ConceptWorkDiv;
+    };
 
-        //-----------------------------------------------------------------------------
-        //! The work division traits.
-        namespace traits
-        {
-            //#############################################################################
-            //! The work div trait.
-            template<
-                typename TWorkDiv,
-                typename TOrigin,
-                typename TUnit,
-                typename TSfinae = void>
-            struct GetWorkDiv;
-        }
+    //-----------------------------------------------------------------------------
+    //! The work division traits.
+    namespace traits
+    {
+        //#############################################################################
+        //! The work div trait.
+        template<typename TWorkDiv, typename TOrigin, typename TUnit, typename TSfinae = void>
+        struct GetWorkDiv;
+    } // namespace traits
 
-        //-----------------------------------------------------------------------------
-        //! Get the extent requested.
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<
-            typename TOrigin,
-            typename TUnit,
-            typename TWorkDiv>
-        ALPAKA_FN_HOST_ACC auto getWorkDiv(
-            TWorkDiv const & workDiv)
-        -> vec::Vec<dim::Dim<TWorkDiv>, idx::Idx<TWorkDiv>>
-        {
-            using ImplementationBase = concepts::ImplementationBase<ConceptWorkDiv, TWorkDiv>;
-            return
-                traits::GetWorkDiv<
-                    ImplementationBase,
-                    TOrigin,
-                    TUnit>
-                ::getWorkDiv(
-                    workDiv);
-        }
+    //-----------------------------------------------------------------------------
+    //! Get the extent requested.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename TOrigin, typename TUnit, typename TWorkDiv>
+    ALPAKA_FN_HOST_ACC auto getWorkDiv(TWorkDiv const& workDiv) -> Vec<Dim<TWorkDiv>, Idx<TWorkDiv>>
+    {
+        using ImplementationBase = concepts::ImplementationBase<ConceptWorkDiv, TWorkDiv>;
+        return traits::GetWorkDiv<ImplementationBase, TOrigin, TUnit>::getWorkDiv(workDiv);
+    }
 
-        namespace traits
+    namespace traits
+    {
+        //#############################################################################
+        //! The work div grid thread extent trait specialization.
+        template<typename TWorkDiv>
+        struct GetWorkDiv<TWorkDiv, origin::Grid, unit::Threads>
         {
-            //#############################################################################
-            //! The work div grid thread extent trait specialization.
-            template<
-                typename TWorkDiv>
-            struct GetWorkDiv<
-                TWorkDiv,
-                origin::Grid,
-                unit::Threads>
+            //-----------------------------------------------------------------------------
+            ALPAKA_NO_HOST_ACC_WARNING
+            ALPAKA_FN_HOST_ACC static auto getWorkDiv(TWorkDiv const& workDiv)
             {
-                //-----------------------------------------------------------------------------
-                ALPAKA_NO_HOST_ACC_WARNING
-                ALPAKA_FN_HOST_ACC static auto getWorkDiv(
-                    TWorkDiv const & workDiv)
-#ifdef BOOST_NO_CXX14_RETURN_TYPE_DEDUCTION
-                -> decltype(
-                    workdiv::getWorkDiv<origin::Grid, unit::Blocks>(workDiv)
-                    * workdiv::getWorkDiv<origin::Block, unit::Threads>(workDiv))
-#endif
-                {
-                    return
-                        workdiv::getWorkDiv<origin::Grid, unit::Blocks>(workDiv)
-                        * workdiv::getWorkDiv<origin::Block, unit::Threads>(workDiv);
-                }
-            };
-            //#############################################################################
-            //! The work div grid element extent trait specialization.
-            template<
-                typename TWorkDiv>
-            struct GetWorkDiv<
-                TWorkDiv,
-                origin::Grid,
-                unit::Elems>
+                return alpaka::getWorkDiv<origin::Grid, unit::Blocks>(workDiv)
+                    * alpaka::getWorkDiv<origin::Block, unit::Threads>(workDiv);
+            }
+        };
+        //#############################################################################
+        //! The work div grid element extent trait specialization.
+        template<typename TWorkDiv>
+        struct GetWorkDiv<TWorkDiv, origin::Grid, unit::Elems>
+        {
+            //-----------------------------------------------------------------------------
+            ALPAKA_NO_HOST_ACC_WARNING
+            ALPAKA_FN_HOST_ACC static auto getWorkDiv(TWorkDiv const& workDiv)
             {
-                //-----------------------------------------------------------------------------
-                ALPAKA_NO_HOST_ACC_WARNING
-                ALPAKA_FN_HOST_ACC static auto getWorkDiv(
-                    TWorkDiv const & workDiv)
-#ifdef BOOST_NO_CXX14_RETURN_TYPE_DEDUCTION
-                -> decltype(
-                    workdiv::getWorkDiv<origin::Grid, unit::Threads>(workDiv)
-                    * workdiv::getWorkDiv<origin::Thread, unit::Elems>(workDiv))
-#endif
-                {
-                    return
-                        workdiv::getWorkDiv<origin::Grid, unit::Threads>(workDiv)
-                        * workdiv::getWorkDiv<origin::Thread, unit::Elems>(workDiv);
-                }
-            };
-            //#############################################################################
-            //! The work div block element extent trait specialization.
-            template<
-                typename TWorkDiv>
-            struct GetWorkDiv<
-                TWorkDiv,
-                origin::Block,
-                unit::Elems>
+                return alpaka::getWorkDiv<origin::Grid, unit::Threads>(workDiv)
+                    * alpaka::getWorkDiv<origin::Thread, unit::Elems>(workDiv);
+            }
+        };
+        //#############################################################################
+        //! The work div block element extent trait specialization.
+        template<typename TWorkDiv>
+        struct GetWorkDiv<TWorkDiv, origin::Block, unit::Elems>
+        {
+            //-----------------------------------------------------------------------------
+            ALPAKA_NO_HOST_ACC_WARNING
+            ALPAKA_FN_HOST_ACC static auto getWorkDiv(TWorkDiv const& workDiv)
             {
-                //-----------------------------------------------------------------------------
-                ALPAKA_NO_HOST_ACC_WARNING
-                ALPAKA_FN_HOST_ACC static auto getWorkDiv(
-                    TWorkDiv const & workDiv)
-#ifdef BOOST_NO_CXX14_RETURN_TYPE_DEDUCTION
-                -> decltype(
-                    workdiv::getWorkDiv<origin::Block, unit::Threads>(workDiv)
-                    * workdiv::getWorkDiv<origin::Thread, unit::Elems>(workDiv))
-#endif
-                {
-                    return
-                        workdiv::getWorkDiv<origin::Block, unit::Threads>(workDiv)
-                        * workdiv::getWorkDiv<origin::Thread, unit::Elems>(workDiv);
-                }
-            };
-        }
-    }
-}
+                return alpaka::getWorkDiv<origin::Block, unit::Threads>(workDiv)
+                    * alpaka::getWorkDiv<origin::Thread, unit::Elems>(workDiv);
+            }
+        };
+    } // namespace traits
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/include/alpaka/workdiv/WorkDivCudaBuiltIn.hpp b/thirdParty/cupla/alpaka/include/alpaka/workdiv/WorkDivCudaBuiltIn.hpp
deleted file mode 100644
index f7031774e6..0000000000
--- a/thirdParty/cupla/alpaka/include/alpaka/workdiv/WorkDivCudaBuiltIn.hpp
+++ /dev/null
@@ -1,163 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_CUDA
-    #error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
-#endif
-
-#include <alpaka/workdiv/Traits.hpp>
-#include <alpaka/idx/Traits.hpp>
-
-#include <alpaka/core/Cuda.hpp>
-#include <alpaka/core/Unused.hpp>
-#include <alpaka/vec/Vec.hpp>
-
-namespace alpaka
-{
-    namespace workdiv
-    {
-        //#############################################################################
-        //! The GPU CUDA accelerator work division.
-        template<
-            typename TDim,
-            typename TIdx>
-        class WorkDivCudaBuiltIn : public concepts::Implements<ConceptWorkDiv, WorkDivCudaBuiltIn<TDim, TIdx>>
-        {
-        public:
-            //-----------------------------------------------------------------------------
-            __device__ WorkDivCudaBuiltIn(
-                vec::Vec<TDim, TIdx> const & threadElemExtent) :
-                    m_threadElemExtent(threadElemExtent)
-            {}
-            //-----------------------------------------------------------------------------
-            __device__ WorkDivCudaBuiltIn(WorkDivCudaBuiltIn const &) = delete;
-            //-----------------------------------------------------------------------------
-            __device__ WorkDivCudaBuiltIn(WorkDivCudaBuiltIn &&) = delete;
-            //-----------------------------------------------------------------------------
-            __device__ auto operator=(WorkDivCudaBuiltIn const &) -> WorkDivCudaBuiltIn & = delete;
-            //-----------------------------------------------------------------------------
-            __device__ auto operator=(WorkDivCudaBuiltIn &&) -> WorkDivCudaBuiltIn & = delete;
-            //-----------------------------------------------------------------------------
-            /*virtual*/ ~WorkDivCudaBuiltIn() = default;
-
-        public:
-            // \TODO: Optimize! Add WorkDivCudaBuiltInNoElems that has no member m_threadElemExtent as well as AccGpuCudaRtNoElems.
-            // Use it instead of AccGpuCudaRt if the thread element extent is one to reduce the register usage.
-            vec::Vec<TDim, TIdx> const & m_threadElemExtent;
-        };
-    }
-
-    namespace dim
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The GPU CUDA accelerator work division dimension get trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct DimType<
-                workdiv::WorkDivCudaBuiltIn<TDim, TIdx>>
-            {
-                using type = TDim;
-            };
-        }
-    }
-    namespace idx
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The GPU CUDA accelerator work division idx type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct IdxType<
-                workdiv::WorkDivCudaBuiltIn<TDim, TIdx>>
-            {
-                using type = TIdx;
-            };
-        }
-    }
-    namespace workdiv
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The GPU CUDA accelerator work division grid block extent trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct GetWorkDiv<
-                WorkDivCudaBuiltIn<TDim, TIdx>,
-                origin::Grid,
-                unit::Blocks>
-            {
-                //-----------------------------------------------------------------------------
-                //! \return The number of blocks in each dimension of the grid.
-                __device__ static auto getWorkDiv(
-                    WorkDivCudaBuiltIn<TDim, TIdx> const & workDiv)
-                -> vec::Vec<TDim, TIdx>
-                {
-                    alpaka::ignore_unused(workDiv);
-                    return vec::cast<TIdx>(extent::getExtentVecEnd<TDim>(gridDim));
-                }
-            };
-
-            //#############################################################################
-            //! The GPU CUDA accelerator work division block thread extent trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct GetWorkDiv<
-                WorkDivCudaBuiltIn<TDim, TIdx>,
-                origin::Block,
-                unit::Threads>
-            {
-                //-----------------------------------------------------------------------------
-                //! \return The number of threads in each dimension of a block.
-                __device__ static auto getWorkDiv(
-                    WorkDivCudaBuiltIn<TDim, TIdx> const & workDiv)
-                -> vec::Vec<TDim, TIdx>
-                {
-                    alpaka::ignore_unused(workDiv);
-                    return vec::cast<TIdx>(extent::getExtentVecEnd<TDim>(blockDim));
-                }
-            };
-
-            //#############################################################################
-            //! The GPU CUDA accelerator work division thread element extent trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct GetWorkDiv<
-                WorkDivCudaBuiltIn<TDim, TIdx>,
-                origin::Thread,
-                unit::Elems>
-            {
-                //-----------------------------------------------------------------------------
-                //! \return The number of blocks in each dimension of the grid.
-                __device__ static auto getWorkDiv(
-                    WorkDivCudaBuiltIn<TDim, TIdx> const & workDiv)
-                -> vec::Vec<TDim, TIdx>
-                {
-                    return workDiv.m_threadElemExtent;
-                }
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/workdiv/WorkDivHelpers.hpp b/thirdParty/cupla/alpaka/include/alpaka/workdiv/WorkDivHelpers.hpp
index 6989283365..c39810ea40 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/workdiv/WorkDivHelpers.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/workdiv/WorkDivHelpers.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Benjamin Worpitz, Matthias Werner
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -9,491 +9,430 @@
 
 #pragma once
 
-#include <alpaka/workdiv/WorkDivMembers.hpp>
-
-#include <alpaka/dev/Traits.hpp>
 #include <alpaka/acc/Traits.hpp>
-
-#include <alpaka/vec/Vec.hpp>
-
 #include <alpaka/core/Assert.hpp>
 #include <alpaka/core/Common.hpp>
+#include <alpaka/dev/Traits.hpp>
+#include <alpaka/vec/Vec.hpp>
+#include <alpaka/workdiv/WorkDivMembers.hpp>
 
-#include <cmath>
 #include <algorithm>
+#include <array>
+#include <cmath>
 #include <functional>
 #include <set>
-#include <array>
+#include <type_traits>
 
 //-----------------------------------------------------------------------------
 //! The alpaka library.
 namespace alpaka
 {
-    namespace workdiv
+    //#############################################################################
+    //! The grid block extent subdivision restrictions.
+    enum class GridBlockExtentSubDivRestrictions
     {
-        //#############################################################################
-        //! The grid block extent subdivision restrictions.
-        enum class GridBlockExtentSubDivRestrictions
-        {
-            EqualExtent,       //!< The block thread extent will be equal in all dimensions.
-            CloseToEqualExtent,//!< The block thread extent will be as close to equal as possible in all dimensions.
-            Unrestricted,      //!< The block thread extent will not have any restrictions.
-        };
+        EqualExtent, //!< The block thread extent will be equal in all dimensions.
+        CloseToEqualExtent, //!< The block thread extent will be as close to equal as possible in all dimensions.
+        Unrestricted, //!< The block thread extent will not have any restrictions.
+    };
 
-        namespace detail
+    namespace detail
+    {
+        //-----------------------------------------------------------------------------
+        //! \param maxDivisor The maximum divisor.
+        //! \param dividend The dividend.
+        //! \return The biggest number that satisfies the following conditions:
+        //!     1) dividend%ret==0
+        //!     2) ret<=maxDivisor
+        template<typename T, typename = std::enable_if_t<std::is_integral<T>::value>>
+        ALPAKA_FN_HOST auto nextDivisorLowerOrEqual(T const& maxDivisor, T const& dividend) -> T
         {
-            //-----------------------------------------------------------------------------
-            //! \param maxDivisor The maximum divisor.
-            //! \param dividend The dividend.
-            //! \return The biggest number that satisfies the following conditions:
-            //!     1) dividend/ret==0
-            //!     2) ret<=maxDivisor
-            template<
-                typename T,
-                typename = typename std::enable_if<std::is_integral<T>::value>::type>
-            ALPAKA_FN_HOST auto nextDivisorLowerOrEqual(
-                T const & maxDivisor,
-                T const & dividend)
-            -> T
-            {
-                T divisor(maxDivisor);
+            T divisor(maxDivisor);
 
-                core::assertValueUnsigned(dividend);
-                core::assertValueUnsigned(maxDivisor);
-                ALPAKA_ASSERT(dividend <= maxDivisor);
+            core::assertValueUnsigned(dividend);
+            core::assertValueUnsigned(maxDivisor);
+            ALPAKA_ASSERT(dividend <= maxDivisor);
 
-                while((dividend%divisor) != 0)
-                {
-                    --divisor;
-                }
-
-                return divisor;
-            }
-            //-----------------------------------------------------------------------------
-            //! \param val The value to find divisors of.
-            //! \param maxDivisor The maximum.
-            //! \return A list of all divisors less then or equal to the given maximum.
-            template<
-                typename T,
-                typename = typename std::enable_if<std::is_integral<T>::value>::type>
-            ALPAKA_FN_HOST auto allDivisorsLessOrEqual(
-                T const & val,
-                T const & maxDivisor)
-            -> std::set<T>
+            while((dividend % divisor) != 0)
             {
-                std::set<T> divisorSet;
-
-                core::assertValueUnsigned(val);
-                core::assertValueUnsigned(maxDivisor);
-                ALPAKA_ASSERT(maxDivisor <= val);
-
-                for(T i(1); i <= std::min(val, maxDivisor); ++i)
-                {
-                    if(val % i == 0)
-                    {
-                        divisorSet.insert(val/i);
-                    }
-                }
-
-                return divisorSet;
+                --divisor;
             }
-        }
 
+            return divisor;
+        }
         //-----------------------------------------------------------------------------
-        //! \tparam TDim The dimensionality of the accelerator device properties.
-        //! \tparam TIdx The idx type of the accelerator device properties.
-        //! \param accDevProps The maxima for the work division.
-        //! \return If the accelerator device properties are valid.
-        template<
-            typename TDim,
-            typename TIdx>
-        ALPAKA_FN_HOST auto isValidAccDevProps(
-            acc::AccDevProps<TDim, TIdx> const & accDevProps)
-        -> bool
+        //! \param val The value to find divisors of.
+        //! \param maxDivisor The maximum.
+        //! \return A set of all divisors less than or equal to the given maximum.
+        template<typename T, typename = std::enable_if_t<std::is_integral<T>::value>>
+        ALPAKA_FN_HOST auto allDivisorsLessOrEqual(T const& val, T const& maxDivisor) -> std::set<T>
         {
-            // Check that the maximum counts are greater or equal 1.
-            if((accDevProps.m_gridBlockCountMax < 1)
-                || (accDevProps.m_blockThreadCountMax < 1)
-                || (accDevProps.m_threadElemCountMax < 1))
-            {
-                return false;
-            }
+            std::set<T> divisorSet;
 
-            // Store the maxima allowed for extents of grid, blocks and threads.
-            auto const gridBlockExtentMax(vec::subVecEnd<TDim>(accDevProps.m_gridBlockExtentMax));
-            auto const blockThreadExtentMax(vec::subVecEnd<TDim>(accDevProps.m_blockThreadExtentMax));
-            auto const threadElemExtentMax(vec::subVecEnd<TDim>(accDevProps.m_threadElemExtentMax));
+            core::assertValueUnsigned(val);
+            core::assertValueUnsigned(maxDivisor);
+            ALPAKA_ASSERT(maxDivisor <= val);
 
-            // Check that the extents for all dimensions are correct.
-            for(typename TDim::value_type i(0); i<TDim::value; ++i)
+            for(T i(1); i <= std::min(val, maxDivisor); ++i)
             {
-                // Check that the maximum extents are greater or equal 1.
-                if((gridBlockExtentMax[i] < 1)
-                    || (blockThreadExtentMax[i] < 1)
-                    || (threadElemExtentMax[i] < 1))
+                if(val % i == 0)
                 {
-                    return false;
+                    divisorSet.insert(static_cast<T>(val / i));
                 }
             }
 
-            return true;
+            return divisorSet;
+        }
+    } // namespace detail
+
+    //-----------------------------------------------------------------------------
+    //! \tparam TDim The dimensionality of the accelerator device properties.
+    //! \tparam TIdx The idx type of the accelerator device properties.
+    //! \param accDevProps The maxima for the work division.
+    //! \return If the accelerator device properties are valid.
+    template<typename TDim, typename TIdx>
+    ALPAKA_FN_HOST auto isValidAccDevProps(AccDevProps<TDim, TIdx> const& accDevProps) -> bool
+    {
+        // Check that the maximum counts are greater than or equal to 1.
+        if((accDevProps.m_gridBlockCountMax < 1) || (accDevProps.m_blockThreadCountMax < 1)
+           || (accDevProps.m_threadElemCountMax < 1))
+        {
+            return false;
         }
 
-        //-----------------------------------------------------------------------------
-        //! Subdivides the given grid thread extent into blocks restricted by the maxima allowed.
-        //! 1. The the maxima block, thread and element extent and counts
-        //! 2. The requirement of the block thread extent to divide the grid thread extent without remainder
-        //! 3. The requirement of the block extent.
-        //!
-        //! \param gridElemExtent
-        //!     The full extent of elements in the grid.
-        //! \param threadElemExtent
-        //!     the number of elements computed per thread.
-        //! \param accDevProps
-        //!     The maxima for the work division.
-        //! \param requireBlockThreadExtentToDivideGridThreadExtent
-        //!     If this is true, the grid thread extent will be multiples of the corresponding block thread extent.
-        //!     NOTE: If this is true and gridThreadExtent is prime (or otherwise bad chosen) in a dimension, the block thread extent will be one in this dimension.
-        //! \param gridBlockExtentSubDivRestrictions
-        //!     The grid block extent subdivision restrictions.
-        template<
-            typename TDim,
-            typename TIdx>
-        ALPAKA_FN_HOST auto subDivideGridElems(
-            vec::Vec<TDim, TIdx> const & gridElemExtent,
-            vec::Vec<TDim, TIdx> threadElemExtent,
-            acc::AccDevProps<TDim, TIdx> const & accDevProps,
-            bool requireBlockThreadExtentToDivideGridThreadExtent = true,
-            GridBlockExtentSubDivRestrictions gridBlockExtentSubDivRestrictions = GridBlockExtentSubDivRestrictions::Unrestricted)
-        -> workdiv::WorkDivMembers<TDim, TIdx>
+        // Store the maxima allowed for extents of grid, blocks and threads.
+        auto const gridBlockExtentMax(subVecEnd<TDim>(accDevProps.m_gridBlockExtentMax));
+        auto const blockThreadExtentMax(subVecEnd<TDim>(accDevProps.m_blockThreadExtentMax));
+        auto const threadElemExtentMax(subVecEnd<TDim>(accDevProps.m_threadElemExtentMax));
+
+        // Check that the extents for all dimensions are correct.
+        for(typename TDim::value_type i(0); i < TDim::value; ++i)
         {
-            ///////////////////////////////////////////////////////////////////
-            // Check that the input data is valid.
-            for(typename TDim::value_type i(0); i<TDim::value; ++i)
+            // Check that the maximum extents are greater than or equal to 1.
+            if((gridBlockExtentMax[i] < 1) || (blockThreadExtentMax[i] < 1) || (threadElemExtentMax[i] < 1))
             {
-                ALPAKA_ASSERT(gridElemExtent[i] >= 1);
-                ALPAKA_ASSERT(threadElemExtent[i] >= 1);
-                ALPAKA_ASSERT(threadElemExtent[i] <= accDevProps.m_threadElemExtentMax[i]);
+                return false;
             }
-            ALPAKA_ASSERT(threadElemExtent.prod() <= accDevProps.m_threadElemCountMax);
-            ALPAKA_ASSERT(isValidAccDevProps(accDevProps));
+        }
 
-            ///////////////////////////////////////////////////////////////////
-            // Handle the given threadElemExtent. After this only the blockThreadExtent has to be optimized.
+        return true;
+    }
 
-            // Restrict the thread elem extent with the grid elem extent.
-            for(typename TDim::value_type i(0); i<TDim::value; ++i)
-            {
-                threadElemExtent[i] = std::min(threadElemExtent[i], gridElemExtent[i]);
-            }
+    //-----------------------------------------------------------------------------
+    //! Subdivides the given grid thread extent into blocks restricted by the maxima allowed.
+    //! 1. The maxima of the block, thread and element extents and counts
+    //! 2. The requirement of the block thread extent to divide the grid thread extent without remainder
+    //! 3. The requirement of the block extent.
+    //!
+    //! \param gridElemExtent
+    //!     The full extent of elements in the grid.
+    //! \param threadElemExtent
+    //!     The number of elements computed per thread.
+    //! \param accDevProps
+    //!     The maxima for the work division.
+    //! \param requireBlockThreadExtentToDivideGridThreadExtent
+    //!     If this is true, the grid thread extent will be multiples of the corresponding block thread extent.
+    //!     NOTE: If this is true and gridThreadExtent is prime (or otherwise badly chosen) in a dimension, the block
+    //!     thread extent will be one in this dimension.
+    //! \param gridBlockExtentSubDivRestrictions
+    //!     The grid block extent subdivision restrictions.
+    template<typename TDim, typename TIdx>
+    ALPAKA_FN_HOST auto subDivideGridElems(
+        Vec<TDim, TIdx> const& gridElemExtent,
+        Vec<TDim, TIdx> threadElemExtent,
+        AccDevProps<TDim, TIdx> const& accDevProps,
+        bool requireBlockThreadExtentToDivideGridThreadExtent = true,
+        GridBlockExtentSubDivRestrictions gridBlockExtentSubDivRestrictions
+        = GridBlockExtentSubDivRestrictions::Unrestricted) -> WorkDivMembers<TDim, TIdx>
+    {
+        ///////////////////////////////////////////////////////////////////
+        // Check that the input data is valid.
+        for(typename TDim::value_type i(0); i < TDim::value; ++i)
+        {
+            ALPAKA_ASSERT(gridElemExtent[i] >= 1);
+            ALPAKA_ASSERT(threadElemExtent[i] >= 1);
+            ALPAKA_ASSERT(threadElemExtent[i] <= accDevProps.m_threadElemExtentMax[i]);
+        }
+        ALPAKA_ASSERT(threadElemExtent.prod() <= accDevProps.m_threadElemCountMax);
+        ALPAKA_ASSERT(isValidAccDevProps(accDevProps));
 
-            // Calculate the grid thread extent.
-            auto gridThreadExtent(vec::Vec<TDim, TIdx>::zeros());
-            for(typename TDim::value_type i(0u); i<TDim::value; ++i)
-            {
-                gridThreadExtent[i] =
-                    static_cast<TIdx>(
-                        std::ceil(
-                            static_cast<double>(gridElemExtent[i])
-                            / static_cast<double>(threadElemExtent[i])));
-            }
+        ///////////////////////////////////////////////////////////////////
+        // Handle the given threadElemExtent. After this only the blockThreadExtent has to be optimized.
 
-            ///////////////////////////////////////////////////////////////////
-            // Try to calculate an optimal blockThreadExtent.
+        // Restrict the thread elem extent with the grid elem extent.
+        for(typename TDim::value_type i(0); i < TDim::value; ++i)
+        {
+            threadElemExtent[i] = std::min(threadElemExtent[i], gridElemExtent[i]);
+        }
 
-            // Initialize the block thread extent with the maximum possible.
-            auto blockThreadExtent(accDevProps.m_blockThreadExtentMax);
+        // Calculate the grid thread extent.
+        auto gridThreadExtent(Vec<TDim, TIdx>::zeros());
+        for(typename TDim::value_type i(0u); i < TDim::value; ++i)
+        {
+            gridThreadExtent[i] = static_cast<TIdx>(
+                std::ceil(static_cast<double>(gridElemExtent[i]) / static_cast<double>(threadElemExtent[i])));
+        }
+
+        ///////////////////////////////////////////////////////////////////
+        // Try to calculate an optimal blockThreadExtent.
+
+        // Initialize the block thread extent with the maximum possible.
+        auto blockThreadExtent(accDevProps.m_blockThreadExtentMax);
+
+        // Restrict the max block thread extent with the grid thread extent.
+        // This removes dimensions not required in the grid thread extent.
+        // This has to be done before the blockThreadCountMax clipping to get the maximum correctly.
+        for(typename TDim::value_type i(0u); i < TDim::value; ++i)
+        {
+            blockThreadExtent[i] = std::min(blockThreadExtent[i], gridThreadExtent[i]);
+        }
 
-            // Restrict the max block thread extent with the grid thread extent.
-            // This removes dimensions not required in the grid thread extent.
-            // This has to be done before the blockThreadCountMax clipping to get the maximum correctly.
-            for(typename TDim::value_type i(0u); i<TDim::value; ++i)
+        // For equal block thread extent, restrict it to its minimum component.
+        // For example (512, 256, 1024) will get (256, 256, 256).
+        if(gridBlockExtentSubDivRestrictions == GridBlockExtentSubDivRestrictions::EqualExtent)
+        {
+            auto const minBlockThreadExtent(blockThreadExtent.min());
+            for(typename TDim::value_type i(0u); i < TDim::value; ++i)
             {
-                blockThreadExtent[i] = std::min(blockThreadExtent[i], gridThreadExtent[i]);
+                blockThreadExtent[i] = minBlockThreadExtent;
             }
+        }
+
+        auto const& blockThreadCountMax(accDevProps.m_blockThreadCountMax);
+        // Adjust blockThreadExtent if its product is too large.
+        if(blockThreadExtent.prod() > blockThreadCountMax)
+        {
+            // Satisfy the following equation:
+            // blockThreadCountMax >= blockThreadExtent.prod()
+            // For example 1024 >= 512 * 512 * 1024
 
-            // For equal block thread extent, restrict it to its minimum component.
-            // For example (512, 256, 1024) will get (256, 256, 256).
+            // For equal block thread extent this is easily the nth root of blockThreadCountMax.
             if(gridBlockExtentSubDivRestrictions == GridBlockExtentSubDivRestrictions::EqualExtent)
             {
-                auto const minBlockThreadExtent(blockThreadExtent.min());
-                for(typename TDim::value_type i(0u); i<TDim::value; ++i)
+                double const fNthRoot(
+                    std::pow(static_cast<double>(blockThreadCountMax), 1.0 / static_cast<double>(TDim::value)));
+                TIdx const nthRoot(static_cast<TIdx>(fNthRoot));
+                for(typename TDim::value_type i(0u); i < TDim::value; ++i)
                 {
-                    blockThreadExtent[i] = minBlockThreadExtent;
+                    blockThreadExtent[i] = nthRoot;
                 }
             }
-
-            auto const & blockThreadCountMax(accDevProps.m_blockThreadCountMax);
-            // Adjust blockThreadExtent if its product is too large.
-            if(blockThreadExtent.prod() > blockThreadCountMax)
+            else if(gridBlockExtentSubDivRestrictions == GridBlockExtentSubDivRestrictions::CloseToEqualExtent)
             {
-                // Satisfy the following equation:
-                // blockThreadCountMax >= blockThreadExtent.prod()
-                // For example 1024 >= 512 * 512 * 1024
-
-                // For equal block thread extent this is easily the nth root of blockThreadCountMax.
-                if(gridBlockExtentSubDivRestrictions == GridBlockExtentSubDivRestrictions::EqualExtent)
-                {
-                    double const fNthRoot(std::pow(blockThreadCountMax, 1.0/static_cast<double>(TDim::value)));
-                    TIdx const nthRoot(static_cast<TIdx>(fNthRoot));
-                    for(typename TDim::value_type i(0u); i<TDim::value; ++i)
-                    {
-                        blockThreadExtent[i] = nthRoot;
-                    }
-                }
-                else if(gridBlockExtentSubDivRestrictions == GridBlockExtentSubDivRestrictions::CloseToEqualExtent)
+                // Very primitive clipping. Just halve the largest value until it fits.
+                while(blockThreadExtent.prod() > blockThreadCountMax)
                 {
-                    // Very primitive clipping. Just halve the largest value until it fits.
-                    while(blockThreadExtent.prod() > blockThreadCountMax)
-                    {
-                        auto const maxElemIdx(blockThreadExtent.maxElem());
-                        blockThreadExtent[maxElemIdx] = blockThreadExtent[maxElemIdx] / static_cast<TIdx>(2u);
-                    }
+                    auto const maxElemIdx(blockThreadExtent.maxElem());
+                    blockThreadExtent[maxElemIdx] = blockThreadExtent[maxElemIdx] / static_cast<TIdx>(2u);
                 }
-                else
+            }
+            else
+            {
+                // Very primitive clipping. Just halve the smallest value until it fits.
+                while(blockThreadExtent.prod() > blockThreadCountMax)
                 {
-                    // Very primitive clipping. Just halve the smallest value until it fits.
-                    while(blockThreadExtent.prod() > blockThreadCountMax)
-                    {
-                        // Compute the minimum element index but ignore ones.
-                        // Ones compare always larger to everything else.
-                        auto const minElemIdx(
-                            static_cast<TIdx>(
-                                std::distance(
-                                    &blockThreadExtent[0u],
-                                    std::min_element(
-                                        &blockThreadExtent[0u],
-                                        &blockThreadExtent[TDim::value-1u],
-                                        [](TIdx const & a, TIdx const & b)
-                                        {
-                                            // This first case is redundant.
-                                            /*if((a == 1u) && (b == 1u))
-                                            {
-                                                return false;
-                                            }
-                                            else */if(a == static_cast<TIdx>(1u))
-                                            {
-                                                return false;
-                                            }
-                                            else if(b == static_cast<TIdx>(1u))
-                                            {
-                                                return true;
-                                            }
-                                            else
-                                            {
-                                                return a < b;
-                                            }
-                                        }))));
-                        blockThreadExtent[minElemIdx] = blockThreadExtent[minElemIdx] / static_cast<TIdx>(2u);
-                    }
+                    // Compute the minimum element index but ignore ones.
+                    // Ones compare always larger to everything else.
+                    auto const minElemIdx(static_cast<TIdx>(std::distance(
+                        &blockThreadExtent[0u],
+                        std::min_element(
+                            &blockThreadExtent[0u],
+                            &blockThreadExtent[TDim::value - 1u],
+                            [](TIdx const& a, TIdx const& b) {
+                                // This first case is redundant.
+                                /*if((a == 1u) && (b == 1u))
+                                {
+                                    return false;
+                                }
+                                else */
+                                if(a == static_cast<TIdx>(1u))
+                                {
+                                    return false;
+                                }
+                                else if(b == static_cast<TIdx>(1u))
+                                {
+                                    return true;
+                                }
+                                else
+                                {
+                                    return a < b;
+                                }
+                            }))));
+                    blockThreadExtent[minElemIdx] = blockThreadExtent[minElemIdx] / static_cast<TIdx>(2u);
                 }
             }
+        }
 
-            // Make the block thread extent divide the grid thread extent.
-            if(requireBlockThreadExtentToDivideGridThreadExtent)
+        // Make the block thread extent divide the grid thread extent.
+        if(requireBlockThreadExtentToDivideGridThreadExtent)
+        {
+            if(gridBlockExtentSubDivRestrictions == GridBlockExtentSubDivRestrictions::EqualExtent)
             {
-                if(gridBlockExtentSubDivRestrictions == GridBlockExtentSubDivRestrictions::EqualExtent)
+                // For equal size block extent we have to compute the gcd of all grid thread extent that is less then
+                // the current maximal block thread extent. For this we compute the divisors of all grid thread extent
+                // less then the current maximal block thread extent.
+                std::array<std::set<TIdx>, TDim::value> gridThreadExtentDivisors;
+                for(typename TDim::value_type i(0u); i < TDim::value; ++i)
                 {
-                    // For equal size block extent we have to compute the gcd of all grid thread extent that is less then the current maximal block thread extent.
-                    // For this we compute the divisors of all grid thread extent less then the current maximal block thread extent.
-                    std::array<std::set<TIdx>, TDim::value> gridThreadExtentDivisors;
-                    for(typename TDim::value_type i(0u); i<TDim::value; ++i)
-                    {
-                        gridThreadExtentDivisors[i] =
-                            detail::allDivisorsLessOrEqual(
-                                gridThreadExtent[i],
-                                blockThreadExtent[i]);
-                    }
-                    // The maximal common divisor of all block thread extent is the optimal solution.
-                    std::set<TIdx> intersects[2u];
-                    for(typename TDim::value_type i(1u); i<TDim::value; ++i)
-                    {
-                        intersects[(i-1u)%2u] = gridThreadExtentDivisors[0];
-                        intersects[(i)%2u].clear();
-                        set_intersection(
-                            intersects[(i-1u)%2u].begin(),
-                            intersects[(i-1u)%2u].end(),
-                            gridThreadExtentDivisors[i].begin(),
-                            gridThreadExtentDivisors[i].end(),
-                            std::inserter(intersects[i%2], intersects[i%2u].begin()));
-                    }
-                    TIdx const maxCommonDivisor(*(--intersects[(TDim::value-1)%2u].end()));
-                    for(typename TDim::value_type i(0u); i<TDim::value; ++i)
-                    {
-                        blockThreadExtent[i] = maxCommonDivisor;
-                    }
+                    gridThreadExtentDivisors[i]
+                        = detail::allDivisorsLessOrEqual(gridThreadExtent[i], blockThreadExtent[i]);
                 }
-                else if(gridBlockExtentSubDivRestrictions == GridBlockExtentSubDivRestrictions::CloseToEqualExtent)
+                // The maximal common divisor of all block thread extent is the optimal solution.
+                std::set<TIdx> intersects[2u];
+                for(typename TDim::value_type i(1u); i < TDim::value; ++i)
                 {
-                    for(typename TDim::value_type i(0u); i<TDim::value; ++i)
-                    {
-                        blockThreadExtent[i] =
-                            detail::nextDivisorLowerOrEqual(
-                                blockThreadExtent[i],
-                                gridThreadExtent[i]);
-                    }
+                    intersects[(i - 1u) % 2u] = gridThreadExtentDivisors[0];
+                    intersects[(i) % 2u].clear();
+                    set_intersection(
+                        intersects[(i - 1u) % 2u].begin(),
+                        intersects[(i - 1u) % 2u].end(),
+                        gridThreadExtentDivisors[i].begin(),
+                        gridThreadExtentDivisors[i].end(),
+                        std::inserter(intersects[i % 2], intersects[i % 2u].begin()));
                 }
-                else
+                TIdx const maxCommonDivisor(*(--intersects[(TDim::value - 1) % 2u].end()));
+                for(typename TDim::value_type i(0u); i < TDim::value; ++i)
                 {
-                    for(typename TDim::value_type i(0u); i<TDim::value; ++i)
-                    {
-                        blockThreadExtent[i] =
-                            detail::nextDivisorLowerOrEqual(
-                                blockThreadExtent[i],
-                                gridThreadExtent[i]);
-                    }
+                    blockThreadExtent[i] = maxCommonDivisor;
                 }
             }
-
-            ///////////////////////////////////////////////////////////////////
-            // Compute the gridBlockExtent.
-
-            // Set the grid block extent (rounded to the next integer not less then the quotient.
-            auto gridBlockExtent(vec::Vec<TDim, TIdx>::ones());
-            for(typename TDim::value_type i(0u); i<TDim::value; ++i)
+            else if(gridBlockExtentSubDivRestrictions == GridBlockExtentSubDivRestrictions::CloseToEqualExtent)
             {
-                gridBlockExtent[i] =
-                    static_cast<TIdx>(
-                        std::ceil(
-                            static_cast<double>(gridThreadExtent[i])
-                            / static_cast<double>(blockThreadExtent[i])));
+                for(typename TDim::value_type i(0u); i < TDim::value; ++i)
+                {
+                    blockThreadExtent[i] = detail::nextDivisorLowerOrEqual(blockThreadExtent[i], gridThreadExtent[i]);
+                }
+            }
+            else
+            {
+                for(typename TDim::value_type i(0u); i < TDim::value; ++i)
+                {
+                    blockThreadExtent[i] = detail::nextDivisorLowerOrEqual(blockThreadExtent[i], gridThreadExtent[i]);
+                }
             }
+        }
+
+        ///////////////////////////////////////////////////////////////////
+        // Compute the gridBlockExtent.
 
-            ///////////////////////////////////////////////////////////////////
-            // Return the final work division.
-            return
-                workdiv::WorkDivMembers<TDim, TIdx>(
-                    gridBlockExtent,
-                    blockThreadExtent,
-                    threadElemExtent);
+        // Set the grid block extent (rounded to the next integer not less then the quotient.
+        auto gridBlockExtent(Vec<TDim, TIdx>::ones());
+        for(typename TDim::value_type i(0u); i < TDim::value; ++i)
+        {
+            gridBlockExtent[i] = static_cast<TIdx>(
+                std::ceil(static_cast<double>(gridThreadExtent[i]) / static_cast<double>(blockThreadExtent[i])));
         }
 
-        //-----------------------------------------------------------------------------
-        //! \tparam TAcc The accelerator for which this work division has to be valid.
-        //! \tparam TGridElemExtent The type of the grid element extent.
-        //! \tparam TThreadElemExtent The type of the thread element extent.
-        //! \tparam TDev The type of the device.
-        //! \param dev
-        //!     The device the work division should be valid for.
-        //! \param gridElemExtent
-        //!     The full extent of elements in the grid.
-        //! \param threadElemExtents
-        //!     the number of elements computed per thread.
-        //! \param requireBlockThreadExtentToDivideGridThreadExtent
-        //!     If this is true, the grid thread extent will be multiples of the corresponding block thread extent.
-        //!     NOTE: If this is true and gridThreadExtent is prime (or otherwise bad chosen) in a dimension, the block thread extent will be one in this dimension.
-        //! \param gridBlockExtentSubDivRestrictions
-        //!     The grid block extent subdivision restrictions.
-        //! \return The work division.
-        template<
-            typename TAcc,
-            typename TGridElemExtent,
-            typename TThreadElemExtent,
-            typename TDev>
-        ALPAKA_FN_HOST auto getValidWorkDiv(
-            TDev const & dev,
-            TGridElemExtent const & gridElemExtent = TGridElemExtent(),
-            TThreadElemExtent const & threadElemExtents = TThreadElemExtent(),
-            bool requireBlockThreadExtentToDivideGridThreadExtent = true,
-            GridBlockExtentSubDivRestrictions gridBlockExtentSubDivRestrictions = GridBlockExtentSubDivRestrictions::Unrestricted)
-        -> workdiv::WorkDivMembers<dim::Dim<TGridElemExtent>, idx::Idx<TGridElemExtent>>
+        ///////////////////////////////////////////////////////////////////
+        // Return the final work division.
+        return WorkDivMembers<TDim, TIdx>(gridBlockExtent, blockThreadExtent, threadElemExtent);
+    }
+
+    //-----------------------------------------------------------------------------
+    //! \tparam TAcc The accelerator for which this work division has to be valid.
+    //! \tparam TGridElemExtent The type of the grid element extent.
+    //! \tparam TThreadElemExtent The type of the thread element extent.
+    //! \tparam TDev The type of the device.
+    //! \param dev
+    //!     The device the work division should be valid for.
+    //! \param gridElemExtent
+    //!     The full extent of elements in the grid.
+    //! \param threadElemExtents
+    //!     The number of elements computed per thread.
+    //! \param requireBlockThreadExtentToDivideGridThreadExtent
+    //!     If this is true, the grid thread extent will be multiples of the corresponding block thread extent.
+    //!     NOTE: If this is true and gridThreadExtent is prime (or otherwise badly chosen) in a dimension, the block
+    //!     thread extent will be one in this dimension.
+    //! \param gridBlockExtentSubDivRestrictions
+    //!     The grid block extent subdivision restrictions.
+    //! \return The work division computed by subDivideGridElems for the device properties of TAcc on dev.
+    template<typename TAcc, typename TGridElemExtent, typename TThreadElemExtent, typename TDev>
+    ALPAKA_FN_HOST auto getValidWorkDiv(
+        TDev const& dev,
+        TGridElemExtent const& gridElemExtent = TGridElemExtent(),
+        TThreadElemExtent const& threadElemExtents = TThreadElemExtent(),
+        bool requireBlockThreadExtentToDivideGridThreadExtent = true,
+        GridBlockExtentSubDivRestrictions gridBlockExtentSubDivRestrictions
+        = GridBlockExtentSubDivRestrictions::Unrestricted)
+        -> WorkDivMembers<Dim<TGridElemExtent>, Idx<TGridElemExtent>>
+    {
+        static_assert(
+            Dim<TGridElemExtent>::value == Dim<TAcc>::value,
+            "The dimension of TAcc and the dimension of TGridElemExtent have to be identical!");
+        static_assert(
+            Dim<TThreadElemExtent>::value == Dim<TAcc>::value,
+            "The dimension of TAcc and the dimension of TThreadElemExtent have to be identical!");
+        static_assert(
+            std::is_same<Idx<TGridElemExtent>, Idx<TAcc>>::value,
+            "The idx type of TAcc and the idx type of TGridElemExtent have to be identical!");
+        static_assert(
+            std::is_same<Idx<TThreadElemExtent>, Idx<TAcc>>::value,
+            "The idx type of TAcc and the idx type of TThreadElemExtent have to be identical!");
+
+        return subDivideGridElems(
+            extent::getExtentVec(gridElemExtent),
+            extent::getExtentVec(threadElemExtents),
+            getAccDevProps<TAcc>(dev),
+            requireBlockThreadExtentToDivideGridThreadExtent,
+            gridBlockExtentSubDivRestrictions);
+    }
+
+    //-----------------------------------------------------------------------------
+    //! \tparam TDim The dimensionality of the accelerator device properties.
+    //! \tparam TIdx The idx type of the accelerator device properties.
+    //! \tparam TWorkDiv The type of the work division.
+    //! \param accDevProps The maxima for the work division.
+    //! \param workDiv The work division to test for validity.
+    //! \return true if the work division is valid for the given accelerator device properties, false otherwise.
+    template<typename TDim, typename TIdx, typename TWorkDiv>
+    ALPAKA_FN_HOST auto isValidWorkDiv(AccDevProps<TDim, TIdx> const& accDevProps, TWorkDiv const& workDiv) -> bool
+    {
+        // Store the maxima allowed for extents of grid, blocks and threads.
+        auto const gridBlockExtentMax(subVecEnd<Dim<TWorkDiv>>(accDevProps.m_gridBlockExtentMax));
+        auto const blockThreadExtentMax(subVecEnd<Dim<TWorkDiv>>(accDevProps.m_blockThreadExtentMax));
+        auto const threadElemExtentMax(subVecEnd<Dim<TWorkDiv>>(accDevProps.m_threadElemExtentMax));
+
+        // Get the grid block, block thread and thread element extents of the work division to check.
+        auto const gridBlockExtent(getWorkDiv<Grid, Blocks>(workDiv));
+        auto const blockThreadExtent(getWorkDiv<Block, Threads>(workDiv));
+        auto const threadElemExtent(getWorkDiv<Thread, Elems>(workDiv));
+
+        // Check that the maximal counts are satisfied.
+        if(accDevProps.m_gridBlockCountMax < gridBlockExtent.prod())
+        {
+            return false;
+        }
+        if(accDevProps.m_blockThreadCountMax < blockThreadExtent.prod())
+        {
+            return false;
+        }
+        if(accDevProps.m_threadElemCountMax < threadElemExtent.prod())
         {
-            static_assert(
-                dim::Dim<TGridElemExtent>::value == dim::Dim<TAcc>::value,
-                "The dimension of TAcc and the dimension of TGridElemExtent have to be identical!");
-            static_assert(
-                dim::Dim<TThreadElemExtent>::value == dim::Dim<TAcc>::value,
-                "The dimension of TAcc and the dimension of TThreadElemExtent have to be identical!");
-            static_assert(
-                std::is_same<idx::Idx<TGridElemExtent>, idx::Idx<TAcc>>::value,
-                "The idx type of TAcc and the idx type of TGridElemExtent have to be identical!");
-            static_assert(
-                std::is_same<idx::Idx<TThreadElemExtent>, idx::Idx<TAcc>>::value,
-                "The idx type of TAcc and the idx type of TThreadElemExtent have to be identical!");
-
-            return subDivideGridElems(
-                extent::getExtentVec(gridElemExtent),
-                extent::getExtentVec(threadElemExtents),
-                acc::getAccDevProps<TAcc>(dev),
-                requireBlockThreadExtentToDivideGridThreadExtent,
-                gridBlockExtentSubDivRestrictions);
+            return false;
         }
 
-        //-----------------------------------------------------------------------------
-        //! \tparam TDim The dimensionality of the accelerator device properties.
-        //! \tparam TIdx The idx type of the accelerator device properties.
-        //! \tparam TWorkDiv The type of the work division.
-        //! \param accDevProps The maxima for the work division.
-        //! \param workDiv The work division to test for validity.
-        //! \return If the work division is valid for the given accelerator device properties.
-        template<
-            typename TDim,
-            typename TIdx,
-            typename TWorkDiv>
-        ALPAKA_FN_HOST auto isValidWorkDiv(
-            acc::AccDevProps<TDim, TIdx> const & accDevProps,
-            TWorkDiv const & workDiv)
-        -> bool
+        // Check that the extents for all dimensions are correct.
+        for(typename Dim<TWorkDiv>::value_type i(0); i < Dim<TWorkDiv>::value; ++i)
         {
-            // Store the maxima allowed for extents of grid, blocks and threads.
-            auto const gridBlockExtentMax(vec::subVecEnd<dim::Dim<TWorkDiv>>(accDevProps.m_gridBlockExtentMax));
-            auto const blockThreadExtentMax(vec::subVecEnd<dim::Dim<TWorkDiv>>(accDevProps.m_blockThreadExtentMax));
-            auto const threadElemExtentMax(vec::subVecEnd<dim::Dim<TWorkDiv>>(accDevProps.m_threadElemExtentMax));
-
-            // Get the extents of grid, blocks and threads of the work division to check.
-            auto const gridBlockExtent(getWorkDiv<Grid, Blocks>(workDiv));
-            auto const blockThreadExtent(getWorkDiv<Block, Threads>(workDiv));
-            auto const threadElemExtent(getWorkDiv<Block, Threads>(workDiv));
-
-            // Check that the maximal counts are satisfied.
-            if(accDevProps.m_gridBlockCountMax < gridBlockExtent.prod())
+            // No extent is allowed to be zero or greater than the allowed maximum.
+            if((gridBlockExtent[i] < 1) || (blockThreadExtent[i] < 1) || (threadElemExtent[i] < 1)
+               || (gridBlockExtentMax[i] < gridBlockExtent[i]) || (blockThreadExtentMax[i] < blockThreadExtent[i])
+               || (threadElemExtentMax[i] < threadElemExtent[i]))
             {
                 return false;
             }
-            if(accDevProps.m_blockThreadCountMax < blockThreadExtent.prod())
-            {
-                return false;
-            }
-            if(accDevProps.m_threadElemCountMax < threadElemExtent.prod())
-            {
-                return false;
-            }
-
-            // Check that the extents for all dimensions are correct.
-            for(typename dim::Dim<TWorkDiv>::value_type i(0); i<dim::Dim<TWorkDiv>::value; ++i)
-            {
-                // No extent is allowed to be zero or greater then the allowed maximum.
-                if((gridBlockExtent[i] < 1)
-                    || (blockThreadExtent[i] < 1)
-                    || (threadElemExtent[i] < 1)
-                    || (gridBlockExtentMax[i] < gridBlockExtent[i])
-                    || (blockThreadExtentMax[i] < blockThreadExtent[i])
-                    || (threadElemExtentMax[i] < threadElemExtent[i]))
-                {
-                    return false;
-                }
-            }
-
-            return true;
-        }
-        //-----------------------------------------------------------------------------
-        //! \tparam TAcc The accelerator to test the validity on.
-        //! \param dev The device to test the work division for validity on.
-        //! \param workDiv The work division to test for validity.
-        //! \return If the work division is valid on this accelerator.
-        template<
-            typename TAcc,
-            typename TDev,
-            typename TWorkDiv>
-        ALPAKA_FN_HOST auto isValidWorkDiv(
-            TDev const & dev,
-            TWorkDiv const & workDiv)
-        -> bool
-        {
-            return
-                workdiv::isValidWorkDiv(
-                    acc::getAccDevProps<TAcc>(dev),
-                    workDiv);
         }
+
+        return true;
+    }
+    //-----------------------------------------------------------------------------
+    //! \tparam TAcc The accelerator to test the validity on.
+    //! \param dev The device to test the work division for validity on.
+    //! \param workDiv The work division to test for validity.
+    //! \return true if the work division is valid for the device properties of TAcc on dev, false otherwise.
+    template<typename TAcc, typename TDev, typename TWorkDiv>
+    ALPAKA_FN_HOST auto isValidWorkDiv(TDev const& dev, TWorkDiv const& workDiv) -> bool
+    {
+        return isValidWorkDiv(getAccDevProps<TAcc>(dev), workDiv);
     }
-}
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/include/alpaka/workdiv/WorkDivHipBuiltIn.hpp b/thirdParty/cupla/alpaka/include/alpaka/workdiv/WorkDivHipBuiltIn.hpp
deleted file mode 100644
index c0f9353651..0000000000
--- a/thirdParty/cupla/alpaka/include/alpaka/workdiv/WorkDivHipBuiltIn.hpp
+++ /dev/null
@@ -1,197 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Matthias Werner
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#pragma once
-
-#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
-
-#include <alpaka/core/BoostPredef.hpp>
-
-#if !BOOST_LANG_HIP
-    #error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
-#endif
-
-#include <alpaka/workdiv/Traits.hpp>
-#include <alpaka/idx/Traits.hpp>
-
-#include <alpaka/core/Hip.hpp>
-#include <alpaka/core/Unused.hpp>
-#include <alpaka/vec/Vec.hpp>
-
-#include <hip/hip_runtime.h>
-
-
-namespace alpaka
-{
-    namespace workdiv
-    {
-        //#############################################################################
-        //! The GPU HIP accelerator work division.
-        template<
-            typename TDim,
-            typename TIdx>
-        class WorkDivHipBuiltIn : public concepts::Implements<ConceptWorkDiv, WorkDivHipBuiltIn<TDim, TIdx>>
-        {
-        public:
-            //-----------------------------------------------------------------------------
-            //! Default constructor.
-            __device__ WorkDivHipBuiltIn(
-                vec::Vec<TDim, TIdx> const & threadElemExtent) :
-                    m_threadElemExtent(threadElemExtent)
-            {}
-            //-----------------------------------------------------------------------------
-            //! Copy constructor.
-            __device__ WorkDivHipBuiltIn(WorkDivHipBuiltIn const &) = delete;
-            //-----------------------------------------------------------------------------
-            //! Move constructor.
-            __device__ WorkDivHipBuiltIn(WorkDivHipBuiltIn &&) = delete;
-            //-----------------------------------------------------------------------------
-            //! Copy assignment operator.
-            __device__ auto operator=(WorkDivHipBuiltIn const &) -> WorkDivHipBuiltIn & = delete;
-            //-----------------------------------------------------------------------------
-            //! Move assignment operator.
-            __device__ auto operator=(WorkDivHipBuiltIn &&) -> WorkDivHipBuiltIn & = delete;
-            //-----------------------------------------------------------------------------
-            //! Destructor.
-            /*virtual*/ ALPAKA_FN_HOST_ACC ~WorkDivHipBuiltIn() = default;
-
-        public:
-            // \TODO: Optimize! Add WorkDivHipBuiltInNoElems that has no member m_threadElemExtent as well as AccGpuHipRtNoElems.
-            // Use it instead of AccGpuHipRt if the thread element extent is one to reduce the register usage.
-            vec::Vec<TDim, TIdx> const & m_threadElemExtent;
-        };
-    }
-
-    namespace dim
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The GPU HIP accelerator work division dimension get trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct DimType<
-                workdiv::WorkDivHipBuiltIn<TDim, TIdx>>
-            {
-                using type = TDim;
-            };
-        }
-    }
-    namespace idx
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The GPU HIP accelerator work division idx type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct IdxType<
-                workdiv::WorkDivHipBuiltIn<TDim, TIdx>>
-            {
-                using type = TIdx;
-            };
-        }
-    }
-    namespace workdiv
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The GPU HIP accelerator work division grid block extent trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct GetWorkDiv<
-                WorkDivHipBuiltIn<TDim, TIdx>,
-                origin::Grid,
-                unit::Blocks>
-            {
-                //-----------------------------------------------------------------------------
-                //! \return The number of blocks in each dimension of the grid.
-                ALPAKA_NO_HOST_ACC_WARNING
-#if defined(BOOST_COMP_HCC) && BOOST_COMP_HCC /* hcc requires matching host-device signature */
-                ALPAKA_FN_HOST_ACC
-#else /* nvcc does not know about blockDim.x etc. on host */
-                __device__
-#endif
-                static auto getWorkDiv(
-                    WorkDivHipBuiltIn<TDim, TIdx> const & workDiv)
-                -> vec::Vec<TDim, TIdx>
-                {
-                    alpaka::ignore_unused(workDiv);
-
-                    return extent::getExtentVecEnd<TDim>(
-                        vec::Vec<
-                          std::integral_constant<typename TDim::value_type, 3>, TIdx>(
-                            static_cast<TIdx>(hipGridDim_z),
-                            static_cast<TIdx>(hipGridDim_y),
-                            static_cast<TIdx>(hipGridDim_x)));
-                }
-            };
-
-            //#############################################################################
-            //! The GPU HIP accelerator work division block thread extent trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct GetWorkDiv<
-                WorkDivHipBuiltIn<TDim, TIdx>,
-                origin::Block,
-                unit::Threads>
-            {
-                //-----------------------------------------------------------------------------
-                //! \return The number of threads in each dimension of a block.
-                ALPAKA_NO_HOST_ACC_WARNING
-#if defined(BOOST_COMP_HCC) && BOOST_COMP_HCC /* hcc requires matching host-device signature */
-                ALPAKA_FN_HOST_ACC
-#else /* nvcc does not know about blockDim.x etc. on host */
-                __device__
-#endif
-                static auto getWorkDiv(
-                    WorkDivHipBuiltIn<TDim, TIdx> const & workDiv)
-                -> vec::Vec<TDim, TIdx>
-                {
-                    alpaka::ignore_unused(workDiv);
-
-                    return extent::getExtentVecEnd<TDim>(
-                        vec::Vec<
-                          std::integral_constant<typename TDim::value_type, 3>, TIdx>(
-                            static_cast<TIdx>(hipBlockDim_z),
-                            static_cast<TIdx>(hipBlockDim_y),
-                            static_cast<TIdx>(hipBlockDim_x)));
-                }
-            };
-
-            //#############################################################################
-            //! The GPU HIP accelerator work division thread element extent trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct GetWorkDiv<
-                WorkDivHipBuiltIn<TDim, TIdx>,
-                origin::Thread,
-                unit::Elems>
-            {
-                //-----------------------------------------------------------------------------
-                //! \return The number of blocks in each dimension of the grid.
-                ALPAKA_NO_HOST_ACC_WARNING
-                ALPAKA_FN_HOST_ACC static auto getWorkDiv(
-                    WorkDivHipBuiltIn<TDim, TIdx> const & workDiv)
-                -> vec::Vec<TDim, TIdx>
-                {
-                    return workDiv.m_threadElemExtent;
-                }
-            };
-        }
-    }
-}
-
-#endif
diff --git a/thirdParty/cupla/alpaka/include/alpaka/workdiv/WorkDivMembers.hpp b/thirdParty/cupla/alpaka/include/alpaka/workdiv/WorkDivMembers.hpp
index 7815908dc2..31a322bda6 100644
--- a/thirdParty/cupla/alpaka/include/alpaka/workdiv/WorkDivMembers.hpp
+++ b/thirdParty/cupla/alpaka/include/alpaka/workdiv/WorkDivMembers.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Benjamin Worpitz, Matthias Werner
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -9,210 +9,151 @@
 
 #pragma once
 
-#include <alpaka/workdiv/Traits.hpp>
+#include <alpaka/core/Common.hpp>
 #include <alpaka/idx/Traits.hpp>
-
 #include <alpaka/vec/Vec.hpp>
-#include <alpaka/core/Common.hpp>
+#include <alpaka/workdiv/Traits.hpp>
 
 #include <iosfwd>
 
 namespace alpaka
 {
-    namespace workdiv
+    //#############################################################################
+    //! A basic class holding the work division as grid block extent, block thread extent and thread element extent.
+    template<typename TDim, typename TIdx>
+    class WorkDivMembers : public concepts::Implements<ConceptWorkDiv, WorkDivMembers<TDim, TIdx>>
     {
-        //#############################################################################
-        //! A basic class holding the work division as grid block extent, block thread and thread element extent.
-        template<
-            typename TDim,
-            typename TIdx>
-        class WorkDivMembers : public concepts::Implements<ConceptWorkDiv, WorkDivMembers<TDim, TIdx>>
+    public:
+        //-----------------------------------------------------------------------------
+        ALPAKA_FN_HOST_ACC WorkDivMembers() = delete;
+        //-----------------------------------------------------------------------------
+        ALPAKA_NO_HOST_ACC_WARNING
+        template<typename TGridBlockExtent, typename TBlockThreadExtent, typename TThreadElemExtent>
+        ALPAKA_FN_HOST_ACC explicit WorkDivMembers(
+            TGridBlockExtent const& gridBlockExtent = TGridBlockExtent(),
+            TBlockThreadExtent const& blockThreadExtent = TBlockThreadExtent(),
+            TThreadElemExtent const& threadElemExtent = TThreadElemExtent())
+            : m_gridBlockExtent(extent::getExtentVecEnd<TDim>(gridBlockExtent))
+            , m_blockThreadExtent(extent::getExtentVecEnd<TDim>(blockThreadExtent))
+            , m_threadElemExtent(extent::getExtentVecEnd<TDim>(threadElemExtent))
         {
-        public:
-            //-----------------------------------------------------------------------------
-            ALPAKA_FN_HOST_ACC WorkDivMembers() = delete;
-            //-----------------------------------------------------------------------------
-            ALPAKA_NO_HOST_ACC_WARNING
-            template<
-                typename TGridBlockExtent,
-                typename TBlockThreadExtent,
-                typename TThreadElemExtent>
-            ALPAKA_FN_HOST_ACC explicit WorkDivMembers(
-                TGridBlockExtent const & gridBlockExtent = TGridBlockExtent(),
-                TBlockThreadExtent const & blockThreadExtent = TBlockThreadExtent(),
-                TThreadElemExtent const & threadElemExtent = TThreadElemExtent()) :
-                m_gridBlockExtent(extent::getExtentVecEnd<TDim>(gridBlockExtent)),
-                m_blockThreadExtent(extent::getExtentVecEnd<TDim>(blockThreadExtent)),
-                m_threadElemExtent(extent::getExtentVecEnd<TDim>(threadElemExtent))
-            {}
-            //-----------------------------------------------------------------------------
-            ALPAKA_NO_HOST_ACC_WARNING
-            ALPAKA_FN_HOST_ACC explicit WorkDivMembers(
-                WorkDivMembers const & other) :
-                    m_gridBlockExtent(other.m_gridBlockExtent),
-                    m_blockThreadExtent(other.m_blockThreadExtent),
-                    m_threadElemExtent(other.m_threadElemExtent)
-            {}
-            //-----------------------------------------------------------------------------
-            ALPAKA_NO_HOST_ACC_WARNING
-            template<
-                typename TWorkDiv>
-            ALPAKA_FN_HOST_ACC explicit WorkDivMembers(
-                TWorkDiv const & other) :
-                    m_gridBlockExtent(vec::subVecEnd<TDim>(getWorkDiv<Grid, Blocks>(other))),
-                    m_blockThreadExtent(vec::subVecEnd<TDim>(getWorkDiv<Block, Threads>(other))),
-                    m_threadElemExtent(vec::subVecEnd<TDim>(getWorkDiv<Thread, Elems>(other)))
-            {}
-            //-----------------------------------------------------------------------------
-            ALPAKA_NO_HOST_ACC_WARNING
-            ALPAKA_FN_HOST_ACC
-            WorkDivMembers(WorkDivMembers &&) = default;
-            //-----------------------------------------------------------------------------
-            ALPAKA_NO_HOST_ACC_WARNING
-            ALPAKA_FN_HOST_ACC
-            auto operator=(WorkDivMembers const &) -> WorkDivMembers & = default;
-            //-----------------------------------------------------------------------------
-            ALPAKA_NO_HOST_ACC_WARNING
-            ALPAKA_FN_HOST_ACC
-            auto operator=(WorkDivMembers &&) -> WorkDivMembers & = default;
-            //-----------------------------------------------------------------------------
-            ALPAKA_NO_HOST_ACC_WARNING
-            template<
-                typename TWorkDiv>
-            ALPAKA_FN_HOST_ACC auto operator=(
-                TWorkDiv const & other)
-            -> WorkDivMembers<TDim, TIdx> &
-            {
-                m_gridBlockExtent = vec::subVecEnd<TDim>(getWorkDiv<Grid, Blocks>(other));
-                m_blockThreadExtent = vec::subVecEnd<TDim>(getWorkDiv<Block, Threads>(other));
-                m_threadElemExtent = vec::subVecEnd<TDim>(getWorkDiv<Thread, Elems>(other));
-                return *this;
-            }
-            //-----------------------------------------------------------------------------
-            ALPAKA_NO_HOST_ACC_WARNING
-            /*virtual*/ ALPAKA_FN_HOST_ACC ~WorkDivMembers() = default;
-
-        public:
-            vec::Vec<TDim, TIdx> m_gridBlockExtent;
-            vec::Vec<TDim, TIdx> m_blockThreadExtent;
-            vec::Vec<TDim, TIdx> m_threadElemExtent;
-        };
-
+        }
         //-----------------------------------------------------------------------------
-        template<
-            typename TDim,
-            typename TIdx>
-        ALPAKA_FN_HOST auto operator<<(
-            std::ostream & os,
-            WorkDivMembers<TDim, TIdx> const & workDiv)
-        -> std::ostream &
+        ALPAKA_NO_HOST_ACC_WARNING
+        ALPAKA_FN_HOST_ACC explicit WorkDivMembers(WorkDivMembers const& other)
+            : m_gridBlockExtent(other.m_gridBlockExtent)
+            , m_blockThreadExtent(other.m_blockThreadExtent)
+            , m_threadElemExtent(other.m_threadElemExtent)
         {
-            return (os
-                << "{gridBlockExtent: " << workDiv.m_gridBlockExtent
-                << ", blockThreadExtent: " << workDiv.m_blockThreadExtent
-                << ", threadElemExtent: " << workDiv.m_threadElemExtent
-                << "}");
         }
-    }
-
-    namespace dim
-    {
-        namespace traits
+        //-----------------------------------------------------------------------------
+        ALPAKA_NO_HOST_ACC_WARNING
+        template<typename TWorkDiv>
+        ALPAKA_FN_HOST_ACC explicit WorkDivMembers(TWorkDiv const& other)
+            : m_gridBlockExtent(subVecEnd<TDim>(getWorkDiv<Grid, Blocks>(other)))
+            , m_blockThreadExtent(subVecEnd<TDim>(getWorkDiv<Block, Threads>(other)))
+            , m_threadElemExtent(subVecEnd<TDim>(getWorkDiv<Thread, Elems>(other)))
         {
-            //#############################################################################
-            //! The WorkDivMembers dimension get trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct DimType<
-                workdiv::WorkDivMembers<TDim, TIdx>>
-            {
-                using type = TDim;
-            };
         }
-    }
-    namespace idx
-    {
-        namespace traits
+        //-----------------------------------------------------------------------------
+        ALPAKA_NO_HOST_ACC_WARNING
+        ALPAKA_FN_HOST_ACC
+        WorkDivMembers(WorkDivMembers&&) = default;
+        //-----------------------------------------------------------------------------
+        ALPAKA_NO_HOST_ACC_WARNING
+        ALPAKA_FN_HOST_ACC
+        auto operator=(WorkDivMembers const&) -> WorkDivMembers& = default;
+        //-----------------------------------------------------------------------------
+        ALPAKA_NO_HOST_ACC_WARNING
+        ALPAKA_FN_HOST_ACC
+        auto operator=(WorkDivMembers&&) -> WorkDivMembers& = default;
+        //-----------------------------------------------------------------------------
+        ALPAKA_NO_HOST_ACC_WARNING
+        template<typename TWorkDiv>
+        ALPAKA_FN_HOST_ACC auto operator=(TWorkDiv const& other) -> WorkDivMembers<TDim, TIdx>&
         {
-            //#############################################################################
-            //! The WorkDivMembers idx type trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct IdxType<
-                workdiv::WorkDivMembers<TDim, TIdx>>
-            {
-                using type = TIdx;
-            };
+            m_gridBlockExtent = subVecEnd<TDim>(getWorkDiv<Grid, Blocks>(other));
+            m_blockThreadExtent = subVecEnd<TDim>(getWorkDiv<Block, Threads>(other));
+            m_threadElemExtent = subVecEnd<TDim>(getWorkDiv<Thread, Elems>(other));
+            return *this;
         }
+        //-----------------------------------------------------------------------------
+        ALPAKA_NO_HOST_ACC_WARNING
+        /*virtual*/ ALPAKA_FN_HOST_ACC ~WorkDivMembers() = default;
+
+    public:
+        Vec<TDim, TIdx> m_gridBlockExtent;
+        Vec<TDim, TIdx> m_blockThreadExtent;
+        Vec<TDim, TIdx> m_threadElemExtent;
+    };
+
+    //-----------------------------------------------------------------------------
+    template<typename TDim, typename TIdx>
+    ALPAKA_FN_HOST auto operator<<(std::ostream& os, WorkDivMembers<TDim, TIdx> const& workDiv) -> std::ostream&
+    {
+        return (
+            os << "{gridBlockExtent: " << workDiv.m_gridBlockExtent << ", blockThreadExtent: "
+               << workDiv.m_blockThreadExtent << ", threadElemExtent: " << workDiv.m_threadElemExtent << "}");
     }
-    namespace workdiv
+
+    namespace traits
     {
-        namespace traits
+        //#############################################################################
+        //! The WorkDivMembers dimension get trait specialization.
+        template<typename TDim, typename TIdx>
+        struct DimType<WorkDivMembers<TDim, TIdx>>
+        {
+            using type = TDim;
+        };
+
+        //#############################################################################
+        //! The WorkDivMembers idx type trait specialization.
+        template<typename TDim, typename TIdx>
+        struct IdxType<WorkDivMembers<TDim, TIdx>>
+        {
+            using type = TIdx;
+        };
+
+        //#############################################################################
+        //! The WorkDivMembers grid block extent trait specialization.
+        template<typename TDim, typename TIdx>
+        struct GetWorkDiv<WorkDivMembers<TDim, TIdx>, origin::Grid, unit::Blocks>
         {
-            //#############################################################################
-            //! The WorkDivMembers grid block extent trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct GetWorkDiv<
-                WorkDivMembers<TDim, TIdx>,
-                origin::Grid,
-                unit::Blocks>
+            //-----------------------------------------------------------------------------
+            //! \return The number of blocks in each dimension of the grid.
+            ALPAKA_NO_HOST_ACC_WARNING
+            ALPAKA_FN_HOST_ACC static auto getWorkDiv(WorkDivMembers<TDim, TIdx> const& workDiv) -> Vec<TDim, TIdx>
             {
-                //-----------------------------------------------------------------------------
-                //! \return The number of blocks in each dimension of the grid.
-                ALPAKA_NO_HOST_ACC_WARNING
-                ALPAKA_FN_HOST_ACC static auto getWorkDiv(
-                    WorkDivMembers<TDim, TIdx> const & workDiv)
-                -> vec::Vec<TDim, TIdx>
-                {
-                    return workDiv.m_gridBlockExtent;
-                }
-            };
+                return workDiv.m_gridBlockExtent;
+            }
+        };
 
-            //#############################################################################
-            //! The WorkDivMembers block thread extent trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct GetWorkDiv<
-                WorkDivMembers<TDim, TIdx>,
-                origin::Block,
-                unit::Threads>
+        //#############################################################################
+        //! The WorkDivMembers block thread extent trait specialization.
+        template<typename TDim, typename TIdx>
+        struct GetWorkDiv<WorkDivMembers<TDim, TIdx>, origin::Block, unit::Threads>
+        {
+            //-----------------------------------------------------------------------------
+            //! \return The number of threads in each dimension of a block.
+            ALPAKA_NO_HOST_ACC_WARNING
+            ALPAKA_FN_HOST_ACC static auto getWorkDiv(WorkDivMembers<TDim, TIdx> const& workDiv) -> Vec<TDim, TIdx>
             {
-                //-----------------------------------------------------------------------------
-                //! \return The number of threads in each dimension of a block.
-                ALPAKA_NO_HOST_ACC_WARNING
-                ALPAKA_FN_HOST_ACC static auto getWorkDiv(
-                    WorkDivMembers<TDim, TIdx> const & workDiv)
-                -> vec::Vec<TDim, TIdx>
-                {
-                    return workDiv.m_blockThreadExtent;
-                }
-            };
+                return workDiv.m_blockThreadExtent;
+            }
+        };
 
-            //#############################################################################
-            //! The WorkDivMembers thread element extent trait specialization.
-            template<
-                typename TDim,
-                typename TIdx>
-            struct GetWorkDiv<
-                WorkDivMembers<TDim, TIdx>,
-                origin::Thread,
-                unit::Elems>
+        //#############################################################################
+        //! The WorkDivMembers thread element extent trait specialization.
+        template<typename TDim, typename TIdx>
+        struct GetWorkDiv<WorkDivMembers<TDim, TIdx>, origin::Thread, unit::Elems>
+        {
+            //-----------------------------------------------------------------------------
+            //! \return The number of elements in each dimension of a thread.
+            ALPAKA_NO_HOST_ACC_WARNING
+            ALPAKA_FN_HOST_ACC static auto getWorkDiv(WorkDivMembers<TDim, TIdx> const& workDiv) -> Vec<TDim, TIdx>
             {
-                //-----------------------------------------------------------------------------
-                //! \return The number of elements in each dimension of a thread.
-                ALPAKA_NO_HOST_ACC_WARNING
-                ALPAKA_FN_HOST_ACC static auto getWorkDiv(
-                    WorkDivMembers<TDim, TIdx> const & workDiv)
-                -> vec::Vec<TDim, TIdx>
-                {
-                    return workDiv.m_threadElemExtent;
-                }
-            };
-        }
-    }
-}
+                return workDiv.m_threadElemExtent;
+            }
+        };
+    } // namespace traits
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/include/alpaka/workdiv/WorkDivUniformCudaHipBuiltIn.hpp b/thirdParty/cupla/alpaka/include/alpaka/workdiv/WorkDivUniformCudaHipBuiltIn.hpp
new file mode 100644
index 0000000000..3e934856b7
--- /dev/null
+++ b/thirdParty/cupla/alpaka/include/alpaka/workdiv/WorkDivUniformCudaHipBuiltIn.hpp
@@ -0,0 +1,146 @@
+/* Copyright 2019 Axel Huebl, Benjamin Worpitz
+ *
+ * This file is part of alpaka.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#pragma once
+
+#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
+
+#    include <alpaka/core/BoostPredef.hpp>
+
+#    if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && !BOOST_LANG_CUDA
+#        error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
+#    endif
+
+#    if defined(ALPAKA_ACC_GPU_HIP_ENABLED) && !BOOST_LANG_HIP
+#        error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
+#    endif
+
+// Backend specific includes.
+#    if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
+#        include <alpaka/core/Cuda.hpp>
+#    else
+#        include <alpaka/core/Hip.hpp>
+#    endif
+
+#    include <alpaka/core/Unused.hpp>
+#    include <alpaka/idx/Traits.hpp>
+#    include <alpaka/vec/Vec.hpp>
+#    include <alpaka/workdiv/Traits.hpp>
+
+namespace alpaka
+{
+    //#############################################################################
+    //! The GPU CUDA/HIP accelerator work division.
+    template<typename TDim, typename TIdx>
+    class WorkDivUniformCudaHipBuiltIn
+        : public concepts::Implements<ConceptWorkDiv, WorkDivUniformCudaHipBuiltIn<TDim, TIdx>>
+    {
+    public:
+        //-----------------------------------------------------------------------------
+        __device__ WorkDivUniformCudaHipBuiltIn(Vec<TDim, TIdx> const& threadElemExtent)
+            : m_threadElemExtent(threadElemExtent)
+        {
+        }
+        //-----------------------------------------------------------------------------
+        __device__ WorkDivUniformCudaHipBuiltIn(WorkDivUniformCudaHipBuiltIn const&) = delete;
+        //-----------------------------------------------------------------------------
+        __device__ WorkDivUniformCudaHipBuiltIn(WorkDivUniformCudaHipBuiltIn&&) = delete;
+        //-----------------------------------------------------------------------------
+        __device__ auto operator=(WorkDivUniformCudaHipBuiltIn const&) -> WorkDivUniformCudaHipBuiltIn& = delete;
+        //-----------------------------------------------------------------------------
+        __device__ auto operator=(WorkDivUniformCudaHipBuiltIn&&) -> WorkDivUniformCudaHipBuiltIn& = delete;
+        //-----------------------------------------------------------------------------
+        /*virtual*/ ~WorkDivUniformCudaHipBuiltIn() = default;
+
+    public:
+        // \TODO: Optimize! Add WorkDivUniformCudaHipBuiltInNoElems that has no member m_threadElemExtent as well as
+        // AccGpuUniformCudaHipRtNoElems. Use it instead of AccGpuUniformCudaHipRt if the thread element extent is one
+        // to reduce the register usage.
+        Vec<TDim, TIdx> const& m_threadElemExtent;
+    };
+
+    namespace traits
+    {
+        //#############################################################################
+        //! The GPU CUDA/HIP accelerator work division dimension get trait specialization.
+        template<typename TDim, typename TIdx>
+        struct DimType<WorkDivUniformCudaHipBuiltIn<TDim, TIdx>>
+        {
+            using type = TDim;
+        };
+
+        //#############################################################################
+        //! The GPU CUDA/HIP accelerator work division idx type trait specialization.
+        template<typename TDim, typename TIdx>
+        struct IdxType<WorkDivUniformCudaHipBuiltIn<TDim, TIdx>>
+        {
+            using type = TIdx;
+        };
+
+        //#############################################################################
+        //! The GPU CUDA/HIP accelerator work division grid block extent trait specialization.
+        template<typename TDim, typename TIdx>
+        struct GetWorkDiv<WorkDivUniformCudaHipBuiltIn<TDim, TIdx>, origin::Grid, unit::Blocks>
+        {
+            //-----------------------------------------------------------------------------
+            //! \return The number of blocks in each dimension of the grid.
+            __device__ static auto getWorkDiv(WorkDivUniformCudaHipBuiltIn<TDim, TIdx> const& workDiv)
+                -> Vec<TDim, TIdx>
+            {
+                alpaka::ignore_unused(workDiv);
+#    ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
+                return castVec<TIdx>(extent::getExtentVecEnd<TDim>(gridDim));
+#    else
+                return extent::getExtentVecEnd<TDim>(Vec<std::integral_constant<typename TDim::value_type, 3>, TIdx>(
+                    static_cast<TIdx>(hipGridDim_z),
+                    static_cast<TIdx>(hipGridDim_y),
+                    static_cast<TIdx>(hipGridDim_x)));
+#    endif
+            }
+        };
+
+        //#############################################################################
+        //! The GPU CUDA/HIP accelerator work division block thread extent trait specialization.
+        template<typename TDim, typename TIdx>
+        struct GetWorkDiv<WorkDivUniformCudaHipBuiltIn<TDim, TIdx>, origin::Block, unit::Threads>
+        {
+            //-----------------------------------------------------------------------------
+            //! \return The number of threads in each dimension of a block.
+            __device__ static auto getWorkDiv(WorkDivUniformCudaHipBuiltIn<TDim, TIdx> const& workDiv)
+                -> Vec<TDim, TIdx>
+            {
+                alpaka::ignore_unused(workDiv);
+#    ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
+                return castVec<TIdx>(extent::getExtentVecEnd<TDim>(blockDim));
+#    else
+                return extent::getExtentVecEnd<TDim>(Vec<std::integral_constant<typename TDim::value_type, 3>, TIdx>(
+                    static_cast<TIdx>(hipBlockDim_z),
+                    static_cast<TIdx>(hipBlockDim_y),
+                    static_cast<TIdx>(hipBlockDim_x)));
+#    endif
+            }
+        };
+
+        //#############################################################################
+        //! The GPU CUDA/HIP accelerator work division thread element extent trait specialization.
+        template<typename TDim, typename TIdx>
+        struct GetWorkDiv<WorkDivUniformCudaHipBuiltIn<TDim, TIdx>, origin::Thread, unit::Elems>
+        {
+            //-----------------------------------------------------------------------------
+            //! \return The number of elements in each dimension of a thread.
+            __device__ static auto getWorkDiv(WorkDivUniformCudaHipBuiltIn<TDim, TIdx> const& workDiv)
+                -> Vec<TDim, TIdx>
+            {
+                return workDiv.m_threadElemExtent;
+            }
+        };
+    } // namespace traits
+} // namespace alpaka
+
+#endif
diff --git a/thirdParty/cupla/alpaka/script/after_failure.sh b/thirdParty/cupla/alpaka/script/after_failure.sh
new file mode 100755
index 0000000000..132612e5b7
--- /dev/null
+++ b/thirdParty/cupla/alpaka/script/after_failure.sh
@@ -0,0 +1,21 @@
+#!/bin/bash
+
+#
+# Copyright 2018-2019 Benjamin Worpitz
+#
+# This file is part of alpaka.
+#
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+#
+
+source ./script/set.sh
+
+if [ "$ALPAKA_CI_OS_NAME" = "Linux" ]
+then
+  sudo smem
+  sudo free -m -t
+  # show actions of the OOM killer
+  sudo dmesg
+fi
diff --git a/thirdParty/cupla/alpaka/script/before_install.sh b/thirdParty/cupla/alpaka/script/before_install.sh
new file mode 100755
index 0000000000..960d59dab8
--- /dev/null
+++ b/thirdParty/cupla/alpaka/script/before_install.sh
@@ -0,0 +1,149 @@
+#!/bin/bash
+
+#
+# Copyright 2017-2019 Benjamin Worpitz
+#
+# This file is part of alpaka.
+#
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+#
+
+source ./script/set.sh
+
+#-------------------------------------------------------------------------------
+# gcc
+if [ ! -z "${ALPAKA_CI_GCC_VER+x}" ]
+then
+    ALPAKA_CI_GCC_VER_SEMANTIC=( ${ALPAKA_CI_GCC_VER//./ } )
+    export ALPAKA_CI_GCC_VER_MAJOR="${ALPAKA_CI_GCC_VER_SEMANTIC[0]}"
+    echo ALPAKA_CI_GCC_VER_MAJOR: "${ALPAKA_CI_GCC_VER_MAJOR}"
+
+    if [[ "$(cat /etc/os-release)" == *"20.04"* ]]
+    then
+        if (( "${ALPAKA_CI_GCC_VER_MAJOR}" <= 6 ))
+        then
+            echo "Ubuntu 20.04 does not provide gcc-6 and older anymore."
+            exit 1
+        fi
+    fi
+fi
+
+#-------------------------------------------------------------------------------
+# Boost.
+ALPAKA_CI_BOOST_BRANCH_MAJOR=${ALPAKA_CI_BOOST_BRANCH:6:1}
+echo ALPAKA_CI_BOOST_BRANCH_MAJOR: "${ALPAKA_CI_BOOST_BRANCH_MAJOR}"
+ALPAKA_CI_BOOST_BRANCH_MINOR=${ALPAKA_CI_BOOST_BRANCH:8:2}
+echo ALPAKA_CI_BOOST_BRANCH_MINOR: "${ALPAKA_CI_BOOST_BRANCH_MINOR}"
+
+#-------------------------------------------------------------------------------
+# CUDA
+export ALPAKA_CI_INSTALL_CUDA="OFF"
+if [ "${ALPAKA_ACC_GPU_CUDA_ENABLE}" == "ON" ]
+then
+    export ALPAKA_CI_INSTALL_CUDA="ON"
+fi
+if [ "${ALPAKA_ACC_GPU_HIP_ENABLE}" == "ON" ]
+then
+    if [ "${ALPAKA_HIP_PLATFORM}" == "nvcc" ]
+    then
+        export ALPAKA_CI_INSTALL_CUDA="ON"
+    fi
+fi
+
+#-------------------------------------------------------------------------------
+# HIP
+export ALPAKA_CI_INSTALL_HIP="OFF"
+if [ "${ALPAKA_ACC_GPU_HIP_ENABLE}" == "ON" ]
+then
+    export ALPAKA_CI_INSTALL_HIP="ON"
+
+    # If the HIP platform is nvcc, the CUDA installation has already been enabled in the CUDA section above.
+    if [ "${ALPAKA_HIP_PLATFORM}" == "hcc" ]
+    then
+        echo "HIP(hcc) is not supported."
+        exit 1
+    fi
+fi
+
+#-------------------------------------------------------------------------------
+# TBB
+export ALPAKA_CI_INSTALL_TBB="OFF"
+if [ ! -z "${ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLE+x}" ]
+then
+    if [ "${ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLE}" = "ON" ]
+    then
+        export ALPAKA_CI_INSTALL_TBB="ON"
+    fi
+else
+    # If the variable is not set, the backend will most probably be used by default so we install it.
+    export ALPAKA_CI_INSTALL_TBB="ON"
+fi
+
+#-------------------------------------------------------------------------------
+# Fibers
+export ALPAKA_CI_INSTALL_FIBERS="OFF"
+if [ ! -z "${ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE+x}" ]
+then
+    if [ "${ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE}" = "ON" ]
+    then
+        export ALPAKA_CI_INSTALL_FIBERS="ON"
+    fi
+else
+    # If the variable is not set, the backend will most probably be used by default so we install it.
+    export ALPAKA_CI_INSTALL_FIBERS="ON"
+fi
+
+
+# GCC-5.5 has broken avx512vlintrin.h in Release mode with NVCC 9.X, so such Release builds are switched to Debug.
+#   https://gcc.gnu.org/bugzilla/show_bug.cgi?id=76731
+#   https://github.com/tensorflow/tensorflow/issues/10220
+if [ "${ALPAKA_CI_INSTALL_CUDA}" == "ON" ]
+then
+    if [ "${CXX}" == "g++" ]
+    then
+        if (( "${ALPAKA_CI_GCC_VER_MAJOR}" == 5 ))
+        then
+            if [ "${ALPAKA_CUDA_COMPILER}" == "nvcc" ]
+            then
+                if [ "${CMAKE_BUILD_TYPE}" == "Release" ]
+                then
+                    export CMAKE_BUILD_TYPE=Debug
+                fi
+            fi
+        fi
+    fi
+fi
+
+#-------------------------------------------------------------------------------
+if [ "$ALPAKA_CI_OS_NAME" = "Linux" ]
+then
+    if [ "${ALPAKA_CI_STDLIB}" == "libc++" ]
+    then
+        if [ "${CXX}" == "g++" ]
+        then
+            echo "using libc++ with g++ not yet supported."
+            exit 1
+        fi
+    fi
+
+    if [ "${ALPAKA_CI_STDLIB}" == "libstdc++" ]
+    then
+        if [ ! -z "${ALPAKA_CXX_STANDARD+x}" ]
+        then
+            if (( "${ALPAKA_CXX_STANDARD}" >= 17 ))
+            then
+                if [ "${ALPAKA_CI_INSTALL_FIBERS}" == "ON" ]
+                then
+                    if (( ( ( "${ALPAKA_CI_BOOST_BRANCH_MAJOR}" == 1 ) && ( "${ALPAKA_CI_BOOST_BRANCH_MINOR}" < 67 ) ) || ( "${ALPAKA_CI_BOOST_BRANCH_MAJOR}" < 1 ) ))
+                    then
+                        # https://github.com/boostorg/coroutine2/issues/26
+                        echo "libstdc++ in c++17 mode is not compatible with boost.fibers in boost-1.66 and below."
+                        exit 1
+                    fi
+                fi
+            fi
+        fi
+    fi
+fi
\ No newline at end of file
diff --git a/thirdParty/cupla/alpaka/script/ci.sh b/thirdParty/cupla/alpaka/script/ci.sh
new file mode 100755
index 0000000000..a7c3dfe04e
--- /dev/null
+++ b/thirdParty/cupla/alpaka/script/ci.sh
@@ -0,0 +1,25 @@
+#!/bin/bash
+
+#
+# Copyright 2018-2019 Benjamin Worpitz
+#
+# This file is part of alpaka.
+#
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+#
+
+source ./script/set.sh
+
+./script/print_env.sh
+source ./script/before_install.sh
+
+if [ "$ALPAKA_CI_OS_NAME" = "Linux" ]
+then
+  ./script/docker_ci.sh
+elif [ "$ALPAKA_CI_OS_NAME" = "Windows" ] || [ "$ALPAKA_CI_OS_NAME" = "macOS" ]
+then
+  ./script/install.sh
+  ./script/run.sh
+fi
diff --git a/thirdParty/cupla/alpaka/script/docker_ci.sh b/thirdParty/cupla/alpaka/script/docker_ci.sh
new file mode 100755
index 0000000000..342dd20d66
--- /dev/null
+++ b/thirdParty/cupla/alpaka/script/docker_ci.sh
@@ -0,0 +1,167 @@
+#!/bin/bash
+
+#
+# Copyright 2017-2019 Benjamin Worpitz
+#
+# This file is part of alpaka.
+#
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+#
+
+source ./script/set.sh
+source ./script/docker_retry.sh
+
+# runtime and compile time options
+ALPAKA_DOCKER_ENV_LIST=()
+ALPAKA_DOCKER_ENV_LIST+=("--env" "CC=${CC}")
+ALPAKA_DOCKER_ENV_LIST+=("--env" "CXX=${CXX}")
+ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CI_OS_NAME=${ALPAKA_CI_OS_NAME}")
+ALPAKA_DOCKER_ENV_LIST+=("--env" "CMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}")
+ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CI_ANALYSIS=${ALPAKA_CI_ANALYSIS}")
+ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CI_BOOST_BRANCH=${ALPAKA_CI_BOOST_BRANCH}")
+ALPAKA_DOCKER_ENV_LIST+=("--env" "BOOST_ROOT=${BOOST_ROOT}")
+ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CI_BOOST_LIB_DIR=${ALPAKA_CI_BOOST_LIB_DIR}")
+if [ ! -z "${ALPAKA_CI_CLANG_VER+x}" ]
+then
+    ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CI_CLANG_VER=${ALPAKA_CI_CLANG_VER}")
+fi
+ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CI_STDLIB=${ALPAKA_CI_STDLIB}")
+ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CI_CMAKE_VER=${ALPAKA_CI_CMAKE_VER}")
+ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CI_CMAKE_DIR=${ALPAKA_CI_CMAKE_DIR}")
+if [ ! -z "${CMAKE_CXX_FLAGS+x}" ]
+then
+    ALPAKA_DOCKER_ENV_LIST+=("--env" "CMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}")
+fi
+if [ ! -z "${CMAKE_EXE_LINKER_FLAGS+x}" ]
+then
+    ALPAKA_DOCKER_ENV_LIST+=("--env" "CMAKE_EXE_LINKER_FLAGS=${CMAKE_EXE_LINKER_FLAGS}")
+fi
+if [ ! -z "${CMAKE_CXX_EXTENSIONS+x}" ]
+then
+    ALPAKA_DOCKER_ENV_LIST+=("--env" "CMAKE_CXX_EXTENSIONS=${CMAKE_CXX_EXTENSIONS}")
+fi
+if [ ! -z "${ALPAKA_CI_GCC_VER+x}" ]
+then
+    ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CI_GCC_VER=${ALPAKA_CI_GCC_VER}")
+fi
+if [ ! -z "${ALPAKA_CI_SANITIZERS+x}" ]
+then
+    ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CI_SANITIZERS=${ALPAKA_CI_SANITIZERS}")
+fi
+if [ ! -z "${ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLE+x}" ]
+then
+    ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLE=${ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLE}")
+fi
+if [ ! -z "${ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLE+x}" ]
+then
+    ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLE=${ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLE}")
+fi
+if [ ! -z "${ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE+x}" ]
+then
+    ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE=${ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE}")
+fi
+if [ ! -z "${ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE+x}" ]
+then
+    ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE=${ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE}")
+fi
+if [ ! -z "${ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE+x}" ]
+then
+    ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE=${ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE}")
+fi
+if [ ! -z "${ALPAKA_ACC_ANY_BT_OMP5_ENABLE+x}" ]
+then
+    ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_ACC_ANY_BT_OMP5_ENABLE=${ALPAKA_ACC_ANY_BT_OMP5_ENABLE}")
+fi
+if [ ! -z "${ALPAKA_ACC_ANY_BT_OACC_ENABLE+x}" ]
+then
+    ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_ACC_ANY_BT_OACC_ENABLE=${ALPAKA_ACC_ANY_BT_OACC_ENABLE}")
+fi
+if [ ! -z "${ALPAKA_OFFLOAD_MAX_BLOCK_SIZE+x}" ]
+then
+    ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_OFFLOAD_MAX_BLOCK_SIZE=${ALPAKA_OFFLOAD_MAX_BLOCK_SIZE}")
+fi
+if [ ! -z "${ALPAKA_ACC_GPU_CUDA_ENABLE+x}" ]
+then
+    ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_ACC_GPU_CUDA_ENABLE=${ALPAKA_ACC_GPU_CUDA_ENABLE}")
+fi
+if [ ! -z "${ALPAKA_ACC_GPU_HIP_ENABLE+x}" ]
+then
+    ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_ACC_GPU_HIP_ENABLE=${ALPAKA_ACC_GPU_HIP_ENABLE}")
+fi
+if [ ! -z "${ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLE+x}" ]
+then
+    ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLE=${ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLE}")
+fi
+ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CI_INSTALL_CUDA=${ALPAKA_CI_INSTALL_CUDA}")
+if [ "${ALPAKA_CI_INSTALL_CUDA}" == "ON" ]
+then
+    ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CI_CUDA_DIR=${ALPAKA_CI_CUDA_DIR}")
+    ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CUDA_VERSION=${ALPAKA_CUDA_VERSION}")
+    if [ ! -z "${ALPAKA_CUDA_COMPILER+x}" ]
+    then
+        ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CUDA_COMPILER=${ALPAKA_CUDA_COMPILER}")
+    fi
+    if [ ! -z "${ALPAKA_CUDA_ARCH+x}" ]
+    then
+        ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CUDA_ARCH=${ALPAKA_CUDA_ARCH}")
+    fi
+fi
+ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CI_INSTALL_HIP=${ALPAKA_CI_INSTALL_HIP}")
+if [ "${ALPAKA_CI_INSTALL_HIP}" == "ON" ]
+then
+    ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CI_HIP_ROOT_DIR=${ALPAKA_CI_HIP_ROOT_DIR}")
+    ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_HIP_PLATFORM=${ALPAKA_HIP_PLATFORM}")
+fi
+ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CI_INSTALL_TBB=${ALPAKA_CI_INSTALL_TBB}")
+ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CI_INSTALL_FIBERS=${ALPAKA_CI_INSTALL_FIBERS}")
+
+# runtime only options
+ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CI=${ALPAKA_CI}")
+if [ ! -z "${ALPAKA_DEBUG+x}" ]
+then
+    ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_DEBUG=${ALPAKA_DEBUG}")
+fi
+if [ ! -z "${ALPAKA_CXX_STANDARD+x}" ]
+then
+    ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CXX_STANDARD=${ALPAKA_CXX_STANDARD}")
+fi
+if [ ! -z "${OMP_NUM_THREADS+x}" ]
+then
+    ALPAKA_DOCKER_ENV_LIST+=("--env" "OMP_NUM_THREADS=${OMP_NUM_THREADS}")
+fi
+if [ ! -z "${ALPAKA_ACC_GPU_CUDA_ONLY_MODE+x}" ]
+then
+    ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_ACC_GPU_CUDA_ONLY_MODE=${ALPAKA_ACC_GPU_CUDA_ONLY_MODE}")
+fi
+if [ ! -z "${ALPAKA_ACC_GPU_HIP_ONLY_MODE+x}" ]
+then
+    ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_ACC_GPU_HIP_ONLY_MODE=${ALPAKA_ACC_GPU_HIP_ONLY_MODE}")
+fi
+if [ ! -z "${ALPAKA_CUDA_FAST_MATH+x}" ]
+then
+    ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CUDA_FAST_MATH=${ALPAKA_CUDA_FAST_MATH}")
+fi
+if [ ! -z "${ALPAKA_CUDA_FTZ+x}" ]
+then
+    ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CUDA_FTZ=${ALPAKA_CUDA_FTZ}")
+fi
+if [ ! -z "${ALPAKA_CUDA_SHOW_REGISTER+x}" ]
+then
+    ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CUDA_SHOW_REGISTER=${ALPAKA_CUDA_SHOW_REGISTER}")
+fi
+if [ ! -z "${ALPAKA_CUDA_KEEP_FILES+x}" ]
+then
+    ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CUDA_KEEP_FILES=${ALPAKA_CUDA_KEEP_FILES}")
+fi
+if [ ! -z "${ALPAKA_CUDA_NVCC_EXPT_EXTENDED_LAMBDA+x}" ]
+then
+    ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CUDA_NVCC_EXPT_EXTENDED_LAMBDA=${ALPAKA_CUDA_NVCC_EXPT_EXTENDED_LAMBDA}")
+fi
+if [ ! -z "${ALPAKA_CUDA_NVCC_SEPARABLE_COMPILATION+x}" ]
+then
+    ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CUDA_NVCC_SEPARABLE_COMPILATION=${ALPAKA_CUDA_NVCC_SEPARABLE_COMPILATION}")
+fi
+
+docker_retry docker run -v "$(pwd)":"$(pwd)" -w "$(pwd)" "${ALPAKA_DOCKER_ENV_LIST[@]}" "${ALPAKA_CI_DOCKER_BASE_IMAGE_NAME}" /bin/bash -c "./script/install.sh && ./script/run.sh"
diff --git a/thirdParty/cupla/alpaka/script/docker_retry.sh b/thirdParty/cupla/alpaka/script/docker_retry.sh
new file mode 100755
index 0000000000..d4bc0c0ea3
--- /dev/null
+++ b/thirdParty/cupla/alpaka/script/docker_retry.sh
@@ -0,0 +1,36 @@
+#!/bin/bash
+#
+# Copyright 2019-2020 Benjamin Worpitz, Rene Widera
+#
+# This file is part of alpaka.
+#
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+#
+
+ANSI_RED="\033[31m"
+ANSI_RESET="\033[0m"
+
+# Run the given command and rerun it while it exits with status 125:
+#   - status 125 is triggered by image download problems
+#   - waits 30 seconds after each such failure and gives up after 3 attempts
+docker_retry() {
+  set +euo pipefail  # a failing attempt must not abort the calling shell
+  local result=0
+  local count=1
+  while (( count <= 3 )); do
+    if (( result == 125 )); then
+      echo -e "\n${ANSI_RED}The command \"$*\" failed. Retrying, $count of 3.${ANSI_RESET}\n" >&2
+    fi
+    "$@"
+    result=$?
+    if (( result != 125 )); then break; fi
+    count=$((count + 1))
+    sleep 30
+  done
+  if (( count > 3 )); then
+    echo -e "\n${ANSI_RED}The command \"$*\" failed 3 times.${ANSI_RESET}\n" >&2
+  fi
+  return "$result"
+}
diff --git a/thirdParty/cupla/alpaka/script/install.sh b/thirdParty/cupla/alpaka/script/install.sh
new file mode 100755
index 0000000000..020931fa47
--- /dev/null
+++ b/thirdParty/cupla/alpaka/script/install.sh
@@ -0,0 +1,68 @@
+#!/bin/bash
+
+#
+# Copyright 2017-2019 Benjamin Worpitz
+#
+# This file is part of alpaka.
+#
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+#
+
+source ./script/travis_retry.sh
+
+source ./script/set.sh
+
+: "${ALPAKA_CI_ANALYSIS?ALPAKA_CI_ANALYSIS must be specified}"  # quoted so the value is neither word-split nor globbed
+: "${ALPAKA_CI_INSTALL_CUDA?ALPAKA_CI_INSTALL_CUDA must be specified}"
+: "${ALPAKA_CI_INSTALL_HIP?ALPAKA_CI_INSTALL_HIP must be specified}"
+: "${ALPAKA_CI_INSTALL_TBB?ALPAKA_CI_INSTALL_TBB must be specified}"
+
+if [ "$ALPAKA_CI_OS_NAME" = "Linux" ]
+then
+    travis_retry apt-get -y --quiet update
+    travis_retry apt-get -y install sudo
+
+    # tzdata is installed by software-properties-common but it requires some special handling: install it up front, non-interactively and with -y like every other install call
+    if [[ "$(cat /etc/os-release)" == *"20.04"* ]]
+    then
+        export DEBIAN_FRONTEND=noninteractive
+        travis_retry sudo apt-get -y --quiet --allow-unauthenticated --no-install-recommends install tzdata
+    fi
+
+    # software-properties-common: 'add-apt-repository' and certificates for wget https download
+    # binutils: ld
+    # xz-utils: xzcat
+    travis_retry sudo apt-get -y --quiet --allow-unauthenticated --no-install-recommends install software-properties-common wget git make binutils xz-utils
+fi
+
+if [ "$ALPAKA_CI_OS_NAME" = "Linux" ] || [ "$ALPAKA_CI_OS_NAME" = "Windows" ]
+then
+    ./script/install_cmake.sh
+fi
+
+if [ "${ALPAKA_CI_ANALYSIS}" == "ON" ] ;then ./script/install_analysis.sh ;fi
+
+# Install CUDA before installing gcc as it installs gcc-4.8 and overwrites our selected compiler
+if [ "${ALPAKA_CI_INSTALL_CUDA}" == "ON" ] ;then ./script/install_cuda.sh ;fi
+
+if [ "$ALPAKA_CI_OS_NAME" = "Linux" ]
+then
+    if [ "${CXX}" == "g++" ] ;then ./script/install_gcc.sh ;fi
+    if [ "${CXX}" == "clang++" ] ;then source ./script/install_clang.sh ;fi
+elif [ "$ALPAKA_CI_OS_NAME" = "macOS" ]
+then
+    echo "### list all applications ###"
+    ls "/Applications/"
+    echo "### end list all applications ###"
+    sudo xcode-select -s "/Applications/Xcode_${ALPAKA_CI_XCODE_VER}.app/Contents/Developer"
+fi
+
+if [ "${ALPAKA_CI_INSTALL_TBB}" = "ON" ]
+then
+    ./script/install_tbb.sh
+fi
+
+./script/install_boost.sh
+
diff --git a/thirdParty/cupla/alpaka/script/install_analysis.sh b/thirdParty/cupla/alpaka/script/install_analysis.sh
new file mode 100755
index 0000000000..2ade9f2288
--- /dev/null
+++ b/thirdParty/cupla/alpaka/script/install_analysis.sh
@@ -0,0 +1,41 @@
+#!/bin/bash
+
+#
+# Copyright 2017-2019 Benjamin Worpitz
+#
+# This file is part of alpaka.
+#
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+#
+
+source ./script/travis_retry.sh
+
+source ./script/set.sh
+
+if [ "$ALPAKA_CI_OS_NAME" = "Linux" ]
+then
+    #-------------------------------------------------------------------------------
+    # Install sloc
+    travis_retry sudo apt-get -y --quiet --allow-unauthenticated --no-install-recommends install sloccount
+    sloccount --version
+
+    #-------------------------------------------------------------------------------
+    # Install shellcheck
+    travis_retry sudo apt-get -y --quiet --allow-unauthenticated --no-install-recommends install shellcheck
+    shellcheck --version
+
+elif [ "$ALPAKA_CI_OS_NAME" = "macOS" ]
+then
+    #-------------------------------------------------------------------------------
+    # Install sloc
+    brew install sloccount
+    sloccount --version
+
+    #-------------------------------------------------------------------------------
+    # Install shellcheck
+    brew install shellcheck
+    shellcheck --version
+
+fi
diff --git a/thirdParty/cupla/alpaka/script/install_boost.sh b/thirdParty/cupla/alpaka/script/install_boost.sh
new file mode 100755
index 0000000000..589cfad474
--- /dev/null
+++ b/thirdParty/cupla/alpaka/script/install_boost.sh
@@ -0,0 +1,160 @@
+#!/bin/bash
+
+#
+# Copyright 2017-2019 Benjamin Worpitz
+#
+# This file is part of alpaka.
+#
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+#
+
+source ./script/travis_retry.sh
+source ./script/set.sh
+
+: "${BOOST_ROOT?'BOOST_ROOT must be specified'}"
+: "${ALPAKA_CI_BOOST_LIB_DIR?'ALPAKA_CI_BOOST_LIB_DIR must be specified'}"
+if [ "$ALPAKA_CI_OS_NAME" = "Linux" ]
+then
+    : "${ALPAKA_CI_STDLIB?'ALPAKA_CI_STDLIB must be specified'}"
+fi
+: "${CMAKE_BUILD_TYPE?'CMAKE_BUILD_TYPE must be specified'}"
+: "${CXX?'CXX must be specified'}"
+: "${CC?'CC must be specified'}"
+: "${ALPAKA_CI_INSTALL_FIBERS?'ALPAKA_CI_INSTALL_FIBERS must be specified'}"
+: "${ALPAKA_CI_BOOST_BRANCH?'ALPAKA_CI_BOOST_BRANCH must be specified'}"
+if [ "$ALPAKA_CI_OS_NAME" = "Windows" ]
+then
+    : "${ALPAKA_CI_CL_VER?'ALPAKA_CI_CL_VER must be specified'}"
+fi
+# Remove any previous checkout, then clone; the retry wrapper is applied to the network-bound clone, not to rm.
+rm -rf "${BOOST_ROOT}" && travis_retry git clone -b "${ALPAKA_CI_BOOST_BRANCH}" --quiet --recursive --single-branch --depth 1 https://github.com/boostorg/boost.git "${BOOST_ROOT}"
+
+# Bootstrap boost: batch file on Windows, shell script run through sudo everywhere else.
+if [ "$ALPAKA_CI_OS_NAME" != "Windows" ]
+then
+    (cd "${BOOST_ROOT}"; sudo ./bootstrap.sh --with-toolset="${CC}")
+else
+    (cd "${BOOST_ROOT}"; ./bootstrap.bat)
+fi
+(cd "${BOOST_ROOT}"; cat ./bootstrap.log)
+
+# Create file links (the "headers" target of b2); sudo is used on every OS except Windows.
+if [ "$ALPAKA_CI_OS_NAME" != "Windows" ]
+then
+    (cd "${BOOST_ROOT}"; sudo ./b2 headers)
+else
+    (cd "${BOOST_ROOT}"; ./b2 headers)
+fi
+
+# Only build boost if we need some of the non-header-only libraries
+if [ "${ALPAKA_CI_INSTALL_FIBERS}" == "ON" ]
+then
+    # Prepare the library destination directory.
+    mkdir -p "${ALPAKA_CI_BOOST_LIB_DIR}"
+
+    # Create the boost build command as a string; it is run through eval below because it embeds quoted flag lists.
+    ALPAKA_BOOST_B2=""
+    ALPAKA_BOOST_B2_CFLAGS=""
+    ALPAKA_BOOST_B2_CXXFLAGS=""
+
+    if [ "$ALPAKA_CI_OS_NAME" = "Linux" ] || [ "$ALPAKA_CI_OS_NAME" = "macOS" ]
+    then
+        ALPAKA_BOOST_B2+="sudo "
+    fi
+    ALPAKA_BOOST_B2+="./b2 -j1"
+
+    if [ "$ALPAKA_CI_OS_NAME" = "Linux" ] || [ "$ALPAKA_CI_OS_NAME" = "macOS" ]
+    then
+        ALPAKA_BOOST_B2_CFLAGS+="-fPIC"
+        ALPAKA_BOOST_B2_CXXFLAGS+="-fPIC"
+    fi
+
+    if [ "$ALPAKA_CI_OS_NAME" = "Windows" ]
+    then
+        ALPAKA_BOOST_B2+=" --layout=versioned"
+        if [ "$ALPAKA_CI_CL_VER" = "2017" ]
+        then
+            ALPAKA_BOOST_B2+=" --toolset=msvc-14.1"
+        elif [ "$ALPAKA_CI_CL_VER" = "2019" ]
+        then
+            ALPAKA_BOOST_B2+=" --toolset=msvc-14.2"
+        fi
+    else
+        ALPAKA_BOOST_B2+=" --layout=tagged --toolset=${CC}"
+    fi
+
+    # TODO: Win32: address-model=32
+    ALPAKA_BOOST_B2+=" architecture=x86 address-model=64 link=static threading=multi runtime-link=shared"
+
+    if [ "$ALPAKA_CI_OS_NAME" = "Windows" ]
+    then
+        ALPAKA_BOOST_B2+=" define=_CRT_NONSTDC_NO_DEPRECATE define=_CRT_SECURE_NO_DEPRECATE define=_SCL_SECURE_NO_DEPRECATE define=BOOST_USE_WINFIBERS define=_ENABLE_EXTENDED_ALIGNED_STORAGE"
+    fi
+
+    if [ "${CMAKE_BUILD_TYPE}" == "Debug" ]
+    then
+      ALPAKA_BOOST_B2+=" variant=debug"
+    else
+      ALPAKA_BOOST_B2+=" variant=release"
+    fi
+
+    # Clang is not supported by the FindBoost script.
+    # boost (especially old versions) produces too many warnings when using clang (newer versions) so that the 4 MiB log is too short; all flags below therefore disable warnings (-Wno-...).
+    if [ "${CXX}" == "clang++" ]
+    then
+        ALPAKA_BOOST_B2_CXXFLAGS+=" -Wno-unused-private-field -Wno-unused-local-typedef -Wno-c99-extensions -Wno-variadic-macros"
+    fi
+    # Select the libraries required.
+    # This inner test repeats the enclosing ALPAKA_CI_INSTALL_FIBERS check and is therefore always true here.
+    if [ "${ALPAKA_CI_INSTALL_FIBERS}" == "ON" ]
+    then
+        if [ "$ALPAKA_CI_OS_NAME" = "Linux" ]
+        then
+            ALPAKA_BOOST_B2_CXXFLAGS+=" -std=c++14"
+        fi
+        ALPAKA_BOOST_B2+=" --with-fiber --with-context --with-thread --with-atomic --with-system --with-chrono --with-date_time"
+    fi
+    if [ "$ALPAKA_CI_OS_NAME" = "Linux" ]
+    then
+        if [ "${ALPAKA_CI_STDLIB}" == "libc++" ]
+        then
+            ALPAKA_BOOST_B2_CXXFLAGS+=" -stdlib=libc++"
+        fi
+    fi
+    if [ "${ALPAKA_BOOST_B2_CFLAGS}" != "" ]
+    then
+        ALPAKA_BOOST_B2+=' cflags="'
+        ALPAKA_BOOST_B2+="${ALPAKA_BOOST_B2_CFLAGS}"
+        ALPAKA_BOOST_B2+='"'
+    fi
+    if [ "${ALPAKA_BOOST_B2_CXXFLAGS}" != "" ]
+    then
+        ALPAKA_BOOST_B2+=' cxxflags="'
+        ALPAKA_BOOST_B2+="${ALPAKA_BOOST_B2_CXXFLAGS}"
+        ALPAKA_BOOST_B2+='"'
+    fi
+
+    if [ "$ALPAKA_CI_OS_NAME" = "Linux" ]
+    then
+        if [ "${ALPAKA_CI_STDLIB}" == "libc++" ]
+        then
+            ALPAKA_BOOST_B2+=' linkflags="-stdlib=libc++"'
+        fi
+    fi
+
+    ALPAKA_BOOST_B2+=" --stagedir=${ALPAKA_CI_BOOST_LIB_DIR} stage"
+
+    # Build boost.
+    #echo "ALPAKA_BOOST_B2=${ALPAKA_BOOST_B2}"
+    (cd "${BOOST_ROOT}"; eval "${ALPAKA_BOOST_B2}")
+
+    # Clean the intermediate build files. b2 was run inside BOOST_ROOT, so bin.v2 is removed from there.
+    if [ "$ALPAKA_CI_OS_NAME" = "Windows" ]
+    then
+        rm -rf "${BOOST_ROOT}/bin.v2"
+    else
+        sudo rm -rf "${BOOST_ROOT}/bin.v2"
+    fi
+fi
diff --git a/thirdParty/cupla/alpaka/script/install_clang.sh b/thirdParty/cupla/alpaka/script/install_clang.sh
new file mode 100755
index 0000000000..4523f3619e
--- /dev/null
+++ b/thirdParty/cupla/alpaka/script/install_clang.sh
@@ -0,0 +1,52 @@
+#!/bin/bash
+
+#
+# Copyright 2017-2019 Benjamin Worpitz
+#
+# This file is part of alpaka.
+#
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+#
+
+source ./script/travis_retry.sh
+
+source ./script/set.sh
+
+: "${ALPAKA_CI_CLANG_VER?'ALPAKA_CI_CLANG_VER must be specified'}"
+: "${ALPAKA_CI_STDLIB?'ALPAKA_CI_STDLIB must be specified'}"
+: "${CXX?'CXX must be specified'}"
+
+travis_retry sudo apt-get -y --quiet --allow-unauthenticated --no-install-recommends install "clang-${ALPAKA_CI_CLANG_VER}"
+
+if [ "${ALPAKA_CI_STDLIB}" == "libc++" ]
+then
+    travis_retry sudo apt-get -y --quiet update
+    travis_retry sudo apt-get -y --quiet --allow-unauthenticated --no-install-recommends install libc++-dev
+    travis_retry sudo apt-get -y --quiet --allow-unauthenticated --no-install-recommends install libc++abi-dev
+fi
+
+if [ "${ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE}" = "ON" ] || [ "${ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE}" = "ON" ] || [ "${ALPAKA_ACC_ANY_BT_OMP5_ENABLE}" = "ON" ]
+then
+    # Use the versioned libomp package for purely numeric clang versions >= 8, the generic package otherwise.
+    LIBOMP_PACKAGE=libomp-dev
+    if [[ "${ALPAKA_CI_CLANG_VER}" =~ ^[0-9]+$ ]] && [ "${ALPAKA_CI_CLANG_VER}" -ge 8 ]
+    then
+        LIBOMP_PACKAGE="libomp-${ALPAKA_CI_CLANG_VER}-dev"
+    fi
+    travis_retry sudo apt-get -y --quiet --allow-unauthenticated --no-install-recommends install "${LIBOMP_PACKAGE}"
+    if [ "${ALPAKA_ACC_ANY_BT_OMP5_ENABLE}" = "ON" ]
+    then
+        travis_retry sudo apt-get -y --quiet --allow-unauthenticated --no-install-recommends install \
+            "clang-tools-${ALPAKA_CI_CLANG_VER}" "llvm-${ALPAKA_CI_CLANG_VER}"
+    fi
+fi
+
+sudo update-alternatives --install /usr/bin/clang clang "/usr/bin/clang-${ALPAKA_CI_CLANG_VER}" 50
+sudo update-alternatives --install /usr/bin/clang++ clang++ "/usr/bin/clang++-${ALPAKA_CI_CLANG_VER}" 50
+sudo update-alternatives --install /usr/bin/cc cc "/usr/bin/clang-${ALPAKA_CI_CLANG_VER}" 50
+sudo update-alternatives --install /usr/bin/c++ c++ "/usr/bin/clang++-${ALPAKA_CI_CLANG_VER}" 50
+
+which "${CXX}"
+${CXX} -v
diff --git a/thirdParty/cupla/alpaka/script/install_cmake.sh b/thirdParty/cupla/alpaka/script/install_cmake.sh
new file mode 100755
index 0000000000..57880a1dcf
--- /dev/null
+++ b/thirdParty/cupla/alpaka/script/install_cmake.sh
@@ -0,0 +1,41 @@
+#!/bin/bash
+
+#
+# Copyright 2017-2019 Benjamin Worpitz
+#
+# This file is part of alpaka.
+#
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+#
+
+source ./script/travis_retry.sh
+
+source ./script/set.sh
+
+: "${ALPAKA_CI_CMAKE_DIR?'ALPAKA_CI_CMAKE_DIR must be specified'}"
+: "${ALPAKA_CI_CMAKE_VER?'ALPAKA_CI_CMAKE_VER must be specified'}"
+
+if [ "$ALPAKA_CI_OS_NAME" = "Linux" ]
+then
+    # Download the selected version, but only if ALPAKA_CI_CMAKE_DIR has no content yet (ls prints nothing).
+    if [ -z "$(ls -A "${ALPAKA_CI_CMAKE_DIR}")" ]
+    then
+        IFS=. read -r -a ALPAKA_CI_CMAKE_VER_SEMANTIC <<< "${ALPAKA_CI_CMAKE_VER}"  # split at dots without globbing
+        ALPAKA_CI_CMAKE_VER_MAJOR="${ALPAKA_CI_CMAKE_VER_SEMANTIC[0]}"
+        ALPAKA_CI_CMAKE_VER_MINOR="${ALPAKA_CI_CMAKE_VER_SEMANTIC[1]}"
+
+        ALPAKA_CMAKE_PKG_FILE_NAME_BASE=cmake-${ALPAKA_CI_CMAKE_VER}-Linux-x86_64
+        ALPAKA_CMAKE_PKG_FILE_NAME=${ALPAKA_CMAKE_PKG_FILE_NAME_BASE}.tar.gz
+        travis_retry wget --no-verbose https://cmake.org/files/v"${ALPAKA_CI_CMAKE_VER_MAJOR}"."${ALPAKA_CI_CMAKE_VER_MINOR}"/"${ALPAKA_CMAKE_PKG_FILE_NAME}"
+        mkdir -p "${ALPAKA_CI_CMAKE_DIR}"
+        tar -xzf "${ALPAKA_CMAKE_PKG_FILE_NAME}" -C "${ALPAKA_CI_CMAKE_DIR}"
+        sudo cp -fR "${ALPAKA_CI_CMAKE_DIR}"/"${ALPAKA_CMAKE_PKG_FILE_NAME_BASE}"/* "${ALPAKA_CI_CMAKE_DIR}"
+        sudo rm -rf "${ALPAKA_CMAKE_PKG_FILE_NAME}" "${ALPAKA_CI_CMAKE_DIR}"/"${ALPAKA_CMAKE_PKG_FILE_NAME_BASE}"
+    fi
+elif [ "$ALPAKA_CI_OS_NAME" = "Windows" ]
+then
+    choco uninstall cmake.install
+    choco install cmake.install --no-progress --version "${ALPAKA_CI_CMAKE_VER}"
+fi
diff --git a/thirdParty/cupla/alpaka/script/install_cuda.sh b/thirdParty/cupla/alpaka/script/install_cuda.sh
new file mode 100755
index 0000000000..c34656cdd1
--- /dev/null
+++ b/thirdParty/cupla/alpaka/script/install_cuda.sh
@@ -0,0 +1,139 @@
+#!/bin/bash
+
+#
+# Copyright 2017-2019 Benjamin Worpitz
+#
+# This file is part of alpaka.
+#
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+#
+
+source ./script/travis_retry.sh
+
+source ./script/set.sh
+
+: "${ALPAKA_CUDA_VERSION?'ALPAKA_CUDA_VERSION must be specified'}"
+
+ALPAKA_CUDA_VER_SEMANTIC=( ${ALPAKA_CUDA_VERSION//./ } )
+ALPAKA_CUDA_VER_MAJOR="${ALPAKA_CUDA_VER_SEMANTIC[0]}"
+echo ALPAKA_CUDA_VER_MAJOR: "${ALPAKA_CUDA_VER_MAJOR}"
+
+if [ "$ALPAKA_CI_OS_NAME" = "Linux" ]
+then
+    : "${ALPAKA_CI_CUDA_DIR?'ALPAKA_CI_CUDA_DIR must be specified'}"
+    : "${ALPAKA_CUDA_COMPILER?'ALPAKA_CUDA_COMPILER must be specified'}"
+
+    # Ubuntu 18.04 requires some extra keys for verification
+    if [[ "$(cat /etc/os-release)" == *"18.04"* ]]
+    then
+        travis_retry sudo apt-get -y --quiet --allow-unauthenticated --no-install-recommends install dirmngr gpg-agent
+        travis_retry sudo apt-key adv --keyserver keyserver.ubuntu.com --recv-keys F60F4B3D7FA2AF80
+    elif [[ "$(cat /etc/os-release)" == *"20.04"* ]]
+    then
+        travis_retry sudo apt-get -y --quiet --allow-unauthenticated --no-install-recommends install dirmngr gpg-agent
+        travis_retry sudo apt-key adv --keyserver keyserver.ubuntu.com --recv-keys F60F4B3D7FA2AF80
+    fi
+
+    # Set the correct CUDA downloads
+    if [ "${ALPAKA_CUDA_VERSION}" == "9.0" ]
+    then
+        ALPAKA_CUDA_PKG_DEB_NAME=cuda-repo-ubuntu1604-9-0-local
+        ALPAKA_CUDA_PKG_FILE_NAME="${ALPAKA_CUDA_PKG_DEB_NAME}"_9.0.176-1_amd64-deb
+        ALPAKA_CUDA_PKG_FILE_PATH=https://developer.nvidia.com/compute/cuda/9.0/Prod/local_installers/${ALPAKA_CUDA_PKG_FILE_NAME}
+    elif [ "${ALPAKA_CUDA_VERSION}" == "9.1" ]
+    then
+        ALPAKA_CUDA_PKG_DEB_NAME=cuda-repo-ubuntu1604-9-1-local
+        ALPAKA_CUDA_PKG_FILE_NAME="${ALPAKA_CUDA_PKG_DEB_NAME}"_9.1.85-1_amd64
+        ALPAKA_CUDA_PKG_FILE_PATH=https://developer.nvidia.com/compute/cuda/9.1/Prod/local_installers/${ALPAKA_CUDA_PKG_FILE_NAME}
+    elif [ "${ALPAKA_CUDA_VERSION}" == "9.2" ]
+    then
+        ALPAKA_CUDA_PKG_DEB_NAME=cuda-repo-ubuntu1604-9-2-local
+        ALPAKA_CUDA_PKG_FILE_NAME="${ALPAKA_CUDA_PKG_DEB_NAME}"_9.2.88-1_amd64
+        ALPAKA_CUDA_PKG_FILE_PATH=https://developer.nvidia.com/compute/cuda/9.2/Prod/local_installers/${ALPAKA_CUDA_PKG_FILE_NAME}
+    elif [ "${ALPAKA_CUDA_VERSION}" == "10.0" ]
+    then
+        ALPAKA_CUDA_PKG_DEB_NAME=cuda-repo-ubuntu1804-10-0-local
+        ALPAKA_CUDA_PKG_FILE_NAME="${ALPAKA_CUDA_PKG_DEB_NAME}"-10.0.130-410.48_1.0-1_amd64
+        ALPAKA_CUDA_PKG_FILE_PATH=https://developer.nvidia.com/compute/cuda/10.0/Prod/local_installers/${ALPAKA_CUDA_PKG_FILE_NAME}
+    elif [ "${ALPAKA_CUDA_VERSION}" == "10.1" ]
+    then
+        ALPAKA_CUDA_PKG_DEB_NAME=cuda-repo-ubuntu1804-10-1-local
+        ALPAKA_CUDA_PKG_FILE_NAME="${ALPAKA_CUDA_PKG_DEB_NAME}"-10.1.168-418.67_1.0-1_amd64.deb
+        ALPAKA_CUDA_PKG_FILE_PATH=https://developer.nvidia.com/compute/cuda/10.1/Prod/local_installers/${ALPAKA_CUDA_PKG_FILE_NAME}
+    elif [ "${ALPAKA_CUDA_VERSION}" == "10.2" ]
+    then
+        ALPAKA_CUDA_PKG_DEB_NAME=cuda-repo-ubuntu1804-10-2-local
+        ALPAKA_CUDA_PKG_FILE_NAME="${ALPAKA_CUDA_PKG_DEB_NAME}"-10.2.89-440.33.01_1.0-1_amd64.deb
+        ALPAKA_CUDA_PKG_FILE_PATH=http://developer.download.nvidia.com/compute/cuda/10.2/Prod/local_installers/${ALPAKA_CUDA_PKG_FILE_NAME}
+    elif [ "${ALPAKA_CUDA_VERSION}" == "11.0" ]
+    then
+        ALPAKA_CUDA_PKG_DEB_NAME=cuda-repo-ubuntu1804-11-0-local
+        ALPAKA_CUDA_PKG_FILE_NAME="${ALPAKA_CUDA_PKG_DEB_NAME}"_11.0.3-450.51.06-1_amd64.deb
+        ALPAKA_CUDA_PKG_FILE_PATH=http://developer.download.nvidia.com/compute/cuda/11.0.3/local_installers/${ALPAKA_CUDA_PKG_FILE_NAME}
+    elif [ "${ALPAKA_CUDA_VERSION}" == "11.1" ]
+    then
+        ALPAKA_CUDA_PKG_DEB_NAME=cuda-repo-ubuntu1804-11-1-local
+        ALPAKA_CUDA_PKG_FILE_NAME="${ALPAKA_CUDA_PKG_DEB_NAME}"_11.1.0-455.23.05-1_amd64.deb
+        ALPAKA_CUDA_PKG_FILE_PATH=http://developer.download.nvidia.com/compute/cuda/11.1.0/local_installers/${ALPAKA_CUDA_PKG_FILE_NAME}
+    else
+        echo CUDA versions other than 9.0, 9.1, 9.2, 10.0, 10.1, 10.2, 11.0 and 11.1 are not currently supported on linux!
+    fi
+    if [ -z "$(ls -A ${ALPAKA_CI_CUDA_DIR})" ]
+    then
+        mkdir -p "${ALPAKA_CI_CUDA_DIR}"
+        travis_retry wget --no-verbose -O "${ALPAKA_CI_CUDA_DIR}"/"${ALPAKA_CUDA_PKG_FILE_NAME}" "${ALPAKA_CUDA_PKG_FILE_PATH}"
+    fi
+    sudo dpkg --install "${ALPAKA_CI_CUDA_DIR}"/"${ALPAKA_CUDA_PKG_FILE_NAME}"
+
+    travis_retry sudo apt-get -y --quiet update
+
+    # Install CUDA
+    # Currently we do not install CUDA fully: sudo apt-get --quiet -y install cuda
+    # We only install the minimal packages. Because of our manual partial installation we have to create a symlink at /usr/local/cuda
+    if (( "${ALPAKA_CUDA_VER_MAJOR}" >= 11 ))
+    then
+      sudo apt-get -y --quiet --allow-unauthenticated --no-install-recommends install cuda-compiler-"${ALPAKA_CUDA_VERSION}" cuda-cudart-"${ALPAKA_CUDA_VERSION}" cuda-cudart-dev-"${ALPAKA_CUDA_VERSION}" libcurand-"${ALPAKA_CUDA_VERSION}" libcurand-dev-"${ALPAKA_CUDA_VERSION}"
+    else
+      sudo apt-get -y --quiet --allow-unauthenticated --no-install-recommends install cuda-core-"${ALPAKA_CUDA_VERSION}" cuda-cudart-"${ALPAKA_CUDA_VERSION}" cuda-cudart-dev-"${ALPAKA_CUDA_VERSION}" cuda-curand-"${ALPAKA_CUDA_VERSION}" cuda-curand-dev-"${ALPAKA_CUDA_VERSION}"
+    fi
+    sudo ln -s /usr/local/cuda-"${ALPAKA_CUDA_VERSION}" /usr/local/cuda
+
+    if [ "${ALPAKA_CUDA_COMPILER}" == "clang" ]
+    then
+        travis_retry sudo apt-get -y --quiet --allow-unauthenticated --no-install-recommends install g++-multilib
+    fi
+
+    # clean up
+    sudo rm -rf "${ALPAKA_CI_CUDA_DIR}"/"${ALPAKA_CUDA_PKG_FILE_NAME}"
+    sudo dpkg --purge "${ALPAKA_CUDA_PKG_DEB_NAME}"
+elif [ "$ALPAKA_CI_OS_NAME" = "Windows" ]
+then
+    if [ "${ALPAKA_CUDA_VERSION}" == "10.0" ]
+    then
+        ALPAKA_CUDA_PKG_FILE_NAME=cuda_10.0.130_411.31_win10
+        ALPAKA_CUDA_PKG_FILE_PATH=https://developer.nvidia.com/compute/cuda/10.0/Prod/local_installers/${ALPAKA_CUDA_PKG_FILE_NAME}
+    elif [ "${ALPAKA_CUDA_VERSION}" == "10.1" ]
+    then
+        ALPAKA_CUDA_PKG_FILE_NAME=cuda_10.1.168_425.25_win10.exe
+        ALPAKA_CUDA_PKG_FILE_PATH=https://developer.nvidia.com/compute/cuda/10.1/Prod/local_installers/${ALPAKA_CUDA_PKG_FILE_NAME}
+    elif [ "${ALPAKA_CUDA_VERSION}" == "10.2" ]
+    then
+        ALPAKA_CUDA_PKG_FILE_NAME=cuda_10.2.89_441.22_win10.exe
+        ALPAKA_CUDA_PKG_FILE_PATH=http://developer.download.nvidia.com/compute/cuda/10.2/Prod/local_installers/${ALPAKA_CUDA_PKG_FILE_NAME}
+    elif [ "${ALPAKA_CUDA_VERSION}" == "11.0" ]
+    then
+        ALPAKA_CUDA_PKG_FILE_NAME=cuda_11.0.3_451.82_win10.exe
+        ALPAKA_CUDA_PKG_FILE_PATH=http://developer.download.nvidia.com/compute/cuda/11.0.3/local_installers/${ALPAKA_CUDA_PKG_FILE_NAME}
+    elif [ "${ALPAKA_CUDA_VERSION}" == "11.1" ]
+    then
+        ALPAKA_CUDA_PKG_FILE_NAME=cuda_11.1.0_456.43_win10.exe
+        ALPAKA_CUDA_PKG_FILE_PATH=http://developer.download.nvidia.com/compute/cuda/11.1.0/local_installers/${ALPAKA_CUDA_PKG_FILE_NAME}
+    else
+        echo CUDA versions other than 10.0, 10.1, 10.2, 11.0 and 11.1 are not currently supported on Windows!
+    fi
+
+    curl -L -o cuda_installer.exe ${ALPAKA_CUDA_PKG_FILE_PATH}
+    ./cuda_installer.exe -s "nvcc_${ALPAKA_CUDA_VERSION}" "curand_dev_${ALPAKA_CUDA_VERSION}" "cudart_${ALPAKA_CUDA_VERSION}"
+fi
diff --git a/thirdParty/cupla/alpaka/script/install_doxygen.sh b/thirdParty/cupla/alpaka/script/install_doxygen.sh
new file mode 100755
index 0000000000..85aa67096b
--- /dev/null
+++ b/thirdParty/cupla/alpaka/script/install_doxygen.sh
@@ -0,0 +1,17 @@
+#!/bin/bash
+
+#
+# Copyright 2020 Benjamin Worpitz
+#
+# This file is part of alpaka.
+#
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+#
+
+source ./script/travis_retry.sh
+
+source ./script/set.sh
+
+travis_retry sudo apt-get -y --quiet --no-install-recommends install doxygen graphviz
diff --git a/thirdParty/cupla/alpaka/script/install_gcc.sh b/thirdParty/cupla/alpaka/script/install_gcc.sh
new file mode 100755
index 0000000000..5a840b806f
--- /dev/null
+++ b/thirdParty/cupla/alpaka/script/install_gcc.sh
@@ -0,0 +1,33 @@
+#!/bin/bash
+
+#
+# Copyright 2017-2019 Benjamin Worpitz
+#
+# This file is part of alpaka.
+#
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+#
+
+source ./script/travis_retry.sh
+
+source ./script/set.sh
+
+# Abort early when a required CI setting is not set in the environment.
+: "${ALPAKA_CI_GCC_VER?'ALPAKA_CI_GCC_VER must be specified'}"
+: "${ALPAKA_CI_SANITIZERS?'ALPAKA_CI_SANITIZERS must be specified'}"
+: "${CXX?'CXX must be specified'}"
+
+travis_retry sudo add-apt-repository -y ppa:ubuntu-toolchain-r/test
+travis_retry sudo apt-get -y --quiet update
+travis_retry sudo apt-get -y --quiet --allow-unauthenticated --no-install-recommends install "g++-${ALPAKA_CI_GCC_VER}"
+# Register the versioned binaries as the gcc/g++ alternatives with priority 50.
+for tool in gcc g++
+do
+    sudo update-alternatives --install "/usr/bin/${tool}" "${tool}" "/usr/bin/${tool}-${ALPAKA_CI_GCC_VER}" 50
+done
+# Install the TSan runtime package only when the thread sanitizer is requested.
+[[ "${ALPAKA_CI_SANITIZERS}" != *"TSan"* ]] || travis_retry sudo apt-get -y --quiet --allow-unauthenticated --no-install-recommends install libtsan0
+which "${CXX}"
+${CXX} -v
diff --git a/thirdParty/cupla/alpaka/script/install_tbb.sh b/thirdParty/cupla/alpaka/script/install_tbb.sh
new file mode 100755
index 0000000000..bd4fa7d359
--- /dev/null
+++ b/thirdParty/cupla/alpaka/script/install_tbb.sh
@@ -0,0 +1,36 @@
+#!/bin/bash
+
+#
+# Copyright 2017-2019 Benjamin Worpitz
+#
+# This file is part of alpaka.
+#
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+#
+
+source ./script/travis_retry.sh
+
+source ./script/set.sh
+
+# Install TBB: distro package on Linux, Homebrew on macOS, prebuilt 4.4.5 archive unpacked into TBB_ROOT on Windows.
+if [ "$ALPAKA_CI_OS_NAME" = "Linux" ]
+then
+    travis_retry sudo apt-get -y --quiet --allow-unauthenticated --no-install-recommends install libtbb-dev
+elif [ "$ALPAKA_CI_OS_NAME" = "macOS" ]
+then
+    brew install tbb
+elif [ "$ALPAKA_CI_OS_NAME" = "Windows" ]
+then
+    TBB_ARCHIVE_VER="tbb44_20160526oss"
+    TBB_DOWNLOAD_URL="https://github.com/intel/tbb/releases/download/4.4.5/${TBB_ARCHIVE_VER}_win.zip"
+    TBB_DST_PATH="tbb.zip"
+    powershell.exe -Command '[Net.ServicePointManager]::SecurityProtocol = [Net.SecurityProtocolType]::Tls12 ; Invoke-WebRequest "'"${TBB_DOWNLOAD_URL}"'" -OutFile "'"${TBB_DST_PATH}"'"'
+    mkdir -p "${TBB_ROOT}"
+    unzip -q "${TBB_DST_PATH}" -d "${TBB_ROOT}"
+    rm "${TBB_DST_PATH}"
+    TBB_UNZIP_DIR="${TBB_ROOT}/${TBB_ARCHIVE_VER}"
+    mv "${TBB_UNZIP_DIR}"/* "${TBB_ROOT}/"
+    rmdir "${TBB_UNZIP_DIR}"
+fi
diff --git a/thirdParty/cupla/alpaka/script/prepare_sanitizers.sh b/thirdParty/cupla/alpaka/script/prepare_sanitizers.sh
new file mode 100755
index 0000000000..fedacea00f
--- /dev/null
+++ b/thirdParty/cupla/alpaka/script/prepare_sanitizers.sh
@@ -0,0 +1,129 @@
+#!/bin/bash
+
+#
+# Copyright 2017-2019 Benjamin Worpitz
+#
+# This file is part of alpaka.
+#
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+#
+
+source ./script/set.sh
+
+#-------------------------------------------------------------------------------
+# Define the variables this script extends or assigns: each one that is still unset is exported as an empty string.
+if [ -z "${CMAKE_CXX_FLAGS+x}" ]
+then
+    export CMAKE_CXX_FLAGS=
+fi
+if [ -z "${CMAKE_EXE_LINKER_FLAGS+x}" ]
+then
+    export CMAKE_EXE_LINKER_FLAGS=
+fi
+if [ -z "${ASAN_OPTIONS+x}" ]
+then
+    export ASAN_OPTIONS=
+fi
+if [ -z "${LSAN_OPTIONS+x}" ]
+then
+    export LSAN_OPTIONS=
+fi
+
+#-------------------------------------------------------------------------------
+# Translate ALPAKA_CI_SANITIZERS (any of UBSan, ASan, TSan, MSan) into compiler/linker flags and runtime options.
+# General settings shared by all sanitizers
+if [[ "${ALPAKA_CI_SANITIZERS}" != "" ]]
+then
+    # - to get nicer stack-traces:
+    CMAKE_CXX_FLAGS="${CMAKE_CXX_FLAGS} -fno-omit-frame-pointer"
+    # - to get perfect stack-traces:
+    CMAKE_CXX_FLAGS="${CMAKE_CXX_FLAGS} -fno-optimize-sibling-calls"
+
+    # g++ needs to use a different linker
+    if [[ "${CXX}" == "g++" ]]
+    then
+        CMAKE_EXE_LINKER_FLAGS="${CMAKE_EXE_LINKER_FLAGS} -fuse-ld=gold"
+    fi
+
+    # UBSan - http://clang.llvm.org/docs/UndefinedBehaviorSanitizer.html
+    if [[ "${ALPAKA_CI_SANITIZERS}" == *"UBSan"* ]]
+    then
+        CMAKE_CXX_FLAGS="${CMAKE_CXX_FLAGS} -fsanitize=undefined"
+
+        if [[ "${CXX}" == "clang++" ]]
+        then
+            CMAKE_CXX_FLAGS="${CMAKE_CXX_FLAGS} -fsanitize-blacklist=$(pwd)/test/sanitizer_ubsan_blacklist.txt"
+
+            # Previously 'local-bounds' was part of UBsan but has been removed because it is not a pure front-end check
+            CMAKE_CXX_FLAGS="${CMAKE_CXX_FLAGS} -fsanitize=local-bounds"
+            # 'unsigned-integer-overflow' is not really undefined behaviour but we want to handle it as such for our tests.
+            CMAKE_CXX_FLAGS="${CMAKE_CXX_FLAGS} -fsanitize=unsigned-integer-overflow"
+        fi
+    fi
+
+    # ASan - http://clang.llvm.org/docs/AddressSanitizer.html
+    if [[ "${ALPAKA_CI_SANITIZERS}" == *"ASan"* ]]
+    then
+        if [[ "${ALPAKA_CI_SANITIZERS}" == *"TSan"* ]] || [[ "${ALPAKA_CI_SANITIZERS}" == *"MSan"* ]]
+        then
+            echo ASan is not supported in combination with TSan or MSan
+            exit 1
+        fi
+
+        # The CUDA settings are optional here, so unset variables are treated as empty instead of tripping `set -u`.
+        if [ "${ALPAKA_ACC_GPU_CUDA_ENABLE:-}" == "ON" ] && [ "${ALPAKA_CUDA_COMPILER:-}" == "clang" ]
+        then
+            # fatal error: error in backend: Module has a nontrivial global ctor, which NVPTX does not support.
+            # clang-3.9: error: clang frontend command failed with exit code 70 (use -v to see invocation)
+            echo ASan is not supported in combination with clang used as CUDA compiler
+            exit 1
+        fi
+        CMAKE_CXX_FLAGS="${CMAKE_CXX_FLAGS} -fsanitize=address"
+
+        if [[ "${CXX}" != "clang++" ]]
+        then
+            CMAKE_CXX_FLAGS="${CMAKE_CXX_FLAGS} -fsanitize-address-use-after-scope"
+        fi
+
+        ASAN_OPTIONS="strict_string_checks=1:detect_stack_use_after_return=1:check_initialization_order=1:strict_init_order=1"
+        LSAN_OPTIONS="print_suppressions=1:suppressions=$(pwd)/test/sanitizer_lsan_blacklist.txt"
+    fi
+
+    # TSan - http://clang.llvm.org/docs/ThreadSanitizer.html
+    # TSan requires PositionIndependentCode -pie;-fPIE;-fPIC. clang sets this automatically, gcc not.
+    # All base libraries (e.g. boost) have to be built with this flag.
+    # Furthermore, by installing gcc, libtsan0 is not automatically installed.
+    if [[ "${ALPAKA_CI_SANITIZERS}" == *"TSan"* ]]
+    then
+        if [[ "${ALPAKA_CI_SANITIZERS}" == *"ASan"* ]] || [[ "${ALPAKA_CI_SANITIZERS}" == *"MSan"* ]]
+        then
+            echo TSan is not supported in combination with ASan or MSan
+            exit 1
+        fi
+
+        CMAKE_CXX_FLAGS="${CMAKE_CXX_FLAGS} -fsanitize=thread"
+        if [ "${CXX}" == "g++" ]
+        then
+            CMAKE_CXX_FLAGS="${CMAKE_CXX_FLAGS} -pie -fPIE"
+            CMAKE_EXE_LINKER_FLAGS="${CMAKE_EXE_LINKER_FLAGS} -ltsan"
+        fi
+    fi
+
+    # MSan - http://clang.llvm.org/docs/MemorySanitizer.html
+    # NOTE: Currently we can not enable this for CI as this finds some 'use-of-uninitialized-value' inside:
+    #   - boost`s smart pointers used by the unit test framework
+    #   - alpaka/test/integ/mandelbrot/src/main.cpp:450:9 std::replace
+    #   - alpaka/include/alpaka/kernel/TaskKernelCpuThreads.hpp:307:21 used alpaka/include/alpaka/idx/bt/IdxBtRefThreadIdMap.hpp:130:44
+    if [[ "${ALPAKA_CI_SANITIZERS}" == *"MSan"* ]]
+    then
+        if [[ "${ALPAKA_CI_SANITIZERS}" == *"ASan"* ]] || [[ "${ALPAKA_CI_SANITIZERS}" == *"TSan"* ]]
+        then
+            echo MSan is not supported in combination with ASan or TSan
+            exit 1
+        fi
+
+        CMAKE_CXX_FLAGS="${CMAKE_CXX_FLAGS} -fsanitize=memory -fsanitize-memory-track-origins"
+    fi
+fi
diff --git a/thirdParty/cupla/alpaka/script/print_env.sh b/thirdParty/cupla/alpaka/script/print_env.sh
new file mode 100755
index 0000000000..0c2b4ea213
--- /dev/null
+++ b/thirdParty/cupla/alpaka/script/print_env.sh
@@ -0,0 +1,35 @@
+#!/bin/bash
+
+#
+# Copyright 2017-2019 Benjamin Worpitz
+#
+# This file is part of alpaka.
+#
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+#
+
+source ./script/travis_retry.sh
+
+source ./script/set.sh
+
+# Log the workspace path when ALPAKA_CI is GITHUB and, on Linux, service and memory diagnostics.
+if [ "$ALPAKA_CI" = "GITHUB" ]
+then
+    echo GITHUB_WORKSPACE: "${GITHUB_WORKSPACE}"
+fi
+
+if [ "$ALPAKA_CI_OS_NAME" = "Linux" ]
+then
+    # List all init services together with their status
+    sudo service --status-all
+
+    # Stop some unnecessary services to save memory
+    sudo /etc/init.d/mysql stop
+
+    # Install smem, then show its memory report and the system totals in MiB (free -m -t)
+    travis_retry sudo apt-get -y --quiet --allow-unauthenticated --no-install-recommends install smem
+    sudo smem
+    sudo free -m -t
+fi
diff --git a/thirdParty/cupla/alpaka/script/push_doc.sh b/thirdParty/cupla/alpaka/script/push_doc.sh
new file mode 100755
index 0000000000..27106fe1d7
--- /dev/null
+++ b/thirdParty/cupla/alpaka/script/push_doc.sh
@@ -0,0 +1,28 @@
+#!/bin/bash
+
+#
+# Copyright 2020 Benjamin Worpitz
+#
+# This file is part of alpaka.
+#
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+#
+
+source ./script/travis_retry.sh
+
+source ./script/set.sh
+
+cd docs/doxygen/html
+git config --global user.email "action@github.com"
+git config --global user.name "GitHub Action"
+git add -f .
+git log -n 3
+# Commit and push to gh-pages only when the working tree or the index contains changes.
+if ! git diff --quiet || ! git diff --staged --quiet
+then
+    git commit -m "Update documentation skip-checks: true"
+    git push origin gh-pages
+fi
+cd ../../../
diff --git a/thirdParty/cupla/alpaka/script/run.sh b/thirdParty/cupla/alpaka/script/run.sh
new file mode 100755
index 0000000000..531267fad1
--- /dev/null
+++ b/thirdParty/cupla/alpaka/script/run.sh
@@ -0,0 +1,154 @@
+#!/bin/bash
+
+#
+# Copyright 2017-2019 Benjamin Worpitz
+#
+# This file is part of alpaka.
+#
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+#
+
+source ./script/set.sh
+
+: "${ALPAKA_CI_CMAKE_DIR?'ALPAKA_CI_CMAKE_DIR must be specified'}"
+echo "ALPAKA_CI_CMAKE_DIR: ${ALPAKA_CI_CMAKE_DIR}"
+: "${ALPAKA_CI_ANALYSIS?'ALPAKA_CI_ANALYSIS must be specified'}"
+echo "ALPAKA_CI_ANALYSIS: ${ALPAKA_CI_ANALYSIS}"
+: "${ALPAKA_CI_INSTALL_CUDA?'ALPAKA_CI_INSTALL_CUDA must be specified'}"
+: "${ALPAKA_CI_INSTALL_HIP?'ALPAKA_CI_INSTALL_HIP must be specified'}"
+if [ "$ALPAKA_CI_OS_NAME" = "Linux" ]
+then
+    : "${ALPAKA_CI_STDLIB?'ALPAKA_CI_STDLIB must be specified'}"
+    echo "ALPAKA_CI_STDLIB: ${ALPAKA_CI_STDLIB}"
+fi
+: "${CXX?'CXX must be specified'}"
+echo "CXX: ${CXX}"
+
+
+if [ "$ALPAKA_CI_OS_NAME" = "Linux" ]
+then
+    if [ -z "${LD_LIBRARY_PATH+x}" ]
+    then
+        LD_LIBRARY_PATH=
+    fi
+    if [ "${CXX}" = "clang++" ]
+    then
+        if [ "${ALPAKA_CI_CLANG_VER}" -ge "10" ]
+        then
+            export LD_LIBRARY_PATH="/usr/lib/llvm-${ALPAKA_CI_CLANG_VER}/lib/:${LD_LIBRARY_PATH}"
+        fi
+    fi
+fi
+
+# CMake
+if [ "$ALPAKA_CI_OS_NAME" = "Linux" ]
+then
+    export PATH=${ALPAKA_CI_CMAKE_DIR}/bin:${PATH}
+fi
+cmake --version
+
+#TBB
+if [ "$ALPAKA_CI_OS_NAME" = "Windows" ]
+then
+    ALPAKA_TBB_BIN_DIR="${TBB_ROOT}/bin/intel64/vc14"
+    export PATH=${PATH}:"${ALPAKA_TBB_BIN_DIR}"
+fi
+
+# CUDA: when a CUDA installation was requested, expose the toolkit selected by ALPAKA_CUDA_VERSION
+if [ "${ALPAKA_CI_INSTALL_CUDA}" == "ON" ]
+then
+    : "${ALPAKA_CUDA_VERSION?'ALPAKA_CUDA_VERSION must be specified'}"
+
+    if [ "$ALPAKA_CI_OS_NAME" = "Linux" ]
+    then
+        # Prepend the versioned toolkit's bin and lib64 directories to the search paths
+        export PATH=/usr/local/cuda-${ALPAKA_CUDA_VERSION}/bin:$PATH
+        export LD_LIBRARY_PATH=/usr/local/cuda-${ALPAKA_CUDA_VERSION}/lib64:${LD_LIBRARY_PATH}
+        # We have to explicitly add the stub libcuda.so to CUDA_LIB_PATH because the real one would be installed by the driver (which we can not install).
+        export CUDA_LIB_PATH=/usr/local/cuda/lib64/stubs/
+
+        if [ "${ALPAKA_CUDA_COMPILER}" == "nvcc" ]
+        then
+            which nvcc
+            nvcc -V
+        fi
+    elif [ "$ALPAKA_CI_OS_NAME" = "Windows" ]
+    then
+        export PATH="C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v${ALPAKA_CUDA_VERSION}\bin":$PATH
+        export CUDA_PATH="C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v${ALPAKA_CUDA_VERSION}"
+    fi
+fi
+
+# HIP: when a HIP installation was requested, set up its paths and log the tool versions
+if [ "${ALPAKA_CI_INSTALL_HIP}" == "ON" ]
+then
+: "${ALPAKA_CI_HIP_ROOT_DIR?'ALPAKA_CI_HIP_ROOT_DIR must be specified'}"
+
+    # NOTE(review): ALPAKA_CI_HIP_ROOT_DIR is only checked above; the paths below use the fixed /opt/rocm prefix - confirm this is intended
+    # HIP_PATH required by HIP tools
+    export HIP_PATH=/opt/rocm
+
+    export PATH=${HIP_PATH}/bin:$PATH
+    export LD_LIBRARY_PATH=${HIP_PATH}/lib64:${HIP_PATH}/hiprand/lib:${LD_LIBRARY_PATH}
+    export CMAKE_PREFIX_PATH=${HIP_PATH}:${HIP_PATH}/hiprand:${CMAKE_PREFIX_PATH:-}
+    export CMAKE_MODULE_PATH=${HIP_PATH}/hip/cmake
+    # calls nvcc or clang
+    which hipcc
+    hipcc --version
+    which hipconfig
+    hipconfig --platform
+    hipconfig -v
+    # print newline as previous command does not do this
+    echo
+
+fi
+
+# stdlib
+if [ "$ALPAKA_CI_OS_NAME" = "Linux" ]
+then
+    if [ "${ALPAKA_CI_STDLIB}" == "libc++" ]
+    then
+        if [ -z "${CMAKE_CXX_FLAGS+x}" ]
+        then
+            export CMAKE_CXX_FLAGS=
+        fi
+        CMAKE_CXX_FLAGS="${CMAKE_CXX_FLAGS} -stdlib=libc++"
+
+        if [ -z "${CMAKE_EXE_LINKER_FLAGS+x}" ]
+        then
+            export CMAKE_EXE_LINKER_FLAGS=
+        fi
+        CMAKE_EXE_LINKER_FLAGS="${CMAKE_EXE_LINKER_FLAGS} -lc++ -lc++abi"
+    fi
+
+    which "${CXX}"
+    ${CXX} -v
+
+    source ./script/prepare_sanitizers.sh
+fi
+
+if [ "$ALPAKA_CI_OS_NAME" = "Windows" ]
+then
+    : "${ALPAKA_CI_CL_VER?ALPAKA_CI_CL_VER must be specified}"
+    # Use the 64 bit compiler
+    # FIXME: Path not found but does not seem to be necessary anymore
+    #"./C/Program Files (x86)/Microsoft Visual Studio/2017/Community/VC/Auxiliary/Build/vcvarsall.bat" amd64
+
+    # Locate MSBuild for the requested Visual Studio version; exported separately so a failing vswhere is not masked.
+    if [ "$ALPAKA_CI_CL_VER" = "2017" ]
+    then
+        MSBUILD_EXECUTABLE="/C/Program Files (x86)/Microsoft Visual Studio/2017/Enterprise/MSBuild/15.0/Bin/MSBuild.exe"
+    elif [ "$ALPAKA_CI_CL_VER" = "2019" ]
+    then
+        MSBUILD_EXECUTABLE=$(vswhere.exe -latest -requires Microsoft.Component.MSBuild -find "MSBuild\**\Bin\MSBuild.exe")
+    fi
+    export MSBUILD_EXECUTABLE
+    "$MSBUILD_EXECUTABLE" -version
+fi
+
+./script/run_generate.sh
+./script/run_build.sh
+if [ "${ALPAKA_CI_ANALYSIS}" == "OFF" ] ;then ./script/run_tests.sh ;fi
+if [ "${ALPAKA_CI_ANALYSIS}" == "ON" ] ;then ./script/run_analysis.sh ;fi
diff --git a/thirdParty/cupla/alpaka/script/run_analysis.sh b/thirdParty/cupla/alpaka/script/run_analysis.sh
new file mode 100755
index 0000000000..add4b06232
--- /dev/null
+++ b/thirdParty/cupla/alpaka/script/run_analysis.sh
@@ -0,0 +1,30 @@
+#!/bin/bash
+
+#
+# Copyright 2017-2019 Benjamin Worpitz
+#
+# This file is part of alpaka.
+#
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+#
+
+source ./script/set.sh
+
+if [ "$ALPAKA_CI_OS_NAME" = "Linux" ] || [ "$ALPAKA_CI_OS_NAME" = "macOS" ]
+then
+    #-------------------------------------------------------------------------------
+    # sloc
+    sloccount .
+
+    #-------------------------------------------------------------------------------
+    # TODO/FIXME/HACK
+    grep -r HACK ./* || true
+    grep -r FIXME ./* || true
+    grep -r TODO ./* || true
+
+    #-------------------------------------------------------------------------------
+    # check shell script with shellcheck
+    find . -type f -name "*.sh" -exec shellcheck {} \;
+fi
diff --git a/thirdParty/cupla/alpaka/script/run_build.sh b/thirdParty/cupla/alpaka/script/run_build.sh
new file mode 100755
index 0000000000..edd7bf88c9
--- /dev/null
+++ b/thirdParty/cupla/alpaka/script/run_build.sh
@@ -0,0 +1,25 @@
+#!/bin/bash
+
+#
+# Copyright 2014-2019 Benjamin Worpitz
+#
+# This file is part of alpaka.
+#
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+#
+
+source ./script/set.sh
+
+cd build/
+
+if [ "$ALPAKA_CI_OS_NAME" = "Linux" ] || [ "$ALPAKA_CI_OS_NAME" = "macOS" ]
+then
+    make VERBOSE=1
+elif [ "$ALPAKA_CI_OS_NAME" = "Windows" ]
+then
+    "$MSBUILD_EXECUTABLE" "alpaka.sln" -p:Configuration=${CMAKE_BUILD_TYPE} -maxcpucount:1 -verbosity:minimal
+fi
+
+cd ..
diff --git a/thirdParty/cupla/alpaka/script/run_doxygen.sh b/thirdParty/cupla/alpaka/script/run_doxygen.sh
new file mode 100755
index 0000000000..ca1e890d80
--- /dev/null
+++ b/thirdParty/cupla/alpaka/script/run_doxygen.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+
+#
+# Copyright 2020 Benjamin Worpitz
+#
+# This file is part of alpaka.
+#
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+#
+
+source ./script/travis_retry.sh
+
+source ./script/set.sh
+
+#To deploy the doxygen documentation a copy of the repository is created inside the deployed folder.
+#This copy is always in the gh-pages branch consisting only of the containing files.
+#This folder is ignored in all other branches.
+#*NOTE:* This has already been done once and does not have to be repeated!
+#On working branch:
+#- Add deploy directory to `.gitignore` (if not already done)
+#- Create the `gh-pages` branch: `git checkout --orphan gh-pages`
+#- Clean the branch: `git rm -rf .`
+#- Commit and push the branch: `git add --all`, `git commit -m"add gh-pages branch"`, `git push`
+
+# Drop any stale copy, then clone the gh-pages branch of repository $1 (access token $2) into docs/doxygen/html.
+rm -rf docs/doxygen/html
+travis_retry git clone -b gh-pages "https://x-access-token:${2}@github.com/${1}.git" docs/doxygen/html
+
+cd docs/
+# Remove the previously generated files; the glob does not match the clone's hidden .git directory.
+rm -rf doxygen/html/*
+rm -rf doxygen/xml/*
+
+doxygen Doxyfile
+cd ../
diff --git a/thirdParty/cupla/alpaka/script/run_generate.sh b/thirdParty/cupla/alpaka/script/run_generate.sh
new file mode 100755
index 0000000000..b422ed6ad5
--- /dev/null
+++ b/thirdParty/cupla/alpaka/script/run_generate.sh
@@ -0,0 +1,90 @@
+#!/bin/bash
+
+#
+# Copyright 2014-2019 Benjamin Worpitz
+#
+# This file is part of alpaka.
+#
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+#
+
+source ./script/set.sh
+
+#-------------------------------------------------------------------------------
+
+# Create a cmake variable definition from an environment variable.
+#
+# Quote the call ("$(env2cmake VAR)") so that a value containing spaces stays a single argument.
+#
+# @param $1 cmake/environment variable name
+#
+# @result "-D$1=<value>" if $1 is set and non-empty, else nothing (an unset variable is not an error under `set -u`)
+#
+# @code{.bash}
+# FOO=ON
+# echo "$(env2cmake FOO)" # returns "-DFOO=ON"
+# echo "$(env2cmake BAR)" # returns nothing
+# @endcode
+function env2cmake()
+{
+    if [ -n "${!1:-}" ] ; then
+        printf '%s' "-D$1=${!1}"
+    fi
+}
+
+#-------------------------------------------------------------------------------
+if [ ! -z "${CMAKE_CXX_FLAGS+x}" ]
+then
+    echo "CMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}"
+fi
+if [ ! -z "${CMAKE_EXE_LINKER_FLAGS+x}" ]
+then
+    echo "CMAKE_EXE_LINKER_FLAGS=${CMAKE_EXE_LINKER_FLAGS}"
+fi
+# Use the cmake found in PATH, except on Linux where the binary below ALPAKA_CI_CMAKE_DIR is called directly.
+ALPAKA_CI_CMAKE_EXECUTABLE=cmake
+if [ "$ALPAKA_CI_OS_NAME" = "Linux" ]
+then
+    ALPAKA_CI_CMAKE_EXECUTABLE="${ALPAKA_CI_CMAKE_DIR}/bin/cmake"
+fi
+# Choose the generator; the platform argument stays empty unless the Windows branch sets it.
+ALPAKA_CI_CMAKE_GENERATOR_PLATFORM=
+if [ "$ALPAKA_CI_OS_NAME" = "Linux" ] || [ "$ALPAKA_CI_OS_NAME" = "macOS" ]
+then
+    ALPAKA_CI_CMAKE_GENERATOR="Unix Makefiles"
+elif [ "$ALPAKA_CI_OS_NAME" = "Windows" ]
+then
+    : ${ALPAKA_CI_CL_VER?"ALPAKA_CI_CL_VER must be specified"}
+
+    # Select the Visual Studio generator matching ALPAKA_CI_CL_VER (only 2017 and 2019 are handled)
+    if [ "$ALPAKA_CI_CL_VER" = "2017" ]
+    then
+        ALPAKA_CI_CMAKE_GENERATOR="Visual Studio 15 2017"
+    elif [ "$ALPAKA_CI_CL_VER" = "2019" ]
+    then
+        ALPAKA_CI_CMAKE_GENERATOR="Visual Studio 16 2019"
+    fi
+    ALPAKA_CI_CMAKE_GENERATOR_PLATFORM="-A x64"
+fi
+
+mkdir -p build/
+cd build/
+# Configure in build/: each "$(env2cmake VAR)" passes -DVAR=<value> when VAR has a value and an empty argument otherwise.
+"${ALPAKA_CI_CMAKE_EXECUTABLE}" --verbose -G "${ALPAKA_CI_CMAKE_GENERATOR}" ${ALPAKA_CI_CMAKE_GENERATOR_PLATFORM}\
+    -Dalpaka_BUILD_EXAMPLES=ON -DBUILD_TESTING=ON \
+    "$(env2cmake BOOST_ROOT)" -DBOOST_LIBRARYDIR="${ALPAKA_CI_BOOST_LIB_DIR}/lib" -DBoost_USE_STATIC_LIBS=ON -DBoost_USE_MULTITHREADED=ON -DBoost_USE_STATIC_RUNTIME=OFF -DBoost_ARCHITECTURE="-x64" \
+    "$(env2cmake CMAKE_BUILD_TYPE)" "$(env2cmake CMAKE_CXX_FLAGS)" "$(env2cmake CMAKE_EXE_LINKER_FLAGS)" "$(env2cmake CMAKE_CXX_EXTENSIONS)"\
+    "$(env2cmake ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLE)" "$(env2cmake ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLE)" "$(env2cmake ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE)" \
+    "$(env2cmake ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLE)" \
+    "$(env2cmake ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE)" "$(env2cmake ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE)" \
+    "$(env2cmake ALPAKA_ACC_ANY_BT_OMP5_ENABLE)" "$(env2cmake ALPAKA_ACC_ANY_BT_OACC_ENABLE)" "$(env2cmake ALPAKA_OFFLOAD_MAX_BLOCK_SIZE)" \
+    "$(env2cmake TBB_ROOT)" \
+    "$(env2cmake ALPAKA_ACC_GPU_CUDA_ENABLE)" "$(env2cmake ALPAKA_CUDA_VERSION)" "$(env2cmake ALPAKA_ACC_GPU_CUDA_ONLY_MODE)" "$(env2cmake ALPAKA_CUDA_ARCH)" "$(env2cmake ALPAKA_CUDA_COMPILER)" \
+    "$(env2cmake ALPAKA_CUDA_FAST_MATH)" "$(env2cmake ALPAKA_CUDA_FTZ)" "$(env2cmake ALPAKA_CUDA_SHOW_REGISTER)" "$(env2cmake ALPAKA_CUDA_KEEP_FILES)" "$(env2cmake ALPAKA_CUDA_NVCC_EXPT_EXTENDED_LAMBDA)" "$(env2cmake ALPAKA_CUDA_NVCC_SEPARABLE_COMPILATION)" \
+    "$(env2cmake ALPAKA_ACC_GPU_HIP_ENABLE)" "$(env2cmake ALPAKA_ACC_GPU_HIP_ONLY_MODE)" "$(env2cmake ALPAKA_HIP_PLATFORM)" \
+    "$(env2cmake ALPAKA_DEBUG)" "$(env2cmake ALPAKA_CI)" "$(env2cmake ALPAKA_CI_ANALYSIS)" "$(env2cmake ALPAKA_CXX_STANDARD)" \
+    ".."
+
+cd ..
diff --git a/thirdParty/cupla/alpaka/script/run_tests.sh b/thirdParty/cupla/alpaka/script/run_tests.sh
new file mode 100755
index 0000000000..2a6a0c00f0
--- /dev/null
+++ b/thirdParty/cupla/alpaka/script/run_tests.sh
@@ -0,0 +1,40 @@
+#!/bin/bash
+
+#
+# Copyright 2017-2019 Benjamin Worpitz
+#
+# This file is part of alpaka.
+#
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+#
+
+source ./script/set.sh
+
+: "${ALPAKA_ACC_GPU_CUDA_ENABLE?'ALPAKA_ACC_GPU_CUDA_ENABLE must be specified'}"
+: "${ALPAKA_ACC_GPU_HIP_ENABLE?'ALPAKA_ACC_GPU_HIP_ENABLE must be specified'}"
+
+if [ ! -z "${OMP_THREAD_LIMIT+x}" ]
+then
+    echo "OMP_THREAD_LIMIT=${OMP_THREAD_LIMIT}"
+fi
+if [ ! -z "${OMP_NUM_THREADS+x}" ]
+then
+    echo "OMP_NUM_THREADS=${OMP_NUM_THREADS}"
+fi
+
+if [ "${ALPAKA_ACC_GPU_CUDA_ENABLE}" == "OFF" ] && [ "${ALPAKA_ACC_GPU_HIP_ENABLE}" == "OFF" ];
+then
+    cd build/
+
+    if [ "$ALPAKA_CI_OS_NAME" = "Linux" ] || [ "$ALPAKA_CI_OS_NAME" = "macOS" ]
+    then
+        ctest -V
+    elif [ "$ALPAKA_CI_OS_NAME" = "Windows" ]
+    then
+        ctest -V -C ${CMAKE_BUILD_TYPE}
+    fi
+
+    cd ..
+fi
diff --git a/thirdParty/cupla/alpaka/script/set.sh b/thirdParty/cupla/alpaka/script/set.sh
new file mode 100755
index 0000000000..7f2172ec82
--- /dev/null
+++ b/thirdParty/cupla/alpaka/script/set.sh
@@ -0,0 +1,19 @@
+#!/bin/bash
+
+#
+# Copyright 2018-2019 Benjamin Worpitz
+#
+# This file is part of alpaka.
+#
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+#
+
+#-------------------------------------------------------------------------------
+# -e: exit as soon as one command returns a non-zero exit code
+# -o pipefail: pipeline returns exit code of the rightmost command with a non-zero exit code
+# -u: treat unset variables as an error
+# -v: Print shell input lines as they are read
+# -x: Print command traces before executing command
+set -eouvx pipefail
diff --git a/thirdParty/cupla/alpaka/script/travis/after_failure.sh b/thirdParty/cupla/alpaka/script/travis/after_failure.sh
deleted file mode 100755
index 4f791807d6..0000000000
--- a/thirdParty/cupla/alpaka/script/travis/after_failure.sh
+++ /dev/null
@@ -1,21 +0,0 @@
-#!/bin/bash
-
-#
-# Copyright 2018-2019 Benjamin Worpitz
-#
-# This file is part of Alpaka.
-#
-# This Source Code Form is subject to the terms of the Mozilla Public
-# License, v. 2.0. If a copy of the MPL was not distributed with this
-# file, You can obtain one at http://mozilla.org/MPL/2.0/.
-#
-
-source ./script/travis/set.sh
-
-if [ "$TRAVIS_OS_NAME" = "linux" ]
-then
-  sudo smem
-  sudo free -m -t
-  # show actions of the OOM killer
-  sudo dmesg
-fi
diff --git a/thirdParty/cupla/alpaka/script/travis/before_install.sh b/thirdParty/cupla/alpaka/script/travis/before_install.sh
deleted file mode 100755
index 3f87e6ed72..0000000000
--- a/thirdParty/cupla/alpaka/script/travis/before_install.sh
+++ /dev/null
@@ -1,168 +0,0 @@
-#!/bin/bash
-
-#
-# Copyright 2017-2019 Benjamin Worpitz
-#
-# This file is part of Alpaka.
-#
-# This Source Code Form is subject to the terms of the Mozilla Public
-# License, v. 2.0. If a copy of the MPL was not distributed with this
-# file, You can obtain one at http://mozilla.org/MPL/2.0/.
-#
-
-source ./script/travis/set.sh
-
-#-------------------------------------------------------------------------------
-# Those are set to g++/gcc within the git bash even though they are overwritten in the .travis.yml file.
-if [ "$TRAVIS_OS_NAME" = "windows" ]
-then
-    CXX=cl.exe
-    CC=cl.exe
-fi
-
-#-------------------------------------------------------------------------------
-# gcc
-if [ ! -z "${ALPAKA_CI_GCC_VER+x}" ]
-then
-    ALPAKA_CI_GCC_VER_SEMANTIC=( ${ALPAKA_CI_GCC_VER//./ } )
-    export ALPAKA_CI_GCC_VER_MAJOR="${ALPAKA_CI_GCC_VER_SEMANTIC[0]}"
-    echo ALPAKA_CI_GCC_VER_MAJOR: "${ALPAKA_CI_GCC_VER_MAJOR}"
-fi
-
-#-------------------------------------------------------------------------------
-# Boost.
-ALPAKA_CI_BOOST_BRANCH_MAJOR=${ALPAKA_CI_BOOST_BRANCH:6:1}
-echo ALPAKA_CI_BOOST_BRANCH_MAJOR: "${ALPAKA_CI_BOOST_BRANCH_MAJOR}"
-ALPAKA_CI_BOOST_BRANCH_MINOR=${ALPAKA_CI_BOOST_BRANCH:8:2}
-echo ALPAKA_CI_BOOST_BRANCH_MINOR: "${ALPAKA_CI_BOOST_BRANCH_MINOR}"
-
-#-------------------------------------------------------------------------------
-# CUDA
-export ALPAKA_CI_INSTALL_CUDA="OFF"
-if [ "${ALPAKA_ACC_GPU_CUDA_ENABLE}" == "ON" ]
-then
-    export ALPAKA_CI_INSTALL_CUDA="ON"
-fi
-if [ "${ALPAKA_ACC_GPU_HIP_ENABLE}" == "ON" ]
-then
-    if [ "${ALPAKA_HIP_PLATFORM}" == "nvcc" ]
-    then
-        export ALPAKA_CI_INSTALL_CUDA="ON"
-    fi
-fi
-
-#-------------------------------------------------------------------------------
-# HIP
-export ALPAKA_CI_INSTALL_HIP="OFF"
-if [ "${ALPAKA_ACC_GPU_HIP_ENABLE}" == "ON" ]
-then
-    export ALPAKA_CI_INSTALL_HIP="ON"
-
-    # if platform is nvcc, CUDA part is already processed in this file.
-    if [ "${ALPAKA_HIP_PLATFORM}" == "hcc" ]
-    then
-        echo "HIP(hcc) not supported yet."
-        exit 1
-    fi
-fi
-
-#-------------------------------------------------------------------------------
-# TBB
-export ALPAKA_CI_INSTALL_TBB="OFF"
-if [ ! -z "${ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLE+x}" ]
-then
-    if [ "${ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLE}" = "ON" ]
-    then
-        export ALPAKA_CI_INSTALL_TBB="ON"
-    fi
-else
-    # If the variable is not set, the backend will most probably be used by default so we install it.
-    export ALPAKA_CI_INSTALL_TBB="ON"
-fi
-
-#-------------------------------------------------------------------------------
-# Fibers
-export ALPAKA_CI_INSTALL_FIBERS="OFF"
-if [ ! -z "${ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE+x}" ]
-then
-    if [ "${ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE}" = "ON" ]
-    then
-        export ALPAKA_CI_INSTALL_FIBERS="ON"
-    fi
-else
-    # If the variable is not set, the backend will most probably be used by default so we install it.
-    export ALPAKA_CI_INSTALL_FIBERS="ON"
-fi
-
-
-# GCC-5.5 has broken avx512vlintrin.h in Release mode with NVCC 9.X
-#   https://gcc.gnu.org/bugzilla/show_bug.cgi?id=76731
-#   https://github.com/tensorflow/tensorflow/issues/10220
-if [ "${ALPAKA_CI_INSTALL_CUDA}" == "ON" ]
-then
-    if [ "${CXX}" == "g++" ]
-    then
-        if (( "${ALPAKA_CI_GCC_VER_MAJOR}" == 5 ))
-        then
-            if [ "${ALPAKA_CUDA_COMPILER}" == "nvcc" ]
-            then
-                if [ "${CMAKE_BUILD_TYPE}" == "Release" ]
-                then
-                    export CMAKE_BUILD_TYPE=Debug
-                fi
-            fi
-        fi
-    fi
-fi
-
-#-------------------------------------------------------------------------------
-if [ "$TRAVIS_OS_NAME" = "linux" ]
-then
-    if [ "${ALPAKA_CI_STDLIB}" == "libc++" ]
-    then
-        if [ "${CXX}" == "g++" ]
-        then
-            echo "using libc++ with g++ not yet supported."
-            exit 1
-        fi
-
-        if [ "${ALPAKA_CI_DOCKER_BASE_IMAGE_NAME}" == "ubuntu:14.04" ]
-        then
-            echo "using libc++ with ubuntu:14.04 not supported."
-            exit 1
-        fi
-
-        if (( ( ( "${ALPAKA_CI_BOOST_BRANCH_MAJOR}" == 1 ) && ( "${ALPAKA_CI_BOOST_BRANCH_MINOR}" < 65 ) ) || ( "${ALPAKA_CI_BOOST_BRANCH_MAJOR}" < 1 ) ))
-        then
-            echo "using libc++ with boost < 1.65 is not supported."
-            exit 1
-        fi
-    fi
-
-    if [ "${ALPAKA_CI_STDLIB}" == "libstdc++" ]
-    then
-        if [ ! -z "${ALPAKA_CXX_STANDARD+x}" ]
-        then
-            if (( "${ALPAKA_CXX_STANDARD}" >= 17 ))
-            then
-                if [ "${CXX}" == "clang++" ]
-                then
-                    if (( "${ALPAKA_CI_CLANG_LIBSTDCPP_VERSION}" < 7 ))
-                    then
-                        echo "Clang used in c++17 mode requires libstdc++-7 or newer."
-                        exit 1
-                    fi
-                fi
-                if [ "${ALPAKA_CI_INSTALL_FIBERS}" == "ON" ]
-                then
-                    if (( ( ( "${ALPAKA_CI_BOOST_BRANCH_MAJOR}" == 1 ) && ( "${ALPAKA_CI_BOOST_BRANCH_MINOR}" < 67 ) ) || ( "${ALPAKA_CI_BOOST_BRANCH_MAJOR}" < 1 ) ))
-                    then
-                        # https://github.com/boostorg/coroutine2/issues/26
-                        echo "libstdc++ in c++17 mode is not compatible with boost.fibers in boost-1.66 and below."
-                        exit 1
-                    fi
-                fi
-            fi
-        fi
-    fi
-fi
\ No newline at end of file
diff --git a/thirdParty/cupla/alpaka/script/travis/docker_install.sh b/thirdParty/cupla/alpaka/script/travis/docker_install.sh
deleted file mode 100755
index 02b267cf46..0000000000
--- a/thirdParty/cupla/alpaka/script/travis/docker_install.sh
+++ /dev/null
@@ -1,127 +0,0 @@
-#!/bin/bash
-
-#
-# Copyright 2017-2019 Benjamin Worpitz
-#
-# This file is part of Alpaka.
-#
-# This Source Code Form is subject to the terms of the Mozilla Public
-# License, v. 2.0. If a copy of the MPL was not distributed with this
-# file, You can obtain one at http://mozilla.org/MPL/2.0/.
-#
-
-source ./script/travis/set.sh
-
-ls "${ALPAKA_CI_DOCKER_CACHE_DIR}"
-
-ALPAKA_DOCKER_BUILD_REQUIRED=1
-
-if [ -f "${ALPAKA_CI_DOCKER_CACHE_IMAGE_FILE_PATH}" ]
-then
-    # NOTE: The image being available is not the only precondition. If anything within any of the scripts has changed in comparison to the ones that created the docker image, we might have to rebuild the image.
-    ALPAKA_DOCKER_BUILD_REQUIRED=0
-fi
-
-# runtime and compile time options
-ALPAKA_DOCKER_ENV_LIST=()
-ALPAKA_DOCKER_ENV_LIST+=("--env" "CC=${CC}")
-ALPAKA_DOCKER_ENV_LIST+=("--env" "CXX=${CXX}")
-ALPAKA_DOCKER_ENV_LIST+=("--env" "TRAVIS_OS_NAME=${TRAVIS_OS_NAME}")
-ALPAKA_DOCKER_ENV_LIST+=("--env" "CMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}")
-ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CI_DOCKER_BASE_IMAGE_NAME=${ALPAKA_CI_DOCKER_BASE_IMAGE_NAME}")
-ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CI_ANALYSIS=${ALPAKA_CI_ANALYSIS}")
-ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CI_BOOST_BRANCH=${ALPAKA_CI_BOOST_BRANCH}")
-ALPAKA_DOCKER_ENV_LIST+=("--env" "BOOST_ROOT=${BOOST_ROOT}")
-ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CI_BOOST_LIB_DIR=${ALPAKA_CI_BOOST_LIB_DIR}")
-ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CI_CLANG_DIR=${ALPAKA_CI_CLANG_DIR}")
-if [ ! -z "${ALPAKA_CI_CLANG_VER+x}" ]
-then
-    ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CI_CLANG_VER=${ALPAKA_CI_CLANG_VER}")
-fi
-ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CI_STDLIB=${ALPAKA_CI_STDLIB}")
-if [ ! -z ${ALPAKA_CI_CLANG_LIBSTDCPP_VERSION+x} ]
-then
-    ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CI_CLANG_LIBSTDCPP_VERSION=${ALPAKA_CI_CLANG_LIBSTDCPP_VERSION}")
-fi
-ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CI_CMAKE_VER=${ALPAKA_CI_CMAKE_VER}")
-ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CI_CMAKE_DIR=${ALPAKA_CI_CMAKE_DIR}")
-if [ ! -z "${ALPAKA_CI_GCC_VER+x}" ]
-then
-    ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CI_GCC_VER=${ALPAKA_CI_GCC_VER}")
-fi
-if [ ! -z "${ALPAKA_CI_SANITIZERS+x}" ]
-then
-    ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CI_SANITIZERS=${ALPAKA_CI_SANITIZERS}")
-fi
-if [ ! -z "${ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLE+x}" ]
-then
-    ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLE=${ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLE}")
-fi
-if [ ! -z "${ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLE+x}" ]
-then
-    ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLE=${ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLE}")
-fi
-if [ ! -z "${ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE+x}" ]
-then
-    ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE=${ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE}")
-fi
-if [ ! -z "${ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE+x}" ]
-then
-    ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE=${ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE}")
-fi
-if [ ! -z "${ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE+x}" ]
-then
-    ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE=${ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE}")
-fi
-if [ ! -z "${ALPAKA_ACC_CPU_BT_OMP4_ENABLE+x}" ]
-then
-    ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_ACC_CPU_BT_OMP4_ENABLE=${ALPAKA_ACC_CPU_BT_OMP4_ENABLE}")
-fi
-if [ ! -z "${ALPAKA_ACC_GPU_CUDA_ENABLE+x}" ]
-then
-    ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_ACC_GPU_CUDA_ENABLE=${ALPAKA_ACC_GPU_CUDA_ENABLE}")
-fi
-if [ ! -z "${ALPAKA_ACC_GPU_HIP_ENABLE+x}" ]
-then
-    ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_ACC_GPU_HIP_ENABLE=${ALPAKA_ACC_GPU_HIP_ENABLE}")
-fi
-if [ ! -z "${ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLE+x}" ]
-then
-    ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLE=${ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLE}")
-fi
-ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CI_INSTALL_CUDA=${ALPAKA_CI_INSTALL_CUDA}")
-if [ "${ALPAKA_CI_INSTALL_CUDA}" == "ON" ]
-then
-    ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CI_CUDA_DIR=${ALPAKA_CI_CUDA_DIR}")
-    ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CUDA_VERSION=${ALPAKA_CUDA_VERSION}")
-    if [ ! -z "${ALPAKA_CUDA_COMPILER+x}" ]
-    then
-        ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CUDA_COMPILER=${ALPAKA_CUDA_COMPILER}")
-    fi
-fi
-ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CI_INSTALL_HIP=${ALPAKA_CI_INSTALL_HIP}")
-if [ "${ALPAKA_CI_INSTALL_HIP}" == "ON" ]
-then
-    ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CI_HIP_ROOT_DIR=${ALPAKA_CI_HIP_ROOT_DIR}")
-    ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CI_HIP_BRANCH=${ALPAKA_CI_HIP_BRANCH}")
-    ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_HIP_PLATFORM=${ALPAKA_HIP_PLATFORM}")
-fi
-ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CI_INSTALL_TBB=${ALPAKA_CI_INSTALL_TBB}")
-ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CI_INSTALL_FIBERS=${ALPAKA_CI_INSTALL_FIBERS}")
-
-if [ "${ALPAKA_DOCKER_BUILD_REQUIRED}" -eq 1 ]
-then
-  docker run -v "$(pwd)":"$(pwd)" -w "$(pwd)" "${ALPAKA_DOCKER_ENV_LIST[@]}" "${ALPAKA_CI_DOCKER_BASE_IMAGE_NAME}" /bin/bash ./script/travis/install.sh
-
-  ALPAKA_DOCKER_CONTAINER_NAME=$(docker ps -l -q)
-  docker commit "${ALPAKA_DOCKER_CONTAINER_NAME}" "${ALPAKA_CI_DOCKER_IMAGE_NAME}"
-
-  # delete the container and the base image to save disc space
-  docker stop "${ALPAKA_DOCKER_CONTAINER_NAME}"
-  docker rm "${ALPAKA_DOCKER_CONTAINER_NAME}"
-  docker rmi "${ALPAKA_CI_DOCKER_BASE_IMAGE_NAME}"
-
-  docker save "${ALPAKA_CI_DOCKER_IMAGE_NAME}" | gzip > "${ALPAKA_CI_DOCKER_CACHE_IMAGE_FILE_PATH}"
-
-  docker images
-fi
diff --git a/thirdParty/cupla/alpaka/script/travis/docker_run.sh b/thirdParty/cupla/alpaka/script/travis/docker_run.sh
deleted file mode 100755
index 59a2ec8575..0000000000
--- a/thirdParty/cupla/alpaka/script/travis/docker_run.sh
+++ /dev/null
@@ -1,162 +0,0 @@
-#!/bin/bash
-
-#
-# Copyright 2017-2019 Benjamin Worpitz
-#
-# This file is part of Alpaka.
-#
-# This Source Code Form is subject to the terms of the Mozilla Public
-# License, v. 2.0. If a copy of the MPL was not distributed with this
-# file, You can obtain one at http://mozilla.org/MPL/2.0/.
-#
-
-source ./script/travis/set.sh
-
-# runtime and compile time options
-ALPAKA_DOCKER_ENV_LIST=()
-ALPAKA_DOCKER_ENV_LIST+=("--env" "CC=${CC}")
-ALPAKA_DOCKER_ENV_LIST+=("--env" "CXX=${CXX}")
-ALPAKA_DOCKER_ENV_LIST+=("--env" "TRAVIS_OS_NAME=${TRAVIS_OS_NAME}")
-ALPAKA_DOCKER_ENV_LIST+=("--env" "CMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}")
-ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CI_ANALYSIS=${ALPAKA_CI_ANALYSIS}")
-ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CI_BOOST_BRANCH=${ALPAKA_CI_BOOST_BRANCH}")
-ALPAKA_DOCKER_ENV_LIST+=("--env" "BOOST_ROOT=${BOOST_ROOT}")
-ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CI_BOOST_LIB_DIR=${ALPAKA_CI_BOOST_LIB_DIR}")
-ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CI_CLANG_DIR=${ALPAKA_CI_CLANG_DIR}")
-if [ ! -z "${ALPAKA_CI_CLANG_VER+x}" ]
-then
-    ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CI_CLANG_VER=${ALPAKA_CI_CLANG_VER}")
-fi
-ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CI_STDLIB=${ALPAKA_CI_STDLIB}")
-if [ ! -z "${ALPAKA_CI_CLANG_LIBSTDCPP_VERSION+x}" ]
-then
-    ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CI_CLANG_LIBSTDCPP_VERSION=${ALPAKA_CI_CLANG_LIBSTDCPP_VERSION}")
-fi
-ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CI_CMAKE_VER=${ALPAKA_CI_CMAKE_VER}")
-ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CI_CMAKE_DIR=${ALPAKA_CI_CMAKE_DIR}")
-if [ ! -z "${ALPAKA_CI_GCC_VER+x}" ]
-then
-    ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CI_GCC_VER=${ALPAKA_CI_GCC_VER}")
-fi
-if [ ! -z "${ALPAKA_CI_SANITIZERS+x}" ]
-then
-    ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CI_SANITIZERS=${ALPAKA_CI_SANITIZERS}")
-fi
-if [ ! -z "${ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLE+x}" ]
-then
-    ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLE=${ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLE}")
-fi
-if [ ! -z "${ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLE+x}" ]
-then
-    ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLE=${ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLE}")
-fi
-if [ ! -z "${ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE+x}" ]
-then
-    ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE=${ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE}")
-fi
-if [ ! -z "${ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE+x}" ]
-then
-    ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE=${ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE}")
-fi
-if [ ! -z "${ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE+x}" ]
-then
-    ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE=${ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE}")
-fi
-if [ ! -z "${ALPAKA_ACC_CPU_BT_OMP4_ENABLE+x}" ]
-then
-    ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_ACC_CPU_BT_OMP4_ENABLE=${ALPAKA_ACC_CPU_BT_OMP4_ENABLE}")
-fi
-if [ ! -z "${ALPAKA_ACC_GPU_CUDA_ENABLE+x}" ]
-then
-    ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_ACC_GPU_CUDA_ENABLE=${ALPAKA_ACC_GPU_CUDA_ENABLE}")
-fi
-if [ ! -z "${ALPAKA_ACC_GPU_HIP_ENABLE+x}" ]
-then
-    ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_ACC_GPU_HIP_ENABLE=${ALPAKA_ACC_GPU_HIP_ENABLE}")
-fi
-if [ ! -z "${ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLE+x}" ]
-then
-    ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLE=${ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLE}")
-fi
-ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CI_INSTALL_CUDA=${ALPAKA_CI_INSTALL_CUDA}")
-if [ "${ALPAKA_CI_INSTALL_CUDA}" == "ON" ]
-then
-    ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CI_CUDA_DIR=${ALPAKA_CI_CUDA_DIR}")
-    ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CUDA_VERSION=${ALPAKA_CUDA_VERSION}")
-    if [ ! -z "${ALPAKA_CUDA_COMPILER+x}" ]
-    then
-        ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CUDA_COMPILER=${ALPAKA_CUDA_COMPILER}")
-    fi
-    if [ ! -z "${ALPAKA_CUDA_ARCH+x}" ]
-    then
-        ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CUDA_ARCH=${ALPAKA_CUDA_ARCH}")
-    fi
-fi
-ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CI_INSTALL_HIP=${ALPAKA_CI_INSTALL_HIP}")
-if [ "${ALPAKA_CI_INSTALL_HIP}" == "ON" ]
-then
-    ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CI_HIP_ROOT_DIR=${ALPAKA_CI_HIP_ROOT_DIR}")
-    ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_HIP_PLATFORM=${ALPAKA_HIP_PLATFORM}")
-fi
-
-# runtime only options
-ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CI=${ALPAKA_CI}")
-if [ ! -z "${ALPAKA_DEBUG+x}" ]
-then
-    ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_DEBUG=${ALPAKA_DEBUG}")
-fi
-if [ ! -z "${ALPAKA_CXX_STANDARD+x}" ]
-then
-    ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CXX_STANDARD=${ALPAKA_CXX_STANDARD}")
-fi
-if [ ! -z "${OMP_NUM_THREADS+x}" ]
-then
-    ALPAKA_DOCKER_ENV_LIST+=("--env" "OMP_NUM_THREADS=${OMP_NUM_THREADS}")
-fi
-if [ ! -z "${ALPAKA_ACC_GPU_CUDA_ONLY_MODE+x}" ]
-then
-    ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_ACC_GPU_CUDA_ONLY_MODE=${ALPAKA_ACC_GPU_CUDA_ONLY_MODE}")
-fi
-if [ ! -z "${ALPAKA_ACC_GPU_HIP_ONLY_MODE+x}" ]
-then
-    ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_ACC_GPU_HIP_ONLY_MODE=${ALPAKA_ACC_GPU_HIP_ONLY_MODE}")
-fi
-if [ ! -z "${ALPAKA_CUDA_FAST_MATH+x}" ]
-then
-    ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CUDA_FAST_MATH=${ALPAKA_CUDA_FAST_MATH}")
-fi
-if [ ! -z "${ALPAKA_CUDA_FTZ+x}" ]
-then
-    ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CUDA_FTZ=${ALPAKA_CUDA_FTZ}")
-fi
-if [ ! -z "${ALPAKA_CUDA_SHOW_REGISTER+x}" ]
-then
-    ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CUDA_SHOW_REGISTER=${ALPAKA_CUDA_SHOW_REGISTER}")
-fi
-if [ ! -z "${ALPAKA_CUDA_KEEP_FILES+x}" ]
-then
-    ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CUDA_KEEP_FILES=${ALPAKA_CUDA_KEEP_FILES}")
-fi
-if [ ! -z "${ALPAKA_CUDA_NVCC_EXPT_EXTENDED_LAMBDA+x}" ]
-then
-    ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CUDA_NVCC_EXPT_EXTENDED_LAMBDA=${ALPAKA_CUDA_NVCC_EXPT_EXTENDED_LAMBDA}")
-fi
-if [ ! -z "${ALPAKA_CUDA_NVCC_EXPT_RELAXED_CONSTEXPR+x}" ]
-then
-    ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CUDA_NVCC_EXPT_RELAXED_CONSTEXPR=${ALPAKA_CUDA_NVCC_EXPT_RELAXED_CONSTEXPR}")
-fi
-if [ ! -z "${ALPAKA_CUDA_NVCC_SEPARABLE_COMPILATION+x}" ]
-then
-    ALPAKA_DOCKER_ENV_LIST+=("--env" "ALPAKA_CUDA_NVCC_SEPARABLE_COMPILATION=${ALPAKA_CUDA_NVCC_SEPARABLE_COMPILATION}")
-fi
-
-docker images
-docker images -q ${ALPAKA_CI_DOCKER_IMAGE_NAME}
-
-# If we have created the image in the current run, we do not have to load it again, because it is already available.
-if [[ "$(docker images -q ${ALPAKA_CI_DOCKER_IMAGE_NAME} 2> /dev/null)" == "" ]]; then
-    gzip -dc "${ALPAKA_CI_DOCKER_CACHE_IMAGE_FILE_PATH}" | docker load
-fi
-
-# --cap-add SYS_PTRACE is required for LSAN to work
-docker run --cap-add SYS_PTRACE -v "$(pwd)":"$(pwd)" -w "$(pwd)" "${ALPAKA_DOCKER_ENV_LIST[@]}" --rm "${ALPAKA_CI_DOCKER_IMAGE_NAME}" /bin/bash ./script/travis/run.sh
diff --git a/thirdParty/cupla/alpaka/script/travis/install.sh b/thirdParty/cupla/alpaka/script/travis/install.sh
deleted file mode 100755
index 6115216845..0000000000
--- a/thirdParty/cupla/alpaka/script/travis/install.sh
+++ /dev/null
@@ -1,66 +0,0 @@
-#!/bin/bash
-
-#
-# Copyright 2017-2019 Benjamin Worpitz
-#
-# This file is part of Alpaka.
-#
-# This Source Code Form is subject to the terms of the Mozilla Public
-# License, v. 2.0. If a copy of the MPL was not distributed with this
-# file, You can obtain one at http://mozilla.org/MPL/2.0/.
-#
-
-source ./script/travis/travis_retry.sh
-
-source ./script/travis/set.sh
-
-: ${ALPAKA_CI_ANALYSIS?"ALPAKA_CI_ANALYSIS must be specified"}
-: ${ALPAKA_CI_INSTALL_CUDA?"ALPAKA_CI_INSTALL_CUDA must be specified"}
-: ${ALPAKA_CI_INSTALL_HIP?"ALPAKA_CI_INSTALL_HIP must be specified"}
-: ${ALPAKA_CI_INSTALL_TBB?"ALPAKA_CI_INSTALL_TBB must be specified"}
-
-if [ "$TRAVIS_OS_NAME" = "linux" ]
-then
-    travis_retry apt-get -y --quiet update
-    travis_retry apt-get -y install sudo
-
-    # software-properties-common: 'add-apt-repository' and certificates for wget https download
-    # binutils: ld
-    # xz-utils: xzcat
-    travis_retry sudo apt-get -y --quiet --allow-unauthenticated --no-install-recommends install software-properties-common wget git make binutils xz-utils
-fi
-
-if [ "$TRAVIS_OS_NAME" = "linux" ] || [ "$TRAVIS_OS_NAME" = "windows" ]
-then
-    ./script/travis/install_cmake.sh
-fi
-
-if [ "$TRAVIS_OS_NAME" = "linux" ]
-then
-    if [ "${ALPAKA_CI_ANALYSIS}" == "ON" ] ;then ./script/travis/install_analysis.sh ;fi
-fi
-
-# Install CUDA before installing gcc as it installs gcc-4.8 and overwrites our selected compiler
-if [ "${ALPAKA_CI_INSTALL_CUDA}" == "ON" ] ;then ./script/travis/install_cuda.sh ;fi
-
-if [ "$TRAVIS_OS_NAME" = "linux" ]
-then
-    if [ "${CXX}" == "g++" ] ;then ./script/travis/install_gcc.sh ;fi
-    if [ "${CXX}" == "clang++" ] ;then source ./script/travis/install_clang.sh ;fi
-    if [ "${ALPAKA_CI_INSTALL_HIP}" == "ON" ] ;then ./script/travis/install_hip.sh ;fi
-fi
-
-if [ "${ALPAKA_CI_INSTALL_TBB}" = "ON" ]
-then
-    ./script/travis/install_tbb.sh
-fi
-
-./script/travis/install_boost.sh
-
-if [ "$TRAVIS_OS_NAME" = "linux" ]
-then
-    # Minimize docker image size
-    sudo apt-get --quiet --purge autoremove
-    sudo apt-get clean
-    rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
-fi
diff --git a/thirdParty/cupla/alpaka/script/travis/install_analysis.sh b/thirdParty/cupla/alpaka/script/travis/install_analysis.sh
deleted file mode 100755
index adec4c9ca0..0000000000
--- a/thirdParty/cupla/alpaka/script/travis/install_analysis.sh
+++ /dev/null
@@ -1,25 +0,0 @@
-#!/bin/bash
-
-#
-# Copyright 2017-2019 Benjamin Worpitz
-#
-# This file is part of Alpaka.
-#
-# This Source Code Form is subject to the terms of the Mozilla Public
-# License, v. 2.0. If a copy of the MPL was not distributed with this
-# file, You can obtain one at http://mozilla.org/MPL/2.0/.
-#
-
-source ./script/travis/travis_retry.sh
-
-source ./script/travis/set.sh
-
-#-------------------------------------------------------------------------------
-# Install sloc
-travis_retry sudo apt-get -y --quiet --allow-unauthenticated --no-install-recommends install sloccount
-sloccount --version
-
-#-------------------------------------------------------------------------------
-# Install shellcheck
-travis_retry sudo apt-get -y --quiet --allow-unauthenticated --no-install-recommends install shellcheck
-shellcheck --version
diff --git a/thirdParty/cupla/alpaka/script/travis/install_boost.sh b/thirdParty/cupla/alpaka/script/travis/install_boost.sh
deleted file mode 100755
index 86b720452d..0000000000
--- a/thirdParty/cupla/alpaka/script/travis/install_boost.sh
+++ /dev/null
@@ -1,147 +0,0 @@
-#!/bin/bash
-
-#
-# Copyright 2017-2019 Benjamin Worpitz
-#
-# This file is part of Alpaka.
-#
-# This Source Code Form is subject to the terms of the Mozilla Public
-# License, v. 2.0. If a copy of the MPL was not distributed with this
-# file, You can obtain one at http://mozilla.org/MPL/2.0/.
-#
-
-source ./script/travis/set.sh
-
-: "${BOOST_ROOT?'BOOST_ROOT must be specified'}"
-: "${ALPAKA_CI_BOOST_LIB_DIR?'ALPAKA_CI_BOOST_LIB_DIR must be specified'}"
-if [ "$TRAVIS_OS_NAME" = "linux" ]
-then
-    : "${ALPAKA_CI_STDLIB?'ALPAKA_CI_STDLIB must be specified'}"
-fi
-: "${CMAKE_BUILD_TYPE?'CMAKE_BUILD_TYPE must be specified'}"
-: "${CXX?'CXX must be specified'}"
-: "${CC?'CC must be specified'}"
-: "${ALPAKA_CI_INSTALL_FIBERS?'ALPAKA_CI_INSTALL_FIBERS must be specified'}"
-
-git clone -b "${ALPAKA_CI_BOOST_BRANCH}" --quiet --recursive --single-branch --depth 1 https://github.com/boostorg/boost.git "${BOOST_ROOT}"
-
-# Bootstrap boost.
-if [ "$TRAVIS_OS_NAME" = "windows" ]
-then
-    (cd "${BOOST_ROOT}"; ./bootstrap.bat)
-else
-    (cd "${BOOST_ROOT}"; sudo ./bootstrap.sh --with-toolset="${CC}")
-fi
-(cd "${BOOST_ROOT}"; cat ./bootstrap.log)
-
-# Create file links.
-if [ "$TRAVIS_OS_NAME" = "windows" ]
-then
-    (cd "${BOOST_ROOT}"; ./b2 headers)
-else
-    (cd "${BOOST_ROOT}"; sudo ./b2 headers)
-fi
-
-# Only build boost if we need some of the non-header-only libraries
-if [ "${ALPAKA_CI_INSTALL_FIBERS}" == "ON" ]
-then
-    # Prepare the library destination directory.
-    mkdir -p "${ALPAKA_CI_BOOST_LIB_DIR}"
-
-    # Create the boost build command.
-    ALPAKA_BOOST_B2=""
-    ALPAKA_BOOST_B2_CFLAGS=""
-    ALPAKA_BOOST_B2_CXXFLAGS=""
-
-    if [ "$TRAVIS_OS_NAME" = "linux" ] || [ "$TRAVIS_OS_NAME" = "osx" ]
-    then
-        ALPAKA_BOOST_B2+="sudo "
-    fi
-    ALPAKA_BOOST_B2+="./b2 -j1"
-
-    if [ "$TRAVIS_OS_NAME" = "linux" ] || [ "$TRAVIS_OS_NAME" = "osx" ]
-    then
-        ALPAKA_BOOST_B2_CFLAGS+="-fPIC"
-        ALPAKA_BOOST_B2_CXXFLAGS+="-fPIC"
-    fi
-
-    if [ "$TRAVIS_OS_NAME" = "windows" ]
-    then
-        ALPAKA_BOOST_B2+=" --layout=versioned --toolset=msvc-14.1"
-    else
-        ALPAKA_BOOST_B2+=" --layout=tagged --toolset=${CC}"
-    fi
-
-    # TODO: Win32: adress-model=32
-    ALPAKA_BOOST_B2+=" architecture=x86 address-model=64 link=static threading=multi runtime-link=shared"
-
-    if [ "$TRAVIS_OS_NAME" = "windows" ]
-    then
-        ALPAKA_BOOST_B2+=" define=_CRT_NONSTDC_NO_DEPRECATE define=_CRT_SECURE_NO_DEPRECATE define=_SCL_SECURE_NO_DEPRECAT define=BOOST_USE_WINFIBERS define=_ENABLE_EXTENDED_ALIGNED_STORAGE"
-    fi
-
-    if [ "${CMAKE_BUILD_TYPE}" == "Debug" ]
-    then
-      ALPAKA_BOOST_B2+=" variant=debug"
-    else
-      ALPAKA_BOOST_B2+=" variant=release"
-    fi
-
-    # Clang is not supported by the FindBoost script.
-    # boost (especially old versions) produces too much warnings when using clang (newer versions) so that the 4 MiB log is too short.
-    if [ "${CXX}" == "clang++" ]
-    then
-        ALPAKA_BOOST_B2_CXXFLAGS+=" -Wunused-private-field -Wno-unused-local-typedef -Wno-c99-extensions -Wno-variadic-macros"
-    fi
-    # Select the libraries required.
-    # If the variable is not set, the backend will most probably be used by default so we install it.
-    if [ "${ALPAKA_CI_INSTALL_FIBERS}" == "ON" ]
-    then
-        if [ "$TRAVIS_OS_NAME" = "linux" ]
-        then
-            ALPAKA_BOOST_B2_CXXFLAGS+=" -std=c++11"
-        fi
-        ALPAKA_BOOST_B2+=" --with-fiber --with-context --with-thread --with-atomic --with-system --with-chrono --with-date_time"
-    fi
-    if [ "${ALPAKA_BOOST_B2_CFLAGS}" != "" ]
-    then
-        ALPAKA_BOOST_B2+=' cflags="'
-        ALPAKA_BOOST_B2+="${ALPAKA_BOOST_B2_CFLAGS}"
-        ALPAKA_BOOST_B2+='"'
-    fi
-    if [ "${ALPAKA_BOOST_B2_CXXFLAGS}" != "" ]
-    then
-        ALPAKA_BOOST_B2+=' cxxflags="'
-        ALPAKA_BOOST_B2+="${ALPAKA_BOOST_B2_CXXFLAGS}"
-        if [ "$TRAVIS_OS_NAME" = "linux" ]
-        then
-            if [ "${ALPAKA_CI_STDLIB}" == "libc++" ]
-            then
-                ALPAKA_BOOST_B2+=" -stdlib=libc++"
-            fi
-        fi
-        ALPAKA_BOOST_B2+='"'
-    fi
-
-    if [ "$TRAVIS_OS_NAME" = "linux" ]
-    then
-        if [ "${ALPAKA_CI_STDLIB}" == "libc++" ]
-        then
-            ALPAKA_BOOST_B2+=' linkflags="-stdlib=libc++"'
-        fi
-    fi
-
-    ALPAKA_BOOST_B2+=" --stagedir=${ALPAKA_CI_BOOST_LIB_DIR} stage"
-
-    # Build boost.
-    #echo "ALPAKA_BOOST_B2=${ALPAKA_BOOST_B2}"
-    (cd "${BOOST_ROOT}"; eval "${ALPAKA_BOOST_B2}")
-
-    # Clean the intermediate build files.
-    if [ "$TRAVIS_OS_NAME" = "windows" ]
-    then
-        rm -rf bin.v2
-    else
-        sudo rm -rf bin.v2
-    fi
-fi
diff --git a/thirdParty/cupla/alpaka/script/travis/install_clang.sh b/thirdParty/cupla/alpaka/script/travis/install_clang.sh
deleted file mode 100755
index c7b03de736..0000000000
--- a/thirdParty/cupla/alpaka/script/travis/install_clang.sh
+++ /dev/null
@@ -1,62 +0,0 @@
-#!/bin/bash
-
-#
-# Copyright 2017-2019 Benjamin Worpitz
-#
-# This file is part of Alpaka.
-#
-# This Source Code Form is subject to the terms of the Mozilla Public
-# License, v. 2.0. If a copy of the MPL was not distributed with this
-# file, You can obtain one at http://mozilla.org/MPL/2.0/.
-#
-
-source ./script/travis/travis_retry.sh
-
-source ./script/travis/set.sh
-
-: "${ALPAKA_CI_CLANG_DIR?'ALPAKA_CI_CLANG_DIR must be specified'}"
-: "${ALPAKA_CI_CLANG_VER?'ALPAKA_CI_CLANG_VER must be specified'}"
-: "${ALPAKA_CI_CLANG_LIBSTDCPP_VERSION?'ALPAKA_CI_CLANG_LIBSTDCPP_VERSION must be specified'}"
-: "${ALPAKA_CI_STDLIB?'ALPAKA_CI_STDLIB must be specified'}"
-: "${CXX?'CXX must be specified'}"
-
-if [ -z "$(ls -A "${ALPAKA_CI_CLANG_DIR}")" ]
-then
-    ALPAKA_CLANG_PKG_FILE_NAME=clang+llvm-${ALPAKA_CI_CLANG_VER}-x86_64-linux-gnu-ubuntu-16.04.tar.xz
-    travis_retry wget --no-verbose "http://llvm.org/releases/${ALPAKA_CI_CLANG_VER}/${ALPAKA_CLANG_PKG_FILE_NAME}"
-    mkdir -p "${ALPAKA_CI_CLANG_DIR}"
-    xzcat "${ALPAKA_CLANG_PKG_FILE_NAME}" | tar -xf - --strip 1 -C "${ALPAKA_CI_CLANG_DIR}"
-    sudo rm -rf "${ALPAKA_CLANG_PKG_FILE_NAME}"
-fi
-"${ALPAKA_CI_CLANG_DIR}/bin/llvm-config" --version
-export LLVM_CONFIG="${ALPAKA_CI_CLANG_DIR}/bin/llvm-config"
-
-travis_retry sudo add-apt-repository -y ppa:ubuntu-toolchain-r/test
-travis_retry sudo apt-get -y --quiet update
-
-travis_retry sudo apt-get -y --quiet --allow-unauthenticated --no-install-recommends install libstdc++-"${ALPAKA_CI_CLANG_LIBSTDCPP_VERSION}"-dev
-if [ "${ALPAKA_CI_STDLIB}" == "libc++" ]
-then
-    travis_retry sudo apt-get -y --quiet --allow-unauthenticated --no-install-recommends install libc++-dev
-    travis_retry sudo apt-get -y --quiet --allow-unauthenticated --no-install-recommends install libc++abi-dev
-fi
-travis_retry sudo apt-get -y --quiet --allow-unauthenticated --no-install-recommends install libiomp-dev
-sudo update-alternatives --install /usr/bin/clang clang "${ALPAKA_CI_CLANG_DIR}"/bin/clang 50
-sudo update-alternatives --install /usr/bin/clang++ clang++ "${ALPAKA_CI_CLANG_DIR}"/bin/clang++ 50
-sudo update-alternatives --install /usr/bin/cc cc "${ALPAKA_CI_CLANG_DIR}"/bin/clang 50
-sudo update-alternatives --install /usr/bin/c++ c++ "${ALPAKA_CI_CLANG_DIR}"/bin/clang++ 50
-# We have to prepend /usr/bin to the path because else the preinstalled clang from usr/bin/local/ is used.
-export PATH=${ALPAKA_CI_CLANG_DIR}/bin:${PATH}
-if [ -z ${LD_LIBRARY_PATH+x} ]
-then
-    LD_LIBRARY_PATH=
-fi
-export LD_LIBRARY_PATH=${ALPAKA_CI_CLANG_DIR}/lib:${LD_LIBRARY_PATH}
-if [ -z ${CPPFLAGS+x} ]
-then
-    CPPFLAGS=
-fi
-export CPPFLAGS="-I ${ALPAKA_CI_CLANG_DIR}/include/c++/v1 ${CPPFLAGS}"
-
-which "${CXX}"
-${CXX} -v
diff --git a/thirdParty/cupla/alpaka/script/travis/install_cmake.sh b/thirdParty/cupla/alpaka/script/travis/install_cmake.sh
deleted file mode 100755
index db218ee6ba..0000000000
--- a/thirdParty/cupla/alpaka/script/travis/install_cmake.sh
+++ /dev/null
@@ -1,41 +0,0 @@
-#!/bin/bash
-
-#
-# Copyright 2017-2019 Benjamin Worpitz
-#
-# This file is part of Alpaka.
-#
-# This Source Code Form is subject to the terms of the Mozilla Public
-# License, v. 2.0. If a copy of the MPL was not distributed with this
-# file, You can obtain one at http://mozilla.org/MPL/2.0/.
-#
-
-source ./script/travis/travis_retry.sh
-
-source ./script/travis/set.sh
-
-: "${ALPAKA_CI_CMAKE_DIR?'ALPAKA_CI_CMAKE_DIR must be specified'}"
-: "${ALPAKA_CI_CMAKE_VER?'ALPAKA_CI_CMAKE_VER must be specified'}"
-
-if [ "$TRAVIS_OS_NAME" = "linux" ]
-then
-    # Download the selected version.
-    if [ -z "$(ls -A ${ALPAKA_CI_CMAKE_DIR})" ]
-    then
-        ALPAKA_CI_CMAKE_VER_SEMANTIC=( ${ALPAKA_CI_CMAKE_VER//./ } )
-        ALPAKA_CI_CMAKE_VER_MAJOR="${ALPAKA_CI_CMAKE_VER_SEMANTIC[0]}"
-        ALPAKA_CI_CMAKE_VER_MINOR="${ALPAKA_CI_CMAKE_VER_SEMANTIC[1]}"
-
-        ALPAKA_CMAKE_PKG_FILE_NAME_BASE=cmake-${ALPAKA_CI_CMAKE_VER}-Linux-x86_64
-        ALPAKA_CMAKE_PKG_FILE_NAME=${ALPAKA_CMAKE_PKG_FILE_NAME_BASE}.tar.gz
-        travis_retry wget --no-verbose https://cmake.org/files/v"${ALPAKA_CI_CMAKE_VER_MAJOR}"."${ALPAKA_CI_CMAKE_VER_MINOR}"/"${ALPAKA_CMAKE_PKG_FILE_NAME}"
-        mkdir -p "${ALPAKA_CI_CMAKE_DIR}"
-        tar -xzf "${ALPAKA_CMAKE_PKG_FILE_NAME}" -C "${ALPAKA_CI_CMAKE_DIR}"
-        sudo cp -fR "${ALPAKA_CI_CMAKE_DIR}"/"${ALPAKA_CMAKE_PKG_FILE_NAME_BASE}"/* "${ALPAKA_CI_CMAKE_DIR}"
-        sudo rm -rf "${ALPAKA_CMAKE_PKG_FILE_NAME}" "${ALPAKA_CI_CMAKE_DIR}"/"${ALPAKA_CMAKE_PKG_FILE_NAME_BASE}"
-    fi
-elif [ "$TRAVIS_OS_NAME" = "windows" ]
-then
-    choco uninstall cmake.install
-    choco install cmake.install --version ${ALPAKA_CI_CMAKE_VER}
-fi
diff --git a/thirdParty/cupla/alpaka/script/travis/install_cuda.sh b/thirdParty/cupla/alpaka/script/travis/install_cuda.sh
deleted file mode 100755
index 31e095b2b5..0000000000
--- a/thirdParty/cupla/alpaka/script/travis/install_cuda.sh
+++ /dev/null
@@ -1,116 +0,0 @@
-#!/bin/bash
-
-#
-# Copyright 2017-2019 Benjamin Worpitz
-#
-# This file is part of Alpaka.
-#
-# This Source Code Form is subject to the terms of the Mozilla Public
-# License, v. 2.0. If a copy of the MPL was not distributed with this
-# file, You can obtain one at http://mozilla.org/MPL/2.0/.
-#
-
-source ./script/travis/travis_retry.sh
-
-source ./script/travis/set.sh
-
-: "${ALPAKA_CUDA_VERSION?'ALPAKA_CUDA_VERSION must be specified'}"
-
-if [ "$TRAVIS_OS_NAME" = "linux" ]
-then
-    : "${ALPAKA_CI_DOCKER_BASE_IMAGE_NAME?'ALPAKA_CI_DOCKER_BASE_IMAGE_NAME must be specified'}"
-    : "${ALPAKA_CI_CUDA_DIR?'ALPAKA_CI_CUDA_DIR must be specified'}"
-    : "${ALPAKA_CUDA_COMPILER?'ALPAKA_CUDA_COMPILER must be specified'}"
-
-    # Ubuntu 18.04 requires some extra keys for verification
-    if [[ "${ALPAKA_CI_DOCKER_BASE_IMAGE_NAME}" == *"18.04"* ]]
-    then
-        travis_retry sudo apt-get -y --quiet --allow-unauthenticated --no-install-recommends install dirmngr gpg-agent
-        travis_retry sudo apt-key adv --keyserver keyserver.ubuntu.com --recv-keys F60F4B3D7FA2AF80
-    fi
-
-    # Set the correct CUDA downloads
-    if [ "${ALPAKA_CUDA_VERSION}" == "8.0" ]
-    then
-        ALPAKA_CUDA_PKG_DEB_NAME=cuda-repo-ubuntu1404-8-0-local
-        ALPAKA_CUDA_PKG_FILE_NAME="${ALPAKA_CUDA_PKG_DEB_NAME}"_8.0.44-1_amd64-deb
-        ALPAKA_CUDA_PKG_FILE_PATH=https://developer.nvidia.com/compute/cuda/8.0/prod/local_installers/${ALPAKA_CUDA_PKG_FILE_NAME}
-    elif [ "${ALPAKA_CUDA_VERSION}" == "9.0" ]
-    then
-        ALPAKA_CUDA_PKG_DEB_NAME=cuda-repo-ubuntu1604-9-0-local
-        ALPAKA_CUDA_PKG_FILE_NAME="${ALPAKA_CUDA_PKG_DEB_NAME}"_9.0.176-1_amd64-deb
-        ALPAKA_CUDA_PKG_FILE_PATH=https://developer.nvidia.com/compute/cuda/9.0/Prod/local_installers/${ALPAKA_CUDA_PKG_FILE_NAME}
-    elif [ "${ALPAKA_CUDA_VERSION}" == "9.1" ]
-    then
-        ALPAKA_CUDA_PKG_DEB_NAME=cuda-repo-ubuntu1604-9-1-local
-        ALPAKA_CUDA_PKG_FILE_NAME="${ALPAKA_CUDA_PKG_DEB_NAME}"_9.1.85-1_amd64
-        ALPAKA_CUDA_PKG_FILE_PATH=https://developer.nvidia.com/compute/cuda/9.1/Prod/local_installers/${ALPAKA_CUDA_PKG_FILE_NAME}
-    elif [ "${ALPAKA_CUDA_VERSION}" == "9.2" ]
-    then
-        ALPAKA_CUDA_PKG_DEB_NAME=cuda-repo-ubuntu1604-9-2-local
-        ALPAKA_CUDA_PKG_FILE_NAME="${ALPAKA_CUDA_PKG_DEB_NAME}"_9.2.88-1_amd64
-        ALPAKA_CUDA_PKG_FILE_PATH=https://developer.nvidia.com/compute/cuda/9.2/Prod/local_installers/${ALPAKA_CUDA_PKG_FILE_NAME}
-    elif [ "${ALPAKA_CUDA_VERSION}" == "10.0" ]
-    then
-        ALPAKA_CUDA_PKG_DEB_NAME=cuda-repo-ubuntu1804-10-0-local
-        ALPAKA_CUDA_PKG_FILE_NAME="${ALPAKA_CUDA_PKG_DEB_NAME}"-10.0.130-410.48_1.0-1_amd64
-        ALPAKA_CUDA_PKG_FILE_PATH=https://developer.nvidia.com/compute/cuda/10.0/Prod/local_installers/${ALPAKA_CUDA_PKG_FILE_NAME}
-    elif [ "${ALPAKA_CUDA_VERSION}" == "10.1" ]
-    then
-        ALPAKA_CUDA_PKG_DEB_NAME=cuda-repo-ubuntu1804-10-1-local
-        ALPAKA_CUDA_PKG_FILE_NAME="${ALPAKA_CUDA_PKG_DEB_NAME}"-10.1.168-418.67_1.0-1_amd64.deb
-        ALPAKA_CUDA_PKG_FILE_PATH=https://developer.nvidia.com/compute/cuda/10.1/Prod/local_installers/${ALPAKA_CUDA_PKG_FILE_NAME}
-    elif [ "${ALPAKA_CUDA_VERSION}" == "10.2" ]
-    then
-        ALPAKA_CUDA_PKG_DEB_NAME=cuda-repo-ubuntu1804-10-2-local
-        ALPAKA_CUDA_PKG_FILE_NAME="${ALPAKA_CUDA_PKG_DEB_NAME}"-10.2.89-440.33.01_1.0-1_amd64.deb
-        ALPAKA_CUDA_PKG_FILE_PATH=http://developer.download.nvidia.com/compute/cuda/10.2/Prod/local_installers/${ALPAKA_CUDA_PKG_FILE_NAME}
-    else
-        echo CUDA versions other than 8.0, 9.0, 9.1, 9.2, 10.0, 10.1 and 10.2 are not currently supported on linux!
-    fi
-    if [ -z "$(ls -A ${ALPAKA_CI_CUDA_DIR})" ]
-    then
-        mkdir -p "${ALPAKA_CI_CUDA_DIR}"
-        travis_retry wget --no-verbose -O "${ALPAKA_CI_CUDA_DIR}"/"${ALPAKA_CUDA_PKG_FILE_NAME}" "${ALPAKA_CUDA_PKG_FILE_PATH}"
-    fi
-    sudo dpkg --install "${ALPAKA_CI_CUDA_DIR}"/"${ALPAKA_CUDA_PKG_FILE_NAME}"
-
-    travis_retry sudo apt-get -y --quiet update
-
-    # Install CUDA
-    # Currently we do not install CUDA fully: sudo apt-get --quiet -y install cuda
-    # We only install the minimal packages. Because of our manual partial installation we have to create a symlink at /usr/local/cuda
-    sudo apt-get -y --quiet --allow-unauthenticated --no-install-recommends install cuda-core-"${ALPAKA_CUDA_VERSION}" cuda-cudart-"${ALPAKA_CUDA_VERSION}" cuda-cudart-dev-"${ALPAKA_CUDA_VERSION}" cuda-curand-"${ALPAKA_CUDA_VERSION}" cuda-curand-dev-"${ALPAKA_CUDA_VERSION}"
-    sudo ln -s /usr/local/cuda-"${ALPAKA_CUDA_VERSION}" /usr/local/cuda
-
-    if [ "${ALPAKA_CUDA_COMPILER}" == "clang" ]
-    then
-        travis_retry sudo apt-get -y --quiet --allow-unauthenticated --no-install-recommends install g++-multilib
-    fi
-
-    # clean up
-    sudo rm -rf "${ALPAKA_CI_CUDA_DIR}"/"${ALPAKA_CUDA_PKG_FILE_NAME}"
-    sudo dpkg --purge "${ALPAKA_CUDA_PKG_DEB_NAME}"
-elif [ "$TRAVIS_OS_NAME" = "windows" ]
-then
-    if [ "${ALPAKA_CUDA_VERSION}" == "10.0" ]
-    then
-        ALPAKA_CUDA_PKG_FILE_NAME=cuda_10.0.130_411.31_win10
-        ALPAKA_CUDA_PKG_FILE_PATH=https://developer.nvidia.com/compute/cuda/10.0/Prod/local_installers/${ALPAKA_CUDA_PKG_FILE_NAME}
-    elif [ "${ALPAKA_CUDA_VERSION}" == "10.1" ]
-    then
-        ALPAKA_CUDA_PKG_FILE_NAME=cuda_10.1.168_425.25_win10.exe
-        ALPAKA_CUDA_PKG_FILE_PATH=https://developer.nvidia.com/compute/cuda/10.1/Prod/local_installers/${ALPAKA_CUDA_PKG_FILE_NAME}
-    elif [ "${ALPAKA_CUDA_VERSION}" == "10.2" ]
-    then
-        ALPAKA_CUDA_PKG_FILE_NAME=cuda_10.2.89_441.22_win10.exe
-        ALPAKA_CUDA_PKG_FILE_PATH=http://developer.download.nvidia.com/compute/cuda/10.2/Prod/local_installers/${ALPAKA_CUDA_PKG_FILE_NAME}
-    else
-        echo CUDA versions other than 10.0, 10.1 and 10.2 are not currently supported on Windows!
-    fi
-
-    curl -L -o cuda_installer.exe ${ALPAKA_CUDA_PKG_FILE_PATH}
-    ./cuda_installer.exe -s "nvcc_${ALPAKA_CUDA_VERSION}" "curand_dev_${ALPAKA_CUDA_VERSION}"
-    # Deleting the installer worked until 08/2019 but something changed and this line now takes up to 25 minutes.
-    #rm -f cuda_installer.exe
-fi
diff --git a/thirdParty/cupla/alpaka/script/travis/install_gcc.sh b/thirdParty/cupla/alpaka/script/travis/install_gcc.sh
deleted file mode 100755
index 74e7c28262..0000000000
--- a/thirdParty/cupla/alpaka/script/travis/install_gcc.sh
+++ /dev/null
@@ -1,33 +0,0 @@
-#!/bin/bash
-
-#
-# Copyright 2017-2019 Benjamin Worpitz
-#
-# This file is part of Alpaka.
-#
-# This Source Code Form is subject to the terms of the Mozilla Public
-# License, v. 2.0. If a copy of the MPL was not distributed with this
-# file, You can obtain one at http://mozilla.org/MPL/2.0/.
-#
-
-source ./script/travis/travis_retry.sh
-
-source ./script/travis/set.sh
-
-: "${ALPAKA_CI_GCC_VER?'ALPAKA_CI_GCC_VER must be specified'}"
-: "${ALPAKA_CI_SANITIZERS?'ALPAKA_CI_SANITIZERS must be specified'}"
-: "${CXX?'CXX must be specified'}"
-
-travis_retry sudo add-apt-repository -y ppa:ubuntu-toolchain-r/test
-travis_retry sudo apt-get -y --quiet update
-
-travis_retry sudo apt-get -y --quiet --allow-unauthenticated --no-install-recommends install g++-"${ALPAKA_CI_GCC_VER}"
-sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-"${ALPAKA_CI_GCC_VER}" 50
-sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-"${ALPAKA_CI_GCC_VER}" 50
-if [[ "${ALPAKA_CI_SANITIZERS}" == *"TSan"* ]]
-then
-    travis_retry sudo apt-get -y --quiet --allow-unauthenticated --no-install-recommends install libtsan0
-fi
-
-which "${CXX}"
-${CXX} -v
diff --git a/thirdParty/cupla/alpaka/script/travis/install_hip.sh b/thirdParty/cupla/alpaka/script/travis/install_hip.sh
deleted file mode 100755
index 2eb91e6b21..0000000000
--- a/thirdParty/cupla/alpaka/script/travis/install_hip.sh
+++ /dev/null
@@ -1,41 +0,0 @@
-#!/bin/bash
-
-#
-# Copyright 2018-2019 Benjamin Worpitz
-#
-# This file is part of Alpaka.
-#
-# This Source Code Form is subject to the terms of the Mozilla Public
-# License, v. 2.0. If a copy of the MPL was not distributed with this
-# file, You can obtain one at http://mozilla.org/MPL/2.0/.
-#
-
-source ./script/travis/set.sh
-
-: "${ALPAKA_CI_HIP_ROOT_DIR?'ALPAKA_CI_HIP_ROOT_DIR must be specified'}"
-: "${ALPAKA_CI_HIP_BRANCH?'ALPAKA_CI_HIP_BRANCH must be specified'}"
-: "${CMAKE_BUILD_TYPE?'CMAKE_BUILD_TYPE must be specified'}"
-: "${CXX?'CXX must be specified'}"
-: "${CC?'CC must be specified'}"
-: "${ALPAKA_CI_CMAKE_DIR?'ALPAKA_CI_CMAKE_DIR must be specified'}"
-
-# CMake
-export PATH=${ALPAKA_CI_CMAKE_DIR}/bin:${PATH}
-cmake --version
-
-HIP_SOURCE_DIR=${ALPAKA_CI_HIP_ROOT_DIR}/source-hip/
-
-git clone -b "${ALPAKA_CI_HIP_BRANCH}" --quiet --recursive --single-branch https://github.com/ROCm-Developer-Tools/HIP.git "${HIP_SOURCE_DIR}"
-(cd "${HIP_SOURCE_DIR}"; mkdir -p build; cd build; cmake -DCMAKE_BUILD_TYPE="${CMAKE_BUILD_TYPE}" -DCMAKE_INSTALL_PREFIX="${ALPAKA_CI_HIP_ROOT_DIR}" -DBUILD_TESTING=OFF .. && make && make install)
-
-
-## rocRAND
-export HIP_PLATFORM=nvcc
-export HIP_RUNTIME=nvcc
-export ROCRAND_SOURCE_DIR=${ALPAKA_CI_HIP_ROOT_DIR}/source-rocrand/
-if [ ! -d "${ROCRAND_SOURCE_DIR}" ]
-then
-    # install it into the HIP install dir
-    git clone --quiet --recursive https://github.com/ROCmSoftwarePlatform/rocRAND "${ROCRAND_SOURCE_DIR}"
-    (cd "${ROCRAND_SOURCE_DIR}"; mkdir -p build; cd build; cmake -DCMAKE_BUILD_TYPE="${CMAKE_BUILD_TYPE}" -DCMAKE_INSTALL_PREFIX="${ALPAKA_CI_HIP_ROOT_DIR}" -DBUILD_BENCHMARK=OFF -DBUILD_TEST=OFF -DNVGPU_TARGETS="30" -DCMAKE_MODULE_PATH="${ALPAKA_CI_HIP_ROOT_DIR}/cmake" -DHIP_PLATFORM="${HIP_PLATFORM}" .. && make && make install)
-fi
diff --git a/thirdParty/cupla/alpaka/script/travis/install_tbb.sh b/thirdParty/cupla/alpaka/script/travis/install_tbb.sh
deleted file mode 100755
index 420cb08c33..0000000000
--- a/thirdParty/cupla/alpaka/script/travis/install_tbb.sh
+++ /dev/null
@@ -1,37 +0,0 @@
-#!/bin/bash
-
-#
-# Copyright 2017-2019 Benjamin Worpitz
-#
-# This file is part of Alpaka.
-#
-# This Source Code Form is subject to the terms of the Mozilla Public
-# License, v. 2.0. If a copy of the MPL was not distributed with this
-# file, You can obtain one at http://mozilla.org/MPL/2.0/.
-#
-
-source ./script/travis/travis_retry.sh
-
-source ./script/travis/set.sh
-
-# Install TBB
-if [ "$TRAVIS_OS_NAME" = "linux" ]
-then
-    travis_retry sudo apt-get -y --quiet --allow-unauthenticated --no-install-recommends install libtbb-dev
-elif [ "$TRAVIS_OS_NAME" = "osx" ]
-then
-    brew unlink python@2
-    brew install tbb
-elif [ "$TRAVIS_OS_NAME" = "windows" ]
-then
-    TBB_ARCHIVE_VER="tbb44_20160526oss"
-    TBB_DOWNLOAD_URL="https://github.com/intel/tbb/releases/download/4.4.5/${TBB_ARCHIVE_VER}_win.zip"
-    TBB_DST_PATH="tbb.zip"
-    powershell.exe -Command '[Net.ServicePointManager]::SecurityProtocol = [Net.SecurityProtocolType]::Tls12 ; Invoke-WebRequest "'${TBB_DOWNLOAD_URL}'" -OutFile "'${TBB_DST_PATH}'"'
-    mkdir "${TBB_ROOT_DIR}"
-    unzip -q "${TBB_DST_PATH}" -d "${TBB_ROOT_DIR}"
-    rm "${TBB_DST_PATH}"
-    TBB_UNZIP_DIR="${TBB_ROOT_DIR}/${TBB_ARCHIVE_VER}"
-    mv ${TBB_UNZIP_DIR}/* "${TBB_ROOT_DIR}/"
-    rmdir "${TBB_UNZIP_DIR}"
-fi
diff --git a/thirdParty/cupla/alpaka/script/travis/prepare_sanitizers.sh b/thirdParty/cupla/alpaka/script/travis/prepare_sanitizers.sh
deleted file mode 100755
index 69832e7dd9..0000000000
--- a/thirdParty/cupla/alpaka/script/travis/prepare_sanitizers.sh
+++ /dev/null
@@ -1,129 +0,0 @@
-#!/bin/bash
-
-#
-# Copyright 2017-2019 Benjamin Worpitz
-#
-# This file is part of Alpaka.
-#
-# This Source Code Form is subject to the terms of the Mozilla Public
-# License, v. 2.0. If a copy of the MPL was not distributed with this
-# file, You can obtain one at http://mozilla.org/MPL/2.0/.
-#
-
-source ./script/travis/set.sh
-
-#-------------------------------------------------------------------------------
-# Exports the CMAKE_CXX_FLAGS and CMAKE_EXE_LINKER_FLAGS to enable the sanitizers listed in ALPAKA_CI_SANITIZERS.
-if [ -z "${CMAKE_CXX_FLAGS+x}" ]
-then
-    export CMAKE_CXX_FLAGS=
-fi
-if [ -z "${CMAKE_EXE_LINKER_FLAGS+x}" ]
-then
-    export CMAKE_EXE_LINKER_FLAGS=
-fi
-if [ -z "${ASAN_OPTIONS+x}" ]
-then
-    export ASAN_OPTIONS=
-fi
-if [ -z "${LSAN_OPTIONS+x}" ]
-then
-    export LSAN_OPTIONS=
-fi
-
-#-------------------------------------------------------------------------------
-# sanitizers
-# General sanitizer settings
-if [[ "${ALPAKA_CI_SANITIZERS}" != "" ]]
-then
-    # - to get nicer stack-traces:
-    CMAKE_CXX_FLAGS="${CMAKE_CXX_FLAGS} -fno-omit-frame-pointer"
-    # - to get perfect stack-traces:
-    CMAKE_CXX_FLAGS="${CMAKE_CXX_FLAGS} -fno-optimize-sibling-calls"
-
-    # g++ needs to use a different linker
-    if [[ "${CXX}" == "g++" ]]
-    then
-        CMAKE_EXE_LINKER_FLAGS="${CMAKE_EXE_LINKER_FLAGS} -fuse-ld=gold"
-    fi
-
-    # UBSan - http://clang.llvm.org/docs/UndefinedBehaviorSanitizer.html
-    if [[ "${ALPAKA_CI_SANITIZERS}" == *"UBSan"* ]]
-    then
-        CMAKE_CXX_FLAGS="${CMAKE_CXX_FLAGS} -fsanitize=undefined"
-
-        if [[ "${CXX}" == "clang++" ]]
-        then
-            CMAKE_CXX_FLAGS="${CMAKE_CXX_FLAGS} -fsanitize-blacklist=$(pwd)/test/sanitizer_ubsan_blacklist.txt"
-
-            # Previously 'local-bounds' was part of UBsan but has been removed because it is not a pure front-end check
-            CMAKE_CXX_FLAGS="${CMAKE_CXX_FLAGS} -fsanitize=local-bounds"
-            # 'unsigned-integer-overflow' is not really undefined behaviour but we want to handle it as such for our tests.
-            CMAKE_CXX_FLAGS="${CMAKE_CXX_FLAGS} -fsanitize=unsigned-integer-overflow"
-        fi
-    fi
-
-    # ASan - http://clang.llvm.org/docs/AddressSanitizer.html
-    if [[ "${ALPAKA_CI_SANITIZERS}" == *"ASan"* ]]
-    then
-        if ( [[ "${ALPAKA_CI_SANITIZERS}" == *"TSan"* ]] || [[ "${ALPAKA_CI_SANITIZERS}" == *"MSan"* ]] )
-        then
-            echo ASan is not supported in combination with TSan or MSan
-            exit 1
-        fi
-
-        if ( [ "${ALPAKA_ACC_GPU_CUDA_ENABLE}" == "ON" ] && [ "${ALPAKA_CUDA_COMPILER}" == "clang" ] )
-        then
-            # fatal error: error in backend: Module has a nontrivial global ctor, which NVPTX does not support.
-            # clang-3.9: error: clang frontend command failed with exit code 70 (use -v to see invocation)
-            echo ASan is not supported in combination with clang used as CUDA compiler
-            exit 1
-        fi
-
-        CMAKE_CXX_FLAGS="${CMAKE_CXX_FLAGS} -fsanitize=address"
-
-        if [[ "${CXX}" != "clang++" ]]
-        then
-            CMAKE_CXX_FLAGS="${CMAKE_CXX_FLAGS} -fsanitize-address-use-after-scope"
-        fi
-
-        ASAN_OPTIONS="strict_string_checks=1:detect_stack_use_after_return=1:check_initialization_order=1:strict_init_order=1"
-        LSAN_OPTIONS="print_suppressions=1:suppressions=$(pwd)/test/sanitizer_lsan_blacklist.txt"
-    fi
-
-    # TSan - http://clang.llvm.org/docs/ThreadSanitizer.html
-    # TSan requires PositionIndependentCode -pie;-fPIE;-fPIC. clang sets this automatically, gcc not.
-    # All base libraries (e.g. boost) have to be build with this flag.
-    # Furthermore, by installing gcc, libtsan0 is not automatically installed.
-    if [[ "${ALPAKA_CI_SANITIZERS}" == *"TSan"* ]]
-    then
-        if ( [[ "${ALPAKA_CI_SANITIZERS}" == *"ASan"* ]] || [[ "${ALPAKA_CI_SANITIZERS}" == *"MSan"* ]] )
-        then
-            echo TSan is not supported in combination with ASan or MSan
-            exit 1
-        fi
-
-        CMAKE_CXX_FLAGS="${CMAKE_CXX_FLAGS} -fsanitize=thread"
-        if [ "${CXX}" == "g++" ]
-        then
-            CMAKE_CXX_FLAGS="${CMAKE_CXX_FLAGS} -pie -fPIE"
-            CMAKE_EXE_LINKER_FLAGS="${CMAKE_EXE_LINKER_FLAGS} -ltsan"
-        fi
-    fi
-
-    # MSan - http://clang.llvm.org/docs/MemorySanitizer.html
-    # NOTE: Currently we can not enable this for CI as this finds some 'use-of-uninitialized-value' inside:
-    #   - boost`s smart pointers used by the unit test framework
-    #   - alpaka/test/integ/mandelbrot/src/main.cpp:450:9 std::replace
-    #   - alpaka/include/alpaka/kernel/TaskKernelCpuThreads.hpp:307:21 used alpaka/include/alpaka/idx/bt/IdxBtRefThreadIdMap.hpp:130:44
-    if [[ "${ALPAKA_CI_SANITIZERS}" == *"MSan"* ]]
-    then
-        if ( [[ "${ALPAKA_CI_SANITIZERS}" == *"ASan"* ]] || [[ "${ALPAKA_CI_SANITIZERS}" == *"TSan"* ]] )
-        then
-            echo MSan is not supported in combination with ASan or TSan
-            exit 1
-        fi
-
-        CMAKE_CXX_FLAGS="${CMAKE_CXX_FLAGS} -fsanitize=memory -fsanitize-memory-track-origins"
-    fi
-fi
diff --git a/thirdParty/cupla/alpaka/script/travis/print_travisEnv.sh b/thirdParty/cupla/alpaka/script/travis/print_travisEnv.sh
deleted file mode 100755
index 47fe9ea9dd..0000000000
--- a/thirdParty/cupla/alpaka/script/travis/print_travisEnv.sh
+++ /dev/null
@@ -1,44 +0,0 @@
-#!/bin/bash
-
-#
-# Copyright 2017-2019 Benjamin Worpitz
-#
-# This file is part of Alpaka.
-#
-# This Source Code Form is subject to the terms of the Mozilla Public
-# License, v. 2.0. If a copy of the MPL was not distributed with this
-# file, You can obtain one at http://mozilla.org/MPL/2.0/.
-#
-
-source ./script/travis/set.sh
-
-#-------------------------------------------------------------------------------
-# Print the travis environment variables: http://docs.travis-ci.com/user/ci-environment/
-echo TRAVIS_BRANCH: "${TRAVIS_BRANCH}"
-echo TRAVIS_BUILD_DIR: "${TRAVIS_BUILD_DIR}"
-echo TRAVIS_BUILD_ID: "${TRAVIS_BUILD_ID}"
-echo TRAVIS_BUILD_NUMBER: "${TRAVIS_BUILD_NUMBER}"
-echo TRAVIS_COMMIT: "${TRAVIS_COMMIT}"
-echo TRAVIS_COMMIT_RANGE: "${TRAVIS_COMMIT_RANGE}"
-echo TRAVIS_JOB_ID: "${TRAVIS_JOB_ID}"
-echo TRAVIS_JOB_NUMBER: "${TRAVIS_JOB_NUMBER}"
-echo TRAVIS_PULL_REQUEST: "${TRAVIS_PULL_REQUEST}"
-echo TRAVIS_SECURE_ENV_VARS: "${TRAVIS_SECURE_ENV_VARS}"
-echo TRAVIS_REPO_SLUG: "${TRAVIS_REPO_SLUG}"
-echo TRAVIS_OS_NAME: "${TRAVIS_OS_NAME}"
-echo TRAVIS_TAG: "${TRAVIS_TAG}"
-
-if [ "$TRAVIS_OS_NAME" = "linux" ]
-then
-    # Show all running services
-    sudo service --status-all
-
-    # Stop some unnecessary services to save memory
-    sudo /etc/init.d/mysql stop
-    sudo /etc/init.d/postgresql stop
-    sudo /etc/init.d/redis-server stop
-
-    # Show memory stats
-    sudo smem
-    sudo free -m -t
-fi
diff --git a/thirdParty/cupla/alpaka/script/travis/run.sh b/thirdParty/cupla/alpaka/script/travis/run.sh
deleted file mode 100755
index 58a0159486..0000000000
--- a/thirdParty/cupla/alpaka/script/travis/run.sh
+++ /dev/null
@@ -1,152 +0,0 @@
-#!/bin/bash
-
-#
-# Copyright 2017-2019 Benjamin Worpitz
-#
-# This file is part of Alpaka.
-#
-# This Source Code Form is subject to the terms of the Mozilla Public
-# License, v. 2.0. If a copy of the MPL was not distributed with this
-# file, You can obtain one at http://mozilla.org/MPL/2.0/.
-#
-
-source ./script/travis/set.sh
-
-: "${ALPAKA_CI_CMAKE_DIR?'ALPAKA_CI_CMAKE_DIR must be specified'}"
-echo "ALPAKA_CI_CMAKE_DIR: ${ALPAKA_CI_CMAKE_DIR}"
-: "${ALPAKA_CI_ANALYSIS?'ALPAKA_CI_ANALYSIS must be specified'}"
-echo "ALPAKA_CI_ANALYSIS: ${ALPAKA_CI_ANALYSIS}"
-: "${ALPAKA_CI_INSTALL_CUDA?'ALPAKA_CI_INSTALL_CUDA must be specified'}"
-: "${ALPAKA_CI_INSTALL_HIP?'ALPAKA_CI_INSTALL_HIP must be specified'}"
-if [ "$TRAVIS_OS_NAME" = "linux" ]
-then
-    : "${ALPAKA_CI_STDLIB?'ALPAKA_CI_STDLIB must be specified'}"
-    echo "ALPAKA_CI_STDLIB: ${ALPAKA_CI_STDLIB}"
-fi
-: "${CXX?'CXX must be specified'}"
-echo "CXX: ${CXX}"
-
-
-if [ "$TRAVIS_OS_NAME" = "linux" ]
-then
-    if [ -z "${LD_LIBRARY_PATH+x}" ]
-    then
-        LD_LIBRARY_PATH=
-    fi
-fi
-
-# CMake
-if [ "$TRAVIS_OS_NAME" = "linux" ]
-then
-    export PATH=${ALPAKA_CI_CMAKE_DIR}/bin:${PATH}
-fi
-cmake --version
-
-#TBB
-if [ "$TRAVIS_OS_NAME" = "windows" ]
-then
-    #ALPAKA_TBB_BIN_DIR="${TBB_ROOT_DIR}/bin/ia32/vc14"
-    ALPAKA_TBB_BIN_DIR="${TBB_ROOT_DIR}/bin/intel64/vc14"
-    export PATH=${PATH}:"${ALPAKA_TBB_BIN_DIR}"
-fi
-
-# CUDA
-if [ "${ALPAKA_CI_INSTALL_CUDA}" == "ON" ]
-then
-    : "${ALPAKA_CUDA_VERSION?'ALPAKA_CUDA_VERSION must be specified'}"
-
-    if [ "$TRAVIS_OS_NAME" = "linux" ]
-    then
-        # CUDA
-        export PATH=/usr/local/cuda-${ALPAKA_CUDA_VERSION}/bin:$PATH
-        export LD_LIBRARY_PATH=/usr/local/cuda-${ALPAKA_CUDA_VERSION}/lib64:${LD_LIBRARY_PATH}
-        # We have to explicitly add the stub libcuda.so to CUDA_LIB_PATH because the real one would be installed by the driver (which we can not install).
-        export CUDA_LIB_PATH=/usr/local/cuda/lib64/stubs/
-
-        if [ "${ALPAKA_CUDA_COMPILER}" == "nvcc" ]
-        then
-            which nvcc
-            nvcc -V
-        fi
-    elif [ "$TRAVIS_OS_NAME" = "windows" ]
-    then
-        export PATH="C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v${ALPAKA_CUDA_VERSION}\bin":$PATH
-        export CUDA_PATH="C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v${ALPAKA_CUDA_VERSION}"
-    fi
-fi
-
-# HIP
-if [ "${ALPAKA_CI_INSTALL_HIP}" == "ON" ]
-then
-: "${ALPAKA_CI_HIP_ROOT_DIR?'ALPAKA_CI_HIP_ROOT_DIR must be specified'}"
-
-    # HIP
-    # HIP_PATH required by HIP tools
-    export HIP_PATH=${ALPAKA_CI_HIP_ROOT_DIR}
-    # CUDA_PATH required by HIP tools
-    if [ -n "$(command -v nvcc)" ]
-    then
-        export CUDA_PATH=$(dirname $(which nvcc))/../
-    else
-        export CUDA_PATH=/usr/local/cuda-${ALPAKA_CUDA_VERSION}
-    fi
-
-    export PATH=${HIP_PATH}/bin:$PATH
-    export LD_LIBRARY_PATH=${HIP_PATH}/lib64:${HIP_PATH}/hiprand/lib:${LD_LIBRARY_PATH}
-    export CMAKE_PREFIX_PATH=${HIP_PATH}:${HIP_PATH}/hiprand:${CMAKE_PREFIX_PATH:-}
-    # to avoid "use of uninitialized value .." warnings in perl script hipcc
-    # TODO: rely on CI vars for platform and architecture
-    export HIP_PLATFORM=nvcc
-    export HIP_RUNTIME=nvcc
-    # calls nvcc or hcc
-    which hipcc
-    hipcc -V
-    which hipconfig
-    hipconfig --platform
-    hipconfig -v
-    # print newline as previous command does not do this
-    echo
-
-fi
-
-# clang
-if [ "${CXX}" == "clang++" ]
-then
-    # We have to prepend /usr/bin to the path because else the preinstalled clang from usr/bin/local/ is used.
-    export PATH=${ALPAKA_CI_CLANG_DIR}/bin:${PATH}
-    export LD_LIBRARY_PATH=${ALPAKA_CI_CLANG_DIR}/lib:${LD_LIBRARY_PATH}
-    if [ -z "${CPPFLAGS+x}" ]
-    then
-        CPPFLAGS=
-    fi
-    export CPPFLAGS="-I ${ALPAKA_CI_CLANG_DIR}/include/c++/v1 ${CPPFLAGS}"
-fi
-
-# stdlib
-if [ "$TRAVIS_OS_NAME" = "linux" ]
-then
-    if [ "${ALPAKA_CI_STDLIB}" == "libc++" ]
-    then
-        if [ -z "${CMAKE_CXX_FLAGS+x}" ]
-        then
-            export CMAKE_CXX_FLAGS=
-        fi
-        CMAKE_CXX_FLAGS="${CMAKE_CXX_FLAGS} -stdlib=libc++"
-
-        if [ -z "${CMAKE_EXE_LINKER_FLAGS+x}" ]
-        then
-            export CMAKE_EXE_LINKER_FLAGS=
-        fi
-        CMAKE_EXE_LINKER_FLAGS="${CMAKE_EXE_LINKER_FLAGS} -lc++ -lc++abi"
-    fi
-
-    which "${CXX}"
-    ${CXX} -v
-
-    source ./script/travis/prepare_sanitizers.sh
-    if [ "${ALPAKA_CI_ANALYSIS}" == "ON" ] ;then ./script/travis/run_analysis.sh ;fi
-fi
-
-./script/travis/run_build.sh
-
-if [ "${ALPAKA_CI_ANALYSIS}" == "OFF" ] ;then ./script/travis/run_tests.sh ;fi
diff --git a/thirdParty/cupla/alpaka/script/travis/run_analysis.sh b/thirdParty/cupla/alpaka/script/travis/run_analysis.sh
deleted file mode 100755
index de94432400..0000000000
--- a/thirdParty/cupla/alpaka/script/travis/run_analysis.sh
+++ /dev/null
@@ -1,27 +0,0 @@
-#!/bin/bash
-
-#
-# Copyright 2017-2019 Benjamin Worpitz
-#
-# This file is part of Alpaka.
-#
-# This Source Code Form is subject to the terms of the Mozilla Public
-# License, v. 2.0. If a copy of the MPL was not distributed with this
-# file, You can obtain one at http://mozilla.org/MPL/2.0/.
-#
-
-source ./script/travis/set.sh
-
-#-------------------------------------------------------------------------------
-# sloc
-sloccount .
-
-#-------------------------------------------------------------------------------
-# TODO/FIXME/HACK
-grep -r HACK ./* || true
-grep -r FIXME ./* || true
-grep -r TODO ./* || true
-
-#-------------------------------------------------------------------------------
-# check shell script with shellcheck
-find . -type f -name "*.sh" -exec shellcheck {} \;
diff --git a/thirdParty/cupla/alpaka/script/travis/run_build.sh b/thirdParty/cupla/alpaka/script/travis/run_build.sh
deleted file mode 100755
index ac60242a15..0000000000
--- a/thirdParty/cupla/alpaka/script/travis/run_build.sh
+++ /dev/null
@@ -1,109 +0,0 @@
-#!/bin/bash
-
-#
-# Copyright 2014-2019 Benjamin Worpitz
-#
-# This file is part of Alpaka.
-#
-# This Source Code Form is subject to the terms of the Mozilla Public
-# License, v. 2.0. If a copy of the MPL was not distributed with this
-# file, You can obtain one at http://mozilla.org/MPL/2.0/.
-#
-
-source ./script/travis/set.sh
-
-#-------------------------------------------------------------------------------
-
-# create a cmake variable definition if an environment variable exists
-#
-# This function can not handle environment variables with spaces in its content.
-#
-# @param $1 cmake/environment variable name
-#
-# @result if $1 exists cmake variable definition else nothing is returned
-#
-# @code{.bash}
-# FOO=ON
-# echo "$(env2cmake FOO)" # returns "-DFOO=ON"
-# echo "$(env2cmake BAR)" # returns nothing
-# @endcode
-function env2cmake()
-{
-    if [ ! -z "${1+x}" ] ; then
-        echo -n "-D$1=${!1}"
-    fi
-}
-
-#-------------------------------------------------------------------------------
-# Build and execute all tests.
-if [ ! -z "${CMAKE_CXX_FLAGS+x}" ]
-then
-    echo "CMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}"
-fi
-if [ ! -z "${CMAKE_EXE_LINKER_FLAGS+x}" ]
-then
-    echo "CMAKE_EXE_LINKER_FLAGS=${CMAKE_EXE_LINKER_FLAGS}"
-fi
-if [ ! -z "${KMP_DEVICE_THREAD_LIMIT+x}" ]
-then
-    echo "KMP_DEVICE_THREAD_LIMIT=${KMP_DEVICE_THREAD_LIMIT}"
-fi
-if [ ! -z "${KMP_ALL_THREADS+x}" ]
-then
-    echo "KMP_ALL_THREADS=${KMP_ALL_THREADS}"
-fi
-if [ ! -z "${KMP_TEAMS_THREAD_LIMIT+x}" ]
-then
-    echo "KMP_TEAMS_THREAD_LIMIT=${KMP_TEAMS_THREAD_LIMIT}"
-fi
-if [ ! -z "${OMP_THREAD_LIMIT+x}" ]
-then
-    echo "OMP_THREAD_LIMIT=${OMP_THREAD_LIMIT}"
-fi
-if [ ! -z "${OMP_NUM_THREADS+x}" ]
-then
-    echo "OMP_NUM_THREADS=${OMP_NUM_THREADS}"
-fi
-
-mkdir -p build/
-cd build/
-
-if [ "$TRAVIS_OS_NAME" = "linux" ] || [ "$TRAVIS_OS_NAME" = "osx" ]
-then
-    ALPAKA_CI_CMAKE_GENERATOR="Unix Makefiles"
-elif [ "$TRAVIS_OS_NAME" = "windows" ]
-then
-    # Use the 64 bit compiler
-    # FIXME: Path not found but does not seem to be necessary anymore
-    #"./C/Program Files (x86)/Microsoft Visual Studio/2017/Community/VC/Auxiliary/Build/vcvarsall.bat" amd64
-
-    # Add msbuild to the path
-    MSBUILD_PATH="/C/Program Files (x86)/Microsoft Visual Studio/2017/BuildTools/MSBuild/15.0/Bin"
-    export PATH=$MSBUILD_PATH:$PATH
-    MSBuild.exe -version
-
-    # Select the generator
-    ALPAKA_CI_CMAKE_GENERATOR="Visual Studio 15 2017 Win64"
-fi
-
-cmake -G "${ALPAKA_CI_CMAKE_GENERATOR}" \
-    "$(env2cmake BOOST_ROOT)" -DBOOST_LIBRARYDIR="${ALPAKA_CI_BOOST_LIB_DIR}/lib" -DBoost_USE_STATIC_LIBS=ON -DBoost_USE_MULTITHREADED=ON -DBoost_USE_STATIC_RUNTIME=OFF \
-    "$(env2cmake CMAKE_BUILD_TYPE)" "$(env2cmake CMAKE_CXX_FLAGS)" "$(env2cmake CMAKE_EXE_LINKER_FLAGS)" \
-    "$(env2cmake ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLE)" "$(env2cmake ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLE)" "$(env2cmake ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE)" \
-    "$(env2cmake ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLE)" \
-    "$(env2cmake ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE)" "$(env2cmake ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE)" "$(env2cmake ALPAKA_ACC_CPU_BT_OMP4_ENABLE)" \
-    "$(env2cmake TBB_ROOT_DIR)" \
-    "$(env2cmake ALPAKA_ACC_GPU_CUDA_ENABLE)" "$(env2cmake ALPAKA_CUDA_VERSION)" "$(env2cmake ALPAKA_ACC_GPU_CUDA_ONLY_MODE)" "$(env2cmake ALPAKA_CUDA_ARCH)" "$(env2cmake ALPAKA_CUDA_COMPILER)" \
-    "$(env2cmake ALPAKA_CUDA_FAST_MATH)" "$(env2cmake ALPAKA_CUDA_FTZ)" "$(env2cmake ALPAKA_CUDA_SHOW_REGISTER)" "$(env2cmake ALPAKA_CUDA_KEEP_FILES)" "$(env2cmake ALPAKA_CUDA_NVCC_EXPT_EXTENDED_LAMBDA)" "$(env2cmake ALPAKA_CUDA_NVCC_EXPT_RELAXED_CONSTEXPR)" "$(env2cmake ALPAKA_CUDA_NVCC_SEPARABLE_COMPILATION)" \
-    "$(env2cmake ALPAKA_ACC_GPU_HIP_ENABLE)" "$(env2cmake ALPAKA_ACC_GPU_HIP_ONLY_MODE)" "$(env2cmake ALPAKA_HIP_PLATFORM)" \
-    "$(env2cmake ALPAKA_DEBUG)" "$(env2cmake ALPAKA_CI)" "$(env2cmake ALPAKA_CI_ANALYSIS)" "$(env2cmake ALPAKA_CXX_STANDARD)" \
-    ".."
-if [ "$TRAVIS_OS_NAME" = "linux" ] || [ "$TRAVIS_OS_NAME" = "osx" ]
-then
-    make VERBOSE=1
-elif [ "$TRAVIS_OS_NAME" = "windows" ]
-then
-    MSBuild.exe "alpakaAll.sln" -p:Configuration=${CMAKE_BUILD_TYPE} -maxcpucount:2 -verbosity:minimal
-fi
-
-cd ..
diff --git a/thirdParty/cupla/alpaka/script/travis/run_tests.sh b/thirdParty/cupla/alpaka/script/travis/run_tests.sh
deleted file mode 100755
index 656bffc069..0000000000
--- a/thirdParty/cupla/alpaka/script/travis/run_tests.sh
+++ /dev/null
@@ -1,31 +0,0 @@
-#!/bin/bash
-
-#
-# Copyright 2017-2019 Benjamin Worpitz
-#
-# This file is part of Alpaka.
-#
-# This Source Code Form is subject to the terms of the Mozilla Public
-# License, v. 2.0. If a copy of the MPL was not distributed with this
-# file, You can obtain one at http://mozilla.org/MPL/2.0/.
-#
-
-source ./script/travis/set.sh
-
-: "${ALPAKA_ACC_GPU_CUDA_ENABLE?'ALPAKA_ACC_GPU_CUDA_ENABLE must be specified'}"
-: "${ALPAKA_ACC_GPU_HIP_ENABLE?'ALPAKA_ACC_GPU_HIP_ENABLE must be specified'}"
-
-if [ "${ALPAKA_ACC_GPU_CUDA_ENABLE}" == "OFF" ] && [ "${ALPAKA_ACC_GPU_HIP_ENABLE}" == "OFF" ];
-then
-    cd build/
-
-    if [ "$TRAVIS_OS_NAME" = "linux" ] || [ "$TRAVIS_OS_NAME" = "osx" ]
-    then
-        ctest -V
-    elif [ "$TRAVIS_OS_NAME" = "windows" ]
-    then
-        ctest -V -C ${CMAKE_BUILD_TYPE}
-    fi
-
-    cd ..
-fi
diff --git a/thirdParty/cupla/alpaka/script/travis/script.sh b/thirdParty/cupla/alpaka/script/travis/script.sh
deleted file mode 100755
index 666b4ab3e9..0000000000
--- a/thirdParty/cupla/alpaka/script/travis/script.sh
+++ /dev/null
@@ -1,26 +0,0 @@
-#!/bin/bash
-
-#
-# Copyright 2018-2019 Benjamin Worpitz
-#
-# This file is part of Alpaka.
-#
-# This Source Code Form is subject to the terms of the Mozilla Public
-# License, v. 2.0. If a copy of the MPL was not distributed with this
-# file, You can obtain one at http://mozilla.org/MPL/2.0/.
-#
-
-source ./script/travis/set.sh
-
-./script/travis/print_travisEnv.sh
-source ./script/travis/before_install.sh
-
-if [ "$TRAVIS_OS_NAME" = "linux" ]
-then
-  ./script/travis/docker_install.sh
-  ./script/travis/docker_run.sh
-elif [ "$TRAVIS_OS_NAME" = "windows" ] || [ "$TRAVIS_OS_NAME" = "osx" ]
-then
-  ./script/travis/install.sh
-  ./script/travis/run.sh
-fi
diff --git a/thirdParty/cupla/alpaka/script/travis/set.sh b/thirdParty/cupla/alpaka/script/travis/set.sh
deleted file mode 100755
index 262b6a77e5..0000000000
--- a/thirdParty/cupla/alpaka/script/travis/set.sh
+++ /dev/null
@@ -1,19 +0,0 @@
-#!/bin/bash
-
-#
-# Copyright 2018-2019 Benjamin Worpitz
-#
-# This file is part of Alpaka.
-#
-# This Source Code Form is subject to the terms of the Mozilla Public
-# License, v. 2.0. If a copy of the MPL was not distributed with this
-# file, You can obtain one at http://mozilla.org/MPL/2.0/.
-#
-
-#-------------------------------------------------------------------------------
-# -e: exit as soon as one command returns a non-zero exit code
-# -o pipefail: pipeline returns exit code of the rightmost command with a non-zero exit code
-# -u: treat unset variables as an error
-# -v: Print shell input lines as they are read
-# -x: Print command traces before executing command
-set -eouvx pipefail
diff --git a/thirdParty/cupla/alpaka/script/travis/travis_retry.sh b/thirdParty/cupla/alpaka/script/travis/travis_retry.sh
deleted file mode 100755
index d29ab93241..0000000000
--- a/thirdParty/cupla/alpaka/script/travis/travis_retry.sh
+++ /dev/null
@@ -1,31 +0,0 @@
-#!/bin/bash
-#
-# Copyright 2019 Benjamin Worpitz
-#
-# This file is part of Alpaka.
-#
-# This Source Code Form is subject to the terms of the Mozilla Public
-# License, v. 2.0. If a copy of the MPL was not distributed with this
-# file, You can obtain one at http://mozilla.org/MPL/2.0/.
-#
-
-set -euo pipefail
-
-travis_retry() {
-  local result=0
-  local count=1
-  while [ $count -le 3 ]; do
-    [ $result -ne 0 ] && {
-      echo -e "\n${ANSI_RED}The command \"$*\" failed. Retrying, $count of 3.${ANSI_RESET}\n" >&2
-    }
-    "$@"
-    result=$?
-    [ $result -eq 0 ] && break
-    count=$((count + 1))
-    sleep 1
-  done
-  [ $count -gt 3 ] && {
-    echo -e "\n${ANSI_RED}The command \"$*\" failed 3 times.${ANSI_RESET}\n" >&2
-  }
-  return $result
-}
diff --git a/thirdParty/cupla/alpaka/script/travis_retry.sh b/thirdParty/cupla/alpaka/script/travis_retry.sh
new file mode 100755
index 0000000000..bfd05ac0f3
--- /dev/null
+++ b/thirdParty/cupla/alpaka/script/travis_retry.sh
@@ -0,0 +1,33 @@
+#!/bin/bash
+#
+# Copyright 2019 Benjamin Worpitz, Rene Widera
+#
+# This file is part of alpaka.
+#
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+#
+
+ANSI_RED="\033[31m"
+ANSI_RESET="\033[0m"
+
+travis_retry() {
+  set +euo pipefail
+  local result=0
+  local count=1
+  while [ $count -le 3 ]; do
+    [ $result -ne 0 ] && {
+      echo -e "\n${ANSI_RED}The command \"$*\" failed. Retrying, $count of 3.${ANSI_RESET}\n" >&2
+    }
+    "$@"
+    result=$?
+    [ $result -eq 0 ] && break
+    count=$((count + 1))
+    sleep 1
+  done
+  [ $count -gt 3 ] && {
+    echo -e "\n${ANSI_RED}The command \"$*\" failed 3 times.${ANSI_RESET}\n" >&2
+  }
+  return $result
+}
diff --git a/thirdParty/cupla/alpaka/test/CMakeLists.txt b/thirdParty/cupla/alpaka/test/CMakeLists.txt
index e8140a1171..9654384edd 100644
--- a/thirdParty/cupla/alpaka/test/CMakeLists.txt
+++ b/thirdParty/cupla/alpaka/test/CMakeLists.txt
@@ -1,80 +1,41 @@
 #
-# Copyright 2015-2019 Benjamin Worpitz, Axel Huebl
+# Copyright 2015-2020 Benjamin Worpitz, Axel Huebl, Jan Stephan
 #
-# This file is part of Alpaka.
+# This file is part of alpaka.
 #
 # This Source Code Form is subject to the terms of the Mozilla Public
 # License, v. 2.0. If a copy of the MPL was not distributed with this
 # file, You can obtain one at http://mozilla.org/MPL/2.0/.
 #
 
-CMAKE_MINIMUM_REQUIRED(VERSION 3.11.4)
+cmake_minimum_required(VERSION 3.15)
 
-# Search in <PackageName>_ROOT:
-# https://cmake.org/cmake/help/v3.12/policy/CMP0074.html
-if(POLICY CMP0074)
-    cmake_policy(SET CMP0074 NEW)
-endif()
-
-LIST(APPEND CMAKE_MODULE_PATH "${ALPAKA_ROOT}")
-FIND_PACKAGE(alpaka REQUIRED)
-
-ADD_SUBDIRECTORY("common/")
-
-OPTION(ALPAKA_USE_INTERNAL_CATCH2 "Use internally shipped Catch2" ON)
+add_subdirectory(catch_main)
 
-IF(ALPAKA_USE_INTERNAL_CATCH2)
-    message(STATUS "Catch2: Using INTERNAL version 2.11.0")
-ELSE()
-    find_package(Catch2 2.11.0 CONFIG REQUIRED)
-    set_target_properties(Catch2::Catch2 PROPERTIES IMPORTED_GLOBAL TRUE)
-    message(STATUS "Catch2: Found version ${Catch2_VERSION}")
-ENDIF()
-
-add_library(CatchMain CatchMain.cpp)
-# target_compile_features(CatchMain PUBLIC cxx_std_11)  # min C++11
-set_target_properties(CatchMain PROPERTIES
-    FOLDER "test"
-    CXX_STANDARD 11  # exactly C++11
-    CXX_EXTENSIONS OFF
-    CXX_STANDARD_REQUIRED ON
-    POSITION_INDEPENDENT_CODE ON
-    WINDOWS_EXPORT_ALL_SYMBOLS ON
-)
-target_compile_definitions(CatchMain PUBLIC "CATCH_CONFIG_FAST_COMPILE")
-IF(MSVC)
-    target_compile_definitions(CatchMain PUBLIC "CATCH_CONFIG_WINDOWS_CRTDBG")
-    target_compile_options(CatchMain PUBLIC "/bigobj")
-ENDIF()
+if(NOT TARGET alpaka::alpaka)
+    list(APPEND CMAKE_MODULE_PATH "${ALPAKA_ROOT}")
+    find_package(alpaka REQUIRED)
+endif()
 
-IF(ALPAKA_USE_INTERNAL_CATCH2)
-    target_include_directories(CatchMain SYSTEM PUBLIC
-        ${CMAKE_CURRENT_LIST_DIR}/../thirdParty/catch2/include)
-ELSE()
-    target_include_directories(CatchMain SYSTEM PUBLIC
-        $<TARGET_PROPERTY:Catch2::Catch2,INTERFACE_INCLUDE_DIRECTORIES>)
-ENDIF()
-SET_TARGET_PROPERTIES(
-    CatchMain
-    PROPERTIES FOLDER "test")
+add_subdirectory(common)
 
-IF(ALPAKA_ACC_GPU_CUDA_ENABLE AND ALPAKA_CUDA_COMPILER MATCHES "nvcc")
-    # NVCC does not incorporate the COMPILE_OPTIONS of a target but only the CMAKE_CXX_FLAGS
-    GET_TARGET_PROPERTY(_COMMON_COMPILE_OPTIONS common COMPILE_OPTIONS)
+if(ALPAKA_ACC_GPU_CUDA_ENABLE AND ALPAKA_CUDA_COMPILER MATCHES "nvcc")
+    # NVCC does not incorporate the INTERFACE_COMPILE_OPTIONS of a target but only the CMAKE_CXX_FLAGS
+    get_target_property(_COMMON_COMPILE_OPTIONS common INTERFACE_COMPILE_OPTIONS)
     # If the property does not exist, the variable is set to NOTFOUND.
-    IF(_COMMON_COMPILE_OPTIONS)
-        STRING(REPLACE ";" " " _COMMON_COMPILE_OPTIONS_STRING "${_COMMON_COMPILE_OPTIONS}")
-        SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${_COMMON_COMPILE_OPTIONS_STRING}")
-    ENDIF()
+    if(_COMMON_COMPILE_OPTIONS)
+        string(REPLACE ";" " " _COMMON_COMPILE_OPTIONS_STRING "${_COMMON_COMPILE_OPTIONS}")
+        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${_COMMON_COMPILE_OPTIONS_STRING}")
+    endif()
     # nvcc supports werror starting with 10.2
-    IF(CUDA_VERSION GREATER_EQUAL 10.2)
-        MESSAGE("adding -Werror=all-warnings")
-        LIST(APPEND CUDA_NVCC_FLAGS -Werror=all-warnings)
-    ENDIF()
-ENDIF()
+    if(CUDA_VERSION GREATER_EQUAL 10.2)
+        message("adding -Werror=all-warnings")
+        list(APPEND CUDA_NVCC_FLAGS -Werror=all-warnings)
+    endif()
+endif()
 
-LIST(APPEND _ALPAKA_TEST_OPTIONS "--use-colour yes")
+list(APPEND _ALPAKA_TEST_OPTIONS --use-colour yes)
 
-ADD_SUBDIRECTORY("analysis/")
-ADD_SUBDIRECTORY("integ/")
-ADD_SUBDIRECTORY("unit/")
+add_subdirectory(analysis)
+add_subdirectory(integ)
+add_subdirectory(unit)
diff --git a/thirdParty/cupla/alpaka/test/CatchMain.cpp b/thirdParty/cupla/alpaka/test/CatchMain.cpp
deleted file mode 100644
index 31734b3a0a..0000000000
--- a/thirdParty/cupla/alpaka/test/CatchMain.cpp
+++ /dev/null
@@ -1,10 +0,0 @@
-/* Copyright 2019 Axel Huebl
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-#define CATCH_CONFIG_MAIN
-#include <catch2/catch.hpp>
diff --git a/thirdParty/cupla/alpaka/test/analysis/CMakeLists.txt b/thirdParty/cupla/alpaka/test/analysis/CMakeLists.txt
index b1481957e1..451db4de00 100644
--- a/thirdParty/cupla/alpaka/test/analysis/CMakeLists.txt
+++ b/thirdParty/cupla/alpaka/test/analysis/CMakeLists.txt
@@ -1,7 +1,7 @@
 #
-# Copyright 2014-2019 Benjamin Worpitz
+# Copyright 2014-2020 Benjamin Worpitz, Jan Stephan
 #
-# This file is part of Alpaka.
+# This file is part of alpaka.
 #
 # This Source Code Form is subject to the terms of the Mozilla Public
 # License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -12,12 +12,12 @@
 # Required CMake version.
 ################################################################################
 
-CMAKE_MINIMUM_REQUIRED(VERSION 3.11.0)
+cmake_minimum_required(VERSION 3.15)
 
-PROJECT("alpakaAnalysisTest")
+project("alpakaAnalysisTest")
 
 ################################################################################
 # Add subdirectories.
 ################################################################################
 
-ADD_SUBDIRECTORY("headerCheck/")
+add_subdirectory("headerCheck/")
diff --git a/thirdParty/cupla/alpaka/test/analysis/headerCheck/CMakeLists.txt b/thirdParty/cupla/alpaka/test/analysis/headerCheck/CMakeLists.txt
index b37407d1df..641e9f7632 100644
--- a/thirdParty/cupla/alpaka/test/analysis/headerCheck/CMakeLists.txt
+++ b/thirdParty/cupla/alpaka/test/analysis/headerCheck/CMakeLists.txt
@@ -1,59 +1,55 @@
 #
-# Copyright 2014-2019 Benjamin Worpitz, Axel Huebl
+# Copyright 2014-2020 Benjamin Worpitz, Axel Huebl, Jan Stephan
 #
-# This file is part of Alpaka.
+# This file is part of alpaka.
 #
 # This Source Code Form is subject to the terms of the Mozilla Public
 # License, v. 2.0. If a copy of the MPL was not distributed with this
 # file, You can obtain one at http://mozilla.org/MPL/2.0/.
 #
 
-IF(NOT ALPAKA_CI OR (ALPAKA_CI AND ALPAKA_CI_ANALYSIS))
+if(NOT ALPAKA_CI OR (ALPAKA_CI AND ALPAKA_CI_ANALYSIS))
 
-SET(_TARGET_NAME "headerCheck")
+    set(_TARGET_NAME "headerCheck")
 
-#-------------------------------------------------------------------------------
-# Create source files.
+    #---------------------------------------------------------------------------
+    # Create source files.
+    set(_ALPAKA_INCLUDE_DIRECTORY "${_ALPAKA_ROOT_DIR}/include")
+    set(_ALPAKA_SUFFIXED_INCLUDE_DIR "${_ALPAKA_INCLUDE_DIRECTORY}/alpaka")
+    append_recursive_files("${_ALPAKA_SUFFIXED_INCLUDE_DIR}" "hpp" "_ALPAKA_FILES_HEADER")
 
-SET(_ALPAKA_INCLUDE_DIRECTORY "${_ALPAKA_ROOT_DIR}/include")
-SET(_ALPAKA_SUFFIXED_INCLUDE_DIR "${_ALPAKA_INCLUDE_DIRECTORY}/alpaka")
-append_recursive_files("${_ALPAKA_SUFFIXED_INCLUDE_DIR}" "hpp" "_ALPAKA_FILES_HEADER")
+    set(_GENERATED_SOURCE_DIR "${CMAKE_CURRENT_BINARY_DIR}/src")
 
-SET(_GENERATED_SOURCE_DIR "${CMAKE_CURRENT_BINARY_DIR}/src")
+    file(REMOVE_RECURSE ${_GENERATED_SOURCE_DIR})
 
-FILE(REMOVE_RECURSE ${_GENERATED_SOURCE_DIR})
+    foreach(_HEADER_FILE ${_ALPAKA_FILES_HEADER})
+        # Remove the parent directory from the path.
+        # NOTE: This is not correct because it replaces every occurrence, not only the one at the beginning of the string.
+        #  "STRING(REGEX REPLACE" would be correct if there was an easy way to escape arbitrary strings.
+        string(
+            REPLACE "${_ALPAKA_SUFFIXED_INCLUDE_DIR}/" ""
+            _HEADER_FILE
+            "${_HEADER_FILE}")
+        set(_SOURCE_FILE "${_GENERATED_SOURCE_DIR}/${_HEADER_FILE}.cpp")
+        file(WRITE "${_SOURCE_FILE}" "#include <alpaka/${_HEADER_FILE}>\n#include <alpaka/${_HEADER_FILE}>\n")
+    endforeach()
 
-FOREACH(_HEADER_FILE ${_ALPAKA_FILES_HEADER})
-    # Remove the parent directory from the path.
-    # NOTE: This is not correct because it does not only replace at the beginning of the string.
-    #  "STRING(REGEX REPLACE" would be correct if there was an easy way to escape arbitrary strings.
-    STRING(
-        REPLACE "${_ALPAKA_SUFFIXED_INCLUDE_DIR}/" ""
-        _HEADER_FILE
-        "${_HEADER_FILE}")
-    SET(_SOURCE_FILE "${_GENERATED_SOURCE_DIR}/${_HEADER_FILE}.cpp")
-    FILE(WRITE "${_SOURCE_FILE}" "#include <alpaka/${_HEADER_FILE}>\n#include <alpaka/${_HEADER_FILE}>\n")
-ENDFOREACH()
+    #---------------------------------------------------------------------------
+    # Add executable.
 
-#-------------------------------------------------------------------------------
-# Add executable.
+    append_recursive_files_add_to_src_group("${_GENERATED_SOURCE_DIR}" "${_GENERATED_SOURCE_DIR}" "cpp" "_FILES_SOURCE")
+    list(APPEND _FILES_SOURCE "src/main.cpp")
 
-append_recursive_files_add_to_src_group("${_GENERATED_SOURCE_DIR}" "${_GENERATED_SOURCE_DIR}" "cpp" "_FILES_SOURCE")
-LIST(APPEND _FILES_SOURCE "src/main.cpp")
+    # Always add all files to the target executable build call to add them to the build project.
+    alpaka_add_executable(
+        ${_TARGET_NAME}
+        ${_FILES_SOURCE})
+    target_link_libraries(
+        ${_TARGET_NAME}
+        PRIVATE common)
 
-# Always add all files to the target executable build call to add them to the build project.
-ALPAKA_ADD_EXECUTABLE(
-    ${_TARGET_NAME}
-    ${_FILES_SOURCE})
-TARGET_INCLUDE_DIRECTORIES(
-    ${_TARGET_NAME}
-    PRIVATE ${Boost_INCLUDE_DIRS})
-TARGET_LINK_LIBRARIES(
-    ${_TARGET_NAME}
-    PRIVATE common)
+    set_target_properties(headerCheck PROPERTIES FOLDER "test/analysis")
 
-SET_TARGET_PROPERTIES(headerCheck PROPERTIES FOLDER "test/analysis")
+    add_test(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME} ${_ALPAKA_TEST_OPTIONS})
 
-ADD_TEST(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME} ${_ALPAKA_TEST_OPTIONS})
-
-ENDIF()
+endif()
diff --git a/thirdParty/cupla/alpaka/test/analysis/headerCheck/src/main.cpp b/thirdParty/cupla/alpaka/test/analysis/headerCheck/src/main.cpp
index 11d6b7196f..bd7800d15d 100644
--- a/thirdParty/cupla/alpaka/test/analysis/headerCheck/src/main.cpp
+++ b/thirdParty/cupla/alpaka/test/analysis/headerCheck/src/main.cpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Axel Huebl, Benjamin Worpitz
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
diff --git a/thirdParty/cupla/alpaka/test/catch_main/CMakeLists.txt b/thirdParty/cupla/alpaka/test/catch_main/CMakeLists.txt
new file mode 100644
index 0000000000..e00aef39f0
--- /dev/null
+++ b/thirdParty/cupla/alpaka/test/catch_main/CMakeLists.txt
@@ -0,0 +1,49 @@
+#
+# Copyright 2015-2020 Benjamin Worpitz, Axel Huebl
+#
+# This file is part of alpaka.
+#
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+#
+
+option(ALPAKA_USE_INTERNAL_CATCH2 "Use internally shipped Catch2" ON)
+
+if(ALPAKA_USE_INTERNAL_CATCH2)
+    message(STATUS "Catch2: Using INTERNAL version 2.13.3")
+else()
+    find_package(Catch2 2.13.3 CONFIG REQUIRED)
+    set_target_properties(Catch2::Catch2 PROPERTIES IMPORTED_GLOBAL TRUE)
+    message(STATUS "Catch2: Found version ${Catch2_VERSION}")
+endif()
+
+add_library(CatchMain src/CatchMain.cpp)
+# target_compile_features(CatchMain PUBLIC cxx_std_14)  # min C++14
+set_target_properties(CatchMain PROPERTIES
+    FOLDER "test"
+    CXX_STANDARD 14  # exactly C++14
+    CXX_EXTENSIONS OFF
+    CXX_STANDARD_REQUIRED ON
+    POSITION_INDEPENDENT_CODE ON
+    WINDOWS_EXPORT_ALL_SYMBOLS ON
+)
+
+target_compile_definitions(CatchMain PUBLIC "CATCH_CONFIG_FAST_COMPILE")
+if (CMAKE_CXX_COMPILER_ID STREQUAL "PGI")
+    # Workaround for STL atomic issue: https://forums.developer.nvidia.com/t/support-for-atomic-in-libstdc-missing/135403/2
+    # still appears in NVHPC 20.7
+    target_compile_definitions(CatchMain PUBLIC "__GCC_ATOMIC_TEST_AND_SET_TRUEVAL=1")
+endif()
+if(MSVC)
+    target_compile_definitions(CatchMain PUBLIC "CATCH_CONFIG_WINDOWS_CRTDBG")
+    target_compile_options(CatchMain PUBLIC "/bigobj")
+endif()
+
+if(ALPAKA_USE_INTERNAL_CATCH2)
+    target_include_directories(CatchMain SYSTEM PUBLIC
+        ${CMAKE_CURRENT_LIST_DIR}/../../thirdParty/catch2/include)
+else()
+    target_include_directories(CatchMain SYSTEM PUBLIC
+        $<TARGET_PROPERTY:Catch2::Catch2,INTERFACE_INCLUDE_DIRECTORIES>)
+endif()
diff --git a/thirdParty/cupla/alpaka/test/catch_main/src/CatchMain.cpp b/thirdParty/cupla/alpaka/test/catch_main/src/CatchMain.cpp
new file mode 100644
index 0000000000..992e94f13d
--- /dev/null
+++ b/thirdParty/cupla/alpaka/test/catch_main/src/CatchMain.cpp
@@ -0,0 +1,10 @@
+/* Copyright 2019 Axel Huebl
+ *
+ * This file is part of alpaka.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+#define CATCH_CONFIG_MAIN
+#include <catch2/catch.hpp>
diff --git a/thirdParty/cupla/alpaka/test/common/CMakeLists.txt b/thirdParty/cupla/alpaka/test/common/CMakeLists.txt
index 9d622dc02a..71807bc550 100644
--- a/thirdParty/cupla/alpaka/test/common/CMakeLists.txt
+++ b/thirdParty/cupla/alpaka/test/common/CMakeLists.txt
@@ -1,63 +1,46 @@
 #
-# Copyright 2014-2019 Benjamin Worpitz
+# Copyright 2014-2020 Benjamin Worpitz, Jan Stephan
 #
-# This file is part of Alpaka.
+# This file is part of alpaka.
 #
 # This Source Code Form is subject to the terms of the Mozilla Public
 # License, v. 2.0. If a copy of the MPL was not distributed with this
 # file, You can obtain one at http://mozilla.org/MPL/2.0/.
 #
 
-CMAKE_MINIMUM_REQUIRED(VERSION 3.11.4)
+cmake_minimum_required(VERSION 3.15)
 
-SET(_COMMON_TARGET_NAME "common")
-
-SET(_COMMON_INCLUDE_DIRECTORY "${CMAKE_CURRENT_LIST_DIR}/include")
-LIST(APPEND _COMMON_INCLUDE_DIRECTORIES_PUBLIC "${_COMMON_INCLUDE_DIRECTORY}")
-SET(_COMMON_SOURCE_DIRECTORY "${CMAKE_CURRENT_LIST_DIR}/src")
+set(_COMMON_TARGET_NAME "common")
+set(_COMMON_COMPILE_OPTIONS_FILE "devCompileOptions.cmake")
+set(_COMMON_INCLUDE_DIRECTORY "${CMAKE_CURRENT_LIST_DIR}/include")
 
 # Add all the source files in all recursive subdirectories and group them accordingly.
 append_recursive_files_add_to_src_group("${_COMMON_INCLUDE_DIRECTORY}" "${_COMMON_INCLUDE_DIRECTORY}" "hpp" _COMMON_FILES_HEADER)
-append_recursive_files_add_to_src_group("${_COMMON_SOURCE_DIRECTORY}" "${_COMMON_SOURCE_DIRECTORY}" "cpp" _COMMON_FILES_SOURCE)
 
-INCLUDE("${_ALPAKA_ROOT_DIR}/cmake/dev.cmake")
-LIST(APPEND _COMMON_COMPILE_OPTIONS_PUBLIC ${ALPAKA_DEV_COMPILE_OPTIONS})
-IF(MSVC)
-    LIST(APPEND _COMMON_COMPILE_OPTIONS_PUBLIC "/wd4996")   # This function or variable may be unsafe. Consider using <safe_version> instead.
-ENDIF()
+add_library(${_COMMON_TARGET_NAME} INTERFACE)
+
+target_include_directories(${_COMMON_TARGET_NAME} INTERFACE ${_COMMON_INCLUDE_DIRECTORY})
+
+include(${_COMMON_COMPILE_OPTIONS_FILE})
+target_compile_options(${_COMMON_TARGET_NAME} INTERFACE ${ALPAKA_DEV_COMPILE_OPTIONS})
+
+if(MSVC)
+    target_compile_options(${_COMMON_TARGET_NAME} INTERFACE "/wd4996") # This function or variable may be unsafe. Consider using <safe_version> instead.
+    target_compile_options(${_COMMON_TARGET_NAME} INTERFACE "/bigobj")
+endif()
 
-IF(ALPAKA_ACC_GPU_CUDA_ENABLE OR (ALPAKA_ACC_GPU_HIP_ENABLE AND HIP_PLATFORM MATCHES "nvcc"))
+if(ALPAKA_ACC_GPU_CUDA_ENABLE OR (ALPAKA_ACC_GPU_HIP_ENABLE AND HIP_PLATFORM MATCHES "nvcc"))
     # CUDA driver API is used by EventHostManualTrigger
-    LIST(APPEND _COMMON_LINK_LIBRARIES_PUBLIC "${CUDA_CUDA_LIBRARY}")
-    LIST(APPEND _COMMON_COMPILE_DEFINITIONS_PUBLIC "CUDA_API_PER_THREAD_DEFAULT_STREAM")
-ENDIF()
-
-ADD_LIBRARY(
-    ${_COMMON_TARGET_NAME}
-    STATIC
-    ${_COMMON_FILES_HEADER} ${_COMMON_FILES_SOURCE})
-TARGET_INCLUDE_DIRECTORIES(
-    ${_COMMON_TARGET_NAME}
-    PUBLIC ${_COMMON_INCLUDE_DIRECTORIES_PUBLIC})
-LIST(
-    LENGTH
-    _COMMON_COMPILE_DEFINITIONS_PUBLIC
-    _COMMON_COMPILE_DEFINITIONS_PUBLIC_LENGTH)
-IF(${_COMMON_COMPILE_DEFINITIONS_PUBLIC_LENGTH} GREATER 0)
-    TARGET_COMPILE_DEFINITIONS(
-        ${_COMMON_TARGET_NAME}
-        PUBLIC ${_COMMON_COMPILE_DEFINITIONS_PUBLIC})
-ENDIF()
-TARGET_COMPILE_OPTIONS(
-    ${_COMMON_TARGET_NAME}
-    PUBLIC ${_COMMON_COMPILE_OPTIONS_PUBLIC})
-TARGET_LINK_LIBRARIES(
-    ${_COMMON_TARGET_NAME}
-    PUBLIC "alpaka;${_COMMON_LINK_LIBRARIES_PUBLIC}")
-SET_TARGET_PROPERTIES(
-    ${_COMMON_TARGET_NAME}
-    PROPERTIES FOLDER "test")
-
-TARGET_LINK_LIBRARIES(
-    ${_COMMON_TARGET_NAME}
-    PUBLIC CatchMain)
+    target_link_libraries(${_COMMON_TARGET_NAME} INTERFACE "${CUDA_CUDA_LIBRARY}")
+    target_compile_definitions(${_COMMON_TARGET_NAME} INTERFACE "CUDA_API_PER_THREAD_DEFAULT_STREAM")
+endif()
+
+target_link_libraries(${_COMMON_TARGET_NAME} INTERFACE alpaka::alpaka)
+target_link_libraries(${_COMMON_TARGET_NAME} INTERFACE CatchMain)
+
+if(TARGET ${_COMMON_TARGET_NAME})
+    # HACK: Workaround for the limitation that files added to INTERFACE targets (target_sources) can not be marked as PUBLIC or PRIVATE but only as INTERFACE.
+    # Therefore those files will be added to projects "linking" to the INTERFACE library, but are not added to the project itself within an IDE.
+    add_custom_target("${_COMMON_TARGET_NAME}Ide" SOURCES ${_COMMON_FILES_HEADER} ${_COMMON_COMPILE_OPTIONS_FILE})
+    set_target_properties("${_COMMON_TARGET_NAME}Ide" PROPERTIES FOLDER "test")
+endif()
diff --git a/thirdParty/cupla/alpaka/test/common/devCompileOptions.cmake b/thirdParty/cupla/alpaka/test/common/devCompileOptions.cmake
new file mode 100644
index 0000000000..3d720315c0
--- /dev/null
+++ b/thirdParty/cupla/alpaka/test/common/devCompileOptions.cmake
@@ -0,0 +1,158 @@
+#
+# Copyright 2014-2019 Benjamin Worpitz
+#
+# This file is part of alpaka.
+#
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+#
+
+#-------------------------------------------------------------------------------
+# Compiler settings.
+#-------------------------------------------------------------------------------
+# By marking the boost headers as system headers, warnings produced within them are ignored.
+# Marking the boost headers as system headers does not work for nvcc (FindCUDA always uses -I)
+TARGET_INCLUDE_DIRECTORIES(
+    "alpaka"
+    SYSTEM
+    INTERFACE ${Boost_INCLUDE_DIRS})
+
+IF(ALPAKA_ACC_GPU_CUDA_ENABLE AND (ALPAKA_CUDA_COMPILER MATCHES "nvcc") AND (ALPAKA_CUDA_VERSION VERSION_GREATER_EQUAL 11.0))
+    LIST(APPEND CUDA_NVCC_FLAGS -Wdefault-stream-launch -Werror=default-stream-launch)
+ENDIF()
+
+#MSVC
+IF(MSVC)
+    # Force to always compile with W4 and WX
+    LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "/W4")
+    LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "/WX")
+    # Improve debugging.
+    IF(CMAKE_BUILD_TYPE MATCHES "Debug")
+        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "/Zo")
+    ENDIF()
+    IF(MSVC_VERSION GREATER 1900)
+        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "/permissive-")
+        IF(MSVC_VERSION GREATER 1910)
+            LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "/Zc:twoPhase-")
+        ENDIF()
+    ENDIF()
+    IF(MSVC_VERSION GREATER 1800)
+        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "/Zc:throwingNew" "/Zc:strictStrings")
+    ENDIF()
+ELSE()
+  IF(NOT(ALPAKA_ACC_GPU_CUDA_ENABLE) OR ALPAKA_CUDA_COMPILER MATCHES "clang"
+      OR(ALPAKA_ACC_GPU_HIP_ENABLE AND HIP_PLATFORM MATCHES "nvcc"))
+    # GNU
+    IF(CMAKE_COMPILER_IS_GNUCXX)
+        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wall")
+        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wextra")
+        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-pedantic")
+        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Werror")
+        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wdouble-promotion")
+        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wmissing-include-dirs")
+        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wconversion")
+        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wunknown-pragmas")
+        # Higher levels (max is 5) produce some strange warnings
+        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wstrict-overflow=2")
+        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wtrampolines")
+        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wfloat-equal")
+        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wundef")
+        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wshadow")
+        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wcast-qual")
+        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wcast-align")
+        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wwrite-strings")
+        # Too noisy as it warns for every operation using numeric types smaller than int.
+        # Such values are converted to int implicitly before the calculation is done.
+        # E.g.: uint16_t = uint16_t * uint16_t will trigger the following warning:
+        # conversion to ‘short unsigned int’ from ‘int’ may alter its value
+        #LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wconversion")
+        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wsign-conversion")
+        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wvector-operation-performance")
+        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wzero-as-null-pointer-constant")
+        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wdate-time")
+        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wuseless-cast")
+        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wlogical-op")
+        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wno-aggressive-loop-optimizations")
+        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wmissing-declarations")
+        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wno-multichar")
+        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wopenmp-simd")
+        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wpacked")
+        # Too much noise
+        #LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wpadded")
+        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wredundant-decls")
+        # Too much noise
+        #LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Winline")
+        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wdisabled-optimization")
+        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wformat-nonliteral")
+        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wformat-security")
+        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wformat-y2k")
+        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wctor-dtor-privacy")
+        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wdelete-non-virtual-dtor")
+        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wliteral-suffix")
+        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wnon-virtual-dtor")
+        # This warns about members that have not explicitly been listed in the constructor initializer list.
+        # This could be useful even for members that have a default constructor.
+        # However, it also issues this warning for defaulted constructors.
+        #LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Weffc++")
+        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Woverloaded-virtual")
+        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wsign-promo")
+        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wconditionally-supported")
+        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wnoexcept")
+        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wold-style-cast")
+        IF(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 5.0)
+            LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wsuggest-final-types")
+            LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wsuggest-final-methods")
+            # This does not work correctly as it suggests override to methods that are already marked with final.
+            # Because final implies override, this is not useful.
+            #LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wsuggest-override")
+            LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wnormalized")
+            LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wformat-signedness")
+        ENDIF()
+        IF(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 6.0)
+            LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wnull-dereference")
+            LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wduplicated-cond")
+            LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wsubobject-linkage")
+        ENDIF()
+        IF(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 7.0)
+            # This warning might be useful but it is triggered by compile-time code where it does not make any sense:
+            # E.g. "Vec<DimInt<(TidxDimOut < TidxDimIn) ? TidxDimIn : TidxDimOut>, TElem>" when both values are equal
+            #LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wduplicated-branches")
+            LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Walloc-zero")
+            LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Walloca")
+        ENDIF()
+        IF(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 8.0)
+            LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wcast-align=strict")
+        ENDIF()
+
+    # Clang or AppleClang
+    ELSEIF(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
+        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Werror")
+        # Weverything really means everything (including Wall, Wextra, pedantic, ...)
+        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Weverything")
+        # We are not C++98 compatible
+        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wno-c++98-compat")
+        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wno-c++98-compat-pedantic")
+        # The following warnings are triggered by all instantiations of BOOST_AUTO_TEST_SUITE
+        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wno-disabled-macro-expansion")
+        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wno-global-constructors")
+        # This padding warning is generated by the execution tasks depending on the argument types
+        # as they are stored as members. Therefore, the padding warning is triggered by the calling code
+        # and does not indicate a failure within alpaka.
+        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wno-padded")
+        # Triggers for all instances of ALPAKA_DEBUG_MINIMAL_LOG_SCOPE and similar macros followed by semicolon
+        IF(CLANG_VERSION_MAJOR GREATER 7)
+            LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wno-extra-semi-stmt")
+        ENDIF()
+        IF(CMAKE_SYSTEM_NAME STREQUAL "Darwin" AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 12.0)
+            LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wno-poison-system-directories")
+        ENDIF()
+    # ICC
+    ELSEIF(${CMAKE_CXX_COMPILER_ID} STREQUAL "Intel")
+        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Wall")
+    # PGI
+    ELSEIF(${CMAKE_CXX_COMPILER_ID} STREQUAL "PGI")
+        LIST(APPEND ALPAKA_DEV_COMPILE_OPTIONS "-Minform=inform")
+    ENDIF()
+  ENDIF()
+ENDIF()
diff --git a/thirdParty/cupla/alpaka/test/common/include/alpaka/test/Array.hpp b/thirdParty/cupla/alpaka/test/common/include/alpaka/test/Array.hpp
index 89291cb2a8..2714929b57 100644
--- a/thirdParty/cupla/alpaka/test/common/include/alpaka/test/Array.hpp
+++ b/thirdParty/cupla/alpaka/test/common/include/alpaka/test/Array.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Benjamin Worpitz
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -14,27 +14,22 @@ namespace alpaka
     namespace test
     {
         //#############################################################################
-        template<
-            typename TType,
-            size_t TSize>
-        struct Array {
+        template<typename TType, size_t TSize>
+        struct Array
+        {
             TType m_data[TSize];
 
-            template<
-                typename T_Idx>
-            ALPAKA_FN_HOST_ACC const TType &operator[](
-                const T_Idx idx) const
+            template<typename T_Idx>
+            ALPAKA_FN_HOST_ACC const TType& operator[](const T_Idx idx) const
             {
                 return m_data[idx];
             }
 
-            template<
-                typename TIdx>
-            ALPAKA_FN_HOST_ACC TType & operator[](
-                const TIdx idx)
+            template<typename TIdx>
+            ALPAKA_FN_HOST_ACC TType& operator[](const TIdx idx)
             {
                 return m_data[idx];
             }
         };
-    }
-}
+    } // namespace test
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/test/common/include/alpaka/test/Check.hpp b/thirdParty/cupla/alpaka/test/common/include/alpaka/test/Check.hpp
index c7acd759d5..941d063b4b 100644
--- a/thirdParty/cupla/alpaka/test/common/include/alpaka/test/Check.hpp
+++ b/thirdParty/cupla/alpaka/test/common/include/alpaka/test/Check.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Benjamin Worpitz
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -11,12 +11,12 @@
 
 #include <cstdio>
 
-#define ALPAKA_CHECK(success, expression) \
-    do \
-    { \
-        if(!(expression)) \
-        { \
-            printf("ALPAKA_CHECK failed because '!(%s)'\n", #expression); \
-            success = false; \
-        } \
-    } while ( 0 )
+#define ALPAKA_CHECK(success, expression)                                                                             \
+    do                                                                                                                \
+    {                                                                                                                 \
+        if(!(expression))                                                                                             \
+        {                                                                                                             \
+            printf("ALPAKA_CHECK failed because '!(%s)'\n", #expression);                                             \
+            success = false;                                                                                          \
+        }                                                                                                             \
+    } while(0)
diff --git a/thirdParty/cupla/alpaka/test/common/include/alpaka/test/Extent.hpp b/thirdParty/cupla/alpaka/test/common/include/alpaka/test/Extent.hpp
index cc74121902..b4d827419d 100644
--- a/thirdParty/cupla/alpaka/test/common/include/alpaka/test/Extent.hpp
+++ b/thirdParty/cupla/alpaka/test/common/include/alpaka/test/Extent.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Benjamin Worpitz, Matthias Werner
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -9,56 +9,63 @@
 
 #pragma once
 
+#include <alpaka/alpaka.hpp>
+
+#include <cstddef>
+
 namespace alpaka
 {
     //-----------------------------------------------------------------------------
     //! The test specifics.
     namespace test
     {
-        //#############################################################################
-        //! 1D: (5)
-        //! 2D: (5, 4)
-        //! 3D: (5, 4, 3)
-        //! 4D: (5, 4, 3, 2)
-        // We have to be careful with the extents used.
-        // When TIdx is a 8 bit signed integer and Dim is 4, the extent is extremely limited.
-        template<
-            std::size_t Tidx>
-        struct CreateExtentBufVal
+        template<typename TIdx>
+        struct CreateVecWithIdx
         {
-            //-----------------------------------------------------------------------------
-            ALPAKA_NO_HOST_ACC_WARNING
-            template<
-                typename TIdx>
-            ALPAKA_FN_HOST_ACC
-            static auto create(
-                TIdx)
-            -> TIdx
+            //#############################################################################
+            //! 1D: (11)
+            //! 2D: (11, 10)
+            //! 3D: (11, 10, 9)
+            //! 4D: (11, 10, 9, 8)
+            template<std::size_t Tidx>
+            struct ForExtentBuf
             {
-                return static_cast<TIdx>(5u - Tidx);
-            }
-        };
+                //-----------------------------------------------------------------------------
+                ALPAKA_FN_HOST_ACC static auto create()
+                {
+                    return static_cast<TIdx>(11u - Tidx);
+                }
+            };
 
-        //#############################################################################
-        //! 1D: (4)
-        //! 2D: (4, 3)
-        //! 3D: (4, 3, 2)
-        //! 4D: (4, 3, 2, 1)
-        template<
-            std::size_t Tidx>
-        struct CreateExtentViewVal
-        {
-            //-----------------------------------------------------------------------------
-            ALPAKA_NO_HOST_ACC_WARNING
-            template<
-                typename TIdx>
-            ALPAKA_FN_HOST_ACC
-            static auto create(
-                TIdx)
-            -> TIdx
+            //#############################################################################
+            //! 1D: (8)
+            //! 2D: (8, 6)
+            //! 3D: (8, 6, 4)
+            //! 4D: (8, 6, 4, 2)
+            template<std::size_t Tidx>
+            struct ForExtentSubView
+            {
+                //-----------------------------------------------------------------------------
+                ALPAKA_FN_HOST_ACC static auto create()
+                {
+                    return static_cast<TIdx>(8u - (Tidx * 2u));
+                }
+            };
+
+            //#############################################################################
+            //! 1D: (2)
+            //! 2D: (2, 3)
+            //! 3D: (2, 3, 4)
+            //! 4D: (2, 3, 4, 5)
+            template<std::size_t Tidx>
+            struct ForOffset
             {
-                return static_cast<TIdx>(4u - Tidx);
-            }
+                //-----------------------------------------------------------------------------
+                ALPAKA_FN_HOST_ACC static auto create()
+                {
+                    return static_cast<TIdx>(2u + Tidx);
+                }
+            };
         };
-    }
-}
+    } // namespace test
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/test/common/include/alpaka/test/KernelExecutionFixture.hpp b/thirdParty/cupla/alpaka/test/common/include/alpaka/test/KernelExecutionFixture.hpp
index ea11697c0e..cc28f325c1 100644
--- a/thirdParty/cupla/alpaka/test/common/include/alpaka/test/KernelExecutionFixture.hpp
+++ b/thirdParty/cupla/alpaka/test/common/include/alpaka/test/KernelExecutionFixture.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Benjamin Worpitz
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -10,7 +10,6 @@
 #pragma once
 
 #include <alpaka/alpaka.hpp>
-
 #include <alpaka/test/Check.hpp>
 #include <alpaka/test/queue/Queue.hpp>
 
@@ -20,74 +19,71 @@ namespace alpaka
     {
         //#############################################################################
         //! The fixture for executing a kernel on a given accelerator.
-        template<
-            typename TAcc>
+        template<typename TAcc>
         class KernelExecutionFixture
         {
         public:
             using Acc = TAcc;
-            using Dim = alpaka::dim::Dim<Acc>;
-            using Idx = alpaka::idx::Idx<Acc>;
-            using DevAcc = alpaka::dev::Dev<Acc>;
-            using PltfAcc = alpaka::pltf::Pltf<DevAcc>;
-            using QueueAcc = alpaka::test::queue::DefaultQueue<DevAcc>;
+            using Dim = alpaka::Dim<Acc>;
+            using Idx = alpaka::Idx<Acc>;
+            using DevAcc = alpaka::Dev<Acc>;
+            using PltfAcc = alpaka::Pltf<DevAcc>;
+            using QueueAcc = alpaka::test::DefaultQueue<DevAcc>;
+            using WorkDiv = alpaka::WorkDivMembers<Dim, Idx>;
 
         public:
             //-----------------------------------------------------------------------------
-            template<
-                typename TExtent>
-            KernelExecutionFixture(
-                TExtent const & extent) :
-                    m_devHost(alpaka::pltf::getDevByIdx<pltf::PltfCpu>(0u)),
-                    m_devAcc(alpaka::pltf::getDevByIdx<PltfAcc>(0u)),
-                    m_queue(m_devAcc),
-                    m_workDiv(
-                        alpaka::workdiv::getValidWorkDiv<Acc>(
-                            m_devAcc,
-                            extent,
-                            alpaka::vec::Vec<Dim, Idx>::ones(),
-                            false,
-                            alpaka::workdiv::GridBlockExtentSubDivRestrictions::Unrestricted))
-            {}
+            template<typename TExtent>
+            KernelExecutionFixture(TExtent const& extent)
+                : m_devHost(alpaka::getDevByIdx<PltfCpu>(0u))
+                , m_devAcc(alpaka::getDevByIdx<PltfAcc>(0u))
+                , m_queue(m_devAcc)
+                , m_workDiv(alpaka::getValidWorkDiv<Acc>(
+                      m_devAcc,
+                      extent,
+                      alpaka::Vec<Dim, Idx>::ones(),
+                      false,
+                      alpaka::GridBlockExtentSubDivRestrictions::Unrestricted))
+            {
+            }
             //-----------------------------------------------------------------------------
-            template<
-                typename TKernelFnObj,
-                typename... TArgs>
-            auto operator()(
-                TKernelFnObj const & kernelFnObj,
-                TArgs && ... args)
-            -> bool
+            KernelExecutionFixture(WorkDiv const& workDiv)
+                : m_devHost(alpaka::getDevByIdx<PltfCpu>(0u))
+                , m_devAcc(alpaka::getDevByIdx<PltfAcc>(0u))
+                , m_queue(m_devAcc)
+                , m_workDiv(workDiv)
+            {
+            }
+            //-----------------------------------------------------------------------------
+            template<typename TKernelFnObj, typename... TArgs>
+            auto operator()(TKernelFnObj const& kernelFnObj, TArgs&&... args) -> bool
             {
                 // Allocate the result value
-                auto bufAccResult(alpaka::mem::buf::alloc<bool, Idx>(m_devAcc, static_cast<Idx>(1u)));
-                alpaka::mem::view::set(
-                    m_queue,
-                    bufAccResult,
-                    static_cast<std::uint8_t>(true),
-                    bufAccResult);
+                auto bufAccResult(alpaka::allocBuf<bool, Idx>(m_devAcc, static_cast<Idx>(1u)));
+                alpaka::memset(m_queue, bufAccResult, static_cast<std::uint8_t>(true), bufAccResult);
 
-                alpaka::kernel::exec<Acc>(
+                alpaka::exec<Acc>(
                     m_queue,
                     m_workDiv,
                     kernelFnObj,
-                    alpaka::mem::view::getPtrNative(bufAccResult),
+                    alpaka::getPtrNative(bufAccResult),
                     std::forward<TArgs>(args)...);
 
                 // Copy the result value to the host
-                auto bufHostResult(alpaka::mem::buf::alloc<bool, Idx>(m_devHost, static_cast<Idx>(1u)));
-                alpaka::mem::view::copy(m_queue, bufHostResult, bufAccResult, bufAccResult);
-                alpaka::wait::wait(m_queue);
+                auto bufHostResult(alpaka::allocBuf<bool, Idx>(m_devHost, static_cast<Idx>(1u)));
+                alpaka::memcpy(m_queue, bufHostResult, bufAccResult, bufAccResult);
+                alpaka::wait(m_queue);
 
-                auto const result(*alpaka::mem::view::getPtrNative(bufHostResult));
+                auto const result(*alpaka::getPtrNative(bufHostResult));
 
                 return result;
             }
 
         private:
-            alpaka::dev::DevCpu m_devHost;
+            alpaka::DevCpu m_devHost;
             DevAcc m_devAcc;
             QueueAcc m_queue;
-            alpaka::workdiv::WorkDivMembers<Dim, Idx> m_workDiv;
+            WorkDiv m_workDiv;
         };
-    }
-}
+    } // namespace test
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/test/common/include/alpaka/test/MeasureKernelRunTime.hpp b/thirdParty/cupla/alpaka/test/common/include/alpaka/test/MeasureKernelRunTime.hpp
index 790b8fd512..038786caaa 100644
--- a/thirdParty/cupla/alpaka/test/common/include/alpaka/test/MeasureKernelRunTime.hpp
+++ b/thirdParty/cupla/alpaka/test/common/include/alpaka/test/MeasureKernelRunTime.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Benjamin Worpitz
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -22,32 +22,25 @@ namespace alpaka
         {
             //-----------------------------------------------------------------------------
             //! \return The run time of the given kernel.
-            template<
-                typename TQueue,
-                typename TTask>
-            auto measureTaskRunTimeMs(
-                TQueue & queue,
-                TTask && task)
-            -> std::chrono::milliseconds::rep
+            template<typename TQueue, typename TTask>
+            auto measureTaskRunTimeMs(TQueue& queue, TTask&& task) -> std::chrono::milliseconds::rep
             {
 #if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
-                std::cout
-                    << "measureKernelRunTime("
-                    << " queue: " << typeid(TQueue).name()
-                    << " task: " << typeid(typename std::decay<TTask>::type).name()
-                    << ")" << std::endl;
+                std::cout << "measureKernelRunTime("
+                          << " queue: " << typeid(TQueue).name() << " task: " << typeid(std::decay_t<TTask>).name()
+                          << ")" << std::endl;
 #endif
                 // Wait for the queue to finish all tasks enqueued prior to the giventask.
-                alpaka::wait::wait(queue);
+                alpaka::wait(queue);
 
                 // Take the time prior to the execution.
                 auto const tpStart(std::chrono::high_resolution_clock::now());
 
                 // Enqueue the task.
-                alpaka::queue::enqueue(queue, std::forward<TTask>(task));
+                alpaka::enqueue(queue, std::forward<TTask>(task));
 
                 // Wait for the queue to finish the task execution to measure its run time.
-                alpaka::wait::wait(queue);
+                alpaka::wait(queue);
 
                 // Take the time after the execution.
                 auto const tpEnd(std::chrono::high_resolution_clock::now());
@@ -57,6 +50,6 @@ namespace alpaka
                 // Return the duration.
                 return std::chrono::duration_cast<std::chrono::milliseconds>(durElapsed).count();
             }
-        }
-    }
-}
+        } // namespace integ
+    } // namespace test
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/test/common/include/alpaka/test/acc/TestAccs.hpp b/thirdParty/cupla/alpaka/test/common/include/alpaka/test/acc/TestAccs.hpp
index a418f80a7c..e7a4db9759 100644
--- a/thirdParty/cupla/alpaka/test/common/include/alpaka/test/acc/TestAccs.hpp
+++ b/thirdParty/cupla/alpaka/test/common/include/alpaka/test/acc/TestAccs.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Benjamin Worpitz, Erik Zenker, Matthias Werner
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -10,22 +10,21 @@
 #pragma once
 
 #include <alpaka/alpaka.hpp>
-
 #include <alpaka/test/dim/TestDims.hpp>
 #include <alpaka/test/idx/TestIdxs.hpp>
 
+#include <iosfwd>
 #include <tuple>
 #include <type_traits>
-#include <iosfwd>
 
 // When compiling the tests with CUDA enabled (nvcc or native clang) on the CI infrastructure
 // we have to dramatically reduce the number of tested combinations.
 // Else the log length would be exceeded.
 #if defined(ALPAKA_CI)
-  #if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && BOOST_LANG_CUDA \
-   || defined(ALPAKA_ACC_GPU_HIP_ENABLED) && BOOST_LANG_HIP && !BOOST_COMP_HCC
-    #define ALPAKA_CUDA_CI
-  #endif
+#    if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && BOOST_LANG_CUDA                                                       \
+        || defined(ALPAKA_ACC_GPU_HIP_ENABLED) && BOOST_LANG_HIP
+#        define ALPAKA_CUDA_CI
+#    endif
 #endif
 
 namespace alpaka
@@ -35,251 +34,198 @@ namespace alpaka
     namespace test
     {
         //-----------------------------------------------------------------------------
-        //! The test accelerator specifics.
-        namespace acc
+        //! The detail namespace is used to separate implementation details from user accessible code.
+        namespace detail
         {
-            //-----------------------------------------------------------------------------
-            //! The detail namespace is used to separate implementation details from user accessible code.
-            namespace detail
-            {
 #if defined(ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED)
-                template<
-                    typename TDim,
-                    typename TIdx>
-                using AccCpuSerialIfAvailableElseInt = alpaka::acc::AccCpuSerial<TDim, TIdx>;
+            template<typename TDim, typename TIdx>
+            using AccCpuSerialIfAvailableElseInt = alpaka::AccCpuSerial<TDim, TIdx>;
 #else
-                template<
-                    typename TDim,
-                    typename TIdx>
-                using AccCpuSerialIfAvailableElseInt = int;
+            template<typename TDim, typename TIdx>
+            using AccCpuSerialIfAvailableElseInt = int;
 #endif
 #if defined(ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED) && !defined(ALPAKA_CUDA_CI)
-                template<
-                    typename TDim,
-                    typename TIdx>
-                using AccCpuThreadsIfAvailableElseInt = alpaka::acc::AccCpuThreads<TDim, TIdx>;
+            template<typename TDim, typename TIdx>
+            using AccCpuThreadsIfAvailableElseInt = alpaka::AccCpuThreads<TDim, TIdx>;
 #else
-                template<
-                    typename TDim,
-                    typename TIdx>
-                using AccCpuThreadsIfAvailableElseInt = int;
+            template<typename TDim, typename TIdx>
+            using AccCpuThreadsIfAvailableElseInt = int;
 #endif
 #if defined(ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLED)
-                template<
-                    typename TDim,
-                    typename TIdx>
-                using AccCpuFibersIfAvailableElseInt = alpaka::acc::AccCpuFibers<TDim, TIdx>;
+            template<typename TDim, typename TIdx>
+            using AccCpuFibersIfAvailableElseInt = alpaka::AccCpuFibers<TDim, TIdx>;
 #else
-                template<
-                    typename TDim,
-                    typename TIdx>
-                using AccCpuFibersIfAvailableElseInt = int;
+            template<typename TDim, typename TIdx>
+            using AccCpuFibersIfAvailableElseInt = int;
 #endif
 #if defined(ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED)
-                template<
-                    typename TDim,
-                    typename TIdx>
-                using AccCpuTbbIfAvailableElseInt = alpaka::acc::AccCpuTbbBlocks<TDim, TIdx>;
+            template<typename TDim, typename TIdx>
+            using AccCpuTbbIfAvailableElseInt = alpaka::AccCpuTbbBlocks<TDim, TIdx>;
 #else
-                template<
-                    typename TDim,
-                    typename TIdx>
-                using AccCpuTbbIfAvailableElseInt = int;
+            template<typename TDim, typename TIdx>
+            using AccCpuTbbIfAvailableElseInt = int;
 #endif
 #if defined(ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLED)
-                template<
-                    typename TDim,
-                    typename TIdx>
-                using AccCpuOmp2BlocksIfAvailableElseInt = alpaka::acc::AccCpuOmp2Blocks<TDim, TIdx>;
+            template<typename TDim, typename TIdx>
+            using AccCpuOmp2BlocksIfAvailableElseInt = alpaka::AccCpuOmp2Blocks<TDim, TIdx>;
 #else
-                template<
-                    typename TDim,
-                    typename TIdx>
-                using AccCpuOmp2BlocksIfAvailableElseInt = int;
+            template<typename TDim, typename TIdx>
+            using AccCpuOmp2BlocksIfAvailableElseInt = int;
 #endif
 #if defined(ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLED) && !defined(ALPAKA_CUDA_CI)
-                template<
-                    typename TDim,
-                    typename TIdx>
-                using AccCpuOmp2ThreadsIfAvailableElseInt = alpaka::acc::AccCpuOmp2Threads<TDim, TIdx>;
+            template<typename TDim, typename TIdx>
+            using AccCpuOmp2ThreadsIfAvailableElseInt = alpaka::AccCpuOmp2Threads<TDim, TIdx>;
+#else
+            template<typename TDim, typename TIdx>
+            using AccCpuOmp2ThreadsIfAvailableElseInt = int;
+#endif
+#if defined(ALPAKA_ACC_ANY_BT_OMP5_ENABLED) && !defined(TEST_UNIT_KERNEL_KERNEL_STD_FUNCTION)                         \
+    && !(                                                                                                             \
+        BOOST_COMP_GNUC                                                                                               \
+        && (((BOOST_COMP_GNUC < BOOST_VERSION_NUMBER(11, 0, 0)) /* tests excluded because of GCC10 Oacc / Omp5 target \
+                                                                   symbol bug with multiple units */                  \
+             && (defined(TEST_UNIT_BLOCK_SHARED) || defined(TEST_UNIT_BLOCK_SYNC) || defined(TEST_UNIT_WARP)          \
+                 || defined(TEST_UNIT_INTRINSIC) || defined(TEST_UNIT_KERNEL) || defined(TEST_UNIT_MEM_VIEW)))        \
+            || defined(TEST_UNIT_MATH) /* because of static const members */                                          \
+            ))                                                                                                        \
+    && !(                                                                                                             \
+        !defined(ALPAKA_DEBUG_OFFLOAD_ASSUME_HOST)                                                                    \
+        && (defined(TEST_UNIT_ATOMIC) /* clang nvptx atomic ICEs */                                                   \
+            ))
+            template<typename TDim, typename TIdx>
+            using AccOmp5IfAvailableElseInt = alpaka::AccOmp5<TDim, TIdx>;
+#else
+            template<typename TDim, typename TIdx>
+            using AccOmp5IfAvailableElseInt = int;
+#endif
+#if defined(ALPAKA_ACC_ANY_BT_OACC_ENABLED) && !(defined(TEST_UNIT_KERNEL_KERNEL_STD_FUNCTION))                       \
+    && !(                                                                                                             \
+        BOOST_COMP_GNUC                                                                                               \
+        && (((BOOST_COMP_GNUC < BOOST_VERSION_NUMBER(11, 0, 0)) /* tests excluded because of GCC10 Oacc / Omp5 target \
+                                                                   symbol bug with multiple units */                  \
+             && (defined(TEST_UNIT_BLOCK_SHARED) || defined(TEST_UNIT_BLOCK_SYNC) || defined(TEST_UNIT_WARP)          \
+                 || defined(TEST_UNIT_INTRINSIC) || defined(TEST_UNIT_KERNEL) || defined(TEST_UNIT_MEM_VIEW)))        \
+            || defined(TEST_UNIT_MATH) /* because of static const members */                                          \
+            || defined(TEST_UNIT_MEM_BUF) /* actually works, but hangs when ran by ctest */                           \
+            ))
+            template<typename TDim, typename TIdx>
+            using AccOaccIfAvailableElseInt = alpaka::AccOacc<TDim, TIdx>;
 #else
-                template<
-                    typename TDim,
-                    typename TIdx>
-                using AccCpuOmp2ThreadsIfAvailableElseInt = int;
+            template<typename TDim, typename TIdx>
+            using AccOaccIfAvailableElseInt = int;
 #endif
-#if defined(ALPAKA_ACC_CPU_BT_OMP4_ENABLED) && !defined(ALPAKA_CUDA_CI)
-                template<
-                    typename TDim,
-                    typename TIdx>
-                using AccCpuOmp4IfAvailableElseInt = alpaka::acc::AccCpuOmp4<TDim, TIdx>;
+#if(defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && BOOST_LANG_CUDA) || (defined(ALPAKA_ACC_GPU_HIP_ENABLED) && BOOST_LANG_HIP)
+            template<typename TDim, typename TIdx>
+            using AccGpuUniformCudaHipRtIfAvailableElseInt = alpaka::AccGpuUniformCudaHipRt<TDim, TIdx>;
 #else
-                template<
-                    typename TDim,
-                    typename TIdx>
-                using AccCpuOmp4IfAvailableElseInt = int;
+            template<typename TDim, typename TIdx>
+            using AccGpuUniformCudaHipRtIfAvailableElseInt = int;
 #endif
+
 #if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && BOOST_LANG_CUDA
-                template<
-                    typename TDim,
-                    typename TIdx>
-                using AccGpuCudaRtIfAvailableElseInt = alpaka::acc::AccGpuCudaRt<TDim, TIdx>;
+            template<typename TDim, typename TIdx>
+            using AccGpuCudaRtIfAvailableElseInt = alpaka::AccGpuCudaRt<TDim, TIdx>;
 #else
-                template<
-                    typename TDim,
-                    typename TIdx>
-                using AccGpuCudaRtIfAvailableElseInt = int;
+            template<typename TDim, typename TIdx>
+            using AccGpuCudaRtIfAvailableElseInt = int;
 #endif
 #if defined(ALPAKA_ACC_GPU_HIP_ENABLED) && BOOST_LANG_HIP
-                template<
-                    typename TDim,
-                    typename TIdx>
-                using AccGpuHipRtIfAvailableElseInt = typename
-                    std::conditional<
-                    std::is_same<TDim,alpaka::dim::DimInt<3u>>::value==false,
-                    alpaka::acc::AccGpuHipRt<TDim, TIdx>,
-                    int>::type;
+            template<typename TDim, typename TIdx>
+            using AccGpuHipRtIfAvailableElseInt = typename std::conditional<
+                std::is_same<TDim, alpaka::DimInt<3u>>::value == false,
+                alpaka::AccGpuHipRt<TDim, TIdx>,
+                int>::type;
 #else
-                template<
-                    typename TDim,
-                    typename TIdx>
-                using AccGpuHipRtIfAvailableElseInt = int;
+            template<typename TDim, typename TIdx>
+            using AccGpuHipRtIfAvailableElseInt = int;
 #endif
-                //#############################################################################
-                //! A vector containing all available accelerators and void's.
-                template<
-                    typename TDim,
-                    typename TIdx>
-                using EnabledAccsElseInt =
-                    std::tuple<
-                        AccCpuSerialIfAvailableElseInt<TDim, TIdx>,
-                        AccCpuThreadsIfAvailableElseInt<TDim, TIdx>,
-                        AccCpuFibersIfAvailableElseInt<TDim, TIdx>,
-                        AccCpuTbbIfAvailableElseInt<TDim, TIdx>,
-                        AccCpuOmp2BlocksIfAvailableElseInt<TDim, TIdx>,
-                        AccCpuOmp2ThreadsIfAvailableElseInt<TDim, TIdx>,
-                        AccCpuOmp4IfAvailableElseInt<TDim, TIdx>,
-                        AccGpuCudaRtIfAvailableElseInt<TDim, TIdx>,
-                        AccGpuHipRtIfAvailableElseInt<TDim, TIdx>
-                    >;
-            }
-
             //#############################################################################
-            //! A vector containing all available accelerators.
-            template<
-                typename TDim,
-                typename TIdx>
-            using EnabledAccs =
-                typename alpaka::meta::Filter<
-                    detail::EnabledAccsElseInt<TDim, TIdx>,
-                    std::is_class
-                >;
-
-            namespace detail
+            //! A std::tuple of all accelerators, where each unavailable accelerator is replaced by int.
+            template<typename TDim, typename TIdx>
+            using EnabledAccsElseInt = std::tuple<
+                AccCpuSerialIfAvailableElseInt<TDim, TIdx>,
+                AccCpuThreadsIfAvailableElseInt<TDim, TIdx>,
+                AccCpuFibersIfAvailableElseInt<TDim, TIdx>,
+                AccCpuTbbIfAvailableElseInt<TDim, TIdx>,
+                AccCpuOmp2BlocksIfAvailableElseInt<TDim, TIdx>,
+                AccCpuOmp2ThreadsIfAvailableElseInt<TDim, TIdx>,
+                AccOmp5IfAvailableElseInt<TDim, TIdx>,
+                AccOaccIfAvailableElseInt<TDim, TIdx>,
+                AccGpuUniformCudaHipRtIfAvailableElseInt<TDim, TIdx>,
+                AccGpuCudaRtIfAvailableElseInt<TDim, TIdx>,
+                AccGpuHipRtIfAvailableElseInt<TDim, TIdx>>;
+        } // namespace detail
+
+        //#############################################################################
+        //! All available accelerators: EnabledAccsElseInt filtered down to its class-type entries.
+        template<typename TDim, typename TIdx>
+        using EnabledAccs = typename alpaka::meta::Filter<detail::EnabledAccsElseInt<TDim, TIdx>, std::is_class>;
+
+        namespace detail
+        {
+            //#############################################################################
+            //! The accelerator name write wrapper.
+            struct StreamOutAccName
             {
-                //#############################################################################
-                //! The accelerator name write wrapper.
-                struct StreamOutAccName
+                template<typename TAcc>
+                ALPAKA_FN_HOST auto operator()(std::ostream& os) -> void
                 {
-                    template<
-                        typename TAcc>
-                    ALPAKA_FN_HOST auto operator()(
-                        std::ostream & os)
-                    -> void
-                    {
-                        os << alpaka::acc::getAccName<TAcc>();
-                        os << " ";
-                    }
-                };
-            }
+                    os << alpaka::getAccName<TAcc>();
+                    os << " ";
+                }
+            };
+        } // namespace detail
 
-            //-----------------------------------------------------------------------------
-            //! Writes the enabled accelerators to the given stream.
-            template<
-                typename TDim,
-                typename TIdx>
-            ALPAKA_FN_HOST auto writeEnabledAccs(
-                std::ostream & os)
-            -> void
-            {
-                os << "Accelerators enabled: ";
-
-                alpaka::meta::forEachType<
-                    EnabledAccs<TDim, TIdx>>(
-                        detail::StreamOutAccName(),
-                        std::ref(os));
+        //-----------------------------------------------------------------------------
+        //! Writes the enabled accelerators to the given stream.
+        template<typename TDim, typename TIdx>
+        ALPAKA_FN_HOST auto writeEnabledAccs(std::ostream& os) -> void
+        {
+            os << "Accelerators enabled: ";
 
-                os << std::endl;
-            }
+            alpaka::meta::forEachType<EnabledAccs<TDim, TIdx>>(detail::StreamOutAccName(), std::ref(os));
 
-            namespace detail
-            {
-                //#############################################################################
-                //! A std::tuple holding multiple std::tuple consisting of a dimension and a idx type.
-                //!
-                //! TestDimIdxTuples =
-                //!     tuple<
-                //!         tuple<Dim1,Idx1>,
-                //!         tuple<Dim2,Idx1>,
-                //!         tuple<Dim3,Idx1>,
-                //!         ...,
-                //!         tuple<DimN,IdxN>>
-                using TestDimIdxTuples =
-                    alpaka::meta::CartesianProduct<
-                        std::tuple,
-                        dim::TestDims,
-                        idx::TestIdxs
-                    >;
-
-                //#############################################################################
-                //! Transforms a std::tuple holding a dimension and a idx type to a fully instantiated accelerator.
-                //!
-                //! EnabledAccs<Dim,Idx> = tuple<Acc1<Dim,Idx>, ..., AccN<Dim,Idx>>
-                template<
-                    typename TTestAccParamSet>
-                struct InstantiateEnabledAccsWithTestParamSetImpl
-                {
-                    using type =
-                        EnabledAccs<
-                            typename std::tuple_element<0, TTestAccParamSet>::type,
-                            typename std::tuple_element<1, TTestAccParamSet>::type
-                        >;
-                };
+            os << std::endl;
+        }
 
-                template<
-                    typename TTestAccParamSet>
-                using InstantiateEnabledAccsWithTestParamSet = typename InstantiateEnabledAccsWithTestParamSetImpl<TTestAccParamSet>::type;
+        namespace detail
+        {
+            //#############################################################################
+            //! A std::tuple holding multiple std::tuples, each consisting of a dimension and an idx type.
+            //!
+            //! TestDimIdxTuples =
+            //!     tuple<
+            //!         tuple<Dim1,Idx1>,
+            //!         tuple<Dim2,Idx1>,
+            //!         tuple<Dim3,Idx1>,
+            //!         ...,
+            //!         tuple<DimN,IdxN>>
+            using TestDimIdxTuples = alpaka::meta::CartesianProduct<std::tuple, TestDims, TestIdxs>;
 
-                //#############################################################################
-                //! A std::tuple containing std::tuple with fully instantiated accelerators.
-                //!
-                //! TestEnabledAccs =
-                //!     tuple<
-                //!         tuple<Acc1<Dim1,Idx1>, ..., AccN<Dim1,Idx1>>,
-                //!         tuple<Acc1<Dim2,Idx1>, ..., AccN<Dim2,Idx1>>,
-                //!         ...,
-                //!         tuple<Acc1<DimN,IdxN>, ..., AccN<DimN,IdxN>>>
-                using InstantiatedEnabledAccs =
-                    alpaka::meta::Transform<
-                        TestDimIdxTuples,
-                        InstantiateEnabledAccsWithTestParamSet
-                    >;
-            }
+            template<typename TList>
+            using ApplyEnabledAccs = alpaka::meta::Apply<TList, EnabledAccs>;
 
             //#############################################################################
-            //! A std::tuple containing fully instantiated accelerators.
+            //! A std::tuple containing std::tuples of fully instantiated accelerators.
             //!
-            //! TestAccs =
+            //! InstantiatedEnabledAccs =
             //!     tuple<
-            //!         Acc1<Dim1,Idx1>, ..., AccN<Dim1,Idx1>,
-            //!         Acc1<Dim2,Idx1>, ..., AccN<Dim2,Idx1>,
+            //!         tuple<Acc1<Dim1,Idx1>, ..., AccN<Dim1,Idx1>>,
+            //!         tuple<Acc1<Dim2,Idx1>, ..., AccN<Dim2,Idx1>>,
             //!         ...,
-            //!         Acc1<DimN,IdxN>, ..., AccN<DimN,IdxN>>
-            using TestAccs =
-                alpaka::meta::Apply<
-                    detail::InstantiatedEnabledAccs,
-                    alpaka::meta::Concatenate
-                >;
-        }
-    }
-}
+            //!         tuple<Acc1<DimN,IdxN>, ..., AccN<DimN,IdxN>>>
+            using InstantiatedEnabledAccs = alpaka::meta::Transform<TestDimIdxTuples, ApplyEnabledAccs>;
+        } // namespace detail
+
+        //#############################################################################
+        //! A std::tuple containing fully instantiated accelerators.
+        //!
+        //! TestAccs =
+        //!     tuple<
+        //!         Acc1<Dim1,Idx1>, ..., AccN<Dim1,Idx1>,
+        //!         Acc1<Dim2,Idx1>, ..., AccN<Dim2,Idx1>,
+        //!         ...,
+        //!         Acc1<DimN,IdxN>, ..., AccN<DimN,IdxN>>
+        using TestAccs = alpaka::meta::Apply<detail::InstantiatedEnabledAccs, alpaka::meta::Concatenate>;
+    } // namespace test
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/test/common/include/alpaka/test/dim/TestDims.hpp b/thirdParty/cupla/alpaka/test/common/include/alpaka/test/dim/TestDims.hpp
index f88ca302ac..68556f391e 100644
--- a/thirdParty/cupla/alpaka/test/common/include/alpaka/test/dim/TestDims.hpp
+++ b/thirdParty/cupla/alpaka/test/common/include/alpaka/test/dim/TestDims.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Benjamin Worpitz
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -13,38 +13,23 @@
 
 #include <tuple>
 
-// When compiling the tests with CUDA enabled (nvcc or native clang) on the CI infrastructure
-// we have to dramatically reduce the number of tested combinations.
-// Else the log length would be exceeded.
-#if defined(ALPAKA_CI)
-  #if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && BOOST_LANG_CUDA \
-   || defined(ALPAKA_ACC_GPU_HIP_ENABLED) && BOOST_LANG_HIP && !BOOST_COMP_HCC
-    #define ALPAKA_CUDA_CI
-  #endif
-#endif
-
 namespace alpaka
 {
     namespace test
     {
-        namespace dim
-        {
-            //#############################################################################
-            //! A std::tuple holding dimensions.
-            using TestDims =
-                std::tuple<
-                    alpaka::dim::DimInt<1u>
-#if !defined(ALPAKA_CUDA_CI)
-                    ,alpaka::dim::DimInt<2u>
-#endif
-                    ,alpaka::dim::DimInt<3u>
-                    // The CUDA & HIP accelerators do not currently support 4D buffers and 4D acceleration.
+        //#############################################################################
+        //! A std::tuple holding dimensions.
+        using TestDims = std::tuple<
+            alpaka::DimInt<1u>,
+            alpaka::DimInt<2u>,
+            alpaka::DimInt<3u>
+        // The CUDA & HIP accelerators do not currently support 4D buffers and 4D acceleration.
 #if !(defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && BOOST_LANG_CUDA)
-  #if !(defined(ALPAKA_ACC_GPU_HIP_ENABLED) && BOOST_LANG_HIP)
-                    ,alpaka::dim::DimInt<4u>
-  #endif
+#    if !(defined(ALPAKA_ACC_GPU_HIP_ENABLED) && BOOST_LANG_HIP)
+            ,
+            alpaka::DimInt<4u>
+#    endif
 #endif
-                >;
-        }
-    }
-}
+            >;
+    } // namespace test
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/test/common/include/alpaka/test/event/EventHostManualTrigger.hpp b/thirdParty/cupla/alpaka/test/common/include/alpaka/test/event/EventHostManualTrigger.hpp
index d7d58ae867..71f5d23711 100644
--- a/thirdParty/cupla/alpaka/test/common/include/alpaka/test/event/EventHostManualTrigger.hpp
+++ b/thirdParty/cupla/alpaka/test/common/include/alpaka/test/event/EventHostManualTrigger.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Benjamin Worpitz, Matthias Werner
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -11,8 +11,8 @@
 
 #include <alpaka/alpaka.hpp>
 
-#include <mutex>
 #include <condition_variable>
+#include <mutex>
 
 namespace alpaka
 {
@@ -21,983 +21,868 @@ namespace alpaka
     //-----------------------------------------------------------------------------
     namespace test
     {
-        //-----------------------------------------------------------------------------
-        //! The test event specifics.
-        //-----------------------------------------------------------------------------
-        namespace event
+        namespace traits
         {
-            namespace traits
-            {
-                //#############################################################################
-                //!
-                //#############################################################################
-                template<
-                    typename TDev>
-                struct EventHostManualTriggerType;
-                //#############################################################################
-                //!
-                //#############################################################################
-                template<
-                    typename TDev>
-                struct IsEventHostManualTriggerSupported;
-            }
-
             //#############################################################################
-            //! The event host manual trigger type trait alias template to remove the ::type.
+            //!
             //#############################################################################
-            template<
-                typename TDev>
-            using EventHostManualTrigger = typename traits::EventHostManualTriggerType<TDev>::type;
-
-            //-----------------------------------------------------------------------------
-            template<
-                typename TDev>
-            ALPAKA_FN_HOST auto isEventHostManualTriggerSupported(
-                TDev const & dev)
-            -> bool
-            {
-                return
-                    traits::IsEventHostManualTriggerSupported<
-                        TDev>
-                    ::isSupported(
-                        dev);
-            }
+            template<typename TDev>
+            struct EventHostManualTriggerType;
+            //#############################################################################
+            //!
+            //#############################################################################
+            template<typename TDev>
+            struct IsEventHostManualTriggerSupported;
+        } // namespace traits
+
+        //#############################################################################
+        //! The event host manual trigger type trait alias template to remove the ::type.
+        //#############################################################################
+        template<typename TDev>
+        using EventHostManualTrigger = typename traits::EventHostManualTriggerType<TDev>::type;
 
-            namespace cpu
+        //-----------------------------------------------------------------------------
+        template<typename TDev>
+        ALPAKA_FN_HOST auto isEventHostManualTriggerSupported(TDev const& dev) -> bool
+        {
+            return traits::IsEventHostManualTriggerSupported<TDev>::isSupported(dev);
+        }
+
+        namespace cpu
+        {
+            namespace detail
             {
-                namespace detail
+                //#############################################################################
+                //! Event that can be enqueued into a queue and can be triggered by the Host.
+                //#############################################################################
+                template<class TDev = DevCpu>
+                class EventHostManualTriggerCpuImpl
                 {
-                    //#############################################################################
-                    //! Event that can be enqueued into a queue and can be triggered by the Host.
-                    //#############################################################################
-                    class EventHostManualTriggerCpuImpl
+                public:
+                    //-----------------------------------------------------------------------------
+                    //! Constructor.
+                    //-----------------------------------------------------------------------------
+                    ALPAKA_FN_HOST EventHostManualTriggerCpuImpl(TDev const& dev) noexcept
+                        : m_dev(dev)
+                        , m_mutex()
+                        , m_enqueueCount(0u)
+                        , m_bIsReady(true)
+                    {
+                    }
+                    //-----------------------------------------------------------------------------
+                    //! Copy constructor.
+                    //-----------------------------------------------------------------------------
+                    EventHostManualTriggerCpuImpl(EventHostManualTriggerCpuImpl const& other) = delete;
+                    //-----------------------------------------------------------------------------
+                    //! Move constructor.
+                    //-----------------------------------------------------------------------------
+                    EventHostManualTriggerCpuImpl(EventHostManualTriggerCpuImpl&&) = delete;
+                    //-----------------------------------------------------------------------------
+                    //! Copy assignment operator.
+                    //-----------------------------------------------------------------------------
+                    auto operator=(EventHostManualTriggerCpuImpl const&) -> EventHostManualTriggerCpuImpl& = delete;
+                    //-----------------------------------------------------------------------------
+                    //! Move assignment operator.
+                    //-----------------------------------------------------------------------------
+                    auto operator=(EventHostManualTriggerCpuImpl&&) -> EventHostManualTriggerCpuImpl& = delete;
+
+                    //-----------------------------------------------------------------------------
+                    //!
+                    //-----------------------------------------------------------------------------
+                    void trigger()
                     {
-                    public:
-                        //-----------------------------------------------------------------------------
-                        //! Constructor.
-                        //-----------------------------------------------------------------------------
-                        ALPAKA_FN_HOST EventHostManualTriggerCpuImpl(
-                            dev::DevCpu const & dev) noexcept :
-                                m_dev(dev),
-                                m_mutex(),
-                                m_enqueueCount(0u),
-                                m_bIsReady(true)
-                        {}
-                        //-----------------------------------------------------------------------------
-                        //! Copy constructor.
-                        //-----------------------------------------------------------------------------
-                        EventHostManualTriggerCpuImpl(EventHostManualTriggerCpuImpl const & other) = delete;
-                        //-----------------------------------------------------------------------------
-                        //! Move constructor.
-                        //-----------------------------------------------------------------------------
-                        EventHostManualTriggerCpuImpl(EventHostManualTriggerCpuImpl &&) = delete;
-                        //-----------------------------------------------------------------------------
-                        //! Copy assignment operator.
-                        //-----------------------------------------------------------------------------
-                        auto operator=(EventHostManualTriggerCpuImpl const &) -> EventHostManualTriggerCpuImpl & = delete;
-                        //-----------------------------------------------------------------------------
-                        //! Move assignment operator.
-                        //-----------------------------------------------------------------------------
-                        auto operator=(EventHostManualTriggerCpuImpl &&) -> EventHostManualTriggerCpuImpl & = delete;
-
-                        //-----------------------------------------------------------------------------
-                        //!
-                        //-----------------------------------------------------------------------------
-                        void trigger()
                         {
-                            {
-                                std::unique_lock<std::mutex> lock(m_mutex);
-                                m_bIsReady = true;
-                            }
-                            m_conditionVariable.notify_one();
+                            std::unique_lock<std::mutex> lock(m_mutex);
+                            m_bIsReady = true;
                         }
+                        m_conditionVariable.notify_one();
+                    }
 
-                    public:
-                        dev::DevCpu const m_dev;                                //!< The device this event is bound to.
+                public:
+                    TDev const m_dev; //!< The device this event is bound to.
 
-                        mutable std::mutex m_mutex;                             //!< The mutex used to synchronize access to the event.
+                    mutable std::mutex m_mutex; //!< The mutex used to synchronize access to the event.
 
-                        mutable std::condition_variable m_conditionVariable;    //!< The condition signaling the event completion.
-                        std::size_t m_enqueueCount;                             //!< The number of times this event has been enqueued.
+                    mutable std::condition_variable
+                        m_conditionVariable; //!< The condition signaling the event completion.
+                    std::size_t m_enqueueCount; //!< The number of times this event has been enqueued.
 
-                        bool m_bIsReady;                                        //!< If the event is not waiting within a queue (not enqueued or already completed).
-                    };
-                }
+                    bool m_bIsReady; //!< If the event is not waiting within a queue (not enqueued or already
+                                     //!< completed).
+                };
+            } // namespace detail
+        } // namespace cpu
+
+        //#############################################################################
+        //! Event that can be enqueued into a queue and can be triggered by the Host.
+        //#############################################################################
+        template<class TDev = DevCpu>
+        class EventHostManualTriggerCpu
+        {
+        public:
+            //-----------------------------------------------------------------------------
+            //! Constructor.
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST EventHostManualTriggerCpu(TDev const& dev)
+                : m_spEventImpl(std::make_shared<cpu::detail::EventHostManualTriggerCpuImpl<TDev>>(dev))
+            {
             }
-
-            //#############################################################################
-            //! Event that can be enqueued into a queue and can be triggered by the Host.
-            //#############################################################################
-            class EventHostManualTriggerCpu
+            //-----------------------------------------------------------------------------
+            //! Copy constructor.
+            //-----------------------------------------------------------------------------
+            EventHostManualTriggerCpu(EventHostManualTriggerCpu const&) = default;
+            //-----------------------------------------------------------------------------
+            //! Move constructor.
+            //-----------------------------------------------------------------------------
+            EventHostManualTriggerCpu(EventHostManualTriggerCpu&&) = default;
+            //-----------------------------------------------------------------------------
+            //! Copy assignment operator.
+            //-----------------------------------------------------------------------------
+            auto operator=(EventHostManualTriggerCpu const&) -> EventHostManualTriggerCpu& = default;
+            //-----------------------------------------------------------------------------
+            //! Move assignment operator.
+            //-----------------------------------------------------------------------------
+            auto operator=(EventHostManualTriggerCpu&&) -> EventHostManualTriggerCpu& = default;
+            //-----------------------------------------------------------------------------
+            //! Equality comparison operator.
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST auto operator==(EventHostManualTriggerCpu const& rhs) const -> bool
             {
-            public:
-                //-----------------------------------------------------------------------------
-                //! Constructor.
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST EventHostManualTriggerCpu(
-                    dev::DevCpu const & dev) :
-                        m_spEventImpl(std::make_shared<cpu::detail::EventHostManualTriggerCpuImpl>(dev))
-                {}
-                //-----------------------------------------------------------------------------
-                //! Copy constructor.
-                //-----------------------------------------------------------------------------
-                EventHostManualTriggerCpu(EventHostManualTriggerCpu const &) = default;
-                //-----------------------------------------------------------------------------
-                //! Move constructor.
-                //-----------------------------------------------------------------------------
-                EventHostManualTriggerCpu(EventHostManualTriggerCpu &&) = default;
-                //-----------------------------------------------------------------------------
-                //! Copy assignment operator.
-                //-----------------------------------------------------------------------------
-                auto operator=(EventHostManualTriggerCpu const &) -> EventHostManualTriggerCpu & = default;
-                //-----------------------------------------------------------------------------
-                //! Move assignment operator.
-                //-----------------------------------------------------------------------------
-                auto operator=(EventHostManualTriggerCpu &&) -> EventHostManualTriggerCpu & = default;
-                //-----------------------------------------------------------------------------
-                //! Equality comparison operator.
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST auto operator==(EventHostManualTriggerCpu const & rhs) const
-                -> bool
-                {
-                    return (m_spEventImpl == rhs.m_spEventImpl);
-                }
-                //-----------------------------------------------------------------------------
-                //! Inequality comparison operator.
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST auto operator!=(EventHostManualTriggerCpu const & rhs) const
-                -> bool
-                {
-                    return !((*this) == rhs);
-                }
-
-                //-----------------------------------------------------------------------------
-                //!
-                //-----------------------------------------------------------------------------
-                void trigger()
-                {
-                    m_spEventImpl->trigger();
-                }
-
-            public:
-                std::shared_ptr<cpu::detail::EventHostManualTriggerCpuImpl> m_spEventImpl;
-            };
+                return (m_spEventImpl == rhs.m_spEventImpl);
+            }
+            //-----------------------------------------------------------------------------
+            //! Inequality comparison operator.
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST auto operator!=(EventHostManualTriggerCpu const& rhs) const -> bool
+            {
+                return !((*this) == rhs);
+            }
 
-            namespace traits
+            //-----------------------------------------------------------------------------
+            //!
+            //-----------------------------------------------------------------------------
+            void trigger()
             {
-                //#############################################################################
-                //!
-                //#############################################################################
-                template<>
-                struct EventHostManualTriggerType<
-                    alpaka::dev::DevCpu>
-                {
-                    using type = alpaka::test::event::EventHostManualTriggerCpu;
-                };
-                //#############################################################################
-                //! The CPU event host manual trigger support get trait specialization.
-                template<>
-                struct IsEventHostManualTriggerSupported<
-                    alpaka::dev::DevCpu>
-                {
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST static auto isSupported(
-                        alpaka::dev::DevCpu const &)
-                    -> bool
-                    {
-                        return true;
-                    }
-                };
+                m_spEventImpl->trigger();
             }
-        }
-    }
-    namespace dev
-    {
+
+        public:
+            std::shared_ptr<cpu::detail::EventHostManualTriggerCpuImpl<TDev>> m_spEventImpl;
+        };
+
         namespace traits
         {
             //#############################################################################
-            //! The CPU device event device get trait specialization.
+            //!
             //#############################################################################
             template<>
-            struct GetDev<
-                test::event::EventHostManualTriggerCpu>
+            struct EventHostManualTriggerType<alpaka::DevCpu>
+            {
+                using type = alpaka::test::EventHostManualTriggerCpu<DevCpu>;
+            };
+#ifdef ALPAKA_ACC_ANY_BT_OMP5_ENABLED
+            //#############################################################################
+            //!
+            //#############################################################################
+            template<>
+            struct EventHostManualTriggerType<alpaka::DevOmp5>
+            {
+                using type = alpaka::test::EventHostManualTriggerCpu<alpaka::DevOmp5>;
+            };
+#elif defined(ALPAKA_ACC_ANY_BT_OACC_ENABLED)
+            //#############################################################################
+            //!
+            //#############################################################################
+            template<>
+            struct EventHostManualTriggerType<alpaka::DevOacc>
+            {
+                using type = alpaka::test::EventHostManualTriggerCpu<alpaka::DevOacc>;
+            };
+#endif
+            //#############################################################################
+            //! The CPU event host manual trigger support get trait specialization.
+            template<>
+            struct IsEventHostManualTriggerSupported<alpaka::DevCpu>
             {
                 //-----------------------------------------------------------------------------
-                //
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto getDev(
-                    test::event::EventHostManualTriggerCpu const & event)
-                -> dev::DevCpu
+                ALPAKA_FN_HOST static auto isSupported(alpaka::DevCpu const&) -> bool
                 {
-                    return event.m_spEventImpl->m_dev;
+                    return true;
                 }
             };
-        }
-    }
-    namespace event
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //! The CPU device event test trait specialization.
+#ifdef ALPAKA_ACC_ANY_BT_OMP5_ENABLED
             //#############################################################################
+            //! The Omp5 event host manual trigger support get trait specialization.
             template<>
-            struct Test<
-                test::event::EventHostManualTriggerCpu>
+            struct IsEventHostManualTriggerSupported<alpaka::DevOmp5>
             {
                 //-----------------------------------------------------------------------------
-                //! \return If the event is not waiting within a queue (not enqueued or already handled).
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto test(
-                    test::event::EventHostManualTriggerCpu const & event)
-                -> bool
+                ALPAKA_FN_HOST static auto isSupported(alpaka::DevOmp5 const&) -> bool
                 {
-                    std::lock_guard<std::mutex> lk(event.m_spEventImpl->m_mutex);
-
-                    return event.m_spEventImpl->m_bIsReady;
+                    return true;
                 }
             };
-        }
-    }
-    namespace queue
-    {
-        namespace traits
-        {
-            //#############################################################################
-            //!
+#elif defined(ALPAKA_ACC_ANY_BT_OACC_ENABLED)
             //#############################################################################
+            //! The OpenACC event host manual trigger support get trait specialization.
             template<>
-            struct Enqueue<
-                queue::QueueCpuNonBlocking,
-                test::event::EventHostManualTriggerCpu>
+            struct IsEventHostManualTriggerSupported<alpaka::DevOacc>
             {
                 //-----------------------------------------------------------------------------
-                //
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto enqueue(
+                ALPAKA_FN_HOST static auto isSupported(alpaka::DevOacc const&) -> bool
+                {
+                    return true;
+                }
+            };
+#endif
+        } // namespace traits
+    } // namespace test
+    namespace traits
+    {
+        //#############################################################################
+        //! The CPU device event device get trait specialization.
+        //#############################################################################
+        template<typename TDev>
+        struct GetDev<test::EventHostManualTriggerCpu<TDev>>
+        {
+            //-----------------------------------------------------------------------------
+            //
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto getDev(test::EventHostManualTriggerCpu<TDev> const& event) -> TDev
+            {
+                return event.m_spEventImpl->m_dev;
+            }
+        };
+
+        //#############################################################################
+        //! The CPU device event test trait specialization.
+        //#############################################################################
+        template<typename TDev>
+        struct IsComplete<test::EventHostManualTriggerCpu<TDev>>
+        {
+            //-----------------------------------------------------------------------------
+            //! \return If the event is not waiting within a queue (not enqueued or already handled).
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto isComplete(test::EventHostManualTriggerCpu<TDev> const& event) -> bool
+            {
+                std::lock_guard<std::mutex> lk(event.m_spEventImpl->m_mutex);
+
+                return event.m_spEventImpl->m_bIsReady;
+            }
+        };
+
+        //#############################################################################
+        //!
+        //#############################################################################
+        template<typename TDev>
+        struct Enqueue<QueueGenericThreadsNonBlocking<TDev>, test::EventHostManualTriggerCpu<TDev>>
+        {
+            //-----------------------------------------------------------------------------
+            //
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto enqueue(
 #if !(BOOST_COMP_CLANG_CUDA && BOOST_ARCH_PTX)
-                    queue::QueueCpuNonBlocking & queue,
+                QueueGenericThreadsNonBlocking<TDev>& queue,
 #else
-                    queue::QueueCpuNonBlocking &,
+                QueueGenericThreadsNonBlocking<TDev>&,
 #endif
-                    test::event::EventHostManualTriggerCpu & event)
-                -> void
-                {
-                    ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+                test::EventHostManualTriggerCpu<TDev>& event) -> void
+            {
+                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
 
-                    // Copy the shared pointer to ensure that the event implementation is alive as long as it is enqueued.
-                    auto spEventImpl(event.m_spEventImpl);
+                // Copy the shared pointer to ensure that the event implementation is alive as long as it is enqueued.
+                auto spEventImpl(event.m_spEventImpl);
 
-                    // Setting the event state and enqueuing it has to be atomic.
-                    std::lock_guard<std::mutex> lk(spEventImpl->m_mutex);
+                // Setting the event state and enqueuing it has to be atomic.
+                std::lock_guard<std::mutex> lk(spEventImpl->m_mutex);
 
-                    // The event should not yet be enqueued.
-                    ALPAKA_ASSERT(spEventImpl->m_bIsReady);
+                // The event should not yet be enqueued.
+                ALPAKA_ASSERT(spEventImpl->m_bIsReady);
 
-                    // Set its state to enqueued.
-                    spEventImpl->m_bIsReady = false;
+                // Set its state to enqueued.
+                spEventImpl->m_bIsReady = false;
 
-                    // Increment the enqueue counter. This is used to skip waits for events that had already been finished and re-enqueued which would lead to deadlocks.
-                    ++spEventImpl->m_enqueueCount;
+                // Increment the enqueue counter. This is used to skip waits for events that had already been finished
+                // and re-enqueued which would lead to deadlocks.
+                ++spEventImpl->m_enqueueCount;
 
-                    // Workaround: Clang can not support this when natively compiling device code. See ConcurrentExecPool.hpp.
+                // Workaround: Clang can not support this when natively compiling device code. See
+                // ConcurrentExecPool.hpp.
 #if !(BOOST_COMP_CLANG_CUDA && BOOST_ARCH_PTX)
-                    auto const enqueueCount = spEventImpl->m_enqueueCount;
-
-                    // Enqueue a task that only resets the events flag if it is completed.
-                    queue.m_spQueueImpl->m_workerThread.enqueueTask(
-                        [spEventImpl, enqueueCount]()
-                        {
-                            std::unique_lock<std::mutex> lk2(spEventImpl->m_mutex);
-                            spEventImpl->m_conditionVariable.wait(
-                                lk2,
-                                [spEventImpl, enqueueCount]
-                                {
-                                    return (enqueueCount != spEventImpl->m_enqueueCount) || spEventImpl->m_bIsReady;
-                                });
-                        });
+                auto const enqueueCount = spEventImpl->m_enqueueCount;
+
+                // Enqueue a task that only resets the events flag if it is completed.
+                queue.m_spQueueImpl->m_workerThread.enqueueTask([spEventImpl, enqueueCount]() {
+                    std::unique_lock<std::mutex> lk2(spEventImpl->m_mutex);
+                    spEventImpl->m_conditionVariable.wait(lk2, [spEventImpl, enqueueCount] {
+                        return (enqueueCount != spEventImpl->m_enqueueCount) || spEventImpl->m_bIsReady;
+                    });
+                });
 #endif
-                }
-            };
-            //#############################################################################
-            //!
-            //#############################################################################
-            template<>
-            struct Enqueue<
-                queue::QueueCpuBlocking,
-                test::event::EventHostManualTriggerCpu>
+            }
+        };
+        //#############################################################################
+        //!
+        //#############################################################################
+        template<typename TDev>
+        struct Enqueue<QueueGenericThreadsBlocking<TDev>, test::EventHostManualTriggerCpu<TDev>>
+        {
+            //-----------------------------------------------------------------------------
+            //
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto enqueue(
+                QueueGenericThreadsBlocking<TDev>&,
+                test::EventHostManualTriggerCpu<TDev>& event) -> void
             {
-                //-----------------------------------------------------------------------------
-                //
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto enqueue(
-                    queue::QueueCpuBlocking &,
-                    test::event::EventHostManualTriggerCpu & event)
-                -> void
-                {
-                    ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
 
-                    // Copy the shared pointer to ensure that the event implementation is alive as long as it is enqueued.
-                    auto spEventImpl(event.m_spEventImpl);
+                // Copy the shared pointer to ensure that the event implementation is alive as long as it is enqueued.
+                auto spEventImpl(event.m_spEventImpl);
 
-                    // Setting the event state and enqueuing it has to be atomic.
-                    std::unique_lock<std::mutex> lk(spEventImpl->m_mutex);
+                // Setting the event state and enqueuing it has to be atomic.
+                std::unique_lock<std::mutex> lk(spEventImpl->m_mutex);
 
-                    // The event should not yet be enqueued.
-                    ALPAKA_ASSERT(spEventImpl->m_bIsReady);
+                // The event should not yet be enqueued.
+                ALPAKA_ASSERT(spEventImpl->m_bIsReady);
 
-                    // Set its state to enqueued.
-                    spEventImpl->m_bIsReady = false;
+                // Set its state to enqueued.
+                spEventImpl->m_bIsReady = false;
 
-                    // Increment the enqueue counter. This is used to skip waits for events that had already been finished and re-enqueued which would lead to deadlocks.
-                    ++spEventImpl->m_enqueueCount;
+                // Increment the enqueue counter. This is used to skip waits for events that had already been finished
+                // and re-enqueued which would lead to deadlocks.
+                ++spEventImpl->m_enqueueCount;
 
-                    auto const enqueueCount = spEventImpl->m_enqueueCount;
+                auto const enqueueCount = spEventImpl->m_enqueueCount;
 
-                    spEventImpl->m_conditionVariable.wait(
-                        lk,
-                        [spEventImpl, enqueueCount]
-                        {
-                            return (enqueueCount != spEventImpl->m_enqueueCount) || spEventImpl->m_bIsReady;
-                        });
-                }
-            };
-        }
-    }
-}
+                spEventImpl->m_conditionVariable.wait(lk, [spEventImpl, enqueueCount] {
+                    return (enqueueCount != spEventImpl->m_enqueueCount) || spEventImpl->m_bIsReady;
+                });
+            }
+        };
+    } // namespace traits
+} // namespace alpaka
 
 #ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
 
-#include <cuda.h>
+#    include <alpaka/core/BoostPredef.hpp>
 
-#include <alpaka/core/BoostPredef.hpp>
+#    include <cuda.h>
 
-#if !BOOST_LANG_CUDA
-    #error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
-#endif
+#    if !BOOST_LANG_CUDA
+#        error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
+#    endif
 
-#include <alpaka/core/Cuda.hpp>
+#    include <alpaka/core/Cuda.hpp>
 
 namespace alpaka
 {
     namespace test
     {
-        namespace event
+        namespace uniform_cuda_hip
         {
-            namespace cuda
+            namespace detail
             {
-                namespace detail
+                //#############################################################################
+                class EventHostManualTriggerCudaImpl final
                 {
-                    //#############################################################################
-                    class EventHostManualTriggerCudaImpl final
+                public:
+                    //-----------------------------------------------------------------------------
+                    ALPAKA_FN_HOST EventHostManualTriggerCudaImpl(DevUniformCudaHipRt const& dev)
+                        : m_dev(dev)
+                        , m_mutex()
+                        , m_bIsReady(true)
                     {
-                    public:
-                        //-----------------------------------------------------------------------------
-                        ALPAKA_FN_HOST EventHostManualTriggerCudaImpl(
-                            dev::DevCudaRt const & dev) :
-                                m_dev(dev),
-                                m_mutex(),
-                                m_bIsReady(true)
-                        {
-                            ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                            // Set the current device.
-                            ALPAKA_CUDA_RT_CHECK(
-                                cudaSetDevice(
-                                    m_dev.m_iDevice));
-                            // Allocate the buffer on this device.
-                            ALPAKA_CUDA_RT_CHECK(
-                                cudaMalloc(
-                                    &m_devMem,
-                                    static_cast<size_t>(sizeof(int32_t))));
-                            // Initiate the memory set.
-                            ALPAKA_CUDA_RT_CHECK(
-                                cudaMemset(
-                                    m_devMem,
-                                    static_cast<int>(0u),
-                                    static_cast<size_t>(sizeof(int32_t))));
-                        }
-                        //-----------------------------------------------------------------------------
-                        EventHostManualTriggerCudaImpl(EventHostManualTriggerCudaImpl const &) = delete;
-                        //-----------------------------------------------------------------------------
-                        EventHostManualTriggerCudaImpl(EventHostManualTriggerCudaImpl &&) = delete;
-                        //-----------------------------------------------------------------------------
-                        auto operator=(EventHostManualTriggerCudaImpl const &) -> EventHostManualTriggerCudaImpl & = delete;
-                        //-----------------------------------------------------------------------------
-                        auto operator=(EventHostManualTriggerCudaImpl &&) -> EventHostManualTriggerCudaImpl & = delete;
-                        //-----------------------------------------------------------------------------
-                        ALPAKA_FN_HOST ~EventHostManualTriggerCudaImpl()
-                        {
-                            ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                            // Set the current device.
-                            ALPAKA_CUDA_RT_CHECK(
-                                cudaSetDevice(
-                                    m_dev.m_iDevice));
-                            // Free the buffer.
-                            ALPAKA_CUDA_RT_CHECK(cudaFree(m_devMem));
-                        }
+                        ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+                        // Set the current device.
+                        ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(cudaSetDevice(m_dev.m_iDevice));
+                        // Allocate the buffer on this device.
+                        ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(cudaMalloc(&m_devMem, static_cast<size_t>(sizeof(int32_t))));
+                        // Initiate the memory set.
+                        ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
+                            cudaMemset(m_devMem, static_cast<int>(0u), static_cast<size_t>(sizeof(int32_t))));
+                    }
+                    //-----------------------------------------------------------------------------
+                    EventHostManualTriggerCudaImpl(EventHostManualTriggerCudaImpl const&) = delete;
+                    //-----------------------------------------------------------------------------
+                    EventHostManualTriggerCudaImpl(EventHostManualTriggerCudaImpl&&) = delete;
+                    //-----------------------------------------------------------------------------
+                    auto operator=(EventHostManualTriggerCudaImpl const&) -> EventHostManualTriggerCudaImpl& = delete;
+                    //-----------------------------------------------------------------------------
+                    auto operator=(EventHostManualTriggerCudaImpl&&) -> EventHostManualTriggerCudaImpl& = delete;
+                    //-----------------------------------------------------------------------------
+                    ALPAKA_FN_HOST ~EventHostManualTriggerCudaImpl()
+                    {
+                        ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
 
-                        //-----------------------------------------------------------------------------
-                        void trigger()
-                        {
-                            std::unique_lock<std::mutex> lock(m_mutex);
-                            m_bIsReady = true;
+                        // Set the current device.
+                        ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(cudaSetDevice(m_dev.m_iDevice));
+                        // Free the buffer.
+                        ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(cudaFree(m_devMem));
+                    }
 
-                            // Set the current device.
-                            ALPAKA_CUDA_RT_CHECK(
-                                cudaSetDevice(
-                                    m_dev.m_iDevice));
-                            // Initiate the memory set.
-                            ALPAKA_CUDA_RT_CHECK(
-                                cudaMemset(
-                                    m_devMem,
-                                    static_cast<int>(1u),
-                                    static_cast<size_t>(sizeof(int32_t))));
-                        }
+                    //-----------------------------------------------------------------------------
+                    void trigger()
+                    {
+                        std::unique_lock<std::mutex> lock(m_mutex);
+                        m_bIsReady = true;
+
+                        // Set the current device.
+                        ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(cudaSetDevice(m_dev.m_iDevice));
+                        // Initiate the memory set.
+                        ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
+                            cudaMemset(m_devMem, static_cast<int>(1u), static_cast<size_t>(sizeof(int32_t))));
+                    }
 
-                    public:
-                        dev::DevCudaRt const m_dev;     //!< The device this event is bound to.
+                public:
+                    DevUniformCudaHipRt const m_dev; //!< The device this event is bound to.
 
-                        mutable std::mutex m_mutex;     //!< The mutex used to synchronize access to the event.
-                        void * m_devMem;
+                    mutable std::mutex m_mutex; //!< The mutex used to synchronize access to the event.
+                    void* m_devMem;
 
-                        bool m_bIsReady;                //!< If the event is not waiting within a queue (not enqueued or already completed).
-                    };
-                }
-            }
+                    bool m_bIsReady; //!< If the event is not waiting within a queue (not enqueued or already
+                                     //!< completed).
+                };
+            } // namespace detail
+        } // namespace uniform_cuda_hip
 
-            //#############################################################################
-            class EventHostManualTriggerCuda final
+        //#############################################################################
+        class EventHostManualTriggerCuda final
+        {
+        public:
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST EventHostManualTriggerCuda(DevUniformCudaHipRt const& dev)
+                : m_spEventImpl(std::make_shared<uniform_cuda_hip::detail::EventHostManualTriggerCudaImpl>(dev))
             {
-            public:
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST EventHostManualTriggerCuda(
-                    dev::DevCudaRt const & dev) :
-                        m_spEventImpl(std::make_shared<cuda::detail::EventHostManualTriggerCudaImpl>(dev))
-                {
-                    ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-                }
-                //-----------------------------------------------------------------------------
-                EventHostManualTriggerCuda(EventHostManualTriggerCuda const &) = default;
-                //-----------------------------------------------------------------------------
-                EventHostManualTriggerCuda(EventHostManualTriggerCuda &&) = default;
-                //-----------------------------------------------------------------------------
-                auto operator=(EventHostManualTriggerCuda const &) -> EventHostManualTriggerCuda & = default;
-                //-----------------------------------------------------------------------------
-                auto operator=(EventHostManualTriggerCuda &&) -> EventHostManualTriggerCuda & = default;
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST auto operator==(EventHostManualTriggerCuda const & rhs) const
-                -> bool
-                {
-                    return (m_spEventImpl == rhs.m_spEventImpl);
-                }
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST auto operator!=(EventHostManualTriggerCuda const & rhs) const
-                -> bool
-                {
-                    return !((*this) == rhs);
-                }
-                //-----------------------------------------------------------------------------
-                ~EventHostManualTriggerCuda() = default;
-
-                //-----------------------------------------------------------------------------
-                void trigger()
-                {
-                    m_spEventImpl->trigger();
-                }
-
-            public:
-                std::shared_ptr<cuda::detail::EventHostManualTriggerCudaImpl> m_spEventImpl;
-            };
+                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+            }
+            //-----------------------------------------------------------------------------
+            EventHostManualTriggerCuda(EventHostManualTriggerCuda const&) = default;
+            //-----------------------------------------------------------------------------
+            EventHostManualTriggerCuda(EventHostManualTriggerCuda&&) = default;
+            //-----------------------------------------------------------------------------
+            auto operator=(EventHostManualTriggerCuda const&) -> EventHostManualTriggerCuda& = default;
+            //-----------------------------------------------------------------------------
+            auto operator=(EventHostManualTriggerCuda&&) -> EventHostManualTriggerCuda& = default;
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST auto operator==(EventHostManualTriggerCuda const& rhs) const -> bool
+            {
+                return (m_spEventImpl == rhs.m_spEventImpl);
+            }
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST auto operator!=(EventHostManualTriggerCuda const& rhs) const -> bool
+            {
+                return !((*this) == rhs);
+            }
+            //-----------------------------------------------------------------------------
+            ~EventHostManualTriggerCuda() = default;
 
-            namespace traits
+            //-----------------------------------------------------------------------------
+            void trigger()
             {
-                //#############################################################################
-                template<>
-                struct EventHostManualTriggerType<
-                    alpaka::dev::DevCudaRt>
-                {
-                    using type = alpaka::test::event::EventHostManualTriggerCuda;
-                };
-                //#############################################################################
-                //! The CPU event host manual trigger support get trait specialization.
-                template<>
-                struct IsEventHostManualTriggerSupported<
-                    alpaka::dev::DevCudaRt>
-                {
-                    //-----------------------------------------------------------------------------
-                    ALPAKA_FN_HOST static auto isSupported(
-#if BOOST_LANG_CUDA >= BOOST_VERSION_NUMBER(9, 0, 0)
-                        alpaka::dev::DevCudaRt const & dev)
-#else
-                        alpaka::dev::DevCudaRt const &)
-#endif
-                    -> bool
-                    {
-#if BOOST_LANG_CUDA >= BOOST_VERSION_NUMBER(9, 0, 0)
-                        int result = 0;
-                        cuDeviceGetAttribute(
-                            &result,
-                            CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS,
-                            dev.m_iDevice);
-                        return result != 0;
-#else
-                        // In CUDA 8.0 there is no way to find out if those operations are really supported.
-                        return false;
-#endif
-                    }
-                };
+                m_spEventImpl->trigger();
             }
-        }
-    }
-    namespace dev
-    {
+
+        public:
+            std::shared_ptr<uniform_cuda_hip::detail::EventHostManualTriggerCudaImpl> m_spEventImpl;
+        };
+
         namespace traits
         {
             //#############################################################################
-            //! The CPU device event device get trait specialization.
             template<>
-            struct GetDev<
-                test::event::EventHostManualTriggerCuda>
+            struct EventHostManualTriggerType<alpaka::DevUniformCudaHipRt>
             {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto getDev(
-                    test::event::EventHostManualTriggerCuda const & event)
-                -> dev::DevCudaRt
-                {
-                    return event.m_spEventImpl->m_dev;
-                }
+                using type = alpaka::test::EventHostManualTriggerCuda;
             };
-        }
-    }
-    namespace event
-    {
-        namespace traits
-        {
             //#############################################################################
-            //! The CPU device event test trait specialization.
+            //! The CUDA event host manual trigger support get trait specialization.
             template<>
-            struct Test<
-                test::event::EventHostManualTriggerCuda>
+            struct IsEventHostManualTriggerSupported<alpaka::DevUniformCudaHipRt>
             {
                 //-----------------------------------------------------------------------------
-                //! \return If the event is not waiting within a queue (not enqueued or already handled).
-                ALPAKA_FN_HOST static auto test(
-                    test::event::EventHostManualTriggerCuda const & event)
-                -> bool
+                ALPAKA_FN_HOST static auto isSupported(alpaka::DevUniformCudaHipRt const& dev) -> bool
                 {
-                    std::lock_guard<std::mutex> lk(event.m_spEventImpl->m_mutex);
-
-                    return event.m_spEventImpl->m_bIsReady;
+                    int result = 0; // Default: unsupported unless the driver query sets a non-zero value.
+                    cuDeviceGetAttribute(&result, CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS, dev.m_iDevice);
+                    return result != 0;
                 }
             };
-        }
-    }
-    namespace queue
+        } // namespace traits
+    } // namespace test
+    namespace traits
     {
-        namespace traits
+        //#############################################################################
+        //! The CUDA device event device get trait specialization.
+        template<>
+        struct GetDev<test::EventHostManualTriggerCuda>
         {
-            //#############################################################################
-            template<>
-            struct Enqueue<
-                queue::QueueCudaRtNonBlocking,
-                test::event::EventHostManualTriggerCuda>
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto getDev(test::EventHostManualTriggerCuda const& event) -> DevUniformCudaHipRt
             {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto enqueue(
-                    queue::QueueCudaRtNonBlocking & queue,
-                    test::event::EventHostManualTriggerCuda & event)
-                -> void
-                {
-                    ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                    // Copy the shared pointer to ensure that the event implementation is alive as long as it is enqueued.
-                    auto spEventImpl(event.m_spEventImpl);
-
-                    // Setting the event state and enqueuing it has to be atomic.
-                    std::lock_guard<std::mutex> lk(spEventImpl->m_mutex);
-
-                    // The event should not yet be enqueued.
-                    ALPAKA_ASSERT(spEventImpl->m_bIsReady);
-
-                    // Set its state to enqueued.
-                    spEventImpl->m_bIsReady = false;
-
-                    // PGI Profiler`s User Guide:
-                    // The following are known issues related to Events and Metrics:
-                    // * In event or metric profiling, kernel launches are blocking. Thus kernels waiting
-                    //   on host updates may hang. This includes synchronization between the host and
-                    //   the device build upon value-based CUDA queue synchronization APIs such as
-                    //   cuStreamWaitValue32() and cuStreamWriteValue32().
-                    ALPAKA_CUDA_DRV_CHECK(
-                        cuStreamWaitValue32(
-                            static_cast<CUstream>(queue.m_spQueueImpl->m_CudaQueue),
-                            reinterpret_cast<CUdeviceptr>(event.m_spEventImpl->m_devMem),
-                            0x01010101u,
-                            CU_STREAM_WAIT_VALUE_GEQ));
-                }
-            };
-            //#############################################################################
-            template<>
-            struct Enqueue<
-                queue::QueueCudaRtBlocking,
-                test::event::EventHostManualTriggerCuda>
+                return event.m_spEventImpl->m_dev;
+            }
+        };
+
+        //#############################################################################
+        //! The CUDA device event test trait specialization.
+        template<>
+        struct IsComplete<test::EventHostManualTriggerCuda>
+        {
+            //-----------------------------------------------------------------------------
+            //! \return If the event is not waiting within a queue (not enqueued or already handled).
+            ALPAKA_FN_HOST static auto isComplete(test::EventHostManualTriggerCuda const& event) -> bool
             {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto enqueue(
-                    queue::QueueCudaRtBlocking & queue,
-                    test::event::EventHostManualTriggerCuda & event)
-                -> void
-                {
-                    ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                    // Copy the shared pointer to ensure that the event implementation is alive as long as it is enqueued.
-                    auto spEventImpl(event.m_spEventImpl);
-
-                    // Setting the event state and enqueuing it has to be atomic.
-                    std::lock_guard<std::mutex> lk(spEventImpl->m_mutex);
-
-                    // The event should not yet be enqueued.
-                    ALPAKA_ASSERT(spEventImpl->m_bIsReady);
-
-                    // Set its state to enqueued.
-                    spEventImpl->m_bIsReady = false;
-
-                    // PGI Profiler`s User Guide:
-                    // The following are known issues related to Events and Metrics:
-                    // * In event or metric profiling, kernel launches are blocking. Thus kernels waiting
-                    //   on host updates may hang. This includes synchronization between the host and
-                    //   the device build upon value-based CUDA queue synchronization APIs such as
-                    //   cuStreamWaitValue32() and cuStreamWriteValue32().
-                    ALPAKA_CUDA_DRV_CHECK(
-                        cuStreamWaitValue32(
-                            static_cast<CUstream>(queue.m_spQueueImpl->m_CudaQueue),
-                            reinterpret_cast<CUdeviceptr>(event.m_spEventImpl->m_devMem),
-                            0x01010101u,
-                            CU_STREAM_WAIT_VALUE_GEQ));
-                }
-            };
-        }
-    }
-}
+                std::lock_guard<std::mutex> lk(event.m_spEventImpl->m_mutex);
+
+                return event.m_spEventImpl->m_bIsReady;
+            }
+        };
+
+        //#############################################################################
+        template<>
+        struct Enqueue<QueueUniformCudaHipRtNonBlocking, test::EventHostManualTriggerCuda>
+        {
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto enqueue(
+                QueueUniformCudaHipRtNonBlocking& queue,
+                test::EventHostManualTriggerCuda& event) -> void
+            {
+                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+                // Copy the shared pointer to ensure that the event implementation is alive as long as it is enqueued.
+                auto spEventImpl(event.m_spEventImpl);
+
+                // Setting the event state and enqueuing it has to be atomic.
+                std::lock_guard<std::mutex> lk(spEventImpl->m_mutex);
+
+                // The event should not yet be enqueued.
+                ALPAKA_ASSERT(spEventImpl->m_bIsReady);
+
+                // Set its state to enqueued.
+                spEventImpl->m_bIsReady = false;
+
+                // PGI Profiler's User Guide:
+                // The following are known issues related to Events and Metrics:
+                // * In event or metric profiling, kernel launches are blocking. Thus kernels waiting
+                //   on host updates may hang. This includes synchronization between the host and
+                //   the device build upon value-based CUDA queue synchronization APIs such as
+                //   cuStreamWaitValue32() and cuStreamWriteValue32().
+                ALPAKA_CUDA_DRV_CHECK(cuStreamWaitValue32(
+                    static_cast<CUstream>(queue.m_spQueueImpl->m_UniformCudaHipQueue),
+                    reinterpret_cast<CUdeviceptr>(event.m_spEventImpl->m_devMem),
+                    0x01010101u,
+                    CU_STREAM_WAIT_VALUE_GEQ));
+            }
+        };
+        //#############################################################################
+        template<>
+        struct Enqueue<QueueUniformCudaHipRtBlocking, test::EventHostManualTriggerCuda>
+        {
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto enqueue(
+                QueueUniformCudaHipRtBlocking& queue,
+                test::EventHostManualTriggerCuda& event) -> void
+            {
+                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+                // Copy the shared pointer to ensure that the event implementation is alive as long as it is enqueued.
+                auto spEventImpl(event.m_spEventImpl);
+
+                // Setting the event state and enqueuing it has to be atomic.
+                std::lock_guard<std::mutex> lk(spEventImpl->m_mutex);
+
+                // The event should not yet be enqueued.
+                ALPAKA_ASSERT(spEventImpl->m_bIsReady);
+
+                // Set its state to enqueued.
+                spEventImpl->m_bIsReady = false;
+
+                // PGI Profiler's User Guide:
+                // The following are known issues related to Events and Metrics:
+                // * In event or metric profiling, kernel launches are blocking. Thus kernels waiting
+                //   on host updates may hang. This includes synchronization between the host and
+                //   the device build upon value-based CUDA queue synchronization APIs such as
+                //   cuStreamWaitValue32() and cuStreamWriteValue32().
+                ALPAKA_CUDA_DRV_CHECK(cuStreamWaitValue32(
+                    static_cast<CUstream>(queue.m_spQueueImpl->m_UniformCudaHipQueue),
+                    reinterpret_cast<CUdeviceptr>(event.m_spEventImpl->m_devMem),
+                    0x01010101u,
+                    CU_STREAM_WAIT_VALUE_GEQ));
+            }
+        };
+    } // namespace traits
+} // namespace alpaka
 #endif
 
 
 #ifdef ALPAKA_ACC_GPU_HIP_ENABLED
 
-#include <hip/hip_runtime.h>
+#    include <hip/hip_runtime.h>
 
-#if !BOOST_LANG_HIP
-    #error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
-#endif
+#    if !BOOST_LANG_HIP
+#        error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
+#    endif
 
-#include <alpaka/core/Hip.hpp>
+#    include <alpaka/core/Hip.hpp>
 
 namespace alpaka
 {
     namespace test
     {
-        namespace event
+        namespace hip
         {
-            namespace hip
+            namespace detail
             {
-                namespace detail
+                //#############################################################################
+                class EventHostManualTriggerHipImpl final
                 {
-                    //#############################################################################
-                    class EventHostManualTriggerHipImpl final
+                public:
+                    //-----------------------------------------------------------------------------
+                    ALPAKA_FN_HOST EventHostManualTriggerHipImpl(DevHipRt const& dev)
+                        : m_dev(dev)
+                        , m_mutex()
+                        , m_bIsReady(true)
                     {
-                    public:
-                        //-----------------------------------------------------------------------------
-                        ALPAKA_FN_HOST EventHostManualTriggerHipImpl(
-                            dev::DevHipRt const & dev) :
-                                m_dev(dev),
-                                m_mutex(),
-                                m_bIsReady(true)
-                        {
-                            ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                            // Set the current device.
-                            ALPAKA_HIP_RT_CHECK(
-                                hipSetDevice(
-                                    m_dev.m_iDevice));
-                            // Allocate the buffer on this device.
-                            ALPAKA_HIP_RT_CHECK(
-                                hipMalloc(
-                                    &m_devMem,
-                                    static_cast<size_t>(sizeof(int32_t))));
-                            // Initiate the memory set.
-                            ALPAKA_HIP_RT_CHECK(
-                                hipMemset(
-                                    m_devMem,
-                                    static_cast<int>(0u),
-                                    static_cast<size_t>(sizeof(int32_t))));
-                        }
-                        //-----------------------------------------------------------------------------
-                        EventHostManualTriggerHipImpl(EventHostManualTriggerHipImpl const &) = delete;
-                        //-----------------------------------------------------------------------------
-                        EventHostManualTriggerHipImpl(EventHostManualTriggerHipImpl &&) = delete;
-                        //-----------------------------------------------------------------------------
-                        auto operator=(EventHostManualTriggerHipImpl const &) -> EventHostManualTriggerHipImpl & = delete;
-                        //-----------------------------------------------------------------------------
-                        auto operator=(EventHostManualTriggerHipImpl &&) -> EventHostManualTriggerHipImpl & = delete;
-                        //-----------------------------------------------------------------------------
-                        ALPAKA_FN_HOST ~EventHostManualTriggerHipImpl()
-                        {
-                            ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+                        ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+                        // Set the current device.
+                        ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(hipSetDevice(m_dev.m_iDevice));
+                        // Allocate the buffer on this device.
+                        ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(hipMalloc(&m_devMem, static_cast<size_t>(sizeof(int32_t))));
+                        // Initiate the memory set.
+                        ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
+                            hipMemset(m_devMem, static_cast<int>(0u), static_cast<size_t>(sizeof(int32_t))));
+                    }
+                    //-----------------------------------------------------------------------------
+                    EventHostManualTriggerHipImpl(EventHostManualTriggerHipImpl const&) = delete;
+                    //-----------------------------------------------------------------------------
+                    EventHostManualTriggerHipImpl(EventHostManualTriggerHipImpl&&) = delete;
+                    //-----------------------------------------------------------------------------
+                    auto operator=(EventHostManualTriggerHipImpl const&) -> EventHostManualTriggerHipImpl& = delete;
+                    //-----------------------------------------------------------------------------
+                    auto operator=(EventHostManualTriggerHipImpl&&) -> EventHostManualTriggerHipImpl& = delete;
+                    //-----------------------------------------------------------------------------
+                    ALPAKA_FN_HOST ~EventHostManualTriggerHipImpl()
+                    {
+                        ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
 
-                            ALPAKA_HIP_RT_CHECK(
-                                hipSetDevice(
-                                    m_dev.m_iDevice));
-                            // Free the buffer.
-                            ALPAKA_HIP_RT_CHECK(hipFree(m_devMem));
-                        }
+                        ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(hipSetDevice(m_dev.m_iDevice));
+                        // Free the buffer.
+                        ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(hipFree(m_devMem));
+                    }
 
-                        //-----------------------------------------------------------------------------
-                        void trigger()
-                        {
-                            std::unique_lock<std::mutex> lock(m_mutex);
-                            m_bIsReady = true;
+                    //-----------------------------------------------------------------------------
+                    void trigger()
+                    {
+                        std::unique_lock<std::mutex> lock(m_mutex);
+                        m_bIsReady = true;
+
+                        // Set the current device.
+                        ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(hipSetDevice(m_dev.m_iDevice));
+                        // Initiate the memory set.
+                        ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
+                            hipMemset(m_devMem, static_cast<int>(1u), static_cast<size_t>(sizeof(int32_t))));
+                    }
 
-                            // Set the current device.
-                            ALPAKA_HIP_RT_CHECK(
-                                hipSetDevice(
-                                    m_dev.m_iDevice));
-                            // Initiate the memory set.
-                            ALPAKA_HIP_RT_CHECK(
-                                hipMemset(
-                                    m_devMem,
-                                    static_cast<int>(1u),
-                                    static_cast<size_t>(sizeof(int32_t))));
-                        }
+                public:
+                    DevHipRt const m_dev; //!< The device this event is bound to.
 
-                    public:
-                        dev::DevHipRt const m_dev;     //!< The device this event is bound to.
+                    mutable std::mutex m_mutex; //!< The mutex used to synchronize access to the event.
+                    void* m_devMem;
 
-                        mutable std::mutex m_mutex;     //!< The mutex used to synchronize access to the event.
-                        void * m_devMem;
+                    bool m_bIsReady; //!< If the event is not waiting within a queue (not enqueued or already
+                                     //!< completed).
+                };
+            } // namespace detail
+        } // namespace hip
 
-                        bool m_bIsReady;                //!< If the event is not waiting within a queue (not enqueued or already completed).
-                    };
-                }
+        //#############################################################################
+        class EventHostManualTriggerHip final
+        {
+        public:
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST EventHostManualTriggerHip(DevHipRt const& dev)
+                : m_spEventImpl(std::make_shared<hip::detail::EventHostManualTriggerHipImpl>(dev))
+            {
+                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
             }
-
-            //#############################################################################
-            class EventHostManualTriggerHip final
+            //-----------------------------------------------------------------------------
+            EventHostManualTriggerHip(EventHostManualTriggerHip const&) = default;
+            //-----------------------------------------------------------------------------
+            EventHostManualTriggerHip(EventHostManualTriggerHip&&) = default;
+            //-----------------------------------------------------------------------------
+            auto operator=(EventHostManualTriggerHip const&) -> EventHostManualTriggerHip& = default;
+            //-----------------------------------------------------------------------------
+            auto operator=(EventHostManualTriggerHip&&) -> EventHostManualTriggerHip& = default;
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST auto operator==(EventHostManualTriggerHip const& rhs) const -> bool
             {
-            public:
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST EventHostManualTriggerHip(
-                    dev::DevHipRt const & dev) :
-                        m_spEventImpl(std::make_shared<hip::detail::EventHostManualTriggerHipImpl>(dev))
-                {
-                    ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-                }
-                //-----------------------------------------------------------------------------
-                EventHostManualTriggerHip(EventHostManualTriggerHip const &) = default;
-                //-----------------------------------------------------------------------------
-                EventHostManualTriggerHip(EventHostManualTriggerHip &&) = default;
-                //-----------------------------------------------------------------------------
-                auto operator=(EventHostManualTriggerHip const &) -> EventHostManualTriggerHip & = default;
-                //-----------------------------------------------------------------------------
-                auto operator=(EventHostManualTriggerHip &&) -> EventHostManualTriggerHip & = default;
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST auto operator==(EventHostManualTriggerHip const & rhs) const
-                -> bool
-                {
-                    return (m_spEventImpl == rhs.m_spEventImpl);
-                }
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST auto operator!=(EventHostManualTriggerHip const & rhs) const
-                -> bool
-                {
-                    return !((*this) == rhs);
-                }
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST ~EventHostManualTriggerHip() = default;
-
-                //-----------------------------------------------------------------------------
-                void trigger()
-                {
-                    m_spEventImpl->trigger();
-                }
-
-            public:
-                std::shared_ptr<hip::detail::EventHostManualTriggerHipImpl> m_spEventImpl;
-            };
-
-            namespace traits
+                return (m_spEventImpl == rhs.m_spEventImpl);
+            }
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST auto operator!=(EventHostManualTriggerHip const& rhs) const -> bool
             {
-                //#############################################################################
-                template<>
-                struct EventHostManualTriggerType<
-                    alpaka::dev::DevHipRt>
-                {
-                    using type = alpaka::test::event::EventHostManualTriggerHip;
-                };
+                return !((*this) == rhs);
+            }
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST ~EventHostManualTriggerHip() = default;
 
-                //#############################################################################
-                //! The HIP event host manual trigger support get trait specialization.
-                template<>
-                struct IsEventHostManualTriggerSupported<
-                    alpaka::dev::DevHipRt>
-                {
-                    //-----------------------------------------------------------------------------
-                    // TODO: there is no CUDA_VERSION in the HIP compiler path.
-                    // TODO: there is a hipDeviceGetAttribute, but there is no pendant for CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS.
-                    ALPAKA_FN_HOST static auto isSupported(
-                        alpaka::dev::DevHipRt const &)
-                    -> bool
-                    {
-                        return false;
-                    }
-                };
+            //-----------------------------------------------------------------------------
+            void trigger()
+            {
+                m_spEventImpl->trigger();
             }
-        }
-    }
-    namespace dev
-    {
+
+        public:
+            std::shared_ptr<hip::detail::EventHostManualTriggerHipImpl> m_spEventImpl;
+        };
+
         namespace traits
         {
             //#############################################################################
-            //! The CPU device event device get trait specialization.
             template<>
-            struct GetDev<
-                test::event::EventHostManualTriggerHip>
+            struct EventHostManualTriggerType<alpaka::DevHipRt>
             {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto getDev(
-                    test::event::EventHostManualTriggerHip const & event)
-                -> dev::DevHipRt
-                {
-                    return event.m_spEventImpl->m_dev;
-                }
+                using type = alpaka::test::EventHostManualTriggerHip;
             };
-        }
-    }
-    namespace event
-    {
-        namespace traits
-        {
+
             //#############################################################################
-            //! The CPU device event test trait specialization.
+            //! The HIP event host manual trigger support get trait specialization.
             template<>
-            struct Test<
-                test::event::EventHostManualTriggerHip>
+            struct IsEventHostManualTriggerSupported<alpaka::DevHipRt>
             {
                 //-----------------------------------------------------------------------------
-                //! \return If the event is not waiting within a queue (not enqueued or already handled).
-                ALPAKA_FN_HOST static auto test(
-                    test::event::EventHostManualTriggerHip const & event)
-                -> bool
+                // TODO: there is no CUDA_VERSION in the HIP compiler path.
+                // TODO: there is a hipDeviceGetAttribute, but there is no equivalent of
+                // CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS.
+                ALPAKA_FN_HOST static auto isSupported(alpaka::DevHipRt const&) -> bool
                 {
-                    std::lock_guard<std::mutex> lk(event.m_spEventImpl->m_mutex);
-
-                    return event.m_spEventImpl->m_bIsReady;
+                    return false;
                 }
             };
-        }
-    }
-    namespace queue
+        } // namespace traits
+    } // namespace test
+    namespace traits
     {
-        namespace traits
+        //#############################################################################
+        //! The HIP event device get trait specialization.
+        template<>
+        struct GetDev<test::EventHostManualTriggerHip>
         {
-            //#############################################################################
-            template<>
-            struct Enqueue<
-                queue::QueueHipRtNonBlocking,
-                test::event::EventHostManualTriggerHip>
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto getDev(test::EventHostManualTriggerHip const& event) -> DevHipRt
             {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto enqueue(
-                    queue::QueueHipRtNonBlocking & queue,
-                    test::event::EventHostManualTriggerHip & event)
-                -> void
-                {
-                    ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                    // Copy the shared pointer to ensure that the event implementation is alive as long as it is enqueued.
-                    auto spEventImpl(event.m_spEventImpl);
-
-                    // Setting the event state and enqueuing it has to be atomic.
-                    std::lock_guard<std::mutex> lk(spEventImpl->m_mutex);
+                return event.m_spEventImpl->m_dev;
+            }
+        };
 
-                    // The event should not yet be enqueued.
-                    ALPAKA_ASSERT(spEventImpl->m_bIsReady);
+        //#############################################################################
+        //! The HIP event test trait specialization.
+        template<>
+        struct IsComplete<test::EventHostManualTriggerHip>
+        {
+            //-----------------------------------------------------------------------------
+            //! \return If the event is not waiting within a queue (not enqueued or already handled).
+            ALPAKA_FN_HOST static auto isComplete(test::EventHostManualTriggerHip const& event) -> bool
+            {
+                std::lock_guard<std::mutex> lk(event.m_spEventImpl->m_mutex);
 
-                    // Set its state to enqueued.
-                    spEventImpl->m_bIsReady = false;
+                return event.m_spEventImpl->m_bIsReady;
+            }
+        };
 
-                    // PGI Profiler`s User Guide:
-                    // The following are known issues related to Events and Metrics:
-                    // * In event or metric profiling, kernel launches are blocking. Thus kernels waiting
-                    //   on host updates may hang. This includes synchronization between the host and
-                    //   the device build upon value-based CUDA queue synchronization APIs such as
-                    //   cuStreamWaitValue32() and cuStreamWriteValue32().
-                    int32_t hostMem=0;
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
-                    std::cerr << "[Workaround] polling of device-located value in stream, as hipStreamWaitValue32 is not available.\n";
-#endif
-                    while(hostMem<0x01010101u) {
-                      ALPAKA_HIP_RT_CHECK(hipMemcpyDtoHAsync(&hostMem,
-                                                             reinterpret_cast<hipDeviceptr_t>(event.m_spEventImpl->m_devMem),
-                                                             sizeof(int32_t),
-                                                             queue.m_spQueueImpl->m_HipQueue));
-                      ALPAKA_HIP_RT_CHECK(hipStreamSynchronize(queue.m_spQueueImpl->m_HipQueue));
-                    }
-                }
-            };
-            //#############################################################################
-            template<>
-            struct Enqueue<
-                queue::QueueHipRtBlocking,
-                test::event::EventHostManualTriggerHip>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto enqueue(
-                    queue::QueueHipRtBlocking & queue,
-                    test::event::EventHostManualTriggerHip & event)
+        //#############################################################################
+        template<>
+        struct Enqueue<QueueHipRtNonBlocking, test::EventHostManualTriggerHip>
+        {
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto enqueue(QueueHipRtNonBlocking& queue, test::EventHostManualTriggerHip& event)
                 -> void
-                {
-                    ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                    // Copy the shared pointer to ensure that the event implementation is alive as long as it is enqueued.
-                    auto spEventImpl(event.m_spEventImpl);
-
-                    // Setting the event state and enqueuing it has to be atomic.
-                    std::lock_guard<std::mutex> lk(spEventImpl->m_mutex);
-
-                    // The event should not yet be enqueued.
-                    ALPAKA_ASSERT(spEventImpl->m_bIsReady);
-
-                    // Set its state to enqueued.
-                    spEventImpl->m_bIsReady = false;
-
-                    // PGI Profiler`s User Guide:
-                    // The following are known issues related to Events and Metrics:
-                    // * In event or metric profiling, kernel launches are blocking. Thus kernels waiting
-                    //   on host updates may hang. This includes synchronization between the host and
-                    //   the device build upon value-based HIP queue synchronization APIs such as
-                    //   cuStreamWaitValue32() and cuStreamWriteValue32().
-#if BOOST_COMP_NVCC
-                    ALPAKA_HIP_RT_CHECK(hipCUResultTohipError(
-                        cuStreamWaitValue32(
-                            static_cast<CUstream>(queue.m_spQueueImpl->m_HipQueue),
-                            reinterpret_cast<CUdeviceptr>(event.m_spEventImpl->m_devMem),
-                            0x01010101u,
-                            CU_STREAM_WAIT_VALUE_GEQ)));
-#else
-                    // workaround for missing cuStreamWaitValue32 in HIP(HCC)
-                    std::uint32_t hmem = 0;
-                    do {
-                        std::this_thread::sleep_for(std::chrono::milliseconds(10u));
-                        ALPAKA_HIP_RT_CHECK(hipMemcpy(&hmem, event.m_spEventImpl->m_devMem, sizeof(std::uint32_t), hipMemcpyDefault));
-                    } while(hmem < 0x01010101u);
-
-#endif
+            {
+                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+                // Copy the shared pointer to ensure that the event implementation is alive as long as it is enqueued.
+                auto spEventImpl(event.m_spEventImpl);
+
+                // Setting the event state and enqueuing it has to be atomic.
+                std::lock_guard<std::mutex> lk(spEventImpl->m_mutex);
+
+                // The event should not yet be enqueued.
+                ALPAKA_ASSERT(spEventImpl->m_bIsReady);
+
+                // Set its state to enqueued.
+                spEventImpl->m_bIsReady = false;
+
+                // PGI Profiler's User Guide:
+                // The following are known issues related to Events and Metrics:
+                // * In event or metric profiling, kernel launches are blocking. Thus kernels waiting
+                //   on host updates may hang. This includes synchronization between the host and
+                //   the device built upon value-based CUDA queue synchronization APIs such as
+                //   cuStreamWaitValue32() and cuStreamWriteValue32().
+                int32_t hostMem = 0;
+#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
+                std::cerr << "[Workaround] polling of device-located value in stream, as hipStreamWaitValue32 is not "
+                             "available.\n";
+#    endif
+                while(hostMem < 0x01010101u)
+                {
+                    ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(hipMemcpyDtoHAsync(
+                        &hostMem,
+                        reinterpret_cast<hipDeviceptr_t>(event.m_spEventImpl->m_devMem),
+                        sizeof(int32_t),
+                        queue.m_spQueueImpl->m_UniformCudaHipQueue));
+                    ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(hipStreamSynchronize(queue.m_spQueueImpl->m_UniformCudaHipQueue));
                 }
-            };
-        }
-    }
-}
+            }
+        };
+        //#############################################################################
+        //! The HIP blocking queue enqueue trait specialization for the host manual trigger test event.
+        template<>
+        struct Enqueue<QueueHipRtBlocking, test::EventHostManualTriggerHip>
+        {
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto enqueue(QueueHipRtBlocking& queue, test::EventHostManualTriggerHip& event)
+                -> void
+            {
+                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+                // Copy the shared pointer to ensure that the event implementation is alive as long as it is enqueued.
+                auto spEventImpl(event.m_spEventImpl);
+
+                // Setting the event state and enqueuing it has to be atomic.
+                std::lock_guard<std::mutex> lk(spEventImpl->m_mutex);
+
+                // The event should not yet be enqueued.
+                ALPAKA_ASSERT(spEventImpl->m_bIsReady);
+
+                // Set its state to enqueued.
+                spEventImpl->m_bIsReady = false;
+
+                // PGI Profiler's User Guide:
+                // The following are known issues related to Events and Metrics:
+                // * In event or metric profiling, kernel launches are blocking. Thus kernels waiting on host updates
+                //   may hang. This includes synchronization between the host and the device built upon value-based
+                //   HIP queue synchronization APIs such as cuStreamWaitValue32() and cuStreamWriteValue32().
+#    if BOOST_COMP_NVCC
+                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(hipCUResultTohipError(cuStreamWaitValue32(
+                    static_cast<CUstream>(queue.m_spQueueImpl->m_UniformCudaHipQueue),
+                    reinterpret_cast<CUdeviceptr>(event.m_spEventImpl->m_devMem),
+                    0x01010101u,
+                    CU_STREAM_WAIT_VALUE_GEQ)));
+#    else
+                // Workaround for missing cuStreamWaitValue32 in HIP: poll the device value from the host every 10 ms.
+                // NOTE(review): m_mutex is held while polling; trigger() locks it too - possible deadlock, confirm.
+                std::uint32_t hmem = 0;
+                do
+                {
+                    std::this_thread::sleep_for(std::chrono::milliseconds(10u));
+                    ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
+                        hipMemcpy(&hmem, event.m_spEventImpl->m_devMem, sizeof(std::uint32_t), hipMemcpyDefault));
+                } while(hmem < 0x01010101u);
+#    endif
+            }
+        };
+    } // namespace traits
+} // namespace alpaka
 #endif
diff --git a/thirdParty/cupla/alpaka/test/common/include/alpaka/test/idx/TestIdxs.hpp b/thirdParty/cupla/alpaka/test/common/include/alpaka/test/idx/TestIdxs.hpp
index 0d860ad2b4..d37fa6d70e 100644
--- a/thirdParty/cupla/alpaka/test/common/include/alpaka/test/idx/TestIdxs.hpp
+++ b/thirdParty/cupla/alpaka/test/common/include/alpaka/test/idx/TestIdxs.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Benjamin Worpitz, Erik Zenker, Matthias Werner
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -18,31 +18,22 @@ namespace alpaka
     //! The test specifics.
     namespace test
     {
-        //-----------------------------------------------------------------------------
-        //! The test accelerator specifics.
-        namespace idx
-        {
-            //#############################################################################
-            //! A std::tuple holding idx types.
-            using TestIdxs =
-                std::tuple<
-                    // size_t is most probably identical to either std::uint64_t or std::uint32_t.
-                    // This would lead to duplicate tests (especially test names) which is not allowed.
-                    //std::size_t,
+        //#############################################################################
+        //! A std::tuple holding idx types.
+        using TestIdxs = std::tuple<
+        // size_t is most probably identical to either std::uint64_t or std::uint32_t.
+        // This would lead to duplicate tests (especially test names) which is not allowed.
+        // std::size_t,
 #if !defined(ALPAKA_CI)
-                    std::int64_t,
+            std::int64_t,
 #endif
-                    std::uint64_t,
-                    std::int32_t,
+            std::uint64_t,
+            std::int32_t
 #if !defined(ALPAKA_CI)
-                    std::uint32_t,
-                    std::int16_t,
+            ,
+            std::uint32_t
 #endif
-                    std::uint16_t/*,
-                    // When Idx is a 8 bit integer, extents within the tests would be extremely limited
-                    // (especially when Dim is 4). Therefore, we do not test it.
-                    std::int8_t,
-                    std::uint8_t*/>;
-        }
-    }
-}
+            // index type must be >=32bit
+            >;
+    } // namespace test
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/test/common/include/alpaka/test/mem/view/Iterator.hpp b/thirdParty/cupla/alpaka/test/common/include/alpaka/test/mem/view/Iterator.hpp
index 8ffb49fefa..92609baffb 100644
--- a/thirdParty/cupla/alpaka/test/common/include/alpaka/test/mem/view/Iterator.hpp
+++ b/thirdParty/cupla/alpaka/test/common/include/alpaka/test/mem/view/Iterator.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Benjamin Worpitz, Erik Zenker
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -11,6 +11,8 @@
 
 #include <alpaka/alpaka.hpp>
 
+#include <type_traits>
+
 namespace alpaka
 {
     //-----------------------------------------------------------------------------
@@ -18,222 +20,175 @@ namespace alpaka
     namespace test
     {
         //-----------------------------------------------------------------------------
-        //! The test mem specifics.
-        namespace mem
+        //! The traits backing the view iterator (IteratorView, Begin, End).
+        namespace traits
         {
-            //-----------------------------------------------------------------------------
-            //!
-            namespace view
+            //#############################################################################
+            // \tparam T Type to conditionally make const.
+            // \tparam TSource Type to mimic the constness of.
+            template<typename T, typename TSource>
+            using MimicConst
+                = std::conditional_t<std::is_const<TSource>::value, std::add_const_t<T>, std::remove_const_t<T>>;
+
+#if BOOST_COMP_GNUC
+#    pragma GCC diagnostic push
+#    pragma GCC diagnostic ignored                                                                                    \
+        "-Wcast-align" // "cast from 'Byte*' to 'Elem*' increases required alignment of target type"
+#endif
+            //#############################################################################
+            template<typename TView, typename TSfinae = void>
+            class IteratorView
             {
+                using TViewDecayed = std::decay_t<TView>;
+                using Dim = alpaka::Dim<TViewDecayed>;
+                using Idx = alpaka::Idx<TViewDecayed>;
+                using Elem = MimicConst<alpaka::Elem<TViewDecayed>, TView>;
+
+            public:
                 //-----------------------------------------------------------------------------
-                //!
-                namespace traits
+                ALPAKA_FN_HOST IteratorView(TView& view, Idx const idx)
+                    : m_nativePtr(alpaka::getPtrNative(view))
+                    , m_currentIdx(idx)
+                    , m_extents(alpaka::extent::getExtentVec(view))
+                    , m_pitchBytes(alpaka::getPitchBytesVec(view))
                 {
-                    //#############################################################################
-                    // \tparam T Type to conditionally make const.
-                    // \tparam TSource Type to mimic the constness of.
-                    template<
-                        typename T,
-                        typename TSource>
-                    using MimicConst = typename std::conditional<
-                        std::is_const<TSource>::value,
-                        typename std::add_const<T>::type,
-                        typename std::remove_const<T>::type>;
+                }
 
-#if BOOST_COMP_GNUC
-    #pragma GCC diagnostic push
-    #pragma GCC diagnostic ignored "-Wcast-align" // "cast from 'Byte*' to 'Elem*' increases required alignment of target type"
-#endif
-                    //#############################################################################
-                    template<
-                        typename TView,
-                        typename TSfinae = void>
-                    class IteratorView
-                    {
-                        using TViewDecayed = typename std::decay<TView>::type;
-                        using Dim = alpaka::dim::Dim<TViewDecayed>;
-                        using Idx = alpaka::idx::Idx<TViewDecayed>;
-                        using Elem = typename MimicConst<alpaka::elem::Elem<TViewDecayed>, TView>::type;
-
-                    public:
-                        //-----------------------------------------------------------------------------
-                        ALPAKA_FN_HOST IteratorView(
-                            TView & view,
-                            Idx const idx) :
-                                m_nativePtr(alpaka::mem::view::getPtrNative(view)),
-                                m_currentIdx(idx),
-                                m_extents(alpaka::extent::getExtentVec(view)),
-                                m_pitchBytes(alpaka::mem::view::getPitchBytesVec(view))
-                        {}
-
-                        //-----------------------------------------------------------------------------
-                        ALPAKA_FN_HOST IteratorView(
-                            TView & view) :
-                                IteratorView(view, 0)
-                        {}
-
-                        //-----------------------------------------------------------------------------
-                        ALPAKA_FN_HOST_ACC auto operator++()
-                        -> IteratorView&
-                        {
-                            ++m_currentIdx;
-                            return *this;
-                        }
-
-                        //-----------------------------------------------------------------------------
-                        ALPAKA_FN_HOST_ACC auto operator--()
-                        -> IteratorView&
-                        {
-                            --m_currentIdx;
-                            return *this;
-                        }
-
-                        //-----------------------------------------------------------------------------
-                        ALPAKA_FN_HOST_ACC auto operator++(
-                            int)
-                        -> IteratorView
-                        {
-                            IteratorView iterCopy = *this;
-                            m_currentIdx++;
-                            return iterCopy;
-                        }
-
-                        //-----------------------------------------------------------------------------
-                        ALPAKA_FN_HOST_ACC auto operator--(
-                            int)
-                        -> IteratorView
-                        {
-                            IteratorView iterCopy = *this;
-                            m_currentIdx--;
-                            return iterCopy;
-                        }
-
-                        //-----------------------------------------------------------------------------
-                        template<typename TIter>
-                        ALPAKA_FN_HOST_ACC auto operator==(
-                            TIter &other) const
-                        -> bool
-                        {
-                            return m_currentIdx == other.m_currentIdx;
-                        }
-
-                        //-----------------------------------------------------------------------------
-                        template<typename TIter>
-                        ALPAKA_FN_HOST_ACC auto operator!=(
-                            TIter &other) const
-                        -> bool
-                        {
-                            return m_currentIdx != other.m_currentIdx;
-                        }
-
-                        //-----------------------------------------------------------------------------
-                        ALPAKA_FN_HOST_ACC auto operator*() const
-                        -> Elem &
-                        {
-                            using Dim1 = alpaka::dim::DimInt<1>;
-                            using DimMin1 = alpaka::dim::DimInt<Dim::value - 1u>;
-
-                            vec::Vec<Dim1, Idx> const currentIdxDim1{m_currentIdx};
-                            vec::Vec<Dim, Idx> const currentIdxDimx(alpaka::idx::mapIdx<Dim::value>(currentIdxDim1, m_extents));
-
-                            // [pz, py, px] -> [py, px]
-                            auto const pitchWithoutOutermost(vec::subVecEnd<DimMin1>(m_pitchBytes));
-                            // [ElemSize]
-                            vec::Vec<Dim1, Idx> const elementSizeVec(static_cast<Idx>(sizeof(Elem)));
-                            // [py, px] ++ [ElemSize] -> [py, px, ElemSize]
-                            vec::Vec<Dim, Idx> const dstPitchBytes(vec::concat(pitchWithoutOutermost, elementSizeVec));
-                            // [py, px, ElemSize] [z, y, x] -> [py*z, px*y, ElemSize*x]
-                            auto const dimensionalOffsetsInByte(currentIdxDimx * dstPitchBytes);
-                            // sum{[py*z, px*y, ElemSize*x]} -> offset in byte
-                            auto const offsetInByte(dimensionalOffsetsInByte.foldrAll(
-                                [](Idx a, Idx b)
-                                {
-                                    return static_cast<Idx>(a + b);
-                                }));
-
-                            using Byte = typename MimicConst<std::uint8_t, Elem>::type;
-                            Byte* ptr(reinterpret_cast<Byte*>(m_nativePtr) + offsetInByte);
+                //-----------------------------------------------------------------------------
+                ALPAKA_FN_HOST IteratorView(TView& view) : IteratorView(view, 0)
+                {
+                }
+
+                //-----------------------------------------------------------------------------
+                ALPAKA_FN_HOST_ACC auto operator++() -> IteratorView&
+                {
+                    ++m_currentIdx;
+                    return *this;
+                }
+
+                //-----------------------------------------------------------------------------
+                ALPAKA_FN_HOST_ACC auto operator--() -> IteratorView&
+                {
+                    --m_currentIdx;
+                    return *this;
+                }
+
+                //-----------------------------------------------------------------------------
+                ALPAKA_FN_HOST_ACC auto operator++(int) -> IteratorView
+                {
+                    IteratorView iterCopy = *this;
+                    m_currentIdx++;
+                    return iterCopy;
+                }
+
+                //-----------------------------------------------------------------------------
+                ALPAKA_FN_HOST_ACC auto operator--(int) -> IteratorView
+                {
+                    IteratorView iterCopy = *this;
+                    m_currentIdx--;
+                    return iterCopy;
+                }
+
+                //-----------------------------------------------------------------------------
+                template<typename TIter>
+                ALPAKA_FN_HOST_ACC auto operator==(TIter& other) const -> bool
+                {
+                    return m_currentIdx == other.m_currentIdx;
+                }
+
+                //-----------------------------------------------------------------------------
+                template<typename TIter>
+                ALPAKA_FN_HOST_ACC auto operator!=(TIter& other) const -> bool
+                {
+                    return m_currentIdx != other.m_currentIdx;
+                }
+
+                //-----------------------------------------------------------------------------
+                ALPAKA_FN_HOST_ACC auto operator*() const -> Elem&
+                {
+                    using Dim1 = alpaka::DimInt<1>;
+                    using DimMin1 = alpaka::DimInt<Dim::value - 1u>;
+
+                    Vec<Dim1, Idx> const currentIdxDim1{m_currentIdx};
+                    Vec<Dim, Idx> const currentIdxDimx(alpaka::mapIdx<Dim::value>(currentIdxDim1, m_extents));
+
+                    // [pz, py, px] -> [py, px]
+                    auto const pitchWithoutOutermost(subVecEnd<DimMin1>(m_pitchBytes));
+                    // [ElemSize]
+                    Vec<Dim1, Idx> const elementSizeVec(static_cast<Idx>(sizeof(Elem)));
+                    // [py, px] ++ [ElemSize] -> [py, px, ElemSize]
+                    Vec<Dim, Idx> const dstPitchBytes(concatVec(pitchWithoutOutermost, elementSizeVec));
+                    // [py, px, ElemSize] [z, y, x] -> [py*z, px*y, ElemSize*x]
+                    auto const dimensionalOffsetsInByte(currentIdxDimx * dstPitchBytes);
+                    // sum{[py*z, px*y, ElemSize*x]} -> offset in byte
+                    auto const offsetInByte(dimensionalOffsetsInByte.foldrAll(std::plus<Idx>()));
+
+                    using Byte = MimicConst<std::uint8_t, Elem>;
+                    Byte* ptr(reinterpret_cast<Byte*>(m_nativePtr) + offsetInByte);
 
 #if 0
-                            std::cout
-                                << " i1: " << currentIdxDim1
-                                << " in: " << currentIdxDimx
-                                << " dpb: " << dstPitchBytes
-                                << " offb: " << offsetInByte
-                                << " ptr: " << reinterpret_cast<void const *>(ptr)
-                                << " v: " << *reinterpret_cast<Elem *>(ptr)
-                                << std::endl;
-#endif
-                            return *reinterpret_cast<Elem *>(ptr);
-                        }
-
-                    private:
-                        Elem * const m_nativePtr;
-                        Idx m_currentIdx;
-                        vec::Vec<Dim, Idx> const m_extents;
-                        vec::Vec<Dim, Idx> const m_pitchBytes;
-                    };
-#if BOOST_COMP_GNUC
-    #pragma GCC diagnostic pop
+                    std::cout
+                        << " i1: " << currentIdxDim1
+                        << " in: " << currentIdxDimx
+                        << " dpb: " << dstPitchBytes
+                        << " offb: " << offsetInByte
+                        << " ptr: " << reinterpret_cast<void const *>(ptr)
+                        << " v: " << *reinterpret_cast<Elem *>(ptr)
+                        << std::endl;
 #endif
-
-                    //#############################################################################
-                    template<
-                        typename TView,
-                        typename TSfinae = void>
-                    struct Begin
-                    {
-                        //-----------------------------------------------------------------------------
-                        ALPAKA_FN_HOST static auto begin(
-                            TView & view)
-                        -> IteratorView<TView>
-                        {
-                            return IteratorView<TView>(view);
-                        }
-                    };
-
-                    //#############################################################################
-                    template<
-                        typename TView,
-                        typename TSfinae = void>
-                    struct End
-                    {
-                        //-----------------------------------------------------------------------------
-                        ALPAKA_FN_HOST static auto end(
-                            TView & view)
-                        -> IteratorView<TView>
-                        {
-                            auto extents = alpaka::extent::getExtentVec(view);
-                            return IteratorView<TView>(view, extents.prod());
-                        }
-                    };
+                    return *reinterpret_cast<Elem*>(ptr);
                 }
 
-                //#############################################################################
-                template<
-                    typename TView>
-                using Iterator = traits::IteratorView<TView>;
+            private:
+                Elem* const m_nativePtr;
+                Idx m_currentIdx;
+                Vec<Dim, Idx> const m_extents;
+                Vec<Dim, Idx> const m_pitchBytes;
+            };
+#if BOOST_COMP_GNUC
+#    pragma GCC diagnostic pop
+#endif
 
+            //#############################################################################
+            template<typename TView, typename TSfinae = void>
+            struct Begin
+            {
                 //-----------------------------------------------------------------------------
-                template<
-                    typename TView>
-                ALPAKA_FN_HOST auto begin(
-                    TView & view)
-                -> Iterator<TView>
+                ALPAKA_FN_HOST static auto begin(TView& view) -> IteratorView<TView>
                 {
-                    return traits::Begin<TView>::begin(view);
+                    return IteratorView<TView>(view);
                 }
+            };
 
+            //#############################################################################
+            template<typename TView, typename TSfinae = void>
+            struct End
+            {
                 //-----------------------------------------------------------------------------
-                template<
-                    typename TView>
-                ALPAKA_FN_HOST auto end(
-                    TView & view)
-                -> Iterator<TView>
+                ALPAKA_FN_HOST static auto end(TView& view) -> IteratorView<TView>
                 {
-                    return traits::End<TView>::end(view);
+                    auto extents = alpaka::extent::getExtentVec(view);
+                    return IteratorView<TView>(view, extents.prod());
                 }
-            }
+            };
+        } // namespace traits
+
+        //#############################################################################
+        template<typename TView>
+        using Iterator = traits::IteratorView<TView>;
+
+        //-----------------------------------------------------------------------------
+        template<typename TView>
+        ALPAKA_FN_HOST auto begin(TView& view) -> Iterator<TView>
+        {
+            return traits::Begin<TView>::begin(view);
+        }
+
+        //-----------------------------------------------------------------------------
+        template<typename TView>
+        ALPAKA_FN_HOST auto end(TView& view) -> Iterator<TView>
+        {
+            return traits::End<TView>::end(view);
         }
-    }
-}
+    } // namespace test
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/test/common/include/alpaka/test/mem/view/ViewTest.hpp b/thirdParty/cupla/alpaka/test/common/include/alpaka/test/mem/view/ViewTest.hpp
index 7844aac565..eddc7bcc54 100644
--- a/thirdParty/cupla/alpaka/test/common/include/alpaka/test/mem/view/ViewTest.hpp
+++ b/thirdParty/cupla/alpaka/test/common/include/alpaka/test/mem/view/ViewTest.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Axel Huebl, Benjamin Worpitz, Matthias Werner
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -10,13 +10,13 @@
 #pragma once
 
 #include <alpaka/alpaka.hpp>
-
 #include <alpaka/test/KernelExecutionFixture.hpp>
 #include <alpaka/test/mem/view/Iterator.hpp>
 
 #include <catch2/catch.hpp>
 
 #include <numeric>
+#include <type_traits>
 
 
 namespace alpaka
@@ -26,353 +26,294 @@ namespace alpaka
     namespace test
     {
         //-----------------------------------------------------------------------------
-        //! The test mem specifics.
-        namespace mem
+        template<typename TElem, typename TDim, typename TIdx, typename TDev, typename TView>
+        ALPAKA_FN_HOST auto testViewImmutable(
+            TView const& view,
+            TDev const& dev,
+            alpaka::Vec<TDim, TIdx> const& extent,
+            alpaka::Vec<TDim, TIdx> const& offset) -> void
         {
             //-----------------------------------------------------------------------------
-            namespace view
+            // alpaka::traits::DevType
             {
-                //-----------------------------------------------------------------------------
-                template<
-                    typename TElem,
-                    typename TDim,
-                    typename TIdx,
-                    typename TDev,
-                    typename TView>
-                ALPAKA_FN_HOST auto testViewImmutable(
-                    TView const & view,
-                    TDev const & dev,
-                    alpaka::vec::Vec<TDim, TIdx> const & extent,
-                    alpaka::vec::Vec<TDim, TIdx> const & offset)
-                -> void
-                {
-                    //-----------------------------------------------------------------------------
-                    // alpaka::dev::traits::DevType
-                    {
-                        static_assert(
-                            std::is_same<alpaka::dev::Dev<TView>, TDev>::value,
-                            "The device type of the view has to be equal to the specified one.");
-                    }
+                static_assert(
+                    std::is_same<alpaka::Dev<TView>, TDev>::value,
+                    "The device type of the view has to be equal to the specified one.");
+            }
 
-                    //-----------------------------------------------------------------------------
-                    // alpaka::dev::traits::GetDev
-                    {
-                        REQUIRE(
-                            dev == alpaka::dev::getDev(view));
-                    }
+            //-----------------------------------------------------------------------------
+            // alpaka::traits::GetDev
+            {
+                REQUIRE(dev == alpaka::getDev(view));
+            }
 
-                    //-----------------------------------------------------------------------------
-                    // alpaka::dim::traits::DimType
-                    {
-                        static_assert(
-                            alpaka::dim::Dim<TView>::value == TDim::value,
-                            "The dimensionality of the view has to be equal to the specified one.");
-                    }
+            //-----------------------------------------------------------------------------
+            // alpaka::traits::DimType
+            {
+                static_assert(
+                    alpaka::Dim<TView>::value == TDim::value,
+                    "The dimensionality of the view has to be equal to the specified one.");
+            }
 
-                    //-----------------------------------------------------------------------------
-                    // alpaka::elem::traits::ElemType
-                    {
-                        static_assert(
-                            std::is_same<alpaka::elem::Elem<TView>, TElem>::value,
-                            "The element type of the view has to be equal to the specified one.");
-                    }
+            //-----------------------------------------------------------------------------
+            // alpaka::traits::ElemType
+            {
+                static_assert(
+                    std::is_same<alpaka::Elem<TView>, TElem>::value,
+                    "The element type of the view has to be equal to the specified one.");
+            }
 
-                    //-----------------------------------------------------------------------------
-                    // alpaka::extent::traits::GetExtent
-                    {
-                        REQUIRE(
-                            extent ==
-                            alpaka::extent::getExtentVec(view));
-                    }
+            //-----------------------------------------------------------------------------
+            // alpaka::extent::traits::GetExtent
+            {
+                REQUIRE(extent == alpaka::extent::getExtentVec(view));
+            }
 
-                    //-----------------------------------------------------------------------------
-                    // alpaka::mem::view::traits::GetPitchBytes
-                    {
-                        // The pitches have to be at least as large as the values we calculate here.
-                        auto pitchMinimum(alpaka::vec::Vec<alpaka::dim::DimInt<TDim::value + 1u>, TIdx>::ones());
-                        // Initialize the pitch between two elements of the X dimension ...
-                        pitchMinimum[TDim::value] = sizeof(TElem);
-                        // ... and fill all the other dimensions.
-                        for(TIdx i = TDim::value; i > static_cast<TIdx>(0u); --i)
-                        {
-                            pitchMinimum[i-1] = extent[i-1] * pitchMinimum[i];
-                        }
-
-                        auto const pitchView(alpaka::mem::view::getPitchBytesVec(view));
-
-                        for(TIdx i = TDim::value; i > static_cast<TIdx>(0u); --i)
-                        {
-                            REQUIRE(
-                                pitchView[i-1] >=
-                                pitchMinimum[i-1]);
-                        }
-                    }
+            //-----------------------------------------------------------------------------
+            // alpaka::traits::GetPitchBytes
+            {
+                // The pitches have to be at least as large as the values we calculate here.
+                auto pitchMinimum(alpaka::Vec<alpaka::DimInt<TDim::value + 1u>, TIdx>::ones());
+                // Initialize the pitch between two elements of the X dimension ...
+                pitchMinimum[TDim::value] = sizeof(TElem);
+                // ... and fill all the other dimensions.
+                for(TIdx i = TDim::value; i > static_cast<TIdx>(0u); --i)
+                {
+                    pitchMinimum[i - 1] = extent[i - 1] * pitchMinimum[i];
+                }
 
-                    //-----------------------------------------------------------------------------
-                    // alpaka::mem::view::traits::GetPtrNative
-                    {
-                        // The view is a const& so the pointer has to point to a const value.
-                        using NativePtr = decltype(alpaka::mem::view::getPtrNative(view));
-                        static_assert(
-                            std::is_pointer<NativePtr>::value,
-                            "The value returned by getPtrNative has to be a pointer.");
-                        static_assert(
-                            std::is_const<typename std::remove_pointer<NativePtr>::type>::value,
-                            "The value returned by getPtrNative has to be const when the view is const.");
-
-                        if(alpaka::extent::getExtentProduct(view) != static_cast<TIdx>(0u))
-                        {
-                            // The pointer is only required to be non-null when the extent is > 0.
-                            TElem const * const invalidPtr(nullptr);
-                            REQUIRE(
-                                invalidPtr !=
-                                alpaka::mem::view::getPtrNative(view));
-                        }
-                        else
-                        {
-                            // When the extent is 0, the pointer is undefined but it should still be possible get it.
-                            alpaka::mem::view::getPtrNative(view);
-                        }
-                    }
+                auto const pitchView(alpaka::getPitchBytesVec(view));
 
-                    //-----------------------------------------------------------------------------
-                    // alpaka::offset::traits::GetOffset
-                    {
-                        REQUIRE(
-                            offset ==
-                            alpaka::offset::getOffsetVec(view));
-                    }
+                for(TIdx i = TDim::value; i > static_cast<TIdx>(0u); --i)
+                {
+                    REQUIRE(pitchView[i - 1] >= pitchMinimum[i - 1]);
+                }
+            }
 
-                    //-----------------------------------------------------------------------------
-                    // alpaka::idx::traits::IdxType
-                    {
-                        static_assert(
-                            std::is_same<alpaka::idx::Idx<TView>, TIdx>::value,
-                            "The idx type of the view has to be equal to the specified one.");
-                    }
+            //-----------------------------------------------------------------------------
+            // alpaka::traits::GetPtrNative
+            {
+                // The view is a const& so the pointer has to point to a const value.
+                using NativePtr = decltype(alpaka::getPtrNative(view));
+                static_assert(
+                    std::is_pointer<NativePtr>::value,
+                    "The value returned by getPtrNative has to be a pointer.");
+                static_assert(
+                    std::is_const<std::remove_pointer_t<NativePtr>>::value,
+                    "The value returned by getPtrNative has to be const when the view is const.");
+
+                if(alpaka::extent::getExtentProduct(view) != static_cast<TIdx>(0u))
+                {
+                    // The pointer is only required to be non-null when the extent is > 0.
+                    TElem const* const invalidPtr(nullptr);
+                    REQUIRE(invalidPtr != alpaka::getPtrNative(view));
+                }
+                else
+                {
+                    // When the extent is 0, the pointer is undefined but it should still be possible to get it.
+                    alpaka::getPtrNative(view);
                 }
+            }
+
+            //-----------------------------------------------------------------------------
+            // alpaka::traits::GetOffset
+            {
+                REQUIRE(offset == alpaka::getOffsetVec(view));
+            }
 
-                //#############################################################################
-                //! Compares element-wise that all bytes are set to the same value.
-                struct VerifyBytesSetKernel
+            //-----------------------------------------------------------------------------
+            // alpaka::traits::IdxType
+            {
+                static_assert(
+                    std::is_same<alpaka::Idx<TView>, TIdx>::value,
+                    "The idx type of the view has to be equal to the specified one.");
+            }
+        }
+
+        //#############################################################################
+        //! Verifies that every byte of every element equals the given byte value.
+        struct VerifyBytesSetKernel
+        {
+            ALPAKA_NO_HOST_ACC_WARNING
+            template<typename TAcc, typename TIter>
+            ALPAKA_FN_ACC void operator()(
+                TAcc const& acc,
+                bool* success,
+                TIter const& begin,
+                TIter const& end,
+                std::uint8_t const& byte) const
+            {
+                alpaka::ignore_unused(acc);
+
+                constexpr auto elemSizeInByte = sizeof(decltype(*begin));
+                for(auto it = begin; it != end; ++it)
                 {
-                    ALPAKA_NO_HOST_ACC_WARNING
-                    template<
-                        typename TAcc,
-                        typename TIter>
-                    ALPAKA_FN_ACC void operator()(
-                        TAcc const & acc,
-                        bool * success,
-                        TIter const & begin,
-                        TIter const & end,
-                        std::uint8_t const & byte) const
+                    auto const& elem = *it;
+                    auto const pBytes = reinterpret_cast<std::uint8_t const*>(&elem);
+                    for(std::size_t i = 0u; i < elemSizeInByte; ++i)
                     {
-                        alpaka::ignore_unused(acc);
-
-                        constexpr auto elemSizeInByte = sizeof(decltype(*begin));
-                        for(auto it = begin; it != end; ++it)
-                        {
-                            auto const& elem = *it;
-                            auto const pBytes = reinterpret_cast<std::uint8_t const *>(&elem);
-                            for(std::size_t i = 0u; i < elemSizeInByte; ++i)
-                            {
-                                ALPAKA_CHECK(*success, pBytes[i] == byte);
-                            }
-                        }
+                        ALPAKA_CHECK(*success, pBytes[i] == byte);
                     }
-                };
-                //-----------------------------------------------------------------------------
-                template<
-                    typename TAcc,
-                    typename TView>
-                ALPAKA_FN_HOST auto verifyBytesSet(
-                    TView const & view,
-                    std::uint8_t const & byte)
-                -> void
-                {
-                    using Dim = alpaka::dim::Dim<TView>;
-                    using Idx = alpaka::idx::Idx<TView>;
+                }
+            }
+        };
+        //-----------------------------------------------------------------------------
+        template<typename TAcc, typename TView>
+        ALPAKA_FN_HOST auto verifyBytesSet(TView const& view, std::uint8_t const& byte) -> void
+        {
+            using Dim = alpaka::Dim<TView>;
+            using Idx = alpaka::Idx<TView>;
 
-                    alpaka::test::KernelExecutionFixture<TAcc> fixture(
-                        alpaka::vec::Vec<Dim, Idx>::ones());
+            alpaka::test::KernelExecutionFixture<TAcc> fixture(alpaka::Vec<Dim, Idx>::ones());
 
-                    VerifyBytesSetKernel verifyBytesSet;
+            VerifyBytesSetKernel verifyBytesSet;
 
-                    REQUIRE(
-                        fixture(
-                            verifyBytesSet,
-                            alpaka::test::mem::view::begin(view),
-                            alpaka::test::mem::view::end(view),
-                            byte));
-                }
+            REQUIRE(fixture(verifyBytesSet, alpaka::test::begin(view), alpaka::test::end(view), byte));
+        }
 
-                //#############################################################################
-                //! Compares iterators element-wise
+        //#############################################################################
+        //! Compares iterators element-wise
 #if BOOST_COMP_GNUC
-    #pragma GCC diagnostic push
-    #pragma GCC diagnostic ignored "-Wfloat-equal"  // "comparing floating point with == or != is unsafe"
+#    pragma GCC diagnostic push
+#    pragma GCC diagnostic ignored "-Wfloat-equal" // "comparing floating point with == or != is unsafe"
 #endif
-                struct VerifyViewsEqualKernel
-                {
-                    ALPAKA_NO_HOST_ACC_WARNING
-                    template<
-                        typename TAcc,
-                        typename TIterA,
-                        typename TIterB>
-                    ALPAKA_FN_ACC void operator()(
-                        TAcc const & acc,
-                        bool * success,
-                        TIterA beginA,
-                        TIterA const & endA,
-                        TIterB beginB) const
-                    {
-                        alpaka::ignore_unused(acc);
+        struct VerifyViewsEqualKernel
+        {
+            ALPAKA_NO_HOST_ACC_WARNING
+            template<typename TAcc, typename TIterA, typename TIterB>
+            ALPAKA_FN_ACC void operator()(
+                TAcc const& acc,
+                bool* success,
+                TIterA beginA,
+                TIterA const& endA,
+                TIterB beginB) const
+            {
+                alpaka::ignore_unused(acc);
 
-                        for(; beginA != endA; ++beginA, ++beginB)
-                        {
+                for(; beginA != endA; ++beginA, ++beginB)
+                {
 #if BOOST_COMP_CLANG
-    #pragma clang diagnostic push
-    #pragma clang diagnostic ignored "-Wfloat-equal" // "comparing floating point with == or != is unsafe"
+#    pragma clang diagnostic push
+#    pragma clang diagnostic ignored "-Wfloat-equal" // "comparing floating point with == or != is unsafe"
 #endif
-                            ALPAKA_CHECK(*success, *beginA == *beginB);
+                    ALPAKA_CHECK(*success, *beginA == *beginB);
 #if BOOST_COMP_CLANG
-    #pragma clang diagnostic pop
+#    pragma clang diagnostic pop
 #endif
-                        }
-                    }
-                };
+                }
+            }
+        };
 #if BOOST_COMP_GNUC
-    #pragma GCC diagnostic pop
+#    pragma GCC diagnostic pop
 #endif
 
-                //-----------------------------------------------------------------------------
-                template<
-                    typename TAcc,
-                    typename TViewB,
-                    typename TViewA>
-                ALPAKA_FN_HOST auto verifyViewsEqual(
-                    TViewA const & viewA,
-                    TViewB const & viewB)
-                -> void
-                {
-                    using DimA = alpaka::dim::Dim<TViewA>;
-                    using DimB = alpaka::dim::Dim<TViewB>;
-                    static_assert(DimA::value == DimB::value, "viewA and viewB are required to have identical Dim");
-                    using IdxA = alpaka::idx::Idx<TViewA>;
-                    using IdxB = alpaka::idx::Idx<TViewB>;
-                    static_assert(std::is_same<IdxA, IdxB>::value, "viewA and viewB are required to have identical Idx");
-
-                    alpaka::test::KernelExecutionFixture<TAcc> fixture(
-                        alpaka::vec::Vec<DimA, IdxA>::ones());
-
-                    VerifyViewsEqualKernel verifyViewsEqualKernel;
-
-                    REQUIRE(
-                        fixture(
-                            verifyViewsEqualKernel,
-                            alpaka::test::mem::view::begin(viewA),
-                            alpaka::test::mem::view::end(viewA),
-                            alpaka::test::mem::view::begin(viewB)));
-                }
+        //-----------------------------------------------------------------------------
+        template<typename TAcc, typename TViewB, typename TViewA>
+        ALPAKA_FN_HOST auto verifyViewsEqual(TViewA const& viewA, TViewB const& viewB) -> void
+        {
+            using DimA = alpaka::Dim<TViewA>;
+            using DimB = alpaka::Dim<TViewB>;
+            static_assert(DimA::value == DimB::value, "viewA and viewB are required to have identical Dim");
+            using IdxA = alpaka::Idx<TViewA>;
+            using IdxB = alpaka::Idx<TViewB>;
+            static_assert(std::is_same<IdxA, IdxB>::value, "viewA and viewB are required to have identical Idx");
+
+            alpaka::test::KernelExecutionFixture<TAcc> fixture(alpaka::Vec<DimA, IdxA>::ones());
+
+            VerifyViewsEqualKernel verifyViewsEqualKernel;
+
+            REQUIRE(fixture(
+                verifyViewsEqualKernel,
+                alpaka::test::begin(viewA),
+                alpaka::test::end(viewA),
+                alpaka::test::begin(viewB)));
+        }
 
-                //-----------------------------------------------------------------------------
-                //! Fills the given view with increasing values starting at 0.
-                template<
-                    typename TView,
-                    typename TQueue>
-                ALPAKA_FN_HOST auto iotaFillView(
-                    TQueue & queue,
-                    TView & view)
-                -> void
-                {
-                    using Dim = alpaka::dim::Dim<TView>;
-                    using Idx = alpaka::idx::Idx<TView>;
+        //-----------------------------------------------------------------------------
+        //! Fills the given view with increasing values starting at 0.
+        template<typename TView, typename TQueue>
+        ALPAKA_FN_HOST auto iotaFillView(TQueue& queue, TView& view) -> void
+        {
+            using Dim = alpaka::Dim<TView>;
+            using Idx = alpaka::Idx<TView>;
 
-                    using DevHost = alpaka::dev::DevCpu;
-                    using PltfHost = alpaka::pltf::Pltf<DevHost>;
+            using DevHost = alpaka::DevCpu;
+            using PltfHost = alpaka::Pltf<DevHost>;
 
-                    using Elem = alpaka::elem::Elem<TView>;
+            using Elem = alpaka::Elem<TView>;
 
-                    using ViewPlainPtr = alpaka::mem::view::ViewPlainPtr<DevHost, Elem, Dim, Idx>;
+            using ViewPlainPtr = alpaka::ViewPlainPtr<DevHost, Elem, Dim, Idx>;
 
-                    DevHost const devHost(alpaka::pltf::getDevByIdx<PltfHost>(0));
+            DevHost const devHost(alpaka::getDevByIdx<PltfHost>(0));
 
-                    auto const extent(alpaka::extent::getExtentVec(view));
+            auto const extent(alpaka::extent::getExtentVec(view));
 
-                    // Init buf with increasing values
-                    std::vector<Elem> v(static_cast<std::size_t>(extent.prod()), static_cast<Elem>(0));
-                    std::iota(v.begin(), v.end(), static_cast<Elem>(0));
-                    ViewPlainPtr plainBuf(v.data(), devHost, extent);
+            // Init buf with increasing values
+            std::vector<Elem> v(static_cast<std::size_t>(extent.prod()), static_cast<Elem>(0));
+            std::iota(v.begin(), v.end(), static_cast<Elem>(0));
+            ViewPlainPtr plainBuf(v.data(), devHost, extent);
 
-                    // Copy the generated content into the given view.
-                    alpaka::mem::view::copy(queue, view, plainBuf, extent);
+            // Copy the generated content into the given view.
+            alpaka::memcpy(queue, view, plainBuf, extent);
 
-                    alpaka::wait::wait(queue);
-                }
+            alpaka::wait(queue);
+        }
 
-                //-----------------------------------------------------------------------------
-                template<
-                    typename TAcc,
-                    typename TView,
-                    typename TQueue>
-                ALPAKA_FN_HOST auto testViewMutable(
-                    TQueue & queue,
-                    TView & view)
-                -> void
-                {
-                    //-----------------------------------------------------------------------------
-                    // alpaka::mem::view::traits::GetPtrNative
-                    {
-                        // The view is a non-const so the pointer has to point to a non-const value.
-                        using NativePtr = decltype(alpaka::mem::view::getPtrNative(view));
-                        static_assert(
-                            std::is_pointer<NativePtr>::value,
-                            "The value returned by getPtrNative has to be a pointer.");
-                        static_assert(
-                            !std::is_const<typename std::remove_pointer<NativePtr>::type>::value,
-                            "The value returned by getPtrNative has to be non-const when the view is non-const.");
-                    }
+        //-----------------------------------------------------------------------------
+        template<typename TAcc, typename TView, typename TQueue>
+        ALPAKA_FN_HOST auto testViewMutable(TQueue& queue, TView& view) -> void
+        {
+            //-----------------------------------------------------------------------------
+            // alpaka::traits::GetPtrNative
+            {
+                // The view is non-const, so the returned pointer has to point to a non-const value.
+                using NativePtr = decltype(alpaka::getPtrNative(view));
+                static_assert(
+                    std::is_pointer<NativePtr>::value,
+                    "The value returned by getPtrNative has to be a pointer.");
+                static_assert(
+                    !std::is_const<std::remove_pointer_t<NativePtr>>::value,
+                    "The value returned by getPtrNative has to be non-const when the view is non-const.");
+            }
 
-                    auto const extent(alpaka::extent::getExtentVec(view));
+            auto const extent(alpaka::extent::getExtentVec(view));
 
-                    //-----------------------------------------------------------------------------
-                    // alpaka::mem::view::set
-                    {
-                        std::uint8_t const byte(static_cast<uint8_t>(42u));
-                        alpaka::mem::view::set(queue, view, byte, extent);
-                        alpaka::wait::wait(queue);
-                        verifyBytesSet<TAcc>(view, byte);
-                    }
+            //-----------------------------------------------------------------------------
+            // alpaka::memset
+            {
+                std::uint8_t const byte(static_cast<uint8_t>(42u));
+                alpaka::memset(queue, view, byte, extent);
+                alpaka::wait(queue);
+                verifyBytesSet<TAcc>(view, byte);
+            }
 
-                    //-----------------------------------------------------------------------------
-                    // alpaka::mem::view::copy
-                    {
-                        using Elem = alpaka::elem::Elem<TView>;
-                        using Idx = alpaka::idx::Idx<TView>;
-
-                        auto const devAcc = alpaka::dev::getDev(view);
-
-                        //-----------------------------------------------------------------------------
-                        // alpaka::mem::view::copy into given view
-                        {
-                            auto srcBufAcc(alpaka::mem::buf::alloc<Elem, Idx>(devAcc, extent));
-                            iotaFillView(queue, srcBufAcc);
-                            alpaka::mem::view::copy(queue, view, srcBufAcc, extent);
-                            alpaka::wait::wait(queue);
-                            verifyViewsEqual<TAcc>(view, srcBufAcc);
-                        }
-
-                        //-----------------------------------------------------------------------------
-                        // alpaka::mem::view::copy from given view
-                        {
-                            auto dstBufAcc(alpaka::mem::buf::alloc<Elem, Idx>(devAcc, extent));
-                            alpaka::mem::view::copy(queue, dstBufAcc, view, extent);
-                            alpaka::wait::wait(queue);
-                            verifyViewsEqual<TAcc>(dstBufAcc, view);
-                        }
-                    }
+            //-----------------------------------------------------------------------------
+            // alpaka::memcpy
+            {
+                using Elem = alpaka::Elem<TView>;
+                using Idx = alpaka::Idx<TView>;
+
+                auto const devAcc = alpaka::getDev(view);
+
+                //-----------------------------------------------------------------------------
+                // alpaka::memcpy into given view
+                {
+                    auto srcBufAcc(alpaka::allocBuf<Elem, Idx>(devAcc, extent));
+                    iotaFillView(queue, srcBufAcc);
+                    alpaka::memcpy(queue, view, srcBufAcc, extent);
+                    alpaka::wait(queue);
+                    verifyViewsEqual<TAcc>(view, srcBufAcc);
+                }
+
+                //-----------------------------------------------------------------------------
+                // alpaka::memcpy from given view
+                {
+                    auto dstBufAcc(alpaka::allocBuf<Elem, Idx>(devAcc, extent));
+                    alpaka::memcpy(queue, dstBufAcc, view, extent);
+                    alpaka::wait(queue);
+                    verifyViewsEqual<TAcc>(dstBufAcc, view);
                 }
             }
         }
-    }
-}
+    } // namespace test
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/test/common/include/alpaka/test/queue/Queue.hpp b/thirdParty/cupla/alpaka/test/common/include/alpaka/test/queue/Queue.hpp
index 95a950a95f..82a61a3203 100644
--- a/thirdParty/cupla/alpaka/test/common/include/alpaka/test/queue/Queue.hpp
+++ b/thirdParty/cupla/alpaka/test/common/include/alpaka/test/queue/Queue.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Benjamin Worpitz, Matthias Werner
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -17,175 +17,141 @@ namespace alpaka
     //! The test specifics.
     namespace test
     {
-        //-----------------------------------------------------------------------------
-        //! The test queue specifics.
-        namespace queue
+        namespace traits
         {
-            namespace traits
+            //#############################################################################
+            //! The default queue type trait for devices.
+            template<typename TDev, typename TSfinae = void>
+            struct DefaultQueueType;
+
+            //#############################################################################
+            //! The default queue type trait specialization for the CPU device.
+            template<>
+            struct DefaultQueueType<alpaka::DevCpu>
             {
-                //#############################################################################
-                //! The default queue type trait for devices.
-                template<
-                    typename TDev,
-                    typename TSfinae = void>
-                struct DefaultQueueType;
-
-                //#############################################################################
-                //! The default queue type trait specialization for the CPU device.
-                template<>
-                struct DefaultQueueType<
-                    alpaka::dev::DevCpu>
-                {
-#if (ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL)
-                    using type = alpaka::queue::QueueCpuBlocking;
+#if(ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL)
+                using type = alpaka::QueueCpuBlocking;
 #else
-                    using type = alpaka::queue::QueueCpuNonBlocking;
+                using type = alpaka::QueueCpuNonBlocking;
 #endif
-                };
+            };
 
-#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
+#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
 
-#if !BOOST_LANG_CUDA
-    #error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
-#endif
-                //#############################################################################
-                //! The default queue type trait specialization for the CUDA device.
-                template<>
-                struct DefaultQueueType<
-                    alpaka::dev::DevCudaRt>
-                {
-#if (ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL)
-                    using type = alpaka::queue::QueueCudaRtBlocking;
-#else
-                    using type = alpaka::queue::QueueCudaRtNonBlocking;
-#endif
-                };
+            //#############################################################################
+            //! The default queue type trait specialization for the CUDA/HIP device.
+            template<>
+            struct DefaultQueueType<alpaka::DevUniformCudaHipRt>
+            {
+#    if(ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL)
+                using type = alpaka::QueueUniformCudaHipRtBlocking;
+#    else
+                using type = alpaka::QueueUniformCudaHipRtNonBlocking;
+#    endif
+            };
 #endif
+        } // namespace traits
+        //#############################################################################
+        //! The queue type that should be used for the given accelerator.
+        template<typename TAcc>
+        using DefaultQueue = typename traits::DefaultQueueType<TAcc>::type;
 
-#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
-
-#if !BOOST_LANG_HIP
-    #error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
-#endif
-                //#############################################################################
-                //! The default queue type trait specialization for the HIP device.
-                template<>
-                struct DefaultQueueType<
-                    alpaka::dev::DevHipRt>
-                {
-#if (ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL)
-                    using type = alpaka::queue::QueueHipRtBlocking;
-#else
-                    using type = alpaka::queue::QueueHipRtNonBlocking;
-#endif
-                };
-#endif
+        namespace traits
+        {
+            //#############################################################################
+            //! The blocking queue trait.
+            template<typename TQueue, typename TSfinae = void>
+            struct IsBlockingQueue;
 
-            }
             //#############################################################################
-            //! The queue type that should be used for the given accelerator.
-            template<
-                typename TAcc>
-            using DefaultQueue = typename traits::DefaultQueueType<TAcc>::type;
+            //! The blocking queue trait specialization for a blocking CPU queue.
+            template<typename TDev>
+            struct IsBlockingQueue<alpaka::QueueGenericThreadsBlocking<TDev>>
+            {
+                static constexpr bool value = true;
+            };
 
-            namespace traits
+            //#############################################################################
+            //! The blocking queue trait specialization for a non-blocking CPU queue.
+            template<typename TDev>
+            struct IsBlockingQueue<alpaka::QueueGenericThreadsNonBlocking<TDev>>
             {
-                //#############################################################################
-                //! The blocking queue trait.
-                template<
-                    typename TQueue,
-                    typename TSfinae = void>
-                struct IsBlockingQueue;
-
-                //#############################################################################
-                //! The blocking queue trait specialization for a blocking CPU queue.
-                template<>
-                struct IsBlockingQueue<
-                    alpaka::queue::QueueCpuBlocking>
-                {
-                    static constexpr bool value = true;
-                };
-
-                //#############################################################################
-                //! The blocking queue trait specialization for a non-blocking CPU queue.
-                template<>
-                struct IsBlockingQueue<
-                    alpaka::queue::QueueCpuNonBlocking>
-                {
-                    static constexpr bool value = false;
-                };
+                static constexpr bool value = false;
+            };
 
-#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
+#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
 
-#if !BOOST_LANG_CUDA
-    #error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
-#endif
-                //#############################################################################
-                //! The blocking queue trait specialization for a blocking CUDA RT queue.
-                template<>
-                struct IsBlockingQueue<
-                    alpaka::queue::QueueCudaRtBlocking>
-                {
-                    static constexpr bool value = true;
-                };
-
-                //#############################################################################
-                //! The blocking queue trait specialization for a non-blocking CUDA RT queue.
-                template<>
-                struct IsBlockingQueue<
-                    alpaka::queue::QueueCudaRtNonBlocking>
-                {
-                    static constexpr bool value = false;
-                };
+            //#############################################################################
+            //! The blocking queue trait specialization for a blocking CUDA/HIP RT queue.
+            template<>
+            struct IsBlockingQueue<alpaka::QueueUniformCudaHipRtBlocking>
+            {
+                static constexpr bool value = true;
+            };
+
+            //#############################################################################
+            //! The blocking queue trait specialization for a non-blocking CUDA/HIP RT queue.
+            template<>
+            struct IsBlockingQueue<alpaka::QueueUniformCudaHipRtNonBlocking>
+            {
+                static constexpr bool value = false;
+            };
 #endif
 
-#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
+#ifdef ALPAKA_ACC_ANY_BT_OMP5_ENABLED
 
-#if !BOOST_LANG_HIP
-    #error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
-#endif
-                //#############################################################################
-                //! The blocking queue trait specialization for a blocking HIP RT queue.
-                template<>
-                struct IsBlockingQueue<
-                    alpaka::queue::QueueHipRtBlocking>
-                {
-                    static constexpr bool value = true;
-                };
-
-                //#############################################################################
-                //! The blocking queue trait specialization for a non-blocking HIP RT queue.
-                template<>
-                struct IsBlockingQueue<
-                    alpaka::queue::QueueHipRtNonBlocking>
-                {
-                    static constexpr bool value = false;
-                };
-#endif
-            }
             //#############################################################################
-            //! The queue type that should be used for the given accelerator.
-            template<
-                typename TQueue>
-            using IsBlockingQueue = traits::IsBlockingQueue<TQueue>;
+            //! The default queue type trait specialization for the Omp5 device.
+            template<>
+            struct DefaultQueueType<alpaka::DevOmp5>
+            {
+#    if(ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL)
+                using type = alpaka::QueueOmp5Blocking;
+#    else
+                using type = alpaka::QueueOmp5Blocking;
+#    endif
+            };
+#elif defined ALPAKA_ACC_ANY_BT_OACC_ENABLED
 
             //#############################################################################
-            //! A std::tuple holding tuples of devices and corresponding queue types.
-            using TestQueues =
-                std::tuple<
-                    std::tuple<alpaka::dev::DevCpu, alpaka::queue::QueueCpuBlocking>,
-                    std::tuple<alpaka::dev::DevCpu, alpaka::queue::QueueCpuNonBlocking>
+            //! The default queue type trait specialization for the OpenACC device.
+            template<>
+            struct DefaultQueueType<alpaka::DevOacc>
+            {
+                using type = alpaka::QueueOaccBlocking;
+            };
+#endif
+
+        } // namespace traits
+        //#############################################################################
+        //! The trait stating whether the given queue type is a blocking queue.
+        template<typename TQueue>
+        using IsBlockingQueue = traits::IsBlockingQueue<TQueue>;
+
+        //#############################################################################
+        //! A std::tuple holding tuples of devices and corresponding queue types.
+        using TestQueues = std::tuple<
+            std::tuple<alpaka::DevCpu, alpaka::QueueCpuBlocking>,
+            std::tuple<alpaka::DevCpu, alpaka::QueueCpuNonBlocking>
 #ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
-                    ,
-                    std::tuple<alpaka::dev::DevCudaRt, alpaka::queue::QueueCudaRtBlocking>,
-                    std::tuple<alpaka::dev::DevCudaRt, alpaka::queue::QueueCudaRtNonBlocking>
+            ,
+            std::tuple<alpaka::DevUniformCudaHipRt, alpaka::QueueUniformCudaHipRtBlocking>,
+            std::tuple<alpaka::DevUniformCudaHipRt, alpaka::QueueUniformCudaHipRtNonBlocking>
 #endif
 #ifdef ALPAKA_ACC_GPU_HIP_ENABLED
-                    ,
-                    std::tuple<alpaka::dev::DevHipRt, alpaka::queue::QueueHipRtBlocking>,
-                    std::tuple<alpaka::dev::DevHipRt, alpaka::queue::QueueHipRtNonBlocking>
+            ,
+            std::tuple<alpaka::DevHipRt, alpaka::QueueHipRtBlocking>,
+            std::tuple<alpaka::DevHipRt, alpaka::QueueHipRtNonBlocking>
+#endif
+#ifdef ALPAKA_ACC_ANY_BT_OMP5_ENABLED
+            ,
+            std::tuple<alpaka::DevOmp5, alpaka::QueueOmp5Blocking>,
+            std::tuple<alpaka::DevOmp5, alpaka::QueueOmp5NonBlocking>
+#elif defined(ALPAKA_ACC_ANY_BT_OACC_ENABLED)
+            ,
+            std::tuple<alpaka::DevOacc, alpaka::QueueOaccBlocking>,
+            std::tuple<alpaka::DevOacc, alpaka::QueueOaccNonBlocking>
 #endif
-                >;
-        }
-    }
-}
+            >;
+    } // namespace test
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/test/common/include/alpaka/test/queue/QueueCpuOmp2Collective.hpp b/thirdParty/cupla/alpaka/test/common/include/alpaka/test/queue/QueueCpuOmp2Collective.hpp
index 095984054e..c851594ac9 100644
--- a/thirdParty/cupla/alpaka/test/common/include/alpaka/test/queue/QueueCpuOmp2Collective.hpp
+++ b/thirdParty/cupla/alpaka/test/common/include/alpaka/test/queue/QueueCpuOmp2Collective.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Axel Huebl, Benjamin Worpitz, Matthias Werner
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -11,452 +11,362 @@
 
 #ifdef ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLED
 
-#if _OPENMP < 200203
-    #error If ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLED is set, the compiler has to support OpenMP 2.0 or higher!
-#endif
-
-#include <alpaka/test/queue/Queue.hpp>
+#    if _OPENMP < 200203
+#        error If ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLED is set, the compiler has to support OpenMP 2.0 or higher!
+#    endif
 
-#include <alpaka/core/Unused.hpp>
-#include <alpaka/dev/DevCpu.hpp>
-#include <alpaka/queue/cpu/ICpuQueue.hpp>
-#include <alpaka/queue/QueueCpuBlocking.hpp>
-#include <alpaka/kernel/TaskKernelCpuOmp2Blocks.hpp>
-#include <alpaka/test/event/EventHostManualTrigger.hpp>
+#    include <alpaka/core/Unused.hpp>
+#    include <alpaka/dev/DevCpu.hpp>
+#    include <alpaka/dev/Traits.hpp>
+#    include <alpaka/event/EventCpu.hpp>
+#    include <alpaka/event/Traits.hpp>
+#    include <alpaka/kernel/TaskKernelCpuOmp2Blocks.hpp>
+#    include <alpaka/queue/QueueCpuBlocking.hpp>
+#    include <alpaka/queue/Traits.hpp>
+#    include <alpaka/queue/cpu/ICpuQueue.hpp>
+#    include <alpaka/test/event/EventHostManualTrigger.hpp>
+#    include <alpaka/test/queue/Queue.hpp>
+#    include <alpaka/wait/Traits.hpp>
 
-#include <alpaka/dev/Traits.hpp>
-#include <alpaka/event/Traits.hpp>
-#include <alpaka/queue/Traits.hpp>
-#include <alpaka/wait/Traits.hpp>
+#    include <omp.h>
 
-#include <atomic>
-#include <mutex>
-#include <omp.h>
+#    include <atomic>
+#    include <mutex>
 
 namespace alpaka
 {
-    namespace event
+    namespace cpu
     {
-        class EventCpu;
-    }
-}
-
-namespace alpaka
-{
-    namespace queue
-    {
-        namespace cpu
+        namespace detail
         {
-            namespace detail
+#    if BOOST_COMP_CLANG
+// avoid diagnostic warning: "has no out-of-line virtual method definitions; its vtable will be emitted in every
+// translation unit [-Werror,-Wweak-vtables]" https://stackoverflow.com/a/29288300
+#        pragma clang diagnostic push
+#        pragma clang diagnostic ignored "-Wweak-vtables"
+#    endif
+            //#############################################################################
+            //! The CPU collective device queue implementation.
+            class QueueCpuOmp2CollectiveImpl final : public cpu::ICpuQueue
+#    if BOOST_COMP_CLANG
+#        pragma clang diagnostic pop
+#    endif
             {
-#if BOOST_COMP_CLANG
-    // avoid diagnostic warning: "has no out-of-line virtual method definitions; its vtable will be emitted in every translation unit [-Werror,-Wweak-vtables]"
-    // https://stackoverflow.com/a/29288300
-    #pragma clang diagnostic push
-    #pragma clang diagnostic ignored "-Wweak-vtables"
-#endif
-                //#############################################################################
-                //! The CPU collective device queue implementation.
-                class QueueCpuOmp2CollectiveImpl final : public cpu::ICpuQueue
-#if BOOST_COMP_CLANG
-    #pragma clang diagnostic pop
-#endif
+            public:
+                //-----------------------------------------------------------------------------
+                QueueCpuOmp2CollectiveImpl(DevCpu const& dev) noexcept : m_dev(dev), m_uCurrentlyExecutingTask(0u)
                 {
-                public:
-                    //-----------------------------------------------------------------------------
-                    QueueCpuOmp2CollectiveImpl(
-                        dev::DevCpu const & dev) noexcept :
-                            m_dev(dev),
-                            m_uCurrentlyExecutingTask(0u)
-                    {}
-                    //-----------------------------------------------------------------------------
-                    QueueCpuOmp2CollectiveImpl(QueueCpuOmp2CollectiveImpl const &) = delete;
-                    //-----------------------------------------------------------------------------
-                    QueueCpuOmp2CollectiveImpl(QueueCpuOmp2CollectiveImpl &&) = delete;
-                    //-----------------------------------------------------------------------------
-                    auto operator=(QueueCpuOmp2CollectiveImpl const &) -> QueueCpuOmp2CollectiveImpl & = delete;
-                    //-----------------------------------------------------------------------------
-                    auto operator=(QueueCpuOmp2CollectiveImpl &&) -> QueueCpuOmp2CollectiveImpl & = delete;
-                    //-----------------------------------------------------------------------------
-                    void enqueue(event::EventCpu & ev) final
-                    {
-                        queue::enqueue(*this, ev);
-                    }
-                    //-----------------------------------------------------------------------------
-                    void wait(event::EventCpu const & ev) final
-                    {
-                        wait::wait(*this, ev);
-                    }
+                }
+                //-----------------------------------------------------------------------------
+                QueueCpuOmp2CollectiveImpl(QueueCpuOmp2CollectiveImpl const&) = delete;
+                //-----------------------------------------------------------------------------
+                QueueCpuOmp2CollectiveImpl(QueueCpuOmp2CollectiveImpl&&) = delete;
+                //-----------------------------------------------------------------------------
+                auto operator=(QueueCpuOmp2CollectiveImpl const&) -> QueueCpuOmp2CollectiveImpl& = delete;
+                //-----------------------------------------------------------------------------
+                auto operator=(QueueCpuOmp2CollectiveImpl&&) -> QueueCpuOmp2CollectiveImpl& = delete;
+                //-----------------------------------------------------------------------------
+                void enqueue(EventCpu& ev) final
+                {
+                    alpaka::enqueue(*this, ev);
+                }
+                //-----------------------------------------------------------------------------
+                void wait(EventCpu const& ev) final
+                {
+                    alpaka::wait(*this, ev);
+                }
 
-                public:
-                    dev::DevCpu const m_dev;            //!< The device this queue is bound to.
-                    std::mutex mutable m_mutex;
-                    std::atomic<uint32_t> m_uCurrentlyExecutingTask;
-                };
-            }
+            public:
+                DevCpu const m_dev; //!< The device this queue is bound to.
+                std::mutex mutable m_mutex;
+                std::atomic<uint32_t> m_uCurrentlyExecutingTask;
+            };
+        } // namespace detail
+    } // namespace cpu
+
+    //#############################################################################
+    //! The CPU collective device queue.
+    //
+    // @attention Queue can only be used together with the accelerator AccCpuOmp2Blocks.
+    //
+    // This queue is an example for a user provided queue and the behavior is strongly coupled
+    // to the user workflows.
+    //
+    // Within an OpenMP parallel region, kernels are executed collectively.
+    // All other operations are performed by a single thread (which thread is unspecified).
+    //
+    // Outside of an OpenMP parallel region, the queue behaves like QueueCpuBlocking.
+    class QueueCpuOmp2Collective final
+        : public concepts::Implements<ConceptCurrentThreadWaitFor, QueueCpuOmp2Collective>
+    {
+    public:
+        //-----------------------------------------------------------------------------
+        QueueCpuOmp2Collective(DevCpu const& dev)
+            : m_spQueueImpl(std::make_shared<cpu::detail::QueueCpuOmp2CollectiveImpl>(dev))
+            , m_spBlockingQueue(std::make_shared<QueueCpuBlocking>(dev))
+        {
+            dev.registerQueue(m_spQueueImpl);
+        }
+        //-----------------------------------------------------------------------------
+        QueueCpuOmp2Collective(QueueCpuOmp2Collective const&) = default;
+        //-----------------------------------------------------------------------------
+        QueueCpuOmp2Collective(QueueCpuOmp2Collective&&) = default;
+        //-----------------------------------------------------------------------------
+        auto operator=(QueueCpuOmp2Collective const&) -> QueueCpuOmp2Collective& = default;
+        //-----------------------------------------------------------------------------
+        auto operator=(QueueCpuOmp2Collective&&) -> QueueCpuOmp2Collective& = default;
+        //-----------------------------------------------------------------------------
+        auto operator==(QueueCpuOmp2Collective const& rhs) const -> bool
+        {
+            return m_spQueueImpl == rhs.m_spQueueImpl && m_spBlockingQueue == rhs.m_spBlockingQueue;
         }
+        //-----------------------------------------------------------------------------
+        auto operator!=(QueueCpuOmp2Collective const& rhs) const -> bool
+        {
+            return !((*this) == rhs);
+        }
+        //-----------------------------------------------------------------------------
+        ~QueueCpuOmp2Collective() = default;
 
+    public:
+        std::shared_ptr<cpu::detail::QueueCpuOmp2CollectiveImpl> m_spQueueImpl;
+        std::shared_ptr<QueueCpuBlocking> m_spBlockingQueue;
+    };
+
+    namespace traits
+    {
         //#############################################################################
-        //! The CPU collective device queue.
-        //
-        // @attention Queue can only be used together with the accelerator AccCpuOmp2Blocks.
-        //
-        // This queue is an example for a user provided queue and the behavior is strongly coupled
-        // to the user workflows.
-        //
-        // Within a OpenMP parallel region kernel will be performed collectively.
-        // All other operations will be performed from one thread (it is not defined which thread).
-        //
-        // Outside of a OpenMP parallel region the queue behaves like QueueCpuBlocking.
-        class QueueCpuOmp2Collective final : public concepts::Implements<wait::ConceptCurrentThreadWaitFor, QueueCpuOmp2Collective>
+        //! The CPU blocking device queue device type trait specialization.
+        template<>
+        struct DevType<QueueCpuOmp2Collective>
+        {
+            using type = DevCpu;
+        };
+        //#############################################################################
+        //! The CPU blocking device queue device get trait specialization.
+        template<>
+        struct GetDev<QueueCpuOmp2Collective>
         {
-        public:
             //-----------------------------------------------------------------------------
-            QueueCpuOmp2Collective(
-                dev::DevCpu const & dev) :
-                    m_spQueueImpl(std::make_shared<cpu::detail::QueueCpuOmp2CollectiveImpl>(dev)),
-                    m_spBlockingQueue(std::make_shared<QueueCpuBlocking>(dev))
+            ALPAKA_FN_HOST static auto getDev(QueueCpuOmp2Collective const& queue) -> DevCpu
             {
-                dev.m_spDevCpuImpl->RegisterQueue(m_spQueueImpl);
+                return queue.m_spQueueImpl->m_dev;
             }
+        };
+
+        //#############################################################################
+        //! The CPU blocking device queue event type trait specialization.
+        template<>
+        struct EventType<QueueCpuOmp2Collective>
+        {
+            using type = EventCpu;
+        };
+
+        //#############################################################################
+        //! The CPU blocking device queue enqueue trait specialization.
+        //! This default implementation for all tasks directly invokes the function call operator of the task.
+        template<typename TTask>
+        struct Enqueue<QueueCpuOmp2Collective, TTask>
+        {
             //-----------------------------------------------------------------------------
-            QueueCpuOmp2Collective(QueueCpuOmp2Collective const &) = default;
-            //-----------------------------------------------------------------------------
-            QueueCpuOmp2Collective(QueueCpuOmp2Collective &&) = default;
-            //-----------------------------------------------------------------------------
-            auto operator=(QueueCpuOmp2Collective const &) -> QueueCpuOmp2Collective & = default;
-            //-----------------------------------------------------------------------------
-            auto operator=(QueueCpuOmp2Collective &&) -> QueueCpuOmp2Collective & = default;
-            //-----------------------------------------------------------------------------
-            auto operator==(QueueCpuOmp2Collective const & rhs) const
-            -> bool
+            ALPAKA_FN_HOST static auto enqueue(QueueCpuOmp2Collective& queue, TTask const& task) -> void
             {
-                return m_spQueueImpl == rhs.m_spQueueImpl && m_spBlockingQueue == rhs.m_spBlockingQueue;
+                if(::omp_in_parallel() != 0)
+                {
+                    // wait for all tasks en-queued before the parallel region
+                    while(!empty(*queue.m_spBlockingQueue))
+                    {
+                    }
+                    queue.m_spQueueImpl->m_uCurrentlyExecutingTask += 1u;
+
+#    pragma omp single nowait
+                    task();
+
+                    queue.m_spQueueImpl->m_uCurrentlyExecutingTask -= 1u;
+                }
+                else
+                {
+                    std::lock_guard<std::mutex> lk(queue.m_spQueueImpl->m_mutex);
+                    alpaka::enqueue(*queue.m_spBlockingQueue, task);
+                }
             }
+        };
+
+        //#############################################################################
+        //! The CPU blocking device queue test trait specialization.
+        template<>
+        struct Empty<QueueCpuOmp2Collective>
+        {
             //-----------------------------------------------------------------------------
-            auto operator!=(QueueCpuOmp2Collective const & rhs) const
-            -> bool
+            ALPAKA_FN_HOST static auto empty(QueueCpuOmp2Collective const& queue) -> bool
             {
-                return !((*this) == rhs);
+                return queue.m_spQueueImpl->m_uCurrentlyExecutingTask == 0u && alpaka::empty(*queue.m_spBlockingQueue);
             }
-            //-----------------------------------------------------------------------------
-            ~QueueCpuOmp2Collective() = default;
-
-        public:
-            std::shared_ptr<cpu::detail::QueueCpuOmp2CollectiveImpl> m_spQueueImpl;
-            std::shared_ptr<QueueCpuBlocking> m_spBlockingQueue;
         };
-    }
 
-    namespace dev
-    {
-        namespace traits
+        //#############################################################################
+        //! The CPU OpenMP2 collective device queue enqueue trait specialization.
+        template<>
+        struct Enqueue<cpu::detail::QueueCpuOmp2CollectiveImpl, EventCpu>
         {
-            //#############################################################################
-            //! The CPU blocking device queue device type trait specialization.
-            template<>
-            struct DevType<
-                queue::QueueCpuOmp2Collective>
-            {
-                using type = dev::DevCpu;
-            };
-            //#############################################################################
-            //! The CPU blocking device queue device get trait specialization.
-            template<>
-            struct GetDev<
-                queue::QueueCpuOmp2Collective>
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto enqueue(cpu::detail::QueueCpuOmp2CollectiveImpl&, EventCpu&) -> void
             {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto getDev(
-                    queue::QueueCpuOmp2Collective const & queue)
-                -> dev::DevCpu
-                {
-                    return queue.m_spQueueImpl->m_dev;
-                }
-            };
-        }
-    }
-    namespace event
-    {
-        namespace traits
+                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+#    pragma omp barrier
+            }
+        };
+        //#############################################################################
+        //! The CPU OpenMP2 collective device queue enqueue trait specialization.
+        template<>
+        struct Enqueue<QueueCpuOmp2Collective, EventCpu>
         {
-            //#############################################################################
-            //! The CPU blocking device queue event type trait specialization.
-            template<>
-            struct EventType<
-                queue::QueueCpuOmp2Collective>
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto enqueue(QueueCpuOmp2Collective& queue, EventCpu& event) -> void
             {
-                using type = event::EventCpu;
-            };
-        }
-    }
-    namespace queue
-    {
-        namespace traits
-        {
+                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
 
-            //#############################################################################
-            //! The CPU blocking device queue enqueue trait specialization.
-            //! This default implementation for all tasks directly invokes the function call operator of the task.
-            template<
-                typename TTask>
-            struct Enqueue<
-                queue::QueueCpuOmp2Collective,
-                TTask>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto enqueue(
-                    queue::QueueCpuOmp2Collective & queue,
-                    TTask const & task)
-                -> void
+                if(::omp_in_parallel() != 0)
                 {
-                    if(::omp_in_parallel() != 0)
-                    {
-                        // wait for all tasks en-queued before the parallel region
-                        while(!queue::empty(*queue.m_spBlockingQueue)){}
-                        queue.m_spQueueImpl->m_uCurrentlyExecutingTask += 1u;
-
-                        #pragma omp single nowait
-                        task();
-
-                        queue.m_spQueueImpl->m_uCurrentlyExecutingTask -= 1u;
-                    }
-                    else
+                    // wait for all tasks en-queued before the parallel region
+                    while(!empty(*queue.m_spBlockingQueue))
                     {
-                        std::lock_guard<std::mutex> lk(queue.m_spQueueImpl->m_mutex);
-                        queue::enqueue(*queue.m_spBlockingQueue, task);
                     }
+#    pragma omp barrier
                 }
-            };
-
-            //#############################################################################
-            //! The CPU blocking device queue test trait specialization.
-            template<>
-            struct Empty<
-                queue::QueueCpuOmp2Collective>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto empty(
-                    queue::QueueCpuOmp2Collective const & queue)
-                -> bool
+                else
                 {
-                    return queue.m_spQueueImpl->m_uCurrentlyExecutingTask == 0u &&
-                        queue::empty(*queue.m_spBlockingQueue);
+                    alpaka::enqueue(*queue.m_spBlockingQueue, event);
                 }
-            };
+            }
+        };
 
-            //#############################################################################
-            //! The CPU OpenMP2 collective device queue enqueue trait specialization.
-            template<>
-            struct Enqueue<
-                queue::cpu::detail::QueueCpuOmp2CollectiveImpl,
-                event::EventCpu>
-            {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto enqueue(
-                    queue::cpu::detail::QueueCpuOmp2CollectiveImpl &,
-                    event::EventCpu &)
-                -> void
-                {
-                    ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+        //#############################################################################
+        //! The CPU OpenMP2 collective device queue enqueue trait specialization for OpenMP2 block kernel tasks.
+        //! Inside an OpenMP parallel region the kernel task is invoked directly by each calling thread.
+        template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
+        struct Enqueue<QueueCpuOmp2Collective, TaskKernelCpuOmp2Blocks<TDim, TIdx, TKernelFnObj, TArgs...>>
+        {
+        private:
+            using Task = TaskKernelCpuOmp2Blocks<TDim, TIdx, TKernelFnObj, TArgs...>;
 
-                    #pragma omp barrier
-                }
-            };
-            //#############################################################################
-            //! The CPU OpenMP2 collective device queue enqueue trait specialization.
-            template<>
-            struct Enqueue<
-                queue::QueueCpuOmp2Collective,
-                event::EventCpu>
+        public:
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto enqueue(QueueCpuOmp2Collective& queue, Task const& task) -> void
             {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto enqueue(
-                    queue::QueueCpuOmp2Collective & queue,
-                    event::EventCpu & event)
-                -> void
+                if(::omp_in_parallel() != 0)
                 {
-                    ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                    if(::omp_in_parallel() != 0)
+                    while(!empty(*queue.m_spBlockingQueue))
                     {
-                        // wait for all tasks en-queued before the parallel region
-                        while(!queue::empty(*queue.m_spBlockingQueue)){}
-                        #pragma omp barrier
                     }
-                    else
-                    {
-                        queue::enqueue(*queue.m_spBlockingQueue, event);
-                    }
-
+                    // mark a task as in flight so that empty() reports the queue as busy
+                    queue.m_spQueueImpl->m_uCurrentlyExecutingTask += 1u;
+                    // execute task within an OpenMP parallel region
+                    task();
+                    queue.m_spQueueImpl->m_uCurrentlyExecutingTask -= 1u;
                 }
-            };
-
-            //#############################################################################
-            //! The CPU blocking device queue enqueue trait specialization.
-            //! This default implementation for all tasks directly invokes the function call operator of the task.
-            template<
-                typename TDim,
-                typename TIdx,
-                typename TKernelFnObj,
-                typename... TArgs>
-            struct Enqueue<
-                queue::QueueCpuOmp2Collective,
-                kernel::TaskKernelCpuOmp2Blocks<
-                    TDim,
-                    TIdx,
-                    TKernelFnObj,
-                    TArgs...>>
-            {
-            private:
-                using Task = kernel::TaskKernelCpuOmp2Blocks<
-                    TDim,
-                    TIdx,
-                    TKernelFnObj,
-                    TArgs ...>;
-            public:
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto enqueue(
-                    queue::QueueCpuOmp2Collective & queue,
-                    Task const & task)
-                -> void
+                else
                 {
-                    if(::omp_in_parallel() != 0)
-                    {
-                        while(!queue::empty(*queue.m_spBlockingQueue)){}
-                        // execute within an OpenMP parallel region
-                        queue.m_spQueueImpl->m_uCurrentlyExecutingTask += 1u;
-                        // execute task within an OpenMP parallel region
-                        task();
-                        queue.m_spQueueImpl->m_uCurrentlyExecutingTask -= 1u;
-                    }
-                    else
-                    {
-                        std::lock_guard<std::mutex> lk(queue.m_spQueueImpl->m_mutex);
-                        queue::enqueue(*queue.m_spBlockingQueue, task);
-                    }
+                    std::lock_guard<std::mutex> lk(queue.m_spQueueImpl->m_mutex);
+                    alpaka::enqueue(*queue.m_spBlockingQueue, task);
                 }
-            };
+            }
+        };
 
-            //#############################################################################
-            //!
-            //#############################################################################
-            template<>
-            struct Enqueue<
-                queue::QueueCpuOmp2Collective,
-                test::event::EventHostManualTriggerCpu>
+        //#############################################################################
+        //!
+        //#############################################################################
+        template<>
+        struct Enqueue<QueueCpuOmp2Collective, test::EventHostManualTriggerCpu<>>
+        {
+            //-----------------------------------------------------------------------------
+            //
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto enqueue(QueueCpuOmp2Collective&, test::EventHostManualTriggerCpu<>&) -> void
             {
-                //-----------------------------------------------------------------------------
-                //
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto enqueue(
-                    queue::QueueCpuOmp2Collective & ,
-                    test::event::EventHostManualTriggerCpu & )
-                -> void
-                {
-                    // EventHostManualTriggerCpu are not supported for together with the queue QueueCpuOmp2Collective
-                    // but a specialization is needed to path the EventTests
-                }
-            };
-        }
-    }
+                // EventHostManualTriggerCpu is not supported together with the queue QueueCpuOmp2Collective,
+                // but a specialization is needed to pass the EventTests.
+            }
+        };
 
-    namespace wait
-    {
-        namespace traits
+        //#############################################################################
+        //! The CPU blocking device queue thread wait trait specialization.
+        //!
+        //! Blocks execution of the calling thread until the queue has finished processing all previously requested
+        //! tasks (kernels, data copies, ...)
+        template<>
+        struct CurrentThreadWaitFor<QueueCpuOmp2Collective>
         {
-            //#############################################################################
-            //! The CPU blocking device queue thread wait trait specialization.
-            //!
-            //! Blocks execution of the calling thread until the queue has finished processing all previously requested tasks (kernels, data copies, ...)
-            template<>
-            struct CurrentThreadWaitFor<
-                queue::QueueCpuOmp2Collective>
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto currentThreadWaitFor(QueueCpuOmp2Collective const& queue) -> void
             {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto currentThreadWaitFor(
-                    queue::QueueCpuOmp2Collective const & queue)
-                -> void
+                if(::omp_in_parallel() != 0)
                 {
-                    if(::omp_in_parallel() != 0)
+                    // wait for all tasks en-queued before the parallel region
+                    while(!empty(*queue.m_spBlockingQueue))
                     {
-                        // wait for all tasks en-queued before the parallel region
-                        while(!queue::empty(*queue.m_spBlockingQueue)){}
-                        #pragma omp barrier
-                    }
-                    else
-                    {
-                        std::lock_guard<std::mutex> lk(queue.m_spQueueImpl->m_mutex);
-                        wait::wait(*queue.m_spBlockingQueue);
                     }
+#    pragma omp barrier
                 }
-            };
+                else
+                {
+                    std::lock_guard<std::mutex> lk(queue.m_spQueueImpl->m_mutex);
+                    wait(*queue.m_spBlockingQueue);
+                }
+            }
+        };
 
 
-            //#############################################################################
-            //! The CPU OpenMP2 collective device queue event wait trait specialization.
-            template<>
-            struct WaiterWaitFor<
-                queue::cpu::detail::QueueCpuOmp2CollectiveImpl,
-                event::EventCpu>
+        //#############################################################################
+        //! The CPU OpenMP2 collective device queue event wait trait specialization.
+        template<>
+        struct WaiterWaitFor<cpu::detail::QueueCpuOmp2CollectiveImpl, EventCpu>
+        {
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto waiterWaitFor(cpu::detail::QueueCpuOmp2CollectiveImpl&, EventCpu const&) -> void
             {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto waiterWaitFor(
-                    queue::cpu::detail::QueueCpuOmp2CollectiveImpl &,
-                    event::EventCpu const &)
-                -> void
-                {
-                    #pragma omp barrier
-                }
-            };
-            //#############################################################################
-            //! The CPU OpenMP2 collective queue event wait trait specialization.
-            template<>
-            struct WaiterWaitFor<
-                queue::QueueCpuOmp2Collective,
-                event::EventCpu>
+#    pragma omp barrier
+            }
+        };
+        //#############################################################################
+        //! The CPU OpenMP2 collective queue event wait trait specialization.
+        template<>
+        struct WaiterWaitFor<QueueCpuOmp2Collective, EventCpu>
+        {
+            //-----------------------------------------------------------------------------
+            ALPAKA_FN_HOST static auto waiterWaitFor(QueueCpuOmp2Collective& queue, EventCpu const& event) -> void
             {
-                //-----------------------------------------------------------------------------
-                ALPAKA_FN_HOST static auto waiterWaitFor(
-                    queue::QueueCpuOmp2Collective & queue,
-                    event::EventCpu const & event)
-                -> void
+                if(::omp_in_parallel() != 0)
                 {
-                    if(::omp_in_parallel() != 0)
+                    // wait for all tasks en-queued before the parallel region
+                    while(!empty(*queue.m_spBlockingQueue))
                     {
-                        // wait for all tasks en-queued before the parallel region
-                        while(!queue::empty(*queue.m_spBlockingQueue)){}
-                        wait::wait(queue);
                     }
-                    else
-                        wait::wait(*queue.m_spBlockingQueue, event);
+                    wait(queue);
                 }
-            };
-        }
-    }
+                else
+                    wait(*queue.m_spBlockingQueue, event);
+            }
+        };
+    } // namespace traits
     //-----------------------------------------------------------------------------
     //! The test specifics.
     namespace test
     {
-        //-----------------------------------------------------------------------------
-        //! The test queue specifics.
-        namespace queue
+        namespace traits
         {
-            namespace traits
+            //#############################################################################
+            //! The blocking queue trait specialization for an OpenMP2 collective CPU queue.
+            template<>
+            struct IsBlockingQueue<alpaka::QueueCpuOmp2Collective>
             {
-                //#############################################################################
-                //! The blocking queue trait specialization for a OpenMP2 collective CPU queue.
-                template<>
-                struct IsBlockingQueue<
-                    alpaka::queue::QueueCpuOmp2Collective>
-                {
-                    static constexpr bool value = true;
-                };
-            }
-        }
-    }
-}
+                static constexpr bool value = true;
+            };
+        } // namespace traits
+    } // namespace test
+} // namespace alpaka
 
-#include <alpaka/event/EventCpu.hpp>
+#    include <alpaka/event/EventCpu.hpp>
 
 #endif
diff --git a/thirdParty/cupla/alpaka/test/common/include/alpaka/test/queue/QueueTestFixture.hpp b/thirdParty/cupla/alpaka/test/common/include/alpaka/test/queue/QueueTestFixture.hpp
index 1fed8832c2..ced874526a 100644
--- a/thirdParty/cupla/alpaka/test/common/include/alpaka/test/queue/QueueTestFixture.hpp
+++ b/thirdParty/cupla/alpaka/test/common/include/alpaka/test/queue/QueueTestFixture.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Benjamin Worpitz
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -9,32 +9,28 @@
 
 #include <alpaka/alpaka.hpp>
 
+#include <tuple>
+
 namespace alpaka
 {
     namespace test
     {
-        namespace queue
+        //#############################################################################
+        template<typename TDevQueue>
+        struct QueueTestFixture
         {
-            //#############################################################################
-            template<
-                typename TDevQueue>
-            struct QueueTestFixture
-            {
-                using Dev = typename std::tuple_element<0, TDevQueue>::type;
-                using Queue = typename std::tuple_element<1, TDevQueue>::type;
+            using Dev = std::tuple_element_t<0, TDevQueue>;
+            using Queue = std::tuple_element_t<1, TDevQueue>;
 
-                using Pltf = alpaka::pltf::Pltf<Dev>;
+            using Pltf = alpaka::Pltf<Dev>;
 
-                //-----------------------------------------------------------------------------
-                QueueTestFixture() :
-                    m_dev(alpaka::pltf::getDevByIdx<Pltf>(0u)),
-                    m_queue(m_dev)
-                {
-                }
+            //-----------------------------------------------------------------------------
+            QueueTestFixture() : m_dev(alpaka::getDevByIdx<Pltf>(0u)), m_queue(m_dev)
+            {
+            }
 
-                Dev m_dev;
-                Queue m_queue;
-            };
-        }
-    }
-}
+            Dev m_dev;
+            Queue m_queue;
+        };
+    } // namespace test
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/test/common/src/Dummy.cpp b/thirdParty/cupla/alpaka/test/common/src/Dummy.cpp
deleted file mode 100644
index dbe641fb23..0000000000
--- a/thirdParty/cupla/alpaka/test/common/src/Dummy.cpp
+++ /dev/null
@@ -1,9 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-// This file is here because CMake does not allow to create a header only library.
diff --git a/thirdParty/cupla/alpaka/test/integ/CMakeLists.txt b/thirdParty/cupla/alpaka/test/integ/CMakeLists.txt
index ecc338bd7c..7909c19ecb 100644
--- a/thirdParty/cupla/alpaka/test/integ/CMakeLists.txt
+++ b/thirdParty/cupla/alpaka/test/integ/CMakeLists.txt
@@ -1,7 +1,7 @@
 #
-# Copyright 2014-2019 Benjamin Worpitz
+# Copyright 2014-2020 Benjamin Worpitz, Jan Stephan
 #
-# This file is part of Alpaka.
+# This file is part of alpaka.
 #
 # This Source Code Form is subject to the terms of the Mozilla Public
 # License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -12,17 +12,17 @@
 # Required CMake version.
 ################################################################################
 
-CMAKE_MINIMUM_REQUIRED(VERSION 3.11.0)
+cmake_minimum_required(VERSION 3.15)
 
-PROJECT("alpakaIntegTest")
+project("alpakaIntegTest")
 
 ################################################################################
 # Add subdirectories.
 ################################################################################
 
-ADD_SUBDIRECTORY("axpy/")
-ADD_SUBDIRECTORY("cudaOnly/")
-ADD_SUBDIRECTORY("mandelbrot/")
-ADD_SUBDIRECTORY("matMul/")
-ADD_SUBDIRECTORY("separableCompilation/")
-ADD_SUBDIRECTORY("sharedMem/")
+add_subdirectory("axpy/")
+add_subdirectory("cudaOnly/")
+add_subdirectory("mandelbrot/")
+add_subdirectory("matMul/")
+add_subdirectory("separableCompilation/")
+add_subdirectory("sharedMem/")
diff --git a/thirdParty/cupla/alpaka/test/integ/axpy/CMakeLists.txt b/thirdParty/cupla/alpaka/test/integ/axpy/CMakeLists.txt
index c9fcc109e6..054adb6109 100644
--- a/thirdParty/cupla/alpaka/test/integ/axpy/CMakeLists.txt
+++ b/thirdParty/cupla/alpaka/test/integ/axpy/CMakeLists.txt
@@ -1,27 +1,24 @@
 #
-# Copyright 2014-2019 Benjamin Worpitz, Axel Huebl
+# Copyright 2014-2020 Benjamin Worpitz, Axel Huebl, Jan Stephan
 #
-# This file is part of Alpaka.
+# This file is part of alpaka.
 #
 # This Source Code Form is subject to the terms of the Mozilla Public
 # License, v. 2.0. If a copy of the MPL was not distributed with this
 # file, You can obtain one at http://mozilla.org/MPL/2.0/.
 #
 
-SET(_TARGET_NAME "axpy")
+set(_TARGET_NAME "axpy")
 
 append_recursive_files_add_to_src_group("src/" "src/" "cpp" _FILES_SOURCE)
 
-ALPAKA_ADD_EXECUTABLE(
+alpaka_add_executable(
     ${_TARGET_NAME}
     ${_FILES_SOURCE})
-TARGET_INCLUDE_DIRECTORIES(
-    ${_TARGET_NAME}
-    PRIVATE ${Boost_INCLUDE_DIRS})
-TARGET_LINK_LIBRARIES(
+target_link_libraries(
     ${_TARGET_NAME}
     PRIVATE common)
 
-SET_TARGET_PROPERTIES(${_TARGET_NAME} PROPERTIES FOLDER "test/integ")
+set_target_properties(${_TARGET_NAME} PROPERTIES FOLDER "test/integ")
 
-ADD_TEST(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME} ${_ALPAKA_TEST_OPTIONS})
+add_test(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME} ${_ALPAKA_TEST_OPTIONS})
diff --git a/thirdParty/cupla/alpaka/test/integ/axpy/src/axpy.cpp b/thirdParty/cupla/alpaka/test/integ/axpy/src/axpy.cpp
index 605d6736b3..0c580dd166 100644
--- a/thirdParty/cupla/alpaka/test/integ/axpy/src/axpy.cpp
+++ b/thirdParty/cupla/alpaka/test/integ/axpy/src/axpy.cpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Axel Huebl, Benjamin Worpitz, Matthias Werner
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -8,19 +8,18 @@
  */
 
 #include <alpaka/alpaka.hpp>
-
 #include <alpaka/test/MeasureKernelRunTime.hpp>
 #include <alpaka/test/acc/TestAccs.hpp>
 #include <alpaka/test/queue/Queue.hpp>
 
 #include <catch2/catch.hpp>
 
+#include <algorithm>
+#include <cmath>
 #include <iostream>
-#include <typeinfo>
-#include <random>
 #include <limits>
-#include <cmath>
-#include <algorithm>
+#include <random>
+#include <typeinfo>
 
 //#############################################################################
 //! A vector addition kernel.
@@ -38,34 +37,28 @@ class AxpyKernel
     //! \param X Vector of at least n elements.
     //! \param Y Vector of at least n elements.
     ALPAKA_NO_HOST_ACC_WARNING
-    template<
-        typename TAcc,
-        typename TElem,
-        typename TIdx>
+    template<typename TAcc, typename TElem, typename TIdx>
     ALPAKA_FN_ACC auto operator()(
-        TAcc const & acc,
-        TIdx const & numElements,
-        TElem const & alpha,
-        TElem const * const X,
-        TElem * const Y) const
-    -> void
+        TAcc const& acc,
+        TIdx const& numElements,
+        TElem const& alpha,
+        TElem const* const X,
+        TElem* const Y) const -> void
     {
-        static_assert(
-            alpaka::dim::Dim<TAcc>::value == 1,
-            "The AxpyKernel expects 1-dimensional indices!");
+        static_assert(alpaka::Dim<TAcc>::value == 1, "The AxpyKernel expects 1-dimensional indices!");
 
-        auto const gridThreadIdx(alpaka::idx::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0u]);
-        auto const threadElemExtent(alpaka::workdiv::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[0u]);
+        auto const gridThreadIdx(alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0u]);
+        auto const threadElemExtent(alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[0u]);
         auto const threadFirstElemIdx(gridThreadIdx * threadElemExtent);
 
         if(threadFirstElemIdx < numElements)
         {
             // Calculate the number of elements to compute in this thread.
             // The result is uniform for all but the last thread.
-            auto const threadLastElemIdx(threadFirstElemIdx+threadElemExtent);
+            auto const threadLastElemIdx(threadFirstElemIdx + threadElemExtent);
             auto const threadLastElemIdxClipped((numElements > threadLastElemIdx) ? threadLastElemIdx : numElements);
 
-            for(TIdx i(threadFirstElemIdx); i<threadLastElemIdxClipped; ++i)
+            for(TIdx i(threadFirstElemIdx); i < threadLastElemIdxClipped; ++i)
             {
                 Y[i] = alpha * X[i] + Y[i];
             }
@@ -73,145 +66,130 @@ class AxpyKernel
     }
 };
 
-using TestAccs = alpaka::test::acc::EnabledAccs<
-    alpaka::dim::DimInt<1u>,
-    std::size_t>;
+using TestAccs = alpaka::test::EnabledAccs<alpaka::DimInt<1u>, std::size_t>;
 
-TEMPLATE_LIST_TEST_CASE( "axpy", "[axpy]", TestAccs)
+TEMPLATE_LIST_TEST_CASE("axpy", "[axpy]", TestAccs)
 {
     using Acc = TestType;
-    using Dim = alpaka::dim::Dim<Acc>;
-    using Idx = alpaka::idx::Idx<Acc>;
+    using Dim = alpaka::Dim<Acc>;
+    using Idx = alpaka::Idx<Acc>;
 
 #ifdef ALPAKA_CI
-    Idx const numElements = 1u<<9u;
+    Idx const numElements = 1u << 9u;
 #else
-    Idx const numElements = 1u<<16u;
+    Idx const numElements = 1u << 16u;
 #endif
 
     using Val = float;
-    using DevAcc = alpaka::dev::Dev<Acc>;
-    using PltfAcc = alpaka::pltf::Pltf<DevAcc>;
-    using QueueAcc = alpaka::test::queue::DefaultQueue<DevAcc>;
-    using PltfHost = alpaka::pltf::PltfCpu;
+    using DevAcc = alpaka::Dev<Acc>;
+    using PltfAcc = alpaka::Pltf<DevAcc>;
+    using QueueAcc = alpaka::test::DefaultQueue<DevAcc>;
+    using PltfHost = alpaka::PltfCpu;
 
     // Create the kernel function object.
     AxpyKernel kernel;
 
     // Get the host device.
-    auto const devHost(
-        alpaka::pltf::getDevByIdx<PltfHost>(0u));
+    auto const devHost(alpaka::getDevByIdx<PltfHost>(0u));
 
     // Select a device to execute on.
-    auto const devAcc(
-        alpaka::pltf::getDevByIdx<PltfAcc>(0u));
+    auto const devAcc(alpaka::getDevByIdx<PltfAcc>(0u));
 
     // Get a queue on this device.
     QueueAcc queue(devAcc);
 
-    alpaka::vec::Vec<Dim, Idx> const extent(
-        numElements);
+    alpaka::Vec<Dim, Idx> const extent(numElements);
 
     // Let alpaka calculate good block and grid sizes given our full problem extent.
-    alpaka::workdiv::WorkDivMembers<Dim, Idx> const workDiv(
-        alpaka::workdiv::getValidWorkDiv<Acc>(
-            devAcc,
-            extent,
-            static_cast<Idx>(3u),
-            false,
-            alpaka::workdiv::GridBlockExtentSubDivRestrictions::Unrestricted));
-
-    std::cout
-        << "AxpyKernel("
-        << " numElements:" << numElements
-        << ", accelerator: " << alpaka::acc::getAccName<Acc>()
-        << ", kernel: " << typeid(kernel).name()
-        << ", workDiv: " << workDiv
-        << ")" << std::endl;
+    alpaka::WorkDivMembers<Dim, Idx> const workDiv(alpaka::getValidWorkDiv<Acc>(
+        devAcc,
+        extent,
+        static_cast<Idx>(3u),
+        false,
+        alpaka::GridBlockExtentSubDivRestrictions::Unrestricted));
+
+    std::cout << "AxpyKernel("
+              << " numElements:" << numElements << ", accelerator: " << alpaka::getAccName<Acc>()
+              << ", kernel: " << typeid(kernel).name() << ", workDiv: " << workDiv << ")" << std::endl;
 
     // Allocate host memory buffers.
-    auto memBufHostX(alpaka::mem::buf::alloc<Val, Idx>(devHost, extent));
-    auto memBufHostOrigY(alpaka::mem::buf::alloc<Val, Idx>(devHost, extent));
-    auto memBufHostY(alpaka::mem::buf::alloc<Val, Idx>(devHost, extent));
-    Val * const pBufHostX = alpaka::mem::view::getPtrNative(memBufHostX);
-    Val * const pBufHostOrigY = alpaka::mem::view::getPtrNative(memBufHostOrigY);
-    Val * const pBufHostY = alpaka::mem::view::getPtrNative(memBufHostY);
-
-    // C++11 random generator for uniformly distributed numbers in [0,1)
+    auto memBufHostX(alpaka::allocBuf<Val, Idx>(devHost, extent));
+    auto memBufHostOrigY(alpaka::allocBuf<Val, Idx>(devHost, extent));
+    auto memBufHostY(alpaka::allocBuf<Val, Idx>(devHost, extent));
+    Val* const pBufHostX = alpaka::getPtrNative(memBufHostX);
+    Val* const pBufHostOrigY = alpaka::getPtrNative(memBufHostOrigY);
+    Val* const pBufHostY = alpaka::getPtrNative(memBufHostY);
+
+    // random generator for uniformly distributed numbers in [0,1)
     // keep in mind, this can generate different values on different platforms
     std::random_device rd{};
     auto const seed = rd();
-    std::default_random_engine eng{ seed };
+    std::default_random_engine eng{seed};
     std::uniform_real_distribution<Val> dist(0.0, 1.0);
     std::cout << "using seed: " << seed << "\n";
     // Initialize the host input vectors
-    for (Idx i(0); i < numElements; ++i)
+    for(Idx i(0); i < numElements; ++i)
     {
         pBufHostX[i] = dist(eng);
         pBufHostOrigY[i] = dist(eng);
     }
-    Val const alpha( dist(eng) );
+    Val const alpha(dist(eng));
 
 #if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-    std::cout << __func__
-        << " alpha: " << alpha << std::endl;
+    std::cout << __func__ << " alpha: " << alpha << std::endl;
     std::cout << __func__ << " X_host: ";
-    alpaka::mem::view::print(memBufHostX, std::cout);
+    alpaka::print(memBufHostX, std::cout);
     std::cout << std::endl;
     std::cout << __func__ << " Y_host: ";
-    alpaka::mem::view::print(memBufHostOrigY, std::cout);
+    alpaka::print(memBufHostOrigY, std::cout);
     std::cout << std::endl;
 #endif
 
     // Allocate the buffer on the accelerator.
-    auto memBufAccX(alpaka::mem::buf::alloc<Val, Idx>(devAcc, extent));
-    auto memBufAccY(alpaka::mem::buf::alloc<Val, Idx>(devAcc, extent));
+    auto memBufAccX(alpaka::allocBuf<Val, Idx>(devAcc, extent));
+    auto memBufAccY(alpaka::allocBuf<Val, Idx>(devAcc, extent));
 
     // Copy Host -> Acc.
-    alpaka::mem::view::copy(queue, memBufAccX, memBufHostX, extent);
-    alpaka::mem::view::copy(queue, memBufAccY, memBufHostOrigY, extent);
+    alpaka::memcpy(queue, memBufAccX, memBufHostX, extent);
+    alpaka::memcpy(queue, memBufAccY, memBufHostOrigY, extent);
 
 #if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-    alpaka::wait::wait(queue);
+    alpaka::wait(queue);
 
     std::cout << __func__ << " X_Dev: ";
-    alpaka::mem::view::print(memBufHostX, std::cout);
+    alpaka::print(memBufHostX, std::cout);
     std::cout << std::endl;
     std::cout << __func__ << " Y_Dev: ";
-    alpaka::mem::view::print(memBufHostX, std::cout);
+    alpaka::print(memBufHostX, std::cout);
     std::cout << std::endl;
 #endif
 
     // Create the kernel execution task.
-    auto const taskKernel(alpaka::kernel::createTaskKernel<Acc>(
+    auto const taskKernel(alpaka::createTaskKernel<Acc>(
         workDiv,
         kernel,
         numElements,
         alpha,
-        alpaka::mem::view::getPtrNative(memBufAccX),
-        alpaka::mem::view::getPtrNative(memBufAccY)));
+        alpaka::getPtrNative(memBufAccX),
+        alpaka::getPtrNative(memBufAccY)));
 
     // Profile the kernel execution.
-    std::cout << "Execution time: "
-        << alpaka::test::integ::measureTaskRunTimeMs(
-            queue,
-            taskKernel)
-        << " ms"
-        << std::endl;
+    std::cout << "Execution time: " << alpaka::test::integ::measureTaskRunTimeMs(queue, taskKernel) << " ms"
+              << std::endl;
 
     // Copy back the result.
-    alpaka::mem::view::copy(queue, memBufHostY, memBufAccY, extent);
+    alpaka::memcpy(queue, memBufHostY, memBufAccY, extent);
 
     // Wait for the queue to finish the memory operation.
-    alpaka::wait::wait(queue);
+    alpaka::wait(queue);
 
     bool resultCorrect(true);
     for(Idx i(0u); i < numElements; ++i)
     {
-        auto const & val(pBufHostY[i]);
+        auto const& val(pBufHostY[i]);
         auto const correctResult(alpha * pBufHostX[i] + pBufHostOrigY[i]);
         auto const relDiff = std::abs((val - correctResult) / std::min(val, correctResult));
-        if( relDiff > std::numeric_limits<Val>::epsilon() )
+        if(relDiff > std::numeric_limits<Val>::epsilon())
         {
             std::cerr << "C[" << i << "] == " << val << " != " << correctResult << std::endl;
             resultCorrect = false;
diff --git a/thirdParty/cupla/alpaka/test/integ/cudaOnly/CMakeLists.txt b/thirdParty/cupla/alpaka/test/integ/cudaOnly/CMakeLists.txt
index 7c60fc14db..d517629346 100644
--- a/thirdParty/cupla/alpaka/test/integ/cudaOnly/CMakeLists.txt
+++ b/thirdParty/cupla/alpaka/test/integ/cudaOnly/CMakeLists.txt
@@ -1,29 +1,26 @@
 #
-# Copyright 2016-2019 Benjamin Worpitz, Axel Huebl
+# Copyright 2016-2020 Benjamin Worpitz, Axel Huebl, Jan Stephan
 #
-# This file is part of Alpaka.
+# This file is part of alpaka.
 #
 # This Source Code Form is subject to the terms of the Mozilla Public
 # License, v. 2.0. If a copy of the MPL was not distributed with this
 # file, You can obtain one at http://mozilla.org/MPL/2.0/.
 #
 
-SET(_TARGET_NAME "cudaOnly")
+set(_TARGET_NAME "cudaOnly")
 
 append_recursive_files_add_to_src_group("src/" "src/" "cpp" _FILES_SOURCE)
 
-ALPAKA_ADD_EXECUTABLE(
+alpaka_add_executable(
     ${_TARGET_NAME}
     ${_FILES_SOURCE})
-TARGET_INCLUDE_DIRECTORIES(
-    ${_TARGET_NAME}
-    PRIVATE ${Boost_INCLUDE_DIRS})
-TARGET_LINK_LIBRARIES(
+target_link_libraries(
     ${_TARGET_NAME}
     PRIVATE common)
 
-SET_TARGET_PROPERTIES(${_TARGET_NAME} PROPERTIES FOLDER "test/integ")
+set_target_properties(${_TARGET_NAME} PROPERTIES FOLDER "test/integ")
 
-IF(ALPAKA_ACC_GPU_CUDA_ONLY_MODE AND ALPAKA_ACC_GPU_CUDA_ENABLE)
-    ADD_TEST(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME} ${_ALPAKA_TEST_OPTIONS})
-ENDIF()
+if(ALPAKA_ACC_GPU_CUDA_ONLY_MODE AND ALPAKA_ACC_GPU_CUDA_ENABLE)
+    add_test(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME} ${_ALPAKA_TEST_OPTIONS})
+endif()
diff --git a/thirdParty/cupla/alpaka/test/integ/cudaOnly/src/cudaNativeFunctions.cpp b/thirdParty/cupla/alpaka/test/integ/cudaOnly/src/cudaNativeFunctions.cpp
index 4b5fa2d9cf..0c3ab61266 100644
--- a/thirdParty/cupla/alpaka/test/integ/cudaOnly/src/cudaNativeFunctions.cpp
+++ b/thirdParty/cupla/alpaka/test/integ/cudaOnly/src/cudaNativeFunctions.cpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Axel Huebl, Benjamin Worpitz, René Widera
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -8,7 +8,6 @@
  */
 
 #include <alpaka/alpaka.hpp>
-
 #include <alpaka/test/KernelExecutionFixture.hpp>
 
 #include <catch2/catch.hpp>
@@ -17,30 +16,25 @@
 
 //-----------------------------------------------------------------------------
 //! Native CUDA function.
-#if BOOST_COMP_CLANG
-    #pragma clang diagnostic push
-    #pragma clang diagnostic ignored "-Wmissing-prototypes"
-#endif
-__device__ auto userDefinedThreadFence()
--> void
+#    if BOOST_COMP_CLANG
+#        pragma clang diagnostic push
+#        pragma clang diagnostic ignored "-Wmissing-prototypes"
+#    endif
+__device__ auto userDefinedThreadFence() -> void
 {
     __threadfence();
 }
-#if BOOST_COMP_CLANG
-    #pragma clang diagnostic pop
-#endif
+#    if BOOST_COMP_CLANG
+#        pragma clang diagnostic pop
+#    endif
 
 //#############################################################################
 class CudaOnlyTestKernel
 {
 public:
     //-----------------------------------------------------------------------------
-    template<
-        typename TAcc>
-    ALPAKA_FN_ACC auto operator()(
-        TAcc const & acc,
-        bool * success) const
-    -> void
+    template<typename TAcc>
+    ALPAKA_FN_ACC auto operator()(TAcc const& acc, bool* success) const -> void
     {
         alpaka::ignore_unused(acc);
 
@@ -57,12 +51,11 @@ class CudaOnlyTestKernel
 //-----------------------------------------------------------------------------
 TEST_CASE("cudaOnlyModeWorking", "[cudaOnly]")
 {
-    using TAcc = alpaka::acc::AccGpuCudaRt<alpaka::dim::DimInt<1u>, std::uint32_t>;
-    using Dim = alpaka::dim::Dim<TAcc>;
-    using Idx = alpaka::idx::Idx<TAcc>;
+    using TAcc = alpaka::AccGpuCudaRt<alpaka::DimInt<1u>, std::uint32_t>;
+    using Dim = alpaka::Dim<TAcc>;
+    using Idx = alpaka::Idx<TAcc>;
 
-    alpaka::test::KernelExecutionFixture<TAcc> fixture(
-        alpaka::vec::Vec<Dim, Idx>::ones());
+    alpaka::test::KernelExecutionFixture<TAcc> fixture(alpaka::Vec<Dim, Idx>::ones());
 
     CudaOnlyTestKernel kernel;
 
diff --git a/thirdParty/cupla/alpaka/test/integ/mandelbrot/CMakeLists.txt b/thirdParty/cupla/alpaka/test/integ/mandelbrot/CMakeLists.txt
index 8e2242d2df..58a49d4f0c 100644
--- a/thirdParty/cupla/alpaka/test/integ/mandelbrot/CMakeLists.txt
+++ b/thirdParty/cupla/alpaka/test/integ/mandelbrot/CMakeLists.txt
@@ -1,27 +1,24 @@
 #
-# Copyright 2014-2019 Benjamin Worpitz, Axel Huebl
+# Copyright 2014-2020 Benjamin Worpitz, Axel Huebl, Jan Stephan
 #
-# This file is part of Alpaka.
+# This file is part of alpaka.
 #
 # This Source Code Form is subject to the terms of the Mozilla Public
 # License, v. 2.0. If a copy of the MPL was not distributed with this
 # file, You can obtain one at http://mozilla.org/MPL/2.0/.
 #
 
-SET(_TARGET_NAME "mandelbrot")
+set(_TARGET_NAME "mandelbrot")
 
 append_recursive_files_add_to_src_group("src/" "src/" "cpp" _FILES_SOURCE)
 
-ALPAKA_ADD_EXECUTABLE(
+alpaka_add_executable(
     ${_TARGET_NAME}
     ${_FILES_SOURCE})
-TARGET_INCLUDE_DIRECTORIES(
-    ${_TARGET_NAME}
-    PRIVATE ${Boost_INCLUDE_DIRS})
-TARGET_LINK_LIBRARIES(
+target_link_libraries(
     ${_TARGET_NAME}
     PRIVATE common)
 
-SET_TARGET_PROPERTIES(${_TARGET_NAME} PROPERTIES FOLDER "test/integ")
+set_target_properties(${_TARGET_NAME} PROPERTIES FOLDER "test/integ")
 
-ADD_TEST(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME} ${_ALPAKA_TEST_OPTIONS})
+add_test(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME} ${_ALPAKA_TEST_OPTIONS})
diff --git a/thirdParty/cupla/alpaka/test/integ/mandelbrot/src/mandelbrot.cpp b/thirdParty/cupla/alpaka/test/integ/mandelbrot/src/mandelbrot.cpp
index 4031357289..1cad905caa 100644
--- a/thirdParty/cupla/alpaka/test/integ/mandelbrot/src/mandelbrot.cpp
+++ b/thirdParty/cupla/alpaka/test/integ/mandelbrot/src/mandelbrot.cpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Axel Huebl, Benjamin Worpitz, Matthias Werner
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -8,70 +8,60 @@
  */
 
 #include <alpaka/alpaka.hpp>
-
 #include <alpaka/test/MeasureKernelRunTime.hpp>
 #include <alpaka/test/acc/TestAccs.hpp>
 #include <alpaka/test/queue/Queue.hpp>
 
 #include <catch2/catch.hpp>
 
+#include <algorithm>
+#include <fstream>
 #include <iostream>
 #include <typeinfo>
-#include <fstream>
-#include <algorithm>
 
 //#define ALPAKA_MANDELBROT_TEST_CONTINOUS_COLOR_MAPPING  // Define this to enable the continuous color mapping.
 
 //#############################################################################
 //! Complex Number.
-template<
-    typename T>
+template<typename T>
 class SimpleComplex
 {
 public:
     //-----------------------------------------------------------------------------
     ALPAKA_NO_HOST_ACC_WARNING
-    ALPAKA_FN_HOST_ACC SimpleComplex(
-        T const & a,
-        T const & b) :
-            r(a),
-            i(b)
-    {}
+    ALPAKA_FN_HOST_ACC SimpleComplex(T const& a, T const& b) : r(a), i(b)
+    {
+    }
     //-----------------------------------------------------------------------------
     ALPAKA_NO_HOST_ACC_WARNING
     ALPAKA_FN_INLINE
-    ALPAKA_FN_HOST_ACC auto absSq() const
-    -> T
+    ALPAKA_FN_HOST_ACC auto absSq() const -> T
     {
-        return r*r + i*i;
+        return r * r + i * i;
     }
     //-----------------------------------------------------------------------------
     ALPAKA_NO_HOST_ACC_WARNING
-    ALPAKA_FN_HOST_ACC auto operator*(SimpleComplex const & a)
-    -> SimpleComplex
+    ALPAKA_FN_HOST_ACC auto operator*(SimpleComplex const& a) -> SimpleComplex
     {
-        return SimpleComplex(r*a.r - i*a.i, i*a.r + r*a.i);
+        return SimpleComplex(r * a.r - i * a.i, i * a.r + r * a.i);
     }
     //-----------------------------------------------------------------------------
     ALPAKA_NO_HOST_ACC_WARNING
-    ALPAKA_FN_HOST_ACC auto operator*(float const & a)
-    -> SimpleComplex
+    ALPAKA_FN_HOST_ACC auto operator*(float const& a) -> SimpleComplex
     {
-        return SimpleComplex(r*a, i*a);
+        return SimpleComplex(r * a, i * a);
     }
     //-----------------------------------------------------------------------------
     ALPAKA_NO_HOST_ACC_WARNING
-    ALPAKA_FN_HOST_ACC auto operator+(SimpleComplex const & a)
-    -> SimpleComplex
+    ALPAKA_FN_HOST_ACC auto operator+(SimpleComplex const& a) -> SimpleComplex
     {
-        return SimpleComplex(r+a.r, i+a.i);
+        return SimpleComplex(r + a.r, i + a.i);
     }
     //-----------------------------------------------------------------------------
     ALPAKA_NO_HOST_ACC_WARNING
-    ALPAKA_FN_HOST_ACC auto operator+(float const & a)
-    -> SimpleComplex
+    ALPAKA_FN_HOST_ACC auto operator+(float const& a) -> SimpleComplex
     {
-        return SimpleComplex(r+a, i);
+        return SimpleComplex(r + a, i);
     }
 
 public:
@@ -120,34 +110,30 @@ class MandelbrotKernel
     //! \param fMaxI The top border.
     //! \param maxIterations The maximum number of iterations.
     ALPAKA_NO_HOST_ACC_WARNING
-    template<
-        typename TAcc>
+    template<typename TAcc>
     ALPAKA_FN_ACC auto operator()(
-        TAcc const & acc,
-        std::uint32_t * const pColors,
-        std::uint32_t const & numRows,
-        std::uint32_t const & numCols,
-        std::uint32_t const & pitchBytes,
-        float const & fMinR,
-        float const & fMaxR,
-        float const & fMinI,
-        float const & fMaxI,
-        std::uint32_t const & maxIterations) const
-    -> void
+        TAcc const& acc,
+        std::uint32_t* const pColors,
+        std::uint32_t const& numRows,
+        std::uint32_t const& numCols,
+        std::uint32_t const& pitchBytes,
+        float const& fMinR,
+        float const& fMaxR,
+        float const& fMinI,
+        float const& fMaxI,
+        std::uint32_t const& maxIterations) const -> void
     {
-        static_assert(
-            alpaka::dim::Dim<TAcc>::value == 2,
-            "The MandelbrotKernel expects 2-dimensional indices!");
+        static_assert(alpaka::Dim<TAcc>::value == 2, "The MandelbrotKernel expects 2-dimensional indices!");
 
-        auto const gridThreadIdx(alpaka::idx::getIdx<alpaka::Grid, alpaka::Threads>(acc));
-        auto const & gridThreadIdxX(gridThreadIdx[1u]);
-        auto const & gridThreadIdxY(gridThreadIdx[0u]);
+        auto const gridThreadIdx(alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc));
+        auto const& gridThreadIdxX(gridThreadIdx[1u]);
+        auto const& gridThreadIdxY(gridThreadIdx[0u]);
 
         if((gridThreadIdxY < numRows) && (gridThreadIdxX < numCols))
         {
             SimpleComplex<float> c(
-                (fMinR + (static_cast<float>(gridThreadIdxX)/float(numCols-1)*(fMaxR - fMinR))),
-                (fMinI + (static_cast<float>(gridThreadIdxY)/float(numRows-1)*(fMaxI - fMinI))));
+                (fMinR + (static_cast<float>(gridThreadIdxX) / float(numCols - 1) * (fMaxR - fMinR))),
+                (fMinI + (static_cast<float>(gridThreadIdxY) / float(numRows - 1) * (fMaxI - fMinI))));
 
             auto const iterationCount(iterateMandelbrot(c, maxIterations));
 
@@ -161,17 +147,16 @@ class MandelbrotKernel
         }
     }
     //-----------------------------------------------------------------------------
-    //! \return The number of iterations until the Mandelbrot iteration with the given Value reaches the absolute value of 2.
+    //! \return The number of iterations until the Mandelbrot iteration with the given Value reaches the absolute value
+    //! of 2.
     //!     Only does maxIterations steps and returns maxIterations if the value would be higher.
-    ALPAKA_FN_ACC static auto iterateMandelbrot(
-        SimpleComplex<float> const & c,
-        std::uint32_t const & maxIterations)
-    -> std::uint32_t
+    ALPAKA_FN_ACC static auto iterateMandelbrot(SimpleComplex<float> const& c, std::uint32_t const& maxIterations)
+        -> std::uint32_t
     {
         SimpleComplex<float> z(0.0f, 0.0f);
-        for(std::uint32_t iterations(0); iterations<maxIterations; ++iterations)
+        for(std::uint32_t iterations(0); iterations < maxIterations; ++iterations)
         {
-            z = z*z + c;
+            z = z * z + c;
             if(z.absSq() > 4.0f)
             {
                 return iterations;
@@ -182,12 +167,11 @@ class MandelbrotKernel
 
     //-----------------------------------------------------------------------------
     ALPAKA_FN_HOST_ACC static auto convertRgbSingleToBgra(
-        std::uint32_t const & r,
-        std::uint32_t const & g,
-        std::uint32_t const & b)
-    -> std::uint32_t
+        std::uint32_t const& r,
+        std::uint32_t const& g,
+        std::uint32_t const& b) -> std::uint32_t
     {
-        return 0xFF000000 | (r<<16) | (g<<8) | b;
+        return 0xFF000000 | (r << 16) | (g << 8) | b;
     }
 
 #ifdef ALPAKA_MANDELBROT_TEST_CONTINOUS_COLOR_MAPPING
@@ -196,28 +180,25 @@ class MandelbrotKernel
     //! This leads to banding but allows a all pixels to be colored.
     ALPAKA_NO_HOST_ACC_WARNING
     ALPAKA_FN_ACC static auto iterationCountToContinousColor(
-        std::uint32_t const & iterationCount,
-        std::uint32_t const & maxIterations)
-    -> std::uint32_t
+        std::uint32_t const& iterationCount,
+        std::uint32_t const& maxIterations) -> std::uint32_t
     {
         // Map the iteration count on the 0..1 interval.
-        float const t(static_cast<float>(iterationCount)/static_cast<float>(maxIterations));
-        float const oneMinusT(1.0f-t);
+        float const t(static_cast<float>(iterationCount) / static_cast<float>(maxIterations));
+        float const oneMinusT(1.0f - t);
         // Use some modified Bernstein polynomials for r, g, b.
-        std::uint32_t const r(static_cast<std::uint32_t>(9.0f*oneMinusT*t*t*t*255.0f));
-        std::uint32_t const g(static_cast<std::uint32_t>(15.0f*oneMinusT*oneMinusT*t*t*255.0f));
-        std::uint32_t const b(static_cast<std::uint32_t>(8.5f*oneMinusT*oneMinusT*oneMinusT*t*255.0f));
+        std::uint32_t const r(static_cast<std::uint32_t>(9.0f * oneMinusT * t * t * t * 255.0f));
+        std::uint32_t const g(static_cast<std::uint32_t>(15.0f * oneMinusT * oneMinusT * t * t * 255.0f));
+        std::uint32_t const b(static_cast<std::uint32_t>(8.5f * oneMinusT * oneMinusT * oneMinusT * t * 255.0f));
         return convertRgbSingleToBgra(r, g, b);
     }
 #else
     //-----------------------------------------------------------------------------
     //! This uses a simple mapping from iteration count to colors.
     //! This leads to banding but allows a all pixels to be colored.
-    ALPAKA_FN_ACC auto iterationCountToRepeatedColor(
-        std::uint32_t const & iterationCount) const
-    -> std::uint32_t
+    ALPAKA_FN_ACC auto iterationCountToRepeatedColor(std::uint32_t const& iterationCount) const -> std::uint32_t
     {
-        return m_colors[iterationCount%16];
+        return m_colors[iterationCount % 16];
     }
 
     std::uint32_t m_colors[16];
@@ -226,23 +207,15 @@ class MandelbrotKernel
 
 //-----------------------------------------------------------------------------
 //! Writes the buffer color data to a file.
-template<
-    typename TBuf>
-auto writeTgaColorImage(
-    std::string const & fileName,
-    TBuf const & bufRgba)
--> void
+template<typename TBuf>
+auto writeTgaColorImage(std::string const& fileName, TBuf const& bufRgba) -> void
 {
-    static_assert(
-        alpaka::dim::Dim<TBuf>::value == 2,
-        "The buffer has to be 2 dimensional!");
-    static_assert(
-        std::is_integral<alpaka::elem::Elem<TBuf>>::value,
-        "The buffer element type has to be integral!");
+    static_assert(alpaka::Dim<TBuf>::value == 2, "The buffer has to be 2 dimensional!");
+    static_assert(std::is_integral<alpaka::Elem<TBuf>>::value, "The buffer element type has to be integral!");
 
     // The width of the input buffer is in input elements.
     auto const bufWidthElems(alpaka::extent::getWidth(bufRgba));
-    auto const bufWidthBytes(bufWidthElems * sizeof(alpaka::elem::Elem<TBuf>));
+    auto const bufWidthBytes(bufWidthElems * sizeof(alpaka::Elem<TBuf>));
     // The row width in bytes has to be dividable by 4 Bytes (RGBA).
     ALPAKA_ASSERT(bufWidthBytes % sizeof(std::uint32_t) == 0);
     // The number of colors in a row.
@@ -250,72 +223,64 @@ auto writeTgaColorImage(
     ALPAKA_ASSERT(bufWidthColors >= 1);
     auto const bufHeightColors(alpaka::extent::getHeight(bufRgba));
     ALPAKA_ASSERT(bufHeightColors >= 1);
-    auto const bufPitchBytes(alpaka::mem::view::getPitchBytes<alpaka::dim::Dim<TBuf>::value - 1u>(bufRgba));
+    auto const bufPitchBytes(alpaka::getPitchBytes<alpaka::Dim<TBuf>::value - 1u>(bufRgba));
     ALPAKA_ASSERT(bufPitchBytes >= bufWidthBytes);
 
-    std::ofstream ofs(
-        fileName,
-        std::ofstream::out | std::ofstream::binary);
+    std::ofstream ofs(fileName, std::ofstream::out | std::ofstream::binary);
     if(!ofs.is_open())
     {
-        throw std::invalid_argument("Unable to open file: "+fileName);
+        throw std::invalid_argument("Unable to open file: " + fileName);
     }
 
     // Write tga image header.
-    ofs.put(0x00);                      // Number of Characters in Identification Field.
-    ofs.put(0x00);                      // Color Map Type.
-    ofs.put(0x02);                      // Image Type Code.
-    ofs.put(0x00);                      // Color Map Origin.
+    ofs.put(0x00); // Number of Characters in Identification Field.
+    ofs.put(0x00); // Color Map Type.
+    ofs.put(0x02); // Image Type Code.
+    ofs.put(0x00); // Color Map Origin.
     ofs.put(0x00);
-    ofs.put(0x00);                      // Color Map Length.
+    ofs.put(0x00); // Color Map Length.
     ofs.put(0x00);
-    ofs.put(0x00);                      // Color Map Entry Size.
-    ofs.put(0x00);                      // X Origin of Image.
+    ofs.put(0x00); // Color Map Entry Size.
+    ofs.put(0x00); // X Origin of Image.
     ofs.put(0x00);
-    ofs.put(0x00);                      // Y Origin of Image.
+    ofs.put(0x00); // Y Origin of Image.
     ofs.put(0x00);
     ofs.put(static_cast<char>(bufWidthColors & 0xFFu)); // Width of Image.
     ofs.put(static_cast<char>((bufWidthColors >> 8) & 0xFFu));
-    ofs.put(static_cast<char>(bufHeightColors & 0xFFu));// Height of Image.
+    ofs.put(static_cast<char>(bufHeightColors & 0xFFu)); // Height of Image.
     ofs.put(static_cast<char>((bufHeightColors >> 8) & 0xFFu));
-    ofs.put(0x20);                      // Image Pixel Size.
-    ofs.put(0x20);                      // Image Descriptor Byte.
+    ofs.put(0x20); // Image Pixel Size.
+    ofs.put(0x20); // Image Descriptor Byte.
 
     // Write the data.
-    char const * pData(reinterpret_cast<char const *>(alpaka::mem::view::getPtrNative(bufRgba)));
+    char const* pData(reinterpret_cast<char const*>(alpaka::getPtrNative(bufRgba)));
     // If there is no padding, we can directly write the whole buffer data ...
     if(bufPitchBytes == bufWidthBytes)
     {
-        ofs.write(
-            pData,
-            static_cast<std::streamsize>(bufWidthBytes*bufHeightColors));
+        ofs.write(pData, static_cast<std::streamsize>(bufWidthBytes * bufHeightColors));
     }
     // ... else we have to write row by row.
     else
     {
-        for(auto row(decltype(bufHeightColors)(0)); row<bufHeightColors; ++row)
+        for(auto row(decltype(bufHeightColors)(0)); row < bufHeightColors; ++row)
         {
-            ofs.write(
-                pData + bufPitchBytes*row,
-                static_cast<std::streamsize>(bufWidthBytes));
+            ofs.write(pData + bufPitchBytes * row, static_cast<std::streamsize>(bufWidthBytes));
         }
     }
 }
 
-using TestAccs = alpaka::test::acc::EnabledAccs<
-    alpaka::dim::DimInt<2u>,
-    std::uint32_t>;
+using TestAccs = alpaka::test::EnabledAccs<alpaka::DimInt<2u>, std::uint32_t>;
 
-TEMPLATE_LIST_TEST_CASE( "mandelbrot", "[mandelbrot]", TestAccs)
+TEMPLATE_LIST_TEST_CASE("mandelbrot", "[mandelbrot]", TestAccs)
 {
     using Acc = TestType;
-    using Dim = alpaka::dim::Dim<Acc>;
-    using Idx = alpaka::idx::Idx<Acc>;
+    using Dim = alpaka::Dim<Acc>;
+    using Idx = alpaka::Idx<Acc>;
 
 #ifdef ALPAKA_CI
-    Idx const imageSize(1u<<5u);
+    Idx const imageSize(1u << 5u);
 #else
-    Idx const imageSize(1u<<10u);
+    Idx const imageSize(1u << 10u);
 #endif
     Idx const numRows(imageSize);
     Idx const numCols(imageSize);
@@ -326,68 +291,55 @@ TEMPLATE_LIST_TEST_CASE( "mandelbrot", "[mandelbrot]", TestAccs)
     Idx const maxIterations(300u);
 
     using Val = std::uint32_t;
-    using DevAcc = alpaka::dev::Dev<Acc>;
-    using PltfAcc = alpaka::pltf::Pltf<DevAcc>;
-    using QueueAcc = alpaka::test::queue::DefaultQueue<DevAcc>;
-    using PltfHost = alpaka::pltf::PltfCpu;
+    using DevAcc = alpaka::Dev<Acc>;
+    using PltfAcc = alpaka::Pltf<DevAcc>;
+    using QueueAcc = alpaka::test::DefaultQueue<DevAcc>;
+    using PltfHost = alpaka::PltfCpu;
 
     // Create the kernel function object.
     MandelbrotKernel kernel;
 
     // Get the host device.
-    auto const devHost(
-        alpaka::pltf::getDevByIdx<PltfHost>(0u));
+    auto const devHost(alpaka::getDevByIdx<PltfHost>(0u));
 
     // Select a device to execute on.
-    auto const devAcc(
-        alpaka::pltf::getDevByIdx<PltfAcc>(0u));
+    auto const devAcc(alpaka::getDevByIdx<PltfAcc>(0u));
 
     // Get a queue on this device.
-    QueueAcc queue(
-        devAcc);
+    QueueAcc queue(devAcc);
 
-    alpaka::vec::Vec<Dim, Idx> const extent(
-        static_cast<Idx>(numRows),
-        static_cast<Idx>(numCols));
+    alpaka::Vec<Dim, Idx> const extent(static_cast<Idx>(numRows), static_cast<Idx>(numCols));
 
     // Let alpaka calculate good block and grid sizes given our full problem extent.
-    alpaka::workdiv::WorkDivMembers<Dim, Idx> const workDiv(
-        alpaka::workdiv::getValidWorkDiv<Acc>(
-            devAcc,
-            extent,
-            alpaka::vec::Vec<Dim, Idx>::ones(),
-            false,
-            alpaka::workdiv::GridBlockExtentSubDivRestrictions::Unrestricted));
-
-    std::cout
-        << "MandelbrotKernel("
-        << " numRows:" << numRows
-        << ", numCols:" << numCols
-        << ", maxIterations:" << maxIterations
-        << ", accelerator: " << alpaka::acc::getAccName<Acc>()
-        << ", kernel: " << typeid(kernel).name()
-        << ", workDiv: " << workDiv
-        << ")" << std::endl;
+    alpaka::WorkDivMembers<Dim, Idx> const workDiv(alpaka::getValidWorkDiv<Acc>(
+        devAcc,
+        extent,
+        alpaka::Vec<Dim, Idx>::ones(),
+        false,
+        alpaka::GridBlockExtentSubDivRestrictions::Unrestricted));
+
+    std::cout << "MandelbrotKernel("
+              << " numRows:" << numRows << ", numCols:" << numCols << ", maxIterations:" << maxIterations
+              << ", accelerator: " << alpaka::getAccName<Acc>() << ", kernel: " << typeid(kernel).name()
+              << ", workDiv: " << workDiv << ")" << std::endl;
 
     // allocate host memory
-    auto bufColorHost(
-        alpaka::mem::buf::alloc<Val, Idx>(devHost, extent));
+    auto bufColorHost(alpaka::allocBuf<Val, Idx>(devHost, extent));
 
     // Allocate the buffer on the accelerator.
-    auto bufColorAcc(
-        alpaka::mem::buf::alloc<Val, Idx>(devAcc, extent));
+    auto bufColorAcc(alpaka::allocBuf<Val, Idx>(devAcc, extent));
 
     // Copy Host -> Acc.
-    alpaka::mem::view::copy(queue, bufColorAcc, bufColorHost, extent);
+    alpaka::memcpy(queue, bufColorAcc, bufColorHost, extent);
 
     // Create the kernel execution task.
-    auto const taskKernel(alpaka::kernel::createTaskKernel<Acc>(
+    auto const taskKernel(alpaka::createTaskKernel<Acc>(
         workDiv,
         kernel,
-        alpaka::mem::view::getPtrNative(bufColorAcc),
+        alpaka::getPtrNative(bufColorAcc),
         numRows,
         numCols,
-        alpaka::mem::view::getPitchBytes<1u>(bufColorAcc),
+        alpaka::getPitchBytes<1u>(bufColorAcc),
         fMinR,
         fMaxR,
         fMinI,
@@ -395,24 +347,20 @@ TEMPLATE_LIST_TEST_CASE( "mandelbrot", "[mandelbrot]", TestAccs)
         maxIterations));
 
     // Profile the kernel execution.
-    std::cout << "Execution time: "
-        << alpaka::test::integ::measureTaskRunTimeMs(
-            queue,
-            taskKernel)
-        << " ms"
-        << std::endl;
+    std::cout << "Execution time: " << alpaka::test::integ::measureTaskRunTimeMs(queue, taskKernel) << " ms"
+              << std::endl;
 
     // Copy back the result.
-    alpaka::mem::view::copy(queue, bufColorHost, bufColorAcc, extent);
+    alpaka::memcpy(queue, bufColorHost, bufColorAcc, extent);
 
     // Wait for the queue to finish the memory operation.
-    alpaka::wait::wait(queue);
+    alpaka::wait(queue);
 
     // Write the image to a file.
-    std::string fileName("mandelbrot"+std::to_string(numCols)+"x"+std::to_string(numRows)+"_"+alpaka::acc::getAccName<Acc>()+".tga");
+    std::string fileName(
+        "mandelbrot" + std::to_string(numCols) + "x" + std::to_string(numRows) + "_" + alpaka::getAccName<Acc>()
+        + ".tga");
     std::replace(fileName.begin(), fileName.end(), '<', '_');
     std::replace(fileName.begin(), fileName.end(), '>', '_');
-    writeTgaColorImage(
-        fileName,
-        bufColorHost);
+    writeTgaColorImage(fileName, bufColorHost);
 }
diff --git a/thirdParty/cupla/alpaka/test/integ/matMul/CMakeLists.txt b/thirdParty/cupla/alpaka/test/integ/matMul/CMakeLists.txt
index 6c5e91a091..109d87206f 100644
--- a/thirdParty/cupla/alpaka/test/integ/matMul/CMakeLists.txt
+++ b/thirdParty/cupla/alpaka/test/integ/matMul/CMakeLists.txt
@@ -1,27 +1,24 @@
 #
-# Copyright 2014-2019 Benjamin Worpitz, Axel Huebl
+# Copyright 2014-2020 Benjamin Worpitz, Axel Huebl, Jan Stephan
 #
-# This file is part of Alpaka.
+# This file is part of alpaka.
 #
 # This Source Code Form is subject to the terms of the Mozilla Public
 # License, v. 2.0. If a copy of the MPL was not distributed with this
 # file, You can obtain one at http://mozilla.org/MPL/2.0/.
 #
 
-SET(_TARGET_NAME "matMul")
+set(_TARGET_NAME "matMul")
 
 append_recursive_files_add_to_src_group("src/" "src/" "cpp" _FILES_SOURCE)
 
-ALPAKA_ADD_EXECUTABLE(
+alpaka_add_executable(
     ${_TARGET_NAME}
     ${_FILES_SOURCE})
-TARGET_INCLUDE_DIRECTORIES(
-    ${_TARGET_NAME}
-    PRIVATE ${Boost_INCLUDE_DIRS})
-TARGET_LINK_LIBRARIES(
+target_link_libraries(
     ${_TARGET_NAME}
     PRIVATE common)
 
-SET_TARGET_PROPERTIES(${_TARGET_NAME} PROPERTIES FOLDER "test/integ")
+set_target_properties(${_TARGET_NAME} PROPERTIES FOLDER "test/integ")
 
-ADD_TEST(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME} ${_ALPAKA_TEST_OPTIONS})
+add_test(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME} ${_ALPAKA_TEST_OPTIONS})
diff --git a/thirdParty/cupla/alpaka/test/integ/matMul/src/matMul.cpp b/thirdParty/cupla/alpaka/test/integ/matMul/src/matMul.cpp
index cda1f0855a..94744d7051 100644
--- a/thirdParty/cupla/alpaka/test/integ/matMul/src/matMul.cpp
+++ b/thirdParty/cupla/alpaka/test/integ/matMul/src/matMul.cpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Axel Huebl, Benjamin Worpitz, Matthias Werner, René Widera
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -8,17 +8,16 @@
  */
 
 #include <alpaka/alpaka.hpp>
-
 #include <alpaka/test/MeasureKernelRunTime.hpp>
 #include <alpaka/test/acc/TestAccs.hpp>
 #include <alpaka/test/queue/Queue.hpp>
 
 #include <catch2/catch.hpp>
 
+#include <functional>
 #include <iostream>
 #include <typeinfo>
 #include <vector>
-#include <functional>
 
 //#############################################################################
 //! A matrix multiplication kernel.
@@ -41,50 +40,47 @@ class MatMulKernel
     //! \param C The pointer to the matrix C data.
     //! \param ldc The pitch of the C matrix in elements.
     ALPAKA_NO_HOST_ACC_WARNING
-    template<
-        typename TAcc,
-        typename TElem,
-        typename TIndex>
+    template<typename TAcc, typename TElem, typename TIndex>
     ALPAKA_FN_ACC auto operator()(
-        TAcc const & acc,
-        TIndex const & m,
-        TIndex const & n,
-        TIndex const & k,
-        TElem const & alpha,
-        TElem const * const A,
-        TIndex const & lda,
-        TElem const * const B,
-        TIndex const & ldb,
-        TElem const & beta,
-        TElem * const C,
-        TIndex const & ldc) const
-    -> void
+        TAcc const& acc,
+        TIndex const& m,
+        TIndex const& n,
+        TIndex const& k,
+        TElem const& alpha,
+        TElem const* const A,
+        TIndex const& lda,
+        TElem const* const B,
+        TIndex const& ldb,
+        TElem const& beta,
+        TElem* const C,
+        TIndex const& ldc) const -> void
     {
-        static_assert(alpaka::dim::Dim<TAcc>::value == 2u,
+        static_assert(
+            alpaka::Dim<TAcc>::value == 2u,
             "The accelerator used for the GemmAlpakaKernel has to be 2 dimensional!");
 
         // Column and row of C to calculate.
-        auto const gridThreadIdx(alpaka::idx::getIdx<alpaka::Grid, alpaka::Threads>(acc));
-        auto const & gridThreadIdxX(gridThreadIdx[1u]);
-        auto const & gridThreadIdxY(gridThreadIdx[0u]);
+        auto const gridThreadIdx(alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc));
+        auto const& gridThreadIdxX(gridThreadIdx[1u]);
+        auto const& gridThreadIdxY(gridThreadIdx[0u]);
 
         // Column and row inside the block of C to calculate.
-        auto const blockThreadIdx(alpaka::idx::getIdx<alpaka::Block, alpaka::Threads>(acc));
-        auto const & blockThreadIdxX(blockThreadIdx[1u]);
-        auto const & blockThreadIdxY(blockThreadIdx[0u]);
+        auto const blockThreadIdx(alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc));
+        auto const& blockThreadIdxX(blockThreadIdx[1u]);
+        auto const& blockThreadIdxY(blockThreadIdx[0u]);
 
         // The block threads extent.
-        auto const blockThreadExtent(alpaka::workdiv::getWorkDiv<alpaka::Block, alpaka::Threads>(acc));
-        auto const & blockThreadExtentX(blockThreadExtent[1u]);
-        auto const & blockThreadExtentY(blockThreadExtent[0u]);
-        //ALPAKA_ASSERT(blockThreadExtentX == blockThreadExtentY);
-        auto const & blockThreadExtentVal(blockThreadExtentX);
+        auto const blockThreadExtent(alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc));
+        auto const& blockThreadExtentX(blockThreadExtent[1u]);
+        auto const& blockThreadExtentY(blockThreadExtent[0u]);
+        // ALPAKA_ASSERT(blockThreadExtentX == blockThreadExtentY);
+        auto const& blockThreadExtentVal(blockThreadExtentX);
 
         // Shared memory used to store the current blocks of A and B.
-        auto * const pBlockSharedA(alpaka::block::shared::dyn::getMem<TElem>(acc));
-        auto * const pBlockSharedB(pBlockSharedA + blockThreadExtentX*blockThreadExtentY);
+        auto* const pBlockSharedA(alpaka::getDynSharedMem<TElem>(acc));
+        auto* const pBlockSharedB(pBlockSharedA + blockThreadExtentX * blockThreadExtentY);
 
-        auto const sharedBlockIdx1d(blockThreadIdxY*blockThreadExtentX + blockThreadIdxX);
+        auto const sharedBlockIdx1d(blockThreadIdxY * blockThreadExtentX + blockThreadIdxX);
 
         // If the element corresponding to the current thread is outside of the respective matrix.
         bool const insideA(gridThreadIdxY < m);
@@ -94,48 +90,45 @@ class MatMulKernel
         TElem dotProduct(0);
 
         // Loop over all blocks of A and B that are required to compute the C block.
-        auto const blockMulCount(static_cast<TIndex>(std::ceil(static_cast<float>(k)/static_cast<float>(blockThreadExtentVal))));
+        auto const blockMulCount(
+            static_cast<TIndex>(std::ceil(static_cast<float>(k) / static_cast<float>(blockThreadExtentVal))));
         for(TIndex k2(0u); k2 < blockMulCount; ++k2)
         {
             // Copy the current blocks of A and B into shared memory in parallel.
             // If the element of the current thread is outside of the matrix, zero is written into the shared memory.
             // This is possible because zero is a result neutral extension of the matrices regarding the dot product.
-            auto const AIdxX(k2*blockThreadExtentX + blockThreadIdxX);
-            auto const AIdx1d(gridThreadIdxY*lda + AIdxX);
-            pBlockSharedA[sharedBlockIdx1d] = (
-                ((!insideA) || (AIdxX>=k))
-                ? static_cast<TElem>(0)
-                : A[AIdx1d]);
-
-            auto const BIdxY(k2*blockThreadExtentY + blockThreadIdxY);
-            auto const BIdx1d(BIdxY*ldb + gridThreadIdxX);
-            pBlockSharedB[sharedBlockIdx1d] = (
-                ((!insideB) || (BIdxY>=k))
-                ? static_cast<TElem>(0)
-                : B[BIdx1d]);
+            auto const AIdxX(k2 * blockThreadExtentX + blockThreadIdxX);
+            auto const AIdx1d(gridThreadIdxY * lda + AIdxX);
+            pBlockSharedA[sharedBlockIdx1d] = (((!insideA) || (AIdxX >= k)) ? static_cast<TElem>(0) : A[AIdx1d]);
+
+            auto const BIdxY(k2 * blockThreadExtentY + blockThreadIdxY);
+            auto const BIdx1d(BIdxY * ldb + gridThreadIdxX);
+            pBlockSharedB[sharedBlockIdx1d] = (((!insideB) || (BIdxY >= k)) ? static_cast<TElem>(0) : B[BIdx1d]);
 
             // Synchronize to make sure the complete blocks are loaded before starting the computation.
-            alpaka::block::sync::syncBlockThreads(acc);
+            alpaka::syncBlockThreads(acc);
 
             // Not really necessary because we wrote zeros into those cells.
-            //if(insideC)
+            // if(insideC)
             //{
-                // Compute the dot products within shared memory.
-                for(TIndex k3(0); k3 < blockThreadExtentVal; ++k3)
-                {
-                    dotProduct += pBlockSharedA[blockThreadIdxY*blockThreadExtentX + k3]
-                        * pBlockSharedB[k3*blockThreadExtentY + blockThreadIdxX];
-                }
+            // Compute the dot products within shared memory.
+            for(TIndex k3(0); k3 < blockThreadExtentVal; ++k3)
+            {
+                dotProduct += pBlockSharedA[blockThreadIdxY * blockThreadExtentX + k3]
+                    * pBlockSharedB[k3 * blockThreadExtentY + blockThreadIdxX];
+            }
             //}
 
-            // Synchronize to make sure that the preceding computation is done before loading the next blocks of A and B.
-            alpaka::block::sync::syncBlockThreads(acc);
+            // Synchronize to make sure that the preceding computation is done before loading the next blocks of A and
+            // B.
+            alpaka::syncBlockThreads(acc);
         }
 
-        // If the element is outside of the matrix it was only a helper thread that did not calculate any meaningful results.
+        // If the element is outside of the matrix it was only a helper thread that did not calculate any meaningful
+        // results.
         if(insideC)
         {
-            auto const CIdx1d(gridThreadIdxY*ldc + gridThreadIdxX);
+            auto const CIdx1d(gridThreadIdxY * ldc + gridThreadIdxX);
             C[CIdx1d] = alpha * dotProduct + beta * C[CIdx1d];
         }
     }
@@ -143,209 +136,171 @@ class MatMulKernel
 
 namespace alpaka
 {
-    namespace kernel
+    namespace traits
     {
-        namespace traits
+        //#############################################################################
+        //! The trait for getting the size of the block shared dynamic memory for a kernel.
+        template<typename TAcc>
+        struct BlockSharedMemDynSizeBytes<MatMulKernel, TAcc>
         {
-            //#############################################################################
-            //! The trait for getting the size of the block shared dynamic memory for a kernel.
-            template<
-                typename TAcc>
-            struct BlockSharedMemDynSizeBytes<
-                MatMulKernel,
-                TAcc>
+            //-----------------------------------------------------------------------------
+            //! \return The size of the shared memory allocated for a block.
+            template<typename TVec, typename TIndex, typename TElem>
+            ALPAKA_FN_HOST_ACC static auto getBlockSharedMemDynSizeBytes(
+                MatMulKernel const& matMulKernel,
+                TVec const& blockThreadExtent,
+                TVec const& threadElemExtent,
+                TIndex const& m,
+                TIndex const& n,
+                TIndex const& k,
+                TElem const& alpha,
+                TElem const* const A,
+                TIndex const& lda,
+                TElem const* const B,
+                TIndex const& ldb,
+                TElem const& beta,
+                TElem* const C,
+                TIndex const& ldc)
             {
-                //-----------------------------------------------------------------------------
-                //! \return The size of the shared memory allocated for a block.
-                template<
-                    typename TVec,
-                    typename TIndex,
-                    typename TElem>
-                ALPAKA_FN_HOST_ACC static auto getBlockSharedMemDynSizeBytes(
-                    MatMulKernel const & matMulKernel,
-                    TVec const & blockThreadExtent,
-                    TVec const & threadElemExtent,
-                    TIndex const & m,
-                    TIndex const & n,
-                    TIndex const & k,
-                    TElem const & alpha,
-                    TElem const * const A,
-                    TIndex const & lda,
-                    TElem const * const B,
-                    TIndex const & ldb,
-                    TElem const & beta,
-                    TElem * const C,
-                    TIndex const & ldc)
-                -> TIndex
-                {
-                    alpaka::ignore_unused(matMulKernel);
-                    alpaka::ignore_unused(m);
-                    alpaka::ignore_unused(n);
-                    alpaka::ignore_unused(k);
-                    alpaka::ignore_unused(alpha);
-                    alpaka::ignore_unused(A);
-                    alpaka::ignore_unused(lda);
-                    alpaka::ignore_unused(B);
-                    alpaka::ignore_unused(ldb);
-                    alpaka::ignore_unused(beta);
-                    alpaka::ignore_unused(C);
-                    alpaka::ignore_unused(ldc);
-
-                    // Reserve the buffer for the two blocks of A and B.
-                    return 2u * blockThreadExtent.prod() * threadElemExtent.prod() * sizeof(TElem);
-                }
-            };
-        }
-    }
-}
-
-using TestAccs = alpaka::test::acc::EnabledAccs<
-    alpaka::dim::DimInt<2u>,
-    std::uint32_t>;
-
-TEMPLATE_LIST_TEST_CASE( "matMul", "[matMul]", TestAccs)
+                alpaka::ignore_unused(matMulKernel);
+                alpaka::ignore_unused(m);
+                alpaka::ignore_unused(n);
+                alpaka::ignore_unused(k);
+                alpaka::ignore_unused(alpha);
+                alpaka::ignore_unused(A);
+                alpaka::ignore_unused(lda);
+                alpaka::ignore_unused(B);
+                alpaka::ignore_unused(ldb);
+                alpaka::ignore_unused(beta);
+                alpaka::ignore_unused(C);
+                alpaka::ignore_unused(ldc);
+
+                // Reserve the buffer for the two blocks of A and B.
+                return static_cast<std::size_t>(2u * blockThreadExtent.prod() * threadElemExtent.prod())
+                    * sizeof(TElem);
+            }
+        };
+    } // namespace traits
+} // namespace alpaka
+
+using TestAccs = alpaka::test::EnabledAccs<alpaka::DimInt<2u>, std::uint32_t>;
+
+TEMPLATE_LIST_TEST_CASE("matMul", "[matMul]", TestAccs)
 {
     using Acc = TestType;
-    using Dim = alpaka::dim::Dim<Acc>;
-    using Idx = alpaka::idx::Idx<Acc>;
+    using Dim = alpaka::Dim<Acc>;
+    using Idx = alpaka::Idx<Acc>;
 
     Idx const m(64u);
     Idx const n(79u);
     Idx const k(23u);
 
     using Val = std::uint32_t;
-    using Vec2 = alpaka::vec::Vec<Dim, Idx>;
-    using DevAcc = alpaka::dev::Dev<Acc>;
-    using PltfAcc = alpaka::pltf::Pltf<DevAcc>;
-    using QueueAcc = alpaka::test::queue::DefaultQueue<alpaka::dev::Dev<Acc>>;
-    using PltfHost = alpaka::pltf::PltfCpu;
-    using DevHost = alpaka::dev::Dev<PltfHost>;
-    using QueueHost = alpaka::queue::QueueCpuNonBlocking;
+    using Vec2 = alpaka::Vec<Dim, Idx>;
+    using DevAcc = alpaka::Dev<Acc>;
+    using PltfAcc = alpaka::Pltf<DevAcc>;
+    using QueueAcc = alpaka::test::DefaultQueue<alpaka::Dev<Acc>>;
+    using PltfHost = alpaka::PltfCpu;
+    using DevHost = alpaka::Dev<PltfHost>;
+    using QueueHost = alpaka::QueueCpuNonBlocking;
 
     // Create the kernel function object.
     MatMulKernel kernel;
 
     // Get the host device.
-    DevHost const devHost(
-        alpaka::pltf::getDevByIdx<PltfHost>(0u));
+    DevHost const devHost(alpaka::getDevByIdx<PltfHost>(0u));
 
     // Get a queue on the host device.
-    QueueHost queueHost(
-        devHost);
+    QueueHost queueHost(devHost);
 
     // Select a device to execute on.
-    DevAcc const devAcc(
-        alpaka::pltf::getDevByIdx<PltfAcc>(0u));
+    DevAcc const devAcc(alpaka::getDevByIdx<PltfAcc>(0u));
 
     // Get a queue on the accelerator device.
-    QueueAcc queueAcc(
-        devAcc);
+    QueueAcc queueAcc(devAcc);
 
     // Specify the input matrix extents.
-    Vec2 const extentA(
-        static_cast<Idx>(m),
-        static_cast<Idx>(k));
+    Vec2 const extentA(static_cast<Idx>(m), static_cast<Idx>(k));
 
-    Vec2 const extentB(
-        static_cast<Idx>(k),
-        static_cast<Idx>(n));
+    Vec2 const extentB(static_cast<Idx>(k), static_cast<Idx>(n));
 
     // Result matrix is MxN. We create one worker per result matrix cell.
-    Vec2 const extentC(
-        static_cast<Idx>(m),
-        static_cast<Idx>(n));
+    Vec2 const extentC(static_cast<Idx>(m), static_cast<Idx>(n));
 
     // Let alpaka calculate good block and grid sizes given our full problem extent.
-    alpaka::workdiv::WorkDivMembers<Dim, Idx> const workDiv(
-        alpaka::workdiv::getValidWorkDiv<Acc>(
-            devAcc,
-            extentC,
-            alpaka::vec::Vec<Dim, Idx>::ones(),
-            false,
-            alpaka::workdiv::GridBlockExtentSubDivRestrictions::EqualExtent));
-
-    std::cout
-        << "MatMulKernel("
-        << "m:" << m
-        << ", n:" << n
-        << ", k:" << k
-        << ", accelerator: " << alpaka::acc::getAccName<Acc>()
-        << ", kernel: " << typeid(kernel).name()
-        << ", workDiv: " << workDiv
-        << ")" << std::endl;
+    alpaka::WorkDivMembers<Dim, Idx> const workDiv(alpaka::getValidWorkDiv<Acc>(
+        devAcc,
+        extentC,
+        alpaka::Vec<Dim, Idx>::ones(),
+        false,
+        alpaka::GridBlockExtentSubDivRestrictions::EqualExtent));
+
+    std::cout << "MatMulKernel("
+              << "m:" << m << ", n:" << n << ", k:" << k << ", accelerator: " << alpaka::getAccName<Acc>()
+              << ", kernel: " << typeid(kernel).name() << ", workDiv: " << workDiv << ")" << std::endl;
 
     // Allocate the A and B matrices as std::vectors because this allows them to be filled with uint32_t(1).
-    // alpaka::mem::view::set only supports setting all bytes leading to a value of 16843009 in all elements.
+    // alpaka::memset only supports setting all bytes leading to a value of 16843009 in all elements.
     std::vector<Val> bufAHost1d(m * k, static_cast<Val>(1));
     std::vector<Val> bufBHost1d(k * n, static_cast<Val>(1));
     // Wrap the std::vectors into a memory buffer object.
-    // For 1D data this would not be required because alpaka::mem::view::copy is specialized for std::vector and std::array.
-    // For multi dimensional data you could directly create them using alpaka::mem::buf::alloc<Type>(devHost, extent), which is not used here.
-    // Instead we use ViewPlainPtr to wrap the data.
-    using BufWrapper = alpaka::mem::view::ViewPlainPtr<
-        DevHost,
-        Val,
-        Dim,
-        Idx>;
+    // For 1D data this would not be required because alpaka::memcpy is specialized for std::vector and std::array.
+    // For multi dimensional data you could directly create them using alpaka::allocBuf<Type, Idx>(devHost, extent),
+    // which is not used here. Instead we use ViewPlainPtr to wrap the data.
+    using BufWrapper = alpaka::ViewPlainPtr<DevHost, Val, Dim, Idx>;
     BufWrapper bufAHost(bufAHost1d.data(), devHost, extentA);
     BufWrapper bufBHost(bufBHost1d.data(), devHost, extentB);
 
     // Allocate C and set it to zero.
-    auto bufCHost(alpaka::mem::buf::alloc<Val, Idx>(devHost, extentC));
-    alpaka::mem::view::set(queueHost, bufCHost, 0u, extentC);
+    auto bufCHost(alpaka::allocBuf<Val, Idx>(devHost, extentC));
+    alpaka::memset(queueHost, bufCHost, 0u, extentC);
 
     // Allocate the buffers on the accelerator.
-    auto bufAAcc(alpaka::mem::buf::alloc<Val, Idx>(devAcc, extentA));
-    auto bufBAcc(alpaka::mem::buf::alloc<Val, Idx>(devAcc, extentB));
-    auto bufCAcc(alpaka::mem::buf::alloc<Val, Idx>(devAcc, extentC));
+    auto bufAAcc(alpaka::allocBuf<Val, Idx>(devAcc, extentA));
+    auto bufBAcc(alpaka::allocBuf<Val, Idx>(devAcc, extentB));
+    auto bufCAcc(alpaka::allocBuf<Val, Idx>(devAcc, extentC));
 
     // Copy Host -> Acc.
-    alpaka::mem::view::copy(queueAcc, bufAAcc, bufAHost, extentA);
-    alpaka::mem::view::copy(queueAcc, bufBAcc, bufBHost, extentB);
-    alpaka::wait::wait(queueHost);
-    alpaka::mem::view::copy(queueAcc, bufCAcc, bufCHost, extentC);
+    alpaka::memcpy(queueAcc, bufAAcc, bufAHost, extentA);
+    alpaka::memcpy(queueAcc, bufBAcc, bufBHost, extentB);
+    alpaka::wait(queueHost);
+    alpaka::memcpy(queueAcc, bufCAcc, bufCHost, extentC);
 
     // Create the kernel execution task.
-    auto const taskKernel(alpaka::kernel::createTaskKernel<Acc>(
+    auto const taskKernel(alpaka::createTaskKernel<Acc>(
         workDiv,
         kernel,
         m,
         n,
         k,
         static_cast<Val>(1),
-        alpaka::mem::view::getPtrNative(bufAAcc),
-        static_cast<Idx>(alpaka::mem::view::getPitchBytes<1u>(bufAAcc) / sizeof(Val)),
-        alpaka::mem::view::getPtrNative(bufBAcc),
-        static_cast<Idx>(alpaka::mem::view::getPitchBytes<1u>(bufBAcc) / sizeof(Val)),
+        alpaka::getPtrNative(bufAAcc),
+        static_cast<Idx>(alpaka::getPitchBytes<1u>(bufAAcc) / sizeof(Val)),
+        alpaka::getPtrNative(bufBAcc),
+        static_cast<Idx>(alpaka::getPitchBytes<1u>(bufBAcc) / sizeof(Val)),
         static_cast<Val>(1),
-        alpaka::mem::view::getPtrNative(bufCAcc),
-        static_cast<Idx>(alpaka::mem::view::getPitchBytes<1u>(bufCAcc) / sizeof(Val))));
+        alpaka::getPtrNative(bufCAcc),
+        static_cast<Idx>(alpaka::getPitchBytes<1u>(bufCAcc) / sizeof(Val))));
 
     // Profile the kernel execution.
-    std::cout << "Execution time: "
-        << alpaka::test::integ::measureTaskRunTimeMs(
-            queueAcc,
-            taskKernel)
-        << " ms"
-        << std::endl;
+    std::cout << "Execution time: " << alpaka::test::integ::measureTaskRunTimeMs(queueAcc, taskKernel) << " ms"
+              << std::endl;
 
     // Copy back the result.
-    alpaka::mem::view::copy(queueAcc, bufCHost, bufCAcc, extentC);
+    alpaka::memcpy(queueAcc, bufCHost, bufCAcc, extentC);
 
     // Wait for the queue to finish the memory operation.
-    alpaka::wait::wait(queueAcc);
+    alpaka::wait(queueAcc);
 
     // Assert that the results are correct.
     // When multiplying square matrices filled with ones, the result of each cell is the size of the matrix.
     auto const correctResult(static_cast<Val>(k));
 
     bool resultCorrect(true);
-    auto const pHostData(alpaka::mem::view::getPtrNative(bufCHost));
-    for(Idx i(0u);
-        i < m * n;
-        ++i)
+    auto const pHostData(alpaka::getPtrNative(bufCHost));
+    for(Idx i(0u); i < m * n; ++i)
     {
-        auto const & val(pHostData[i]);
+        auto const& val(pHostData[i]);
         if(val != correctResult)
         {
             std::cerr << "C[" << i << "] == " << val << " != " << correctResult << std::endl;
diff --git a/thirdParty/cupla/alpaka/test/integ/separableCompilation/CMakeLists.txt b/thirdParty/cupla/alpaka/test/integ/separableCompilation/CMakeLists.txt
index c394d439b2..a2db94da1d 100644
--- a/thirdParty/cupla/alpaka/test/integ/separableCompilation/CMakeLists.txt
+++ b/thirdParty/cupla/alpaka/test/integ/separableCompilation/CMakeLists.txt
@@ -1,34 +1,35 @@
 #
-# Copyright 2014-2019 Benjamin Worpitz, Axel Huebl
+# Copyright 2014-2020 Benjamin Worpitz, Axel Huebl, Jan Stephan
 #
-# This file is part of Alpaka.
+# This file is part of alpaka.
 #
 # This Source Code Form is subject to the terms of the Mozilla Public
 # License, v. 2.0. If a copy of the MPL was not distributed with this
 # file, You can obtain one at http://mozilla.org/MPL/2.0/.
 #
 
-IF((NOT ALPAKA_ACC_GPU_CUDA_ENABLE AND NOT ALPAKA_ACC_GPU_HIP_ENABLE) OR (ALPAKA_ACC_GPU_CUDA_ENABLE AND ALPAKA_CUDA_NVCC_SEPARABLE_COMPILATION AND ALPAKA_CUDA_COMPILER MATCHES "nvcc"))
+if((NOT ALPAKA_ACC_GPU_CUDA_ENABLE AND NOT ALPAKA_ACC_GPU_HIP_ENABLE
+   AND NOT ALPAKA_ACC_ANY_BT_OMP5_ENABLE AND NOT ALPAKA_ACC_ANY_BT_OACC_ENABLE) OR
+   (ALPAKA_ACC_GPU_CUDA_ENABLE AND ALPAKA_CUDA_NVCC_SEPARABLE_COMPILATION AND ALPAKA_CUDA_COMPILER MATCHES "nvcc"))
 
-SET(_TARGET_NAME "separableCompilation")
+    set(_TARGET_NAME "separableCompilation")
 
-append_recursive_files_add_to_src_group("src/" "src/" "cpp" _FILES_SOURCE)
-append_recursive_files_add_to_src_group("include/" "include/" "hpp" _FILES_HEADER)
+    append_recursive_files_add_to_src_group("src/" "src/" "cpp" _FILES_SOURCE)
+    append_recursive_files_add_to_src_group("include/" "include/" "hpp" _FILES_HEADER)
 
-ALPAKA_ADD_EXECUTABLE(
-    ${_TARGET_NAME}
-    ${_FILES_SOURCE}
-    ${_FILES_HEADER})
-TARGET_INCLUDE_DIRECTORIES(
-    ${_TARGET_NAME}
-    PRIVATE "include"
-    PRIVATE ${Boost_INCLUDE_DIRS})
-TARGET_LINK_LIBRARIES(
-    ${_TARGET_NAME}
-    PRIVATE common)
+    alpaka_add_executable(
+        ${_TARGET_NAME}
+        ${_FILES_SOURCE}
+        ${_FILES_HEADER})
+    target_include_directories(
+        ${_TARGET_NAME}
+        PRIVATE "include")
+    target_link_libraries(
+        ${_TARGET_NAME}
+        PRIVATE common)
 
-SET_TARGET_PROPERTIES(${_TARGET_NAME} PROPERTIES FOLDER "test/integ")
+    set_target_properties(${_TARGET_NAME} PROPERTIES FOLDER "test/integ")
 
-ADD_TEST(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME} ${_ALPAKA_TEST_OPTIONS})
+    add_test(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME} ${_ALPAKA_TEST_OPTIONS})
 
-ENDIF()
+endif()
diff --git a/thirdParty/cupla/alpaka/test/integ/separableCompilation/include/mysqrt.hpp b/thirdParty/cupla/alpaka/test/integ/separableCompilation/include/mysqrt.hpp
index cc9c83838e..0c32c3fdb9 100644
--- a/thirdParty/cupla/alpaka/test/integ/separableCompilation/include/mysqrt.hpp
+++ b/thirdParty/cupla/alpaka/test/integ/separableCompilation/include/mysqrt.hpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Benjamin Worpitz
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
diff --git a/thirdParty/cupla/alpaka/test/integ/separableCompilation/src/main.cpp b/thirdParty/cupla/alpaka/test/integ/separableCompilation/src/main.cpp
index 317f4694b1..ac7731271c 100644
--- a/thirdParty/cupla/alpaka/test/integ/separableCompilation/src/main.cpp
+++ b/thirdParty/cupla/alpaka/test/integ/separableCompilation/src/main.cpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Benjamin Worpitz
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -34,34 +34,28 @@ class SqrtKernel
     //! \param C The destination vector.
     //! \param numElements The number of elements.
     ALPAKA_NO_HOST_ACC_WARNING
-    template<
-        typename TAcc,
-        typename TElem,
-        typename TIdx>
+    template<typename TAcc, typename TElem, typename TIdx>
     ALPAKA_FN_ACC auto operator()(
-        TAcc const & acc,
-        TElem const * const A,
-        TElem const * const B,
-        TElem * const C,
-        TIdx const & numElements) const
-    -> void
+        TAcc const& acc,
+        TElem const* const A,
+        TElem const* const B,
+        TElem* const C,
+        TIdx const& numElements) const -> void
     {
-        static_assert(
-            alpaka::dim::Dim<TAcc>::value == 1,
-            "The VectorAddKernel expects 1-dimensional indices!");
+        static_assert(alpaka::Dim<TAcc>::value == 1, "The VectorAddKernel expects 1-dimensional indices!");
 
-        auto const gridThreadIdx(alpaka::idx::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0u]);
-        auto const threadElemExtent(alpaka::workdiv::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[0u]);
+        auto const gridThreadIdx(alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0u]);
+        auto const threadElemExtent(alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[0u]);
         auto const threadFirstElemIdx(gridThreadIdx * threadElemExtent);
 
         if(threadFirstElemIdx < numElements)
         {
             // Calculate the number of elements to compute in this thread.
             // The result is uniform for all but the last thread.
-            auto const threadLastElemIdx(threadFirstElemIdx+threadElemExtent);
+            auto const threadLastElemIdx(threadFirstElemIdx + threadElemExtent);
             auto const threadLastElemIdxClipped((numElements > threadLastElemIdx) ? threadLastElemIdx : numElements);
 
-            for(TIdx i(threadFirstElemIdx); i<threadLastElemIdxClipped; ++i)
+            for(TIdx i(threadFirstElemIdx); i < threadLastElemIdxClipped; ++i)
             {
                 C[i] = mysqrt(A[i]) + mysqrt(B[i]);
             }
@@ -69,22 +63,20 @@ class SqrtKernel
     }
 };
 
-using TestAccs = alpaka::test::acc::EnabledAccs<
-    alpaka::dim::DimInt<1u>,
-    std::size_t>;
+using TestAccs = alpaka::test::EnabledAccs<alpaka::DimInt<1u>, std::size_t>;
 
-TEMPLATE_LIST_TEST_CASE( "separableCompilation", "[separableCompilation]", TestAccs)
+TEMPLATE_LIST_TEST_CASE("separableCompilation", "[separableCompilation]", TestAccs)
 {
     using Acc = TestType;
-    using Idx = alpaka::idx::Idx<Acc>;
+    using Idx = alpaka::Idx<Acc>;
 
     using Val = double;
 
-    using DevAcc = alpaka::dev::Dev<Acc>;
-    using PltfAcc = alpaka::pltf::Pltf<DevAcc>;
-    using QueueAcc = alpaka::test::queue::DefaultQueue<alpaka::dev::Dev<Acc>>;
-    using PltfHost = alpaka::pltf::PltfCpu;
-    using DevHost = alpaka::dev::Dev<PltfHost>;
+    using DevAcc = alpaka::Dev<Acc>;
+    using PltfAcc = alpaka::Pltf<DevAcc>;
+    using QueueAcc = alpaka::test::DefaultQueue<alpaka::Dev<Acc>>;
+    using PltfHost = alpaka::PltfCpu;
+    using DevHost = alpaka::Dev<PltfHost>;
 
     Idx const numElements(32);
 
@@ -92,88 +84,76 @@ TEMPLATE_LIST_TEST_CASE( "separableCompilation", "[separableCompilation]", TestA
     SqrtKernel kernel;
 
     // Get the host device.
-    DevHost const devHost(
-        alpaka::pltf::getDevByIdx<PltfHost>(0u));
+    DevHost const devHost(alpaka::getDevByIdx<PltfHost>(0u));
 
     // Select a device to execute on.
-    DevAcc const devAcc(
-        alpaka::pltf::getDevByIdx<PltfAcc>(0));
+    DevAcc const devAcc(alpaka::getDevByIdx<PltfAcc>(0));
 
     // Get a queue on this device.
     QueueAcc queueAcc(devAcc);
 
     // The data extent.
-    alpaka::vec::Vec<alpaka::dim::DimInt<1u>, Idx> const extent(
-        numElements);
+    alpaka::Vec<alpaka::DimInt<1u>, Idx> const extent(numElements);
 
     // Let alpaka calculate good block and grid sizes given our full problem extent.
-    alpaka::workdiv::WorkDivMembers<alpaka::dim::DimInt<1u>, Idx> const workDiv(
-        alpaka::workdiv::getValidWorkDiv<Acc>(
-            devAcc,
-            extent,
-            static_cast<Idx>(3u),
-            false,
-            alpaka::workdiv::GridBlockExtentSubDivRestrictions::Unrestricted));
-
-    std::cout
-        << typeid(kernel).name() << "("
-        << "accelerator: " << alpaka::acc::getAccName<Acc>()
-        << ", workDiv: " << workDiv
-        << ", numElements:" << numElements
-        << ")" << std::endl;
+    alpaka::WorkDivMembers<alpaka::DimInt<1u>, Idx> const workDiv(alpaka::getValidWorkDiv<Acc>(
+        devAcc,
+        extent,
+        static_cast<Idx>(3u),
+        false,
+        alpaka::GridBlockExtentSubDivRestrictions::Unrestricted));
+
+    std::cout << typeid(kernel).name() << "("
+              << "accelerator: " << alpaka::getAccName<Acc>() << ", workDiv: " << workDiv
+              << ", numElements:" << numElements << ")" << std::endl;
 
     // Allocate host memory buffers.
-    auto memBufHostA(alpaka::mem::buf::alloc<Val, Idx>(devHost, extent));
-    auto memBufHostB(alpaka::mem::buf::alloc<Val, Idx>(devHost, extent));
-    auto memBufHostC(alpaka::mem::buf::alloc<Val, Idx>(devHost, extent));
+    auto memBufHostA(alpaka::allocBuf<Val, Idx>(devHost, extent));
+    auto memBufHostB(alpaka::allocBuf<Val, Idx>(devHost, extent));
+    auto memBufHostC(alpaka::allocBuf<Val, Idx>(devHost, extent));
 
     // Initialize the host input vectors
-    for (Idx i(0); i < numElements; ++i)
+    for(Idx i(0); i < numElements; ++i)
     {
-        alpaka::mem::view::getPtrNative(memBufHostA)[i] = static_cast<Val>(rand()) / static_cast<Val>(RAND_MAX);
-        alpaka::mem::view::getPtrNative(memBufHostB)[i] = static_cast<Val>(rand()) / static_cast<Val>(RAND_MAX);
+        alpaka::getPtrNative(memBufHostA)[i] = static_cast<Val>(rand()) / static_cast<Val>(RAND_MAX);
+        alpaka::getPtrNative(memBufHostB)[i] = static_cast<Val>(rand()) / static_cast<Val>(RAND_MAX);
     }
 
     // Allocate the buffers on the accelerator.
-    auto memBufAccA(alpaka::mem::buf::alloc<Val, Idx>(devAcc, extent));
-    auto memBufAccB(alpaka::mem::buf::alloc<Val, Idx>(devAcc, extent));
-    auto memBufAccC(alpaka::mem::buf::alloc<Val, Idx>(devAcc, extent));
+    auto memBufAccA(alpaka::allocBuf<Val, Idx>(devAcc, extent));
+    auto memBufAccB(alpaka::allocBuf<Val, Idx>(devAcc, extent));
+    auto memBufAccC(alpaka::allocBuf<Val, Idx>(devAcc, extent));
 
     // Copy Host -> Acc.
-    alpaka::mem::view::copy(queueAcc, memBufAccA, memBufHostA, extent);
-    alpaka::mem::view::copy(queueAcc, memBufAccB, memBufHostB, extent);
+    alpaka::memcpy(queueAcc, memBufAccA, memBufHostA, extent);
+    alpaka::memcpy(queueAcc, memBufAccB, memBufHostB, extent);
 
     // Create the executor task.
-    auto const taskKernel(alpaka::kernel::createTaskKernel<Acc>(
+    auto const taskKernel(alpaka::createTaskKernel<Acc>(
         workDiv,
         kernel,
-        alpaka::mem::view::getPtrNative(memBufAccA),
-        alpaka::mem::view::getPtrNative(memBufAccB),
-        alpaka::mem::view::getPtrNative(memBufAccC),
+        alpaka::getPtrNative(memBufAccA),
+        alpaka::getPtrNative(memBufAccB),
+        alpaka::getPtrNative(memBufAccC),
         numElements));
 
     // Profile the kernel execution.
-    std::cout << "Execution time: "
-        << alpaka::test::integ::measureTaskRunTimeMs(
-            queueAcc,
-            taskKernel)
-        << " ms"
-        << std::endl;
+    std::cout << "Execution time: " << alpaka::test::integ::measureTaskRunTimeMs(queueAcc, taskKernel) << " ms"
+              << std::endl;
 
     // Copy back the result.
-    alpaka::mem::view::copy(queueAcc, memBufHostC, memBufAccC, extent);
-    alpaka::wait::wait(queueAcc);
+    alpaka::memcpy(queueAcc, memBufHostC, memBufAccC, extent);
+    alpaka::wait(queueAcc);
 
     bool resultCorrect(true);
-    auto const pHostData(alpaka::mem::view::getPtrNative(memBufHostC));
-    for(Idx i(0u);
-        i < numElements;
-        ++i)
+    auto const pHostData(alpaka::getPtrNative(memBufHostC));
+    for(Idx i(0u); i < numElements; ++i)
     {
-        auto const & val(pHostData[i]);
-        auto const correctResult(std::sqrt(alpaka::mem::view::getPtrNative(memBufHostA)[i]) + std::sqrt(alpaka::mem::view::getPtrNative(memBufHostB)[i]));
+        auto const& val(pHostData[i]);
+        auto const correctResult(
+            std::sqrt(alpaka::getPtrNative(memBufHostA)[i]) + std::sqrt(alpaka::getPtrNative(memBufHostB)[i]));
         auto const absDiff = (val - correctResult);
-        if( absDiff > std::numeric_limits<Val>::epsilon() )
+        if(absDiff > std::numeric_limits<Val>::epsilon())
         {
             std::cout << "C[" << i << "] == " << val << " != " << correctResult << std::endl;
             resultCorrect = false;
diff --git a/thirdParty/cupla/alpaka/test/integ/separableCompilation/src/mysqrt.cpp b/thirdParty/cupla/alpaka/test/integ/separableCompilation/src/mysqrt.cpp
index a5ea90a625..4a6d240b3c 100644
--- a/thirdParty/cupla/alpaka/test/integ/separableCompilation/src/mysqrt.cpp
+++ b/thirdParty/cupla/alpaka/test/integ/separableCompilation/src/mysqrt.cpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Benjamin Worpitz
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -10,21 +10,23 @@
 #include "mysqrt.hpp"
 
 // a square root calculation using simple operations
-ALPAKA_FN_HOST_ACC auto mysqrt(double x)
--> double
+ALPAKA_FN_HOST_ACC auto mysqrt(double x) -> double
 {
-  if (x <= 0) {
-    return 0.0;
-  }
+    if(x <= 0)
+    {
+        return 0.0;
+    }
 
-  double result = x;
+    double result = x;
 
-  for (int i = 0; i < 100; ++i) {
-    if (result <= 0) {
-      result = 0.1;
+    for(int i = 0; i < 100; ++i)
+    {
+        if(result <= 0)
+        {
+            result = 0.1;
+        }
+        double delta = x - (result * result);
+        result = result + 0.5 * delta / result;
     }
-    double delta = x - (result * result);
-    result = result + 0.5 * delta / result;
-  }
-  return result;
+    return result;
 }
diff --git a/thirdParty/cupla/alpaka/test/integ/sharedMem/CMakeLists.txt b/thirdParty/cupla/alpaka/test/integ/sharedMem/CMakeLists.txt
index 5f083f5210..4a5d5eb7bb 100644
--- a/thirdParty/cupla/alpaka/test/integ/sharedMem/CMakeLists.txt
+++ b/thirdParty/cupla/alpaka/test/integ/sharedMem/CMakeLists.txt
@@ -1,27 +1,24 @@
 #
-# Copyright 2014-2019 Benjamin Worpitz, Axel Huebl
+# Copyright 2014-2020 Benjamin Worpitz, Axel Huebl, Jan Stephan
 #
-# This file is part of Alpaka.
+# This file is part of alpaka.
 #
 # This Source Code Form is subject to the terms of the Mozilla Public
 # License, v. 2.0. If a copy of the MPL was not distributed with this
 # file, You can obtain one at http://mozilla.org/MPL/2.0/.
 #
 
-SET(_TARGET_NAME "sharedMem")
+set(_TARGET_NAME "sharedMem")
 
 append_recursive_files_add_to_src_group("src/" "src/" "cpp" _FILES_SOURCE)
 
-ALPAKA_ADD_EXECUTABLE(
+alpaka_add_executable(
     ${_TARGET_NAME}
     ${_FILES_SOURCE})
-TARGET_INCLUDE_DIRECTORIES(
-    ${_TARGET_NAME}
-    PRIVATE ${Boost_INCLUDE_DIRS})
-TARGET_LINK_LIBRARIES(
+target_link_libraries(
     ${_TARGET_NAME}
     PRIVATE common)
 
-SET_TARGET_PROPERTIES(${_TARGET_NAME} PROPERTIES FOLDER "test/integ")
+set_target_properties(${_TARGET_NAME} PROPERTIES FOLDER "test/integ")
 
-ADD_TEST(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME} ${_ALPAKA_TEST_OPTIONS})
+add_test(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME} ${_ALPAKA_TEST_OPTIONS})
diff --git a/thirdParty/cupla/alpaka/test/integ/sharedMem/src/sharedMem.cpp b/thirdParty/cupla/alpaka/test/integ/sharedMem/src/sharedMem.cpp
index e1e49b59a1..75050bf20e 100644
--- a/thirdParty/cupla/alpaka/test/integ/sharedMem/src/sharedMem.cpp
+++ b/thirdParty/cupla/alpaka/test/integ/sharedMem/src/sharedMem.cpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Axel Huebl, Benjamin Worpitz, Matthias Werner, René Widera
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -8,7 +8,6 @@
  */
 
 #include <alpaka/alpaka.hpp>
-
 #include <alpaka/test/MeasureKernelRunTime.hpp>
 #include <alpaka/test/acc/TestAccs.hpp>
 #include <alpaka/test/queue/Queue.hpp>
@@ -20,42 +19,34 @@
 #include <vector>
 
 //#############################################################################
-//! A kernel using atomicOp, syncBlockThreads, getMem, getIdx, getWorkDiv and global memory to compute a (useless) result.
-//! \tparam TnumUselessWork The number of useless calculations done in each kernel execution.
-template<
-    typename TnumUselessWork,
-    typename Val>
+//! Computes a (useless) result via atomicOp, syncBlockThreads, getDynSharedMem, getIdx, getWorkDiv and global memory.
+//! \tparam TnumUselessWork The number of useless calculations done in each kernel execution.
+template<typename TnumUselessWork, typename Val>
 class SharedMemKernel
 {
 public:
     //-----------------------------------------------------------------------------
     ALPAKA_NO_HOST_ACC_WARNING
-    template<
-        typename TAcc>
-    ALPAKA_FN_ACC auto operator()(
-        TAcc const & acc,
-        Val * const puiBlockRetVals) const
-    -> void
+    template<typename TAcc>
+    ALPAKA_FN_ACC auto operator()(TAcc const& acc, Val* const puiBlockRetVals) const -> void
     {
-        using Idx = alpaka::idx::Idx<TAcc>;
+        using Idx = alpaka::Idx<TAcc>;
 
-        static_assert(
-            alpaka::dim::Dim<TAcc>::value == 1,
-            "The SharedMemKernel expects 1-dimensional indices!");
+        static_assert(alpaka::Dim<TAcc>::value == 1, "The SharedMemKernel expects 1-dimensional indices!");
 
         // The number of threads in this block.
-        Idx const blockThreadCount(alpaka::workdiv::getWorkDiv<alpaka::Block, alpaka::Threads>(acc)[0u]);
+        Idx const blockThreadCount(alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc)[0u]);
 
         // Get the dynamically allocated shared memory.
-        Val * const pBlockShared(alpaka::block::shared::dyn::getMem<Val>(acc));
+        Val* const pBlockShared(alpaka::getDynSharedMem<Val>(acc));
 
         // Calculate linearized index of the thread in the block.
-        Idx const blockThreadIdx1d(alpaka::idx::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u]);
+        Idx const blockThreadIdx1d(alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u]);
 
 
         // Fill the shared block with the thread ids [1+X, 2+X, 3+X, ..., #Threads+X].
-        auto sum1 = static_cast<Val>(blockThreadIdx1d+1);
-        for(Val i(0); i<static_cast<Val>(TnumUselessWork::value); ++i)
+        auto sum1 = static_cast<Val>(blockThreadIdx1d + 1);
+        for(Val i(0); i < static_cast<Val>(TnumUselessWork::value); ++i)
         {
             sum1 += i;
         }
@@ -63,35 +54,35 @@ class SharedMemKernel
 
 
         // Synchronize all threads because now we are writing to the memory again but inverse.
-        alpaka::block::sync::syncBlockThreads(acc);
+        alpaka::syncBlockThreads(acc);
 
         // Do something useless.
         auto sum2 = static_cast<Val>(blockThreadIdx1d);
-        for(Val i(0); i<static_cast<Val>(TnumUselessWork::value); ++i)
+        for(Val i(0); i < static_cast<Val>(TnumUselessWork::value); ++i)
         {
             sum2 -= i;
         }
         // Add the inverse so that every cell is filled with [#Threads, #Threads, ..., #Threads].
-        pBlockShared[(blockThreadCount-1)-blockThreadIdx1d] += sum2;
+        pBlockShared[(blockThreadCount - 1) - blockThreadIdx1d] += sum2;
 
 
         // Synchronize all threads again.
-        alpaka::block::sync::syncBlockThreads(acc);
+        alpaka::syncBlockThreads(acc);
 
         // Now add up all the cells atomically and write the result to cell 0 of the shared memory.
         if(blockThreadIdx1d > 0)
         {
-            alpaka::atomic::atomicOp<alpaka::atomic::op::Add>(acc, &pBlockShared[0], pBlockShared[blockThreadIdx1d]);
+            alpaka::atomicAdd(acc, &pBlockShared[0], pBlockShared[blockThreadIdx1d]);
         }
 
 
-        alpaka::block::sync::syncBlockThreads(acc);
+        alpaka::syncBlockThreads(acc);
 
         // Only master writes result to global memory.
-        if(blockThreadIdx1d==0)
+        if(blockThreadIdx1d == 0)
         {
             // Calculate linearized block id.
-            Idx const gridBlockIdx(alpaka::idx::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[0u]);
+            Idx const gridBlockIdx(alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[0u]);
 
             puiBlockRetVals[gridBlockIdx] = pBlockShared[0];
         }
@@ -100,126 +91,97 @@ class SharedMemKernel
 
 namespace alpaka
 {
-    namespace kernel
+    namespace traits
     {
-        namespace traits
+        //#############################################################################
+        //! The trait for getting the size of the block shared dynamic memory for a kernel.
+        template<typename TnumUselessWork, typename Val, typename TAcc>
+        struct BlockSharedMemDynSizeBytes<SharedMemKernel<TnumUselessWork, Val>, TAcc>
         {
-            //#############################################################################
-            //! The trait for getting the size of the block shared dynamic memory for a kernel.
-            template<
-                typename TnumUselessWork,
-                typename Val,
-                typename TAcc>
-            struct BlockSharedMemDynSizeBytes<
-                SharedMemKernel<TnumUselessWork, Val>,
-                TAcc>
+            //-----------------------------------------------------------------------------
+            //! \return The size of the shared memory allocated for a block.
+            template<typename TVec, typename... TArgs>
+            ALPAKA_FN_HOST_ACC static auto getBlockSharedMemDynSizeBytes(
+                SharedMemKernel<TnumUselessWork, Val> const& sharedMemKernel,
+                TVec const& blockThreadExtent,
+                TVec const& threadElemExtent,
+                TArgs&&...) -> std::size_t
             {
-                //-----------------------------------------------------------------------------
-                //! \return The size of the shared memory allocated for a block.
-                template<
-                    typename TVec,
-                    typename... TArgs>
-                ALPAKA_FN_HOST_ACC static auto getBlockSharedMemDynSizeBytes(
-                    SharedMemKernel<TnumUselessWork, Val> const & sharedMemKernel,
-                    TVec const & blockThreadExtent,
-                    TVec const & threadElemExtent,
-                    TArgs && ...)
-                -> idx::Idx<TAcc>
-                {
-                    alpaka::ignore_unused(sharedMemKernel);
-                    return blockThreadExtent.prod() * threadElemExtent.prod() * static_cast<idx::Idx<TAcc>>(sizeof(Val));
-                }
-            };
-        }
-    }
-}
+                alpaka::ignore_unused(sharedMemKernel);
+                return static_cast<std::size_t>(blockThreadExtent.prod() * threadElemExtent.prod()) * sizeof(Val);
+            }
+        };
+    } // namespace traits
+} // namespace alpaka
 
-using TestAccs = alpaka::test::acc::EnabledAccs<
-    alpaka::dim::DimInt<1u>,
-    std::uint32_t>;
+using TestAccs = alpaka::test::EnabledAccs<alpaka::DimInt<1u>, std::uint32_t>;
 
-TEMPLATE_LIST_TEST_CASE( "sharedMem", "[sharedMem]", TestAccs)
+TEMPLATE_LIST_TEST_CASE("sharedMem", "[sharedMem]", TestAccs)
 {
     using Acc = TestType;
-    using Dim = alpaka::dim::Dim<Acc>;
-    using Idx = alpaka::idx::Idx<Acc>;
+    using Dim = alpaka::Dim<Acc>;
+    using Idx = alpaka::Idx<Acc>;
 
-    Idx const numElements = 1u<<16u;
+    Idx const numElements = 1u << 16u;
 
     using Val = std::int32_t;
     using TnumUselessWork = std::integral_constant<Idx, 100>;
 
-    using DevAcc = alpaka::dev::Dev<Acc>;
-    using PltfAcc = alpaka::pltf::Pltf<DevAcc>;
-    using QueueAcc = alpaka::test::queue::DefaultQueue<DevAcc>;
+    using DevAcc = alpaka::Dev<Acc>;
+    using PltfAcc = alpaka::Pltf<DevAcc>;
+    using QueueAcc = alpaka::test::DefaultQueue<DevAcc>;
 
 
     // Create the kernel function object.
     SharedMemKernel<TnumUselessWork, Val> kernel;
 
     // Select a device to execute on.
-    auto const devAcc(
-        alpaka::pltf::getDevByIdx<PltfAcc>(0u));
+    auto const devAcc(alpaka::getDevByIdx<PltfAcc>(0u));
 
     // Get a queue on this device.
-    QueueAcc queue(
-        devAcc);
+    QueueAcc queue(devAcc);
 
     // Set the grid blocks extent.
-    alpaka::workdiv::WorkDivMembers<Dim, Idx> const workDiv(
-        alpaka::workdiv::getValidWorkDiv<Acc>(
-            devAcc,
-            numElements,
-            static_cast<Idx>(1u),
-            false,
-            alpaka::workdiv::GridBlockExtentSubDivRestrictions::Unrestricted));
-
-    std::cout
-        << "SharedMemKernel("
-        << " accelerator: " << alpaka::acc::getAccName<Acc>()
-        << ", kernel: " << typeid(kernel).name()
-        << ", workDiv: " << workDiv
-        << ")" << std::endl;
-
-    Idx const gridBlocksCount(
-        alpaka::workdiv::getWorkDiv<alpaka::Grid, alpaka::Blocks>(workDiv)[0u]);
-    Idx const blockThreadCount(
-        alpaka::workdiv::getWorkDiv<alpaka::Block, alpaka::Threads>(workDiv)[0u]);
+    alpaka::WorkDivMembers<Dim, Idx> const workDiv(alpaka::getValidWorkDiv<Acc>(
+        devAcc,
+        numElements,
+        static_cast<Idx>(1u),
+        false,
+        alpaka::GridBlockExtentSubDivRestrictions::Unrestricted));
+
+    std::cout << "SharedMemKernel("
+              << " accelerator: " << alpaka::getAccName<Acc>() << ", kernel: " << typeid(kernel).name()
+              << ", workDiv: " << workDiv << ")" << std::endl;
+
+    Idx const gridBlocksCount(alpaka::getWorkDiv<alpaka::Grid, alpaka::Blocks>(workDiv)[0u]);
+    Idx const blockThreadCount(alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(workDiv)[0u]);
 
     // An array for the return values calculated by the blocks.
     std::vector<Val> blockRetVals(static_cast<std::size_t>(gridBlocksCount));
 
     // Allocate accelerator buffers and copy.
     Idx const resultElemCount(gridBlocksCount);
-    auto blockRetValsAcc(alpaka::mem::buf::alloc<Val, Idx>(devAcc, resultElemCount));
-    alpaka::mem::view::copy(queue, blockRetValsAcc, blockRetVals, resultElemCount);
+    auto blockRetValsAcc(alpaka::allocBuf<Val, Idx>(devAcc, resultElemCount));
+    alpaka::memcpy(queue, blockRetValsAcc, blockRetVals, resultElemCount);
 
     // Create the kernel execution task.
-    auto const taskKernel(alpaka::kernel::createTaskKernel<Acc>(
-        workDiv,
-        kernel,
-        alpaka::mem::view::getPtrNative(blockRetValsAcc)));
+    auto const taskKernel(alpaka::createTaskKernel<Acc>(workDiv, kernel, alpaka::getPtrNative(blockRetValsAcc)));
 
     // Profile the kernel execution.
-    std::cout << "Execution time: "
-        << alpaka::test::integ::measureTaskRunTimeMs(
-            queue,
-            taskKernel)
-        << " ms"
-        << std::endl;
+    std::cout << "Execution time: " << alpaka::test::integ::measureTaskRunTimeMs(queue, taskKernel) << " ms"
+              << std::endl;
 
     // Copy back the result.
-    alpaka::mem::view::copy(queue, blockRetVals, blockRetValsAcc, resultElemCount);
+    alpaka::memcpy(queue, blockRetVals, blockRetValsAcc, resultElemCount);
 
     // Wait for the queue to finish the memory operation.
-    alpaka::wait::wait(queue);
+    alpaka::wait(queue);
 
     // Assert that the results are correct.
-    Val const correctResult(
-        static_cast<Val>(blockThreadCount*blockThreadCount));
+    Val const correctResult(static_cast<Val>(blockThreadCount * blockThreadCount));
 
     bool resultCorrect(true);
-    for(Idx i(0); i<gridBlocksCount; ++i)
+    for(Idx i(0); i < gridBlocksCount; ++i)
     {
         auto const val(blockRetVals[static_cast<std::size_t>(i)]);
         if(val != correctResult)
diff --git a/thirdParty/cupla/alpaka/test/unit/CMakeLists.txt b/thirdParty/cupla/alpaka/test/unit/CMakeLists.txt
index 02f21adda5..fbaceee4d2 100644
--- a/thirdParty/cupla/alpaka/test/unit/CMakeLists.txt
+++ b/thirdParty/cupla/alpaka/test/unit/CMakeLists.txt
@@ -1,7 +1,7 @@
 #
-# Copyright 2015-2019 Benjamin Worpitz
+# Copyright 2015-2020 Benjamin Worpitz, Jan Stephan
 #
-# This file is part of Alpaka.
+# This file is part of alpaka.
 #
 # This Source Code Form is subject to the terms of the Mozilla Public
 # License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -12,26 +12,31 @@
 # Required CMake version.
 ################################################################################
 
-CMAKE_MINIMUM_REQUIRED(VERSION 3.11.0)
+cmake_minimum_required(VERSION 3.15)
 
 ################################################################################
 # Add subdirectories.
 ################################################################################
 
-ADD_SUBDIRECTORY("acc/")
-ADD_SUBDIRECTORY("atomic/")
-ADD_SUBDIRECTORY("block/shared/")
-ADD_SUBDIRECTORY("block/sync/")
-ADD_SUBDIRECTORY("core/")
-ADD_SUBDIRECTORY("event/")
-ADD_SUBDIRECTORY("idx/")
-ADD_SUBDIRECTORY("kernel/")
-ADD_SUBDIRECTORY("math/sincos/")
-ADD_SUBDIRECTORY("mem/buf/")
-ADD_SUBDIRECTORY("mem/view/")
-ADD_SUBDIRECTORY("mem/p2p/")
-ADD_SUBDIRECTORY("meta/")
-ADD_SUBDIRECTORY("queue/")
-ADD_SUBDIRECTORY("rand/")
-ADD_SUBDIRECTORY("time/")
-ADD_SUBDIRECTORY("vec/")
+add_subdirectory("acc/")
+add_subdirectory("atomic/")
+add_subdirectory("block/shared/")
+add_subdirectory("block/sync/")
+add_subdirectory("core/")
+add_subdirectory("dev/")
+add_subdirectory("event/")
+add_subdirectory("idx/")
+add_subdirectory("intrinsic/")
+add_subdirectory("kernel/")
+add_subdirectory("math/")
+add_subdirectory("mem/buf/")
+add_subdirectory("mem/copy/")
+add_subdirectory("mem/view/")
+add_subdirectory("mem/p2p/")
+add_subdirectory("meta/")
+add_subdirectory("queue/")
+add_subdirectory("rand/")
+add_subdirectory("time/")
+add_subdirectory("vec/")
+add_subdirectory("warp/")
+add_subdirectory("workDiv/")
diff --git a/thirdParty/cupla/alpaka/test/unit/acc/CMakeLists.txt b/thirdParty/cupla/alpaka/test/unit/acc/CMakeLists.txt
index b913847dc0..1811ae105a 100644
--- a/thirdParty/cupla/alpaka/test/unit/acc/CMakeLists.txt
+++ b/thirdParty/cupla/alpaka/test/unit/acc/CMakeLists.txt
@@ -1,27 +1,24 @@
 #
-# Copyright 2014-2019 Benjamin Worpitz, Axel Huebl
+# Copyright 2014-2020 Benjamin Worpitz, Axel Huebl, Jan Stephan
 #
-# This file is part of Alpaka.
+# This file is part of alpaka.
 #
 # This Source Code Form is subject to the terms of the Mozilla Public
 # License, v. 2.0. If a copy of the MPL was not distributed with this
 # file, You can obtain one at http://mozilla.org/MPL/2.0/.
 #
 
-SET(_TARGET_NAME "acc")
+set(_TARGET_NAME "acc")
 
 append_recursive_files_add_to_src_group("src/" "src/" "cpp" _FILES_SOURCE)
 
-ALPAKA_ADD_EXECUTABLE(
+alpaka_add_executable(
     ${_TARGET_NAME}
     ${_FILES_SOURCE})
-TARGET_INCLUDE_DIRECTORIES(
-    ${_TARGET_NAME}
-    PRIVATE ${Boost_INCLUDE_DIRS})
-TARGET_LINK_LIBRARIES(
+target_link_libraries(
     ${_TARGET_NAME}
     PRIVATE common)
 
-SET_TARGET_PROPERTIES(${_TARGET_NAME} PROPERTIES FOLDER "test/unit")
+set_target_properties(${_TARGET_NAME} PROPERTIES FOLDER "test/unit")
 
-ADD_TEST(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME} ${_ALPAKA_TEST_OPTIONS})
+add_test(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME} ${_ALPAKA_TEST_OPTIONS})
diff --git a/thirdParty/cupla/alpaka/test/unit/acc/src/AccDevPropsTest.cpp b/thirdParty/cupla/alpaka/test/unit/acc/src/AccDevPropsTest.cpp
new file mode 100644
index 0000000000..1e1fec7cf6
--- /dev/null
+++ b/thirdParty/cupla/alpaka/test/unit/acc/src/AccDevPropsTest.cpp
@@ -0,0 +1,35 @@
+/* Copyright 2020 Sergei Bastrakov
+ *
+ * This file is part of alpaka.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#include <alpaka/acc/AccDevProps.hpp>
+#include <alpaka/acc/Traits.hpp>
+#include <alpaka/test/acc/TestAccs.hpp>
+
+#include <catch2/catch.hpp>
+
+//-----------------------------------------------------------------------------
+TEMPLATE_LIST_TEST_CASE("getAccDevProps", "[acc]", alpaka::test::TestAccs)
+{
+    using Acc = TestType;
+    using Dev = alpaka::Dev<Acc>;
+    using Pltf = alpaka::Pltf<Dev>;
+    Dev const dev(alpaka::getDevByIdx<Pltf>(0u));
+    auto const devProps = alpaka::getAccDevProps<Acc>(dev);
+
+    REQUIRE(devProps.m_gridBlockExtentMax.prod() > 0);
+    // Note: this causes signed overflow for some configurations,
+    // will be fixed separately
+    // REQUIRE(devProps.m_blockThreadExtentMax.prod() > 0);
+    REQUIRE(devProps.m_threadElemExtentMax.prod() > 0);
+    REQUIRE(devProps.m_gridBlockCountMax > 0);
+    REQUIRE(devProps.m_blockThreadCountMax > 0);
+    REQUIRE(devProps.m_threadElemCountMax > 0);
+    REQUIRE(devProps.m_multiProcessorCount > 0);
+    REQUIRE(devProps.m_sharedMemSizeBytes > 0);
+}
diff --git a/thirdParty/cupla/alpaka/test/unit/acc/src/AccNameTest.cpp b/thirdParty/cupla/alpaka/test/unit/acc/src/AccNameTest.cpp
index 31e139f31c..2212623f25 100644
--- a/thirdParty/cupla/alpaka/test/unit/acc/src/AccNameTest.cpp
+++ b/thirdParty/cupla/alpaka/test/unit/acc/src/AccNameTest.cpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Axel Huebl, Benjamin Worpitz
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -8,7 +8,6 @@
  */
 
 #include <alpaka/acc/Traits.hpp>
-
 #include <alpaka/test/acc/TestAccs.hpp>
 
 #include <catch2/catch.hpp>
@@ -16,7 +15,7 @@
 #include <iostream>
 
 //-----------------------------------------------------------------------------
-TEMPLATE_LIST_TEST_CASE( "getAccName", "[acc]", alpaka::test::acc::TestAccs)
+TEMPLATE_LIST_TEST_CASE("getAccName", "[acc]", alpaka::test::TestAccs)
 {
-    std::cout << alpaka::acc::getAccName<TestType>() << std::endl;
+    std::cout << alpaka::getAccName<TestType>() << std::endl;
 }
diff --git a/thirdParty/cupla/alpaka/test/unit/atomic/CMakeLists.txt b/thirdParty/cupla/alpaka/test/unit/atomic/CMakeLists.txt
index d9b4717f1c..dc60a1faa5 100644
--- a/thirdParty/cupla/alpaka/test/unit/atomic/CMakeLists.txt
+++ b/thirdParty/cupla/alpaka/test/unit/atomic/CMakeLists.txt
@@ -1,27 +1,25 @@
 #
-# Copyright 2016-2019 Benjamin Worpitz, Axel Huebl
+# Copyright 2016-2020 Benjamin Worpitz, Axel Huebl, Jan Stephan
 #
-# This file is part of Alpaka.
+# This file is part of alpaka.
 #
 # This Source Code Form is subject to the terms of the Mozilla Public
 # License, v. 2.0. If a copy of the MPL was not distributed with this
 # file, You can obtain one at http://mozilla.org/MPL/2.0/.
 #
 
-SET(_TARGET_NAME "atomic")
+set(_TARGET_NAME "atomic")
 
 append_recursive_files_add_to_src_group("src/" "src/" "cpp" _FILES_SOURCE)
 
-ALPAKA_ADD_EXECUTABLE(
+alpaka_add_executable(
     ${_TARGET_NAME}
     ${_FILES_SOURCE})
-TARGET_INCLUDE_DIRECTORIES(
-    ${_TARGET_NAME}
-    PRIVATE ${Boost_INCLUDE_DIRS})
-TARGET_LINK_LIBRARIES(
+target_link_libraries(
     ${_TARGET_NAME}
     PRIVATE common)
 
-SET_TARGET_PROPERTIES(${_TARGET_NAME} PROPERTIES FOLDER "test/unit")
+set_target_properties(${_TARGET_NAME} PROPERTIES FOLDER "test/unit")
+target_compile_definitions(${_TARGET_NAME} PRIVATE "-DTEST_UNIT_ATOMIC")
 
-ADD_TEST(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME} ${_ALPAKA_TEST_OPTIONS})
+add_test(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME} ${_ALPAKA_TEST_OPTIONS})
diff --git a/thirdParty/cupla/alpaka/test/unit/atomic/src/AtomicTest.cpp b/thirdParty/cupla/alpaka/test/unit/atomic/src/AtomicTest.cpp
index 0fa4d9cf30..f7a7c5f391 100644
--- a/thirdParty/cupla/alpaka/test/unit/atomic/src/AtomicTest.cpp
+++ b/thirdParty/cupla/alpaka/test/unit/atomic/src/AtomicTest.cpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Axel Huebl, Benjamin Worpitz, Matthias Werner
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -8,331 +8,292 @@
  */
 
 #include <alpaka/atomic/Traits.hpp>
-
-#include <alpaka/test/acc/TestAccs.hpp>
+#include <alpaka/core/Unused.hpp>
 #include <alpaka/test/KernelExecutionFixture.hpp>
+#include <alpaka/test/acc/TestAccs.hpp>
 
 #include <catch2/catch.hpp>
 
 #include <climits>
+#include <type_traits>
 
 //-----------------------------------------------------------------------------
 ALPAKA_NO_HOST_ACC_WARNING
-template<
-    typename TAcc,
-    typename T>
-ALPAKA_FN_ACC auto testAtomicAdd(
-    TAcc const & acc,
-    bool * success,
-    T operandOrig)
--> void
+template<typename TAcc, typename T>
+ALPAKA_FN_ACC auto testAtomicAdd(TAcc const& acc, bool* success, T operandOrig) -> void
 {
-    auto && operand = alpaka::block::shared::st::allocVar<T, __COUNTER__>(acc);
-    operand = operandOrig;
     T const value = static_cast<T>(4);
-    T const ret =
-        alpaka::atomic::atomicOp<
-            alpaka::atomic::op::Add>(
-                acc,
-                &operand,
-                value);
-    ALPAKA_CHECK(*success, operandOrig == ret);
-    T const reference = operandOrig + value;
-    ALPAKA_CHECK(*success, operand == reference);
+    T const reference = static_cast<T>(operandOrig + value);
+    auto& operand = alpaka::declareSharedVar<T, __COUNTER__>(acc);
+    {
+        operand = operandOrig;
+        T const ret = alpaka::atomicOp<alpaka::AtomicAdd>(acc, &operand, value);
+        ALPAKA_CHECK(*success, operandOrig == ret);
+        ALPAKA_CHECK(*success, operand == reference);
+    }
+    {
+        operand = operandOrig;
+        T const ret = alpaka::atomicAdd(acc, &operand, value);
+        ALPAKA_CHECK(*success, operandOrig == ret);
+        ALPAKA_CHECK(*success, operand == reference);
+    }
 }
 
 //-----------------------------------------------------------------------------
 ALPAKA_NO_HOST_ACC_WARNING
-template<
-    typename TAcc,
-    typename T>
-ALPAKA_FN_ACC auto testAtomicSub(
-    TAcc const & acc,
-    bool * success,
-    T operandOrig)
--> void
+template<typename TAcc, typename T>
+ALPAKA_FN_ACC auto testAtomicSub(TAcc const& acc, bool* success, T operandOrig) -> void
 {
-    auto && operand = alpaka::block::shared::st::allocVar<T, __COUNTER__>(acc);
-    operand = operandOrig;
     T const value = static_cast<T>(4);
-    T const ret =
-        alpaka::atomic::atomicOp<
-            alpaka::atomic::op::Sub>(
-                acc,
-                &operand,
-                value);
-    ALPAKA_CHECK(*success, operandOrig == ret);
-    T const reference = operandOrig - value;
-    ALPAKA_CHECK(*success, operand == reference);
+    T const reference = static_cast<T>(operandOrig - value);
+    auto& operand = alpaka::declareSharedVar<T, __COUNTER__>(acc);
+    {
+        operand = operandOrig;
+        T const ret = alpaka::atomicOp<alpaka::AtomicSub>(acc, &operand, value);
+        ALPAKA_CHECK(*success, operandOrig == ret);
+        ALPAKA_CHECK(*success, operand == reference);
+    }
+    {
+        operand = operandOrig;
+        T const ret = alpaka::atomicSub(acc, &operand, value);
+        ALPAKA_CHECK(*success, operandOrig == ret);
+        ALPAKA_CHECK(*success, operand == reference);
+    }
 }
 
 //-----------------------------------------------------------------------------
 ALPAKA_NO_HOST_ACC_WARNING
-template<
-    typename TAcc,
-    typename T>
-ALPAKA_FN_ACC auto testAtomicMin(
-    TAcc const & acc,
-    bool * success,
-    T operandOrig)
--> void
+template<typename TAcc, typename T>
+ALPAKA_FN_ACC auto testAtomicMin(TAcc const& acc, bool* success, T operandOrig) -> void
 {
-    auto && operand = alpaka::block::shared::st::allocVar<T, __COUNTER__>(acc);
-    operand = operandOrig;
     T const value = static_cast<T>(4);
-    T const ret =
-        alpaka::atomic::atomicOp<
-            alpaka::atomic::op::Min>(
-                acc,
-                &operand,
-                value);
-    ALPAKA_CHECK(*success, operandOrig == ret);
     T const reference = (operandOrig < value) ? operandOrig : value;
-    ALPAKA_CHECK(*success, operand == reference);
+    auto& operand = alpaka::declareSharedVar<T, __COUNTER__>(acc);
+    {
+        operand = operandOrig;
+        T const ret = alpaka::atomicOp<alpaka::AtomicMin>(acc, &operand, value);
+        ALPAKA_CHECK(*success, operandOrig == ret);
+        ALPAKA_CHECK(*success, operand == reference);
+    }
+    {
+        operand = operandOrig;
+        T const ret = alpaka::atomicMin(acc, &operand, value);
+        ALPAKA_CHECK(*success, operandOrig == ret);
+        ALPAKA_CHECK(*success, operand == reference);
+    }
 }
 
 //-----------------------------------------------------------------------------
 ALPAKA_NO_HOST_ACC_WARNING
-template<
-    typename TAcc,
-    typename T>
-ALPAKA_FN_ACC auto testAtomicMax(
-    TAcc const & acc,
-    bool * success,
-    T operandOrig)
--> void
+template<typename TAcc, typename T>
+ALPAKA_FN_ACC auto testAtomicMax(TAcc const& acc, bool* success, T operandOrig) -> void
 {
-    auto && operand = alpaka::block::shared::st::allocVar<T, __COUNTER__>(acc);
-    operand = operandOrig;
     T const value = static_cast<T>(4);
-    T const ret =
-        alpaka::atomic::atomicOp<
-            alpaka::atomic::op::Max>(
-                acc,
-                &operand,
-                value);
-    ALPAKA_CHECK(*success, operandOrig == ret);
     T const reference = (operandOrig > value) ? operandOrig : value;
-    ALPAKA_CHECK(*success, operand == reference);
+    auto& operand = alpaka::declareSharedVar<T, __COUNTER__>(acc);
+    {
+        operand = operandOrig;
+        T const ret = alpaka::atomicOp<alpaka::AtomicMax>(acc, &operand, value);
+        ALPAKA_CHECK(*success, operandOrig == ret);
+        ALPAKA_CHECK(*success, operand == reference);
+    }
+    {
+        operand = operandOrig;
+        T const ret = alpaka::atomicMax(acc, &operand, value);
+        ALPAKA_CHECK(*success, operandOrig == ret);
+        ALPAKA_CHECK(*success, operand == reference);
+    }
 }
 
 //-----------------------------------------------------------------------------
 ALPAKA_NO_HOST_ACC_WARNING
-template<
-    typename TAcc,
-    typename T>
-ALPAKA_FN_ACC auto testAtomicExch(
-    TAcc const & acc,
-    bool * success,
-    T operandOrig)
--> void
+template<typename TAcc, typename T>
+ALPAKA_FN_ACC auto testAtomicExch(TAcc const& acc, bool* success, T operandOrig) -> void
 {
-    auto && operand = alpaka::block::shared::st::allocVar<T, __COUNTER__>(acc);
-    operand = operandOrig;
     T const value = static_cast<T>(4);
-    T const ret =
-        alpaka::atomic::atomicOp<
-            alpaka::atomic::op::Exch>(
-                acc,
-                &operand,
-                value);
-    ALPAKA_CHECK(*success, operandOrig == ret);
     T const reference = value;
-    ALPAKA_CHECK(*success, operand == reference);
+    auto& operand = alpaka::declareSharedVar<T, __COUNTER__>(acc);
+    {
+        operand = operandOrig;
+        T const ret = alpaka::atomicOp<alpaka::AtomicExch>(acc, &operand, value);
+        ALPAKA_CHECK(*success, operandOrig == ret);
+        ALPAKA_CHECK(*success, operand == reference);
+    }
+    {
+        operand = operandOrig;
+        T const ret = alpaka::atomicExch(acc, &operand, value);
+        ALPAKA_CHECK(*success, operandOrig == ret);
+        ALPAKA_CHECK(*success, operand == reference);
+    }
 }
 
 //-----------------------------------------------------------------------------
 ALPAKA_NO_HOST_ACC_WARNING
-template<
-    typename TAcc,
-    typename T>
-ALPAKA_FN_ACC auto testAtomicInc(
-    TAcc const & acc,
-    bool * success,
-    T operandOrig)
--> void
+template<typename TAcc, typename T>
+ALPAKA_FN_ACC auto testAtomicInc(TAcc const& acc, bool* success, T operandOrig) -> void
 {
     // \TODO: Check reset to 0 at 'value'.
-    auto && operand = alpaka::block::shared::st::allocVar<T, __COUNTER__>(acc);
-    operand = operandOrig;
     T const value = static_cast<T>(42);
-    T const ret =
-        alpaka::atomic::atomicOp<
-            alpaka::atomic::op::Inc>(
-                acc,
-                &operand,
-                value);
-    ALPAKA_CHECK(*success, operandOrig == ret);
-    T const reference = operandOrig + 1;
-    ALPAKA_CHECK(*success, operand == reference);
+    T const reference = static_cast<T>(operandOrig + 1);
+    auto& operand = alpaka::declareSharedVar<T, __COUNTER__>(acc);
+    {
+        operand = operandOrig;
+        T const ret = alpaka::atomicOp<alpaka::AtomicInc>(acc, &operand, value);
+        ALPAKA_CHECK(*success, operandOrig == ret);
+        ALPAKA_CHECK(*success, operand == reference);
+    }
+    {
+        operand = operandOrig;
+        T const ret = alpaka::atomicInc(acc, &operand, value);
+        ALPAKA_CHECK(*success, operandOrig == ret);
+        ALPAKA_CHECK(*success, operand == reference);
+    }
 }
 
 //-----------------------------------------------------------------------------
 ALPAKA_NO_HOST_ACC_WARNING
-template<
-    typename TAcc,
-    typename T>
-ALPAKA_FN_ACC auto testAtomicDec(
-    TAcc const & acc,
-    bool * success,
-    T operandOrig)
--> void
+template<typename TAcc, typename T>
+ALPAKA_FN_ACC auto testAtomicDec(TAcc const& acc, bool* success, T operandOrig) -> void
 {
     // \TODO: Check reset to 'value' at 0.
-    auto && operand = alpaka::block::shared::st::allocVar<T, __COUNTER__>(acc);
-    operand = operandOrig;
     T const value = static_cast<T>(42);
-    T const ret =
-        alpaka::atomic::atomicOp<
-            alpaka::atomic::op::Dec>(
-                acc,
-                &operand,
-                value);
-    ALPAKA_CHECK(*success, operandOrig == ret);
-    T const reference = operandOrig - 1;
-    ALPAKA_CHECK(*success, operand == reference);
+    T const reference = static_cast<T>(operandOrig - 1);
+    auto& operand = alpaka::declareSharedVar<T, __COUNTER__>(acc);
+    {
+        operand = operandOrig;
+        T const ret = alpaka::atomicOp<alpaka::AtomicDec>(acc, &operand, value);
+        ALPAKA_CHECK(*success, operandOrig == ret);
+        ALPAKA_CHECK(*success, operand == reference);
+    }
+    {
+        operand = operandOrig;
+        T const ret = alpaka::atomicDec(acc, &operand, value);
+        ALPAKA_CHECK(*success, operandOrig == ret);
+        ALPAKA_CHECK(*success, operand == reference);
+    }
 }
 
 //-----------------------------------------------------------------------------
 ALPAKA_NO_HOST_ACC_WARNING
-template<
-    typename TAcc,
-    typename T>
-ALPAKA_FN_ACC auto testAtomicAnd(
-    TAcc const & acc,
-    bool * success,
-    T operandOrig)
--> void
+template<typename TAcc, typename T>
+ALPAKA_FN_ACC auto testAtomicAnd(TAcc const& acc, bool* success, T operandOrig) -> void
 {
-    auto && operand = alpaka::block::shared::st::allocVar<T, __COUNTER__>(acc);
-    operand = operandOrig;
     T const value = static_cast<T>(4);
-    T const ret =
-        alpaka::atomic::atomicOp<
-            alpaka::atomic::op::And>(
-                acc,
-                &operand,
-                value);
-    ALPAKA_CHECK(*success, operandOrig == ret);
     T const reference = operandOrig & value;
-    ALPAKA_CHECK(*success, operand == reference);
+    auto& operand = alpaka::declareSharedVar<T, __COUNTER__>(acc);
+    {
+        operand = operandOrig;
+        T const ret = alpaka::atomicOp<alpaka::AtomicAnd>(acc, &operand, value);
+        ALPAKA_CHECK(*success, operandOrig == ret);
+        ALPAKA_CHECK(*success, operand == reference);
+    }
+    {
+        operand = operandOrig;
+        T const ret = alpaka::atomicAnd(acc, &operand, value);
+        ALPAKA_CHECK(*success, operandOrig == ret);
+        ALPAKA_CHECK(*success, operand == reference);
+    }
 }
 
 //-----------------------------------------------------------------------------
 ALPAKA_NO_HOST_ACC_WARNING
-template<
-    typename TAcc,
-    typename T>
-ALPAKA_FN_ACC auto testAtomicOr(
-    TAcc const & acc,
-    bool * success,
-    T operandOrig)
--> void
+template<typename TAcc, typename T>
+ALPAKA_FN_ACC auto testAtomicOr(TAcc const& acc, bool* success, T operandOrig) -> void
 {
-    auto && operand = alpaka::block::shared::st::allocVar<T, __COUNTER__>(acc);
-    operand = operandOrig;
     T const value = static_cast<T>(4);
-    T const ret =
-        alpaka::atomic::atomicOp<
-            alpaka::atomic::op::Or>(
-                acc,
-                &operand,
-                value);
-    ALPAKA_CHECK(*success, operandOrig == ret);
     T const reference = operandOrig | value;
-    ALPAKA_CHECK(*success, operand == reference);
+    auto& operand = alpaka::declareSharedVar<T, __COUNTER__>(acc);
+    {
+        operand = operandOrig;
+        T const ret = alpaka::atomicOp<alpaka::AtomicOr>(acc, &operand, value);
+        ALPAKA_CHECK(*success, operandOrig == ret);
+        ALPAKA_CHECK(*success, operand == reference);
+    }
+    {
+        operand = operandOrig;
+        T const ret = alpaka::atomicOr(acc, &operand, value);
+        ALPAKA_CHECK(*success, operandOrig == ret);
+        ALPAKA_CHECK(*success, operand == reference);
+    }
 }
 
 //-----------------------------------------------------------------------------
 ALPAKA_NO_HOST_ACC_WARNING
-template<
-    typename TAcc,
-    typename T>
-ALPAKA_FN_ACC auto testAtomicXor(
-    TAcc const & acc,
-    bool * success,
-    T operandOrig)
--> void
+template<typename TAcc, typename T>
+ALPAKA_FN_ACC auto testAtomicXor(TAcc const& acc, bool* success, T operandOrig) -> void
 {
-    auto && operand = alpaka::block::shared::st::allocVar<T, __COUNTER__>(acc);
-    operand = operandOrig;
-    T const value = operandOrig + static_cast<T>(4);
-    T const ret =
-        alpaka::atomic::atomicOp<
-            alpaka::atomic::op::Xor>(
-                acc,
-                &operand,
-                value);
-    ALPAKA_CHECK(*success, operandOrig == ret);
+    T const value = static_cast<T>(operandOrig + static_cast<T>(4));
     T const reference = operandOrig ^ value;
-    ALPAKA_CHECK(*success, operand == reference);
+    auto& operand = alpaka::declareSharedVar<T, __COUNTER__>(acc);
+    {
+        operand = operandOrig;
+        T const ret = alpaka::atomicOp<alpaka::AtomicXor>(acc, &operand, value);
+        ALPAKA_CHECK(*success, operandOrig == ret);
+        ALPAKA_CHECK(*success, operand == reference);
+    }
+    {
+        operand = operandOrig;
+        T const ret = alpaka::atomicXor(acc, &operand, value);
+        ALPAKA_CHECK(*success, operandOrig == ret);
+        ALPAKA_CHECK(*success, operand == reference);
+    }
 }
 
 //-----------------------------------------------------------------------------
 ALPAKA_NO_HOST_ACC_WARNING
-template<
-    typename TAcc,
-    typename T>
-ALPAKA_FN_ACC auto testAtomicCas(
-    TAcc const & acc,
-    bool * success,
-    T operandOrig)
--> void
+template<typename TAcc, typename T>
+ALPAKA_FN_ACC auto testAtomicCas(TAcc const& acc, bool* success, T operandOrig) -> void
 {
-    auto && operand = alpaka::block::shared::st::allocVar<T, __COUNTER__>(acc);
+    T const value = static_cast<T>(4);
+    auto& operand = alpaka::declareSharedVar<T, __COUNTER__>(acc);
 
     //-----------------------------------------------------------------------------
     // with match
     {
-        operand = operandOrig;
         T const compare = operandOrig;
-        T const value = static_cast<T>(4);
-        T const ret =
-            alpaka::atomic::atomicOp<
-                alpaka::atomic::op::Cas>(
-                    acc,
-                    &operand,
-                    compare,
-                    value);
-        ALPAKA_CHECK(*success, operandOrig == ret);
         T const reference = value;
-        ALPAKA_CHECK(*success, operand == reference);
+        {
+            operand = operandOrig;
+            T const ret = alpaka::atomicOp<alpaka::AtomicCas>(acc, &operand, compare, value);
+            ALPAKA_CHECK(*success, operandOrig == ret);
+            ALPAKA_CHECK(*success, operand == reference);
+        }
+        {
+            operand = operandOrig;
+            T const ret = alpaka::atomicCas(acc, &operand, compare, value);
+            ALPAKA_CHECK(*success, operandOrig == ret);
+            ALPAKA_CHECK(*success, operand == reference);
+        }
     }
 
     //-----------------------------------------------------------------------------
     // without match
     {
-        operand = operandOrig;
-        T const compare = operandOrig + static_cast<T>(1);
-        T const value = static_cast<T>(4);
-        T const ret =
-            alpaka::atomic::atomicOp<
-                alpaka::atomic::op::Cas>(
-                    acc,
-                    &operand,
-                    compare,
-                    value);
-        ALPAKA_CHECK(*success, operandOrig == ret);
+        T const compare = static_cast<T>(operandOrig + static_cast<T>(1));
         T const reference = operandOrig;
-        ALPAKA_CHECK(*success, operand == reference);
+        {
+            operand = operandOrig;
+            T const ret = alpaka::atomicOp<alpaka::AtomicCas>(acc, &operand, compare, value);
+            ALPAKA_CHECK(*success, operandOrig == ret);
+            ALPAKA_CHECK(*success, operand == reference);
+        }
+        {
+            operand = operandOrig;
+            T const ret = alpaka::atomicCas(acc, &operand, compare, value);
+            ALPAKA_CHECK(*success, operandOrig == ret);
+            ALPAKA_CHECK(*success, operand == reference);
+        }
     }
 }
 
 //#############################################################################
-template<
-    typename TAcc,
-    typename T,
-    typename Sfinae = void>
+template<typename TAcc, typename T, typename Sfinae = void>
 class AtomicTestKernel
 {
 public:
     //-----------------------------------------------------------------------------
     ALPAKA_NO_HOST_ACC_WARNING
-    ALPAKA_FN_ACC auto operator()(
-        TAcc const & acc,
-        bool * success,
-        T operandOrig) const
-    -> void
+    ALPAKA_FN_ACC auto operator()(TAcc const& acc, bool* success, T operandOrig) const -> void
     {
         testAtomicAdd(acc, success, operandOrig);
         testAtomicSub(acc, success, operandOrig);
@@ -353,23 +314,36 @@ class AtomicTestKernel
     }
 };
 
+#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
+//#############################################################################
+// Skip all atomic tests for the unified CUDA/HIP backend.
+// CUDA and HIP atomics will be tested separately.
+template<typename T, typename TDim, typename TIdx>
+class AtomicTestKernel<alpaka::AccGpuUniformCudaHipRt<TDim, TIdx>, T>
+{
+public:
+    //-----------------------------------------------------------------------------
+    ALPAKA_NO_HOST_ACC_WARNING
+    ALPAKA_FN_ACC auto operator()(alpaka::AccGpuUniformCudaHipRt<TDim, TIdx> const& acc, bool* success, T operandOrig)
+        const -> void
+    {
+        alpaka::ignore_unused(acc);
+        alpaka::ignore_unused(success);
+        alpaka::ignore_unused(operandOrig);
+    }
+};
+#endif
+
 #if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && BOOST_LANG_CUDA
 //#############################################################################
-template<
-    typename TDim,
-    typename TIdx>
-class AtomicTestKernel<
-    alpaka::acc::AccGpuCudaRt<TDim, TIdx>,
-    int>
+template<typename TDim, typename TIdx>
+class AtomicTestKernel<alpaka::AccGpuCudaRt<TDim, TIdx>, int>
 {
 public:
     //-----------------------------------------------------------------------------
     ALPAKA_NO_HOST_ACC_WARNING
-    ALPAKA_FN_ACC auto operator()(
-        alpaka::acc::AccGpuCudaRt<TDim, TIdx> const & acc,
-        bool * success,
-        int operandOrig) const
-    -> void
+    ALPAKA_FN_ACC auto operator()(alpaka::AccGpuCudaRt<TDim, TIdx> const& acc, bool* success, int operandOrig) const
+        -> void
     {
         testAtomicAdd(acc, success, operandOrig);
         testAtomicSub(acc, success, operandOrig);
@@ -380,8 +354,8 @@ class AtomicTestKernel<
         testAtomicExch(acc, success, operandOrig);
 
         // Not supported
-        //testAtomicInc(acc, success, operandOrig);
-        //testAtomicDec(acc, success, operandOrig);
+        // testAtomicInc(acc, success, operandOrig);
+        // testAtomicDec(acc, success, operandOrig);
 
         testAtomicAnd(acc, success, operandOrig);
         testAtomicOr(acc, success, operandOrig);
@@ -393,21 +367,14 @@ class AtomicTestKernel<
 
 //#############################################################################
 // NOTE: unsigned int is the only type supported by all atomic CUDA operations.
-template<
-    typename TDim,
-    typename TIdx>
-class AtomicTestKernel<
-    alpaka::acc::AccGpuCudaRt<TDim, TIdx>,
-    unsigned int>
+template<typename TDim, typename TIdx>
+class AtomicTestKernel<alpaka::AccGpuCudaRt<TDim, TIdx>, unsigned int>
 {
 public:
     //-----------------------------------------------------------------------------
     ALPAKA_NO_HOST_ACC_WARNING
-    ALPAKA_FN_ACC auto operator()(
-        alpaka::acc::AccGpuCudaRt<TDim, TIdx> const & acc,
-        bool * success,
-        unsigned int operandOrig) const
-    -> void
+    ALPAKA_FN_ACC auto operator()(alpaka::AccGpuCudaRt<TDim, TIdx> const& acc, bool* success, unsigned int operandOrig)
+        const -> void
     {
         testAtomicAdd(acc, success, operandOrig);
         testAtomicSub(acc, success, operandOrig);
@@ -429,203 +396,169 @@ class AtomicTestKernel<
 };
 
 //#############################################################################
-template<
-    typename TDim,
-    typename TIdx>
-class AtomicTestKernel<
-    alpaka::acc::AccGpuCudaRt<TDim, TIdx>,
-    unsigned long int>
+template<typename TDim, typename TIdx>
+class AtomicTestKernel<alpaka::AccGpuCudaRt<TDim, TIdx>, unsigned long int>
 {
 public:
     //-----------------------------------------------------------------------------
     ALPAKA_NO_HOST_ACC_WARNING
     ALPAKA_FN_ACC auto operator()(
-        alpaka::acc::AccGpuCudaRt<TDim, TIdx> const & acc,
-        bool * success,
-        unsigned long int operandOrig) const
-    -> void
+        alpaka::AccGpuCudaRt<TDim, TIdx> const& acc,
+        bool* success,
+        unsigned long int operandOrig) const -> void
     {
         testAtomicAdd(acc, success, operandOrig);
-#if UINT_MAX == ULONG_MAX // LLP64
+#    if UINT_MAX == ULONG_MAX // LLP64
         testAtomicSub(acc, success, operandOrig);
-#endif
+#    endif
 
-#if ULONG_MAX == ULLONG_MAX // LP64
-#if BOOST_ARCH_PTX >= BOOST_VERSION_NUMBER(3, 5, 0)
+#    if ULONG_MAX == ULLONG_MAX // LP64
+#        if BOOST_ARCH_PTX >= BOOST_VERSION_NUMBER(3, 5, 0)
         testAtomicMin(acc, success, operandOrig);
         testAtomicMax(acc, success, operandOrig);
-#endif
-#endif
+#        endif
+#    endif
 
         testAtomicExch(acc, success, operandOrig);
 
-#if UINT_MAX == ULONG_MAX // LLP64
+#    if UINT_MAX == ULONG_MAX // LLP64
         testAtomicInc(acc, success, operandOrig);
         testAtomicDec(acc, success, operandOrig);
-#endif
+#    endif
 
-#if ULONG_MAX == ULLONG_MAX // LP64
-#if BOOST_ARCH_PTX >= BOOST_VERSION_NUMBER(3, 5, 0)
+#    if ULONG_MAX == ULLONG_MAX // LP64
+#        if BOOST_ARCH_PTX >= BOOST_VERSION_NUMBER(3, 5, 0)
         testAtomicAnd(acc, success, operandOrig);
         testAtomicOr(acc, success, operandOrig);
         testAtomicXor(acc, success, operandOrig);
-#endif
-#endif
+#        endif
+#    endif
 
         testAtomicCas(acc, success, operandOrig);
     }
 };
 
 //#############################################################################
-template<
-    typename TDim,
-    typename TIdx>
-class AtomicTestKernel<
-    alpaka::acc::AccGpuCudaRt<TDim, TIdx>,
-    unsigned long long int>
+template<typename TDim, typename TIdx>
+class AtomicTestKernel<alpaka::AccGpuCudaRt<TDim, TIdx>, unsigned long long int>
 {
 public:
     //-----------------------------------------------------------------------------
     ALPAKA_NO_HOST_ACC_WARNING
     ALPAKA_FN_ACC auto operator()(
-        alpaka::acc::AccGpuCudaRt<TDim, TIdx> const & acc,
-        bool * success,
-        unsigned long long int operandOrig) const
-    -> void
+        alpaka::AccGpuCudaRt<TDim, TIdx> const& acc,
+        bool* success,
+        unsigned long long int operandOrig) const -> void
     {
         testAtomicAdd(acc, success, operandOrig);
         // Not supported
-        //testAtomicSub(acc, success, operandOrig);
+        // testAtomicSub(acc, success, operandOrig);
 
-#if BOOST_ARCH_PTX >= BOOST_VERSION_NUMBER(3, 5, 0)
+#    if BOOST_ARCH_PTX >= BOOST_VERSION_NUMBER(3, 5, 0)
         testAtomicMin(acc, success, operandOrig);
         testAtomicMax(acc, success, operandOrig);
-#endif
+#    endif
 
         testAtomicExch(acc, success, operandOrig);
 
         // Not supported
-        //testAtomicInc(acc, success, operandOrig);
-        //testAtomicDec(acc, success, operandOrig);
+        // testAtomicInc(acc, success, operandOrig);
+        // testAtomicDec(acc, success, operandOrig);
 
-#if BOOST_ARCH_PTX >= BOOST_VERSION_NUMBER(3, 5, 0)
+#    if BOOST_ARCH_PTX >= BOOST_VERSION_NUMBER(3, 5, 0)
         testAtomicAnd(acc, success, operandOrig);
         testAtomicOr(acc, success, operandOrig);
         testAtomicXor(acc, success, operandOrig);
-#endif
+#    endif
 
         testAtomicCas(acc, success, operandOrig);
     }
 };
 
 //#############################################################################
-template<
-    typename TDim,
-    typename TIdx>
-class AtomicTestKernel<
-    alpaka::acc::AccGpuCudaRt<TDim, TIdx>,
-    float>
+template<typename TDim, typename TIdx>
+class AtomicTestKernel<alpaka::AccGpuCudaRt<TDim, TIdx>, float>
 {
 public:
     //-----------------------------------------------------------------------------
     ALPAKA_NO_HOST_ACC_WARNING
-    ALPAKA_FN_ACC auto operator()(
-        alpaka::acc::AccGpuCudaRt<TDim, TIdx> const & acc,
-        bool * success,
-        float operandOrig) const
-    -> void
+    ALPAKA_FN_ACC auto operator()(alpaka::AccGpuCudaRt<TDim, TIdx> const& acc, bool* success, float operandOrig) const
+        -> void
     {
         testAtomicAdd(acc, success, operandOrig);
         // Not supported
-        //testAtomicSub(acc, success, operandOrig);
+        // testAtomicSub(acc, success, operandOrig);
 
         // Not supported
-        //testAtomicMin(acc, success, operandOrig);
-        //testAtomicMax(acc, success, operandOrig);
+        // testAtomicMin(acc, success, operandOrig);
+        // testAtomicMax(acc, success, operandOrig);
 
         testAtomicExch(acc, success, operandOrig);
 
         // Not supported
-        //testAtomicInc(acc, success, operandOrig);
-        //testAtomicDec(acc, success, operandOrig);
+        // testAtomicInc(acc, success, operandOrig);
+        // testAtomicDec(acc, success, operandOrig);
 
         // Not supported
-        //testAtomicAnd(acc, success, operandOrig);
-        //testAtomicOr(acc, success, operandOrig);
-        //testAtomicXor(acc, success, operandOrig);
+        // testAtomicAnd(acc, success, operandOrig);
+        // testAtomicOr(acc, success, operandOrig);
+        // testAtomicXor(acc, success, operandOrig);
 
         // Not supported
-        //testAtomicCas(acc, success, operandOrig);
+        // testAtomicCas(acc, success, operandOrig);
     }
 };
 
 //#############################################################################
-template<
-    typename TDim,
-    typename TIdx>
-class AtomicTestKernel<
-    alpaka::acc::AccGpuCudaRt<TDim, TIdx>,
-    double>
+template<typename TDim, typename TIdx>
+class AtomicTestKernel<alpaka::AccGpuCudaRt<TDim, TIdx>, double>
 {
 public:
     //-----------------------------------------------------------------------------
     ALPAKA_NO_HOST_ACC_WARNING
-    ALPAKA_FN_ACC auto operator()(
-        alpaka::acc::AccGpuCudaRt<TDim, TIdx> const & acc,
-        bool * success,
-        double operandOrig) const
-    -> void
+    ALPAKA_FN_ACC auto operator()(alpaka::AccGpuCudaRt<TDim, TIdx> const& acc, bool* success, double operandOrig) const
+        -> void
     {
         testAtomicAdd(acc, success, operandOrig);
         // Not supported
-        //testAtomicSub(acc, success, operandOrig);
+        // testAtomicSub(acc, success, operandOrig);
 
         // Not supported
-        //testAtomicMin(acc, success, operandOrig);
-        //testAtomicMax(acc, success, operandOrig);
+        // testAtomicMin(acc, success, operandOrig);
+        // testAtomicMax(acc, success, operandOrig);
 
         // Not supported
-        //testAtomicExch(acc, success, operandOrig);
+        // testAtomicExch(acc, success, operandOrig);
 
         // Not supported
-        //testAtomicInc(acc, success, operandOrig);
-        //testAtomicDec(acc, success, operandOrig);
+        // testAtomicInc(acc, success, operandOrig);
+        // testAtomicDec(acc, success, operandOrig);
 
         // Not supported
-        //testAtomicAnd(acc, success, operandOrig);
-        //testAtomicOr(acc, success, operandOrig);
-        //testAtomicXor(acc, success, operandOrig);
+        // testAtomicAnd(acc, success, operandOrig);
+        // testAtomicOr(acc, success, operandOrig);
+        // testAtomicXor(acc, success, operandOrig);
 
         // Not supported
-        //testAtomicCas(acc, success, operandOrig);
+        // testAtomicCas(acc, success, operandOrig);
     }
 };
 
 //#############################################################################
-template<
-    typename TDim,
-    typename TIdx,
-    typename T>
+template<typename TDim, typename TIdx, typename T>
 class AtomicTestKernel<
-    alpaka::acc::AccGpuCudaRt<TDim, TIdx>,
+    alpaka::AccGpuCudaRt<TDim, TIdx>,
     T,
-    typename std::enable_if<
-        !std::is_same<int, T>::value
-        && !std::is_same<unsigned int, T>::value
-        && !std::is_same<unsigned long int, T>::value
-        && !std::is_same<unsigned long long int, T>::value
-        && !std::is_same<float, T>::value
-        && !std::is_same<double, T>::value
-    >::type>
+    std::enable_if_t<
+        !std::is_same<int, T>::value && !std::is_same<unsigned int, T>::value
+        && !std::is_same<unsigned long int, T>::value && !std::is_same<unsigned long long int, T>::value
+        && !std::is_same<float, T>::value && !std::is_same<double, T>::value>>
 {
 public:
     //-----------------------------------------------------------------------------
     ALPAKA_NO_HOST_ACC_WARNING
-    ALPAKA_FN_ACC auto operator()(
-        alpaka::acc::AccGpuCudaRt<TDim, TIdx> const & acc,
-        bool * success,
-        T operandOrig) const
-    -> void
+    ALPAKA_FN_ACC auto operator()(alpaka::AccGpuCudaRt<TDim, TIdx> const& acc, bool* success, T operandOrig) const
+        -> void
     {
         alpaka::ignore_unused(acc);
         alpaka::ignore_unused(operandOrig);
@@ -638,21 +571,14 @@ class AtomicTestKernel<
 
 #if defined(ALPAKA_ACC_GPU_HIP_ENABLED) && BOOST_LANG_HIP
 //#############################################################################
-template<
-    typename TDim,
-    typename TIdx>
-class AtomicTestKernel<
-    alpaka::acc::AccGpuHipRt<TDim, TIdx>,
-    int>
+template<typename TDim, typename TIdx>
+class AtomicTestKernel<alpaka::AccGpuHipRt<TDim, TIdx>, int>
 {
 public:
     //-----------------------------------------------------------------------------
     ALPAKA_NO_HOST_ACC_WARNING
-    ALPAKA_FN_ACC auto operator()(
-        alpaka::acc::AccGpuHipRt<TDim, TIdx> const & acc,
-        bool * success,
-        int operandOrig) const
-    -> void
+    ALPAKA_FN_ACC auto operator()(alpaka::AccGpuHipRt<TDim, TIdx> const& acc, bool* success, int operandOrig) const
+        -> void
     {
         testAtomicAdd(acc, success, operandOrig);
         testAtomicSub(acc, success, operandOrig);
@@ -663,8 +589,8 @@ class AtomicTestKernel<
         testAtomicExch(acc, success, operandOrig);
 
         // Not supported
-        //testAtomicInc(acc, success, operandOrig);
-        //testAtomicDec(acc, success, operandOrig);
+        // testAtomicInc(acc, success, operandOrig);
+        // testAtomicDec(acc, success, operandOrig);
 
         testAtomicAnd(acc, success, operandOrig);
         testAtomicOr(acc, success, operandOrig);
@@ -676,21 +602,14 @@ class AtomicTestKernel<
 
 //#############################################################################
 // NOTE: unsigned int is the only type supported by all atomic HIP operations.
-template<
-    typename TDim,
-    typename TIdx>
-class AtomicTestKernel<
-    alpaka::acc::AccGpuHipRt<TDim, TIdx>,
-    unsigned int>
+template<typename TDim, typename TIdx>
+class AtomicTestKernel<alpaka::AccGpuHipRt<TDim, TIdx>, unsigned int>
 {
 public:
     //-----------------------------------------------------------------------------
     ALPAKA_NO_HOST_ACC_WARNING
-    ALPAKA_FN_ACC auto operator()(
-        alpaka::acc::AccGpuHipRt<TDim, TIdx> const & acc,
-        bool * success,
-        unsigned int operandOrig) const
-    -> void
+    ALPAKA_FN_ACC auto operator()(alpaka::AccGpuHipRt<TDim, TIdx> const& acc, bool* success, unsigned int operandOrig)
+        const -> void
     {
         testAtomicAdd(acc, success, operandOrig);
         testAtomicSub(acc, success, operandOrig);
@@ -712,203 +631,169 @@ class AtomicTestKernel<
 };
 
 //#############################################################################
-template<
-    typename TDim,
-    typename TIdx>
-class AtomicTestKernel<
-    alpaka::acc::AccGpuHipRt<TDim, TIdx>,
-    unsigned long int>
+template<typename TDim, typename TIdx>
+class AtomicTestKernel<alpaka::AccGpuHipRt<TDim, TIdx>, unsigned long int>
 {
 public:
     //-----------------------------------------------------------------------------
     ALPAKA_NO_HOST_ACC_WARNING
     ALPAKA_FN_ACC auto operator()(
-        alpaka::acc::AccGpuHipRt<TDim, TIdx> const & acc,
-        bool * success,
-        unsigned long int operandOrig) const
-    -> void
+        alpaka::AccGpuHipRt<TDim, TIdx> const& acc,
+        bool* success,
+        unsigned long int operandOrig) const -> void
     {
         testAtomicAdd(acc, success, operandOrig);
-#if UINT_MAX == ULONG_MAX // LLP64
+#    if UINT_MAX == ULONG_MAX // LLP64
         testAtomicSub(acc, success, operandOrig);
-#endif
+#    endif
 
-#if ULONG_MAX == ULLONG_MAX // LP64
-#if BOOST_ARCH_PTX >= BOOST_VERSION_NUMBER(3, 5, 0)
+#    if ULONG_MAX == ULLONG_MAX // LP64
+#        if BOOST_ARCH_PTX >= BOOST_VERSION_NUMBER(3, 5, 0)
         testAtomicMin(acc, success, operandOrig);
         testAtomicMax(acc, success, operandOrig);
-#endif
-#endif
+#        endif
+#    endif
 
         testAtomicExch(acc, success, operandOrig);
 
-#if UINT_MAX == ULONG_MAX // LLP64
+#    if UINT_MAX == ULONG_MAX // LLP64
         testAtomicInc(acc, success, operandOrig);
         testAtomicDec(acc, success, operandOrig);
-#endif
+#    endif
 
-#if ULONG_MAX == ULLONG_MAX // LP64
-#if BOOST_ARCH_PTX >= BOOST_VERSION_NUMBER(3, 5, 0)
+#    if ULONG_MAX == ULLONG_MAX // LP64
+#        if BOOST_ARCH_PTX >= BOOST_VERSION_NUMBER(3, 5, 0)
         testAtomicAnd(acc, success, operandOrig);
         testAtomicOr(acc, success, operandOrig);
         testAtomicXor(acc, success, operandOrig);
-#endif
-#endif
+#        endif
+#    endif
 
         testAtomicCas(acc, success, operandOrig);
     }
 };
 
 //#############################################################################
-template<
-    typename TDim,
-    typename TIdx>
-class AtomicTestKernel<
-    alpaka::acc::AccGpuHipRt<TDim, TIdx>,
-    unsigned long long int>
+template<typename TDim, typename TIdx>
+class AtomicTestKernel<alpaka::AccGpuHipRt<TDim, TIdx>, unsigned long long int>
 {
 public:
     //-----------------------------------------------------------------------------
     ALPAKA_NO_HOST_ACC_WARNING
     ALPAKA_FN_ACC auto operator()(
-        alpaka::acc::AccGpuHipRt<TDim, TIdx> const & acc,
-        bool * success,
-        unsigned long long int operandOrig) const
-    -> void
+        alpaka::AccGpuHipRt<TDim, TIdx> const& acc,
+        bool* success,
+        unsigned long long int operandOrig) const -> void
     {
         testAtomicAdd(acc, success, operandOrig);
         // Not supported
-        //testAtomicSub(acc, success, operandOrig);
+        // testAtomicSub(acc, success, operandOrig);
 
-#if BOOST_ARCH_PTX >= BOOST_VERSION_NUMBER(3, 5, 0)
+#    if BOOST_ARCH_PTX >= BOOST_VERSION_NUMBER(3, 5, 0)
         testAtomicMin(acc, success, operandOrig);
         testAtomicMax(acc, success, operandOrig);
-#endif
+#    endif
 
         testAtomicExch(acc, success, operandOrig);
 
         // Not supported
-        //testAtomicInc(acc, success, operandOrig);
-        //testAtomicDec(acc, success, operandOrig);
+        // testAtomicInc(acc, success, operandOrig);
+        // testAtomicDec(acc, success, operandOrig);
 
-#if BOOST_ARCH_PTX >= BOOST_VERSION_NUMBER(3, 5, 0)
+#    if BOOST_ARCH_PTX >= BOOST_VERSION_NUMBER(3, 5, 0)
         testAtomicAnd(acc, success, operandOrig);
         testAtomicOr(acc, success, operandOrig);
         testAtomicXor(acc, success, operandOrig);
-#endif
+#    endif
 
         testAtomicCas(acc, success, operandOrig);
     }
 };
 
 //#############################################################################
-template<
-    typename TDim,
-    typename TIdx>
-class AtomicTestKernel<
-    alpaka::acc::AccGpuHipRt<TDim, TIdx>,
-    float>
+template<typename TDim, typename TIdx>
+class AtomicTestKernel<alpaka::AccGpuHipRt<TDim, TIdx>, float>
 {
 public:
     //-----------------------------------------------------------------------------
     ALPAKA_NO_HOST_ACC_WARNING
-    ALPAKA_FN_ACC auto operator()(
-        alpaka::acc::AccGpuHipRt<TDim, TIdx> const & acc,
-        bool * success,
-        float operandOrig) const
-    -> void
+    ALPAKA_FN_ACC auto operator()(alpaka::AccGpuHipRt<TDim, TIdx> const& acc, bool* success, float operandOrig) const
+        -> void
     {
         testAtomicAdd(acc, success, operandOrig);
         // Not supported
-        //testAtomicSub(acc, success, operandOrig);
+        // testAtomicSub(acc, success, operandOrig);
 
         // Not supported
-        //testAtomicMin(acc, success, operandOrig);
-        //testAtomicMax(acc, success, operandOrig);
+        // testAtomicMin(acc, success, operandOrig);
+        // testAtomicMax(acc, success, operandOrig);
 
         testAtomicExch(acc, success, operandOrig);
 
         // Not supported
-        //testAtomicInc(acc, success, operandOrig);
-        //testAtomicDec(acc, success, operandOrig);
+        // testAtomicInc(acc, success, operandOrig);
+        // testAtomicDec(acc, success, operandOrig);
 
         // Not supported
-        //testAtomicAnd(acc, success, operandOrig);
-        //testAtomicOr(acc, success, operandOrig);
-        //testAtomicXor(acc, success, operandOrig);
+        // testAtomicAnd(acc, success, operandOrig);
+        // testAtomicOr(acc, success, operandOrig);
+        // testAtomicXor(acc, success, operandOrig);
 
         // Not supported
-        //testAtomicCas(acc, success, operandOrig);
+        // testAtomicCas(acc, success, operandOrig);
     }
 };
 
 //#############################################################################
-template<
-    typename TDim,
-    typename TIdx>
-class AtomicTestKernel<
-    alpaka::acc::AccGpuHipRt<TDim, TIdx>,
-    double>
+template<typename TDim, typename TIdx>
+class AtomicTestKernel<alpaka::AccGpuHipRt<TDim, TIdx>, double>
 {
 public:
     //-----------------------------------------------------------------------------
     ALPAKA_NO_HOST_ACC_WARNING
-    ALPAKA_FN_ACC auto operator()(
-        alpaka::acc::AccGpuHipRt<TDim, TIdx> const & acc,
-        bool * success,
-        double operandOrig) const
-    -> void
+    ALPAKA_FN_ACC auto operator()(alpaka::AccGpuHipRt<TDim, TIdx> const& acc, bool* success, double operandOrig) const
+        -> void
     {
         testAtomicAdd(acc, success, operandOrig);
         // Not supported
-        //testAtomicSub(acc, success, operandOrig);
+        // testAtomicSub(acc, success, operandOrig);
 
         // Not supported
-        //testAtomicMin(acc, success, operandOrig);
-        //testAtomicMax(acc, success, operandOrig);
+        // testAtomicMin(acc, success, operandOrig);
+        // testAtomicMax(acc, success, operandOrig);
 
         // Not supported
-        //testAtomicExch(acc, success, operandOrig);
+        // testAtomicExch(acc, success, operandOrig);
 
         // Not supported
-        //testAtomicInc(acc, success, operandOrig);
-        //testAtomicDec(acc, success, operandOrig);
+        // testAtomicInc(acc, success, operandOrig);
+        // testAtomicDec(acc, success, operandOrig);
 
         // Not supported
-        //testAtomicAnd(acc, success, operandOrig);
-        //testAtomicOr(acc, success, operandOrig);
-        //testAtomicXor(acc, success, operandOrig);
+        // testAtomicAnd(acc, success, operandOrig);
+        // testAtomicOr(acc, success, operandOrig);
+        // testAtomicXor(acc, success, operandOrig);
 
         // Not supported
-        //testAtomicCas(acc, success, operandOrig);
+        // testAtomicCas(acc, success, operandOrig);
     }
 };
 
 //#############################################################################
-template<
-    typename TDim,
-    typename TIdx,
-    typename T>
+template<typename TDim, typename TIdx, typename T>
 class AtomicTestKernel<
-    alpaka::acc::AccGpuHipRt<TDim, TIdx>,
+    alpaka::AccGpuHipRt<TDim, TIdx>,
     T,
-    typename std::enable_if<
-        !std::is_same<int, T>::value
-        && !std::is_same<unsigned int, T>::value
-        && !std::is_same<unsigned long int, T>::value
-        && !std::is_same<unsigned long long int, T>::value
-        && !std::is_same<float, T>::value
-        && !std::is_same<double, T>::value
-    >::type>
+    std::enable_if_t<
+        !std::is_same<int, T>::value && !std::is_same<unsigned int, T>::value
+        && !std::is_same<unsigned long int, T>::value && !std::is_same<unsigned long long int, T>::value
+        && !std::is_same<float, T>::value && !std::is_same<double, T>::value>>
 {
 public:
     //-----------------------------------------------------------------------------
     ALPAKA_NO_HOST_ACC_WARNING
-    ALPAKA_FN_ACC auto operator()(
-        alpaka::acc::AccGpuHipRt<TDim, TIdx> const & acc,
-        bool * success,
-        T operandOrig) const
-    -> void
+    ALPAKA_FN_ACC auto operator()(alpaka::AccGpuHipRt<TDim, TIdx> const& acc, bool* success, T operandOrig) const
+        -> void
     {
         alpaka::ignore_unused(acc);
         alpaka::ignore_unused(operandOrig);
@@ -921,20 +806,16 @@ class AtomicTestKernel<
 
 
 //#############################################################################
-template<
-    typename TAcc,
-    typename T>
+template<typename TAcc, typename T>
 struct TestAtomicOperations
 {
     //-----------------------------------------------------------------------------
-    static auto testAtomicOperations()
-    -> void
+    static auto testAtomicOperations() -> void
     {
-        using Dim = alpaka::dim::Dim<TAcc>;
-        using Idx = alpaka::idx::Idx<TAcc>;
+        using Dim = alpaka::Dim<TAcc>;
+        using Idx = alpaka::Idx<TAcc>;
 
-        alpaka::test::KernelExecutionFixture<TAcc> fixture(
-            alpaka::vec::Vec<Dim, Idx>::ones());
+        alpaka::test::KernelExecutionFixture<TAcc> fixture(alpaka::Vec<Dim, Idx>::ones());
 
         AtomicTestKernel<TAcc, T> kernel;
 
@@ -943,12 +824,10 @@ struct TestAtomicOperations
     }
 };
 
-using TestAccs = alpaka::test::acc::EnabledAccs<
-    alpaka::dim::DimInt<1u>,
-    std::size_t>;
+using TestAccs = alpaka::test::EnabledAccs<alpaka::DimInt<1u>, std::size_t>;
 
 //-----------------------------------------------------------------------------
-TEMPLATE_LIST_TEST_CASE( "atomicOperationsWorking", "[atomic]", TestAccs)
+TEMPLATE_LIST_TEST_CASE("atomicOperationsWorking", "[atomic]", TestAccs)
 {
     using Acc = TestType;
     TestAtomicOperations<Acc, unsigned char>::testAtomicOperations();
@@ -965,6 +844,6 @@ TEMPLATE_LIST_TEST_CASE( "atomicOperationsWorking", "[atomic]", TestAccs)
     TestAtomicOperations<Acc, long long>::testAtomicOperations();
 
     // Not all atomic operations are possible with floating point values.
-    //TestAtomicOperations<Acc, float>::testAtomicOperations();
-    //TestAtomicOperations<Acc, double>::testAtomicOperations();
+    // TestAtomicOperations<Acc, float>::testAtomicOperations();
+    // TestAtomicOperations<Acc, double>::testAtomicOperations();
 }
diff --git a/thirdParty/cupla/alpaka/test/unit/block/shared/CMakeLists.txt b/thirdParty/cupla/alpaka/test/unit/block/shared/CMakeLists.txt
index fa059864b6..3504b65ee1 100644
--- a/thirdParty/cupla/alpaka/test/unit/block/shared/CMakeLists.txt
+++ b/thirdParty/cupla/alpaka/test/unit/block/shared/CMakeLists.txt
@@ -1,27 +1,25 @@
 #
-# Copyright 2014-2019 Benjamin Worpitz, Axel Huebl
+# Copyright 2014-2020 Benjamin Worpitz, Axel Huebl, Jan Stephan
 #
-# This file is part of Alpaka.
+# This file is part of alpaka.
 #
 # This Source Code Form is subject to the terms of the Mozilla Public
 # License, v. 2.0. If a copy of the MPL was not distributed with this
 # file, You can obtain one at http://mozilla.org/MPL/2.0/.
 #
 
-SET(_TARGET_NAME "blockShared")
+set(_TARGET_NAME "blockShared")
 
 append_recursive_files_add_to_src_group("src/" "src/" "cpp" _FILES_SOURCE)
 
-ALPAKA_ADD_EXECUTABLE(
+alpaka_add_executable(
     ${_TARGET_NAME}
     ${_FILES_SOURCE})
-TARGET_INCLUDE_DIRECTORIES(
-    ${_TARGET_NAME}
-    PRIVATE ${Boost_INCLUDE_DIRS})
-TARGET_LINK_LIBRARIES(
+target_link_libraries(
     ${_TARGET_NAME}
     PRIVATE common)
 
-SET_TARGET_PROPERTIES(${_TARGET_NAME} PROPERTIES FOLDER "test/unit")
+set_target_properties(${_TARGET_NAME} PROPERTIES FOLDER "test/unit")
+target_compile_definitions(${_TARGET_NAME} PRIVATE "-DTEST_UNIT_BLOCK_SHARED")
 
-ADD_TEST(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME} ${_ALPAKA_TEST_OPTIONS})
+add_test(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME} ${_ALPAKA_TEST_OPTIONS})
diff --git a/thirdParty/cupla/alpaka/test/unit/block/shared/src/BlockSharedMemDyn.cpp b/thirdParty/cupla/alpaka/test/unit/block/shared/src/BlockSharedMemDyn.cpp
index 8b7a89c1fc..3e58438850 100644
--- a/thirdParty/cupla/alpaka/test/unit/block/shared/src/BlockSharedMemDyn.cpp
+++ b/thirdParty/cupla/alpaka/test/unit/block/shared/src/BlockSharedMemDyn.cpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Axel Huebl, Benjamin Worpitz, Matthias Werner, René Widera
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -8,10 +8,9 @@
  */
 
 #include <alpaka/block/shared/dyn/Traits.hpp>
-
+#include <alpaka/test/KernelExecutionFixture.hpp>
 #include <alpaka/test/acc/TestAccs.hpp>
 #include <alpaka/test/queue/Queue.hpp>
-#include <alpaka/test/KernelExecutionFixture.hpp>
 
 #include <catch2/catch.hpp>
 
@@ -21,75 +20,60 @@ class BlockSharedMemDynTestKernel
 public:
     //-----------------------------------------------------------------------------
     ALPAKA_NO_HOST_ACC_WARNING
-    template<
-        typename TAcc>
-    ALPAKA_FN_ACC auto operator()(
-        TAcc const & acc,
-        bool * success) const
-    -> void
+    template<typename TAcc>
+    ALPAKA_FN_ACC auto operator()(TAcc const& acc, bool* success) const -> void
     {
         // Assure that the pointer is non null.
-        auto && a = alpaka::block::shared::dyn::getMem<std::uint32_t>(acc);
-        ALPAKA_CHECK(*success, static_cast<std::uint32_t *>(nullptr) != a);
+        auto a = alpaka::getDynSharedMem<std::uint32_t>(acc);
+        ALPAKA_CHECK(*success, static_cast<std::uint32_t*>(nullptr) != a);
 
         // Each call should return the same pointer ...
-        auto && b = alpaka::block::shared::dyn::getMem<std::uint32_t>(acc);
+        auto b = alpaka::getDynSharedMem<std::uint32_t>(acc);
         ALPAKA_CHECK(*success, a == b);
 
         // ... even for different types.
-        auto && c = alpaka::block::shared::dyn::getMem<float>(acc);
-        ALPAKA_CHECK(*success, a == reinterpret_cast<std::uint32_t *>(c));
+        auto c = alpaka::getDynSharedMem<float>(acc);
+        ALPAKA_CHECK(*success, a == reinterpret_cast<std::uint32_t*>(c));
     }
 };
 
 namespace alpaka
 {
-    namespace kernel
+    namespace traits
     {
-        namespace traits
+        //#############################################################################
+        //! The trait for getting the size of the block shared dynamic memory for a kernel.
+        template<typename TAcc>
+        struct BlockSharedMemDynSizeBytes<BlockSharedMemDynTestKernel, TAcc>
         {
-            //#############################################################################
-            //! The trait for getting the size of the block shared dynamic memory for a kernel.
-            template<
-                typename TAcc>
-            struct BlockSharedMemDynSizeBytes<
-                BlockSharedMemDynTestKernel,
-                TAcc>
+            //-----------------------------------------------------------------------------
+            //! \return The size of the shared memory allocated for a block.
+            template<typename TVec>
+            ALPAKA_FN_HOST_ACC static auto getBlockSharedMemDynSizeBytes(
+                BlockSharedMemDynTestKernel const& blockSharedMemDyn,
+                TVec const& blockThreadExtent,
+                TVec const& threadElemExtent,
+                bool* success) -> std::size_t
             {
-                //-----------------------------------------------------------------------------
-                //! \return The size of the shared memory allocated for a block.
-                template<
-                    typename TVec>
-                ALPAKA_FN_HOST_ACC static auto getBlockSharedMemDynSizeBytes(
-                    BlockSharedMemDynTestKernel const & blockSharedMemDyn,
-                    TVec const & blockThreadExtent,
-                    TVec const & threadElemExtent,
-                    bool * success)
-                -> idx::Idx<TAcc>
-                {
-                    alpaka::ignore_unused(blockSharedMemDyn);
-                    alpaka::ignore_unused(success);
-                    return
-                        static_cast<idx::Idx<TAcc>>(sizeof(std::uint32_t)) * blockThreadExtent.prod() * threadElemExtent.prod();
-                }
-            };
-        }
-    }
-}
+                alpaka::ignore_unused(blockSharedMemDyn);
+                alpaka::ignore_unused(success);
+                auto const gridSize = blockThreadExtent.prod() * threadElemExtent.prod();
+                return static_cast<std::size_t>(gridSize) * sizeof(std::uint32_t);
+            }
+        };
+    } // namespace traits
+} // namespace alpaka
 
 //-----------------------------------------------------------------------------
-TEMPLATE_LIST_TEST_CASE( "sameNonNullAdress", "[blockSharedMemDyn]", alpaka::test::acc::TestAccs)
+TEMPLATE_LIST_TEST_CASE("sameNonNullAdress", "[blockSharedMemDyn]", alpaka::test::TestAccs)
 {
     using Acc = TestType;
-    using Dim = alpaka::dim::Dim<Acc>;
-    using Idx = alpaka::idx::Idx<Acc>;
+    using Dim = alpaka::Dim<Acc>;
+    using Idx = alpaka::Idx<Acc>;
 
-    alpaka::test::KernelExecutionFixture<Acc> fixture(
-        alpaka::vec::Vec<Dim, Idx>::ones());
+    alpaka::test::KernelExecutionFixture<Acc> fixture(alpaka::Vec<Dim, Idx>::ones());
 
     BlockSharedMemDynTestKernel kernel;
 
-    REQUIRE(
-        fixture(
-            kernel));
+    REQUIRE(fixture(kernel));
 }
diff --git a/thirdParty/cupla/alpaka/test/unit/block/shared/src/BlockSharedMemSt.cpp b/thirdParty/cupla/alpaka/test/unit/block/shared/src/BlockSharedMemSt.cpp
index 28f2625098..359621fc08 100644
--- a/thirdParty/cupla/alpaka/test/unit/block/shared/src/BlockSharedMemSt.cpp
+++ b/thirdParty/cupla/alpaka/test/unit/block/shared/src/BlockSharedMemSt.cpp
@@ -8,11 +8,10 @@
  */
 
 #include <alpaka/block/shared/st/Traits.hpp>
-
-#include <alpaka/test/acc/TestAccs.hpp>
-#include <alpaka/test/queue/Queue.hpp>
 #include <alpaka/test/Array.hpp>
 #include <alpaka/test/KernelExecutionFixture.hpp>
+#include <alpaka/test/acc/TestAccs.hpp>
+#include <alpaka/test/queue/Queue.hpp>
 
 #include <catch2/catch.hpp>
 
@@ -22,61 +21,57 @@ class BlockSharedMemStNonNullTestKernel
 public:
     //-----------------------------------------------------------------------------
     ALPAKA_NO_HOST_ACC_WARNING
-    template<
-        typename TAcc>
-    ALPAKA_FN_ACC auto operator()(
-        TAcc const & acc,
-        bool * success) const
-    -> void
+    template<typename TAcc>
+    ALPAKA_FN_ACC auto operator()(TAcc const& acc, bool* success) const -> void
     {
 #if BOOST_COMP_GNUC >= BOOST_VERSION_NUMBER(6, 0, 0)
-    #pragma GCC diagnostic push
-    #pragma GCC diagnostic ignored "-Waddress"  // warning: the compiler can assume that the address of �a� will never be NULL [-Waddress]
+#    pragma GCC diagnostic push
+#    pragma GCC diagnostic ignored                                                                                    \
+        "-Waddress" // warning: the compiler can assume that the address of 'a' will never be NULL [-Waddress]
 #endif
         // Multiple runs to make sure it really works.
-        for(std::size_t i=0u; i<10; ++i)
+        for(std::size_t i = 0u; i < 10; ++i)
         {
-            auto && a = alpaka::block::shared::st::allocVar<std::uint32_t, __COUNTER__>(acc);
-            ALPAKA_CHECK(*success, static_cast<std::uint32_t *>(nullptr) != &a);
+            auto& a = alpaka::declareSharedVar<std::uint32_t, __COUNTER__>(acc);
+            ALPAKA_CHECK(*success, static_cast<std::uint32_t*>(nullptr) != &a);
 
-            auto && b = alpaka::block::shared::st::allocVar<std::uint32_t, __COUNTER__>(acc);
-            ALPAKA_CHECK(*success, static_cast<std::uint32_t *>(nullptr) != &b);
+            auto& b = alpaka::declareSharedVar<std::uint32_t, __COUNTER__>(acc);
+            ALPAKA_CHECK(*success, static_cast<std::uint32_t*>(nullptr) != &b);
 
-            auto && c = alpaka::block::shared::st::allocVar<float, __COUNTER__>(acc);
-            ALPAKA_CHECK(*success, static_cast<float *>(nullptr) != &c);
+            auto& c = alpaka::declareSharedVar<float, __COUNTER__>(acc);
+            ALPAKA_CHECK(*success, static_cast<float*>(nullptr) != &c);
 
-            auto && d = alpaka::block::shared::st::allocVar<double, __COUNTER__>(acc);
-            ALPAKA_CHECK(*success, static_cast<double *>(nullptr) != &d);
+            auto& d = alpaka::declareSharedVar<double, __COUNTER__>(acc);
+            ALPAKA_CHECK(*success, static_cast<double*>(nullptr) != &d);
 
-            auto && e = alpaka::block::shared::st::allocVar<std::uint64_t, __COUNTER__>(acc);
-            ALPAKA_CHECK(*success, static_cast<std::uint64_t *>(nullptr) != &e);
+            auto& e = alpaka::declareSharedVar<std::uint64_t, __COUNTER__>(acc);
+            ALPAKA_CHECK(*success, static_cast<std::uint64_t*>(nullptr) != &e);
 
 
-            auto && f = alpaka::block::shared::st::allocVar<alpaka::test::Array<std::uint32_t, 32>, __COUNTER__>(acc);
-            ALPAKA_CHECK(*success, static_cast<std::uint32_t *>(nullptr) != &f[0]);
+            auto& f = alpaka::declareSharedVar<alpaka::test::Array<std::uint32_t, 32>, __COUNTER__>(acc);
+            ALPAKA_CHECK(*success, static_cast<std::uint32_t*>(nullptr) != &f[0]);
 
-            auto && g = alpaka::block::shared::st::allocVar<alpaka::test::Array<std::uint32_t, 32>, __COUNTER__>(acc);
-            ALPAKA_CHECK(*success, static_cast<std::uint32_t *>(nullptr) != &g[0]);
+            auto& g = alpaka::declareSharedVar<alpaka::test::Array<std::uint32_t, 32>, __COUNTER__>(acc);
+            ALPAKA_CHECK(*success, static_cast<std::uint32_t*>(nullptr) != &g[0]);
 
-            auto && h = alpaka::block::shared::st::allocVar<alpaka::test::Array<double, 16>, __COUNTER__>(acc);
-            ALPAKA_CHECK(*success, static_cast<double *>(nullptr) != &h[0]);
+            auto& h = alpaka::declareSharedVar<alpaka::test::Array<double, 16>, __COUNTER__>(acc);
+            ALPAKA_CHECK(*success, static_cast<double*>(nullptr) != &h[0]);
         }
 #if BOOST_COMP_GNUC >= BOOST_VERSION_NUMBER(6, 0, 0)
-    #pragma GCC diagnostic pop
+#    pragma GCC diagnostic pop
 #endif
     }
 };
 
 //-----------------------------------------------------------------------------
-TEMPLATE_LIST_TEST_CASE( "nonNull", "[blockSharedMemSt]", alpaka::test::acc::TestAccs)
+TEMPLATE_LIST_TEST_CASE("nonNull", "[blockSharedMemSt]", alpaka::test::TestAccs)
 {
     using Acc = TestType;
-    using Dim = alpaka::dim::Dim<Acc>;
-    using Idx = alpaka::idx::Idx<Acc>;
+    using Dim = alpaka::Dim<Acc>;
+    using Idx = alpaka::Idx<Acc>;
 
     // Use multiple threads to make sure the synchronization really works.
-    alpaka::test::KernelExecutionFixture<Acc> fixture(
-        alpaka::vec::Vec<Dim, Idx>::all(static_cast<Idx>(3u)));
+    alpaka::test::KernelExecutionFixture<Acc> fixture(alpaka::Vec<Dim, Idx>::all(static_cast<Idx>(3u)));
 
     BlockSharedMemStNonNullTestKernel kernel;
 
@@ -89,29 +84,25 @@ class BlockSharedMemStSameTypeDifferentAdressTestKernel
 public:
     //-----------------------------------------------------------------------------
     ALPAKA_NO_HOST_ACC_WARNING
-    template<
-        typename TAcc>
-    ALPAKA_FN_ACC auto operator()(
-        TAcc const & acc,
-        bool * success) const
-    -> void
+    template<typename TAcc>
+    ALPAKA_FN_ACC auto operator()(TAcc const& acc, bool* success) const -> void
     {
         // Multiple runs to make sure it really works.
-        for(std::size_t i=0u; i<10; ++i)
+        for(std::size_t i = 0u; i < 10; ++i)
         {
-            auto && a = alpaka::block::shared::st::allocVar<std::uint32_t, __COUNTER__>(acc);
-            auto && b = alpaka::block::shared::st::allocVar<std::uint32_t, __COUNTER__>(acc);
+            auto& a = alpaka::declareSharedVar<std::uint32_t, __COUNTER__>(acc);
+            auto& b = alpaka::declareSharedVar<std::uint32_t, __COUNTER__>(acc);
             ALPAKA_CHECK(*success, &a != &b);
-            auto && c = alpaka::block::shared::st::allocVar<std::uint32_t, __COUNTER__>(acc);
+            auto& c = alpaka::declareSharedVar<std::uint32_t, __COUNTER__>(acc);
             ALPAKA_CHECK(*success, &b != &c);
             ALPAKA_CHECK(*success, &a != &c);
             ALPAKA_CHECK(*success, &b != &c);
 
-            auto && d = alpaka::block::shared::st::allocVar<alpaka::test::Array<std::uint32_t, 32>, __COUNTER__>(acc);
+            auto& d = alpaka::declareSharedVar<alpaka::test::Array<std::uint32_t, 32>, __COUNTER__>(acc);
             ALPAKA_CHECK(*success, &a != &d[0]);
             ALPAKA_CHECK(*success, &b != &d[0]);
             ALPAKA_CHECK(*success, &c != &d[0]);
-            auto && e = alpaka::block::shared::st::allocVar<alpaka::test::Array<std::uint32_t, 32>, __COUNTER__>(acc);
+            auto& e = alpaka::declareSharedVar<alpaka::test::Array<std::uint32_t, 32>, __COUNTER__>(acc);
             ALPAKA_CHECK(*success, &a != &e[0]);
             ALPAKA_CHECK(*success, &b != &e[0]);
             ALPAKA_CHECK(*success, &c != &e[0]);
@@ -121,15 +112,14 @@ class BlockSharedMemStSameTypeDifferentAdressTestKernel
 };
 
 //-----------------------------------------------------------------------------
-TEMPLATE_LIST_TEST_CASE( "sameTypeDifferentAddress", "[blockSharedMemSt]", alpaka::test::acc::TestAccs)
+TEMPLATE_LIST_TEST_CASE("sameTypeDifferentAddress", "[blockSharedMemSt]", alpaka::test::TestAccs)
 {
     using Acc = TestType;
-    using Dim = alpaka::dim::Dim<Acc>;
-    using Idx = alpaka::idx::Idx<Acc>;
+    using Dim = alpaka::Dim<Acc>;
+    using Idx = alpaka::Idx<Acc>;
 
     // Use multiple threads to make sure the synchronization really works.
-    alpaka::test::KernelExecutionFixture<Acc> fixture(
-        alpaka::vec::Vec<Dim, Idx>::all(static_cast<Idx>(3u)));
+    alpaka::test::KernelExecutionFixture<Acc> fixture(alpaka::Vec<Dim, Idx>::all(static_cast<Idx>(3u)));
 
     BlockSharedMemStSameTypeDifferentAdressTestKernel kernel;
 
diff --git a/thirdParty/cupla/alpaka/test/unit/block/sync/CMakeLists.txt b/thirdParty/cupla/alpaka/test/unit/block/sync/CMakeLists.txt
index 63af2dc830..582a646441 100644
--- a/thirdParty/cupla/alpaka/test/unit/block/sync/CMakeLists.txt
+++ b/thirdParty/cupla/alpaka/test/unit/block/sync/CMakeLists.txt
@@ -1,27 +1,25 @@
 #
-# Copyright 2017-2019 Benjamin Worpitz, Axel Huebl
+# Copyright 2017-2020 Benjamin Worpitz, Axel Huebl, Jan Stephan
 #
-# This file is part of Alpaka.
+# This file is part of alpaka.
 #
 # This Source Code Form is subject to the terms of the Mozilla Public
 # License, v. 2.0. If a copy of the MPL was not distributed with this
 # file, You can obtain one at http://mozilla.org/MPL/2.0/.
 #
 
-SET(_TARGET_NAME "blockSync")
+set(_TARGET_NAME "blockSync")
 
 append_recursive_files_add_to_src_group("src/" "src/" "cpp" _FILES_SOURCE)
 
-ALPAKA_ADD_EXECUTABLE(
+alpaka_add_executable(
     ${_TARGET_NAME}
     ${_FILES_SOURCE})
-TARGET_INCLUDE_DIRECTORIES(
-    ${_TARGET_NAME}
-    PRIVATE ${Boost_INCLUDE_DIRS})
-TARGET_LINK_LIBRARIES(
+target_link_libraries(
     ${_TARGET_NAME}
     PRIVATE common)
 
-SET_TARGET_PROPERTIES(${_TARGET_NAME} PROPERTIES FOLDER "test/unit")
+set_target_properties(${_TARGET_NAME} PROPERTIES FOLDER "test/unit")
+target_compile_definitions(${_TARGET_NAME} PRIVATE "-DTEST_UNIT_BLOCK_SYNC")
 
-ADD_TEST(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME} ${_ALPAKA_TEST_OPTIONS})
+add_test(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME} ${_ALPAKA_TEST_OPTIONS})
diff --git a/thirdParty/cupla/alpaka/test/unit/block/sync/src/BlockSync.cpp b/thirdParty/cupla/alpaka/test/unit/block/sync/src/BlockSync.cpp
index a5b6d90888..e51d280fa7 100644
--- a/thirdParty/cupla/alpaka/test/unit/block/sync/src/BlockSync.cpp
+++ b/thirdParty/cupla/alpaka/test/unit/block/sync/src/BlockSync.cpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Axel Huebl, Benjamin Worpitz, Matthias Werner, René Widera
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -8,9 +8,8 @@
  */
 
 #include <alpaka/block/sync/Traits.hpp>
-
-#include <alpaka/test/acc/TestAccs.hpp>
 #include <alpaka/test/KernelExecutionFixture.hpp>
+#include <alpaka/test/acc/TestAccs.hpp>
 
 #include <catch2/catch.hpp>
 
@@ -22,29 +21,25 @@ class BlockSyncTestKernel
 
     //-----------------------------------------------------------------------------
     ALPAKA_NO_HOST_ACC_WARNING
-    template<
-        typename TAcc>
-    ALPAKA_FN_ACC auto operator()(
-        TAcc const & acc,
-        bool * success) const
-    -> void
+    template<typename TAcc>
+    ALPAKA_FN_ACC auto operator()(TAcc const& acc, bool* success) const -> void
     {
-        using Idx = alpaka::idx::Idx<TAcc>;
+        using Idx = alpaka::Idx<TAcc>;
 
         // Get the index of the current thread within the block and the block extent and map them to 1D.
-        auto const blockThreadIdx = alpaka::idx::getIdx<alpaka::Block, alpaka::Threads>(acc);
-        auto const blockThreadExtent = alpaka::workdiv::getWorkDiv<alpaka::Block, alpaka::Threads>(acc);
-        auto const blockThreadIdx1D = alpaka::idx::mapIdx<1u>(blockThreadIdx, blockThreadExtent)[0u];
+        auto const blockThreadIdx = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc);
+        auto const blockThreadExtent = alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc);
+        auto const blockThreadIdx1D = alpaka::mapIdx<1u>(blockThreadIdx, blockThreadExtent)[0u];
         auto const blockThreadExtent1D = blockThreadExtent.prod();
 
         // Allocate shared memory.
-        Idx * const pBlockSharedArray = alpaka::block::shared::dyn::getMem<Idx>(acc);
-   
+        Idx* const pBlockSharedArray = alpaka::getDynSharedMem<Idx>(acc);
+
         // Write the thread index into the shared memory.
         pBlockSharedArray[blockThreadIdx1D] = blockThreadIdx1D;
 
         // Synchronize the threads in the block.
-        alpaka::block::sync::syncBlockThreads(acc);
+        alpaka::syncBlockThreads(acc);
 
         // All other threads within the block should now have written their index into the shared memory.
         for(auto i(static_cast<Idx>(0u)); i < blockThreadExtent1D; ++i)
@@ -56,55 +51,44 @@ class BlockSyncTestKernel
 
 namespace alpaka
 {
-    namespace kernel
+    namespace traits
     {
-        namespace traits
+        //#############################################################################
+        //! The trait for getting the size of the block shared dynamic memory for a kernel.
+        template<typename TAcc>
+        struct BlockSharedMemDynSizeBytes<BlockSyncTestKernel, TAcc>
         {
-            //#############################################################################
-            //! The trait for getting the size of the block shared dynamic memory for a kernel.
-            template<
-                typename TAcc>
-            struct BlockSharedMemDynSizeBytes<
-                BlockSyncTestKernel,
-                TAcc>
+            //-----------------------------------------------------------------------------
+            //! \return The size of the shared memory allocated for a block.
+            template<typename TVec>
+            ALPAKA_FN_HOST_ACC static auto getBlockSharedMemDynSizeBytes(
+                BlockSyncTestKernel const& blockSharedMemDyn,
+                TVec const& blockThreadExtent,
+                TVec const& threadElemExtent,
+                bool* success) -> std::size_t
             {
-                //-----------------------------------------------------------------------------
-                //! \return The size of the shared memory allocated for a block.
-                template<
-                    typename TVec>
-                ALPAKA_FN_HOST_ACC static auto getBlockSharedMemDynSizeBytes(
-                    BlockSyncTestKernel const & blockSharedMemDyn,
-                    TVec const & blockThreadExtent,
-                    TVec const & threadElemExtent,
-                    bool * success)
-                -> idx::Idx<TAcc>
-                {
-                    using Idx = alpaka::idx::Idx<TAcc>;
+                using Idx = alpaka::Idx<TAcc>;
 
-                    alpaka::ignore_unused(blockSharedMemDyn);
-                    alpaka::ignore_unused(threadElemExtent);
-                    alpaka::ignore_unused(success);
-                    return
-                        static_cast<idx::Idx<TAcc>>(sizeof(Idx)) * blockThreadExtent.prod();
-                }
-            };
-        }
-    }
-}
+                alpaka::ignore_unused(blockSharedMemDyn);
+                alpaka::ignore_unused(threadElemExtent);
+                alpaka::ignore_unused(success);
+                return static_cast<std::size_t>(blockThreadExtent.prod()) * sizeof(Idx);
+            }
+        };
+    } // namespace traits
+} // namespace alpaka
 
 //-----------------------------------------------------------------------------
-TEMPLATE_LIST_TEST_CASE( "synchronize", "[blockSync]", alpaka::test::acc::TestAccs)
+TEMPLATE_LIST_TEST_CASE("synchronize", "[blockSync]", alpaka::test::TestAccs)
 {
     using Acc = TestType;
-    using Dim = alpaka::dim::Dim<Acc>;
-    using Idx = alpaka::idx::Idx<Acc>;
+    using Dim = alpaka::Dim<Acc>;
+    using Idx = alpaka::Idx<Acc>;
 
     alpaka::test::KernelExecutionFixture<Acc> fixture(
-        alpaka::vec::Vec<Dim, Idx>::all(static_cast<Idx>(BlockSyncTestKernel::gridThreadExtentPerDim)));
+        alpaka::Vec<Dim, Idx>::all(static_cast<Idx>(BlockSyncTestKernel::gridThreadExtentPerDim)));
 
     BlockSyncTestKernel kernel;
 
-    REQUIRE(
-        fixture(
-            kernel));
+    REQUIRE(fixture(kernel));
 }
diff --git a/thirdParty/cupla/alpaka/test/unit/block/sync/src/BlockSyncPredicate.cpp b/thirdParty/cupla/alpaka/test/unit/block/sync/src/BlockSyncPredicate.cpp
index 62c17299e2..54fed43b11 100644
--- a/thirdParty/cupla/alpaka/test/unit/block/sync/src/BlockSyncPredicate.cpp
+++ b/thirdParty/cupla/alpaka/test/unit/block/sync/src/BlockSyncPredicate.cpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Axel Huebl, Benjamin Worpitz
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -8,9 +8,8 @@
  */
 
 #include <alpaka/block/sync/Traits.hpp>
-
-#include <alpaka/test/acc/TestAccs.hpp>
 #include <alpaka/test/KernelExecutionFixture.hpp>
+#include <alpaka/test/acc/TestAccs.hpp>
 
 #include <catch2/catch.hpp>
 
@@ -20,99 +19,90 @@ class BlockSyncPredicateTestKernel
 public:
     //-----------------------------------------------------------------------------
     ALPAKA_NO_HOST_ACC_WARNING
-    template<
-        typename TAcc>
-    ALPAKA_FN_ACC auto operator()(
-        TAcc const & acc,
-        bool * success) const
-    -> void
+    template<typename TAcc>
+    ALPAKA_FN_ACC auto operator()(TAcc const& acc, bool* success) const -> void
     {
-        using Idx = alpaka::idx::Idx<TAcc>;
+        using Idx = alpaka::Idx<TAcc>;
 
         // Get the index of the current thread within the block and the block extent and map them to 1D.
-        auto const blockThreadIdx(alpaka::idx::getIdx<alpaka::Block, alpaka::Threads>(acc));
-        auto const blockThreadExtent(alpaka::workdiv::getWorkDiv<alpaka::Block, alpaka::Threads>(acc));
-        auto const blockThreadIdx1D(alpaka::idx::mapIdx<1u>(blockThreadIdx, blockThreadExtent)[0u]);
+        auto const blockThreadIdx(alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc));
+        auto const blockThreadExtent(alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc));
+        auto const blockThreadIdx1D(alpaka::mapIdx<1u>(blockThreadIdx, blockThreadExtent)[0u]);
         auto const blockThreadExtent1D(blockThreadExtent.prod());
 
-        // syncBlockThreadsPredicate<alpaka::block::sync::op::Count>
+        // syncBlockThreadsPredicate<alpaka::BlockCount>
         {
             Idx const modulus(2u);
             int const predicate(static_cast<int>(blockThreadIdx1D % modulus));
-            auto const result(alpaka::block::sync::syncBlockThreadsPredicate<alpaka::block::sync::op::Count>(acc, predicate));
+            auto const result(alpaka::syncBlockThreadsPredicate<alpaka::BlockCount>(acc, predicate));
             auto const expectedResult(static_cast<int>(blockThreadExtent1D / modulus));
             ALPAKA_CHECK(*success, expectedResult == result);
         }
         {
             Idx const modulus(3u);
             int const predicate(static_cast<int>(blockThreadIdx1D % modulus));
-            auto const result(alpaka::block::sync::syncBlockThreadsPredicate<alpaka::block::sync::op::Count>(acc, predicate));
-            auto const expectedResult(static_cast<int>(blockThreadExtent1D - ((blockThreadExtent1D + modulus - static_cast<Idx>(1u)) / modulus)));
+            auto const result(alpaka::syncBlockThreadsPredicate<alpaka::BlockCount>(acc, predicate));
+            auto const expectedResult(static_cast<int>(
+                blockThreadExtent1D - ((blockThreadExtent1D + modulus - static_cast<Idx>(1u)) / modulus)));
             ALPAKA_CHECK(*success, expectedResult == result);
         }
 
-        // syncBlockThreadsPredicate<alpaka::block::sync::op::LogicalAnd>
+        // syncBlockThreadsPredicate<alpaka::BlockAnd>
         {
             int const predicate(1);
-            auto const result(alpaka::block::sync::syncBlockThreadsPredicate<alpaka::block::sync::op::LogicalAnd>(acc, predicate));
+            auto const result(alpaka::syncBlockThreadsPredicate<alpaka::BlockAnd>(acc, predicate));
             ALPAKA_CHECK(*success, result == 1);
         }
         {
             int const predicate(0);
-            auto const result(alpaka::block::sync::syncBlockThreadsPredicate<alpaka::block::sync::op::LogicalAnd>(acc, predicate));
+            auto const result(alpaka::syncBlockThreadsPredicate<alpaka::BlockAnd>(acc, predicate));
             ALPAKA_CHECK(*success, result == 0);
         }
         {
             int const predicate(blockThreadIdx1D != 0);
-            auto const result(alpaka::block::sync::syncBlockThreadsPredicate<alpaka::block::sync::op::LogicalAnd>(acc, predicate));
+            auto const result(alpaka::syncBlockThreadsPredicate<alpaka::BlockAnd>(acc, predicate));
             ALPAKA_CHECK(*success, result == 0);
         }
 
-        // syncBlockThreadsPredicate<alpaka::block::sync::op::LogicalOr>
+        // syncBlockThreadsPredicate<alpaka::BlockOr>
         {
             int const predicate(1);
-            auto const result(alpaka::block::sync::syncBlockThreadsPredicate<alpaka::block::sync::op::LogicalOr>(acc, predicate));
+            auto const result(alpaka::syncBlockThreadsPredicate<alpaka::BlockOr>(acc, predicate));
             ALPAKA_CHECK(*success, result == 1);
         }
         {
             int const predicate(0);
-            auto const result(alpaka::block::sync::syncBlockThreadsPredicate<alpaka::block::sync::op::LogicalOr>(acc, predicate));
+            auto const result(alpaka::syncBlockThreadsPredicate<alpaka::BlockOr>(acc, predicate));
             ALPAKA_CHECK(*success, result == 0);
         }
         {
             int const predicate(static_cast<int>(blockThreadIdx1D != 1));
-            auto const result(alpaka::block::sync::syncBlockThreadsPredicate<alpaka::block::sync::op::LogicalOr>(acc, predicate));
+            auto const result(alpaka::syncBlockThreadsPredicate<alpaka::BlockOr>(acc, predicate));
             ALPAKA_CHECK(*success, result == 1);
         }
     }
 };
 
 //-----------------------------------------------------------------------------
-TEMPLATE_LIST_TEST_CASE( "synchronizePredicate", "[blockSync]", alpaka::test::acc::TestAccs)
+TEMPLATE_LIST_TEST_CASE("synchronizePredicate", "[blockSync]", alpaka::test::TestAccs)
 {
     using Acc = TestType;
-    using Dim = alpaka::dim::Dim<Acc>;
-    using Idx = alpaka::idx::Idx<Acc>;
+    using Dim = alpaka::Dim<Acc>;
+    using Idx = alpaka::Idx<Acc>;
 
     BlockSyncPredicateTestKernel kernel;
 
     // 4^Dim
     {
-        alpaka::test::KernelExecutionFixture<Acc> fixture(
-            alpaka::vec::Vec<Dim, Idx>::all(static_cast<Idx>(4u)));
+        alpaka::test::KernelExecutionFixture<Acc> fixture(alpaka::Vec<Dim, Idx>::all(static_cast<Idx>(4u)));
 
-        REQUIRE(
-            fixture(
-                kernel));
+        REQUIRE(fixture(kernel));
     }
 
     // 1^Dim
     {
-        alpaka::test::KernelExecutionFixture<Acc> fixture(
-            alpaka::vec::Vec<Dim, Idx>::ones());
+        alpaka::test::KernelExecutionFixture<Acc> fixture(alpaka::Vec<Dim, Idx>::ones());
 
-        REQUIRE(
-            fixture(
-                kernel));
+        REQUIRE(fixture(kernel));
     }
 }
diff --git a/thirdParty/cupla/alpaka/test/unit/core/CMakeLists.txt b/thirdParty/cupla/alpaka/test/unit/core/CMakeLists.txt
index 6a8a33d144..89e9d9742a 100644
--- a/thirdParty/cupla/alpaka/test/unit/core/CMakeLists.txt
+++ b/thirdParty/cupla/alpaka/test/unit/core/CMakeLists.txt
@@ -1,27 +1,24 @@
 #
-# Copyright 2018-2019 Benjamin Worpitz, Axel Huebl
+# Copyright 2018-2020 Benjamin Worpitz, Axel Huebl, Jan Stephan
 #
-# This file is part of Alpaka.
+# This file is part of alpaka.
 #
 # This Source Code Form is subject to the terms of the Mozilla Public
 # License, v. 2.0. If a copy of the MPL was not distributed with this
 # file, You can obtain one at http://mozilla.org/MPL/2.0/.
 #
 
-SET(_TARGET_NAME "core")
+set(_TARGET_NAME "core")
 
 append_recursive_files_add_to_src_group("src/" "src/" "cpp" _FILES_SOURCE)
 
-ALPAKA_ADD_EXECUTABLE(
+alpaka_add_executable(
     ${_TARGET_NAME}
     ${_FILES_SOURCE})
-TARGET_INCLUDE_DIRECTORIES(
-    ${_TARGET_NAME}
-    PRIVATE ${Boost_INCLUDE_DIRS})
-TARGET_LINK_LIBRARIES(
+target_link_libraries(
     ${_TARGET_NAME}
     PRIVATE common)
 
-SET_TARGET_PROPERTIES(${_TARGET_NAME} PROPERTIES FOLDER "test/unit")
+set_target_properties(${_TARGET_NAME} PROPERTIES FOLDER "test/unit")
 
-ADD_TEST(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME} ${_ALPAKA_TEST_OPTIONS})
+add_test(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME} ${_ALPAKA_TEST_OPTIONS})
diff --git a/thirdParty/cupla/alpaka/test/unit/core/src/BoostPredefTest.cpp b/thirdParty/cupla/alpaka/test/unit/core/src/BoostPredefTest.cpp
index 33d0c6b02f..68d3678df5 100644
--- a/thirdParty/cupla/alpaka/test/unit/core/src/BoostPredefTest.cpp
+++ b/thirdParty/cupla/alpaka/test/unit/core/src/BoostPredefTest.cpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Axel Huebl, Benjamin Worpitz, Matthias Werner
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -31,9 +31,6 @@ TEST_CASE("printDefines", "[core]")
 #if BOOST_COMP_NVCC
     std::cout << "BOOST_COMP_NVCC:" << BOOST_COMP_NVCC << std::endl;
 #endif
-#if BOOST_COMP_HCC
-    std::cout << "BOOST_COMP_HCC:" << BOOST_COMP_HCC << std::endl;
-#endif
 #if BOOST_COMP_HIP
     std::cout << "BOOST_COMP_HIP:" << BOOST_COMP_HIP << std::endl;
 #endif
@@ -43,9 +40,15 @@ TEST_CASE("printDefines", "[core]")
 #if BOOST_COMP_GNUC
     std::cout << "BOOST_COMP_GNUC:" << BOOST_COMP_GNUC << std::endl;
 #endif
+#if BOOST_COMP_INTEL
+    std::cout << "BOOST_COMP_INTEL:" << BOOST_COMP_INTEL << std::endl;
+#endif
 #if BOOST_COMP_MSVC
     std::cout << "BOOST_COMP_MSVC:" << BOOST_COMP_MSVC << std::endl;
 #endif
+#if defined(BOOST_COMP_MSVC_EMULATED)
+    std::cout << "BOOST_COMP_MSVC_EMULATED:" << BOOST_COMP_MSVC_EMULATED << std::endl;
+#endif
 #if BOOST_COMP_CLANG_CUDA
     std::cout << "BOOST_COMP_CLANG_CUDA:" << BOOST_COMP_CLANG_CUDA << std::endl;
 #endif
diff --git a/thirdParty/cupla/alpaka/test/unit/core/src/ClipCastTest.cpp b/thirdParty/cupla/alpaka/test/unit/core/src/ClipCastTest.cpp
index 19ec2792a6..c1013a097d 100644
--- a/thirdParty/cupla/alpaka/test/unit/core/src/ClipCastTest.cpp
+++ b/thirdParty/cupla/alpaka/test/unit/core/src/ClipCastTest.cpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Axel Huebl, Benjamin Worpitz
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -12,100 +12,94 @@
 #include <catch2/catch.hpp>
 
 //-----------------------------------------------------------------------------
-TEST_CASE(
-    "clipCastNoCastShouldNotChangeTheValue", "[core]")
+TEST_CASE("clipCastNoCastShouldNotChangeTheValue", "[core]")
 {
     CHECK(
-        std::numeric_limits<std::int8_t>::max() ==
-        alpaka::core::clipCast<std::int8_t>(std::numeric_limits<std::int8_t>::max()));
+        std::numeric_limits<std::int8_t>::max()
+        == alpaka::core::clipCast<std::int8_t>(std::numeric_limits<std::int8_t>::max()));
     CHECK(
-        std::numeric_limits<std::uint16_t>::min() ==
-        alpaka::core::clipCast<std::uint16_t>(std::numeric_limits<std::uint16_t>::min()));
+        std::numeric_limits<std::uint16_t>::min()
+        == alpaka::core::clipCast<std::uint16_t>(std::numeric_limits<std::uint16_t>::min()));
     CHECK(
-        std::numeric_limits<std::int32_t>::min() ==
-        alpaka::core::clipCast<std::int32_t>(std::numeric_limits<std::int32_t>::min()));
+        std::numeric_limits<std::int32_t>::min()
+        == alpaka::core::clipCast<std::int32_t>(std::numeric_limits<std::int32_t>::min()));
     CHECK(
-        std::numeric_limits<std::uint64_t>::max() ==
-        alpaka::core::clipCast<std::uint64_t>(std::numeric_limits<std::uint64_t>::max()));
+        std::numeric_limits<std::uint64_t>::max()
+        == alpaka::core::clipCast<std::uint64_t>(std::numeric_limits<std::uint64_t>::max()));
 }
 
 //-----------------------------------------------------------------------------
-TEST_CASE(
-    "clipCastUpCastEqualSigndnessShouldNotChangeTheValue", "[core]")
+TEST_CASE("clipCastUpCastEqualSigndnessShouldNotChangeTheValue", "[core]")
 {
     CHECK(
-        static_cast<std::int16_t>(std::numeric_limits<std::int8_t>::max()) ==
-        alpaka::core::clipCast<std::int16_t>(std::numeric_limits<std::int8_t>::max()));
+        static_cast<std::int16_t>(std::numeric_limits<std::int8_t>::max())
+        == alpaka::core::clipCast<std::int16_t>(std::numeric_limits<std::int8_t>::max()));
     CHECK(
-        static_cast<std::uint32_t>(std::numeric_limits<std::uint16_t>::min()) ==
-        alpaka::core::clipCast<std::uint32_t>(std::numeric_limits<std::uint16_t>::min()));
+        static_cast<std::uint32_t>(std::numeric_limits<std::uint16_t>::min())
+        == alpaka::core::clipCast<std::uint32_t>(std::numeric_limits<std::uint16_t>::min()));
     CHECK(
-        static_cast<std::int64_t>(std::numeric_limits<std::int32_t>::min()) ==
-        alpaka::core::clipCast<std::int64_t>(std::numeric_limits<std::int32_t>::min()));
+        static_cast<std::int64_t>(std::numeric_limits<std::int32_t>::min())
+        == alpaka::core::clipCast<std::int64_t>(std::numeric_limits<std::int32_t>::min()));
 }
 
 //-----------------------------------------------------------------------------
-TEST_CASE(
-    "clipCastUpCastDifferentSigndnessShouldNotChangeTheValueForPositives", "[core]")
+TEST_CASE("clipCastUpCastDifferentSigndnessShouldNotChangeTheValueForPositives", "[core]")
 {
     CHECK(
-        static_cast<std::uint16_t>(std::numeric_limits<std::int8_t>::max()) ==
-        alpaka::core::clipCast<std::uint16_t>(std::numeric_limits<std::int8_t>::max()));
+        static_cast<std::uint16_t>(std::numeric_limits<std::int8_t>::max())
+        == alpaka::core::clipCast<std::uint16_t>(std::numeric_limits<std::int8_t>::max()));
     CHECK(
-        static_cast<std::int32_t>(std::numeric_limits<std::uint16_t>::max()) ==
-        alpaka::core::clipCast<std::int32_t>(std::numeric_limits<std::uint16_t>::max()));
+        static_cast<std::int32_t>(std::numeric_limits<std::uint16_t>::max())
+        == alpaka::core::clipCast<std::int32_t>(std::numeric_limits<std::uint16_t>::max()));
     CHECK(
-        static_cast<std::uint64_t>(std::numeric_limits<std::int32_t>::max()) ==
-        alpaka::core::clipCast<std::uint64_t>(std::numeric_limits<std::int32_t>::max()));
+        static_cast<std::uint64_t>(std::numeric_limits<std::int32_t>::max())
+        == alpaka::core::clipCast<std::uint64_t>(std::numeric_limits<std::int32_t>::max()));
 }
 
 //-----------------------------------------------------------------------------
-TEST_CASE(
-    "clipCastUpCastDifferentSigndnessCanChangeTheValueForNegatives", "[core]")
+TEST_CASE("clipCastUpCastDifferentSigndnessCanChangeTheValueForNegatives", "[core]")
 {
     CHECK(
-        std::numeric_limits<std::uint16_t>::min() ==
-        alpaka::core::clipCast<std::uint16_t>(std::numeric_limits<std::int8_t>::min()));
+        std::numeric_limits<std::uint16_t>::min()
+        == alpaka::core::clipCast<std::uint16_t>(std::numeric_limits<std::int8_t>::min()));
     CHECK(
-        static_cast<std::int32_t>(std::numeric_limits<std::uint16_t>::min()) ==
-        alpaka::core::clipCast<std::int32_t>(std::numeric_limits<std::uint16_t>::min()));
+        static_cast<std::int32_t>(std::numeric_limits<std::uint16_t>::min())
+        == alpaka::core::clipCast<std::int32_t>(std::numeric_limits<std::uint16_t>::min()));
     CHECK(
-        std::numeric_limits<uint64_t>::min() ==
-        alpaka::core::clipCast<std::uint64_t>(std::numeric_limits<std::int32_t>::min()));
+        std::numeric_limits<uint64_t>::min()
+        == alpaka::core::clipCast<std::uint64_t>(std::numeric_limits<std::int32_t>::min()));
 }
 
 //-----------------------------------------------------------------------------
-TEST_CASE(
-    "clipCastDownCastEqualSigndnessCanChangeTheValue", "[core]")
+TEST_CASE("clipCastDownCastEqualSigndnessCanChangeTheValue", "[core]")
 {
     CHECK(
-        std::numeric_limits<std::uint8_t>::max() ==
-        alpaka::core::clipCast<std::uint8_t>(std::numeric_limits<std::uint16_t>::max()));
+        std::numeric_limits<std::uint8_t>::max()
+        == alpaka::core::clipCast<std::uint8_t>(std::numeric_limits<std::uint16_t>::max()));
     CHECK(
-        std::numeric_limits<std::int16_t>::min() ==
-        alpaka::core::clipCast<std::int16_t>(std::numeric_limits<std::int32_t>::min()));
+        std::numeric_limits<std::int16_t>::min()
+        == alpaka::core::clipCast<std::int16_t>(std::numeric_limits<std::int32_t>::min()));
     CHECK(
-        std::numeric_limits<std::uint16_t>::max() ==
-        alpaka::core::clipCast<std::uint16_t>(std::numeric_limits<std::uint64_t>::max()));
+        std::numeric_limits<std::uint16_t>::max()
+        == alpaka::core::clipCast<std::uint16_t>(std::numeric_limits<std::uint64_t>::max()));
     CHECK(
-        std::numeric_limits<std::int8_t>::min() ==
-        alpaka::core::clipCast<std::int8_t>(std::numeric_limits<std::int64_t>::min()));
+        std::numeric_limits<std::int8_t>::min()
+        == alpaka::core::clipCast<std::int8_t>(std::numeric_limits<std::int64_t>::min()));
 }
 
 //-----------------------------------------------------------------------------
-TEST_CASE(
-    "clipCastDownCastDifferentSigndnessCanChangeTheValue", "[core]")
+TEST_CASE("clipCastDownCastDifferentSigndnessCanChangeTheValue", "[core]")
 {
     CHECK(
-        std::numeric_limits<std::int8_t>::max() ==
-        alpaka::core::clipCast<std::int8_t>(std::numeric_limits<std::uint16_t>::max()));
+        std::numeric_limits<std::int8_t>::max()
+        == alpaka::core::clipCast<std::int8_t>(std::numeric_limits<std::uint16_t>::max()));
     CHECK(
-        std::numeric_limits<std::uint16_t>::min() ==
-        alpaka::core::clipCast<std::uint16_t>(std::numeric_limits<std::int32_t>::min()));
+        std::numeric_limits<std::uint16_t>::min()
+        == alpaka::core::clipCast<std::uint16_t>(std::numeric_limits<std::int32_t>::min()));
     CHECK(
-        static_cast<std::int16_t>(std::numeric_limits<std::uint64_t>::min()) ==
-        alpaka::core::clipCast<std::int16_t>(std::numeric_limits<std::uint64_t>::min()));
+        static_cast<std::int16_t>(std::numeric_limits<std::uint64_t>::min())
+        == alpaka::core::clipCast<std::int16_t>(std::numeric_limits<std::uint64_t>::min()));
     CHECK(
-        std::numeric_limits<std::uint8_t>::max() ==
-        alpaka::core::clipCast<std::uint8_t>(std::numeric_limits<std::int64_t>::max()));
+        std::numeric_limits<std::uint8_t>::max()
+        == alpaka::core::clipCast<std::uint8_t>(std::numeric_limits<std::int64_t>::max()));
 }
diff --git a/thirdParty/cupla/alpaka/test/unit/core/src/ConceptsTest.cpp b/thirdParty/cupla/alpaka/test/unit/core/src/ConceptsTest.cpp
index 4e656111a4..d5a30edaec 100644
--- a/thirdParty/cupla/alpaka/test/unit/core/src/ConceptsTest.cpp
+++ b/thirdParty/cupla/alpaka/test/unit/core/src/ConceptsTest.cpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Axel Huebl, Benjamin Worpitz
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -13,8 +13,12 @@
 
 #include <type_traits>
 
-struct ConceptExample;
-struct ConceptNonMatchingExample;
+struct ConceptExample
+{
+};
+struct ConceptNonMatchingExample
+{
+};
 
 struct ImplementerNotTagged
 {
@@ -25,8 +29,7 @@ struct ImplementerNotTaggedButNonMatchingTagged
 {
 };
 
-struct ImplementerTagged
-    : public alpaka::concepts::Implements<ConceptExample, ImplementerTagged>
+struct ImplementerTagged : public alpaka::concepts::Implements<ConceptExample, ImplementerTagged>
 {
 };
 
@@ -36,13 +39,11 @@ struct ImplementerTaggedButAlsoNonMatchingTagged
 {
 };
 
-struct ImplementerWithTaggedBase
-    : public ImplementerTagged
+struct ImplementerWithTaggedBase : public ImplementerTagged
 {
 };
 
-struct ImplementerWithTaggedBaseAlsoNonMatchingTagged
-    : public ImplementerTaggedButAlsoNonMatchingTagged
+struct ImplementerWithTaggedBaseAlsoNonMatchingTagged : public ImplementerTaggedButAlsoNonMatchingTagged
 {
 };
 
@@ -66,118 +67,96 @@ struct ImplementerNonMatchingTaggedTaggedToBase
 };
 
 //-----------------------------------------------------------------------------
-TEST_CASE("ImplementerNotTagged", "[meta]")
+TEST_CASE("ImplementerNotTagged", "[core]")
 {
     using ImplementationBase = alpaka::concepts::ImplementationBase<ConceptExample, ImplementerNotTagged>;
 
     static_assert(
-        std::is_same<
-            ImplementerNotTagged,
-            ImplementationBase
-        >::value,
-        "alpaka::meta::ImplementationBase failed!");
+        std::is_same<ImplementerNotTagged, ImplementationBase>::value,
+        "alpaka::concepts::ImplementationBase failed!");
 }
 
 //-----------------------------------------------------------------------------
-TEST_CASE("ImplementerNotTaggedButNonMatchingTagged", "[meta]")
+TEST_CASE("ImplementerNotTaggedButNonMatchingTagged", "[core]")
 {
-    using ImplementationBase = alpaka::concepts::ImplementationBase<ConceptExample, ImplementerNotTaggedButNonMatchingTagged>;
+    using ImplementationBase
+        = alpaka::concepts::ImplementationBase<ConceptExample, ImplementerNotTaggedButNonMatchingTagged>;
 
     static_assert(
-        std::is_same<
-            ImplementerNotTaggedButNonMatchingTagged,
-            ImplementationBase
-        >::value,
-        "alpaka::meta::ImplementationBase failed!");
+        std::is_same<ImplementerNotTaggedButNonMatchingTagged, ImplementationBase>::value,
+        "alpaka::concepts::ImplementationBase failed!");
 }
 
 //-----------------------------------------------------------------------------
-TEST_CASE("ImplementerTagged", "[meta]")
+TEST_CASE("ImplementerTagged", "[core]")
 {
     using ImplementationBase = alpaka::concepts::ImplementationBase<ConceptExample, ImplementerTagged>;
 
     static_assert(
-        std::is_same<
-            ImplementerTagged,
-            ImplementationBase
-        >::value,
-        "alpaka::meta::ImplementationBase failed!");
+        std::is_same<ImplementerTagged, ImplementationBase>::value,
+        "alpaka::concepts::ImplementationBase failed!");
 }
 
 //-----------------------------------------------------------------------------
-TEST_CASE("ImplementerTaggedButAlsoNonMatchingTagged", "[meta]")
+TEST_CASE("ImplementerTaggedButAlsoNonMatchingTagged", "[core]")
 {
-    using ImplementationBase = alpaka::concepts::ImplementationBase<ConceptExample, ImplementerTaggedButAlsoNonMatchingTagged>;
+    using ImplementationBase
+        = alpaka::concepts::ImplementationBase<ConceptExample, ImplementerTaggedButAlsoNonMatchingTagged>;
 
     static_assert(
-        std::is_same<
-            ImplementerTaggedButAlsoNonMatchingTagged,
-            ImplementationBase
-        >::value,
-        "alpaka::meta::ImplementationBase failed!");
+        std::is_same<ImplementerTaggedButAlsoNonMatchingTagged, ImplementationBase>::value,
+        "alpaka::concepts::ImplementationBase failed!");
 }
 
 //-----------------------------------------------------------------------------
-TEST_CASE("ImplementerWithTaggedBaseAlsoNonMatchingTagged", "[meta]")
+TEST_CASE("ImplementerWithTaggedBaseAlsoNonMatchingTagged", "[core]")
 {
-    using ImplementationBase = alpaka::concepts::ImplementationBase<ConceptExample, ImplementerWithTaggedBaseAlsoNonMatchingTagged>;
+    using ImplementationBase
+        = alpaka::concepts::ImplementationBase<ConceptExample, ImplementerWithTaggedBaseAlsoNonMatchingTagged>;
 
     static_assert(
-        std::is_same<
-            ImplementerTaggedButAlsoNonMatchingTagged,
-            ImplementationBase
-        >::value,
-        "alpaka::meta::ImplementationBase failed!");
+        std::is_same<ImplementerTaggedButAlsoNonMatchingTagged, ImplementationBase>::value,
+        "alpaka::concepts::ImplementationBase failed!");
 }
 
 //-----------------------------------------------------------------------------
-TEST_CASE("ImplementerWithTaggedBase", "[meta]")
+TEST_CASE("ImplementerWithTaggedBase", "[core]")
 {
     using ImplementationBase = alpaka::concepts::ImplementationBase<ConceptExample, ImplementerWithTaggedBase>;
 
     static_assert(
-        std::is_same<
-            ImplementerTagged,
-            ImplementationBase
-        >::value,
-        "alpaka::meta::ImplementationBase failed!");
+        std::is_same<ImplementerTagged, ImplementationBase>::value,
+        "alpaka::concepts::ImplementationBase failed!");
 }
 
 //-----------------------------------------------------------------------------
-TEST_CASE("ImplementerTaggedToBase", "[meta]")
+TEST_CASE("ImplementerTaggedToBase", "[core]")
 {
     using ImplementationBase = alpaka::concepts::ImplementationBase<ConceptExample, ImplementerTaggedToBase>;
 
     static_assert(
-        std::is_same<
-            ImplementerNotTagged,
-            ImplementationBase
-        >::value,
-        "alpaka::meta::ImplementationBase failed!");
+        std::is_same<ImplementerNotTagged, ImplementationBase>::value,
+        "alpaka::concepts::ImplementationBase failed!");
 }
 
 //-----------------------------------------------------------------------------
-TEST_CASE("ImplementerTaggedToBaseAlsoNonMatchingTagged", "[meta]")
+TEST_CASE("ImplementerTaggedToBaseAlsoNonMatchingTagged", "[core]")
 {
-    using ImplementationBase = alpaka::concepts::ImplementationBase<ConceptExample, ImplementerTaggedToBaseAlsoNonMatchingTagged>;
+    using ImplementationBase
+        = alpaka::concepts::ImplementationBase<ConceptExample, ImplementerTaggedToBaseAlsoNonMatchingTagged>;
 
     static_assert(
-        std::is_same<
-            ImplementerNotTaggedButNonMatchingTagged,
-            ImplementationBase
-        >::value,
-        "alpaka::meta::ImplementationBase failed!");
+        std::is_same<ImplementerNotTaggedButNonMatchingTagged, ImplementationBase>::value,
+        "alpaka::concepts::ImplementationBase failed!");
 }
 
 //-----------------------------------------------------------------------------
-TEST_CASE("ImplementerNonMatchingTaggedTaggedToBase", "[meta]")
+TEST_CASE("ImplementerNonMatchingTaggedTaggedToBase", "[core]")
 {
-    using ImplementationBase = alpaka::concepts::ImplementationBase<ConceptExample, ImplementerNonMatchingTaggedTaggedToBase>;
+    using ImplementationBase
+        = alpaka::concepts::ImplementationBase<ConceptExample, ImplementerNonMatchingTaggedTaggedToBase>;
 
     static_assert(
-        std::is_same<
-            ImplementerNotTagged,
-            ImplementationBase
-        >::value,
-        "alpaka::meta::ImplementationBase failed!");
+        std::is_same<ImplementerNotTagged, ImplementationBase>::value,
+        "alpaka::concepts::ImplementationBase failed!");
 }
diff --git a/thirdParty/cupla/alpaka/test/unit/core/src/OmpScheduleTest.cpp b/thirdParty/cupla/alpaka/test/unit/core/src/OmpScheduleTest.cpp
new file mode 100644
index 0000000000..73f4f79635
--- /dev/null
+++ b/thirdParty/cupla/alpaka/test/unit/core/src/OmpScheduleTest.cpp
@@ -0,0 +1,72 @@
+/* Copyright 2020 Sergei Bastrakov
+ *
+ * This file is part of alpaka.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#include <alpaka/core/OmpSchedule.hpp>
+#include <alpaka/core/Unused.hpp>
+
+#include <catch2/catch.hpp>
+
+//-----------------------------------------------------------------------------
+TEST_CASE("ompScheduleDefaultConstructor", "[core]")
+{
+    auto const schedule = alpaka::omp::Schedule{};
+    alpaka::ignore_unused(schedule);
+}
+
+//-----------------------------------------------------------------------------
+TEST_CASE("ompScheduleConstructor", "[core]")
+{
+    auto const staticSchedule = alpaka::omp::Schedule{alpaka::omp::Schedule::Static, 5};
+    alpaka::ignore_unused(staticSchedule);
+
+    auto const guidedSchedule = alpaka::omp::Schedule{alpaka::omp::Schedule::Guided};
+    alpaka::ignore_unused(guidedSchedule);
+}
+
+//-----------------------------------------------------------------------------
+TEST_CASE("ompScheduleConstexprConstructor", "[core]")
+{
+    constexpr auto schedule = alpaka::omp::Schedule{alpaka::omp::Schedule::Dynamic};
+    alpaka::ignore_unused(schedule);
+}
+
+//-----------------------------------------------------------------------------
+TEST_CASE("ompGetSchedule", "[core]")
+{
+    auto const schedule = alpaka::omp::getSchedule();
+    alpaka::ignore_unused(schedule);
+}
+
+//-----------------------------------------------------------------------------
+TEST_CASE("ompSetSchedule", "[core]")
+{
+    auto const expectedSchedule = alpaka::omp::Schedule{alpaka::omp::Schedule::Dynamic, 3};
+    alpaka::omp::setSchedule(expectedSchedule);
+    // The check makes sense only when this feature is supported
+#if defined _OPENMP && _OPENMP >= 200805 && ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLED
+    auto const actualSchedule = alpaka::omp::getSchedule();
+    REQUIRE(expectedSchedule.kind == actualSchedule.kind);
+    REQUIRE(expectedSchedule.chunkSize == actualSchedule.chunkSize);
+#endif
+}
+
+//-----------------------------------------------------------------------------
+TEST_CASE("ompSetNoSchedule", "[core]")
+{
+    auto const expectedSchedule = alpaka::omp::Schedule{alpaka::omp::Schedule::Guided, 2};
+    alpaka::omp::setSchedule(expectedSchedule);
+    auto const noSchedule = alpaka::omp::Schedule{alpaka::omp::Schedule::NoSchedule};
+    alpaka::omp::setSchedule(noSchedule);
+    // Expect the Guided schedule set first to still be reported; checkable only when this feature is supported
+#if defined _OPENMP && _OPENMP >= 200805 && ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLED
+    auto const actualSchedule = alpaka::omp::getSchedule();
+    REQUIRE(expectedSchedule.kind == actualSchedule.kind);
+    REQUIRE(expectedSchedule.chunkSize == actualSchedule.chunkSize);
+#endif
+}
diff --git a/thirdParty/cupla/alpaka/test/unit/dev/CMakeLists.txt b/thirdParty/cupla/alpaka/test/unit/dev/CMakeLists.txt
new file mode 100644
index 0000000000..55d8831066
--- /dev/null
+++ b/thirdParty/cupla/alpaka/test/unit/dev/CMakeLists.txt
@@ -0,0 +1,24 @@
+#
+# Copyright 2014-2020 Benjamin Worpitz, Axel Huebl, Jan Stephan
+#
+# This file is part of alpaka.
+#
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+#
+
+set(_TARGET_NAME "dev")
+
+append_recursive_files_add_to_src_group("src/" "src/" "cpp" _FILES_SOURCE)
+
+alpaka_add_executable(
+    ${_TARGET_NAME}
+    ${_FILES_SOURCE})
+target_link_libraries(
+    ${_TARGET_NAME}
+    PRIVATE common)
+
+set_target_properties(${_TARGET_NAME} PROPERTIES FOLDER "test/unit")
+
+add_test(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME} ${_ALPAKA_TEST_OPTIONS})
diff --git a/thirdParty/cupla/alpaka/test/unit/dev/src/DevWarpSizeTest.cpp b/thirdParty/cupla/alpaka/test/unit/dev/src/DevWarpSizeTest.cpp
new file mode 100644
index 0000000000..269afd2a9c
--- /dev/null
+++ b/thirdParty/cupla/alpaka/test/unit/dev/src/DevWarpSizeTest.cpp
@@ -0,0 +1,23 @@
+/* Copyright 2020 Sergei Bastrakov
+ *
+ * This file is part of alpaka.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#include <alpaka/dev/Traits.hpp>
+#include <alpaka/test/acc/TestAccs.hpp>
+
+#include <catch2/catch.hpp>
+
+//-----------------------------------------------------------------------------
+TEMPLATE_LIST_TEST_CASE("getWarpSize", "[dev]", alpaka::test::TestAccs)
+{
+    using Dev = alpaka::Dev<TestType>;
+    using Pltf = alpaka::Pltf<Dev>;
+    Dev const dev(alpaka::getDevByIdx<Pltf>(0u));
+    auto const warpExtent = alpaka::getWarpSize(dev);
+    REQUIRE(warpExtent > 0);
+}
diff --git a/thirdParty/cupla/alpaka/test/unit/event/CMakeLists.txt b/thirdParty/cupla/alpaka/test/unit/event/CMakeLists.txt
index d0ebb51977..055a388e68 100644
--- a/thirdParty/cupla/alpaka/test/unit/event/CMakeLists.txt
+++ b/thirdParty/cupla/alpaka/test/unit/event/CMakeLists.txt
@@ -1,27 +1,24 @@
 #
-# Copyright 2017-2019 Benjamin Worpitz, Axel Huebl
+# Copyright 2017-2020 Benjamin Worpitz, Axel Huebl, Jan Stephan
 #
-# This file is part of Alpaka.
+# This file is part of alpaka.
 #
 # This Source Code Form is subject to the terms of the Mozilla Public
 # License, v. 2.0. If a copy of the MPL was not distributed with this
 # file, You can obtain one at http://mozilla.org/MPL/2.0/.
 #
 
-SET(_TARGET_NAME "event")
+set(_TARGET_NAME "event")
 
 append_recursive_files_add_to_src_group("src/" "src/" "cpp" _FILES_SOURCE)
 
-ALPAKA_ADD_EXECUTABLE(
+alpaka_add_executable(
     ${_TARGET_NAME}
     ${_FILES_SOURCE})
-TARGET_INCLUDE_DIRECTORIES(
-    ${_TARGET_NAME}
-    PRIVATE ${Boost_INCLUDE_DIRS})
-TARGET_LINK_LIBRARIES(
+target_link_libraries(
     ${_TARGET_NAME}
     PRIVATE common)
 
-SET_TARGET_PROPERTIES(${_TARGET_NAME} PROPERTIES FOLDER "test/unit")
+set_target_properties(${_TARGET_NAME} PROPERTIES FOLDER "test/unit")
 
-ADD_TEST(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME} ${_ALPAKA_TEST_OPTIONS})
+add_test(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME} ${_ALPAKA_TEST_OPTIONS})
diff --git a/thirdParty/cupla/alpaka/test/unit/event/src/EventTest.cpp b/thirdParty/cupla/alpaka/test/unit/event/src/EventTest.cpp
index af6b70c985..b2331f9d0d 100644
--- a/thirdParty/cupla/alpaka/test/unit/event/src/EventTest.cpp
+++ b/thirdParty/cupla/alpaka/test/unit/event/src/EventTest.cpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Axel Huebl, Benjamin Worpitz, René Widera
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -8,204 +8,206 @@
  */
 
 #include <alpaka/event/Traits.hpp>
-
 #include <alpaka/test/event/EventHostManualTrigger.hpp>
 #include <alpaka/test/queue/Queue.hpp>
-#include <alpaka/test/queue/QueueTestFixture.hpp>
 #include <alpaka/test/queue/QueueCpuOmp2Collective.hpp>
+#include <alpaka/test/queue/QueueTestFixture.hpp>
 
 #include <catch2/catch.hpp>
 
 using TestQueues = alpaka::meta::Concatenate<
-        alpaka::test::queue::TestQueues
- #ifdef ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLED
-        ,
-        std::tuple<std::tuple<alpaka::dev::DevCpu, alpaka::queue::QueueCpuOmp2Collective>>
+    alpaka::test::TestQueues
+#ifdef ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLED
+    ,
+    std::tuple<std::tuple<alpaka::DevCpu, alpaka::QueueCpuOmp2Collective>>
 #endif
     >;
 
 //-----------------------------------------------------------------------------
-TEMPLATE_LIST_TEST_CASE( "eventTestShouldInitiallyBeTrue", "[event]", TestQueues)
+TEMPLATE_LIST_TEST_CASE("eventTestShouldInitiallyBeTrue", "[event]", TestQueues)
 {
     using DevQueue = TestType;
-    using Fixture = alpaka::test::queue::QueueTestFixture<DevQueue>;
+    using Fixture = alpaka::test::QueueTestFixture<DevQueue>;
     using Queue = typename Fixture::Queue;
 
     Fixture f;
-    alpaka::event::Event<Queue> event(f.m_dev);
+    alpaka::Event<Queue> event(f.m_dev);
 
-    REQUIRE(alpaka::event::test(event));
+    REQUIRE(alpaka::isComplete(event));
 }
 
 //-----------------------------------------------------------------------------
-TEMPLATE_LIST_TEST_CASE( "eventTestShouldBeFalseWhileInQueueAndTrueAfterBeingProcessed", "[event]", TestQueues)
+TEMPLATE_LIST_TEST_CASE("eventTestShouldBeFalseWhileInQueueAndTrueAfterBeingProcessed", "[event]", TestQueues)
 {
     using DevQueue = TestType;
-    using Fixture = alpaka::test::queue::QueueTestFixture<DevQueue>;
+    using Fixture = alpaka::test::QueueTestFixture<DevQueue>;
     using Queue = typename Fixture::Queue;
     using Dev = typename Fixture::Dev;
 
     Fixture f1;
-    if(alpaka::test::event::isEventHostManualTriggerSupported(f1.m_dev))
+    if(alpaka::test::isEventHostManualTriggerSupported(f1.m_dev))
     {
         auto q1 = f1.m_queue;
-        alpaka::event::Event<Queue> e1(f1.m_dev);
-        alpaka::test::event::EventHostManualTrigger<Dev> k1(f1.m_dev);
+        alpaka::Event<Queue> e1(f1.m_dev);
+        alpaka::test::EventHostManualTrigger<Dev> k1(f1.m_dev);
 
-        if(!alpaka::test::queue::IsBlockingQueue<Queue>::value)
+        if(!alpaka::test::IsBlockingQueue<Queue>::value)
         {
-            alpaka::queue::enqueue(q1, k1);
+            alpaka::enqueue(q1, k1);
         }
 
-        alpaka::queue::enqueue(q1, e1);
+        alpaka::enqueue(q1, e1);
 
-        if(!alpaka::test::queue::IsBlockingQueue<Queue>::value)
+        if(!alpaka::test::IsBlockingQueue<Queue>::value)
         {
-            REQUIRE(alpaka::event::test(e1) == false);
+            REQUIRE(alpaka::isComplete(e1) == false);
 
             k1.trigger();
 
-            alpaka::wait::wait(q1);
+            alpaka::wait(q1);
         }
 
-        REQUIRE(alpaka::event::test(e1));
+        REQUIRE(alpaka::isComplete(e1));
     }
     else
     {
-        std::cerr << "Can not execute test because CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS is not supported!" << std::endl;
+        std::cerr << "Can not execute test because CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS is not supported!"
+                  << std::endl;
     }
 }
 
 //-----------------------------------------------------------------------------
-TEMPLATE_LIST_TEST_CASE( "eventReEnqueueShouldBePossibleIfNobodyWaitsFor", "[event]", TestQueues)
+TEMPLATE_LIST_TEST_CASE("eventReEnqueueShouldBePossibleIfNobodyWaitsFor", "[event]", TestQueues)
 {
     using DevQueue = TestType;
-    using Fixture = alpaka::test::queue::QueueTestFixture<DevQueue>;
+    using Fixture = alpaka::test::QueueTestFixture<DevQueue>;
     using Queue = typename Fixture::Queue;
     using Dev = typename Fixture::Dev;
 
-    if(!alpaka::test::queue::IsBlockingQueue<Queue>::value)
+    if(!alpaka::test::IsBlockingQueue<Queue>::value)
     {
         Fixture f1;
-        if(alpaka::test::event::isEventHostManualTriggerSupported(f1.m_dev))
+        if(alpaka::test::isEventHostManualTriggerSupported(f1.m_dev))
         {
             auto q1 = f1.m_queue;
-            alpaka::event::Event<Queue> e1(f1.m_dev);
-            alpaka::test::event::EventHostManualTrigger<Dev> k1(f1.m_dev);
-            alpaka::test::event::EventHostManualTrigger<Dev> k2(f1.m_dev);
+            alpaka::Event<Queue> e1(f1.m_dev);
+            alpaka::test::EventHostManualTrigger<Dev> k1(f1.m_dev);
+            alpaka::test::EventHostManualTrigger<Dev> k2(f1.m_dev);
 
             // q1 = [k1]
-            alpaka::queue::enqueue(q1, k1);
-            REQUIRE(!alpaka::event::test(k1));
+            alpaka::enqueue(q1, k1);
+            REQUIRE(!alpaka::isComplete(k1));
 
             // q1 = [k1, e1]
-            alpaka::queue::enqueue(q1, e1);
-            REQUIRE(!alpaka::event::test(k1));
-            REQUIRE(!alpaka::event::test(e1));
+            alpaka::enqueue(q1, e1);
+            REQUIRE(!alpaka::isComplete(k1));
+            REQUIRE(!alpaka::isComplete(e1));
 
             // q1 = [k1, e1, k2]
-            alpaka::queue::enqueue(q1, k2);
-            REQUIRE(!alpaka::event::test(k1));
-            REQUIRE(!alpaka::event::test(e1));
-            REQUIRE(!alpaka::event::test(k2));
+            alpaka::enqueue(q1, k2);
+            REQUIRE(!alpaka::isComplete(k1));
+            REQUIRE(!alpaka::isComplete(e1));
+            REQUIRE(!alpaka::isComplete(k2));
 
             // re-enqueue should be possible
             // q1 = [k1, k2, e1]
-            alpaka::queue::enqueue(q1, e1);
-            REQUIRE(!alpaka::event::test(k1));
-            REQUIRE(!alpaka::event::test(k2));
-            REQUIRE(!alpaka::event::test(e1));
+            alpaka::enqueue(q1, e1);
+            REQUIRE(!alpaka::isComplete(k1));
+            REQUIRE(!alpaka::isComplete(k2));
+            REQUIRE(!alpaka::isComplete(e1));
 
             // q1 = [k2, e1]
             k1.trigger();
-            REQUIRE(alpaka::event::test(k1));
-            REQUIRE(!alpaka::event::test(k2));
-            REQUIRE(!alpaka::event::test(e1));
+            REQUIRE(alpaka::isComplete(k1));
+            REQUIRE(!alpaka::isComplete(k2));
+            REQUIRE(!alpaka::isComplete(e1));
 
             // q1 = [e1]
             k2.trigger();
-            REQUIRE(alpaka::event::test(k2));
-            alpaka::wait::wait(e1);
-            REQUIRE(alpaka::event::test(e1));
+            REQUIRE(alpaka::isComplete(k2));
+            alpaka::wait(e1);
+            REQUIRE(alpaka::isComplete(e1));
         }
         else
         {
-            std::cerr << "Can not execute test because CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS is not supported!" << std::endl;
+            std::cerr << "Can not execute test because CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS is not supported!"
+                      << std::endl;
         }
     }
 }
 
 //-----------------------------------------------------------------------------
-TEMPLATE_LIST_TEST_CASE( "eventReEnqueueShouldBePossibleIfSomeoneWaitsFor", "[event]", TestQueues)
+TEMPLATE_LIST_TEST_CASE("eventReEnqueueShouldBePossibleIfSomeoneWaitsFor", "[event]", TestQueues)
 {
     using DevQueue = TestType;
-    using Fixture = alpaka::test::queue::QueueTestFixture<DevQueue>;
+    using Fixture = alpaka::test::QueueTestFixture<DevQueue>;
     using Queue = typename Fixture::Queue;
     using Dev = typename Fixture::Dev;
 
-    if(!alpaka::test::queue::IsBlockingQueue<Queue>::value)
+    if(!alpaka::test::IsBlockingQueue<Queue>::value)
     {
         Fixture f1;
         Fixture f2;
-        if(alpaka::test::event::isEventHostManualTriggerSupported(f1.m_dev)
-            && alpaka::test::event::isEventHostManualTriggerSupported(f2.m_dev))
+        if(alpaka::test::isEventHostManualTriggerSupported(f1.m_dev)
+           && alpaka::test::isEventHostManualTriggerSupported(f2.m_dev))
         {
             auto q1 = f1.m_queue;
             auto q2 = f2.m_queue;
-            alpaka::event::Event<Queue> e1(f1.m_dev);
-            alpaka::event::Event<Queue> e2(f2.m_dev);
-            alpaka::test::event::EventHostManualTrigger<Dev> k1(f1.m_dev);
-            alpaka::test::event::EventHostManualTrigger<Dev> k2(f1.m_dev);
+            alpaka::Event<Queue> e1(f1.m_dev);
+            alpaka::Event<Queue> e2(f2.m_dev);
+            alpaka::test::EventHostManualTrigger<Dev> k1(f1.m_dev);
+            alpaka::test::EventHostManualTrigger<Dev> k2(f1.m_dev);
 
             // q1 = [k1]
-            alpaka::queue::enqueue(q1, k1);
-            REQUIRE(!alpaka::event::test(k1));
+            alpaka::enqueue(q1, k1);
+            REQUIRE(!alpaka::isComplete(k1));
 
             // q1 = [k1, e1]
-            alpaka::queue::enqueue(q1, e1);
-            REQUIRE(!alpaka::event::test(k1));
-            REQUIRE(!alpaka::event::test(e1));
+            alpaka::enqueue(q1, e1);
+            REQUIRE(!alpaka::isComplete(k1));
+            REQUIRE(!alpaka::isComplete(e1));
 
             // q1 = [k1, e1, k2]
-            alpaka::queue::enqueue(q1, k2);
-            REQUIRE(!alpaka::event::test(k1));
-            REQUIRE(!alpaka::event::test(e1));
-            REQUIRE(!alpaka::event::test(k2));
+            alpaka::enqueue(q1, k2);
+            REQUIRE(!alpaka::isComplete(k1));
+            REQUIRE(!alpaka::isComplete(e1));
+            REQUIRE(!alpaka::isComplete(k2));
 
             // wait for e1
             // q2 = [->e1]
-            alpaka::wait::wait(q2, e1);
+            alpaka::wait(q2, e1);
 
             // q2 = [->e1, e2]
-            alpaka::queue::enqueue(q2, e2);
-            REQUIRE(!alpaka::event::test(e2));
+            alpaka::enqueue(q2, e2);
+            REQUIRE(!alpaka::isComplete(e2));
 
             // re-enqueue should be possible
             // q1 = [k1, e1-old, k2, e1]
-            alpaka::queue::enqueue(q1, e1);
-            REQUIRE(!alpaka::event::test(k1));
-            REQUIRE(!alpaka::event::test(k2));
-            REQUIRE(!alpaka::event::test(e1));
-            REQUIRE(!alpaka::event::test(e2));
+            alpaka::enqueue(q1, e1);
+            REQUIRE(!alpaka::isComplete(k1));
+            REQUIRE(!alpaka::isComplete(k2));
+            REQUIRE(!alpaka::isComplete(e1));
+            REQUIRE(!alpaka::isComplete(e2));
 
             // q1 = [k2, e1]
             k1.trigger();
-            REQUIRE(alpaka::event::test(k1));
-            REQUIRE(!alpaka::event::test(k2));
-            REQUIRE(!alpaka::event::test(e1));
-            REQUIRE(!alpaka::event::test(e2));
+            REQUIRE(alpaka::isComplete(k1));
+            REQUIRE(!alpaka::isComplete(k2));
+            REQUIRE(!alpaka::isComplete(e1));
+            REQUIRE(!alpaka::isComplete(e2));
 
             // q1 = [e1]
             k2.trigger();
-            REQUIRE(alpaka::event::test(k2));
-            alpaka::wait::wait(e1);
-            REQUIRE(alpaka::event::test(e1));
-            alpaka::wait::wait(e2);
-            REQUIRE(alpaka::event::test(e2));
+            REQUIRE(alpaka::isComplete(k2));
+            alpaka::wait(e1);
+            REQUIRE(alpaka::isComplete(e1));
+            alpaka::wait(e2);
+            REQUIRE(alpaka::isComplete(e2));
         }
         else
         {
-            std::cerr << "Can not execute test because CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS is not supported!" << std::endl;
+            std::cerr << "Can not execute test because CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS is not supported!"
+                      << std::endl;
         }
     }
 }
@@ -213,40 +215,40 @@ TEMPLATE_LIST_TEST_CASE( "eventReEnqueueShouldBePossibleIfSomeoneWaitsFor", "[ev
 
 //-----------------------------------------------------------------------------
 // github issue #388
-TEMPLATE_LIST_TEST_CASE( "waitForEventThatAlreadyFinishedShouldBeSkipped", "[event]", TestQueues)
+TEMPLATE_LIST_TEST_CASE("waitForEventThatAlreadyFinishedShouldBeSkipped", "[event]", TestQueues)
 {
     using DevQueue = TestType;
-    using Fixture = alpaka::test::queue::QueueTestFixture<DevQueue>;
+    using Fixture = alpaka::test::QueueTestFixture<DevQueue>;
     using Queue = typename Fixture::Queue;
     using Dev = typename Fixture::Dev;
 
-    if(!alpaka::test::queue::IsBlockingQueue<Queue>::value)
+    if(!alpaka::test::IsBlockingQueue<Queue>::value)
     {
         Fixture f1;
         Fixture f2;
-        if(alpaka::test::event::isEventHostManualTriggerSupported(f1.m_dev)
-            && alpaka::test::event::isEventHostManualTriggerSupported(f2.m_dev))
+        if(alpaka::test::isEventHostManualTriggerSupported(f1.m_dev)
+           && alpaka::test::isEventHostManualTriggerSupported(f2.m_dev))
         {
             auto q1 = f1.m_queue;
             auto q2 = f2.m_queue;
-            alpaka::test::event::EventHostManualTrigger<Dev> k1(f1.m_dev);
-            alpaka::test::event::EventHostManualTrigger<Dev> k2(f2.m_dev);
-            alpaka::event::Event<Queue> e1(f1.m_dev);
+            alpaka::test::EventHostManualTrigger<Dev> k1(f1.m_dev);
+            alpaka::test::EventHostManualTrigger<Dev> k2(f2.m_dev);
+            alpaka::Event<Queue> e1(f1.m_dev);
 
             // 1. kernel k1 is enqueued into queue q1
             // q1 = [k1]
-            alpaka::queue::enqueue(q1, k1);
+            alpaka::enqueue(q1, k1);
             // 2. kernel k2 is enqueued into queue q2
             // q2 = [k2]
-            alpaka::queue::enqueue(q2, k2);
+            alpaka::enqueue(q2, k2);
 
             // 3. event e1 is enqueued into queue q1
             // q1 = [k1, e1]
-            alpaka::queue::enqueue(q1, e1);
+            alpaka::enqueue(q1, e1);
 
             // 4. q2 waits for e1
             // q2 = [k2, ->e1]
-            alpaka::wait::wait(q2, e1);
+            alpaka::wait(q2, e1);
 
             // 5. kernel k1 finishes
             // q1 = [e1]
@@ -254,28 +256,29 @@ TEMPLATE_LIST_TEST_CASE( "waitForEventThatAlreadyFinishedShouldBeSkipped", "[eve
 
             // 6. e1 is finished
             // q1 = []
-            alpaka::wait::wait(e1);
-            REQUIRE(alpaka::event::test(e1));
+            alpaka::wait(e1);
+            REQUIRE(alpaka::isComplete(e1));
 
             // 7. e1 is re-enqueued again but this time into q2
             // q2 = [k2, ->e1, e1]
-            alpaka::queue::enqueue(q2, e1);
+            alpaka::enqueue(q2, e1);
 
             // 8. kernel k2 finishes
             // q2 = [->e1, e1]
             k2.trigger();
 
-            // 9. e1 had already been signaled so there should not be waited even though the event is now reused within q2 and its current state is 'unfinished' again.
-            // q2 = [e1]
+            // 9. e1 had already been signaled, so q2 must not wait for it even though the event is now reused
+            // within q2 and its current state is 'unfinished' again. Resulting state: q2 = [e1]
 
             // Both queues should successfully finish
-            alpaka::wait::wait(q1);
+            alpaka::wait(q1);
             // q2 = []
-            alpaka::wait::wait(q2);
+            alpaka::wait(q2);
         }
         else
         {
-            std::cerr << "Can not execute test because CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS is not supported!" << std::endl;
+            std::cerr << "Can not execute test because CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS is not supported!"
+                      << std::endl;
         }
     }
 }
diff --git a/thirdParty/cupla/alpaka/test/unit/idx/CMakeLists.txt b/thirdParty/cupla/alpaka/test/unit/idx/CMakeLists.txt
index 5ca5024141..f1df3296dd 100644
--- a/thirdParty/cupla/alpaka/test/unit/idx/CMakeLists.txt
+++ b/thirdParty/cupla/alpaka/test/unit/idx/CMakeLists.txt
@@ -1,27 +1,24 @@
 #
-# Copyright 2017-2019 Benjamin Worpitz, Axel Huebl
+# Copyright 2017-2020 Benjamin Worpitz, Axel Huebl, Jan Stephan
 #
-# This file is part of Alpaka.
+# This file is part of alpaka.
 #
 # This Source Code Form is subject to the terms of the Mozilla Public
 # License, v. 2.0. If a copy of the MPL was not distributed with this
 # file, You can obtain one at http://mozilla.org/MPL/2.0/.
 #
 
-SET(_TARGET_NAME "idx")
+set(_TARGET_NAME "idx")
 
 append_recursive_files_add_to_src_group("src/" "src/" "cpp" _FILES_SOURCE)
 
-ALPAKA_ADD_EXECUTABLE(
+alpaka_add_executable(
     ${_TARGET_NAME}
     ${_FILES_SOURCE})
-TARGET_INCLUDE_DIRECTORIES(
-    ${_TARGET_NAME}
-    PRIVATE ${Boost_INCLUDE_DIRS})
-TARGET_LINK_LIBRARIES(
+target_link_libraries(
     ${_TARGET_NAME}
     PRIVATE common)
 
-SET_TARGET_PROPERTIES(${_TARGET_NAME} PROPERTIES FOLDER "test/unit")
+set_target_properties(${_TARGET_NAME} PROPERTIES FOLDER "test/unit")
 
-ADD_TEST(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME} ${_ALPAKA_TEST_OPTIONS})
+add_test(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME} ${_ALPAKA_TEST_OPTIONS})
diff --git a/thirdParty/cupla/alpaka/test/unit/idx/src/MapIdx.cpp b/thirdParty/cupla/alpaka/test/unit/idx/src/MapIdx.cpp
index d6de1145ed..4b06719f9f 100644
--- a/thirdParty/cupla/alpaka/test/unit/idx/src/MapIdx.cpp
+++ b/thirdParty/cupla/alpaka/test/unit/idx/src/MapIdx.cpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Axel Huebl, Benjamin Worpitz, Matthias Werner, René Widera
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -9,45 +9,26 @@
 
 #include <alpaka/idx/Accessors.hpp>
 #include <alpaka/idx/MapIdx.hpp>
-
 #include <alpaka/meta/ForEachType.hpp>
+#include <alpaka/test/Extent.hpp>
 #include <alpaka/test/dim/TestDims.hpp>
 
 #include <catch2/catch.hpp>
 
-//#############################################################################
-//! 1D: (17)
-//! 2D: (17, 14)
-//! 3D: (17, 14, 11)
-//! 4D: (17, 14, 11, 8)
-template<
-    std::size_t Tidx>
-struct CreateExtentVal
-{
-    //-----------------------------------------------------------------------------
-    template<
-        typename TIdx>
-    ALPAKA_FN_HOST_ACC static auto create(
-        TIdx)
-    -> TIdx
-    {
-        return  static_cast<TIdx>(17u - (Tidx*3u));
-    }
-};
-
 //-----------------------------------------------------------------------------
-TEMPLATE_LIST_TEST_CASE( "mapIdx", "[idx]", alpaka::test::dim::TestDims)
+TEMPLATE_LIST_TEST_CASE("mapIdx", "[idx]", alpaka::test::TestDims)
 {
     using Dim = TestType;
     using Idx = std::size_t;
-    using Vec = alpaka::vec::Vec<Dim, Idx>;
+    using Vec = alpaka::Vec<Dim, Idx>;
 
-    auto const extentNd(alpaka::vec::createVecFromIndexedFnWorkaround<Dim, Idx, CreateExtentVal>(Idx()));
+    auto const extentNd(
+        alpaka::createVecFromIndexedFn<Dim, alpaka::test::CreateVecWithIdx<Idx>::template ForExtentBuf>());
     auto const idxNd(extentNd - Vec::all(4u));
 
-    auto const idx1d(alpaka::idx::mapIdx<1u>(idxNd, extentNd));
+    auto const idx1d(alpaka::mapIdx<1u>(idxNd, extentNd));
 
-    auto const idxNdResult(alpaka::idx::mapIdx<Dim::value>(idx1d, extentNd));
+    auto const idxNdResult(alpaka::mapIdx<Dim::value>(idx1d, extentNd));
 
     REQUIRE(idxNd == idxNdResult);
 }
diff --git a/thirdParty/cupla/alpaka/test/unit/idx/src/MapIdxPitchBytes.cpp b/thirdParty/cupla/alpaka/test/unit/idx/src/MapIdxPitchBytes.cpp
new file mode 100644
index 0000000000..19023ec9be
--- /dev/null
+++ b/thirdParty/cupla/alpaka/test/unit/idx/src/MapIdxPitchBytes.cpp
@@ -0,0 +1,53 @@
+/* Copyright 2020 Jeffrey Kelling
+ *
+ * This file is part of alpaka.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#include <alpaka/dev/Traits.hpp>
+#include <alpaka/example/ExampleDefaultAcc.hpp>
+#include <alpaka/idx/Accessors.hpp>
+#include <alpaka/idx/MapIdx.hpp>
+#include <alpaka/mem/view/ViewPlainPtr.hpp>
+#include <alpaka/mem/view/ViewSubView.hpp>
+#include <alpaka/test/Extent.hpp>
+#include <alpaka/test/dim/TestDims.hpp>
+
+#include <catch2/catch.hpp>
+
+//-----------------------------------------------------------------------------
+TEMPLATE_LIST_TEST_CASE("mapIdxPitchBytes", "[idx]", alpaka::test::TestDims)
+{
+    using Dim = TestType;
+    using Idx = std::size_t;
+    using Vec = alpaka::Vec<Dim, Idx>;
+
+    auto const extentNd(
+        alpaka::createVecFromIndexedFn<Dim, alpaka::test::CreateVecWithIdx<Idx>::template ForExtentBuf>());
+
+    using Acc = alpaka::ExampleDefaultAcc<Dim, Idx>;
+    using Dev = alpaka::Dev<Acc>;
+    using Elem = std::uint8_t;
+    auto const devAcc = alpaka::getDevByIdx<Acc>(0u);
+    alpaka::ViewPlainPtr<Dev, Elem, Dim, Idx> parentView(nullptr, devAcc, extentNd);
+
+    auto const offset(Vec::all(4u));
+    auto const extent(Vec::all(4u));
+    auto const idxNd(Vec::all(2u));
+    alpaka::ViewSubView<Dev, Elem, Dim, Idx> view(parentView, extent, offset);
+    auto pitch = alpaka::getPitchBytesVec(view);
+
+    auto const idx1d(alpaka::mapIdxPitchBytes<1u>(idxNd, pitch));
+    auto const idx1dDelta(alpaka::mapIdx<1u>(idxNd + offset, extentNd) - alpaka::mapIdx<1u>(offset, extentNd));
+
+    auto const idxNdResult(alpaka::mapIdxPitchBytes<Dim::value>(idx1d, pitch));
+
+    // linear index in pitched offset box should be the difference between
+    // linear index in parent box and linear index of offset
+    REQUIRE(idx1d == idx1dDelta);
+    // roundtrip
+    REQUIRE(idxNd == idxNdResult);
+}
diff --git a/thirdParty/cupla/alpaka/test/unit/intrinsic/CMakeLists.txt b/thirdParty/cupla/alpaka/test/unit/intrinsic/CMakeLists.txt
new file mode 100644
index 0000000000..370e4532fd
--- /dev/null
+++ b/thirdParty/cupla/alpaka/test/unit/intrinsic/CMakeLists.txt
@@ -0,0 +1,25 @@
+#
+# Copyright 2014-2020 Benjamin Worpitz, Axel Huebl, Jan Stephan
+#
+# This file is part of alpaka.
+#
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+#
+
+set(_TARGET_NAME "intrinsic")
+
+append_recursive_files_add_to_src_group("src/" "src/" "cpp" _FILES_SOURCE)
+
+alpaka_add_executable(
+    ${_TARGET_NAME}
+    ${_FILES_SOURCE})
+target_link_libraries(
+    ${_TARGET_NAME}
+    PRIVATE common)
+
+set_target_properties(${_TARGET_NAME} PROPERTIES FOLDER "test/unit")
+target_compile_definitions(${_TARGET_NAME} PRIVATE "-DTEST_UNIT_INTRINSIC")
+
+add_test(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME} ${_ALPAKA_TEST_OPTIONS})
diff --git a/thirdParty/cupla/alpaka/test/unit/intrinsic/src/Ffs.cpp b/thirdParty/cupla/alpaka/test/unit/intrinsic/src/Ffs.cpp
new file mode 100644
index 0000000000..5cb18b5c69
--- /dev/null
+++ b/thirdParty/cupla/alpaka/test/unit/intrinsic/src/Ffs.cpp
@@ -0,0 +1,80 @@
+/* Copyright 2020 Sergei Bastrakov
+ *
+ * This file is part of alpaka.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#include <alpaka/intrinsic/Traits.hpp>
+#include <alpaka/test/KernelExecutionFixture.hpp>
+#include <alpaka/test/acc/TestAccs.hpp>
+#include <alpaka/test/queue/Queue.hpp>
+
+#include <catch2/catch.hpp>
+
+#include <cstdint>
+#include <limits>
+
+//#############################################################################
+template<typename TInput>
+class FfsTestKernel
+{
+public:
+    //-----------------------------------------------------------------------------
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename TAcc>
+    ALPAKA_FN_ACC auto operator()(TAcc const& acc, bool* success) const -> void
+    {
+        TInput const inputs[]
+            = {0,
+               1,
+               3,
+               64,
+               256,
+               51362,
+               std::numeric_limits<TInput>::max(),
+               -1,
+               -32,
+               -1352,
+               -4096,
+               std::numeric_limits<TInput>::min()};
+        for(auto const input : inputs)
+        {
+            std::int32_t const expected = ffsNaive(input);
+            std::int32_t const actual = alpaka::ffs(acc, input);
+            ALPAKA_CHECK(*success, actual == expected);
+        }
+    }
+
+private:
+    ALPAKA_FN_ACC static auto ffsNaive(TInput value) -> std::int32_t
+    {
+        if(value == 0)
+            return 0;
+        std::int32_t result = 1;
+        while((value & 1) == 0)
+        {
+            value >>= 1;
+            result++;
+        }
+        return result;
+    }
+};
+
+//-----------------------------------------------------------------------------
+TEMPLATE_LIST_TEST_CASE("ffs", "[intrinsic]", alpaka::test::TestAccs)
+{
+    using Acc = TestType;
+    using Dim = alpaka::Dim<Acc>;
+    using Idx = alpaka::Idx<Acc>;
+
+    alpaka::test::KernelExecutionFixture<Acc> fixture(alpaka::Vec<Dim, Idx>::ones());
+
+    FfsTestKernel<std::int32_t> kernel32bit;
+    REQUIRE(fixture(kernel32bit));
+
+    FfsTestKernel<std::int64_t> kernel64bit;
+    REQUIRE(fixture(kernel64bit));
+}
diff --git a/thirdParty/cupla/alpaka/test/unit/intrinsic/src/Popcount.cpp b/thirdParty/cupla/alpaka/test/unit/intrinsic/src/Popcount.cpp
new file mode 100644
index 0000000000..3f7dd70490
--- /dev/null
+++ b/thirdParty/cupla/alpaka/test/unit/intrinsic/src/Popcount.cpp
@@ -0,0 +1,74 @@
+/* Copyright 2020 Sergei Bastrakov
+ *
+ * This file is part of alpaka.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#include <alpaka/intrinsic/Traits.hpp>
+#include <alpaka/test/KernelExecutionFixture.hpp>
+#include <alpaka/test/acc/TestAccs.hpp>
+#include <alpaka/test/queue/Queue.hpp>
+
+#include <catch2/catch.hpp>
+
+// Test kernel: checks alpaka::popcount against the naive reference popcountNaive for a fixed set of TInput values.
+template<typename TInput>
+class PopcountTestKernel
+{
+public:
+    // Compares alpaka::popcount with popcountNaive for each input via ALPAKA_CHECK(*success, ...).
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename TAcc>
+    ALPAKA_FN_ACC auto operator()(TAcc const& acc, bool* success) const -> void
+    {
+        // Negative literals cast to TInput give inputs near its max value (the tests instantiate unsigned types)
+        TInput const inputs[]
+            = {0u,
+               1u,
+               3u,
+               54u,
+               163u,
+               51362u,
+               static_cast<TInput>(-43631),
+               static_cast<TInput>(-1352),
+               static_cast<TInput>(-642),
+               static_cast<TInput>(-1)};
+        for(auto const input : inputs)
+        {
+            int const expected = popcountNaive(input);
+            int const actual = alpaka::popcount(acc, input);
+            ALPAKA_CHECK(*success, actual == expected);
+        }
+    }
+
+private:
+    ALPAKA_FN_ACC static auto popcountNaive(TInput value) -> int // reference: number of set bits in value
+    {
+        int result = 0;
+        while(value) // loop until no set bit remains
+        {
+            result += static_cast<int>(value & 1u); // add the lowest bit
+            value >>= 1u;
+        }
+        return result;
+    }
+};
+
+// Runs PopcountTestKernel with 32-bit and 64-bit unsigned inputs for every type in alpaka::test::TestAccs.
+TEMPLATE_LIST_TEST_CASE("popcount", "[intrinsic]", alpaka::test::TestAccs)
+{
+    using Acc = TestType;
+    using Dim = alpaka::Dim<Acc>;
+    using Idx = alpaka::Idx<Acc>;
+
+    alpaka::test::KernelExecutionFixture<Acc> fixture(alpaka::Vec<Dim, Idx>::ones());
+
+    PopcountTestKernel<std::uint32_t> kernel32bit;
+    REQUIRE(fixture(kernel32bit));
+
+    PopcountTestKernel<std::uint64_t> kernel64bit;
+    REQUIRE(fixture(kernel64bit));
+}
diff --git a/thirdParty/cupla/alpaka/test/unit/kernel/CMakeLists.txt b/thirdParty/cupla/alpaka/test/unit/kernel/CMakeLists.txt
index b3ca1be4fb..bcca4636c9 100644
--- a/thirdParty/cupla/alpaka/test/unit/kernel/CMakeLists.txt
+++ b/thirdParty/cupla/alpaka/test/unit/kernel/CMakeLists.txt
@@ -1,27 +1,25 @@
 #
-# Copyright 2014-2019 Benjamin Worpitz, Axel Huebl
+# Copyright 2014-2020 Benjamin Worpitz, Axel Huebl, Jan Stephan
 #
-# This file is part of Alpaka.
+# This file is part of alpaka.
 #
 # This Source Code Form is subject to the terms of the Mozilla Public
 # License, v. 2.0. If a copy of the MPL was not distributed with this
 # file, You can obtain one at http://mozilla.org/MPL/2.0/.
 #
 
-SET(_TARGET_NAME "kernel")
+set(_TARGET_NAME "kernel")
 
 append_recursive_files_add_to_src_group("src/" "src/" "cpp" _FILES_SOURCE)
 
-ALPAKA_ADD_EXECUTABLE(
+alpaka_add_executable(
     ${_TARGET_NAME}
     ${_FILES_SOURCE})
-TARGET_INCLUDE_DIRECTORIES(
-    ${_TARGET_NAME}
-    PRIVATE ${Boost_INCLUDE_DIRS})
-TARGET_LINK_LIBRARIES(
+target_link_libraries(
     ${_TARGET_NAME}
     PRIVATE common)
 
-SET_TARGET_PROPERTIES(${_TARGET_NAME} PROPERTIES FOLDER "test/unit")
+set_target_properties(${_TARGET_NAME} PROPERTIES FOLDER "test/unit")
+target_compile_definitions(${_TARGET_NAME} PRIVATE "-DTEST_UNIT_KERNEL")
 
-ADD_TEST(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME} ${_ALPAKA_TEST_OPTIONS})
+add_test(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME} ${_ALPAKA_TEST_OPTIONS})
diff --git a/thirdParty/cupla/alpaka/test/unit/kernel/src/KernelGenericLambda.cpp b/thirdParty/cupla/alpaka/test/unit/kernel/src/KernelGenericLambda.cpp
index 380bb571a1..6dd7fed147 100644
--- a/thirdParty/cupla/alpaka/test/unit/kernel/src/KernelGenericLambda.cpp
+++ b/thirdParty/cupla/alpaka/test/unit/kernel/src/KernelGenericLambda.cpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Axel Huebl, Benjamin Worpitz, René Widera
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -8,69 +8,50 @@
  */
 
 #include <alpaka/kernel/Traits.hpp>
-
-#include <alpaka/test/acc/TestAccs.hpp>
 #include <alpaka/test/KernelExecutionFixture.hpp>
+#include <alpaka/test/acc/TestAccs.hpp>
 
 #include <catch2/catch.hpp>
 
-// Generic lambdas are a C++14 feature.
-#if !defined(BOOST_NO_CXX14_GENERIC_LAMBDAS)
 // CUDA C Programming guide says: "__host__ __device__ extended lambdas cannot be generic lambdas"
 #if !defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
 
 //-----------------------------------------------------------------------------
-TEMPLATE_LIST_TEST_CASE( "genericLambdaKernelIsWorking", "[kernel]", alpaka::test::acc::TestAccs)
+TEMPLATE_LIST_TEST_CASE("genericLambdaKernelIsWorking", "[kernel]", alpaka::test::TestAccs)
 {
     using Acc = TestType;
-    using Dim = alpaka::dim::Dim<Acc>;
-    using Idx = alpaka::idx::Idx<Acc>;
+    using Dim = alpaka::Dim<Acc>;
+    using Idx = alpaka::Idx<Acc>;
 
-    alpaka::test::KernelExecutionFixture<Acc> fixture(
-        alpaka::vec::Vec<Dim, Idx>::ones());
+    alpaka::test::KernelExecutionFixture<Acc> fixture(alpaka::Vec<Dim, Idx>::ones());
 
-    auto kernel =
-        [] ALPAKA_FN_ACC (
-            auto const & acc,
-            bool * success)
-        -> void
-        {
-            ALPAKA_CHECK(
-                *success,
-                static_cast<alpaka::idx::Idx<Acc>>(1) == (alpaka::workdiv::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc)).prod());
-        };
+    auto kernel = [] ALPAKA_FN_ACC(auto const& acc, bool* success) -> void {
+        ALPAKA_CHECK(
+            *success,
+            static_cast<alpaka::Idx<Acc>>(1) == (alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc)).prod());
+    };
 
     REQUIRE(fixture(kernel));
 }
 
 //-----------------------------------------------------------------------------
-TEMPLATE_LIST_TEST_CASE( "variadicGenericLambdaKernelIsWorking", "[kernel]", alpaka::test::acc::TestAccs)
+TEMPLATE_LIST_TEST_CASE("variadicGenericLambdaKernelIsWorking", "[kernel]", alpaka::test::TestAccs)
 {
     using Acc = TestType;
-    using Dim = alpaka::dim::Dim<Acc>;
-    using Idx = alpaka::idx::Idx<Acc>;
+    using Dim = alpaka::Dim<Acc>;
+    using Idx = alpaka::Idx<Acc>;
 
-    alpaka::test::KernelExecutionFixture<Acc> fixture(
-        alpaka::vec::Vec<Dim, Idx>::ones());
+    alpaka::test::KernelExecutionFixture<Acc> fixture(alpaka::Vec<Dim, Idx>::ones());
 
     std::uint32_t const arg1 = 42u;
     std::uint32_t const arg2 = 43u;
-    auto kernel =
-        [] ALPAKA_FN_ACC (
-            Acc const & acc,
-            bool * success,
-            auto ... args)
-        -> void
-        {
-            alpaka::ignore_unused(acc);
+    auto kernel = [] ALPAKA_FN_ACC(Acc const& acc, bool* success, auto... args) -> void {
+        alpaka::ignore_unused(acc);
 
-            ALPAKA_CHECK(
-                *success,
-                alpaka::meta::foldr([](auto a, auto b){return a + b;}, args...) == (42u + 43u));
-        };
+        ALPAKA_CHECK(*success, alpaka::meta::foldr([](auto a, auto b) { return a + b; }, args...) == (42u + 43u));
+    };
 
     REQUIRE(fixture(kernel, arg1, arg2));
 }
 
 #endif
-#endif
diff --git a/thirdParty/cupla/alpaka/test/unit/kernel/src/KernelLambda.cpp b/thirdParty/cupla/alpaka/test/unit/kernel/src/KernelLambda.cpp
index 07c4da01cb..8b54fa2ea2 100644
--- a/thirdParty/cupla/alpaka/test/unit/kernel/src/KernelLambda.cpp
+++ b/thirdParty/cupla/alpaka/test/unit/kernel/src/KernelLambda.cpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Axel Huebl, Benjamin Worpitz
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -10,120 +10,107 @@
 // NVCC needs --expt-extended-lambda
 #if !defined(__NVCC__) || (defined(__NVCC__) && defined(__CUDACC_EXTENDED_LAMBDA__))
 
-#include <alpaka/kernel/Traits.hpp>
+#    include <alpaka/core/BoostPredef.hpp>
+#    include <alpaka/kernel/Traits.hpp>
+#    include <alpaka/test/KernelExecutionFixture.hpp>
+#    include <alpaka/test/acc/TestAccs.hpp>
 
-#include <alpaka/test/acc/TestAccs.hpp>
-#include <alpaka/test/KernelExecutionFixture.hpp>
-#include <alpaka/core/BoostPredef.hpp>
-
-#include <catch2/catch.hpp>
+#    include <catch2/catch.hpp>
 
 //-----------------------------------------------------------------------------
 struct TestTemplateLambda
 {
-template< typename TAcc >
-void operator()()
-{
-    using Dim = alpaka::dim::Dim<TAcc>;
-    using Idx = alpaka::idx::Idx<TAcc>;
-
-    alpaka::test::KernelExecutionFixture<TAcc> fixture(
-        alpaka::vec::Vec<Dim, Idx>::ones());
-
-    auto kernel =
-        [] ALPAKA_FN_ACC (
-            TAcc const & acc,
-            bool * success)
-        -> void
-        {
+    template<typename TAcc>
+    void operator()()
+    {
+        using Dim = alpaka::Dim<TAcc>;
+        using Idx = alpaka::Idx<TAcc>;
+
+        alpaka::test::KernelExecutionFixture<TAcc> fixture(alpaka::Vec<Dim, Idx>::ones());
+
+#    if BOOST_COMP_MSVC || defined(BOOST_COMP_MSVC_EMULATED)
+#        pragma warning(push)
+#        pragma warning(disable : 4702) // warning C4702: unreachable code
+#    endif
+        auto kernel = [] ALPAKA_FN_ACC(TAcc const& acc, bool* success) -> void {
             ALPAKA_CHECK(
                 *success,
-                static_cast<alpaka::idx::Idx<TAcc>>(1) == (alpaka::workdiv::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc)).prod());
+                static_cast<alpaka::Idx<TAcc>>(1) == (alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc)).prod());
         };
+#    if BOOST_COMP_MSVC || defined(BOOST_COMP_MSVC_EMULATED)
+#        pragma warning(pop)
+#    endif
 
-    REQUIRE(fixture(kernel));
-}
+        REQUIRE(fixture(kernel));
+    }
 };
 
 //-----------------------------------------------------------------------------
 struct TestTemplateArg
 {
-template< typename TAcc >
-void operator()()
-{
-    using Dim = alpaka::dim::Dim<TAcc>;
-    using Idx = alpaka::idx::Idx<TAcc>;
-
-    alpaka::test::KernelExecutionFixture<TAcc> fixture(
-        alpaka::vec::Vec<Dim, Idx>::ones());
-
-    std::uint32_t const arg = 42u;
-    auto kernel =
-        [] ALPAKA_FN_ACC (
-            TAcc const & acc,
-            bool * success,
-            std::uint32_t const & arg1)
-        -> void
-        {
+    template<typename TAcc>
+    void operator()()
+    {
+        using Dim = alpaka::Dim<TAcc>;
+        using Idx = alpaka::Idx<TAcc>;
+
+        alpaka::test::KernelExecutionFixture<TAcc> fixture(alpaka::Vec<Dim, Idx>::ones());
+
+        std::uint32_t const arg = 42u;
+        auto kernel = [] ALPAKA_FN_ACC(TAcc const& acc, bool* success, std::uint32_t const& arg1) -> void {
             alpaka::ignore_unused(acc);
 
             ALPAKA_CHECK(*success, 42u == arg1);
         };
 
-    REQUIRE(fixture(kernel, arg));
-}
+        REQUIRE(fixture(kernel, arg));
+    }
 };
 
 //-----------------------------------------------------------------------------
 struct TestTemplateCapture
 {
-template< typename TAcc >
-void operator()()
-{
-    using Dim = alpaka::dim::Dim<TAcc>;
-    using Idx = alpaka::idx::Idx<TAcc>;
+    template<typename TAcc>
+    void operator()()
+    {
+        using Dim = alpaka::Dim<TAcc>;
+        using Idx = alpaka::Idx<TAcc>;
 
-    alpaka::test::KernelExecutionFixture<TAcc> fixture(
-        alpaka::vec::Vec<Dim, Idx>::ones());
+        alpaka::test::KernelExecutionFixture<TAcc> fixture(alpaka::Vec<Dim, Idx>::ones());
 
-    std::uint32_t const arg = 42u;
+        std::uint32_t const arg = 42u;
 
-#if BOOST_COMP_CLANG >= BOOST_VERSION_NUMBER(5,0,0)
-    #pragma clang diagnostic push
-    #pragma clang diagnostic ignored "-Wunused-lambda-capture"
-#endif
-    auto kernel =
-        [arg] ALPAKA_FN_ACC (
-            TAcc const & acc,
-            bool * success)
-        -> void
-        {
+#    if BOOST_COMP_CLANG >= BOOST_VERSION_NUMBER(5, 0, 0)
+#        pragma clang diagnostic push
+#        pragma clang diagnostic ignored "-Wunused-lambda-capture"
+#    endif
+        auto kernel = [arg] ALPAKA_FN_ACC(TAcc const& acc, bool* success) -> void {
             alpaka::ignore_unused(acc);
 
             ALPAKA_CHECK(*success, 42u == arg);
         };
-#if BOOST_COMP_CLANG >= BOOST_VERSION_NUMBER(5,0,0)
-    #pragma clang diagnostic pop
-#endif
+#    if BOOST_COMP_CLANG >= BOOST_VERSION_NUMBER(5, 0, 0)
+#        pragma clang diagnostic pop
+#    endif
 
-    REQUIRE(fixture(kernel));
-}
+        REQUIRE(fixture(kernel));
+    }
 };
 
 
-TEST_CASE( "lambdaKernelIsWorking", "[kernel]")
+TEST_CASE("lambdaKernelIsWorking", "[kernel]")
 {
-    alpaka::meta::forEachType< alpaka::test::acc::TestAccs >( TestTemplateLambda() );
+    alpaka::meta::forEachType<alpaka::test::TestAccs>(TestTemplateLambda());
 }
 
-TEST_CASE( "lambdaKernelWithArgumentIsWorking", "[kernel]")
+TEST_CASE("lambdaKernelWithArgumentIsWorking", "[kernel]")
 {
-    alpaka::meta::forEachType< alpaka::test::acc::TestAccs >( TestTemplateArg() );
+    alpaka::meta::forEachType<alpaka::test::TestAccs>(TestTemplateArg());
 }
 
-TEST_CASE( "lambdaKernelWithCapturingIsWorking", "[kernel]")
+TEST_CASE("lambdaKernelWithCapturingIsWorking", "[kernel]")
 {
-    alpaka::meta::forEachType< alpaka::test::acc::TestAccs >( TestTemplateCapture() );
+    alpaka::meta::forEachType<alpaka::test::TestAccs>(TestTemplateCapture());
 }
 
 #endif
diff --git a/thirdParty/cupla/alpaka/test/unit/kernel/src/KernelStdFunction.cpp b/thirdParty/cupla/alpaka/test/unit/kernel/src/KernelStdFunction.cpp
index 0856ea3cd8..925ca43230 100644
--- a/thirdParty/cupla/alpaka/test/unit/kernel/src/KernelStdFunction.cpp
+++ b/thirdParty/cupla/alpaka/test/unit/kernel/src/KernelStdFunction.cpp
@@ -1,32 +1,32 @@
 /* Copyright 2019 Benjamin Worpitz
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
  */
 
-#include <alpaka/kernel/Traits.hpp>
+#define TEST_UNIT_KERNEL_KERNEL_STD_FUNCTION
+// clang thinks a macro is unused when only used as the second operand of an &&
+// where the first operand evaluates to false, so we use it here:
+TEST_UNIT_KERNEL_KERNEL_STD_FUNCTION
 
-#include <alpaka/test/acc/TestAccs.hpp>
-#include <alpaka/test/KernelExecutionFixture.hpp>
 #include <alpaka/core/BoostPredef.hpp>
+#include <alpaka/kernel/Traits.hpp>
+#include <alpaka/test/KernelExecutionFixture.hpp>
+#include <alpaka/test/acc/TestAccs.hpp>
 
 #include <catch2/catch.hpp>
 
 #include <functional>
 #if BOOST_LANG_CUDA
-#include <nvfunctional>
+#    include <nvfunctional>
 #endif
 
 //-----------------------------------------------------------------------------
-template<
-    typename Acc>
-void ALPAKA_FN_ACC kernelFn(
-    Acc const & acc,
-    bool * success,
-    std::int32_t val)
+template<typename Acc>
+void ALPAKA_FN_ACC kernelFn(Acc const& acc, bool* success, std::int32_t val)
 {
     alpaka::ignore_unused(acc);
 
@@ -36,30 +36,28 @@ void ALPAKA_FN_ACC kernelFn(
 // std::function and std::bind is only allowed on CPU
 #if !BOOST_LANG_CUDA && !BOOST_LANG_HIP
 //-----------------------------------------------------------------------------
-TEMPLATE_LIST_TEST_CASE( "stdFunctionKernelIsWorking", "[kernel]", alpaka::test::acc::TestAccs)
+TEMPLATE_LIST_TEST_CASE("stdFunctionKernelIsWorking", "[kernel]", alpaka::test::TestAccs)
 {
     using Acc = TestType;
-    using Dim = alpaka::dim::Dim<Acc>;
-    using Idx = alpaka::idx::Idx<Acc>;
+    using Dim = alpaka::Dim<Acc>;
+    using Idx = alpaka::Idx<Acc>;
 
-    alpaka::test::KernelExecutionFixture<Acc> fixture(
-        alpaka::vec::Vec<Dim, Idx>::ones());
+    alpaka::test::KernelExecutionFixture<Acc> fixture(alpaka::Vec<Dim, Idx>::ones());
 
-    const auto kernel = std::function<void(Acc const &, bool *, std::int32_t)>( kernelFn<Acc> );
+    const auto kernel = std::function<void(Acc const&, bool*, std::int32_t)>(kernelFn<Acc>);
     REQUIRE(fixture(kernel, 42));
 }
 
 //-----------------------------------------------------------------------------
-TEMPLATE_LIST_TEST_CASE( "stdBindKernelIsWorking", "[kernel]", alpaka::test::acc::TestAccs)
+TEMPLATE_LIST_TEST_CASE("stdBindKernelIsWorking", "[kernel]", alpaka::test::TestAccs)
 {
     using Acc = TestType;
-    using Dim = alpaka::dim::Dim<Acc>;
-    using Idx = alpaka::idx::Idx<Acc>;
+    using Dim = alpaka::Dim<Acc>;
+    using Idx = alpaka::Idx<Acc>;
 
-    alpaka::test::KernelExecutionFixture<Acc> fixture(
-        alpaka::vec::Vec<Dim, Idx>::ones());
+    alpaka::test::KernelExecutionFixture<Acc> fixture(alpaka::Vec<Dim, Idx>::ones());
 
-    const auto kernel = std::bind( kernelFn<Acc>, std::placeholders::_1, std::placeholders::_2, 42 );
+    const auto kernel = std::bind(kernelFn<Acc>, std::placeholders::_1, std::placeholders::_2, 42);
     REQUIRE(fixture(kernel));
 }
 #endif
@@ -68,21 +66,23 @@ TEMPLATE_LIST_TEST_CASE( "stdBindKernelIsWorking", "[kernel]", alpaka::test::acc
 #if 0
 //#if BOOST_LANG_CUDA
 // clang as a native CUDA compiler does not seem to support nvstd::function when ALPAKA_ACC_GPU_CUDA_ONLY_MODE is used.
-// error: reference to __device__ function 'kernelFn<alpaka::acc::AccGpuCudaRt<std::__1::integral_constant<unsigned long, 1>, unsigned long> >' in __host__ function const auto kernel = nvstd::function<void(Acc const &, bool *, std::int32_t)>( kernelFn<Acc> );
-#if !(defined(ALPAKA_ACC_GPU_CUDA_ONLY_MODE) && BOOST_COMP_CLANG_CUDA)
+// error: reference to __device__ function 'kernelFn<alpaka::AccGpuCudaRt<std::__1::integral_constant<unsigned long, 1>, unsigned long> >' in __host__ function const auto kernel = nvstd::function<void(Acc const &, bool *, std::int32_t)>( kernelFn<Acc> );
+#    if !(defined(ALPAKA_ACC_GPU_CUDA_ONLY_MODE) && BOOST_COMP_CLANG_CUDA)
 //-----------------------------------------------------------------------------
-TEMPLATE_LIST_TEST_CASE( "nvstdFunctionKernelIsWorking", "[kernel]", alpaka::test::acc::TestAccs)
+TEMPLATE_LIST_TEST_CASE( "nvstdFunctionKernelIsWorking", "[kernel]", alpaka::test::TestAccs)
 {
     using Acc = TestType;
-    using Dim = alpaka::dim::Dim<Acc>;
-    using Idx = alpaka::idx::Idx<Acc>;
+    using Dim = alpaka::Dim<Acc>;
+    using Idx = alpaka::Idx<Acc>;
 
     alpaka::test::KernelExecutionFixture<Acc> fixture(
-        alpaka::vec::Vec<Dim, Idx>::ones());
+        alpaka::Vec<Dim, Idx>::ones());
 
     const auto kernel = nvstd::function<void(Acc const &, bool *, std::int32_t)>( kernelFn<Acc> );
     REQUIRE(fixture(kernel, 42));
 }
 
+#    endif
 #endif
-#endif
+
+#undef TEST_UNIT_KERNEL_KERNEL_STD_FUNCTION
diff --git a/thirdParty/cupla/alpaka/test/unit/kernel/src/KernelWithAdditionalParam.cpp b/thirdParty/cupla/alpaka/test/unit/kernel/src/KernelWithAdditionalParam.cpp
index 8436adebea..3835aa4046 100644
--- a/thirdParty/cupla/alpaka/test/unit/kernel/src/KernelWithAdditionalParam.cpp
+++ b/thirdParty/cupla/alpaka/test/unit/kernel/src/KernelWithAdditionalParam.cpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Axel Huebl, Benjamin Worpitz, René Widera
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -8,10 +8,9 @@
  */
 
 #include <alpaka/kernel/Traits.hpp>
-
-#include <alpaka/test/acc/TestAccs.hpp>
-#include <alpaka/test/KernelExecutionFixture.hpp>
 #include <alpaka/meta/ForEachType.hpp>
+#include <alpaka/test/KernelExecutionFixture.hpp>
+#include <alpaka/test/acc/TestAccs.hpp>
 
 #include <catch2/catch.hpp>
 
@@ -21,13 +20,8 @@ class KernelWithAdditionalParamByValue
 public:
     //-----------------------------------------------------------------------------
     ALPAKA_NO_HOST_ACC_WARNING
-    template<
-        typename TAcc>
-    ALPAKA_FN_ACC auto operator()(
-        TAcc const & acc,
-        bool * success,
-        std::int32_t val) const
-    -> void
+    template<typename TAcc>
+    ALPAKA_FN_ACC auto operator()(TAcc const& acc, bool* success, std::int32_t val) const -> void
     {
         alpaka::ignore_unused(acc);
 
@@ -36,14 +30,13 @@ class KernelWithAdditionalParamByValue
 };
 
 //-----------------------------------------------------------------------------
-TEMPLATE_LIST_TEST_CASE("KernelWithAdditionalParamByValue", "[kernel]", alpaka::test::acc::TestAccs)
+TEMPLATE_LIST_TEST_CASE("KernelWithAdditionalParamByValue", "[kernel]", alpaka::test::TestAccs)
 {
     using Acc = TestType;
-    using Dim = alpaka::dim::Dim<Acc>;
-    using Idx = alpaka::idx::Idx<Acc>;
+    using Dim = alpaka::Dim<Acc>;
+    using Idx = alpaka::Idx<Acc>;
 
-    alpaka::test::KernelExecutionFixture<Acc> fixture(
-        alpaka::vec::Vec<Dim, Idx>::ones());
+    alpaka::test::KernelExecutionFixture<Acc> fixture(alpaka::Vec<Dim, Idx>::ones());
 
     KernelWithAdditionalParamByValue kernel;
 
@@ -73,14 +66,14 @@ class KernelWithAdditionalParamByRef
 };
 
 //-----------------------------------------------------------------------------
-TEMPLATE_LIST_TEST_CASE("KernelWithAdditionalParamByRef", "[kernel]", alpaka::test::acc::TestAccs)
+TEMPLATE_LIST_TEST_CASE("KernelWithAdditionalParamByRef", "[kernel]", alpaka::test::TestAccs)
 {
     using Acc = TestType;
-    using Dim = alpaka::dim::Dim<Acc>;
-    using Idx = alpaka::idx::Idx<Acc>;
+    using Dim = alpaka::Dim<Acc>;
+    using Idx = alpaka::Idx<Acc>;
 
     alpaka::test::KernelExecutionFixture<Acc> fixture(
-        alpaka::vec::Vec<Dim, Idx>::ones());
+        alpaka::Vec<Dim, Idx>::ones());
 
     KernelWithAdditionalParamByRef kernel;
 
@@ -93,11 +86,8 @@ class KernelWithAdditionalParamByConstRef
 public:
     //-----------------------------------------------------------------------------
     ALPAKA_NO_HOST_ACC_WARNING
-    template <typename TAcc>
-    ALPAKA_FN_ACC auto operator()(
-        TAcc const &acc,
-        bool *success,
-        std::int32_t const &val) const -> void
+    template<typename TAcc>
+    ALPAKA_FN_ACC auto operator()(TAcc const& acc, bool* success, std::int32_t const& val) const -> void
     {
         alpaka::ignore_unused(acc);
 
@@ -106,14 +96,13 @@ class KernelWithAdditionalParamByConstRef
 };
 
 //-----------------------------------------------------------------------------
-TEMPLATE_LIST_TEST_CASE("KernelWithAdditionalParamByConstRef", "[kernel]", alpaka::test::acc::TestAccs)
+TEMPLATE_LIST_TEST_CASE("KernelWithAdditionalParamByConstRef", "[kernel]", alpaka::test::TestAccs)
 {
     using Acc = TestType;
-    using Dim = alpaka::dim::Dim<Acc>;
-    using Idx = alpaka::idx::Idx<Acc>;
+    using Dim = alpaka::Dim<Acc>;
+    using Idx = alpaka::Idx<Acc>;
 
-    alpaka::test::KernelExecutionFixture<Acc> fixture(
-        alpaka::vec::Vec<Dim, Idx>::ones());
+    alpaka::test::KernelExecutionFixture<Acc> fixture(alpaka::Vec<Dim, Idx>::ones());
 
     KernelWithAdditionalParamByConstRef kernel;
 
diff --git a/thirdParty/cupla/alpaka/test/unit/kernel/src/KernelWithConstructorAndMember.cpp b/thirdParty/cupla/alpaka/test/unit/kernel/src/KernelWithConstructorAndMember.cpp
index 1e6d347eb9..785764232a 100644
--- a/thirdParty/cupla/alpaka/test/unit/kernel/src/KernelWithConstructorAndMember.cpp
+++ b/thirdParty/cupla/alpaka/test/unit/kernel/src/KernelWithConstructorAndMember.cpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Axel Huebl, Benjamin Worpitz, René Widera
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -8,10 +8,9 @@
  */
 
 #include <alpaka/kernel/Traits.hpp>
-
-#include <alpaka/test/acc/TestAccs.hpp>
-#include <alpaka/test/KernelExecutionFixture.hpp>
 #include <alpaka/meta/ForEachType.hpp>
+#include <alpaka/test/KernelExecutionFixture.hpp>
+#include <alpaka/test/acc/TestAccs.hpp>
 
 #include <catch2/catch.hpp>
 
@@ -20,19 +19,14 @@ class KernelWithConstructorAndMember
 {
 public:
     //-----------------------------------------------------------------------------
-    ALPAKA_FN_HOST KernelWithConstructorAndMember(
-        std::int32_t const val = 42) :
-        m_val(val)
-    {}
+    ALPAKA_FN_HOST KernelWithConstructorAndMember(std::int32_t const val = 42) : m_val(val)
+    {
+    }
 
     //-----------------------------------------------------------------------------
     ALPAKA_NO_HOST_ACC_WARNING
-    template<
-        typename TAcc>
-    ALPAKA_FN_ACC auto operator()(
-        TAcc const & acc,
-        bool * success) const
-    -> void
+    template<typename TAcc>
+    ALPAKA_FN_ACC auto operator()(TAcc const& acc, bool* success) const -> void
     {
         alpaka::ignore_unused(acc);
 
@@ -44,14 +38,13 @@ class KernelWithConstructorAndMember
 };
 
 //-----------------------------------------------------------------------------
-TEMPLATE_LIST_TEST_CASE( "kernelWithConstructorAndMember", "[kernel]", alpaka::test::acc::TestAccs)
+TEMPLATE_LIST_TEST_CASE("kernelWithConstructorAndMember", "[kernel]", alpaka::test::TestAccs)
 {
     using Acc = TestType;
-    using Dim = alpaka::dim::Dim<Acc>;
-    using Idx = alpaka::idx::Idx<Acc>;
+    using Dim = alpaka::Dim<Acc>;
+    using Idx = alpaka::Idx<Acc>;
 
-    alpaka::test::KernelExecutionFixture<Acc> fixture(
-        alpaka::vec::Vec<Dim, Idx>::ones());
+    alpaka::test::KernelExecutionFixture<Acc> fixture(alpaka::Vec<Dim, Idx>::ones());
 
     KernelWithConstructorAndMember kernel(42);
 
@@ -59,14 +52,13 @@ TEMPLATE_LIST_TEST_CASE( "kernelWithConstructorAndMember", "[kernel]", alpaka::t
 }
 
 //-----------------------------------------------------------------------------
-TEMPLATE_LIST_TEST_CASE( "kernelWithConstructorDefaultParamAndMember", "[kernel]", alpaka::test::acc::TestAccs)
+TEMPLATE_LIST_TEST_CASE("kernelWithConstructorDefaultParamAndMember", "[kernel]", alpaka::test::TestAccs)
 {
     using Acc = TestType;
-    using Dim = alpaka::dim::Dim<Acc>;
-    using Idx = alpaka::idx::Idx<Acc>;
+    using Dim = alpaka::Dim<Acc>;
+    using Idx = alpaka::Idx<Acc>;
 
-    alpaka::test::KernelExecutionFixture<Acc> fixture(
-        alpaka::vec::Vec<Dim, Idx>::ones());
+    alpaka::test::KernelExecutionFixture<Acc> fixture(alpaka::Vec<Dim, Idx>::ones());
 
     KernelWithConstructorAndMember kernel;
 
diff --git a/thirdParty/cupla/alpaka/test/unit/kernel/src/KernelWithHostConstexpr.cpp b/thirdParty/cupla/alpaka/test/unit/kernel/src/KernelWithHostConstexpr.cpp
index e4ca38031e..f0cca7ff45 100644
--- a/thirdParty/cupla/alpaka/test/unit/kernel/src/KernelWithHostConstexpr.cpp
+++ b/thirdParty/cupla/alpaka/test/unit/kernel/src/KernelWithHostConstexpr.cpp
@@ -1,20 +1,16 @@
 /* Copyright 2019 Axel Huebl, Benjamin Worpitz, Matthias Werner, René Widera
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
  */
 
-// NVCC needs --expt-relaxed-constexpr
-#if !defined(__NVCC__) || (defined(__NVCC__) && defined(__CUDACC_RELAXED_CONSTEXPR__))
-
 #include <alpaka/kernel/Traits.hpp>
-
-#include <alpaka/test/acc/TestAccs.hpp>
-#include <alpaka/test/KernelExecutionFixture.hpp>
 #include <alpaka/meta/ForEachType.hpp>
+#include <alpaka/test/KernelExecutionFixture.hpp>
+#include <alpaka/test/acc/TestAccs.hpp>
 
 #include <catch2/catch.hpp>
 
@@ -27,40 +23,34 @@ class KernelWithHostConstexpr
     //-----------------------------------------------------------------------------
     ALPAKA_NO_HOST_ACC_WARNING
     template<typename TAcc>
-    ALPAKA_FN_ACC auto operator()(
-        TAcc const & acc,
-        bool* success) const
-    -> void
+    ALPAKA_FN_ACC auto operator()(TAcc const& acc, bool* success) const -> void
     {
         alpaka::ignore_unused(acc);
 
-#if BOOST_COMP_MSVC
-    #pragma warning(push)
-    #pragma warning(disable: 4127)  // warning C4127: conditional expression is constant
+#if BOOST_COMP_MSVC || defined(BOOST_COMP_MSVC_EMULATED)
+#    pragma warning(push)
+#    pragma warning(disable : 4127) // warning C4127: conditional expression is constant
 #endif
 
-        constexpr auto max = std::numeric_limits< std::uint32_t >::max();
+        constexpr auto max = std::numeric_limits<std::uint32_t>::max();
 
         ALPAKA_CHECK(*success, 0 != max);
-#if BOOST_COMP_MSVC
-    #pragma warning(pop)
+#if BOOST_COMP_MSVC || defined(BOOST_COMP_MSVC_EMULATED)
+#    pragma warning(pop)
 #endif
     }
 };
 
 //-----------------------------------------------------------------------------
-TEMPLATE_LIST_TEST_CASE( "kernelWithHostConstexpr", "[kernel]", alpaka::test::acc::TestAccs)
+TEMPLATE_LIST_TEST_CASE("kernelWithHostConstexpr", "[kernel]", alpaka::test::TestAccs)
 {
     using Acc = TestType;
-    using Dim = alpaka::dim::Dim<Acc>;
-    using Idx = alpaka::idx::Idx<Acc>;
+    using Dim = alpaka::Dim<Acc>;
+    using Idx = alpaka::Idx<Acc>;
 
-    alpaka::test::KernelExecutionFixture<Acc> fixture(
-        alpaka::vec::Vec<Dim, Idx>::ones());
+    alpaka::test::KernelExecutionFixture<Acc> fixture(alpaka::Vec<Dim, Idx>::ones());
 
     KernelWithHostConstexpr kernel;
 
     REQUIRE(fixture(kernel));
 }
-
-#endif
diff --git a/thirdParty/cupla/alpaka/test/unit/kernel/src/KernelWithOmpSchedule.cpp b/thirdParty/cupla/alpaka/test/unit/kernel/src/KernelWithOmpSchedule.cpp
new file mode 100644
index 0000000000..e62713a6a7
--- /dev/null
+++ b/thirdParty/cupla/alpaka/test/unit/kernel/src/KernelWithOmpSchedule.cpp
@@ -0,0 +1,145 @@
+/* Copyright 2020 Sergei Bastrakov
+ *
+ * This file is part of alpaka.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#include <alpaka/core/OmpSchedule.hpp>
+#include <alpaka/kernel/Traits.hpp>
+#include <alpaka/test/KernelExecutionFixture.hpp>
+#include <alpaka/test/acc/TestAccs.hpp>
+
+#include <catch2/catch.hpp>
+
+#include <cstdint>
+
+//#############################################################################
+// Schedule to be used by all kernels in this file
+static constexpr auto expectedSchedule = alpaka::omp::Schedule{alpaka::omp::Schedule::Dynamic, 10};
+
+// Base kernel, not to be used directly in unit tests
+struct KernelWithOmpScheduleBase
+{
+    //-----------------------------------------------------------------------------
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename TAcc>
+    ALPAKA_FN_ACC auto operator()(TAcc const& acc, bool* success) const -> void
+    {
+        // By default no run-time check is performed
+        alpaka::ignore_unused(acc);
+        ALPAKA_CHECK(*success, true);
+    }
+
+    // Only check when the schedule feature is active
+#if defined _OPENMP && _OPENMP >= 200805 && ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLED
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename TDim, typename TIdx>
+    ALPAKA_FN_ACC auto operator()(alpaka::AccCpuOmp2Blocks<TDim, TIdx> const& acc, bool* success) const -> void
+    {
+        alpaka::ignore_unused(acc);
+        omp_sched_t kind;
+        int actualChunkSize = 0;
+        omp_get_schedule(&kind, &actualChunkSize);
+        auto const actualKind = static_cast<std::uint32_t>(kind);
+        bool result = (expectedSchedule.kind == actualKind) && (expectedSchedule.chunkSize == actualChunkSize);
+        ALPAKA_CHECK(*success, result);
+    }
+#endif
+};
+
+// Kernel that sets the schedule via constexpr ompSchedule.
+// Checks that this variable is only declared and not defined. It also tests that
+// alpaka never odr-uses it.
+struct KernelWithConstexprMemberOmpSchedule : KernelWithOmpScheduleBase
+{
+    static constexpr auto ompSchedule = expectedSchedule;
+};
+
+// Kernel that sets the schedule via non-constexpr ompSchedule.
+struct KernelWithMemberOmpSchedule : KernelWithOmpScheduleBase
+{
+    static const alpaka::omp::Schedule ompSchedule;
+};
+// In this case, the member has to be defined externally
+const alpaka::omp::Schedule KernelWithMemberOmpSchedule::ompSchedule = expectedSchedule;
+
+// Kernel that sets the schedule via partial specialization of a trait
+struct KernelWithTraitOmpSchedule : KernelWithOmpScheduleBase
+{
+};
+
+// Kernel that sets the schedule via both member and partial specialization of a trait.
+// In this case test that the trait is used, not the member.
+struct KernelWithMemberAndTraitOmpSchedule : KernelWithOmpScheduleBase
+{
+    // Set to be different from expected so that if this is used the test would fail
+    static constexpr auto ompSchedule = alpaka::omp::Schedule{expectedSchedule.kind, expectedSchedule.chunkSize + 1};
+};
+
+namespace alpaka
+{
+    namespace traits
+    {
+        // Specialize the trait for all kernels
+        template<typename TKernelFnObj, typename TAcc>
+        struct OmpSchedule<TKernelFnObj, TAcc>
+        {
+            template<typename TDim, typename... TArgs>
+            ALPAKA_FN_HOST static auto getOmpSchedule(
+                TKernelFnObj const& kernelFnObj,
+                Vec<TDim, Idx<TAcc>> const& blockThreadExtent,
+                Vec<TDim, Idx<TAcc>> const& threadElemExtent,
+                TArgs const&... args) -> alpaka::omp::Schedule
+            {
+                alpaka::ignore_unused(kernelFnObj);
+                alpaka::ignore_unused(blockThreadExtent);
+                alpaka::ignore_unused(threadElemExtent);
+                alpaka::ignore_unused(args...);
+
+                return expectedSchedule;
+            }
+        };
+    } // namespace traits
+} // namespace alpaka
+
+// Generic testing routine for the given kernel type
+template<typename TAcc, typename TKernel>
+void test()
+{
+    using Acc = TAcc;
+    using Dim = alpaka::Dim<Acc>;
+    using Idx = alpaka::Idx<Acc>;
+
+    alpaka::test::KernelExecutionFixture<Acc> fixture(alpaka::Vec<Dim, Idx>::ones());
+
+    TKernel kernel;
+
+    REQUIRE(fixture(kernel));
+}
+
+//-----------------------------------------------------------------------------
+TEMPLATE_LIST_TEST_CASE("kernelWithConstexprMemberOmpSchedule", "[kernel]", alpaka::test::TestAccs)
+{
+    test<TestType, KernelWithConstexprMemberOmpSchedule>();
+}
+
+//-----------------------------------------------------------------------------
+TEMPLATE_LIST_TEST_CASE("kernelWithMemberOmpSchedule", "[kernel]", alpaka::test::TestAccs)
+{
+    test<TestType, KernelWithMemberOmpSchedule>();
+}
+
+//-----------------------------------------------------------------------------
+TEMPLATE_LIST_TEST_CASE("kernelWithTraitOmpSchedule", "[kernel]", alpaka::test::TestAccs)
+{
+    test<TestType, KernelWithTraitOmpSchedule>();
+}
+
+//-----------------------------------------------------------------------------
+TEMPLATE_LIST_TEST_CASE("kernelWithMemberAndTraitOmpSchedule", "[kernel]", alpaka::test::TestAccs)
+{
+    test<TestType, KernelWithMemberAndTraitOmpSchedule>();
+}
diff --git a/thirdParty/cupla/alpaka/test/unit/kernel/src/KernelWithTemplate.cpp b/thirdParty/cupla/alpaka/test/unit/kernel/src/KernelWithTemplate.cpp
index 7a78cf74bf..3b7ccb30aa 100644
--- a/thirdParty/cupla/alpaka/test/unit/kernel/src/KernelWithTemplate.cpp
+++ b/thirdParty/cupla/alpaka/test/unit/kernel/src/KernelWithTemplate.cpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Axel Huebl, Benjamin Worpitz, René Widera
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -8,49 +8,40 @@
  */
 
 #include <alpaka/kernel/Traits.hpp>
-
-#include <alpaka/test/acc/TestAccs.hpp>
-#include <alpaka/test/KernelExecutionFixture.hpp>
 #include <alpaka/meta/ForEachType.hpp>
+#include <alpaka/test/KernelExecutionFixture.hpp>
+#include <alpaka/test/acc/TestAccs.hpp>
 
 #include <catch2/catch.hpp>
 
 #include <type_traits>
 
 //#############################################################################
-template<
-    typename T>
+template<typename T>
 class KernelFuntionObjectTemplate
 {
 public:
     //-----------------------------------------------------------------------------
     ALPAKA_NO_HOST_ACC_WARNING
-    template<
-        typename TAcc>
-    ALPAKA_FN_ACC auto operator()(
-        TAcc const & acc,
-        bool * success) const
-    -> void
+    template<typename TAcc>
+    ALPAKA_FN_ACC auto operator()(TAcc const& acc, bool* success) const -> void
     {
         ALPAKA_CHECK(
             *success,
-            static_cast<alpaka::idx::Idx<TAcc>>(1) == (alpaka::workdiv::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc)).prod());
+            static_cast<alpaka::Idx<TAcc>>(1) == (alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc)).prod());
 
-        static_assert(
-            std::is_same<std::int32_t, T>::value,
-            "Incorrect additional kernel template parameter type!");
+        static_assert(std::is_same<std::int32_t, T>::value, "Incorrect additional kernel template parameter type!");
     }
 };
 
 //-----------------------------------------------------------------------------
-TEMPLATE_LIST_TEST_CASE( "kernelFuntionObjectTemplate", "[kernel]", alpaka::test::acc::TestAccs)
+TEMPLATE_LIST_TEST_CASE("kernelFuntionObjectTemplate", "[kernel]", alpaka::test::TestAccs)
 {
     using Acc = TestType;
-    using Dim = alpaka::dim::Dim<Acc>;
-    using Idx = alpaka::idx::Idx<Acc>;
+    using Dim = alpaka::Dim<Acc>;
+    using Idx = alpaka::Idx<Acc>;
 
-    alpaka::test::KernelExecutionFixture<Acc> fixture(
-        alpaka::vec::Vec<Dim, Idx>::ones());
+    alpaka::test::KernelExecutionFixture<Acc> fixture(alpaka::Vec<Dim, Idx>::ones());
 
     KernelFuntionObjectTemplate<std::int32_t> kernel;
 
@@ -63,34 +54,25 @@ class KernelInvocationWithAdditionalTemplate
 public:
     //-----------------------------------------------------------------------------
     ALPAKA_NO_HOST_ACC_WARNING
-    template<
-        typename TAcc,
-        typename T>
-    ALPAKA_FN_ACC auto operator()(
-        TAcc const & acc,
-        bool * success,
-        T const &) const
-    -> void
+    template<typename TAcc, typename T>
+    ALPAKA_FN_ACC auto operator()(TAcc const& acc, bool* success, T const&) const -> void
     {
         ALPAKA_CHECK(
             *success,
-            static_cast<alpaka::idx::Idx<TAcc>>(1) == (alpaka::workdiv::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc)).prod());
+            static_cast<alpaka::Idx<TAcc>>(1) == (alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc)).prod());
 
-        static_assert(
-            std::is_same<std::int32_t, T>::value,
-            "Incorrect additional kernel template parameter type!");
+        static_assert(std::is_same<std::int32_t, T>::value, "Incorrect additional kernel template parameter type!");
     }
 };
 
 //-----------------------------------------------------------------------------
-TEMPLATE_LIST_TEST_CASE( "kernelFuntionObjectExtraTemplate", "[kernel]", alpaka::test::acc::TestAccs)
+TEMPLATE_LIST_TEST_CASE("kernelFuntionObjectExtraTemplate", "[kernel]", alpaka::test::TestAccs)
 {
     using Acc = TestType;
-    using Dim = alpaka::dim::Dim<Acc>;
-    using Idx = alpaka::idx::Idx<Acc>;
+    using Dim = alpaka::Dim<Acc>;
+    using Idx = alpaka::Idx<Acc>;
 
-    alpaka::test::KernelExecutionFixture<Acc> fixture(
-        alpaka::vec::Vec<Dim, Idx>::ones());
+    alpaka::test::KernelExecutionFixture<Acc> fixture(alpaka::Vec<Dim, Idx>::ones());
 
     KernelInvocationWithAdditionalTemplate kernel;
 
diff --git a/thirdParty/cupla/alpaka/test/unit/kernel/src/KernelWithTemplateArgumentDeduction.cpp b/thirdParty/cupla/alpaka/test/unit/kernel/src/KernelWithTemplateArgumentDeduction.cpp
index 4873e5763d..b1f7c7f6e1 100644
--- a/thirdParty/cupla/alpaka/test/unit/kernel/src/KernelWithTemplateArgumentDeduction.cpp
+++ b/thirdParty/cupla/alpaka/test/unit/kernel/src/KernelWithTemplateArgumentDeduction.cpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Axel Huebl, Benjamin Worpitz, René Widera, Sergei Bastrakov
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -8,38 +8,32 @@
  */
 
 #include <alpaka/kernel/Traits.hpp>
-
-#include <alpaka/test/acc/TestAccs.hpp>
-#include <alpaka/test/KernelExecutionFixture.hpp>
 #include <alpaka/meta/ForEachType.hpp>
+#include <alpaka/test/KernelExecutionFixture.hpp>
+#include <alpaka/test/acc/TestAccs.hpp>
 
 #include <catch2/catch.hpp>
 
 #include <type_traits>
 
 //#############################################################################
-template< typename TExpected >
+template<typename TExpected>
 class KernelInvocationTemplateDeductionValueSemantics
 {
 public:
     //-----------------------------------------------------------------------------
     ALPAKA_NO_HOST_ACC_WARNING
-        template<
-        typename Acc,
-        typename TByValue,
-        typename TByConstValue,
-        typename TByConstReference>
-        ALPAKA_FN_ACC auto operator()(
-            Acc const & acc,
-            bool * success,
-            TByValue,
-            TByConstValue const,
-            TByConstReference const &) const
-        -> void
+    template<typename Acc, typename TByValue, typename TByConstValue, typename TByConstReference>
+    ALPAKA_FN_ACC auto operator()(
+        Acc const& acc,
+        bool* success,
+        TByValue,
+        TByConstValue const,
+        TByConstReference const&) const -> void
     {
         ALPAKA_CHECK(
             *success,
-            static_cast<alpaka::idx::Idx<Acc>>(1) == (alpaka::workdiv::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc)).prod());
+            static_cast<alpaka::Idx<Acc>>(1) == (alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc)).prod());
 
         static_assert(
             std::is_same<TByValue, TExpected>::value,
@@ -50,84 +44,69 @@ class KernelInvocationTemplateDeductionValueSemantics
         static_assert(
             std::is_same<TByConstReference, TExpected>::value,
             "Incorrect third additional kernel template parameter type!");
-
     }
 };
 
 //-----------------------------------------------------------------------------
-TEMPLATE_LIST_TEST_CASE( "kernelFuntionObjectTemplateDeductionFromValue", "[kernel]", alpaka::test::acc::TestAccs)
+TEMPLATE_LIST_TEST_CASE("kernelFuntionObjectTemplateDeductionFromValue", "[kernel]", alpaka::test::TestAccs)
 {
     using Acc = TestType;
-    using Dim = alpaka::dim::Dim<Acc>;
-    using Idx = alpaka::idx::Idx<Acc>;
+    using Dim = alpaka::Dim<Acc>;
+    using Idx = alpaka::Idx<Acc>;
 
-    alpaka::test::KernelExecutionFixture<Acc> fixture(
-        alpaka::vec::Vec<Dim, Idx>::ones());
+    alpaka::test::KernelExecutionFixture<Acc> fixture(alpaka::Vec<Dim, Idx>::ones());
 
     using Value = std::int32_t;
-    KernelInvocationTemplateDeductionValueSemantics< Value > kernel;
+    KernelInvocationTemplateDeductionValueSemantics<Value> kernel;
 
-    Value value{ };
+    Value value{};
     REQUIRE(fixture(kernel, value, value, value));
 }
 
-TEMPLATE_LIST_TEST_CASE( "kernelFuntionObjectTemplateDeductionFromConstValue", "[kernel]", alpaka::test::acc::TestAccs)
+TEMPLATE_LIST_TEST_CASE("kernelFuntionObjectTemplateDeductionFromConstValue", "[kernel]", alpaka::test::TestAccs)
 {
     using Acc = TestType;
-    using Dim = alpaka::dim::Dim<Acc>;
-    using Idx = alpaka::idx::Idx<Acc>;
+    using Dim = alpaka::Dim<Acc>;
+    using Idx = alpaka::Idx<Acc>;
 
-    alpaka::test::KernelExecutionFixture<Acc> fixture(
-        alpaka::vec::Vec<Dim, Idx>::ones());
+    alpaka::test::KernelExecutionFixture<Acc> fixture(alpaka::Vec<Dim, Idx>::ones());
 
     using Value = std::int32_t;
-    KernelInvocationTemplateDeductionValueSemantics< Value > kernel;
+    KernelInvocationTemplateDeductionValueSemantics<Value> kernel;
 
-    Value const constValue{ };
+    Value const constValue{};
     REQUIRE(fixture(kernel, constValue, constValue, constValue));
 }
 
-TEMPLATE_LIST_TEST_CASE( "kernelFuntionObjectTemplateDeductionFromConstReference", "[kernel]", alpaka::test::acc::TestAccs)
+TEMPLATE_LIST_TEST_CASE("kernelFuntionObjectTemplateDeductionFromConstReference", "[kernel]", alpaka::test::TestAccs)
 {
     using Acc = TestType;
-    using Dim = alpaka::dim::Dim<Acc>;
-    using Idx = alpaka::idx::Idx<Acc>;
+    using Dim = alpaka::Dim<Acc>;
+    using Idx = alpaka::Idx<Acc>;
 
-    alpaka::test::KernelExecutionFixture<Acc> fixture(
-        alpaka::vec::Vec<Dim, Idx>::ones());
+    alpaka::test::KernelExecutionFixture<Acc> fixture(alpaka::Vec<Dim, Idx>::ones());
 
     using Value = std::int32_t;
-    KernelInvocationTemplateDeductionValueSemantics< Value > kernel;
+    KernelInvocationTemplateDeductionValueSemantics<Value> kernel;
 
-    Value value{ };
-    Value const & constReference = value;
+    Value value{};
+    Value const& constReference = value;
     REQUIRE(fixture(kernel, constReference, constReference, constReference));
 }
 
 //#############################################################################
-template<
-    typename TExpectedFirst,
-    typename TExpectedSecond = TExpectedFirst
->
+template<typename TExpectedFirst, typename TExpectedSecond = TExpectedFirst>
 class KernelInvocationTemplateDeductionPointerSemantics
 {
 public:
     //-----------------------------------------------------------------------------
     ALPAKA_NO_HOST_ACC_WARNING
-        template<
-        typename Acc,
-        typename TByPointer,
-        typename TByPointerToConst>
-        ALPAKA_FN_ACC auto operator()(
-            Acc const & acc,
-            bool * success,
-            TByPointer *,
-            TByPointerToConst const *) const
-        -> void
+    template<typename Acc, typename TByPointer, typename TByPointerToConst>
+    ALPAKA_FN_ACC auto operator()(Acc const& acc, bool* success, TByPointer*, TByPointerToConst const*) const -> void
     {
         ALPAKA_CHECK(
             *success,
-            static_cast<alpaka::idx::Idx<Acc>>(1) == (alpaka::workdiv::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc)).prod());
+            static_cast<alpaka::Idx<Acc>>(1) == (alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc)).prod());
 
         static_assert(
             std::is_same<TByPointer, TExpectedFirst>::value,
@@ -135,73 +114,68 @@ class KernelInvocationTemplateDeductionPointerSemantics
         static_assert(
             std::is_same<TByPointerToConst, TExpectedSecond>::value,
             "Incorrect second additional kernel template parameter type!");
-
     }
 };
 
 //-----------------------------------------------------------------------------
-TEMPLATE_LIST_TEST_CASE( "kernelFuntionObjectTemplateDeductionFromPointer", "[kernel]", alpaka::test::acc::TestAccs)
+TEMPLATE_LIST_TEST_CASE("kernelFuntionObjectTemplateDeductionFromPointer", "[kernel]", alpaka::test::TestAccs)
 {
     using Acc = TestType;
-    using Dim = alpaka::dim::Dim<Acc>;
-    using Idx = alpaka::idx::Idx<Acc>;
+    using Dim = alpaka::Dim<Acc>;
+    using Idx = alpaka::Idx<Acc>;
 
-    alpaka::test::KernelExecutionFixture<Acc> fixture(
-        alpaka::vec::Vec<Dim, Idx>::ones());
+    alpaka::test::KernelExecutionFixture<Acc> fixture(alpaka::Vec<Dim, Idx>::ones());
 
     using Value = std::int32_t;
-    KernelInvocationTemplateDeductionPointerSemantics< Value > kernel;
+    KernelInvocationTemplateDeductionPointerSemantics<Value> kernel;
 
-    Value value{ };
-    Value * pointer = &value;
+    Value value{};
+    Value* pointer = &value;
     REQUIRE(fixture(kernel, pointer, pointer));
 }
 
-TEMPLATE_LIST_TEST_CASE( "kernelFuntionObjectTemplateDeductionFromPointerToConst", "[kernel]", alpaka::test::acc::TestAccs)
+TEMPLATE_LIST_TEST_CASE("kernelFuntionObjectTemplateDeductionFromPointerToConst", "[kernel]", alpaka::test::TestAccs)
 {
     using Acc = TestType;
-    using Dim = alpaka::dim::Dim<Acc>;
-    using Idx = alpaka::idx::Idx<Acc>;
+    using Dim = alpaka::Dim<Acc>;
+    using Idx = alpaka::Idx<Acc>;
 
-    alpaka::test::KernelExecutionFixture<Acc> fixture(
-        alpaka::vec::Vec<Dim, Idx>::ones());
+    alpaka::test::KernelExecutionFixture<Acc> fixture(alpaka::Vec<Dim, Idx>::ones());
 
     using Value = std::int32_t;
-    KernelInvocationTemplateDeductionPointerSemantics< Value const, Value > kernel;
+    KernelInvocationTemplateDeductionPointerSemantics<Value const, Value> kernel;
 
-    Value const constValue{ };
-    Value const * pointerToConst = &constValue;
+    Value const constValue{};
+    Value const* pointerToConst = &constValue;
     REQUIRE(fixture(kernel, pointerToConst, pointerToConst));
 }
 
-TEMPLATE_LIST_TEST_CASE( "kernelFuntionObjectTemplateDeductionFromStaticArray", "[kernel]", alpaka::test::acc::TestAccs)
+TEMPLATE_LIST_TEST_CASE("kernelFuntionObjectTemplateDeductionFromStaticArray", "[kernel]", alpaka::test::TestAccs)
 {
     using Acc = TestType;
-    using Dim = alpaka::dim::Dim<Acc>;
-    using Idx = alpaka::idx::Idx<Acc>;
+    using Dim = alpaka::Dim<Acc>;
+    using Idx = alpaka::Idx<Acc>;
 
-    alpaka::test::KernelExecutionFixture<Acc> fixture(
-        alpaka::vec::Vec<Dim, Idx>::ones());
+    alpaka::test::KernelExecutionFixture<Acc> fixture(alpaka::Vec<Dim, Idx>::ones());
 
     using Value = std::int32_t;
-    KernelInvocationTemplateDeductionPointerSemantics< Value > kernel;
+    KernelInvocationTemplateDeductionPointerSemantics<Value> kernel;
 
-    Value staticArray[4] = { };
+    Value staticArray[4] = {};
     REQUIRE(fixture(kernel, staticArray, staticArray));
 }
 
-TEMPLATE_LIST_TEST_CASE( "kernelFuntionObjectTemplateDeductionFromConstStaticArray", "[kernel]", alpaka::test::acc::TestAccs)
+TEMPLATE_LIST_TEST_CASE("kernelFuntionObjectTemplateDeductionFromConstStaticArray", "[kernel]", alpaka::test::TestAccs)
 {
     using Acc = TestType;
-    using Dim = alpaka::dim::Dim<Acc>;
-    using Idx = alpaka::idx::Idx<Acc>;
+    using Dim = alpaka::Dim<Acc>;
+    using Idx = alpaka::Idx<Acc>;
 
-    alpaka::test::KernelExecutionFixture<Acc> fixture(
-        alpaka::vec::Vec<Dim, Idx>::ones());
+    alpaka::test::KernelExecutionFixture<Acc> fixture(alpaka::Vec<Dim, Idx>::ones());
 
     using Value = std::int32_t;
-    KernelInvocationTemplateDeductionPointerSemantics< Value const, Value > kernel;
+    KernelInvocationTemplateDeductionPointerSemantics<Value const, Value> kernel;
 
-    Value const constStaticArray[4] = { };
+    Value const constStaticArray[4] = {};
     REQUIRE(fixture(kernel, constStaticArray, constStaticArray));
 }
diff --git a/thirdParty/cupla/alpaka/test/unit/kernel/src/KernelWithoutTemplatedAccParam.cpp b/thirdParty/cupla/alpaka/test/unit/kernel/src/KernelWithoutTemplatedAccParam.cpp
index a855584e86..662d443399 100644
--- a/thirdParty/cupla/alpaka/test/unit/kernel/src/KernelWithoutTemplatedAccParam.cpp
+++ b/thirdParty/cupla/alpaka/test/unit/kernel/src/KernelWithoutTemplatedAccParam.cpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Axel Huebl, Benjamin Worpitz, Matthias Werner
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -8,35 +8,37 @@
  */
 
 #include <alpaka/kernel/Traits.hpp>
-
 #include <alpaka/test/KernelExecutionFixture.hpp>
 
 #include <catch2/catch.hpp>
 
 //#############################################################################
-//! It is not possible to use a alpaka kernel function object without a templated operator() when the CUDA accelerator is hard-coded.
+//! It is not possible to use an alpaka kernel function object without a templated operator() when the CUDA accelerator
+//! is hard-coded.
 //!
 //! However, compiling such kernels with a CPU device works fine.
 //!
 //! When the CUDA accelerator is used, the following error is triggered:
-//! /alpaka/include/alpaka/workdiv/Traits.hpp(...): error: calling a __device__ function("getWorkDiv") from a __host__ __device__ function("getWorkDiv") is not allowed
-//! The kernel function objects function call operator is attributed with ALPAKA_FN_ACC which is identical to __host__ __device__.
-//! The 'alpaka::workdiv::getWorkDiv<...>(acc)' function that is called has the ALPAKA_FN_HOST_ACC attribute (also equal to __host__ __device__).
-//! The underlying trait calls the CUDA specialized method which has the __device__ attribute.
-//! Because this call chain does not contain any templates and therefore no calls depending on input types,
-//! everything can be resolved at the first time the template is parsed which results in the given error.
+//! /alpaka/include/alpaka/workdiv/Traits.hpp(...): error: calling a __device__ function("getWorkDiv") from a __host__
+//! __device__ function("getWorkDiv") is not allowed. The kernel function object's function call operator is attributed
+//! with ALPAKA_FN_ACC which is identical to __host__ __device__. The 'alpaka::getWorkDiv<...>(acc)' function that is
+//! called has the ALPAKA_FN_HOST_ACC attribute (also equal to __host__ __device__). The underlying trait calls the
+//! CUDA specialized method which has the __device__ attribute. Because this call chain does not contain any templates
+//! and therefore no calls depending on input types, everything can be resolved at the first time the template is
+//! parsed which results in the given error.
 //!
-//! Currently, the only possible way to solve this is to make the function call operator a template nonetheless by providing an unused template parameter.
+//! Currently, the only possible way to solve this is to make the function call operator a template nonetheless by
+//! providing an unused template parameter.
 
-using Dim = alpaka::dim::DimInt<2u>;
+using Dim = alpaka::DimInt<2u>;
 using Idx = std::uint32_t;
 #if defined(ALPAKA_ACC_CPU_SERIAL_ENABLED)
-using AccCpu = alpaka::acc::AccCpuSerial<Dim, Idx>;
+using AccCpu = alpaka::AccCpuSerial<Dim, Idx>;
 #endif
 #if defined(ALPAKA_ACC_GPU_HIP_ENABLED) && BOOST_LANG_HIP
-using AccGpu = alpaka::acc::AccGpuHipRt<Dim, Idx>;
+using AccGpu = alpaka::AccGpuHipRt<Dim, Idx>;
 #elif defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && BOOST_LANG_CUDA
-using AccGpu = alpaka::acc::AccGpuCudaRt<Dim, Idx>;
+using AccGpu = alpaka::AccGpuCudaRt<Dim, Idx>;
 #endif
 
 #if defined(ALPAKA_ACC_CPU_SERIAL_ENABLED)
@@ -45,22 +47,18 @@ struct KernelNoTemplateCpu
 {
     //-----------------------------------------------------------------------------
     ALPAKA_FN_ACC
-    auto operator()(
-        AccCpu const & acc,
-        bool* success) const
-    -> void
+    auto operator()(AccCpu const& acc, bool* success) const -> void
     {
         ALPAKA_CHECK(
             *success,
-            static_cast<alpaka::idx::Idx<AccCpu>>(1) == (alpaka::workdiv::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc)).prod());
+            static_cast<alpaka::Idx<AccCpu>>(1) == (alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc)).prod());
     }
 };
 
 //-----------------------------------------------------------------------------
 TEST_CASE("kernelNoTemplateCpu", "[kernel]")
 {
-    alpaka::test::KernelExecutionFixture<AccCpu> fixture(
-        alpaka::vec::Vec<Dim, Idx>::ones());
+    alpaka::test::KernelExecutionFixture<AccCpu> fixture(alpaka::Vec<Dim, Idx>::ones());
 
     KernelNoTemplateCpu kernel;
 
@@ -82,7 +80,7 @@ struct KernelNoTemplateGpu
     {
         ALPAKA_CHECK(
             *success,
-            static_cast<alpaka::idx::Idx<AccGpu>>(1) == (alpaka::workdiv::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc)).prod());
+            static_cast<alpaka::Idx<AccGpu>>(1) == (alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc)).prod());
     }
 };
 
@@ -90,7 +88,7 @@ struct KernelNoTemplateGpu
 TEST_CASE("kernelNoTemplateGpu", "[kernel]")
 {
     alpaka::test::KernelExecutionFixture<AccGpu> fixture(
-        alpaka::vec::Vec<Dim, Idx>::ones());
+        alpaka::Vec<Dim, Idx>::ones());
 
     KernelNoTemplateGpu kernel;
 
@@ -103,25 +101,19 @@ TEST_CASE("kernelNoTemplateGpu", "[kernel]")
 struct KernelWithoutTemplateParamCpu
 {
     //-----------------------------------------------------------------------------
-    template<
-        typename TNotUsed = void>
-    ALPAKA_FN_ACC
-    auto operator()(
-        AccCpu const & acc,
-        bool* success) const
-    -> void
+    template<typename TNotUsed = void>
+    ALPAKA_FN_ACC auto operator()(AccCpu const& acc, bool* success) const -> void
     {
         ALPAKA_CHECK(
             *success,
-            static_cast<alpaka::idx::Idx<AccCpu>>(1) == (alpaka::workdiv::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc)).prod());
+            static_cast<alpaka::Idx<AccCpu>>(1) == (alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc)).prod());
     }
 };
 
 //-----------------------------------------------------------------------------
 TEST_CASE("kernelWithoutTemplateParamCpu", "[kernel]")
 {
-    alpaka::test::KernelExecutionFixture<AccCpu> fixture(
-        alpaka::vec::Vec<Dim, Idx>::ones());
+    alpaka::test::KernelExecutionFixture<AccCpu> fixture(alpaka::Vec<Dim, Idx>::ones());
 
     KernelWithoutTemplateParamCpu kernel;
 
@@ -129,31 +121,24 @@ TEST_CASE("kernelWithoutTemplateParamCpu", "[kernel]")
 }
 #endif
 
-#if (defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && BOOST_LANG_CUDA) \
-  || (defined(ALPAKA_ACC_GPU_HIP_ENABLED) && BOOST_LANG_HIP)
+#if(defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && BOOST_LANG_CUDA) || (defined(ALPAKA_ACC_GPU_HIP_ENABLED) && BOOST_LANG_HIP)
 //#############################################################################
 struct KernelWithoutTemplateParamGpu
 {
     //-----------------------------------------------------------------------------
-    template<
-        typename TNotUsed = void>
-    ALPAKA_FN_ACC
-    auto operator()(
-        AccGpu const & acc,
-        bool* success) const
-    -> void
+    template<typename TNotUsed = void>
+    ALPAKA_FN_ACC auto operator()(AccGpu const& acc, bool* success) const -> void
     {
         ALPAKA_CHECK(
             *success,
-            static_cast<alpaka::idx::Idx<AccGpu>>(1) == (alpaka::workdiv::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc)).prod());
+            static_cast<alpaka::Idx<AccGpu>>(1) == (alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc)).prod());
     }
 };
 
 //-----------------------------------------------------------------------------
 TEST_CASE("kernelWithoutTemplateParamGpu", "[kernel]")
 {
-    alpaka::test::KernelExecutionFixture<AccGpu> fixture(
-        alpaka::vec::Vec<Dim, Idx>::ones());
+    alpaka::test::KernelExecutionFixture<AccGpu> fixture(alpaka::Vec<Dim, Idx>::ones());
 
     KernelWithoutTemplateParamGpu kernel;
 
diff --git a/thirdParty/cupla/alpaka/test/unit/math/CMakeLists.txt b/thirdParty/cupla/alpaka/test/unit/math/CMakeLists.txt
new file mode 100644
index 0000000000..c18bb6eea1
--- /dev/null
+++ b/thirdParty/cupla/alpaka/test/unit/math/CMakeLists.txt
@@ -0,0 +1,46 @@
+#
+# Copyright 2017-2019 Benjamin Worpitz, Jakob Krude
+#
+# This file is part of alpaka.
+#
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+#
+
+set(_TARGET_NAME "math")
+
+append_recursive_files_add_to_src_group("src/" "src/" "cpp" _FILES_SOURCE)
+append_recursive_files_add_to_src_group("src/" "src/" "hpp" _FILES_HEADER)
+
+if(ALPAKA_ACC_GPU_CUDA_ENABLE)
+    list(REMOVE_ITEM
+            CUDA_NVCC_FLAGS "--ftz=true" "--prec-div=false" "--prec-sqrt=false" "--fmad=true" "--use_fast_math" "-use_fast_math")
+
+endif()
+if(ALPAKA_ACC_GPU_HIP_ENABLE)
+    list(REMOVE_ITEM
+            HIP_NVCC_FLAGS "--ftz=true" "--prec-div=false" "--prec-sqrt=false" "--fmad=true" "--use_fast_math" "-use_fast_math")
+endif()
+
+alpaka_add_executable(
+    ${_TARGET_NAME}
+    ${_FILES_SOURCE}
+    ${_FILES_HEADER})
+target_include_directories(
+    ${_TARGET_NAME}
+    PRIVATE ${Boost_INCLUDE_DIRS})
+target_link_libraries(
+    ${_TARGET_NAME}
+    PRIVATE common)
+set_target_properties(
+    ${_TARGET_NAME}
+    PROPERTIES
+    COMPILE_OPTIONS
+    $<$<CXX_COMPILER_ID:clang>:"-ffp-contract=off"> # ffp-contract: https://llvm.org/docs/CompileCudaWithLLVM.html#id5
+    )
+
+set_target_properties(${_TARGET_NAME} PROPERTIES FOLDER "test/unit")
+target_compile_definitions(${_TARGET_NAME} PRIVATE "-DTEST_UNIT_MATH")
+
+add_test(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME} ${_ALPAKA_TEST_OPTIONS})
diff --git a/thirdParty/cupla/alpaka/test/unit/math/sincos/CMakeLists.txt b/thirdParty/cupla/alpaka/test/unit/math/sincos/CMakeLists.txt
deleted file mode 100644
index dfdf0b12f1..0000000000
--- a/thirdParty/cupla/alpaka/test/unit/math/sincos/CMakeLists.txt
+++ /dev/null
@@ -1,27 +0,0 @@
-#
-# Copyright 2017-2019 Benjamin Worpitz, Axel Huebl, Matthias Werner
-#
-# This file is part of Alpaka.
-#
-# This Source Code Form is subject to the terms of the Mozilla Public
-# License, v. 2.0. If a copy of the MPL was not distributed with this
-# file, You can obtain one at http://mozilla.org/MPL/2.0/.
-#
-
-SET(_TARGET_NAME "sincos")
-
-append_recursive_files_add_to_src_group("src/" "src/" "cpp" _FILES_SOURCE)
-
-ALPAKA_ADD_EXECUTABLE(
-    ${_TARGET_NAME}
-    ${_FILES_SOURCE})
-TARGET_INCLUDE_DIRECTORIES(
-    ${_TARGET_NAME}
-    PRIVATE ${Boost_INCLUDE_DIRS})
-TARGET_LINK_LIBRARIES(
-    ${_TARGET_NAME}
-    PRIVATE common)
-
-SET_TARGET_PROPERTIES(${_TARGET_NAME} PROPERTIES FOLDER "test/unit")
-
-ADD_TEST(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME} ${_ALPAKA_TEST_OPTIONS})
diff --git a/thirdParty/cupla/alpaka/test/unit/math/sincos/src/sincos.cpp b/thirdParty/cupla/alpaka/test/unit/math/sincos/src/sincos.cpp
deleted file mode 100644
index 5dbd4ff10f..0000000000
--- a/thirdParty/cupla/alpaka/test/unit/math/sincos/src/sincos.cpp
+++ /dev/null
@@ -1,81 +0,0 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Matthias Werner, René Widera
- *
- * This file is part of Alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-
-#include <alpaka/math/sincos/Traits.hpp>
-
-#include <alpaka/test/acc/TestAccs.hpp>
-#include <alpaka/test/queue/Queue.hpp>
-#include <alpaka/test/KernelExecutionFixture.hpp>
-
-#include <catch2/catch.hpp>
-
-// https://en.cppreference.com/w/cpp/types/numeric_limits/epsilon
-template <typename TAcc, typename FP>
-ALPAKA_FN_ACC
-typename std::enable_if< !std::numeric_limits<FP>::is_integer, bool >::type
-almost_equal(TAcc const & acc, FP x, FP y, int ulp)
-{
-    // the machine epsilon has to be scaled to the magnitude of the values used
-    // and multiplied by the desired precision in ULPs (units in the last place)
-    return alpaka::math::abs(acc, x-y)
-        <= std::numeric_limits<FP>::epsilon() * alpaka::math::abs(acc, x+y) * ulp
-        // unless the result is subnormal
-        || alpaka::math::abs(acc, x-y) < std::numeric_limits<FP>::min();
-}
-
-
-class SinCosTestKernel
-{
-public:
-    //-----------------------------------------------------------------------------
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<
-    typename TAcc,
-    typename FP
-    >
-    ALPAKA_FN_ACC auto operator()(
-        TAcc const & acc,
-        bool * success,
-        FP const arg) const
-    -> void
-    {
-        // if arg is hardcoded then compiler can optimize it out
-        // (PTX kernel (float) was just empty)
-        FP check_sin = alpaka::math::sin(acc, arg);
-        FP check_cos = alpaka::math::cos(acc, arg);
-        FP result_sin = 0.;
-        FP result_cos = 0.;
-        alpaka::math::sincos(acc, arg, result_sin, result_cos);
-        ALPAKA_CHECK(*success,
-                     almost_equal(acc, result_sin, check_sin, 1)
-                     &&
-                     almost_equal(acc, result_cos, check_cos, 1)
-            );
-    }
-};
-
-using TestAccs = alpaka::test::acc::EnabledAccs<
-    alpaka::dim::DimInt<1u>,
-    std::size_t>;
-
-//-----------------------------------------------------------------------------
-TEMPLATE_LIST_TEST_CASE( "sincos", "[sincos]", TestAccs)
-{
-    using Acc = TestType;
-    using Dim = alpaka::dim::Dim<Acc>;
-    using Idx = alpaka::idx::Idx<Acc>;
-
-    alpaka::test::KernelExecutionFixture<Acc> fixture(
-        alpaka::vec::Vec<Dim, Idx>::ones());
-
-    SinCosTestKernel kernel;
-
-    REQUIRE(fixture( kernel, 0.42f )); // float
-    REQUIRE(fixture( kernel, 0.42 ));  // double
-}
diff --git a/thirdParty/cupla/alpaka/test/unit/math/src/Buffer.hpp b/thirdParty/cupla/alpaka/test/unit/math/src/Buffer.hpp
new file mode 100644
index 0000000000..cbd447d208
--- /dev/null
+++ b/thirdParty/cupla/alpaka/test/unit/math/src/Buffer.hpp
@@ -0,0 +1,117 @@
+/** Copyright 2019 Jakob Krude, Benjamin Worpitz
+ *
+ * This file is part of alpaka.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#pragma once
+
+#include "Defines.hpp"
+
+#include <alpaka/test/acc/TestAccs.hpp>
+
+#include <ostream>
+
+namespace alpaka
+{
+    namespace test
+    {
+        namespace unit
+        {
+            namespace math
+            {
+                //! Owns a host buffer and a device buffer of Tcapacity elements each.
+                //! TData can be a plain value or a complex data-structure.
+                //! operator() is overloaded and returns a reference into the matching buffer:
+                //! the host buffer for (index), the device buffer for (index, acc).
+                //! Indices are not range-checked.
+                //! @brief Encapsulates buffer allocation and the copies between host and device.
+                //! @tparam TAcc The accelerator type; it determines the device, Dim and Idx types.
+                //! @tparam TData The element type stored in both buffers.
+                //! @tparam Tcapacity The number of elements in each buffer.
+                template<typename TAcc, typename TData, size_t Tcapacity>
+                struct Buffer
+                {
+                    using value_type = TData;
+                    static constexpr size_t capacity = Tcapacity;
+                    using Dim = typename alpaka::traits::DimType<TAcc>::type;
+                    using Idx = typename alpaka::traits::IdxType<TAcc>::type;
+
+                    // Type aliases for the alpaka devices and buffers.
+                    using DevAcc = alpaka::Dev<TAcc>;
+                    using DevHost = alpaka::DevCpu;
+                    using PltfHost = alpaka::Pltf<DevHost>;
+
+                    using BufHost = alpaka::Buf<DevHost, TData, Dim, Idx>;
+                    using BufAcc = alpaka::Buf<DevAcc, TData, Dim, Idx>;
+
+                    DevHost devHost;
+
+                    BufHost hostBuffer;
+                    BufAcc devBuffer;
+
+                    // Native pointers into hostBuffer and devBuffer.
+                    TData* const pHostBuffer;
+                    TData* const pDevBuffer;
+
+
+                    // A default constructor cannot be provided,
+                    // because BufHost and BufAcc need to be initialised.
+                    Buffer() = delete;
+
+                    // Allocates both buffers and caches their native pointers.
+                    Buffer(const DevAcc& devAcc)
+                        : devHost{alpaka::getDevByIdx<PltfHost>(0u)}
+                        , hostBuffer{alpaka::allocBuf<TData, Idx>(devHost, Tcapacity)}
+                        , devBuffer{alpaka::allocBuf<TData, Idx>(devAcc, Tcapacity)}
+                        , pHostBuffer{alpaka::getPtrNative(hostBuffer)}
+                        , pDevBuffer{alpaka::getPtrNative(devBuffer)}
+                    {
+                    }
+
+                    // Copy Host -> Acc (all Tcapacity elements, enqueued on the given queue).
+                    template<typename Queue>
+                    auto copyToDevice(Queue queue) -> void
+                    {
+                        alpaka::memcpy(queue, devBuffer, hostBuffer, Tcapacity);
+                    }
+
+                    // Copy Acc -> Host (all Tcapacity elements, enqueued on the given queue).
+                    template<typename Queue>
+                    auto copyFromDevice(Queue queue) -> void
+                    {
+                        alpaka::memcpy(queue, hostBuffer, devBuffer, Tcapacity);
+                    }
+
+                    ALPAKA_FN_ACC
+                    auto operator()(size_t idx, TAcc const& acc) const -> TData&
+                    {
+                        alpaka::ignore_unused(acc);
+                        return pDevBuffer[idx];
+                    }
+
+                    ALPAKA_FN_HOST
+                    auto operator()(size_t idx) const -> TData&
+                    {
+                        return pHostBuffer[idx];
+                    }
+
+                    ALPAKA_FN_HOST
+                    friend std::ostream& operator<<(std::ostream& os, const Buffer& buffer)
+                    {
+                        os << "capacity: " << capacity << "\n";
+                        for(size_t i = 0; i < capacity; ++i)
+                        {
+                            os << i << ": " << buffer.pHostBuffer[i] << "\n";
+                        }
+                        return os;
+                    }
+                };
+
+            } // namespace math
+        } // namespace unit
+    } // namespace test
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/test/unit/math/src/DataGen.hpp b/thirdParty/cupla/alpaka/test/unit/math/src/DataGen.hpp
new file mode 100644
index 0000000000..fea3064a33
--- /dev/null
+++ b/thirdParty/cupla/alpaka/test/unit/math/src/DataGen.hpp
@@ -0,0 +1,133 @@
+/** Copyright 2019 Jakob Krude, Benjamin Worpitz
+ *
+ * This file is part of alpaka.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#pragma once
+
+#include "Defines.hpp"
+
+#include <cassert>
+#include <limits>
+#include <random>
+
+namespace alpaka
+{
+    namespace test
+    {
+        namespace unit
+        {
+            namespace math
+            {
+                /**
+                 * Fills buffer with random numbers (host-only).
+                 *
+                 * @tparam TData The used data-type (float || double).
+                 * @tparam TArgs The type of the args-buffer to be filled.
+                 * @tparam TFunctor The used Functor-type.
+                 * @param args The buffer that should be filled; written through args(i).arg[k].
+                 * @param functor The Functor; functor.ranges[k] selects the value range of argument k.
+                 * @param seed The seed for the random engine.
+                 */
+                template<typename TData, typename TArgs, typename TFunctor>
+                auto fillWithRndArgs(TArgs& args, TFunctor functor, unsigned int const& seed) -> void
+                {
+                    /*
+                     * Each "sub-buffer" is filled with zero and/or max and/or lowest,
+                     * depending on the specified range (at [0] - [2]).
+                     *
+                     * Every switch case needs to set matchedSwitch!
+                     * If no switch case was matched the assert below will be triggered.
+                     *
+                     * This function is easily extendable. It is only necessary to add extra
+                     * definitions in the switch case, for more Range-types.
+                     */
+                    static_assert(
+                        TArgs::value_type::arity == TFunctor::arity,
+                        "Buffer properties must match TFunctor::arity");
+                    static_assert(TArgs::capacity > 2, "Set of args must provide > 2 entries.");
+                    constexpr auto max = std::numeric_limits<TData>::max();
+                    constexpr auto low = std::numeric_limits<TData>::lowest();
+                    std::default_random_engine eng{static_cast<std::default_random_engine::result_type>(seed)};
+
+                    // These pseudo-random numbers are implementation/platform specific!
+                    std::uniform_real_distribution<TData> dist(0, 1000);
+                    std::uniform_real_distribution<TData> distOne(-1, 1);
+                    for(size_t k = 0; k < TFunctor::arity_nr; ++k)
+                    {
+                        bool matchedSwitch = false;
+                        switch(functor.ranges[k])
+                        {
+                        case Range::OneNeighbourhood:
+                            matchedSwitch = true;
+                            for(size_t i = 0; i < TArgs::capacity; ++i)
+                            {
+                                args(i).arg[k] = distOne(eng);
+                            }
+                            break;
+
+                        case Range::PositiveOnly:
+                            matchedSwitch = true;
+                            args(0).arg[k] = max;
+                            for(size_t i = 1; i < TArgs::capacity; ++i)
+                            {
+                                args(i).arg[k] = dist(eng) + static_cast<TData>(1);
+                            }
+                            break;
+
+                        case Range::PositiveAndZero:
+                            matchedSwitch = true;
+                            args(0).arg[k] = 0.0;
+                            args(1).arg[k] = max;
+                            for(size_t i = 2; i < TArgs::capacity; ++i)
+                            {
+                                args(i).arg[k] = dist(eng);
+                            }
+                            break;
+
+                        case Range::NotZero:
+                            matchedSwitch = true;
+                            args(0).arg[k] = max;
+                            args(1).arg[k] = low;
+                            for(size_t i = 2; i < TArgs::capacity; ++i)
+                            {
+                                TData arg;
+                                do
+                                {
+                                    arg = dist(eng);
+                                } while(std::equal_to<TData>()(arg, 0)); // redraw exact zeros: Range::NotZero
+                                if(i % 2 == 0)
+                                    args(i).arg[k] = arg;
+                                else
+                                    args(i).arg[k] = -arg;
+                            }
+                            break;
+
+                        case Range::Unrestricted:
+                            matchedSwitch = true;
+                            args(0).arg[k] = 0.0;
+                            args(1).arg[k] = max;
+                            args(2).arg[k] = low;
+                            for(size_t i = 3; i < TArgs::capacity; ++i)
+                            {
+                                if(i % 2 == 0)
+                                    args(i).arg[k] = dist(eng);
+                                else
+                                    args(i).arg[k] = -dist(eng);
+                            }
+                            break;
+                        }
+                        // disable gcc-warning "unused variable"
+                        alpaka::ignore_unused(matchedSwitch);
+                        assert(matchedSwitch);
+                    }
+                }
+
+            } // namespace math
+        } // namespace unit
+    } // namespace test
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/test/unit/math/src/Defines.hpp b/thirdParty/cupla/alpaka/test/unit/math/src/Defines.hpp
new file mode 100644
index 0000000000..f3b1489269
--- /dev/null
+++ b/thirdParty/cupla/alpaka/test/unit/math/src/Defines.hpp
@@ -0,0 +1,70 @@
+/** Copyright 2019 Jakob Krude, Benjamin Worpitz
+ *
+ * This file is part of alpaka.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#pragma once
+
+#include <cmath>
+#include <iomanip>
+#include <iostream>
+#include <limits>
+
+namespace alpaka
+{
+    namespace test
+    {
+        namespace unit
+        {
+            namespace math
+            {
+                // New ranges also need a case in the switch statement in DataGen.hpp.
+                enum class Range
+                {
+                    OneNeighbourhood,
+                    PositiveOnly,
+                    PositiveAndZero,
+                    NotZero,
+                    Unrestricted
+                };
+
+                // New arities also need a matching operator() overload in Functor.hpp.
+                enum class Arity
+                {
+                    Unary = 1,
+                    Binary = 2
+                };
+
+                template<typename T, Arity Tarity>
+                struct ArgsItem
+                {
+                    static constexpr Arity arity = Tarity;
+                    static constexpr size_t arity_nr = static_cast<size_t>(Tarity);
+
+                    T arg[arity_nr]; // holds the arity_nr arguments arg0, arg1, ...
+
+                    friend std::ostream& operator<<(std::ostream& os, const ArgsItem& argsItem)
+                    {
+                        os.precision(17); // overridden by the setprecision applied before each element below
+                        os << "[ ";
+                        for(size_t i = 0; i < argsItem.arity_nr; ++i)
+                            os << std::setprecision(std::numeric_limits<T>::digits10 + 1) << argsItem.arg[i] << ", ";
+                        os << "]";
+                        return os;
+                    }
+                };
+
+                template<typename T>
+                auto rsqrt(T const& arg) -> decltype(std::sqrt(arg))
+                {
+                    return static_cast<T>(1) / std::sqrt(arg); // reciprocal square root via std::sqrt
+                }
+
+            } // namespace math
+        } // namespace unit
+    } // namespace test
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/test/unit/math/src/Functor.hpp b/thirdParty/cupla/alpaka/test/unit/math/src/Functor.hpp
new file mode 100644
index 0000000000..2f65fef163
--- /dev/null
+++ b/thirdParty/cupla/alpaka/test/unit/math/src/Functor.hpp
@@ -0,0 +1,230 @@
+/** Copyright 2019 Jakob Krude, Benjamin Worpitz
+ *
+ * This file is part of alpaka.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#pragma once
+
+#include "Defines.hpp"
+
+#include <alpaka/alpaka.hpp>
+
+#include <type_traits>
+
+namespace alpaka
+{
+    namespace test
+    {
+        namespace unit
+        {
+            namespace math
+            {
+// Defines a functor whose operator() calls either the std. function or the
+// equivalent alpaka function (if an accelerator is passed additionally).
+//! @param NAME The name of the generated functor struct, e.g. OpAbs.
+//! @param ARITY The Arity enumerator, e.g. Arity::Unary or Arity::Binary.
+//! @param STD_OP Function used for the host side, e.g. std::abs.
+//! @param ALPAKA_OP Function used for the device side, e.g. alpaka::math::abs.
+//! @param ... List of Ranges, one per argument. Needs to match the arity.
+#define ALPAKA_TEST_MATH_OP_FUNCTOR(NAME, ARITY, STD_OP, ALPAKA_OP, ...)                                              \
+    struct NAME                                                                                                       \
+    {                                                                                                                 \
+        /* ranges is not a constexpr, so that it's accessible via for loop*/                                          \
+        static constexpr Arity arity = ARITY;                                                                         \
+        static constexpr size_t arity_nr = static_cast<size_t>(ARITY);                                                \
+        const Range ranges[arity_nr] = {__VA_ARGS__};                                                                 \
+                                                                                                                      \
+        ALPAKA_NO_HOST_ACC_WARNING                                                                                    \
+        template<                                                                                                     \
+            typename TAcc,                                                                                            \
+            typename... TArgs, /* SFINAE: Enables if called from device. */                                           \
+            typename std::enable_if<!std::is_same<TAcc, std::nullptr_t>::value, int>::type = 0>                       \
+        ALPAKA_FN_ACC auto execute(TAcc const& acc, TArgs const&... args) const                                       \
+        {                                                                                                             \
+            return ALPAKA_OP(acc, args...);                                                                           \
+        }                                                                                                             \
+                                                                                                                      \
+        ALPAKA_NO_HOST_ACC_WARNING                                                                                    \
+        template<                                                                                                     \
+            typename TAcc = std::nullptr_t,                                                                           \
+            typename... TArgs, /* SFINAE: Enables if called from host. */                                             \
+            typename std::enable_if<std::is_same<TAcc, std::nullptr_t>::value, int>::type = 0>                        \
+        ALPAKA_FN_HOST auto execute(TAcc const& acc, TArgs const&... args) const                                      \
+        {                                                                                                             \
+            alpaka::ignore_unused(acc);                                                                               \
+            return STD_OP(args...);                                                                                   \
+        }                                                                                                             \
+                                                                                                                      \
+        /* assigns args by arity */                                                                                   \
+        ALPAKA_NO_HOST_ACC_WARNING                                                                                    \
+        template<typename T, typename TAcc = std::nullptr_t>                                                          \
+        ALPAKA_FN_HOST_ACC auto operator()(ArgsItem<T, Arity::Unary> const& args, TAcc const& acc = nullptr) const    \
+        {                                                                                                             \
+            return execute(acc, args.arg[0]);                                                                         \
+        }                                                                                                             \
+                                                                                                                      \
+        /* assigns args by arity */                                                                                   \
+        ALPAKA_NO_HOST_ACC_WARNING                                                                                    \
+        template<typename T, typename TAcc = std::nullptr_t>                                                          \
+        ALPAKA_FN_HOST_ACC auto operator()(ArgsItem<T, Arity::Binary> const& args, TAcc const& acc = nullptr) const   \
+        {                                                                                                             \
+            return execute(acc, args.arg[0], args.arg[1]);                                                            \
+        }                                                                                                             \
+                                                                                                                      \
+        friend std::ostream& operator<<(std::ostream& out, const NAME& op)                                            \
+        {                                                                                                             \
+            out << #NAME;                                                                                             \
+            alpaka::ignore_unused(op);                                                                                \
+            return out;                                                                                               \
+        }                                                                                                             \
+    };
+
+
+                ALPAKA_TEST_MATH_OP_FUNCTOR(OpAbs, Arity::Unary, std::abs, alpaka::math::abs, Range::Unrestricted)
+
+                ALPAKA_TEST_MATH_OP_FUNCTOR(
+                    OpAcos,
+                    Arity::Unary,
+                    std::acos,
+                    alpaka::math::acos,
+                    Range::OneNeighbourhood)
+
+                ALPAKA_TEST_MATH_OP_FUNCTOR(
+                    OpAsin,
+                    Arity::Unary,
+                    std::asin,
+                    alpaka::math::asin,
+                    Range::OneNeighbourhood)
+
+                ALPAKA_TEST_MATH_OP_FUNCTOR(OpAtan, Arity::Unary, std::atan, alpaka::math::atan, Range::Unrestricted)
+
+                ALPAKA_TEST_MATH_OP_FUNCTOR(OpCbrt, Arity::Unary, std::cbrt, alpaka::math::cbrt, Range::Unrestricted)
+
+                ALPAKA_TEST_MATH_OP_FUNCTOR(OpCeil, Arity::Unary, std::ceil, alpaka::math::ceil, Range::Unrestricted)
+
+                ALPAKA_TEST_MATH_OP_FUNCTOR(OpCos, Arity::Unary, std::cos, alpaka::math::cos, Range::Unrestricted)
+
+                ALPAKA_TEST_MATH_OP_FUNCTOR(OpErf, Arity::Unary, std::erf, alpaka::math::erf, Range::Unrestricted)
+
+                ALPAKA_TEST_MATH_OP_FUNCTOR(OpExp, Arity::Unary, std::exp, alpaka::math::exp, Range::Unrestricted)
+
+                ALPAKA_TEST_MATH_OP_FUNCTOR(
+                    OpFloor,
+                    Arity::Unary,
+                    std::floor,
+                    alpaka::math::floor,
+                    Range::Unrestricted)
+
+                ALPAKA_TEST_MATH_OP_FUNCTOR(OpLog, Arity::Unary, std::log, alpaka::math::log, Range::PositiveOnly)
+
+                ALPAKA_TEST_MATH_OP_FUNCTOR(
+                    OpRound,
+                    Arity::Unary,
+                    std::round,
+                    alpaka::math::round,
+                    Range::Unrestricted)
+
+                // There is no std implementation of rsqrt; the host version is defined in Defines.hpp.
+                ALPAKA_TEST_MATH_OP_FUNCTOR(
+                    OpRsqrt,
+                    Arity::Unary,
+                    alpaka::test::unit::math::rsqrt,
+                    alpaka::math::rsqrt,
+                    Range::PositiveOnly)
+
+                ALPAKA_TEST_MATH_OP_FUNCTOR(OpSin, Arity::Unary, std::sin, alpaka::math::sin, Range::Unrestricted)
+
+                ALPAKA_TEST_MATH_OP_FUNCTOR(
+                    OpSqrt,
+                    Arity::Unary,
+                    std::sqrt,
+                    alpaka::math::sqrt,
+                    Range::PositiveAndZero)
+
+                ALPAKA_TEST_MATH_OP_FUNCTOR(OpTan, Arity::Unary, std::tan, alpaka::math::tan, Range::Unrestricted)
+
+                ALPAKA_TEST_MATH_OP_FUNCTOR(
+                    OpTrunc,
+                    Arity::Unary,
+                    std::trunc,
+                    alpaka::math::trunc,
+                    Range::Unrestricted)
+
+                // All binary operators.
+                ALPAKA_TEST_MATH_OP_FUNCTOR(
+                    OpAtan2,
+                    Arity::Binary,
+                    std::atan2,
+                    alpaka::math::atan2,
+                    Range::NotZero,
+                    Range::NotZero)
+
+                ALPAKA_TEST_MATH_OP_FUNCTOR(
+                    OpFmod,
+                    Arity::Binary,
+                    std::fmod,
+                    alpaka::math::fmod,
+                    Range::Unrestricted,
+                    Range::NotZero)
+
+                ALPAKA_TEST_MATH_OP_FUNCTOR(
+                    OpMax,
+                    Arity::Binary,
+                    std::max,
+                    alpaka::math::max,
+                    Range::Unrestricted,
+                    Range::Unrestricted)
+
+                ALPAKA_TEST_MATH_OP_FUNCTOR(
+                    OpMin,
+                    Arity::Binary,
+                    std::min,
+                    alpaka::math::min,
+                    Range::Unrestricted,
+                    Range::Unrestricted)
+
+                ALPAKA_TEST_MATH_OP_FUNCTOR(
+                    OpPow,
+                    Arity::Binary,
+                    std::pow,
+                    alpaka::math::pow,
+                    Range::PositiveAndZero,
+                    Range::Unrestricted)
+
+                ALPAKA_TEST_MATH_OP_FUNCTOR(
+                    OpRemainder,
+                    Arity::Binary,
+                    std::remainder,
+                    alpaka::math::remainder,
+                    Range::Unrestricted,
+                    Range::NotZero)
+
+                using BinaryFunctors = std::tuple<OpAtan2, OpFmod, OpMax, OpMin, OpPow, OpRemainder>;
+
+                using UnaryFunctors = std::tuple<
+                    OpAbs,
+                    OpAcos,
+                    OpAsin,
+                    OpAtan,
+                    OpCbrt,
+                    OpCeil,
+                    OpCos,
+                    OpErf,
+                    OpExp,
+                    OpFloor,
+                    OpLog,
+                    OpRound,
+                    OpRsqrt,
+                    OpSin,
+                    OpSqrt,
+                    OpTan,
+                    OpTrunc>;
+
+            } // namespace math
+        } // namespace unit
+    } // namespace test
+} // namespace alpaka
diff --git a/thirdParty/cupla/alpaka/test/unit/math/src/math.cpp b/thirdParty/cupla/alpaka/test/unit/math/src/math.cpp
new file mode 100644
index 0000000000..6a2107dc5e
--- /dev/null
+++ b/thirdParty/cupla/alpaka/test/unit/math/src/math.cpp
@@ -0,0 +1,187 @@
+/** Copyright 2019 Jakob Krude, Benjamin Worpitz
+ *
+ * This file is part of alpaka.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#include "Buffer.hpp"
+#include "DataGen.hpp"
+#include "Defines.hpp"
+#include "Functor.hpp"
+
+#include <alpaka/test/KernelExecutionFixture.hpp>
+#include <alpaka/test/acc/TestAccs.hpp>
+#include <alpaka/test/queue/Queue.hpp>
+
+#include <catch2/catch.hpp>
+
+using TestAccs = alpaka::test::EnabledAccs<alpaka::DimInt<1u>, std::size_t>;
+
+using Functors
+    = alpaka::meta::Concatenate<alpaka::test::unit::math::UnaryFunctors, alpaka::test::unit::math::BinaryFunctors>;
+
+using TestAccFunctorTuples = alpaka::meta::CartesianProduct<std::tuple, TestAccs, Functors>;
+
+using DataTypes = std::tuple<float, double>;
+
+template<std::size_t TCapacity>
+struct TestKernel
+{
+    //! Writes results[i] = functor(args[i], acc) for every i in [0, TCapacity).
+    //! @tparam TAcc Accelerator. @tparam TFunctor Functor defined in Functor.hpp.
+    //! @param acc Accelerator given from alpaka. @param functor Invoked as functor(args[i], acc).
+    //! @param results Output array, indexed up to TCapacity - 1. @param args Input array, indexed likewise.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename TAcc, typename TResults, typename TFunctor, typename TArgs>
+    ALPAKA_FN_ACC auto operator()(TAcc const& acc, TResults* results, TFunctor const& functor, TArgs const* args)
+        const noexcept -> void
+    {
+        for(size_t i = 0; i < TCapacity; ++i) // plain sequential loop; no thread index is queried
+        {
+            results[i] = functor(args[i], acc);
+        }
+    }
+};
+
+//#############################################################################
+template<typename TAcc, typename TFunctor>
+struct TestTemplate // Tests TFunctor on TAcc; operator()<TData>() runs the whole test for one data type.
+{
+    template<typename TData>
+    auto operator()() -> void
+    {
+        std::random_device rd{};
+        auto const seed = rd(); // printed below and passed to fillWithRndArgs
+        std::cout << "testing"
+                  << " acc:" << typeid(TAcc).name() << " data type:" << typeid(TData).name()
+                  << " functor:" << typeid(TFunctor).name() << " seed:" << seed << std::endl;
+
+        // SETUP (type aliases and initialisation).
+        // DevAcc and DevHost are defined in Buffer.hpp too.
+        using DevAcc = alpaka::Dev<TAcc>;
+        using DevHost = alpaka::DevCpu;
+        using PltfAcc = alpaka::Pltf<DevAcc>;
+        using PltfHost = alpaka::Pltf<DevHost>;
+
+        using Dim = alpaka::DimInt<1u>;
+        using Idx = std::size_t;
+        using WorkDiv = alpaka::WorkDivMembers<Dim, Idx>;
+        using QueueAcc = alpaka::test::DefaultQueue<DevAcc>;
+        using TArgsItem = alpaka::test::unit::math::ArgsItem<TData, TFunctor::arity>;
+
+        static constexpr auto capacity = 1000; // number of argument items and of results per run
+
+        using Args = alpaka::test::unit::math::Buffer<TAcc, TArgsItem, capacity>;
+        using Results = alpaka::test::unit::math::Buffer<TAcc, TData, capacity>;
+
+        // Every functor is executed individually in its own kernel (work extent 1, one element per thread).
+        static constexpr size_t elementsPerThread = 1u;
+        static constexpr size_t sizeExtent = 1u;
+
+        DevAcc const devAcc{alpaka::getDevByIdx<PltfAcc>(0u)};
+        DevHost const devHost{alpaka::getDevByIdx<PltfHost>(0u)};
+
+        QueueAcc queue{devAcc};
+
+        TestKernel<capacity> kernel;
+        TFunctor functor;
+        Args args{devAcc};
+        Results results{devAcc};
+
+        WorkDiv const workDiv{alpaka::getValidWorkDiv<TAcc>(
+            devAcc,
+            sizeExtent,
+            elementsPerThread,
+            false,
+            alpaka::GridBlockExtentSubDivRestrictions::Unrestricted)};
+        // SETUP COMPLETED.
+
+        // Fill the args buffer with random test numbers and preset every result to NaN.
+        alpaka::test::unit::math::fillWithRndArgs<TData>(args, functor, seed);
+        for(size_t i = 0; i < Results::capacity; ++i)
+            results(i) = static_cast<TData>(std::nan(""));
+
+        // Copy both buffers to the device.
+        args.copyToDevice(queue);
+        results.copyToDevice(queue);
+
+        auto const taskKernel(
+            alpaka::createTaskKernel<TAcc>(workDiv, kernel, results.pDevBuffer, functor, args.pDevBuffer));
+        // Enqueue the kernel execution task.
+        alpaka::enqueue(queue, taskKernel);
+        // Copy back the results (encapsulated in the buffer class).
+        results.copyFromDevice(queue);
+        alpaka::wait(queue);
+        std::cout.precision(std::numeric_limits<TData>::digits10 + 1);
+
+        INFO("Operator: " << functor)
+        INFO("Type: " << typeid(TData).name()) // The typeid name is compiler specific.
+#if ALPAKA_DEBUG_FULL
+        INFO("The args buffer: \n" << std::setprecision(std::numeric_limits<TData>::digits10 + 1) << args << "\n")
+#endif
+        for(size_t i = 0; i < Args::capacity; ++i)
+        {
+            INFO("Idx i: " << i)
+            TData std_result = functor(args(i)); // reference value: functor invoked without acc
+            REQUIRE(results(i) == Approx(std_result));
+        }
+    }
+};
+
+TEMPLATE_LIST_TEST_CASE("mathOps", "[math] [operator]", TestAccFunctorTuples)
+{
+    /*
+     * All alpaka::math:: functions are tested here except sincos.
+     * The function will be called with a buffer from the custom Buffer class.
+     * This argument Buffer contains ArgsItems from Defines.hpp and can be
+     * accessed with the overloaded operator().
+     * The args Buffer looks similar to [[0, 1], [2, 3], [4, 5]],
+     * where every sub-list makes one functor-call so the result Buffer would be:
+     * [f(0, 1), f(2, 3), f(4, 5)].
+     * The results are saved in a different Buffer which contains plain data.
+     * The results are then compared to the result of a std:: implementation.
+     * The default result is nan and should fail a test.
+     *
+     * BE AWARE that:
+     * - ALPAKA_CUDA_FAST_MATH should be disabled
+     * - not all casts between float and double can be detected.
+     * - no explicit edge cases are tested, other than 0, maximum and minimum
+     *   - but it is easy to add a new Range:: enum-type with custom edge cases
+     * - some tests may fail if ALPAKA_CUDA_FAST_MATH is turned on
+     * - nan typically fails every test, but could be normal defined behaviour
+     * - inf/-inf typically don't fail a test
+     * - for easy debugging the << operator is overloaded for Buffer objects
+     * - arguments are generated between 0 and 1000
+     *     and the default argument-buffer-extent is 1000
+     * The arguments are generated in DataGen.hpp and can easily be modified.
+     * The arguments depend on the Range:: enum-type specified for each functor.
+     * ----------------------------------------------------------------------
+     * - each functor has an arity and an array of ranges
+     *     - there is one args Buffer and one results Buffer
+     *         - each buffer encapsulates the host/device communication
+     *         - as well as the data access and the initialisation
+     * - all operators are tested independently, one per kernel
+     * - tests the results against the std implementation (Catch2 REQUIRE)
+     *
+     * TestKernel
+     * - uses the alpaka::math:: option from the functor
+     * - uses the device-buffer option from the args
+     *
+     * EXTENSIBILITY:
+     * - Add new operators in Functor.hpp and add them to the ...Functors tuple.
+     * - Add a new Range:: enum-type in Defines.hpp
+     *     - specify a fill-method in DataGen.hpp
+     * - Add a new Arity:: enum-type in Defines.hpp
+     *     - add a matching operator() function in Functor.hpp,
+     *     - add a new ...Functors tuple
+     *     - add the new tuple to the Functors concatenation at the top of this file
+     */
+
+    using Acc = std::tuple_element_t<0u, TestType>;
+    using Functor = std::tuple_element_t<1u, TestType>;
+
+    alpaka::meta::forEachType<DataTypes>(TestTemplate<Acc, Functor>());
+}
diff --git a/thirdParty/cupla/alpaka/test/unit/math/src/sincos.cpp b/thirdParty/cupla/alpaka/test/unit/math/src/sincos.cpp
new file mode 100644
index 0000000000..65da1beb60
--- /dev/null
+++ b/thirdParty/cupla/alpaka/test/unit/math/src/sincos.cpp
@@ -0,0 +1,71 @@
+/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Matthias Werner, René Widera
+ *
+ * This file is part of alpaka.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#include <alpaka/math/sincos/Traits.hpp>
+#include <alpaka/test/KernelExecutionFixture.hpp>
+#include <alpaka/test/acc/TestAccs.hpp>
+#include <alpaka/test/queue/Queue.hpp>
+
+#include <catch2/catch.hpp>
+
+#include <type_traits>
+
+// ULP-based floating-point comparison, see https://en.cppreference.com/w/cpp/types/numeric_limits/epsilon
+template<typename TAcc, typename FP>
+ALPAKA_FN_ACC std::enable_if_t<!std::numeric_limits<FP>::is_integer, bool> almost_equal(
+    TAcc const& acc, // accelerator, forwarded to alpaka::math::abs
+    FP x,
+    FP y,
+    int ulp) // allowed difference, in ULPs
+{
+    // the machine epsilon has to be scaled to the magnitude of the values used
+    // and multiplied by the desired precision in ULPs (units in the last place)
+    return alpaka::math::abs(acc, x - y)
+        <= std::numeric_limits<FP>::epsilon() * alpaka::math::abs(acc, x + y) * static_cast<FP>(ulp)
+        // unless the result is subnormal, i.e. the difference is below the smallest positive normal value
+        || alpaka::math::abs(acc, x - y) < std::numeric_limits<FP>::min();
+}
+
+class SinCosTestKernel // Compares alpaka::math::sincos against separate alpaka::math::sin / cos calls.
+{
+public:
+    //-----------------------------------------------------------------------------
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename TAcc, typename FP>
+    ALPAKA_FN_ACC auto operator()(TAcc const& acc, bool* success, FP const arg) const -> void
+    {
+        // arg is passed in as a kernel argument: if it were hardcoded the compiler
+        // could optimize the computation away (the PTX kernel for float was just empty)
+        FP check_sin = alpaka::math::sin(acc, arg);
+        FP check_cos = alpaka::math::cos(acc, arg);
+        FP result_sin = 0.;
+        FP result_cos = 0.;
+        alpaka::math::sincos(acc, arg, result_sin, result_cos); // expected to fill result_sin / result_cos
+        ALPAKA_CHECK(
+            *success,
+            almost_equal(acc, result_sin, check_sin, 1) && almost_equal(acc, result_cos, check_cos, 1));
+    }
+};
+
+using TestAccs = alpaka::test::EnabledAccs<alpaka::DimInt<1u>, std::size_t>;
+
+//-----------------------------------------------------------------------------
+TEMPLATE_LIST_TEST_CASE("sincos", "[sincos]", TestAccs)
+{
+    using Acc = TestType; // the test case is instantiated once per type in TestAccs
+    using Dim = alpaka::Dim<Acc>;
+    using Idx = alpaka::Idx<Acc>;
+
+    alpaka::test::KernelExecutionFixture<Acc> fixture(alpaka::Vec<Dim, Idx>::ones());
+
+    SinCosTestKernel kernel;
+
+    REQUIRE(fixture(kernel, 0.42f)); // FP deduced as float
+    REQUIRE(fixture(kernel, 0.42)); // FP deduced as double
+}
diff --git a/thirdParty/cupla/alpaka/test/unit/mem/buf/CMakeLists.txt b/thirdParty/cupla/alpaka/test/unit/mem/buf/CMakeLists.txt
index bc7afdf1dc..3a30a099a9 100644
--- a/thirdParty/cupla/alpaka/test/unit/mem/buf/CMakeLists.txt
+++ b/thirdParty/cupla/alpaka/test/unit/mem/buf/CMakeLists.txt
@@ -1,27 +1,25 @@
 #
-# Copyright 2014-2019 Benjamin Worpitz, Axel Huebl
+# Copyright 2014-2020 Benjamin Worpitz, Axel Huebl, Jan Stephan
 #
-# This file is part of Alpaka.
+# This file is part of alpaka.
 #
 # This Source Code Form is subject to the terms of the Mozilla Public
 # License, v. 2.0. If a copy of the MPL was not distributed with this
 # file, You can obtain one at http://mozilla.org/MPL/2.0/.
 #
 
-SET(_TARGET_NAME "memBuf")
+set(_TARGET_NAME "memBuf")
 
 append_recursive_files_add_to_src_group("src/" "src/" "cpp" _FILES_SOURCE)
 
-ALPAKA_ADD_EXECUTABLE(
+alpaka_add_executable(
     ${_TARGET_NAME}
     ${_FILES_SOURCE})
-TARGET_INCLUDE_DIRECTORIES(
-    ${_TARGET_NAME}
-    PRIVATE ${Boost_INCLUDE_DIRS})
-TARGET_LINK_LIBRARIES(
+target_link_libraries(
     ${_TARGET_NAME}
     PRIVATE common)
 
-SET_TARGET_PROPERTIES(${_TARGET_NAME} PROPERTIES FOLDER "test/unit")
+set_target_properties(${_TARGET_NAME} PROPERTIES FOLDER "test/unit")
+target_compile_definitions(${_TARGET_NAME} PRIVATE "-DTEST_UNIT_MEM_BUF")
 
-ADD_TEST(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME} ${_ALPAKA_TEST_OPTIONS})
+add_test(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME} ${_ALPAKA_TEST_OPTIONS})
diff --git a/thirdParty/cupla/alpaka/test/unit/mem/buf/src/BufTest.cpp b/thirdParty/cupla/alpaka/test/unit/mem/buf/src/BufTest.cpp
index a13a80384e..9fb2b80cd6 100644
--- a/thirdParty/cupla/alpaka/test/unit/mem/buf/src/BufTest.cpp
+++ b/thirdParty/cupla/alpaka/test/unit/mem/buf/src/BufTest.cpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Axel Huebl, Benjamin Worpitz
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -8,124 +8,100 @@
  */
 
 #include <alpaka/mem/buf/Traits.hpp>
-
+#include <alpaka/test/Extent.hpp>
 #include <alpaka/test/acc/TestAccs.hpp>
-#include <alpaka/test/queue/Queue.hpp>
 #include <alpaka/test/mem/view/ViewTest.hpp>
-#include <alpaka/test/Extent.hpp>
+#include <alpaka/test/queue/Queue.hpp>
 
 #include <catch2/catch.hpp>
 
-#include <type_traits>
 #include <numeric>
+#include <type_traits>
 
 //-----------------------------------------------------------------------------
-template<
-    typename TAcc>
-static auto testBufferMutable(
-    alpaka::vec::Vec<alpaka::dim::Dim<TAcc>, alpaka::idx::Idx<TAcc>> const & extent)
--> void
+template<typename TAcc>
+static auto testBufferMutable(alpaka::Vec<alpaka::Dim<TAcc>, alpaka::Idx<TAcc>> const& extent) -> void
 {
-    using Dev = alpaka::dev::Dev<TAcc>;
-    using Pltf = alpaka::pltf::Pltf<Dev>;
-    using Queue = alpaka::test::queue::DefaultQueue<Dev>;
+    using Dev = alpaka::Dev<TAcc>;
+    using Pltf = alpaka::Pltf<Dev>;
+    using Queue = alpaka::test::DefaultQueue<Dev>;
 
     using Elem = float;
-    using Dim = alpaka::dim::Dim<TAcc>;
-    using Idx = alpaka::idx::Idx<TAcc>;
+    using Dim = alpaka::Dim<TAcc>;
+    using Idx = alpaka::Idx<TAcc>;
 
-    Dev const dev(alpaka::pltf::getDevByIdx<Pltf>(0u));
+    Dev const dev(alpaka::getDevByIdx<Pltf>(0u));
     Queue queue(dev);
 
     //-----------------------------------------------------------------------------
-    // alpaka::mem::buf::alloc
-    auto buf(alpaka::mem::buf::alloc<Elem, Idx>(dev, extent));
+    // alpaka::allocBuf
+    auto buf(alpaka::allocBuf<Elem, Idx>(dev, extent));
 
     //-----------------------------------------------------------------------------
-    auto const offset(alpaka::vec::Vec<Dim, Idx>::zeros());
-    alpaka::test::mem::view::testViewImmutable<
-        Elem>(
-            buf,
-            dev,
-            extent,
-            offset);
+    auto const offset(alpaka::Vec<Dim, Idx>::zeros());
+    alpaka::test::testViewImmutable<Elem>(buf, dev, extent, offset);
 
     //-----------------------------------------------------------------------------
-    alpaka::test::mem::view::testViewMutable<
-        TAcc>(
-            queue,
-            buf);
+    alpaka::test::testViewMutable<TAcc>(queue, buf);
 }
 
 //-----------------------------------------------------------------------------
-TEMPLATE_LIST_TEST_CASE( "memBufBasicTest", "[memBuf]", alpaka::test::acc::TestAccs)
+TEMPLATE_LIST_TEST_CASE("memBufBasicTest", "[memBuf]", alpaka::test::TestAccs)
 {
     using Acc = TestType;
-    using Dim = alpaka::dim::Dim<Acc>;
-    using Idx = alpaka::idx::Idx<Acc>;
+    using Dim = alpaka::Dim<Acc>;
+    using Idx = alpaka::Idx<Acc>;
 
-    auto const extent(alpaka::vec::createVecFromIndexedFnWorkaround<Dim, Idx, alpaka::test::CreateExtentBufVal>(Idx()));
+    auto const extent(
+        alpaka::createVecFromIndexedFn<Dim, alpaka::test::CreateVecWithIdx<Idx>::template ForExtentBuf>());
 
-    testBufferMutable<
-        Acc>(
-            extent);
+    testBufferMutable<Acc>(extent);
 }
 
 //-----------------------------------------------------------------------------
-TEMPLATE_LIST_TEST_CASE( "memBufZeroSizeTest", "[memBuf]", alpaka::test::acc::TestAccs)
+TEMPLATE_LIST_TEST_CASE("memBufZeroSizeTest", "[memBuf]", alpaka::test::TestAccs)
 {
     using Acc = TestType;
-    using Dim = alpaka::dim::Dim<Acc>;
-    using Idx = alpaka::idx::Idx<Acc>;
+    using Dim = alpaka::Dim<Acc>;
+    using Idx = alpaka::Idx<Acc>;
 
-    auto const extent(alpaka::vec::Vec<Dim, Idx>::zeros());
+    auto const extent(alpaka::Vec<Dim, Idx>::zeros());
 
-    testBufferMutable<
-        Acc>(
-            extent);
+    testBufferMutable<Acc>(extent);
 }
 
 
 //-----------------------------------------------------------------------------
-template<
-    typename TAcc>
-static auto testBufferImmutable(
-    alpaka::vec::Vec<alpaka::dim::Dim<TAcc>, alpaka::idx::Idx<TAcc>> const & extent)
--> void
+template<typename TAcc>
+static auto testBufferImmutable(alpaka::Vec<alpaka::Dim<TAcc>, alpaka::Idx<TAcc>> const& extent) -> void
 {
-    using Dev = alpaka::dev::Dev<TAcc>;
-    using Pltf = alpaka::pltf::Pltf<Dev>;
+    using Dev = alpaka::Dev<TAcc>;
+    using Pltf = alpaka::Pltf<Dev>;
 
     using Elem = float;
-    using Dim = alpaka::dim::Dim<TAcc>;
-    using Idx = alpaka::idx::Idx<TAcc>;
+    using Dim = alpaka::Dim<TAcc>;
+    using Idx = alpaka::Idx<TAcc>;
 
-    Dev const dev(alpaka::pltf::getDevByIdx<Pltf>(0u));
+    Dev const dev(alpaka::getDevByIdx<Pltf>(0u));
 
     //-----------------------------------------------------------------------------
-    // alpaka::mem::buf::alloc
-    auto const buf(alpaka::mem::buf::alloc<Elem, Idx>(dev, extent));
+    // alpaka::allocBuf
+    auto const buf(alpaka::allocBuf<Elem, Idx>(dev, extent));
 
     //-----------------------------------------------------------------------------
-    auto const offset(alpaka::vec::Vec<Dim, Idx>::zeros());
-    alpaka::test::mem::view::testViewImmutable<
-        Elem>(
-            buf,
-            dev,
-            extent,
-            offset);
+    auto const offset(alpaka::Vec<Dim, Idx>::zeros());
+    alpaka::test::testViewImmutable<Elem>(buf, dev, extent, offset);
 }
 
 //-----------------------------------------------------------------------------
-TEMPLATE_LIST_TEST_CASE( "memBufConstTest", "[memBuf]", alpaka::test::acc::TestAccs)
+TEMPLATE_LIST_TEST_CASE("memBufConstTest", "[memBuf]", alpaka::test::TestAccs)
 {
     using Acc = TestType;
-    using Dim = alpaka::dim::Dim<Acc>;
-    using Idx = alpaka::idx::Idx<Acc>;
+    using Dim = alpaka::Dim<Acc>;
+    using Idx = alpaka::Idx<Acc>;
 
-    auto const extent(alpaka::vec::createVecFromIndexedFnWorkaround<Dim, Idx, alpaka::test::CreateExtentBufVal>(Idx()));
+    auto const extent(
+        alpaka::createVecFromIndexedFn<Dim, alpaka::test::CreateVecWithIdx<Idx>::template ForExtentBuf>());
 
-    testBufferImmutable<
-        Acc>(
-            extent);
+    testBufferImmutable<Acc>(extent);
 }
diff --git a/thirdParty/cupla/alpaka/test/unit/mem/copy/CMakeLists.txt b/thirdParty/cupla/alpaka/test/unit/mem/copy/CMakeLists.txt
new file mode 100644
index 0000000000..d033384547
--- /dev/null
+++ b/thirdParty/cupla/alpaka/test/unit/mem/copy/CMakeLists.txt
@@ -0,0 +1,28 @@
+#
+# Copyright 2014-2020 Benjamin Worpitz, Axel Huebl
+#
+# This file is part of Alpaka.
+#
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+#
+
+set(_TARGET_NAME "bufSlicing") # used below as both the executable target name and the test name
+
+append_recursive_files_add_to_src_group("src/" "src/" "cpp" _FILES_SOURCE)
+
+alpaka_add_executable(
+        ${_TARGET_NAME}
+        ${_FILES_SOURCE})
+target_link_libraries(
+        ${_TARGET_NAME}
+        PRIVATE common)
+
+set_target_properties(${_TARGET_NAME} PROPERTIES FOLDER "test/unit")
+
+if (CMAKE_CXX_COMPILER_ID STREQUAL "PGI") # PGI only; presumably raises its pending-instantiation limit - TODO confirm
+    target_compile_options(${_TARGET_NAME} PRIVATE "-Wc,--pending_instantiations=196")
+endif()
+
+add_test(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME} ${_ALPAKA_TEST_OPTIONS})
diff --git a/thirdParty/cupla/alpaka/test/unit/mem/copy/src/BufSlicing.cpp b/thirdParty/cupla/alpaka/test/unit/mem/copy/src/BufSlicing.cpp
new file mode 100644
index 0000000000..9d13fcfead
--- /dev/null
+++ b/thirdParty/cupla/alpaka/test/unit/mem/copy/src/BufSlicing.cpp
@@ -0,0 +1,169 @@
+/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Jakob Krude
+ *
+ * This file is part of Alpaka.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#include <alpaka/test/Extent.hpp>
+#include <alpaka/test/acc/TestAccs.hpp>
+#include <alpaka/test/mem/view/Iterator.hpp>
+
+#include <catch2/catch.hpp>
+
+#if BOOST_COMP_MSVC || defined(BOOST_COMP_MSVC_EMULATED)
+#    pragma warning(push)
+#    pragma warning(disable : 4127) // suppress warning for c++17 conditional expression is constant
+#endif
+
+template<typename TDim, typename TIdx, typename TAcc, typename TData, typename Vec = alpaka::Vec<TDim, TIdx>>
+struct TestContainer // Bundles the devices, a blocking queue and the buffer helpers used by the slicing test.
+{
+    using AccQueueProperty = alpaka::Blocking;
+    using DevQueue = alpaka::Queue<TAcc, AccQueueProperty>;
+    using DevAcc = alpaka::Dev<TAcc>;
+    using PltfAcc = alpaka::Pltf<DevAcc>;
+
+    using DevHost = alpaka::DevCpu;
+    using PltfHost = alpaka::Pltf<DevHost>;
+
+    using BufHost = alpaka::Buf<DevHost, TData, TDim, TIdx>;
+    using BufDevice = alpaka::Buf<DevAcc, TData, TDim, TIdx>;
+
+    using SubView = alpaka::ViewSubView<DevAcc, TData, TDim, TIdx>;
+
+    DevAcc const devAcc;
+    DevHost const devHost;
+    DevQueue devQueue;
+
+
+    // Constructor: takes device 0 of the accelerator and host platforms and creates the queue on devAcc.
+    TestContainer()
+        : devAcc(alpaka::getDevByIdx<PltfAcc>(0u))
+        , devHost(alpaka::getDevByIdx<PltfHost>(0u))
+        , devQueue(devAcc)
+    {
+    }
+
+    // Allocates a host buffer of the given extents; if `indexed`, element i is set to static_cast<TData>(i).
+    auto createHostBuffer(Vec extents, bool indexed) -> BufHost
+    {
+        BufHost bufHost(alpaka::allocBuf<TData, TIdx>(devHost, extents));
+        if(indexed)
+        {
+            TData* const ptr = alpaka::getPtrNative(bufHost);
+            for(TIdx i(0); i < extents.prod(); ++i)
+            {
+                ptr[i] = static_cast<TData>(i);
+            }
+        }
+        return bufHost;
+    }
+
+    // Allocates a buffer of the given extents on the accelerator device.
+    auto createDeviceBuffer(Vec extents) -> BufDevice
+    {
+        BufDevice bufDevice(alpaka::allocBuf<TData, TIdx>(devAcc, extents));
+        return bufDevice;
+    }
+
+    // Copies the given extents from the host buffer into the device buffer via devQueue.
+    auto copyToAcc(BufHost bufHost, BufDevice bufAcc, Vec extents) -> void
+    {
+        alpaka::memcpy(devQueue, bufAcc, bufHost, extents);
+    }
+
+    // Copies the given extents from the device buffer into the host buffer via devQueue.
+    auto copyToHost(BufDevice bufAcc, BufHost bufHost, Vec extents) -> void
+    {
+        alpaka::memcpy(devQueue, bufHost, bufAcc, extents);
+    }
+
+    // Returns a new device buffer holding the sub-view of bufferToBeSliced given by subViewExtents and offsets.
+    auto sliceOnDevice(BufDevice bufferToBeSliced, Vec subViewExtents, Vec offsets) -> BufDevice
+    {
+        BufDevice slicedBuffer = createDeviceBuffer(subViewExtents);
+        // Create a subView with a possible offset.
+        SubView subView = SubView(bufferToBeSliced, subViewExtents, offsets);
+        // Copy the subView into a new buffer.
+        alpaka::memcpy(devQueue, slicedBuffer, subView, subViewExtents);
+        return slicedBuffer;
+    }
+
+    // REQUIREs element-wise Approx equality of the first extents.prod() elements of both host buffers.
+    auto compareBuffer(BufHost const& bufferA, BufHost const& bufferB, Vec const& extents) const
+    {
+        TData const* const ptrA = alpaka::getPtrNative(bufferA);
+        TData const* const ptrB = alpaka::getPtrNative(bufferB);
+        for(TIdx i(0); i < extents.prod(); ++i)
+        {
+            INFO("Dim: " << TDim::value)
+            INFO("Idx: " << typeid(TIdx).name())
+            INFO("Acc: " << alpaka::traits::GetAccName<TAcc>::getAccName())
+            INFO("i: " << i)
+            REQUIRE(ptrA[i] == Approx(ptrB[i]));
+        }
+    }
+};
+
+using DataTypes = std::tuple<int, float, double>;
+
+using TestAccWithDataTypes = alpaka::meta::CartesianProduct<std::tuple, alpaka::test::TestAccs, DataTypes>;
+
+TEMPLATE_LIST_TEST_CASE("memBufSlicingTest", "[memBuf]", TestAccWithDataTypes)
+{
+    using Acc = std::tuple_element_t<0, TestType>;
+    using Data = std::tuple_element_t<1, TestType>;
+    using Dim = alpaka::Dim<Acc>;
+    // The fourth dimension is currently not supposed to be tested.
+    if(Dim::value == 4)
+    {
+        return;
+    }
+    using Idx = alpaka::Idx<Acc>;
+    TestContainer<Dim, Idx, Acc, Data> slicingTest;
+
+    auto const extents(
+        alpaka::createVecFromIndexedFn<Dim, alpaka::test::CreateVecWithIdx<Idx>::template ForExtentBuf>());
+
+    auto const extentsSubView(
+        alpaka::createVecFromIndexedFn<Dim, alpaka::test::CreateVecWithIdx<Idx>::template ForExtentSubView>());
+    auto const offsets(alpaka::createVecFromIndexedFn<Dim, alpaka::test::CreateVecWithIdx<Idx>::template ForOffset>());
+
+    // This is the initial host buffer; element i holds the value i.
+    auto const indexedBuffer = slicingTest.createHostBuffer(extents, true);
+    // This buffer will hold the sliced buffer after it has been copied back to the host.
+    auto resultBuffer = slicingTest.createHostBuffer(extentsSubView, false);
+
+    // Device-side copy of indexedBuffer.
+    auto deviceBuffer = slicingTest.createDeviceBuffer(extents);
+
+    // Start: Main-Test
+    slicingTest.copyToAcc(indexedBuffer, deviceBuffer, extents);
+
+    auto slicedBuffer = slicingTest.sliceOnDevice(deviceBuffer, extentsSubView, offsets);
+
+    slicingTest.copyToHost(slicedBuffer, resultBuffer, extentsSubView);
+
+    auto correctResults = slicingTest.createHostBuffer(extentsSubView, false);
+    Data* ptrNative = alpaka::getPtrNative(correctResults);
+    using Dim1 = alpaka::DimInt<1u>;
+
+    for(Idx i(0); i < extentsSubView.prod(); ++i)
+    {
+        auto mappedToND = alpaka::mapIdx<Dim::value, Dim1::value>(alpaka::Vec<Dim1, Idx>(i), extentsSubView);
+        auto addedOffset = mappedToND + offsets;
+        auto mappedTo1D = alpaka::mapIdx<Dim1::value>(addedOffset,
+                                                      extents)[0]; // take the only element in the vector
+        ptrNative[i] = static_cast<Data>(mappedTo1D);
+    }
+
+    // resultBuffer will be compared with the manually computed results.
+    slicingTest.compareBuffer(resultBuffer, correctResults, extentsSubView);
+}
+
+#if BOOST_COMP_MSVC || defined(BOOST_COMP_MSVC_EMULATED)
+#    pragma warning(pop)
+#endif
diff --git a/thirdParty/cupla/alpaka/test/unit/mem/p2p/CMakeLists.txt b/thirdParty/cupla/alpaka/test/unit/mem/p2p/CMakeLists.txt
index 25a83bc6ec..4685b42b75 100644
--- a/thirdParty/cupla/alpaka/test/unit/mem/p2p/CMakeLists.txt
+++ b/thirdParty/cupla/alpaka/test/unit/mem/p2p/CMakeLists.txt
@@ -1,7 +1,7 @@
 #
-# Copyright 2014-2019 Benjamin Worpitz, Axel Huebl
+# Copyright 2014-2020 Benjamin Worpitz, Axel Huebl, Jan Stephan
 #
-# This file is part of Alpaka.
+# This file is part of alpaka.
 #
 # This Source Code Form is subject to the terms of the Mozilla Public
 # License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -12,16 +12,13 @@ SET(_TARGET_NAME "memP2P")
 
 append_recursive_files_add_to_src_group("src/" "src/" "cpp" _FILES_SOURCE)
 
-ALPAKA_ADD_EXECUTABLE(
+alpaka_add_executable(
     ${_TARGET_NAME}
     ${_FILES_SOURCE})
-TARGET_INCLUDE_DIRECTORIES(
-    ${_TARGET_NAME}
-    PRIVATE ${Boost_INCLUDE_DIRS})
-TARGET_LINK_LIBRARIES(
+target_link_libraries(
     ${_TARGET_NAME}
     PRIVATE common)
 
-SET_TARGET_PROPERTIES(${_TARGET_NAME} PROPERTIES FOLDER "test/unit")
+set_target_properties(${_TARGET_NAME} PROPERTIES FOLDER "test/unit")
 
-ADD_TEST(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME} ${_ALPAKA_TEST_OPTIONS})
+add_test(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME} ${_ALPAKA_TEST_OPTIONS})
diff --git a/thirdParty/cupla/alpaka/test/unit/mem/p2p/src/P2P.cpp b/thirdParty/cupla/alpaka/test/unit/mem/p2p/src/P2P.cpp
index a6bad97c64..476284b055 100644
--- a/thirdParty/cupla/alpaka/test/unit/mem/p2p/src/P2P.cpp
+++ b/thirdParty/cupla/alpaka/test/unit/mem/p2p/src/P2P.cpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Axel Huebl, Benjamin Worpitz, Matthias Werner
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -8,72 +8,68 @@
  */
 
 #include <alpaka/mem/buf/Traits.hpp>
-
+#include <alpaka/meta/ForEachType.hpp>
+#include <alpaka/test/Extent.hpp>
 #include <alpaka/test/acc/TestAccs.hpp>
-#include <alpaka/test/queue/Queue.hpp>
 #include <alpaka/test/mem/view/ViewTest.hpp>
-#include <alpaka/test/Extent.hpp>
-#include <alpaka/meta/ForEachType.hpp>
+#include <alpaka/test/queue/Queue.hpp>
 
 #include <catch2/catch.hpp>
 
-#include <type_traits>
 #include <numeric>
+#include <type_traits>
 
 //-----------------------------------------------------------------------------
-template<
-    typename TAcc>
-static auto testP2P(
-    alpaka::vec::Vec<alpaka::dim::Dim<TAcc>, alpaka::idx::Idx<TAcc>> const & extent)
--> void
+template<typename TAcc>
+static auto testP2P(alpaka::Vec<alpaka::Dim<TAcc>, alpaka::Idx<TAcc>> const& extent) -> void
 {
-    using Dev = alpaka::dev::Dev<TAcc>;
-    using Pltf = alpaka::pltf::Pltf<Dev>;
-    using Queue = alpaka::test::queue::DefaultQueue<Dev>;
+    using Dev = alpaka::Dev<TAcc>;
+    using Pltf = alpaka::Pltf<Dev>;
+    using Queue = alpaka::test::DefaultQueue<Dev>;
 
     using Elem = std::uint32_t;
-    using Idx = alpaka::idx::Idx<TAcc>;
+    using Idx = alpaka::Idx<TAcc>;
 
-    if(alpaka::pltf::getDevCount<Pltf>()<2) {
-      std::cerr << "No two devices found to test peer-to-peer copy." << std::endl;
-      CHECK(true);
-      return;
+    if(alpaka::getDevCount<Pltf>() < 2)
+    {
+        std::cerr << "No two devices found to test peer-to-peer copy." << std::endl;
+        CHECK(true);
+        return;
     }
 
-    Dev const dev0(alpaka::pltf::getDevByIdx<Pltf>(0u));
-    Dev const dev1(alpaka::pltf::getDevByIdx<Pltf>(1u));
+    Dev const dev0(alpaka::getDevByIdx<Pltf>(0u));
+    Dev const dev1(alpaka::getDevByIdx<Pltf>(1u));
     Queue queue0(dev0);
 
     //-----------------------------------------------------------------------------
-    auto buf0(alpaka::mem::buf::alloc<Elem, Idx>(dev0, extent));
-    auto buf1(alpaka::mem::buf::alloc<Elem, Idx>(dev1, extent));
+    auto buf0(alpaka::allocBuf<Elem, Idx>(dev0, extent));
+    auto buf1(alpaka::allocBuf<Elem, Idx>(dev1, extent));
 
     //-----------------------------------------------------------------------------
     std::uint8_t const byte(static_cast<uint8_t>(42u));
-    alpaka::mem::view::set(queue0, buf0, byte, extent);
+    alpaka::memset(queue0, buf0, byte, extent);
 
     //-----------------------------------------------------------------------------
-    alpaka::mem::view::copy(queue0, buf1, buf0, extent);
-    alpaka::wait::wait(queue0);
-    alpaka::test::mem::view::verifyBytesSet<TAcc>(buf1, byte);
+    alpaka::memcpy(queue0, buf1, buf0, extent);
+    alpaka::wait(queue0);
+    alpaka::test::verifyBytesSet<TAcc>(buf1, byte);
 }
 
 //-----------------------------------------------------------------------------
-TEMPLATE_LIST_TEST_CASE( "memP2PTest", "[memP2P]", alpaka::test::acc::TestAccs)
+TEMPLATE_LIST_TEST_CASE("memP2PTest", "[memP2P]", alpaka::test::TestAccs)
 {
-#if defined(ALPAKA_CI) &&                             \
-    BOOST_COMP_GNUC >= BOOST_VERSION_NUMBER(7,2,0) && \
-    BOOST_COMP_GNUC < BOOST_VERSION_NUMBER(8,0,0) && \
-    defined(ALPAKA_ACC_CPU_BT_OMP4_ENABLED)
-    std::cerr << "Currently, memP2P is not working with gcc7.2 / gcc7.3 on Ubuntu14.04 on travis/CI." << std::endl;
+#if defined(ALPAKA_CI) && BOOST_COMP_GNUC >= BOOST_VERSION_NUMBER(7, 2, 0)                                            \
+    && BOOST_COMP_GNUC < BOOST_VERSION_NUMBER(8, 0, 0) && defined(ALPAKA_ACC_ANY_BT_OMP5_ENABLED)
+    std::cerr << "Currently, memP2P is not working with gcc7.2 / gcc7.3 on CI." << std::endl;
     CHECK(true);
 #else
     using Acc = TestType;
-    using Dim = alpaka::dim::Dim<Acc>;
-    using Idx = alpaka::idx::Idx<Acc>;
+    using Dim = alpaka::Dim<Acc>;
+    using Idx = alpaka::Idx<Acc>;
 
-    auto const extent(alpaka::vec::createVecFromIndexedFnWorkaround<Dim, Idx, alpaka::test::CreateExtentBufVal>(Idx()));
+    auto const extent(
+        alpaka::createVecFromIndexedFn<Dim, alpaka::test::CreateVecWithIdx<Idx>::template ForExtentBuf>());
 
-    testP2P<Acc>( extent );
+    testP2P<Acc>(extent);
 #endif
 }
diff --git a/thirdParty/cupla/alpaka/test/unit/mem/view/CMakeLists.txt b/thirdParty/cupla/alpaka/test/unit/mem/view/CMakeLists.txt
index 9f7349e710..1dc7715040 100644
--- a/thirdParty/cupla/alpaka/test/unit/mem/view/CMakeLists.txt
+++ b/thirdParty/cupla/alpaka/test/unit/mem/view/CMakeLists.txt
@@ -1,27 +1,25 @@
 #
-# Copyright 2014-2019 Benjamin Worpitz, Axel Huebl
+# Copyright 2014-2020 Benjamin Worpitz, Axel Huebl, Jan Stephan
 #
-# This file is part of Alpaka.
+# This file is part of alpaka.
 #
 # This Source Code Form is subject to the terms of the Mozilla Public
 # License, v. 2.0. If a copy of the MPL was not distributed with this
 # file, You can obtain one at http://mozilla.org/MPL/2.0/.
 #
 
-SET(_TARGET_NAME "memView")
+set(_TARGET_NAME "memView")
 
 append_recursive_files_add_to_src_group("src/" "src/" "cpp" _FILES_SOURCE)
 
-ALPAKA_ADD_EXECUTABLE(
+alpaka_add_executable(
     ${_TARGET_NAME}
     ${_FILES_SOURCE})
-TARGET_INCLUDE_DIRECTORIES(
-    ${_TARGET_NAME}
-    PRIVATE ${Boost_INCLUDE_DIRS})
-TARGET_LINK_LIBRARIES(
+target_link_libraries(
     ${_TARGET_NAME}
     PRIVATE common)
 
-SET_TARGET_PROPERTIES(${_TARGET_NAME} PROPERTIES FOLDER "test/unit")
+set_target_properties(${_TARGET_NAME} PROPERTIES FOLDER "test/unit")
+target_compile_definitions(${_TARGET_NAME} PRIVATE "-DTEST_UNIT_MEM_VIEW")
 
-ADD_TEST(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME} ${_ALPAKA_TEST_OPTIONS})
+add_test(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME} ${_ALPAKA_TEST_OPTIONS})
diff --git a/thirdParty/cupla/alpaka/test/unit/mem/view/src/ViewPlainPtrTest.cpp b/thirdParty/cupla/alpaka/test/unit/mem/view/src/ViewPlainPtrTest.cpp
index 1f186bc1a8..a0614cd77d 100644
--- a/thirdParty/cupla/alpaka/test/unit/mem/view/src/ViewPlainPtrTest.cpp
+++ b/thirdParty/cupla/alpaka/test/unit/mem/view/src/ViewPlainPtrTest.cpp
@@ -1,206 +1,169 @@
 /* Copyright 2019 Axel Huebl, Benjamin Worpitz, Erik Zenker
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
  */
 
+#include <alpaka/core/BoostPredef.hpp>
 #include <alpaka/mem/view/ViewPlainPtr.hpp>
-
+#include <alpaka/meta/ForEachType.hpp>
+#include <alpaka/test/Extent.hpp>
 #include <alpaka/test/acc/TestAccs.hpp>
-#include <alpaka/test/queue/Queue.hpp>
 #include <alpaka/test/mem/view/ViewTest.hpp>
-#include <alpaka/test/Extent.hpp>
-#include <alpaka/meta/ForEachType.hpp>
-#include <alpaka/core/BoostPredef.hpp>
+#include <alpaka/test/queue/Queue.hpp>
 
 #include <catch2/catch.hpp>
 
-#include <type_traits>
 #include <numeric>
+#include <type_traits>
 
 #if BOOST_COMP_GNUC
-    #pragma GCC diagnostic push
-    #pragma GCC diagnostic ignored "-Wcast-align" // "cast from 'std::uint8_t*' to 'Elem*' increases required alignment of target type"
+#    pragma GCC diagnostic push
+#    pragma GCC diagnostic ignored                                                                                    \
+        "-Wcast-align" // "cast from 'std::uint8_t*' to 'Elem*' increases required alignment of target type"
 #endif
 
 namespace alpaka
 {
-namespace test
-{
-namespace mem
-{
-namespace view
-{
-    //-----------------------------------------------------------------------------
-    template<
-        typename TAcc,
-        typename TDev,
-        typename TElem,
-        typename TDim,
-        typename TIdx>
-    auto testViewPlainPtrImmutable(
-        alpaka::mem::view::ViewPlainPtr<TDev, TElem, TDim, TIdx> const & view,
-        TDev const & dev,
-        alpaka::vec::Vec<TDim, TIdx> const & extentView,
-        alpaka::vec::Vec<TDim, TIdx> const & offsetView)
-    -> void
+    namespace test
     {
         //-----------------------------------------------------------------------------
-        alpaka::test::mem::view::testViewImmutable<
-            TElem>(
-                view,
-                dev,
-                extentView,
-                offsetView);
-    }
-
-    //-----------------------------------------------------------------------------
-    template<
-        typename TAcc,
-        typename TDev,
-        typename TElem,
-        typename TDim,
-        typename TIdx>
-    auto testViewPlainPtrMutable(
-        alpaka::mem::view::ViewPlainPtr<TDev, TElem, TDim, TIdx> & view,
-        TDev const & dev,
-        alpaka::vec::Vec<TDim, TIdx> const & extentView,
-        alpaka::vec::Vec<TDim, TIdx> const & offsetView)
-    -> void
-    {
+        template<typename TAcc, typename TDev, typename TElem, typename TDim, typename TIdx>
+        auto testViewPlainPtrImmutable(
+            alpaka::ViewPlainPtr<TDev, TElem, TDim, TIdx> const& view,
+            TDev const& dev,
+            alpaka::Vec<TDim, TIdx> const& extentView,
+            alpaka::Vec<TDim, TIdx> const& offsetView) -> void
+        {
+            //-----------------------------------------------------------------------------
+            alpaka::test::testViewImmutable<TElem>(view, dev, extentView, offsetView);
+        }
+
         //-----------------------------------------------------------------------------
-        testViewPlainPtrImmutable<
-            TAcc>(
-                view,
-                dev,
-                extentView,
-                offsetView);
-
-        using Queue = alpaka::test::queue::DefaultQueue<TDev>;
-        Queue queue(dev);
+        template<typename TAcc, typename TDev, typename TElem, typename TDim, typename TIdx>
+        auto testViewPlainPtrMutable(
+            alpaka::ViewPlainPtr<TDev, TElem, TDim, TIdx>& view,
+            TDev const& dev,
+            alpaka::Vec<TDim, TIdx> const& extentView,
+            alpaka::Vec<TDim, TIdx> const& offsetView) -> void
+        {
+            //-----------------------------------------------------------------------------
+            testViewPlainPtrImmutable<TAcc>(view, dev, extentView, offsetView);
+
+            using Queue = alpaka::test::DefaultQueue<TDev>;
+            Queue queue(dev);
+            //-----------------------------------------------------------------------------
+            alpaka::test::testViewMutable<TAcc>(queue, view);
+        }
+
         //-----------------------------------------------------------------------------
-        alpaka::test::mem::view::testViewMutable<
-            TAcc>(
-                queue,
-                view);
-    }
-
-    //-----------------------------------------------------------------------------
-    template<
-        typename TAcc,
-        typename TElem>
-    auto testViewPlainPtr()
-    -> void
-    {
-        using Dev = alpaka::dev::Dev<TAcc>;
-        using Pltf = alpaka::pltf::Pltf<Dev>;
-
-        using Dim = alpaka::dim::Dim<TAcc>;
-        using Idx = alpaka::idx::Idx<TAcc>;
-        using View = alpaka::mem::view::ViewPlainPtr<Dev, TElem, Dim, Idx>;
-
-        Dev const dev(alpaka::pltf::getDevByIdx<Pltf>(0u));
-
-        auto const extentBuf(alpaka::vec::createVecFromIndexedFnWorkaround<Dim, Idx, alpaka::test::CreateExtentBufVal>(Idx()));
-        auto buf(alpaka::mem::buf::alloc<TElem, Idx>(dev, extentBuf));
-
-        auto const extentView(extentBuf);
-        auto const offsetView(alpaka::vec::Vec<Dim, Idx>::all(static_cast<Idx>(0)));
-        View view(
-            alpaka::mem::view::getPtrNative(buf),
-            alpaka::dev::getDev(buf),
-            alpaka::extent::getExtentVec(buf),
-            alpaka::mem::view::getPitchBytesVec(buf));
-
-        alpaka::test::mem::view::testViewPlainPtrMutable<TAcc>(view, dev, extentView, offsetView);
-    }
-
-    //-----------------------------------------------------------------------------
-    template<
-        typename TAcc,
-        typename TElem>
-    auto testViewPlainPtrConst()
-    -> void
-    {
-        using Dev = alpaka::dev::Dev<TAcc>;
-        using Pltf = alpaka::pltf::Pltf<Dev>;
-
-        using Dim = alpaka::dim::Dim<TAcc>;
-        using Idx = alpaka::idx::Idx<TAcc>;
-        using View = alpaka::mem::view::ViewPlainPtr<Dev, TElem, Dim, Idx>;
-
-        Dev const dev(alpaka::pltf::getDevByIdx<Pltf>(0u));
-
-        auto const extentBuf(alpaka::vec::createVecFromIndexedFnWorkaround<Dim, Idx, alpaka::test::CreateExtentBufVal>(Idx()));
-        auto buf(alpaka::mem::buf::alloc<TElem, Idx>(dev, extentBuf));
-
-        auto const extentView(extentBuf);
-        auto const offsetView(alpaka::vec::Vec<Dim, Idx>::all(static_cast<Idx>(0)));
-        View const view(
-            alpaka::mem::view::getPtrNative(buf),
-            alpaka::dev::getDev(buf),
-            alpaka::extent::getExtentVec(buf),
-            alpaka::mem::view::getPitchBytesVec(buf));
-
-        alpaka::test::mem::view::testViewPlainPtrImmutable<TAcc>(view, dev, extentView, offsetView);
-    }
-
-    //-----------------------------------------------------------------------------
-    template<
-        typename TAcc,
-        typename TElem>
-    auto testViewPlainPtrOperators()
-    -> void
-    {
-        using Dev = alpaka::dev::Dev<TAcc>;
-        using Pltf = alpaka::pltf::Pltf<Dev>;
+        template<typename TAcc, typename TElem>
+        auto testViewPlainPtr() -> void
+        {
+            using Dev = alpaka::Dev<TAcc>;
+            using Pltf = alpaka::Pltf<Dev>;
 
-        using Dim = alpaka::dim::Dim<TAcc>;
-        using Idx = alpaka::idx::Idx<TAcc>;
-        using View = alpaka::mem::view::ViewPlainPtr<Dev, TElem, Dim, Idx>;
+            using Dim = alpaka::Dim<TAcc>;
+            using Idx = alpaka::Idx<TAcc>;
+            using View = alpaka::ViewPlainPtr<Dev, TElem, Dim, Idx>;
 
-        Dev const dev(alpaka::pltf::getDevByIdx<Pltf>(0u));
+            Dev const dev(alpaka::getDevByIdx<Pltf>(0u));
 
-        auto const extentBuf(alpaka::vec::createVecFromIndexedFnWorkaround<Dim, Idx, alpaka::test::CreateExtentBufVal>(Idx()));
-        auto buf(alpaka::mem::buf::alloc<TElem, Idx>(dev, extentBuf));
+            auto const extentBuf(
+                alpaka::createVecFromIndexedFn<Dim, alpaka::test::CreateVecWithIdx<Idx>::template ForExtentBuf>());
+            auto buf(alpaka::allocBuf<TElem, Idx>(dev, extentBuf));
 
-        View view(
-            alpaka::mem::view::getPtrNative(buf),
-            alpaka::dev::getDev(buf),
-            alpaka::extent::getExtentVec(buf),
-            alpaka::mem::view::getPitchBytesVec(buf));
+            auto const extentView(extentBuf);
+            auto const offsetView(alpaka::Vec<Dim, Idx>::all(static_cast<Idx>(0)));
+            View view(
+                alpaka::getPtrNative(buf),
+                alpaka::getDev(buf),
+                alpaka::extent::getExtentVec(buf),
+                alpaka::getPitchBytesVec(buf));
 
-        // copy-constructor
-        View viewCopy(view);
+            alpaka::test::testViewPlainPtrMutable<TAcc>(view, dev, extentView, offsetView);
+        }
 
-        // move-constructor
-        View viewMove(std::move(viewCopy));
-    }
-}
-}
-}
-}
+        //-----------------------------------------------------------------------------
+        template<typename TAcc, typename TElem>
+        auto testViewPlainPtrConst() -> void
+        {
+            using Dev = alpaka::Dev<TAcc>;
+            using Pltf = alpaka::Pltf<Dev>;
+
+            using Dim = alpaka::Dim<TAcc>;
+            using Idx = alpaka::Idx<TAcc>;
+            using View = alpaka::ViewPlainPtr<Dev, TElem, Dim, Idx>;
+
+            Dev const dev(alpaka::getDevByIdx<Pltf>(0u));
+
+            auto const extentBuf(
+                alpaka::createVecFromIndexedFn<Dim, alpaka::test::CreateVecWithIdx<Idx>::template ForExtentBuf>());
+            auto buf(alpaka::allocBuf<TElem, Idx>(dev, extentBuf));
+
+            auto const extentView(extentBuf);
+            auto const offsetView(alpaka::Vec<Dim, Idx>::all(static_cast<Idx>(0)));
+            View const view(
+                alpaka::getPtrNative(buf),
+                alpaka::getDev(buf),
+                alpaka::extent::getExtentVec(buf),
+                alpaka::getPitchBytesVec(buf));
+
+            alpaka::test::testViewPlainPtrImmutable<TAcc>(view, dev, extentView, offsetView);
+        }
+
+        //-----------------------------------------------------------------------------
+        template<typename TAcc, typename TElem>
+        auto testViewPlainPtrOperators() -> void
+        {
+            using Dev = alpaka::Dev<TAcc>;
+            using Pltf = alpaka::Pltf<Dev>;
+
+            using Dim = alpaka::Dim<TAcc>;
+            using Idx = alpaka::Idx<TAcc>;
+            using View = alpaka::ViewPlainPtr<Dev, TElem, Dim, Idx>;
+
+            Dev const dev(alpaka::getDevByIdx<Pltf>(0u));
+
+            auto const extentBuf(
+                alpaka::createVecFromIndexedFn<Dim, alpaka::test::CreateVecWithIdx<Idx>::template ForExtentBuf>());
+            auto buf(alpaka::allocBuf<TElem, Idx>(dev, extentBuf));
+
+            View view(
+                alpaka::getPtrNative(buf),
+                alpaka::getDev(buf),
+                alpaka::extent::getExtentVec(buf),
+                alpaka::getPitchBytesVec(buf));
+
+            // copy-constructor
+            View viewCopy(view);
+
+            // move-constructor
+            View viewMove(std::move(viewCopy));
+        }
+    } // namespace test
+} // namespace alpaka
 #if BOOST_COMP_GNUC
-    #pragma GCC diagnostic pop
+#    pragma GCC diagnostic pop
 #endif
 
 //-----------------------------------------------------------------------------
-TEMPLATE_LIST_TEST_CASE( "viewPlainPtrTest", "[memView]", alpaka::test::acc::TestAccs)
+TEMPLATE_LIST_TEST_CASE("viewPlainPtrTest", "[memView]", alpaka::test::TestAccs)
 {
-    alpaka::test::mem::view::testViewPlainPtr<TestType, float>();
+    alpaka::test::testViewPlainPtr<TestType, float>();
 }
 
 //-----------------------------------------------------------------------------
-TEMPLATE_LIST_TEST_CASE( "viewPlainPtrConstTest", "[memView]", alpaka::test::acc::TestAccs)
+TEMPLATE_LIST_TEST_CASE("viewPlainPtrConstTest", "[memView]", alpaka::test::TestAccs)
 {
-    alpaka::test::mem::view::testViewPlainPtrConst<TestType, float>();
+    alpaka::test::testViewPlainPtrConst<TestType, float>();
 }
 
 //-----------------------------------------------------------------------------
-TEMPLATE_LIST_TEST_CASE( "viewPlainPtrOperatorTest", "[memView]", alpaka::test::acc::TestAccs)
+TEMPLATE_LIST_TEST_CASE("viewPlainPtrOperatorTest", "[memView]", alpaka::test::TestAccs)
 {
-    alpaka::test::mem::view::testViewPlainPtrOperators<TestType, float>();
+    alpaka::test::testViewPlainPtrOperators<TestType, float>();
 }
diff --git a/thirdParty/cupla/alpaka/test/unit/mem/view/src/ViewStaticAccMem.cpp b/thirdParty/cupla/alpaka/test/unit/mem/view/src/ViewStaticAccMem.cpp
index 5f53ff5b4d..c3646cf61d 100644
--- a/thirdParty/cupla/alpaka/test/unit/mem/view/src/ViewStaticAccMem.cpp
+++ b/thirdParty/cupla/alpaka/test/unit/mem/view/src/ViewStaticAccMem.cpp
@@ -1,24 +1,23 @@
 /* Copyright 2019 Axel Huebl, Benjamin Worpitz, Matthias Werner
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
  */
 
+#include <alpaka/core/BoostPredef.hpp>
 #include <alpaka/core/Common.hpp>
-
-#include <alpaka/test/acc/TestAccs.hpp>
+#include <alpaka/meta/ForEachType.hpp>
 #include <alpaka/test/KernelExecutionFixture.hpp>
+#include <alpaka/test/acc/TestAccs.hpp>
 #include <alpaka/test/queue/Queue.hpp>
-#include <alpaka/meta/ForEachType.hpp>
-#include <alpaka/core/BoostPredef.hpp>
 
 #include <catch2/catch.hpp>
 
 using Elem = std::uint32_t;
-using Dim = alpaka::dim::DimInt<2u>;
+using Dim = alpaka::DimInt<2u>;
 using Idx = std::uint32_t;
 
 // These forward declarations are only necessary when you want to access those variables
@@ -28,12 +27,7 @@ using Idx = std::uint32_t;
 extern ALPAKA_STATIC_ACC_MEM_CONSTANT Elem g_constantMemory2DInitialized[3][2];
 extern ALPAKA_STATIC_ACC_MEM_CONSTANT Elem g_constantMemory2DUninitialized[3][2];
 
-ALPAKA_STATIC_ACC_MEM_CONSTANT Elem g_constantMemory2DInitialized[3][2] =
-    {
-        {0u, 1u},
-        {2u, 3u},
-        {4u, 5u}
-    };
+ALPAKA_STATIC_ACC_MEM_CONSTANT Elem g_constantMemory2DInitialized[3][2] = {{0u, 1u}, {2u, 3u}, {4u, 5u}};
 
 ALPAKA_STATIC_ACC_MEM_CONSTANT Elem g_constantMemory2DUninitialized[3][2];
 
@@ -42,18 +36,11 @@ ALPAKA_STATIC_ACC_MEM_CONSTANT Elem g_constantMemory2DUninitialized[3][2];
 struct StaticDeviceMemoryTestKernel
 {
     ALPAKA_NO_HOST_ACC_WARNING
-    template<
-        typename TAcc,
-        typename TElem>
-    ALPAKA_FN_ACC void operator()(
-        TAcc const & acc,
-        bool * success,
-        TElem const * const pConstantMem) const
+    template<typename TAcc, typename TElem>
+    ALPAKA_FN_ACC void operator()(TAcc const& acc, bool* success, TElem const* const pConstantMem) const
     {
-        auto const gridThreadExtent =
-            alpaka::workdiv::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc);
-        auto const gridThreadIdx =
-            alpaka::idx::getIdx<alpaka::Grid, alpaka::Threads>(acc);
+        auto const gridThreadExtent = alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc);
+        auto const gridThreadIdx = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc);
 
         auto const offset = gridThreadExtent[1u] * gridThreadIdx[0u] + gridThreadIdx[1u];
         auto const val = offset;
@@ -62,61 +49,51 @@ struct StaticDeviceMemoryTestKernel
     }
 };
 
-using TestAccs = alpaka::test::acc::EnabledAccs<Dim, Idx>;
+using TestAccs = alpaka::test::EnabledAccs<Dim, Idx>;
 
 //-----------------------------------------------------------------------------
-TEMPLATE_LIST_TEST_CASE( "staticDeviceMemoryGlobal", "[viewStaticAccMem]", TestAccs)
+TEMPLATE_LIST_TEST_CASE("staticDeviceMemoryGlobal", "[viewStaticAccMem]", TestAccs)
 {
     using Acc = TestType;
-    using DevAcc = alpaka::dev::Dev<Acc>;
-    using PltfAcc = alpaka::pltf::Pltf<DevAcc>;
-    DevAcc devAcc(alpaka::pltf::getDevByIdx<PltfAcc>(0u));
+    using DevAcc = alpaka::Dev<Acc>;
+    using PltfAcc = alpaka::Pltf<DevAcc>;
+    DevAcc devAcc(alpaka::getDevByIdx<PltfAcc>(0u));
 
-    alpaka::vec::Vec<Dim, Idx> const extent(3u, 2u);
+    alpaka::Vec<Dim, Idx> const extent(3u, 2u);
 
     alpaka::test::KernelExecutionFixture<Acc> fixture(extent);
 
     StaticDeviceMemoryTestKernel kernel;
 
     //-----------------------------------------------------------------------------
-    // FIXME: constant memory in HIP(HCC) is still not working
-#if !BOOST_COMP_HCC && !BOOST_COMP_HIP
+    // FIXME: constant memory in HIP is still not working
+#if !BOOST_COMP_HIP
     // initialized static constant device memory
     {
         auto const viewConstantMemInitialized(
-            alpaka::mem::view::createStaticDevMemView(
-                &g_constantMemory2DInitialized[0u][0u],
-                devAcc,
-                extent));
-
-        REQUIRE(fixture(
-            kernel,
-            alpaka::mem::view::getPtrNative(viewConstantMemInitialized)));
+            alpaka::createStaticDevMemView(&g_constantMemory2DInitialized[0u][0u], devAcc, extent));
+
+        REQUIRE(fixture(kernel, alpaka::getPtrNative(viewConstantMemInitialized)));
     }
     //-----------------------------------------------------------------------------
     // uninitialized static constant device memory
     {
-        using PltfHost = alpaka::pltf::PltfCpu;
-        auto devHost(alpaka::pltf::getDevByIdx<PltfHost>(0u));
+        using PltfHost = alpaka::PltfCpu;
+        auto devHost(alpaka::getDevByIdx<PltfHost>(0u));
 
-        using QueueAcc = alpaka::test::queue::DefaultQueue<DevAcc>;
+        using QueueAcc = alpaka::test::DefaultQueue<DevAcc>;
         QueueAcc queueAcc(devAcc);
 
         std::vector<Elem> const data{0u, 1u, 2u, 3u, 4u, 5u};
-        alpaka::mem::view::ViewPlainPtr<decltype(devHost), const Elem, Dim, Idx> bufHost(data.data(), devHost, extent);
+        alpaka::ViewPlainPtr<decltype(devHost), const Elem, Dim, Idx> bufHost(data.data(), devHost, extent);
 
         auto viewConstantMemUninitialized(
-            alpaka::mem::view::createStaticDevMemView(
-                &g_constantMemory2DUninitialized[0u][0u],
-                devAcc,
-                extent));
+            alpaka::createStaticDevMemView(&g_constantMemory2DUninitialized[0u][0u], devAcc, extent));
 
-        alpaka::mem::view::copy(queueAcc, viewConstantMemUninitialized, bufHost, extent);
-        alpaka::wait::wait(queueAcc);
+        alpaka::memcpy(queueAcc, viewConstantMemUninitialized, bufHost, extent);
+        alpaka::wait(queueAcc);
 
-        REQUIRE(fixture(
-            kernel,
-            alpaka::mem::view::getPtrNative(viewConstantMemUninitialized)));
+        REQUIRE(fixture(kernel, alpaka::getPtrNative(viewConstantMemUninitialized)));
     }
 #endif
 }
@@ -128,71 +105,54 @@ TEMPLATE_LIST_TEST_CASE( "staticDeviceMemoryGlobal", "[viewStaticAccMem]", TestA
 extern ALPAKA_STATIC_ACC_MEM_GLOBAL Elem g_globalMemory2DInitialized[3][2];
 extern ALPAKA_STATIC_ACC_MEM_GLOBAL Elem g_globalMemory2DUninitialized[3][2];
 
-ALPAKA_STATIC_ACC_MEM_GLOBAL Elem g_globalMemory2DInitialized[3][2] =
-    {
-        {0u, 1u},
-        {2u, 3u},
-        {4u, 5u}
-    };
+ALPAKA_STATIC_ACC_MEM_GLOBAL Elem g_globalMemory2DInitialized[3][2] = {{0u, 1u}, {2u, 3u}, {4u, 5u}};
 
 ALPAKA_STATIC_ACC_MEM_GLOBAL Elem g_globalMemory2DUninitialized[3][2];
 
 //-----------------------------------------------------------------------------
-TEMPLATE_LIST_TEST_CASE( "staticDeviceMemoryConstant", "[viewStaticAccMem]", TestAccs)
+TEMPLATE_LIST_TEST_CASE("staticDeviceMemoryConstant", "[viewStaticAccMem]", TestAccs)
 {
     using Acc = TestType;
-    using DevAcc = alpaka::dev::Dev<Acc>;
-    using PltfAcc = alpaka::pltf::Pltf<DevAcc>;
-    DevAcc devAcc(alpaka::pltf::getDevByIdx<PltfAcc>(0u));
+    using DevAcc = alpaka::Dev<Acc>;
+    using PltfAcc = alpaka::Pltf<DevAcc>;
+    DevAcc devAcc(alpaka::getDevByIdx<PltfAcc>(0u));
 
-    alpaka::vec::Vec<Dim, Idx> const extent(3u, 2u);
+    alpaka::Vec<Dim, Idx> const extent(3u, 2u);
 
     alpaka::test::KernelExecutionFixture<Acc> fixture(extent);
 
     StaticDeviceMemoryTestKernel kernel;
 
     //-----------------------------------------------------------------------------
-    // FIXME: static device memory in HIP(HCC) is still not working
-#if !BOOST_COMP_HCC && !BOOST_COMP_HIP
+    // FIXME: static device memory in HIP is still not working
+#if !BOOST_COMP_HIP
     // initialized static global device memory
     {
         auto const viewGlobalMemInitialized(
-            alpaka::mem::view::createStaticDevMemView(
-                &g_globalMemory2DInitialized[0u][0u],
-                devAcc,
-                extent));
-
-        REQUIRE(
-            fixture(
-                kernel,
-                alpaka::mem::view::getPtrNative(viewGlobalMemInitialized)));
+            alpaka::createStaticDevMemView(&g_globalMemory2DInitialized[0u][0u], devAcc, extent));
+
+        REQUIRE(fixture(kernel, alpaka::getPtrNative(viewGlobalMemInitialized)));
     }
 
     //-----------------------------------------------------------------------------
     // uninitialized static global device memory
     {
-        using PltfHost = alpaka::pltf::PltfCpu;
-        auto devHost(alpaka::pltf::getDevByIdx<PltfHost>(0u));
+        using PltfHost = alpaka::PltfCpu;
+        auto devHost(alpaka::getDevByIdx<PltfHost>(0u));
 
-        using QueueAcc = alpaka::test::queue::DefaultQueue<DevAcc>;
+        using QueueAcc = alpaka::test::DefaultQueue<DevAcc>;
         QueueAcc queueAcc(devAcc);
 
         std::vector<Elem> const data{0u, 1u, 2u, 3u, 4u, 5u};
-        alpaka::mem::view::ViewPlainPtr<decltype(devHost), const Elem, Dim, Idx> bufHost(data.data(), devHost, extent);
+        alpaka::ViewPlainPtr<decltype(devHost), const Elem, Dim, Idx> bufHost(data.data(), devHost, extent);
 
         auto viewGlobalMemUninitialized(
-            alpaka::mem::view::createStaticDevMemView(
-                &g_globalMemory2DUninitialized[0u][0u],
-                devAcc,
-                extent));
-
-        alpaka::mem::view::copy(queueAcc, viewGlobalMemUninitialized, bufHost, extent);
-        alpaka::wait::wait(queueAcc);
-
-        REQUIRE(
-            fixture(
-                kernel,
-                alpaka::mem::view::getPtrNative(viewGlobalMemUninitialized)));
+            alpaka::createStaticDevMemView(&g_globalMemory2DUninitialized[0u][0u], devAcc, extent));
+
+        alpaka::memcpy(queueAcc, viewGlobalMemUninitialized, bufHost, extent);
+        alpaka::wait(queueAcc);
+
+        REQUIRE(fixture(kernel, alpaka::getPtrNative(viewGlobalMemUninitialized)));
     }
 #endif
 }
diff --git a/thirdParty/cupla/alpaka/test/unit/mem/view/src/ViewSubViewTest.cpp b/thirdParty/cupla/alpaka/test/unit/mem/view/src/ViewSubViewTest.cpp
index ea6a0f4960..140d73ff5e 100644
--- a/thirdParty/cupla/alpaka/test/unit/mem/view/src/ViewSubViewTest.cpp
+++ b/thirdParty/cupla/alpaka/test/unit/mem/view/src/ViewSubViewTest.cpp
@@ -1,230 +1,188 @@
 /* Copyright 2019 Axel Huebl, Benjamin Worpitz, Erik Zenker
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
  */
 
+#include <alpaka/core/BoostPredef.hpp>
 #include <alpaka/mem/view/ViewSubView.hpp>
-
+#include <alpaka/test/Extent.hpp>
 #include <alpaka/test/acc/TestAccs.hpp>
-#include <alpaka/test/queue/Queue.hpp>
 #include <alpaka/test/mem/view/ViewTest.hpp>
-#include <alpaka/test/Extent.hpp>
-
-#include <alpaka/core/BoostPredef.hpp>
+#include <alpaka/test/queue/Queue.hpp>
 
 #include <catch2/catch.hpp>
 
-#include <type_traits>
 #include <numeric>
+#include <type_traits>
 
 #if BOOST_COMP_GNUC
-    #pragma GCC diagnostic push
-    #pragma GCC diagnostic ignored "-Wcast-align" // "cast from 'std::uint8_t*' to 'Elem*' increases required alignment of target type"
+#    pragma GCC diagnostic push
+#    pragma GCC diagnostic ignored                                                                                    \
+        "-Wcast-align" // "cast from 'std::uint8_t*' to 'Elem*' increases required alignment of target type"
 #endif
 
 namespace alpaka
 {
-namespace test
-{
-namespace mem
-{
-namespace view
-{
-    //-----------------------------------------------------------------------------
-    template<
-        typename TAcc,
-        typename TDev,
-        typename TElem,
-        typename TDim,
-        typename TIdx,
-        typename TBuf>
-    auto testViewSubViewImmutable(
-        alpaka::mem::view::ViewSubView<TDev, TElem, TDim, TIdx> const & view,
-        TBuf & buf,
-        TDev const & dev,
-        alpaka::vec::Vec<TDim, TIdx> const & extentView,
-        alpaka::vec::Vec<TDim, TIdx> const & offsetView)
-    -> void
+    namespace test
     {
         //-----------------------------------------------------------------------------
-        alpaka::test::mem::view::testViewImmutable<
-            TElem>(
-                view,
-                dev,
-                extentView,
-                offsetView);
-
-        //-----------------------------------------------------------------------------
-        // alpaka::mem::view::traits::GetPitchBytes
-        // The pitch of the view has to be identical to the pitch of the underlying buffer in all dimensions.
+        template<typename TAcc, typename TDev, typename TElem, typename TDim, typename TIdx, typename TBuf>
+        auto testViewSubViewImmutable(
+            alpaka::ViewSubView<TDev, TElem, TDim, TIdx> const& view,
+            TBuf& buf,
+            TDev const& dev,
+            alpaka::Vec<TDim, TIdx> const& extentView,
+            alpaka::Vec<TDim, TIdx> const& offsetView) -> void
         {
-            auto const pitchBuf(alpaka::mem::view::getPitchBytesVec(buf));
-            auto const pitchView(alpaka::mem::view::getPitchBytesVec(view));
+            //-----------------------------------------------------------------------------
+            alpaka::test::testViewImmutable<TElem>(view, dev, extentView, offsetView);
 
-            for(TIdx i = TDim::value; i > static_cast<TIdx>(0u); --i)
+            //-----------------------------------------------------------------------------
+            // alpaka::traits::GetPitchBytes
+            // The pitch of the view has to be identical to the pitch of the underlying buffer in all dimensions.
             {
-                REQUIRE(
-                    pitchBuf[i-static_cast<TIdx>(1u)] ==
-                    pitchView[i-static_cast<TIdx>(1u)]);
+                auto const pitchBuf(alpaka::getPitchBytesVec(buf));
+                auto const pitchView(alpaka::getPitchBytesVec(view));
+
+                for(TIdx i = TDim::value; i > static_cast<TIdx>(0u); --i)
+                {
+                    REQUIRE(pitchBuf[i - static_cast<TIdx>(1u)] == pitchView[i - static_cast<TIdx>(1u)]);
+                }
             }
-        }
 
-        //-----------------------------------------------------------------------------
-        // alpaka::mem::view::traits::GetPtrNative
-        // The native pointer has to be exactly the value we calculate here.
-        {
-            auto viewPtrNative(
-                reinterpret_cast<std::uint8_t *>(
-                    alpaka::mem::view::getPtrNative(buf)));
-            auto const pitchBuf(alpaka::mem::view::getPitchBytesVec(buf));
-            for(TIdx i = TDim::value; i > static_cast<TIdx>(0u); --i)
+            //-----------------------------------------------------------------------------
+            // alpaka::traits::GetPtrNative
+            // The native pointer has to be exactly the value we calculate here.
             {
-                auto const pitch = (i < static_cast<TIdx>(TDim::value)) ? pitchBuf[i] : static_cast<TIdx>(sizeof(TElem));
-                viewPtrNative += offsetView[i - static_cast<TIdx>(1u)] * pitch;
+                auto viewPtrNative(reinterpret_cast<std::uint8_t*>(alpaka::getPtrNative(buf)));
+                auto const pitchBuf(alpaka::getPitchBytesVec(buf));
+                for(TIdx i = TDim::value; i > static_cast<TIdx>(0u); --i)
+                {
+                    auto const pitch
+                        = (i < static_cast<TIdx>(TDim::value)) ? pitchBuf[i] : static_cast<TIdx>(sizeof(TElem));
+                    viewPtrNative += offsetView[i - static_cast<TIdx>(1u)] * pitch;
+                }
+                REQUIRE(reinterpret_cast<TElem*>(viewPtrNative) == alpaka::getPtrNative(view));
             }
-            REQUIRE(
-                reinterpret_cast<TElem *>(viewPtrNative) ==
-                alpaka::mem::view::getPtrNative(view));
         }
-    }
-
-    //-----------------------------------------------------------------------------
-    template<
-        typename TAcc,
-        typename TDev,
-        typename TElem,
-        typename TDim,
-        typename TIdx,
-        typename TBuf>
-    auto testViewSubViewMutable(
-        alpaka::mem::view::ViewSubView<TDev, TElem, TDim, TIdx> & view,
-        TBuf & buf,
-        TDev const & dev,
-        alpaka::vec::Vec<TDim, TIdx> const & extentView,
-        alpaka::vec::Vec<TDim, TIdx> const & offsetView)
-    -> void
-    {
+
         //-----------------------------------------------------------------------------
-        testViewSubViewImmutable<
-            TAcc>(
-                view,
-                buf,
-                dev,
-                extentView,
-                offsetView);
-
-        using Queue = alpaka::test::queue::DefaultQueue<TDev>;
-        Queue queue(dev);
+        template<typename TAcc, typename TDev, typename TElem, typename TDim, typename TIdx, typename TBuf>
+        auto testViewSubViewMutable(
+            alpaka::ViewSubView<TDev, TElem, TDim, TIdx>& view,
+            TBuf& buf,
+            TDev const& dev,
+            alpaka::Vec<TDim, TIdx> const& extentView,
+            alpaka::Vec<TDim, TIdx> const& offsetView) -> void
+        {
+            //-----------------------------------------------------------------------------
+            testViewSubViewImmutable<TAcc>(view, buf, dev, extentView, offsetView);
+
+            using Queue = alpaka::test::DefaultQueue<TDev>;
+            Queue queue(dev);
+            //-----------------------------------------------------------------------------
+            alpaka::test::testViewMutable<TAcc>(queue, view);
+        }
+
         //-----------------------------------------------------------------------------
-        alpaka::test::mem::view::testViewMutable<
-            TAcc>(
-                queue,
-                view);
-    }
-
-    //-----------------------------------------------------------------------------
-    template<
-        typename TAcc,
-        typename TElem>
-    auto testViewSubViewNoOffset()
-    -> void
-    {
-        using Dev = alpaka::dev::Dev<TAcc>;
-        using Pltf = alpaka::pltf::Pltf<Dev>;
+        template<typename TAcc, typename TElem>
+        auto testViewSubViewNoOffset() -> void
+        {
+            using Dev = alpaka::Dev<TAcc>;
+            using Pltf = alpaka::Pltf<Dev>;
 
-        using Dim = alpaka::dim::Dim<TAcc>;
-        using Idx = alpaka::idx::Idx<TAcc>;
-        using View = alpaka::mem::view::ViewSubView<Dev, TElem, Dim, Idx>;
+            using Dim = alpaka::Dim<TAcc>;
+            using Idx = alpaka::Idx<TAcc>;
+            using View = alpaka::ViewSubView<Dev, TElem, Dim, Idx>;
 
-        Dev const dev(alpaka::pltf::getDevByIdx<Pltf>(0u));
+            Dev const dev(alpaka::getDevByIdx<Pltf>(0u));
 
-        auto const extentBuf(alpaka::vec::createVecFromIndexedFnWorkaround<Dim, Idx, alpaka::test::CreateExtentBufVal>(Idx()));
-        auto buf(alpaka::mem::buf::alloc<TElem, Idx>(dev, extentBuf));
+            auto const extentBuf(
+                alpaka::createVecFromIndexedFn<Dim, alpaka::test::CreateVecWithIdx<Idx>::template ForExtentBuf>());
+            auto buf(alpaka::allocBuf<TElem, Idx>(dev, extentBuf));
 
-        auto const extentView(extentBuf);
-        auto const offsetView(alpaka::vec::Vec<Dim, Idx>::all(static_cast<Idx>(0)));
-        View view(buf);
+            auto const extentView(extentBuf);
+            auto const offsetView(alpaka::Vec<Dim, Idx>::all(static_cast<Idx>(0)));
+            View view(buf);
 
-        alpaka::test::mem::view::testViewSubViewMutable<TAcc>(view, buf, dev, extentView, offsetView);
-    }
+            alpaka::test::testViewSubViewMutable<TAcc>(view, buf, dev, extentView, offsetView);
+        }
 
-    //-----------------------------------------------------------------------------
-    template<
-        typename TAcc,
-        typename TElem>
-    auto testViewSubViewOffset()
-    -> void
-    {
-        using Dev = alpaka::dev::Dev<TAcc>;
-        using Pltf = alpaka::pltf::Pltf<Dev>;
+        //-----------------------------------------------------------------------------
+        template<typename TAcc, typename TElem>
+        auto testViewSubViewOffset() -> void
+        {
+            using Dev = alpaka::Dev<TAcc>;
+            using Pltf = alpaka::Pltf<Dev>;
 
-        using Dim = alpaka::dim::Dim<TAcc>;
-        using Idx = alpaka::idx::Idx<TAcc>;
-        using View = alpaka::mem::view::ViewSubView<Dev, TElem, Dim, Idx>;
+            using Dim = alpaka::Dim<TAcc>;
+            using Idx = alpaka::Idx<TAcc>;
+            using View = alpaka::ViewSubView<Dev, TElem, Dim, Idx>;
 
-        Dev const dev(alpaka::pltf::getDevByIdx<Pltf>(0u));
+            Dev const dev(alpaka::getDevByIdx<Pltf>(0u));
 
-        auto const extentBuf(alpaka::vec::createVecFromIndexedFnWorkaround<Dim, Idx, alpaka::test::CreateExtentBufVal>(Idx()));
-        auto buf(alpaka::mem::buf::alloc<TElem, Idx>(dev, extentBuf));
+            auto const extentBuf(
+                alpaka::createVecFromIndexedFn<Dim, alpaka::test::CreateVecWithIdx<Idx>::template ForExtentBuf>());
+            auto buf(alpaka::allocBuf<TElem, Idx>(dev, extentBuf));
 
-        auto const extentView(alpaka::vec::createVecFromIndexedFnWorkaround<Dim, Idx, alpaka::test::CreateExtentViewVal>(Idx()));
-        auto const offsetView(alpaka::vec::Vec<Dim, Idx>::all(static_cast<Idx>(1)));
-        View view(buf, extentView, offsetView);
+            auto const extentView(
+                alpaka::createVecFromIndexedFn<Dim, alpaka::test::CreateVecWithIdx<Idx>::template ForExtentSubView>());
+            auto const offsetView(
+                alpaka::createVecFromIndexedFn<Dim, alpaka::test::CreateVecWithIdx<Idx>::template ForOffset>());
+            View view(buf, extentView, offsetView);
 
-        alpaka::test::mem::view::testViewSubViewMutable<TAcc>(view, buf, dev, extentView, offsetView);
-    }
+            alpaka::test::testViewSubViewMutable<TAcc>(view, buf, dev, extentView, offsetView);
+        }
 
-    //-----------------------------------------------------------------------------
-    template<
-        typename TAcc,
-        typename TElem>
-    auto testViewSubViewOffsetConst()
-    -> void
-    {
-        using Dev = alpaka::dev::Dev<TAcc>;
-        using Pltf = alpaka::pltf::Pltf<Dev>;
+        //-----------------------------------------------------------------------------
+        template<typename TAcc, typename TElem>
+        auto testViewSubViewOffsetConst() -> void
+        {
+            using Dev = alpaka::Dev<TAcc>;
+            using Pltf = alpaka::Pltf<Dev>;
 
-        using Dim = alpaka::dim::Dim<TAcc>;
-        using Idx = alpaka::idx::Idx<TAcc>;
-        using View = alpaka::mem::view::ViewSubView<Dev, TElem, Dim, Idx>;
+            using Dim = alpaka::Dim<TAcc>;
+            using Idx = alpaka::Idx<TAcc>;
+            using View = alpaka::ViewSubView<Dev, TElem, Dim, Idx>;
 
-        Dev const dev(alpaka::pltf::getDevByIdx<Pltf>(0u));
+            Dev const dev(alpaka::getDevByIdx<Pltf>(0u));
 
-        auto const extentBuf(alpaka::vec::createVecFromIndexedFnWorkaround<Dim, Idx, CreateExtentBufVal>(Idx()));
-        auto buf(alpaka::mem::buf::alloc<TElem, Idx>(dev, extentBuf));
+            auto const extentBuf(
+                alpaka::createVecFromIndexedFn<Dim, alpaka::test::CreateVecWithIdx<Idx>::template ForExtentBuf>());
+            auto buf(alpaka::allocBuf<TElem, Idx>(dev, extentBuf));
 
-        auto const extentView(alpaka::vec::createVecFromIndexedFnWorkaround<Dim, Idx, CreateExtentViewVal>(Idx()));
-        auto const offsetView(alpaka::vec::Vec<Dim, Idx>::all(static_cast<Idx>(1)));
-        View const view(buf, extentView, offsetView);
+            auto const extentView(
+                alpaka::createVecFromIndexedFn<Dim, alpaka::test::CreateVecWithIdx<Idx>::template ForExtentSubView>());
+            auto const offsetView(
+                alpaka::createVecFromIndexedFn<Dim, alpaka::test::CreateVecWithIdx<Idx>::template ForOffset>());
+            View const view(buf, extentView, offsetView);
 
-        alpaka::test::mem::view::testViewSubViewImmutable<TAcc>(view, buf, dev, extentView, offsetView);
-    }
-}
-}
-}
-}
+            alpaka::test::testViewSubViewImmutable<TAcc>(view, buf, dev, extentView, offsetView);
+        }
+    } // namespace test
+} // namespace alpaka
 #if BOOST_COMP_GNUC
-    #pragma GCC diagnostic pop
+#    pragma GCC diagnostic pop
 #endif
 
 //-----------------------------------------------------------------------------
-TEMPLATE_LIST_TEST_CASE( "viewSubViewNoOffsetTest", "[memView]", alpaka::test::acc::TestAccs)
+TEMPLATE_LIST_TEST_CASE("viewSubViewNoOffsetTest", "[memView]", alpaka::test::TestAccs)
 {
-    alpaka::test::mem::view::testViewSubViewNoOffset<TestType, float>();
+    alpaka::test::testViewSubViewNoOffset<TestType, float>();
 }
 
 //-----------------------------------------------------------------------------
-TEMPLATE_LIST_TEST_CASE( "viewSubViewOffsetTest", "[memView]", alpaka::test::acc::TestAccs)
+TEMPLATE_LIST_TEST_CASE("viewSubViewOffsetTest", "[memView]", alpaka::test::TestAccs)
 {
-    alpaka::test::mem::view::testViewSubViewOffset<TestType, float>();
+    alpaka::test::testViewSubViewOffset<TestType, float>();
 }
 
 //-----------------------------------------------------------------------------
-TEMPLATE_LIST_TEST_CASE( "viewSubViewOffsetConstTest", "[memView]", alpaka::test::acc::TestAccs)
+TEMPLATE_LIST_TEST_CASE("viewSubViewOffsetConstTest", "[memView]", alpaka::test::TestAccs)
 {
-    alpaka::test::mem::view::testViewSubViewOffsetConst<TestType, float>();
+    alpaka::test::testViewSubViewOffsetConst<TestType, float>();
 }
diff --git a/thirdParty/cupla/alpaka/test/unit/meta/CMakeLists.txt b/thirdParty/cupla/alpaka/test/unit/meta/CMakeLists.txt
index 1c4b5af7cc..87382e861b 100644
--- a/thirdParty/cupla/alpaka/test/unit/meta/CMakeLists.txt
+++ b/thirdParty/cupla/alpaka/test/unit/meta/CMakeLists.txt
@@ -1,27 +1,24 @@
 #
-# Copyright 2014-2019 Benjamin Worpitz, Axel Huebl
+# Copyright 2014-2020 Benjamin Worpitz, Axel Huebl, Jan Stephan
 #
-# This file is part of Alpaka.
+# This file is part of alpaka.
 #
 # This Source Code Form is subject to the terms of the Mozilla Public
 # License, v. 2.0. If a copy of the MPL was not distributed with this
 # file, You can obtain one at http://mozilla.org/MPL/2.0/.
 #
 
-SET(_TARGET_NAME "meta")
+set(_TARGET_NAME "meta")
 
 append_recursive_files_add_to_src_group("src/" "src/" "cpp" _FILES_SOURCE)
 
-ALPAKA_ADD_EXECUTABLE(
+alpaka_add_executable(
     ${_TARGET_NAME}
     ${_FILES_SOURCE})
-TARGET_INCLUDE_DIRECTORIES(
-    ${_TARGET_NAME}
-    PRIVATE ${Boost_INCLUDE_DIRS})
-TARGET_LINK_LIBRARIES(
+target_link_libraries(
     ${_TARGET_NAME}
     PRIVATE common)
 
-SET_TARGET_PROPERTIES(${_TARGET_NAME} PROPERTIES FOLDER "test/unit")
+set_target_properties(${_TARGET_NAME} PROPERTIES FOLDER "test/unit")
 
-ADD_TEST(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME} ${_ALPAKA_TEST_OPTIONS})
+add_test(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME} ${_ALPAKA_TEST_OPTIONS})
diff --git a/thirdParty/cupla/alpaka/test/unit/meta/src/ApplyTest.cpp b/thirdParty/cupla/alpaka/test/unit/meta/src/ApplyTest.cpp
index 7da938ec7c..703c969e2a 100644
--- a/thirdParty/cupla/alpaka/test/unit/meta/src/ApplyTest.cpp
+++ b/thirdParty/cupla/alpaka/test/unit/meta/src/ApplyTest.cpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Axel Huebl, Benjamin Worpitz
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -14,36 +14,19 @@
 #include <tuple>
 #include <type_traits>
 
-template<
-    typename... T>
+template<typename... T>
 struct TypeList
-{};
+{
+};
 
 //-----------------------------------------------------------------------------
 TEST_CASE("apply", "[meta]")
 {
-    using ApplyInput =
-        std::tuple<
-            int,
-            float,
-            long>;
-
-    using ApplyResult =
-        alpaka::meta::Apply<
-            ApplyInput,
-            TypeList
-        >;
-
-    using ApplyReference =
-        TypeList<
-            int,
-            float,
-            long>;
-
-    static_assert(
-        std::is_same<
-            ApplyReference,
-            ApplyResult
-        >::value,
-        "alpaka::meta::Apply failed!");
+    using ApplyInput = std::tuple<int, float, long>;
+
+    using ApplyResult = alpaka::meta::Apply<ApplyInput, TypeList>;
+
+    using ApplyReference = TypeList<int, float, long>;
+
+    static_assert(std::is_same<ApplyReference, ApplyResult>::value, "alpaka::meta::Apply failed!");
 }
diff --git a/thirdParty/cupla/alpaka/test/unit/meta/src/ApplyTupleTest.cpp b/thirdParty/cupla/alpaka/test/unit/meta/src/ApplyTupleTest.cpp
index 4670fe85c4..738ed3d021 100644
--- a/thirdParty/cupla/alpaka/test/unit/meta/src/ApplyTupleTest.cpp
+++ b/thirdParty/cupla/alpaka/test/unit/meta/src/ApplyTupleTest.cpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Axel Huebl, Benjamin Worpitz
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -12,9 +12,15 @@
 #include <catch2/catch.hpp>
 
 //#############################################################################
-struct Foo {
-    Foo(int num) : num_(num) {}
-    auto add(int i) const -> int { return num_ + i; }
+struct Foo
+{
+    Foo(int num) : num_(num)
+    {
+    }
+    auto add(int i) const -> int
+    {
+        return num_ + i;
+    }
     int num_;
 };
 
@@ -26,7 +32,8 @@ auto abs_num(int i) -> int
 }
 
 //#############################################################################
-struct AbsNum {
+struct AbsNum
+{
     auto operator()(int i) const -> int
     {
         return std::abs(i);
diff --git a/thirdParty/cupla/alpaka/test/unit/meta/src/CartesianProductTest.cpp b/thirdParty/cupla/alpaka/test/unit/meta/src/CartesianProductTest.cpp
index 1ea9a9081c..06b56c3cf2 100644
--- a/thirdParty/cupla/alpaka/test/unit/meta/src/CartesianProductTest.cpp
+++ b/thirdParty/cupla/alpaka/test/unit/meta/src/CartesianProductTest.cpp
@@ -1,15 +1,14 @@
 /* Copyright 2019 Axel Huebl, Benjamin Worpitz
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
  */
 
-#include <alpaka/meta/CartesianProduct.hpp>
-
 #include <alpaka/dim/DimIntegralConst.hpp>
+#include <alpaka/meta/CartesianProduct.hpp>
 
 #include <catch2/catch.hpp>
 
@@ -19,37 +18,21 @@
 //-----------------------------------------------------------------------------
 TEST_CASE("cartesianProduct", "[meta]")
 {
-    using TestDims =
-        std::tuple<
-            alpaka::dim::DimInt<1u>,
-            alpaka::dim::DimInt<2u>,
-            alpaka::dim::DimInt<3u>>;
-
-    using TestIdxs =
-        std::tuple<
-            std::size_t,
-            std::int64_t>;
-
-    using CartesianProductResult =
-        alpaka::meta::CartesianProduct<
-            std::tuple,
-            TestDims,
-            TestIdxs
-        >;
-
-    using CartesianProductReference =
-        std::tuple<
-            std::tuple<alpaka::dim::DimInt<1u>, std::size_t>,
-            std::tuple<alpaka::dim::DimInt<2u>, std::size_t>,
-            std::tuple<alpaka::dim::DimInt<3u>, std::size_t>,
-            std::tuple<alpaka::dim::DimInt<1u>, std::int64_t>,
-            std::tuple<alpaka::dim::DimInt<2u>, std::int64_t>,
-            std::tuple<alpaka::dim::DimInt<3u>, std::int64_t>>;
+    using TestDims = std::tuple<alpaka::DimInt<1u>, alpaka::DimInt<2u>, alpaka::DimInt<3u>>;
+
+    using TestIdxs = std::tuple<std::size_t, std::int64_t>;
+
+    using CartesianProductResult = alpaka::meta::CartesianProduct<std::tuple, TestDims, TestIdxs>;
+
+    using CartesianProductReference = std::tuple<
+        std::tuple<alpaka::DimInt<1u>, std::size_t>,
+        std::tuple<alpaka::DimInt<2u>, std::size_t>,
+        std::tuple<alpaka::DimInt<3u>, std::size_t>,
+        std::tuple<alpaka::DimInt<1u>, std::int64_t>,
+        std::tuple<alpaka::DimInt<2u>, std::int64_t>,
+        std::tuple<alpaka::DimInt<3u>, std::int64_t>>;
 
     static_assert(
-        std::is_same<
-            CartesianProductReference,
-            CartesianProductResult
-        >::value,
+        std::is_same<CartesianProductReference, CartesianProductResult>::value,
         "alpaka::meta::CartesianProduct failed!");
 }
diff --git a/thirdParty/cupla/alpaka/test/unit/meta/src/ConcatenateTest.cpp b/thirdParty/cupla/alpaka/test/unit/meta/src/ConcatenateTest.cpp
index 2376e7217c..458cf03a2f 100644
--- a/thirdParty/cupla/alpaka/test/unit/meta/src/ConcatenateTest.cpp
+++ b/thirdParty/cupla/alpaka/test/unit/meta/src/ConcatenateTest.cpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Axel Huebl, Benjamin Worpitz
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -18,35 +18,13 @@
 //-----------------------------------------------------------------------------
 TEST_CASE("concatenate", "[meta]")
 {
-    using TestTuple1 =
-        std::tuple<
-            float,
-            int,
-            std::tuple<double, unsigned long>>;
-
-    using TestTuple2 =
-        std::tuple<
-            bool,
-            std::string>;
-
-    using ConcatenateResult =
-        alpaka::meta::Concatenate<
-            TestTuple1,
-            TestTuple2
-        >;
-
-    using ConcatenateReference =
-        std::tuple<
-            float,
-            int,
-            std::tuple<double, unsigned long>,
-            bool,
-            std::string>;
-
-    static_assert(
-        std::is_same<
-            ConcatenateReference,
-            ConcatenateResult
-        >::value,
-        "alpaka::meta::Concatenate failed!");
+    using TestTuple1 = std::tuple<float, int, std::tuple<double, unsigned long>>;
+
+    using TestTuple2 = std::tuple<bool, std::string>;
+
+    using ConcatenateResult = alpaka::meta::Concatenate<TestTuple1, TestTuple2>;
+
+    using ConcatenateReference = std::tuple<float, int, std::tuple<double, unsigned long>, bool, std::string>;
+
+    static_assert(std::is_same<ConcatenateReference, ConcatenateResult>::value, "alpaka::meta::Concatenate failed!");
 }
diff --git a/thirdParty/cupla/alpaka/test/unit/meta/src/FilterTest.cpp b/thirdParty/cupla/alpaka/test/unit/meta/src/FilterTest.cpp
index 180d5a333e..6d601e1684 100644
--- a/thirdParty/cupla/alpaka/test/unit/meta/src/FilterTest.cpp
+++ b/thirdParty/cupla/alpaka/test/unit/meta/src/FilterTest.cpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Axel Huebl, Benjamin Worpitz
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -17,27 +17,11 @@
 //-----------------------------------------------------------------------------
 TEST_CASE("filter", "[meta]")
 {
-    using FilterInput =
-        std::tuple<
-            int,
-            float,
-            long>;
+    using FilterInput = std::tuple<int, float, long>;
 
-    using FilterResult =
-        alpaka::meta::Filter<
-            FilterInput,
-            std::is_integral
-        >;
+    using FilterResult = alpaka::meta::Filter<FilterInput, std::is_integral>;
 
-    using FilterReference =
-        std::tuple<
-            int,
-            long>;
+    using FilterReference = std::tuple<int, long>;
 
-    static_assert(
-        std::is_same<
-            FilterReference,
-            FilterResult
-        >::value,
-        "alpaka::meta::Filter failed!");
+    static_assert(std::is_same<FilterReference, FilterResult>::value, "alpaka::meta::Filter failed!");
 }
diff --git a/thirdParty/cupla/alpaka/test/unit/meta/src/IntegralTest.cpp b/thirdParty/cupla/alpaka/test/unit/meta/src/IntegralTest.cpp
index 41e2b8f2ca..87b7b22b1b 100644
--- a/thirdParty/cupla/alpaka/test/unit/meta/src/IntegralTest.cpp
+++ b/thirdParty/cupla/alpaka/test/unit/meta/src/IntegralTest.cpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Axel Huebl, Benjamin Worpitz
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
diff --git a/thirdParty/cupla/alpaka/test/unit/meta/src/IsStrictBaseTest.cpp b/thirdParty/cupla/alpaka/test/unit/meta/src/IsStrictBaseTest.cpp
index 350353200e..a85e69a8ce 100644
--- a/thirdParty/cupla/alpaka/test/unit/meta/src/IsStrictBaseTest.cpp
+++ b/thirdParty/cupla/alpaka/test/unit/meta/src/IsStrictBaseTest.cpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Axel Huebl, Benjamin Worpitz
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -14,70 +14,52 @@
 #include <tuple>
 #include <type_traits>
 
-class A {};
-class B : A {};
-class C {};
+class A
+{
+};
+class B : A
+{
+};
+class C
+{
+};
 
 //-----------------------------------------------------------------------------
 TEST_CASE("isStrictBaseTrue", "[meta]")
 {
-    constexpr bool IsStrictBaseResult =
-        alpaka::meta::IsStrictBase<
-            A, B
-        >::value;
+    constexpr bool IsStrictBaseResult = alpaka::meta::IsStrictBase<A, B>::value;
 
-    constexpr bool IsStrictBaseReference =
-        true;
+    constexpr bool IsStrictBaseReference = true;
 
-    static_assert(
-        IsStrictBaseReference == IsStrictBaseResult,
-        "alpaka::meta::IsStrictBase failed!");
+    static_assert(IsStrictBaseReference == IsStrictBaseResult, "alpaka::meta::IsStrictBase failed!");
 }
 
 //-----------------------------------------------------------------------------
 TEST_CASE("isStrictBaseIdentity", "[meta]")
 {
-    constexpr bool IsStrictBaseResult =
-        alpaka::meta::IsStrictBase<
-            A, A
-        >::value;
+    constexpr bool IsStrictBaseResult = alpaka::meta::IsStrictBase<A, A>::value;
 
-    constexpr bool IsStrictBaseReference =
-        false;
+    constexpr bool IsStrictBaseReference = false;
 
-    static_assert(
-        IsStrictBaseReference == IsStrictBaseResult,
-        "alpaka::meta::IsStrictBase failed!");
+    static_assert(IsStrictBaseReference == IsStrictBaseResult, "alpaka::meta::IsStrictBase failed!");
 }
 
 //-----------------------------------------------------------------------------
 TEST_CASE("isStrictBaseNoInheritance", "[meta]")
 {
-    constexpr bool IsStrictBaseResult =
-        alpaka::meta::IsStrictBase<
-            A, C
-        >::value;
+    constexpr bool IsStrictBaseResult = alpaka::meta::IsStrictBase<A, C>::value;
 
-    constexpr bool IsStrictBaseReference =
-        false;
+    constexpr bool IsStrictBaseReference = false;
 
-    static_assert(
-        IsStrictBaseReference == IsStrictBaseResult,
-        "alpaka::meta::IsStrictBase failed!");
+    static_assert(IsStrictBaseReference == IsStrictBaseResult, "alpaka::meta::IsStrictBase failed!");
 }
 
 //-----------------------------------------------------------------------------
 TEST_CASE("isStrictBaseWrongOrder", "[meta]")
 {
-    constexpr bool IsStrictBaseResult =
-        alpaka::meta::IsStrictBase<
-            B, A
-        >::value;
+    constexpr bool IsStrictBaseResult = alpaka::meta::IsStrictBase<B, A>::value;
 
-    constexpr bool IsStrictBaseReference =
-        false;
+    constexpr bool IsStrictBaseReference = false;
 
-    static_assert(
-        IsStrictBaseReference == IsStrictBaseResult,
-        "alpaka::meta::IsStrictBase failed!");
+    static_assert(IsStrictBaseReference == IsStrictBaseResult, "alpaka::meta::IsStrictBase failed!");
 }
diff --git a/thirdParty/cupla/alpaka/test/unit/meta/src/MetafunctionsTest.cpp b/thirdParty/cupla/alpaka/test/unit/meta/src/MetafunctionsTest.cpp
index 2134ebcf1e..7e46267f62 100644
--- a/thirdParty/cupla/alpaka/test/unit/meta/src/MetafunctionsTest.cpp
+++ b/thirdParty/cupla/alpaka/test/unit/meta/src/MetafunctionsTest.cpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Axel Huebl, Benjamin Worpitz
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -17,97 +17,55 @@
 //-----------------------------------------------------------------------------
 TEST_CASE("conjunctionTrue", "[meta]")
 {
-    using ConjunctionResult =
-        alpaka::meta::Conjunction<
-            std::true_type,
-            std::true_type,
-            std::integral_constant<bool, true>
-        >;
-
-    static_assert(
-        ConjunctionResult::value == true,
-        "alpaka::meta::Conjunction failed!");
+    using ConjunctionResult
+        = alpaka::meta::Conjunction<std::true_type, std::true_type, std::integral_constant<bool, true>>;
+
+    static_assert(ConjunctionResult::value == true, "alpaka::meta::Conjunction failed!");
 }
 
 //-----------------------------------------------------------------------------
 TEST_CASE("conjunctionFalse", "[meta]")
 {
-    using ConjunctionResult =
-        alpaka::meta::Conjunction<
-            std::true_type,
-            std::false_type,
-            std::integral_constant<bool, true>
-        >;
-
-    static_assert(
-        ConjunctionResult::value == false,
-        "alpaka::meta::Conjunction failed!");
+    using ConjunctionResult
+        = alpaka::meta::Conjunction<std::true_type, std::false_type, std::integral_constant<bool, true>>;
+
+    static_assert(ConjunctionResult::value == false, "alpaka::meta::Conjunction failed!");
 }
 
 //-----------------------------------------------------------------------------
 TEST_CASE("disjunctionTrue", "[meta]")
 {
-    using DisjunctionResult =
-        alpaka::meta::Disjunction<
-            std::false_type,
-            std::true_type,
-            std::integral_constant<bool, false>
-        >;
-
-    static_assert(
-        DisjunctionResult::value == true,
-        "alpaka::meta::Disjunction failed!");
+    using DisjunctionResult
+        = alpaka::meta::Disjunction<std::false_type, std::true_type, std::integral_constant<bool, false>>;
+
+    static_assert(DisjunctionResult::value == true, "alpaka::meta::Disjunction failed!");
 }
 
 //-----------------------------------------------------------------------------
 TEST_CASE("disjunctionFalse", "[meta]")
 {
-    using DisjunctionResult =
-        alpaka::meta::Disjunction<
-            std::false_type,
-            std::false_type,
-            std::integral_constant<bool, false>
-        >;
-
-    static_assert(
-        DisjunctionResult::value == false,
-        "alpaka::meta::Disjunction failed!");
+    using DisjunctionResult
+        = alpaka::meta::Disjunction<std::false_type, std::false_type, std::integral_constant<bool, false>>;
+
+    static_assert(DisjunctionResult::value == false, "alpaka::meta::Disjunction failed!");
 }
 
 //-----------------------------------------------------------------------------
 TEST_CASE("negationFalse", "[meta]")
 {
-    using NegationResult =
-        alpaka::meta::Negation<
-            std::true_type
-        >;
-
-    using NegationReference =
-        std::false_type;
-
-    static_assert(
-        std::is_same<
-            NegationReference,
-            NegationResult
-        >::value,
-        "alpaka::meta::Negation failed!");
+    using NegationResult = alpaka::meta::Negation<std::true_type>;
+
+    using NegationReference = std::false_type;
+
+    static_assert(std::is_same<NegationReference, NegationResult>::value, "alpaka::meta::Negation failed!");
 }
 
 //-----------------------------------------------------------------------------
 TEST_CASE("negationTrue", "[meta]")
 {
-    using NegationResult =
-        alpaka::meta::Negation<
-            std::false_type
-        >;
-
-    using NegationReference =
-        std::true_type;
-
-    static_assert(
-        std::is_same<
-            NegationReference,
-            NegationResult
-        >::value,
-        "alpaka::meta::Negation failed!");
+    using NegationResult = alpaka::meta::Negation<std::false_type>;
+
+    using NegationReference = std::true_type;
+
+    static_assert(std::is_same<NegationReference, NegationResult>::value, "alpaka::meta::Negation failed!");
 }
diff --git a/thirdParty/cupla/alpaka/test/unit/meta/src/SetTest.cpp b/thirdParty/cupla/alpaka/test/unit/meta/src/SetTest.cpp
index eb0ffc77af..3b7ad6f4dd 100644
--- a/thirdParty/cupla/alpaka/test/unit/meta/src/SetTest.cpp
+++ b/thirdParty/cupla/alpaka/test/unit/meta/src/SetTest.cpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Axel Huebl, Benjamin Worpitz
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -17,43 +17,23 @@
 //-----------------------------------------------------------------------------
 TEST_CASE("isSetTrue", "[meta]")
 {
-    using IsSetInput =
-        std::tuple<
-            int,
-            float,
-            long>;
-
-    constexpr bool IsSetResult =
-        alpaka::meta::IsSet<
-            IsSetInput
-        >::value;
-
-    constexpr bool IsSetReference =
-        true;
-
-    static_assert(
-        IsSetReference == IsSetResult,
-        "alpaka::meta::IsSet failed!");
+    using IsSetInput = std::tuple<int, float, long>;
+
+    constexpr bool IsSetResult = alpaka::meta::IsSet<IsSetInput>::value;
+
+    constexpr bool IsSetReference = true;
+
+    static_assert(IsSetReference == IsSetResult, "alpaka::meta::IsSet failed!");
 }
 
 //-----------------------------------------------------------------------------
 TEST_CASE("isSetFalse", "[meta]")
 {
-    using IsSetInput =
-        std::tuple<
-            int,
-            float,
-            int>;
-
-    constexpr bool IsSetResult =
-        alpaka::meta::IsSet<
-            IsSetInput
-        >::value;
-
-    constexpr bool IsSetReference =
-        false;
-
-    static_assert(
-        IsSetReference == IsSetResult,
-        "alpaka::meta::IsSet failed!");
+    using IsSetInput = std::tuple<int, float, int>;
+
+    constexpr bool IsSetResult = alpaka::meta::IsSet<IsSetInput>::value;
+
+    constexpr bool IsSetReference = false;
+
+    static_assert(IsSetReference == IsSetResult, "alpaka::meta::IsSet failed!");
 }
diff --git a/thirdParty/cupla/alpaka/test/unit/meta/src/TransformTest.cpp b/thirdParty/cupla/alpaka/test/unit/meta/src/TransformTest.cpp
index dba8dd0785..d8b21d6d8b 100644
--- a/thirdParty/cupla/alpaka/test/unit/meta/src/TransformTest.cpp
+++ b/thirdParty/cupla/alpaka/test/unit/meta/src/TransformTest.cpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Axel Huebl, Benjamin Worpitz
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -14,64 +14,29 @@
 #include <tuple>
 #include <type_traits>
 
-template<
-    typename T>
+template<typename T>
 using AddConst = T const;
 
 //-----------------------------------------------------------------------------
 TEST_CASE("transform", "[meta]")
 {
-    using TransformInput =
-        std::tuple<
-            int,
-            float,
-            long>;
+    using TransformInput = std::tuple<int, float, long>;
 
-    using TransformResult =
-        alpaka::meta::Transform<
-            TransformInput,
-            AddConst
-        >;
+    using TransformResult = alpaka::meta::Transform<TransformInput, AddConst>;
 
-    using TransformReference =
-        std::tuple<
-            int const,
-            float const,
-            long const>;
+    using TransformReference = std::tuple<int const, float const, long const>;
 
-    static_assert(
-        std::is_same<
-            TransformReference,
-            TransformResult
-        >::value,
-        "alpaka::meta::Transform failed!");
+    static_assert(std::is_same<TransformReference, TransformResult>::value, "alpaka::meta::Transform failed!");
 }
 
 //-----------------------------------------------------------------------------
 TEST_CASE("transformVariadic", "[meta]")
 {
-    using TransformInput =
-        std::tuple<
-            int,
-            float,
-            long>;
+    using TransformInput = std::tuple<int, float, long>;
 
-    using TransformResult =
-        alpaka::meta::Transform<
-            TransformInput,
-            std::tuple
-        >;
+    using TransformResult = alpaka::meta::Transform<TransformInput, std::tuple>;
 
-    using TransformReference =
-        std::tuple<
-            std::tuple<int>,
-            std::tuple<float>,
-            std::tuple<long>>;
+    using TransformReference = std::tuple<std::tuple<int>, std::tuple<float>, std::tuple<long>>;
 
-    static_assert(
-        std::is_same<
-            TransformReference,
-            TransformResult
-        >::value,
-        "alpaka::meta::Transform failed!");
+    static_assert(std::is_same<TransformReference, TransformResult>::value, "alpaka::meta::Transform failed!");
 }
diff --git a/thirdParty/cupla/alpaka/test/unit/meta/src/UniqueTest.cpp b/thirdParty/cupla/alpaka/test/unit/meta/src/UniqueTest.cpp
index 36cb4665d4..0338ed883b 100644
--- a/thirdParty/cupla/alpaka/test/unit/meta/src/UniqueTest.cpp
+++ b/thirdParty/cupla/alpaka/test/unit/meta/src/UniqueTest.cpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Axel Huebl, Benjamin Worpitz
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -17,54 +17,23 @@
 //-----------------------------------------------------------------------------
 TEST_CASE("uniqueWithDuplicate", "[meta]")
 {
-    using UniqueInput =
-        std::tuple<
-            int,
-            float,
-            int,
-            float,
-            float,
-            int>;
+    using UniqueInput = std::tuple<int, float, int, float, float, int>;
 
-    using UniqueResult =
-        alpaka::meta::Unique<
-            UniqueInput
-        >;
+    using UniqueResult = alpaka::meta::Unique<UniqueInput>;
 
-    using UniqueReference =
-        std::tuple<
-            int,
-            float>;
+    using UniqueReference = std::tuple<int, float>;
 
-    static_assert(
-        std::is_same<
-            UniqueReference,
-            UniqueResult
-        >::value,
-        "alpaka::meta::Unique failed!");
+    static_assert(std::is_same<UniqueReference, UniqueResult>::value, "alpaka::meta::Unique failed!");
 }
 
 //-----------------------------------------------------------------------------
 TEST_CASE("uniqueWithoutDuplicate", "[meta]")
 {
-    using UniqueInput =
-        std::tuple<
-            int,
-            float,
-            double>;
+    using UniqueInput = std::tuple<int, float, double>;
 
-    using UniqueResult =
-        alpaka::meta::Unique<
-            UniqueInput
-        >;
+    using UniqueResult = alpaka::meta::Unique<UniqueInput>;
 
-    using UniqueReference =
-        UniqueInput;
+    using UniqueReference = UniqueInput;
 
-    static_assert(
-        std::is_same<
-            UniqueReference,
-            UniqueResult
-        >::value,
-        "alpaka::meta::Unique failed!");
+    static_assert(std::is_same<UniqueReference, UniqueResult>::value, "alpaka::meta::Unique failed!");
 }
diff --git a/thirdParty/cupla/alpaka/test/unit/meta/src/VoidTest.cpp b/thirdParty/cupla/alpaka/test/unit/meta/src/VoidTest.cpp
new file mode 100644
index 0000000000..c2c0d21d43
--- /dev/null
+++ b/thirdParty/cupla/alpaka/test/unit/meta/src/VoidTest.cpp
@@ -0,0 +1,58 @@
+/* Copyright 2020 Sergei Bastrakov
+ *
+ * This file is part of alpaka.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#include <alpaka/meta/Void.hpp>
+
+#include <catch2/catch.hpp>
+
+#include <type_traits>
+#include <utility>
+#include <vector>
+
+//-----------------------------------------------------------------------------
+// Void<> applied to a non-empty type list has to yield plain void.
+TEST_CASE("voidNonEmpty", "[meta]")
+{
+    using Result = alpaka::meta::Void<int, float, int>;
+    REQUIRE(std::is_same<void, Result>::value);
+}
+
+//-----------------------------------------------------------------------------
+// Void<> applied to an empty type list has to yield plain void as well.
+TEST_CASE("voidEmpty", "[meta]")
+{
+    using Result = alpaka::meta::Void<>;
+    REQUIRE(std::is_same<void, Result>::value);
+}
+
+//-----------------------------------------------------------------------------
+//#############################################################################
+//! Trait to detect if the given class has a method size().
+//! This illustrates and tests the technique of using Void<> to compile-time
+//! check for methods (and members can be treated similarly).
+template<class T, class = void>
+struct HasMethodSize : std::false_type
+{
+};
+
+//! Specialization selected only when the expression std::declval<T&>().size() is well-formed.
+template<class T>
+struct HasMethodSize<T, alpaka::meta::Void<decltype(std::declval<T&>().size())>> : std::true_type
+{
+};
+
+// int has no size() member, std::vector<float> does; the trait has to tell them apart.
+TEST_CASE("voidSFINAE", "[meta]")
+{
+    using DoesIntHaveMethodSize = HasMethodSize<int>;
+    REQUIRE(false == DoesIntHaveMethodSize::value);
+
+    using DoesVectorHaveMethodSize = HasMethodSize<std::vector<float>>;
+    REQUIRE(true == DoesVectorHaveMethodSize::value);
+}
diff --git a/thirdParty/cupla/alpaka/test/unit/queue/CMakeLists.txt b/thirdParty/cupla/alpaka/test/unit/queue/CMakeLists.txt
index f59808d505..6b586ed1c6 100644
--- a/thirdParty/cupla/alpaka/test/unit/queue/CMakeLists.txt
+++ b/thirdParty/cupla/alpaka/test/unit/queue/CMakeLists.txt
@@ -1,27 +1,24 @@
 #
-# Copyright 2017-2019 Benjamin Worpitz, Axel Huebl
+# Copyright 2017-2020 Benjamin Worpitz, Axel Huebl, Jan Stephan
 #
-# This file is part of Alpaka.
+# This file is part of alpaka.
 #
 # This Source Code Form is subject to the terms of the Mozilla Public
 # License, v. 2.0. If a copy of the MPL was not distributed with this
 # file, You can obtain one at http://mozilla.org/MPL/2.0/.
 #
 
-SET(_TARGET_NAME "queue")
+set(_TARGET_NAME "queue")
 
 append_recursive_files_add_to_src_group("src/" "src/" "cpp" _FILES_SOURCE)
 
-ALPAKA_ADD_EXECUTABLE(
+alpaka_add_executable(
     ${_TARGET_NAME}
     ${_FILES_SOURCE})
-TARGET_INCLUDE_DIRECTORIES(
-    ${_TARGET_NAME}
-    PRIVATE ${Boost_INCLUDE_DIRS})
-TARGET_LINK_LIBRARIES(
+target_link_libraries(
     ${_TARGET_NAME}
     PRIVATE common)
 
-SET_TARGET_PROPERTIES(${_TARGET_NAME} PROPERTIES FOLDER "test/unit")
+set_target_properties(${_TARGET_NAME} PROPERTIES FOLDER "test/unit")
 
-ADD_TEST(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME} ${_ALPAKA_TEST_OPTIONS})
+add_test(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME} ${_ALPAKA_TEST_OPTIONS})
diff --git a/thirdParty/cupla/alpaka/test/unit/queue/src/CollectiveQueue.cpp b/thirdParty/cupla/alpaka/test/unit/queue/src/CollectiveQueue.cpp
index 3fb220a3b3..84a9e68214 100644
--- a/thirdParty/cupla/alpaka/test/unit/queue/src/CollectiveQueue.cpp
+++ b/thirdParty/cupla/alpaka/test/unit/queue/src/CollectiveQueue.cpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Axel Huebl, Benjamin Worpitz
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -9,28 +9,25 @@
 
 #ifdef ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLED
 
-#if _OPENMP < 200203
-    #error If ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLED is set, the compiler has to support OpenMP 2.0 or higher!
-#endif
+#    if _OPENMP < 200203
+#        error If ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLED is set, the compiler has to support OpenMP 2.0 or higher!
+#    endif
 
-#include <alpaka/alpaka.hpp>
-#include <alpaka/test/queue/Queue.hpp>
-#include <alpaka/test/queue/QueueTestFixture.hpp>
-#include <alpaka/test/queue/QueueCpuOmp2Collective.hpp>
+#    include <alpaka/alpaka.hpp>
+#    include <alpaka/test/queue/Queue.hpp>
+#    include <alpaka/test/queue/QueueCpuOmp2Collective.hpp>
+#    include <alpaka/test/queue/QueueTestFixture.hpp>
 
-#include <vector>
+#    include <catch2/catch.hpp>
 
-#include <catch2/catch.hpp>
+#    include <vector>
 
 struct QueueCollectiveTestKernel
 {
     template<typename TAcc>
-    auto operator()(
-        TAcc const & acc,
-        int* resultsPtr) const
-    -> void
+    auto operator()(TAcc const& acc, int* resultsPtr) const -> void
     {
-        size_t threadId = alpaka::idx::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[0];
+        size_t threadId = alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[0];
         // avoid that one thread is doing all the work
         std::this_thread::sleep_for(std::chrono::milliseconds(200u * threadId));
         resultsPtr[threadId] = static_cast<int>(threadId);
@@ -40,43 +37,36 @@ struct QueueCollectiveTestKernel
 TEST_CASE("queueCollective", "[queue]")
 {
     // Define the index domain
-    using Dim = alpaka::dim::DimInt<1>;
+    using Dim = alpaka::DimInt<1>;
     using Idx = size_t;
 
     // Define the accelerator
-    using Acc = alpaka::acc::AccCpuOmp2Blocks<Dim, Idx>;
-    using Dev = alpaka::dev::Dev<Acc>;
+    using Acc = alpaka::AccCpuOmp2Blocks<Dim, Idx>;
+    using Dev = alpaka::Dev<Acc>;
 
-    using Queue = alpaka::queue::QueueCpuOmp2Collective;
-    using Pltf = alpaka::pltf::Pltf<Dev>;
+    using Queue = alpaka::QueueCpuOmp2Collective;
+    using Pltf = alpaka::Pltf<Dev>;
 
-    auto dev = alpaka::pltf::getDevByIdx<Pltf>(0u);
+    auto dev = alpaka::getDevByIdx<Pltf>(0u);
     Queue queue(dev);
 
     std::vector<int> results(4, -1);
 
-    using Vec = alpaka::vec::Vec<Dim, Idx>;
+    using Vec = alpaka::Vec<Dim, Idx>;
     Vec const elementsPerThread(Vec::all(static_cast<Idx>(1)));
     Vec const threadsPerBlock(Vec::all(static_cast<Idx>(1)));
     Vec const blocksPerGrid(results.size());
 
-    using WorkDiv = alpaka::workdiv::WorkDivMembers<Dim, Idx>;
-    WorkDiv const workDiv(
-        blocksPerGrid,
-        threadsPerBlock,
-        elementsPerThread);
+    using WorkDiv = alpaka::WorkDivMembers<Dim, Idx>;
+    WorkDiv const workDiv(blocksPerGrid, threadsPerBlock, elementsPerThread);
 
-    #pragma omp parallel num_threads(static_cast<int>(results.size()))
+#    pragma omp parallel num_threads(static_cast <int>(results.size()))
     {
         // The kernel will be performed collectively.
         // OpenMP will distribute the work between the threads from the parallel region
-        alpaka::kernel::exec<Acc>(
-               queue,
-               workDiv,
-               QueueCollectiveTestKernel{},
-               results.data());
+        alpaka::exec<Acc>(queue, workDiv, QueueCollectiveTestKernel{}, results.data());
 
-        alpaka::wait::wait(queue);
+        alpaka::wait(queue);
     }
 
     for(size_t i = 0; i < results.size(); ++i)
@@ -87,60 +77,49 @@ TEST_CASE("queueCollective", "[queue]")
 
 TEST_CASE("TestCollectiveMemcpy", "[queue]")
 {
-     // Define the index domain
-    using Dim = alpaka::dim::DimInt<1>;
+    // Define the index domain
+    using Dim = alpaka::DimInt<1>;
     using Idx = size_t;
 
     // Define the accelerator
-    using Acc = alpaka::acc::AccCpuOmp2Blocks<Dim, Idx>;
-    using Dev = alpaka::dev::Dev<Acc>;
+    using Acc = alpaka::AccCpuOmp2Blocks<Dim, Idx>;
+    using Dev = alpaka::Dev<Acc>;
 
-    using Queue = alpaka::queue::QueueCpuOmp2Collective;
-    using Pltf = alpaka::pltf::Pltf<Dev>;
+    using Queue = alpaka::QueueCpuOmp2Collective;
+    using Pltf = alpaka::Pltf<Dev>;
 
-    auto dev = alpaka::pltf::getDevByIdx<Pltf>(0u);
+    auto dev = alpaka::getDevByIdx<Pltf>(0u);
     Queue queue(dev);
 
     std::vector<int> results(4, -1);
 
     // Define the work division
-    using Vec = alpaka::vec::Vec<Dim, Idx>;
+    using Vec = alpaka::Vec<Dim, Idx>;
     Vec const elementsPerThread(Vec::all(static_cast<Idx>(1)));
     Vec const threadsPerBlock(Vec::all(static_cast<Idx>(1)));
     Vec const blocksPerGrid(results.size());
 
-    using WorkDiv = alpaka::workdiv::WorkDivMembers<Dim, Idx>;
-    WorkDiv const workDiv(
-        blocksPerGrid,
-        threadsPerBlock,
-        elementsPerThread);
+    using WorkDiv = alpaka::WorkDivMembers<Dim, Idx>;
+    WorkDiv const workDiv(blocksPerGrid, threadsPerBlock, elementsPerThread);
 
-    #pragma omp parallel num_threads(static_cast<int>(results.size()))
+#    pragma omp parallel num_threads(static_cast <int>(results.size()))
     {
         int threadId = omp_get_thread_num();
 
-        using View = alpaka::mem::view::ViewPlainPtr<Dev, int, Dim, Idx>;
+        using View = alpaka::ViewPlainPtr<Dev, int, Dim, Idx>;
 
-        View dst(
-            results.data() + threadId,
-            dev,
-            Vec(static_cast<Idx>(1u)),
-            Vec(sizeof(int)));
+        View dst(results.data() + threadId, dev, Vec(static_cast<Idx>(1u)), Vec(sizeof(int)));
 
-        View src(
-            &threadId,
-            dev,
-            Vec(static_cast<Idx>(1u)),
-            Vec(sizeof(int)));
+        View src(&threadId, dev, Vec(static_cast<Idx>(1u)), Vec(sizeof(int)));
 
         // avoid that the first thread is executing the copy (can not be guaranteed)
         size_t sleep_ms = (results.size() - static_cast<uint32_t>(threadId)) * 100u;
         std::this_thread::sleep_for(std::chrono::milliseconds(sleep_ms));
 
         // only one thread will perform this memcpy
-        alpaka::mem::view::copy(queue, dst, src, Vec(static_cast<Idx>(1u)));
+        alpaka::memcpy(queue, dst, src, Vec(static_cast<Idx>(1u)));
 
-        alpaka::wait::wait(queue);
+        alpaka::wait(queue);
     }
 
     uint32_t numFlippedValues = 0u;
diff --git a/thirdParty/cupla/alpaka/test/unit/queue/src/QueueTest.cpp b/thirdParty/cupla/alpaka/test/unit/queue/src/QueueTest.cpp
index 9b9b6461d9..d2e7489ac7 100644
--- a/thirdParty/cupla/alpaka/test/unit/queue/src/QueueTest.cpp
+++ b/thirdParty/cupla/alpaka/test/unit/queue/src/QueueTest.cpp
@@ -1,18 +1,16 @@
 /* Copyright 2019 Axel Huebl, Benjamin Worpitz
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
  */
 
-#include <alpaka/queue/Traits.hpp>
 #include <alpaka/meta/Concatenate.hpp>
-
-#include <alpaka/test/queue/QueueCpuOmp2Collective.hpp>
-
+#include <alpaka/queue/Traits.hpp>
 #include <alpaka/test/queue/Queue.hpp>
+#include <alpaka/test/queue/QueueCpuOmp2Collective.hpp>
 #include <alpaka/test/queue/QueueTestFixture.hpp>
 
 #include <catch2/catch.hpp>
@@ -21,99 +19,91 @@
 #include <thread>
 
 using TestQueues = alpaka::meta::Concatenate<
-        alpaka::test::queue::TestQueues
- #ifdef ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLED
-        ,
-        std::tuple<std::tuple<alpaka::dev::DevCpu, alpaka::queue::QueueCpuOmp2Collective>>
+    alpaka::test::TestQueues
+#ifdef ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLED
+    ,
+    std::tuple<std::tuple<alpaka::DevCpu, alpaka::QueueCpuOmp2Collective>>
 #endif
     >;
 
 //-----------------------------------------------------------------------------
-TEMPLATE_LIST_TEST_CASE( "queueIsInitiallyEmpty", "[queue]", TestQueues)
+TEMPLATE_LIST_TEST_CASE("queueIsInitiallyEmpty", "[queue]", TestQueues)
 {
     using DevQueue = TestType;
-    using Fixture = alpaka::test::queue::QueueTestFixture<DevQueue>;
+    using Fixture = alpaka::test::QueueTestFixture<DevQueue>;
     Fixture f;
 
-    CHECK(alpaka::queue::empty(f.m_queue));
+    CHECK(alpaka::empty(f.m_queue));
 }
 
 #if !BOOST_COMP_HIP // HIP-clang is currently not supporting callbacks
 
 //-----------------------------------------------------------------------------
-TEMPLATE_LIST_TEST_CASE( "queueCallbackIsWorking", "[queue]", TestQueues)
+TEMPLATE_LIST_TEST_CASE("queueCallbackIsWorking", "[queue]", TestQueues)
 {
 // Workaround: Clang can not support this when natively compiling device code. See ConcurrentExecPool.hpp.
-#if !(BOOST_COMP_CLANG_CUDA && BOOST_ARCH_PTX)
+#    if !(BOOST_COMP_CLANG_CUDA && BOOST_ARCH_PTX)
     using DevQueue = TestType;
-    using Fixture = alpaka::test::queue::QueueTestFixture<DevQueue>;
+    using Fixture = alpaka::test::QueueTestFixture<DevQueue>;
     Fixture f;
 
     std::promise<bool> promise;
 
-    alpaka::queue::enqueue(
-        f.m_queue,
-        [&](){
-            promise.set_value(true);
-        }
-    );
+    alpaka::enqueue(f.m_queue, [&]() { promise.set_value(true); });
 
     CHECK(promise.get_future().get());
-#endif
+#    endif
 }
 
 //-----------------------------------------------------------------------------
-TEMPLATE_LIST_TEST_CASE( "queueWaitShouldWork", "[queue]", TestQueues)
+TEMPLATE_LIST_TEST_CASE("queueWaitShouldWork", "[queue]", TestQueues)
 {
     using DevQueue = TestType;
-    using Fixture = alpaka::test::queue::QueueTestFixture<DevQueue>;
+    using Fixture = alpaka::test::QueueTestFixture<DevQueue>;
     Fixture f;
 
     bool CallbackFinished = false;
-    alpaka::queue::enqueue(
-        f.m_queue,
-        [&CallbackFinished]() noexcept
-        {
-            std::this_thread::sleep_for(std::chrono::milliseconds(100u));
-            CallbackFinished = true;
-        });
+    alpaka::enqueue(f.m_queue, [&CallbackFinished]() noexcept {
+        std::this_thread::sleep_for(std::chrono::milliseconds(100u));
+        CallbackFinished = true;
+    });
 
-    alpaka::wait::wait(f.m_queue);
+    alpaka::wait(f.m_queue);
     CHECK(CallbackFinished);
 }
 
 //-----------------------------------------------------------------------------
-TEMPLATE_LIST_TEST_CASE( "queueShouldNotBeEmptyWhenLastTaskIsStillExecutingAndIsEmptyAfterProcessingFinished", "[queue]", TestQueues)
+TEMPLATE_LIST_TEST_CASE(
+    "queueShouldNotBeEmptyWhenLastTaskIsStillExecutingAndIsEmptyAfterProcessingFinished",
+    "[queue]",
+    TestQueues)
 {
     using DevQueue = TestType;
-    using Fixture = alpaka::test::queue::QueueTestFixture<DevQueue>;
+    using Fixture = alpaka::test::QueueTestFixture<DevQueue>;
     Fixture f;
 
     bool CallbackFinished = false;
-    alpaka::queue::enqueue(
-        f.m_queue,
-        [&f, &CallbackFinished]() noexcept
-        {
-            CHECK(!alpaka::queue::empty(f.m_queue));
-            std::this_thread::sleep_for(std::chrono::milliseconds(100u));
-            CallbackFinished = true;
-        });
+    alpaka::enqueue(f.m_queue, [&f, &CallbackFinished]() noexcept {
+        CHECK(!alpaka::empty(f.m_queue));
+        std::this_thread::sleep_for(std::chrono::milliseconds(100u));
+        CallbackFinished = true;
+    });
 
-    // A non-blocking queue will always stay empty because the task has been executed immediately.
+    // A blocking queue will always stay empty because the task has been executed immediately.
-    if(!alpaka::test::queue::IsBlockingQueue<typename Fixture::Queue>::value)
+    if(!alpaka::test::IsBlockingQueue<typename Fixture::Queue>::value)
     {
-        alpaka::wait::wait(f.m_queue);
+        alpaka::wait(f.m_queue);
     }
 
-    CHECK(alpaka::queue::empty(f.m_queue));
+    CHECK(alpaka::empty(f.m_queue));
     CHECK(CallbackFinished);
 }
 
 //-----------------------------------------------------------------------------
-TEMPLATE_LIST_TEST_CASE( "queueShouldNotExecuteTasksInParallel", "[queue]", TestQueues)
+TEMPLATE_LIST_TEST_CASE("queueShouldNotExecuteTasksInParallel", "[queue]", TestQueues)
 {
     using DevQueue = TestType;
-    using Fixture = alpaka::test::queue::QueueTestFixture<DevQueue>;
+    using Fixture = alpaka::test::QueueTestFixture<DevQueue>;
     Fixture f;
 
     std::atomic<bool> taskIsExecuting(false);
@@ -122,35 +112,29 @@ TEMPLATE_LIST_TEST_CASE( "queueShouldNotExecuteTasksInParallel", "[queue]", Test
     std::promise<void> secondTaskFinished;
     std::future<void> secondTaskFinishedFuture = secondTaskFinished.get_future();
 
-    std::thread thread1([&f, &taskIsExecuting, &firstTaskFinished](){
-        alpaka::queue::enqueue(
-            f.m_queue,
-            [&taskIsExecuting, &firstTaskFinished]() noexcept
-            {
-                CHECK(!taskIsExecuting.exchange(true));
-                std::this_thread::sleep_for(std::chrono::milliseconds(100u));
-                CHECK(taskIsExecuting.exchange(false));
-                firstTaskFinished.set_value();
-            });
+    std::thread thread1([&f, &taskIsExecuting, &firstTaskFinished]() {
+        alpaka::enqueue(f.m_queue, [&taskIsExecuting, &firstTaskFinished]() noexcept {
+            CHECK(!taskIsExecuting.exchange(true));
+            std::this_thread::sleep_for(std::chrono::milliseconds(100u));
+            CHECK(taskIsExecuting.exchange(false));
+            firstTaskFinished.set_value();
+        });
     });
 
-    std::thread thread2([&f, &taskIsExecuting, &secondTaskFinished](){
-        alpaka::queue::enqueue(
-            f.m_queue,
-            [&taskIsExecuting, &secondTaskFinished]() noexcept
-            {
-                CHECK(!taskIsExecuting.exchange(true));
-                std::this_thread::sleep_for(std::chrono::milliseconds(100u));
-                CHECK(taskIsExecuting.exchange(false));
-                secondTaskFinished.set_value();
-            });
+    std::thread thread2([&f, &taskIsExecuting, &secondTaskFinished]() {
+        alpaka::enqueue(f.m_queue, [&taskIsExecuting, &secondTaskFinished]() noexcept {
+            CHECK(!taskIsExecuting.exchange(true));
+            std::this_thread::sleep_for(std::chrono::milliseconds(100u));
+            CHECK(taskIsExecuting.exchange(false));
+            secondTaskFinished.set_value();
+        });
     });
 
     // Both tasks have to be enqueued
     thread1.join();
     thread2.join();
 
-    alpaka::wait::wait(f.m_queue);
+    alpaka::wait(f.m_queue);
 
     firstTaskFinishedFuture.get();
     secondTaskFinishedFuture.get();
diff --git a/thirdParty/cupla/alpaka/test/unit/rand/CMakeLists.txt b/thirdParty/cupla/alpaka/test/unit/rand/CMakeLists.txt
index bc7f7c274d..21101813e7 100644
--- a/thirdParty/cupla/alpaka/test/unit/rand/CMakeLists.txt
+++ b/thirdParty/cupla/alpaka/test/unit/rand/CMakeLists.txt
@@ -1,27 +1,24 @@
 #
-# Copyright 2017-2019 Benjamin Worpitz, Axel Huebl
+# Copyright 2017-2020 Benjamin Worpitz, Axel Huebl, Jan Stephan
 #
-# This file is part of Alpaka.
+# This file is part of alpaka.
 #
 # This Source Code Form is subject to the terms of the Mozilla Public
 # License, v. 2.0. If a copy of the MPL was not distributed with this
 # file, You can obtain one at http://mozilla.org/MPL/2.0/.
 #
 
-SET(_TARGET_NAME "rand")
+set(_TARGET_NAME "rand")
 
 append_recursive_files_add_to_src_group("src/" "src/" "cpp" _FILES_SOURCE)
 
-ALPAKA_ADD_EXECUTABLE(
+alpaka_add_executable(
     ${_TARGET_NAME}
     ${_FILES_SOURCE})
-TARGET_INCLUDE_DIRECTORIES(
-    ${_TARGET_NAME}
-    PRIVATE ${Boost_INCLUDE_DIRS})
-TARGET_LINK_LIBRARIES(
+target_link_libraries(
     ${_TARGET_NAME}
     PRIVATE common)
 
-SET_TARGET_PROPERTIES(${_TARGET_NAME} PROPERTIES FOLDER "test/unit")
+set_target_properties(${_TARGET_NAME} PROPERTIES FOLDER "test/unit")
 
-ADD_TEST(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME} ${_ALPAKA_TEST_OPTIONS})
+add_test(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME} ${_ALPAKA_TEST_OPTIONS})
diff --git a/thirdParty/cupla/alpaka/test/unit/rand/src/RandTest.cpp b/thirdParty/cupla/alpaka/test/unit/rand/src/RandTest.cpp
index 15fa2f3867..0222d1c8be 100644
--- a/thirdParty/cupla/alpaka/test/unit/rand/src/RandTest.cpp
+++ b/thirdParty/cupla/alpaka/test/unit/rand/src/RandTest.cpp
@@ -1,6 +1,6 @@
 /* Copyright 2019 Axel Huebl, Benjamin Worpitz, Matthias Werner, René Widera
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -8,9 +8,8 @@
  */
 
 #include <alpaka/rand/Traits.hpp>
-
-#include <alpaka/test/acc/TestAccs.hpp>
 #include <alpaka/test/KernelExecutionFixture.hpp>
+#include <alpaka/test/acc/TestAccs.hpp>
 
 #include <catch2/catch.hpp>
 
@@ -18,16 +17,8 @@
 class RandTestKernel
 {
     ALPAKA_NO_HOST_ACC_WARNING
-    template<
-        typename TAcc,
-        typename T_Generator
-    >
-    ALPAKA_FN_ACC void
-    genNumbers(
-        TAcc const & acc,
-        bool * success,
-        T_Generator & gen
-    ) const
+    template<typename TAcc, typename T_Generator>
+    ALPAKA_FN_ACC void genNumbers(TAcc const& acc, bool* success, T_Generator& gen) const
     {
         {
             auto dist(alpaka::rand::distribution::createNormalReal<float>(acc));
@@ -70,66 +61,47 @@ class RandTestKernel
     }
 
 public:
-
     //-----------------------------------------------------------------------------
     ALPAKA_NO_HOST_ACC_WARNING
-    template<
-        typename TAcc>
-    ALPAKA_FN_ACC auto operator()(
-        TAcc const & acc,
-        bool * success) const
-    -> void
+    template<typename TAcc>
+    ALPAKA_FN_ACC auto operator()(TAcc const& acc, bool* success) const -> void
     {
         // default generator for accelerator
-        auto genDefault = alpaka::rand::generator::createDefault(
-            acc,
-            12345u,
-            6789u
-        );
-        genNumbers( acc, success, genDefault );
+        auto genDefault = alpaka::rand::generator::createDefault(acc, 12345u, 6789u);
+        genNumbers(acc, success, genDefault);
 
-#if !defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && \
-  !defined(ALPAKA_ACC_GPU_HIP_ENABLED)
+#if !defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && !defined(ALPAKA_ACC_GPU_HIP_ENABLED)
+#    ifndef ALPAKA_ACC_ANY_BT_OMP5_ENABLED
+        // TODO: These ifdefs are wrong: They will reduce the test to the
+        // smallest common denominator from all enabled backends
         // std::random_device
-        auto genRandomDevice = alpaka::rand::generator::createDefault(
-            alpaka::rand::RandomDevice{},
-            12345u,
-            6789u
-        );
-        genNumbers( acc, success, genRandomDevice );
+        auto genRandomDevice = alpaka::rand::generator::createDefault(alpaka::rand::RandomDevice{}, 12345u, 6789u);
+        genNumbers(acc, success, genRandomDevice);
 
         // MersenneTwister
-        auto genMersenneTwister = alpaka::rand::generator::createDefault(
-            alpaka::rand::MersenneTwister{},
-            12345u,
-            6789u
-        );
-        genNumbers( acc, success, genMersenneTwister );
+        auto genMersenneTwister
+            = alpaka::rand::generator::createDefault(alpaka::rand::MersenneTwister{}, 12345u, 6789u);
+        genNumbers(acc, success, genMersenneTwister);
+#    endif
 
         // TinyMersenneTwister
-        auto genTinyMersenneTwister = alpaka::rand::generator::createDefault(
-            alpaka::rand::TinyMersenneTwister{},
-            12345u,
-            6789u
-        );
-        genNumbers( acc, success, genTinyMersenneTwister );
+        auto genTinyMersenneTwister
+            = alpaka::rand::generator::createDefault(alpaka::rand::TinyMersenneTwister{}, 12345u, 6789u);
+        genNumbers(acc, success, genTinyMersenneTwister);
 #endif
     }
 };
 
 //-----------------------------------------------------------------------------
-TEMPLATE_LIST_TEST_CASE( "defaultRandomGeneratorIsWorking", "[rand]", alpaka::test::acc::TestAccs)
+TEMPLATE_LIST_TEST_CASE("defaultRandomGeneratorIsWorking", "[rand]", alpaka::test::TestAccs)
 {
     using Acc = TestType;
-    using Dim = alpaka::dim::Dim<Acc>;
-    using Idx = alpaka::idx::Idx<Acc>;
+    using Dim = alpaka::Dim<Acc>;
+    using Idx = alpaka::Idx<Acc>;
 
-    alpaka::test::KernelExecutionFixture<Acc> fixture(
-        alpaka::vec::Vec<Dim, Idx>::ones());
+    alpaka::test::KernelExecutionFixture<Acc> fixture(alpaka::Vec<Dim, Idx>::ones());
 
     RandTestKernel kernel;
 
-    REQUIRE(
-        fixture(
-            kernel));
+    REQUIRE(fixture(kernel));
 }
diff --git a/thirdParty/cupla/alpaka/test/unit/time/CMakeLists.txt b/thirdParty/cupla/alpaka/test/unit/time/CMakeLists.txt
index f92d82b3c0..0cc9a1e35a 100644
--- a/thirdParty/cupla/alpaka/test/unit/time/CMakeLists.txt
+++ b/thirdParty/cupla/alpaka/test/unit/time/CMakeLists.txt
@@ -1,27 +1,24 @@
 #
-# Copyright 2016-2019 Benjamin Worpitz, Axel Huebl
+# Copyright 2016-2020 Benjamin Worpitz, Axel Huebl, Jan Stephan
 #
-# This file is part of Alpaka.
+# This file is part of alpaka.
 #
 # This Source Code Form is subject to the terms of the Mozilla Public
 # License, v. 2.0. If a copy of the MPL was not distributed with this
 # file, You can obtain one at http://mozilla.org/MPL/2.0/.
 #
 
-SET(_TARGET_NAME "time")
+set(_TARGET_NAME "time")
 
 append_recursive_files_add_to_src_group("src/" "src/" "cpp" _FILES_SOURCE)
 
-ALPAKA_ADD_EXECUTABLE(
+alpaka_add_executable(
     ${_TARGET_NAME}
     ${_FILES_SOURCE})
-TARGET_INCLUDE_DIRECTORIES(
-    ${_TARGET_NAME}
-    PRIVATE ${Boost_INCLUDE_DIRS})
-TARGET_LINK_LIBRARIES(
+target_link_libraries(
     ${_TARGET_NAME}
     PRIVATE common)
 
-SET_TARGET_PROPERTIES(${_TARGET_NAME} PROPERTIES FOLDER "test/unit")
+set_target_properties(${_TARGET_NAME} PROPERTIES FOLDER "test/unit")
 
-ADD_TEST(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME} ${_ALPAKA_TEST_OPTIONS})
+add_test(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME} ${_ALPAKA_TEST_OPTIONS})
diff --git a/thirdParty/cupla/alpaka/test/unit/time/src/ClockTest.cpp b/thirdParty/cupla/alpaka/test/unit/time/src/ClockTest.cpp
index dd99b86d02..13957f0608 100644
--- a/thirdParty/cupla/alpaka/test/unit/time/src/ClockTest.cpp
+++ b/thirdParty/cupla/alpaka/test/unit/time/src/ClockTest.cpp
@@ -1,16 +1,15 @@
 /* Copyright 2019 Axel Huebl, Benjamin Worpitz, René Widera
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
  */
 
-#include <alpaka/time/Traits.hpp>
-
-#include <alpaka/test/acc/TestAccs.hpp>
 #include <alpaka/test/KernelExecutionFixture.hpp>
+#include <alpaka/test/acc/TestAccs.hpp>
+#include <alpaka/time/Traits.hpp>
 
 #include <catch2/catch.hpp>
 
@@ -20,19 +19,13 @@ class ClockTestKernel
 public:
     //-----------------------------------------------------------------------------
     ALPAKA_NO_HOST_ACC_WARNING
-    template<
-        typename TAcc>
-    ALPAKA_FN_ACC auto operator()(
-        TAcc const & acc,
-        bool * success) const
-    -> void
+    template<typename TAcc>
+    ALPAKA_FN_ACC auto operator()(TAcc const& acc, bool* success) const -> void
     {
-        std::uint64_t const start(
-            alpaka::time::clock(acc));
+        std::uint64_t const start(alpaka::clock(acc));
         ALPAKA_CHECK(*success, 0u != start);
 
-        std::uint64_t const end(
-            alpaka::time::clock(acc));
+        std::uint64_t const end(alpaka::clock(acc));
         ALPAKA_CHECK(*success, 0u != end);
 
         // 'end' has to be greater equal 'start'.
@@ -42,14 +35,13 @@ class ClockTestKernel
 };
 
 //-----------------------------------------------------------------------------
-TEMPLATE_LIST_TEST_CASE( "clockIsWorking", "[timeClock]", alpaka::test::acc::TestAccs)
+TEMPLATE_LIST_TEST_CASE("clockIsWorking", "[timeClock]", alpaka::test::TestAccs)
 {
     using Acc = TestType;
-    using Dim = alpaka::dim::Dim<Acc>;
-    using Idx = alpaka::idx::Idx<Acc>;
+    using Dim = alpaka::Dim<Acc>;
+    using Idx = alpaka::Idx<Acc>;
 
-    alpaka::test::KernelExecutionFixture<Acc> fixture(
-        alpaka::vec::Vec<Dim, Idx>::ones());
+    alpaka::test::KernelExecutionFixture<Acc> fixture(alpaka::Vec<Dim, Idx>::ones());
 
     ClockTestKernel kernel;
 
diff --git a/thirdParty/cupla/alpaka/test/unit/vec/CMakeLists.txt b/thirdParty/cupla/alpaka/test/unit/vec/CMakeLists.txt
index bcacd96981..229b0ef2e7 100644
--- a/thirdParty/cupla/alpaka/test/unit/vec/CMakeLists.txt
+++ b/thirdParty/cupla/alpaka/test/unit/vec/CMakeLists.txt
@@ -1,27 +1,24 @@
 #
-# Copyright 2014-2019 Benjamin Worpitz, Axel Huebl
+# Copyright 2014-2020 Benjamin Worpitz, Axel Huebl, Jan Stephan
 #
-# This file is part of Alpaka.
+# This file is part of alpaka.
 #
 # This Source Code Form is subject to the terms of the Mozilla Public
 # License, v. 2.0. If a copy of the MPL was not distributed with this
 # file, You can obtain one at http://mozilla.org/MPL/2.0/.
 #
 
-SET(_TARGET_NAME "vec")
+set(_TARGET_NAME "vec")
 
 append_recursive_files_add_to_src_group("src/" "src/" "cpp" _FILES_SOURCE)
 
-ALPAKA_ADD_EXECUTABLE(
+alpaka_add_executable(
     ${_TARGET_NAME}
     ${_FILES_SOURCE})
-TARGET_INCLUDE_DIRECTORIES(
-    ${_TARGET_NAME}
-    PRIVATE ${Boost_INCLUDE_DIRS})
-TARGET_LINK_LIBRARIES(
+target_link_libraries(
     ${_TARGET_NAME}
     PRIVATE common)
 
-SET_TARGET_PROPERTIES(${_TARGET_NAME} PROPERTIES FOLDER "test/unit")
+set_target_properties(${_TARGET_NAME} PROPERTIES FOLDER "test/unit")
 
-ADD_TEST(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME} ${_ALPAKA_TEST_OPTIONS})
+add_test(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME} ${_ALPAKA_TEST_OPTIONS})
diff --git a/thirdParty/cupla/alpaka/test/unit/vec/src/VecTest.cpp b/thirdParty/cupla/alpaka/test/unit/vec/src/VecTest.cpp
index 20ce0ffb52..3cd920a033 100644
--- a/thirdParty/cupla/alpaka/test/unit/vec/src/VecTest.cpp
+++ b/thirdParty/cupla/alpaka/test/unit/vec/src/VecTest.cpp
@@ -1,68 +1,53 @@
 /* Copyright 2019 Axel Huebl, Benjamin Worpitz, Erik Zenker
  *
- * This file is part of Alpaka.
+ * This file is part of alpaka.
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
  */
 
-#include <alpaka/vec/Vec.hpp>
-
-#include <alpaka/test/dim/TestDims.hpp>
 #include <alpaka/meta/ForEachType.hpp>
+#include <alpaka/test/dim/TestDims.hpp>
+#include <alpaka/vec/Vec.hpp>
 
 #include <catch2/catch.hpp>
 
+#include <utility>
+
 //-----------------------------------------------------------------------------
 TEST_CASE("basicVecTraits", "[vec]")
 {
-    using Dim = alpaka::dim::DimInt<3u>;
+    using Dim = alpaka::DimInt<3u>;
     using Idx = std::size_t;
-    using Vec = alpaka::vec::Vec<Dim, Idx>;
-
-    Vec const vec(
-        static_cast<Idx>(0u),
-        static_cast<Idx>(8u),
-        static_cast<Idx>(15u));
+    using Vec = alpaka::Vec<Dim, Idx>;
 
+    Vec const vec(static_cast<Idx>(0u), static_cast<Idx>(8u), static_cast<Idx>(15u));
 
 
     //-----------------------------------------------------------------------------
-    // alpaka::vec::Vec zero elements
+    // alpaka::Vec zero elements
     {
-        using Dim0 = alpaka::dim::DimInt<0u>;
-        alpaka::vec::Vec<Dim0, Idx> const vec0{};
+        using Dim0 = alpaka::DimInt<0u>;
+        alpaka::Vec<Dim0, Idx> const vec0{};
     }
 
     //-----------------------------------------------------------------------------
-    // alpaka::vec::subVecFromIndices
+    // alpaka::subVecFromIndices
     {
-        using IdxSequence =
-            alpaka::meta::IntegerSequence<
-                std::size_t,
-                0u,
-                Dim::value -1u,
-                0u>;
-        auto const vecSubIndices(
-            alpaka::vec::subVecFromIndices<
-                IdxSequence>(
-                    vec));
+        using IdxSequence = std::integer_sequence<std::size_t, 0u, Dim::value - 1u, 0u>;
+        auto const vecSubIndices(alpaka::subVecFromIndices<IdxSequence>(vec));
 
         REQUIRE(vecSubIndices[0u] == vec[0u]);
-        REQUIRE(vecSubIndices[1u] == vec[Dim::value -1u]);
+        REQUIRE(vecSubIndices[1u] == vec[Dim::value - 1u]);
         REQUIRE(vecSubIndices[2u] == vec[0u]);
     }
 
     //-----------------------------------------------------------------------------
-    // alpaka::vec::subVecBegin
+    // alpaka::subVecBegin
     {
-        using DimSubVecEnd =
-            alpaka::dim::DimInt<2u>;
-        auto const vecSubBegin(
-            alpaka::vec::subVecBegin<
-                DimSubVecEnd>(
-                    vec));
+        using DimSubVecEnd = alpaka::DimInt<2u>;
+        auto const vecSubBegin(alpaka::subVecBegin<DimSubVecEnd>(vec));
 
         for(typename Dim::value_type i(0); i < DimSubVecEnd::value; ++i)
         {
@@ -71,14 +56,10 @@ TEST_CASE("basicVecTraits", "[vec]")
     }
 
     //-----------------------------------------------------------------------------
-    // alpaka::vec::subVecEnd
+    // alpaka::subVecEnd
     {
-        using DimSubVecEnd =
-            alpaka::dim::DimInt<2u>;
-        auto const vecSubEnd(
-            alpaka::vec::subVecEnd<
-                DimSubVecEnd>(
-                    vec));
+        using DimSubVecEnd = alpaka::DimInt<2u>;
+        auto const vecSubEnd(alpaka::subVecEnd<DimSubVecEnd>(vec));
 
         for(typename Dim::value_type i(0); i < DimSubVecEnd::value; ++i)
         {
@@ -87,19 +68,16 @@ TEST_CASE("basicVecTraits", "[vec]")
     }
 
     //-----------------------------------------------------------------------------
-    // alpaka::vec::cast
+    // alpaka::castVec
     {
         using SizeCast = std::uint16_t;
-        auto const vecCast(
-            alpaka::vec::cast<
-                SizeCast>(
-                    vec));
+        auto const vecCast(alpaka::castVec<SizeCast>(vec));
 
         /*using VecCastConst = decltype(vecCast);
-        using VecCast = typename std::decay<VecCastConst>::type;
+        using VecCast = std::decay_t<VecCastConst>;
         static_assert(
             std::is_same<
-                alpaka::idx::Idx<VecCast>,
+                alpaka::Idx<VecCast>,
                 SizeCast
             >::value,
             "The idx type of the casted vec is wrong");*/
@@ -111,11 +89,9 @@ TEST_CASE("basicVecTraits", "[vec]")
     }
 
     //-----------------------------------------------------------------------------
-    // alpaka::vec::reverse
+    // alpaka::reverseVec
     {
-        auto const vecReverse(
-            alpaka::vec::reverse(
-                vec));
+        auto const vecReverse(alpaka::reverseVec(vec));
 
         for(typename Dim::value_type i(0); i < Dim::value; ++i)
         {
@@ -124,20 +100,15 @@ TEST_CASE("basicVecTraits", "[vec]")
     }
 
     //-----------------------------------------------------------------------------
-    // alpaka::vec::concat
+    // alpaka::concatVec
     {
-        using Dim2 = alpaka::dim::DimInt<2u>;
-        alpaka::vec::Vec<Dim2, Idx> const vec2(
-            static_cast<Idx>(47u),
-            static_cast<Idx>(11u));
+        using Dim2 = alpaka::DimInt<2u>;
+        alpaka::Vec<Dim2, Idx> const vec2(static_cast<Idx>(47u), static_cast<Idx>(11u));
 
-        auto const vecConcat(
-            alpaka::vec::concat(
-                vec,
-                vec2));
+        auto const vecConcat(alpaka::concatVec(vec, vec2));
 
         static_assert(
-            std::is_same<alpaka::dim::Dim<std::decay<decltype(vecConcat)>::type>, alpaka::dim::DimInt<5u>>::value,
+            std::is_same<alpaka::Dim<std::decay<decltype(vecConcat)>::type>, alpaka::DimInt<5u>>::value,
             "Result dimension type of concatenation incorrect!");
 
         for(typename Dim::value_type i(0); i < Dim::value; ++i)
@@ -151,25 +122,22 @@ TEST_CASE("basicVecTraits", "[vec]")
     }
 
     {
-        alpaka::vec::Vec<Dim, Idx> const vec3(
-            static_cast<Idx>(47u),
-            static_cast<Idx>(8u),
-            static_cast<Idx>(3u));
+        alpaka::Vec<Dim, Idx> const vec3(static_cast<Idx>(47u), static_cast<Idx>(8u), static_cast<Idx>(3u));
 
         //-----------------------------------------------------------------------------
-        // alpaka::vec::Vec operator +
+        // alpaka::Vec operator +
         {
             auto const vecLessEqual(vec + vec3);
 
             static_assert(
-                std::is_same<alpaka::dim::Dim<std::decay<decltype(vecLessEqual)>::type>, Dim>::value,
+                std::is_same<alpaka::Dim<std::decay<decltype(vecLessEqual)>::type>, Dim>::value,
                 "Result dimension type of operator <= incorrect!");
 
             static_assert(
-                std::is_same<alpaka::idx::Idx<std::decay<decltype(vecLessEqual)>::type>, Idx>::value,
+                std::is_same<alpaka::Idx<std::decay<decltype(vecLessEqual)>::type>, Idx>::value,
                 "Result idx type of operator <= incorrect!");
 
-            alpaka::vec::Vec<Dim, Idx> const referenceVec(
+            alpaka::Vec<Dim, Idx> const referenceVec(
                 static_cast<Idx>(47u),
                 static_cast<Idx>(16u),
                 static_cast<Idx>(18u));
@@ -178,19 +146,19 @@ TEST_CASE("basicVecTraits", "[vec]")
         }
 
         //-----------------------------------------------------------------------------
-        // alpaka::vec::Vec operator -
+        // alpaka::Vec operator -
         {
             auto const vecLessEqual(vec - vec3);
 
             static_assert(
-                std::is_same<alpaka::dim::Dim<std::decay<decltype(vecLessEqual)>::type>, Dim>::value,
+                std::is_same<alpaka::Dim<std::decay<decltype(vecLessEqual)>::type>, Dim>::value,
                 "Result dimension type of operator <= incorrect!");
 
             static_assert(
-                std::is_same<alpaka::idx::Idx<std::decay<decltype(vecLessEqual)>::type>, Idx>::value,
+                std::is_same<alpaka::Idx<std::decay<decltype(vecLessEqual)>::type>, Idx>::value,
                 "Result idx type of operator <= incorrect!");
 
-            alpaka::vec::Vec<Dim, Idx> const referenceVec(
+            alpaka::Vec<Dim, Idx> const referenceVec(
                 static_cast<Idx>(-47),
                 static_cast<Idx>(0u),
                 static_cast<Idx>(12u));
@@ -199,19 +167,19 @@ TEST_CASE("basicVecTraits", "[vec]")
         }
 
         //-----------------------------------------------------------------------------
-        // alpaka::vec::Vec operator *
+        // alpaka::Vec operator *
         {
             auto const vecLessEqual(vec * vec3);
 
             static_assert(
-                std::is_same<alpaka::dim::Dim<std::decay<decltype(vecLessEqual)>::type>, Dim>::value,
+                std::is_same<alpaka::Dim<std::decay<decltype(vecLessEqual)>::type>, Dim>::value,
                 "Result dimension type of operator <= incorrect!");
 
             static_assert(
-                std::is_same<alpaka::idx::Idx<std::decay<decltype(vecLessEqual)>::type>, Idx>::value,
+                std::is_same<alpaka::Idx<std::decay<decltype(vecLessEqual)>::type>, Idx>::value,
                 "Result idx type of operator <= incorrect!");
 
-            alpaka::vec::Vec<Dim, Idx> const referenceVec(
+            alpaka::Vec<Dim, Idx> const referenceVec(
                 static_cast<Idx>(0u),
                 static_cast<Idx>(64u),
                 static_cast<Idx>(45u));
@@ -220,85 +188,73 @@ TEST_CASE("basicVecTraits", "[vec]")
         }
 
         //-----------------------------------------------------------------------------
-        // alpaka::vec::Vec operator <
+        // alpaka::Vec operator <
         {
             auto const vecLessEqual(vec < vec3);
 
             static_assert(
-                std::is_same<alpaka::dim::Dim<std::decay<decltype(vecLessEqual)>::type>, Dim>::value,
+                std::is_same<alpaka::Dim<std::decay<decltype(vecLessEqual)>::type>, Dim>::value,
                 "Result dimension type of operator <= incorrect!");
 
             static_assert(
-                std::is_same<alpaka::idx::Idx<std::decay<decltype(vecLessEqual)>::type>, bool>::value,
+                std::is_same<alpaka::Idx<std::decay<decltype(vecLessEqual)>::type>, bool>::value,
                 "Result idx type of operator <= incorrect!");
 
-            alpaka::vec::Vec<Dim, bool> const referenceVec(
-                true,
-                false,
-                false);
+            alpaka::Vec<Dim, bool> const referenceVec(true, false, false);
 
             REQUIRE(referenceVec == vecLessEqual);
         }
 
         //-----------------------------------------------------------------------------
-        // alpaka::vec::Vec operator <=
+        // alpaka::Vec operator <=
         {
             auto const vecLessEqual(vec <= vec3);
 
             static_assert(
-                std::is_same<alpaka::dim::Dim<std::decay<decltype(vecLessEqual)>::type>, Dim>::value,
+                std::is_same<alpaka::Dim<std::decay<decltype(vecLessEqual)>::type>, Dim>::value,
                 "Result dimension type of operator <= incorrect!");
 
             static_assert(
-                std::is_same<alpaka::idx::Idx<std::decay<decltype(vecLessEqual)>::type>, bool>::value,
+                std::is_same<alpaka::Idx<std::decay<decltype(vecLessEqual)>::type>, bool>::value,
                 "Result idx type of operator <= incorrect!");
 
-            alpaka::vec::Vec<Dim, bool> const referenceVec(
-                true,
-                true,
-                false);
+            alpaka::Vec<Dim, bool> const referenceVec(true, true, false);
 
             REQUIRE(referenceVec == vecLessEqual);
         }
 
         //-----------------------------------------------------------------------------
-        // alpaka::vec::Vec operator >=
+        // alpaka::Vec operator >=
         {
             auto const vecLessEqual(vec >= vec3);
 
             static_assert(
-                std::is_same<alpaka::dim::Dim<std::decay<decltype(vecLessEqual)>::type>, Dim>::value,
+                std::is_same<alpaka::Dim<std::decay<decltype(vecLessEqual)>::type>, Dim>::value,
                 "Result dimension type of operator <= incorrect!");
 
             static_assert(
-                std::is_same<alpaka::idx::Idx<std::decay<decltype(vecLessEqual)>::type>, bool>::value,
+                std::is_same<alpaka::Idx<std::decay<decltype(vecLessEqual)>::type>, bool>::value,
                 "Result idx type of operator <= incorrect!");
 
-            alpaka::vec::Vec<Dim, bool> const referenceVec(
-                false,
-                true,
-                true);
+            alpaka::Vec<Dim, bool> const referenceVec(false, true, true);
 
             REQUIRE(referenceVec == vecLessEqual);
         }
 
         //-----------------------------------------------------------------------------
-        // alpaka::vec::Vec operator >
+        // alpaka::Vec operator >
         {
             auto const vecLessEqual(vec > vec3);
 
             static_assert(
-                std::is_same<alpaka::dim::Dim<std::decay<decltype(vecLessEqual)>::type>, Dim>::value,
+                std::is_same<alpaka::Dim<std::decay<decltype(vecLessEqual)>::type>, Dim>::value,
                 "Result dimension type of operator <= incorrect!");
 
             static_assert(
-                std::is_same<alpaka::idx::Idx<std::decay<decltype(vecLessEqual)>::type>, bool>::value,
+                std::is_same<alpaka::Idx<std::decay<decltype(vecLessEqual)>::type>, bool>::value,
                 "Result idx type of operator <= incorrect!");
 
-            alpaka::vec::Vec<Dim, bool> const referenceVec(
-                false,
-                false,
-                true);
+            alpaka::Vec<Dim, bool> const referenceVec(false, false, true);
 
             REQUIRE(referenceVec == vecLessEqual);
         }
@@ -306,20 +262,13 @@ TEST_CASE("basicVecTraits", "[vec]")
 }
 
 //#############################################################################
-template<
-    typename TDim,
-    typename TIdx>
+template<typename TDim, typename TIdx>
 struct NonAlpakaVec
 {
     //-----------------------------------------------------------------------------
-    operator ::alpaka::vec::Vec<
-        TDim,
-        TIdx>() const
+    operator ::alpaka::Vec<TDim, TIdx>() const
     {
-        using AlpakaVector = ::alpaka::vec::Vec<
-            TDim,
-            TIdx
-        >;
+        using AlpakaVector = ::alpaka::Vec<TDim, TIdx>;
         AlpakaVector result(AlpakaVector::zeros());
 
         for(TIdx d(0); d < TDim::value; ++d)
@@ -330,21 +279,20 @@ struct NonAlpakaVec
         return result;
     }
     //-----------------------------------------------------------------------------
-    auto operator [](TIdx /*idx*/) const
-    -> TIdx
+    auto operator[](TIdx /*idx*/) const -> TIdx
     {
         return static_cast<TIdx>(0);
     }
 };
 
 //-----------------------------------------------------------------------------
-TEMPLATE_LIST_TEST_CASE( "vecNDConstructionFromNonAlpakaVec", "[vec]", alpaka::test::dim::TestDims)
+TEMPLATE_LIST_TEST_CASE("vecNDConstructionFromNonAlpakaVec", "[vec]", alpaka::test::TestDims)
 {
     using Dim = TestType;
     using Idx = std::size_t;
 
     NonAlpakaVec<Dim, Idx> nonAlpakaVec;
-    auto const alpakaVec(static_cast<alpaka::vec::Vec<Dim, Idx>>(nonAlpakaVec));
+    auto const alpakaVec(static_cast<alpaka::Vec<Dim, Idx>>(nonAlpakaVec));
 
     for(Idx d(0); d < Dim::value; ++d)
     {
diff --git a/thirdParty/cupla/alpaka/test/unit/warp/CMakeLists.txt b/thirdParty/cupla/alpaka/test/unit/warp/CMakeLists.txt
new file mode 100644
index 0000000000..892a36ec05
--- /dev/null
+++ b/thirdParty/cupla/alpaka/test/unit/warp/CMakeLists.txt
@@ -0,0 +1,25 @@
+#
+# Copyright 2014-2020 Benjamin Worpitz, Axel Huebl, Jan Stephan
+#
+# This file is part of alpaka.
+#
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+#
+
+# Name used for both the executable target and the registered test.
+set(_TARGET_NAME "warp")
+
+# Fills _FILES_SOURCE through the project helper (arguments: "src/", "src/", "cpp").
+append_recursive_files_add_to_src_group("src/" "src/" "cpp" _FILES_SOURCE)
+
+# Create the test executable and link it against the `common` target.
+alpaka_add_executable(${_TARGET_NAME} ${_FILES_SOURCE})
+target_link_libraries(${_TARGET_NAME} PRIVATE common)
+
+# Set the FOLDER property and add the TEST_UNIT_WARP compile definition.
+set_target_properties(${_TARGET_NAME} PROPERTIES FOLDER "test/unit")
+target_compile_definitions(${_TARGET_NAME} PRIVATE "-DTEST_UNIT_WARP")
+
+add_test(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME} ${_ALPAKA_TEST_OPTIONS})
diff --git a/thirdParty/cupla/alpaka/test/unit/warp/src/Activemask.cpp b/thirdParty/cupla/alpaka/test/unit/warp/src/Activemask.cpp
new file mode 100644
index 0000000000..4c0d0c3e07
--- /dev/null
+++ b/thirdParty/cupla/alpaka/test/unit/warp/src/Activemask.cpp
@@ -0,0 +1,101 @@
+/* Copyright 2020 Sergei Bastrakov
+ *
+ * This file is part of alpaka.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#include <alpaka/test/KernelExecutionFixture.hpp>
+#include <alpaka/test/acc/TestAccs.hpp>
+#include <alpaka/test/queue/Queue.hpp>
+#include <alpaka/warp/Traits.hpp>
+
+#include <catch2/catch.hpp>
+
+#include <cstdint>
+
+//#############################################################################
+class ActivemaskSingleThreadWarpTestKernel
+{
+public:
+    // Kernel for warps of size 1: verifies the warp size and that activemask() yields exactly 1.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename TAcc>
+    ALPAKA_FN_ACC auto operator()(TAcc const& acc, bool* success) const -> void
+    {
+        std::int32_t const warpExtent = alpaka::warp::getSize(acc);
+        ALPAKA_CHECK(*success, warpExtent == 1);
+
+        ALPAKA_CHECK(*success, alpaka::warp::activemask(acc) == 1u); // only the lowest mask bit is set
+    }
+};
+
+//#############################################################################
+class ActivemaskMultipleThreadWarpTestKernel
+{
+public:
+    // Kernel for multi-thread warps: thread inactiveThreadIdx exits early, the others verify activemask().
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename TAcc>
+    ALPAKA_FN_ACC auto operator()(TAcc const& acc, bool* success, std::uint64_t inactiveThreadIdx) const -> void
+    {
+        std::int32_t const warpExtent = alpaka::warp::getSize(acc);
+        ALPAKA_CHECK(*success, warpExtent > 1);
+
+        // Test relies on having a single warp per thread block
+        auto const blockExtent = alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc);
+        ALPAKA_CHECK(*success, static_cast<std::int32_t>(blockExtent.prod()) == warpExtent);
+        auto const localThreadIdx = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc);
+        auto const threadIdxInWarp = static_cast<std::uint64_t>(alpaka::mapIdx<1u>(localThreadIdx, blockExtent)[0]);
+        if(threadIdxInWarp == inactiveThreadIdx)
+            return;
+
+        auto const actual = alpaka::warp::activemask(acc);
+        using Result = decltype(actual);
+        // A shift by warpExtent is undefined if Result is exactly warpExtent bits wide; shift by one less, then widen.
+        Result const allActive = (((Result{1} << static_cast<Result>(warpExtent - 1)) - 1) << 1) | Result{1};
+        Result const expected = allActive & ~(Result{1} << inactiveThreadIdx);
+        ALPAKA_CHECK(*success, actual == expected);
+    }
+};
+
+//-----------------------------------------------------------------------------
+TEMPLATE_LIST_TEST_CASE("activemask", "[warp]", alpaka::test::TestAccs)
+{
+    using Acc = TestType;
+    using Dev = alpaka::Dev<Acc>;
+    using Pltf = alpaka::Pltf<Dev>;
+    using Dim = alpaka::Dim<Acc>;
+    using Idx = alpaka::Idx<Acc>;
+    using ExecutionFixture = alpaka::test::KernelExecutionFixture<Acc>;
+
+    // The warp size reported for device 0 decides which of the two kernels is run.
+    Dev const device(alpaka::getDevByIdx<Pltf>(0u));
+    auto const warpSize = alpaka::getWarpSize(device);
+    if(warpSize == 1)
+    {
+        Idx const extentPerDim = 4;
+        ExecutionFixture singleFixture(alpaka::Vec<Dim, Idx>::all(extentPerDim));
+        ActivemaskSingleThreadWarpTestKernel singleKernel;
+        REQUIRE(singleFixture(singleKernel));
+        return;
+    }
+
+    // gcc 7.5 tries and fails to offload for OpenMP 4.0, so the multi-thread case is skipped there.
+#if BOOST_COMP_GNUC && (BOOST_COMP_GNUC == BOOST_VERSION_NUMBER(7, 5, 0)) && defined ALPAKA_ACC_ANY_BT_OMP5_ENABLED
+    return;
+#else
+    // One warp per thread block: the first block dimension is set to the warp size, the others stay 1.
+    auto threadsPerBlock = alpaka::Vec<Dim, Idx>::ones();
+    threadsPerBlock[0] = static_cast<Idx>(warpSize);
+    auto const blocksPerGrid = alpaka::Vec<Dim, Idx>::all(2);
+    auto const elemsPerThread = alpaka::Vec<Dim, Idx>::ones();
+    auto workDiv = typename ExecutionFixture::WorkDiv{blocksPerGrid, threadsPerBlock, elemsPerThread};
+    auto multiFixture = ExecutionFixture{workDiv};
+    ActivemaskMultipleThreadWarpTestKernel multiKernel;
+    for(auto inactiveThreadIdx = 0u; inactiveThreadIdx < warpSize; inactiveThreadIdx++)
+        REQUIRE(multiFixture(multiKernel, inactiveThreadIdx));
+#endif
+}
diff --git a/thirdParty/cupla/alpaka/test/unit/warp/src/All.cpp b/thirdParty/cupla/alpaka/test/unit/warp/src/All.cpp
new file mode 100644
index 0000000000..3ee4d74a1c
--- /dev/null
+++ b/thirdParty/cupla/alpaka/test/unit/warp/src/All.cpp
@@ -0,0 +1,107 @@
+/* Copyright 2020 Sergei Bastrakov
+ *
+ * This file is part of alpaka.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#include <alpaka/test/KernelExecutionFixture.hpp>
+#include <alpaka/test/acc/TestAccs.hpp>
+#include <alpaka/test/queue/Queue.hpp>
+#include <alpaka/warp/Traits.hpp>
+
+#include <catch2/catch.hpp>
+
+#include <cstdint>
+
+//#############################################################################
+class AllSingleThreadWarpTestKernel
+{
+public:
+    // Kernel for warps of size 1: warp::all() must be non-zero for a non-zero predicate and 0 for a zero one.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename TAcc>
+    ALPAKA_FN_ACC auto operator()(TAcc const& acc, bool* success) const -> void
+    {
+        std::int32_t const warpExtent = alpaka::warp::getSize(acc);
+        ALPAKA_CHECK(*success, warpExtent == 1);
+
+        ALPAKA_CHECK(*success, alpaka::warp::all(acc, 42) != 0);
+        ALPAKA_CHECK(*success, alpaka::warp::all(acc, 0) == 0);
+    }
+};
+
+//#############################################################################
+class AllMultipleThreadWarpTestKernel
+{
+public:
+    // Kernel for warps with more than one thread: checks warp::all() before and after some threads exit.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename TAcc>
+    ALPAKA_FN_ACC auto operator()(TAcc const& acc, bool* success) const -> void
+    {
+        std::int32_t const warpExtent = alpaka::warp::getSize(acc);
+        ALPAKA_CHECK(*success, warpExtent > 1);
+
+        ALPAKA_CHECK(*success, alpaka::warp::all(acc, 0) == 0);
+        ALPAKA_CHECK(*success, alpaka::warp::all(acc, 42) != 0);
+
+        // Test relies on having a single warp per thread block
+        auto const blockExtent = alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc);
+        ALPAKA_CHECK(*success, static_cast<std::int32_t>(blockExtent.prod()) == warpExtent);
+        auto const localThreadIdx = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc);
+        auto const threadIdxInWarp = static_cast<std::int32_t>(alpaka::mapIdx<1u>(localThreadIdx, blockExtent)[0]);
+
+        // Threads whose index is not a multiple of 3 quit the kernel to test that the
+        // warp operations properly operate on the active threads only
+        if(threadIdxInWarp % 3)
+            return;
+
+        for(auto idx = 0; idx < warpExtent; idx++)
+        {
+            ALPAKA_CHECK(*success, alpaka::warp::all(acc, threadIdxInWarp == idx ? 1 : 0) == 0);
+            std::int32_t const expected = idx % 3 ? 1 : 0; // 1 only if thread idx has already quit
+            ALPAKA_CHECK(*success, alpaka::warp::all(acc, threadIdxInWarp == idx ? 0 : 1) == expected);
+        }
+    }
+};
+
+//-----------------------------------------------------------------------------
+TEMPLATE_LIST_TEST_CASE("all", "[warp]", alpaka::test::TestAccs)
+{
+    using Acc = TestType;
+    using Dev = alpaka::Dev<Acc>;
+    using Pltf = alpaka::Pltf<Dev>;
+    using Dim = alpaka::Dim<Acc>;
+    using Idx = alpaka::Idx<Acc>;
+    using ExecutionFixture = alpaka::test::KernelExecutionFixture<Acc>;
+
+    // The warp size reported for device 0 decides which of the two kernels is run.
+    Dev const device(alpaka::getDevByIdx<Pltf>(0u));
+    auto const warpSize = alpaka::getWarpSize(device);
+    if(warpSize == 1)
+    {
+        Idx const extentPerDim = 4;
+        ExecutionFixture singleFixture(alpaka::Vec<Dim, Idx>::all(extentPerDim));
+        AllSingleThreadWarpTestKernel singleKernel;
+        REQUIRE(singleFixture(singleKernel));
+        return;
+    }
+
+    // gcc 7.5 tries and fails to offload for OpenMP 4.0, so the multi-thread case is skipped there.
+#if BOOST_COMP_GNUC && (BOOST_COMP_GNUC == BOOST_VERSION_NUMBER(7, 5, 0)) && defined ALPAKA_ACC_ANY_BT_OMP5_ENABLED
+    return;
+#else
+    // One warp per thread block: the first block dimension is set to the warp size, the others stay 1.
+    auto threadsPerBlock = alpaka::Vec<Dim, Idx>::ones();
+    threadsPerBlock[0] = static_cast<Idx>(warpSize);
+    auto const blocksPerGrid = alpaka::Vec<Dim, Idx>::all(2);
+    auto const elemsPerThread = alpaka::Vec<Dim, Idx>::ones();
+    auto workDiv = typename ExecutionFixture::WorkDiv{blocksPerGrid, threadsPerBlock, elemsPerThread};
+    auto multiFixture = ExecutionFixture{workDiv};
+    AllMultipleThreadWarpTestKernel multiKernel;
+    REQUIRE(multiFixture(multiKernel));
+#endif
+}
diff --git a/thirdParty/cupla/alpaka/test/unit/warp/src/Any.cpp b/thirdParty/cupla/alpaka/test/unit/warp/src/Any.cpp
new file mode 100644
index 0000000000..fb52b0834c
--- /dev/null
+++ b/thirdParty/cupla/alpaka/test/unit/warp/src/Any.cpp
@@ -0,0 +1,107 @@
+/* Copyright 2020 Sergei Bastrakov
+ *
+ * This file is part of alpaka.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#include <alpaka/test/KernelExecutionFixture.hpp>
+#include <alpaka/test/acc/TestAccs.hpp>
+#include <alpaka/test/queue/Queue.hpp>
+#include <alpaka/warp/Traits.hpp>
+
+#include <catch2/catch.hpp>
+
+#include <cstdint>
+
+//#############################################################################
+class AnySingleThreadWarpTestKernel
+{
+public:
+    // Kernel for warps of size 1: warp::any() must be non-zero for a non-zero predicate and 0 for a zero one.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename TAcc>
+    ALPAKA_FN_ACC auto operator()(TAcc const& acc, bool* success) const -> void
+    {
+        std::int32_t const warpExtent = alpaka::warp::getSize(acc);
+        ALPAKA_CHECK(*success, warpExtent == 1);
+
+        ALPAKA_CHECK(*success, alpaka::warp::any(acc, 42) != 0);
+        ALPAKA_CHECK(*success, alpaka::warp::any(acc, 0) == 0);
+    }
+};
+
+//#############################################################################
+class AnyMultipleThreadWarpTestKernel
+{
+public:
+    // Kernel for warps with more than one thread: checks warp::any() before and after some threads exit.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename TAcc>
+    ALPAKA_FN_ACC auto operator()(TAcc const& acc, bool* success) const -> void
+    {
+        std::int32_t const warpExtent = alpaka::warp::getSize(acc);
+        ALPAKA_CHECK(*success, warpExtent > 1);
+
+        ALPAKA_CHECK(*success, alpaka::warp::any(acc, 0) == 0);
+        ALPAKA_CHECK(*success, alpaka::warp::any(acc, 42) != 0);
+
+        // Test relies on having a single warp per thread block
+        auto const blockExtent = alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc);
+        ALPAKA_CHECK(*success, static_cast<std::int32_t>(blockExtent.prod()) == warpExtent);
+        auto const localThreadIdx = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc);
+        auto const threadIdxInWarp = static_cast<std::int32_t>(alpaka::mapIdx<1u>(localThreadIdx, blockExtent)[0]);
+
+        // Threads whose index is not a multiple of 5 quit the kernel to test that the
+        // warp operations properly operate on the active threads only
+        if(threadIdxInWarp % 5)
+            return;
+
+        for(auto idx = 0; idx < warpExtent; idx++)
+        {
+            ALPAKA_CHECK(*success, alpaka::warp::any(acc, threadIdxInWarp == idx ? 0 : 1) == 1);
+            std::int32_t const expected = idx % 5 ? 0 : 1; // 1 only if thread idx is still active
+            ALPAKA_CHECK(*success, alpaka::warp::any(acc, threadIdxInWarp == idx ? 1 : 0) == expected);
+        }
+    }
+};
+
+//-----------------------------------------------------------------------------
+TEMPLATE_LIST_TEST_CASE("any", "[warp]", alpaka::test::TestAccs)
+{
+    using Acc = TestType;
+    using Dev = alpaka::Dev<Acc>;
+    using Pltf = alpaka::Pltf<Dev>;
+    using Dim = alpaka::Dim<Acc>;
+    using Idx = alpaka::Idx<Acc>;
+    using ExecutionFixture = alpaka::test::KernelExecutionFixture<Acc>;
+
+    // The warp size reported for device 0 decides which of the two kernels is run.
+    Dev const device(alpaka::getDevByIdx<Pltf>(0u));
+    auto const warpSize = alpaka::getWarpSize(device);
+    if(warpSize == 1)
+    {
+        Idx const extentPerDim = 4;
+        ExecutionFixture singleFixture(alpaka::Vec<Dim, Idx>::all(extentPerDim));
+        AnySingleThreadWarpTestKernel singleKernel;
+        REQUIRE(singleFixture(singleKernel));
+        return;
+    }
+
+    // gcc 7.5 tries and fails to offload for OpenMP 4.0, so the multi-thread case is skipped there.
+#if BOOST_COMP_GNUC && (BOOST_COMP_GNUC == BOOST_VERSION_NUMBER(7, 5, 0)) && defined ALPAKA_ACC_ANY_BT_OMP5_ENABLED
+    return;
+#else
+    // One warp per thread block: the first block dimension is set to the warp size, the others stay 1.
+    auto threadsPerBlock = alpaka::Vec<Dim, Idx>::ones();
+    threadsPerBlock[0] = static_cast<Idx>(warpSize);
+    auto const blocksPerGrid = alpaka::Vec<Dim, Idx>::all(2);
+    auto const elemsPerThread = alpaka::Vec<Dim, Idx>::ones();
+    auto workDiv = typename ExecutionFixture::WorkDiv{blocksPerGrid, threadsPerBlock, elemsPerThread};
+    auto multiFixture = ExecutionFixture{workDiv};
+    AnyMultipleThreadWarpTestKernel multiKernel;
+    REQUIRE(multiFixture(multiKernel));
+#endif
+}
diff --git a/thirdParty/cupla/alpaka/test/unit/warp/src/Ballot.cpp b/thirdParty/cupla/alpaka/test/unit/warp/src/Ballot.cpp
new file mode 100644
index 0000000000..86612c3596
--- /dev/null
+++ b/thirdParty/cupla/alpaka/test/unit/warp/src/Ballot.cpp
@@ -0,0 +1,110 @@
+/* Copyright 2020 Sergei Bastrakov
+ *
+ * This file is part of Alpaka.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#include <alpaka/test/KernelExecutionFixture.hpp>
+#include <alpaka/test/acc/TestAccs.hpp>
+#include <alpaka/test/queue/Queue.hpp>
+#include <alpaka/warp/Traits.hpp>
+
+#include <catch2/catch.hpp>
+
+#include <cstdint>
+
+//#############################################################################
+class BallotSingleThreadWarpTestKernel // warp::ballot checks for accelerators whose warp is a single thread
+{
+public:
+    //-----------------------------------------------------------------------------
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename TAcc>
+    ALPAKA_FN_ACC auto operator()(TAcc const& acc, bool* success) const -> void
+    {
+        std::int32_t const warpExtent = alpaka::warp::getSize(acc);
+        ALPAKA_CHECK(*success, warpExtent == 1);
+        // With one thread per warp, a nonzero predicate must yield exactly bit 0 and a zero predicate no bits
+        ALPAKA_CHECK(*success, alpaka::warp::ballot(acc, 42) == 1u);
+        ALPAKA_CHECK(*success, alpaka::warp::ballot(acc, 0) == 0u);
+    }
+};
+
+//#############################################################################
+class BallotMultipleThreadWarpTestKernel // warp::ballot checks for warps made of more than one thread
+{
+public:
+    //-----------------------------------------------------------------------------
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename TAcc>
+    ALPAKA_FN_ACC auto operator()(TAcc const& acc, bool* success) const -> void
+    {
+        std::int32_t const warpExtent = alpaka::warp::getSize(acc);
+        ALPAKA_CHECK(*success, warpExtent > 1);
+        // Expect the low warpExtent bits; shift all-ones right, since `1 << 64` is undefined for 64-thread warps
+        ALPAKA_CHECK(*success, alpaka::warp::ballot(acc, 42) == (~std::uint64_t{0} >> (64 - warpExtent)));
+        ALPAKA_CHECK(*success, alpaka::warp::ballot(acc, 0) == 0u);
+
+        // Test relies on having a single warp per thread block
+        auto const blockExtent = alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc);
+        ALPAKA_CHECK(*success, static_cast<std::int32_t>(blockExtent.prod()) == warpExtent);
+        auto const localThreadIdx = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc);
+        auto const threadIdxInWarp = static_cast<std::int32_t>(alpaka::mapIdx<1u>(localThreadIdx, blockExtent)[0]);
+
+        // Some threads quit the kernel to test that the warp operations
+        // properly operate on the active threads only
+        if(threadIdxInWarp >= warpExtent / 2)
+            return;
+        // For each remaining thread index idx: first only idx votes true, then every thread except idx does
+        for(auto idx = 0; idx < warpExtent / 2; idx++)
+        {
+            ALPAKA_CHECK(
+                *success,
+                alpaka::warp::ballot(acc, threadIdxInWarp == idx ? 1 : 0) == std::uint64_t{1} << idx);
+            // First warpExtent / 2 bits are 1 except bit idx
+            std::uint64_t const expected = ((std::uint64_t{1} << warpExtent / 2) - 1) & ~(std::uint64_t{1} << idx);
+            ALPAKA_CHECK(*success, alpaka::warp::ballot(acc, threadIdxInWarp == idx ? 0 : 1) == expected);
+        }
+    }
+};
+
+//-----------------------------------------------------------------------------
+TEMPLATE_LIST_TEST_CASE("ballot", "[warp]", alpaka::test::TestAccs)
+{
+    using Acc = TestType;
+    using Dev = alpaka::Dev<Acc>;
+    using Pltf = alpaka::Pltf<Dev>;
+    using Dim = alpaka::Dim<Acc>;
+    using Idx = alpaka::Idx<Acc>;
+
+    Dev const dev(alpaka::getDevByIdx<Pltf>(0u));
+    auto const threadsPerWarp = alpaka::getWarpSize(dev);
+    if(threadsPerWarp == 1)
+    {
+        Idx const threadsPerDim = 4;
+        alpaka::test::KernelExecutionFixture<Acc> fixture(alpaka::Vec<Dim, Idx>::all(threadsPerDim));
+        BallotSingleThreadWarpTestKernel kernel;
+        REQUIRE(fixture(kernel));
+    }
+    else
+    {
+        // gcc 7.5 tries, and fails, to offload for OpenMP 4.0; the multi-thread check is skipped for that setup
+#if BOOST_COMP_GNUC && (BOOST_COMP_GNUC == BOOST_VERSION_NUMBER(7, 5, 0)) && defined ALPAKA_ACC_ANY_BT_OMP5_ENABLED
+        return;
+#else
+        using Fixture = alpaka::test::KernelExecutionFixture<Acc>;
+        auto const elementsPerThread = alpaka::Vec<Dim, Idx>::ones();
+        auto const blocksPerGrid = alpaka::Vec<Dim, Idx>::all(2);
+        // One warp per block: threadsPerWarp threads along dimension 0, a single thread along every other one
+        auto threadsPerBlock = alpaka::Vec<Dim, Idx>::ones();
+        threadsPerBlock[0] = static_cast<Idx>(threadsPerWarp);
+        auto oneWarpBlocks = typename Fixture::WorkDiv{blocksPerGrid, threadsPerBlock, elementsPerThread};
+        auto fixture = Fixture{oneWarpBlocks};
+        BallotMultipleThreadWarpTestKernel kernel;
+        REQUIRE(fixture(kernel));
+#endif
+    }
+}
diff --git a/thirdParty/cupla/alpaka/test/unit/warp/src/GetSize.cpp b/thirdParty/cupla/alpaka/test/unit/warp/src/GetSize.cpp
new file mode 100644
index 0000000000..0154bcca17
--- /dev/null
+++ b/thirdParty/cupla/alpaka/test/unit/warp/src/GetSize.cpp
@@ -0,0 +1,48 @@
+/* Copyright 2020 Sergei Bastrakov
+ *
+ * This file is part of Alpaka.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#include <alpaka/test/KernelExecutionFixture.hpp>
+#include <alpaka/test/acc/TestAccs.hpp>
+#include <alpaka/test/queue/Queue.hpp>
+#include <alpaka/warp/Traits.hpp>
+
+#include <catch2/catch.hpp>
+
+#include <cstdint>
+
+//#############################################################################
+class GetSizeTestKernel // compares the in-kernel warp size against a value supplied by the caller
+{
+public:
+    //-----------------------------------------------------------------------------
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename TAcc>
+    ALPAKA_FN_ACC auto operator()(TAcc const& acc, bool* success, std::int32_t expectedWarpSize) const -> void
+    {
+        std::int32_t const actualWarpSize = alpaka::warp::getSize(acc); // warp size reported for this accelerator
+        ALPAKA_CHECK(*success, actualWarpSize == expectedWarpSize); // must equal the caller-provided expectation
+    }
+};
+
+//-----------------------------------------------------------------------------
+TEMPLATE_LIST_TEST_CASE("getSize", "[warp]", alpaka::test::TestAccs)
+{
+    using Acc = TestType;
+    using Dev = alpaka::Dev<Acc>;
+    using Pltf = alpaka::Pltf<Dev>;
+    using Dim = alpaka::Dim<Acc>;
+    using Idx = alpaka::Idx<Acc>;
+
+    GetSizeTestKernel kernel;
+    Dev const firstDevice(alpaka::getDevByIdx<Pltf>(0u));
+    auto const expectedWarpSize = static_cast<int>(alpaka::getWarpSize(firstDevice)); // host-side reference value
+    Idx const threadsPerDim = 8;
+    alpaka::test::KernelExecutionFixture<Acc> fixture(alpaka::Vec<Dim, Idx>::all(threadsPerDim));
+    REQUIRE(fixture(kernel, expectedWarpSize));
+}
diff --git a/thirdParty/cupla/alpaka/test/unit/workDiv/CMakeLists.txt b/thirdParty/cupla/alpaka/test/unit/workDiv/CMakeLists.txt
new file mode 100644
index 0000000000..766afb38d2
--- /dev/null
+++ b/thirdParty/cupla/alpaka/test/unit/workDiv/CMakeLists.txt
@@ -0,0 +1,24 @@
+#
+# Copyright 2014-2020 Benjamin Worpitz, Axel Huebl, Jan Stephan
+#
+# This file is part of alpaka.
+#
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+#
+
+set(_TARGET_NAME "workDiv")
+# Presumably gathers the "cpp" sources below src/ into _FILES_SOURCE (the helper is defined elsewhere)
+append_recursive_files_add_to_src_group("src/" "src/" "cpp" _FILES_SOURCE)
+# Build the test executable from those sources and link it privately against the "common" target
+alpaka_add_executable(
+    ${_TARGET_NAME}
+    ${_FILES_SOURCE})
+target_link_libraries(
+    ${_TARGET_NAME}
+    PRIVATE common)
+# Set the target's FOLDER property to "test/unit"
+set_target_properties(${_TARGET_NAME} PROPERTIES FOLDER "test/unit")
+# Register the executable as a test named after the target, run with ${_ALPAKA_TEST_OPTIONS} as arguments
+add_test(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME} ${_ALPAKA_TEST_OPTIONS})
diff --git a/thirdParty/cupla/alpaka/test/unit/workDiv/src/WorkDivHelpersTest.cpp b/thirdParty/cupla/alpaka/test/unit/workDiv/src/WorkDivHelpersTest.cpp
new file mode 100644
index 0000000000..e47b6e1c4b
--- /dev/null
+++ b/thirdParty/cupla/alpaka/test/unit/workDiv/src/WorkDivHelpersTest.cpp
@@ -0,0 +1,63 @@
+/* Copyright 2020 Sergei Bastrakov
+ *
+ * This file is part of alpaka.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#include <alpaka/acc/AccDevProps.hpp>
+#include <alpaka/core/Unused.hpp>
+#include <alpaka/test/KernelExecutionFixture.hpp>
+#include <alpaka/test/acc/TestAccs.hpp>
+#include <alpaka/workdiv/WorkDivHelpers.hpp>
+
+#include <catch2/catch.hpp>
+
+//-----------------------------------------------------------------------------
+namespace
+{
+    //! Returns alpaka::getValidWorkDiv's result for device 0: extent 10 per grid dimension, 1 element per thread.
+    template<typename TAcc>
+    auto getWorkDiv()
+    {
+        using Dev = alpaka::Dev<TAcc>;
+        using Pltf = alpaka::Pltf<Dev>;
+        using Dim = alpaka::Dim<TAcc>;
+        using Idx = alpaka::Idx<TAcc>;
+
+        Dev const firstDevice(alpaka::getDevByIdx<Pltf>(0u));
+        auto const threadsPerGrid = alpaka::Vec<Dim, Idx>::all(10);
+        auto const elementsPerThread = alpaka::Vec<Dim, Idx>::ones();
+        return alpaka::getValidWorkDiv<TAcc>(
+            firstDevice,
+            threadsPerGrid,
+            elementsPerThread,
+            false,
+            alpaka::GridBlockExtentSubDivRestrictions::Unrestricted);
+    }
+} // namespace
+
+//-----------------------------------------------------------------------------
+TEMPLATE_LIST_TEST_CASE("getValidWorkDiv", "[workDiv]", alpaka::test::TestAccs)
+{
+    // getWorkDiv() wraps the getValidWorkDiv() call; this test has no assertion and only exercises that call
+    using Acc = TestType;
+    auto computedWorkDiv = getWorkDiv<Acc>();
+    alpaka::ignore_unused(computedWorkDiv);
+}
+
+//-----------------------------------------------------------------------------
+TEMPLATE_LIST_TEST_CASE("isValidWorkDiv", "[workDiv]", alpaka::test::TestAccs)
+{
+    using Acc = TestType;
+    using Dev = alpaka::Dev<Acc>;
+    using Pltf = alpaka::Pltf<Dev>;
+
+    Dev dev(alpaka::getDevByIdx<Pltf>(0u));
+    auto workDiv = getWorkDiv<Acc>(); // produced by getValidWorkDiv(), so both checks below should accept it
+    // Test both overloads: one takes the accelerator's device properties, the other takes the device itself
+    REQUIRE(alpaka::isValidWorkDiv(alpaka::getAccDevProps<Acc>(dev), workDiv));
+    REQUIRE(alpaka::isValidWorkDiv<Acc>(dev, workDiv));
+}
diff --git a/thirdParty/cupla/alpaka/thirdParty/.clang-format b/thirdParty/cupla/alpaka/thirdParty/.clang-format
new file mode 100644
index 0000000000..ef2ae21faf
--- /dev/null
+++ b/thirdParty/cupla/alpaka/thirdParty/.clang-format
@@ -0,0 +1,4 @@
+---
+DisableFormat: true  # do not reformat the sources under this thirdParty directory
+SortIncludes: false  # and do not reorder their #include lines either
+...
diff --git a/thirdParty/cupla/alpaka/thirdParty/catch2/include/catch2/catch.hpp b/thirdParty/cupla/alpaka/thirdParty/catch2/include/catch2/catch.hpp
index b4eccfc148..2a2d77a27f 100644
--- a/thirdParty/cupla/alpaka/thirdParty/catch2/include/catch2/catch.hpp
+++ b/thirdParty/cupla/alpaka/thirdParty/catch2/include/catch2/catch.hpp
@@ -1,9 +1,9 @@
 /*
- *  Catch v2.11.0
- *  Generated: 2019-11-15 15:01:56.628356
+ *  Catch v2.13.3
+ *  Generated: 2020-10-31 18:20:31.045274
  *  ----------------------------------------------------------
  *  This file has been merged from multiple headers. Please don't edit it directly
- *  Copyright (c) 2019 Two Blue Cubes Ltd. All rights reserved.
+ *  Copyright (c) 2020 Two Blue Cubes Ltd. All rights reserved.
  *
  *  Distributed under the Boost Software License, Version 1.0. (See accompanying
  *  file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
@@ -14,8 +14,8 @@
 
 
 #define CATCH_VERSION_MAJOR 2
-#define CATCH_VERSION_MINOR 11
-#define CATCH_VERSION_PATCH 0
+#define CATCH_VERSION_MINOR 13
+#define CATCH_VERSION_PATCH 3
 
 #ifdef __clang__
 #    pragma clang system_header
@@ -132,15 +132,14 @@ namespace Catch {
 
 #endif
 
-#if defined(CATCH_CPP17_OR_GREATER)
-#  define CATCH_INTERNAL_CONFIG_CPP17_UNCAUGHT_EXCEPTIONS
-#endif
-
 // We have to avoid both ICC and Clang, because they try to mask themselves
 // as gcc, and we want only GCC in this block
-#if defined(__GNUC__) && !defined(__clang__) && !defined(__ICC)
+#if defined(__GNUC__) && !defined(__clang__) && !defined(__ICC) && !defined(__CUDACC__)
 #    define CATCH_INTERNAL_START_WARNINGS_SUPPRESSION _Pragma( "GCC diagnostic push" )
 #    define CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION  _Pragma( "GCC diagnostic pop" )
+
+#    define CATCH_INTERNAL_IGNORE_BUT_WARN(...) (void)__builtin_constant_p(__VA_ARGS__)
+
 #endif
 
 #if defined(__clang__)
@@ -148,6 +147,21 @@ namespace Catch {
 #    define CATCH_INTERNAL_START_WARNINGS_SUPPRESSION _Pragma( "clang diagnostic push" )
 #    define CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION  _Pragma( "clang diagnostic pop" )
 
+// As of this writing, IBM XL's implementation of __builtin_constant_p has a bug
+// which results in calls to destructors being emitted for each temporary,
+// without a matching initialization. In practice, this can result in something
+// like `std::string::~string` being called on an uninitialized value.
+//
+// For example, this code will likely segfault under IBM XL:
+// ```
+// REQUIRE(std::string("12") + "34" == "1234")
+// ```
+//
+// Therefore, `CATCH_INTERNAL_IGNORE_BUT_WARN` is not implemented.
+#  if !defined(__ibmxl__) && !defined(__CUDACC__)
+#    define CATCH_INTERNAL_IGNORE_BUT_WARN(...) (void)__builtin_constant_p(__VA_ARGS__) /* NOLINT(cppcoreguidelines-pro-type-vararg, hicpp-vararg) */
+#  endif
+
 #    define CATCH_INTERNAL_SUPPRESS_GLOBALS_WARNINGS \
          _Pragma( "clang diagnostic ignored \"-Wexit-time-destructors\"" ) \
          _Pragma( "clang diagnostic ignored \"-Wglobal-constructors\"")
@@ -226,10 +240,6 @@ namespace Catch {
 #  define CATCH_INTERNAL_START_WARNINGS_SUPPRESSION __pragma( warning(push) )
 #  define CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION  __pragma( warning(pop) )
 
-#  if _MSC_VER >= 1900 // Visual Studio 2015 or newer
-#    define CATCH_INTERNAL_CONFIG_CPP17_UNCAUGHT_EXCEPTIONS
-#  endif
-
 // Universal Windows platform does not support SEH
 // Or console colours (or console at all...)
 #  if defined(WINAPI_FAMILY) && (WINAPI_FAMILY == WINAPI_FAMILY_APP)
@@ -241,9 +251,12 @@ namespace Catch {
 // MSVC traditional preprocessor needs some workaround for __VA_ARGS__
 // _MSVC_TRADITIONAL == 0 means new conformant preprocessor
 // _MSVC_TRADITIONAL == 1 means old traditional non-conformant preprocessor
-#  if !defined(_MSVC_TRADITIONAL) || (defined(_MSVC_TRADITIONAL) && _MSVC_TRADITIONAL)
-#    define CATCH_INTERNAL_CONFIG_TRADITIONAL_MSVC_PREPROCESSOR
-#  endif
+#  if !defined(__clang__) // Handle Clang masquerading for msvc
+#    if !defined(_MSVC_TRADITIONAL) || (defined(_MSVC_TRADITIONAL) && _MSVC_TRADITIONAL)
+#      define CATCH_INTERNAL_CONFIG_TRADITIONAL_MSVC_PREPROCESSOR
+#    endif // MSVC_TRADITIONAL
+#  endif // __clang__
+
 #endif // _MSC_VER
 
 #if defined(_REENTRANT) || defined(_MSC_VER)
@@ -291,7 +304,7 @@ namespace Catch {
     #define CATCH_CONFIG_COLOUR_NONE
 #endif
 
-#if defined(__UCLIBC__)
+#if !defined(_GLIBCXX_USE_C99_MATH_TR1)
 #define CATCH_INTERNAL_CONFIG_GLOBAL_NEXTAFTER
 #endif
 
@@ -309,7 +322,10 @@ namespace Catch {
 
   // Check if byte is available and usable
   #  if __has_include(<cstddef>) && defined(CATCH_CPP17_OR_GREATER)
-  #    define CATCH_INTERNAL_CONFIG_CPP17_BYTE
+  #    include <cstddef>
+  #    if __cpp_lib_byte > 0
+  #      define CATCH_INTERNAL_CONFIG_CPP17_BYTE
+  #    endif
   #  endif // __has_include(<cstddef>) && defined(CATCH_CPP17_OR_GREATER)
 
   // Check if variant is available and usable
@@ -352,10 +368,6 @@ namespace Catch {
 #  define CATCH_CONFIG_CPP17_OPTIONAL
 #endif
 
-#if defined(CATCH_INTERNAL_CONFIG_CPP17_UNCAUGHT_EXCEPTIONS) && !defined(CATCH_CONFIG_NO_CPP17_UNCAUGHT_EXCEPTIONS) && !defined(CATCH_CONFIG_CPP17_UNCAUGHT_EXCEPTIONS)
-#  define CATCH_CONFIG_CPP17_UNCAUGHT_EXCEPTIONS
-#endif
-
 #if defined(CATCH_INTERNAL_CONFIG_CPP17_STRING_VIEW) && !defined(CATCH_CONFIG_NO_CPP17_STRING_VIEW) && !defined(CATCH_CONFIG_CPP17_STRING_VIEW)
 #  define CATCH_CONFIG_CPP17_STRING_VIEW
 #endif
@@ -417,6 +429,12 @@ namespace Catch {
 #   define CATCH_INTERNAL_SUPPRESS_ZERO_VARIADIC_WARNINGS
 #endif
 
+// The goal of this macro is to avoid evaluation of the arguments, but
+// still have the compiler warn on problems inside...
+#if !defined(CATCH_INTERNAL_IGNORE_BUT_WARN)
+#   define CATCH_INTERNAL_IGNORE_BUT_WARN(...)
+#endif
+
 #if defined(__APPLE__) && defined(__apple_build_version__) && (__clang_major__ < 10)
 #   undef CATCH_INTERNAL_SUPPRESS_UNUSED_TEMPLATE_WARNINGS
 #elif defined(__clang__) && (__clang_major__ < 5)
@@ -748,7 +766,7 @@ constexpr auto operator "" _catch_sr( char const* rawChars, std::size_t size ) n
 #define INTERNAL_CATCH_REMOVE_PARENS_4_ARG(_0, _1, _2, _3) INTERNAL_CATCH_REMOVE_PARENS(_0), INTERNAL_CATCH_REMOVE_PARENS_3_ARG(_1, _2, _3)
 #define INTERNAL_CATCH_REMOVE_PARENS_5_ARG(_0, _1, _2, _3, _4) INTERNAL_CATCH_REMOVE_PARENS(_0), INTERNAL_CATCH_REMOVE_PARENS_4_ARG(_1, _2, _3, _4)
 #define INTERNAL_CATCH_REMOVE_PARENS_6_ARG(_0, _1, _2, _3, _4, _5) INTERNAL_CATCH_REMOVE_PARENS(_0), INTERNAL_CATCH_REMOVE_PARENS_5_ARG(_1, _2, _3, _4, _5)
-#define INTERNAL_CATCH_REMOVE_PARENS_7_ARG(_0, _1, _2, _3, _4, _5, _6) INTERNAL_CATCH_REMOVE_PARENS(_0), INTERNAL_CATCH_REMOVE_PARENS_6_ARG(_1, _2, _4, _5, _6)
+#define INTERNAL_CATCH_REMOVE_PARENS_7_ARG(_0, _1, _2, _3, _4, _5, _6) INTERNAL_CATCH_REMOVE_PARENS(_0), INTERNAL_CATCH_REMOVE_PARENS_6_ARG(_1, _2, _3, _4, _5, _6)
 #define INTERNAL_CATCH_REMOVE_PARENS_8_ARG(_0, _1, _2, _3, _4, _5, _6, _7) INTERNAL_CATCH_REMOVE_PARENS(_0), INTERNAL_CATCH_REMOVE_PARENS_7_ARG(_1, _2, _3, _4, _5, _6, _7)
 #define INTERNAL_CATCH_REMOVE_PARENS_9_ARG(_0, _1, _2, _3, _4, _5, _6, _7, _8) INTERNAL_CATCH_REMOVE_PARENS(_0), INTERNAL_CATCH_REMOVE_PARENS_8_ARG(_1, _2, _3, _4, _5, _6, _7, _8)
 #define INTERNAL_CATCH_REMOVE_PARENS_10_ARG(_0, _1, _2, _3, _4, _5, _6, _7, _8, _9) INTERNAL_CATCH_REMOVE_PARENS(_0), INTERNAL_CATCH_REMOVE_PARENS_9_ARG(_1, _2, _3, _4, _5, _6, _7, _8, _9)
@@ -917,13 +935,13 @@ namespace Catch {
 
 #if defined(__cpp_lib_is_invocable) && __cpp_lib_is_invocable >= 201703
     // std::result_of is deprecated in C++17 and removed in C++20. Hence, it is
-    // replaced with std::invoke_result here. Also *_t format is preferred over
-    // typename *::type format.
-    template <typename Func, typename U>
-    using FunctionReturnType = std::remove_reference_t<std::remove_cv_t<std::invoke_result_t<Func, U>>>;
+    // replaced with std::invoke_result here.
+    template <typename Func, typename... U>
+    using FunctionReturnType = std::remove_reference_t<std::remove_cv_t<std::invoke_result_t<Func, U...>>>;
 #else
-    template <typename Func, typename U>
-    using FunctionReturnType = typename std::remove_reference<typename std::remove_cv<typename std::result_of<Func(U)>::type>::type>::type;
+    // Keep ::type here because we still support C++11
+    template <typename Func, typename... U>
+    using FunctionReturnType = typename std::remove_reference<typename std::remove_cv<typename std::result_of<Func(U...)>::type>::type>::type;
 #endif
 
 } // namespace Catch
@@ -1078,7 +1096,7 @@ struct AutoReg : NonCopyable {
                     int index = 0;                                    \
                     constexpr char const* tmpl_types[] = {CATCH_REC_LIST(INTERNAL_CATCH_STRINGIZE_WITHOUT_PARENS, __VA_ARGS__)};\
                     using expander = int[];\
-                    (void)expander{(reg_test(Types{}, Catch::NameAndTags{ Name " - " + std::string(tmpl_types[index]), Tags } ), index++, 0)... };/* NOLINT */ \
+                    (void)expander{(reg_test(Types{}, Catch::NameAndTags{ Name " - " + std::string(tmpl_types[index]), Tags } ), index++)... };/* NOLINT */ \
                 }\
             };\
             static int INTERNAL_CATCH_UNIQUE_NAME( globalRegistrar ) = [](){\
@@ -1124,7 +1142,7 @@ struct AutoReg : NonCopyable {
                     constexpr char const* tmpl_types[] = {CATCH_REC_LIST(INTERNAL_CATCH_STRINGIZE_WITHOUT_PARENS, INTERNAL_CATCH_REMOVE_PARENS(TmplTypes))};\
                     constexpr char const* types_list[] = {CATCH_REC_LIST(INTERNAL_CATCH_STRINGIZE_WITHOUT_PARENS, INTERNAL_CATCH_REMOVE_PARENS(TypesList))};\
                     constexpr auto num_types = sizeof(types_list) / sizeof(types_list[0]);\
-                    (void)expander{(Catch::AutoReg( Catch::makeTestInvoker( &TestFuncName<Types> ), CATCH_INTERNAL_LINEINFO, Catch::StringRef(), Catch::NameAndTags{ Name " - " + std::string(tmpl_types[index / num_types]) + "<" + std::string(types_list[index % num_types]) + ">", Tags } ), index++, 0)... };/* NOLINT */\
+                    (void)expander{(Catch::AutoReg( Catch::makeTestInvoker( &TestFuncName<Types> ), CATCH_INTERNAL_LINEINFO, Catch::StringRef(), Catch::NameAndTags{ Name " - " + std::string(tmpl_types[index / num_types]) + "<" + std::string(types_list[index % num_types]) + ">", Tags } ), index++)... };/* NOLINT */\
                 }                                                     \
             };                                                        \
             static int INTERNAL_CATCH_UNIQUE_NAME( globalRegistrar ) = [](){ \
@@ -1168,7 +1186,7 @@ struct AutoReg : NonCopyable {
             void reg_tests() {                                          \
                 int index = 0;                                    \
                 using expander = int[];                           \
-                (void)expander{(Catch::AutoReg( Catch::makeTestInvoker( &TestFunc<Types> ), CATCH_INTERNAL_LINEINFO, Catch::StringRef(), Catch::NameAndTags{ Name " - " + std::string(INTERNAL_CATCH_STRINGIZE(TmplList)) + " - " + std::to_string(index), Tags } ), index++, 0)... };/* NOLINT */\
+                (void)expander{(Catch::AutoReg( Catch::makeTestInvoker( &TestFunc<Types> ), CATCH_INTERNAL_LINEINFO, Catch::StringRef(), Catch::NameAndTags{ Name " - " + std::string(INTERNAL_CATCH_STRINGIZE(TmplList)) + " - " + std::to_string(index), Tags } ), index++)... };/* NOLINT */\
             }                                                     \
         };\
         static int INTERNAL_CATCH_UNIQUE_NAME( globalRegistrar ) = [](){ \
@@ -1202,7 +1220,7 @@ struct AutoReg : NonCopyable {
                     int index = 0;                                    \
                     constexpr char const* tmpl_types[] = {CATCH_REC_LIST(INTERNAL_CATCH_STRINGIZE_WITHOUT_PARENS, __VA_ARGS__)};\
                     using expander = int[];\
-                    (void)expander{(reg_test(Types{}, #ClassName, Catch::NameAndTags{ Name " - " + std::string(tmpl_types[index]), Tags } ), index++, 0)... };/* NOLINT */ \
+                    (void)expander{(reg_test(Types{}, #ClassName, Catch::NameAndTags{ Name " - " + std::string(tmpl_types[index]), Tags } ), index++)... };/* NOLINT */ \
                 }\
             };\
             static int INTERNAL_CATCH_UNIQUE_NAME( globalRegistrar ) = [](){\
@@ -1251,7 +1269,7 @@ struct AutoReg : NonCopyable {
                     constexpr char const* tmpl_types[] = {CATCH_REC_LIST(INTERNAL_CATCH_STRINGIZE_WITHOUT_PARENS, INTERNAL_CATCH_REMOVE_PARENS(TmplTypes))};\
                     constexpr char const* types_list[] = {CATCH_REC_LIST(INTERNAL_CATCH_STRINGIZE_WITHOUT_PARENS, INTERNAL_CATCH_REMOVE_PARENS(TypesList))};\
                     constexpr auto num_types = sizeof(types_list) / sizeof(types_list[0]);\
-                    (void)expander{(Catch::AutoReg( Catch::makeTestInvoker( &TestName<Types>::test ), CATCH_INTERNAL_LINEINFO, #ClassName, Catch::NameAndTags{ Name " - " + std::string(tmpl_types[index / num_types]) + "<" + std::string(types_list[index % num_types]) + ">", Tags } ), index++, 0)... };/* NOLINT */ \
+                    (void)expander{(Catch::AutoReg( Catch::makeTestInvoker( &TestName<Types>::test ), CATCH_INTERNAL_LINEINFO, #ClassName, Catch::NameAndTags{ Name " - " + std::string(tmpl_types[index / num_types]) + "<" + std::string(types_list[index % num_types]) + ">", Tags } ), index++)... };/* NOLINT */ \
                 }\
             };\
             static int INTERNAL_CATCH_UNIQUE_NAME( globalRegistrar ) = [](){\
@@ -1298,7 +1316,7 @@ struct AutoReg : NonCopyable {
                 void reg_tests(){\
                     int index = 0;\
                     using expander = int[];\
-                    (void)expander{(Catch::AutoReg( Catch::makeTestInvoker( &TestName<Types>::test ), CATCH_INTERNAL_LINEINFO, #ClassName, Catch::NameAndTags{ Name " - " + std::string(INTERNAL_CATCH_STRINGIZE(TmplList)) + " - " + std::to_string(index), Tags } ), index++, 0)... };/* NOLINT */ \
+                    (void)expander{(Catch::AutoReg( Catch::makeTestInvoker( &TestName<Types>::test ), CATCH_INTERNAL_LINEINFO, #ClassName, Catch::NameAndTags{ Name " - " + std::string(INTERNAL_CATCH_STRINGIZE(TmplList)) + " - " + std::to_string(index), Tags } ), index++)... };/* NOLINT */ \
                 }\
             };\
             static int INTERNAL_CATCH_UNIQUE_NAME( globalRegistrar ) = [](){\
@@ -1802,8 +1820,8 @@ namespace Catch {
 #endif
 
     namespace Detail {
-        template<typename InputIterator>
-        std::string rangeToString(InputIterator first, InputIterator last) {
+        template<typename InputIterator, typename Sentinel = InputIterator>
+        std::string rangeToString(InputIterator first, Sentinel last) {
             ReusableStringStream rss;
             rss << "{ ";
             if (first != last) {
@@ -1961,20 +1979,27 @@ namespace Catch {
 #endif // CATCH_CONFIG_ENABLE_VARIANT_STRINGMAKER
 
 namespace Catch {
-    struct not_this_one {}; // Tag type for detecting which begin/ end are being selected
-
-    // Import begin/ end from std here so they are considered alongside the fallback (...) overloads in this namespace
+    // Import begin/ end from std here
     using std::begin;
     using std::end;
 
-    not_this_one begin( ... );
-    not_this_one end( ... );
+    namespace detail {
+        template <typename...>
+        struct void_type {
+            using type = void;
+        };
+
+        template <typename T, typename = void>
+        struct is_range_impl : std::false_type {
+        };
+
+        template <typename T>
+        struct is_range_impl<T, typename void_type<decltype(begin(std::declval<T>()))>::type> : std::true_type {
+        };
+    } // namespace detail
 
     template <typename T>
-    struct is_range {
-        static const bool value =
-            !std::is_same<decltype(begin(std::declval<T>())), not_this_one>::value &&
-            !std::is_same<decltype(end(std::declval<T>())), not_this_one>::value;
+    struct is_range : detail::is_range_impl<T> {
     };
 
 #if defined(_MANAGED) // Managed types are never ranges
@@ -2342,6 +2367,18 @@ namespace Catch {
         auto operator <= ( RhsT const& rhs ) -> BinaryExpr<LhsT, RhsT const&> const {
             return { static_cast<bool>(m_lhs <= rhs), m_lhs, "<=", rhs };
         }
+        template <typename RhsT>
+        auto operator | (RhsT const& rhs) -> BinaryExpr<LhsT, RhsT const&> const {
+            return { static_cast<bool>(m_lhs | rhs), m_lhs, "|", rhs };
+        }
+        template <typename RhsT>
+        auto operator & (RhsT const& rhs) -> BinaryExpr<LhsT, RhsT const&> const {
+            return { static_cast<bool>(m_lhs & rhs), m_lhs, "&", rhs };
+        }
+        template <typename RhsT>
+        auto operator ^ (RhsT const& rhs) -> BinaryExpr<LhsT, RhsT const&> const {
+            return { static_cast<bool>(m_lhs ^ rhs), m_lhs, "^", rhs };
+        }
 
         template<typename RhsT>
         auto operator && ( RhsT const& ) -> BinaryExpr<LhsT, RhsT const&> const {
@@ -2422,7 +2459,7 @@ namespace Catch {
         virtual void sectionEnded( SectionEndInfo const& endInfo ) = 0;
         virtual void sectionEndedEarly( SectionEndInfo const& endInfo ) = 0;
 
-        virtual auto acquireGeneratorTracker( SourceLineInfo const& lineInfo ) -> IGeneratorTracker& = 0;
+        virtual auto acquireGeneratorTracker( StringRef generatorName, SourceLineInfo const& lineInfo ) -> IGeneratorTracker& = 0;
 
 #if defined(CATCH_CONFIG_ENABLE_BENCHMARKING)
         virtual void benchmarkPreparing( std::string const& name ) = 0;
@@ -2660,6 +2697,7 @@ namespace Catch {
 ///////////////////////////////////////////////////////////////////////////////
 #define INTERNAL_CATCH_TEST( macroName, resultDisposition, ... ) \
     do { \
+        CATCH_INTERNAL_IGNORE_BUT_WARN(__VA_ARGS__); \
         Catch::AssertionHandler catchAssertionHandler( macroName##_catch_sr, CATCH_INTERNAL_LINEINFO, CATCH_INTERNAL_STRINGIFY(__VA_ARGS__), resultDisposition ); \
         INTERNAL_CATCH_TRY { \
             CATCH_INTERNAL_START_WARNINGS_SUPPRESSION \
@@ -2668,8 +2706,7 @@ namespace Catch {
             CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION \
         } INTERNAL_CATCH_CATCH( catchAssertionHandler ) \
         INTERNAL_CATCH_REACT( catchAssertionHandler ) \
-    } while( (void)0, (false) && static_cast<bool>( !!(__VA_ARGS__) ) ) // the expression here is never evaluated at runtime but it forces the compiler to give it a look
-    // The double negation silences MSVC's C4800 warning, the static_cast forces short-circuit evaluation if the type has overloaded &&.
+    } while( (void)0, (false) && static_cast<bool>( !!(__VA_ARGS__) ) )
 
 ///////////////////////////////////////////////////////////////////////////////
 #define INTERNAL_CATCH_IF( macroName, resultDisposition, ... ) \
@@ -2986,6 +3023,9 @@ namespace Catch {
             {}
 
             std::string translate( ExceptionTranslators::const_iterator it, ExceptionTranslators::const_iterator itEnd ) const override {
+#if defined(CATCH_CONFIG_DISABLE_EXCEPTIONS)
+                return "";
+#else
                 try {
                     if( it == itEnd )
                         std::rethrow_exception(std::current_exception());
@@ -2995,6 +3035,7 @@ namespace Catch {
                 catch( T& ex ) {
                     return m_translateFunction( ex );
                 }
+#endif
             }
 
         protected:
@@ -3263,9 +3304,10 @@ namespace Matchers {
                 return description;
             }
 
-            MatchAllOf<ArgT>& operator && ( MatcherBase<ArgT> const& other ) {
-                m_matchers.push_back( &other );
-                return *this;
+            MatchAllOf<ArgT> operator && ( MatcherBase<ArgT> const& other ) {
+                auto copy(*this);
+                copy.m_matchers.push_back( &other );
+                return copy;
             }
 
             std::vector<MatcherBase<ArgT> const*> m_matchers;
@@ -3296,9 +3338,10 @@ namespace Matchers {
                 return description;
             }
 
-            MatchAnyOf<ArgT>& operator || ( MatcherBase<ArgT> const& other ) {
-                m_matchers.push_back( &other );
-                return *this;
+            MatchAnyOf<ArgT> operator || ( MatcherBase<ArgT> const& other ) {
+                auto copy(*this);
+                copy.m_matchers.push_back( &other );
+                return copy;
             }
 
             std::vector<MatcherBase<ArgT> const*> m_matchers;
@@ -3555,12 +3598,12 @@ namespace Catch {
 namespace Matchers {
 
     namespace Vector {
-        template<typename T>
-        struct ContainsElementMatcher : MatcherBase<std::vector<T>> {
+        template<typename T, typename Alloc>
+        struct ContainsElementMatcher : MatcherBase<std::vector<T, Alloc>> {
 
             ContainsElementMatcher(T const &comparator) : m_comparator( comparator) {}
 
-            bool match(std::vector<T> const &v) const override {
+            bool match(std::vector<T, Alloc> const &v) const override {
                 for (auto const& el : v) {
                     if (el == m_comparator) {
                         return true;
@@ -3576,12 +3619,12 @@ namespace Matchers {
             T const& m_comparator;
         };
 
-        template<typename T>
-        struct ContainsMatcher : MatcherBase<std::vector<T>> {
+        template<typename T, typename AllocComp, typename AllocMatch>
+        struct ContainsMatcher : MatcherBase<std::vector<T, AllocMatch>> {
 
-            ContainsMatcher(std::vector<T> const &comparator) : m_comparator( comparator ) {}
+            ContainsMatcher(std::vector<T, AllocComp> const &comparator) : m_comparator( comparator ) {}
 
-            bool match(std::vector<T> const &v) const override {
+            bool match(std::vector<T, AllocMatch> const &v) const override {
                 // !TBD: see note in EqualsMatcher
                 if (m_comparator.size() > v.size())
                     return false;
@@ -3603,18 +3646,18 @@ namespace Matchers {
                 return "Contains: " + ::Catch::Detail::stringify( m_comparator );
             }
 
-            std::vector<T> const& m_comparator;
+            std::vector<T, AllocComp> const& m_comparator;
         };
 
-        template<typename T>
-        struct EqualsMatcher : MatcherBase<std::vector<T>> {
+        template<typename T, typename AllocComp, typename AllocMatch>
+        struct EqualsMatcher : MatcherBase<std::vector<T, AllocMatch>> {
 
-            EqualsMatcher(std::vector<T> const &comparator) : m_comparator( comparator ) {}
+            EqualsMatcher(std::vector<T, AllocComp> const &comparator) : m_comparator( comparator ) {}
 
-            bool match(std::vector<T> const &v) const override {
+            bool match(std::vector<T, AllocMatch> const &v) const override {
                 // !TBD: This currently works if all elements can be compared using !=
                 // - a more general approach would be via a compare template that defaults
-                // to using !=. but could be specialised for, e.g. std::vector<T> etc
+                // to using !=. but could be specialised for, e.g. std::vector<T, Alloc> etc
                 // - then just call that directly
                 if (m_comparator.size() != v.size())
                     return false;
@@ -3626,15 +3669,15 @@ namespace Matchers {
             std::string describe() const override {
                 return "Equals: " + ::Catch::Detail::stringify( m_comparator );
             }
-            std::vector<T> const& m_comparator;
+            std::vector<T, AllocComp> const& m_comparator;
         };
 
-        template<typename T>
-        struct ApproxMatcher : MatcherBase<std::vector<T>> {
+        template<typename T, typename AllocComp, typename AllocMatch>
+        struct ApproxMatcher : MatcherBase<std::vector<T, AllocMatch>> {
 
-            ApproxMatcher(std::vector<T> const& comparator) : m_comparator( comparator ) {}
+            ApproxMatcher(std::vector<T, AllocComp> const& comparator) : m_comparator( comparator ) {}
 
-            bool match(std::vector<T> const &v) const override {
+            bool match(std::vector<T, AllocMatch> const &v) const override {
                 if (m_comparator.size() != v.size())
                     return false;
                 for (std::size_t i = 0; i < v.size(); ++i)
@@ -3661,16 +3704,14 @@ namespace Matchers {
                 return *this;
             }
 
-            std::vector<T> const& m_comparator;
+            std::vector<T, AllocComp> const& m_comparator;
             mutable Catch::Detail::Approx approx = Catch::Detail::Approx::custom();
         };
 
-        template<typename T>
-        struct UnorderedEqualsMatcher : MatcherBase<std::vector<T>> {
-            UnorderedEqualsMatcher(std::vector<T> const& target) : m_target(target) {}
-            bool match(std::vector<T> const& vec) const override {
-                // Note: This is a reimplementation of std::is_permutation,
-                //       because I don't want to include <algorithm> inside the common path
+        template<typename T, typename AllocComp, typename AllocMatch>
+        struct UnorderedEqualsMatcher : MatcherBase<std::vector<T, AllocMatch>> {
+            UnorderedEqualsMatcher(std::vector<T, AllocComp> const& target) : m_target(target) {}
+            bool match(std::vector<T, AllocMatch> const& vec) const override {
                 if (m_target.size() != vec.size()) {
                     return false;
                 }
@@ -3681,7 +3722,7 @@ namespace Matchers {
                 return "UnorderedEquals: " + ::Catch::Detail::stringify(m_target);
             }
         private:
-            std::vector<T> const& m_target;
+            std::vector<T, AllocComp> const& m_target;
         };
 
     } // namespace Vector
@@ -3689,29 +3730,29 @@ namespace Matchers {
     // The following functions create the actual matcher objects.
     // This allows the types to be inferred
 
-    template<typename T>
-    Vector::ContainsMatcher<T> Contains( std::vector<T> const& comparator ) {
-        return Vector::ContainsMatcher<T>( comparator );
+    template<typename T, typename AllocComp = std::allocator<T>, typename AllocMatch = AllocComp>
+    Vector::ContainsMatcher<T, AllocComp, AllocMatch> Contains( std::vector<T, AllocComp> const& comparator ) {
+        return Vector::ContainsMatcher<T, AllocComp, AllocMatch>( comparator );
     }
 
-    template<typename T>
-    Vector::ContainsElementMatcher<T> VectorContains( T const& comparator ) {
-        return Vector::ContainsElementMatcher<T>( comparator );
+    template<typename T, typename Alloc = std::allocator<T>>
+    Vector::ContainsElementMatcher<T, Alloc> VectorContains( T const& comparator ) {
+        return Vector::ContainsElementMatcher<T, Alloc>( comparator );
     }
 
-    template<typename T>
-    Vector::EqualsMatcher<T> Equals( std::vector<T> const& comparator ) {
-        return Vector::EqualsMatcher<T>( comparator );
+    template<typename T, typename AllocComp = std::allocator<T>, typename AllocMatch = AllocComp>
+    Vector::EqualsMatcher<T, AllocComp, AllocMatch> Equals( std::vector<T, AllocComp> const& comparator ) {
+        return Vector::EqualsMatcher<T, AllocComp, AllocMatch>( comparator );
     }
 
-    template<typename T>
-    Vector::ApproxMatcher<T> Approx( std::vector<T> const& comparator ) {
-        return Vector::ApproxMatcher<T>( comparator );
+    template<typename T, typename AllocComp = std::allocator<T>, typename AllocMatch = AllocComp>
+    Vector::ApproxMatcher<T, AllocComp, AllocMatch> Approx( std::vector<T, AllocComp> const& comparator ) {
+        return Vector::ApproxMatcher<T, AllocComp, AllocMatch>( comparator );
     }
 
-    template<typename T>
-    Vector::UnorderedEqualsMatcher<T> UnorderedEquals(std::vector<T> const& target) {
-        return Vector::UnorderedEqualsMatcher<T>(target);
+    template<typename T, typename AllocComp = std::allocator<T>, typename AllocMatch = AllocComp>
+    Vector::UnorderedEqualsMatcher<T, AllocComp, AllocMatch> UnorderedEquals(std::vector<T, AllocComp> const& target) {
+        return Vector::UnorderedEqualsMatcher<T, AllocComp, AllocMatch>( target );
     }
 
 } // namespace Matchers
@@ -3907,7 +3948,6 @@ namespace Generators {
     class SingleValueGenerator final : public IGenerator<T> {
         T m_value;
     public:
-        SingleValueGenerator(T const& value) : m_value( value ) {}
         SingleValueGenerator(T&& value) : m_value(std::move(value)) {}
 
         T const& get() const override {
@@ -3970,21 +4010,21 @@ namespace Generators {
             m_generators.emplace_back(std::move(generator));
         }
         void populate(T&& val) {
-            m_generators.emplace_back(value(std::move(val)));
+            m_generators.emplace_back(value(std::forward<T>(val)));
         }
         template<typename U>
         void populate(U&& val) {
-            populate(T(std::move(val)));
+            populate(T(std::forward<U>(val)));
         }
         template<typename U, typename... Gs>
-        void populate(U&& valueOrGenerator, Gs... moreGenerators) {
+        void populate(U&& valueOrGenerator, Gs &&... moreGenerators) {
             populate(std::forward<U>(valueOrGenerator));
             populate(std::forward<Gs>(moreGenerators)...);
         }
 
     public:
         template <typename... Gs>
-        Generators(Gs... moreGenerators) {
+        Generators(Gs &&... moreGenerators) {
             m_generators.reserve(sizeof...(Gs));
             populate(std::forward<Gs>(moreGenerators)...);
         }
@@ -4015,7 +4055,7 @@ namespace Generators {
     struct as {};
 
     template<typename T, typename... Gs>
-    auto makeGenerators( GeneratorWrapper<T>&& generator, Gs... moreGenerators ) -> Generators<T> {
+    auto makeGenerators( GeneratorWrapper<T>&& generator, Gs &&... moreGenerators ) -> Generators<T> {
         return Generators<T>(std::move(generator), std::forward<Gs>(moreGenerators)...);
     }
     template<typename T>
@@ -4023,24 +4063,24 @@ namespace Generators {
         return Generators<T>(std::move(generator));
     }
     template<typename T, typename... Gs>
-    auto makeGenerators( T&& val, Gs... moreGenerators ) -> Generators<T> {
+    auto makeGenerators( T&& val, Gs &&... moreGenerators ) -> Generators<T> {
         return makeGenerators( value( std::forward<T>( val ) ), std::forward<Gs>( moreGenerators )... );
     }
     template<typename T, typename U, typename... Gs>
-    auto makeGenerators( as<T>, U&& val, Gs... moreGenerators ) -> Generators<T> {
+    auto makeGenerators( as<T>, U&& val, Gs &&... moreGenerators ) -> Generators<T> {
         return makeGenerators( value( T( std::forward<U>( val ) ) ), std::forward<Gs>( moreGenerators )... );
     }
 
-    auto acquireGeneratorTracker( SourceLineInfo const& lineInfo ) -> IGeneratorTracker&;
+    auto acquireGeneratorTracker( StringRef generatorName, SourceLineInfo const& lineInfo ) -> IGeneratorTracker&;
 
     template<typename L>
     // Note: The type after -> is weird, because VS2015 cannot parse
     //       the expression used in the typedef inside, when it is in
     //       return type. Yeah.
-    auto generate( SourceLineInfo const& lineInfo, L const& generatorExpression ) -> decltype(std::declval<decltype(generatorExpression())>().get()) {
+    auto generate( StringRef generatorName, SourceLineInfo const& lineInfo, L const& generatorExpression ) -> decltype(std::declval<decltype(generatorExpression())>().get()) {
         using UnderlyingType = typename decltype(generatorExpression())::type;
 
-        IGeneratorTracker& tracker = acquireGeneratorTracker( lineInfo );
+        IGeneratorTracker& tracker = acquireGeneratorTracker( generatorName, lineInfo );
         if (!tracker.hasGenerator()) {
             tracker.setGenerator(pf::make_unique<Generators<UnderlyingType>>(generatorExpression()));
         }
@@ -4053,11 +4093,17 @@ namespace Generators {
 } // namespace Catch
 
 #define GENERATE( ... ) \
-    Catch::Generators::generate( CATCH_INTERNAL_LINEINFO, [ ]{ using namespace Catch::Generators; return makeGenerators( __VA_ARGS__ ); } )
+    Catch::Generators::generate( INTERNAL_CATCH_STRINGIZE(INTERNAL_CATCH_UNIQUE_NAME(generator)), \
+                                 CATCH_INTERNAL_LINEINFO, \
+                                 [ ]{ using namespace Catch::Generators; return makeGenerators( __VA_ARGS__ ); } ) //NOLINT(google-build-using-namespace)
 #define GENERATE_COPY( ... ) \
-    Catch::Generators::generate( CATCH_INTERNAL_LINEINFO, [=]{ using namespace Catch::Generators; return makeGenerators( __VA_ARGS__ ); } )
+    Catch::Generators::generate( INTERNAL_CATCH_STRINGIZE(INTERNAL_CATCH_UNIQUE_NAME(generator)), \
+                                 CATCH_INTERNAL_LINEINFO, \
+                                 [=]{ using namespace Catch::Generators; return makeGenerators( __VA_ARGS__ ); } ) //NOLINT(google-build-using-namespace)
 #define GENERATE_REF( ... ) \
-    Catch::Generators::generate( CATCH_INTERNAL_LINEINFO, [&]{ using namespace Catch::Generators; return makeGenerators( __VA_ARGS__ ); } )
+    Catch::Generators::generate( INTERNAL_CATCH_STRINGIZE(INTERNAL_CATCH_UNIQUE_NAME(generator)), \
+                                 CATCH_INTERNAL_LINEINFO, \
+                                 [&]{ using namespace Catch::Generators; return makeGenerators( __VA_ARGS__ ); } ) //NOLINT(google-build-using-namespace)
 
 // end catch_generators.hpp
 // start catch_generators_generic.hpp
@@ -4409,6 +4455,7 @@ namespace Catch {
 } // end namespace Catch
 
 // end catch_option.hpp
+#include <chrono>
 #include <iosfwd>
 #include <string>
 #include <vector>
@@ -4466,6 +4513,7 @@ namespace Catch {
         virtual int abortAfter() const = 0;
         virtual bool showInvisibles() const = 0;
         virtual ShowDurations::OrNot showDurations() const = 0;
+        virtual double minDuration() const = 0;
         virtual TestSpec const& testSpec() const = 0;
         virtual bool hasTestFilters() const = 0;
         virtual std::vector<std::string> const& getTestsOrTags() const = 0;
@@ -4479,6 +4527,7 @@ namespace Catch {
         virtual int benchmarkSamples() const = 0;
         virtual double benchmarkConfidenceInterval() const = 0;
         virtual unsigned int benchmarkResamples() const = 0;
+        virtual std::chrono::milliseconds benchmarkWarmupTime() const = 0;
     };
 
     using IConfigPtr = std::shared_ptr<IConfig const>;
@@ -5232,10 +5281,12 @@ namespace Catch {
         unsigned int benchmarkSamples = 100;
         double benchmarkConfidenceInterval = 0.95;
         unsigned int benchmarkResamples = 100000;
+        std::chrono::milliseconds::rep benchmarkWarmupTime = 100;
 
         Verbosity verbosity = Verbosity::Normal;
         WarnAbout::What warnings = WarnAbout::Nothing;
         ShowDurations::OrNot showDurations = ShowDurations::DefaultForReporter;
+        double minDuration = -1;
         RunTests::InWhatOrder runOrder = RunTests::InDeclarationOrder;
         UseColour::YesOrNo useColour = UseColour::Auto;
         WaitForKeypress::When waitForKeypress = WaitForKeypress::Never;
@@ -5286,6 +5337,7 @@ namespace Catch {
         bool warnAboutMissingAssertions() const override;
         bool warnAboutNoTests() const override;
         ShowDurations::OrNot showDurations() const override;
+        double minDuration() const override;
         RunTests::InWhatOrder runOrder() const override;
         unsigned int rngSeed() const override;
         UseColour::YesOrNo useColour() const override;
@@ -5297,6 +5349,7 @@ namespace Catch {
         int benchmarkSamples() const override;
         double benchmarkConfidenceInterval() const override;
         unsigned int benchmarkResamples() const override;
+        std::chrono::milliseconds benchmarkWarmupTime() const override;
 
     private:
 
@@ -5662,6 +5715,9 @@ namespace Catch {
     // Returns double formatted as %.3f (format expected on output)
     std::string getFormattedDuration( double duration );
 
+    //! Should the reporter show the duration of a test, given the current configuration?
+    bool shouldShowDuration( IConfig const& config, double duration );
+
     std::string serializeFilters( std::vector<std::string> const& container );
 
     template<typename DerivedT>
@@ -6055,8 +6111,6 @@ namespace Catch {
 
         static std::string getDescription();
 
-        ReporterPreferences getPreferences() const override;
-
         void noMatchingTestCases(std::string const& spec) override;
 
         void assertionStarting(AssertionInfo const&) override;
@@ -6504,20 +6558,18 @@ namespace Catch {
                     return {};
                 }
             };
-            template <typename Sig>
-            using ResultOf_t = typename std::result_of<Sig>::type;
 
             // invoke and not return void :(
             template <typename Fun, typename... Args>
-            CompleteType_t<ResultOf_t<Fun(Args...)>> complete_invoke(Fun&& fun, Args&&... args) {
-                return CompleteInvoker<ResultOf_t<Fun(Args...)>>::invoke(std::forward<Fun>(fun), std::forward<Args>(args)...);
+            CompleteType_t<FunctionReturnType<Fun, Args...>> complete_invoke(Fun&& fun, Args&&... args) {
+                return CompleteInvoker<FunctionReturnType<Fun, Args...>>::invoke(std::forward<Fun>(fun), std::forward<Args>(args)...);
             }
 
             const std::string benchmarkErrorMsg = "a benchmark failed to run successfully";
         } // namespace Detail
 
         template <typename Fun>
-        Detail::CompleteType_t<Detail::ResultOf_t<Fun()>> user_code(Fun&& fun) {
+        Detail::CompleteType_t<FunctionReturnType<Fun>> user_code(Fun&& fun) {
             CATCH_TRY{
                 return Detail::complete_invoke(std::forward<Fun>(fun));
             } CATCH_CATCH_ALL{
@@ -6762,8 +6814,8 @@ namespace Catch {
             Result result;
             int iterations;
         };
-        template <typename Clock, typename Sig>
-        using TimingOf = Timing<ClockDuration<Clock>, Detail::CompleteType_t<Detail::ResultOf_t<Sig>>>;
+        template <typename Clock, typename Func, typename... Args>
+        using TimingOf = Timing<ClockDuration<Clock>, Detail::CompleteType_t<FunctionReturnType<Func, Args...>>>;
     } // namespace Benchmark
 } // namespace Catch
 
@@ -6774,7 +6826,7 @@ namespace Catch {
     namespace Benchmark {
         namespace Detail {
             template <typename Clock, typename Fun, typename... Args>
-            TimingOf<Clock, Fun(Args...)> measure(Fun&& fun, Args&&... args) {
+            TimingOf<Clock, Fun, Args...> measure(Fun&& fun, Args&&... args) {
                 auto start = Clock::now();
                 auto&& r = Detail::complete_invoke(fun, std::forward<Args>(args)...);
                 auto end = Clock::now();
@@ -6793,11 +6845,11 @@ namespace Catch {
     namespace Benchmark {
         namespace Detail {
             template <typename Clock, typename Fun>
-            TimingOf<Clock, Fun(int)> measure_one(Fun&& fun, int iters, std::false_type) {
+            TimingOf<Clock, Fun, int> measure_one(Fun&& fun, int iters, std::false_type) {
                 return Detail::measure<Clock>(fun, iters);
             }
             template <typename Clock, typename Fun>
-            TimingOf<Clock, Fun(Chronometer)> measure_one(Fun&& fun, int iters, std::true_type) {
+            TimingOf<Clock, Fun, Chronometer> measure_one(Fun&& fun, int iters, std::true_type) {
                 Detail::ChronometerModel<Clock> meter;
                 auto&& result = Detail::complete_invoke(fun, Chronometer(meter, iters));
 
@@ -6814,7 +6866,7 @@ namespace Catch {
             };
 
             template <typename Clock, typename Fun>
-            TimingOf<Clock, Fun(run_for_at_least_argument_t<Clock, Fun>)> run_for_at_least(ClockDuration<Clock> how_long, int seed, Fun&& fun) {
+            TimingOf<Clock, Fun, run_for_at_least_argument_t<Clock, Fun>> run_for_at_least(ClockDuration<Clock> how_long, int seed, Fun&& fun) {
                 auto iters = seed;
                 while (iters < (1 << 30)) {
                     auto&& Timing = measure_one<Clock>(fun, iters, is_callable<Fun(Chronometer)>());
@@ -6882,11 +6934,13 @@ namespace Catch {
 #include <algorithm>
 #include <functional>
 #include <vector>
+#include <iterator>
 #include <numeric>
 #include <tuple>
 #include <cmath>
 #include <utility>
 #include <cstddef>
+#include <random>
 
 namespace Catch {
     namespace Benchmark {
@@ -7236,10 +7290,10 @@ namespace Catch {
             template <typename Clock>
             ExecutionPlan<FloatDuration<Clock>> prepare(const IConfig &cfg, Environment<FloatDuration<Clock>> env) const {
                 auto min_time = env.clock_resolution.mean * Detail::minimum_ticks;
-                auto run_time = std::max(min_time, std::chrono::duration_cast<decltype(min_time)>(Detail::warmup_time));
+                auto run_time = std::max(min_time, std::chrono::duration_cast<decltype(min_time)>(cfg.benchmarkWarmupTime()));
                 auto&& test = Detail::run_for_at_least<Clock>(std::chrono::duration_cast<ClockDuration<Clock>>(run_time), 1, fun);
                 int new_iters = static_cast<int>(std::ceil(min_time * test.iterations / test.elapsed));
-                return { new_iters, test.elapsed / test.iterations * new_iters * cfg.benchmarkSamples(), fun, std::chrono::duration_cast<FloatDuration<Clock>>(Detail::warmup_time), Detail::warmup_iterations };
+                return { new_iters, test.elapsed / test.iterations * new_iters * cfg.benchmarkSamples(), fun, std::chrono::duration_cast<FloatDuration<Clock>>(cfg.benchmarkWarmupTime()), Detail::warmup_iterations };
             }
 
             template <typename Clock = default_clock>
@@ -7320,60 +7374,65 @@ namespace Catch {
 #include <type_traits>
 
 namespace Catch {
-    namespace Detail {
-        template <typename T, bool Destruct>
-        struct ObjectStorage
-        {
-            using TStorage = typename std::aligned_storage<sizeof(T), std::alignment_of<T>::value>::type;
+    namespace Benchmark {
+        namespace Detail {
+            template <typename T, bool Destruct>
+            struct ObjectStorage
+            {
+                using TStorage = typename std::aligned_storage<sizeof(T), std::alignment_of<T>::value>::type;
 
-            ObjectStorage() : data() {}
+                ObjectStorage() : data() {}
 
-            ObjectStorage(const ObjectStorage& other)
-            {
-                new(&data) T(other.stored_object());
-            }
+                ObjectStorage(const ObjectStorage& other)
+                {
+                    new(&data) T(other.stored_object());
+                }
 
-            ObjectStorage(ObjectStorage&& other)
-            {
-                new(&data) T(std::move(other.stored_object()));
-            }
+                ObjectStorage(ObjectStorage&& other)
+                {
+                    new(&data) T(std::move(other.stored_object()));
+                }
 
-            ~ObjectStorage() { destruct_on_exit<T>(); }
+                ~ObjectStorage() { destruct_on_exit<T>(); }
 
-            template <typename... Args>
-            void construct(Args&&... args)
-            {
-                new (&data) T(std::forward<Args>(args)...);
-            }
+                template <typename... Args>
+                void construct(Args&&... args)
+                {
+                    new (&data) T(std::forward<Args>(args)...);
+                }
 
-            template <bool AllowManualDestruction = !Destruct>
-            typename std::enable_if<AllowManualDestruction>::type destruct()
-            {
-                stored_object().~T();
-            }
+                template <bool AllowManualDestruction = !Destruct>
+                typename std::enable_if<AllowManualDestruction>::type destruct()
+                {
+                    stored_object().~T();
+                }
 
-        private:
-            // If this is a constructor benchmark, destruct the underlying object
-            template <typename U>
-            void destruct_on_exit(typename std::enable_if<Destruct, U>::type* = 0) { destruct<true>(); }
-            // Otherwise, don't
-            template <typename U>
-            void destruct_on_exit(typename std::enable_if<!Destruct, U>::type* = 0) { }
-
-            T& stored_object()
-            {
-                return *static_cast<T*>(static_cast<void*>(&data));
-            }
+            private:
+                // If this is a constructor benchmark, destruct the underlying object
+                template <typename U>
+                void destruct_on_exit(typename std::enable_if<Destruct, U>::type* = 0) { destruct<true>(); }
+                // Otherwise, don't
+                template <typename U>
+                void destruct_on_exit(typename std::enable_if<!Destruct, U>::type* = 0) { }
+
+                T& stored_object() {
+                    return *static_cast<T*>(static_cast<void*>(&data));
+                }
 
-            TStorage data;
-        };
-    }
+                T const& stored_object() const { // const view of the stored T; both casts keep const because static_cast cannot drop it
+                    return *static_cast<T const*>(static_cast<void const*>(&data));
+                }
 
-    template <typename T>
-    using storage_for = Detail::ObjectStorage<T, true>;
+                TStorage data;
+            };
+        }
 
-    template <typename T>
-    using destructable_object = Detail::ObjectStorage<T, false>;
+        template <typename T>
+        using storage_for = Detail::ObjectStorage<T, true>;
+
+        template <typename T>
+        using destructable_object = Detail::ObjectStorage<T, false>;
+    }
 }
 
 // end catch_constructor.hpp
@@ -7405,23 +7464,37 @@ namespace TestCaseTracking {
         SourceLineInfo location;
 
         NameAndLocation( std::string const& _name, SourceLineInfo const& _location );
+        friend bool operator==(NameAndLocation const& lhs, NameAndLocation const& rhs) {
+            return lhs.name == rhs.name
+                && lhs.location == rhs.location;
+        }
     };
 
-    struct ITracker;
+    class ITracker;
 
     using ITrackerPtr = std::shared_ptr<ITracker>;
 
-    struct ITracker {
-        virtual ~ITracker();
+    class  ITracker {
+        NameAndLocation m_nameAndLocation;
+
+    public:
+        ITracker(NameAndLocation const& nameAndLoc) :
+            m_nameAndLocation(nameAndLoc)
+        {}
 
         // static queries
-        virtual NameAndLocation const& nameAndLocation() const = 0;
+        NameAndLocation const& nameAndLocation() const {
+            return m_nameAndLocation;
+        }
+
+        virtual ~ITracker();
 
         // dynamic queries
         virtual bool isComplete() const = 0; // Successfully completed or failed
         virtual bool isSuccessfullyCompleted() const = 0;
         virtual bool isOpen() const = 0; // Started but not complete
         virtual bool hasChildren() const = 0;
+        virtual bool hasStarted() const = 0;
 
         virtual ITracker& parent() = 0;
 
@@ -7476,7 +7549,6 @@ namespace TestCaseTracking {
         };
 
         using Children = std::vector<ITrackerPtr>;
-        NameAndLocation m_nameAndLocation;
         TrackerContext& m_ctx;
         ITracker* m_parent;
         Children m_children;
@@ -7485,11 +7557,13 @@ namespace TestCaseTracking {
     public:
         TrackerBase( NameAndLocation const& nameAndLocation, TrackerContext& ctx, ITracker* parent );
 
-        NameAndLocation const& nameAndLocation() const override;
         bool isComplete() const override;
         bool isSuccessfullyCompleted() const override;
         bool isOpen() const override;
         bool hasChildren() const override;
+        bool hasStarted() const override {
+            return m_runState != NotStarted;
+        }
 
         void addChild( ITrackerPtr const& child ) override;
 
@@ -7528,6 +7602,10 @@ namespace TestCaseTracking {
 
         void addInitialFilters( std::vector<std::string> const& filters );
         void addNextFilters( std::vector<std::string> const& filters );
+        //! Returns filters active in this tracker
+        std::vector<std::string> const& getFilters() const;
+        //! Returns whitespace-trimmed name of the tracked section
+        std::string const& trimmedName() const;
     };
 
 } // namespace TestCaseTracking
@@ -7852,7 +7930,24 @@ namespace Catch {
 
 #ifdef CATCH_PLATFORM_MAC
 
-    #define CATCH_TRAP() __asm__("int $3\n" : : ) /* NOLINT */
+    #if defined(__i386__) || defined(__x86_64__)
+        #define CATCH_TRAP() __asm__("int $3\n" : : ) /* NOLINT */
+    #elif defined(__aarch64__)
+        #define CATCH_TRAP()  __asm__(".inst 0xd4200000")
+    #endif
+
+#elif defined(CATCH_PLATFORM_IPHONE)
+
+    // use inline assembler
+    #if defined(__i386__) || defined(__x86_64__)
+        #define CATCH_TRAP()  __asm__("int $3")
+    #elif defined(__aarch64__)
+        #define CATCH_TRAP()  __asm__(".inst 0xd4200000")
+    #elif defined(__arm__) && !defined(__thumb__)
+        #define CATCH_TRAP()  __asm__(".inst 0xe7f001f0")
+    #elif defined(__arm__) &&  defined(__thumb__)
+        #define CATCH_TRAP()  __asm__(".inst 0xde01")
+    #endif
 
 #elif defined(CATCH_PLATFORM_LINUX)
     // If we can use inline assembler, do it because this allows us to break
@@ -7872,10 +7967,12 @@ namespace Catch {
     #define CATCH_TRAP() DebugBreak()
 #endif
 
-#ifdef CATCH_TRAP
-    #define CATCH_BREAK_INTO_DEBUGGER() []{ if( Catch::isDebuggerActive() ) { CATCH_TRAP(); } }()
-#else
-    #define CATCH_BREAK_INTO_DEBUGGER() []{}()
+#ifndef CATCH_BREAK_INTO_DEBUGGER
+    #ifdef CATCH_TRAP
+        #define CATCH_BREAK_INTO_DEBUGGER() []{ if( Catch::isDebuggerActive() ) { CATCH_TRAP(); } }()
+    #else
+        #define CATCH_BREAK_INTO_DEBUGGER() []{}()
+    #endif
 #endif
 
 // end catch_debugger.h
@@ -8022,7 +8119,7 @@ namespace Catch {
         void sectionEnded( SectionEndInfo const& endInfo ) override;
         void sectionEndedEarly( SectionEndInfo const& endInfo ) override;
 
-        auto acquireGeneratorTracker( SourceLineInfo const& lineInfo ) -> IGeneratorTracker& override;
+        auto acquireGeneratorTracker( StringRef generatorName, SourceLineInfo const& lineInfo ) -> IGeneratorTracker& override;
 
 #if defined(CATCH_CONFIG_ENABLE_BENCHMARKING)
         void benchmarkPreparing( std::string const& name ) override;
@@ -8998,7 +9095,7 @@ namespace detail {
     }
     inline auto convertInto( std::string const &source, bool &target ) -> ParserResult {
         std::string srcLC = source;
-        std::transform( srcLC.begin(), srcLC.end(), srcLC.begin(), []( char c ) { return static_cast<char>( std::tolower(c) ); } );
+        std::transform( srcLC.begin(), srcLC.end(), srcLC.begin(), []( unsigned char c ) { return static_cast<char>( std::tolower(c) ); } );
         if (srcLC == "y" || srcLC == "1" || srcLC == "true" || srcLC == "yes" || srcLC == "on")
             target = true;
         else if (srcLC == "n" || srcLC == "0" || srcLC == "false" || srcLC == "no" || srcLC == "off")
@@ -9647,8 +9744,7 @@ namespace Catch {
                         if( !startsWith( line, '"' ) )
                             line = '"' + line + '"';
                         config.testsOrTags.push_back( line );
-                        config.testsOrTags.push_back( "," );
-
+                        config.testsOrTags.emplace_back( "," );
                     }
                 }
                 //Remove comma in the end
@@ -9689,14 +9785,16 @@ namespace Catch {
             };
         auto const setWaitForKeypress = [&]( std::string const& keypress ) {
                 auto keypressLc = toLower( keypress );
-                if( keypressLc == "start" )
+                if (keypressLc == "never")
+                    config.waitForKeypress = WaitForKeypress::Never;
+                else if( keypressLc == "start" )
                     config.waitForKeypress = WaitForKeypress::BeforeStart;
                 else if( keypressLc == "exit" )
                     config.waitForKeypress = WaitForKeypress::BeforeExit;
                 else if( keypressLc == "both" )
                     config.waitForKeypress = WaitForKeypress::BeforeStartAndExit;
                 else
-                    return ParserResult::runtimeError( "keypress argument must be one of: start, exit or both. '" + keypress + "' not recognised" );
+                    return ParserResult::runtimeError( "keypress argument must be one of: never, start, exit or both. '" + keypress + "' not recognised" );
             return ParserResult::ok( ParseResultType::Matched );
             };
         auto const setVerbosity = [&]( std::string const& verbosity ) {
@@ -9766,6 +9864,9 @@ namespace Catch {
             | Opt( [&]( bool flag ) { config.showDurations = flag ? ShowDurations::Always : ShowDurations::Never; }, "yes|no" )
                 ["-d"]["--durations"]
                 ( "show test durations" )
+            | Opt( config.minDuration, "seconds" )
+                ["-D"]["--min-duration"]
+                ( "show test durations for tests taking at least the given number of seconds" )
             | Opt( loadTestNamesFromFile, "filename" )
                 ["-f"]["--input-file"]
                 ( "load test names to run from a file" )
@@ -9796,7 +9897,7 @@ namespace Catch {
             | Opt( config.libIdentify )
                 ["--libidentify"]
                 ( "report name and version according to libidentify standard" )
-            | Opt( setWaitForKeypress, "start|exit|both" )
+            | Opt( setWaitForKeypress, "never|start|exit|both" )
                 ["--wait-for-keypress"]
                 ( "waits for a keypress before exiting" )
             | Opt( config.benchmarkSamples, "samples" )
@@ -9811,7 +9912,10 @@ namespace Catch {
             | Opt( config.benchmarkNoAnalysis )
                 ["--benchmark-no-analysis"]
                 ( "perform only measurements; do not perform any analysis" )
-			| Arg( config.testsOrTags, "test name|pattern|tags" )
+            | Opt( config.benchmarkWarmupTime, "benchmarkWarmupTime" )
+                ["--benchmark-warmup-time"]
+                ( "amount of time in milliseconds spent on warming up each test (default: 100)" )
+            | Arg( config.testsOrTags, "test name|pattern|tags" )
                 ( "which test or tests to use" );
 
         return cli;
@@ -9910,6 +10014,7 @@ namespace Catch {
     bool Config::warnAboutMissingAssertions() const    { return !!(m_data.warnings & WarnAbout::NoAssertions); }
     bool Config::warnAboutNoTests() const              { return !!(m_data.warnings & WarnAbout::NoTests); }
     ShowDurations::OrNot Config::showDurations() const { return m_data.showDurations; }
+    double Config::minDuration() const                 { return m_data.minDuration; }
     RunTests::InWhatOrder Config::runOrder() const     { return m_data.runOrder; }
     unsigned int Config::rngSeed() const               { return m_data.rngSeed; }
     UseColour::YesOrNo Config::useColour() const       { return m_data.useColour; }
@@ -9918,10 +10023,11 @@ namespace Catch {
     bool Config::showInvisibles() const                { return m_data.showInvisibles; }
     Verbosity Config::verbosity() const                { return m_data.verbosity; }
 
-    bool Config::benchmarkNoAnalysis() const           { return m_data.benchmarkNoAnalysis; }
-    int Config::benchmarkSamples() const               { return m_data.benchmarkSamples; }
-    double Config::benchmarkConfidenceInterval() const { return m_data.benchmarkConfidenceInterval; }
-    unsigned int Config::benchmarkResamples() const    { return m_data.benchmarkResamples; }
+    bool Config::benchmarkNoAnalysis() const                      { return m_data.benchmarkNoAnalysis; }
+    int Config::benchmarkSamples() const                          { return m_data.benchmarkSamples; }
+    double Config::benchmarkConfidenceInterval() const            { return m_data.benchmarkConfidenceInterval; }
+    unsigned int Config::benchmarkResamples() const               { return m_data.benchmarkResamples; }
+    std::chrono::milliseconds Config::benchmarkWarmupTime() const { return std::chrono::milliseconds(m_data.benchmarkWarmupTime); }
 
     IStream const* Config::openStream() {
         return Catch::makeStream(m_data.outputFilename);
@@ -9962,7 +10068,7 @@ namespace Catch {
         };
 
         struct NoColourImpl : IColourImpl {
-            void use( Colour::Code ) {}
+            void use( Colour::Code ) override {}
 
             static IColourImpl* instance() {
                 static NoColourImpl s_instance;
@@ -10094,7 +10200,7 @@ namespace {
 
     bool useColourOnPlatform() {
         return
-#ifdef CATCH_PLATFORM_MAC
+#if defined(CATCH_PLATFORM_MAC) || defined(CATCH_PLATFORM_IPHONE)
             !isDebuggerActive() &&
 #endif
 #if !(defined(__DJGPP__) && defined(__STRICT_ANSI__))
@@ -10135,13 +10241,13 @@ namespace Catch {
 namespace Catch {
 
     Colour::Colour( Code _colourCode ) { use( _colourCode ); }
-    Colour::Colour( Colour&& rhs ) noexcept {
-        m_moved = rhs.m_moved;
-        rhs.m_moved = true;
+    Colour::Colour( Colour&& other ) noexcept {
+        m_moved = other.m_moved;
+        other.m_moved = true;
     }
-    Colour& Colour::operator=( Colour&& rhs ) noexcept {
-        m_moved = rhs.m_moved;
-        rhs.m_moved  = true;
+    Colour& Colour::operator=( Colour&& other ) noexcept {
+        m_moved = other.m_moved;
+        other.m_moved  = true;
         return *this;
     }
 
@@ -10153,7 +10259,7 @@ namespace Catch {
         // However, under some conditions it does happen (see #1626),
         // and this change is small enough that we can let practicality
         // triumph over purity in this case.
-        if (impl != NULL) {
+        if (impl != nullptr) {
             impl->use( _colourCode );
         }
     }
@@ -10271,10 +10377,9 @@ namespace Catch {
 // end catch_debug_console.cpp
 // start catch_debugger.cpp
 
-#ifdef CATCH_PLATFORM_MAC
+#if defined(CATCH_PLATFORM_MAC) || defined(CATCH_PLATFORM_IPHONE)
 
-#  include <assert.h>
-#  include <stdbool.h>
+#  include <cassert>
 #  include <sys/types.h>
 #  include <unistd.h>
 #  include <cstddef>
@@ -10506,7 +10611,7 @@ namespace Catch {
             assert( valueNames.size() == values.size() );
             std::size_t i = 0;
             for( auto value : values )
-                enumInfo->m_values.push_back({ value, valueNames[i++] });
+                enumInfo->m_values.emplace_back(value, valueNames[i++]);
 
             return enumInfo;
         }
@@ -10806,8 +10911,8 @@ namespace Generators {
 
     GeneratorUntypedBase::~GeneratorUntypedBase() {}
 
-    auto acquireGeneratorTracker( SourceLineInfo const& lineInfo ) -> IGeneratorTracker& {
-        return getResultCapture().acquireGeneratorTracker( lineInfo );
+    auto acquireGeneratorTracker( StringRef generatorName, SourceLineInfo const& lineInfo ) -> IGeneratorTracker& {
+        return getResultCapture().acquireGeneratorTracker( generatorName, lineInfo );
     }
 
 } // namespace Generators
@@ -11082,7 +11187,7 @@ namespace Catch {
 namespace Catch {
 
     std::size_t listTests( Config const& config ) {
-        TestSpec testSpec = config.testSpec();
+        TestSpec const& testSpec = config.testSpec();
         if( config.hasTestFilters() )
             Catch::cout() << "Matching test cases:\n";
         else {
@@ -11116,7 +11221,7 @@ namespace Catch {
     }
 
     std::size_t listTestsNamesOnly( Config const& config ) {
-        TestSpec testSpec = config.testSpec();
+        TestSpec const& testSpec = config.testSpec();
         std::size_t matchedTests = 0;
         std::vector<TestCase> matchedTestCases = filterTests( getAllTestCasesSorted( config ), testSpec, config );
         for( auto const& testCaseInfo : matchedTestCases ) {
@@ -11154,7 +11259,7 @@ namespace Catch {
     }
 
     std::size_t listTags( Config const& config ) {
-        TestSpec testSpec = config.testSpec();
+        TestSpec const& testSpec = config.testSpec();
         if( config.hasTestFilters() )
             Catch::cout() << "Tags for matching test cases:\n";
         else {
@@ -11346,16 +11451,8 @@ namespace {
         return static_cast<uint64_t>(ulpDiff) <= maxUlpDiff;
     }
 
-} //end anonymous namespace
-
 #if defined(CATCH_CONFIG_GLOBAL_NEXTAFTER)
 
-#if defined(__clang__)
-#pragma clang diagnostic push
-// The long double overload is currently unused
-#pragma clang diagnostic ignored "-Wunused-function"
-#endif
-
     float nextafter(float x, float y) {
         return ::nextafterf(x, y);
     }
@@ -11364,18 +11461,8 @@ namespace {
         return ::nextafter(x, y);
     }
 
-    long double nextafter(long double x, long double y) {
-        return ::nextafterl(x, y);
-    }
-
-#if defined(__clang__)
-#pragma clang diagnostic pop
-#endif
-
 #endif // ^^^ CATCH_CONFIG_GLOBAL_NEXTAFTER ^^^
 
-namespace {
-
 template <typename FP>
 FP step(FP start, FP direction, uint64_t steps) {
     for (uint64_t i = 0; i < steps; ++i) {
@@ -11712,10 +11799,10 @@ namespace Catch {
 
     Capturer::Capturer( StringRef macroName, SourceLineInfo const& lineInfo, ResultWas::OfType resultType, StringRef names ) {
         auto trimmed = [&] (size_t start, size_t end) {
-            while (names[start] == ',' || isspace(names[start])) {
+            while (names[start] == ',' || isspace(static_cast<unsigned char>(names[start]))) {
                 ++start;
             }
-            while (names[end] == ',' || isspace(names[end])) {
+            while (names[end] == ',' || isspace(static_cast<unsigned char>(names[end]))) {
                 --end;
             }
             return names.substr(start, end - start + 1);
@@ -11754,7 +11841,7 @@ namespace Catch {
                 pos = skipq(pos, c);
                 break;
             case ',':
-                if (start != pos && openings.size() == 0) {
+                if (start != pos && openings.empty()) {
                     m_messages.emplace_back(macroName, lineInfo, resultType);
                     m_messages.back().message = static_cast<std::string>(trimmed(start, pos));
                     m_messages.back().message += " := ";
@@ -11762,7 +11849,7 @@ namespace Catch {
                 }
             }
         }
-        assert(openings.size() == 0 && "Mismatched openings");
+        assert(openings.empty() && "Mismatched openings");
         m_messages.emplace_back(macroName, lineInfo, resultType);
         m_messages.back().message = static_cast<std::string>(trimmed(start, names.size() - 1));
         m_messages.back().message += " := ";
@@ -11950,7 +12037,7 @@ namespace Catch {
         if (tmpnam_s(m_buffer)) {
             CATCH_RUNTIME_ERROR("Could not get a temp filename");
         }
-        if (fopen_s(&m_file, m_buffer, "w")) {
+        if (fopen_s(&m_file, m_buffer, "w+")) {
             char buffer[100];
             if (strerror_s(buffer, errno)) {
                 CATCH_RUNTIME_ERROR("Could not translate errno to a string");
@@ -12245,11 +12332,13 @@ namespace Catch {
 namespace Catch {
 
     class StartupExceptionRegistry {
+#if !defined(CATCH_CONFIG_DISABLE_EXCEPTIONS)
     public:
         void add(std::exception_ptr const& exception) noexcept;
         std::vector<std::exception_ptr> const& getExceptions() const noexcept;
     private:
         std::vector<std::exception_ptr> m_exceptions;
+#endif
     };
 
 } // end namespace Catch
@@ -12332,7 +12421,11 @@ namespace Catch {
                 m_tagAliasRegistry.add( alias, tag, lineInfo );
             }
             void registerStartupException() noexcept override {
+#if !defined(CATCH_CONFIG_DISABLE_EXCEPTIONS)
                 m_exceptionRegistry.add(std::current_exception());
+#else
+                CATCH_INTERNAL_ERROR("Attempted to register active exception under CATCH_CONFIG_DISABLE_EXCEPTIONS!");
+#endif
             }
             IMutableEnumValuesRegistry& getMutableEnumValuesRegistry() override {
                 return m_enumValuesRegistry;
@@ -12436,17 +12529,32 @@ namespace Catch {
                 std::shared_ptr<GeneratorTracker> tracker;
 
                 ITracker& currentTracker = ctx.currentTracker();
-                if( TestCaseTracking::ITrackerPtr childTracker = currentTracker.findChild( nameAndLocation ) ) {
+                // Under specific circumstances, the generator we want
+                // to acquire is also the current tracker. If this is
+                // the case, we have to avoid looking through current
+                // tracker's children, and instead return the current
+                // tracker.
+                // A case where this check is important is e.g.
+                //     for (int i = 0; i < 5; ++i) {
+                //         int n = GENERATE(1, 2);
+                //     }
+                //
+                // without it, the code above creates 5 nested generators.
+                if (currentTracker.nameAndLocation() == nameAndLocation) {
+                    auto thisTracker = currentTracker.parent().findChild(nameAndLocation);
+                    assert(thisTracker);
+                    assert(thisTracker->isGeneratorTracker());
+                    tracker = std::static_pointer_cast<GeneratorTracker>(thisTracker);
+                } else if ( TestCaseTracking::ITrackerPtr childTracker = currentTracker.findChild( nameAndLocation ) ) {
                     assert( childTracker );
                     assert( childTracker->isGeneratorTracker() );
                     tracker = std::static_pointer_cast<GeneratorTracker>( childTracker );
-                }
-                else {
+                } else {
                     tracker = std::make_shared<GeneratorTracker>( nameAndLocation, ctx, &currentTracker );
                     currentTracker.addChild( tracker );
                 }
 
-                if( !ctx.completedCycle() && !tracker->isComplete() ) {
+                if( !tracker->isComplete() ) {
                     tracker->open();
                 }
 
@@ -12460,8 +12568,68 @@ namespace Catch {
             }
             void close() override {
                 TrackerBase::close();
-                // Generator interface only finds out if it has another item on atual move
-                if (m_runState == CompletedSuccessfully && m_generator->next()) {
+                // If a generator has a child (it is followed by a section)
+                // and none of its children have started, then we must wait
+                // until later to start consuming its values.
+                // This catches cases where `GENERATE` is placed between two
+                // `SECTION`s.
+                // **The check for m_children.empty cannot be removed**.
+                // doing so would break `GENERATE` _not_ followed by `SECTION`s.
+                const bool should_wait_for_child = [&]() {
+                    // No children -> nobody to wait for
+                    if ( m_children.empty() ) {
+                        return false;
+                    }
+                    // If at least one child started executing, don't wait
+                    if ( std::find_if(
+                             m_children.begin(),
+                             m_children.end(),
+                             []( TestCaseTracking::ITrackerPtr tracker ) {
+                                 return tracker->hasStarted();
+                             } ) != m_children.end() ) {
+                        return false;
+                    }
+
+                    // No children have started. We need to check if they _can_
+                    // start, and thus we should wait for them, or they cannot
+                    // start (due to filters), and we shouldn't wait for them
+                    auto* parent = m_parent;
+                    // This is safe: there is always at least one section
+                    // tracker in a test case tracking tree
+                    while ( !parent->isSectionTracker() ) {
+                        parent = &( parent->parent() );
+                    }
+                    assert( parent &&
+                            "Missing root (test case) level section" );
+
+                    auto const& parentSection =
+                        static_cast<SectionTracker&>( *parent );
+                    auto const& filters = parentSection.getFilters();
+                    // No filters -> no restrictions on running sections
+                    if ( filters.empty() ) {
+                        return true;
+                    }
+
+                    for ( auto const& child : m_children ) {
+                        if ( child->isSectionTracker() &&
+                             std::find( filters.begin(),
+                                        filters.end(),
+                                        static_cast<SectionTracker&>( *child )
+                                            .trimmedName() ) !=
+                                 filters.end() ) {
+                            return true;
+                        }
+                    }
+                    return false;
+                }();
+
+                // This check is a bit tricky, because m_generator->next()
+                // has a side-effect, where it consumes generator's current
+                // value, but we do not want to invoke the side-effect if
+                // this generator is still waiting for any child to start.
+                if ( should_wait_for_child ||
+                     ( m_runState == CompletedSuccessfully &&
+                       m_generator->next() ) ) {
                     m_children.clear();
                     m_runState = Executing;
                 }
@@ -12597,10 +12765,10 @@ namespace Catch {
 
         return true;
     }
-    auto RunContext::acquireGeneratorTracker( SourceLineInfo const& lineInfo ) -> IGeneratorTracker& {
+    auto RunContext::acquireGeneratorTracker( StringRef generatorName, SourceLineInfo const& lineInfo ) -> IGeneratorTracker& {
         using namespace Generators;
-        GeneratorTracker& tracker = GeneratorTracker::acquire( m_trackerContext, TestCaseTracking::NameAndLocation( "generator", lineInfo ) );
-        assert( tracker.isOpen() );
+        GeneratorTracker& tracker = GeneratorTracker::acquire(m_trackerContext,
+                                                              TestCaseTracking::NameAndLocation( static_cast<std::string>(generatorName), lineInfo ) );
         m_lastAssertionInfo.lineInfo = lineInfo;
         return tracker;
     }
@@ -12643,17 +12811,17 @@ namespace Catch {
 
 #if defined(CATCH_CONFIG_ENABLE_BENCHMARKING)
     void RunContext::benchmarkPreparing(std::string const& name) {
-		m_reporter->benchmarkPreparing(name);
-	}
+        m_reporter->benchmarkPreparing(name);
+    }
     void RunContext::benchmarkStarting( BenchmarkInfo const& info ) {
         m_reporter->benchmarkStarting( info );
     }
     void RunContext::benchmarkEnded( BenchmarkStats<> const& stats ) {
         m_reporter->benchmarkEnded( stats );
     }
-	void RunContext::benchmarkFailed(std::string const & error) {
-		m_reporter->benchmarkFailed(error);
-	}
+    void RunContext::benchmarkFailed(std::string const & error) {
+        m_reporter->benchmarkFailed(error);
+    }
 #endif // CATCH_CONFIG_ENABLE_BENCHMARKING
 
     void RunContext::pushScopedMessage(MessageInfo const & message) {
@@ -13250,11 +13418,11 @@ namespace Catch {
         char **utf8Argv = new char *[ argc ];
 
         for ( int i = 0; i < argc; ++i ) {
-            int bufSize = WideCharToMultiByte( CP_UTF8, 0, argv[i], -1, NULL, 0, NULL, NULL );
+            int bufSize = WideCharToMultiByte( CP_UTF8, 0, argv[i], -1, nullptr, 0, nullptr, nullptr );
 
             utf8Argv[ i ] = new char[ bufSize ];
 
-            WideCharToMultiByte( CP_UTF8, 0, argv[i], -1, utf8Argv[i], bufSize, NULL, NULL );
+            WideCharToMultiByte( CP_UTF8, 0, argv[i], -1, utf8Argv[i], bufSize, nullptr, nullptr );
         }
 
         int returnCode = applyCommandLine( argc, utf8Argv );
@@ -13374,6 +13542,7 @@ namespace Catch {
 // end catch_singletons.cpp
 // start catch_startup_exception_registry.cpp
 
+#if !defined(CATCH_CONFIG_DISABLE_EXCEPTIONS)
 namespace Catch {
 void StartupExceptionRegistry::add( std::exception_ptr const& exception ) noexcept {
         CATCH_TRY {
@@ -13389,6 +13558,7 @@ void StartupExceptionRegistry::add( std::exception_ptr const& exception ) noexce
     }
 
 } // end namespace Catch
+#endif
 // end catch_startup_exception_registry.cpp
 // start catch_stream.cpp
 
@@ -13573,7 +13743,7 @@ namespace Catch {
 
     namespace {
         char toLowerCh(char c) {
-            return static_cast<char>( std::tolower( c ) );
+            return static_cast<char>( std::tolower( static_cast<unsigned char>(c) ) );
         }
     }
 
@@ -13853,7 +14023,8 @@ namespace Catch {
             }
         }
         if( isHidden ) {
-            tags.push_back( "." );
+            // Add all "hidden" tags to make them behave identically
+            tags.insert( tags.end(), { ".", "!hide" } );
         }
 
         TestCaseInfo info( static_cast<std::string>(nameAndTags.name), _className, desc, tags, _lineInfo );
@@ -13948,27 +14119,77 @@ namespace Catch {
 // end catch_test_case_info.cpp
 // start catch_test_case_registry_impl.cpp
 
+#include <algorithm>
 #include <sstream>
 
 namespace Catch {
 
-    std::vector<TestCase> sortTests( IConfig const& config, std::vector<TestCase> const& unsortedTestCases ) {
+    namespace {
+        struct TestHasher {
+            explicit TestHasher(Catch::SimplePcg32& rng_instance) {
+                basis = rng_instance();
+                basis <<= 32;
+                basis |= rng_instance();
+            }
 
-        std::vector<TestCase> sorted = unsortedTestCases;
+            uint64_t basis;
 
+            uint64_t operator()(TestCase const& t) const {
+                // Modified FNV-1a hash
+                static constexpr uint64_t prime = 1099511628211;
+                uint64_t hash = basis;
+                for (const char c : t.name) {
+                    hash ^= c;
+                    hash *= prime;
+                }
+                return hash;
+            }
+        };
+    } // end unnamed namespace
+
+    std::vector<TestCase> sortTests( IConfig const& config, std::vector<TestCase> const& unsortedTestCases ) {
         switch( config.runOrder() ) {
-            case RunTests::InLexicographicalOrder:
-                std::sort( sorted.begin(), sorted.end() );
-                break;
-            case RunTests::InRandomOrder:
-                seedRng( config );
-                std::shuffle( sorted.begin(), sorted.end(), rng() );
-                break;
             case RunTests::InDeclarationOrder:
                 // already in declaration order
                 break;
+
+            case RunTests::InLexicographicalOrder: {
+                std::vector<TestCase> sorted = unsortedTestCases;
+                std::sort( sorted.begin(), sorted.end() );
+                return sorted;
+            }
+
+            case RunTests::InRandomOrder: {
+                seedRng( config );
+                TestHasher h( rng() );
+
+                using hashedTest = std::pair<uint64_t, TestCase const*>;
+                std::vector<hashedTest> indexed_tests;
+                indexed_tests.reserve( unsortedTestCases.size() );
+
+                for (auto const& testCase : unsortedTestCases) {
+                    indexed_tests.emplace_back(h(testCase), &testCase);
+                }
+
+                std::sort(indexed_tests.begin(), indexed_tests.end(),
+                          [](hashedTest const& lhs, hashedTest const& rhs) {
+                          if (lhs.first == rhs.first) {
+                              return lhs.second->name < rhs.second->name;
+                          }
+                          return lhs.first < rhs.first;
+                });
+
+                std::vector<TestCase> sorted;
+                sorted.reserve( indexed_tests.size() );
+
+                for (auto const& hashed : indexed_tests) {
+                    sorted.emplace_back(*hashed.second);
+                }
+
+                return sorted;
+            }
         }
-        return sorted;
+        return unsortedTestCases;
     }
 
     bool isThrowSafe( TestCase const& testCase, IConfig const& config ) {
@@ -14105,15 +14326,12 @@ namespace TestCaseTracking {
         m_currentTracker = tracker;
     }
 
-    TrackerBase::TrackerBase( NameAndLocation const& nameAndLocation, TrackerContext& ctx, ITracker* parent )
-    :   m_nameAndLocation( nameAndLocation ),
+    TrackerBase::TrackerBase( NameAndLocation const& nameAndLocation, TrackerContext& ctx, ITracker* parent ):
+        ITracker(nameAndLocation),
         m_ctx( ctx ),
         m_parent( parent )
     {}
 
-    NameAndLocation const& TrackerBase::nameAndLocation() const {
-        return m_nameAndLocation;
-    }
     bool TrackerBase::isComplete() const {
         return m_runState == CompletedSuccessfully || m_runState == Failed;
     }
@@ -14229,7 +14447,8 @@ namespace TestCaseTracking {
     bool SectionTracker::isComplete() const {
         bool complete = true;
 
-        if ((m_filters.empty() || m_filters[0] == "")
+        if (m_filters.empty()
+            || m_filters[0] == ""
             || std::find(m_filters.begin(), m_filters.end(), m_trimmed_name) != m_filters.end()) {
             complete = TrackerBase::isComplete();
         }
@@ -14264,8 +14483,8 @@ namespace TestCaseTracking {
     void SectionTracker::addInitialFilters( std::vector<std::string> const& filters ) {
         if( !filters.empty() ) {
             m_filters.reserve( m_filters.size() + filters.size() + 2 );
-            m_filters.push_back(""); // Root - should never be consulted
-            m_filters.push_back(""); // Test Case - not a section filter
+            m_filters.emplace_back(""); // Root - should never be consulted
+            m_filters.emplace_back(""); // Test Case - not a section filter
             m_filters.insert( m_filters.end(), filters.begin(), filters.end() );
         }
     }
@@ -14274,6 +14493,14 @@ namespace TestCaseTracking {
             m_filters.insert( m_filters.end(), filters.begin()+1, filters.end() );
     }
 
+    std::vector<std::string> const& SectionTracker::getFilters() const {
+        return m_filters;
+    }
+
+    std::string const& SectionTracker::trimmedName() const {
+        return m_trimmed_name;
+    }
+
 } // namespace TestCaseTracking
 
 using TestCaseTracking::ITracker;
@@ -14562,6 +14789,7 @@ namespace Catch {
          m_pos = m_arg.size();
          m_substring.clear();
          m_patternName.clear();
+         m_realPatternPos = 0;
          return false;
       }
       endMode();
@@ -14580,6 +14808,7 @@ namespace Catch {
         }
 
         m_patternName.clear();
+        m_realPatternPos = 0;
 
         return token;
     }
@@ -15006,11 +15235,48 @@ namespace Catch {
 // end catch_totals.cpp
 // start catch_uncaught_exceptions.cpp
 
+// start catch_config_uncaught_exceptions.hpp
+
+//              Copyright Catch2 Authors
+// Distributed under the Boost Software License, Version 1.0.
+//   (See accompanying file LICENSE_1_0.txt or copy at
+//        https://www.boost.org/LICENSE_1_0.txt)
+
+// SPDX-License-Identifier: BSL-1.0
+
+#ifndef CATCH_CONFIG_UNCAUGHT_EXCEPTIONS_HPP
+#define CATCH_CONFIG_UNCAUGHT_EXCEPTIONS_HPP
+
+#if defined(_MSC_VER)
+#  if _MSC_VER >= 1900 // Visual Studio 2015 or newer
+#    define CATCH_INTERNAL_CONFIG_CPP17_UNCAUGHT_EXCEPTIONS
+#  endif
+#endif
+
+#include <exception>
+
+#if defined(__cpp_lib_uncaught_exceptions) \
+    && !defined(CATCH_INTERNAL_CONFIG_CPP17_UNCAUGHT_EXCEPTIONS)
+
+#  define CATCH_INTERNAL_CONFIG_CPP17_UNCAUGHT_EXCEPTIONS
+#endif // __cpp_lib_uncaught_exceptions
+
+#if defined(CATCH_INTERNAL_CONFIG_CPP17_UNCAUGHT_EXCEPTIONS) \
+    && !defined(CATCH_CONFIG_NO_CPP17_UNCAUGHT_EXCEPTIONS) \
+    && !defined(CATCH_CONFIG_CPP17_UNCAUGHT_EXCEPTIONS)
+
+#  define CATCH_CONFIG_CPP17_UNCAUGHT_EXCEPTIONS
+#endif
+
+#endif // CATCH_CONFIG_UNCAUGHT_EXCEPTIONS_HPP
+// end catch_config_uncaught_exceptions.hpp
 #include <exception>
 
 namespace Catch {
     bool uncaught_exceptions() {
-#if defined(CATCH_CONFIG_CPP17_UNCAUGHT_EXCEPTIONS)
+#if defined(CATCH_CONFIG_DISABLE_EXCEPTIONS)
+        return false;
+#elif defined(CATCH_CONFIG_CPP17_UNCAUGHT_EXCEPTIONS)
         return std::uncaught_exceptions() > 0;
 #else
         return std::uncaught_exception();
@@ -15050,7 +15316,7 @@ namespace Catch {
     }
 
     Version const& libraryVersion() {
-        static Version version( 2, 11, 0, "", 0 );
+        static Version version( 2, 13, 3, "", 0 );
         return version;
     }
 
@@ -15100,8 +15366,6 @@ namespace Catch {
 #include <iomanip>
 #include <type_traits>
 
-using uchar = unsigned char;
-
 namespace Catch {
 
 namespace {
@@ -15174,7 +15438,7 @@ namespace {
         // (see: http://www.w3.org/TR/xml/#syntax)
 
         for( std::size_t idx = 0; idx < m_str.size(); ++ idx ) {
-            uchar c = m_str[idx];
+            unsigned char c = m_str[idx];
             switch (c) {
             case '<':   os << "&lt;"; break;
             case '&':   os << "&amp;"; break;
@@ -15234,7 +15498,7 @@ namespace {
                 bool valid = true;
                 uint32_t value = headerValue(c);
                 for (std::size_t n = 1; n < encBytes; ++n) {
-                    uchar nc = m_str[idx + n];
+                    unsigned char nc = m_str[idx + n];
                     valid &= ((nc & 0xC0) == 0x80);
                     value = (value << 6) | (nc & 0x3F);
                 }
@@ -15454,6 +15718,17 @@ namespace Catch {
         return std::string(buffer);
     }
 
+    bool shouldShowDuration( IConfig const& config, double duration ) {
+        if ( config.showDurations() == ShowDurations::Always ) {
+            return true;
+        }
+        if ( config.showDurations() == ShowDurations::Never ) {
+            return false;
+        }
+        const double min = config.minDuration();
+        return min >= 0 && duration >= min;
+    }
+
     std::string serializeFilters( std::vector<std::string> const& container ) {
         ReusableStringStream oss;
         bool first = true;
@@ -15720,10 +15995,6 @@ class AssertionPrinter {
             return "Reports test results on a single line, suitable for IDEs";
         }
 
-        ReporterPreferences CompactReporter::getPreferences() const {
-            return m_reporterPrefs;
-        }
-
         void CompactReporter::noMatchingTestCases( std::string const& spec ) {
             stream << "No test cases matched '" << spec << '\'' << std::endl;
         }
@@ -15750,8 +16021,9 @@ class AssertionPrinter {
         }
 
         void CompactReporter::sectionEnded(SectionStats const& _sectionStats) {
-            if (m_config->showDurations() == ShowDurations::Always) {
-                stream << getFormattedDuration(_sectionStats.durationInSeconds) << " s: " << _sectionStats.sectionInfo.name << std::endl;
+            double dur = _sectionStats.durationInSeconds;
+            if ( shouldShowDuration( *m_config, dur ) ) {
+                stream << getFormattedDuration( dur ) << " s: " << _sectionStats.sectionInfo.name << std::endl;
             }
         }
 
@@ -15963,15 +16235,11 @@ class Duration {
     static const uint64_t s_nanosecondsInASecond = 1000 * s_nanosecondsInAMillisecond;
     static const uint64_t s_nanosecondsInAMinute = 60 * s_nanosecondsInASecond;
 
-    uint64_t m_inNanoseconds;
+    double m_inNanoseconds;
     Unit m_units;
 
 public:
-	explicit Duration(double inNanoseconds, Unit units = Unit::Auto)
-        : Duration(static_cast<uint64_t>(inNanoseconds), units) {
-    }
-
-    explicit Duration(uint64_t inNanoseconds, Unit units = Unit::Auto)
+    explicit Duration(double inNanoseconds, Unit units = Unit::Auto)
         : m_inNanoseconds(inNanoseconds),
         m_units(units) {
         if (m_units == Unit::Auto) {
@@ -16000,7 +16268,7 @@ class Duration {
         case Unit::Minutes:
             return m_inNanoseconds / static_cast<double>(s_nanosecondsInAMinute);
         default:
-            return static_cast<double>(m_inNanoseconds);
+            return m_inNanoseconds;
         }
     }
     auto unitsAsString() const -> std::string {
@@ -16119,7 +16387,7 @@ ConsoleReporter::ConsoleReporter(ReporterConfig const& config)
         else
         {
             return{
-                { "benchmark name", CATCH_CONFIG_CONSOLE_WIDTH - 32, ColumnInfo::Left },
+                { "benchmark name", CATCH_CONFIG_CONSOLE_WIDTH - 43, ColumnInfo::Left },
                 { "samples      mean       std dev", 14, ColumnInfo::Right },
                 { "iterations   low mean   low std dev", 14, ColumnInfo::Right },
                 { "estimated    high mean  high std dev", 14, ColumnInfo::Right }
@@ -16175,8 +16443,9 @@ void ConsoleReporter::sectionEnded(SectionStats const& _sectionStats) {
             stream << "\nNo assertions in test case";
         stream << " '" << _sectionStats.sectionInfo.name << "'\n" << std::endl;
     }
-    if (m_config->showDurations() == ShowDurations::Always) {
-        stream << getFormattedDuration(_sectionStats.durationInSeconds) << " s: " << _sectionStats.sectionInfo.name << std::endl;
+    double dur = _sectionStats.durationInSeconds;
+    if (shouldShowDuration(*m_config, dur)) {
+        stream << getFormattedDuration(dur) << " s: " << _sectionStats.sectionInfo.name << std::endl;
     }
     if (m_headerPrinted) {
         m_headerPrinted = false;
@@ -16436,8 +16705,10 @@ void ConsoleReporter::printSummaryDivider() {
 }
 
 void ConsoleReporter::printTestFilters() {
-    if (m_config->testSpec().hasFilters())
-        stream << Colour(Colour::BrightYellow) << "Filters: " << serializeFilters( m_config->getTestsOrTags() ) << '\n';
+    if (m_config->testSpec().hasFilters()) {
+        Colour guard(Colour::BrightYellow);
+        stream << "Filters: " << serializeFilters(m_config->getTestsOrTags()) << '\n';
+    }
 }
 
 CATCH_REGISTER_REPORTER("console", ConsoleReporter)
@@ -16633,6 +16904,11 @@ namespace Catch {
                 xml.writeAttribute( "name", name );
             }
             xml.writeAttribute( "time", ::Catch::Detail::stringify( sectionNode.stats.durationInSeconds ) );
+            // This is not ideal, but it should be enough to mimic gtest's
+            // junit output.
+            // Ideally the JUnit reporter would also handle `skipTest`
+            // events and write those out appropriately.
+            xml.writeAttribute( "status", "run" );
 
             writeAssertions( sectionNode );
 
@@ -16663,11 +16939,7 @@ namespace Catch {
                     elementName = "error";
                     break;
                 case ResultWas::ExplicitFailure:
-                    elementName = "failure";
-                    break;
                 case ResultWas::ExpressionFailed:
-                    elementName = "failure";
-                    break;
                 case ResultWas::DidntThrowException:
                     elementName = "failure";
                     break;
@@ -17071,6 +17343,10 @@ namespace Catch {
             .writeAttribute( "successes", testGroupStats.totals.assertions.passed )
             .writeAttribute( "failures", testGroupStats.totals.assertions.failed )
             .writeAttribute( "expectedFailures", testGroupStats.totals.assertions.failedButOk );
+        m_xml.scopedElement( "OverallResultsCases")
+            .writeAttribute( "successes", testGroupStats.totals.testCases.passed )
+            .writeAttribute( "failures", testGroupStats.totals.testCases.failed )
+            .writeAttribute( "expectedFailures", testGroupStats.totals.testCases.failedButOk );
         m_xml.endElement();
     }
 
@@ -17080,6 +17356,10 @@ namespace Catch {
             .writeAttribute( "successes", testRunStats.totals.assertions.passed )
             .writeAttribute( "failures", testRunStats.totals.assertions.failed )
             .writeAttribute( "expectedFailures", testRunStats.totals.assertions.failedButOk );
+        m_xml.scopedElement( "OverallResultsCases")
+            .writeAttribute( "successes", testRunStats.totals.testCases.passed )
+            .writeAttribute( "failures", testRunStats.totals.testCases.failed )
+            .writeAttribute( "expectedFailures", testRunStats.totals.testCases.failedButOk );
         m_xml.endElement();
     }
 
@@ -17093,16 +17373,16 @@ namespace Catch {
         m_xml.writeAttribute("samples", info.samples)
             .writeAttribute("resamples", info.resamples)
             .writeAttribute("iterations", info.iterations)
-            .writeAttribute("clockResolution", static_cast<uint64_t>(info.clockResolution))
-            .writeAttribute("estimatedDuration", static_cast<uint64_t>(info.estimatedDuration))
+            .writeAttribute("clockResolution", info.clockResolution)
+            .writeAttribute("estimatedDuration", info.estimatedDuration)
             .writeComment("All values in nano seconds");
     }
 
     void XmlReporter::benchmarkEnded(BenchmarkStats<> const& benchmarkStats) {
         m_xml.startElement("mean")
-            .writeAttribute("value", static_cast<uint64_t>(benchmarkStats.mean.point.count()))
-            .writeAttribute("lowerBound", static_cast<uint64_t>(benchmarkStats.mean.lower_bound.count()))
-            .writeAttribute("upperBound", static_cast<uint64_t>(benchmarkStats.mean.upper_bound.count()))
+            .writeAttribute("value", benchmarkStats.mean.point.count())
+            .writeAttribute("lowerBound", benchmarkStats.mean.lower_bound.count())
+            .writeAttribute("upperBound", benchmarkStats.mean.upper_bound.count())
             .writeAttribute("ci", benchmarkStats.mean.confidence_interval);
         m_xml.endElement();
         m_xml.startElement("standardDeviation")
@@ -17153,7 +17433,7 @@ namespace Catch {
 
 #ifndef __OBJC__
 
-#if defined(CATCH_CONFIG_WCHAR) && defined(WIN32) && defined(_UNICODE) && !defined(DO_NOT_USE_WMAIN)
+#if defined(CATCH_CONFIG_WCHAR) && defined(CATCH_PLATFORM_WINDOWS) && defined(_UNICODE) && !defined(DO_NOT_USE_WMAIN)
 // Standard C/C++ Win32 Unicode wmain entry point
 extern "C" int wmain (int argc, wchar_t * argv[], wchar_t * []) {
 #else
diff --git a/thirdParty/cupla/cuplaConfig.cmake b/thirdParty/cupla/cuplaConfig.cmake
index 60826daa34..636f97cd5d 100644
--- a/thirdParty/cupla/cuplaConfig.cmake
+++ b/thirdParty/cupla/cuplaConfig.cmake
@@ -22,7 +22,7 @@
 # Required cmake version.
 ################################################################################
 
-cmake_minimum_required(VERSION 3.11.4)
+cmake_minimum_required(VERSION 3.15.0)
 
 ################################################################################
 # CMake policies
@@ -114,35 +114,18 @@ set_property(CACHE cupla_ALPAKA_PROVIDER PROPERTY STRINGS "intern;extern")
 mark_as_advanced(cupla_ALPAKA_PROVIDER)
 
 if(${cupla_ALPAKA_PROVIDER} STREQUAL "intern")
-
-    find_package(alpaka
-        PATHS "${_cupla_ROOT_DIR}/alpaka"
-        NO_DEFAULT_PATH
-        NO_CMAKE_ENVIRONMENT_PATH
-        NO_CMAKE_PATH
-        NO_SYSTEM_ENVIRONMENT_PATH
-        NO_CMAKE_PACKAGE_REGISTRY
-        NO_CMAKE_BUILDS_PATH
-        NO_CMAKE_SYSTEM_PATH
-        NO_CMAKE_SYSTEM_PACKAGE_REGISTRY
-        NO_CMAKE_FIND_ROOT_PATH
-    )
+    set(alpaka_BUILD_EXAMPLES OFF)
+    set(BUILD_TESTING OFF)
+    add_subdirectory(${_cupla_ROOT_DIR}/alpaka ${CMAKE_BINARY_DIR}/alpaka)
 else()
     find_package(alpaka HINTS $ENV{ALPAKA_ROOT})
 endif()
 
-if(NOT alpaka_FOUND)
+if(NOT TARGET alpaka::alpaka)
     message(WARNING "Required cupla dependency alpaka could not be found!")
     set(_cupla_FOUND FALSE)
-else()
-    # TODO: use imported targets instead of chain of variables
-    list(APPEND _cupla_COMPILE_OPTIONS_PUBLIC ${alpaka_COMPILE_OPTIONS})
-    list(APPEND _cupla_COMPILE_DEFINITIONS_PUBLIC ${alpaka_COMPILE_DEFINITIONS})
-    list(APPEND _cupla_INCLUDE_DIRECTORIES_PUBLIC ${alpaka_INCLUDE_DIRS})
-    list(APPEND _cupla_LINK_LIBRARIES_PUBLIC ${alpaka_LIBRARIES})
 endif()
 
-
 ################################################################################
 # Compiler settings.
 ################################################################################
@@ -212,15 +195,6 @@ alpaka_add_library(
 # Even if there are no sources CMAKE has to know the language.
 set_target_properties("cupla" PROPERTIES LINKER_LANGUAGE CXX)
 
-# properties
-target_compile_features("cupla"
-    PUBLIC cxx_std_11
-    )
-set_target_properties("cupla" PROPERTIES
-    CXX_EXTENSIONS OFF
-    CXX_STANDARD_REQUIRED ON
-    )
-
 # Compile options.
 message(STATUS "_cupla_COMPILE_OPTIONS_PUBLIC: ${_cupla_COMPILE_OPTIONS_PUBLIC}")
 list(
@@ -259,21 +233,16 @@ endif()
 
 # Link libraries.
 message(STATUS "_cupla_LINK_LIBRARIES_PUBLIC: ${_cupla_LINK_LIBRARIES_PUBLIC}")
-list(
-    LENGTH
-    _cupla_LINK_LIBRARIES_PUBLIC
-    _cupla_LINK_LIBRARIES_PUBLIC_LENGTH)
-if("${_cupla_LINK_LIBRARIES_PUBLIC_LENGTH}")
-    target_link_libraries(
-        "cupla"
-        PUBLIC alpaka ${_cupla_LINK_LIBRARIES_PUBLIC})
-endif()
+
+target_link_libraries(
+    "cupla"
+    PUBLIC alpaka::alpaka ${_cupla_LINK_LIBRARIES_PUBLIC})
 
 ################################################################################
 # Find cupla version.
 ################################################################################
 # Please also update the version in `include/cupla/version.hpp`
-set(_cupla_VERSION "0.2.0")
+set(_cupla_VERSION "0.3.0")
 
 ################################################################################
 # Set return values.
diff --git a/thirdParty/cupla/doc/ConfigurationHeader.md b/thirdParty/cupla/doc/ConfigurationHeader.md
index 0cefa25af3..c329d960fc 100644
--- a/thirdParty/cupla/doc/ConfigurationHeader.md
+++ b/thirdParty/cupla/doc/ConfigurationHeader.md
@@ -12,7 +12,7 @@ The definitions must be passed via a compiler flag or be defined before the acce
 The default value will be used if the configuration header is included without defining any of the following options.
 
 - `CUPLA_STREAM_ASYNC_ENABLED`: `0` use synchronous streams (default), `1` use asynchronous streams
-- `CUPLA_HEADER_ONLY`: `1` *cupla* will be used as header-only library (default), otherwise you must compile all `.cpp` files in [`src/`](https://github.com/ComputationalRadiationPhysics/cupla/tree/master/src)
+- `CUPLA_HEADER_ONLY`: `1` *cupla* will be used as header-only library (default), otherwise you must compile all `.cpp` files in [`src/`](https://github.com/alpaka-group/cupla/tree/master/src)
 
 
 linker dependencies
@@ -24,7 +24,7 @@ Depending of the used accelerator you must link the library `pthread` and/or act
 Example
 =======
 
-To select an accelerator you must include the corresponding accelerator header from [`cupla/config/`](https://github.com/ComputationalRadiationPhysics/cupla/tree/master/include/cupla/config)
+To select an accelerator you must include the corresponding accelerator header from [`cupla/config/`](https://github.com/alpaka-group/cupla/tree/master/include/cupla/config)
 
 
 ```C++
diff --git a/thirdParty/cupla/doc/PortingGuide.md b/thirdParty/cupla/doc/PortingGuide.md
index 510f82ce59..05c1767180 100644
--- a/thirdParty/cupla/doc/PortingGuide.md
+++ b/thirdParty/cupla/doc/PortingGuide.md
@@ -2,7 +2,7 @@ Requirements to Port Your Project to *cupla*
 ============================================
 
 - your build system must be `CMake`
-- your code must be compileable with C++11
+- your code must be compatible with C++14
 
 
 Reserved Variable Names
@@ -176,7 +176,7 @@ Porting Step by Step
        auto result = deviceFunction( acc, x );
 
 - Cupla code can be mixed with
-  [**alpaka**](https://github.com/ComputationalRadiationPhysics/alpaka)
+  [**alpaka**](https://github.com/alpaka-group/alpaka)
   low level code. This becomes necessary as you are progressing to write more
   general, performance portable code. Additional functionality provided by
   alpaka includes for example, platform independent math functions inside
diff --git a/thirdParty/cupla/example/CUDASamples/asyncAPI/CMakeLists.txt b/thirdParty/cupla/example/CUDASamples/asyncAPI/CMakeLists.txt
index b63e43ba9c..ccecbd711b 100644
--- a/thirdParty/cupla/example/CUDASamples/asyncAPI/CMakeLists.txt
+++ b/thirdParty/cupla/example/CUDASamples/asyncAPI/CMakeLists.txt
@@ -22,7 +22,7 @@
 # Required CMake version.
 ################################################################################
 
-cmake_minimum_required(VERSION 3.11.4)
+cmake_minimum_required(VERSION 3.15.0)
 
 set_property(GLOBAL PROPERTY USE_FOLDERS ON)
 
diff --git a/thirdParty/cupla/example/CUDASamples/asyncAPI_tuned/CMakeLists.txt b/thirdParty/cupla/example/CUDASamples/asyncAPI_tuned/CMakeLists.txt
index e84fbb2aea..ba4452f4a1 100644
--- a/thirdParty/cupla/example/CUDASamples/asyncAPI_tuned/CMakeLists.txt
+++ b/thirdParty/cupla/example/CUDASamples/asyncAPI_tuned/CMakeLists.txt
@@ -23,7 +23,7 @@
 # Required CMake version.
 ################################################################################
 
-cmake_minimum_required(VERSION 3.11.4)
+cmake_minimum_required(VERSION 3.15.0)
 
 set_property(GLOBAL PROPERTY USE_FOLDERS ON)
 
diff --git a/thirdParty/cupla/example/CUDASamples/blackScholes/CMakeLists.txt b/thirdParty/cupla/example/CUDASamples/blackScholes/CMakeLists.txt
index 7cd8d38e41..eb2fbec7c2 100644
--- a/thirdParty/cupla/example/CUDASamples/blackScholes/CMakeLists.txt
+++ b/thirdParty/cupla/example/CUDASamples/blackScholes/CMakeLists.txt
@@ -23,7 +23,7 @@
 # Required CMake version.
 ################################################################################
 
-cmake_minimum_required(VERSION 3.11.4)
+cmake_minimum_required(VERSION 3.15.0)
 
 set_property(GLOBAL PROPERTY USE_FOLDERS ON)
 
diff --git a/thirdParty/cupla/example/CUDASamples/blackScholes/src/BlackScholes_gold.cpp b/thirdParty/cupla/example/CUDASamples/blackScholes/src/BlackScholes_gold.cpp
index f0efc5f664..a6be31a85d 100644
--- a/thirdParty/cupla/example/CUDASamples/blackScholes/src/BlackScholes_gold.cpp
+++ b/thirdParty/cupla/example/CUDASamples/blackScholes/src/BlackScholes_gold.cpp
@@ -12,7 +12,7 @@
 
 
 #include <math.h>
-
+#include <cupla.hpp>
 
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -28,10 +28,10 @@ static double CND(double d)
     const double RSQRT2PI = 0.39894228040143267793994605993438;
 
     double
-    K = 1.0 / (1.0 + 0.2316419 * fabs(d));
+    K = 1.0 / (1.0 + 0.2316419 * cupla::abs(d));
 
     double
-    cnd = RSQRT2PI * exp(- 0.5 * d * d) *
+    cnd = RSQRT2PI * cupla::exp(- 0.5 * d * d) *
           (K * (A1 + K * (A2 + K * (A3 + K * (A4 + K * A5)))));
 
     if (d > 0)
@@ -56,8 +56,8 @@ static void BlackScholesBodyCPU(
 {
     double S = Sf, X = Xf, T = Tf, R = Rf, V = Vf;
 
-    double sqrtT = sqrt(T);
-    double    d1 = (log(S / X) + (R + 0.5 * V * V) * T) / (V * sqrtT);
+    double sqrtT = cupla::sqrt(T);
+    double    d1 = (cupla::log(S / X) + (R + 0.5 * V * V) * T) / (V * sqrtT);
     double    d2 = d1 - V * sqrtT;
     double CNDD1 = CND(d1);
     double CNDD2 = CND(d2);
diff --git a/thirdParty/cupla/example/CUDASamples/blackScholes/src/BlackScholes_kernel.cuh b/thirdParty/cupla/example/CUDASamples/blackScholes/src/BlackScholes_kernel.cuh
index 050b4a15d3..ede25a9856 100644
--- a/thirdParty/cupla/example/CUDASamples/blackScholes/src/BlackScholes_kernel.cuh
+++ b/thirdParty/cupla/example/CUDASamples/blackScholes/src/BlackScholes_kernel.cuh
@@ -13,7 +13,7 @@
 
 #include <cuda_to_cupla.hpp>
 #include <stdio.h>
-#ifndef __CUDACC__
+#if !(BOOST_LANG_CUDA || BOOST_LANG_HIP)
 struct float2{
     float x;
     float y;
@@ -39,8 +39,8 @@ float cndGPU(T_Acc const & acc, float d)
     const float RSQRT2PI = 0.39894228040143267793994605993438f;
 
     float
-    K = __fdividef(1.0f, (1.0f + 0.2316419f * fabsf(d)));
-    float cnd = RSQRT2PI * __expf(- 0.5f * d * d) *
+    K = __fdividef(1.0f, (1.0f + 0.2316419f * cupla::abs(d)));
+    float cnd = RSQRT2PI * cupla::exp(- 0.5f * d * d) *
           (K * (A1 + K * (A2 + K * (A3 + K * (A4 + K * A5)))));
     if (d > 0)
         cnd = 1.0f - cnd;
@@ -66,8 +66,8 @@ ALPAKA_FN_ACC void BlackScholesBodyGPU(
 {
     float sqrtT, expRT;
     float d1, d2, CNDD1, CNDD2;
-    sqrtT = sqrtf(T); /// __fdividef(1.0F, rsqrtf(T));
-    d1 = __fdividef(__logf(S / X) + (R + 0.5f * V * V) * T, V * sqrtT);
+    sqrtT = cupla::sqrt(T); /// __fdividef(1.0F, rsqrtf(T));
+    d1 = __fdividef(cupla::log(S / X) + (R + 0.5f * V * V) * T, V * sqrtT);
 
     d2 = d1 - V * sqrtT;
 
@@ -75,7 +75,7 @@ ALPAKA_FN_ACC void BlackScholesBodyGPU(
     CNDD2 = cndGPU(acc, d2);
 
     //Calculate Call and Put simultaneously
-    expRT = __expf(- R * T);
+    expRT = cupla::exp(- R * T);
     CallResult = S * CNDD1 - X * expRT * CNDD2;
     PutResult  = X * expRT * (1.0f - CNDD2) - S * (1.0f - CNDD1);
 }
diff --git a/thirdParty/cupla/example/CUDASamples/cuplaVectorAdd/CMakeLists.txt b/thirdParty/cupla/example/CUDASamples/cuplaVectorAdd/CMakeLists.txt
new file mode 100644
index 0000000000..8411cfa57b
--- /dev/null
+++ b/thirdParty/cupla/example/CUDASamples/cuplaVectorAdd/CMakeLists.txt
@@ -0,0 +1,67 @@
+#
+# Copyright 2016-2020 Rene Widera, Benjamin Worpitz, Vincent Ridder
+#
+# This file is part of cupla.
+#
+# cupla is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# cupla is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public License
+# along with cupla.
+# If not, see <http://www.gnu.org/licenses/>.
+#
+
+################################################################################
+# Required CMake version (kept in sync with cuplaConfig.cmake and the other examples).
+################################################################################
+
+cmake_minimum_required(VERSION 3.15.0)
+
+set_property(GLOBAL PROPERTY USE_FOLDERS ON)
+
+################################################################################
+# Project.
+################################################################################
+
+project(cuplaVectorAdd)
+
+################################################################################
+# CMake policies
+#
+# Search in <PackageName>_ROOT:
+#   https://cmake.org/cmake/help/v3.12/policy/CMP0074.html#
+################################################################################
+
+if(POLICY CMP0074)
+    cmake_policy(SET CMP0074 NEW)
+endif()
+
+################################################################################
+# Find cupla
+################################################################################
+
+set(cupla_ROOT "$ENV{CUPLA_ROOT}" CACHE STRING  "The location of the cupla library")
+
+list(APPEND CMAKE_MODULE_PATH "${cupla_ROOT}")
+find_package(cupla REQUIRED)
+
+
+################################################################################
+# Add executable.
+################################################################################
+
+set(_SOURCE_DIR "src/")
+
+# Add all the source files in all recursive subdirectories and group them accordingly.
+append_recursive_files_add_to_src_group("${_SOURCE_DIR}" "" "cpp" _FILES_SOURCE_CXX)
+
+# Always add all files to the target executable build call to add them to the build project.
+cupla_add_executable(${PROJECT_NAME} ${_FILES_SOURCE_CXX})
+
diff --git a/thirdParty/cupla/example/CUDASamples/cuplaVectorAdd/README.md b/thirdParty/cupla/example/CUDASamples/cuplaVectorAdd/README.md
new file mode 100644
index 0000000000..96a1779c72
--- /dev/null
+++ b/thirdParty/cupla/example/CUDASamples/cuplaVectorAdd/README.md
@@ -0,0 +1,9 @@
+# vector add example with native cupla interface
+
+This example is equivalent to `vectorAdd` but does not rely on the compatibility header (`cuda_to_cupla.hpp`)
+that allows the usage of CUDA function names and types.
+
+CUDA-prefixed functions/types are prefixed with `cupla` instead.
+CUDA functions/types that are not prefixed live in the namespace `cupla`.
+Function calls always need the currently used accelerator instance.
+Non-standard global variables like `threadIdx` and `blockDim` should be used as functions from the namespace `cupla`.
\ No newline at end of file
diff --git a/thirdParty/cupla/example/CUDASamples/cuplaVectorAdd/src/vectorAdd.cpp b/thirdParty/cupla/example/CUDASamples/cuplaVectorAdd/src/vectorAdd.cpp
new file mode 100644
index 0000000000..4c91d53f41
--- /dev/null
+++ b/thirdParty/cupla/example/CUDASamples/cuplaVectorAdd/src/vectorAdd.cpp
@@ -0,0 +1,288 @@
+/* Copyright 1993-2015 NVIDIA Corporation.  All rights reserved.
+ *
+ * Please refer to the NVIDIA end user license agreement (EULA) associated
+ * with this source code for terms and conditions that govern your use of
+ * this software. Any use, reproduction, disclosure, or distribution of
+ * this software and related documentation outside the terms of the EULA
+ * is strictly prohibited.
+ *
+ */
+
+/** @file Vector addition: C = A + B.
+ *
+ * This sample is a very basic sample that implements element by element
+ * vector addition. It is the same as the sample illustrating Chapter 2
+ * of the programming guide with some additions like error checking.
+ */
+
+#include <stdio.h>
+#include <iostream> //std:cout
+// For the CUDA runtime routines (prefixed with "cupla_")
+#include <cupla.hpp>
+//Timer for test purpose
+#include <chrono>
+#include <boost/lexical_cast.hpp>
+#include <vector>
+/**
+ * Kernel functor: computes the element-wise sum C = A + B for numElements floats.
+ *
+ * Each thread handles the contiguous index range [begin, end), which spans
+ * threadDim(acc).x elements and is clamped to numElements.
+ */
+struct vectorAdd {
+    template<typename T_Acc>
+    ALPAKA_FN_HOST_ACC
+    void operator()(T_Acc const &acc, const float *A, const float *B, float *C, const int numElements) const {
+        int begin = cupla::blockDim(acc).x * cupla::blockIdx(acc).x * cupla::threadDim(acc).x + cupla::threadIdx(acc).x * cupla::threadDim(acc).x;
+        if (begin < numElements) {
+            int end = (begin + cupla::threadDim(acc).x < numElements) ? begin+cupla::threadDim(acc).x : numElements;
+            for (int i=begin; i <end; ++i) {
+                C[i] = A[i] + B[i];
+            }
+        }
+    }
+};
+
+void benchmarkTest(int first, int last , int stepSize);
+/**
+ * Host main routine: runs one verified vector addition, then benchmarkTest() (optional args: first last [stepSize]).
+ */
+int
+main(int argc, char *argv[])
+{
+    // Error code to check return values of cupla calls
+    cuplaError_t err = cuplaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    size_t size = numElements * sizeof(float);
+    printf("[Vector addition of %d elements]\n", numElements);
+
+    // Allocate the host input vector A
+    float *h_A = (float *)malloc(size);
+
+    // Allocate the host input vector B
+    float *h_B = (float *)malloc(size);
+
+    // Allocate the host output vector C
+    float *h_C = (float *)malloc(size);
+
+    // Verify that allocations succeeded
+    if (h_A == NULL || h_B == NULL || h_C == NULL)
+    {
+        fprintf(stderr, "Failed to allocate host vectors!\n");
+        exit(EXIT_FAILURE);
+    }
+
+    // Initialize the host input vectors with random values in [0, 1]
+    for (int i = 0; i < numElements; ++i)
+    {
+        h_A[i] = rand()/(float)RAND_MAX;
+        h_B[i] = rand()/(float)RAND_MAX;
+    }
+
+    // Allocate the device input vector A
+    float *d_A = NULL;
+    err = cuplaMalloc((void **)&d_A, size);
+
+    if (err != cuplaSuccess)
+    {
+        fprintf(stderr, "Failed to allocate device vector A (error code %s)!\n", cuplaGetErrorString(err));
+        exit(EXIT_FAILURE);
+    }
+
+    // Allocate the device input vector B
+    float *d_B = NULL;
+    err = cuplaMalloc((void **)&d_B, size);
+
+    if (err != cuplaSuccess)
+    {
+        fprintf(stderr, "Failed to allocate device vector B (error code %s)!\n", cuplaGetErrorString(err));
+        exit(EXIT_FAILURE);
+    }
+
+    // Allocate the device output vector C
+    float *d_C = NULL;
+    err = cuplaMalloc((void **)&d_C, size);
+
+    if (err != cuplaSuccess)
+    {
+        fprintf(stderr, "Failed to allocate device vector C (error code %s)!\n", cuplaGetErrorString(err));
+        exit(EXIT_FAILURE);
+    }
+
+    // Copy the host input vectors A and B in host memory to the device input vectors in
+    // device memory
+    printf("Copy input data from the host memory to the CUDA device\n");
+    err = cuplaMemcpy(d_A, h_A, size, cuplaMemcpyHostToDevice);
+
+    if (err != cuplaSuccess)
+    {
+        fprintf(stderr, "Failed to copy vector A from host to device (error code %s)!\n", cuplaGetErrorString(err));
+        exit(EXIT_FAILURE);
+    }
+
+    err = cuplaMemcpy(d_B, h_B, size, cuplaMemcpyHostToDevice);
+
+    if (err != cuplaSuccess)
+    {
+        fprintf(stderr, "Failed to copy vector B from host to device (error code %s)!\n", cuplaGetErrorString(err));
+        exit(EXIT_FAILURE);
+    }
+
+    // Launch the vectorAdd kernel with enough blocks to cover all elements (rounded up)
+    int threadsPerBlock = 256;
+    int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
+    printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid, threadsPerBlock);
+    CUPLA_KERNEL_OPTI(vectorAdd)(blocksPerGrid, threadsPerBlock,0,0)(d_A, d_B, d_C, numElements);
+    err = cuplaGetLastError();
+
+    if (err != cuplaSuccess)
+    {
+        fprintf(stderr, "Failed to launch vectorAdd kernel (error code %s)!\n", cuplaGetErrorString(err));
+        exit(EXIT_FAILURE);
+    }
+
+    // Copy the device result vector in device memory to the host result vector
+    // in host memory.
+    printf("Copy output data from the CUDA device to the host memory\n");
+    err = cuplaMemcpy(h_C, d_C, size, cuplaMemcpyDeviceToHost);
+
+    if (err != cuplaSuccess)
+    {
+        fprintf(stderr, "Failed to copy vector C from device to host (error code %s)!\n", cuplaGetErrorString(err));
+        exit(EXIT_FAILURE);
+    }
+
+    // Verify that the result vector matches the host-side sum within an absolute tolerance of 1e-5
+    for (int i = 0; i < numElements; ++i)
+    {
+        if (fabs(h_A[i] + h_B[i] - h_C[i]) > 1e-5)
+        {
+            fprintf(stderr, "Result verification failed at element %d!\n", i);
+            exit(EXIT_FAILURE);
+        }
+    }
+
+    printf("Test PASSED\n");
+
+    // Free device global memory
+    err = cuplaFree(d_A);
+
+    if (err != cuplaSuccess)
+    {
+        fprintf(stderr, "Failed to free device vector A (error code %s)!\n", cuplaGetErrorString(err));
+        exit(EXIT_FAILURE);
+    }
+
+    err = cuplaFree(d_B);
+
+    if (err != cuplaSuccess)
+    {
+        fprintf(stderr, "Failed to free device vector B (error code %s)!\n", cuplaGetErrorString(err));
+        exit(EXIT_FAILURE);
+    }
+    err = cuplaFree(d_C);
+
+    if (err != cuplaSuccess)
+    {
+        fprintf(stderr, "Failed to free device vector C (error code %s)!\n", cuplaGetErrorString(err));
+        exit(EXIT_FAILURE);
+    }
+
+    // Free host memory
+    free(h_A);
+    free(h_B);
+    free(h_C);
+
+    // Reset the device. NOTE(review): the program does not exit here; benchmarkTest() below
+    // uses the device again after this reset - confirm that this is intended.
+    // cuplaDeviceReset causes the driver to clean up all state. While
+    // not mandatory in normal operation, it is good practice.  It is also
+    // needed to ensure correct operation when the application is being
+    // profiled.
+    err = cuplaDeviceReset();
+
+    if (err != cuplaSuccess)
+    {
+        fprintf(stderr, "Failed to deinitialize the device! error=%s\n", cuplaGetErrorString(err));
+        exit(EXIT_FAILURE);
+    }
+    printf("Done\n");
+
+    using boost::lexical_cast;
+    using boost::bad_lexical_cast;
+    std::vector<int> args;
+    while (*++argv){ // skip argv[0]; the loop stops at the terminating null pointer
+        try{
+            args.push_back(lexical_cast<int>(*argv));
+        }
+        catch( const bad_lexical_cast &){
+            args.push_back(0); // arguments that are not integers are recorded as 0
+        }
+    }
+    // Run the benchmark test; the defaults below apply unless at least two arguments were given
+    int first = 50000;
+    int last = 100000;
+    int stepSize= 50000;
+    if (args.size() >1){
+        first=args[0];
+        last=args[1];
+    }
+    if (args.size()>2){
+        stepSize=args[2];
+    }
+    benchmarkTest(first, last, stepSize);
+    cuplaDeviceReset();
+    return 0;
+}
+
+/** Benchmark the vectorAdd kernel for N = first, first + stepSize, ... while N <= last.
+ *  Prints the elapsed kernel time in milliseconds for each N to stdout. */
+void
+benchmarkTest(int first, int last, int stepSize)
+{
+    // A non-positive step never advances numElements, so the loop below would not terminate.
+    if (stepSize <= 0) {
+        fprintf(stderr, "benchmarkTest: stepSize must be positive (got %d)\n", stepSize);
+        return;
+    }
+    for (int numElements = first; numElements <=last ; numElements+= stepSize) {
+        std::cout <<"N= " <<numElements << "; ";
+        size_t size = numElements * sizeof(float);
+        // Allocate and randomly initialize the host input vectors.
+        float *h_A = (float *)malloc(size);
+        float *h_B = (float *)malloc(size);
+        if (size > 0 && (h_A == NULL || h_B == NULL)) {
+            fprintf(stderr, "Failed to allocate host vectors!\n");
+            exit(EXIT_FAILURE);
+        }
+        for (int i = 0; i < numElements; ++i) {
+            h_A[i] = rand()/(float)RAND_MAX;
+            h_B[i] = rand()/(float)RAND_MAX;
+        }
+        // Allocate the device vectors and upload the inputs.
+        float *d_A = NULL;
+        cuplaMalloc((void **) &d_A, size);
+        float *d_B = NULL;
+        cuplaMalloc((void **) &d_B, size);
+        float *d_C = NULL;
+        cuplaMalloc((void **) &d_C, size);
+        cuplaMemcpy(d_A, h_A, size, cuplaMemcpyHostToDevice);
+        cuplaMemcpy(d_B, h_B, size, cuplaMemcpyHostToDevice);
+        int threadsPerBlock=1024;
+        int blocksPerGrid= (numElements+threadsPerBlock-1)/threadsPerBlock;
+        // Time the kernel launch together with the synchronization that follows it.
+        const auto start = std::chrono::high_resolution_clock::now();
+        CUPLA_KERNEL_OPTI(vectorAdd)(blocksPerGrid, threadsPerBlock, 0, 0)(d_A, d_B, d_C, numElements);
+        cuplaDeviceSynchronize();
+        const auto end = std::chrono::high_resolution_clock::now();
+        std::cout << "Time: "<< std::chrono::duration_cast<std::chrono::milliseconds>(end-start).count() <<"ms"<<std::endl;
+        // Release the device buffers and the host buffers, so nothing leaks across iterations.
+        cuplaFree(d_A);
+        cuplaFree(d_B);
+        cuplaFree(d_C);
+        free(h_A); free(h_B);
+    }
+}
+
diff --git a/thirdParty/cupla/example/CUDASamples/matrixMul/CMakeLists.txt b/thirdParty/cupla/example/CUDASamples/matrixMul/CMakeLists.txt
index 5f3d94be6c..b9e42f9066 100644
--- a/thirdParty/cupla/example/CUDASamples/matrixMul/CMakeLists.txt
+++ b/thirdParty/cupla/example/CUDASamples/matrixMul/CMakeLists.txt
@@ -23,7 +23,7 @@
 # Required CMake version.
 ################################################################################
 
-cmake_minimum_required(VERSION 3.11.4)
+cmake_minimum_required(VERSION 3.15.0)
 
 set_property(GLOBAL PROPERTY USE_FOLDERS ON)
 
diff --git a/thirdParty/cupla/example/CUDASamples/vectorAdd/CMakeLists.txt b/thirdParty/cupla/example/CUDASamples/vectorAdd/CMakeLists.txt
index 53cea78d2f..b0662c486c 100644
--- a/thirdParty/cupla/example/CUDASamples/vectorAdd/CMakeLists.txt
+++ b/thirdParty/cupla/example/CUDASamples/vectorAdd/CMakeLists.txt
@@ -22,7 +22,7 @@
 # Required CMake version.
 ################################################################################
 
-cmake_minimum_required(VERSION 3.11.4)
+cmake_minimum_required(VERSION 3.15.0)
 
 set_property(GLOBAL PROPERTY USE_FOLDERS ON)
 
diff --git a/thirdParty/cupla/include/cuda_to_cupla.hpp b/thirdParty/cupla/include/cuda_to_cupla.hpp
index e36a37d05b..2070e92ccc 100644
--- a/thirdParty/cupla/include/cuda_to_cupla.hpp
+++ b/thirdParty/cupla/include/cuda_to_cupla.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2016 Rene Widera
+/* Copyright 2016-2020 Rene Widera
  *
  * This file is part of cupla.
  *
@@ -21,7 +21,9 @@
 
 #pragma once
 
-#include "cupla_runtime.hpp"
+#include "cupla.hpp"
+
+#include "cupla/device_functions.hpp"
 
 #include "cupla/cudaToCupla/driverTypes.hpp"
 #include "cupla/cudaToCupla/runtime.hpp"
diff --git a/thirdParty/cupla/include/cupla.hpp b/thirdParty/cupla/include/cupla.hpp
new file mode 100644
index 0000000000..85284e6d7c
--- /dev/null
+++ b/thirdParty/cupla/include/cupla.hpp
@@ -0,0 +1,25 @@
+/* Copyright 2020 Rene Widera
+ *
+ * This file is part of cupla.
+ *
+ * cupla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * cupla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with cupla.
+ * If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+
+#pragma once
+
+#include "cupla_runtime.hpp"
+#include "cupla/device_functions.hpp"
diff --git a/thirdParty/cupla/include/cupla/c/datatypes/cuplaExtent.hpp b/thirdParty/cupla/include/cupla/c/datatypes/cuplaExtent.hpp
index b0d8742ee4..59721eb160 100644
--- a/thirdParty/cupla/include/cupla/c/datatypes/cuplaExtent.hpp
+++ b/thirdParty/cupla/include/cupla/c/datatypes/cuplaExtent.hpp
@@ -52,7 +52,7 @@ struct cuplaExtent{
     >
     ALPAKA_FN_HOST_ACC
     cuplaExtent(
-        ::alpaka::vec::Vec<
+        ::alpaka::Vec<
             TDim,
             TSize
         > const &vec
@@ -65,12 +65,12 @@ struct cuplaExtent{
     }
 
     ALPAKA_FN_HOST_ACC
-    operator ::alpaka::vec::Vec<
+    operator ::alpaka::Vec<
         cupla::AlpakaDim< 3u >,
         cupla::MemSizeType
     >(void) const
     {
-        ::alpaka::vec::Vec<
+        ::alpaka::Vec<
             cupla::AlpakaDim< 3u >,
             cupla::MemSizeType
         > vec( depth, height, width );
@@ -83,8 +83,6 @@ struct cuplaExtent{
 
 namespace alpaka
 {
-namespace dim
-{
 namespace traits
 {
 
@@ -93,14 +91,11 @@ namespace traits
     struct DimType<
         cuplaExtent
     >{
-      using type = ::alpaka::dim::DimInt<3u>;
+      using type = ::alpaka::DimInt<3u>;
     };
 
 } // namespace traits
-} // namespace dim
 
-namespace elem
-{
 namespace traits
 {
 
@@ -113,7 +108,6 @@ namespace traits
     };
 
 } // namespace traits
-} // namspace elem
 
 namespace extent
 {
@@ -167,8 +161,6 @@ namespace traits
 } // namespace traits
 } // namespace extent
 
-namespace offset
-{
 namespace traits
 {
 
@@ -216,10 +208,7 @@ namespace traits
         }
     };
 } // namespace traits
-} // namespace offset
 
-namespace idx
-{
 namespace traits
 {
 
@@ -232,5 +221,4 @@ namespace traits
     };
 
 } // namespace traits
-} // namespace idx
 } // namespave alpaka
diff --git a/thirdParty/cupla/include/cupla/c/datatypes/cuplaPos.hpp b/thirdParty/cupla/include/cupla/c/datatypes/cuplaPos.hpp
index dd2850ef65..b99a26de34 100644
--- a/thirdParty/cupla/include/cupla/c/datatypes/cuplaPos.hpp
+++ b/thirdParty/cupla/include/cupla/c/datatypes/cuplaPos.hpp
@@ -52,7 +52,7 @@ struct cuplaPos{
     >
     ALPAKA_FN_HOST_ACC
     cuplaPos(
-        ::alpaka::vec::Vec<
+        ::alpaka::Vec<
             TDim,
             TSize
         > const &vec
@@ -65,12 +65,12 @@ struct cuplaPos{
     }
 
     ALPAKA_FN_HOST_ACC
-    operator ::alpaka::vec::Vec<
+    operator ::alpaka::Vec<
         cupla::AlpakaDim< 3u >,
         cupla::MemSizeType
     >(void) const
     {
-        ::alpaka::vec::Vec<
+        ::alpaka::Vec<
             cupla::AlpakaDim< 3u >,
             cupla::MemSizeType
         > vec( x, y, z );
@@ -82,8 +82,6 @@ struct cuplaPos{
 
 namespace alpaka
 {
-namespace dim
-{
 namespace traits
 {
 
@@ -92,14 +90,11 @@ namespace traits
     struct DimType<
         cuplaPos
     >{
-      using type = ::alpaka::dim::DimInt<3u>;
+      using type = ::alpaka::DimInt<3u>;
     };
 
 } // namespace traits
-} // namespace dim
 
-namespace elem
-{
 namespace traits
 {
 
@@ -112,7 +107,6 @@ namespace traits
     };
 
 } // namespace traits
-} // namspace elem
 
 namespace extent
 {
@@ -166,8 +160,6 @@ namespace traits
 } // namespace traits
 } // namespace extent
 
-namespace offset
-{
 namespace traits
 {
 
@@ -215,10 +207,7 @@ namespace traits
         }
     };
 } // namespace traits
-} // namespace offset
 
-namespace idx
-{
 namespace traits
 {
 
@@ -231,5 +220,4 @@ namespace traits
     };
 
 } // namespace traits
-} // namespace idx
 } // namespave alpaka
diff --git a/thirdParty/cupla/include/cupla/config/CpuOmp2Blocks.hpp b/thirdParty/cupla/include/cupla/config/CpuOmp2Blocks.hpp
index bcca06f7e0..34881724b8 100644
--- a/thirdParty/cupla/include/cupla/config/CpuOmp2Blocks.hpp
+++ b/thirdParty/cupla/include/cupla/config/CpuOmp2Blocks.hpp
@@ -40,4 +40,4 @@
 #   include "cupla/../../src/stream.cpp"
 #endif
 
-#include "cuda_to_cupla.hpp"
+#include "cupla.hpp"
diff --git a/thirdParty/cupla/include/cupla/config/CpuOmp2Threads.hpp b/thirdParty/cupla/include/cupla/config/CpuOmp2Threads.hpp
index 5e5ee8f73d..287bef45cb 100644
--- a/thirdParty/cupla/include/cupla/config/CpuOmp2Threads.hpp
+++ b/thirdParty/cupla/include/cupla/config/CpuOmp2Threads.hpp
@@ -40,4 +40,4 @@
 #   include "cupla/../../src/stream.cpp"
 #endif
 
-#include "cuda_to_cupla.hpp"
+#include "cupla.hpp"
diff --git a/thirdParty/cupla/include/cupla/config/CpuOmp4.hpp b/thirdParty/cupla/include/cupla/config/CpuOmp4.hpp
index 73d3312ea7..b3e14d3ef3 100644
--- a/thirdParty/cupla/include/cupla/config/CpuOmp4.hpp
+++ b/thirdParty/cupla/include/cupla/config/CpuOmp4.hpp
@@ -40,4 +40,4 @@
 #   include "cupla/../../src/stream.cpp"
 #endif
 
-#include "cuda_to_cupla.hpp"
+#include "cupla.hpp"
diff --git a/thirdParty/cupla/include/cupla/config/CpuSerial.hpp b/thirdParty/cupla/include/cupla/config/CpuSerial.hpp
index 4823147d50..a88f3541ad 100644
--- a/thirdParty/cupla/include/cupla/config/CpuSerial.hpp
+++ b/thirdParty/cupla/include/cupla/config/CpuSerial.hpp
@@ -40,4 +40,4 @@
 #   include "cupla/../../src/stream.cpp"
 #endif
 
-#include "cuda_to_cupla.hpp"
+#include "cupla.hpp"
diff --git a/thirdParty/cupla/include/cupla/config/CpuTbbBlocks.hpp b/thirdParty/cupla/include/cupla/config/CpuTbbBlocks.hpp
index 03c3c25201..643c23d53c 100644
--- a/thirdParty/cupla/include/cupla/config/CpuTbbBlocks.hpp
+++ b/thirdParty/cupla/include/cupla/config/CpuTbbBlocks.hpp
@@ -40,4 +40,4 @@
 #   include "cupla/../../src/stream.cpp"
 #endif
 
-#include "cuda_to_cupla.hpp"
+#include "cupla.hpp"
diff --git a/thirdParty/cupla/include/cupla/config/CpuThreads.hpp b/thirdParty/cupla/include/cupla/config/CpuThreads.hpp
index d1fbe9461e..036c963f8c 100644
--- a/thirdParty/cupla/include/cupla/config/CpuThreads.hpp
+++ b/thirdParty/cupla/include/cupla/config/CpuThreads.hpp
@@ -40,4 +40,4 @@
 #   include "cupla/../../src/stream.cpp"
 #endif
 
-#include "cuda_to_cupla.hpp"
+#include "cupla.hpp"
diff --git a/thirdParty/cupla/include/cupla/config/GpuCudaRt.hpp b/thirdParty/cupla/include/cupla/config/GpuCudaRt.hpp
index db9829ed6a..e6d52ad480 100644
--- a/thirdParty/cupla/include/cupla/config/GpuCudaRt.hpp
+++ b/thirdParty/cupla/include/cupla/config/GpuCudaRt.hpp
@@ -40,4 +40,4 @@
 #   include "cupla/../../src/stream.cpp"
 #endif
 
-#include "cuda_to_cupla.hpp"
+#include "cupla.hpp"
diff --git a/thirdParty/cupla/include/cupla/config/GpuHipRt.hpp b/thirdParty/cupla/include/cupla/config/GpuHipRt.hpp
index 302128029f..6195cdf134 100644
--- a/thirdParty/cupla/include/cupla/config/GpuHipRt.hpp
+++ b/thirdParty/cupla/include/cupla/config/GpuHipRt.hpp
@@ -40,4 +40,4 @@
 #   include "cupla/../../src/stream.cpp"
 #endif
 
-#include "cuda_to_cupla.hpp"
+#include "cupla.hpp"
diff --git a/thirdParty/cupla/include/cupla/cudaToCupla/driverTypes.hpp b/thirdParty/cupla/include/cupla/cudaToCupla/driverTypes.hpp
index 087b918f96..8a7985672f 100644
--- a/thirdParty/cupla/include/cupla/cudaToCupla/driverTypes.hpp
+++ b/thirdParty/cupla/include/cupla/cudaToCupla/driverTypes.hpp
@@ -23,15 +23,17 @@
 #pragma once
 
 #include "cupla/datatypes/Array.hpp"
+#include "cupla/device/SharedMemory.hpp"
+#include "cupla/device_functions.hpp"
 
-#define __syncthreads(...) ::alpaka::block::sync::syncBlockThreads(acc)
+#define __syncthreads(...) ::cupla::syncThreads(acc)
 
 #define cudaSuccess cuplaSuccess
 #define cudaErrorMemoryAllocation cuplaErrorMemoryAllocation
 #define cudaErrorInitializationError cuplaErrorInitializationError
 #define cudaErrorNotReady cuplaErrorNotReady
 #define cudaErrorDeviceAlreadyInUse cuplaErrorDeviceAlreadyInUse
-#define cuplaErrorInvalidDevice cuplaErrorInvalidDevice
+#define cudaErrorInvalidDevice cuplaErrorInvalidDevice
 
 #define cudaError_t cuplaError_t
 #define cudaError cuplaError
@@ -54,7 +56,7 @@
 /* cudaEventBlockingSync is a define in CUDA, hence we must remove
  * the old definition with the cupla enum
  */
-#define cudaEventBlockingSync cuplaEventBlockingSync 
+#define cudaEventBlockingSync cuplaEventBlockingSync
 
 #ifdef cudaEventDisableTiming
 #undef cudaEventDisableTiming
@@ -64,14 +66,6 @@
  */
 #define cudaEventDisableTiming cuplaEventDisableTiming
 
-#define sharedMem(ppName, ...)                                                 \
-  __VA_ARGS__ &ppName =                                                        \
-      ::alpaka::block::shared::st::allocVar<__VA_ARGS__, __COUNTER__>(acc)
-
-#define sharedMemExtern(ppName, ...)                                           \
-    __VA_ARGS__* ppName =                                                      \
-        ::alpaka::block::shared::dyn::getMem<__VA_ARGS__>(acc)
-
 #define cudaMemcpyKind cuplaMemcpyKind
 #define cudaMemcpyHostToDevice cuplaMemcpyHostToDevice
 #define cudaMemcpyDeviceToHost cuplaMemcpyDeviceToHost
@@ -79,22 +73,11 @@
 #define cudaMemcpyHostToHost cuplaMemcpyHostToHost
 
 // index renaming
-#define blockIdx                                                               \
-  static_cast<uint3>(                                                \
-      ::alpaka::idx::getIdx<::alpaka::Grid, ::alpaka::Blocks>(acc))
-#define threadIdx                                                              \
-  static_cast<uint3>(                                                \
-      ::alpaka::idx::getIdx<::alpaka::Block, ::alpaka::Threads>(acc))
-
-#define gridDim                                                                \
-  static_cast<uint3>(                                                \
-      ::alpaka::workdiv::getWorkDiv<::alpaka::Grid, ::alpaka::Blocks>(acc))
-#define blockDim                                                               \
-  static_cast<uint3>(                                                \
-      ::alpaka::workdiv::getWorkDiv<::alpaka::Block, ::alpaka::Threads>(acc))
-#define elemDim                                                               \
-  static_cast<uint3>(                                                \
-      ::alpaka::workdiv::getWorkDiv<::alpaka::Thread, ::alpaka::Elems>(acc))
+#define blockIdx cupla::blockIdx(acc)
+#define threadIdx cupla::threadIdx(acc)
+#define gridDim cupla::gridDim(acc)
+#define blockDim cupla::blockDim(acc)
+#define elemDim cupla::threadDim(acc)
 
 /** Atomic functions
  *
@@ -106,17 +89,17 @@
  *
  * @{
  */
-#define atomicAdd(...) ::alpaka::atomic::atomicOp<::alpaka::atomic::op::Add>(acc, __VA_ARGS__)
-#define atomicSub(...) ::alpaka::atomic::atomicOp<::alpaka::atomic::op::Sub>(acc, __VA_ARGS__)
-#define atomicMin(...) ::alpaka::atomic::atomicOp<::alpaka::atomic::op::Min>(acc, __VA_ARGS__)
-#define atomicMax(...) ::alpaka::atomic::atomicOp<::alpaka::atomic::op::Max>(acc, __VA_ARGS__)
-#define atomicInc(...) ::alpaka::atomic::atomicOp<::alpaka::atomic::op::Inc>(acc, __VA_ARGS__)
-#define atomicDec(...) ::alpaka::atomic::atomicOp<::alpaka::atomic::op::Dec>(acc, __VA_ARGS__)
-#define atomicExch(...) ::alpaka::atomic::atomicOp<::alpaka::atomic::op::Exch>(acc, __VA_ARGS__)
-#define atomicCAS(...) ::alpaka::atomic::atomicOp<::alpaka::atomic::op::Cas>(acc, __VA_ARGS__)
-#define atomicAnd(...) ::alpaka::atomic::atomicOp<::alpaka::atomic::op::And>(acc, __VA_ARGS__)
-#define atomicXor(...) ::alpaka::atomic::atomicOp<::alpaka::atomic::op::Xor>(acc, __VA_ARGS__)
-#define atomicOr(...) ::alpaka::atomic::atomicOp<::alpaka::atomic::op::Or>(acc, __VA_ARGS__)
+#define atomicAdd(...) cupla::atomicAdd(acc, __VA_ARGS__)
+#define atomicSub(...) cupla::atomicSub(acc, __VA_ARGS__)
+#define atomicMin(...) cupla::atomicMin(acc, __VA_ARGS__)
+#define atomicMax(...) cupla::atomicMax(acc, __VA_ARGS__)
+#define atomicInc(...) cupla::atomicInc(acc, __VA_ARGS__)
+#define atomicDec(...) cupla::atomicDec(acc, __VA_ARGS__)
+#define atomicExch(...) cupla::atomicExch(acc, __VA_ARGS__)
+#define atomicCAS(...) cupla::atomicCas(acc, __VA_ARGS__)
+#define atomicAnd(...) cupla::atomicAnd(acc, __VA_ARGS__)
+#define atomicXor(...) cupla::atomicXor(acc, __VA_ARGS__)
+#define atomicOr(...) cupla::atomicOr(acc, __VA_ARGS__)
 /** @} */
 
 #define uint3 ::cupla::uint3
diff --git a/thirdParty/cupla/include/cupla/cudaToCupla/runtime.hpp b/thirdParty/cupla/include/cupla/cudaToCupla/runtime.hpp
index 59239929a3..cea4d4a970 100644
--- a/thirdParty/cupla/include/cupla/cudaToCupla/runtime.hpp
+++ b/thirdParty/cupla/include/cupla/cudaToCupla/runtime.hpp
@@ -84,8 +84,8 @@
  * to avoid negative performance impact intrinsic function redefinitions
  * are disabled in CUDA
  */
-#if !defined(__CUDA_ARCH__)
-#define __fdividef(a,b) ((a)/(b))
-#define __expf(a) alpaka::math::exp(acc,a)
-#define __logf(a) alpaka::math::log(acc,a)
+#if CUPLA_DEVICE_COMPILE == 0
+#   define __fdividef(a,b) ((a)/(b))
+#   define __expf(a) cupla::math::exp(a)
+#   define __logf(a) cupla::math::log(a)
 #endif
diff --git a/thirdParty/cupla/include/cupla/datatypes/uint.hpp b/thirdParty/cupla/include/cupla/datatypes/uint.hpp
index aeeedbb9c0..11c1d509fb 100644
--- a/thirdParty/cupla/include/cupla/datatypes/uint.hpp
+++ b/thirdParty/cupla/include/cupla/datatypes/uint.hpp
@@ -43,7 +43,7 @@ inline namespace CUPLA_ACCELERATOR_NAMESPACE
         >
         ALPAKA_FN_HOST_ACC
         uint3(
-          ::alpaka::vec::Vec<
+          ::alpaka::Vec<
               TDim,
               TSize
           > const &vec
@@ -66,12 +66,12 @@ inline namespace CUPLA_ACCELERATOR_NAMESPACE
 #endif
 
         ALPAKA_FN_HOST_ACC
-        operator ::alpaka::vec::Vec<
+        operator ::alpaka::Vec<
             cupla::AlpakaDim< 3u >,
             IdxType
         >(void) const
         {
-            ::alpaka::vec::Vec<
+            ::alpaka::Vec<
                 cupla::AlpakaDim< 3u >,
                 IdxType
             > vec(z, y, x);
@@ -85,8 +85,6 @@ inline namespace CUPLA_ACCELERATOR_NAMESPACE
 
 namespace alpaka
 {
-namespace dim
-{
 namespace traits
 {
 
@@ -95,14 +93,11 @@ namespace traits
     struct DimType<
         cupla::uint3
     >{
-      using type = ::alpaka::dim::DimInt<3u>;
+      using type = ::alpaka::DimInt<3u>;
     };
 
 } // namespace traits
-} // namespace dim
 
-namespace elem
-{
 namespace traits
 {
 
@@ -115,7 +110,6 @@ namespace traits
     };
 
 } // namespace traits
-} // namspace elem
 
 namespace extent
 {
@@ -168,8 +162,6 @@ namespace traits
 } // namespace traits
 } // namespace extent
 
-namespace offset
-{
 namespace traits
 {
 
@@ -217,10 +209,7 @@ namespace traits
         }
     };
 } // namespace traits
-} // namespace offset
 
-namespace idx
-{
 namespace traits
 {
 
@@ -233,5 +222,4 @@ namespace traits
     };
 
 } // namespace traits
-} // namespace idx
 } // namespave alpaka
diff --git a/thirdParty/cupla/include/cupla/defines.hpp b/thirdParty/cupla/include/cupla/defines.hpp
index 3f1786927e..649029d19e 100644
--- a/thirdParty/cupla/include/cupla/defines.hpp
+++ b/thirdParty/cupla/include/cupla/defines.hpp
@@ -111,3 +111,17 @@
 #ifndef CUPLA_HEADER_ONLY_FUNC_SPEC
 #   define CUPLA_HEADER_ONLY_FUNC_SPEC
 #endif
+
+/*! device compile flag
+ *
+ * Enabled if the compiler is currently processing a separate compile path for the device code
+ *
+ * @attention value is always 0 for alpaka CPU accelerators
+ *
+ * Value is 1 if device path is compiled else 0
+ */
+#if defined(__CUDA_ARCH__) || ( defined(__HIP_DEVICE_COMPILE__) && __HIP_DEVICE_COMPILE__== 1 && defined(__HIP__) )
+    #define CUPLA_DEVICE_COMPILE 1
+#else
+    #define CUPLA_DEVICE_COMPILE 0
+#endif
diff --git a/thirdParty/cupla/include/cupla/device/Atomic.hpp b/thirdParty/cupla/include/cupla/device/Atomic.hpp
new file mode 100644
index 0000000000..7e9bc60dbd
--- /dev/null
+++ b/thirdParty/cupla/include/cupla/device/Atomic.hpp
@@ -0,0 +1,179 @@
+/* Copyright 2020 Rene Widera
+ *
+ * This file is part of cupla.
+ *
+ * cupla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * cupla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with cupla.
+ * If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+
+#pragma once
+
+#include "cupla/datatypes/uint.hpp"
+#include "cupla/device/Hierarchy.hpp"
+#include "cupla/types.hpp"
+
+#include <alpaka/alpaka.hpp>
+
+namespace cupla
+{
+inline namespace CUPLA_ACCELERATOR_NAMESPACE
+{
+inline namespace device
+{
+
+#define CUPLA_UNARY_ATOMIC_OP(functionName, alpakaOp)                          \
+        /*!                                                                    \
+         * Compared to their CUDA/HIP counterparts, these functions take an additional last \
+         * parameter to denote atomicity (synchronization) level. This parameter is \
+         * of type cupla::hierarchy::{Grids|Blocks|Threads}. Grids corresponds \
+         * to atomicity between different kernels, Blocks - to different blocks \
+         * in the same grid/kernel, Threads - to threads of the same block.    \
+         * @tparam T_Hierarchy parallelism hierarchy level within which the operation is atomic [type cupla::hierarchy::*] \
+         * @tparam T_Acc alpaka accelerator [alpaka::*]                   \
+         * @tparam T_Type type of the value                                    \
+         * @param acc alpaka accelerator                                       \
+         * @param ptr destination pointer                                      \
+         * @param value source value                                           \
+         * @{                                                                  \
+         */                                                                    \
+        template<                                                              \
+            typename T_Hierarchy,                                              \
+            typename T_Acc,                                                    \
+            typename T_Type                                                    \
+        >                                                                      \
+        ALPAKA_FN_ACC ALPAKA_FN_INLINE                                         \
+        T_Type functionName(                                                   \
+            T_Acc const & acc,                                                 \
+            T_Type *ptr,                                                       \
+            T_Type const & value                                               \
+        )                                                                      \
+        {                                                                      \
+            return ::alpaka::atomicOp< alpakaOp >(                     \
+                acc,                                                           \
+                ptr,                                                           \
+                value,                                                         \
+                T_Hierarchy{}                                                  \
+            );                                                                 \
+        }                                                                      \
+                                                                               \
+        /*! @param hierarchy hierarchy level within which the operation is atomic \
+         */                                                                    \
+        template<                                                              \
+            typename T_Acc,                                                    \
+            typename T_Type,                                                   \
+            typename T_Hierarchy = alpaka::hierarchy::Grids                    \
+        >                                                                      \
+        ALPAKA_FN_ACC ALPAKA_FN_INLINE                                         \
+        T_Type functionName(                                                   \
+            T_Acc const & acc,                                                 \
+            T_Type *ptr,                                                       \
+            T_Type const & value,                                              \
+            T_Hierarchy const & hierarchy = T_Hierarchy()                      \
+        )                                                                      \
+        {                                                                      \
+            return functionName< T_Hierarchy >(                                \
+                acc,                                                           \
+                ptr,                                                           \
+                value                                                          \
+            );                                                                 \
+        }                                                                      \
+        /*!@}                                                                  \
+         */
+
+        /// atomic addition
+        CUPLA_UNARY_ATOMIC_OP( atomicAdd, ::alpaka::AtomicAdd )
+        /// atomic subtraction
+        CUPLA_UNARY_ATOMIC_OP( atomicSub, ::alpaka::AtomicSub )
+        /// atomic minimum
+        CUPLA_UNARY_ATOMIC_OP( atomicMin, ::alpaka::AtomicMin )
+        /// atomic maximum
+        CUPLA_UNARY_ATOMIC_OP( atomicMax, ::alpaka::AtomicMax )
+        /// atomic increment
+        CUPLA_UNARY_ATOMIC_OP( atomicInc, ::alpaka::AtomicInc )
+        /// atomic decrement
+        CUPLA_UNARY_ATOMIC_OP( atomicDec, ::alpaka::AtomicDec )
+        /// atomic bit-wise and
+        CUPLA_UNARY_ATOMIC_OP( atomicAnd, ::alpaka::AtomicAnd )
+        /// atomic bit-wise or
+        CUPLA_UNARY_ATOMIC_OP( atomicOr, ::alpaka::AtomicOr )
+        /// atomic exchange
+        CUPLA_UNARY_ATOMIC_OP( atomicExch, ::alpaka::AtomicExch )
+        /// atomic bit-wise xor
+        CUPLA_UNARY_ATOMIC_OP( atomicXor, ::alpaka::AtomicXor )
+
+#undef CUPLA_UNARY_ATOMIC_OP
+
+        /** atomic compare and swap
+         *
+         * @{
+         * @tparam T_Hierarchy parallelism hierarchy level within which the operation is atomic [type cupla::hierarchy::*]
+         * @tparam T_Acc alpaka accelerator [alpaka::*]
+         * @tparam T_Type type of the value
+         * @param acc alpaka accelerator
+         * @param ptr destination pointer
+         * @param value source value
+         */
+        template<
+            typename T_Hierarchy,
+            typename T_Acc,
+            typename T_Type
+        >
+        ALPAKA_FN_ACC ALPAKA_FN_INLINE
+        T_Type atomicCas(
+            T_Acc const & acc,
+            T_Type *ptr,
+            T_Type const & compare,
+            T_Type const & value
+        )
+        {
+            return ::alpaka::atomicOp< ::alpaka::AtomicCas >(
+                acc,
+                ptr,
+                compare,
+                value,
+                T_Hierarchy{}
+            );
+        }
+
+        /*! @param hierarchy hierarchy level within which the operation is atomic
+         */
+        template<
+            typename T_Acc,
+            typename T_Type,
+            typename T_Hierarchy = hierarchy::Grids
+        >
+        ALPAKA_FN_ACC ALPAKA_FN_INLINE
+        T_Type atomicCas(
+            T_Acc const & acc,
+            T_Type *ptr,
+            T_Type const & compare,
+            T_Type const & value,
+            T_Hierarchy const & hierarchy = T_Hierarchy()
+        )
+        {
+            return atomicCas< T_Hierarchy >(
+                acc,
+                ptr,
+                compare,
+                value
+            );
+        }
+        /*!@}
+         */
+
+} // namespace device
+} // namespace CUPLA_ACCELERATOR_NAMESPACE
+} // namespace cupla
diff --git a/thirdParty/cupla/include/cupla/device/Hierarchy.hpp b/thirdParty/cupla/include/cupla/device/Hierarchy.hpp
new file mode 100644
index 0000000000..130faa31be
--- /dev/null
+++ b/thirdParty/cupla/include/cupla/device/Hierarchy.hpp
@@ -0,0 +1,43 @@
+/* Copyright 2020 Rene Widera
+ *
+ * This file is part of cupla.
+ *
+ * cupla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * cupla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with cupla.
+ * If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+
+#pragma once
+
+#include "cupla/types.hpp"
+
+#include <alpaka/core/Positioning.hpp>
+
+namespace cupla
+{
+inline namespace CUPLA_ACCELERATOR_NAMESPACE
+{
+inline namespace device
+{
+namespace hierarchy
+{
+
+    //! hierarchy definitions for atomic operation
+    using namespace ::alpaka::hierarchy;
+
+} // namespace hierarchy
+} // namespace device
+} // namespace CUPLA_ACCELERATOR_NAMESPACE
+} // namespace cupla
diff --git a/thirdParty/cupla/include/cupla/device/Index.hpp b/thirdParty/cupla/include/cupla/device/Index.hpp
new file mode 100644
index 0000000000..c7bd2d8afa
--- /dev/null
+++ b/thirdParty/cupla/include/cupla/device/Index.hpp
@@ -0,0 +1,123 @@
+/* Copyright 2020 Rene Widera
+ *
+ * This file is part of cupla.
+ *
+ * cupla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * cupla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with cupla.
+ * If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+
+#pragma once
+
+#include "cupla/datatypes/uint.hpp"
+#include "cupla/types.hpp"
+
+#include <alpaka/alpaka.hpp>
+
+namespace cupla
+{
+inline namespace CUPLA_ACCELERATOR_NAMESPACE
+{
+inline namespace device
+{
+
+    /** number of blocks within the grid layer
+     *
+     * @tparam T_Acc alpaka accelerator [alpaka::*]
+     * @param acc alpaka accelerator
+     */
+    template< typename T_Acc >
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE
+    cupla::uint3 gridDim( T_Acc const & acc )
+    {
+        return static_cast< uint3 >(
+            ::alpaka::getWorkDiv<
+                ::alpaka::Grid,
+                ::alpaka::Blocks
+            >( acc )
+        );
+    }
+
+    /** number of threads within the block layer
+     *
+     * @tparam T_Acc alpaka accelerator [alpaka::*]
+     * @param acc alpaka accelerator
+     */
+    template< typename T_Acc >
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE
+    cupla::uint3 blockDim( T_Acc const & acc )
+    {
+        return static_cast< uint3 >(
+            ::alpaka::getWorkDiv<
+                ::alpaka::Block,
+                ::alpaka::Threads
+            >( acc )
+        );
+    }
+
+    /** number of elements within the thread layer
+     *
+     * @tparam T_Acc alpaka accelerator [alpaka::*]
+     * @param acc alpaka accelerator
+     */
+    template< typename T_Acc >
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE
+    cupla::uint3 threadDim( T_Acc const & acc )
+    {
+        return static_cast< uint3 >(
+            ::alpaka::getWorkDiv<
+                ::alpaka::Thread,
+                ::alpaka::Elems
+            >( acc )
+        );
+    }
+
+    /** index of the thread within the block layer
+     *
+     * @tparam T_Acc alpaka accelerator [alpaka::*]
+     * @param acc alpaka accelerator
+     */
+    template< typename T_Acc >
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE
+    cupla::uint3 threadIdx( T_Acc const & acc )
+    {
+        return static_cast< uint3 >(
+            ::alpaka::getIdx<
+                ::alpaka::Block,
+                ::alpaka::Threads
+            >( acc )
+        );
+    }
+
+    /** index of the block within the grid layer
+     *
+     * @tparam T_Acc alpaka accelerator [alpaka::*]
+     * @param acc alpaka accelerator
+     */
+    template< typename T_Acc >
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE
+    cupla::uint3 blockIdx( T_Acc const & acc )
+    {
+        return static_cast< uint3 >(
+            ::alpaka::getIdx<
+                ::alpaka::Grid,
+                ::alpaka::Blocks
+            >( acc )
+        );
+    }
+
+} // namespace device
+} // namespace CUPLA_ACCELERATOR_NAMESPACE
+} // namespace cupla
diff --git a/thirdParty/cupla/include/cupla/device/SharedMemory.hpp b/thirdParty/cupla/include/cupla/device/SharedMemory.hpp
new file mode 100644
index 0000000000..03d99398a5
--- /dev/null
+++ b/thirdParty/cupla/include/cupla/device/SharedMemory.hpp
@@ -0,0 +1,32 @@
+/* Copyright 2020 Rene Widera
+ *
+ * This file is part of cupla.
+ *
+ * cupla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * cupla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with cupla.
+ * If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+
+#pragma once
+
+#include <alpaka/alpaka.hpp>
+
+#define sharedMem(ppName, ...)                                                 \
+     __VA_ARGS__& ppName =                                                     \
+        ::alpaka::declareSharedVar< __VA_ARGS__, __COUNTER__ >( acc )
+
+#define sharedMemExtern(ppName, ...)                                           \
+    __VA_ARGS__* ppName =                                                      \
+        ::alpaka::getDynSharedMem< __VA_ARGS__ >( acc )
diff --git a/thirdParty/cupla/include/cupla/device/Synchronization.hpp b/thirdParty/cupla/include/cupla/device/Synchronization.hpp
new file mode 100644
index 0000000000..a095dacb27
--- /dev/null
+++ b/thirdParty/cupla/include/cupla/device/Synchronization.hpp
@@ -0,0 +1,60 @@
+/* Copyright 2020 Rene Widera
+ *
+ * This file is part of cupla.
+ *
+ * cupla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * cupla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with cupla.
+ * If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+
+#pragma once
+
+#include "cupla/types.hpp"
+
+#include <alpaka/alpaka.hpp>
+
+namespace cupla
+{
+inline namespace CUPLA_ACCELERATOR_NAMESPACE
+{
+inline namespace device
+{
+
+    /** synchronize threads within the block
+     *
+     * @tparam T_Acc alpaka accelerator [alpaka::*]
+     * @param acc alpaka accelerator
+     *
+     * @{
+     */
+    template< typename T_Acc >
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE
+    void syncThreads( T_Acc const & acc )
+    {
+        ::alpaka::syncBlockThreads( acc );
+    }
+
+    template< typename T_Acc >
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE
+    void __syncthreads( T_Acc const & acc )
+    {
+        syncThreads( acc );
+    }
+
+    //!@}
+
+} // namespace device
+} // namespace CUPLA_ACCELERATOR_NAMESPACE
+} // namespace cupla
diff --git a/thirdParty/cupla/include/cupla/device/math.hpp b/thirdParty/cupla/include/cupla/device/math.hpp
new file mode 100644
index 0000000000..363f40c187
--- /dev/null
+++ b/thirdParty/cupla/include/cupla/device/math.hpp
@@ -0,0 +1,33 @@
+/* Copyright 2020 Rene Widera
+ *
+ * This file is part of cupla.
+ *
+ * cupla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * cupla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with cupla.
+ * If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+
+#pragma once
+
+#include "cupla/device/math/Abs.hpp"
+#include "cupla/device/math/Comparison.hpp"
+#include "cupla/device/math/Erf.hpp"
+#include "cupla/device/math/Exp.hpp"
+#include "cupla/device/math/Log.hpp"
+#include "cupla/device/math/Mod.hpp"
+#include "cupla/device/math/Pow.hpp"
+#include "cupla/device/math/Root.hpp"
+#include "cupla/device/math/Round.hpp"
+#include "cupla/device/math/Trigo.hpp"
diff --git a/thirdParty/cupla/include/cupla/device/math/Abs.hpp b/thirdParty/cupla/include/cupla/device/math/Abs.hpp
new file mode 100644
index 0000000000..b90f8a4849
--- /dev/null
+++ b/thirdParty/cupla/include/cupla/device/math/Abs.hpp
@@ -0,0 +1,43 @@
+/* Copyright 2020 Rene Widera
+ *
+ * This file is part of cupla.
+ *
+ * cupla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * cupla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with cupla.
+ * If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+
+#pragma once
+
+#include "cupla/device/math/Common.hpp"
+#include "cupla/types.hpp"
+
+namespace cupla
+{
+inline namespace CUPLA_ACCELERATOR_NAMESPACE
+{
+inline namespace device
+{
+inline namespace math
+{
+
+    //! Computes the absolute value.
+    CUPLA_UNARY_MATH_FN( abs, alpaka::math::ConceptMathAbs, Abs )
+
+
+} // namespace math
+} // namespace device
+} // namespace CUPLA_ACCELERATOR_NAMESPACE
+} // namespace cupla
diff --git a/thirdParty/cupla/include/cupla/device/math/Common.hpp b/thirdParty/cupla/include/cupla/device/math/Common.hpp
new file mode 100644
index 0000000000..1f407f476d
--- /dev/null
+++ b/thirdParty/cupla/include/cupla/device/math/Common.hpp
@@ -0,0 +1,178 @@
+/* Copyright 2020 Rene Widera
+ *
+ * This file is part of cupla.
+ *
+ * cupla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * cupla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with cupla.
+ * If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+
+#pragma once
+
+#include "cupla/types.hpp"
+#include "cupla/defines.hpp"
+
+#include <alpaka/core/Concepts.hpp>
+
+#include <type_traits>
+
+namespace cupla
+{
+inline namespace CUPLA_ACCELERATOR_NAMESPACE
+{
+inline namespace device
+{
+inline namespace math
+{
+namespace detail
+{
+    /** Create the implementation of an alpaka concept for an accelerator or math implementation
+     *
+     * @tparam T_AccOrMathImpl accelerator or math implementation [type alpaka::* or alpaka::math::MathStdLib]
+     * @tparam T_Concept alpaka concept
+     * @return default constructed alpaka::concepts::ImplementationBase< T_Concept, T_AccOrMathImpl >
+     */
+    ALPAKA_NO_HOST_ACC_WARNING
+    template< typename T_AccOrMathImpl, typename T_Concept >
+    ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE auto getConcept()
+    {
+        using ImplOfAcc = alpaka::concepts::ImplementationBase<
+            T_Concept,
+            Acc
+        >;
+
+        using ImplOfAccThreadSeq = alpaka::concepts::ImplementationBase<
+            T_Concept,
+            AccThreadSeq
+        >;
+
+        // both cupla accelerator types are required to share one math concept implementation
+        static_assert(
+            std::is_same<
+                ImplOfAcc,
+                ImplOfAccThreadSeq
+            >::value,
+            "The math concept implementation for the type 'Acc' and 'AccThreadSeq' must be equal"
+        );
+
+        using RequestedImpl = alpaka::concepts::ImplementationBase<
+            T_Concept,
+            T_AccOrMathImpl
+        >;
+
+        return RequestedImpl{};
+    }
+} // namespace detail
+
+#define CUPLA_UNARY_MATH_FN_DETAIL(functionName, accOrMathImpl, alpakaMathConcept, alpakaMathTrait)  \
+    /** Forwards the argument to functionName of the alpaka math trait.        \
+     * @tparam T_Type argument type                                            \
+     * @param arg input argument                                               \
+     */                                                                        \
+    ALPAKA_NO_HOST_ACC_WARNING                                                 \
+    template< typename T_Type >                                                \
+    ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE auto functionName(                     \
+        T_Type const & arg                                                     \
+    )                                                                          \
+    /* the explicit trailing return type is required for the compiler to       \
+     * detect the host, device function qualifier correctly                    \
+     */                                                                        \
+    ->  decltype(                                                              \
+        alpaka::math::traits::alpakaMathTrait<                                 \
+            alpaka::concepts::ImplementationBase<                              \
+                alpakaMathConcept,                                             \
+                accOrMathImpl                                                  \
+            >,                                                                 \
+            T_Type                                                             \
+        >::functionName(                                                       \
+            detail::getConcept< accOrMathImpl, alpakaMathConcept >(),          \
+            arg                                                                \
+        )                                                                      \
+    )                                                                          \
+    {                                                                          \
+        return alpaka::math::traits::alpakaMathTrait<                          \
+            alpaka::concepts::ImplementationBase<                              \
+                alpakaMathConcept,                                             \
+                accOrMathImpl                                                  \
+            >,                                                                 \
+            T_Type                                                             \
+        >::functionName(                                                       \
+            detail::getConcept< accOrMathImpl, alpakaMathConcept >(),          \
+            arg                                                                \
+        );                                                                     \
+    }
+
+/* Using the free alpaka functions `alpaka::math::*` will result in `__host__ __device__`
+ * errors, therefore the alpaka math trait must be used.
+ */
+#define CUPLA_BINARY_MATH_FN_DETAIL(functionName, accOrMathImpl, alpakaMathConcept, alpakaMathTrait) \
+    /** Forwards both arguments to functionName of the alpaka math trait.      \
+     * @tparam T_Type1,T_Type2 types of the first and second argument          \
+     * @param arg1 first input argument                                        \
+     * @param arg2 second input argument                                       \
+     */                                                                        \
+    ALPAKA_NO_HOST_ACC_WARNING                                                 \
+    template<                                                                  \
+        typename T_Type1,                                                      \
+        typename T_Type2                                                       \
+    >                                                                          \
+    ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE auto functionName(                     \
+        T_Type1 const & arg1,                                                  \
+        T_Type2 const & arg2                                                   \
+    )                                                                          \
+    /* the explicit trailing return type is required for the compiler to       \
+     * detect the host, device function qualifier correctly                    \
+     */                                                                        \
+    ->  decltype(                                                              \
+        alpaka::math::traits::alpakaMathTrait<                                 \
+            alpaka::concepts::ImplementationBase<                              \
+                alpakaMathConcept,                                             \
+                accOrMathImpl                                                  \
+            >,                                                                 \
+            T_Type1,                                                           \
+            T_Type2                                                            \
+        >::functionName(                                                       \
+            detail::getConcept< accOrMathImpl, alpakaMathConcept >(),          \
+            arg1,                                                              \
+            arg2                                                               \
+        )                                                                      \
+    )                                                                          \
+    {                                                                          \
+        return alpaka::math::traits::alpakaMathTrait<                          \
+            alpaka::concepts::ImplementationBase<                              \
+                alpakaMathConcept,                                             \
+                accOrMathImpl                                                  \
+            >,                                                                 \
+            T_Type1,                                                           \
+            T_Type2                                                            \
+        >::functionName(                                                       \
+            detail::getConcept< accOrMathImpl, alpakaMathConcept >(),          \
+            arg1,                                                              \
+            arg2                                                               \
+        );                                                                     \
+    }
+
+#if CUPLA_DEVICE_COMPILE == 0 // bind the math functions to alpaka::math::MathStdLib
+    #define CUPLA_UNARY_MATH_FN(functionName, alpakaMathConcept, alpakaMathTrait) CUPLA_UNARY_MATH_FN_DETAIL(functionName, alpaka::math::MathStdLib, alpakaMathConcept, alpakaMathTrait)
+    #define CUPLA_BINARY_MATH_FN(functionName, alpakaMathConcept, alpakaMathTrait) CUPLA_BINARY_MATH_FN_DETAIL(functionName, alpaka::math::MathStdLib, alpakaMathConcept, alpakaMathTrait)
+#else // CUPLA_DEVICE_COMPILE != 0: bind the math functions to the cupla accelerator type Acc
+    #define CUPLA_UNARY_MATH_FN(functionName, alpakaMathConcept, alpakaMathTrait) CUPLA_UNARY_MATH_FN_DETAIL(functionName, Acc, alpakaMathConcept, alpakaMathTrait)
+    #define CUPLA_BINARY_MATH_FN(functionName, alpakaMathConcept, alpakaMathTrait) CUPLA_BINARY_MATH_FN_DETAIL(functionName, Acc, alpakaMathConcept, alpakaMathTrait)
+#endif // CUPLA_DEVICE_COMPILE == 0
+
+} // namespace math
+} // namespace device
+} // namespace CUPLA_ACCELERATOR_NAMESPACE
+} // namespace cupla
diff --git a/thirdParty/cupla/include/cupla/device/math/Comparison.hpp b/thirdParty/cupla/include/cupla/device/math/Comparison.hpp
new file mode 100644
index 0000000000..0cca2fd9ce
--- /dev/null
+++ b/thirdParty/cupla/include/cupla/device/math/Comparison.hpp
@@ -0,0 +1,45 @@
+/* Copyright 2020 Rene Widera
+ *
+ * This file is part of cupla.
+ *
+ * cupla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * cupla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with cupla.
+ * If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+
+#pragma once
+
+#include "cupla/device/math/Common.hpp"
+#include "cupla/types.hpp"
+
+namespace cupla
+{
+inline namespace CUPLA_ACCELERATOR_NAMESPACE
+{
+inline namespace device
+{
+inline namespace math
+{
+
+    //! Calculates the smaller value of two arguments.
+    CUPLA_BINARY_MATH_FN( min, alpaka::math::ConceptMathMin, Min )
+
+    //! Calculates the larger value of two arguments.
+    CUPLA_BINARY_MATH_FN( max, alpaka::math::ConceptMathMax, Max )
+
+} // namespace math
+} // namespace device
+} // namespace CUPLA_ACCELERATOR_NAMESPACE
+} // namespace cupla
diff --git a/thirdParty/cupla/include/cupla/device/math/Erf.hpp b/thirdParty/cupla/include/cupla/device/math/Erf.hpp
new file mode 100644
index 0000000000..dd7be4c659
--- /dev/null
+++ b/thirdParty/cupla/include/cupla/device/math/Erf.hpp
@@ -0,0 +1,42 @@
+/* Copyright 2020 Rene Widera
+ *
+ * This file is part of cupla.
+ *
+ * cupla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * cupla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with cupla.
+ * If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+
+#pragma once
+
+#include "cupla/device/math/Common.hpp"
+#include "cupla/types.hpp"
+
+namespace cupla
+{
+inline namespace CUPLA_ACCELERATOR_NAMESPACE
+{
+inline namespace device
+{
+inline namespace math
+{
+
+    //! Computes the error function.
+    CUPLA_UNARY_MATH_FN( erf, alpaka::math::ConceptMathErf, Erf )
+
+} // namespace math
+} // namespace device
+} // namespace CUPLA_ACCELERATOR_NAMESPACE
+} // namespace cupla
diff --git a/thirdParty/cupla/include/cupla/device/math/Exp.hpp b/thirdParty/cupla/include/cupla/device/math/Exp.hpp
new file mode 100644
index 0000000000..6f343fb538
--- /dev/null
+++ b/thirdParty/cupla/include/cupla/device/math/Exp.hpp
@@ -0,0 +1,42 @@
+/* Copyright 2020 Rene Widera
+ *
+ * This file is part of cupla.
+ *
+ * cupla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * cupla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with cupla.
+ * If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+
+#pragma once
+
+#include "cupla/device/math/Common.hpp"
+#include "cupla/types.hpp"
+
+namespace cupla
+{
+inline namespace CUPLA_ACCELERATOR_NAMESPACE
+{
+inline namespace device
+{
+inline namespace math
+{
+
+    //! Computes e (Euler's number, 2.7182818...) raised to the given power.
+    CUPLA_UNARY_MATH_FN( exp, alpaka::math::ConceptMathExp, Exp )
+
+} // namespace math
+} // namespace device
+} // namespace CUPLA_ACCELERATOR_NAMESPACE
+} // namespace cupla
diff --git a/thirdParty/cupla/include/cupla/device/math/Log.hpp b/thirdParty/cupla/include/cupla/device/math/Log.hpp
new file mode 100644
index 0000000000..7b8c621bf9
--- /dev/null
+++ b/thirdParty/cupla/include/cupla/device/math/Log.hpp
@@ -0,0 +1,44 @@
+/* Copyright 2020 Rene Widera
+ *
+ * This file is part of cupla.
+ *
+ * cupla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * cupla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with cupla.
+ * If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+
+#pragma once
+
+#pragma once
+
+#include "cupla/device/math/Common.hpp"
+#include "cupla/types.hpp"
+
+namespace cupla
+{
+inline namespace CUPLA_ACCELERATOR_NAMESPACE
+{
+inline namespace device
+{
+inline namespace math
+{
+
+    //! Computes the natural (base e) logarithm.
+    CUPLA_UNARY_MATH_FN( log, alpaka::math::ConceptMathLog, Log )
+
+} // namespace math
+} // namespace device
+} // namespace CUPLA_ACCELERATOR_NAMESPACE
+} // namespace cupla
diff --git a/thirdParty/cupla/include/cupla/device/math/Mod.hpp b/thirdParty/cupla/include/cupla/device/math/Mod.hpp
new file mode 100644
index 0000000000..2ea026a29c
--- /dev/null
+++ b/thirdParty/cupla/include/cupla/device/math/Mod.hpp
@@ -0,0 +1,45 @@
+/* Copyright 2020 Rene Widera
+ *
+ * This file is part of cupla.
+ *
+ * cupla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * cupla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with cupla.
+ * If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+
+#pragma once
+
+#include "cupla/device/math/Common.hpp"
+#include "cupla/types.hpp"
+
+namespace cupla
+{
+inline namespace CUPLA_ACCELERATOR_NAMESPACE
+{
+inline namespace device
+{
+inline namespace math
+{
+
+    //! Computes the floating-point remainder of the division operation x/y.
+    CUPLA_BINARY_MATH_FN( fmod, alpaka::math::ConceptMathFmod, Fmod )
+
+    //! Computes the IEEE remainder of the floating point division operation x/y.
+    CUPLA_BINARY_MATH_FN( remainder, alpaka::math::ConceptMathRemainder, Remainder )
+
+} // namespace math
+} // namespace device
+} // namespace CUPLA_ACCELERATOR_NAMESPACE
+} // namespace cupla
diff --git a/thirdParty/cupla/include/cupla/device/math/Pow.hpp b/thirdParty/cupla/include/cupla/device/math/Pow.hpp
new file mode 100644
index 0000000000..56a4e464a8
--- /dev/null
+++ b/thirdParty/cupla/include/cupla/device/math/Pow.hpp
@@ -0,0 +1,42 @@
+/* Copyright 2020 Rene Widera
+ *
+ * This file is part of cupla.
+ *
+ * cupla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * cupla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with cupla.
+ * If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+
+#pragma once
+
+#include "cupla/device/math/Common.hpp"
+#include "cupla/types.hpp"
+
+namespace cupla
+{
+inline namespace CUPLA_ACCELERATOR_NAMESPACE
+{
+inline namespace device
+{
+inline namespace math
+{
+
+    //! Computes the value of base raised to the power exp.
+    CUPLA_BINARY_MATH_FN( pow, alpaka::math::ConceptMathPow, Pow )
+
+} // namespace math
+} // namespace device
+} // namespace CUPLA_ACCELERATOR_NAMESPACE
+} // namespace cupla
diff --git a/thirdParty/cupla/include/cupla/device/math/Root.hpp b/thirdParty/cupla/include/cupla/device/math/Root.hpp
new file mode 100644
index 0000000000..eafbb769ae
--- /dev/null
+++ b/thirdParty/cupla/include/cupla/device/math/Root.hpp
@@ -0,0 +1,48 @@
+/* Copyright 2020 Rene Widera
+ *
+ * This file is part of cupla.
+ *
+ * cupla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * cupla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with cupla.
+ * If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+
+#pragma once
+
+#include "cupla/device/math/Common.hpp"
+#include "cupla/types.hpp"
+
+namespace cupla
+{
+inline namespace CUPLA_ACCELERATOR_NAMESPACE
+{
+inline namespace device
+{
+inline namespace math
+{
+
+    //! Computes the square root.
+    CUPLA_UNARY_MATH_FN( sqrt, alpaka::math::ConceptMathSqrt, Sqrt )
+
+    //! Computes the inverse square root.
+    CUPLA_UNARY_MATH_FN( rsqrt, alpaka::math::ConceptMathRsqrt, Rsqrt )
+
+    //! Computes the cube root.
+    CUPLA_UNARY_MATH_FN( cbrt, alpaka::math::ConceptMathCbrt, Cbrt )
+
+} // namespace math
+} // namespace device
+} // namespace CUPLA_ACCELERATOR_NAMESPACE
+} // namespace cupla
diff --git a/thirdParty/cupla/include/cupla/device/math/Round.hpp b/thirdParty/cupla/include/cupla/device/math/Round.hpp
new file mode 100644
index 0000000000..35bf08344f
--- /dev/null
+++ b/thirdParty/cupla/include/cupla/device/math/Round.hpp
@@ -0,0 +1,66 @@
+/* Copyright 2020 Rene Widera
+ *
+ * This file is part of cupla.
+ *
+ * cupla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * cupla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with cupla.
+ * If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+
+#pragma once
+
+#include "cupla/device/math/Common.hpp"
+#include "cupla/types.hpp"
+
+namespace cupla
+{
+inline namespace CUPLA_ACCELERATOR_NAMESPACE
+{
+inline namespace device
+{
+inline namespace math
+{
+
+    //! Computes the smallest integer value not less than arg.
+    CUPLA_UNARY_MATH_FN( ceil, alpaka::math::ConceptMathCeil, Ceil )
+
+    //! Computes the largest integer value not greater than arg.
+    CUPLA_UNARY_MATH_FN( floor, alpaka::math::ConceptMathFloor, Floor )
+
+    //! Computes the nearest integer not greater in magnitude than arg.
+    CUPLA_UNARY_MATH_FN( trunc, alpaka::math::ConceptMathTrunc, Trunc )
+
+    /** Computes the nearest integer value to arg (in floating-point format).
+     *
+     * Rounding halfway cases away from zero, regardless of the current rounding mode.
+     */
+    CUPLA_UNARY_MATH_FN( round, alpaka::math::ConceptMathRound, Round )
+
+    /** Computes the nearest integer value to arg (in integer format).
+     *
+     * Rounding halfway cases away from zero, regardless of the current rounding mode.
+     */
+    CUPLA_UNARY_MATH_FN( lround, alpaka::math::ConceptMathRound, Lround )
+
+    /** Computes the nearest integer value to arg (in integer format).
+     *
+     * Rounding halfway cases away from zero, regardless of the current rounding mode.
+     */
+    CUPLA_UNARY_MATH_FN( llround, alpaka::math::ConceptMathRound, Llround )
+
+} // namespace math
+} // namespace device
+} // namespace CUPLA_ACCELERATOR_NAMESPACE
+} // namespace cupla
diff --git a/thirdParty/cupla/include/cupla/device/math/Trigo.hpp b/thirdParty/cupla/include/cupla/device/math/Trigo.hpp
new file mode 100644
index 0000000000..15ce6a3695
--- /dev/null
+++ b/thirdParty/cupla/include/cupla/device/math/Trigo.hpp
@@ -0,0 +1,60 @@
+/* Copyright 2020 Rene Widera
+ *
+ * This file is part of cupla.
+ *
+ * cupla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * cupla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with cupla.
+ * If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+
+#pragma once
+
+#include "cupla/device/math/Common.hpp"
+#include "cupla/types.hpp"
+
+namespace cupla
+{
+inline namespace CUPLA_ACCELERATOR_NAMESPACE
+{
+inline namespace device
+{
+inline namespace math
+{
+
+    //! Computes the sine (measured in radians).
+    CUPLA_UNARY_MATH_FN( sin, alpaka::math::ConceptMathSin, Sin )
+
+    //! Computes the cosine (measured in radians).
+    CUPLA_UNARY_MATH_FN( cos, alpaka::math::ConceptMathCos, Cos )
+
+    //! Computes the tangent (measured in radians).
+    CUPLA_UNARY_MATH_FN( tan, alpaka::math::ConceptMathTan, Tan )
+
+    //! Computes the principal value of the arc sine.
+    CUPLA_UNARY_MATH_FN( asin, alpaka::math::ConceptMathAsin, Asin )
+
+    //! Computes the principal value of the arc cosine.
+    CUPLA_UNARY_MATH_FN( acos, alpaka::math::ConceptMathAcos, Acos )
+
+    //! Computes the principal value of the arc tangent.
+    CUPLA_UNARY_MATH_FN( atan, alpaka::math::ConceptMathAtan, Atan )
+
+    //! Computes the arc tangent of y/x using the signs of arguments to determine the correct quadrant.
+    CUPLA_BINARY_MATH_FN( atan2, alpaka::math::ConceptMathAtan2, Atan2 )
+
+} // namespace math
+} // namespace device
+} // namespace CUPLA_ACCELERATOR_NAMESPACE
+} // namespace cupla
diff --git a/thirdParty/cupla/include/cupla/device_functions.hpp b/thirdParty/cupla/include/cupla/device_functions.hpp
new file mode 100644
index 0000000000..fe164900ec
--- /dev/null
+++ b/thirdParty/cupla/include/cupla/device_functions.hpp
@@ -0,0 +1,28 @@
+/* Copyright 2020 Rene Widera
+ *
+ * This file is part of cupla.
+ *
+ * cupla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * cupla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with cupla.
+ * If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+
+#pragma once
+
+#include "cupla/device/Synchronization.hpp"
+#include "cupla/device/Index.hpp"
+#include "cupla/device/Atomic.hpp"
+#include "cupla/device/SharedMemory.hpp"
+#include "cupla/device/math.hpp"
diff --git a/thirdParty/cupla/include/cupla/kernel.hpp b/thirdParty/cupla/include/cupla/kernel.hpp
index d0d27506f7..fdd92ec161 100644
--- a/thirdParty/cupla/include/cupla/kernel.hpp
+++ b/thirdParty/cupla/include/cupla/kernel.hpp
@@ -118,7 +118,7 @@ inline namespace CUPLA_ACCELERATOR_NAMESPACE
         template< typename... T_Args >
         void operator()( T_Args && ... args ) const
         {
-            ::alpaka::workdiv::WorkDivMembers<
+            ::alpaka::WorkDivMembers<
               KernelDim,
               IdxType
             > workDiv(
@@ -127,7 +127,7 @@ inline namespace CUPLA_ACCELERATOR_NAMESPACE
                 m_elemSize
             );
             auto const exec(
-                ::alpaka::kernel::createTaskKernel< T_Acc >(
+                ::alpaka::createTaskKernel< T_Acc >(
                     workDiv,
                     CuplaKernel< T_KernelType >{ m_dynSharedMemSize },
                     std::forward< T_Args >( args )...
@@ -138,7 +138,7 @@ inline namespace CUPLA_ACCELERATOR_NAMESPACE
                 cupla::AccStream
             >::get().stream( m_stream );
 
-            ::alpaka::queue::enqueue(stream, exec);
+            ::alpaka::enqueue(stream, exec);
         }
     };
 
@@ -172,6 +172,10 @@ inline namespace CUPLA_ACCELERATOR_NAMESPACE
     /* Kernel configuration interface with element support
      *
      * The kernel must support the alpaka element layer.
+     *
+     * Swap the blockSize and the elemSize depending on the activated accelerator.
+     * This means that on some devices the blockSize is set to one ( dim3(1,1,1) )
+     * and the elemSize is set to the user defined blockSize
      */
     template<
         typename T_KernelType
@@ -201,11 +205,8 @@ inline namespace CUPLA_ACCELERATOR_NAMESPACE
     };
 
     /** Kernel configuration interface with element support
-     * The kernel must support the alpaka element level
      *
-     * Swap the blockSize and the elemSize depending on the activated accelerator.
-     * This mean that in some devices the blockSize is set to one ( dim3(1,1,1) )
-     * and the elemSize is set to the user defined blockSize
+     * The kernel must support the alpaka element level
      */
     template<
         typename T_KernelType
@@ -237,8 +238,6 @@ inline namespace CUPLA_ACCELERATOR_NAMESPACE
 
 namespace alpaka
 {
-namespace kernel
-{
 namespace traits
 {
     //! CuplaKernel has defined the extern shared memory as member
@@ -259,13 +258,12 @@ namespace traits
         getBlockSharedMemDynSizeBytes(
             ::cupla::CuplaKernel< T_UserKernel > const & userKernel,
             TArgs const & ...)
-        -> ::alpaka::idx::Idx<T_Acc>
+        -> ::alpaka::Idx<T_Acc>
         {
             return userKernel.m_dynSharedMemBytes;
         }
     };
 } // namespace traits
-} // namespace kernel
 } // namespace alpaka
 
 
diff --git a/thirdParty/cupla/include/cupla/manager/Device.hpp b/thirdParty/cupla/include/cupla/manager/Device.hpp
index 0a3c06c95f..3476e6635c 100644
--- a/thirdParty/cupla/include/cupla/manager/Device.hpp
+++ b/thirdParty/cupla/include/cupla/manager/Device.hpp
@@ -75,7 +75,7 @@ namespace manager
             }
             else
             {
-                using Pltf = ::alpaka::pltf::Pltf< DeviceType >;
+                using Pltf = ::alpaka::Pltf< DeviceType >;
 
                 const int numDevices = count();
                 if( idx >= numDevices )
@@ -98,7 +98,7 @@ namespace manager
                      */
                     dev.reset(
                         new DeviceType(
-                            alpaka::pltf::getDevByIdx<
+                            alpaka::getDevByIdx<
                                 Pltf
                             >( idx )
                         )
@@ -127,7 +127,7 @@ namespace manager
          */
         bool reset()
         {
-            ::alpaka::dev::reset( this->current( ) );
+            ::alpaka::reset( this->current( ) );
             auto iter = m_map.find( this->id( ) );
 
             if( iter == m_map.end() )
@@ -162,8 +162,8 @@ namespace manager
         count()
         -> int
         {
-            using Pltf = ::alpaka::pltf::Pltf< DeviceType >;
-            return static_cast< int >( ::alpaka::pltf::getDevCount< Pltf >( ) );
+            using Pltf = ::alpaka::Pltf< DeviceType >;
+            return static_cast< int >( ::alpaka::getDevCount< Pltf >( ) );
         }
 
     protected:
diff --git a/thirdParty/cupla/include/cupla/manager/Event.hpp b/thirdParty/cupla/include/cupla/manager/Event.hpp
index 20212fe015..8d3ee193ec 100644
--- a/thirdParty/cupla/include/cupla/manager/Event.hpp
+++ b/thirdParty/cupla/include/cupla/manager/Event.hpp
@@ -57,7 +57,7 @@ namespace detail
         TimePoint time;
 
     public:
-        using AlpakaEvent = ::alpaka::event::Event< T_QueueType >;
+        using AlpakaEvent = ::alpaka::Event< T_QueueType >;
         std::unique_ptr< AlpakaEvent > event;
 
         EmulatedEvent( uint32_t flags ) :
@@ -83,10 +83,10 @@ namespace detail
 
         void record( T_QueueType & stream )
         {
-            ::alpaka::queue::enqueue( stream, *event );
+            ::alpaka::enqueue( stream, *event );
             if( hasTimer )
             {
-                ::alpaka::wait::wait( *event );
+                ::alpaka::wait( *event );
                 time = std::chrono::high_resolution_clock::now();
             }
         }
diff --git a/thirdParty/cupla/include/cupla/manager/Memory.hpp b/thirdParty/cupla/include/cupla/manager/Memory.hpp
index fe1fd94ddf..1d3edc64a7 100644
--- a/thirdParty/cupla/include/cupla/manager/Memory.hpp
+++ b/thirdParty/cupla/include/cupla/manager/Memory.hpp
@@ -46,7 +46,7 @@ namespace manager
         using DeviceType = T_DeviceType;
         static constexpr uint32_t dim = T_Dim::value;
 
-        using BufType = ::alpaka::mem::buf::Buf<
+        using BufType = ::alpaka::Buf<
             DeviceType,
             uint8_t,
             T_Dim,
@@ -86,15 +86,15 @@ namespace manager
                 BufType
             > bufPtr(
                 new BufType(
-                    ::alpaka::mem::buf::alloc<uint8_t, MemSizeType>(
-                     device.current(),
-                     extent
+                    ::alpaka::allocBuf<uint8_t, MemSizeType>(
+                         device.current(),
+                         extent
                     )
                 )
             );
 
 
-            uint8_t *nativePtr = ::alpaka::mem::view::getPtrNative(*bufPtr);
+            uint8_t *nativePtr = ::alpaka::getPtrNative(*bufPtr);
             m_mapVector[ device.id() ].insert(
                 std::make_pair( nativePtr, std::move( bufPtr ) )
             );
diff --git a/thirdParty/cupla/include/cupla/manager/Stream.hpp b/thirdParty/cupla/include/cupla/manager/Stream.hpp
index 96e9090900..43fc4d3324 100644
--- a/thirdParty/cupla/include/cupla/manager/Stream.hpp
+++ b/thirdParty/cupla/include/cupla/manager/Stream.hpp
@@ -69,23 +69,7 @@ namespace manager
         create( )
         -> cuplaStream_t
         {
-
-            auto& device = Device< DeviceType >::get();
-
-            std::unique_ptr<
-                QueueType
-            > streamPtr(
-                new QueueType(
-                    device.current()
-                )
-            );
-            cuplaStream_t streamId = reinterpret_cast< cuplaStream_t >(
-                m_id++
-            );
-            m_mapVector[ device.id() ].insert(
-                std::make_pair( streamId, std::move( streamPtr ) )
-            );
-            return streamId;
+            return createNewStream(reinterpret_cast< cuplaStream_t >(m_id++));
         }
 
         auto
@@ -102,7 +86,7 @@ namespace manager
             {
                 if( streamId == 0 )
                 {
-                    this->create( );
+                    createNewStream( streamId );
                     return this->stream( streamId );
                 }
                 else
@@ -153,8 +137,6 @@ namespace manager
             const auto deviceId = device.id();
 
             m_mapVector[ deviceId ].clear( );
-            // reset id to allow that this instance can be reused
-            m_id = 0u;
 
             // @todo: check if clear creates errors
             return true;
@@ -165,8 +147,26 @@ namespace manager
         {
         }
 
-        //! unique if for the next stream
-        size_t m_id = 0u;
+        auto
+        createNewStream( cuplaStream_t streamId  )
+        -> cuplaStream_t
+        {
+
+            auto& device = Device< DeviceType >::get();
+
+            auto streamPtr = std::make_unique< QueueType >( device.current() );
+            m_mapVector[ device.id() ].insert(
+                std::make_pair( streamId, std::move( streamPtr ) )
+            );
+            return streamId;
+        }
+
+        /** unique id for the next stream
+         *
+         * The enumeration starts with id one. Id zero is reserved
+         * for the default stream.
+         */
+        size_t m_id = 1u;
 
     };
 
diff --git a/thirdParty/cupla/include/cupla/traits/IsThreadSeqAcc.hpp b/thirdParty/cupla/include/cupla/traits/IsThreadSeqAcc.hpp
index 3d6b894802..5bced67679 100644
--- a/thirdParty/cupla/include/cupla/traits/IsThreadSeqAcc.hpp
+++ b/thirdParty/cupla/include/cupla/traits/IsThreadSeqAcc.hpp
@@ -49,7 +49,7 @@ namespace traits
         typename T_IndexType
     >
     struct IsThreadSeqAcc<
-        ::alpaka::acc::AccCpuOmp2Blocks<
+        ::alpaka::AccCpuOmp2Blocks<
             T_KernelDim,
             T_IndexType
         >
@@ -65,7 +65,7 @@ namespace traits
         typename T_IndexType
     >
     struct IsThreadSeqAcc<
-        ::alpaka::acc::AccCpuSerial<
+        ::alpaka::AccCpuSerial<
             T_KernelDim,
             T_IndexType
         >
@@ -81,7 +81,7 @@ namespace traits
         typename T_IndexType
     >
     struct IsThreadSeqAcc<
-        ::alpaka::acc::AccCpuTbbBlocks<
+        ::alpaka::AccCpuTbbBlocks<
             T_KernelDim,
             T_IndexType
         >
diff --git a/thirdParty/cupla/include/cupla/types.hpp b/thirdParty/cupla/include/cupla/types.hpp
index 5d2db88f6f..013a58e262 100644
--- a/thirdParty/cupla/include/cupla/types.hpp
+++ b/thirdParty/cupla/include/cupla/types.hpp
@@ -39,11 +39,11 @@ inline namespace CUPLA_ACCELERATOR_NAMESPACE
     template<
         uint32_t T_dim
     >
-    using AlpakaDim = ::alpaka::dim::DimInt< T_dim >;
+    using AlpakaDim = ::alpaka::DimInt< T_dim >;
 
     using KernelDim = AlpakaDim< Dimensions >;
 
-    using IdxVec3 = ::alpaka::vec::Vec<
+    using IdxVec3 = ::alpaka::Vec<
         KernelDim,
         IdxType
     >;
@@ -51,13 +51,13 @@ inline namespace CUPLA_ACCELERATOR_NAMESPACE
     template<
         uint32_t T_dim
     >
-    using MemVec = ::alpaka::vec::Vec<
+    using MemVec = ::alpaka::Vec<
         AlpakaDim< T_dim >,
         MemSizeType
     >;
 
-    using AccHost = ::alpaka::dev::DevCpu;
-    using AccHostStream = ::alpaka::queue::QueueCpuBlocking;
+    using AccHost = ::alpaka::DevCpu;
+    using AccHostStream = ::alpaka::QueueCpuBlocking;
 
 #if defined(ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLED) ||                            \
     defined(ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED) ||                         \
@@ -66,15 +66,15 @@ inline namespace CUPLA_ACCELERATOR_NAMESPACE
     defined(ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED) ||                             \
     defined(ALPAKA_ACC_CPU_BT_OMP4_ENABLED)
 
-    using AccDev = ::alpaka::dev::DevCpu;
+    using AccDev = ::alpaka::DevCpu;
 #   if (CUPLA_STREAM_ASYNC_ENABLED == 1)
-        using AccStream = ::alpaka::queue::QueueCpuNonBlocking;
+        using AccStream = ::alpaka::QueueCpuNonBlocking;
 #   else
-        using AccStream = ::alpaka::queue::QueueCpuBlocking;
+        using AccStream = ::alpaka::QueueCpuBlocking;
 #   endif
 
 #ifdef ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLED
-    using Acc = ::alpaka::acc::AccCpuOmp2Threads<
+    using Acc = ::alpaka::AccCpuOmp2Threads<
         KernelDim,
         IdxType
     >;
@@ -82,12 +82,12 @@ inline namespace CUPLA_ACCELERATOR_NAMESPACE
 
 #if (ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLED == 1)
     #if (CUPLA_NUM_SELECTED_DEVICES == 1)
-        using Acc = ::alpaka::acc::AccCpuOmp2Blocks<
+        using Acc = ::alpaka::AccCpuOmp2Blocks<
             KernelDim,
             IdxType
         >;
     #else
-        using AccThreadSeq = ::alpaka::acc::AccCpuOmp2Blocks<
+        using AccThreadSeq = ::alpaka::AccCpuOmp2Blocks<
             KernelDim,
             IdxType
         >;
@@ -95,7 +95,7 @@ inline namespace CUPLA_ACCELERATOR_NAMESPACE
 #endif
 
 #ifdef ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED
-    using Acc = ::alpaka::acc::AccCpuThreads<
+    using Acc = ::alpaka::AccCpuThreads<
         KernelDim,
         IdxType
     >;
@@ -103,12 +103,12 @@ inline namespace CUPLA_ACCELERATOR_NAMESPACE
 
 #ifdef ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED
     #if (CUPLA_NUM_SELECTED_DEVICES == 1)
-        using Acc = ::alpaka::acc::AccCpuSerial<
+        using Acc = ::alpaka::AccCpuSerial<
             KernelDim,
             IdxType
         >;
     #else
-        using AccThreadSeq = ::alpaka::acc::AccCpuSerial<
+        using AccThreadSeq = ::alpaka::AccCpuSerial<
             KernelDim,
             IdxType
         >;
@@ -117,12 +117,12 @@ inline namespace CUPLA_ACCELERATOR_NAMESPACE
 
 #if (ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED == 1)
     #if (CUPLA_NUM_SELECTED_DEVICES == 1)
-        using Acc = ::alpaka::acc::AccCpuTbbBlocks<
+        using Acc = ::alpaka::AccCpuTbbBlocks<
             KernelDim,
             IdxType
         >;
     #else
-        using AccThreadSeq = ::alpaka::acc::AccCpuTbbBlocks<
+        using AccThreadSeq = ::alpaka::AccCpuTbbBlocks<
             KernelDim,
             IdxType
         >;
@@ -130,7 +130,7 @@ inline namespace CUPLA_ACCELERATOR_NAMESPACE
 #endif
 
 #ifdef ALPAKA_ACC_CPU_BT_OMP4_ENABLED
-    using Acc = ::alpaka::acc::AccCpuOmp4<
+    using Acc = ::alpaka::AccCpuOmp4<
         KernelDim,
         IdxType
     >;
@@ -140,26 +140,26 @@ inline namespace CUPLA_ACCELERATOR_NAMESPACE
 
 
 #ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
-    using AccDev = ::alpaka::dev::DevCudaRt;
+    using AccDev = ::alpaka::DevCudaRt;
 #   if (CUPLA_STREAM_ASYNC_ENABLED == 1)
-        using AccStream = ::alpaka::queue::QueueCudaRtNonBlocking;
+        using AccStream = ::alpaka::QueueCudaRtNonBlocking;
 #   else
-        using AccStream = ::alpaka::queue::QueueCudaRtBlocking;
+        using AccStream = ::alpaka::QueueCudaRtBlocking;
 #   endif
-    using Acc = ::alpaka::acc::AccGpuCudaRt<
+    using Acc = ::alpaka::AccGpuCudaRt<
         KernelDim,
         IdxType
     >;
 #endif
 
 #ifdef ALPAKA_ACC_GPU_HIP_ENABLED
-    using AccDev = ::alpaka::dev::DevHipRt;
+    using AccDev = ::alpaka::DevHipRt;
 #   if (CUPLA_STREAM_ASYNC_ENABLED == 1)
-        using AccStream = ::alpaka::queue::QueueHipRtNonBlocking;
+        using AccStream = ::alpaka::QueueHipRtNonBlocking;
 #   else
-        using AccStream = ::alpaka::queue::QueueHipRtBlocking;
+        using AccStream = ::alpaka::QueueHipRtBlocking;
 #   endif
-    using Acc = ::alpaka::acc::AccGpuHipRt<
+    using Acc = ::alpaka::AccGpuHipRt<
         KernelDim,
         IdxType
     >;
@@ -177,7 +177,7 @@ inline namespace CUPLA_ACCELERATOR_NAMESPACE
     template<
         uint32_t T_dim
     >
-    using AccBuf = ::alpaka::mem::buf::Buf<
+    using AccBuf = ::alpaka::Buf<
         AccDev,
         uint8_t,
         AlpakaDim< T_dim >,
@@ -187,7 +187,7 @@ inline namespace CUPLA_ACCELERATOR_NAMESPACE
     template<
         uint32_t T_dim
     >
-    using HostBuf = ::alpaka::mem::buf::Buf<
+    using HostBuf = ::alpaka::Buf<
         AccHost,
         uint8_t,
         AlpakaDim< T_dim >,
@@ -198,7 +198,7 @@ inline namespace CUPLA_ACCELERATOR_NAMESPACE
         unsigned T_dim
     >
     using HostBufWrapper =
-        ::alpaka::mem::view::ViewPlainPtr<
+        ::alpaka::ViewPlainPtr<
             AccHost,
             uint8_t,
             AlpakaDim< T_dim >,
@@ -209,7 +209,7 @@ inline namespace CUPLA_ACCELERATOR_NAMESPACE
         unsigned T_dim
     >
     using HostViewWrapper =
-        ::alpaka::mem::view::ViewSubView<
+        ::alpaka::ViewSubView<
             AccHost,
             uint8_t,
             AlpakaDim< T_dim >,
@@ -220,7 +220,7 @@ inline namespace CUPLA_ACCELERATOR_NAMESPACE
         unsigned T_dim
     >
     using DeviceBufWrapper =
-        ::alpaka::mem::view::ViewPlainPtr<
+        ::alpaka::ViewPlainPtr<
             AccDev,
             uint8_t,
             AlpakaDim< T_dim >,
@@ -231,7 +231,7 @@ inline namespace CUPLA_ACCELERATOR_NAMESPACE
         unsigned T_dim
     >
     using DeviceViewWrapper =
-        ::alpaka::mem::view::ViewSubView<
+        ::alpaka::ViewSubView<
             AccDev,
             uint8_t,
             AlpakaDim< T_dim >,
diff --git a/thirdParty/cupla/include/cupla/version.hpp b/thirdParty/cupla/include/cupla/version.hpp
index 187ee9d8e9..b337ddf5c6 100644
--- a/thirdParty/cupla/include/cupla/version.hpp
+++ b/thirdParty/cupla/include/cupla/version.hpp
@@ -22,6 +22,6 @@
 
 // Please also update the version in `cuplaConfig.cmake`
 #define CUPLA_VERSION_MAJOR 0
-#define CUPLA_VERSION_MINOR 2
+#define CUPLA_VERSION_MINOR 3
 #define CUPLA_VERSION_PATCH 0
-#define CUPLA_VERSION_LABEL ""
+#define CUPLA_VERSION_LABEL "dev"
diff --git a/thirdParty/cupla/script/compiler_base.yml b/thirdParty/cupla/script/compiler_base.yml
new file mode 100644
index 0000000000..8e3e676fc4
--- /dev/null
+++ b/thirdParty/cupla/script/compiler_base.yml
@@ -0,0 +1,73 @@
+.base_gcc:
+  image: registry.gitlab.com/hzdr/crp/alpaka-group-container/alpaka-ci-gcc:1.1
+  variables:
+    GIT_SUBMODULE_STRATEGY: normal
+    ALPAKA_ACCS: "ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLE
+                  ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE
+                  ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE"
+                  # ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLE
+  script:
+    - source script/run_test.sh
+  # x86_64 tag is used to get a multi-core CPU for the tests
+  tags:
+    - x86_64
+
+.base_clang:
+  image: registry.gitlab.com/hzdr/crp/alpaka-group-container/alpaka-ci-clang:1.1
+  variables:
+    GIT_SUBMODULE_STRATEGY: normal
+    ALPAKA_ACCS: "ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLE
+                  ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE"
+                  # ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE
+                  # ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLE
+  script:
+      - source script/run_test.sh
+  # x86_64 tag is used to get a multi-core CPU for the tests
+  tags:
+    - x86_64
+
+.base_cuda:
+  variables:
+    GIT_SUBMODULE_STRATEGY: normal
+    CUPLA_CXX: g++
+    ALPAKA_ACCS: "ALPAKA_ACC_GPU_CUDA_ENABLE"
+  before_script:
+    - nvidia-smi
+    - nvcc --version
+  script:
+      - source script/run_test.sh
+  tags:
+    - cuda
+    - intel
+
+.base_cuda_clang:
+  variables:
+    GIT_SUBMODULE_STRATEGY: normal
+    ALPAKA_ACCS: "ALPAKA_ACC_GPU_CUDA_ENABLE"
+    CUPLA_CMAKE_ARGS: "-DALPAKA_CUDA_COMPILER=clang"
+  before_script:
+    - nvidia-smi
+    - nvcc --version
+  script:
+      - source script/run_test.sh
+  tags:
+    - cuda
+    - intel
+
+.base_hip:
+  variables:
+    GIT_SUBMODULE_STRATEGY: normal
+    CUPLA_CXX: "hipcc"
+    ALPAKA_ACCS: "ALPAKA_ACC_GPU_HIP_ENABLE"
+    # architecture of the Vega 64
+    CUPLA_CMAKE_ARGS: "-DALPAKA_HIP_ARCH=900"
+  before_script:
+    - rocm-smi
+    - hipcc --version
+  script:
+    # use Vega 64 of the CI node
+    - export HIP_VISIBLE_DEVICES=2
+    - source script/run_test.sh
+  tags:
+    - amd
+    - rocm
diff --git a/thirdParty/cupla/script/run_test.sh b/thirdParty/cupla/script/run_test.sh
new file mode 100755
index 0000000000..55457abfd8
--- /dev/null
+++ b/thirdParty/cupla/script/run_test.sh
@@ -0,0 +1,91 @@
+#!/bin/bash
+
+# the default build type is Release
+# if necessary, you can rerun the pipeline with another build type -> https://docs.gitlab.com/ee/ci/pipelines.html#manually-executing-pipelines
+# to change the build type, you must set the environment variable CUPLA_BUILD_TYPE
+
+if [[ ! -v CUPLA_BUILD_TYPE ]] ; then
+    CUPLA_BUILD_TYPE=Release ;
+fi
+
+###################################################
+# cmake config builder
+###################################################
+
+CUPLA_CONST_ARGS=""
+CUPLA_CONST_ARGS="${CUPLA_CONST_ARGS} -DCMAKE_BUILD_TYPE=${CUPLA_BUILD_TYPE}"
+CUPLA_CONST_ARGS="${CUPLA_CONST_ARGS} ${CUPLA_CMAKE_ARGS}"
+
+CMAKE_CONFIGS=()
+for CXX_VERSION in $CUPLA_CXX; do
+    for BOOST_VERSION in ${CUPLA_BOOST_VERSIONS}; do
+	for ACC in ${ALPAKA_ACCS}; do
+	    CMAKE_CONFIGS+=("${CUPLA_CONST_ARGS} -DCMAKE_CXX_COMPILER=${CXX_VERSION} -DBOOST_ROOT=/opt/boost/${BOOST_VERSION} -D${ACC}=ON")
+	done
+    done
+done
+
+###################################################
+# build and run tests
+###################################################
+
+# use one build directory for all build configurations
+mkdir build
+cd build
+
+export cupla_DIR=$CI_PROJECT_DIR
+
+# ALPAKA_ACCS contains the backends, which are used for each build
+# the backends are set in the specialized base jobs .base_gcc, .base_clang and .base_cuda
+for CONFIG in $(seq 0 $((${#CMAKE_CONFIGS[*]} - 1))); do
+    CMAKE_ARGS=${CMAKE_CONFIGS[$CONFIG]}
+    echo -e "\033[0;32m///////////////////////////////////////////////////"
+    echo "number of processor threads -> $(nproc)"
+    cmake --version | head -n 1
+    echo "CMAKE_ARGS -> ${CMAKE_ARGS}"
+    echo -e "/////////////////////////////////////////////////// \033[0m \n\n"
+
+    echo "###################################################"
+    echo "# Example Matrix Multiplication (adapted original)"
+    echo "###################################################"
+    echo "can not run with CPU_B_SEQ_T_SEQ due to missing elements layer in original SDK example"
+    echo "CPU_B_SEQ_T_OMP2/THREADS too many threads necessary (256)"
+    if [[ $CMAKE_ARGS =~ -*DALPAKA_ACC_GPU_CUDA_ENABLE=ON.* ]]; then
+        cmake $cupla_DIR/example/CUDASamples/matrixMul/ \
+	      $CMAKE_ARGS
+        make -j
+        time ./matrixMul -wA=64 -wB=64 -hA=64 -hB=64
+        rm -r * ;
+    fi
+
+    echo "###################################################"
+    echo "# Example Async API (adapted original)"
+    echo "###################################################"
+    echo "can not run with CPU_B_SEQ_T_SEQ due to missing elements layer in original SDK example"
+    echo "CPU_B_SEQ_T_OMP2/THREADS too many threads necessary (512)"
+    if [[ $CMAKE_ARGS =~ -*DALPAKA_ACC_GPU_CUDA_ENABLE=ON.* ]]; then
+        cmake $cupla_DIR/example/CUDASamples/asyncAPI/ \
+	      $CMAKE_ARGS
+        make -j
+        time ./asyncAPI
+        rm -r * ;
+    fi
+
+    echo "###################################################"
+    echo "# Example Async API (added elements layer)"
+    echo "###################################################"
+    cmake $cupla_DIR/example/CUDASamples/asyncAPI_tuned/ \
+	  $CMAKE_ARGS
+    make -j
+    time ./asyncAPI_tuned
+    rm -r *
+
+    echo "###################################################"
+    echo "Example vectorAdd (added elements layer)"
+    echo "###################################################"
+    cmake $cupla_DIR/example/CUDASamples/vectorAdd/ \
+	  $CMAKE_ARGS
+    make -j
+    time ./vectorAdd 100000
+    rm -r * ;
+done
diff --git a/thirdParty/cupla/script/run_test.yml b/thirdParty/cupla/script/run_test.yml
new file mode 100644
index 0000000000..223a241f79
--- /dev/null
+++ b/thirdParty/cupla/script/run_test.yml
@@ -0,0 +1,57 @@
+.test_job:
+  script:
+    # the default build type is Release
+    # if necessary, you can rerun the pipeline with another build type -> https://docs.gitlab.com/ee/ci/pipelines.html#manually-executing-pipelines
+    # to change the build type, you must set the environment variable CUPLA_BUILD_TYPE
+    - if [[ ! -v CUPLA_BUILD_TYPE ]] ; then
+        CUPLA_BUILD_TYPE=Release ;
+      fi
+    - echo "number of processor threads $(nproc)"
+    - $CXX --version
+    - cmake --version
+    - echo "Boost version-> $BOOST_VERSION"
+    - export cupla_DIR=$CI_PROJECT_DIR
+    # use one build directory for all build configurations
+    - mkdir build
+    - cd build
+    - echo "Build type-> $CUPLA_BUILD_TYPE"
+    # ALPAKA_ACCS contains the backends, which are used for each build
+    # the backends are set in the specialized base jobs .base_gcc, .base_clang and .base_cuda
+    - for CMAKE_FLAGS in $ALPAKA_ACCS ; do
+        echo "###################################################"
+        && echo "# Example Matrix Multiplication (adapted original)"
+        && echo "###################################################"
+        && echo "can not run with CPU_B_SEQ_T_SEQ due to missing elements layer in original SDK example"
+        && echo "CPU_B_SEQ_T_OMP2/THREADS too many threads necessary (256)"
+        && if [[ $CMAKE_FLAGS =~ -*DALPAKA_ACC_GPU_CUDA_ENABLE=ON.* ]]; then
+          cmake $cupla_DIR/example/CUDASamples/matrixMul/ -DBOOST_ROOT=/opt/boost/$BOOST_VERSION $CMAKE_FLAGS -DCMAKE_BUILD_TYPE=$CUPLA_BUILD_TYPE
+          && make -j
+          && time ./matrixMul -wA=64 -wB=64 -hA=64 -hB=64
+          && rm -r * ;
+        fi
+        && echo "###################################################"
+        && echo "# Example Async API (adapted original)"
+        && echo "###################################################"
+        && echo "can not run with CPU_B_SEQ_T_SEQ due to missing elements layer in original SDK example"
+        && echo "CPU_B_SEQ_T_OMP2/THREADS too many threads necessary (512)"
+        && if [[ $CMAKE_FLAGS =~ -*DALPAKA_ACC_GPU_CUDA_ENABLE=ON.* ]]; then
+          cmake $cupla_DIR/example/CUDASamples/asyncAPI/ -DBOOST_ROOT=/opt/boost/$BOOST_VERSION $CMAKE_FLAGS -DCMAKE_BUILD_TYPE=$CUPLA_BUILD_TYPE
+          && make -j
+          && time ./asyncAPI
+          && rm -r * ;
+        fi
+        && echo "###################################################"
+        && echo "# Example Async API (added elements layer)"
+        && echo "###################################################"
+        && cmake $cupla_DIR/example/CUDASamples/asyncAPI_tuned/ -DBOOST_ROOT=/opt/boost/$BOOST_VERSION $CMAKE_FLAGS -DCMAKE_BUILD_TYPE=$CUPLA_BUILD_TYPE
+        && make -j
+        && time ./asyncAPI_tuned
+        && rm -r *
+        && echo "###################################################"
+        && echo "Example vectorAdd (added elements layer)"
+        && echo "###################################################"
+        && cmake $cupla_DIR/example/CUDASamples/vectorAdd/ -DBOOST_ROOT=/opt/boost/$BOOST_VERSION $CMAKE_FLAGS -DCMAKE_BUILD_TYPE=$CUPLA_BUILD_TYPE
+        && make -j
+        && time ./vectorAdd 100000
+        && rm -r * ;
+      done
diff --git a/thirdParty/cupla/src/device.cpp b/thirdParty/cupla/src/device.cpp
index 4a06ea6ee7..e2127784f9 100644
--- a/thirdParty/cupla/src/device.cpp
+++ b/thirdParty/cupla/src/device.cpp
@@ -105,7 +105,7 @@ CUPLA_HEADER_ONLY_FUNC_SPEC
 cuplaError_t
 cuplaDeviceSynchronize( )
 {
-    ::alpaka::wait::wait(
+    ::alpaka::wait(
         cupla::manager::Device< cupla::AccDev >::get( ).current( )
     );
     return cuplaSuccess;
@@ -123,8 +123,8 @@ cuplaMemGetInfo(
             cupla::AccDev
         >::get().current()
     );
-    *total = ::alpaka::dev::getMemBytes( device );
-    *free = ::alpaka::dev::getFreeMemBytes( device );
+    *total = ::alpaka::getMemBytes( device );
+    *free = ::alpaka::getFreeMemBytes( device );
     return cuplaSuccess;
 }
 
diff --git a/thirdParty/cupla/src/event.cpp b/thirdParty/cupla/src/event.cpp
index e4abb1c811..9cbf779c2a 100644
--- a/thirdParty/cupla/src/event.cpp
+++ b/thirdParty/cupla/src/event.cpp
@@ -43,7 +43,7 @@ cuplaEventCreateWithFlags(
     >::get().create( flags );
 
     return cuplaSuccess;
-};
+}
 
 
 CUPLA_HEADER_ONLY_FUNC_SPEC
@@ -58,7 +58,7 @@ cuplaEventCreate(
     >::get().create( 0 );
 
     return cuplaSuccess;
-};
+}
 
 CUPLA_HEADER_ONLY_FUNC_SPEC
 cuplaError_t
@@ -73,7 +73,7 @@ cuplaEventDestroy( cuplaEvent_t event )
         return cuplaSuccess;
     else
         return cuplaErrorInitializationError;
-};
+}
 
 CUPLA_HEADER_ONLY_FUNC_SPEC
 cuplaError_t
@@ -125,7 +125,7 @@ cuplaEventSynchronize(
         cupla::AccDev,
         cupla::AccStream
     >::get().event( event );
-    ::alpaka::wait::wait( *eventObject );
+    ::alpaka::wait( *eventObject );
     return cuplaSuccess;
 }
 
@@ -138,7 +138,7 @@ cuplaEventQuery( cuplaEvent_t event )
         cupla::AccStream
     >::get().event( event );
 
-    if( ::alpaka::event::test( *eventObject ) )
+    if( ::alpaka::isComplete( *eventObject ) )
     {
         return cuplaSuccess;
     }
diff --git a/thirdParty/cupla/src/memory.cpp b/thirdParty/cupla/src/memory.cpp
index 07486beeae..77a8fc56f3 100644
--- a/thirdParty/cupla/src/memory.cpp
+++ b/thirdParty/cupla/src/memory.cpp
@@ -38,7 +38,7 @@ cuplaMalloc(
 )
 {
 
-    const ::alpaka::vec::Vec<
+    const ::alpaka::Vec<
         cupla::AlpakaDim<1u>,
         cupla::MemSizeType
     > extent( size );
@@ -49,7 +49,7 @@ cuplaMalloc(
     >::get().alloc( extent );
 
     // @toto catch errors
-    *ptrptr = ::alpaka::mem::view::getPtrNative(buf);
+    *ptrptr = ::alpaka::getPtrNative(buf);
     return cuplaSuccess;
 }
 
@@ -62,7 +62,7 @@ cuplaMallocPitch(
     size_t const height
 )
 {
-    const ::alpaka::vec::Vec<
+    const ::alpaka::Vec<
         cupla::AlpakaDim< 2u >,
         cupla::MemSizeType
     > extent( height, width );
@@ -73,11 +73,11 @@ cuplaMallocPitch(
     >::get().alloc( extent );
 
     // @toto catch errors
-    *devPtr = ::alpaka::mem::view::getPtrNative(buf);
-    *pitch = ::alpaka::mem::view::getPitchBytes< 1u >( buf );
+    *devPtr = ::alpaka::getPtrNative(buf);
+    *pitch = ::alpaka::getPitchBytes< 1u >( buf );
 
     return cuplaSuccess;
-};
+}
 
 CUPLA_HEADER_ONLY_FUNC_SPEC
 cuplaError_t
@@ -94,8 +94,8 @@ cuplaMalloc3D(
 
     // @toto catch errors
     *pitchedDevPtr = make_cuplaPitchedPtr(
-        ::alpaka::mem::view::getPtrNative(buf),
-        ::alpaka::mem::view::getPitchBytes< 2u >( buf ),
+        ::alpaka::getPtrNative(buf),
+        ::alpaka::getPitchBytes< 2u >( buf ),
         extent.width,
         extent.height
     );
@@ -144,7 +144,7 @@ cuplaMallocHost(
     size_t size
 )
 {
-    const ::alpaka::vec::Vec<
+    const ::alpaka::Vec<
         cupla::AlpakaDim<1u>,
         cupla::MemSizeType
     > extent( size );
@@ -157,7 +157,7 @@ cuplaMallocHost(
     prepareForAsyncCopy( buf );
 
     // @toto catch errors
-    *ptrptr = ::alpaka::mem::view::getPtrNative(buf);
+    *ptrptr = ::alpaka::getPtrNative(buf);
     return cuplaSuccess;
 }
 
@@ -216,7 +216,7 @@ cuplaError_t cuplaMemcpyAsync(
     cuplaStream_t stream
 )
 {
-    const ::alpaka::vec::Vec<
+    const ::alpaka::Vec<
         cupla::AlpakaDim<1u>,
         cupla::MemSizeType
     > numBytes(count);
@@ -259,7 +259,7 @@ cuplaError_t cuplaMemcpyAsync(
                 numBytes
             );
 
-            ::alpaka::mem::view::copy(
+            ::alpaka::memcpy(
                 streamObject,
                 dBuf,
                 hBuf,
@@ -290,7 +290,7 @@ cuplaError_t cuplaMemcpyAsync(
                 numBytes
             );
 
-            ::alpaka::mem::view::copy(
+            ::alpaka::memcpy(
                 streamObject,
                 hBuf,
                 dBuf,
@@ -316,7 +316,7 @@ cuplaError_t cuplaMemcpyAsync(
                 numBytes
             );
 
-            ::alpaka::mem::view::copy(
+            ::alpaka::memcpy(
                 streamObject,
                 dDestBuf,
                 dSrcBuf,
@@ -353,7 +353,7 @@ cuplaError_t cuplaMemcpyAsync(
                 numBytes
             );
 
-            ::alpaka::mem::view::copy(
+            ::alpaka::memcpy(
                 hostStreamObject,
                 hDestBuf,
                 hSrcBuf,
@@ -391,7 +391,7 @@ cuplaMemcpy(
             cupla::AccStream
         >::get().stream( 0 )
     );
-    ::alpaka::wait::wait( streamObject );
+    ::alpaka::wait( streamObject );
 
     return cuplaSuccess;
 }
@@ -418,7 +418,7 @@ cuplaMemsetAsync(
         >::get().stream( stream )
     );
 
-    ::alpaka::vec::Vec<
+    ::alpaka::Vec<
         cupla::AlpakaDim<1u>,
         cupla::MemSizeType
     > const
@@ -431,7 +431,7 @@ cuplaMemsetAsync(
         numBytes
     );
 
-    ::alpaka::mem::view::set(
+    ::alpaka::memset(
         streamObject,
         dBuf,
         value,
@@ -464,7 +464,7 @@ cuplaMemset(
             cupla::AccStream
         >::get().stream( 0 )
     );
-    ::alpaka::wait::wait( streamObject );
+    ::alpaka::wait( streamObject );
 
     return cuplaSuccess;
 }
@@ -482,17 +482,17 @@ cuplaMemcpy2DAsync(
     cuplaStream_t const stream
 )
 {
-    const ::alpaka::vec::Vec<
+    const ::alpaka::Vec<
         cupla::AlpakaDim<2u>,
         cupla::MemSizeType
     > numBytes( height, width );
 
-    const ::alpaka::vec::Vec<
+    const ::alpaka::Vec<
         cupla::AlpakaDim<2u>,
         cupla::MemSizeType
     > dstPitch( dPitch * height , dPitch );
 
-    const ::alpaka::vec::Vec<
+    const ::alpaka::Vec<
         cupla::AlpakaDim<2u>,
         cupla::MemSizeType
     > srcPitch( sPitch * height , sPitch );
@@ -537,7 +537,7 @@ cuplaMemcpy2DAsync(
                 dstPitch
             );
 
-            ::alpaka::mem::view::copy(
+            ::alpaka::memcpy(
                 streamObject,
                 dBuf,
                 hBuf,
@@ -570,7 +570,7 @@ cuplaMemcpy2DAsync(
                 dstPitch
             );
 
-            ::alpaka::mem::view::copy(
+            ::alpaka::memcpy(
                 streamObject,
                 hBuf,
                 dBuf,
@@ -598,7 +598,7 @@ cuplaMemcpy2DAsync(
                 dstPitch
             );
 
-            ::alpaka::mem::view::copy(
+            ::alpaka::memcpy(
                 streamObject,
                 dDestBuf,
                 dSrcBuf,
@@ -637,7 +637,7 @@ cuplaMemcpy2DAsync(
                 dstPitch
             );
 
-            ::alpaka::mem::view::copy(
+            ::alpaka::memcpy(
                 hostStreamObject,
                 hDestBuf,
                 hSrcBuf,
@@ -681,7 +681,7 @@ cuplaMemcpy2D(
             cupla::AccStream
         >::get().stream( 0 )
     );
-    ::alpaka::wait::wait( streamObject );
+    ::alpaka::wait( streamObject );
 
     return cuplaSuccess;
 }
@@ -693,12 +693,12 @@ cuplaMemcpy3DAsync(
     cuplaStream_t stream
 )
 {
-    const ::alpaka::vec::Vec<
+    const ::alpaka::Vec<
         cupla::AlpakaDim<3u>,
         cupla::MemSizeType
     > numBytes( p->extent );
 
-    const ::alpaka::vec::Vec<
+    const ::alpaka::Vec<
         cupla::AlpakaDim<3u>,
         cupla::MemSizeType
     > extentSrc(
@@ -707,7 +707,7 @@ cuplaMemcpy3DAsync(
         p->srcPtr.xsize
     );
 
-    const ::alpaka::vec::Vec<
+    const ::alpaka::Vec<
         cupla::AlpakaDim<3u>,
         cupla::MemSizeType
     > extentDst(
@@ -716,7 +716,7 @@ cuplaMemcpy3DAsync(
         p->dstPtr.xsize
     );
 
-    const ::alpaka::vec::Vec<
+    const ::alpaka::Vec<
         cupla::AlpakaDim<3u>,
         cupla::MemSizeType
     > offsetSrc(
@@ -725,7 +725,7 @@ cuplaMemcpy3DAsync(
         p->srcPos.x
     );
 
-    const ::alpaka::vec::Vec<
+    const ::alpaka::Vec<
         cupla::AlpakaDim<3u>,
         cupla::MemSizeType
     > offsetDst(
@@ -734,7 +734,7 @@ cuplaMemcpy3DAsync(
         p->dstPos.x
     );
 
-    const ::alpaka::vec::Vec<
+    const ::alpaka::Vec<
         cupla::AlpakaDim<3u>,
         cupla::MemSizeType
     > dstPitch(
@@ -743,7 +743,7 @@ cuplaMemcpy3DAsync(
         p->dstPtr.pitch
     );
 
-    const ::alpaka::vec::Vec<
+    const ::alpaka::Vec<
         cupla::AlpakaDim<3u>,
         cupla::MemSizeType
     > srcPitch(
@@ -798,7 +798,7 @@ cuplaMemcpy3DAsync(
                 offsetDst
             );
 
-            ::alpaka::mem::view::copy(
+            ::alpaka::memcpy(
                 streamObject,
                 dView,
                 cupla::HostViewWrapper< 3u >(
@@ -841,7 +841,7 @@ cuplaMemcpy3DAsync(
                 offsetDst
             );
 
-            ::alpaka::mem::view::copy(
+            ::alpaka::memcpy(
                 streamObject,
                 hView,
                 cupla::DeviceViewWrapper< 3u >(
@@ -879,7 +879,7 @@ cuplaMemcpy3DAsync(
                 offsetDst
             );
 
-            ::alpaka::mem::view::copy(
+            ::alpaka::memcpy(
                 streamObject,
                 dView,
                 cupla::DeviceViewWrapper< 3u >(
@@ -928,7 +928,7 @@ cuplaMemcpy3DAsync(
                 extentDst - offsetDst,
                 offsetDst
             );
-            ::alpaka::mem::view::copy(
+            ::alpaka::memcpy(
                 hostStreamObject,
                 hView,
                 cupla::HostViewWrapper< 3u >(
@@ -961,7 +961,7 @@ cuplaMemcpy3D(
             cupla::AccStream
         >::get().stream( 0 )
     );
-    ::alpaka::wait::wait( streamObject );
+    ::alpaka::wait( streamObject );
 
     return cuplaSuccess;
 }
diff --git a/thirdParty/cupla/src/stream.cpp b/thirdParty/cupla/src/stream.cpp
index ba8f579d5a..e936286ceb 100644
--- a/thirdParty/cupla/src/stream.cpp
+++ b/thirdParty/cupla/src/stream.cpp
@@ -43,7 +43,7 @@ cuplaStreamCreate(
     >::get().create();
 
     return cuplaSuccess;
-};
+}
 
 CUPLA_HEADER_ONLY_FUNC_SPEC
 cuplaError_t
@@ -58,7 +58,7 @@ cuplaStreamDestroy( cuplaStream_t stream )
         return cuplaSuccess;
     else
         return cuplaErrorInitializationError;
-};
+}
 
 CUPLA_HEADER_ONLY_FUNC_SPEC
 cuplaError_t
@@ -70,7 +70,7 @@ cuplaStreamSynchronize(
         cupla::AccDev,
         cupla::AccStream
     >::get().stream( stream );
-    ::alpaka::wait::wait( streamObject );
+    ::alpaka::wait( streamObject );
     return cuplaSuccess;
 }
 
@@ -92,7 +92,7 @@ cuplaStreamWaitEvent(
         cupla::AccStream
     >::get().event( event );
 
-    ::alpaka::wait::wait(streamObject,eventObject);
+    ::alpaka::wait(streamObject,eventObject);
     return cuplaSuccess;
 }
 
@@ -105,10 +105,10 @@ cuplaStreamQuery( cuplaStream_t stream )
         cupla::AccStream
     >::get().stream( stream );
 
-    if( alpaka::queue::empty( streamObject ) )
+    if( alpaka::empty( streamObject ) )
         return cuplaSuccess;
     else
         return cuplaErrorNotReady;
-};
+}
 
 } //namespace CUPLA_ACCELERATOR_NAMESPACE
diff --git a/thirdParty/cupla/test/system/config/kernel.cpp b/thirdParty/cupla/test/system/config/kernel.cpp
index 31b1e3e971..2768e0aabc 100644
--- a/thirdParty/cupla/test/system/config/kernel.cpp
+++ b/thirdParty/cupla/test/system/config/kernel.cpp
@@ -35,6 +35,8 @@
 #   include <cupla/config/GpuHipRt.hpp>
 #endif
 
+#include "cuda_to_cupla.hpp"
+
 struct IncrementKernel
 {
     template<typename T_Acc>
diff --git a/thirdParty/cupla/test/system/config/main.cpp b/thirdParty/cupla/test/system/config/main.cpp
index fd93bba6d3..e7d0903941 100644
--- a/thirdParty/cupla/test/system/config/main.cpp
+++ b/thirdParty/cupla/test/system/config/main.cpp
@@ -35,6 +35,8 @@
 #   include <cupla/config/GpuHipRt.hpp>
 #endif
 
+#include "cuda_to_cupla.hpp"
+
 extern void callIncrementKernel(int* pr_d);
 
 int main()
diff --git a/thirdParty/cupla/test/system/config/test.sh b/thirdParty/cupla/test/system/config/test.sh
index e776479ee5..ec531df5b8 100755
--- a/thirdParty/cupla/test/system/config/test.sh
+++ b/thirdParty/cupla/test/system/config/test.sh
@@ -18,8 +18,8 @@ function compile {
     if [ $# -eq 4 ] ; then
         compiler_flags="$4"
     fi
-    echo "execute: "${compiler_name} ${test_code_dir}/main.cpp ${test_code_dir}/kernel.cpp  -I${test_code_dir}/../../../include -I${test_code_dir}/../../../alpaka/include -std=c++11 -DCUPLA_ACC_${acc_name} -DCUPLA_HEADER_ONLY -o ${acc_name} ${compiler_flags}
-    ret=$(${compiler_name} ${test_code_dir}/main.cpp ${test_code_dir}/kernel.cpp  -I${test_code_dir}/../../../include -I${test_code_dir}/../../../alpaka/include -std=c++11 -DCUPLA_ACC_${acc_name} -DCUPLA_HEADER_ONLY -o ${acc_name} ${compiler_flags} && \
+    echo "execute: "${compiler_name} ${test_code_dir}/main.cpp ${test_code_dir}/kernel.cpp  -I${test_code_dir}/../../../include -I${test_code_dir}/../../../alpaka/include -std=c++14 -DCUPLA_ACC_${acc_name} -DCUPLA_HEADER_ONLY -o ${acc_name} ${compiler_flags}
+    ret=$(${compiler_name} ${test_code_dir}/main.cpp ${test_code_dir}/kernel.cpp  -I${test_code_dir}/../../../include -I${test_code_dir}/../../../alpaka/include -std=c++14 -DCUPLA_ACC_${acc_name} -DCUPLA_HEADER_ONLY -o ${acc_name} ${compiler_flags} && \
         { echo 0; } || { echo 1; })
     if [ $ret -eq 0 ] && [ $execute_bin -eq 1 ] ; then
         ret=$(./$acc_name && { echo 0; } || { echo 1; })
@@ -44,8 +44,8 @@ compile 0 "$1" CpuThreads "-pthread -lpthread -DCUPLA_STREAM_ASYNC_ENABLE=1 $boo
 
 nvcc_found=$(which nvcc >/dev/null && { echo 0; } || { echo 1; })
 if [ $nvcc_found -eq 0 ] ; then
-    compile 0 "nvcc -x cu" GpuCudaRt "-DALPAKA_ACC_GPU_CUDA_ENABLE=ON -DALPAKA_ACC_GPU_CUDA_ONLY_MODE=ON $boost_include"
-    compile 0 "nvcc -x cu" GpuCudaRt "-DALPAKA_ACC_GPU_CUDA_ENABLE=ON -DALPAKA_ACC_GPU_CUDA_ONLY_MODE=ON -DCUPLA_STREAM_ASYNC_ENABLE=1 $boost_include"
+    compile 0 "nvcc -x cu" GpuCudaRt "-DALPAKA_ACC_GPU_CUDA_ENABLE=ON -DALPAKA_ACC_GPU_CUDA_ONLY_MODE=ON $boost_include --expt-relaxed-constexpr"
+    compile 0 "nvcc -x cu" GpuCudaRt "-DALPAKA_ACC_GPU_CUDA_ENABLE=ON -DALPAKA_ACC_GPU_CUDA_ONLY_MODE=ON -DCUPLA_STREAM_ASYNC_ENABLE=1 $boost_include --expt-relaxed-constexpr"
 else 
     echo "skip GpuCudaRt: nvcc not found" >&2
 fi
diff --git a/thirdParty/mallocMC/.clang-format b/thirdParty/mallocMC/.clang-format
new file mode 100644
index 0000000000..05f8601928
--- /dev/null
+++ b/thirdParty/mallocMC/.clang-format
@@ -0,0 +1,114 @@
+---
+# General options
+Language: Cpp
+Standard: c++17
+DisableFormat: false
+
+AccessModifierOffset: -4
+AlignAfterOpenBracket: AlwaysBreak
+AlignConsecutiveAssignments: false
+AlignConsecutiveBitFields: false
+AlignConsecutiveDeclarations: false
+AlignConsecutiveMacros: false
+AlignEscapedNewlines: Right
+AlignOperands: DontAlign
+AlignTrailingComments: false
+AllowAllArgumentsOnNextLine: false
+AllowAllConstructorInitializersOnNextLine: false
+AllowAllParametersOfDeclarationOnNextLine: false
+AllowShortBlocksOnASingleLine: Never
+AllowShortCaseLabelsOnASingleLine: false
+AllowShortEnumsOnASingleLine: false
+AllowShortFunctionsOnASingleLine: None
+AllowShortIfStatementsOnASingleLine: Never
+AllowShortLambdasOnASingleLine: All
+AllowShortLoopsOnASingleLine: false
+AlwaysBreakAfterReturnType: None
+AlwaysBreakBeforeMultilineStrings: false
+AlwaysBreakTemplateDeclarations: Yes
+BinPackArguments: false
+BinPackParameters: false
+BreakBeforeBinaryOperators: All
+BreakBeforeBraces: Allman
+BreakBeforeTernaryOperators: true
+BreakConstructorInitializers: BeforeComma
+BreakInheritanceList: BeforeComma
+BreakStringLiterals: true
+ColumnLimit: 119
+CommentPragmas:  '^ COMMENT pragma:'
+CompactNamespaces: false
+ConstructorInitializerAllOnOneLineOrOnePerLine: true
+ConstructorInitializerIndentWidth: 4
+ContinuationIndentWidth: 4
+Cpp11BracedListStyle: true
+DeriveLineEnding: true
+DerivePointerAlignment: false
+ExperimentalAutoDetectBinPacking: false
+FixNamespaceComments: true
+IncludeBlocks: Regroup
+IncludeIsMainRegex: '(Test)?$'
+IncludeIsMainSourceRegex: ''
+IndentCaseBlocks: true
+IndentCaseLabels: false
+IndentExternBlock: AfterExternBlock
+IndentGotoLabels: true
+IndentPPDirectives: AfterHash
+IndentWidth: 4
+IndentWrappedFunctionNames: false
+KeepEmptyLinesAtTheStartOfBlocks: false
+MacroBlockBegin: ''
+MacroBlockEnd:   ''
+MaxEmptyLinesToKeep: 2
+NamespaceIndentation: All
+PenaltyBreakAssignment: 2
+PenaltyBreakBeforeFirstCallParameter: 19
+PenaltyBreakComment: 300
+PenaltyBreakFirstLessLess: 120
+PenaltyBreakString: 1000
+PenaltyBreakTemplateDeclaration: 10
+PenaltyExcessCharacter: 1000000
+PenaltyReturnTypeOnItsOwnLine: 1000
+PointerAlignment: Left
+ReflowComments: true
+SortIncludes: true
+SortUsingDeclarations: true
+SpaceAfterCStyleCast: true
+SpaceAfterLogicalNot: false
+SpaceAfterTemplateKeyword: false
+SpaceBeforeAssignmentOperators: true
+SpaceBeforeCpp11BracedList: false
+SpaceBeforeCtorInitializerColon: true
+SpaceBeforeInheritanceColon: true
+SpaceBeforeParens: Never
+SpaceBeforeRangeBasedForLoopColon: true
+SpaceInEmptyBlock: false
+SpaceInEmptyParentheses: false
+SpacesBeforeTrailingComments: 1
+SpacesInAngles:  false
+SpacesInConditionalStatement: false
+SpacesInContainerLiterals: false
+SpacesInCStyleCastParentheses: false
+SpacesInParentheses: false
+SpacesInSquareBrackets: false
+SpaceBeforeSquareBrackets: false
+TabWidth: 4
+UseCRLF: false
+UseTab: Never
+
+# Project specific options -- uncomment and modify as needed
+#IncludeCategories:
+#  - Regex:           '^"(llvm|llvm-c|clang|clang-c)/'
+#    Priority:        2
+#    SortPriority:    0
+#  - Regex:           '^(<|"(gtest|gmock|isl|json)/)'
+#    Priority:        3
+#    SortPriority:    0
+#  - Regex:           '.*'
+#    Priority:        1
+#    SortPriority:    0
+
+# Future options - not supported in clang-format 11
+# BitFieldColonSpacing: Both
+# OperandAlignmentStyle: Align
+
+...
diff --git a/thirdParty/mallocMC/.clang-tidy b/thirdParty/mallocMC/.clang-tidy
new file mode 100644
index 0000000000..3292a65bda
--- /dev/null
+++ b/thirdParty/mallocMC/.clang-tidy
@@ -0,0 +1,3 @@
+---
+Checks:            '*,-llvm-header-guard,-fuchsia-default-arguments-declarations,-cppcoreguidelines-no-malloc,-cppcoreguidelines-owning-memory,-misc-non-private-member-variables-in-classes'
+HeaderFilterRegex: '.*'
diff --git a/thirdParty/mallocMC/.gitignore b/thirdParty/mallocMC/.gitignore
index b8056488e8..486ae784cc 100644
--- a/thirdParty/mallocMC/.gitignore
+++ b/thirdParty/mallocMC/.gitignore
@@ -1,7 +1,11 @@
+# tmp files
+*~
+
 # Compiled Object files
 *.slo
 *.lo
 *.o
+/build
 
 # Compiled Dynamic libraries
 *.so
@@ -12,5 +16,19 @@
 *.la
 *.a
 
-*~
-/nbproject
+# netbeans project files
+/nbproject/
+
+# Code::Blocks project files
+/*.cbp
+/*.layout
+
+# Visual Studio Code configuration files
+.vscode
+.vs
+
+# JetBrains project files
+.idea/
+
+# original backup files
+*.orig
diff --git a/thirdParty/mallocMC/.travis.yml b/thirdParty/mallocMC/.travis.yml
index 8bf68cedbc..9f988a7330 100644
--- a/thirdParty/mallocMC/.travis.yml
+++ b/thirdParty/mallocMC/.travis.yml
@@ -2,7 +2,12 @@ language: cpp
 
 sudo: required
 
-dist: trusty
+dist: bionic
+
+cache:
+  apt: true
+  directories:
+    - $HOME/.cache/cmake-3.15.0
 
 compiler:
   - gcc
@@ -11,13 +16,23 @@ env:
   global:
     - INSTALL_DIR=~/mylibs
     - CXXFLAGS="-Werror"
+    - PATH: $HOME/.cache/cmake-3.15.0/bin:$PATH
+    
+install:
+  if [ ! -f $HOME/.cache/cmake-3.15.0/bin/cmake ]; then
+    wget -O cmake.sh https://cmake.org/files/v3.15/cmake-3.15.0-Linux-x86_64.sh &&
+    sh cmake.sh --skip-license --exclude-subdir --prefix=$HOME/.cache/cmake-3.15.0 &&
+    rm cmake.sh;
+  fi;
 
 script:
   - mkdir build_tmp && cd build_tmp
-  - cmake -DCMAKE_INSTALL_PREFIX=$INSTALL_DIR $TRAVIS_BUILD_DIR
+  - CXX=g++-5 && CC=gcc-5
+  - cmake -DALPAKA_ACC_GPU_CUDA_ENABLE=ON -DALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLE=ON -DALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE=ON -DCMAKE_INSTALL_PREFIX=$INSTALL_DIR $TRAVIS_BUILD_DIR
   - make
   - make install
   - make examples
+  - make tests
 
 before_script:
   - cat /etc/apt/sources.list
@@ -27,14 +42,14 @@ before_script:
   - sudo dpkg --configure -a
   - sudo apt-get install -f -qq
   - sudo dpkg --get-selections | grep hold || { echo "All packages OK."; }
-  - sudo apt-get install -q -y cmake-data cmake
-  - sudo apt-get install -qq build-essential
-  - gcc --version && g++ --version # 4.8
+  - sudo apt-get install -qq build-essential g++-5
+  - gcc-5 --version && g++-5 --version # 5.5.0
   - apt-cache search nvidia-*
   - sudo apt-get install -qq nvidia-common
-  - sudo apt-get install -qq nvidia-cuda-dev nvidia-cuda-toolkit # 5.5
-  - sudo apt-get install -qq libboost-dev # 1.54.0
+  - sudo apt-get install -qq nvidia-cuda-dev nvidia-cuda-toolkit # 9.1.85
+  - sudo apt-get install -qq libboost-dev # 1.65.1
   - sudo find /usr/ -name libcuda*.so
+  - cmake --version
 
 after_script:
   - ls -halR $INSTALL_DIR
diff --git a/thirdParty/mallocMC/CHANGELOG.md b/thirdParty/mallocMC/CHANGELOG.md
index 928f30158f..3b3e50253a 100644
--- a/thirdParty/mallocMC/CHANGELOG.md
+++ b/thirdParty/mallocMC/CHANGELOG.md
@@ -1,6 +1,27 @@
 Change Log / Release Log for mallocMC
 ================================================================
 
+2.4.0crp
+--------
+**Date:** 2020-05-28
+
+This release removes the Boost dependency and switches to C++11.
+
+### Changes to mallocMC 2.3.1crp
+
+**Features**
+  - Cleaning, remove Boost dependency & C++11 Migration #169
+
+**Bug fixes**
+  - Choose the value for the -arch nvcc flag depending on CUDA version #164 #165
+ 
+**Misc:**
+  - Travis CI: GCC 5.5.0 + CUDA 9.1.85 #170
+  - Adding headers to projects and applied clang-tidy #171
+  - clang-format #172
+
+Thanks to Sergei Bastrakov, Bernhard Manfred Gruber and Axel Huebl for contributing to this release!
+
 2.3.1crp
 --------
 **Date:** 2019-02-14
diff --git a/thirdParty/mallocMC/CMakeLists.txt b/thirdParty/mallocMC/CMakeLists.txt
index 4376bd1321..a92927eecc 100644
--- a/thirdParty/mallocMC/CMakeLists.txt
+++ b/thirdParty/mallocMC/CMakeLists.txt
@@ -1,114 +1,87 @@
-project(mallocMC)
-cmake_minimum_required(VERSION 2.8.12.2)
+cmake_minimum_required(VERSION 3.8) # must come before project() so policies are set before languages are enabled
+project(mallocMC LANGUAGES CXX)
 
-# helper for libs and packages
-set(CMAKE_PREFIX_PATH "/usr/lib/x86_64-linux-gnu/"
-    "$ENV{CUDA_ROOT}" "$ENV{BOOST_ROOT}")
-
-
-################################################################################
-# CMake policies
-#
-# Search in <PackageName>_ROOT:
-#   https://cmake.org/cmake/help/v3.12/policy/CMP0074.html
-################################################################################
+set(CMAKE_CXX_STANDARD 14)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
 
 if(POLICY CMP0074)
-    cmake_policy(SET CMP0074 NEW)
+  cmake_policy(SET CMP0074 NEW)
 endif()
 
-
-###############################################################################
-# CUDA
-###############################################################################
-find_package(CUDA REQUIRED)
-set(CUDA_NVCC_FLAGS "-arch=sm_20;-use_fast_math;")
-set(CUDA_INCLUDE_DIRS ${CMAKE_CURRENT_SOURCE_DIR})
-include_directories(${CUDA_INCLUDE_DIRS})
-cuda_include_directories(${CUDA_INCLUDE_DIRS})
-
-OPTION(CUDA_OUTPUT_INTERMEDIATE_CODE "Output ptx code" OFF)
-if(CUDA_OUTPUT_INTERMEDIATE_CODE)
-set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS};-Xptxas;-v;--keep")
-endif(CUDA_OUTPUT_INTERMEDIATE_CODE)
-
-SET(CUDA_OPTIMIZATION_TYPE "unset" CACHE STRING "CUDA Optimization")
-set_property(CACHE CUDA_OPTIMIZATION_TYPE PROPERTY STRINGS "unset;-G0;-O0;-O1;-O2;-O3")
-if(NOT ${CUDA_OPTIMIZATION_TYPE} STREQUAL  "unset")
-  set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS};${CUDA_OPTIMIZATION_TYPE}")
+# find alpaka
+set(mallocMC_ALPAKA_PROVIDER "intern" CACHE STRING "Select which alpaka is used")
+set_property(CACHE mallocMC_ALPAKA_PROVIDER PROPERTY STRINGS "intern;extern")
+mark_as_advanced(mallocMC_ALPAKA_PROVIDER)
+if(${mallocMC_ALPAKA_PROVIDER} STREQUAL "intern")
+    set(alpaka_BUILD_EXAMPLES OFF)
+    set(BUILD_TESTING OFF)
+    add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/alpaka ${CMAKE_BINARY_DIR}/alpaka)
+else()
+    find_package(alpaka HINTS $ENV{ALPAKA_ROOT})
 endif()
 
+if(NOT TARGET alpaka::alpaka) # neither the bundled nor an external alpaka defined the target
+    message(FATAL_ERROR "Required mallocMC dependency alpaka could not be found!") # FATAL_ERROR stops configuration
+endif()
 
-###############################################################################
-# Boost
-###############################################################################
-find_package(Boost 1.48.0 REQUIRED)
-include_directories(SYSTEM ${Boost_INCLUDE_DIRS})
-set(LIBS ${LIBS} ${Boost_LIBRARIES})
-
-# nvcc + boost 1.55 work around
-if(Boost_VERSION EQUAL 105500)
-  set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} \"-DBOOST_NOINLINE=__attribute__((noinline))\" ")
-endif(Boost_VERSION EQUAL 105500)
+# Catch2
+set(mallocMC_CATCH2_PROVIDER "intern" CACHE STRING "Select which Catch2 is used")
+set_property(CACHE mallocMC_CATCH2_PROVIDER PROPERTY STRINGS "intern;extern")
+mark_as_advanced(mallocMC_CATCH2_PROVIDER)
+if(${mallocMC_CATCH2_PROVIDER} STREQUAL "intern")
+    add_library(Catch2::Catch2 INTERFACE IMPORTED)
+    target_include_directories(Catch2::Catch2 INTERFACE ${CMAKE_CURRENT_LIST_DIR}/thirdParty/catch2/include)
+else()
+    find_package(Catch2 CONFIG REQUIRED)
+endif()
 
+# for installation, just copy include folder to install folder
+install(
+    DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/src/include/."
+    DESTINATION include
+)
 
-################################################################################
-# Warnings
-################################################################################
-# GNU
+# warnings
+add_library(warnings INTERFACE)
 if(CMAKE_COMPILER_IS_GNUCXX)
-  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall")
-  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wshadow")
-  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unknown-pragmas")
-  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wextra")
-  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unused-parameter")
-  # new warning in gcc 4.8 (flag ignored in previous version)
-  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unused-local-typedefs")
-# ICC
+    target_compile_options(warnings INTERFACE -Wall -Wshadow -Wno-unknown-pragmas -Wextra -Wno-unused-parameter -Wno-unused-local-typedefs)
 elseif("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Intel")
-  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall")
-  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wshadow")
-  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DBOOST_NO_VARIADIC_TEMPLATES")
-  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DBOOST_NO_CXX11_VARIADIC_TEMPLATES")
-  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DBOOST_NO_FENV_H")
-# PGI
+    target_compile_options(warnings INTERFACE -Wall -Wshadow)
 elseif("${CMAKE_CXX_COMPILER_ID}" STREQUAL "PGI")
-  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Minform=inform")
+    target_compile_options(warnings INTERFACE -Minform=inform)
 endif()
 
-
-###############################################################################
-# Installation
-###############################################################################
-
-# copy include folder to install folder
-INSTALL(
-  DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/src/include/."
-  DESTINATION include
-  PATTERN ".git" EXCLUDE
-  PATTERN "mallocMC_config.hpp" EXCLUDE
-  )
-
-
-###############################################################################
 # Executables
-###############################################################################
-add_custom_target(examples DEPENDS mallocMC_Example01 mallocMC_Example02 mallocMC_Example03 VerifyHeap)
-
-cuda_add_executable(mallocMC_Example01
-                    EXCLUDE_FROM_ALL
-                    examples/mallocMC_example01.cu )
-cuda_add_executable(mallocMC_Example02
-                    EXCLUDE_FROM_ALL
-                    examples/mallocMC_example02.cu )
-cuda_add_executable(mallocMC_Example03
-                    EXCLUDE_FROM_ALL
-                    examples/mallocMC_example03.cu )
-cuda_add_executable(VerifyHeap
-                    EXCLUDE_FROM_ALL
-                    tests/verify_heap.cu )
+file(GLOB_RECURSE headers src/include/**)
+add_custom_target(mallocMCIde SOURCES ${headers}) # create a target with the header files for IDE projects
+source_group(TREE ${CMAKE_CURRENT_LIST_DIR}/src/include FILES ${headers})
+
+alpaka_add_executable(mallocMC_Example01 EXCLUDE_FROM_ALL examples/mallocMC_example01.cpp)
+target_include_directories(mallocMC_Example01 PUBLIC ${CMAKE_CURRENT_LIST_DIR}/src/include)
+target_link_libraries(mallocMC_Example01 PUBLIC alpaka::alpaka warnings)
+
+alpaka_add_executable(mallocMC_Example03 EXCLUDE_FROM_ALL examples/mallocMC_example03.cpp)
+target_include_directories(mallocMC_Example03 PUBLIC ${CMAKE_CURRENT_LIST_DIR}/src/include)
+target_link_libraries(mallocMC_Example03 PUBLIC alpaka::alpaka warnings)
+
+alpaka_add_executable(VerifyHeap EXCLUDE_FROM_ALL tests/verify_heap.cpp tests/verify_heap_config.hpp)
+target_include_directories(VerifyHeap PUBLIC ${CMAKE_CURRENT_LIST_DIR}/src/include)
+target_link_libraries(VerifyHeap PUBLIC alpaka::alpaka warnings)
+
+if (CUDA_VERSION VERSION_LESS 10.2) # TODO(bgruber): I do not know exactly where it breaks. 9.1 does not work, 10.2 works
+    # the catch2 main needs to be in a non-CUDA file before CUDA 10.2, because nvcc fails to compile the catch2 header
+    # TODO: merge the test_main back into the tests exe, once CUDA 10.2 is the minimum version
+    add_library(tests_main EXCLUDE_FROM_ALL tests/main.cpp)
+    target_include_directories(tests_main PUBLIC ${CMAKE_CURRENT_LIST_DIR}/src/include)
+    target_link_libraries(tests_main PUBLIC Catch2::Catch2 warnings)
+
+    alpaka_add_executable(tests EXCLUDE_FROM_ALL tests/dimensions.cpp tests/policies.cpp)
+    target_include_directories(tests PUBLIC ${CMAKE_CURRENT_LIST_DIR}/src/include)
+    target_link_libraries(tests PUBLIC tests_main alpaka::alpaka Catch2::Catch2 warnings)
+else()
+    alpaka_add_executable(tests EXCLUDE_FROM_ALL tests/main.cpp tests/dimensions.cpp tests/policies.cpp)
+    target_include_directories(tests PUBLIC ${CMAKE_CURRENT_LIST_DIR}/src/include)
+    target_link_libraries(tests PUBLIC alpaka::alpaka Catch2::Catch2 warnings)
+endif()
 
-target_link_libraries(mallocMC_Example01 ${LIBS})
-target_link_libraries(mallocMC_Example02 ${LIBS})
-target_link_libraries(mallocMC_Example03 ${LIBS})
-target_link_libraries(VerifyHeap ${LIBS})
+add_custom_target(examples DEPENDS mallocMC_Example01 mallocMC_Example03 VerifyHeap)
diff --git a/thirdParty/mallocMC/CONTRIBUTING.md b/thirdParty/mallocMC/CONTRIBUTING.md
new file mode 100644
index 0000000000..64e12b31af
--- /dev/null
+++ b/thirdParty/mallocMC/CONTRIBUTING.md
@@ -0,0 +1,20 @@
+# Contributing
+
+## Formatting
+
+Please format your code before opening pull requests using clang-format and the .clang-format file placed in the repository root.
+
+### Visual Studio and CLion
+Support for clang-format is built-in since Visual Studio 2017 15.7 and CLion 2019.1.
+The .clang-format file in the repository will be automatically detected and formatting is done as you type, or triggered when pressing the format hotkey.
+
+### Bash
+First install clang-format. Installation instructions can be found on the web.
+To format your changes since branching off `dev`, you can run this command in bash:
+```
+git clang-format dev
+```
+To format all code in your working copy, you can run this command in bash:
+```
+find . -iname '*.cpp' -o -iname '*.hpp' | xargs clang-format -i
+```
diff --git a/thirdParty/mallocMC/INSTALL.md b/thirdParty/mallocMC/INSTALL.md
index 3aabdc83fb..ccab5ce955 100644
--- a/thirdParty/mallocMC/INSTALL.md
+++ b/thirdParty/mallocMC/INSTALL.md
@@ -4,18 +4,14 @@ Install
  - `gcc` 4.4 - 4.8 (depends on current CUDA version)
   - *Debian/Ubuntu:* `sudo apt-get install gcc-4.4 build-essential`
   - *Arch Linux:* `sudo pacman -S base-devel`
- - [CUDA 5.0](https://developer.nvidia.com/cuda-downloads) or higher
-  - *Debian/Ubuntu:* `sudo apt-get install nvidia-common nvidia-current nvidia-cuda-toolkit nvidia-cuda-dev`
-  - *Arch Linux:* `sudo pacman -S cuda`
- - one Nvidia **CUDA** compatible **GPU** with compute capability >= 2.0
-  - [full list](https://developer.nvidia.com/cuda-gpus) of CUDA GPUs and their *compute capability*
- - `boost` >= 1.48
-   - compile time headers
-   - `boost::program_options`
+ - `alpaka`
+  - included as git submodule
+ - `boost` >= 1.65.1
+   - dependency of alpaka
    - *Debian/Ubuntu:* `sudo apt-get install libboost-dev libboost-program-options-dev`
    - *Arch Linux:* `sudo pacman -S boost`
    - or download from [http://www.boost.org/](http://sourceforge.net/projects/boost/files/boost/1.55.0/boost_1_55_0.tar.gz/download)
- - `CMake` >= 2.8.12.2
+ - `CMake` >= 3.15
   - *Debian/Ubuntu:* `sudo apt-get install cmake file cmake-curses-gui`
   - *Arch Linux:* `sudo pacman -S cmake`
  - `git` >= 1.7.9.5
@@ -23,11 +19,6 @@ Install
   - *Arch Linux:* `sudo pacman -S git`
 
 
-### Mandatory environment variables
- - `CUDA_ROOT`: CUDA installation directory, e.g. `export CUDA_ROOT=<CUDA_INSTALL>`
-  - this might be already set through your CUDA toolkit
- - `BOOST_ROOT`: Boost installation directory, e.g. `export BOOST_ROOT=<BOOST_INSTALL>`
-
 ### Examples
 This is an example how to compile `mallocMC` and test the example code snippets
 
@@ -54,7 +45,7 @@ To use mallocMC in your project, you must include the header `mallocMC/mallocMC.
 add the correct include path.
 
 Because we are linking to Boost and CUDA, the following **external dependencies** must be linked:
-- `-lboost`, `-lcudart`
+- `-lboost`
 
 If you are using CMake you can download our `FindmallocMC.cmake` module with
 ```bash
@@ -71,21 +62,10 @@ cmake_minimum_required(VERSION 2.8.12.2)
 # add path to FindmallocMC.cmake, e.g., in the directory in cmake/
 set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${CMAKE_CURRENT_SOURCE_DIR}/cmake/)
 
-# find the packages that are required by mallocMC. This has to be done BEFORE
-# loading mallocMC
-find_package(Boost REQUIRED)
-set(LIBS ${LIBS} ${Boost_LIBRARIES})
-
-find_package(CUDA REQUIRED)
-cuda_include_directories(${CUDA_INCLUDE_DIRS})
-
 # find mallocMC installation
 find_package(mallocMC 2.0.1 REQUIRED)
 
-# where to find headers (-I includes for compiler)
-include_directories(SYSTEM ${mallocMC_INCLUDE_DIRS} ${CUDA_INCLUDE_DIRS} ${Boost_INCLUDE_DIRS})
-
-add_executable(yourBinary ${SOURCES})
-
-target_link_libraries(yourBinary ${LIBS})
+alpaka_add_executable(yourBinary ${SOURCES})
+target_include_directories(yourBinary PUBLIC ${mallocMC_INCLUDE_DIRS})
+target_link_libraries(yourBinary PUBLIC alpaka::alpaka)
 ```
diff --git a/thirdParty/mallocMC/README.md b/thirdParty/mallocMC/README.md
index d1c3afacfa..10f13c3953 100644
--- a/thirdParty/mallocMC/README.md
+++ b/thirdParty/mallocMC/README.md
@@ -4,8 +4,8 @@ mallocMC
 mallocMC: *Memory Allocator for Many Core Architectures*
 
 This project provides a framework for **fast memory managers** on **many core
-accelerators**. Currently, it supports **NVIDIA GPUs** of compute capability
-`sm_20` or higher through the *ScatterAlloc* algorithm.
+accelerators**. It is based on [alpaka](https://github.com/alpaka-group/alpaka)
+to run on many different accelerators and implements the *ScatterAlloc* algorithm.
 
 
 Usage
@@ -22,6 +22,11 @@ mallocMC is header-only, but requires a few other C++ libraries to be
 available. Our installation notes can be found in [INSTALL.md](INSTALL.md).
 
 
+Contributing
+------------
+
+Rules for contributions are found in [CONTRIBUTING.md](CONTRIBUTING.md).
+
 On the ScatterAlloc Algorithm
 -----------------------------
 
diff --git a/thirdParty/mallocMC/Usage.md b/thirdParty/mallocMC/Usage.md
index 8d25057aeb..3f8049eabe 100644
--- a/thirdParty/mallocMC/Usage.md
+++ b/thirdParty/mallocMC/Usage.md
@@ -19,15 +19,15 @@ Currently, there are the following policy classes available:
 
 |Policy                 | Policy Classes (implementations) | description |
 |-------                |----------------------------------| ----------- |
-|**CreationPolicy**     | Scatter`<conf1,conf2>`           | A scattered allocation to tradeoff fragmentation for allocation time, as proposed in [ScatterAlloc](http://ieeexplore.ieee.org/xpl/articleDetails.jsp?arnumber=6339604). `conf1` configures the heap layout, `conf2` determines the hashing parameters|
+|**CreationPolicy**     | Scatter`<conf1,conf2>`         | A scattered allocation to tradeoff fragmentation for allocation time, as proposed in [ScatterAlloc](http://ieeexplore.ieee.org/xpl/articleDetails.jsp?arnumber=6339604). `conf1` configures the heap layout, `conf2` determines the hashing parameters|
 |                       | OldMalloc                        | device-side malloc/new and free/delete syscalls as implemented on NVidia CUDA graphics cards with compute capability sm_20 and higher |
-|**DistributionPolicy** | XMallocSIMD`<conf>`              | SIMD optimization for warp-wide allocation on NVIDIA CUDA accelerators, as proposed by [XMalloc](http://ieeexplore.ieee.org/xpl/articleDetails.jsp?arnumber=5577907). `conf` is used to determine the pagesize. If used in combination with *Scatter*, the pagesizes must match |
+|**DistributionPolicy** | XMallocSIMD`<conf>`             | SIMD optimization for warp-wide allocation on NVIDIA CUDA accelerators, as proposed by [XMalloc](http://ieeexplore.ieee.org/xpl/articleDetails.jsp?arnumber=5577907). `conf` is used to determine the pagesize. If used in combination with *Scatter*, the pagesizes must match |
 |                       | Noop                             | no workload distribution at all |
-|**OOMPolicy**          | ReturnNull                       | pointers will be *NULL*, if the request could not be fulfilled |
+|**OOMPolicy**          | ReturnNull                       | pointers will be *nullptr*, if the request could not be fulfilled |
 |                       | ~~BadAllocException~~            | will throw a `std::bad_alloc` exception. The accelerator has to support exceptions |
 |**ReservePoolPolicy**  | SimpleCudaMalloc                 | allocate a fixed heap with `CudaMalloc` |
 |                       | CudaSetLimits                    | call to `CudaSetLimits` to increase the available Heap (e.g. when using *OldMalloc*) |
-|**AlignmentPolicy**    | Shrink`<conf>`                   | shrinks the pool so that the starting pointer is well aligned, applies padding to requested memory chunks. `conf` is used to determine the alignment|
+|**AlignmentPolicy**    | Shrink`<conf>`                  | shrinks the pool so that the starting pointer is well aligned, applies padding to requested memory chunks. `conf` is used to determine the alignment|
 |                       | Noop                             | no alignment at all |
 
 The user has to choose one of each policy that will form a useful allocator
@@ -45,7 +45,7 @@ to the policy class:
 ```c++
 // configure the AlignmentPolicy "Shrink"
 struct ShrinkConfig : mallocMC::AlignmentPolicies::Shrink<>::Properties {
-  typedef boost::mpl::int_<16> dataAlignment;
+  static constexpr auto dataAlignment = 16;
 };
 ```
 
@@ -57,29 +57,29 @@ parameters to create the desired allocator type:
 ```c++
 using namespace mallocMC;
 
-typedef mallocMC::Allocator<
+using Allocator1 = mallocMC::Allocator<
   CreationPolicy::OldMalloc,
   DistributionPolicy::Noop,
   OOMPolicy::ReturnNull,
   ReservePoolPolicy::CudaSetLimits,
   AlignmentPolicy::Noop
-> Allocator1;
+>;
 ```
 
 `Allocator1` will resemble the behaviour of classical device-side allocation known
 from NVIDIA CUDA since compute capability sm_20. To get a more novel allocator, one
-could create the following typedef instead:
+could create the following alias instead:
 
 ```c++
 using namespace mallocMC;
 
-typedef mallocMC::Allocator<
+using ScatterAllocator = mallocMC::Allocator<
   CreationPolicies::Scatter<>,
   DistributionPolicies::XMallocSIMD<>,
   OOMPolicies::ReturnNull,
   ReservePoolPolicies::SimpleCudaMalloc,
   AlignmentPolicies::Shrink<ShrinkConfig>
-> ScatterAllocator;
+>;
 ```
 
 Notice, how the policy classes `Scatter` and `XMallocSIMD` are instantiated without
@@ -122,13 +122,13 @@ A simplistic example would look like this:
 
 namespace mallocMC = MC;
 
-typedef MC::Allocator<
+using ScatterAllocator = MC::Allocator<
   MC::CreationPolicies::Scatter<>,
   MC::DistributionPolicies::XMallocSIMD<>,
   MC::OOMPolicies::ReturnNull,
   MC::ReservePoolPolicies::SimpleCudaMalloc,
   MC::AlignmentPolicies::Shrink<ShrinkConfig>
-  > ScatterAllocator;
+>;
 
 __global__ exampleKernel(ScatterAllocator::AllocatorHandle sah)
 {
diff --git a/thirdParty/mallocMC/examples/mallocMC_example01.cpp b/thirdParty/mallocMC/examples/mallocMC_example01.cpp
new file mode 100644
index 0000000000..dac743268a
--- /dev/null
+++ b/thirdParty/mallocMC/examples/mallocMC_example01.cpp
@@ -0,0 +1,206 @@
+/*
+  mallocMC: Memory Allocator for Many Core Architectures.
+  https://www.hzdr.de/crp
+
+  Copyright 2014 Institute of Radiation Physics,
+                 Helmholtz-Zentrum Dresden - Rossendorf
+
+  Author(s):  Carlchristian Eckert - c.eckert ( at ) hzdr.de
+
+  Permission is hereby granted, free of charge, to any person obtaining a copy
+  of this software and associated documentation files (the "Software"), to deal
+  in the Software without restriction, including without limitation the rights
+  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+  copies of the Software, and to permit persons to whom the Software is
+  furnished to do so, subject to the following conditions:
+
+  The above copyright notice and this permission notice shall be included in
+  all copies or substantial portions of the Software.
+
+  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+  THE SOFTWARE.
+*/
+
+#include <alpaka/alpaka.hpp>
+#include <iostream>
+#include <mallocMC/mallocMC.hpp>
+#include <numeric>
+
+using Dim = alpaka::DimInt<1>;
+using Idx = std::size_t;
+// using Acc = alpaka::AccCpuThreads<Dim, Idx>;
+// using Acc = alpaka::AccCpuOmp2Threads<Dim, Idx>;
+using Acc = alpaka::AccGpuCudaRt<Dim, Idx>;
+
+struct ScatterHeapConfig
+{
+    static constexpr auto pagesize = 4096;
+    static constexpr auto accessblocks = 8;
+    static constexpr auto regionsize = 16;
+    static constexpr auto wastefactor = 2;
+    static constexpr auto resetfreedpages = false;
+};
+
+struct ScatterHashConfig
+{
+    static constexpr auto hashingK = 38183;
+    static constexpr auto hashingDistMP = 17497;
+    static constexpr auto hashingDistWP = 1;
+    static constexpr auto hashingDistWPRel = 1;
+};
+
+struct XMallocConfig
+{
+    static constexpr auto pagesize = ScatterHeapConfig::pagesize;
+};
+
+struct ShrinkConfig
+{
+    static constexpr auto dataAlignment = 16;
+};
+
+using ScatterAllocator = mallocMC::Allocator<
+    Acc,
+    mallocMC::CreationPolicies::Scatter<ScatterHeapConfig, ScatterHashConfig>,
+    mallocMC::DistributionPolicies::XMallocSIMD<XMallocConfig>,
+    mallocMC::OOMPolicies::ReturnNull,
+    mallocMC::ReservePoolPolicies::AlpakaBuf<Acc>,
+    mallocMC::AlignmentPolicies::Shrink<ShrinkConfig>>;
+
+ALPAKA_STATIC_ACC_MEM_GLOBAL int** arA;
+ALPAKA_STATIC_ACC_MEM_GLOBAL int** arB;
+ALPAKA_STATIC_ACC_MEM_GLOBAL int** arC;
+
+auto main() -> int
+{
+    constexpr auto block = 32;
+    constexpr auto grid = 32;
+    constexpr auto length = 100;
+    static_assert(length <= block * grid, ""); // necessary for used algorithm
+
+    const auto dev = alpaka::getDevByIdx<Acc>(0);
+    auto queue = alpaka::Queue<Acc, alpaka::Blocking>{dev};
+
+    // init the heap
+    std::cerr << "initHeap...";
+    ScatterAllocator scatterAlloc(dev, queue, 1U * 1024U * 1024U * 1024U); // 1GB for device-side malloc
+    std::cerr << "done\n";
+    std::cout << ScatterAllocator::info("\n") << '\n';
+
+    // create arrays of arrays on the device
+    {
+        auto createArrayPointers
+            = [] ALPAKA_FN_ACC(const Acc& acc, int x, int y, ScatterAllocator::AllocatorHandle allocHandle) {
+                  arA = (int**) allocHandle.malloc(acc, sizeof(int*) * x * y);
+                  arB = (int**) allocHandle.malloc(acc, sizeof(int*) * x * y);
+                  arC = (int**) allocHandle.malloc(acc, sizeof(int*) * x * y);
+              };
+        const auto workDiv = alpaka::WorkDivMembers<Dim, Idx>{Idx{1}, Idx{1}, Idx{1}};
+        alpaka::enqueue(
+            queue,
+            alpaka::createTaskKernel<Acc>(
+                workDiv,
+                createArrayPointers,
+                grid,
+                block,
+                scatterAlloc.getAllocatorHandle()));
+    }
+
+    // fill two of them (arA and arB) with ascending values
+    {
+        auto fillArrays = [] ALPAKA_FN_ACC(const Acc& acc, int length, ScatterAllocator::AllocatorHandle allocHandle) {
+            const auto id = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0];
+
+            arA[id] = (int*) allocHandle.malloc(acc, length * sizeof(int));
+            arB[id] = (int*) allocHandle.malloc(acc, length * sizeof(int));
+            arC[id] = (int*) allocHandle.malloc(acc, length * sizeof(int));
+
+            for(int i = 0; i < length; ++i)
+            {
+                arA[id][i] = static_cast<int>(id * length + i);
+                arB[id][i] = static_cast<int>(id * length + i);
+            }
+        };
+        const auto workDiv = alpaka::WorkDivMembers<Dim, Idx>{Idx{grid}, Idx{block}, Idx{1}};
+        alpaka::enqueue(
+            queue,
+            alpaka::createTaskKernel<Acc>(workDiv, fillArrays, length, scatterAlloc.getAllocatorHandle()));
+    }
+
+    // add the 2 arrays (vector addition within each thread)
+    // and do a thread-wise reduce to sums
+    {
+        auto sumsBufferAcc = alpaka::allocBuf<int, Idx>(dev, Idx{block * grid});
+
+        auto addArrays = [] ALPAKA_FN_ACC(const Acc& acc, int length, int* sums) {
+            const auto id = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0];
+
+            sums[id] = 0;
+            for(int i = 0; i < length; ++i)
+            {
+                arC[id][i] = arA[id][i] + arB[id][i];
+                sums[id] += arC[id][i];
+            }
+        };
+        const auto workDiv = alpaka::WorkDivMembers<Dim, Idx>{Idx{grid}, Idx{block}, Idx{1}};
+        alpaka::enqueue(
+            queue,
+            alpaka::createTaskKernel<Acc>(
+                workDiv,
+                addArrays,
+                length,
+                alpaka::getPtrNative(sumsBufferAcc)));
+
+        const auto hostDev = alpaka::getDevByIdx<alpaka::DevCpu>(0);
+        auto sumsBufferHost = alpaka::allocBuf<int, Idx>(hostDev, Idx{block * grid});
+        alpaka::memcpy(queue, sumsBufferHost, sumsBufferAcc, Idx{block * grid});
+        alpaka::wait(queue);
+
+        const auto* sumsPtr = alpaka::getPtrNative(sumsBufferHost);
+        const auto sum = std::accumulate(sumsPtr, sumsPtr + block * grid, size_t{0});
+        std::cout << "The sum of the arrays on GPU is " << sum << '\n';
+    }
+
+    const auto n = static_cast<size_t>(block * grid * length);
+    const auto gaussian = n * (n - 1);
+    std::cout << "The gaussian sum as comparison: " << gaussian << '\n';
+
+    /*constexpr*/ if(mallocMC::Traits<ScatterAllocator>::providesAvailableSlots)
+    {
+        std::cout << "there are ";
+        std::cout << scatterAlloc.getAvailableSlots(dev, queue, 1024U * 1024U);
+        std::cout << " Slots of size 1MB available\n";
+    }
+
+    {
+        auto freeArrays = [] ALPAKA_FN_ACC(const Acc& acc, ScatterAllocator::AllocatorHandle allocHandle) {
+            const auto id = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0];
+            allocHandle.free(acc, arA[id]);
+            allocHandle.free(acc, arB[id]);
+            allocHandle.free(acc, arC[id]);
+        };
+        const auto workDiv = alpaka::WorkDivMembers<Dim, Idx>{Idx{grid}, Idx{block}, Idx{1}};
+        alpaka::enqueue(
+            queue,
+            alpaka::createTaskKernel<Acc>(workDiv, freeArrays, scatterAlloc.getAllocatorHandle()));
+    }
+
+    {
+        auto freeArrayPointers = [] ALPAKA_FN_ACC(const Acc& acc, ScatterAllocator::AllocatorHandle allocHandle) {
+            allocHandle.free(acc, arA);
+            allocHandle.free(acc, arB);
+            allocHandle.free(acc, arC);
+        };
+        const auto workDiv = alpaka::WorkDivMembers<Dim, Idx>{Idx{1}, Idx{1}, Idx{1}};
+        alpaka::enqueue(
+            queue,
+            alpaka::createTaskKernel<Acc>(workDiv, freeArrayPointers, scatterAlloc.getAllocatorHandle()));
+    }
+
+    return 0;
+}
diff --git a/thirdParty/mallocMC/examples/mallocMC_example01.cu b/thirdParty/mallocMC/examples/mallocMC_example01.cu
deleted file mode 100644
index 7ac9c4ef08..0000000000
--- a/thirdParty/mallocMC/examples/mallocMC_example01.cu
+++ /dev/null
@@ -1,156 +0,0 @@
-/*
-  mallocMC: Memory Allocator for Many Core Architectures.
-  https://www.hzdr.de/crp
-
-  Copyright 2014 Institute of Radiation Physics,
-                 Helmholtz-Zentrum Dresden - Rossendorf
-
-  Author(s):  Carlchristian Eckert - c.eckert ( at ) hzdr.de
-
-  Permission is hereby granted, free of charge, to any person obtaining a copy
-  of this software and associated documentation files (the "Software"), to deal
-  in the Software without restriction, including without limitation the rights
-  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-  copies of the Software, and to permit persons to whom the Software is
-  furnished to do so, subject to the following conditions:
-
-  The above copyright notice and this permission notice shall be included in
-  all copies or substantial portions of the Software.
-
-  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-  THE SOFTWARE.
-*/
-
-#include <iostream>
-#include <cassert>
-#include <vector>
-#include <numeric>
-
-#include <cuda.h>
-#include "mallocMC_example01_config.cu"
-
-void run();
-
-int main()
-{
-  cudaDeviceProp deviceProp;
-  cudaGetDeviceProperties(&deviceProp, 0);
-
-  if( deviceProp.major < 2 ) {
-    std::cerr << "Error: Compute Capability >= 2.0 required. (is ";
-    std::cerr << deviceProp.major << "."<< deviceProp.minor << ")" << std::endl;
-    return 1;
-  }
-
-  cudaSetDevice(0);
-  run();
-  cudaDeviceReset();
-
-  return 0;
-}
-
-
-__device__ int** arA;
-__device__ int** arB;
-__device__ int** arC;
-
-
-__global__ void createArrayPointers(int x, int y, ScatterAllocator::AllocatorHandle mMC){
-  arA = (int**) mMC.malloc(sizeof(int*) * x*y);
-  arB = (int**) mMC.malloc(sizeof(int*) * x*y);
-  arC = (int**) mMC.malloc(sizeof(int*) * x*y);
-}
-
-
-__global__ void fillArrays(int length, int* d, ScatterAllocator::AllocatorHandle mMC){
-  int id = threadIdx.x + blockIdx.x*blockDim.x;
-
-  arA[id] = (int*) mMC.malloc(length*sizeof(int));
-  arB[id] = (int*) mMC.malloc(length*sizeof(int));
-  arC[id] = (int*) mMC.malloc(sizeof(int)*length);
-
-  for(int i=0 ; i<length; ++i){
-    arA[id][i] = id*length+i;
-    arB[id][i] = id*length+i;
-  }
-}
-
-
-__global__ void addArrays(int length, int* d){
-  int id = threadIdx.x + blockIdx.x*blockDim.x;
-
-  d[id] = 0;
-  for(int i=0 ; i<length; ++i){
-    arC[id][i] = arA[id][i] + arB[id][i];
-    d[id] += arC[id][i];
-  }
-}
-
-
-__global__ void freeArrays(ScatterAllocator::AllocatorHandle mMC){
-  int id = threadIdx.x + blockIdx.x*blockDim.x;
-  mMC.free(arA[id]);
-  mMC.free(arB[id]);
-  mMC.free(arC[id]);
-}
-
-
-__global__ void freeArrayPointers(ScatterAllocator::AllocatorHandle mMC){
-  mMC.free(arA);
-  mMC.free(arB);
-  mMC.free(arC);
-}
-
-
-void run()
-{
-  size_t block = 32;
-  size_t grid = 32;
-  int length = 100;
-  assert((unsigned)length<= block*grid); //necessary for used algorithm
-
-  //init the heap
-  std::cerr << "initHeap...";
-  ScatterAllocator mMC(1U*1024U*1024U*1024U); //1GB for device-side malloc
-  std::cerr << "done" << std::endl;
-
-  std::cout << ScatterAllocator::info("\n") << std::endl;
-
-  // device-side pointers
-  int*  d;
-  cudaMalloc((void**) &d, sizeof(int)*block*grid);
-
-  // host-side pointers
-  std::vector<int> array_sums(block*grid,0);
-
-  // create arrays of arrays on the device
-  createArrayPointers<<<1,1>>>(grid,block, mMC );
-
-  // fill 2 of them all with ascending values
-  fillArrays<<<grid,block>>>(length, d, mMC );
-
-  // add the 2 arrays (vector addition within each thread)
-  // and do a thread-wise reduce to d
-  addArrays<<<grid,block>>>(length, d);
-
-  cudaMemcpy(&array_sums[0],d,sizeof(int)*block*grid,cudaMemcpyDeviceToHost);
-
-  mMC.getAvailableSlots(1024U*1024U); //get available megabyte-sized slots
-
-  int sum = std::accumulate(array_sums.begin(),array_sums.end(),0);
-  std::cout << "The sum of the arrays on GPU is " << sum << std::endl;
-
-  int n = block*grid*length;
-  int gaussian = n*(n-1);
-  std::cout << "The gaussian sum as comparison: " << gaussian << std::endl;
-
-  freeArrays<<<grid,block>>>( mMC );
-  freeArrayPointers<<<1,1>>>( mMC );
-  cudaFree(d);
-
-}
diff --git a/thirdParty/mallocMC/examples/mallocMC_example01_config.cu b/thirdParty/mallocMC/examples/mallocMC_example01_config.cu
deleted file mode 100644
index 14ebf74318..0000000000
--- a/thirdParty/mallocMC/examples/mallocMC_example01_config.cu
+++ /dev/null
@@ -1,80 +0,0 @@
-/*
-  mallocMC: Memory Allocator for Many Core Architectures.
-  https://www.hzdr.de/crp
-
-  Copyright 2014 Institute of Radiation Physics,
-                 Helmholtz-Zentrum Dresden - Rossendorf
-
-  Author(s):  Carlchristian Eckert - c.eckert ( at ) hzdr.de
-
-  Permission is hereby granted, free of charge, to any person obtaining a copy
-  of this software and associated documentation files (the "Software"), to deal
-  in the Software without restriction, including without limitation the rights
-  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-  copies of the Software, and to permit persons to whom the Software is
-  furnished to do so, subject to the following conditions:
-
-  The above copyright notice and this permission notice shall be included in
-  all copies or substantial portions of the Software.
-
-  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-  THE SOFTWARE.
-*/
-
-#pragma once
-
-#include <boost/mpl/int.hpp>
-#include <boost/mpl/bool.hpp>
-
-// basic files for mallocMC
-#include "src/include/mallocMC/mallocMC_hostclass.hpp"
-
-// Load all available policies for mallocMC
-#include "src/include/mallocMC/CreationPolicies.hpp"
-#include "src/include/mallocMC/DistributionPolicies.hpp"
-#include "src/include/mallocMC/OOMPolicies.hpp"
-#include "src/include/mallocMC/ReservePoolPolicies.hpp"
-#include "src/include/mallocMC/AlignmentPolicies.hpp"
-    
-
-
-// configurate the CreationPolicy "Scatter" to modify the default behaviour
-struct ScatterHeapConfig : mallocMC::CreationPolicies::Scatter<>::HeapProperties{
-    typedef boost::mpl::int_<4096>  pagesize;
-    typedef boost::mpl::int_<8>     accessblocks;
-    typedef boost::mpl::int_<16>    regionsize;
-    typedef boost::mpl::int_<2>     wastefactor;
-    typedef boost::mpl::bool_<false> resetfreedpages;
-};
-
-struct ScatterHashConfig : mallocMC::CreationPolicies::Scatter<>::HashingProperties{
-    typedef boost::mpl::int_<38183> hashingK;
-    typedef boost::mpl::int_<17497> hashingDistMP;
-    typedef boost::mpl::int_<1>     hashingDistWP;
-    typedef boost::mpl::int_<1>     hashingDistWPRel;
-};
-
-// configure the DistributionPolicy "XMallocSIMD"
-struct XMallocConfig : mallocMC::DistributionPolicies::XMallocSIMD<>::Properties {
-  typedef ScatterHeapConfig::pagesize pagesize;
-};
-
-// configure the AlignmentPolicy "Shrink"
-struct ShrinkConfig : mallocMC::AlignmentPolicies::Shrink<>::Properties {
-  typedef boost::mpl::int_<16> dataAlignment;
-};
-
-// Define a new allocator and call it ScatterAllocator
-// which resembles the behaviour of ScatterAlloc
-typedef mallocMC::Allocator< 
-  mallocMC::CreationPolicies::Scatter<ScatterHeapConfig, ScatterHashConfig>,
-  mallocMC::DistributionPolicies::XMallocSIMD<XMallocConfig>,
-  mallocMC::OOMPolicies::ReturnNull,
-  mallocMC::ReservePoolPolicies::SimpleCudaMalloc,
-  mallocMC::AlignmentPolicies::Shrink<ShrinkConfig>
-  > ScatterAllocator;
diff --git a/thirdParty/mallocMC/examples/mallocMC_example02.cu b/thirdParty/mallocMC/examples/mallocMC_example02.cu
deleted file mode 100644
index d00d00e742..0000000000
--- a/thirdParty/mallocMC/examples/mallocMC_example02.cu
+++ /dev/null
@@ -1,219 +0,0 @@
-/*
-  mallocMC: Memory Allocator for Many Core Architectures.
-  https://www.hzdr.de/crp
-
-  Copyright 2014 Institute of Radiation Physics,
-                 Helmholtz-Zentrum Dresden - Rossendorf
-
-  Author(s):  Carlchristian Eckert - c.eckert ( at ) hzdr.de
-
-  Permission is hereby granted, free of charge, to any person obtaining a copy
-  of this software and associated documentation files (the "Software"), to deal
-  in the Software without restriction, including without limitation the rights
-  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-  copies of the Software, and to permit persons to whom the Software is
-  furnished to do so, subject to the following conditions:
-
-  The above copyright notice and this permission notice shall be included in
-  all copies or substantial portions of the Software.
-
-  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-  THE SOFTWARE.
-*/
-
-#include <iostream>
-#include <cassert>
-#include <vector>
-#include <numeric>
-
-#include <cuda.h>
-#include <boost/mpl/int.hpp>
-#include <boost/mpl/bool.hpp>
-
-///////////////////////////////////////////////////////////////////////////////
-// includes for mallocMC
-///////////////////////////////////////////////////////////////////////////////
-// basic files for mallocMC
-#include "src/include/mallocMC/mallocMC_hostclass.hpp"
-
-// Load all available policies for mallocMC
-#include "src/include/mallocMC/CreationPolicies.hpp"
-#include "src/include/mallocMC/DistributionPolicies.hpp"
-#include "src/include/mallocMC/OOMPolicies.hpp"
-#include "src/include/mallocMC/ReservePoolPolicies.hpp"
-#include "src/include/mallocMC/AlignmentPolicies.hpp"
-
-///////////////////////////////////////////////////////////////////////////////
-// Configuration for mallocMC
-///////////////////////////////////////////////////////////////////////////////
-
-// configurate the CreationPolicy "Scatter"
-struct ScatterConfig{
-    typedef boost::mpl::int_<4096>  pagesize;
-    typedef boost::mpl::int_<8>     accessblocks;
-    typedef boost::mpl::int_<16>    regionsize;
-    typedef boost::mpl::int_<2>     wastefactor;
-    typedef boost::mpl::bool_<false> resetfreedpages;
-};
-
-struct ScatterHashParams{
-    typedef boost::mpl::int_<38183> hashingK;
-    typedef boost::mpl::int_<17497> hashingDistMP;
-    typedef boost::mpl::int_<1>     hashingDistWP;
-    typedef boost::mpl::int_<1>     hashingDistWPRel;
-};
-
-// configure the DistributionPolicy "XMallocSIMD"
-struct DistributionConfig{
-  typedef ScatterConfig::pagesize pagesize;
-};
-
-// configure the AlignmentPolicy "Shrink"
-struct AlignmentConfig{
-  typedef boost::mpl::int_<16> dataAlignment;
-};
-
-// Define a new mMCator and call it ScatterAllocator
-// which resembles the behaviour of ScatterAlloc
-typedef mallocMC::Allocator<
-  mallocMC::CreationPolicies::Scatter<ScatterConfig,ScatterHashParams>,
-  mallocMC::DistributionPolicies::XMallocSIMD<DistributionConfig>,
-  mallocMC::OOMPolicies::ReturnNull,
-  mallocMC::ReservePoolPolicies::SimpleCudaMalloc,
-  mallocMC::AlignmentPolicies::Shrink<AlignmentConfig>
-  > ScatterAllocator;
-
-
-///////////////////////////////////////////////////////////////////////////////
-// End of mallocMC configuration
-///////////////////////////////////////////////////////////////////////////////
-
-
-void run();
-
-int main()
-{
-    cudaDeviceProp deviceProp;
-    cudaGetDeviceProperties(&deviceProp, 0);
-
-    if( deviceProp.major < int(2) ) {
-        std::cerr << "Error: Compute Capability >= 2.0 required. (is ";
-        std::cerr << deviceProp.major << "."<< deviceProp.minor << ")" << std::endl;
-        return 1;
-    }
-
-    cudaSetDevice(0);
-    run();
-    cudaDeviceReset();
-
-    return 0;
-}
-
-
-__device__ int** arA;
-__device__ int** arB;
-__device__ int** arC;
-
-
-__global__ void createArrayPointers(int x, int y, ScatterAllocator::AllocatorHandle  mMC){
-    arA = (int**) mMC.malloc(sizeof(int*) * x*y);
-    arB = (int**) mMC.malloc(sizeof(int*) * x*y);
-    arC = (int**) mMC.malloc(sizeof(int*) * x*y);
-}
-
-
-__global__ void fillArrays(int length, int* d, ScatterAllocator::AllocatorHandle mMC){
-    int id = threadIdx.x + blockIdx.x*blockDim.x;
-
-    arA[id] = (int*) mMC.malloc(sizeof(int)*length);
-    arB[id] = (int*) mMC.malloc(sizeof(int)*length);
-    arC[id] = (int*) mMC.malloc(sizeof(int)*length);
-
-    for(int i=0 ; i<length; ++i){
-        arA[id][i] = id*length+i;
-        arB[id][i] = id*length+i;
-    }
-}
-
-
-__global__ void addArrays(int length, int* d){
-    int id = threadIdx.x + blockIdx.x*blockDim.x;
-
-    d[id] = 0;
-    for(int i=0 ; i<length; ++i){
-        arC[id][i] = arA[id][i] + arB[id][i];
-        d[id] += arC[id][i];
-    }
-}
-
-
-__global__ void freeArrays(ScatterAllocator::AllocatorHandle mMC){
-    int id = threadIdx.x + blockIdx.x*blockDim.x;
-    mMC.free(arA[id]);
-    mMC.free(arB[id]);
-    mMC.free(arC[id]);
-}
-
-
-__global__ void freeArrayPointers(ScatterAllocator::AllocatorHandle mMC){
-    mMC.free(arA);
-    mMC.free(arB);
-    mMC.free(arC);
-}
-
-
-void run()
-{
-    size_t block = 32;
-    size_t grid = 32;
-    int length = 100;
-    assert((unsigned)length <= block*grid); //necessary for used algorithm
-
-    //init the heap
-    std::cerr << "initHeap...";
-    ScatterAllocator mMC(1U*1024U*1024U*1024U); //1GB for device-side malloc
-    std::cerr << "done" << std::endl;
-
-    // device-side pointers
-    int*  d;
-    cudaMalloc((void**) &d, sizeof(int)*block*grid);
-
-    // host-side pointers
-    std::vector<int> array_sums(block*grid,0);
-
-    // create arrays of arrays on the device
-    createArrayPointers<<<1,1>>>(grid, block, mMC );
-
-    // fill 2 of them all with ascending values
-    fillArrays<<<grid,block>>>(length, d, mMC );
-
-    // add the 2 arrays (vector addition within each thread)
-    // and do a thread-wise reduce to d
-    addArrays<<<grid,block>>>(length, d);
-
-    cudaMemcpy(&array_sums[0], d, sizeof(int)*block*grid, cudaMemcpyDeviceToHost);
-
-    int sum = std::accumulate(array_sums.begin(), array_sums.end(), 0);
-    std::cout << "The sum of the arrays on GPU is " << sum << std::endl;
-
-    int n = block*grid*length;
-    int gaussian = n*(n-1);
-    std::cout << "The gaussian sum as comparison: " << gaussian << std::endl;
-
-    // checking the free memory of the allocator
-    if(mallocMC::Traits<ScatterAllocator>::providesAvailableSlots){
-        std::cout << "there are ";
-        std::cout << mMC.getAvailableSlots(1024U*1024U);
-        std::cout << " Slots of size 1MB available" << std::endl;
-    }
-
-    freeArrays<<<grid, block>>>( mMC );
-    freeArrayPointers<<<1, 1>>>( mMC );
-    cudaFree(d);
-
-}
diff --git a/thirdParty/mallocMC/examples/mallocMC_example03.cpp b/thirdParty/mallocMC/examples/mallocMC_example03.cpp
new file mode 100644
index 0000000000..79079dd28d
--- /dev/null
+++ b/thirdParty/mallocMC/examples/mallocMC_example03.cpp
@@ -0,0 +1,114 @@
+/*
+  mallocMC: Memory Allocator for Many Core Architectures.
+  https://www.hzdr.de/crp
+
+  Copyright 2014 Institute of Radiation Physics,
+                 Helmholtz-Zentrum Dresden - Rossendorf
+
+  Author(s):  Carlchristian Eckert - c.eckert ( at ) hzdr.de
+
+  Permission is hereby granted, free of charge, to any person obtaining a copy
+  of this software and associated documentation files (the "Software"), to deal
+  in the Software without restriction, including without limitation the rights
+  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+  copies of the Software, and to permit persons to whom the Software is
+  furnished to do so, subject to the following conditions:
+
+  The above copyright notice and this permission notice shall be included in
+  all copies or substantial portions of the Software.
+
+  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+  THE SOFTWARE.
+*/
+
+#include <alpaka/alpaka.hpp>
+#include <cassert>
+#include <iostream>
+#include <mallocMC/mallocMC.hpp>
+#include <numeric>
+#include <vector>
+
+using Dim = alpaka::DimInt<1>;
+using Idx = std::size_t;
+// using Acc = alpaka::AccCpuThreads<Dim, Idx>;
+// using Acc = alpaka::AccCpuOmp2Threads<Dim, Idx>;
+using Acc = alpaka::AccGpuCudaRt<Dim, Idx>;
+
+struct ScatterConfig
+{
+    static constexpr auto pagesize = 4096;
+    static constexpr auto accessblocks = 8;
+    static constexpr auto regionsize = 16;
+    static constexpr auto wastefactor = 2;
+    static constexpr auto resetfreedpages = false;
+};
+
+struct ScatterHashParams
+{
+    static constexpr auto hashingK = 38183;
+    static constexpr auto hashingDistMP = 17497;
+    static constexpr auto hashingDistWP = 1;
+    static constexpr auto hashingDistWPRel = 1;
+};
+
+struct AlignmentConfig
+{
+    static constexpr auto dataAlignment = 16;
+};
+
+using ScatterAllocator = mallocMC::Allocator<
+    Acc,
+    mallocMC::CreationPolicies::Scatter<ScatterConfig, ScatterHashParams>,
+    mallocMC::DistributionPolicies::Noop,
+    mallocMC::OOMPolicies::ReturnNull,
+    mallocMC::ReservePoolPolicies::AlpakaBuf<Acc>,
+    mallocMC::AlignmentPolicies::Shrink<AlignmentConfig>>;
+
+ALPAKA_STATIC_ACC_MEM_GLOBAL int* arA = nullptr;
+
+struct ExampleKernel
+{
+    ALPAKA_FN_ACC void operator()(const Acc& acc, ScatterAllocator::AllocatorHandle allocHandle) const
+    {
+        const auto id = static_cast<uint32_t>(alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0]);
+        if(id == 0)
+            arA = (int*) allocHandle.malloc(acc, sizeof(int) * 32);
+        // wait until the malloc from thread zero has finished so it does not change the result for some threads
+        alpaka::syncBlockThreads(acc);
+        const auto slots = allocHandle.getAvailableSlots(acc, 1);
+        if(arA != nullptr)
+        {
+            arA[id] = id;
+            printf("id: %u array: %d slots %u\n", id, arA[id], slots);
+        }
+        else
+            printf("error: device size allocation failed");
+
+        // wait until all threads have read from `arA`
+        alpaka::syncBlockThreads(acc);
+        if(id == 0)
+            allocHandle.free(acc, arA);
+    }
+};
+
+auto main() -> int
+{
+    const auto dev = alpaka::getDevByIdx<Acc>(0);
+    auto queue = alpaka::Queue<Acc, alpaka::Blocking>{dev};
+
+    ScatterAllocator scatterAlloc(dev, queue, 1U * 1024U * 1024U * 1024U); // 1GB for device-side malloc
+
+    const auto workDiv = alpaka::WorkDivMembers<Dim, Idx>{Idx{1}, Idx{32}, Idx{1}};
+    alpaka::enqueue(
+        queue,
+        alpaka::createTaskKernel<Acc>(workDiv, ExampleKernel{}, scatterAlloc.getAllocatorHandle()));
+
+    std::cout << "Slots from Host: " << scatterAlloc.getAvailableSlots(dev, queue, 1) << '\n';
+
+    return 0;
+}
diff --git a/thirdParty/mallocMC/examples/mallocMC_example03.cu b/thirdParty/mallocMC/examples/mallocMC_example03.cu
deleted file mode 100644
index a3a271b936..0000000000
--- a/thirdParty/mallocMC/examples/mallocMC_example03.cu
+++ /dev/null
@@ -1,119 +0,0 @@
-/*
-  mallocMC: Memory Allocator for Many Core Architectures.
-  https://www.hzdr.de/crp
-
-  Copyright 2014 Institute of Radiation Physics,
-                 Helmholtz-Zentrum Dresden - Rossendorf
-
-  Author(s):  Carlchristian Eckert - c.eckert ( at ) hzdr.de
-
-  Permission is hereby granted, free of charge, to any person obtaining a copy
-  of this software and associated documentation files (the "Software"), to deal
-  in the Software without restriction, including without limitation the rights
-  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-  copies of the Software, and to permit persons to whom the Software is
-  furnished to do so, subject to the following conditions:
-
-  The above copyright notice and this permission notice shall be included in
-  all copies or substantial portions of the Software.
-
-  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-  THE SOFTWARE.
-*/
-
-#include <iostream>
-#include <assert.h>
-#include <vector>
-#include <numeric>
-#include <stdio.h>
-
-#include <cuda.h>
-#include <boost/mpl/int.hpp>
-#include <boost/mpl/bool.hpp>
-
-
-///////////////////////////////////////////////////////////////////////////////
-// includes for mallocMC
-///////////////////////////////////////////////////////////////////////////////
-#include "src/include/mallocMC/mallocMC_hostclass.hpp"
-
-#include "src/include/mallocMC/CreationPolicies.hpp"
-#include "src/include/mallocMC/DistributionPolicies.hpp"
-#include "src/include/mallocMC/OOMPolicies.hpp"
-#include "src/include/mallocMC/ReservePoolPolicies.hpp"
-#include "src/include/mallocMC/AlignmentPolicies.hpp"
-
-
-///////////////////////////////////////////////////////////////////////////////
-// Configuration for mallocMC
-///////////////////////////////////////////////////////////////////////////////
-
-// configurate the CreationPolicy "Scatter"
-struct ScatterConfig{
-    typedef boost::mpl::int_<4096>  pagesize;
-    typedef boost::mpl::int_<8>     accessblocks;
-    typedef boost::mpl::int_<16>    regionsize;
-    typedef boost::mpl::int_<2>     wastefactor;
-    typedef boost::mpl::bool_<false> resetfreedpages;
-};
-
-struct ScatterHashParams{
-    typedef boost::mpl::int_<38183> hashingK;
-    typedef boost::mpl::int_<17497> hashingDistMP;
-    typedef boost::mpl::int_<1>     hashingDistWP;
-    typedef boost::mpl::int_<1>     hashingDistWPRel;
-};
-
-
-// configure the AlignmentPolicy "Shrink"
-struct AlignmentConfig{
-    typedef boost::mpl::int_<16> dataAlignment;
-};
-
-// Define a new mMCator and call it ScatterAllocator
-// which resembles the behaviour of ScatterAlloc
-typedef mallocMC::Allocator<
-    mallocMC::CreationPolicies::Scatter<ScatterConfig, ScatterHashParams>,
-    mallocMC::DistributionPolicies::Noop,
-    mallocMC::OOMPolicies::ReturnNull,
-    mallocMC::ReservePoolPolicies::SimpleCudaMalloc,
-    mallocMC::AlignmentPolicies::Shrink<AlignmentConfig>
-> ScatterAllocator;
-
-///////////////////////////////////////////////////////////////////////////////
-// End of mallocMC configuration
-///////////////////////////////////////////////////////////////////////////////
-
-
-__device__ int* arA;
-
-
-__global__ void exampleKernel(ScatterAllocator::AllocatorHandle mMC){
-    unsigned x = 42;
-    if(threadIdx.x==0)
-        arA = (int*) mMC.malloc(sizeof(int) * 32);
-
-    x = mMC.getAvailableSlots(1);
-    __syncthreads();
-    arA[threadIdx.x] = threadIdx.x;
-    printf("tid: %d array: %d slots %d\n", threadIdx.x, arA[threadIdx.x],x);
-
-    if(threadIdx.x == 0)
-        mMC.free(arA);
-}
-
-
-int main()
-{
-    ScatterAllocator mMC(1U*1024U*1024U*1024U); //1GB for device-side malloc
-
-    exampleKernel<<<1,32>>>( mMC );
-    std::cout << "Slots from Host: " << mMC.getAvailableSlots(1) << std::endl;
-
-    return 0;
-}
diff --git a/thirdParty/mallocMC/src/include/mallocMC/AlignmentPolicies.hpp b/thirdParty/mallocMC/src/include/mallocMC/AlignmentPolicies.hpp
deleted file mode 100644
index c471b696ab..0000000000
--- a/thirdParty/mallocMC/src/include/mallocMC/AlignmentPolicies.hpp
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
-  mallocMC: Memory Allocator for Many Core Architectures.
-  https://www.hzdr.de/crp
-
-  Copyright 2014 Institute of Radiation Physics,
-                 Helmholtz-Zentrum Dresden - Rossendorf
-
-  Author(s):  Carlchristian Eckert - c.eckert ( at ) hzdr.de
-
-  Permission is hereby granted, free of charge, to any person obtaining a copy
-  of this software and associated documentation files (the "Software"), to deal
-  in the Software without restriction, including without limitation the rights
-  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-  copies of the Software, and to permit persons to whom the Software is
-  furnished to do so, subject to the following conditions:
-
-  The above copyright notice and this permission notice shall be included in
-  all copies or substantial portions of the Software.
-
-  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-  THE SOFTWARE.
-*/
-
-#pragma once
-
-#include "alignmentPolicies/Shrink.hpp"
-#include "alignmentPolicies/Shrink_impl.hpp"
-
-#include "alignmentPolicies/Noop.hpp"
-#include "alignmentPolicies/Noop_impl.hpp"
-
diff --git a/thirdParty/mallocMC/src/include/mallocMC/CreationPolicies.hpp b/thirdParty/mallocMC/src/include/mallocMC/CreationPolicies.hpp
deleted file mode 100644
index 56f6e23f14..0000000000
--- a/thirdParty/mallocMC/src/include/mallocMC/CreationPolicies.hpp
+++ /dev/null
@@ -1,35 +0,0 @@
-/*
-  mallocMC: Memory Allocator for Many Core Architectures.
-  https://www.hzdr.de/crp
-
-  Copyright 2014 Institute of Radiation Physics,
-                 Helmholtz-Zentrum Dresden - Rossendorf
-
-  Author(s):  Carlchristian Eckert - c.eckert ( at ) hzdr.de
-
-  Permission is hereby granted, free of charge, to any person obtaining a copy
-  of this software and associated documentation files (the "Software"), to deal
-  in the Software without restriction, including without limitation the rights
-  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-  copies of the Software, and to permit persons to whom the Software is
-  furnished to do so, subject to the following conditions:
-
-  The above copyright notice and this permission notice shall be included in
-  all copies or substantial portions of the Software.
-
-  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-  THE SOFTWARE.
-*/
-
-#pragma once
-
-#include "creationPolicies/Scatter.hpp"
-#include "creationPolicies/Scatter_impl.hpp"
-
-#include "creationPolicies/OldMalloc.hpp"
-#include "creationPolicies/OldMalloc_impl.hpp"
diff --git a/thirdParty/mallocMC/src/include/mallocMC/DistributionPolicies.hpp b/thirdParty/mallocMC/src/include/mallocMC/DistributionPolicies.hpp
deleted file mode 100644
index f534c57d0c..0000000000
--- a/thirdParty/mallocMC/src/include/mallocMC/DistributionPolicies.hpp
+++ /dev/null
@@ -1,35 +0,0 @@
-/*
-  mallocMC: Memory Allocator for Many Core Architectures.
-  https://www.hzdr.de/crp
-
-  Copyright 2014 Institute of Radiation Physics,
-                 Helmholtz-Zentrum Dresden - Rossendorf
-
-  Author(s):  Carlchristian Eckert - c.eckert ( at ) hzdr.de
-
-  Permission is hereby granted, free of charge, to any person obtaining a copy
-  of this software and associated documentation files (the "Software"), to deal
-  in the Software without restriction, including without limitation the rights
-  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-  copies of the Software, and to permit persons to whom the Software is
-  furnished to do so, subject to the following conditions:
-
-  The above copyright notice and this permission notice shall be included in
-  all copies or substantial portions of the Software.
-
-  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-  THE SOFTWARE.
-*/
-
-#pragma once
-
-#include "distributionPolicies/Noop.hpp"
-#include "distributionPolicies/Noop_impl.hpp"
-
-#include "distributionPolicies/XMallocSIMD.hpp"
-#include "distributionPolicies/XMallocSIMD_impl.hpp"
diff --git a/thirdParty/mallocMC/src/include/mallocMC/OOMPolicies.hpp b/thirdParty/mallocMC/src/include/mallocMC/OOMPolicies.hpp
deleted file mode 100644
index 67eda5197d..0000000000
--- a/thirdParty/mallocMC/src/include/mallocMC/OOMPolicies.hpp
+++ /dev/null
@@ -1,35 +0,0 @@
-/*
-  mallocMC: Memory Allocator for Many Core Architectures.
-  https://www.hzdr.de/crp
-
-  Copyright 2014 Institute of Radiation Physics,
-                 Helmholtz-Zentrum Dresden - Rossendorf
-
-  Author(s):  Carlchristian Eckert - c.eckert ( at ) hzdr.de
-
-  Permission is hereby granted, free of charge, to any person obtaining a copy
-  of this software and associated documentation files (the "Software"), to deal
-  in the Software without restriction, including without limitation the rights
-  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-  copies of the Software, and to permit persons to whom the Software is
-  furnished to do so, subject to the following conditions:
-
-  The above copyright notice and this permission notice shall be included in
-  all copies or substantial portions of the Software.
-
-  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-  THE SOFTWARE.
-*/
-
-#pragma once
-
-#include "oOMPolicies/ReturnNull.hpp"
-#include "oOMPolicies/ReturnNull_impl.hpp"
-
-#include "oOMPolicies/BadAllocException.hpp"
-#include "oOMPolicies/BadAllocException_impl.hpp"
diff --git a/thirdParty/mallocMC/src/include/mallocMC/ReservePoolPolicies.hpp b/thirdParty/mallocMC/src/include/mallocMC/ReservePoolPolicies.hpp
deleted file mode 100644
index 1cc3aa7b15..0000000000
--- a/thirdParty/mallocMC/src/include/mallocMC/ReservePoolPolicies.hpp
+++ /dev/null
@@ -1,35 +0,0 @@
-/*
-  mallocMC: Memory Allocator for Many Core Architectures.
-  https://www.hzdr.de/crp
-
-  Copyright 2014 Institute of Radiation Physics,
-                 Helmholtz-Zentrum Dresden - Rossendorf
-
-  Author(s):  Carlchristian Eckert - c.eckert ( at ) hzdr.de
-
-  Permission is hereby granted, free of charge, to any person obtaining a copy
-  of this software and associated documentation files (the "Software"), to deal
-  in the Software without restriction, including without limitation the rights
-  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-  copies of the Software, and to permit persons to whom the Software is
-  furnished to do so, subject to the following conditions:
-
-  The above copyright notice and this permission notice shall be included in
-  all copies or substantial portions of the Software.
-
-  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-  THE SOFTWARE.
-*/
-
-#pragma once
-
-#include "reservePoolPolicies/SimpleCudaMalloc.hpp"
-#include "reservePoolPolicies/SimpleCudaMalloc_impl.hpp"
-
-#include "reservePoolPolicies/CudaSetLimits.hpp"
-#include "reservePoolPolicies/CudaSetLimits_impl.hpp"
diff --git a/thirdParty/mallocMC/src/include/mallocMC/alignmentPolicies/Noop.hpp b/thirdParty/mallocMC/src/include/mallocMC/alignmentPolicies/Noop.hpp
index 3879da6faf..3f43c93654 100644
--- a/thirdParty/mallocMC/src/include/mallocMC/alignmentPolicies/Noop.hpp
+++ b/thirdParty/mallocMC/src/include/mallocMC/alignmentPolicies/Noop.hpp
@@ -27,16 +27,42 @@
 
 #pragma once
 
-namespace mallocMC{
-namespace AlignmentPolicies{
-
-  /**
-   * @brief a policy that does nothing
-   *
-   * This AlignmentPolicy will not perform any distribution, but only return
-   * its input (identity function)
-   */
-  class Noop;
-
-} //namespace AlignmentPolicies
-} //namespace mallocMC
+#include "Noop.hpp"
+
+#include <alpaka/core/Common.hpp>
+#include <cstdint>
+#include <string>
+#include <tuple>
+
+namespace mallocMC
+{
+    namespace AlignmentPolicies
+    {
+        /**
+         * @brief a policy that does nothing
+         *
+         * This AlignmentPolicy will not perform any alignment, but only
+         * return its input (identity function)
+         */
+        class Noop
+        {
+        public:
+            static auto alignPool(void* memory, size_t memsize) -> std::tuple<void*, size_t>
+            { // pool pointer and size are handed back unchanged
+                return std::make_tuple(memory, memsize);
+            }
+
+            ALPAKA_FN_HOST_ACC
+            static auto applyPadding(size_t bytes) -> size_t
+            { // no padding is added to the requested size
+                return bytes;
+            }
+
+            static auto classname() -> std::string
+            { // name of this policy as a string
+                return "Noop";
+            }
+        };
+
+    } // namespace AlignmentPolicies
+} // namespace mallocMC
diff --git a/thirdParty/mallocMC/src/include/mallocMC/alignmentPolicies/Noop_impl.hpp b/thirdParty/mallocMC/src/include/mallocMC/alignmentPolicies/Noop_impl.hpp
deleted file mode 100644
index e1c034d052..0000000000
--- a/thirdParty/mallocMC/src/include/mallocMC/alignmentPolicies/Noop_impl.hpp
+++ /dev/null
@@ -1,60 +0,0 @@
-/*
-  mallocMC: Memory Allocator for Many Core Architectures.
-
-  Copyright 2014 Institute of Radiation Physics,
-                 Helmholtz-Zentrum Dresden - Rossendorf
-
-  Author(s):  Carlchristian Eckert - c.eckert ( at ) hzdr.de
-
-  Permission is hereby granted, free of charge, to any person obtaining a copy
-  of this software and associated documentation files (the "Software"), to deal
-  in the Software without restriction, including without limitation the rights
-  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-  copies of the Software, and to permit persons to whom the Software is
-  furnished to do so, subject to the following conditions:
-
-  The above copyright notice and this permission notice shall be included in
-  all copies or substantial portions of the Software.
-
-  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-  THE SOFTWARE.
-*/
-
-#pragma once
-
-#include <boost/cstdint.hpp>
-#include <string>
-
-#include "Noop.hpp"
-#include "../mallocMC_prefixes.hpp"
-
-namespace mallocMC{
-namespace AlignmentPolicies{
-
-  class Noop{
-    typedef boost::uint32_t uint32;
-
-    public:
-
-    static boost::tuple<void*,size_t> alignPool(void* memory, size_t memsize){
-      return boost::make_tuple(memory,memsize);
-    }
-
-    MAMC_HOST MAMC_ACCELERATOR
-    static uint32 applyPadding(uint32 bytes){
-      return bytes;
-    }
-
-    static std::string classname(){
-      return "Noop";
-    }
-
-  };
-
-} //namespace AlignmentPolicies
-} //namespace mallocMC
diff --git a/thirdParty/mallocMC/src/include/mallocMC/alignmentPolicies/Shrink.hpp b/thirdParty/mallocMC/src/include/mallocMC/alignmentPolicies/Shrink.hpp
index 4bafeff86f..759d5ea580 100644
--- a/thirdParty/mallocMC/src/include/mallocMC/alignmentPolicies/Shrink.hpp
+++ b/thirdParty/mallocMC/src/include/mallocMC/alignmentPolicies/Shrink.hpp
@@ -31,31 +31,119 @@
 
 #pragma once
 
-#include <boost/mpl/int.hpp>
-
-namespace mallocMC{
-namespace AlignmentPolicies{
-
-namespace ShrinkConfig{
-  struct DefaultShrinkConfig{
-    typedef boost::mpl::int_<16> dataAlignment;
-  };
-}
-
-  /**
-   * @brief Provides proper alignment of pool and pads memory requests
-   *
-   * This AlignmentPolicy is based on ideas from ScatterAlloc
-   * (http://ieeexplore.ieee.org/xpl/articleDetails.jsp?arnumber=6339604). It
-   * performs alignment operations on big memory pools and requests to allocate
-   * memory. Memory pools are truncated at the beginning until the pointer to
-   * the memory fits the alignment. Requests to allocate memory are padded
-   * until their size is a multiple of the alignment.
-   *
-   * @tparam T_Config (optional) The alignment to use
-   */
-  template<typename T_Config = ShrinkConfig::DefaultShrinkConfig>
-  class Shrink;
-
-} //namespace AlignmentPolicies
-} //namespace mallocMC
+#include "Shrink.hpp"
+
+#include <alpaka/core/Common.hpp>
+#include <cstdint>
+#include <iostream>
+#include <sstream>
+#include <string>
+
+namespace mallocMC
+{
+    namespace AlignmentPolicies
+    {
+        namespace Shrink2NS // maps a size in bytes (PSIZE) to an unsigned integer type
+        {
+            template<int PSIZE>
+            struct __PointerEquivalent
+            {
+                using type = unsigned int; // used for every PSIZE without a specialization
+            };
+            template<>
+            struct __PointerEquivalent<8>
+            {
+                using type = unsigned long long; // used when PSIZE == 8
+            };
+        } // namespace Shrink2NS
+
+        namespace ShrinkConfig
+        {
+            struct DefaultShrinkConfig // default T_Config of the Shrink policy
+            {
+                static constexpr auto dataAlignment = 16; // alignment in bytes; Shrink requires a power of 2
+            };
+        } // namespace ShrinkConfig
+
+        /**
+         * @brief Provides proper alignment of pool and pads memory requests
+         *
+         * This AlignmentPolicy is based on ideas from ScatterAlloc
+         * (http://ieeexplore.ieee.org/xpl/articleDetails.jsp?arnumber=6339604).
+         * It performs alignment operations on big memory pools and requests to
+         * allocate memory. Memory pools are truncated at the beginning until
+         * the pointer to the memory fits the alignment. Requests to allocate
+         * memory are padded until their size is a multiple of the alignment.
+         *
+         * @tparam T_Config (optional) configuration struct providing `dataAlignment`
+         */
+        template<typename T_Config = ShrinkConfig::DefaultShrinkConfig>
+        class Shrink
+        {
+        public:
+            using Properties = T_Config;
+
+        private:
+            using PointerEquivalent = Shrink2NS::__PointerEquivalent<sizeof(char*)>::type;
+
+/** Allow for a hierarchical validation of parameters:
+ *
+ * shipped default-parameters (in the default configuration struct) have lowest
+ * precedence. They will be overridden by a given configuration struct. However,
+ * even the given configuration struct can be overridden by compile-time command
+ * line parameters (e.g. -DMALLOCMC_AP_SHRINK_DATAALIGNMENT=128)
+ *
+ * default-struct < template-struct < command-line parameter
+ */
+#ifndef MALLOCMC_AP_SHRINK_DATAALIGNMENT
+#    define MALLOCMC_AP_SHRINK_DATAALIGNMENT (Properties::dataAlignment)
+#endif
+            static constexpr size_t dataAlignment = MALLOCMC_AP_SHRINK_DATAALIGNMENT;
+
+            // dataAlignment must be a power of 2, the bit masks below rely on it
+            static_assert(
+                dataAlignment != 0 && (dataAlignment & (dataAlignment - 1)) == 0,
+                "dataAlignment must also be a power of 2");
+
+        public:
+            static auto alignPool(void* memory, size_t memsize) -> std::tuple<void*, size_t>
+            { // returns the pool start rounded up to dataAlignment and the size that remains behind it
+                PointerEquivalent alignmentstatus = ((PointerEquivalent) memory) & (dataAlignment - 1);
+                if(alignmentstatus != 0)
+                {
+                    std::cout << "Heap Warning: memory to use not " << dataAlignment << " byte aligned...\n"
+                              << "Before:\n"
+                              << "dataAlignment:   " << dataAlignment << '\n'
+                              << "Alignmentstatus: " << alignmentstatus << '\n'
+                              << "size_t memsize   " << memsize << " byte" << '\n'
+                              << "void *memory     " << memory << '\n';
+
+                    memory = (void*) (((PointerEquivalent) memory) + dataAlignment - alignmentstatus);
+                    memsize -= dataAlignment - (size_t) alignmentstatus; // exactly the bytes skipped above
+
+                    std::cout << "Was shrunk automatically to: " << '\n'
+                              << "size_t memsize   " << memsize << " byte" << '\n'
+                              << "void *memory     " << memory << '\n';
+                }
+
+                return std::make_tuple(memory, memsize);
+            }
+
+            ALPAKA_FN_HOST_ACC
+            static auto applyPadding(size_t bytes) -> size_t
+            {
+                constexpr auto bitsToClear = dataAlignment - 1;
+                return (bytes + bitsToClear) & ~bitsToClear; // round up to the next multiple of dataAlignment
+            }
+
+            ALPAKA_FN_HOST
+            static auto classname() -> std::string
+            { // policy name including the alignment in use, e.g. "Shrink[16]"
+                std::stringstream ss;
+                ss << "Shrink[" << dataAlignment << "]";
+                return ss.str();
+            }
+        };
+
+    } // namespace AlignmentPolicies
+} // namespace mallocMC
diff --git a/thirdParty/mallocMC/src/include/mallocMC/alignmentPolicies/Shrink_impl.hpp b/thirdParty/mallocMC/src/include/mallocMC/alignmentPolicies/Shrink_impl.hpp
deleted file mode 100644
index 607575198b..0000000000
--- a/thirdParty/mallocMC/src/include/mallocMC/alignmentPolicies/Shrink_impl.hpp
+++ /dev/null
@@ -1,123 +0,0 @@
-/*
-  mallocMC: Memory Allocator for Many Core Architectures.
-  http://www.icg.tugraz.at/project/mvp
-
-  Copyright (C) 2012 Institute for Computer Graphics and Vision,
-                     Graz University of Technology
-  Copyright (C) 2014 Institute of Radiation Physics,
-                     Helmholtz-Zentrum Dresden - Rossendorf
-
-  Author(s):  Markus Steinberger - steinberger ( at ) icg.tugraz.at
-              Carlchristian Eckert - c.eckert ( at ) hzdr.de
-
-  Permission is hereby granted, free of charge, to any person obtaining a copy
-  of this software and associated documentation files (the "Software"), to deal
-  in the Software without restriction, including without limitation the rights
-  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-  copies of the Software, and to permit persons to whom the Software is
-  furnished to do so, subject to the following conditions:
-
-  The above copyright notice and this permission notice shall be included in
-  all copies or substantial portions of the Software.
-
-  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-  THE SOFTWARE.
-*/
-
-#pragma once
-
-#include <boost/cstdint.hpp>
-#include <boost/static_assert.hpp>
-#include <iostream>
-#include <string>
-#include <sstream>
-#include <boost/tuple/tuple.hpp>
-
-#include "Shrink.hpp"
-#include "../mallocMC_prefixes.hpp"
-
-namespace mallocMC{
-namespace AlignmentPolicies{
-
-namespace Shrink2NS{
-    
-  template<int PSIZE> struct __PointerEquivalent{ typedef unsigned int type;};
-  template<>
-  struct __PointerEquivalent<8>{ typedef unsigned long long int type; };
-}// namespace ShrinkNS
-
-  template<typename T_Config>
-  class Shrink{
-    public:
-    typedef T_Config Properties;
-
-    private:
-    typedef boost::uint32_t uint32;
-    typedef Shrink2NS::__PointerEquivalent<sizeof(char*)>::type PointerEquivalent;
-
-/** Allow for a hierarchical validation of parameters:
- *
- * shipped default-parameters (in the inherited struct) have lowest precedence.
- * They will be overridden by a given configuration struct. However, even the
- * given configuration struct can be overridden by compile-time command line
- * parameters (e.g. -D MALLOCMC_AP_SHRINK_DATAALIGNMENT 128)
- *
- * default-struct < template-struct < command-line parameter
- */
-#ifndef MALLOCMC_AP_SHRINK_DATAALIGNMENT
-#define MALLOCMC_AP_SHRINK_DATAALIGNMENT Properties::dataAlignment::value
-#endif
-    BOOST_STATIC_CONSTEXPR uint32 dataAlignment = MALLOCMC_AP_SHRINK_DATAALIGNMENT;
-
-    // \TODO: The static_cast can be removed once the minimal dependencies of
-    //        this project is are at least CUDA 7.0 and gcc 4.8.2
-    BOOST_STATIC_ASSERT(static_cast<uint32>(dataAlignment) > 0);
-    //dataAlignment must also be a power of 2!
-    BOOST_STATIC_ASSERT(dataAlignment && !(dataAlignment & (dataAlignment-1)) ); 
-
-    public:
-    static boost::tuple<void*,size_t> alignPool(void* memory, size_t memsize){
-      PointerEquivalent alignmentstatus = ((PointerEquivalent)memory) & (dataAlignment -1);
-      if(alignmentstatus != 0)
-      {
-        std::cout << "Heap Warning: memory to use not ";
-        std::cout << dataAlignment << " byte aligned..."        << std::endl;
-        std::cout << "Before:"                                  << std::endl;
-        std::cout << "dataAlignment:   " << dataAlignment       << std::endl;
-        std::cout << "Alignmentstatus: " << alignmentstatus     << std::endl;
-        std::cout << "size_t memsize   " << memsize << " byte"  << std::endl;
-        std::cout << "void *memory     " << memory              << std::endl;
-
-        memory   = (void*)(((PointerEquivalent)memory) + dataAlignment - alignmentstatus);
-        memsize -= (size_t)dataAlignment + (size_t)alignmentstatus;
-
-        std::cout << "Was shrunk automatically to: "            << std::endl;
-        std::cout << "size_t memsize   " << memsize << " byte"  << std::endl;
-        std::cout << "void *memory     " << memory              << std::endl;
-      }
-
-      return boost::make_tuple(memory,memsize);
-    }
-
-    MAMC_HOST
-    MAMC_ACCELERATOR
-    static uint32 applyPadding(uint32 bytes){
-      return (bytes + dataAlignment - 1) & ~(dataAlignment-1);
-    }
-
-    MAMC_HOST
-    static std::string classname(){
-      std::stringstream ss;
-      ss << "Shrink[" << dataAlignment << "]";
-      return ss.str();
-    }
-
-  };
-
-} //namespace AlignmentPolicies
-} //namespace mallocMC
diff --git a/thirdParty/mallocMC/src/include/mallocMC/allocator.hpp b/thirdParty/mallocMC/src/include/mallocMC/allocator.hpp
index a811b48853..0c0be0f5e1 100644
--- a/thirdParty/mallocMC/src/include/mallocMC/allocator.hpp
+++ b/thirdParty/mallocMC/src/include/mallocMC/allocator.hpp
@@ -28,54 +28,51 @@
 
 #pragma once
 
-#include "mallocMC_utils.hpp"
+#include "device_allocator.hpp"
+#include "mallocMC_allocator_handle.hpp"
 #include "mallocMC_constraints.hpp"
-#include "mallocMC_prefixes.hpp"
 #include "mallocMC_traits.hpp"
-#include "mallocMC_allocator_handle.hpp"
+#include "mallocMC_utils.hpp"
 
-#include <boost/cstdint.hpp>
-#include <boost/tuple/tuple.hpp>
-#include <boost/static_assert.hpp>
+#include <alpaka/alpaka.hpp>
+#include <cstdint>
+#include <memory>
 #include <sstream>
+#include <tuple>
 #include <vector>
 
-namespace mallocMC{
-
-namespace detail{
-
-    template<
-        typename T_Allocator,
-        bool T_providesAvailableSlots
-    >
-    struct GetAvailableSlotsIfAvailHost
+namespace mallocMC
+{
+    namespace detail
     {
-        MAMC_HOST static
-        unsigned
-        getAvailableSlots(
-            size_t,
-            T_Allocator &
-        )
+        template<typename T_Allocator, bool T_providesAvailableSlots>
+        struct GetAvailableSlotsIfAvailHost
         {
-            return 0;
-        }
-    };
-
-    template<class T_Allocator>
-    struct GetAvailableSlotsIfAvailHost<T_Allocator, true>
-    {
-        MAMC_HOST
-        static unsigned
-        getAvailableSlots(
-            size_t slotSize,
-            T_Allocator& alloc
-        ){
-            return T_Allocator::CreationPolicy::getAvailableSlotsHost(slotSize, alloc.getAllocatorHandle().devAllocator);
-        }
-    };
-
-}
-
+            template<typename AlpakaAcc, typename AlpakaDevice, typename AlpakaQueue>
+            ALPAKA_FN_HOST static auto getAvailableSlots(AlpakaDevice&, AlpakaQueue&, size_t, T_Allocator&) -> unsigned
+            {
+                return 0;
+            }
+        };
+
+        template<class T_Allocator>
+        struct GetAvailableSlotsIfAvailHost<T_Allocator, true> // specialization for T_providesAvailableSlots == true
+        {
+            template<typename AlpakaAcc, typename AlpakaDevice, typename AlpakaQueue>
+            ALPAKA_FN_HOST static auto getAvailableSlots(
+                AlpakaDevice& dev,
+                AlpakaQueue& queue,
+                size_t slotSize,
+                T_Allocator& alloc) -> unsigned
+            { // forwards the query to the allocator's CreationPolicy, passing the device allocator of its handle
+                return T_Allocator::CreationPolicy::template getAvailableSlotsHost<AlpakaAcc>(
+                    dev,
+                    queue,
+                    slotSize,
+                    alloc.getAllocatorHandle().devAllocator);
+            }
+        };
+    } // namespace detail
 
     struct HeapInfo
     {
@@ -86,9 +83,9 @@ namespace detail{
     /**
      * @brief "HostClass" that combines all policies to a useful allocator
      *
-     * This class implements the necessary glue-logic to form an actual allocator
-     * from the provided policies. It implements the public interface and
-     * executes some constraint checking based on an instance of the class
+     * This class implements the necessary glue-logic to form an actual
+     * allocator from the provided policies. It implements the public interface
+     * and executes some constraint checking based on an instance of the class
      * PolicyConstraints.
      *
      * @tparam T_CreationPolicy The desired type of a CreationPolicy
@@ -98,72 +95,65 @@ namespace detail{
      * @tparam T_AlignmentPolicy The desired type of a AlignmentPolicy
      */
     template<
-       typename T_CreationPolicy,
-       typename T_DistributionPolicy,
-       typename T_OOMPolicy,
-       typename T_ReservePoolPolicy,
-       typename T_AlignmentPolicy
-    >
-    class Allocator :
-        public PolicyConstraints<
-            T_CreationPolicy,
-            T_DistributionPolicy,
-            T_OOMPolicy,
-            T_ReservePoolPolicy,
-            T_AlignmentPolicy
-        >
+        typename AlpakaAcc,
+        typename T_CreationPolicy,
+        typename T_DistributionPolicy,
+        typename T_OOMPolicy,
+        typename T_ReservePoolPolicy,
+        typename T_AlignmentPolicy>
+    class Allocator
+        : public PolicyConstraints<
+              T_CreationPolicy,
+              T_DistributionPolicy,
+              T_OOMPolicy,
+              T_ReservePoolPolicy,
+              T_AlignmentPolicy>
     {
-        typedef boost::uint32_t uint32;
+        using uint32 = std::uint32_t;
 
     public:
-        typedef T_CreationPolicy CreationPolicy;
-        typedef T_DistributionPolicy DistributionPolicy;
-        typedef T_OOMPolicy OOMPolicy;
-        typedef T_ReservePoolPolicy ReservePoolPolicy;
-        typedef T_AlignmentPolicy AlignmentPolicy;
-        typedef std::vector< HeapInfo > HeapInfoVector;
-        typedef DeviceAllocator<
-            CreationPolicy,
-            DistributionPolicy,
-            OOMPolicy,
-            AlignmentPolicy
-        > DevAllocator;
-        typedef AllocatorHandleImpl<Allocator> AllocatorHandle;
+        using CreationPolicy = T_CreationPolicy;
+        using DistributionPolicy = T_DistributionPolicy;
+        using OOMPolicy = T_OOMPolicy;
+        using ReservePoolPolicy = T_ReservePoolPolicy;
+        using AlignmentPolicy = T_AlignmentPolicy;
+        using HeapInfoVector = std::vector<HeapInfo>;
+        using DevAllocator = DeviceAllocator<CreationPolicy, DistributionPolicy, OOMPolicy, AlignmentPolicy>;
+        using AllocatorHandle = AllocatorHandleImpl<Allocator>;
 
     private:
-        AllocatorHandle allocatorHandle;
+        ReservePoolPolicy reservePolicy;
+        using DevAllocatorStorageBufferType
+            = alpaka::Buf<alpaka::Dev<AlpakaAcc>, DevAllocator, alpaka::DimInt<1>, int>;
+        std::unique_ptr<DevAllocatorStorageBufferType>
+            devAllocatorBuffer; // FIXME(bgruber): replace by std::optional<>
         HeapInfo heapInfos;
 
         /** allocate heap memory
          *
          * @param size number of bytes
          */
-        MAMC_HOST
-        void
-        alloc(
-            size_t size
-        )
+        template<typename AlpakaDevice, typename AlpakaQueue>
+        ALPAKA_FN_HOST void
+        /* `volatile size_t size` is required to defeat clang optimizations that
+         * result in runtime errors. Observed in PIConGPU if size is known at
+         * compile time. The volatile workaround has no negative effects on the
+         * register usage in CUDA.
+         */
+        alloc(AlpakaDevice& dev, AlpakaQueue& queue, volatile size_t size)
         {
-            void* pool = ReservePoolPolicy::setMemPool( size );
-            boost::tie(
-                pool,
-                size
-            ) = AlignmentPolicy::alignPool(
+            void* pool = reservePolicy.setMemPool(dev, size);
+            std::tie(pool, size) = AlignmentPolicy::alignPool(pool, size);
+
+            devAllocatorBuffer
+                = std::make_unique<DevAllocatorStorageBufferType>(alpaka::allocBuf<DevAllocator, int>(dev, 1));
+            CreationPolicy::template initHeap<AlpakaAcc>(
+                dev,
+                queue,
+                alpaka::getPtrNative(*devAllocatorBuffer),
                 pool,
-                size
-            );
-            DevAllocator* devAllocatorPtr;
-            cudaMalloc(
-                ( void** ) &devAllocatorPtr,
-                sizeof( DevAllocator )
-            );
-            CreationPolicy::initHeap(
-                devAllocatorPtr,
-                pool,
-                size
-            );
+                size);
 
-            allocatorHandle.devAllocator = devAllocatorPtr;
             heapInfos.p = pool;
             heapInfos.size = size;
         }
@@ -173,104 +163,82 @@ namespace detail{
          * Free all allocated memory.
          * After this call the instance is an in invalid state
          */
-        MAMC_HOST
-        void free()
+        ALPAKA_FN_HOST void free()
         {
-            cudaFree( allocatorHandle.devAllocator );
-            ReservePoolPolicy::resetMemPool( heapInfos.p );
-            allocatorHandle.devAllocator = NULL;
+            devAllocatorBuffer = {};
+            reservePolicy.resetMemPool(heapInfos.p);
             heapInfos.size = 0;
-            heapInfos.p = NULL;
+            heapInfos.p = nullptr;
         }
 
         /* forbid to copy the allocator */
-        MAMC_HOST
-        Allocator( const Allocator& );
+        ALPAKA_FN_HOST
+        Allocator(const Allocator&) = delete;
 
     public:
-
-
-        MAMC_HOST
-        Allocator(
-            size_t size = 8U * 1024U * 1024U
-        ) :
-            allocatorHandle( NULL )
+        template<typename AlpakaDevice, typename AlpakaQueue>
+        ALPAKA_FN_HOST Allocator(AlpakaDevice& dev, AlpakaQueue& queue, size_t size = 8U * 1024U * 1024U)
         {
-            alloc( size );
+            alloc(dev, queue, size);
         }
 
-        MAMC_HOST
-        ~Allocator( )
+        ALPAKA_FN_HOST
+        ~Allocator()
         {
-            free( );
+            free();
         }
 
         /** destroy current heap data and resize the heap
          *
          * @param size number of bytes
          */
-        MAMC_HOST
-        void
-        destructiveResize(
-            size_t size
-        )
+        template<typename AlpakaDevice, typename AlpakaQueue>
+        ALPAKA_FN_HOST void destructiveResize(AlpakaDevice& dev, AlpakaQueue& queue, size_t size)
         {
-            free( );
-            alloc( size );
+            free();
+            alloc(dev, queue, size);
         }
 
-        MAMC_HOST
-        AllocatorHandle
-        getAllocatorHandle( )
+        ALPAKA_FN_HOST
+        auto getAllocatorHandle() -> AllocatorHandle
         {
-            return allocatorHandle;
+            return AllocatorHandle{alpaka::getPtrNative(*devAllocatorBuffer)};
         }
 
-        MAMC_HOST
+        ALPAKA_FN_HOST
         operator AllocatorHandle()
         {
             return getAllocatorHandle();
         }
 
-        MAMC_HOST static
-        std::string
-        info(
-            std::string linebreak = " "
-        )
+        ALPAKA_FN_HOST static auto info(std::string linebreak = " ") -> std::string
         {
             std::stringstream ss;
-            ss << "CreationPolicy:      " << CreationPolicy::classname( ) << "    " << linebreak;
-            ss << "DistributionPolicy:  " << DistributionPolicy::classname( ) << "" << linebreak;
-            ss << "OOMPolicy:           " << OOMPolicy::classname( ) << "         " << linebreak;
-            ss << "ReservePoolPolicy:   " << ReservePoolPolicy::classname( ) << " " << linebreak;
-            ss << "AlignmentPolicy:     " << AlignmentPolicy::classname( ) << "   " << linebreak;
+            ss << "CreationPolicy:      " << CreationPolicy::classname() << "    " << linebreak;
+            ss << "DistributionPolicy:  " << DistributionPolicy::classname() << "" << linebreak;
+            ss << "OOMPolicy:           " << OOMPolicy::classname() << "         " << linebreak;
+            ss << "ReservePoolPolicy:   " << ReservePoolPolicy::classname() << " " << linebreak;
+            ss << "AlignmentPolicy:     " << AlignmentPolicy::classname() << "   " << linebreak;
             return ss.str();
         }
 
-        // polymorphism over the availability of getAvailableSlots for calling from the host
-        MAMC_HOST
-        unsigned
-        getAvailableSlots(
-            size_t slotSize
-        )
+        // polymorphism over the availability of getAvailableSlots for calling
+        // from the host
+        template<typename AlpakaDevice, typename AlpakaQueue>
+        ALPAKA_FN_HOST auto getAvailableSlots(AlpakaDevice& dev, AlpakaQueue& queue, size_t slotSize) -> unsigned
         {
-            slotSize = AlignmentPolicy::applyPadding( slotSize );
-            return detail::GetAvailableSlotsIfAvailHost<
-                Allocator,
-                Traits<Allocator>::providesAvailableSlots
-            >::getAvailableSlots( slotSize, *this );
+            slotSize = AlignmentPolicy::applyPadding(slotSize);
+            return detail::GetAvailableSlotsIfAvailHost<Allocator, Traits<Allocator>::providesAvailableSlots>::
+                template getAvailableSlots<AlpakaAcc>(dev, queue, slotSize, *this);
         }
 
-        MAMC_HOST
-        HeapInfoVector
-        getHeapLocations( )
+        ALPAKA_FN_HOST
+        auto getHeapLocations() -> HeapInfoVector
         {
-          HeapInfoVector v;
-          v.push_back( heapInfos );
-          return v;
+            HeapInfoVector v;
+            v.push_back(heapInfos);
+            return v;
         }
-
     };
 
-} //namespace mallocMC
-
+} // namespace mallocMC
diff --git a/thirdParty/mallocMC/src/include/mallocMC/creationPolicies/OldMalloc.hpp b/thirdParty/mallocMC/src/include/mallocMC/creationPolicies/OldMalloc.hpp
index e48b48902a..3f7c130e1e 100644
--- a/thirdParty/mallocMC/src/include/mallocMC/creationPolicies/OldMalloc.hpp
+++ b/thirdParty/mallocMC/src/include/mallocMC/creationPolicies/OldMalloc.hpp
@@ -27,18 +27,61 @@
 
 #pragma once
 
+#include "OldMalloc.hpp"
 
-namespace mallocMC{
-namespace CreationPolicies{
-    
-  /**
-   * @brief classic malloc/free behaviour known from CUDA
-   *
-   * This CreationPolicy implements the classic device-side malloc and free
-   * system calls that is offered by CUDA-capable accelerator of compute
-   * capability 2.0 and higher
-   */
-  class OldMalloc;
-
-} //namespace CreationPolicies
-} //namespace mallocMC
+#include <alpaka/core/Common.hpp>
+#include <cstdint>
+
+namespace mallocMC
+{
+    namespace CreationPolicies
+    {
+        /**
+         * @brief classic malloc/free behaviour known from CUDA
+         *
+         * This CreationPolicy implements the classic device-side malloc and
+         * free system calls that are offered by CUDA-capable accelerators of
+         * compute capability 2.0 and higher
+         */
+        class OldMalloc
+        {
+            using uint32 = std::uint32_t;
+
+        public:
+            static constexpr auto providesAvailableSlots = false; // this policy offers no slot counting
+
+            template<typename AlpakaAcc>
+            ALPAKA_FN_ACC auto create(const AlpakaAcc& acc, uint32 bytes) const -> void*
+            { // acc is not used; the request goes straight to the global malloc
+                return ::malloc(static_cast<size_t>(bytes));
+            }
+
+            template<typename AlpakaAcc>
+            ALPAKA_FN_ACC void destroy(const AlpakaAcc& /*acc*/, void* mem) const
+            { // hands mem to the global free
+                ::free(mem);
+            }
+
+            ALPAKA_FN_ACC auto isOOM(void* p, size_t s) const -> bool
+            { // true when s is non-zero and p is null
+                return s != 0 && (p == nullptr);
+            }
+
+            template<typename AlpakaAcc, typename AlpakaDevice, typename AlpakaQueue, typename T_DeviceAllocator>
+            static void initHeap(
+                AlpakaDevice& dev,
+                AlpakaQueue& queue,
+                T_DeviceAllocator* heap,
+                void* pool,
+                size_t memsize)
+            { // empty body: none of the arguments are used
+            }
+
+            static auto classname() -> std::string
+            { // policy name, printed by Allocator::info()
+                return "OldMalloc";
+            }
+        };
+
+    } // namespace CreationPolicies
+} // namespace mallocMC
diff --git a/thirdParty/mallocMC/src/include/mallocMC/creationPolicies/OldMalloc_impl.hpp b/thirdParty/mallocMC/src/include/mallocMC/creationPolicies/OldMalloc_impl.hpp
deleted file mode 100644
index 162eb661c3..0000000000
--- a/thirdParty/mallocMC/src/include/mallocMC/creationPolicies/OldMalloc_impl.hpp
+++ /dev/null
@@ -1,71 +0,0 @@
-/*
-  mallocMC: Memory Allocator for Many Core Architectures.
-
-  Copyright 2014 Institute of Radiation Physics,
-                 Helmholtz-Zentrum Dresden - Rossendorf
-
-  Author(s):  Carlchristian Eckert - c.eckert ( at ) hzdr.de
-
-  Permission is hereby granted, free of charge, to any person obtaining a copy
-  of this software and associated documentation files (the "Software"), to deal
-  in the Software without restriction, including without limitation the rights
-  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-  copies of the Software, and to permit persons to whom the Software is
-  furnished to do so, subject to the following conditions:
-
-  The above copyright notice and this permission notice shall be included in
-  all copies or substantial portions of the Software.
-
-  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-  THE SOFTWARE.
-*/
-
-#pragma once
-
-#include <boost/cstdint.hpp>
-#include <boost/mpl/bool.hpp>
-
-#include "OldMalloc.hpp"
-
-namespace mallocMC{
-namespace CreationPolicies{
-
-  class OldMalloc
-  {
-    typedef boost::uint32_t uint32;
-
-    public:
-    typedef boost::mpl::bool_<false> providesAvailableSlots;
-
-    __device__ void* create(uint32 bytes)
-    {
-      return ::malloc(static_cast<size_t>(bytes));
-    }
-
-    __device__ void destroy(void* mem)
-    {
-      free(mem);
-    }
-
-    __device__ bool isOOM(void* p, size_t s){
-      return s && (p == NULL);
-    }
-
-    template < typename T >
-    static void* initHeap(T* dAlloc, void*, size_t){
-      return dAlloc;
-    }
-
-    static std::string classname(){
-      return "OldMalloc";
-    }
-
-  };
-
-} //namespace CreationPolicies
-} //namespace mallocMC
diff --git a/thirdParty/mallocMC/src/include/mallocMC/creationPolicies/Scatter.hpp b/thirdParty/mallocMC/src/include/mallocMC/creationPolicies/Scatter.hpp
index 499bd73ee4..42992da7f8 100644
--- a/thirdParty/mallocMC/src/include/mallocMC/creationPolicies/Scatter.hpp
+++ b/thirdParty/mallocMC/src/include/mallocMC/creationPolicies/Scatter.hpp
@@ -4,8 +4,8 @@
 
   Copyright (C) 2012 Institute for Computer Graphics and Vision,
                      Graz University of Technology
-  Copyright (C) 2014 Institute of Radiation Physics,
-                     Helmholtz-Zentrum Dresden - Rossendorf
+  Copyright (C) 2014-2016 Institute of Radiation Physics,
+                          Helmholtz-Zentrum Dresden - Rossendorf
 
   Author(s):  Markus Steinberger - steinberger ( at ) icg.tugraz.at
               Rene Widera - r.widera ( at ) hzdr.de
@@ -33,52 +33,1200 @@
 
 #pragma once
 
-#include <boost/mpl/bool.hpp>
-#include <boost/mpl/int.hpp>
-
-namespace mallocMC{
-namespace CreationPolicies{
-namespace ScatterConf{
-  struct DefaultScatterConfig{
-    typedef boost::mpl::int_<4096>  pagesize;
-    typedef boost::mpl::int_<8>     accessblocks;
-    typedef boost::mpl::int_<16>    regionsize;
-    typedef boost::mpl::int_<2>     wastefactor;
-    typedef boost::mpl::bool_<false> resetfreedpages;
-  };
-
-  struct DefaultScatterHashingParams{
-    typedef boost::mpl::int_<38183> hashingK;
-    typedef boost::mpl::int_<17497> hashingDistMP;
-    typedef boost::mpl::int_<1>     hashingDistWP;
-    typedef boost::mpl::int_<1>     hashingDistWPRel;
-  };  
-}
-
-  /**
-   * @brief fast memory allocation based on ScatterAlloc
-   *
-   * This CreationPolicy implements a fast memory allocator that trades speed
-   * for fragmentation of memory. This is based on the memory allocator
-   * "ScatterAlloc"
-   * (http://ieeexplore.ieee.org/xpl/articleDetails.jsp?arnumber=6339604), and
-   * is extended to report free memory slots of a given size (both on host and
-   * accelerator).
-   * To work properly, this policy class requires a pre-allocated heap on the
-   * accelerator and works only with Nvidia CUDA capable accelerators that have
-   * at least compute capability 2.0.
-   *
-   * @tparam T_Config (optional) configure the heap layout. The
-   *        default can be obtained through Scatter<>::HeapProperties
-   * @tparam T_Hashing (optional) configure the parameters for
-   *        the hashing formula. The default can be obtained through
-   *        Scatter<>::HashingProperties
-   */
-  template<
-  class T_Config = ScatterConf::DefaultScatterConfig,
-  class T_Hashing = ScatterConf::DefaultScatterHashingParams
-  >
-  class Scatter;
-
-}// namespace CreationPolicies
-}// namespace mallocMC
+#include "../mallocMC_utils.hpp"
+#include "Scatter.hpp"
+
+#include <alpaka/alpaka.hpp>
+#include <atomic>
+#include <cassert>
+#include <cstdint> /* uint32_t */
+#include <cstdio>
+#include <iostream>
+#include <stdexcept>
+#include <string>
+
+namespace mallocMC
+{
+    namespace CreationPolicies
+    {
+        namespace ScatterConf
+        {
+            struct DefaultScatterConfig // default T_Config of Scatter (heap layout)
+            { // each value is used unless the matching MALLOCMC_CP_SCATTER_* macro is defined
+                static constexpr auto pagesize = 4096; // default Scatter::pagesize; sizes Page::data in bytes
+                static constexpr auto accessblocks = 8; // default Scatter::accessblocks
+                static constexpr auto regionsize = 16; // default Scatter::regionsize
+                static constexpr auto wastefactor = 2; // default Scatter::wastefactor
+                static constexpr auto resetfreedpages = false; // default Scatter::resetfreedpages
+            };
+
+            struct DefaultScatterHashingParams // default T_Hashing of Scatter (hashing formula parameters)
+            { // each value is used unless the matching MALLOCMC_CP_SCATTER_HASHING* macro is defined
+                static constexpr auto hashingK = 38183; // default Scatter::hashingK
+                static constexpr auto hashingDistMP = 17497; // default Scatter::hashingDistMP
+                static constexpr auto hashingDistWP = 1; // default Scatter::hashingDistWP
+                static constexpr auto hashingDistWPRel = 1; // default Scatter::hashingDistWPRel
+            };
+        } // namespace ScatterConf
+
+        /**
+         * @brief fast memory allocation based on ScatterAlloc
+         *
+         * This CreationPolicy implements a fast memory allocator that trades
+         * speed for fragmentation of memory. This is based on the memory
+         * allocator "ScatterAlloc"
+         * (http://ieeexplore.ieee.org/xpl/articleDetails.jsp?arnumber=6339604),
+         * and is extended to report free memory slots of a given size (both on
+         * host and accelerator). To work properly, this policy class requires a
+         * pre-allocated heap on the accelerator and works only with Nvidia CUDA
+         * capable accelerators that have at least compute capability 2.0.
+         *
+         * @tparam T_Config (optional) configure the heap layout. The
+         *        default can be obtained through Scatter<>::HeapProperties
+         * @tparam T_Hashing (optional) configure the parameters for
+         *        the hashing formula. The default can be obtained through
+         *        Scatter<>::HashingProperties
+         */
+        template<
+            class T_Config = ScatterConf::DefaultScatterConfig,
+            class T_Hashing = ScatterConf::DefaultScatterHashingParams>
+        class Scatter
+        {
+        public:
+            using HeapProperties = T_Config;
+            using HashingProperties = T_Hashing;
+            struct Properties
+                : HeapProperties
+                , HashingProperties
+            {
+            };
+            static constexpr auto providesAvailableSlots = true;
+
+        private:
+            using uint32 = std::uint32_t;
+
+/** Allow for a hierarchical validation of parameters:
+ *
+ * shipped default-parameters (in the inherited struct) have lowest precedence.
+ * They will be overridden by a given configuration struct. However, even the
+ * given configuration struct can be overridden by compile-time command line
+ * parameters (e.g. -D MALLOCMC_CP_SCATTER_PAGESIZE 1024)
+ *
+ * default-struct < template-struct < command-line parameter
+ */
+#ifndef MALLOCMC_CP_SCATTER_PAGESIZE
+#    define MALLOCMC_CP_SCATTER_PAGESIZE (HeapProperties::pagesize)
+#endif
+            static constexpr uint32 pagesize = MALLOCMC_CP_SCATTER_PAGESIZE;
+
+#ifndef MALLOCMC_CP_SCATTER_ACCESSBLOCKS
+#    define MALLOCMC_CP_SCATTER_ACCESSBLOCKS (HeapProperties::accessblocks)
+#endif
+            static constexpr uint32 accessblocks = MALLOCMC_CP_SCATTER_ACCESSBLOCKS;
+
+#ifndef MALLOCMC_CP_SCATTER_REGIONSIZE
+#    define MALLOCMC_CP_SCATTER_REGIONSIZE (HeapProperties::regionsize)
+#endif
+            static constexpr uint32 regionsize = MALLOCMC_CP_SCATTER_REGIONSIZE;
+
+#ifndef MALLOCMC_CP_SCATTER_WASTEFACTOR
+#    define MALLOCMC_CP_SCATTER_WASTEFACTOR (HeapProperties::wastefactor)
+#endif
+            static constexpr uint32 wastefactor = MALLOCMC_CP_SCATTER_WASTEFACTOR;
+
+#ifndef MALLOCMC_CP_SCATTER_RESETFREEDPAGES
+#    define MALLOCMC_CP_SCATTER_RESETFREEDPAGES (HeapProperties::resetfreedpages)
+#endif
+            static constexpr bool resetfreedpages = MALLOCMC_CP_SCATTER_RESETFREEDPAGES;
+
+        public:
+            static constexpr uint32 _pagesize = pagesize;
+            static constexpr uint32 _accessblocks = accessblocks;
+            static constexpr uint32 _regionsize = regionsize;
+            static constexpr uint32 _wastefactor = wastefactor;
+            static constexpr bool _resetfreedpages = resetfreedpages;
+
+        private:
+#if _DEBUG || ANALYSEHEAP
+        public:
+#endif
+            static constexpr uint32 minChunkSize1 = 0x10;
+            static constexpr uint32 HierarchyThreshold = (pagesize - 2 * sizeof(uint32)) / 33;
+            static constexpr uint32 minSegmentSize = 32 * minChunkSize1 + sizeof(uint32);
+            static constexpr uint32 tmp_maxOPM
+                = minChunkSize1 > HierarchyThreshold ? 0 : (pagesize + (minSegmentSize - 1)) / minSegmentSize;
+            static constexpr uint32 maxOnPageMasks = 32 > tmp_maxOPM ? tmp_maxOPM : 32;
+
+#ifndef MALLOCMC_CP_SCATTER_HASHINGK
+#    define MALLOCMC_CP_SCATTER_HASHINGK (HashingProperties::hashingK)
+#endif
+            static constexpr uint32 hashingK = MALLOCMC_CP_SCATTER_HASHINGK;
+
+#ifndef MALLOCMC_CP_SCATTER_HASHINGDISTMP
+#    define MALLOCMC_CP_SCATTER_HASHINGDISTMP (HashingProperties::hashingDistMP)
+#endif
+            static constexpr uint32 hashingDistMP = MALLOCMC_CP_SCATTER_HASHINGDISTMP;
+
+#ifndef MALLOCMC_CP_SCATTER_HASHINGDISTWP
+#    define MALLOCMC_CP_SCATTER_HASHINGDISTWP (HashingProperties::hashingDistWP)
+#endif
+            static constexpr uint32 hashingDistWP = MALLOCMC_CP_SCATTER_HASHINGDISTWP;
+
+#ifndef MALLOCMC_CP_SCATTER_HASHINGDISTWPREL
+#    define MALLOCMC_CP_SCATTER_HASHINGDISTWPREL (HashingProperties::hashingDistWPRel)
+#endif
+            static constexpr uint32 hashingDistWPRel = MALLOCMC_CP_SCATTER_HASHINGDISTWPREL;
+
+            /**
+             * Page Table Entry struct
+             * The PTE holds basic information about each page
+             */
+            struct PTE
+            {
+                uint32 chunksize; // compared with the requested chunk size in tryUsePage
+                uint32 count; // fill level; incremented atomically in tryUsePage
+                uint32 bitmask; // bits are set with atomic OR (usespot, addChunkHierarchy)
+
+                ALPAKA_FN_ACC void init()
+                { // reset all three fields to zero
+                    chunksize = 0;
+                    count = 0;
+                    bitmask = 0;
+                }
+            };
+
+            /**
+             * Page struct
+             * The page struct is used to access the data on the page more
+             * efficiently and to clear the area on the page, which might hold
+             * bit fields later on
+             */
+            struct Page
+            {
+                char data[pagesize];
+
+                /**
+                 * The pages init method
+                 * This method initializes the region on the page which might
+                 * hold bit fields when the page is used for a small chunk size.
+                 * It takes no arguments: the last maxOnPageMasks uint32 words
+                 * of data are always set to zero
+                 */
+                ALPAKA_FN_ACC void init()
+                {
+                    // clear the entire tail of data that can hold bit fields
+                    uint32* write = (uint32*) (data + pagesize - (int) (sizeof(uint32) * maxOnPageMasks));
+                    while(write < (uint32*) (data + pagesize))
+                        *write++ = 0;
+                }
+            };
+
+            // the data used by the allocator
+
+            volatile PTE* _ptes;
+            volatile uint32* _regions;
+            Page* _page;
+            uint32 _numpages;
+            size_t _memsize;
+            uint32 _pagebasedMutex;
+            volatile uint32 _firstFreePageBased;
+            volatile uint32 _firstfreeblock;
+
+            /**
+             * randInit should create a random offset which can be used
+             * as the initial position in a bitfield
+             */
+            static ALPAKA_FN_ACC inline auto randInit() -> uint32
+            {
+                // start with the laneid offset
+                return laneid(); // no randomness at present: the value of laneid() is returned as is
+            }
+
+            /**
+             * nextspot delivers the next free spot in a bitfield.
+             * It searches for the next unset bit to the left of spot and
+             * returns its offset; if there are no unset bits to the left
+             * then it wraps around
+             * @param bitfield the bitfield to be searched
+             * @param spot the spot from which to search to the left
+             * @param spots number of bits that can be used
+             * @return next free spot in the bitfield
+             */
+            static ALPAKA_FN_ACC inline auto nextspot(uint32 bitfield, uint32 spot, uint32 spots) -> uint32
+            { // NOTE(review): callers pass spots == 32; `1 << spots` then shifts by the full int width - confirm
+                // rotate right by spot + 1 within the low `spots` bits and mask off the rest
+                bitfield = ((bitfield >> (spot + 1)) | (bitfield << (spots - (spot + 1)))) & ((1 << spots) - 1);
+                // compute the step from the current spot in the bitfield
+                const uint32 step = ffs(~bitfield);
+                // and return the new spot
+                return (spot + step) % spots;
+            }
+
+            /**
+             * onPageMasksPosition returns a pointer to the beginning of the
+             * onpagemasks inside a page.
+             * @param page the page that holds the masks
+             * @param nMasks the number of hierarchical page tables (bitfields)
+             * that are used inside this mask.
+             * @return pointer to the first address inside the page that holds
+             * metadata bitfields.
+             */
+            ALPAKA_FN_ACC inline auto onPageMasksPosition(uint32 page, uint32 nMasks) -> uint32*
+            { // the masks occupy the last nMasks uint32 words of the page's data array
+                return (uint32*) (_page[page].data + pagesize - (int) sizeof(uint32) * nMasks);
+            }
+
+            /**
+             * usespot finds one free spot in the bitfield, marks it and
+             * returns its offset
+             * @param acc accelerator passed on to the atomic operation
+             * @param bitfield pointer to the bitfield to use
+             * @param spots overall number of spots the bitfield is responsible for
+             * @return if there is a free spot it returns the spot's offset,
+             * otherwise -1
+             */
+            template<typename AlpakaAcc>
+            static ALPAKA_FN_ACC inline auto usespot(const AlpakaAcc& acc, uint32* bitfield, uint32 spots) -> int
+            {
+                // get first spot
+                uint32 spot = randInit() % spots;
+                for(;;)
+                { // atomically OR the candidate bit in; `old` is the value before the OR
+                    const uint32 mask = 1 << spot;
+                    const uint32 old = alpaka::atomicOp<alpaka::AtomicOr>(acc, bitfield, mask);
+                    if((old & mask) == 0)
+                        return spot; // the bit was clear before the OR, so this call set it
+                    // note: popc(old) == spots should be sufficient,
+                    // but if someone corrupts the memory we end up in an
+                    // endless loop in here...
+                    if(popc(old) >= spots)
+                        return -1;
+                    spot = nextspot(old, spot, spots); // retry with the next candidate derived from `old`
+                }
+            }
+
+            /**
+             * calcAdditionalChunks determines the number of chunks that are
+             * contained in the last segment of a hierarchical page
+             *
+             * The additional checks are necessary to ensure correct results for
+             * very large pages and small chunksizes
+             *
+             * @param fullsegments the number of segments that can be completely
+             * filled in a page. This may NEVER be bigger than 32!
+             * @param segmentsize the number of bytes that are contained in a
+             * completely filled segment (32 chunks)
+             * @param chunksize the chosen allocation size within the page
+             * @return the number of additional chunks that will not fit in one
+             * of the fullsegments. For any correct input, this number is
+             * smaller than 32
+             */
+            template<typename AlpakaAcc>
+            static ALPAKA_FN_ACC inline auto calcAdditionalChunks(
+                const AlpakaAcc& acc,
+                uint32 fullsegments,
+                uint32 segmentsize,
+                uint32 chunksize) -> uint32
+            {
+                if(fullsegments != 32) // with exactly 32 full segments the result is 0 (else branch)
+                    return alpaka::math::max(
+                               acc,
+                               0u, // NOTE(review): the other operand mixes int with uint32 segmentsize - confirm clamp
+                               (int) pagesize - (int) fullsegments * segmentsize - (int) sizeof(uint32))
+                        / chunksize; // bytes left after the full segments and one uint32, in whole chunks
+                else
+                    return 0;
+            }
+
+            /**
+             * addChunkHierarchy finds a free chunk on a page which uses bit
+             * fields on the page
+             * @param chunksize the chunksize of the page
+             * @param fullsegments the number of full segments on the page (a 32
+             * bits on the page)
+             * @param additional_chunks the number of additional chunks in last
+             * segment (less than 32 bits on the page)
+             * @param page the page to use
+             * @return pointer to a free chunk on the page, 0 if we were unable
+             * to obtain a free chunk
+             */
+            template<typename AlpakaAcc>
+            ALPAKA_FN_ACC inline auto addChunkHierarchy(
+                const AlpakaAcc& acc,
+                uint32 chunksize,
+                uint32 fullsegments,
+                uint32 additional_chunks,
+                uint32 page) -> void*
+            {
+                const uint32 segments = fullsegments + (additional_chunks > 0 ? 1 : 0); // partial segment counts
+                uint32 spot = randInit() % segments;
+                const uint32 mask = _ptes[page].bitmask; // read once here; not refreshed inside the loop
+                if((mask & (1 << spot)) != 0)
+                    spot = nextspot(mask, spot, segments); // starting segment already flagged: move on
+                const uint32 tries = segments - popc(mask); // presumably popc counts set bits - confirm
+                uint32* onpagemasks = onPageMasksPosition(page, segments);
+                for(uint32 i = 0; i < tries; ++i)
+                { // a full segment offers 32 spots, the last partial one additional_chunks
+                    const int hspot = usespot(acc, &onpagemasks[spot], spot < fullsegments ? 32 : additional_chunks);
+                    if(hspot != -1)
+                        return _page[page].data + (32 * spot + hspot) * chunksize;
+                    alpaka::atomicOp<alpaka::AtomicOr>(acc, (uint32*) &_ptes[page].bitmask, 1u << spot);
+                    spot = nextspot(mask, spot, segments); // usespot found nothing: segment flagged above, try next
+                }
+                return 0;
+            }
+
+            /**
+             * addChunkNoHierarchy finds a free chunk on a page which uses the
+             * bit fields of the pte only
+             * @param chunksize the chunksize of the page
+             * @param page the page to use
+             * @param spots the number of chunks which fit on the page
+             * @return pointer to a free chunk on the page, 0 if we were unable
+             * to obtain a free chunk
+             */
+            template<typename AlpakaAcc>
+            ALPAKA_FN_ACC inline auto addChunkNoHierarchy(
+                const AlpakaAcc& acc,
+                uint32 chunksize,
+                uint32 page,
+                uint32 spots) -> void*
+            { // the page table entry's own bitmask is the only bitfield consulted here
+                const int spot = usespot(acc, (uint32*) &_ptes[page].bitmask, spots);
+                if(spot == -1)
+                    return 0; // that should be impossible :)
+                return _page[page].data + spot * chunksize; // chunk address = page start + spot * chunksize
+            }
+
+            /**
+             * tryUsePage tries to use the page for the allocation request.
+             *
+             * The page's fill counter is incremented up front to reserve a
+             * slot; the reservation is given back again if no chunk could be
+             * obtained. Pages with chunksize <= HierarchyThreshold are handled
+             * by addChunkHierarchy, all others by addChunkNoHierarchy.
+             * @param acc the alpaka accelerator used for the atomic operations
+             * @param page the page to use
+             * @param chunksize the chunksize of the page
+             * @return pointer to a free chunk on the page, nullptr if we were
+             * unable to obtain a free chunk
+             */
+            template<typename AlpakaAcc>
+            ALPAKA_FN_ACC inline auto tryUsePage(const AlpakaAcc& acc, uint32 page, uint32 chunksize) -> void*
+            {
+                void* chunk_ptr = nullptr;
+
+                // increase the fill level; the value returned by the atomic is the fill level to test
+                const uint32 filllevel
+                    = alpaka::atomicOp<alpaka::AtomicAdd>(acc, (uint32*) &(_ptes[page].count), 1u);
+                // recheck the chunk size (the page may have been freed and
+                // reformatted in the meantime); only needed with resetfreedpages
+                if(!resetfreedpages || _ptes[page].chunksize == chunksize)
+                {
+                    if(chunksize <= HierarchyThreshold)
+                    {
+                        // more chunks than can be covered by the pte's single
+                        // bit field can be used, so segments with on-page masks are needed
+                        const uint32 segmentsize = chunksize * 32 + sizeof(uint32);
+                        const uint32 fullsegments = alpaka::math::min(acc, 32u, pagesize / segmentsize);
+                        const uint32 additional_chunks
+                            = calcAdditionalChunks(acc, fullsegments, segmentsize, chunksize);
+                        if(filllevel < fullsegments * 32 + additional_chunks)
+                            chunk_ptr = addChunkHierarchy(acc, chunksize, fullsegments, additional_chunks, page);
+                    }
+                    else
+                    {
+                        const uint32 chunksinpage = alpaka::math::min(acc, pagesize / chunksize, 32u);
+                        if(filllevel < chunksinpage)
+                            chunk_ptr = addChunkNoHierarchy(acc, chunksize, page, chunksinpage);
+                    }
+                }
+
+                // this one is full/not usable: give the reserved slot back
+                if(chunk_ptr == nullptr)
+                    alpaka::atomicOp<alpaka::AtomicSub>(acc, (uint32*) &(_ptes[page].count), 1u);
+
+                return chunk_ptr;
+            }
+
+            /**
+             * allocChunked tries to allocate the demanded number of bytes on
+             * one of the pages.
+             *
+             * The search starts at a page derived from bytes, smid() and
+             * warpid() inside the block given by _firstfreeblock. Regions
+             * whose fill counter has reached checklevel are skipped. A page is
+             * used if its chunksize lies in [bytes, maxchunksize]; a page with
+             * chunksize 0 is claimed with an atomic compare-and-swap. If the
+             * first pass (checklevel = 3/4 of regionsize) finds nothing, a
+             * second pass starts at page 0 with checklevel = regionsize + 1.
+             * @param acc the alpaka accelerator used for the atomic operations
+             * @param bytes the number of bytes to allocate
+             * @return pointer to a free chunk on a page, 0 if we were unable to
+             * obtain a free chunk
+             */
+            template<typename AlpakaAcc>
+            ALPAKA_FN_ACC auto allocChunked(const AlpakaAcc& acc, uint32 bytes) -> void*
+            {
+                const uint32 pagesperblock = _numpages / accessblocks;
+                const uint32 reloff = warpSize * bytes / pagesize;
+                const uint32 startpage = (bytes * hashingK + hashingDistMP * smid()
+                                          + (hashingDistWP + hashingDistWPRel * reloff) * warpid())
+                    % pagesperblock;
+                const uint32 maxchunksize = alpaka::math::min(acc, +pagesize, wastefactor * bytes);
+                uint32 startblock = _firstfreeblock;
+                uint32 ptetry = startpage + startblock * pagesperblock;
+                uint32 checklevel = regionsize * 3 / 4;
+                for(uint32 finder = 0; finder < 2; ++finder)
+                {
+                    for(uint32 b = startblock; b < accessblocks; ++b)
+                    {
+                        while(ptetry < (b + 1) * pagesperblock)
+                        {
+                            const uint32 region = ptetry / regionsize;
+                            const uint32 regionfilllevel = _regions[region];
+                            if(regionfilllevel < checklevel)
+                            {
+                                for(; ptetry < (region + 1) * regionsize; ++ptetry)
+                                {
+                                    const uint32 chunksize = _ptes[ptetry].chunksize;
+                                    if(chunksize >= bytes && chunksize <= maxchunksize)
+                                    {
+                                        void* res = tryUsePage(acc, ptetry, chunksize);
+                                        if(res != 0)
+                                            return res;
+                                    }
+                                    else if(chunksize == 0)
+                                    {
+                                        // unformatted page: try to claim it for this request
+                                        // (bytes is already padded)
+                                        const uint32 new_chunksize = alpaka::math::max(acc, bytes, +minChunkSize1);
+                                        const uint32 beforechunksize
+                                            = alpaka::atomicOp<alpaka::AtomicCas>(
+                                                acc,
+                                                (uint32*) &_ptes[ptetry].chunksize,
+                                                0u,
+                                                new_chunksize);
+                                        if(beforechunksize == 0)
+                                        {
+                                            void* res = tryUsePage(acc, ptetry, new_chunksize);
+                                            if(res != 0)
+                                                return res;
+                                        }
+                                        else if(beforechunksize >= bytes && beforechunksize <= maxchunksize)
+                                        {
+                                            // someone else acquired the page,
+                                            // but we can also use it
+                                            void* res = tryUsePage(acc, ptetry, beforechunksize);
+                                            if(res != 0)
+                                                return res;
+                                        }
+                                    }
+                                }
+                                // could not alloc in region, record that by raising its fill counter
+                                if(regionfilllevel + 1 <= regionsize)
+                                    alpaka::atomicOp<alpaka::AtomicMax>(
+                                        acc,
+                                        (uint32*) (_regions + region),
+                                        regionfilllevel + 1);
+                            }
+                            else
+                                ptetry += regionsize;
+                            // ptetry = (region+1)*regionsize;
+                        }
+                        // randomize the thread writing the info
+                        // if(warpid() + laneid() == 0)
+                        if(b > startblock)
+                            _firstfreeblock = b; // plain (non-atomic) write of the search hint
+                    }
+
+                    // we are really full :/ so let's search every page for a
+                    // spot! (checklevel above regionsize disables region skipping)
+                    startblock = 0;
+                    checklevel = regionsize + 1;
+                    ptetry = 0;
+                }
+                return 0;
+            }
+
+            /**
+             * deallocChunked frees the chunk on the page and updates all data
+             * accordingly.
+             *
+             * The chunk's bit is cleared (in the on-page mask and the pte bit
+             * field for hierarchical pages, in the pte bit field only
+             * otherwise) and the page's fill counter is decremented. With
+             * resetfreedpages enabled, the thread that releases the last chunk
+             * tries to lock the page, reinitializes it and clears its
+             * chunksize. When the fill counter passes half of the page's
+             * capacity, the region counter is reset and _firstfreeblock is
+             * lowered.
+             * @param acc the alpaka accelerator used for the atomic operations
+             * @param mem pointer to the chunk
+             * @param page the page the chunk is on
+             * @param chunksize the chunksize used for the page
+             */
+            template<typename AlpakaAcc>
+            ALPAKA_FN_ACC void deallocChunked(const AlpakaAcc& acc, void* mem, uint32 page, uint32 chunksize)
+            {
+                const auto inpage_offset = static_cast<uint32>((char*) mem - _page[page].data);
+                if(chunksize <= HierarchyThreshold)
+                {
+                    // one more level in hierarchy
+                    const uint32 segmentsize = chunksize * 32 + sizeof(uint32);
+                    const uint32 fullsegments = alpaka::math::min(acc, 32u, pagesize / segmentsize);
+                    const uint32 additional_chunks = calcAdditionalChunks(acc, fullsegments, segmentsize, chunksize);
+                    const uint32 segment = inpage_offset / (chunksize * 32);
+                    const uint32 withinsegment = (inpage_offset - segment * (chunksize * 32)) / chunksize;
+                    // mark it as free; the previous mask value is not needed
+                    const uint32 nMasks = fullsegments + (additional_chunks > 0 ? 1 : 0);
+                    uint32* onpagemasks = onPageMasksPosition(page, nMasks);
+                    alpaka::atomicOp<alpaka::AtomicAnd>(acc, &onpagemasks[segment], ~(1u << withinsegment));
+
+                    // always do this, since it might fail due to a
+                    // race-condition with addChunkHierarchy
+                    alpaka::atomicOp<alpaka::AtomicAnd>(
+                        acc,
+                        (uint32*) &_ptes[page].bitmask,
+                        ~(1u << segment));
+                }
+                else
+                {
+                    // without hierarchy the pte bit field tracks the chunks directly
+                    const uint32 segment = inpage_offset / chunksize;
+                    alpaka::atomicOp<alpaka::AtomicAnd>(
+                        acc,
+                        (uint32*) &_ptes[page].bitmask,
+                        ~(1u << segment));
+                }
+                // reduce the fill level; the value returned by the atomic is
+                // compared against 1 below to detect the last chunk
+                const uint32 oldfilllevel
+                    = alpaka::atomicOp<alpaka::AtomicSub>(acc, (uint32*) &_ptes[page].count, 1u);
+
+                if(resetfreedpages)
+                {
+                    if(oldfilllevel == 1)
+                    {
+                        // this page now got free!
+                        // -> try to lock it by swapping the count from 0 to pagesize
+                        const uint32 old = alpaka::atomicOp<alpaka::AtomicCas>(
+                            acc,
+                            (uint32*) &_ptes[page].count,
+                            0u,
+                            +pagesize);
+                        if(old == 0)
+                        {
+                            // clean the bits for the hierarchy
+                            _page[page].init();
+                            // remove chunk information
+                            _ptes[page].chunksize = 0;
+
+                            threadfenceDevice(acc);
+
+                            // unlock it
+                            alpaka::atomicOp<alpaka::AtomicSub>(
+                                acc,
+                                (uint32*) &_ptes[page].count,
+                                +pagesize);
+                        }
+                    }
+                }
+
+                // meta information counters ... should not be changed by too
+                // many threads, so only act when the fill level passes one
+                // specific value
+                if(oldfilllevel == pagesize / 2 / chunksize)
+                {
+                    const uint32 region = page / regionsize;
+                    _regions[region] = 0;
+                    const uint32 block = region * regionsize * accessblocks / _numpages;
+                    if(warpid() + laneid() == 0)
+                        alpaka::atomicOp<alpaka::AtomicMin>(acc, (uint32*) &_firstfreeblock, block);
+                }
+            }
+
+            /**
+             * markpages marks a fixed number of pages as used.
+             *
+             * Every page in [startpage, startpage + pages) is claimed by
+             * atomically swapping its chunksize from 0 to bytes. If one page
+             * is already taken, the pages claimed so far are released again.
+             * @param acc the alpaka accelerator used for the atomic operations
+             * @param startpage first page to mark
+             * @param pages number of pages to mark
+             * @param bytes number of overall bytes to mark pages for
+             * @return true on success, false if one of the pages is not free
+             */
+            template<typename AlpakaAcc>
+            ALPAKA_FN_ACC auto markpages(const AlpakaAcc& acc, uint32 startpage, uint32 pages, uint32 bytes) -> bool
+            {
+                const uint32 endpage = startpage + pages;
+                // unsigned cursor instead of a signed -1 sentinel: page indices
+                // are uint32 and must not be mixed with int in comparisons
+                uint32 trypage = startpage;
+                for(; trypage < endpage; ++trypage)
+                {
+                    const uint32 old = alpaka::atomicOp<alpaka::AtomicCas>(
+                        acc,
+                        (uint32*) &_ptes[trypage].chunksize,
+                        0u,
+                        bytes);
+                    // somebody else owns this page, stop claiming
+                    if(old != 0)
+                        break;
+                }
+                // the loop ran to completion, every page is ours
+                if(trypage == endpage)
+                    return true;
+                // roll back the pages claimed before the conflict
+                for(uint32 claimed = startpage; claimed < trypage; ++claimed)
+                    alpaka::atomicOp<alpaka::AtomicCas>(
+                        acc,
+                        (uint32*) &_ptes[claimed].chunksize,
+                        bytes,
+                        0u);
+                return false;
+            }
+
+            /**
+             * allocPageBasedSingleRegion tries to allocate the demanded number
+             * of bytes on a contiguous sequence of pages.
+             *
+             * The pages are scanned downwards from startpage to endpage (both
+             * inclusive). As soon as enough consecutive pages with chunksize 0
+             * have been seen, markpages is used to claim them.
+             * @param acc the alpaka accelerator used for the atomic operations
+             * @param startpage highest page index to examine (scanned first)
+             * @param endpage lowest page index to examine (scanned last)
+             * @param bytes number of overall bytes to mark pages for
+             * @return pointer to the first page to use, nullptr if we were
+             * unable to use all the requested pages
+             */
+            template<typename AlpakaAcc>
+            ALPAKA_FN_ACC auto allocPageBasedSingleRegion(
+                const AlpakaAcc& acc,
+                uint32 startpage,
+                uint32 endpage,
+                uint32 bytes) -> void*
+            {
+                const uint32 pagestoalloc = divup(bytes, pagesize);
+                uint32 freecount = 0;
+                // becomes true once an occupied page was passed; afterwards the
+                // search hint is no longer moved
+                bool left_free = false;
+                // the index is decremented inside the body so that page 0 can be
+                // visited without an unsigned underflow in the loop condition
+                for(uint32 search_page = startpage + 1; search_page > endpage;)
+                {
+                    --search_page;
+                    if(_ptes[search_page].chunksize == 0)
+                    {
+                        if(++freecount == pagestoalloc)
+                        {
+                            // try filling it up
+                            if(markpages(acc, search_page, pagestoalloc, bytes))
+                            {
+                                // mark that we filled up everything up to here
+                                if(!left_free)
+                                {
+                                    // clamp at page 0: search_page - 1 would wrap
+                                    // to 0xFFFFFFFF and leave the hint pointing
+                                    // outside of the page range
+                                    const uint32 nexthint = search_page > 0 ? search_page - 1 : 0u;
+                                    alpaka::atomicOp<alpaka::AtomicCas>(
+                                        acc,
+                                        (uint32*) &_firstFreePageBased,
+                                        startpage,
+                                        nexthint);
+                                }
+                                return _page[search_page].data;
+                            }
+                        }
+                    }
+                    else
+                    {
+                        left_free = true;
+                        freecount = 0;
+                    }
+                }
+                return nullptr;
+            }
+
+            /**
+             * allocPageBasedSingle tries to allocate the demanded number of
+             * bytes on a contiguous sequence of pages.
+             *
+             * The search is guarded by the _pagebasedMutex spin lock. It first
+             * scans from the _firstFreePageBased hint down to page 0 and, if
+             * that fails, from the last page down to the hint.
+             * @param acc the alpaka accelerator used for the atomic operations
+             * @param bytes number of overall bytes to mark pages for
+             * @return pointer to the first page to use, 0 if we were unable to
+             * use all the requested pages
+             * @pre only a single thread of a warp is allowed to call the
+             * function concurrently
+             */
+            template<typename AlpakaAcc>
+            ALPAKA_FN_ACC auto allocPageBasedSingle(const AlpakaAcc& acc, uint32 bytes) -> void*
+            {
+                // acquire mutex (spin until the exchange returns 0)
+                while(alpaka::atomicOp<alpaka::AtomicExch>(acc, &_pagebasedMutex, 1u) != 0)
+                    ;
+                // search for free spot from the back
+                const uint32 spage = _firstFreePageBased;
+                void* res = allocPageBasedSingleRegion(acc, spage, 0, bytes);
+                if(res == 0)
+                    // also check the rest of the pages; the start page is
+                    // inspected by the callee, so it has to be the last valid
+                    // index (_numpages - 1) and not _numpages itself
+                    res = allocPageBasedSingleRegion(acc, _numpages - 1, spage, bytes);
+
+                // free mutex
+                alpaka::atomicOp<alpaka::AtomicExch>(acc, &_pagebasedMutex, 0u);
+                return res;
+            }
+            /**
+             * allocPageBased tries to allocate the demanded number of bytes on
+             * a contiguous sequence of pages.
+             *
+             * Each thread computes its rank among the lanes reported by
+             * activemask() and calls allocPageBasedSingle in the loop iteration
+             * that matches this rank.
+             * @param acc the alpaka accelerator used for the atomic operations
+             * @param bytes number of overall bytes to mark pages for
+             * @return pointer to the first page to use, 0 if we were unable to
+             * use all the requested pages
+             */
+            template<typename AlpakaAcc>
+            ALPAKA_FN_ACC auto allocPageBased(const AlpakaAcc& acc, uint32 bytes) -> void*
+            {
+                // this is rather slow, but we don't expect that to happen often
+                // anyway
+
+                // only one thread per warp can acquire the mutex
+                void* res = 0;
+                // based on the alpaka backend the lanemask type can be 64bit
+                const auto mask = activemask();
+                const uint32_t num = popc(mask);
+                // based on the alpaka backend the lanemask type can be 64bit
+                const auto lanemask = lanemask_lt();
+                const uint32_t local_id = popc(lanemask & mask); // rank of this lane among the active ones
+                for(unsigned int active = 0; active < num; ++active)
+                    if(active == local_id)
+                        res = allocPageBasedSingle(acc, bytes);
+                return res;
+            }
+
+            /**
+             * deallocPageBased releases an allocation that occupies a
+             * sequence of whole pages
+             * @param acc the alpaka accelerator used for the atomic operations
+             * @param mem pointer to the first page (not read by this function)
+             * @param page the index of the first page
+             * @param bytes the number of bytes to be freed
+             */
+            template<typename AlpakaAcc>
+            ALPAKA_FN_ACC void deallocPageBased(const AlpakaAcc& acc, void* mem, uint32 page, uint32 bytes)
+            {
+                const uint32 pageCount = divup(bytes, pagesize);
+                const uint32 pastLastPage = page + pageCount;
+
+                // bring the page contents back into their initial state
+                for(uint32 idx = page; idx < pastLastPage; ++idx)
+                {
+                    _page[idx].init();
+                }
+
+                threadfenceDevice(acc);
+
+                // hand the pages back: swap their chunksize from bytes to 0
+                for(uint32 idx = page; idx < pastLastPage; ++idx)
+                {
+                    alpaka::atomicOp<alpaka::AtomicCas>(acc, (uint32*) &_ptes[idx].chunksize, bytes, 0u);
+                }
+
+                // raise the search hint to the last freed page if it is lower
+                alpaka::atomicOp<alpaka::AtomicMax>(acc, (uint32*) &_firstFreePageBased, pastLastPage - 1);
+            }
+
+        public:
+            /**
+             * create serves an allocation request from the heap. Requests
+             * smaller than a page are placed in a chunk, larger ones receive a
+             * range of pages. Coalescing has to be done before by another
+             * policy.
+             * @param acc the alpaka accelerator used for the atomic operations
+             * @param bytes number of bytes to allocate
+             * @return pointer to the allocated memory, a null pointer for
+             * empty requests
+             */
+            template<typename AlpakaAcc>
+            ALPAKA_FN_ACC auto create(const AlpakaAcc& acc, uint32 bytes) -> void*
+            {
+                if(bytes == 0)
+                    return nullptr;
+
+                // padding of bytes is taken care of in the alignment policy
+                const bool fitsIntoChunk = bytes < pagesize;
+                return fitsIntoChunk ? allocChunked(acc, bytes) : allocPageBased(acc, bytes);
+            }
+
+            /**
+             * destroy frees the memory regions previously allocated via create.
+             *
+             * The page is derived from the pointer and its chunksize decides
+             * whether deallocChunked or deallocPageBased is used. If the
+             * pointer does not sit at the start of a chunk, the first word of
+             * that chunk is treated as a counter: it is decremented and the
+             * chunk is only released when the value returned by the atomic is
+             * 1.
+             *
+             * NOTE(review): inpage_offset is divided by the page's chunksize;
+             * a pointer into a page with chunksize 0 looks like it would divide
+             * by zero - confirm that callers never pass such a pointer.
+             * @param acc the alpaka accelerator used for the atomic operations
+             * @param mem pointer to the memory region to free
+             */
+            template<typename AlpakaAcc>
+            ALPAKA_FN_ACC void destroy(const AlpakaAcc& acc, void* mem)
+            {
+                if(mem == 0)
+                    return;
+                // let's see which page we are on
+                const auto page = static_cast<uint32>(((char*) mem - (char*) _page) / pagesize);
+                const uint32 chunksize = _ptes[page].chunksize;
+
+                // is the pointer the beginning of a chunk?
+                const auto inpage_offset = static_cast<uint32>((char*) mem - _page[page].data);
+                const uint32 block = inpage_offset / chunksize;
+                const uint32 inblockoffset = inpage_offset - block * chunksize;
+                if(inblockoffset != 0)
+                {
+                    uint32* counter = (uint32*) (_page[page].data + block * chunksize);
+                    // coalesced mem free: the chunk starts with a shared counter
+
+                    const uint32 old = alpaka::atomicOp<alpaka::AtomicSub>(acc, counter, 1u);
+                    if(old != 1)
+                        return;
+                    mem = (void*) counter; // free the whole chunk from its start
+                }
+
+                if(chunksize < pagesize)
+                    deallocChunked(acc, mem, page, chunksize);
+                else
+                    deallocPageBased(acc, mem, page, chunksize);
+            }
+
+            /**
+             * init inits the heap data structures
+             * the init method must be called before the heap can be used. the
+             * method can be called with an arbitrary number of threads, which
+             * will increase the inits efficiency
+             *
+             * The memory is split into three consecutive arrays: the pages,
+             * followed by one page table entry per page, followed by one
+             * uint32 fill counter per region. The number of regions is derived
+             * from memsize and reduced by one if the layout would exceed the
+             * given memory. The thread with linid 0 publishes the resulting
+             * pointers and counters in the member variables.
+             * @param acc the alpaka accelerator, used to query the thread
+             * index and the work division
+             * @param memory pointer to the memory used for the heap
+             * @param memsize size of the memory in bytes
+             */
+            template<typename AlpakaAcc>
+            ALPAKA_FN_ACC void initDeviceFunction(const AlpakaAcc& acc, void* memory, size_t memsize)
+            {
+                const auto linid = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc).sum();
+                const auto totalThreads = alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc).prod();
+
+                uint32 numregions = ((unsigned long long) memsize)
+                    / (((unsigned long long) regionsize) * (sizeof(PTE) + pagesize) + sizeof(uint32));
+
+                uint32 numpages = numregions * regionsize;
+                // pointer is copied (copy is called page)
+                Page* page = (Page*) memory;
+                // sec check for alignment
+                // copy is checked
+                // PointerEquivalent alignmentstatus = ((PointerEquivalent)page)
+                // & (16 -1); if(alignmentstatus != 0)
+                //{
+                //  if(linid == 0){
+                //    printf("c Before:\n");
+                //    printf("c dataAlignment:   %d\n",16);
+                //    printf("c Alignmentstatus: %d\n",alignmentstatus);
+                //    printf("c size_t memsize   %llu byte\n", memsize);
+                //    printf("c void *memory     %p\n", page);
+                //  }
+                //  //copy is adjusted, potentially pointer to higher address
+                //  now. page =(Page*)(((PointerEquivalent)page) + 16 -
+                //  alignmentstatus); if(linid == 0) printf("c Heap Warning:
+                //  memory to use not 16 byte aligned...\n");
+                //}
+                PTE* ptes = (PTE*) (page + numpages); // page table follows the pages
+                uint32* regions = (uint32*) (ptes + numpages); // region counters follow the page table
+                // sec check for mem size
+                // this check refers to the original memory-pointer, which was
+                // not adjusted!
+                if((char*) (regions + numregions) > (((char*) memory) + memsize))
+                {
+                    --numregions;
+                    numpages = alpaka::math::min(acc, numregions * regionsize, numpages);
+                    if(linid == 0)
+                        printf("c Heap Warning: needed to reduce number of "
+                               "regions to stay within memory limit\n");
+                }
+                // if(linid == 0) printf("Heap info: wasting %d
+                // bytes\n",(((POINTEREQUIVALENT)memory) + memsize) -
+                // (POINTEREQUIVALENT)(regions + numregions));
+
+                // if(linid == 0 && alignmentstatus != 0){
+                //  printf("c Was shrinked automatically to:\n");
+                //  printf("c size_t memsize   %llu byte\n", memsize);
+                //  printf("c void *memory     %p\n", page);
+                //}
+
+                for(uint32 i = linid; i < numpages; i += totalThreads) // threads share the work in strides
+                {
+                    ptes[i].init();
+                    page[i].init();
+                }
+                for(uint32 i = linid; i < numregions; i += totalThreads)
+                    regions[i] = 0;
+
+                if(linid == 0)
+                {
+                    _memsize = memsize;
+                    _numpages = numpages;
+                    _ptes = (volatile PTE*) ptes;
+                    _page = page;
+                    _regions = regions;
+                    _firstfreeblock = 0;
+                    _pagebasedMutex = 0;
+                    _firstFreePageBased = numpages - 1; // page-based search starts at the last page
+
+                    if((char*) &_page[numpages] > (char*) memory + memsize)
+                        printf("error in heap alloc: numpages too high\n");
+                }
+            }
+
+            /**
+             * isOOM tells whether the outcome of an allocation request
+             * indicates that the heap ran out of memory
+             * @param p the pointer returned for the request
+             * @param s the number of bytes that were requested
+             * @return true if a non-empty request returned a null pointer
+             */
+            static ALPAKA_FN_ACC auto isOOM(void* p, size_t s) -> bool
+            {
+                // an empty request never counts as out-of-memory
+                if(s == 0)
+                    return false;
+                return p == nullptr;
+            }
+
+            /**
+             * initHeap enqueues a kernel which stores the pool pointer in the
+             * device allocator and runs initDeviceFunction on it
+             * @param dev the alpaka device (not read by this function)
+             * @param queue the alpaka queue the init kernel is enqueued into
+             * @param heap pointer to the device allocator to initialize
+             * @param pool pointer to the memory used for the heap
+             * @param memsize size of the pool in bytes
+             * @throws std::invalid_argument if pool is a null pointer while
+             * memsize is not 0
+             */
+            template<typename AlpakaAcc, typename AlpakaDevice, typename AlpakaQueue, typename T_DeviceAllocator>
+            static void initHeap(
+                AlpakaDevice& dev,
+                AlpakaQueue& queue,
+                T_DeviceAllocator* heap,
+                void* pool,
+                size_t memsize)
+            {
+                const bool poolMissing = (pool == nullptr) && (memsize != 0);
+                if(poolMissing)
+                    throw std::invalid_argument("Scatter policy cannot use nullptr for non-empty "
+                                                "memory pools. "
+                                                "Maybe you are using an incompatible ReservePoolPolicy "
+                                                "or AlignmentPolicy.");
+
+                using Dim = typename alpaka::traits::DimType<AlpakaAcc>::type;
+                using Idx = typename alpaka::traits::IdxType<AlpakaAcc>::type;
+                using VecType = alpaka::Vec<Dim, Idx>;
+
+                // Dim may be any dimension, but the work division is 1D: only
+                // the last component of the block extent differs from 1
+                auto blockThreads = VecType::ones();
+                blockThreads[Dim::value - 1] = 256u;
+                const alpaka::WorkDivMembers<Dim, Idx> workDiv{VecType::ones(), blockThreads, VecType::ones()};
+
+                auto setupKernel = [] ALPAKA_FN_ACC(
+                                       const AlpakaAcc& kernelAcc,
+                                       T_DeviceAllocator* devHeap,
+                                       void* devPool,
+                                       size_t poolBytes) {
+                    devHeap->pool = devPool;
+                    devHeap->initDeviceFunction(kernelAcc, devPool, poolBytes);
+                };
+
+                alpaka::enqueue(
+                    queue,
+                    alpaka::createTaskKernel<AlpakaAcc>(workDiv, setupKernel, heap, pool, memsize));
+            }
+
+            /** counts how many elements of a size fit inside a given page
+             *
+             * Derives the total number of chunks a page formatted with
+             * chunksize can hold, with or without hierarchy, in the same way
+             * the allocation path of CreationPolicies::Scatter does, and
+             * subtracts the fill counter stored in the page table entry.
+             *
+             * @param acc the alpaka accelerator used for the math functions
+             * @param page the number of the page to examine. The page needs to
+             * be formatted with a chunksize and potentially a hierarchy.
+             * @param chunksize the size of element that should be placed inside
+             * the page. This size must be appropriate to the formatting of the
+             *        page.
+             * @return the page's capacity in chunks minus its fill counter
+             */
+            template<typename AlpakaAcc>
+            ALPAKA_FN_ACC auto countFreeChunksInPage(const AlpakaAcc& acc, uint32 page, uint32 chunksize) -> unsigned
+            {
+                const uint32 usedChunks = _ptes[page].count;
+                uint32 capacity = 0;
+                if(chunksize > HierarchyThreshold)
+                {
+                    // without hierarchy, there can not be more than 32 chunks
+                    capacity = alpaka::math::min(acc, pagesize / chunksize, 32u);
+                }
+                else
+                {
+                    // each segment holds 32 2nd-level chunks plus its mask
+                    const uint32 segmentsize = chunksize * 32 + sizeof(uint32);
+                    // there might be space for more than 32 segments, but only
+                    // 32 of them are counted
+                    const uint32 fullsegments = alpaka::math::min(acc, 32u, pagesize / segmentsize);
+                    const uint32 additional_chunks = calcAdditionalChunks(acc, fullsegments, segmentsize, chunksize);
+                    capacity = fullsegments * 32 + additional_chunks;
+                }
+                return capacity - usedChunks;
+            }
+
+            /** Counts the number of available slots inside the heap
+             *
+             * Searches the heap for all possible locations of an element with
+             * size slotSize. The used traversal algorithms are similar to the
+             * allocation strategy of CreationPolicies::Scatter, to ensure
+             * comparable results. There are 3 different algorithms, based on
+             * the size of the requested slot: 1 slot spans over multiple pages,
+             * 1 slot fits in one chunk within a page, 1 slot fits in a fraction
+             * of a chunk. Slots smaller than a page are counted page by page in
+             * a strided loop; larger slots are counted serially by gid 0 only.
+             *
+             * @param acc the alpaka accelerator
+             * @param slotSize the amount of bytes that a single slot accounts
+             * for
+             * @param gid the id of the thread. This id does not have to
+             * correspond with threadId.x, but there must be a continuous range
+             * @param stride the stride should be equal to the number of
+             * different gids (and therefore of value max(gid)+1)
+             * @return the number of slots counted by this worker; callers sum
+             * the per-worker results
+             */
+            template<typename AlpakaAcc>
+            ALPAKA_FN_ACC auto getAvailaibleSlotsDeviceFunction(
+                const AlpakaAcc& acc,
+                size_t slotSize,
+                uint32 gid,
+                uint32 stride) -> unsigned
+            {
+                unsigned slotcount = 0;
+                if(slotSize < pagesize)
+                { // multiple slots per page
+                    // loop-invariant: upper bound for the chunk size of a usable page
+                    const uint32 maxchunksize = alpaka::math::min(acc, +pagesize, wastefactor * (uint32) slotSize);
+
+                    for(uint32 currentpage = gid; currentpage < _numpages; currentpage += stride)
+                    {
+                        uint32 chunksize = _ptes[currentpage].chunksize;
+                        if(chunksize >= slotSize && chunksize <= maxchunksize)
+                        { // how many chunks left? (each chunk is big enough)
+                            slotcount += countFreeChunksInPage(acc, currentpage, chunksize);
+                        }
+                        else if(chunksize == 0)
+                        {
+                            chunksize = alpaka::math::max(
+                                acc,
+                                (uint32) slotSize,
+                                +minChunkSize1); // ensure minimum chunk size
+                            slotcount += countFreeChunksInPage(
+                                acc,
+                                currentpage,
+                                chunksize); // how many chunks fit in one page?
+                        }
+                        // pages with any other chunk size cannot serve the request
+                        // and contribute nothing
+                    }
+                }
+                else
+                { // 1 slot needs multiple pages
+                    if(gid > 0)
+                        return 0; // do this serially
+                    const uint32 pagestoalloc = divup((uint32) slotSize, pagesize);
+                    uint32 freecount = 0;
+                    for(uint32 currentpage = _numpages; currentpage > 0;)
+                    { // this already includes all superblocks
+                        --currentpage;
+                        if(_ptes[currentpage].chunksize == 0)
+                        {
+                            if(++freecount == pagestoalloc)
+                            {
+                                freecount = 0;
+                                ++slotcount;
+                            }
+                        }
+                        else
+                        { // the sequence of free pages was interrupted
+                            freecount = 0;
+                        }
+                    }
+                }
+                return slotcount;
+            }
+
+            /** Count, how many elements of size slotSize can be allocated at maximum
+             *
+             * Returns the maximum number of free slots of the indicated size for the
+             * CreationPolicy Scatter. It is not guaranteed where these slots are
+             * (regarding fragmentation), so the practically usable number of slots
+             * might be smaller. The count is computed in parallel by a kernel; speedup
+             * can possibly be increased by a higher amount of parallel workers.
+             * @param dev device on which the result buffer is allocated
+             * @param queue queue used for the memset, the kernel and the copy-back
+             * @param slotSize the size of allocatable elements to count
+             * @param heap pointer to the allocator instance, dereferenced in the kernel
+             * @return sum of the slot counts reported by all kernel threads
+             */
+        public:
+            template<typename AlpakaAcc, typename AlpakaDevice, typename AlpakaQueue, typename T_DeviceAllocator>
+            static auto getAvailableSlotsHost(
+                AlpakaDevice& dev,
+                AlpakaQueue& queue,
+                size_t const slotSize,
+                T_DeviceAllocator* heap) -> unsigned
+            {
+                auto d_slots = alpaka::allocBuf<unsigned, int>(dev, 1);
+                alpaka::memset(queue, d_slots, 0, 1);
+                // kernel: every thread counts its share of the heap and atomically adds a non-zero result to *slots
+                auto getAvailableSlotsKernel
+                    = [] ALPAKA_FN_ACC(const AlpakaAcc& acc, T_DeviceAllocator* heap, size_t slotSize, unsigned* slots)
+                    -> void {
+                    const auto gid = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc).sum();
+                    // the total number of threads in the grid serves as the stride
+                    const auto nWorker = alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc).prod();
+                    const unsigned temp = heap->getAvailaibleSlotsDeviceFunction(acc, slotSize, gid, nWorker);
+                    if(temp)
+                        alpaka::atomicOp<alpaka::AtomicAdd>(acc, slots, temp);
+                };
+
+                using Dim = typename alpaka::traits::DimType<AlpakaAcc>::type;
+                using Idx = typename alpaka::traits::IdxType<AlpakaAcc>::type;
+
+                using VecType = alpaka::Vec<Dim, Idx>;
+                // 64 blocks of 256 threads along the last dimension, extent 1 in all other dimensions
+                auto numBlocks = VecType::ones();
+                numBlocks[Dim::value - 1] = 64u;
+                auto threadsPerBlock = VecType::ones();
+                threadsPerBlock[Dim::value - 1] = 256u;
+
+                const auto workDiv = alpaka::WorkDivMembers<Dim, Idx>{
+                    numBlocks,
+                    threadsPerBlock,
+                    VecType::ones()}; // Dim may be any dimension, but workDiv is 1D
+
+                alpaka::enqueue(
+                    queue,
+                    alpaka::createTaskKernel<AlpakaAcc>(
+                        workDiv,
+                        getAvailableSlotsKernel,
+                        heap,
+                        slotSize,
+                        alpaka::getPtrNative(d_slots)));
+                // copy the counter into a host buffer and wait for the queue before reading it
+                const auto hostDev = alpaka::getDevByIdx<alpaka::Pltf<alpaka::DevCpu>>(0);
+                auto h_slots = alpaka::allocBuf<unsigned, int>(hostDev, 1);
+                alpaka::memcpy(queue, h_slots, d_slots, 1);
+                alpaka::wait(queue);
+
+                return *alpaka::getPtrNative(h_slots);
+            }
+
+            /** Count, how many elements of size slotSize can be allocated at maximum
+             *
+             * Returns the maximum number of free slots of the indicated size for the
+             * CreationPolicy Scatter. It is not guaranteed where these slots are
+             * (regarding fragmentation), so the practically usable number of slots
+             * might be smaller. This function is executed separately for each warp
+             * and does not cooperate with other warps. Maximum speed is expected if
+             * every thread in the warp executes the function. Uses 256 byte of
+             * shared memory. Contains block-wide barriers (syncBlockThreads).
+             *
+             * @param acc the alpaka accelerator
+             * @param slotSize the size of allocatable elements to count
+             * @return the slot count accumulated for the calling thread's warp
+             */
+            template<typename AlpakaAcc>
+            ALPAKA_FN_ACC auto getAvailableSlotsAccelerator(const AlpakaAcc& acc, size_t slotSize) -> unsigned
+            {
+                const int wId = warpid_withinblock(acc); // do not use warpid-function, since
+                                                         // this value is not guaranteed to
+                                                         // be stable across warp lifetime
+                // population count of the active mask; passed as the stride below
+                const uint32 activeThreads = popc(activemask());
+
+                auto& activePerWarp = alpaka::declareSharedVar<
+                    std::uint32_t[maxThreadsPerBlock / warpSize],
+                    __COUNTER__>(acc); // maximum number of warps in a block
+                // per-warp accumulator for the slot counts, also in shared memory
+                auto& warpResults
+                    = alpaka::declareSharedVar<unsigned[maxThreadsPerBlock / warpSize], __COUNTER__>(acc);
+                // every thread resets the entries that belong to its own warp
+                warpResults[wId] = 0;
+                activePerWarp[wId] = 0;
+
+                // wait that all shared memory is initialized
+                alpaka::syncBlockThreads(acc);
+                // NOTE(review): the early return for slotSize == 0 skips the barrier below - assumes block-uniform slotSize
+                // the active threads obtain an id from 0 to activeThreads-1
+                if(slotSize == 0)
+                    return 0;
+                const auto linearId = alpaka::atomicOp<alpaka::AtomicAdd>(acc, &activePerWarp[wId], 1u);
+
+                // printf("Block %d, id %d: activeThreads=%d
+                // linearId=%d\n",blockIdx.x,threadIdx.x,activeThreads,linearId);
+                const unsigned temp = getAvailaibleSlotsDeviceFunction(acc, slotSize, linearId, activeThreads);
+                if(temp)
+                    alpaka::atomicOp<alpaka::AtomicAdd>(acc, &warpResults[wId], temp);
+                // wait until all threads of the block have added their counts
+                alpaka::syncBlockThreads(acc);
+                threadfenceBlock(acc);
+
+                return warpResults[wId];
+            }
+
+            /** Builds a human-readable identifier of this policy.
+             *
+             * The configuration values are emitted in the order pagesize,
+             * accessblocks, regionsize, wastefactor, resetfreedpages, hashingK,
+             * hashingDistMP, hashingDistWP, hashingDistWPRel.
+             * @return "Scatter[" + comma-separated values + "]"
+             */
+            static auto classname() -> std::string
+            {
+                std::stringstream name;
+                name << "Scatter[" << pagesize << "," << accessblocks << "," << regionsize << ",";
+                name << wastefactor << "," << resetfreedpages << "," << hashingK << ",";
+                name << hashingDistMP << "," << hashingDistWP << "," << hashingDistWPRel << "]";
+                return name.str();
+            }
+        };
+
+    } // namespace CreationPolicies
+} // namespace mallocMC
diff --git a/thirdParty/mallocMC/src/include/mallocMC/creationPolicies/Scatter_impl.hpp b/thirdParty/mallocMC/src/include/mallocMC/creationPolicies/Scatter_impl.hpp
deleted file mode 100644
index 88e3a611fc..0000000000
--- a/thirdParty/mallocMC/src/include/mallocMC/creationPolicies/Scatter_impl.hpp
+++ /dev/null
@@ -1,978 +0,0 @@
-/*
-  mallocMC: Memory Allocator for Many Core Architectures.
-  http://www.icg.tugraz.at/project/mvp
-
-  Copyright (C) 2012 Institute for Computer Graphics and Vision,
-                     Graz University of Technology
-  Copyright (C) 2014-2016 Institute of Radiation Physics,
-                          Helmholtz-Zentrum Dresden - Rossendorf
-
-  Author(s):  Markus Steinberger - steinberger ( at ) icg.tugraz.at
-              Rene Widera - r.widera ( at ) hzdr.de
-              Axel Huebl - a.huebl ( at ) hzdr.de
-              Carlchristian Eckert - c.eckert ( at ) hzdr.de
-
-  Permission is hereby granted, free of charge, to any person obtaining a copy
-  of this software and associated documentation files (the "Software"), to deal
-  in the Software without restriction, including without limitation the rights
-  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-  copies of the Software, and to permit persons to whom the Software is
-  furnished to do so, subject to the following conditions:
-
-  The above copyright notice and this permission notice shall be included in
-  all copies or substantial portions of the Software.
-
-  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-  THE SOFTWARE.
-*/
-
-#pragma once
-
-#include <cstdio>
-#include <boost/cstdint.hpp> /* uint32_t */
-#include <iostream>
-#include <string>
-#include <cassert>
-#include <stdexcept>
-#include <boost/mpl/bool.hpp>
-
-#include "../mallocMC_utils.hpp"
-#include "Scatter.hpp"
-
-namespace mallocMC{
-namespace CreationPolicies{
-
-namespace ScatterKernelDetail{
-  template <typename T_Allocator>
-  __global__ void initKernel(T_Allocator* heap, void* heapmem, size_t memsize){
-    heap->pool = heapmem;
-    heap->initDeviceFunction(heapmem, memsize);
-  }
-
-
-  template < typename T_Allocator >
-  __global__ void getAvailableSlotsKernel(T_Allocator* heap, size_t slotSize, unsigned* slots){
-    int gid       = threadIdx.x + blockIdx.x*blockDim.x;
-    int nWorker   = gridDim.x * blockDim.x;
-    unsigned temp = heap->getAvailaibleSlotsDeviceFunction(slotSize, gid, nWorker);
-    if(temp) atomicAdd(slots, temp);
-  }
-
-
-  template <typename T_Allocator>
-  __global__ void finalizeKernel(T_Allocator* heap){
-    heap->finalizeDeviceFunction();
-  }
-
-} //namespace ScatterKernelDetail
-
-  template<class T_Config, class T_Hashing>
-  class Scatter
-  {
-
-    public:
-      typedef T_Config  HeapProperties;
-      typedef T_Hashing HashingProperties;
-      struct  Properties : HeapProperties, HashingProperties{};
-      typedef boost::mpl::bool_<true>  providesAvailableSlots;
-
-    private:
-      typedef boost::uint32_t uint32;
-
-
-/** Allow for a hierarchical validation of parameters:
- *
- * shipped default-parameters (in the inherited struct) have lowest precedence.
- * They will be overridden by a given configuration struct. However, even the
- * given configuration struct can be overridden by compile-time command line
- * parameters (e.g. -D MALLOCMC_CP_SCATTER_PAGESIZE 1024)
- *
- * default-struct < template-struct < command-line parameter
- */
-#ifndef MALLOCMC_CP_SCATTER_PAGESIZE
-#define MALLOCMC_CP_SCATTER_PAGESIZE  static_cast<uint32>(HeapProperties::pagesize::value)
-#endif
-      BOOST_STATIC_CONSTEXPR uint32 pagesize      = MALLOCMC_CP_SCATTER_PAGESIZE;
-
-#ifndef MALLOCMC_CP_SCATTER_ACCESSBLOCKS
-#define MALLOCMC_CP_SCATTER_ACCESSBLOCKS static_cast<uint32>(HeapProperties::accessblocks::value)
-#endif
-      BOOST_STATIC_CONSTEXPR uint32 accessblocks  = MALLOCMC_CP_SCATTER_ACCESSBLOCKS;
-
-#ifndef MALLOCMC_CP_SCATTER_REGIONSIZE
-#define MALLOCMC_CP_SCATTER_REGIONSIZE static_cast<uint32>(HeapProperties::regionsize::value)
-#endif
-      BOOST_STATIC_CONSTEXPR uint32 regionsize    = MALLOCMC_CP_SCATTER_REGIONSIZE;
-
-#ifndef MALLOCMC_CP_SCATTER_WASTEFACTOR
-#define MALLOCMC_CP_SCATTER_WASTEFACTOR static_cast<uint32>(HeapProperties::wastefactor::value)
-#endif
-      BOOST_STATIC_CONSTEXPR uint32 wastefactor   = MALLOCMC_CP_SCATTER_WASTEFACTOR;
-
-#ifndef MALLOCMC_CP_SCATTER_RESETFREEDPAGES
-#define MALLOCMC_CP_SCATTER_RESETFREEDPAGES static_cast<bool>(HeapProperties::resetfreedpages::value)
-#endif
-      BOOST_STATIC_CONSTEXPR bool resetfreedpages = MALLOCMC_CP_SCATTER_RESETFREEDPAGES;
-
-
-    public:
-      BOOST_STATIC_CONSTEXPR uint32 _pagesize       = pagesize;
-      BOOST_STATIC_CONSTEXPR uint32 _accessblocks   = accessblocks;
-      BOOST_STATIC_CONSTEXPR uint32 _regionsize     = regionsize;
-      BOOST_STATIC_CONSTEXPR uint32 _wastefactor    = wastefactor;
-      BOOST_STATIC_CONSTEXPR bool _resetfreedpages  = resetfreedpages;
-
-    private:
-#if _DEBUG || ANALYSEHEAP
-    public:
-#endif
-      //BOOST_STATIC_CONSTEXPR uint32 minChunkSize0 = pagesize/(32*32);
-      BOOST_STATIC_CONSTEXPR uint32 minChunkSize1 = 0x10;
-      BOOST_STATIC_CONSTEXPR uint32 HierarchyThreshold =  (pagesize - 2*sizeof(uint32))/33;
-      BOOST_STATIC_CONSTEXPR uint32 minSegmentSize = 32*minChunkSize1 + sizeof(uint32);
-      BOOST_STATIC_CONSTEXPR uint32 tmp_maxOPM = minChunkSize1 > HierarchyThreshold ? 0 : (pagesize + (minSegmentSize-1)) / minSegmentSize;
-      BOOST_STATIC_CONSTEXPR uint32 maxOnPageMasks = 32 > tmp_maxOPM ? tmp_maxOPM : 32;
-
-#ifndef MALLOCMC_CP_SCATTER_HASHINGK
-#define MALLOCMC_CP_SCATTER_HASHINGK    static_cast<uint32>(HashingProperties::hashingK::value)
-#endif
-     BOOST_STATIC_CONSTEXPR uint32 hashingK       = MALLOCMC_CP_SCATTER_HASHINGK;
-
-#ifndef MALLOCMC_CP_SCATTER_HASHINGDISTMP
-#define MALLOCMC_CP_SCATTER_HASHINGDISTMP static_cast<uint32>(HashingProperties::hashingDistMP::value)
-#endif
-     BOOST_STATIC_CONSTEXPR uint32 hashingDistMP  = MALLOCMC_CP_SCATTER_HASHINGDISTMP;
-
-#ifndef MALLOCMC_CP_SCATTER_HASHINGDISTWP
-#define MALLOCMC_CP_SCATTER_HASHINGDISTWP static_cast<uint32>(HashingProperties::hashingDistWP::value)
-#endif
-     BOOST_STATIC_CONSTEXPR uint32 hashingDistWP  = MALLOCMC_CP_SCATTER_HASHINGDISTWP;
-
-#ifndef MALLOCMC_CP_SCATTER_HASHINGDISTWPREL
-#define MALLOCMC_CP_SCATTER_HASHINGDISTWPREL static_cast<uint32>(HashingProperties::hashingDistWPRel::value)
-#endif
-     BOOST_STATIC_CONSTEXPR uint32 hashingDistWPRel = MALLOCMC_CP_SCATTER_HASHINGDISTWPREL;
-
-
-      /**
-       * Page Table Entry struct
-       * The PTE holds basic information about each page
-       */
-      struct PTE
-      {
-        uint32 chunksize;
-        uint32 count;
-        uint32 bitmask;
-
-        __device__ void init()
-        {
-          chunksize = 0;
-          count = 0;
-          bitmask = 0;
-        }
-      };
-
-      /**
-       * Page struct
-       * The page struct is used to access the data on the page more efficiently
-       * and to clear the area on the page, which might hold bitsfields later one
-       */
-      struct PAGE
-      {
-        char data[pagesize];
-
-        /**
-         * The pages init method
-         * This method initializes the region on the page which might hold
-         * bit fields when the page is used for a small chunk size
-         * @param previous_chunksize the chunksize which was uses for the page before
-         */
-        __device__ void init()
-        {
-          //clear the entire data which can hold bitfields
-          uint32* write = (uint32*)(data + pagesize - (int)(sizeof(uint32)*maxOnPageMasks));
-          while(write < (uint32*)(data + pagesize))
-            *write++ = 0;
-        }
-      };
-
-      // the data used by the allocator
-
-      volatile PTE* _ptes;
-      volatile uint32* _regions;
-      PAGE* _page;
-      uint32 _numpages;
-      size_t _memsize;
-      uint32 _pagebasedMutex;
-      volatile uint32 _firstFreePageBased;
-      volatile uint32 _firstfreeblock;
-
-
-      /**
-       * randInit should create an random offset which can be used
-       * as the initial position in a bitfield
-       */
-      __device__ inline uint32 randInit()
-      {
-        //start with the laneid offset
-        return laneid();
-      }
-
-      /**
-       * randInextspot delivers the next free spot in a bitfield
-       * it searches for the next unset bit to the left of spot and
-       * returns its offset. if there are no unset bits to the left
-       * then it wraps around
-       * @param bitfield the bitfield to be searched for
-       * @param spot the spot from which to search to the left
-       * @param spots number of bits that can be used
-       * @return next free spot in the bitfield
-       */
-      __device__ inline uint32 nextspot(uint32 bitfield, uint32 spot, uint32 spots)
-      {
-        //wrap around the bitfields from the current spot to the left
-        bitfield = ((bitfield >> (spot + 1)) | (bitfield << (spots - (spot + 1))))&((1<<spots)-1);
-        //compute the step from the current spot in the bitfield
-        uint32 step = __ffs(~bitfield);
-        //and return the new spot
-        return (spot + step) % spots;
-      }
-
-
-      /**
-       * onPageMasksPosition returns a pointer to the beginning of the onpagemasks inside a page.
-       * @param page the page that holds the masks
-       * @param the number of hierarchical page tables (bitfields) that are used inside this mask.
-       * @return pointer to the first address inside the page that holds metadata bitfields.
-       */
-      __device__ inline uint32* onPageMasksPosition(uint32 page, uint32 nMasks){
-        return (uint32*)(_page[page].data + pagesize - (int)sizeof(uint32)*nMasks);
-      }
-
-      /**
-       * usespot marks finds one free spot in the bitfield, marks it and returns its offset
-       * @param bitfield pointer to the bitfield to use
-       * @param spots overall number of spots the bitfield is responsible for
-       * @return if there is a free spot it returns the spot'S offset, otherwise -1
-       */
-      __device__ inline int usespot(uint32 *bitfield, uint32 spots)
-      {
-        //get first spot
-        uint32 spot = randInit() % spots;
-        for(;;)
-        {
-          uint32 mask = 1 << spot;
-          uint32 old = atomicOr(bitfield, mask);
-          if( (old & mask) == 0)
-            return spot;
-          // note: __popc(old) == spots should be sufficient,
-          //but if someone corrupts the memory we end up in an endless loop in here...
-          if(__popc(old) >= spots)
-            return -1;
-          spot = nextspot(old, spot, spots);
-        }
-      }
-
-
-      /**
-       * calcAdditionalChunks determines the number of chunks that are contained in the last segment of a hierarchical page
-       *
-       * The additional checks are necessary to ensure correct results for very large pages and small chunksizes
-       *
-       * @param fullsegments the number of segments that can be completely filled in a page. This may NEVER be bigger than 32!
-       * @param segmentsize the number of bytes that are contained in a completely filled segment (32 chunks)
-       * @param chunksize the chosen allocation size within the page
-       * @return the number of additional chunks that will not fit in one of the fullsegments. For any correct input, this number is smaller than 32
-       */
-      __device__ inline uint32 calcAdditionalChunks(uint32 fullsegments, uint32 segmentsize, uint32 chunksize){
-        if(fullsegments != 32){
-          return max(0,(int)pagesize - (int)fullsegments*segmentsize - (int)sizeof(uint32))/chunksize;
-        }else
-          return 0;
-      }
-
-
-      /**
-       * addChunkHierarchy finds a free chunk on a page which uses bit fields on the page
-       * @param chunksize the chunksize of the page
-       * @param fullsegments the number of full segments on the page (a 32 bits on the page)
-       * @param additional_chunks the number of additional chunks in last segment (less than 32 bits on the page)
-       * @param page the page to use
-       * @return pointer to a free chunk on the page, 0 if we were unable to obtain a free chunk
-       */
-      __device__ inline void* addChunkHierarchy(uint32 chunksize, uint32 fullsegments, uint32 additional_chunks, uint32 page)
-      {
-        uint32 segments = fullsegments + (additional_chunks > 0 ? 1 : 0);
-        uint32 spot = randInit() % segments;
-        uint32 mask = _ptes[page].bitmask;
-        if((mask & (1 << spot)) != 0)
-          spot = nextspot(mask, spot, segments);
-        uint32 tries = segments - __popc(mask);
-        uint32* onpagemasks = onPageMasksPosition(page,segments);
-        for(uint32 i = 0; i < tries; ++i)
-        {
-          int hspot = usespot(onpagemasks + spot, spot < fullsegments ? 32 : additional_chunks);
-          if(hspot != -1)
-            return _page[page].data + (32*spot + hspot)*chunksize;
-          else
-            atomicOr((uint32*)&_ptes[page].bitmask, 1 << spot);
-          spot = nextspot(mask, spot, segments);
-        }
-        return 0;
-      }
-
-      /**
-       * addChunkNoHierarchy finds a free chunk on a page which uses the bit fields of the pte only
-       * @param chunksize the chunksize of the page
-       * @param page the page to use
-       * @param spots the number of chunks which fit on the page
-       * @return pointer to a free chunk on the page, 0 if we were unable to obtain a free chunk
-       */
-      __device__ inline void* addChunkNoHierarchy(uint32 chunksize, uint32 page, uint32 spots)
-      {
-        int spot = usespot((uint32*)&_ptes[page].bitmask, spots);
-        if(spot == -1)
-          return 0; //that should be impossible :)
-        return _page[page].data + spot*chunksize;
-      }
-
-      /**
-       * tryUsePage tries to use the page for the allocation request
-       * @param page the page to use
-       * @param chunksize the chunksize of the page
-       * @return pointer to a free chunk on the page, 0 if we were unable to obtain a free chunk
-       */
-      __device__ inline void* tryUsePage(uint32 page, uint32 chunksize)
-      {
-
-        void* chunk_ptr = NULL;
-
-        //increse the fill level
-        uint32 filllevel = atomicAdd((uint32*)&(_ptes[page].count), 1);
-        //recheck chunck size (it could be that the page got freed in the meanwhile...)
-        if(!resetfreedpages || _ptes[page].chunksize == chunksize)
-        {
-          if(chunksize <= HierarchyThreshold)
-          {
-            //more chunks than can be covered by the pte's single bitfield can be used
-            uint32 segmentsize = chunksize*32 + sizeof(uint32);
-            uint32 fullsegments = min(32,pagesize / segmentsize);
-            uint32 additional_chunks = calcAdditionalChunks(fullsegments, segmentsize, chunksize);
-            if(filllevel < fullsegments * 32 + additional_chunks)
-              chunk_ptr = addChunkHierarchy(chunksize, fullsegments, additional_chunks, page);
-          }
-          else
-          {
-            uint32 chunksinpage = min(pagesize / chunksize, 32);
-            if(filllevel < chunksinpage)
-              chunk_ptr = addChunkNoHierarchy(chunksize, page, chunksinpage);
-          }
-        }
-
-        //this one is full/not useable
-        if(chunk_ptr == NULL)
-          atomicSub((uint32*)&(_ptes[page].count), 1);
-
-        return chunk_ptr;
-      }
-
-
-      /**
-       * allocChunked tries to allocate the demanded number of bytes on one of the pages
-       * @param bytes the number of bytes to allocate
-       * @return pointer to a free chunk on a page, 0 if we were unable to obtain a free chunk
-       */
-      __device__ void* allocChunked(uint32 bytes)
-      {
-        uint32 pagesperblock = _numpages/accessblocks;
-        uint32 reloff = warpSize*bytes / pagesize;
-        uint32 startpage = (bytes*hashingK + hashingDistMP*smid() + (hashingDistWP+hashingDistWPRel*reloff)*warpid() ) % pagesperblock;
-        uint32 maxchunksize = min(pagesize,wastefactor*bytes);
-        uint32 startblock = _firstfreeblock;
-        uint32 ptetry = startpage + startblock*pagesperblock;
-        uint32 checklevel = regionsize*3/4;
-        for(uint32 finder = 0; finder < 2; ++finder)
-        {
-          for(uint32 b = startblock; b < accessblocks; ++b)
-          {
-            while(ptetry < (b+1)*pagesperblock)
-            {
-              uint32 region = ptetry/regionsize;
-              uint32 regionfilllevel = _regions[region];
-              if(regionfilllevel < checklevel )
-              {
-                for( ; ptetry < (region+1)*regionsize; ++ptetry)
-                {
-                  uint32 chunksize = _ptes[ptetry].chunksize;
-                  if(chunksize >= bytes && chunksize <= maxchunksize)
-                  {
-                    void * res = tryUsePage(ptetry, chunksize);
-                    if(res != 0)  return res;
-                  }
-                  else if(chunksize == 0)
-                  {
-                    //lets open up a new page
-                    //it is already padded
-                    uint32 new_chunksize = max(bytes,minChunkSize1);
-                    uint32 beforechunksize = atomicCAS((uint32*)&_ptes[ptetry].chunksize, 0, new_chunksize);
-                    if(beforechunksize == 0)
-                    {
-                      void * res = tryUsePage(ptetry, new_chunksize);
-                      if(res != 0)  return res;
-                    }
-                    else if(beforechunksize >= bytes &&  beforechunksize <= maxchunksize)
-                    {
-                      //someone else aquired the page, but we can also use it
-                      void * res = tryUsePage(ptetry, beforechunksize);
-                      if(res != 0)  return res;
-                    }
-                  }
-                }
-                //could not alloc in region, tell that
-                if(regionfilllevel + 1 <= regionsize)
-                  atomicMax((uint32*)(_regions + region), regionfilllevel+1);
-              }
-              else
-                ptetry += regionsize;
-              //ptetry = (region+1)*regionsize;
-            }
-            //randomize the thread writing the info
-            //if(warpid() + laneid() == 0)
-            if(b > startblock)
-              _firstfreeblock = b;
-          }
-
-          //we are really full :/ so lets search every page for a spot!
-          startblock = 0;
-          checklevel = regionsize + 1;
-          ptetry = 0;
-        }
-        return 0;
-      }
-
-
-      /**
-       * deallocChunked frees the chunk on the page and updates all data accordingly
-       * @param mem pointer to the chunk
-       * @param page the page the chunk is on
-       * @param chunksize the chunksize used for the page
-       */
-      __device__ void deallocChunked(void* mem, uint32 page, uint32 chunksize)
-      {
-        uint32 inpage_offset = ((char*)mem - _page[page].data);
-        if(chunksize <= HierarchyThreshold)
-        {
-          //one more level in hierarchy
-          uint32 segmentsize = chunksize*32 + sizeof(uint32);
-          uint32 fullsegments = min(32,pagesize / segmentsize);
-          uint32 additional_chunks = calcAdditionalChunks(fullsegments,segmentsize,chunksize);
-          uint32 segment = inpage_offset / (chunksize*32);
-          uint32 withinsegment = (inpage_offset - segment*(chunksize*32))/chunksize;
-          //mark it as free
-          uint32 nMasks = fullsegments + (additional_chunks > 0 ? 1 : 0);
-          uint32* onpagemasks = onPageMasksPosition(page,nMasks);
-          uint32 old = atomicAnd(onpagemasks + segment, ~(1 << withinsegment));
-
-          // always do this, since it might fail due to a race-condition with addChunkHierarchy
-          atomicAnd((uint32*)&_ptes[page].bitmask, ~(1 << segment));
-        }
-        else
-        {
-          uint32 segment = inpage_offset / chunksize;
-          atomicAnd((uint32*)&_ptes[page].bitmask, ~(1 << segment));
-        }
-        //reduce filllevel as free
-        uint32 oldfilllevel = atomicSub((uint32*)&_ptes[page].count, 1);
-
-
-        if(resetfreedpages)
-        {
-          if(oldfilllevel == 1)
-          {
-            //this page now got free!
-            // -> try lock it
-            uint32 old = atomicCAS((uint32*)&_ptes[page].count, 0, pagesize);
-            if(old == 0)
-            {
-              //clean the bits for the hierarchy
-              _page[page].init();
-              //remove chunk information
-              _ptes[page].chunksize = 0;
-              __threadfence();
-              //unlock it
-              atomicSub((uint32*)&_ptes[page].count, pagesize);
-            }
-          }
-        }
-
-        //meta information counters ... should not be changed by too many threads, so..
-        if(oldfilllevel == pagesize / 2 / chunksize)
-        {
-          uint32 region = page / regionsize;
-          _regions[region] = 0;
-          uint32 block = region * regionsize * accessblocks / _numpages ;
-          if(warpid() + laneid() == 0)
-            atomicMin((uint32*)&_firstfreeblock, block);
-        }
-      }
-
-      /**
-       * markpages markes a fixed number of pages as used
-       * @param startpage first page to mark
-       * @param pages number of pages to mark
-       * @param bytes number of overall bytes to mark pages for
-       * @return true on success, false if one of the pages is not free
-       */
-      __device__ bool markpages(uint32 startpage, uint32 pages, uint32 bytes)
-      {
-        int abord = -1;
-        for(uint32 trypage = startpage; trypage < startpage + pages; ++trypage)
-        {
-          uint32 old = atomicCAS((uint32*)&_ptes[trypage].chunksize, 0, bytes);
-          if(old != 0)
-          {
-            abord = trypage;
-            break;
-          }
-        }
-        if(abord == -1)
-          return true;
-        for(uint32 trypage = startpage; trypage < abord; ++trypage)
-          atomicCAS((uint32*)&_ptes[trypage].chunksize, bytes, 0);
-        return false;
-      }
-
-      /**
-       * allocPageBasedSingleRegion tries to allocate the demanded number of bytes on a continues sequence of pages
-       * @param startpage first page to be used
-       * @param endpage last page to be used
-       * @param bytes number of overall bytes to mark pages for
-       * @return pointer to the first page to use, 0 if we were unable to use all the requested pages
-       */
-      __device__ void* allocPageBasedSingleRegion(uint32 startpage, uint32 endpage, uint32 bytes)
-      {
-        uint32 pagestoalloc = divup(bytes, pagesize);
-        uint32 freecount = 0;
-        bool left_free = false;
-        for(uint32 search_page = startpage+1; search_page > endpage; )
-        {
-          --search_page;
-          if(_ptes[search_page].chunksize == 0)
-          {
-            if(++freecount == pagestoalloc)
-            {
-              //try filling it up
-              if(markpages(search_page, pagestoalloc, bytes))
-              {
-                //mark that we filled up everything up to here
-                if(!left_free)
-                  atomicCAS((uint32*)&_firstFreePageBased, startpage, search_page - 1);
-                return _page[search_page].data;
-              }
-            }
-          }
-          else
-          {
-            left_free = true;
-            freecount = 0;
-          }
-        }
-        return 0;
-      }
-
-      /**
-       * allocPageBasedSingle tries to allocate the demanded number of bytes on a continues sequence of pages
-       * @param bytes number of overall bytes to mark pages for
-       * @return pointer to the first page to use, 0 if we were unable to use all the requested pages
-       * @pre only a single thread of a warp is allowed to call the function concurrently
-       */
-      __device__ void* allocPageBasedSingle(uint32 bytes)
-      {
-        //acquire mutex
-        while(atomicExch(&_pagebasedMutex,1) != 0);
-        //search for free spot from the back
-        uint32 spage = _firstFreePageBased;
-        void* res = allocPageBasedSingleRegion(spage, 0, bytes);
-        if(res == 0)
-          //also check the rest of the pages
-          res = allocPageBasedSingleRegion(_numpages, spage, bytes);
-
-        //free mutex
-        atomicExch(&_pagebasedMutex,0);
-        return res;
-      }
-      /**
-       * allocPageBased tries to allocate the demanded number of bytes on a continues sequence of pages
-       * @param bytes number of overall bytes to mark pages for
-       * @return pointer to the first page to use, 0 if we were unable to use all the requested pages
-       */
-      __device__ void* allocPageBased(uint32 bytes)
-      {
-        //this is rather slow, but we dont expect that to happen often anyway
-
-        //only one thread per warp can acquire the mutex
-        void* res = 0;
-        for(
-#if(__CUDACC_VER_MAJOR__ >= 9)
-          unsigned int __mask = __activemask(),
-#else
-          unsigned int __mask = __ballot(1),
-#endif
-          __num = __popc(__mask),
-          __lanemask = mallocMC::lanemask_lt(),
-          __local_id = __popc(__lanemask & __mask),
-          __active = 0;
-          __active < __num;
-          ++__active
-        )
-          if (__active == __local_id)
-            res = allocPageBasedSingle(bytes);
-        return res;
-      }
-
-      /**
-       * deallocPageBased frees the memory placed on a sequence of pages
-       * @param mem pointer to the first page
-       * @param page the first page
-       * @param bytes the number of bytes to be freed
-       */
-      __device__ void deallocPageBased(void* mem, uint32 page, uint32 bytes)
-      {
-        uint32 pages = divup(bytes,pagesize);
-        for(uint32 p = page; p < page+pages; ++p)
-          _page[p].init();
-        __threadfence();
-        for(uint32 p = page; p < page+pages; ++p)
-          atomicCAS((uint32*)&_ptes[p].chunksize, bytes, 0);
-        atomicMax((uint32*)&_firstFreePageBased, page+pages-1);
-      }
-
-
-    public:
-      /**
-       * create allocates the requested number of bytes via the heap. Coalescing has to be done before by another policy.
-       * @param bytes number of bytes to allocate
-       * @return pointer to the allocated memory
-       */
-      __device__ void* create(uint32 bytes)
-      {
-        if(bytes == 0)
-          return 0;
-        //take care of padding
-        //bytes = (bytes + dataAlignment - 1) & ~(dataAlignment-1); // in alignment-policy
-        if(bytes < pagesize)
-          //chunck based
-          return allocChunked(bytes);
-        else
-          //allocate a range of pages
-          return allocPageBased(bytes);
-      }
-
-      /**
-       * destroy frees the memory regions previously acllocted via create
-       * @param mempointer to the memory region to free
-       */
-      __device__ void destroy(void* mem)
-      {
-        if(mem == 0)
-          return;
-        //lets see on which page we are on
-        uint32 page = ((char*)mem - (char*)_page)/pagesize;
-        uint32 chunksize = _ptes[page].chunksize;
-
-        //is the pointer the beginning of a chunk?
-        uint32 inpage_offset = ((char*)mem - _page[page].data);
-        uint32 block = inpage_offset/chunksize;
-        uint32 inblockoffset = inpage_offset - block*chunksize;
-        if(inblockoffset != 0)
-        {
-          uint32* counter = (uint32*)(_page[page].data + block*chunksize);
-          //coalesced mem free
-          uint32 old = atomicSub(counter, 1);
-          if(old != 1)
-            return;
-          mem = (void*) counter;
-        }
-
-        if(chunksize < pagesize)
-          deallocChunked(mem, page, chunksize);
-        else
-          deallocPageBased(mem, page, chunksize);
-      }
-
-      /**
-       * init inits the heap data structures
-       * the init method must be called before the heap can be used. the method can be called
-       * with an arbitrary number of threads, which will increase the inits efficiency
-       * @param memory pointer to the memory used for the heap
-       * @param memsize size of the memory in bytes
-       */
-      __device__ void initDeviceFunction(void* memory, size_t memsize)
-      {
-        uint32 linid = threadIdx.x + blockDim.x*(threadIdx.y + threadIdx.z*blockDim.y);
-        uint32 threads = blockDim.x*blockDim.y*blockDim.z;
-        uint32 linblockid = blockIdx.x + gridDim.x*(blockIdx.y + blockIdx.z*gridDim.y);
-        uint32 blocks =  gridDim.x*gridDim.y*gridDim.z;
-        linid = linid + linblockid*threads;
-
-        uint32 numregions = ((unsigned long long)memsize)/( ((unsigned long long)regionsize)*(sizeof(PTE)+pagesize)+sizeof(uint32));
-        uint32 numpages = numregions*regionsize;
-        //pointer is copied (copy is called page)
-        PAGE* page = (PAGE*)(memory);
-        //sec check for alignment
-        //copy is checked
-        //PointerEquivalent alignmentstatus = ((PointerEquivalent)page) & (16 -1);
-        //if(alignmentstatus != 0)
-        //{
-        //  if(linid == 0){
-        //    printf("c Before:\n");
-        //    printf("c dataAlignment:   %d\n",16);
-        //    printf("c Alignmentstatus: %d\n",alignmentstatus);
-        //    printf("c size_t memsize   %llu byte\n", memsize);
-        //    printf("c void *memory     %p\n", page);
-        //  }
-        //  //copy is adjusted, potentially pointer to higher address now.
-        //  page =(PAGE*)(((PointerEquivalent)page) + 16 - alignmentstatus);
-        //  if(linid == 0) printf("c Heap Warning: memory to use not 16 byte aligned...\n");
-        //}
-        PTE* ptes = (PTE*)(page + numpages);
-        uint32* regions = (uint32*)(ptes + numpages);
-        //sec check for mem size
-        //this check refers to the original memory-pointer, which was not adjusted!
-        if( (void*)(regions + numregions) > (((char*)memory) + memsize) )
-        {
-          --numregions;
-          numpages = min(numregions*regionsize,numpages);
-          if(linid == 0) printf("c Heap Warning: needed to reduce number of regions to stay within memory limit\n");
-        }
-        //if(linid == 0) printf("Heap info: wasting %d bytes\n",(((POINTEREQUIVALENT)memory) + memsize) - (POINTEREQUIVALENT)(regions + numregions));
-
-        //if(linid == 0 && alignmentstatus != 0){
-        //  printf("c Was shrinked automatically to:\n");
-        //  printf("c size_t memsize   %llu byte\n", memsize);
-        //  printf("c void *memory     %p\n", page);
-        //}
-        threads = threads*blocks;
-
-        for(uint32 i = linid; i < numpages; i+= threads)
-        {
-          ptes[i].init();
-          page[i].init();
-        }
-        for(uint32 i = linid; i < numregions; i+= threads)
-          regions[i] = 0;
-
-        if(linid == 0)
-        {
-          _memsize = memsize;
-          _numpages = numpages;
-          _ptes = (volatile PTE*)ptes;
-          _page = page;
-          _regions =  regions;
-          _firstfreeblock = 0;
-          _pagebasedMutex = 0;
-          _firstFreePageBased = numpages-1;
-
-          if( (char*) (_page+numpages) > (char*)(memory) + memsize)
-            printf("error in heap alloc: numpages too high\n");
-        }
-
-      }
-
-      __device__ bool isOOM(void* p, size_t s){
-        // one thread that requested memory returned null
-        return  s && (p == NULL);
-      }
-
-
-      template < typename T_DeviceAllocator >
-      static void* initHeap( T_DeviceAllocator* heap, void* pool, size_t memsize){
-        if( pool == NULL && memsize != 0 )
-        {
-          throw std::invalid_argument(
-            "Scatter policy cannot use NULL for non-empty memory pools. "
-            "Maybe you are using an incompatible ReservePoolPolicy or AlignmentPolicy."
-          );
-        }
-        ScatterKernelDetail::initKernel<<<1,256>>>(heap, pool, memsize);
-        return heap;
-      }
-
-      /** counts how many elements of a size fit inside a given page
-       *
-       * Examines a (potentially already used) page to find how many elements
-       * of size chunksize still fit on the page. This includes hierarchically
-       * organized pages and empty pages. The algorithm determines the number
-       * of chunks in the page in a manner similar to the allocation algorithm
-       * of CreationPolicies::Scatter.
-       *
-       * @param page the number of the page to examine. The page needs to be
-       *        formatted with a chunksize and potentially a hierarchy.
-       * @param chunksize the size of element that should be placed inside the
-       *        page. This size must be appropriate to the formatting of the
-       *        page.
-       */
-      __device__ unsigned countFreeChunksInPage(uint32 page, uint32 chunksize){
-        uint32 filledChunks = _ptes[page].count;
-        if(chunksize <= HierarchyThreshold)
-        {
-          uint32 segmentsize = chunksize*32 + sizeof(uint32); //each segment can hold 32 2nd-level chunks
-          uint32 fullsegments = min(32,pagesize / segmentsize); //there might be space for more than 32 segments with 32 2nd-level chunks
-          uint32 additional_chunks = calcAdditionalChunks(fullsegments, segmentsize, chunksize);
-          uint32 level2Chunks = fullsegments * 32 + additional_chunks;
-          return level2Chunks - filledChunks;
-        }else{
-          uint32 chunksinpage = min(pagesize / chunksize, 32); //without hierarchy, there can not be more than 32 chunks
-          return chunksinpage - filledChunks;
-        }
-      }
-
-
-      /** counts the number of available slots inside the heap
-       *
-       * Searches the heap for all possible locations of an element with size
-       * slotSize. The used traversal algorithms are similar to the allocation
-       * strategy of CreationPolicies::Scatter, to ensure comparable results.
-       * There are 3 different algorithms, based on the size of the requested
-       * slot: 1 slot spans over multiple pages, 1 slot fits in one chunk
-       * within a page, 1 slot fits in a fraction of a chunk.
-       *
-       * @param slotSize the amount of bytes that a single slot accounts for
-       * @param gid the id of the thread. this id does not have to correspond
-       *        with threadId.x, but there must be a continous range of ids
-       *        beginning from 0.
-       * @param stride the stride should be equal to the number of different
-       *        gids (and therefore of value max(gid)-1)
-       */
-      __device__ unsigned getAvailaibleSlotsDeviceFunction(size_t slotSize, int gid, int stride)
-      {
-        unsigned slotcount = 0;
-        if(slotSize < pagesize){ // multiple slots per page
-          for(uint32 currentpage = gid; currentpage < _numpages; currentpage += stride){
-            uint32 maxchunksize = min(pagesize, wastefactor*(uint32)slotSize);
-            uint32 region = currentpage/regionsize;
-            uint32 regionfilllevel = _regions[region];
-
-            uint32 chunksize = _ptes[currentpage].chunksize;
-            if(chunksize >= slotSize && chunksize <= maxchunksize){ //how many chunks left? (each chunk is big enough)
-              slotcount += countFreeChunksInPage(currentpage, chunksize);
-            }else if(chunksize == 0){
-              chunksize  = max((uint32)slotSize, minChunkSize1); //ensure minimum chunk size
-              slotcount += countFreeChunksInPage(currentpage, chunksize); //how many chunks fit in one page?
-            }else{
-              continue; //the chunks on this page are too small for the request :(
-            }
-          }
-        }else{ // 1 slot needs multiple pages
-          if(gid > 0) return 0; //do this serially
-          uint32 pagestoalloc = divup((uint32)slotSize, pagesize);
-          uint32 freecount = 0;
-          for(uint32 currentpage = _numpages; currentpage > 0;){ //this already includes all superblocks
-            --currentpage;
-            if(_ptes[currentpage].chunksize == 0){
-              if(++freecount == pagestoalloc){
-                freecount = 0;
-                ++slotcount;
-              }
-            }else{ // the sequence of free pages was interrupted
-              freecount = 0;
-            }
-          }
-        }
-        return slotcount;
-      }
-
-
-      /** Count, how many elements can be allocated at maximum
-       *
-       * Takes an input size and determines, how many elements of this size can
-       * be allocated with the CreationPolicy Scatter. This will return the
-       * maximum number of free slots of the indicated size. It is not
-       * guaranteed where these slots are (regarding fragmentation). Therefore,
-       * the practically usable number of slots might be smaller. This function
-       * is executed in parallel. Speedup can possibly increased by a higher
-       * amount ofparallel workers.
-       *
-       * @param slotSize the size of allocatable elements to count
-       * @param obj a reference to the allocator instance (host-side)
-       */
-    public:
-      template<typename T_DeviceAllocator>
-      static unsigned getAvailableSlotsHost(size_t const slotSize, T_DeviceAllocator* heap){
-        unsigned h_slots = 0;
-        unsigned* d_slots;
-        cudaMalloc((void**) &d_slots, sizeof(unsigned));
-        cudaMemcpy(d_slots, &h_slots, sizeof(unsigned), cudaMemcpyHostToDevice);
-
-        ScatterKernelDetail::getAvailableSlotsKernel<<<64,256>>>(heap, slotSize, d_slots);
-
-        cudaMemcpy(&h_slots, d_slots, sizeof(unsigned), cudaMemcpyDeviceToHost);
-        cudaFree(d_slots);
-        return h_slots;
-      }
-
-
-      /** Count, how many elements can be allocated at maximum
-       *
-       * Takes an input size and determines, how many elements of this size can
-       * be allocated with the CreationPolicy Scatter. This will return the
-       * maximum number of free slots of the indicated size. It is not
-       * guaranteed where these slots are (regarding fragmentation). Therefore,
-       * the practically usable number of slots might be smaller. This function
-       * is executed separately for each warp and does not cooperate with other
-       * warps. Maximum speed is expected if every thread in the warp executes
-       * the function.
-       * Uses 256 byte of shared memory.
-       *
-       * @param slotSize the size of allocatable elements to count
-       */
-      __device__ unsigned getAvailableSlotsAccelerator(size_t slotSize){
-        int linearId;
-        int wId = warpid_withinblock(); //do not use warpid-function, since this value is not guaranteed to be stable across warp lifetime
-
-#if(__CUDACC_VER_MAJOR__ >= 9)
-        uint32 activeThreads  = __popc(__activemask());
-#else
-        uint32 activeThreads  = __popc(__ballot(true));
-#endif
-        __shared__ uint32 activePerWarp[MaxThreadsPerBlock::value / WarpSize::value]; //maximum number of warps in a block
-        __shared__ unsigned warpResults[MaxThreadsPerBlock::value / WarpSize::value];
-        warpResults[wId]   = 0;
-        activePerWarp[wId] = 0;
-
-        // the active threads obtain an id from 0 to activeThreads-1
-        if(slotSize>0) linearId = atomicAdd(&activePerWarp[wId], 1);
-        else return 0;
-
-        //printf("Block %d, id %d: activeThreads=%d linearId=%d\n",blockIdx.x,threadIdx.x,activeThreads,linearId);
-        unsigned temp = getAvailaibleSlotsDeviceFunction(slotSize, linearId, activeThreads);
-        if(temp) atomicAdd(&warpResults[wId], temp);
-        __threadfence_block();
-        return warpResults[wId];
-      }
-
-
-      static std::string classname(){
-        std::stringstream ss;
-        ss << "Scatter[";
-        ss << pagesize        << ",";
-        ss << accessblocks    << ",";
-        ss << regionsize      << ",";
-        ss << wastefactor     << ",";
-        ss << resetfreedpages << ",";
-        ss << hashingK        << ",";
-        ss << hashingDistMP   << ",";
-        ss << hashingDistWP   << ",";
-        ss << hashingDistWPRel<< "]";
-        return ss.str();
-      }
-
-  };
-
-} //namespace CreationPolicies
-} //namespace mallocMC
diff --git a/thirdParty/mallocMC/src/include/mallocMC/device_allocator.hpp b/thirdParty/mallocMC/src/include/mallocMC/device_allocator.hpp
index 6c24fcfba7..b180161cf3 100644
--- a/thirdParty/mallocMC/src/include/mallocMC/device_allocator.hpp
+++ b/thirdParty/mallocMC/src/include/mallocMC/device_allocator.hpp
@@ -28,74 +28,57 @@
 
 #pragma once
 
-#include "mallocMC_utils.hpp"
 #include "mallocMC_constraints.hpp"
-#include "mallocMC_prefixes.hpp"
 #include "mallocMC_traits.hpp"
+#include "mallocMC_utils.hpp"
 
-#include <boost/cstdint.hpp>
-#include <stdio.h>
-
-namespace mallocMC{
-
-namespace detail{
+#include <alpaka/core/Common.hpp>
+#include <cstdint>
+#include <cstdio>
 
-    /**
-     * @brief Template class to call getAvailableSlots[Host|Accelerator] if the CreationPolicy provides it.
-     *
-     * Returns 0 else.
-     *
-     * @tparam T_Allocator The type of the Allocator to be used
-     * @tparam T_isHost True for the host call, false for the accelerator call
-     * @tparam T_providesAvailableSlots If the CreationPolicy provides getAvailableSlots[Host|Accelerator] (auto filled, do not set)
-     */
-    template<
-        typename T_Allocator,
-        bool T_providesAvailableSlots
-    >
-    struct GetAvailableSlotsIfAvailAcc
+namespace mallocMC
+{
+    namespace detail
     {
-        MAMC_ACCELERATOR static
-        unsigned
-        getAvailableSlots(
-            size_t,
-            T_Allocator &
-        )
+        /**
+         * @brief Template class to call getAvailableSlotsAccelerator if the
+         * CreationPolicy provides it.
+         *
+         * Returns 0 otherwise.
+         *
+         * @tparam AlpakaAcc The type of the alpaka accelerator passed to the call
+         * @tparam T_Allocator The type of the Allocator to be used
+         * @tparam T_providesAvailableSlots Whether the CreationPolicy provides
+         * getAvailableSlotsAccelerator (filled in by DeviceAllocator from
+         * Traits<DeviceAllocator>::providesAvailableSlots)
+         */
+        template<typename AlpakaAcc, typename T_Allocator, bool T_providesAvailableSlots>
+        struct GetAvailableSlotsIfAvailAcc
         {
-            return 0;
-        }
-
-    };
-
-    template<
-        typename T_Allocator
-    >
-    struct GetAvailableSlotsIfAvailAcc<
-        T_Allocator,
-        true
-    >{
-        MAMC_ACCELERATOR static
-        unsigned
-        getAvailableSlots(
-            size_t slotSize,
-            T_Allocator& alloc
-        )
+            ALPAKA_FN_ACC static auto getAvailableSlots(const AlpakaAcc&, size_t, T_Allocator&) -> unsigned
+            {
+                return 0;
+            }
+        };
+
+        template<typename AlpakaAcc, typename T_Allocator>
+        struct GetAvailableSlotsIfAvailAcc<AlpakaAcc, T_Allocator, true>
         {
-            return alloc.T_Allocator::CreationPolicy
-                ::getAvailableSlotsAccelerator( slotSize );
-        }
-
-    };
-
-} // namespace detail
+            ALPAKA_FN_ACC static auto getAvailableSlots(const AlpakaAcc& acc, size_t slotSize, T_Allocator& alloc)
+                -> unsigned
+            {
+                return alloc.T_Allocator::CreationPolicy ::getAvailableSlotsAccelerator(acc, slotSize);
+            }
+        };
 
+    } // namespace detail
 
     /**
      * @brief "HostClass" that combines all policies to a useful allocator
      *
-     * This class implements the necessary glue-logic to form an actual allocator
-     * from the provided policies. It implements the public interface and
-     * executes some constraint checking based on an instance of the class
+     * This class implements the necessary glue-logic to form an actual
+     * allocator from the provided policies. It implements the public interface
+     * and executes some constraint checking based on an instance of the class
      * PolicyConstraints.
      *
      * @tparam T_CreationPolicy The desired type of a CreationPolicy
@@ -108,63 +91,49 @@ namespace detail{
         typename T_CreationPolicy,
         typename T_DistributionPolicy,
         typename T_OOMPolicy,
-        typename T_AlignmentPolicy
-    >
-    class DeviceAllocator :
-        public T_CreationPolicy
+        typename T_AlignmentPolicy>
+    class DeviceAllocator : public T_CreationPolicy
     {
-        typedef boost::uint32_t uint32;
+        using uint32 = std::uint32_t;
+
     public:
-        typedef T_CreationPolicy CreationPolicy;
-        typedef T_DistributionPolicy DistributionPolicy;
-        typedef T_OOMPolicy OOMPolicy;
-        typedef T_AlignmentPolicy AlignmentPolicy;
+        using CreationPolicy = T_CreationPolicy;
+        using DistributionPolicy = T_DistributionPolicy;
+        using OOMPolicy = T_OOMPolicy;
+        using AlignmentPolicy = T_AlignmentPolicy;
 
         void* pool;
 
-        MAMC_ACCELERATOR
-        void*
-        malloc(
-            size_t bytes
-        )
+        template<typename AlpakaAcc>
+        ALPAKA_FN_ACC auto malloc(const AlpakaAcc& acc, size_t bytes) -> void*
         {
-            DistributionPolicy distributionPolicy;
-            bytes = AlignmentPolicy::applyPadding( bytes );
-            uint32 req_size = distributionPolicy.collect( bytes );
-            void* memBlock = CreationPolicy::create( req_size );
-            const bool oom = CreationPolicy::isOOM( memBlock, req_size );
-            if( oom )
-                memBlock = OOMPolicy::handleOOM( memBlock );
-            void* myPart = distributionPolicy.distribute( memBlock );
-            return myPart;
+            bytes = AlignmentPolicy::applyPadding(bytes);
+            DistributionPolicy distributionPolicy(acc);
+            const uint32 req_size = distributionPolicy.collect(acc, bytes);
+            void* memBlock = CreationPolicy::create(acc, req_size);
+            if(CreationPolicy::isOOM(memBlock, req_size))
+                memBlock = OOMPolicy::handleOOM(memBlock);
+            return distributionPolicy.distribute(acc, memBlock);
         }
 
-        MAMC_ACCELERATOR
-        void
-        free(
-            void* p
-        )
+        template<typename AlpakaAcc>
+        ALPAKA_FN_ACC void free(const AlpakaAcc& acc, void* p)
         {
-            CreationPolicy::destroy( p );
+            CreationPolicy::destroy(acc, p);
         }
 
-
         /* polymorphism over the availability of getAvailableSlots for calling
          * from the accelerator
          */
-        MAMC_ACCELERATOR
-        unsigned
-        getAvailableSlots(
-            size_t slotSize
-        )
+        template<typename AlpakaAcc>
+        ALPAKA_FN_ACC auto getAvailableSlots(const AlpakaAcc& acc, size_t slotSize) -> unsigned
         {
-            slotSize = AlignmentPolicy::applyPadding( slotSize );
+            slotSize = AlignmentPolicy::applyPadding(slotSize);
             return detail::GetAvailableSlotsIfAvailAcc<
+                AlpakaAcc,
                 DeviceAllocator,
-                Traits< DeviceAllocator >::providesAvailableSlots
-            >::getAvailableSlots( slotSize, *this );
+                Traits<DeviceAllocator>::providesAvailableSlots>::getAvailableSlots(acc, slotSize, *this);
         }
-
     };
 
 } // namespace mallocMC
diff --git a/thirdParty/mallocMC/src/include/mallocMC/distributionPolicies/Noop.hpp b/thirdParty/mallocMC/src/include/mallocMC/distributionPolicies/Noop.hpp
index 608a5f2f69..9bf11c0c1e 100644
--- a/thirdParty/mallocMC/src/include/mallocMC/distributionPolicies/Noop.hpp
+++ b/thirdParty/mallocMC/src/include/mallocMC/distributionPolicies/Noop.hpp
@@ -27,17 +27,51 @@
 
 #pragma once
 
+#include "Noop.hpp"
 
-namespace mallocMC{
-namespace DistributionPolicies{
+#include <alpaka/core/Common.hpp>
+#include <cstdint>
+#include <string>
 
-  /**
-   * @brief a policy that does nothing
-   *
-   * This DistributionPolicy will not perform any distribution, but only return
-   * its input (identity function)
-   */
-  class Noop;
+namespace mallocMC
+{
+    namespace DistributionPolicies
+    {
+        /**
+         * @brief a policy that does nothing
+         *
+         * This DistributionPolicy will not perform any distribution, but only
+         * return its input (identity function)
+         */
+        class Noop
+        {
+            using uint32 = std::uint32_t;
 
-} //namespace DistributionPolicies
-} //namespace mallocMC
+        public:
+            /** Stateless: the accelerator is ignored; explicit to forbid implicit conversions to Noop. */
+            template<typename AlpakaAcc>
+            ALPAKA_FN_ACC explicit Noop(const AlpakaAcc& /*acc*/)
+            {}
+
+            /** @return bytes unchanged, i.e. the request size is passed through as is. */
+            template<typename AlpakaAcc>
+            ALPAKA_FN_ACC auto collect(const AlpakaAcc& /*acc*/, uint32 bytes) const -> uint32
+            {
+                return bytes;
+            }
+
+            /** @return allocatedMem unchanged, i.e. the caller receives the block as allocated. */
+            template<typename AlpakaAcc>
+            ALPAKA_FN_ACC auto distribute(const AlpakaAcc& /*acc*/, void* allocatedMem) const -> void*
+            {
+                return allocatedMem;
+            }
+
+            static auto classname() -> std::string
+            {
+                return "Noop";
+            }
+        };
+
+    } // namespace DistributionPolicies
+} // namespace mallocMC
diff --git a/thirdParty/mallocMC/src/include/mallocMC/distributionPolicies/Noop_impl.hpp b/thirdParty/mallocMC/src/include/mallocMC/distributionPolicies/Noop_impl.hpp
deleted file mode 100644
index 6d55c2b906..0000000000
--- a/thirdParty/mallocMC/src/include/mallocMC/distributionPolicies/Noop_impl.hpp
+++ /dev/null
@@ -1,62 +0,0 @@
-/*
-  mallocMC: Memory Allocator for Many Core Architectures.
-
-  Copyright 2014 Institute of Radiation Physics,
-                 Helmholtz-Zentrum Dresden - Rossendorf
-
-  Author(s):  Carlchristian Eckert - c.eckert ( at ) hzdr.de
-
-  Permission is hereby granted, free of charge, to any person obtaining a copy
-  of this software and associated documentation files (the "Software"), to deal
-  in the Software without restriction, including without limitation the rights
-  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-  copies of the Software, and to permit persons to whom the Software is
-  furnished to do so, subject to the following conditions:
-
-  The above copyright notice and this permission notice shall be included in
-  all copies or substantial portions of the Software.
-
-  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-  THE SOFTWARE.
-*/
-
-#pragma once
-
-#include <boost/cstdint.hpp>
-#include <string>
-
-#include "Noop.hpp"
-#include "../mallocMC_prefixes.hpp"
-
-namespace mallocMC{
-namespace DistributionPolicies{
-    
-  class Noop 
-  {
-    typedef boost::uint32_t uint32;
-
-    public:
-
-    MAMC_ACCELERATOR
-    uint32 collect(uint32 bytes){
-      return bytes;
-    }
-
-    MAMC_ACCELERATOR
-    void* distribute(void* allocatedMem){
-      return allocatedMem;
-    }
-
-    static std::string classname(){
-      return "Noop";
-    }
-
-  };
-
-} //namespace DistributionPolicies
-} //namespace mallocMC
diff --git a/thirdParty/mallocMC/src/include/mallocMC/distributionPolicies/XMallocSIMD.hpp b/thirdParty/mallocMC/src/include/mallocMC/distributionPolicies/XMallocSIMD.hpp
index c80b785a95..8b59ecc0f1 100644
--- a/thirdParty/mallocMC/src/include/mallocMC/distributionPolicies/XMallocSIMD.hpp
+++ b/thirdParty/mallocMC/src/include/mallocMC/distributionPolicies/XMallocSIMD.hpp
@@ -33,39 +33,160 @@
 
 #pragma once
 
-#include <boost/mpl/int.hpp>
-
-namespace mallocMC{
-namespace DistributionPolicies{
-    
-  namespace XMallocSIMDConf{
-    struct DefaultXMallocConfig{
-      typedef boost::mpl::int_<4096>     pagesize;
-    };  
-  }
-
-  /**
-   * @brief SIMD optimized chunk resizing in the style of XMalloc
-   *
-   * This DistributionPolicy can take the memory requests from a group of
-   * worker threads and combine them, so that only one of the workers will
-   * allocate the whole request. Later, each worker gets an appropriate offset
-   * into the allocated chunk. This is beneficial for SIMD architectures since
-   * only one of the workers has to compete for the resource.  This algorithm
-   * is inspired by the XMalloc memory allocator
-   * (http://ieeexplore.ieee.org/xpls/abs_all.jsp?arnumber=5577907&tag=1) and
-   * its implementation in ScatterAlloc
-   * (http://ieeexplore.ieee.org/xpl/articleDetails.jsp?arnumber=6339604)
-   * XMallocSIMD is inteded to be used with Nvidia CUDA capable accelerators
-   * that support at least compute capability 2.0
-   *
-   * @tparam T_Config (optional) The configuration struct to overwrite
-   *        default configuration. The default can be obtained through
-   *        XMallocSIMD<>::Properties
-   */
-  template<class T_Config=XMallocSIMDConf::DefaultXMallocConfig>
-  class XMallocSIMD;
-
-
-} //namespace DistributionPolicies
-} //namespace mallocMC
+#include "../mallocMC_utils.hpp"
+#include "XMallocSIMD.hpp"
+
+#include <alpaka/alpaka.hpp>
+#include <cstdint>
+#include <limits>
+#include <sstream>
+#include <string>
+
+namespace mallocMC
+{
+    namespace DistributionPolicies
+    {
+        namespace XMallocSIMDConf
+        {
+            struct DefaultXMallocConfig
+            {
+                static constexpr auto pagesize = 4096;
+            };
+        } // namespace XMallocSIMDConf
+
+        /**
+         * @brief SIMD optimized chunk resizing in the style of XMalloc
+         *
+         * This DistributionPolicy can take the memory requests from a group of
+         * worker threads and combine them, so that only one of the workers will
+         * allocate the whole request. Later, each worker gets an appropriate
+         * offset into the allocated chunk. This is beneficial for SIMD
+         * architectures since only one of the workers has to compete for the
+         * resource.  This algorithm is inspired by the XMalloc memory allocator
+         * (http://ieeexplore.ieee.org/xpls/abs_all.jsp?arnumber=5577907&tag=1)
+         * and its implementation in ScatterAlloc
+         * (http://ieeexplore.ieee.org/xpl/articleDetails.jsp?arnumber=6339604)
+         * XMallocSIMD is intended to be used with Nvidia CUDA capable
+         * accelerators that support at least compute capability 2.0
+         *
+         * @tparam T_Config (optional) The configuration struct to overwrite
+         *        default configuration. The default can be obtained through
+         *        XMallocSIMD<>::Properties
+         */
+        template<typename T_Config = XMallocSIMDConf::DefaultXMallocConfig>
+        class XMallocSIMD
+        {
+        private:
+            using uint32 = std::uint32_t;
+            bool can_use_coalescing;
+            uint32 warpid;
+            uint32 myoffset;
+            uint32 threadcount;
+            uint32 req_size;
+
+        public:
+            using Properties = T_Config;
+
+            template<typename AlpakaAcc>
+            ALPAKA_FN_ACC XMallocSIMD(const AlpakaAcc& acc)
+                : can_use_coalescing(false)
+                , warpid(warpid_withinblock(acc))
+                , myoffset(0)
+                , threadcount(0)
+                , req_size(0)
+            {
+            }
+
+        private:
+/** Allow for a hierarchical validation of parameters:
+ *
+ * shipped default-parameters (in the inherited struct) have lowest precedence.
+ * They will be overridden by a given configuration struct. However, even the
+ * given configuration struct can be overridden by compile-time command line
+ * parameters (e.g. -D MALLOCMC_DP_XMALLOCSIMD_PAGESIZE 1024)
+ *
+ * default-struct < template-struct < command-line parameter
+ */
+#ifndef MALLOCMC_DP_XMALLOCSIMD_PAGESIZE
+#    define MALLOCMC_DP_XMALLOCSIMD_PAGESIZE (Properties::pagesize)
+#endif
+            static constexpr uint32 pagesize = MALLOCMC_DP_XMALLOCSIMD_PAGESIZE;
+
+        public:
+            static constexpr uint32 _pagesize = pagesize;
+
+            template<typename AlpakaAcc>
+            ALPAKA_FN_ACC auto collect(const AlpakaAcc& acc, uint32 bytes) -> uint32
+            {
+                can_use_coalescing = false;
+                myoffset = 0;
+                threadcount = 0;
+
+                // init with initial counter
+                auto& warp_sizecounter
+                    = alpaka::declareSharedVar<std::uint32_t[maxThreadsPerBlock / warpSize], __COUNTER__>(
+                        acc);
+                warp_sizecounter[warpid] = 16;
+
+                // second half: make sure that all coalesced allocations can fit
+                // within one page necessary for offset calculation
+                const bool coalescible = bytes > 0 && bytes < (pagesize / 32);
+
+#if(MALLOCMC_DEVICE_COMPILE)
+                threadcount = popc(ballot(coalescible));
+#else
+                threadcount = 1; // TODO
+#endif
+                if(coalescible && threadcount > 1)
+                {
+                    myoffset
+                        = alpaka::atomicOp<alpaka::AtomicAdd>(acc, &warp_sizecounter[warpid], bytes);
+                    can_use_coalescing = true;
+                }
+
+                req_size = bytes;
+                if(can_use_coalescing)
+                    req_size = (myoffset == 16) ? warp_sizecounter[warpid] : 0;
+
+                return req_size;
+            }
+
+            template<typename AlpakaAcc>
+            ALPAKA_FN_ACC auto distribute(const AlpakaAcc& acc, void* allocatedMem) -> void*
+            {
+                auto& warp_res
+                    = alpaka::declareSharedVar<char * [maxThreadsPerBlock / warpSize], __COUNTER__>(acc);
+
+                char* myalloc = (char*) allocatedMem;
+                if(req_size && can_use_coalescing)
+                {
+                    warp_res[warpid] = myalloc;
+                    if(myalloc != 0)
+                        *(uint32*) myalloc = threadcount;
+                }
+
+                threadfenceBlock(acc);
+
+                void* myres = myalloc;
+                if(can_use_coalescing)
+                {
+                    if(warp_res[warpid] != 0)
+                        myres = warp_res[warpid] + myoffset;
+                    else
+                        myres = 0;
+                }
+                return myres;
+            }
+
+            ALPAKA_FN_HOST
+            static auto classname() -> std::string
+            {
+                std::stringstream ss;
+                ss << "XMallocSIMD[" << pagesize << "]";
+                return ss.str();
+            }
+        };
+
+    } // namespace DistributionPolicies
+
+} // namespace mallocMC
diff --git a/thirdParty/mallocMC/src/include/mallocMC/distributionPolicies/XMallocSIMD_impl.hpp b/thirdParty/mallocMC/src/include/mallocMC/distributionPolicies/XMallocSIMD_impl.hpp
deleted file mode 100644
index 37afe7f898..0000000000
--- a/thirdParty/mallocMC/src/include/mallocMC/distributionPolicies/XMallocSIMD_impl.hpp
+++ /dev/null
@@ -1,161 +0,0 @@
-/*
-  mallocMC: Memory Allocator for Many Core Architectures.
-  http://www.icg.tugraz.at/project/mvp
-
-  Copyright (C) 2012 Institute for Computer Graphics and Vision,
-                     Graz University of Technology
-  Copyright (C) 2014 Institute of Radiation Physics,
-                     Helmholtz-Zentrum Dresden - Rossendorf
-
-  Author(s):  Markus Steinberger - steinberger ( at ) icg.tugraz.at
-              Rene Widera - r.widera ( at ) hzdr.de
-              Axel Huebl - a.huebl ( at ) hzdr.de
-              Carlchristian Eckert - c.eckert ( at ) hzdr.de
-
-  Permission is hereby granted, free of charge, to any person obtaining a copy
-  of this software and associated documentation files (the "Software"), to deal
-  in the Software without restriction, including without limitation the rights
-  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-  copies of the Software, and to permit persons to whom the Software is
-  furnished to do so, subject to the following conditions:
-
-  The above copyright notice and this permission notice shall be included in
-  all copies or substantial portions of the Software.
-
-  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-  THE SOFTWARE.
-*/
-
-#pragma once
-
-#include <boost/cstdint.hpp>
-#include <boost/static_assert.hpp>
-#include <limits>
-#include <string>
-#include <sstream>
-
-#include "../mallocMC_utils.hpp"
-#include "../mallocMC_prefixes.hpp"
-#include "XMallocSIMD.hpp"
-
-namespace mallocMC{
-namespace DistributionPolicies{
-
-  template<class T_Config>
-  class XMallocSIMD
-  {
-    private:
-
-      typedef boost::uint32_t uint32;
-      bool can_use_coalescing;
-      uint32 warpid;
-      uint32 myoffset;
-      uint32 threadcount;
-      uint32 req_size;
-    public:
-      typedef T_Config Properties;
-
-      MAMC_ACCELERATOR
-      XMallocSIMD() : can_use_coalescing(false), warpid(warpid_withinblock()),
-        myoffset(0), threadcount(0), req_size(0)
-      {}
-
-    private:
-/** Allow for a hierarchical validation of parameters:
- *
- * shipped default-parameters (in the inherited struct) have lowest precedence.
- * They will be overridden by a given configuration struct. However, even the
- * given configuration struct can be overridden by compile-time command line
- * parameters (e.g. -D MALLOCMC_DP_XMALLOCSIMD_PAGESIZE 1024)
- *
- * default-struct < template-struct < command-line parameter
- */
-#ifndef MALLOCMC_DP_XMALLOCSIMD_PAGESIZE
-#define MALLOCMC_DP_XMALLOCSIMD_PAGESIZE Properties::pagesize::value
-#endif
-      BOOST_STATIC_CONSTEXPR uint32 pagesize      = MALLOCMC_DP_XMALLOCSIMD_PAGESIZE;
-
-      //all the properties must be unsigned integers > 0
-      BOOST_STATIC_ASSERT(!std::numeric_limits<typename Properties::pagesize::type>::is_signed);
-
-      // \TODO: The static_cast can be removed once the minimal dependencies of
-      //        this project is are at least CUDA 7.0 and gcc 4.8.2
-      BOOST_STATIC_ASSERT(static_cast<uint32>(pagesize) > 0);
-
-    public:
-      BOOST_STATIC_CONSTEXPR uint32 _pagesize = pagesize;
-
-      MAMC_ACCELERATOR
-      uint32 collect(uint32 bytes){
-
-        can_use_coalescing = false;
-        myoffset = 0;
-        threadcount = 0;
-
-        //init with initial counter
-        __shared__ uint32 warp_sizecounter[MaxThreadsPerBlock::value / WarpSize::value];
-        warp_sizecounter[warpid] = 16;
-
-        //second half: make sure that all coalesced allocations can fit within one page
-        //necessary for offset calculation
-        bool coalescible = bytes > 0 && bytes < (pagesize / 32);
-#if(__CUDACC_VER_MAJOR__ >= 9)
-        threadcount = __popc(__ballot_sync(__activemask(), coalescible));
-#else
-        threadcount = __popc(__ballot(coalescible));
-#endif
-        if (coalescible && threadcount > 1)
-        {
-          myoffset = atomicAdd(&warp_sizecounter[warpid], bytes);
-          can_use_coalescing = true;
-        }
-
-        req_size = bytes;
-        if (can_use_coalescing)
-          req_size = (myoffset == 16) ? warp_sizecounter[warpid] : 0;
-
-        return req_size;
-      }
-
-
-      MAMC_ACCELERATOR
-      void* distribute(void* allocatedMem){
-        __shared__ char* warp_res[MaxThreadsPerBlock::value / WarpSize::value];
-
-        char* myalloc = (char*) allocatedMem;
-        if (req_size && can_use_coalescing)
-        {
-          warp_res[warpid] = myalloc;
-          if (myalloc != 0)
-            *(uint32*)myalloc = threadcount;
-        }
-        __threadfence_block();
-
-        void *myres = myalloc;
-        if(can_use_coalescing)
-        {
-          if(warp_res[warpid] != 0)
-            myres = warp_res[warpid] + myoffset;
-          else
-            myres = 0;
-        }
-        return myres;
-      }
-
-      MAMC_HOST
-      static std::string classname(){
-        std::stringstream ss;
-        ss << "XMallocSIMD[" << pagesize << "]";
-        return ss.str();
-      }
-
-  };
-
-} //namespace DistributionPolicies
-
-} //namespace mallocMC
diff --git a/thirdParty/mallocMC/src/include/mallocMC/mallocMC.hpp b/thirdParty/mallocMC/src/include/mallocMC/mallocMC.hpp
index 4e4210321e..e511f84883 100644
--- a/thirdParty/mallocMC/src/include/mallocMC/mallocMC.hpp
+++ b/thirdParty/mallocMC/src/include/mallocMC/mallocMC.hpp
@@ -43,8 +43,13 @@
 #include "mallocMC_hostclass.hpp"
 
 // all the policies
-#include "CreationPolicies.hpp"
-#include "DistributionPolicies.hpp"
-#include "ReservePoolPolicies.hpp"
-#include "AlignmentPolicies.hpp"
-#include "OOMPolicies.hpp"
+#include "alignmentPolicies/Noop.hpp"
+#include "alignmentPolicies/Shrink.hpp"
+#include "creationPolicies/OldMalloc.hpp"
+#include "creationPolicies/Scatter.hpp"
+#include "distributionPolicies/Noop.hpp"
+#include "distributionPolicies/XMallocSIMD.hpp"
+#include "oOMPolicies/BadAllocException.hpp"
+#include "oOMPolicies/ReturnNull.hpp"
+#include "reservePoolPolicies/AlpakaBuf.hpp"
+#include "reservePoolPolicies/CudaSetLimits.hpp"
diff --git a/thirdParty/mallocMC/src/include/mallocMC/mallocMC_allocator_handle.hpp b/thirdParty/mallocMC/src/include/mallocMC/mallocMC_allocator_handle.hpp
index 2832496d43..4d5361a75d 100644
--- a/thirdParty/mallocMC/src/include/mallocMC/mallocMC_allocator_handle.hpp
+++ b/thirdParty/mallocMC/src/include/mallocMC/mallocMC_allocator_handle.hpp
@@ -28,51 +28,38 @@
 
 #pragma once
 
-#include "mallocMC_prefixes.hpp"
+#include <alpaka/core/Common.hpp>
 
-namespace mallocMC{
-
-    template <typename T_HostAllocator>
+namespace mallocMC
+{
+    template<typename T_HostAllocator>
     struct AllocatorHandleImpl
     {
-        typedef typename T_HostAllocator::DevAllocator DevAllocator;
+        using DevAllocator = typename T_HostAllocator::DevAllocator;
 
         DevAllocator* devAllocator;
 
-        AllocatorHandleImpl(
-            DevAllocator* p
-        ) :
-            devAllocator( p )
+        explicit AllocatorHandleImpl(DevAllocator* p) : devAllocator(p)
         {
         }
 
-        MAMC_ACCELERATOR
-        void*
-        malloc(
-            size_t size
-        )
+        template<typename AlpakaAcc>
+        ALPAKA_FN_ACC auto malloc(const AlpakaAcc& acc, size_t size) -> void*
         {
-            return devAllocator->malloc( size );
+            return devAllocator->malloc(acc, size);
         }
 
-        MAMC_ACCELERATOR
-        void
-        free(
-            void* p
-        )
+        template<typename AlpakaAcc>
+        ALPAKA_FN_ACC void free(const AlpakaAcc& acc, void* p)
         {
-            devAllocator->free( p );
+            devAllocator->free(acc, p);
         }
 
-        MAMC_ACCELERATOR
-        unsigned
-        getAvailableSlots(
-            size_t slotSize
-        )
+        template<typename AlpakaAcc>
+        ALPAKA_FN_ACC auto getAvailableSlots(const AlpakaAcc& acc, size_t slotSize) -> unsigned
         {
-            return devAllocator->getAvailableSlots( slotSize );
+            return devAllocator->getAvailableSlots(acc, slotSize);
         }
-
     };
 
 } // namespace mallocMC
diff --git a/thirdParty/mallocMC/src/include/mallocMC/mallocMC_constraints.hpp b/thirdParty/mallocMC/src/include/mallocMC/mallocMC_constraints.hpp
index 867e97f73f..36c7034916 100644
--- a/thirdParty/mallocMC/src/include/mallocMC/mallocMC_constraints.hpp
+++ b/thirdParty/mallocMC/src/include/mallocMC/mallocMC_constraints.hpp
@@ -30,58 +30,61 @@
 
 #include "creationPolicies/Scatter.hpp"
 #include "distributionPolicies/XMallocSIMD.hpp"
-#include <boost/mpl/assert.hpp>
 
-namespace mallocMC{
-
-  /** The default PolicyCheckers (do always succeed)
-   */
-  template<typename Policy1>
-  class PolicyCheck1{};
-
-  template<typename Policy1, typename Policy2>
-  class PolicyCheck2{};
-
-  template<typename Policy1, typename Policy2, typename Policy3>
-  class PolicyCheck3{};
-
-  template<typename Policy1, typename Policy2, typename Policy3, typename Policy4>
-  class PolicyCheck4{};
-
-  template<typename Policy1, typename Policy2, typename Policy3, typename Policy4, typename Policy5>
-  class PolicyCheck5{};
-
-
-  /** Enforces constraints on policies or combinations of polices
-   * 
-   * Uses template specialization of PolicyChecker
-   */
-  template < 
-     typename T_CreationPolicy, 
-     typename T_DistributionPolicy, 
-     typename T_OOMPolicy, 
-     typename T_GetHeapPolicy,
-     typename T_AlignmentPolicy
-       >
-
-  class PolicyConstraints:PolicyCheck2<T_CreationPolicy, T_DistributionPolicy>{
-
-  };
-
-
-  /** Scatter and XMallocSIMD need the same pagesize!
-   *
-   * This constraint ensures that if the CreationPolicy "Scatter" and the
-   * DistributionPolicy "XMallocSIMD" are selected, they are configured to use
-   * the same value for their "pagesize"-parameter.
-   */
-  template<typename x, typename y, typename z >
-  class PolicyCheck2<
-    typename CreationPolicies::Scatter<x,y>,
-    typename DistributionPolicies::XMallocSIMD<z> 
-  >{
-    BOOST_MPL_ASSERT_MSG(x::pagesize::value == z::pagesize::value,
-        Pagesize_must_be_the_same_when_combining_Scatter_and_XMallocSIMD, () );
-  };
-
-}//namespace mallocMC
+namespace mallocMC
+{
+    /** The default PolicyCheckers (do always succeed)
+     */
+    template<typename Policy1>
+    class PolicyCheck1
+    {
+    };
+
+    template<typename Policy1, typename Policy2>
+    class PolicyCheck2
+    {
+    };
+
+    template<typename Policy1, typename Policy2, typename Policy3>
+    class PolicyCheck3
+    {
+    };
+
+    template<typename Policy1, typename Policy2, typename Policy3, typename Policy4>
+    class PolicyCheck4
+    {
+    };
+
+    template<typename Policy1, typename Policy2, typename Policy3, typename Policy4, typename Policy5>
+    class PolicyCheck5
+    {
+    };
+
+    /** Enforces constraints on policies or combinations of policies
+     *
+     * Uses template specialization of PolicyChecker
+     */
+    template<
+        typename T_CreationPolicy,
+        typename T_DistributionPolicy,
+        typename T_OOMPolicy,
+        typename T_GetHeapPolicy,
+        typename T_AlignmentPolicy>
+
+    class PolicyConstraints : PolicyCheck2<T_CreationPolicy, T_DistributionPolicy>
+    {
+    };
+
+    /** Scatter and XMallocSIMD need the same pagesize!
+     *
+     * This constraint ensures that if the CreationPolicy "Scatter" and the
+     * DistributionPolicy "XMallocSIMD" are selected, they are configured to use
+     * the same value for their "pagesize"-parameter.
+     */
+    template<typename x, typename y, typename z>
+    class PolicyCheck2<typename CreationPolicies::Scatter<x, y>, typename DistributionPolicies::XMallocSIMD<z>>
+    {
+        static_assert(x::pagesize == z::pagesize, "Pagesize must be the same when combining Scatter and XMallocSIMD");
+    };
+
+} // namespace mallocMC
diff --git a/thirdParty/mallocMC/src/include/mallocMC/mallocMC_hostclass.hpp b/thirdParty/mallocMC/src/include/mallocMC/mallocMC_hostclass.hpp
index c47e6a1507..48bc1f748b 100644
--- a/thirdParty/mallocMC/src/include/mallocMC/mallocMC_hostclass.hpp
+++ b/thirdParty/mallocMC/src/include/mallocMC/mallocMC_hostclass.hpp
@@ -28,6 +28,6 @@
 
 #pragma once
 
-#include "mallocMC_traits.hpp"
-#include "device_allocator.hpp"
 #include "allocator.hpp"
+#include "device_allocator.hpp"
+#include "mallocMC_traits.hpp"
diff --git a/thirdParty/mallocMC/src/include/mallocMC/mallocMC_prefixes.hpp b/thirdParty/mallocMC/src/include/mallocMC/mallocMC_prefixes.hpp
deleted file mode 100644
index 1199e3e1ca..0000000000
--- a/thirdParty/mallocMC/src/include/mallocMC/mallocMC_prefixes.hpp
+++ /dev/null
@@ -1,33 +0,0 @@
-/*
-  mallocMC: Memory Allocator for Many Core Architectures.
-  https://www.hzdr.de/crp
-
-  Copyright (C) 2014 Institute of Radiation Physics,
-                     Helmholtz-Zentrum Dresden - Rossendorf
-
-  Author(s):  Carlchristian Eckert - c.eckert ( at ) hzdr.de
-
-  Permission is hereby granted, free of charge, to any person obtaining a copy
-  of this software and associated documentation files (the "Software"), to deal
-  in the Software without restriction, including without limitation the rights
-  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-  copies of the Software, and to permit persons to whom the Software is
-  furnished to do so, subject to the following conditions:
-
-  The above copyright notice and this permission notice shall be included in
-  all copies or substantial portions of the Software.
-
-  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-  THE SOFTWARE.
-*/
-
-#pragma once
-
-#define MAMC_HOST __host__
-#define MAMC_ACCELERATOR __device__
-
diff --git a/thirdParty/mallocMC/src/include/mallocMC/mallocMC_traits.hpp b/thirdParty/mallocMC/src/include/mallocMC/mallocMC_traits.hpp
index 0e811d7485..45e4cafe86 100644
--- a/thirdParty/mallocMC/src/include/mallocMC/mallocMC_traits.hpp
+++ b/thirdParty/mallocMC/src/include/mallocMC/mallocMC_traits.hpp
@@ -28,15 +28,11 @@
 
 #pragma once
 
-#include <boost/config.hpp>
-
-
-namespace mallocMC{
-
-    template <class T_Allocator>
-    struct  Traits{
-        BOOST_STATIC_CONSTEXPR bool providesAvailableSlots = T_Allocator::CreationPolicy::providesAvailableSlots::value;
+namespace mallocMC
+{
+    template<class T_Allocator>
+    struct Traits
+    {
+        static constexpr bool providesAvailableSlots = T_Allocator::CreationPolicy::providesAvailableSlots;
     };
-
-} //namespace mallocMC
-
+} // namespace mallocMC
diff --git a/thirdParty/mallocMC/src/include/mallocMC/mallocMC_utils.hpp b/thirdParty/mallocMC/src/include/mallocMC/mallocMC_utils.hpp
index 2353cd373b..0fc2913872 100644
--- a/thirdParty/mallocMC/src/include/mallocMC/mallocMC_utils.hpp
+++ b/thirdParty/mallocMC/src/include/mallocMC/mallocMC_utils.hpp
@@ -33,204 +33,256 @@
 
 #pragma once
 
+#include <alpaka/alpaka.hpp>
+
 #ifdef _MSC_VER
-#include <intrin.h>
+#    include <intrin.h>
 #endif
 
-#include <string>
+#include <atomic>
+#include <cstdint>
 #include <sstream>
 #include <stdexcept>
-#include <boost/cstdint.hpp>
+#include <string>
+#include <type_traits>
+
+/* HIP-clang misbehaves and takes the host path of the code when only __HIP_DEVICE_COMPILE__
+ * is used to detect the device compile path.
+ * Since we require devices with support for ballot we can hijack __HIP_ARCH_HAS_WARP_BALLOT__.
+ */
+#if(defined(__HIP_ARCH_HAS_WARP_BALLOT__) || defined(__CUDA_ARCH__) || __HIP_DEVICE_COMPILE__ == 1)
+#    define MALLOCMC_DEVICE_COMPILE 1
+#endif
 
-#include "mallocMC_prefixes.hpp"
+namespace mallocMC
+{
+    template<int PSIZE>
+    class __PointerEquivalent
+    {
+    public:
+        using type = unsigned int;
+    };
+    template<>
+    class __PointerEquivalent<8>
+    {
+    public:
+        using type = unsigned long long;
+    };
+
+#if defined(__CUDA_ARCH__)
+    constexpr auto warpSize = 32; // TODO
+#elif(MALLOCMC_DEVICE_COMPILE && BOOST_COMP_HIP)
+    constexpr auto warpSize = 64;
+#else
+    constexpr auto warpSize = 1;
+#endif
 
+    using PointerEquivalent = mallocMC::__PointerEquivalent<sizeof(char*)>::type;
 
-namespace CUDA
-{
-  class error : public std::runtime_error
-  {
-  private:
-    static std::string genErrorString(cudaError errorValue, const char* file, int line)
-    {
-      std::ostringstream msg;
-      msg << file << '(' << line << "): error: " << cudaGetErrorString(errorValue);
-      return msg.str();
+    ALPAKA_FN_ACC inline auto laneid()
+    {
+#if defined(__CUDA_ARCH__)
+        std::uint32_t mylaneid;
+        asm("mov.u32 %0, %%laneid;" : "=r"(mylaneid));
+        return mylaneid;
+#elif defined(__HIP_DEVICE_COMPILE__) && defined(__HIP__)
+        return __lane_id();
+#else
+        return 0u;
+#endif
+    }
+
+    /** warp index within a multiprocessor
+     *
+     * Index of the warp within the multiprocessor at the moment of the query.
+     * The result is volatile and can be different with each query.
+     *
+     * @return current index of the warp
+     */
+    ALPAKA_FN_ACC inline auto warpid()
+    {
+#if defined(__CUDA_ARCH__)
+        std::uint32_t mywarpid;
+        asm("mov.u32 %0, %%warpid;" : "=r"(mywarpid));
+        return mywarpid;
+#elif(MALLOCMC_DEVICE_COMPILE && BOOST_COMP_HIP)
+        // get wave id
+        // https://github.com/ROCm-Developer-Tools/HIP/blob/f72a669487dd352e45321c4b3038f8fe2365c236/include/hip/hcc_detail/device_functions.h#L974-L1024
+        return __builtin_amdgcn_s_getreg(GETREG_IMMED(3, 0, 4));
+#else
+        return 0u;
+#endif
+    }
+
+    ALPAKA_FN_ACC inline auto smid()
+    {
+#if defined(__CUDA_ARCH__)
+        std::uint32_t mysmid;
+        asm("mov.u32 %0, %%smid;" : "=r"(mysmid));
+        return mysmid;
+#elif(MALLOCMC_DEVICE_COMPILE && BOOST_COMP_HIP)
+        return __smid();
+#else
+        return 0u;
+#endif
+    }
+
+    ALPAKA_FN_ACC inline auto lanemask_lt()
+    {
+#if defined(__CUDA_ARCH__)
+        std::uint32_t lanemask;
+        asm("mov.u32 %0, %%lanemask_lt;" : "=r"(lanemask));
+        return lanemask;
+#elif(MALLOCMC_DEVICE_COMPILE && BOOST_COMP_HIP)
+        return __lanemask_lt();
+#else
+        return 0u;
+#endif
     }
-  public:
-    error(cudaError errorValue, const char* file, int line)
-      : runtime_error(genErrorString(errorValue, file, line))
+
+    ALPAKA_FN_ACC inline auto ballot(int pred)
     {
+#if defined(__CUDA_ARCH__)
+        return __ballot_sync(__activemask(), pred);
+#elif(MALLOCMC_DEVICE_COMPILE && BOOST_COMP_HIP)
+        // return value is 64bit for HIP-clang
+        return __ballot(pred);
+#else
+        return 1u;
+#endif
     }
 
-    error(cudaError errorValue)
-      : runtime_error(cudaGetErrorString(errorValue))
+
+    ALPAKA_FN_ACC inline auto activemask()
     {
+#if defined(__CUDA_ARCH__)
+        return __activemask();
+#elif(MALLOCMC_DEVICE_COMPILE && BOOST_COMP_HIP)
+        // return value is 64bit for HIP-clang
+        return ballot(1);
+#else
+        return 1u;
+#endif
     }
 
-    error(const std::string& msg)
-      : runtime_error(msg)
+    template<class T>
+    ALPAKA_FN_HOST_ACC inline auto divup(T a, T b) -> T
     {
+        return (a + b - 1) / b;
     }
-  };
 
-  inline void checkError(cudaError errorValue, const char* file, int line)
-  {
-    if (errorValue != cudaSuccess)
-      throw CUDA::error(errorValue, file, line);
-  }
+    /** the maximal number of threads per block, valid for sm_2.X - sm_7.5
+     *
+     * https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#compute-capabilities
+     */
+    constexpr uint32_t maxThreadsPerBlock = 1024;
+
+    /** warp id within a cuda block
+     *
+     * The id is constant over the lifetime of the thread.
+     * The id is not equal to warpid().
+     *
+     * @return warp id within the block
+     */
+    template<typename AlpakaAcc>
+    ALPAKA_FN_ACC inline auto warpid_withinblock(const AlpakaAcc& acc) -> std::uint32_t
+    {
+        const auto localId = alpaka::mapIdx<1>(
+            alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc),
+            alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc))[0];
+        return localId / warpSize;
+    }
 
-  inline void checkError(const char* file, int line)
-  {
-    checkError(cudaGetLastError(), file, line);
-  }
+    template<typename T>
+    ALPAKA_FN_ACC inline auto ffs(T mask) -> std::uint32_t
+    {
+#if defined(__CUDA_ARCH__)
+        return ::__ffs(mask);
+#elif(MALLOCMC_DEVICE_COMPILE && BOOST_COMP_HIP)
+        // return value is 64bit for HIP-clang
+        return ::__ffsll(static_cast<unsigned long long int>(mask));
+#else
+        if(mask == 0)
+            return 0;
+        auto i = 1u;
+        while((mask & 1) == 0)
+        {
+            mask >>= 1;
+            i++;
+        }
+        return i;
+#endif
+    }
 
-  inline void checkError()
-  {
-    cudaError errorValue = cudaGetLastError();
-    if (errorValue != cudaSuccess)
-      throw CUDA::error(errorValue);
-  }
+    template<typename T>
+    ALPAKA_FN_ACC inline auto popc(T mask) -> std::uint32_t
+    {
+#if defined(__CUDA_ARCH__)
+        return ::__popc(mask);
+#elif(MALLOCMC_DEVICE_COMPILE && BOOST_COMP_HIP)
+        // return value is 64bit for HIP-clang
+        return ::__popcll(static_cast<unsigned long long int>(mask));
+#else
+        // cf.
+        // https://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetKernighan
+        std::uint32_t count = 0;
+        while(mask)
+        {
+            count++;
+            mask &= mask - 1;
+        }
+        return count;
+#endif
+    }
 
-#define MALLOCMC_CUDA_CHECKED_CALL(call) CUDA::checkError(call, __FILE__, __LINE__)
-#define MALLOCMC_CUDA_CHECK_ERROR() CUDA::checkError(__FILE__, __LINE__)
-}
+    // Threadfence implementations may be moved into alpaka later
+    template<typename T_Acc, typename T_Sfinae = void>
+    struct ThreadFence
+    {
+        // CPU only implementation
+        static void device()
+        {
+            std::atomic_thread_fence(std::memory_order::memory_order_seq_cst);
+        }
+
+        static void block()
+        {
+            std::atomic_thread_fence(std::memory_order::memory_order_seq_cst);
+        }
+    };
+
+    template<typename T_Acc>
+    struct ThreadFence<
+        T_Acc,
+        typename std::enable_if<
+            alpaka::concepts::ImplementsConcept<alpaka::ConceptUniformCudaHip, T_Acc>::value>::type>
+    {
+        static ALPAKA_FN_ACC void device()
+        {
+#if MALLOCMC_DEVICE_COMPILE
+            __threadfence();
+#endif
+        }
 
+        static ALPAKA_FN_ACC void block()
+        {
+#if MALLOCMC_DEVICE_COMPILE
+            __threadfence_block();
+#endif
+        }
+    };
 
-namespace mallocMC
-{
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename T_Acc>
+    ALPAKA_FN_ACC void threadfenceDevice(T_Acc const& acc)
+    {
+        ThreadFence<T_Acc>::device();
+    }
 
-  template<int PSIZE>
-  class __PointerEquivalent
-  {
-  public:
-    typedef unsigned int type;
-  };
-  template<>
-  class __PointerEquivalent<8>
-  {
-  public:
-    typedef unsigned long long int type;
-  };
-
-  typedef mallocMC::__PointerEquivalent<sizeof(char*)>::type PointerEquivalent;
-
-
-  MAMC_ACCELERATOR inline boost::uint32_t laneid()
-  {
-    boost::uint32_t mylaneid;
-    asm("mov.u32 %0, %%laneid;" : "=r" (mylaneid));
-    return mylaneid;
-  }
-
-  /** warp index within a multiprocessor
-   *
-   * Index of the warp within the multiprocessor at the moment of the query.
-   * The result is volatile and can be different with each query.
-   *
-   * @return current index of the warp
-   */
-  MAMC_ACCELERATOR inline boost::uint32_t warpid()
-  {
-    boost::uint32_t mywarpid;
-    asm("mov.u32 %0, %%warpid;" : "=r" (mywarpid));
-    return mywarpid;
-  }
-
-  /** maximum number of warps on a multiprocessor
-   *
-   * @return maximum number of warps on a multiprocessor
-   */
-  MAMC_ACCELERATOR inline boost::uint32_t nwarpid()
-  {
-    boost::uint32_t mynwarpid;
-    asm("mov.u32 %0, %%nwarpid;" : "=r" (mynwarpid));
-    return mynwarpid;
-  }
-
-  MAMC_ACCELERATOR inline boost::uint32_t smid()
-  {
-    boost::uint32_t mysmid;
-    asm("mov.u32 %0, %%smid;" : "=r" (mysmid));
-    return mysmid;
-  }
-
-  MAMC_ACCELERATOR inline boost::uint32_t nsmid()
-  {
-    boost::uint32_t mynsmid;
-    asm("mov.u32 %0, %%nsmid;" : "=r" (mynsmid));
-    return mynsmid;
-  }
-  MAMC_ACCELERATOR inline boost::uint32_t lanemask()
-  {
-    boost::uint32_t lanemask;
-    asm("mov.u32 %0, %%lanemask_eq;" : "=r" (lanemask));
-    return lanemask;
-  }
-
-  MAMC_ACCELERATOR inline boost::uint32_t lanemask_le()
-  {
-    boost::uint32_t lanemask;
-    asm("mov.u32 %0, %%lanemask_le;" : "=r" (lanemask));
-    return lanemask;
-  }
-
-  MAMC_ACCELERATOR inline boost::uint32_t lanemask_lt()
-  {
-    boost::uint32_t lanemask;
-    asm("mov.u32 %0, %%lanemask_lt;" : "=r" (lanemask));
-    return lanemask;
-  }
-
-  MAMC_ACCELERATOR inline boost::uint32_t lanemask_ge()
-  {
-    boost::uint32_t lanemask;
-    asm("mov.u32 %0, %%lanemask_ge;" : "=r" (lanemask));
-    return lanemask;
-  }
-
-  MAMC_ACCELERATOR inline boost::uint32_t lanemask_gt()
-  {
-    boost::uint32_t lanemask;
-    asm("mov.u32 %0, %%lanemask_gt;" : "=r" (lanemask));
-    return lanemask;
-  }
-
-  template<class T>
-  MAMC_HOST MAMC_ACCELERATOR inline T divup(T a, T b) { return (a + b - 1)/b; }
-
-  /** the maximal number threads per block
-   *
-   * https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#compute-capabilities
-   */
-  struct MaxThreadsPerBlock
-  {
-    // valid for sm_2.X - sm_7.5
-    BOOST_STATIC_CONSTEXPR uint32_t value = 1024;
-  };
-
-  /** number of threads within a warp
-   *
-   * https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#compute-capabilities
-   */
-  struct WarpSize
-  {
-    // valid for sm_2.X - sm_7.5
-    BOOST_STATIC_CONSTEXPR uint32_t value = 32;
-  };
-
-  /** warp id within a cuda block
-   *
-   * The id is constant over the lifetime of the thread.
-   * The id is not equal to warpid().
-   *
-   * @return warp id within the block
-   */
-  MAMC_ACCELERATOR inline boost::uint32_t warpid_withinblock()
-  {
-    return (
-      threadIdx.z * blockDim.y * blockDim.x +
-      threadIdx.y * blockDim.x +
-      threadIdx.x
-    ) / WarpSize::value;
-  }
-}
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename T_Acc>
+    ALPAKA_FN_ACC void threadfenceBlock(T_Acc const& acc)
+    {
+        ThreadFence<T_Acc>::block();
+    }
+} // namespace mallocMC
diff --git a/thirdParty/mallocMC/src/include/mallocMC/oOMPolicies/BadAllocException.hpp b/thirdParty/mallocMC/src/include/mallocMC/oOMPolicies/BadAllocException.hpp
index 9f25ecaead..9c28b84e80 100644
--- a/thirdParty/mallocMC/src/include/mallocMC/oOMPolicies/BadAllocException.hpp
+++ b/thirdParty/mallocMC/src/include/mallocMC/oOMPolicies/BadAllocException.hpp
@@ -27,19 +27,51 @@
 
 #pragma once
 
-namespace mallocMC{
-namespace OOMPolicies{
-
-  /**
-   * @brief Throws a std::bad_alloc exception on OutOfMemory
-   *
-   * This OOMPolicy will throw a std::bad_alloc exception, if the accelerator
-   * supports it. Currently, Nvidia CUDA does not support any form of exception
-   * handling, therefore handleOOM() does not have any effect on these
-   * accelerators. Using this policy on other types of accelerators that do not
-   * support exceptions results in undefined behaviour.
-   */
-  struct BadAllocException;
-
-} //namespace OOMPolicies
-} //namespace mallocMC
+#include "BadAllocException.hpp"
+
+#include <alpaka/core/Common.hpp>
+#include <cassert>
+#include <string>
+
+namespace mallocMC
+{
+    namespace OOMPolicies
+    {
+        /**
+         * @brief Throws a std::bad_alloc exception on OutOfMemory
+         *
+         * This OOMPolicy will throw a std::bad_alloc exception, if the
+         * accelerator supports it. Exceptions are treated as unsupported when
+         * compiling for CUDA or HIP: there handleOOM() calls assert(false)
+         * instead and, if assertions are disabled, returns the given pointer.
+         * Using this policy on other types of accelerators that do not support
+         * exceptions results in undefined
+         */
+        struct BadAllocException // out-of-memory policy; all members are static
+        {
+            ALPAKA_FN_ACC
+            static auto handleOOM(void* mem) -> void* // mem: pointer handed in by the caller, returned unchanged
+            {
+#if BOOST_LANG_CUDA || BOOST_COMP_HIP // CUDA/HIP builds: flag exceptions as unavailable
+//#if __CUDA_ARCH__ < 350
+#    define PM_EXCEPTIONS_NOT_SUPPORTED_HERE
+//#endif
+#endif
+
+#ifdef PM_EXCEPTIONS_NOT_SUPPORTED_HERE
+#    undef PM_EXCEPTIONS_NOT_SUPPORTED_HERE // helper macro is removed again right after it is tested
+                assert(false); // stands in for the throw; compiles to nothing when NDEBUG is defined
+#else
+                throw std::bad_alloc{}; // NOTE(review): std::bad_alloc lives in <new>, which is not included here
+#endif
+                return mem; // unreachable after the throw; reached on the assert path only if asserts are off
+            }
+
+            static auto classname() -> std::string // name of this policy as a string
+            {
+                return "BadAllocException";
+            }
+        };
+
+    } // namespace OOMPolicies
+} // namespace mallocMC
diff --git a/thirdParty/mallocMC/src/include/mallocMC/oOMPolicies/BadAllocException_impl.hpp b/thirdParty/mallocMC/src/include/mallocMC/oOMPolicies/BadAllocException_impl.hpp
deleted file mode 100644
index fb180370d9..0000000000
--- a/thirdParty/mallocMC/src/include/mallocMC/oOMPolicies/BadAllocException_impl.hpp
+++ /dev/null
@@ -1,65 +0,0 @@
-/*
-  mallocMC: Memory Allocator for Many Core Architectures.
-
-  Copyright 2014 Institute of Radiation Physics,
-                 Helmholtz-Zentrum Dresden - Rossendorf
-
-  Author(s):  Carlchristian Eckert - c.eckert ( at ) hzdr.de
-
-  Permission is hereby granted, free of charge, to any person obtaining a copy
-  of this software and associated documentation files (the "Software"), to deal
-  in the Software without restriction, including without limitation the rights
-  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-  copies of the Software, and to permit persons to whom the Software is
-  furnished to do so, subject to the following conditions:
-
-  The above copyright notice and this permission notice shall be included in
-  all copies or substantial portions of the Software.
-
-  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-  THE SOFTWARE.
-*/
-
-#pragma once
-
-#include <cassert>
-#include <string>
-
-#include "BadAllocException.hpp"
-#include "../mallocMC_prefixes.hpp"
-
-namespace mallocMC{
-namespace OOMPolicies{
-
-  struct BadAllocException
-  {
-    MAMC_ACCELERATOR
-    static void* handleOOM(void* mem){
-#ifdef __CUDACC__
-//#if __CUDA_ARCH__ < 350
-#define PM_EXCEPTIONS_NOT_SUPPORTED_HERE
-//#endif
-#endif
-
-#ifdef PM_EXCEPTIONS_NOT_SUPPORTED_HERE
-#undef PM_EXCEPTIONS_NOT_SUPPORTED_HERE
-      assert(false);
-#else
-      std::bad_alloc exception;
-      throw exception;
-#endif
-      return mem;
-    }
-
-    static std::string classname(){
-      return "BadAllocException";
-    }
-  };
-
-} //namespace OOMPolicies
-} //namespace mallocMC
diff --git a/thirdParty/mallocMC/src/include/mallocMC/oOMPolicies/ReturnNull.hpp b/thirdParty/mallocMC/src/include/mallocMC/oOMPolicies/ReturnNull.hpp
index 2db8568a0e..5ddd698a7d 100644
--- a/thirdParty/mallocMC/src/include/mallocMC/oOMPolicies/ReturnNull.hpp
+++ b/thirdParty/mallocMC/src/include/mallocMC/oOMPolicies/ReturnNull.hpp
@@ -27,15 +27,34 @@
 
 #pragma once
 
-namespace mallocMC{
-namespace OOMPolicies{
-
-  /**
-   * @brief Returns a NULL pointer on OutOfMemory conditions
-   *
-   * This OOMPolicy will return NULL, if handleOOM() is called.
-   */
-  class ReturnNull;
-    
-} //namespace OOMPolicies
-} //namespace mallocMC
+#include "ReturnNull.hpp"
+
+#include <alpaka/core/Common.hpp>
+#include <string>
+
+namespace mallocMC
+{
+    namespace OOMPolicies
+    {
+        /**
+         * @brief Returns a null pointer on OutOfMemory conditions
+         *
+         * This OOMPolicy will return nullptr, if handleOOM() is called.
+         */
+        class ReturnNull // out-of-memory policy; all members are static
+        {
+        public:
+            ALPAKA_FN_ACC
+            static auto handleOOM(void* mem) -> void* // mem is ignored
+            {
+                return nullptr; // the result is always a null pointer, whatever was passed in
+            }
+
+            static auto classname() -> std::string // name of this policy as a string
+            {
+                return "ReturnNull";
+            }
+        };
+
+    } // namespace OOMPolicies
+} // namespace mallocMC
diff --git a/thirdParty/mallocMC/src/include/mallocMC/oOMPolicies/ReturnNull_impl.hpp b/thirdParty/mallocMC/src/include/mallocMC/oOMPolicies/ReturnNull_impl.hpp
deleted file mode 100644
index f18bd0e20d..0000000000
--- a/thirdParty/mallocMC/src/include/mallocMC/oOMPolicies/ReturnNull_impl.hpp
+++ /dev/null
@@ -1,52 +0,0 @@
-/*
-  mallocMC: Memory Allocator for Many Core Architectures.
-
-  Copyright 2014 Institute of Radiation Physics,
-                 Helmholtz-Zentrum Dresden - Rossendorf
-
-  Author(s):  Carlchristian Eckert - c.eckert ( at ) hzdr.de
-
-  Permission is hereby granted, free of charge, to any person obtaining a copy
-  of this software and associated documentation files (the "Software"), to deal
-  in the Software without restriction, including without limitation the rights
-  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-  copies of the Software, and to permit persons to whom the Software is
-  furnished to do so, subject to the following conditions:
-
-  The above copyright notice and this permission notice shall be included in
-  all copies or substantial portions of the Software.
-
-  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-  THE SOFTWARE.
-*/
-
-#pragma once
-
-#include <string>
-
-#include "ReturnNull.hpp"
-#include "../mallocMC_prefixes.hpp"
-
-namespace mallocMC{
-namespace OOMPolicies{
-
-  class ReturnNull
-  {
-    public:
-      MAMC_ACCELERATOR
-      static void* handleOOM(void* mem){
-        return NULL;
-      }
-
-      static std::string classname(){
-        return "ReturnNull";
-      }
-  };
-
-} //namespace OOMPolicies
-} //namespace mallocMC
diff --git a/thirdParty/mallocMC/src/include/mallocMC/reservePoolPolicies/AlpakaBuf.hpp b/thirdParty/mallocMC/src/include/mallocMC/reservePoolPolicies/AlpakaBuf.hpp
new file mode 100644
index 0000000000..19989eb1f1
--- /dev/null
+++ b/thirdParty/mallocMC/src/include/mallocMC/reservePoolPolicies/AlpakaBuf.hpp
@@ -0,0 +1,65 @@
+/*
+  mallocMC: Memory Allocator for Many Core Architectures.
+
+  Copyright 2020 Helmholtz-Zentrum Dresden - Rossendorf,
+                 CERN
+
+  Author(s):  Bernhard Manfred Gruber
+
+  Permission is hereby granted, free of charge, to any person obtaining a copy
+  of this software and associated documentation files (the "Software"), to deal
+  in the Software without restriction, including without limitation the rights
+  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+  copies of the Software, and to permit persons to whom the Software is
+  furnished to do so, subject to the following conditions:
+
+  The above copyright notice and this permission notice shall be included in
+  all copies or substantial portions of the Software.
+
+  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+  THE SOFTWARE.
+*/
+
+#pragma once
+
+#include <alpaka/alpaka.hpp>
+#include <memory>
+#include <string>
+
+namespace mallocMC
+{
+    namespace ReservePoolPolicies
+    {
+        template<typename AlpakaAcc> // AlpakaAcc: accelerator whose device type fixes the buffer type below
+        struct AlpakaBuf // reserve-pool policy that keeps the pool in an alpaka buffer it owns
+        {
+            template<typename AlpakaDev>
+            auto setMemPool(const AlpakaDev& dev, size_t memsize) -> void* // allocates memsize unsigned chars on dev
+            {
+                poolBuffer // assignment replaces (and thereby releases) any buffer held from an earlier call
+                    = std::make_unique<PoolBufferType>(alpaka::allocBuf<unsigned char, size_t>(dev, memsize));
+                return alpaka::getPtrNative(*poolBuffer); // raw pointer into the buffer; ownership stays here
+            }
+
+            void resetMemPool(void* p) // p is ignored; the member buffer is what gets dropped
+            {
+                poolBuffer = {}; // resets the unique_ptr, destroying the buffer object it held
+            }
+
+            static auto classname() -> std::string // name of this policy as a string
+            {
+                return "AlpakaBuf";
+            }
+
+        private:
+            using PoolBufferType // 1-D buffer of unsigned char, indexed by size_t, on AlpakaAcc's device type
+                = alpaka::Buf<alpaka::Dev<AlpakaAcc>, unsigned char, alpaka::DimInt<1>, size_t>;
+            std::unique_ptr<PoolBufferType> poolBuffer; // FIXME(bgruber): replace by std::optional<>
+        };
+    } // namespace ReservePoolPolicies
+} // namespace mallocMC
diff --git a/thirdParty/mallocMC/src/include/mallocMC/reservePoolPolicies/CudaSetLimits.hpp b/thirdParty/mallocMC/src/include/mallocMC/reservePoolPolicies/CudaSetLimits.hpp
index cfbceea2be..1835ad664c 100644
--- a/thirdParty/mallocMC/src/include/mallocMC/reservePoolPolicies/CudaSetLimits.hpp
+++ b/thirdParty/mallocMC/src/include/mallocMC/reservePoolPolicies/CudaSetLimits.hpp
@@ -27,18 +27,57 @@
 
 #pragma once
 
-namespace mallocMC{
-namespace ReservePoolPolicies{
-
-  /**
-   * @brief set CUDA internal heap for device-side malloc calls
-   *
-   * This ReservePoolPolicy is intended for use with CUDA capable accelerators
-   * that support at least compute capability 2.0. It should be used in
-   * conjunction with a CreationPolicy that actually requires the CUDA-internal
-   * heap to be sized by calls to cudaDeviceSetLimit()
-   */
-  struct CudaSetLimits;
-
-} //namespace ReservePoolPolicies
-} //namespace mallocMC
+#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
+
+#    include "CudaSetLimits.hpp"
+
+#    include <cuda_runtime_api.h>
+#    include <mutex>
+#    include <string>
+
+namespace mallocMC
+{
+    namespace ReservePoolPolicies
+    {
+        /**
+         * @brief set CUDA internal heap for device-side malloc calls
+         *
+         * This ReservePoolPolicy is intended for use with CUDA capable
+         * accelerators that support at least compute capability 2.0. It should
+         * be used in conjunction with a CreationPolicy that actually requires
+         * the CUDA-internal heap to be sized by calls to cudaDeviceSetLimit().
+         *
+         * This policy sets the cudaLimitMallocHeapSize device limit. This value
+         * can no longer be changed once a kernel using ::malloc()/::free() has
+         * been run. Subsequent attempts will result in errors unless the device
+         * is reset via cudaDeviceReset(). See:
+         * https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__DEVICE.html#group__CUDART__DEVICE_1g05956f16eaa47ef3a4efee84563ccb7d
+         */
+        // TODO alpaka
+        struct CudaSetLimits // reserve-pool policy that only adjusts cudaLimitMallocHeapSize
+        {
+            template<typename AlpakaDev>
+            auto setMemPool(const AlpakaDev& dev, size_t memsize) -> void* // dev is not used
+            {
+                cudaDeviceSetLimit(cudaLimitMallocHeapSize, memsize); // NOTE(review): return code is discarded
+                return nullptr; // no pool pointer is produced by this policy
+            }
+
+            static void resetMemPool(void* p = nullptr) // p is ignored
+            {
+                cudaDeviceSetLimit(cudaLimitMallocHeapSize, 8192U); // NOTE(review): why 8192? — confirm intent
+                cudaGetLastError(); // cudaDeviceSetLimit() usually fails if any
+                                    // kernel before used ::malloc(), so let's
+                                    // clear the error state
+            }
+
+            static auto classname() -> std::string // name of this policy as a string
+            {
+                return "CudaSetLimits";
+            }
+        };
+
+    } // namespace ReservePoolPolicies
+} // namespace mallocMC
+
+#endif
diff --git a/thirdParty/mallocMC/src/include/mallocMC/reservePoolPolicies/CudaSetLimits_impl.hpp b/thirdParty/mallocMC/src/include/mallocMC/reservePoolPolicies/CudaSetLimits_impl.hpp
deleted file mode 100644
index eb4ddfa27f..0000000000
--- a/thirdParty/mallocMC/src/include/mallocMC/reservePoolPolicies/CudaSetLimits_impl.hpp
+++ /dev/null
@@ -1,55 +0,0 @@
-/*
-  mallocMC: Memory Allocator for Many Core Architectures.
-
-  Copyright 2014 Institute of Radiation Physics,
-                 Helmholtz-Zentrum Dresden - Rossendorf
-
-  Author(s):  Carlchristian Eckert - c.eckert ( at ) hzdr.de
-
-  Permission is hereby granted, free of charge, to any person obtaining a copy
-  of this software and associated documentation files (the "Software"), to deal
-  in the Software without restriction, including without limitation the rights
-  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-  copies of the Software, and to permit persons to whom the Software is
-  furnished to do so, subject to the following conditions:
-
-  The above copyright notice and this permission notice shall be included in
-  all copies or substantial portions of the Software.
-
-  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-  THE SOFTWARE.
-*/
-
-#pragma once
-
-#include <cuda_runtime_api.h>
-#include <string>
-
-#include "CudaSetLimits.hpp"
-
-namespace mallocMC{
-namespace ReservePoolPolicies{
-
-  struct CudaSetLimits{
-    static void* setMemPool(size_t memsize){
-      cudaDeviceSetLimit(cudaLimitMallocHeapSize, memsize);
-      return NULL;
-    }
-
-    static void resetMemPool(void *p=NULL){
-      cudaDeviceSetLimit(cudaLimitMallocHeapSize, 8192U);
-    }
-
-    static std::string classname(){
-      return "CudaSetLimits";
-    }
-
-  };
-
-} //namespace ReservePoolPolicies
-} //namespace mallocMC
diff --git a/thirdParty/mallocMC/src/include/mallocMC/reservePoolPolicies/SimpleCudaMalloc.hpp b/thirdParty/mallocMC/src/include/mallocMC/reservePoolPolicies/SimpleCudaMalloc.hpp
deleted file mode 100644
index 1bb386f73f..0000000000
--- a/thirdParty/mallocMC/src/include/mallocMC/reservePoolPolicies/SimpleCudaMalloc.hpp
+++ /dev/null
@@ -1,44 +0,0 @@
-/*
-  mallocMC: Memory Allocator for Many Core Architectures.
-
-  Copyright 2014 Institute of Radiation Physics,
-                 Helmholtz-Zentrum Dresden - Rossendorf
-
-  Author(s):  Carlchristian Eckert - c.eckert ( at ) hzdr.de
-
-  Permission is hereby granted, free of charge, to any person obtaining a copy
-  of this software and associated documentation files (the "Software"), to deal
-  in the Software without restriction, including without limitation the rights
-  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-  copies of the Software, and to permit persons to whom the Software is
-  furnished to do so, subject to the following conditions:
-
-  The above copyright notice and this permission notice shall be included in
-  all copies or substantial portions of the Software.
-
-  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-  THE SOFTWARE.
-*/
-
-#pragma once
-
-namespace mallocMC{
-namespace ReservePoolPolicies{
-
-  /**
-   * @brief creates/allocates a fixed memory pool on the accelerator
-   *
-   * This ReservePoolPolicy will create a memory pool of a fixed size on the
-   * accelerator by using a host-side call to cudaMalloc(). The pool is later
-   * freed through cudaFree(). This can only be used with accelerators that
-   * support CUDA and compute capability 2.0 or higher.
-   */
-  struct SimpleCudaMalloc;
-
-} //namespace ReservePoolPolicies
-} //namespace mallocMC
diff --git a/thirdParty/mallocMC/src/include/mallocMC/reservePoolPolicies/SimpleCudaMalloc_impl.hpp b/thirdParty/mallocMC/src/include/mallocMC/reservePoolPolicies/SimpleCudaMalloc_impl.hpp
deleted file mode 100644
index 23563d97ad..0000000000
--- a/thirdParty/mallocMC/src/include/mallocMC/reservePoolPolicies/SimpleCudaMalloc_impl.hpp
+++ /dev/null
@@ -1,57 +0,0 @@
-/*
-  mallocMC: Memory Allocator for Many Core Architectures.
-
-  Copyright 2014 Institute of Radiation Physics,
-                 Helmholtz-Zentrum Dresden - Rossendorf
-
-  Author(s):  Carlchristian Eckert - c.eckert ( at ) hzdr.de
-
-  Permission is hereby granted, free of charge, to any person obtaining a copy
-  of this software and associated documentation files (the "Software"), to deal
-  in the Software without restriction, including without limitation the rights
-  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-  copies of the Software, and to permit persons to whom the Software is
-  furnished to do so, subject to the following conditions:
-
-  The above copyright notice and this permission notice shall be included in
-  all copies or substantial portions of the Software.
-
-  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-  THE SOFTWARE.
-*/
-
-#pragma once
-
-#include <string>
-
-#include "../mallocMC_utils.hpp"
-
-#include "SimpleCudaMalloc.hpp"
-
-namespace mallocMC{
-namespace ReservePoolPolicies{
-
-  struct SimpleCudaMalloc{
-    static void* setMemPool(size_t memsize){
-      void* pool = NULL;
-      MALLOCMC_CUDA_CHECKED_CALL(cudaMalloc(&pool, memsize));
-      return pool;
-    }
-
-    static void resetMemPool(void* p){
-      MALLOCMC_CUDA_CHECKED_CALL(cudaFree(p));
-    }
-
-    static std::string classname(){
-      return "SimpleCudaMalloc";
-    }
-
-  };
-
-} //namespace ReservePoolPolicies
-} //namespace mallocMC
diff --git a/thirdParty/mallocMC/src/include/mallocMC/version.hpp b/thirdParty/mallocMC/src/include/mallocMC/version.hpp
index f2c15e1e19..c482b73e0d 100644
--- a/thirdParty/mallocMC/src/include/mallocMC/version.hpp
+++ b/thirdParty/mallocMC/src/include/mallocMC/version.hpp
@@ -38,10 +38,10 @@
 
 /** the mallocMC version: major API changes should be reflected here */
 #define MALLOCMC_VERSION_MAJOR 2
-#define MALLOCMC_VERSION_MINOR 3
-#define MALLOCMC_VERSION_PATCH 1
+#define MALLOCMC_VERSION_MINOR 6
+#define MALLOCMC_VERSION_PATCH 0
 
 /** the mallocMC flavor is used to differentiate the releases of the
  *  Computational Radiation Physics group (crp) from other releases
  *  This should be useful to avoid versioning conflicts */
-#define MALLOCMC_FLAVOR "crp"
+#define MALLOCMC_FLAVOR "crp-dev"
diff --git a/thirdParty/mallocMC/tests/dimensions.cpp b/thirdParty/mallocMC/tests/dimensions.cpp
new file mode 100644
index 0000000000..b7cb967a2e
--- /dev/null
+++ b/thirdParty/mallocMC/tests/dimensions.cpp
@@ -0,0 +1,352 @@
+/*
+  mallocMC: Memory Allocator for Many Core Architectures.
+
+  Copyright 2020 Helmholtz-Zentrum Dresden - Rossendorf,
+                 CERN
+
+  Author(s):  Bernhard Manfred Gruber
+
+  Permission is hereby granted, free of charge, to any person obtaining a copy
+  of this software and associated documentation files (the "Software"), to deal
+  in the Software without restriction, including without limitation the rights
+  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+  copies of the Software, and to permit persons to whom the Software is
+  furnished to do so, subject to the following conditions:
+
+  The above copyright notice and this permission notice shall be included in
+  all copies or substantial portions of the Software.
+
+  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+  THE SOFTWARE.
+*/
+
+#include <alpaka/alpaka.hpp>
+#include <catch2/catch.hpp>
+#include <mallocMC/mallocMC.hpp>
+
+using Idx = std::size_t;
+
+struct ScatterConfig // first template argument of CreationPolicies::Scatter in the tests below
+{
+    static constexpr auto pagesize = 4096; // also reused by DistributionConfig; presumably bytes — confirm
+    static constexpr auto accessblocks = 8;
+    static constexpr auto regionsize = 16;
+    static constexpr auto wastefactor = 2;
+    static constexpr auto resetfreedpages = false; // deduced as bool; the other members deduce to int
+};
+
+struct ScatterHashParams // second template argument of CreationPolicies::Scatter in the tests below
+{
+    static constexpr auto hashingK = 38183; // NOTE(review): meaning of these constants is defined by Scatter,
+    static constexpr auto hashingDistMP = 17497; // which is not visible here — confirm before changing them
+    static constexpr auto hashingDistWP = 1;
+    static constexpr auto hashingDistWPRel = 1;
+};
+
+struct DistributionConfig // template argument of DistributionPolicies::XMallocSIMD in the tests below
+{
+    static constexpr auto pagesize = ScatterConfig::pagesize; // always equal to the Scatter page size
+};
+
+struct AlignmentConfig // template argument of AlignmentPolicies::Shrink in the tests below
+{
+    static constexpr auto dataAlignment = 16; // presumably bytes — confirm against the Shrink policy
+};
+
+ALPAKA_STATIC_ACC_MEM_GLOBAL int** deviceArray; // pointer table written and read by the kernels of every test below
+
+template<template<typename, typename> typename AccTemplate>
+void test1D() // malloc/free round trip on a 1-D work division; NOTE(review): contains no REQUIRE/CHECK
+{
+    using Dim = alpaka::DimInt<1>;
+    using Acc = AccTemplate<Dim, Idx>;
+
+    using ScatterAllocator = mallocMC::Allocator<
+        Acc,
+        mallocMC::CreationPolicies::Scatter<ScatterConfig, ScatterHashParams>,
+        // mallocMC::CreationPolicies::OldMalloc,
+        mallocMC::DistributionPolicies::XMallocSIMD<DistributionConfig>,
+        mallocMC::OOMPolicies::ReturnNull,
+        mallocMC::ReservePoolPolicies::AlpakaBuf<Acc>,
+        // mallocMC::ReservePoolPolicies::CudaSetLimits,
+        mallocMC::AlignmentPolicies::Shrink<AlignmentConfig>>;
+
+    const auto dev = alpaka::getDevByIdx<Acc>(0); // first device of this accelerator type
+    auto queue = alpaka::Queue<Acc, alpaka::Blocking>{dev};
+
+    constexpr auto N = 16; // N threads per block, N blocks
+    static_assert(N <= mallocMC::maxThreadsPerBlock, "");
+
+    ScatterAllocator scatterAlloc(dev, queue, 1024U * 1024U); // 1 MiB
+
+    // make 1 allocation from 1 thread for N * N pointers
+    alpaka::enqueue(
+        queue,
+        alpaka::createTaskKernel<Acc>(
+            alpaka::WorkDivMembers<Dim, Idx>{Idx{1}, Idx{1}, Idx{1}},
+            [] ALPAKA_FN_ACC(const Acc& acc, int N, typename ScatterAllocator::AllocatorHandle allocHandle) {
+                deviceArray = (int**) allocHandle.malloc(acc, sizeof(int*) * N * N); // result is not null-checked
+            },
+            N, // the lambda captures nothing, so N travels as a kernel argument
+            scatterAlloc.getAllocatorHandle()));
+
+    // make N * N allocations from N blocks of N threads for ints
+    alpaka::enqueue(
+        queue,
+        alpaka::createTaskKernel<Acc>(
+            alpaka::WorkDivMembers<Dim, Idx>{Idx{N}, Idx{N}, Idx{1}},
+            [] ALPAKA_FN_ACC(const Acc& acc, typename ScatterAllocator::AllocatorHandle allocHandle) {
+                const auto i = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0]; // grid-wide thread index
+                deviceArray[i] = (int*) allocHandle.malloc(acc, sizeof(int));
+            },
+            scatterAlloc.getAllocatorHandle()));
+
+    const auto slots = scatterAlloc.getAvailableSlots(dev, queue, sizeof(int));
+    const auto heapInfo = scatterAlloc.getHeapLocations().at(0);
+    std::cout << alpaka::traits::GetAccName<Acc>::getAccName() << " slots: " << slots
+              << " heap size: " << heapInfo.size << '\n';
+
+    // free N * N allocations from N blocks of N threads for ints
+    alpaka::enqueue(
+        queue,
+        alpaka::createTaskKernel<Acc>(
+            alpaka::WorkDivMembers<Dim, Idx>{Idx{N}, Idx{N}, Idx{1}},
+            [] ALPAKA_FN_ACC(const Acc& acc, typename ScatterAllocator::AllocatorHandle allocHandle) {
+                const auto i = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0]; // same index as when allocating
+                allocHandle.free(acc, deviceArray[i]);
+            },
+            scatterAlloc.getAllocatorHandle()));
+
+    // free 1 allocation from 1 thread for N * N pointers
+    alpaka::enqueue(
+        queue,
+        alpaka::createTaskKernel<Acc>(
+            alpaka::WorkDivMembers<Dim, Idx>{Idx{1}, Idx{1}, Idx{1}},
+            [] ALPAKA_FN_ACC(const Acc& acc, typename ScatterAllocator::AllocatorHandle allocHandle) {
+                allocHandle.free(acc, deviceArray); // releases the pointer table itself
+            },
+            scatterAlloc.getAllocatorHandle()));
+}
+
+template<template<typename, typename> typename AccTemplate>
+void test2D() // malloc/free round trip on a 2-D work division; NOTE(review): contains no REQUIRE/CHECK
+{
+    using Dim = alpaka::DimInt<2>;
+    using Acc = AccTemplate<Dim, Idx>;
+
+    using ScatterAllocator = mallocMC::Allocator<
+        Acc,
+        mallocMC::CreationPolicies::Scatter<ScatterConfig, ScatterHashParams>,
+        mallocMC::DistributionPolicies::XMallocSIMD<DistributionConfig>,
+        mallocMC::OOMPolicies::ReturnNull,
+        mallocMC::ReservePoolPolicies::AlpakaBuf<Acc>,
+        mallocMC::AlignmentPolicies::Shrink<AlignmentConfig>>;
+
+    const auto dev = alpaka::getDevByIdx<Acc>(0); // first device of this accelerator type
+    auto queue = alpaka::Queue<Acc, alpaka::Blocking>{dev};
+
+    constexpr auto N = 8; // N x N threads per block, N x N blocks
+    static_assert(N * N <= mallocMC::maxThreadsPerBlock, "");
+
+    ScatterAllocator scatterAlloc(dev, queue, 1024U * 1024U); // 1 MiB
+
+    // make 1 allocation from 1 thread for N*N * N*N pointers
+    alpaka::enqueue(
+        queue,
+        alpaka::createTaskKernel<Acc>(
+            alpaka::WorkDivMembers<Dim, Idx>{
+                alpaka::Vec<Dim, Idx>::all(1),
+                alpaka::Vec<Dim, Idx>::all(1),
+                alpaka::Vec<Dim, Idx>::all(1)},
+            [] ALPAKA_FN_ACC(const Acc& acc, int N, typename ScatterAllocator::AllocatorHandle allocHandle) {
+                deviceArray = (int**) allocHandle.malloc(acc, sizeof(int*) * N * N * N * N); // not null-checked
+            },
+            N, // the lambdas capture nothing, so N travels as a kernel argument
+            scatterAlloc.getAllocatorHandle()));
+
+    // make N*N * N*N allocations from N*N blocks of N*N threads for ints
+    alpaka::enqueue(
+        queue,
+        alpaka::createTaskKernel<Acc>(
+            alpaka::WorkDivMembers<Dim, Idx>{
+                alpaka::Vec<Dim, Idx>::all(N),
+                alpaka::Vec<Dim, Idx>::all(N),
+                alpaka::Vec<Dim, Idx>::all(1)},
+            [] ALPAKA_FN_ACC(const Acc& acc, int N, typename ScatterAllocator::AllocatorHandle allocHandle) {
+                const auto idx = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc); // each component spans N*N
+                deviceArray[idx[0] * N * N + idx[1]] = (int*) allocHandle.malloc(acc, sizeof(int));
+            },
+            N,
+            scatterAlloc.getAllocatorHandle()));
+
+    const auto slots = scatterAlloc.getAvailableSlots(dev, queue, sizeof(int));
+    const auto heapInfo = scatterAlloc.getHeapLocations().at(0);
+    std::cout << alpaka::traits::GetAccName<Acc>::getAccName() << " slots: " << slots
+              << " heap size: " << heapInfo.size << '\n';
+
+    // free N*N * N*N allocations from N*N blocks of N*N threads for ints
+    alpaka::enqueue(
+        queue,
+        alpaka::createTaskKernel<Acc>(
+            alpaka::WorkDivMembers<Dim, Idx>{
+                alpaka::Vec<Dim, Idx>::all(N),
+                alpaka::Vec<Dim, Idx>::all(N),
+                alpaka::Vec<Dim, Idx>::all(1)},
+            [] ALPAKA_FN_ACC(const Acc& acc, int N, typename ScatterAllocator::AllocatorHandle allocHandle) {
+                const auto idx = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc);
+                allocHandle.free(acc, deviceArray[idx[0] * N * N + idx[1]]); // same slot as when allocating
+            },
+            N,
+            scatterAlloc.getAllocatorHandle()));
+
+    // free 1 allocation from 1 thread for N*N * N*N pointers
+    alpaka::enqueue(
+        queue,
+        alpaka::createTaskKernel<Acc>(
+            alpaka::WorkDivMembers<Dim, Idx>{
+                alpaka::Vec<Dim, Idx>::all(1),
+                alpaka::Vec<Dim, Idx>::all(1),
+                alpaka::Vec<Dim, Idx>::all(1)},
+            [] ALPAKA_FN_ACC(const Acc& acc, typename ScatterAllocator::AllocatorHandle allocHandle) {
+                allocHandle.free(acc, deviceArray); // releases the pointer table itself
+            },
+            scatterAlloc.getAllocatorHandle()));
+}
+
+template<template<typename, typename> typename AccTemplate>
+void test3D() // malloc/free round trip on a 3-D work division; NOTE(review): contains no REQUIRE/CHECK
+{
+    using Dim = alpaka::DimInt<3>;
+    using Acc = AccTemplate<Dim, Idx>;
+
+    using ScatterAllocator = mallocMC::Allocator<
+        Acc,
+        mallocMC::CreationPolicies::Scatter<ScatterConfig, ScatterHashParams>,
+        mallocMC::DistributionPolicies::XMallocSIMD<DistributionConfig>,
+        mallocMC::OOMPolicies::ReturnNull,
+        mallocMC::ReservePoolPolicies::AlpakaBuf<Acc>,
+        mallocMC::AlignmentPolicies::Shrink<AlignmentConfig>>;
+
+    const auto dev = alpaka::getDevByIdx<Acc>(0); // first device of this accelerator type
+    auto queue = alpaka::Queue<Acc, alpaka::Blocking>{dev};
+
+    constexpr auto N = 4; // N x N x N threads per block, N x N x N blocks
+    static_assert(N * N * N <= mallocMC::maxThreadsPerBlock, "");
+
+    ScatterAllocator scatterAlloc(dev, queue, 1024U * 1024U); // 1 MiB
+
+    // make 1 allocation from 1 thread for N*N*N * N*N*N pointers
+    alpaka::enqueue(
+        queue,
+        alpaka::createTaskKernel<Acc>(
+            alpaka::WorkDivMembers<Dim, Idx>{
+                alpaka::Vec<Dim, Idx>::all(1),
+                alpaka::Vec<Dim, Idx>::all(1),
+                alpaka::Vec<Dim, Idx>::all(1)},
+            [] ALPAKA_FN_ACC(const Acc& acc, int N, typename ScatterAllocator::AllocatorHandle allocHandle) {
+                deviceArray = (int**) allocHandle.malloc(acc, sizeof(int*) * N * N * N * N * N * N);
+            },
+            N, // the lambdas capture nothing, so N travels as a kernel argument
+            scatterAlloc.getAllocatorHandle()));
+
+    // make N*N*N * N*N*N allocations (one int per grid thread) from N*N*N blocks of N*N*N threads; the
+    // slot index linearises all three grid coordinates, each of which spans N*N values
+    alpaka::enqueue(
+        queue,
+        alpaka::createTaskKernel<Acc>(
+            alpaka::WorkDivMembers<Dim, Idx>{
+                alpaka::Vec<Dim, Idx>::all(N),
+                alpaka::Vec<Dim, Idx>::all(N),
+                alpaka::Vec<Dim, Idx>::all(1)},
+            [] ALPAKA_FN_ACC(const Acc& acc, int N, typename ScatterAllocator::AllocatorHandle allocHandle) {
+                const auto idx = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc);
+                deviceArray[idx[0] * N * N * N * N + idx[1] * N * N + idx[2]]
+                    = (int*) allocHandle.malloc(acc, sizeof(int)); // idx[2] keeps every thread's slot distinct
+            },
+            N,
+            scatterAlloc.getAllocatorHandle()));
+
+    const auto slots = scatterAlloc.getAvailableSlots(dev, queue, sizeof(int));
+    const auto heapInfo = scatterAlloc.getHeapLocations().at(0);
+    std::cout << alpaka::traits::GetAccName<Acc>::getAccName() << " slots: " << slots
+              << " heap size: " << heapInfo.size << '\n';
+
+    // free N*N*N * N*N*N allocations from N*N*N blocks of N*N*N threads for ints, using the same
+    // per-thread slot index as the allocation kernel so that every pointer is freed exactly once
+    alpaka::enqueue(
+        queue,
+        alpaka::createTaskKernel<Acc>(
+            alpaka::WorkDivMembers<Dim, Idx>{
+                alpaka::Vec<Dim, Idx>::all(N),
+                alpaka::Vec<Dim, Idx>::all(N),
+                alpaka::Vec<Dim, Idx>::all(1)},
+            [] ALPAKA_FN_ACC(const Acc& acc, int N, typename ScatterAllocator::AllocatorHandle allocHandle) {
+                const auto idx = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc);
+                allocHandle.free(acc, deviceArray[idx[0] * N * N * N * N + idx[1] * N * N + idx[2]]);
+            },
+            N,
+            scatterAlloc.getAllocatorHandle()));
+
+    // free 1 allocation from 1 thread for N*N*N * N*N*N pointers
+    alpaka::enqueue(
+        queue,
+        alpaka::createTaskKernel<Acc>(
+            alpaka::WorkDivMembers<Dim, Idx>{
+                alpaka::Vec<Dim, Idx>::all(1),
+                alpaka::Vec<Dim, Idx>::all(1),
+                alpaka::Vec<Dim, Idx>::all(1)},
+            [] ALPAKA_FN_ACC(const Acc& acc, typename ScatterAllocator::AllocatorHandle allocHandle) {
+                allocHandle.free(acc, deviceArray); // releases the pointer table itself
+            },
+            scatterAlloc.getAllocatorHandle()));
+}
+
+TEST_CASE("1D AccGpuCudaRt") // NOTE(review): unguarded — presumably needs alpaka's CUDA back-end enabled; confirm
+{
+    test1D<alpaka::AccGpuCudaRt>();
+}
+
+TEST_CASE("2D AccGpuCudaRt")
+{
+    test2D<alpaka::AccGpuCudaRt>();
+}
+
+TEST_CASE("3D AccGpuCudaRt")
+{
+    test3D<alpaka::AccGpuCudaRt>();
+}
+
+TEST_CASE("1D AccCpuThreads") // NOTE(review): unguarded — presumably needs the CPU-threads back-end enabled; confirm
+{
+    test1D<alpaka::AccCpuThreads>();
+}
+
+TEST_CASE("2D AccCpuThreads")
+{
+    test2D<alpaka::AccCpuThreads>();
+}
+
+TEST_CASE("3D AccCpuThreads")
+{
+    test3D<alpaka::AccCpuThreads>();
+}
+
+TEST_CASE("1D AccCpuOmp2Threads") // NOTE(review): unguarded — presumably needs the OpenMP 2 threads back-end; confirm
+{
+    test1D<alpaka::AccCpuOmp2Threads>();
+}
+
+TEST_CASE("2D AccCpuOmp2Threads")
+{
+    test2D<alpaka::AccCpuOmp2Threads>();
+}
+
+TEST_CASE("3D AccCpuOmp2Threads")
+{
+    test3D<alpaka::AccCpuOmp2Threads>();
+}
diff --git a/thirdParty/mallocMC/tests/main.cpp b/thirdParty/mallocMC/tests/main.cpp
new file mode 100644
index 0000000000..4ed06df1f7
--- /dev/null
+++ b/thirdParty/mallocMC/tests/main.cpp
@@ -0,0 +1,2 @@
+#define CATCH_CONFIG_MAIN
+#include <catch2/catch.hpp>
diff --git a/thirdParty/mallocMC/tests/policies.cpp b/thirdParty/mallocMC/tests/policies.cpp
new file mode 100644
index 0000000000..be56b1a218
--- /dev/null
+++ b/thirdParty/mallocMC/tests/policies.cpp
@@ -0,0 +1,185 @@
+/*
+  mallocMC: Memory Allocator for Many Core Architectures.
+
+  Copyright 2020 Helmholtz-Zentrum Dresden - Rossendorf,
+                 CERN
+
+  Author(s):  Bernhard Manfred Gruber
+
+  Permission is hereby granted, free of charge, to any person obtaining a copy
+  of this software and associated documentation files (the "Software"), to deal
+  in the Software without restriction, including without limitation the rights
+  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+  copies of the Software, and to permit persons to whom the Software is
+  furnished to do so, subject to the following conditions:
+
+  The above copyright notice and this permission notice shall be included in
+  all copies or substantial portions of the Software.
+
+  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+  THE SOFTWARE.
+*/
+
+#include <alpaka/alpaka.hpp>
+#include <catch2/catch.hpp>
+#include <mallocMC/mallocMC.hpp>
+
+using Idx = std::size_t;
+using Dim = alpaka::DimInt<1>;
+using Acc = alpaka::AccGpuCudaRt<Dim, Idx>;
+
+// Compile-time settings passed as the first template argument to
+// mallocMC::CreationPolicies::Scatter in the test cases of this file.
+// NOTE(review): pagesize is presumably given in bytes - confirm against the Scatter policy.
+struct ScatterConfig
+{
+    static constexpr auto pagesize = 4096;
+    static constexpr auto accessblocks = 8;
+    static constexpr auto regionsize = 16;
+    static constexpr auto wastefactor = 2;
+    static constexpr auto resetfreedpages = false;
+};
+
+// Compile-time hashing constants passed as the second template argument to
+// mallocMC::CreationPolicies::Scatter in the test cases of this file.
+struct ScatterHashParams
+{
+    static constexpr auto hashingK = 38183;
+    static constexpr auto hashingDistMP = 17497;
+    static constexpr auto hashingDistWP = 1;
+    static constexpr auto hashingDistWPRel = 1;
+};
+
+// Settings for mallocMC::DistributionPolicies::XMallocSIMD; reuses the page
+// size of ScatterConfig so both policies are configured with the same value.
+struct DistributionConfig
+{
+    static constexpr auto pagesize = ScatterConfig::pagesize;
+};
+
+// Settings for mallocMC::AlignmentPolicies::Shrink.
+// NOTE(review): dataAlignment is presumably given in bytes - confirm against the Shrink policy.
+struct AlignmentConfig
+{
+    static constexpr auto dataAlignment = 16;
+};
+
+// Smoke test shared by all test cases of this file: builds the given allocator
+// type with a 1 MiB heap on device 0 and launches a single-thread kernel that
+// allocates room for 1000 ints and frees it again right away.
+template<typename ScatterAllocator>
+void run()
+{
+    const auto dev = alpaka::getDevByIdx<Acc>(0);
+    auto queue = alpaka::Queue<Acc, alpaka::Blocking>{dev};
+
+    ScatterAllocator scatterAlloc(dev, queue, 1024U * 1024U); // 1 MiB
+
+    // one block with one thread processing one element
+    const auto workDiv = alpaka::WorkDivMembers<Dim, Idx>{Idx{1}, Idx{1}, Idx{1}};
+
+    const auto mallocThenFree
+        = [] ALPAKA_FN_ACC(const Acc& acc, typename ScatterAllocator::AllocatorHandle allocHandle) {
+              auto* ptr = allocHandle.malloc(acc, sizeof(int) * 1000);
+              allocHandle.free(acc, ptr);
+          };
+
+    alpaka::enqueue(
+        queue,
+        alpaka::createTaskKernel<Acc>(workDiv, mallocThenFree, scatterAlloc.getAllocatorHandle()));
+}
+
+// The four test cases below run the run() smoke test with the Scatter creation
+// policy, the ReturnNull OOM policy and the AlpakaBuf reserve pool policy. They
+// differ only in the distribution policy (XMallocSIMD / Noop) and in the
+// alignment policy (Shrink / Noop).
+
+// Distribution: XMallocSIMD, alignment: Shrink
+TEST_CASE("Scatter XMallocSIMD ReturnNull AlpakaBuf Shrink")
+{
+    using ScatterAllocator = mallocMC::Allocator<
+        Acc,
+        mallocMC::CreationPolicies::Scatter<ScatterConfig, ScatterHashParams>,
+        mallocMC::DistributionPolicies::XMallocSIMD<DistributionConfig>,
+        mallocMC::OOMPolicies::ReturnNull,
+        mallocMC::ReservePoolPolicies::AlpakaBuf<Acc>,
+        mallocMC::AlignmentPolicies::Shrink<AlignmentConfig>>;
+    run<ScatterAllocator>();
+}
+
+// Distribution: XMallocSIMD, alignment: Noop
+TEST_CASE("Scatter XMallocSIMD ReturnNull AlpakaBuf Noop")
+{
+    using ScatterAllocator = mallocMC::Allocator<
+        Acc,
+        mallocMC::CreationPolicies::Scatter<ScatterConfig, ScatterHashParams>,
+        mallocMC::DistributionPolicies::XMallocSIMD<DistributionConfig>,
+        mallocMC::OOMPolicies::ReturnNull,
+        mallocMC::ReservePoolPolicies::AlpakaBuf<Acc>,
+        mallocMC::AlignmentPolicies::Noop>;
+    run<ScatterAllocator>();
+}
+
+// Distribution: Noop, alignment: Shrink
+TEST_CASE("Scatter Noop ReturnNull AlpakaBuf Shrink")
+{
+    using ScatterAllocator = mallocMC::Allocator<
+        Acc,
+        mallocMC::CreationPolicies::Scatter<ScatterConfig, ScatterHashParams>,
+        mallocMC::DistributionPolicies::Noop,
+        mallocMC::OOMPolicies::ReturnNull,
+        mallocMC::ReservePoolPolicies::AlpakaBuf<Acc>,
+        mallocMC::AlignmentPolicies::Shrink<AlignmentConfig>>;
+    run<ScatterAllocator>();
+}
+
+// Distribution: Noop, alignment: Noop
+TEST_CASE("Scatter Noop ReturnNull AlpakaBuf Noop")
+{
+    using ScatterAllocator = mallocMC::Allocator<
+        Acc,
+        mallocMC::CreationPolicies::Scatter<ScatterConfig, ScatterHashParams>,
+        mallocMC::DistributionPolicies::Noop,
+        mallocMC::OOMPolicies::ReturnNull,
+        mallocMC::ReservePoolPolicies::AlpakaBuf<Acc>,
+        mallocMC::AlignmentPolicies::Noop>;
+    run<ScatterAllocator>();
+}
+
+#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
+// The four test cases below run the run() smoke test with the OldMalloc
+// creation policy and the CudaSetLimits reserve pool policy. They differ only
+// in the distribution policy (XMallocSIMD / Noop) and in the alignment policy
+// (Shrink / Noop). The alias is still called ScatterAllocator although no
+// Scatter policy is involved. Each test resets the CUDA device afterwards.
+// NOTE(review): the reset presumably undoes the limits set by CudaSetLimits - confirm.
+
+// Distribution: XMallocSIMD, alignment: Shrink
+TEST_CASE("OldMalloc XMallocSIMD ReturnNull CudaSetLimits Shrink")
+{
+    using ScatterAllocator = mallocMC::Allocator<
+        Acc,
+        mallocMC::CreationPolicies::OldMalloc,
+        mallocMC::DistributionPolicies::XMallocSIMD<DistributionConfig>,
+        mallocMC::OOMPolicies::ReturnNull,
+        mallocMC::ReservePoolPolicies::CudaSetLimits,
+        mallocMC::AlignmentPolicies::Shrink<AlignmentConfig>>;
+    run<ScatterAllocator>();
+
+    cudaDeviceReset();
+}
+
+// Distribution: XMallocSIMD, alignment: Noop
+TEST_CASE("OldMalloc XMallocSIMD ReturnNull CudaSetLimits Noop")
+{
+    using ScatterAllocator = mallocMC::Allocator<
+        Acc,
+        mallocMC::CreationPolicies::OldMalloc,
+        mallocMC::DistributionPolicies::XMallocSIMD<DistributionConfig>,
+        mallocMC::OOMPolicies::ReturnNull,
+        mallocMC::ReservePoolPolicies::CudaSetLimits,
+        mallocMC::AlignmentPolicies::Noop>;
+    run<ScatterAllocator>();
+
+    cudaDeviceReset();
+}
+
+// Distribution: Noop, alignment: Shrink
+TEST_CASE("OldMalloc Noop ReturnNull CudaSetLimits Shrink")
+{
+    using ScatterAllocator = mallocMC::Allocator<
+        Acc,
+        mallocMC::CreationPolicies::OldMalloc,
+        mallocMC::DistributionPolicies::Noop,
+        mallocMC::OOMPolicies::ReturnNull,
+        mallocMC::ReservePoolPolicies::CudaSetLimits,
+        mallocMC::AlignmentPolicies::Shrink<AlignmentConfig>>;
+    run<ScatterAllocator>();
+
+    cudaDeviceReset();
+}
+
+// Distribution: Noop, alignment: Noop
+TEST_CASE("OldMalloc Noop ReturnNull CudaSetLimits Noop")
+{
+    using ScatterAllocator = mallocMC::Allocator<
+        Acc,
+        mallocMC::CreationPolicies::OldMalloc,
+        mallocMC::DistributionPolicies::Noop,
+        mallocMC::OOMPolicies::ReturnNull,
+        mallocMC::ReservePoolPolicies::CudaSetLimits,
+        mallocMC::AlignmentPolicies::Noop>;
+    run<ScatterAllocator>();
+
+    cudaDeviceReset();
+}
+#endif
diff --git a/thirdParty/mallocMC/tests/verify_heap.cpp b/thirdParty/mallocMC/tests/verify_heap.cpp
new file mode 100644
index 0000000000..fe385d65f1
--- /dev/null
+++ b/thirdParty/mallocMC/tests/verify_heap.cpp
@@ -0,0 +1,729 @@
+/*
+  mallocMC: Memory Allocator for Many Core Architectures.
+  https://www.hzdr.de/crp
+
+  Copyright 2014 Institute of Radiation Physics,
+                 Helmholtz-Zentrum Dresden - Rossendorf
+
+  Author(s):  Carlchristian Eckert - c.eckert ( at ) hzdr.de
+
+  Permission is hereby granted, free of charge, to any person obtaining a copy
+  of this software and associated documentation files (the "Software"), to deal
+  in the Software without restriction, including without limitation the rights
+  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+  copies of the Software, and to permit persons to whom the Software is
+  furnished to do so, subject to the following conditions:
+
+  The above copyright notice and this permission notice shall be included in
+  all copies or substantial portions of the Software.
+
+  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+  THE SOFTWARE.
+*/
+
+// each pointer in the datastructure will point to this many
+// elements of type allocElem_t
+constexpr auto ELEMS_PER_SLOT = 750;
+
+#include "verify_heap_config.hpp"
+
+#include <alpaka/alpaka.hpp>
+#include <cstdio>
+#include <iostream>
+#include <mallocMC/mallocMC_utils.hpp>
+#include <sstream>
+#include <typeinfo>
+#include <vector>
+
+using Device = alpaka::Dev<Acc>;
+using Queue = alpaka::Queue<Acc, alpaka::Blocking>;
+
+// global variable for verbosity, might change due to user input '--verbose'
+bool verbose = false;
+
+// the type of the elements to allocate
+using allocElem_t = unsigned long long;
+
+auto run_heap_verification(const size_t, const unsigned, const unsigned, const bool) -> bool;
+void parse_cmdline(const int, char**, size_t*, unsigned*, unsigned*, bool*);
+void print_help(char**);
+
+// output stream that is constructed without a stream buffer; it serves as the
+// sink for non-verbose output
+struct nullstream : std::ostream
+{
+    nullstream() : std::ostream(nullptr)
+    {
+    }
+};
+
+// picks the stream for progress output: std::cout if the global verbosity flag
+// is set, otherwise a function-local static nullstream
+auto dout() -> std::ostream&
+{
+    static nullstream discard;
+    if(verbose)
+    {
+        return std::cout;
+    }
+    return discard;
+}
+
+// define some defaults
+static constexpr unsigned threads_default = 128;
+static constexpr unsigned blocks_default = 64;
+static constexpr size_t heapInMB_default = 1024; // 1GB
+
+/**
+ * will do a basic verification of scatterAlloc.
+ *
+ * @param argc number of command line arguments
+ * @param argv command line arguments, see print_help() for the supported
+ *        options (e.g. -v/--verbose, -m/--machine_readable)
+ *
+ * @return will return 0 if the verification was successful,
+ *         otherwise returns 1 (independent of the chosen output mode)
+ */
+auto main(int argc, char** argv) -> int
+{
+    bool machine_readable = false;
+    size_t heapInMB = heapInMB_default;
+    unsigned threads = threads_default;
+    unsigned blocks = blocks_default;
+
+    parse_cmdline(argc, argv, &heapInMB, &threads, &blocks, &machine_readable);
+
+    // run_heap_verification is defined as (heapMB, blocks, threads, machine_readable),
+    // so blocks has to be passed before threads
+    const auto correct = run_heap_verification(heapInMB, blocks, threads, machine_readable);
+
+    // the human readable verdict is suppressed in machine readable mode,
+    // unless it was explicitly requested with --verbose
+    if(!machine_readable || verbose)
+    {
+        if(correct)
+        {
+            std::cout << "\033[0;32mverification successful ✔\033[0m\n";
+        }
+        else
+        {
+            std::cerr << "\033[0;31mverification failed\033[0m\n";
+        }
+    }
+
+    // always report the result through the exit code; falling off the end of
+    // main would yield 0 even for a failed verification
+    return correct ? 0 : 1;
+}
+
+/**
+ * will parse command line arguments
+ *
+ * for more details, see print_help()
+ *
+ * Tokens are shaped like ARG=PARAM or ARG, i.e. a value has to be attached to
+ * its option with '='. Unknown options are ignored and argv is left
+ * unmodified. If -h or --help is found, the help text is printed and the
+ * program exits with code 0.
+ *
+ * @param argc argc from main()
+ * @param argv argv from main()
+ * @param heapInMB will be filled with the heapsize, if given as a parameter
+ * @param threads will be filled with number of threads, if given as a parameter
+ * @param blocks will be filled with number of blocks, if given as a parameter
+ * @param machine_readable will be set to true, if -m or --machine_readable
+ *        is given as a parameter
+ */
+void parse_cmdline(
+    const int argc,
+    char** argv,
+    size_t* heapInMB,
+    unsigned* threads,
+    unsigned* blocks,
+    bool* machine_readable)
+{
+    for(int i = 1; i < argc; ++i)
+    {
+        // constructing a std::string from a null pointer is undefined behaviour
+        if(argv[i] == nullptr)
+        {
+            continue;
+        }
+
+        // split at the first '=' without touching argv; an argument that is
+        // empty or has nothing in front of the '=' yields an empty name, which
+        // matches none of the options below
+        const std::string arg(argv[i]);
+        const auto eq = arg.find('=');
+        const std::string name = arg.substr(0, eq);
+        const std::string value = (eq == std::string::npos) ? std::string() : arg.substr(eq + 1);
+
+        if(name == "-v" || name == "--verbose")
+        {
+            verbose = true;
+        }
+        else if(name == "--threads")
+        {
+            *threads = atoi(value.c_str());
+        }
+        else if(name == "--blocks")
+        {
+            *blocks = atoi(value.c_str());
+        }
+        else if(name == "--heapsize")
+        {
+            *heapInMB = size_t(atoi(value.c_str()));
+        }
+        else if(name == "-h" || name == "--help")
+        {
+            print_help(argv);
+            exit(0);
+        }
+        else if(name == "-m" || name == "--machine_readable")
+        {
+            *machine_readable = true;
+        }
+    }
+}
+
+/**
+ * prints a helpful message about program use
+ *
+ * The default values shown in the text are taken from threads_default,
+ * blocks_default and heapInMB_default.
+ *
+ * @param argv the argv-parameter from main, used to find the program name
+ */
+void print_help(char** argv)
+{
+    std::stringstream s;
+
+    s << "SYNOPSIS:" << '\n';
+    s << argv[0] << " [OPTIONS]" << '\n';
+    s << "" << '\n';
+    s << "OPTIONS:" << '\n';
+    s << "  -h, --help" << '\n';
+    s << "    Print this help message and exit" << '\n';
+    s << "" << '\n';
+    s << "  -v, --verbose" << '\n';
+    s << "    Print information about parameters and progress" << '\n';
+    s << "" << '\n';
+    s << "  -m, --machine_readable" << '\n';
+    s << "    Print all relevant parameters as CSV. This will" << '\n';
+    s << "    suppress all other output unless explicitly" << '\n';
+    s << "    requested with --verbose or -v" << '\n';
+    s << "" << '\n';
+    s << "  --threads=N" << '\n';
+    s << "    Set the number of threads per block (default ";
+    // only the constant is streamed; an extra literal would print the default twice
+    s << threads_default << ")" << '\n';
+    s << "" << '\n';
+    s << "  --blocks=N" << '\n';
+    s << "    Set the number of blocks in the grid (default ";
+    s << blocks_default << ")" << '\n';
+    s << "" << '\n';
+    s << "  --heapsize=N" << '\n';
+    s << "    Set the heapsize to N Megabyte (default ";
+    s << heapInMB_default << ")" << '\n';
+
+    std::cout << s.str() << std::flush;
+}
+
+/**
+ * checks validity of memory for each single cell
+ *
+ * Every thread repeatedly claims a slot index through the shared counter and
+ * compares each element of that slot with the value that is expected at its
+ * position (slot index * ELEMS_PER_SLOT + element index). In addition, the sum
+ * over all visited values is accumulated for a more in-depth verification
+ * that could be done on the host.
+ *
+ * @param acc the alpaka accelerator
+ * @param data the data to verify
+ * @param counter should be initialized with 0 and will
+ *        be used to count how many verifications were
+ *        already done
+ * @param globalSum will be filled with the sum over all
+ *        allocated values in the structure
+ * @param nSlots the size of the datastructure
+ * @param correct should be initialized with 1.
+ *        Will change to 0, if there was a value that didn't match
+ */
+struct Check_content
+{
+    ALPAKA_FN_ACC void operator()(
+        const Acc& acc,
+        allocElem_t** data,
+        unsigned long long* counter,
+        unsigned long long* globalSum,
+        const size_t nSlots,
+        int* correct) const
+    {
+        unsigned long long threadSum = 0;
+
+        // fetch the next unchecked slot until the counter runs past the last one
+        for(size_t slot = alpaka::atomicOp<alpaka::AtomicAdd>(acc, counter, 1ull); slot < nSlots;
+            slot = alpaka::atomicOp<alpaka::AtomicAdd>(acc, counter, 1ull))
+        {
+            const allocElem_t* const elems = data[slot];
+            const size_t firstExpected = slot * ELEMS_PER_SLOT;
+            for(size_t i = 0; i < ELEMS_PER_SLOT; ++i)
+            {
+                const allocElem_t stored = elems[i];
+                if(stored != static_cast<allocElem_t>(firstExpected + i))
+                {
+                    // a mismatch clears the shared flag
+                    alpaka::atomicOp<alpaka::AtomicAnd>(acc, correct, 0);
+                }
+                threadSum += static_cast<unsigned long long>(stored);
+            }
+        }
+
+        // publish the per-thread partial sum once at the end
+        alpaka::atomicOp<alpaka::AtomicAdd>(acc, globalSum, threadSum);
+    }
+};
+
+/**
+ * checks validity of memory for each single cell
+ *
+ * Same check as Check_content, but without the sum: every thread records
+ * mismatches in a local flag and combines it with the shared result only
+ * once, after it ran out of slots to check.
+ *
+ * @param acc the alpaka accelerator
+ * @param data the data to verify
+ * @param counter should be initialized with 0 and will
+ *        be used to count how many verifications were
+ *        already done
+ * @param nSlots the size of the datastructure
+ * @param correct should be initialized with 1.
+ *        Will change to 0, if there was a value that didn't match
+ */
+struct Check_content_fast
+{
+    ALPAKA_FN_ACC void operator()(
+        const Acc& acc,
+        allocElem_t** data,
+        unsigned long long* counter,
+        const size_t nSlots,
+        int* correct) const
+    {
+        int allMatch = 1;
+
+        // fetch the next unchecked slot until the counter runs past the last one
+        for(size_t slot = alpaka::atomicOp<alpaka::AtomicAdd>(acc, counter, 1ull); slot < nSlots;
+            slot = alpaka::atomicOp<alpaka::AtomicAdd>(acc, counter, 1ull))
+        {
+            const allocElem_t* const elems = data[slot];
+            const size_t firstExpected = slot * ELEMS_PER_SLOT;
+            for(size_t i = 0; i < ELEMS_PER_SLOT; ++i)
+            {
+                if(elems[i] != static_cast<allocElem_t>(firstExpected + i))
+                {
+                    allMatch = 0;
+                }
+            }
+        }
+
+        // single atomic update of the shared result per thread
+        alpaka::atomicOp<alpaka::AtomicAnd>(acc, correct, allMatch);
+    }
+};
+
+/**
+ * allocate a lot of small arrays and fill them
+ *
+ * Each array has the size ELEMS_PER_SLOT and the type allocElem_t.
+ * Each element will be filled with a number that is related to its
+ * position in the datastructure. A thread keeps allocating until the
+ * allocator returns a null pointer.
+ *
+ * @param acc the alpaka accelerator
+ * @param data the datastructure to allocate
+ * @param counter should be initialized with 0 and will
+ *        hold, how many allocations were done
+ * @param globalSum will hold the sum of all values over all
+ *        allocated structures (for verification purposes)
+ * @param mMC handle of the allocator that provides the memory
+ */
+struct AllocAll
+{
+    ALPAKA_FN_ACC void operator()(
+        const Acc& acc,
+        allocElem_t** data,
+        unsigned long long* counter,
+        unsigned long long* globalSum,
+        ScatterAllocator::AllocatorHandle mMC) const
+    {
+        unsigned long long threadSum = 0;
+        for(;;)
+        {
+            auto* const slotMem = (allocElem_t*) mMC.malloc(acc, sizeof(allocElem_t) * ELEMS_PER_SLOT);
+            if(slotMem == nullptr)
+            {
+                // the allocator has nothing left to hand out
+                break;
+            }
+
+            // the counter value assigns this allocation its slot index
+            const size_t slot = alpaka::atomicOp<alpaka::AtomicAdd>(acc, counter, 1ull);
+            const size_t firstValue = slot * ELEMS_PER_SLOT;
+            for(size_t i = 0; i < ELEMS_PER_SLOT; ++i)
+            {
+                const auto value = static_cast<allocElem_t>(firstValue + i);
+                slotMem[i] = value;
+                threadSum += static_cast<unsigned long long>(value);
+            }
+            data[slot] = slotMem;
+        }
+
+        // publish the per-thread partial sum once at the end
+        alpaka::atomicOp<alpaka::AtomicAdd>(acc, globalSum, threadSum);
+    }
+};
+
+/**
+ * free all the values again
+ *
+ * @param acc the alpaka accelerator
+ * @param data the datastructure to free
+ * @param counter should be initialized with 0 on device memory,
+ *        counts how many elements were freed
+ * @param nSlots the maximum number of elements to free
+ * @param mMC handle of the allocator the elements were allocated with
+ */
+struct DeallocAll
+{
+    ALPAKA_FN_ACC void operator()(
+        const Acc& acc,
+        allocElem_t** data,
+        unsigned long long* counter,
+        const size_t nSlots,
+        ScatterAllocator::AllocatorHandle mMC) const
+    {
+        // claim one slot per iteration until the counter runs past the last one
+        for(size_t slot = alpaka::atomicOp<alpaka::AtomicAdd>(acc, counter, 1ull); slot < nSlots;
+            slot = alpaka::atomicOp<alpaka::AtomicAdd>(acc, counter, 1ull))
+        {
+            mMC.free(acc, data[slot]);
+        }
+    }
+};
+
+/**
+ * damages one element in the data
+ *
+ * With help of this function, you can verify that
+ * the checks actually work as expected and can find
+ * an error, if one should exist. The first element of
+ * the slot with index 1 is overwritten.
+ *
+ * @param acc the alpaka accelerator (unused)
+ * @param data the datastructure to damage
+ */
+struct DamageElement
+{
+    ALPAKA_FN_ACC void operator()(const Acc& acc, allocElem_t** data) const
+    {
+        // the checks expect 1 * ELEMS_PER_SLOT + 0 at this position
+        const auto wrongValue = static_cast<allocElem_t>(5 * ELEMS_PER_SLOT - 1);
+        allocElem_t* const secondSlot = data[1];
+        secondSlot[0] = wrongValue;
+    }
+};
+
+/**
+ * wrapper function to allocate memory on device
+ *
+ * allocates memory with mallocMC by running the AllocAll kernel. Returns the
+ * number of created elements as well as the sum of these elements
+ *
+ * @param dev the device on which the counters are allocated
+ * @param queue the queue used for all memory operations and the kernel
+ * @param d_testData the datastructure which will hold
+ *        pointers to the created elements
+ * @param nSlots will be filled with the number of elements
+ *        that were allocated
+ * @param sum will be filled with the sum of all elements created
+ * @param blocks the number of blocks in the grid
+ * @param threads the number of threads per block
+ * @param mMC the allocator whose handle is passed to the kernel
+ */
+void allocate(
+    const Device& dev,
+    Queue& queue,
+    alpaka::Buf<Device, allocElem_t*, Dim, Idx>& d_testData,
+    unsigned long long* nSlots,
+    unsigned long long* sum,
+    const unsigned blocks,
+    const unsigned threads,
+    ScatterAllocator& mMC)
+{
+    dout() << "allocating on device...";
+
+    // device-side result cells, zeroed before the kernel accumulates into them
+    auto d_sum = alpaka::allocBuf<unsigned long long, Idx>(dev, Idx{1});
+    auto d_nSlots = alpaka::allocBuf<unsigned long long, Idx>(dev, Idx{1});
+
+    alpaka::memset(queue, d_sum, 0, 1);
+    alpaka::memset(queue, d_nSlots, 0, 1);
+
+    // d_nSlots doubles as the allocation counter of the AllocAll kernel
+    const auto workDiv = alpaka::WorkDivMembers<Dim, Idx>{Idx{blocks}, Idx{threads}, Idx{1}};
+    alpaka::enqueue(
+        queue,
+        alpaka::createTaskKernel<Acc>(
+            workDiv,
+            AllocAll{},
+            alpaka::getPtrNative(d_testData),
+            alpaka::getPtrNative(d_nSlots),
+            alpaka::getPtrNative(d_sum),
+            mMC.getAllocatorHandle()));
+
+    // copy both results into host buffers before handing them to the caller
+    const auto hostDev = alpaka::getDevByIdx<alpaka::Pltf<alpaka::DevCpu>>(0);
+    auto h_sum = alpaka::allocBuf<unsigned long long, Idx>(hostDev, Idx{1});
+    auto h_nSlots = alpaka::allocBuf<unsigned long long, Idx>(hostDev, Idx{1});
+
+    alpaka::memcpy(queue, h_sum, d_sum, Idx{1});
+    alpaka::memcpy(queue, h_nSlots, d_nSlots, Idx{1});
+    alpaka::wait(queue);
+
+    *sum = *alpaka::getPtrNative(h_sum);
+    *nSlots = *alpaka::getPtrNative(h_nSlots);
+
+    dout() << "done\n";
+}
+
+/**
+ * Wrapper function to verify allocation on device
+ *
+ * Generates the same number that was written into each position of
+ * the datastructure during allocation and compares the values.
+ *
+ * @param dev the device on which the helper buffers are allocated
+ * @param queue the queue used for all memory operations and the kernel
+ * @param d_testData the datastructure which holds
+ *        pointers to the elements you want to verify
+ * @param nSlots the number of slots in d_testData to verify
+ * @param blocks the number of blocks in the grid
+ * @param threads the number of threads per block
+ * @return true if the verification was successful, false otherwise
+ */
+auto verify(
+    const Device& dev,
+    Queue& queue,
+    alpaka::Buf<Device, allocElem_t*, Dim, Idx>& d_testData,
+    const unsigned long long nSlots,
+    const unsigned blocks,
+    const unsigned threads) -> bool
+{
+    dout() << "verifying on device... ";
+
+    // the result flag starts as 1 on the host and is copied to the device below
+    const auto hostDev = alpaka::getDevByIdx<alpaka::Pltf<alpaka::DevCpu>>(0);
+    auto h_correct = alpaka::allocBuf<int, Idx>(hostDev, Idx{1});
+    *alpaka::getPtrNative(h_correct) = 1;
+
+    auto d_sum = alpaka::allocBuf<unsigned long long, Idx>(dev, Idx{1});
+    auto d_counter = alpaka::allocBuf<unsigned long long, Idx>(dev, Idx{1});
+    auto d_correct = alpaka::allocBuf<int, Idx>(dev, Idx{1});
+
+    alpaka::memset(queue, d_sum, 0, 1);
+    alpaka::memset(queue, d_counter, 0, 1);
+    alpaka::memcpy(queue, d_correct, h_correct, 1);
+
+    // can be replaced by a call to check_content_fast,
+    // if the gaussian sum (see below) is not used and you
+    // want to be a bit faster
+    const auto workDiv = alpaka::WorkDivMembers<Dim, Idx>{Idx{blocks}, Idx{threads}, Idx{1}};
+    alpaka::enqueue(
+        queue,
+        alpaka::createTaskKernel<Acc>(
+            workDiv,
+            Check_content{},
+            alpaka::getPtrNative(d_testData),
+            alpaka::getPtrNative(d_counter),
+            alpaka::getPtrNative(d_sum),
+            static_cast<size_t>(nSlots),
+            alpaka::getPtrNative(d_correct)));
+
+    // only the flag is copied back; the sum in d_sum is not read by this function
+    alpaka::memcpy(queue, h_correct, d_correct, 1);
+    alpaka::wait(queue);
+
+    const auto correct = *alpaka::getPtrNative(h_correct);
+    dout() << (correct ? "done\n" : "failed\n");
+    return correct != 0;
+}
+
+/**
+ * prints all parameters machine readable
+ *
+ * Writes two comma separated lines to std::cout: a header line with the
+ * column names, followed by a line with the corresponding values.
+ *
+ * for params, see run_heap_verification-internal parameters
+ */
+void print_machine_readable(
+    const unsigned pagesize,
+    const unsigned accessblocks,
+    const unsigned regionsize,
+    const unsigned wastefactor,
+    const bool resetfreedpages,
+    const unsigned blocks,
+    const unsigned threads,
+    const unsigned elemsPerSlot,
+    const size_t allocElemSize,
+    const size_t heapSize,
+    const size_t maxSpace,
+    const size_t maxSlots,
+    const unsigned long long usedSlots,
+    const float allocFrac,
+    const size_t wasted,
+    const bool correct)
+{
+    const std::string sep = ",";
+    std::stringstream h;
+    std::stringstream v;
+
+    // appends one column (name to the header line, value to the value line),
+    // each followed by the separator
+    const auto column = [&](const char* name, const auto& value)
+    {
+        h << name << sep;
+        v << value << sep;
+    };
+
+    column("PagesizeByte", pagesize);
+    column("Accessblocks", accessblocks);
+    column("Regionsize", regionsize);
+    // this column reports the configured waste factor, not the wasted bytes
+    column("Wastefactor", wastefactor);
+    column("ResetFreedPage", resetfreedpages);
+    column("Gridsize", blocks);
+    column("Blocksize", threads);
+    column("ELEMS_PER_SLOT", elemsPerSlot);
+    column("allocElemByte", allocElemSize);
+    column("heapsizeByte", heapSize);
+    column("maxSpaceByte", maxSpace);
+    column("maxSlots", maxSlots);
+    column("usedSlots", usedSlots);
+    column("allocFraction", allocFrac);
+    column("wastedBytes", wasted);
+
+    // the last column is written without a trailing separator
+    h << "correct";
+    v << correct;
+
+    std::cout << h.str() << '\n';
+    std::cout << v.str() << '\n';
+}
+
+/**
+ * Verify the heap allocation of mallocMC
+ *
+ * Allocates as much memory as the heap allows. Make sure that allocated
+ * memory actually holds the correct values without corrupting them. Will
+ * fill the datastructure with values that are relative to the index and
+ * later evaluate, if the values inside stayed the same after allocating all
+ * memory. Afterwards one element is damaged on purpose to make sure that the
+ * verification is able to detect an error.
+ * Datastructure: Array that holds up to nPointers pointers to arrays of size
+ * ELEMS_PER_SLOT, each being of type allocElem_t.
+ *
+ * @param heapMB the size of the heap in MByte
+ * @param blocks the number of blocks in the grid
+ * @param threads the number of threads per block
+ * @param machine_readable if true, all parameters and results are printed
+ *        with print_machine_readable()
+ *
+ * @return true if the verification was successful,
+ *         false otherwise
+ */
+auto run_heap_verification(
+    const size_t heapMB,
+    const unsigned blocks,
+    const unsigned threads,
+    const bool machine_readable) -> bool
+{
+    const auto dev = alpaka::getDevByIdx<Acc>(0);
+    auto queue = Queue{dev};
+
+    // nPointers rounds the number of slots up, maxSlots rounds it down
+    const size_t heapSize = size_t(1024U * 1024U) * heapMB;
+    const size_t slotSize = sizeof(allocElem_t) * ELEMS_PER_SLOT;
+    const size_t nPointers = (heapSize + slotSize - 1) / slotSize;
+    const size_t maxSlots = heapSize / slotSize;
+    const size_t maxSpace = maxSlots * slotSize + nPointers * sizeof(allocElem_t*);
+    bool correct = true;
+
+    // NOTE(review): pow is used below although this file has no direct
+    // #include <cmath> - presumably provided transitively, confirm
+    dout() << "CreationPolicy Arguments:\n";
+    dout() << "Pagesize:              " << ScatterConfig::pagesize << '\n';
+    dout() << "Accessblocks:          " << ScatterConfig::accessblocks << '\n';
+    dout() << "Regionsize:            " << ScatterConfig::regionsize << '\n';
+    dout() << "Wastefactor:           " << ScatterConfig::wastefactor << '\n';
+    dout() << "ResetFreedPages        " << ScatterConfig::resetfreedpages << '\n';
+    dout() << "\n";
+    dout() << "Gridsize:              " << blocks << '\n';
+    dout() << "Blocksize:             " << threads << '\n';
+    dout() << "Allocated elements:    " << ELEMS_PER_SLOT << " x " << sizeof(allocElem_t);
+    dout() << "    Byte (" << slotSize << " Byte)\n";
+    dout() << "Heap:                  " << heapSize << " Byte";
+    dout() << " (" << heapSize / pow(1024, 2) << " MByte)\n";
+    dout() << "max space w/ pointers: " << maxSpace << " Byte";
+    dout() << " (" << maxSpace / pow(1024, 2) << " MByte)\n";
+    dout() << "maximum of elements:   " << maxSlots << '\n';
+
+    unsigned long long usedSlots = 0;
+    unsigned long long sumAllocElems = 0;
+    float allocFrac = 0;
+    size_t wasted = 0;
+
+    // the allocator and all device buffers live only inside this scope and
+    // are destroyed at its closing brace, before the results are reported
+    {
+        ScatterAllocator mMC(dev, queue, heapSize);
+
+        // allocating with mallocMC
+        auto d_testData = alpaka::allocBuf<allocElem_t*, Idx>(dev, Idx{nPointers});
+        allocate(dev, queue, d_testData, &usedSlots, &sumAllocElems, blocks, threads, mMC);
+
+        // NOTE(review): maxSlots is 0 for heapMB == 0, which makes this a
+        // floating point division by zero - confirm whether that can occur
+        allocFrac = static_cast<float>(usedSlots) * 100 / maxSlots;
+        wasted = heapSize - static_cast<size_t>(usedSlots) * slotSize;
+        dout() << "allocated elements:    " << usedSlots;
+        dout() << " (" << allocFrac << "%)\n";
+        dout() << "wasted heap space:     " << wasted << " Byte";
+        dout() << " (" << wasted / pow(1024, 2) << " MByte)\n";
+
+        // verifying on device
+        correct = correct && verify(dev, queue, d_testData, usedSlots, blocks, threads);
+
+        // damaging one cell
+        // NOTE(review): DamageElement writes to data[1][0], which assumes that
+        // at least two slots were allocated - confirm for very small heaps
+        dout() << "damaging of element... ";
+        {
+            const auto workDiv = alpaka::WorkDivMembers<Dim, Idx>{Idx{1}, Idx{1}, Idx{1}};
+            alpaka::enqueue(
+                queue,
+                alpaka::createTaskKernel<Acc>(
+                    workDiv,
+                    DamageElement{},
+                    alpaka::getPtrNative(d_testData)));
+        }
+        dout() << "done\n";
+
+        // verifying on device
+        // THIS SHOULD FAIL (damage was done before!). Therefore, we must
+        // inverse the logic
+        correct = correct && !verify(dev, queue, d_testData, usedSlots, blocks, threads);
+
+        // release all memory
+        dout() << "deallocation...        ";
+        auto d_dealloc_counter = alpaka::allocBuf<unsigned long long, Idx>(dev, Idx{1});
+        alpaka::memset(queue, d_dealloc_counter, 0, 1);
+        {
+            const auto workDiv = alpaka::WorkDivMembers<Dim, Idx>{Idx{blocks}, Idx{threads}, Idx{1}};
+            alpaka::enqueue(
+                queue,
+                alpaka::createTaskKernel<Acc>(
+                    workDiv,
+                    DeallocAll{},
+                    alpaka::getPtrNative(d_testData),
+                    alpaka::getPtrNative(d_dealloc_counter),
+                    static_cast<size_t>(usedSlots),
+                    mMC.getAllocatorHandle()));
+        }
+    }
+
+    dout() << "done \n";
+
+    if(machine_readable)
+    {
+        print_machine_readable(
+            ScatterConfig::pagesize,
+            ScatterConfig::accessblocks,
+            ScatterConfig::regionsize,
+            ScatterConfig::wastefactor,
+            ScatterConfig::resetfreedpages,
+            blocks,
+            threads,
+            ELEMS_PER_SLOT,
+            sizeof(allocElem_t),
+            heapSize,
+            maxSpace,
+            maxSlots,
+            usedSlots,
+            allocFrac,
+            wasted,
+            correct);
+    }
+
+    return correct;
+}
diff --git a/thirdParty/mallocMC/tests/verify_heap.cu b/thirdParty/mallocMC/tests/verify_heap.cu
deleted file mode 100644
index 9cee1b1217..0000000000
--- a/thirdParty/mallocMC/tests/verify_heap.cu
+++ /dev/null
@@ -1,701 +0,0 @@
-/*
-  mallocMC: Memory Allocator for Many Core Architectures.
-  https://www.hzdr.de/crp
-
-  Copyright 2014 Institute of Radiation Physics,
-                 Helmholtz-Zentrum Dresden - Rossendorf
-
-  Author(s):  Carlchristian Eckert - c.eckert ( at ) hzdr.de
-
-  Permission is hereby granted, free of charge, to any person obtaining a copy
-  of this software and associated documentation files (the "Software"), to deal
-  in the Software without restriction, including without limitation the rights
-  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-  copies of the Software, and to permit persons to whom the Software is
-  furnished to do so, subject to the following conditions:
-
-  The above copyright notice and this permission notice shall be included in
-  all copies or substantial portions of the Software.
-
-  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-  THE SOFTWARE.
-*/
-
-
-// get a CUDA error and print it nicely
-#define CUDA_CHECK(cmd) {cudaError_t error = cmd; \
-  if(error!=cudaSuccess){\
-    printf("<%s>:%i ",__FILE__,__LINE__);\
-    printf("[CUDA] Error: %s\n", cudaGetErrorString(error));}}
-
-// start kernel, wait for finish and check errors
-#define CUDA_CHECK_KERNEL_SYNC(...) __VA_ARGS__;CUDA_CHECK(cudaDeviceSynchronize())
-
-// each pointer in the datastructure will point to this many
-// elements of type allocElem_t
-#define ELEMS_PER_SLOT 750
-
-#include <cuda.h>
-#include <iostream>
-#include <cstdio>
-#include <typeinfo>
-#include <vector>
-
-//include the Heap with the arguments given in the config
-#include "src/include/mallocMC/mallocMC_utils.hpp"
-#include "pmacc/verify_heap_config.hpp"
-
-// global variable for verbosity, might change due to user input '--verbose'
-bool verbose = false;
-
-// the type of the elements to allocate
-typedef unsigned long long allocElem_t;
-
-bool run_heap_verification(const size_t, const unsigned, const unsigned, const bool);
-void parse_cmdline(const int, char**, size_t*, unsigned*, unsigned*, bool*);
-void print_help(char**);
-
-
-// used to create an empty stream for non-verbose output
-struct nullstream : std::ostream {
-  nullstream() : std::ostream(0) { }
-};
-
-// uses global verbosity to switch between std::cout and a NULL-output
-std::ostream& dout() {
-  static nullstream n;
-  return verbose ? std::cout : n;
-}
-
-// define some defaults
-BOOST_STATIC_CONSTEXPR unsigned threads_default = 128;
-BOOST_STATIC_CONSTEXPR unsigned blocks_default  = 64;
-BOOST_STATIC_CONSTEXPR size_t heapInMB_default  = 1024; // 1GB
-
-
-/**
- * will do a basic verification of scatterAlloc.
- *
- * @param argv if -q or --quiet is supplied as a
- *        command line argument, verbosity will be reduced
- *
- * @return will return 0 if the verification was successful,
- *         otherwise returns 1
- */
-int main(int argc, char** argv){
-  bool correct          = false;
-  bool machine_readable = false;
-  size_t heapInMB       = heapInMB_default;
-  unsigned threads      = threads_default;
-  unsigned blocks       = blocks_default;
-
-  parse_cmdline(argc, argv, &heapInMB, &threads, &blocks, &machine_readable);
-
-  cudaDeviceProp deviceProp;
-  cudaGetDeviceProperties(&deviceProp, 0);
-
-  if( deviceProp.major < 2 ) {
-    std::cerr << "Error: Compute Capability >= 2.0 required. (is ";
-    std::cerr << deviceProp.major << "."<< deviceProp.minor << ")" << std::endl;
-    return 1;
-  }
-
-  cudaSetDevice(0);
-  correct = run_heap_verification(heapInMB, threads, blocks, machine_readable);
-  cudaDeviceReset();
-
-  if(!machine_readable || verbose){
-    if(correct){
-      std::cout << "\033[0;32mverification successful ✔\033[0m" << std::endl;
-      return 0;
-    }else{
-      std::cerr << "\033[0;31mverification failed\033[0m" << std::endl;
-      return 1;
-    }
-  }
-}
-
-
-/**
- * will parse command line arguments
- *
- * for more details, see print_help()
- *
- * @param argc argc from main()
- * @param argv argv from main()
- * @param heapInMP will be filled with the heapsize, if given as a parameter
- * @param threads will be filled with number of threads, if given as a parameter
- * @param blocks will be filled with number of blocks, if given as a parameter
- */
-void parse_cmdline(
-    const int argc,
-    char**argv,
-    size_t *heapInMB,
-    unsigned *threads,
-    unsigned *blocks,
-    bool *machine_readable
-    ){
-
-  std::vector<std::pair<std::string, std::string> > parameters;
-
-  // Parse Commandline, tokens are shaped like ARG=PARAM or ARG
-  // This requires to use '=', if you want to supply a value with a parameter
-  for (int i = 1; i < argc; ++i) {
-    char* pos = strtok(argv[i], "=");
-    std::pair < std::string, std::string > p(std::string(pos), std::string(""));
-    pos = strtok(NULL, "=");
-    if (pos != NULL) {
-      p.second = std::string(pos);
-    }
-    parameters.push_back(p);
-  }
-
-  // go through all parameters that were found
-  for (unsigned i = 0; i < parameters.size(); ++i) {
-    std::pair < std::string, std::string > p = parameters.at(i);
-
-    if (p.first == "-v" || p.first == "--verbose") {
-      verbose = true;
-    }
-
-    if (p.first == "--threads") {
-      *threads = atoi(p.second.c_str());
-    }
-
-    if (p.first == "--blocks") {
-      *blocks = atoi(p.second.c_str());
-    }
-
-    if(p.first == "--heapsize") {
-      *heapInMB = size_t(atoi(p.second.c_str()));
-    }
-
-    if(p.first == "-h" || p.first == "--help"){
-      print_help(argv);
-      exit(0);
-    }
-
-    if(p.first == "-m" || p.first == "--machine_readable"){
-      *machine_readable = true;
-    }
-  }
-}
-
-
-/**
- * prints a helpful message about program use
- *
- * @param argv the argv-parameter from main, used to find the program name
- */
-void print_help(char** argv){
-  std::stringstream s;
-
-  s << "SYNOPSIS:"                                              << std::endl;
-  s << argv[0] << " [OPTIONS]"                                  << std::endl;
-  s << ""                                                       << std::endl;
-  s << "OPTIONS:"                                               << std::endl;
-  s << "  -h, --help"                                           << std::endl;
-  s << "    Print this help message and exit"                   << std::endl;
-  s << ""                                                       << std::endl;
-  s << "  -v, --verbose"                                        << std::endl;
-  s << "    Print information about parameters and progress"    << std::endl;
-  s << ""                                                       << std::endl;
-  s << "  -m, --machine_readable"                               << std::endl;
-  s << "    Print all relevant parameters as CSV. This will"    << std::endl;
-  s << "    suppress all other output unless explicitly"        << std::endl;
-  s << "    requested with --verbose or -v"                     << std::endl;
-  s << ""                                                       << std::endl;
-  s << "  --threads=N"                                          << std::endl;
-  s << "    Set the number of threads per block (default "                  ;
-  s <<                               threads_default << "128)"  << std::endl;
-  s << ""                                                       << std::endl;
-  s << "  --blocks=N"                                           << std::endl;
-  s << "    Set the number of blocks in the grid (default "                 ;
-  s <<                                   blocks_default << ")"  << std::endl;
-  s << ""                                                       << std::endl;
-  s << "  --heapsize=N"                                         << std::endl;
-  s << "    Set the heapsize to N Megabyte (default "                       ;
-  s <<                         heapInMB_default << "1024)"      << std::endl;
-
-  std::cout << s.str();
-}
-
-
-/**
- * checks validity of memory for each single cell
- *
- * checks on a per thread basis, if the values written during
- * allocation are still the same. Also calculates the sum over
- * all allocated values for a more in-depth verification that
- * could be done on the host
- *
- * @param data the data to verify
- * @param counter should be initialized with 0 and will
- *        be used to count how many verifications were
- *        already done
- * @param globalSum will be filled with the sum over all
- *        allocated values in the structure
- * @param nSlots the size of the datastructure
- * @param correct should be initialized with 1.
- *        Will change to 0, if there was a value that didn't match
- */
-__global__ void check_content(
-    allocElem_t** data,
-    unsigned long long *counter,
-    unsigned long long* globalSum,
-    const size_t nSlots,
-    int* correct
-    ){
-
-  unsigned long long sum=0;
-  while(true){
-    size_t pos = atomicAdd(counter,1);
-    if(pos >= nSlots){break;}
-    const size_t offset = pos*ELEMS_PER_SLOT;
-    for(size_t i=0;i<ELEMS_PER_SLOT;++i){
-      if (static_cast<allocElem_t>(data[pos][i]) != static_cast<allocElem_t>(offset+i)){
-        //printf("\nError in Kernel: data[%llu][%llu] is %#010x (should be %#010x)\n",
-        //    pos,i,static_cast<allocElem_t>(data[pos][i]),allocElem_t(offset+i));
-        atomicAnd(correct,0);
-      }
-      sum += static_cast<unsigned long long>(data[pos][i]);
-    }
-  }
-  atomicAdd(globalSum,sum);
-}
-
-
-/**
- * checks validity of memory for each single cell
- *
- * checks on a per thread basis, if the values written during
- * allocation are still the same.
- *
- * @param data the data to verify
- * @param counter should be initialized with 0 and will
- *        be used to count how many verifications were
- *        already done
- * @param nSlots the size of the datastructure
- * @param correct should be initialized with 1.
- *        Will change to 0, if there was a value that didn't match
- */
-__global__ void check_content_fast(
-    allocElem_t** data,
-    unsigned long long *counter,
-    const size_t nSlots,
-    int* correct
-    ){
-
-  int c = 1;
-  while(true){
-    size_t pos = atomicAdd(counter,1);
-    if(pos >= nSlots){break;}
-    const size_t offset = pos*ELEMS_PER_SLOT;
-    for(size_t i=0;i<ELEMS_PER_SLOT;++i){
-      if (static_cast<allocElem_t>(data[pos][i]) != static_cast<allocElem_t>(offset+i)){
-        c=0;
-      }
-    }
-  }
-  atomicAnd(correct,c);
-}
-
-
-/**
- * allocate a lot of small arrays and fill them
- *
- * Each array has the size ELEMS_PER_SLOT and the type allocElem_t.
- * Each element will be filled with a number that is related to its
- * position in the datastructure.
- *
- * @param data the datastructure to allocate
- * @param counter should be initialized with 0 and will
- *        hold, how many allocations were done
- * @param globalSum will hold the sum of all values over all
- *        allocated structures (for verification purposes)
- */
-__global__ void allocAll(
-    allocElem_t** data,
-    unsigned long long* counter,
-    unsigned long long* globalSum,
-    ScatterAllocator::AllocatorHandle mMC
-    ){
-
-  unsigned long long sum=0;
-  while(true){
-    allocElem_t* p = (allocElem_t*) mMC.malloc(sizeof(allocElem_t) * ELEMS_PER_SLOT);
-    if(p == NULL) break;
-
-    size_t pos = atomicAdd(counter,1);
-    const size_t offset = pos*ELEMS_PER_SLOT;
-    for(size_t i=0;i<ELEMS_PER_SLOT;++i){
-      p[i] = static_cast<allocElem_t>(offset + i);
-      sum += static_cast<unsigned long long>(p[i]);
-    }
-    data[pos] = p;
-  }
-
-  atomicAdd(globalSum,sum);
-}
-
-
-/**
- * free all the values again
- *
- * @param data the datastructure to free
- * @param counter should be an empty space on device memory,
- *        counts how many elements were freed
- * @param max the maximum number of elements to free
- */
-__global__ void deallocAll(
-    allocElem_t** data,
-    unsigned long long* counter,
-    const size_t nSlots,
-    ScatterAllocator::AllocatorHandle mMC
-    ){
-
-  while(true){
-    size_t pos = atomicAdd(counter,1);
-    if(pos >= nSlots) break;
-    mMC.free(data[pos]);
-  }
-}
-
-
-/**
- * damages one element in the data
- *
- * With help of this function, you can verify that
- * the checks actually work as expected and can find
- * an error, if one should exist
- *
- * @param data the datastructure to damage
- */
-__global__ void damageElement(allocElem_t** data){
-  data[1][0] = static_cast<allocElem_t>(5*ELEMS_PER_SLOT - 1);
-}
-
-
-/**
- * wrapper function to allocate memory on device
- *
- * allocates memory with mallocMC. Returns the number of
- * created elements as well as the sum of these elements
- *
- * @param d_testData the datastructure which will hold
- *        pointers to the created elements
- * @param h_nSlots will be filled with the number of elements
- *        that were allocated
- * @param h_sum will be filled with the sum of all elements created
- * @param blocks the size of the CUDA grid
- * @param threads the number of CUDA threads per block
- */
-void allocate(
-    allocElem_t** d_testData,
-    unsigned long long* h_nSlots,
-    unsigned long long* h_sum,
-    const unsigned blocks,
-    const unsigned threads,
-    ScatterAllocator* mMC
-    ){
-
-  dout() << "allocating on device...";
-
-  unsigned long long zero = 0;
-  unsigned long long *d_sum;
-  unsigned long long *d_nSlots;
-
-  MALLOCMC_CUDA_CHECKED_CALL(cudaMalloc((void**) &d_sum,sizeof(unsigned long long)));
-  MALLOCMC_CUDA_CHECKED_CALL(cudaMalloc((void**) &d_nSlots, sizeof(unsigned long long)));
-  MALLOCMC_CUDA_CHECKED_CALL(cudaMemcpy(d_sum,&zero,sizeof(unsigned long long),cudaMemcpyHostToDevice));
-  MALLOCMC_CUDA_CHECKED_CALL(cudaMemcpy(d_nSlots,&zero,sizeof(unsigned long long),cudaMemcpyHostToDevice));
-
-  CUDA_CHECK_KERNEL_SYNC(allocAll<<<blocks,threads>>>(d_testData, d_nSlots, d_sum, *mMC ));
-
-  MALLOCMC_CUDA_CHECKED_CALL(cudaMemcpy(h_sum,d_sum,sizeof(unsigned long long),cudaMemcpyDeviceToHost));
-  MALLOCMC_CUDA_CHECKED_CALL(cudaMemcpy(h_nSlots,d_nSlots,sizeof(unsigned long long),cudaMemcpyDeviceToHost));
-  cudaFree(d_sum);
-  cudaFree(d_nSlots);
-  dout() << "done" << std::endl;
-}
-
-
-/**
- * Wrapper function to verify allocation on device
- *
- * Generates the same number that was written into each position of
- * the datastructure during allocation and compares the values.
- *
- * @param d_testData the datastructure which holds
- *        pointers to the elements you want to verify
- * @param nSlots the size of d_testData
- * @param blocks the size of the CUDA grid
- * @param threads the number of CUDA threads per block
- * @return true if the verification was successful, false otherwise
- */
-bool verify(
-    allocElem_t **d_testData,
-    const unsigned long long nSlots,
-    const unsigned blocks,
-    const unsigned threads
-    ){
-
-  dout() << "verifying on device... ";
-
-  const unsigned long long zero = 0;
-  int  h_correct = 1;
-  int* d_correct;
-  unsigned long long *d_sum;
-  unsigned long long *d_counter;
-
-  MALLOCMC_CUDA_CHECKED_CALL(cudaMalloc((void**) &d_sum, sizeof(unsigned long long)));
-  MALLOCMC_CUDA_CHECKED_CALL(cudaMalloc((void**) &d_counter, sizeof(unsigned long long)));
-  MALLOCMC_CUDA_CHECKED_CALL(cudaMalloc((void**) &d_correct, sizeof(int)));
-  MALLOCMC_CUDA_CHECKED_CALL(cudaMemcpy(d_sum,&zero,sizeof(unsigned long long),cudaMemcpyHostToDevice));
-  MALLOCMC_CUDA_CHECKED_CALL(cudaMemcpy(d_counter,&zero,sizeof(unsigned long long),cudaMemcpyHostToDevice));
-  MALLOCMC_CUDA_CHECKED_CALL(cudaMemcpy(d_correct,&h_correct,sizeof(int),cudaMemcpyHostToDevice));
-
-  // can be replaced by a call to check_content_fast,
-  // if the gaussian sum (see below) is not used and you
-  // want to be a bit faster
-  CUDA_CHECK_KERNEL_SYNC(check_content<<<blocks,threads>>>(
-        d_testData,
-        d_counter,
-        d_sum,
-        static_cast<size_t>(nSlots),
-        d_correct
-        ));
-  MALLOCMC_CUDA_CHECKED_CALL(cudaMemcpy(&h_correct,d_correct,sizeof(int),cudaMemcpyDeviceToHost));
-
-  // This only works, if the type "allocElem_t"
-  // can hold all the IDs (usually unsigned long long)
-  /*
-  dout() << "verifying on host...";
-  unsigned long long h_sum, h_counter;
-  unsigned long long gaussian_sum = (ELEMS_PER_SLOT*nSlots * (ELEMS_PER_SLOT*nSlots-1))/2;
-  MALLOCMC_CUDA_CHECKED_CALL(cudaMemcpy(&h_sum,d_sum,sizeof(unsigned long long),cudaMemcpyDeviceToHost));
-  MALLOCMC_CUDA_CHECKED_CALL(cudaMemcpy(&h_counter,d_counter,sizeof(unsigned long long),cudaMemcpyDeviceToHost));
-  if(gaussian_sum != h_sum){
-    dout() << "\nGaussian Sum doesn't match: is " << h_sum;
-    dout() << " (should be " << gaussian_sum << ")" << std::endl;
-    h_correct=false;
-  }
-  if(nSlots != h_counter-(blocks*threads)){
-    dout() << "\nallocated number of elements doesn't match: is " << h_counter;
-    dout() << " (should be " << nSlots << ")" << std::endl;
-    h_correct=false;
-  }
-  */
-
-  if(h_correct){
-    dout() << "done" << std::endl;
-  }else{
-    dout() << "failed" << std::endl;
-  }
-
-  cudaFree(d_correct);
-  cudaFree(d_sum);
-  cudaFree(d_counter);
-  return static_cast<bool>(h_correct);
-}
-
-
-/**
- * prints all parameters machine readable
- *
- * for params, see run_heap_verification-internal parameters
- */
-void print_machine_readable(
-        const unsigned pagesize,
-        const unsigned accessblocks,
-        const unsigned regionsize,
-        const unsigned wastefactor,
-        const bool resetfreedpages,
-        const unsigned blocks,
-        const unsigned threads,
-        const unsigned elemsPerSlot,
-        const size_t allocElemSize,
-        const size_t heapSize,
-        const size_t maxSpace,
-        const size_t maxSlots,
-        const unsigned long long usedSlots,
-        const float allocFrac,
-        const size_t wasted,
-        const bool correct
-        ){
-
-  std::string sep = ",";
-  std::stringstream h;
-  std::stringstream v;
-
-  h << "PagesizeByte"   << sep;
-  v << pagesize         << sep;
-
-  h << "Accessblocks"   << sep;
-  v << accessblocks     << sep;
-
-  h << "Regionsize"     << sep;
-  v << regionsize       << sep;
-
-  h << "Wastefactor"    << sep;
-  v << wasted           << sep;
-
-  h << "ResetFreedPage" << sep;
-  v << resetfreedpages  << sep;
-
-  h << "Gridsize"       << sep;
-  v <<  blocks          << sep;
-
-  h << "Blocksize"      << sep;
-  v << threads          << sep;
-
-  h << "ELEMS_PER_SLOT" << sep;
-  v << elemsPerSlot     << sep;
-
-  h << "allocElemByte"  << sep;
-  v << allocElemSize    << sep;
-
-  h << "heapsizeByte"   << sep;
-  v << heapSize         << sep;
-
-  h << "maxSpaceByte"   << sep;
-  v << maxSpace         << sep;
-
-  h << "maxSlots"       << sep;
-  v << maxSlots         << sep;
-
-  h << "usedSlots"      << sep;
-  v << usedSlots        << sep;
-
-  h << "allocFraction"  << sep;
-  v << allocFrac        << sep;
-
-  h << "wastedBytes"    << sep;
-  v << wasted           << sep;
-
-  h << "correct"        ;
-  v << correct          ;
-
-  std::cout << h.str() << std::endl;
-  std::cout << v.str() << std::endl;
-}
-
-
-/**
- * Verify the heap allocation of mallocMC
- *
- * Allocates as much memory as the heap allows. Make sure that allocated
- * memory actually holds the correct values without corrupting them. Will
- * fill the datastructure with values that are relative to the index and
- * later evalute, if the values inside stayed the same after allocating all
- * memory.
- * Datastructure: Array that holds up to nPointers pointers to arrays of size
- * ELEMS_PER_SLOT, each being of type allocElem_t.
- *
- * @return true if the verification was successful,
- *         false otherwise
- */
-bool run_heap_verification(
-    const size_t heapMB,
-    const unsigned blocks,
-    const unsigned threads,
-    const bool machine_readable
-    ){
-
-  cudaSetDeviceFlags(cudaDeviceMapHost);
-
-  const size_t heapSize         = size_t(1024U*1024U) * heapMB;
-  const size_t slotSize         = sizeof(allocElem_t)*ELEMS_PER_SLOT;
-  const size_t nPointers        = ceil(static_cast<float>(heapSize) / slotSize);
-  const size_t maxSlots         = heapSize/slotSize;
-  const size_t maxSpace         = maxSlots*slotSize + nPointers*sizeof(allocElem_t*);
-  bool correct                  = true;
-  const unsigned long long zero = 0;
-
-  dout() << "CreationPolicy Arguments:"                                         << std::endl;
-  dout() << "Pagesize:              "     << ScatterConfig::pagesize::value        << std::endl;
-  dout() << "Accessblocks:          "     << ScatterConfig::accessblocks::value    << std::endl;
-  dout() << "Regionsize:            "     << ScatterConfig::regionsize::value      << std::endl;
-  dout() << "Wastefactor:           "     << ScatterConfig::wastefactor::value     << std::endl;
-  dout() << "ResetFreedPages        "     << ScatterConfig::resetfreedpages::value << std::endl;
-  dout() << ""                                                                  << std::endl;
-  dout() << "Gridsize:              "     << blocks                             << std::endl;
-  dout() << "Blocksize:             "     << threads                            << std::endl;
-  dout() << "Allocated elements:    "     << ELEMS_PER_SLOT << " x "  << sizeof(allocElem_t);
-  dout() << "    Byte ("  << slotSize     << " Byte)"                           << std::endl;
-  dout() << "Heap:                  "     << heapSize << " Byte";
-  dout() << " (" << heapSize/pow(1024,2)  << " MByte)"                          << std::endl;
-  dout() << "max space w/ pointers: "     << maxSpace << " Byte";
-  dout() << " (" << maxSpace/pow(1024,2)  << " MByte)"                          << std::endl;
-  dout() << "maximum of elements:   "     << maxSlots                           << std::endl;
-
-  // initializing the heap
-  ScatterAllocator* mMC = new ScatterAllocator(heapSize);
-  allocElem_t** d_testData;
-  MALLOCMC_CUDA_CHECKED_CALL(cudaMalloc((void**) &d_testData, nPointers*sizeof(allocElem_t*)));
-
-  // allocating with mallocMC
-  unsigned long long usedSlots = 0;
-  unsigned long long sumAllocElems = 0;
-  allocate(d_testData, &usedSlots, &sumAllocElems, blocks, threads, mMC);
-
-  const float allocFrac = static_cast<float>(usedSlots)*100/maxSlots;
-  const size_t wasted = heapSize - static_cast<size_t>(usedSlots) * slotSize;
-  dout() << "allocated elements:    "   << usedSlots;
-  dout() << " (" << allocFrac << "%)"   << std::endl;
-  dout() << "wasted heap space:     "   << wasted << " Byte";
-  dout() << " (" << wasted/pow(1024,2)  << " MByte)" << std::endl;
-
-  // verifying on device
-  correct = correct && verify(d_testData,usedSlots,blocks,threads);
-
-  // damaging one cell
-  dout() << "damaging of element... ";
-  CUDA_CHECK_KERNEL_SYNC(damageElement<<<1,1>>>(d_testData));
-  dout() << "done" << std::endl;
-
-  // verifying on device
-  // THIS SHOULD FAIL (damage was done before!). Therefore, we must inverse the logic
-  correct = correct && !verify(d_testData,usedSlots,blocks,threads);
-
-
-  // release all memory
-  dout() << "deallocation...        ";
-  unsigned long long* d_dealloc_counter;
-  MALLOCMC_CUDA_CHECKED_CALL(cudaMalloc((void**) &d_dealloc_counter, sizeof(unsigned long long)));
-  MALLOCMC_CUDA_CHECKED_CALL(cudaMemcpy(d_dealloc_counter,&zero,sizeof(unsigned long long),cudaMemcpyHostToDevice));
-  CUDA_CHECK_KERNEL_SYNC(deallocAll<<<blocks,threads>>>(d_testData,d_dealloc_counter,static_cast<size_t>(usedSlots), *mMC ));
-  cudaFree(d_dealloc_counter);
-  cudaFree(d_testData);
-  delete mMC;
-
-  dout() << "done "<< std::endl;
-
-  if(machine_readable){
-    print_machine_readable(
-        ScatterConfig::pagesize::value,
-        ScatterConfig::accessblocks::value,
-        ScatterConfig::regionsize::value,
-        ScatterConfig::wastefactor::value,
-        ScatterConfig::resetfreedpages::value,
-        blocks,
-        threads,
-        ELEMS_PER_SLOT,
-        sizeof(allocElem_t),
-        heapSize,
-        maxSpace,
-        maxSlots,
-        usedSlots,
-        allocFrac,
-        wasted,
-        correct
-        );
-  }
-
-  return correct;
-}
diff --git a/thirdParty/mallocMC/tests/verify_heap_config.hpp b/thirdParty/mallocMC/tests/verify_heap_config.hpp
index e76337a61a..224ede0d2d 100644
--- a/thirdParty/mallocMC/tests/verify_heap_config.hpp
+++ b/thirdParty/mallocMC/tests/verify_heap_config.hpp
@@ -28,52 +28,51 @@
 
 #pragma once
 
-#include <boost/mpl/int.hpp>
-#include <boost/mpl/bool.hpp>
+#include <alpaka/alpaka.hpp>
+#include <mallocMC/mallocMC.hpp>
 
-// basic files for mallocMC
-#include "src/include/mallocMC/mallocMC_hostclass.hpp"
-
-// Load all available policies for mallocMC
-#include "src/include/mallocMC/CreationPolicies.hpp"
-#include "src/include/mallocMC/DistributionPolicies.hpp"
-#include "src/include/mallocMC/OOMPolicies.hpp"
-#include "src/include/mallocMC/ReservePoolPolicies.hpp"
-#include "src/include/mallocMC/AlignmentPolicies.hpp"
-    
+using Dim = alpaka::DimInt<1>;
+using Idx = std::size_t;
+// using Acc = alpaka::AccCpuThreads<Dim, Idx>;
+// using Acc = alpaka::AccCpuOmp2Threads<Dim, Idx>;
+using Acc = alpaka::AccGpuCudaRt<Dim, Idx>;
 
 // configurate the CreationPolicy "Scatter"
-struct ScatterConfig{
-    typedef boost::mpl::int_<4096>  pagesize;
-    typedef boost::mpl::int_<8>     accessblocks;
-    typedef boost::mpl::int_<16>    regionsize;
-    typedef boost::mpl::int_<2>     wastefactor;
-    typedef boost::mpl::bool_<false> resetfreedpages;
+struct ScatterConfig
+{
+    static constexpr auto pagesize = 4096;
+    static constexpr auto accessblocks = 8;
+    static constexpr auto regionsize = 16;
+    static constexpr auto wastefactor = 2;
+    static constexpr auto resetfreedpages = false;
 };
 
-struct ScatterHashParams{
-    typedef boost::mpl::int_<38183> hashingK;
-    typedef boost::mpl::int_<17497> hashingDistMP;
-    typedef boost::mpl::int_<1>     hashingDistWP;
-    typedef boost::mpl::int_<1>     hashingDistWPRel;
+struct ScatterHashParams
+{
+    static constexpr auto hashingK = 38183;
+    static constexpr auto hashingDistMP = 17497;
+    static constexpr auto hashingDistWP = 1;
+    static constexpr auto hashingDistWPRel = 1;
 };
 
 // configure the DistributionPolicy "XMallocSIMD"
-struct DistributionConfig{
-  typedef ScatterConfig::pagesize pagesize;
+struct DistributionConfig
+{
+    static constexpr auto pagesize = ScatterConfig::pagesize;
 };
 
 // configure the AlignmentPolicy "Shrink"
-struct AlignmentConfig{
-  typedef boost::mpl::int_<16> dataAlignment;
+struct AlignmentConfig
+{
+    static constexpr auto dataAlignment = 16;
 };
 
 // Define a new allocator and call it ScatterAllocator
 // which resembles the behaviour of ScatterAlloc
-typedef mallocMC::Allocator< 
-  mallocMC::CreationPolicies::Scatter<ScatterConfig,ScatterHashParams>,
-  mallocMC::DistributionPolicies::XMallocSIMD<DistributionConfig>,
-  mallocMC::OOMPolicies::ReturnNull,
-  mallocMC::ReservePoolPolicies::SimpleCudaMalloc,
-  mallocMC::AlignmentPolicies::Shrink<AlignmentConfig>
-  > ScatterAllocator;
+using ScatterAllocator = mallocMC::Allocator<
+    Acc,
+    mallocMC::CreationPolicies::Scatter<ScatterConfig, ScatterHashParams>,
+    mallocMC::DistributionPolicies::XMallocSIMD<DistributionConfig>,
+    mallocMC::OOMPolicies::ReturnNull,
+    mallocMC::ReservePoolPolicies::AlpakaBuf<Acc>,
+    mallocMC::AlignmentPolicies::Shrink<AlignmentConfig>>;
diff --git a/thirdParty/alpaka/thirdParty/catch2/include/catch2/catch.hpp b/thirdParty/mallocMC/thirdParty/catch2/include/catch2/catch.hpp
similarity index 100%
rename from thirdParty/alpaka/thirdParty/catch2/include/catch2/catch.hpp
rename to thirdParty/mallocMC/thirdParty/catch2/include/catch2/catch.hpp
diff --git a/thirdParty/nlohmann_json/CMakeLists.txt b/thirdParty/nlohmann_json/CMakeLists.txt
new file mode 100644
index 0000000000..fa77a5aed2
--- /dev/null
+++ b/thirdParty/nlohmann_json/CMakeLists.txt
@@ -0,0 +1,164 @@
+cmake_minimum_required(VERSION 3.1)
+
+##
+## PROJECT
+## name and version
+##
+project(nlohmann_json VERSION 3.9.1 LANGUAGES CXX)
+
+##
+## INCLUDE
+##
+##
+include(ExternalProject)
+
+##
+## OPTIONS
+##
+
+if (POLICY CMP0077)
+    # Allow CMake 3.13+ to override options when using FetchContent / add_subdirectory.
+    cmake_policy(SET CMP0077 NEW)
+endif ()
+
+option(JSON_BuildTests "Build the unit tests when BUILD_TESTING is enabled." ON)
+option(JSON_Install "Install CMake targets during install step." ON)
+option(JSON_MultipleHeaders "Use non-amalgamated version of the library." OFF)
+option(JSON_ImplicitConversions "Enable implicit conversions." ON)
+
+##
+## CONFIGURATION
+##
+include(GNUInstallDirs)
+
+set(NLOHMANN_JSON_TARGET_NAME               ${PROJECT_NAME})
+set(NLOHMANN_JSON_CONFIG_INSTALL_DIR        "${CMAKE_INSTALL_LIBDIR}/cmake/${PROJECT_NAME}" CACHE INTERNAL "")
+set(NLOHMANN_JSON_INCLUDE_INSTALL_DIR       "${CMAKE_INSTALL_INCLUDEDIR}")
+set(NLOHMANN_JSON_TARGETS_EXPORT_NAME       "${PROJECT_NAME}Targets")
+set(NLOHMANN_JSON_CMAKE_CONFIG_TEMPLATE     "cmake/config.cmake.in")
+set(NLOHMANN_JSON_CMAKE_CONFIG_DIR          "${CMAKE_CURRENT_BINARY_DIR}")
+set(NLOHMANN_JSON_CMAKE_VERSION_CONFIG_FILE "${NLOHMANN_JSON_CMAKE_CONFIG_DIR}/${PROJECT_NAME}ConfigVersion.cmake")
+set(NLOHMANN_JSON_CMAKE_PROJECT_CONFIG_FILE "${NLOHMANN_JSON_CMAKE_CONFIG_DIR}/${PROJECT_NAME}Config.cmake")
+set(NLOHMANN_JSON_CMAKE_PROJECT_TARGETS_FILE "${NLOHMANN_JSON_CMAKE_CONFIG_DIR}/${PROJECT_NAME}Targets.cmake")
+set(NLOHMANN_JSON_PKGCONFIG_INSTALL_DIR     "${CMAKE_INSTALL_LIBDIR}/pkgconfig")
+
+if (JSON_MultipleHeaders)
+    set(NLOHMANN_JSON_INCLUDE_BUILD_DIR "${PROJECT_SOURCE_DIR}/include/")
+    message(STATUS "Using the multi-header code from ${NLOHMANN_JSON_INCLUDE_BUILD_DIR}")
+else()
+    set(NLOHMANN_JSON_INCLUDE_BUILD_DIR "${PROJECT_SOURCE_DIR}/single_include/")
+    message(STATUS "Using the single-header code from ${NLOHMANN_JSON_INCLUDE_BUILD_DIR}")
+endif()
+
+if (NOT JSON_ImplicitConversions)
+    message(STATUS "Implicit conversions are disabled")
+endif()
+
+##
+## TARGET
+## create target and add include path
+##
+add_library(${NLOHMANN_JSON_TARGET_NAME} INTERFACE)
+add_library(${PROJECT_NAME}::${NLOHMANN_JSON_TARGET_NAME} ALIAS ${NLOHMANN_JSON_TARGET_NAME})
+if (${CMAKE_VERSION} VERSION_LESS "3.8.0")
+    target_compile_features(${NLOHMANN_JSON_TARGET_NAME} INTERFACE cxx_range_for)
+else()
+    target_compile_features(${NLOHMANN_JSON_TARGET_NAME} INTERFACE cxx_std_11)
+endif()
+
+target_compile_definitions(
+    ${NLOHMANN_JSON_TARGET_NAME}
+    INTERFACE
+    JSON_USE_IMPLICIT_CONVERSIONS=$<BOOL:${JSON_ImplicitConversions}>
+)
+
+target_include_directories(
+    ${NLOHMANN_JSON_TARGET_NAME}
+    INTERFACE
+    $<BUILD_INTERFACE:${NLOHMANN_JSON_INCLUDE_BUILD_DIR}>
+    $<INSTALL_INTERFACE:include>
+)
+
+## add debug view definition file for msvc (natvis)
+if (MSVC)
+    set(NLOHMANN_ADD_NATVIS TRUE)
+    set(NLOHMANN_NATVIS_FILE "nlohmann_json.natvis")
+    target_sources(
+        ${NLOHMANN_JSON_TARGET_NAME}
+        INTERFACE
+            $<INSTALL_INTERFACE:${NLOHMANN_NATVIS_FILE}>
+            $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/${NLOHMANN_NATVIS_FILE}>
+    )
+endif()
+
+# Generate the pkg-config file (installed below when JSON_Install is ON), so other tools can find this.
+configure_file(
+    "${CMAKE_CURRENT_SOURCE_DIR}/cmake/pkg-config.pc.in"
+    "${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}.pc"
+)
+
+##
+## TESTS
+## create and configure the unit test target
+##
+include(CTest) #adds option BUILD_TESTING (default ON)
+
+if(BUILD_TESTING AND JSON_BuildTests)
+    enable_testing()
+    add_subdirectory(test)
+endif()
+
+##
+## INSTALL
+## install header files, generate and install cmake config files for find_package()
+##
+include(CMakePackageConfigHelpers)
+# use a custom package version config file instead of
+# write_basic_package_version_file to ensure that it's architecture-independent
+# https://github.com/nlohmann/json/issues/1697
+configure_file(
+    "cmake/nlohmann_jsonConfigVersion.cmake.in"
+    ${NLOHMANN_JSON_CMAKE_VERSION_CONFIG_FILE}
+    @ONLY
+)
+configure_file(
+    ${NLOHMANN_JSON_CMAKE_CONFIG_TEMPLATE}
+    ${NLOHMANN_JSON_CMAKE_PROJECT_CONFIG_FILE}
+    @ONLY
+)
+
+if(JSON_Install)
+    install(
+        DIRECTORY ${NLOHMANN_JSON_INCLUDE_BUILD_DIR}
+        DESTINATION ${NLOHMANN_JSON_INCLUDE_INSTALL_DIR}
+    )
+    install(
+        FILES ${NLOHMANN_JSON_CMAKE_PROJECT_CONFIG_FILE} ${NLOHMANN_JSON_CMAKE_VERSION_CONFIG_FILE}
+        DESTINATION ${NLOHMANN_JSON_CONFIG_INSTALL_DIR}
+    )
+    if (NLOHMANN_ADD_NATVIS)
+        install(
+            FILES ${NLOHMANN_NATVIS_FILE}
+            DESTINATION .
+        )
+    endif()
+    export(
+        TARGETS ${NLOHMANN_JSON_TARGET_NAME}
+        NAMESPACE ${PROJECT_NAME}::
+        FILE ${NLOHMANN_JSON_CMAKE_PROJECT_TARGETS_FILE}
+    )
+    install(
+        TARGETS ${NLOHMANN_JSON_TARGET_NAME}
+        EXPORT ${NLOHMANN_JSON_TARGETS_EXPORT_NAME}
+        INCLUDES DESTINATION ${NLOHMANN_JSON_INCLUDE_INSTALL_DIR}
+    )
+    install(
+        EXPORT ${NLOHMANN_JSON_TARGETS_EXPORT_NAME}
+        NAMESPACE ${PROJECT_NAME}::
+        DESTINATION ${NLOHMANN_JSON_CONFIG_INSTALL_DIR}
+    )
+    install(
+        FILES "${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}.pc"
+        DESTINATION ${NLOHMANN_JSON_PKGCONFIG_INSTALL_DIR}
+    )
+endif()
diff --git a/thirdParty/nlohmann_json/LICENSE.MIT b/thirdParty/nlohmann_json/LICENSE.MIT
new file mode 100644
index 0000000000..ffef714b96
--- /dev/null
+++ b/thirdParty/nlohmann_json/LICENSE.MIT
@@ -0,0 +1,21 @@
+MIT License 
+
+Copyright (c) 2013-2020 Niels Lohmann
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/thirdParty/nlohmann_json/cmake/config.cmake.in b/thirdParty/nlohmann_json/cmake/config.cmake.in
new file mode 100644
index 0000000000..9a17a7d7b2
--- /dev/null
+++ b/thirdParty/nlohmann_json/cmake/config.cmake.in
@@ -0,0 +1,15 @@
+include(FindPackageHandleStandardArgs)
+set(${CMAKE_FIND_PACKAGE_NAME}_CONFIG ${CMAKE_CURRENT_LIST_FILE})  # path of this config file, for the CONFIG_MODE call below
+find_package_handle_standard_args(@PROJECT_NAME@ CONFIG_MODE)
+
+if(NOT TARGET @PROJECT_NAME@::@NLOHMANN_JSON_TARGET_NAME@)  # only import when the namespaced target does not exist yet
+    include("${CMAKE_CURRENT_LIST_DIR}/@NLOHMANN_JSON_TARGETS_EXPORT_NAME@.cmake")
+    if((NOT TARGET @NLOHMANN_JSON_TARGET_NAME@) AND
+       (NOT @PROJECT_NAME@_FIND_VERSION OR
+        @PROJECT_NAME@_FIND_VERSION VERSION_LESS 3.2.0))
+        add_library(@NLOHMANN_JSON_TARGET_NAME@ INTERFACE IMPORTED)  # un-namespaced target: only when no version, or one below 3.2.0, was requested
+        set_target_properties(@NLOHMANN_JSON_TARGET_NAME@ PROPERTIES
+            INTERFACE_LINK_LIBRARIES @PROJECT_NAME@::@NLOHMANN_JSON_TARGET_NAME@
+        )
+    endif()
+endif()
diff --git a/thirdParty/nlohmann_json/cmake/download_test_data.cmake b/thirdParty/nlohmann_json/cmake/download_test_data.cmake
new file mode 100644
index 0000000000..a3f3f199f1
--- /dev/null
+++ b/thirdParty/nlohmann_json/cmake/download_test_data.cmake
@@ -0,0 +1,56 @@
+set(JSON_TEST_DATA_URL     https://github.com/nlohmann/json_test_data)
+set(JSON_TEST_DATA_VERSION 3.0.0)  # cloned below as branch/tag "v<version>"
+
+# if variable is set, use test data from given directory rather than downloading them
+if(JSON_TestDataDirectory)
+    message(STATUS "Using test data in ${JSON_TestDataDirectory}.")
+    add_custom_target(download_test_data)  # target without commands: nothing to download in this branch
+    file(WRITE ${CMAKE_BINARY_DIR}/include/test_data.hpp "#define TEST_DATA_DIRECTORY \"${JSON_TestDataDirectory}\"\n")
+else()
+    find_package(Git)  # supplies GIT_EXECUTABLE for the clone command below
+    # target to download test data; the clone is skipped when a json_test_data directory already exists (checked with the `test` utility)
+    add_custom_target(download_test_data
+        COMMAND test -d json_test_data || ${GIT_EXECUTABLE} clone -c advice.detachedHead=false --branch v${JSON_TEST_DATA_VERSION} ${JSON_TEST_DATA_URL}.git --quiet --depth 1
+        COMMENT "Downloading test data from ${JSON_TEST_DATA_URL} (v${JSON_TEST_DATA_VERSION})"
+        WORKING_DIRECTORY ${CMAKE_BINARY_DIR}
+    )
+    # create a header with the path to the downloaded test data (the clone directory inside CMAKE_BINARY_DIR)
+    file(WRITE ${CMAKE_BINARY_DIR}/include/test_data.hpp "#define TEST_DATA_DIRECTORY \"${CMAKE_BINARY_DIR}/json_test_data\"\n")
+endif()
+
+# determine the operating system (for debug and support purposes); the result is only printed as a status message
+find_program(UNAME_COMMAND uname)
+find_program(VER_COMMAND ver)
+find_program(LSB_RELEASE_COMMAND lsb_release)
+find_program(SW_VERS_COMMAND sw_vers)
+set(OS_VERSION_STRINGS "${CMAKE_SYSTEM}")  # start from CMake's own system string; tool output is appended with "; "
+if (VER_COMMAND)  # each tool below is optional and skipped when find_program did not locate it
+    execute_process(COMMAND ${VER_COMMAND} OUTPUT_VARIABLE VER_COMMAND_RESULT OUTPUT_STRIP_TRAILING_WHITESPACE)
+    set(OS_VERSION_STRINGS "${OS_VERSION_STRINGS}; ${VER_COMMAND_RESULT}")
+endif()
+if (SW_VERS_COMMAND)
+    execute_process(COMMAND ${SW_VERS_COMMAND} OUTPUT_VARIABLE SW_VERS_COMMAND_RESULT OUTPUT_STRIP_TRAILING_WHITESPACE ERROR_QUIET)
+    string(REGEX REPLACE "[ ]*\n" "; " SW_VERS_COMMAND_RESULT "${SW_VERS_COMMAND_RESULT}")  # flatten multi-line output into one line
+    set(OS_VERSION_STRINGS "${OS_VERSION_STRINGS}; ${SW_VERS_COMMAND_RESULT}")
+endif()
+if (LSB_RELEASE_COMMAND)
+    execute_process(COMMAND ${LSB_RELEASE_COMMAND} -a OUTPUT_VARIABLE LSB_RELEASE_COMMAND_RESULT OUTPUT_STRIP_TRAILING_WHITESPACE ERROR_QUIET)
+    string(REGEX REPLACE "[ ]*\n" "; " LSB_RELEASE_COMMAND_RESULT "${LSB_RELEASE_COMMAND_RESULT}")  # flatten multi-line output into one line
+    set(OS_VERSION_STRINGS "${OS_VERSION_STRINGS}; ${LSB_RELEASE_COMMAND_RESULT}")
+endif()
+if (UNAME_COMMAND)
+    execute_process(COMMAND ${UNAME_COMMAND} -a OUTPUT_VARIABLE UNAME_COMMAND_RESULT OUTPUT_STRIP_TRAILING_WHITESPACE ERROR_QUIET)
+    set(OS_VERSION_STRINGS "${OS_VERSION_STRINGS}; ${UNAME_COMMAND_RESULT}")
+endif()
+
+message(STATUS "Operating system: ${OS_VERSION_STRINGS}")
+
+# determine the compiler (for debug and support purposes); this only builds a status string and must not modify CMAKE_CXX_COMPILER
+if (MSVC)  # compiler is run without arguments; stdout and stderr are both captured into CXX_VERSION_RESULT
+    execute_process(COMMAND ${CMAKE_CXX_COMPILER} OUTPUT_VARIABLE CXX_VERSION_RESULT OUTPUT_STRIP_TRAILING_WHITESPACE ERROR_VARIABLE CXX_VERSION_RESULT ERROR_STRIP_TRAILING_WHITESPACE)
+    set(CXX_VERSION_RESULT "${CXX_VERSION_RESULT}; MSVC_VERSION=${MSVC_VERSION}; MSVC_TOOLSET_VERSION=${MSVC_TOOLSET_VERSION}")
+else()
+    execute_process(COMMAND ${CMAKE_CXX_COMPILER} --version OUTPUT_VARIABLE CXX_VERSION_RESULT OUTPUT_STRIP_TRAILING_WHITESPACE)
+endif()
+string(REGEX REPLACE "[ ]*\n" "; " CXX_VERSION_RESULT "${CXX_VERSION_RESULT}")  # flatten multi-line output into one line
+message(STATUS "Compiler: ${CXX_VERSION_RESULT}")
diff --git a/thirdParty/nlohmann_json/cmake/nlohmann_jsonConfigVersion.cmake.in b/thirdParty/nlohmann_json/cmake/nlohmann_jsonConfigVersion.cmake.in
new file mode 100644
index 0000000000..1091085973
--- /dev/null
+++ b/thirdParty/nlohmann_json/cmake/nlohmann_jsonConfigVersion.cmake.in
@@ -0,0 +1,20 @@
+# This is essentially cmake's BasicConfigVersion-SameMajorVersion.cmake.in but
+# without the 32/64-bit check.  Since json is a header-only library, it doesn't
+# matter if it was built on a different platform than what it is used on (see
+# https://github.com/nlohmann/json/issues/1697).
+set(PACKAGE_VERSION "@PROJECT_VERSION@")
+
+if(PACKAGE_VERSION VERSION_LESS PACKAGE_FIND_VERSION)  # this package is older than the requested version
+  set(PACKAGE_VERSION_COMPATIBLE FALSE)
+else()
+
+  if(PACKAGE_FIND_VERSION_MAJOR STREQUAL "@PROJECT_VERSION_MAJOR@")  # compatible only within the same major version
+    set(PACKAGE_VERSION_COMPATIBLE TRUE)
+  else()
+    set(PACKAGE_VERSION_COMPATIBLE FALSE)
+  endif()
+
+  if(PACKAGE_FIND_VERSION STREQUAL PACKAGE_VERSION)  # exact match is a plain string comparison of the two versions
+      set(PACKAGE_VERSION_EXACT TRUE)
+  endif()
+endif()
diff --git a/thirdParty/nlohmann_json/cmake/pkg-config.pc.in b/thirdParty/nlohmann_json/cmake/pkg-config.pc.in
new file mode 100644
index 0000000000..3541abf0ba
--- /dev/null
+++ b/thirdParty/nlohmann_json/cmake/pkg-config.pc.in
@@ -0,0 +1,4 @@
+Name: ${PROJECT_NAME}
+Description: JSON for Modern C++
+Version: ${PROJECT_VERSION}
+Cflags: -I${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_INCLUDEDIR}
diff --git a/thirdParty/nlohmann_json/single_include/nlohmann/json.hpp b/thirdParty/nlohmann_json/single_include/nlohmann/json.hpp
new file mode 100644
index 0000000000..a70aaf8cbc
--- /dev/null
+++ b/thirdParty/nlohmann_json/single_include/nlohmann/json.hpp
@@ -0,0 +1,25447 @@
+/*
+    __ _____ _____ _____
+ __|  |   __|     |   | |  JSON for Modern C++
+|  |  |__   |  |  | | | |  version 3.9.1
+|_____|_____|_____|_|___|  https://github.com/nlohmann/json
+
+Licensed under the MIT License <http://opensource.org/licenses/MIT>.
+SPDX-License-Identifier: MIT
+Copyright (c) 2013-2019 Niels Lohmann <http://nlohmann.me>.
+
+Permission is hereby  granted, free of charge, to any  person obtaining a copy
+of this software and associated  documentation files (the "Software"), to deal
+in the Software  without restriction, including without  limitation the rights
+to  use, copy,  modify, merge,  publish, distribute,  sublicense, and/or  sell
+copies  of  the Software,  and  to  permit persons  to  whom  the Software  is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE  IS PROVIDED "AS  IS", WITHOUT WARRANTY  OF ANY KIND,  EXPRESS OR
+IMPLIED,  INCLUDING BUT  NOT  LIMITED TO  THE  WARRANTIES OF  MERCHANTABILITY,
+FITNESS FOR  A PARTICULAR PURPOSE AND  NONINFRINGEMENT. IN NO EVENT  SHALL THE
+AUTHORS  OR COPYRIGHT  HOLDERS  BE  LIABLE FOR  ANY  CLAIM,  DAMAGES OR  OTHER
+LIABILITY, WHETHER IN AN ACTION OF  CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE  OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+*/
+
+#ifndef INCLUDE_NLOHMANN_JSON_HPP_
+#define INCLUDE_NLOHMANN_JSON_HPP_
+
+#define NLOHMANN_JSON_VERSION_MAJOR 3
+#define NLOHMANN_JSON_VERSION_MINOR 9
+#define NLOHMANN_JSON_VERSION_PATCH 1
+
+#include <algorithm> // all_of, find, for_each
+#include <cstddef> // nullptr_t, ptrdiff_t, size_t
+#include <functional> // hash, less
+#include <initializer_list> // initializer_list
+#include <iosfwd> // istream, ostream
+#include <iterator> // random_access_iterator_tag
+#include <memory> // unique_ptr
+#include <numeric> // accumulate
+#include <string> // string, stoi, to_string
+#include <utility> // declval, forward, move, pair, swap
+#include <vector> // vector
+
+// #include <nlohmann/adl_serializer.hpp>
+
+
+#include <utility>
+
+// #include <nlohmann/detail/conversions/from_json.hpp>
+
+
+#include <algorithm> // transform
+#include <array> // array
+#include <forward_list> // forward_list
+#include <iterator> // inserter, front_inserter, end
+#include <map> // map
+#include <string> // string
+#include <tuple> // tuple, make_tuple
+#include <type_traits> // is_arithmetic, is_same, is_enum, underlying_type, is_convertible
+#include <unordered_map> // unordered_map
+#include <utility> // pair, declval
+#include <valarray> // valarray
+
+// #include <nlohmann/detail/exceptions.hpp>
+
+
+#include <exception> // exception
+#include <stdexcept> // runtime_error
+#include <string> // to_string
+
+// #include <nlohmann/detail/input/position_t.hpp>
+
+
+#include <cstddef> // size_t
+
+namespace nlohmann
+{
+namespace detail
+{
+/// struct to capture the start position of the current token
+struct position_t
+{
+    /// the total number of characters read
+    std::size_t chars_read_total = 0;
+    /// the number of characters read in the current line
+    std::size_t chars_read_current_line = 0;
+    /// the number of lines read
+    std::size_t lines_read = 0;
+
+    /// implicit conversion to std::size_t (yields chars_read_total) to preserve SAX interface
+    constexpr operator std::size_t() const
+    {
+        return chars_read_total;
+    }
+};
+
+} // namespace detail
+} // namespace nlohmann
+
+// #include <nlohmann/detail/macro_scope.hpp>
+
+
+#include <utility> // pair
+// #include <nlohmann/thirdparty/hedley/hedley.hpp>
+/* Hedley - https://nemequ.github.io/hedley
+ * Created by Evan Nemerson <evan@nemerson.com>
+ *
+ * To the extent possible under law, the author(s) have dedicated all
+ * copyright and related and neighboring rights to this software to
+ * the public domain worldwide. This software is distributed without
+ * any warranty.
+ *
+ * For details, see <http://creativecommons.org/publicdomain/zero/1.0/>.
+ * SPDX-License-Identifier: CC0-1.0
+ */
+
+#if !defined(JSON_HEDLEY_VERSION) || (JSON_HEDLEY_VERSION < 13)
+#if defined(JSON_HEDLEY_VERSION)
+    #undef JSON_HEDLEY_VERSION
+#endif
+#define JSON_HEDLEY_VERSION 13
+
+#if defined(JSON_HEDLEY_STRINGIFY_EX)
+    #undef JSON_HEDLEY_STRINGIFY_EX
+#endif
+#define JSON_HEDLEY_STRINGIFY_EX(x) #x
+
+#if defined(JSON_HEDLEY_STRINGIFY)
+    #undef JSON_HEDLEY_STRINGIFY
+#endif
+#define JSON_HEDLEY_STRINGIFY(x) JSON_HEDLEY_STRINGIFY_EX(x)
+
+#if defined(JSON_HEDLEY_CONCAT_EX)
+    #undef JSON_HEDLEY_CONCAT_EX
+#endif
+#define JSON_HEDLEY_CONCAT_EX(a,b) a##b
+
+#if defined(JSON_HEDLEY_CONCAT)
+    #undef JSON_HEDLEY_CONCAT
+#endif
+#define JSON_HEDLEY_CONCAT(a,b) JSON_HEDLEY_CONCAT_EX(a,b)
+
+#if defined(JSON_HEDLEY_CONCAT3_EX)
+    #undef JSON_HEDLEY_CONCAT3_EX
+#endif
+#define JSON_HEDLEY_CONCAT3_EX(a,b,c) a##b##c
+
+#if defined(JSON_HEDLEY_CONCAT3)
+    #undef JSON_HEDLEY_CONCAT3
+#endif
+#define JSON_HEDLEY_CONCAT3(a,b,c) JSON_HEDLEY_CONCAT3_EX(a,b,c)
+
+#if defined(JSON_HEDLEY_VERSION_ENCODE)
+    #undef JSON_HEDLEY_VERSION_ENCODE
+#endif
+#define JSON_HEDLEY_VERSION_ENCODE(major,minor,revision) (((major) * 1000000) + ((minor) * 1000) + (revision))
+
+#if defined(JSON_HEDLEY_VERSION_DECODE_MAJOR)
+    #undef JSON_HEDLEY_VERSION_DECODE_MAJOR
+#endif
+#define JSON_HEDLEY_VERSION_DECODE_MAJOR(version) ((version) / 1000000)
+
+#if defined(JSON_HEDLEY_VERSION_DECODE_MINOR)
+    #undef JSON_HEDLEY_VERSION_DECODE_MINOR
+#endif
+#define JSON_HEDLEY_VERSION_DECODE_MINOR(version) (((version) % 1000000) / 1000)
+
+#if defined(JSON_HEDLEY_VERSION_DECODE_REVISION)
+    #undef JSON_HEDLEY_VERSION_DECODE_REVISION
+#endif
+#define JSON_HEDLEY_VERSION_DECODE_REVISION(version) ((version) % 1000)
+
+#if defined(JSON_HEDLEY_GNUC_VERSION)
+    #undef JSON_HEDLEY_GNUC_VERSION
+#endif
+#if defined(__GNUC__) && defined(__GNUC_PATCHLEVEL__)
+    #define JSON_HEDLEY_GNUC_VERSION JSON_HEDLEY_VERSION_ENCODE(__GNUC__, __GNUC_MINOR__, __GNUC_PATCHLEVEL__)
+#elif defined(__GNUC__)
+    #define JSON_HEDLEY_GNUC_VERSION JSON_HEDLEY_VERSION_ENCODE(__GNUC__, __GNUC_MINOR__, 0)
+#endif
+
+#if defined(JSON_HEDLEY_GNUC_VERSION_CHECK)
+    #undef JSON_HEDLEY_GNUC_VERSION_CHECK
+#endif
+#if defined(JSON_HEDLEY_GNUC_VERSION)
+    #define JSON_HEDLEY_GNUC_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_GNUC_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch))
+#else
+    #define JSON_HEDLEY_GNUC_VERSION_CHECK(major,minor,patch) (0)
+#endif
+
+#if defined(JSON_HEDLEY_MSVC_VERSION)
+    #undef JSON_HEDLEY_MSVC_VERSION
+#endif
+#if defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 140000000)
+    #define JSON_HEDLEY_MSVC_VERSION JSON_HEDLEY_VERSION_ENCODE(_MSC_FULL_VER / 10000000, (_MSC_FULL_VER % 10000000) / 100000, (_MSC_FULL_VER % 100000) / 100)
+#elif defined(_MSC_FULL_VER)
+    #define JSON_HEDLEY_MSVC_VERSION JSON_HEDLEY_VERSION_ENCODE(_MSC_FULL_VER / 1000000, (_MSC_FULL_VER % 1000000) / 10000, (_MSC_FULL_VER % 10000) / 10)
+#elif defined(_MSC_VER)
+    #define JSON_HEDLEY_MSVC_VERSION JSON_HEDLEY_VERSION_ENCODE(_MSC_VER / 100, _MSC_VER % 100, 0)
+#endif
+
+#if defined(JSON_HEDLEY_MSVC_VERSION_CHECK)
+    #undef JSON_HEDLEY_MSVC_VERSION_CHECK
+#endif
+#if !defined(_MSC_VER)
+    #define JSON_HEDLEY_MSVC_VERSION_CHECK(major,minor,patch) (0)
+#elif defined(_MSC_VER) && (_MSC_VER >= 1400)
+    #define JSON_HEDLEY_MSVC_VERSION_CHECK(major,minor,patch) (_MSC_FULL_VER >= ((major * 10000000) + (minor * 100000) + (patch)))
+#elif defined(_MSC_VER) && (_MSC_VER >= 1200)
+    #define JSON_HEDLEY_MSVC_VERSION_CHECK(major,minor,patch) (_MSC_FULL_VER >= ((major * 1000000) + (minor * 10000) + (patch)))
+#else
+    #define JSON_HEDLEY_MSVC_VERSION_CHECK(major,minor,patch) (_MSC_VER >= ((major * 100) + (minor)))
+#endif
+
+#if defined(JSON_HEDLEY_INTEL_VERSION)
+    #undef JSON_HEDLEY_INTEL_VERSION
+#endif
+#if defined(__INTEL_COMPILER) && defined(__INTEL_COMPILER_UPDATE)
+    #define JSON_HEDLEY_INTEL_VERSION JSON_HEDLEY_VERSION_ENCODE(__INTEL_COMPILER / 100, __INTEL_COMPILER % 100, __INTEL_COMPILER_UPDATE)
+#elif defined(__INTEL_COMPILER)
+    #define JSON_HEDLEY_INTEL_VERSION JSON_HEDLEY_VERSION_ENCODE(__INTEL_COMPILER / 100, __INTEL_COMPILER % 100, 0)
+#endif
+
+#if defined(JSON_HEDLEY_INTEL_VERSION_CHECK)
+    #undef JSON_HEDLEY_INTEL_VERSION_CHECK
+#endif
+#if defined(JSON_HEDLEY_INTEL_VERSION)
+    #define JSON_HEDLEY_INTEL_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_INTEL_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch))
+#else
+    #define JSON_HEDLEY_INTEL_VERSION_CHECK(major,minor,patch) (0)
+#endif
+
+#if defined(JSON_HEDLEY_PGI_VERSION)
+    #undef JSON_HEDLEY_PGI_VERSION
+#endif
+#if defined(__PGI) && defined(__PGIC__) && defined(__PGIC_MINOR__) && defined(__PGIC_PATCHLEVEL__)
+    #define JSON_HEDLEY_PGI_VERSION JSON_HEDLEY_VERSION_ENCODE(__PGIC__, __PGIC_MINOR__, __PGIC_PATCHLEVEL__)
+#endif
+
+#if defined(JSON_HEDLEY_PGI_VERSION_CHECK)
+    #undef JSON_HEDLEY_PGI_VERSION_CHECK
+#endif
+#if defined(JSON_HEDLEY_PGI_VERSION)
+    #define JSON_HEDLEY_PGI_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_PGI_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch))
+#else
+    #define JSON_HEDLEY_PGI_VERSION_CHECK(major,minor,patch) (0)
+#endif
+
+#if defined(JSON_HEDLEY_SUNPRO_VERSION)
+    #undef JSON_HEDLEY_SUNPRO_VERSION
+#endif
+#if defined(__SUNPRO_C) && (__SUNPRO_C > 0x1000)
+    #define JSON_HEDLEY_SUNPRO_VERSION JSON_HEDLEY_VERSION_ENCODE((((__SUNPRO_C >> 16) & 0xf) * 10) + ((__SUNPRO_C >> 12) & 0xf), (((__SUNPRO_C >> 8) & 0xf) * 10) + ((__SUNPRO_C >> 4) & 0xf), (__SUNPRO_C & 0xf) * 10)
+#elif defined(__SUNPRO_C)
+    #define JSON_HEDLEY_SUNPRO_VERSION JSON_HEDLEY_VERSION_ENCODE((__SUNPRO_C >> 8) & 0xf, (__SUNPRO_C >> 4) & 0xf, (__SUNPRO_C) & 0xf)
+#elif defined(__SUNPRO_CC) && (__SUNPRO_CC > 0x1000)
+    #define JSON_HEDLEY_SUNPRO_VERSION JSON_HEDLEY_VERSION_ENCODE((((__SUNPRO_CC >> 16) & 0xf) * 10) + ((__SUNPRO_CC >> 12) & 0xf), (((__SUNPRO_CC >> 8) & 0xf) * 10) + ((__SUNPRO_CC >> 4) & 0xf), (__SUNPRO_CC & 0xf) * 10)
+#elif defined(__SUNPRO_CC)
+    #define JSON_HEDLEY_SUNPRO_VERSION JSON_HEDLEY_VERSION_ENCODE((__SUNPRO_CC >> 8) & 0xf, (__SUNPRO_CC >> 4) & 0xf, (__SUNPRO_CC) & 0xf)
+#endif
+
+#if defined(JSON_HEDLEY_SUNPRO_VERSION_CHECK)
+    #undef JSON_HEDLEY_SUNPRO_VERSION_CHECK
+#endif
+#if defined(JSON_HEDLEY_SUNPRO_VERSION)
+    #define JSON_HEDLEY_SUNPRO_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_SUNPRO_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch))
+#else
+    #define JSON_HEDLEY_SUNPRO_VERSION_CHECK(major,minor,patch) (0)
+#endif
+
+#if defined(JSON_HEDLEY_EMSCRIPTEN_VERSION)
+    #undef JSON_HEDLEY_EMSCRIPTEN_VERSION
+#endif
+#if defined(__EMSCRIPTEN__)
+    #define JSON_HEDLEY_EMSCRIPTEN_VERSION JSON_HEDLEY_VERSION_ENCODE(__EMSCRIPTEN_major__, __EMSCRIPTEN_minor__, __EMSCRIPTEN_tiny__)
+#endif
+
+#if defined(JSON_HEDLEY_EMSCRIPTEN_VERSION_CHECK)
+    #undef JSON_HEDLEY_EMSCRIPTEN_VERSION_CHECK
+#endif
+#if defined(JSON_HEDLEY_EMSCRIPTEN_VERSION)
+    #define JSON_HEDLEY_EMSCRIPTEN_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_EMSCRIPTEN_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch))
+#else
+    #define JSON_HEDLEY_EMSCRIPTEN_VERSION_CHECK(major,minor,patch) (0)
+#endif
+
+#if defined(JSON_HEDLEY_ARM_VERSION)
+    #undef JSON_HEDLEY_ARM_VERSION
+#endif
+#if defined(__CC_ARM) && defined(__ARMCOMPILER_VERSION)
+    #define JSON_HEDLEY_ARM_VERSION JSON_HEDLEY_VERSION_ENCODE(__ARMCOMPILER_VERSION / 1000000, (__ARMCOMPILER_VERSION % 1000000) / 10000, (__ARMCOMPILER_VERSION % 10000) / 100)
+#elif defined(__CC_ARM) && defined(__ARMCC_VERSION)
+    #define JSON_HEDLEY_ARM_VERSION JSON_HEDLEY_VERSION_ENCODE(__ARMCC_VERSION / 1000000, (__ARMCC_VERSION % 1000000) / 10000, (__ARMCC_VERSION % 10000) / 100)
+#endif
+
+#if defined(JSON_HEDLEY_ARM_VERSION_CHECK)
+    #undef JSON_HEDLEY_ARM_VERSION_CHECK
+#endif
+#if defined(JSON_HEDLEY_ARM_VERSION)
+    #define JSON_HEDLEY_ARM_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_ARM_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch))
+#else
+    #define JSON_HEDLEY_ARM_VERSION_CHECK(major,minor,patch) (0)
+#endif
+
+#if defined(JSON_HEDLEY_IBM_VERSION)
+    #undef JSON_HEDLEY_IBM_VERSION
+#endif
+#if defined(__ibmxl__)
+    #define JSON_HEDLEY_IBM_VERSION JSON_HEDLEY_VERSION_ENCODE(__ibmxl_version__, __ibmxl_release__, __ibmxl_modification__)
+#elif defined(__xlC__) && defined(__xlC_ver__)
+    #define JSON_HEDLEY_IBM_VERSION JSON_HEDLEY_VERSION_ENCODE(__xlC__ >> 8, __xlC__ & 0xff, (__xlC_ver__ >> 8) & 0xff)
+#elif defined(__xlC__)
+    #define JSON_HEDLEY_IBM_VERSION JSON_HEDLEY_VERSION_ENCODE(__xlC__ >> 8, __xlC__ & 0xff, 0)
+#endif
+
+#if defined(JSON_HEDLEY_IBM_VERSION_CHECK)
+    #undef JSON_HEDLEY_IBM_VERSION_CHECK
+#endif
+#if defined(JSON_HEDLEY_IBM_VERSION)
+    #define JSON_HEDLEY_IBM_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_IBM_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch))
+#else
+    #define JSON_HEDLEY_IBM_VERSION_CHECK(major,minor,patch) (0)
+#endif
+
+#if defined(JSON_HEDLEY_TI_VERSION)
+    #undef JSON_HEDLEY_TI_VERSION
+#endif
+#if \
+    defined(__TI_COMPILER_VERSION__) && \
+    ( \
+      defined(__TMS470__) || defined(__TI_ARM__) || \
+      defined(__MSP430__) || \
+      defined(__TMS320C2000__) \
+    )
+#if (__TI_COMPILER_VERSION__ >= 16000000)
+    #define JSON_HEDLEY_TI_VERSION JSON_HEDLEY_VERSION_ENCODE(__TI_COMPILER_VERSION__ / 1000000, (__TI_COMPILER_VERSION__ % 1000000) / 1000, (__TI_COMPILER_VERSION__ % 1000))
+#endif
+#endif
+
+#if defined(JSON_HEDLEY_TI_VERSION_CHECK)
+    #undef JSON_HEDLEY_TI_VERSION_CHECK
+#endif
+#if defined(JSON_HEDLEY_TI_VERSION)
+    #define JSON_HEDLEY_TI_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_TI_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch))
+#else
+    #define JSON_HEDLEY_TI_VERSION_CHECK(major,minor,patch) (0)
+#endif
+
+#if defined(JSON_HEDLEY_TI_CL2000_VERSION)
+    #undef JSON_HEDLEY_TI_CL2000_VERSION
+#endif
+#if defined(__TI_COMPILER_VERSION__) && defined(__TMS320C2000__)
+    #define JSON_HEDLEY_TI_CL2000_VERSION JSON_HEDLEY_VERSION_ENCODE(__TI_COMPILER_VERSION__ / 1000000, (__TI_COMPILER_VERSION__ % 1000000) / 1000, (__TI_COMPILER_VERSION__ % 1000))
+#endif
+
+#if defined(JSON_HEDLEY_TI_CL2000_VERSION_CHECK)
+    #undef JSON_HEDLEY_TI_CL2000_VERSION_CHECK
+#endif
+#if defined(JSON_HEDLEY_TI_CL2000_VERSION)
+    #define JSON_HEDLEY_TI_CL2000_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_TI_CL2000_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch))
+#else
+    #define JSON_HEDLEY_TI_CL2000_VERSION_CHECK(major,minor,patch) (0)
+#endif
+
+#if defined(JSON_HEDLEY_TI_CL430_VERSION)
+    #undef JSON_HEDLEY_TI_CL430_VERSION
+#endif
+#if defined(__TI_COMPILER_VERSION__) && defined(__MSP430__)
+    #define JSON_HEDLEY_TI_CL430_VERSION JSON_HEDLEY_VERSION_ENCODE(__TI_COMPILER_VERSION__ / 1000000, (__TI_COMPILER_VERSION__ % 1000000) / 1000, (__TI_COMPILER_VERSION__ % 1000))
+#endif
+
+#if defined(JSON_HEDLEY_TI_CL430_VERSION_CHECK)
+    #undef JSON_HEDLEY_TI_CL430_VERSION_CHECK
+#endif
+#if defined(JSON_HEDLEY_TI_CL430_VERSION)
+    #define JSON_HEDLEY_TI_CL430_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_TI_CL430_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch))
+#else
+    #define JSON_HEDLEY_TI_CL430_VERSION_CHECK(major,minor,patch) (0)
+#endif
+
+#if defined(JSON_HEDLEY_TI_ARMCL_VERSION)
+    #undef JSON_HEDLEY_TI_ARMCL_VERSION
+#endif
+#if defined(__TI_COMPILER_VERSION__) && (defined(__TMS470__) || defined(__TI_ARM__))
+    #define JSON_HEDLEY_TI_ARMCL_VERSION JSON_HEDLEY_VERSION_ENCODE(__TI_COMPILER_VERSION__ / 1000000, (__TI_COMPILER_VERSION__ % 1000000) / 1000, (__TI_COMPILER_VERSION__ % 1000))
+#endif
+
+#if defined(JSON_HEDLEY_TI_ARMCL_VERSION_CHECK)
+    #undef JSON_HEDLEY_TI_ARMCL_VERSION_CHECK
+#endif
+#if defined(JSON_HEDLEY_TI_ARMCL_VERSION)
+    #define JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_TI_ARMCL_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch))
+#else
+    #define JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(major,minor,patch) (0)
+#endif
+
+#if defined(JSON_HEDLEY_TI_CL6X_VERSION)
+    #undef JSON_HEDLEY_TI_CL6X_VERSION
+#endif
+#if defined(__TI_COMPILER_VERSION__) && defined(__TMS320C6X__)
+    #define JSON_HEDLEY_TI_CL6X_VERSION JSON_HEDLEY_VERSION_ENCODE(__TI_COMPILER_VERSION__ / 1000000, (__TI_COMPILER_VERSION__ % 1000000) / 1000, (__TI_COMPILER_VERSION__ % 1000))
+#endif
+
+#if defined(JSON_HEDLEY_TI_CL6X_VERSION_CHECK)
+    #undef JSON_HEDLEY_TI_CL6X_VERSION_CHECK
+#endif
+#if defined(JSON_HEDLEY_TI_CL6X_VERSION)
+    #define JSON_HEDLEY_TI_CL6X_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_TI_CL6X_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch))
+#else
+    #define JSON_HEDLEY_TI_CL6X_VERSION_CHECK(major,minor,patch) (0)
+#endif
+
+#if defined(JSON_HEDLEY_TI_CL7X_VERSION)
+    #undef JSON_HEDLEY_TI_CL7X_VERSION
+#endif
+#if defined(__TI_COMPILER_VERSION__) && defined(__C7000__)
+    #define JSON_HEDLEY_TI_CL7X_VERSION JSON_HEDLEY_VERSION_ENCODE(__TI_COMPILER_VERSION__ / 1000000, (__TI_COMPILER_VERSION__ % 1000000) / 1000, (__TI_COMPILER_VERSION__ % 1000))
+#endif
+
+#if defined(JSON_HEDLEY_TI_CL7X_VERSION_CHECK)
+    #undef JSON_HEDLEY_TI_CL7X_VERSION_CHECK
+#endif
+#if defined(JSON_HEDLEY_TI_CL7X_VERSION)
+    #define JSON_HEDLEY_TI_CL7X_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_TI_CL7X_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch))
+#else
+    #define JSON_HEDLEY_TI_CL7X_VERSION_CHECK(major,minor,patch) (0)
+#endif
+
+#if defined(JSON_HEDLEY_TI_CLPRU_VERSION)
+    #undef JSON_HEDLEY_TI_CLPRU_VERSION
+#endif
+#if defined(__TI_COMPILER_VERSION__) && defined(__PRU__)
+    #define JSON_HEDLEY_TI_CLPRU_VERSION JSON_HEDLEY_VERSION_ENCODE(__TI_COMPILER_VERSION__ / 1000000, (__TI_COMPILER_VERSION__ % 1000000) / 1000, (__TI_COMPILER_VERSION__ % 1000))
+#endif
+
+#if defined(JSON_HEDLEY_TI_CLPRU_VERSION_CHECK)
+    #undef JSON_HEDLEY_TI_CLPRU_VERSION_CHECK
+#endif
+#if defined(JSON_HEDLEY_TI_CLPRU_VERSION)
+    #define JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_TI_CLPRU_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch))
+#else
+    #define JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(major,minor,patch) (0)
+#endif
+
+#if defined(JSON_HEDLEY_CRAY_VERSION)
+    #undef JSON_HEDLEY_CRAY_VERSION
+#endif
+#if defined(_CRAYC)
+    #if defined(_RELEASE_PATCHLEVEL)
+        #define JSON_HEDLEY_CRAY_VERSION JSON_HEDLEY_VERSION_ENCODE(_RELEASE_MAJOR, _RELEASE_MINOR, _RELEASE_PATCHLEVEL)
+    #else
+        #define JSON_HEDLEY_CRAY_VERSION JSON_HEDLEY_VERSION_ENCODE(_RELEASE_MAJOR, _RELEASE_MINOR, 0)
+    #endif
+#endif
+
+#if defined(JSON_HEDLEY_CRAY_VERSION_CHECK)
+    #undef JSON_HEDLEY_CRAY_VERSION_CHECK
+#endif
+#if defined(JSON_HEDLEY_CRAY_VERSION)
+    #define JSON_HEDLEY_CRAY_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_CRAY_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch))
+#else
+    #define JSON_HEDLEY_CRAY_VERSION_CHECK(major,minor,patch) (0)
+#endif
+
+#if defined(JSON_HEDLEY_IAR_VERSION)
+    #undef JSON_HEDLEY_IAR_VERSION
+#endif
+#if defined(__IAR_SYSTEMS_ICC__) /* encode the IAR compiler version from its __VER__ macro */
+    #if __VER__ > 1000 /* decoded as major * 1000000 + minor * 1000 + patch */
+        #define JSON_HEDLEY_IAR_VERSION JSON_HEDLEY_VERSION_ENCODE((__VER__ / 1000000), ((__VER__ / 1000) % 1000), (__VER__ % 1000))
+    #else /* __VER__ <= 1000: decoded as major * 100 + minor */
+        #define JSON_HEDLEY_IAR_VERSION JSON_HEDLEY_VERSION_ENCODE(__VER__ / 100, __VER__ % 100, 0)
+    #endif
+#endif
+
+#if defined(JSON_HEDLEY_IAR_VERSION_CHECK)
+    #undef JSON_HEDLEY_IAR_VERSION_CHECK
+#endif
+#if defined(JSON_HEDLEY_IAR_VERSION)
+    #define JSON_HEDLEY_IAR_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_IAR_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch))
+#else
+    #define JSON_HEDLEY_IAR_VERSION_CHECK(major,minor,patch) (0)
+#endif
+
+#if defined(JSON_HEDLEY_TINYC_VERSION)
+    #undef JSON_HEDLEY_TINYC_VERSION
+#endif
+#if defined(__TINYC__)
+    #define JSON_HEDLEY_TINYC_VERSION JSON_HEDLEY_VERSION_ENCODE(__TINYC__ / 1000, (__TINYC__ / 100) % 10, __TINYC__ % 100)
+#endif
+
+#if defined(JSON_HEDLEY_TINYC_VERSION_CHECK)
+    #undef JSON_HEDLEY_TINYC_VERSION_CHECK
+#endif
+#if defined(JSON_HEDLEY_TINYC_VERSION)
+    #define JSON_HEDLEY_TINYC_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_TINYC_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch))
+#else
+    #define JSON_HEDLEY_TINYC_VERSION_CHECK(major,minor,patch) (0)
+#endif
+
+#if defined(JSON_HEDLEY_DMC_VERSION)
+    #undef JSON_HEDLEY_DMC_VERSION
+#endif
+#if defined(__DMC__)
+    #define JSON_HEDLEY_DMC_VERSION JSON_HEDLEY_VERSION_ENCODE(__DMC__ >> 8, (__DMC__ >> 4) & 0xf, __DMC__ & 0xf)
+#endif
+
+#if defined(JSON_HEDLEY_DMC_VERSION_CHECK)
+    #undef JSON_HEDLEY_DMC_VERSION_CHECK
+#endif
+#if defined(JSON_HEDLEY_DMC_VERSION)
+    #define JSON_HEDLEY_DMC_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_DMC_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch))
+#else
+    #define JSON_HEDLEY_DMC_VERSION_CHECK(major,minor,patch) (0)
+#endif
+
+#if defined(JSON_HEDLEY_COMPCERT_VERSION)
+    #undef JSON_HEDLEY_COMPCERT_VERSION
+#endif
+#if defined(__COMPCERT_VERSION__)
+    #define JSON_HEDLEY_COMPCERT_VERSION JSON_HEDLEY_VERSION_ENCODE(__COMPCERT_VERSION__ / 10000, (__COMPCERT_VERSION__ / 100) % 100, __COMPCERT_VERSION__ % 100)
+#endif
+
+#if defined(JSON_HEDLEY_COMPCERT_VERSION_CHECK)
+    #undef JSON_HEDLEY_COMPCERT_VERSION_CHECK
+#endif
+#if defined(JSON_HEDLEY_COMPCERT_VERSION)
+    #define JSON_HEDLEY_COMPCERT_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_COMPCERT_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch))
+#else
+    #define JSON_HEDLEY_COMPCERT_VERSION_CHECK(major,minor,patch) (0)
+#endif
+
+#if defined(JSON_HEDLEY_PELLES_VERSION)
+    #undef JSON_HEDLEY_PELLES_VERSION
+#endif
+#if defined(__POCC__)
+    #define JSON_HEDLEY_PELLES_VERSION JSON_HEDLEY_VERSION_ENCODE(__POCC__ / 100, __POCC__ % 100, 0)
+#endif
+
+#if defined(JSON_HEDLEY_PELLES_VERSION_CHECK)
+    #undef JSON_HEDLEY_PELLES_VERSION_CHECK
+#endif
+#if defined(JSON_HEDLEY_PELLES_VERSION)
+    #define JSON_HEDLEY_PELLES_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_PELLES_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch))
+#else
+    #define JSON_HEDLEY_PELLES_VERSION_CHECK(major,minor,patch) (0)
+#endif
+
+#if defined(JSON_HEDLEY_GCC_VERSION)
+    #undef JSON_HEDLEY_GCC_VERSION
+#endif
+#if \
+    defined(JSON_HEDLEY_GNUC_VERSION) && \
+    !defined(__clang__) && \
+    !defined(JSON_HEDLEY_INTEL_VERSION) && \
+    !defined(JSON_HEDLEY_PGI_VERSION) && \
+    !defined(JSON_HEDLEY_ARM_VERSION) && \
+    !defined(JSON_HEDLEY_TI_VERSION) && \
+    !defined(JSON_HEDLEY_TI_ARMCL_VERSION) && \
+    !defined(JSON_HEDLEY_TI_CL430_VERSION) && \
+    !defined(JSON_HEDLEY_TI_CL2000_VERSION) && \
+    !defined(JSON_HEDLEY_TI_CL6X_VERSION) && \
+    !defined(JSON_HEDLEY_TI_CL7X_VERSION) && \
+    !defined(JSON_HEDLEY_TI_CLPRU_VERSION) && \
+    !defined(__COMPCERT__)
+    #define JSON_HEDLEY_GCC_VERSION JSON_HEDLEY_GNUC_VERSION
+#endif
+
+#if defined(JSON_HEDLEY_GCC_VERSION_CHECK)
+    #undef JSON_HEDLEY_GCC_VERSION_CHECK
+#endif
+#if defined(JSON_HEDLEY_GCC_VERSION)
+    #define JSON_HEDLEY_GCC_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_GCC_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch))
+#else
+    #define JSON_HEDLEY_GCC_VERSION_CHECK(major,minor,patch) (0)
+#endif
+
+#if defined(JSON_HEDLEY_HAS_ATTRIBUTE)
+    #undef JSON_HEDLEY_HAS_ATTRIBUTE
+#endif
+#if defined(__has_attribute)
+    #define JSON_HEDLEY_HAS_ATTRIBUTE(attribute) __has_attribute(attribute)
+#else
+    #define JSON_HEDLEY_HAS_ATTRIBUTE(attribute) (0)
+#endif
+
+#if defined(JSON_HEDLEY_GNUC_HAS_ATTRIBUTE)
+    #undef JSON_HEDLEY_GNUC_HAS_ATTRIBUTE
+#endif
+#if defined(__has_attribute)
+    #define JSON_HEDLEY_GNUC_HAS_ATTRIBUTE(attribute,major,minor,patch) __has_attribute(attribute)
+#else
+    #define JSON_HEDLEY_GNUC_HAS_ATTRIBUTE(attribute,major,minor,patch) JSON_HEDLEY_GNUC_VERSION_CHECK(major,minor,patch)
+#endif
+
+#if defined(JSON_HEDLEY_GCC_HAS_ATTRIBUTE)
+    #undef JSON_HEDLEY_GCC_HAS_ATTRIBUTE
+#endif
+#if defined(__has_attribute)
+    #define JSON_HEDLEY_GCC_HAS_ATTRIBUTE(attribute,major,minor,patch) __has_attribute(attribute)
+#else
+    #define JSON_HEDLEY_GCC_HAS_ATTRIBUTE(attribute,major,minor,patch) JSON_HEDLEY_GCC_VERSION_CHECK(major,minor,patch)
+#endif
+
+#ifdef JSON_HEDLEY_HAS_CPP_ATTRIBUTE /* discard any earlier definition */
+#  undef JSON_HEDLEY_HAS_CPP_ATTRIBUTE
+#endif
+#if /* C++ only, probe present, and not a SunPro release older than 5.15.0 */ \
+    defined(__cplusplus) && \
+    defined(__has_cpp_attribute) && \
+    (!defined(JSON_HEDLEY_SUNPRO_VERSION) || JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,15,0))
+#  define JSON_HEDLEY_HAS_CPP_ATTRIBUTE(attribute) __has_cpp_attribute(attribute)
+#else /* otherwise report the attribute as unsupported */
+#  define JSON_HEDLEY_HAS_CPP_ATTRIBUTE(attribute) (0)
+#endif
+
+#ifdef JSON_HEDLEY_HAS_CPP_ATTRIBUTE_NS /* discard any earlier definition */
+#  undef JSON_HEDLEY_HAS_CPP_ATTRIBUTE_NS
+#endif
+#if !(defined(__cplusplus) && defined(__has_cpp_attribute)) /* not C++ or no probe */
+#  define JSON_HEDLEY_HAS_CPP_ATTRIBUTE_NS(ns,attribute) (0)
+#elif /* compilers that are not excluded below get the ns::attribute query */ \
+    !defined(JSON_HEDLEY_PGI_VERSION) && \
+    !defined(JSON_HEDLEY_IAR_VERSION) && \
+    (!defined(JSON_HEDLEY_SUNPRO_VERSION) || JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,15,0)) && \
+    (!defined(JSON_HEDLEY_MSVC_VERSION) || JSON_HEDLEY_MSVC_VERSION_CHECK(19,20,0))
+#  define JSON_HEDLEY_HAS_CPP_ATTRIBUTE_NS(ns,attribute) JSON_HEDLEY_HAS_CPP_ATTRIBUTE(ns::attribute)
+#else /* PGI, IAR, SunPro before 5.15.0, MSVC before 19.20.0 */
+#  define JSON_HEDLEY_HAS_CPP_ATTRIBUTE_NS(ns,attribute) (0)
+#endif
+
+#ifdef JSON_HEDLEY_GNUC_HAS_CPP_ATTRIBUTE /* discard any earlier definition */
+#  undef JSON_HEDLEY_GNUC_HAS_CPP_ATTRIBUTE
+#endif
+#if defined(__cplusplus) && defined(__has_cpp_attribute) /* probe available: version triple ignored */
+#  define JSON_HEDLEY_GNUC_HAS_CPP_ATTRIBUTE(attribute,major,minor,patch) __has_cpp_attribute(attribute)
+#else /* fall back to a GNUC version comparison */
+#  define JSON_HEDLEY_GNUC_HAS_CPP_ATTRIBUTE(attribute,major,minor,patch) JSON_HEDLEY_GNUC_VERSION_CHECK(major,minor,patch)
+#endif
+
+#ifdef JSON_HEDLEY_GCC_HAS_CPP_ATTRIBUTE /* discard any earlier definition */
+#  undef JSON_HEDLEY_GCC_HAS_CPP_ATTRIBUTE
+#endif
+#if defined(__cplusplus) && defined(__has_cpp_attribute) /* probe available: version triple ignored */
+#  define JSON_HEDLEY_GCC_HAS_CPP_ATTRIBUTE(attribute,major,minor,patch) __has_cpp_attribute(attribute)
+#else /* fall back to a GCC version comparison */
+#  define JSON_HEDLEY_GCC_HAS_CPP_ATTRIBUTE(attribute,major,minor,patch) JSON_HEDLEY_GCC_VERSION_CHECK(major,minor,patch)
+#endif
+
+#ifdef JSON_HEDLEY_HAS_BUILTIN /* discard any earlier definition */
+#  undef JSON_HEDLEY_HAS_BUILTIN
+#endif
+#ifdef __has_builtin /* forward to the compiler-provided probe */
+#  define JSON_HEDLEY_HAS_BUILTIN(builtin) __has_builtin(builtin)
+#else /* no probe available: report the builtin as missing */
+#  define JSON_HEDLEY_HAS_BUILTIN(builtin) (0)
+#endif /* __has_builtin */
+
+#ifdef JSON_HEDLEY_GNUC_HAS_BUILTIN /* discard any earlier definition */
+#  undef JSON_HEDLEY_GNUC_HAS_BUILTIN
+#endif
+#ifdef __has_builtin /* probe available: the version triple is ignored */
+#  define JSON_HEDLEY_GNUC_HAS_BUILTIN(builtin,major,minor,patch) __has_builtin(builtin)
+#else /* no probe: fall back to a GNUC version comparison */
+#  define JSON_HEDLEY_GNUC_HAS_BUILTIN(builtin,major,minor,patch) JSON_HEDLEY_GNUC_VERSION_CHECK(major,minor,patch)
+#endif /* __has_builtin */
+
+#ifdef JSON_HEDLEY_GCC_HAS_BUILTIN /* discard any earlier definition */
+#  undef JSON_HEDLEY_GCC_HAS_BUILTIN
+#endif
+#ifdef __has_builtin /* probe available: the version triple is ignored */
+#  define JSON_HEDLEY_GCC_HAS_BUILTIN(builtin,major,minor,patch) __has_builtin(builtin)
+#else /* no probe: fall back to a GCC version comparison */
+#  define JSON_HEDLEY_GCC_HAS_BUILTIN(builtin,major,minor,patch) JSON_HEDLEY_GCC_VERSION_CHECK(major,minor,patch)
+#endif /* __has_builtin */
+
+#ifdef JSON_HEDLEY_HAS_FEATURE /* discard any earlier definition */
+#  undef JSON_HEDLEY_HAS_FEATURE
+#endif
+#ifdef __has_feature /* forward to the compiler-provided probe */
+#  define JSON_HEDLEY_HAS_FEATURE(feature) __has_feature(feature)
+#else /* no probe available: report the feature as missing */
+#  define JSON_HEDLEY_HAS_FEATURE(feature) (0)
+#endif /* __has_feature */
+
+#ifdef JSON_HEDLEY_GNUC_HAS_FEATURE /* discard any earlier definition */
+#  undef JSON_HEDLEY_GNUC_HAS_FEATURE
+#endif
+#ifdef __has_feature /* probe available: the version triple is ignored */
+#  define JSON_HEDLEY_GNUC_HAS_FEATURE(feature,major,minor,patch) __has_feature(feature)
+#else /* no probe: fall back to a GNUC version comparison */
+#  define JSON_HEDLEY_GNUC_HAS_FEATURE(feature,major,minor,patch) JSON_HEDLEY_GNUC_VERSION_CHECK(major,minor,patch)
+#endif /* __has_feature */
+
+#ifdef JSON_HEDLEY_GCC_HAS_FEATURE /* discard any earlier definition */
+#  undef JSON_HEDLEY_GCC_HAS_FEATURE
+#endif
+#ifdef __has_feature /* probe available: the version triple is ignored */
+#  define JSON_HEDLEY_GCC_HAS_FEATURE(feature,major,minor,patch) __has_feature(feature)
+#else /* no probe: fall back to a GCC version comparison */
+#  define JSON_HEDLEY_GCC_HAS_FEATURE(feature,major,minor,patch) JSON_HEDLEY_GCC_VERSION_CHECK(major,minor,patch)
+#endif /* __has_feature */
+
+#ifdef JSON_HEDLEY_HAS_EXTENSION /* discard any earlier definition */
+#  undef JSON_HEDLEY_HAS_EXTENSION
+#endif
+#ifdef __has_extension /* forward to the compiler-provided probe */
+#  define JSON_HEDLEY_HAS_EXTENSION(extension) __has_extension(extension)
+#else /* no probe available: report the extension as missing */
+#  define JSON_HEDLEY_HAS_EXTENSION(extension) (0)
+#endif /* __has_extension */
+
+#ifdef JSON_HEDLEY_GNUC_HAS_EXTENSION /* discard any earlier definition */
+#  undef JSON_HEDLEY_GNUC_HAS_EXTENSION
+#endif
+#ifdef __has_extension /* probe available: the version triple is ignored */
+#  define JSON_HEDLEY_GNUC_HAS_EXTENSION(extension,major,minor,patch) __has_extension(extension)
+#else /* no probe: fall back to a GNUC version comparison */
+#  define JSON_HEDLEY_GNUC_HAS_EXTENSION(extension,major,minor,patch) JSON_HEDLEY_GNUC_VERSION_CHECK(major,minor,patch)
+#endif /* __has_extension */
+
+#ifdef JSON_HEDLEY_GCC_HAS_EXTENSION /* discard any earlier definition */
+#  undef JSON_HEDLEY_GCC_HAS_EXTENSION
+#endif
+#ifdef __has_extension /* probe available: the version triple is ignored */
+#  define JSON_HEDLEY_GCC_HAS_EXTENSION(extension,major,minor,patch) __has_extension(extension)
+#else /* no probe: fall back to a GCC version comparison */
+#  define JSON_HEDLEY_GCC_HAS_EXTENSION(extension,major,minor,patch) JSON_HEDLEY_GCC_VERSION_CHECK(major,minor,patch)
+#endif /* __has_extension */
+
+#ifdef JSON_HEDLEY_HAS_DECLSPEC_ATTRIBUTE /* discard any earlier definition */
+#  undef JSON_HEDLEY_HAS_DECLSPEC_ATTRIBUTE
+#endif
+#ifdef __has_declspec_attribute /* forward to the compiler-provided probe */
+#  define JSON_HEDLEY_HAS_DECLSPEC_ATTRIBUTE(attribute) __has_declspec_attribute(attribute)
+#else /* no probe available: report the attribute as unsupported */
+#  define JSON_HEDLEY_HAS_DECLSPEC_ATTRIBUTE(attribute) (0)
+#endif /* __has_declspec_attribute */
+
+#ifdef JSON_HEDLEY_GNUC_HAS_DECLSPEC_ATTRIBUTE /* discard any earlier definition */
+#  undef JSON_HEDLEY_GNUC_HAS_DECLSPEC_ATTRIBUTE
+#endif
+#ifdef __has_declspec_attribute /* probe available: the version triple is ignored */
+#  define JSON_HEDLEY_GNUC_HAS_DECLSPEC_ATTRIBUTE(attribute,major,minor,patch) __has_declspec_attribute(attribute)
+#else /* no probe: fall back to a GNUC version comparison */
+#  define JSON_HEDLEY_GNUC_HAS_DECLSPEC_ATTRIBUTE(attribute,major,minor,patch) JSON_HEDLEY_GNUC_VERSION_CHECK(major,minor,patch)
+#endif /* __has_declspec_attribute */
+
+#ifdef JSON_HEDLEY_GCC_HAS_DECLSPEC_ATTRIBUTE /* discard any earlier definition */
+#  undef JSON_HEDLEY_GCC_HAS_DECLSPEC_ATTRIBUTE
+#endif
+#ifdef __has_declspec_attribute /* probe available: the version triple is ignored */
+#  define JSON_HEDLEY_GCC_HAS_DECLSPEC_ATTRIBUTE(attribute,major,minor,patch) __has_declspec_attribute(attribute)
+#else /* no probe: fall back to a GCC version comparison */
+#  define JSON_HEDLEY_GCC_HAS_DECLSPEC_ATTRIBUTE(attribute,major,minor,patch) JSON_HEDLEY_GCC_VERSION_CHECK(major,minor,patch)
+#endif /* __has_declspec_attribute */
+
+#ifdef JSON_HEDLEY_HAS_WARNING /* discard any earlier definition */
+#  undef JSON_HEDLEY_HAS_WARNING
+#endif
+#ifdef __has_warning /* forward to the compiler-provided probe */
+#  define JSON_HEDLEY_HAS_WARNING(warning) __has_warning(warning)
+#else /* no probe available: report the warning flag as unknown */
+#  define JSON_HEDLEY_HAS_WARNING(warning) (0)
+#endif /* __has_warning */
+
+#ifdef JSON_HEDLEY_GNUC_HAS_WARNING /* discard any earlier definition */
+#  undef JSON_HEDLEY_GNUC_HAS_WARNING
+#endif
+#ifdef __has_warning /* probe available: the version triple is ignored */
+#  define JSON_HEDLEY_GNUC_HAS_WARNING(warning,major,minor,patch) __has_warning(warning)
+#else /* no probe: fall back to a GNUC version comparison */
+#  define JSON_HEDLEY_GNUC_HAS_WARNING(warning,major,minor,patch) JSON_HEDLEY_GNUC_VERSION_CHECK(major,minor,patch)
+#endif /* __has_warning */
+
+#ifdef JSON_HEDLEY_GCC_HAS_WARNING /* discard any earlier definition */
+#  undef JSON_HEDLEY_GCC_HAS_WARNING
+#endif
+#ifdef __has_warning /* probe available: the version triple is ignored */
+#  define JSON_HEDLEY_GCC_HAS_WARNING(warning,major,minor,patch) __has_warning(warning)
+#else /* no probe: fall back to a GCC version comparison */
+#  define JSON_HEDLEY_GCC_HAS_WARNING(warning,major,minor,patch) JSON_HEDLEY_GCC_VERSION_CHECK(major,minor,patch)
+#endif /* __has_warning */
+
+/* JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_(xpr) emits xpr with the clang C++98-compat
+   warnings silenced around it.  HEDLEY INTERNAL USE ONLY; the API may change without notice. */
+#ifdef JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_
+#  undef JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_
+#endif
+#ifdef __cplusplus
+#  if JSON_HEDLEY_HAS_WARNING("-Wc++98-compat")
+#    if JSON_HEDLEY_HAS_WARNING("-Wc++17-extensions") /* silence both warning groups */
+#      define JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_(xpr) \
+         JSON_HEDLEY_DIAGNOSTIC_PUSH \
+         _Pragma("clang diagnostic ignored \"-Wc++98-compat\"") \
+         _Pragma("clang diagnostic ignored \"-Wc++17-extensions\"") \
+         xpr \
+         JSON_HEDLEY_DIAGNOSTIC_POP
+#    else /* only -Wc++98-compat is known to this compiler */
+#      define JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_(xpr) \
+         JSON_HEDLEY_DIAGNOSTIC_PUSH \
+         _Pragma("clang diagnostic ignored \"-Wc++98-compat\"") \
+         xpr \
+         JSON_HEDLEY_DIAGNOSTIC_POP
+#    endif
+#  endif
+#endif /* __cplusplus */
+#ifndef JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_ /* nothing to silence: pass the argument through */
+#  define JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_(x) x
+#endif
+
+#ifdef JSON_HEDLEY_CONST_CAST /* discard any earlier definition */
+#  undef JSON_HEDLEY_CONST_CAST
+#endif
+#ifdef __cplusplus /* C++: use the dedicated cast operator */
+#  define JSON_HEDLEY_CONST_CAST(T, expr) (const_cast<T>(expr))
+#elif /* C: C-style cast inside a statement expression with cast-qual warnings disabled */ \
+  JSON_HEDLEY_HAS_WARNING("-Wcast-qual") || \
+  JSON_HEDLEY_GCC_VERSION_CHECK(4,6,0) || \
+  JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0)
+#  define JSON_HEDLEY_CONST_CAST(T, expr) (__extension__ ({ \
+        JSON_HEDLEY_DIAGNOSTIC_PUSH \
+        JSON_HEDLEY_DIAGNOSTIC_DISABLE_CAST_QUAL \
+        ((T) (expr)); \
+        JSON_HEDLEY_DIAGNOSTIC_POP \
+    }))
+#else /* C without the checks above: plain C-style cast */
+#  define JSON_HEDLEY_CONST_CAST(T, expr) ((T) (expr))
+#endif
+
+#ifdef JSON_HEDLEY_REINTERPRET_CAST /* discard any earlier definition */
+#  undef JSON_HEDLEY_REINTERPRET_CAST
+#endif
+#ifdef __cplusplus /* C++: use the dedicated cast operator */
+#  define JSON_HEDLEY_REINTERPRET_CAST(T, expr) (reinterpret_cast<T>(expr))
+#else /* C: plain C-style cast */
+#  define JSON_HEDLEY_REINTERPRET_CAST(T, expr) ((T) (expr))
+#endif /* __cplusplus */
+
+#ifdef JSON_HEDLEY_STATIC_CAST /* discard any earlier definition */
+#  undef JSON_HEDLEY_STATIC_CAST
+#endif
+#ifdef __cplusplus /* C++: use the dedicated cast operator */
+#  define JSON_HEDLEY_STATIC_CAST(T, expr) (static_cast<T>(expr))
+#else /* C: plain C-style cast */
+#  define JSON_HEDLEY_STATIC_CAST(T, expr) ((T) (expr))
+#endif /* __cplusplus */
+
+#if defined(JSON_HEDLEY_CPP_CAST) /* JSON_HEDLEY_CPP_CAST(T, expr): C-style cast in C++ without old-style-cast diagnostics; expands to (expr) in C */
+    #undef JSON_HEDLEY_CPP_CAST
+#endif
+#if defined(__cplusplus)
+#  if JSON_HEDLEY_HAS_WARNING("-Wold-style-cast") /* clang: silence -Wold-style-cast around the cast */
+#    define JSON_HEDLEY_CPP_CAST(T, expr) \
+    JSON_HEDLEY_DIAGNOSTIC_PUSH \
+    _Pragma("clang diagnostic ignored \"-Wold-style-cast\"") \
+    ((T) (expr)) \
+    JSON_HEDLEY_DIAGNOSTIC_POP
+#  elif JSON_HEDLEY_IAR_VERSION_CHECK(8,3,0) /* IAR: suppress Pe137 around the cast; the last body line must NOT end in a backslash */
+#    define JSON_HEDLEY_CPP_CAST(T, expr) \
+    JSON_HEDLEY_DIAGNOSTIC_PUSH \
+    _Pragma("diag_suppress=Pe137") ((T) (expr)) \
+    JSON_HEDLEY_DIAGNOSTIC_POP
+#  else /* any other C++ compiler: plain C-style cast */
+#    define JSON_HEDLEY_CPP_CAST(T, expr) ((T) (expr))
+#  endif
+#else /* C: no cast is applied */
+#  define JSON_HEDLEY_CPP_CAST(T, expr) (expr)
+#endif
+
+#if /* JSON_HEDLEY_PRAGMA(value): emit a pragma from inside a macro; C99 or a compiler listed here uses the _Pragma operator */ \
+    (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)) || \
+    defined(__clang__) || \
+    JSON_HEDLEY_GCC_VERSION_CHECK(3,0,0) || \
+    JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \
+    JSON_HEDLEY_IAR_VERSION_CHECK(8,0,0) || \
+    JSON_HEDLEY_PGI_VERSION_CHECK(18,4,0) || \
+    JSON_HEDLEY_ARM_VERSION_CHECK(4,1,0) || \
+    JSON_HEDLEY_TI_VERSION_CHECK(15,12,0) || \
+    JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(4,7,0) || \
+    JSON_HEDLEY_TI_CL430_VERSION_CHECK(2,0,1) || \
+    JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,1,0) || \
+    JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,0,0) || \
+    JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \
+    JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \
+    JSON_HEDLEY_CRAY_VERSION_CHECK(5,0,0) || \
+    JSON_HEDLEY_TINYC_VERSION_CHECK(0,9,17) || \
+    JSON_HEDLEY_SUNPRO_VERSION_CHECK(8,0,0) || \
+    (JSON_HEDLEY_IBM_VERSION_CHECK(10,1,0) && defined(__C99_PRAGMA_OPERATOR))
+    #define JSON_HEDLEY_PRAGMA(value) _Pragma(#value) /* the argument is stringized for _Pragma */
+#elif JSON_HEDLEY_MSVC_VERSION_CHECK(15,0,0) /* MSVC: __pragma keyword takes the tokens unquoted */
+    #define JSON_HEDLEY_PRAGMA(value) __pragma(value)
+#else /* no pragma operator known: expands to nothing */
+    #define JSON_HEDLEY_PRAGMA(value)
+#endif
+
+#if defined(JSON_HEDLEY_DIAGNOSTIC_PUSH) /* JSON_HEDLEY_DIAGNOSTIC_PUSH / _POP: per-compiler pragmas that save and restore the diagnostic state */
+    #undef JSON_HEDLEY_DIAGNOSTIC_PUSH
+#endif
+#if defined(JSON_HEDLEY_DIAGNOSTIC_POP)
+    #undef JSON_HEDLEY_DIAGNOSTIC_POP
+#endif
+#if defined(__clang__) /* first matching compiler wins; clang is tested before the Intel and GCC checks */
+    #define JSON_HEDLEY_DIAGNOSTIC_PUSH _Pragma("clang diagnostic push")
+    #define JSON_HEDLEY_DIAGNOSTIC_POP _Pragma("clang diagnostic pop")
+#elif JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0)
+    #define JSON_HEDLEY_DIAGNOSTIC_PUSH _Pragma("warning(push)")
+    #define JSON_HEDLEY_DIAGNOSTIC_POP _Pragma("warning(pop)")
+#elif JSON_HEDLEY_GCC_VERSION_CHECK(4,6,0)
+    #define JSON_HEDLEY_DIAGNOSTIC_PUSH _Pragma("GCC diagnostic push")
+    #define JSON_HEDLEY_DIAGNOSTIC_POP _Pragma("GCC diagnostic pop")
+#elif JSON_HEDLEY_MSVC_VERSION_CHECK(15,0,0)
+    #define JSON_HEDLEY_DIAGNOSTIC_PUSH __pragma(warning(push))
+    #define JSON_HEDLEY_DIAGNOSTIC_POP __pragma(warning(pop))
+#elif JSON_HEDLEY_ARM_VERSION_CHECK(5,6,0)
+    #define JSON_HEDLEY_DIAGNOSTIC_PUSH _Pragma("push")
+    #define JSON_HEDLEY_DIAGNOSTIC_POP _Pragma("pop")
+#elif /* TI compilers share the diag_push / diag_pop spelling */ \
+    JSON_HEDLEY_TI_VERSION_CHECK(15,12,0) || \
+    JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \
+    JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,4,0) || \
+    JSON_HEDLEY_TI_CL6X_VERSION_CHECK(8,1,0) || \
+    JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \
+    JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0)
+    #define JSON_HEDLEY_DIAGNOSTIC_PUSH _Pragma("diag_push")
+    #define JSON_HEDLEY_DIAGNOSTIC_POP _Pragma("diag_pop")
+#elif JSON_HEDLEY_PELLES_VERSION_CHECK(2,90,0)
+    #define JSON_HEDLEY_DIAGNOSTIC_PUSH _Pragma("warning(push)")
+    #define JSON_HEDLEY_DIAGNOSTIC_POP _Pragma("warning(pop)")
+#else /* unknown compiler: both macros expand to nothing */
+    #define JSON_HEDLEY_DIAGNOSTIC_PUSH
+    #define JSON_HEDLEY_DIAGNOSTIC_POP
+#endif
+
+#if defined(JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED) /* JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED: per-compiler pragma that turns off deprecation diagnostics */
+    #undef JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED
+#endif
+#if JSON_HEDLEY_HAS_WARNING("-Wdeprecated-declarations") /* compilers providing __has_warning (clang pragma spelling) */
+    #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("clang diagnostic ignored \"-Wdeprecated-declarations\"")
+#elif JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0)
+    #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("warning(disable:1478 1786)")
+#elif JSON_HEDLEY_PGI_VERSION_CHECK(17,10,0)
+    #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("diag_suppress 1215,1444")
+#elif JSON_HEDLEY_GCC_VERSION_CHECK(4,3,0)
+    #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("GCC diagnostic ignored \"-Wdeprecated-declarations\"")
+#elif JSON_HEDLEY_MSVC_VERSION_CHECK(15,0,0)
+    #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED __pragma(warning(disable:4996))
+#elif /* TI compilers: older releases are accepted only with __TI_GNU_ATTRIBUTE_SUPPORT__ defined */ \
+    JSON_HEDLEY_TI_VERSION_CHECK(15,12,0) || \
+    (JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(4,8,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
+    JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \
+    (JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
+    JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \
+    (JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
+    JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \
+    (JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
+    JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) || \
+    JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \
+    JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0)
+    #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("diag_suppress 1291,1718")
+#elif JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,13,0) && !defined(__cplusplus) /* SunPro uses different message names in C and C++ */
+    #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("error_messages(off,E_DEPRECATED_ATT,E_DEPRECATED_ATT_MESS)")
+#elif JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,13,0) && defined(__cplusplus)
+    #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("error_messages(off,symdeprecated,symdeprecated2)")
+#elif JSON_HEDLEY_IAR_VERSION_CHECK(8,0,0)
+    #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("diag_suppress=Pe1444,Pe1215")
+#elif JSON_HEDLEY_PELLES_VERSION_CHECK(2,90,0)
+    #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("warn(disable:2241)")
+#else /* unknown compiler: expands to nothing */
+    #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED
+#endif
+
+#if defined(JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS) /* JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS: per-compiler pragma that turns off unknown-pragma diagnostics */
+    #undef JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS
+#endif
+#if JSON_HEDLEY_HAS_WARNING("-Wunknown-pragmas") /* compilers providing __has_warning (clang pragma spelling) */
+    #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS _Pragma("clang diagnostic ignored \"-Wunknown-pragmas\"")
+#elif JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0)
+    #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS _Pragma("warning(disable:161)")
+#elif JSON_HEDLEY_PGI_VERSION_CHECK(17,10,0)
+    #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS _Pragma("diag_suppress 1675")
+#elif JSON_HEDLEY_GCC_VERSION_CHECK(4,3,0)
+    #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS _Pragma("GCC diagnostic ignored \"-Wunknown-pragmas\"")
+#elif JSON_HEDLEY_MSVC_VERSION_CHECK(15,0,0)
+    #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS __pragma(warning(disable:4068))
+#elif /* TI compilers */ \
+    JSON_HEDLEY_TI_VERSION_CHECK(16,9,0) || \
+    JSON_HEDLEY_TI_CL6X_VERSION_CHECK(8,0,0) || \
+    JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \
+    JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,3,0)
+    #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS _Pragma("diag_suppress 163")
+#elif JSON_HEDLEY_TI_CL6X_VERSION_CHECK(8,0,0) /* NOTE(review): unreachable, the same check is already part of the preceding #elif */
+    #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS _Pragma("diag_suppress 163")
+#elif JSON_HEDLEY_IAR_VERSION_CHECK(8,0,0)
+    #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS _Pragma("diag_suppress=Pe161")
+#else /* unknown compiler: expands to nothing */
+    #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS
+#endif
+
+#if defined(JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES) /* JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES: per-compiler pragma that turns off unknown-attribute diagnostics */
+    #undef JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES
+#endif
+#if JSON_HEDLEY_HAS_WARNING("-Wunknown-attributes") /* compilers providing __has_warning (clang pragma spelling) */
+    #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES _Pragma("clang diagnostic ignored \"-Wunknown-attributes\"")
+#elif JSON_HEDLEY_GCC_VERSION_CHECK(4,6,0) /* GCC reports ignored/unknown attributes under -Wattributes */
+    #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES _Pragma("GCC diagnostic ignored \"-Wattributes\"")
+#elif JSON_HEDLEY_INTEL_VERSION_CHECK(17,0,0)
+    #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES _Pragma("warning(disable:1292)")
+#elif JSON_HEDLEY_MSVC_VERSION_CHECK(19,0,0)
+    #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES __pragma(warning(disable:5030))
+#elif JSON_HEDLEY_PGI_VERSION_CHECK(17,10,0)
+    #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES _Pragma("diag_suppress 1097")
+#elif JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,14,0) && defined(__cplusplus)
+    #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES _Pragma("error_messages(off,attrskipunsup)")
+#elif /* TI compilers */ \
+    JSON_HEDLEY_TI_VERSION_CHECK(18,1,0) || \
+    JSON_HEDLEY_TI_CL6X_VERSION_CHECK(8,3,0) || \
+    JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0)
+    #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES _Pragma("diag_suppress 1173")
+#elif JSON_HEDLEY_IAR_VERSION_CHECK(8,0,0)
+    #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES _Pragma("diag_suppress=Pe1097")
+#else /* unknown compiler: expands to nothing */
+    #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES
+#endif
+
+#ifdef JSON_HEDLEY_DIAGNOSTIC_DISABLE_CAST_QUAL /* discard any earlier definition */
+#  undef JSON_HEDLEY_DIAGNOSTIC_DISABLE_CAST_QUAL
+#endif
+#if JSON_HEDLEY_HAS_WARNING("-Wcast-qual") /* compilers providing __has_warning (clang pragma spelling) */
+#  define JSON_HEDLEY_DIAGNOSTIC_DISABLE_CAST_QUAL _Pragma("clang diagnostic ignored \"-Wcast-qual\"")
+#elif JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0)
+#  define JSON_HEDLEY_DIAGNOSTIC_DISABLE_CAST_QUAL _Pragma("warning(disable:2203 2331)")
+#elif JSON_HEDLEY_GCC_VERSION_CHECK(3,0,0)
+#  define JSON_HEDLEY_DIAGNOSTIC_DISABLE_CAST_QUAL _Pragma("GCC diagnostic ignored \"-Wcast-qual\"")
+#else /* unknown compiler: expands to nothing */
+#  define JSON_HEDLEY_DIAGNOSTIC_DISABLE_CAST_QUAL
+#endif
+
+#if defined(JSON_HEDLEY_DEPRECATED) /* JSON_HEDLEY_DEPRECATED(since) / _FOR(since, replacement): mark a declaration deprecated, with a message where the compiler accepts one */
+    #undef JSON_HEDLEY_DEPRECATED
+#endif
+#if defined(JSON_HEDLEY_DEPRECATED_FOR)
+    #undef JSON_HEDLEY_DEPRECATED_FOR
+#endif
+#if JSON_HEDLEY_MSVC_VERSION_CHECK(14,0,0) /* MSVC: __declspec with a message built by stringizing the arguments */
+    #define JSON_HEDLEY_DEPRECATED(since) __declspec(deprecated("Since " # since))
+    #define JSON_HEDLEY_DEPRECATED_FOR(since, replacement) __declspec(deprecated("Since " #since "; use " #replacement))
+#elif defined(__cplusplus) && (__cplusplus >= 201402L) /* C++14: standard [[deprecated]] attribute */
+    #define JSON_HEDLEY_DEPRECATED(since) JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[deprecated("Since " #since)]])
+    #define JSON_HEDLEY_DEPRECATED_FOR(since, replacement) JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[deprecated("Since " #since "; use " #replacement)]])
+#elif /* GNU-style attribute with a message */ \
+    JSON_HEDLEY_HAS_EXTENSION(attribute_deprecated_with_message) || \
+    JSON_HEDLEY_GCC_VERSION_CHECK(4,5,0) || \
+    JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \
+    JSON_HEDLEY_ARM_VERSION_CHECK(5,6,0) || \
+    JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,13,0) || \
+    JSON_HEDLEY_PGI_VERSION_CHECK(17,10,0) || \
+    JSON_HEDLEY_TI_VERSION_CHECK(18,1,0) || \
+    JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(18,1,0) || \
+    JSON_HEDLEY_TI_CL6X_VERSION_CHECK(8,3,0) || \
+    JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \
+    JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,3,0)
+    #define JSON_HEDLEY_DEPRECATED(since) __attribute__((__deprecated__("Since " #since)))
+    #define JSON_HEDLEY_DEPRECATED_FOR(since, replacement) __attribute__((__deprecated__("Since " #since "; use " #replacement)))
+#elif /* GNU-style attribute without a message: the arguments are dropped */ \
+    JSON_HEDLEY_HAS_ATTRIBUTE(deprecated) || \
+    JSON_HEDLEY_GCC_VERSION_CHECK(3,1,0) || \
+    JSON_HEDLEY_ARM_VERSION_CHECK(4,1,0) || \
+    JSON_HEDLEY_TI_VERSION_CHECK(15,12,0) || \
+    (JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(4,8,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
+    JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \
+    (JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
+    JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \
+    (JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
+    JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \
+    (JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
+    JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) || \
+    JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \
+    JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0)
+    #define JSON_HEDLEY_DEPRECATED(since) __attribute__((__deprecated__))
+    #define JSON_HEDLEY_DEPRECATED_FOR(since, replacement) __attribute__((__deprecated__))
+#elif /* __declspec without a message */ \
+    JSON_HEDLEY_MSVC_VERSION_CHECK(13,10,0) || \
+    JSON_HEDLEY_PELLES_VERSION_CHECK(6,50,0)
+    #define JSON_HEDLEY_DEPRECATED(since) __declspec(deprecated)
+    #define JSON_HEDLEY_DEPRECATED_FOR(since, replacement) __declspec(deprecated)
+#elif JSON_HEDLEY_IAR_VERSION_CHECK(8,0,0) /* IAR: pragma form */
+    #define JSON_HEDLEY_DEPRECATED(since) _Pragma("deprecated")
+    #define JSON_HEDLEY_DEPRECATED_FOR(since, replacement) _Pragma("deprecated")
+#else /* unknown compiler: both macros expand to nothing */
+    #define JSON_HEDLEY_DEPRECATED(since)
+    #define JSON_HEDLEY_DEPRECATED_FOR(since, replacement)
+#endif
+
+#ifdef JSON_HEDLEY_UNAVAILABLE /* discard any earlier definition */
+#  undef JSON_HEDLEY_UNAVAILABLE
+#endif
+#if /* compilers accepting the GNU warning attribute */ \
+    JSON_HEDLEY_HAS_ATTRIBUTE(warning) || \
+    JSON_HEDLEY_GCC_VERSION_CHECK(4,3,0) || \
+    JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0)
+#  define JSON_HEDLEY_UNAVAILABLE(available_since) __attribute__((__warning__("Not available until " #available_since)))
+#else /* otherwise expands to nothing */
+#  define JSON_HEDLEY_UNAVAILABLE(available_since)
+#endif
+
+#if defined(JSON_HEDLEY_WARN_UNUSED_RESULT) /* JSON_HEDLEY_WARN_UNUSED_RESULT / _MSG(msg): ask the compiler to diagnose a discarded return value */
+    #undef JSON_HEDLEY_WARN_UNUSED_RESULT
+#endif
+#if defined(JSON_HEDLEY_WARN_UNUSED_RESULT_MSG)
+    #undef JSON_HEDLEY_WARN_UNUSED_RESULT_MSG
+#endif
+#if (JSON_HEDLEY_HAS_CPP_ATTRIBUTE(nodiscard) >= 201907L) /* nodiscard that accepts a message argument */
+    #define JSON_HEDLEY_WARN_UNUSED_RESULT JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[nodiscard]])
+    #define JSON_HEDLEY_WARN_UNUSED_RESULT_MSG(msg) JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[nodiscard(msg)]])
+#elif JSON_HEDLEY_HAS_CPP_ATTRIBUTE(nodiscard) /* nodiscard without a message: msg is dropped */
+    #define JSON_HEDLEY_WARN_UNUSED_RESULT JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[nodiscard]])
+    #define JSON_HEDLEY_WARN_UNUSED_RESULT_MSG(msg) JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[nodiscard]])
+#elif /* GNU-style warn_unused_result attribute: msg is dropped */ \
+    JSON_HEDLEY_HAS_ATTRIBUTE(warn_unused_result) || \
+    JSON_HEDLEY_GCC_VERSION_CHECK(3,4,0) || \
+    JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \
+    JSON_HEDLEY_TI_VERSION_CHECK(15,12,0) || \
+    (JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(4,8,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
+    JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \
+    (JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
+    JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \
+    (JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
+    JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \
+    (JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
+    JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) || \
+    JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \
+    JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \
+    (JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,15,0) && defined(__cplusplus)) || \
+    JSON_HEDLEY_PGI_VERSION_CHECK(17,10,0)
+    #define JSON_HEDLEY_WARN_UNUSED_RESULT __attribute__((__warn_unused_result__))
+    #define JSON_HEDLEY_WARN_UNUSED_RESULT_MSG(msg) __attribute__((__warn_unused_result__))
+#elif defined(_Check_return_) /* SAL */
+    #define JSON_HEDLEY_WARN_UNUSED_RESULT _Check_return_
+    #define JSON_HEDLEY_WARN_UNUSED_RESULT_MSG(msg) _Check_return_
+#else /* unknown compiler: both macros expand to nothing */
+    #define JSON_HEDLEY_WARN_UNUSED_RESULT
+    #define JSON_HEDLEY_WARN_UNUSED_RESULT_MSG(msg)
+#endif
+
+#ifdef JSON_HEDLEY_SENTINEL /* discard any earlier definition */
+#  undef JSON_HEDLEY_SENTINEL
+#endif
+#if /* compilers accepting the GNU sentinel attribute */ \
+    JSON_HEDLEY_HAS_ATTRIBUTE(sentinel) || \
+    JSON_HEDLEY_GCC_VERSION_CHECK(4,0,0) || \
+    JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \
+    JSON_HEDLEY_ARM_VERSION_CHECK(5,4,0)
+#  define JSON_HEDLEY_SENTINEL(position) __attribute__((__sentinel__(position)))
+#else /* otherwise expands to nothing */
+#  define JSON_HEDLEY_SENTINEL(position)
+#endif
+
+#if defined(JSON_HEDLEY_NO_RETURN) /* JSON_HEDLEY_NO_RETURN: per-compiler spelling for marking a function as never returning */
+    #undef JSON_HEDLEY_NO_RETURN
+#endif
+#if JSON_HEDLEY_IAR_VERSION_CHECK(8,0,0) /* IAR and Intel are tested before the language-standard checks */
+    #define JSON_HEDLEY_NO_RETURN __noreturn
+#elif JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0)
+    #define JSON_HEDLEY_NO_RETURN __attribute__((__noreturn__))
+#elif defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L /* C11 keyword */
+    #define JSON_HEDLEY_NO_RETURN _Noreturn
+#elif defined(__cplusplus) && (__cplusplus >= 201103L) /* C++11 standard attribute */
+    #define JSON_HEDLEY_NO_RETURN JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[noreturn]])
+#elif /* GNU-style attribute */ \
+    JSON_HEDLEY_HAS_ATTRIBUTE(noreturn) || \
+    JSON_HEDLEY_GCC_VERSION_CHECK(3,2,0) || \
+    JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,11,0) || \
+    JSON_HEDLEY_ARM_VERSION_CHECK(4,1,0) || \
+    JSON_HEDLEY_IBM_VERSION_CHECK(10,1,0) || \
+    JSON_HEDLEY_TI_VERSION_CHECK(15,12,0) || \
+    (JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(4,8,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
+    JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \
+    (JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
+    JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \
+    (JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
+    JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \
+    (JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
+    JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) || \
+    JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \
+    JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0)
+    #define JSON_HEDLEY_NO_RETURN __attribute__((__noreturn__))
+#elif JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,10,0) /* pragma form */
+    #define JSON_HEDLEY_NO_RETURN _Pragma("does_not_return")
+#elif JSON_HEDLEY_MSVC_VERSION_CHECK(13,10,0)
+    #define JSON_HEDLEY_NO_RETURN __declspec(noreturn)
+#elif JSON_HEDLEY_TI_CL6X_VERSION_CHECK(6,0,0) && defined(__cplusplus) /* pragma form, C++ only */
+    #define JSON_HEDLEY_NO_RETURN _Pragma("FUNC_NEVER_RETURNS;")
+#elif JSON_HEDLEY_COMPCERT_VERSION_CHECK(3,2,0)
+    #define JSON_HEDLEY_NO_RETURN __attribute((noreturn))
+#elif JSON_HEDLEY_PELLES_VERSION_CHECK(9,0,0)
+    #define JSON_HEDLEY_NO_RETURN __declspec(noreturn)
+#else /* unknown compiler: expands to nothing */
+    #define JSON_HEDLEY_NO_RETURN
+#endif
+
+#ifdef JSON_HEDLEY_NO_ESCAPE /* discard any earlier definition */
+#  undef JSON_HEDLEY_NO_ESCAPE
+#endif
+#if JSON_HEDLEY_HAS_ATTRIBUTE(noescape) /* attribute reported as supported */
+#  define JSON_HEDLEY_NO_ESCAPE __attribute__((__noescape__))
+#else /* otherwise expands to nothing */
+#  define JSON_HEDLEY_NO_ESCAPE
+#endif
+
+#if defined(JSON_HEDLEY_UNREACHABLE) /* JSON_HEDLEY_UNREACHABLE / _UNREACHABLE_RETURN / JSON_HEDLEY_ASSUME: optimizer hints; the steps below are order-dependent */
+    #undef JSON_HEDLEY_UNREACHABLE
+#endif
+#if defined(JSON_HEDLEY_UNREACHABLE_RETURN)
+    #undef JSON_HEDLEY_UNREACHABLE_RETURN
+#endif
+#if defined(JSON_HEDLEY_ASSUME)
+    #undef JSON_HEDLEY_ASSUME
+#endif
+#if /* step 1: native ASSUME, where the compiler has one */ \
+    JSON_HEDLEY_MSVC_VERSION_CHECK(13,10,0) || \
+    JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0)
+    #define JSON_HEDLEY_ASSUME(expr) __assume(expr)
+#elif JSON_HEDLEY_HAS_BUILTIN(__builtin_assume)
+    #define JSON_HEDLEY_ASSUME(expr) __builtin_assume(expr)
+#elif \
+    JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,2,0) || \
+    JSON_HEDLEY_TI_CL6X_VERSION_CHECK(4,0,0)
+    #if defined(__cplusplus)
+        #define JSON_HEDLEY_ASSUME(expr) std::_nassert(expr)
+    #else
+        #define JSON_HEDLEY_ASSUME(expr) _nassert(expr)
+    #endif
+#endif
+#if /* step 2: native UNREACHABLE, else derive it from a native ASSUME */ \
+    (JSON_HEDLEY_HAS_BUILTIN(__builtin_unreachable) && (!defined(JSON_HEDLEY_ARM_VERSION))) || \
+    JSON_HEDLEY_GCC_VERSION_CHECK(4,5,0) || \
+    JSON_HEDLEY_PGI_VERSION_CHECK(18,10,0) || \
+    JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \
+    JSON_HEDLEY_IBM_VERSION_CHECK(13,1,5)
+    #define JSON_HEDLEY_UNREACHABLE() __builtin_unreachable()
+#elif defined(JSON_HEDLEY_ASSUME)
+    #define JSON_HEDLEY_UNREACHABLE() JSON_HEDLEY_ASSUME(0)
+#endif
+#if !defined(JSON_HEDLEY_ASSUME) /* step 3: no native ASSUME, so build one from UNREACHABLE or just evaluate expr */
+    #if defined(JSON_HEDLEY_UNREACHABLE)
+        #define JSON_HEDLEY_ASSUME(expr) JSON_HEDLEY_STATIC_CAST(void, ((expr) ? 1 : (JSON_HEDLEY_UNREACHABLE(), 1)))
+    #else
+        #define JSON_HEDLEY_ASSUME(expr) JSON_HEDLEY_STATIC_CAST(void, expr)
+    #endif
+#endif
+#if defined(JSON_HEDLEY_UNREACHABLE) /* step 4: UNREACHABLE_RETURN(value); only the TI and no-UNREACHABLE variants actually return value */
+    #if  \
+        JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,2,0) || \
+        JSON_HEDLEY_TI_CL6X_VERSION_CHECK(4,0,0)
+        #define JSON_HEDLEY_UNREACHABLE_RETURN(value) return (JSON_HEDLEY_STATIC_CAST(void, JSON_HEDLEY_ASSUME(0)), (value))
+    #else
+        #define JSON_HEDLEY_UNREACHABLE_RETURN(value) JSON_HEDLEY_UNREACHABLE()
+    #endif
+#else
+    #define JSON_HEDLEY_UNREACHABLE_RETURN(value) return (value)
+#endif
+#if !defined(JSON_HEDLEY_UNREACHABLE) /* step 5: last resort, UNREACHABLE via the fallback ASSUME */
+    #define JSON_HEDLEY_UNREACHABLE() JSON_HEDLEY_ASSUME(0)
+#endif
+
+JSON_HEDLEY_DIAGNOSTIC_PUSH /* pedantic and variadic-macro warnings are disabled only until the matching POP below */
+#if JSON_HEDLEY_HAS_WARNING("-Wpedantic")
+    #pragma clang diagnostic ignored "-Wpedantic"
+#endif
+#if JSON_HEDLEY_HAS_WARNING("-Wc++98-compat-pedantic") && defined(__cplusplus)
+    #pragma clang diagnostic ignored "-Wc++98-compat-pedantic"
+#endif
+#if JSON_HEDLEY_GCC_HAS_WARNING("-Wvariadic-macros",4,0,0)
+    #if defined(__clang__)
+        #pragma clang diagnostic ignored "-Wvariadic-macros"
+    #elif defined(JSON_HEDLEY_GCC_VERSION)
+        #pragma GCC diagnostic ignored "-Wvariadic-macros"
+    #endif
+#endif
+#if defined(JSON_HEDLEY_NON_NULL) /* JSON_HEDLEY_NON_NULL(...): passes its arguments to the GNU nonnull attribute */
+    #undef JSON_HEDLEY_NON_NULL
+#endif
+#if \
+    JSON_HEDLEY_HAS_ATTRIBUTE(nonnull) || \
+    JSON_HEDLEY_GCC_VERSION_CHECK(3,3,0) || \
+    JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \
+    JSON_HEDLEY_ARM_VERSION_CHECK(4,1,0)
+    #define JSON_HEDLEY_NON_NULL(...) __attribute__((__nonnull__(__VA_ARGS__)))
+#else /* attribute not available: expands to nothing */
+    #define JSON_HEDLEY_NON_NULL(...)
+#endif
+JSON_HEDLEY_DIAGNOSTIC_POP /* restore the diagnostic state saved by the PUSH above */
+
+#if defined(JSON_HEDLEY_PRINTF_FORMAT) /* JSON_HEDLEY_PRINTF_FORMAT(string_idx, first_to_check): printf-style format checking annotation */
+    #undef JSON_HEDLEY_PRINTF_FORMAT
+#endif
+#if defined(__MINGW32__) && JSON_HEDLEY_GCC_HAS_ATTRIBUTE(format,4,4,0) && !defined(__USE_MINGW_ANSI_STDIO) /* MinGW without __USE_MINGW_ANSI_STDIO: ms_printf archetype */
+    #define JSON_HEDLEY_PRINTF_FORMAT(string_idx,first_to_check) __attribute__((__format__(ms_printf, string_idx, first_to_check)))
+#elif defined(__MINGW32__) && JSON_HEDLEY_GCC_HAS_ATTRIBUTE(format,4,4,0) && defined(__USE_MINGW_ANSI_STDIO) /* MinGW with __USE_MINGW_ANSI_STDIO: gnu_printf archetype */
+    #define JSON_HEDLEY_PRINTF_FORMAT(string_idx,first_to_check) __attribute__((__format__(gnu_printf, string_idx, first_to_check)))
+#elif /* generic GNU-style format attribute */ \
+    JSON_HEDLEY_HAS_ATTRIBUTE(format) || \
+    JSON_HEDLEY_GCC_VERSION_CHECK(3,1,0) || \
+    JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \
+    JSON_HEDLEY_ARM_VERSION_CHECK(5,6,0) || \
+    JSON_HEDLEY_IBM_VERSION_CHECK(10,1,0) || \
+    JSON_HEDLEY_TI_VERSION_CHECK(15,12,0) || \
+    (JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(4,8,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
+    JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \
+    (JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
+    JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \
+    (JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
+    JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \
+    (JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
+    JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) || \
+    JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \
+    JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0)
+    #define JSON_HEDLEY_PRINTF_FORMAT(string_idx,first_to_check) __attribute__((__format__(__printf__, string_idx, first_to_check)))
+#elif JSON_HEDLEY_PELLES_VERSION_CHECK(6,0,0) /* Pelles C: __declspec spelling */
+    #define JSON_HEDLEY_PRINTF_FORMAT(string_idx,first_to_check) __declspec(vaformat(printf,string_idx,first_to_check))
+#else /* unknown compiler: expands to nothing */
+    #define JSON_HEDLEY_PRINTF_FORMAT(string_idx,first_to_check)
+#endif
+
+#if defined(JSON_HEDLEY_CONSTEXPR) /* JSON_HEDLEY_CONSTEXPR: discard any earlier definition */
+    #undef JSON_HEDLEY_CONSTEXPR
+#endif
+#if defined(__cplusplus)
+    #if __cplusplus >= 201103L
+        #define JSON_HEDLEY_CONSTEXPR JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_(constexpr) /* C++11 or later: the constexpr keyword, passed through the CPP98_COMPAT wrapper macro */
+    #endif
+#endif
+#if !defined(JSON_HEDLEY_CONSTEXPR) /* not C++, or C++ older than C++11: expand to nothing */
+    #define JSON_HEDLEY_CONSTEXPR
+#endif
+
+#if defined(JSON_HEDLEY_PREDICT) /* branch-prediction hint macros: discard earlier definitions of PREDICT, LIKELY, UNLIKELY and UNPREDICTABLE */
+    #undef JSON_HEDLEY_PREDICT
+#endif
+#if defined(JSON_HEDLEY_LIKELY)
+    #undef JSON_HEDLEY_LIKELY
+#endif
+#if defined(JSON_HEDLEY_UNLIKELY)
+    #undef JSON_HEDLEY_UNLIKELY
+#endif
+#if defined(JSON_HEDLEY_UNPREDICTABLE)
+    #undef JSON_HEDLEY_UNPREDICTABLE
+#endif
+#if JSON_HEDLEY_HAS_BUILTIN(__builtin_unpredictable)
+    #define JSON_HEDLEY_UNPREDICTABLE(expr) __builtin_unpredictable((expr)) /* compiler builtin when reported as available; otherwise defined at the end of this section */
+#endif
+#if \
+  JSON_HEDLEY_HAS_BUILTIN(__builtin_expect_with_probability) || \
+  JSON_HEDLEY_GCC_VERSION_CHECK(9,0,0)
+#  define JSON_HEDLEY_PREDICT(expr, value, probability) __builtin_expect_with_probability(  (expr), (value), (probability)) /* first choice: probability is forwarded to the builtin */
+#  define JSON_HEDLEY_PREDICT_TRUE(expr, probability)   __builtin_expect_with_probability(!!(expr),    1   , (probability))
+#  define JSON_HEDLEY_PREDICT_FALSE(expr, probability)  __builtin_expect_with_probability(!!(expr),    0   , (probability))
+#  define JSON_HEDLEY_LIKELY(expr)                      __builtin_expect                 (!!(expr),    1                  ) /* LIKELY/UNLIKELY take no probability and use plain __builtin_expect */
+#  define JSON_HEDLEY_UNLIKELY(expr)                    __builtin_expect                 (!!(expr),    0                  )
+#elif \
+  JSON_HEDLEY_HAS_BUILTIN(__builtin_expect) || \
+  JSON_HEDLEY_GCC_VERSION_CHECK(3,0,0) || \
+  JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \
+  (JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,15,0) && defined(__cplusplus)) || \
+  JSON_HEDLEY_ARM_VERSION_CHECK(4,1,0) || \
+  JSON_HEDLEY_IBM_VERSION_CHECK(10,1,0) || \
+  JSON_HEDLEY_TI_VERSION_CHECK(15,12,0) || \
+  JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(4,7,0) || \
+  JSON_HEDLEY_TI_CL430_VERSION_CHECK(3,1,0) || \
+  JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,1,0) || \
+  JSON_HEDLEY_TI_CL6X_VERSION_CHECK(6,1,0) || \
+  JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \
+  JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \
+  JSON_HEDLEY_TINYC_VERSION_CHECK(0,9,27) || \
+  JSON_HEDLEY_CRAY_VERSION_CHECK(8,1,0)
+#  define JSON_HEDLEY_PREDICT(expr, expected, probability) \
+    (((probability) >= 0.9) ? __builtin_expect((expr), (expected)) : (JSON_HEDLEY_STATIC_CAST(void, expected), (expr))) /* hint only when probability >= 0.9; otherwise expected is cast to void and expr is evaluated without a hint */
+#  define JSON_HEDLEY_PREDICT_TRUE(expr, probability) \
+    (__extension__ ({ \
+        double hedley_probability_ = (probability); \
+        ((hedley_probability_ >= 0.9) ? __builtin_expect(!!(expr), 1) : ((hedley_probability_ <= 0.1) ? __builtin_expect(!!(expr), 0) : !!(expr))); \
+    })) /* >= 0.9 hints 1, <= 0.1 hints 0, anything in between gives no hint */
+#  define JSON_HEDLEY_PREDICT_FALSE(expr, probability) \
+    (__extension__ ({ \
+        double hedley_probability_ = (probability); \
+        ((hedley_probability_ >= 0.9) ? __builtin_expect(!!(expr), 0) : ((hedley_probability_ <= 0.1) ? __builtin_expect(!!(expr), 1) : !!(expr))); \
+    })) /* mirror image of PREDICT_TRUE: >= 0.9 hints 0, <= 0.1 hints 1 */
+#  define JSON_HEDLEY_LIKELY(expr)   __builtin_expect(!!(expr), 1)
+#  define JSON_HEDLEY_UNLIKELY(expr) __builtin_expect(!!(expr), 0)
+#else /* no builtin detected: the macros only evaluate the expression */
+#  define JSON_HEDLEY_PREDICT(expr, expected, probability) (JSON_HEDLEY_STATIC_CAST(void, expected), (expr))
+#  define JSON_HEDLEY_PREDICT_TRUE(expr, probability) (!!(expr))
+#  define JSON_HEDLEY_PREDICT_FALSE(expr, probability) (!!(expr))
+#  define JSON_HEDLEY_LIKELY(expr) (!!(expr))
+#  define JSON_HEDLEY_UNLIKELY(expr) (!!(expr))
+#endif
+#if !defined(JSON_HEDLEY_UNPREDICTABLE) /* __builtin_unpredictable was not available above */
+    #define JSON_HEDLEY_UNPREDICTABLE(expr) JSON_HEDLEY_PREDICT(expr, 1, 0.5)
+#endif
+
+#if defined(JSON_HEDLEY_MALLOC) /* JSON_HEDLEY_MALLOC: discard any earlier definition, then pick a compiler-specific spelling */
+    #undef JSON_HEDLEY_MALLOC
+#endif
+#if \
+    JSON_HEDLEY_HAS_ATTRIBUTE(malloc) || \
+    JSON_HEDLEY_GCC_VERSION_CHECK(3,1,0) || \
+    JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \
+    JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,11,0) || \
+    JSON_HEDLEY_ARM_VERSION_CHECK(4,1,0) || \
+    JSON_HEDLEY_IBM_VERSION_CHECK(12,1,0) || \
+    JSON_HEDLEY_TI_VERSION_CHECK(15,12,0) || \
+    (JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(4,8,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
+    JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \
+    (JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
+    JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \
+    (JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
+    JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \
+    (JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
+    JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) || \
+    JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \
+    JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0)
+    #define JSON_HEDLEY_MALLOC __attribute__((__malloc__)) /* GNU-style attribute */
+#elif JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,10,0)
+    #define JSON_HEDLEY_MALLOC _Pragma("returns_new_memory") /* SunPro 5.10 (5.11+ is handled by the attribute branch above) */
+#elif JSON_HEDLEY_MSVC_VERSION_CHECK(14, 0, 0)
+    #define JSON_HEDLEY_MALLOC __declspec(restrict) /* MSVC spelling */
+#else
+    #define JSON_HEDLEY_MALLOC /* nothing matched: expands to nothing */
+#endif
+
+#if defined(JSON_HEDLEY_PURE) /* JSON_HEDLEY_PURE: discard any earlier definition, then pick a compiler-specific spelling */
+    #undef JSON_HEDLEY_PURE
+#endif
+#if \
+  JSON_HEDLEY_HAS_ATTRIBUTE(pure) || \
+  JSON_HEDLEY_GCC_VERSION_CHECK(2,96,0) || \
+  JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \
+  JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,11,0) || \
+  JSON_HEDLEY_ARM_VERSION_CHECK(4,1,0) || \
+  JSON_HEDLEY_IBM_VERSION_CHECK(10,1,0) || \
+  JSON_HEDLEY_TI_VERSION_CHECK(15,12,0) || \
+  (JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(4,8,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
+  JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \
+  (JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
+  JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \
+  (JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
+  JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \
+  (JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
+  JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) || \
+  JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \
+  JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \
+  JSON_HEDLEY_PGI_VERSION_CHECK(17,10,0)
+#  define JSON_HEDLEY_PURE __attribute__((__pure__)) /* GNU-style attribute */
+#elif JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,10,0)
+#  define JSON_HEDLEY_PURE _Pragma("does_not_write_global_data") /* SunPro 5.10 (5.11+ is handled by the attribute branch above) */
+#elif defined(__cplusplus) && \
+    ( \
+      JSON_HEDLEY_TI_CL430_VERSION_CHECK(2,0,1) || \
+      JSON_HEDLEY_TI_CL6X_VERSION_CHECK(4,0,0) || \
+      JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) \
+    )
+#  define JSON_HEDLEY_PURE _Pragma("FUNC_IS_PURE;") /* listed TI compilers, C++ only */
+#else
+#  define JSON_HEDLEY_PURE /* nothing matched: expands to nothing */
+#endif
+
+#if defined(JSON_HEDLEY_CONST) /* JSON_HEDLEY_CONST: discard any earlier definition, then pick a compiler-specific spelling */
+    #undef JSON_HEDLEY_CONST
+#endif
+#if \
+    JSON_HEDLEY_HAS_ATTRIBUTE(const) || \
+    JSON_HEDLEY_GCC_VERSION_CHECK(2,5,0) || \
+    JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \
+    JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,11,0) || \
+    JSON_HEDLEY_ARM_VERSION_CHECK(4,1,0) || \
+    JSON_HEDLEY_IBM_VERSION_CHECK(10,1,0) || \
+    JSON_HEDLEY_TI_VERSION_CHECK(15,12,0) || \
+    (JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(4,8,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
+    JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \
+    (JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
+    JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \
+    (JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
+    JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \
+    (JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
+    JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) || \
+    JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \
+    JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \
+    JSON_HEDLEY_PGI_VERSION_CHECK(17,10,0)
+    #define JSON_HEDLEY_CONST __attribute__((__const__)) /* GNU-style attribute */
+#elif \
+    JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,10,0)
+    #define JSON_HEDLEY_CONST _Pragma("no_side_effect") /* SunPro 5.10 (5.11+ is handled by the attribute branch above) */
+#else
+    #define JSON_HEDLEY_CONST JSON_HEDLEY_PURE /* nothing matched: reuse whatever JSON_HEDLEY_PURE expands to */
+#endif
+
+#if defined(JSON_HEDLEY_RESTRICT) /* JSON_HEDLEY_RESTRICT: discard any earlier definition, then pick the restrict spelling */
+    #undef JSON_HEDLEY_RESTRICT
+#endif
+#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) && !defined(__cplusplus)
+    #define JSON_HEDLEY_RESTRICT restrict /* C99 or later, not C++: the standard keyword */
+#elif \
+    JSON_HEDLEY_GCC_VERSION_CHECK(3,1,0) || \
+    JSON_HEDLEY_MSVC_VERSION_CHECK(14,0,0) || \
+    JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \
+    JSON_HEDLEY_ARM_VERSION_CHECK(4,1,0) || \
+    JSON_HEDLEY_IBM_VERSION_CHECK(10,1,0) || \
+    JSON_HEDLEY_PGI_VERSION_CHECK(17,10,0) || \
+    JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \
+    JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,2,4) || \
+    JSON_HEDLEY_TI_CL6X_VERSION_CHECK(8,1,0) || \
+    JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \
+    (JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,14,0) && defined(__cplusplus)) || \
+    JSON_HEDLEY_IAR_VERSION_CHECK(8,0,0) || \
+    defined(__clang__)
+    #define JSON_HEDLEY_RESTRICT __restrict /* compiler extension keyword for the compilers listed above */
+#elif JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,3,0) && !defined(__cplusplus)
+    #define JSON_HEDLEY_RESTRICT _Restrict /* SunPro 5.3+ in C mode */
+#else
+    #define JSON_HEDLEY_RESTRICT /* nothing matched: expands to nothing */
+#endif
+
+#if defined(JSON_HEDLEY_INLINE) /* JSON_HEDLEY_INLINE: discard any earlier definition, then pick the inline spelling */
+    #undef JSON_HEDLEY_INLINE
+#endif
+#if \
+    (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)) || \
+    (defined(__cplusplus) && (__cplusplus >= 199711L))
+    #define JSON_HEDLEY_INLINE inline /* C99 or later, or C++ (199711L or later): the standard keyword */
+#elif \
+    defined(JSON_HEDLEY_GCC_VERSION) || \
+    JSON_HEDLEY_ARM_VERSION_CHECK(6,2,0)
+    #define JSON_HEDLEY_INLINE __inline__ /* GCC (any version) or ARM 6.2+ */
+#elif \
+    JSON_HEDLEY_MSVC_VERSION_CHECK(12,0,0) || \
+    JSON_HEDLEY_ARM_VERSION_CHECK(4,1,0) || \
+    JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(5,1,0) || \
+    JSON_HEDLEY_TI_CL430_VERSION_CHECK(3,1,0) || \
+    JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,2,0) || \
+    JSON_HEDLEY_TI_CL6X_VERSION_CHECK(8,0,0) || \
+    JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \
+    JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0)
+    #define JSON_HEDLEY_INLINE __inline /* MSVC, older ARM and the listed TI compilers */
+#else
+    #define JSON_HEDLEY_INLINE /* nothing matched: expands to nothing */
+#endif
+
+#if defined(JSON_HEDLEY_ALWAYS_INLINE) /* JSON_HEDLEY_ALWAYS_INLINE: discard any earlier definition, then pick a compiler-specific spelling */
+    #undef JSON_HEDLEY_ALWAYS_INLINE
+#endif
+#if \
+  JSON_HEDLEY_HAS_ATTRIBUTE(always_inline) || \
+  JSON_HEDLEY_GCC_VERSION_CHECK(4,0,0) || \
+  JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \
+  JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,11,0) || \
+  JSON_HEDLEY_ARM_VERSION_CHECK(4,1,0) || \
+  JSON_HEDLEY_IBM_VERSION_CHECK(10,1,0) || \
+  JSON_HEDLEY_TI_VERSION_CHECK(15,12,0) || \
+  (JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(4,8,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
+  JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \
+  (JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
+  JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \
+  (JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
+  JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \
+  (JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
+  JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) || \
+  JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \
+  JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0)
+#  define JSON_HEDLEY_ALWAYS_INLINE __attribute__((__always_inline__)) JSON_HEDLEY_INLINE /* GNU-style attribute followed by the plain inline spelling */
+#elif JSON_HEDLEY_MSVC_VERSION_CHECK(12,0,0)
+#  define JSON_HEDLEY_ALWAYS_INLINE __forceinline /* MSVC keyword */
+#elif defined(__cplusplus) && \
+    ( \
+      JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \
+      JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \
+      JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \
+      JSON_HEDLEY_TI_CL6X_VERSION_CHECK(6,1,0) || \
+      JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \
+      JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) \
+    )
+#  define JSON_HEDLEY_ALWAYS_INLINE _Pragma("FUNC_ALWAYS_INLINE;") /* listed TI compilers, C++ only */
+#elif JSON_HEDLEY_IAR_VERSION_CHECK(8,0,0)
+#  define JSON_HEDLEY_ALWAYS_INLINE _Pragma("inline=forced") /* IAR 8.0+ pragma */
+#else
+#  define JSON_HEDLEY_ALWAYS_INLINE JSON_HEDLEY_INLINE /* nothing matched: fall back to the plain inline spelling */
+#endif
+
+#if defined(JSON_HEDLEY_NEVER_INLINE) /* JSON_HEDLEY_NEVER_INLINE: discard any earlier definition, then pick a compiler-specific spelling */
+    #undef JSON_HEDLEY_NEVER_INLINE
+#endif
+#if \
+    JSON_HEDLEY_HAS_ATTRIBUTE(noinline) || \
+    JSON_HEDLEY_GCC_VERSION_CHECK(4,0,0) || \
+    JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \
+    JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,11,0) || \
+    JSON_HEDLEY_ARM_VERSION_CHECK(4,1,0) || \
+    JSON_HEDLEY_IBM_VERSION_CHECK(10,1,0) || \
+    JSON_HEDLEY_TI_VERSION_CHECK(15,12,0) || \
+    (JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(4,8,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
+    JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \
+    (JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
+    JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \
+    (JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
+    JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \
+    (JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
+    JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) || \
+    JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \
+    JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0)
+    #define JSON_HEDLEY_NEVER_INLINE __attribute__((__noinline__)) /* GNU-style attribute */
+#elif JSON_HEDLEY_MSVC_VERSION_CHECK(13,10,0)
+    #define JSON_HEDLEY_NEVER_INLINE __declspec(noinline) /* MSVC spelling */
+#elif JSON_HEDLEY_PGI_VERSION_CHECK(10,2,0)
+    #define JSON_HEDLEY_NEVER_INLINE _Pragma("noinline") /* PGI pragma */
+#elif JSON_HEDLEY_TI_CL6X_VERSION_CHECK(6,0,0) && defined(__cplusplus)
+    #define JSON_HEDLEY_NEVER_INLINE _Pragma("FUNC_CANNOT_INLINE;") /* TI CL6X, C++ only */
+#elif JSON_HEDLEY_IAR_VERSION_CHECK(8,0,0)
+    #define JSON_HEDLEY_NEVER_INLINE _Pragma("inline=never") /* IAR 8.0+ pragma */
+#elif JSON_HEDLEY_COMPCERT_VERSION_CHECK(3,2,0)
+    #define JSON_HEDLEY_NEVER_INLINE __attribute((noinline)) /* CompCert: note the __attribute spelling without trailing underscores */
+#elif JSON_HEDLEY_PELLES_VERSION_CHECK(9,0,0)
+    #define JSON_HEDLEY_NEVER_INLINE __declspec(noinline) /* Pelles C 9.0+ */
+#else
+    #define JSON_HEDLEY_NEVER_INLINE /* nothing matched: expands to nothing */
+#endif
+
+#if defined(JSON_HEDLEY_PRIVATE) /* symbol-visibility macros PRIVATE / PUBLIC / IMPORT: discard earlier definitions */
+    #undef JSON_HEDLEY_PRIVATE
+#endif
+#if defined(JSON_HEDLEY_PUBLIC)
+    #undef JSON_HEDLEY_PUBLIC
+#endif
+#if defined(JSON_HEDLEY_IMPORT)
+    #undef JSON_HEDLEY_IMPORT
+#endif
+#if defined(_WIN32) || defined(__CYGWIN__) /* Windows and Cygwin: PRIVATE is empty, PUBLIC/IMPORT use __declspec */
+#  define JSON_HEDLEY_PRIVATE
+#  define JSON_HEDLEY_PUBLIC   __declspec(dllexport)
+#  define JSON_HEDLEY_IMPORT   __declspec(dllimport)
+#else
+#  if \
+    JSON_HEDLEY_HAS_ATTRIBUTE(visibility) || \
+    JSON_HEDLEY_GCC_VERSION_CHECK(3,3,0) || \
+    JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,11,0) || \
+    JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \
+    JSON_HEDLEY_ARM_VERSION_CHECK(4,1,0) || \
+    JSON_HEDLEY_IBM_VERSION_CHECK(13,1,0) || \
+    ( \
+      defined(__TI_EABI__) && \
+      ( \
+        (JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
+        JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) \
+      ) \
+    )
+#    define JSON_HEDLEY_PRIVATE __attribute__((__visibility__("hidden"))) /* visibility attribute supported: hidden vs. default */
+#    define JSON_HEDLEY_PUBLIC  __attribute__((__visibility__("default")))
+#  else
+#    define JSON_HEDLEY_PRIVATE /* no visibility attribute: both expand to nothing */
+#    define JSON_HEDLEY_PUBLIC
+#  endif
+#  define JSON_HEDLEY_IMPORT    extern /* on every non-Windows target IMPORT is plain extern */
+#endif
+
+#if defined(JSON_HEDLEY_NO_THROW) /* JSON_HEDLEY_NO_THROW: discard any earlier definition, then pick a compiler-specific spelling */
+    #undef JSON_HEDLEY_NO_THROW
+#endif
+#if \
+    JSON_HEDLEY_HAS_ATTRIBUTE(nothrow) || \
+    JSON_HEDLEY_GCC_VERSION_CHECK(3,3,0) || \
+    JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0)
+    #define JSON_HEDLEY_NO_THROW __attribute__((__nothrow__)) /* GNU-style attribute */
+#elif \
+    JSON_HEDLEY_MSVC_VERSION_CHECK(13,1,0) || \
+    JSON_HEDLEY_ARM_VERSION_CHECK(4,1,0)
+    #define JSON_HEDLEY_NO_THROW __declspec(nothrow) /* MSVC and ARM use the __declspec spelling */
+#else
+    #define JSON_HEDLEY_NO_THROW /* nothing matched: expands to nothing */
+#endif
+
+#if defined(JSON_HEDLEY_FALL_THROUGH) /* JSON_HEDLEY_FALL_THROUGH: discard any earlier definition; spellings are tried in the order below */
+    #undef JSON_HEDLEY_FALL_THROUGH
+#endif
+#if \
+    JSON_HEDLEY_HAS_ATTRIBUTE(fallthrough) || \
+    JSON_HEDLEY_GCC_VERSION_CHECK(7,0,0)
+    #define JSON_HEDLEY_FALL_THROUGH __attribute__((__fallthrough__)) /* GNU-style attribute */
+#elif JSON_HEDLEY_HAS_CPP_ATTRIBUTE_NS(clang,fallthrough)
+    #define JSON_HEDLEY_FALL_THROUGH JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[clang::fallthrough]]) /* Clang-namespaced C++ attribute */
+#elif JSON_HEDLEY_HAS_CPP_ATTRIBUTE(fallthrough)
+    #define JSON_HEDLEY_FALL_THROUGH JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[fallthrough]]) /* un-namespaced C++ attribute */
+#elif defined(__fallthrough) /* SAL */
+    #define JSON_HEDLEY_FALL_THROUGH __fallthrough
+#else
+    #define JSON_HEDLEY_FALL_THROUGH /* nothing matched: expands to nothing */
+#endif
+
+#if defined(JSON_HEDLEY_RETURNS_NON_NULL) /* JSON_HEDLEY_RETURNS_NON_NULL: discard any earlier definition, then pick a spelling */
+    #undef JSON_HEDLEY_RETURNS_NON_NULL
+#endif
+#if \
+    JSON_HEDLEY_HAS_ATTRIBUTE(returns_nonnull) || \
+    JSON_HEDLEY_GCC_VERSION_CHECK(4,9,0)
+    #define JSON_HEDLEY_RETURNS_NON_NULL __attribute__((__returns_nonnull__)) /* GNU-style attribute */
+#elif defined(_Ret_notnull_) /* SAL */
+    #define JSON_HEDLEY_RETURNS_NON_NULL _Ret_notnull_
+#else
+    #define JSON_HEDLEY_RETURNS_NON_NULL /* nothing matched: expands to nothing */
+#endif
+
+#if defined(JSON_HEDLEY_ARRAY_PARAM) /* JSON_HEDLEY_ARRAY_PARAM(name): discard any earlier definition */
+    #undef JSON_HEDLEY_ARRAY_PARAM
+#endif
+#if \
+    defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) && \
+    !defined(__STDC_NO_VLA__) && \
+    !defined(__cplusplus) && \
+    !defined(JSON_HEDLEY_PGI_VERSION) && \
+    !defined(JSON_HEDLEY_TINYC_VERSION)
+    #define JSON_HEDLEY_ARRAY_PARAM(name) (name) /* C99 or later, __STDC_NO_VLA__ not defined, not C++, not PGI, not TinyC: keep the name */
+#else
+    #define JSON_HEDLEY_ARRAY_PARAM(name) /* otherwise the name is dropped and the macro expands to nothing */
+#endif
+
+#if defined(JSON_HEDLEY_IS_CONSTANT) /* constant-detection macros: discard earlier definitions of IS_CONSTANT, REQUIRE_CONSTEXPR and IS_CONSTEXPR_ */
+    #undef JSON_HEDLEY_IS_CONSTANT
+#endif
+#if defined(JSON_HEDLEY_REQUIRE_CONSTEXPR)
+    #undef JSON_HEDLEY_REQUIRE_CONSTEXPR
+#endif
+/* JSON_HEDLEY_IS_CONSTEXPR_ is for
+   HEDLEY INTERNAL USE ONLY.  API subject to change without notice. */
+#if defined(JSON_HEDLEY_IS_CONSTEXPR_)
+    #undef JSON_HEDLEY_IS_CONSTEXPR_
+#endif
+#if \
+    JSON_HEDLEY_HAS_BUILTIN(__builtin_constant_p) || \
+    JSON_HEDLEY_GCC_VERSION_CHECK(3,4,0) || \
+    JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \
+    JSON_HEDLEY_TINYC_VERSION_CHECK(0,9,19) || \
+    JSON_HEDLEY_ARM_VERSION_CHECK(4,1,0) || \
+    JSON_HEDLEY_IBM_VERSION_CHECK(13,1,0) || \
+    JSON_HEDLEY_TI_CL6X_VERSION_CHECK(6,1,0) || \
+    (JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,10,0) && !defined(__cplusplus)) || \
+    JSON_HEDLEY_CRAY_VERSION_CHECK(8,1,0)
+    #define JSON_HEDLEY_IS_CONSTANT(expr) __builtin_constant_p(expr) /* preferred implementation; when this branch is not taken IS_CONSTANT is defined later in this section */
+#endif
+#if !defined(__cplusplus) /* JSON_HEDLEY_IS_CONSTEXPR_(expr) is only provided for C; the first implementation the compiler supports is used */
+#  if \
+       JSON_HEDLEY_HAS_BUILTIN(__builtin_types_compatible_p) || \
+       JSON_HEDLEY_GCC_VERSION_CHECK(3,4,0) || \
+       JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \
+       JSON_HEDLEY_IBM_VERSION_CHECK(13,1,0) || \
+       JSON_HEDLEY_CRAY_VERSION_CHECK(8,1,0) || \
+       JSON_HEDLEY_ARM_VERSION_CHECK(5,4,0) || \
+       JSON_HEDLEY_TINYC_VERSION_CHECK(0,9,24)
+#if defined(__INTPTR_TYPE__) /* all variants test the type of (1 ? (void*) ((expr) * 0) : (int*) 0): int* when (expr) * 0 is a null pointer constant, void* otherwise */
+    #define JSON_HEDLEY_IS_CONSTEXPR_(expr) __builtin_types_compatible_p(__typeof__((1 ? (void*) ((__INTPTR_TYPE__) ((expr) * 0)) : (int*) 0)), int*)
+#else
+    #include <stdint.h>
+    #define JSON_HEDLEY_IS_CONSTEXPR_(expr) __builtin_types_compatible_p(__typeof__((1 ? (void*) ((intptr_t) ((expr) * 0)) : (int*) 0)), int*)
+#endif
+#  elif \
+       ( \
+          defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) && \
+          !defined(JSON_HEDLEY_SUNPRO_VERSION) && \
+          !defined(JSON_HEDLEY_PGI_VERSION) && \
+          !defined(JSON_HEDLEY_IAR_VERSION)) || \
+       JSON_HEDLEY_HAS_EXTENSION(c_generic_selections) || \
+       JSON_HEDLEY_GCC_VERSION_CHECK(4,9,0) || \
+       JSON_HEDLEY_INTEL_VERSION_CHECK(17,0,0) || \
+       JSON_HEDLEY_IBM_VERSION_CHECK(12,1,0) || \
+       JSON_HEDLEY_ARM_VERSION_CHECK(5,3,0)
+#if defined(__INTPTR_TYPE__) /* same test, classified with _Generic instead of __builtin_types_compatible_p */
+    #define JSON_HEDLEY_IS_CONSTEXPR_(expr) _Generic((1 ? (void*) ((__INTPTR_TYPE__) ((expr) * 0)) : (int*) 0), int*: 1, void*: 0)
+#else
+    #include <stdint.h>
+    #define JSON_HEDLEY_IS_CONSTEXPR_(expr) _Generic((1 ? (void*) ((intptr_t) ((expr) * 0)) : (int*) 0), int*: 1, void*: 0) /* must cast (expr) * 0 like the sibling variants; "(intptr_t) * 0" ignored expr and dereferenced 0 */
+#endif
+#  elif \
+       defined(JSON_HEDLEY_GCC_VERSION) || \
+       defined(JSON_HEDLEY_INTEL_VERSION) || \
+       defined(JSON_HEDLEY_TINYC_VERSION) || \
+       defined(JSON_HEDLEY_TI_ARMCL_VERSION) || \
+       JSON_HEDLEY_TI_CL430_VERSION_CHECK(18,12,0) || \
+       defined(JSON_HEDLEY_TI_CL2000_VERSION) || \
+       defined(JSON_HEDLEY_TI_CL6X_VERSION) || \
+       defined(JSON_HEDLEY_TI_CL7X_VERSION) || \
+       defined(JSON_HEDLEY_TI_CLPRU_VERSION) || \
+       defined(__clang__)
+#    define JSON_HEDLEY_IS_CONSTEXPR_(expr) ( \
+        sizeof(void) != \
+        sizeof(*( \
+                  1 ? \
+                  ((void*) ((expr) * 0L) ) : \
+((struct { char v[sizeof(void) * 2]; } *) 1) \
+                ) \
+              ) \
+                                            ) /* last resort: compares sizeof(void) with the size of the ?: result's pointee */
+#  endif
+#endif
+#if defined(JSON_HEDLEY_IS_CONSTEXPR_) /* an IS_CONSTEXPR_ implementation was selected above */
+    #if !defined(JSON_HEDLEY_IS_CONSTANT) /* no __builtin_constant_p: reuse IS_CONSTEXPR_ */
+        #define JSON_HEDLEY_IS_CONSTANT(expr) JSON_HEDLEY_IS_CONSTEXPR_(expr)
+    #endif
+    #define JSON_HEDLEY_REQUIRE_CONSTEXPR(expr) (JSON_HEDLEY_IS_CONSTEXPR_(expr) ? (expr) : (-1)) /* yields expr when it is detected as constant, -1 otherwise */
+#else
+    #if !defined(JSON_HEDLEY_IS_CONSTANT) /* nothing available: IS_CONSTANT always reports 0 */
+        #define JSON_HEDLEY_IS_CONSTANT(expr) (0)
+    #endif
+    #define JSON_HEDLEY_REQUIRE_CONSTEXPR(expr) (expr) /* no check possible: expr is passed through unchanged */
+#endif
+
+#if defined(JSON_HEDLEY_BEGIN_C_DECLS) /* C-linkage helpers BEGIN_C_DECLS / END_C_DECLS / C_DECL: discard earlier definitions */
+    #undef JSON_HEDLEY_BEGIN_C_DECLS
+#endif
+#if defined(JSON_HEDLEY_END_C_DECLS)
+    #undef JSON_HEDLEY_END_C_DECLS
+#endif
+#if defined(JSON_HEDLEY_C_DECL)
+    #undef JSON_HEDLEY_C_DECL
+#endif
+#if defined(__cplusplus)
+    #define JSON_HEDLEY_BEGIN_C_DECLS extern "C" { /* opens a linkage block that END_C_DECLS closes */
+    #define JSON_HEDLEY_END_C_DECLS }
+    #define JSON_HEDLEY_C_DECL extern "C" /* single-declaration form */
+#else /* not C++: all three expand to nothing */
+    #define JSON_HEDLEY_BEGIN_C_DECLS
+    #define JSON_HEDLEY_END_C_DECLS
+    #define JSON_HEDLEY_C_DECL
+#endif
+
+#if defined(JSON_HEDLEY_STATIC_ASSERT) /* JSON_HEDLEY_STATIC_ASSERT(expr, message): discard any earlier definition */
+    #undef JSON_HEDLEY_STATIC_ASSERT
+#endif
+#if \
+  !defined(__cplusplus) && ( \
+      (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)) || \
+      JSON_HEDLEY_HAS_FEATURE(c_static_assert) || \
+      JSON_HEDLEY_GCC_VERSION_CHECK(6,0,0) || \
+      JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \
+      defined(_Static_assert) \
+    )
+#  define JSON_HEDLEY_STATIC_ASSERT(expr, message) _Static_assert(expr, message) /* C path */
+#elif \
+  (defined(__cplusplus) && (__cplusplus >= 201103L)) || \
+  JSON_HEDLEY_MSVC_VERSION_CHECK(16,0,0)
+#  define JSON_HEDLEY_STATIC_ASSERT(expr, message) JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_(static_assert(expr, message)) /* C++11 or later, or MSVC 16.0+ */
+#else
+#  define JSON_HEDLEY_STATIC_ASSERT(expr, message) /* no facility detected: expands to nothing, so the condition is not checked */
+#endif
+
+#if defined(JSON_HEDLEY_NULL) /* JSON_HEDLEY_NULL: discard any earlier definition, then pick a null-pointer spelling */
+    #undef JSON_HEDLEY_NULL
+#endif
+#if defined(__cplusplus)
+    #if __cplusplus >= 201103L
+        #define JSON_HEDLEY_NULL JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_(nullptr) /* C++11 or later */
+    #elif defined(NULL)
+        #define JSON_HEDLEY_NULL NULL /* older C++ with NULL already defined */
+    #else
+        #define JSON_HEDLEY_NULL JSON_HEDLEY_STATIC_CAST(void*, 0) /* older C++ without NULL */
+    #endif
+#elif defined(NULL)
+    #define JSON_HEDLEY_NULL NULL /* C with NULL already defined */
+#else
+    #define JSON_HEDLEY_NULL ((void*) 0) /* C without NULL */
+#endif
+
+#if defined(JSON_HEDLEY_MESSAGE) /* JSON_HEDLEY_MESSAGE(msg): discard any earlier definition, then pick a pragma spelling */
+    #undef JSON_HEDLEY_MESSAGE
+#endif
+#if JSON_HEDLEY_HAS_WARNING("-Wunknown-pragmas")
+#  define JSON_HEDLEY_MESSAGE(msg) \
+    JSON_HEDLEY_DIAGNOSTIC_PUSH \
+    JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS \
+    JSON_HEDLEY_PRAGMA(message msg) \
+    JSON_HEDLEY_DIAGNOSTIC_POP /* message pragma wrapped in a push / disable-unknown-pragmas / pop sequence */
+#elif \
+  JSON_HEDLEY_GCC_VERSION_CHECK(4,4,0) || \
+  JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0)
+#  define JSON_HEDLEY_MESSAGE(msg) JSON_HEDLEY_PRAGMA(message msg) /* same pragma without the wrapper */
+#elif JSON_HEDLEY_CRAY_VERSION_CHECK(5,0,0)
+#  define JSON_HEDLEY_MESSAGE(msg) JSON_HEDLEY_PRAGMA(_CRI message msg) /* Cray spelling */
+#elif JSON_HEDLEY_IAR_VERSION_CHECK(8,0,0)
+#  define JSON_HEDLEY_MESSAGE(msg) JSON_HEDLEY_PRAGMA(message(msg)) /* IAR: parenthesised form */
+#elif JSON_HEDLEY_PELLES_VERSION_CHECK(2,0,0)
+#  define JSON_HEDLEY_MESSAGE(msg) JSON_HEDLEY_PRAGMA(message(msg)) /* Pelles C: parenthesised form */
+#else
+#  define JSON_HEDLEY_MESSAGE(msg) /* nothing matched: expands to nothing */
+#endif
+
+#if defined(JSON_HEDLEY_WARNING) /* JSON_HEDLEY_WARNING(msg): discard any earlier definition, then pick a pragma spelling */
+    #undef JSON_HEDLEY_WARNING
+#endif
+#if JSON_HEDLEY_HAS_WARNING("-Wunknown-pragmas")
+#  define JSON_HEDLEY_WARNING(msg) \
+    JSON_HEDLEY_DIAGNOSTIC_PUSH \
+    JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS \
+    JSON_HEDLEY_PRAGMA(clang warning msg) \
+    JSON_HEDLEY_DIAGNOSTIC_POP /* "clang warning" pragma wrapped in a push / disable-unknown-pragmas / pop sequence */
+#elif \
+  JSON_HEDLEY_GCC_VERSION_CHECK(4,8,0) || \
+  JSON_HEDLEY_PGI_VERSION_CHECK(18,4,0) || \
+  JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0)
+#  define JSON_HEDLEY_WARNING(msg) JSON_HEDLEY_PRAGMA(GCC warning msg) /* "GCC warning" pragma */
+#elif JSON_HEDLEY_MSVC_VERSION_CHECK(15,0,0)
+#  define JSON_HEDLEY_WARNING(msg) JSON_HEDLEY_PRAGMA(message(msg)) /* MSVC: uses the message pragma */
+#else
+#  define JSON_HEDLEY_WARNING(msg) JSON_HEDLEY_MESSAGE(msg) /* nothing matched: defer to JSON_HEDLEY_MESSAGE */
+#endif
+
+#if defined(JSON_HEDLEY_REQUIRE) /* JSON_HEDLEY_REQUIRE(expr) / JSON_HEDLEY_REQUIRE_MSG(expr, msg): discard earlier definitions */
+    #undef JSON_HEDLEY_REQUIRE
+#endif
+#if defined(JSON_HEDLEY_REQUIRE_MSG)
+    #undef JSON_HEDLEY_REQUIRE_MSG
+#endif
+#if JSON_HEDLEY_HAS_ATTRIBUTE(diagnose_if)
+#  if JSON_HEDLEY_HAS_WARNING("-Wgcc-compat")
+#    define JSON_HEDLEY_REQUIRE(expr) \
+    JSON_HEDLEY_DIAGNOSTIC_PUSH \
+    _Pragma("clang diagnostic ignored \"-Wgcc-compat\"") \
+    __attribute__((diagnose_if(!(expr), #expr, "error"))) \
+    JSON_HEDLEY_DIAGNOSTIC_POP /* diagnose_if with the stringified expression as message, -Wgcc-compat ignored around it */
+#    define JSON_HEDLEY_REQUIRE_MSG(expr,msg) \
+    JSON_HEDLEY_DIAGNOSTIC_PUSH \
+    _Pragma("clang diagnostic ignored \"-Wgcc-compat\"") \
+    __attribute__((diagnose_if(!(expr), msg, "error"))) \
+    JSON_HEDLEY_DIAGNOSTIC_POP /* same, with a caller-supplied message */
+#  else
+#    define JSON_HEDLEY_REQUIRE(expr) __attribute__((diagnose_if(!(expr), #expr, "error"))) /* -Wgcc-compat not reported: bare attribute */
+#    define JSON_HEDLEY_REQUIRE_MSG(expr,msg) __attribute__((diagnose_if(!(expr), msg, "error")))
+#  endif
+#else /* diagnose_if not supported: both macros expand to nothing */
+#  define JSON_HEDLEY_REQUIRE(expr)
+#  define JSON_HEDLEY_REQUIRE_MSG(expr,msg)
+#endif
+
+/* JSON_HEDLEY_FLAGS: expands to the flag_enum attribute when the compiler
+ * reports support for it, and to nothing otherwise, so that the macro is
+ * always defined and can be used unconditionally. */
+#if defined(JSON_HEDLEY_FLAGS)
+    #undef JSON_HEDLEY_FLAGS
+#endif
+#if JSON_HEDLEY_HAS_ATTRIBUTE(flag_enum)
+    #define JSON_HEDLEY_FLAGS __attribute__((__flag_enum__))
+#else
+    #define JSON_HEDLEY_FLAGS
+#endif
+
+#if defined(JSON_HEDLEY_FLAGS_CAST) /* JSON_HEDLEY_FLAGS_CAST(T, expr): discard any earlier definition */
+    #undef JSON_HEDLEY_FLAGS_CAST
+#endif
+#if JSON_HEDLEY_INTEL_VERSION_CHECK(19,0,0)
+#  define JSON_HEDLEY_FLAGS_CAST(T, expr) (__extension__ ({ \
+        JSON_HEDLEY_DIAGNOSTIC_PUSH \
+        _Pragma("warning(disable:188)") \
+        ((T) (expr)); \
+        JSON_HEDLEY_DIAGNOSTIC_POP \
+    })) /* Intel 19.0+: C-style cast inside a statement expression with warning 188 disabled */
+#else
+#  define JSON_HEDLEY_FLAGS_CAST(T, expr) JSON_HEDLEY_STATIC_CAST(T, expr) /* everywhere else: defer to JSON_HEDLEY_STATIC_CAST */
+#endif
+
+#if defined(JSON_HEDLEY_EMPTY_BASES) /* JSON_HEDLEY_EMPTY_BASES: discard any earlier definition */
+    #undef JSON_HEDLEY_EMPTY_BASES
+#endif
+#if JSON_HEDLEY_MSVC_VERSION_CHECK(19,0,23918) && !JSON_HEDLEY_MSVC_VERSION_CHECK(20,0,0)
+    #define JSON_HEDLEY_EMPTY_BASES __declspec(empty_bases) /* only for MSVC versions from 19.0.23918 up to, but not including, 20.0.0 */
+#else
+    #define JSON_HEDLEY_EMPTY_BASES /* any other compiler or version: expands to nothing */
+#endif
+
+/* Remaining macros are deprecated. */
+
+#if defined(JSON_HEDLEY_GCC_NOT_CLANG_VERSION_CHECK) /* deprecated: JSON_HEDLEY_GCC_NOT_CLANG_VERSION_CHECK(major, minor, patch) */
+    #undef JSON_HEDLEY_GCC_NOT_CLANG_VERSION_CHECK
+#endif
+#if defined(__clang__)
+    #define JSON_HEDLEY_GCC_NOT_CLANG_VERSION_CHECK(major,minor,patch) (0) /* always false under Clang */
+#else
+    #define JSON_HEDLEY_GCC_NOT_CLANG_VERSION_CHECK(major,minor,patch) JSON_HEDLEY_GCC_VERSION_CHECK(major,minor,patch) /* otherwise identical to the GCC version check */
+#endif
+
+#if defined(JSON_HEDLEY_CLANG_HAS_ATTRIBUTE) /* deprecated alias; each JSON_HEDLEY_CLANG_HAS_* below forwards to the JSON_HEDLEY_HAS_* macro of the same suffix */
+    #undef JSON_HEDLEY_CLANG_HAS_ATTRIBUTE
+#endif
+#define JSON_HEDLEY_CLANG_HAS_ATTRIBUTE(attribute) JSON_HEDLEY_HAS_ATTRIBUTE(attribute)
+
+#if defined(JSON_HEDLEY_CLANG_HAS_CPP_ATTRIBUTE) /* deprecated alias of JSON_HEDLEY_HAS_CPP_ATTRIBUTE */
+    #undef JSON_HEDLEY_CLANG_HAS_CPP_ATTRIBUTE
+#endif
+#define JSON_HEDLEY_CLANG_HAS_CPP_ATTRIBUTE(attribute) JSON_HEDLEY_HAS_CPP_ATTRIBUTE(attribute)
+
+#if defined(JSON_HEDLEY_CLANG_HAS_BUILTIN) /* deprecated alias of JSON_HEDLEY_HAS_BUILTIN */
+    #undef JSON_HEDLEY_CLANG_HAS_BUILTIN
+#endif
+#define JSON_HEDLEY_CLANG_HAS_BUILTIN(builtin) JSON_HEDLEY_HAS_BUILTIN(builtin)
+
+#if defined(JSON_HEDLEY_CLANG_HAS_FEATURE) /* deprecated alias of JSON_HEDLEY_HAS_FEATURE */
+    #undef JSON_HEDLEY_CLANG_HAS_FEATURE
+#endif
+#define JSON_HEDLEY_CLANG_HAS_FEATURE(feature) JSON_HEDLEY_HAS_FEATURE(feature)
+
+#if defined(JSON_HEDLEY_CLANG_HAS_EXTENSION) /* deprecated alias of JSON_HEDLEY_HAS_EXTENSION */
+    #undef JSON_HEDLEY_CLANG_HAS_EXTENSION
+#endif
+#define JSON_HEDLEY_CLANG_HAS_EXTENSION(extension) JSON_HEDLEY_HAS_EXTENSION(extension)
+
+#if defined(JSON_HEDLEY_CLANG_HAS_DECLSPEC_ATTRIBUTE) /* deprecated alias of JSON_HEDLEY_HAS_DECLSPEC_ATTRIBUTE; the guard must name the macro defined below */
+    #undef JSON_HEDLEY_CLANG_HAS_DECLSPEC_ATTRIBUTE
+#endif
+#define JSON_HEDLEY_CLANG_HAS_DECLSPEC_ATTRIBUTE(attribute) JSON_HEDLEY_HAS_DECLSPEC_ATTRIBUTE(attribute)
+
+#if defined(JSON_HEDLEY_CLANG_HAS_WARNING) /* deprecated alias of JSON_HEDLEY_HAS_WARNING */
+    #undef JSON_HEDLEY_CLANG_HAS_WARNING
+#endif
+#define JSON_HEDLEY_CLANG_HAS_WARNING(warning) JSON_HEDLEY_HAS_WARNING(warning)
+
+#endif /* !defined(JSON_HEDLEY_VERSION) || (JSON_HEDLEY_VERSION < X) */
+
+
+// This file contains all internal macro definitions
+// You MUST include macro_unscope.hpp at the end of json.hpp to undef all of them
+
+// exclude unsupported compilers
+#if !defined(JSON_SKIP_UNSUPPORTED_COMPILER_CHECK) // defining JSON_SKIP_UNSUPPORTED_COMPILER_CHECK bypasses both version checks
+    #if defined(__clang__)
+        #if (__clang_major__ * 10000 + __clang_minor__ * 100 + __clang_patchlevel__) < 30400 // i.e. Clang older than 3.4.0
+            #error "unsupported Clang version - see https://github.com/nlohmann/json#supported-compilers"
+        #endif
+    #elif defined(__GNUC__) && !(defined(__ICC) || defined(__INTEL_COMPILER)) // __GNUC__ without the Intel compiler macros
+        #if (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__) < 40800 // i.e. GCC older than 4.8.0
+            #error "unsupported GCC version - see https://github.com/nlohmann/json#supported-compilers"
+        #endif
+    #endif
+#endif
+
+// C++ language standard detection
+#if (defined(__cplusplus) && __cplusplus >= 202002L) || (defined(_MSVC_LANG) && _MSVC_LANG >= 202002L) // C++20 or later: the 17 and 14 macros are defined as well
+    #define JSON_HAS_CPP_20
+    #define JSON_HAS_CPP_17
+    #define JSON_HAS_CPP_14
+#elif (defined(__cplusplus) && __cplusplus >= 201703L) || (defined(_HAS_CXX17) && _HAS_CXX17 == 1) // fix for issue #464
+    #define JSON_HAS_CPP_17
+    #define JSON_HAS_CPP_14
+#elif (defined(__cplusplus) && __cplusplus >= 201402L) || (defined(_HAS_CXX14) && _HAS_CXX14 == 1) // C++14 only; below that none of the macros is defined
+    #define JSON_HAS_CPP_14
+#endif
+
+// disable float-equal warnings on GCC/clang
+#if defined(__clang__) || defined(__GNUC__) || defined(__GNUG__) // diagnostic state is pushed here; the matching pop is not in this part of the file
+    #pragma GCC diagnostic push
+    #pragma GCC diagnostic ignored "-Wfloat-equal"
+#endif
+
+// disable documentation warnings on clang
+#if defined(__clang__) // Clang also satisfies the condition above, so this is a second push on that compiler
+    #pragma GCC diagnostic push
+    #pragma GCC diagnostic ignored "-Wdocumentation"
+#endif
+
+// allow to disable exceptions
+#if (defined(__cpp_exceptions) || defined(__EXCEPTIONS) || defined(_CPPUNWIND)) && !defined(JSON_NOEXCEPTION) // compiler reports exception support and JSON_NOEXCEPTION is not defined
+    #define JSON_THROW(exception) throw exception
+    #define JSON_TRY try
+    #define JSON_CATCH(exception) catch(exception)
+    #define JSON_INTERNAL_CATCH(exception) catch(exception)
+#else // exceptions unavailable or switched off
+    #include <cstdlib> // std::abort
+    #define JSON_THROW(exception) std::abort() // the exception argument is discarded
+    #define JSON_TRY if(true) // the guarded block always runs
+    #define JSON_CATCH(exception) if(false) // the handler block never runs
+    #define JSON_INTERNAL_CATCH(exception) if(false)
+#endif
+
+// override exception macros
+#if defined(JSON_THROW_USER) // user-supplied replacement for JSON_THROW
+    #undef JSON_THROW
+    #define JSON_THROW JSON_THROW_USER
+#endif
+#if defined(JSON_TRY_USER) // user-supplied replacement for JSON_TRY
+    #undef JSON_TRY
+    #define JSON_TRY JSON_TRY_USER
+#endif
+#if defined(JSON_CATCH_USER) // replaces both JSON_CATCH and JSON_INTERNAL_CATCH
+    #undef JSON_CATCH
+    #define JSON_CATCH JSON_CATCH_USER
+    #undef JSON_INTERNAL_CATCH
+    #define JSON_INTERNAL_CATCH JSON_CATCH_USER
+#endif
+#if defined(JSON_INTERNAL_CATCH_USER) // checked last, so for JSON_INTERNAL_CATCH it wins over JSON_CATCH_USER
+    #undef JSON_INTERNAL_CATCH
+    #define JSON_INTERNAL_CATCH JSON_INTERNAL_CATCH_USER
+#endif
+
+// allow to override assert
+#if !defined(JSON_ASSERT) // a JSON_ASSERT defined before this point is left untouched
+    #include <cassert> // assert
+    #define JSON_ASSERT(x) assert(x)
+#endif
+
+/*!
+@brief macro to briefly define a mapping between an enum and JSON
+
+Expands to two function templates, `to_json` and `from_json`, for
+`ENUM_TYPE`. The variadic arguments are used verbatim as the initializer of a
+static array of `std::pair<ENUM_TYPE, BasicJsonType>`. `to_json` searches the
+array with `std::find_if` for a pair whose first member equals the enum value;
+`from_json` searches for a pair whose second member equals the JSON value.
+When no pair matches, the first pair of the array is used in both directions.
+A `static_assert` in each function rejects an `ENUM_TYPE` that is not an enum.
+
+@def NLOHMANN_JSON_SERIALIZE_ENUM
+@since version 3.4.0
+*/
+#define NLOHMANN_JSON_SERIALIZE_ENUM(ENUM_TYPE, ...)                                            \
+    template<typename BasicJsonType>                                                            \
+    inline void to_json(BasicJsonType& j, const ENUM_TYPE& e)                                   \
+    {                                                                                           \
+        static_assert(std::is_enum<ENUM_TYPE>::value, #ENUM_TYPE " must be an enum!");          \
+        static const std::pair<ENUM_TYPE, BasicJsonType> m[] = __VA_ARGS__;                     \
+        auto it = std::find_if(std::begin(m), std::end(m),                                      \
+                               [e](const std::pair<ENUM_TYPE, BasicJsonType>& ej_pair) -> bool  \
+        {                                                                                       \
+            return ej_pair.first == e;                                                          \
+        });                                                                                     \
+        j = ((it != std::end(m)) ? it : std::begin(m))->second;                                 \
+    }                                                                                           \
+    template<typename BasicJsonType>                                                            \
+    inline void from_json(const BasicJsonType& j, ENUM_TYPE& e)                                 \
+    {                                                                                           \
+        static_assert(std::is_enum<ENUM_TYPE>::value, #ENUM_TYPE " must be an enum!");          \
+        static const std::pair<ENUM_TYPE, BasicJsonType> m[] = __VA_ARGS__;                     \
+        auto it = std::find_if(std::begin(m), std::end(m),                                      \
+                               [&j](const std::pair<ENUM_TYPE, BasicJsonType>& ej_pair) -> bool \
+        {                                                                                       \
+            return ej_pair.second == j;                                                         \
+        });                                                                                     \
+        e = ((it != std::end(m)) ? it : std::begin(m))->first;                                  \
+    }
+
+// Ugly macros to avoid uglier copy-paste when specializing basic_json. They
+// may be removed in the future once the class is split.
+// NLOHMANN_BASIC_JSON_TPL_DECLARATION is the template header; NLOHMANN_BASIC_JSON_TPL is the matching basic_json<...> type.
+#define NLOHMANN_BASIC_JSON_TPL_DECLARATION                                \
+    template<template<typename, typename, typename...> class ObjectType,   \
+             template<typename, typename...> class ArrayType,              \
+             class StringType, class BooleanType, class NumberIntegerType, \
+             class NumberUnsignedType, class NumberFloatType,              \
+             template<typename> class AllocatorType,                       \
+             template<typename, typename = void> class JSONSerializer,     \
+             class BinaryType>
+
+#define NLOHMANN_BASIC_JSON_TPL                                            \
+    basic_json<ObjectType, ArrayType, StringType, BooleanType,             \
+    NumberIntegerType, NumberUnsignedType, NumberFloatType,                \
+    AllocatorType, JSONSerializer, BinaryType>
+
+// Macros to simplify conversion from/to types
+// NLOHMANN_JSON_GET_MACRO yields its 65th argument, so NLOHMANN_JSON_PASTE called with n arguments dispatches to NLOHMANN_JSON_PASTEn.
+#define NLOHMANN_JSON_EXPAND( x ) x
+#define NLOHMANN_JSON_GET_MACRO(_1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27, _28, _29, _30, _31, _32, _33, _34, _35, _36, _37, _38, _39, _40, _41, _42, _43, _44, _45, _46, _47, _48, _49, _50, _51, _52, _53, _54, _55, _56, _57, _58, _59, _60, _61, _62, _63, _64, NAME,...) NAME
+#define NLOHMANN_JSON_PASTE(...) NLOHMANN_JSON_EXPAND(NLOHMANN_JSON_GET_MACRO(__VA_ARGS__, \
+        NLOHMANN_JSON_PASTE64, \
+        NLOHMANN_JSON_PASTE63, \
+        NLOHMANN_JSON_PASTE62, \
+        NLOHMANN_JSON_PASTE61, \
+        NLOHMANN_JSON_PASTE60, \
+        NLOHMANN_JSON_PASTE59, \
+        NLOHMANN_JSON_PASTE58, \
+        NLOHMANN_JSON_PASTE57, \
+        NLOHMANN_JSON_PASTE56, \
+        NLOHMANN_JSON_PASTE55, \
+        NLOHMANN_JSON_PASTE54, \
+        NLOHMANN_JSON_PASTE53, \
+        NLOHMANN_JSON_PASTE52, \
+        NLOHMANN_JSON_PASTE51, \
+        NLOHMANN_JSON_PASTE50, \
+        NLOHMANN_JSON_PASTE49, \
+        NLOHMANN_JSON_PASTE48, \
+        NLOHMANN_JSON_PASTE47, \
+        NLOHMANN_JSON_PASTE46, \
+        NLOHMANN_JSON_PASTE45, \
+        NLOHMANN_JSON_PASTE44, \
+        NLOHMANN_JSON_PASTE43, \
+        NLOHMANN_JSON_PASTE42, \
+        NLOHMANN_JSON_PASTE41, \
+        NLOHMANN_JSON_PASTE40, \
+        NLOHMANN_JSON_PASTE39, \
+        NLOHMANN_JSON_PASTE38, \
+        NLOHMANN_JSON_PASTE37, \
+        NLOHMANN_JSON_PASTE36, \
+        NLOHMANN_JSON_PASTE35, \
+        NLOHMANN_JSON_PASTE34, \
+        NLOHMANN_JSON_PASTE33, \
+        NLOHMANN_JSON_PASTE32, \
+        NLOHMANN_JSON_PASTE31, \
+        NLOHMANN_JSON_PASTE30, \
+        NLOHMANN_JSON_PASTE29, \
+        NLOHMANN_JSON_PASTE28, \
+        NLOHMANN_JSON_PASTE27, \
+        NLOHMANN_JSON_PASTE26, \
+        NLOHMANN_JSON_PASTE25, \
+        NLOHMANN_JSON_PASTE24, \
+        NLOHMANN_JSON_PASTE23, \
+        NLOHMANN_JSON_PASTE22, \
+        NLOHMANN_JSON_PASTE21, \
+        NLOHMANN_JSON_PASTE20, \
+        NLOHMANN_JSON_PASTE19, \
+        NLOHMANN_JSON_PASTE18, \
+        NLOHMANN_JSON_PASTE17, \
+        NLOHMANN_JSON_PASTE16, \
+        NLOHMANN_JSON_PASTE15, \
+        NLOHMANN_JSON_PASTE14, \
+        NLOHMANN_JSON_PASTE13, \
+        NLOHMANN_JSON_PASTE12, \
+        NLOHMANN_JSON_PASTE11, \
+        NLOHMANN_JSON_PASTE10, \
+        NLOHMANN_JSON_PASTE9, \
+        NLOHMANN_JSON_PASTE8, \
+        NLOHMANN_JSON_PASTE7, \
+        NLOHMANN_JSON_PASTE6, \
+        NLOHMANN_JSON_PASTE5, \
+        NLOHMANN_JSON_PASTE4, \
+        NLOHMANN_JSON_PASTE3, \
+        NLOHMANN_JSON_PASTE2, \
+        NLOHMANN_JSON_PASTE1)(__VA_ARGS__))
+#define NLOHMANN_JSON_PASTE2(func, v1) func(v1) // base case: apply func to the single value
+#define NLOHMANN_JSON_PASTE3(func, v1, v2) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE2(func, v2) // PASTE3 and up: apply func to v1, then paste the remaining values
+#define NLOHMANN_JSON_PASTE4(func, v1, v2, v3) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE3(func, v2, v3)
+#define NLOHMANN_JSON_PASTE5(func, v1, v2, v3, v4) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE4(func, v2, v3, v4)
+#define NLOHMANN_JSON_PASTE6(func, v1, v2, v3, v4, v5) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE5(func, v2, v3, v4, v5)
+#define NLOHMANN_JSON_PASTE7(func, v1, v2, v3, v4, v5, v6) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE6(func, v2, v3, v4, v5, v6)
+#define NLOHMANN_JSON_PASTE8(func, v1, v2, v3, v4, v5, v6, v7) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE7(func, v2, v3, v4, v5, v6, v7)
+#define NLOHMANN_JSON_PASTE9(func, v1, v2, v3, v4, v5, v6, v7, v8) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE8(func, v2, v3, v4, v5, v6, v7, v8)
+#define NLOHMANN_JSON_PASTE10(func, v1, v2, v3, v4, v5, v6, v7, v8, v9) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE9(func, v2, v3, v4, v5, v6, v7, v8, v9)
+#define NLOHMANN_JSON_PASTE11(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE10(func, v2, v3, v4, v5, v6, v7, v8, v9, v10)
+#define NLOHMANN_JSON_PASTE12(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE11(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11)
+#define NLOHMANN_JSON_PASTE13(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE12(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12)
+#define NLOHMANN_JSON_PASTE14(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE13(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13)
+#define NLOHMANN_JSON_PASTE15(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE14(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14)
+#define NLOHMANN_JSON_PASTE16(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE15(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15)
+#define NLOHMANN_JSON_PASTE17(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE16(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16)
+#define NLOHMANN_JSON_PASTE18(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE17(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17)
+#define NLOHMANN_JSON_PASTE19(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE18(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18)
+#define NLOHMANN_JSON_PASTE20(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE19(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19)
+#define NLOHMANN_JSON_PASTE21(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE20(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20)
+#define NLOHMANN_JSON_PASTE22(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE21(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21)
+#define NLOHMANN_JSON_PASTE23(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE22(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22)
+#define NLOHMANN_JSON_PASTE24(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE23(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23)
+#define NLOHMANN_JSON_PASTE25(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE24(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24)
+#define NLOHMANN_JSON_PASTE26(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE25(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25)
+#define NLOHMANN_JSON_PASTE27(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE26(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26)
+#define NLOHMANN_JSON_PASTE28(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE27(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27)
+#define NLOHMANN_JSON_PASTE29(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE28(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28)
+#define NLOHMANN_JSON_PASTE30(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE29(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29)
+#define NLOHMANN_JSON_PASTE31(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE30(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30)
+#define NLOHMANN_JSON_PASTE32(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE31(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31)
+#define NLOHMANN_JSON_PASTE33(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE32(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32)
+#define NLOHMANN_JSON_PASTE34(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE33(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33)
+#define NLOHMANN_JSON_PASTE35(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE34(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34)
+#define NLOHMANN_JSON_PASTE36(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE35(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35)
+#define NLOHMANN_JSON_PASTE37(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE36(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36)
+#define NLOHMANN_JSON_PASTE38(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE37(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37)
+#define NLOHMANN_JSON_PASTE39(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE38(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38)
+#define NLOHMANN_JSON_PASTE40(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE39(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39)
+#define NLOHMANN_JSON_PASTE41(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE40(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40)
+#define NLOHMANN_JSON_PASTE42(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE41(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41)
+#define NLOHMANN_JSON_PASTE43(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE42(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42)
+#define NLOHMANN_JSON_PASTE44(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE43(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43)
+#define NLOHMANN_JSON_PASTE45(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE44(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44)
+#define NLOHMANN_JSON_PASTE46(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE45(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45)
+#define NLOHMANN_JSON_PASTE47(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE46(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46)
+#define NLOHMANN_JSON_PASTE48(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE47(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47)
+#define NLOHMANN_JSON_PASTE49(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE48(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48)
+#define NLOHMANN_JSON_PASTE50(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE49(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49)
+#define NLOHMANN_JSON_PASTE51(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE50(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50)
+#define NLOHMANN_JSON_PASTE52(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE51(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51)
+#define NLOHMANN_JSON_PASTE53(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE52(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52)
+#define NLOHMANN_JSON_PASTE54(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE53(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53)
+#define NLOHMANN_JSON_PASTE55(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE54(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54)
+#define NLOHMANN_JSON_PASTE56(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE55(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55)
+#define NLOHMANN_JSON_PASTE57(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE56(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56)
+#define NLOHMANN_JSON_PASTE58(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56, v57) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE57(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56, v57)
+#define NLOHMANN_JSON_PASTE59(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE58(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58)
+#define NLOHMANN_JSON_PASTE60(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58, v59) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE59(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58, v59)
+#define NLOHMANN_JSON_PASTE61(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58, v59, v60) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE60(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58, v59, v60)
+#define NLOHMANN_JSON_PASTE62(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58, v59, v60, v61) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE61(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58, v59, v60, v61)
+#define NLOHMANN_JSON_PASTE63(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58, v59, v60, v61, v62) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE62(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58, v59, v60, v61, v62)
+#define NLOHMANN_JSON_PASTE64(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58, v59, v60, v61, v62, v63) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE63(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58, v59, v60, v61, v62, v63)
+
+#define NLOHMANN_JSON_TO(v1) nlohmann_json_j[#v1] = nlohmann_json_t.v1; // store member v1 under the key "v1"
+#define NLOHMANN_JSON_FROM(v1) nlohmann_json_j.at(#v1).get_to(nlohmann_json_t.v1); // load the value at key "v1" into member v1
+
+/*!
+@brief macro to define friend to_json/from_json functions for @a Type that convert each listed member; meant to be placed inside the class definition
+@def NLOHMANN_DEFINE_TYPE_INTRUSIVE
+@since version 3.9.0
+*/
+#define NLOHMANN_DEFINE_TYPE_INTRUSIVE(Type, ...)  \
+    friend void to_json(nlohmann::json& nlohmann_json_j, const Type& nlohmann_json_t) { NLOHMANN_JSON_EXPAND(NLOHMANN_JSON_PASTE(NLOHMANN_JSON_TO, __VA_ARGS__)) } \
+    friend void from_json(const nlohmann::json& nlohmann_json_j, Type& nlohmann_json_t) { NLOHMANN_JSON_EXPAND(NLOHMANN_JSON_PASTE(NLOHMANN_JSON_FROM, __VA_ARGS__)) }
+
+/*!
+@brief macro to define free inline to_json/from_json functions for @a Type that convert each listed member
+@def NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE
+@since version 3.9.0
+*/
+#define NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE(Type, ...)  \
+    inline void to_json(nlohmann::json& nlohmann_json_j, const Type& nlohmann_json_t) { NLOHMANN_JSON_EXPAND(NLOHMANN_JSON_PASTE(NLOHMANN_JSON_TO, __VA_ARGS__)) } \
+    inline void from_json(const nlohmann::json& nlohmann_json_j, Type& nlohmann_json_t) { NLOHMANN_JSON_EXPAND(NLOHMANN_JSON_PASTE(NLOHMANN_JSON_FROM, __VA_ARGS__)) }
+
+#ifndef JSON_USE_IMPLICIT_CONVERSIONS
+    #define JSON_USE_IMPLICIT_CONVERSIONS 1 // default when the user did not set it
+#endif
+
+#if JSON_USE_IMPLICIT_CONVERSIONS
+    #define JSON_EXPLICIT // expands to nothing
+#else
+    #define JSON_EXPLICIT explicit // expands to the explicit keyword
+#endif
+
+
+namespace nlohmann
+{
+namespace detail
+{
+////////////////
+// exceptions //
+////////////////
+
+/*!
+@brief general exception of the @ref basic_json class
+
+This class is an extension of `std::exception` objects with a member @a id for
+exception ids. It is used as the base class for all exceptions thrown by the
+@ref basic_json class. This class can hence be used as "wildcard" to catch
+exceptions.
+
+Subclasses:
+- @ref parse_error for exceptions indicating a parse error
+- @ref invalid_iterator for exceptions indicating errors with iterators
+- @ref type_error for exceptions indicating executing a member function with
+                  a wrong type
+- @ref out_of_range for exceptions indicating access out of the defined range
+- @ref other_error for exceptions indicating other library errors
+
+@internal
+@note To have nothrow-copy-constructible exceptions, we internally use
+      `std::runtime_error` which can cope with arbitrary-length error messages.
+      Intermediate strings are built with static functions and then passed to
+      the actual constructor.
+@endinternal
+
+@liveexample{The following code shows how arbitrary library exceptions can be
+caught.,exception}
+
+@since version 3.0.0
+*/
+class exception : public std::exception
+{
+  public:
+    /// explanatory string, forwarded from the stored std::runtime_error
+    JSON_HEDLEY_RETURNS_NON_NULL
+    const char* what() const noexcept override
+    {
+        return m.what();
+    }
+
+    /// numeric id of the exception
+    const int id;
+
+  protected:
+    /// stores @a id_ and copies @a what_arg into the message storage
+    JSON_HEDLEY_NON_NULL(3)
+    exception(int id_, const char* what_arg)
+        : id(id_)
+        , m(what_arg)
+    {}
+
+    /// builds the message prefix "[json.exception.<ename>.<id_>] "
+    static std::string name(const std::string& ename, int id_)
+    {
+        std::string prefix("[json.exception.");
+        prefix.append(ename);
+        prefix.append(".");
+        prefix.append(std::to_string(id_));
+        prefix.append("] ");
+        return prefix;
+    }
+
+  private:
+    /// message storage; a std::runtime_error keeps the class
+    /// nothrow-copy-constructible
+    std::runtime_error m;
+};
+
+/*!
+@brief exception indicating a parse error
+
+This exception is thrown by the library when a parse error occurs. Parse errors
+can occur during the deserialization of JSON text, CBOR, MessagePack, as well
+as when using JSON Patch.
+
+Member @a byte holds the byte index of the last read character in the input
+file.
+
+Exceptions have ids 1xx.
+
+name / id                      | example message | description
+------------------------------ | --------------- | -------------------------
+json.exception.parse_error.101 | parse error at 2: unexpected end of input; expected string literal | This error indicates a syntax error while deserializing a JSON text. The error message describes that an unexpected token (character) was encountered, and the member @a byte indicates the error position.
+json.exception.parse_error.102 | parse error at 14: missing or wrong low surrogate | JSON uses the `\uxxxx` format to describe Unicode characters. Code points above 0xFFFF are split into two `\uxxxx` entries ("surrogate pairs"). This error indicates that the surrogate pair is incomplete or contains an invalid code point.
+json.exception.parse_error.103 | parse error: code points above 0x10FFFF are invalid | Unicode supports code points up to 0x10FFFF. Code points above 0x10FFFF are invalid.
+json.exception.parse_error.104 | parse error: JSON patch must be an array of objects | [RFC 6902](https://tools.ietf.org/html/rfc6902) requires a JSON Patch document to be a JSON document that represents an array of objects.
+json.exception.parse_error.105 | parse error: operation must have string member 'op' | An operation of a JSON Patch document must contain exactly one "op" member, whose value indicates the operation to perform. Its value must be one of "add", "remove", "replace", "move", "copy", or "test"; other values are errors.
+json.exception.parse_error.106 | parse error: array index '01' must not begin with '0' | An array index in a JSON Pointer ([RFC 6901](https://tools.ietf.org/html/rfc6901)) may be `0` or any number without a leading `0`.
+json.exception.parse_error.107 | parse error: JSON pointer must be empty or begin with '/' - was: 'foo' | A JSON Pointer must be a Unicode string containing a sequence of zero or more reference tokens, each prefixed by a `/` character.
+json.exception.parse_error.108 | parse error: escape character '~' must be followed with '0' or '1' | In a JSON Pointer, only `~0` and `~1` are valid escape sequences.
+json.exception.parse_error.109 | parse error: array index 'one' is not a number | A JSON Pointer array index must be a number.
+json.exception.parse_error.110 | parse error at 1: cannot read 2 bytes from vector | When parsing CBOR or MessagePack, the byte vector ends before the complete value has been read.
+json.exception.parse_error.112 | parse error at 1: error reading CBOR; last byte: 0xF8 | Not all types of CBOR or MessagePack are supported. This exception occurs if an unsupported byte was read.
+json.exception.parse_error.113 | parse error at 2: expected a CBOR string; last byte: 0x98 | While parsing a map key, a value that is not a string has been read.
+json.exception.parse_error.114 | parse error: Unsupported BSON record type 0x0F | The parsing of the corresponding BSON record type is not implemented (yet).
+json.exception.parse_error.115 | parse error at byte 5: syntax error while parsing UBJSON high-precision number: invalid number text: 1A | A UBJSON high-precision number could not be parsed.
+
+@note For an input with n bytes, 1 is the index of the first character and n+1
+      is the index of the terminating null byte or the end of file. This also
+      holds true when reading a byte vector (CBOR or MessagePack).
+
+@liveexample{The following code shows how a `parse_error` exception can be
+caught.,parse_error}
+
+@sa - @ref exception for the base class of the library exceptions
+@sa - @ref invalid_iterator for exceptions indicating errors with iterators
+@sa - @ref type_error for exceptions indicating executing a member function with
+                    a wrong type
+@sa - @ref out_of_range for exceptions indicating access out of the defined range
+@sa - @ref other_error for exceptions indicating other library errors
+
+@since version 3.0.0
+*/
+class parse_error : public exception
+{
+  public:
+    /*!
+    @brief create a parse error exception from a parser position
+    @param[in] id_       the id of the exception
+    @param[in] pos       the position where the error occurred; its
+                         chars_read_total is stored in @ref byte, and its line
+                         and column are written into the message
+    @param[in] what_arg  the explanatory string
+    @return parse_error object
+    */
+    static parse_error create(int id_, const position_t& pos, const std::string& what_arg)
+    {
+        std::string message = exception::name("parse_error", id_);
+        message += "parse error";
+        message += position_string(pos);
+        message += ": ";
+        message += what_arg;
+        return parse_error(id_, pos.chars_read_total, message.c_str());
+    }
+
+    /*!
+    @brief create a parse error exception from a byte index
+    @param[in] id_       the id of the exception
+    @param[in] byte_     the byte index of the error; when it is 0, no
+                         " at byte N" part is added to the message
+    @param[in] what_arg  the explanatory string
+    @return parse_error object
+    */
+    static parse_error create(int id_, std::size_t byte_, const std::string& what_arg)
+    {
+        std::string message = exception::name("parse_error", id_);
+        message += "parse error";
+        if (byte_ != 0)
+        {
+            message += " at byte ";
+            message += std::to_string(byte_);
+        }
+        message += ": ";
+        message += what_arg;
+        return parse_error(id_, byte_, message.c_str());
+    }
+
+    /*!
+    @brief byte index of the parse error
+
+    The byte index of the last read character in the input file.
+
+    @note For an input with n bytes, 1 is the index of the first character and
+          n+1 is the index of the terminating null byte or the end of file.
+          This also holds true when reading a byte vector (CBOR or MessagePack).
+    */
+    const std::size_t byte;
+
+  private:
+    /// forwards id and message to the base class and records the byte index
+    parse_error(int id_, std::size_t byte_, const char* what_arg)
+        : exception(id_, what_arg)
+        , byte(byte_)
+    {}
+
+    /// renders " at line L, column C"; the line is lines_read + 1
+    static std::string position_string(const position_t& pos)
+    {
+        std::string text(" at line ");
+        text += std::to_string(pos.lines_read + 1);
+        text += ", column ";
+        text += std::to_string(pos.chars_read_current_line);
+        return text;
+    }
+};
+
+/*!
+@brief exception indicating errors with iterators
+
+This exception is thrown if iterators passed to a library function do not match
+the expected semantics.
+
+Exceptions have ids 2xx.
+
+name / id                           | example message | description
+----------------------------------- | --------------- | -------------------------
+json.exception.invalid_iterator.201 | iterators are not compatible | The iterators passed to constructor @ref basic_json(InputIT first, InputIT last) are not compatible, meaning they do not belong to the same container. Therefore, the range (@a first, @a last) is invalid.
+json.exception.invalid_iterator.202 | iterator does not fit current value | In an erase or insert function, the passed iterator @a pos does not belong to the JSON value for which the function was called. It hence does not define a valid position for the deletion/insertion.
+json.exception.invalid_iterator.203 | iterators do not fit current value | Either iterator passed to function @ref erase(IteratorType first, IteratorType last) does not belong to the JSON value from which values shall be erased. It hence does not define a valid range to delete values from.
+json.exception.invalid_iterator.204 | iterators out of range | When an iterator range for a primitive type (number, boolean, or string) is passed to a constructor or an erase function, this range has to be exactly (@ref begin(), @ref end()), because this is the only way the single stored value is expressed. All other ranges are invalid.
+json.exception.invalid_iterator.205 | iterator out of range | When an iterator for a primitive type (number, boolean, or string) is passed to an erase function, the iterator has to be the @ref begin() iterator, because it is the only way to address the stored value. All other iterators are invalid.
+json.exception.invalid_iterator.206 | cannot construct with iterators from null | The iterators passed to constructor @ref basic_json(InputIT first, InputIT last) belong to a JSON null value and hence do not define a valid range.
+json.exception.invalid_iterator.207 | cannot use key() for non-object iterators | The key() member function can only be used on iterators belonging to a JSON object, because other types do not have a concept of a key.
+json.exception.invalid_iterator.208 | cannot use operator[] for object iterators | The operator[] to specify a concrete offset cannot be used on iterators belonging to a JSON object, because JSON objects are unordered.
+json.exception.invalid_iterator.209 | cannot use offsets with object iterators | The offset operators (+, -, +=, -=) cannot be used on iterators belonging to a JSON object, because JSON objects are unordered.
+json.exception.invalid_iterator.210 | iterators do not fit | The iterators passed to the insert function are not compatible, meaning they do not belong to the same container. Therefore, the range (@a first, @a last) is invalid.
+json.exception.invalid_iterator.211 | passed iterators may not belong to container | The iterator range passed to the insert function must not be a subrange of the container to insert to.
+json.exception.invalid_iterator.212 | cannot compare iterators of different containers | When two iterators are compared, they must belong to the same container.
+json.exception.invalid_iterator.213 | cannot compare order of object iterators | The order of object iterators cannot be compared, because JSON objects are unordered.
+json.exception.invalid_iterator.214 | cannot get value | Cannot get value for iterator: Either the iterator belongs to a null value or it is an iterator to a primitive type (number, boolean, or string), but the iterator is different to @ref begin().
+
+@liveexample{The following code shows how an `invalid_iterator` exception can be
+caught.,invalid_iterator}
+
+@sa - @ref exception for the base class of the library exceptions
+@sa - @ref parse_error for exceptions indicating a parse error
+@sa - @ref type_error for exceptions indicating executing a member function with
+                    a wrong type
+@sa - @ref out_of_range for exceptions indicating access out of the defined range
+@sa - @ref other_error for exceptions indicating other library errors
+
+@since version 3.0.0
+*/
+class invalid_iterator : public exception
+{
+  public:
+    /// create an invalid_iterator exception whose message is @a what_arg
+    /// prefixed with the tag built by exception::name()
+    static invalid_iterator create(int id_, const std::string& what_arg)
+    {
+        std::string message = exception::name("invalid_iterator", id_);
+        message += what_arg;
+        return invalid_iterator(id_, message.c_str());
+    }
+
+  private:
+    /// only reachable through create(); forwards to the base class
+    JSON_HEDLEY_NON_NULL(3)
+    invalid_iterator(int id_, const char* what_arg)
+        : exception(id_, what_arg)
+    {}
+};
+
+/*!
+@brief exception indicating executing a member function with a wrong type
+
+This exception is thrown in case of a type error; that is, a library function is
+executed on a JSON value whose type does not match the expected semantics.
+
+Exceptions have ids 3xx.
+
+name / id                     | example message | description
+----------------------------- | --------------- | -------------------------
+json.exception.type_error.301 | cannot create object from initializer list | To create an object from an initializer list, the initializer list must consist only of a list of pairs whose first element is a string. When this constraint is violated, an array is created instead.
+json.exception.type_error.302 | type must be object, but is array | During implicit or explicit value conversion, the JSON type must be compatible to the target type. For instance, a JSON string can only be converted into string types, but not into numbers or boolean types.
+json.exception.type_error.303 | incompatible ReferenceType for get_ref, actual type is object | To retrieve a reference to a value stored in a @ref basic_json object with @ref get_ref, the type of the reference must match the value type. For instance, for a JSON array, the @a ReferenceType must be @ref array_t &.
+json.exception.type_error.304 | cannot use at() with string | The @ref at() member functions can only be executed for certain JSON types.
+json.exception.type_error.305 | cannot use operator[] with string | The @ref operator[] member functions can only be executed for certain JSON types.
+json.exception.type_error.306 | cannot use value() with string | The @ref value() member functions can only be executed for certain JSON types.
+json.exception.type_error.307 | cannot use erase() with string | The @ref erase() member functions can only be executed for certain JSON types.
+json.exception.type_error.308 | cannot use push_back() with string | The @ref push_back() and @ref operator+= member functions can only be executed for certain JSON types.
+json.exception.type_error.309 | cannot use insert() with | The @ref insert() member functions can only be executed for certain JSON types.
+json.exception.type_error.310 | cannot use swap() with number | The @ref swap() member functions can only be executed for certain JSON types.
+json.exception.type_error.311 | cannot use emplace_back() with string | The @ref emplace_back() member function can only be executed for certain JSON types.
+json.exception.type_error.312 | cannot use update() with string | The @ref update() member functions can only be executed for certain JSON types.
+json.exception.type_error.313 | invalid value to unflatten | The @ref unflatten function converts an object whose keys are JSON Pointers back into an arbitrary nested JSON value. The JSON Pointers must not overlap, because then the resulting value would not be well defined.
+json.exception.type_error.314 | only objects can be unflattened | The @ref unflatten function only works for an object whose keys are JSON Pointers.
+json.exception.type_error.315 | values in object must be primitive | The @ref unflatten function only works for an object whose keys are JSON Pointers and whose values are primitive.
+json.exception.type_error.316 | invalid UTF-8 byte at index 10: 0x7E | The @ref dump function only works with UTF-8 encoded strings; that is, if you assign a `std::string` to a JSON value, make sure it is UTF-8 encoded. |
+json.exception.type_error.317 | JSON value cannot be serialized to requested format | The dynamic type of the object cannot be represented in the requested serialization format (e.g. a raw `true` or `null` JSON object cannot be serialized to BSON) |
+
+@liveexample{The following code shows how a `type_error` exception can be
+caught.,type_error}
+
+@sa - @ref exception for the base class of the library exceptions
+@sa - @ref parse_error for exceptions indicating a parse error
+@sa - @ref invalid_iterator for exceptions indicating errors with iterators
+@sa - @ref out_of_range for exceptions indicating access out of the defined range
+@sa - @ref other_error for exceptions indicating other library errors
+
+@since version 3.0.0
+*/
+class type_error : public exception
+{
+  public:
+    /// create a type_error exception whose message is @a what_arg prefixed
+    /// with the tag built by exception::name()
+    static type_error create(int id_, const std::string& what_arg)
+    {
+        std::string message = exception::name("type_error", id_);
+        message += what_arg;
+        return type_error(id_, message.c_str());
+    }
+
+  private:
+    /// only reachable through create(); forwards to the base class
+    JSON_HEDLEY_NON_NULL(3)
+    type_error(int id_, const char* what_arg)
+        : exception(id_, what_arg)
+    {}
+};
+
+/*!
+@brief exception indicating access out of the defined range
+
+This exception is thrown in case a library function is called on an input
+parameter that exceeds the expected range, for instance in case of array
+indices or nonexisting object keys.
+
+Exceptions have ids 4xx.
+
+name / id                       | example message | description
+------------------------------- | --------------- | -------------------------
+json.exception.out_of_range.401 | array index 3 is out of range | The provided array index @a i is larger than @a size-1.
+json.exception.out_of_range.402 | array index '-' (3) is out of range | The special array index `-` in a JSON Pointer never describes a valid element of the array, but the index past the end. That is, it can only be used to add elements at this position, but not to read it.
+json.exception.out_of_range.403 | key 'foo' not found | The provided key was not found in the JSON object.
+json.exception.out_of_range.404 | unresolved reference token 'foo' | A reference token in a JSON Pointer could not be resolved.
+json.exception.out_of_range.405 | JSON pointer has no parent | The JSON Patch operations 'remove' and 'add' can not be applied to the root element of the JSON value.
+json.exception.out_of_range.406 | number overflow parsing '10E1000' | A parsed number could not be stored without changing it to NaN or INF.
+json.exception.out_of_range.407 | number overflow serializing '9223372036854775808' | UBJSON and BSON only support integer numbers up to 9223372036854775807. (until version 3.8.0) |
+json.exception.out_of_range.408 | excessive array size: 8658170730974374167 | The size (following `#`) of an UBJSON array or object exceeds the maximal capacity. |
+json.exception.out_of_range.409 | BSON key cannot contain code point U+0000 (at byte 2) | Key identifiers to be serialized to BSON cannot contain code point U+0000, since the key is stored as zero-terminated c-string |
+
+@liveexample{The following code shows how an `out_of_range` exception can be
+caught.,out_of_range}
+
+@sa - @ref exception for the base class of the library exceptions
+@sa - @ref parse_error for exceptions indicating a parse error
+@sa - @ref invalid_iterator for exceptions indicating errors with iterators
+@sa - @ref type_error for exceptions indicating executing a member function with
+                    a wrong type
+@sa - @ref other_error for exceptions indicating other library errors
+
+@since version 3.0.0
+*/
+class out_of_range : public exception
+{
+  public:
+    /// create an out_of_range exception whose message is @a what_arg prefixed
+    /// with the tag built by exception::name()
+    static out_of_range create(int id_, const std::string& what_arg)
+    {
+        std::string message = exception::name("out_of_range", id_);
+        message += what_arg;
+        return out_of_range(id_, message.c_str());
+    }
+
+  private:
+    /// only reachable through create(); forwards to the base class
+    JSON_HEDLEY_NON_NULL(3)
+    out_of_range(int id_, const char* what_arg)
+        : exception(id_, what_arg)
+    {}
+};
+
+/*!
+@brief exception indicating other library errors
+
+This exception is thrown in case of errors that cannot be classified with the
+other exception types.
+
+Exceptions have ids 5xx.
+
+name / id                      | example message | description
+------------------------------ | --------------- | -------------------------
+json.exception.other_error.501 | unsuccessful: {"op":"test","path":"/baz", "value":"bar"} | A JSON Patch operation 'test' failed. The unsuccessful operation is also printed.
+
+@sa - @ref exception for the base class of the library exceptions
+@sa - @ref parse_error for exceptions indicating a parse error
+@sa - @ref invalid_iterator for exceptions indicating errors with iterators
+@sa - @ref type_error for exceptions indicating executing a member function with
+                    a wrong type
+@sa - @ref out_of_range for exceptions indicating access out of the defined range
+
+@liveexample{The following code shows how an `other_error` exception can be
+caught.,other_error}
+
+@since version 3.0.0
+*/
+class other_error : public exception
+{
+  public:
+    /// create an other_error exception whose message is @a what_arg prefixed
+    /// with the tag built by exception::name()
+    static other_error create(int id_, const std::string& what_arg)
+    {
+        std::string message = exception::name("other_error", id_);
+        message += what_arg;
+        return other_error(id_, message.c_str());
+    }
+
+  private:
+    /// only reachable through create(); forwards to the base class
+    JSON_HEDLEY_NON_NULL(3)
+    other_error(int id_, const char* what_arg)
+        : exception(id_, what_arg)
+    {}
+};
+}  // namespace detail
+}  // namespace nlohmann
+
+// #include <nlohmann/detail/macro_scope.hpp>
+
+// #include <nlohmann/detail/meta/cpp_future.hpp>
+
+
+#include <cstddef> // size_t
+#include <type_traits> // conditional, enable_if, false_type, integral_constant, is_constructible, is_integral, is_same, remove_cv, remove_reference, true_type
+
+namespace nlohmann
+{
+namespace detail
+{
+// alias templates to reduce boilerplate
+
+// enable_if_t<B, T>: shorthand for std::enable_if<B, T>::type; T defaults to void
+template<bool B, typename T = void>
+using enable_if_t = typename std::enable_if<B, T>::type;
+
+// uncvref_t<T>: T with a reference removed first and top-level cv-qualifiers
+// removed afterwards
+template<typename T>
+using uncvref_t = typename std::remove_cv<typename std::remove_reference<T>::type>::type;
+
+// implementation of C++14 index_sequence and affiliates
+// source: https://stackoverflow.com/a/32223343
+//
+// index_sequence<Ints...> carries a pack of std::size_t values in its type.
+// `type` names the sequence itself, and size() returns the number of values
+// in the pack.
+template<std::size_t... Ints>
+struct index_sequence
+{
+    using type = index_sequence;
+    using value_type = std::size_t;
+    static constexpr std::size_t size() noexcept
+    {
+        return sizeof...(Ints);
+    }
+};
+
+// merge_and_renumber concatenates two index_sequences: the result holds I1...
+// followed by every element of I2 shifted by the length of I1. It is only
+// defined for a pair of index_sequence arguments.
+template<class Sequence1, class Sequence2>
+struct merge_and_renumber;
+
+template<std::size_t... I1, std::size_t... I2>
+struct merge_and_renumber<index_sequence<I1...>, index_sequence<I2...>>
+        : index_sequence < I1..., (sizeof...(I1) + I2)... > {};
+
+// make_index_sequence<N> is built by splitting N into N / 2 and N - N / 2,
+// generating both halves recursively, and joining them with
+// merge_and_renumber.
+template<std::size_t N>
+struct make_index_sequence
+    : merge_and_renumber < typename make_index_sequence < N / 2 >::type,
+      typename make_index_sequence < N - N / 2 >::type > {};
+
+// recursion anchors: the empty sequence and the single-element sequence <0>
+template<> struct make_index_sequence<0> : index_sequence<> {};
+template<> struct make_index_sequence<1> : index_sequence<0> {};
+
+// index sequence with one index per type in Ts
+template<typename... Ts>
+using index_sequence_for = make_index_sequence<sizeof...(Ts)>;
+
+// dispatch utility (taken from ranges-v3)
+// priority_tag<N> derives from priority_tag<N - 1> down to priority_tag<0>, so
+// a tag with a higher N converts to every tag with a lower N.
+// NOTE(review): presumably used to rank otherwise ambiguous overloads; the
+// call sites are outside this section.
+template<unsigned N> struct priority_tag : priority_tag < N - 1 > {};
+template<> struct priority_tag<0> {};
+
+// taken from ranges-v3
+// static_const<T> holds one value-initialized constexpr object of type T as a
+// static data member.
+template<typename T>
+struct static_const
+{
+    static constexpr T value{};
+};
+
+// out-of-class definition of the static data member declared above
+template<typename T>
+constexpr T static_const<T>::value;
+}  // namespace detail
+}  // namespace nlohmann
+
+// #include <nlohmann/detail/meta/type_traits.hpp>
+
+
+#include <limits> // numeric_limits
+#include <type_traits> // false_type, is_constructible, is_integral, is_same, true_type
+#include <utility> // declval
+
+// #include <nlohmann/detail/iterators/iterator_traits.hpp>
+
+
+#include <iterator> // random_access_iterator_tag
+
+// #include <nlohmann/detail/meta/void_t.hpp>
+
+
+namespace nlohmann
+{
+namespace detail
+{
+// make_void maps any list of types to void. Going through a struct member
+// instead of aliasing void directly makes the alias depend on Ts.
+template<typename ...Ts> struct make_void
+{
+    using type = void;
+};
+// void_t<Ts...> is void whenever every type in Ts is well-formed
+template<typename ...Ts> using void_t = typename make_void<Ts...>::type;
+} // namespace detail
+}  // namespace nlohmann
+
+// #include <nlohmann/detail/meta/cpp_future.hpp>
+
+
+namespace nlohmann
+{
+namespace detail
+{
+// iterator_types<It> is empty by default.
+template<typename It, typename = void>
+struct iterator_types {};
+
+// Chosen only when It declares all five nested iterator typedefs; it then
+// re-exports them unchanged.
+template<typename It>
+struct iterator_types <
+    It,
+    void_t<typename It::difference_type, typename It::value_type, typename It::pointer,
+    typename It::reference, typename It::iterator_category >>
+{
+    using difference_type = typename It::difference_type;
+    using value_type = typename It::value_type;
+    using pointer = typename It::pointer;
+    using reference = typename It::reference;
+    using iterator_category = typename It::iterator_category;
+};
+
+// This is required as some compilers implement std::iterator_traits in a way that
+// doesn't work with SFINAE. See https://github.com/nlohmann/json/issues/1341.
+//
+// Primary template: empty, so asking for a member type of an unsupported T is
+// a substitution failure.
+template<typename T, typename = void>
+struct iterator_traits
+{
+};
+
+// Non-pointer types take their member types from iterator_types<T>, which is
+// itself empty unless T declares all five iterator typedefs.
+template<typename T>
+struct iterator_traits < T, enable_if_t < !std::is_pointer<T>::value >>
+            : iterator_types<T>
+{
+};
+
+// Specialization for raw pointers to object types. The member types are
+// spelled out here because T* has no nested typedefs that iterator_types
+// could pick up.
+template<typename T>
+struct iterator_traits<T*, enable_if_t<std::is_object<T>::value>>
+{
+    using iterator_category = std::random_access_iterator_tag;
+    using value_type = T;
+    // std::-qualified: <cstddef> only guarantees std::ptrdiff_t, not ::ptrdiff_t
+    using difference_type = std::ptrdiff_t;
+    using pointer = T*;
+    using reference = T&;
+};
+} // namespace detail
+} // namespace nlohmann
+
+// #include <nlohmann/detail/macro_scope.hpp>
+
+// #include <nlohmann/detail/meta/cpp_future.hpp>
+
+// #include <nlohmann/detail/meta/detected.hpp>
+
+
+#include <type_traits>
+
+// #include <nlohmann/detail/meta/void_t.hpp>
+
+
+// https://en.cppreference.com/w/cpp/experimental/is_detected
+namespace nlohmann
+{
+namespace detail
+{
+// Placeholder type with every constructor, the destructor, and both assignment
+// operators deleted, so no object of it can be created, destroyed, or
+// assigned. is_detected and detected_t pass it as the Default argument of
+// detector.
+struct nonesuch
+{
+    nonesuch() = delete;
+    ~nonesuch() = delete;
+    nonesuch(nonesuch const&) = delete;
+    nonesuch(nonesuch const&&) = delete;
+    void operator=(nonesuch const&) = delete;
+    void operator=(nonesuch&&) = delete;
+};
+
+// detector: primary template, used when Op<Args...> is ill-formed.
+// value_t is std::false_type and type falls back to Default.
+template<class Default,
+         class AlwaysVoid,
+         template<class...> class Op,
+         class... Args>
+struct detector
+{
+    using value_t = std::false_type;
+    using type = Default;
+};
+
+// Selected when Op<Args...> is well-formed (void_t<Op<Args...>> is then void,
+// matching the AlwaysVoid argument supplied by the aliases that use detector).
+// value_t is std::true_type and type is Op<Args...>.
+template<class Default, template<class...> class Op, class... Args>
+struct detector<Default, void_t<Op<Args...>>, Op, Args...>
+{
+    using value_t = std::true_type;
+    using type = Op<Args...>;
+};
+
+// std::true_type if Op<Args...> is well-formed, std::false_type otherwise
+template<template<class...> class Op, class... Args>
+using is_detected = typename detector<nonesuch, void, Op, Args...>::value_t;
+
+// Op<Args...> if it is well-formed, nonesuch otherwise
+template<template<class...> class Op, class... Args>
+using detected_t = typename detector<nonesuch, void, Op, Args...>::type;
+
+// the detector itself, with a caller-chosen fallback type
+template<class Default, template<class...> class Op, class... Args>
+using detected_or = detector<Default, void, Op, Args...>;
+
+// Op<Args...> if it is well-formed, Default otherwise
+template<class Default, template<class...> class Op, class... Args>
+using detected_or_t = typename detected_or<Default, Op, Args...>::type;
+
+// checks that detected_t<Op, Args...> is exactly Expected
+template<class Expected, template<class...> class Op, class... Args>
+using is_detected_exact = std::is_same<Expected, detected_t<Op, Args...>>;
+
+// checks that detected_t<Op, Args...> is convertible to To
+template<class To, template<class...> class Op, class... Args>
+using is_detected_convertible =
+    std::is_convertible<detected_t<Op, Args...>, To>;
+}  // namespace detail
+}  // namespace nlohmann
+
+// #include <nlohmann/json_fwd.hpp>
+#ifndef INCLUDE_NLOHMANN_JSON_FWD_HPP_
+#define INCLUDE_NLOHMANN_JSON_FWD_HPP_
+
+#include <cstdint> // int64_t, uint64_t
+#include <map> // map
+#include <memory> // allocator
+#include <string> // string
+#include <vector> // vector
+
+/*!
+@brief namespace for Niels Lohmann
+@see https://github.com/nlohmann
+@since version 1.0.0
+*/
+namespace nlohmann
+{
+/*!
+@brief default JSONSerializer template argument
+
+This serializer ignores the template arguments and uses ADL
+([argument-dependent lookup](https://en.cppreference.com/w/cpp/language/adl))
+for serialization.
+*/
+template<typename T = void, typename SFINAE = void>
+struct adl_serializer;
+
+/*!
+@brief forward declaration of the @ref basic_json class template
+
+Every template parameter has a default: `std::map` for objects, `std::vector`
+for arrays, `std::string` for strings, `bool` for booleans, `std::int64_t` and
+`std::uint64_t` for integers, `double` for floating-point numbers,
+`std::allocator` as allocator, @ref adl_serializer as serializer, and
+`std::vector<std::uint8_t>` for binary values.
+*/
+template<template<typename U, typename V, typename... Args> class ObjectType =
+         std::map,
+         template<typename U, typename... Args> class ArrayType = std::vector,
+         class StringType = std::string, class BooleanType = bool,
+         class NumberIntegerType = std::int64_t,
+         class NumberUnsignedType = std::uint64_t,
+         class NumberFloatType = double,
+         template<typename U> class AllocatorType = std::allocator,
+         template<typename T, typename SFINAE = void> class JSONSerializer =
+         adl_serializer,
+         class BinaryType = std::vector<std::uint8_t>>
+class basic_json;
+
+/*!
+@brief JSON Pointer
+
+A JSON pointer defines a string syntax for identifying a specific value
+within a JSON document. It can be used with functions `at` and
+`operator[]`. Furthermore, JSON pointers are the base for JSON patches.
+
+@sa [RFC 6901](https://tools.ietf.org/html/rfc6901)
+
+@since version 2.0.0
+*/
+template<typename BasicJsonType>
+class json_pointer;
+
+/*!
+@brief default JSON class
+
+This type is the default specialization of the @ref basic_json class which
+uses the standard template types.
+
+@since version 1.0.0
+*/
+using json = basic_json<>;
+
+// forward declaration of the container that ordered_json passes as the
+// ObjectType argument of basic_json
+template<class Key, class T, class IgnoredLess, class Allocator>
+struct ordered_map;
+
+/*!
+@brief ordered JSON class
+
+This type preserves the insertion order of object keys.
+
+@since version 3.9.0
+*/
+using ordered_json = basic_json<nlohmann::ordered_map>;
+
+}  // namespace nlohmann
+
+#endif  // INCLUDE_NLOHMANN_JSON_FWD_HPP_
+
+
+namespace nlohmann
+{
+/*!
+@brief detail namespace with internal helper functions
+
+This namespace collects functions that should not be exposed,
+implementations of some @ref basic_json methods, and meta-programming helpers.
+
+@since version 2.1.0
+*/
+namespace detail
+{
+/////////////
+// helpers //
+/////////////
+
+// Note to maintainers:
+//
+// Every trait in this file expects a non CV-qualified type.
+// The only exceptions are in the 'aliases for detected' section
+// (i.e. those of the form: decltype(T::member_function(std::declval<T>())))
+//
+// In this case, T has to be properly CV-qualified to constrain the function arguments
+// (e.g. to_json(BasicJsonType&, const T&))
+
+// is_basic_json<T>: std::false_type by default
+template<typename> struct is_basic_json : std::false_type {};
+
+// std::true_type for the type spelled by NLOHMANN_BASIC_JSON_TPL.
+// NOTE(review): the two macros are defined outside this section; presumably
+// they expand to the basic_json template header and the matching
+// specialization name.
+NLOHMANN_BASIC_JSON_TPL_DECLARATION
+struct is_basic_json<NLOHMANN_BASIC_JSON_TPL> : std::true_type {};
+
+//////////////////////
+// json_ref helpers //
+//////////////////////
+
+// forward declaration only; json_ref is defined elsewhere
+template<typename>
+class json_ref;
+
+// is_json_ref<T>: std::false_type by default
+template<typename>
+struct is_json_ref : std::false_type {};
+
+// std::true_type when the argument is a json_ref specialization
+template<typename T>
+struct is_json_ref<json_ref<T>> : std::true_type {};
+
+//////////////////////////
+// aliases for detected //
+//////////////////////////
+
+// Each alias names one nested type or one call expression. It is well-formed
+// only if T provides that member, which makes the aliases usable as the Op
+// argument of is_detected and its relatives.
+
+// T::mapped_type
+template<typename T>
+using mapped_type_t = typename T::mapped_type;
+
+// T::key_type
+template<typename T>
+using key_type_t = typename T::key_type;
+
+// T::value_type
+template<typename T>
+using value_type_t = typename T::value_type;
+
+// T::difference_type
+template<typename T>
+using difference_type_t = typename T::difference_type;
+
+// T::pointer
+template<typename T>
+using pointer_t = typename T::pointer;
+
+// T::reference
+template<typename T>
+using reference_t = typename T::reference;
+
+// T::iterator_category
+template<typename T>
+using iterator_category_t = typename T::iterator_category;
+
+// T::iterator
+template<typename T>
+using iterator_t = typename T::iterator;
+
+// result type of the static call T::to_json(args...) with arguments of types Args
+template<typename T, typename... Args>
+using to_json_function = decltype(T::to_json(std::declval<Args>()...));
+
+// result type of the static call T::from_json(args...) with arguments of types Args
+template<typename T, typename... Args>
+using from_json_function = decltype(T::from_json(std::declval<Args>()...));
+
+// result type of calling the member template get<U>() on a value of type T
+template<typename T, typename U>
+using get_template_function = decltype(std::declval<T>().template get<U>());
+
+// trait checking if JSONSerializer<T>::from_json(json const&, udt&) exists
+// primary template: std::false_type; it is the result whenever T is a
+// basic_json type, because the partial specialization is disabled for those
+template<typename BasicJsonType, typename T, typename = void>
+struct has_from_json : std::false_type {};
+
+// trait checking if j.get<T> is valid
+// use this trait instead of std::is_constructible or std::is_convertible,
+// both rely on, or make use of implicit conversions, and thus fail when T
+// has several constructors/operator= (see https://github.com/nlohmann/json/issues/958)
+// value is true if `get<T>()` can be called on a `const BasicJsonType&`
+template <typename BasicJsonType, typename T>
+struct is_getable
+{
+    static constexpr bool value = is_detected<get_template_function, const BasicJsonType&, T>::value;
+};
+
+template<typename BasicJsonType, typename T>
+struct has_from_json < BasicJsonType, T,
+           enable_if_t < !is_basic_json<T>::value >> // selected only when T is not itself a basic_json
+{
+    using serializer = typename BasicJsonType::template json_serializer<T, void>; // the serializer BasicJsonType uses for T
+
+    static constexpr bool value = // serializer::from_json(const BasicJsonType&, T&) must be detected with result type void
+        is_detected_exact<void, from_json_function, serializer,
+        const BasicJsonType&, T&>::value;
+};
+
+// This trait checks if JSONSerializer<T>::from_json(json const&) exists and is detected with result type T;
+// this overload is used for non-default-constructible user-defined types
+template<typename BasicJsonType, typename T, typename = void>
+struct has_non_default_from_json : std::false_type {}; // fallback, also used when T is a basic_json
+
+template<typename BasicJsonType, typename T>
+struct has_non_default_from_json < BasicJsonType, T, enable_if_t < !is_basic_json<T>::value >>
+{
+    using serializer = typename BasicJsonType::template json_serializer<T, void>; // the serializer BasicJsonType uses for T
+
+    static constexpr bool value = // one-argument form: from_json(const BasicJsonType&) yielding T
+        is_detected_exact<T, from_json_function, serializer,
+        const BasicJsonType&>::value;
+};
+
+// This trait checks if BasicJsonType::json_serializer<T>::to_json(BasicJsonType&, T) exists with result type void.
+// Do not evaluate the trait when T is a basic_json type, to avoid template instantiation infinite recursion.
+template<typename BasicJsonType, typename T, typename = void>
+struct has_to_json : std::false_type {}; // fallback, also used when T is a basic_json
+
+template<typename BasicJsonType, typename T>
+struct has_to_json < BasicJsonType, T, enable_if_t < !is_basic_json<T>::value >>
+{
+    using serializer = typename BasicJsonType::template json_serializer<T, void>; // the serializer BasicJsonType uses for T
+
+    static constexpr bool value = // to_json is probed with a mutable BasicJsonType& and a T
+        is_detected_exact<void, to_json_function, serializer, BasicJsonType&,
+        T>::value;
+};
+
+
+///////////////////
+// is_ functions //
+///////////////////
+
+template<typename T, typename = void>
+struct is_iterator_traits : std::false_type {}; // fallback: T is not an iterator_traits<...> instantiation
+
+template<typename T>
+struct is_iterator_traits<iterator_traits<T>>
+{
+  private:
+    using traits = iterator_traits<T>; // shorthand for the traits type being inspected
+
+  public:
+    static constexpr auto value = // true only when all five nested iterator typedefs are detected on traits
+        is_detected<value_type_t, traits>::value &&
+        is_detected<difference_type_t, traits>::value &&
+        is_detected<pointer_t, traits>::value &&
+        is_detected<iterator_category_t, traits>::value &&
+        is_detected<reference_t, traits>::value;
+};
+
+// is_complete_type: true when sizeof(T) is a valid expression (source: https://stackoverflow.com/a/37193089/4116453)
+
+template<typename T, typename = void>
+struct is_complete_type : std::false_type {}; // fallback when sizeof(T) cannot be formed
+
+template<typename T>
+struct is_complete_type<T, decltype(void(sizeof(T)))> : std::true_type {}; // decltype(void(...)) is void, matching the defaulted second parameter
+
+template<typename BasicJsonType, typename CompatibleObjectType,
+         typename = void>
+struct is_compatible_object_type_impl : std::false_type {}; // fallback: used when mapped_type or key_type is missing
+
+template<typename BasicJsonType, typename CompatibleObjectType>
+struct is_compatible_object_type_impl <
+    BasicJsonType, CompatibleObjectType,
+    enable_if_t < is_detected<mapped_type_t, CompatibleObjectType>::value&&
+    is_detected<key_type_t, CompatibleObjectType>::value >> // selected only when both nested types exist
+{
+
+    using object_t = typename BasicJsonType::object_t;
+
+    // macOS's is_constructible does not play well with nonesuch, so the nested types are named directly here
+    static constexpr bool value = // JSON key/mapped types must be constructible from the candidate's key/mapped types
+        std::is_constructible<typename object_t::key_type,
+        typename CompatibleObjectType::key_type>::value &&
+        std::is_constructible<typename object_t::mapped_type,
+        typename CompatibleObjectType::mapped_type>::value;
+};
+
+template<typename BasicJsonType, typename CompatibleObjectType>
+struct is_compatible_object_type
+    : is_compatible_object_type_impl<BasicJsonType, CompatibleObjectType> {}; // public name; forwards to the _impl trait
+
+template<typename BasicJsonType, typename ConstructibleObjectType,
+         typename = void>
+struct is_constructible_object_type_impl : std::false_type {}; // fallback: used when mapped_type or key_type is missing
+
+template<typename BasicJsonType, typename ConstructibleObjectType>
+struct is_constructible_object_type_impl <
+    BasicJsonType, ConstructibleObjectType,
+    enable_if_t < is_detected<mapped_type_t, ConstructibleObjectType>::value&&
+    is_detected<key_type_t, ConstructibleObjectType>::value >> // selected only when both nested types exist
+{
+    using object_t = typename BasicJsonType::object_t;
+
+    static constexpr bool value = // two alternatives joined by ||, see the notes on each group below
+        (std::is_default_constructible<ConstructibleObjectType>::value && // alternative 1: default-constructible ...
+         (std::is_move_assignable<ConstructibleObjectType>::value || // ... assignable by move or by copy ...
+          std::is_copy_assignable<ConstructibleObjectType>::value) &&
+         (std::is_constructible<typename ConstructibleObjectType::key_type, // ... key constructible from the JSON key type ...
+          typename object_t::key_type>::value &&
+          std::is_same < // ... and exactly the same mapped_type as the JSON object type
+          typename object_t::mapped_type,
+          typename ConstructibleObjectType::mapped_type >::value)) ||
+        (has_from_json<BasicJsonType, // alternative 2: the mapped_type has a from_json in either form
+         typename ConstructibleObjectType::mapped_type>::value ||
+         has_non_default_from_json <
+         BasicJsonType,
+         typename ConstructibleObjectType::mapped_type >::value);
+};
+
+template<typename BasicJsonType, typename ConstructibleObjectType>
+struct is_constructible_object_type
+    : is_constructible_object_type_impl<BasicJsonType,
+      ConstructibleObjectType> {}; // public name; forwards to the _impl trait
+
+template<typename BasicJsonType, typename CompatibleStringType,
+         typename = void>
+struct is_compatible_string_type_impl : std::false_type {}; // fallback: value_type missing or different from the JSON string's
+
+template<typename BasicJsonType, typename CompatibleStringType>
+struct is_compatible_string_type_impl <
+    BasicJsonType, CompatibleStringType,
+    enable_if_t<is_detected_exact<typename BasicJsonType::string_t::value_type,
+    value_type_t, CompatibleStringType>::value >> // selected when the candidate's value_type equals string_t::value_type
+{
+    static constexpr auto value = // the JSON string type must be constructible from the candidate
+        std::is_constructible<typename BasicJsonType::string_t, CompatibleStringType>::value;
+};
+
+template<typename BasicJsonType, typename ConstructibleStringType> // NOTE(review): parameter is named Constructible... but is used as the compatible candidate
+struct is_compatible_string_type
+    : is_compatible_string_type_impl<BasicJsonType, ConstructibleStringType> {}; // public name; forwards to the _impl trait
+
+template<typename BasicJsonType, typename ConstructibleStringType,
+         typename = void>
+struct is_constructible_string_type_impl : std::false_type {}; // fallback: value_type missing or different from the JSON string's
+
+template<typename BasicJsonType, typename ConstructibleStringType>
+struct is_constructible_string_type_impl <
+    BasicJsonType, ConstructibleStringType,
+    enable_if_t<is_detected_exact<typename BasicJsonType::string_t::value_type,
+    value_type_t, ConstructibleStringType>::value >> // selected when the candidate's value_type equals string_t::value_type
+{
+    static constexpr auto value = // the candidate must be constructible from the JSON string type (reverse of the compatible check)
+        std::is_constructible<ConstructibleStringType,
+        typename BasicJsonType::string_t>::value;
+};
+
+template<typename BasicJsonType, typename ConstructibleStringType>
+struct is_constructible_string_type
+    : is_constructible_string_type_impl<BasicJsonType, ConstructibleStringType> {}; // public name; forwards to the _impl trait
+
+template<typename BasicJsonType, typename CompatibleArrayType, typename = void>
+struct is_compatible_array_type_impl : std::false_type {}; // fallback: used when the enable_if condition below does not hold
+
+template<typename BasicJsonType, typename CompatibleArrayType>
+struct is_compatible_array_type_impl <
+    BasicJsonType, CompatibleArrayType,
+    enable_if_t < is_detected<value_type_t, CompatibleArrayType>::value&&
+    is_detected<iterator_t, CompatibleArrayType>::value&&
+// This is needed because json_reverse_iterator has a ::iterator type...
+// Therefore it is detected as a CompatibleArrayType.
+// The real fix would be to have an Iterable concept.
+    !is_iterator_traits <
+    iterator_traits<CompatibleArrayType >>::value >> // i.e. has value_type and iterator, but is not itself an iterator
+{
+    static constexpr bool value = // a JSON value must be constructible from the candidate's element type
+        std::is_constructible<BasicJsonType,
+        typename CompatibleArrayType::value_type>::value;
+};
+
+template<typename BasicJsonType, typename CompatibleArrayType>
+struct is_compatible_array_type
+    : is_compatible_array_type_impl<BasicJsonType, CompatibleArrayType> {}; // public name; forwards to the _impl trait
+
+template<typename BasicJsonType, typename ConstructibleArrayType, typename = void>
+struct is_constructible_array_type_impl : std::false_type {}; // fallback: used when neither specialization below is selected
+
+template<typename BasicJsonType, typename ConstructibleArrayType>
+struct is_constructible_array_type_impl <
+    BasicJsonType, ConstructibleArrayType,
+    enable_if_t<std::is_same<ConstructibleArrayType,
+    typename BasicJsonType::value_type>::value >> // case 1: the candidate is exactly BasicJsonType::value_type
+            : std::true_type {};
+
+template<typename BasicJsonType, typename ConstructibleArrayType>
+struct is_constructible_array_type_impl <
+    BasicJsonType, ConstructibleArrayType,
+    enable_if_t < !std::is_same<ConstructibleArrayType, // case 2: any other type (mutually exclusive with case 1) ...
+    typename BasicJsonType::value_type>::value&&
+    std::is_default_constructible<ConstructibleArrayType>::value&& // ... that is default-constructible ...
+(std::is_move_assignable<ConstructibleArrayType>::value || // ... assignable by move or by copy ...
+ std::is_copy_assignable<ConstructibleArrayType>::value)&&
+is_detected<value_type_t, ConstructibleArrayType>::value&& // ... exposes value_type and iterator ...
+is_detected<iterator_t, ConstructibleArrayType>::value&&
+is_complete_type < // ... and whose value_type is a complete type
+detected_t<value_type_t, ConstructibleArrayType >>::value >>
+{
+    static constexpr bool value =
+        // This is needed because json_reverse_iterator has a ::iterator type,
+        // furthermore, std::back_insert_iterator (and other iterators) have a
+        // base class `iterator`... Therefore it is detected as a
+        // ConstructibleArrayType. The real fix would be to have an Iterable
+        // concept.
+        !is_iterator_traits<iterator_traits<ConstructibleArrayType>>::value &&
+
+        (std::is_same<typename ConstructibleArrayType::value_type, // element type equals the JSON array's element type, or ...
+         typename BasicJsonType::array_t::value_type>::value ||
+         has_from_json<BasicJsonType, // ... the element type has a from_json in either form
+         typename ConstructibleArrayType::value_type>::value ||
+         has_non_default_from_json <
+         BasicJsonType, typename ConstructibleArrayType::value_type >::value);
+};
+
+template<typename BasicJsonType, typename ConstructibleArrayType>
+struct is_constructible_array_type
+    : is_constructible_array_type_impl<BasicJsonType, ConstructibleArrayType> {}; // public name; forwards to the _impl trait
+
+template<typename RealIntegerType, typename CompatibleNumberIntegerType,
+         typename = void>
+struct is_compatible_integer_type_impl : std::false_type {}; // fallback: non-integral types and bool end up here
+
+template<typename RealIntegerType, typename CompatibleNumberIntegerType>
+struct is_compatible_integer_type_impl <
+    RealIntegerType, CompatibleNumberIntegerType,
+    enable_if_t < std::is_integral<RealIntegerType>::value&&
+    std::is_integral<CompatibleNumberIntegerType>::value&&
+    !std::is_same<bool, CompatibleNumberIntegerType>::value >> // both integral, and the candidate is not bool
+{
+    // is there an assert somewhere on overflows?
+    using RealLimits = std::numeric_limits<RealIntegerType>;
+    using CompatibleLimits = std::numeric_limits<CompatibleNumberIntegerType>;
+
+    static constexpr auto value = // constructible, an integer per numeric_limits, and of the same signedness
+        std::is_constructible<RealIntegerType,
+        CompatibleNumberIntegerType>::value &&
+        CompatibleLimits::is_integer &&
+        RealLimits::is_signed == CompatibleLimits::is_signed;
+};
+
+template<typename RealIntegerType, typename CompatibleNumberIntegerType>
+struct is_compatible_integer_type
+    : is_compatible_integer_type_impl<RealIntegerType,
+      CompatibleNumberIntegerType> {}; // public name; forwards to the _impl trait
+
+template<typename BasicJsonType, typename CompatibleType, typename = void>
+struct is_compatible_type_impl: std::false_type {}; // fallback: used for incomplete types
+
+template<typename BasicJsonType, typename CompatibleType>
+struct is_compatible_type_impl <
+    BasicJsonType, CompatibleType,
+    enable_if_t<is_complete_type<CompatibleType>::value >> // has_to_json is only consulted for complete types
+{
+    static constexpr bool value = // compatible means: the serializer offers a matching to_json
+        has_to_json<BasicJsonType, CompatibleType>::value;
+};
+
+template<typename BasicJsonType, typename CompatibleType>
+struct is_compatible_type
+    : is_compatible_type_impl<BasicJsonType, CompatibleType> {}; // public name; forwards to the _impl trait
+
+// local logical-AND of traits, following https://en.cppreference.com/w/cpp/types/conjunction
+template<class...> struct conjunction : std::true_type { }; // empty pack: true
+template<class B1> struct conjunction<B1> : B1 { }; // single trait: the result is that trait
+template<class B1, class... Bn>
+struct conjunction<B1, Bn...>
+: std::conditional<bool(B1::value), conjunction<Bn...>, B1>::type {}; // B1 false: result is B1; otherwise continue with the remaining traits
+
+template<typename T1, typename T2>
+struct is_constructible_tuple : std::false_type {}; // fallback: T2 is not a std::tuple
+
+template<typename T1, typename... Args>
+struct is_constructible_tuple<T1, std::tuple<Args...>> : conjunction<std::is_constructible<T1, Args>...> {}; // true when T1 is constructible from each element type on its own
+}  // namespace detail
+}  // namespace nlohmann
+
+// #include <nlohmann/detail/value_t.hpp>
+
+
+#include <array> // array
+#include <cstddef> // size_t
+#include <cstdint> // uint8_t
+#include <string> // string
+
+namespace nlohmann
+{
+namespace detail
+{
+///////////////////////////
+// JSON type enumeration //
+///////////////////////////
+
+/*!
+@brief the JSON type enumeration
+
+This enumeration collects the different JSON types. It is internally used to
+distinguish the stored values, and the functions @ref basic_json::is_null(),
+@ref basic_json::is_object(), @ref basic_json::is_array(),
+@ref basic_json::is_string(), @ref basic_json::is_boolean(),
+@ref basic_json::is_number() (with @ref basic_json::is_number_integer(),
+@ref basic_json::is_number_unsigned(), and @ref basic_json::is_number_float()),
+@ref basic_json::is_discarded(), @ref basic_json::is_primitive(), and
+@ref basic_json::is_structured() rely on it.
+
+@note There are three enumeration entries (number_integer, number_unsigned, and
+number_float), because the library distinguishes these three types for numbers:
+@ref basic_json::number_unsigned_t is used for unsigned integers,
+@ref basic_json::number_integer_t is used for signed integers, and
+@ref basic_json::number_float_t is used for floating-point numbers or to
+approximate integers which do not fit in the limits of their respective type.
+
+@sa @ref basic_json::basic_json(const value_t value_type) -- create a JSON
+value with the default value for a given type
+
+@since version 1.0.0
+*/
+enum class value_t : std::uint8_t // enumerator order matters: operator< below indexes its ranking table by these numeric values
+{
+    null,             ///< null value
+    object,           ///< object (unordered set of name/value pairs)
+    array,            ///< array (ordered collection of values)
+    string,           ///< string value
+    boolean,          ///< boolean value
+    number_integer,   ///< number value (signed integer)
+    number_unsigned,  ///< number value (unsigned integer)
+    number_float,     ///< number value (floating-point)
+    binary,           ///< binary array (ordered collection of bytes)
+    discarded         ///< discarded by the parser callback function
+};
+
+/*!
+@brief comparison operator for JSON types
+
+Returns an ordering that is similar to Python:
+- order: null < boolean < number < object < array < string < binary
+- furthermore, each type is not smaller than itself
+- discarded values are not comparable
+- binary is represented as a b"" string in python and directly comparable to a
+  string; however, making a binary array directly comparable with a string would
+  be surprising behavior in a JSON file.
+
+@since version 1.0.0
+*/
+inline bool operator<(const value_t lhs, const value_t rhs) noexcept
+{
+    // rank of every comparable type, indexed by the numeric value of value_t;
+    // the table has nine entries, so value_t::discarded (index 9) has no rank
+    static constexpr std::array<std::uint8_t, 9> rank = {{
+            0 /* null */, 3 /* object */, 4 /* array */, 5 /* string */,
+            1 /* boolean */, 2 /* integer */, 2 /* unsigned */, 2 /* float */,
+            6 /* binary */
+        }
+    };
+
+    const auto lhs_slot = static_cast<std::size_t>(lhs);
+    const auto rhs_slot = static_cast<std::size_t>(rhs);
+
+    // the result is false whenever either operand has no rank
+    if (lhs_slot >= rank.size() || rhs_slot >= rank.size())
+    {
+        return false;
+    }
+
+    return rank[lhs_slot] < rank[rhs_slot];
+}
+}  // namespace detail
+}  // namespace nlohmann
+
+
+namespace nlohmann
+{
+namespace detail
+{
+// Reads a JSON null into `n`. Any other stored type is reported through
+// JSON_THROW as type_error 302.
+template<typename BasicJsonType>
+void from_json(const BasicJsonType& j, typename std::nullptr_t& n)
+{
+    const bool holds_null = j.is_null();
+
+    if (JSON_HEDLEY_UNLIKELY(!holds_null))
+    {
+        JSON_THROW(type_error::create(302, "type must be null, but is " + std::string(j.type_name())));
+    }
+
+    n = nullptr;
+}
+
+// overloads for basic_json template parameters
+//
+// Writes the number held by `j` into `val`, converted with static_cast from
+// whichever of the unsigned / signed / floating-point representations is
+// active. Every other value type is reported through JSON_THROW as
+// type_error 302.
+template < typename BasicJsonType, typename ArithmeticType,
+           enable_if_t < std::is_arithmetic<ArithmeticType>::value&&
+                         !std::is_same<ArithmeticType, typename BasicJsonType::boolean_t>::value,
+                         int > = 0 >
+void get_arithmetic_value(const BasicJsonType& j, ArithmeticType& val)
+{
+    using stored_unsigned = typename BasicJsonType::number_unsigned_t;
+    using stored_integer = typename BasicJsonType::number_integer_t;
+    using stored_float = typename BasicJsonType::number_float_t;
+
+    // the conversion of j to value_t tells which representation is active
+    const value_t active = static_cast<value_t>(j);
+
+    if (active == value_t::number_unsigned)
+    {
+        val = static_cast<ArithmeticType>(*j.template get_ptr<const stored_unsigned*>());
+    }
+    else if (active == value_t::number_integer)
+    {
+        val = static_cast<ArithmeticType>(*j.template get_ptr<const stored_integer*>());
+    }
+    else if (active == value_t::number_float)
+    {
+        val = static_cast<ArithmeticType>(*j.template get_ptr<const stored_float*>());
+    }
+    else
+    {
+        JSON_THROW(type_error::create(302, "type must be number, but is " + std::string(j.type_name())));
+    }
+}
+
+// Copies the boolean stored in `j` into `b`. A non-boolean value is reported
+// through JSON_THROW as type_error 302.
+template<typename BasicJsonType>
+void from_json(const BasicJsonType& j, typename BasicJsonType::boolean_t& b)
+{
+    using stored_boolean = typename BasicJsonType::boolean_t;
+
+    if (JSON_HEDLEY_UNLIKELY(!j.is_boolean()))
+    {
+        JSON_THROW(type_error::create(302, "type must be boolean, but is " + std::string(j.type_name())));
+    }
+
+    const stored_boolean* const stored = j.template get_ptr<const stored_boolean*>();
+    b = *stored;
+}
+
+// Copy-assigns the string stored in `j` to `s`. A non-string value is
+// reported through JSON_THROW as type_error 302.
+template<typename BasicJsonType>
+void from_json(const BasicJsonType& j, typename BasicJsonType::string_t& s)
+{
+    using stored_string = typename BasicJsonType::string_t;
+
+    if (JSON_HEDLEY_UNLIKELY(!j.is_string()))
+    {
+        JSON_THROW(type_error::create(302, "type must be string, but is " + std::string(j.type_name())));
+    }
+
+    const stored_string* const stored = j.template get_ptr<const stored_string*>();
+    s = *stored;
+}
+
+// Same as the string_t overload, but for any other type that satisfies
+// is_constructible_string_type: the stored string_t is assigned to `s`.
+// A non-string value is reported through JSON_THROW as type_error 302.
+template <
+    typename BasicJsonType, typename ConstructibleStringType,
+    enable_if_t <
+        is_constructible_string_type<BasicJsonType, ConstructibleStringType>::value&&
+        !std::is_same<typename BasicJsonType::string_t,
+                      ConstructibleStringType>::value,
+        int > = 0 >
+void from_json(const BasicJsonType& j, ConstructibleStringType& s)
+{
+    using stored_string = typename BasicJsonType::string_t;
+
+    if (JSON_HEDLEY_UNLIKELY(!j.is_string()))
+    {
+        JSON_THROW(type_error::create(302, "type must be string, but is " + std::string(j.type_name())));
+    }
+
+    const stored_string* const stored = j.template get_ptr<const stored_string*>();
+    s = *stored;
+}
+
+template<typename BasicJsonType>
+void from_json(const BasicJsonType& j, typename BasicJsonType::number_float_t& val) // overload for the JSON type's own floating-point type
+{
+    get_arithmetic_value(j, val); // shared numeric conversion; reports type_error 302 for non-numbers
+}
+
+template<typename BasicJsonType>
+void from_json(const BasicJsonType& j, typename BasicJsonType::number_unsigned_t& val) // overload for the JSON type's own unsigned type
+{
+    get_arithmetic_value(j, val); // shared numeric conversion; reports type_error 302 for non-numbers
+}
+
+template<typename BasicJsonType>
+void from_json(const BasicJsonType& j, typename BasicJsonType::number_integer_t& val) // overload for the JSON type's own signed integer type
+{
+    get_arithmetic_value(j, val); // shared numeric conversion; reports type_error 302 for non-numbers
+}
+
+// Reads a JSON number into an enum: the number is first converted to the
+// enum's underlying integer type and then static_cast to EnumType. No check
+// is made that the result names a declared enumerator.
+template<typename BasicJsonType, typename EnumType,
+         enable_if_t<std::is_enum<EnumType>::value, int> = 0>
+void from_json(const BasicJsonType& j, EnumType& e)
+{
+    using underlying_t = typename std::underlying_type<EnumType>::type;
+
+    // assigned by get_arithmetic_value in every numeric case; non-numbers are reported as type_error 302
+    underlying_t raw;
+    get_arithmetic_value(j, raw);
+
+    e = static_cast<EnumType>(raw);
+}
+
+// forward_list doesn't have an insert method, so the JSON array is walked
+// back to front and every converted element is pushed to the front; the
+// resulting list is in the same order as the array. The previous contents
+// of `l` are discarded. A non-array value is reported as type_error 302.
+template<typename BasicJsonType, typename T, typename Allocator,
+         enable_if_t<is_getable<BasicJsonType, T>::value, int> = 0>
+void from_json(const BasicJsonType& j, std::forward_list<T, Allocator>& l)
+{
+    if (JSON_HEDLEY_UNLIKELY(!j.is_array()))
+    {
+        JSON_THROW(type_error::create(302, "type must be array, but is " + std::string(j.type_name())));
+    }
+
+    const auto convert_element = [](const BasicJsonType & element)
+    {
+        return element.template get<T>();
+    };
+
+    l.clear();
+    std::transform(j.rbegin(), j.rend(), std::front_inserter(l), convert_element);
+}
+
+// valarray doesn't have an insert method, so it is resized to the size of
+// the JSON array and then filled in place with the converted elements.
+// A non-array value is reported as type_error 302.
+template<typename BasicJsonType, typename T,
+         enable_if_t<is_getable<BasicJsonType, T>::value, int> = 0>
+void from_json(const BasicJsonType& j, std::valarray<T>& l)
+{
+    if (JSON_HEDLEY_UNLIKELY(!j.is_array()))
+    {
+        JSON_THROW(type_error::create(302, "type must be array, but is " + std::string(j.type_name())));
+    }
+
+    const auto convert_element = [](const BasicJsonType & element)
+    {
+        return element.template get<T>();
+    };
+
+    l.resize(j.size());
+    std::transform(j.begin(), j.end(), std::begin(l), convert_element);
+}
+
+// Fills a built-in array of N elements from positions 0..N-1 of `j`.
+// Each position is fetched with j.at(), and the elements are assigned in
+// index order.
+template<typename BasicJsonType, typename T, std::size_t N>
+auto from_json(const BasicJsonType& j, T (&arr)[N])
+-> decltype(j.template get<T>(), void())
+{
+    std::size_t position = 0;
+    for (T& slot : arr)
+    {
+        slot = j.at(position).template get<T>();
+        ++position;
+    }
+}
+
+// Overload tagged priority_tag<3>: the destination is the JSON type's own
+// array_t, so the stored array is copy-assigned as a whole.
+template<typename BasicJsonType>
+void from_json_array_impl(const BasicJsonType& j, typename BasicJsonType::array_t& arr, priority_tag<3> /*unused*/)
+{
+    using stored_array = typename BasicJsonType::array_t;
+
+    const stored_array* const stored = j.template get_ptr<const stored_array*>();
+    arr = *stored;
+}
+
+// Overload tagged priority_tag<2>: fills a std::array<T, N> from positions
+// 0..N-1 of `j`, fetching each position with j.at() in index order.
+template<typename BasicJsonType, typename T, std::size_t N>
+auto from_json_array_impl(const BasicJsonType& j, std::array<T, N>& arr,
+                          priority_tag<2> /*unused*/)
+-> decltype(j.template get<T>(), void())
+{
+    std::size_t position = 0;
+    for (T& slot : arr)
+    {
+        slot = j.at(position).template get<T>();
+        ++position;
+    }
+}
+
+// Overload tagged priority_tag<1>: for containers that offer reserve().
+// The elements are converted into a temporary container, which is then
+// move-assigned to `arr`, so `arr` is only modified after every element has
+// been converted.
+template<typename BasicJsonType, typename ConstructibleArrayType>
+auto from_json_array_impl(const BasicJsonType& j, ConstructibleArrayType& arr, priority_tag<1> /*unused*/)
+-> decltype(
+    arr.reserve(std::declval<typename ConstructibleArrayType::size_type>()),
+    j.template get<typename ConstructibleArrayType::value_type>(),
+    void())
+{
+    using std::end;
+    using element_type = typename ConstructibleArrayType::value_type;
+
+    // get<BasicJsonType>() returns *this, this won't call a from_json
+    // method when value_type is BasicJsonType
+    const auto convert_element = [](const BasicJsonType & element)
+    {
+        return element.template get<element_type>();
+    };
+
+    ConstructibleArrayType ret;
+    ret.reserve(j.size());
+    std::transform(j.begin(), j.end(), std::inserter(ret, end(ret)), convert_element);
+    arr = std::move(ret);
+}
+
+// Overload tagged priority_tag<0>: same as the priority_tag<1> overload but
+// without the reserve() call. The elements are converted into a temporary
+// container, which is then move-assigned to `arr`.
+template<typename BasicJsonType, typename ConstructibleArrayType>
+void from_json_array_impl(const BasicJsonType& j, ConstructibleArrayType& arr,
+                          priority_tag<0> /*unused*/)
+{
+    using std::end;
+    using element_type = typename ConstructibleArrayType::value_type;
+
+    // get<BasicJsonType>() returns *this, this won't call a from_json
+    // method when value_type is BasicJsonType
+    const auto convert_element = [](const BasicJsonType & element)
+    {
+        return element.template get<element_type>();
+    };
+
+    ConstructibleArrayType ret;
+    std::transform(j.begin(), j.end(), std::inserter(ret, end(ret)), convert_element);
+    arr = std::move(ret);
+}
+
+template < typename BasicJsonType, typename ConstructibleArrayType, // generic array overload: excludes object-like, string-like, binary_t and basic_json targets
+           enable_if_t <
+               is_constructible_array_type<BasicJsonType, ConstructibleArrayType>::value&&
+               !is_constructible_object_type<BasicJsonType, ConstructibleArrayType>::value&&
+               !is_constructible_string_type<BasicJsonType, ConstructibleArrayType>::value&&
+               !std::is_same<ConstructibleArrayType, typename BasicJsonType::binary_t>::value&&
+               !is_basic_json<ConstructibleArrayType>::value,
+               int > = 0 >
+auto from_json(const BasicJsonType& j, ConstructibleArrayType& arr)
+-> decltype(from_json_array_impl(j, arr, priority_tag<3> {}), // also requires that some from_json_array_impl overload accepts the call
+j.template get<typename ConstructibleArrayType::value_type>(), // ... and that the element type can be obtained with get<>()
+void())
+{
+    if (JSON_HEDLEY_UNLIKELY(!j.is_array())) // every non-array value is reported as type_error 302
+    {
+        JSON_THROW(type_error::create(302, "type must be array, but is " +
+                                      std::string(j.type_name())));
+    }
+
+    from_json_array_impl(j, arr, priority_tag<3> {}); // dispatch to the priority_tag-selected implementation
+}
+
+// Copy-assigns the binary value stored in `j` to `bin`. A non-binary value
+// is reported through JSON_THROW as type_error 302.
+template<typename BasicJsonType>
+void from_json(const BasicJsonType& j, typename BasicJsonType::binary_t& bin)
+{
+    using stored_binary = typename BasicJsonType::binary_t;
+
+    if (JSON_HEDLEY_UNLIKELY(!j.is_binary()))
+    {
+        JSON_THROW(type_error::create(302, "type must be binary, but is " + std::string(j.type_name())));
+    }
+
+    const stored_binary* const stored = j.template get_ptr<const stored_binary*>();
+    bin = *stored;
+}
+
+// Converts a JSON object into a map-like ConstructibleObjectType. Every
+// member becomes a value_type built from the key and the mapped value
+// converted with get<mapped_type>(). The entries are collected in a
+// temporary which is move-assigned to `obj` at the end. A non-object value
+// is reported through JSON_THROW as type_error 302.
+template<typename BasicJsonType, typename ConstructibleObjectType,
+         enable_if_t<is_constructible_object_type<BasicJsonType, ConstructibleObjectType>::value, int> = 0>
+void from_json(const BasicJsonType& j, ConstructibleObjectType& obj)
+{
+    using json_object = typename BasicJsonType::object_t;
+    using target_entry = typename ConstructibleObjectType::value_type;
+    using target_mapped = typename ConstructibleObjectType::mapped_type;
+
+    if (JSON_HEDLEY_UNLIKELY(!j.is_object()))
+    {
+        JSON_THROW(type_error::create(302, "type must be object, but is " + std::string(j.type_name())));
+    }
+
+    const auto convert_member = [](typename json_object::value_type const & member)
+    {
+        return target_entry(member.first, member.second.template get<target_mapped>());
+    };
+
+    ConstructibleObjectType ret;
+    const json_object* const source = j.template get_ptr<const json_object*>();
+    std::transform(source->begin(), source->end(), std::inserter(ret, ret.begin()), convert_member);
+    obj = std::move(ret);
+}
+
+// overload for arithmetic types, not chosen for basic_json template arguments
+// (BooleanType, etc..); note: Is it really necessary to provide explicit
+// overloads for boolean_t etc. in case of a custom BooleanType which is not
+// an arithmetic type?
+//
+// Unlike get_arithmetic_value, this overload also accepts a stored boolean,
+// which is static_cast to ArithmeticType like the three number kinds. Every
+// other value type is reported through JSON_THROW as type_error 302.
+template < typename BasicJsonType, typename ArithmeticType,
+           enable_if_t <
+               std::is_arithmetic<ArithmeticType>::value&&
+               !std::is_same<ArithmeticType, typename BasicJsonType::number_unsigned_t>::value&&
+               !std::is_same<ArithmeticType, typename BasicJsonType::number_integer_t>::value&&
+               !std::is_same<ArithmeticType, typename BasicJsonType::number_float_t>::value&&
+               !std::is_same<ArithmeticType, typename BasicJsonType::boolean_t>::value,
+               int > = 0 >
+void from_json(const BasicJsonType& j, ArithmeticType& val)
+{
+    using stored_unsigned = typename BasicJsonType::number_unsigned_t;
+    using stored_integer = typename BasicJsonType::number_integer_t;
+    using stored_float = typename BasicJsonType::number_float_t;
+    using stored_boolean = typename BasicJsonType::boolean_t;
+
+    // the conversion of j to value_t tells which representation is active
+    const value_t active = static_cast<value_t>(j);
+
+    if (active == value_t::number_unsigned)
+    {
+        val = static_cast<ArithmeticType>(*j.template get_ptr<const stored_unsigned*>());
+    }
+    else if (active == value_t::number_integer)
+    {
+        val = static_cast<ArithmeticType>(*j.template get_ptr<const stored_integer*>());
+    }
+    else if (active == value_t::number_float)
+    {
+        val = static_cast<ArithmeticType>(*j.template get_ptr<const stored_float*>());
+    }
+    else if (active == value_t::boolean)
+    {
+        val = static_cast<ArithmeticType>(*j.template get_ptr<const stored_boolean*>());
+    }
+    else
+    {
+        JSON_THROW(type_error::create(302, "type must be number, but is " + std::string(j.type_name())));
+    }
+}
+
+template<typename BasicJsonType, typename A1, typename A2>
+void from_json(const BasicJsonType& j, std::pair<A1, A2>& p) // reads a pair from positions 0 and 1 of j
+{
+    p = {j.at(0).template get<A1>(), j.at(1).template get<A2>()}; // no explicit type check here; both positions are fetched with at()
+}
+
+template<typename BasicJsonType, typename Tuple, std::size_t... Idx>
+void from_json_tuple_impl(const BasicJsonType& j, Tuple& t, index_sequence<Idx...> /*unused*/) // the sequence only supplies the Idx pack
+{
+    t = std::make_tuple(j.at(Idx).template get<typename std::tuple_element<Idx, Tuple>::type>()...); // position Idx of j becomes tuple element Idx
+}
+
+template<typename BasicJsonType, typename... Args>
+void from_json(const BasicJsonType& j, std::tuple<Args...>& t) // reads a tuple from the leading sizeof...(Args) positions of j
+{
+    from_json_tuple_impl(j, t, index_sequence_for<Args...> {}); // one index per tuple element
+}
+
+// Reads a std::map whose Key cannot be used to construct the JSON string
+// type. The expected input is an array of arrays: position 0 of each inner
+// array is converted to the key and position 1 to the mapped value.
+// The previous contents of `m` are discarded before reading. An outer or
+// inner value that is not an array is reported as type_error 302.
+template < typename BasicJsonType, typename Key, typename Value, typename Compare, typename Allocator,
+           typename = enable_if_t < !std::is_constructible <
+                                        typename BasicJsonType::string_t, Key >::value >>
+void from_json(const BasicJsonType& j, std::map<Key, Value, Compare, Allocator>& m)
+{
+    const bool outer_is_array = j.is_array();
+    if (JSON_HEDLEY_UNLIKELY(!outer_is_array))
+    {
+        JSON_THROW(type_error::create(302, "type must be array, but is " + std::string(j.type_name())));
+    }
+
+    m.clear();
+
+    for (const auto& entry : j)
+    {
+        const bool entry_is_array = entry.is_array();
+        if (JSON_HEDLEY_UNLIKELY(!entry_is_array))
+        {
+            JSON_THROW(type_error::create(302, "type must be array, but is " + std::string(entry.type_name())));
+        }
+
+        m.emplace(entry.at(0).template get<Key>(), entry.at(1).template get<Value>());
+    }
+}
+
+// Reads a std::unordered_map whose Key cannot be used to construct the JSON
+// string type. The expected input is an array of arrays: position 0 of each
+// inner array is converted to the key and position 1 to the mapped value.
+// The previous contents of `m` are discarded before reading. An outer or
+// inner value that is not an array is reported as type_error 302.
+template < typename BasicJsonType, typename Key, typename Value, typename Hash, typename KeyEqual, typename Allocator,
+           typename = enable_if_t < !std::is_constructible <
+                                        typename BasicJsonType::string_t, Key >::value >>
+void from_json(const BasicJsonType& j, std::unordered_map<Key, Value, Hash, KeyEqual, Allocator>& m)
+{
+    const bool outer_is_array = j.is_array();
+    if (JSON_HEDLEY_UNLIKELY(!outer_is_array))
+    {
+        JSON_THROW(type_error::create(302, "type must be array, but is " + std::string(j.type_name())));
+    }
+
+    m.clear();
+
+    for (const auto& entry : j)
+    {
+        const bool entry_is_array = entry.is_array();
+        if (JSON_HEDLEY_UNLIKELY(!entry_is_array))
+        {
+            JSON_THROW(type_error::create(302, "type must be array, but is " + std::string(entry.type_name())));
+        }
+
+        m.emplace(entry.at(0).template get<Key>(), entry.at(1).template get<Value>());
+    }
+}
+
+struct from_json_fn // function object wrapping an unqualified call to from_json(j, val)
+{
+    template<typename BasicJsonType, typename T>
+    auto operator()(const BasicJsonType& j, T& val) const
+    noexcept(noexcept(from_json(j, val))) // noexcept exactly when the selected from_json is
+    -> decltype(from_json(j, val), void()) // drops out of overload resolution when no from_json(j, val) is callable
+    {
+        return from_json(j, val); // forwards to whichever overload unqualified lookup selects
+    }
+};
+}  // namespace detail
+
+/// namespace to hold the default `from_json` function object;
+/// to see why this is required, read:
+/// http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2015/n4381.html
+namespace
+{
+constexpr const auto& from_json = detail::static_const<detail::from_json_fn>::value; // a reference to static_const<from_json_fn>::value, not a new object
+} // namespace
+} // namespace nlohmann
+
+// #include <nlohmann/detail/conversions/to_json.hpp>
+
+
+#include <algorithm> // copy
+#include <iterator> // begin, end
+#include <string> // string
+#include <tuple> // tuple, get
+#include <type_traits> // is_same, is_constructible, is_floating_point, is_enum, underlying_type
+#include <utility> // move, forward, declval, pair
+#include <valarray> // valarray
+#include <vector> // vector
+
+// #include <nlohmann/detail/iterators/iteration_proxy.hpp>
+
+
+#include <cstddef> // size_t
+#include <iterator> // input_iterator_tag
+#include <string> // string, to_string
+#include <tuple> // tuple_size, get, tuple_element
+
+// #include <nlohmann/detail/meta/type_traits.hpp>
+
+// #include <nlohmann/detail/value_t.hpp>
+
+
+namespace nlohmann
+{
+namespace detail
+{
+// Stores the decimal text of `value` in `target`, replacing its previous
+// contents.
+template<typename string_type>
+void int_to_string(string_type& target, std::size_t value)
+{
+    // the call is unqualified; the using-declaration makes std::to_string visible
+    using std::to_string;
+
+    auto digits = to_string(value);
+    target = std::move(digits);
+}
+// Iterator-like wrapper around an IteratorType that exposes key() and
+// value() for the current position. Dereferencing yields the wrapper itself.
+template<typename IteratorType> class iteration_proxy_value
+{
+  public:
+    using difference_type = std::ptrdiff_t;
+    using value_type = iteration_proxy_value;
+    using pointer = value_type * ;
+    using reference = value_type & ;
+    using iterator_category = std::input_iterator_tag;
+    /// type returned by the wrapped iterator's key(), without reference and cv-qualifiers
+    using string_type = typename std::remove_cv< typename std::remove_reference<decltype( std::declval<IteratorType>().key() ) >::type >::type;
+
+  private:
+    /// the wrapped iterator
+    IteratorType anchor;
+    /// number of increments so far; used as the key while iterating an array
+    std::size_t array_index = 0;
+    /// the index that array_index_str currently spells out
+    mutable std::size_t array_index_last = 0;
+    /// cached text form of array_index_last, refreshed on demand by key()
+    mutable string_type array_index_str = "0";
+    /// an empty string (returned by reference as the key of primitive values)
+    const string_type empty_str = "";
+
+  public:
+    explicit iteration_proxy_value(IteratorType it) noexcept : anchor(it) {}
+
+    /// dereference operator (needed for range-based for); returns the proxy itself
+    iteration_proxy_value& operator*()
+    {
+        return *this;
+    }
+
+    /// increment operator (needed for range-based for); advances iterator and index together
+    iteration_proxy_value& operator++()
+    {
+        ++anchor;
+        ++array_index;
+
+        return *this;
+    }
+
+    /// equality operator (needed for InputIterator); only the wrapped iterators are compared
+    bool operator==(const iteration_proxy_value& o) const
+    {
+        return anchor == o.anchor;
+    }
+
+    /// inequality operator (needed for range-based for); only the wrapped iterators are compared
+    bool operator!=(const iteration_proxy_value& o) const
+    {
+        return anchor != o.anchor;
+    }
+
+    /// return key of the iterator: the member name for objects, the index as
+    /// text for arrays, and an empty string for every other value type
+    const string_type& key() const
+    {
+        JSON_ASSERT(anchor.m_object != nullptr);
+
+        const auto container_kind = anchor.m_object->type();
+
+        // use key from the object
+        if (container_kind == value_t::object)
+        {
+            return anchor.key();
+        }
+
+        // use an empty key for all primitive types
+        if (container_kind != value_t::array)
+        {
+            return empty_str;
+        }
+
+        // use integer array index as key; the text is only regenerated when
+        // the index differs from the one that was stringified last
+        if (array_index != array_index_last)
+        {
+            int_to_string( array_index_str, array_index );
+            array_index_last = array_index;
+        }
+        return array_index_str;
+    }
+
+    /// return value of the iterator
+    typename IteratorType::reference value() const
+    {
+        return anchor.value();
+    }
+};
+
+/// Range adaptor backing the items() function: begin()/end() wrap the
+/// container's own iterators in iteration_proxy_value.
+template<typename IteratorType> class iteration_proxy
+{
+  private:
+    /// shorthand for the wrapper type handed out by begin() and end()
+    using proxy_value = iteration_proxy_value<IteratorType>;
+
+    /// reference to the container being iterated
+    typename IteratorType::reference container;
+
+  public:
+    /// bind the proxy to @a cont
+    explicit iteration_proxy(typename IteratorType::reference cont) noexcept
+        : container(cont) {}
+
+    /// wrapped begin iterator (needed for range-based for)
+    proxy_value begin() noexcept
+    {
+        return proxy_value(container.begin());
+    }
+
+    /// wrapped end iterator (needed for range-based for)
+    proxy_value end() noexcept
+    {
+        return proxy_value(container.end());
+    }
+};
+// Structured Bindings Support
+// For further reference see https://blog.tartanllama.xyz/structured-bindings/
+// And see https://github.com/nlohmann/json/pull/1391
+// Tuple-style accessor, index 0: returns whatever i.key() returns.
+// The enable_if_t default argument removes this overload unless N == 0.
+template<std::size_t N, typename IteratorType, enable_if_t<N == 0, int> = 0>
+auto get(const nlohmann::detail::iteration_proxy_value<IteratorType>& i) -> decltype(i.key())
+{
+    return i.key();
+}
+// Structured Bindings Support
+// For further reference see https://blog.tartanllama.xyz/structured-bindings/
+// And see https://github.com/nlohmann/json/pull/1391
+// Tuple-style accessor, index 1: returns whatever i.value() returns.
+// The enable_if_t default argument removes this overload unless N == 1.
+template<std::size_t N, typename IteratorType, enable_if_t<N == 1, int> = 0>
+auto get(const nlohmann::detail::iteration_proxy_value<IteratorType>& i) -> decltype(i.value())
+{
+    return i.value();
+}
+}  // namespace detail
+}  // namespace nlohmann
+
+// The Addition to the STD Namespace is required to add
+// Structured Bindings Support to the iteration_proxy_value class
+// For further reference see https://blog.tartanllama.xyz/structured-bindings/
+// And see https://github.com/nlohmann/json/pull/1391
+namespace std
+{
+#if defined(__clang__)
+    // Fix: https://github.com/nlohmann/json/issues/1401
+    // The warning -Wmismatched-tags is disabled for the specializations below
+    // and restored by the matching pop.
+    #pragma clang diagnostic push
+    #pragma clang diagnostic ignored "-Wmismatched-tags"
+#endif
+/// iteration_proxy_value reports a tuple size of 2
+template<typename IteratorType>
+class tuple_size<::nlohmann::detail::iteration_proxy_value<IteratorType>>
+            : public std::integral_constant<std::size_t, 2> {};
+
+/// element N has the type that get<N> yields for an iteration_proxy_value rvalue
+template<std::size_t N, typename IteratorType>
+class tuple_element<N, ::nlohmann::detail::iteration_proxy_value<IteratorType >>
+{
+  public:
+    using type = decltype(
+                     get<N>(std::declval <
+                            ::nlohmann::detail::iteration_proxy_value<IteratorType >> ()));
+};
+#if defined(__clang__)
+    #pragma clang diagnostic pop
+#endif
+} // namespace std
+
+// #include <nlohmann/detail/meta/cpp_future.hpp>
+
+// #include <nlohmann/detail/meta/type_traits.hpp>
+
+// #include <nlohmann/detail/value_t.hpp>
+
+
+namespace nlohmann
+{
+namespace detail
+{
+//////////////////
+// constructors //
+//////////////////
+
+/// primary template, declared only; explicit specializations per value_t enumerator follow
+template<value_t> struct external_constructor;
+
+/// boolean: tags @a j as value_t::boolean and stores @a b in it
+template<>
+struct external_constructor<value_t::boolean>
+{
+    template<typename BasicJsonType>
+    static void construct(BasicJsonType& j, typename BasicJsonType::boolean_t b) noexcept
+    {
+        j.m_type = value_t::boolean;
+        j.m_value = b;
+        // assert_invariant() is called once both members have been written
+        j.assert_invariant();
+    }
+};
+
+/// string: tags @a j as value_t::string and stores the given string in it
+template<>
+struct external_constructor<value_t::string>
+{
+    /// store a copy of @a s
+    template<typename BasicJsonType>
+    static void construct(BasicJsonType& j, const typename BasicJsonType::string_t& s)
+    {
+        j.m_type = value_t::string;
+        j.m_value = s;
+        j.assert_invariant();
+    }
+
+    /// rvalue overload: @a s is passed on with std::move
+    template<typename BasicJsonType>
+    static void construct(BasicJsonType& j, typename BasicJsonType::string_t&& s)
+    {
+        j.m_type = value_t::string;
+        j.m_value = std::move(s);
+        j.assert_invariant();
+    }
+
+    /// overload for any string-like type other than string_t itself:
+    /// the stored string_t is created from @a str via j.create<string_t>(str)
+    template < typename BasicJsonType, typename CompatibleStringType,
+               enable_if_t < !std::is_same<CompatibleStringType, typename BasicJsonType::string_t>::value,
+                             int > = 0 >
+    static void construct(BasicJsonType& j, const CompatibleStringType& str)
+    {
+        j.m_type = value_t::string;
+        j.m_value.string = j.template create<typename BasicJsonType::string_t>(str);
+        j.assert_invariant();
+    }
+};
+
+/// binary: tags @a j as value_t::binary and stores the given byte container in it
+template<>
+struct external_constructor<value_t::binary>
+{
+    /// Store a copy of @a b in @a j.
+    template<typename BasicJsonType>
+    static void construct(BasicJsonType& j, const typename BasicJsonType::binary_t& b)
+    {
+        j.m_type = value_t::binary;
+        // Assign from a temporary instead of a named local, so the copy of b
+        // reaches m_value as an rvalue and need not be copied a second time.
+        // NOTE(review): the saving relies on m_value accepting binary_t rvalues
+        // (as the string/array/object overloads in this file do) - confirm.
+        j.m_value = typename BasicJsonType::binary_t(b);
+        j.assert_invariant();
+    }
+
+    /// Rvalue overload: @a b is moved from, never copied by this function.
+    template<typename BasicJsonType>
+    static void construct(BasicJsonType& j, typename BasicJsonType::binary_t&& b)
+    {
+        j.m_type = value_t::binary;
+        // Previously b was moved into a named local which was then assigned as
+        // an lvalue, forcing a full copy and defeating the rvalue overload.
+        j.m_value = typename BasicJsonType::binary_t(std::move(b));
+        j.assert_invariant();
+    }
+};
+
+/// number_float: tags @a j as value_t::number_float and stores @a val in it
+template<>
+struct external_constructor<value_t::number_float>
+{
+    template<typename BasicJsonType>
+    static void construct(BasicJsonType& j, typename BasicJsonType::number_float_t val) noexcept
+    {
+        j.m_type = value_t::number_float;
+        j.m_value = val;
+        j.assert_invariant();
+    }
+};
+
+/// number_unsigned: tags @a j as value_t::number_unsigned and stores @a val in it
+template<>
+struct external_constructor<value_t::number_unsigned>
+{
+    template<typename BasicJsonType>
+    static void construct(BasicJsonType& j, typename BasicJsonType::number_unsigned_t val) noexcept
+    {
+        j.m_type = value_t::number_unsigned;
+        j.m_value = val;
+        j.assert_invariant();
+    }
+};
+
+/// number_integer: tags @a j as value_t::number_integer and stores @a val in it
+template<>
+struct external_constructor<value_t::number_integer>
+{
+    template<typename BasicJsonType>
+    static void construct(BasicJsonType& j, typename BasicJsonType::number_integer_t val) noexcept
+    {
+        j.m_type = value_t::number_integer;
+        j.m_value = val;
+        j.assert_invariant();
+    }
+};
+
+/// array: tags @a j as value_t::array and fills it from the given sequence
+template<>
+struct external_constructor<value_t::array>
+{
+    /// store a copy of @a arr
+    template<typename BasicJsonType>
+    static void construct(BasicJsonType& j, const typename BasicJsonType::array_t& arr)
+    {
+        j.m_type = value_t::array;
+        j.m_value = arr;
+        j.assert_invariant();
+    }
+
+    /// rvalue overload: @a arr is passed on with std::move
+    template<typename BasicJsonType>
+    static void construct(BasicJsonType& j, typename BasicJsonType::array_t&& arr)
+    {
+        j.m_type = value_t::array;
+        j.m_value = std::move(arr);
+        j.assert_invariant();
+    }
+
+    /// overload for any range type other than array_t itself: the stored
+    /// array_t is created from the iterator pair [begin(arr), end(arr))
+    template < typename BasicJsonType, typename CompatibleArrayType,
+               enable_if_t < !std::is_same<CompatibleArrayType, typename BasicJsonType::array_t>::value,
+                             int > = 0 >
+    static void construct(BasicJsonType& j, const CompatibleArrayType& arr)
+    {
+        // unqualified begin/end below, with the std versions as fallback
+        using std::begin;
+        using std::end;
+        j.m_type = value_t::array;
+        j.m_value.array = j.template create<typename BasicJsonType::array_t>(begin(arr), end(arr));
+        j.assert_invariant();
+    }
+
+    /// std::vector<bool> overload: elements are read as plain bool and appended one by one
+    template<typename BasicJsonType>
+    static void construct(BasicJsonType& j, const std::vector<bool>& arr)
+    {
+        j.m_type = value_t::array;
+        // assigning the enumerator sets up m_value.array, which is dereferenced right below
+        j.m_value = value_t::array;
+        j.m_value.array->reserve(arr.size());
+        for (const bool x : arr)
+        {
+            j.m_value.array->push_back(x);
+        }
+        j.assert_invariant();
+    }
+
+    /// std::valarray<T> overload, enabled when T is convertible to BasicJsonType
+    template<typename BasicJsonType, typename T,
+             enable_if_t<std::is_convertible<T, BasicJsonType>::value, int> = 0>
+    static void construct(BasicJsonType& j, const std::valarray<T>& arr)
+    {
+        j.m_type = value_t::array;
+        j.m_value = value_t::array;
+        // size the target first, then copy element-wise into it
+        j.m_value.array->resize(arr.size());
+        // the copy is skipped entirely for an empty valarray
+        if (arr.size() > 0)
+        {
+            std::copy(std::begin(arr), std::end(arr), j.m_value.array->begin());
+        }
+        j.assert_invariant();
+    }
+};
+
+/// object: tags @a j as value_t::object and fills it from the given mapping
+template<>
+struct external_constructor<value_t::object>
+{
+    /// store a copy of @a obj
+    template<typename BasicJsonType>
+    static void construct(BasicJsonType& j, const typename BasicJsonType::object_t& obj)
+    {
+        j.m_type = value_t::object;
+        j.m_value = obj;
+        j.assert_invariant();
+    }
+
+    /// rvalue overload: @a obj is passed on with std::move
+    template<typename BasicJsonType>
+    static void construct(BasicJsonType& j, typename BasicJsonType::object_t&& obj)
+    {
+        j.m_type = value_t::object;
+        j.m_value = std::move(obj);
+        j.assert_invariant();
+    }
+
+    /// overload for any mapping type other than object_t itself: the stored
+    /// object_t is created from the iterator pair [begin(obj), end(obj))
+    template < typename BasicJsonType, typename CompatibleObjectType,
+               enable_if_t < !std::is_same<CompatibleObjectType, typename BasicJsonType::object_t>::value, int > = 0 >
+    static void construct(BasicJsonType& j, const CompatibleObjectType& obj)
+    {
+        // unqualified begin/end below, with the std versions as fallback
+        using std::begin;
+        using std::end;
+
+        j.m_type = value_t::object;
+        j.m_value.object = j.template create<typename BasicJsonType::object_t>(begin(obj), end(obj));
+        j.assert_invariant();
+    }
+};
+
+/////////////
+// to_json //
+/////////////
+
+/// to_json for exactly boolean_t (is_same, so no implicit conversions take part)
+template<typename BasicJsonType, typename T,
+         enable_if_t<std::is_same<T, typename BasicJsonType::boolean_t>::value, int> = 0>
+void to_json(BasicJsonType& j, T b) noexcept
+{
+    external_constructor<value_t::boolean>::construct(j, b);
+}
+
+/// to_json for any type from which string_t can be constructed
+template<typename BasicJsonType, typename CompatibleString,
+         enable_if_t<std::is_constructible<typename BasicJsonType::string_t, CompatibleString>::value, int> = 0>
+void to_json(BasicJsonType& j, const CompatibleString& s)
+{
+    external_constructor<value_t::string>::construct(j, s);
+}
+
+/// to_json for string_t rvalues: @a s is forwarded with std::move
+template<typename BasicJsonType>
+void to_json(BasicJsonType& j, typename BasicJsonType::string_t&& s)
+{
+    external_constructor<value_t::string>::construct(j, std::move(s));
+}
+
+/// to_json for floating-point types: @a val is cast to number_float_t and
+/// stored as value_t::number_float
+template<typename BasicJsonType, typename FloatType,
+         enable_if_t<std::is_floating_point<FloatType>::value, int> = 0>
+void to_json(BasicJsonType& j, FloatType val) noexcept
+{
+    using target_t = typename BasicJsonType::number_float_t;
+    external_constructor<value_t::number_float>::construct(j, static_cast<target_t>(val));
+}
+
+/// to_json for integer types compatible with number_unsigned_t: @a val is
+/// cast to number_unsigned_t and stored as value_t::number_unsigned
+template<typename BasicJsonType, typename CompatibleNumberUnsignedType,
+         enable_if_t<is_compatible_integer_type<typename BasicJsonType::number_unsigned_t, CompatibleNumberUnsignedType>::value, int> = 0>
+void to_json(BasicJsonType& j, CompatibleNumberUnsignedType val) noexcept
+{
+    using target_t = typename BasicJsonType::number_unsigned_t;
+    external_constructor<value_t::number_unsigned>::construct(j, static_cast<target_t>(val));
+}
+
+/// to_json for integer types compatible with number_integer_t: @a val is
+/// cast to number_integer_t and stored as value_t::number_integer
+template<typename BasicJsonType, typename CompatibleNumberIntegerType,
+         enable_if_t<is_compatible_integer_type<typename BasicJsonType::number_integer_t, CompatibleNumberIntegerType>::value, int> = 0>
+void to_json(BasicJsonType& j, CompatibleNumberIntegerType val) noexcept
+{
+    using target_t = typename BasicJsonType::number_integer_t;
+    external_constructor<value_t::number_integer>::construct(j, static_cast<target_t>(val));
+}
+
+/// to_json for enumerations: the enumerator is cast to its underlying integer
+/// type and stored as value_t::number_integer
+template<typename BasicJsonType, typename EnumType,
+         enable_if_t<std::is_enum<EnumType>::value, int> = 0>
+void to_json(BasicJsonType& j, EnumType e) noexcept
+{
+    using underlying_type = typename std::underlying_type<EnumType>::type;
+    const auto raw = static_cast<underlying_type>(e);
+    external_constructor<value_t::number_integer>::construct(j, raw);
+}
+
+/// to_json for std::vector<bool>: forwards to the dedicated array constructor overload
+template<typename BasicJsonType>
+void to_json(BasicJsonType& j, const std::vector<bool>& e)
+{
+    external_constructor<value_t::array>::construct(j, e);
+}
+
+/// to_json for array-compatible types. The constraint additionally rules out
+/// types that are object-compatible, string-compatible, binary_t, or a
+/// basic_json themselves, so those are handled by other overloads.
+template < typename BasicJsonType, typename CompatibleArrayType,
+           enable_if_t < is_compatible_array_type<BasicJsonType,
+                         CompatibleArrayType>::value&&
+                         !is_compatible_object_type<BasicJsonType, CompatibleArrayType>::value&&
+                         !is_compatible_string_type<BasicJsonType, CompatibleArrayType>::value&&
+                         !std::is_same<typename BasicJsonType::binary_t, CompatibleArrayType>::value&&
+                         !is_basic_json<CompatibleArrayType>::value,
+                         int > = 0 >
+void to_json(BasicJsonType& j, const CompatibleArrayType& arr)
+{
+    external_constructor<value_t::array>::construct(j, arr);
+}
+
+/// to_json for binary_t: forwards to the binary constructor
+template<typename BasicJsonType>
+void to_json(BasicJsonType& j, const typename BasicJsonType::binary_t& bin)
+{
+    external_constructor<value_t::binary>::construct(j, bin);
+}
+
+/// to_json for std::valarray<T>, enabled when T is convertible to
+/// BasicJsonType; forwards to the valarray overload of the array constructor
+template<typename BasicJsonType, typename T,
+         enable_if_t<std::is_convertible<T, BasicJsonType>::value, int> = 0>
+void to_json(BasicJsonType& j, const std::valarray<T>& arr)
+{
+    // arr is a reference to const, so wrapping it in std::move could never
+    // enable a move; pass it through as-is (the callee takes const& anyway)
+    external_constructor<value_t::array>::construct(j, arr);
+}
+
+/// to_json for array_t rvalues: @a arr is forwarded with std::move
+template<typename BasicJsonType>
+void to_json(BasicJsonType& j, typename BasicJsonType::array_t&& arr)
+{
+    external_constructor<value_t::array>::construct(j, std::move(arr));
+}
+
+/// to_json for object-compatible types that are not themselves a basic_json
+template < typename BasicJsonType, typename CompatibleObjectType,
+           enable_if_t < is_compatible_object_type<BasicJsonType, CompatibleObjectType>::value&& !is_basic_json<CompatibleObjectType>::value, int > = 0 >
+void to_json(BasicJsonType& j, const CompatibleObjectType& obj)
+{
+    external_constructor<value_t::object>::construct(j, obj);
+}
+
+/// to_json for object_t rvalues: @a obj is forwarded with std::move
+template<typename BasicJsonType>
+void to_json(BasicJsonType& j, typename BasicJsonType::object_t&& obj)
+{
+    external_constructor<value_t::object>::construct(j, std::move(obj));
+}
+
+/// to_json for built-in arrays `const T(&)[N]`. Disabled when string_t is
+/// constructible from the array type, so such arrays are not treated as JSON arrays here.
+template <
+    typename BasicJsonType, typename T, std::size_t N,
+    enable_if_t < !std::is_constructible<typename BasicJsonType::string_t,
+                  const T(&)[N]>::value,
+                  int > = 0 >
+void to_json(BasicJsonType& j, const T(&arr)[N])
+{
+    external_constructor<value_t::array>::construct(j, arr);
+}
+
+/// to_json for std::pair, enabled when BasicJsonType is constructible from
+/// both member types; @a j is assigned the braced list { first, second }
+template < typename BasicJsonType, typename T1, typename T2, enable_if_t < std::is_constructible<BasicJsonType, T1>::value&& std::is_constructible<BasicJsonType, T2>::value, int > = 0 >
+void to_json(BasicJsonType& j, const std::pair<T1, T2>& p)
+{
+    j = { p.first, p.second };
+}
+
+// for https://github.com/nlohmann/json/pull/1134
+// to_json for exactly iteration_proxy_value<BasicJsonType::iterator>:
+// j is assigned a braced list holding the single braced pair {key, value}
+template<typename BasicJsonType, typename T,
+         enable_if_t<std::is_same<T, iteration_proxy_value<typename BasicJsonType::iterator>>::value, int> = 0>
+void to_json(BasicJsonType& j, const T& b)
+{
+    j = { {b.key(), b.value()} };
+}
+
+/// Helper for the tuple overload of to_json: expands the index pack so that
+/// @a j is assigned the braced list of std::get<Idx>(t) for every index.
+/// The index_sequence argument is used only to deduce Idx.
+template<typename BasicJsonType, typename Tuple, std::size_t... Idx>
+void to_json_tuple_impl(BasicJsonType& j, const Tuple& t, index_sequence<Idx...> /*unused*/)
+{
+    j = { std::get<Idx>(t)... };
+}
+
+/// to_json for types accepted by is_constructible_tuple: builds an index
+/// sequence of length std::tuple_size<T>::value and delegates to to_json_tuple_impl
+template<typename BasicJsonType, typename T, enable_if_t<is_constructible_tuple<BasicJsonType, T>::value, int > = 0>
+void to_json(BasicJsonType& j, const T& t)
+{
+    to_json_tuple_impl(j, t, make_index_sequence<std::tuple_size<T>::value> {});
+}
+
+/// Function object whose call operator forwards to an unqualified to_json call.
+struct to_json_fn
+{
+    /// Participates in overload resolution only when `to_json(j, val)` is a
+    /// valid expression (trailing decltype); noexcept mirrors that call.
+    template<typename BasicJsonType, typename T>
+    auto operator()(BasicJsonType& j, T&& val) const noexcept(noexcept(to_json(j, std::forward<T>(val))))
+    -> decltype(to_json(j, std::forward<T>(val)), void())
+    {
+        return to_json(j, std::forward<T>(val));
+    }
+};
+}  // namespace detail
+
+/// namespace to hold default `to_json` function
+namespace
+{
+/// reference to the to_json_fn instance held by detail::static_const;
+/// this is the object that `::nlohmann::to_json(...)` calls resolve to
+constexpr const auto& to_json = detail::static_const<detail::to_json_fn>::value;
+} // namespace
+} // namespace nlohmann
+
+
+namespace nlohmann
+{
+
+/// Default serializer: both static members simply forward to
+/// `::nlohmann::from_json` / `::nlohmann::to_json`. Neither of the two class
+/// template parameters is named or used by the members below.
+template<typename, typename>
+struct adl_serializer
+{
+    /*!
+    @brief convert a JSON value to any value type
+
+    This function is usually called by the `get()` function of the
+    @ref basic_json class (either explicit or via conversion operators).
+
+    The function only participates in overload resolution when the forwarded
+    `::nlohmann::from_json` call is well-formed, and it is noexcept exactly
+    when that call is.
+
+    @param[in] j        JSON value to read from
+    @param[in,out] val  value to write to
+    */
+    template<typename BasicJsonType, typename ValueType>
+    static auto from_json(BasicJsonType&& j, ValueType& val) noexcept(
+        noexcept(::nlohmann::from_json(std::forward<BasicJsonType>(j), val)))
+    -> decltype(::nlohmann::from_json(std::forward<BasicJsonType>(j), val), void())
+    {
+        ::nlohmann::from_json(std::forward<BasicJsonType>(j), val);
+    }
+
+    /*!
+    @brief convert any value type to a JSON value
+
+    This function is usually called by the constructors of the @ref basic_json
+    class.
+
+    The function only participates in overload resolution when the forwarded
+    `::nlohmann::to_json` call is well-formed, and it is noexcept exactly
+    when that call is.
+
+    @param[in,out] j  JSON value to write to
+    @param[in] val    value to read from
+    */
+    template<typename BasicJsonType, typename ValueType>
+    static auto to_json(BasicJsonType& j, ValueType&& val) noexcept(
+        noexcept(::nlohmann::to_json(j, std::forward<ValueType>(val))))
+    -> decltype(::nlohmann::to_json(j, std::forward<ValueType>(val)), void())
+    {
+        ::nlohmann::to_json(j, std::forward<ValueType>(val));
+    }
+};
+
+}  // namespace nlohmann
+
+// #include <nlohmann/byte_container_with_subtype.hpp>
+
+
+#include <cstdint> // uint8_t
+#include <tuple> // tie
+#include <utility> // move
+
+namespace nlohmann
+{
+
+/*!
+@brief binary container augmented with an optional subtype tag
+
+This type derives from the template parameter @a BinaryType provided to
+`basic_json` and adds a one-byte subtype together with a flag recording
+whether that subtype is set. The subtype is used by BSON and MessagePack.
+Thanks to this wrapper the user does not have to supply a binary type with a
+specific naming scheme of their own.
+
+@tparam BinaryType container to store bytes (`std::vector<std::uint8_t>` by
+                   default)
+
+@since version 3.8.0
+*/
+template<typename BinaryType>
+class byte_container_with_subtype : public BinaryType
+{
+  public:
+    /// the type of the underlying container
+    using container_type = BinaryType;
+
+    /// default-constructed container, no subtype
+    byte_container_with_subtype() noexcept(noexcept(container_type()))
+        : container_type()
+    {}
+
+    /// copy of @a b, no subtype
+    byte_container_with_subtype(const container_type& b) noexcept(noexcept(container_type(b)))
+        : container_type(b)
+    {}
+
+    /// takes over @a b, no subtype
+    byte_container_with_subtype(container_type&& b) noexcept(noexcept(container_type(std::move(b))))
+        : container_type(std::move(b))
+    {}
+
+    /// copy of @a b, tagged with @a subtype
+    byte_container_with_subtype(const container_type& b, std::uint8_t subtype) noexcept(noexcept(container_type(b)))
+        : container_type(b)
+        , m_subtype(subtype)
+        , m_has_subtype(true)
+    {}
+
+    /// takes over @a b, tagged with @a subtype
+    byte_container_with_subtype(container_type&& b, std::uint8_t subtype) noexcept(noexcept(container_type(std::move(b))))
+        : container_type(std::move(b))
+        , m_subtype(subtype)
+        , m_has_subtype(true)
+    {}
+
+    /// equal when the bytes, the subtype and the has-subtype flag all match
+    /// (compared in that order, stopping at the first mismatch)
+    bool operator==(const byte_container_with_subtype& rhs) const
+    {
+        const BinaryType& lhs_bytes = *this;
+        const BinaryType& rhs_bytes = rhs;
+        return lhs_bytes == rhs_bytes
+               && m_subtype == rhs.m_subtype
+               && m_has_subtype == rhs.m_has_subtype;
+    }
+
+    /// negation of operator==
+    bool operator!=(const byte_container_with_subtype& rhs) const
+    {
+        const bool same = (rhs == *this);
+        return !same;
+    }
+
+    /*!
+    @brief sets the binary subtype
+
+    Stores @a subtype and flags the value as having a subtype, which has
+    implications for serialization.
+
+    @complexity Constant.
+
+    @exceptionsafety No-throw guarantee: this member function never throws
+    exceptions.
+
+    @sa @ref subtype() -- return the binary subtype
+    @sa @ref clear_subtype() -- clears the binary subtype
+    @sa @ref has_subtype() -- returns whether or not the binary value has a
+    subtype
+
+    @since version 3.8.0
+    */
+    void set_subtype(std::uint8_t subtype) noexcept
+    {
+        m_subtype = subtype;
+        m_has_subtype = true;
+    }
+
+    /*!
+    @brief return the binary subtype
+
+    Returns the stored subtype byte. When no subtype is set (freshly
+    constructed without one, or after clear_subtype()), the stored byte is 0,
+    which is also a valid subtype; use has_subtype() to tell the two apart.
+
+    @return the numerical subtype of the binary value
+
+    @complexity Constant.
+
+    @exceptionsafety No-throw guarantee: this member function never throws
+    exceptions.
+
+    @sa @ref set_subtype() -- sets the binary subtype
+    @sa @ref clear_subtype() -- clears the binary subtype
+    @sa @ref has_subtype() -- returns whether or not the binary value has a
+    subtype
+
+    @since version 3.8.0
+    */
+    constexpr std::uint8_t subtype() const noexcept
+    {
+        return m_subtype;
+    }
+
+    /*!
+    @brief return whether the value has a subtype
+
+    @return whether the value has a subtype
+
+    @complexity Constant.
+
+    @exceptionsafety No-throw guarantee: this member function never throws
+    exceptions.
+
+    @sa @ref subtype() -- return the binary subtype
+    @sa @ref set_subtype() -- sets the binary subtype
+    @sa @ref clear_subtype() -- clears the binary subtype
+
+    @since version 3.8.0
+    */
+    constexpr bool has_subtype() const noexcept
+    {
+        return m_has_subtype;
+    }
+
+    /*!
+    @brief clears the binary subtype
+
+    Resets the subtype byte to 0 and flags the value as not having a subtype,
+    which has implications for serialization; for instance MessagePack will
+    prefer the bin family over the ext family.
+
+    @complexity Constant.
+
+    @exceptionsafety No-throw guarantee: this member function never throws
+    exceptions.
+
+    @sa @ref subtype() -- return the binary subtype
+    @sa @ref set_subtype() -- sets the binary subtype
+    @sa @ref has_subtype() -- returns whether or not the binary value has a
+    subtype
+
+    @since version 3.8.0
+    */
+    void clear_subtype() noexcept
+    {
+        m_subtype = 0;
+        m_has_subtype = false;
+    }
+
+  private:
+    /// subtype byte; meaningful only while m_has_subtype is true
+    std::uint8_t m_subtype = 0;
+    /// whether a subtype has been set
+    bool m_has_subtype = false;
+};
+
+}  // namespace nlohmann
+
+// #include <nlohmann/detail/conversions/from_json.hpp>
+
+// #include <nlohmann/detail/conversions/to_json.hpp>
+
+// #include <nlohmann/detail/exceptions.hpp>
+
+// #include <nlohmann/detail/hash.hpp>
+
+
+#include <cstddef> // size_t, uint8_t
+#include <functional> // hash
+
+namespace nlohmann
+{
+namespace detail
+{
+
+// boost::hash_combine
+// Mixes the hash value h into seed and returns the mixed result; the
+// arguments are taken by value, so the caller's variables are not modified.
+inline std::size_t combine(std::size_t seed, std::size_t h) noexcept
+{
+    const std::size_t mixed = h + 0x9e3779b9 + (seed << 6U) + (seed >> 2U);
+    return seed ^ mixed;
+}
+
+/*!
+@brief hash a JSON value
+
+The hash function tries to rely on std::hash where possible. Furthermore, the
+type of the JSON value is taken into account to have different hash values for
+null, 0, 0U, and false, etc.
+
+Objects and arrays are hashed recursively: the seed starts from the type and
+the element count and is then combined with the hash of every key/element in
+iteration order. Binary values fold in their size, subtype flag, subtype and
+every byte.
+
+@tparam BasicJsonType basic_json specialization
+@param j JSON value to hash
+@return hash value of j; 0 if the type of j matches none of the handled
+        enumerators (not expected to happen, guarded by JSON_ASSERT)
+*/
+template<typename BasicJsonType>
+std::size_t hash(const BasicJsonType& j)
+{
+    using string_t = typename BasicJsonType::string_t;
+    using number_integer_t = typename BasicJsonType::number_integer_t;
+    using number_unsigned_t = typename BasicJsonType::number_unsigned_t;
+    using number_float_t = typename BasicJsonType::number_float_t;
+
+    // the numeric value of the type enumerator seeds every hash below
+    const auto type = static_cast<std::size_t>(j.type());
+    switch (j.type())
+    {
+        case BasicJsonType::value_t::null:
+        case BasicJsonType::value_t::discarded:
+        {
+            return combine(type, 0);
+        }
+
+        case BasicJsonType::value_t::object:
+        {
+            auto seed = combine(type, j.size());
+            for (const auto& element : j.items())
+            {
+                const auto h = std::hash<string_t> {}(element.key());
+                seed = combine(seed, h);
+                seed = combine(seed, hash(element.value()));
+            }
+            return seed;
+        }
+
+        case BasicJsonType::value_t::array:
+        {
+            auto seed = combine(type, j.size());
+            for (const auto& element : j)
+            {
+                seed = combine(seed, hash(element));
+            }
+            return seed;
+        }
+
+        case BasicJsonType::value_t::string:
+        {
+            const auto h = std::hash<string_t> {}(j.template get_ref<const string_t&>());
+            return combine(type, h);
+        }
+
+        case BasicJsonType::value_t::boolean:
+        {
+            const auto h = std::hash<bool> {}(j.template get<bool>());
+            return combine(type, h);
+        }
+
+        case BasicJsonType::value_t::number_integer:
+        {
+            const auto h = std::hash<number_integer_t> {}(j.template get<number_integer_t>());
+            return combine(type, h);
+        }
+
+        case nlohmann::detail::value_t::number_unsigned:
+        {
+            const auto h = std::hash<number_unsigned_t> {}(j.template get<number_unsigned_t>());
+            return combine(type, h);
+        }
+
+        case nlohmann::detail::value_t::number_float:
+        {
+            const auto h = std::hash<number_float_t> {}(j.template get<number_float_t>());
+            return combine(type, h);
+        }
+
+        case nlohmann::detail::value_t::binary:
+        {
+            auto seed = combine(type, j.get_binary().size());
+            const auto h = std::hash<bool> {}(j.get_binary().has_subtype());
+            seed = combine(seed, h);
+            seed = combine(seed, j.get_binary().subtype());
+            for (const auto byte : j.get_binary())
+            {
+                seed = combine(seed, std::hash<std::uint8_t> {}(byte));
+            }
+            return seed;
+        }
+
+        default: // LCOV_EXCL_LINE
+            JSON_ASSERT(false); // LCOV_EXCL_LINE
+            // Without this return, control would flow off the end of a
+            // non-void function whenever JSON_ASSERT expands to nothing,
+            // which is undefined behavior.
+            return 0; // LCOV_EXCL_LINE
+    }
+}
+
+}  // namespace detail
+}  // namespace nlohmann
+
+// #include <nlohmann/detail/input/binary_reader.hpp>
+
+
+#include <algorithm> // generate_n
+#include <array> // array
+#include <cmath> // ldexp
+#include <cstddef> // size_t
+#include <cstdint> // uint8_t, uint16_t, uint32_t, uint64_t
+#include <cstdio> // snprintf
+#include <cstring> // memcpy
+#include <iterator> // back_inserter
+#include <limits> // numeric_limits
+#include <string> // char_traits, string
+#include <utility> // make_pair, move
+
+// #include <nlohmann/detail/exceptions.hpp>
+
+// #include <nlohmann/detail/input/input_adapters.hpp>
+
+
+#include <array> // array
+#include <cstddef> // size_t
+#include <cstdio> //FILE *
+#include <cstring> // strlen
+#include <istream> // istream
+#include <iterator> // begin, end, iterator_traits, random_access_iterator_tag, distance, next
+#include <memory> // shared_ptr, make_shared, addressof
+#include <numeric> // accumulate
+#include <string> // string, char_traits
+#include <type_traits> // enable_if, is_base_of, is_pointer, is_integral, remove_pointer
+#include <utility> // pair, declval
+
+// #include <nlohmann/detail/iterators/iterator_traits.hpp>
+
+// #include <nlohmann/detail/macro_scope.hpp>
+
+
+namespace nlohmann
+{
+namespace detail
+{
+/// the supported input formats
+/// (one enumerator per format: JSON text, CBOR, MessagePack, UBJSON, BSON)
+enum class input_format_t { json, cbor, msgpack, ubjson, bson };
+
+////////////////////
+// input adapters //
+////////////////////
+
+/*!
+Input adapter for stdio file access. This adapter reads only one byte at a time
+and does not use any buffer. It is a very low-level adapter.
+*/
+class file_input_adapter
+{
+  public:
+    using char_type = char;
+
+    /// wrap @a f; the pointer is stored as-is and not closed by this class
+    JSON_HEDLEY_NON_NULL(2)
+    explicit file_input_adapter(std::FILE* f) noexcept
+        : m_file(f)
+    {}
+
+    // make class move-only
+    // (move construction is allowed; copying and both assignments are deleted)
+    file_input_adapter(const file_input_adapter&) = delete;
+    file_input_adapter(file_input_adapter&&) = default;
+    file_input_adapter& operator=(const file_input_adapter&) = delete;
+    file_input_adapter& operator=(file_input_adapter&&) = delete;
+
+    /// read the next character with std::fgetc and return its result unchanged
+    std::char_traits<char>::int_type get_character() noexcept
+    {
+        return std::fgetc(m_file);
+    }
+
+  private:
+    /// the file pointer to read from
+    std::FILE* m_file;
+};
+
+
+/*!
+Input adapter for a (caching) istream. Ignores a UTF Byte Order Mark at
+beginning of input. Does not support changing the underlying std::streambuf
+in mid-input. Maintains underlying std::istream and std::streambuf to support
+subsequent use of standard std::istream operations to process any input
+characters following those used in parsing the JSON input.  Clears the
+std::istream flags; any input errors (e.g., EOF) will be detected by the first
+subsequent call for input from the std::istream.
+*/
+class input_stream_adapter
+{
+  public:
+    using char_type = char;
+
+    /// on destruction, reset the stream state so that only eofbit survives
+    ~input_stream_adapter()
+    {
+        // a moved-from adapter no longer refers to a stream
+        if (is == nullptr)
+        {
+            return;
+        }
+
+        // reading went through the streambuf directly, so no flag other
+        // than eof is carried over to the istream
+        is->clear(is->rdstate() & std::ios::eofbit);
+    }
+
+    /// attach to stream @a i and to the stream buffer it currently uses
+    explicit input_stream_adapter(std::istream& i)
+        : is(&i), sb(i.rdbuf())
+    {}
+
+    // not copyable or assignable because of the pointer members
+    input_stream_adapter(const input_stream_adapter&) = delete;
+    input_stream_adapter& operator=(input_stream_adapter&) = delete;
+    input_stream_adapter& operator=(input_stream_adapter&& rhs) = delete;
+
+    /// move construction: take over both pointers and null them in @a rhs
+    input_stream_adapter(input_stream_adapter&& rhs) noexcept : is(rhs.is), sb(rhs.sb)
+    {
+        rhs.is = nullptr;
+        rhs.sb = nullptr;
+    }
+
+    // std::istream/std::streambuf use std::char_traits<char>::to_int_type, to
+    // ensure that std::char_traits<char>::eof() and the character 0xFF do not
+    // end up as the same value, eg. 0xFFFFFFFF.
+    std::char_traits<char>::int_type get_character()
+    {
+        const auto res = sb->sbumpc();
+        // the istream interface is bypassed, so eofbit has to be set by hand
+        if (JSON_HEDLEY_UNLIKELY(res == std::char_traits<char>::eof()))
+        {
+            is->clear(is->rdstate() | std::ios::eofbit);
+        }
+        return res;
+    }
+
+  private:
+    /// the associated input stream
+    std::istream* is = nullptr;
+    /// the stream buffer characters are read from
+    std::streambuf* sb = nullptr;
+};
+
+// General-purpose iterator-based adapter. It might not be as fast as
+// theoretically possible for some containers, but it is extremely versatile.
+// Characters are read one at a time from the range [first, last).
+template<typename IteratorType>
+class iterator_input_adapter
+{
+  public:
+    using char_type = typename std::iterator_traits<IteratorType>::value_type;
+
+    /// adapt the range [first, last)
+    iterator_input_adapter(IteratorType first, IteratorType last)
+        : current(std::move(first)), end(std::move(last)) {}
+
+    /// return the next character as int_type and advance, or eof() once the
+    /// range is exhausted
+    typename std::char_traits<char_type>::int_type get_character()
+    {
+        using traits = std::char_traits<char_type>;
+
+        if (JSON_HEDLEY_LIKELY(current != end))
+        {
+            const auto result = traits::to_int_type(*current);
+            ++current;
+            return result;
+        }
+
+        return traits::eof();
+    }
+
+  private:
+    /// position of the next character to hand out
+    IteratorType current;
+    /// one past the last character
+    IteratorType end;
+
+    // wide_string_input_helper needs access to empty()
+    template<typename BaseInputAdapter, size_t T>
+    friend struct wide_string_input_helper;
+
+    /// whether the range is exhausted
+    bool empty() const
+    {
+        return current == end;
+    }
+
+};
+
+
+/// primary template, declared only; specialized on the second (size_t) parameter
+template<typename BaseInputAdapter, size_t T>
+struct wide_string_input_helper;
+// UTF-32 specialization: converts one 4-byte code unit per call into UTF-8 bytes.
+template<typename BaseInputAdapter>
+struct wide_string_input_helper<BaseInputAdapter, 4>
+{
+    // UTF-32: reads one unit from input; outputs the bytes, the read index (always 0) and the byte count
+    static void fill_buffer(BaseInputAdapter& input,
+                            std::array<std::char_traits<char>::int_type, 4>& utf8_bytes,
+                            size_t& utf8_bytes_index,
+                            size_t& utf8_bytes_filled)
+    {
+        utf8_bytes_index = 0;
+        // an exhausted input is reported as a single EOF entry in the buffer
+        if (JSON_HEDLEY_UNLIKELY(input.empty()))
+        {
+            utf8_bytes[0] = std::char_traits<char>::eof();
+            utf8_bytes_filled = 1;
+        }
+        else
+        {
+            // get the current character
+            const auto wc = input.get_character();
+
+            // UTF-32 to UTF-8 encoding
+            if (wc < 0x80) // below 0x80: stored unchanged as a single byte
+            {
+                utf8_bytes[0] = static_cast<std::char_traits<char>::int_type>(wc);
+                utf8_bytes_filled = 1;
+            }
+            else if (wc <= 0x7FF) // two bytes: 110xxxxx 10xxxxxx
+            {
+                utf8_bytes[0] = static_cast<std::char_traits<char>::int_type>(0xC0u | ((static_cast<unsigned int>(wc) >> 6u) & 0x1Fu));
+                utf8_bytes[1] = static_cast<std::char_traits<char>::int_type>(0x80u | (static_cast<unsigned int>(wc) & 0x3Fu));
+                utf8_bytes_filled = 2;
+            }
+            else if (wc <= 0xFFFF) // three bytes: 1110xxxx 10xxxxxx 10xxxxxx
+            {
+                utf8_bytes[0] = static_cast<std::char_traits<char>::int_type>(0xE0u | ((static_cast<unsigned int>(wc) >> 12u) & 0x0Fu));
+                utf8_bytes[1] = static_cast<std::char_traits<char>::int_type>(0x80u | ((static_cast<unsigned int>(wc) >> 6u) & 0x3Fu));
+                utf8_bytes[2] = static_cast<std::char_traits<char>::int_type>(0x80u | (static_cast<unsigned int>(wc) & 0x3Fu));
+                utf8_bytes_filled = 3;
+            }
+            else if (wc <= 0x10FFFF) // four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+            {
+                utf8_bytes[0] = static_cast<std::char_traits<char>::int_type>(0xF0u | ((static_cast<unsigned int>(wc) >> 18u) & 0x07u));
+                utf8_bytes[1] = static_cast<std::char_traits<char>::int_type>(0x80u | ((static_cast<unsigned int>(wc) >> 12u) & 0x3Fu));
+                utf8_bytes[2] = static_cast<std::char_traits<char>::int_type>(0x80u | ((static_cast<unsigned int>(wc) >> 6u) & 0x3Fu));
+                utf8_bytes[3] = static_cast<std::char_traits<char>::int_type>(0x80u | (static_cast<unsigned int>(wc) & 0x3Fu));
+                utf8_bytes_filled = 4;
+            }
+            else
+            {
+                // unknown character (above 0x10FFFF): stored unconverted as a single entry
+                utf8_bytes[0] = static_cast<std::char_traits<char>::int_type>(wc);
+                utf8_bytes_filled = 1;
+            }
+        }
+    }
+};
+// UTF-16 specialization: converts one code unit, or a two-unit surrogate pair, per call into UTF-8 bytes.
+template<typename BaseInputAdapter>
+struct wide_string_input_helper<BaseInputAdapter, 2>
+{
+    // UTF-16: reads from input; outputs the bytes, the read index (always 0) and the byte count
+    static void fill_buffer(BaseInputAdapter& input,
+                            std::array<std::char_traits<char>::int_type, 4>& utf8_bytes,
+                            size_t& utf8_bytes_index,
+                            size_t& utf8_bytes_filled)
+    {
+        utf8_bytes_index = 0;
+        // an exhausted input is reported as a single EOF entry in the buffer
+        if (JSON_HEDLEY_UNLIKELY(input.empty()))
+        {
+            utf8_bytes[0] = std::char_traits<char>::eof();
+            utf8_bytes_filled = 1;
+        }
+        else
+        {
+            // get the current character
+            const auto wc = input.get_character();
+
+            // UTF-16 to UTF-8 encoding
+            if (wc < 0x80) // below 0x80: stored unchanged as a single byte
+            {
+                utf8_bytes[0] = static_cast<std::char_traits<char>::int_type>(wc);
+                utf8_bytes_filled = 1;
+            }
+            else if (wc <= 0x7FF) // two bytes: 110xxxxx 10xxxxxx
+            {
+                utf8_bytes[0] = static_cast<std::char_traits<char>::int_type>(0xC0u | ((static_cast<unsigned int>(wc) >> 6u)));
+                utf8_bytes[1] = static_cast<std::char_traits<char>::int_type>(0x80u | (static_cast<unsigned int>(wc) & 0x3Fu));
+                utf8_bytes_filled = 2;
+            }
+            else if (0xD800 > wc || wc >= 0xE000) // not a surrogate: three bytes from this single unit
+            {
+                utf8_bytes[0] = static_cast<std::char_traits<char>::int_type>(0xE0u | ((static_cast<unsigned int>(wc) >> 12u)));
+                utf8_bytes[1] = static_cast<std::char_traits<char>::int_type>(0x80u | ((static_cast<unsigned int>(wc) >> 6u) & 0x3Fu));
+                utf8_bytes[2] = static_cast<std::char_traits<char>::int_type>(0x80u | (static_cast<unsigned int>(wc) & 0x3Fu));
+                utf8_bytes_filled = 3;
+            }
+            else // wc lies in the surrogate range 0xD800..0xDFFF
+            {
+                if (JSON_HEDLEY_LIKELY(!input.empty())) // a second unit follows in well-formed input
+                {
+                    const auto wc2 = static_cast<unsigned int>(input.get_character()); // not validated as a low surrogate
+                    const auto charcode = 0x10000u + (((static_cast<unsigned int>(wc) & 0x3FFu) << 10u) | (wc2 & 0x3FFu));
+                    utf8_bytes[0] = static_cast<std::char_traits<char>::int_type>(0xF0u | (charcode >> 18u));
+                    utf8_bytes[1] = static_cast<std::char_traits<char>::int_type>(0x80u | ((charcode >> 12u) & 0x3Fu));
+                    utf8_bytes[2] = static_cast<std::char_traits<char>::int_type>(0x80u | ((charcode >> 6u) & 0x3Fu));
+                    utf8_bytes[3] = static_cast<std::char_traits<char>::int_type>(0x80u | (charcode & 0x3Fu));
+                    utf8_bytes_filled = 4;
+                }
+                else // input ended after the first unit: store that unit unconverted
+                {
+                    utf8_bytes[0] = static_cast<std::char_traits<char>::int_type>(wc);
+                    utf8_bytes_filled = 1;
+                }
+            }
+        }
+    }
+};
+
+// Wraps another input adapter to convert wide character types into individual bytes.
+template<typename BaseInputAdapter, typename WideCharType>
+class wide_string_input_adapter
+{
+  public:
+    using char_type = char;
+    // base is a by-value sink parameter, so it is moved (not copied a second time) into the member
+    wide_string_input_adapter(BaseInputAdapter base)
+        : base_adapter(std::move(base)) {}
+    /// return the next buffered entry (a UTF-8 byte or EOF), refilling the buffer first when it is used up
+    typename std::char_traits<char>::int_type get_character() noexcept
+    {
+        // check if buffer needs to be filled
+        if (utf8_bytes_index == utf8_bytes_filled)
+        {
+            fill_buffer<sizeof(WideCharType)>();
+
+            JSON_ASSERT(utf8_bytes_filled > 0);
+            JSON_ASSERT(utf8_bytes_index == 0);
+        }
+
+        // use buffer
+        JSON_ASSERT(utf8_bytes_filled > 0);
+        JSON_ASSERT(utf8_bytes_index < utf8_bytes_filled);
+        return utf8_bytes[utf8_bytes_index++];
+    }
+
+  private:
+    BaseInputAdapter base_adapter;
+    /// dispatch to the wide_string_input_helper specialization for character size T
+    template<size_t T>
+    void fill_buffer()
+    {
+        wide_string_input_helper<BaseInputAdapter, T>::fill_buffer(base_adapter, utf8_bytes, utf8_bytes_index, utf8_bytes_filled);
+    }
+
+    /// a buffer for UTF-8 bytes
+    std::array<std::char_traits<char>::int_type, 4> utf8_bytes = {{0, 0, 0, 0}};
+
+    /// index to the utf8_bytes array for the next valid byte
+    std::size_t utf8_bytes_index = 0;
+    /// number of valid bytes in the utf8_bytes array
+    std::size_t utf8_bytes_filled = 0;
+};
+
+// Default factory, used unless a specialization applies: wraps the range without character conversion.
+template<typename IteratorType, typename Enable = void>
+struct iterator_input_adapter_factory
+{
+    using iterator_type = IteratorType;
+    using char_type = typename std::iterator_traits<IteratorType>::value_type;
+    using adapter_type = iterator_input_adapter<IteratorType>;
+
+    static adapter_type create(IteratorType first, IteratorType last)
+    {
+        return {std::move(first), std::move(last)};
+    }
+};
+// Trait: `value` is nonzero when the iterator's value_type is wider than one byte.
+template<typename T>
+struct is_iterator_of_multibyte
+{
+    using value_type = typename std::iterator_traits<T>::value_type;
+    enum
+    {
+        value = sizeof(value_type) > 1
+    };
+};
+// Factory for multibyte value types: the iterator adapter is wrapped in a wide_string_input_adapter.
+template<typename IteratorType>
+struct iterator_input_adapter_factory<IteratorType, enable_if_t<is_iterator_of_multibyte<IteratorType>::value>>
+{
+    using iterator_type = IteratorType;
+    using char_type = typename std::iterator_traits<IteratorType>::value_type;
+    using base_adapter_type = iterator_input_adapter<IteratorType>;
+    using adapter_type = wide_string_input_adapter<base_adapter_type, char_type>;
+    static adapter_type create(IteratorType first, IteratorType last)
+    {
+        base_adapter_type wide_source(std::move(first), std::move(last));
+        return adapter_type(std::move(wide_source));
+    }
+};
+
+// Build an input adapter for the iterator range [first, last).
+template<typename IteratorType>
+typename iterator_input_adapter_factory<IteratorType>::adapter_type input_adapter(IteratorType first, IteratorType last)
+{
+    // the factory selects the adapter type that matches the iterator's value type
+    return iterator_input_adapter_factory<IteratorType>::create(first, last);
+}
+
+// Convenience overload: adapts a whole container through its begin/end iterators
+template<typename ContainerType>
+auto input_adapter(const ContainerType& container) -> decltype(input_adapter(begin(container), end(container)))
+{
+    using std::begin; // make the std versions visible ...
+    using std::end;   // ... while the unqualified calls below still allow ADL
+    const auto first = begin(container);
+    const auto last = end(container);
+    return input_adapter(first, last);
+}
+
+// Special cases with fast paths: C FILE handles and std::istream get dedicated adapters
+inline file_input_adapter input_adapter(std::FILE* file)
+{
+    return file_input_adapter(file); // dedicated adapter for C FILE handles
+}
+// std::istream input: the adapter is constructed from a reference to the caller's stream
+inline input_stream_adapter input_adapter(std::istream& stream)
+{
+    return input_stream_adapter(stream);
+}
+// rvalue overload so that temporary streams are accepted; inside, `stream` is a named lvalue
+inline input_stream_adapter input_adapter(std::istream&& stream)
+{
+    return input_stream_adapter(stream); // NOTE(review): adapter presumably must not outlive the temporary - confirm
+}
+// the adapter type that input_adapter() yields for a pair of const char* iterators
+using contiguous_bytes_input_adapter = decltype(input_adapter(std::declval<const char*>(), std::declval<const char*>()));
+
+// Null-terminated byte strings: the input ends right before the first '\0'.
+template < typename CharT,
+           typename std::enable_if <
+               std::is_pointer<CharT>::value&&
+               !std::is_array<CharT>::value&&
+               std::is_integral<typename std::remove_pointer<CharT>::type>::value&&
+               sizeof(typename std::remove_pointer<CharT>::type) == 1,
+               int >::type = 0 >
+contiguous_bytes_input_adapter input_adapter(CharT b)
+{
+    const char* const first = reinterpret_cast<const char*>(b);
+    const char* const last = first + std::strlen(first);
+    return input_adapter(first, last);
+}
+// Built-in arrays: adapts all N elements, i.e. the range [array, array + N)
+template<typename T, std::size_t N>
+auto input_adapter(T (&array)[N]) -> decltype(input_adapter(array, array + N))
+{
+    return input_adapter(array, array + N);
+}
+
+// This class only handles inputs that yield a contiguous_bytes_input_adapter.
+// It's required so that expressions like {ptr, len} can be implicitly converted
+// to the correct adapter.
+class span_input_adapter
+{
+  public:
+    template < typename CharT,
+               typename std::enable_if <
+                   std::is_pointer<CharT>::value&&
+                   std::is_integral<typename std::remove_pointer<CharT>::type>::value&&
+                   sizeof(typename std::remove_pointer<CharT>::type) == 1,
+                   int >::type = 0 >
+    span_input_adapter(CharT b, std::size_t l) // pointer to 1-byte integral data plus length: bytes [b, b + l)
+        : ia(reinterpret_cast<const char*>(b), reinterpret_cast<const char*>(b) + l) {}
+    // iterator-pair form; enabled only when the iterator category is random_access_iterator_tag
+    template<class IteratorType,
+             typename std::enable_if<
+                 std::is_same<typename iterator_traits<IteratorType>::iterator_category, std::random_access_iterator_tag>::value,
+                 int>::type = 0>
+    span_input_adapter(IteratorType first, IteratorType last)
+        : ia(input_adapter(first, last)) {}
+    // returns an rvalue reference to the stored adapter so that the caller can move from it
+    contiguous_bytes_input_adapter&& get()
+    {
+        return std::move(ia);
+    }
+
+  private:
+    contiguous_bytes_input_adapter ia;
+};
+}  // namespace detail
+}  // namespace nlohmann
+
+// #include <nlohmann/detail/input/json_sax.hpp>
+
+
+#include <cstddef>
+#include <string> // string
+#include <utility> // move
+#include <vector> // vector
+
+// #include <nlohmann/detail/exceptions.hpp>
+
+// #include <nlohmann/detail/macro_scope.hpp>
+
+
+namespace nlohmann
+{
+
+/*!
+@brief SAX interface
+
+This class describes the SAX interface used by @ref nlohmann::json::sax_parse.
+Each function is called in different situations while the input is parsed. The
+boolean return value informs the parser whether to continue processing the
+input.
+*/
+template<typename BasicJsonType>
+struct json_sax
+{
+    using number_integer_t = typename BasicJsonType::number_integer_t;
+    using number_unsigned_t = typename BasicJsonType::number_unsigned_t;
+    using number_float_t = typename BasicJsonType::number_float_t;
+    using string_t = typename BasicJsonType::string_t;
+    using binary_t = typename BasicJsonType::binary_t;
+
+    /*!
+    @brief a null value was read
+    @return whether parsing should proceed
+    */
+    virtual bool null() = 0;
+
+    /*!
+    @brief a boolean value was read
+    @param[in] val  boolean value
+    @return whether parsing should proceed
+    */
+    virtual bool boolean(bool val) = 0;
+
+    /*!
+    @brief an integer number was read
+    @param[in] val  integer value
+    @return whether parsing should proceed
+    */
+    virtual bool number_integer(number_integer_t val) = 0;
+
+    /*!
+    @brief an unsigned integer number was read
+    @param[in] val  unsigned integer value
+    @return whether parsing should proceed
+    */
+    virtual bool number_unsigned(number_unsigned_t val) = 0;
+
+    /*!
+    @brief a floating-point number was read
+    @param[in] val  floating-point value
+    @param[in] s    raw token value
+    @return whether parsing should proceed
+    */
+    virtual bool number_float(number_float_t val, const string_t& s) = 0;
+
+    /*!
+    @brief a string was read
+    @param[in] val  string value
+    @return whether parsing should proceed
+    @note It is safe to move the passed string.
+    */
+    virtual bool string(string_t& val) = 0;
+
+    /*!
+    @brief a binary string was read
+    @param[in] val  binary value
+    @return whether parsing should proceed
+    @note It is safe to move the passed binary.
+    */
+    virtual bool binary(binary_t& val) = 0;
+
+    /*!
+    @brief the beginning of an object was read
+    @param[in] elements  number of object elements or std::size_t(-1) if unknown
+    @return whether parsing should proceed
+    @note binary formats may report the number of elements
+    */
+    virtual bool start_object(std::size_t elements) = 0;
+
+    /*!
+    @brief an object key was read
+    @param[in] val  object key
+    @return whether parsing should proceed
+    @note It is safe to move the passed string.
+    */
+    virtual bool key(string_t& val) = 0;
+
+    /*!
+    @brief the end of an object was read
+    @return whether parsing should proceed
+    */
+    virtual bool end_object() = 0;
+
+    /*!
+    @brief the beginning of an array was read
+    @param[in] elements  number of array elements or std::size_t(-1) if unknown
+    @return whether parsing should proceed
+    @note binary formats may report the number of elements
+    */
+    virtual bool start_array(std::size_t elements) = 0;
+
+    /*!
+    @brief the end of an array was read
+    @return whether parsing should proceed
+    */
+    virtual bool end_array() = 0;
+
+    /*!
+    @brief a parse error occurred
+    @param[in] position    the position in the input where the error occurs
+    @param[in] last_token  the last read token
+    @param[in] ex          an exception object describing the error
+    @return whether parsing should proceed (must return false)
+    */
+    virtual bool parse_error(std::size_t position,
+                             const std::string& last_token,
+                             const detail::exception& ex) = 0;
+    /// virtual destructor so that implementations can be destroyed through a json_sax pointer
+    virtual ~json_sax() = default;
+};
+
+
+namespace detail
+{
+/*!
+@brief SAX implementation to create a JSON value from SAX events
+
+This class implements the @ref json_sax interface and processes the SAX events
+to create a JSON value which makes it basically a DOM parser. The structure or
+hierarchy of the JSON value is managed by the stack `ref_stack` which contains
+a pointer to the respective array or object for each recursion depth.
+
+After successful parsing, the value that is passed by reference to the
+constructor contains the parsed value.
+
+@tparam BasicJsonType  the JSON type
+*/
+template<typename BasicJsonType>
+class json_sax_dom_parser
+{
+  public:
+    using number_integer_t = typename BasicJsonType::number_integer_t;
+    using number_unsigned_t = typename BasicJsonType::number_unsigned_t;
+    using number_float_t = typename BasicJsonType::number_float_t;
+    using string_t = typename BasicJsonType::string_t;
+    using binary_t = typename BasicJsonType::binary_t;
+
+    /*!
+    @param[in, out] r  reference to a JSON value that is manipulated while
+                       parsing
+    @param[in] allow_exceptions_  whether parse errors yield exceptions
+    */
+    explicit json_sax_dom_parser(BasicJsonType& r, const bool allow_exceptions_ = true)
+        : root(r), allow_exceptions(allow_exceptions_)
+    {}
+
+    // make class move-only
+    json_sax_dom_parser(const json_sax_dom_parser&) = delete;
+    json_sax_dom_parser(json_sax_dom_parser&&) = default;
+    json_sax_dom_parser& operator=(const json_sax_dom_parser&) = delete;
+    json_sax_dom_parser& operator=(json_sax_dom_parser&&) = default;
+    ~json_sax_dom_parser() = default;
+    // scalar events: the value is stored via handle_value(); these handlers always return true
+    bool null()
+    {
+        handle_value(nullptr);
+        return true;
+    }
+
+    bool boolean(bool val)
+    {
+        handle_value(val);
+        return true;
+    }
+
+    bool number_integer(number_integer_t val)
+    {
+        handle_value(val);
+        return true;
+    }
+
+    bool number_unsigned(number_unsigned_t val)
+    {
+        handle_value(val);
+        return true;
+    }
+
+    bool number_float(number_float_t val, const string_t& /*unused*/)
+    {
+        handle_value(val);
+        return true;
+    }
+    // val is passed on as an lvalue (no std::move), unlike binary() below
+    bool string(string_t& val)
+    {
+        handle_value(val);
+        return true;
+    }
+
+    bool binary(binary_t& val)
+    {
+        handle_value(std::move(val));
+        return true;
+    }
+    // opens a new object and pushes it on ref_stack; len == std::size_t(-1) skips the size check
+    bool start_object(std::size_t len)
+    {
+        ref_stack.push_back(handle_value(BasicJsonType::value_t::object));
+
+        if (JSON_HEDLEY_UNLIKELY(len != std::size_t(-1) && len > ref_stack.back()->max_size()))
+        {
+            JSON_THROW(out_of_range::create(408,
+                                            "excessive object size: " + std::to_string(len)));
+        }
+
+        return true;
+    }
+
+    bool key(string_t& val)
+    {
+        // add null at given key and store the reference for later
+        object_element = &(ref_stack.back()->m_value.object->operator[](val));
+        return true;
+    }
+    // closing a container only removes it from ref_stack
+    bool end_object()
+    {
+        ref_stack.pop_back();
+        return true;
+    }
+    // opens a new array and pushes it on ref_stack; len == std::size_t(-1) skips the size check
+    bool start_array(std::size_t len)
+    {
+        ref_stack.push_back(handle_value(BasicJsonType::value_t::array));
+
+        if (JSON_HEDLEY_UNLIKELY(len != std::size_t(-1) && len > ref_stack.back()->max_size()))
+        {
+            JSON_THROW(out_of_range::create(408,
+                                            "excessive array size: " + std::to_string(len)));
+        }
+
+        return true;
+    }
+
+    bool end_array()
+    {
+        ref_stack.pop_back();
+        return true;
+    }
+    // records the error; ex is passed to JSON_THROW only when allow_exceptions is set; returns false
+    template<class Exception>
+    bool parse_error(std::size_t /*unused*/, const std::string& /*unused*/,
+                     const Exception& ex)
+    {
+        errored = true;
+        static_cast<void>(ex); // presumably silences an unused warning when JSON_THROW does not use ex
+        if (allow_exceptions)
+        {
+            JSON_THROW(ex);
+        }
+        return false;
+    }
+    // whether parse_error() has been called
+    constexpr bool is_errored() const
+    {
+        return errored;
+    }
+
+  private:
+    /*!
+    @invariant If the ref stack is empty, then the passed value will be the new
+               root.
+    @invariant If the ref stack contains a value, then it is an array or an
+               object to which we can add elements
+    */
+    template<typename Value>
+    JSON_HEDLEY_RETURNS_NON_NULL
+    BasicJsonType* handle_value(Value&& v)
+    {
+        if (ref_stack.empty())
+        {
+            root = BasicJsonType(std::forward<Value>(v));
+            return &root;
+        }
+        // otherwise the value belongs to the innermost open container
+        JSON_ASSERT(ref_stack.back()->is_array() || ref_stack.back()->is_object());
+
+        if (ref_stack.back()->is_array())
+        {
+            ref_stack.back()->m_value.array->emplace_back(std::forward<Value>(v));
+            return &(ref_stack.back()->m_value.array->back());
+        }
+        // object case: write into the slot that key() stored in object_element
+        JSON_ASSERT(ref_stack.back()->is_object());
+        JSON_ASSERT(object_element);
+        *object_element = BasicJsonType(std::forward<Value>(v));
+        return object_element;
+    }
+
+    /// the parsed JSON value
+    BasicJsonType& root;
+    /// stack to model hierarchy of values
+    std::vector<BasicJsonType*> ref_stack {};
+    /// helper to hold the reference for the next object element
+    BasicJsonType* object_element = nullptr;
+    /// whether a syntax error occurred
+    bool errored = false;
+    /// whether to throw exceptions in case of errors
+    const bool allow_exceptions = true;
+};
+// SAX consumer that builds a JSON value in `root` while a user callback decides, per event, what is kept.
+template<typename BasicJsonType>
+class json_sax_dom_callback_parser
+{
+  public:
+    using number_integer_t = typename BasicJsonType::number_integer_t;
+    using number_unsigned_t = typename BasicJsonType::number_unsigned_t;
+    using number_float_t = typename BasicJsonType::number_float_t;
+    using string_t = typename BasicJsonType::string_t;
+    using binary_t = typename BasicJsonType::binary_t;
+    using parser_callback_t = typename BasicJsonType::parser_callback_t;
+    using parse_event_t = typename BasicJsonType::parse_event_t;
+    // cb is invoked as cb(depth, event, value); keep_stack starts with `true` for the root level
+    json_sax_dom_callback_parser(BasicJsonType& r,
+                                 const parser_callback_t cb,
+                                 const bool allow_exceptions_ = true)
+        : root(r), callback(cb), allow_exceptions(allow_exceptions_)
+    {
+        keep_stack.push_back(true);
+    }
+
+    // make class move-only
+    json_sax_dom_callback_parser(const json_sax_dom_callback_parser&) = delete;
+    json_sax_dom_callback_parser(json_sax_dom_callback_parser&&) = default;
+    json_sax_dom_callback_parser& operator=(const json_sax_dom_callback_parser&) = delete;
+    json_sax_dom_callback_parser& operator=(json_sax_dom_callback_parser&&) = default;
+    ~json_sax_dom_callback_parser() = default;
+    // scalar events: handle_value() consults the callback; these handlers always return true
+    bool null()
+    {
+        handle_value(nullptr);
+        return true;
+    }
+
+    bool boolean(bool val)
+    {
+        handle_value(val);
+        return true;
+    }
+
+    bool number_integer(number_integer_t val)
+    {
+        handle_value(val);
+        return true;
+    }
+
+    bool number_unsigned(number_unsigned_t val)
+    {
+        handle_value(val);
+        return true;
+    }
+
+    bool number_float(number_float_t val, const string_t& /*unused*/)
+    {
+        handle_value(val);
+        return true;
+    }
+
+    bool string(string_t& val)
+    {
+        handle_value(val);
+        return true;
+    }
+
+    bool binary(binary_t& val)
+    {
+        handle_value(std::move(val));
+        return true;
+    }
+    // the object_start callback result is pushed on keep_stack and governs the values inside this object
+    bool start_object(std::size_t len)
+    {
+        // check callback for object start
+        const bool keep = callback(static_cast<int>(ref_stack.size()), parse_event_t::object_start, discarded);
+        keep_stack.push_back(keep);
+        // skip_callback = true: no value event is issued for the still-empty object; val.second may be nullptr
+        auto val = handle_value(BasicJsonType::value_t::object, true);
+        ref_stack.push_back(val.second);
+
+        // check object limit
+        if (ref_stack.back() && JSON_HEDLEY_UNLIKELY(len != std::size_t(-1) && len > ref_stack.back()->max_size()))
+        {
+            JSON_THROW(out_of_range::create(408, "excessive object size: " + std::to_string(len)));
+        }
+
+        return true;
+    }
+
+    bool key(string_t& val)
+    {
+        BasicJsonType k = BasicJsonType(val);
+
+        // check callback for key
+        const bool keep = callback(static_cast<int>(ref_stack.size()), parse_event_t::key, k);
+        key_keep_stack.push_back(keep);
+
+        // add discarded value at given key and store the reference for later
+        if (keep && ref_stack.back())
+        {
+            object_element = &(ref_stack.back()->m_value.object->operator[](val) = discarded);
+        }
+
+        return true;
+    }
+    // the object_end callback is only consulted when the object itself was kept (non-null entry)
+    bool end_object()
+    {
+        if (ref_stack.back() && !callback(static_cast<int>(ref_stack.size()) - 1, parse_event_t::object_end, *ref_stack.back()))
+        {
+            // discard object
+            *ref_stack.back() = discarded;
+        }
+
+        JSON_ASSERT(!ref_stack.empty());
+        JSON_ASSERT(!keep_stack.empty());
+        ref_stack.pop_back();
+        keep_stack.pop_back();
+        // ref_stack.back() now refers to the parent container, if there is one
+        if (!ref_stack.empty() && ref_stack.back() && ref_stack.back()->is_structured())
+        {
+            // remove discarded value
+            for (auto it = ref_stack.back()->begin(); it != ref_stack.back()->end(); ++it)
+            {
+                if (it->is_discarded())
+                {
+                    ref_stack.back()->erase(it);
+                    break; // at most one discarded child is erased per call
+                }
+            }
+        }
+
+        return true;
+    }
+    // the array_start callback result is pushed on keep_stack and governs the values inside this array
+    bool start_array(std::size_t len)
+    {
+        const bool keep = callback(static_cast<int>(ref_stack.size()), parse_event_t::array_start, discarded);
+        keep_stack.push_back(keep);
+        // skip_callback = true: no value event is issued for the still-empty array; val.second may be nullptr
+        auto val = handle_value(BasicJsonType::value_t::array, true);
+        ref_stack.push_back(val.second);
+
+        // check array limit
+        if (ref_stack.back() && JSON_HEDLEY_UNLIKELY(len != std::size_t(-1) && len > ref_stack.back()->max_size()))
+        {
+            JSON_THROW(out_of_range::create(408, "excessive array size: " + std::to_string(len)));
+        }
+
+        return true;
+    }
+
+    bool end_array()
+    {
+        bool keep = true;
+        // the array_end callback is only consulted when the array itself was kept (non-null entry)
+        if (ref_stack.back())
+        {
+            keep = callback(static_cast<int>(ref_stack.size()) - 1, parse_event_t::array_end, *ref_stack.back());
+            if (!keep)
+            {
+                // discard array
+                *ref_stack.back() = discarded;
+            }
+        }
+
+        JSON_ASSERT(!ref_stack.empty());
+        JSON_ASSERT(!keep_stack.empty());
+        ref_stack.pop_back();
+        keep_stack.pop_back();
+
+        // remove discarded value
+        if (!keep && !ref_stack.empty() && ref_stack.back()->is_array())
+        {
+            ref_stack.back()->m_value.array->pop_back();
+        }
+
+        return true;
+    }
+    // records the error; ex is passed to JSON_THROW only when allow_exceptions is set; returns false
+    template<class Exception>
+    bool parse_error(std::size_t /*unused*/, const std::string& /*unused*/,
+                     const Exception& ex)
+    {
+        errored = true;
+        static_cast<void>(ex); // presumably silences an unused warning when JSON_THROW does not use ex
+        if (allow_exceptions)
+        {
+            JSON_THROW(ex);
+        }
+        return false;
+    }
+    // whether parse_error() has been called
+    constexpr bool is_errored() const
+    {
+        return errored;
+    }
+
+  private:
+    /*!
+    @param[in] v  value to add to the JSON value we build during parsing
+    @param[in] skip_callback  whether we should skip calling the callback
+               function; this is required after start_array() and
+               start_object() SAX events, because otherwise we would call the
+               callback function with an empty array or object, respectively.
+
+    @invariant If the ref stack is empty, then the passed value will be the new
+               root.
+    @invariant If the ref stack contains a value, then it is an array or an
+               object to which we can add elements
+
+    @return pair of boolean (whether value should be kept) and pointer (to the
+            passed value in the ref_stack hierarchy; nullptr if not kept)
+    */
+    template<typename Value>
+    std::pair<bool, BasicJsonType*> handle_value(Value&& v, const bool skip_callback = false)
+    {
+        JSON_ASSERT(!keep_stack.empty());
+
+        // do not handle this value if we know it would be added to a discarded
+        // container
+        if (!keep_stack.back())
+        {
+            return {false, nullptr};
+        }
+
+        // create value
+        auto value = BasicJsonType(std::forward<Value>(v));
+
+        // check callback
+        const bool keep = skip_callback || callback(static_cast<int>(ref_stack.size()), parse_event_t::value, value);
+
+        // do not handle this value if we just learnt it shall be discarded
+        if (!keep)
+        {
+            return {false, nullptr};
+        }
+        // an empty ref stack means this value becomes the root
+        if (ref_stack.empty())
+        {
+            root = std::move(value);
+            return {true, &root};
+        }
+
+        // skip this value if we already decided to skip the parent
+        // (https://github.com/nlohmann/json/issues/971#issuecomment-413678360)
+        if (!ref_stack.back())
+        {
+            return {false, nullptr};
+        }
+
+        // we now only expect arrays and objects
+        JSON_ASSERT(ref_stack.back()->is_array() || ref_stack.back()->is_object());
+
+        // array
+        if (ref_stack.back()->is_array())
+        {
+            ref_stack.back()->m_value.array->push_back(std::move(value));
+            return {true, &(ref_stack.back()->m_value.array->back())};
+        }
+
+        // object
+        JSON_ASSERT(ref_stack.back()->is_object());
+        // check if we should store an element for the current key
+        JSON_ASSERT(!key_keep_stack.empty());
+        const bool store_element = key_keep_stack.back();
+        key_keep_stack.pop_back(); // note: the early returns above leave this entry on the stack
+
+        if (!store_element)
+        {
+            return {false, nullptr};
+        }
+        // write into the slot that key() stored in object_element
+        JSON_ASSERT(object_element);
+        *object_element = std::move(value);
+        return {true, object_element};
+    }
+
+    /// the parsed JSON value
+    BasicJsonType& root;
+    /// stack to model hierarchy of values
+    std::vector<BasicJsonType*> ref_stack {};
+    /// stack to manage which values to keep
+    std::vector<bool> keep_stack {};
+    /// stack to manage which object keys to keep
+    std::vector<bool> key_keep_stack {};
+    /// helper to hold the reference for the next object element
+    BasicJsonType* object_element = nullptr;
+    /// whether a syntax error occurred
+    bool errored = false;
+    /// callback function
+    const parser_callback_t callback = nullptr;
+    /// whether to throw exceptions in case of errors
+    const bool allow_exceptions = true;
+    /// a discarded value for the callback
+    BasicJsonType discarded = BasicJsonType::value_t::discarded;
+};
+// SAX consumer that stores nothing: every event returns true, and parse_error() returns false.
+template<typename BasicJsonType>
+class json_sax_acceptor
+{
+  public:
+    using number_integer_t = typename BasicJsonType::number_integer_t;
+    using number_unsigned_t = typename BasicJsonType::number_unsigned_t;
+    using number_float_t = typename BasicJsonType::number_float_t;
+    using string_t = typename BasicJsonType::string_t;
+    using binary_t = typename BasicJsonType::binary_t;
+    // value events: arguments are ignored
+    bool null()
+    {
+        return true;
+    }
+
+    bool boolean(bool /*unused*/)
+    {
+        return true;
+    }
+
+    bool number_integer(number_integer_t /*unused*/)
+    {
+        return true;
+    }
+
+    bool number_unsigned(number_unsigned_t /*unused*/)
+    {
+        return true;
+    }
+
+    bool number_float(number_float_t /*unused*/, const string_t& /*unused*/)
+    {
+        return true;
+    }
+
+    bool string(string_t& /*unused*/)
+    {
+        return true;
+    }
+
+    bool binary(binary_t& /*unused*/)
+    {
+        return true;
+    }
+    // structural events: the element count defaults to std::size_t(-1) and is ignored
+    bool start_object(std::size_t /*unused*/ = std::size_t(-1))
+    {
+        return true;
+    }
+
+    bool key(string_t& /*unused*/)
+    {
+        return true;
+    }
+
+    bool end_object()
+    {
+        return true;
+    }
+
+    bool start_array(std::size_t /*unused*/ = std::size_t(-1))
+    {
+        return true;
+    }
+
+    bool end_array()
+    {
+        return true;
+    }
+    // the only handler that returns false; the error details are ignored
+    bool parse_error(std::size_t /*unused*/, const std::string& /*unused*/, const detail::exception& /*unused*/)
+    {
+        return false;
+    }
+};
+}  // namespace detail
+
+}  // namespace nlohmann
+
+// #include <nlohmann/detail/input/lexer.hpp>
+
+
+#include <array> // array
+#include <clocale> // localeconv
+#include <cstddef> // size_t
+#include <cstdio> // snprintf
+#include <cstdlib> // strtof, strtod, strtold, strtoll, strtoull
+#include <initializer_list> // initializer_list
+#include <string> // char_traits, string
+#include <utility> // move
+#include <vector> // vector
+
+// #include <nlohmann/detail/input/input_adapters.hpp>
+
+// #include <nlohmann/detail/input/position_t.hpp>
+
+// #include <nlohmann/detail/macro_scope.hpp>
+
+
+namespace nlohmann
+{
+namespace detail
+{
+///////////
+// lexer //
+///////////
+
+template<typename BasicJsonType>
+class lexer_base  // holds the token_type enum and its name lookup; has no data members
+{
+  public:
+    /// token types for the parser
+    enum class token_type
+    {
+        uninitialized,    ///< indicating the scanner is uninitialized
+        literal_true,     ///< the `true` literal
+        literal_false,    ///< the `false` literal
+        literal_null,     ///< the `null` literal
+        value_string,     ///< a string -- use get_string() for actual value
+        value_unsigned,   ///< an unsigned integer -- use get_number_unsigned() for actual value
+        value_integer,    ///< a signed integer -- use get_number_integer() for actual value
+        value_float,      ///< a floating-point number -- use get_number_float() for actual value
+        begin_array,      ///< the character for array begin `[`
+        begin_object,     ///< the character for object begin `{`
+        end_array,        ///< the character for array end `]`
+        end_object,       ///< the character for object end `}`
+        name_separator,   ///< the name separator `:`
+        value_separator,  ///< the value separator `,`
+        parse_error,      ///< indicating a parse error
+        end_of_input,     ///< indicating the end of the input buffer
+        literal_or_value  ///< a literal or the begin of a value (only for diagnostics)
+    };
+
+    /// return a human-readable, never-null name for a token_type value (only used for errors)
+    JSON_HEDLEY_RETURNS_NON_NULL
+    JSON_HEDLEY_CONST
+    static const char* token_type_name(const token_type t) noexcept
+    {
+        switch (t)
+        {
+            case token_type::uninitialized:
+                return "<uninitialized>";
+            case token_type::literal_true:
+                return "true literal";
+            case token_type::literal_false:
+                return "false literal";
+            case token_type::literal_null:
+                return "null literal";
+            case token_type::value_string:
+                return "string literal";
+            case token_type::value_unsigned:
+            case token_type::value_integer:
+            case token_type::value_float:
+                return "number literal";  // all three number token types share one name
+            case token_type::begin_array:
+                return "'['";
+            case token_type::begin_object:
+                return "'{'";
+            case token_type::end_array:
+                return "']'";
+            case token_type::end_object:
+                return "'}'";
+            case token_type::name_separator:
+                return "':'";
+            case token_type::value_separator:
+                return "','";
+            case token_type::parse_error:
+                return "<parse error>";
+            case token_type::end_of_input:
+                return "end of input";
+            case token_type::literal_or_value:
+                return "'[', '{', or a literal";
+            // LCOV_EXCL_START
+            default: // catch values outside the enumerators listed above
+                return "unknown token";
+                // LCOV_EXCL_STOP
+        }
+    }
+};
+/*!
+@brief lexical analysis
+
+This class organizes the lexical analysis during JSON deserialization.
+*/
+template<typename BasicJsonType, typename InputAdapterType>
+class lexer : public lexer_base<BasicJsonType>
+{
+    using number_integer_t = typename BasicJsonType::number_integer_t;
+    using number_unsigned_t = typename BasicJsonType::number_unsigned_t;
+    using number_float_t = typename BasicJsonType::number_float_t;
+    using string_t = typename BasicJsonType::string_t;
+    using char_type = typename InputAdapterType::char_type;
+    using char_int_type = typename std::char_traits<char_type>::int_type;
+
+  public:
+    using token_type = typename lexer_base<BasicJsonType>::token_type;
+
+    explicit lexer(InputAdapterType&& adapter, bool ignore_comments_ = false)  // the adapter is moved into member ia
+        : ia(std::move(adapter))
+        , ignore_comments(ignore_comments_)
+        , decimal_point_char(static_cast<char_int_type>(get_decimal_point()))  // the locale's decimal point is sampled here, at construction
+    {}
+
+    // copying is deleted (stated rationale: pointer members); move operations and destructor are defaulted
+    lexer(const lexer&) = delete;
+    lexer(lexer&&) = default;
+    lexer& operator=(lexer&) = delete;  // NOTE(review): takes a non-const reference; `const lexer&` is the conventional copy-assignment form
+    lexer& operator=(lexer&&) = default;
+    ~lexer() = default;
+
+  private:
+    /////////////////////
+    // locales
+    /////////////////////
+
+    /// return the first character of localeconv()->decimal_point, or '.' if that pointer is null
+    JSON_HEDLEY_PURE
+    static char get_decimal_point() noexcept
+    {
+        const auto* loc = localeconv();
+        JSON_ASSERT(loc != nullptr);
+        return (loc->decimal_point == nullptr) ? '.' : *(loc->decimal_point);  // only the first char of the decimal_point string is used
+    }
+
+    /////////////////////
+    // scan functions
+    /////////////////////
+
+    /*!
+    @brief get codepoint from 4 hex characters following `\u`
+
+    For input "\u c1 c2 c3 c4" the codepoint is:
+      (c1 * 0x1000) + (c2 * 0x0100) + (c3 * 0x0010) + c4
+    = (c1 << 12) + (c2 << 8) + (c3 << 4) + (c4 << 0)
+
+    Furthermore, the possible characters '0'..'9', 'A'..'F', and 'a'..'f'
+    must be converted to the integers 0x0..0x9, 0xA..0xF, 0xA..0xF, resp. The
+    conversion is done by subtracting the offset (0x30, 0x37, and 0x57)
+    between the ASCII value of the character and the desired integer value.
+
+    @return codepoint (0x0000..0xFFFF) or -1 in case of an error (e.g. EOF or
+            non-hex character); on error the function returns immediately
+    */
+    int get_codepoint()
+    {
+        // this function only makes sense after reading `\u`
+        JSON_ASSERT(current == 'u');
+        int codepoint = 0;
+
+        const auto factors = { 12u, 8u, 4u, 0u };  // left-shift amount for each of the 4 hex digits, most significant first
+        for (const auto factor : factors)
+        {
+            get();  // fetch the next character; the checks below inspect `current`
+
+            if (current >= '0' && current <= '9')
+            {
+                codepoint += static_cast<int>((static_cast<unsigned int>(current) - 0x30u) << factor);  // '0' is 0x30
+            }
+            else if (current >= 'A' && current <= 'F')
+            {
+                codepoint += static_cast<int>((static_cast<unsigned int>(current) - 0x37u) << factor);  // 'A' (0x41) - 0x37 == 0xA
+            }
+            else if (current >= 'a' && current <= 'f')
+            {
+                codepoint += static_cast<int>((static_cast<unsigned int>(current) - 0x57u) << factor);  // 'a' (0x61) - 0x57 == 0xA
+            }
+            else
+            {
+                return -1;  // not a hex digit
+            }
+        }
+
+        JSON_ASSERT(0x0000 <= codepoint && codepoint <= 0xFFFF);
+        return codepoint;
+    }
+
+    /*!
+    @brief check if the next byte(s) are inside a given range
+
+    Adds the current byte and, for each passed range, reads a new byte and
+    checks if it is inside the range. If a violation was detected, set up an
+    error message and return false. Otherwise, return true.
+
+    @param[in] ranges  list of integers; interpreted as list of pairs of
+                       inclusive lower and upper bound, respectively
+
+    @pre The passed list @a ranges must have 2, 4, or 6 elements; that is,
+         1, 2, or 3 pairs. This precondition is enforced by an assertion.
+
+    @return true if and only if no range violation was detected
+    */
+    bool next_byte_in_range(std::initializer_list<char_int_type> ranges)
+    {
+        JSON_ASSERT(ranges.size() == 2 || ranges.size() == 4 || ranges.size() == 6);
+        add(current);  // the byte that selected this case is added before any range is checked
+
+        for (auto range = ranges.begin(); range != ranges.end(); ++range)  // this ++range steps from an upper bound to the next pair's lower bound
+        {
+            get();
+            if (JSON_HEDLEY_LIKELY(*range <= current && current <= *(++range)))  // *(++range) steps to the pair's upper bound; skipped if the lower-bound test fails, but then we return
+            {
+                add(current);
+            }
+            else
+            {
+                error_message = "invalid string: ill-formed UTF-8 byte";
+                return false;
+            }
+        }
+
+        return true;
+    }
+
+    /*!
+    @brief scan a string literal
+
+    This function scans a string according to Sect. 7 of RFC 7159. While
+    scanning, escape sequences are decoded and the bytes are copied into buffer token_buffer. When the
+    function returns successfully, token_buffer is *not* null-terminated (as it
+    may contain \0 bytes), and token_buffer.size() is the number of bytes in the
+    string.
+
+    @return token_type::value_string if string could be successfully scanned,
+            token_type::parse_error otherwise
+
+    @note In case of errors, variable error_message contains a textual
+          description.
+    */
+    token_type scan_string()
+    {
+        // reset token_buffer (ignore opening quote)
+        reset();
+
+        // we entered the function by reading an open quote
+        JSON_ASSERT(current == '\"');
+
+        while (true)
+        {
+            // get next character
+            switch (get())
+            {
+                // end of file while parsing string
+                case std::char_traits<char_type>::eof():
+                {
+                    error_message = "invalid string: missing closing quote";
+                    return token_type::parse_error;
+                }
+
+                // closing quote
+                case '\"':
+                {
+                    return token_type::value_string;
+                }
+
+                // escapes
+                case '\\':
+                {
+                    switch (get())
+                    {
+                        // quotation mark
+                        case '\"':
+                            add('\"');
+                            break;
+                        // reverse solidus
+                        case '\\':
+                            add('\\');
+                            break;
+                        // solidus
+                        case '/':
+                            add('/');
+                            break;
+                        // backspace
+                        case 'b':
+                            add('\b');
+                            break;
+                        // form feed
+                        case 'f':
+                            add('\f');
+                            break;
+                        // line feed
+                        case 'n':
+                            add('\n');
+                            break;
+                        // carriage return
+                        case 'r':
+                            add('\r');
+                            break;
+                        // tab
+                        case 't':
+                            add('\t');
+                            break;
+
+                        // unicode escapes
+                        case 'u':
+                        {
+                            const int codepoint1 = get_codepoint();
+                            int codepoint = codepoint1; // start with codepoint1
+
+                            if (JSON_HEDLEY_UNLIKELY(codepoint1 == -1))
+                            {
+                                error_message = "invalid string: '\\u' must be followed by 4 hex digits";
+                                return token_type::parse_error;
+                            }
+
+                            // check if code point is a high surrogate
+                            if (0xD800 <= codepoint1 && codepoint1 <= 0xDBFF)
+                            {
+                                // expect next \uxxxx entry
+                                if (JSON_HEDLEY_LIKELY(get() == '\\' && get() == 'u'))  // && short-circuits: the second get() only runs after a backslash was read
+                                {
+                                    const int codepoint2 = get_codepoint();
+
+                                    if (JSON_HEDLEY_UNLIKELY(codepoint2 == -1))
+                                    {
+                                        error_message = "invalid string: '\\u' must be followed by 4 hex digits";
+                                        return token_type::parse_error;
+                                    }
+
+                                    // check if codepoint2 is a low surrogate
+                                    if (JSON_HEDLEY_LIKELY(0xDC00 <= codepoint2 && codepoint2 <= 0xDFFF))
+                                    {
+                                        // overwrite codepoint
+                                        codepoint = static_cast<int>(
+                                                        // high surrogate occupies the most significant 22 bits
+                                                        (static_cast<unsigned int>(codepoint1) << 10u)
+                                                        // low surrogate occupies the least significant 15 bits
+                                                        + static_cast<unsigned int>(codepoint2)
+                                                        // there is still the 0xD800, 0xDC00 and 0x10000 noise
+                                                        // in the result so we have to subtract with:
+                                                        // (0xD800 << 10) + DC00 - 0x10000 = 0x35FDC00
+                                                        - 0x35FDC00u);
+                                    }
+                                    else
+                                    {
+                                        error_message = "invalid string: surrogate U+D800..U+DBFF must be followed by U+DC00..U+DFFF";
+                                        return token_type::parse_error;
+                                    }
+                                }
+                                else
+                                {
+                                    error_message = "invalid string: surrogate U+D800..U+DBFF must be followed by U+DC00..U+DFFF";
+                                    return token_type::parse_error;
+                                }
+                            }
+                            else
+                            {
+                                if (JSON_HEDLEY_UNLIKELY(0xDC00 <= codepoint1 && codepoint1 <= 0xDFFF))  // a low surrogate that was not preceded by a high surrogate
+                                {
+                                    error_message = "invalid string: surrogate U+DC00..U+DFFF must follow U+D800..U+DBFF";
+                                    return token_type::parse_error;
+                                }
+                            }
+
+                            // result of the above calculation yields a proper codepoint
+                            JSON_ASSERT(0x00 <= codepoint && codepoint <= 0x10FFFF);
+
+                            // translate codepoint into bytes
+                            if (codepoint < 0x80)
+                            {
+                                // 1-byte characters: 0xxxxxxx (ASCII)
+                                add(static_cast<char_int_type>(codepoint));
+                            }
+                            else if (codepoint <= 0x7FF)
+                            {
+                                // 2-byte characters: 110xxxxx 10xxxxxx
+                                add(static_cast<char_int_type>(0xC0u | (static_cast<unsigned int>(codepoint) >> 6u)));
+                                add(static_cast<char_int_type>(0x80u | (static_cast<unsigned int>(codepoint) & 0x3Fu)));
+                            }
+                            else if (codepoint <= 0xFFFF)
+                            {
+                                // 3-byte characters: 1110xxxx 10xxxxxx 10xxxxxx
+                                add(static_cast<char_int_type>(0xE0u | (static_cast<unsigned int>(codepoint) >> 12u)));
+                                add(static_cast<char_int_type>(0x80u | ((static_cast<unsigned int>(codepoint) >> 6u) & 0x3Fu)));
+                                add(static_cast<char_int_type>(0x80u | (static_cast<unsigned int>(codepoint) & 0x3Fu)));
+                            }
+                            else
+                            {
+                                // 4-byte characters: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+                                add(static_cast<char_int_type>(0xF0u | (static_cast<unsigned int>(codepoint) >> 18u)));
+                                add(static_cast<char_int_type>(0x80u | ((static_cast<unsigned int>(codepoint) >> 12u) & 0x3Fu)));
+                                add(static_cast<char_int_type>(0x80u | ((static_cast<unsigned int>(codepoint) >> 6u) & 0x3Fu)));
+                                add(static_cast<char_int_type>(0x80u | (static_cast<unsigned int>(codepoint) & 0x3Fu)));
+                            }
+
+                            break;
+                        }
+
+                        // other characters after escape
+                        default:
+                            error_message = "invalid string: forbidden character after backslash";
+                            return token_type::parse_error;
+                    }
+
+                    break;
+                }
+
+                // invalid control characters
+                case 0x00:
+                {
+                    error_message = "invalid string: control character U+0000 (NUL) must be escaped to \\u0000";
+                    return token_type::parse_error;
+                }
+
+                case 0x01:
+                {
+                    error_message = "invalid string: control character U+0001 (SOH) must be escaped to \\u0001";
+                    return token_type::parse_error;
+                }
+
+                case 0x02:
+                {
+                    error_message = "invalid string: control character U+0002 (STX) must be escaped to \\u0002";
+                    return token_type::parse_error;
+                }
+
+                case 0x03:
+                {
+                    error_message = "invalid string: control character U+0003 (ETX) must be escaped to \\u0003";
+                    return token_type::parse_error;
+                }
+
+                case 0x04:
+                {
+                    error_message = "invalid string: control character U+0004 (EOT) must be escaped to \\u0004";
+                    return token_type::parse_error;
+                }
+
+                case 0x05:
+                {
+                    error_message = "invalid string: control character U+0005 (ENQ) must be escaped to \\u0005";
+                    return token_type::parse_error;
+                }
+
+                case 0x06:
+                {
+                    error_message = "invalid string: control character U+0006 (ACK) must be escaped to \\u0006";
+                    return token_type::parse_error;
+                }
+
+                case 0x07:
+                {
+                    error_message = "invalid string: control character U+0007 (BEL) must be escaped to \\u0007";
+                    return token_type::parse_error;
+                }
+
+                case 0x08:
+                {
+                    error_message = "invalid string: control character U+0008 (BS) must be escaped to \\u0008 or \\b";
+                    return token_type::parse_error;
+                }
+
+                case 0x09:
+                {
+                    error_message = "invalid string: control character U+0009 (HT) must be escaped to \\u0009 or \\t";
+                    return token_type::parse_error;
+                }
+
+                case 0x0A:
+                {
+                    error_message = "invalid string: control character U+000A (LF) must be escaped to \\u000A or \\n";
+                    return token_type::parse_error;
+                }
+
+                case 0x0B:
+                {
+                    error_message = "invalid string: control character U+000B (VT) must be escaped to \\u000B";
+                    return token_type::parse_error;
+                }
+
+                case 0x0C:
+                {
+                    error_message = "invalid string: control character U+000C (FF) must be escaped to \\u000C or \\f";
+                    return token_type::parse_error;
+                }
+
+                case 0x0D:
+                {
+                    error_message = "invalid string: control character U+000D (CR) must be escaped to \\u000D or \\r";
+                    return token_type::parse_error;
+                }
+
+                case 0x0E:
+                {
+                    error_message = "invalid string: control character U+000E (SO) must be escaped to \\u000E";
+                    return token_type::parse_error;
+                }
+
+                case 0x0F:
+                {
+                    error_message = "invalid string: control character U+000F (SI) must be escaped to \\u000F";
+                    return token_type::parse_error;
+                }
+
+                case 0x10:
+                {
+                    error_message = "invalid string: control character U+0010 (DLE) must be escaped to \\u0010";
+                    return token_type::parse_error;
+                }
+
+                case 0x11:
+                {
+                    error_message = "invalid string: control character U+0011 (DC1) must be escaped to \\u0011";
+                    return token_type::parse_error;
+                }
+
+                case 0x12:
+                {
+                    error_message = "invalid string: control character U+0012 (DC2) must be escaped to \\u0012";
+                    return token_type::parse_error;
+                }
+
+                case 0x13:
+                {
+                    error_message = "invalid string: control character U+0013 (DC3) must be escaped to \\u0013";
+                    return token_type::parse_error;
+                }
+
+                case 0x14:
+                {
+                    error_message = "invalid string: control character U+0014 (DC4) must be escaped to \\u0014";
+                    return token_type::parse_error;
+                }
+
+                case 0x15:
+                {
+                    error_message = "invalid string: control character U+0015 (NAK) must be escaped to \\u0015";
+                    return token_type::parse_error;
+                }
+
+                case 0x16:
+                {
+                    error_message = "invalid string: control character U+0016 (SYN) must be escaped to \\u0016";
+                    return token_type::parse_error;
+                }
+
+                case 0x17:
+                {
+                    error_message = "invalid string: control character U+0017 (ETB) must be escaped to \\u0017";
+                    return token_type::parse_error;
+                }
+
+                case 0x18:
+                {
+                    error_message = "invalid string: control character U+0018 (CAN) must be escaped to \\u0018";
+                    return token_type::parse_error;
+                }
+
+                case 0x19:
+                {
+                    error_message = "invalid string: control character U+0019 (EM) must be escaped to \\u0019";
+                    return token_type::parse_error;
+                }
+
+                case 0x1A:
+                {
+                    error_message = "invalid string: control character U+001A (SUB) must be escaped to \\u001A";
+                    return token_type::parse_error;
+                }
+
+                case 0x1B:
+                {
+                    error_message = "invalid string: control character U+001B (ESC) must be escaped to \\u001B";
+                    return token_type::parse_error;
+                }
+
+                case 0x1C:
+                {
+                    error_message = "invalid string: control character U+001C (FS) must be escaped to \\u001C";
+                    return token_type::parse_error;
+                }
+
+                case 0x1D:
+                {
+                    error_message = "invalid string: control character U+001D (GS) must be escaped to \\u001D";
+                    return token_type::parse_error;
+                }
+
+                case 0x1E:
+                {
+                    error_message = "invalid string: control character U+001E (RS) must be escaped to \\u001E";
+                    return token_type::parse_error;
+                }
+
+                case 0x1F:
+                {
+                    error_message = "invalid string: control character U+001F (US) must be escaped to \\u001F";
+                    return token_type::parse_error;
+                }
+
+                // U+0020..U+007F (except U+0022 (quote) and U+005C (backslash))
+                case 0x20:
+                case 0x21:
+                case 0x23:
+                case 0x24:
+                case 0x25:
+                case 0x26:
+                case 0x27:
+                case 0x28:
+                case 0x29:
+                case 0x2A:
+                case 0x2B:
+                case 0x2C:
+                case 0x2D:
+                case 0x2E:
+                case 0x2F:
+                case 0x30:
+                case 0x31:
+                case 0x32:
+                case 0x33:
+                case 0x34:
+                case 0x35:
+                case 0x36:
+                case 0x37:
+                case 0x38:
+                case 0x39:
+                case 0x3A:
+                case 0x3B:
+                case 0x3C:
+                case 0x3D:
+                case 0x3E:
+                case 0x3F:
+                case 0x40:
+                case 0x41:
+                case 0x42:
+                case 0x43:
+                case 0x44:
+                case 0x45:
+                case 0x46:
+                case 0x47:
+                case 0x48:
+                case 0x49:
+                case 0x4A:
+                case 0x4B:
+                case 0x4C:
+                case 0x4D:
+                case 0x4E:
+                case 0x4F:
+                case 0x50:
+                case 0x51:
+                case 0x52:
+                case 0x53:
+                case 0x54:
+                case 0x55:
+                case 0x56:
+                case 0x57:
+                case 0x58:
+                case 0x59:
+                case 0x5A:
+                case 0x5B:
+                case 0x5D:
+                case 0x5E:
+                case 0x5F:
+                case 0x60:
+                case 0x61:
+                case 0x62:
+                case 0x63:
+                case 0x64:
+                case 0x65:
+                case 0x66:
+                case 0x67:
+                case 0x68:
+                case 0x69:
+                case 0x6A:
+                case 0x6B:
+                case 0x6C:
+                case 0x6D:
+                case 0x6E:
+                case 0x6F:
+                case 0x70:
+                case 0x71:
+                case 0x72:
+                case 0x73:
+                case 0x74:
+                case 0x75:
+                case 0x76:
+                case 0x77:
+                case 0x78:
+                case 0x79:
+                case 0x7A:
+                case 0x7B:
+                case 0x7C:
+                case 0x7D:
+                case 0x7E:
+                case 0x7F:
+                {
+                    add(current);
+                    break;
+                }
+
+                // U+0080..U+07FF: bytes C2..DF 80..BF
+                case 0xC2:
+                case 0xC3:
+                case 0xC4:
+                case 0xC5:
+                case 0xC6:
+                case 0xC7:
+                case 0xC8:
+                case 0xC9:
+                case 0xCA:
+                case 0xCB:
+                case 0xCC:
+                case 0xCD:
+                case 0xCE:
+                case 0xCF:
+                case 0xD0:
+                case 0xD1:
+                case 0xD2:
+                case 0xD3:
+                case 0xD4:
+                case 0xD5:
+                case 0xD6:
+                case 0xD7:
+                case 0xD8:
+                case 0xD9:
+                case 0xDA:
+                case 0xDB:
+                case 0xDC:
+                case 0xDD:
+                case 0xDE:
+                case 0xDF:
+                {
+                    if (JSON_HEDLEY_UNLIKELY(!next_byte_in_range({0x80, 0xBF})))
+                    {
+                        return token_type::parse_error;
+                    }
+                    break;
+                }
+
+                // U+0800..U+0FFF: bytes E0 A0..BF 80..BF
+                case 0xE0:
+                {
+                    if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0xA0, 0xBF, 0x80, 0xBF}))))
+                    {
+                        return token_type::parse_error;
+                    }
+                    break;
+                }
+
+                // U+1000..U+CFFF: bytes E1..EC 80..BF 80..BF
+                // U+E000..U+FFFF: bytes EE..EF 80..BF 80..BF
+                case 0xE1:
+                case 0xE2:
+                case 0xE3:
+                case 0xE4:
+                case 0xE5:
+                case 0xE6:
+                case 0xE7:
+                case 0xE8:
+                case 0xE9:
+                case 0xEA:
+                case 0xEB:
+                case 0xEC:
+                case 0xEE:
+                case 0xEF:
+                {
+                    if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0x80, 0xBF, 0x80, 0xBF}))))
+                    {
+                        return token_type::parse_error;
+                    }
+                    break;
+                }
+
+                // U+D000..U+D7FF: bytes ED 80..9F 80..BF
+                case 0xED:
+                {
+                    if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0x80, 0x9F, 0x80, 0xBF}))))
+                    {
+                        return token_type::parse_error;
+                    }
+                    break;
+                }
+
+                // U+10000..U+3FFFF F0 90..BF 80..BF 80..BF
+                case 0xF0:
+                {
+                    if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0x90, 0xBF, 0x80, 0xBF, 0x80, 0xBF}))))
+                    {
+                        return token_type::parse_error;
+                    }
+                    break;
+                }
+
+                // U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF
+                case 0xF1:
+                case 0xF2:
+                case 0xF3:
+                {
+                    if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF}))))
+                    {
+                        return token_type::parse_error;
+                    }
+                    break;
+                }
+
+                // U+100000..U+10FFFF F4 80..8F 80..BF 80..BF
+                case 0xF4:
+                {
+                    if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0x80, 0x8F, 0x80, 0xBF, 0x80, 0xBF}))))
+                    {
+                        return token_type::parse_error;
+                    }
+                    break;
+                }
+
+                // remaining bytes (80..C1 and F5..FF) are ill-formed
+                default:
+                {
+                    error_message = "invalid string: ill-formed UTF-8 byte";
+                    return token_type::parse_error;
+                }
+            }
+        }
+    }
+
+    /*!
+     * @brief scan a comment; expects that the leading '/' has already been read
+     * @return true if the comment was skipped; false (with error_message set) otherwise
+     */
+    bool scan_comment()
+    {
+        switch (get())
+        {
+            // single-line comments skip input until a newline or EOF is read
+            case '/':
+            {
+                while (true)
+                {
+                    switch (get())
+                    {
+                        case '\n':
+                        case '\r':
+                        case std::char_traits<char_type>::eof():
+                        case '\0':  // a NUL character ends the comment just like EOF
+                            return true;
+
+                        default:
+                            break;  // leaves only the inner switch; the while loop keeps reading
+                    }
+                }
+            }
+
+            // multi-line comments skip input until */ is read
+            case '*':
+            {
+                while (true)
+                {
+                    switch (get())
+                    {
+                        case std::char_traits<char_type>::eof():
+                        case '\0':  // a NUL character is treated like EOF: the comment is unterminated
+                        {
+                            error_message = "invalid comment; missing closing '*/'";
+                            return false;
+                        }
+
+                        case '*':
+                        {
+                            switch (get())
+                            {
+                                case '/':
+                                    return true;
+
+                                default:
+                                {
+                                    unget();  // presumably pushes the character back so the loop re-examines it (e.g. a second '*')
+                                    continue;
+                                }
+                            }
+                        }
+
+                        default:
+                            continue;
+                    }
+                }
+            }
+
+            // unexpected character after reading '/'
+            default:
+            {
+                error_message = "invalid comment; expecting '/' or '*' after '/'";
+                return false;
+            }
+        }
+    }
+
+    /// parse @a str with std::strtof; the value is stored in @a f and the
+    /// position after the parsed text is reported through @a endptr
+    JSON_HEDLEY_NON_NULL(2)
+    static void strtof(float& f, const char* str, char** endptr) noexcept
+    {
+        const float parsed = std::strtof(str, endptr);
+        f = parsed;
+    }
+
+    /// overload for double targets; parses @a str with std::strtod
+    JSON_HEDLEY_NON_NULL(2)
+    static void strtof(double& f, const char* str, char** endptr) noexcept
+    {
+        const double parsed = std::strtod(str, endptr);
+        f = parsed;
+    }
+
+    /// overload for long double targets; parses @a str with std::strtold
+    JSON_HEDLEY_NON_NULL(2)
+    static void strtof(long double& f, const char* str, char** endptr) noexcept
+    {
+        const long double parsed = std::strtold(str, endptr);
+        f = parsed;
+    }
+
+    /*!
+    @brief scan a number literal
+
+    This function scans a string according to Sect. 6 of RFC 7159.
+
+    The function is realized with a deterministic finite state machine derived
+    from the grammar described in RFC 7159. Starting in state "init", the
+    input is read and used to determine the next state. Only state "done"
+    accepts the number. State "error" is a trap state to model errors. In the
+    table below, "anything" means any character but the ones listed before.
+
+    state    | 0        | 1-9      | e E      | +       | -       | .        | anything
+    ---------|----------|----------|----------|---------|---------|----------|-----------
+    init     | zero     | any1     | [error]  | [error] | minus   | [error]  | [error]
+    minus    | zero     | any1     | [error]  | [error] | [error] | [error]  | [error]
+    zero     | done     | done     | exponent | done    | done    | decimal1 | done
+    any1     | any1     | any1     | exponent | done    | done    | decimal1 | done
+    decimal1 | decimal2 | decimal2 | [error]  | [error] | [error] | [error]  | [error]
+    decimal2 | decimal2 | decimal2 | exponent | done    | done    | done     | done
+    exponent | any2     | any2     | [error]  | sign    | sign    | [error]  | [error]
+    sign     | any2     | any2     | [error]  | [error] | [error] | [error]  | [error]
+    any2     | any2     | any2     | done     | done    | done    | done     | done
+
+    The state machine is realized with one label per state (prefixed with
+    "scan_number_") and `goto` statements between them. The state machine
+    contains cycles, but any cycle can be left when EOF is read. Therefore,
+    the function is guaranteed to terminate.
+
+    During scanning, the read bytes are stored in token_buffer. This string is
+    then converted to a signed integer, an unsigned integer, or a
+    floating-point number. If an integer conversion fails (range error or the
+    value does not fit the configured integer type), the text is converted to
+    a floating-point number instead.
+
+    On a parse error, error_message is set to a description of the problem.
+
+    @return token_type::value_unsigned, token_type::value_integer, or
+            token_type::value_float if number could be successfully scanned,
+            token_type::parse_error otherwise
+
+    @note The scanner is independent of the current locale. Internally, the
+          locale's decimal point is used instead of `.` to work with the
+          locale-dependent converters.
+    */
+    token_type scan_number()  // lgtm [cpp/use-of-goto]
+    {
+        // reset token_buffer to store the number's bytes
+        reset();
+
+        // the type of the parsed number; initially set to unsigned; will be
+        // changed if minus sign, decimal point or exponent is read
+        token_type number_type = token_type::value_unsigned;
+
+        // state (init): we just found out we need to scan a number; the first
+        // character was already read by the caller and is held in current
+        switch (current)
+        {
+            case '-':
+            {
+                add(current);
+                goto scan_number_minus;
+            }
+
+            case '0':
+            {
+                add(current);
+                goto scan_number_zero;
+            }
+
+            case '1':
+            case '2':
+            case '3':
+            case '4':
+            case '5':
+            case '6':
+            case '7':
+            case '8':
+            case '9':
+            {
+                add(current);
+                goto scan_number_any1;
+            }
+
+            // all other characters are rejected outside scan_number()
+            default:            // LCOV_EXCL_LINE
+                JSON_ASSERT(false);  // LCOV_EXCL_LINE
+        }
+
+scan_number_minus:
+        // state: we just parsed a leading minus sign
+        number_type = token_type::value_integer;
+        switch (get())
+        {
+            case '0':
+            {
+                add(current);
+                goto scan_number_zero;
+            }
+
+            case '1':
+            case '2':
+            case '3':
+            case '4':
+            case '5':
+            case '6':
+            case '7':
+            case '8':
+            case '9':
+            {
+                add(current);
+                goto scan_number_any1;
+            }
+
+            default:
+            {
+                error_message = "invalid number; expected digit after '-'";
+                return token_type::parse_error;
+            }
+        }
+
+scan_number_zero:
+        // state: we just parsed a zero (maybe with a leading minus sign); no
+        // further digits may follow directly (no leading zeros)
+        switch (get())
+        {
+            case '.':
+            {
+                // store decimal_point_char rather than the '.' that was read
+                add(decimal_point_char);
+                goto scan_number_decimal1;
+            }
+
+            case 'e':
+            case 'E':
+            {
+                add(current);
+                goto scan_number_exponent;
+            }
+
+            default:
+                goto scan_number_done;
+        }
+
+scan_number_any1:
+        // state: we just parsed a number 0-9 (maybe with a leading minus sign)
+        switch (get())
+        {
+            case '0':
+            case '1':
+            case '2':
+            case '3':
+            case '4':
+            case '5':
+            case '6':
+            case '7':
+            case '8':
+            case '9':
+            {
+                add(current);
+                goto scan_number_any1;
+            }
+
+            case '.':
+            {
+                // store decimal_point_char rather than the '.' that was read
+                add(decimal_point_char);
+                goto scan_number_decimal1;
+            }
+
+            case 'e':
+            case 'E':
+            {
+                add(current);
+                goto scan_number_exponent;
+            }
+
+            default:
+                goto scan_number_done;
+        }
+
+scan_number_decimal1:
+        // state: we just parsed a decimal point; at least one digit must follow
+        number_type = token_type::value_float;
+        switch (get())
+        {
+            case '0':
+            case '1':
+            case '2':
+            case '3':
+            case '4':
+            case '5':
+            case '6':
+            case '7':
+            case '8':
+            case '9':
+            {
+                add(current);
+                goto scan_number_decimal2;
+            }
+
+            default:
+            {
+                error_message = "invalid number; expected digit after '.'";
+                return token_type::parse_error;
+            }
+        }
+
+scan_number_decimal2:
+        // state: we just parsed at least one digit after a decimal point
+        switch (get())
+        {
+            case '0':
+            case '1':
+            case '2':
+            case '3':
+            case '4':
+            case '5':
+            case '6':
+            case '7':
+            case '8':
+            case '9':
+            {
+                add(current);
+                goto scan_number_decimal2;
+            }
+
+            case 'e':
+            case 'E':
+            {
+                add(current);
+                goto scan_number_exponent;
+            }
+
+            default:
+                goto scan_number_done;
+        }
+
+scan_number_exponent:
+        // state: we just parsed an exponent marker ('e' or 'E')
+        number_type = token_type::value_float;
+        switch (get())
+        {
+            case '+':
+            case '-':
+            {
+                add(current);
+                goto scan_number_sign;
+            }
+
+            case '0':
+            case '1':
+            case '2':
+            case '3':
+            case '4':
+            case '5':
+            case '6':
+            case '7':
+            case '8':
+            case '9':
+            {
+                add(current);
+                goto scan_number_any2;
+            }
+
+            default:
+            {
+                error_message =
+                    "invalid number; expected '+', '-', or digit after exponent";
+                return token_type::parse_error;
+            }
+        }
+
+scan_number_sign:
+        // state: we just parsed an exponent sign; at least one digit must follow
+        switch (get())
+        {
+            case '0':
+            case '1':
+            case '2':
+            case '3':
+            case '4':
+            case '5':
+            case '6':
+            case '7':
+            case '8':
+            case '9':
+            {
+                add(current);
+                goto scan_number_any2;
+            }
+
+            default:
+            {
+                error_message = "invalid number; expected digit after exponent sign";
+                return token_type::parse_error;
+            }
+        }
+
+scan_number_any2:
+        // state: we just parsed a digit after the exponent or exponent sign
+        switch (get())
+        {
+            case '0':
+            case '1':
+            case '2':
+            case '3':
+            case '4':
+            case '5':
+            case '6':
+            case '7':
+            case '8':
+            case '9':
+            {
+                add(current);
+                goto scan_number_any2;
+            }
+
+            default:
+                goto scan_number_done;
+        }
+
+scan_number_done:
+        // unget the character after the number (we only read it to know that
+        // we are done scanning a number)
+        unget();
+
+        char* endptr = nullptr;
+        // clear errno so that an error reported by the integer conversions
+        // below can be told apart from a stale value
+        errno = 0;
+
+        // try to parse integers first and fall back to floats
+        if (number_type == token_type::value_unsigned)
+        {
+            const auto x = std::strtoull(token_buffer.data(), &endptr, 10);
+
+            // we checked the number format before
+            JSON_ASSERT(endptr == token_buffer.data() + token_buffer.size());
+
+            if (errno == 0)
+            {
+                // accept only if the value survives the conversion to
+                // number_unsigned_t unchanged
+                value_unsigned = static_cast<number_unsigned_t>(x);
+                if (value_unsigned == x)
+                {
+                    return token_type::value_unsigned;
+                }
+            }
+        }
+        else if (number_type == token_type::value_integer)
+        {
+            const auto x = std::strtoll(token_buffer.data(), &endptr, 10);
+
+            // we checked the number format before
+            JSON_ASSERT(endptr == token_buffer.data() + token_buffer.size());
+
+            if (errno == 0)
+            {
+                // accept only if the value survives the conversion to
+                // number_integer_t unchanged
+                value_integer = static_cast<number_integer_t>(x);
+                if (value_integer == x)
+                {
+                    return token_type::value_integer;
+                }
+            }
+        }
+
+        // this code is reached if we parse a floating-point number or if an
+        // integer conversion above failed
+        strtof(value_float, token_buffer.data(), &endptr);
+
+        // we checked the number format before
+        JSON_ASSERT(endptr == token_buffer.data() + token_buffer.size());
+
+        return token_type::value_float;
+    }
+
+    /*!
+    @brief match the remaining characters of a fixed literal
+
+    The first character of the literal must already be held in current; the
+    following characters are read from the input one by one and compared.
+
+    @param[in] literal_text  the literal text to expect
+    @param[in] length        the length of the passed literal text
+    @param[in] return_type   the token type to return on success
+
+    @return @a return_type if all characters matched; token_type::parse_error
+            (with error_message set) at the first mismatch
+    */
+    JSON_HEDLEY_NON_NULL(2)
+    token_type scan_literal(const char_type* literal_text, const std::size_t length,
+                            token_type return_type)
+    {
+        JSON_ASSERT(std::char_traits<char_type>::to_char_type(current) == literal_text[0]);
+
+        std::size_t matched = 1;
+        while (matched < length)
+        {
+            const auto next_char = std::char_traits<char_type>::to_char_type(get());
+            if (JSON_HEDLEY_UNLIKELY(next_char != literal_text[matched]))
+            {
+                error_message = "invalid literal";
+                return token_type::parse_error;
+            }
+            ++matched;
+        }
+
+        return return_type;
+    }
+
+    /////////////////////
+    // input management
+    /////////////////////
+
+    /// start a new token: empty token_buffer and restart token_string with
+    /// the current character, which is the beginning of the token
+    void reset() noexcept
+    {
+        const auto first_char = std::char_traits<char_type>::to_char_type(current);
+
+        token_buffer.clear();
+
+        token_string.clear();
+        token_string.push_back(first_char);
+    }
+
+    /*!
+    @brief get next character from the input
+
+    This function is the interface to the input adapter. Reaching EOF does not
+    throw; `std::char_traits<char_type>::eof()` is returned instead. Every
+    character other than EOF is appended to token_string for use in error
+    messages, and the position counters are advanced.
+
+    If unget() was called before, no new character is fetched from the input
+    adapter; the character held in current is delivered again.
+
+    @return character read from the input
+    */
+    char_int_type get()
+    {
+        ++position.chars_read_total;
+        ++position.chars_read_current_line;
+
+        if (!next_unget)
+        {
+            current = ia.get_character();
+        }
+        else
+        {
+            // replay the character that was ungot; only the flag is cleared
+            next_unget = false;
+        }
+
+        const bool reached_eof = (current == std::char_traits<char_type>::eof());
+        if (JSON_HEDLEY_LIKELY(!reached_eof))
+        {
+            token_string.push_back(std::char_traits<char_type>::to_char_type(current));
+        }
+
+        // a newline starts a new line for the position bookkeeping
+        if (current == '\n')
+        {
+            ++position.lines_read;
+            position.chars_read_current_line = 0;
+        }
+
+        return current;
+    }
+
+    /*!
+    @brief unget current character (read it again on next get)
+
+    The input itself is left untouched. Setting next_unget makes the next
+    call to get() deliver the character held in current once more; the
+    counters chars_read_total and chars_read_current_line (or lines_read) and
+    the contents of token_string are rolled back by one character.
+    */
+    void unget()
+    {
+        next_unget = true;
+        --position.chars_read_total;
+
+        if (position.chars_read_current_line != 0)
+        {
+            --position.chars_read_current_line;
+        }
+        else if (position.lines_read > 0)
+        {
+            // a newline is being "ungot": undo the line count as well
+            --position.lines_read;
+        }
+
+        // EOF was never added to token_string, so there is nothing to remove
+        const bool holds_character = (current != std::char_traits<char_type>::eof());
+        if (JSON_HEDLEY_LIKELY(holds_character))
+        {
+            JSON_ASSERT(!token_string.empty());
+            token_string.pop_back();
+        }
+    }
+
+    /// append @a c, converted to the string's character type, to token_buffer
+    void add(char_int_type c)
+    {
+        const auto converted = static_cast<typename string_t::value_type>(c);
+        token_buffer.push_back(converted);
+    }
+
+  public:
+    /////////////////////
+    // value getters
+    /////////////////////
+
+    /// return the signed integer value stored by scan_number(); it is set
+    /// when a number was scanned as token_type::value_integer
+    constexpr number_integer_t get_number_integer() const noexcept
+    {
+        return value_integer;
+    }
+
+    /// return the unsigned integer value stored by scan_number(); it is set
+    /// when a number was scanned as token_type::value_unsigned
+    constexpr number_unsigned_t get_number_unsigned() const noexcept
+    {
+        return value_unsigned;
+    }
+
+    /// return the floating-point value stored by scan_number(); it is set
+    /// when a number was scanned as token_type::value_float
+    constexpr number_float_t get_number_float() const noexcept
+    {
+        return value_float;
+    }
+
+    /// return current string value (implicitly resets the token; useful only once)
+    /// @note a mutable reference to token_buffer is handed out, so the caller
+    ///       can modify the buffer; presumably callers move the string out,
+    ///       which is why the value is "useful only once" -- verify against callers
+    string_t& get_string()
+    {
+        return token_buffer;
+    }
+
+    /////////////////////
+    // diagnostics
+    /////////////////////
+
+    /// return position of last read token
+    /// @note this is a copy of the counters maintained by get() and unget()
+    ///       (total characters read, characters read in the current line,
+    ///       lines read)
+    constexpr position_t get_position() const noexcept
+    {
+        return position;
+    }
+
+    /// return the last read token (for errors only). The result never
+    /// contains EOF (an arbitrary value that is not a valid char value, often
+    /// -1), because 255 may legitimately occur. Characters up to 0x1F
+    /// (including NUL) are rendered as "<U+XXXX>"; all other characters are
+    /// copied unchanged.
+    std::string get_token_string() const
+    {
+        std::string escaped;
+
+        for (const auto ch : token_string)
+        {
+            const auto byte = static_cast<unsigned char>(ch);
+
+            if (byte > '\x1F')
+            {
+                // printable or non-ASCII byte: add character as is
+                escaped.push_back(static_cast<std::string::value_type>(ch));
+                continue;
+            }
+
+            // control character: "<U+XXXX>" needs 8 characters plus the
+            // terminating NUL
+            std::array<char, 9> hex{{}};
+            (std::snprintf)(hex.data(), hex.size(), "<U+%.4X>", byte);
+            escaped.append(hex.data());
+        }
+
+        return escaped;
+    }
+
+    /// return syntax error message
+    /// @note the message is the string literal most recently assigned to
+    ///       error_message by one of the scan functions; it is the empty
+    ///       string as long as no error has been recorded
+    JSON_HEDLEY_RETURNS_NON_NULL
+    constexpr const char* get_error_message() const noexcept
+    {
+        return error_message;
+    }
+
+    /////////////////////
+    // actual scanner
+    /////////////////////
+
+    /*!
+    @brief skip the UTF-8 byte order mark
+
+    Reads the first character. If it is 0xEF, the next two characters must be
+    0xBB and 0xBF. Otherwise the character is ungot so that it is scanned as
+    regular input afterwards.
+
+    @return true iff there is no BOM or the correct BOM has been skipped
+    */
+    bool skip_bom()
+    {
+        if (get() != 0xEF)
+        {
+            // the first character is not the beginning of the BOM; unget it
+            // to process it later
+            unget();
+            return true;
+        }
+
+        // a BOM was started; it must be completed by 0xBB 0xBF
+        if (get() != 0xBB)
+        {
+            return false;
+        }
+
+        return get() == 0xBF;
+    }
+
+    /// read characters until one is found that is not a space, tab, line
+    /// feed, or carriage return; at least one character is always read, and
+    /// the first non-whitespace character is left in current
+    void skip_whitespace()
+    {
+        for (;;)
+        {
+            get();
+
+            const bool is_whitespace =
+                current == ' ' || current == '\t' || current == '\n' || current == '\r';
+            if (!is_whitespace)
+            {
+                return;
+            }
+        }
+    }
+
+    /*!
+    @brief scan the next token from the input
+
+    Before the very first character is read, an optional UTF-8 BOM is
+    skipped. Whitespace and (if ignore_comments is set) comments in front of
+    the token are skipped, then the first character of the token selects the
+    scan routine.
+
+    @return the type of the scanned token; token_type::parse_error on
+            failure, in which case error_message describes the problem
+    */
+    token_type scan()
+    {
+        // a BOM can only occur before anything has been read
+        if (position.chars_read_total == 0)
+        {
+            if (!skip_bom())
+            {
+                error_message = "invalid BOM; must be 0xEF 0xBB 0xBF if given";
+                return token_type::parse_error;
+            }
+        }
+
+        // read next character and ignore whitespace
+        skip_whitespace();
+
+        // ignore any number of consecutive comments
+        if (ignore_comments)
+        {
+            while (current == '/')
+            {
+                if (!scan_comment())
+                {
+                    return token_type::parse_error;
+                }
+
+                // skip whitespace following the comment
+                skip_whitespace();
+            }
+        }
+
+        switch (current)
+        {
+            // structural characters
+            case '[':
+                return token_type::begin_array;
+            case ']':
+                return token_type::end_array;
+            case '{':
+                return token_type::begin_object;
+            case '}':
+                return token_type::end_object;
+            case ':':
+                return token_type::name_separator;
+            case ',':
+                return token_type::value_separator;
+
+            // literals: the first character is known; the rest is checked
+            // by scan_literal()
+            case 't':
+            {
+                const std::array<char_type, 4> expected = {{'t', 'r', 'u', 'e'}};
+                return scan_literal(expected.data(), expected.size(), token_type::literal_true);
+            }
+            case 'f':
+            {
+                const std::array<char_type, 5> expected = {{'f', 'a', 'l', 's', 'e'}};
+                return scan_literal(expected.data(), expected.size(), token_type::literal_false);
+            }
+            case 'n':
+            {
+                const std::array<char_type, 4> expected = {{'n', 'u', 'l', 'l'}};
+                return scan_literal(expected.data(), expected.size(), token_type::literal_null);
+            }
+
+            // string
+            case '\"':
+                return scan_string();
+
+            // number
+            case '-':
+            case '0':
+            case '1':
+            case '2':
+            case '3':
+            case '4':
+            case '5':
+            case '6':
+            case '7':
+            case '8':
+            case '9':
+                return scan_number();
+
+            // end of input (the null byte is needed when parsing from
+            // string literals)
+            case '\0':
+            case std::char_traits<char_type>::eof():
+                return token_type::end_of_input;
+
+            // anything else cannot start a token
+            default:
+                error_message = "invalid literal";
+                return token_type::parse_error;
+        }
+    }
+
+  private:
+    /// input adapter; get() fetches characters from it via get_character()
+    InputAdapterType ia;
+
+    /// whether comments should be ignored (true) or signaled as errors (false)
+    const bool ignore_comments = false;
+
+    /// the current character; EOF until the first character has been read
+    char_int_type current = std::char_traits<char_type>::eof();
+
+    /// whether the next get() call should just return current
+    bool next_unget = false;
+
+    /// position counters (total characters, characters in the current line,
+    /// lines); advanced by get() and rolled back by unget()
+    position_t position {};
+
+    /// raw input token string (for error messages)
+    std::vector<char_type> token_string {};
+
+    /// buffer for variable-length tokens (numbers, strings)
+    string_t token_buffer {};
+
+    /// a description of occurred lexer errors; always points to a string
+    /// literal (empty as long as no error has been recorded)
+    const char* error_message = "";
+
+    // number values; written by scan_number() and read through the
+    // get_number_*() getters
+    number_integer_t value_integer = 0;
+    number_unsigned_t value_unsigned = 0;
+    number_float_t value_float = 0;
+
+    /// the decimal point; scan_number() stores this character in
+    /// token_buffer whenever a '.' is read
+    const char_int_type decimal_point_char = '.';
+};
+}  // namespace detail
+}  // namespace nlohmann
+
+// #include <nlohmann/detail/macro_scope.hpp>
+
+// #include <nlohmann/detail/meta/is_sax.hpp>
+
+
+#include <cstdint> // size_t
+#include <utility> // declval
+#include <string> // string
+
+// #include <nlohmann/detail/meta/detected.hpp>
+
+// #include <nlohmann/detail/meta/type_traits.hpp>
+
+
+namespace nlohmann
+{
+namespace detail
+{
+// Detection expressions for the SAX interface. Each alias names the result
+// type of calling one SAX member function on an lvalue of type T with the
+// given argument types; the alias is ill-formed if no such call is possible.
+// They are combined with is_detected_exact in is_sax and
+// is_sax_static_asserts to require that every call returns exactly bool.
+
+/// result type of `T::null()`
+template<typename T>
+using null_function_t = decltype(std::declval<T&>().null());
+
+/// result type of `T::boolean(bool)`
+template<typename T>
+using boolean_function_t =
+    decltype(std::declval<T&>().boolean(std::declval<bool>()));
+
+/// result type of `T::number_integer(Integer)`
+template<typename T, typename Integer>
+using number_integer_function_t =
+    decltype(std::declval<T&>().number_integer(std::declval<Integer>()));
+
+/// result type of `T::number_unsigned(Unsigned)`
+template<typename T, typename Unsigned>
+using number_unsigned_function_t =
+    decltype(std::declval<T&>().number_unsigned(std::declval<Unsigned>()));
+
+/// result type of `T::number_float(Float, const String&)`
+template<typename T, typename Float, typename String>
+using number_float_function_t = decltype(std::declval<T&>().number_float(
+                                    std::declval<Float>(), std::declval<const String&>()));
+
+/// result type of `T::string(String&)`
+template<typename T, typename String>
+using string_function_t =
+    decltype(std::declval<T&>().string(std::declval<String&>()));
+
+/// result type of `T::binary(Binary&)`
+template<typename T, typename Binary>
+using binary_function_t =
+    decltype(std::declval<T&>().binary(std::declval<Binary&>()));
+
+/// result type of `T::start_object(std::size_t)`
+template<typename T>
+using start_object_function_t =
+    decltype(std::declval<T&>().start_object(std::declval<std::size_t>()));
+
+/// result type of `T::key(String&)`
+template<typename T, typename String>
+using key_function_t =
+    decltype(std::declval<T&>().key(std::declval<String&>()));
+
+/// result type of `T::end_object()`
+template<typename T>
+using end_object_function_t = decltype(std::declval<T&>().end_object());
+
+/// result type of `T::start_array(std::size_t)`
+template<typename T>
+using start_array_function_t =
+    decltype(std::declval<T&>().start_array(std::declval<std::size_t>()));
+
+/// result type of `T::end_array()`
+template<typename T>
+using end_array_function_t = decltype(std::declval<T&>().end_array());
+
+/// result type of `T::parse_error(std::size_t, const std::string&, const Exception&)`
+template<typename T, typename Exception>
+using parse_error_function_t = decltype(std::declval<T&>().parse_error(
+        std::declval<std::size_t>(), std::declval<const std::string&>(),
+        std::declval<const Exception&>()));
+
+/*!
+@brief trait that checks whether a type implements the SAX interface
+
+`value` is true if and only if @a SAX provides all thirteen SAX member
+functions with the argument types taken from @a BasicJsonType, each of them
+returning exactly bool.
+
+@tparam SAX            the type to check
+@tparam BasicJsonType  a basic_json specialization (enforced by static_assert)
+*/
+template<typename SAX, typename BasicJsonType>
+struct is_sax
+{
+  private:
+    static_assert(is_basic_json<BasicJsonType>::value,
+                  "BasicJsonType must be of type basic_json<...>");
+
+    using number_integer_t = typename BasicJsonType::number_integer_t;
+    using number_unsigned_t = typename BasicJsonType::number_unsigned_t;
+    using number_float_t = typename BasicJsonType::number_float_t;
+    using string_t = typename BasicJsonType::string_t;
+    using binary_t = typename BasicJsonType::binary_t;
+    using exception_t = typename BasicJsonType::exception;
+
+    /// whether the member function described by Op can be called on SAX with
+    /// the given argument types and returns exactly bool
+    template<template<class...> class Op, class... Args>
+    using returns_bool = is_detected_exact<bool, Op, SAX, Args...>;
+
+  public:
+    static constexpr bool value =
+        returns_bool<null_function_t>::value &&
+        returns_bool<boolean_function_t>::value &&
+        returns_bool<number_integer_function_t, number_integer_t>::value &&
+        returns_bool<number_unsigned_function_t, number_unsigned_t>::value &&
+        returns_bool<number_float_function_t, number_float_t, string_t>::value &&
+        returns_bool<string_function_t, string_t>::value &&
+        returns_bool<binary_function_t, binary_t>::value &&
+        returns_bool<start_object_function_t>::value &&
+        returns_bool<key_function_t, string_t>::value &&
+        returns_bool<end_object_function_t>::value &&
+        returns_bool<start_array_function_t>::value &&
+        returns_bool<end_array_function_t>::value &&
+        returns_bool<parse_error_function_t, exception_t>::value;
+};
+
+/*!
+@brief compile-time diagnostics for the SAX interface
+
+Instantiating this struct checks every member function of the SAX interface
+separately, so that a missing or mismatching function is reported with a
+message naming the expected signature. Each function must return exactly
+bool.
+
+@tparam SAX            the type to check
+@tparam BasicJsonType  a basic_json specialization (enforced by static_assert)
+*/
+template<typename SAX, typename BasicJsonType>
+struct is_sax_static_asserts
+{
+  private:
+    static_assert(is_basic_json<BasicJsonType>::value,
+                  "BasicJsonType must be of type basic_json<...>");
+
+    using number_integer_t = typename BasicJsonType::number_integer_t;
+    using number_unsigned_t = typename BasicJsonType::number_unsigned_t;
+    using number_float_t = typename BasicJsonType::number_float_t;
+    using string_t = typename BasicJsonType::string_t;
+    using binary_t = typename BasicJsonType::binary_t;
+    using exception_t = typename BasicJsonType::exception;
+
+  public:
+    // one assertion per SAX function (the boolean check used to be stated
+    // twice; a single assertion is sufficient)
+    static_assert(is_detected_exact<bool, null_function_t, SAX>::value,
+                  "Missing/invalid function: bool null()");
+    static_assert(is_detected_exact<bool, boolean_function_t, SAX>::value,
+                  "Missing/invalid function: bool boolean(bool)");
+    static_assert(
+        is_detected_exact<bool, number_integer_function_t, SAX,
+        number_integer_t>::value,
+        "Missing/invalid function: bool number_integer(number_integer_t)");
+    static_assert(
+        is_detected_exact<bool, number_unsigned_function_t, SAX,
+        number_unsigned_t>::value,
+        "Missing/invalid function: bool number_unsigned(number_unsigned_t)");
+    static_assert(is_detected_exact<bool, number_float_function_t, SAX,
+                  number_float_t, string_t>::value,
+                  "Missing/invalid function: bool number_float(number_float_t, const string_t&)");
+    static_assert(
+        is_detected_exact<bool, string_function_t, SAX, string_t>::value,
+        "Missing/invalid function: bool string(string_t&)");
+    static_assert(
+        is_detected_exact<bool, binary_function_t, SAX, binary_t>::value,
+        "Missing/invalid function: bool binary(binary_t&)");
+    static_assert(is_detected_exact<bool, start_object_function_t, SAX>::value,
+                  "Missing/invalid function: bool start_object(std::size_t)");
+    static_assert(is_detected_exact<bool, key_function_t, SAX, string_t>::value,
+                  "Missing/invalid function: bool key(string_t&)");
+    static_assert(is_detected_exact<bool, end_object_function_t, SAX>::value,
+                  "Missing/invalid function: bool end_object()");
+    static_assert(is_detected_exact<bool, start_array_function_t, SAX>::value,
+                  "Missing/invalid function: bool start_array(std::size_t)");
+    static_assert(is_detected_exact<bool, end_array_function_t, SAX>::value,
+                  "Missing/invalid function: bool end_array()");
+    static_assert(
+        is_detected_exact<bool, parse_error_function_t, SAX, exception_t>::value,
+        "Missing/invalid function: bool parse_error(std::size_t, const "
+        "std::string&, const exception&)");
+};
+}  // namespace detail
+}  // namespace nlohmann
+
+// #include <nlohmann/detail/value_t.hpp>
+
+
+namespace nlohmann
+{
+namespace detail
+{
+
+/// how to treat CBOR tags; passed to binary_reader::sax_parse, whose default
+/// is cbor_tag_handler_t::error
+enum class cbor_tag_handler_t
+{
+    error,  ///< throw a parse_error exception in case of a tag
+    ignore   ///< ignore tags
+};
+
+/*!
+@brief determine system byte order
+
+The check inspects the byte of @a num that is stored at the lowest address:
+with the default argument 1, that byte is 1 exactly when the least
+significant byte comes first in memory.
+
+@param[in] num  probe value; callers are expected to rely on the default
+
+@return true if and only if system's byte order is little endian
+
+@note from https://stackoverflow.com/a/1001328/266378
+*/
+static inline bool little_endianess(int num = 1) noexcept
+{
+    const char* const lowest_address_byte = reinterpret_cast<const char*>(&num);
+    return *lowest_address_byte == 1;
+}
+
+
+///////////////////
+// binary reader //
+///////////////////
+
+/*!
+@brief deserialization of CBOR, MessagePack, and UBJSON values
+*/
+template<typename BasicJsonType, typename InputAdapterType, typename SAX = json_sax_dom_parser<BasicJsonType>>
+class binary_reader
+{
+    using number_integer_t = typename BasicJsonType::number_integer_t;
+    using number_unsigned_t = typename BasicJsonType::number_unsigned_t;
+    using number_float_t = typename BasicJsonType::number_float_t;
+    using string_t = typename BasicJsonType::string_t;
+    using binary_t = typename BasicJsonType::binary_t;
+    using json_sax_t = SAX;
+    using char_type = typename InputAdapterType::char_type;
+    using char_int_type = typename std::char_traits<char_type>::int_type;
+
+  public:
+    /*!
+    @brief create a binary reader
+
+    The input adapter is moved into the reader. Constructing the reader also
+    instantiates is_sax_static_asserts, so an incomplete SAX interface is
+    reported at compile time.
+
+    @param[in] adapter  input adapter to read from
+    */
+    explicit binary_reader(InputAdapterType&& adapter)
+        : ia(std::move(adapter))
+    {
+        // the temporary only exists to trigger the static assertions
+        static_cast<void>(detail::is_sax_static_asserts<SAX, BasicJsonType> {});
+    }
+
+    // make class move-only: copy construction and copy assignment are
+    // deleted, while move construction, move assignment, and destruction use
+    // the compiler-generated defaults
+    binary_reader(const binary_reader&) = delete;
+    binary_reader(binary_reader&&) = default;
+    binary_reader& operator=(const binary_reader&) = delete;
+    binary_reader& operator=(binary_reader&&) = default;
+    ~binary_reader() = default;
+
+    /*!
+    @brief parse one value of a binary format and report it as SAX events
+
+    @param[in] format  the binary format to parse
+    @param[in] sax_    a SAX event processor
+    @param[in] strict  whether to expect the input to be consumed completely
+    @param[in] tag_handler  how to treat CBOR tags
+
+    @return the result of the format-specific parse function; if that result
+            is true, @a strict is set, and the next byte is not EOF, the
+            result of reporting parse error 110 to the SAX event processor
+    */
+    JSON_HEDLEY_NON_NULL(3)
+    bool sax_parse(const input_format_t format,
+                   json_sax_t* sax_,
+                   const bool strict = true,
+                   const cbor_tag_handler_t tag_handler = cbor_tag_handler_t::error)
+    {
+        sax = sax_;
+        bool result = false;
+
+        // dispatch to the parser of the requested format
+        switch (format)
+        {
+            case input_format_t::bson:
+                result = parse_bson_internal();
+                break;
+
+            case input_format_t::cbor:
+                result = parse_cbor_internal(true, tag_handler);
+                break;
+
+            case input_format_t::msgpack:
+                result = parse_msgpack_internal();
+                break;
+
+            case input_format_t::ubjson:
+                result = parse_ubjson_internal();
+                break;
+
+            default:            // LCOV_EXCL_LINE
+                JSON_ASSERT(false);  // LCOV_EXCL_LINE
+        }
+
+        // nothing else to check after a failed parse or in non-strict mode
+        if (!(result && strict))
+        {
+            return result;
+        }
+
+        // strict mode: next byte must be EOF; UBJSON input is read with
+        // get_ignore_noop() instead of get()
+        if (format == input_format_t::ubjson)
+        {
+            get_ignore_noop();
+        }
+        else
+        {
+            get();
+        }
+
+        const bool trailing_input = (current != std::char_traits<char_type>::eof());
+        if (JSON_HEDLEY_UNLIKELY(trailing_input))
+        {
+            return sax->parse_error(chars_read, get_token_string(),
+                                    parse_error::create(110, chars_read, exception_message(format, "expected end of input; last byte: 0x" + get_token_string(), "value")));
+        }
+
+        return result;
+    }
+
+  private:
+    //////////
+    // BSON //
+    //////////
+
+    /*!
+    @brief Reads in a BSON-object and passes it to the SAX-parser.
+
+    The leading 32-bit document size is read to advance the input, but its
+    value is not used afterwards. The object is reported with unknown size
+    (std::size_t(-1)).
+
+    @return whether a valid BSON-value was passed to the SAX parser
+    */
+    bool parse_bson_internal()
+    {
+        std::int32_t document_size{};
+        // stop right away if the size field is incomplete instead of emitting
+        // further SAX events for a document whose header could not be read
+        if (!get_number<std::int32_t, true>(input_format_t::bson, document_size))
+        {
+            return false;
+        }
+
+        if (JSON_HEDLEY_UNLIKELY(!sax->start_object(std::size_t(-1))))
+        {
+            return false;
+        }
+
+        if (JSON_HEDLEY_UNLIKELY(!parse_bson_element_list(/*is_array*/false)))
+        {
+            return false;
+        }
+
+        return sax->end_object();
+    }
+
+    /*!
+    @brief Parses a C-style string from the BSON input.
+
+    Bytes are appended to @a result until a \x00-byte is read; the \x00-byte
+    itself is consumed but not stored. Existing content of @a result is kept.
+
+    @param[in, out] result  A reference to the string variable where the read
+                            string is to be stored.
+    @return `true` if the \x00-byte indicating the end of the string was
+             encountered before the EOF; `false` indicates an unexpected EOF.
+    */
+    bool get_bson_cstr(string_t& result)
+    {
+        for (;;)
+        {
+            get();
+
+            if (JSON_HEDLEY_UNLIKELY(!unexpect_eof(input_format_t::bson, "cstring")))
+            {
+                return false;
+            }
+
+            // terminator reached: the string is complete
+            if (current == 0x00)
+            {
+                return true;
+            }
+
+            result.push_back(static_cast<typename string_t::value_type>(current));
+        }
+    }
+
+    /*!
+    @brief Parses a zero-terminated string of length @a len from the BSON
+           input.
+
+    The first `len - 1` bytes are stored in @a result; one more byte (the
+    terminator) is consumed afterwards. The value of that byte is not
+    checked, only that it exists.
+
+    @param[in] len  The length (including the zero-byte at the end) of the
+                    string to be read.
+    @param[in, out] result  A reference to the string variable where the read
+                            string is to be stored.
+    @tparam NumberType The type of the length @a len
+    @pre len >= 1 (a smaller value is reported as parse error 112)
+    @return `true` if the string was successfully parsed
+    */
+    template<typename NumberType>
+    bool get_bson_string(const NumberType len, string_t& result)
+    {
+        if (JSON_HEDLEY_UNLIKELY(len < 1))
+        {
+            auto last_token = get_token_string();
+            return sax->parse_error(chars_read, last_token, parse_error::create(112, chars_read, exception_message(input_format_t::bson, "string length must be at least 1, is " + std::to_string(len), "string")));
+        }
+
+        if (JSON_HEDLEY_UNLIKELY(!get_string(input_format_t::bson, len - static_cast<NumberType>(1), result)))
+        {
+            return false;
+        }
+
+        // consume the terminating byte; going through unexpect_eof() makes a
+        // missing terminator follow the same EOF handling as every other
+        // truncated read in this class instead of failing silently
+        get();
+        return unexpect_eof(input_format_t::bson, "string");
+    }
+
+    /*!
+    @brief Parses a byte array input of length @a len from the BSON input.
+
+    A one-byte subtype is read first and stored in @a result via
+    set_subtype(); the @a len payload bytes follow.
+
+    @param[in] len  The length of the byte array to be read.
+    @param[in, out] result  A reference to the binary variable where the read
+                            array is to be stored.
+    @tparam NumberType The type of the length @a len
+    @pre len >= 0 (a negative value is reported as parse error 112)
+    @return `true` if the byte array was successfully parsed
+    */
+    template<typename NumberType>
+    bool get_bson_binary(const NumberType len, binary_t& result)
+    {
+        if (JSON_HEDLEY_UNLIKELY(len < 0))
+        {
+            auto last_token = get_token_string();
+            return sax->parse_error(chars_read, last_token, parse_error::create(112, chars_read, exception_message(input_format_t::bson, "byte array length cannot be negative, is " + std::to_string(len), "binary")));
+        }
+
+        // All BSON binary values have a subtype; a missing subtype byte must
+        // fail the parse - otherwise an empty array (len == 0) would be
+        // reported as valid although the input is truncated
+        std::uint8_t subtype{};
+        if (JSON_HEDLEY_UNLIKELY(!get_number<std::uint8_t>(input_format_t::bson, subtype)))
+        {
+            return false;
+        }
+        result.set_subtype(subtype);
+
+        return get_binary(input_format_t::bson, len, result);
+    }
+
+    /*!
+    @brief Read a BSON document element of the given @a element_type.
+
+    The element's key has already been consumed by the caller; this function
+    reads the value and forwards it to the SAX event processor.
+
+    @param[in] element_type The BSON element type, c.f. http://bsonspec.org/spec.html
+    @param[in] element_type_parse_position The position in the input stream,
+               where the `element_type` was read.
+    @warning Not all BSON element types are supported yet. An unsupported
+             @a element_type will give rise to a parse_error.114:
+             Unsupported BSON record type 0x...
+    @return whether a valid BSON-object/array was passed to the SAX parser
+    */
+    bool parse_bson_element_internal(const char_int_type element_type,
+                                     const std::size_t element_type_parse_position)
+    {
+        switch (element_type)
+        {
+            case 0x01: // double
+            {
+                double number{};
+                return get_number<double, true>(input_format_t::bson, number) && sax->number_float(static_cast<number_float_t>(number), "");
+            }
+
+            case 0x02: // string
+            {
+                std::int32_t len{};
+                string_t value;
+                return get_number<std::int32_t, true>(input_format_t::bson, len) && get_bson_string(len, value) && sax->string(value);
+            }
+
+            case 0x03: // object
+            {
+                return parse_bson_internal();
+            }
+
+            case 0x04: // array
+            {
+                return parse_bson_array();
+            }
+
+            case 0x05: // binary
+            {
+                std::int32_t len{};
+                binary_t value;
+                return get_number<std::int32_t, true>(input_format_t::bson, len) && get_bson_binary(len, value) && sax->binary(value);
+            }
+
+            case 0x08: // boolean
+            {
+                // read the value byte first and check for EOF: comparing the
+                // EOF marker against 0 would otherwise report a truncated
+                // input as the value `true`
+                get();
+                return unexpect_eof(input_format_t::bson, "boolean") && sax->boolean(current != 0);
+            }
+
+            case 0x0A: // null
+            {
+                return sax->null();
+            }
+
+            case 0x10: // int32
+            {
+                std::int32_t value{};
+                return get_number<std::int32_t, true>(input_format_t::bson, value) && sax->number_integer(value);
+            }
+
+            case 0x12: // int64
+            {
+                std::int64_t value{};
+                return get_number<std::int64_t, true>(input_format_t::bson, value) && sax->number_integer(value);
+            }
+
+            default: // anything else not supported (yet)
+            {
+                // format the type byte as two hex digits (plus terminator)
+                std::array<char, 3> cr{{}};
+                (std::snprintf)(cr.data(), cr.size(), "%.2hhX", static_cast<unsigned char>(element_type));
+                return sax->parse_error(element_type_parse_position, std::string(cr.data()), parse_error::create(114, element_type_parse_position, "Unsupported BSON record type 0x" + std::string(cr.data())));
+            }
+        }
+    }
+
+    /*!
+    @brief Read a BSON element list (as specified in the BSON-spec)
+
+    Objects and arrays share the same binary layout: a sequence of
+    (type byte, zero-terminated key, value) entries, ended by a type byte of
+    0x00. The argument @a is_array selects the interpretation: for objects
+    every key is passed to the SAX parser, for arrays the keys are read and
+    dropped.
+
+    @param[in] is_array Determines if the element list being read is to be
+                        treated as an object (@a is_array == false), or as an
+                        array (@a is_array == true).
+    @return whether a valid BSON-object/array was passed to the SAX parser
+    */
+    bool parse_bson_element_list(const bool is_array)
+    {
+        string_t element_name;
+
+        // a type byte of 0x00 terminates the list
+        for (auto element_type = get(); element_type != 0; element_type = get())
+        {
+            if (JSON_HEDLEY_UNLIKELY(!unexpect_eof(input_format_t::bson, "element list")))
+            {
+                return false;
+            }
+
+            // remember where the type byte was read for error reporting
+            const std::size_t element_type_parse_position = chars_read;
+
+            // read the key, report it (objects only), then read the value
+            const bool element_ok = get_bson_cstr(element_name)
+                                    && (is_array || sax->key(element_name))
+                                    && parse_bson_element_internal(element_type, element_type_parse_position);
+            if (JSON_HEDLEY_UNLIKELY(!element_ok))
+            {
+                return false;
+            }
+
+            // get_bson_cstr only appends
+            element_name.clear();
+        }
+
+        return true;
+    }
+
+    /*!
+    @brief Reads an array from the BSON input and passes it to the SAX-parser.
+
+    The leading 32-bit document size is read to advance the input, but its
+    value is not used afterwards. The array is reported with unknown size
+    (std::size_t(-1)).
+
+    @return whether a valid BSON-array was passed to the SAX parser
+    */
+    bool parse_bson_array()
+    {
+        std::int32_t document_size{};
+        // stop right away if the size field is incomplete instead of emitting
+        // further SAX events for an array whose header could not be read
+        if (!get_number<std::int32_t, true>(input_format_t::bson, document_size))
+        {
+            return false;
+        }
+
+        if (JSON_HEDLEY_UNLIKELY(!sax->start_array(std::size_t(-1))))
+        {
+            return false;
+        }
+
+        if (JSON_HEDLEY_UNLIKELY(!parse_bson_element_list(/*is_array*/true)))
+        {
+            return false;
+        }
+
+        return sax->end_array();
+    }
+
+    //////////
+    // CBOR //
+    //////////
+
+    /*!
+    @brief parse a single CBOR value and pass it to the SAX event processor
+
+    The initial byte selects the kind of value; strings, byte arrays, arrays
+    and maps are delegated to the respective get_cbor_* helper.
+
+    @param[in] get_char  whether a new character should be retrieved from the
+                         input (true) or whether the last read character should
+                         be considered instead (false)
+    @param[in] tag_handler how CBOR tags should be treated
+
+    @return whether a valid CBOR value was passed to the SAX parser
+    */
+    bool parse_cbor_internal(const bool get_char,
+                             const cbor_tag_handler_t tag_handler)
+    {
+        switch (get_char ? get() : current)
+        {
+            // EOF
+            case std::char_traits<char_type>::eof():
+                return unexpect_eof(input_format_t::cbor, "value");
+
+            // Integer 0x00..0x17 (0..23)
+            case 0x00:
+            case 0x01:
+            case 0x02:
+            case 0x03:
+            case 0x04:
+            case 0x05:
+            case 0x06:
+            case 0x07:
+            case 0x08:
+            case 0x09:
+            case 0x0A:
+            case 0x0B:
+            case 0x0C:
+            case 0x0D:
+            case 0x0E:
+            case 0x0F:
+            case 0x10:
+            case 0x11:
+            case 0x12:
+            case 0x13:
+            case 0x14:
+            case 0x15:
+            case 0x16:
+            case 0x17:
+                return sax->number_unsigned(static_cast<number_unsigned_t>(current));
+
+            case 0x18: // Unsigned integer (one-byte uint8_t follows)
+            {
+                std::uint8_t number{};
+                return get_number(input_format_t::cbor, number) && sax->number_unsigned(number);
+            }
+
+            case 0x19: // Unsigned integer (two-byte uint16_t follows)
+            {
+                std::uint16_t number{};
+                return get_number(input_format_t::cbor, number) && sax->number_unsigned(number);
+            }
+
+            case 0x1A: // Unsigned integer (four-byte uint32_t follows)
+            {
+                std::uint32_t number{};
+                return get_number(input_format_t::cbor, number) && sax->number_unsigned(number);
+            }
+
+            case 0x1B: // Unsigned integer (eight-byte uint64_t follows)
+            {
+                std::uint64_t number{};
+                return get_number(input_format_t::cbor, number) && sax->number_unsigned(number);
+            }
+
+            // Negative integer -1-0x00..-1-0x17 (-1..-24)
+            case 0x20:
+            case 0x21:
+            case 0x22:
+            case 0x23:
+            case 0x24:
+            case 0x25:
+            case 0x26:
+            case 0x27:
+            case 0x28:
+            case 0x29:
+            case 0x2A:
+            case 0x2B:
+            case 0x2C:
+            case 0x2D:
+            case 0x2E:
+            case 0x2F:
+            case 0x30:
+            case 0x31:
+            case 0x32:
+            case 0x33:
+            case 0x34:
+            case 0x35:
+            case 0x36:
+            case 0x37:
+                return sax->number_integer(static_cast<std::int8_t>(0x20 - 1 - current));
+
+            case 0x38: // Negative integer (one-byte uint8_t follows)
+            {
+                std::uint8_t number{};
+                return get_number(input_format_t::cbor, number) && sax->number_integer(static_cast<number_integer_t>(-1) - number);
+            }
+
+            case 0x39: // Negative integer -1-n (two-byte uint16_t follows)
+            {
+                std::uint16_t number{};
+                return get_number(input_format_t::cbor, number) && sax->number_integer(static_cast<number_integer_t>(-1) - number);
+            }
+
+            case 0x3A: // Negative integer -1-n (four-byte uint32_t follows)
+            {
+                std::uint32_t number{};
+                return get_number(input_format_t::cbor, number) && sax->number_integer(static_cast<number_integer_t>(-1) - number);
+            }
+
+            case 0x3B: // Negative integer -1-n (eight-byte uint64_t follows)
+            {
+                std::uint64_t number{};
+                return get_number(input_format_t::cbor, number) && sax->number_integer(static_cast<number_integer_t>(-1)
+                        - static_cast<number_integer_t>(number));
+            }
+
+            // Binary data (0x00..0x17 bytes follow)
+            case 0x40:
+            case 0x41:
+            case 0x42:
+            case 0x43:
+            case 0x44:
+            case 0x45:
+            case 0x46:
+            case 0x47:
+            case 0x48:
+            case 0x49:
+            case 0x4A:
+            case 0x4B:
+            case 0x4C:
+            case 0x4D:
+            case 0x4E:
+            case 0x4F:
+            case 0x50:
+            case 0x51:
+            case 0x52:
+            case 0x53:
+            case 0x54:
+            case 0x55:
+            case 0x56:
+            case 0x57:
+            case 0x58: // Binary data (one-byte uint8_t for n follows)
+            case 0x59: // Binary data (two-byte uint16_t for n follow)
+            case 0x5A: // Binary data (four-byte uint32_t for n follow)
+            case 0x5B: // Binary data (eight-byte uint64_t for n follow)
+            case 0x5F: // Binary data (indefinite length)
+            {
+                binary_t b;
+                return get_cbor_binary(b) && sax->binary(b);
+            }
+
+            // UTF-8 string (0x00..0x17 bytes follow)
+            case 0x60:
+            case 0x61:
+            case 0x62:
+            case 0x63:
+            case 0x64:
+            case 0x65:
+            case 0x66:
+            case 0x67:
+            case 0x68:
+            case 0x69:
+            case 0x6A:
+            case 0x6B:
+            case 0x6C:
+            case 0x6D:
+            case 0x6E:
+            case 0x6F:
+            case 0x70:
+            case 0x71:
+            case 0x72:
+            case 0x73:
+            case 0x74:
+            case 0x75:
+            case 0x76:
+            case 0x77:
+            case 0x78: // UTF-8 string (one-byte uint8_t for n follows)
+            case 0x79: // UTF-8 string (two-byte uint16_t for n follow)
+            case 0x7A: // UTF-8 string (four-byte uint32_t for n follow)
+            case 0x7B: // UTF-8 string (eight-byte uint64_t for n follow)
+            case 0x7F: // UTF-8 string (indefinite length)
+            {
+                string_t s;
+                return get_cbor_string(s) && sax->string(s);
+            }
+
+            // array (0x00..0x17 data items follow)
+            case 0x80:
+            case 0x81:
+            case 0x82:
+            case 0x83:
+            case 0x84:
+            case 0x85:
+            case 0x86:
+            case 0x87:
+            case 0x88:
+            case 0x89:
+            case 0x8A:
+            case 0x8B:
+            case 0x8C:
+            case 0x8D:
+            case 0x8E:
+            case 0x8F:
+            case 0x90:
+            case 0x91:
+            case 0x92:
+            case 0x93:
+            case 0x94:
+            case 0x95:
+            case 0x96:
+            case 0x97:
+                return get_cbor_array(static_cast<std::size_t>(static_cast<unsigned int>(current) & 0x1Fu), tag_handler);
+
+            case 0x98: // array (one-byte uint8_t for n follows)
+            {
+                std::uint8_t len{};
+                return get_number(input_format_t::cbor, len) && get_cbor_array(static_cast<std::size_t>(len), tag_handler);
+            }
+
+            case 0x99: // array (two-byte uint16_t for n follow)
+            {
+                std::uint16_t len{};
+                return get_number(input_format_t::cbor, len) && get_cbor_array(static_cast<std::size_t>(len), tag_handler);
+            }
+
+            case 0x9A: // array (four-byte uint32_t for n follow)
+            {
+                std::uint32_t len{};
+                return get_number(input_format_t::cbor, len) && get_cbor_array(static_cast<std::size_t>(len), tag_handler);
+            }
+
+            case 0x9B: // array (eight-byte uint64_t for n follow)
+            {
+                std::uint64_t len{};
+                return get_number(input_format_t::cbor, len) && get_cbor_array(static_cast<std::size_t>(len), tag_handler);
+            }
+
+            case 0x9F: // array (indefinite length)
+                return get_cbor_array(std::size_t(-1), tag_handler);
+
+            // map (0x00..0x17 pairs of data items follow)
+            case 0xA0:
+            case 0xA1:
+            case 0xA2:
+            case 0xA3:
+            case 0xA4:
+            case 0xA5:
+            case 0xA6:
+            case 0xA7:
+            case 0xA8:
+            case 0xA9:
+            case 0xAA:
+            case 0xAB:
+            case 0xAC:
+            case 0xAD:
+            case 0xAE:
+            case 0xAF:
+            case 0xB0:
+            case 0xB1:
+            case 0xB2:
+            case 0xB3:
+            case 0xB4:
+            case 0xB5:
+            case 0xB6:
+            case 0xB7:
+                return get_cbor_object(static_cast<std::size_t>(static_cast<unsigned int>(current) & 0x1Fu), tag_handler);
+
+            case 0xB8: // map (one-byte uint8_t for n follows)
+            {
+                std::uint8_t len{};
+                return get_number(input_format_t::cbor, len) && get_cbor_object(static_cast<std::size_t>(len), tag_handler);
+            }
+
+            case 0xB9: // map (two-byte uint16_t for n follow)
+            {
+                std::uint16_t len{};
+                return get_number(input_format_t::cbor, len) && get_cbor_object(static_cast<std::size_t>(len), tag_handler);
+            }
+
+            case 0xBA: // map (four-byte uint32_t for n follow)
+            {
+                std::uint32_t len{};
+                return get_number(input_format_t::cbor, len) && get_cbor_object(static_cast<std::size_t>(len), tag_handler);
+            }
+
+            case 0xBB: // map (eight-byte uint64_t for n follow)
+            {
+                std::uint64_t len{};
+                return get_number(input_format_t::cbor, len) && get_cbor_object(static_cast<std::size_t>(len), tag_handler);
+            }
+
+            case 0xBF: // map (indefinite length)
+                return get_cbor_object(std::size_t(-1), tag_handler);
+
+            case 0xC6: // tagged item
+            case 0xC7:
+            case 0xC8:
+            case 0xC9:
+            case 0xCA:
+            case 0xCB:
+            case 0xCC:
+            case 0xCD:
+            case 0xCE:
+            case 0xCF:
+            case 0xD0:
+            case 0xD1:
+            case 0xD2:
+            case 0xD3:
+            case 0xD4:
+            case 0xD8: // tagged item (1 bytes follow)
+            case 0xD9: // tagged item (2 bytes follow)
+            case 0xDA: // tagged item (4 bytes follow)
+            case 0xDB: // tagged item (8 bytes follow)
+            {
+                switch (tag_handler)
+                {
+                    case cbor_tag_handler_t::error:
+                    {
+                        auto last_token = get_token_string();
+                        return sax->parse_error(chars_read, last_token, parse_error::create(112, chars_read, exception_message(input_format_t::cbor, "invalid byte: 0x" + last_token, "value")));
+                    }
+
+                    case cbor_tag_handler_t::ignore:
+                    {
+                        // skip the bytes of the tag number (if any follow the
+                        // initial byte), then parse the tagged value itself
+                        switch (current)
+                        {
+                            case 0xD8:
+                            {
+                                std::uint8_t len{};
+                                get_number(input_format_t::cbor, len);
+                                break;
+                            }
+                            case 0xD9:
+                            {
+                                std::uint16_t len{};
+                                get_number(input_format_t::cbor, len);
+                                break;
+                            }
+                            case 0xDA:
+                            {
+                                std::uint32_t len{};
+                                get_number(input_format_t::cbor, len);
+                                break;
+                            }
+                            case 0xDB:
+                            {
+                                std::uint64_t len{};
+                                get_number(input_format_t::cbor, len);
+                                break;
+                            }
+                            default:
+                                break;
+                        }
+                        return parse_cbor_internal(true, tag_handler);
+                    }
+
+                    default:            // LCOV_EXCL_LINE
+                        JSON_ASSERT(false);  // LCOV_EXCL_LINE
+                        // without this return, a build with assertions
+                        // disabled would fall through into `case 0xF4` and
+                        // report the tagged item as boolean `false`
+                        return false;        // LCOV_EXCL_LINE
+                }
+            }
+
+            case 0xF4: // false
+                return sax->boolean(false);
+
+            case 0xF5: // true
+                return sax->boolean(true);
+
+            case 0xF6: // null
+                return sax->null();
+
+            case 0xF9: // Half-Precision Float (two-byte IEEE 754)
+            {
+                const auto byte1_raw = get();
+                if (JSON_HEDLEY_UNLIKELY(!unexpect_eof(input_format_t::cbor, "number")))
+                {
+                    return false;
+                }
+                const auto byte2_raw = get();
+                if (JSON_HEDLEY_UNLIKELY(!unexpect_eof(input_format_t::cbor, "number")))
+                {
+                    return false;
+                }
+
+                const auto byte1 = static_cast<unsigned char>(byte1_raw);
+                const auto byte2 = static_cast<unsigned char>(byte2_raw);
+
+                // code from RFC 7049, Appendix D, Figure 3:
+                // As half-precision floating-point numbers were only added
+                // to IEEE 754 in 2008, today's programming platforms often
+                // still only have limited support for them. It is very
+                // easy to include at least decoding support for them even
+                // without such support. An example of a small decoder for
+                // half-precision floating-point numbers in the C language
+                // is shown in Fig. 3.
+                const auto half = static_cast<unsigned int>((byte1 << 8u) + byte2);
+                const double val = [&half]
+                {
+                    // bits 10..14 hold the exponent, bits 0..9 the mantissa
+                    const int exp = (half >> 10u) & 0x1Fu;
+                    const unsigned int mant = half & 0x3FFu;
+                    JSON_ASSERT(0 <= exp && exp <= 32);
+                    JSON_ASSERT(mant <= 1024);
+                    switch (exp)
+                    {
+                        case 0:
+                            return std::ldexp(mant, -24);
+                        case 31:
+                            return (mant == 0)
+                            ? std::numeric_limits<double>::infinity()
+                            : std::numeric_limits<double>::quiet_NaN();
+                        default:
+                            return std::ldexp(mant + 1024, exp - 25);
+                    }
+                }();
+                // bit 15 is the sign bit
+                return sax->number_float((half & 0x8000u) != 0
+                                         ? static_cast<number_float_t>(-val)
+                                         : static_cast<number_float_t>(val), "");
+            }
+
+            case 0xFA: // Single-Precision Float (four-byte IEEE 754)
+            {
+                float number{};
+                return get_number(input_format_t::cbor, number) && sax->number_float(static_cast<number_float_t>(number), "");
+            }
+
+            case 0xFB: // Double-Precision Float (eight-byte IEEE 754)
+            {
+                double number{};
+                return get_number(input_format_t::cbor, number) && sax->number_float(static_cast<number_float_t>(number), "");
+            }
+
+            default: // anything else (0xFF is handled inside the other types)
+            {
+                auto last_token = get_token_string();
+                return sax->parse_error(chars_read, last_token, parse_error::create(112, chars_read, exception_message(input_format_t::cbor, "invalid byte: 0x" + last_token, "value")));
+            }
+        }
+    }
+
+    /*!
+    @brief reads a CBOR string
+
+    The current byte determines how the length is encoded: either directly
+    in its lower five bits (0x60..0x77), in a following 1/2/4/8-byte
+    unsigned integer (0x78..0x7B), or not at all (0x7F, indefinite length).
+    In the latter case, chunks are read and concatenated until the byte 0xFF
+    is encountered.
+
+    @param[out] result  created string
+
+    @return whether string creation completed
+    */
+    bool get_cbor_string(string_t& result)
+    {
+        if (JSON_HEDLEY_UNLIKELY(!unexpect_eof(input_format_t::cbor, "string")))
+        {
+            return false;
+        }
+
+        // UTF-8 string (0x00..0x17 bytes follow): length is part of the byte
+        if (current >= 0x60 && current <= 0x77)
+        {
+            return get_string(input_format_t::cbor, static_cast<unsigned int>(current) & 0x1Fu, result);
+        }
+
+        switch (current)
+        {
+            case 0x78: // UTF-8 string (one-byte uint8_t for n follows)
+            {
+                std::uint8_t length{};
+                if (!get_number(input_format_t::cbor, length))
+                {
+                    return false;
+                }
+                return get_string(input_format_t::cbor, length, result);
+            }
+
+            case 0x79: // UTF-8 string (two-byte uint16_t for n follow)
+            {
+                std::uint16_t length{};
+                if (!get_number(input_format_t::cbor, length))
+                {
+                    return false;
+                }
+                return get_string(input_format_t::cbor, length, result);
+            }
+
+            case 0x7A: // UTF-8 string (four-byte uint32_t for n follow)
+            {
+                std::uint32_t length{};
+                if (!get_number(input_format_t::cbor, length))
+                {
+                    return false;
+                }
+                return get_string(input_format_t::cbor, length, result);
+            }
+
+            case 0x7B: // UTF-8 string (eight-byte uint64_t for n follow)
+            {
+                std::uint64_t length{};
+                if (!get_number(input_format_t::cbor, length))
+                {
+                    return false;
+                }
+                return get_string(input_format_t::cbor, length, result);
+            }
+
+            case 0x7F: // UTF-8 string (indefinite length)
+            {
+                // each chunk is a CBOR string on its own; 0xFF ends the list
+                while (get() != 0xFF)
+                {
+                    string_t piece;
+                    if (!get_cbor_string(piece))
+                    {
+                        return false;
+                    }
+                    result.append(piece);
+                }
+                return true;
+            }
+
+            default:
+            {
+                auto last_token = get_token_string();
+                return sax->parse_error(chars_read, last_token, parse_error::create(113, chars_read, exception_message(input_format_t::cbor, "expected length specification (0x60-0x7B) or indefinite string type (0x7F); last byte: 0x" + last_token, "string")));
+            }
+        }
+    }
+
+    /*!
+    @brief reads a CBOR byte array
+
+    The current byte determines how the length is encoded: either directly
+    in its lower five bits (0x40..0x57), in a following 1/2/4/8-byte
+    unsigned integer (0x58..0x5B), or not at all (0x5F, indefinite length).
+    In the latter case, chunks are read and concatenated until the byte 0xFF
+    is encountered.
+
+    @param[out] result  created byte array
+
+    @return whether byte array creation completed
+    */
+    bool get_cbor_binary(binary_t& result)
+    {
+        if (JSON_HEDLEY_UNLIKELY(!unexpect_eof(input_format_t::cbor, "binary")))
+        {
+            return false;
+        }
+
+        // Binary data (0x00..0x17 bytes follow): length is part of the byte
+        if (current >= 0x40 && current <= 0x57)
+        {
+            return get_binary(input_format_t::cbor, static_cast<unsigned int>(current) & 0x1Fu, result);
+        }
+
+        switch (current)
+        {
+            case 0x58: // Binary data (one-byte uint8_t for n follows)
+            {
+                std::uint8_t length{};
+                if (!get_number(input_format_t::cbor, length))
+                {
+                    return false;
+                }
+                return get_binary(input_format_t::cbor, length, result);
+            }
+
+            case 0x59: // Binary data (two-byte uint16_t for n follow)
+            {
+                std::uint16_t length{};
+                if (!get_number(input_format_t::cbor, length))
+                {
+                    return false;
+                }
+                return get_binary(input_format_t::cbor, length, result);
+            }
+
+            case 0x5A: // Binary data (four-byte uint32_t for n follow)
+            {
+                std::uint32_t length{};
+                if (!get_number(input_format_t::cbor, length))
+                {
+                    return false;
+                }
+                return get_binary(input_format_t::cbor, length, result);
+            }
+
+            case 0x5B: // Binary data (eight-byte uint64_t for n follow)
+            {
+                std::uint64_t length{};
+                if (!get_number(input_format_t::cbor, length))
+                {
+                    return false;
+                }
+                return get_binary(input_format_t::cbor, length, result);
+            }
+
+            case 0x5F: // Binary data (indefinite length)
+            {
+                // each chunk is a CBOR byte array on its own; 0xFF ends the list
+                while (get() != 0xFF)
+                {
+                    binary_t piece;
+                    if (!get_cbor_binary(piece))
+                    {
+                        return false;
+                    }
+                    result.insert(result.end(), piece.begin(), piece.end());
+                }
+                return true;
+            }
+
+            default:
+            {
+                auto last_token = get_token_string();
+                return sax->parse_error(chars_read, last_token, parse_error::create(113, chars_read, exception_message(input_format_t::cbor, "expected length specification (0x40-0x5B) or indefinite binary array type (0x5F); last byte: 0x" + last_token, "binary")));
+            }
+        }
+    }
+
+    /*!
+    @brief reads a CBOR array and passes its elements to the SAX consumer
+    @param[in] len  element count, or std::size_t(-1) for an array of indefinite size
+    @param[in] tag_handler how CBOR tags should be treated
+    @return whether array creation completed
+    */
+    bool get_cbor_array(const std::size_t len,
+                        const cbor_tag_handler_t tag_handler)
+    {
+        if (JSON_HEDLEY_UNLIKELY(!sax->start_array(len)))
+        {
+            return false;
+        }
+
+        if (len == std::size_t(-1))
+        {
+            while (get() != 0xFF)  // indefinite size: elements follow until the 0xFF byte is read
+            {
+                if (JSON_HEDLEY_UNLIKELY(!parse_cbor_internal(false, tag_handler)))
+                {
+                    return false;
+                }
+            }
+        }
+        else
+        {
+            for (std::size_t remaining = len; remaining != 0; --remaining)
+            {
+                if (JSON_HEDLEY_UNLIKELY(!parse_cbor_internal(true, tag_handler)))
+                {
+                    return false;
+                }
+            }
+        }
+
+        return sax->end_array();
+    }
+
+    /*!
+    @brief reads a CBOR map and passes its keys and values to the SAX consumer
+    @param[in] len  the number of key/value pairs or std::size_t(-1) for an
+                    object of indefinite size
+    @param[in] tag_handler how CBOR tags should be treated
+    @return whether object creation completed
+    */
+    bool get_cbor_object(const std::size_t len,
+                         const cbor_tag_handler_t tag_handler)
+    {
+        if (JSON_HEDLEY_UNLIKELY(!sax->start_object(len)))
+        {
+            return false;
+        }
+
+        // one buffer serves all keys; it is emptied after each member
+        string_t key;
+        if (len == std::size_t(-1))
+        {
+            // indefinite size: members follow until the 0xFF byte is read
+            while (get() != 0xFF)
+            {
+                const bool member_ok = get_cbor_string(key)
+                                       && sax->key(key)
+                                       && parse_cbor_internal(true, tag_handler);
+                if (JSON_HEDLEY_UNLIKELY(!member_ok))
+                {
+                    return false;
+                }
+                key.clear();
+            }
+        }
+        else
+        {
+            for (std::size_t remaining = len; remaining != 0; --remaining)
+            {
+                // fetch the initial byte of the next key
+                get();
+                const bool member_ok = get_cbor_string(key)
+                                       && sax->key(key)
+                                       && parse_cbor_internal(true, tag_handler);
+                if (JSON_HEDLEY_UNLIKELY(!member_ok))
+                {
+                    return false;
+                }
+                key.clear();
+            }
+        }
+
+        return sax->end_object();
+    }
+
+    /////////////
+    // MsgPack //
+    /////////////
+
+    /*! @brief fetches the next byte with get() and dispatches on it to handle one MessagePack value
+    @return whether a valid MessagePack value was passed to the SAX parser
+    */
+    bool parse_msgpack_internal()
+    {
+        switch (get())
+        {
+            // EOF
+            case std::char_traits<char_type>::eof():
+                return unexpect_eof(input_format_t::msgpack, "value");
+
+            // positive fixint
+            case 0x00:
+            case 0x01:
+            case 0x02:
+            case 0x03:
+            case 0x04:
+            case 0x05:
+            case 0x06:
+            case 0x07:
+            case 0x08:
+            case 0x09:
+            case 0x0A:
+            case 0x0B:
+            case 0x0C:
+            case 0x0D:
+            case 0x0E:
+            case 0x0F:
+            case 0x10:
+            case 0x11:
+            case 0x12:
+            case 0x13:
+            case 0x14:
+            case 0x15:
+            case 0x16:
+            case 0x17:
+            case 0x18:
+            case 0x19:
+            case 0x1A:
+            case 0x1B:
+            case 0x1C:
+            case 0x1D:
+            case 0x1E:
+            case 0x1F:
+            case 0x20:
+            case 0x21:
+            case 0x22:
+            case 0x23:
+            case 0x24:
+            case 0x25:
+            case 0x26:
+            case 0x27:
+            case 0x28:
+            case 0x29:
+            case 0x2A:
+            case 0x2B:
+            case 0x2C:
+            case 0x2D:
+            case 0x2E:
+            case 0x2F:
+            case 0x30:
+            case 0x31:
+            case 0x32:
+            case 0x33:
+            case 0x34:
+            case 0x35:
+            case 0x36:
+            case 0x37:
+            case 0x38:
+            case 0x39:
+            case 0x3A:
+            case 0x3B:
+            case 0x3C:
+            case 0x3D:
+            case 0x3E:
+            case 0x3F:
+            case 0x40:
+            case 0x41:
+            case 0x42:
+            case 0x43:
+            case 0x44:
+            case 0x45:
+            case 0x46:
+            case 0x47:
+            case 0x48:
+            case 0x49:
+            case 0x4A:
+            case 0x4B:
+            case 0x4C:
+            case 0x4D:
+            case 0x4E:
+            case 0x4F:
+            case 0x50:
+            case 0x51:
+            case 0x52:
+            case 0x53:
+            case 0x54:
+            case 0x55:
+            case 0x56:
+            case 0x57:
+            case 0x58:
+            case 0x59:
+            case 0x5A:
+            case 0x5B:
+            case 0x5C:
+            case 0x5D:
+            case 0x5E:
+            case 0x5F:
+            case 0x60:
+            case 0x61:
+            case 0x62:
+            case 0x63:
+            case 0x64:
+            case 0x65:
+            case 0x66:
+            case 0x67:
+            case 0x68:
+            case 0x69:
+            case 0x6A:
+            case 0x6B:
+            case 0x6C:
+            case 0x6D:
+            case 0x6E:
+            case 0x6F:
+            case 0x70:
+            case 0x71:
+            case 0x72:
+            case 0x73:
+            case 0x74:
+            case 0x75:
+            case 0x76:
+            case 0x77:
+            case 0x78:
+            case 0x79:
+            case 0x7A:
+            case 0x7B:
+            case 0x7C:
+            case 0x7D:
+            case 0x7E:
+            case 0x7F:
+                return sax->number_unsigned(static_cast<number_unsigned_t>(current));  // the byte itself is the value
+
+            // fixmap
+            case 0x80:
+            case 0x81:
+            case 0x82:
+            case 0x83:
+            case 0x84:
+            case 0x85:
+            case 0x86:
+            case 0x87:
+            case 0x88:
+            case 0x89:
+            case 0x8A:
+            case 0x8B:
+            case 0x8C:
+            case 0x8D:
+            case 0x8E:
+            case 0x8F:
+                return get_msgpack_object(static_cast<std::size_t>(static_cast<unsigned int>(current) & 0x0Fu));  // low nibble is the member count
+
+            // fixarray
+            case 0x90:
+            case 0x91:
+            case 0x92:
+            case 0x93:
+            case 0x94:
+            case 0x95:
+            case 0x96:
+            case 0x97:
+            case 0x98:
+            case 0x99:
+            case 0x9A:
+            case 0x9B:
+            case 0x9C:
+            case 0x9D:
+            case 0x9E:
+            case 0x9F:
+                return get_msgpack_array(static_cast<std::size_t>(static_cast<unsigned int>(current) & 0x0Fu));  // low nibble is the element count
+
+            // fixstr
+            case 0xA0:
+            case 0xA1:
+            case 0xA2:
+            case 0xA3:
+            case 0xA4:
+            case 0xA5:
+            case 0xA6:
+            case 0xA7:
+            case 0xA8:
+            case 0xA9:
+            case 0xAA:
+            case 0xAB:
+            case 0xAC:
+            case 0xAD:
+            case 0xAE:
+            case 0xAF:
+            case 0xB0:
+            case 0xB1:
+            case 0xB2:
+            case 0xB3:
+            case 0xB4:
+            case 0xB5:
+            case 0xB6:
+            case 0xB7:
+            case 0xB8:
+            case 0xB9:
+            case 0xBA:
+            case 0xBB:
+            case 0xBC:
+            case 0xBD:
+            case 0xBE:
+            case 0xBF:
+            case 0xD9: // str 8
+            case 0xDA: // str 16
+            case 0xDB: // str 32
+            {
+                string_t s;
+                return get_msgpack_string(s) && sax->string(s);  // get_msgpack_string() inspects `current` again to pick the length encoding
+            }
+
+            case 0xC0: // nil
+                return sax->null();
+
+            case 0xC2: // false
+                return sax->boolean(false);
+
+            case 0xC3: // true
+                return sax->boolean(true);
+
+            case 0xC4: // bin 8
+            case 0xC5: // bin 16
+            case 0xC6: // bin 32
+            case 0xC7: // ext 8
+            case 0xC8: // ext 16
+            case 0xC9: // ext 32
+            case 0xD4: // fixext 1
+            case 0xD5: // fixext 2
+            case 0xD6: // fixext 4
+            case 0xD7: // fixext 8
+            case 0xD8: // fixext 16
+            {
+                binary_t b;
+                return get_msgpack_binary(b) && sax->binary(b);  // get_msgpack_binary() inspects `current` again to pick the format
+            }
+
+            case 0xCA: // float 32
+            {
+                float number{};
+                return get_number(input_format_t::msgpack, number) && sax->number_float(static_cast<number_float_t>(number), "");
+            }
+
+            case 0xCB: // float 64
+            {
+                double number{};
+                return get_number(input_format_t::msgpack, number) && sax->number_float(static_cast<number_float_t>(number), "");
+            }
+
+            case 0xCC: // uint 8
+            {
+                std::uint8_t number{};
+                return get_number(input_format_t::msgpack, number) && sax->number_unsigned(number);
+            }
+
+            case 0xCD: // uint 16
+            {
+                std::uint16_t number{};
+                return get_number(input_format_t::msgpack, number) && sax->number_unsigned(number);
+            }
+
+            case 0xCE: // uint 32
+            {
+                std::uint32_t number{};
+                return get_number(input_format_t::msgpack, number) && sax->number_unsigned(number);
+            }
+
+            case 0xCF: // uint 64
+            {
+                std::uint64_t number{};
+                return get_number(input_format_t::msgpack, number) && sax->number_unsigned(number);
+            }
+
+            case 0xD0: // int 8
+            {
+                std::int8_t number{};
+                return get_number(input_format_t::msgpack, number) && sax->number_integer(number);
+            }
+
+            case 0xD1: // int 16
+            {
+                std::int16_t number{};
+                return get_number(input_format_t::msgpack, number) && sax->number_integer(number);
+            }
+
+            case 0xD2: // int 32
+            {
+                std::int32_t number{};
+                return get_number(input_format_t::msgpack, number) && sax->number_integer(number);
+            }
+
+            case 0xD3: // int 64
+            {
+                std::int64_t number{};
+                return get_number(input_format_t::msgpack, number) && sax->number_integer(number);
+            }
+
+            case 0xDC: // array 16
+            {
+                std::uint16_t len{};
+                return get_number(input_format_t::msgpack, len) && get_msgpack_array(static_cast<std::size_t>(len));
+            }
+
+            case 0xDD: // array 32
+            {
+                std::uint32_t len{};
+                return get_number(input_format_t::msgpack, len) && get_msgpack_array(static_cast<std::size_t>(len));
+            }
+
+            case 0xDE: // map 16
+            {
+                std::uint16_t len{};
+                return get_number(input_format_t::msgpack, len) && get_msgpack_object(static_cast<std::size_t>(len));
+            }
+
+            case 0xDF: // map 32
+            {
+                std::uint32_t len{};
+                return get_number(input_format_t::msgpack, len) && get_msgpack_object(static_cast<std::size_t>(len));
+            }
+
+            // negative fixint
+            case 0xE0:
+            case 0xE1:
+            case 0xE2:
+            case 0xE3:
+            case 0xE4:
+            case 0xE5:
+            case 0xE6:
+            case 0xE7:
+            case 0xE8:
+            case 0xE9:
+            case 0xEA:
+            case 0xEB:
+            case 0xEC:
+            case 0xED:
+            case 0xEE:
+            case 0xEF:
+            case 0xF0:
+            case 0xF1:
+            case 0xF2:
+            case 0xF3:
+            case 0xF4:
+            case 0xF5:
+            case 0xF6:
+            case 0xF7:
+            case 0xF8:
+            case 0xF9:
+            case 0xFA:
+            case 0xFB:
+            case 0xFC:
+            case 0xFD:
+            case 0xFE:
+            case 0xFF:
+                return sax->number_integer(static_cast<std::int8_t>(current));  // narrowing to int8_t maps 0xE0..0xFF to -32..-1
+
+            default: // anything else (of the byte values 0x00..0xFF, only 0xC1 has no case above)
+            {
+                auto last_token = get_token_string();
+                return sax->parse_error(chars_read, last_token, parse_error::create(112, chars_read, exception_message(input_format_t::msgpack, "invalid byte: 0x" + last_token, "value")));
+            }
+        }
+    }
+
+    /*!
+    @brief reads a MessagePack string: determines the length from the last read byte (and any length bytes that follow), then copies that many bytes
+    @param[out] result  created string
+    @return whether string creation completed
+    */
+    bool get_msgpack_string(string_t& result)
+    {
+        if (JSON_HEDLEY_UNLIKELY(!unexpect_eof(input_format_t::msgpack, "string")))
+        {
+            return false;
+        }
+
+        switch (current)
+        {
+            // fixstr
+            case 0xA0:
+            case 0xA1:
+            case 0xA2:
+            case 0xA3:
+            case 0xA4:
+            case 0xA5:
+            case 0xA6:
+            case 0xA7:
+            case 0xA8:
+            case 0xA9:
+            case 0xAA:
+            case 0xAB:
+            case 0xAC:
+            case 0xAD:
+            case 0xAE:
+            case 0xAF:
+            case 0xB0:
+            case 0xB1:
+            case 0xB2:
+            case 0xB3:
+            case 0xB4:
+            case 0xB5:
+            case 0xB6:
+            case 0xB7:
+            case 0xB8:
+            case 0xB9:
+            case 0xBA:
+            case 0xBB:
+            case 0xBC:
+            case 0xBD:
+            case 0xBE:
+            case 0xBF:
+            {
+                // the length is stored in the low five bits of the initial byte
+                const auto length = static_cast<unsigned int>(current) & 0x1Fu;
+                return get_string(input_format_t::msgpack, length, result);
+            }
+
+            case 0xD9: // str 8
+            {
+                std::uint8_t length{};
+                return get_number(input_format_t::msgpack, length)
+                       && get_string(input_format_t::msgpack, length, result);
+            }
+
+            case 0xDA: // str 16
+            {
+                std::uint16_t length{};
+                return get_number(input_format_t::msgpack, length)
+                       && get_string(input_format_t::msgpack, length, result);
+            }
+
+            case 0xDB: // str 32
+            {
+                std::uint32_t length{};
+                return get_number(input_format_t::msgpack, length)
+                       && get_string(input_format_t::msgpack, length, result);
+            }
+
+            default:
+            {
+                auto token_hex = get_token_string();
+                return sax->parse_error(chars_read, token_hex, parse_error::create(113, chars_read, exception_message(input_format_t::msgpack, "expected length specification (0xA0-0xBF, 0xD9-0xDB); last byte: 0x" + token_hex, "string")));
+            }
+        }
+    }
+
+    /*!
+    @brief reads a MessagePack byte array (bin, ext, or fixext format)
+
+    This function dispatches on the last read byte (`current`) to determine the
+    expected byte array length and then copies this number of bytes into a byte array.
+    For the ext and fixext formats, the type byte is stored as the subtype of @a result.
+    @param[out] result  created byte array
+    @note Unlike get_msgpack_string(), no EOF check precedes the switch, and an unexpected byte returns false without a parse error.
+    @return whether byte array creation completed
+    */
+    bool get_msgpack_binary(binary_t& result)
+    {
+        // helper function to set the subtype; returns true so it can close an && chain
+        auto assign_and_return_true = [&result](std::int8_t subtype)
+        {
+            result.set_subtype(static_cast<std::uint8_t>(subtype));  // the signed type byte is stored as its unsigned value
+            return true;
+        };
+
+        switch (current)
+        {
+            case 0xC4: // bin 8
+            {
+                std::uint8_t len{};
+                return get_number(input_format_t::msgpack, len) &&
+                       get_binary(input_format_t::msgpack, len, result);
+            }
+
+            case 0xC5: // bin 16
+            {
+                std::uint16_t len{};
+                return get_number(input_format_t::msgpack, len) &&
+                       get_binary(input_format_t::msgpack, len, result);
+            }
+
+            case 0xC6: // bin 32
+            {
+                std::uint32_t len{};
+                return get_number(input_format_t::msgpack, len) &&
+                       get_binary(input_format_t::msgpack, len, result);
+            }
+
+            case 0xC7: // ext 8 (read order: length, type byte, payload)
+            {
+                std::uint8_t len{};
+                std::int8_t subtype{};
+                return get_number(input_format_t::msgpack, len) &&
+                       get_number(input_format_t::msgpack, subtype) &&
+                       get_binary(input_format_t::msgpack, len, result) &&
+                       assign_and_return_true(subtype);
+            }
+
+            case 0xC8: // ext 16 (read order: length, type byte, payload)
+            {
+                std::uint16_t len{};
+                std::int8_t subtype{};
+                return get_number(input_format_t::msgpack, len) &&
+                       get_number(input_format_t::msgpack, subtype) &&
+                       get_binary(input_format_t::msgpack, len, result) &&
+                       assign_and_return_true(subtype);
+            }
+
+            case 0xC9: // ext 32 (read order: length, type byte, payload)
+            {
+                std::uint32_t len{};
+                std::int8_t subtype{};
+                return get_number(input_format_t::msgpack, len) &&
+                       get_number(input_format_t::msgpack, subtype) &&
+                       get_binary(input_format_t::msgpack, len, result) &&
+                       assign_and_return_true(subtype);
+            }
+
+            case 0xD4: // fixext 1 (type byte, then a 1-byte payload)
+            {
+                std::int8_t subtype{};
+                return get_number(input_format_t::msgpack, subtype) &&
+                       get_binary(input_format_t::msgpack, 1, result) &&
+                       assign_and_return_true(subtype);
+            }
+
+            case 0xD5: // fixext 2 (type byte, then a 2-byte payload)
+            {
+                std::int8_t subtype{};
+                return get_number(input_format_t::msgpack, subtype) &&
+                       get_binary(input_format_t::msgpack, 2, result) &&
+                       assign_and_return_true(subtype);
+            }
+
+            case 0xD6: // fixext 4 (type byte, then a 4-byte payload)
+            {
+                std::int8_t subtype{};
+                return get_number(input_format_t::msgpack, subtype) &&
+                       get_binary(input_format_t::msgpack, 4, result) &&
+                       assign_and_return_true(subtype);
+            }
+
+            case 0xD7: // fixext 8 (type byte, then an 8-byte payload)
+            {
+                std::int8_t subtype{};
+                return get_number(input_format_t::msgpack, subtype) &&
+                       get_binary(input_format_t::msgpack, 8, result) &&
+                       assign_and_return_true(subtype);
+            }
+
+            case 0xD8: // fixext 16 (type byte, then a 16-byte payload)
+            {
+                std::int8_t subtype{};
+                return get_number(input_format_t::msgpack, subtype) &&
+                       get_binary(input_format_t::msgpack, 16, result) &&
+                       assign_and_return_true(subtype);
+            }
+
+            default:           // LCOV_EXCL_LINE
+                return false;  // LCOV_EXCL_LINE
+        }
+    }
+
+    /*!
+    @brief reads a MessagePack array whose element count is already known
+    @param[in] len  the length of the array
+    @return whether array creation completed
+    */
+    bool get_msgpack_array(const std::size_t len)
+    {
+        if (JSON_HEDLEY_UNLIKELY(!sax->start_array(len)))
+        {
+            return false;
+        }
+
+        // parse elements until all of them are read or one of them fails
+        std::size_t parsed = 0;
+        while (parsed != len && parse_msgpack_internal())
+        {
+            ++parsed;
+        }
+
+        return parsed == len && sax->end_array();
+    }
+
+    /*!
+    @brief reads a MessagePack map whose member count is already known
+    @param[in] len  the length of the object
+    @return whether object creation completed
+    */
+    bool get_msgpack_object(const std::size_t len)
+    {
+        if (JSON_HEDLEY_UNLIKELY(!sax->start_object(len)))
+        {
+            return false;
+        }
+
+        // one buffer serves all keys; it is emptied after each member
+        string_t key;
+        for (std::size_t remaining = len; remaining != 0; --remaining)
+        {
+            get();  // fetch the initial byte of the next key
+            const bool member_ok = get_msgpack_string(key)
+                                   && sax->key(key)
+                                   && parse_msgpack_internal();
+            if (JSON_HEDLEY_UNLIKELY(!member_ok))
+            {
+                return false;
+            }
+            key.clear();
+        }
+
+        return sax->end_object();
+    }
+
+    ////////////
+    // UBJSON //
+    ////////////
+
+    /*!
+    @brief parses one UBJSON value, optionally starting from the character already read
+    @param[in] get_char  true (default) to fetch the type prefix with get_ignore_noop(),
+                         false to use the last read character (`current`) as the prefix
+    @return whether a valid UBJSON value was passed to the SAX parser
+    */
+    bool parse_ubjson_internal(const bool get_char = true)
+    {
+        const char_int_type prefix = get_char ? get_ignore_noop() : current;
+        return get_ubjson_value(prefix);
+    }
+
+    /*!
+    @brief reads a UBJSON string: a length type marker, the length, and then the characters
+    @note called after an explicit 'S' byte, or for an object key where the 'S' byte can be left out
+    @param[out] result   created string
+    @param[in] get_char  true (default) to read a new character first, false to use the last read one
+    @return whether string creation completed
+    */
+    bool get_ubjson_string(string_t& result, const bool get_char = true)
+    {
+        if (get_char)
+        {
+            get();  // TODO(niels): may we ignore N here?
+        }
+
+        if (JSON_HEDLEY_UNLIKELY(!unexpect_eof(input_format_t::ubjson, "value")))
+        {
+            return false;
+        }
+
+        switch (current)
+        {
+            case 'U':
+            {
+                std::uint8_t length{};
+                return get_number(input_format_t::ubjson, length)
+                       && get_string(input_format_t::ubjson, length, result);
+            }
+
+            case 'i':
+            {
+                std::int8_t length{};
+                return get_number(input_format_t::ubjson, length)
+                       && get_string(input_format_t::ubjson, length, result);
+            }
+
+            case 'I':
+            {
+                std::int16_t length{};
+                return get_number(input_format_t::ubjson, length)
+                       && get_string(input_format_t::ubjson, length, result);
+            }
+
+            case 'l':
+            {
+                std::int32_t length{};
+                return get_number(input_format_t::ubjson, length)
+                       && get_string(input_format_t::ubjson, length, result);
+            }
+
+            case 'L':
+            {
+                std::int64_t length{};
+                return get_number(input_format_t::ubjson, length)
+                       && get_string(input_format_t::ubjson, length, result);
+            }
+
+            default:
+            {
+                auto token_hex = get_token_string();
+                return sax->parse_error(chars_read, token_hex, parse_error::create(113, chars_read, exception_message(input_format_t::ubjson, "expected length type specification (U, i, I, l, L); last byte: 0x" + token_hex, "string")));
+            }
+        }
+    }
+
+    /*!
+    @brief reads the size of an optimized UBJSON container
+
+    Expects a length type marker (U, i, I, l, L) followed by the size itself;
+    any other marker is reported as parse error 113. The signed types can
+    encode negative numbers; converting such a value to std::size_t would
+    produce a huge bogus size, so it is rejected with parse error 113 as well.
+
+    @param[out] result  determined size (only written on success)
+
+    @return whether size determination completed
+    */
+    bool get_ubjson_size_value(std::size_t& result)
+    {
+        // validates a size and stores it in result; int64_t can hold every supported length type
+        const auto store_size = [this, &result](const std::int64_t size) -> bool
+        {
+            if (JSON_HEDLEY_UNLIKELY(size < 0))
+            {
+                auto last_token = get_token_string();
+                return sax->parse_error(chars_read, last_token, parse_error::create(113, chars_read, exception_message(input_format_t::ubjson, "count in an optimized container must be positive", "size")));
+            }
+            result = static_cast<std::size_t>(size);
+            return true;
+        };
+
+        switch (get_ignore_noop())
+        {
+            case 'U':  // uint8
+            {
+                std::uint8_t number{};
+                return get_number(input_format_t::ubjson, number)
+                       && store_size(number);
+            }
+
+            case 'i':  // int8
+            {
+                std::int8_t number{};
+                return get_number(input_format_t::ubjson, number)
+                       && store_size(number);
+            }
+
+            case 'I':  // int16
+            {
+                std::int16_t number{};
+                return get_number(input_format_t::ubjson, number)
+                       && store_size(number);
+            }
+
+            case 'l':  // int32
+            {
+                std::int32_t number{};
+                return get_number(input_format_t::ubjson, number)
+                       && store_size(number);
+            }
+
+            case 'L':  // int64
+            {
+                std::int64_t number{};
+                return get_number(input_format_t::ubjson, number)
+                       && store_size(number);
+            }
+
+            default:
+            {
+                auto last_token = get_token_string();
+                return sax->parse_error(chars_read, last_token, parse_error::create(113, chars_read, exception_message(input_format_t::ubjson, "expected length type specification (U, i, I, l, L) after '#'; last byte: 0x" + last_token, "size")));
+            }
+        }
+    }
+
+    /*!
+    @brief determine the type and size for a container
+
+    In the optimized UBJSON format, a container can announce its size with a
+    '#' marker, optionally preceded by a '$' marker and a type. A type that
+    is not followed by '#' is reported as a parse error.
+
+    @param[out] result  pair of the size (string_t::npos if none) and the type (0 if none)
+    @return whether pair creation completed
+    */
+    bool get_ubjson_size_type(std::pair<std::size_t, char_int_type>& result)
+    {
+        std::size_t& size = result.first;
+        char_int_type& type = result.second;
+        size = string_t::npos;
+        type = 0;
+
+        get_ignore_noop();
+        if (current == '#')
+        {
+            return get_ubjson_size_value(size);
+        }
+        if (current != '$')
+        {
+            return true;
+        }
+
+        type = get();  // must not ignore 'N', because 'N' maybe the type
+        if (JSON_HEDLEY_UNLIKELY(!unexpect_eof(input_format_t::ubjson, "type")))
+        {
+            return false;
+        }
+
+        get_ignore_noop();
+        if (JSON_HEDLEY_UNLIKELY(current != '#'))
+        {
+            if (JSON_HEDLEY_UNLIKELY(!unexpect_eof(input_format_t::ubjson, "value")))
+            {
+                return false;
+            }
+            auto token_hex = get_token_string();
+            return sax->parse_error(chars_read, token_hex, parse_error::create(112, chars_read, exception_message(input_format_t::ubjson, "expected '#' after type information; last byte: 0x" + token_hex, "size")));
+        }
+
+        return get_ubjson_size_value(size);
+    }
+
+    /*! @brief handles the UBJSON value announced by @a prefix and passes it to the SAX consumer
+    @param prefix  the previously read or set type prefix
+    @return whether value creation completed
+    */
+    bool get_ubjson_value(const char_int_type prefix)
+    {
+        switch (prefix)
+        {
+            case std::char_traits<char_type>::eof():  // EOF
+                return unexpect_eof(input_format_t::ubjson, "value");
+
+            case 'T':  // true
+                return sax->boolean(true);
+            case 'F':  // false
+                return sax->boolean(false);
+
+            case 'Z':  // null
+                return sax->null();
+
+            case 'U':  // uint8
+            {
+                std::uint8_t number{};
+                return get_number(input_format_t::ubjson, number) && sax->number_unsigned(number);
+            }
+
+            case 'i':  // int8
+            {
+                std::int8_t number{};
+                return get_number(input_format_t::ubjson, number) && sax->number_integer(number);
+            }
+
+            case 'I':  // int16
+            {
+                std::int16_t number{};
+                return get_number(input_format_t::ubjson, number) && sax->number_integer(number);
+            }
+
+            case 'l':  // int32
+            {
+                std::int32_t number{};
+                return get_number(input_format_t::ubjson, number) && sax->number_integer(number);
+            }
+
+            case 'L':  // int64
+            {
+                std::int64_t number{};
+                return get_number(input_format_t::ubjson, number) && sax->number_integer(number);
+            }
+
+            case 'd':  // float
+            {
+                float number{};
+                return get_number(input_format_t::ubjson, number) && sax->number_float(static_cast<number_float_t>(number), "");
+            }
+
+            case 'D':  // double
+            {
+                double number{};
+                return get_number(input_format_t::ubjson, number) && sax->number_float(static_cast<number_float_t>(number), "");
+            }
+
+            case 'H':  // high-precision number
+            {
+                return get_ubjson_high_precision_number();
+            }
+
+            case 'C':  // char
+            {
+                get();
+                if (JSON_HEDLEY_UNLIKELY(!unexpect_eof(input_format_t::ubjson, "char")))
+                {
+                    return false;
+                }
+                if (JSON_HEDLEY_UNLIKELY(current > 127))
+                {
+                    auto last_token = get_token_string();
+                    return sax->parse_error(chars_read, last_token, parse_error::create(113, chars_read, exception_message(input_format_t::ubjson, "byte after 'C' must be in range 0x00..0x7F; last byte: 0x" + last_token, "char")));
+                }
+                string_t s(1, static_cast<typename string_t::value_type>(current));  // a char is reported as a string of length one
+                return sax->string(s);
+            }
+
+            case 'S':  // string
+            {
+                string_t s;
+                return get_ubjson_string(s) && sax->string(s);  // get_char defaults to true, so the length type marker is read from the input
+            }
+
+            case '[':  // array
+                return get_ubjson_array();
+
+            case '{':  // object
+                return get_ubjson_object();
+
+            default: // anything else
+            {
+                auto last_token = get_token_string();
+                return sax->parse_error(chars_read, last_token, parse_error::create(112, chars_read, exception_message(input_format_t::ubjson, "invalid byte: 0x" + last_token, "value")));
+            }
+        }
+    }
+
+    /*!
+    @brief read a UBJSON array and forward it to the SAX consumer
+
+    Size and element type are obtained from get_ubjson_size_type(). A size
+    equal to string_t::npos is treated as "no size given": values are then
+    read until the current byte is ']'. Otherwise exactly that many elements
+    are read. If an element type other than 0 was reported, every element is
+    read with get_ubjson_value() for that type; for type 'N' no element is
+    read at all.
+
+    @return whether array creation completed
+    */
+    bool get_ubjson_array()
+    {
+        std::pair<std::size_t, char_int_type> size_and_type;
+        if (JSON_HEDLEY_UNLIKELY(!get_ubjson_size_type(size_and_type)))
+        {
+            return false;
+        }
+
+        const std::size_t element_count = size_and_type.first;
+        const char_int_type element_type = size_and_type.second;
+
+        // no size given: consume values until the closing bracket is current
+        if (element_count == string_t::npos)
+        {
+            if (JSON_HEDLEY_UNLIKELY(!sax->start_array(std::size_t(-1))))
+            {
+                return false;
+            }
+
+            while (current != ']')
+            {
+                // the first byte of the value is already in `current`
+                if (JSON_HEDLEY_UNLIKELY(!parse_ubjson_internal(false)))
+                {
+                    return false;
+                }
+                get_ignore_noop();
+            }
+
+            return sax->end_array();
+        }
+
+        // size given: the number of elements is known up front
+        if (JSON_HEDLEY_UNLIKELY(!sax->start_array(element_count)))
+        {
+            return false;
+        }
+
+        if (element_type == 0)
+        {
+            // every element carries its own type marker
+            for (std::size_t remaining = element_count; remaining > 0; --remaining)
+            {
+                if (JSON_HEDLEY_UNLIKELY(!parse_ubjson_internal()))
+                {
+                    return false;
+                }
+            }
+        }
+        else if (element_type != 'N')
+        {
+            // all elements share one type; for 'N' nothing is read
+            for (std::size_t remaining = element_count; remaining > 0; --remaining)
+            {
+                if (JSON_HEDLEY_UNLIKELY(!get_ubjson_value(element_type)))
+                {
+                    return false;
+                }
+            }
+        }
+
+        return sax->end_array();
+    }
+
+    /*!
+    @brief read a UBJSON object and forward it to the SAX consumer
+
+    Size and value type are obtained from get_ubjson_size_type(). A size equal
+    to string_t::npos is treated as "no size given": key/value pairs are then
+    read until the current byte is '}'. Otherwise exactly that many pairs are
+    read; if a value type other than 0 was reported, each value is read with
+    get_ubjson_value() for that type.
+
+    @return whether object creation completed
+    */
+    bool get_ubjson_object()
+    {
+        std::pair<std::size_t, char_int_type> size_and_type;
+        if (JSON_HEDLEY_UNLIKELY(!get_ubjson_size_type(size_and_type)))
+        {
+            return false;
+        }
+
+        const std::size_t member_count = size_and_type.first;
+        const char_int_type value_type = size_and_type.second;
+
+        // buffer for the current key; emptied after every member
+        string_t key;
+
+        // no size given: consume members until the closing brace is current
+        if (member_count == string_t::npos)
+        {
+            if (JSON_HEDLEY_UNLIKELY(!sax->start_object(std::size_t(-1))))
+            {
+                return false;
+            }
+
+            while (current != '}')
+            {
+                // the first byte of the key is already in `current`
+                if (JSON_HEDLEY_UNLIKELY(!get_ubjson_string(key, false) || !sax->key(key)))
+                {
+                    return false;
+                }
+                if (JSON_HEDLEY_UNLIKELY(!parse_ubjson_internal()))
+                {
+                    return false;
+                }
+                get_ignore_noop();
+                key.clear();
+            }
+
+            return sax->end_object();
+        }
+
+        // size given: the number of members is known up front
+        if (JSON_HEDLEY_UNLIKELY(!sax->start_object(member_count)))
+        {
+            return false;
+        }
+
+        for (std::size_t remaining = member_count; remaining > 0; --remaining)
+        {
+            if (JSON_HEDLEY_UNLIKELY(!get_ubjson_string(key) || !sax->key(key)))
+            {
+                return false;
+            }
+
+            // a shared value type (non-zero) is read directly; otherwise the
+            // value brings its own type marker
+            const bool value_read = (value_type != 0)
+                                    ? get_ubjson_value(value_type)
+                                    : parse_ubjson_internal();
+            if (JSON_HEDLEY_UNLIKELY(!value_read))
+            {
+                return false;
+            }
+            key.clear();
+        }
+
+        return sax->end_object();
+    }
+
+    // Note, no reader for UBJSON binary types is implemented because they do
+    // not exist
+
+    /*!
+    @brief read a UBJSON high-precision number and forward it to the SAX consumer
+
+    Reads a length with get_ubjson_size_value(), then that many bytes of
+    number text, and scans the text with a JSON lexer. The text must consist
+    of exactly one integer, unsigned, or floating-point token; anything else
+    is reported to the SAX consumer as parse error 115.
+
+    @return whether the number was read and accepted by the SAX consumer
+    */
+    bool get_ubjson_high_precision_number()
+    {
+        // get size of following number string
+        std::size_t size{};
+        auto res = get_ubjson_size_value(size);
+        if (JSON_HEDLEY_UNLIKELY(!res))
+        {
+            return res;
+        }
+
+        // get number string (grown byte by byte; nothing is reserved up front)
+        std::vector<char> number_vector;
+        for (std::size_t i = 0; i < size; ++i)
+        {
+            get();
+            if (JSON_HEDLEY_UNLIKELY(!unexpect_eof(input_format_t::ubjson, "number")))
+            {
+                return false;
+            }
+            number_vector.push_back(static_cast<char>(current));
+        }
+
+        // parse number string; the adapter only reads from the vector, so it
+        // is passed as a plain lvalue
+        auto number_ia = detail::input_adapter(number_vector);
+        auto number_lexer = detail::lexer<BasicJsonType, decltype(number_ia)>(std::move(number_ia), false);
+        const auto result_number = number_lexer.scan();
+        const auto number_string = number_lexer.get_token_string();
+        // a second scan must hit the end of the text, otherwise there is
+        // trailing content after the number
+        const auto result_remainder = number_lexer.scan();
+
+        using token_type = typename detail::lexer_base<BasicJsonType>::token_type;
+
+        if (JSON_HEDLEY_UNLIKELY(result_remainder != token_type::end_of_input))
+        {
+            return sax->parse_error(chars_read, number_string, parse_error::create(115, chars_read, exception_message(input_format_t::ubjson, "invalid number text: " + number_lexer.get_token_string(), "high-precision number")));
+        }
+
+        switch (result_number)
+        {
+            case token_type::value_integer:
+                return sax->number_integer(number_lexer.get_number_integer());
+            case token_type::value_unsigned:
+                return sax->number_unsigned(number_lexer.get_number_unsigned());
+            case token_type::value_float:
+                // number_string is const, so it is passed as is
+                return sax->number_float(number_lexer.get_number_float(), number_string);
+            default:
+                return sax->parse_error(chars_read, number_string, parse_error::create(115, chars_read, exception_message(input_format_t::ubjson, "invalid number text: " + number_lexer.get_token_string(), "high-precision number")));
+        }
+    }
+
+    ///////////////////////
+    // Utility functions //
+    ///////////////////////
+
+    /*!
+    @brief fetch the next character from the input adapter
+
+    Increments the count of characters read and stores the fetched character
+    as the current one. The function does not throw when the input is
+    exhausted; in that case the adapter's value is returned unchanged, which
+    is `std::char_traits<char_type>::eof()`.
+
+    @return character read from the input
+    */
+    char_int_type get()
+    {
+        ++chars_read;
+        current = ia.get_character();
+        return current;
+    }
+
+    /*!
+    @brief fetch the next character that is not a no-op marker
+
+    At least one character is always read; further characters are read for
+    as long as the current one is 'N'.
+
+    @return character read from the input after ignoring all 'N' entries
+    */
+    char_int_type get_ignore_noop()
+    {
+        get();
+        while (current == 'N')
+        {
+            get();
+        }
+
+        return current;
+    }
+
+    /*!
+    @brief read a fixed-width number from the input
+
+    Reads `sizeof(NumberType)` bytes and copies them into @a result.
+
+    @tparam NumberType the type of the number
+    @tparam InputIsLittleEndian whether the bytes in the input are in little
+            endian order (defaults to false)
+    @param[in] format   the current format (for diagnostics)
+    @param[out] result  number of type @a NumberType; only written once all
+                        bytes have been read
+
+    @return whether conversion completed
+
+    @note This function needs to respect the system's endianness, because
+          bytes in CBOR, MessagePack, and UBJSON are stored in network order
+          (big endian) and therefore need reordering on little endian systems.
+    */
+    template<typename NumberType, bool InputIsLittleEndian = false>
+    bool get_number(const input_format_t format, NumberType& result)
+    {
+        constexpr std::size_t width = sizeof(NumberType);
+
+        // bytes are mirrored when the system's order differs from the input's
+        const bool reverse_bytes = (is_little_endian != InputIsLittleEndian);
+
+        // step 1: collect the bytes in the system's byte order
+        std::array<std::uint8_t, width> buffer;
+        for (std::size_t pos = 0; pos < width; ++pos)
+        {
+            get();
+            if (JSON_HEDLEY_UNLIKELY(!unexpect_eof(format, "number")))
+            {
+                return false;
+            }
+
+            const auto byte = static_cast<std::uint8_t>(current);
+            if (reverse_bytes)
+            {
+                buffer[width - pos - 1] = byte;
+            }
+            else
+            {
+                buffer[pos] = byte; // LCOV_EXCL_LINE
+            }
+        }
+
+        // step 2: reinterpret the collected bytes as a NumberType
+        std::memcpy(&result, buffer.data(), width);
+        return true;
+    }
+
+    /*!
+    @brief create a string by reading characters from the input
+
+    The characters are appended to @a result; existing content is kept. If
+    the input ends early, the characters read so far remain in @a result.
+
+    @tparam NumberType the type of the number
+    @param[in] format the current format (for diagnostics)
+    @param[in] len number of characters to read
+    @param[in,out] result string extended by reading @a len bytes
+
+    @return whether string creation completed
+
+    @note We can not reserve @a len bytes for the result, because @a len
+          may be too large. Usually, @ref unexpect_eof() detects the end of
+          the input before we run out of string memory.
+    */
+    template<typename NumberType>
+    bool get_string(const input_format_t format,
+                    const NumberType len,
+                    string_t& result)
+    {
+        for (NumberType i = 0; i < len; i++)
+        {
+            get();
+            if (JSON_HEDLEY_UNLIKELY(!unexpect_eof(format, "string")))
+            {
+                // input ended before len characters were available
+                return false;
+            }
+            result.push_back(static_cast<typename string_t::value_type>(current));
+        }
+        return true;
+    }
+
+    /*!
+    @brief create a byte array by reading bytes from the input
+
+    The bytes are appended to @a result. If the input ends early, the bytes
+    read so far remain in @a result.
+
+    @tparam NumberType the type of the number
+    @param[in] format the current format (for diagnostics)
+    @param[in] len number of bytes to read
+    @param[in,out] result byte array extended by reading @a len bytes
+
+    @return whether byte array creation completed
+
+    @note We can not reserve @a len bytes for the result, because @a len
+          may be too large. Usually, @ref unexpect_eof() detects the end of
+          the input before we run out of memory.
+    */
+    template<typename NumberType>
+    bool get_binary(const input_format_t format,
+                    const NumberType len,
+                    binary_t& result)
+    {
+        for (NumberType i = 0; i < len; i++)
+        {
+            get();
+            if (JSON_HEDLEY_UNLIKELY(!unexpect_eof(format, "binary")))
+            {
+                // input ended before len bytes were available
+                return false;
+            }
+            result.push_back(static_cast<std::uint8_t>(current));
+        }
+        return true;
+    }
+
+    /*!
+    @brief report a parse error if the last read character is EOF
+
+    @param[in] format   the current format (for diagnostics)
+    @param[in] context  further context information (for diagnostics)
+    @return true if the last read character is not EOF; otherwise the result
+            of reporting parse error 110 to the SAX consumer
+    */
+    JSON_HEDLEY_NON_NULL(3)
+    bool unexpect_eof(const input_format_t format, const char* context) const
+    {
+        if (JSON_HEDLEY_LIKELY(current != std::char_traits<char_type>::eof()))
+        {
+            return true;
+        }
+
+        return sax->parse_error(chars_read, "<end of file>",
+                                parse_error::create(110, chars_read, exception_message(format, "unexpected end of input", context)));
+    }
+
+    /*!
+    @brief format the last read byte as two uppercase hexadecimal digits
+
+    The current character is converted to `unsigned char` before formatting.
+
+    @return a string representation of the last read byte
+    */
+    std::string get_token_string() const
+    {
+        // room for two hex digits and the terminating null character
+        std::array<char, 3> hex_buffer{{}};
+        const auto last_byte = static_cast<unsigned char>(current);
+        (std::snprintf)(hex_buffer.data(), hex_buffer.size(), "%.2hhX", last_byte);
+        return std::string{hex_buffer.data()};
+    }
+
+    /*!
+    @brief compose the message text for a parse_error of a binary format
+
+    The result has the form
+    `syntax error while parsing <FORMAT> <context>: <detail>`.
+
+    @param[in] format   the current format
+    @param[in] detail   a detailed error message
+    @param[in] context  further context information
+    @return a message string to use in the parse_error exceptions
+    */
+    std::string exception_message(const input_format_t format,
+                                  const std::string& detail,
+                                  const std::string& context) const
+    {
+        // human-readable name of the format; stays empty for unknown values
+        const char* format_name = "";
+
+        switch (format)
+        {
+            case input_format_t::cbor:
+                format_name = "CBOR";
+                break;
+
+            case input_format_t::msgpack:
+                format_name = "MessagePack";
+                break;
+
+            case input_format_t::ubjson:
+                format_name = "UBJSON";
+                break;
+
+            case input_format_t::bson:
+                format_name = "BSON";
+                break;
+
+            default:            // LCOV_EXCL_LINE
+                JSON_ASSERT(false);  // LCOV_EXCL_LINE
+        }
+
+        std::string message = "syntax error while parsing ";
+        message += format_name;
+        message += " ";
+        message += context;
+        message += ": ";
+        message += detail;
+        return message;
+    }
+
+  private:
+    /// input adapter
+    InputAdapterType ia;
+
+    /// the current character
+    char_int_type current = std::char_traits<char_type>::eof();
+
+    /// the number of characters read
+    std::size_t chars_read = 0;
+
+    /// whether we can assume little endianness
+    const bool is_little_endian = little_endianess();
+
+    /// the SAX parser
+    json_sax_t* sax = nullptr;
+};
+}  // namespace detail
+}  // namespace nlohmann
+
+// #include <nlohmann/detail/input/input_adapters.hpp>
+
+// #include <nlohmann/detail/input/lexer.hpp>
+
+// #include <nlohmann/detail/input/parser.hpp>
+
+
+#include <cmath> // isfinite
+#include <cstdint> // uint8_t
+#include <functional> // function
+#include <string> // string
+#include <utility> // move
+#include <vector> // vector
+
+// #include <nlohmann/detail/exceptions.hpp>
+
+// #include <nlohmann/detail/input/input_adapters.hpp>
+
+// #include <nlohmann/detail/input/json_sax.hpp>
+
+// #include <nlohmann/detail/input/lexer.hpp>
+
+// #include <nlohmann/detail/macro_scope.hpp>
+
+// #include <nlohmann/detail/meta/is_sax.hpp>
+
+// #include <nlohmann/detail/value_t.hpp>
+
+
+namespace nlohmann
+{
+namespace detail
+{
+////////////
+// parser //
+////////////
+
+/// events reported to a parser callback; stored in a single byte
+enum class parse_event_t : std::uint8_t
+{
+    /// the parser read `{` and started to process a JSON object
+    object_start,
+    /// the parser read `}` and finished processing a JSON object
+    object_end,
+    /// the parser read `[` and started to process a JSON array
+    array_start,
+    /// the parser read `]` and finished processing a JSON array
+    array_end,
+    /// the parser read a key of a value in an object
+    key,
+    /// the parser finished reading a JSON value
+    value
+};
+
+/*!
+@brief signature of the callback a parser can be constructed with
+
+The callable receives a depth, the kind of parse event, and a reference to a
+JSON value, and returns a bool.
+
+NOTE(review): the meaning of the returned bool is decided by the code that
+invokes the callback, which is not part of this section; presumably false
+asks for the value to be discarded - confirm against the callback parser.
+*/
+template<typename BasicJsonType>
+using parser_callback_t =
+    std::function<bool(int depth, parse_event_t event, BasicJsonType& parsed)>;
+
+/*!
+@brief syntax analysis
+
+This class drives a lexer over the input and reports the recognized JSON
+structure to a SAX consumer. Nesting of arrays and objects is tracked with an
+explicit stack inside sax_parse_internal() rather than with recursive calls.
+
+@tparam BasicJsonType the JSON value type to produce
+@tparam InputAdapterType the type of the input adapter handed to the lexer
+*/
+template<typename BasicJsonType, typename InputAdapterType>
+class parser
+{
+    using number_integer_t = typename BasicJsonType::number_integer_t;
+    using number_unsigned_t = typename BasicJsonType::number_unsigned_t;
+    using number_float_t = typename BasicJsonType::number_float_t;
+    using string_t = typename BasicJsonType::string_t;
+    using lexer_t = lexer<BasicJsonType, InputAdapterType>;
+    using token_type = typename lexer_t::token_type;
+
+  public:
+    /*!
+    @brief a parser reading from an input adapter
+
+    The first token is read during construction, so the parse functions can
+    start by inspecting the last read token.
+
+    @param[in] adapter            input adapter; moved into the lexer
+    @param[in] cb                 optional callback; when set, parse() uses
+                                  the callback-aware DOM builder
+    @param[in] allow_exceptions_  passed on to the DOM builders in parse()
+    @param[in] skip_comments      passed on to the lexer
+    */
+    explicit parser(InputAdapterType&& adapter,
+                    const parser_callback_t<BasicJsonType> cb = nullptr,
+                    const bool allow_exceptions_ = true,
+                    const bool skip_comments = false)
+        : callback(cb)
+        , m_lexer(std::move(adapter), skip_comments)
+        , allow_exceptions(allow_exceptions_)
+    {
+        // read first token
+        get_token();
+    }
+
+    /*!
+    @brief public parser interface
+
+    Builds @a result through a SAX-to-DOM adapter. If an error was recorded
+    by the adapter, @a result is set to a discarded value.
+
+    @param[in] strict      whether to expect the last token to be EOF
+    @param[in,out] result  parsed JSON value
+
+    @throw parse_error.101 in case of an unexpected token
+    @throw parse_error.102 if to_unicode fails or surrogate error
+    @throw parse_error.103 if to_unicode fails
+    */
+    void parse(const bool strict, BasicJsonType& result)
+    {
+        if (callback)
+        {
+            // a callback was given: use the DOM builder that consults it
+            json_sax_dom_callback_parser<BasicJsonType> sdp(result, callback, allow_exceptions);
+            // the return value is not needed; errors are queried from sdp below
+            sax_parse_internal(&sdp);
+            result.assert_invariant();
+
+            // in strict mode, input must be completely read
+            if (strict && (get_token() != token_type::end_of_input))
+            {
+                sdp.parse_error(m_lexer.get_position(),
+                                m_lexer.get_token_string(),
+                                parse_error::create(101, m_lexer.get_position(),
+                                                    exception_message(token_type::end_of_input, "value")));
+            }
+
+            // in case of an error, return discarded value
+            if (sdp.is_errored())
+            {
+                result = value_t::discarded;
+                return;
+            }
+
+            // set top-level value to null if it was discarded by the callback
+            // function
+            if (result.is_discarded())
+            {
+                result = nullptr;
+            }
+        }
+        else
+        {
+            // no callback: use the plain DOM builder
+            json_sax_dom_parser<BasicJsonType> sdp(result, allow_exceptions);
+            // the return value is not needed; errors are queried from sdp below
+            sax_parse_internal(&sdp);
+            result.assert_invariant();
+
+            // in strict mode, input must be completely read
+            if (strict && (get_token() != token_type::end_of_input))
+            {
+                sdp.parse_error(m_lexer.get_position(),
+                                m_lexer.get_token_string(),
+                                parse_error::create(101, m_lexer.get_position(),
+                                                    exception_message(token_type::end_of_input, "value")));
+            }
+
+            // in case of an error, return discarded value
+            if (sdp.is_errored())
+            {
+                result = value_t::discarded;
+                return;
+            }
+        }
+    }
+
+    /*!
+    @brief public accept interface
+
+    Runs sax_parse() with an acceptor as SAX consumer, so no JSON value is
+    built.
+
+    @param[in] strict  whether to expect the last token to be EOF
+    @return whether the input is a proper JSON text
+    */
+    bool accept(const bool strict = true)
+    {
+        json_sax_acceptor<BasicJsonType> sax_acceptor;
+        return sax_parse(&sax_acceptor, strict);
+    }
+
+    /*!
+    @brief parse the input and report all events to a SAX consumer
+
+    @tparam SAX type of the SAX consumer; checked at compile time with
+            is_sax_static_asserts
+    @param[in] sax     SAX consumer; must not be null
+    @param[in] strict  whether to expect the last token to be EOF
+    @return whether parsing succeeded; if parsing succeeded but strict mode
+            found further input, the result of reporting parse error 101 to
+            @a sax
+    */
+    template<typename SAX>
+    JSON_HEDLEY_NON_NULL(2)
+    bool sax_parse(SAX* sax, const bool strict = true)
+    {
+        (void)detail::is_sax_static_asserts<SAX, BasicJsonType> {};
+        const bool result = sax_parse_internal(sax);
+
+        // strict mode: next byte must be EOF
+        if (result && strict && (get_token() != token_type::end_of_input))
+        {
+            return sax->parse_error(m_lexer.get_position(),
+                                    m_lexer.get_token_string(),
+                                    parse_error::create(101, m_lexer.get_position(),
+                                            exception_message(token_type::end_of_input, "value")));
+        }
+
+        return result;
+    }
+
+  private:
+    /*!
+    @brief parse one JSON value and report its events to a SAX consumer
+
+    The function loops over tokens and keeps a stack of the arrays and
+    objects that are currently open. It returns as soon as a value has been
+    completed while that stack is empty, when the SAX consumer returns false,
+    or when an unexpected token is found.
+
+    @tparam SAX type of the SAX consumer
+    @param[in] sax  SAX consumer; must not be null
+    @return true if a complete value was parsed; false if the SAX consumer
+            returned false; otherwise the result of sax->parse_error()
+    */
+    template<typename SAX>
+    JSON_HEDLEY_NON_NULL(2)
+    bool sax_parse_internal(SAX* sax)
+    {
+        // stack to remember the hierarchy of structured values we are parsing
+        // true = array; false = object
+        std::vector<bool> states;
+        // value to avoid a goto (see comment where set to true)
+        bool skip_to_state_evaluation = false;
+
+        while (true)
+        {
+            if (!skip_to_state_evaluation)
+            {
+                // invariant: get_token() was called before each iteration
+                switch (last_token)
+                {
+                    case token_type::begin_object:
+                    {
+                        // the number of members is not known in advance
+                        if (JSON_HEDLEY_UNLIKELY(!sax->start_object(std::size_t(-1))))
+                        {
+                            return false;
+                        }
+
+                        // closing } -> we are done
+                        if (get_token() == token_type::end_object)
+                        {
+                            if (JSON_HEDLEY_UNLIKELY(!sax->end_object()))
+                            {
+                                return false;
+                            }
+                            break;
+                        }
+
+                        // parse key
+                        if (JSON_HEDLEY_UNLIKELY(last_token != token_type::value_string))
+                        {
+                            return sax->parse_error(m_lexer.get_position(),
+                                                    m_lexer.get_token_string(),
+                                                    parse_error::create(101, m_lexer.get_position(),
+                                                            exception_message(token_type::value_string, "object key")));
+                        }
+                        if (JSON_HEDLEY_UNLIKELY(!sax->key(m_lexer.get_string())))
+                        {
+                            return false;
+                        }
+
+                        // parse separator (:)
+                        if (JSON_HEDLEY_UNLIKELY(get_token() != token_type::name_separator))
+                        {
+                            return sax->parse_error(m_lexer.get_position(),
+                                                    m_lexer.get_token_string(),
+                                                    parse_error::create(101, m_lexer.get_position(),
+                                                            exception_message(token_type::name_separator, "object separator")));
+                        }
+
+                        // remember we are now inside an object
+                        states.push_back(false);
+
+                        // parse values
+                        get_token();
+                        continue;
+                    }
+
+                    case token_type::begin_array:
+                    {
+                        // the number of elements is not known in advance
+                        if (JSON_HEDLEY_UNLIKELY(!sax->start_array(std::size_t(-1))))
+                        {
+                            return false;
+                        }
+
+                        // closing ] -> we are done
+                        if (get_token() == token_type::end_array)
+                        {
+                            if (JSON_HEDLEY_UNLIKELY(!sax->end_array()))
+                            {
+                                return false;
+                            }
+                            break;
+                        }
+
+                        // remember we are now inside an array
+                        states.push_back(true);
+
+                        // parse values (no need to call get_token)
+                        continue;
+                    }
+
+                    case token_type::value_float:
+                    {
+                        const auto res = m_lexer.get_number_float();
+
+                        // a non-finite result is reported as an overflow (406)
+                        if (JSON_HEDLEY_UNLIKELY(!std::isfinite(res)))
+                        {
+                            return sax->parse_error(m_lexer.get_position(),
+                                                    m_lexer.get_token_string(),
+                                                    out_of_range::create(406, "number overflow parsing '" + m_lexer.get_token_string() + "'"));
+                        }
+
+                        if (JSON_HEDLEY_UNLIKELY(!sax->number_float(res, m_lexer.get_string())))
+                        {
+                            return false;
+                        }
+
+                        break;
+                    }
+
+                    case token_type::literal_false:
+                    {
+                        if (JSON_HEDLEY_UNLIKELY(!sax->boolean(false)))
+                        {
+                            return false;
+                        }
+                        break;
+                    }
+
+                    case token_type::literal_null:
+                    {
+                        if (JSON_HEDLEY_UNLIKELY(!sax->null()))
+                        {
+                            return false;
+                        }
+                        break;
+                    }
+
+                    case token_type::literal_true:
+                    {
+                        if (JSON_HEDLEY_UNLIKELY(!sax->boolean(true)))
+                        {
+                            return false;
+                        }
+                        break;
+                    }
+
+                    case token_type::value_integer:
+                    {
+                        if (JSON_HEDLEY_UNLIKELY(!sax->number_integer(m_lexer.get_number_integer())))
+                        {
+                            return false;
+                        }
+                        break;
+                    }
+
+                    case token_type::value_string:
+                    {
+                        if (JSON_HEDLEY_UNLIKELY(!sax->string(m_lexer.get_string())))
+                        {
+                            return false;
+                        }
+                        break;
+                    }
+
+                    case token_type::value_unsigned:
+                    {
+                        if (JSON_HEDLEY_UNLIKELY(!sax->number_unsigned(m_lexer.get_number_unsigned())))
+                        {
+                            return false;
+                        }
+                        break;
+                    }
+
+                    case token_type::parse_error:
+                    {
+                        // using "uninitialized" to avoid "expected" message
+                        return sax->parse_error(m_lexer.get_position(),
+                                                m_lexer.get_token_string(),
+                                                parse_error::create(101, m_lexer.get_position(),
+                                                        exception_message(token_type::uninitialized, "value")));
+                    }
+
+                    default: // the last token was unexpected
+                    {
+                        return sax->parse_error(m_lexer.get_position(),
+                                                m_lexer.get_token_string(),
+                                                parse_error::create(101, m_lexer.get_position(),
+                                                        exception_message(token_type::literal_or_value, "value")));
+                    }
+                }
+            }
+            else
+            {
+                // a container was just closed: only the state evaluation
+                // below runs in this iteration; reset the flag for the next
+                skip_to_state_evaluation = false;
+            }
+
+            // we reached this line after we successfully parsed a value
+            if (states.empty())
+            {
+                // empty stack: we reached the end of the hierarchy: done
+                return true;
+            }
+
+            if (states.back())  // array
+            {
+                // comma -> next value
+                if (get_token() == token_type::value_separator)
+                {
+                    // parse a new value
+                    get_token();
+                    continue;
+                }
+
+                // closing ]
+                if (JSON_HEDLEY_LIKELY(last_token == token_type::end_array))
+                {
+                    if (JSON_HEDLEY_UNLIKELY(!sax->end_array()))
+                    {
+                        return false;
+                    }
+
+                    // We are done with this array. Before we can parse a
+                    // new value, we need to evaluate the new state first.
+                    // By setting skip_to_state_evaluation to true, we
+                    // are effectively jumping to the beginning of this if.
+                    JSON_ASSERT(!states.empty());
+                    states.pop_back();
+                    skip_to_state_evaluation = true;
+                    continue;
+                }
+
+                // neither a comma nor the closing bracket
+                return sax->parse_error(m_lexer.get_position(),
+                                        m_lexer.get_token_string(),
+                                        parse_error::create(101, m_lexer.get_position(),
+                                                exception_message(token_type::end_array, "array")));
+            }
+            else  // object
+            {
+                // comma -> next value
+                if (get_token() == token_type::value_separator)
+                {
+                    // parse key
+                    if (JSON_HEDLEY_UNLIKELY(get_token() != token_type::value_string))
+                    {
+                        return sax->parse_error(m_lexer.get_position(),
+                                                m_lexer.get_token_string(),
+                                                parse_error::create(101, m_lexer.get_position(),
+                                                        exception_message(token_type::value_string, "object key")));
+                    }
+
+                    if (JSON_HEDLEY_UNLIKELY(!sax->key(m_lexer.get_string())))
+                    {
+                        return false;
+                    }
+
+                    // parse separator (:)
+                    if (JSON_HEDLEY_UNLIKELY(get_token() != token_type::name_separator))
+                    {
+                        return sax->parse_error(m_lexer.get_position(),
+                                                m_lexer.get_token_string(),
+                                                parse_error::create(101, m_lexer.get_position(),
+                                                        exception_message(token_type::name_separator, "object separator")));
+                    }
+
+                    // parse values
+                    get_token();
+                    continue;
+                }
+
+                // closing }
+                if (JSON_HEDLEY_LIKELY(last_token == token_type::end_object))
+                {
+                    if (JSON_HEDLEY_UNLIKELY(!sax->end_object()))
+                    {
+                        return false;
+                    }
+
+                    // We are done with this object. Before we can parse a
+                    // new value, we need to evaluate the new state first.
+                    // By setting skip_to_state_evaluation to true, we
+                    // are effectively jumping to the beginning of this if.
+                    JSON_ASSERT(!states.empty());
+                    states.pop_back();
+                    skip_to_state_evaluation = true;
+                    continue;
+                }
+
+                // neither a comma nor the closing brace
+                return sax->parse_error(m_lexer.get_position(),
+                                        m_lexer.get_token_string(),
+                                        parse_error::create(101, m_lexer.get_position(),
+                                                exception_message(token_type::end_object, "object")));
+            }
+        }
+    }
+
+    /// get next token from lexer and remember it as the last read token
+    token_type get_token()
+    {
+        return last_token = m_lexer.scan();
+    }
+
+    /*!
+    @brief compose the message text for a syntax error
+
+    The message starts with "syntax error ", optionally followed by
+    "while parsing <context> ", then "- " and a description of the last read
+    token: the lexer's error message and token text if the last token is a
+    parse error, or "unexpected <token name>" otherwise.
+
+    @param[in] expected  the token that was expected; its name is appended as
+                         "; expected <token name>" unless it is
+                         token_type::uninitialized
+    @param[in] context   what was being parsed; may be empty
+    @return the composed message
+    */
+    std::string exception_message(const token_type expected, const std::string& context)
+    {
+        std::string error_msg = "syntax error ";
+
+        if (!context.empty())
+        {
+            error_msg += "while parsing " + context + " ";
+        }
+
+        error_msg += "- ";
+
+        if (last_token == token_type::parse_error)
+        {
+            error_msg += std::string(m_lexer.get_error_message()) + "; last read: '" +
+                         m_lexer.get_token_string() + "'";
+        }
+        else
+        {
+            error_msg += "unexpected " + std::string(lexer_t::token_type_name(last_token));
+        }
+
+        if (expected != token_type::uninitialized)
+        {
+            error_msg += "; expected " + std::string(lexer_t::token_type_name(expected));
+        }
+
+        return error_msg;
+    }
+
+  private:
+    /// callback function
+    const parser_callback_t<BasicJsonType> callback = nullptr;
+    /// the type of the last read token
+    token_type last_token = token_type::uninitialized;
+    /// the lexer
+    lexer_t m_lexer;
+    /// whether to throw exceptions in case of errors
+    const bool allow_exceptions = true;
+};
+}  // namespace detail
+}  // namespace nlohmann
+
+// #include <nlohmann/detail/iterators/internal_iterator.hpp>
+
+
+// #include <nlohmann/detail/iterators/primitive_iterator.hpp>
+
+
+#include <cstddef> // ptrdiff_t
+#include <limits>  // numeric_limits
+
+namespace nlohmann
+{
+namespace detail
+{
+/*!
+@brief an iterator for primitive JSON types
+
+This class models an iterator for primitive JSON types (boolean, number,
+string). Its only purpose is to allow the iterator/const_iterator classes
+to "iterate" over primitive values. Internally, the iterator is modeled by
+a `difference_type` variable. Value begin_value (`0`) models the begin,
+end_value (`1`) models past the end.
+
+A default-constructed iterator holds the smallest `std::ptrdiff_t` value, so
+it is neither at begin nor at end until set_begin() or set_end() is called.
+*/
+class primitive_iterator_t
+{
+  private:
+    using difference_type = std::ptrdiff_t;
+    static constexpr difference_type begin_value = 0;
+    static constexpr difference_type end_value = begin_value + 1;
+
+    /// iterator as signed integer type
+    difference_type m_it = (std::numeric_limits<std::ptrdiff_t>::min)();
+
+  public:
+    /// return the raw position value
+    constexpr difference_type get_value() const noexcept
+    {
+        return m_it;
+    }
+
+    /// set iterator to a defined beginning
+    void set_begin() noexcept
+    {
+        m_it = begin_value;
+    }
+
+    /// set iterator to a defined past the end
+    void set_end() noexcept
+    {
+        m_it = end_value;
+    }
+
+    /// return whether the iterator can be dereferenced
+    constexpr bool is_begin() const noexcept
+    {
+        return m_it == begin_value;
+    }
+
+    /// return whether the iterator is at end
+    constexpr bool is_end() const noexcept
+    {
+        return m_it == end_value;
+    }
+
+    /// return whether both iterators hold the same position value
+    friend constexpr bool operator==(primitive_iterator_t lhs, primitive_iterator_t rhs) noexcept
+    {
+        return lhs.m_it == rhs.m_it;
+    }
+
+    /// return whether the position value of @a lhs is smaller than that of @a rhs
+    friend constexpr bool operator<(primitive_iterator_t lhs, primitive_iterator_t rhs) noexcept
+    {
+        return lhs.m_it < rhs.m_it;
+    }
+
+    /// return a copy advanced by @a n; const because this iterator is left unchanged
+    primitive_iterator_t operator+(difference_type n) const noexcept
+    {
+        auto result = *this;
+        result += n;
+        return result;
+    }
+
+    /// return the distance between the position values of @a lhs and @a rhs
+    friend constexpr difference_type operator-(primitive_iterator_t lhs, primitive_iterator_t rhs) noexcept
+    {
+        return lhs.m_it - rhs.m_it;
+    }
+
+    /// pre-increment (++it)
+    primitive_iterator_t& operator++() noexcept
+    {
+        ++m_it;
+        return *this;
+    }
+
+    /// post-increment (it++); returns the state before the increment
+    primitive_iterator_t const operator++(int) noexcept
+    {
+        auto result = *this;
+        ++m_it;
+        return result;
+    }
+
+    /// pre-decrement (--it)
+    primitive_iterator_t& operator--() noexcept
+    {
+        --m_it;
+        return *this;
+    }
+
+    /// post-decrement (it--); returns the state before the decrement
+    primitive_iterator_t const operator--(int) noexcept
+    {
+        auto result = *this;
+        --m_it;
+        return result;
+    }
+
+    /// advance the position value by @a n
+    primitive_iterator_t& operator+=(difference_type n) noexcept
+    {
+        m_it += n;
+        return *this;
+    }
+
+    /// move the position value back by @a n
+    primitive_iterator_t& operator-=(difference_type n) noexcept
+    {
+        m_it -= n;
+        return *this;
+    }
+};
+}  // namespace detail
+}  // namespace nlohmann
+
+
+namespace nlohmann
+{
+namespace detail
+{
+/*!
+@brief an iterator value
+
+Bundles one iterator per kind of JSON value: an object iterator, an array
+iterator, and a primitive_iterator_t for every other type. All three members
+are value-initialized.
+
+@note This structure could easily be a union, but MSVC currently does not allow
+union members with complex constructors, see https://github.com/nlohmann/json/pull/105.
+*/
+template<typename BasicJsonType> struct internal_iterator
+{
+    /// iterator for JSON objects
+    typename BasicJsonType::object_t::iterator object_iterator {};
+    /// iterator for JSON arrays
+    typename BasicJsonType::array_t::iterator array_iterator {};
+    /// generic iterator for all other types
+    primitive_iterator_t primitive_iterator {};
+};
+}  // namespace detail
+}  // namespace nlohmann
+
+// #include <nlohmann/detail/iterators/iter_impl.hpp>
+
+
+#include <iterator> // iterator, random_access_iterator_tag, bidirectional_iterator_tag, advance, next
+#include <type_traits> // conditional, is_const, remove_const
+
+// #include <nlohmann/detail/exceptions.hpp>
+
+// #include <nlohmann/detail/iterators/internal_iterator.hpp>
+
+// #include <nlohmann/detail/iterators/primitive_iterator.hpp>
+
+// #include <nlohmann/detail/macro_scope.hpp>
+
+// #include <nlohmann/detail/meta/cpp_future.hpp>
+
+// #include <nlohmann/detail/meta/type_traits.hpp>
+
+// #include <nlohmann/detail/value_t.hpp>
+
+
+namespace nlohmann
+{
+namespace detail
+{
+// forward declare, to be able to friend it later on
+template<typename IteratorType> class iteration_proxy;
+template<typename IteratorType> class iteration_proxy_value;
+
+/*!
+@brief a template for a bidirectional iterator for the @ref basic_json class
+This class implements both iterators (iterator and const_iterator) for the
+@ref basic_json class.
+
+Every operation switches on the type of the referenced JSON value
+(`m_object->m_type`) and works on the matching member of `m_it`: the object
+iterator for objects, the array iterator for arrays, and the primitive
+iterator for all other types.
+@note An iterator is called *initialized* when a pointer to a JSON value has
+      been set (e.g., by a constructor or a copy assignment). If the iterator is
+      default-constructed, it is *uninitialized* and most methods are undefined.
+      **The library uses assertions to detect calls on uninitialized iterators.**
+@requirement The class satisfies the following concept requirements:
+-
+[BidirectionalIterator](https://en.cppreference.com/w/cpp/named_req/BidirectionalIterator):
+  The iterator can be moved in both directions (i.e. incremented and
+  decremented).
+@since version 1.0.0, simplified in version 2.0.9, change to bidirectional
+       iterators in version 3.0.0 (see https://github.com/nlohmann/json/issues/593)
+*/
+template<typename BasicJsonType>
+class iter_impl
+{
+    /// allow the iter_impl of opposite constness, basic_json, and the iteration proxies to access private members
+    friend iter_impl<typename std::conditional<std::is_const<BasicJsonType>::value, typename std::remove_const<BasicJsonType>::type, const BasicJsonType>::type>;
+    friend BasicJsonType;
+    friend iteration_proxy<iter_impl>;
+    friend iteration_proxy_value<iter_impl>;
+
+    using object_t = typename BasicJsonType::object_t;
+    using array_t = typename BasicJsonType::array_t;
+    // make sure BasicJsonType is basic_json or const basic_json
+    static_assert(is_basic_json<typename std::remove_const<BasicJsonType>::type>::value,
+                  "iter_impl only accepts (const) basic_json");
+
+  public:
+
+    /// The std::iterator class template (used as a base class to provide typedefs) is deprecated in C++17.
+    /// The C++ Standard has never required user-defined iterators to derive from std::iterator.
+    /// A user-defined iterator should provide publicly accessible typedefs named
+    /// iterator_category, value_type, difference_type, pointer, and reference.
+    /// Note that value_type is required to be non-const, even for constant iterators.
+    using iterator_category = std::bidirectional_iterator_tag;
+
+    /// the type of the values when the iterator is dereferenced
+    using value_type = typename BasicJsonType::value_type;
+    /// a type to represent differences between iterators
+    using difference_type = typename BasicJsonType::difference_type;
+    /// defines a pointer to the type iterated over (value_type); const_pointer if BasicJsonType is const
+    using pointer = typename std::conditional<std::is_const<BasicJsonType>::value,
+          typename BasicJsonType::const_pointer,
+          typename BasicJsonType::pointer>::type;
+    /// defines a reference to the type iterated over (value_type); const_reference if BasicJsonType is const
+    using reference =
+        typename std::conditional<std::is_const<BasicJsonType>::value,
+        typename BasicJsonType::const_reference,
+        typename BasicJsonType::reference>::type;
+
+    /// default constructor; leaves the iterator uninitialized (m_object is nullptr)
+    iter_impl() = default;
+
+    /*!
+    @brief constructor for a given JSON instance
+    @param[in] object  pointer to a JSON object for this iterator
+    @pre object != nullptr
+    @post The iterator is initialized; i.e. `m_object != nullptr`.
+    @note The member of `m_it` matching the value's type is reset to a
+          value-initialized iterator; no position inside the value is chosen
+          here (see set_begin() and set_end()).
+    */
+    explicit iter_impl(pointer object) noexcept : m_object(object)
+    {
+        JSON_ASSERT(m_object != nullptr);
+
+        switch (m_object->m_type)
+        {
+            case value_t::object:
+            {
+                m_it.object_iterator = typename object_t::iterator();
+                break;
+            }
+
+            case value_t::array:
+            {
+                m_it.array_iterator = typename array_t::iterator();
+                break;
+            }
+
+            default:
+            {
+                m_it.primitive_iterator = primitive_iterator_t();
+                break;
+            }
+        }
+    }
+
+    /*!
+    @note The conventional copy constructor and copy assignment are implicitly
+          defined. Combined with the following converting constructor and
+          assignment, they support: (1) copy from iterator to iterator, (2)
+          copy from const iterator to const iterator, and (3) conversion from
+          iterator to const iterator. However conversion from const iterator
+          to iterator is not defined.
+    */
+
+    /*!
+    @brief const copy constructor
+    @param[in] other const iterator to copy from
+    @note This copy constructor had to be defined explicitly to circumvent a bug
+          occurring on msvc v19.0 compiler (VS 2015) debug build. For more
+          information refer to: https://github.com/nlohmann/json/issues/1608
+    */
+    iter_impl(const iter_impl<const BasicJsonType>& other) noexcept
+        : m_object(other.m_object), m_it(other.m_it)
+    {}
+
+    /*!
+    @brief converting assignment
+    @param[in] other const iterator to copy from
+    @return const/non-const iterator
+    @note It is not checked whether @a other is initialized.
+    */
+    iter_impl& operator=(const iter_impl<const BasicJsonType>& other) noexcept
+    {
+        m_object = other.m_object;
+        m_it = other.m_it;
+        return *this;
+    }
+
+    /*!
+    @brief converting constructor
+    @param[in] other  non-const iterator to copy from
+    @note It is not checked whether @a other is initialized.
+    */
+    iter_impl(const iter_impl<typename std::remove_const<BasicJsonType>::type>& other) noexcept
+        : m_object(other.m_object), m_it(other.m_it)
+    {}
+
+    /*!
+    @brief converting assignment
+    @param[in] other  non-const iterator to copy from
+    @return const/non-const iterator
+    @note It is not checked whether @a other is initialized.
+    */
+    iter_impl& operator=(const iter_impl<typename std::remove_const<BasicJsonType>::type>& other) noexcept
+    {
+        m_object = other.m_object;
+        m_it = other.m_it;
+        return *this;
+    }
+
+  private:
+    /*!
+    @brief set the iterator to the first value
+    @pre The iterator is initialized; i.e. `m_object != nullptr`.
+    @note For a null value the primitive iterator is set to its end position,
+          so a begin iterator and an end iterator compare equal.
+    */
+    void set_begin() noexcept
+    {
+        JSON_ASSERT(m_object != nullptr);
+
+        switch (m_object->m_type)
+        {
+            case value_t::object:
+            {
+                m_it.object_iterator = m_object->m_value.object->begin();
+                break;
+            }
+
+            case value_t::array:
+            {
+                m_it.array_iterator = m_object->m_value.array->begin();
+                break;
+            }
+
+            case value_t::null:
+            {
+                // set to end so begin()==end() is true: null is empty
+                m_it.primitive_iterator.set_end();
+                break;
+            }
+
+            default:
+            {
+                m_it.primitive_iterator.set_begin();
+                break;
+            }
+        }
+    }
+
+    /*!
+    @brief set the iterator past the last value
+    @pre The iterator is initialized; i.e. `m_object != nullptr`.
+    */
+    void set_end() noexcept
+    {
+        JSON_ASSERT(m_object != nullptr);
+
+        switch (m_object->m_type)
+        {
+            case value_t::object:
+            {
+                m_it.object_iterator = m_object->m_value.object->end();
+                break;
+            }
+
+            case value_t::array:
+            {
+                m_it.array_iterator = m_object->m_value.array->end();
+                break;
+            }
+
+            default:
+            {
+                m_it.primitive_iterator.set_end();
+                break;
+            }
+        }
+    }
+
+  public:
+    /*!
+    @brief return a reference to the value pointed to by the iterator
+    @pre The iterator is initialized; i.e. `m_object != nullptr`.
+    @return the mapped value for objects, the element for arrays, and the
+            referenced JSON value itself for primitive types
+    @throw invalid_iterator.214 if the value is null, or if it is a primitive
+           value and the iterator is not at its begin position
+    @note For objects and arrays, dereferencing the end position is only
+          caught by an assertion.
+    */
+    reference operator*() const
+    {
+        JSON_ASSERT(m_object != nullptr);
+
+        switch (m_object->m_type)
+        {
+            case value_t::object:
+            {
+                JSON_ASSERT(m_it.object_iterator != m_object->m_value.object->end());
+                return m_it.object_iterator->second;
+            }
+
+            case value_t::array:
+            {
+                JSON_ASSERT(m_it.array_iterator != m_object->m_value.array->end());
+                return *m_it.array_iterator;
+            }
+
+            case value_t::null:
+                JSON_THROW(invalid_iterator::create(214, "cannot get value"));
+
+            default:
+            {
+                if (JSON_HEDLEY_LIKELY(m_it.primitive_iterator.is_begin()))
+                {
+                    return *m_object;
+                }
+
+                JSON_THROW(invalid_iterator::create(214, "cannot get value"));
+            }
+        }
+    }
+
+    /*!
+    @brief dereference the iterator
+    @pre The iterator is initialized; i.e. `m_object != nullptr`.
+    @return pointer to the mapped value for objects, to the element for
+            arrays, and to the referenced JSON value itself otherwise
+    @throw invalid_iterator.214 if the value is neither object nor array and
+           the iterator is not at its begin position (null has no separate
+           case here and is handled by the default branch)
+    */
+    pointer operator->() const
+    {
+        JSON_ASSERT(m_object != nullptr);
+
+        switch (m_object->m_type)
+        {
+            case value_t::object:
+            {
+                JSON_ASSERT(m_it.object_iterator != m_object->m_value.object->end());
+                return &(m_it.object_iterator->second);
+            }
+
+            case value_t::array:
+            {
+                JSON_ASSERT(m_it.array_iterator != m_object->m_value.array->end());
+                return &*m_it.array_iterator;
+            }
+
+            default:
+            {
+                if (JSON_HEDLEY_LIKELY(m_it.primitive_iterator.is_begin()))
+                {
+                    return m_object;
+                }
+
+                JSON_THROW(invalid_iterator::create(214, "cannot get value"));
+            }
+        }
+    }
+
+    /*!
+    @brief post-increment (it++)
+    @pre The iterator is initialized; i.e. `m_object != nullptr`.
+    @return a copy of the iterator as it was before the increment
+    */
+    iter_impl const operator++(int)
+    {
+        auto result = *this;
+        ++(*this);
+        return result;
+    }
+
+    /*!
+    @brief pre-increment (++it)
+    @pre The iterator is initialized; i.e. `m_object != nullptr`.
+    @return reference to this iterator after advancing by one
+    */
+    iter_impl& operator++()
+    {
+        JSON_ASSERT(m_object != nullptr);
+
+        switch (m_object->m_type)
+        {
+            case value_t::object:
+            {
+                std::advance(m_it.object_iterator, 1);
+                break;
+            }
+
+            case value_t::array:
+            {
+                std::advance(m_it.array_iterator, 1);
+                break;
+            }
+
+            default:
+            {
+                ++m_it.primitive_iterator;
+                break;
+            }
+        }
+
+        return *this;
+    }
+
+    /*!
+    @brief post-decrement (it--)
+    @pre The iterator is initialized; i.e. `m_object != nullptr`.
+    @return a copy of the iterator as it was before the decrement
+    */
+    iter_impl const operator--(int)
+    {
+        auto result = *this;
+        --(*this);
+        return result;
+    }
+
+    /*!
+    @brief pre-decrement (--it)
+    @pre The iterator is initialized; i.e. `m_object != nullptr`.
+    @return reference to this iterator after moving back by one
+    */
+    iter_impl& operator--()
+    {
+        JSON_ASSERT(m_object != nullptr);
+
+        switch (m_object->m_type)
+        {
+            case value_t::object:
+            {
+                std::advance(m_it.object_iterator, -1);
+                break;
+            }
+
+            case value_t::array:
+            {
+                std::advance(m_it.array_iterator, -1);
+                break;
+            }
+
+            default:
+            {
+                --m_it.primitive_iterator;
+                break;
+            }
+        }
+
+        return *this;
+    }
+
+    /*!
+    @brief  comparison: equal
+    @pre The iterator is initialized; i.e. `m_object != nullptr`.
+    @throw invalid_iterator.212 if the two iterators do not refer to the same
+           JSON value
+    @note The same-value check runs before the assertion, so two
+          uninitialized iterators pass the check and reach the assertion.
+    */
+    bool operator==(const iter_impl& other) const
+    {
+        // if objects are not the same, the comparison is undefined
+        if (JSON_HEDLEY_UNLIKELY(m_object != other.m_object))
+        {
+            JSON_THROW(invalid_iterator::create(212, "cannot compare iterators of different containers"));
+        }
+
+        JSON_ASSERT(m_object != nullptr);
+
+        switch (m_object->m_type)
+        {
+            case value_t::object:
+                return (m_it.object_iterator == other.m_it.object_iterator);
+
+            case value_t::array:
+                return (m_it.array_iterator == other.m_it.array_iterator);
+
+            default:
+                return (m_it.primitive_iterator == other.m_it.primitive_iterator);
+        }
+    }
+
+    /*!
+    @brief  comparison: not equal
+    @pre The iterator is initialized; i.e. `m_object != nullptr`.
+    @note Implemented as the negation of operator==, so it throws under the
+          same conditions.
+    */
+    bool operator!=(const iter_impl& other) const
+    {
+        return !operator==(other);
+    }
+
+    /*!
+    @brief  comparison: smaller
+    @pre The iterator is initialized; i.e. `m_object != nullptr`.
+    @throw invalid_iterator.212 if the two iterators do not refer to the same
+           JSON value
+    @throw invalid_iterator.213 if the referenced value is an object
+    */
+    bool operator<(const iter_impl& other) const
+    {
+        // if objects are not the same, the comparison is undefined
+        if (JSON_HEDLEY_UNLIKELY(m_object != other.m_object))
+        {
+            JSON_THROW(invalid_iterator::create(212, "cannot compare iterators of different containers"));
+        }
+
+        JSON_ASSERT(m_object != nullptr);
+
+        switch (m_object->m_type)
+        {
+            case value_t::object:
+                JSON_THROW(invalid_iterator::create(213, "cannot compare order of object iterators"));
+
+            case value_t::array:
+                return (m_it.array_iterator < other.m_it.array_iterator);
+
+            default:
+                return (m_it.primitive_iterator < other.m_it.primitive_iterator);
+        }
+    }
+
+    /*!
+    @brief  comparison: less than or equal
+    @pre The iterator is initialized; i.e. `m_object != nullptr`.
+    @note Implemented as `!(other < *this)`; throws like operator<.
+    */
+    bool operator<=(const iter_impl& other) const
+    {
+        return !other.operator < (*this);
+    }
+
+    /*!
+    @brief  comparison: greater than
+    @pre The iterator is initialized; i.e. `m_object != nullptr`.
+    @note Implemented as the negation of operator<=; throws like operator<.
+    */
+    bool operator>(const iter_impl& other) const
+    {
+        return !operator<=(other);
+    }
+
+    /*!
+    @brief  comparison: greater than or equal
+    @pre The iterator is initialized; i.e. `m_object != nullptr`.
+    @note Implemented as the negation of operator<; throws like operator<.
+    */
+    bool operator>=(const iter_impl& other) const
+    {
+        return !operator<(other);
+    }
+
+    /*!
+    @brief  add to iterator
+    @pre The iterator is initialized; i.e. `m_object != nullptr`.
+    @param[in] i  number of positions to move; may be negative
+    @return reference to this iterator after moving
+    @throw invalid_iterator.209 if the referenced value is an object
+    */
+    iter_impl& operator+=(difference_type i)
+    {
+        JSON_ASSERT(m_object != nullptr);
+
+        switch (m_object->m_type)
+        {
+            case value_t::object:
+                JSON_THROW(invalid_iterator::create(209, "cannot use offsets with object iterators"));
+
+            case value_t::array:
+            {
+                std::advance(m_it.array_iterator, i);
+                break;
+            }
+
+            default:
+            {
+                m_it.primitive_iterator += i;
+                break;
+            }
+        }
+
+        return *this;
+    }
+
+    /*!
+    @brief  subtract from iterator
+    @pre The iterator is initialized; i.e. `m_object != nullptr`.
+    @note Forwards to operator+= with the negated offset; throws like
+          operator+=.
+    */
+    iter_impl& operator-=(difference_type i)
+    {
+        return operator+=(-i);
+    }
+
+    /*!
+    @brief  add to iterator
+    @pre The iterator is initialized; i.e. `m_object != nullptr`.
+    @return a copy of this iterator moved by @a i; throws like operator+=
+    */
+    iter_impl operator+(difference_type i) const
+    {
+        auto result = *this;
+        result += i;
+        return result;
+    }
+
+    /*!
+    @brief  addition of distance and iterator
+    @pre The iterator is initialized; i.e. `m_object != nullptr`.
+    @return a copy of @a it moved by @a i; throws like operator+=
+    */
+    friend iter_impl operator+(difference_type i, const iter_impl& it)
+    {
+        auto result = it;
+        result += i;
+        return result;
+    }
+
+    /*!
+    @brief  subtract from iterator
+    @pre The iterator is initialized; i.e. `m_object != nullptr`.
+    @return a copy of this iterator moved back by @a i; throws like operator+=
+    */
+    iter_impl operator-(difference_type i) const
+    {
+        auto result = *this;
+        result -= i;
+        return result;
+    }
+
+    /*!
+    @brief  return difference
+    @pre The iterator is initialized; i.e. `m_object != nullptr`.
+    @throw invalid_iterator.209 if the referenced value is an object
+    @note Unlike the comparison operators, this function does not check that
+          @a other refers to the same JSON value.
+    */
+    difference_type operator-(const iter_impl& other) const
+    {
+        JSON_ASSERT(m_object != nullptr);
+
+        switch (m_object->m_type)
+        {
+            case value_t::object:
+                JSON_THROW(invalid_iterator::create(209, "cannot use offsets with object iterators"));
+
+            case value_t::array:
+                return m_it.array_iterator - other.m_it.array_iterator;
+
+            default:
+                return m_it.primitive_iterator - other.m_it.primitive_iterator;
+        }
+    }
+
+    /*!
+    @brief  access to successor
+    @pre The iterator is initialized; i.e. `m_object != nullptr`.
+    @param[in] n  offset relative to the current position
+    @return for arrays, the element @a n positions away; for primitive
+            values, the referenced JSON value itself
+    @throw invalid_iterator.208 if the referenced value is an object
+    @throw invalid_iterator.214 if the value is null, or if it is a primitive
+           value and the primitive iterator's position value is not `-n`
+    */
+    reference operator[](difference_type n) const
+    {
+        JSON_ASSERT(m_object != nullptr);
+
+        switch (m_object->m_type)
+        {
+            case value_t::object:
+                JSON_THROW(invalid_iterator::create(208, "cannot use operator[] for object iterators"));
+
+            case value_t::array:
+                return *std::next(m_it.array_iterator, n);
+
+            case value_t::null:
+                JSON_THROW(invalid_iterator::create(214, "cannot get value"));
+
+            default:
+            {
+                if (JSON_HEDLEY_LIKELY(m_it.primitive_iterator.get_value() == -n))
+                {
+                    return *m_object;
+                }
+
+                JSON_THROW(invalid_iterator::create(214, "cannot get value"));
+            }
+        }
+    }
+
+    /*!
+    @brief  return the key of an object iterator
+    @pre The iterator is initialized; i.e. `m_object != nullptr`.
+    @throw invalid_iterator.207 if the referenced value is not an object
+    */
+    const typename object_t::key_type& key() const
+    {
+        JSON_ASSERT(m_object != nullptr);
+
+        if (JSON_HEDLEY_LIKELY(m_object->is_object()))
+        {
+            return m_it.object_iterator->first;
+        }
+
+        JSON_THROW(invalid_iterator::create(207, "cannot use key() for non-object iterators"));
+    }
+
+    /*!
+    @brief  return the value of an iterator
+    @pre The iterator is initialized; i.e. `m_object != nullptr`.
+    @note Forwards to operator*; throws under the same conditions.
+    */
+    reference value() const
+    {
+        return operator*();
+    }
+
+  private:
+    /// associated JSON instance; nullptr while the iterator is uninitialized
+    pointer m_object = nullptr;
+    /// the actual iterator of the associated instance
+    internal_iterator<typename std::remove_const<BasicJsonType>::type> m_it {};
+};
+} // namespace detail
+} // namespace nlohmann
+
+// #include <nlohmann/detail/iterators/iteration_proxy.hpp>
+
+// #include <nlohmann/detail/iterators/json_reverse_iterator.hpp>
+
+
+#include <cstddef> // ptrdiff_t
+#include <iterator> // reverse_iterator
+#include <utility> // declval
+
+namespace nlohmann
+{
+namespace detail
+{
+//////////////////////
+// reverse_iterator //
+//////////////////////
+
+/*!
+@brief a template for a reverse iterator class
+
+Thin wrapper around `std::reverse_iterator<Base>` whose operators return
+json_reverse_iterator instead of the standard adapter type, and which adds
+key() and value() accessors.
+
+@tparam Base the base iterator type to reverse. Valid types are @ref
+iterator (to create @ref reverse_iterator) and @ref const_iterator (to
+create @ref const_reverse_iterator).
+
+@requirement The class satisfies the following concept requirements:
+-
+[BidirectionalIterator](https://en.cppreference.com/w/cpp/named_req/BidirectionalIterator):
+  The iterator can be moved in both directions (i.e. incremented and
+  decremented).
+- [OutputIterator](https://en.cppreference.com/w/cpp/named_req/OutputIterator):
+  It is possible to write to the pointed-to element (only if @a Base is
+  @ref iterator).
+
+@since version 1.0.0
+*/
+template<typename Base>
+class json_reverse_iterator : public std::reverse_iterator<Base>
+{
+  public:
+    using difference_type = std::ptrdiff_t;
+    /// shortcut to the reverse iterator adapter
+    using base_iterator = std::reverse_iterator<Base>;
+    /// the reference type for the pointed-to element
+    using reference = typename Base::reference;
+
+    /// create reverse iterator from iterator
+    explicit json_reverse_iterator(const typename base_iterator::iterator_type& it) noexcept
+        : base_iterator(it)
+    {}
+
+    /// create reverse iterator from base class
+    explicit json_reverse_iterator(const base_iterator& it) noexcept
+        : base_iterator(it)
+    {}
+
+    /// post-increment (it++); returns the state before the increment
+    json_reverse_iterator const operator++(int)
+    {
+        return json_reverse_iterator(base_iterator::operator++(1));
+    }
+
+    /// pre-increment (++it)
+    json_reverse_iterator& operator++()
+    {
+        base_iterator::operator++();
+        return *this;
+    }
+
+    /// post-decrement (it--); returns the state before the decrement
+    json_reverse_iterator const operator--(int)
+    {
+        return json_reverse_iterator(base_iterator::operator--(1));
+    }
+
+    /// pre-decrement (--it)
+    json_reverse_iterator& operator--()
+    {
+        base_iterator::operator--();
+        return *this;
+    }
+
+    /// add to iterator
+    json_reverse_iterator& operator+=(difference_type i)
+    {
+        base_iterator::operator+=(i);
+        return *this;
+    }
+
+    /// return a copy of this iterator advanced by @a i
+    json_reverse_iterator operator+(difference_type i) const
+    {
+        return json_reverse_iterator(base_iterator::operator+(i));
+    }
+
+    /// return a copy of this iterator moved back by @a i
+    json_reverse_iterator operator-(difference_type i) const
+    {
+        return json_reverse_iterator(base_iterator::operator-(i));
+    }
+
+    /// return difference, computed on copies of the underlying reverse iterators
+    difference_type operator-(const json_reverse_iterator& other) const
+    {
+        const base_iterator lhs(*this);
+        const base_iterator rhs(other);
+        return lhs - rhs;
+    }
+
+    /// access to successor
+    reference operator[](difference_type n) const
+    {
+        return *(*this + n);
+    }
+
+    /// return the key of an object iterator
+    auto key() const -> decltype(std::declval<Base>().key())
+    {
+        // the element referred to is the one before base()
+        Base it = this->base();
+        --it;
+        return it.key();
+    }
+
+    /// return the value of an iterator
+    reference value() const
+    {
+        // the element referred to is the one before base()
+        Base it = this->base();
+        --it;
+        return *it;
+    }
+};
+}  // namespace detail
+}  // namespace nlohmann
+
+// #include <nlohmann/detail/iterators/primitive_iterator.hpp>
+
+// #include <nlohmann/detail/json_pointer.hpp>
+
+
+#include <algorithm> // all_of
+#include <cctype> // isdigit
+#include <limits> // max
+#include <numeric> // accumulate
+#include <string> // string
+#include <utility> // move
+#include <vector> // vector
+
+// #include <nlohmann/detail/exceptions.hpp>
+
+// #include <nlohmann/detail/macro_scope.hpp>
+
+// #include <nlohmann/detail/value_t.hpp>
+
+
+namespace nlohmann
+{
+template<typename BasicJsonType>
+class json_pointer
+{
+    // allow basic_json to access private members
+    NLOHMANN_BASIC_JSON_TPL_DECLARATION
+    friend class basic_json;
+
+  public:
+    /*!
+    @brief create JSON pointer
+
+    Create a JSON pointer according to the syntax described in
+    [Section 3 of RFC6901](https://tools.ietf.org/html/rfc6901#section-3).
+    The reference tokens are obtained by passing @a s to split().
+
+    @param[in] s  string representing the JSON pointer; if omitted, the empty
+                  string is assumed which references the whole JSON value
+
+    @throw parse_error.107 if the given JSON pointer @a s is nonempty and does
+                           not begin with a slash (`/`); see example below
+
+    @throw parse_error.108 if a tilde (`~`) in the given JSON pointer @a s is
+    not followed by `0` (representing `~`) or `1` (representing `/`); see
+    example below
+
+    @liveexample{The example shows the construction of several valid JSON
+    pointers as well as the exceptional behavior.,json_pointer}
+
+    @since version 2.0.0
+    */
+    explicit json_pointer(const std::string& s = "")
+        : reference_tokens(split(s))
+    {}
+
+    /*!
+    @brief return a string representation of the JSON pointer
+
+    Each reference token is escaped and prefixed with a slash (`/`); a pointer
+    without reference tokens yields the empty string.
+
+    @invariant For each JSON pointer `ptr`, it holds:
+    @code {.cpp}
+    ptr == json_pointer(ptr.to_string());
+    @endcode
+
+    @return a string representation of the JSON pointer
+
+    @liveexample{The example shows the result of `to_string`.,json_pointer__to_string}
+
+    @since version 2.0.0
+    */
+    std::string to_string() const
+    {
+        std::string result;
+
+        for (const std::string& token : reference_tokens)
+        {
+            result += "/";
+            result += escape(token);
+        }
+
+        return result;
+    }
+
+    /// implicit conversion to std::string, forwarding to to_string(). @copydoc to_string()
+    operator std::string() const
+    {
+        return to_string();
+    }
+
+    /*!
+    @brief append another JSON pointer at the end of this JSON pointer
+
+    All reference tokens of @a ptr are appended, in order, to the reference
+    tokens of this pointer. Appending a pointer to itself (`p /= p`) is
+    supported and doubles its reference tokens.
+
+    @param[in] ptr  JSON pointer to append
+    @return JSON pointer with @a ptr appended
+
+    @liveexample{The example shows the usage of `operator/=`.,json_pointer__operator_add}
+
+    @complexity Linear in the length of @a ptr.
+
+    @sa @ref operator/=(std::string) to append a reference token
+    @sa @ref operator/=(std::size_t) to append an array index
+    @sa @ref operator/(const json_pointer&, const json_pointer&) for a binary operator
+
+    @since version 3.6.0
+    */
+    json_pointer& operator/=(const json_pointer& ptr)
+    {
+        if (this == &ptr)
+        {
+            // Self-append: range-insert must not be given iterators into the
+            // container being modified, so insert from a snapshot instead.
+            const auto snapshot = reference_tokens;
+            reference_tokens.insert(reference_tokens.end(),
+                                    snapshot.begin(),
+                                    snapshot.end());
+            return *this;
+        }
+
+        reference_tokens.insert(reference_tokens.end(),
+                                ptr.reference_tokens.begin(),
+                                ptr.reference_tokens.end());
+        return *this;
+    }
+
+    /*!
+    @brief append an unescaped reference token at the end of this JSON pointer
+
+    @param[in] token  reference token to append
+    @return JSON pointer with @a token appended without escaping @a token
+
+    @liveexample{The example shows the usage of `operator/=`.,json_pointer__operator_add}
+
+    @complexity Amortized constant.
+
+    @sa @ref operator/=(const json_pointer&) to append a JSON pointer
+    @sa @ref operator/=(std::size_t) to append an array index
+    @sa @ref operator/(const json_pointer&, std::string) for a binary operator
+
+    @since version 3.6.0
+    */
+    json_pointer& operator/=(std::string token)
+    {
+        push_back(std::move(token));
+        return *this;
+    }
+
+    /*!
+    @brief append an array index at the end of this JSON pointer
+
+    @param[in] array_idx  array index to append
+    @return JSON pointer with @a array_idx appended
+
+    @liveexample{The example shows the usage of `operator/=`.,json_pointer__operator_add}
+
+    @complexity Amortized constant.
+
+    @sa @ref operator/=(const json_pointer&) to append a JSON pointer
+    @sa @ref operator/=(std::string) to append a reference token
+    @sa @ref operator/(const json_pointer&, std::size_t) for a binary operator
+
+    @since version 3.6.0
+    */
+    json_pointer& operator/=(std::size_t array_idx)
+    {
+        return *this /= std::to_string(array_idx);  // the index is appended in its decimal string form
+    }
+
+    /*!
+    @brief create a new JSON pointer by appending the right JSON pointer at the end of the left JSON pointer
+
+    @param[in] lhs  JSON pointer
+    @param[in] rhs  JSON pointer
+    @return a new JSON pointer with @a rhs appended to @a lhs
+
+    @liveexample{The example shows the usage of `operator/`.,json_pointer__operator_add_binary}
+
+    @complexity Linear in the length of @a lhs and @a rhs.
+
+    @sa @ref operator/=(const json_pointer&) to append a JSON pointer
+
+    @since version 3.6.0
+    */
+    friend json_pointer operator/(const json_pointer& lhs,
+                                  const json_pointer& rhs)
+    {
+        return json_pointer(lhs) /= rhs;  // copy lhs, then append rhs to the copy
+    }
+
+    /*!
+    @brief create a new JSON pointer by appending the unescaped token at the end of the JSON pointer
+
+    @param[in] ptr  JSON pointer
+    @param[in] token  reference token
+    @return a new JSON pointer with unescaped @a token appended to @a ptr
+
+    @liveexample{The example shows the usage of `operator/`.,json_pointer__operator_add_binary}
+
+    @complexity Linear in the length of @a ptr.
+
+    @sa @ref operator/=(std::string) to append a reference token
+
+    @since version 3.6.0
+    */
+    friend json_pointer operator/(const json_pointer& ptr, std::string token)
+    {
+        return json_pointer(ptr) /= std::move(token);  // copy ptr, then append the token to the copy
+    }
+
+    /*!
+    @brief create a new JSON pointer by appending the array-index-token at the end of the JSON pointer
+
+    @param[in] ptr  JSON pointer
+    @param[in] array_idx  array index
+    @return a new JSON pointer with @a array_idx appended to @a ptr
+
+    @liveexample{The example shows the usage of `operator/`.,json_pointer__operator_add_binary}
+
+    @complexity Linear in the length of @a ptr.
+
+    @sa @ref operator/=(std::size_t) to append an array index
+
+    @since version 3.6.0
+    */
+    friend json_pointer operator/(const json_pointer& ptr, std::size_t array_idx)
+    {
+        return json_pointer(ptr) /= array_idx;  // copy ptr, then append the index to the copy
+    }
+
+    /*!
+    @brief returns the parent of this JSON pointer
+
+    @return parent of this JSON pointer; in case this JSON pointer is the root,
+            the root itself is returned
+
+    @complexity Linear in the length of the JSON pointer.
+
+    @liveexample{The example shows the result of `parent_pointer` for different
+    JSON Pointers.,json_pointer__parent_pointer}
+
+    @since version 3.6.0
+    */
+    json_pointer parent_pointer() const
+    {
+        // work on a copy so that this pointer itself stays unchanged
+        json_pointer parent = *this;
+        if (!parent.reference_tokens.empty())
+        {
+            // non-root pointer: the parent is obtained by dropping the last token
+            parent.reference_tokens.pop_back();
+        }
+        return parent;
+    }
+
+    /*!
+    @brief remove last reference token
+
+    @pre not `empty()`
+
+    @liveexample{The example shows the usage of `pop_back`.,json_pointer__pop_back}
+
+    @complexity Constant.
+
+    @throw out_of_range.405 if JSON pointer has no parent
+
+    @since version 3.6.0
+    */
+    void pop_back()
+    {
+        // the root pointer has no token that could be removed
+        if (JSON_HEDLEY_UNLIKELY(reference_tokens.empty()))
+        {
+            JSON_THROW(detail::out_of_range::create(405, "JSON pointer has no parent"));
+        }
+        reference_tokens.pop_back();
+    }
+
+    /*!
+    @brief return last reference token
+
+    @pre not `empty()`
+    @return last reference token
+
+    @liveexample{The example shows the usage of `back`.,json_pointer__back}
+
+    @complexity Constant.
+
+    @throw out_of_range.405 if JSON pointer has no parent
+
+    @since version 3.6.0
+    */
+    const std::string& back() const
+    {
+        // the root pointer has no last token that could be returned
+        if (JSON_HEDLEY_UNLIKELY(reference_tokens.empty()))
+        {
+            JSON_THROW(detail::out_of_range::create(405, "JSON pointer has no parent"));
+        }
+        return reference_tokens.back();
+    }
+
+    /*!
+    @brief append an unescaped token at the end of the reference pointer
+
+    @param[in] token  token to add
+
+    @complexity Amortized constant.
+
+    @liveexample{The example shows the result of `push_back` for different
+    JSON Pointers.,json_pointer__push_back}
+
+    @since version 3.6.0
+    */
+    void push_back(const std::string& token)
+    {
+        reference_tokens.push_back(token);  // a copy of token is stored exactly as given
+    }
+
+    /// @copydoc push_back(const std::string&)
+    void push_back(std::string&& token)
+    {
+        reference_tokens.push_back(std::move(token));  // token is moved into the list exactly as given
+    }
+
+    /*!
+    @brief return whether pointer points to the root document
+
+    @return true iff the JSON pointer points to the root document
+
+    @complexity Constant.
+
+    @exceptionsafety No-throw guarantee: this function never throws exceptions.
+
+    @liveexample{The example shows the result of `empty` for different JSON
+    Pointers.,json_pointer__empty}
+
+    @since version 3.6.0
+    */
+    bool empty() const noexcept
+    {
+        return reference_tokens.empty();  // true exactly when no reference token is stored
+    }
+
+  private:
+    /*!
+    @param[in] s  reference token to be converted into an array index
+
+    @return integer representation of @a s
+
+    @throw parse_error.106  if an array index begins with '0'
+    @throw parse_error.109  if an array index does not begin with a digit
+    @throw out_of_range.404 if string @a s could not be converted to an integer
+    @throw out_of_range.410 if an array index exceeds size_type
+    */
+    static typename BasicJsonType::size_type array_index(const std::string& s)
+    {
+        using size_type = typename BasicJsonType::size_type;
+
+        // error condition (cf. RFC 6901, Sect. 4): a multi-digit index must not have a leading zero
+        if (JSON_HEDLEY_UNLIKELY(s.size() > 1 && s[0] == '0'))
+        {
+            JSON_THROW(detail::parse_error::create(106, 0,
+                                                   "array index '" + s +
+                                                   "' must not begin with '0'"));
+        }
+
+        // must start with a digit; empty/one-char tokens are checked too so std::stoull cannot throw invalid_argument
+        if (JSON_HEDLEY_UNLIKELY(s.empty() || !(s[0] >= '0' && s[0] <= '9')))
+        {
+            JSON_THROW(detail::parse_error::create(109, 0, "array index '" + s + "' is not a number"));
+        }
+
+        std::size_t processed_chars = 0;
+        unsigned long long res = 0;
+        JSON_TRY
+        {
+            res = std::stoull(s, &processed_chars);
+        }
+        JSON_CATCH(std::out_of_range&)
+        {
+            JSON_THROW(detail::out_of_range::create(404, "unresolved reference token '" + s + "'"));
+        }
+
+        // check if the string was completely read
+        if (JSON_HEDLEY_UNLIKELY(processed_chars != s.size()))
+        {
+            JSON_THROW(detail::out_of_range::create(404, "unresolved reference token '" + s + "'"));
+        }
+
+        // only triggered on special platforms (like 32bit), see also
+        // https://github.com/nlohmann/json/pull/2203
+        if (res >= static_cast<unsigned long long>((std::numeric_limits<size_type>::max)()))
+        {
+            JSON_THROW(detail::out_of_range::create(410, "array index " + s + " exceeds size_type")); // LCOV_EXCL_LINE
+        }
+
+        return static_cast<size_type>(res);
+    }
+
+    json_pointer top() const
+    {
+        if (JSON_HEDLEY_UNLIKELY(reference_tokens.empty()))
+        {
+            JSON_THROW(detail::out_of_range::create(405, "JSON pointer has no parent"));
+        }
+        // keep only the first reference token in a copy of this pointer
+        json_pointer first_only = *this;
+        first_only.reference_tokens = {reference_tokens.front()};
+        return first_only;
+    }
+
+    /*!
+    @brief create and return a reference to the pointed to value
+
+    @complexity Linear in the number of reference tokens.
+
+    @throw parse_error.109 if array index is not a number
+    @throw type_error.313 if value cannot be unflattened
+    */
+    BasicJsonType& get_and_create(BasicJsonType& j) const
+    {
+        BasicJsonType* node = &j;
+
+        // without any reference token the loop is skipped and j itself is
+        // returned; the caller then overwrites it with a primitive value
+        for (const auto& token : reference_tokens)
+        {
+            switch (node->type())
+            {
+                case detail::value_t::null:
+                {
+                    if (token == "0")
+                    {
+                        // token "0" on a null value starts a new array
+                        node = &(*node)[0];
+                    }
+                    else
+                    {
+                        // any other token on a null value starts a new object
+                        node = &(*node)[token];
+                    }
+                    break;
+                }
+
+                case detail::value_t::object:
+                {
+                    // descend into (and create, if needed) the object member
+                    node = &(*node)[token];
+                    break;
+                }
+
+                case detail::value_t::array:
+                {
+                    // descend into (and create, if needed) the array element
+                    node = &(*node)[array_index(token)];
+                    break;
+                }
+
+                /*
+                This branch is only reached if a reference token is left _and_
+                the current value is primitive. That is an error, because a
+                primitive value may only occur as the single value addressed by
+                an empty list of reference tokens.
+                */
+                default:
+                    JSON_THROW(detail::type_error::create(313, "invalid value to unflatten"));
+            }
+        }
+
+        return *node;
+    }
+
+    /*!
+    @brief return a reference to the pointed to value
+
+    @note This version does not throw if a value is not present, but tries to
+          create nested values instead. For instance, calling this function
+          with pointer `"/this/that"` on a null value is equivalent to calling
+          `operator[]("this").operator[]("that")` on that value, effectively
+          changing the null value to an object.
+
+    @param[in] ptr  a JSON value
+
+    @return reference to the JSON value pointed to by the JSON pointer
+
+    @complexity Linear in the length of the JSON pointer.
+
+    @throw parse_error.106   if an array index begins with '0'
+    @throw parse_error.109   if an array index was not a number
+    @throw out_of_range.404  if the JSON pointer can not be resolved
+    */
+    BasicJsonType& get_unchecked(BasicJsonType* ptr) const
+    {
+        for (const auto& token : reference_tokens)
+        {
+            // a null value is first turned into a container for the token
+            if (ptr->is_null())
+            {
+                // does the token consist of digits only?
+                const bool digits_only =
+                    std::all_of(token.begin(), token.end(),
+                                [](const unsigned char x)
+                {
+                    return std::isdigit(x);
+                });
+
+                // digits or "-" select an array, anything else an object
+                *ptr = (digits_only || token == "-")
+                       ? detail::value_t::array
+                       : detail::value_t::object;
+            }
+
+            switch (ptr->type())
+            {
+                case detail::value_t::object:
+                {
+                    // unchecked object access
+                    ptr = &(*ptr)[token];
+                    break;
+                }
+
+                case detail::value_t::array:
+                {
+                    if (token == "-")
+                    {
+                        // "-" is treated as the index one past the current last element
+                        ptr = &(*ptr)[ptr->m_value.array->size()];
+                    }
+                    else
+                    {
+                        // numeric token: convert it and use unchecked array access
+                        ptr = &(*ptr)[array_index(token)];
+                    }
+                    break;
+                }
+
+                default:
+                    JSON_THROW(detail::out_of_range::create(404, "unresolved reference token '" + token + "'"));
+            }
+        }
+
+        return *ptr;
+    }
+
+    /*!
+    @throw parse_error.106   if an array index begins with '0'
+    @throw parse_error.109   if an array index was not a number
+    @throw out_of_range.402  if the array index '-' is used
+    @throw out_of_range.404  if the JSON pointer can not be resolved
+    */
+    BasicJsonType& get_checked(BasicJsonType* ptr) const
+    {
+        for (const auto& token : reference_tokens)
+        {
+            switch (ptr->type())
+            {
+                case detail::value_t::object:
+                {
+                    // at() does the range checking for us
+                    ptr = &ptr->at(token);
+                    break;
+                }
+
+                case detail::value_t::array:
+                {
+                    if (JSON_HEDLEY_UNLIKELY(token == "-"))
+                    {
+                        // "-" can never pass the range check
+                        JSON_THROW(detail::out_of_range::create(402,
+                                                                "array index '-' (" + std::to_string(ptr->m_value.array->size()) +
+                                                                ") is out of range"));
+                    }
+
+                    // at() does the range checking for us
+                    ptr = &ptr->at(array_index(token));
+                    break;
+                }
+
+                default:
+                    JSON_THROW(detail::out_of_range::create(404, "unresolved reference token '" + token + "'"));
+            }
+        }
+
+        return *ptr;
+    }
+
+    /*!
+    @brief return a const reference to the pointed to value
+
+    @param[in] ptr  a JSON value
+
+    @return const reference to the JSON value pointed to by the JSON
+    pointer
+
+    @throw parse_error.106   if an array index begins with '0'
+    @throw parse_error.109   if an array index was not a number
+    @throw out_of_range.402  if the array index '-' is used
+    @throw out_of_range.404  if the JSON pointer can not be resolved
+    */
+    const BasicJsonType& get_unchecked(const BasicJsonType* ptr) const
+    {
+        for (const auto& token : reference_tokens)
+        {
+            switch (ptr->type())
+            {
+                case detail::value_t::object:
+                {
+                    // unchecked object access
+                    ptr = &(*ptr)[token];
+                    break;
+                }
+
+                case detail::value_t::array:
+                {
+                    if (JSON_HEDLEY_UNLIKELY(token == "-"))
+                    {
+                        // const access cannot handle "-", so it is reported as out of range
+                        JSON_THROW(detail::out_of_range::create(402,
+                                                                "array index '-' (" + std::to_string(ptr->m_value.array->size()) +
+                                                                ") is out of range"));
+                    }
+
+                    // unchecked array access
+                    ptr = &(*ptr)[array_index(token)];
+                    break;
+                }
+
+                default:
+                    JSON_THROW(detail::out_of_range::create(404, "unresolved reference token '" + token + "'"));
+            }
+        }
+
+        return *ptr;
+    }
+
+    /*!
+    @throw parse_error.106   if an array index begins with '0'
+    @throw parse_error.109   if an array index was not a number
+    @throw out_of_range.402  if the array index '-' is used
+    @throw out_of_range.404  if the JSON pointer can not be resolved
+    */
+    const BasicJsonType& get_checked(const BasicJsonType* ptr) const
+    {
+        for (const auto& token : reference_tokens)
+        {
+            switch (ptr->type())
+            {
+                case detail::value_t::object:
+                {
+                    // at() does the range checking for us
+                    ptr = &ptr->at(token);
+                    break;
+                }
+
+                case detail::value_t::array:
+                {
+                    if (JSON_HEDLEY_UNLIKELY(token == "-"))
+                    {
+                        // "-" can never pass the range check
+                        JSON_THROW(detail::out_of_range::create(402,
+                                                                "array index '-' (" + std::to_string(ptr->m_value.array->size()) +
+                                                                ") is out of range"));
+                    }
+
+                    // at() does the range checking for us
+                    ptr = &ptr->at(array_index(token));
+                    break;
+                }
+
+                default:
+                    JSON_THROW(detail::out_of_range::create(404, "unresolved reference token '" + token + "'"));
+            }
+        }
+
+        return *ptr;
+    }
+
+    /*!
+    @throw parse_error.106   if an array index begins with '0'
+    @throw parse_error.109   if an array index was not a number
+    */
+    bool contains(const BasicJsonType* ptr) const
+    {
+        for (const auto& reference_token : reference_tokens)
+        {
+            switch (ptr->type())
+            {
+                case detail::value_t::object:
+                {
+                    if (!ptr->contains(reference_token))
+                    {
+                        // we did not find the key in the object
+                        return false;
+                    }
+
+                    ptr = &ptr->operator[](reference_token);
+                    break;
+                }
+
+                case detail::value_t::array:
+                {
+                    if (JSON_HEDLEY_UNLIKELY(reference_token == "-"))
+                    {
+                        // "-" always fails the range check
+                        return false;
+                    }
+                    if (JSON_HEDLEY_UNLIKELY(reference_token.size() <= 1 && !("0" <= reference_token && reference_token <= "9")))
+                    {
+                        // empty token or a single non-digit char: not an index, and must not reach array_index()
+                        return false;
+                    }
+                    if (JSON_HEDLEY_UNLIKELY(reference_token.size() > 1))
+                    {
+                        if (JSON_HEDLEY_UNLIKELY(!('1' <= reference_token[0] && reference_token[0] <= '9')))
+                        {
+                            // first char should be between '1' and '9'
+                            return false;
+                        }
+                        for (std::size_t i = 1; i < reference_token.size(); i++)
+                        {
+                            if (JSON_HEDLEY_UNLIKELY(!('0' <= reference_token[i] && reference_token[i] <= '9')))
+                            {
+                                // other char should be between '0' and '9'
+                                return false;
+                            }
+                        }
+                    }
+
+                    const auto idx = array_index(reference_token);
+                    if (idx >= ptr->size())
+                    {
+                        // index out of range
+                        return false;
+                    }
+
+                    ptr = &ptr->operator[](idx);
+                    break;
+                }
+
+                default:
+                {
+                    // we do not expect primitive values if there is still a
+                    // reference token to process
+                    return false;
+                }
+            }
+        }
+
+        // no reference token left means we found a primitive value
+        return true;
+    }
+
+    /*!
+    @brief split the string input to reference tokens
+
+    @note This function is only called by the json_pointer constructor.
+          All exceptions below are documented there.
+
+    @throw parse_error.107  if the pointer is nonempty and does not begin with '/'
+    @throw parse_error.108  if character '~' is not followed by '0' or '1'
+    */
+    static std::vector<std::string> split(const std::string& reference_string)
+    {
+        std::vector<std::string> tokens;
+
+        // an empty reference string contains no reference tokens at all
+        if (reference_string.empty())
+        {
+            return tokens;
+        }
+
+        // every nonempty reference string has to start with a slash
+        if (JSON_HEDLEY_UNLIKELY(reference_string[0] != '/'))
+        {
+            JSON_THROW(detail::parse_error::create(107, 1,
+                                                   "JSON pointer must be empty or begin with '/' - was: '" +
+                                                   reference_string + "'"));
+        }
+
+        // walk over the reference tokens:
+        // - slash: position of the slash ending the current token (or npos)
+        // - start: position right after the slash preceding the current token
+        for (
+            // locate the first slash behind the leading one
+            std::size_t slash = reference_string.find_first_of('/', 1),
+            // the first token starts right after the leading slash
+            start = 1;
+            // start == 0 marks the end (set once slash == std::string::npos)
+            start != 0;
+            // move to the start of the next token
+            // (becomes 0 once slash == std::string::npos)
+            start = (slash == std::string::npos) ? 0 : slash + 1,
+            // locate the slash ending the next token
+            slash = reference_string.find_first_of('/', start))
+        {
+            // the token is the text between its first character (start)
+            // and the slash ending it (slash)
+            auto token = reference_string.substr(start, slash - start);
+
+            // make sure every '~' in the token starts a valid escape sequence
+            for (std::size_t tilde = token.find_first_of('~');
+                    tilde != std::string::npos;
+                    tilde = token.find_first_of('~', tilde + 1))
+            {
+                JSON_ASSERT(token[tilde] == '~');
+
+                // '~' must not be the last character and must precede '0' or '1'
+                if (JSON_HEDLEY_UNLIKELY(tilde == token.size() - 1 ||
+                                         (token[tilde + 1] != '0' &&
+                                          token[tilde + 1] != '1')))
+                {
+                    JSON_THROW(detail::parse_error::create(108, 0, "escape character '~' must be followed with '0' or '1'"));
+                }
+            }
+
+            // store the token in its unescaped form
+            unescape(token);
+            tokens.push_back(token);
+        }
+
+        return tokens;
+    }
+
+    /*!
+    @brief replace all occurrences of a substring by another string
+
+    @param[in,out] s  the string to manipulate; changed so that all
+                   occurrences of @a f are replaced with @a t
+    @param[in]     f  the substring to replace with @a t
+    @param[in]     t  the string to replace @a f
+
+    @pre The search string @a f must not be empty. **This precondition is
+    enforced with an assertion.**
+
+    @since version 2.0.0
+    */
+    static void replace_substring(std::string& s, const std::string& f, const std::string& t)
+    {
+        JSON_ASSERT(!f.empty());
+        auto pos = s.find(f);
+        while (pos != std::string::npos)
+        {
+            s.replace(pos, f.size(), t);      // substitute this occurrence of f by t
+            pos = s.find(f, pos + t.size());  // resume the search right after the inserted t
+        }
+    }
+
+    /// escape "~" to "~0" and "/" to "~1"
+    static std::string escape(std::string s)
+    {
+        replace_substring(s, "~", "~0");  // first, so the '~' of a "~1" produced below is not escaped again
+        replace_substring(s, "/", "~1");
+        return s;
+    }
+
+    /// unescape "~1" to slash and "~0" to tilde (order is important: "~01" must become "~1", not "/")
+    static void unescape(std::string& s)
+    {
+        replace_substring(s, "~1", "/");
+        replace_substring(s, "~0", "~");
+    }
+
+    /*!
+    @param[in] reference_string  the reference string to the current value
+    @param[in] value             the value to consider
+    @param[in,out] result        the result object to insert values to
+
+    @note Empty objects or arrays are flattened to `null`.
+    */
+    static void flatten(const std::string& reference_string,
+                        const BasicJsonType& value,
+                        BasicJsonType& result)
+    {
+        switch (value.type())
+        {
+            case detail::value_t::array:
+            {
+                if (value.m_value.array->empty())
+                {
+                    // an empty array is recorded as null
+                    result[reference_string] = nullptr;
+                }
+                else
+                {
+                    // recurse into each element; its index becomes the next token
+                    for (std::size_t idx = 0; idx < value.m_value.array->size(); ++idx)
+                    {
+                        flatten(reference_string + "/" + std::to_string(idx),
+                                (*value.m_value.array)[idx], result);
+                    }
+                }
+                break;
+            }
+
+            case detail::value_t::object:
+            {
+                if (value.m_value.object->empty())
+                {
+                    // an empty object is recorded as null
+                    result[reference_string] = nullptr;
+                }
+                else
+                {
+                    // recurse into each member; its escaped key becomes the next token
+                    for (const auto& member : *value.m_value.object)
+                    {
+                        flatten(reference_string + "/" + escape(member.first), member.second, result);
+                    }
+                }
+                break;
+            }
+
+            default:
+            {
+                // a primitive value is stored under its reference string
+                result[reference_string] = value;
+                break;
+            }
+        }
+    }
+
+    /*!
+    @param[in] value  flattened JSON
+
+    @return unflattened JSON
+
+    @throw parse_error.109 if array index is not a number
+    @throw type_error.314  if value is not an object
+    @throw type_error.315  if object values are not primitive
+    @throw type_error.313  if value cannot be unflattened
+    */
+    static BasicJsonType
+    unflatten(const BasicJsonType& value)
+    {
+        if (JSON_HEDLEY_UNLIKELY(!value.is_object()))
+        {
+            JSON_THROW(detail::type_error::create(314, "only objects can be unflattened"));
+        }
+
+        BasicJsonType unflattened;
+
+        // process every (reference string, value) member of the flattened object
+        for (const auto& entry : *value.m_value.object)
+        {
+            if (JSON_HEDLEY_UNLIKELY(!entry.second.is_primitive()))
+            {
+                JSON_THROW(detail::type_error::create(315, "values in object must be primitive"));
+            }
+
+            // store the value at the location named by the key. For the key ""
+            // (the whole document) get_and_create returns the result value
+            // itself, so that the assignment below turns the result into a
+            // primitive value.
+            json_pointer(entry.first).get_and_create(unflattened) = entry.second;
+        }
+
+        return unflattened;
+    }
+
+    /*!
+    @brief compares two JSON pointers for equality
+
+    @param[in] lhs  JSON pointer to compare
+    @param[in] rhs  JSON pointer to compare
+    @return whether @a lhs is equal to @a rhs
+
+    @complexity Linear in the length of the JSON pointer
+
+    @exceptionsafety No-throw guarantee: this function never throws exceptions.
+    */
+    friend bool operator==(json_pointer const& lhs,
+                           json_pointer const& rhs) noexcept
+    {
+        return lhs.reference_tokens == rhs.reference_tokens;  // element-wise comparison of the token lists
+    }
+
+    /*!
+    @brief compares two JSON pointers for inequality
+
+    @param[in] lhs  JSON pointer to compare
+    @param[in] rhs  JSON pointer to compare
+    @return whether @a lhs is not equal to @a rhs
+
+    @complexity Linear in the length of the JSON pointer
+
+    @exceptionsafety No-throw guarantee: this function never throws exceptions.
+    */
+    friend bool operator!=(json_pointer const& lhs,
+                           json_pointer const& rhs) noexcept
+    {
+        return !(lhs == rhs);  // defined as the negation of operator==
+    }
+
+    /// the reference tokens
+    std::vector<std::string> reference_tokens;
+};
+}  // namespace nlohmann
+
+// #include <nlohmann/detail/json_ref.hpp>
+
+
+#include <initializer_list>
+#include <utility>
+
+// #include <nlohmann/detail/meta/type_traits.hpp>
+
+
+namespace nlohmann
+{
+namespace detail
+{
+template<typename BasicJsonType>
+class json_ref
+{
+  public:
+    using value_type = BasicJsonType;
+
+    /// take ownership of a temporary; value_ref stays null to mark "owned"
+    json_ref(value_type&& value)
+        : owned_value(std::move(value))
+    {}
+
+    /// refer to an existing value without copying it
+    json_ref(const value_type& value)
+        : value_ref(&value)
+    {}
+
+    /// build an owned value from an initializer list
+    json_ref(std::initializer_list<json_ref> init)
+        : owned_value(init)
+    {}
+
+    /// build an owned value from any arguments value_type is constructible from
+    template <
+        class... Args,
+        enable_if_t<std::is_constructible<value_type, Args...>::value, int> = 0 >
+    json_ref(Args && ... args)
+        : owned_value(std::forward<Args>(args)...)
+    {}
+
+    // class should be movable only; because an owned value is identified by a
+    // null value_ref (never by a pointer into this object), the defaulted move
+    // constructor cannot leave the new object pointing into the moved-from one
+    json_ref(json_ref&&) = default;
+    json_ref(const json_ref&) = delete;
+    json_ref& operator=(const json_ref&) = delete;
+    json_ref& operator=(json_ref&&) = delete;
+    ~json_ref() = default;
+
+    /// return the owned value by move, or a copy of the referenced value
+    value_type moved_or_copied() const
+    {
+        if (value_ref == nullptr)
+        {
+            return std::move(owned_value);
+        }
+        return *value_ref;
+    }
+
+    /// read-only access to the owned or referenced value
+    value_type const& operator*() const
+    {
+        return value_ref != nullptr ? *value_ref : owned_value;
+    }
+
+    value_type const* operator->() const
+    {
+        return &**this;
+    }
+
+  private:
+    mutable value_type owned_value = nullptr;  // mutable: moved out of in const moved_or_copied()
+    value_type const* value_ref = nullptr;     // null <=> owned_value holds the value
+};
+}  // namespace detail
+}  // namespace nlohmann
+
+// #include <nlohmann/detail/macro_scope.hpp>
+
+// #include <nlohmann/detail/meta/cpp_future.hpp>
+
+// #include <nlohmann/detail/meta/type_traits.hpp>
+
+// #include <nlohmann/detail/output/binary_writer.hpp>
+
+
+#include <algorithm> // reverse
+#include <array> // array
+#include <cstdint> // uint8_t, uint16_t, uint32_t, uint64_t
+#include <cstring> // memcpy
+#include <limits> // numeric_limits
+#include <string> // string
+#include <cmath> // isnan, isinf
+
+// #include <nlohmann/detail/input/binary_reader.hpp>
+
+// #include <nlohmann/detail/macro_scope.hpp>
+
+// #include <nlohmann/detail/output/output_adapters.hpp>
+
+
+#include <algorithm> // copy
+#include <cstddef> // size_t
+#include <ios> // streamsize
+#include <iterator> // back_inserter
+#include <memory> // shared_ptr, make_shared
+#include <ostream> // basic_ostream
+#include <string> // basic_string
+#include <vector> // vector
+// #include <nlohmann/detail/macro_scope.hpp>
+
+
+namespace nlohmann
+{
+namespace detail
+{
+/*!
+@brief abstract interface every output adapter implements
+
+An implementation accepts either a single character or a contiguous run of
+characters of type @a CharType.
+*/
+template<typename CharType>
+struct output_adapter_protocol
+{
+    virtual ~output_adapter_protocol() = default;
+
+    /// write a single character
+    virtual void write_character(CharType c) = 0;
+
+    /// write @a length characters starting at @a s
+    virtual void write_characters(const CharType* s, std::size_t length) = 0;
+};
+
+/// shared-ownership handle to an output adapter; used to simplify interfaces
+template<typename CharType>
+using output_adapter_t = std::shared_ptr<output_adapter_protocol<CharType>>;
+
+/*!
+@brief output adapter that appends to a std::vector<CharType>
+
+The adapter only stores a reference to the vector passed to the constructor;
+the vector must outlive the adapter.
+*/
+template<typename CharType>
+class output_vector_adapter : public output_adapter_protocol<CharType>
+{
+  public:
+    /// @param[in,out] vec  vector that receives all written characters
+    explicit output_vector_adapter(std::vector<CharType>& vec) noexcept
+        : v(vec)
+    {}
+
+    /// append a single character to the vector
+    void write_character(CharType c) override
+    {
+        v.push_back(c);
+    }
+
+    /// append @a length characters starting at @a s to the vector
+    JSON_HEDLEY_NON_NULL(2)
+    void write_characters(const CharType* s, std::size_t length) override
+    {
+        // a range insert knows the element count up front, so the vector can
+        // grow once instead of once per push_back
+        v.insert(v.end(), s, s + length);
+    }
+
+  private:
+    std::vector<CharType>& v;
+};
+
+/*!
+@brief output adapter that forwards to a std::basic_ostream<CharType>
+
+Only a reference to the stream is kept; the stream must outlive the adapter.
+*/
+template<typename CharType>
+class output_stream_adapter : public output_adapter_protocol<CharType>
+{
+  public:
+    /// @param[in,out] s  stream that receives all written characters
+    explicit output_stream_adapter(std::basic_ostream<CharType>& s) noexcept
+        : os(s)
+    {}
+
+    /// put a single character into the stream
+    void write_character(CharType c) override
+    {
+        os.put(c);
+    }
+
+    /// write @a length characters starting at @a s into the stream
+    JSON_HEDLEY_NON_NULL(2)
+    void write_characters(const CharType* s, std::size_t length) override
+    {
+        // basic_ostream::write takes a signed count
+        const auto count = static_cast<std::streamsize>(length);
+        os.write(s, count);
+    }
+
+  private:
+    std::basic_ostream<CharType>& os;
+};
+
+/*!
+@brief output adapter that appends to a string
+
+@tparam CharType    character type written through the adapter
+@tparam StringType  string type to append to; it is used via push_back() and
+                    append(pointer, length)
+
+Only a reference to the string is kept; the string must outlive the adapter.
+*/
+template<typename CharType, typename StringType = std::basic_string<CharType>>
+class output_string_adapter : public output_adapter_protocol<CharType>
+{
+  public:
+    /// @param[in,out] s  string that receives all written characters
+    explicit output_string_adapter(StringType& s) noexcept
+        : target(s)
+    {}
+
+    /// append a single character to the string
+    void write_character(CharType c) override
+    {
+        target.push_back(c);
+    }
+
+    /// append @a length characters starting at @a s to the string
+    JSON_HEDLEY_NON_NULL(2)
+    void write_characters(const CharType* s, std::size_t length) override
+    {
+        target.append(s, length);
+    }
+
+  private:
+    StringType& target;
+};
+
+/*!
+@brief helper that creates the matching output adapter for a given sink
+
+The constructors are intentionally implicit so that a vector, stream, or
+string can be passed wherever an output_adapter is expected. The created
+adapter is held in a shared pointer and handed out through the conversion
+operator.
+*/
+template<typename CharType, typename StringType = std::basic_string<CharType>>
+class output_adapter
+{
+    using vector_sink_t = output_vector_adapter<CharType>;
+    using stream_sink_t = output_stream_adapter<CharType>;
+    using string_sink_t = output_string_adapter<CharType, StringType>;
+
+  public:
+    /// write into a vector of characters
+    output_adapter(std::vector<CharType>& vec)
+        : sink(std::make_shared<vector_sink_t>(vec))
+    {}
+
+    /// write into an output stream
+    output_adapter(std::basic_ostream<CharType>& s)
+        : sink(std::make_shared<stream_sink_t>(s))
+    {}
+
+    /// write into a string
+    output_adapter(StringType& s)
+        : sink(std::make_shared<string_sink_t>(s))
+    {}
+
+    /// hand out a shared handle to the created adapter
+    operator output_adapter_t<CharType>()
+    {
+        return sink;
+    }
+
+  private:
+    output_adapter_t<CharType> sink = nullptr;
+};
+}  // namespace detail
+}  // namespace nlohmann
+
+
+namespace nlohmann
+{
+namespace detail
+{
+///////////////////
+// binary writer //
+///////////////////
+
+/*!
+@brief serialization to BSON, CBOR, MessagePack, and UBJSON values
+*/
+template<typename BasicJsonType, typename CharType>
+class binary_writer
+{
+    using string_t = typename BasicJsonType::string_t;
+    using binary_t = typename BasicJsonType::binary_t;
+    using number_float_t = typename BasicJsonType::number_float_t;
+
+  public:
+    /*!
+    @brief create a binary writer
+
+    @param[in] adapter  output adapter to write to; must not be null (this is
+                        checked with JSON_ASSERT)
+    */
+    explicit binary_writer(output_adapter_t<CharType> adapter) : oa(std::move(adapter))
+    {
+        // the parameter is taken by value, so moving it into the member avoids
+        // an extra shared_ptr reference-count increment/decrement
+        JSON_ASSERT(oa);
+    }
+
+    /*!
+    @brief serialize a JSON object to BSON
+
+    @param[in] j  JSON value to serialize
+    @pre       j.type() == value_t::object
+
+    @throw type_error.317 if @a j is not an object
+    */
+    void write_bson(const BasicJsonType& j)
+    {
+        // only objects are accepted at the top level; reject everything else
+        if (j.type() != value_t::object)
+        {
+            JSON_THROW(type_error::create(317, "to serialize to BSON, top-level type must be object, but is " + std::string(j.type_name())));
+        }
+
+        write_bson_object(*j.m_value.object);
+    }
+
+    /*!
+    @brief serialize a JSON value to CBOR and write it to the output adapter
+
+    Dispatches on j.type(). Null, booleans, and numbers are written directly.
+    Strings, arrays, binary values, and objects first write a control byte
+    (with an explicit size where the size does not fit into the control byte),
+    using the smallest size width that can hold the value, followed by their
+    payload. Arrays and objects call write_cbor() recursively for their
+    elements. Types not handled by the switch produce no output.
+
+    @param[in] j  JSON value to serialize
+    */
+    void write_cbor(const BasicJsonType& j)
+    {
+        switch (j.type())
+        {
+            case value_t::null:
+            {
+                // null is the single byte 0xF6
+                oa->write_character(to_char_type(0xF6));
+                break;
+            }
+
+            case value_t::boolean:
+            {
+                // true is 0xF5, false is 0xF4
+                oa->write_character(j.m_value.boolean
+                                    ? to_char_type(0xF5)
+                                    : to_char_type(0xF4));
+                break;
+            }
+
+            case value_t::number_integer:
+            {
+                if (j.m_value.number_integer >= 0)
+                {
+                    // CBOR does not differentiate between positive signed
+                    // integers and unsigned integers. Therefore, we used the
+                    // code from the value_t::number_unsigned case here.
+                    if (j.m_value.number_integer <= 0x17)
+                    {
+                        // values 0..0x17 are written as a single byte
+                        write_number(static_cast<std::uint8_t>(j.m_value.number_integer));
+                    }
+                    else if (j.m_value.number_integer <= (std::numeric_limits<std::uint8_t>::max)())
+                    {
+                        // 0x18 followed by a 1-byte value
+                        oa->write_character(to_char_type(0x18));
+                        write_number(static_cast<std::uint8_t>(j.m_value.number_integer));
+                    }
+                    else if (j.m_value.number_integer <= (std::numeric_limits<std::uint16_t>::max)())
+                    {
+                        // 0x19 followed by a 2-byte value
+                        oa->write_character(to_char_type(0x19));
+                        write_number(static_cast<std::uint16_t>(j.m_value.number_integer));
+                    }
+                    else if (j.m_value.number_integer <= (std::numeric_limits<std::uint32_t>::max)())
+                    {
+                        // 0x1A followed by a 4-byte value
+                        oa->write_character(to_char_type(0x1A));
+                        write_number(static_cast<std::uint32_t>(j.m_value.number_integer));
+                    }
+                    else
+                    {
+                        // 0x1B followed by an 8-byte value
+                        oa->write_character(to_char_type(0x1B));
+                        write_number(static_cast<std::uint64_t>(j.m_value.number_integer));
+                    }
+                }
+                else
+                {
+                    // The conversions below encode the sign in the first
+                    // byte, and the value is converted to a positive number.
+                    const auto positive_number = -1 - j.m_value.number_integer;
+                    if (j.m_value.number_integer >= -24)
+                    {
+                        // values -24..-1 are written as the single byte
+                        // 0x20 + (-1 - value)
+                        write_number(static_cast<std::uint8_t>(0x20 + positive_number));
+                    }
+                    else if (positive_number <= (std::numeric_limits<std::uint8_t>::max)())
+                    {
+                        // 0x38 followed by a 1-byte value
+                        oa->write_character(to_char_type(0x38));
+                        write_number(static_cast<std::uint8_t>(positive_number));
+                    }
+                    else if (positive_number <= (std::numeric_limits<std::uint16_t>::max)())
+                    {
+                        // 0x39 followed by a 2-byte value
+                        oa->write_character(to_char_type(0x39));
+                        write_number(static_cast<std::uint16_t>(positive_number));
+                    }
+                    else if (positive_number <= (std::numeric_limits<std::uint32_t>::max)())
+                    {
+                        // 0x3A followed by a 4-byte value
+                        oa->write_character(to_char_type(0x3A));
+                        write_number(static_cast<std::uint32_t>(positive_number));
+                    }
+                    else
+                    {
+                        // 0x3B followed by an 8-byte value
+                        oa->write_character(to_char_type(0x3B));
+                        write_number(static_cast<std::uint64_t>(positive_number));
+                    }
+                }
+                break;
+            }
+
+            case value_t::number_unsigned:
+            {
+                // same size ladder as for non-negative signed integers above
+                if (j.m_value.number_unsigned <= 0x17)
+                {
+                    write_number(static_cast<std::uint8_t>(j.m_value.number_unsigned));
+                }
+                else if (j.m_value.number_unsigned <= (std::numeric_limits<std::uint8_t>::max)())
+                {
+                    oa->write_character(to_char_type(0x18));
+                    write_number(static_cast<std::uint8_t>(j.m_value.number_unsigned));
+                }
+                else if (j.m_value.number_unsigned <= (std::numeric_limits<std::uint16_t>::max)())
+                {
+                    oa->write_character(to_char_type(0x19));
+                    write_number(static_cast<std::uint16_t>(j.m_value.number_unsigned));
+                }
+                else if (j.m_value.number_unsigned <= (std::numeric_limits<std::uint32_t>::max)())
+                {
+                    oa->write_character(to_char_type(0x1A));
+                    write_number(static_cast<std::uint32_t>(j.m_value.number_unsigned));
+                }
+                else
+                {
+                    oa->write_character(to_char_type(0x1B));
+                    write_number(static_cast<std::uint64_t>(j.m_value.number_unsigned));
+                }
+                break;
+            }
+
+            case value_t::number_float:
+            {
+                if (std::isnan(j.m_value.number_float))
+                {
+                    // NaN is 0xf97e00 in CBOR
+                    oa->write_character(to_char_type(0xF9));
+                    oa->write_character(to_char_type(0x7E));
+                    oa->write_character(to_char_type(0x00));
+                }
+                else if (std::isinf(j.m_value.number_float))
+                {
+                    // Infinity is 0xf97c00, -Infinity is 0xf9fc00
+                    oa->write_character(to_char_type(0xf9));
+                    oa->write_character(j.m_value.number_float > 0 ? to_char_type(0x7C) : to_char_type(0xFC));
+                    oa->write_character(to_char_type(0x00));
+                }
+                else
+                {
+                    // finite values are delegated to write_compact_float(),
+                    // which is defined elsewhere in this class
+                    write_compact_float(j.m_value.number_float, detail::input_format_t::cbor);
+                }
+                break;
+            }
+
+            case value_t::string:
+            {
+                // step 1: write control byte and the string length
+                const auto N = j.m_value.string->size();
+                if (N <= 0x17)
+                {
+                    // lengths 0..0x17 are folded into the control byte
+                    write_number(static_cast<std::uint8_t>(0x60 + N));
+                }
+                else if (N <= (std::numeric_limits<std::uint8_t>::max)())
+                {
+                    oa->write_character(to_char_type(0x78));
+                    write_number(static_cast<std::uint8_t>(N));
+                }
+                else if (N <= (std::numeric_limits<std::uint16_t>::max)())
+                {
+                    oa->write_character(to_char_type(0x79));
+                    write_number(static_cast<std::uint16_t>(N));
+                }
+                else if (N <= (std::numeric_limits<std::uint32_t>::max)())
+                {
+                    oa->write_character(to_char_type(0x7A));
+                    write_number(static_cast<std::uint32_t>(N));
+                }
+                // LCOV_EXCL_START
+                else if (N <= (std::numeric_limits<std::uint64_t>::max)())
+                {
+                    oa->write_character(to_char_type(0x7B));
+                    write_number(static_cast<std::uint64_t>(N));
+                }
+                // LCOV_EXCL_STOP
+
+                // step 2: write the string
+                oa->write_characters(
+                    reinterpret_cast<const CharType*>(j.m_value.string->c_str()),
+                    j.m_value.string->size());
+                break;
+            }
+
+            case value_t::array:
+            {
+                // step 1: write control byte and the array size
+                const auto N = j.m_value.array->size();
+                if (N <= 0x17)
+                {
+                    // sizes 0..0x17 are folded into the control byte
+                    write_number(static_cast<std::uint8_t>(0x80 + N));
+                }
+                else if (N <= (std::numeric_limits<std::uint8_t>::max)())
+                {
+                    oa->write_character(to_char_type(0x98));
+                    write_number(static_cast<std::uint8_t>(N));
+                }
+                else if (N <= (std::numeric_limits<std::uint16_t>::max)())
+                {
+                    oa->write_character(to_char_type(0x99));
+                    write_number(static_cast<std::uint16_t>(N));
+                }
+                else if (N <= (std::numeric_limits<std::uint32_t>::max)())
+                {
+                    oa->write_character(to_char_type(0x9A));
+                    write_number(static_cast<std::uint32_t>(N));
+                }
+                // LCOV_EXCL_START
+                else if (N <= (std::numeric_limits<std::uint64_t>::max)())
+                {
+                    oa->write_character(to_char_type(0x9B));
+                    write_number(static_cast<std::uint64_t>(N));
+                }
+                // LCOV_EXCL_STOP
+
+                // step 2: write each element
+                for (const auto& el : *j.m_value.array)
+                {
+                    write_cbor(el);
+                }
+                break;
+            }
+
+            case value_t::binary:
+            {
+                // step 0: if a subtype is set, write the byte 0xd8 followed by
+                // the subtype before the byte string itself
+                if (j.m_value.binary->has_subtype())
+                {
+                    write_number(static_cast<std::uint8_t>(0xd8));
+                    write_number(j.m_value.binary->subtype());
+                }
+
+                // step 1: write control byte and the binary array size
+                const auto N = j.m_value.binary->size();
+                if (N <= 0x17)
+                {
+                    // sizes 0..0x17 are folded into the control byte
+                    write_number(static_cast<std::uint8_t>(0x40 + N));
+                }
+                else if (N <= (std::numeric_limits<std::uint8_t>::max)())
+                {
+                    oa->write_character(to_char_type(0x58));
+                    write_number(static_cast<std::uint8_t>(N));
+                }
+                else if (N <= (std::numeric_limits<std::uint16_t>::max)())
+                {
+                    oa->write_character(to_char_type(0x59));
+                    write_number(static_cast<std::uint16_t>(N));
+                }
+                else if (N <= (std::numeric_limits<std::uint32_t>::max)())
+                {
+                    oa->write_character(to_char_type(0x5A));
+                    write_number(static_cast<std::uint32_t>(N));
+                }
+                // LCOV_EXCL_START
+                else if (N <= (std::numeric_limits<std::uint64_t>::max)())
+                {
+                    oa->write_character(to_char_type(0x5B));
+                    write_number(static_cast<std::uint64_t>(N));
+                }
+                // LCOV_EXCL_STOP
+
+                // step 2: write each element
+                oa->write_characters(
+                    reinterpret_cast<const CharType*>(j.m_value.binary->data()),
+                    N);
+
+                break;
+            }
+
+            case value_t::object:
+            {
+                // step 1: write control byte and the object size
+                const auto N = j.m_value.object->size();
+                if (N <= 0x17)
+                {
+                    // sizes 0..0x17 are folded into the control byte
+                    write_number(static_cast<std::uint8_t>(0xA0 + N));
+                }
+                else if (N <= (std::numeric_limits<std::uint8_t>::max)())
+                {
+                    oa->write_character(to_char_type(0xB8));
+                    write_number(static_cast<std::uint8_t>(N));
+                }
+                else if (N <= (std::numeric_limits<std::uint16_t>::max)())
+                {
+                    oa->write_character(to_char_type(0xB9));
+                    write_number(static_cast<std::uint16_t>(N));
+                }
+                else if (N <= (std::numeric_limits<std::uint32_t>::max)())
+                {
+                    oa->write_character(to_char_type(0xBA));
+                    write_number(static_cast<std::uint32_t>(N));
+                }
+                // LCOV_EXCL_START
+                else if (N <= (std::numeric_limits<std::uint64_t>::max)())
+                {
+                    oa->write_character(to_char_type(0xBB));
+                    write_number(static_cast<std::uint64_t>(N));
+                }
+                // LCOV_EXCL_STOP
+
+                // step 2: write each element (key first, then value)
+                for (const auto& el : *j.m_value.object)
+                {
+                    write_cbor(el.first);
+                    write_cbor(el.second);
+                }
+                break;
+            }
+
+            default:
+                // any other type produces no output
+                break;
+        }
+    }
+
+    /*!
+    @brief serialize a JSON value to MessagePack and write it to the output
+           adapter
+
+    Dispatches on j.type(). Null, booleans, and numbers are written directly.
+    Strings, arrays, binary values, and objects first write a control byte
+    (with an explicit size where the size does not fit into the control byte),
+    using the smallest size width that can hold the value, followed by their
+    payload. Arrays and objects call write_msgpack() recursively for their
+    elements. Types not handled by the switch produce no output.
+
+    Each number branch reads only the union member that matches j.type(), so
+    no inactive union member is accessed.
+
+    @param[in] j  JSON value to serialize
+    */
+    void write_msgpack(const BasicJsonType& j)
+    {
+        switch (j.type())
+        {
+            case value_t::null: // nil
+            {
+                oa->write_character(to_char_type(0xC0));
+                break;
+            }
+
+            case value_t::boolean: // true and false
+            {
+                oa->write_character(j.m_value.boolean
+                                    ? to_char_type(0xC3)
+                                    : to_char_type(0xC2));
+                break;
+            }
+
+            case value_t::number_integer:
+            {
+                if (j.m_value.number_integer >= 0)
+                {
+                    // MessagePack does not differentiate between positive
+                    // signed integers and unsigned integers. Therefore, the
+                    // same encodings as in the value_t::number_unsigned case
+                    // are used, but the value is read from number_integer,
+                    // which is the active union member here.
+                    if (j.m_value.number_integer < 128)
+                    {
+                        // positive fixnum
+                        write_number(static_cast<std::uint8_t>(j.m_value.number_integer));
+                    }
+                    else if (j.m_value.number_integer <= (std::numeric_limits<std::uint8_t>::max)())
+                    {
+                        // uint 8
+                        oa->write_character(to_char_type(0xCC));
+                        write_number(static_cast<std::uint8_t>(j.m_value.number_integer));
+                    }
+                    else if (j.m_value.number_integer <= (std::numeric_limits<std::uint16_t>::max)())
+                    {
+                        // uint 16
+                        oa->write_character(to_char_type(0xCD));
+                        write_number(static_cast<std::uint16_t>(j.m_value.number_integer));
+                    }
+                    else if (j.m_value.number_integer <= (std::numeric_limits<std::uint32_t>::max)())
+                    {
+                        // uint 32
+                        oa->write_character(to_char_type(0xCE));
+                        write_number(static_cast<std::uint32_t>(j.m_value.number_integer));
+                    }
+                    else if (static_cast<std::uint64_t>(j.m_value.number_integer) <= (std::numeric_limits<std::uint64_t>::max)())
+                    {
+                        // uint 64 (the value is non-negative here, so the
+                        // conversion to an unsigned type is value-preserving)
+                        oa->write_character(to_char_type(0xCF));
+                        write_number(static_cast<std::uint64_t>(j.m_value.number_integer));
+                    }
+                }
+                else
+                {
+                    if (j.m_value.number_integer >= -32)
+                    {
+                        // negative fixnum
+                        write_number(static_cast<std::int8_t>(j.m_value.number_integer));
+                    }
+                    else if (j.m_value.number_integer >= (std::numeric_limits<std::int8_t>::min)() &&
+                             j.m_value.number_integer <= (std::numeric_limits<std::int8_t>::max)())
+                    {
+                        // int 8
+                        oa->write_character(to_char_type(0xD0));
+                        write_number(static_cast<std::int8_t>(j.m_value.number_integer));
+                    }
+                    else if (j.m_value.number_integer >= (std::numeric_limits<std::int16_t>::min)() &&
+                             j.m_value.number_integer <= (std::numeric_limits<std::int16_t>::max)())
+                    {
+                        // int 16
+                        oa->write_character(to_char_type(0xD1));
+                        write_number(static_cast<std::int16_t>(j.m_value.number_integer));
+                    }
+                    else if (j.m_value.number_integer >= (std::numeric_limits<std::int32_t>::min)() &&
+                             j.m_value.number_integer <= (std::numeric_limits<std::int32_t>::max)())
+                    {
+                        // int 32
+                        oa->write_character(to_char_type(0xD2));
+                        write_number(static_cast<std::int32_t>(j.m_value.number_integer));
+                    }
+                    else if (j.m_value.number_integer >= (std::numeric_limits<std::int64_t>::min)() &&
+                             j.m_value.number_integer <= (std::numeric_limits<std::int64_t>::max)())
+                    {
+                        // int 64
+                        oa->write_character(to_char_type(0xD3));
+                        write_number(static_cast<std::int64_t>(j.m_value.number_integer));
+                    }
+                }
+                break;
+            }
+
+            case value_t::number_unsigned:
+            {
+                // number_unsigned is the active union member here, so it is
+                // used for both the range checks and the written value
+                if (j.m_value.number_unsigned < 128)
+                {
+                    // positive fixnum
+                    write_number(static_cast<std::uint8_t>(j.m_value.number_unsigned));
+                }
+                else if (j.m_value.number_unsigned <= (std::numeric_limits<std::uint8_t>::max)())
+                {
+                    // uint 8
+                    oa->write_character(to_char_type(0xCC));
+                    write_number(static_cast<std::uint8_t>(j.m_value.number_unsigned));
+                }
+                else if (j.m_value.number_unsigned <= (std::numeric_limits<std::uint16_t>::max)())
+                {
+                    // uint 16
+                    oa->write_character(to_char_type(0xCD));
+                    write_number(static_cast<std::uint16_t>(j.m_value.number_unsigned));
+                }
+                else if (j.m_value.number_unsigned <= (std::numeric_limits<std::uint32_t>::max)())
+                {
+                    // uint 32
+                    oa->write_character(to_char_type(0xCE));
+                    write_number(static_cast<std::uint32_t>(j.m_value.number_unsigned));
+                }
+                else if (j.m_value.number_unsigned <= (std::numeric_limits<std::uint64_t>::max)())
+                {
+                    // uint 64
+                    oa->write_character(to_char_type(0xCF));
+                    write_number(static_cast<std::uint64_t>(j.m_value.number_unsigned));
+                }
+                break;
+            }
+
+            case value_t::number_float:
+            {
+                // delegated to write_compact_float(), which is defined
+                // elsewhere in this class
+                write_compact_float(j.m_value.number_float, detail::input_format_t::msgpack);
+                break;
+            }
+
+            case value_t::string:
+            {
+                // step 1: write control byte and the string length
+                const auto N = j.m_value.string->size();
+                if (N <= 31)
+                {
+                    // fixstr
+                    write_number(static_cast<std::uint8_t>(0xA0 | N));
+                }
+                else if (N <= (std::numeric_limits<std::uint8_t>::max)())
+                {
+                    // str 8
+                    oa->write_character(to_char_type(0xD9));
+                    write_number(static_cast<std::uint8_t>(N));
+                }
+                else if (N <= (std::numeric_limits<std::uint16_t>::max)())
+                {
+                    // str 16
+                    oa->write_character(to_char_type(0xDA));
+                    write_number(static_cast<std::uint16_t>(N));
+                }
+                else if (N <= (std::numeric_limits<std::uint32_t>::max)())
+                {
+                    // str 32
+                    oa->write_character(to_char_type(0xDB));
+                    write_number(static_cast<std::uint32_t>(N));
+                }
+
+                // step 2: write the string
+                oa->write_characters(
+                    reinterpret_cast<const CharType*>(j.m_value.string->c_str()),
+                    j.m_value.string->size());
+                break;
+            }
+
+            case value_t::array:
+            {
+                // step 1: write control byte and the array size
+                const auto N = j.m_value.array->size();
+                if (N <= 15)
+                {
+                    // fixarray
+                    write_number(static_cast<std::uint8_t>(0x90 | N));
+                }
+                else if (N <= (std::numeric_limits<std::uint16_t>::max)())
+                {
+                    // array 16
+                    oa->write_character(to_char_type(0xDC));
+                    write_number(static_cast<std::uint16_t>(N));
+                }
+                else if (N <= (std::numeric_limits<std::uint32_t>::max)())
+                {
+                    // array 32
+                    oa->write_character(to_char_type(0xDD));
+                    write_number(static_cast<std::uint32_t>(N));
+                }
+
+                // step 2: write each element
+                for (const auto& el : *j.m_value.array)
+                {
+                    write_msgpack(el);
+                }
+                break;
+            }
+
+            case value_t::binary:
+            {
+                // step 0: determine if the binary type has a set subtype to
+                // determine whether or not to use the ext or fixext types
+                const bool use_ext = j.m_value.binary->has_subtype();
+
+                // step 1: write control byte and the byte string length
+                const auto N = j.m_value.binary->size();
+                if (N <= (std::numeric_limits<std::uint8_t>::max)())
+                {
+                    std::uint8_t output_type{};
+                    // "fixed" means the control byte already implies the
+                    // length, so no separate length byte is written
+                    bool fixed = true;
+                    if (use_ext)
+                    {
+                        switch (N)
+                        {
+                            case 1:
+                                output_type = 0xD4; // fixext 1
+                                break;
+                            case 2:
+                                output_type = 0xD5; // fixext 2
+                                break;
+                            case 4:
+                                output_type = 0xD6; // fixext 4
+                                break;
+                            case 8:
+                                output_type = 0xD7; // fixext 8
+                                break;
+                            case 16:
+                                output_type = 0xD8; // fixext 16
+                                break;
+                            default:
+                                output_type = 0xC7; // ext 8
+                                fixed = false;
+                                break;
+                        }
+
+                    }
+                    else
+                    {
+                        output_type = 0xC4; // bin 8
+                        fixed = false;
+                    }
+
+                    oa->write_character(to_char_type(output_type));
+                    if (!fixed)
+                    {
+                        write_number(static_cast<std::uint8_t>(N));
+                    }
+                }
+                else if (N <= (std::numeric_limits<std::uint16_t>::max)())
+                {
+                    std::uint8_t output_type = use_ext
+                                               ? 0xC8 // ext 16
+                                               : 0xC5; // bin 16
+
+                    oa->write_character(to_char_type(output_type));
+                    write_number(static_cast<std::uint16_t>(N));
+                }
+                else if (N <= (std::numeric_limits<std::uint32_t>::max)())
+                {
+                    std::uint8_t output_type = use_ext
+                                               ? 0xC9 // ext 32
+                                               : 0xC6; // bin 32
+
+                    oa->write_character(to_char_type(output_type));
+                    write_number(static_cast<std::uint32_t>(N));
+                }
+
+                // step 1.5: if this is an ext type, write the subtype
+                if (use_ext)
+                {
+                    write_number(static_cast<std::int8_t>(j.m_value.binary->subtype()));
+                }
+
+                // step 2: write the byte string
+                oa->write_characters(
+                    reinterpret_cast<const CharType*>(j.m_value.binary->data()),
+                    N);
+
+                break;
+            }
+
+            case value_t::object:
+            {
+                // step 1: write control byte and the object size
+                const auto N = j.m_value.object->size();
+                if (N <= 15)
+                {
+                    // fixmap
+                    write_number(static_cast<std::uint8_t>(0x80 | (N & 0xF)));
+                }
+                else if (N <= (std::numeric_limits<std::uint16_t>::max)())
+                {
+                    // map 16
+                    oa->write_character(to_char_type(0xDE));
+                    write_number(static_cast<std::uint16_t>(N));
+                }
+                else if (N <= (std::numeric_limits<std::uint32_t>::max)())
+                {
+                    // map 32
+                    oa->write_character(to_char_type(0xDF));
+                    write_number(static_cast<std::uint32_t>(N));
+                }
+
+                // step 2: write each element (key first, then value)
+                for (const auto& el : *j.m_value.object)
+                {
+                    write_msgpack(el.first);
+                    write_msgpack(el.second);
+                }
+                break;
+            }
+
+            default:
+                // any other type produces no output
+                break;
+        }
+    }
+
+    /*!
+    @brief serialize a JSON value to UBJSON
+
+    @param[in] j  JSON value to serialize
+    @param[in] use_count   whether to use '#' prefixes (optimized format)
+    @param[in] use_type    whether to use '$' prefixes (optimized format);
+                           asserted to imply @a use_count for non-empty
+                           containers
+    @param[in] add_prefix  whether prefixes need to be used for this value
+    */
+    void write_ubjson(const BasicJsonType& j, const bool use_count,
+                      const bool use_type, const bool add_prefix = true)
+    {
+        switch (j.type())
+        {
+            case value_t::null:
+            {
+                if (add_prefix)
+                {
+                    oa->write_character(to_char_type('Z'));
+                }
+                break;
+            }
+
+            case value_t::boolean:
+            {
+                // booleans have no payload: the marker itself is the value
+                if (add_prefix)
+                {
+                    oa->write_character(j.m_value.boolean
+                                        ? to_char_type('T')
+                                        : to_char_type('F'));
+                }
+                break;
+            }
+
+            case value_t::number_integer:
+            {
+                write_number_with_ubjson_prefix(j.m_value.number_integer, add_prefix);
+                break;
+            }
+
+            case value_t::number_unsigned:
+            {
+                write_number_with_ubjson_prefix(j.m_value.number_unsigned, add_prefix);
+                break;
+            }
+
+            case value_t::number_float:
+            {
+                write_number_with_ubjson_prefix(j.m_value.number_float, add_prefix);
+                break;
+            }
+
+            case value_t::string:
+            {
+                if (add_prefix)
+                {
+                    oa->write_character(to_char_type('S'));
+                }
+                // the string length always carries its own type marker
+                write_number_with_ubjson_prefix(j.m_value.string->size(), true);
+                oa->write_characters(
+                    reinterpret_cast<const CharType*>(j.m_value.string->c_str()),
+                    j.m_value.string->size());
+                break;
+            }
+
+            case value_t::array:
+            {
+                if (add_prefix)
+                {
+                    oa->write_character(to_char_type('['));
+                }
+
+                // elements need their own marker unless a common '$' type is written
+                bool prefix_required = true;
+                if (use_type && !j.m_value.array->empty())
+                {
+                    JSON_ASSERT(use_count);
+                    const CharType first_prefix = ubjson_prefix(j.front());
+                    const bool same_prefix = std::all_of(j.begin() + 1, j.end(),
+                                                         [this, first_prefix](const BasicJsonType & v)
+                    {
+                        return ubjson_prefix(v) == first_prefix;
+                    });
+
+                    if (same_prefix)
+                    {
+                        prefix_required = false;
+                        oa->write_character(to_char_type('$'));
+                        oa->write_character(first_prefix);
+                    }
+                }
+
+                if (use_count)
+                {
+                    oa->write_character(to_char_type('#'));
+                    write_number_with_ubjson_prefix(j.m_value.array->size(), true);
+                }
+
+                for (const auto& el : *j.m_value.array)
+                {
+                    write_ubjson(el, use_count, use_type, prefix_required);
+                }
+
+                // a counted container has no closing marker
+                if (!use_count)
+                {
+                    oa->write_character(to_char_type(']'));
+                }
+
+                break;
+            }
+
+            case value_t::binary:
+            {
+                // binary values are written as an array of 'U' (uint8) elements
+                if (add_prefix)
+                {
+                    oa->write_character(to_char_type('['));
+                }
+
+                if (use_type && !j.m_value.binary->empty())
+                {
+                    JSON_ASSERT(use_count);
+                    oa->write_character(to_char_type('$'));
+                    oa->write_character(to_char_type('U'));
+                }
+
+                if (use_count)
+                {
+                    oa->write_character(to_char_type('#'));
+                    write_number_with_ubjson_prefix(j.m_value.binary->size(), true);
+                }
+
+                if (use_type)
+                {
+                    // common type already announced: write the raw bytes in one go
+                    oa->write_characters(
+                        reinterpret_cast<const CharType*>(j.m_value.binary->data()),
+                        j.m_value.binary->size());
+                }
+                else
+                {
+                    for (std::size_t i = 0; i < j.m_value.binary->size(); ++i)
+                    {
+                        oa->write_character(to_char_type('U'));
+                        // go through to_char_type so bytes above 127 are converted
+                        // the same way as every other byte written by this class
+                        oa->write_character(to_char_type(j.m_value.binary->data()[i]));
+                    }
+                }
+
+                if (!use_count)
+                {
+                    oa->write_character(to_char_type(']'));
+                }
+
+                break;
+            }
+
+            case value_t::object:
+            {
+                if (add_prefix)
+                {
+                    oa->write_character(to_char_type('{'));
+                }
+
+                // values need their own marker unless a common '$' type is written
+                bool prefix_required = true;
+                if (use_type && !j.m_value.object->empty())
+                {
+                    JSON_ASSERT(use_count);
+                    const CharType first_prefix = ubjson_prefix(j.front());
+                    const bool same_prefix = std::all_of(j.begin(), j.end(),
+                                                         [this, first_prefix](const BasicJsonType & v)
+                    {
+                        return ubjson_prefix(v) == first_prefix;
+                    });
+
+                    if (same_prefix)
+                    {
+                        prefix_required = false;
+                        oa->write_character(to_char_type('$'));
+                        oa->write_character(first_prefix);
+                    }
+                }
+
+                if (use_count)
+                {
+                    oa->write_character(to_char_type('#'));
+                    write_number_with_ubjson_prefix(j.m_value.object->size(), true);
+                }
+
+                for (const auto& el : *j.m_value.object)
+                {
+                    // keys are written as length + bytes, without an 'S' marker
+                    write_number_with_ubjson_prefix(el.first.size(), true);
+                    oa->write_characters(
+                        reinterpret_cast<const CharType*>(el.first.c_str()),
+                        el.first.size());
+                    write_ubjson(el.second, use_count, use_type, prefix_required);
+                }
+
+                if (!use_count)
+                {
+                    oa->write_character(to_char_type('}'));
+                }
+
+                break;
+            }
+
+            default:
+                break;
+        }
+    }
+
+  private:
+    //////////
+    // BSON //
+    //////////
+
+    /*!
+    @brief Computes the number of bytes a BSON entry header occupies
+    @param[in] name  key of the entry
+    @return one byte for the element type id, plus the length of @a name,
+            plus one byte for the key's zero-terminator
+    @throw out_of_range (id 409) if @a name contains a U+0000 character
+    */
+    static std::size_t calc_bson_entry_header_size(const string_t& name)
+    {
+        // BSON keys are zero-terminated, so an embedded zero cannot be encoded
+        const auto nul_pos = name.find(static_cast<typename string_t::value_type>(0));
+        const bool has_embedded_nul = (nul_pos != BasicJsonType::string_t::npos);
+        if (JSON_HEDLEY_UNLIKELY(has_embedded_nul))
+        {
+            JSON_THROW(out_of_range::create(409,
+                                            "BSON key cannot contain code point U+0000 (at byte " + std::to_string(nul_pos) + ")"));
+        }
+
+        // type id (1 byte) and zero-terminator (1 byte) surround the key bytes
+        return name.size() + 2u;
+    }
+
+    /*!
+    @brief Writes the BSON entry header: the @a element_type id byte followed
+           by the zero-terminated key @a name
+    @param[in] name          key of the entry
+    @param[in] element_type  BSON element type id to write
+    */
+    void write_bson_entry_header(const string_t& name,
+                                 const std::uint8_t element_type)
+    {
+        // the element type id comes first
+        oa->write_character(to_char_type(element_type));
+
+        // writing size() + 1 characters from c_str() includes the terminating zero
+        const auto* key_bytes = reinterpret_cast<const CharType*>(name.c_str());
+        oa->write_characters(key_bytes, name.size() + 1u);
+    }
+
+    /*!
+    @brief Writes a BSON boolean element (type id 0x08) with key @a name
+    @param[in] name   key of the entry
+    @param[in] value  written as a single byte: 0x01 for true, 0x00 for false
+    */
+    void write_bson_boolean(const string_t& name,
+                            const bool value)
+    {
+        write_bson_entry_header(name, 0x08);
+
+        const std::uint8_t payload = value ? 0x01 : 0x00;
+        oa->write_character(to_char_type(payload));
+    }
+
+    /*!
+    @brief Writes a BSON element with key @a name and double value @a value
+    @param[in] name   key of the entry
+    @param[in] value  floating-point payload
+    */
+    void write_bson_double(const string_t& name,
+                           const double value)
+    {
+        // 0x01 is the element type id passed to the entry header
+        write_bson_entry_header(name, 0x01);
+        // second template argument requests little-endian output
+        write_number<double, true>(value);
+    }
+
+    /*!
+    @brief Computes the number of payload bytes of a BSON string
+    @param[in] value  the string to be encoded
+    @return size of the int32 length field, plus the string bytes, plus one
+            byte for the zero-terminator
+    */
+    static std::size_t calc_bson_string_size(const string_t& value)
+    {
+        const std::size_t length_field = sizeof(std::int32_t);
+        const std::size_t terminated_text = value.size() + 1ul;
+        return length_field + terminated_text;
+    }
+
+    /*!
+    @brief Writes a BSON string element (type id 0x02) with key @a name
+    @param[in] name   key of the entry
+    @param[in] value  string payload; written as a little-endian int32 length
+                      (including the zero-terminator) followed by the
+                      zero-terminated bytes
+    */
+    void write_bson_string(const string_t& name,
+                           const string_t& value)
+    {
+        write_bson_entry_header(name, 0x02);
+
+        // the encoded length counts the trailing zero byte as well
+        const auto terminated_size = value.size() + 1ul;
+        write_number<std::int32_t, true>(static_cast<std::int32_t>(terminated_size));
+
+        const auto* text = reinterpret_cast<const CharType*>(value.c_str());
+        oa->write_characters(text, terminated_size);
+    }
+
+    /*!
+    @brief Writes a BSON element with key @a name and null value
+    @param[in] name  key of the entry
+    */
+    void write_bson_null(const string_t& name)
+    {
+        // only the header (type id 0x0A and key) is written; there is no payload
+        write_bson_entry_header(name, 0x0A);
+    }
+
+    /*!
+    @brief Computes the number of payload bytes of a BSON-encoded integer
+    @param[in] value  the integer to be encoded
+    @return sizeof(std::int32_t) if @a value lies within the int32 range,
+            sizeof(std::int64_t) otherwise
+    */
+    static std::size_t calc_bson_integer_size(const std::int64_t value)
+    {
+        const bool below_int32 = value < (std::numeric_limits<std::int32_t>::min)();
+        const bool above_int32 = value > (std::numeric_limits<std::int32_t>::max)();
+
+        if (below_int32 || above_int32)
+        {
+            return sizeof(std::int64_t);
+        }
+
+        return sizeof(std::int32_t);
+    }
+
+    /*!
+    @brief Writes a BSON integer element with key @a name
+    @param[in] name   key of the entry
+    @param[in] value  written as int32 (type id 0x10) when it lies within the
+                      int32 range, as int64 (type id 0x12) otherwise
+    */
+    void write_bson_integer(const string_t& name,
+                            const std::int64_t value)
+    {
+        const bool fits_int32 = (std::numeric_limits<std::int32_t>::min)() <= value
+                                && value <= (std::numeric_limits<std::int32_t>::max)();
+
+        if (!fits_int32)
+        {
+            write_bson_entry_header(name, 0x12); // int64
+            write_number<std::int64_t, true>(value);
+            return;
+        }
+
+        write_bson_entry_header(name, 0x10); // int32
+        write_number<std::int32_t, true>(static_cast<std::int32_t>(value));
+    }
+
+    /*!
+    @brief Computes the number of payload bytes of a BSON-encoded unsigned integer
+    @param[in] value  the unsigned integer to be encoded
+    @return sizeof(std::int64_t) if @a value exceeds the largest int32 value,
+            sizeof(std::int32_t) otherwise
+    */
+    static constexpr std::size_t calc_bson_unsigned_size(const std::uint64_t value) noexcept
+    {
+        return (value > static_cast<std::uint64_t>((std::numeric_limits<std::int32_t>::max)()))
+               ? sizeof(std::int64_t)
+               : sizeof(std::int32_t);
+    }
+
+    /*!
+    @brief Writes a BSON integer element with key @a name for an unsigned value
+    @param[in] name   key of the entry
+    @param[in] value  written as int32 (type id 0x10) if it fits, else as
+                      int64 (type id 0x12) if it fits
+    @throw out_of_range (id 407) if @a value exceeds the largest int64 value
+    */
+    void write_bson_unsigned(const string_t& name,
+                             const std::uint64_t value)
+    {
+        const auto int32_limit = static_cast<std::uint64_t>((std::numeric_limits<std::int32_t>::max)());
+        const auto int64_limit = static_cast<std::uint64_t>((std::numeric_limits<std::int64_t>::max)());
+
+        if (value > int64_limit)
+        {
+            // BSON only has signed integer types, so this value has no encoding
+            JSON_THROW(out_of_range::create(407, "integer number " + std::to_string(value) + " cannot be represented by BSON as it does not fit int64"));
+        }
+        else if (value <= int32_limit)
+        {
+            write_bson_entry_header(name, 0x10 /* int32 */);
+            write_number<std::int32_t, true>(static_cast<std::int32_t>(value));
+        }
+        else
+        {
+            write_bson_entry_header(name, 0x12 /* int64 */);
+            write_number<std::int64_t, true>(static_cast<std::int64_t>(value));
+        }
+    }
+
+    /*!
+    @brief Writes a BSON element with key @a name and object @a value
+    @param[in] name   key of the entry
+    @param[in] value  object written after the header via write_bson_object
+    */
+    void write_bson_object_entry(const string_t& name,
+                                 const typename BasicJsonType::object_t& value)
+    {
+        write_bson_entry_header(name, 0x03); // object
+        write_bson_object(value);
+    }
+
+    /*!
+    @brief Computes the number of payload bytes of a BSON-encoded array
+    @param[in] value  the array to be encoded
+    @return size of the int32 length field, plus the sizes of all elements
+            (each keyed by its decimal index), plus one terminating byte
+    */
+    static std::size_t calc_bson_array_size(const typename BasicJsonType::array_t& value)
+    {
+        std::size_t embedded_document_size = 0;
+        std::size_t array_index = 0ul;
+
+        // every element is sized like a document entry whose key is its index
+        for (const auto& el : value)
+        {
+            embedded_document_size += calc_bson_element_size(std::to_string(array_index), el);
+            ++array_index;
+        }
+
+        return sizeof(std::int32_t) + embedded_document_size + 1ul;
+    }
+
+    /*!
+    @brief Computes the number of payload bytes of a BSON-encoded binary value
+    @param[in] value  the binary value to be encoded
+    @return size of the int32 length field, plus one byte, plus the data bytes
+    */
+    static std::size_t calc_bson_binary_size(const typename BasicJsonType::binary_t& value)
+    {
+        // int32 length field plus one extra byte precede the data
+        // (write_bson_binary emits one subtype byte after the length)
+        const std::size_t fixed_part = sizeof(std::int32_t) + 1ul;
+        return fixed_part + value.size();
+    }
+
+    /*!
+    @brief Writes a BSON array element (type id 0x04) with key @a name
+    @param[in] name   key of the entry
+    @param[in] value  array payload; each element is written as an entry
+                      whose key is its decimal index, followed by a
+                      terminating zero byte
+    */
+    void write_bson_array(const string_t& name,
+                          const typename BasicJsonType::array_t& value)
+    {
+        write_bson_entry_header(name, 0x04); // array
+
+        const auto encoded_size = calc_bson_array_size(value);
+        write_number<std::int32_t, true>(static_cast<std::int32_t>(encoded_size));
+
+        std::size_t array_index = 0ul;
+        for (auto it = value.begin(); it != value.end(); ++it, ++array_index)
+        {
+            write_bson_element(std::to_string(array_index), *it);
+        }
+
+        oa->write_character(to_char_type(0x00));
+    }
+
+    /*!
+    @brief Writes a BSON binary element (type id 0x05) with key @a name
+    @param[in] name   key of the entry
+    @param[in] value  binary payload; written as a little-endian int32 byte
+                      count, one subtype byte (0x00 if no subtype is set),
+                      and the raw data
+    */
+    void write_bson_binary(const string_t& name,
+                           const binary_t& value)
+    {
+        write_bson_entry_header(name, 0x05);
+
+        const auto byte_count = value.size();
+        write_number<std::int32_t, true>(static_cast<std::int32_t>(byte_count));
+
+        // fall back to subtype 0x00 when the value carries none
+        const auto subtype_byte = value.has_subtype() ? value.subtype() : std::uint8_t(0x00);
+        write_number(subtype_byte);
+
+        const auto* raw = reinterpret_cast<const CharType*>(value.data());
+        oa->write_characters(raw, byte_count);
+    }
+
+    /*!
+    @brief Calculates the size necessary to serialize the JSON value @a j with its @a name
+    @param[in] name  key of the entry
+    @param[in] j     JSON value stored under @a name
+    @return size of the entry header for @a name plus the payload size that
+            corresponds to the type of @a j
+    */
+    static std::size_t calc_bson_element_size(const string_t& name,
+            const BasicJsonType& j)
+    {
+        // computed first, so an invalid key is reported before looking at the value
+        const auto header_size = calc_bson_entry_header_size(name);
+
+        std::size_t payload_size = 0ul;
+        switch (j.type())
+        {
+            case value_t::object:
+                payload_size = calc_bson_object_size(*j.m_value.object);
+                break;
+
+            case value_t::array:
+                payload_size = calc_bson_array_size(*j.m_value.array);
+                break;
+
+            case value_t::binary:
+                payload_size = calc_bson_binary_size(*j.m_value.binary);
+                break;
+
+            case value_t::boolean:
+                payload_size = 1ul;
+                break;
+
+            case value_t::number_float:
+                payload_size = 8ul;
+                break;
+
+            case value_t::number_integer:
+                payload_size = calc_bson_integer_size(j.m_value.number_integer);
+                break;
+
+            case value_t::number_unsigned:
+                payload_size = calc_bson_unsigned_size(j.m_value.number_unsigned);
+                break;
+
+            case value_t::string:
+                payload_size = calc_bson_string_size(*j.m_value.string);
+                break;
+
+            case value_t::null:
+                // null has no payload
+                break;
+
+            // LCOV_EXCL_START
+            default:
+                JSON_ASSERT(false);
+                return 0ul;
+                // LCOV_EXCL_STOP
+        }
+
+        return header_size + payload_size;
+    }
+
+    /*!
+    @brief Serializes the JSON value @a j to BSON and associates it with the
+           key @a name.
+    @param name The name to associate with the JSON entity @a j within the
+                current BSON document
+    @param j    The JSON value to write; its type selects the writer used
+    */
+    void write_bson_element(const string_t& name,
+                            const BasicJsonType& j)
+    {
+        switch (j.type())
+        {
+            case value_t::object:
+                write_bson_object_entry(name, *j.m_value.object);
+                break;
+
+            case value_t::array:
+                write_bson_array(name, *j.m_value.array);
+                break;
+
+            case value_t::binary:
+                write_bson_binary(name, *j.m_value.binary);
+                break;
+
+            case value_t::boolean:
+                write_bson_boolean(name, j.m_value.boolean);
+                break;
+
+            case value_t::number_float:
+                write_bson_double(name, j.m_value.number_float);
+                break;
+
+            case value_t::number_integer:
+                write_bson_integer(name, j.m_value.number_integer);
+                break;
+
+            case value_t::number_unsigned:
+                write_bson_unsigned(name, j.m_value.number_unsigned);
+                break;
+
+            case value_t::string:
+                write_bson_string(name, *j.m_value.string);
+                break;
+
+            case value_t::null:
+                write_bson_null(name);
+                break;
+
+            // LCOV_EXCL_START
+            default:
+                JSON_ASSERT(false);
+                break;
+                // LCOV_EXCL_STOP
+        }
+    }
+
+    /*!
+    @brief Calculates the size of the BSON serialization of the given
+           JSON object @a value.
+    @param[in] value  object to be serialized
+    @return size of the int32 length field, plus the sizes of all entries,
+            plus one terminating byte
+    */
+    static std::size_t calc_bson_object_size(const typename BasicJsonType::object_t& value)
+    {
+        std::size_t document_size = 0;
+
+        for (const auto& el : value)
+        {
+            document_size += calc_bson_element_size(el.first, el.second);
+        }
+
+        return sizeof(std::int32_t) + document_size + 1ul;
+    }
+
+    /*!
+    @brief Writes a BSON document: little-endian int32 size, all entries of
+           @a value, and a terminating zero byte
+    @param[in] value  object to serialize
+    */
+    void write_bson_object(const typename BasicJsonType::object_t& value)
+    {
+        // the size is computed up front, before anything is written
+        const auto document_size = calc_bson_object_size(value);
+        write_number<std::int32_t, true>(static_cast<std::int32_t>(document_size));
+
+        for (auto it = value.begin(); it != value.end(); ++it)
+        {
+            write_bson_element(it->first, it->second);
+        }
+
+        oa->write_character(to_char_type(0x00));
+    }
+
+    //////////
+    // CBOR //
+    //////////
+
+    /// @return the CBOR initial byte for a value of type float; the argument
+    ///         only selects the overload and its value is not used
+    static constexpr CharType get_cbor_float_prefix(float /*unused*/)
+    {
+        return to_char_type(0xFA);  // Single-Precision Float
+    }
+
+    /// @return the CBOR initial byte for a value of type double; the argument
+    ///         only selects the overload and its value is not used
+    static constexpr CharType get_cbor_float_prefix(double /*unused*/)
+    {
+        return to_char_type(0xFB);  // Double-Precision Float
+    }
+
+    /////////////
+    // MsgPack //
+    /////////////
+
+    /// @return the MessagePack type byte for a value of type float; the
+    ///         argument only selects the overload and its value is not used
+    static constexpr CharType get_msgpack_float_prefix(float /*unused*/)
+    {
+        return to_char_type(0xCA);  // float 32
+    }
+
+    /// @return the MessagePack type byte for a value of type double; the
+    ///         argument only selects the overload and its value is not used
+    static constexpr CharType get_msgpack_float_prefix(double /*unused*/)
+    {
+        return to_char_type(0xCB);  // float 64
+    }
+
+    ////////////
+    // UBJSON //
+    ////////////
+
+    // UBJSON: write number (floating point)
+    // Selected when NumberType is a floating-point type. Writes the type
+    // marker chosen by get_ubjson_float_prefix (only if add_prefix is set),
+    // followed by the bytes of n.
+    template<typename NumberType, typename std::enable_if<
+                 std::is_floating_point<NumberType>::value, int>::type = 0>
+    void write_number_with_ubjson_prefix(const NumberType n,
+                                         const bool add_prefix)
+    {
+        if (add_prefix)
+        {
+            oa->write_character(get_ubjson_float_prefix(n));
+        }
+        write_number(n);
+    }
+
+    // UBJSON: write number (unsigned integer)
+    // Selected when NumberType is an unsigned type. The branches are tested in
+    // ascending order of range, so n is written with the first type whose
+    // upper bound it does not exceed. The type marker is only written if
+    // add_prefix is set; the value bytes are always written.
+    template<typename NumberType, typename std::enable_if<
+                 std::is_unsigned<NumberType>::value, int>::type = 0>
+    void write_number_with_ubjson_prefix(const NumberType n,
+                                         const bool add_prefix)
+    {
+        if (n <= static_cast<std::uint64_t>((std::numeric_limits<std::int8_t>::max)()))
+        {
+            if (add_prefix)
+            {
+                oa->write_character(to_char_type('i'));  // int8
+            }
+            write_number(static_cast<std::uint8_t>(n));
+        }
+        else if (n <= (std::numeric_limits<std::uint8_t>::max)())
+        {
+            if (add_prefix)
+            {
+                oa->write_character(to_char_type('U'));  // uint8
+            }
+            write_number(static_cast<std::uint8_t>(n));
+        }
+        else if (n <= static_cast<std::uint64_t>((std::numeric_limits<std::int16_t>::max)()))
+        {
+            if (add_prefix)
+            {
+                oa->write_character(to_char_type('I'));  // int16
+            }
+            write_number(static_cast<std::int16_t>(n));
+        }
+        else if (n <= static_cast<std::uint64_t>((std::numeric_limits<std::int32_t>::max)()))
+        {
+            if (add_prefix)
+            {
+                oa->write_character(to_char_type('l'));  // int32
+            }
+            write_number(static_cast<std::int32_t>(n));
+        }
+        else if (n <= static_cast<std::uint64_t>((std::numeric_limits<std::int64_t>::max)()))
+        {
+            if (add_prefix)
+            {
+                oa->write_character(to_char_type('L'));  // int64
+            }
+            write_number(static_cast<std::int64_t>(n));
+        }
+        else
+        {
+            // n exceeds the int64 range: write it as text
+            if (add_prefix)
+            {
+                oa->write_character(to_char_type('H'));  // high-precision number
+            }
+
+            // the text produced by dump() is written as length (with its own
+            // type marker) followed by the characters
+            const auto number = BasicJsonType(n).dump();
+            write_number_with_ubjson_prefix(number.size(), true);
+            for (std::size_t i = 0; i < number.size(); ++i)
+            {
+                oa->write_character(to_char_type(static_cast<std::uint8_t>(number[i])));
+            }
+        }
+    }
+
+    // UBJSON: write number (signed integer)
+    // Selected when NumberType is signed and not a floating-point type. The
+    // branches are tested in ascending order of range, so n is written with
+    // the first type whose range contains it. The type marker is only written
+    // if add_prefix is set; the value bytes are always written.
+    template < typename NumberType, typename std::enable_if <
+                   std::is_signed<NumberType>::value&&
+                   !std::is_floating_point<NumberType>::value, int >::type = 0 >
+    void write_number_with_ubjson_prefix(const NumberType n,
+                                         const bool add_prefix)
+    {
+        if ((std::numeric_limits<std::int8_t>::min)() <= n && n <= (std::numeric_limits<std::int8_t>::max)())
+        {
+            if (add_prefix)
+            {
+                oa->write_character(to_char_type('i'));  // int8
+            }
+            write_number(static_cast<std::int8_t>(n));
+        }
+        else if (static_cast<std::int64_t>((std::numeric_limits<std::uint8_t>::min)()) <= n && n <= static_cast<std::int64_t>((std::numeric_limits<std::uint8_t>::max)()))
+        {
+            // only reached for values above the int8 maximum, as smaller
+            // non-negative values were handled by the branch above
+            if (add_prefix)
+            {
+                oa->write_character(to_char_type('U'));  // uint8
+            }
+            write_number(static_cast<std::uint8_t>(n));
+        }
+        else if ((std::numeric_limits<std::int16_t>::min)() <= n && n <= (std::numeric_limits<std::int16_t>::max)())
+        {
+            if (add_prefix)
+            {
+                oa->write_character(to_char_type('I'));  // int16
+            }
+            write_number(static_cast<std::int16_t>(n));
+        }
+        else if ((std::numeric_limits<std::int32_t>::min)() <= n && n <= (std::numeric_limits<std::int32_t>::max)())
+        {
+            if (add_prefix)
+            {
+                oa->write_character(to_char_type('l'));  // int32
+            }
+            write_number(static_cast<std::int32_t>(n));
+        }
+        else if ((std::numeric_limits<std::int64_t>::min)() <= n && n <= (std::numeric_limits<std::int64_t>::max)())
+        {
+            if (add_prefix)
+            {
+                oa->write_character(to_char_type('L'));  // int64
+            }
+            write_number(static_cast<std::int64_t>(n));
+        }
+        // NOTE(review): this branch looks unreachable while NumberType is at
+        // most 64 bits wide, since the previous branch covers the whole int64
+        // range; it is excluded from coverage -- confirm
+        // LCOV_EXCL_START
+        else
+        {
+            if (add_prefix)
+            {
+                oa->write_character(to_char_type('H'));  // high-precision number
+            }
+
+            // the text produced by dump() is written as length (with its own
+            // type marker) followed by the characters
+            const auto number = BasicJsonType(n).dump();
+            write_number_with_ubjson_prefix(number.size(), true);
+            for (std::size_t i = 0; i < number.size(); ++i)
+            {
+                oa->write_character(to_char_type(static_cast<std::uint8_t>(number[i])));
+            }
+        }
+        // LCOV_EXCL_STOP
+    }
+
+    /*!
+    @brief determine the type prefix of container values
+
+    The integer range checks mirror the ones in the integer overloads of
+    write_number_with_ubjson_prefix, so the marker returned here is the one
+    those functions write for the same value.
+
+    @param[in] j  JSON value whose UBJSON type marker is requested
+    @return the type marker character for @a j; 'N' for any type not listed
+    */
+    CharType ubjson_prefix(const BasicJsonType& j) const noexcept
+    {
+        switch (j.type())
+        {
+            case value_t::null:
+                return 'Z';
+
+            case value_t::boolean:
+                return j.m_value.boolean ? 'T' : 'F';
+
+            case value_t::number_integer:
+            {
+                // ranges are tested smallest first; the first match wins
+                if ((std::numeric_limits<std::int8_t>::min)() <= j.m_value.number_integer && j.m_value.number_integer <= (std::numeric_limits<std::int8_t>::max)())
+                {
+                    return 'i';
+                }
+                if ((std::numeric_limits<std::uint8_t>::min)() <= j.m_value.number_integer && j.m_value.number_integer <= (std::numeric_limits<std::uint8_t>::max)())
+                {
+                    return 'U';
+                }
+                if ((std::numeric_limits<std::int16_t>::min)() <= j.m_value.number_integer && j.m_value.number_integer <= (std::numeric_limits<std::int16_t>::max)())
+                {
+                    return 'I';
+                }
+                if ((std::numeric_limits<std::int32_t>::min)() <= j.m_value.number_integer && j.m_value.number_integer <= (std::numeric_limits<std::int32_t>::max)())
+                {
+                    return 'l';
+                }
+                if ((std::numeric_limits<std::int64_t>::min)() <= j.m_value.number_integer && j.m_value.number_integer <= (std::numeric_limits<std::int64_t>::max)())
+                {
+                    return 'L';
+                }
+                // anything else is treated as high-precision number
+                return 'H'; // LCOV_EXCL_LINE
+            }
+
+            case value_t::number_unsigned:
+            {
+                // ranges are tested smallest first; the first match wins
+                if (j.m_value.number_unsigned <= static_cast<std::uint64_t>((std::numeric_limits<std::int8_t>::max)()))
+                {
+                    return 'i';
+                }
+                if (j.m_value.number_unsigned <= static_cast<std::uint64_t>((std::numeric_limits<std::uint8_t>::max)()))
+                {
+                    return 'U';
+                }
+                if (j.m_value.number_unsigned <= static_cast<std::uint64_t>((std::numeric_limits<std::int16_t>::max)()))
+                {
+                    return 'I';
+                }
+                if (j.m_value.number_unsigned <= static_cast<std::uint64_t>((std::numeric_limits<std::int32_t>::max)()))
+                {
+                    return 'l';
+                }
+                if (j.m_value.number_unsigned <= static_cast<std::uint64_t>((std::numeric_limits<std::int64_t>::max)()))
+                {
+                    return 'L';
+                }
+                // anything else is treated as high-precision number
+                return 'H'; // LCOV_EXCL_LINE
+            }
+
+            case value_t::number_float:
+                return get_ubjson_float_prefix(j.m_value.number_float);
+
+            case value_t::string:
+                return 'S';
+
+            // binary values share the array marker
+            case value_t::array: // fallthrough
+            case value_t::binary:
+                return '[';
+
+            case value_t::object:
+                return '{';
+
+            default:  // discarded values
+                return 'N';
+        }
+    }
+
+    /// @return the UBJSON type marker for a value of type float; the argument
+    ///         only selects the overload and its value is not used
+    static constexpr CharType get_ubjson_float_prefix(float /*unused*/)
+    {
+        return 'd';  // float 32
+    }
+
+    /// @return the UBJSON type marker for a value of type double; the argument
+    ///         only selects the overload and its value is not used
+    static constexpr CharType get_ubjson_float_prefix(double /*unused*/)
+    {
+        return 'D';  // float 64
+    }
+
+    ///////////////////////
+    // Utility functions //
+    ///////////////////////
+
+    /*!
+    @brief write a number to the output adapter
+    @param[in] n number of type @a NumberType
+    @tparam NumberType the type of the number
+    @tparam OutputIsLittleEndian Set to true if output data is
+                                 required to be little endian
+
+    @note This function needs to respect the system's endianness, because bytes
+          in CBOR, MessagePack, and UBJSON are stored in network order (big
+          endian) and therefore need reordering on little endian systems.
+          The BSON writers in this class request little endian output instead.
+    */
+    template<typename NumberType, bool OutputIsLittleEndian = false>
+    void write_number(const NumberType n)
+    {
+        // step 1: copy the object representation of n into a byte array of
+        // sizeof(NumberType) elements
+        std::array<CharType, sizeof(NumberType)> vec;
+        std::memcpy(vec.data(), &n, sizeof(NumberType));
+
+        // step 2: write array to output (with possible reordering)
+        if (is_little_endian != OutputIsLittleEndian)
+        {
+            // host byte order differs from the requested output order
+            std::reverse(vec.begin(), vec.end());
+        }
+
+        oa->write_characters(vec.data(), sizeof(NumberType));
+    }
+
+    /*!
+    @brief write a floating-point number in the smallest lossless width
+    @param[in] n       number to write
+    @param[in] format  selects the CBOR prefixes if equal to
+                       input_format_t::cbor, the MessagePack prefixes otherwise
+
+    The number is written as float if it lies within the finite float range
+    and survives a round trip through float unchanged; otherwise it is written
+    with its original type.
+    */
+    void write_compact_float(const number_float_t n, detail::input_format_t format)
+    {
+        const bool is_cbor = (format == detail::input_format_t::cbor);
+        const auto as_double = static_cast<double>(n);
+
+        // the range checks come first so that the narrowing conversion is only
+        // evaluated for values a float can hold
+        const bool fits_float = as_double >= static_cast<double>(std::numeric_limits<float>::lowest())
+                                && as_double <= static_cast<double>((std::numeric_limits<float>::max)())
+                                && static_cast<double>(static_cast<float>(n)) == as_double;
+
+        if (fits_float)
+        {
+            const auto narrowed = static_cast<float>(n);
+            oa->write_character(is_cbor
+                                ? get_cbor_float_prefix(narrowed)
+                                : get_msgpack_float_prefix(narrowed));
+            write_number(narrowed);
+        }
+        else
+        {
+            oa->write_character(is_cbor
+                                ? get_cbor_float_prefix(n)
+                                : get_msgpack_float_prefix(n));
+            write_number(n);
+        }
+    }
+
+  public:
+    // The following to_char_type functions implement the conversion
+    // between uint8_t and CharType. In case CharType is not unsigned,
+    // such a conversion is required to allow values above 127.
+    // See <https://github.com/nlohmann/json/issues/1286> for a discussion.
+
+    // CharType is signed and plain char is signed: reinterpret the byte as char
+    template < typename C = CharType,
+               enable_if_t < std::is_signed<C>::value && std::is_signed<char>::value > * = nullptr >
+    static constexpr CharType to_char_type(std::uint8_t x) noexcept
+    {
+        return *reinterpret_cast<char*>(&x);
+    }
+
+    // CharType is signed but plain char is unsigned: copy the byte into a
+    // CharType object (requires both to have the same size)
+    template < typename C = CharType,
+               enable_if_t < std::is_signed<C>::value && std::is_unsigned<char>::value > * = nullptr >
+    static CharType to_char_type(std::uint8_t x) noexcept
+    {
+        static_assert(sizeof(std::uint8_t) == sizeof(CharType), "size of CharType must be equal to std::uint8_t");
+        static_assert(std::is_trivial<CharType>::value, "CharType must be trivial");
+        CharType result;
+        std::memcpy(&result, &x, sizeof(x));
+        return result;
+    }
+
+    // CharType is unsigned: the byte is returned unchanged
+    template<typename C = CharType,
+             enable_if_t<std::is_unsigned<C>::value>* = nullptr>
+    static constexpr CharType to_char_type(std::uint8_t x) noexcept
+    {
+        return x;
+    }
+
+    // input is already a (possibly cv-qualified) char while CharType and plain
+    // char are both signed: the character is returned unchanged
+    template < typename InputCharType, typename C = CharType,
+               enable_if_t <
+                   std::is_signed<C>::value &&
+                   std::is_signed<char>::value &&
+                   std::is_same<char, typename std::remove_cv<InputCharType>::type>::value
+                   > * = nullptr >
+    static constexpr CharType to_char_type(InputCharType x) noexcept
+    {
+        return x;
+    }
+
+  private:
+    /// whether we can assume little endianness
+    const bool is_little_endian = little_endianess();
+
+    /// the output
+    output_adapter_t<CharType> oa = nullptr;
+};
+}  // namespace detail
+}  // namespace nlohmann
+
+// #include <nlohmann/detail/output/output_adapters.hpp>
+
+// #include <nlohmann/detail/output/serializer.hpp>
+
+
+#include <algorithm> // reverse, remove, fill, find, none_of
+#include <array> // array
+#include <clocale> // localeconv, lconv
+#include <cmath> // labs, isfinite, isnan, signbit
+#include <cstddef> // size_t, ptrdiff_t
+#include <cstdint> // uint8_t
+#include <cstdio> // snprintf
+#include <limits> // numeric_limits
+#include <string> // string, char_traits
+#include <type_traits> // is_same
+#include <utility> // move
+
+// #include <nlohmann/detail/conversions/to_chars.hpp>
+
+
+#include <array> // array
+#include <cmath>   // signbit, isfinite
+#include <cstdint> // intN_t, uintN_t
+#include <cstring> // memcpy, memmove
+#include <limits> // numeric_limits
+#include <type_traits> // conditional
+
+// #include <nlohmann/detail/macro_scope.hpp>
+
+
+namespace nlohmann
+{
+namespace detail
+{
+
+/*!
+@brief implements the Grisu2 algorithm for binary to decimal floating-point
+conversion.
+
+This implementation is a slightly modified version of the reference
+implementation which may be obtained from
+http://florian.loitsch.com/publications (bench.tar.gz).
+
+The code is distributed under the MIT license, Copyright (c) 2009 Florian Loitsch.
+
+For a detailed description of the algorithm see:
+
+[1] Loitsch, "Printing Floating-Point Numbers Quickly and Accurately with
+    Integers", Proceedings of the ACM SIGPLAN 2010 Conference on Programming
+    Language Design and Implementation, PLDI 2010
+[2] Burger, Dybvig, "Printing Floating-Point Numbers Quickly and Accurately",
+    Proceedings of the ACM SIGPLAN 1996 Conference on Programming Language
+    Design and Implementation, PLDI 1996
+*/
+namespace dtoa_impl
+{
+
+template<typename Target, typename Source>
+Target reinterpret_bits(const Source source)
+{
+    // bit-copy 'source' into a Target object; both types must occupy the same number of bytes
+    static_assert(sizeof(Source) == sizeof(Target), "size mismatch");
+    Target copied_bits;
+    std::memcpy(&copied_bits, &source, sizeof(copied_bits));
+    return copied_bits;
+}
+
+struct diyfp // f * 2^e
+{
+    static constexpr int kPrecision = 64; // = q
+
+    std::uint64_t f = 0; // significand
+    int e = 0;           // binary exponent
+
+    constexpr diyfp(std::uint64_t f_, int e_) noexcept : f(f_), e(e_) {}
+
+    /*!
+    @brief returns x - y
+    @pre x.e == y.e and x.f >= y.f
+    */
+    static diyfp sub(const diyfp& x, const diyfp& y) noexcept
+    {
+        JSON_ASSERT(x.e == y.e);
+        JSON_ASSERT(x.f >= y.f);
+
+        return {x.f - y.f, x.e};
+    }
+
+    /*!
+    @brief returns x * y
+    @note The result is rounded. (Only the upper q bits are returned.)
+    */
+    static diyfp mul(const diyfp& x, const diyfp& y) noexcept
+    {
+        static_assert(kPrecision == 64, "internal error");
+
+        // Computes:
+        //  f = round((x.f * y.f) / 2^q)
+        //  e = x.e + y.e + q
+
+        // Emulate the 64-bit * 64-bit multiplication:
+        //
+        // p = u * v
+        //   = (u_lo + 2^32 u_hi) (v_lo + 2^32 v_hi)
+        //   = (u_lo v_lo         ) + 2^32 ((u_lo v_hi         ) + (u_hi v_lo         )) + 2^64 (u_hi v_hi         )
+        //   = (p0                ) + 2^32 ((p1                ) + (p2                )) + 2^64 (p3                )
+        //   = (p0_lo + 2^32 p0_hi) + 2^32 ((p1_lo + 2^32 p1_hi) + (p2_lo + 2^32 p2_hi)) + 2^64 (p3                )
+        //   = (p0_lo             ) + 2^32 (p0_hi + p1_lo + p2_lo                      ) + 2^64 (p1_hi + p2_hi + p3)
+        //   = (p0_lo             ) + 2^32 (Q                                          ) + 2^64 (H                 )
+        //   = (p0_lo             ) + 2^32 (Q_lo + 2^32 Q_hi                           ) + 2^64 (H                 )
+        //
+        // (Since Q might be larger than 2^32 - 1)
+        //
+        //   = (p0_lo + 2^32 Q_lo) + 2^64 (Q_hi + H)
+        //
+        // (Q_hi + H does not overflow a 64-bit int)
+        //
+        //   = p_lo + 2^64 p_hi
+
+        const std::uint64_t u_lo = x.f & 0xFFFFFFFFu; // split both significands into 32-bit halves
+        const std::uint64_t u_hi = x.f >> 32u;
+        const std::uint64_t v_lo = y.f & 0xFFFFFFFFu;
+        const std::uint64_t v_hi = y.f >> 32u;
+
+        const std::uint64_t p0 = u_lo * v_lo; // the four 32x32 -> 64 bit partial products
+        const std::uint64_t p1 = u_lo * v_hi;
+        const std::uint64_t p2 = u_hi * v_lo;
+        const std::uint64_t p3 = u_hi * v_hi;
+
+        const std::uint64_t p0_hi = p0 >> 32u;
+        const std::uint64_t p1_lo = p1 & 0xFFFFFFFFu;
+        const std::uint64_t p1_hi = p1 >> 32u;
+        const std::uint64_t p2_lo = p2 & 0xFFFFFFFFu;
+        const std::uint64_t p2_hi = p2 >> 32u;
+
+        std::uint64_t Q = p0_hi + p1_lo + p2_lo; // middle column of the schoolbook product (may exceed 32 bits)
+
+        // The full product might now be computed as
+        //
+        // p_hi = p3 + p2_hi + p1_hi + (Q >> 32)
+        // p_lo = p0_lo + (Q << 32)
+        //
+        // But in this particular case here, the full p_lo is not required.
+        // Effectively we only need to add the highest bit in p_lo to p_hi (and
+        // Q_hi + 1 does not overflow).
+
+        Q += std::uint64_t{1} << (64u - 32u - 1u); // round, ties up
+
+        const std::uint64_t h = p3 + p2_hi + p1_hi + (Q >> 32u); // upper 64 bits of the rounded product
+
+        return {h, x.e + y.e + 64};
+    }
+
+    /*!
+    @brief normalize x such that the significand is >= 2^(q-1)
+    @pre x.f != 0
+    */
+    static diyfp normalize(diyfp x) noexcept
+    {
+        JSON_ASSERT(x.f != 0);
+
+        while ((x.f >> 63u) == 0) // until the most significant bit is set
+        {
+            x.f <<= 1u;
+            x.e--; // compensate the shift so that f * 2^e stays the same
+        }
+
+        return x;
+    }
+
+    /*!
+    @brief rescale x such that the result has the exponent target_exponent
+    @pre x.e >= target_exponent and the upper x.e - target_exponent bits of x.f must be zero.
+    */
+    static diyfp normalize_to(const diyfp& x, const int target_exponent) noexcept
+    {
+        const int delta = x.e - target_exponent; // number of bits the significand is shifted left
+
+        JSON_ASSERT(delta >= 0);
+        JSON_ASSERT(((x.f << delta) >> delta) == x.f); // the shift must not drop any set bit
+
+        return {x.f << delta, target_exponent};
+    }
+};
+
+struct boundaries
+{
+    diyfp w;     // the value itself
+    diyfp minus; // lower boundary m-
+    diyfp plus;  // upper boundary m+
+};
+
+/*!
+Compute the (normalized) diyfp representing the input number 'value' and its
+boundaries.
+
+@pre value must be finite and positive
+*/
+template<typename FloatType>
+boundaries compute_boundaries(FloatType value)
+{
+    JSON_ASSERT(std::isfinite(value));
+    JSON_ASSERT(value > 0);
+
+    // Convert the IEEE representation into a diyfp.
+    //
+    // If v is denormal:
+    //      value = 0.F * 2^(1 - bias) = (          F) * 2^(1 - bias - (p-1))
+    // If v is normalized:
+    //      value = 1.F * 2^(E - bias) = (2^(p-1) + F) * 2^(E - bias - (p-1))
+
+    static_assert(std::numeric_limits<FloatType>::is_iec559,
+                  "internal error: dtoa_short requires an IEEE-754 floating-point implementation");
+
+    constexpr int      kPrecision = std::numeric_limits<FloatType>::digits; // = p (includes the hidden bit)
+    constexpr int      kBias      = std::numeric_limits<FloatType>::max_exponent - 1 + (kPrecision - 1); // = bias + (p-1) in the formulas above
+    constexpr int      kMinExp    = 1 - kBias;
+    constexpr std::uint64_t kHiddenBit = std::uint64_t{1} << (kPrecision - 1); // = 2^(p-1)
+
+    using bits_type = typename std::conditional<kPrecision == 24, std::uint32_t, std::uint64_t >::type;
+
+    const std::uint64_t bits = reinterpret_bits<bits_type>(value);
+    const std::uint64_t E = bits >> (kPrecision - 1); // biased exponent (value > 0, so no sign bit is set above it)
+    const std::uint64_t F = bits & (kHiddenBit - 1);  // stored fraction bits
+
+    const bool is_denormal = E == 0;
+    const diyfp v = is_denormal
+                    ? diyfp(F, kMinExp)
+                    : diyfp(F + kHiddenBit, static_cast<int>(E) - kBias);
+
+    // Compute the boundaries m- and m+ of the floating-point value
+    // v = f * 2^e.
+    //
+    // Determine v- and v+, the floating-point predecessor and successor of v,
+    // respectively.
+    //
+    //      v- = v - 2^e        if f != 2^(p-1) or e == e_min                (A)
+    //         = v - 2^(e-1)    if f == 2^(p-1) and e > e_min                (B)
+    //
+    //      v+ = v + 2^e
+    //
+    // Let m- = (v- + v) / 2 and m+ = (v + v+) / 2. All real numbers _strictly_
+    // between m- and m+ round to v, regardless of how the input rounding
+    // algorithm breaks ties.
+    //
+    //      ---+-------------+-------------+-------------+-------------+---  (A)
+    //         v-            m-            v             m+            v+
+    //
+    //      -----------------+------+------+-------------+-------------+---  (B)
+    //                       v-     m-     v             m+            v+
+
+    const bool lower_boundary_is_closer = F == 0 && E > 1; // case (B): f == 2^(p-1) and e > e_min
+    const diyfp m_plus = diyfp(2 * v.f + 1, v.e - 1);
+    const diyfp m_minus = lower_boundary_is_closer
+                          ? diyfp(4 * v.f - 1, v.e - 2)  // (B)
+                          : diyfp(2 * v.f - 1, v.e - 1); // (A)
+
+    // Determine the normalized w+ = m+.
+    const diyfp w_plus = diyfp::normalize(m_plus);
+
+    // Determine w- = m- such that e_(w-) = e_(w+).
+    const diyfp w_minus = diyfp::normalize_to(m_minus, w_plus.e);
+
+    return {diyfp::normalize(v), w_minus, w_plus};
+}
+
+// Given normalized diyfp w, Grisu needs to find a (normalized) cached
+// power-of-ten c, such that the exponent of the product c * w = f * 2^e lies
+// within a certain range [alpha, gamma] (Definition 3.2 from [1])
+//
+//      alpha <= e = e_c + e_w + q <= gamma
+//
+// or
+//
+//      f_c * f_w * 2^alpha <= f_c 2^(e_c) * f_w 2^(e_w) * 2^q
+//                          <= f_c * f_w * 2^gamma
+//
+// Since c and w are normalized, i.e. 2^(q-1) <= f < 2^q, this implies
+//
+//      2^(q-1) * 2^(q-1) * 2^alpha <= c * w * 2^q < 2^q * 2^q * 2^gamma
+//
+// or
+//
+//      2^(q - 2 + alpha) <= c * w < 2^(q + gamma)
+//
+// The choice of (alpha,gamma) determines the size of the table and the form of
+// the digit generation procedure. Using (alpha,gamma)=(-60,-32) works out well
+// in practice:
+//
+// The idea is to cut the number c * w = f * 2^e into two parts, which can be
+// processed independently: An integral part p1, and a fractional part p2:
+//
+//      f * 2^e = ( (f div 2^-e) * 2^-e + (f mod 2^-e) ) * 2^e
+//              = (f div 2^-e) + (f mod 2^-e) * 2^e
+//              = p1 + p2 * 2^e
+//
+// The conversion of p1 into decimal form requires a series of divisions and
+// modulos by (a power of) 10. These operations are faster for 32-bit than for
+// 64-bit integers, so p1 should ideally fit into a 32-bit integer. This can be
+// achieved by choosing
+//
+//      -e >= 32   or   e <= -32 := gamma
+//
+// In order to convert the fractional part
+//
+//      p2 * 2^e = p2 / 2^-e = d[-1] / 10^1 + d[-2] / 10^2 + ...
+//
+// into decimal form, the fraction is repeatedly multiplied by 10 and the digits
+// d[-i] are extracted in order:
+//
+//      (10 * p2) div 2^-e = d[-1]
+//      (10 * p2) mod 2^-e = d[-2] / 10^1 + ...
+//
+// The multiplication by 10 must not overflow. It is sufficient to choose
+//
+//      10 * p2 < 16 * p2 = 2^4 * p2 <= 2^64.
+//
+// Since p2 = f mod 2^-e < 2^-e,
+//
+//      -e <= 60   or   e >= -60 := alpha
+
+constexpr int kAlpha = -60; // alpha: smallest binary exponent allowed for the scaled product c * w
+constexpr int kGamma = -32; // gamma: largest binary exponent allowed for the scaled product c * w
+
+struct cached_power // c = f * 2^e ~= 10^k
+{
+    std::uint64_t f; // significand of the cached power
+    int e;           // binary exponent
+    int k;           // decimal exponent
+};
+
+/*!
+For a normalized diyfp w = f * 2^e, this function returns a (normalized) cached
+power-of-ten c = f_c * 2^e_c, such that the exponent of the product w * c
+satisfies (Definition 3.2 from [1])
+
+     alpha <= e_c + e + q <= gamma.
+*/
+inline cached_power get_cached_power_for_binary_exponent(int e)
+{
+    // Now
+    //
+    //      alpha <= e_c + e + q <= gamma                                    (1)
+    //      ==> f_c * 2^alpha <= c * 2^e * 2^q
+    //
+    // and since the c's are normalized, 2^(q-1) <= f_c,
+    //
+    //      ==> 2^(q - 1 + alpha) <= c * 2^(e + q)
+    //      ==> 2^(alpha - e - 1) <= c
+    //
+    // If c were an exact power of ten, i.e. c = 10^k, one may determine k as
+    //
+    //      k = ceil( log_10( 2^(alpha - e - 1) ) )
+    //        = ceil( (alpha - e - 1) * log_10(2) )
+    //
+    // From the paper:
+    // "In theory the result of the procedure could be wrong since c is rounded,
+    //  and the computation itself is approximated [...]. In practice, however,
+    //  this simple function is sufficient."
+    //
+    // For IEEE double precision floating-point numbers converted into
+    // normalized diyfp's w = f * 2^e, with q = 64,
+    //
+    //      e >= -1022      (min IEEE exponent)
+    //           -52        (p - 1)
+    //           -52        (p - 1, possibly normalize denormal IEEE numbers)
+    //           -11        (normalize the diyfp)
+    //         = -1137
+    //
+    // and
+    //
+    //      e <= +1023      (max IEEE exponent)
+    //           -52        (p - 1)
+    //           -11        (normalize the diyfp)
+    //         = 960
+    //
+    // This binary exponent range [-1137,960] results in a decimal exponent
+    // range [-307,324]. One does not need to store a cached power for each
+    // k in this range. For each such k it suffices to find a cached power
+    // such that the exponent of the product lies in [alpha,gamma].
+    // This implies that the difference of the decimal exponents of adjacent
+    // table entries must be less than or equal to
+    //
+    //      floor( (gamma - alpha) * log_10(2) ) = 8.
+    //
+    // (A smaller distance gamma-alpha would require a larger table.)
+
+    // NB:
+    // Actually this function returns c, such that -60 <= e_c + e + 64 <= -34.
+
+    constexpr int kCachedPowersMinDecExp = -300; // decimal exponent k of the first table entry
+    constexpr int kCachedPowersDecStep = 8;      // difference in k between adjacent table entries
+
+    static constexpr std::array<cached_power, 79> kCachedPowers =
+    {
+        {
+            { 0xAB70FE17C79AC6CA, -1060, -300 },
+            { 0xFF77B1FCBEBCDC4F, -1034, -292 },
+            { 0xBE5691EF416BD60C, -1007, -284 },
+            { 0x8DD01FAD907FFC3C,  -980, -276 },
+            { 0xD3515C2831559A83,  -954, -268 },
+            { 0x9D71AC8FADA6C9B5,  -927, -260 },
+            { 0xEA9C227723EE8BCB,  -901, -252 },
+            { 0xAECC49914078536D,  -874, -244 },
+            { 0x823C12795DB6CE57,  -847, -236 },
+            { 0xC21094364DFB5637,  -821, -228 },
+            { 0x9096EA6F3848984F,  -794, -220 },
+            { 0xD77485CB25823AC7,  -768, -212 },
+            { 0xA086CFCD97BF97F4,  -741, -204 },
+            { 0xEF340A98172AACE5,  -715, -196 },
+            { 0xB23867FB2A35B28E,  -688, -188 },
+            { 0x84C8D4DFD2C63F3B,  -661, -180 },
+            { 0xC5DD44271AD3CDBA,  -635, -172 },
+            { 0x936B9FCEBB25C996,  -608, -164 },
+            { 0xDBAC6C247D62A584,  -582, -156 },
+            { 0xA3AB66580D5FDAF6,  -555, -148 },
+            { 0xF3E2F893DEC3F126,  -529, -140 },
+            { 0xB5B5ADA8AAFF80B8,  -502, -132 },
+            { 0x87625F056C7C4A8B,  -475, -124 },
+            { 0xC9BCFF6034C13053,  -449, -116 },
+            { 0x964E858C91BA2655,  -422, -108 },
+            { 0xDFF9772470297EBD,  -396, -100 },
+            { 0xA6DFBD9FB8E5B88F,  -369,  -92 },
+            { 0xF8A95FCF88747D94,  -343,  -84 },
+            { 0xB94470938FA89BCF,  -316,  -76 },
+            { 0x8A08F0F8BF0F156B,  -289,  -68 },
+            { 0xCDB02555653131B6,  -263,  -60 },
+            { 0x993FE2C6D07B7FAC,  -236,  -52 },
+            { 0xE45C10C42A2B3B06,  -210,  -44 },
+            { 0xAA242499697392D3,  -183,  -36 },
+            { 0xFD87B5F28300CA0E,  -157,  -28 },
+            { 0xBCE5086492111AEB,  -130,  -20 },
+            { 0x8CBCCC096F5088CC,  -103,  -12 },
+            { 0xD1B71758E219652C,   -77,   -4 },
+            { 0x9C40000000000000,   -50,    4 },
+            { 0xE8D4A51000000000,   -24,   12 },
+            { 0xAD78EBC5AC620000,     3,   20 },
+            { 0x813F3978F8940984,    30,   28 },
+            { 0xC097CE7BC90715B3,    56,   36 },
+            { 0x8F7E32CE7BEA5C70,    83,   44 },
+            { 0xD5D238A4ABE98068,   109,   52 },
+            { 0x9F4F2726179A2245,   136,   60 },
+            { 0xED63A231D4C4FB27,   162,   68 },
+            { 0xB0DE65388CC8ADA8,   189,   76 },
+            { 0x83C7088E1AAB65DB,   216,   84 },
+            { 0xC45D1DF942711D9A,   242,   92 },
+            { 0x924D692CA61BE758,   269,  100 },
+            { 0xDA01EE641A708DEA,   295,  108 },
+            { 0xA26DA3999AEF774A,   322,  116 },
+            { 0xF209787BB47D6B85,   348,  124 },
+            { 0xB454E4A179DD1877,   375,  132 },
+            { 0x865B86925B9BC5C2,   402,  140 },
+            { 0xC83553C5C8965D3D,   428,  148 },
+            { 0x952AB45CFA97A0B3,   455,  156 },
+            { 0xDE469FBD99A05FE3,   481,  164 },
+            { 0xA59BC234DB398C25,   508,  172 },
+            { 0xF6C69A72A3989F5C,   534,  180 },
+            { 0xB7DCBF5354E9BECE,   561,  188 },
+            { 0x88FCF317F22241E2,   588,  196 },
+            { 0xCC20CE9BD35C78A5,   614,  204 },
+            { 0x98165AF37B2153DF,   641,  212 },
+            { 0xE2A0B5DC971F303A,   667,  220 },
+            { 0xA8D9D1535CE3B396,   694,  228 },
+            { 0xFB9B7CD9A4A7443C,   720,  236 },
+            { 0xBB764C4CA7A44410,   747,  244 },
+            { 0x8BAB8EEFB6409C1A,   774,  252 },
+            { 0xD01FEF10A657842C,   800,  260 },
+            { 0x9B10A4E5E9913129,   827,  268 },
+            { 0xE7109BFBA19C0C9D,   853,  276 },
+            { 0xAC2820D9623BF429,   880,  284 },
+            { 0x80444B5E7AA7CF85,   907,  292 },
+            { 0xBF21E44003ACDD2D,   933,  300 },
+            { 0x8E679C2F5E44FF8F,   960,  308 },
+            { 0xD433179D9C8CB841,   986,  316 },
+            { 0x9E19DB92B4E31BA9,  1013,  324 },
+        }
+    };
+
+    // This computation gives exactly the same results for k as
+    //      k = ceil((kAlpha - e - 1) * 0.30102999566398114)
+    // for |e| <= 1500, but doesn't require floating-point operations.
+    // NB: log_10(2) ~= 78913 / 2^18
+    JSON_ASSERT(e >= -1500);
+    JSON_ASSERT(e <=  1500);
+    const int f = kAlpha - e - 1;
+    const int k = (f * 78913) / (1 << 18) + static_cast<int>(f > 0); // integer division truncates; add 1 for positive f to round up
+
+    const int index = (-kCachedPowersMinDecExp + k + (kCachedPowersDecStep - 1)) / kCachedPowersDecStep; // offset of k from the first entry, in table steps (rounded up)
+    JSON_ASSERT(index >= 0);
+    JSON_ASSERT(static_cast<std::size_t>(index) < kCachedPowers.size());
+
+    const cached_power cached = kCachedPowers[static_cast<std::size_t>(index)];
+    JSON_ASSERT(kAlpha <= cached.e + e + 64); // the product exponent must land in [alpha, gamma]
+    JSON_ASSERT(kGamma >= cached.e + e + 64);
+
+    return cached;
+}
+
+/*!
+For n != 0, returns k, such that pow10 := 10^(k-1) <= n < 10^k.
+For n == 0, returns 1 and sets pow10 := 1.
+*/
+inline int find_largest_pow10(const std::uint32_t n, std::uint32_t& pow10)
+{
+    // Walk down the powers of ten from 10^9 and stop at the first one that does
+    // not exceed n. Every branch returns, so no else-chaining is necessary.
+    // The fall-through at the bottom covers n < 10 (including n == 0).
+    // LCOV_EXCL_START
+    if (n >= 1000000000u)
+    {
+        pow10 = 1000000000u;
+        return 10;
+    }
+    // LCOV_EXCL_STOP
+    if (n >= 100000000u)
+    {
+        pow10 = 100000000u;
+        return 9;
+    }
+    if (n >= 10000000u)
+    {
+        pow10 = 10000000u;
+        return 8;
+    }
+    if (n >= 1000000u)
+    {
+        pow10 = 1000000u;
+        return 7;
+    }
+    if (n >= 100000u)
+    {
+        pow10 = 100000u;
+        return 6;
+    }
+    if (n >= 10000u)
+    {
+        pow10 = 10000u;
+        return 5;
+    }
+    if (n >= 1000u)
+    {
+        pow10 = 1000u;
+        return 4;
+    }
+    if (n >= 100u)
+    {
+        pow10 = 100u;
+        return 3;
+    }
+    if (n >= 10u)
+    {
+        pow10 = 10u;
+        return 2;
+    }
+    pow10 = 1u;
+    return 1;
+}
+
+inline void grisu2_round(char* buf, int len, std::uint64_t dist, std::uint64_t delta,
+                         std::uint64_t rest, std::uint64_t ten_k)
+{
+    JSON_ASSERT(len >= 1);
+    JSON_ASSERT(dist <= delta);
+    JSON_ASSERT(rest <= delta);
+    JSON_ASSERT(ten_k > 0);
+
+    //               <--------------------------- delta ---->
+    //                                  <---- dist --------->
+    // --------------[------------------+-------------------]--------------
+    //               M-                 w                   M+
+    //
+    //                                  ten_k
+    //                                <------>
+    //                                       <---- rest ---->
+    // --------------[------------------+----+--------------]--------------
+    //                                  w    V
+    //                                       = buf * 10^k
+    //
+    // ten_k represents a unit-in-the-last-place in the decimal representation
+    // stored in buf.
+    // Decrement buf by ten_k while this takes buf closer to w.
+
+    // The tests are written in this order to avoid overflow in unsigned
+    // integer arithmetic.
+
+    while (rest < dist // V is still above w
+            && delta - rest >= ten_k // V - ten_k does not fall below M-
+            && (rest + ten_k < dist || dist - rest > rest + ten_k - dist)) // V - ten_k is still above w, or at least closer to w than V
+    {
+        JSON_ASSERT(buf[len - 1] != '0');
+        buf[len - 1]--; // lower the last digit by one
+        rest += ten_k;  // V moved ten_k further away from M+
+    }
+}
+
+/*!
+Generates V = buffer * 10^decimal_exponent, such that M- <= V <= M+.
+M- and M+ must be normalized and share the same exponent -60 <= e <= -32.
+*/
+inline void grisu2_digit_gen(char* buffer, int& length, int& decimal_exponent,
+                             diyfp M_minus, diyfp w, diyfp M_plus)
+{
+    static_assert(kAlpha >= -60, "internal error");
+    static_assert(kGamma <= -32, "internal error");
+
+    // Generates the digits (and the exponent) of a decimal floating-point
+    // number V = buffer * 10^decimal_exponent in the range [M-, M+]. The diyfp's
+    // w, M- and M+ share the same exponent e, which satisfies alpha <= e <= gamma.
+    //
+    //               <--------------------------- delta ---->
+    //                                  <---- dist --------->
+    // --------------[------------------+-------------------]--------------
+    //               M-                 w                   M+
+    //
+    // Grisu2 generates the digits of M+ from left to right and stops as soon as
+    // V is in [M-,M+].
+
+    JSON_ASSERT(M_plus.e >= kAlpha);
+    JSON_ASSERT(M_plus.e <= kGamma);
+
+    std::uint64_t delta = diyfp::sub(M_plus, M_minus).f; // (significand of (M+ - M-), implicit exponent is e)
+    std::uint64_t dist  = diyfp::sub(M_plus, w      ).f; // (significand of (M+ - w ), implicit exponent is e)
+
+    // Split M+ = f * 2^e into two parts p1 and p2 (note: e < 0):
+    //
+    //      M+ = f * 2^e
+    //         = ((f div 2^-e) * 2^-e + (f mod 2^-e)) * 2^e
+    //         = ((p1        ) * 2^-e + (p2        )) * 2^e
+    //         = p1 + p2 * 2^e
+
+    const diyfp one(std::uint64_t{1} << -M_plus.e, M_plus.e); // the value 1 expressed with exponent e, i.e. one.f = 2^-e
+
+    auto p1 = static_cast<std::uint32_t>(M_plus.f >> -one.e); // p1 = f div 2^-e (Since -e >= 32, p1 fits into a 32-bit int.)
+    std::uint64_t p2 = M_plus.f & (one.f - 1);                    // p2 = f mod 2^-e
+
+    // 1)
+    //
+    // Generate the digits of the integral part p1 = d[n-1]...d[1]d[0]
+
+    JSON_ASSERT(p1 > 0);
+
+    std::uint32_t pow10; // assigned by find_largest_pow10 on the next line
+    const int k = find_largest_pow10(p1, pow10);
+
+    //      10^(k-1) <= p1 < 10^k, pow10 = 10^(k-1)
+    //
+    //      p1 = (p1 div 10^(k-1)) * 10^(k-1) + (p1 mod 10^(k-1))
+    //         = (d[k-1]         ) * 10^(k-1) + (p1 mod 10^(k-1))
+    //
+    //      M+ = p1                                             + p2 * 2^e
+    //         = d[k-1] * 10^(k-1) + (p1 mod 10^(k-1))          + p2 * 2^e
+    //         = d[k-1] * 10^(k-1) + ((p1 mod 10^(k-1)) * 2^-e + p2) * 2^e
+    //         = d[k-1] * 10^(k-1) + (                         rest) * 2^e
+    //
+    // Now generate the digits d[n] of p1 from left to right (n = k-1,...,0)
+    //
+    //      p1 = d[k-1]...d[n] * 10^n + d[n-1]...d[0]
+    //
+    // but stop as soon as
+    //
+    //      rest * 2^e = (d[n-1]...d[0] * 2^-e + p2) * 2^e <= delta * 2^e
+
+    int n = k; // number of integral digits still to be generated
+    while (n > 0)
+    {
+        // Invariants:
+        //      M+ = buffer * 10^n + (p1 + p2 * 2^e)    (buffer = 0 for n = k)
+        //      pow10 = 10^(n-1) <= p1 < 10^n
+        //
+        const std::uint32_t d = p1 / pow10;  // d = p1 div 10^(n-1)
+        const std::uint32_t r = p1 % pow10;  // r = p1 mod 10^(n-1)
+        //
+        //      M+ = buffer * 10^n + (d * 10^(n-1) + r) + p2 * 2^e
+        //         = (buffer * 10 + d) * 10^(n-1) + (r + p2 * 2^e)
+        //
+        JSON_ASSERT(d <= 9);
+        buffer[length++] = static_cast<char>('0' + d); // buffer := buffer * 10 + d
+        //
+        //      M+ = buffer * 10^(n-1) + (r + p2 * 2^e)
+        //
+        p1 = r;
+        n--;
+        //
+        //      M+ = buffer * 10^n + (p1 + p2 * 2^e)
+        //      pow10 = 10^n
+        //
+
+        // Now check if enough digits have been generated.
+        // Compute
+        //
+        //      p1 + p2 * 2^e = (p1 * 2^-e + p2) * 2^e = rest * 2^e
+        //
+        // Note:
+        // Since rest and delta share the same exponent e, it suffices to
+        // compare the significands.
+        const std::uint64_t rest = (std::uint64_t{p1} << -one.e) + p2;
+        if (rest <= delta)
+        {
+            // V = buffer * 10^n, with M- <= V <= M+.
+
+            decimal_exponent += n;
+
+            // We may now just stop. But instead look if the buffer could be
+            // decremented to bring V closer to w.
+            //
+            // pow10 = 10^n is now 1 ulp in the decimal representation V.
+            // The rounding procedure works with diyfp's with an implicit
+            // exponent of e.
+            //
+            //      10^n = (10^n * 2^-e) * 2^e = ulp * 2^e
+            //
+            const std::uint64_t ten_n = std::uint64_t{pow10} << -one.e;
+            grisu2_round(buffer, length, dist, delta, rest, ten_n);
+
+            return;
+        }
+
+        pow10 /= 10;
+        //
+        //      pow10 = 10^(n-1) <= p1 < 10^n
+        // Invariants restored.
+    }
+
+    // 2)
+    //
+    // The digits of the integral part have been generated:
+    //
+    //      M+ = d[k-1]...d[1]d[0] + p2 * 2^e
+    //         = buffer            + p2 * 2^e
+    //
+    // Now generate the digits of the fractional part p2 * 2^e.
+    //
+    // Note:
+    // No decimal point is generated: the exponent is adjusted instead.
+    //
+    // p2 actually represents the fraction
+    //
+    //      p2 * 2^e
+    //          = p2 / 2^-e
+    //          = d[-1] / 10^1 + d[-2] / 10^2 + ...
+    //
+    // Now generate the digits d[-m] of p2 from left to right (m = 1,2,...)
+    //
+    //      p2 * 2^e = d[-1]d[-2]...d[-m] * 10^-m
+    //                      + 10^-m * (d[-m-1] / 10^1 + d[-m-2] / 10^2 + ...)
+    //
+    // using
+    //
+    //      10^m * p2 = ((10^m * p2) div 2^-e) * 2^-e + ((10^m * p2) mod 2^-e)
+    //                = (                   d) * 2^-e + (                   r)
+    //
+    // or
+    //      10^m * p2 * 2^e = d + r * 2^e
+    //
+    // i.e.
+    //
+    //      M+ = buffer + p2 * 2^e
+    //         = buffer + 10^-m * (d + r * 2^e)
+    //         = (buffer * 10^m + d) * 10^-m + 10^-m * r * 2^e
+    //
+    // and stop as soon as 10^-m * r * 2^e <= delta * 2^e
+
+    JSON_ASSERT(p2 > delta);
+
+    int m = 0; // number of fractional digits generated so far
+    for (;;)
+    {
+        // Invariant:
+        //      M+ = buffer * 10^-m + 10^-m * (d[-m-1] / 10 + d[-m-2] / 10^2 + ...) * 2^e
+        //         = buffer * 10^-m + 10^-m * (p2                                 ) * 2^e
+        //         = buffer * 10^-m + 10^-m * (1/10 * (10 * p2)                   ) * 2^e
+        //         = buffer * 10^-m + 10^-m * (1/10 * ((10*p2 div 2^-e) * 2^-e + (10*p2 mod 2^-e)) * 2^e
+        //
+        JSON_ASSERT(p2 <= (std::numeric_limits<std::uint64_t>::max)() / 10);
+        p2 *= 10;
+        const std::uint64_t d = p2 >> -one.e;     // d = (10 * p2) div 2^-e
+        const std::uint64_t r = p2 & (one.f - 1); // r = (10 * p2) mod 2^-e
+        //
+        //      M+ = buffer * 10^-m + 10^-m * (1/10 * (d * 2^-e + r) * 2^e
+        //         = buffer * 10^-m + 10^-m * (1/10 * (d + r * 2^e))
+        //         = (buffer * 10 + d) * 10^(-m-1) + 10^(-m-1) * r * 2^e
+        //
+        JSON_ASSERT(d <= 9);
+        buffer[length++] = static_cast<char>('0' + d); // buffer := buffer * 10 + d
+        //
+        //      M+ = buffer * 10^(-m-1) + 10^(-m-1) * r * 2^e
+        //
+        p2 = r;
+        m++;
+        //
+        //      M+ = buffer * 10^-m + 10^-m * p2 * 2^e
+        // Invariant restored.
+
+        // Check if enough digits have been generated.
+        //
+        //      10^-m * p2 * 2^e <= delta * 2^e
+        //              p2 * 2^e <= 10^m * delta * 2^e
+        //                    p2 <= 10^m * delta
+        delta *= 10; // delta and dist are scaled along with p2 so all three stay comparable
+        dist  *= 10;
+        if (p2 <= delta)
+        {
+            break;
+        }
+    }
+
+    // V = buffer * 10^-m, with M- <= V <= M+.
+
+    decimal_exponent -= m;
+
+    // 1 ulp in the decimal representation is now 10^-m.
+    // Since delta and dist are now scaled by 10^m, we need to do the
+    // same with ulp in order to keep the units in sync.
+    //
+    //      10^m * 10^-m = 1 = 2^-e * 2^e = ten_m * 2^e
+    //
+    const std::uint64_t ten_m = one.f;
+    grisu2_round(buffer, length, dist, delta, p2, ten_m);
+
+    // By construction this algorithm generates the shortest possible decimal
+    // number (Loitsch, Theorem 6.2) which rounds back to w.
+    // For an input number of precision p, at least
+    //
+    //      N = 1 + ceil(p * log_10(2))
+    //
+    // decimal digits are sufficient to identify all binary floating-point
+    // numbers (Matula, "In-and-Out conversions").
+    // This implies that the algorithm does not produce more than N decimal
+    // digits.
+    //
+    //      N = 17 for p = 53 (IEEE double precision)
+    //      N = 9  for p = 24 (IEEE single precision)
+}
+
+/*!
+v = buf * 10^decimal_exponent
+len is the length of the buffer (number of decimal digits)
+The buffer must be large enough, i.e. >= max_digits10.
+*/
+JSON_HEDLEY_NON_NULL(1)
+inline void grisu2(char* buf, int& len, int& decimal_exponent,
+                   diyfp m_minus, diyfp v, diyfp m_plus)
+{
+    JSON_ASSERT(m_plus.e == m_minus.e); // all three inputs must share one binary exponent
+    JSON_ASSERT(m_plus.e == v.e);
+
+    //  --------(-----------------------+-----------------------)--------    (A)
+    //          m-                      v                       m+
+    //
+    //  --------------------(-----------+-----------------------)--------    (B)
+    //                      m-          v                       m+
+    //
+    // First scale v (and m- and m+) such that the exponent is in the range
+    // [alpha, gamma].
+
+    const cached_power cached = get_cached_power_for_binary_exponent(m_plus.e);
+
+    const diyfp c_minus_k(cached.f, cached.e); // = c ~= 10^-k
+
+    // The exponent of the products is = v.e + c_minus_k.e + q and is in the range [alpha,gamma]
+    const diyfp w       = diyfp::mul(v,       c_minus_k);
+    const diyfp w_minus = diyfp::mul(m_minus, c_minus_k);
+    const diyfp w_plus  = diyfp::mul(m_plus,  c_minus_k);
+
+    //  ----(---+---)---------------(---+---)---------------(---+---)----
+    //          w-                      w                       w+
+    //          = c*m-                  = c*v                   = c*m+
+    //
+    // diyfp::mul rounds its result and c_minus_k is approximated too. w, w- and
+    // w+ are now off by a small amount.
+    // In fact:
+    //
+    //      w - v * 10^k < 1 ulp
+    //
+    // To account for this inaccuracy, add resp. subtract 1 ulp.
+    //
+    //  --------+---[---------------(---+---)---------------]---+--------
+    //          w-  M-                  w                   M+  w+
+    //
+    // Now any number in [M-, M+] (bounds included) will round to w when input,
+    // regardless of how the input rounding algorithm breaks ties.
+    //
+    // And digit_gen generates the shortest possible such number in [M-, M+].
+    // Note that this does not mean that Grisu2 always generates the shortest
+    // possible number in the interval (m-, m+).
+    const diyfp M_minus(w_minus.f + 1, w_minus.e); // lower bound moved up by 1 ulp
+    const diyfp M_plus (w_plus.f  - 1, w_plus.e ); // upper bound moved down by 1 ulp
+
+    decimal_exponent = -cached.k; // = -(-k) = k
+
+    grisu2_digit_gen(buf, len, decimal_exponent, M_minus, w, M_plus);
+}
+
+/*!
+Generates decimal digits for value such that value = buf * 10^decimal_exponent.
+len is the length of the buffer contents (number of decimal digits).
+The buffer must be large enough, i.e. hold at least max_digits10 characters.
+*/
+template<typename FloatType>
+JSON_HEDLEY_NON_NULL(1)
+void grisu2(char* buf, int& len, int& decimal_exponent, FloatType value)
+{
+    static_assert(diyfp::kPrecision >= std::numeric_limits<FloatType>::digits + 3,
+                  "internal error: not enough precision");
+
+    JSON_ASSERT(std::isfinite(value));
+    JSON_ASSERT(value > 0); // precondition: the sign and zero must be handled by the caller
+
+    // If the neighbors (and boundaries) of 'value' are always computed for double-precision
+    // numbers, all floats can be recovered using strtod (and strtof). However, the resulting
+    // decimal representations are not exactly "short".
+    //
+    // The documentation for 'std::to_chars' (https://en.cppreference.com/w/cpp/utility/to_chars)
+    // says "value is converted to a string as if by std::sprintf in the default ("C") locale"
+    // and since sprintf promotes floats to doubles, I think this is exactly what 'std::to_chars'
+    // does.
+    // On the other hand, the documentation for 'std::to_chars' requires that "parsing the
+    // representation using the corresponding std::from_chars function recovers value exactly". That
+    // indicates that single precision floating-point numbers should be recovered using
+    // 'std::strtof'.
+    //
+    // NB: If the neighbors are computed for single-precision numbers, there is a single float
+    //     (7.0385307e-26f) which can't be recovered using strtod. The resulting double precision
+    //     value is off by 1 ulp.
+#if 0 // disabled alternative: always compute the boundaries in double precision
+    const boundaries w = compute_boundaries(static_cast<double>(value));
+#else // active: compute the boundaries at the precision of FloatType
+    const boundaries w = compute_boundaries(value);
+#endif
+
+    grisu2(buf, len, decimal_exponent, w.minus, w.w, w.plus);
+}
+
+/*!
+@brief appends a sign and a decimal representation (at least two digits) of e to buf
+@return a pointer to the element following the exponent.
+@pre -1000 < e < 1000
+*/
+JSON_HEDLEY_NON_NULL(1)
+JSON_HEDLEY_RETURNS_NON_NULL
+inline char* append_exponent(char* buf, int e)
+{
+    JSON_ASSERT(e > -1000);
+    JSON_ASSERT(e <  1000);
+
+    if (e < 0)
+    {
+        e = -e; // continue with the magnitude; the sign is written explicitly
+        *buf++ = '-';
+    }
+    else
+    {
+        *buf++ = '+'; // a sign is always written, also for non-negative exponents
+    }
+
+    auto k = static_cast<std::uint32_t>(e); // e is non-negative at this point
+    if (k < 10)
+    {
+        // Always print at least two digits in the exponent.
+        // This is for compatibility with printf("%g").
+        *buf++ = '0';
+        *buf++ = static_cast<char>('0' + k);
+    }
+    else if (k < 100)
+    {
+        *buf++ = static_cast<char>('0' + k / 10); // tens
+        k %= 10;
+        *buf++ = static_cast<char>('0' + k); // ones
+    }
+    else
+    {
+        *buf++ = static_cast<char>('0' + k / 100); // hundreds (k < 1000 by precondition)
+        k %= 100;
+        *buf++ = static_cast<char>('0' + k / 10); // tens
+        k %= 10;
+        *buf++ = static_cast<char>('0' + k); // ones
+    }
+
+    return buf;
+}
+
+/*!
+@brief prettify v = buf * 10^decimal_exponent
+
+If v is in the range [10^min_exp, 10^max_exp) it will be printed in fixed-point
+notation. Otherwise it will be printed in exponential notation.
+@return a pointer past the last character written to buf
+@pre min_exp < 0
+@pre max_exp > 0
+*/
+JSON_HEDLEY_NON_NULL(1)
+JSON_HEDLEY_RETURNS_NON_NULL
+inline char* format_buffer(char* buf, int len, int decimal_exponent,
+                           int min_exp, int max_exp)
+{
+    JSON_ASSERT(min_exp < 0);
+    JSON_ASSERT(max_exp > 0);
+
+    const int k = len;
+    const int n = len + decimal_exponent;
+
+    // v = buf * 10^(n-k)
+    // k is the length of the buffer (number of decimal digits)
+    // n is the position of the decimal point relative to the start of the buffer.
+
+    if (k <= n && n <= max_exp)
+    {
+        // digits[000]
+        // len <= max_exp + 2
+
+        std::memset(buf + k, '0', static_cast<size_t>(n) - static_cast<size_t>(k)); // pad with n - k zeros
+        // Make it look like a floating-point number (#362, #378)
+        buf[n + 0] = '.';
+        buf[n + 1] = '0';
+        return buf + (static_cast<size_t>(n) + 2);
+    }
+
+    if (0 < n && n <= max_exp)
+    {
+        // dig.its
+        // len <= max_digits10 + 1
+
+        JSON_ASSERT(k > n);
+
+        std::memmove(buf + (static_cast<size_t>(n) + 1), buf + n, static_cast<size_t>(k) - static_cast<size_t>(n)); // shift the fraction right by one
+        buf[n] = '.'; // the decimal point goes into the gap
+        return buf + (static_cast<size_t>(k) + 1U);
+    }
+
+    if (min_exp < n && n <= 0)
+    {
+        // 0.[000]digits
+        // len <= 2 + (-min_exp - 1) + max_digits10
+
+        std::memmove(buf + (2 + static_cast<size_t>(-n)), buf, static_cast<size_t>(k)); // make room for "0." and -n zeros
+        buf[0] = '0';
+        buf[1] = '.';
+        std::memset(buf + 2, '0', static_cast<size_t>(-n)); // the -n leading zeros of the fraction
+        return buf + (2U + static_cast<size_t>(-n) + static_cast<size_t>(k));
+    }
+
+    if (k == 1)
+    {
+        // dE+123
+        // len <= 1 + 5
+
+        buf += 1; // single digit: no decimal point is inserted
+    }
+    else
+    {
+        // d.igitsE+123
+        // len <= max_digits10 + 1 + 5
+
+        std::memmove(buf + 2, buf + 1, static_cast<size_t>(k) - 1); // shift all digits but the first right by one
+        buf[1] = '.';
+        buf += 1 + static_cast<size_t>(k);
+    }
+
+    *buf++ = 'e';
+    return append_exponent(buf, n - 1); // with one digit before the point, the exponent is n - 1
+}
+
+} // namespace dtoa_impl
+
+/*!
+@brief generates a decimal representation of the floating-point number value in [first, last).
+
+The format of the resulting decimal representation is similar to printf's %g
+format. Returns a pointer past the end of the decimal representation.
+
+@note The input number must be finite, i.e. NaNs and Infs are not supported.
+@note The buffer must be large enough; its size is only checked by assertions.
+@note The result is NOT null-terminated.
+*/
+template<typename FloatType>
+JSON_HEDLEY_NON_NULL(1, 2)
+JSON_HEDLEY_RETURNS_NON_NULL
+char* to_chars(char* first, const char* last, FloatType value)
+{
+    static_cast<void>(last); // last is only read inside JSON_ASSERT below - avoid unused warning
+    JSON_ASSERT(std::isfinite(value));
+
+    // Use signbit(value) instead of (value < 0) since signbit works for -0.
+    if (std::signbit(value))
+    {
+        value = -value;
+        *first++ = '-';
+    }
+
+    if (value == 0) // +-0
+    {
+        *first++ = '0';
+        // Make it look like a floating-point number (#362, #378)
+        *first++ = '.';
+        *first++ = '0';
+        return first;
+    }
+
+    JSON_ASSERT(last - first >= std::numeric_limits<FloatType>::max_digits10);
+
+    // Compute v = buffer * 10^decimal_exponent.
+    // The decimal digits are stored in the buffer, which needs to be interpreted
+    // as an unsigned decimal integer.
+    // len is the length of the buffer, i.e. the number of decimal digits.
+    int len = 0;
+    int decimal_exponent = 0;
+    dtoa_impl::grisu2(first, len, decimal_exponent, value);
+
+    JSON_ASSERT(len <= std::numeric_limits<FloatType>::max_digits10);
+
+    // Format the buffer like printf("%.*g", prec, value)
+    constexpr int kMinExp = -4;
+    // Use digits10 here to increase compatibility with version 2.
+    constexpr int kMaxExp = std::numeric_limits<FloatType>::digits10;
+
+    JSON_ASSERT(last - first >= kMaxExp + 2); // room for format_buffer's "digits[000].0" layout
+    JSON_ASSERT(last - first >= 2 + (-kMinExp - 1) + std::numeric_limits<FloatType>::max_digits10); // "0.[000]digits"
+    JSON_ASSERT(last - first >= std::numeric_limits<FloatType>::max_digits10 + 6); // "d.igitsE+123"
+
+    return dtoa_impl::format_buffer(first, len, decimal_exponent, kMinExp, kMaxExp);
+}
+
+} // namespace detail
+} // namespace nlohmann
+
+// #include <nlohmann/detail/exceptions.hpp>
+
+// #include <nlohmann/detail/macro_scope.hpp>
+
+// #include <nlohmann/detail/meta/cpp_future.hpp>
+
+// #include <nlohmann/detail/output/binary_writer.hpp>
+
+// #include <nlohmann/detail/output/output_adapters.hpp>
+
+// #include <nlohmann/detail/value_t.hpp>
+
+
+namespace nlohmann
+{
+namespace detail
+{
+///////////////////
+// serialization //
+///////////////////
+
+/// how the serializer treats invalid UTF-8 encountered while dumping strings
+enum class error_handler_t
+{
+    strict,  ///< throw a type_error exception in case of invalid UTF-8
+    replace, ///< replace invalid UTF-8 sequences with U+FFFD
+    ignore   ///< ignore invalid UTF-8 sequences
+};
+
+template<typename BasicJsonType>
+class serializer
+{
+    using string_t = typename BasicJsonType::string_t;
+    using number_float_t = typename BasicJsonType::number_float_t;
+    using number_integer_t = typename BasicJsonType::number_integer_t;
+    using number_unsigned_t = typename BasicJsonType::number_unsigned_t;
+    using binary_char_t = typename BasicJsonType::binary_t::value_type;
+    static constexpr std::uint8_t UTF8_ACCEPT = 0;
+    static constexpr std::uint8_t UTF8_REJECT = 1;
+
+  public:
+    /*!
+    @param[in] s  output stream to serialize to
+    @param[in] ichar  indentation character to use (pre-fills the 512-character indent_string)
+    @param[in] error_handler_  how to react to decoding errors (default: strict)
+    */
+    serializer(output_adapter_t<char> s, const char ichar,
+               error_handler_t error_handler_ = error_handler_t::strict)
+        : o(std::move(s))
+        , loc(std::localeconv()) // current C locale's formatting info; source of the two separators below
+        , thousands_sep(loc->thousands_sep == nullptr ? '\0' : std::char_traits<char>::to_char_type(* (loc->thousands_sep)))
+        , decimal_point(loc->decimal_point == nullptr ? '\0' : std::char_traits<char>::to_char_type(* (loc->decimal_point)))
+        , indent_char(ichar)
+        , indent_string(512, indent_char) // initial indentation buffer: 512 copies of indent_char
+        , error_handler(error_handler_)
+    {}
+
+    // neither copyable nor movable because of pointer members
+    serializer(const serializer&) = delete;
+    serializer& operator=(const serializer&) = delete;
+    serializer(serializer&&) = delete;
+    serializer& operator=(serializer&&) = delete;
+    ~serializer() = default;
+
+    /*!
+    @brief internal implementation of the serialization function
+
+    This function is called by the public member function dump and organizes
+    the serialization internally. The indentation level is propagated as
+    additional parameter. In case of arrays and objects, the function is
+    called recursively.
+
+    - strings and object keys are escaped using `dump_escaped()`
+    - integer numbers are written by `dump_integer()`
+    - floating-point numbers are written by `dump_float()`
+    - binary values are serialized as objects containing the subtype and the
+      byte array
+
+    @param[in] val               value to serialize
+    @param[in] pretty_print      whether the output shall be pretty-printed
+    @param[in] ensure_ascii If @a ensure_ascii is true, all non-ASCII characters
+    in the output are escaped with `\uXXXX` sequences, and the result consists
+    of ASCII characters only.
+    @param[in] indent_step       number of indent characters per nesting level
+    @param[in] current_indent    the current indent level (only used internally)
+    */
+    void dump(const BasicJsonType& val,
+              const bool pretty_print,
+              const bool ensure_ascii,
+              const unsigned int indent_step,
+              const unsigned int current_indent = 0)
+    {
+        switch (val.m_type)
+        {
+            case value_t::object:
+            {
+                if (val.m_value.object->empty())
+                {
+                    o->write_characters("{}", 2);
+                    return;
+                }
+
+                if (pretty_print)
+                {
+                    o->write_characters("{\n", 2);
+
+                    // indentation for recursive calls; grow the buffer with indent_char until it is long enough
+                    const auto new_indent = current_indent + indent_step;
+                    while (JSON_HEDLEY_UNLIKELY(indent_string.size() < new_indent))
+                    {
+                        indent_string.resize(indent_string.size() * 2, indent_char);
+                    }
+
+                    // first n-1 elements
+                    auto i = val.m_value.object->cbegin();
+                    for (std::size_t cnt = 0; cnt < val.m_value.object->size() - 1; ++cnt, ++i)
+                    {
+                        o->write_characters(indent_string.c_str(), new_indent);
+                        o->write_character('\"');
+                        dump_escaped(i->first, ensure_ascii);
+                        o->write_characters("\": ", 3);
+                        dump(i->second, true, ensure_ascii, indent_step, new_indent);
+                        o->write_characters(",\n", 2);
+                    }
+
+                    // last element
+                    JSON_ASSERT(i != val.m_value.object->cend());
+                    JSON_ASSERT(std::next(i) == val.m_value.object->cend());
+                    o->write_characters(indent_string.c_str(), new_indent);
+                    o->write_character('\"');
+                    dump_escaped(i->first, ensure_ascii);
+                    o->write_characters("\": ", 3);
+                    dump(i->second, true, ensure_ascii, indent_step, new_indent);
+
+                    o->write_character('\n');
+                    o->write_characters(indent_string.c_str(), current_indent);
+                    o->write_character('}');
+                }
+                else
+                {
+                    o->write_character('{');
+
+                    // first n-1 elements
+                    auto i = val.m_value.object->cbegin();
+                    for (std::size_t cnt = 0; cnt < val.m_value.object->size() - 1; ++cnt, ++i)
+                    {
+                        o->write_character('\"');
+                        dump_escaped(i->first, ensure_ascii);
+                        o->write_characters("\":", 2);
+                        dump(i->second, false, ensure_ascii, indent_step, current_indent);
+                        o->write_character(',');
+                    }
+
+                    // last element
+                    JSON_ASSERT(i != val.m_value.object->cend());
+                    JSON_ASSERT(std::next(i) == val.m_value.object->cend());
+                    o->write_character('\"');
+                    dump_escaped(i->first, ensure_ascii);
+                    o->write_characters("\":", 2);
+                    dump(i->second, false, ensure_ascii, indent_step, current_indent);
+
+                    o->write_character('}');
+                }
+
+                return;
+            }
+
+            case value_t::array:
+            {
+                if (val.m_value.array->empty())
+                {
+                    o->write_characters("[]", 2);
+                    return;
+                }
+
+                if (pretty_print)
+                {
+                    o->write_characters("[\n", 2);
+
+                    // indentation for recursive calls; grow the buffer with indent_char until it is long enough
+                    const auto new_indent = current_indent + indent_step;
+                    while (JSON_HEDLEY_UNLIKELY(indent_string.size() < new_indent))
+                    {
+                        indent_string.resize(indent_string.size() * 2, indent_char);
+                    }
+
+                    // first n-1 elements
+                    for (auto i = val.m_value.array->cbegin();
+                            i != val.m_value.array->cend() - 1; ++i)
+                    {
+                        o->write_characters(indent_string.c_str(), new_indent);
+                        dump(*i, true, ensure_ascii, indent_step, new_indent);
+                        o->write_characters(",\n", 2);
+                    }
+
+                    // last element
+                    JSON_ASSERT(!val.m_value.array->empty());
+                    o->write_characters(indent_string.c_str(), new_indent);
+                    dump(val.m_value.array->back(), true, ensure_ascii, indent_step, new_indent);
+
+                    o->write_character('\n');
+                    o->write_characters(indent_string.c_str(), current_indent);
+                    o->write_character(']');
+                }
+                else
+                {
+                    o->write_character('[');
+
+                    // first n-1 elements
+                    for (auto i = val.m_value.array->cbegin();
+                            i != val.m_value.array->cend() - 1; ++i)
+                    {
+                        dump(*i, false, ensure_ascii, indent_step, current_indent);
+                        o->write_character(',');
+                    }
+
+                    // last element
+                    JSON_ASSERT(!val.m_value.array->empty());
+                    dump(val.m_value.array->back(), false, ensure_ascii, indent_step, current_indent);
+
+                    o->write_character(']');
+                }
+
+                return;
+            }
+
+            case value_t::string:
+            {
+                o->write_character('\"');
+                dump_escaped(*val.m_value.string, ensure_ascii);
+                o->write_character('\"');
+                return;
+            }
+
+            case value_t::binary:
+            {
+                if (pretty_print)
+                {
+                    o->write_characters("{\n", 2);
+
+                    // indentation for the two members; grow the buffer with indent_char until it is long enough
+                    const auto new_indent = current_indent + indent_step;
+                    while (JSON_HEDLEY_UNLIKELY(indent_string.size() < new_indent))
+                    {
+                        indent_string.resize(indent_string.size() * 2, indent_char);
+                    }
+
+                    o->write_characters(indent_string.c_str(), new_indent);
+
+                    o->write_characters("\"bytes\": [", 10);
+
+                    if (!val.m_value.binary->empty())
+                    {
+                        for (auto i = val.m_value.binary->cbegin();
+                                i != val.m_value.binary->cend() - 1; ++i)
+                        {
+                            dump_integer(*i);
+                            o->write_characters(", ", 2);
+                        }
+                        dump_integer(val.m_value.binary->back());
+                    }
+
+                    o->write_characters("],\n", 3);
+                    o->write_characters(indent_string.c_str(), new_indent);
+
+                    o->write_characters("\"subtype\": ", 11);
+                    if (val.m_value.binary->has_subtype())
+                    {
+                        dump_integer(val.m_value.binary->subtype());
+                    }
+                    else
+                    {
+                        o->write_characters("null", 4);
+                    }
+                    o->write_character('\n');
+                    o->write_characters(indent_string.c_str(), current_indent);
+                    o->write_character('}');
+                }
+                else
+                {
+                    o->write_characters("{\"bytes\":[", 10);
+
+                    if (!val.m_value.binary->empty())
+                    {
+                        for (auto i = val.m_value.binary->cbegin();
+                                i != val.m_value.binary->cend() - 1; ++i)
+                        {
+                            dump_integer(*i);
+                            o->write_character(',');
+                        }
+                        dump_integer(val.m_value.binary->back());
+                    }
+
+                    o->write_characters("],\"subtype\":", 12);
+                    if (val.m_value.binary->has_subtype())
+                    {
+                        dump_integer(val.m_value.binary->subtype());
+                        o->write_character('}');
+                    }
+                    else
+                    {
+                        o->write_characters("null}", 5);
+                    }
+                }
+                return;
+            }
+
+            case value_t::boolean:
+            {
+                if (val.m_value.boolean)
+                {
+                    o->write_characters("true", 4);
+                }
+                else
+                {
+                    o->write_characters("false", 5);
+                }
+                return;
+            }
+
+            case value_t::number_integer:
+            {
+                dump_integer(val.m_value.number_integer);
+                return;
+            }
+
+            case value_t::number_unsigned:
+            {
+                dump_integer(val.m_value.number_unsigned);
+                return;
+            }
+
+            case value_t::number_float:
+            {
+                dump_float(val.m_value.number_float);
+                return;
+            }
+
+            case value_t::discarded:
+            {
+                o->write_characters("<discarded>", 11);
+                return;
+            }
+
+            case value_t::null:
+            {
+                o->write_characters("null", 4);
+                return;
+            }
+
+            default:            // LCOV_EXCL_LINE
+                JSON_ASSERT(false);  // LCOV_EXCL_LINE
+        }
+    }
+
+  private:
+    /*!
+    @brief dump escaped string
+
+    Escape a string by replacing certain special characters by a sequence of an
+    escape character (backslash) and another character and other control
+    characters by a sequence of "\u" followed by a four-digit hex
+    representation. The escaped string is written to output stream @a o.
+
+    @param[in] s  the string to escape
+    @param[in] ensure_ascii  whether to escape non-ASCII characters with
+                             \uXXXX sequences
+
+    @complexity Linear in the length of string @a s.
+    */
+    void dump_escaped(const string_t& s, const bool ensure_ascii)
+    {
+        std::uint32_t codepoint; // handed to decode() together with state
+        std::uint8_t state = UTF8_ACCEPT;
+        std::size_t bytes = 0;  // number of bytes written to string_buffer
+
+        // number of bytes written at the point of the last valid byte
+        std::size_t bytes_after_last_accept = 0;
+        std::size_t undumped_chars = 0; // bytes of a not yet completed multi-byte sequence
+
+        for (std::size_t i = 0; i < s.size(); ++i)
+        {
+            const auto byte = static_cast<uint8_t>(s[i]);
+
+            switch (decode(state, codepoint, byte))
+            {
+                case UTF8_ACCEPT:  // decode found a new code point
+                {
+                    switch (codepoint)
+                    {
+                        case 0x08: // backspace
+                        {
+                            string_buffer[bytes++] = '\\';
+                            string_buffer[bytes++] = 'b';
+                            break;
+                        }
+
+                        case 0x09: // horizontal tab
+                        {
+                            string_buffer[bytes++] = '\\';
+                            string_buffer[bytes++] = 't';
+                            break;
+                        }
+
+                        case 0x0A: // newline
+                        {
+                            string_buffer[bytes++] = '\\';
+                            string_buffer[bytes++] = 'n';
+                            break;
+                        }
+
+                        case 0x0C: // formfeed
+                        {
+                            string_buffer[bytes++] = '\\';
+                            string_buffer[bytes++] = 'f';
+                            break;
+                        }
+
+                        case 0x0D: // carriage return
+                        {
+                            string_buffer[bytes++] = '\\';
+                            string_buffer[bytes++] = 'r';
+                            break;
+                        }
+
+                        case 0x22: // quotation mark
+                        {
+                            string_buffer[bytes++] = '\\';
+                            string_buffer[bytes++] = '\"';
+                            break;
+                        }
+
+                        case 0x5C: // reverse solidus
+                        {
+                            string_buffer[bytes++] = '\\';
+                            string_buffer[bytes++] = '\\';
+                            break;
+                        }
+
+                        default:
+                        {
+                            // escape control characters (0x00..0x1F) or, if
+                            // ensure_ascii parameter is used, non-ASCII characters
+                            if ((codepoint <= 0x1F) || (ensure_ascii && (codepoint >= 0x7F)))
+                            {
+                                if (codepoint <= 0xFFFF)
+                                {
+                                    (std::snprintf)(string_buffer.data() + bytes, 7, "\\u%04x",
+                                                    static_cast<std::uint16_t>(codepoint));
+                                    bytes += 6; // the terminating NUL written by snprintf is not counted
+                                }
+                                else
+                                {
+                                    (std::snprintf)(string_buffer.data() + bytes, 13, "\\u%04x\\u%04x",
+                                                    static_cast<std::uint16_t>(0xD7C0u + (codepoint >> 10u)), // high surrogate: 0xD800 + ((codepoint - 0x10000) >> 10)
+                                                    static_cast<std::uint16_t>(0xDC00u + (codepoint & 0x3FFu))); // low surrogate
+                                    bytes += 12; // the terminating NUL written by snprintf is not counted
+                                }
+                            }
+                            else
+                            {
+                                // copy byte to buffer (all previous bytes of this code point
+                                // were already copied by the outer switch's default case)
+                                string_buffer[bytes++] = s[i];
+                            }
+                            break;
+                        }
+                    }
+
+                    // write buffer and reset index; there must be 13 bytes
+                    // left, as this is the maximal number of bytes to be
+                    // written ("\uxxxx\uxxxx\0") for one code point
+                    if (string_buffer.size() - bytes < 13)
+                    {
+                        o->write_characters(string_buffer.data(), bytes);
+                        bytes = 0;
+                    }
+
+                    // remember the byte position of this accept
+                    bytes_after_last_accept = bytes;
+                    undumped_chars = 0;
+                    break;
+                }
+
+                case UTF8_REJECT:  // decode found invalid UTF-8 byte
+                {
+                    switch (error_handler)
+                    {
+                        case error_handler_t::strict:
+                        {
+                            std::string sn(3, '\0');
+                            (std::snprintf)(&sn[0], sn.size(), "%.2X", byte);
+                            JSON_THROW(type_error::create(316, "invalid UTF-8 byte at index " + std::to_string(i) + ": 0x" + sn));
+                        }
+
+                        case error_handler_t::ignore:
+                        case error_handler_t::replace:
+                        {
+                            // in case we saw this character the first time, we
+                            // would like to read it again, because the byte
+                            // may be OK for itself, but just not OK for the
+                            // previous sequence
+                            if (undumped_chars > 0)
+                            {
+                                --i;
+                            }
+
+                            // reset length buffer to the last accepted index;
+                            // thus removing/ignoring the invalid characters
+                            bytes = bytes_after_last_accept;
+
+                            if (error_handler == error_handler_t::replace)
+                            {
+                                // add a replacement character
+                                if (ensure_ascii)
+                                {
+                                    string_buffer[bytes++] = '\\';
+                                    string_buffer[bytes++] = 'u';
+                                    string_buffer[bytes++] = 'f';
+                                    string_buffer[bytes++] = 'f';
+                                    string_buffer[bytes++] = 'f';
+                                    string_buffer[bytes++] = 'd';
+                                }
+                                else
+                                {
+                                    string_buffer[bytes++] = detail::binary_writer<BasicJsonType, char>::to_char_type('\xEF');
+                                    string_buffer[bytes++] = detail::binary_writer<BasicJsonType, char>::to_char_type('\xBF');
+                                    string_buffer[bytes++] = detail::binary_writer<BasicJsonType, char>::to_char_type('\xBD');
+                                }
+
+                                // write buffer and reset index; there must be 13 bytes
+                                // left, as this is the maximal number of bytes to be
+                                // written ("\uxxxx\uxxxx\0") for one code point
+                                if (string_buffer.size() - bytes < 13)
+                                {
+                                    o->write_characters(string_buffer.data(), bytes);
+                                    bytes = 0;
+                                }
+
+                                bytes_after_last_accept = bytes;
+                            }
+
+                            undumped_chars = 0;
+
+                            // continue processing the string
+                            state = UTF8_ACCEPT;
+                            break;
+                        }
+
+                        default:            // LCOV_EXCL_LINE
+                            JSON_ASSERT(false);  // LCOV_EXCL_LINE
+                    }
+                    break;
+                }
+
+                default:  // decode found yet incomplete multi-byte code point
+                {
+                    if (!ensure_ascii)
+                    {
+                        // code point will not be escaped - copy byte to buffer
+                        string_buffer[bytes++] = s[i];
+                    }
+                    ++undumped_chars;
+                    break;
+                }
+            }
+        }
+
+        // we finished processing the string
+        if (JSON_HEDLEY_LIKELY(state == UTF8_ACCEPT))
+        {
+            // write buffer
+            if (bytes > 0)
+            {
+                o->write_characters(string_buffer.data(), bytes);
+            }
+        }
+        else
+        {
+            // we finished reading, but did not accept: string was incomplete
+            switch (error_handler)
+            {
+                case error_handler_t::strict:
+                {
+                    std::string sn(3, '\0');
+                    (std::snprintf)(&sn[0], sn.size(), "%.2X", static_cast<std::uint8_t>(s.back()));
+                    JSON_THROW(type_error::create(316, "incomplete UTF-8 string; last byte: 0x" + sn));
+                }
+
+                case error_handler_t::ignore:
+                {
+                    // write all accepted bytes
+                    o->write_characters(string_buffer.data(), bytes_after_last_accept);
+                    break;
+                }
+
+                case error_handler_t::replace:
+                {
+                    // write all accepted bytes
+                    o->write_characters(string_buffer.data(), bytes_after_last_accept);
+                    // add a replacement character
+                    if (ensure_ascii)
+                    {
+                        o->write_characters("\\ufffd", 6);
+                    }
+                    else
+                    {
+                        o->write_characters("\xEF\xBF\xBD", 3);
+                    }
+                    break;
+                }
+
+                default:            // LCOV_EXCL_LINE
+                    JSON_ASSERT(false);  // LCOV_EXCL_LINE
+            }
+        }
+    }
+
+    /*!
+    @brief count the decimal digits of an unsigned integer
+
+    Determines how many decimal (base 10) digits are needed to print @a x.
+    The value is reduced by four digits per iteration, so at most one division
+    is executed for every four digits of the input.
+
+    @param[in] x  unsigned integer number to count its digits
+    @return    number of decimal digits (1 for the value 0)
+    */
+    inline unsigned int count_digits(number_unsigned_t x) noexcept
+    {
+        unsigned int counted = 1;
+
+        // strip complete groups of four digits
+        while (x >= 10000)
+        {
+            x = x / 10000u;
+            counted += 4;
+        }
+
+        // at most four digits remain; one of them is already part of "counted"
+        if (x >= 1000)
+        {
+            return counted + 3;
+        }
+        if (x >= 100)
+        {
+            return counted + 2;
+        }
+        if (x >= 10)
+        {
+            return counted + 1;
+        }
+        return counted;
+    }
+
+    /*!
+    @brief dump an integer
+
+    Writes the decimal representation of @a x to the output @a o. The digits
+    are produced right-to-left inside @a number_buffer, two at a time via a
+    lookup table, so the text never has to be reversed.
+
+    @param[in] x  integer number (signed or unsigned) to dump
+    @tparam NumberType either @a number_integer_t, @a number_unsigned_t, or
+            @a binary_char_t
+    */
+    template < typename NumberType, detail::enable_if_t <
+                   std::is_same<NumberType, number_unsigned_t>::value ||
+                   std::is_same<NumberType, number_integer_t>::value ||
+                   std::is_same<NumberType, binary_char_t>::value,
+                   int > = 0 >
+    void dump_integer(NumberType x)
+    {
+        // textual form of every value in [0, 99], indexed by the value itself
+        static constexpr std::array<std::array<char, 2>, 100> digits_to_99
+        {
+            {
+                {{'0', '0'}}, {{'0', '1'}}, {{'0', '2'}}, {{'0', '3'}}, {{'0', '4'}}, {{'0', '5'}}, {{'0', '6'}}, {{'0', '7'}}, {{'0', '8'}}, {{'0', '9'}},
+                {{'1', '0'}}, {{'1', '1'}}, {{'1', '2'}}, {{'1', '3'}}, {{'1', '4'}}, {{'1', '5'}}, {{'1', '6'}}, {{'1', '7'}}, {{'1', '8'}}, {{'1', '9'}},
+                {{'2', '0'}}, {{'2', '1'}}, {{'2', '2'}}, {{'2', '3'}}, {{'2', '4'}}, {{'2', '5'}}, {{'2', '6'}}, {{'2', '7'}}, {{'2', '8'}}, {{'2', '9'}},
+                {{'3', '0'}}, {{'3', '1'}}, {{'3', '2'}}, {{'3', '3'}}, {{'3', '4'}}, {{'3', '5'}}, {{'3', '6'}}, {{'3', '7'}}, {{'3', '8'}}, {{'3', '9'}},
+                {{'4', '0'}}, {{'4', '1'}}, {{'4', '2'}}, {{'4', '3'}}, {{'4', '4'}}, {{'4', '5'}}, {{'4', '6'}}, {{'4', '7'}}, {{'4', '8'}}, {{'4', '9'}},
+                {{'5', '0'}}, {{'5', '1'}}, {{'5', '2'}}, {{'5', '3'}}, {{'5', '4'}}, {{'5', '5'}}, {{'5', '6'}}, {{'5', '7'}}, {{'5', '8'}}, {{'5', '9'}},
+                {{'6', '0'}}, {{'6', '1'}}, {{'6', '2'}}, {{'6', '3'}}, {{'6', '4'}}, {{'6', '5'}}, {{'6', '6'}}, {{'6', '7'}}, {{'6', '8'}}, {{'6', '9'}},
+                {{'7', '0'}}, {{'7', '1'}}, {{'7', '2'}}, {{'7', '3'}}, {{'7', '4'}}, {{'7', '5'}}, {{'7', '6'}}, {{'7', '7'}}, {{'7', '8'}}, {{'7', '9'}},
+                {{'8', '0'}}, {{'8', '1'}}, {{'8', '2'}}, {{'8', '3'}}, {{'8', '4'}}, {{'8', '5'}}, {{'8', '6'}}, {{'8', '7'}}, {{'8', '8'}}, {{'8', '9'}},
+                {{'9', '0'}}, {{'9', '1'}}, {{'9', '2'}}, {{'9', '3'}}, {{'9', '4'}}, {{'9', '5'}}, {{'9', '6'}}, {{'9', '7'}}, {{'9', '8'}}, {{'9', '9'}},
+            }
+        };
+
+        // zero is written directly and never touches the buffer
+        if (x == 0)
+        {
+            o->write_character('0');
+            return;
+        }
+
+        // only number_integer_t can carry a sign; "!(x >= 0)" instead of
+        // "x < 0" is deliberate, see issue #755
+        const bool negative = std::is_same<NumberType, number_integer_t>::value && !(x >= 0);
+
+        // magnitude of the number as an unsigned value
+        number_unsigned_t magnitude = negative
+                                      ? remove_sign(static_cast<number_integer_t>(x))
+                                      : static_cast<number_unsigned_t>(x);
+
+        // total characters to emit: the digits plus one for a minus sign
+        const unsigned int length = negative
+                                    ? 1 + count_digits(magnitude)
+                                    : count_digits(magnitude);
+
+        // cursor into the buffer; starts at the front for the sign
+        auto cursor = number_buffer.begin();
+        if (negative)
+        {
+            *cursor = '-';
+        }
+
+        // spare 1 byte for '\0'
+        JSON_ASSERT(length < number_buffer.size() - 1);
+
+        // move behind the last character and fill the buffer backwards
+        cursor += length;
+
+        // Fast int2ascii implementation inspired by "Fastware" talk by Andrei Alexandrescu
+        // See: https://www.youtube.com/watch?v=o4-CwDo2zpg
+        while (magnitude >= 100)
+        {
+            const auto pair_index = static_cast<unsigned>((magnitude % 100));
+            magnitude /= 100;
+            *(--cursor) = digits_to_99[pair_index][1];
+            *(--cursor) = digits_to_99[pair_index][0];
+        }
+
+        // one or two leading digits are left
+        if (magnitude < 10)
+        {
+            *(--cursor) = static_cast<char>('0' + magnitude);
+        }
+        else
+        {
+            const auto pair_index = static_cast<unsigned>(magnitude);
+            *(--cursor) = digits_to_99[pair_index][1];
+            *(--cursor) = digits_to_99[pair_index][0];
+        }
+
+        o->write_characters(number_buffer.data(), length);
+    }
+
+    /*!
+    @brief dump a floating-point number
+
+    Writes @a x to the output @a o. Values that are not finite (NaN or
+    infinity) are written as the literal `null`. All other values are handed
+    to one of the two tag-dispatched overloads, selected at compile time by
+    whether @a number_float_t is an IEEE-754 single or double precision type.
+
+    @param[in] x  floating-point number to dump
+    */
+    void dump_float(number_float_t x)
+    {
+        using float_limits = std::numeric_limits<number_float_t>;
+
+        // If number_float_t is an IEEE-754 single or double precision number,
+        // use the Grisu2 algorithm to produce short numbers which are
+        // guaranteed to round-trip, using strtof and strtod, resp.
+        //
+        // NB: The test below works if <long double> == <double>.
+        static constexpr bool is_ieee_single
+            = float_limits::is_iec559 && float_limits::digits == 24 && float_limits::max_exponent == 128;
+        static constexpr bool is_ieee_double
+            = float_limits::is_iec559 && float_limits::digits == 53 && float_limits::max_exponent == 1024;
+
+        using is_ieee_single_or_double = std::integral_constant < bool, is_ieee_single || is_ieee_double >;
+
+        if (std::isfinite(x))
+        {
+            dump_float(x, is_ieee_single_or_double());
+            return;
+        }
+
+        // NaN / inf
+        o->write_characters("null", 4);
+    }
+
+    /// overload for IEEE-754 single/double precision: format @a x with
+    /// ::nlohmann::detail::to_chars into @a number_buffer and write the result
+    void dump_float(number_float_t x, std::true_type /*is_ieee_single_or_double*/)
+    {
+        char* const first = number_buffer.data();
+        char* const buffer_end = first + number_buffer.size();
+
+        // to_chars returns a pointer behind the last written character
+        char* const last = ::nlohmann::detail::to_chars(first, buffer_end, x);
+
+        o->write_characters(first, static_cast<size_t>(last - first));
+    }
+
+    /// overload for all other floating-point types: format @a x with snprintf
+    /// ("%.*g"), undo locale-specific formatting, write the result, and append
+    /// ".0" if the text contains neither '.' nor 'e'
+    void dump_float(number_float_t x, std::false_type /*is_ieee_single_or_double*/)
+    {
+        // significant digits needed for a float -> text -> float round-trip
+        static constexpr auto precision = std::numeric_limits<number_float_t>::max_digits10;
+
+        // the actual conversion
+        std::ptrdiff_t written = (std::snprintf)(number_buffer.data(), number_buffer.size(), "%.*g", precision, x);
+
+        // negative value indicates an error
+        JSON_ASSERT(written > 0);
+        // check if buffer was large enough
+        JSON_ASSERT(static_cast<std::size_t>(written) < number_buffer.size());
+
+        // erase thousands separator
+        if (thousands_sep != '\0')
+        {
+            const auto survivors_end = std::remove(number_buffer.begin(),
+                                                   number_buffer.begin() + written, thousands_sep);
+            // blank everything behind the compacted text
+            std::fill(survivors_end, number_buffer.end(), '\0');
+            const auto remaining = survivors_end - number_buffer.begin();
+            JSON_ASSERT(remaining <= written);
+            written = remaining;
+        }
+
+        // convert decimal point to '.'
+        const bool has_foreign_decimal_point = decimal_point != '\0' && decimal_point != '.';
+        if (has_foreign_decimal_point)
+        {
+            const auto hit = std::find(number_buffer.begin(), number_buffer.end(), decimal_point);
+            if (hit != number_buffer.end())
+            {
+                *hit = '.';
+            }
+        }
+
+        o->write_characters(number_buffer.data(), static_cast<std::size_t>(written));
+
+        // determine if need to append ".0"; the scanned range also covers the
+        // character directly behind the text
+        const auto scan_end = number_buffer.begin() + written + 1;
+        const bool value_is_int_like = std::none_of(number_buffer.begin(), scan_end, [](char c)
+        {
+            return c == '.' || c == 'e';
+        });
+
+        if (value_is_int_like)
+        {
+            o->write_characters(".0", 2);
+        }
+    }
+
+    /*!
+    @brief check whether a string is UTF-8 encoded
+
+    The function consumes one byte of a string and advances a table-driven
+    state machine. The result of the check is stored in the @a state
+    parameter. The function must be called initially with state 0 (accept).
+    State 1 means the string must be rejected, because the current byte is
+    not allowed. If the string is completely processed, but the state is
+    non-zero, the string ended prematurely; that is, the last byte indicated
+    more bytes should have followed.
+
+    @param[in,out] state  the state of the decoding
+    @param[in,out] codep  codepoint (valid only if resulting state is UTF8_ACCEPT)
+    @param[in] byte       next byte to decode
+    @return               new state
+
+    @note The function has been edited: a std::array is used.
+
+    @copyright Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
+    @sa http://bjoern.hoehrmann.de/utf-8/decoder/dfa/
+    */
+    static std::uint8_t decode(std::uint8_t& state, std::uint32_t& codep, const std::uint8_t byte) noexcept
+    {
+        // entries 0..255 map a byte to its character class; the entries from
+        // 256 on hold the transitions, 16 per state
+        static const std::array<std::uint8_t, 400> utf8d =
+        {
+            {
+                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 00..1F
+                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 20..3F
+                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 40..5F
+                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 60..7F
+                1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, // 80..9F
+                7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // A0..BF
+                8, 8, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // C0..DF
+                0xA, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x4, 0x3, 0x3, // E0..EF
+                0xB, 0x6, 0x6, 0x6, 0x5, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, // F0..FF
+                0x0, 0x1, 0x2, 0x3, 0x5, 0x8, 0x7, 0x1, 0x1, 0x1, 0x4, 0x6, 0x1, 0x1, 0x1, 0x1, // s0..s0
+                1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, // s1..s2
+                1, 2, 1, 1, 1, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, // s3..s4
+                1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 3, 1, 1, 1, 1, 1, 1, // s5..s6
+                1, 3, 1, 1, 1, 1, 1, 3, 1, 3, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 // s7..s8
+            }
+        };
+
+        // character class of the incoming byte
+        const std::uint8_t char_class = utf8d[byte];
+
+        if (state == UTF8_ACCEPT)
+        {
+            // first byte of a sequence: keep only the bits left after
+            // shifting the mask by the character class
+            codep = (0xFFu >> char_class) & (byte);
+        }
+        else
+        {
+            // later byte of a sequence: append its low six bits
+            codep = (byte & 0x3fu) | (codep << 6u);
+        }
+
+        // look up the follow-up state in the transition part of the table
+        const std::size_t transition = 256u + static_cast<size_t>(state) * 16u + static_cast<size_t>(char_class);
+        JSON_ASSERT(transition < 400);
+        state = utf8d[transition];
+        return state;
+    }
+
+    /*
+     * Overload to make the compiler happy while it is instantiating
+     * dump_integer for number_unsigned_t.
+     * Must never be called: the body asserts unconditionally. If assertions
+     * are disabled, the argument is handed back unchanged.
+     *
+     * @param[in] x  unsigned value
+     * @return @a x
+     */
+    number_unsigned_t remove_sign(number_unsigned_t x)
+    {
+        JSON_ASSERT(false); // LCOV_EXCL_LINE
+        return x; // LCOV_EXCL_LINE
+    }
+
+    /*
+     * Helper function for dump_integer
+     *
+     * Takes a negative signed integer and returns its absolute value as an
+     * unsigned integer. The sign cannot simply be flipped, because the
+     * absolute values of INT_MIN and INT_MAX are usually not the same.
+     * Instead, the value is first moved one step towards zero, negated, and
+     * then the missing step is added back after the conversion to the
+     * unsigned type. See #1708 for details.
+     */
+    inline number_unsigned_t remove_sign(number_integer_t x) noexcept
+    {
+        JSON_ASSERT(x < 0 && x < (std::numeric_limits<number_integer_t>::max)());
+
+        // one step towards zero, so that the negation below is representable
+        const auto one_closer_to_zero = x + 1;
+        const auto flipped = static_cast<number_unsigned_t>(-one_closer_to_zero);
+        return flipped + 1;
+    }
+
+  private:
+    /// the output of the serializer
+    output_adapter_t<char> o = nullptr;
+
+    /// a (hopefully) large enough character buffer
+    std::array<char, 64> number_buffer{{}};
+
+    /// the locale
+    const std::lconv* loc = nullptr;
+    /// the locale's thousand separator character
+    const char thousands_sep = '\0';
+    /// the locale's decimal point character
+    const char decimal_point = '\0';
+
+    /// string buffer
+    std::array<char, 512> string_buffer{{}};
+
+    /// the indentation character
+    const char indent_char;
+    /// the indentation string
+    string_t indent_string;
+
+    /// error_handler how to react on decoding errors
+    const error_handler_t error_handler;
+};
+}  // namespace detail
+}  // namespace nlohmann
+
+// #include <nlohmann/detail/value_t.hpp>
+
+// #include <nlohmann/json_fwd.hpp>
+
+// #include <nlohmann/ordered_map.hpp>
+
+
+#include <functional> // less
+#include <memory> // allocator
+#include <utility> // pair
+#include <vector> // vector
+
+namespace nlohmann
+{
+
+/// ordered_map: a minimal map-like container that preserves insertion order
+/// for use within nlohmann::basic_json<ordered_map>
+///
+/// The elements live in a std::vector in the order in which they were
+/// inserted. Every key lookup is a linear scan that compares keys with
+/// operator==; the IgnoredLess parameter is never used by this class.
+template <class Key, class T, class IgnoredLess = std::less<Key>,
+          class Allocator = std::allocator<std::pair<const Key, T>>>
+struct ordered_map : std::vector<std::pair<const Key, T>, Allocator>
+{
+    using key_type = Key;
+    using mapped_type = T;
+    using Container = std::vector<std::pair<const Key, T>, Allocator>;
+    using typename Container::iterator;
+    using typename Container::const_iterator;
+    using typename Container::size_type;
+    using typename Container::value_type;
+
+    // Explicit constructors instead of `using Container::Container`
+    // otherwise older compilers choke on it (GCC <= 5.5, xcode <= 9.4)
+    ordered_map(const Allocator& alloc = Allocator()) : Container{alloc} {}
+    template <class It>
+    ordered_map(It first, It last, const Allocator& alloc = Allocator())
+        : Container{first, last, alloc} {}
+    // the list holds key/value pairs (value_type), not mapped values (T);
+    // only that element type can be forwarded to the underlying vector
+    ordered_map(std::initializer_list<value_type> init, const Allocator& alloc = Allocator() )
+        : Container{init, alloc} {}
+
+    /// Append (key, t) unless the key is already present.
+    /// @return iterator to the element with that key, and whether it was
+    ///         newly inserted; @a t is only moved from on insertion
+    std::pair<iterator, bool> emplace(const key_type& key, T&& t)
+    {
+        const auto existing = this->find(key);
+        if (existing != this->end())
+        {
+            return {existing, false};
+        }
+        // t is a named rvalue reference and hence an lvalue here; move it
+        // explicitly so that the value is not copied
+        Container::emplace_back(key, std::move(t));
+        return {--this->end(), true};
+    }
+
+    /// Access the value stored under @a key; a value-initialized T is
+    /// appended first if the key is not present.
+    T& operator[](const Key& key)
+    {
+        return emplace(key, T{}).first->second;
+    }
+
+    /// Read-only access; cannot insert, so a missing key throws like at().
+    const T& operator[](const Key& key) const
+    {
+        return at(key);
+    }
+
+    /// @throw std::out_of_range if @a key is not present
+    T& at(const Key& key)
+    {
+        const auto it = this->find(key);
+        if (it == this->end())
+        {
+            throw std::out_of_range("key not found");
+        }
+        return it->second;
+    }
+
+    /// @throw std::out_of_range if @a key is not present
+    const T& at(const Key& key) const
+    {
+        const auto it = this->find(key);
+        if (it == this->end())
+        {
+            throw std::out_of_range("key not found");
+        }
+        return it->second;
+    }
+
+    /// Remove the element stored under @a key.
+    /// @return number of removed elements (0 or 1)
+    size_type erase(const Key& key)
+    {
+        const auto it = this->find(key);
+        if (it == this->end())
+        {
+            return 0;
+        }
+        this->erase(it);
+        return 1;
+    }
+
+    /// Remove the element at @a pos; all later elements move one slot to the
+    /// front, so the order of the remaining elements is kept.
+    /// @return @a pos, which afterwards refers to the element that followed
+    ///         the removed one (or equals end() if there was none)
+    iterator erase(iterator pos)
+    {
+        auto it = pos;
+
+        // Since we cannot move const Keys, re-construct them in place
+        for (auto next = it; ++next != this->end(); ++it)
+        {
+            it->~value_type(); // Destroy but keep allocation
+            new (&*it) value_type{std::move(*next)};
+        }
+        Container::pop_back();
+        return pos;
+    }
+
+    /// @return 1 if @a key is present, 0 otherwise
+    size_type count(const Key& key) const
+    {
+        return this->find(key) != this->end() ? 1 : 0;
+    }
+
+    /// @return iterator to the first element whose key equals @a key, or
+    ///         end() if there is none
+    iterator find(const Key& key)
+    {
+        for (auto it = this->begin(); it != this->end(); ++it)
+        {
+            if (it->first == key)
+            {
+                return it;
+            }
+        }
+        return Container::end();
+    }
+
+    /// @return iterator to the first element whose key equals @a key, or
+    ///         end() if there is none
+    const_iterator find(const Key& key) const
+    {
+        for (auto it = this->begin(); it != this->end(); ++it)
+        {
+            if (it->first == key)
+            {
+                return it;
+            }
+        }
+        return Container::end();
+    }
+
+    /// Append @a value unless its key is already present; the mapped value
+    /// is moved on insertion.
+    std::pair<iterator, bool> insert( value_type&& value )
+    {
+        return emplace(value.first, std::move(value.second));
+    }
+
+    /// Append a copy of @a value unless its key is already present.
+    std::pair<iterator, bool> insert( const value_type& value )
+    {
+        const auto existing = this->find(value.first);
+        if (existing != this->end())
+        {
+            return {existing, false};
+        }
+        Container::push_back(value);
+        return {--this->end(), true};
+    }
+};
+
+}  // namespace nlohmann
+
+
+/*!
+@brief namespace for Niels Lohmann
+@see https://github.com/nlohmann
+@since version 1.0.0
+*/
+namespace nlohmann
+{
+
+/*!
+@brief a class to store JSON values
+
+@tparam ObjectType type for JSON objects (`std::map` by default; will be used
+in @ref object_t)
+@tparam ArrayType type for JSON arrays (`std::vector` by default; will be used
+in @ref array_t)
+@tparam StringType type for JSON strings and object keys (`std::string` by
+default; will be used in @ref string_t)
+@tparam BooleanType type for JSON booleans (`bool` by default; will be used
+in @ref boolean_t)
+@tparam NumberIntegerType type for JSON integer numbers (`int64_t` by
+default; will be used in @ref number_integer_t)
+@tparam NumberUnsignedType type for JSON unsigned integer numbers (@c
+`uint64_t` by default; will be used in @ref number_unsigned_t)
+@tparam NumberFloatType type for JSON floating-point numbers (`double` by
+default; will be used in @ref number_float_t)
+@tparam BinaryType type for packed binary data for compatibility with binary
+serialization formats (`std::vector<std::uint8_t>` by default; will be used in
+@ref binary_t)
+@tparam AllocatorType type of the allocator to use (`std::allocator` by
+default)
+@tparam JSONSerializer the serializer to resolve internal calls to `to_json()`
+and `from_json()` (@ref adl_serializer by default)
+
+@requirement The class satisfies the following concept requirements:
+- Basic
+ - [DefaultConstructible](https://en.cppreference.com/w/cpp/named_req/DefaultConstructible):
+   JSON values can be default constructed. The result will be a JSON null
+   value.
+ - [MoveConstructible](https://en.cppreference.com/w/cpp/named_req/MoveConstructible):
+   A JSON value can be constructed from an rvalue argument.
+ - [CopyConstructible](https://en.cppreference.com/w/cpp/named_req/CopyConstructible):
+   A JSON value can be copy-constructed from an lvalue expression.
+ - [MoveAssignable](https://en.cppreference.com/w/cpp/named_req/MoveAssignable):
+   A JSON value can be assigned from an rvalue argument.
+ - [CopyAssignable](https://en.cppreference.com/w/cpp/named_req/CopyAssignable):
+   A JSON value can be copy-assigned from an lvalue expression.
+ - [Destructible](https://en.cppreference.com/w/cpp/named_req/Destructible):
+   JSON values can be destructed.
+- Layout
+ - [StandardLayoutType](https://en.cppreference.com/w/cpp/named_req/StandardLayoutType):
+   JSON values have
+   [standard layout](https://en.cppreference.com/w/cpp/language/data_members#Standard_layout):
+   All non-static data members are private and standard layout types, the
+   class has no virtual functions or (virtual) base classes.
+- Library-wide
+ - [EqualityComparable](https://en.cppreference.com/w/cpp/named_req/EqualityComparable):
+   JSON values can be compared with `==`, see @ref
+   operator==(const_reference,const_reference).
+ - [LessThanComparable](https://en.cppreference.com/w/cpp/named_req/LessThanComparable):
+   JSON values can be compared with `<`, see @ref
+   operator<(const_reference,const_reference).
+ - [Swappable](https://en.cppreference.com/w/cpp/named_req/Swappable):
+   Any JSON lvalue or rvalue can be swapped with any lvalue or rvalue of
+   other compatible types, using unqualified function call @ref swap().
+ - [NullablePointer](https://en.cppreference.com/w/cpp/named_req/NullablePointer):
+   JSON values can be compared against `std::nullptr_t` objects which are used
+   to model the `null` value.
+- Container
+ - [Container](https://en.cppreference.com/w/cpp/named_req/Container):
+   JSON values can be used like STL containers and provide iterator access.
+ - [ReversibleContainer](https://en.cppreference.com/w/cpp/named_req/ReversibleContainer):
+   JSON values can be used like STL containers and provide reverse iterator
+   access.
+
+@invariant The member variables @a m_value and @a m_type have the following
+relationship:
+- If `m_type == value_t::object`, then `m_value.object != nullptr`.
+- If `m_type == value_t::array`, then `m_value.array != nullptr`.
+- If `m_type == value_t::string`, then `m_value.string != nullptr`.
+The invariants are checked by member function assert_invariant().
+
+@internal
+@note ObjectType trick from https://stackoverflow.com/a/9860911
+@endinternal
+
+@see [RFC 7159: The JavaScript Object Notation (JSON) Data Interchange
+Format](http://rfc7159.net/rfc7159)
+
+@since version 1.0.0
+
+@nosubgrouping
+*/
+NLOHMANN_BASIC_JSON_TPL_DECLARATION
+class basic_json
+{
+  private:
+    template<detail::value_t> friend struct detail::external_constructor;
+    friend ::nlohmann::json_pointer<basic_json>;
+
+    template<typename BasicJsonType, typename InputType>
+    friend class ::nlohmann::detail::parser;
+    friend ::nlohmann::detail::serializer<basic_json>;
+    template<typename BasicJsonType>
+    friend class ::nlohmann::detail::iter_impl;
+    template<typename BasicJsonType, typename CharType>
+    friend class ::nlohmann::detail::binary_writer;
+    template<typename BasicJsonType, typename InputType, typename SAX>
+    friend class ::nlohmann::detail::binary_reader;
+    template<typename BasicJsonType>
+    friend class ::nlohmann::detail::json_sax_dom_parser;
+    template<typename BasicJsonType>
+    friend class ::nlohmann::detail::json_sax_dom_callback_parser;
+
+    /// workaround type for MSVC
+    using basic_json_t = NLOHMANN_BASIC_JSON_TPL;
+
+    // convenience aliases for types residing in namespace detail;
+    using lexer = ::nlohmann::detail::lexer_base<basic_json>;
+
+    /// factory for a detail::parser that reads from @a adapter; the adapter
+    /// and the callback are moved into the parser, the two flags are passed
+    /// on unchanged
+    template<typename InputAdapterType>
+    static ::nlohmann::detail::parser<basic_json, InputAdapterType> parser(
+        InputAdapterType adapter,
+        detail::parser_callback_t<basic_json>cb = nullptr,
+        const bool allow_exceptions = true,
+        const bool ignore_comments = false
+                                 )
+    {
+        using parser_type = ::nlohmann::detail::parser<basic_json, InputAdapterType>;
+        return parser_type(std::move(adapter), std::move(cb), allow_exceptions, ignore_comments);
+    }
+
+    using primitive_iterator_t = ::nlohmann::detail::primitive_iterator_t;
+    template<typename BasicJsonType>
+    using internal_iterator = ::nlohmann::detail::internal_iterator<BasicJsonType>;
+    template<typename BasicJsonType>
+    using iter_impl = ::nlohmann::detail::iter_impl<BasicJsonType>;
+    template<typename Iterator>
+    using iteration_proxy = ::nlohmann::detail::iteration_proxy<Iterator>;
+    template<typename Base> using json_reverse_iterator = ::nlohmann::detail::json_reverse_iterator<Base>;
+
+    template<typename CharType>
+    using output_adapter_t = ::nlohmann::detail::output_adapter_t<CharType>;
+
+    template<typename InputType>
+    using binary_reader = ::nlohmann::detail::binary_reader<basic_json, InputType>;
+    template<typename CharType> using binary_writer = ::nlohmann::detail::binary_writer<basic_json, CharType>;
+
+    using serializer = ::nlohmann::detail::serializer<basic_json>;
+
+  public:
+    using value_t = detail::value_t;
+    /// JSON Pointer, see @ref nlohmann::json_pointer
+    using json_pointer = ::nlohmann::json_pointer<basic_json>;
+    template<typename T, typename SFINAE>
+    using json_serializer = JSONSerializer<T, SFINAE>;
+    /// how to treat decoding errors
+    using error_handler_t = detail::error_handler_t;
+    /// how to treat CBOR tags
+    using cbor_tag_handler_t = detail::cbor_tag_handler_t;
+    /// helper type for initializer lists of basic_json values
+    using initializer_list_t = std::initializer_list<detail::json_ref<basic_json>>;
+
+    using input_format_t = detail::input_format_t;
+    /// SAX interface type, see @ref nlohmann::json_sax
+    using json_sax_t = json_sax<basic_json>;
+
+    ////////////////
+    // exceptions //
+    ////////////////
+
+    /// @name exceptions
+    /// Classes to implement user-defined exceptions.
+    /// @{
+
+    /// @copydoc detail::exception
+    using exception = detail::exception;
+    /// @copydoc detail::parse_error
+    using parse_error = detail::parse_error;
+    /// @copydoc detail::invalid_iterator
+    using invalid_iterator = detail::invalid_iterator;
+    /// @copydoc detail::type_error
+    using type_error = detail::type_error;
+    /// @copydoc detail::out_of_range
+    using out_of_range = detail::out_of_range;
+    /// @copydoc detail::other_error
+    using other_error = detail::other_error;
+
+    /// @}
+
+
+    /////////////////////
+    // container types //
+    /////////////////////
+
+    /// @name container types
+    /// The canonic container types to use @ref basic_json like any other STL
+    /// container.
+    /// @{
+
+    /// the type of elements in a basic_json container
+    using value_type = basic_json;
+
+    /// the type of an element reference
+    using reference = value_type&;
+    /// the type of an element const reference
+    using const_reference = const value_type&;
+
+    /// a type to represent differences between iterators
+    using difference_type = std::ptrdiff_t;
+    /// a type to represent container sizes
+    using size_type = std::size_t;
+
+    /// the allocator type
+    using allocator_type = AllocatorType<basic_json>;
+
+    /// the type of an element pointer
+    using pointer = typename std::allocator_traits<allocator_type>::pointer;
+    /// the type of an element const pointer
+    using const_pointer = typename std::allocator_traits<allocator_type>::const_pointer;
+
+    /// an iterator for a basic_json container
+    using iterator = iter_impl<basic_json>;
+    /// a const iterator for a basic_json container
+    using const_iterator = iter_impl<const basic_json>;
+    /// a reverse iterator for a basic_json container
+    using reverse_iterator = json_reverse_iterator<typename basic_json::iterator>;
+    /// a const reverse iterator for a basic_json container
+    using const_reverse_iterator = json_reverse_iterator<typename basic_json::const_iterator>;
+
+    /// @}
+
+
+    /*!
+    @brief returns the allocator associated with the container
+
+    The function is static and does not read any state of a JSON value; it
+    creates a fresh allocator object on every call.
+
+    @return a value-initialized object of type @ref allocator_type
+    */
+    static allocator_type get_allocator()
+    {
+        return allocator_type();
+    }
+
+    /*!
+    @brief returns version information on the library
+
+    This function returns a JSON object with information about the library,
+    including the version number and information on the platform and compiler.
+    Platform and compiler are detected at compile time from predefined
+    preprocessor macros.
+
+    @return JSON object holding version information
+    key         | description
+    ----------- | ---------------
+    `compiler`  | Information on the used compiler. It is an object with the following keys: `c++` (the used C++ standard), `family` (the compiler family; possible values are `clang`, `icc`, `gcc`, `hp`, `ilecpp`, `msvc`, `pgcpp`, `sunpro`, and `unknown`), and `version` (the compiler version).
+    `copyright` | The copyright line for the library as string.
+    `name`      | The name of the library as string.
+    `platform`  | The used platform as string. Possible values are `win32`, `linux`, `apple`, `unix`, and `unknown`.
+    `url`       | The URL of the project as string.
+    `version`   | The version of the library. It is an object with the following keys: `major`, `minor`, and `patch` as defined by [Semantic Versioning](http://semver.org), and `string` (the version string).
+
+    @liveexample{The following code shows an example output of the `meta()`
+    function.,meta}
+
+    @exceptionsafety Strong guarantee: if an exception is thrown, there are no
+    changes to any JSON value.
+
+    @complexity Constant.
+
+    @since 2.1.0
+    */
+    JSON_HEDLEY_WARN_UNUSED_RESULT
+    static basic_json meta()
+    {
+        basic_json result;
+
+        result["copyright"] = "(C) 2013-2020 Niels Lohmann";
+        result["name"] = "JSON for Modern C++";
+        result["url"] = "https://github.com/nlohmann/json";
+        result["version"]["string"] =
+            std::to_string(NLOHMANN_JSON_VERSION_MAJOR) + "." +
+            std::to_string(NLOHMANN_JSON_VERSION_MINOR) + "." +
+            std::to_string(NLOHMANN_JSON_VERSION_PATCH);
+        result["version"]["major"] = NLOHMANN_JSON_VERSION_MAJOR;
+        result["version"]["minor"] = NLOHMANN_JSON_VERSION_MINOR;
+        result["version"]["patch"] = NLOHMANN_JSON_VERSION_PATCH;
+
+        // platform: the first matching macro wins
+#ifdef _WIN32
+        result["platform"] = "win32";
+#elif defined __linux__
+        result["platform"] = "linux";
+#elif defined __APPLE__
+        result["platform"] = "apple";
+#elif defined __unix__
+        result["platform"] = "unix";
+#else
+        result["platform"] = "unknown";
+#endif
+
+        // compiler: every branch must store an object, because the "c++" key
+        // is added to result["compiler"] further below
+#if defined(__ICC) || defined(__INTEL_COMPILER)
+        result["compiler"] = {{"family", "icc"}, {"version", __INTEL_COMPILER}};
+#elif defined(__clang__)
+        result["compiler"] = {{"family", "clang"}, {"version", __clang_version__}};
+#elif defined(__GNUC__) || defined(__GNUG__)
+        result["compiler"] = {{"family", "gcc"}, {"version", std::to_string(__GNUC__) + "." + std::to_string(__GNUC_MINOR__) + "." + std::to_string(__GNUC_PATCHLEVEL__)}};
+#elif defined(__HP_cc) || defined(__HP_aCC)
+        result["compiler"] = {{"family", "hp"}, {"version", "unknown"}};
+#elif defined(__IBMCPP__)
+        result["compiler"] = {{"family", "ilecpp"}, {"version", __IBMCPP__}};
+#elif defined(_MSC_VER)
+        result["compiler"] = {{"family", "msvc"}, {"version", _MSC_VER}};
+#elif defined(__PGI)
+        result["compiler"] = {{"family", "pgcpp"}, {"version", __PGI}};
+#elif defined(__SUNPRO_CC)
+        result["compiler"] = {{"family", "sunpro"}, {"version", __SUNPRO_CC}};
+#else
+        result["compiler"] = {{"family", "unknown"}, {"version", "unknown"}};
+#endif
+
+#ifdef __cplusplus
+        result["compiler"]["c++"] = std::to_string(__cplusplus);
+#else
+        result["compiler"]["c++"] = "unknown";
+#endif
+        return result;
+    }
+
+
+    ///////////////////////////
+    // JSON value data types //
+    ///////////////////////////
+
+    /// @name JSON value data types
+    /// The data types to store a JSON value. These types are derived from
+    /// the template arguments passed to class @ref basic_json.
+    /// @{
+
+#if defined(JSON_HAS_CPP_14)
+    // Use a transparent comparator if possible; combined with perfect forwarding
+    // on find() and count() calls, this prevents unnecessary string construction.
+    using object_comparator_t = std::less<>;
+#else
+    using object_comparator_t = std::less<StringType>;
+#endif
+
+    /*!
+    @brief a type for an object
+
+    [RFC 7159](http://rfc7159.net/rfc7159) describes JSON objects as follows:
+    > An object is an unordered collection of zero or more name/value pairs,
+    > where a name is a string and a value is a string, number, boolean, null,
+    > object, or array.
+
+    To store objects in C++, a type is defined by the template parameters
+    described below.
+
+    @tparam ObjectType  the container to store objects (e.g., `std::map` or
+    `std::unordered_map`)
+    @tparam StringType the type of the keys or names (e.g., `std::string`).
+    The comparison function `std::less<StringType>` (or the transparent
+    `std::less<>` when C++14 is available) is used to order elements inside
+    the container.
+    @tparam AllocatorType the allocator to use for objects (e.g.,
+    `std::allocator`)
+
+    #### Default type
+
+    With the default values for @a ObjectType (`std::map`), @a StringType
+    (`std::string`), and @a AllocatorType (`std::allocator`), the default
+    value for @a object_t is:
+
+    @code {.cpp}
+    std::map<
+      std::string, // key_type
+      basic_json, // value_type
+      std::less<std::string>, // key_compare
+      std::allocator<std::pair<const std::string, basic_json>> // allocator_type
+    >
+    @endcode
+
+    #### Behavior
+
+    The choice of @a object_t influences the behavior of the JSON class. With
+    the default type, objects have the following behavior:
+
+    - When all names are unique, objects will be interoperable in the sense
+      that all software implementations receiving that object will agree on
+      the name-value mappings.
+    - When the names within an object are not unique, it is unspecified which
+      one of the values for a given key will be chosen. For instance,
+      `{"key": 2, "key": 1}` could be equal to either `{"key": 1}` or
+      `{"key": 2}`.
+    - Internally, name/value pairs are stored in lexicographical order of the
+      names. Objects will also be serialized (see @ref dump) in this order.
+      For instance, `{"b": 1, "a": 2}` and `{"a": 2, "b": 1}` will be stored
+      and serialized as `{"a": 2, "b": 1}`.
+    - When comparing objects, the order of the name/value pairs is irrelevant.
+      This makes objects interoperable in the sense that they will not be
+      affected by these differences. For instance, `{"b": 1, "a": 2}` and
+      `{"a": 2, "b": 1}` will be treated as equal.
+
+    #### Limits
+
+    [RFC 7159](http://rfc7159.net/rfc7159) specifies:
+    > An implementation may set limits on the maximum depth of nesting.
+
+    In this class, the object's limit of nesting is not explicitly constrained.
+    However, a maximum depth of nesting may be introduced by the compiler or
+    runtime environment. A theoretical limit can be queried by calling the
+    @ref max_size function of a JSON object.
+
+    #### Storage
+
+    Objects are stored as pointers in a @ref basic_json type. That is, for any
+    access to object values, a pointer of type `object_t*` must be
+    dereferenced.
+
+    @sa @ref array_t -- type for an array value
+
+    @since version 1.0.0
+
+    @note The order name/value pairs are added to the object is *not*
+    preserved by the library. Therefore, iterating an object may return
+    name/value pairs in a different order than they were originally stored. In
+    fact, keys will be traversed in alphabetical order as `std::map` with
+    `std::less` is used by default. Please note this behavior conforms to [RFC
+    7159](http://rfc7159.net/rfc7159), because any order implements the
+    specified "unordered" nature of JSON objects.
+    */
+    using object_t = ObjectType<StringType,
+          basic_json,
+          object_comparator_t,
+          AllocatorType<std::pair<const StringType,
+          basic_json>>>;
+
+    /*!
+    @brief a type for an array
+
+    [RFC 7159](http://rfc7159.net/rfc7159) describes JSON arrays as follows:
+    > An array is an ordered sequence of zero or more values.
+
+    To store arrays in C++, a type is defined by the template parameters
+    explained below.
+
+    @tparam ArrayType  container type to store arrays (e.g., `std::vector` or
+    `std::list`)
+    @tparam AllocatorType allocator to use for arrays (e.g., `std::allocator`)
+
+    #### Default type
+
+    With the default values for @a ArrayType (`std::vector`) and @a
+    AllocatorType (`std::allocator`), the default value for @a array_t is:
+
+    @code {.cpp}
+    std::vector<
+      basic_json, // value_type
+      std::allocator<basic_json> // allocator_type
+    >
+    @endcode
+
+    #### Limits
+
+    [RFC 7159](http://rfc7159.net/rfc7159) specifies:
+    > An implementation may set limits on the maximum depth of nesting.
+
+    In this class, the array's limit of nesting is not explicitly constrained.
+    However, a maximum depth of nesting may be introduced by the compiler or
+    runtime environment. A theoretical limit can be queried by calling the
+    @ref max_size function of a JSON array.
+
+    #### Storage
+
+    Arrays are stored as pointers in a @ref basic_json type. That is, for any
+    access to array values, a pointer of type `array_t*` must be dereferenced.
+
+    @sa @ref object_t -- type for an object value
+
+    @since version 1.0.0
+    */
+    using array_t = ArrayType<basic_json, AllocatorType<basic_json>>;
+
+    /*!
+    @brief a type for a string
+
+    [RFC 7159](http://rfc7159.net/rfc7159) describes JSON strings as follows:
+    > A string is a sequence of zero or more Unicode characters.
+
+    To store strings in C++, a type is defined by the template parameter
+    described below. Unicode values are split by the JSON class into
+    byte-sized characters during deserialization.
+
+    @tparam StringType  the container to store strings (e.g., `std::string`).
+    Note this container is used for keys/names in objects, see @ref object_t.
+
+    #### Default type
+
+    With the default values for @a StringType (`std::string`), the default
+    value for @a string_t is:
+
+    @code {.cpp}
+    std::string
+    @endcode
+
+    #### Encoding
+
+    Strings are stored in UTF-8 encoding. Therefore, functions like
+    `std::string::size()` or `std::string::length()` return the number of
+    bytes in the string rather than the number of characters or glyphs.
+
+    #### String comparison
+
+    [RFC 7159](http://rfc7159.net/rfc7159) states:
+    > Software implementations are typically required to test names of object
+    > members for equality. Implementations that transform the textual
+    > representation into sequences of Unicode code units and then perform the
+    > comparison numerically, code unit by code unit, are interoperable in the
+    > sense that implementations will agree in all cases on equality or
+    > inequality of two strings. For example, implementations that compare
+    > strings with escaped characters unconverted may incorrectly find that
+    > `"a\\b"` and `"a\u005Cb"` are not equal.
+
+    This implementation is interoperable as it does compare strings code unit
+    by code unit.
+
+    #### Storage
+
+    String values are stored as pointers in a @ref basic_json type. That is,
+    for any access to string values, a pointer of type `string_t*` must be
+    dereferenced.
+
+    @since version 1.0.0
+    */
+    using string_t = StringType;
+
+    /*!
+    @brief a type for a boolean
+
+    [RFC 7159](http://rfc7159.net/rfc7159) implicitly describes a boolean as a
+    type which differentiates the two literals `true` and `false`.
+
+    To store booleans in C++, a type is defined by the template parameter @a
+    BooleanType which chooses the type to use.
+
+    #### Default type
+
+    With the default values for @a BooleanType (`bool`), the default value for
+    @a boolean_t is:
+
+    @code {.cpp}
+    bool
+    @endcode
+
+    #### Storage
+
+    Boolean values are stored directly inside a @ref basic_json type.
+
+    @since version 1.0.0
+    */
+    using boolean_t = BooleanType;
+
+    /*!
+    @brief a type for a number (integer)
+
+    [RFC 7159](http://rfc7159.net/rfc7159) describes numbers as follows:
+    > The representation of numbers is similar to that used in most
+    > programming languages. A number is represented in base 10 using decimal
+    > digits. It contains an integer component that may be prefixed with an
+    > optional minus sign, which may be followed by a fraction part and/or an
+    > exponent part. Leading zeros are not allowed. (...) Numeric values that
+    > cannot be represented in the grammar below (such as Infinity and NaN)
+    > are not permitted.
+
+    This description includes both integer and floating-point numbers.
+    However, C++ allows more precise storage if it is known whether the number
+    is a signed integer, an unsigned integer or a floating-point number.
+    Therefore, three different types, @ref number_integer_t, @ref
+    number_unsigned_t and @ref number_float_t are used.
+
+    To store integer numbers in C++, a type is defined by the template
+    parameter @a NumberIntegerType which chooses the type to use.
+
+    #### Default type
+
+    With the default values for @a NumberIntegerType (`int64_t`), the default
+    value for @a number_integer_t is:
+
+    @code {.cpp}
+    int64_t
+    @endcode
+
+    #### Default behavior
+
+    - The restriction on leading zeros is not enforced in C++. Instead,
+      leading zeros in integer literals lead to an interpretation as octal
+      number. Internally, the value will be stored as decimal number. For
+      instance, the C++ integer literal `010` will be serialized to `8`.
+      During deserialization, leading zeros yield an error.
+    - Not-a-number (NaN) values will be serialized to `null`.
+
+    #### Limits
+
+    [RFC 7159](http://rfc7159.net/rfc7159) specifies:
+    > An implementation may set limits on the range and precision of numbers.
+
+    When the default type is used, the maximal integer number that can be
+    stored is `9223372036854775807` (INT64_MAX) and the minimal integer number
+    that can be stored is `-9223372036854775808` (INT64_MIN). Integer numbers
+    that are out of range will yield over/underflow when used in a
+    constructor. During deserialization, too large or small integer numbers
+    will automatically be stored as @ref number_unsigned_t or @ref
+    number_float_t.
+
+    [RFC 7159](http://rfc7159.net/rfc7159) further states:
+    > Note that when such software is used, numbers that are integers and are
+    > in the range \f$[-2^{53}+1, 2^{53}-1]\f$ are interoperable in the sense
+    > that implementations will agree exactly on their numeric values.
+
+    As this range is a subrange of the exactly supported range [INT64_MIN,
+    INT64_MAX], this class's integer type is interoperable.
+
+    #### Storage
+
+    Integer number values are stored directly inside a @ref basic_json type.
+
+    @sa @ref number_float_t -- type for number values (floating-point)
+
+    @sa @ref number_unsigned_t -- type for number values (unsigned integer)
+
+    @since version 1.0.0
+    */
+    using number_integer_t = NumberIntegerType;
+
+    /*!
+    @brief a type for a number (unsigned)
+
+    [RFC 7159](http://rfc7159.net/rfc7159) describes numbers as follows:
+    > The representation of numbers is similar to that used in most
+    > programming languages. A number is represented in base 10 using decimal
+    > digits. It contains an integer component that may be prefixed with an
+    > optional minus sign, which may be followed by a fraction part and/or an
+    > exponent part. Leading zeros are not allowed. (...) Numeric values that
+    > cannot be represented in the grammar below (such as Infinity and NaN)
+    > are not permitted.
+
+    This description includes both integer and floating-point numbers.
+    However, C++ allows more precise storage if it is known whether the number
+    is a signed integer, an unsigned integer or a floating-point number.
+    Therefore, three different types, @ref number_integer_t, @ref
+    number_unsigned_t and @ref number_float_t are used.
+
+    To store unsigned integer numbers in C++, a type is defined by the
+    template parameter @a NumberUnsignedType which chooses the type to use.
+
+    #### Default type
+
+    With the default values for @a NumberUnsignedType (`uint64_t`), the
+    default value for @a number_unsigned_t is:
+
+    @code {.cpp}
+    uint64_t
+    @endcode
+
+    #### Default behavior
+
+    - The restriction on leading zeros is not enforced in C++. Instead,
+      leading zeros in integer literals lead to an interpretation as octal
+      number. Internally, the value will be stored as decimal number. For
+      instance, the C++ integer literal `010` will be serialized to `8`.
+      During deserialization, leading zeros yield an error.
+    - Not-a-number (NaN) values will be serialized to `null`.
+
+    #### Limits
+
+    [RFC 7159](http://rfc7159.net/rfc7159) specifies:
+    > An implementation may set limits on the range and precision of numbers.
+
+    When the default type is used, the maximal integer number that can be
+    stored is `18446744073709551615` (UINT64_MAX) and the minimal integer
+    number that can be stored is `0`. Integer numbers that are out of range
+    will yield over/underflow when used in a constructor. During
+    deserialization, too large or small integer numbers will automatically
+    be stored as @ref number_integer_t or @ref number_float_t.
+
+    [RFC 7159](http://rfc7159.net/rfc7159) further states:
+    > Note that when such software is used, numbers that are integers and are
+    > in the range \f$[-2^{53}+1, 2^{53}-1]\f$ are interoperable in the sense
+    > that implementations will agree exactly on their numeric values.
+
+    As this range is a subrange (when considered in conjunction with the
+    number_integer_t type) of the exactly supported range [0, UINT64_MAX],
+    this class's integer type is interoperable.
+
+    #### Storage
+
+    Integer number values are stored directly inside a @ref basic_json type.
+
+    @sa @ref number_float_t -- type for number values (floating-point)
+    @sa @ref number_integer_t -- type for number values (integer)
+
+    @since version 2.0.0
+    */
+    using number_unsigned_t = NumberUnsignedType;
+
+    /*!
+    @brief a type for a number (floating-point)
+
+    [RFC 7159](http://rfc7159.net/rfc7159) describes numbers as follows:
+    > The representation of numbers is similar to that used in most
+    > programming languages. A number is represented in base 10 using decimal
+    > digits. It contains an integer component that may be prefixed with an
+    > optional minus sign, which may be followed by a fraction part and/or an
+    > exponent part. Leading zeros are not allowed. (...) Numeric values that
+    > cannot be represented in the grammar below (such as Infinity and NaN)
+    > are not permitted.
+
+    This description includes both integer and floating-point numbers.
+    However, C++ allows more precise storage if it is known whether the number
+    is a signed integer, an unsigned integer or a floating-point number.
+    Therefore, three different types, @ref number_integer_t, @ref
+    number_unsigned_t and @ref number_float_t are used.
+
+    To store floating-point numbers in C++, a type is defined by the template
+    parameter @a NumberFloatType which chooses the type to use.
+
+    #### Default type
+
+    With the default values for @a NumberFloatType (`double`), the default
+    value for @a number_float_t is:
+
+    @code {.cpp}
+    double
+    @endcode
+
+    #### Default behavior
+
+    - The restriction on leading zeros is not enforced in C++. Instead,
+      leading zeros in floating-point literals will be ignored. Internally,
+      the value will be stored as decimal number. For instance, the C++
+      floating-point literal `01.2` will be serialized to `1.2`. During
+      deserialization, leading zeros yield an error.
+    - Not-a-number (NaN) values will be serialized to `null`.
+
+    #### Limits
+
+    [RFC 7159](http://rfc7159.net/rfc7159) states:
+    > This specification allows implementations to set limits on the range and
+    > precision of numbers accepted. Since software that implements IEEE
+    > 754-2008 binary64 (double precision) numbers is generally available and
+    > widely used, good interoperability can be achieved by implementations
+    > that expect no more precision or range than these provide, in the sense
+    > that implementations will approximate JSON numbers within the expected
+    > precision.
+
+    This implementation does exactly follow this approach, as it uses double
+    precision floating-point numbers. Note values smaller than
+    `-1.79769313486232e+308` and values greater than `1.79769313486232e+308`
+    will be stored as NaN internally and be serialized to `null`.
+
+    #### Storage
+
+    Floating-point number values are stored directly inside a @ref basic_json
+    type.
+
+    @sa @ref number_integer_t -- type for number values (integer)
+
+    @sa @ref number_unsigned_t -- type for number values (unsigned integer)
+
+    @since version 1.0.0
+    */
+    using number_float_t = NumberFloatType;
+
+    /*!
+    @brief a type for a packed binary type
+
+    This type is a type designed to carry binary data that appears in various
+    serialized formats, such as CBOR's Major Type 2, MessagePack's bin, and
+    BSON's generic binary subtype. This type is NOT a part of standard JSON and
+    exists solely for compatibility with these binary types. As such, it is
+    simply defined as an ordered sequence of zero or more byte values.
+
+    Additionally, as an implementation detail, the subtype of the binary data is
+    carried around as a `std::uint8_t`, which is compatible with both of the
+    binary data formats that use binary subtyping, (though the specific
+    numbering is incompatible with each other, and it is up to the user to
+    translate between them).
+
+    [CBOR's RFC 7049](https://tools.ietf.org/html/rfc7049) describes this type
+    as:
+    > Major type 2: a byte string. The string's length in bytes is represented
+    > following the rules for positive integers (major type 0).
+
+    [MessagePack's documentation on the bin type
+    family](https://github.com/msgpack/msgpack/blob/master/spec.md#bin-format-family)
+    describes this type as:
+    > Bin format family stores an byte array in 2, 3, or 5 bytes of extra bytes
+    > in addition to the size of the byte array.
+
+    [BSON's specifications](http://bsonspec.org/spec.html) describe several
+    binary types; however, this type is intended to represent the generic binary
+    type which has the description:
+    > Generic binary subtype - This is the most commonly used binary subtype and
+    > should be the 'default' for drivers and tools.
+
+    None of these impose any limitations on the internal representation other
+    than the basic unit of storage be some type of array whose parts are
+    decomposable into bytes.
+
+    The default representation of this binary format is a
+    `std::vector<std::uint8_t>`, which is a very common way to represent a byte
+    array in modern C++.
+
+    #### Default type
+
+    The default value for @a BinaryType is `std::vector<std::uint8_t>`.
+
+    #### Storage
+
+    Binary Arrays are stored as pointers in a @ref basic_json type. That is,
+    for any access to array values, a pointer of the type `binary_t*` must be
+    dereferenced.
+
+    #### Notes on subtypes
+
+    - CBOR
+       - Binary values are represented as byte strings. No subtypes are
+         supported and will be ignored when CBOR is written.
+    - MessagePack
+       - If a subtype is given and the binary array contains exactly 1, 2, 4, 8,
+         or 16 elements, the fixext family (fixext1, fixext2, fixext4, fixext8,
+         fixext16) is used. For other sizes, the ext family (ext8, ext16, ext32)
+         is used. The subtype is then added as signed 8-bit integer.
+       - If no subtype is given, the bin family (bin8, bin16, bin32) is used.
+    - BSON
+       - If a subtype is given, it is used and added as unsigned 8-bit integer.
+       - If no subtype is given, the generic binary subtype 0x00 is used.
+
+    @sa @ref binary -- create a binary array
+
+    @since version 3.8.0
+    */
+    using binary_t = nlohmann::byte_container_with_subtype<BinaryType>;
+    /// @}
+
+  private:
+
+    /// allocate and construct a T through AllocatorType<T>; the storage is
+    /// handed back to the allocator if constructing T throws
+    template<typename T, typename... Args>
+    JSON_HEDLEY_RETURNS_NON_NULL
+    static T* create(Args&& ... args)
+    {
+        using traits = std::allocator_traits<AllocatorType<T>>;
+        AllocatorType<T> allocator;
+
+        // guards the raw storage until construction has succeeded
+        auto release_storage = [&allocator](T * ptr)
+        {
+            traits::deallocate(allocator, ptr, 1);
+        };
+        std::unique_ptr<T, decltype(release_storage)> object(traits::allocate(allocator, 1), release_storage);
+
+        // construct in place, forwarding the caller's arguments unchanged
+        traits::construct(allocator, object.get(), std::forward<Args>(args)...);
+        JSON_ASSERT(object != nullptr);
+
+        // construction succeeded: ownership passes to the caller
+        return object.release();
+    }
+
+    ////////////////////////
+    // JSON value storage //
+    ////////////////////////
+
+    /*!
+    @brief a JSON value
+
+    The actual storage for a JSON value of the @ref basic_json class. This
+    union combines the different storage types for the JSON value types
+    defined in @ref value_t.
+
+    JSON type | value_t type    | used type
+    --------- | --------------- | ------------------------
+    object    | object          | pointer to @ref object_t
+    array     | array           | pointer to @ref array_t
+    string    | string          | pointer to @ref string_t
+    boolean   | boolean         | @ref boolean_t
+    number    | number_integer  | @ref number_integer_t
+    number    | number_unsigned | @ref number_unsigned_t
+    number    | number_float    | @ref number_float_t
+    binary    | binary          | pointer to @ref binary_t
+    null      | null            | *no value is stored*
+
+    @note Variable-length types (objects, arrays, strings, and binary values)
+    are stored as pointers. The size of the union should not exceed 64 bits if
+    the default value types are used.
+
+    @since version 1.0.0
+    */
+    union json_value
+    {
+        /// object, held through a pointer to save storage
+        object_t* object;
+        /// array, held through a pointer to save storage
+        array_t* array;
+        /// string, held through a pointer to save storage
+        string_t* string;
+        /// binary, held through a pointer to save storage
+        binary_t* binary;
+        /// boolean, stored in place
+        boolean_t boolean;
+        /// signed integer number, stored in place
+        number_integer_t number_integer;
+        /// unsigned integer number, stored in place
+        number_unsigned_t number_unsigned;
+        /// floating-point number, stored in place
+        number_float_t number_float;
+
+        /// default constructor (for null values)
+        json_value() = default;
+        /// store a boolean
+        json_value(boolean_t v) noexcept : boolean(v) {}
+        /// store a signed integer number
+        json_value(number_integer_t v) noexcept : number_integer(v) {}
+        /// store an unsigned integer number
+        json_value(number_unsigned_t v) noexcept : number_unsigned(v) {}
+        /// store a floating-point number
+        json_value(number_float_t v) noexcept : number_float(v) {}
+
+        /// create the empty value belonging to the given type @a t
+        json_value(value_t t)
+        {
+            switch (t)
+            {
+                // pointer-backed types start out as freshly created empty values
+                case value_t::object:
+                    object = create<object_t>();
+                    break;
+
+                case value_t::array:
+                    array = create<array_t>();
+                    break;
+
+                case value_t::string:
+                    string = create<string_t>("");
+                    break;
+
+                case value_t::binary:
+                    binary = create<binary_t>();
+                    break;
+
+                // in-place types start out as false / zero
+                case value_t::boolean:
+                    boolean = boolean_t(false);
+                    break;
+
+                case value_t::number_integer:
+                    number_integer = number_integer_t(0);
+                    break;
+
+                case value_t::number_unsigned:
+                    number_unsigned = number_unsigned_t(0);
+                    break;
+
+                case value_t::number_float:
+                    number_float = number_float_t(0.0);
+                    break;
+
+                case value_t::null:
+                    object = nullptr;  // silence warning, see #821
+                    break;
+
+                default:
+                {
+                    object = nullptr;  // silence warning, see #821
+                    if (JSON_HEDLEY_UNLIKELY(t == value_t::null))
+                    {
+                        JSON_THROW(other_error::create(500, "961c151d2e87f2686a955a9be24d316f1362bf21 3.9.1")); // LCOV_EXCL_LINE
+                    }
+                    break;
+                }
+            }
+        }
+
+        /// copy a string into newly created storage
+        json_value(const string_t& value)
+            : string(create<string_t>(value))
+        {}
+
+        /// move a string into newly created storage
+        json_value(string_t&& value)
+            : string(create<string_t>(std::move(value)))
+        {}
+
+        /// copy an object into newly created storage
+        json_value(const object_t& value)
+            : object(create<object_t>(value))
+        {}
+
+        /// move an object into newly created storage
+        json_value(object_t&& value)
+            : object(create<object_t>(std::move(value)))
+        {}
+
+        /// copy an array into newly created storage
+        json_value(const array_t& value)
+            : array(create<array_t>(value))
+        {}
+
+        /// move an array into newly created storage
+        json_value(array_t&& value)
+            : array(create<array_t>(std::move(value)))
+        {}
+
+        /// copy a raw byte container into a newly created binary value
+        json_value(const typename binary_t::container_type& value)
+            : binary(create<binary_t>(value))
+        {}
+
+        /// move a raw byte container into a newly created binary value
+        json_value(typename binary_t::container_type&& value)
+            : binary(create<binary_t>(std::move(value)))
+        {}
+
+        /// copy a binary value (internal type) into newly created storage
+        json_value(const binary_t& value)
+            : binary(create<binary_t>(value))
+        {}
+
+        /// move a binary value (internal type) into newly created storage
+        json_value(binary_t&& value)
+            : binary(create<binary_t>(std::move(value)))
+        {}
+
+        /// release the storage held for a value of type @a t
+        void destroy(value_t t) noexcept
+        {
+            // children are flattened onto this heap-allocated work list
+            std::vector<basic_json> pending;
+
+            // seed the work list with the top-level children of this value
+            if (t == value_t::array)
+            {
+                pending.reserve(array->size());
+                std::move(array->begin(), array->end(), std::back_inserter(pending));
+            }
+            else if (t == value_t::object)
+            {
+                pending.reserve(object->size());
+                for (auto&& member : *object)
+                {
+                    pending.push_back(std::move(member.second));
+                }
+            }
+
+            while (!pending.empty())
+            {
+                // take the most recently added value off the work list
+                basic_json node(std::move(pending.back()));
+                pending.pop_back();
+
+                // a container hands its children over to the work list and
+                // is emptied afterwards
+                if (node.is_array())
+                {
+                    auto& elements = *node.m_value.array;
+                    std::move(elements.begin(), elements.end(), std::back_inserter(pending));
+                    elements.clear();
+                }
+                else if (node.is_object())
+                {
+                    auto& members = *node.m_value.object;
+                    for (auto&& member : members)
+                    {
+                        pending.push_back(std::move(member.second));
+                    }
+                    members.clear();
+                }
+
+                // node has no children any more, so it is safe to let it be
+                // destructed at the end of this iteration
+            }
+
+            // finally destroy and deallocate the pointer-backed storage of
+            // this value itself; in-place types need no cleanup
+            switch (t)
+            {
+                case value_t::object:
+                {
+                    using object_traits = std::allocator_traits<AllocatorType<object_t>>;
+                    AllocatorType<object_t> object_alloc;
+                    object_traits::destroy(object_alloc, object);
+                    object_traits::deallocate(object_alloc, object, 1);
+                    break;
+                }
+
+                case value_t::array:
+                {
+                    using array_traits = std::allocator_traits<AllocatorType<array_t>>;
+                    AllocatorType<array_t> array_alloc;
+                    array_traits::destroy(array_alloc, array);
+                    array_traits::deallocate(array_alloc, array, 1);
+                    break;
+                }
+
+                case value_t::string:
+                {
+                    using string_traits = std::allocator_traits<AllocatorType<string_t>>;
+                    AllocatorType<string_t> string_alloc;
+                    string_traits::destroy(string_alloc, string);
+                    string_traits::deallocate(string_alloc, string, 1);
+                    break;
+                }
+
+                case value_t::binary:
+                {
+                    using binary_traits = std::allocator_traits<AllocatorType<binary_t>>;
+                    AllocatorType<binary_t> binary_alloc;
+                    binary_traits::destroy(binary_alloc, binary);
+                    binary_traits::deallocate(binary_alloc, binary, 1);
+                    break;
+                }
+
+                default:
+                    break;
+            }
+        }
+    };
+
+    /*!
+    @brief checks the class invariants
+
+    This function asserts the class invariants. It needs to be called at the
+    end of every constructor to make sure that created objects respect the
+    invariant. Furthermore, it has to be called each time the type of a JSON
+    value is changed, because the invariant expresses a relationship between
+    @a m_type and @a m_value.
+
+    The invariant checked here is: whenever @a m_type is object, array,
+    string, or binary, the matching pointer member of @a m_value is not null.
+    No condition is checked for the remaining types.
+    */
+    void assert_invariant() const noexcept
+    {
+        // each line reads as an implication: "type is X" implies "pointer for X is set"
+        JSON_ASSERT(m_type != value_t::object || m_value.object != nullptr);
+        JSON_ASSERT(m_type != value_t::array || m_value.array != nullptr);
+        JSON_ASSERT(m_type != value_t::string || m_value.string != nullptr);
+        JSON_ASSERT(m_type != value_t::binary || m_value.binary != nullptr);
+    }
+
+  public:
+    //////////////////////////
+    // JSON parser callback //
+    //////////////////////////
+
+    /*!
+    @brief parser event types
+
+    The parser callback distinguishes the following events:
+    - `object_start`: the parser read `{` and started to process a JSON object
+    - `key`: the parser read a key of a value in an object
+    - `object_end`: the parser read `}` and finished processing a JSON object
+    - `array_start`: the parser read `[` and started to process a JSON array
+    - `array_end`: the parser read `]` and finished processing a JSON array
+    - `value`: the parser finished reading a JSON value
+
+    @image html callback_events.png "Example when certain parse events are triggered"
+
+    @sa @ref parser_callback_t for more information and examples
+    */
+    using parse_event_t = detail::parse_event_t;  // public member alias for the type declared in namespace detail
+
+    /*!
+    @brief per-element parser callback type
+
+    With a parser callback function, the result of parsing a JSON text can be
+    influenced. When passed to @ref parse, it is called on certain events
+    (passed as @ref parse_event_t via parameter @a event) with a set recursion
+    depth @a depth and context JSON value @a parsed. The return value of the
+    callback function is a boolean indicating whether the element that emitted
+    the callback shall be kept or not.
+
+    We distinguish six scenarios (determined by the event type) in which the
+    callback function can be called. The following table describes the values
+    of the parameters @a depth, @a event, and @a parsed.
+
+    parameter @a event | description | parameter @a depth | parameter @a parsed
+    ------------------ | ----------- | ------------------ | -------------------
+    parse_event_t::object_start | the parser read `{` and started to process a JSON object | depth of the parent of the JSON object | a JSON value with type discarded
+    parse_event_t::key | the parser read a key of a value in an object | depth of the currently parsed JSON object | a JSON string containing the key
+    parse_event_t::object_end | the parser read `}` and finished processing a JSON object | depth of the parent of the JSON object | the parsed JSON object
+    parse_event_t::array_start | the parser read `[` and started to process a JSON array | depth of the parent of the JSON array | a JSON value with type discarded
+    parse_event_t::array_end | the parser read `]` and finished processing a JSON array | depth of the parent of the JSON array | the parsed JSON array
+    parse_event_t::value | the parser finished reading a JSON value | depth of the value | the parsed JSON value
+
+    @image html callback_events.png "Example when certain parse events are triggered"
+
+    Discarding a value (i.e., returning `false`) has different effects
+    depending on the context in which the function was called:
+
+    - Discarded values in structured types are skipped. That is, the parser
+      will behave as if the discarded value was never read.
+    - In case a value outside a structured type is skipped, it is replaced
+      with `null`. This case happens if the top-level element is skipped.
+
+    @param[in] depth  the depth of the recursion during parsing
+
+    @param[in] event  an event of type parse_event_t indicating the context in
+    which the callback function has been called
+
+    @param[in,out] parsed  the current intermediate parse result; note that
+    writing to this value has no effect for parse_event_t::key events
+
+    @return Whether the JSON value which called the function during parsing
+    should be kept (`true`) or not (`false`). In the latter case, it is either
+    skipped completely or replaced by an empty discarded object.
+
+    @sa @ref parse for examples
+
+    @since version 1.0.0
+    */
+    using parser_callback_t = detail::parser_callback_t<basic_json>;  // detail callback template instantiated for this basic_json type
+
+    //////////////////
+    // constructors //
+    //////////////////
+
+    /// @name constructors and destructors
+    /// Constructors of class @ref basic_json, copy/move constructor, copy
+    /// assignment, static functions creating objects, and the destructor.
+    /// @{
+
+    /*!
+    @brief create an empty value with a given type
+
+    Create an empty JSON value with a given type. The value will be default
+    initialized with an empty value which depends on the type:
+
+    Value type  | initial value
+    ----------- | -------------
+    null        | `null`
+    boolean     | `false`
+    string      | `""`
+    number      | `0`
+    object      | `{}`
+    array       | `[]`
+    binary      | empty array
+
+    @param[in] v  the type of the value to create
+
+    @complexity Constant.
+
+    @exceptionsafety Strong guarantee: if an exception is thrown, there are no
+    changes to any JSON value.
+
+    @liveexample{The following code shows the constructor for different @ref
+    value_t values,basic_json__value_t}
+
+    @sa @ref clear() -- restores the postcondition of this constructor
+
+    @since version 1.0.0
+    */
+    basic_json(const value_t v)
+        : m_type(v), m_value(v)  // type tag and payload are both initialized from the same value_t
+    {
+        // check that the payload built for v is consistent with the type tag
+        assert_invariant();
+    }
+
+    /*!
+    @brief create a null object
+
+    Create a `null` JSON value. It either takes a null pointer as parameter
+    (explicitly creating `null`) or no parameter (implicitly creating `null`).
+    The passed null pointer itself is not read -- it is only used to choose
+    the right constructor.
+
+    @complexity Constant.
+
+    @exceptionsafety No-throw guarantee: this constructor never throws
+    exceptions.
+
+    @liveexample{The following code shows the constructor with and without a
+    null pointer parameter.,basic_json__nullptr_t}
+
+    @since version 1.0.0
+    */
+    basic_json(std::nullptr_t = nullptr) noexcept
+        : basic_json(value_t::null)  // delegate to the value_t constructor; the unnamed pointer parameter is never read
+    {
+        // the delegated constructor has already run its own invariant check;
+        // this call repeats it for the finished object
+        assert_invariant();
+    }
+
+    /*!
+    @brief create a JSON value
+
+    This is a "catch all" constructor for all compatible JSON types; that is,
+    types for which a `to_json()` method exists. The constructor forwards the
+    parameter @a val to that method (to `json_serializer<U>::to_json` method
+    with `U = uncvref_t<CompatibleType>`, to be exact).
+
+    Template type @a CompatibleType includes, but is not limited to, the
+    following types:
+    - **arrays**: @ref array_t and all kinds of compatible containers such as
+      `std::vector`, `std::deque`, `std::list`, `std::forward_list`,
+      `std::array`, `std::valarray`, `std::set`, `std::unordered_set`,
+      `std::multiset`, and `std::unordered_multiset` with a `value_type` from
+      which a @ref basic_json value can be constructed.
+    - **objects**: @ref object_t and all kinds of compatible associative
+      containers such as `std::map`, `std::unordered_map`, `std::multimap`,
+      and `std::unordered_multimap` with a `key_type` compatible to
+      @ref string_t and a `value_type` from which a @ref basic_json value can
+      be constructed.
+    - **strings**: @ref string_t, string literals, and all compatible string
+      containers can be used.
+    - **numbers**: @ref number_integer_t, @ref number_unsigned_t,
+      @ref number_float_t, and all convertible number types such as `int`,
+      `size_t`, `int64_t`, `float` or `double` can be used.
+    - **boolean**: @ref boolean_t / `bool` can be used.
+    - **binary**: @ref binary_t / `std::vector<uint8_t>` may be used,
+      unfortunately because string literals cannot be distinguished from binary
+      character arrays by the C++ type system, all types compatible with `const
+      char*` will be directed to the string constructor instead.  This is both
+      for backwards compatibility, and due to the fact that a binary type is not
+      a standard JSON type.
+
+    See the examples below.
+
+    @tparam CompatibleType a type such that:
+    - @a CompatibleType is not derived from `std::istream`,
+    - @a CompatibleType is not @ref basic_json (to avoid hijacking copy/move
+         constructors),
+    - @a CompatibleType is not a different @ref basic_json type (i.e. with different template arguments)
+    - @a CompatibleType is not a @ref basic_json nested type (e.g.,
+         @ref json_pointer, @ref iterator, etc ...)
+    - @ref json_serializer<U> has a
+         `to_json(basic_json_t&, CompatibleType&&)` method
+
+    @tparam U = `uncvref_t<CompatibleType>`
+
+    @param[in] val the value to be forwarded to the respective constructor
+
+    @complexity Usually linear in the size of the passed @a val, also
+                depending on the implementation of the called `to_json()`
+                method.
+
+    @exceptionsafety Depends on the called constructor. For types directly
+    supported by the library (i.e., all types for which no `to_json()` function
+    was provided), strong guarantee holds: if an exception is thrown, there are
+    no changes to any JSON value.
+
+    @liveexample{The following code shows the constructor with several
+    compatible types.,basic_json__CompatibleType}
+
+    @since version 2.1.0
+    */
+    template < typename CompatibleType,
+               typename U = detail::uncvref_t<CompatibleType>,
+               detail::enable_if_t <
+                   !detail::is_basic_json<U>::value && detail::is_compatible_type<basic_json_t, U>::value, int > = 0 >
+    basic_json(CompatibleType && val) noexcept(noexcept(
+                JSONSerializer<U>::to_json(std::declval<basic_json_t&>(),
+                                           std::forward<CompatibleType>(val))))
+    {
+        // The overload only participates when U is not a basic_json
+        // specialization and detail::is_compatible_type accepts it. The
+        // noexcept specification above is computed from the very same to_json
+        // call that is made here, so this constructor is noexcept exactly when
+        // that call is.
+        JSONSerializer<U>::to_json(*this, std::forward<CompatibleType>(val));
+        // to_json filled in *this; verify tag and payload agree
+        assert_invariant();
+    }
+
+    /*!
+    @brief create a JSON value from an existing one
+
+    This is a constructor for existing @ref basic_json types.
+    It does not hijack copy/move constructors, since the parameter has different
+    template arguments than the current ones.
+
+    The constructor tries to convert the internal @ref m_value of the parameter.
+
+    @tparam BasicJsonType a type such that:
+    - @a BasicJsonType is a @ref basic_json type.
+    - @a BasicJsonType has different template arguments than @ref basic_json_t.
+
+    @param[in] val the @ref basic_json value to be converted.
+
+    @complexity Usually linear in the size of the passed @a val, also
+                depending on the implementation of the called `to_json()`
+                method.
+
+    @exceptionsafety Depends on the called constructor. For types directly
+    supported by the library (i.e., all types for which no `to_json()` function
+    was provided), strong guarantee holds: if an exception is thrown, there are
+    no changes to any JSON value.
+
+    @since version 3.2.0
+    */
+    template < typename BasicJsonType,
+               detail::enable_if_t <
+                   detail::is_basic_json<BasicJsonType>::value&& !std::is_same<basic_json, BasicJsonType>::value, int > = 0 >
+    basic_json(const BasicJsonType& val)
+    {
+        // value types used by the foreign basic_json specialization
+        using src_boolean_t = typename BasicJsonType::boolean_t;
+        using src_float_t = typename BasicJsonType::number_float_t;
+        using src_integer_t = typename BasicJsonType::number_integer_t;
+        using src_unsigned_t = typename BasicJsonType::number_unsigned_t;
+        using src_string_t = typename BasicJsonType::string_t;
+        using src_object_t = typename BasicJsonType::object_t;
+        using src_array_t = typename BasicJsonType::array_t;
+        using src_binary_t = typename BasicJsonType::binary_t;
+
+        // Convert according to the dynamic type of val: scalars are fetched by
+        // value via get<>(), heap-backed payloads by const reference via
+        // get_ref<>(); each is then handed to the serializer of its own type.
+        switch (val.type())
+        {
+            case value_t::null:
+            {
+                *this = nullptr;
+                break;
+            }
+
+            case value_t::discarded:
+            {
+                // only the tag is taken over; there is no payload to convert
+                m_type = value_t::discarded;
+                break;
+            }
+
+            case value_t::boolean:
+            {
+                JSONSerializer<src_boolean_t>::to_json(*this, val.template get<src_boolean_t>());
+                break;
+            }
+
+            case value_t::number_integer:
+            {
+                JSONSerializer<src_integer_t>::to_json(*this, val.template get<src_integer_t>());
+                break;
+            }
+
+            case value_t::number_unsigned:
+            {
+                JSONSerializer<src_unsigned_t>::to_json(*this, val.template get<src_unsigned_t>());
+                break;
+            }
+
+            case value_t::number_float:
+            {
+                JSONSerializer<src_float_t>::to_json(*this, val.template get<src_float_t>());
+                break;
+            }
+
+            case value_t::string:
+            {
+                JSONSerializer<src_string_t>::to_json(*this, val.template get_ref<const src_string_t&>());
+                break;
+            }
+
+            case value_t::array:
+            {
+                JSONSerializer<src_array_t>::to_json(*this, val.template get_ref<const src_array_t&>());
+                break;
+            }
+
+            case value_t::object:
+            {
+                JSONSerializer<src_object_t>::to_json(*this, val.template get_ref<const src_object_t&>());
+                break;
+            }
+
+            case value_t::binary:
+            {
+                JSONSerializer<src_binary_t>::to_json(*this, val.template get_ref<const src_binary_t&>());
+                break;
+            }
+
+            default:            // LCOV_EXCL_LINE
+                JSON_ASSERT(false);  // LCOV_EXCL_LINE
+        }
+
+        assert_invariant();
+    }
+
+    /*!
+    @brief create a container (array or object) from an initializer list
+
+    Creates a JSON value of type array or object from the passed initializer
+    list @a init. In case @a type_deduction is `true` (default), the type of
+    the JSON value to be created is deduced from the initializer list @a init
+    according to the following rules:
+
+    1. If the list is empty, an empty JSON object value `{}` is created.
+    2. If the list consists of pairs whose first element is a string, a JSON
+       object value is created where the first elements of the pairs are
+       treated as keys and the second elements as values.
+    3. In all other cases, an array is created.
+
+    The rules aim to create the best fit between a C++ initializer list and
+    JSON values. The rationale is as follows:
+
+    1. The empty initializer list is written as `{}` which is exactly an empty
+       JSON object.
+    2. C++ has no way of describing mapped types other than to list a list of
+       pairs. As JSON requires that keys must be of type string, rule 2 is the
+       weakest constraint one can pose on initializer lists to interpret them
+       as an object.
+    3. In all other cases, the initializer list could not be interpreted as
+       JSON object type, so interpreting it as JSON array type is safe.
+
+    With the rules described above, the following JSON values cannot be
+    expressed by an initializer list:
+
+    - the empty array (`[]`): use @ref array(initializer_list_t)
+      with an empty initializer list in this case
+    - arrays whose elements satisfy rule 2: use @ref
+      array(initializer_list_t) with the same initializer list
+      in this case
+
+    @note When used without parentheses around an empty initializer list, @ref
+    basic_json() is called instead of this function, yielding the JSON null
+    value.
+
+    @param[in] init  initializer list with JSON values
+
+    @param[in] type_deduction internal parameter; when set to `true`, the type
+    of the JSON value is deduced from the initializer list @a init; when set
+    to `false`, the type provided via @a manual_type is forced. This mode is
+    used by the functions @ref array(initializer_list_t) and
+    @ref object(initializer_list_t).
+
+    @param[in] manual_type internal parameter; when @a type_deduction is set
+    to `false`, the created JSON value will use the provided type (only @ref
+    value_t::array and @ref value_t::object are valid); when @a type_deduction
+    is set to `true`, this parameter has no effect
+
+    @throw type_error.301 if @a type_deduction is `false`, @a manual_type is
+    `value_t::object`, but @a init contains an element which is not a pair
+    whose first element is a string. In this case, the constructor could not
+    create an object. If @a type_deduction had been `true`, an array
+    would have been created. See @ref object(initializer_list_t)
+    for an example.
+
+    @complexity Linear in the size of the initializer list @a init.
+
+    @exceptionsafety Strong guarantee: if an exception is thrown, there are no
+    changes to any JSON value.
+
+    @liveexample{The example below shows how JSON values are created from
+    initializer lists.,basic_json__list_init_t}
+
+    @sa @ref array(initializer_list_t) -- create a JSON array
+    value from an initializer list
+    @sa @ref object(initializer_list_t) -- create a JSON object
+    value from an initializer list
+
+    @since version 1.0.0
+    */
+    basic_json(initializer_list_t init,
+               bool type_deduction = true,
+               value_t manual_type = value_t::array)
+    {
+        // An element can serve as a key/value pair if it is an array of
+        // exactly two entries whose first entry is a string.
+        const auto looks_like_pair = [](const detail::json_ref<basic_json>& candidate)
+        {
+            return candidate->is_array() && candidate->size() == 2 && (*candidate)[0].is_string();
+        };
+
+        // the list can only become an object if every element is such a pair
+        bool is_an_object = std::all_of(init.begin(), init.end(), looks_like_pair);
+
+        // a caller-forced type overrides the deduction
+        if (!type_deduction)
+        {
+            if (manual_type == value_t::array)
+            {
+                // an array was requested: never build an object, even if possible
+                is_an_object = false;
+            }
+            else if (JSON_HEDLEY_UNLIKELY(manual_type == value_t::object && !is_an_object))
+            {
+                // an object was requested, but the elements are not all pairs
+                JSON_THROW(type_error::create(301, "cannot create object from initializer list"));
+            }
+        }
+
+        if (!is_an_object)
+        {
+            // plain list of values -> array built directly from the range
+            m_type = value_t::array;
+            m_value.array = create<array_t>(init.begin(), init.end());
+        }
+        else
+        {
+            // list of pairs -> start from an empty object and insert each pair
+            m_type = value_t::object;
+            m_value = value_t::object;
+
+            for (const detail::json_ref<basic_json>& entry_ref : init)
+            {
+                // materialize the two-element array, then move its string
+                // (key) and its second entry (mapped value) into the object
+                auto entry = entry_ref.moved_or_copied();
+                m_value.object->emplace(
+                    std::move(*((*entry.m_value.array)[0].m_value.string)),
+                    std::move((*entry.m_value.array)[1]));
+            }
+        }
+
+        assert_invariant();
+    }
+
+    /*!
+    @brief explicitly create a binary array (without subtype)
+
+    Creates a JSON binary array value from a given binary container. Binary
+    values are part of various binary formats, such as CBOR, MessagePack, and
+    BSON. This constructor is used to create a value for serialization to those
+    formats.
+
+    @note Note, this function exists because of the difficulty in correctly
+    specifying the correct template overload in the standard value ctor, as both
+    JSON arrays and JSON binary arrays are backed with some form of a
+    `std::vector`. Because JSON binary arrays are a non-standard extension it
+    was decided that it would be best to prevent automatic initialization of a
+    binary array type, for backwards compatibility and so it does not happen on
+    accident.
+
+    @param[in] init container containing bytes to use as binary type
+
+    @return JSON binary array value
+
+    @complexity Linear in the size of @a init.
+
+    @exceptionsafety Strong guarantee: if an exception is thrown, there are no
+    changes to any JSON value.
+
+    @since version 3.8.0
+    */
+    JSON_HEDLEY_WARN_UNUSED_RESULT
+    static basic_json binary(const typename binary_t::container_type& init)
+    {
+        // start from a default-constructed value
+        auto res = basic_json();
+
+        // Build the payload before switching the type tag. Copying init can
+        // throw (e.g. on allocation failure); res is a complete object at this
+        // point, so its destructor would run during unwinding. With the tag
+        // still unchanged, res is destroyed as the value it was constructed
+        // as, instead of as a binary value that has no binary payload, which
+        // would violate assert_invariant().
+        res.m_value = init;
+        res.m_type = value_t::binary;
+        return res;
+    }
+
+    /*!
+    @brief explicitly create a binary array (with subtype)
+
+    Creates a JSON binary array value from a given binary container. Binary
+    values are part of various binary formats, such as CBOR, MessagePack, and
+    BSON. This constructor is used to create a value for serialization to those
+    formats.
+
+    @note Note, this function exists because of the difficulty in correctly
+    specifying the correct template overload in the standard value ctor, as both
+    JSON arrays and JSON binary arrays are backed with some form of a
+    `std::vector`. Because JSON binary arrays are a non-standard extension it
+    was decided that it would be best to prevent automatic initialization of a
+    binary array type, for backwards compatibility and so it does not happen on
+    accident.
+
+    @param[in] init container containing bytes to use as binary type
+    @param[in] subtype subtype to use in MessagePack and BSON
+
+    @return JSON binary array value
+
+    @complexity Linear in the size of @a init.
+
+    @exceptionsafety Strong guarantee: if an exception is thrown, there are no
+    changes to any JSON value.
+
+    @since version 3.8.0
+    */
+    JSON_HEDLEY_WARN_UNUSED_RESULT
+    static basic_json binary(const typename binary_t::container_type& init, std::uint8_t subtype)
+    {
+        // start from a default-constructed value
+        auto res = basic_json();
+
+        // Build the payload (bytes plus subtype) before switching the type
+        // tag. Constructing or storing the binary_t can throw; res is a
+        // complete object, so its destructor would run during unwinding. With
+        // the tag still unchanged, res is never destroyed as a binary value
+        // that lacks a binary payload, which would violate assert_invariant().
+        res.m_value = binary_t(init, subtype);
+        res.m_type = value_t::binary;
+        return res;
+    }
+
+    /// @copydoc binary(const typename binary_t::container_type&)
+    JSON_HEDLEY_WARN_UNUSED_RESULT
+    static basic_json binary(typename binary_t::container_type&& init)
+    {
+        // start from a default-constructed value
+        auto res = basic_json();
+
+        // Move the container into the payload before switching the type tag.
+        // Storing the payload can still throw (e.g. on allocation failure);
+        // res is a complete object, so its destructor would run during
+        // unwinding. With the tag still unchanged, res is never destroyed as a
+        // binary value that lacks a binary payload, which would violate
+        // assert_invariant().
+        res.m_value = std::move(init);
+        res.m_type = value_t::binary;
+        return res;
+    }
+
+    /// @copydoc binary(const typename binary_t::container_type&, std::uint8_t)
+    JSON_HEDLEY_WARN_UNUSED_RESULT
+    static basic_json binary(typename binary_t::container_type&& init, std::uint8_t subtype)
+    {
+        // start from a default-constructed value
+        auto res = basic_json();
+
+        // Move the container into a binary_t carrying the subtype and store it
+        // before switching the type tag. Storing the payload can still throw;
+        // res is a complete object, so its destructor would run during
+        // unwinding. With the tag still unchanged, res is never destroyed as a
+        // binary value that lacks a binary payload, which would violate
+        // assert_invariant().
+        res.m_value = binary_t(std::move(init), subtype);
+        res.m_type = value_t::binary;
+        return res;
+    }
+
+    /*!
+    @brief explicitly create an array from an initializer list
+
+    Creates a JSON array value from a given initializer list. That is, given a
+    list of values `a, b, c`, creates the JSON value `[a, b, c]`. If the
+    initializer list is empty, the empty array `[]` is created.
+
+    @note This function is only needed to express two edge cases that cannot
+    be realized with the initializer list constructor (@ref
+    basic_json(initializer_list_t, bool, value_t)). These cases
+    are:
+    1. creating an array whose elements are all pairs whose first element is a
+    string -- in this case, the initializer list constructor would create an
+    object, taking the first elements as keys
+    2. creating an empty array -- passing the empty initializer list to the
+    initializer list constructor yields an empty object
+
+    @param[in] init  initializer list with JSON values to create an array from
+    (optional)
+
+    @return JSON array value
+
+    @complexity Linear in the size of @a init.
+
+    @exceptionsafety Strong guarantee: if an exception is thrown, there are no
+    changes to any JSON value.
+
+    @liveexample{The following code shows an example for the `array`
+    function.,array}
+
+    @sa @ref basic_json(initializer_list_t, bool, value_t) --
+    create a JSON value from an initializer list
+    @sa @ref object(initializer_list_t) -- create a JSON object
+    value from an initializer list
+
+    @since version 1.0.0
+    */
+    JSON_HEDLEY_WARN_UNUSED_RESULT
+    static basic_json array(initializer_list_t init = {})
+    {
+        // switch off type deduction so that the requested type (array) is
+        // used even when every element would qualify as a key/value pair
+        const bool deduce_type = false;
+        return basic_json(init, deduce_type, value_t::array);
+    }
+
+    /*!
+    @brief explicitly create an object from an initializer list
+
+    Creates a JSON object value from a given initializer list. The initializer
+    lists elements must be pairs, and their first elements must be strings. If
+    the initializer list is empty, the empty object `{}` is created.
+
+    @note This function is only added for symmetry reasons. In contrast to the
+    related function @ref array(initializer_list_t), there are
+    no cases which can only be expressed by this function. That is, any
+    initializer list @a init can also be passed to the initializer list
+    constructor @ref basic_json(initializer_list_t, bool, value_t).
+
+    @param[in] init  initializer list to create an object from (optional)
+
+    @return JSON object value
+
+    @throw type_error.301 if @a init is not a list of pairs whose first
+    elements are strings. In this case, no object can be created. When such a
+    value is passed to @ref basic_json(initializer_list_t, bool, value_t),
+    an array would have been created from the passed initializer list @a init.
+    See example below.
+
+    @complexity Linear in the size of @a init.
+
+    @exceptionsafety Strong guarantee: if an exception is thrown, there are no
+    changes to any JSON value.
+
+    @liveexample{The following code shows an example for the `object`
+    function.,object}
+
+    @sa @ref basic_json(initializer_list_t, bool, value_t) --
+    create a JSON value from an initializer list
+    @sa @ref array(initializer_list_t) -- create a JSON array
+    value from an initializer list
+
+    @since version 1.0.0
+    */
+    JSON_HEDLEY_WARN_UNUSED_RESULT
+    static basic_json object(initializer_list_t init = {})
+    {
+        // switch off type deduction and insist on an object; the initializer
+        // list constructor throws type_error 301 if init is not a list of pairs
+        const bool deduce_type = false;
+        return basic_json(init, deduce_type, value_t::object);
+    }
+
+    /*!
+    @brief construct an array with count copies of given value
+
+    Constructs a JSON array value by creating @a cnt copies of a passed value.
+    In case @a cnt is `0`, an empty array is created.
+
+    @param[in] cnt  the number of JSON copies of @a val to create
+    @param[in] val  the JSON value to copy
+
+    @post `std::distance(begin(),end()) == cnt` holds.
+
+    @complexity Linear in @a cnt.
+
+    @exceptionsafety Strong guarantee: if an exception is thrown, there are no
+    changes to any JSON value.
+
+    @liveexample{The following code shows examples for the @ref
+    basic_json(size_type\, const basic_json&)
+    constructor.,basic_json__size_type_basic_json}
+
+    @since version 1.0.0
+    */
+    basic_json(size_type cnt, const basic_json& val)
+        : m_type(value_t::array)  // the result is always an array, whatever the type of val
+    {
+        // create<array_t> receives (cnt, val) and returns the array payload
+        m_value.array = create<array_t>(cnt, val);
+        // verify that the array payload pointer was actually set
+        assert_invariant();
+    }
+
+    /*!
+    @brief construct a JSON container given an iterator range
+
+    Constructs the JSON value with the contents of the range `[first, last)`.
+    The semantics depends on the different types a JSON value can have:
+    - In case of a null type, invalid_iterator.206 is thrown.
+    - In case of other primitive types (number, boolean, or string), @a first
+      must be `begin()` and @a last must be `end()`. In this case, the value is
+      copied. Otherwise, invalid_iterator.204 is thrown.
+    - In case of structured types (array, object), the constructor behaves as
+      similar versions for `std::vector` or `std::map`; that is, a JSON array
+      or object is constructed from the values in the range.
+
+    @tparam InputIT an input iterator type (@ref iterator or @ref
+    const_iterator)
+
+    @param[in] first begin of the range to copy from (included)
+    @param[in] last end of the range to copy from (excluded)
+
+    @pre Iterators @a first and @a last must be initialized. **This
+         precondition is enforced with an assertion (see warning).** If
+         assertions are switched off, a violation of this precondition yields
+         undefined behavior.
+
+    @pre Range `[first, last)` is valid. Usually, this precondition cannot be
+         checked efficiently. Only certain edge cases are detected; see the
+         description of the exceptions below. A violation of this precondition
+         yields undefined behavior.
+
+    @warning A precondition is enforced with a runtime assertion that will
+             result in calling `std::abort` if this precondition is not met.
+             Assertions can be disabled by defining `NDEBUG` at compile time.
+             See https://en.cppreference.com/w/cpp/error/assert for more
+             information.
+
+    @throw invalid_iterator.201 if iterators @a first and @a last are not
+    compatible (i.e., do not belong to the same JSON value). In this case,
+    the range `[first, last)` is undefined.
+    @throw invalid_iterator.204 if iterators @a first and @a last belong to a
+    primitive type (number, boolean, or string), but @a first does not point
+    to the first element any more. In this case, the range `[first, last)` is
+    undefined. See example code below.
+    @throw invalid_iterator.206 if iterators @a first and @a last belong to a
+    null value. In this case, the range `[first, last)` is undefined.
+
+    @complexity Linear in distance between @a first and @a last.
+
+    @exceptionsafety Strong guarantee: if an exception is thrown, there are no
+    changes to any JSON value.
+
+    @liveexample{The example below shows several ways to create JSON values by
+    specifying a subrange with iterators.,basic_json__InputIt_InputIt}
+
+    @since version 1.0.0
+    */
+    template < class InputIT, typename std::enable_if <
+                   std::is_same<InputIT, typename basic_json_t::iterator>::value ||
+                   std::is_same<InputIT, typename basic_json_t::const_iterator>::value, int >::type = 0 >
+    basic_json(InputIT first, InputIT last)
+    {
+        // precondition: both iterators are bound to a JSON value
+        JSON_ASSERT(first.m_object != nullptr);
+        JSON_ASSERT(last.m_object != nullptr);
+
+        // both ends of the range must belong to the same JSON value
+        if (JSON_HEDLEY_UNLIKELY(first.m_object != last.m_object))
+        {
+            JSON_THROW(invalid_iterator::create(201, "iterators are not compatible"));
+        }
+
+        // the value the range was taken from; the new value adopts its type
+        const basic_json& source = *first.m_object;
+        m_type = source.m_type;
+
+        // For these primitive types the range must span the whole value,
+        // i.e. run from begin() to end().
+        const bool needs_full_range = m_type == value_t::boolean
+                                      || m_type == value_t::number_float
+                                      || m_type == value_t::number_integer
+                                      || m_type == value_t::number_unsigned
+                                      || m_type == value_t::string;
+        if (needs_full_range)
+        {
+            if (JSON_HEDLEY_UNLIKELY(!(first.m_it.primitive_iterator.is_begin()
+                                       && last.m_it.primitive_iterator.is_end())))
+            {
+                JSON_THROW(invalid_iterator::create(204, "iterators out of range"));
+            }
+        }
+
+        // copy the payload that matches the adopted type
+        switch (m_type)
+        {
+            case value_t::object:
+            {
+                // build a new object from the sub-range of the source object
+                m_value.object = create<object_t>(first.m_it.object_iterator,
+                                                  last.m_it.object_iterator);
+                break;
+            }
+
+            case value_t::array:
+            {
+                // build a new array from the sub-range of the source array
+                m_value.array = create<array_t>(first.m_it.array_iterator,
+                                                last.m_it.array_iterator);
+                break;
+            }
+
+            case value_t::string:
+            {
+                m_value = *source.m_value.string;
+                break;
+            }
+
+            case value_t::binary:
+            {
+                m_value = *source.m_value.binary;
+                break;
+            }
+
+            case value_t::boolean:
+            {
+                m_value.boolean = source.m_value.boolean;
+                break;
+            }
+
+            case value_t::number_integer:
+            {
+                m_value.number_integer = source.m_value.number_integer;
+                break;
+            }
+
+            case value_t::number_unsigned:
+            {
+                m_value.number_unsigned = source.m_value.number_unsigned;
+                break;
+            }
+
+            case value_t::number_float:
+            {
+                m_value.number_float = source.m_value.number_float;
+                break;
+            }
+
+            default:
+                // any remaining type cannot be constructed from a range
+                JSON_THROW(invalid_iterator::create(206, "cannot construct with iterators from " +
+                                                    std::string(source.type_name())));
+        }
+
+        assert_invariant();
+    }
+
+
+    ///////////////////////////////////////
+    // other constructors and destructor //
+    ///////////////////////////////////////
+
+    /// build a value from a json_ref wrapping this basic_json type, delegating to moved_or_copied()
+    template<class JsonRef, detail::enable_if_t<detail::conjunction<detail::is_json_ref<JsonRef>,
+             std::is_same<typename JsonRef::value_type, basic_json>>::value, int> = 0 >
+    basic_json(const JsonRef& ref) : basic_json(ref.moved_or_copied()) {}
+
+    /*!
+    @brief copy constructor
+
+    Creates a copy of a given JSON value.
+
+    @param[in] other  the JSON value to copy
+
+    @post `*this == other`
+
+    @complexity Linear in the size of @a other.
+
+    @exceptionsafety Strong guarantee: if an exception is thrown, there are no
+    changes to any JSON value.
+
+    @requirement This function helps `basic_json` satisfying the
+    [Container](https://en.cppreference.com/w/cpp/named_req/Container)
+    requirements:
+    - The complexity is linear.
+    - As postcondition, it holds: `other == basic_json(other)`.
+
+    @liveexample{The following code shows an example for the copy
+    constructor.,basic_json__basic_json}
+
+    @since version 1.0.0
+    */
+    basic_json(const basic_json& other)
+        : m_type(other.m_type)
+    {
+        // the source has to satisfy the class invariant before it is read
+        other.assert_invariant();
+
+        // m_type was initialized from other.m_type above, so switching on it
+        // selects the union member that is active in `other`; every branch
+        // builds a json_value from that member and assigns it to m_value
+        switch (m_type)
+        {
+            // --- members held directly in the union: handed over by value ---
+
+            // true/false flag
+            case value_t::boolean:
+                m_value = other.m_value.boolean;
+                break;
+
+            // signed integer number
+            case value_t::number_integer:
+                m_value = other.m_value.number_integer;
+                break;
+
+            // unsigned integer number
+            case value_t::number_unsigned:
+                m_value = other.m_value.number_unsigned;
+                break;
+
+            // floating-point number
+            case value_t::number_float:
+                m_value = other.m_value.number_float;
+                break;
+
+            // --- members held as pointers: the pointee is handed over ---
+
+            // string_t behind m_value.string
+            case value_t::string:
+                m_value = *other.m_value.string;
+                break;
+
+            // array_t behind m_value.array
+            case value_t::array:
+                m_value = *other.m_value.array;
+                break;
+
+            // object_t behind m_value.object
+            case value_t::object:
+                m_value = *other.m_value.object;
+                break;
+
+            // binary payload behind m_value.binary
+            case value_t::binary:
+                m_value = *other.m_value.binary;
+                break;
+
+            // remaining types: nothing is assigned to m_value
+            default:
+                break;
+        }
+
+        assert_invariant();
+    }
+
+    /*!
+    @brief move constructor
+
+    Move constructor. Constructs a JSON value with the contents of the given
+    value @a other using move semantics. It "steals" the resources from @a
+    other and leaves it as JSON null value.
+
+    @param[in,out] other  value to move to this object
+
+    @post `*this` has the same value as @a other before the call.
+    @post @a other is a JSON null value.
+
+    @complexity Constant.
+
+    @exceptionsafety No-throw guarantee: this constructor never throws
+    exceptions.
+
+    @requirement This function helps `basic_json` satisfying the
+    [MoveConstructible](https://en.cppreference.com/w/cpp/named_req/MoveConstructible)
+    requirements.
+
+    @liveexample{The code below shows the move constructor explicitly called
+    via std::move.,basic_json__moveconstructor}
+
+    @since version 1.0.0
+    */
+    basic_json(basic_json&& other) noexcept
+        : m_type(std::move(other.m_type)),
+          m_value(std::move(other.m_value))
+    {
+        // type and payload were taken over above; validate the source while it still carries them
+        other.assert_invariant();
+
+        // reset the source to a null value so that only *this keeps the payload taken over above
+        other.m_type = value_t::null;
+        other.m_value = {};
+
+        assert_invariant();
+    }
+
+    /*!
+    @brief copy assignment
+
+    Copy assignment operator. Copies a JSON value via the "copy and swap"
+    strategy: It is expressed in terms of the copy constructor, destructor,
+    and the `swap()` member function.
+
+    @param[in] other  value to copy from
+
+    @complexity Linear.
+
+    @requirement This function helps `basic_json` satisfying the
+    [Container](https://en.cppreference.com/w/cpp/named_req/Container)
+    requirements:
+    - The complexity is linear.
+
+    @liveexample{The code below shows an example for the copy assignment. It
+    creates a copy of value `a` which is then swapped with `b`. Finally\, the
+    copy of `a` (which is the null value after the swap) is
+    destroyed.,basic_json__copyassignment}
+
+    @since version 1.0.0
+    */
+    basic_json& operator=(basic_json other) noexcept (
+        std::is_nothrow_move_constructible<value_t>::value &&
+        std::is_nothrow_move_assignable<value_t>::value &&
+        std::is_nothrow_move_constructible<json_value>::value &&
+        std::is_nothrow_move_assignable<json_value>::value
+    )
+    {
+        // `other` is the by-value parameter, i.e. already a separate value; validate it first
+        other.assert_invariant();
+
+        // trade contents: *this takes the new value, `other` keeps the previous one
+        using std::swap;
+        swap(m_value, other.m_value);
+        swap(m_type, other.m_type);
+        assert_invariant();
+        return *this;
+    }
+
+    /*!
+    @brief destructor
+
+    Destroys the JSON value and frees all allocated memory.
+
+    @complexity Linear.
+
+    @requirement This function helps `basic_json` satisfying the
+    [Container](https://en.cppreference.com/w/cpp/named_req/Container)
+    requirements:
+    - The complexity is linear.
+    - All stored elements are destroyed and all memory is freed.
+
+    @since version 1.0.0
+    */
+    ~basic_json() noexcept
+    {
+        assert_invariant();       // validate the value one last time while it is still intact
+        m_value.destroy(m_type);  // the active type is passed along so json_value can clean up its payload
+    }
+
+    /// @}
+
+  public:
+    ///////////////////////
+    // object inspection //
+    ///////////////////////
+
+    /// @name object inspection
+    /// Functions to inspect the type of a JSON value.
+    /// @{
+
+    /*!
+    @brief serialization
+
+    Serialization function for JSON values. The function tries to mimic
+    Python's `json.dumps()` function, and currently supports its @a indent
+    and @a ensure_ascii parameters.
+
+    @param[in] indent If indent is nonnegative, then array elements and object
+    members will be pretty-printed with that indent level. An indent level of
+    `0` will only insert newlines. `-1` (the default) selects the most compact
+    representation.
+    @param[in] indent_char The character to use for indentation if @a indent is
+    greater than `0`. The default is ` ` (space).
+    @param[in] ensure_ascii If @a ensure_ascii is true, all non-ASCII characters
+    in the output are escaped with `\uXXXX` sequences, and the result consists
+    of ASCII characters only.
+    @param[in] error_handler  how to react on decoding errors; there are three
+    possible values: `strict` (throws an exception in case a decoding error
+    occurs; default), `replace` (replace invalid UTF-8 sequences with U+FFFD),
+    and `ignore` (ignore invalid UTF-8 sequences during serialization; all
+    bytes are copied to the output unchanged).
+
+    @return string containing the serialization of the JSON value
+
+    @throw type_error.316 if a string stored inside the JSON value is not
+                          UTF-8 encoded and @a error_handler is set to strict
+
+    @note Binary values are serialized as object containing two keys:
+      - "bytes": an array of bytes as integers
+      - "subtype": the subtype as integer or "null" if the binary has no subtype
+
+    @complexity Linear.
+
+    @exceptionsafety Strong guarantee: if an exception is thrown, there are no
+    changes in the JSON value.
+
+    @liveexample{The following example shows the effect of different @a indent\,
+    @a indent_char\, and @a ensure_ascii parameters to the result of the
+    serialization.,dump}
+
+    @see https://docs.python.org/2/library/json.html#json.dump
+
+    @since version 1.0.0; indentation character @a indent_char, option
+           @a ensure_ascii and exceptions added in version 3.0.0; error
+           handlers added in version 3.4.0; serialization of binary values added
+           in version 3.8.0.
+    */
+    string_t dump(const int indent = -1,
+                  const char indent_char = ' ',
+                  const bool ensure_ascii = false,
+                  const error_handler_t error_handler = error_handler_t::strict) const
+    {
+        // a negative indent requests the compact form; zero or more turns on
+        // pretty-printing with `indent` as the step passed to the serializer
+        const bool pretty_print = indent >= 0;
+        const unsigned int indent_step = pretty_print ? static_cast<unsigned int>(indent) : 0;
+
+        // the serializer appends to `output` through an output adapter
+        string_t output;
+        serializer writer(detail::output_adapter<char, string_t>(output), indent_char, error_handler);
+
+        // one call covers both modes: compact output gets (false, step 0)
+        writer.dump(*this, pretty_print, ensure_ascii, indent_step);
+
+        return output;
+    }
+
+    /*!
+    @brief return the type of the JSON value (explicit)
+
+    Return the type of the JSON value as a value from the @ref value_t
+    enumeration.
+
+    @return the type of the JSON value
+            Value type                | return value
+            ------------------------- | -------------------------
+            null                      | value_t::null
+            boolean                   | value_t::boolean
+            string                    | value_t::string
+            number (integer)          | value_t::number_integer
+            number (unsigned integer) | value_t::number_unsigned
+            number (floating-point)   | value_t::number_float
+            object                    | value_t::object
+            array                     | value_t::array
+            binary                    | value_t::binary
+            discarded                 | value_t::discarded
+
+    @complexity Constant.
+
+    @exceptionsafety No-throw guarantee: this member function never throws
+    exceptions.
+
+    @liveexample{The following code exemplifies `type()` for all JSON
+    types.,type}
+
+    @sa @ref operator value_t() -- return the type of the JSON value (implicit)
+    @sa @ref type_name() -- return the type as string
+
+    @since version 1.0.0
+    */
+    constexpr value_t type() const noexcept
+    {
+        return m_type; // the stored type tag, handed back unchanged
+    }
+
+    /*!
+    @brief return whether type is primitive
+
+    This function returns true if and only if the JSON type is primitive
+    (string, number, boolean, binary, or null).
+
+    @return `true` if type is primitive (string, number, boolean, binary, or
+    null), `false` otherwise.
+
+    @complexity Constant.
+
+    @exceptionsafety No-throw guarantee: this member function never throws
+    exceptions.
+
+    @liveexample{The following code exemplifies `is_primitive()` for all JSON
+    types.,is_primitive}
+
+    @sa @ref is_structured() -- returns whether JSON value is structured
+    @sa @ref is_null() -- returns whether JSON value is `null`
+    @sa @ref is_string() -- returns whether JSON value is a string
+    @sa @ref is_boolean() -- returns whether JSON value is a boolean
+    @sa @ref is_number() -- returns whether JSON value is a number
+    @sa @ref is_binary() -- returns whether JSON value is a binary array
+
+    @since version 1.0.0
+    */
+    constexpr bool is_primitive() const noexcept
+    {
+        return is_null() || is_boolean() || is_number() || is_string() || is_binary(); // binary counts as primitive as well
+    }
+
+    /*!
+    @brief return whether type is structured
+
+    This function returns true if and only if the JSON type is structured
+    (array or object).
+
+    @return `true` if type is structured (array or object), `false` otherwise.
+
+    @complexity Constant.
+
+    @exceptionsafety No-throw guarantee: this member function never throws
+    exceptions.
+
+    @liveexample{The following code exemplifies `is_structured()` for all JSON
+    types.,is_structured}
+
+    @sa @ref is_primitive() -- returns whether value is primitive
+    @sa @ref is_array() -- returns whether value is an array
+    @sa @ref is_object() -- returns whether value is an object
+
+    @since version 1.0.0
+    */
+    constexpr bool is_structured() const noexcept
+    {
+        return is_object() || is_array(); // the two container types
+    }
+
+    /*!
+    @brief return whether value is null
+
+    This function returns true if and only if the JSON value is null.
+
+    @return `true` if type is null, `false` otherwise.
+
+    @complexity Constant.
+
+    @exceptionsafety No-throw guarantee: this member function never throws
+    exceptions.
+
+    @liveexample{The following code exemplifies `is_null()` for all JSON
+    types.,is_null}
+
+    @since version 1.0.0
+    */
+    constexpr bool is_null() const noexcept
+    {
+        return value_t::null == m_type; // only the stored type tag is inspected
+    }
+
+    /*!
+    @brief return whether value is a boolean
+
+    This function returns true if and only if the JSON value is a boolean.
+
+    @return `true` if type is boolean, `false` otherwise.
+
+    @complexity Constant.
+
+    @exceptionsafety No-throw guarantee: this member function never throws
+    exceptions.
+
+    @liveexample{The following code exemplifies `is_boolean()` for all JSON
+    types.,is_boolean}
+
+    @since version 1.0.0
+    */
+    constexpr bool is_boolean() const noexcept
+    {
+        return value_t::boolean == m_type; // only the stored type tag is inspected
+    }
+
+    /*!
+    @brief return whether value is a number
+
+    This function returns true if and only if the JSON value is a number. This
+    includes both integer (signed and unsigned) and floating-point values.
+
+    @return `true` if type is number (regardless of whether it is an integer,
+    unsigned integer, or floating-point number), `false` otherwise.
+
+    @complexity Constant.
+
+    @exceptionsafety No-throw guarantee: this member function never throws
+    exceptions.
+
+    @liveexample{The following code exemplifies `is_number()` for all JSON
+    types.,is_number}
+
+    @sa @ref is_number_integer() -- check if value is an integer or unsigned
+    integer number
+    @sa @ref is_number_unsigned() -- check if value is an unsigned integer
+    number
+    @sa @ref is_number_float() -- check if value is a floating-point number
+
+    @since version 1.0.0
+    */
+    constexpr bool is_number() const noexcept
+    {
+        return is_number_float() || is_number_integer(); // is_number_integer() covers signed and unsigned
+    }
+
+    /*!
+    @brief return whether value is an integer number
+
+    This function returns true if and only if the JSON value is a signed or
+    unsigned integer number. This excludes floating-point values.
+
+    @return `true` if type is an integer or unsigned integer number, `false`
+    otherwise.
+
+    @complexity Constant.
+
+    @exceptionsafety No-throw guarantee: this member function never throws
+    exceptions.
+
+    @liveexample{The following code exemplifies `is_number_integer()` for all
+    JSON types.,is_number_integer}
+
+    @sa @ref is_number() -- check if value is a number
+    @sa @ref is_number_unsigned() -- check if value is an unsigned integer
+    number
+    @sa @ref is_number_float() -- check if value is a floating-point number
+
+    @since version 1.0.0
+    */
+    constexpr bool is_number_integer() const noexcept
+    {
+        return is_number_unsigned() || value_t::number_integer == m_type; // both integer kinds qualify
+    }
+
+    /*!
+    @brief return whether value is an unsigned integer number
+
+    This function returns true if and only if the JSON value is an unsigned
+    integer number. This excludes floating-point and signed integer values.
+
+    @return `true` if type is an unsigned integer number, `false` otherwise.
+
+    @complexity Constant.
+
+    @exceptionsafety No-throw guarantee: this member function never throws
+    exceptions.
+
+    @liveexample{The following code exemplifies `is_number_unsigned()` for all
+    JSON types.,is_number_unsigned}
+
+    @sa @ref is_number() -- check if value is a number
+    @sa @ref is_number_integer() -- check if value is an integer or unsigned
+    integer number
+    @sa @ref is_number_float() -- check if value is a floating-point number
+
+    @since version 2.0.0
+    */
+    constexpr bool is_number_unsigned() const noexcept
+    {
+        return value_t::number_unsigned == m_type; // signed integers and floats do not qualify
+    }
+
+    /*!
+    @brief return whether value is a floating-point number
+
+    This function returns true if and only if the JSON value is a
+    floating-point number. This excludes signed and unsigned integer values.
+
+    @return `true` if type is a floating-point number, `false` otherwise.
+
+    @complexity Constant.
+
+    @exceptionsafety No-throw guarantee: this member function never throws
+    exceptions.
+
+    @liveexample{The following code exemplifies `is_number_float()` for all
+    JSON types.,is_number_float}
+
+    @sa @ref is_number() -- check if value is number
+    @sa @ref is_number_integer() -- check if value is an integer number
+    @sa @ref is_number_unsigned() -- check if value is an unsigned integer
+    number
+
+    @since version 1.0.0
+    */
+    constexpr bool is_number_float() const noexcept
+    {
+        return value_t::number_float == m_type; // integer kinds do not qualify
+    }
+
+    /*!
+    @brief return whether value is an object
+
+    This function returns true if and only if the JSON value is an object.
+
+    @return `true` if type is object, `false` otherwise.
+
+    @complexity Constant.
+
+    @exceptionsafety No-throw guarantee: this member function never throws
+    exceptions.
+
+    @liveexample{The following code exemplifies `is_object()` for all JSON
+    types.,is_object}
+
+    @since version 1.0.0
+    */
+    constexpr bool is_object() const noexcept
+    {
+        return value_t::object == m_type; // only the stored type tag is inspected
+    }
+
+    /*!
+    @brief return whether value is an array
+
+    This function returns true if and only if the JSON value is an array.
+
+    @return `true` if type is array, `false` otherwise.
+
+    @complexity Constant.
+
+    @exceptionsafety No-throw guarantee: this member function never throws
+    exceptions.
+
+    @liveexample{The following code exemplifies `is_array()` for all JSON
+    types.,is_array}
+
+    @since version 1.0.0
+    */
+    constexpr bool is_array() const noexcept
+    {
+        return value_t::array == m_type; // only the stored type tag is inspected
+    }
+
+    /*!
+    @brief return whether value is a string
+
+    This function returns true if and only if the JSON value is a string.
+
+    @return `true` if type is string, `false` otherwise.
+
+    @complexity Constant.
+
+    @exceptionsafety No-throw guarantee: this member function never throws
+    exceptions.
+
+    @liveexample{The following code exemplifies `is_string()` for all JSON
+    types.,is_string}
+
+    @since version 1.0.0
+    */
+    constexpr bool is_string() const noexcept
+    {
+        return value_t::string == m_type; // only the stored type tag is inspected
+    }
+
+    /*!
+    @brief return whether value is a binary array
+
+    This function returns true if and only if the JSON value is a binary array.
+
+    @return `true` if type is binary array, `false` otherwise.
+
+    @complexity Constant.
+
+    @exceptionsafety No-throw guarantee: this member function never throws
+    exceptions.
+
+    @liveexample{The following code exemplifies `is_binary()` for all JSON
+    types.,is_binary}
+
+    @since version 3.8.0
+    */
+    constexpr bool is_binary() const noexcept
+    {
+        return value_t::binary == m_type; // only the stored type tag is inspected
+    }
+
+    /*!
+    @brief return whether value is discarded
+
+    This function returns true if and only if the JSON value was discarded
+    during parsing with a callback function (see @ref parser_callback_t).
+
+    @note This function will always be `false` for JSON values after parsing.
+    That is, discarded values can only occur during parsing, but will be
+    removed when inside a structured value or replaced by null in other cases.
+
+    @return `true` if type is discarded, `false` otherwise.
+
+    @complexity Constant.
+
+    @exceptionsafety No-throw guarantee: this member function never throws
+    exceptions.
+
+    @liveexample{The following code exemplifies `is_discarded()` for all JSON
+    types.,is_discarded}
+
+    @since version 1.0.0
+    */
+    constexpr bool is_discarded() const noexcept
+    {
+        return value_t::discarded == m_type; // only the stored type tag is inspected
+    }
+
+    /*!
+    @brief return the type of the JSON value (implicit)
+
+    Implicitly return the type of the JSON value as a value from the @ref
+    value_t enumeration.
+
+    @return the type of the JSON value
+
+    @complexity Constant.
+
+    @exceptionsafety No-throw guarantee: this member function never throws
+    exceptions.
+
+    @liveexample{The following code exemplifies the @ref value_t operator for
+    all JSON types.,operator__value_t}
+
+    @sa @ref type() -- return the type of the JSON value (explicit)
+    @sa @ref type_name() -- return the type as string
+
+    @since version 1.0.0
+    */
+    constexpr operator value_t() const noexcept
+    {
+        return m_type; // same result as type(), reachable through implicit conversion
+    }
+
+    /// @}
+
+  private:
+    //////////////////
+    // value access //
+    //////////////////
+
+    /// return the stored boolean; any other type is reported as type_error 302
+    boolean_t get_impl(boolean_t* /*unused*/) const
+    {
+        // the pointer argument only selects this overload and is never read
+        if (JSON_HEDLEY_UNLIKELY(!is_boolean()))
+        {
+            JSON_THROW(type_error::create(302, "type must be boolean, but is " + std::string(type_name())));
+        }
+        return m_value.boolean;
+    }
+
+    /// mutable access: pointer to the stored object, or nullptr unless the value is an object
+    object_t* get_impl_ptr(object_t* /*unused*/) noexcept
+    {
+        return !is_object() ? nullptr : m_value.object;
+    }
+
+    /// read-only access: pointer to the stored object, or nullptr unless the value is an object
+    constexpr const object_t* get_impl_ptr(const object_t* /*unused*/) const noexcept
+    {
+        return !is_object() ? nullptr : m_value.object;
+    }
+
+    /// mutable access: pointer to the stored array, or nullptr unless the value is an array
+    array_t* get_impl_ptr(array_t* /*unused*/) noexcept
+    {
+        return !is_array() ? nullptr : m_value.array;
+    }
+
+    /// read-only access: pointer to the stored array, or nullptr unless the value is an array
+    constexpr const array_t* get_impl_ptr(const array_t* /*unused*/) const noexcept
+    {
+        return !is_array() ? nullptr : m_value.array;
+    }
+
+    /// mutable access: pointer to the stored string, or nullptr unless the value is a string
+    string_t* get_impl_ptr(string_t* /*unused*/) noexcept
+    {
+        return !is_string() ? nullptr : m_value.string;
+    }
+
+    /// read-only access: pointer to the stored string, or nullptr unless the value is a string
+    constexpr const string_t* get_impl_ptr(const string_t* /*unused*/) const noexcept
+    {
+        return !is_string() ? nullptr : m_value.string;
+    }
+
+    /// mutable access: address of the stored boolean, or nullptr unless the value is a boolean
+    boolean_t* get_impl_ptr(boolean_t* /*unused*/) noexcept
+    {
+        return !is_boolean() ? nullptr : &m_value.boolean;
+    }
+
+    /// read-only access: address of the stored boolean, or nullptr unless the value is a boolean
+    constexpr const boolean_t* get_impl_ptr(const boolean_t* /*unused*/) const noexcept
+    {
+        return !is_boolean() ? nullptr : &m_value.boolean;
+    }
+
+    /// mutable access: address of the number_integer member, or nullptr; is_number_integer() also admits unsigned values
+    number_integer_t* get_impl_ptr(number_integer_t* /*unused*/) noexcept
+    {
+        return !is_number_integer() ? nullptr : &m_value.number_integer;
+    }
+
+    /// read-only access: address of the number_integer member, or nullptr; is_number_integer() also admits unsigned values
+    constexpr const number_integer_t* get_impl_ptr(const number_integer_t* /*unused*/) const noexcept
+    {
+        return !is_number_integer() ? nullptr : &m_value.number_integer;
+    }
+
+    /// mutable access: address of the stored unsigned number, or nullptr unless the value is unsigned
+    number_unsigned_t* get_impl_ptr(number_unsigned_t* /*unused*/) noexcept
+    {
+        return !is_number_unsigned() ? nullptr : &m_value.number_unsigned;
+    }
+
+    /// read-only access: address of the stored unsigned number, or nullptr unless the value is unsigned
+    constexpr const number_unsigned_t* get_impl_ptr(const number_unsigned_t* /*unused*/) const noexcept
+    {
+        return !is_number_unsigned() ? nullptr : &m_value.number_unsigned;
+    }
+
+    /// mutable access: address of the stored floating-point number, or nullptr unless the value is a float
+    number_float_t* get_impl_ptr(number_float_t* /*unused*/) noexcept
+    {
+        return !is_number_float() ? nullptr : &m_value.number_float;
+    }
+
+    /// read-only access: address of the stored floating-point number, or nullptr unless the value is a float
+    constexpr const number_float_t* get_impl_ptr(const number_float_t* /*unused*/) const noexcept
+    {
+        return !is_number_float() ? nullptr : &m_value.number_float;
+    }
+
+    /// mutable access: pointer to the stored binary value, or nullptr unless the value is binary
+    binary_t* get_impl_ptr(binary_t* /*unused*/) noexcept
+    {
+        return !is_binary() ? nullptr : m_value.binary;
+    }
+
+    /// read-only access: pointer to the stored binary value, or nullptr unless the value is binary
+    constexpr const binary_t* get_impl_ptr(const binary_t* /*unused*/) const noexcept
+    {
+        return !is_binary() ? nullptr : m_value.binary;
+    }
+
+    /*!
+    @brief helper function to implement get_ref()
+
+    This function helps to implement get_ref() without code duplication for
+    const and non-const overloads
+
+    @tparam ThisType will be deduced as `basic_json` or `const basic_json`
+
+    @throw type_error.303 if ReferenceType does not match underlying value
+    type of the current JSON
+    */
+    template<typename ReferenceType, typename ThisType>
+    static ReferenceType get_ref_impl(ThisType& obj)
+    {
+        // ask get_ptr<>() for a pointer to the referenced type; a null result is reported as type_error 303
+        using pointer_type = typename std::add_pointer<ReferenceType>::type;
+        auto target = obj.template get_ptr<pointer_type>();
+        if (JSON_HEDLEY_UNLIKELY(target == nullptr))
+        {
+            JSON_THROW(type_error::create(303, "incompatible ReferenceType for get_ref, actual type is " + std::string(obj.type_name())));
+        }
+
+        return *target;
+    }
+
+  public:
+    /// @name value access
+    /// Direct access to the stored value of a JSON value.
+    /// @{
+
+    /*!
+    @brief get special-case overload
+
+    This overload avoids a lot of template boilerplate; it can be seen as the
+    identity method.
+
+    @tparam BasicJsonType == @ref basic_json
+
+    @return a copy of *this
+
+    @complexity Linear in the size of the JSON value, as a copy is returned.
+
+    @since version 2.1.0
+    */
+    template<typename BasicJsonType, detail::enable_if_t<
+                 std::is_same<typename std::remove_const<BasicJsonType>::type, basic_json_t>::value,
+                 int> = 0>
+    basic_json get() const
+    {
+        return *this; // returned by value, so the caller receives a copy of *this
+    }
+
+    /*!
+    @brief get special-case overload
+
+    This overload converts the current @ref basic_json into a different
+    @ref basic_json type.
+
+    @tparam BasicJsonType a @ref basic_json type other than this one
+
+    @return a copy of *this, converted into @a BasicJsonType
+
+    @complexity Depending on the implementation of the called `from_json()`
+                method.
+
+    @since version 3.2.0
+    */
+    template < typename BasicJsonType, detail::enable_if_t <
+                   !std::is_same<BasicJsonType, basic_json>::value&&
+                   detail::is_basic_json<BasicJsonType>::value, int > = 0 >
+    BasicJsonType get() const
+    {
+        return *this; // implicit conversion: BasicJsonType is constructed from this value
+    }
+
+    /*!
+    @brief get a value (explicit)
+
+    Explicit type conversion between the JSON value and a compatible value
+    which is [CopyConstructible](https://en.cppreference.com/w/cpp/named_req/CopyConstructible)
+    and [DefaultConstructible](https://en.cppreference.com/w/cpp/named_req/DefaultConstructible).
+    The value is converted by calling the @ref json_serializer<ValueType>
+    `from_json()` method.
+
+    The function is equivalent to executing
+    @code {.cpp}
+    ValueType ret;
+    JSONSerializer<ValueType>::from_json(*this, ret);
+    return ret;
+    @endcode
+
+    This overload is chosen if:
+    - @a ValueType is not @ref basic_json,
+    - @ref json_serializer<ValueType> has a `from_json()` method of the form
+      `void from_json(const basic_json&, ValueType&)`, and
+    - @ref json_serializer<ValueType> does not have a `from_json()` method of
+      the form `ValueType from_json(const basic_json&)`
+
+    @tparam ValueTypeCV the provided value type
+    @tparam ValueType the returned value type
+
+    @return copy of the JSON value, converted to @a ValueType
+
+    @throw what @ref json_serializer<ValueType> `from_json()` method throws
+
+    @liveexample{The example below shows several conversions from JSON values
+    to other types. There are a few things to note: (1) Floating-point numbers can
+    be converted to integers\, (2) A JSON array can be converted to a standard
+    `std::vector<short>`\, (3) A JSON object can be converted to C++
+    associative containers such as `std::unordered_map<std::string\,
+    json>`.,get__ValueType_const}
+
+    @since version 2.1.0
+    */
+    template < typename ValueTypeCV, typename ValueType = detail::uncvref_t<ValueTypeCV>,
+               detail::enable_if_t <
+                   !detail::is_basic_json<ValueType>::value &&
+                   detail::has_from_json<basic_json_t, ValueType>::value &&
+                   !detail::has_non_default_from_json<basic_json_t, ValueType>::value,
+                   int > = 0 >
+    ValueType get() const noexcept(noexcept(
+                                       JSONSerializer<ValueType>::from_json(std::declval<const basic_json_t&>(), std::declval<ValueType&>())))
+    {
+        // the result is default-constructed below and then filled in by from_json()
+        static_assert(std::is_default_constructible<ValueType>::value,
+                      "types must be DefaultConstructible when used with get()");
+        // only references are rejected: a const ValueTypeCV has to stay legal because
+        // get<const basic_json_t>() is supported, hence the uncvref'd ValueType
+        static_assert(!std::is_reference<ValueTypeCV>::value,
+                      "get() cannot be used with reference types, you might want to use get_ref()");
+
+        ValueType converted;
+        JSONSerializer<ValueType>::from_json(*this, converted);
+        return converted;
+    }
+
+    /*!
+    @brief get a value (explicit); special case
+
+    Explicit type conversion between the JSON value and a compatible value
+    which is **not** [CopyConstructible](https://en.cppreference.com/w/cpp/named_req/CopyConstructible)
+    and **not** [DefaultConstructible](https://en.cppreference.com/w/cpp/named_req/DefaultConstructible).
+    The value is converted by calling the @ref json_serializer<ValueType>
+    `from_json()` method.
+
+    The function is equivalent to executing
+    @code {.cpp}
+    return JSONSerializer<ValueType>::from_json(*this);
+    @endcode
+
+    This overload is chosen if:
+    - @a ValueType is not @ref basic_json and
+    - @ref json_serializer<ValueType> has a `from_json()` method of the form
+      `ValueType from_json(const basic_json&)`
+
+    @note If @ref json_serializer<ValueType> has both overloads of
+    `from_json()`, this one is chosen.
+
+    @tparam ValueTypeCV the provided value type
+    @tparam ValueType the returned value type
+
+    @return copy of the JSON value, converted to @a ValueType
+
+    @throw what @ref json_serializer<ValueType> `from_json()` method throws
+
+    @since version 2.1.0
+    */
+    template < typename ValueTypeCV, typename ValueType = detail::uncvref_t<ValueTypeCV>,
+               detail::enable_if_t < !std::is_same<basic_json_t, ValueType>::value &&
+                                     detail::has_non_default_from_json<basic_json_t, ValueType>::value,
+                                     int > = 0 >
+    ValueType get() const noexcept(noexcept(
+                                       JSONSerializer<ValueType>::from_json(std::declval<const basic_json_t&>())))
+    {
+        static_assert(!std::is_reference<ValueTypeCV>::value,
+                      "get() cannot be used with reference types, you might want to use get_ref()");
+        return JSONSerializer<ValueType>::from_json(*this); // the serializer builds and returns the value itself
+    }
+
+    /*!
+    @brief get a value (explicit)
+
+    Explicit type conversion between the JSON value and a compatible value.
+    The value is filled into the input parameter by calling the @ref json_serializer<ValueType>
+    `from_json()` method.
+
+    The function is equivalent to executing
+    @code {.cpp}
+    ValueType v;
+    JSONSerializer<ValueType>::from_json(*this, v);
+    @endcode
+
+    This overload is chosen if:
+    - @a ValueType is not @ref basic_json and
+    - @ref json_serializer<ValueType> has a `from_json()` method of the form
+      `void from_json(const basic_json&, ValueType&)`
+
+    @tparam ValueType the input parameter type.
+
+    @return the input parameter, allowing chaining calls.
+
+    @throw what @ref json_serializer<ValueType> `from_json()` method throws
+
+    @liveexample{The example below shows several conversions from JSON values
+    to other types. There are a few things to note: (1) Floating-point numbers can
+    be converted to integers\, (2) A JSON array can be converted to a standard
+    `std::vector<short>`\, (3) A JSON object can be converted to C++
+    associative containers such as `std::unordered_map<std::string\,
+    json>`.,get_to}
+
+    @since version 3.3.0
+    */
+    // Enabled only for non-basic_json types for which detail::has_from_json
+    // holds for the (basic_json_t, ValueType) pair.
+    template < typename ValueType,
+               detail::enable_if_t <
+                   !detail::is_basic_json<ValueType>::value&&
+                   detail::has_from_json<basic_json_t, ValueType>::value,
+                   int > = 0 >
+    // noexcept exactly when the serializer's from_json(json, v) call is noexcept
+    ValueType & get_to(ValueType& v) const noexcept(noexcept(
+                JSONSerializer<ValueType>::from_json(std::declval<const basic_json_t&>(), v)))
+    {
+        // let the serializer write the converted value into the caller's object
+        JSONSerializer<ValueType>::from_json(*this, v);
+        // hand the same object back so that calls can be chained
+        return v;
+    }
+
+    // overload (selected when ValueType is itself a basic_json type) that allows
+    // get_to to be called with a basic_json value
+    // see https://github.com/nlohmann/json/issues/2175
+    template<typename ValueType,
+             detail::enable_if_t <
+                 detail::is_basic_json<ValueType>::value,
+                 int> = 0>
+    ValueType & get_to(ValueType& v) const
+    {
+        // no serializer is involved: this value is assigned to v directly
+        v = *this;
+        // return the assigned-to object so that calls can be chained
+        return v;
+    }
+
+    // Overload for built-in arrays: Array defaults to a reference to T[N], and
+    // the overload is enabled only if detail::has_from_json holds for that type.
+    template <
+        typename T, std::size_t N,
+        typename Array = T (&)[N],
+        detail::enable_if_t <
+            detail::has_from_json<basic_json_t, Array>::value, int > = 0 >
+    Array get_to(T (&v)[N]) const
+    // noexcept exactly when the serializer's from_json(json, v) call is noexcept
+    noexcept(noexcept(JSONSerializer<Array>::from_json(
+                          std::declval<const basic_json_t&>(), v)))
+    {
+        // the serializer fills the caller's array in place
+        JSONSerializer<Array>::from_json(*this, v);
+        // the array is returned by reference (Array is T (&)[N] by default)
+        return v;
+    }
+
+
+    /*!
+    @brief get a pointer value (implicit)
+
+    Implicit pointer access to the internally stored JSON value. No copies are
+    made.
+
+    @warning Writing data to the pointee of the result yields an undefined
+    state.
+
+    @tparam PointerType pointer type; must be a pointer to @ref array_t, @ref
+    object_t, @ref string_t, @ref boolean_t, @ref number_integer_t,
+    @ref number_unsigned_t, or @ref number_float_t. Enforced by a static
+    assertion.
+
+    @return pointer to the internally stored JSON value if the requested
+    pointer type @a PointerType fits to the JSON value; `nullptr` otherwise
+
+    @complexity Constant.
+
+    @liveexample{The example below shows how pointers to internal values of a
+    JSON value can be requested. Note that no type conversions are made and a
+    `nullptr` is returned if the value and the requested pointer type do not
+    match.,get_ptr}
+
+    @since version 1.0.0
+    */
+    // Enabled for pointer types only; the return type is whatever the matching
+    // get_impl_ptr() overload returns when called on a non-const basic_json_t.
+    template<typename PointerType, typename std::enable_if<
+                 std::is_pointer<PointerType>::value, int>::type = 0>
+    auto get_ptr() noexcept -> decltype(std::declval<basic_json_t&>().get_impl_ptr(std::declval<PointerType>()))
+    {
+        // delegate the call to get_impl_ptr<>(); a null pointer of the requested
+        // type is passed as the argument (presumably only to select the overload)
+        return get_impl_ptr(static_cast<PointerType>(nullptr));
+    }
+
+    /*!
+    @brief get a pointer value (implicit)
+    @copydoc get_ptr()
+    */
+    // Const counterpart: enabled only for pointers whose pointee type is
+    // const-qualified, so a const basic_json never hands out a mutable pointer.
+    template < typename PointerType, typename std::enable_if <
+                   std::is_pointer<PointerType>::value&&
+                   std::is_const<typename std::remove_pointer<PointerType>::type>::value, int >::type = 0 >
+    constexpr auto get_ptr() const noexcept -> decltype(std::declval<const basic_json_t&>().get_impl_ptr(std::declval<PointerType>()))
+    {
+        // delegate the call to get_impl_ptr<>() const; a null pointer of the
+        // requested type is passed as the argument
+        return get_impl_ptr(static_cast<PointerType>(nullptr));
+    }
+
+    /*!
+    @brief get a pointer value (explicit)
+
+    Explicit pointer access to the internally stored JSON value. No copies are
+    made.
+
+    @warning The pointer becomes invalid if the underlying JSON object
+    changes.
+
+    @tparam PointerType pointer type; must be a pointer to @ref array_t, @ref
+    object_t, @ref string_t, @ref boolean_t, @ref number_integer_t,
+    @ref number_unsigned_t, or @ref number_float_t.
+
+    @return pointer to the internally stored JSON value if the requested
+    pointer type @a PointerType fits to the JSON value; `nullptr` otherwise
+
+    @complexity Constant.
+
+    @liveexample{The example below shows how pointers to internal values of a
+    JSON value can be requested. Note that no type conversions are made and a
+    `nullptr` is returned if the value and the requested pointer type do not
+    match.,get__PointerType}
+
+    @sa @ref get_ptr() for explicit pointer-member access
+
+    @since version 1.0.0
+    */
+    // Pointer flavour of get(): enabled for pointer types only; the return type
+    // is taken from the non-const get_ptr<PointerType>() call.
+    template<typename PointerType, typename std::enable_if<
+                 std::is_pointer<PointerType>::value, int>::type = 0>
+    auto get() noexcept -> decltype(std::declval<basic_json_t&>().template get_ptr<PointerType>())
+    {
+        // delegate the call to get_ptr; get() adds no behavior of its own
+        return get_ptr<PointerType>();
+    }
+
+    /*!
+    @brief get a pointer value (explicit)
+    @copydoc get()
+    */
+    // Const pointer flavour of get(): the enable_if only checks for a pointer
+    // type; the trailing return type additionally requires that the const
+    // get_ptr<PointerType>() call is well-formed.
+    template<typename PointerType, typename std::enable_if<
+                 std::is_pointer<PointerType>::value, int>::type = 0>
+    constexpr auto get() const noexcept -> decltype(std::declval<const basic_json_t&>().template get_ptr<PointerType>())
+    {
+        // delegate the call to get_ptr; get() adds no behavior of its own
+        return get_ptr<PointerType>();
+    }
+
+    /*!
+    @brief get a reference value (implicit)
+
+    Implicit reference access to the internally stored JSON value. No copies
+    are made.
+
+    @warning Writing data to the referee of the result yields an undefined
+    state.
+
+    @tparam ReferenceType reference type; must be a reference to @ref array_t,
+    @ref object_t, @ref string_t, @ref boolean_t, @ref number_integer_t, or
+    @ref number_float_t. Enforced by static assertion.
+
+    @return reference to the internally stored JSON value if the requested
+    reference type @a ReferenceType fits to the JSON value; throws
+    type_error.303 otherwise
+
+    @throw type_error.303 in case passed type @a ReferenceType is incompatible
+    with the stored JSON value; see example below
+
+    @complexity Constant.
+
+    @liveexample{The example shows several calls to `get_ref()`.,get_ref}
+
+    @since version 1.1.0
+    */
+    // Enabled for reference types only; returns exactly the requested type.
+    template<typename ReferenceType, typename std::enable_if<
+                 std::is_reference<ReferenceType>::value, int>::type = 0>
+    ReferenceType get_ref()
+    {
+        // delegate call to get_ref_impl, passing this (non-const) value
+        return get_ref_impl<ReferenceType>(*this);
+    }
+
+    /*!
+    @brief get a reference value (implicit)
+    @copydoc get_ref()
+    */
+    // Const counterpart: enabled only for references to const-qualified types,
+    // so a const basic_json never hands out a mutable reference.
+    template < typename ReferenceType, typename std::enable_if <
+                   std::is_reference<ReferenceType>::value&&
+                   std::is_const<typename std::remove_reference<ReferenceType>::type>::value, int >::type = 0 >
+    ReferenceType get_ref() const
+    {
+        // delegate call to get_ref_impl, passing this (const) value
+        return get_ref_impl<ReferenceType>(*this);
+    }
+
+    /*!
+    @brief get a value (implicit)
+
+    Implicit type conversion between the JSON value and a compatible value.
+    The call is realized by calling @ref get() const.
+
+    @tparam ValueType non-pointer type compatible to the JSON value, for
+    instance `int` for JSON integer numbers, `bool` for JSON booleans, or
+    `std::vector` types for JSON arrays. The character type of @ref string_t
+    as well as an initializer list of this type is excluded to avoid
+    ambiguities as these types implicitly convert to `std::string`.
+
+    @return copy of the JSON value, converted to type @a ValueType
+
+    @throw type_error.302 in case passed type @a ValueType is incompatible
+    to the JSON value type (e.g., the JSON value is of type boolean, but a
+    string is requested); see example below
+
+    @complexity Linear in the size of the JSON value.
+
+    @liveexample{The example below shows several conversions from JSON values
+    to other types. There are a few things to note: (1) Floating-point numbers can
+    be converted to integers\, (2) A JSON array can be converted to a standard
+    `std::vector<short>`\, (3) A JSON object can be converted to C++
+    associative containers such as `std::unordered_map<std::string\,
+    json>`.,operator__ValueType}
+
+    @since version 1.0.0
+    */
+    // Conversion operator template. It is disabled for: pointer types,
+    // detail::json_ref<basic_json>, the character type of string_t, basic_json
+    // types, an initializer_list of that character type and (only under the
+    // preprocessor condition below) std::string_view. In addition, the
+    // detail::get_template_function detection must succeed for ValueType.
+    // NOTE(review): JSON_EXPLICIT is a macro defined elsewhere; presumably it
+    // expands to `explicit` or to nothing depending on configuration -- confirm.
+    template < typename ValueType, typename std::enable_if <
+                   !std::is_pointer<ValueType>::value&&
+                   !std::is_same<ValueType, detail::json_ref<basic_json>>::value&&
+                   !std::is_same<ValueType, typename string_t::value_type>::value&&
+                   !detail::is_basic_json<ValueType>::value
+                   && !std::is_same<ValueType, std::initializer_list<typename string_t::value_type>>::value
+#if defined(JSON_HAS_CPP_17) && (defined(__GNUC__) || (defined(_MSC_VER) && _MSC_VER >= 1910 && _MSC_VER <= 1914))
+                   && !std::is_same<ValueType, typename std::string_view>::value
+#endif
+                   && detail::is_detected<detail::get_template_function, const basic_json_t&, ValueType>::value
+                   , int >::type = 0 >
+    JSON_EXPLICIT operator ValueType() const
+    {
+        // delegate the call to get<>() const; the conversion itself (and any
+        // exception it raises) comes from there
+        return get<ValueType>();
+    }
+
+    /*!
+    @return reference to the binary value
+
+    @throw type_error.302 if the value is not binary
+
+    @sa @ref is_binary() to check if the value is binary
+
+    @since version 3.8.0
+    */
+    binary_t& get_binary()
+    {
+        // success path first: expose the stored binary value by reference
+        if (is_binary())
+        {
+            return *get_ptr<binary_t*>();
+        }
+
+        // every other value type is a type error (302)
+        JSON_THROW(type_error::create(302, "type must be binary, but is " + std::string(type_name())));
+    }
+
+    /// @copydoc get_binary()
+    const binary_t& get_binary() const
+    {
+        // success path first: expose the stored binary value by const reference
+        if (is_binary())
+        {
+            return *get_ptr<const binary_t*>();
+        }
+
+        // every other value type is a type error (302)
+        JSON_THROW(type_error::create(302, "type must be binary, but is " + std::string(type_name())));
+    }
+
+    /// @}
+
+
+    ////////////////////
+    // element access //
+    ////////////////////
+
+    /// @name element access
+    /// Access to the JSON value.
+    /// @{
+
+    /*!
+    @brief access specified array element with bounds checking
+
+    Returns a reference to the element at specified location @a idx, with
+    bounds checking.
+
+    @param[in] idx  index of the element to access
+
+    @return reference to the element at index @a idx
+
+    @throw type_error.304 if the JSON value is not an array; in this case,
+    calling `at` with an index makes no sense. See example below.
+    @throw out_of_range.401 if the index @a idx is out of range of the array;
+    that is, `idx >= size()`. See example below.
+
+    @exceptionsafety Strong guarantee: if an exception is thrown, there are no
+    changes in the JSON value.
+
+    @complexity Constant.
+
+    @since version 1.0.0
+
+    @liveexample{The example below shows how array elements can be read and
+    written using `at()`. It also demonstrates the different exceptions that
+    can be thrown.,at__size_type}
+    */
+    reference at(size_type idx)
+    {
+        // positional access is only defined for arrays; reject the rest up front
+        if (!JSON_HEDLEY_LIKELY(is_array()))
+        {
+            JSON_THROW(type_error::create(304, "cannot use at() with " + std::string(type_name())));
+        }
+
+        JSON_TRY
+        {
+            // the underlying container performs the bounds check
+            return m_value.array->at(idx);
+        }
+        JSON_CATCH (std::out_of_range&)
+        {
+            // replace the container's exception with the library's own error,
+            // which names the offending index
+            JSON_THROW(out_of_range::create(401, "array index " + std::to_string(idx) + " is out of range"));
+        }
+    }
+
+    /*!
+    @brief access specified array element with bounds checking
+
+    Returns a const reference to the element at specified location @a idx,
+    with bounds checking.
+
+    @param[in] idx  index of the element to access
+
+    @return const reference to the element at index @a idx
+
+    @throw type_error.304 if the JSON value is not an array; in this case,
+    calling `at` with an index makes no sense. See example below.
+    @throw out_of_range.401 if the index @a idx is out of range of the array;
+    that is, `idx >= size()`. See example below.
+
+    @exceptionsafety Strong guarantee: if an exception is thrown, there are no
+    changes in the JSON value.
+
+    @complexity Constant.
+
+    @since version 1.0.0
+
+    @liveexample{The example below shows how array elements can be read using
+    `at()`. It also demonstrates the different exceptions that can be thrown.,
+    at__size_type_const}
+    */
+    const_reference at(size_type idx) const
+    {
+        // positional access is only defined for arrays; reject the rest up front
+        if (!JSON_HEDLEY_LIKELY(is_array()))
+        {
+            JSON_THROW(type_error::create(304, "cannot use at() with " + std::string(type_name())));
+        }
+
+        JSON_TRY
+        {
+            // the underlying container performs the bounds check
+            return m_value.array->at(idx);
+        }
+        JSON_CATCH (std::out_of_range&)
+        {
+            // replace the container's exception with the library's own error,
+            // which names the offending index
+            JSON_THROW(out_of_range::create(401, "array index " + std::to_string(idx) + " is out of range"));
+        }
+    }
+
+    /*!
+    @brief access specified object element with bounds checking
+
+    Returns a reference to the element with specified key @a key, with
+    bounds checking.
+
+    @param[in] key  key of the element to access
+
+    @return reference to the element at key @a key
+
+    @throw type_error.304 if the JSON value is not an object; in this case,
+    calling `at` with a key makes no sense. See example below.
+    @throw out_of_range.403 if the key @a key is not stored in the object;
+    that is, `find(key) == end()`. See example below.
+
+    @exceptionsafety Strong guarantee: if an exception is thrown, there are no
+    changes in the JSON value.
+
+    @complexity Logarithmic in the size of the container.
+
+    @sa @ref operator[](const typename object_t::key_type&) for unchecked
+    access by reference
+    @sa @ref value() for access by value with a default value
+
+    @since version 1.0.0
+
+    @liveexample{The example below shows how object elements can be read and
+    written using `at()`. It also demonstrates the different exceptions that
+    can be thrown.,at__object_t_key_type}
+    */
+    reference at(const typename object_t::key_type& key)
+    {
+        // keyed access is only defined for objects; reject the rest up front
+        if (!JSON_HEDLEY_LIKELY(is_object()))
+        {
+            JSON_THROW(type_error::create(304, "cannot use at() with " + std::string(type_name())));
+        }
+
+        JSON_TRY
+        {
+            // the underlying container reports a missing key by throwing
+            return m_value.object->at(key);
+        }
+        JSON_CATCH (std::out_of_range&)
+        {
+            // replace the container's exception with the library's own error,
+            // which names the missing key
+            JSON_THROW(out_of_range::create(403, "key '" + key + "' not found"));
+        }
+    }
+
+    /*!
+    @brief access specified object element with bounds checking
+
+    Returns a const reference to the element with specified key @a key,
+    with bounds checking.
+
+    @param[in] key  key of the element to access
+
+    @return const reference to the element at key @a key
+
+    @throw type_error.304 if the JSON value is not an object; in this case,
+    calling `at` with a key makes no sense. See example below.
+    @throw out_of_range.403 if the key @a key is not stored in the object;
+    that is, `find(key) == end()`. See example below.
+
+    @exceptionsafety Strong guarantee: if an exception is thrown, there are no
+    changes in the JSON value.
+
+    @complexity Logarithmic in the size of the container.
+
+    @sa @ref operator[](const typename object_t::key_type&) for unchecked
+    access by reference
+    @sa @ref value() for access by value with a default value
+
+    @since version 1.0.0
+
+    @liveexample{The example below shows how object elements can be read using
+    `at()`. It also demonstrates the different exceptions that can be thrown.,
+    at__object_t_key_type_const}
+    */
+    const_reference at(const typename object_t::key_type& key) const
+    {
+        // keyed access is only defined for objects; reject the rest up front
+        if (!JSON_HEDLEY_LIKELY(is_object()))
+        {
+            JSON_THROW(type_error::create(304, "cannot use at() with " + std::string(type_name())));
+        }
+
+        JSON_TRY
+        {
+            // the underlying container reports a missing key by throwing
+            return m_value.object->at(key);
+        }
+        JSON_CATCH (std::out_of_range&)
+        {
+            // replace the container's exception with the library's own error,
+            // which names the missing key
+            JSON_THROW(out_of_range::create(403, "key '" + key + "' not found"));
+        }
+    }
+
+    /*!
+    @brief access specified array element
+
+    Returns a reference to the element at specified location @a idx.
+
+    @note If @a idx is beyond the range of the array (i.e., `idx >= size()`),
+    then the array is silently filled up with `null` values to make `idx` a
+    valid reference to the last stored element.
+
+    @param[in] idx  index of the element to access
+
+    @return reference to the element at index @a idx
+
+    @throw type_error.305 if the JSON value is not an array or null; in that
+    case, using the [] operator with an index makes no sense.
+
+    @complexity Constant if @a idx is in the range of the array. Otherwise
+    linear in `idx - size()`.
+
+    @liveexample{The example below shows how array elements can be read and
+    written using `[]` operator. Note the addition of `null`
+    values.,operatorarray__size_type}
+
+    @since version 1.0.0
+    */
+    reference operator[](size_type idx)
+    {
+        // a null value is silently turned into an empty array first
+        if (is_null())
+        {
+            m_type = value_t::array;
+            m_value.array = create<array_t>();
+            assert_invariant();
+        }
+
+        // whatever is still not an array cannot be indexed numerically
+        if (!JSON_HEDLEY_LIKELY(is_array()))
+        {
+            JSON_THROW(type_error::create(305, "cannot use operator[] with a numeric argument with " + std::string(type_name())));
+        }
+
+        // pad the array with null values so that idx becomes its last element
+        auto& elements = *m_value.array;
+        if (idx >= elements.size())
+        {
+            elements.insert(elements.end(), idx - elements.size() + 1, basic_json());
+        }
+
+        return elements[idx];
+    }
+
+    /*!
+    @brief access specified array element
+
+    Returns a const reference to the element at specified location @a idx.
+
+    @param[in] idx  index of the element to access
+
+    @return const reference to the element at index @a idx
+
+    @throw type_error.305 if the JSON value is not an array; in that case,
+    using the [] operator with an index makes no sense.
+
+    @complexity Constant.
+
+    @liveexample{The example below shows how array elements can be read using
+    the `[]` operator.,operatorarray__size_type_const}
+
+    @since version 1.0.0
+    */
+    const_reference operator[](size_type idx) const
+    {
+        // a const value is never converted: anything but an array is an error
+        if (!JSON_HEDLEY_LIKELY(is_array()))
+        {
+            JSON_THROW(type_error::create(305, "cannot use operator[] with a numeric argument with " + std::string(type_name())));
+        }
+
+        // unchecked element access, forwarded to the underlying container
+        return (*m_value.array)[idx];
+    }
+
+    /*!
+    @brief access specified object element
+
+    Returns a reference to the element with specified key @a key.
+
+    @note If @a key is not found in the object, then it is silently added to
+    the object and filled with a `null` value to make `key` a valid reference.
+    In case the value was `null` before, it is converted to an object.
+
+    @param[in] key  key of the element to access
+
+    @return reference to the element at key @a key
+
+    @throw type_error.305 if the JSON value is not an object or null; in that
+    case, using the [] operator with a key makes no sense.
+
+    @complexity Logarithmic in the size of the container.
+
+    @liveexample{The example below shows how object elements can be read and
+    written using the `[]` operator.,operatorarray__key_type}
+
+    @sa @ref at(const typename object_t::key_type&) for access by reference
+    with range checking
+    @sa @ref value() for access by value with a default value
+
+    @since version 1.0.0
+    */
+    reference operator[](const typename object_t::key_type& key)
+    {
+        // a null value is silently turned into an empty object first
+        if (is_null())
+        {
+            m_type = value_t::object;
+            m_value.object = create<object_t>();
+            assert_invariant();
+        }
+
+        // whatever is still not an object cannot be accessed by key
+        if (!JSON_HEDLEY_LIKELY(is_object()))
+        {
+            JSON_THROW(type_error::create(305, "cannot use operator[] with a string argument with " + std::string(type_name())));
+        }
+
+        // forwarded to the underlying container's operator[]
+        return (*m_value.object)[key];
+    }
+
+    /*!
+    @brief read-only access specified object element
+
+    Returns a const reference to the element with specified key @a key. No
+    bounds checking is performed.
+
+    @warning If the element with key @a key does not exist, the behavior is
+    undefined.
+
+    @param[in] key  key of the element to access
+
+    @return const reference to the element at key @a key
+
+    @pre The element with key @a key must exist. **This precondition is
+         enforced with an assertion.**
+
+    @throw type_error.305 if the JSON value is not an object; in that case,
+    using the [] operator with a key makes no sense.
+
+    @complexity Logarithmic in the size of the container.
+
+    @liveexample{The example below shows how object elements can be read using
+    the `[]` operator.,operatorarray__key_type_const}
+
+    @sa @ref at(const typename object_t::key_type&) for access by reference
+    with range checking
+    @sa @ref value() for access by value with a default value
+
+    @since version 1.0.0
+    */
+    const_reference operator[](const typename object_t::key_type& key) const
+    {
+        // a const value is never converted: anything but an object is an error
+        if (!JSON_HEDLEY_LIKELY(is_object()))
+        {
+            JSON_THROW(type_error::create(305, "cannot use operator[] with a string argument with " + std::string(type_name())));
+        }
+
+        // one lookup serves both the precondition check and the access
+        const auto entry = m_value.object->find(key);
+        JSON_ASSERT(entry != m_value.object->end());
+        return entry->second;
+    }
+
+    /*!
+    @brief access specified object element
+
+    Returns a reference to the element with specified key @a key.
+
+    @note If @a key is not found in the object, then it is silently added to
+    the object and filled with a `null` value to make `key` a valid reference.
+    In case the value was `null` before, it is converted to an object.
+
+    @param[in] key  key of the element to access
+
+    @return reference to the element at key @a key
+
+    @throw type_error.305 if the JSON value is not an object or null; in that
+    case, using the [] operator with a key makes no sense.
+
+    @complexity Logarithmic in the size of the container.
+
+    @liveexample{The example below shows how object elements can be read and
+    written using the `[]` operator.,operatorarray__key_type}
+
+    @sa @ref at(const typename object_t::key_type&) for access by reference
+    with range checking
+    @sa @ref value() for access by value with a default value
+
+    @since version 1.1.0
+    */
+    template<typename T>
+    JSON_HEDLEY_NON_NULL(2)
+    reference operator[](T* key)
+    {
+        // a null value is silently turned into an object first
+        if (is_null())
+        {
+            m_type = value_t::object;
+            m_value = value_t::object;
+            assert_invariant();
+        }
+
+        // operator[] with a key only works for objects
+        if (!JSON_HEDLEY_LIKELY(is_object()))
+        {
+            JSON_THROW(type_error::create(305, "cannot use operator[] with a string argument with " + std::string(type_name())));
+        }
+
+        // forwarded to the underlying container's operator[]
+        return (*m_value.object)[key];
+    }
+
+    /*!
+    @brief read-only access specified object element
+
+    Returns a const reference to the element with specified key @a key. No
+    bounds checking is performed.
+
+    @warning If the element with key @a key does not exist, the behavior is
+    undefined.
+
+    @param[in] key  key of the element to access
+
+    @return const reference to the element at key @a key
+
+    @pre The element with key @a key must exist. **This precondition is
+         enforced with an assertion.**
+
+    @throw type_error.305 if the JSON value is not an object; in that case,
+    using the [] operator with a key makes no sense.
+
+    @complexity Logarithmic in the size of the container.
+
+    @liveexample{The example below shows how object elements can be read using
+    the `[]` operator.,operatorarray__key_type_const}
+
+    @sa @ref at(const typename object_t::key_type&) for access by reference
+    with range checking
+    @sa @ref value() for access by value with a default value
+
+    @since version 1.1.0
+    */
+    template<typename T>
+    JSON_HEDLEY_NON_NULL(2)
+    const_reference operator[](T* key) const
+    {
+        // const operator[] with a key only works for objects
+        if (!JSON_HEDLEY_LIKELY(is_object()))
+        {
+            JSON_THROW(type_error::create(305, "cannot use operator[] with a string argument with " + std::string(type_name())));
+        }
+
+        // one lookup serves both the precondition check and the access
+        const auto entry = m_value.object->find(key);
+        JSON_ASSERT(entry != m_value.object->end());
+        return entry->second;
+    }
+
+    /*!
+    @brief access specified object element with default value
+
+    Returns either a copy of an object's element at the specified key @a key
+    or a given default value if no element with key @a key exists.
+
+    The function is basically equivalent to executing
+    @code {.cpp}
+    try {
+        return at(key);
+    } catch(out_of_range) {
+        return default_value;
+    }
+    @endcode
+
+    @note Unlike @ref at(const typename object_t::key_type&), this function
+    does not throw if the given key @a key was not found.
+
+    @note Unlike @ref operator[](const typename object_t::key_type& key), this
+    function does not implicitly add an element to the position defined by @a
+    key. This function is furthermore also applicable to const objects.
+
+    @param[in] key  key of the element to access
+    @param[in] default_value  the value to return if @a key is not found
+
+    @tparam ValueType type compatible to JSON values, for instance `int` for
+    JSON integer numbers, `bool` for JSON booleans, or `std::vector` types for
+    JSON arrays. Note the type of the expected value at @a key and the default
+    value @a default_value must be compatible.
+
+    @return copy of the element at key @a key or @a default_value if @a key
+    is not found
+
+    @throw type_error.302 if @a default_value does not match the type of the
+    value at @a key
+    @throw type_error.306 if the JSON value is not an object; in that case,
+    using `value()` with a key makes no sense.
+
+    @complexity Logarithmic in the size of the container.
+
+    @liveexample{The example below shows how object elements can be queried
+    with a default value.,basic_json__value}
+
+    @sa @ref at(const typename object_t::key_type&) for access by reference
+    with range checking
+    @sa @ref operator[](const typename object_t::key_type&) for unchecked
+    access by reference
+
+    @since version 1.0.0
+    */
+    // using std::is_convertible in a std::enable_if will fail when using explicit conversions
+    template < class ValueType, typename std::enable_if <
+                   detail::is_getable<basic_json_t, ValueType>::value
+                   && !std::is_same<value_t, ValueType>::value, int >::type = 0 >
+    ValueType value(const typename object_t::key_type& key, const ValueType& default_value) const
+    {
+        // value() is only defined for objects; reject the rest up front
+        if (!JSON_HEDLEY_LIKELY(is_object()))
+        {
+            JSON_THROW(type_error::create(306, "cannot use value() with " + std::string(type_name())));
+        }
+
+        // key present: convert the stored element to the requested type
+        const auto pos = find(key);
+        if (pos != end())
+        {
+            return pos->template get<ValueType>();
+        }
+
+        // key absent: fall back to a copy of the caller's default
+        return default_value;
+    }
+
+    /*!
+    @brief overload for a default value of type const char*
+    @copydoc basic_json::value(const typename object_t::key_type&, const ValueType&) const
+    */
+    // default_value is used to construct a string_t and therefore must not be
+    // null; the annotation matches the one on the json_pointer overload of
+    // value() so that compilers can diagnose a null argument here as well
+    JSON_HEDLEY_NON_NULL(3)
+    string_t value(const typename object_t::key_type& key, const char* default_value) const
+    {
+        // wrap the C string in a string_t and forward to the generic overload
+        return value(key, string_t(default_value));
+    }
+
+    /*!
+    @brief access specified object element via JSON Pointer with default value
+
+    Returns either a copy of an object's element at the specified key @a key
+    or a given default value if no element with key @a key exists.
+
+    The function is basically equivalent to executing
+    @code {.cpp}
+    try {
+        return at(ptr);
+    } catch(out_of_range) {
+        return default_value;
+    }
+    @endcode
+
+    @note Unlike @ref at(const json_pointer&), this function does not throw
+    if the given key @a key was not found.
+
+    @param[in] ptr  a JSON pointer to the element to access
+    @param[in] default_value  the value to return if @a ptr found no value
+
+    @tparam ValueType type compatible to JSON values, for instance `int` for
+    JSON integer numbers, `bool` for JSON booleans, or `std::vector` types for
+    JSON arrays. Note the type of the expected value at @a key and the default
+    value @a default_value must be compatible.
+
+    @return copy of the element at key @a key or @a default_value if @a key
+    is not found
+
+    @throw type_error.302 if @a default_value does not match the type of the
+    value at @a ptr
+    @throw type_error.306 if the JSON value is not an object; in that case,
+    using `value()` with a key makes no sense.
+
+    @complexity Logarithmic in the size of the container.
+
+    @liveexample{The example below shows how object elements can be queried
+    with a default value.,basic_json__value_ptr}
+
+    @sa @ref operator[](const json_pointer&) for unchecked access by reference
+
+    @since version 2.0.2
+    */
+    template<class ValueType, typename std::enable_if<
+                 detail::is_getable<basic_json_t, ValueType>::value, int>::type = 0>
+    ValueType value(const json_pointer& ptr, const ValueType& default_value) const
+    {
+        // value() is only defined for objects: reject every other type first
+        if (JSON_HEDLEY_UNLIKELY(!is_object()))
+        {
+            JSON_THROW(type_error::create(306, "cannot use value() with " + std::string(type_name())));
+        }
+
+        // resolve the pointer; an out_of_range failure selects the default
+        JSON_TRY
+        {
+            return ptr.get_checked(this).template get<ValueType>();
+        }
+        JSON_INTERNAL_CATCH (out_of_range&)
+        {
+            return default_value;
+        }
+    }
+
+    /*!
+    @brief overload for a default value of type const char*
+    @copydoc basic_json::value(const json_pointer&, const ValueType&) const
+    */
+    JSON_HEDLEY_NON_NULL(3)  // presumably targets default_value (index counting the implicit this) - confirm
+    string_t value(const json_pointer& ptr, const char* default_value) const
+    {
+        return value(ptr, string_t(default_value));  // convert the C string to string_t and forward
+    }
+
+    /*!
+    @brief access the first element
+
+    Returns a reference to the first element in the container. For a JSON
+    container `c`, the expression `c.front()` is equivalent to `*c.begin()`.
+
+    @return In case of a structured type (array or object), a reference to the
+    first element is returned. In case of number, string, boolean, or binary
+    values, a reference to the value is returned.
+
+    @complexity Constant.
+
+    @pre The JSON value must not be `null` (would throw `invalid_iterator.214`)
+    or an empty array or object (undefined behavior, **guarded by
+    assertions**).
+    @post The JSON value remains unchanged.
+
+    @throw invalid_iterator.214 when called on `null` value
+
+    @liveexample{The following code shows an example for `front()`.,front}
+
+    @sa @ref back() -- access the last element
+
+    @since version 1.0.0
+    */
+    reference front()
+    {
+        return *begin();  // dereference the iterator returned by begin()
+    }
+
+    /*!
+    @copydoc basic_json::front()
+    */
+    const_reference front() const
+    {
+        return *cbegin();  // const variant: dereference the iterator returned by cbegin()
+    }
+
+    /*!
+    @brief access the last element
+
+    Returns a reference to the last element in the container. For a JSON
+    container `c`, the expression `c.back()` is equivalent to
+    @code {.cpp}
+    auto tmp = c.end();
+    --tmp;
+    return *tmp;
+    @endcode
+
+    @return In case of a structured type (array or object), a reference to the
+    last element is returned. In case of number, string, boolean, or binary
+    values, a reference to the value is returned.
+
+    @complexity Constant.
+
+    @pre The JSON value must not be `null` (would throw `invalid_iterator.214`)
+    or an empty array or object (undefined behavior, **guarded by
+    assertions**).
+    @post The JSON value remains unchanged.
+
+    @throw invalid_iterator.214 when called on a `null` value. See example
+    below.
+
+    @liveexample{The following code shows an example for `back()`.,back}
+
+    @sa @ref front() -- access the first element
+
+    @since version 1.0.0
+    */
+    reference back()
+    {
+        iterator last = end();  // starts one past the final element
+        --last;                 // step back onto the final element
+        return *last;
+    }
+
+    /*!
+    @copydoc basic_json::back()
+    */
+    const_reference back() const
+    {
+        const_iterator last = cend();  // starts one past the final element
+        --last;                        // step back onto the final element
+        return *last;
+    }
+
+    /*!
+    @brief remove element given an iterator
+
+    Removes the element specified by iterator @a pos. The iterator @a pos must
+    be valid and dereferenceable. Thus the `end()` iterator (which is valid,
+    but is not dereferenceable) cannot be used as a value for @a pos.
+
+    If called on a primitive type other than `null`, the resulting JSON value
+    will be `null`.
+
+    @param[in] pos iterator to the element to remove
+    @return Iterator following the last removed element. If the iterator @a
+    pos refers to the last element, the `end()` iterator is returned.
+
+    @tparam IteratorType an @ref iterator or @ref const_iterator
+
+    @post Invalidates iterators and references at or after the point of the
+    erase, including the `end()` iterator.
+
+    @throw type_error.307 if called on a `null` value; example: `"cannot use
+    erase() with null"`
+    @throw invalid_iterator.202 if called on an iterator which does not belong
+    to the current JSON value; example: `"iterator does not fit current
+    value"`
+    @throw invalid_iterator.205 if called on a primitive type with invalid
+    iterator (i.e., any iterator which is not `begin()`); example: `"iterator
+    out of range"`
+
+    @complexity The complexity depends on the type:
+    - objects: amortized constant
+    - arrays: linear in distance between @a pos and the end of the container
+    - strings and binary: linear in the length of the member
+    - other types: constant
+
+    @liveexample{The example shows the result of `erase()` for different JSON
+    types.,erase__IteratorType}
+
+    @sa @ref erase(IteratorType, IteratorType) -- removes the elements in
+    the given range
+    @sa @ref erase(const typename object_t::key_type&) -- removes the element
+    from an object at the given key
+    @sa @ref erase(const size_type) -- removes the element from an array at
+    the given index
+
+    @since version 1.0.0
+    */
+    template < class IteratorType, typename std::enable_if <
+                   std::is_same<IteratorType, typename basic_json_t::iterator>::value ||
+                   std::is_same<IteratorType, typename basic_json_t::const_iterator>::value, int >::type
+               = 0 >
+    IteratorType erase(IteratorType pos)
+    {
+        // an iterator created from a different JSON value is rejected
+        if (JSON_HEDLEY_UNLIKELY(pos.m_object != this))
+        {
+            JSON_THROW(invalid_iterator::create(202, "iterator does not fit current value"));
+        }
+
+        IteratorType successor = end();
+
+        switch (m_type)
+        {
+            case value_t::object:
+            {
+                successor.m_it.object_iterator = m_value.object->erase(pos.m_it.object_iterator);
+                break;
+            }
+
+            case value_t::array:
+            {
+                successor.m_it.array_iterator = m_value.array->erase(pos.m_it.array_iterator);
+                break;
+            }
+
+            case value_t::string:
+            case value_t::binary:
+            case value_t::boolean:
+            case value_t::number_integer:
+            case value_t::number_unsigned:
+            case value_t::number_float:
+            {
+                if (JSON_HEDLEY_UNLIKELY(!pos.m_it.primitive_iterator.is_begin()))
+                {
+                    JSON_THROW(invalid_iterator::create(205, "iterator out of range"));
+                }
+                // strings and binary values: destroy and deallocate the stored object
+                if (is_string())
+                {
+                    AllocatorType<string_t> string_alloc;
+                    std::allocator_traits<AllocatorType<string_t>>::destroy(string_alloc, m_value.string);
+                    std::allocator_traits<AllocatorType<string_t>>::deallocate(string_alloc, m_value.string, 1);
+                    m_value.string = nullptr;
+                }
+                else if (is_binary())
+                {
+                    AllocatorType<binary_t> binary_alloc;
+                    std::allocator_traits<AllocatorType<binary_t>>::destroy(binary_alloc, m_value.binary);
+                    std::allocator_traits<AllocatorType<binary_t>>::deallocate(binary_alloc, m_value.binary, 1);
+                    m_value.binary = nullptr;
+                }
+                // the erased primitive leaves a null value behind
+                m_type = value_t::null;
+                assert_invariant();
+                break;
+            }
+
+            default:
+                JSON_THROW(type_error::create(307, "cannot use erase() with " + std::string(type_name())));
+        }
+
+        return successor;
+    }
+
+    /*!
+    @brief remove elements given an iterator range
+
+    Removes the element specified by the range `[first; last)`. The iterator
+    @a first does not need to be dereferenceable if `first == last`: erasing
+    an empty range is a no-op.
+
+    If called on a primitive type other than `null`, the resulting JSON value
+    will be `null`.
+
+    @param[in] first iterator to the beginning of the range to remove
+    @param[in] last iterator past the end of the range to remove
+    @return Iterator following the last removed element. If the iterator @a
+    last is the `end()` iterator, the `end()` iterator is returned.
+
+    @tparam IteratorType an @ref iterator or @ref const_iterator
+
+    @post Invalidates iterators and references at or after the point of the
+    erase, including the `end()` iterator.
+
+    @throw type_error.307 if called on a `null` value; example: `"cannot use
+    erase() with null"`
+    @throw invalid_iterator.203 if called on iterators which do not belong
+    to the current JSON value; example: `"iterators do not fit current value"`
+    @throw invalid_iterator.204 if called on a primitive type with invalid
+    iterators (i.e., if `first != begin()` or `last != end()`); example:
+    `"iterators out of range"`
+
+    @complexity The complexity depends on the type:
+    - objects: `log(size()) + std::distance(first, last)`
+    - arrays: linear in the distance between @a first and @a last, plus linear
+      in the distance between @a last and end of the container
+    - strings and binary: linear in the length of the member
+    - other types: constant
+
+    @liveexample{The example shows the result of `erase()` for different JSON
+    types.,erase__IteratorType_IteratorType}
+
+    @sa @ref erase(IteratorType) -- removes the element at a given position
+    @sa @ref erase(const typename object_t::key_type&) -- removes the element
+    from an object at the given key
+    @sa @ref erase(const size_type) -- removes the element from an array at
+    the given index
+
+    @since version 1.0.0
+    */
+    template < class IteratorType, typename std::enable_if <
+                   std::is_same<IteratorType, typename basic_json_t::iterator>::value ||
+                   std::is_same<IteratorType, typename basic_json_t::const_iterator>::value, int >::type
+               = 0 >
+    IteratorType erase(IteratorType first, IteratorType last)
+    {
+        // both iterators must have been created from this JSON value
+        if (JSON_HEDLEY_UNLIKELY(this != first.m_object || this != last.m_object))
+        {
+            JSON_THROW(invalid_iterator::create(203, "iterators do not fit current value"));
+        }
+
+        IteratorType result = end();
+
+        switch (m_type)
+        {
+            case value_t::boolean:
+            case value_t::number_float:
+            case value_t::number_integer:
+            case value_t::number_unsigned:
+            case value_t::string:
+            case value_t::binary:
+            {
+                if (JSON_HEDLEY_UNLIKELY(!first.m_it.primitive_iterator.is_begin()
+                                         || !last.m_it.primitive_iterator.is_end()))  // error path, hence UNLIKELY
+                {
+                    JSON_THROW(invalid_iterator::create(204, "iterators out of range"));
+                }
+                // strings and binary values: destroy and deallocate the stored object
+                if (is_string())
+                {
+                    AllocatorType<string_t> alloc;
+                    std::allocator_traits<decltype(alloc)>::destroy(alloc, m_value.string);
+                    std::allocator_traits<decltype(alloc)>::deallocate(alloc, m_value.string, 1);
+                    m_value.string = nullptr;
+                }
+                else if (is_binary())
+                {
+                    AllocatorType<binary_t> alloc;
+                    std::allocator_traits<decltype(alloc)>::destroy(alloc, m_value.binary);
+                    std::allocator_traits<decltype(alloc)>::deallocate(alloc, m_value.binary, 1);
+                    m_value.binary = nullptr;
+                }
+                // the erased primitive leaves a null value behind
+                m_type = value_t::null;
+                assert_invariant();
+                break;
+            }
+
+            case value_t::object:
+            {
+                result.m_it.object_iterator = m_value.object->erase(first.m_it.object_iterator,
+                                              last.m_it.object_iterator);
+                break;
+            }
+
+            case value_t::array:
+            {
+                result.m_it.array_iterator = m_value.array->erase(first.m_it.array_iterator,
+                                             last.m_it.array_iterator);
+                break;
+            }
+
+            default:
+                JSON_THROW(type_error::create(307, "cannot use erase() with " + std::string(type_name())));
+        }
+
+        return result;
+    }
+
+    /*!
+    @brief remove element from a JSON object given a key
+
+    Removes elements from a JSON object with the key value @a key.
+
+    @param[in] key value of the elements to remove
+
+    @return Number of elements removed. If @a ObjectType is the default
+    `std::map` type, the return value will always be `0` (@a key was not
+    found) or `1` (@a key was found).
+
+    @post References and iterators to the erased elements are invalidated.
+    Other references and iterators are not affected.
+
+    @throw type_error.307 when called on a type other than JSON object;
+    example: `"cannot use erase() with null"`
+
+    @complexity `log(size()) + count(key)`
+
+    @liveexample{The example shows the effect of `erase()`.,erase__key_type}
+
+    @sa @ref erase(IteratorType) -- removes the element at a given position
+    @sa @ref erase(IteratorType, IteratorType) -- removes the elements in
+    the given range
+    @sa @ref erase(const size_type) -- removes the element from an array at
+    the given index
+
+    @since version 1.0.0
+    */
+    size_type erase(const typename object_t::key_type& key)
+    {
+        // key-based erase is only defined for objects
+        if (JSON_HEDLEY_UNLIKELY(!is_object()))
+        {
+            JSON_THROW(type_error::create(307, "cannot use erase() with " + std::string(type_name())));
+        }
+
+        return m_value.object->erase(key);
+    }
+
+    /*!
+    @brief remove element from a JSON array given an index
+
+    Removes element from a JSON array at the index @a idx.
+
+    @param[in] idx index of the element to remove
+
+    @throw type_error.307 when called on a type other than JSON array;
+    example: `"cannot use erase() with null"`
+    @throw out_of_range.401 when `idx >= size()`; example: `"array index 17
+    is out of range"`
+
+    @complexity Linear in distance between @a idx and the end of the container.
+
+    @liveexample{The example shows the effect of `erase()`.,erase__size_type}
+
+    @sa @ref erase(IteratorType) -- removes the element at a given position
+    @sa @ref erase(IteratorType, IteratorType) -- removes the elements in
+    the given range
+    @sa @ref erase(const typename object_t::key_type&) -- removes the element
+    from an object at the given key
+
+    @since version 1.0.0
+    */
+    void erase(const size_type idx)
+    {
+        // index-based erase is only defined for arrays
+        if (JSON_HEDLEY_UNLIKELY(!is_array()))
+        {
+            JSON_THROW(type_error::create(307, "cannot use erase() with " + std::string(type_name())));
+        }
+
+        // indices at or beyond size() are rejected
+        if (JSON_HEDLEY_UNLIKELY(idx >= size()))
+        {
+            JSON_THROW(out_of_range::create(401, "array index " + std::to_string(idx) + " is out of range"));
+        }
+
+        const auto offset = static_cast<difference_type>(idx);
+        m_value.array->erase(m_value.array->begin() + offset);
+    }
+
+    /// @}
+
+
+    ////////////
+    // lookup //
+    ////////////
+
+    /// @name lookup
+    /// @{
+
+    /*!
+    @brief find an element in a JSON object
+
+    Finds an element in a JSON object with key equivalent to @a key. If the
+    element is not found or the JSON value is not an object, end() is
+    returned.
+
+    @note This method always returns @ref end() when executed on a JSON type
+          that is not an object.
+
+    @param[in] key key value of the element to search for.
+
+    @return Iterator to an element with key equivalent to @a key. If no such
+    element is found or the JSON value is not an object, past-the-end (see
+    @ref end()) iterator is returned.
+
+    @complexity Logarithmic in the size of the JSON object.
+
+    @liveexample{The example shows how `find()` is used.,find__key_type}
+
+    @sa @ref contains(KeyT&&) const -- checks whether a key exists
+
+    @since version 1.0.0
+    */
+    template<typename KeyT>
+    iterator find(KeyT&& key)
+    {
+        // default answer: past-the-end, kept as-is for every non-object value
+        iterator match = end();
+        if (is_object())
+        {
+            match.m_it.object_iterator = m_value.object->find(std::forward<KeyT>(key));
+        }
+
+        return match;
+    }
+
+    /*!
+    @brief find an element in a JSON object
+    @copydoc find(KeyT&&)
+    */
+    template<typename KeyT>
+    const_iterator find(KeyT&& key) const
+    {
+        // default answer: past-the-end, kept as-is for every non-object value
+        const_iterator match = cend();
+        if (is_object())
+        {
+            match.m_it.object_iterator = m_value.object->find(std::forward<KeyT>(key));
+        }
+
+        return match;
+    }
+
+    /*!
+    @brief returns the number of occurrences of a key in a JSON object
+
+    Returns the number of elements with key @a key. If ObjectType is the
+    default `std::map` type, the return value will always be `0` (@a key was
+    not found) or `1` (@a key was found).
+
+    @note This method always returns `0` when executed on a JSON type that is
+          not an object.
+
+    @param[in] key key value of the element to count
+
+    @return Number of elements with key @a key. If the JSON value is not an
+    object, the return value will be `0`.
+
+    @complexity Logarithmic in the size of the JSON object.
+
+    @liveexample{The example shows how `count()` is used.,count}
+
+    @since version 1.0.0
+    */
+    template<typename KeyT>
+    size_type count(KeyT&& key) const
+    {
+        // every non-object type reports zero occurrences
+        return !is_object() ? 0 : m_value.object->count(std::forward<KeyT>(key));
+    }
+
+    /*!
+    @brief check the existence of an element in a JSON object
+
+    Check whether an element exists in a JSON object with key equivalent to
+    @a key. If the element is not found or the JSON value is not an object,
+    false is returned.
+
+    @note This method always returns false when executed on a JSON type
+          that is not an object.
+
+    @param[in] key key value to check its existence.
+
+    @return true if an element with specified @a key exists. If no such
+    element with such key is found or the JSON value is not an object,
+    false is returned.
+
+    @complexity Logarithmic in the size of the JSON object.
+
+    @liveexample{The following code shows an example for `contains()`.,contains}
+
+    @sa @ref find(KeyT&&) -- returns an iterator to an object element
+    @sa @ref contains(const json_pointer&) const -- checks the existence for a JSON pointer
+
+    @since version 3.6.0
+    */
+    template < typename KeyT, typename std::enable_if <
+                   !std::is_same<typename std::decay<KeyT>::type, json_pointer>::value, int >::type = 0 >
+    bool contains(KeyT && key) const
+    {
+        return is_object() && (m_value.object->end() != m_value.object->find(std::forward<KeyT>(key)));
+    }
+
+    /*!
+    @brief check the existence of an element in a JSON object given a JSON pointer
+
+    Check whether the given JSON pointer @a ptr can be resolved in the current
+    JSON value.
+
+    @note This method can be executed on any JSON value type.
+
+    @param[in] ptr JSON pointer to check its existence.
+
+    @return true if the JSON pointer can be resolved to a stored value, false
+    otherwise.
+
+    @post If `j.contains(ptr)` returns true, it is safe to call `j[ptr]`.
+
+    @throw parse_error.106   if an array index begins with '0'
+    @throw parse_error.109   if an array index was not a number
+
+    @complexity Logarithmic in the size of the JSON object.
+
+    @liveexample{The following code shows an example for `contains()`.,contains_json_pointer}
+
+    @sa @ref contains(KeyT &&) const -- checks the existence of a key
+
+    @since version 3.7.0
+    */
+    bool contains(const json_pointer& ptr) const
+    {
+        return ptr.contains(this);  // the lookup is delegated to the json_pointer
+    }
+
+    /// @}
+
+
+    ///////////////
+    // iterators //
+    ///////////////
+
+    /// @name iterators
+    /// @{
+
+    /*!
+    @brief returns an iterator to the first element
+
+    Returns an iterator to the first element.
+
+    @image html range-begin-end.svg "Illustration from cppreference.com"
+
+    @return iterator to the first element
+
+    @complexity Constant.
+
+    @requirement This function helps `basic_json` satisfying the
+    [Container](https://en.cppreference.com/w/cpp/named_req/Container)
+    requirements:
+    - The complexity is constant.
+
+    @liveexample{The following code shows an example for `begin()`.,begin}
+
+    @sa @ref cbegin() -- returns a const iterator to the beginning
+    @sa @ref end() -- returns an iterator to the end
+    @sa @ref cend() -- returns a const iterator to the end
+
+    @since version 1.0.0
+    */
+    iterator begin() noexcept
+    {
+        iterator first(this);  // iterator constructed for this value
+        first.set_begin();     // move it to the begin position
+        return first;
+    }
+
+    /*!
+    @copydoc basic_json::cbegin()
+    */
+    const_iterator begin() const noexcept
+    {
+        return cbegin();  // the const overload forwards to cbegin()
+    }
+
+    /*!
+    @brief returns a const iterator to the first element
+
+    Returns a const iterator to the first element.
+
+    @image html range-begin-end.svg "Illustration from cppreference.com"
+
+    @return const iterator to the first element
+
+    @complexity Constant.
+
+    @requirement This function helps `basic_json` satisfying the
+    [Container](https://en.cppreference.com/w/cpp/named_req/Container)
+    requirements:
+    - The complexity is constant.
+    - Has the semantics of `const_cast<const basic_json&>(*this).begin()`.
+
+    @liveexample{The following code shows an example for `cbegin()`.,cbegin}
+
+    @sa @ref begin() -- returns an iterator to the beginning
+    @sa @ref end() -- returns an iterator to the end
+    @sa @ref cend() -- returns a const iterator to the end
+
+    @since version 1.0.0
+    */
+    const_iterator cbegin() const noexcept
+    {
+        const_iterator first(this);  // const iterator constructed for this value
+        first.set_begin();           // move it to the begin position
+        return first;
+    }
+
+    /*!
+    @brief returns an iterator to one past the last element
+
+    Returns an iterator to one past the last element.
+
+    @image html range-begin-end.svg "Illustration from cppreference.com"
+
+    @return iterator one past the last element
+
+    @complexity Constant.
+
+    @requirement This function helps `basic_json` satisfying the
+    [Container](https://en.cppreference.com/w/cpp/named_req/Container)
+    requirements:
+    - The complexity is constant.
+
+    @liveexample{The following code shows an example for `end()`.,end}
+
+    @sa @ref cend() -- returns a const iterator to the end
+    @sa @ref begin() -- returns an iterator to the beginning
+    @sa @ref cbegin() -- returns a const iterator to the beginning
+
+    @since version 1.0.0
+    */
+    iterator end() noexcept
+    {
+        iterator past_last(this);  // iterator constructed for this value
+        past_last.set_end();       // move it to the end position
+        return past_last;
+    }
+
+    /*!
+    @copydoc basic_json::cend()
+    */
+    const_iterator end() const noexcept
+    {
+        return cend();  // the const overload forwards to cend()
+    }
+
+    /*!
+    @brief returns a const iterator to one past the last element
+
+    Returns a const iterator to one past the last element.
+
+    @image html range-begin-end.svg "Illustration from cppreference.com"
+
+    @return const iterator one past the last element
+
+    @complexity Constant.
+
+    @requirement This function helps `basic_json` satisfying the
+    [Container](https://en.cppreference.com/w/cpp/named_req/Container)
+    requirements:
+    - The complexity is constant.
+    - Has the semantics of `const_cast<const basic_json&>(*this).end()`.
+
+    @liveexample{The following code shows an example for `cend()`.,cend}
+
+    @sa @ref end() -- returns an iterator to the end
+    @sa @ref begin() -- returns an iterator to the beginning
+    @sa @ref cbegin() -- returns a const iterator to the beginning
+
+    @since version 1.0.0
+    */
+    const_iterator cend() const noexcept
+    {
+        const_iterator past_last(this);  // const iterator constructed for this value
+        past_last.set_end();             // move it to the end position
+        return past_last;
+    }
+
+    /*!
+    @brief returns an iterator to the reverse-beginning
+
+    Returns an iterator to the reverse-beginning; that is, the last element.
+
+    @image html range-rbegin-rend.svg "Illustration from cppreference.com"
+
+    @complexity Constant.
+
+    @requirement This function helps `basic_json` satisfying the
+    [ReversibleContainer](https://en.cppreference.com/w/cpp/named_req/ReversibleContainer)
+    requirements:
+    - The complexity is constant.
+    - Has the semantics of `reverse_iterator(end())`.
+
+    @liveexample{The following code shows an example for `rbegin()`.,rbegin}
+
+    @sa @ref crbegin() -- returns a const reverse iterator to the beginning
+    @sa @ref rend() -- returns a reverse iterator to the end
+    @sa @ref crend() -- returns a const reverse iterator to the end
+
+    @since version 1.0.0
+    */
+    reverse_iterator rbegin() noexcept
+    {
+        return reverse_iterator(end());  // reverse iteration starts from end()
+    }
+
+    /*!
+    @copydoc basic_json::crbegin()
+    */
+    const_reverse_iterator rbegin() const noexcept
+    {
+        return crbegin();  // the const overload forwards to crbegin()
+    }
+
+    /*!
+    @brief returns an iterator to the reverse-end
+
+    Returns an iterator to the reverse-end; that is, one before the first
+    element.
+
+    @image html range-rbegin-rend.svg "Illustration from cppreference.com"
+
+    @complexity Constant.
+
+    @requirement This function helps `basic_json` satisfying the
+    [ReversibleContainer](https://en.cppreference.com/w/cpp/named_req/ReversibleContainer)
+    requirements:
+    - The complexity is constant.
+    - Has the semantics of `reverse_iterator(begin())`.
+
+    @liveexample{The following code shows an example for `rend()`.,rend}
+
+    @sa @ref crend() -- returns a const reverse iterator to the end
+    @sa @ref rbegin() -- returns a reverse iterator to the beginning
+    @sa @ref crbegin() -- returns a const reverse iterator to the beginning
+
+    @since version 1.0.0
+    */
+    reverse_iterator rend() noexcept
+    {
+        return reverse_iterator(begin());  // reverse iteration stops at begin()
+    }
+
+    /*!
+    @copydoc basic_json::crend()
+    */
+    const_reverse_iterator rend() const noexcept
+    {
+        return crend();  // the const overload forwards to crend()
+    }
+
+    /*!
+    @brief returns a const reverse iterator to the last element
+
+    Returns a const iterator to the reverse-beginning; that is, the last
+    element.
+
+    @image html range-rbegin-rend.svg "Illustration from cppreference.com"
+
+    @complexity Constant.
+
+    @requirement This function helps `basic_json` satisfying the
+    [ReversibleContainer](https://en.cppreference.com/w/cpp/named_req/ReversibleContainer)
+    requirements:
+    - The complexity is constant.
+    - Has the semantics of `const_cast<const basic_json&>(*this).rbegin()`.
+
+    @liveexample{The following code shows an example for `crbegin()`.,crbegin}
+
+    @sa @ref rbegin() -- returns a reverse iterator to the beginning
+    @sa @ref rend() -- returns a reverse iterator to the end
+    @sa @ref crend() -- returns a const reverse iterator to the end
+
+    @since version 1.0.0
+    */
+    const_reverse_iterator crbegin() const noexcept
+    {
+        return const_reverse_iterator(cend());  // reverse iteration starts from cend()
+    }
+
+    /*!
+    @brief returns a const reverse iterator to one before the first
+
+    Returns a const reverse iterator to the reverse-end; that is, one before
+    the first element.
+
+    @image html range-rbegin-rend.svg "Illustration from cppreference.com"
+
+    @complexity Constant.
+
+    @requirement This function helps `basic_json` satisfying the
+    [ReversibleContainer](https://en.cppreference.com/w/cpp/named_req/ReversibleContainer)
+    requirements:
+    - The complexity is constant.
+    - Has the semantics of `const_cast<const basic_json&>(*this).rend()`.
+
+    @liveexample{The following code shows an example for `crend()`.,crend}
+
+    @sa @ref rend() -- returns a reverse iterator to the end
+    @sa @ref rbegin() -- returns a reverse iterator to the beginning
+    @sa @ref crbegin() -- returns a const reverse iterator to the beginning
+
+    @since version 1.0.0
+    */
+    const_reverse_iterator crend() const noexcept
+    {
+        return const_reverse_iterator(cbegin());  // reverse iteration stops at cbegin()
+    }
+
+  public:
+    /*!
+    @brief wrapper to access iterator member functions in range-based for
+
+    This function allows to access @ref iterator::key() and @ref
+    iterator::value() during range-based for loops. In these loops, a
+    reference to the JSON values is returned, so there is no access to the
+    underlying iterator.
+
+    For loop without iterator_wrapper:
+
+    @code{cpp}
+    for (auto it = j_object.begin(); it != j_object.end(); ++it)
+    {
+        std::cout << "key: " << it.key() << ", value:" << it.value() << '\n';
+    }
+    @endcode
+
+    Range-based for loop without iterator proxy:
+
+    @code{cpp}
+    for (auto it : j_object)
+    {
+        // "it" is of type json::reference and has no key() member
+        std::cout << "value: " << it << '\n';
+    }
+    @endcode
+
+    Range-based for loop with iterator proxy:
+
+    @code{cpp}
+    for (auto it : json::iterator_wrapper(j_object))
+    {
+        std::cout << "key: " << it.key() << ", value:" << it.value() << '\n';
+    }
+    @endcode
+
+    @note When iterating over an array, `key()` will return the index of the
+          element as string (see example).
+
+    @param[in] ref  reference to a JSON value
+    @return iteration proxy object wrapping @a ref with an interface to use in
+            range-based for loops
+
+    @liveexample{The following code shows how the wrapper is used,iterator_wrapper}
+
+    @exceptionsafety Strong guarantee: if an exception is thrown, there are no
+    changes in the JSON value.
+
+    @complexity Constant.
+
+    @note The name of this function is not yet final and may change in the
+    future.
+
+    @deprecated This function is deprecated and will be removed in version
+                4.0.0 of the library. Please use @ref items() instead;
+                that is, replace `json::iterator_wrapper(j)` with `j.items()`.
+    */
+    JSON_HEDLEY_DEPRECATED_FOR(3.1.0, items())
+    static iteration_proxy<iterator> iterator_wrapper(reference ref) noexcept
+    {
+        return ref.items();  // deprecated spelling: simply delegates to ref.items()
+    }
+
+    /*!
+    @copydoc iterator_wrapper(reference)
+    */
+    JSON_HEDLEY_DEPRECATED_FOR(3.1.0, items())
+    static iteration_proxy<const_iterator> iterator_wrapper(const_reference ref) noexcept
+    {
+        return ref.items();  // deprecated spelling: simply delegates to ref.items()
+    }
+
+    /*!
+    @brief helper to access iterator member functions in range-based for
+
+    This function allows to access @ref iterator::key() and @ref
+    iterator::value() during range-based for loops. In these loops, a
+    reference to the JSON values is returned, so there is no access to the
+    underlying iterator.
+
+    For loop without `items()` function:
+
+    @code{cpp}
+    for (auto it = j_object.begin(); it != j_object.end(); ++it)
+    {
+        std::cout << "key: " << it.key() << ", value:" << it.value() << '\n';
+    }
+    @endcode
+
+    Range-based for loop without `items()` function:
+
+    @code{cpp}
+    for (auto it : j_object)
+    {
+        // "it" is of type json::reference and has no key() member
+        std::cout << "value: " << it << '\n';
+    }
+    @endcode
+
+    Range-based for loop with `items()` function:
+
+    @code{cpp}
+    for (auto& el : j_object.items())
+    {
+        std::cout << "key: " << el.key() << ", value:" << el.value() << '\n';
+    }
+    @endcode
+
+    The `items()` function also allows to use
+    [structured bindings](https://en.cppreference.com/w/cpp/language/structured_binding)
+    (C++17):
+
+    @code{cpp}
+    for (auto& [key, val] : j_object.items())
+    {
+        std::cout << "key: " << key << ", value:" << val << '\n';
+    }
+    @endcode
+
+    @note When iterating over an array, `key()` will return the index of the
+          element as string (see example). For primitive types (e.g., numbers),
+          `key()` returns an empty string.
+
+    @warning Using `items()` on temporary objects is dangerous. Make sure the
+             object's lifetime exceeds the iteration. See
+             <https://github.com/nlohmann/json/issues/2040> for more
+             information.
+
+    @return iteration proxy object wrapping @a ref with an interface to use in
+            range-based for loops
+
+    @liveexample{The following code shows how the function is used.,items}
+
+    @exceptionsafety Strong guarantee: if an exception is thrown, there are no
+    changes in the JSON value.
+
+    @complexity Constant.
+
+    @since version 3.1.0, structured bindings support since 3.5.0.
+    */
+    iteration_proxy<iterator> items() noexcept
+    {
+        return iteration_proxy<iterator>(*this);  // proxy is built from *this, hence the lifetime warning for temporaries
+    }
+
+    /*!
+    @copydoc items()
+    */
+    iteration_proxy<const_iterator> items() const noexcept
+    {
+        return iteration_proxy<const_iterator>(*this);  // const counterpart: same proxy, but over const_iterator
+    }
+
+    /// @}
+
+
+    //////////////
+    // capacity //
+    //////////////
+
+    /// @name capacity
+    /// @{
+
+    /*!
+    @brief checks whether the container is empty.
+
+    Checks if a JSON value has no elements (i.e. whether its @ref size is `0`).
+
+    @return The return value depends on the different types and is
+            defined as follows:
+            Value type  | return value
+            ----------- | -------------
+            null        | `true`
+            boolean     | `false`
+            string      | `false`
+            number      | `false`
+            binary      | `false`
+            object      | result of function `object_t::empty()`
+            array       | result of function `array_t::empty()`
+
+    @liveexample{The following code uses `empty()` to check if a JSON
+    object contains any elements.,empty}
+
+    @complexity Constant, as long as @ref array_t and @ref object_t satisfy
+    the Container concept; that is, their `empty()` functions have constant
+    complexity.
+
+    @iterators No changes.
+
+    @exceptionsafety No-throw guarantee: this function never throws exceptions.
+
+    @note This function does not return whether a string stored as JSON value
+    is empty - it returns whether the JSON container itself is empty which is
+    false in the case of a string.
+
+    @requirement This function helps `basic_json` satisfying the
+    [Container](https://en.cppreference.com/w/cpp/named_req/Container)
+    requirements:
+    - The complexity is constant.
+    - Has the semantics of `begin() == end()`.
+
+    @sa @ref size() -- returns the number of elements
+
+    @since version 1.0.0
+    */
+    bool empty() const noexcept
+    {
+        // null values are empty by definition
+        if (m_type == value_t::null)
+        {
+            return true;
+        }
+
+        // arrays report the state of the underlying array_t
+        if (m_type == value_t::array)
+        {
+            return m_value.array->empty();
+        }
+
+        // objects report the state of the underlying object_t
+        if (m_type == value_t::object)
+        {
+            return m_value.object->empty();
+        }
+
+        // Every other value type is nonempty. That includes strings: the
+        // question answered here is whether the JSON value, seen as a
+        // container, has elements - not whether a stored string has
+        // characters.
+        //
+        // Any value_t enumerator not handled above ends up here, just as it
+        // would under the `default` label of a switch over m_type.
+        return false;
+    }
+
+    /*!
+    @brief returns the number of elements
+
+    Returns the number of elements in a JSON value.
+
+    @return The return value depends on the different types and is
+            defined as follows:
+            Value type  | return value
+            ----------- | -------------
+            null        | `0`
+            boolean     | `1`
+            string      | `1`
+            number      | `1`
+            binary      | `1`
+            object      | result of function object_t::size()
+            array       | result of function array_t::size()
+
+    @liveexample{The following code calls `size()` on the different value
+    types.,size}
+
+    @complexity Constant, as long as @ref array_t and @ref object_t satisfy
+    the Container concept; that is, their size() functions have constant
+    complexity.
+
+    @iterators No changes.
+
+    @exceptionsafety No-throw guarantee: this function never throws exceptions.
+
+    @note This function does not return the length of a string stored as JSON
+    value - it returns the number of elements in the JSON value which is 1 in
+    the case of a string.
+
+    @requirement This function helps `basic_json` satisfying the
+    [Container](https://en.cppreference.com/w/cpp/named_req/Container)
+    requirements:
+    - The complexity is constant.
+    - Has the semantics of `std::distance(begin(), end())`.
+
+    @sa @ref empty() -- checks whether the container is empty
+    @sa @ref max_size() -- returns the maximal number of elements
+
+    @since version 1.0.0
+    */
+    size_type size() const noexcept
+    {
+        // null values hold no elements
+        if (m_type == value_t::null)
+        {
+            return 0;
+        }
+
+        // arrays report the element count of the underlying array_t
+        if (m_type == value_t::array)
+        {
+            return m_value.array->size();
+        }
+
+        // objects report the member count of the underlying object_t
+        if (m_type == value_t::object)
+        {
+            return m_value.object->size();
+        }
+
+        // Every other value type counts as exactly one element. That
+        // includes strings: the number returned here is the number of
+        // elements of the JSON value, not the length of a string stored
+        // inside it.
+        //
+        // Any value_t enumerator not handled above ends up here, just as it
+        // would under the `default` label of a switch over m_type.
+        return 1;
+    }
+
+    /*!
+    @brief returns the maximum possible number of elements
+
+    Returns the maximum number of elements a JSON value is able to hold due to
+    system or library implementation limitations, i.e. `std::distance(begin(),
+    end())` for the JSON value.
+
+    @return The return value depends on the different types and is
+            defined as follows:
+            Value type  | return value
+            ----------- | -------------
+            null        | `0` (same as `size()`)
+            boolean     | `1` (same as `size()`)
+            string      | `1` (same as `size()`)
+            number      | `1` (same as `size()`)
+            binary      | `1` (same as `size()`)
+            object      | result of function `object_t::max_size()`
+            array       | result of function `array_t::max_size()`
+
+    @liveexample{The following code calls `max_size()` on the different value
+    types. Note the output is implementation specific.,max_size}
+
+    @complexity Constant, as long as @ref array_t and @ref object_t satisfy
+    the Container concept; that is, their `max_size()` functions have constant
+    complexity.
+
+    @iterators No changes.
+
+    @exceptionsafety No-throw guarantee: this function never throws exceptions.
+
+    @requirement This function helps `basic_json` satisfying the
+    [Container](https://en.cppreference.com/w/cpp/named_req/Container)
+    requirements:
+    - The complexity is constant.
+    - Has the semantics of returning `b.size()` where `b` is the largest
+      possible JSON value.
+
+    @sa @ref size() -- returns the number of elements
+
+    @since version 1.0.0
+    */
+    size_type max_size() const noexcept
+    {
+        // an array can grow up to whatever array_t::max_size() permits
+        if (m_type == value_t::array)
+        {
+            return m_value.array->max_size();
+        }
+
+        // an object can grow up to whatever object_t::max_size() permits
+        if (m_type == value_t::object)
+        {
+            return m_value.object->max_size();
+        }
+
+        // Values that are neither arrays nor objects are not delegated to a
+        // container, so their upper bound is simply their current size:
+        // size() yields 0 for null and 1 for every other non-container
+        // type.
+        //
+        // Any value_t enumerator not handled above ends up here, just as it
+        // would under the `default` label of a switch over m_type.
+        return size();
+    }
+
+    /// @}
+
+
+    ///////////////
+    // modifiers //
+    ///////////////
+
+    /// @name modifiers
+    /// @{
+
+    /*!
+    @brief clears the contents
+
+    Clears the content of a JSON value and resets it to the default value as
+    if @ref basic_json(value_t) would have been called with the current value
+    type from @ref type():
+
+    Value type  | initial value
+    ----------- | -------------
+    null        | `null`
+    boolean     | `false`
+    string      | `""`
+    number      | `0`
+    binary      | An empty byte vector
+    object      | `{}`
+    array       | `[]`
+
+    @post Has the same effect as calling
+    @code {.cpp}
+    *this = basic_json(type());
+    @endcode
+
+    @liveexample{The example below shows the effect of `clear()` to different
+    JSON types.,clear}
+
+    @complexity Linear in the size of the JSON value.
+
+    @iterators All iterators, pointers and references related to this container
+               are invalidated.
+
+    @exceptionsafety No-throw guarantee: this function never throws exceptions.
+
+    @sa @ref basic_json(value_t) -- constructor that creates an object with the
+        same value as calling `clear()`
+
+    @since version 1.0.0
+    */
+    void clear() noexcept
+    {
+        // Only the payload is reset; m_type is not written anywhere in this
+        // function, so the value keeps its type and merely loses its
+        // content.
+        switch (m_type)
+        {
+            // --- payloads reached through a pointer: use their own clear() ---
+
+            // object -> {}
+            case value_t::object:
+                m_value.object->clear();
+                break;
+
+            // array -> []
+            case value_t::array:
+                m_value.array->clear();
+                break;
+
+            // binary -> empty byte vector
+            case value_t::binary:
+                m_value.binary->clear();
+                break;
+
+            // string -> ""
+            case value_t::string:
+                m_value.string->clear();
+                break;
+
+            // --- payloads stored directly: overwrite with the zero value ---
+
+            // boolean -> false
+            case value_t::boolean:
+                m_value.boolean = false;
+                break;
+
+            // signed integer -> 0
+            case value_t::number_integer:
+                m_value.number_integer = 0;
+                break;
+
+            // unsigned integer -> 0
+            case value_t::number_unsigned:
+                m_value.number_unsigned = 0;
+                break;
+
+            // floating-point -> 0.0
+            case value_t::number_float:
+                m_value.number_float = 0.0;
+                break;
+
+            // null and any other value type: nothing to reset
+            default:
+                break;
+        }
+    }
+
+    /*!
+    @brief add an object to an array
+
+    Appends the given element @a val to the end of the JSON value. If the
+    function is called on a JSON null value, an empty array is created before
+    appending @a val.
+
+    @param[in] val the value to add to the JSON array
+
+    @throw type_error.308 when called on a type other than JSON array or
+    null; example: `"cannot use push_back() with number"`
+
+    @complexity Amortized constant.
+
+    @liveexample{The example shows how `push_back()` and `+=` can be used to
+    add elements to a JSON array. Note how the `null` value was silently
+    converted to a JSON array.,push_back}
+
+    @since version 1.0.0
+    */
+    void push_back(basic_json&& val)
+    {
+        // appending is only defined for arrays and for null (which becomes one)
+        const bool appendable = is_null() || is_array();
+        if (JSON_HEDLEY_UNLIKELY(!appendable))
+        {
+            JSON_THROW(type_error::create(308, "cannot use push_back() with " + std::string(type_name())));
+        }
+
+        if (is_null())
+        {
+            // start from an empty array
+            m_type = value_t::array;
+            m_value = value_t::array;
+            assert_invariant();
+        }
+
+        // move val into the array; a moved-from basic_json is marked null by its move constructor
+        m_value.array->push_back(std::move(val));
+    }
+
+    /*!
+    @brief add an object to an array
+    @copydoc push_back(basic_json&&)
+    */
+    reference operator+=(basic_json&& val)
+    {
+        push_back(std::move(val));  // type check and null-to-array conversion both happen inside push_back()
+        return *this;  // hand back the container itself, not the appended element
+    }
+
+    /*!
+    @brief add an object to an array
+    @copydoc push_back(basic_json&&)
+    */
+    void push_back(const basic_json& val)
+    {
+        // reject everything that is neither null nor an array
+        if (JSON_HEDLEY_UNLIKELY(!is_null() && !is_array()))
+        {
+            JSON_THROW(type_error::create(308, "cannot use push_back() with " + std::string(type_name())));
+        }
+
+        if (is_null())
+        {
+            // start from an empty array
+            m_type = value_t::array;
+            m_value = value_t::array;
+            assert_invariant();
+        }
+
+        // append a copy of val
+        m_value.array->push_back(val);
+    }
+
+    /*!
+    @brief add an object to an array
+    @copydoc push_back(basic_json&&)
+    */
+    reference operator+=(const basic_json& val)
+    {
+        push_back(val);  // type check and null-to-array conversion both happen inside push_back()
+        return *this;  // hand back the container itself, not the appended element
+    }
+
+    /*!
+    @brief add an object to an object
+
+    Inserts the given element @a val to the JSON object. If the function is
+    called on a JSON null value, an empty object is created before inserting
+    @a val.
+
+    @param[in] val the value to add to the JSON object
+
+    @throw type_error.308 when called on a type other than JSON object or
+    null; example: `"cannot use push_back() with number"`
+
+    @complexity Logarithmic in the size of the container, O(log(`size()`)).
+
+    @liveexample{The example shows how `push_back()` and `+=` can be used to
+    add elements to a JSON object. Note how the `null` value was silently
+    converted to a JSON object.,push_back__object_t__value}
+
+    @since version 1.0.0
+    */
+    void push_back(const typename object_t::value_type& val)
+    {
+        // reject everything that is neither null nor an object
+        if (JSON_HEDLEY_UNLIKELY(!is_null() && !is_object()))
+        {
+            JSON_THROW(type_error::create(308, "cannot use push_back() with " + std::string(type_name())));
+        }
+
+        if (is_null())
+        {
+            // start from an empty object
+            m_type = value_t::object;
+            m_value = value_t::object;
+            assert_invariant();
+        }
+
+        // hand the key/value pair to object_t::insert()
+        m_value.object->insert(val);
+    }
+
+    /*!
+    @brief add an object to an object
+    @copydoc push_back(const typename object_t::value_type&)
+    */
+    reference operator+=(const typename object_t::value_type& val)
+    {
+        push_back(val);  // type check and null-to-object conversion both happen inside push_back()
+        return *this;  // hand back the container itself, not the inserted member
+    }
+
+    /*!
+    @brief add an object to an object
+
+    This function allows to use `push_back` with an initializer list. In case
+
+    1. the current value is an object,
+    2. the initializer list @a init contains only two elements, and
+    3. the first element of @a init is a string,
+
+    @a init is converted into an object element and added using
+    @ref push_back(const typename object_t::value_type&). Otherwise, @a init
+    is converted to a JSON value and added using @ref push_back(basic_json&&).
+
+    @param[in] init  an initializer list
+
+    @complexity Linear in the size of the initializer list @a init.
+
+    @note This function is required to resolve an ambiguous overload error,
+          because pairs like `{"key", "value"}` can be both interpreted as
+          `object_t::value_type` or `std::initializer_list<basic_json>`, see
+          https://github.com/nlohmann/json/issues/235 for more information.
+
+    @liveexample{The example shows how initializer lists are treated as
+    objects when possible.,push_back__initializer_list}
+    */
+    void push_back(initializer_list_t init)
+    {
+        // anything but "object receiving a {string, value} pair" is appended as one JSON value
+        if (!(is_object() && init.size() == 2 && (*init.begin())->is_string()))
+        {
+            push_back(basic_json(init));
+            return;
+        }
+
+        basic_json&& key = init.begin()->moved_or_copied();
+        push_back(typename object_t::value_type(
+                      std::move(key.get_ref<string_t&>()), (init.begin() + 1)->moved_or_copied()));
+    }
+
+    /*!
+    @brief add an object to an object
+    @copydoc push_back(initializer_list_t)
+    */
+    reference operator+=(initializer_list_t init)
+    {
+        push_back(init);  // push_back() decides whether init becomes an object member or a single JSON value
+        return *this;  // hand back the container itself, not the added element
+    }
+
+    /*!
+    @brief add an object to an array
+
+    Creates a JSON value from the passed parameters @a args and appends it to
+    the end of the JSON value. If the function is called on a JSON null value,
+    an empty array is created before appending the value created from @a args.
+
+    @param[in] args arguments to forward to a constructor of @ref basic_json
+    @tparam Args compatible types to create a @ref basic_json object
+
+    @return reference to the inserted element
+
+    @throw type_error.311 when called on a type other than JSON array or
+    null; example: `"cannot use emplace_back() with number"`
+
+    @complexity Amortized constant.
+
+    @liveexample{The example shows how `emplace_back()` can be used to add
+    elements to a JSON array. Note how the `null` value was silently converted
+    to a JSON array.,emplace_back}
+
+    @since version 2.0.8, returns reference since 3.7.0
+    */
+    template<class... Args>
+    reference emplace_back(Args&& ... args)
+    {
+        // reject everything that is neither null nor an array
+        if (JSON_HEDLEY_UNLIKELY(!is_null() && !is_array()))
+        {
+            JSON_THROW(type_error::create(311, "cannot use emplace_back() with " + std::string(type_name())));
+        }
+
+        if (is_null())
+        {
+            // start from an empty array
+            m_type = value_t::array;
+            m_value = value_t::array;
+            assert_invariant();
+        }
+
+        // construct the new element in place from the forwarded arguments
+#ifdef JSON_HAS_CPP_17
+        return m_value.array->emplace_back(std::forward<Args>(args)...);
+#else
+        m_value.array->emplace_back(std::forward<Args>(args)...);
+        return m_value.array->back();
+#endif
+    }
+
+    /*!
+    @brief add an object to an object if key does not exist
+
+    Inserts a new element into a JSON object constructed in-place with the
+    given @a args if there is no element with the key in the container. If the
+    function is called on a JSON null value, an empty object is created before
+    appending the value created from @a args.
+
+    @param[in] args arguments to forward to a constructor of @ref basic_json
+    @tparam Args compatible types to create a @ref basic_json object
+
+    @return a pair consisting of an iterator to the inserted element, or the
+            already-existing element if no insertion happened, and a bool
+            denoting whether the insertion took place.
+
+    @throw type_error.311 when called on a type other than JSON object or
+    null; example: `"cannot use emplace() with number"`
+
+    @complexity Logarithmic in the size of the container, O(log(`size()`)).
+
+    @liveexample{The example shows how `emplace()` can be used to add elements
+    to a JSON object. Note how the `null` value was silently converted to a
+    JSON object. Further note how no value is added if there was already one
+    value stored with the same key.,emplace}
+
+    @since version 2.0.8
+    */
+    template<class... Args>
+    std::pair<iterator, bool> emplace(Args&& ... args)
+    {
+        // reject everything that is neither null nor an object
+        if (JSON_HEDLEY_UNLIKELY(!is_null() && !is_object()))
+        {
+            JSON_THROW(type_error::create(311, "cannot use emplace() with " + std::string(type_name())));
+        }
+
+        if (is_null())
+        {
+            // start from an empty object
+            m_type = value_t::object;
+            m_value = value_t::object;
+            assert_invariant();
+        }
+
+        // construct the member in place from the forwarded arguments
+        auto outcome = m_value.object->emplace(std::forward<Args>(args)...);
+
+        // begin() only serves to obtain an iterator bound to *this; it is
+        // then re-pointed at the member reported by object_t::emplace()
+        auto member = begin();
+        member.m_it.object_iterator = outcome.first;
+        return {member, outcome.second};
+    }
+
+    /// Helper for insertion of an iterator
+    /// @note: This uses std::distance to support GCC 4.8,
+    ///        see https://github.com/nlohmann/json/pull/1257
+    template<typename... Args>
+    iterator insert_iterator(const_iterator pos, Args&& ... args)
+    {
+        JSON_ASSERT(m_value.array != nullptr);
+
+        // The obvious `it = array->insert(...)` is not an option: the return
+        // value of insert is missing in GCC 4.8. So pos is remembered as an
+        // offset from begin() before inserting and re-applied afterwards.
+        const auto offset = std::distance(m_value.array->begin(), pos.m_it.array_iterator);
+        m_value.array->insert(pos.m_it.array_iterator, std::forward<Args>(args)...);
+
+        iterator inserted(this);
+        inserted.m_it.array_iterator = m_value.array->begin() + offset;
+
+        return inserted;
+    }
+
+    /*!
+    @brief inserts element
+
+    Inserts element @a val before iterator @a pos.
+
+    @param[in] pos iterator before which the content will be inserted; may be
+    the end() iterator
+    @param[in] val element to insert
+    @return iterator pointing to the inserted @a val.
+
+    @throw type_error.309 if called on JSON values other than arrays;
+    example: `"cannot use insert() with string"`
+    @throw invalid_iterator.202 if @a pos is not an iterator of *this;
+    example: `"iterator does not fit current value"`
+
+    @complexity Constant plus linear in the distance between @a pos and end of
+    the container.
+
+    @liveexample{The example shows how `insert()` is used.,insert}
+
+    @since version 1.0.0
+    */
+    iterator insert(const_iterator pos, const basic_json& val)
+    {
+        // positional insertion is an array-only operation
+        if (JSON_HEDLEY_UNLIKELY(!is_array()))
+        {
+            JSON_THROW(type_error::create(309, "cannot use insert() with " + std::string(type_name())));
+        }
+
+        // pos has to be an iterator into this very value
+        if (JSON_HEDLEY_UNLIKELY(pos.m_object != this))
+        {
+            JSON_THROW(invalid_iterator::create(202, "iterator does not fit current value"));
+        }
+
+        // delegate the actual insertion and iterator construction
+        return insert_iterator(pos, val);
+    }
+
+    /*!
+    @brief inserts element
+    @copydoc insert(const_iterator, const basic_json&)
+    */
+    iterator insert(const_iterator pos, basic_json&& val)
+    {
+        return insert(pos, val);  // NOTE(review): val is a named lvalue here, so the const basic_json& overload runs and val is copied, not moved
+    }
+
+    /*!
+    @brief inserts elements
+
+    Inserts @a cnt copies of @a val before iterator @a pos.
+
+    @param[in] pos iterator before which the content will be inserted; may be
+    the end() iterator
+    @param[in] cnt number of copies of @a val to insert
+    @param[in] val element to insert
+    @return iterator pointing to the first element inserted, or @a pos if
+    `cnt==0`
+
+    @throw type_error.309 if called on JSON values other than arrays; example:
+    `"cannot use insert() with string"`
+    @throw invalid_iterator.202 if @a pos is not an iterator of *this;
+    example: `"iterator does not fit current value"`
+
+    @complexity Linear in @a cnt plus linear in the distance between @a pos
+    and end of the container.
+
+    @liveexample{The example shows how `insert()` is used.,insert__count}
+
+    @since version 1.0.0
+    */
+    iterator insert(const_iterator pos, size_type cnt, const basic_json& val)
+    {
+        // positional insertion is an array-only operation
+        if (JSON_HEDLEY_UNLIKELY(!is_array()))
+        {
+            JSON_THROW(type_error::create(309, "cannot use insert() with " + std::string(type_name())));
+        }
+
+        // pos has to be an iterator into this very value
+        if (JSON_HEDLEY_UNLIKELY(pos.m_object != this))
+        {
+            JSON_THROW(invalid_iterator::create(202, "iterator does not fit current value"));
+        }
+
+        // delegate the insertion of the cnt copies and the iterator construction
+        return insert_iterator(pos, cnt, val);
+    }
+
+    /*!
+    @brief inserts elements
+
+    Inserts elements from range `[first, last)` before iterator @a pos.
+
+    @param[in] pos iterator before which the content will be inserted; may be
+    the end() iterator
+    @param[in] first begin of the range of elements to insert
+    @param[in] last end of the range of elements to insert
+
+    @throw type_error.309 if called on JSON values other than arrays; example:
+    `"cannot use insert() with string"`
+    @throw invalid_iterator.202 if @a pos is not an iterator of *this;
+    example: `"iterator does not fit current value"`
+    @throw invalid_iterator.210 if @a first and @a last do not belong to the
+    same JSON value; example: `"iterators do not fit"`
+    @throw invalid_iterator.211 if @a first or @a last are iterators into
+    container for which insert is called; example: `"passed iterators may not
+    belong to container"`
+
+    @return iterator pointing to the first element inserted, or @a pos if
+    `first==last`
+
+    @complexity Linear in `std::distance(first, last)` plus linear in the
+    distance between @a pos and end of the container.
+
+    @liveexample{The example shows how `insert()` is used.,insert__range}
+
+    @since version 1.0.0
+    */
+    iterator insert(const_iterator pos, const_iterator first, const_iterator last)
+    {
+        if (JSON_HEDLEY_UNLIKELY(!is_array()))  // positional insertion is array-only
+        {
+            JSON_THROW(type_error::create(309, "cannot use insert() with " + std::string(type_name())));
+        }
+
+        const bool pos_is_ours = (pos.m_object == this);
+        if (JSON_HEDLEY_UNLIKELY(!pos_is_ours))
+        {
+            JSON_THROW(invalid_iterator::create(202, "iterator does not fit current value"));
+        }
+
+        // the source range must come from a single JSON value ...
+        const bool same_source = (first.m_object == last.m_object);
+        if (JSON_HEDLEY_UNLIKELY(!same_source))
+        {
+            JSON_THROW(invalid_iterator::create(210, "iterators do not fit"));
+        }
+
+        // ... and that value must not be the one being inserted into
+        if (JSON_HEDLEY_UNLIKELY(first.m_object == this))
+        {
+            JSON_THROW(invalid_iterator::create(211, "passed iterators may not belong to container"));
+        }
+
+        return insert_iterator(pos, first.m_it.array_iterator, last.m_it.array_iterator);
+    }
+
+    /*!
+    @brief inserts elements
+
+    Inserts elements from initializer list @a ilist before iterator @a pos.
+
+    @param[in] pos iterator before which the content will be inserted; may be
+    the end() iterator
+    @param[in] ilist initializer list to insert the values from
+
+    @throw type_error.309 if called on JSON values other than arrays; example:
+    `"cannot use insert() with string"`
+    @throw invalid_iterator.202 if @a pos is not an iterator of *this;
+    example: `"iterator does not fit current value"`
+
+    @return iterator pointing to the first element inserted, or @a pos if
+    `ilist` is empty
+
+    @complexity Linear in `ilist.size()` plus linear in the distance between
+    @a pos and end of the container.
+
+    @liveexample{The example shows how `insert()` is used.,insert__ilist}
+
+    @since version 1.0.0
+    */
+    iterator insert(const_iterator pos, initializer_list_t ilist)
+    {
+        if (JSON_HEDLEY_UNLIKELY(!is_array()))  // positional insertion is array-only
+        {
+            JSON_THROW(type_error::create(309, "cannot use insert() with " + std::string(type_name())));
+        }
+
+        // pos has to be an iterator into this very value
+        const bool pos_is_ours = (pos.m_object == this);
+        if (JSON_HEDLEY_UNLIKELY(!pos_is_ours))
+        {
+            JSON_THROW(invalid_iterator::create(202, "iterator does not fit current value"));
+        }
+
+        // the list's own [begin, end) range is handed on for insertion
+        return insert_iterator(pos, ilist.begin(), ilist.end());
+    }
+
+    /*!
+    @brief inserts elements
+
+    Inserts elements from range `[first, last)`.
+
+    @param[in] first begin of the range of elements to insert
+    @param[in] last end of the range of elements to insert
+
+    @throw type_error.309 if called on JSON values other than objects; example:
+    `"cannot use insert() with string"`
+    @throw invalid_iterator.202 if iterator @a first or @a last does not
+    point to an object; example: `"iterators first and last must point to
+    objects"`
+    @throw invalid_iterator.210 if @a first and @a last do not belong to the
+    same JSON value; example: `"iterators do not fit"`
+
+    @complexity Logarithmic: `O(N*log(size() + N))`, where `N` is the number
+    of elements to insert.
+
+    @liveexample{The example shows how `insert()` is used.,insert__range_object}
+
+    @since version 3.0.0
+    */
+    void insert(const_iterator first, const_iterator last)
+    {
+        if (JSON_HEDLEY_UNLIKELY(!is_object()))  // range insertion without pos is object-only
+        {
+            JSON_THROW(type_error::create(309, "cannot use insert() with " + std::string(type_name())));
+        }
+
+        // the source range must come from a single JSON value ...
+        const bool same_source = (first.m_object == last.m_object);
+        if (JSON_HEDLEY_UNLIKELY(!same_source))
+        {
+            JSON_THROW(invalid_iterator::create(210, "iterators do not fit"));
+        }
+
+        // ... and that value has to be an object as well
+        if (JSON_HEDLEY_UNLIKELY(!first.m_object->is_object()))
+        {
+            JSON_THROW(invalid_iterator::create(202, "iterators first and last must point to objects"));
+        }
+
+        m_value.object->insert(first.m_it.object_iterator, last.m_it.object_iterator);
+    }
+
+    /*!
+    @brief updates a JSON object from another object, overwriting existing keys
+
+    Inserts all values from JSON object @a j and overwrites existing keys.
+
+    @param[in] j  JSON object to read values from
+
+    @throw type_error.312 if called on JSON values other than objects; example:
+    `"cannot use update() with string"`
+
+    @complexity O(N*log(size() + N)), where N is the number of elements to
+                insert.
+
+    @liveexample{The example shows how `update()` is used.,update}
+
+    @sa https://docs.python.org/3.6/library/stdtypes.html#dict.update
+
+    @since version 3.0.0
+    */
+    void update(const_reference j)
+    {
+        // validate first so that a failed call cannot turn a null *this into {}
+        if (JSON_HEDLEY_UNLIKELY(!(is_null() || is_object())))
+        {
+            JSON_THROW(type_error::create(312, "cannot use update() with " + std::string(type_name())));
+        }
+        // j must be an object (a null *this updated with itself is let through: it becomes {} below)
+        if (JSON_HEDLEY_UNLIKELY(!j.is_object() && &j != this))
+        {
+            JSON_THROW(type_error::create(312, "cannot use update() with " + std::string(j.type_name())));
+        }
+
+        if (is_null())  // implicitly convert null value to an empty object
+        {
+            m_type = value_t::object;
+            m_value.object = create<object_t>();
+            assert_invariant();
+        }
+        for (auto it = j.cbegin(); it != j.cend(); ++it)
+        {
+            m_value.object->operator[](it.key()) = it.value();  // inserts a new key or overwrites an existing one
+        }
+    }
+
+    /*!
+    @brief updates a JSON object from another object, overwriting existing keys
+
+    Inserts all values from range `[first, last)` and overwrites existing
+    keys.
+
+    @param[in] first begin of the range of elements to insert
+    @param[in] last end of the range of elements to insert
+
+    @throw type_error.312 if called on JSON values other than objects; example:
+    `"cannot use update() with string"`
+    @throw invalid_iterator.202 if iterator @a first or @a last does not
+    point to an object; example: `"iterators first and last must point to
+    objects"`
+    @throw invalid_iterator.210 if @a first and @a last do not belong to the
+    same JSON value; example: `"iterators do not fit"`
+
+    @complexity O(N*log(size() + N)), where N is the number of elements to
+                insert.
+
+    @liveexample{The example shows how `update()` is used.,update__range}
+
+    @sa https://docs.python.org/3.6/library/stdtypes.html#dict.update
+
+    @since version 3.0.0
+    */
+    void update(const_iterator first, const_iterator last)
+    {
+        // validate everything before touching *this, so a failed call leaves a null value null
+        if (JSON_HEDLEY_UNLIKELY(!is_null() && !is_object()))
+        {
+            JSON_THROW(type_error::create(312, "cannot use update() with " + std::string(type_name())));
+        }
+
+        // check if range iterators belong to the same JSON object
+        if (JSON_HEDLEY_UNLIKELY(first.m_object != last.m_object))
+        {
+            JSON_THROW(invalid_iterator::create(210, "iterators do not fit"));
+        }
+
+        // passed iterators must belong to objects; first and last share m_object here, one test suffices
+        if (JSON_HEDLEY_UNLIKELY(!first.m_object->is_object()))
+        {
+            JSON_THROW(invalid_iterator::create(202, "iterators first and last must point to objects"));
+        }
+
+        // implicitly convert null value to an empty object (only reached once all checks passed)
+        if (is_null())
+        {
+            m_type = value_t::object;
+            m_value.object = create<object_t>();
+            assert_invariant();
+        }
+
+        for (auto it = first; it != last; ++it)
+        {
+            m_value.object->operator[](it.key()) = it.value();
+        }
+    }
+
+    /*!
+    @brief exchanges the values
+
+    Exchanges the contents of the JSON value with those of @a other. Does not
+    invoke any move, copy, or swap operations on individual elements. All
+    iterators and references remain valid. The past-the-end iterator is
+    invalidated.
+
+    @param[in,out] other JSON value to exchange the contents with
+
+    @complexity Constant.
+
+    @liveexample{The example below shows how JSON values can be swapped with
+    `swap()`.,swap__reference}
+
+    @since version 1.0.0
+    */
+    void swap(reference other) noexcept (  // non-throwing only if value_t and json_value move without throwing
+        std::is_nothrow_move_constructible<value_t>::value&&
+        std::is_nothrow_move_assignable<value_t>::value&&
+        std::is_nothrow_move_constructible<json_value>::value&&
+        std::is_nothrow_move_assignable<json_value>::value
+    )
+    {
+        std::swap(m_type, other.m_type);    // exchange the type tags
+        std::swap(m_value, other.m_value);  // exchange the stored json_value members as a whole
+        assert_invariant();                 // invariant assertion is run on *this only, not on other
+    }
+
+    /*!
+    @brief exchanges the values
+
+    Exchanges the contents of the JSON value from @a left with those of @a right. Does not
+    invoke any move, copy, or swap operations on individual elements. All
+    iterators and references remain valid. The past-the-end iterator is
+    invalidated. Implemented as a friend function callable via ADL.
+
+    @param[in,out] left JSON value to exchange the contents with
+    @param[in,out] right JSON value to exchange the contents with
+
+    @complexity Constant.
+
+    @liveexample{The example below shows how JSON values can be swapped with
+    `swap()`.,swap__reference}
+
+    @since version 1.0.0
+    */
+    friend void swap(reference left, reference right) noexcept (  // same noexcept condition as the member swap(reference)
+        std::is_nothrow_move_constructible<value_t>::value&&
+        std::is_nothrow_move_assignable<value_t>::value&&
+        std::is_nothrow_move_constructible<json_value>::value&&
+        std::is_nothrow_move_assignable<json_value>::value
+    )
+    {
+        left.swap(right);  // forward to the member swap; this overload only adds the free-function spelling
+    }
+
+    /*!
+    @brief exchanges the values
+
+    Exchanges the contents of a JSON array with those of @a other. Does not
+    invoke any move, copy, or swap operations on individual elements. All
+    iterators and references remain valid. The past-the-end iterator is
+    invalidated.
+
+    @param[in,out] other array to exchange the contents with
+
+    @throw type_error.310 when JSON value is not an array; example: `"cannot
+    use swap() with string"`
+
+    @complexity Constant.
+
+    @liveexample{The example below shows how arrays can be swapped with
+    `swap()`.,swap__array_t}
+
+    @since version 1.0.0
+    */
+    void swap(array_t& other)
+    {
+        // only a JSON array may exchange storage with an array_t; every other type is rejected below
+        if (JSON_HEDLEY_LIKELY(is_array()))
+        {
+            std::swap(*(m_value.array), other);  // trade the stored array_t object with the caller's
+        }
+        else
+        {
+            JSON_THROW(type_error::create(310, "cannot use swap() with " + std::string(type_name())));
+        }
+    }
+
+    /*!
+    @brief exchanges the values
+
+    Exchanges the contents of a JSON object with those of @a other. Does not
+    invoke any move, copy, or swap operations on individual elements. All
+    iterators and references remain valid. The past-the-end iterator is
+    invalidated.
+
+    @param[in,out] other object to exchange the contents with
+
+    @throw type_error.310 when JSON value is not an object; example:
+    `"cannot use swap() with string"`
+
+    @complexity Constant.
+
+    @liveexample{The example below shows how objects can be swapped with
+    `swap()`.,swap__object_t}
+
+    @since version 1.0.0
+    */
+    void swap(object_t& other)
+    {
+        // only a JSON object may exchange storage with an object_t; every other type is rejected below
+        if (JSON_HEDLEY_LIKELY(is_object()))
+        {
+            std::swap(*(m_value.object), other);  // trade the stored object_t object with the caller's
+        }
+        else
+        {
+            JSON_THROW(type_error::create(310, "cannot use swap() with " + std::string(type_name())));
+        }
+    }
+
+    /*!
+    @brief exchanges the values
+
+    Exchanges the contents of a JSON string with those of @a other. Does not
+    invoke any move, copy, or swap operations on individual elements. All
+    iterators and references remain valid. The past-the-end iterator is
+    invalidated.
+
+    @param[in,out] other string to exchange the contents with
+
+    @throw type_error.310 when JSON value is not a string; example: `"cannot
+    use swap() with boolean"`
+
+    @complexity Constant.
+
+    @liveexample{The example below shows how strings can be swapped with
+    `swap()`.,swap__string_t}
+
+    @since version 1.0.0
+    */
+    void swap(string_t& other)
+    {
+        // only a JSON string may exchange storage with a string_t; every other type is rejected below
+        if (JSON_HEDLEY_LIKELY(is_string()))
+        {
+            std::swap(*(m_value.string), other);  // trade the stored string_t object with the caller's
+        }
+        else
+        {
+            JSON_THROW(type_error::create(310, "cannot use swap() with " + std::string(type_name())));
+        }
+    }
+
+    /*!
+    @brief exchanges the values
+
+    Exchanges the contents of a JSON binary value with those of @a other. Does not
+    invoke any move, copy, or swap operations on individual elements. All
+    iterators and references remain valid. The past-the-end iterator is
+    invalidated.
+
+    @param[in,out] other binary to exchange the contents with
+
+    @throw type_error.310 when JSON value is not binary; example: `"cannot
+    use swap() with boolean"`
+
+    @complexity Constant.
+
+    @liveexample{The example below shows how binary values can be swapped with
+    `swap()`.,swap__binary_t}
+
+    @since version 3.8.0
+    */
+    void swap(binary_t& other)
+    {
+        // swap only works for binary values; every other type is rejected below
+        if (JSON_HEDLEY_LIKELY(is_binary()))
+        {
+            std::swap(*(m_value.binary), other);  // trade the stored binary_t object with the caller's
+        }
+        else
+        {
+            JSON_THROW(type_error::create(310, "cannot use swap() with " + std::string(type_name())));
+        }
+    }
+
+    /// @copydoc swap(binary_t&)
+    void swap(typename binary_t::container_type& other)
+    {
+        // swap only works for binary values; here the caller passes the raw container_type
+        if (JSON_HEDLEY_LIKELY(is_binary()))
+        {
+            std::swap(*(m_value.binary), other);  // NOTE(review): presumably only the container part is exchanged - confirm
+        }
+        else
+        {
+            JSON_THROW(type_error::create(310, "cannot use swap() with " + std::string(type_name())));
+        }
+    }
+
+    /// @}
+
+  public:
+    //////////////////////////////////////////
+    // lexicographical comparison operators //
+    //////////////////////////////////////////
+
+    /// @name lexicographical comparison operators
+    /// @{
+
+    /*!
+    @brief comparison: equal
+
+    Compares two JSON values for equality according to the following rules:
+    - Two JSON values are equal if (1) they are from the same type and (2)
+      their stored values are the same according to their respective
+      `operator==`.
+    - Integer and floating-point numbers are automatically converted before
+      comparison. Note that two NaN values are always treated as unequal.
+    - Two JSON null values are equal.
+
+    @note Floating-point numbers inside JSON values are compared with
+    `json::number_float_t::operator==` which is `double::operator==` by
+    default. To compare floating-point while respecting an epsilon, an alternative
+    [comparison function](https://github.com/mariokonrad/marnav/blob/master/include/marnav/math/floatingpoint.hpp#L34-#L39)
+    could be used, for instance
+    @code {.cpp}
+    template<typename T, typename = typename std::enable_if<std::is_floating_point<T>::value, T>::type>
+    inline bool is_same(T a, T b, T epsilon = std::numeric_limits<T>::epsilon()) noexcept
+    {
+        return std::abs(a - b) <= epsilon;
+    }
+    @endcode
+    Or you can define your own equality function like this:
+    @code {.cpp}
+    bool my_equal(const_reference lhs, const_reference rhs) {
+    const auto lhs_type = lhs.type();
+    const auto rhs_type = rhs.type();
+    if (lhs_type == rhs_type) {
+        switch(lhs_type)
+            // self_defined case
+            case value_t::number_float:
+                return std::abs(lhs - rhs) <= std::numeric_limits<float>::epsilon();
+            // other cases remain the same with the original
+            ...
+    }
+    ...
+    }
+    @endcode
+
+    @note NaN values never compare equal to themselves or to other NaN values.
+
+    @param[in] lhs  first JSON value to consider
+    @param[in] rhs  second JSON value to consider
+    @return whether the values @a lhs and @a rhs are equal
+
+    @exceptionsafety No-throw guarantee: this function never throws exceptions.
+
+    @complexity Linear.
+
+    @liveexample{The example demonstrates comparing several JSON
+    types.,operator__equal}
+
+    @since version 1.0.0
+    */
+    friend bool operator==(const_reference lhs, const_reference rhs) noexcept
+    {
+        const auto lhs_type = lhs.type();
+        const auto rhs_type = rhs.type();
+        if (lhs_type == rhs_type)
+        {   // same type: defer to the stored value's own operator==
+            switch (lhs_type)
+            {
+                case value_t::array:
+                    return *lhs.m_value.array == *rhs.m_value.array;
+
+                case value_t::object:
+                    return *lhs.m_value.object == *rhs.m_value.object;
+
+                case value_t::null:
+                    return true;
+
+                case value_t::string:
+                    return *lhs.m_value.string == *rhs.m_value.string;
+
+                case value_t::boolean:
+                    return lhs.m_value.boolean == rhs.m_value.boolean;
+
+                case value_t::number_integer:
+                    return lhs.m_value.number_integer == rhs.m_value.number_integer;
+
+                case value_t::number_unsigned:
+                    return lhs.m_value.number_unsigned == rhs.m_value.number_unsigned;
+
+                case value_t::number_float:
+                    return lhs.m_value.number_float == rhs.m_value.number_float;
+
+                case value_t::binary:
+                    return *lhs.m_value.binary == *rhs.m_value.binary;
+
+                default:
+                    return false;
+            }
+        }
+        else if (lhs_type == value_t::number_integer && rhs_type == value_t::number_float)
+        {   // integer vs. float: the integer side is converted to number_float_t
+            return static_cast<number_float_t>(lhs.m_value.number_integer) == rhs.m_value.number_float;
+        }
+        else if (lhs_type == value_t::number_float && rhs_type == value_t::number_integer)
+        {
+            return lhs.m_value.number_float == static_cast<number_float_t>(rhs.m_value.number_integer);
+        }
+        else if (lhs_type == value_t::number_unsigned && rhs_type == value_t::number_float)
+        {
+            return static_cast<number_float_t>(lhs.m_value.number_unsigned) == rhs.m_value.number_float;
+        }
+        else if (lhs_type == value_t::number_float && rhs_type == value_t::number_unsigned)
+        {
+            return lhs.m_value.number_float == static_cast<number_float_t>(rhs.m_value.number_unsigned);
+        }
+        else if (lhs_type == value_t::number_unsigned && rhs_type == value_t::number_integer)
+        {   // a negative integer never equals an unsigned value; otherwise compare in unsigned space
+            return rhs.m_value.number_integer >= 0
+                   && lhs.m_value.number_unsigned == static_cast<number_unsigned_t>(rhs.m_value.number_integer);
+        }
+        else if (lhs_type == value_t::number_integer && rhs_type == value_t::number_unsigned)
+        {   // mirror of the branch above; casting the unsigned side to signed would wrap large values
+            return lhs.m_value.number_integer >= 0
+                   && static_cast<number_unsigned_t>(lhs.m_value.number_integer) == rhs.m_value.number_unsigned;
+        }
+        return false;  // every other combination of different types is unequal
+    }
+
+    /*!
+    @brief comparison: equal
+    @copydoc operator==(const_reference, const_reference)
+    */
+    template<typename ScalarType, typename std::enable_if<
+                 std::is_scalar<ScalarType>::value, int>::type = 0>
+    friend bool operator==(const_reference lhs, const ScalarType rhs) noexcept
+    {
+        return lhs == basic_json(rhs);  // wrap the scalar in a temporary JSON value, then compare JSON to JSON
+    }
+
+    /*!
+    @brief comparison: equal
+    @copydoc operator==(const_reference, const_reference)
+    */
+    template<typename ScalarType, typename std::enable_if<
+                 std::is_scalar<ScalarType>::value, int>::type = 0>
+    friend bool operator==(const ScalarType lhs, const_reference rhs) noexcept
+    {
+        return basic_json(lhs) == rhs;  // wrap the scalar in a temporary JSON value, then compare JSON to JSON
+    }
+
+    /*!
+    @brief comparison: not equal
+
+    Compares two JSON values for inequality by calculating `not (lhs == rhs)`.
+
+    @param[in] lhs  first JSON value to consider
+    @param[in] rhs  second JSON value to consider
+    @return whether the values @a lhs and @a rhs are not equal
+
+    @complexity Linear.
+
+    @exceptionsafety No-throw guarantee: this function never throws exceptions.
+
+    @liveexample{The example demonstrates comparing several JSON
+    types.,operator__notequal}
+
+    @since version 1.0.0
+    */
+    friend bool operator!=(const_reference lhs, const_reference rhs) noexcept
+    {
+        return !(lhs == rhs);  // defined purely as the negation of operator==
+    }
+
+    /*!
+    @brief comparison: not equal
+    @copydoc operator!=(const_reference, const_reference)
+    */
+    template<typename ScalarType, typename std::enable_if<
+                 std::is_scalar<ScalarType>::value, int>::type = 0>
+    friend bool operator!=(const_reference lhs, const ScalarType rhs) noexcept
+    {
+        return lhs != basic_json(rhs);  // wrap the scalar in a temporary JSON value, then compare JSON to JSON
+    }
+
+    /*!
+    @brief comparison: not equal
+    @copydoc operator!=(const_reference, const_reference)
+    */
+    template<typename ScalarType, typename std::enable_if<
+                 std::is_scalar<ScalarType>::value, int>::type = 0>
+    friend bool operator!=(const ScalarType lhs, const_reference rhs) noexcept
+    {
+        return basic_json(lhs) != rhs;  // wrap the scalar in a temporary JSON value, then compare JSON to JSON
+    }
+
+    /*!
+    @brief comparison: less than
+
+    Compares whether one JSON value @a lhs is less than another JSON value @a
+    rhs according to the following rules:
+    - If @a lhs and @a rhs have the same type, the values are compared using
+      the default `<` operator.
+    - Integer and floating-point numbers are automatically converted before
+      comparison
+    - In case @a lhs and @a rhs have different types, the values are ignored
+      and the order of the types is considered, see
+      @ref operator<(const value_t, const value_t).
+
+    @param[in] lhs  first JSON value to consider
+    @param[in] rhs  second JSON value to consider
+    @return whether @a lhs is less than @a rhs
+
+    @complexity Linear.
+
+    @exceptionsafety No-throw guarantee: this function never throws exceptions.
+
+    @liveexample{The example demonstrates comparing several JSON
+    types.,operator__less}
+
+    @since version 1.0.0
+    */
+    friend bool operator<(const_reference lhs, const_reference rhs) noexcept
+    {
+        const auto lhs_type = lhs.type();
+        const auto rhs_type = rhs.type();
+        if (lhs_type == rhs_type)
+        {   // same type: defer to the stored value's own operator<
+            switch (lhs_type)
+            {
+                case value_t::array:
+                    // note parentheses are necessary, see
+                    // https://github.com/nlohmann/json/issues/1530
+                    return (*lhs.m_value.array) < (*rhs.m_value.array);
+
+                case value_t::object:
+                    return (*lhs.m_value.object) < (*rhs.m_value.object);
+
+                case value_t::null:
+                    return false;
+
+                case value_t::string:
+                    return (*lhs.m_value.string) < (*rhs.m_value.string);
+
+                case value_t::boolean:
+                    return (lhs.m_value.boolean) < (rhs.m_value.boolean);
+
+                case value_t::number_integer:
+                    return (lhs.m_value.number_integer) < (rhs.m_value.number_integer);
+
+                case value_t::number_unsigned:
+                    return (lhs.m_value.number_unsigned) < (rhs.m_value.number_unsigned);
+
+                case value_t::number_float:
+                    return (lhs.m_value.number_float) < (rhs.m_value.number_float);
+
+                case value_t::binary:
+                    return (*lhs.m_value.binary) < (*rhs.m_value.binary);
+
+                default:
+                    return false;
+            }
+        }
+        else if (lhs_type == value_t::number_integer && rhs_type == value_t::number_float)
+        {   // integer vs. float: the integer side is converted to number_float_t
+            return static_cast<number_float_t>(lhs.m_value.number_integer) < rhs.m_value.number_float;
+        }
+        else if (lhs_type == value_t::number_float && rhs_type == value_t::number_integer)
+        {
+            return lhs.m_value.number_float < static_cast<number_float_t>(rhs.m_value.number_integer);
+        }
+        else if (lhs_type == value_t::number_unsigned && rhs_type == value_t::number_float)
+        {
+            return static_cast<number_float_t>(lhs.m_value.number_unsigned) < rhs.m_value.number_float;
+        }
+        else if (lhs_type == value_t::number_float && rhs_type == value_t::number_unsigned)
+        {
+            return lhs.m_value.number_float < static_cast<number_float_t>(rhs.m_value.number_unsigned);
+        }
+        else if (lhs_type == value_t::number_integer && rhs_type == value_t::number_unsigned)
+        {   // a negative integer is below every unsigned value; otherwise compare in unsigned space
+            return lhs.m_value.number_integer < 0
+                   || static_cast<number_unsigned_t>(lhs.m_value.number_integer) < rhs.m_value.number_unsigned;
+        }
+        else if (lhs_type == value_t::number_unsigned && rhs_type == value_t::number_integer)
+        {   // an unsigned value is never below a negative integer; casting it to signed would wrap large values
+            return rhs.m_value.number_integer >= 0
+                   && lhs.m_value.number_unsigned < static_cast<number_unsigned_t>(rhs.m_value.number_integer);
+        }
+        // We only reach this line if we cannot compare values. In that case,
+        // we compare types. Note we have to call the operator explicitly,
+        // because MSVC has problems otherwise.
+        return operator<(lhs_type, rhs_type);
+    }
+
+    /*!
+    @brief comparison: less than
+    @copydoc operator<(const_reference, const_reference)
+    */
+    template<typename ScalarType, typename std::enable_if<
+                 std::is_scalar<ScalarType>::value, int>::type = 0>
+    friend bool operator<(const_reference lhs, const ScalarType rhs) noexcept
+    {
+        return lhs < basic_json(rhs);  // wrap the scalar in a temporary JSON value, then compare JSON to JSON
+    }
+
+    /*!
+    @brief comparison: less than
+    @copydoc operator<(const_reference, const_reference)
+    */
+    template<typename ScalarType, typename std::enable_if<
+                 std::is_scalar<ScalarType>::value, int>::type = 0>
+    friend bool operator<(const ScalarType lhs, const_reference rhs) noexcept
+    {
+        return basic_json(lhs) < rhs;  // wrap the scalar in a temporary JSON value, then compare JSON to JSON
+    }
+
+    /*!
+    @brief comparison: less than or equal
+
+    Compares whether one JSON value @a lhs is less than or equal to another
+    JSON value by calculating `not (rhs < lhs)`.
+
+    @param[in] lhs  first JSON value to consider
+    @param[in] rhs  second JSON value to consider
+    @return whether @a lhs is less than or equal to @a rhs
+
+    @complexity Linear.
+
+    @exceptionsafety No-throw guarantee: this function never throws exceptions.
+
+    @liveexample{The example demonstrates comparing several JSON
+    types.,operator__lessequal}
+
+    @since version 1.0.0
+    */
+    friend bool operator<=(const_reference lhs, const_reference rhs) noexcept
+    {
+        return !(rhs < lhs);  // "lhs <= rhs" is expressed as "rhs is not less than lhs"; true whenever rhs < lhs is false
+    }
+
+    /*!
+    @brief comparison: less than or equal
+    @copydoc operator<=(const_reference, const_reference)
+    */
+    template<typename ScalarType, typename std::enable_if<
+                 std::is_scalar<ScalarType>::value, int>::type = 0>
+    friend bool operator<=(const_reference lhs, const ScalarType rhs) noexcept
+    {
+        return lhs <= basic_json(rhs);  // wrap the scalar in a temporary JSON value, then compare JSON to JSON
+    }
+
+    /*!
+    @brief comparison: less than or equal
+    @copydoc operator<=(const_reference, const_reference)
+    */
+    template<typename ScalarType, typename std::enable_if<
+                 std::is_scalar<ScalarType>::value, int>::type = 0>
+    friend bool operator<=(const ScalarType lhs, const_reference rhs) noexcept
+    {
+        return basic_json(lhs) <= rhs;  // wrap the scalar in a temporary JSON value, then compare JSON to JSON
+    }
+
+    /*!
+    @brief comparison: greater than
+
+    Compares whether one JSON value @a lhs is greater than another
+    JSON value by calculating `not (lhs <= rhs)`.
+
+    @param[in] lhs  first JSON value to consider
+    @param[in] rhs  second JSON value to consider
+    @return whether @a lhs is greater than @a rhs
+
+    @complexity Linear.
+
+    @exceptionsafety No-throw guarantee: this function never throws exceptions.
+
+    @liveexample{The example demonstrates comparing several JSON
+    types.,operator__greater}
+
+    @since version 1.0.0
+    */
+    friend bool operator>(const_reference lhs, const_reference rhs) noexcept
+    {
+        return !(lhs <= rhs);  // defined as the negation of operator<=, which itself negates rhs < lhs
+    }
+
+    /*!
+    @brief comparison: greater than
+    @copydoc operator>(const_reference, const_reference)
+    */
+    template<typename ScalarType, typename std::enable_if<
+                 std::is_scalar<ScalarType>::value, int>::type = 0>
+    friend bool operator>(const_reference lhs, const ScalarType rhs) noexcept
+    {
+        return lhs > basic_json(rhs);  // wrap the scalar in a temporary JSON value, then compare JSON to JSON
+    }
+
+    /*!
+    @brief comparison: greater than
+    @copydoc operator>(const_reference, const_reference)
+    */
+    template<typename ScalarType, typename std::enable_if<
+                 std::is_scalar<ScalarType>::value, int>::type = 0>
+    friend bool operator>(const ScalarType lhs, const_reference rhs) noexcept
+    {
+        return basic_json(lhs) > rhs;  // wrap the scalar in a temporary JSON value, then compare JSON to JSON
+    }
+
+    /*!
+    @brief comparison: greater than or equal
+
+    Compares whether one JSON value @a lhs is greater than or equal to another
+    JSON value by calculating `not (lhs < rhs)`.
+
+    @param[in] lhs  first JSON value to consider
+    @param[in] rhs  second JSON value to consider
+    @return whether @a lhs is greater than or equal to @a rhs
+
+    @complexity Linear.
+
+    @exceptionsafety No-throw guarantee: this function never throws exceptions.
+
+    @liveexample{The example demonstrates comparing several JSON
+    types.,operator__greaterequal}
+
+    @since version 1.0.0
+    */
+    friend bool operator>=(const_reference lhs, const_reference rhs) noexcept
+    {
+        return !(lhs < rhs);  // defined purely as the negation of operator<; true whenever lhs < rhs is false
+    }
+
+    /*!
+    @brief comparison: greater than or equal
+    @copydoc operator>=(const_reference, const_reference)
+    */
+    template<typename ScalarType, typename std::enable_if<
+                 std::is_scalar<ScalarType>::value, int>::type = 0>
+    friend bool operator>=(const_reference lhs, const ScalarType rhs) noexcept
+    {
+        return lhs >= basic_json(rhs);  // wrap the scalar in a temporary JSON value, then compare JSON to JSON
+    }
+
+    /*!
+    @brief comparison: greater than or equal
+    @copydoc operator>=(const_reference, const_reference)
+    */
+    template<typename ScalarType, typename std::enable_if<
+                 std::is_scalar<ScalarType>::value, int>::type = 0>
+    friend bool operator>=(const ScalarType lhs, const_reference rhs) noexcept
+    {
+        return basic_json(lhs) >= rhs;  // wrap the scalar in a temporary JSON value, then compare JSON to JSON
+    }
+
+    /// @}
+
+    ///////////////////
+    // serialization //
+    ///////////////////
+
+    /// @name serialization
+    /// @{
+
+    /*!
+    @brief serialize to stream
+
+    Serialize the given JSON value @a j to the output stream @a o. The JSON
+    value will be serialized using the @ref dump member function.
+
+    - The indentation of the output can be controlled with the member variable
+      `width` of the output stream @a o. For instance, using the manipulator
+      `std::setw(4)` on @a o sets the indentation level to `4` and the
+      serialization result is the same as calling `dump(4)`.
+
+    - The indentation character can be controlled with the member variable
+      `fill` of the output stream @a o. For instance, the manipulator
+      `std::setfill('\\t')` sets indentation to use a tab character rather than
+      the default space character.
+
+    @param[in,out] o  stream to serialize to
+    @param[in] j  JSON value to serialize
+
+    @return the stream @a o
+
+    @throw type_error.316 if a string stored inside the JSON value is not
+                          UTF-8 encoded
+
+    @complexity Linear.
+
+    @liveexample{The example below shows the serialization with different
+    parameters to `width` to adjust the indentation level.,operator_serialize}
+
+    @since version 1.0.0; indentation character added in version 3.0.0
+    */
+    friend std::ostream& operator<<(std::ostream& o, const basic_json& j)
+    {
+        // a positive stream width switches on pretty-printing and doubles as the indentation step
+        const bool pretty_print = o.width() > 0;
+        const auto indentation = pretty_print ? o.width() : 0;
+
+        // reset width to 0 so the setting applies to this one insertion only
+        o.width(0);
+
+        // do the actual serialization; the stream's fill character is handed to the serializer
+        serializer s(detail::output_adapter<char>(o), o.fill());
+        s.dump(j, pretty_print, false, static_cast<unsigned int>(indentation));  // NOTE(review): `false` is presumably ensure_ascii - confirm
+        return o;
+    }
+
+    /*!
+    @brief serialize to stream
+    @deprecated This stream operator is deprecated and will be removed in
+                future 4.0.0 of the library. Please use
+                @ref operator<<(std::ostream&, const basic_json&)
+                instead; that is, replace calls like `j >> o;` with `o << j;`.
+    @since version 1.0.0; deprecated since version 3.0.0
+    */
+    JSON_HEDLEY_DEPRECATED_FOR(3.0.0, operator<<(std::ostream&, const basic_json&))
+    friend std::ostream& operator>>(const basic_json& j, std::ostream& o)
+    {
+        return o << j;  // pure forwarder: `j >> o` produces exactly what `o << j` produces
+    }
+
+    /// @}
+
+
+    /////////////////////
+    // deserialization //
+    /////////////////////
+
+    /// @name deserialization
+    /// @{
+
+    /*!
+    @brief deserialize from a compatible input
+
+    @tparam InputType A compatible input, for instance
+    - an std::istream object
+    - a FILE pointer
+    - a C-style array of characters
+    - a pointer to a null-terminated string of single byte characters
+    - an object obj for which begin(obj) and end(obj) produces a valid pair of
+      iterators.
+
+    @param[in] i  input to read from
+    @param[in] cb  a parser callback function of type @ref parser_callback_t
+    which is used to control the deserialization by filtering unwanted values
+    (optional)
+    @param[in] allow_exceptions  whether to throw exceptions in case of a
+    parse error (optional, true by default)
+    @param[in] ignore_comments  whether comments should be ignored and treated
+    like whitespace (true) or yield a parse error (false); (optional, false by
+    default)
+
+    @return deserialized JSON value; in case of a parse error and
+            @a allow_exceptions set to `false`, the return value will be
+            value_t::discarded.
+
+    @throw parse_error.101 if a parse error occurs; example: `""unexpected end
+    of input; expected string literal""`
+    @throw parse_error.102 if to_unicode fails or surrogate error
+    @throw parse_error.103 if to_unicode fails
+
+    @complexity Linear in the length of the input. The parser is a predictive
+    LL(1) parser. The complexity can be higher if the parser callback function
+    @a cb or reading from the input @a i has a super-linear complexity.
+
+    @note A UTF-8 byte order mark is silently ignored.
+
+    @liveexample{The example below demonstrates the `parse()` function reading
+    from an array.,parse__array__parser_callback_t}
+
+    @liveexample{The example below demonstrates the `parse()` function with
+    and without callback function.,parse__string__parser_callback_t}
+
+    @liveexample{The example below demonstrates the `parse()` function with
+    and without callback function.,parse__istream__parser_callback_t}
+
+    @liveexample{The example below demonstrates the `parse()` function reading
+    from a contiguous container.,parse__contiguouscontainer__parser_callback_t}
+
+    @since version 2.0.3 (contiguous containers); version 3.9.0 allowed to
+    ignore comments.
+    */
+    template<typename InputType>
+    JSON_HEDLEY_WARN_UNUSED_RESULT
+    static basic_json parse(InputType&& i,
+                            const parser_callback_t cb = nullptr,
+                            const bool allow_exceptions = true,
+                            const bool ignore_comments = false)
+    {
+        basic_json result;  // output slot: the parser below writes the parsed value into it
+        parser(detail::input_adapter(std::forward<InputType>(i)), cb, allow_exceptions, ignore_comments).parse(true, result);
+        return result;  // NOTE(review): the literal `true` passed to parse() is presumably its "strict" flag - confirm
+    }
+
+    /*!
+    @brief deserialize from a pair of character iterators
+
+    The value_type of the iterator must be an integral type with size of 1, 2 or
+    4 bytes, which will be interpreted respectively as UTF-8, UTF-16 and UTF-32.
+
+    @param[in] first iterator to start of character range
+    @param[in] last  iterator to end of character range
+    @param[in] cb  a parser callback function of type @ref parser_callback_t
+    which is used to control the deserialization by filtering unwanted values
+    (optional)
+    @param[in] allow_exceptions  whether to throw exceptions in case of a
+    parse error (optional, true by default)
+    @param[in] ignore_comments  whether comments should be ignored and treated
+    like whitespace (true) or yield a parse error (false); (optional, false by
+    default)
+
+    @return deserialized JSON value; in case of a parse error and
+            @a allow_exceptions set to `false`, the return value will be
+            value_t::discarded.
+
+    @throw parse_error.101 if a parse error occurs; example: `""unexpected end
+    of input; expected string literal""`
+    @throw parse_error.102 if to_unicode fails or surrogate error
+    @throw parse_error.103 if to_unicode fails
+    */
+    template<typename IteratorType>
+    JSON_HEDLEY_WARN_UNUSED_RESULT
+    static basic_json parse(IteratorType first,
+                            IteratorType last,
+                            const parser_callback_t cb = nullptr,
+                            const bool allow_exceptions = true,
+                            const bool ignore_comments = false)
+    {
+        basic_json result;  // output slot: the parser below writes the parsed value into it
+        parser(detail::input_adapter(std::move(first), std::move(last)), cb, allow_exceptions, ignore_comments).parse(true, result);
+        return result;  // NOTE(review): the literal `true` passed to parse() is presumably its "strict" flag - confirm
+    }
+
+    JSON_HEDLEY_WARN_UNUSED_RESULT
+    JSON_HEDLEY_DEPRECATED_FOR(3.8.0, parse(ptr, ptr + len))
+    static basic_json parse(detail::span_input_adapter&& i,
+                            const parser_callback_t cb = nullptr,
+                            const bool allow_exceptions = true,
+                            const bool ignore_comments = false)
+    {
+        // take the adapter out of the span wrapper before handing it to the parser
+        auto span_adapter = i.get();
+
+        basic_json parsed;
+        parser(std::move(span_adapter), cb, allow_exceptions, ignore_comments).parse(true, parsed);
+        return parsed;
+    }
+
+    /*!
+    @brief check if the input is valid JSON
+
+    Unlike the @ref parse(InputType&&, const parser_callback_t, const bool, const bool)
+    function, this function neither throws an exception in case of invalid JSON
+    input (i.e., a parse error) nor creates diagnostic information.
+
+    @tparam InputType A compatible input, for instance
+    - an std::istream object
+    - a FILE pointer
+    - a C-style array of characters
+    - a pointer to a null-terminated string of single byte characters
+    - an object obj for which begin(obj) and end(obj) produces a valid pair of
+      iterators.
+
+    @param[in] i input to read from
+    @param[in] ignore_comments  whether comments should be ignored and treated
+    like whitespace (true) or yield a parse error (false); (optional, false by
+    default)
+
+    @return Whether the input read from @a i is valid JSON.
+
+    @complexity Linear in the length of the input. The parser is a predictive
+    LL(1) parser.
+
+    @note A UTF-8 byte order mark is silently ignored.
+
+    @liveexample{The example below demonstrates the `accept()` function reading
+    from a string.,accept__string}
+    */
+    template<typename InputType>
+    static bool accept(InputType&& i,
+                       const bool ignore_comments = false)
+    {
+        auto adapter = detail::input_adapter(std::forward<InputType>(i));
+
+        // no parser callback is installed and allow_exceptions is false
+        const bool is_valid = parser(std::move(adapter), nullptr, false, ignore_comments).accept(true);
+        return is_valid;
+    }
+
+    /*!
+    @brief check if the input given by a pair of iterators is valid JSON
+
+    @param[in] first iterator to start of character range
+    @param[in] last  iterator to end of character range
+    @param[in] ignore_comments  forwarded unchanged to the parser (optional,
+    false by default)
+
+    @return result of the parser's `accept()` call for the given range
+    */
+    template<typename IteratorType>
+    static bool accept(IteratorType first, IteratorType last,
+                       const bool ignore_comments = false)
+    {
+        auto range_adapter = detail::input_adapter(std::move(first), std::move(last));
+
+        // no parser callback is installed and allow_exceptions is false
+        const bool is_valid = parser(std::move(range_adapter), nullptr, false, ignore_comments).accept(true);
+        return is_valid;
+    }
+
+    JSON_HEDLEY_WARN_UNUSED_RESULT
+    JSON_HEDLEY_DEPRECATED_FOR(3.8.0, accept(ptr, ptr + len))
+    static bool accept(detail::span_input_adapter&& i,
+                       const bool ignore_comments = false)
+    {
+        // take the adapter out of the span wrapper before handing it to the parser
+        auto span_adapter = i.get();
+
+        // no parser callback is installed and allow_exceptions is false
+        const bool is_valid = parser(std::move(span_adapter), nullptr, false, ignore_comments).accept(true);
+        return is_valid;
+    }
+
+    /*!
+    @brief generate SAX events
+
+    The SAX event listener must follow the interface of @ref json_sax.
+
+    This function reads from a compatible input. Examples are:
+    - an std::istream object
+    - a FILE pointer
+    - a C-style array of characters
+    - a pointer to a null-terminated string of single byte characters
+    - an object obj for which begin(obj) and end(obj) produces a valid pair of
+      iterators.
+
+    @param[in] i  input to read from
+    @param[in,out] sax  SAX event listener
+    @param[in] format  the format to parse (JSON, CBOR, MessagePack, or UBJSON)
+    @param[in] strict  whether the input has to be consumed completely
+    @param[in] ignore_comments  whether comments should be ignored and treated
+    like whitespace (true) or yield a parse error (false); (optional, false by
+    default); only applies to the JSON file format.
+
+    @return return value of the last processed SAX event
+
+    @throw parse_error.101 if a parse error occurs; example: `""unexpected end
+    of input; expected string literal""`
+    @throw parse_error.102 if to_unicode fails or surrogate error
+    @throw parse_error.103 if to_unicode fails
+
+    @complexity Linear in the length of the input. The parser is a predictive
+    LL(1) parser. The complexity can be higher if the SAX consumer @a sax has
+    a super-linear complexity.
+
+    @note A UTF-8 byte order mark is silently ignored.
+
+    @liveexample{The example below demonstrates the `sax_parse()` function
+    reading from string and processing the events with a user-defined SAX
+    event consumer.,sax_parse}
+
+    @since version 3.2.0
+    */
+    template <typename InputType, typename SAX>
+    JSON_HEDLEY_NON_NULL(2)
+    static bool sax_parse(InputType&& i, SAX* sax,
+                          input_format_t format = input_format_t::json,
+                          const bool strict = true,
+                          const bool ignore_comments = false)
+    {
+        auto adapter = detail::input_adapter(std::forward<InputType>(i));
+
+        // JSON text goes through the parser; ignore_comments is only passed on this path
+        if (format == input_format_t::json)
+        {
+            return parser(std::move(adapter), nullptr, true, ignore_comments).sax_parse(sax, strict);
+        }
+
+        // every other format is handed to the binary reader together with the format value
+        using reader_t = detail::binary_reader<basic_json, decltype(adapter), SAX>;
+        return reader_t(std::move(adapter)).sax_parse(format, sax, strict);
+    }
+
+    /*!
+    @brief generate SAX events from a pair of iterators
+
+    @param[in] first iterator to start of character range
+    @param[in] last  iterator to end of character range
+    @param[in,out] sax  SAX event listener
+    @param[in] format  the format to parse; `input_format_t::json` selects the
+    text parser, any other value is passed on to the binary reader
+    @param[in] strict  forwarded unchanged to the selected `sax_parse()`
+    @param[in] ignore_comments  forwarded to the text parser; not used for the
+    other formats
+
+    @return return value of the selected `sax_parse()` call
+    */
+    template<class IteratorType, class SAX>
+    JSON_HEDLEY_NON_NULL(3)
+    static bool sax_parse(IteratorType first, IteratorType last, SAX* sax,
+                          input_format_t format = input_format_t::json,
+                          const bool strict = true,
+                          const bool ignore_comments = false)
+    {
+        auto range_adapter = detail::input_adapter(std::move(first), std::move(last));
+
+        // JSON text goes through the parser; ignore_comments is only passed on this path
+        if (format == input_format_t::json)
+        {
+            return parser(std::move(range_adapter), nullptr, true, ignore_comments).sax_parse(sax, strict);
+        }
+
+        // every other format is handed to the binary reader together with the format value
+        using reader_t = detail::binary_reader<basic_json, decltype(range_adapter), SAX>;
+        return reader_t(std::move(range_adapter)).sax_parse(format, sax, strict);
+    }
+
+    template <typename SAX>
+    JSON_HEDLEY_DEPRECATED_FOR(3.8.0, sax_parse(ptr, ptr + len, ...))
+    JSON_HEDLEY_NON_NULL(2)
+    static bool sax_parse(detail::span_input_adapter&& i, SAX* sax,
+                          input_format_t format = input_format_t::json,
+                          const bool strict = true,
+                          const bool ignore_comments = false)
+    {
+        // take the adapter out of the span wrapper; both branches below consume it
+        auto span_adapter = i.get();
+
+        // JSON text goes through the parser; ignore_comments is only passed on this path
+        if (format == input_format_t::json)
+        {
+            return parser(std::move(span_adapter), nullptr, true, ignore_comments).sax_parse(sax, strict);
+        }
+
+        // every other format is handed to the binary reader together with the format value
+        using reader_t = detail::binary_reader<basic_json, decltype(span_adapter), SAX>;
+        return reader_t(std::move(span_adapter)).sax_parse(format, sax, strict);
+    }
+
+    /*!
+    @brief deserialize from stream
+    @deprecated This stream operator is deprecated and will be removed in
+                version 4.0.0 of the library. Please use
+                @ref operator>>(std::istream&, basic_json&)
+                instead; that is, replace calls like `j << i;` with `i >> j;`.
+    @since version 1.0.0; deprecated since version 3.0.0
+    */
+    JSON_HEDLEY_DEPRECATED_FOR(3.0.0, operator>>(std::istream&, basic_json&))
+    friend std::istream& operator<<(basic_json& j, std::istream& i)
+    {
+        // pure forwarder: swap the operands and return whatever operator>> returns
+        return operator>>(i, j);
+    }
+
+    /*!
+    @brief deserialize from stream
+
+    Deserializes an input stream to a JSON value.
+
+    @param[in,out] i  input stream to read a serialized JSON value from
+    @param[in,out] j  JSON value to write the deserialized input to
+
+    @throw parse_error.101 in case of an unexpected token
+    @throw parse_error.102 if to_unicode fails or surrogate error
+    @throw parse_error.103 if to_unicode fails
+
+    @complexity Linear in the length of the input. The parser is a predictive
+    LL(1) parser.
+
+    @note A UTF-8 byte order mark is silently ignored.
+
+    @liveexample{The example below shows how a JSON value is constructed by
+    reading a serialization from a stream.,operator_deserialize}
+
+    @sa parse(std::istream&, const parser_callback_t) for a variant with a
+    parser callback function to filter values while parsing
+
+    @since version 1.0.0
+    */
+    friend std::istream& operator>>(std::istream& i, basic_json& j)
+    {
+        // wrap the stream in an input adapter and parse directly into j;
+        // NOTE(review): the first argument of parse() is `false` here, while the
+        // static parse() overloads pass `true` - presumably the strict flag;
+        // confirm against parser::parse
+        parser(detail::input_adapter(i)).parse(false, j);
+        // hand the same stream back so that extractions can be chained
+        return i;
+    }
+
+    /// @}
+
+    ///////////////////////////
+    // convenience functions //
+    ///////////////////////////
+
+    /*!
+    @brief return the type as string
+
+    Returns the type name as string to be used in error messages - usually to
+    indicate that a function was called on a wrong JSON type.
+
+    @return a string representation of the @a m_type member:
+            Value type  | return value
+            ----------- | -------------
+            null        | `"null"`
+            boolean     | `"boolean"`
+            string      | `"string"`
+            number      | `"number"` (for all number types)
+            object      | `"object"`
+            array       | `"array"`
+            binary      | `"binary"`
+            discarded   | `"discarded"`
+
+    @exceptionsafety No-throw guarantee: this function never throws exceptions.
+
+    @complexity Constant.
+
+    @liveexample{The following code exemplifies `type_name()` for all JSON
+    types.,type_name}
+
+    @sa @ref type() -- return the type of the JSON value
+    @sa @ref operator value_t() -- return the type of the JSON value (implicit)
+
+    @since version 1.0.0, public since 2.1.0, `const char*` and `noexcept`
+    since 3.0.0
+    */
+    JSON_HEDLEY_RETURNS_NON_NULL
+    const char* type_name() const noexcept
+    {
+        // every branch returns a string literal, so the result is never null
+        switch (m_type)
+        {
+            case value_t::null:
+                return "null";
+            case value_t::boolean:
+                return "boolean";
+            case value_t::string:
+                return "string";
+            case value_t::object:
+                return "object";
+            case value_t::array:
+                return "array";
+            case value_t::binary:
+                return "binary";
+            case value_t::discarded:
+                return "discarded";
+            default:
+                // all remaining enumerators share one name
+                return "number";
+        }
+    }
+
+
+  private:
+    //////////////////////
+    // member variables //
+    //////////////////////
+
+    /// the type of the current element
+    value_t m_type = value_t::null;
+
+    /// the value of the current element
+    json_value m_value = {};
+
+    //////////////////////////////////////////
+    // binary serialization/deserialization //
+    //////////////////////////////////////////
+
+    /// @name binary serialization/deserialization support
+    /// @{
+
+  public:
+    /*!
+    @brief create a CBOR serialization of a given JSON value
+
+    Serializes a given JSON value @a j to a byte vector using the CBOR (Concise
+    Binary Object Representation) serialization format. CBOR is a binary
+    serialization format which aims to be more compact than JSON itself, yet
+    more efficient to parse.
+
+    The library uses the following mapping from JSON values types to
+    CBOR types according to the CBOR specification (RFC 7049):
+
+    JSON value type | value/range                                | CBOR type                          | first byte
+    --------------- | ------------------------------------------ | ---------------------------------- | ---------------
+    null            | `null`                                     | Null                               | 0xF6
+    boolean         | `true`                                     | True                               | 0xF5
+    boolean         | `false`                                    | False                              | 0xF4
+    number_integer  | -9223372036854775808..-2147483649          | Negative integer (8 bytes follow)  | 0x3B
+    number_integer  | -2147483648..-32769                        | Negative integer (4 bytes follow)  | 0x3A
+    number_integer  | -32768..-129                               | Negative integer (2 bytes follow)  | 0x39
+    number_integer  | -128..-25                                  | Negative integer (1 byte follow)   | 0x38
+    number_integer  | -24..-1                                    | Negative integer                   | 0x20..0x37
+    number_integer  | 0..23                                      | Integer                            | 0x00..0x17
+    number_integer  | 24..255                                    | Unsigned integer (1 byte follow)   | 0x18
+    number_integer  | 256..65535                                 | Unsigned integer (2 bytes follow)  | 0x19
+    number_integer  | 65536..4294967295                          | Unsigned integer (4 bytes follow)  | 0x1A
+    number_integer  | 4294967296..18446744073709551615           | Unsigned integer (8 bytes follow)  | 0x1B
+    number_unsigned | 0..23                                      | Integer                            | 0x00..0x17
+    number_unsigned | 24..255                                    | Unsigned integer (1 byte follow)   | 0x18
+    number_unsigned | 256..65535                                 | Unsigned integer (2 bytes follow)  | 0x19
+    number_unsigned | 65536..4294967295                          | Unsigned integer (4 bytes follow)  | 0x1A
+    number_unsigned | 4294967296..18446744073709551615           | Unsigned integer (8 bytes follow)  | 0x1B
+    number_float    | *any value representable by a float*       | Single-Precision Float             | 0xFA
+    number_float    | *any value NOT representable by a float*   | Double-Precision Float             | 0xFB
+    string          | *length*: 0..23                            | UTF-8 string                       | 0x60..0x77
+    string          | *length*: 24..255                          | UTF-8 string (1 byte follow)       | 0x78
+    string          | *length*: 256..65535                       | UTF-8 string (2 bytes follow)      | 0x79
+    string          | *length*: 65536..4294967295                | UTF-8 string (4 bytes follow)      | 0x7A
+    string          | *length*: 4294967296..18446744073709551615 | UTF-8 string (8 bytes follow)      | 0x7B
+    array           | *size*: 0..23                              | array                              | 0x80..0x97
+    array           | *size*: 24..255                            | array (1 byte follow)              | 0x98
+    array           | *size*: 256..65535                         | array (2 bytes follow)             | 0x99
+    array           | *size*: 65536..4294967295                  | array (4 bytes follow)             | 0x9A
+    array           | *size*: 4294967296..18446744073709551615   | array (8 bytes follow)             | 0x9B
+    object          | *size*: 0..23                              | map                                | 0xA0..0xB7
+    object          | *size*: 24..255                            | map (1 byte follow)                | 0xB8
+    object          | *size*: 256..65535                         | map (2 bytes follow)               | 0xB9
+    object          | *size*: 65536..4294967295                  | map (4 bytes follow)               | 0xBA
+    object          | *size*: 4294967296..18446744073709551615   | map (8 bytes follow)               | 0xBB
+    binary          | *size*: 0..23                              | byte string                        | 0x40..0x57
+    binary          | *size*: 24..255                            | byte string (1 byte follow)        | 0x58
+    binary          | *size*: 256..65535                         | byte string (2 bytes follow)       | 0x59
+    binary          | *size*: 65536..4294967295                  | byte string (4 bytes follow)       | 0x5A
+    binary          | *size*: 4294967296..18446744073709551615   | byte string (8 bytes follow)       | 0x5B
+
+    @note The mapping is **complete** in the sense that any JSON value type
+          can be converted to a CBOR value.
+
+    @note If NaN or Infinity are stored inside a JSON number, they are
+          serialized properly. This behavior differs from the @ref dump()
+          function which serializes NaN or Infinity to `null`.
+
+    @note The following CBOR types are not used in the conversion:
+          - UTF-8 strings terminated by "break" (0x7F)
+          - arrays terminated by "break" (0x9F)
+          - maps terminated by "break" (0xBF)
+          - byte strings terminated by "break" (0x5F)
+          - date/time (0xC0..0xC1)
+          - bignum (0xC2..0xC3)
+          - decimal fraction (0xC4)
+          - bigfloat (0xC5)
+          - expected conversions (0xD5..0xD7)
+          - simple values (0xE0..0xF3, 0xF8)
+          - undefined (0xF7)
+          - half-precision floats (0xF9)
+          - break (0xFF)
+
+    @param[in] j  JSON value to serialize
+    @return CBOR serialization as byte vector
+
+    @complexity Linear in the size of the JSON value @a j.
+
+    @liveexample{The example shows the serialization of a JSON value to a byte
+    vector in CBOR format.,to_cbor}
+
+    @sa http://cbor.io
+    @sa @ref from_cbor(detail::input_adapter&&, const bool, const bool, const cbor_tag_handler_t) for the
+        analogous deserialization
+    @sa @ref to_msgpack(const basic_json&) for the related MessagePack format
+    @sa @ref to_ubjson(const basic_json&, const bool, const bool) for the
+             related UBJSON format
+
+    @since version 2.0.9; compact representation of floating-point numbers
+           since version 3.8.0
+    */
+    static std::vector<uint8_t> to_cbor(const basic_json& j)
+    {
+        std::vector<uint8_t> bytes;
+        // delegate to the output-adapter overload, with the adapter built around `bytes`
+        to_cbor(j, detail::output_adapter<uint8_t>(bytes));
+        return bytes;
+    }
+
+    /// @brief serialize @a j as CBOR through a binary writer built on the output adapter @a o
+    static void to_cbor(const basic_json& j, detail::output_adapter<uint8_t> o)
+    {
+        binary_writer<uint8_t> writer(o);
+        writer.write_cbor(j);
+    }
+
+    /// @brief serialize @a j as CBOR through a binary writer built on the `char` output adapter @a o
+    static void to_cbor(const basic_json& j, detail::output_adapter<char> o)
+    {
+        binary_writer<char> writer(o);
+        writer.write_cbor(j);
+    }
+
+    /*!
+    @brief create a MessagePack serialization of a given JSON value
+
+    Serializes a given JSON value @a j to a byte vector using the MessagePack
+    serialization format. MessagePack is a binary serialization format which
+    aims to be more compact than JSON itself, yet more efficient to parse.
+
+    The library uses the following mapping from JSON values types to
+    MessagePack types according to the MessagePack specification:
+
+    JSON value type | value/range                       | MessagePack type | first byte
+    --------------- | --------------------------------- | ---------------- | ----------
+    null            | `null`                            | nil              | 0xC0
+    boolean         | `true`                            | true             | 0xC3
+    boolean         | `false`                           | false            | 0xC2
+    number_integer  | -9223372036854775808..-2147483649 | int64            | 0xD3
+    number_integer  | -2147483648..-32769               | int32            | 0xD2
+    number_integer  | -32768..-129                      | int16            | 0xD1
+    number_integer  | -128..-33                         | int8             | 0xD0
+    number_integer  | -32..-1                           | negative fixint  | 0xE0..0xFF
+    number_integer  | 0..127                            | positive fixint  | 0x00..0x7F
+    number_integer  | 128..255                          | uint 8           | 0xCC
+    number_integer  | 256..65535                        | uint 16          | 0xCD
+    number_integer  | 65536..4294967295                 | uint 32          | 0xCE
+    number_integer  | 4294967296..18446744073709551615  | uint 64          | 0xCF
+    number_unsigned | 0..127                            | positive fixint  | 0x00..0x7F
+    number_unsigned | 128..255                          | uint 8           | 0xCC
+    number_unsigned | 256..65535                        | uint 16          | 0xCD
+    number_unsigned | 65536..4294967295                 | uint 32          | 0xCE
+    number_unsigned | 4294967296..18446744073709551615  | uint 64          | 0xCF
+    number_float    | *any value representable by a float*     | float 32 | 0xCA
+    number_float    | *any value NOT representable by a float* | float 64 | 0xCB
+    string          | *length*: 0..31                   | fixstr           | 0xA0..0xBF
+    string          | *length*: 32..255                 | str 8            | 0xD9
+    string          | *length*: 256..65535              | str 16           | 0xDA
+    string          | *length*: 65536..4294967295       | str 32           | 0xDB
+    array           | *size*: 0..15                     | fixarray         | 0x90..0x9F
+    array           | *size*: 16..65535                 | array 16         | 0xDC
+    array           | *size*: 65536..4294967295         | array 32         | 0xDD
+    object          | *size*: 0..15                     | fix map          | 0x80..0x8F
+    object          | *size*: 16..65535                 | map 16           | 0xDE
+    object          | *size*: 65536..4294967295         | map 32           | 0xDF
+    binary          | *size*: 0..255                    | bin 8            | 0xC4
+    binary          | *size*: 256..65535                | bin 16           | 0xC5
+    binary          | *size*: 65536..4294967295         | bin 32           | 0xC6
+
+    @note The mapping is **complete** in the sense that any JSON value type
+          can be converted to a MessagePack value.
+
+    @note The following values can **not** be converted to a MessagePack value:
+          - strings with more than 4294967295 bytes
+          - byte strings with more than 4294967295 bytes
+          - arrays with more than 4294967295 elements
+          - objects with more than 4294967295 elements
+
+    @note Any MessagePack output created by @ref to_msgpack can be successfully
+          parsed by @ref from_msgpack.
+
+    @note If NaN or Infinity are stored inside a JSON number, they are
+          serialized properly. This behavior differs from the @ref dump()
+          function which serializes NaN or Infinity to `null`.
+
+    @param[in] j  JSON value to serialize
+    @return MessagePack serialization as byte vector
+
+    @complexity Linear in the size of the JSON value @a j.
+
+    @liveexample{The example shows the serialization of a JSON value to a byte
+    vector in MessagePack format.,to_msgpack}
+
+    @sa http://msgpack.org
+    @sa @ref from_msgpack for the analogous deserialization
+    @sa @ref to_cbor(const basic_json&) for the related CBOR format
+    @sa @ref to_ubjson(const basic_json&, const bool, const bool) for the
+             related UBJSON format
+
+    @since version 2.0.9
+    */
+    static std::vector<uint8_t> to_msgpack(const basic_json& j)
+    {
+        std::vector<uint8_t> bytes;
+        // delegate to the output-adapter overload, with the adapter built around `bytes`
+        to_msgpack(j, detail::output_adapter<uint8_t>(bytes));
+        return bytes;
+    }
+
+    /// @brief serialize @a j as MessagePack through a binary writer built on the output adapter @a o
+    static void to_msgpack(const basic_json& j, detail::output_adapter<uint8_t> o)
+    {
+        binary_writer<uint8_t> writer(o);
+        writer.write_msgpack(j);
+    }
+
+    /// @brief serialize @a j as MessagePack through a binary writer built on the `char` output adapter @a o
+    static void to_msgpack(const basic_json& j, detail::output_adapter<char> o)
+    {
+        binary_writer<char> writer(o);
+        writer.write_msgpack(j);
+    }
+
+    /*!
+    @brief create a UBJSON serialization of a given JSON value
+
+    Serializes a given JSON value @a j to a byte vector using the UBJSON
+    (Universal Binary JSON) serialization format. UBJSON aims to be more compact
+    than JSON itself, yet more efficient to parse.
+
+    The library uses the following mapping from JSON values types to
+    UBJSON types according to the UBJSON specification:
+
+    JSON value type | value/range                       | UBJSON type | marker
+    --------------- | --------------------------------- | ----------- | ------
+    null            | `null`                            | null        | `Z`
+    boolean         | `true`                            | true        | `T`
+    boolean         | `false`                           | false       | `F`
+    number_integer  | -9223372036854775808..-2147483649 | int64       | `L`
+    number_integer  | -2147483648..-32769               | int32       | `l`
+    number_integer  | -32768..-129                      | int16       | `I`
+    number_integer  | -128..127                         | int8        | `i`
+    number_integer  | 128..255                          | uint8       | `U`
+    number_integer  | 256..32767                        | int16       | `I`
+    number_integer  | 32768..2147483647                 | int32       | `l`
+    number_integer  | 2147483648..9223372036854775807   | int64       | `L`
+    number_unsigned | 0..127                            | int8        | `i`
+    number_unsigned | 128..255                          | uint8       | `U`
+    number_unsigned | 256..32767                        | int16       | `I`
+    number_unsigned | 32768..2147483647                 | int32       | `l`
+    number_unsigned | 2147483648..9223372036854775807   | int64       | `L`
+    number_unsigned | 9223372036854775808..18446744073709551615 | high-precision | `H`
+    number_float    | *any value*                       | float64     | `D`
+    string          | *with shortest length indicator*  | string      | `S`
+    array           | *see notes on optimized format*   | array       | `[`
+    object          | *see notes on optimized format*   | map         | `{`
+
+    @note The mapping is **complete** in the sense that any JSON value type
+          can be converted to a UBJSON value.
+
+    @note The following values can **not** be converted to a UBJSON value:
+          - strings with more than 9223372036854775807 bytes (theoretical)
+
+    @note The following markers are not used in the conversion:
+          - `N`: no-op values are not created.
+          - `C`: single-byte strings are serialized with `S` markers.
+
+    @note Any UBJSON output created by @ref to_ubjson can be successfully parsed
+          by @ref from_ubjson.
+
+    @note If NaN or Infinity are stored inside a JSON number, they are
+          serialized properly. This behavior differs from the @ref dump()
+          function which serializes NaN or Infinity to `null`.
+
+    @note The optimized formats for containers are supported: Parameter
+          @a use_size adds size information to the beginning of a container and
+          removes the closing marker. Parameter @a use_type further checks
+          whether all elements of a container have the same type and adds the
+          type marker to the beginning of the container. The @a use_type
+          parameter must only be used together with @a use_size = true. Note
+          that @a use_size = true alone may result in larger representations -
+          the benefit of this parameter is that the receiving side is
+          immediately informed on the number of elements of the container.
+
+    @note If the JSON data contains the binary type, the value stored is a list
+          of integers, as suggested by the UBJSON documentation.  In particular,
+          this means that serialization and the deserialization of a JSON
+          containing binary values into UBJSON and back will result in a
+          different JSON object.
+
+    @param[in] j  JSON value to serialize
+    @param[in] use_size  whether to add size annotations to container types
+    @param[in] use_type  whether to add type annotations to container types
+                         (must be combined with @a use_size = true)
+    @return UBJSON serialization as byte vector
+
+    @complexity Linear in the size of the JSON value @a j.
+
+    @liveexample{The example shows the serialization of a JSON value to a byte
+    vector in UBJSON format.,to_ubjson}
+
+    @sa http://ubjson.org
+    @sa @ref from_ubjson(detail::input_adapter&&, const bool, const bool) for the
+        analogous deserialization
+    @sa @ref to_cbor(const basic_json&) for the related CBOR format
+    @sa @ref to_msgpack(const basic_json&) for the related MessagePack format
+
+    @since version 3.1.0
+    */
+    static std::vector<uint8_t> to_ubjson(const basic_json& j,
+                                          const bool use_size = false,
+                                          const bool use_type = false)
+    {
+        std::vector<uint8_t> bytes;
+        // delegate to the output-adapter overload; both flags are passed through unchanged
+        to_ubjson(j, detail::output_adapter<uint8_t>(bytes), use_size, use_type);
+        return bytes;
+    }
+
+    /// @brief serialize @a j as UBJSON through a binary writer built on the output adapter @a o
+    static void to_ubjson(const basic_json& j, detail::output_adapter<uint8_t> o,
+                          const bool use_size = false, const bool use_type = false)
+    {
+        binary_writer<uint8_t> writer(o);
+        writer.write_ubjson(j, use_size, use_type);
+    }
+
+    /// @brief serialize @a j as UBJSON through a binary writer built on the `char` output adapter @a o
+    static void to_ubjson(const basic_json& j, detail::output_adapter<char> o,
+                          const bool use_size = false, const bool use_type = false)
+    {
+        binary_writer<char> writer(o);
+        writer.write_ubjson(j, use_size, use_type);
+    }
+
+
+    /*!
+    @brief Serializes the given JSON object `j` to BSON and returns a vector
+           containing the corresponding BSON-representation.
+
+    BSON (Binary JSON) is a binary format in which zero or more ordered key/value pairs are
+    stored as a single entity (a so-called document).
+
+    The library uses the following mapping from JSON values types to BSON types:
+
+    JSON value type | value/range                       | BSON type   | marker
+    --------------- | --------------------------------- | ----------- | ------
+    null            | `null`                            | null        | 0x0A
+    boolean         | `true`, `false`                   | boolean     | 0x08
+    number_integer  | -9223372036854775808..-2147483649 | int64       | 0x12
+    number_integer  | -2147483648..2147483647           | int32       | 0x10
+    number_integer  | 2147483648..9223372036854775807   | int64       | 0x12
+    number_unsigned | 0..2147483647                     | int32       | 0x10
+    number_unsigned | 2147483648..9223372036854775807   | int64       | 0x12
+    number_unsigned | 9223372036854775808..18446744073709551615| --   | --
+    number_float    | *any value*                       | double      | 0x01
+    string          | *any value*                       | string      | 0x02
+    array           | *any value*                       | document    | 0x04
+    object          | *any value*                       | document    | 0x03
+    binary          | *any value*                       | binary      | 0x05
+
+    @warning The mapping is **incomplete**, since only JSON-objects (and things
+    contained therein) can be serialized to BSON.
+    Also, integers larger than 9223372036854775807 cannot be serialized to BSON,
+    and the keys may not contain U+0000, since they are serialized as
+    zero-terminated c-strings.
+
+    @throw out_of_range.407  if `j.is_number_unsigned() && j.get<std::uint64_t>() > 9223372036854775807`
+    @throw out_of_range.409  if a key in `j` contains a NULL (U+0000)
+    @throw type_error.317    if `!j.is_object()`
+
+    @pre The input `j` is required to be an object: `j.is_object() == true`.
+
+    @note Any BSON output created via @ref to_bson can be successfully parsed
+          by @ref from_bson.
+
+    @param[in] j  JSON value to serialize
+    @return BSON serialization as byte vector
+
+    @complexity Linear in the size of the JSON value @a j.
+
+    @liveexample{The example shows the serialization of a JSON value to a byte
+    vector in BSON format.,to_bson}
+
+    @sa http://bsonspec.org/spec.html
+    @sa @ref from_bson(detail::input_adapter&&, const bool strict) for the
+        analogous deserialization
+    @sa @ref to_ubjson(const basic_json&, const bool, const bool) for the
+             related UBJSON format
+    @sa @ref to_cbor(const basic_json&) for the related CBOR format
+    @sa @ref to_msgpack(const basic_json&) for the related MessagePack format
+    */
+    static std::vector<uint8_t> to_bson(const basic_json& j)
+    {
+        std::vector<uint8_t> bytes;
+        // delegate to the output-adapter overload, with the adapter built around `bytes`
+        to_bson(j, detail::output_adapter<uint8_t>(bytes));
+        return bytes;
+    }
+
+    /*!
+    @brief Serializes the JSON object `j` to BSON and writes the resulting
+           bytes to the output adapter `o`.
+    @param[in] j The JSON object to convert to BSON.
+    @param o The output adapter that receives the binary BSON representation.
+    @pre The input `j` shall be an object: `j.is_object() == true`
+    @sa @ref to_bson(const basic_json&)
+    */
+    static void to_bson(const basic_json& j, detail::output_adapter<uint8_t> o)
+    {
+        binary_writer<uint8_t>(o).write_bson(j);
+    }
+
+    /*!
+    @copydoc to_bson(const basic_json&, detail::output_adapter<uint8_t>)
+    */
+    static void to_bson(const basic_json& j, detail::output_adapter<char> o)
+    {
+        binary_writer<char>(o).write_bson(j);  // same as the uint8_t overload, for char-based sinks
+    }
+
+
+    /*!
+    @brief create a JSON value from an input in CBOR format
+
+    Deserializes a given input @a i to a JSON value using the CBOR (Concise
+    Binary Object Representation) serialization format.
+
+    The library maps CBOR types to JSON value types as follows:
+
+    CBOR type              | JSON value type | first byte
+    ---------------------- | --------------- | ----------
+    Integer                | number_unsigned | 0x00..0x17
+    Unsigned integer       | number_unsigned | 0x18
+    Unsigned integer       | number_unsigned | 0x19
+    Unsigned integer       | number_unsigned | 0x1A
+    Unsigned integer       | number_unsigned | 0x1B
+    Negative integer       | number_integer  | 0x20..0x37
+    Negative integer       | number_integer  | 0x38
+    Negative integer       | number_integer  | 0x39
+    Negative integer       | number_integer  | 0x3A
+    Negative integer       | number_integer  | 0x3B
+    Byte string            | binary          | 0x40..0x57
+    Byte string            | binary          | 0x58
+    Byte string            | binary          | 0x59
+    Byte string            | binary          | 0x5A
+    Byte string            | binary          | 0x5B
+    UTF-8 string           | string          | 0x60..0x77
+    UTF-8 string           | string          | 0x78
+    UTF-8 string           | string          | 0x79
+    UTF-8 string           | string          | 0x7A
+    UTF-8 string           | string          | 0x7B
+    UTF-8 string           | string          | 0x7F
+    array                  | array           | 0x80..0x97
+    array                  | array           | 0x98
+    array                  | array           | 0x99
+    array                  | array           | 0x9A
+    array                  | array           | 0x9B
+    array                  | array           | 0x9F
+    map                    | object          | 0xA0..0xB7
+    map                    | object          | 0xB8
+    map                    | object          | 0xB9
+    map                    | object          | 0xBA
+    map                    | object          | 0xBB
+    map                    | object          | 0xBF
+    False                  | `false`         | 0xF4
+    True                   | `true`          | 0xF5
+    Null                   | `null`          | 0xF6
+    Half-Precision Float   | number_float    | 0xF9
+    Single-Precision Float | number_float    | 0xFA
+    Double-Precision Float | number_float    | 0xFB
+
+    @warning The mapping is **incomplete** in the sense that not all CBOR
+             types can be converted to a JSON value. The following CBOR types
+             are not supported and will yield parse errors (parse_error.112):
+             - date/time (0xC0..0xC1)
+             - bignum (0xC2..0xC3)
+             - decimal fraction (0xC4)
+             - bigfloat (0xC5)
+             - expected conversions (0xD5..0xD7)
+             - simple values (0xE0..0xF3, 0xF8)
+             - undefined (0xF7)
+
+    @warning CBOR allows map keys of any type, whereas JSON only allows
+             strings as keys in object values. Therefore, CBOR maps with keys
+             other than UTF-8 strings are rejected (parse_error.113).
+
+    @note Any CBOR output created via @ref to_cbor can be successfully parsed by
+          @ref from_cbor.
+
+    @param[in] i  an input in CBOR format convertible to an input adapter
+    @param[in] strict  whether to expect the input to be consumed until EOF
+                       (true by default)
+    @param[in] allow_exceptions  whether to throw exceptions in case of a
+    parse error (optional, true by default)
+    @param[in] tag_handler how to treat CBOR tags (optional, error by default)
+
+    @return deserialized JSON value; in case of a parse error and
+            @a allow_exceptions set to `false`, the return value will be
+            value_t::discarded.
+
+    @throw parse_error.110 if the given input ends prematurely or the end of
+    file was not reached when @a strict was set to true
+    @throw parse_error.112 if unsupported features from CBOR were
+    used in the given input @a i or if the input is not valid CBOR
+    @throw parse_error.113 if a string was expected as map key, but not found
+
+    @complexity Linear in the size of the input @a i.
+
+    @liveexample{The example shows the deserialization of a byte vector in CBOR
+    format to a JSON value.,from_cbor}
+
+    @sa http://cbor.io
+    @sa @ref to_cbor(const basic_json&) for the analogous serialization
+    @sa @ref from_msgpack(detail::input_adapter&&, const bool, const bool) for the
+        related MessagePack format
+    @sa @ref from_ubjson(detail::input_adapter&&, const bool, const bool) for the
+        related UBJSON format
+
+    @since version 2.0.9; parameter @a start_index since 2.1.1; changed to
+           consume input adapters, removed start_index parameter, and added
+           @a strict parameter since 3.0.0; added @a allow_exceptions parameter
+           since 3.2.0; added @a tag_handler parameter since 3.9.0.
+    */
+    template<typename InputType>
+    JSON_HEDLEY_WARN_UNUSED_RESULT
+    static basic_json from_cbor(InputType&& i,
+                                const bool strict = true,
+                                const bool allow_exceptions = true,
+                                const cbor_tag_handler_t tag_handler = cbor_tag_handler_t::error)
+    {
+        basic_json result;  // DOM root that the SAX handler below fills in
+        detail::json_sax_dom_parser<basic_json> sdp(result, allow_exceptions);
+        auto ia = detail::input_adapter(std::forward<InputType>(i));
+        const bool res = binary_reader<decltype(ia)>(std::move(ia)).sax_parse(input_format_t::cbor, &sdp, strict, tag_handler);
+        return res ? std::move(result) : basic_json(value_t::discarded);  // move: an lvalue operand of ?: would be deep-copied
+    }
+
+    /*!
+    @copydoc from_cbor(detail::input_adapter&&, const bool, const bool, const cbor_tag_handler_t)
+    */
+    template<typename IteratorType>
+    JSON_HEDLEY_WARN_UNUSED_RESULT
+    static basic_json from_cbor(IteratorType first, IteratorType last,
+                                const bool strict = true,
+                                const bool allow_exceptions = true,
+                                const cbor_tag_handler_t tag_handler = cbor_tag_handler_t::error)
+    {
+        basic_json result;  // DOM root that the SAX handler below fills in
+        detail::json_sax_dom_parser<basic_json> sdp(result, allow_exceptions);
+        auto ia = detail::input_adapter(std::move(first), std::move(last));  // adapter over [first, last)
+        const bool res = binary_reader<decltype(ia)>(std::move(ia)).sax_parse(input_format_t::cbor, &sdp, strict, tag_handler);
+        return res ? std::move(result) : basic_json(value_t::discarded);  // move: an lvalue operand of ?: would be deep-copied
+    }
+
+    template<typename T>
+    JSON_HEDLEY_WARN_UNUSED_RESULT
+    JSON_HEDLEY_DEPRECATED_FOR(3.8.0, from_cbor(ptr, ptr + len))
+    static basic_json from_cbor(const T* ptr, std::size_t len,
+                                const bool strict = true, const bool allow_exceptions = true,
+                                const cbor_tag_handler_t tag_handler = cbor_tag_handler_t::error)
+    {
+        const T* const past_end = ptr + len;  // end of the [ptr, ptr + len) range
+        return from_cbor(ptr, past_end, strict, allow_exceptions, tag_handler);
+    }
+
+
+    JSON_HEDLEY_WARN_UNUSED_RESULT
+    JSON_HEDLEY_DEPRECATED_FOR(3.8.0, from_cbor(ptr, ptr + len))
+    static basic_json from_cbor(detail::span_input_adapter&& i,
+                                const bool strict = true,
+                                const bool allow_exceptions = true,
+                                const cbor_tag_handler_t tag_handler = cbor_tag_handler_t::error)
+    {
+        basic_json result;  // DOM root that the SAX handler below fills in
+        detail::json_sax_dom_parser<basic_json> sdp(result, allow_exceptions);
+        auto ia = i.get();  // adapter obtained from the span wrapper
+        const bool res = binary_reader<decltype(ia)>(std::move(ia)).sax_parse(input_format_t::cbor, &sdp, strict, tag_handler);
+        return res ? std::move(result) : basic_json(value_t::discarded);  // move: an lvalue operand of ?: would be deep-copied
+    }
+
+    /*!
+    @brief create a JSON value from an input in MessagePack format
+
+    Deserializes a given input @a i to a JSON value using the MessagePack
+    serialization format.
+
+    The library maps MessagePack types to JSON value types as follows:
+
+    MessagePack type | JSON value type | first byte
+    ---------------- | --------------- | ----------
+    positive fixint  | number_unsigned | 0x00..0x7F
+    fixmap           | object          | 0x80..0x8F
+    fixarray         | array           | 0x90..0x9F
+    fixstr           | string          | 0xA0..0xBF
+    nil              | `null`          | 0xC0
+    false            | `false`         | 0xC2
+    true             | `true`          | 0xC3
+    float 32         | number_float    | 0xCA
+    float 64         | number_float    | 0xCB
+    uint 8           | number_unsigned | 0xCC
+    uint 16          | number_unsigned | 0xCD
+    uint 32          | number_unsigned | 0xCE
+    uint 64          | number_unsigned | 0xCF
+    int 8            | number_integer  | 0xD0
+    int 16           | number_integer  | 0xD1
+    int 32           | number_integer  | 0xD2
+    int 64           | number_integer  | 0xD3
+    str 8            | string          | 0xD9
+    str 16           | string          | 0xDA
+    str 32           | string          | 0xDB
+    array 16         | array           | 0xDC
+    array 32         | array           | 0xDD
+    map 16           | object          | 0xDE
+    map 32           | object          | 0xDF
+    bin 8            | binary          | 0xC4
+    bin 16           | binary          | 0xC5
+    bin 32           | binary          | 0xC6
+    ext 8            | binary          | 0xC7
+    ext 16           | binary          | 0xC8
+    ext 32           | binary          | 0xC9
+    fixext 1         | binary          | 0xD4
+    fixext 2         | binary          | 0xD5
+    fixext 4         | binary          | 0xD6
+    fixext 8         | binary          | 0xD7
+    fixext 16        | binary          | 0xD8
+    negative fixint  | number_integer  | 0xE0..0xFF
+
+    @note Any MessagePack output created via @ref to_msgpack can be successfully
+          parsed by @ref from_msgpack.
+
+    @param[in] i  an input in MessagePack format convertible to an input
+                  adapter
+    @param[in] strict  whether to expect the input to be consumed until EOF
+                       (true by default)
+    @param[in] allow_exceptions  whether to throw exceptions in case of a
+    parse error (optional, true by default)
+
+    @return deserialized JSON value; in case of a parse error and
+            @a allow_exceptions set to `false`, the return value will be
+            value_t::discarded.
+
+    @throw parse_error.110 if the given input ends prematurely or the end of
+    file was not reached when @a strict was set to true
+    @throw parse_error.112 if unsupported features from MessagePack were
+    used in the given input @a i or if the input is not valid MessagePack
+    @throw parse_error.113 if a string was expected as map key, but not found
+
+    @complexity Linear in the size of the input @a i.
+
+    @liveexample{The example shows the deserialization of a byte vector in
+    MessagePack format to a JSON value.,from_msgpack}
+
+    @sa http://msgpack.org
+    @sa @ref to_msgpack(const basic_json&) for the analogous serialization
+    @sa @ref from_cbor(detail::input_adapter&&, const bool, const bool, const cbor_tag_handler_t) for the
+        related CBOR format
+    @sa @ref from_ubjson(detail::input_adapter&&, const bool, const bool) for
+        the related UBJSON format
+    @sa @ref from_bson(detail::input_adapter&&, const bool, const bool) for
+        the related BSON format
+
+    @since version 2.0.9; parameter @a start_index since 2.1.1; changed to
+           consume input adapters, removed start_index parameter, and added
+           @a strict parameter since 3.0.0; added @a allow_exceptions parameter
+           since 3.2.0
+    */
+    template<typename InputType>
+    JSON_HEDLEY_WARN_UNUSED_RESULT
+    static basic_json from_msgpack(InputType&& i,
+                                   const bool strict = true,
+                                   const bool allow_exceptions = true)
+    {
+        basic_json result;  // DOM root that the SAX handler below fills in
+        detail::json_sax_dom_parser<basic_json> sdp(result, allow_exceptions);
+        auto ia = detail::input_adapter(std::forward<InputType>(i));
+        const bool res = binary_reader<decltype(ia)>(std::move(ia)).sax_parse(input_format_t::msgpack, &sdp, strict);
+        return res ? std::move(result) : basic_json(value_t::discarded);  // move: an lvalue operand of ?: would be deep-copied
+    }
+
+    /*!
+    @copydoc from_msgpack(detail::input_adapter&&, const bool, const bool)
+    */
+    template<typename IteratorType>
+    JSON_HEDLEY_WARN_UNUSED_RESULT
+    static basic_json from_msgpack(IteratorType first, IteratorType last,
+                                   const bool strict = true,
+                                   const bool allow_exceptions = true)
+    {
+        basic_json result;  // DOM root that the SAX handler below fills in
+        detail::json_sax_dom_parser<basic_json> sdp(result, allow_exceptions);
+        auto ia = detail::input_adapter(std::move(first), std::move(last));  // adapter over [first, last)
+        const bool res = binary_reader<decltype(ia)>(std::move(ia)).sax_parse(input_format_t::msgpack, &sdp, strict);
+        return res ? std::move(result) : basic_json(value_t::discarded);  // move: an lvalue operand of ?: would be deep-copied
+    }
+
+
+    template<typename T>
+    JSON_HEDLEY_WARN_UNUSED_RESULT
+    JSON_HEDLEY_DEPRECATED_FOR(3.8.0, from_msgpack(ptr, ptr + len))
+    static basic_json from_msgpack(const T* ptr, std::size_t len,
+                                   const bool strict = true, const bool allow_exceptions = true)
+    {
+        const T* const past_end = ptr + len;  // end of the [ptr, ptr + len) range
+        return from_msgpack(ptr, past_end, strict, allow_exceptions);
+    }
+
+    JSON_HEDLEY_WARN_UNUSED_RESULT
+    JSON_HEDLEY_DEPRECATED_FOR(3.8.0, from_msgpack(ptr, ptr + len))
+    static basic_json from_msgpack(detail::span_input_adapter&& i,
+                                   const bool strict = true,
+                                   const bool allow_exceptions = true)
+    {
+        basic_json result;  // DOM root that the SAX handler below fills in
+        detail::json_sax_dom_parser<basic_json> sdp(result, allow_exceptions);
+        auto ia = i.get();  // adapter obtained from the span wrapper
+        const bool res = binary_reader<decltype(ia)>(std::move(ia)).sax_parse(input_format_t::msgpack, &sdp, strict);
+        return res ? std::move(result) : basic_json(value_t::discarded);  // move: an lvalue operand of ?: would be deep-copied
+    }
+
+
+    /*!
+    @brief create a JSON value from an input in UBJSON format
+
+    Deserializes a given input @a i to a JSON value using the UBJSON (Universal
+    Binary JSON) serialization format.
+
+    The library maps UBJSON types to JSON value types as follows:
+
+    UBJSON type | JSON value type                         | marker
+    ----------- | --------------------------------------- | ------
+    no-op       | *no value, next value is read*          | `N`
+    null        | `null`                                  | `Z`
+    false       | `false`                                 | `F`
+    true        | `true`                                  | `T`
+    float32     | number_float                            | `d`
+    float64     | number_float                            | `D`
+    uint8       | number_unsigned                         | `U`
+    int8        | number_integer                          | `i`
+    int16       | number_integer                          | `I`
+    int32       | number_integer                          | `l`
+    int64       | number_integer                          | `L`
+    high-precision number | number_integer, number_unsigned, or number_float - depends on number string | `H`
+    string      | string                                  | `S`
+    char        | string                                  | `C`
+    array       | array (optimized values are supported)  | `[`
+    object      | object (optimized values are supported) | `{`
+
+    @note The mapping is **complete** in the sense that any UBJSON value can
+          be converted to a JSON value.
+
+    @param[in] i  an input in UBJSON format convertible to an input adapter
+    @param[in] strict  whether to expect the input to be consumed until EOF
+                       (true by default)
+    @param[in] allow_exceptions  whether to throw exceptions in case of a
+    parse error (optional, true by default)
+
+    @return deserialized JSON value; in case of a parse error and
+            @a allow_exceptions set to `false`, the return value will be
+            value_t::discarded.
+
+    @throw parse_error.110 if the given input ends prematurely or the end of
+    file was not reached when @a strict was set to true
+    @throw parse_error.112 if a parse error occurs
+    @throw parse_error.113 if a string could not be parsed successfully
+
+    @complexity Linear in the size of the input @a i.
+
+    @liveexample{The example shows the deserialization of a byte vector in
+    UBJSON format to a JSON value.,from_ubjson}
+
+    @sa http://ubjson.org
+    @sa @ref to_ubjson(const basic_json&, const bool, const bool) for the
+             analogous serialization
+    @sa @ref from_cbor(detail::input_adapter&&, const bool, const bool, const cbor_tag_handler_t) for the
+        related CBOR format
+    @sa @ref from_msgpack(detail::input_adapter&&, const bool, const bool) for
+        the related MessagePack format
+    @sa @ref from_bson(detail::input_adapter&&, const bool, const bool) for
+        the related BSON format
+
+    @since version 3.1.0; added @a allow_exceptions parameter since 3.2.0
+    */
+    template<typename InputType>
+    JSON_HEDLEY_WARN_UNUSED_RESULT
+    static basic_json from_ubjson(InputType&& i,
+                                  const bool strict = true,
+                                  const bool allow_exceptions = true)
+    {
+        basic_json result;  // DOM root that the SAX handler below fills in
+        detail::json_sax_dom_parser<basic_json> sdp(result, allow_exceptions);
+        auto ia = detail::input_adapter(std::forward<InputType>(i));
+        const bool res = binary_reader<decltype(ia)>(std::move(ia)).sax_parse(input_format_t::ubjson, &sdp, strict);
+        return res ? std::move(result) : basic_json(value_t::discarded);  // move: an lvalue operand of ?: would be deep-copied
+    }
+
+    /*!
+    @copydoc from_ubjson(detail::input_adapter&&, const bool, const bool)
+    */
+    template<typename IteratorType>
+    JSON_HEDLEY_WARN_UNUSED_RESULT
+    static basic_json from_ubjson(IteratorType first, IteratorType last,
+                                  const bool strict = true,
+                                  const bool allow_exceptions = true)
+    {
+        basic_json result;  // DOM root that the SAX handler below fills in
+        detail::json_sax_dom_parser<basic_json> sdp(result, allow_exceptions);
+        auto ia = detail::input_adapter(std::move(first), std::move(last));  // adapter over [first, last)
+        const bool res = binary_reader<decltype(ia)>(std::move(ia)).sax_parse(input_format_t::ubjson, &sdp, strict);
+        return res ? std::move(result) : basic_json(value_t::discarded);  // move: an lvalue operand of ?: would be deep-copied
+    }
+
+    template<typename T>
+    JSON_HEDLEY_WARN_UNUSED_RESULT
+    JSON_HEDLEY_DEPRECATED_FOR(3.8.0, from_ubjson(ptr, ptr + len))
+    static basic_json from_ubjson(const T* ptr, std::size_t len,
+                                  const bool strict = true, const bool allow_exceptions = true)
+    {
+        const T* const past_end = ptr + len;  // end of the [ptr, ptr + len) range
+        return from_ubjson(ptr, past_end, strict, allow_exceptions);
+    }
+
+    JSON_HEDLEY_WARN_UNUSED_RESULT
+    JSON_HEDLEY_DEPRECATED_FOR(3.8.0, from_ubjson(ptr, ptr + len))
+    static basic_json from_ubjson(detail::span_input_adapter&& i,
+                                  const bool strict = true,
+                                  const bool allow_exceptions = true)
+    {
+        basic_json result;  // DOM root that the SAX handler below fills in
+        detail::json_sax_dom_parser<basic_json> sdp(result, allow_exceptions);
+        auto ia = i.get();  // adapter obtained from the span wrapper
+        const bool res = binary_reader<decltype(ia)>(std::move(ia)).sax_parse(input_format_t::ubjson, &sdp, strict);
+        return res ? std::move(result) : basic_json(value_t::discarded);  // move: an lvalue operand of ?: would be deep-copied
+    }
+
+
+    /*!
+    @brief Create a JSON value from an input in BSON format
+
+    Deserializes a given input @a i to a JSON value using the BSON (Binary JSON)
+    serialization format.
+
+    The library maps BSON record types to JSON value types as follows:
+
+    BSON type       | BSON marker byte | JSON value type
+    --------------- | ---------------- | ---------------------------
+    double          | 0x01             | number_float
+    string          | 0x02             | string
+    document        | 0x03             | object
+    array           | 0x04             | array
+    binary          | 0x05             | binary
+    undefined       | 0x06             | still unsupported
+    ObjectId        | 0x07             | still unsupported
+    boolean         | 0x08             | boolean
+    UTC Date-Time   | 0x09             | still unsupported
+    null            | 0x0A             | null
+    Regular Expr.   | 0x0B             | still unsupported
+    DB Pointer      | 0x0C             | still unsupported
+    JavaScript Code | 0x0D             | still unsupported
+    Symbol          | 0x0E             | still unsupported
+    JavaScript Code w/ scope | 0x0F    | still unsupported
+    int32           | 0x10             | number_integer
+    Timestamp       | 0x11             | still unsupported
+    128-bit decimal float | 0x13       | still unsupported
+    Max Key         | 0x7F             | still unsupported
+    Min Key         | 0xFF             | still unsupported
+
+    @warning The mapping is **incomplete**. The unsupported mappings
+             are indicated in the table above.
+
+    @param[in] i  an input in BSON format convertible to an input adapter
+    @param[in] strict  whether to expect the input to be consumed until EOF
+                       (true by default)
+    @param[in] allow_exceptions  whether to throw exceptions in case of a
+    parse error (optional, true by default)
+
+    @return deserialized JSON value; in case of a parse error and
+            @a allow_exceptions set to `false`, the return value will be
+            value_t::discarded.
+
+    @throw parse_error.114 if an unsupported BSON record type is encountered
+
+    @complexity Linear in the size of the input @a i.
+
+    @liveexample{The example shows the deserialization of a byte vector in
+    BSON format to a JSON value.,from_bson}
+
+    @sa http://bsonspec.org/spec.html
+    @sa @ref to_bson(const basic_json&) for the analogous serialization
+    @sa @ref from_cbor(detail::input_adapter&&, const bool, const bool, const cbor_tag_handler_t) for the
+        related CBOR format
+    @sa @ref from_msgpack(detail::input_adapter&&, const bool, const bool) for
+        the related MessagePack format
+    @sa @ref from_ubjson(detail::input_adapter&&, const bool, const bool) for the
+        related UBJSON format
+    */
+    template<typename InputType>
+    JSON_HEDLEY_WARN_UNUSED_RESULT
+    static basic_json from_bson(InputType&& i,
+                                const bool strict = true,
+                                const bool allow_exceptions = true)
+    {
+        basic_json result;  // DOM root that the SAX handler below fills in
+        detail::json_sax_dom_parser<basic_json> sdp(result, allow_exceptions);
+        auto ia = detail::input_adapter(std::forward<InputType>(i));
+        const bool res = binary_reader<decltype(ia)>(std::move(ia)).sax_parse(input_format_t::bson, &sdp, strict);
+        return res ? std::move(result) : basic_json(value_t::discarded);  // move: an lvalue operand of ?: would be deep-copied
+    }
+
+    /*!
+    @copydoc from_bson(detail::input_adapter&&, const bool, const bool)
+    */
+    template<typename IteratorType>
+    JSON_HEDLEY_WARN_UNUSED_RESULT
+    static basic_json from_bson(IteratorType first, IteratorType last,
+                                const bool strict = true,
+                                const bool allow_exceptions = true)
+    {
+        basic_json result;  // DOM root that the SAX handler below fills in
+        detail::json_sax_dom_parser<basic_json> sdp(result, allow_exceptions);
+        auto ia = detail::input_adapter(std::move(first), std::move(last));  // adapter over [first, last)
+        const bool res = binary_reader<decltype(ia)>(std::move(ia)).sax_parse(input_format_t::bson, &sdp, strict);
+        return res ? std::move(result) : basic_json(value_t::discarded);  // move: an lvalue operand of ?: would be deep-copied
+    }
+
+    template<typename T>
+    JSON_HEDLEY_WARN_UNUSED_RESULT
+    JSON_HEDLEY_DEPRECATED_FOR(3.8.0, from_bson(ptr, ptr + len))
+    static basic_json from_bson(const T* ptr, std::size_t len,
+                                const bool strict = true, const bool allow_exceptions = true)
+    {
+        const T* const past_end = ptr + len;  // end of the [ptr, ptr + len) range
+        return from_bson(ptr, past_end, strict, allow_exceptions);
+    }
+
+    JSON_HEDLEY_WARN_UNUSED_RESULT
+    JSON_HEDLEY_DEPRECATED_FOR(3.8.0, from_bson(ptr, ptr + len))
+    static basic_json from_bson(detail::span_input_adapter&& i,
+                                const bool strict = true,
+                                const bool allow_exceptions = true)
+    {
+        basic_json result;  // DOM root that the SAX handler below fills in
+        detail::json_sax_dom_parser<basic_json> sdp(result, allow_exceptions);
+        auto ia = i.get();  // adapter obtained from the span wrapper
+        const bool res = binary_reader<decltype(ia)>(std::move(ia)).sax_parse(input_format_t::bson, &sdp, strict);
+        return res ? std::move(result) : basic_json(value_t::discarded);  // move: an lvalue operand of ?: would be deep-copied
+    }
+    /// @}
+
+    //////////////////////////
+    // JSON Pointer support //
+    //////////////////////////
+
+    /// @name JSON Pointer functions
+    /// @{
+
+    /*!
+    @brief access specified element via JSON Pointer
+
+    Uses a JSON pointer to retrieve a reference to the respective JSON value.
+    No bound checking is performed. Similar to @ref operator[](const typename
+    object_t::key_type&), `null` values are created in arrays and objects if
+    necessary.
+
+    In particular:
+    - If the JSON pointer points to an object key that does not exist, it
+      is created and filled with a `null` value before a reference to it
+      is returned.
+    - If the JSON pointer points to an array index that does not exist, it
+      is created and filled with a `null` value before a reference to it
+      is returned. All indices between the current maximum and the given
+      index are also filled with `null`.
+    - The special value `-` is treated as a synonym for the index past the
+      end.
+
+    @param[in] ptr  a JSON pointer
+
+    @return reference to the element pointed to by @a ptr
+
+    @complexity Constant.
+
+    @throw parse_error.106   if an array index begins with '0'
+    @throw parse_error.109   if an array index was not a number
+    @throw out_of_range.404  if the JSON pointer cannot be resolved
+
+    @liveexample{The behavior is shown in the example.,operatorjson_pointer}
+
+    @since version 2.0.0
+    */
+    reference operator[](const json_pointer& ptr)
+    {
+        return ptr.get_unchecked(this);  // resolution is delegated to the json_pointer itself
+    }
+
+    /*!
+    @brief access specified element via JSON Pointer (read-only)
+
+    Uses a JSON pointer to retrieve a const reference to the respective JSON
+    value. No bound checking is performed. The function does not change the
+    JSON value; no `null` values are created. In particular, the special value
+    `-` yields an exception.
+
+    @param[in] ptr  JSON pointer to the desired element
+
+    @return const reference to the element pointed to by @a ptr
+
+    @complexity Constant.
+
+    @throw parse_error.106   if an array index begins with '0'
+    @throw parse_error.109   if an array index was not a number
+    @throw out_of_range.402  if the array index '-' is used
+    @throw out_of_range.404  if the JSON pointer cannot be resolved
+
+    @liveexample{The behavior is shown in the example.,operatorjson_pointer_const}
+
+    @since version 2.0.0
+    */
+    const_reference operator[](const json_pointer& ptr) const
+    {
+        return ptr.get_unchecked(this);  // resolution is delegated to the json_pointer itself
+    }
+
+    /*!
+    @brief access specified element via JSON Pointer
+
+    Returns a reference to the element at the specified JSON pointer @a ptr,
+    with bounds checking.
+
+    @param[in] ptr  JSON pointer to the desired element
+
+    @return reference to the element pointed to by @a ptr
+
+    @throw parse_error.106 if an array index in the passed JSON pointer @a ptr
+    begins with '0'. See example below.
+
+    @throw parse_error.109 if an array index in the passed JSON pointer @a ptr
+    is not a number. See example below.
+
+    @throw out_of_range.401 if an array index in the passed JSON pointer @a ptr
+    is out of range. See example below.
+
+    @throw out_of_range.402 if the array index '-' is used in the passed JSON
+    pointer @a ptr. As `at` provides checked access (and no elements are
+    implicitly inserted), the index '-' is always invalid. See example below.
+
+    @throw out_of_range.403 if the JSON pointer describes a key of an object
+    which cannot be found. See example below.
+
+    @throw out_of_range.404 if the JSON pointer @a ptr cannot be resolved.
+    See example below.
+
+    @exceptionsafety Strong guarantee: if an exception is thrown, there are no
+    changes in the JSON value.
+
+    @complexity Constant.
+
+    @since version 2.0.0
+
+    @liveexample{The behavior is shown in the example.,at_json_pointer}
+    */
+    reference at(const json_pointer& ptr)
+    {
+        return ptr.get_checked(this);  // resolution is delegated to the json_pointer itself
+    }
+
+    /*!
+    @brief access specified element via JSON Pointer (read-only)
+
+    Returns a const reference to the element at the specified JSON pointer
+    @a ptr, with bounds checking.
+
+    @param[in] ptr  JSON pointer to the desired element
+
+    @return const reference to the element pointed to by @a ptr
+
+    @throw parse_error.106 if an array index in the passed JSON pointer @a ptr
+    begins with '0'. See example below.
+
+    @throw parse_error.109 if an array index in the passed JSON pointer @a ptr
+    is not a number. See example below.
+
+    @throw out_of_range.401 if an array index in the passed JSON pointer @a ptr
+    is out of range. See example below.
+
+    @throw out_of_range.402 if the array index '-' is used in the passed JSON
+    pointer @a ptr. As `at` provides checked access (and no elements are
+    implicitly inserted), the index '-' is always invalid. See example below.
+
+    @throw out_of_range.403 if the JSON pointer describes a key of an object
+    which cannot be found. See example below.
+
+    @throw out_of_range.404 if the JSON pointer @a ptr cannot be resolved.
+    See example below.
+
+    @exceptionsafety Strong guarantee: if an exception is thrown, there are no
+    changes in the JSON value.
+
+    @complexity Constant.
+
+    @since version 2.0.0
+
+    @liveexample{The behavior is shown in the example.,at_json_pointer_const}
+    */
+    const_reference at(const json_pointer& ptr) const
+    {
+        return ptr.get_checked(this);  // resolution is delegated to the json_pointer itself
+    }
+
+    /*!
+    @brief return flattened JSON value
+
+    The function creates a JSON object whose keys are JSON pointers (see [RFC
+    6901](https://tools.ietf.org/html/rfc6901)) and whose values are all
+    primitive. The original JSON value can be restored using the @ref
+    unflatten() function.
+
+    @return an object that maps JSON pointers to primitive values
+
+    @note Empty objects and arrays are flattened to `null` and will not be
+          reconstructed correctly by the @ref unflatten() function.
+
+    @complexity Linear in the size of the JSON value.
+
+    @liveexample{The following code shows how a JSON object is flattened to an
+    object whose keys consist of JSON pointers.,flatten}
+
+    @sa @ref unflatten() for the reverse function
+
+    @since version 2.0.0
+    */
+    basic_json flatten() const
+    {
+        // the flattened representation is always a JSON object
+        basic_json flattened(value_t::object);
+
+        // let the JSON pointer helper fill it, starting from this value with
+        // an empty reference string
+        json_pointer::flatten("", *this, flattened);
+
+        return flattened;
+    }
+
+    /*!
+    @brief unflatten a previously flattened JSON value
+
+    The function restores the arbitrary nesting of a JSON value that has been
+    flattened before using the @ref flatten() function. The JSON value must
+    meet certain constraints:
+    1. The value must be an object.
+    2. The keys must be JSON pointers (see
+       [RFC 6901](https://tools.ietf.org/html/rfc6901))
+    3. The mapped values must be primitive JSON types.
+
+    @return the original JSON from a flattened version
+
+    @note Empty objects and arrays are flattened by @ref flatten() to `null`
+          values and cannot be unflattened to their original type. Apart from
+          this exception, for a JSON value `j`, the following is always true:
+          `j == j.flatten().unflatten()`.
+
+    @complexity Linear in the size of the JSON value.
+
+    @throw type_error.314  if value is not an object
+    @throw type_error.315  if object values are not primitive
+
+    @liveexample{The following code shows how a flattened JSON object is
+    unflattened into the original nested JSON object.,unflatten}
+
+    @sa @ref flatten() for the reverse function
+
+    @since version 2.0.0
+    */
+    basic_json unflatten() const
+    {
+        // the JSON pointer helper rebuilds the nested value from this one
+        basic_json restored = json_pointer::unflatten(*this);
+        return restored;
+    }
+
+    /// @}
+
+    //////////////////////////
+    // JSON Patch functions //
+    //////////////////////////
+
+    /// @name JSON Patch functions
+    /// @{
+
+    /*!
+    @brief applies a JSON patch
+
+    [JSON Patch](http://jsonpatch.com) defines a JSON document structure for
+    expressing a sequence of operations to apply to a JSON document. With
+    this function, a JSON Patch is applied to the current JSON value by
+    executing all operations from the patch.
+
+    @param[in] json_patch  JSON patch document
+    @return patched document
+
+    @note The application of a patch is atomic: Either all operations succeed
+          and the patched document is returned or an exception is thrown. In
+          any case, the original value is not changed: the patch is applied
+          to a copy of the value.
+
+    @throw parse_error.104 if the JSON patch does not consist of an array of
+    objects
+
+    @throw parse_error.105 if the JSON patch is malformed (e.g., mandatory
+    attributes are missing); example: `"operation add must have member path"`
+
+    @throw out_of_range.401 if an array index is out of range.
+
+    @throw out_of_range.403 if a JSON pointer inside the patch could not be
+    resolved successfully in the current JSON value; example: `"key baz not
+    found"`
+
+    @throw out_of_range.405 if JSON pointer has no parent ("add", "remove",
+    "move")
+
+    @throw other_error.501 if "test" operation was unsuccessful
+
+    @complexity Linear in the size of the JSON value and the length of the
+    JSON patch. As usually only a fraction of the JSON value is affected by
+    the patch, the complexity can usually be neglected.
+
+    @liveexample{The following code shows how a JSON patch is applied to a
+    value.,patch}
+
+    @sa @ref diff -- create a JSON patch by comparing two JSON values
+
+    @sa [RFC 6902 (JSON Patch)](https://tools.ietf.org/html/rfc6902)
+    @sa [RFC 6901 (JSON Pointer)](https://tools.ietf.org/html/rfc6901)
+
+    @since version 2.0.0
+    */
+    basic_json patch(const basic_json& json_patch) const
+    {
+        // Applies every operation of json_patch, in order, to a copy of *this
+        // and returns that copy; *this itself is never modified.
+
+        // make a working copy to apply the patch to
+        basic_json result = *this;
+
+        // the valid JSON Patch operations
+        enum class patch_operations {add, remove, replace, move, copy, test, invalid};
+
+        // translate the textual "op" member into its enumerator; any string
+        // other than the six known names yields patch_operations::invalid
+        const auto get_op = [](const std::string & op)
+        {
+            if (op == "add")
+            {
+                return patch_operations::add;
+            }
+            if (op == "remove")
+            {
+                return patch_operations::remove;
+            }
+            if (op == "replace")
+            {
+                return patch_operations::replace;
+            }
+            if (op == "move")
+            {
+                return patch_operations::move;
+            }
+            if (op == "copy")
+            {
+                return patch_operations::copy;
+            }
+            if (op == "test")
+            {
+                return patch_operations::test;
+            }
+
+            return patch_operations::invalid;
+        };
+
+        // wrapper for "add" operation; add value at ptr
+        // note: ptr is modified (its last reference token is popped), and val
+        // is taken by value
+        const auto operation_add = [&result](json_pointer & ptr, basic_json val)
+        {
+            // adding to the root of the target document means replacing it
+            if (ptr.empty())
+            {
+                result = val;
+                return;
+            }
+
+            // make sure the top element of the pointer exists
+            // (the checked access is performed only for its exception; the
+            // returned reference is discarded)
+            json_pointer top_pointer = ptr.top();
+            if (top_pointer != ptr)
+            {
+                result.at(top_pointer);
+            }
+
+            // get reference to parent of JSON pointer ptr
+            // (last_path must be read before pop_back() shortens ptr)
+            const auto last_path = ptr.back();
+            ptr.pop_back();
+            basic_json& parent = result[ptr];
+
+            switch (parent.m_type)
+            {
+                case value_t::null:
+                case value_t::object:
+                {
+                    // use operator[] to add value
+                    parent[last_path] = val;
+                    break;
+                }
+
+                case value_t::array:
+                {
+                    if (last_path == "-")
+                    {
+                        // special case: append to back
+                        parent.push_back(val);
+                    }
+                    else
+                    {
+                        // idx == parent.size() is allowed: it inserts at the
+                        // end of the array
+                        const auto idx = json_pointer::array_index(last_path);
+                        if (JSON_HEDLEY_UNLIKELY(idx > parent.size()))
+                        {
+                            // avoid undefined behavior
+                            JSON_THROW(out_of_range::create(401, "array index " + std::to_string(idx) + " is out of range"));
+                        }
+
+                        // default case: insert at offset idx
+                        parent.insert(parent.begin() + static_cast<difference_type>(idx), val);
+                    }
+                    break;
+                }
+
+                // if there exists a parent it cannot be primitive
+                default:            // LCOV_EXCL_LINE
+                    JSON_ASSERT(false);  // LCOV_EXCL_LINE
+            }
+        };
+
+        // wrapper for "remove" operation; remove value at ptr
+        // note: ptr is modified (its last reference token is popped)
+        // NOTE(review): when the parent is neither an object nor an array,
+        // nothing is removed and no exception is thrown
+        const auto operation_remove = [&result](json_pointer & ptr)
+        {
+            // get reference to parent of JSON pointer ptr
+            // (checked access: the parent must already exist)
+            const auto last_path = ptr.back();
+            ptr.pop_back();
+            basic_json& parent = result.at(ptr);
+
+            // remove child
+            if (parent.is_object())
+            {
+                // perform range check
+                auto it = parent.find(last_path);
+                if (JSON_HEDLEY_LIKELY(it != parent.end()))
+                {
+                    parent.erase(it);
+                }
+                else
+                {
+                    JSON_THROW(out_of_range::create(403, "key '" + last_path + "' not found"));
+                }
+            }
+            else if (parent.is_array())
+            {
+                // note erase performs range check
+                parent.erase(json_pointer::array_index(last_path));
+            }
+        };
+
+        // type check: top level value must be an array
+        if (JSON_HEDLEY_UNLIKELY(!json_patch.is_array()))
+        {
+            JSON_THROW(parse_error::create(104, 0, "JSON patch must be an array of objects"));
+        }
+
+        // iterate and apply the operations
+        for (const auto& val : json_patch)
+        {
+            // wrapper to get a value for an operation
+            //   op          - operation name, used only to build error messages
+            //   member      - name of the member to look up in val
+            //   string_type - if true, the member must hold a string
+            // It dereferences val.m_value.object, so it must only be called
+            // once val is known to be an object (checked below before first use).
+            const auto get_value = [&val](const std::string & op,
+                                          const std::string & member,
+                                          bool string_type) -> basic_json &
+            {
+                // find value
+                auto it = val.m_value.object->find(member);
+
+                // context-sensitive error message
+                const auto error_msg = (op == "op") ? "operation" : "operation '" + op + "'";
+
+                // check if desired value is present
+                if (JSON_HEDLEY_UNLIKELY(it == val.m_value.object->end()))
+                {
+                    JSON_THROW(parse_error::create(105, 0, error_msg + " must have member '" + member + "'"));
+                }
+
+                // check if result is of type string
+                if (JSON_HEDLEY_UNLIKELY(string_type && !it->second.is_string()))
+                {
+                    JSON_THROW(parse_error::create(105, 0, error_msg + " must have string member '" + member + "'"));
+                }
+
+                // no error: return value
+                return it->second;
+            };
+
+            // type check: every element of the array must be an object
+            if (JSON_HEDLEY_UNLIKELY(!val.is_object()))
+            {
+                JSON_THROW(parse_error::create(104, 0, "JSON patch must be an array of objects"));
+            }
+
+            // collect mandatory members
+            // ("op" and "path" must both be present and hold strings)
+            const auto op = get_value("op", "op", true).template get<std::string>();
+            const auto path = get_value(op, "path", true).template get<std::string>();
+            json_pointer ptr(path);
+
+            switch (get_op(op))
+            {
+                case patch_operations::add:
+                {
+                    operation_add(ptr, get_value("add", "value", false));
+                    break;
+                }
+
+                case patch_operations::remove:
+                {
+                    operation_remove(ptr);
+                    break;
+                }
+
+                case patch_operations::replace:
+                {
+                    // the "path" location must exist - use at()
+                    result.at(ptr) = get_value("replace", "value", false);
+                    break;
+                }
+
+                case patch_operations::move:
+                {
+                    const auto from_path = get_value("move", "from", true).template get<std::string>();
+                    json_pointer from_ptr(from_path);
+
+                    // the "from" location must exist - use at()
+                    // (v is a copy, so it stays valid after the removal below)
+                    basic_json v = result.at(from_ptr);
+
+                    // The move operation is functionally identical to a
+                    // "remove" operation on the "from" location, followed
+                    // immediately by an "add" operation at the target
+                    // location with the value that was just removed.
+                    operation_remove(from_ptr);
+                    operation_add(ptr, v);
+                    break;
+                }
+
+                case patch_operations::copy:
+                {
+                    const auto from_path = get_value("copy", "from", true).template get<std::string>();
+                    const json_pointer from_ptr(from_path);
+
+                    // the "from" location must exist - use at()
+                    basic_json v = result.at(from_ptr);
+
+                    // The copy is functionally identical to an "add"
+                    // operation at the target location using the value
+                    // specified in the "from" member.
+                    operation_add(ptr, v);
+                    break;
+                }
+
+                case patch_operations::test:
+                {
+                    bool success = false;
+                    JSON_TRY
+                    {
+                        // check if "value" matches the one at "path"
+                        // the "path" location must exist - use at()
+                        success = (result.at(ptr) == get_value("test", "value", false));
+                    }
+                    JSON_INTERNAL_CATCH (out_of_range&)
+                    {
+                        // ignore out of range errors: success remains false
+                    }
+
+                    // throw an exception if test fails
+                    // (the message contains the dump of the failed operation)
+                    if (JSON_HEDLEY_UNLIKELY(!success))
+                    {
+                        JSON_THROW(other_error::create(501, "unsuccessful: " + val.dump()));
+                    }
+
+                    break;
+                }
+
+                default:
+                {
+                    // op must be "add", "remove", "replace", "move", "copy", or
+                    // "test"
+                    JSON_THROW(parse_error::create(105, 0, "operation value '" + op + "' is invalid"));
+                }
+            }
+        }
+
+        return result;
+    }
+
+    /*!
+    @brief creates a diff as a JSON patch
+
+    Creates a [JSON Patch](http://jsonpatch.com) so that value @a source can
+    be changed into the value @a target by calling @ref patch function.
+
+    @invariant For two JSON values @a source and @a target, the following code
+    yields always `true`:
+    @code {.cpp}
+    source.patch(diff(source, target)) == target;
+    @endcode
+
+    @note Currently, only `remove`, `add`, and `replace` operations are
+          generated.
+
+    @param[in] source  JSON value to compare from
+    @param[in] target  JSON value to compare against
+    @param[in] path    helper value to create JSON pointers
+
+    @return a JSON patch to convert the @a source to @a target
+
+    @complexity Linear in the lengths of @a source and @a target.
+
+    @liveexample{The following code shows how a JSON patch is created as a
+    diff for two JSON values.,diff}
+
+    @sa @ref patch -- apply a JSON patch
+    @sa @ref merge_patch -- apply a JSON Merge Patch
+
+    @sa [RFC 6902 (JSON Patch)](https://tools.ietf.org/html/rfc6902)
+
+    @since version 2.0.0
+    */
+    JSON_HEDLEY_WARN_UNUSED_RESULT
+    static basic_json diff(const basic_json& source, const basic_json& target,
+                           const std::string& path = "")
+    {
+        // Builds an array of patch operations that turns source into target.
+        // path is the JSON pointer prefix of the values being compared; the
+        // recursive calls below extend it by one reference token per level.
+
+        // the patch
+        basic_json result(value_t::array);
+
+        // if the values are the same, return empty patch
+        if (source == target)
+        {
+            return result;
+        }
+
+        if (source.type() != target.type())
+        {
+            // different types: replace value
+            result.push_back(
+            {
+                {"op", "replace"}, {"path", path}, {"value", target}
+            });
+            return result;
+        }
+
+        // from here on, source and target have the same type but differ
+        switch (source.type())
+        {
+            case value_t::array:
+            {
+                // first pass: traverse common elements
+                std::size_t i = 0;
+                while (i < source.size() && i < target.size())
+                {
+                    // recursive call to compare array values at index i
+                    auto temp_diff = diff(source[i], target[i], path + "/" + std::to_string(i));
+                    result.insert(result.end(), temp_diff.begin(), temp_diff.end());
+                    ++i;
+                }
+
+                // i now reached the end of at least one array
+                // in a second pass, traverse the remaining elements
+
+                // remove the remaining elements of source
+                // (end_index is fixed before the loop, so every "remove" is
+                // inserted in front of the previously inserted ones)
+                const auto end_index = static_cast<difference_type>(result.size());
+                while (i < source.size())
+                {
+                    // add operations in reverse order to avoid invalid
+                    // indices
+                    result.insert(result.begin() + end_index, object(
+                    {
+                        {"op", "remove"},
+                        {"path", path + "/" + std::to_string(i)}
+                    }));
+                    ++i;
+                }
+
+                // add the remaining elements of target
+                // (the "-" reference token makes each "add" append to the array)
+                while (i < target.size())
+                {
+                    result.push_back(
+                    {
+                        {"op", "add"},
+                        {"path", path + "/-"},
+                        {"value", target[i]}
+                    });
+                    ++i;
+                }
+
+                break;
+            }
+
+            case value_t::object:
+            {
+                // first pass: traverse the elements of source
+                for (auto it = source.cbegin(); it != source.cend(); ++it)
+                {
+                    // escape the key name to be used in a JSON patch
+                    const auto key = json_pointer::escape(it.key());
+
+                    if (target.find(it.key()) != target.end())
+                    {
+                        // recursive call to compare object values at key it
+                        auto temp_diff = diff(it.value(), target[it.key()], path + "/" + key);
+                        result.insert(result.end(), temp_diff.begin(), temp_diff.end());
+                    }
+                    else
+                    {
+                        // found a key that is not in target -> remove it
+                        result.push_back(object(
+                        {
+                            {"op", "remove"}, {"path", path + "/" + key}
+                        }));
+                    }
+                }
+
+                // second pass: traverse the elements of target
+                for (auto it = target.cbegin(); it != target.cend(); ++it)
+                {
+                    if (source.find(it.key()) == source.end())
+                    {
+                        // found a key that is not in source -> add it
+                        const auto key = json_pointer::escape(it.key());
+                        result.push_back(
+                        {
+                            {"op", "add"}, {"path", path + "/" + key},
+                            {"value", it.value()}
+                        });
+                    }
+                }
+
+                break;
+            }
+
+            default:
+            {
+                // both primitive type: replace value
+                result.push_back(
+                {
+                    {"op", "replace"}, {"path", path}, {"value", target}
+                });
+                break;
+            }
+        }
+
+        return result;
+    }
+
+    /// @}
+
+    ////////////////////////////////
+    // JSON Merge Patch functions //
+    ////////////////////////////////
+
+    /// @name JSON Merge Patch functions
+    /// @{
+
+    /*!
+    @brief applies a JSON Merge Patch
+
+    The merge patch format is primarily intended for use with the HTTP PATCH
+    method as a means of describing a set of modifications to a target
+    resource's content. This function applies a merge patch to the current
+    JSON value.
+
+    The function implements the following algorithm from Section 2 of
+    [RFC 7396 (JSON Merge Patch)](https://tools.ietf.org/html/rfc7396):
+
+    ```
+    define MergePatch(Target, Patch):
+      if Patch is an Object:
+        if Target is not an Object:
+          Target = {} // Ignore the contents and set it to an empty Object
+        for each Name/Value pair in Patch:
+          if Value is null:
+            if Name exists in Target:
+              remove the Name/Value pair from Target
+          else:
+            Target[Name] = MergePatch(Target[Name], Value)
+        return Target
+      else:
+        return Patch
+    ```
+
+    Thereby, `Target` is the current object; that is, the patch is applied to
+    the current value.
+
+    @param[in] apply_patch  the patch to apply
+
+    @complexity Linear in the lengths of @a apply_patch.
+
+    @liveexample{The following code shows how a JSON Merge Patch is applied to
+    a JSON document.,merge_patch}
+
+    @sa @ref patch -- apply a JSON patch
+    @sa [RFC 7396 (JSON Merge Patch)](https://tools.ietf.org/html/rfc7396)
+
+    @since version 3.0.0
+    */
+    void merge_patch(const basic_json& apply_patch)
+    {
+        // a patch that is not an object replaces the current value entirely
+        if (!apply_patch.is_object())
+        {
+            *this = apply_patch;
+            return;
+        }
+
+        // an object patch can only be merged into an object: anything else is
+        // discarded and replaced by an empty object first
+        if (!is_object())
+        {
+            *this = object();
+        }
+
+        for (auto member = apply_patch.cbegin(); member != apply_patch.cend(); ++member)
+        {
+            // a null member value requests removal of that key
+            if (member.value().is_null())
+            {
+                erase(member.key());
+                continue;
+            }
+
+            // any other member value is merged recursively into the entry
+            // stored under the same key
+            (*this)[member.key()].merge_patch(member.value());
+        }
+    }
+
+    /// @}
+};
+
+/*!
+@brief user-defined to_string function for JSON values
+
+This function implements a user-defined to_string for JSON objects.
+
+@param[in] j  a JSON object
+@return a std::string object
+*/
+
+NLOHMANN_BASIC_JSON_TPL_DECLARATION
+std::string to_string(const NLOHMANN_BASIC_JSON_TPL& j)
+{
+    // serialize via dump(), called with its default arguments
+    std::string serialized = j.dump();
+    return serialized;
+}
+} // namespace nlohmann
+
+///////////////////////
+// nonmember support //
+///////////////////////
+
+// specialization of std::swap, and std::hash
+namespace std
+{
+
+/// hash value for JSON objects
+template<>
+struct hash<nlohmann::json>
+{
+    /*!
+    @brief compute the hash value of a JSON value
+
+    The computation is forwarded to nlohmann::detail::hash.
+
+    @param[in] j  JSON value to hash
+    @return hash value of @a j
+
+    @since version 1.0.0
+    */
+    std::size_t operator()(const nlohmann::json& j) const
+    {
+        const std::size_t digest = nlohmann::detail::hash(j);
+        return digest;
+    }
+};
+
+/// specialization for std::less<value_t>
+/// @note: do not remove the space after '<',
+///        see https://github.com/nlohmann/json/pull/679
+template<>
+struct less<::nlohmann::detail::value_t>
+{
+    /*!
+    @brief order two value_t enum values
+
+    @param[in] lhs  left-hand side value
+    @param[in] rhs  right-hand side value
+    @return result of nlohmann::detail::operator<(lhs, rhs)
+
+    @since version 3.0.0
+    */
+    bool operator()(nlohmann::detail::value_t lhs,
+                    nlohmann::detail::value_t rhs) const noexcept
+    {
+        // the library's operator< is called by its qualified name
+        const bool ordered_before = nlohmann::detail::operator<(lhs, rhs);
+        return ordered_before;
+    }
+};
+
+// C++20 prohibits function specializations in the std namespace.
+#ifndef JSON_HAS_CPP_20
+
+/*!
+@brief exchanges the values of two JSON objects
+
+@since version 1.0.0
+*/
+template<>
+inline void swap<nlohmann::json>(nlohmann::json& j1, nlohmann::json& j2)
+noexcept(is_nothrow_move_constructible<nlohmann::json>::value &&
+         is_nothrow_move_assignable<nlohmann::json>::value)
+{
+    // forward to the member swap of the first operand
+    j1.swap(j2);
+}
+
+#endif
+
+} // namespace std
+
+/*!
+@brief user-defined string literal for JSON values
+
+This operator implements a user-defined string literal for JSON objects. It
+can be used by adding `"_json"` to a string literal and returns a JSON object
+if no parse error occurred.
+
+@param[in] s  a string representation of a JSON object
+@param[in] n  the length of string @a s
+@return a JSON object
+
+@since version 1.0.0
+*/
+JSON_HEDLEY_NON_NULL(1)
+inline nlohmann::json operator "" _json(const char* s, std::size_t n)
+{
+    return nlohmann::json::parse(s, s + n);
+}
+
+/*!
+@brief user-defined string literal for JSON pointer
+
+This operator implements a user-defined string literal for JSON Pointers. It
+can be used by adding `"_json_pointer"` to a string literal and returns a JSON pointer
+object if no parse error occurred.
+
+@param[in] s  a string representation of a JSON Pointer
+@param[in] n  the length of string @a s
+@return a JSON pointer object
+
+@since version 2.0.0
+*/
+JSON_HEDLEY_NON_NULL(1)
+inline nlohmann::json::json_pointer operator "" _json_pointer(const char* s, std::size_t n)
+{
+    return nlohmann::json::json_pointer(std::string(s, n));
+}
+
+// #include <nlohmann/detail/macro_unscope.hpp>
+
+
+// restore GCC/clang diagnostic settings
+#if defined(__clang__) || defined(__GNUC__) || defined(__GNUG__)
+    #pragma GCC diagnostic pop
+#endif
+#if defined(__clang__)
+    #pragma GCC diagnostic pop
+#endif
+
+// clean up
+#undef JSON_ASSERT
+#undef JSON_INTERNAL_CATCH
+#undef JSON_CATCH
+#undef JSON_THROW
+#undef JSON_TRY
+#undef JSON_HAS_CPP_14
+#undef JSON_HAS_CPP_17
+#undef NLOHMANN_BASIC_JSON_TPL_DECLARATION
+#undef NLOHMANN_BASIC_JSON_TPL
+#undef JSON_EXPLICIT
+
+// #include <nlohmann/thirdparty/hedley/hedley_undef.hpp>
+#undef JSON_HEDLEY_ALWAYS_INLINE
+#undef JSON_HEDLEY_ARM_VERSION
+#undef JSON_HEDLEY_ARM_VERSION_CHECK
+#undef JSON_HEDLEY_ARRAY_PARAM
+#undef JSON_HEDLEY_ASSUME
+#undef JSON_HEDLEY_BEGIN_C_DECLS
+#undef JSON_HEDLEY_CLANG_HAS_ATTRIBUTE
+#undef JSON_HEDLEY_CLANG_HAS_BUILTIN
+#undef JSON_HEDLEY_CLANG_HAS_CPP_ATTRIBUTE
+#undef JSON_HEDLEY_CLANG_HAS_DECLSPEC_DECLSPEC_ATTRIBUTE
+#undef JSON_HEDLEY_CLANG_HAS_EXTENSION
+#undef JSON_HEDLEY_CLANG_HAS_FEATURE
+#undef JSON_HEDLEY_CLANG_HAS_WARNING
+#undef JSON_HEDLEY_COMPCERT_VERSION
+#undef JSON_HEDLEY_COMPCERT_VERSION_CHECK
+#undef JSON_HEDLEY_CONCAT
+#undef JSON_HEDLEY_CONCAT3
+#undef JSON_HEDLEY_CONCAT3_EX
+#undef JSON_HEDLEY_CONCAT_EX
+#undef JSON_HEDLEY_CONST
+#undef JSON_HEDLEY_CONSTEXPR
+#undef JSON_HEDLEY_CONST_CAST
+#undef JSON_HEDLEY_CPP_CAST
+#undef JSON_HEDLEY_CRAY_VERSION
+#undef JSON_HEDLEY_CRAY_VERSION_CHECK
+#undef JSON_HEDLEY_C_DECL
+#undef JSON_HEDLEY_DEPRECATED
+#undef JSON_HEDLEY_DEPRECATED_FOR
+#undef JSON_HEDLEY_DIAGNOSTIC_DISABLE_CAST_QUAL
+#undef JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_
+#undef JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED
+#undef JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES
+#undef JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS
+#undef JSON_HEDLEY_DIAGNOSTIC_POP
+#undef JSON_HEDLEY_DIAGNOSTIC_PUSH
+#undef JSON_HEDLEY_DMC_VERSION
+#undef JSON_HEDLEY_DMC_VERSION_CHECK
+#undef JSON_HEDLEY_EMPTY_BASES
+#undef JSON_HEDLEY_EMSCRIPTEN_VERSION
+#undef JSON_HEDLEY_EMSCRIPTEN_VERSION_CHECK
+#undef JSON_HEDLEY_END_C_DECLS
+#undef JSON_HEDLEY_FLAGS
+#undef JSON_HEDLEY_FLAGS_CAST
+#undef JSON_HEDLEY_GCC_HAS_ATTRIBUTE
+#undef JSON_HEDLEY_GCC_HAS_BUILTIN
+#undef JSON_HEDLEY_GCC_HAS_CPP_ATTRIBUTE
+#undef JSON_HEDLEY_GCC_HAS_DECLSPEC_ATTRIBUTE
+#undef JSON_HEDLEY_GCC_HAS_EXTENSION
+#undef JSON_HEDLEY_GCC_HAS_FEATURE
+#undef JSON_HEDLEY_GCC_HAS_WARNING
+#undef JSON_HEDLEY_GCC_NOT_CLANG_VERSION_CHECK
+#undef JSON_HEDLEY_GCC_VERSION
+#undef JSON_HEDLEY_GCC_VERSION_CHECK
+#undef JSON_HEDLEY_GNUC_HAS_ATTRIBUTE
+#undef JSON_HEDLEY_GNUC_HAS_BUILTIN
+#undef JSON_HEDLEY_GNUC_HAS_CPP_ATTRIBUTE
+#undef JSON_HEDLEY_GNUC_HAS_DECLSPEC_ATTRIBUTE
+#undef JSON_HEDLEY_GNUC_HAS_EXTENSION
+#undef JSON_HEDLEY_GNUC_HAS_FEATURE
+#undef JSON_HEDLEY_GNUC_HAS_WARNING
+#undef JSON_HEDLEY_GNUC_VERSION
+#undef JSON_HEDLEY_GNUC_VERSION_CHECK
+#undef JSON_HEDLEY_HAS_ATTRIBUTE
+#undef JSON_HEDLEY_HAS_BUILTIN
+#undef JSON_HEDLEY_HAS_CPP_ATTRIBUTE
+#undef JSON_HEDLEY_HAS_CPP_ATTRIBUTE_NS
+#undef JSON_HEDLEY_HAS_DECLSPEC_ATTRIBUTE
+#undef JSON_HEDLEY_HAS_EXTENSION
+#undef JSON_HEDLEY_HAS_FEATURE
+#undef JSON_HEDLEY_HAS_WARNING
+#undef JSON_HEDLEY_IAR_VERSION
+#undef JSON_HEDLEY_IAR_VERSION_CHECK
+#undef JSON_HEDLEY_IBM_VERSION
+#undef JSON_HEDLEY_IBM_VERSION_CHECK
+#undef JSON_HEDLEY_IMPORT
+#undef JSON_HEDLEY_INLINE
+#undef JSON_HEDLEY_INTEL_VERSION
+#undef JSON_HEDLEY_INTEL_VERSION_CHECK
+#undef JSON_HEDLEY_IS_CONSTANT
+#undef JSON_HEDLEY_IS_CONSTEXPR_
+#undef JSON_HEDLEY_LIKELY
+#undef JSON_HEDLEY_MALLOC
+#undef JSON_HEDLEY_MESSAGE
+#undef JSON_HEDLEY_MSVC_VERSION
+#undef JSON_HEDLEY_MSVC_VERSION_CHECK
+#undef JSON_HEDLEY_NEVER_INLINE
+#undef JSON_HEDLEY_NON_NULL
+#undef JSON_HEDLEY_NO_ESCAPE
+#undef JSON_HEDLEY_NO_RETURN
+#undef JSON_HEDLEY_NO_THROW
+#undef JSON_HEDLEY_NULL
+#undef JSON_HEDLEY_PELLES_VERSION
+#undef JSON_HEDLEY_PELLES_VERSION_CHECK
+#undef JSON_HEDLEY_PGI_VERSION
+#undef JSON_HEDLEY_PGI_VERSION_CHECK
+#undef JSON_HEDLEY_PREDICT
+#undef JSON_HEDLEY_PRINTF_FORMAT
+#undef JSON_HEDLEY_PRIVATE
+#undef JSON_HEDLEY_PUBLIC
+#undef JSON_HEDLEY_PURE
+#undef JSON_HEDLEY_REINTERPRET_CAST
+#undef JSON_HEDLEY_REQUIRE
+#undef JSON_HEDLEY_REQUIRE_CONSTEXPR
+#undef JSON_HEDLEY_REQUIRE_MSG
+#undef JSON_HEDLEY_RESTRICT
+#undef JSON_HEDLEY_RETURNS_NON_NULL
+#undef JSON_HEDLEY_SENTINEL
+#undef JSON_HEDLEY_STATIC_ASSERT
+#undef JSON_HEDLEY_STATIC_CAST
+#undef JSON_HEDLEY_STRINGIFY
+#undef JSON_HEDLEY_STRINGIFY_EX
+#undef JSON_HEDLEY_SUNPRO_VERSION
+#undef JSON_HEDLEY_SUNPRO_VERSION_CHECK
+#undef JSON_HEDLEY_TINYC_VERSION
+#undef JSON_HEDLEY_TINYC_VERSION_CHECK
+#undef JSON_HEDLEY_TI_ARMCL_VERSION
+#undef JSON_HEDLEY_TI_ARMCL_VERSION_CHECK
+#undef JSON_HEDLEY_TI_CL2000_VERSION
+#undef JSON_HEDLEY_TI_CL2000_VERSION_CHECK
+#undef JSON_HEDLEY_TI_CL430_VERSION
+#undef JSON_HEDLEY_TI_CL430_VERSION_CHECK
+#undef JSON_HEDLEY_TI_CL6X_VERSION
+#undef JSON_HEDLEY_TI_CL6X_VERSION_CHECK
+#undef JSON_HEDLEY_TI_CL7X_VERSION
+#undef JSON_HEDLEY_TI_CL7X_VERSION_CHECK
+#undef JSON_HEDLEY_TI_CLPRU_VERSION
+#undef JSON_HEDLEY_TI_CLPRU_VERSION_CHECK
+#undef JSON_HEDLEY_TI_VERSION
+#undef JSON_HEDLEY_TI_VERSION_CHECK
+#undef JSON_HEDLEY_UNAVAILABLE
+#undef JSON_HEDLEY_UNLIKELY
+#undef JSON_HEDLEY_UNPREDICTABLE
+#undef JSON_HEDLEY_UNREACHABLE
+#undef JSON_HEDLEY_UNREACHABLE_RETURN
+#undef JSON_HEDLEY_VERSION
+#undef JSON_HEDLEY_VERSION_DECODE_MAJOR
+#undef JSON_HEDLEY_VERSION_DECODE_MINOR
+#undef JSON_HEDLEY_VERSION_DECODE_REVISION
+#undef JSON_HEDLEY_VERSION_ENCODE
+#undef JSON_HEDLEY_WARNING
+#undef JSON_HEDLEY_WARN_UNUSED_RESULT
+#undef JSON_HEDLEY_WARN_UNUSED_RESULT_MSG
+#undef JSON_HEDLEY_FALL_THROUGH
+
+
+
+#endif  // INCLUDE_NLOHMANN_JSON_HPP_